From a902c236c9f0883a1cdec365490c57857e278147 Mon Sep 17 00:00:00 2001 From: Mario Fetka Date: Sat, 24 Nov 2012 17:08:51 +0100 Subject: [PATCH] Initial patches commit --- ...bility-patch-for-v5-network-controll.patch | 553 + ...-I-O-context-code-for-BFQ-v5-for-3.2.patch | 299 + ...compatibility-patch-for-v5-interface.patch | 391 + ...ps-kconfig-build-bits-for-BFQ-v5-3.2.patch | 46 + ...fa-backward-compatibility-with-broke.patch | 69 + ...troduce-the-BFQ-v5-I-O-sched-for-3.2.patch | 5986 + 3.2.34/01patch-2.6.33_atopcnt.patch | 174 + 3.2.34/02patch-2.6.33_atopacct.patch | 125 + 3.2.34/3.2.0-ck1.patch | 9093 + 3.2.34/3rd-3rdparty-1.0-tree.patch | 181 + .../3rd-3rdparty-button_hotplug-0.4.1.patch | 372 + ...3rd-3rdparty-gpio_button_hotplug-0.1.patch | 472 + 3.2.34/3rd-3rdparty-gpio_event_drv-0.1.patch | 1354 + 3.2.34/3rd-3rdparty-merge.patch | 156 + 3.2.34/3rd-3rdparty-netatop-0.1.1.patch | 1769 + 3.2.34/910-kobject_uevent.patch | 21 + 3.2.34/911-kobject_add_broadcast_uevent.patch | 85 + .../Add_CONFIG_VFAT_FS_DUALNAMES_option.patch | 145 + 3.2.34/accessfs-3.2-0.26.patch | 1036 + ...over-ide-drivers-when-both-are-built.patch | 36 + 3.2.34/aufs3-standalone-3.2.patch | 30657 +++ 3.2.34/bump/1021_linux-3.2.22.patch | 1245 + 3.2.34/bump/1022_linux-3.2.23.patch | 1862 + 3.2.34/bump/1023_linux-3.2.24.patch | 4684 + 3.2.34/bump/1024_linux-3.2.25.patch | 4503 + 3.2.34/bump/1025_linux-3.2.26.patch | 238 + 3.2.34/bump/1026_linux-3.2.27.patch | 3188 + 3.2.34/bump/1027_linux-3.2.28.patch | 1114 + 3.2.34/bump/1028_linux-3.2.29.patch | 4279 + 3.2.34/bump/1029_linux-3.2.30.patch | 5552 + 3.2.34/bump/1030_linux-3.2.31.patch | 3327 + 3.2.34/bump/1031_linux-3.2.32.patch | 6206 + 3.2.34/bump/1032_linux-3.2.33.patch | 3450 + 3.2.34/bump/1033_linux-3.2.34.patch | 3678 + 3.2.34/cloneconfig.patch | 41 + 3.2.34/colored-printk-3.2.33.patch | 337 + 3.2.34/hz-432-kconfig-option.patch | 25 + 3.2.34/hz-864-kconfig-option.patch | 25 + 3.2.34/imqmq-3.2.patch | 1603 + ...press-kernel-modules-on-installation.patch | 137 + 3.2.34/kernel-3.2-lsproduo.patch | 569 + 3.2.34/kernel-3.2-lsql.patch | 439 + 3.2.34/kernel-3.2-lsxhl.patch | 387 + 3.2.34/kernel-3.4.0-layer7-2.22.patch | 2132 + 3.2.34/kirkwood-jumbo-frame.patch | 135 + 3.2.34/linux-2.6-defaults-fat-utf8.patch | 15 + 3.2.34/linux-2.6-x86-tune-generic.patch | 13 + 3.2.34/linux-3.2-e2c-0.4.58.patch | 7807 + 3.2.34/linux-3.2.33-zfs.patch | 201830 +++++++++++++++ 3.2.34/lschlv2.patch | 256 + ...net-netfilter-IFWLOG-2.6.35-buildfix.patch | 32 + ...net-netfilter-IFWLOG-2.6.37-buildfix.patch | 15 + 3.2.34/net-netfilter-IFWLOG-mdv.patch | 264 + 3.2.34/net-netfilter-IFWLOG.patch | 269 + .../net-netfilter-psd-2.6.35-buildfix.patch | 11 + 3.2.34/net-netfilter-psd-mdv.patch | 235 + 3.2.34/net-netfilter-psd.patch | 420 + ...implement-rfc-1123-for-ftp-conntrack.patch | 190 + 3.2.34/netfilter-ip_conntrack_slp.patch | 185 + ...2-btrfs-Introduce-btrfs_get_maps_dev.patch | 39 + ...btrfs-0900-add-allocator-tracepoints.patch | 304 + ...-8001-rewrite-btrfs_trim_block_group.patch | 299 + ...k-and-disable-irq-during-space-alloc.patch | 40 + ...-8013-sector-size-check-during-mount.patch | 43 + ...ctl-to-determine-size-of-compressed-.patch | 158 + ...e-lzo-the-default-compression-scheme.patch | 68 + ...8024-workaround-for-cleaner-deadlock.patch | 32 + ...bal-block_rsv-when-creating-a-new-bl.patch | 61 + ...le-deadlock-when-opening-a-seed-devi.patch | 84 + ...rfs-allow-cross-subvolume-file-clone.patch | 47 + 3.2.34/series | 68 + 3.2.34/uksm-0.1.2.1-for-v3.2.ge.31.patch | 7032 + 
...rkwood-Add-support-for-Buffalo-LS-VL.patch | 381 + ...kwood-Add-support-for-Buffalo-LS-WVL.patch | 538 + 3.2.34/vserver-3.2.34-vs2.3.2.15.patch | 26125 ++ 3.2.34/wrapfs-v3.2.2-45-ga5296eb.patch | 2084 + ...bility-patch-for-v5-network-controll.patch | 553 + ...ps-kconfig-build-bits-for-BFQ-v5-3.3.patch | 99 + ...compatibility-patch-for-v5-interface.patch | 391 + ...troduce-the-BFQ-v5-I-O-sched-for-3.3.patch | 5624 + ...fa-backward-compatibility-with-broke.patch | 69 + 3.3.8/01patch-2.6.33_atopcnt.patch | 174 + 3.3.8/02patch-2.6.33_atopacct.patch | 125 + 3.3.8/3.3-ck1.patch | 8782 + 3.3.8/3rd-3rdparty-1.0-tree.patch | 181 + 3.3.8/3rd-3rdparty-button_hotplug-0.4.1.patch | 372 + ...3rd-3rdparty-gpio_button_hotplug-0.1.patch | 472 + 3.3.8/3rd-3rdparty-gpio_event_drv-0.1.patch | 1354 + 3.3.8/3rd-3rdparty-merge.patch | 156 + 3.3.8/3rd-3rdparty-netatop-0.1.1.patch | 1769 + 3.3.8/600-netfilter_layer7_2.22.patch | 2142 + 3.3.8/601-netfilter_layer7_pktmatch.patch | 108 + 3.3.8/602-netfilter_layer7_match.patch | 51 + 3.3.8/603-netfilter_layer7_2.6.36_fix.patch | 61 + 3.3.8/604-netfilter_cisco_794x_iphone.patch | 118 + ...etfilter_match_bypass_default_checks.patch | 93 + ...netfilter_match_bypass_default_table.patch | 81 + ...netfilter_match_reduce_memory_access.patch | 16 + ...-netfilter_optional_tcp_window_check.patch | 36 + 3.3.8/620-sched_esfq.patch | 791 + 3.3.8/621-sched_act_connmark.patch | 172 + 3.3.8/910-kobject_uevent.patch | 21 + 3.3.8/911-kobject_add_broadcast_uevent.patch | 85 + .../Add_CONFIG_VFAT_FS_DUALNAMES_option.patch | 145 + 3.3.8/accessfs-3.2-0.26.patch | 1036 + ...over-ide-drivers-when-both-are-built.patch | 36 + 3.3.8/aufs-3.x-rcN.patch | 29364 +++ 3.3.8/cloneconfig.patch | 41 + 3.3.8/colored-printk-3.3.8.patch | 337 + ...-directory-updates-during-log-replay.patch | 54 + ...ockdep-warning-in-miscdev-operations.patch | 103 + ...v-file-ops-on-inherited-passed-files.patch | 95 + ...ly-flag-before-doing-privileged-open.patch | 42 + ...le_check_list-on-ELOOP_CVE-2012-3375.patch | 35 + ...drop_write-call-in-ext4_ioc_move_ext.patch | 31 + ...on-for-ext3-file-systems-w-uninit_bg.patch | 73 + ...riggerable-bug-from-generic_setlease.patch | 39 + ...le-length-is-corrupted_CVE-2012-3400.patch | 51 + ...ading-of-sparing-table_CVE-2012-3400.patch | 132 + ...-of-abusing-i-in-udf_load_logicalvol.patch | 32 + 3.3.8/hz-432-kconfig-option.patch | 25 + 3.3.8/hz-864-kconfig-option.patch | 25 + 3.3.8/imqmq-3.3.patch | 1613 + ...press-kernel-modules-on-installation.patch | 137 + 3.3.8/kirkwood-jumbo-frame.patch | 135 + 3.3.8/linux-2.6-defaults-fat-utf8.patch | 15 + 3.3.8/linux-2.6-x86-tune-generic.patch | 13 + 3.3.8/linux-3.4-e2c-0.4.59.patch | 7781 + 3.3.8/lschlv2.patch | 256 + ...net-netfilter-IFWLOG-2.6.35-buildfix.patch | 32 + ...net-netfilter-IFWLOG-2.6.37-buildfix.patch | 15 + 3.3.8/net-netfilter-IFWLOG-mdv.patch | 264 + 3.3.8/net-netfilter-IFWLOG.patch | 269 + 3.3.8/net-netfilter-psd-2.6.35-buildfix.patch | 11 + 3.3.8/net-netfilter-psd-mdv.patch | 235 + 3.3.8/net-netfilter-psd.patch | 420 + ...implement-rfc-1123-for-ftp-conntrack.patch | 190 + 3.3.8/netfilter-ip_conntrack_slp.patch | 185 + 3.3.8/series | 87 + 3.3.8/uksm-0.1.2.1-for-v3.3.ge.8.patch | 7023 + ...rkwood-Add-support-for-Buffalo-LS-VL.patch | 381 + ...kwood-Add-support-for-Buffalo-LS-WVL.patch | 538 + ...on-Add-support-for-Buffalo-LS-PRODUO.patch | 569 + ...ood-Add-support-for-Buffalo-LS-CHLv2.patch | 278 + ...kwood-Add-support-for-Buffalo-LS-XHL.patch | 388 + ...-orion-Add-support-for-Buffalo-LS-QL.patch | 439 + 
3.3.8/vserver-3.3.8-vs2.3.3.4.patch | 26065 ++ 3.3.8/wrapfs-v3.3-rc1-429-g65388bc.patch | 1913 + 148 files changed, 455965 insertions(+) create mode 100644 3.2.34/0001-AppArmor-compatibility-patch-for-v5-network-controll.patch create mode 100644 3.2.34/0001-block-prepare-I-O-context-code-for-BFQ-v5-for-3.2.patch create mode 100644 3.2.34/0002-AppArmor-compatibility-patch-for-v5-interface.patch create mode 100644 3.2.34/0002-block-cgroups-kconfig-build-bits-for-BFQ-v5-3.2.patch create mode 100644 3.2.34/0003-AppArmor-Allow-dfa-backward-compatibility-with-broke.patch create mode 100644 3.2.34/0003-block-introduce-the-BFQ-v5-I-O-sched-for-3.2.patch create mode 100644 3.2.34/01patch-2.6.33_atopcnt.patch create mode 100644 3.2.34/02patch-2.6.33_atopacct.patch create mode 100644 3.2.34/3.2.0-ck1.patch create mode 100644 3.2.34/3rd-3rdparty-1.0-tree.patch create mode 100644 3.2.34/3rd-3rdparty-button_hotplug-0.4.1.patch create mode 100644 3.2.34/3rd-3rdparty-gpio_button_hotplug-0.1.patch create mode 100644 3.2.34/3rd-3rdparty-gpio_event_drv-0.1.patch create mode 100644 3.2.34/3rd-3rdparty-merge.patch create mode 100644 3.2.34/3rd-3rdparty-netatop-0.1.1.patch create mode 100644 3.2.34/910-kobject_uevent.patch create mode 100644 3.2.34/911-kobject_add_broadcast_uevent.patch create mode 100644 3.2.34/Add_CONFIG_VFAT_FS_DUALNAMES_option.patch create mode 100644 3.2.34/accessfs-3.2-0.26.patch create mode 100644 3.2.34/ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch create mode 100644 3.2.34/aufs3-standalone-3.2.patch create mode 100644 3.2.34/bump/1021_linux-3.2.22.patch create mode 100644 3.2.34/bump/1022_linux-3.2.23.patch create mode 100644 3.2.34/bump/1023_linux-3.2.24.patch create mode 100644 3.2.34/bump/1024_linux-3.2.25.patch create mode 100644 3.2.34/bump/1025_linux-3.2.26.patch create mode 100644 3.2.34/bump/1026_linux-3.2.27.patch create mode 100644 3.2.34/bump/1027_linux-3.2.28.patch create mode 100644 3.2.34/bump/1028_linux-3.2.29.patch create mode 100644 3.2.34/bump/1029_linux-3.2.30.patch create mode 100644 3.2.34/bump/1030_linux-3.2.31.patch create mode 100644 3.2.34/bump/1031_linux-3.2.32.patch create mode 100644 3.2.34/bump/1032_linux-3.2.33.patch create mode 100644 3.2.34/bump/1033_linux-3.2.34.patch create mode 100644 3.2.34/cloneconfig.patch create mode 100644 3.2.34/colored-printk-3.2.33.patch create mode 100644 3.2.34/hz-432-kconfig-option.patch create mode 100644 3.2.34/hz-864-kconfig-option.patch create mode 100644 3.2.34/imqmq-3.2.patch create mode 100644 3.2.34/kbuild-compress-kernel-modules-on-installation.patch create mode 100644 3.2.34/kernel-3.2-lsproduo.patch create mode 100644 3.2.34/kernel-3.2-lsql.patch create mode 100644 3.2.34/kernel-3.2-lsxhl.patch create mode 100644 3.2.34/kernel-3.4.0-layer7-2.22.patch create mode 100644 3.2.34/kirkwood-jumbo-frame.patch create mode 100644 3.2.34/linux-2.6-defaults-fat-utf8.patch create mode 100644 3.2.34/linux-2.6-x86-tune-generic.patch create mode 100644 3.2.34/linux-3.2-e2c-0.4.58.patch create mode 100644 3.2.34/linux-3.2.33-zfs.patch create mode 100644 3.2.34/lschlv2.patch create mode 100644 3.2.34/net-netfilter-IFWLOG-2.6.35-buildfix.patch create mode 100644 3.2.34/net-netfilter-IFWLOG-2.6.37-buildfix.patch create mode 100644 3.2.34/net-netfilter-IFWLOG-mdv.patch create mode 100644 3.2.34/net-netfilter-IFWLOG.patch create mode 100644 3.2.34/net-netfilter-psd-2.6.35-buildfix.patch create mode 100644 3.2.34/net-netfilter-psd-mdv.patch create mode 100644 3.2.34/net-netfilter-psd.patch create mode 100644 
3.2.34/netfilter-implement-rfc-1123-for-ftp-conntrack.patch create mode 100644 3.2.34/netfilter-ip_conntrack_slp.patch create mode 100644 3.2.34/patches.suse/0002-btrfs-Introduce-btrfs_get_maps_dev.patch create mode 100644 3.2.34/patches.suse/btrfs-0900-add-allocator-tracepoints.patch create mode 100644 3.2.34/patches.suse/btrfs-8001-rewrite-btrfs_trim_block_group.patch create mode 100644 3.2.34/patches.suse/btrfs-8007-lock-and-disable-irq-during-space-alloc.patch create mode 100644 3.2.34/patches.suse/btrfs-8013-sector-size-check-during-mount.patch create mode 100644 3.2.34/patches.suse/btrfs-8014-add-new-ioctl-to-determine-size-of-compressed-.patch create mode 100644 3.2.34/patches.suse/btrfs-8015-make-lzo-the-default-compression-scheme.patch create mode 100644 3.2.34/patches.suse/btrfs-8024-workaround-for-cleaner-deadlock.patch create mode 100644 3.2.34/patches.suse/btrfs-8025-update-global-block_rsv-when-creating-a-new-bl.patch create mode 100644 3.2.34/patches.suse/btrfs-8026-fix-possible-deadlock-when-opening-a-seed-devi.patch create mode 100644 3.2.34/patches.suse/btrfs-allow-cross-subvolume-file-clone.patch create mode 100644 3.2.34/series create mode 100644 3.2.34/uksm-0.1.2.1-for-v3.2.ge.31.patch create mode 100644 3.2.34/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-VL.patch create mode 100644 3.2.34/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-WVL.patch create mode 100644 3.2.34/vserver-3.2.34-vs2.3.2.15.patch create mode 100644 3.2.34/wrapfs-v3.2.2-45-ga5296eb.patch create mode 100644 3.3.8/0001-AppArmor-compatibility-patch-for-v5-network-controll.patch create mode 100644 3.3.8/0001-block-cgroups-kconfig-build-bits-for-BFQ-v5-3.3.patch create mode 100644 3.3.8/0002-AppArmor-compatibility-patch-for-v5-interface.patch create mode 100644 3.3.8/0002-block-introduce-the-BFQ-v5-I-O-sched-for-3.3.patch create mode 100644 3.3.8/0003-AppArmor-Allow-dfa-backward-compatibility-with-broke.patch create mode 100644 3.3.8/01patch-2.6.33_atopcnt.patch create mode 100644 3.3.8/02patch-2.6.33_atopacct.patch create mode 100644 3.3.8/3.3-ck1.patch create mode 100644 3.3.8/3rd-3rdparty-1.0-tree.patch create mode 100644 3.3.8/3rd-3rdparty-button_hotplug-0.4.1.patch create mode 100644 3.3.8/3rd-3rdparty-gpio_button_hotplug-0.1.patch create mode 100644 3.3.8/3rd-3rdparty-gpio_event_drv-0.1.patch create mode 100644 3.3.8/3rd-3rdparty-merge.patch create mode 100644 3.3.8/3rd-3rdparty-netatop-0.1.1.patch create mode 100644 3.3.8/600-netfilter_layer7_2.22.patch create mode 100644 3.3.8/601-netfilter_layer7_pktmatch.patch create mode 100644 3.3.8/602-netfilter_layer7_match.patch create mode 100644 3.3.8/603-netfilter_layer7_2.6.36_fix.patch create mode 100644 3.3.8/604-netfilter_cisco_794x_iphone.patch create mode 100644 3.3.8/610-netfilter_match_bypass_default_checks.patch create mode 100644 3.3.8/611-netfilter_match_bypass_default_table.patch create mode 100644 3.3.8/612-netfilter_match_reduce_memory_access.patch create mode 100644 3.3.8/613-netfilter_optional_tcp_window_check.patch create mode 100644 3.3.8/620-sched_esfq.patch create mode 100644 3.3.8/621-sched_act_connmark.patch create mode 100644 3.3.8/910-kobject_uevent.patch create mode 100644 3.3.8/911-kobject_add_broadcast_uevent.patch create mode 100644 3.3.8/Add_CONFIG_VFAT_FS_DUALNAMES_option.patch create mode 100644 3.3.8/accessfs-3.2-0.26.patch create mode 100644 3.3.8/ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch create mode 100644 3.3.8/aufs-3.x-rcN.patch create mode 100644 3.3.8/cloneconfig.patch create mode 100644 
3.3.8/colored-printk-3.3.8.patch create mode 100644 3.3.8/fs-btrfs-run-delayed-directory-updates-during-log-replay.patch create mode 100644 3.3.8/fs-ecryptfs-fix-lockdep-warning-in-miscdev-operations.patch create mode 100644 3.3.8/fs-ecryptfs-gracefully-refuse-miscdev-file-ops-on-inherited-passed-files.patch create mode 100644 3.3.8/fs-ecryptfs-properly-check-for-o_rdonly-flag-before-doing-privileged-open.patch create mode 100644 3.3.8/fs-epoll-clear-the-tfile_check_list-on-ELOOP_CVE-2012-3375.patch create mode 100644 3.3.8/fs-ext4-fix-duplicated-mnt_drop_write-call-in-ext4_ioc_move_ext.patch create mode 100644 3.3.8/fs-ext4-fix-the-free-blocks-calculation-for-ext3-file-systems-w-uninit_bg.patch create mode 100644 3.3.8/fs-remove-easily-user-triggerable-bug-from-generic_setlease.patch create mode 100644 3.3.8/fs-udf-avoid-run-away-loop-when-partition-table-length-is-corrupted_CVE-2012-3400.patch create mode 100644 3.3.8/fs-udf-fortify-loading-of-sparing-table_CVE-2012-3400.patch create mode 100644 3.3.8/fs-udf-use-ret-instead-of-abusing-i-in-udf_load_logicalvol.patch create mode 100644 3.3.8/hz-432-kconfig-option.patch create mode 100644 3.3.8/hz-864-kconfig-option.patch create mode 100644 3.3.8/imqmq-3.3.patch create mode 100644 3.3.8/kbuild-compress-kernel-modules-on-installation.patch create mode 100644 3.3.8/kirkwood-jumbo-frame.patch create mode 100644 3.3.8/linux-2.6-defaults-fat-utf8.patch create mode 100644 3.3.8/linux-2.6-x86-tune-generic.patch create mode 100644 3.3.8/linux-3.4-e2c-0.4.59.patch create mode 100644 3.3.8/lschlv2.patch create mode 100644 3.3.8/net-netfilter-IFWLOG-2.6.35-buildfix.patch create mode 100644 3.3.8/net-netfilter-IFWLOG-2.6.37-buildfix.patch create mode 100644 3.3.8/net-netfilter-IFWLOG-mdv.patch create mode 100644 3.3.8/net-netfilter-IFWLOG.patch create mode 100644 3.3.8/net-netfilter-psd-2.6.35-buildfix.patch create mode 100644 3.3.8/net-netfilter-psd-mdv.patch create mode 100644 3.3.8/net-netfilter-psd.patch create mode 100644 3.3.8/netfilter-implement-rfc-1123-for-ftp-conntrack.patch create mode 100644 3.3.8/netfilter-ip_conntrack_slp.patch create mode 100644 3.3.8/series create mode 100644 3.3.8/uksm-0.1.2.1-for-v3.3.ge.8.patch create mode 100644 3.3.8/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-VL.patch create mode 100644 3.3.8/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-WVL.patch create mode 100644 3.3.8/v3.2-ARM-orion-Add-support-for-Buffalo-LS-PRODUO.patch create mode 100644 3.3.8/v3.3-ARM-kirkwood-Add-support-for-Buffalo-LS-CHLv2.patch create mode 100644 3.3.8/v3.3-ARM-kirkwood-Add-support-for-Buffalo-LS-XHL.patch create mode 100644 3.3.8/v3.3-ARM-orion-Add-support-for-Buffalo-LS-QL.patch create mode 100644 3.3.8/vserver-3.3.8-vs2.3.3.4.patch create mode 100644 3.3.8/wrapfs-v3.3-rc1-429-g65388bc.patch diff --git a/3.2.34/0001-AppArmor-compatibility-patch-for-v5-network-controll.patch b/3.2.34/0001-AppArmor-compatibility-patch-for-v5-network-controll.patch new file mode 100644 index 0000000..00c8712 --- /dev/null +++ b/3.2.34/0001-AppArmor-compatibility-patch-for-v5-network-controll.patch @@ -0,0 +1,553 @@ +From dc13dec93dbd04bfa7a9ba67df1b8ed3431d8d48 Mon Sep 17 00:00:00 2001 +From: John Johansen +Date: Wed, 10 Aug 2011 22:02:39 -0700 +Subject: [PATCH 1/3] AppArmor: compatibility patch for v5 network controll + +Add compatibility for v5 network rules. 
+ +Signed-off-by: John Johansen +--- + include/linux/lsm_audit.h | 4 + + security/apparmor/Makefile | 19 ++++- + security/apparmor/include/net.h | 40 +++++++++ + security/apparmor/include/policy.h | 3 + + security/apparmor/lsm.c | 112 +++++++++++++++++++++++ + security/apparmor/net.c | 170 ++++++++++++++++++++++++++++++++++++ + security/apparmor/policy.c | 1 + + security/apparmor/policy_unpack.c | 48 ++++++++++- + 8 files changed, 394 insertions(+), 3 deletions(-) + create mode 100644 security/apparmor/include/net.h + create mode 100644 security/apparmor/net.c + +diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h +index 88e78de..c63979a 100644 +--- a/include/linux/lsm_audit.h ++++ b/include/linux/lsm_audit.h +@@ -124,6 +124,10 @@ struct common_audit_data { + u32 denied; + uid_t ouid; + } fs; ++ struct { ++ int type, protocol; ++ struct sock *sk; ++ } net; + }; + } apparmor_audit_data; + #endif +diff --git a/security/apparmor/Makefile b/security/apparmor/Makefile +index 2dafe50..7cefef9 100644 +--- a/security/apparmor/Makefile ++++ b/security/apparmor/Makefile +@@ -4,9 +4,9 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor.o + + apparmor-y := apparmorfs.o audit.o capability.o context.o ipc.o lib.o match.o \ + path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \ +- resource.o sid.o file.o ++ resource.o sid.o file.o net.o + +-clean-files := capability_names.h rlim_names.h ++clean-files := capability_names.h rlim_names.h af_names.h + + + # Build a lower case string table of capability names +@@ -44,9 +44,24 @@ cmd_make-rlim = echo "static const char *rlim_names[] = {" > $@ ;\ + sed -r -n "s/^\# ?define[ \t]+(RLIMIT_[A-Z0-9_]+).*/\1,/p" $< >> $@ ;\ + echo "};" >> $@ + ++# Build a lower case string table of address family names. ++# Transform lines from ++# #define AF_INET 2 /* Internet IP Protocol */ ++# to ++# [2] = "inet", ++quiet_cmd_make-af = GEN $@ ++cmd_make-af = echo "static const char *address_family_names[] = {" > $@ ;\ ++ sed $< >> $@ -r -n -e "/AF_MAX/d" -e "/AF_LOCAL/d" -e \ ++ 's/^\#define[ \t]+AF_([A-Z0-9_]+)[ \t]+([0-9]+).*/[\2] = "\L\1",/p';\ ++ echo "};" >> $@ ++ ++ + $(obj)/capability.o : $(obj)/capability_names.h + $(obj)/resource.o : $(obj)/rlim_names.h ++$(obj)/net.o : $(obj)/af_names.h + $(obj)/capability_names.h : $(srctree)/include/linux/capability.h + $(call cmd,make-caps) + $(obj)/rlim_names.h : $(srctree)/include/asm-generic/resource.h + $(call cmd,make-rlim) ++$(obj)/af_names.h : $(srctree)/include/linux/socket.h ++ $(call cmd,make-af) +\ No newline at end of file +diff --git a/security/apparmor/include/net.h b/security/apparmor/include/net.h +new file mode 100644 +index 0000000..3c7d599 +--- /dev/null ++++ b/security/apparmor/include/net.h +@@ -0,0 +1,40 @@ ++/* ++ * AppArmor security module ++ * ++ * This file contains AppArmor network mediation definitions. ++ * ++ * Copyright (C) 1998-2008 Novell/SUSE ++ * Copyright 2009-2010 Canonical Ltd. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License as ++ * published by the Free Software Foundation, version 2 of the ++ * License. 
++ */ ++ ++#ifndef __AA_NET_H ++#define __AA_NET_H ++ ++#include ++ ++/* struct aa_net - network confinement data ++ * @allowed: basic network families permissions ++ * @audit_network: which network permissions to force audit ++ * @quiet_network: which network permissions to quiet rejects ++ */ ++struct aa_net { ++ u16 allow[AF_MAX]; ++ u16 audit[AF_MAX]; ++ u16 quiet[AF_MAX]; ++}; ++ ++extern int aa_net_perm(int op, struct aa_profile *profile, u16 family, ++ int type, int protocol, struct sock *sk); ++extern int aa_revalidate_sk(int op, struct sock *sk); ++ ++static inline void aa_free_net_rules(struct aa_net *new) ++{ ++ /* NOP */ ++} ++ ++#endif /* __AA_NET_H */ +diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h +index aeda5cf..6776929 100644 +--- a/security/apparmor/include/policy.h ++++ b/security/apparmor/include/policy.h +@@ -27,6 +27,7 @@ + #include "capability.h" + #include "domain.h" + #include "file.h" ++#include "net.h" + #include "resource.h" + + extern const char *profile_mode_names[]; +@@ -145,6 +146,7 @@ struct aa_namespace { + * @size: the memory consumed by this profiles rules + * @file: The set of rules governing basic file access and domain transitions + * @caps: capabilities for the profile ++ * @net: network controls for the profile + * @rlimits: rlimits for the profile + * + * The AppArmor profile contains the basic confinement data. Each profile +@@ -181,6 +183,7 @@ struct aa_profile { + + struct aa_file_rules file; + struct aa_caps caps; ++ struct aa_net net; + struct aa_rlimit rlimits; + }; + +diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c +index 3d2fd14..aa293ae 100644 +--- a/security/apparmor/lsm.c ++++ b/security/apparmor/lsm.c +@@ -32,6 +32,7 @@ + #include "include/context.h" + #include "include/file.h" + #include "include/ipc.h" ++#include "include/net.h" + #include "include/path.h" + #include "include/policy.h" + #include "include/procattr.h" +@@ -621,6 +622,104 @@ static int apparmor_task_setrlimit(struct task_struct *task, + return error; + } + ++static int apparmor_socket_create(int family, int type, int protocol, int kern) ++{ ++ struct aa_profile *profile; ++ int error = 0; ++ ++ if (kern) ++ return 0; ++ ++ profile = __aa_current_profile(); ++ if (!unconfined(profile)) ++ error = aa_net_perm(OP_CREATE, profile, family, type, protocol, ++ NULL); ++ return error; ++} ++ ++static int apparmor_socket_bind(struct socket *sock, ++ struct sockaddr *address, int addrlen) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_BIND, sk); ++} ++ ++static int apparmor_socket_connect(struct socket *sock, ++ struct sockaddr *address, int addrlen) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_CONNECT, sk); ++} ++ ++static int apparmor_socket_listen(struct socket *sock, int backlog) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_LISTEN, sk); ++} ++ ++static int apparmor_socket_accept(struct socket *sock, struct socket *newsock) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_ACCEPT, sk); ++} ++ ++static int apparmor_socket_sendmsg(struct socket *sock, ++ struct msghdr *msg, int size) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_SENDMSG, sk); ++} ++ ++static int apparmor_socket_recvmsg(struct socket *sock, ++ struct msghdr *msg, int size, int flags) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_RECVMSG, sk); ++} ++ ++static int apparmor_socket_getsockname(struct socket *sock) ++{ ++ struct sock *sk = 
sock->sk; ++ ++ return aa_revalidate_sk(OP_GETSOCKNAME, sk); ++} ++ ++static int apparmor_socket_getpeername(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_GETPEERNAME, sk); ++} ++ ++static int apparmor_socket_getsockopt(struct socket *sock, int level, ++ int optname) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_GETSOCKOPT, sk); ++} ++ ++static int apparmor_socket_setsockopt(struct socket *sock, int level, ++ int optname) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_SETSOCKOPT, sk); ++} ++ ++static int apparmor_socket_shutdown(struct socket *sock, int how) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_SOCK_SHUTDOWN, sk); ++} ++ + static struct security_operations apparmor_ops = { + .name = "apparmor", + +@@ -652,6 +751,19 @@ static struct security_operations apparmor_ops = { + .getprocattr = apparmor_getprocattr, + .setprocattr = apparmor_setprocattr, + ++ .socket_create = apparmor_socket_create, ++ .socket_bind = apparmor_socket_bind, ++ .socket_connect = apparmor_socket_connect, ++ .socket_listen = apparmor_socket_listen, ++ .socket_accept = apparmor_socket_accept, ++ .socket_sendmsg = apparmor_socket_sendmsg, ++ .socket_recvmsg = apparmor_socket_recvmsg, ++ .socket_getsockname = apparmor_socket_getsockname, ++ .socket_getpeername = apparmor_socket_getpeername, ++ .socket_getsockopt = apparmor_socket_getsockopt, ++ .socket_setsockopt = apparmor_socket_setsockopt, ++ .socket_shutdown = apparmor_socket_shutdown, ++ + .cred_alloc_blank = apparmor_cred_alloc_blank, + .cred_free = apparmor_cred_free, + .cred_prepare = apparmor_cred_prepare, +diff --git a/security/apparmor/net.c b/security/apparmor/net.c +new file mode 100644 +index 0000000..1765901 +--- /dev/null ++++ b/security/apparmor/net.c +@@ -0,0 +1,170 @@ ++/* ++ * AppArmor security module ++ * ++ * This file contains AppArmor network mediation ++ * ++ * Copyright (C) 1998-2008 Novell/SUSE ++ * Copyright 2009-2010 Canonical Ltd. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License as ++ * published by the Free Software Foundation, version 2 of the ++ * License. 
++ */ ++ ++#include "include/apparmor.h" ++#include "include/audit.h" ++#include "include/context.h" ++#include "include/net.h" ++#include "include/policy.h" ++ ++#include "af_names.h" ++ ++static const char *sock_type_names[] = { ++ "unknown(0)", ++ "stream", ++ "dgram", ++ "raw", ++ "rdm", ++ "seqpacket", ++ "dccp", ++ "unknown(7)", ++ "unknown(8)", ++ "unknown(9)", ++ "packet", ++}; ++ ++/* audit callback for net specific fields */ ++static void audit_cb(struct audit_buffer *ab, void *va) ++{ ++ struct common_audit_data *sa = va; ++ ++ audit_log_format(ab, " family="); ++ if (address_family_names[sa->u.net.family]) { ++ audit_log_string(ab, address_family_names[sa->u.net.family]); ++ } else { ++ audit_log_format(ab, " \"unknown(%d)\"", sa->u.net.family); ++ } ++ ++ audit_log_format(ab, " sock_type="); ++ if (sock_type_names[sa->aad.net.type]) { ++ audit_log_string(ab, sock_type_names[sa->aad.net.type]); ++ } else { ++ audit_log_format(ab, "\"unknown(%d)\"", sa->aad.net.type); ++ } ++ ++ audit_log_format(ab, " protocol=%d", sa->aad.net.protocol); ++} ++ ++/** ++ * audit_net - audit network access ++ * @profile: profile being enforced (NOT NULL) ++ * @op: operation being checked ++ * @family: network family ++ * @type: network type ++ * @protocol: network protocol ++ * @sk: socket auditing is being applied to ++ * @error: error code for failure else 0 ++ * ++ * Returns: %0 or sa->error else other errorcode on failure ++ */ ++static int audit_net(struct aa_profile *profile, int op, u16 family, int type, ++ int protocol, struct sock *sk, int error) ++{ ++ int audit_type = AUDIT_APPARMOR_AUTO; ++ struct common_audit_data sa; ++ if (sk) { ++ COMMON_AUDIT_DATA_INIT(&sa, NET); ++ } else { ++ COMMON_AUDIT_DATA_INIT(&sa, NONE); ++ } ++ /* todo fill in socket addr info */ ++ ++ sa.aad.op = op, ++ sa.u.net.family = family; ++ sa.u.net.sk = sk; ++ sa.aad.net.type = type; ++ sa.aad.net.protocol = protocol; ++ sa.aad.error = error; ++ ++ if (likely(!sa.aad.error)) { ++ u16 audit_mask = profile->net.audit[sa.u.net.family]; ++ if (likely((AUDIT_MODE(profile) != AUDIT_ALL) && ++ !(1 << sa.aad.net.type & audit_mask))) ++ return 0; ++ audit_type = AUDIT_APPARMOR_AUDIT; ++ } else { ++ u16 quiet_mask = profile->net.quiet[sa.u.net.family]; ++ u16 kill_mask = 0; ++ u16 denied = (1 << sa.aad.net.type) & ~quiet_mask; ++ ++ if (denied & kill_mask) ++ audit_type = AUDIT_APPARMOR_KILL; ++ ++ if ((denied & quiet_mask) && ++ AUDIT_MODE(profile) != AUDIT_NOQUIET && ++ AUDIT_MODE(profile) != AUDIT_ALL) ++ return COMPLAIN_MODE(profile) ? 0 : sa.aad.error; ++ } ++ ++ return aa_audit(audit_type, profile, GFP_KERNEL, &sa, audit_cb); ++} ++ ++/** ++ * aa_net_perm - very course network access check ++ * @op: operation being checked ++ * @profile: profile being enforced (NOT NULL) ++ * @family: network family ++ * @type: network type ++ * @protocol: network protocol ++ * ++ * Returns: %0 else error if permission denied ++ */ ++int aa_net_perm(int op, struct aa_profile *profile, u16 family, int type, ++ int protocol, struct sock *sk) ++{ ++ u16 family_mask; ++ int error; ++ ++ if ((family < 0) || (family >= AF_MAX)) ++ return -EINVAL; ++ ++ if ((type < 0) || (type >= SOCK_MAX)) ++ return -EINVAL; ++ ++ /* unix domain and netlink sockets are handled by ipc */ ++ if (family == AF_UNIX || family == AF_NETLINK) ++ return 0; ++ ++ family_mask = profile->net.allow[family]; ++ ++ error = (family_mask & (1 << type)) ? 
0 : -EACCES; ++ ++ return audit_net(profile, op, family, type, protocol, sk, error); ++} ++ ++/** ++ * aa_revalidate_sk - Revalidate access to a sock ++ * @op: operation being checked ++ * @sk: sock being revalidated (NOT NULL) ++ * ++ * Returns: %0 else error if permission denied ++ */ ++int aa_revalidate_sk(int op, struct sock *sk) ++{ ++ struct aa_profile *profile; ++ int error = 0; ++ ++ /* aa_revalidate_sk should not be called from interrupt context ++ * don't mediate these calls as they are not task related ++ */ ++ if (in_interrupt()) ++ return 0; ++ ++ profile = __aa_current_profile(); ++ if (!unconfined(profile)) ++ error = aa_net_perm(op, profile, sk->sk_family, sk->sk_type, ++ sk->sk_protocol, sk); ++ ++ return error; ++} +diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c +index 4f0eade..4d5ce13 100644 +--- a/security/apparmor/policy.c ++++ b/security/apparmor/policy.c +@@ -745,6 +745,7 @@ static void free_profile(struct aa_profile *profile) + + aa_free_file_rules(&profile->file); + aa_free_cap_rules(&profile->caps); ++ aa_free_net_rules(&profile->net); + aa_free_rlimit_rules(&profile->rlimits); + + aa_free_sid(profile->sid); +diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c +index d6d9a57..f4874c4 100644 +--- a/security/apparmor/policy_unpack.c ++++ b/security/apparmor/policy_unpack.c +@@ -190,6 +190,19 @@ fail: + return 0; + } + ++static bool unpack_u16(struct aa_ext *e, u16 *data, const char *name) ++{ ++ if (unpack_nameX(e, AA_U16, name)) { ++ if (!inbounds(e, sizeof(u16))) ++ return 0; ++ if (data) ++ *data = le16_to_cpu(get_unaligned((u16 *) e->pos)); ++ e->pos += sizeof(u16); ++ return 1; ++ } ++ return 0; ++} ++ + static bool unpack_u32(struct aa_ext *e, u32 *data, const char *name) + { + if (unpack_nameX(e, AA_U32, name)) { +@@ -468,7 +481,8 @@ static struct aa_profile *unpack_profile(struct aa_ext *e) + { + struct aa_profile *profile = NULL; + const char *name = NULL; +- int error = -EPROTO; ++ size_t size = 0; ++ int i, error = -EPROTO; + kernel_cap_t tmpcap; + u32 tmp; + +@@ -559,6 +573,38 @@ static struct aa_profile *unpack_profile(struct aa_ext *e) + if (!unpack_rlimits(e, profile)) + goto fail; + ++ size = unpack_array(e, "net_allowed_af"); ++ if (size) { ++ ++ for (i = 0; i < size; i++) { ++ /* discard extraneous rules that this kernel will ++ * never request ++ */ ++ if (i >= AF_MAX) { ++ u16 tmp; ++ if (!unpack_u16(e, &tmp, NULL) || ++ !unpack_u16(e, &tmp, NULL) || ++ !unpack_u16(e, &tmp, NULL)) ++ goto fail; ++ continue; ++ } ++ if (!unpack_u16(e, &profile->net.allow[i], NULL)) ++ goto fail; ++ if (!unpack_u16(e, &profile->net.audit[i], NULL)) ++ goto fail; ++ if (!unpack_u16(e, &profile->net.quiet[i], NULL)) ++ goto fail; ++ } ++ if (!unpack_nameX(e, AA_ARRAYEND, NULL)) ++ goto fail; ++ /* ++ * allow unix domain and netlink sockets they are handled ++ * by IPC ++ */ ++ } ++ profile->net.allow[AF_UNIX] = 0xffff; ++ profile->net.allow[AF_NETLINK] = 0xffff; ++ + /* get file rules */ + profile->file.dfa = unpack_dfa(e); + if (IS_ERR(profile->file.dfa)) { +-- +1.7.5.4 + diff --git a/3.2.34/0001-block-prepare-I-O-context-code-for-BFQ-v5-for-3.2.patch b/3.2.34/0001-block-prepare-I-O-context-code-for-BFQ-v5-for-3.2.patch new file mode 100644 index 0000000..ea52c70 --- /dev/null +++ b/3.2.34/0001-block-prepare-I-O-context-code-for-BFQ-v5-for-3.2.patch @@ -0,0 +1,299 @@ +From f9072731bedac6f6373dd75798b5a801ce614c02 Mon Sep 17 00:00:00 2001 +From: Arianna Avanzini +Date: Mon, 19 Dec 2011 16:33:41 +0100 +Subject: 
[PATCH 1/3] block: prepare I/O context code for BFQ-v5 for 3.2 + +BFQ uses struct cfq_io_context to store its per-process per-device data, +reusing the same code for cic handling of CFQ. The code is not shared +ATM to minimize the impact of these patches. + +This patch introduces a new hlist to each io_context to store all the +cic's allocated by BFQ to allow calling the right destructor on module +unload; the radix tree used for cic lookup needs to be duplicated +because it can contain dead keys inserted by a scheduler and later +retrieved by the other one. + +Update the io_context exit and free paths to take care also of +the BFQ cic's. + +Change the type of cfqq inside struct cfq_io_context to void * +to use it also for BFQ per-queue data. + +A new bfq-specific ioprio_changed field is necessary, too, to avoid +clobbering cfq's one, so switch ioprio_changed to a bitmap, with one +element per scheduler. + +Signed-off-by: Fabio Checconi +Signed-off-by: Paolo Valente +Signed-off-by: Arianna Avanzini +--- + block/Kconfig.iosched | 26 ++++++++++++++++++++++++++ + block/blk-ioc.c | 30 +++++++++++++++++------------- + block/cfq-iosched.c | 10 +++++++--- + fs/ioprio.c | 9 +++++++-- + include/linux/iocontext.h | 18 +++++++++++++++--- + 5 files changed, 72 insertions(+), 21 deletions(-) + +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index 3199b76..5905452 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -43,6 +43,28 @@ config CFQ_GROUP_IOSCHED + ---help--- + Enable group IO scheduling in CFQ. + ++config IOSCHED_BFQ ++ tristate "BFQ I/O scheduler" ++ depends on EXPERIMENTAL ++ default n ++ ---help--- ++ The BFQ I/O scheduler tries to distribute bandwidth among ++ all processes according to their weights. ++ It aims at distributing the bandwidth as desired, independently of ++ the disk parameters and with any workload. It also tries to ++ guarantee low latency to interactive and soft real-time ++ applications. If compiled built-in (saying Y here), BFQ can ++ be configured to support hierarchical scheduling. ++ ++config CGROUP_BFQIO ++ bool "BFQ hierarchical scheduling support" ++ depends on CGROUPS && IOSCHED_BFQ=y ++ default n ++ ---help--- ++ Enable hierarchical scheduling in BFQ, using the cgroups ++ filesystem interface. The name of the subsystem will be ++ bfqio. 
++ + choice + prompt "Default I/O scheduler" + default DEFAULT_CFQ +@@ -56,6 +78,9 @@ choice + config DEFAULT_CFQ + bool "CFQ" if IOSCHED_CFQ=y + ++ config DEFAULT_BFQ ++ bool "BFQ" if IOSCHED_BFQ=y ++ + config DEFAULT_NOOP + bool "No-op" + +@@ -65,6 +90,7 @@ config DEFAULT_IOSCHED + string + default "deadline" if DEFAULT_DEADLINE + default "cfq" if DEFAULT_CFQ ++ default "bfq" if DEFAULT_BFQ + default "noop" if DEFAULT_NOOP + + endmenu +diff --git a/block/blk-ioc.c b/block/blk-ioc.c +index 6f9bbd9..d0d16d4 100644 +--- a/block/blk-ioc.c ++++ b/block/blk-ioc.c +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + #include + #include /* for max_pfn/max_low_pfn */ + #include +@@ -16,13 +17,12 @@ + */ + static struct kmem_cache *iocontext_cachep; + +-static void cfq_dtor(struct io_context *ioc) ++static void hlist_sched_dtor(struct io_context *ioc, struct hlist_head *list) + { +- if (!hlist_empty(&ioc->cic_list)) { ++ if (!hlist_empty(list)) { + struct cfq_io_context *cic; + +- cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, +- cic_list); ++ cic = hlist_entry(list->first, struct cfq_io_context, cic_list); + cic->dtor(ioc); + } + } +@@ -40,7 +40,9 @@ int put_io_context(struct io_context *ioc) + + if (atomic_long_dec_and_test(&ioc->refcount)) { + rcu_read_lock(); +- cfq_dtor(ioc); ++ ++ hlist_sched_dtor(ioc, &ioc->cic_list); ++ hlist_sched_dtor(ioc, &ioc->bfq_cic_list); + rcu_read_unlock(); + + kmem_cache_free(iocontext_cachep, ioc); +@@ -50,15 +52,14 @@ int put_io_context(struct io_context *ioc) + } + EXPORT_SYMBOL(put_io_context); + +-static void cfq_exit(struct io_context *ioc) ++static void hlist_sched_exit(struct io_context *ioc, struct hlist_head *list) + { + rcu_read_lock(); + +- if (!hlist_empty(&ioc->cic_list)) { ++ if (!hlist_empty(list)) { + struct cfq_io_context *cic; + +- cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, +- cic_list); ++ cic = hlist_entry(list->first, struct cfq_io_context, cic_list); + cic->exit(ioc); + } + rcu_read_unlock(); +@@ -74,9 +75,10 @@ void exit_io_context(struct task_struct *task) + task->io_context = NULL; + task_unlock(task); + +- if (atomic_dec_and_test(&ioc->nr_tasks)) +- cfq_exit(ioc); +- ++ if (atomic_dec_and_test(&ioc->nr_tasks)) { ++ hlist_sched_exit(ioc, &ioc->cic_list); ++ hlist_sched_exit(ioc, &ioc->bfq_cic_list); ++ } + put_io_context(ioc); + } + +@@ -89,12 +91,14 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) + atomic_long_set(&ioc->refcount, 1); + atomic_set(&ioc->nr_tasks, 1); + spin_lock_init(&ioc->lock); +- ioc->ioprio_changed = 0; ++ bitmap_zero(ioc->ioprio_changed, IOC_IOPRIO_CHANGED_BITS); + ioc->ioprio = 0; + ioc->last_waited = 0; /* doesn't matter... 
*/ + ioc->nr_batch_requests = 0; /* because this is 0 */ + INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); + INIT_HLIST_HEAD(&ioc->cic_list); ++ INIT_RADIX_TREE(&ioc->bfq_radix_root, GFP_ATOMIC | __GFP_HIGH); ++ INIT_HLIST_HEAD(&ioc->bfq_cic_list); + ioc->ioc_data = NULL; + #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) + ioc->cgroup_changed = 0; +diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c +index 3548705..a120a31 100644 +--- a/block/cfq-iosched.c ++++ b/block/cfq-iosched.c +@@ -2946,7 +2946,6 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) + static void cfq_ioc_set_ioprio(struct io_context *ioc) + { + call_for_each_cic(ioc, changed_ioprio); +- ioc->ioprio_changed = 0; + } + + static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, +@@ -3238,8 +3237,13 @@ retry: + goto err_free; + + out: +- smp_read_barrier_depends(); +- if (unlikely(ioc->ioprio_changed)) ++ /* ++ * test_and_clear_bit() implies a memory barrier, paired with ++ * the wmb() in fs/ioprio.c, so the value seen for ioprio is the ++ * new one. ++ */ ++ if (unlikely(test_and_clear_bit(IOC_CFQ_IOPRIO_CHANGED, ++ ioc->ioprio_changed))) + cfq_ioc_set_ioprio(ioc); + + #ifdef CONFIG_CFQ_GROUP_IOSCHED +diff --git a/fs/ioprio.c b/fs/ioprio.c +index f79dab8..6b0cb885 100644 +--- a/fs/ioprio.c ++++ b/fs/ioprio.c +@@ -31,7 +31,7 @@ + + int set_task_ioprio(struct task_struct *task, int ioprio) + { +- int err; ++ int err, i; + struct io_context *ioc; + const struct cred *cred = current_cred(), *tcred; + +@@ -61,12 +61,17 @@ int set_task_ioprio(struct task_struct *task, int ioprio) + err = -ENOMEM; + break; + } ++ /* let other ioc users see the new values */ ++ smp_wmb(); + task->io_context = ioc; + } while (1); + + if (!err) { + ioc->ioprio = ioprio; +- ioc->ioprio_changed = 1; ++ /* make sure schedulers see the new ioprio value */ ++ wmb(); ++ for (i = 0; i < IOC_IOPRIO_CHANGED_BITS; i++) ++ set_bit(i, ioc->ioprio_changed); + } + + task_unlock(task); +diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h +index 5037a0a..69fdd58 100644 +--- a/include/linux/iocontext.h ++++ b/include/linux/iocontext.h +@@ -1,10 +1,10 @@ + #ifndef IOCONTEXT_H + #define IOCONTEXT_H + ++#include + #include + #include + +-struct cfq_queue; + struct cfq_ttime { + unsigned long last_end_request; + +@@ -16,7 +16,7 @@ struct cfq_ttime { + struct cfq_io_context { + void *key; + +- struct cfq_queue *cfqq[2]; ++ void *cfqq[2]; + + struct io_context *ioc; + +@@ -32,6 +32,16 @@ struct cfq_io_context { + }; + + /* ++ * Indexes into the ioprio_changed bitmap. A bit set indicates that ++ * the corresponding I/O scheduler needs to see a ioprio update. ++ */ ++enum { ++ IOC_CFQ_IOPRIO_CHANGED, ++ IOC_BFQ_IOPRIO_CHANGED, ++ IOC_IOPRIO_CHANGED_BITS ++}; ++ ++/* + * I/O subsystem state of the associated processes. It is refcounted + * and kmalloc'ed. These could be shared between processes. 
+ */ +@@ -43,7 +53,7 @@ struct io_context { + spinlock_t lock; + + unsigned short ioprio; +- unsigned short ioprio_changed; ++ DECLARE_BITMAP(ioprio_changed, IOC_IOPRIO_CHANGED_BITS); + + #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) + unsigned short cgroup_changed; +@@ -57,6 +67,8 @@ struct io_context { + + struct radix_tree_root radix_root; + struct hlist_head cic_list; ++ struct radix_tree_root bfq_radix_root; ++ struct hlist_head bfq_cic_list; + void __rcu *ioc_data; + }; + +-- +1.7.10.4 + diff --git a/3.2.34/0002-AppArmor-compatibility-patch-for-v5-interface.patch b/3.2.34/0002-AppArmor-compatibility-patch-for-v5-interface.patch new file mode 100644 index 0000000..10d4640 --- /dev/null +++ b/3.2.34/0002-AppArmor-compatibility-patch-for-v5-interface.patch @@ -0,0 +1,391 @@ +From a2515f25ad5a7833ddc5a032d34eee6a5ddee3a2 Mon Sep 17 00:00:00 2001 +From: John Johansen +Date: Wed, 10 Aug 2011 22:02:40 -0700 +Subject: [PATCH 2/3] AppArmor: compatibility patch for v5 interface + +Signed-off-by: John Johansen +--- + security/apparmor/Kconfig | 9 + + security/apparmor/Makefile | 1 + + security/apparmor/apparmorfs-24.c | 287 ++++++++++++++++++++++++++++++++ + security/apparmor/apparmorfs.c | 18 ++- + security/apparmor/include/apparmorfs.h | 6 + + 5 files changed, 319 insertions(+), 2 deletions(-) + create mode 100644 security/apparmor/apparmorfs-24.c + +diff --git a/security/apparmor/Kconfig b/security/apparmor/Kconfig +index 9b9013b..51ebf96 100644 +--- a/security/apparmor/Kconfig ++++ b/security/apparmor/Kconfig +@@ -29,3 +29,12 @@ config SECURITY_APPARMOR_BOOTPARAM_VALUE + boot. + + If you are unsure how to answer this question, answer 1. ++ ++config SECURITY_APPARMOR_COMPAT_24 ++ bool "Enable AppArmor 2.4 compatability" ++ depends on SECURITY_APPARMOR ++ default y ++ help ++ This option enables compatability with AppArmor 2.4. It is ++ recommended if compatability with older versions of AppArmor ++ is desired. +diff --git a/security/apparmor/Makefile b/security/apparmor/Makefile +index 7cefef9..0bb604b 100644 +--- a/security/apparmor/Makefile ++++ b/security/apparmor/Makefile +@@ -5,6 +5,7 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor.o + apparmor-y := apparmorfs.o audit.o capability.o context.o ipc.o lib.o match.o \ + path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \ + resource.o sid.o file.o net.o ++apparmor-$(CONFIG_SECURITY_APPARMOR_COMPAT_24) += apparmorfs-24.o + + clean-files := capability_names.h rlim_names.h af_names.h + +diff --git a/security/apparmor/apparmorfs-24.c b/security/apparmor/apparmorfs-24.c +new file mode 100644 +index 0000000..dc8c744 +--- /dev/null ++++ b/security/apparmor/apparmorfs-24.c +@@ -0,0 +1,287 @@ ++/* ++ * AppArmor security module ++ * ++ * This file contains AppArmor /sys/kernel/secrutiy/apparmor interface functions ++ * ++ * Copyright (C) 1998-2008 Novell/SUSE ++ * Copyright 2009-2010 Canonical Ltd. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License as ++ * published by the Free Software Foundation, version 2 of the ++ * License. ++ * ++ * ++ * This file contain functions providing an interface for <= AppArmor 2.4 ++ * compatibility. It is dependent on CONFIG_SECURITY_APPARMOR_COMPAT_24 ++ * being set (see Makefile). 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "include/apparmor.h" ++#include "include/audit.h" ++#include "include/context.h" ++#include "include/policy.h" ++ ++ ++/* apparmor/matching */ ++static ssize_t aa_matching_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ const char matching[] = "pattern=aadfa audit perms=crwxamlk/ " ++ "user::other"; ++ ++ return simple_read_from_buffer(buf, size, ppos, matching, ++ sizeof(matching) - 1); ++} ++ ++const struct file_operations aa_fs_matching_fops = { ++ .read = aa_matching_read, ++}; ++ ++/* apparmor/features */ ++static ssize_t aa_features_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ const char features[] = "file=3.1 capability=2.0 network=1.0 " ++ "change_hat=1.5 change_profile=1.1 " "aanamespaces=1.1 rlimit=1.1"; ++ ++ return simple_read_from_buffer(buf, size, ppos, features, ++ sizeof(features) - 1); ++} ++ ++const struct file_operations aa_fs_features_fops = { ++ .read = aa_features_read, ++}; ++ ++/** ++ * __next_namespace - find the next namespace to list ++ * @root: root namespace to stop search at (NOT NULL) ++ * @ns: current ns position (NOT NULL) ++ * ++ * Find the next namespace from @ns under @root and handle all locking needed ++ * while switching current namespace. ++ * ++ * Returns: next namespace or NULL if at last namespace under @root ++ * NOTE: will not unlock root->lock ++ */ ++static struct aa_namespace *__next_namespace(struct aa_namespace *root, ++ struct aa_namespace *ns) ++{ ++ struct aa_namespace *parent; ++ ++ /* is next namespace a child */ ++ if (!list_empty(&ns->sub_ns)) { ++ struct aa_namespace *next; ++ next = list_first_entry(&ns->sub_ns, typeof(*ns), base.list); ++ read_lock(&next->lock); ++ return next; ++ } ++ ++ /* check if the next ns is a sibling, parent, gp, .. */ ++ parent = ns->parent; ++ while (parent) { ++ read_unlock(&ns->lock); ++ list_for_each_entry_continue(ns, &parent->sub_ns, base.list) { ++ read_lock(&ns->lock); ++ return ns; ++ } ++ if (parent == root) ++ return NULL; ++ ns = parent; ++ parent = parent->parent; ++ } ++ ++ return NULL; ++} ++ ++/** ++ * __first_profile - find the first profile in a namespace ++ * @root: namespace that is root of profiles being displayed (NOT NULL) ++ * @ns: namespace to start in (NOT NULL) ++ * ++ * Returns: unrefcounted profile or NULL if no profile ++ */ ++static struct aa_profile *__first_profile(struct aa_namespace *root, ++ struct aa_namespace *ns) ++{ ++ for ( ; ns; ns = __next_namespace(root, ns)) { ++ if (!list_empty(&ns->base.profiles)) ++ return list_first_entry(&ns->base.profiles, ++ struct aa_profile, base.list); ++ } ++ return NULL; ++} ++ ++/** ++ * __next_profile - step to the next profile in a profile tree ++ * @profile: current profile in tree (NOT NULL) ++ * ++ * Perform a depth first taversal on the profile tree in a namespace ++ * ++ * Returns: next profile or NULL if done ++ * Requires: profile->ns.lock to be held ++ */ ++static struct aa_profile *__next_profile(struct aa_profile *p) ++{ ++ struct aa_profile *parent; ++ struct aa_namespace *ns = p->ns; ++ ++ /* is next profile a child */ ++ if (!list_empty(&p->base.profiles)) ++ return list_first_entry(&p->base.profiles, typeof(*p), ++ base.list); ++ ++ /* is next profile a sibling, parent sibling, gp, subling, .. 
*/ ++ parent = p->parent; ++ while (parent) { ++ list_for_each_entry_continue(p, &parent->base.profiles, ++ base.list) ++ return p; ++ p = parent; ++ parent = parent->parent; ++ } ++ ++ /* is next another profile in the namespace */ ++ list_for_each_entry_continue(p, &ns->base.profiles, base.list) ++ return p; ++ ++ return NULL; ++} ++ ++/** ++ * next_profile - step to the next profile in where ever it may be ++ * @root: root namespace (NOT NULL) ++ * @profile: current profile (NOT NULL) ++ * ++ * Returns: next profile or NULL if there isn't one ++ */ ++static struct aa_profile *next_profile(struct aa_namespace *root, ++ struct aa_profile *profile) ++{ ++ struct aa_profile *next = __next_profile(profile); ++ if (next) ++ return next; ++ ++ /* finished all profiles in namespace move to next namespace */ ++ return __first_profile(root, __next_namespace(root, profile->ns)); ++} ++ ++/** ++ * p_start - start a depth first traversal of profile tree ++ * @f: seq_file to fill ++ * @pos: current position ++ * ++ * Returns: first profile under current namespace or NULL if none found ++ * ++ * acquires first ns->lock ++ */ ++static void *p_start(struct seq_file *f, loff_t *pos) ++ __acquires(root->lock) ++{ ++ struct aa_profile *profile = NULL; ++ struct aa_namespace *root = aa_current_profile()->ns; ++ loff_t l = *pos; ++ f->private = aa_get_namespace(root); ++ ++ ++ /* find the first profile */ ++ read_lock(&root->lock); ++ profile = __first_profile(root, root); ++ ++ /* skip to position */ ++ for (; profile && l > 0; l--) ++ profile = next_profile(root, profile); ++ ++ return profile; ++} ++ ++/** ++ * p_next - read the next profile entry ++ * @f: seq_file to fill ++ * @p: profile previously returned ++ * @pos: current position ++ * ++ * Returns: next profile after @p or NULL if none ++ * ++ * may acquire/release locks in namespace tree as necessary ++ */ ++static void *p_next(struct seq_file *f, void *p, loff_t *pos) ++{ ++ struct aa_profile *profile = p; ++ struct aa_namespace *root = f->private; ++ (*pos)++; ++ ++ return next_profile(root, profile); ++} ++ ++/** ++ * p_stop - stop depth first traversal ++ * @f: seq_file we are filling ++ * @p: the last profile writen ++ * ++ * Release all locking done by p_start/p_next on namespace tree ++ */ ++static void p_stop(struct seq_file *f, void *p) ++ __releases(root->lock) ++{ ++ struct aa_profile *profile = p; ++ struct aa_namespace *root = f->private, *ns; ++ ++ if (profile) { ++ for (ns = profile->ns; ns && ns != root; ns = ns->parent) ++ read_unlock(&ns->lock); ++ } ++ read_unlock(&root->lock); ++ aa_put_namespace(root); ++} ++ ++/** ++ * seq_show_profile - show a profile entry ++ * @f: seq_file to file ++ * @p: current position (profile) (NOT NULL) ++ * ++ * Returns: error on failure ++ */ ++static int seq_show_profile(struct seq_file *f, void *p) ++{ ++ struct aa_profile *profile = (struct aa_profile *)p; ++ struct aa_namespace *root = f->private; ++ ++ if (profile->ns != root) ++ seq_printf(f, ":%s://", aa_ns_name(root, profile->ns)); ++ seq_printf(f, "%s (%s)\n", profile->base.hname, ++ COMPLAIN_MODE(profile) ? 
"complain" : "enforce"); ++ ++ return 0; ++} ++ ++static const struct seq_operations aa_fs_profiles_op = { ++ .start = p_start, ++ .next = p_next, ++ .stop = p_stop, ++ .show = seq_show_profile, ++}; ++ ++static int profiles_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &aa_fs_profiles_op); ++} ++ ++static int profiles_release(struct inode *inode, struct file *file) ++{ ++ return seq_release(inode, file); ++} ++ ++const struct file_operations aa_fs_profiles_fops = { ++ .open = profiles_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = profiles_release, ++}; +diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c +index 0848292..28c52ac 100644 +--- a/security/apparmor/apparmorfs.c ++++ b/security/apparmor/apparmorfs.c +@@ -187,7 +187,11 @@ void __init aa_destroy_aafs(void) + aafs_remove(".remove"); + aafs_remove(".replace"); + aafs_remove(".load"); +- ++#ifdef CONFIG_SECURITY_APPARMOR_COMPAT_24 ++ aafs_remove("profiles"); ++ aafs_remove("matching"); ++ aafs_remove("features"); ++#endif + securityfs_remove(aa_fs_dentry); + aa_fs_dentry = NULL; + } +@@ -218,7 +222,17 @@ int __init aa_create_aafs(void) + aa_fs_dentry = NULL; + goto error; + } +- ++#ifdef CONFIG_SECURITY_APPARMOR_COMPAT_24 ++ error = aafs_create("matching", 0444, &aa_fs_matching_fops); ++ if (error) ++ goto error; ++ error = aafs_create("features", 0444, &aa_fs_features_fops); ++ if (error) ++ goto error; ++#endif ++ error = aafs_create("profiles", 0440, &aa_fs_profiles_fops); ++ if (error) ++ goto error; + error = aafs_create(".load", 0640, &aa_fs_profile_load); + if (error) + goto error; +diff --git a/security/apparmor/include/apparmorfs.h b/security/apparmor/include/apparmorfs.h +index cb1e93a..14f955c 100644 +--- a/security/apparmor/include/apparmorfs.h ++++ b/security/apparmor/include/apparmorfs.h +@@ -17,4 +17,10 @@ + + extern void __init aa_destroy_aafs(void); + ++#ifdef CONFIG_SECURITY_APPARMOR_COMPAT_24 ++extern const struct file_operations aa_fs_matching_fops; ++extern const struct file_operations aa_fs_features_fops; ++extern const struct file_operations aa_fs_profiles_fops; ++#endif ++ + #endif /* __AA_APPARMORFS_H */ +-- +1.7.5.4 + diff --git a/3.2.34/0002-block-cgroups-kconfig-build-bits-for-BFQ-v5-3.2.patch b/3.2.34/0002-block-cgroups-kconfig-build-bits-for-BFQ-v5-3.2.patch new file mode 100644 index 0000000..18b9dae --- /dev/null +++ b/3.2.34/0002-block-cgroups-kconfig-build-bits-for-BFQ-v5-3.2.patch @@ -0,0 +1,46 @@ +From 9396d61706a827dfc4dcdfdc8f687e0e9e24f53d Mon Sep 17 00:00:00 2001 +From: Arianna Avanzini +Date: Mon, 19 Dec 2011 16:34:01 +0100 +Subject: [PATCH 2/3] block: cgroups, kconfig, build bits for BFQ-v5-3.2 + +Add a Kconfig option and do the related Makefile changes to compile +the BFQ I/O scheduler. Also let the cgroups subsystem know about the +BFQ I/O controller. 
+ +Signed-off-by: Fabio Checconi +Signed-off-by: Paolo Valente +Signed-off-by: Arianna Avanzini +--- + block/Makefile | 1 + + include/linux/cgroup_subsys.h | 6 ++++++ + 2 files changed, 7 insertions(+) + +diff --git a/block/Makefile b/block/Makefile +index 514c6e4..653d27b 100644 +--- a/block/Makefile ++++ b/block/Makefile +@@ -14,6 +14,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o + + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h +index ac663c1..c966638 100644 +--- a/include/linux/cgroup_subsys.h ++++ b/include/linux/cgroup_subsys.h +@@ -64,3 +64,9 @@ SUBSYS(perf) + #endif + + /* */ ++ ++#ifdef CONFIG_CGROUP_BFQIO ++SUBSYS(bfqio) ++#endif ++ ++/* */ +-- +1.7.10.4 + diff --git a/3.2.34/0003-AppArmor-Allow-dfa-backward-compatibility-with-broke.patch b/3.2.34/0003-AppArmor-Allow-dfa-backward-compatibility-with-broke.patch new file mode 100644 index 0000000..be32585 --- /dev/null +++ b/3.2.34/0003-AppArmor-Allow-dfa-backward-compatibility-with-broke.patch @@ -0,0 +1,69 @@ +From 7a10d093f9779f42cb8d6affcb6a4436d3ebd6d3 Mon Sep 17 00:00:00 2001 +From: John Johansen +Date: Wed, 10 Aug 2011 22:02:41 -0700 +Subject: [PATCH 3/3] AppArmor: Allow dfa backward compatibility with broken + userspace + +The apparmor_parser when compiling policy could generate invalid dfas +that did not have sufficient padding to avoid invalid references, when +used by the kernel. The kernels check to verify the next/check table +size was broken meaning invalid dfas were being created by userspace +and not caught. + +To remain compatible with old tools that are not fixed, pad the loaded +dfas next/check table. The dfa's themselves are valid except for the +high padding for potentially invalid transitions (high bounds error), +which have a maximimum is 256 entries. So just allocate an extra null filled +256 entries for the next/check tables. This will guarentee all bounds +are good and invalid transitions go to the null (0) state. 
+ +Signed-off-by: John Johansen +--- + security/apparmor/match.c | 17 +++++++++++++++++ + 1 files changed, 17 insertions(+), 0 deletions(-) + +diff --git a/security/apparmor/match.c b/security/apparmor/match.c +index 94de6b4..081491e 100644 +--- a/security/apparmor/match.c ++++ b/security/apparmor/match.c +@@ -57,8 +57,17 @@ static struct table_header *unpack_table(char *blob, size_t bsize) + if (bsize < tsize) + goto out; + ++ /* Pad table allocation for next/check by 256 entries to remain ++ * backwards compatible with old (buggy) tools and remain safe without ++ * run time checks ++ */ ++ if (th.td_id == YYTD_ID_NXT || th.td_id == YYTD_ID_CHK) ++ tsize += 256 * th.td_flags; ++ + table = kvmalloc(tsize); + if (table) { ++ /* ensure the pad is clear, else there will be errors */ ++ memset(table, 0, tsize); + *table = th; + if (th.td_flags == YYTD_DATA8) + UNPACK_ARRAY(table->td_data, blob, th.td_lolen, +@@ -134,11 +143,19 @@ static int verify_dfa(struct aa_dfa *dfa, int flags) + goto out; + + if (flags & DFA_FLAG_VERIFY_STATES) { ++ int warning = 0; + for (i = 0; i < state_count; i++) { + if (DEFAULT_TABLE(dfa)[i] >= state_count) + goto out; + /* TODO: do check that DEF state recursion terminates */ + if (BASE_TABLE(dfa)[i] + 255 >= trans_count) { ++ if (warning) ++ continue; ++ printk(KERN_WARNING "AppArmor DFA next/check " ++ "upper bounds error fixed, upgrade " ++ "user space tools \n"); ++ warning = 1; ++ } else if (BASE_TABLE(dfa)[i] >= trans_count) { + printk(KERN_ERR "AppArmor DFA next/check upper " + "bounds error\n"); + goto out; +-- +1.7.5.4 + diff --git a/3.2.34/0003-block-introduce-the-BFQ-v5-I-O-sched-for-3.2.patch b/3.2.34/0003-block-introduce-the-BFQ-v5-I-O-sched-for-3.2.patch new file mode 100644 index 0000000..077d54f --- /dev/null +++ b/3.2.34/0003-block-introduce-the-BFQ-v5-I-O-sched-for-3.2.patch @@ -0,0 +1,5986 @@ +From 8502cfecae9cfffbd1dc0379b2b2ab48d05cf48a Mon Sep 17 00:00:00 2001 +From: Arianna Avanzini +Date: Mon, 19 Dec 2011 16:34:45 +0100 +Subject: [PATCH 3/3] block: introduce the BFQ-v5 I/O sched for 3.2 + +Add the BFQ-v5 I/O scheduler to 3.2. +The general structure is borrowed from CFQ, as much code. A (bfq_)queue is +associated to each task doing I/O on a device, and each time a scheduling +decision has to be taken a queue is selected and it is served until it expires. + + - Slices are given in the service domain: tasks are assigned budgets, + measured in number of sectors. Once got the disk, a task must + however consume its assigned budget within a configurable maximum time + (by default, the maximum possible value of the budgets is automatically + computed to comply with this timeout). This allows the desired latency + vs "throughput boosting" tradeoff to be set. + + - Budgets are scheduled according to a variant of WF2Q+, implemented + using an augmented rb-tree to take eligibility into account while + preserving an O(log N) overall complexity. + + - A low-latency tunable is provided; if enabled, both interactive and soft + real-time applications are guaranteed very low latency. + + - Latency guarantees are preserved also in presence of NCQ. + + - High throughput with flash-based devices, while still preserving + latency guarantees. + + - Useful features borrowed from CFQ: cooperating-queues merging (with + some additional optimizations with respect to the original CFQ version), + static fallback queue for OOM. + + - BFQ supports full hierarchical scheduling, exporting a cgroups + interface. 
Each node has a full scheduler, so each group can + be assigned its own ioprio and an ioprio_class. + + - If the cgroups interface is used, weights can be explictly assigned, + otherwise ioprio values are mapped to weights using the relation + weight = IOPRIO_BE_NR - ioprio. + + - ioprio classes are served in strict priority order, i.e., lower + priority queues are not served as long as there are higher priority + queues. Among queues in the same class the bandwidth is distributed + in proportion to the weights of each queue. A very thin extra bandwidth + is however guaranteed to the Idle class, to prevent it from starving. + +Signed-off-by: Paolo Valente +Signed-off-by: Arianna Avanzini +--- + block/bfq-cgroup.c | 831 ++++++++++++++ + block/bfq-ioc.c | 380 +++++++ + block/bfq-iosched.c | 3021 +++++++++++++++++++++++++++++++++++++++++++++++++++ + block/bfq-sched.c | 1066 ++++++++++++++++++ + block/bfq.h | 593 ++++++++++ + 5 files changed, 5891 insertions(+) + create mode 100644 block/bfq-cgroup.c + create mode 100644 block/bfq-ioc.c + create mode 100644 block/bfq-iosched.c + create mode 100644 block/bfq-sched.c + create mode 100644 block/bfq.h + +diff --git a/block/bfq-cgroup.c block/bfq-cgroup.c +new file mode 100644 +index 0000000..74ae73b +--- /dev/null ++++ block/bfq-cgroup.c +@@ -0,0 +1,831 @@ ++/* ++ * BFQ: CGROUPS support. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. ++ */ ++ ++#ifdef CONFIG_CGROUP_BFQIO ++static struct bfqio_cgroup bfqio_root_cgroup = { ++ .weight = BFQ_DEFAULT_GRP_WEIGHT, ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO, ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS, ++}; ++ ++static inline void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = entity->new_ioprio; ++ entity->ioprio_class = entity->new_ioprio_class; ++ entity->parent = bfqg->my_entity; ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup) ++{ ++ return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id), ++ struct bfqio_cgroup, css); ++} ++ ++/* ++ * Search the bfq_group for bfqd into the hash table (by now only a list) ++ * of bgrp. Must be called under rcu_read_lock(). 
++ */ ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, ++ struct bfq_data *bfqd) ++{ ++ struct bfq_group *bfqg; ++ struct hlist_node *n; ++ void *key; ++ ++ hlist_for_each_entry_rcu(bfqg, n, &bgrp->group_data, group_node) { ++ key = rcu_dereference(bfqg->bfqd); ++ if (key == bfqd) ++ return bfqg; ++ } ++ ++ return NULL; ++} ++ ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_entity *entity = &bfqg->entity; ++ ++ entity->weight = entity->new_weight = bgrp->weight; ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = entity->new_ioprio = bgrp->ioprio; ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; ++ entity->ioprio_changed = 1; ++ entity->my_sched_data = &bfqg->sched_data; ++} ++ ++static inline void bfq_group_set_parent(struct bfq_group *bfqg, ++ struct bfq_group *parent) ++{ ++ struct bfq_entity *entity; ++ ++ BUG_ON(parent == NULL); ++ BUG_ON(bfqg == NULL); ++ ++ entity = &bfqg->entity; ++ entity->parent = parent->my_entity; ++ entity->sched_data = &parent->sched_data; ++} ++ ++/** ++ * bfq_group_chain_alloc - allocate a chain of groups. ++ * @bfqd: queue descriptor. ++ * @cgroup: the leaf cgroup this chain starts from. ++ * ++ * Allocate a chain of groups starting from the one belonging to ++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain ++ * to the root has already an allocated group on @bfqd. ++ */ ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, ++ struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp; ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; ++ ++ for (; cgroup != NULL; cgroup = cgroup->parent) { ++ bgrp = cgroup_to_bfqio(cgroup); ++ ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ if (bfqg != NULL) { ++ /* ++ * All the cgroups in the path from there to the ++ * root must have a bfq_group for bfqd, so we don't ++ * need any more allocations. ++ */ ++ break; ++ } ++ ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); ++ if (bfqg == NULL) ++ goto cleanup; ++ ++ bfq_group_init_entity(bgrp, bfqg); ++ bfqg->my_entity = &bfqg->entity; ++ ++ if (leaf == NULL) { ++ leaf = bfqg; ++ prev = leaf; ++ } else { ++ bfq_group_set_parent(prev, bfqg); ++ /* ++ * Build a list of allocated nodes using the bfqd ++ * filed, that is still unused and will be initialized ++ * only after the node will be connected. ++ */ ++ prev->bfqd = bfqg; ++ prev = bfqg; ++ } ++ } ++ ++ return leaf; ++ ++cleanup: ++ while (leaf != NULL) { ++ prev = leaf; ++ leaf = leaf->bfqd; ++ kfree(prev); ++ } ++ ++ return NULL; ++} ++ ++/** ++ * bfq_group_chain_link - link an allocatd group chain to a cgroup hierarchy. ++ * @bfqd: the queue descriptor. ++ * @cgroup: the leaf cgroup to start from. ++ * @leaf: the leaf group (to be associated to @cgroup). ++ * ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the ++ * hierarchy that already as a group associated to @bfqd all the nodes ++ * in the path to the root cgroup have one too. ++ * ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy ++ * per device) while the bfqio_cgroup lock protects the list of groups ++ * belonging to the same cgroup. 
++ */ ++static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup, ++ struct bfq_group *leaf) ++{ ++ struct bfqio_cgroup *bgrp; ++ struct bfq_group *bfqg, *next, *prev = NULL; ++ unsigned long flags; ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ ++ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) { ++ bgrp = cgroup_to_bfqio(cgroup); ++ next = leaf->bfqd; ++ ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ BUG_ON(bfqg != NULL); ++ ++ spin_lock_irqsave(&bgrp->lock, flags); ++ ++ rcu_assign_pointer(leaf->bfqd, bfqd); ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); ++ ++ spin_unlock_irqrestore(&bgrp->lock, flags); ++ ++ prev = leaf; ++ leaf = next; ++ } ++ ++ BUG_ON(cgroup == NULL && leaf != NULL); ++ if (cgroup != NULL && prev != NULL) { ++ bgrp = cgroup_to_bfqio(cgroup); ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ bfq_group_set_parent(prev, bfqg); ++ } ++} ++ ++/** ++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. ++ * @bfqd: queue descriptor. ++ * @cgroup: cgroup being searched for. ++ * ++ * Return a group associated to @bfqd in @cgroup, allocating one if ++ * necessary. When a group is returned all the cgroups in the path ++ * to the root have a group associated to @bfqd. ++ * ++ * If the allocation fails, return the root group: this breaks guarantees ++ * but is a safe fallbak. If this loss becames a problem it can be ++ * mitigated using the equivalent weight (given by the product of the ++ * weights of the groups in the path from @group to the root) in the ++ * root scheduler. ++ * ++ * We allocate all the missing nodes in the path from the leaf cgroup ++ * to the root and we connect the nodes only after all the allocations ++ * have been successful. ++ */ ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, ++ struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); ++ struct bfq_group *bfqg; ++ ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ if (bfqg != NULL) ++ return bfqg; ++ ++ bfqg = bfq_group_chain_alloc(bfqd, cgroup); ++ if (bfqg != NULL) ++ bfq_group_chain_link(bfqd, cgroup, bfqg); ++ else ++ bfqg = bfqd->root_group; ++ ++ return bfqg; ++} ++ ++/** ++ * bfq_bfqq_move - migrate @bfqq to @bfqg. ++ * @bfqd: queue descriptor. ++ * @bfqq: the queue to move. ++ * @entity: @bfqq's entity. ++ * @bfqg: the group to move to. ++ * ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating ++ * it on the new one. Avoid putting the entity on the old group idle tree. ++ * ++ * Must be called under the queue lock; the cgroup owning @bfqg must ++ * not disappear (by now this just means that we are called under ++ * rcu_read_lock()). ++ */ ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_entity *entity, struct bfq_group *bfqg) ++{ ++ int busy, resume; ++ ++ busy = bfq_bfqq_busy(bfqq); ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list); ++ ++ BUG_ON(resume && !entity->on_st); ++ BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue); ++ ++ if (busy) { ++ BUG_ON(atomic_read(&bfqq->ref) < 2); ++ ++ if (!resume) ++ bfq_del_bfqq_busy(bfqd, bfqq, 0); ++ else ++ bfq_deactivate_bfqq(bfqd, bfqq, 0); ++ } else if (entity->on_st) ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); ++ ++ /* ++ * Here we use a reference to bfqg. 
We don't need a refcounter ++ * as the cgroup reference will not be dropped, so that its ++ * destroy() callback will not be invoked. ++ */ ++ entity->parent = bfqg->my_entity; ++ entity->sched_data = &bfqg->sched_data; ++ ++ if (busy && resume) ++ bfq_activate_bfqq(bfqd, bfqq); ++} ++ ++/** ++ * __bfq_cic_change_cgroup - move @cic to @cgroup. ++ * @bfqd: the queue descriptor. ++ * @cic: the cic to move. ++ * @cgroup: the cgroup to move to. ++ * ++ * Move cic to cgroup, assuming that bfqd->queue is locked; the caller ++ * has to make sure that the reference to cgroup is valid across the call. ++ * ++ * NOTE: an alternative approach might have been to store the current ++ * cgroup in bfqq and getting a reference to it, reducing the lookup ++ * time here, at the price of slightly more complex code. ++ */ ++static struct bfq_group *__bfq_cic_change_cgroup(struct bfq_data *bfqd, ++ struct cfq_io_context *cic, ++ struct cgroup *cgroup) ++{ ++ struct bfq_queue *async_bfqq = cic_to_bfqq(cic, 0); ++ struct bfq_queue *sync_bfqq = cic_to_bfqq(cic, 1); ++ struct bfq_entity *entity; ++ struct bfq_group *bfqg; ++ ++ bfqg = bfq_find_alloc_group(bfqd, cgroup); ++ if (async_bfqq != NULL) { ++ entity = &async_bfqq->entity; ++ ++ if (entity->sched_data != &bfqg->sched_data) { ++ cic_set_bfqq(cic, NULL, 0); ++ bfq_log_bfqq(bfqd, async_bfqq, ++ "cic_change_group: %p %d", ++ async_bfqq, atomic_read(&async_bfqq->ref)); ++ bfq_put_queue(async_bfqq); ++ } ++ } ++ ++ if (sync_bfqq != NULL) { ++ entity = &sync_bfqq->entity; ++ if (entity->sched_data != &bfqg->sched_data) ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); ++ } ++ ++ return bfqg; ++} ++ ++/** ++ * bfq_cic_change_cgroup - move @cic to @cgroup. ++ * @cic: the cic being migrated. ++ * @cgroup: the destination cgroup. ++ * ++ * When the task owning @cic is moved to @cgroup, @cic is immediately ++ * moved into its new parent group. ++ */ ++static void bfq_cic_change_cgroup(struct cfq_io_context *cic, ++ struct cgroup *cgroup) ++{ ++ struct bfq_data *bfqd; ++ unsigned long uninitialized_var(flags); ++ ++ bfqd = bfq_get_bfqd_locked(&cic->key, &flags); ++ if (bfqd != NULL && ++ !strncmp(bfqd->queue->elevator->elevator_type->elevator_name, ++ "bfq", ELV_NAME_MAX)) { ++ __bfq_cic_change_cgroup(bfqd, cic, cgroup); ++ bfq_put_bfqd_unlock(bfqd, &flags); ++ } ++} ++ ++/** ++ * bfq_cic_update_cgroup - update the cgroup of @cic. ++ * @cic: the @cic to update. ++ * ++ * Make sure that @cic is enqueued in the cgroup of the current task. ++ * We need this in addition to moving cics during the cgroup attach ++ * phase because the task owning @cic could be at its first disk ++ * access or we may end up in the root cgroup as the result of a ++ * memory allocation failure and here we try to move to the right ++ * group. ++ * ++ * Must be called under the queue lock. It is safe to use the returned ++ * value even after the rcu_read_unlock() as the migration/destruction ++ * paths act under the queue lock too. IOW it is impossible to race with ++ * group migration/destruction and end up with an invalid group as: ++ * a) here cgroup has not yet been destroyed, nor its destroy callback ++ * has started execution, as current holds a reference to it, ++ * b) if it is destroyed after rcu_read_unlock() [after current is ++ * migrated to a different cgroup] its attach() callback will have ++ * taken care of remove all the references to the old cgroup data. 
++ */ ++static struct bfq_group *bfq_cic_update_cgroup(struct cfq_io_context *cic) ++{ ++ struct bfq_data *bfqd = cic->key; ++ struct bfq_group *bfqg; ++ struct cgroup *cgroup; ++ ++ BUG_ON(bfqd == NULL); ++ ++ rcu_read_lock(); ++ cgroup = task_cgroup(current, bfqio_subsys_id); ++ bfqg = __bfq_cic_change_cgroup(bfqd, cic, cgroup); ++ rcu_read_unlock(); ++ ++ return bfqg; ++} ++ ++/** ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. ++ * @st: the service tree being flushed. ++ */ ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entity = st->first_idle; ++ ++ for (; entity != NULL; entity = st->first_idle) ++ __bfq_deactivate_entity(entity, 0); ++} ++ ++/** ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group. ++ * @bfqd: the device data structure with the root group. ++ * @entity: the entity to move. ++ */ ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(bfqq == NULL); ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); ++ return; ++} ++ ++/** ++ * bfq_reparent_active_entities - move to the root group all active entities. ++ * @bfqd: the device data structure with the root group. ++ * @bfqg: the group to move from. ++ * @st: the service tree with the entities. ++ * ++ * Needs queue_lock to be taken and reference to be valid over the call. ++ */ ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ struct bfq_service_tree *st) ++{ ++ struct rb_root *active = &st->active; ++ struct bfq_entity *entity = NULL; ++ ++ if (!RB_EMPTY_ROOT(&st->active)) ++ entity = bfq_entity_of(rb_first(active)); ++ ++ for (; entity != NULL ; entity = bfq_entity_of(rb_first(active))) ++ bfq_reparent_leaf_entity(bfqd, entity); ++ ++ if (bfqg->sched_data.active_entity != NULL) ++ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity); ++ ++ return; ++} ++ ++/** ++ * bfq_destroy_group - destroy @bfqg. ++ * @bgrp: the bfqio_cgroup containing @bfqg. ++ * @bfqg: the group being destroyed. ++ * ++ * Destroy @bfqg, making sure that it is not referenced from its parent. ++ */ ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) ++{ ++ struct bfq_data *bfqd; ++ struct bfq_service_tree *st; ++ struct bfq_entity *entity = bfqg->my_entity; ++ unsigned long uninitialized_var(flags); ++ int i; ++ ++ hlist_del(&bfqg->group_node); ++ ++ /* ++ * Empty all service_trees belonging to this group before deactivating ++ * the group itself. ++ */ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { ++ st = bfqg->sched_data.service_tree + i; ++ ++ /* ++ * The idle tree may still contain bfq_queues belonging ++ * to exited task because they never migrated to a different ++ * cgroup from the one being destroyed now. Noone else ++ * can access them so it's safe to act without any lock. ++ */ ++ bfq_flush_idle_tree(st); ++ ++ /* ++ * It may happen that some queues are still active ++ * (busy) upon group destruction (if the corresponding ++ * processes have been forced to terminate). We move ++ * all the leaf entities corresponding to these queues ++ * to the root_group. ++ * Also, it may happen that the group has an entity ++ * under service, which is disconnected from the active ++ * tree: it must be moved, too. ++ * There is no need to put the sync queues, as the ++ * scheduler has taken no reference. 
++ */ ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); ++ if (bfqd != NULL) { ++ bfq_reparent_active_entities(bfqd, bfqg, st); ++ bfq_put_bfqd_unlock(bfqd, &flags); ++ } ++ BUG_ON(!RB_EMPTY_ROOT(&st->active)); ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); ++ } ++ BUG_ON(bfqg->sched_data.next_active != NULL); ++ BUG_ON(bfqg->sched_data.active_entity != NULL); ++ ++ /* ++ * We may race with device destruction, take extra care when ++ * dereferencing bfqg->bfqd. ++ */ ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); ++ if (bfqd != NULL) { ++ hlist_del(&bfqg->bfqd_node); ++ __bfq_deactivate_entity(entity, 0); ++ bfq_put_async_queues(bfqd, bfqg); ++ bfq_put_bfqd_unlock(bfqd, &flags); ++ } ++ BUG_ON(entity->tree != NULL); ++ ++ /* ++ * No need to defer the kfree() to the end of the RCU grace ++ * period: we are called from the destroy() callback of our ++ * cgroup, so we can be sure that noone is a) still using ++ * this cgroup or b) doing lookups in it. ++ */ ++ kfree(bfqg); ++} ++ ++/** ++ * bfq_disconnect_groups - diconnect @bfqd from all its groups. ++ * @bfqd: the device descriptor being exited. ++ * ++ * When the device exits we just make sure that no lookup can return ++ * the now unused group structures. They will be deallocated on cgroup ++ * destruction. ++ */ ++static void bfq_disconnect_groups(struct bfq_data *bfqd) ++{ ++ struct hlist_node *pos, *n; ++ struct bfq_group *bfqg; ++ ++ bfq_log(bfqd, "disconnect_groups beginning") ; ++ hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node) { ++ hlist_del(&bfqg->bfqd_node); ++ ++ __bfq_deactivate_entity(bfqg->my_entity, 0); ++ ++ /* ++ * Don't remove from the group hash, just set an ++ * invalid key. No lookups can race with the ++ * assignment as bfqd is being destroyed; this ++ * implies also that new elements cannot be added ++ * to the list. ++ */ ++ rcu_assign_pointer(bfqg->bfqd, NULL); ++ ++ bfq_log(bfqd, "disconnect_groups: put async for group %p", ++ bfqg) ; ++ bfq_put_async_queues(bfqd, bfqg); ++ } ++} ++ ++static inline void bfq_free_root_group(struct bfq_data *bfqd) ++{ ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; ++ struct bfq_group *bfqg = bfqd->root_group; ++ ++ bfq_put_async_queues(bfqd, bfqg); ++ ++ spin_lock_irq(&bgrp->lock); ++ hlist_del_rcu(&bfqg->group_node); ++ spin_unlock_irq(&bgrp->lock); ++ ++ /* ++ * No need to synchronize_rcu() here: since the device is gone ++ * there cannot be any read-side access to its root_group. 
++ */ ++ kfree(bfqg); ++} ++ ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) ++{ ++ struct bfq_group *bfqg; ++ struct bfqio_cgroup *bgrp; ++ int i; ++ ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); ++ if (bfqg == NULL) ++ return NULL; ++ ++ bfqg->entity.parent = NULL; ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ ++ bgrp = &bfqio_root_cgroup; ++ spin_lock_irq(&bgrp->lock); ++ rcu_assign_pointer(bfqg->bfqd, bfqd); ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); ++ spin_unlock_irq(&bgrp->lock); ++ ++ return bfqg; ++} ++ ++#define SHOW_FUNCTION(__VAR) \ ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \ ++ struct cftype *cftype) \ ++{ \ ++ struct bfqio_cgroup *bgrp; \ ++ u64 ret; \ ++ \ ++ if (!cgroup_lock_live_group(cgroup)) \ ++ return -ENODEV; \ ++ \ ++ bgrp = cgroup_to_bfqio(cgroup); \ ++ spin_lock_irq(&bgrp->lock); \ ++ ret = bgrp->__VAR; \ ++ spin_unlock_irq(&bgrp->lock); \ ++ \ ++ cgroup_unlock(); \ ++ \ ++ return ret; \ ++} ++ ++SHOW_FUNCTION(weight); ++SHOW_FUNCTION(ioprio); ++SHOW_FUNCTION(ioprio_class); ++#undef SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ ++static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \ ++ struct cftype *cftype, \ ++ u64 val) \ ++{ \ ++ struct bfqio_cgroup *bgrp; \ ++ struct bfq_group *bfqg; \ ++ struct hlist_node *n; \ ++ \ ++ if (val < (__MIN) || val > (__MAX)) \ ++ return -EINVAL; \ ++ \ ++ if (!cgroup_lock_live_group(cgroup)) \ ++ return -ENODEV; \ ++ \ ++ bgrp = cgroup_to_bfqio(cgroup); \ ++ \ ++ spin_lock_irq(&bgrp->lock); \ ++ bgrp->__VAR = (unsigned short)val; \ ++ hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) { \ ++ bfqg->entity.new_##__VAR = (unsigned short)val; \ ++ smp_wmb(); \ ++ bfqg->entity.ioprio_changed = 1; \ ++ } \ ++ spin_unlock_irq(&bgrp->lock); \ ++ \ ++ cgroup_unlock(); \ ++ \ ++ return 0; \ ++} ++ ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); ++#undef STORE_FUNCTION ++ ++static struct cftype bfqio_files[] = { ++ { ++ .name = "weight", ++ .read_u64 = bfqio_cgroup_weight_read, ++ .write_u64 = bfqio_cgroup_weight_write, ++ }, ++ { ++ .name = "ioprio", ++ .read_u64 = bfqio_cgroup_ioprio_read, ++ .write_u64 = bfqio_cgroup_ioprio_write, ++ }, ++ { ++ .name = "ioprio_class", ++ .read_u64 = bfqio_cgroup_ioprio_class_read, ++ .write_u64 = bfqio_cgroup_ioprio_class_write, ++ }, ++}; ++ ++static int bfqio_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) ++{ ++ return cgroup_add_files(cgroup, subsys, bfqio_files, ++ ARRAY_SIZE(bfqio_files)); ++} ++ ++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys *subsys, ++ struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp; ++ ++ if (cgroup->parent != NULL) { ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); ++ if (bgrp == NULL) ++ return ERR_PTR(-ENOMEM); ++ } else ++ bgrp = &bfqio_root_cgroup; ++ ++ spin_lock_init(&bgrp->lock); ++ INIT_HLIST_HEAD(&bgrp->group_data); ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; ++ ++ return &bgrp->css; ++} ++ ++/* ++ * We cannot support shared io contexts, as we have no mean to support ++ * two tasks with the same ioc in two different groups without major rework ++ * of the main cic/bfqq data structures. 
By now we allow a task to change ++ * its cgroup only if it's the only owner of its ioc; the drawback of this ++ * behavior is that a group containing a task that forked using CLONE_IO ++ * will not be destroyed until the tasks sharing the ioc die. ++ */ ++static int bfqio_can_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, ++ struct task_struct *tsk) ++{ ++ struct io_context *ioc; ++ int ret = 0; ++ ++ /* task_lock() is needed to avoid races with exit_io_context() */ ++ task_lock(tsk); ++ ioc = tsk->io_context; ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) ++ /* ++ * ioc == NULL means that the task is either too young or ++ * exiting: if it has still no ioc the ioc can't be shared, ++ * if the task is exiting the attach will fail anyway, no ++ * matter what we return here. ++ */ ++ ret = -EINVAL; ++ task_unlock(tsk); ++ ++ return ret; ++} ++ ++static void bfqio_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, ++ struct cgroup *prev, struct task_struct *tsk) ++{ ++ struct io_context *ioc; ++ struct cfq_io_context *cic; ++ struct hlist_node *n; ++ ++ task_lock(tsk); ++ ioc = tsk->io_context; ++ if (ioc != NULL) { ++ BUG_ON(atomic_long_read(&ioc->refcount) == 0); ++ atomic_long_inc(&ioc->refcount); ++ } ++ task_unlock(tsk); ++ ++ if (ioc == NULL) ++ return; ++ ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list) ++ bfq_cic_change_cgroup(cic, cgroup); ++ rcu_read_unlock(); ++ ++ put_io_context(ioc); ++} ++ ++static void bfqio_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); ++ struct hlist_node *n, *tmp; ++ struct bfq_group *bfqg; ++ ++ /* ++ * Since we are destroying the cgroup, there are no more tasks ++ * referencing it, and all the RCU grace periods that may have ++ * referenced it are ended (as the destruction of the parent ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by ++ * anything else and we don't need any synchronization. 
++ */ ++ hlist_for_each_entry_safe(bfqg, n, tmp, &bgrp->group_data, group_node) ++ bfq_destroy_group(bgrp, bfqg); ++ ++ BUG_ON(!hlist_empty(&bgrp->group_data)); ++ ++ kfree(bgrp); ++} ++ ++struct cgroup_subsys bfqio_subsys = { ++ .name = "bfqio", ++ .create = bfqio_create, ++ .can_attach = bfqio_can_attach, ++ .attach = bfqio_attach, ++ .destroy = bfqio_destroy, ++ .populate = bfqio_populate, ++ .subsys_id = bfqio_subsys_id, ++}; ++#else ++static inline void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = entity->new_ioprio; ++ entity->ioprio_class = entity->new_ioprio_class; ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static inline struct bfq_group * ++bfq_cic_update_cgroup(struct cfq_io_context *cic) ++{ ++ struct bfq_data *bfqd = cic->key; ++ return bfqd->root_group; ++} ++ ++static inline void bfq_bfqq_move(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++} ++ ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd) ++{ ++ bfq_put_async_queues(bfqd, bfqd->root_group); ++} ++ ++static inline void bfq_free_root_group(struct bfq_data *bfqd) ++{ ++ kfree(bfqd->root_group); ++} ++ ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) ++{ ++ struct bfq_group *bfqg; ++ int i; ++ ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); ++ if (bfqg == NULL) ++ return NULL; ++ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ ++ return bfqg; ++} ++#endif +diff --git a/block/bfq-ioc.c block/bfq-ioc.c +new file mode 100644 +index 0000000..8f2b6c6 +--- /dev/null ++++ block/bfq-ioc.c +@@ -0,0 +1,380 @@ ++/* ++ * BFQ: I/O context handling. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ */ ++ ++/** ++ * bfq_cic_free_rcu - deferred cic freeing. ++ * @head: RCU head of the cic to free. ++ * ++ * Free the cic containing @head and, if it was the last one and ++ * the module is exiting wake up anyone waiting for its deallocation ++ * (see bfq_exit()). ++ */ ++static void bfq_cic_free_rcu(struct rcu_head *head) ++{ ++ struct cfq_io_context *cic; ++ ++ cic = container_of(head, struct cfq_io_context, rcu_head); ++ ++ kmem_cache_free(bfq_ioc_pool, cic); ++ elv_ioc_count_dec(bfq_ioc_count); ++ ++ if (bfq_ioc_gone != NULL) { ++ spin_lock(&bfq_ioc_gone_lock); ++ if (bfq_ioc_gone != NULL && ++ !elv_ioc_count_read(bfq_ioc_count)) { ++ complete(bfq_ioc_gone); ++ bfq_ioc_gone = NULL; ++ } ++ spin_unlock(&bfq_ioc_gone_lock); ++ } ++} ++ ++static void bfq_cic_free(struct cfq_io_context *cic) ++{ ++ call_rcu(&cic->rcu_head, bfq_cic_free_rcu); ++} ++ ++/** ++ * cic_free_func - disconnect a cic ready to be freed. ++ * @ioc: the io_context @cic belongs to. ++ * @cic: the cic to be freed. ++ * ++ * Remove @cic from the @ioc radix tree hash and from its cic list, ++ * deferring the deallocation of @cic to the end of the current RCU ++ * grace period. This assumes that __bfq_exit_single_io_context() ++ * has already been called for @cic. 
++ */ ++static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) ++{ ++ unsigned long flags; ++ unsigned long dead_key = (unsigned long) cic->key; ++ ++ BUG_ON(!(dead_key & CIC_DEAD_KEY)); ++ ++ spin_lock_irqsave(&ioc->lock, flags); ++ radix_tree_delete(&ioc->bfq_radix_root, ++ dead_key >> CIC_DEAD_INDEX_SHIFT); ++ hlist_del_init_rcu(&cic->cic_list); ++ spin_unlock_irqrestore(&ioc->lock, flags); ++ ++ bfq_cic_free(cic); ++} ++ ++static void bfq_free_io_context(struct io_context *ioc) ++{ ++ /* ++ * ioc->refcount is zero here, or we are called from elv_unregister(), ++ * so no more cic's are allowed to be linked into this ioc. So it ++ * should be ok to iterate over the known list, we will see all cic's ++ * since no new ones are added. ++ */ ++ call_for_each_cic(ioc, cic_free_func); ++} ++ ++/** ++ * __bfq_exit_single_io_context - deassociate @cic from any running task. ++ * @bfqd: bfq_data on which @cic is valid. ++ * @cic: the cic being exited. ++ * ++ * Whenever no more tasks are using @cic or @bfqd is deallocated we ++ * need to invalidate its entry in the radix tree hash table and to ++ * release the queues it refers to. ++ * ++ * Called under the queue lock. ++ */ ++static void __bfq_exit_single_io_context(struct bfq_data *bfqd, ++ struct cfq_io_context *cic) ++{ ++ struct io_context *ioc = cic->ioc; ++ ++ list_del_init(&cic->queue_list); ++ ++ /* ++ * Make sure dead mark is seen for dead queues ++ */ ++ smp_wmb(); ++ rcu_assign_pointer(cic->key, bfqd_dead_key(bfqd)); ++ ++ /* ++ * No write-side locking as no task is using @ioc (they're exited ++ * or bfqd is being deallocated. ++ */ ++ rcu_read_lock(); ++ if (rcu_dereference(ioc->ioc_data) == cic) { ++ rcu_read_unlock(); ++ spin_lock(&ioc->lock); ++ rcu_assign_pointer(ioc->ioc_data, NULL); ++ spin_unlock(&ioc->lock); ++ } else ++ rcu_read_unlock(); ++ ++ if (cic->cfqq[BLK_RW_ASYNC] != NULL) { ++ bfq_exit_bfqq(bfqd, cic->cfqq[BLK_RW_ASYNC]); ++ cic->cfqq[BLK_RW_ASYNC] = NULL; ++ } ++ ++ if (cic->cfqq[BLK_RW_SYNC] != NULL) { ++ bfq_exit_bfqq(bfqd, cic->cfqq[BLK_RW_SYNC]); ++ cic->cfqq[BLK_RW_SYNC] = NULL; ++ } ++} ++ ++/** ++ * bfq_exit_single_io_context - deassociate @cic from @ioc (unlocked version). ++ * @ioc: the io_context @cic belongs to. ++ * @cic: the cic being exited. ++ * ++ * Take the queue lock and call __bfq_exit_single_io_context() to do the ++ * rest of the work. We take care of possible races with bfq_exit_queue() ++ * using bfq_get_bfqd_locked() (and abusing a little bit the RCU mechanism). ++ */ ++static void bfq_exit_single_io_context(struct io_context *ioc, ++ struct cfq_io_context *cic) ++{ ++ struct bfq_data *bfqd; ++ unsigned long uninitialized_var(flags); ++ ++ bfqd = bfq_get_bfqd_locked(&cic->key, &flags); ++ if (bfqd != NULL) { ++ __bfq_exit_single_io_context(bfqd, cic); ++ bfq_put_bfqd_unlock(bfqd, &flags); ++ } ++} ++ ++/** ++ * bfq_exit_io_context - deassociate @ioc from all cics it owns. ++ * @ioc: the @ioc being exited. ++ * ++ * No more processes are using @ioc we need to clean up and put the ++ * internal structures we have that belongs to that process. Loop ++ * through all its cics, locking their queues and exiting them. 
++ */ ++static void bfq_exit_io_context(struct io_context *ioc) ++{ ++ call_for_each_cic(ioc, bfq_exit_single_io_context); ++} ++ ++static struct cfq_io_context *bfq_alloc_io_context(struct bfq_data *bfqd, ++ gfp_t gfp_mask) ++{ ++ struct cfq_io_context *cic; ++ ++ cic = kmem_cache_alloc_node(bfq_ioc_pool, gfp_mask | __GFP_ZERO, ++ bfqd->queue->node); ++ if (cic != NULL) { ++ cic->ttime.last_end_request = jiffies; ++ INIT_LIST_HEAD(&cic->queue_list); ++ INIT_HLIST_NODE(&cic->cic_list); ++ cic->dtor = bfq_free_io_context; ++ cic->exit = bfq_exit_io_context; ++ elv_ioc_count_inc(bfq_ioc_count); ++ } ++ ++ return cic; ++} ++ ++/** ++ * bfq_drop_dead_cic - free an exited cic. ++ * @bfqd: bfq data for the device in use. ++ * @ioc: io_context owning @cic. ++ * @cic: the @cic to free. ++ * ++ * We drop cfq io contexts lazily, so we may find a dead one. ++ */ ++static void bfq_drop_dead_cic(struct bfq_data *bfqd, struct io_context *ioc, ++ struct cfq_io_context *cic) ++{ ++ unsigned long flags; ++ ++ WARN_ON(!list_empty(&cic->queue_list)); ++ BUG_ON(cic->key != bfqd_dead_key(bfqd)); ++ ++ spin_lock_irqsave(&ioc->lock, flags); ++ ++ BUG_ON(ioc->ioc_data == cic); ++ ++ /* ++ * With shared I/O contexts two lookups may race and drop the ++ * same cic more than one time: RCU guarantees that the storage ++ * will not be freed too early, here we make sure that we do ++ * not try to remove the cic from the hashing structures multiple ++ * times. ++ */ ++ if (!hlist_unhashed(&cic->cic_list)) { ++ radix_tree_delete(&ioc->bfq_radix_root, bfqd->cic_index); ++ hlist_del_init_rcu(&cic->cic_list); ++ bfq_cic_free(cic); ++ } ++ ++ spin_unlock_irqrestore(&ioc->lock, flags); ++} ++ ++/** ++ * bfq_cic_lookup - search into @ioc a cic associated to @bfqd. ++ * @bfqd: the lookup key. ++ * @ioc: the io_context of the process doing I/O. ++ * ++ * If @ioc already has a cic associated to @bfqd return it, return %NULL ++ * otherwise. ++ */ ++static struct cfq_io_context *bfq_cic_lookup(struct bfq_data *bfqd, ++ struct io_context *ioc) ++{ ++ struct cfq_io_context *cic; ++ unsigned long flags; ++ void *k; ++ ++ if (unlikely(ioc == NULL)) ++ return NULL; ++ ++ rcu_read_lock(); ++ ++ /* We maintain a last-hit cache, to avoid browsing over the tree. */ ++ cic = rcu_dereference(ioc->ioc_data); ++ if (cic != NULL) { ++ k = rcu_dereference(cic->key); ++ if (k == bfqd) ++ goto out; ++ } ++ ++ do { ++ cic = radix_tree_lookup(&ioc->bfq_radix_root, ++ bfqd->cic_index); ++ if (cic == NULL) ++ goto out; ++ ++ k = rcu_dereference(cic->key); ++ if (unlikely(k != bfqd)) { ++ rcu_read_unlock(); ++ bfq_drop_dead_cic(bfqd, ioc, cic); ++ rcu_read_lock(); ++ continue; ++ } ++ ++ spin_lock_irqsave(&ioc->lock, flags); ++ rcu_assign_pointer(ioc->ioc_data, cic); ++ spin_unlock_irqrestore(&ioc->lock, flags); ++ break; ++ } while (1); ++ ++out: ++ rcu_read_unlock(); ++ ++ return cic; ++} ++ ++/** ++ * bfq_cic_link - add @cic to @ioc. ++ * @bfqd: bfq_data @cic refers to. ++ * @ioc: io_context @cic belongs to. ++ * @cic: the cic to link. ++ * @gfp_mask: the mask to use for radix tree preallocations. ++ * ++ * Add @cic to @ioc, using @bfqd as the search key. This enables us to ++ * lookup the process specific cfq io context when entered from the block ++ * layer. Also adds @cic to a per-bfqd list, used when this queue is ++ * removed. 
++ */ ++static int bfq_cic_link(struct bfq_data *bfqd, struct io_context *ioc, ++ struct cfq_io_context *cic, gfp_t gfp_mask) ++{ ++ unsigned long flags; ++ int ret; ++ ++ ret = radix_tree_preload(gfp_mask); ++ if (ret == 0) { ++ cic->ioc = ioc; ++ ++ /* No write-side locking, cic is not published yet. */ ++ rcu_assign_pointer(cic->key, bfqd); ++ ++ spin_lock_irqsave(&ioc->lock, flags); ++ ret = radix_tree_insert(&ioc->bfq_radix_root, ++ bfqd->cic_index, cic); ++ if (ret == 0) ++ hlist_add_head_rcu(&cic->cic_list, &ioc->bfq_cic_list); ++ spin_unlock_irqrestore(&ioc->lock, flags); ++ ++ radix_tree_preload_end(); ++ ++ if (ret == 0) { ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags); ++ list_add(&cic->queue_list, &bfqd->cic_list); ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); ++ } ++ } ++ ++ if (ret != 0) ++ printk(KERN_ERR "bfq: cic link failed!\n"); ++ ++ return ret; ++} ++ ++/** ++ * bfq_ioc_set_ioprio - signal a priority change to the cics belonging to @ioc. ++ * @ioc: the io_context changing its priority. ++ */ ++static inline void bfq_ioc_set_ioprio(struct io_context *ioc) ++{ ++ call_for_each_cic(ioc, bfq_changed_ioprio); ++} ++ ++/** ++ * bfq_get_io_context - return the @cic associated to @bfqd in @ioc. ++ * @bfqd: the search key. ++ * @gfp_mask: the mask to use for cic allocation. ++ * ++ * Setup general io context and cfq io context. There can be several cfq ++ * io contexts per general io context, if this process is doing io to more ++ * than one device managed by cfq. ++ */ ++static struct cfq_io_context *bfq_get_io_context(struct bfq_data *bfqd, ++ gfp_t gfp_mask) ++{ ++ struct io_context *ioc = NULL; ++ struct cfq_io_context *cic; ++ ++ might_sleep_if(gfp_mask & __GFP_WAIT); ++ ++ ioc = get_io_context(gfp_mask, bfqd->queue->node); ++ if (ioc == NULL) ++ return NULL; ++ ++ /* Lookup for an existing cic. */ ++ cic = bfq_cic_lookup(bfqd, ioc); ++ if (cic != NULL) ++ goto out; ++ ++ /* Alloc one if needed. */ ++ cic = bfq_alloc_io_context(bfqd, gfp_mask); ++ if (cic == NULL) ++ goto err; ++ ++ /* Link it into the ioc's radix tree and cic list. */ ++ if (bfq_cic_link(bfqd, ioc, cic, gfp_mask) != 0) ++ goto err_free; ++ ++out: ++ /* ++ * test_and_clear_bit() implies a memory barrier, paired with ++ * the wmb() in fs/ioprio.c, so the value seen for ioprio is the ++ * new one. ++ */ ++ if (unlikely(test_and_clear_bit(IOC_BFQ_IOPRIO_CHANGED, ++ ioc->ioprio_changed))) ++ bfq_ioc_set_ioprio(ioc); ++ ++ return cic; ++err_free: ++ bfq_cic_free(cic); ++err: ++ put_io_context(ioc); ++ return NULL; ++} +diff --git a/block/bfq-iosched.c block/bfq-iosched.c +new file mode 100644 +index 0000000..576cd03 +--- /dev/null ++++ block/bfq-iosched.c +@@ -0,0 +1,3021 @@ ++/* ++ * BFQ, or Budget Fair Queueing, disk scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. ++ * ++ * BFQ is a proportional share disk scheduling algorithm based on the ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, ++ * measured in number of sectors, to tasks instead of time slices. ++ * The disk is not granted to the active task for a given time slice, ++ * but until it has exahusted its assigned budget. This change from ++ * the time to the service domain allows BFQ to distribute the disk ++ * bandwidth among tasks as desired, without any distortion due to ++ * ZBR, workload fluctuations or other factors. 
BFQ uses an ad hoc ++ * internal scheduler, called B-WF2Q+, to schedule tasks according to ++ * their budgets. Thanks to this accurate scheduler, BFQ can afford ++ * to assign high budgets to disk-bound non-seeky tasks (to boost the ++ * throughput), and yet guarantee low latencies to interactive and ++ * soft real-time applications. ++ * ++ * BFQ has been introduced in [1], where the interested reader can ++ * find an accurate description of the algorithm, the bandwidth ++ * distribution and latency guarantees it provides, plus formal proofs ++ * of all the properties. With respect to the algorithm presented in ++ * the paper, this implementation adds several little heuristics, and ++ * a hierarchical extension, based on H-WF2Q+. ++ * ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) ++ * complexity derives from the one introduced with EEVDF in [3]. ++ * ++ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling ++ * with Deterministic Guarantees on Bandwidth Distribution,'', ++ * IEEE Transactions on Computer, May 2010. ++ * ++ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf ++ * ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, ++ * Oct 1997. ++ * ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz ++ * ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline ++ * First: A Flexible and Accurate Mechanism for Proportional Share ++ * Resource Allocation,'' technical report. ++ * ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "bfq.h" ++ ++/* Max number of dispatches in one round of service. */ ++static const int bfq_quantum = 4; ++ ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */ ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; ++ ++/* Maximum backwards seek, in KiB. */ ++static const int bfq_back_max = 16 * 1024; ++ ++/* Penalty of a backwards seek, in number of sectors. */ ++static const int bfq_back_penalty = 2; ++ ++/* Idling period duration, in jiffies. */ ++static int bfq_slice_idle = HZ / 125; ++ ++/* Default maximum budget values, in sectors and number of requests. */ ++static const int bfq_default_max_budget = 16 * 1024; ++static const int bfq_max_budget_async_rq = 4; ++ ++/* ++ * Async to sync throughput distribution is controlled as follows: ++ * when an async request is served, the entity is charged the number ++ * of sectors of the request, multipled by the factor below ++ */ ++static const int bfq_async_charge_factor = 10; ++ ++/* Default timeout values, in jiffies, approximating CFQ defaults. */ ++static const int bfq_timeout_sync = HZ / 8; ++static int bfq_timeout_async = HZ / 25; ++ ++struct kmem_cache *bfq_pool; ++struct kmem_cache *bfq_ioc_pool; ++ ++static DEFINE_PER_CPU(unsigned long, bfq_ioc_count); ++static struct completion *bfq_ioc_gone; ++static DEFINE_SPINLOCK(bfq_ioc_gone_lock); ++ ++static DEFINE_SPINLOCK(cic_index_lock); ++static DEFINE_IDA(cic_index_ida); ++ ++/* Below this threshold (in ms), we consider thinktime immediate. */ ++#define BFQ_MIN_TT 2 ++ ++/* hw_tag detection: parallel requests threshold and min samples needed. 
*/ ++#define BFQ_HW_QUEUE_THRESHOLD 4 ++#define BFQ_HW_QUEUE_SAMPLES 32 ++ ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024) ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) ++ ++/* Min samples used for peak rate estimation (for autotuning). */ ++#define BFQ_PEAK_RATE_SAMPLES 32 ++ ++/* Shift used for peak rate fixed precision calculations. */ ++#define BFQ_RATE_SHIFT 16 ++ ++/* ++ * The duration of the weight raising for interactive applications is ++ * computed automatically (as default behaviour), using the following ++ * formula: duration = (R / r) * T, where r is the peak rate of the ++ * disk, and R and T are two reference parameters. In particular, R is ++ * the peak rate of a reference disk, and T is about the maximum time ++ * for starting popular large applications on that disk, under BFQ and ++ * while reading two files in parallel. Finally, BFQ uses two ++ * different pairs (R, T) depending on whether the disk is rotational ++ * or non-rotational. ++ */ ++#define T_rot (msecs_to_jiffies(5500)) ++#define T_nonrot (msecs_to_jiffies(2000)) ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */ ++#define R_rot 17415 ++#define R_nonrot 34791 ++ ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) ++ ++#define RQ_CIC(rq) \ ++ ((struct cfq_io_context *) (rq)->elevator_private[0]) ++#define RQ_BFQQ(rq) ((rq)->elevator_private[1]) ++ ++#include "bfq-ioc.c" ++#include "bfq-sched.c" ++#include "bfq-cgroup.c" ++ ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ ++ IOPRIO_CLASS_IDLE) ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ ++ IOPRIO_CLASS_RT) ++ ++#define bfq_sample_valid(samples) ((samples) > 80) ++ ++/* ++ * We regard a request as SYNC, if either it's a read or has the SYNC bit ++ * set (in which case it could also be a direct WRITE). ++ */ ++static inline int bfq_bio_sync(struct bio *bio) ++{ ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * Scheduler run of queue, if there are requests pending and no one in the ++ * driver that will restart queueing. ++ */ ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) ++{ ++ if (bfqd->queued != 0) { ++ bfq_log(bfqd, "schedule dispatch"); ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); ++ } ++} ++ ++/* ++ * Lifted from AS - choose which of rq1 and rq2 that is best served now. ++ * We choose the request that is closesr to the head right now. Distance ++ * behind the head is penalized and only allowed to a certain extent. ++ */ ++static struct request *bfq_choose_req(struct bfq_data *bfqd, ++ struct request *rq1, ++ struct request *rq2, ++ sector_t last) ++{ ++ sector_t s1, s2, d1 = 0, d2 = 0; ++ unsigned long back_max; ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */ ++ ++ if (rq1 == NULL || rq1 == rq2) ++ return rq2; ++ if (rq2 == NULL) ++ return rq1; ++ ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) ++ return rq1; ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) ++ return rq2; ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) ++ return rq1; ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) ++ return rq2; ++ ++ s1 = blk_rq_pos(rq1); ++ s2 = blk_rq_pos(rq2); ++ ++ /* ++ * By definition, 1KiB is 2 sectors. 
++ */ ++ back_max = bfqd->bfq_back_max * 2; ++ ++ /* ++ * Strict one way elevator _except_ in the case where we allow ++ * short backward seeks which are biased as twice the cost of a ++ * similar forward seek. ++ */ ++ if (s1 >= last) ++ d1 = s1 - last; ++ else if (s1 + back_max >= last) ++ d1 = (last - s1) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ1_WRAP; ++ ++ if (s2 >= last) ++ d2 = s2 - last; ++ else if (s2 + back_max >= last) ++ d2 = (last - s2) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ2_WRAP; ++ ++ /* Found required data */ ++ ++ /* ++ * By doing switch() on the bit mask "wrap" we avoid having to ++ * check two variables for all permutations: --> faster! ++ */ ++ switch (wrap) { ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ ++ if (d1 < d2) ++ return rq1; ++ else if (d2 < d1) ++ return rq2; ++ else { ++ if (s1 >= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++ ++ case BFQ_RQ2_WRAP: ++ return rq1; ++ case BFQ_RQ1_WRAP: ++ return rq2; ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ ++ default: ++ /* ++ * Since both rqs are wrapped, ++ * start with the one that's further behind head ++ * (--> only *one* back seek required), ++ * since back seek takes more time than forward. ++ */ ++ if (s1 <= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++} ++ ++static struct bfq_queue * ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, ++ sector_t sector, struct rb_node **ret_parent, ++ struct rb_node ***rb_link) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *bfqq = NULL; ++ ++ parent = NULL; ++ p = &root->rb_node; ++ while (*p) { ++ struct rb_node **n; ++ ++ parent = *p; ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ ++ /* ++ * Sort strictly based on sector. Smallest to the left, ++ * largest to the right. ++ */ ++ if (sector > blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_right; ++ else if (sector < blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_left; ++ else ++ break; ++ p = n; ++ bfqq = NULL; ++ } ++ ++ *ret_parent = parent; ++ if (rb_link) ++ *rb_link = p; ++ ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", ++ (long long unsigned)sector, ++ bfqq != NULL ? 
bfqq->pid : 0); ++ ++ return bfqq; ++} ++ ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *__bfqq; ++ ++ if (bfqq->pos_root != NULL) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ ++ if (bfq_class_idle(bfqq)) ++ return; ++ if (!bfqq->next_rq) ++ return; ++ ++ bfqq->pos_root = &bfqd->rq_pos_tree; ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, ++ blk_rq_pos(bfqq->next_rq), &parent, &p); ++ if (__bfqq == NULL) { ++ rb_link_node(&bfqq->pos_node, parent, p); ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); ++ } else ++ bfqq->pos_root = NULL; ++} ++ ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct rb_node *rbnext = rb_next(&last->rb_node); ++ struct rb_node *rbprev = rb_prev(&last->rb_node); ++ struct request *next = NULL, *prev = NULL; ++ ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); ++ ++ if (rbprev != NULL) ++ prev = rb_entry_rq(rbprev); ++ ++ if (rbnext != NULL) ++ next = rb_entry_rq(rbnext); ++ else { ++ rbnext = rb_first(&bfqq->sort_list); ++ if (rbnext && rbnext != &last->rb_node) ++ next = rb_entry_rq(rbnext); ++ } ++ ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); ++} ++ ++static void bfq_del_rq_rb(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ BUG_ON(bfqq->queued[sync] == 0); ++ bfqq->queued[sync]--; ++ bfqd->queued--; ++ ++ elv_rb_del(&bfqq->sort_list, rq); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue) ++ bfq_del_bfqq_busy(bfqd, bfqq, 1); ++ /* ++ * Remove queue from request-position tree as it is empty. ++ */ ++ if (bfqq->pos_root != NULL) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ } ++} ++ ++/* see the definition of bfq_async_charge_factor for details */ ++static inline unsigned long bfq_serv_to_charge(struct request *rq, ++ struct bfq_queue *bfqq) ++{ ++ return blk_rq_sectors(rq) * ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) * ++ bfq_async_charge_factor)); ++} ++ ++/** ++ * bfq_updated_next_req - update the queue after a new next_rq selection. ++ * @bfqd: the device data the queue belongs to. ++ * @bfqq: the queue to update. ++ * ++ * If the first request of a queue changes we make sure that the queue ++ * has enough budget to serve at least its first request (if the ++ * request has grown). We do this because if the queue has not enough ++ * budget for its first request, it has to go through two dispatch ++ * rounds to actually get it dispatched. ++ */ ++static void bfq_updated_next_req(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ struct request *next_rq = bfqq->next_rq; ++ unsigned long new_budget; ++ ++ if (next_rq == NULL) ++ return; ++ ++ if (bfqq == bfqd->active_queue) ++ /* ++ * In order not to break guarantees, budgets cannot be ++ * changed after an entity has been selected. 
++ */ ++ return; ++ ++ BUG_ON(entity->tree != &st->active); ++ BUG_ON(entity == entity->sched_data->active_entity); ++ ++ new_budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ entity->budget = new_budget; ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); ++ bfq_activate_bfqq(bfqd, bfqq); ++} ++ ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) ++{ ++ u64 dur; ++ ++ if (bfqd->bfq_raising_max_time > 0) ++ return bfqd->bfq_raising_max_time; ++ ++ dur = bfqd->RT_prod; ++ do_div(dur, bfqd->peak_rate); ++ ++ return dur; ++} ++ ++static void bfq_add_rq_rb(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *next_rq, *prev; ++ unsigned long old_raising_coeff = bfqq->raising_coeff; ++ int idle_for_long_time = bfqq->budget_timeout + ++ bfqd->bfq_raising_min_idle_time < jiffies; ++ ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq)); ++ bfqq->queued[rq_is_sync(rq)]++; ++ bfqd->queued++; ++ ++ elv_rb_add(&bfqq->sort_list, rq); ++ ++ /* ++ * Check if this request is a better next-serve candidate. ++ */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); ++ BUG_ON(next_rq == NULL); ++ bfqq->next_rq = next_rq; ++ ++ /* ++ * Adjust priority tree position, if next_rq changes. ++ */ ++ if (prev != bfqq->next_rq) ++ bfq_rq_pos_tree_add(bfqd, bfqq); ++ ++ if (!bfq_bfqq_busy(bfqq)) { ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && ++ bfqq->soft_rt_next_start < jiffies; ++ entity->budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ ++ if (! bfqd->low_latency) ++ goto add_bfqq_busy; ++ ++ /* ++ * If the queue is not being boosted and has been idle ++ * for enough time, start a weight-raising period ++ */ ++ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) { ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff; ++ if (idle_for_long_time) ++ bfqq->raising_cur_max_time = ++ bfq_wrais_duration(bfqd); ++ else ++ bfqq->raising_cur_max_time = ++ bfqd->bfq_raising_rt_max_time; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais starting at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ } else if (old_raising_coeff > 1) { ++ if (idle_for_long_time) ++ bfqq->raising_cur_max_time = ++ bfq_wrais_duration(bfqd); ++ else if (bfqq->raising_cur_max_time == ++ bfqd->bfq_raising_rt_max_time && ++ !soft_rt) { ++ bfqq->raising_coeff = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ } ++ } ++ if (old_raising_coeff != bfqq->raising_coeff) ++ entity->ioprio_changed = 1; ++add_bfqq_busy: ++ bfq_add_bfqq_busy(bfqd, bfqq); ++ } else { ++ if(bfqd->low_latency && old_raising_coeff == 1 && ++ !rq_is_sync(rq) && ++ bfqq->last_rais_start_finish + ++ bfqd->bfq_raising_min_inter_arr_async < jiffies) { ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff; ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd); ++ ++ entity->ioprio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "non-idle wrais starting at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ } ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ if(bfqd->low_latency && ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 
1 || ++ idle_for_long_time)) ++ bfqq->last_rais_start_finish = jiffies; ++} ++ ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq) ++{ ++ elv_rb_del(&bfqq->sort_list, rq); ++ bfqq->queued[rq_is_sync(rq)]--; ++ bfqq->bfqd->queued--; ++ bfq_add_rq_rb(rq); ++} ++ ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, ++ struct bio *bio) ++{ ++ struct task_struct *tsk = current; ++ struct cfq_io_context *cic; ++ struct bfq_queue *bfqq; ++ ++ cic = bfq_cic_lookup(bfqd, tsk->io_context); ++ if (cic == NULL) ++ return NULL; ++ ++ bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio)); ++ if (bfqq != NULL) { ++ sector_t sector = bio->bi_sector + bio_sectors(bio); ++ ++ return elv_rb_find(&bfqq->sort_list, sector); ++ } ++ ++ return NULL; ++} ++ ++static void bfq_activate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ bfqd->rq_in_driver++; ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", ++ (long long unsigned)bfqd->last_position); ++} ++ ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ WARN_ON(bfqd->rq_in_driver == 0); ++ bfqd->rq_in_driver--; ++} ++ ++static void bfq_remove_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ if (bfqq->next_rq == rq) { ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ list_del_init(&rq->queuelist); ++ bfq_del_rq_rb(rq); ++ ++ if (rq->cmd_flags & REQ_META) { ++ WARN_ON(bfqq->meta_pending == 0); ++ bfqq->meta_pending--; ++ } ++} ++ ++static int bfq_merge(struct request_queue *q, struct request **req, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *__rq; ++ ++ __rq = bfq_find_rq_fmerge(bfqd, bio); ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { ++ *req = __rq; ++ return ELEVATOR_FRONT_MERGE; ++ } ++ ++ return ELEVATOR_NO_MERGE; ++} ++ ++static void bfq_merged_request(struct request_queue *q, struct request *req, ++ int type) ++{ ++ if (type == ELEVATOR_FRONT_MERGE) { ++ struct bfq_queue *bfqq = RQ_BFQQ(req); ++ ++ bfq_reposition_rq_rb(bfqq, req); ++ } ++} ++ ++static void bfq_merged_requests(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ /* ++ * Reposition in fifo if next is older than rq. ++ */ ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) { ++ list_move(&rq->queuelist, &next->queuelist); ++ rq_set_fifo_time(rq, rq_fifo_time(next)); ++ } ++ ++ if (bfqq->next_rq == next) ++ bfqq->next_rq = rq; ++ ++ bfq_remove_request(next); ++} ++ ++static int bfq_allow_merge(struct request_queue *q, struct request *rq, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct cfq_io_context *cic; ++ struct bfq_queue *bfqq; ++ ++ /* Disallow merge of a sync bio into an async request. */ ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) ++ return 0; ++ ++ /* ++ * Lookup the bfqq that this bio will be queued with. Allow ++ * merge only if rq is queued there. 
++ */ ++ cic = bfq_cic_lookup(bfqd, current->io_context); ++ if (cic == NULL) ++ return 0; ++ ++ bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio)); ++ return bfqq == RQ_BFQQ(rq); ++} ++ ++static void __bfq_set_active_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ if (bfqq != NULL) { ++ bfq_mark_bfqq_must_alloc(bfqq); ++ bfq_mark_bfqq_budget_new(bfqq); ++ bfq_clear_bfqq_fifo_expire(bfqq); ++ ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; ++ ++ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu", ++ bfqq->entity.budget); ++ } ++ ++ bfqd->active_queue = bfqq; ++} ++ ++/* ++ * Get and set a new active queue for service. ++ */ ++static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ if (!bfqq) ++ bfqq = bfq_get_next_queue(bfqd); ++ else ++ bfq_get_next_queue_forced(bfqd, bfqq); ++ ++ __bfq_set_active_queue(bfqd, bfqq); ++ return bfqq; ++} ++ ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd, ++ struct request *rq) ++{ ++ if (blk_rq_pos(rq) >= bfqd->last_position) ++ return blk_rq_pos(rq) - bfqd->last_position; ++ else ++ return bfqd->last_position - blk_rq_pos(rq); ++} ++ ++/* ++ * Return true if bfqq has no request pending and rq is close enough to ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than ++ * bfqq->next_rq ++ */ ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq) ++{ ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR; ++} ++ ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) ++{ ++ struct rb_root *root = &bfqd->rq_pos_tree; ++ struct rb_node *parent, *node; ++ struct bfq_queue *__bfqq; ++ sector_t sector = bfqd->last_position; ++ ++ if (RB_EMPTY_ROOT(root)) ++ return NULL; ++ ++ /* ++ * First, if we find a request starting at the end of the last ++ * request, choose it. ++ */ ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); ++ if (__bfqq != NULL) ++ return __bfqq; ++ ++ /* ++ * If the exact sector wasn't found, the parent of the NULL leaf ++ * will contain the closest sector (rq_pos_tree sorted by next_request ++ * position). ++ */ ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ if (bfq_rq_close(bfqd, __bfqq->next_rq)) ++ return __bfqq; ++ ++ if (blk_rq_pos(__bfqq->next_rq) < sector) ++ node = rb_next(&__bfqq->pos_node); ++ else ++ node = rb_prev(&__bfqq->pos_node); ++ if (node == NULL) ++ return NULL; ++ ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); ++ if (bfq_rq_close(bfqd, __bfqq->next_rq)) ++ return __bfqq; ++ ++ return NULL; ++} ++ ++/* ++ * bfqd - obvious ++ * cur_bfqq - passed in so that we don't decide that the current queue ++ * is closely cooperating with itself. ++ * ++ * We are assuming that cur_bfqq has dispatched at least one request, ++ * and that bfqd->last_position reflects a position on the disk associated ++ * with the I/O issued by cur_bfqq. ++ */ ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, ++ struct bfq_queue *cur_bfqq) ++{ ++ struct bfq_queue *bfqq; ++ ++ if (bfq_class_idle(cur_bfqq)) ++ return NULL; ++ if (!bfq_bfqq_sync(cur_bfqq)) ++ return NULL; ++ if (BFQQ_SEEKY(cur_bfqq)) ++ return NULL; ++ ++ /* If device has only one backlogged bfq_queue, don't search. */ ++ if (bfqd->busy_queues == 1) ++ return NULL; ++ ++ /* ++ * We should notice if some of the queues are cooperating, e.g. ++ * working closely on the same area of the disk. In that case, ++ * we can group them together and don't waste time idling. 
++ */ ++ bfqq = bfqq_close(bfqd); ++ if (bfqq == NULL || bfqq == cur_bfqq) ++ return NULL; ++ ++ /* ++ * Do not merge queues from different bfq_groups. ++ */ ++ if (bfqq->entity.parent != cur_bfqq->entity.parent) ++ return NULL; ++ ++ /* ++ * It only makes sense to merge sync queues. ++ */ ++ if (!bfq_bfqq_sync(bfqq)) ++ return NULL; ++ if (BFQQ_SEEKY(bfqq)) ++ return NULL; ++ ++ /* ++ * Do not merge queues of different priority classes. ++ */ ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) ++ return NULL; ++ ++ return bfqq; ++} ++ ++/* ++ * If enough samples have been computed, return the current max budget ++ * stored in bfqd, which is dynamically updated according to the ++ * estimated disk peak rate; otherwise return the default max budget ++ */ ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < 194) ++ return bfq_default_max_budget; ++ else ++ return bfqd->bfq_max_budget; ++} ++ ++/* ++ * Return min budget, which is a fraction of the current or default ++ * max budget (trying with 1/32) ++ */ ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < 194) ++ return bfq_default_max_budget; ++ else ++ return bfqd->bfq_max_budget / 32; ++} ++ ++/* ++ * Decides whether idling should be done for given device and ++ * given active queue. ++ */ ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd, ++ struct bfq_queue *active_bfqq) ++{ ++ if (active_bfqq == NULL) ++ return false; ++ /* ++ * If device is SSD it has no seek penalty, disable idling; but ++ * do so only if: ++ * - device does not support queuing, otherwise we still have ++ * a problem with sync vs async workloads; ++ * - the queue is not weight-raised, to preserve guarantees. ++ */ ++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag && ++ active_bfqq->raising_coeff == 1); ++} ++ ++static void bfq_arm_slice_timer(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->active_queue; ++ struct cfq_io_context *cic; ++ unsigned long sl; ++ ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (bfq_queue_nonrot_noidle(bfqd, bfqq)) ++ return; ++ ++ /* Idling is disabled, either manually or by past process history. */ ++ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq)) ++ return; ++ ++ /* Tasks have exited, don't wait. */ ++ cic = bfqd->active_cic; ++ if (cic == NULL || atomic_read(&cic->ioc->nr_tasks) == 0) ++ return; ++ ++ bfq_mark_bfqq_wait_request(bfqq); ++ ++ /* ++ * We don't want to idle for seeks, but we do want to allow ++ * fair distribution of slice time for a process doing back-to-back ++ * seeks. So allow a little bit of time for him to submit a new rq. ++ * ++ * To prevent processes with (partly) seeky workloads from ++ * being too ill-treated, grant them a small fraction of the ++ * assigned budget before reducing the waiting time to ++ * BFQ_MIN_TT. This happened to help reduce latency. ++ */ ++ sl = bfqd->bfq_slice_idle; ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) && ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 && ++ bfqq->raising_coeff == 1) ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); ++ else if (bfqq->raising_coeff > 1) ++ sl = sl * 3; ++ bfqd->last_idling_start = ktime_get(); ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl); ++ bfq_log(bfqd, "arm idle: %u/%u ms", ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); ++} ++ ++/* ++ * Set the maximum time for the active queue to consume its ++ * budget. 
This prevents seeky processes from lowering the disk ++ * throughput (always guaranteed with a time slice scheme as in CFQ). ++ */ ++static void bfq_set_budget_timeout(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->active_queue; ++ unsigned int timeout_coeff; ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time) ++ timeout_coeff = 1; ++ else ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; ++ ++ bfqd->last_budget_start = ktime_get(); ++ ++ bfq_clear_bfqq_budget_new(bfqq); ++ bfqq->budget_timeout = jiffies + ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; ++ ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * ++ timeout_coeff)); ++} ++ ++/* ++ * Move request from internal lists to the request queue dispatch list. ++ */ ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ bfq_remove_request(rq); ++ bfqq->dispatched++; ++ elv_dispatch_sort(q, rq); ++ ++ if (bfq_bfqq_sync(bfqq)) ++ bfqd->sync_flight++; ++} ++ ++/* ++ * Return expired entry, or NULL to just start from scratch in rbtree. ++ */ ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq) ++{ ++ struct request *rq = NULL; ++ ++ if (bfq_bfqq_fifo_expire(bfqq)) ++ return NULL; ++ ++ bfq_mark_bfqq_fifo_expire(bfqq); ++ ++ if (list_empty(&bfqq->fifo)) ++ return NULL; ++ ++ rq = rq_entry_fifo(bfqq->fifo.next); ++ ++ if (time_before(jiffies, rq_fifo_time(rq))) ++ return NULL; ++ ++ return rq; ++} ++ ++/* ++ * Must be called with the queue_lock held. ++ */ ++static int bfqq_process_refs(struct bfq_queue *bfqq) ++{ ++ int process_refs, io_refs; ++ ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; ++ BUG_ON(process_refs < 0); ++ return process_refs; ++} ++ ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ int process_refs, new_process_refs; ++ struct bfq_queue *__bfqq; ++ ++ /* ++ * If there are no process references on the new_bfqq, then it is ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain ++ * may have dropped their last reference (not just their last process ++ * reference). ++ */ ++ if (!bfqq_process_refs(new_bfqq)) ++ return; ++ ++ /* Avoid a circular list and skip interim queue merges. */ ++ while ((__bfqq = new_bfqq->new_bfqq)) { ++ if (__bfqq == bfqq) ++ return; ++ new_bfqq = __bfqq; ++ } ++ ++ process_refs = bfqq_process_refs(bfqq); ++ new_process_refs = bfqq_process_refs(new_bfqq); ++ /* ++ * If the process for the bfqq has gone away, there is no ++ * sense in merging the queues. ++ */ ++ if (process_refs == 0 || new_process_refs == 0) ++ return; ++ ++ /* ++ * Merge in the direction of the lesser amount of work. 
++ */ ++ if (new_process_refs >= process_refs) { ++ bfqq->new_bfqq = new_bfqq; ++ atomic_add(process_refs, &new_bfqq->ref); ++ } else { ++ new_bfqq->new_bfqq = bfqq; ++ atomic_add(new_process_refs, &bfqq->ref); ++ } ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", ++ new_bfqq->pid); ++} ++ ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ return entity->budget - entity->service; ++} ++ ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfqq != bfqd->active_queue); ++ ++ __bfq_bfqd_reset_active(bfqd); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ bfq_del_bfqq_busy(bfqd, bfqq, 1); ++ /* ++ * overloading budget_timeout field to store when ++ * the queue remains with no backlog, used by ++ * the weight-raising mechanism ++ */ ++ bfqq->budget_timeout = jiffies ; ++ } ++ else { ++ bfq_activate_bfqq(bfqd, bfqq); ++ /* ++ * Resort priority tree of potential close cooperators. ++ */ ++ bfq_rq_pos_tree_add(bfqd, bfqq); ++ } ++ ++ /* ++ * If this bfqq is shared between multiple processes, check ++ * to make sure that those processes are still issuing I/Os ++ * within the mean seek distance. If not, it may be time to ++ * break the queues apart again. ++ */ ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) ++ bfq_mark_bfqq_split_coop(bfqq); ++} ++ ++/** ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. ++ * @bfqd: device data. ++ * @bfqq: queue to update. ++ * @reason: reason for expiration. ++ * ++ * Handle the feedback on @bfqq budget. See the body for detailed ++ * comments. ++ */ ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ enum bfqq_expiration reason) ++{ ++ struct request *next_rq; ++ unsigned long budget, min_budget; ++ ++ budget = bfqq->max_budget; ++ min_budget = bfq_min_budget(bfqd); ++ ++ BUG_ON(bfqq != bfqd->active_queue); ++ ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", ++ budget, bfq_min_budget(bfqd)); ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue)); ++ ++ if (bfq_bfqq_sync(bfqq)) { ++ switch (reason) { ++ /* ++ * Caveat: in all the following cases we trade latency ++ * for throughput. ++ */ ++ case BFQ_BFQQ_TOO_IDLE: ++ /* ++ * This is the only case where we may reduce ++ * the budget: if there is no requets of the ++ * process still waiting for completion, then ++ * we assume (tentatively) that the timer has ++ * expired because the batch of requests of ++ * the process could have been served with a ++ * smaller budget. Hence, betting that ++ * process will behave in the same way when it ++ * becomes backlogged again, we reduce its ++ * next budget. As long as we guess right, ++ * this budget cut reduces the latency ++ * experienced by the process. ++ * ++ * However, if there are still outstanding ++ * requests, then the process may have not yet ++ * issued its next request just because it is ++ * still waiting for the completion of some of ++ * the still oustanding ones. So in this ++ * subcase we do not reduce its budget, on the ++ * contrary we increase it to possibly boost ++ * the throughput, as discussed in the ++ * comments to the BUDGET_TIMEOUT case. 
++ */ ++ if (bfqq->dispatched > 0) /* still oustanding reqs */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ else { ++ if (budget > 5 * min_budget) ++ budget -= 4 * min_budget; ++ else ++ budget = min_budget; ++ } ++ break; ++ case BFQ_BFQQ_BUDGET_TIMEOUT: ++ /* ++ * We double the budget here because: 1) it ++ * gives the chance to boost the throughput if ++ * this is not a seeky process (which may have ++ * bumped into this timeout because of, e.g., ++ * ZBR), 2) together with charge_full_budget ++ * it helps give seeky processes higher ++ * timestamps, and hence be served less ++ * frequently. ++ */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_BUDGET_EXHAUSTED: ++ /* ++ * The process still has backlog, and did not ++ * let either the budget timeout or the disk ++ * idling timeout expire. Hence it is not ++ * seeky, has a short thinktime and may be ++ * happy with a higher budget too. So ++ * definitely increase the budget of this good ++ * candidate to boost the disk throughput. ++ */ ++ budget = min(budget * 4, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_NO_MORE_REQUESTS: ++ /* ++ * Leave the budget unchanged. ++ */ ++ default: ++ return; ++ } ++ } else /* async queue */ ++ /* async queues get always the maximum possible budget ++ * (their ability to dispatch is limited by ++ * @bfqd->bfq_max_budget_async_rq). ++ */ ++ budget = bfqd->bfq_max_budget; ++ ++ bfqq->max_budget = budget; ++ ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && ++ bfqq->max_budget > bfqd->bfq_max_budget) ++ bfqq->max_budget = bfqd->bfq_max_budget; ++ ++ /* ++ * Make sure that we have enough budget for the next request. ++ * Since the finish time of the bfqq must be kept in sync with ++ * the budget, be sure to call __bfq_bfqq_expire() after the ++ * update. ++ */ ++ next_rq = bfqq->next_rq; ++ if (next_rq != NULL) ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ else ++ bfqq->entity.budget = bfqq->max_budget; ++ ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0, ++ bfqq->entity.budget); ++} ++ ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) ++{ ++ unsigned long max_budget; ++ ++ /* ++ * The max_budget calculated when autotuning is equal to the ++ * amount of sectors transfered in timeout_sync at the ++ * estimated peak rate. ++ */ ++ max_budget = (unsigned long)(peak_rate * 1000 * ++ timeout >> BFQ_RATE_SHIFT); ++ ++ return max_budget; ++} ++ ++/* ++ * In addition to updating the peak rate, checks whether the process ++ * is "slow", and returns 1 if so. This slow flag is used, in addition ++ * to the budget timeout, to reduce the amount of service provided to ++ * seeky processes, and hence reduce their chances to lower the ++ * throughput. See the code for more details. ++ */ ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int compensate, enum bfqq_expiration reason) ++{ ++ u64 bw, usecs, expected, timeout; ++ ktime_t delta; ++ int update = 0; ++ ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) ++ return 0; ++ ++ if (compensate) ++ delta = bfqd->last_idling_start; ++ else ++ delta = ktime_get(); ++ delta = ktime_sub(delta, bfqd->last_budget_start); ++ usecs = ktime_to_us(delta); ++ ++ /* Don't trust short/unrealistic values. */ ++ if (usecs < 100 || usecs >= LONG_MAX) ++ return 0; ++ ++ /* ++ * Calculate the bandwidth for the last slice. 
We use a 64 bit ++ * value to store the peak rate, in sectors per usec in fixed ++ * point math. We do so to have enough precision in the estimate ++ * and to avoid overflows. ++ */ ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; ++ do_div(bw, (unsigned long)usecs); ++ ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); ++ ++ /* ++ * Use only long (> 20ms) intervals to filter out spikes for ++ * the peak rate estimation. ++ */ ++ if (usecs > 20000) { ++ if (bw > bfqd->peak_rate || ++ (!BFQQ_SEEKY(bfqq) && ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { ++ bfq_log(bfqd, "measured bw =%llu", bw); ++ /* ++ * To smooth oscillations use a low-pass filter with ++ * alpha=7/8, i.e., ++ * new_rate = (7/8) * old_rate + (1/8) * bw ++ */ ++ do_div(bw, 8); ++ bfqd->peak_rate *= 7; ++ do_div(bfqd->peak_rate, 8); ++ bfqd->peak_rate += bw; ++ update = 1; ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); ++ } ++ ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; ++ ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) ++ bfqd->peak_rate_samples++; ++ ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && ++ update && bfqd->bfq_user_max_budget == 0) { ++ bfqd->bfq_max_budget = ++ bfq_calc_max_budget(bfqd->peak_rate, timeout); ++ bfq_log(bfqd, "new max_budget=%lu", ++ bfqd->bfq_max_budget); ++ } ++ } ++ ++ /* ++ * If the process has been served for a too short time ++ * interval to let its possible sequential accesses prevail on ++ * the initial seek time needed to move the disk head on the ++ * first sector it requested, then give the process a chance ++ * and for the moment return false. ++ */ ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) ++ return 0; ++ ++ /* ++ * A process is considered ``slow'' (i.e., seeky, so that we ++ * cannot treat it fairly in the service domain, as it would ++ * slow down too much the other processes) if, when a slice ++ * ends for whatever reason, it has received service at a ++ * rate that would not be high enough to complete the budget ++ * before the budget timeout expiration. ++ */ ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; ++ ++ /* ++ * Caveat: processes doing IO in the slower disk zones will ++ * tend to be slow(er) even if not seeky. And the estimated ++ * peak rate will actually be an average over the disk ++ * surface. Hence, to not be too harsh with unlucky processes, ++ * we keep a budget/3 margin of safety before declaring a ++ * process slow. ++ */ ++ return expected > (4 * bfqq->entity.budget) / 3; ++} ++ ++/** ++ * bfq_bfqq_expire - expire a queue. ++ * @bfqd: device owning the queue. ++ * @bfqq: the queue to expire. ++ * @compensate: if true, compensate for the time spent idling. ++ * @reason: the reason causing the expiration. ++ * ++ * ++ * If the process associated to the queue is slow (i.e., seeky), or in ++ * case of budget timeout, or, finally, if it is async, we ++ * artificially charge it an entire budget (independently of the ++ * actual service it received). As a consequence, the queue will get ++ * higher timestamps than the correct ones upon reactivation, and ++ * hence it will be rescheduled as if it had received more service ++ * than what it actually received. In the end, this class of processes ++ * will receive less service in proportion to how slowly they consume ++ * their budgets (and hence how seriously they tend to lower the ++ * throughput). 
++ * ++ * In contrast, when a queue expires because it has been idling for ++ * too much or because it exhausted its budget, we do not touch the ++ * amount of service it has received. Hence when the queue will be ++ * reactivated and its timestamps updated, the latter will be in sync ++ * with the actual service received by the queue until expiration. ++ * ++ * Charging a full budget to the first type of queues and the exact ++ * service to the others has the effect of using the WF2Q+ policy to ++ * schedule the former on a timeslice basis, without violating the ++ * service domain guarantees of the latter. ++ */ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ int compensate, ++ enum bfqq_expiration reason) ++{ ++ int slow; ++ BUG_ON(bfqq != bfqd->active_queue); ++ ++ /* Update disk peak rate for autotuning and check whether the ++ * process is slow (see bfq_update_peak_rate). ++ */ ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); ++ ++ /* ++ * As above explained, 'punish' slow (i.e., seeky), timed-out ++ * and async queues, to favor sequential sync workloads. ++ * ++ * Processes doing IO in the slower disk zones will tend to be ++ * slow(er) even if not seeky. Hence, since the estimated peak ++ * rate is actually an average over the disk surface, these ++ * processes may timeout just for bad luck. To avoid punishing ++ * them we do not charge a full budget to a process that ++ * succeeded in consuming at least 2/3 of its budget. ++ */ ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) ++ bfq_bfqq_charge_full_budget(bfqq); ++ ++ if (bfqd->low_latency && bfqq->raising_coeff == 1) ++ bfqq->last_rais_start_finish = jiffies; ++ ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) { ++ if(reason != BFQ_BFQQ_BUDGET_TIMEOUT) ++ bfqq->soft_rt_next_start = ++ jiffies + ++ HZ * bfqq->entity.service / ++ bfqd->bfq_raising_max_softrt_rate; ++ else ++ bfqq->soft_rt_next_start = -1; /* infinity */ ++ } ++ bfq_log_bfqq(bfqd, bfqq, ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); ++ ++ /* Increase, decrease or leave budget unchanged according to reason */ ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); ++ __bfq_bfqq_expire(bfqd, bfqq); ++} ++ ++/* ++ * Budget timeout is not implemented through a dedicated timer, but ++ * just checked on request arrivals and completions, as well as on ++ * idle timer expirations. ++ */ ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_budget_new(bfqq)) ++ return 0; ++ ++ if (time_before(jiffies, bfqq->budget_timeout)) ++ return 0; ++ ++ return 1; ++} ++ ++/* ++ * If we expire a queue that is waiting for the arrival of a new ++ * request, we may prevent the fictitious timestamp backshifting that ++ * allows the guarantees of the queue to be preserved (see [1] for ++ * this tricky aspect). Hence we return true only if this condition ++ * does not hold, or if the queue is slow enough to deserve only to be ++ * kicked off for preserving a high throughput. 
++*/ ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "may_budget_timeout: wr %d left %d timeout %d", ++ bfq_bfqq_wait_request(bfqq), ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, ++ bfq_bfqq_budget_timeout(bfqq)); ++ ++ return (!bfq_bfqq_wait_request(bfqq) || ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) ++ && ++ bfq_bfqq_budget_timeout(bfqq); ++} ++ ++/* ++ * Select a queue for service. If we have a current active queue, ++ * check whether to continue servicing it, or retrieve and set a new one. ++ */ ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq, *new_bfqq = NULL; ++ struct request *next_rq; ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ ++ bfqq = bfqd->active_queue; ++ if (bfqq == NULL) ++ goto new_queue; ++ ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue"); ++ ++ /* ++ * If another queue has a request waiting within our mean seek ++ * distance, let it run. The expire code will check for close ++ * cooperators and put the close queue at the front of the ++ * service tree. If possible, merge the expiring queue with the ++ * new bfqq. ++ */ ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq); ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL) ++ bfq_setup_merge(bfqq, new_bfqq); ++ ++ if (bfq_may_expire_for_budg_timeout(bfqq)) ++ goto expire; ++ ++ next_rq = bfqq->next_rq; ++ /* ++ * If bfqq has requests queued and it has enough budget left to ++ * serve them, keep the queue, otherwise expire it. ++ */ ++ if (next_rq != NULL) { ++ if (bfq_serv_to_charge(next_rq, bfqq) > ++ bfq_bfqq_budget_left(bfqq)) { ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; ++ goto expire; ++ } else { ++ /* ++ * The idle timer may be pending because we may not ++ * disable disk idling even when a new request arrives ++ */ ++ if (timer_pending(&bfqd->idle_slice_timer)) { ++ /* ++ * If we get here: 1) at least a new request ++ * has arrived but we have not disabled the ++ * timer because the request was too small, ++ * 2) then the block layer has unplugged the ++ * device, causing the dispatch to be invoked. ++ * ++ * Since the device is unplugged, now the ++ * requests are probably large enough to ++ * provide a reasonable throughput. ++ * So we disable idling. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ del_timer(&bfqd->idle_slice_timer); ++ } ++ if (new_bfqq == NULL) ++ goto keep_queue; ++ else ++ goto expire; ++ } ++ } ++ ++ /* ++ * No requests pending. If there is no cooperator, and the active ++ * queue still has requests in flight or is idling for a new request, ++ * then keep it. ++ */ ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) || ++ (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq) && ++ !bfq_queue_nonrot_noidle(bfqd, bfqq)))) { ++ bfqq = NULL; ++ goto keep_queue; ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) { ++ /* ++ * Expiring the queue because there is a close cooperator, ++ * cancel timer. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ del_timer(&bfqd->idle_slice_timer); ++ } ++ ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS; ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason); ++new_queue: ++ bfqq = bfq_set_active_queue(bfqd, new_bfqq); ++ bfq_log(bfqd, "select_queue: new queue %d returned", ++ bfqq != NULL ? 
bfqq->pid : 0); ++keep_queue: ++ return bfqq; ++} ++ ++static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, " ++ "old raising coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - ++ bfqq->last_rais_start_finish), ++ jiffies_to_msecs(bfqq->raising_cur_max_time), ++ bfqq->raising_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ BUG_ON(bfqq != bfqd->active_queue && entity->weight != ++ entity->orig_weight * bfqq->raising_coeff); ++ if(entity->ioprio_changed) ++ bfq_log_bfqq(bfqd, bfqq, ++ "WARN: pending prio change"); ++ /* ++ * If too much time has elapsed from the beginning ++ * of this weight-raising period and process is not soft ++ * real-time, stop it ++ */ ++ if (jiffies - bfqq->last_rais_start_finish > ++ bfqq->raising_cur_max_time) { ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && ++ bfqq->soft_rt_next_start < jiffies; ++ ++ bfqq->last_rais_start_finish = jiffies; ++ if (soft_rt) ++ bfqq->raising_cur_max_time = ++ bfqd->bfq_raising_rt_max_time; ++ else { ++ bfqq->raising_coeff = 1; ++ entity->ioprio_changed = 1; ++ __bfq_entity_update_weight_prio( ++ bfq_entity_service_tree(entity), ++ entity); ++ } ++ } ++ } ++} ++ ++ ++/* ++ * Dispatch one request from bfqq, moving it to the request queue ++ * dispatch list. ++ */ ++static int bfq_dispatch_request(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ struct request *rq; ++ unsigned long service_to_charge; ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ /* Follow expired path, else get first next available. */ ++ rq = bfq_check_fifo(bfqq); ++ if (rq == NULL) ++ rq = bfqq->next_rq; ++ service_to_charge = bfq_serv_to_charge(rq, bfqq); ++ ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { ++ /* ++ * This may happen if the next rq is chosen ++ * in fifo order instead of sector order. ++ * The budget is properly dimensioned ++ * to be always sufficient to serve the next request ++ * only if it is chosen in sector order. The reason is ++ * that it would be quite inefficient and little useful ++ * to always make sure that the budget is large enough ++ * to serve even the possible next rq in fifo order. ++ * In fact, requests are seldom served in fifo order. ++ * ++ * Expire the queue for budget exhaustion, and ++ * make sure that the next act_budget is enough ++ * to serve the next request, even if it comes ++ * from the fifo expired path. ++ */ ++ bfqq->next_rq = rq; ++ /* ++ * Since this dispatch is failed, make sure that ++ * a new one will be performed ++ */ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++ goto expire; ++ } ++ ++ /* Finally, insert request into driver dispatch list. 
*/ ++ bfq_bfqq_served(bfqq, service_to_charge); ++ bfq_dispatch_insert(bfqd->queue, rq); ++ ++ update_raising_data(bfqd, bfqq); ++ ++ bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), " ++ "budg left %lu", ++ blk_rq_sectors(rq), ++ (long long unsigned)blk_rq_pos(rq), ++ bfq_bfqq_budget_left(bfqq)); ++ ++ dispatched++; ++ ++ if (bfqd->active_cic == NULL) { ++ atomic_long_inc(&RQ_CIC(rq)->ioc->refcount); ++ bfqd->active_cic = RQ_CIC(rq); ++ } ++ ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && ++ dispatched >= bfqd->bfq_max_budget_async_rq) || ++ bfq_class_idle(bfqq))) ++ goto expire; ++ ++ return dispatched; ++ ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); ++ return dispatched; ++} ++ ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ ++ while (bfqq->next_rq != NULL) { ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); ++ dispatched++; ++ } ++ ++ BUG_ON(!list_empty(&bfqq->fifo)); ++ return dispatched; ++} ++ ++/* ++ * Drain our current requests. Used for barriers and when switching ++ * io schedulers on-the-fly. ++ */ ++static int bfq_forced_dispatch(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq, *n; ++ struct bfq_service_tree *st; ++ int dispatched = 0; ++ ++ bfqq = bfqd->active_queue; ++ if (bfqq != NULL) ++ __bfq_bfqq_expire(bfqd, bfqq); ++ ++ /* ++ * Loop through classes, and be careful to leave the scheduler ++ * in a consistent state, as feedback mechanisms and vtime ++ * updates cannot be disabled during the process. ++ */ ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { ++ st = bfq_entity_service_tree(&bfqq->entity); ++ ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq); ++ bfqq->max_budget = bfq_max_budget(bfqd); ++ ++ bfq_forget_idle(st); ++ } ++ ++ BUG_ON(bfqd->busy_queues != 0); ++ ++ return dispatched; ++} ++ ++static int bfq_dispatch_requests(struct request_queue *q, int force) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq; ++ int max_dispatch; ++ ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); ++ if (bfqd->busy_queues == 0) ++ return 0; ++ ++ if (unlikely(force)) ++ return bfq_forced_dispatch(bfqd); ++ ++ if((bfqq = bfq_select_queue(bfqd)) == NULL) ++ return 0; ++ ++ max_dispatch = bfqd->bfq_quantum; ++ if (bfq_class_idle(bfqq)) ++ max_dispatch = 1; ++ ++ if (!bfq_bfqq_sync(bfqq)) ++ max_dispatch = bfqd->bfq_max_budget_async_rq; ++ ++ if (bfqq->dispatched >= max_dispatch) { ++ if (bfqd->busy_queues > 1) ++ return 0; ++ if (bfqq->dispatched >= 4 * max_dispatch) ++ return 0; ++ } ++ ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) ++ return 0; ++ ++ bfq_clear_bfqq_wait_request(bfqq); ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); ++ ++ if (! bfq_dispatch_request(bfqd, bfqq)) ++ return 0; ++ ++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d" ++ "(max_disp %d)", bfqq->pid, max_dispatch); ++ ++ return 1; ++} ++ ++/* ++ * Task holds one reference to the queue, dropped when task exits. Each rq ++ * in-flight on this queue also holds a reference, dropped when rq is freed. ++ * ++ * Queue lock must be held here. 
++ */ ++static void bfq_put_queue(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ BUG_ON(atomic_read(&bfqq->ref) <= 0); ++ ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ if (!atomic_dec_and_test(&bfqq->ref)) ++ return; ++ ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL); ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); ++ BUG_ON(bfqq->entity.tree != NULL); ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ BUG_ON(bfqd->active_queue == bfqq); ++ ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); ++ ++ kmem_cache_free(bfq_pool, bfqq); ++} ++ ++static void bfq_put_cooperator(struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *__bfqq, *next; ++ ++ /* ++ * If this queue was scheduled to merge with another queue, be ++ * sure to drop the reference taken on that queue (and others in ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. ++ */ ++ __bfqq = bfqq->new_bfqq; ++ while (__bfqq) { ++ if (__bfqq == bfqq) { ++ WARN(1, "bfqq->new_bfqq loop detected.\n"); ++ break; ++ } ++ next = __bfqq->new_bfqq; ++ bfq_put_queue(__bfqq); ++ __bfqq = next; ++ } ++} ++ ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ if (bfqq == bfqd->active_queue) { ++ __bfq_bfqq_expire(bfqd, bfqq); ++ bfq_schedule_dispatch(bfqd); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++} ++ ++/* ++ * Update the entity prio values; note that the new values will not ++ * be used until the next (re)activation. ++ */ ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct io_context *ioc) ++{ ++ struct task_struct *tsk = current; ++ int ioprio_class; ++ ++ if (!bfq_bfqq_prio_changed(bfqq)) ++ return; ++ ++ ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); ++ switch (ioprio_class) { ++ default: ++ printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class); ++ case IOPRIO_CLASS_NONE: ++ /* ++ * No prio set, inherit CPU scheduling settings. ++ */ ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk); ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); ++ break; ++ case IOPRIO_CLASS_RT: ++ bfqq->entity.new_ioprio = task_ioprio(ioc); ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; ++ break; ++ case IOPRIO_CLASS_BE: ++ bfqq->entity.new_ioprio = task_ioprio(ioc); ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; ++ break; ++ case IOPRIO_CLASS_IDLE: ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; ++ bfqq->entity.new_ioprio = 7; ++ bfq_clear_bfqq_idle_window(bfqq); ++ break; ++ } ++ ++ bfqq->entity.ioprio_changed = 1; ++ ++ /* ++ * Keep track of original prio settings in case we have to temporarily ++ * elevate the priority of this queue. 
++ */ ++ bfqq->org_ioprio = bfqq->entity.new_ioprio; ++ bfq_clear_bfqq_prio_changed(bfqq); ++} ++ ++static void bfq_changed_ioprio(struct io_context *ioc, ++ struct cfq_io_context *cic) ++{ ++ struct bfq_data *bfqd; ++ struct bfq_queue *bfqq, *new_bfqq; ++ struct bfq_group *bfqg; ++ unsigned long uninitialized_var(flags); ++ ++ bfqd = bfq_get_bfqd_locked(&cic->key, &flags); ++ if (unlikely(bfqd == NULL)) ++ return; ++ ++ bfqq = cic->cfqq[BLK_RW_ASYNC]; ++ if (bfqq != NULL) { ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, ++ sched_data); ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, cic->ioc, ++ GFP_ATOMIC); ++ if (new_bfqq != NULL) { ++ cic->cfqq[BLK_RW_ASYNC] = new_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, ++ "changed_ioprio: bfqq %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++ } ++ ++ bfqq = cic->cfqq[BLK_RW_SYNC]; ++ if (bfqq != NULL) ++ bfq_mark_bfqq_prio_changed(bfqq); ++ ++ bfq_put_bfqd_unlock(bfqd, &flags); ++} ++ ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ pid_t pid, int is_sync) ++{ ++ RB_CLEAR_NODE(&bfqq->entity.rb_node); ++ INIT_LIST_HEAD(&bfqq->fifo); ++ ++ atomic_set(&bfqq->ref, 0); ++ bfqq->bfqd = bfqd; ++ ++ bfq_mark_bfqq_prio_changed(bfqq); ++ ++ if (is_sync) { ++ if (!bfq_class_idle(bfqq)) ++ bfq_mark_bfqq_idle_window(bfqq); ++ bfq_mark_bfqq_sync(bfqq); ++ } ++ ++ /* Tentative initial value to trade off between thr and lat */ ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; ++ bfqq->pid = pid; ++ ++ bfqq->raising_coeff = 1; ++ bfqq->last_rais_start_finish = 0; ++ bfqq->soft_rt_next_start = -1; ++} ++ ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int is_sync, ++ struct io_context *ioc, ++ gfp_t gfp_mask) ++{ ++ struct bfq_queue *bfqq, *new_bfqq = NULL; ++ struct cfq_io_context *cic; ++ ++retry: ++ cic = bfq_cic_lookup(bfqd, ioc); ++ /* cic always exists here */ ++ bfqq = cic_to_bfqq(cic, is_sync); ++ ++ /* ++ * Always try a new alloc if we fall back to the OOM bfqq ++ * originally, since it should just be a temporary situation. 
++ */ ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { ++ bfqq = NULL; ++ if (new_bfqq != NULL) { ++ bfqq = new_bfqq; ++ new_bfqq = NULL; ++ } else if (gfp_mask & __GFP_WAIT) { ++ spin_unlock_irq(bfqd->queue->queue_lock); ++ new_bfqq = kmem_cache_alloc_node(bfq_pool, ++ gfp_mask | __GFP_ZERO, ++ bfqd->queue->node); ++ spin_lock_irq(bfqd->queue->queue_lock); ++ if (new_bfqq != NULL) ++ goto retry; ++ } else { ++ bfqq = kmem_cache_alloc_node(bfq_pool, ++ gfp_mask | __GFP_ZERO, ++ bfqd->queue->node); ++ } ++ ++ if (bfqq != NULL) { ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync); ++ bfq_log_bfqq(bfqd, bfqq, "allocated"); ++ } else { ++ bfqq = &bfqd->oom_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); ++ } ++ ++ bfq_init_prio_data(bfqq, ioc); ++ bfq_init_entity(&bfqq->entity, bfqg); ++ } ++ ++ if (new_bfqq != NULL) ++ kmem_cache_free(bfq_pool, new_bfqq); ++ ++ return bfqq; ++} ++ ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int ioprio_class, int ioprio) ++{ ++ switch (ioprio_class) { ++ case IOPRIO_CLASS_RT: ++ return &bfqg->async_bfqq[0][ioprio]; ++ case IOPRIO_CLASS_BE: ++ return &bfqg->async_bfqq[1][ioprio]; ++ case IOPRIO_CLASS_IDLE: ++ return &bfqg->async_idle_bfqq; ++ default: ++ BUG(); ++ } ++} ++ ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, int is_sync, ++ struct io_context *ioc, gfp_t gfp_mask) ++{ ++ const int ioprio = task_ioprio(ioc); ++ const int ioprio_class = task_ioprio_class(ioc); ++ struct bfq_queue **async_bfqq = NULL; ++ struct bfq_queue *bfqq = NULL; ++ ++ if (!is_sync) { ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ++ ioprio); ++ bfqq = *async_bfqq; ++ } ++ ++ if (bfqq == NULL) ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask); ++ ++ /* ++ * Pin the queue now that it's allocated, scheduler exit will prune it. ++ */ ++ if (!is_sync && *async_bfqq == NULL) { ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ *async_bfqq = bfqq; ++ } ++ ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ return bfqq; ++} ++ ++static void bfq_update_io_thinktime(struct bfq_data *bfqd, ++ struct cfq_io_context *cic) ++{ ++ unsigned long elapsed = jiffies - cic->ttime.last_end_request; ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); ++ ++ cic->ttime.ttime_samples = (7*cic->ttime.ttime_samples + 256) / 8; ++ cic->ttime.ttime_total = (7*cic->ttime.ttime_total + 256*ttime) / 8; ++ cic->ttime.ttime_mean = (cic->ttime.ttime_total + 128) / cic->ttime.ttime_samples; ++} ++ ++static void bfq_update_io_seektime(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ sector_t sdist; ++ u64 total; ++ ++ if (bfqq->last_request_pos < blk_rq_pos(rq)) ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos; ++ else ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq); ++ ++ /* ++ * Don't allow the seek distance to get too large from the ++ * odd fragment, pagein, etc. 
++ */ ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */ ++ sdist = 0; ++ else if (bfqq->seek_samples <= 60) /* second & third seek */ ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); ++ else ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); ++ ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; ++ total = bfqq->seek_total + (bfqq->seek_samples/2); ++ do_div(total, bfqq->seek_samples); ++ if (bfq_bfqq_coop(bfqq)) { ++ /* ++ * If the mean seektime increases for a (non-seeky) shared ++ * queue, some cooperator is likely to be idling too much. ++ * On the contrary, if it decreases, some cooperator has ++ * probably waked up. ++ * ++ */ ++ if ((sector_t)total < bfqq->seek_mean) ++ bfq_mark_bfqq_some_coop_idle(bfqq) ; ++ else if ((sector_t)total > bfqq->seek_mean) ++ bfq_clear_bfqq_some_coop_idle(bfqq) ; ++ } ++ bfqq->seek_mean = (sector_t)total; ++ ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, ++ (u64)bfqq->seek_mean); ++} ++ ++/* ++ * Disable idle window if the process thinks too long or seeks so much that ++ * it doesn't matter. ++ */ ++static void bfq_update_idle_window(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct cfq_io_context *cic) ++{ ++ int enable_idle; ++ ++ /* Don't idle for async or idle io prio class. */ ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) ++ return; ++ ++ enable_idle = bfq_bfqq_idle_window(bfqq); ++ ++ if (atomic_read(&cic->ioc->nr_tasks) == 0 || ++ bfqd->bfq_slice_idle == 0 || ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && ++ bfqq->raising_coeff == 1)) ++ enable_idle = 0; ++ else if (bfq_sample_valid(cic->ttime.ttime_samples)) { ++ if (cic->ttime.ttime_mean > bfqd->bfq_slice_idle && ++ bfqq->raising_coeff == 1) ++ enable_idle = 0; ++ else ++ enable_idle = 1; ++ } ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", ++ enable_idle); ++ ++ if (enable_idle) ++ bfq_mark_bfqq_idle_window(bfqq); ++ else ++ bfq_clear_bfqq_idle_window(bfqq); ++} ++ ++/* ++ * Called when a new fs request (rq) is added to bfqq. Check if there's ++ * something we should do about it. ++ */ ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ struct cfq_io_context *cic = RQ_CIC(rq); ++ ++ if (rq->cmd_flags & REQ_META) ++ bfqq->meta_pending++; ++ ++ bfq_update_io_thinktime(bfqd, cic); ++ bfq_update_io_seektime(bfqd, bfqq, rq); ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || ++ !BFQQ_SEEKY(bfqq)) ++ bfq_update_idle_window(bfqd, bfqq, cic); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), ++ (long long unsigned)bfqq->seek_mean); ++ ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ ++ if (bfqq == bfqd->active_queue) { ++ /* ++ * If there is just this request queued and the request ++ * is small, just exit. ++ * In this way, if the disk is being idled to wait for a new ++ * request from the active queue, we avoid unplugging the ++ * device now. ++ * ++ * By doing so, we spare the disk to be committed ++ * to serve just a small request. On the contrary, we wait for ++ * the block layer to decide when to unplug the device: ++ * hopefully, new requests will be merged to this ++ * one quickly, then the device will be unplugged ++ * and larger requests will be dispatched. 
++ */ ++ if (bfqq->queued[rq_is_sync(rq)] == 1 && ++ blk_rq_sectors(rq) < 32) { ++ return; ++ } ++ if (bfq_bfqq_wait_request(bfqq)) { ++ /* ++ * If we are waiting for a request for this queue, let ++ * it rip immediately and flag that we must not expire ++ * this queue just now. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ del_timer(&bfqd->idle_slice_timer); ++ /* ++ * Here we can safely expire the queue, in ++ * case of budget timeout, without wasting ++ * guarantees ++ */ ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, 0, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ __blk_run_queue(bfqd->queue); ++ } ++ } ++} ++ ++static void bfq_insert_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ bfq_init_prio_data(bfqq, RQ_CIC(rq)->ioc); ++ ++ bfq_add_rq_rb(rq); ++ ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); ++ list_add_tail(&rq->queuelist, &bfqq->fifo); ++ ++ bfq_rq_enqueued(bfqd, bfqq, rq); ++} ++ ++static void bfq_update_hw_tag(struct bfq_data *bfqd) ++{ ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, ++ bfqd->rq_in_driver); ++ ++ if (bfqd->hw_tag == 1) ++ return; ++ ++ /* ++ * This sample is valid if the number of outstanding requests ++ * is large enough to allow a queueing behavior. Note that the ++ * sum is not exact, as it's not taking into account deactivated ++ * requests. ++ */ ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) ++ return; ++ ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) ++ return; ++ ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; ++ bfqd->max_rq_in_driver = 0; ++ bfqd->hw_tag_samples = 0; ++} ++ ++static void bfq_completed_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)", ++ blk_rq_sectors(rq), sync); ++ ++ bfq_update_hw_tag(bfqd); ++ ++ WARN_ON(!bfqd->rq_in_driver); ++ WARN_ON(!bfqq->dispatched); ++ bfqd->rq_in_driver--; ++ bfqq->dispatched--; ++ ++ if (bfq_bfqq_sync(bfqq)) ++ bfqd->sync_flight--; ++ ++ if (sync) ++ RQ_CIC(rq)->ttime.last_end_request = jiffies; ++ ++ /* ++ * If this is the active queue, check if it needs to be expired, ++ * or if we want to idle in case it has no pending requests. 
++ */ ++ if (bfqd->active_queue == bfqq) { ++ if (bfq_bfqq_budget_new(bfqq)) ++ bfq_set_budget_timeout(bfqd); ++ ++ /* Idling is disabled also for cooperation issues: ++ * 1) there is a close cooperator for the queue, or ++ * 2) the queue is shared and some cooperator is likely ++ * to be idle (in this case, by not arming the idle timer, ++ * we try to slow down the queue, to prevent the zones ++ * of the disk accessed by the active cooperators to become ++ * too distant from the zone that will be accessed by the ++ * currently idle cooperators) ++ */ ++ if (bfq_may_expire_for_budg_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); ++ else if (sync && ++ (bfqd->rq_in_driver == 0 || ++ bfqq->raising_coeff > 1) ++ && RB_EMPTY_ROOT(&bfqq->sort_list) ++ && !bfq_close_cooperator(bfqd, bfqq) ++ && (!bfq_bfqq_coop(bfqq) || ++ !bfq_bfqq_some_coop_idle(bfqq))) ++ bfq_arm_slice_timer(bfqd); ++ } ++ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++} ++ ++static inline int __bfq_may_queue(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { ++ bfq_clear_bfqq_must_alloc(bfqq); ++ return ELV_MQUEUE_MUST; ++ } ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++static int bfq_may_queue(struct request_queue *q, int rw) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct task_struct *tsk = current; ++ struct cfq_io_context *cic; ++ struct bfq_queue *bfqq; ++ ++ /* ++ * Don't force setup of a queue from here, as a call to may_queue ++ * does not necessarily imply that a request actually will be queued. ++ * So just lookup a possibly existing queue, or return 'may queue' ++ * if that fails. ++ */ ++ cic = bfq_cic_lookup(bfqd, tsk->io_context); ++ if (cic == NULL) ++ return ELV_MQUEUE_MAY; ++ ++ bfqq = cic_to_bfqq(cic, rw_is_sync(rw)); ++ if (bfqq != NULL) { ++ bfq_init_prio_data(bfqq, cic->ioc); ++ ++ return __bfq_may_queue(bfqq); ++ } ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++/* ++ * Queue lock held here. ++ */ ++static void bfq_put_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ if (bfqq != NULL) { ++ const int rw = rq_data_dir(rq); ++ ++ BUG_ON(!bfqq->allocated[rw]); ++ bfqq->allocated[rw]--; ++ ++ put_io_context(RQ_CIC(rq)->ioc); ++ ++ rq->elevator_private[0] = NULL; ++ rq->elevator_private[1] = NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++static struct bfq_queue * ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct cfq_io_context *cic, ++ struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", ++ (long unsigned)bfqq->new_bfqq->pid); ++ cic_set_bfqq(cic, bfqq->new_bfqq, 1); ++ bfq_mark_bfqq_coop(bfqq->new_bfqq); ++ bfq_put_queue(bfqq); ++ return cic_to_bfqq(cic, 1); ++} ++ ++/* ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this ++ * was the last process referring to said bfqq. ++ */ ++static struct bfq_queue * ++bfq_split_bfqq(struct cfq_io_context *cic, struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); ++ if (bfqq_process_refs(bfqq) == 1) { ++ bfqq->pid = current->pid; ++ bfq_clear_bfqq_some_coop_idle(bfqq); ++ bfq_clear_bfqq_coop(bfqq); ++ bfq_clear_bfqq_split_coop(bfqq); ++ return bfqq; ++ } ++ ++ cic_set_bfqq(cic, NULL, 1); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++ return NULL; ++} ++ ++/* ++ * Allocate bfq data structures associated with this request. 
++ */ ++static int bfq_set_request(struct request_queue *q, struct request *rq, ++ gfp_t gfp_mask) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct cfq_io_context *cic; ++ const int rw = rq_data_dir(rq); ++ const int is_sync = rq_is_sync(rq); ++ struct bfq_queue *bfqq; ++ struct bfq_group *bfqg; ++ unsigned long flags; ++ ++ might_sleep_if(gfp_mask & __GFP_WAIT); ++ ++ cic = bfq_get_io_context(bfqd, gfp_mask); ++ ++ spin_lock_irqsave(q->queue_lock, flags); ++ ++ if (cic == NULL) ++ goto queue_fail; ++ ++ bfqg = bfq_cic_update_cgroup(cic); ++ ++new_queue: ++ bfqq = cic_to_bfqq(cic, is_sync); ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, cic->ioc, gfp_mask); ++ cic_set_bfqq(cic, bfqq, is_sync); ++ } else { ++ /* ++ * If the queue was seeky for too long, break it apart. ++ */ ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); ++ bfqq = bfq_split_bfqq(cic, bfqq); ++ if (!bfqq) ++ goto new_queue; ++ } ++ ++ /* ++ * Check to see if this queue is scheduled to merge with ++ * another closely cooperating queue. The merging of queues ++ * happens here as it must be done in process context. ++ * The reference on new_bfqq was taken in merge_bfqqs. ++ */ ++ if (bfqq->new_bfqq != NULL) ++ bfqq = bfq_merge_bfqqs(bfqd, cic, bfqq); ++ } ++ ++ bfqq->allocated[rw]++; ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ rq->elevator_private[0] = cic; ++ rq->elevator_private[1] = bfqq; ++ ++ return 0; ++ ++queue_fail: ++ if (cic != NULL) ++ put_io_context(cic->ioc); ++ ++ bfq_schedule_dispatch(bfqd); ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 1; ++} ++ ++static void bfq_kick_queue(struct work_struct *work) ++{ ++ struct bfq_data *bfqd = ++ container_of(work, struct bfq_data, unplug_work); ++ struct request_queue *q = bfqd->queue; ++ ++ spin_lock_irq(q->queue_lock); ++ __blk_run_queue(q); ++ spin_unlock_irq(q->queue_lock); ++} ++ ++/* ++ * Handler of the expiration of the timer running if the active_queue ++ * is idling inside its time slice. ++ */ ++static void bfq_idle_slice_timer(unsigned long data) ++{ ++ struct bfq_data *bfqd = (struct bfq_data *)data; ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ enum bfqq_expiration reason; ++ ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags); ++ ++ bfqq = bfqd->active_queue; ++ /* ++ * Theoretical race here: active_queue can be NULL or different ++ * from the queue that was idling if the timer handler spins on ++ * the queue_lock and a new request arrives for the current ++ * queue and there is a full dispatch cycle that changes the ++ * active_queue. This can hardly happen, but in the worst case ++ * we just expire a queue too early. 
++ */ ++ if (bfqq != NULL) { ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ /* ++ * Also here the queue can be safely expired ++ * for budget timeout without wasting ++ * guarantees ++ */ ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) ++ /* ++ * The queue may not be empty upon timer expiration, ++ * because we may not disable the timer when the first ++ * request of the active queue arrives during ++ * disk idling ++ */ ++ reason = BFQ_BFQQ_TOO_IDLE; ++ else ++ goto schedule_dispatch; ++ ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason); ++ } ++ ++schedule_dispatch: ++ bfq_schedule_dispatch(bfqd); ++ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); ++} ++ ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) ++{ ++ del_timer_sync(&bfqd->idle_slice_timer); ++ cancel_work_sync(&bfqd->unplug_work); ++} ++ ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, ++ struct bfq_queue **bfqq_ptr) ++{ ++ struct bfq_group *root_group = bfqd->root_group; ++ struct bfq_queue *bfqq = *bfqq_ptr; ++ ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); ++ if (bfqq != NULL) { ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ *bfqq_ptr = NULL; ++ } ++} ++ ++/* ++ * Release all the bfqg references to its async queues. If we are ++ * deallocating the group these queues may still contain requests, so ++ * we reparent them to the root cgroup (i.e., the only one that will ++ * exist for sure untill all the requests on a device are gone). ++ */ ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); ++ ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); ++} ++ ++static void bfq_exit_queue(struct elevator_queue *e) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ struct request_queue *q = bfqd->queue; ++ struct bfq_queue *bfqq, *n; ++ struct cfq_io_context *cic; ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ spin_lock_irq(q->queue_lock); ++ ++ while (!list_empty(&bfqd->cic_list)) { ++ cic = list_entry(bfqd->cic_list.next, struct cfq_io_context, ++ queue_list); ++ __bfq_exit_single_io_context(bfqd, cic); ++ } ++ ++ BUG_ON(bfqd->active_queue != NULL); ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) ++ bfq_deactivate_bfqq(bfqd, bfqq, 0); ++ ++ bfq_disconnect_groups(bfqd); ++ spin_unlock_irq(q->queue_lock); ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ spin_lock(&cic_index_lock); ++ ida_remove(&cic_index_ida, bfqd->cic_index); ++ spin_unlock(&cic_index_lock); ++ ++ /* Wait for cic->key accessors to exit their grace periods. 
*/ ++ synchronize_rcu(); ++ ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); ++ ++ bfq_free_root_group(bfqd); ++ kfree(bfqd); ++} ++ ++static int bfq_alloc_cic_index(void) ++{ ++ int index, error; ++ ++ do { ++ if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ spin_lock(&cic_index_lock); ++ error = ida_get_new(&cic_index_ida, &index); ++ spin_unlock(&cic_index_lock); ++ if (error && error != -EAGAIN) ++ return error; ++ } while (error); ++ ++ return index; ++} ++ ++static void *bfq_init_queue(struct request_queue *q) ++{ ++ struct bfq_group *bfqg; ++ struct bfq_data *bfqd; ++ int i; ++ ++ i = bfq_alloc_cic_index(); ++ if (i < 0) ++ return NULL; ++ ++ bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node); ++ if (bfqd == NULL) ++ return NULL; ++ ++ bfqd->cic_index = i; ++ ++ /* ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. ++ * Grab a permanent reference to it, so that the normal code flow ++ * will not attempt to free it. ++ */ ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0); ++ atomic_inc(&bfqd->oom_bfqq.ref); ++ ++ INIT_LIST_HEAD(&bfqd->cic_list); ++ ++ bfqd->queue = q; ++ ++ bfqg = bfq_alloc_root_group(bfqd, q->node); ++ if (bfqg == NULL) { ++ kfree(bfqd); ++ return NULL; ++ } ++ ++ bfqd->root_group = bfqg; ++ ++ init_timer(&bfqd->idle_slice_timer); ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd; ++ ++ bfqd->rq_pos_tree = RB_ROOT; ++ ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); ++ ++ INIT_LIST_HEAD(&bfqd->active_list); ++ INIT_LIST_HEAD(&bfqd->idle_list); ++ ++ bfqd->hw_tag = -1; ++ ++ bfqd->bfq_max_budget = bfq_default_max_budget; ++ ++ bfqd->bfq_quantum = bfq_quantum; ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; ++ bfqd->bfq_back_max = bfq_back_max; ++ bfqd->bfq_back_penalty = bfq_back_penalty; ++ bfqd->bfq_slice_idle = bfq_slice_idle; ++ bfqd->bfq_class_idle_last_service = 0; ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; ++ ++ bfqd->low_latency = true; ++ ++ bfqd->bfq_raising_coeff = 20; ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300); ++ bfqd->bfq_raising_max_time = 0; ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000); ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500); ++ bfqd->bfq_raising_max_softrt_rate = 7000; ++ ++ /* Initially estimate the device's peak rate as the reference rate */ ++ if (blk_queue_nonrot(bfqd->queue)) { ++ bfqd->RT_prod = R_nonrot * T_nonrot; ++ bfqd->peak_rate = R_nonrot; ++ } else { ++ bfqd->RT_prod = R_rot * T_rot; ++ bfqd->peak_rate = R_rot; ++ } ++ ++ return bfqd; ++} ++ ++static void bfq_slab_kill(void) ++{ ++ if (bfq_pool != NULL) ++ kmem_cache_destroy(bfq_pool); ++ if (bfq_ioc_pool != NULL) ++ kmem_cache_destroy(bfq_ioc_pool); ++} ++ ++static int __init bfq_slab_setup(void) ++{ ++ bfq_pool = KMEM_CACHE(bfq_queue, 0); ++ if (bfq_pool == NULL) ++ goto fail; ++ ++ bfq_ioc_pool = kmem_cache_create("bfq_io_context", ++ sizeof(struct cfq_io_context), ++ __alignof__(struct cfq_io_context), ++ 0, NULL); ++ if (bfq_ioc_pool == NULL) ++ goto fail; ++ ++ return 0; ++fail: ++ bfq_slab_kill(); ++ return -ENOMEM; ++} ++ ++static ssize_t bfq_var_show(unsigned int var, char *page) ++{ ++ return sprintf(page, "%d\n", var); ++} ++ ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count) ++{ ++ unsigned long 
new_val; ++ int ret = strict_strtoul(page, 10, &new_val); ++ ++ if (ret == 0) ++ *var = new_val; ++ ++ return count; ++} ++ ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ? ++ bfqd->bfq_raising_max_time : ++ bfq_wrais_duration(bfqd)); ++} ++ ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_queue *bfqq; ++ struct bfq_data *bfqd = e->elevator_data; ++ ssize_t num_char = 0; ++ ++ num_char += sprintf(page + num_char, "Active:\n"); ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ jiffies_to_msecs(jiffies - ++ bfqq->last_rais_start_finish), ++ jiffies_to_msecs(bfqq->raising_cur_max_time)); ++ } ++ num_char += sprintf(page + num_char, "Idle:\n"); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ jiffies_to_msecs(jiffies - ++ bfqq->last_rais_start_finish), ++ jiffies_to_msecs(bfqq->raising_cur_max_time)); ++ } ++ return num_char; ++} ++ ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned int __data = __VAR; \ ++ if (__CONV) \ ++ __data = jiffies_to_msecs(__data); \ ++ return bfq_var_show(__data, (page)); \ ++} ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0); ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0); ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0); ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1); ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time, ++ 1); ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show, ++ bfqd->bfq_raising_min_inter_arr_async, ++ 1); ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show, ++ bfqd->bfq_raising_max_softrt_rate, 0); ++#undef SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ ++static ssize_t \ ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long __data; \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ if (__CONV) \ ++ *(__PTR) = msecs_to_jiffies(__data); \ ++ else \ ++ *(__PTR) = __data; \ ++ return ret; \ ++} ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, ++ 
INT_MAX, 1); ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, ++ 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_min_idle_time_store, ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store, ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store, ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0); ++#undef STORE_FUNCTION ++ ++/* do nothing for the moment */ ++static ssize_t bfq_weights_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ return count; ++} ++ ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) ++{ ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); ++ ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout); ++ else ++ return bfq_default_max_budget; ++} ++ ++static ssize_t bfq_max_budget_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long __data; ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data == 0) ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); ++ else { ++ if (__data > INT_MAX) ++ __data = INT_MAX; ++ bfqd->bfq_max_budget = __data; ++ } ++ ++ bfqd->bfq_user_max_budget = __data; ++ ++ return ret; ++} ++ ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long __data; ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data < 1) ++ __data = 1; ++ else if (__data > INT_MAX) ++ __data = INT_MAX; ++ ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); ++ if (bfqd->bfq_user_max_budget == 0) ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); ++ ++ return ret; ++} ++ ++static ssize_t bfq_low_latency_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long __data; ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ bfqd->low_latency = __data; ++ ++ return ret; ++} ++ ++#define BFQ_ATTR(name) \ ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) ++ ++static struct elv_fs_entry bfq_attrs[] = { ++ BFQ_ATTR(quantum), ++ BFQ_ATTR(fifo_expire_sync), ++ BFQ_ATTR(fifo_expire_async), ++ BFQ_ATTR(back_seek_max), ++ BFQ_ATTR(back_seek_penalty), ++ BFQ_ATTR(slice_idle), ++ BFQ_ATTR(max_budget), ++ BFQ_ATTR(max_budget_async_rq), ++ BFQ_ATTR(timeout_sync), ++ BFQ_ATTR(timeout_async), ++ BFQ_ATTR(low_latency), ++ BFQ_ATTR(raising_coeff), ++ BFQ_ATTR(raising_max_time), ++ BFQ_ATTR(raising_rt_max_time), ++ BFQ_ATTR(raising_min_idle_time), ++ BFQ_ATTR(raising_min_inter_arr_async), ++ BFQ_ATTR(raising_max_softrt_rate), ++ BFQ_ATTR(weights), ++ __ATTR_NULL ++}; 
++ ++static struct elevator_type iosched_bfq = { ++ .ops = { ++ .elevator_merge_fn = bfq_merge, ++ .elevator_merged_fn = bfq_merged_request, ++ .elevator_merge_req_fn = bfq_merged_requests, ++ .elevator_allow_merge_fn = bfq_allow_merge, ++ .elevator_dispatch_fn = bfq_dispatch_requests, ++ .elevator_add_req_fn = bfq_insert_request, ++ .elevator_activate_req_fn = bfq_activate_request, ++ .elevator_deactivate_req_fn = bfq_deactivate_request, ++ .elevator_completed_req_fn = bfq_completed_request, ++ .elevator_former_req_fn = elv_rb_former_request, ++ .elevator_latter_req_fn = elv_rb_latter_request, ++ .elevator_set_req_fn = bfq_set_request, ++ .elevator_put_req_fn = bfq_put_request, ++ .elevator_may_queue_fn = bfq_may_queue, ++ .elevator_init_fn = bfq_init_queue, ++ .elevator_exit_fn = bfq_exit_queue, ++ .trim = bfq_free_io_context, ++ }, ++ .elevator_attrs = bfq_attrs, ++ .elevator_name = "bfq", ++ .elevator_owner = THIS_MODULE, ++}; ++ ++static int __init bfq_init(void) ++{ ++ /* ++ * Can be 0 on HZ < 1000 setups. ++ */ ++ if (bfq_slice_idle == 0) ++ bfq_slice_idle = 1; ++ ++ if (bfq_timeout_async == 0) ++ bfq_timeout_async = 1; ++ ++ if (bfq_slab_setup()) ++ return -ENOMEM; ++ ++ elv_register(&iosched_bfq); ++ ++ return 0; ++} ++ ++static void __exit bfq_exit(void) ++{ ++ DECLARE_COMPLETION_ONSTACK(all_gone); ++ elv_unregister(&iosched_bfq); ++ bfq_ioc_gone = &all_gone; ++ /* bfq_ioc_gone's update must be visible before reading bfq_ioc_count */ ++ smp_wmb(); ++ if (elv_ioc_count_read(bfq_ioc_count) != 0) ++ wait_for_completion(&all_gone); ++ ida_destroy(&cic_index_ida); ++ bfq_slab_kill(); ++} ++ ++module_init(bfq_init); ++module_exit(bfq_exit); ++ ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler"); +diff --git a/block/bfq-sched.c block/bfq-sched.c +new file mode 100644 +index 0000000..fd50b7f +--- /dev/null ++++ block/bfq-sched.c +@@ -0,0 +1,1066 @@ ++/* ++ * BFQ: Hierarchical B-WF2Q+ scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ */ ++ ++#ifdef CONFIG_CGROUP_BFQIO ++#define for_each_entity(entity) \ ++ for (; entity != NULL; entity = entity->parent) ++ ++#define for_each_entity_safe(entity, parent) \ ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) ++ ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, ++ int extract, ++ struct bfq_data *bfqd); ++ ++static inline void bfq_update_budget(struct bfq_entity *next_active) ++{ ++ struct bfq_entity *bfqg_entity; ++ struct bfq_group *bfqg; ++ struct bfq_sched_data *group_sd; ++ ++ BUG_ON(next_active == NULL); ++ ++ group_sd = next_active->sched_data; ++ ++ bfqg = container_of(group_sd, struct bfq_group, sched_data); ++ /* ++ * bfq_group's my_entity field is not NULL only if the group ++ * is not the root group. We must not touch the root entity ++ * as it must never become an active entity. 
++ */ ++ bfqg_entity = bfqg->my_entity; ++ if (bfqg_entity != NULL) ++ bfqg_entity->budget = next_active->budget; ++} ++ ++static int bfq_update_next_active(struct bfq_sched_data *sd) ++{ ++ struct bfq_entity *next_active; ++ ++ if (sd->active_entity != NULL) ++ /* will update/requeue at the end of service */ ++ return 0; ++ ++ /* ++ * NOTE: this can be improved in many ways, such as returning ++ * 1 (and thus propagating upwards the update) only when the ++ * budget changes, or caching the bfqq that will be scheduled ++ * next from this subtree. By now we worry more about ++ * correctness than about performance... ++ */ ++ next_active = bfq_lookup_next_entity(sd, 0, NULL); ++ sd->next_active = next_active; ++ ++ if (next_active != NULL) ++ bfq_update_budget(next_active); ++ ++ return 1; ++} ++ ++static inline void bfq_check_next_active(struct bfq_sched_data *sd, ++ struct bfq_entity *entity) ++{ ++ BUG_ON(sd->next_active != entity); ++} ++#else ++#define for_each_entity(entity) \ ++ for (; entity != NULL; entity = NULL) ++ ++#define for_each_entity_safe(entity, parent) \ ++ for (parent = NULL; entity != NULL; entity = parent) ++ ++static inline int bfq_update_next_active(struct bfq_sched_data *sd) ++{ ++ return 0; ++} ++ ++static inline void bfq_check_next_active(struct bfq_sched_data *sd, ++ struct bfq_entity *entity) ++{ ++} ++ ++static inline void bfq_update_budget(struct bfq_entity *next_active) ++{ ++} ++#endif ++ ++/* ++ * Shift for timestamp calculations. This actually limits the maximum ++ * service allowed in one timestamp delta (small shift values increase it), ++ * the maximum total weight that can be used for the queues in the system ++ * (big shift values increase it), and the period of virtual time wraparounds. ++ */ ++#define WFQ_SERVICE_SHIFT 22 ++ ++/** ++ * bfq_gt - compare two timestamps. ++ * @a: first ts. ++ * @b: second ts. ++ * ++ * Return @a > @b, dealing with wrapping correctly. ++ */ ++static inline int bfq_gt(u64 a, u64 b) ++{ ++ return (s64)(a - b) > 0; ++} ++ ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = NULL; ++ ++ BUG_ON(entity == NULL); ++ ++ if (entity->my_sched_data == NULL) ++ bfqq = container_of(entity, struct bfq_queue, entity); ++ ++ return bfqq; ++} ++ ++ ++/** ++ * bfq_delta - map service into the virtual time domain. ++ * @service: amount of service. ++ * @weight: scale factor (weight of an entity or weight sum). ++ */ ++static inline u64 bfq_delta(unsigned long service, ++ unsigned long weight) ++{ ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT; ++ ++ do_div(d, weight); ++ return d; ++} ++ ++/** ++ * bfq_calc_finish - assign the finish time to an entity. ++ * @entity: the entity to act upon. ++ * @service: the service to be charged to the entity. ++ */ ++static inline void bfq_calc_finish(struct bfq_entity *entity, ++ unsigned long service) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(entity->weight == 0); ++ ++ entity->finish = entity->start + ++ bfq_delta(service, entity->weight); ++ ++ if (bfqq != NULL) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "calc_finish: serv %lu, w %d", ++ service, entity->weight); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "calc_finish: start %llu, finish %llu, delta %llu", ++ entity->start, entity->finish, ++ bfq_delta(service, entity->weight)); ++ } ++} ++ ++/** ++ * bfq_entity_of - get an entity from a node. ++ * @node: the node field of the entity. ++ * ++ * Convert a node pointer to the relative entity. 
This is used only ++ * to simplify the logic of some functions and not as the generic ++ * conversion mechanism because, e.g., in the tree walking functions, ++ * the check for a %NULL value would be redundant. ++ */ ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) ++{ ++ struct bfq_entity *entity = NULL; ++ ++ if (node != NULL) ++ entity = rb_entry(node, struct bfq_entity, rb_node); ++ ++ return entity; ++} ++ ++/** ++ * bfq_extract - remove an entity from a tree. ++ * @root: the tree root. ++ * @entity: the entity to remove. ++ */ ++static inline void bfq_extract(struct rb_root *root, ++ struct bfq_entity *entity) ++{ ++ BUG_ON(entity->tree != root); ++ ++ entity->tree = NULL; ++ rb_erase(&entity->rb_node, root); ++} ++ ++/** ++ * bfq_idle_extract - extract an entity from the idle tree. ++ * @st: the service tree of the owning @entity. ++ * @entity: the entity being removed. ++ */ ++static void bfq_idle_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *next; ++ ++ BUG_ON(entity->tree != &st->idle); ++ ++ if (entity == st->first_idle) { ++ next = rb_next(&entity->rb_node); ++ st->first_idle = bfq_entity_of(next); ++ } ++ ++ if (entity == st->last_idle) { ++ next = rb_prev(&entity->rb_node); ++ st->last_idle = bfq_entity_of(next); ++ } ++ ++ bfq_extract(&st->idle, entity); ++ ++ if (bfqq != NULL) ++ list_del(&bfqq->bfqq_list); ++} ++ ++/** ++ * bfq_insert - generic tree insertion. ++ * @root: tree root. ++ * @entity: entity to insert. ++ * ++ * This is used for the idle and the active tree, since they are both ++ * ordered by finish time. ++ */ ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) ++{ ++ struct bfq_entity *entry; ++ struct rb_node **node = &root->rb_node; ++ struct rb_node *parent = NULL; ++ ++ BUG_ON(entity->tree != NULL); ++ ++ while (*node != NULL) { ++ parent = *node; ++ entry = rb_entry(parent, struct bfq_entity, rb_node); ++ ++ if (bfq_gt(entry->finish, entity->finish)) ++ node = &parent->rb_left; ++ else ++ node = &parent->rb_right; ++ } ++ ++ rb_link_node(&entity->rb_node, parent, node); ++ rb_insert_color(&entity->rb_node, root); ++ ++ entity->tree = root; ++} ++ ++/** ++ * bfq_update_min - update the min_start field of a entity. ++ * @entity: the entity to update. ++ * @node: one of its children. ++ * ++ * This function is called when @entity may store an invalid value for ++ * min_start due to updates to the active tree. The function assumes ++ * that the subtree rooted at @node (which may be its left or its right ++ * child) has a valid min_start value. ++ */ ++static inline void bfq_update_min(struct bfq_entity *entity, ++ struct rb_node *node) ++{ ++ struct bfq_entity *child; ++ ++ if (node != NULL) { ++ child = rb_entry(node, struct bfq_entity, rb_node); ++ if (bfq_gt(entity->min_start, child->min_start)) ++ entity->min_start = child->min_start; ++ } ++} ++ ++/** ++ * bfq_update_active_node - recalculate min_start. ++ * @node: the node to update. ++ * ++ * @node may have changed position or one of its children may have moved, ++ * this function updates its min_start value. The left and right subtrees ++ * are assumed to hold a correct min_start value. 
++ */ ++static inline void bfq_update_active_node(struct rb_node *node) ++{ ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); ++ ++ entity->min_start = entity->start; ++ bfq_update_min(entity, node->rb_right); ++ bfq_update_min(entity, node->rb_left); ++} ++ ++/** ++ * bfq_update_active_tree - update min_start for the whole active tree. ++ * @node: the starting node. ++ * ++ * @node must be the deepest modified node after an update. This function ++ * updates its min_start using the values held by its children, assuming ++ * that they did not change, and then updates all the nodes that may have ++ * changed in the path to the root. The only nodes that may have changed ++ * are the ones in the path or their siblings. ++ */ ++static void bfq_update_active_tree(struct rb_node *node) ++{ ++ struct rb_node *parent; ++ ++up: ++ bfq_update_active_node(node); ++ ++ parent = rb_parent(node); ++ if (parent == NULL) ++ return; ++ ++ if (node == parent->rb_left && parent->rb_right != NULL) ++ bfq_update_active_node(parent->rb_right); ++ else if (parent->rb_left != NULL) ++ bfq_update_active_node(parent->rb_left); ++ ++ node = parent; ++ goto up; ++} ++ ++/** ++ * bfq_active_insert - insert an entity in the active tree of its group/device. ++ * @st: the service tree of the entity. ++ * @entity: the entity being inserted. ++ * ++ * The active tree is ordered by finish time, but an extra key is kept ++ * per each node, containing the minimum value for the start times of ++ * its children (and the node itself), so it's possible to search for ++ * the eligible node with the lowest finish time in logarithmic time. ++ */ ++static void bfq_active_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node = &entity->rb_node; ++ ++ bfq_insert(&st->active, entity); ++ ++ if (node->rb_left != NULL) ++ node = node->rb_left; ++ else if (node->rb_right != NULL) ++ node = node->rb_right; ++ ++ bfq_update_active_tree(node); ++ ++ if (bfqq != NULL) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); ++} ++ ++/** ++ * bfq_ioprio_to_weight - calc a weight from an ioprio. ++ * @ioprio: the ioprio value to convert. ++ */ ++static unsigned short bfq_ioprio_to_weight(int ioprio) ++{ ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); ++ return IOPRIO_BE_NR - ioprio; ++} ++ ++/** ++ * bfq_weight_to_ioprio - calc an ioprio from a weight. ++ * @weight: the weight value to convert. ++ * ++ * To preserve as mush as possible the old only-ioprio user interface, ++ * 0 is used as an escape ioprio value for weights (numerically) equal or ++ * larger than IOPRIO_BE_NR ++ */ ++static unsigned short bfq_weight_to_ioprio(int weight) ++{ ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; ++} ++ ++static inline void bfq_get_entity(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ if (bfqq != NULL) { ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ } ++} ++ ++/** ++ * bfq_find_deepest - find the deepest node that an extraction can modify. ++ * @node: the node being removed. ++ * ++ * Do the first step of an extraction in an rb tree, looking for the ++ * node that will replace @node, and returning the deepest node that ++ * the following modifications to the tree can touch. If @node is the ++ * last node in the tree return %NULL. 
++ */ ++static struct rb_node *bfq_find_deepest(struct rb_node *node) ++{ ++ struct rb_node *deepest; ++ ++ if (node->rb_right == NULL && node->rb_left == NULL) ++ deepest = rb_parent(node); ++ else if (node->rb_right == NULL) ++ deepest = node->rb_left; ++ else if (node->rb_left == NULL) ++ deepest = node->rb_right; ++ else { ++ deepest = rb_next(node); ++ if (deepest->rb_right != NULL) ++ deepest = deepest->rb_right; ++ else if (rb_parent(deepest) != node) ++ deepest = rb_parent(deepest); ++ } ++ ++ return deepest; ++} ++ ++/** ++ * bfq_active_extract - remove an entity from the active tree. ++ * @st: the service_tree containing the tree. ++ * @entity: the entity being removed. ++ */ ++static void bfq_active_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node; ++ ++ node = bfq_find_deepest(&entity->rb_node); ++ bfq_extract(&st->active, entity); ++ ++ if (node != NULL) ++ bfq_update_active_tree(node); ++ ++ if (bfqq != NULL) ++ list_del(&bfqq->bfqq_list); ++} ++ ++/** ++ * bfq_idle_insert - insert an entity into the idle tree. ++ * @st: the service tree containing the tree. ++ * @entity: the entity to insert. ++ */ ++static void bfq_idle_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) ++ st->first_idle = entity; ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) ++ st->last_idle = entity; ++ ++ bfq_insert(&st->idle, entity); ++ ++ if (bfqq != NULL) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); ++} ++ ++/** ++ * bfq_forget_entity - remove an entity from the wfq trees. ++ * @st: the service tree. ++ * @entity: the entity being removed. ++ * ++ * Update the device status and forget everything about @entity, putting ++ * the device reference to it, if it is a queue. Entities belonging to ++ * groups are not refcounted. ++ */ ++static void bfq_forget_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(!entity->on_st); ++ ++ entity->on_st = 0; ++ st->wsum -= entity->weight; ++ if (bfqq != NULL) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/** ++ * bfq_put_idle_entity - release the idle tree ref of an entity. ++ * @st: service tree for the entity. ++ * @entity: the entity being released. ++ */ ++static void bfq_put_idle_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ bfq_idle_extract(st, entity); ++ bfq_forget_entity(st, entity); ++} ++ ++/** ++ * bfq_forget_idle - update the idle tree if necessary. ++ * @st: the service tree to act upon. ++ * ++ * To preserve the global O(log N) complexity we only remove one entry here; ++ * as the idle tree will not grow indefinitely this can be done safely. ++ */ ++static void bfq_forget_idle(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && ++ !bfq_gt(last_idle->finish, st->vtime)) { ++ /* ++ * Forget the whole idle tree, increasing the vtime past ++ * the last finish time of idle entities. 
++ */ ++ st->vtime = last_idle->finish; ++ } ++ ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) ++ bfq_put_idle_entity(st, first_idle); ++} ++ ++static struct bfq_service_tree * ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_service_tree *new_st = old_st; ++ ++ if (entity->ioprio_changed) { ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(old_st->wsum < entity->weight); ++ old_st->wsum -= entity->weight; ++ ++ if (entity->new_weight != entity->orig_weight) { ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = ++ bfq_weight_to_ioprio(entity->orig_weight); ++ } else if (entity->new_ioprio != entity->ioprio) { ++ entity->ioprio = entity->new_ioprio; ++ entity->orig_weight = ++ bfq_ioprio_to_weight(entity->ioprio); ++ } else ++ entity->new_weight = entity->orig_weight = ++ bfq_ioprio_to_weight(entity->ioprio); ++ ++ entity->ioprio_class = entity->new_ioprio_class; ++ entity->ioprio_changed = 0; ++ ++ /* ++ * NOTE: here we may be changing the weight too early, ++ * this will cause unfairness. The correct approach ++ * would have required additional complexity to defer ++ * weight changes to the proper time instants (i.e., ++ * when entity->finish <= old_st->vtime). ++ */ ++ new_st = bfq_entity_service_tree(entity); ++ entity->weight = entity->orig_weight * ++ (bfqq != NULL ? bfqq->raising_coeff : 1); ++ new_st->wsum += entity->weight; ++ ++ if (new_st != old_st) ++ entity->start = new_st->vtime; ++ } ++ ++ return new_st; ++} ++ ++/** ++ * bfq_bfqq_served - update the scheduler status after selection for service. ++ * @bfqq: the queue being served. ++ * @served: bytes to transfer. ++ * ++ * NOTE: this can be optimized, as the timestamps of upper level entities ++ * are synchronized every time a new bfqq is selected for service. By now, ++ * we keep it to better check consistency. ++ */ ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st; ++ ++ for_each_entity(entity) { ++ st = bfq_entity_service_tree(entity); ++ ++ entity->service += served; ++ BUG_ON(entity->service > entity->budget); ++ BUG_ON(st->wsum == 0); ++ ++ st->vtime += bfq_delta(served, st->wsum); ++ bfq_forget_idle(st); ++ } ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); ++} ++ ++/** ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget. ++ * @bfqq: the queue that needs a service update. ++ * ++ * When it's not possible to be fair in the service domain, because ++ * a queue is not consuming its budget fast enough (the meaning of ++ * fast depends on the timeout parameter), we charge it a full ++ * budget. In this way we should obtain a sort of time-domain ++ * fairness among all the seeky/slow queues. ++ */ ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); ++ ++ bfq_bfqq_served(bfqq, entity->budget - entity->service); ++} ++ ++/** ++ * __bfq_activate_entity - activate an entity. ++ * @entity: the entity being activated. ++ * ++ * Called whenever an entity is activated, i.e., it is not active and one ++ * of its children receives a new request, or has to be reactivated due to ++ * budget exhaustion. It uses the current budget of the entity (and the ++ * service received if @entity is active) of the queue to calculate its ++ * timestamps. 
++ */ ++static void __bfq_activate_entity(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ ++ if (entity == sd->active_entity) { ++ BUG_ON(entity->tree != NULL); ++ /* ++ * If we are requeueing the current entity we have ++ * to take care of not charging to it service it has ++ * not received. ++ */ ++ bfq_calc_finish(entity, entity->service); ++ entity->start = entity->finish; ++ sd->active_entity = NULL; ++ } else if (entity->tree == &st->active) { ++ /* ++ * Requeueing an entity due to a change of some ++ * next_active entity below it. We reuse the old ++ * start time. ++ */ ++ bfq_active_extract(st, entity); ++ } else if (entity->tree == &st->idle) { ++ /* ++ * Must be on the idle tree, bfq_idle_extract() will ++ * check for that. ++ */ ++ bfq_idle_extract(st, entity); ++ entity->start = bfq_gt(st->vtime, entity->finish) ? ++ st->vtime : entity->finish; ++ } else { ++ /* ++ * The finish time of the entity may be invalid, and ++ * it is in the past for sure, otherwise the queue ++ * would have been on the idle tree. ++ */ ++ entity->start = st->vtime; ++ st->wsum += entity->weight; ++ bfq_get_entity(entity); ++ ++ BUG_ON(entity->on_st); ++ entity->on_st = 1; ++ } ++ ++ st = __bfq_entity_update_weight_prio(st, entity); ++ bfq_calc_finish(entity, entity->budget); ++ bfq_active_insert(st, entity); ++} ++ ++/** ++ * bfq_activate_entity - activate an entity and its ancestors if necessary. ++ * @entity: the entity to activate. ++ * ++ * Activate @entity and all the entities on the path from it to the root. ++ */ ++static void bfq_activate_entity(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sd; ++ ++ for_each_entity(entity) { ++ __bfq_activate_entity(entity); ++ ++ sd = entity->sched_data; ++ if (!bfq_update_next_active(sd)) ++ /* ++ * No need to propagate the activation to the ++ * upper entities, as they will be updated when ++ * the active entity is rescheduled. ++ */ ++ break; ++ } ++} ++ ++/** ++ * __bfq_deactivate_entity - deactivate an entity from its service tree. ++ * @entity: the entity to deactivate. ++ * @requeue: if false, the entity will not be put into the idle tree. ++ * ++ * Deactivate an entity, independently from its previous state. If the ++ * entity was not on a service tree just return, otherwise if it is on ++ * any scheduler tree, extract it from that tree, and if necessary ++ * and if the caller did not specify @requeue, put it on the idle tree. ++ * ++ * Return %1 if the caller should update the entity hierarchy, i.e., ++ * if the entity was under service or if it was the next_active for ++ * its sched_data; return %0 otherwise. 
++ */ ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ int was_active = entity == sd->active_entity; ++ int ret = 0; ++ ++ if (!entity->on_st) ++ return 0; ++ ++ BUG_ON(was_active && entity->tree != NULL); ++ ++ if (was_active) { ++ bfq_calc_finish(entity, entity->service); ++ sd->active_entity = NULL; ++ } else if (entity->tree == &st->active) ++ bfq_active_extract(st, entity); ++ else if (entity->tree == &st->idle) ++ bfq_idle_extract(st, entity); ++ else if (entity->tree != NULL) ++ BUG(); ++ ++ if (was_active || sd->next_active == entity) ++ ret = bfq_update_next_active(sd); ++ ++ if (!requeue || !bfq_gt(entity->finish, st->vtime)) ++ bfq_forget_entity(st, entity); ++ else ++ bfq_idle_insert(st, entity); ++ ++ BUG_ON(sd->active_entity == entity); ++ BUG_ON(sd->next_active == entity); ++ ++ return ret; ++} ++ ++/** ++ * bfq_deactivate_entity - deactivate an entity. ++ * @entity: the entity to deactivate. ++ * @requeue: true if the entity can be put on the idle tree ++ */ ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) ++{ ++ struct bfq_sched_data *sd; ++ struct bfq_entity *parent; ++ ++ for_each_entity_safe(entity, parent) { ++ sd = entity->sched_data; ++ ++ if (!__bfq_deactivate_entity(entity, requeue)) ++ /* ++ * The parent entity is still backlogged, and ++ * we don't need to update it as it is still ++ * under service. ++ */ ++ break; ++ ++ if (sd->next_active != NULL) ++ /* ++ * The parent entity is still backlogged and ++ * the budgets on the path towards the root ++ * need to be updated. ++ */ ++ goto update; ++ ++ /* ++ * If we reach there the parent is no more backlogged and ++ * we want to propagate the dequeue upwards. ++ */ ++ requeue = 1; ++ } ++ ++ return; ++ ++update: ++ entity = parent; ++ for_each_entity(entity) { ++ __bfq_activate_entity(entity); ++ ++ sd = entity->sched_data; ++ if (!bfq_update_next_active(sd)) ++ break; ++ } ++} ++ ++/** ++ * bfq_update_vtime - update vtime if necessary. ++ * @st: the service tree to act upon. ++ * ++ * If necessary update the service tree vtime to have at least one ++ * eligible entity, skipping to its start time. Assumes that the ++ * active tree of the device is not empty. ++ * ++ * NOTE: this hierarchical implementation updates vtimes quite often, ++ * we may end up with reactivated tasks getting timestamps after a ++ * vtime skip done because we needed a ->first_active entity on some ++ * intermediate node. ++ */ ++static void bfq_update_vtime(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entry; ++ struct rb_node *node = st->active.rb_node; ++ ++ entry = rb_entry(node, struct bfq_entity, rb_node); ++ if (bfq_gt(entry->min_start, st->vtime)) { ++ st->vtime = entry->min_start; ++ bfq_forget_idle(st); ++ } ++} ++ ++/** ++ * bfq_first_active - find the eligible entity with the smallest finish time ++ * @st: the service tree to select from. ++ * ++ * This function searches the first schedulable entity, starting from the ++ * root of the tree and going on the left every time on this side there is ++ * a subtree with at least one eligible (start >= vtime) entity. The path ++ * on the right is followed only if a) the left subtree contains no eligible ++ * entities and b) no eligible entity has been found yet. 
++ */ ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entry, *first = NULL; ++ struct rb_node *node = st->active.rb_node; ++ ++ while (node != NULL) { ++ entry = rb_entry(node, struct bfq_entity, rb_node); ++left: ++ if (!bfq_gt(entry->start, st->vtime)) ++ first = entry; ++ ++ BUG_ON(bfq_gt(entry->min_start, st->vtime)); ++ ++ if (node->rb_left != NULL) { ++ entry = rb_entry(node->rb_left, ++ struct bfq_entity, rb_node); ++ if (!bfq_gt(entry->min_start, st->vtime)) { ++ node = node->rb_left; ++ goto left; ++ } ++ } ++ if (first != NULL) ++ break; ++ node = node->rb_right; ++ } ++ ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); ++ return first; ++} ++ ++/** ++ * __bfq_lookup_next_entity - return the first eligible entity in @st. ++ * @st: the service tree. ++ * ++ * Update the virtual time in @st and return the first eligible entity ++ * it contains. ++ */ ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, ++ bool force) ++{ ++ struct bfq_entity *entity, *new_next_active = NULL; ++ ++ if (RB_EMPTY_ROOT(&st->active)) ++ return NULL; ++ ++ bfq_update_vtime(st); ++ entity = bfq_first_active_entity(st); ++ BUG_ON(bfq_gt(entity->start, st->vtime)); ++ ++ /* ++ * If the chosen entity does not match with the sched_data's ++ * next_active and we are forcedly serving the IDLE priority ++ * class tree, bubble up budget update. ++ */ ++ if (unlikely(force && entity != entity->sched_data->next_active)) { ++ new_next_active = entity; ++ for_each_entity(new_next_active) ++ bfq_update_budget(new_next_active); ++ } ++ ++ return entity; ++} ++ ++/** ++ * bfq_lookup_next_entity - return the first eligible entity in @sd. ++ * @sd: the sched_data. ++ * @extract: if true the returned entity will be also extracted from @sd. ++ * ++ * NOTE: since we cache the next_active entity at each level of the ++ * hierarchy, the complexity of the lookup can be decreased with ++ * absolutely no effort just returning the cached next_active value; ++ * we prefer to do full lookups to test the consistency of * the data ++ * structures. ++ */ ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, ++ int extract, ++ struct bfq_data *bfqd) ++{ ++ struct bfq_service_tree *st = sd->service_tree; ++ struct bfq_entity *entity; ++ int i=0; ++ ++ BUG_ON(sd->active_entity != NULL); ++ ++ if (bfqd != NULL && ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); ++ if (entity != NULL) { ++ i = BFQ_IOPRIO_CLASSES - 1; ++ bfqd->bfq_class_idle_last_service = jiffies; ++ sd->next_active = entity; ++ } ++ } ++ for (; i < BFQ_IOPRIO_CLASSES; i++) { ++ entity = __bfq_lookup_next_entity(st + i, false); ++ if (entity != NULL) { ++ if (extract) { ++ bfq_check_next_active(sd, entity); ++ bfq_active_extract(st + i, entity); ++ sd->active_entity = entity; ++ sd->next_active = NULL; ++ } ++ break; ++ } ++ } ++ ++ return entity; ++} ++ ++/* ++ * Get next queue for service. 
++ */ ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_entity *entity = NULL; ++ struct bfq_sched_data *sd; ++ struct bfq_queue *bfqq; ++ ++ BUG_ON(bfqd->active_queue != NULL); ++ ++ if (bfqd->busy_queues == 0) ++ return NULL; ++ ++ sd = &bfqd->root_group->sched_data; ++ for (; sd != NULL; sd = entity->my_sched_data) { ++ entity = bfq_lookup_next_entity(sd, 1, bfqd); ++ BUG_ON(entity == NULL); ++ entity->service = 0; ++ } ++ ++ bfqq = bfq_entity_to_bfqq(entity); ++ BUG_ON(bfqq == NULL); ++ ++ return bfqq; ++} ++ ++/* ++ * Forced extraction of the given queue. ++ */ ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity; ++ struct bfq_sched_data *sd; ++ ++ BUG_ON(bfqd->active_queue != NULL); ++ ++ entity = &bfqq->entity; ++ /* ++ * Bubble up extraction/update from the leaf to the root. ++ */ ++ for_each_entity(entity) { ++ sd = entity->sched_data; ++ bfq_update_budget(entity); ++ bfq_update_vtime(bfq_entity_service_tree(entity)); ++ bfq_active_extract(bfq_entity_service_tree(entity), entity); ++ sd->active_entity = entity; ++ sd->next_active = NULL; ++ entity->service = 0; ++ } ++ ++ return; ++} ++ ++static void __bfq_bfqd_reset_active(struct bfq_data *bfqd) ++{ ++ if (bfqd->active_cic != NULL) { ++ put_io_context(bfqd->active_cic->ioc); ++ bfqd->active_cic = NULL; ++ } ++ ++ bfqd->active_queue = NULL; ++ del_timer(&bfqd->idle_slice_timer); ++} ++ ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int requeue) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (bfqq == bfqd->active_queue) ++ __bfq_bfqd_reset_active(bfqd); ++ ++ bfq_deactivate_entity(entity, requeue); ++} ++ ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_activate_entity(entity); ++} ++ ++/* ++ * Called when the bfqq no longer has requests pending, remove it from ++ * the service tree. ++ */ ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int requeue) ++{ ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ bfq_log_bfqq(bfqd, bfqq, "del from busy"); ++ ++ bfq_clear_bfqq_busy(bfqq); ++ ++ BUG_ON(bfqd->busy_queues == 0); ++ bfqd->busy_queues--; ++ ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue); ++} ++ ++/* ++ * Called when an inactive queue receives a new request. ++ */ ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ BUG_ON(bfqq == bfqd->active_queue); ++ ++ bfq_log_bfqq(bfqd, bfqq, "add to busy"); ++ ++ bfq_activate_bfqq(bfqd, bfqq); ++ ++ bfq_mark_bfqq_busy(bfqq); ++ bfqd->busy_queues++; ++} +diff --git a/block/bfq.h block/bfq.h +new file mode 100644 +index 0000000..f23a9a5 +--- /dev/null ++++ block/bfq.h +@@ -0,0 +1,593 @@ ++/* ++ * BFQ-v5 for 3.2.0: data structures and common functions prototypes. 
++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ */ ++ ++#ifndef _BFQ_H ++#define _BFQ_H ++ ++#include ++#include ++#include ++#include ++ ++#define BFQ_IOPRIO_CLASSES 3 ++#define BFQ_CL_IDLE_TIMEOUT HZ/5 ++ ++#define BFQ_MIN_WEIGHT 1 ++#define BFQ_MAX_WEIGHT 1000 ++ ++#define BFQ_DEFAULT_GRP_WEIGHT 10 ++#define BFQ_DEFAULT_GRP_IOPRIO 0 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE ++ ++struct bfq_entity; ++ ++/** ++ * struct bfq_service_tree - per ioprio_class service tree. ++ * @active: tree for active entities (i.e., those backlogged). ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). ++ * @first_idle: idle entity with minimum F_i. ++ * @last_idle: idle entity with maximum F_i. ++ * @vtime: scheduler virtual time. ++ * @wsum: scheduler weight sum; active and idle entities contribute to it. ++ * ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each ++ * ioprio_class has its own independent scheduler, and so its own ++ * bfq_service_tree. All the fields are protected by the queue lock ++ * of the containing bfqd. ++ */ ++struct bfq_service_tree { ++ struct rb_root active; ++ struct rb_root idle; ++ ++ struct bfq_entity *first_idle; ++ struct bfq_entity *last_idle; ++ ++ u64 vtime; ++ unsigned long wsum; ++}; ++ ++/** ++ * struct bfq_sched_data - multi-class scheduler. ++ * @active_entity: entity under service. ++ * @next_active: head-of-the-line entity in the scheduler. ++ * @service_tree: array of service trees, one per ioprio_class. ++ * ++ * bfq_sched_data is the basic scheduler queue. It supports three ++ * ioprio_classes, and can be used either as a toplevel queue or as ++ * an intermediate queue on a hierarchical setup. ++ * @next_active points to the active entity of the sched_data service ++ * trees that will be scheduled next. ++ * ++ * The supported ioprio_classes are the same as in CFQ, in descending ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. ++ * Requests from higher priority queues are served before all the ++ * requests from lower priority queues; among requests of the same ++ * queue requests are served according to B-WF2Q+. ++ * All the fields are protected by the queue lock of the containing bfqd. ++ */ ++struct bfq_sched_data { ++ struct bfq_entity *active_entity; ++ struct bfq_entity *next_active; ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; ++}; ++ ++/** ++ * struct bfq_entity - schedulable entity. ++ * @rb_node: service_tree member. ++ * @on_st: flag, true if the entity is on a tree (either the active or ++ * the idle one of its service_tree). ++ * @finish: B-WF2Q+ finish timestamp (aka F_i). ++ * @start: B-WF2Q+ start timestamp (aka S_i). ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree. ++ * @min_start: minimum start time of the (active) subtree rooted at ++ * this entity; used for O(log N) lookups into active trees. ++ * @service: service received during the last round of service. ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. ++ * @weight: weight of the queue ++ * @parent: parent entity, for hierarchical scheduling. ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the ++ * associated scheduler queue, %NULL on leaf nodes. ++ * @sched_data: the scheduler queue this entity belongs to. ++ * @ioprio: the ioprio in use. ++ * @new_weight: when a weight change is requested, the new weight value. 
++ * @orig_weight: original weight, used to implement weight boosting ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value. ++ * @ioprio_class: the ioprio_class in use. ++ * @new_ioprio_class: when an ioprio_class change is requested, the new ++ * ioprio_class value. ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or ++ * ioprio_class change. ++ * ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each ++ * entity belongs to the sched_data of the parent group in the cgroup ++ * hierarchy. Non-leaf entities have also their own sched_data, stored ++ * in @my_sched_data. ++ * ++ * Each entity stores independently its priority values; this would ++ * allow different weights on different devices, but this ++ * functionality is not exported to userspace by now. Priorities and ++ * weights are updated lazily, first storing the new values into the ++ * new_* fields, then setting the @ioprio_changed flag. As soon as ++ * there is a transition in the entity state that allows the priority ++ * update to take place the effective and the requested priority ++ * values are synchronized. ++ * ++ * Unless cgroups are used, the weight value is calculated from the ++ * ioprio to export the same interface as CFQ. When dealing with ++ * ``well-behaved'' queues (i.e., queues that do not spend too much ++ * time to consume their budget and have true sequential behavior, and ++ * when there are no external factors breaking anticipation) the ++ * relative weights at each level of the cgroups hierarchy should be ++ * guaranteed. All the fields are protected by the queue lock of the ++ * containing bfqd. ++ */ ++struct bfq_entity { ++ struct rb_node rb_node; ++ ++ int on_st; ++ ++ u64 finish; ++ u64 start; ++ ++ struct rb_root *tree; ++ ++ u64 min_start; ++ ++ unsigned long service, budget; ++ unsigned short weight, new_weight; ++ unsigned short orig_weight; ++ ++ struct bfq_entity *parent; ++ ++ struct bfq_sched_data *my_sched_data; ++ struct bfq_sched_data *sched_data; ++ ++ unsigned short ioprio, new_ioprio; ++ unsigned short ioprio_class, new_ioprio_class; ++ ++ int ioprio_changed; ++}; ++ ++struct bfq_group; ++ ++/** ++ * struct bfq_queue - leaf schedulable entity. ++ * @ref: reference counter. ++ * @bfqd: parent bfq_data. ++ * @new_bfqq: shared bfq_queue if queue is cooperating with ++ * one or more other queues. ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). ++ * @sort_list: sorted list of pending requests. ++ * @next_rq: if fifo isn't expired, next request to serve. ++ * @queued: nr of requests queued in @sort_list. ++ * @allocated: currently allocated requests. ++ * @meta_pending: pending metadata requests. ++ * @fifo: fifo list of requests in sort_list. ++ * @entity: entity representing this queue in the scheduler. ++ * @max_budget: maximum budget allowed from the feedback mechanism. ++ * @budget_timeout: budget expiration (in jiffies). ++ * @dispatched: number of requests on the dispatch list or inside driver. ++ * @org_ioprio: saved ioprio during boosted periods. ++ * @flags: status flags. ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd. 
++ * @seek_samples: number of seeks sampled ++ * @seek_total: sum of the distances of the seeks sampled ++ * @seek_mean: mean seek distance ++ * @last_request_pos: position of the last request enqueued ++ * @pid: pid of the process owning the queue, used for logging purposes. ++ * @last_rais_start_time: last (idle -> weight-raised) transition attempt ++ * @raising_cur_max_time: current max raising time for this queue ++ * ++ * A bfq_queue is a leaf request queue; it can be associated to an io_context ++ * or more (if it is an async one). @cgroup holds a reference to the ++ * cgroup, to be sure that it does not disappear while a bfqq still ++ * references it (mostly to avoid races between request issuing and task ++ * migration followed by cgroup distruction). ++ * All the fields are protected by the queue lock of the containing bfqd. ++ */ ++struct bfq_queue { ++ atomic_t ref; ++ struct bfq_data *bfqd; ++ ++ /* fields for cooperating queues handling */ ++ struct bfq_queue *new_bfqq; ++ struct rb_node pos_node; ++ struct rb_root *pos_root; ++ ++ struct rb_root sort_list; ++ struct request *next_rq; ++ int queued[2]; ++ int allocated[2]; ++ int meta_pending; ++ struct list_head fifo; ++ ++ struct bfq_entity entity; ++ ++ unsigned long max_budget; ++ unsigned long budget_timeout; ++ ++ int dispatched; ++ ++ unsigned short org_ioprio; ++ ++ unsigned int flags; ++ ++ struct list_head bfqq_list; ++ ++ unsigned int seek_samples; ++ u64 seek_total; ++ sector_t seek_mean; ++ sector_t last_request_pos; ++ ++ pid_t pid; ++ ++ /* weight-raising fields */ ++ unsigned int raising_cur_max_time; ++ u64 last_rais_start_finish, soft_rt_next_start; ++ unsigned int raising_coeff; ++}; ++ ++/** ++ * struct bfq_data - per device data structure. ++ * @queue: request queue for the managed device. ++ * @root_group: root bfq_group for the device. ++ * @rq_pos_tree: rbtree sorted by next_request position, ++ * used when determining if two or more queues ++ * have interleaving requests (see bfq_close_cooperator). ++ * @busy_queues: number of bfq_queues containing requests (including the ++ * queue under service, even if it is idling). ++ * @queued: number of queued requests. ++ * @rq_in_driver: number of requests dispatched and waiting for completion. ++ * @sync_flight: number of sync requests in the driver. ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples ++ * completed requests . ++ * @hw_tag_samples: nr of samples used to calculate hw_tag. ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior. ++ * @budgets_assigned: number of budgets assigned. ++ * @idle_slice_timer: timer set when idling for the next sequential request ++ * from the queue under service. ++ * @unplug_work: delayed work to restart dispatching on the request queue. ++ * @active_queue: bfq_queue under service. ++ * @active_cic: cfq_io_context (cic) associated with the @active_queue. ++ * @last_position: on-disk position of the last served request. ++ * @last_budget_start: beginning of the last budget. ++ * @last_idling_start: beginning of the last idle slice. ++ * @peak_rate: peak transfer rate observed for a budget. ++ * @peak_rate_samples: number of samples used to calculate @peak_rate. ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling. ++ * @cic_index: use small consequent indexes as radix tree keys to reduce depth ++ * @cic_list: list of all the cics active on the bfq_data device. ++ * @group_list: list of all the bfq_groups active on the device. 
++ * @active_list: list of all the bfq_queues active on the device. ++ * @idle_list: list of all the bfq_queues idle on the device. ++ * @bfq_quantum: max number of requests dispatched per dispatch round. ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires ++ * requests are served in fifo order. ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones. ++ * @bfq_back_max: maximum allowed backward seek. ++ * @bfq_slice_idle: maximum idling time. ++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning). ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to ++ * async queues. ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to ++ * to prevent seeky queues to impose long latencies to well ++ * behaved ones (this also implies that seeky queues cannot ++ * receive guarantees in the service domain; after a timeout ++ * they are charged for the whole allocated budget, to try ++ * to preserve a behavior reasonably fair among them, but ++ * without service-domain guarantees). ++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted ++ * queue is multiplied ++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies) ++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes ++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising ++ * may be reactivated for a queue (in jiffies) ++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals ++ * after which weight-raising may be ++ * reactivated for an already busy queue ++ * (in jiffies) ++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue, ++ * sectors per seconds ++ * @RT_prod: cached value of the product R*T used for computing the maximum ++ * duration of the weight raising automatically ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions ++ * ++ * All the fields are protected by the @queue lock. 
++ */ ++struct bfq_data { ++ struct request_queue *queue; ++ ++ struct bfq_group *root_group; ++ ++ struct rb_root rq_pos_tree; ++ ++ int busy_queues; ++ int queued; ++ int rq_in_driver; ++ int sync_flight; ++ ++ int max_rq_in_driver; ++ int hw_tag_samples; ++ int hw_tag; ++ ++ int budgets_assigned; ++ ++ struct timer_list idle_slice_timer; ++ struct work_struct unplug_work; ++ ++ struct bfq_queue *active_queue; ++ struct cfq_io_context *active_cic; ++ ++ sector_t last_position; ++ ++ ktime_t last_budget_start; ++ ktime_t last_idling_start; ++ int peak_rate_samples; ++ u64 peak_rate; ++ unsigned long bfq_max_budget; ++ ++ unsigned int cic_index; ++ struct list_head cic_list; ++ struct hlist_head group_list; ++ struct list_head active_list; ++ struct list_head idle_list; ++ ++ unsigned int bfq_quantum; ++ unsigned int bfq_fifo_expire[2]; ++ unsigned int bfq_back_penalty; ++ unsigned int bfq_back_max; ++ unsigned int bfq_slice_idle; ++ u64 bfq_class_idle_last_service; ++ ++ unsigned int bfq_user_max_budget; ++ unsigned int bfq_max_budget_async_rq; ++ unsigned int bfq_timeout[2]; ++ ++ bool low_latency; ++ ++ /* parameters of the low_latency heuristics */ ++ unsigned int bfq_raising_coeff; ++ unsigned int bfq_raising_max_time; ++ unsigned int bfq_raising_rt_max_time; ++ unsigned int bfq_raising_min_idle_time; ++ unsigned int bfq_raising_min_inter_arr_async; ++ unsigned int bfq_raising_max_softrt_rate; ++ u64 RT_prod; ++ ++ struct bfq_queue oom_bfqq; ++}; ++ ++enum bfqq_state_flags { ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */ ++ BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */ ++}; ++ ++#define BFQ_BFQQ_FNS(name) \ ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ ++{ \ ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ ++} ++ ++BFQ_BFQQ_FNS(busy); ++BFQ_BFQQ_FNS(wait_request); ++BFQ_BFQQ_FNS(must_alloc); ++BFQ_BFQQ_FNS(fifo_expire); ++BFQ_BFQQ_FNS(idle_window); ++BFQ_BFQQ_FNS(prio_changed); ++BFQ_BFQQ_FNS(sync); ++BFQ_BFQQ_FNS(budget_new); ++BFQ_BFQQ_FNS(coop); ++BFQ_BFQQ_FNS(split_coop); ++BFQ_BFQQ_FNS(some_coop_idle); ++#undef BFQ_BFQQ_FNS ++ ++/* Logging facilities. */ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) ++ ++/* Expiration reasons. 
*/ ++enum bfqq_expiration { ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */ ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ ++}; ++ ++#ifdef CONFIG_CGROUP_BFQIO ++/** ++ * struct bfq_group - per (device, cgroup) data structure. ++ * @entity: schedulable entity to insert into the parent group sched_data. ++ * @sched_data: own sched_data, to contain child entities (they may be ++ * both bfq_queues and bfq_groups). ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data ++ * list of the containing cgroup's bfqio_cgroup. ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list ++ * of the groups active on the same device; used for cleanup. ++ * @bfqd: the bfq_data for the device this group acts upon. ++ * @async_bfqq: array of async queues for all the tasks belonging to ++ * the group, one queue per ioprio value per ioprio_class, ++ * except for the idle class that has only one queue. ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used ++ * to avoid too many special cases during group creation/migration. ++ * ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup ++ * there is a set of bfq_groups, each one collecting the lower-level ++ * entities belonging to the group that are acting on the same device. ++ * ++ * Locking works as follows: ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed ++ * via RCU from its readers. ++ * o @bfqd is protected by the queue lock, RCU is used to access it ++ * from the readers. ++ * o All the other fields are protected by the @bfqd queue lock. ++ */ ++struct bfq_group { ++ struct bfq_entity entity; ++ struct bfq_sched_data sched_data; ++ ++ struct hlist_node group_node; ++ struct hlist_node bfqd_node; ++ ++ void *bfqd; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct bfq_entity *my_entity; ++}; ++ ++/** ++ * struct bfqio_cgroup - bfq cgroup data structure. ++ * @css: subsystem state for bfq in the containing cgroup. ++ * @weight: cgroup weight. ++ * @ioprio: cgroup ioprio. ++ * @ioprio_class: cgroup ioprio_class. ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. ++ * @group_data: list containing the bfq_group belonging to this cgroup. ++ * ++ * @group_data is accessed using RCU, with @lock protecting the updates, ++ * @ioprio and @ioprio_class are protected by @lock. 
++ */ ++struct bfqio_cgroup { ++ struct cgroup_subsys_state css; ++ ++ unsigned short weight, ioprio, ioprio_class; ++ ++ spinlock_t lock; ++ struct hlist_head group_data; ++}; ++#else ++struct bfq_group { ++ struct bfq_sched_data sched_data; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++}; ++#endif ++ ++static inline struct bfq_service_tree * ++bfq_entity_service_tree(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sched_data = entity->sched_data; ++ unsigned int idx = entity->ioprio_class - 1; ++ ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); ++ BUG_ON(sched_data == NULL); ++ ++ return sched_data->service_tree + idx; ++} ++ ++static inline struct bfq_queue *cic_to_bfqq(struct cfq_io_context *cic, ++ int is_sync) ++{ ++ return cic->cfqq[!!is_sync]; ++} ++ ++static inline void cic_set_bfqq(struct cfq_io_context *cic, ++ struct bfq_queue *bfqq, int is_sync) ++{ ++ cic->cfqq[!!is_sync] = bfqq; ++} ++ ++static inline void call_for_each_cic(struct io_context *ioc, ++ void (*func)(struct io_context *, ++ struct cfq_io_context *)) ++{ ++ struct cfq_io_context *cic; ++ struct hlist_node *n; ++ ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list) ++ func(ioc, cic); ++ rcu_read_unlock(); ++} ++ ++#define CIC_DEAD_KEY 1ul ++#define CIC_DEAD_INDEX_SHIFT 1 ++ ++static inline void *bfqd_dead_key(struct bfq_data *bfqd) ++{ ++ return (void *)(bfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); ++} ++ ++/** ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. ++ * @ptr: a pointer to a bfqd. ++ * @flags: storage for the flags to be saved. ++ * ++ * This function allows cic->key and bfqg->bfqd to be protected by the ++ * queue lock of the bfqd they reference; the pointer is dereferenced ++ * under RCU, so the storage for bfqd is assured to be safe as long ++ * as the RCU read side critical section does not end. After the ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be ++ * sure that no other writer accessed it. If we raced with a writer, ++ * the function returns NULL, with the queue unlocked, otherwise it ++ * returns the dereferenced pointer, with the queue locked. 
++ */ ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, ++ unsigned long *flags) ++{ ++ struct bfq_data *bfqd; ++ ++ rcu_read_lock(); ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr); ++ ++ if (bfqd != NULL && !((unsigned long) bfqd & CIC_DEAD_KEY)) { ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags); ++ if (*ptr == bfqd) ++ goto out; ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); ++ } ++ ++ bfqd = NULL; ++out: ++ rcu_read_unlock(); ++ return bfqd; ++} ++ ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, ++ unsigned long *flags) ++{ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); ++} ++ ++static void bfq_changed_ioprio(struct io_context *ioc, ++ struct cfq_io_context *cic); ++static void bfq_put_queue(struct bfq_queue *bfqq); ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, int is_sync, ++ struct io_context *ioc, gfp_t gfp_mask); ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); ++#endif +-- +1.7.10.4 + diff --git a/3.2.34/01patch-2.6.33_atopcnt.patch b/3.2.34/01patch-2.6.33_atopcnt.patch new file mode 100644 index 0000000..28bf733 --- /dev/null +++ b/3.2.34/01patch-2.6.33_atopcnt.patch @@ -0,0 +1,174 @@ +diff --git a/block/blk-core.c b/block/blk-core.c +index d1a9a0a..8b54acb 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -73,6 +73,17 @@ static void drive_stat_acct(struct request *rq, int new_io) + part_inc_in_flight(part, rw); + } + ++ switch (rw) { /* ATOP */ ++ case READ: /* ATOP */ ++ current->group_leader->stat.dsk_rio += new_io; /* ATOP */ ++ current->group_leader->stat.dsk_rsz += blk_rq_sectors(rq); /* ATOP */ ++ break; /* ATOP */ ++ case WRITE: /* ATOP */ ++ current->group_leader->stat.dsk_wio += new_io; /* ATOP */ ++ current->group_leader->stat.dsk_wsz += blk_rq_sectors(rq); /* ATOP */ ++ break; /* ATOP */ ++ } /* ATOP */ ++ + part_stat_unlock(); + } + +diff --git a/fs/proc/array.c b/fs/proc/array.c +index 13b5d07..cac522e 100644 +--- a/fs/proc/array.c ++++ b/fs/proc/array.c +@@ -515,6 +515,25 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, + (unsigned long long)delayacct_blkio_ticks(task), + cputime_to_clock_t(gtime), + cputime_to_clock_t(cgtime)); ++ ++ seq_printf(m, /* ATOP */ ++ "%lu %llu %lu %llu %lu %llu %lu " /* ATOP */ ++ "%llu %lu %llu %lu %llu %lu %lu\n", /* ATOP */ ++ task->stat.dsk_rio, /* ATOP */ ++ task->stat.dsk_rsz, /* ATOP */ ++ task->stat.dsk_wio, /* ATOP */ ++ task->stat.dsk_wsz, /* ATOP */ ++ task->stat.tcp_snd, /* ATOP */ ++ task->stat.tcp_ssz, /* ATOP */ ++ task->stat.tcp_rcv, /* ATOP */ ++ task->stat.tcp_rsz, /* ATOP */ ++ task->stat.udp_snd, /* ATOP */ ++ task->stat.udp_ssz, /* ATOP */ ++ task->stat.udp_rcv, /* ATOP */ ++ task->stat.udp_rsz, /* ATOP */ ++ task->stat.raw_snd, /* ATOP */ ++ task->stat.raw_rcv); /* ATOP */ ++ + if (mm) + mmput(mm); + return 0; +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 78efe7c..22391bf 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1512,6 +1512,17 @@ struct task_struct { + #endif + atomic_t fs_excl; /* holding fs exclusive resources */ + struct rcu_head rcu; ++ ++ struct { /* ATOP */ ++ unsigned long dsk_rio, dsk_wio; /* ATOP */ ++ unsigned long long dsk_rsz, dsk_wsz; /* ATOP */ ++ unsigned long tcp_snd, tcp_rcv; /* ATOP */ ++ unsigned long long tcp_ssz, 
tcp_rsz; /* ATOP */ ++ unsigned long udp_snd, udp_rcv; /* ATOP */ ++ unsigned long long udp_ssz, udp_rsz; /* ATOP */ ++ unsigned long raw_snd, raw_rcv; /* ATOP */ ++ } stat; /* ATOP */ ++ + + /* + * cache last used pipe for splice +diff --git a/kernel/acct.c b/kernel/acct.c +index a6605ca..d5df53a 100644 +--- a/kernel/acct.c ++++ b/kernel/acct.c +@@ -565,7 +565,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, + ac.ac_exitcode = pacct->ac_exitcode; + spin_unlock_irq(¤t->sighand->siglock); + ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ +- ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ++ ac.ac_rw = encode_comp_t(current->stat.dsk_rio + current->stat.dsk_wio); /* ATOP */ + ac.ac_swaps = encode_comp_t(0); + + /* +diff --git a/kernel/fork.c b/kernel/fork.c +index f88bd98..bab2085 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -683,6 +683,14 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) + + tsk->min_flt = tsk->maj_flt = 0; + tsk->nvcsw = tsk->nivcsw = 0; ++ tsk->stat.dsk_rio = tsk->stat.dsk_wio = 0; /* ATOP */ ++ tsk->stat.dsk_rsz = tsk->stat.dsk_wsz = 0; /* ATOP */ ++ tsk->stat.tcp_snd = tsk->stat.tcp_rcv = 0; /* ATOP */ ++ tsk->stat.tcp_ssz = tsk->stat.tcp_rsz = 0; /* ATOP */ ++ tsk->stat.udp_snd = tsk->stat.udp_rcv = 0; /* ATOP */ ++ tsk->stat.udp_ssz = tsk->stat.udp_rsz = 0; /* ATOP */ ++ tsk->stat.raw_snd = tsk->stat.raw_rcv = 0; /* ATOP */ ++ + #ifdef CONFIG_DETECT_HUNG_TASK + tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; + #endif +diff --git a/net/socket.c b/net/socket.c +index 769c386..3ba19f6 100644 +--- a/net/socket.c ++++ b/net/socket.c +@@ -547,10 +547,28 @@ static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, + si->size = size; + + err = security_socket_sendmsg(sock, msg, size); +- if (err) +- return err; +- +- return sock->ops->sendmsg(iocb, sock, msg, size); ++ if (!err) ++ err = sock->ops->sendmsg(iocb, sock, msg, size); ++ ++ if (err >= 0 && sock->sk) { /* ATOP */ ++ switch (sock->sk->sk_family) { /* ATOP */ ++ case PF_INET: /* ATOP */ ++ case PF_INET6: /* ATOP */ ++ switch (sock->sk->sk_type) { /* ATOP */ ++ case SOCK_STREAM: /* ATOP */ ++ current->group_leader->stat.tcp_snd++; /* ATOP */ ++ current->group_leader->stat.tcp_ssz+=size;/* ATOP */ ++ break; /* ATOP */ ++ case SOCK_DGRAM: /* ATOP */ ++ current->group_leader->stat.udp_snd++; /* ATOP */ ++ current->group_leader->stat.udp_ssz+=size;/* ATOP */ ++ break; /* ATOP */ ++ case SOCK_RAW: /* ATOP */ ++ current->group_leader->stat.raw_snd++; /* ATOP */ ++ } /* ATOP */ ++ } /* ATOP */ ++ } /* ATOP */ ++ return err; + } + + int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +@@ -682,7 +700,29 @@ static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, + { + int err = security_socket_recvmsg(sock, msg, size, flags); + +- return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags); ++ if (!err) ++ err = __sock_recvmsg_nosec(iocb, sock, msg, size, flags); ++ ++ if (err >= 0 && sock->sk) { /* ATOP */ ++ switch (sock->sk->sk_family) { /* ATOP */ ++ case PF_INET: /* ATOP */ ++ case PF_INET6: /* ATOP */ ++ switch (sock->sk->sk_type) { /* ATOP */ ++ case SOCK_STREAM: /* ATOP */ ++ current->group_leader->stat.tcp_rcv++; /* ATOP */ ++ current->group_leader->stat.tcp_rsz+=err; /* ATOP */ ++ break; /* ATOP */ ++ case SOCK_DGRAM: /* ATOP */ ++ current->group_leader->stat.udp_rcv++; /* ATOP */ ++ current->group_leader->stat.udp_rsz+=err; /* ATOP */ ++ break; /* ATOP */ ++ case SOCK_RAW: /* ATOP */ ++ 
current->group_leader->stat.raw_rcv++; /* ATOP */ ++ break; /* ATOP */ ++ } /* ATOP */ ++ } /* ATOP */ ++ } /* ATOP */ ++ return err; + } + + int sock_recvmsg(struct socket *sock, struct msghdr *msg, diff --git a/3.2.34/02patch-2.6.33_atopacct.patch b/3.2.34/02patch-2.6.33_atopacct.patch new file mode 100644 index 0000000..74e6a1c --- /dev/null +++ b/3.2.34/02patch-2.6.33_atopacct.patch @@ -0,0 +1,125 @@ +Index: linux-2.6.28/include/linux/acct.h +=================================================================== +--- linux-2.6.28.orig/include/linux/acct.h 2009-01-14 13:02:24.000000000 +0100 ++++ linux-2.6.28/include/linux/acct.h 2009-01-14 13:03:33.000000000 +0100 +@@ -97,6 +97,54 @@ + char ac_comm[ACCT_COMM]; /* Command Name */ + }; + ++struct acct_atop ++{ ++ char ac_flag; /* Flags */ ++ char ac_version; /* Always set to ACCT_VERSION */ ++ __u32 ac_pid; /* Process ID */ ++ __u32 ac_ppid; /* Parent Process ID */ ++ __u16 ac_uid16; /* LSB of Real User ID */ ++ __u16 ac_gid16; /* LSB of Real Group ID */ ++ __u16 ac_tty; /* Control Terminal */ ++ __u32 ac_btime; /* Process Creation Time */ ++ comp_t ac_utime; /* User Time */ ++ comp_t ac_stime; /* System Time */ ++ comp_t ac_etime; /* Elapsed Time */ ++ comp_t ac_mem; /* Virtual Memory */ ++ comp_t ac_rss; /* Resident Memory */ ++ comp_t ac_io; /* Chars Transferred */ ++ comp_t ac_rw; /* Blocks Read or Written */ ++ comp_t ac_bread; /* Blocks Read */ ++ comp_t ac_bwrite; /* Blocks Written */ ++ comp2_t ac_dskrsz; /* Cum. blocks read */ ++ comp2_t ac_dskwsz; /* Cum. blocks written */ ++ comp_t ac_tcpsnd; /* TCP send requests */ ++ comp_t ac_tcprcv; /* TCP recv requests */ ++ comp2_t ac_tcpssz; /* TCP cum. length */ ++ comp2_t ac_tcprsz; /* TCP cum. length */ ++ comp_t ac_udpsnd; /* UDP send requests */ ++ comp_t ac_udprcv; /* UDP recv requests */ ++ comp2_t ac_udpssz; /* UDP cum. length */ ++ comp2_t ac_udprsz; /* UDP cum. length */ ++ comp_t ac_rawsnd; /* RAW send requests */ ++ comp_t ac_rawrcv; /* RAW recv requests */ ++ comp_t ac_minflt; /* Minor Pagefaults */ ++ comp_t ac_majflt; /* Major Pagefaults */ ++ comp_t ac_swaps; /* Number of Swaps */ ++/* m68k had no padding here. 
*/ ++#if !defined(CONFIG_M68K) || !defined(__KERNEL__) ++ __u16 ac_ahz; /* AHZ */ ++#endif ++ __u32 ac_exitcode; /* Exitcode */ ++ char ac_comm[ACCT_COMM + 1]; /* Command Name */ ++ __u8 ac_etime_hi; /* Elapsed Time MSB */ ++ __u16 ac_etime_lo; /* Elapsed Time LSB */ ++ __u32 ac_uid; /* Real User ID */ ++ __u32 ac_gid; /* Real Group ID */ ++}; ++ ++ ++ + /* + * accounting flags + */ +@@ -146,7 +194,13 @@ + * 5: new binary incompatible format (128 bytes, second half) + * + */ ++#define CONFIG_PROCESS_ACCT_ATOP + ++#ifdef CONFIG_PROCESS_ACCT_ATOP ++#define ACCT_VERSION 6 ++#define AHZ (USER_HZ) ++typedef struct acct_atop acct_t; ++#else + #ifdef CONFIG_BSD_PROCESS_ACCT_V3 + #define ACCT_VERSION 3 + #define AHZ 100 +@@ -160,6 +214,7 @@ + #define AHZ (USER_HZ) + typedef struct acct acct_t; + #endif ++#endif + + #else + #define ACCT_VERSION 2 +Index: linux-2.6.28/kernel/acct.c +=================================================================== +--- linux-2.6.28.orig/kernel/acct.c 2009-01-14 13:03:31.000000000 +0100 ++++ linux-2.6.28/kernel/acct.c 2009-01-14 13:03:33.000000000 +0100 +@@ -405,7 +405,7 @@ + return exp; + } + +-#if ACCT_VERSION==1 || ACCT_VERSION==2 ++#if ACCT_VERSION==1 || ACCT_VERSION==2 || ACCT_VERSION==6 + /* + * encode an u64 into a comp2_t (24 bits) + * +@@ -552,6 +552,30 @@ + ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); + rcu_read_unlock(); + #endif ++#if ACCT_VERSION==6 /* ATOP */ ++ ac.ac_pid = current->pid; ++ ac.ac_ppid = current->parent->pid; ++ ac.ac_uid16 = ac.ac_uid; ++ ac.ac_gid16 = ac.ac_gid; ++ ac.ac_ahz = AHZ; ++ ac.ac_bread = encode_comp_t(current->stat.dsk_rio); ++ ac.ac_bwrite = encode_comp_t(current->stat.dsk_wio); ++ ac.ac_dskrsz = encode_comp2_t(current->stat.dsk_rsz); ++ ac.ac_dskwsz = encode_comp2_t(current->stat.dsk_wsz); ++ ac.ac_tcpsnd = encode_comp_t(current->stat.tcp_snd); ++ ac.ac_tcprcv = encode_comp_t(current->stat.tcp_rcv); ++ ac.ac_tcpssz = encode_comp2_t(current->stat.tcp_ssz); ++ ac.ac_tcprsz = encode_comp2_t(current->stat.tcp_rsz); ++ ac.ac_udpsnd = encode_comp_t(current->stat.udp_snd); ++ ac.ac_udprcv = encode_comp_t(current->stat.udp_rcv); ++ ac.ac_udpssz = encode_comp2_t(current->stat.udp_ssz); ++ ac.ac_udprsz = encode_comp2_t(current->stat.udp_rsz); ++ ac.ac_rawsnd = encode_comp_t(current->stat.raw_snd); ++ ac.ac_rawrcv = encode_comp_t(current->stat.raw_rcv); ++ ac.ac_rss = current->mm ? ++ encode_comp_t(get_mm_rss(current->mm)<<(PAGE_SHIFT-10)) : ++ encode_comp_t(0); ++#endif + + spin_lock_irq(¤t->sighand->siglock); + tty = current->signal->tty; /* Safe as we hold the siglock */ diff --git a/3.2.34/3.2.0-ck1.patch b/3.2.34/3.2.0-ck1.patch new file mode 100644 index 0000000..a81b2c4 --- /dev/null +++ b/3.2.34/3.2.0-ck1.patch @@ -0,0 +1,9093 @@ +Index: linux-3.2-ck1/arch/powerpc/platforms/cell/spufs/sched.c +=================================================================== +--- linux-3.2-ck1.orig/arch/powerpc/platforms/cell/spufs/sched.c 2012-01-16 10:07:27.897097267 +1100 ++++ linux-3.2-ck1/arch/powerpc/platforms/cell/spufs/sched.c 2012-01-16 10:07:31.336097029 +1100 +@@ -63,11 +63,6 @@ static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + + /* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- +-/* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. 
+ */ +Index: linux-3.2-ck1/Documentation/scheduler/sched-BFS.txt +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-3.2-ck1/Documentation/scheduler/sched-BFS.txt 2012-01-16 10:07:31.336097029 +1100 +@@ -0,0 +1,347 @@ ++BFS - The Brain Fuck Scheduler by Con Kolivas. ++ ++Goals. ++ ++The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to ++completely do away with the complex designs of the past for the cpu process ++scheduler and instead implement one that is very simple in basic design. ++The main focus of BFS is to achieve excellent desktop interactivity and ++responsiveness without heuristics and tuning knobs that are difficult to ++understand, impossible to model and predict the effect of, and when tuned to ++one workload cause massive detriment to another. ++ ++ ++Design summary. ++ ++BFS is best described as a single runqueue, O(n) lookup, earliest effective ++virtual deadline first design, loosely based on EEVDF (earliest eligible virtual ++deadline first) and my previous Staircase Deadline scheduler. Each component ++shall be described in order to understand the significance of, and reasoning for ++it. The codebase when the first stable version was released was approximately ++9000 lines less code than the existing mainline linux kernel scheduler (in ++2.6.31). This does not even take into account the removal of documentation and ++the cgroups code that is not used. ++ ++Design reasoning. ++ ++The single runqueue refers to the queued but not running processes for the ++entire system, regardless of the number of CPUs. The reason for going back to ++a single runqueue design is that once multiple runqueues are introduced, ++per-CPU or otherwise, there will be complex interactions as each runqueue will ++be responsible for the scheduling latency and fairness of the tasks only on its ++own runqueue, and to achieve fairness and low latency across multiple CPUs, any ++advantage in throughput of having CPU local tasks causes other disadvantages. ++This is due to requiring a very complex balancing system to at best achieve some ++semblance of fairness across CPUs and can only maintain relatively low latency ++for tasks bound to the same CPUs, not across them. To increase said fairness ++and latency across CPUs, the advantage of local runqueue locking, which makes ++for better scalability, is lost due to having to grab multiple locks. ++ ++A significant feature of BFS is that all accounting is done purely based on CPU ++used and nowhere is sleep time used in any way to determine entitlement or ++interactivity. Interactivity "estimators" that use some kind of sleep/run ++algorithm are doomed to fail to detect all interactive tasks, and to falsely tag ++tasks that aren't interactive as being so. The reason for this is that it is ++close to impossible to determine that when a task is sleeping, whether it is ++doing it voluntarily, as in a userspace application waiting for input in the ++form of a mouse click or otherwise, or involuntarily, because it is waiting for ++another thread, process, I/O, kernel activity or whatever. Thus, such an ++estimator will introduce corner cases, and more heuristics will be required to ++cope with those corner cases, introducing more corner cases and failed ++interactivity detection and so on. 
Interactivity in BFS is built into the design ++by virtue of the fact that tasks that are waking up have not used up their quota ++of CPU time, and have earlier effective deadlines, thereby making it very likely ++they will preempt any CPU bound task of equivalent nice level. See below for ++more information on the virtual deadline mechanism. Even if they do not preempt ++a running task, because the rr interval is guaranteed to have a bound upper ++limit on how long a task will wait for, it will be scheduled within a timeframe ++that will not cause visible interface jitter. ++ ++ ++Design details. ++ ++Task insertion. ++ ++BFS inserts tasks into each relevant queue as an O(1) insertion into a double ++linked list. On insertion, *every* running queue is checked to see if the newly ++queued task can run on any idle queue, or preempt the lowest running task on the ++system. This is how the cross-CPU scheduling of BFS achieves significantly lower ++latency per extra CPU the system has. In this case the lookup is, in the worst ++case scenario, O(n) where n is the number of CPUs on the system. ++ ++Data protection. ++ ++BFS has one single lock protecting the process local data of every task in the ++global queue. Thus every insertion, removal and modification of task data in the ++global runqueue needs to grab the global lock. However, once a task is taken by ++a CPU, the CPU has its own local data copy of the running process' accounting ++information which only that CPU accesses and modifies (such as during a ++timer tick) thus allowing the accounting data to be updated lockless. Once a ++CPU has taken a task to run, it removes it from the global queue. Thus the ++global queue only ever has, at most, ++ ++ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 ++ ++tasks in the global queue. This value is relevant for the time taken to look up ++tasks during scheduling. This will increase if many tasks with CPU affinity set ++in their policy to limit which CPUs they're allowed to run on if they outnumber ++the number of CPUs. The +1 is because when rescheduling a task, the CPU's ++currently running task is put back on the queue. Lookup will be described after ++the virtual deadline mechanism is explained. ++ ++Virtual deadline. ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in BFS is entirely in the virtual deadline mechanism. The one ++tunable in BFS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in jiffies by this equation: ++ ++ jiffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. 
Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. Once a task is descheduled, it is put back on the queue, and an ++O(n) lookup of all queued-but-not-running tasks is done to determine which has ++the earliest deadline and that task is chosen to receive CPU next. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (jiffies) is ++constantly moving. ++ ++Task lookup. ++ ++BFS has 103 priority queues. 100 of these are dedicated to the static priority ++of realtime tasks, and the remaining 3 are, in order of best to worst priority, ++SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority ++scheduling). When a task of these priorities is queued, a bitmap of running ++priorities is set showing which of these priorities has tasks waiting for CPU ++time. When a CPU is made to reschedule, the lookup for the next task to get ++CPU time is performed in the following way: ++ ++First the bitmap is checked to see what static priority tasks are queued. If ++any realtime priorities are found, the corresponding queue is checked and the ++first task listed there is taken (provided CPU affinity is suitable) and lookup ++is complete. If the priority corresponds to a SCHED_ISO task, they are also ++taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds ++to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this ++stage, every task in the runlist that corresponds to that priority is checked ++to see which has the earliest set deadline, and (provided it has suitable CPU ++affinity) it is taken off the runqueue and given the CPU. If a task has an ++expired deadline, it is taken and the rest of the lookup aborted (as they are ++chosen in FIFO order). ++ ++Thus, the lookup is O(n) in the worst case only, where n is as described ++earlier, as tasks may be chosen before the whole task list is looked over. ++ ++ ++Scalability. ++ ++The major limitations of BFS will be that of scalability, as the separate ++runqueue designs will have less lock contention as the number of CPUs rises. ++However they do not scale linearly even with separate runqueues as multiple ++runqueues will need to be locked concurrently on such designs to be able to ++achieve fair CPU balancing, to try and achieve some sort of nice-level fairness ++across CPUs, and to achieve low enough latency for tasks on a busy CPU when ++other CPUs would be more suited. BFS has the advantage that it requires no ++balancing algorithm whatsoever, as balancing occurs by proxy simply because ++all CPUs draw off the global runqueue, in priority and deadline order. 
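The deadline arithmetic just described is compact enough to check numerically. The following is an illustrative, self-contained C sketch, not the BFS code itself: it assumes a baseline prio_ratio of 100 at nice -20 and grows it by 10% per nice level as stated above, whereas the real scheduler keeps its own fixed-point ratio table.

#include <stdio.h>

/* Assumed scale: ratio 100 at nice -20, +10% per nice level (per the text). */
static unsigned long prio_ratio(int nice)
{
    unsigned long ratio = 100;
    int level;

    for (level = -20; level < nice; level++)
        ratio = ratio * 110 / 100;
    return ratio;
}

/* deadline = jiffies + prio_ratio * rr_interval, normalised to the baseline */
static unsigned long virtual_deadline(unsigned long now_jiffies, int nice,
                                      unsigned long rr_jiffies)
{
    return now_jiffies + prio_ratio(nice) * rr_jiffies / 100;
}

int main(void)
{
    unsigned long now = 1000;  /* arbitrary jiffies value for illustration */
    int nice;

    for (nice = -20; nice <= 19; nice += 13)
        printf("nice %3d: deadline %lu\n", nice,
               virtual_deadline(now, nice, 6 /* rr_interval in ticks */));
    return 0;
}

Per the squared relationship quoted above, two tasks five nice levels apart differ in prio_ratio by about 1.1^5, roughly 1.61, so their long-run CPU split works out to roughly 1.61 squared, about 2.6 to 1.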
Despite ++the fact that scalability is _not_ the prime concern of BFS, it both shows very ++good scalability to smaller numbers of CPUs and is likely a more scalable design ++at these numbers of CPUs. ++ ++It also has some very low overhead scalability features built into the design ++when it has been deemed their overhead is so marginal that they're worth adding. ++The first is the local copy of the running process' data to the CPU it's running ++on to allow that data to be updated lockless where possible. Then there is ++deference paid to the last CPU a task was running on, by trying that CPU first ++when looking for an idle CPU to use the next time it's scheduled. Finally there ++is the notion of "sticky" tasks that are flagged when they are involuntarily ++descheduled, meaning they still want further CPU time. This sticky flag is ++used to bias heavily against those tasks being scheduled on a different CPU ++unless that CPU would be otherwise idle. When a cpu frequency governor is used ++that scales with CPU load, such as ondemand, sticky tasks are not scheduled ++on a different CPU at all, preferring instead to go idle. This means the CPU ++they were bound to is more likely to increase its speed while the other CPU ++will go idle, thus speeding up total task execution time and likely decreasing ++power usage. This is the only scenario where BFS will allow a CPU to go idle ++in preference to scheduling a task on the earliest available spare CPU. ++ ++The real cost of migrating a task from one CPU to another is entirely dependant ++on the cache footprint of the task, how cache intensive the task is, how long ++it's been running on that CPU to take up the bulk of its cache, how big the CPU ++cache is, how fast and how layered the CPU cache is, how fast a context switch ++is... and so on. In other words, it's close to random in the real world where we ++do more than just one sole workload. The only thing we can be sure of is that ++it's not free. So BFS uses the principle that an idle CPU is a wasted CPU and ++utilising idle CPUs is more important than cache locality, and cache locality ++only plays a part after that. ++ ++When choosing an idle CPU for a waking task, the cache locality is determined ++according to where the task last ran and then idle CPUs are ranked from best ++to worst to choose the most suitable idle CPU based on cache locality, NUMA ++node locality and hyperthread sibling business. They are chosen in the ++following preference (if idle): ++ ++* Same core, idle or busy cache, idle threads ++* Other core, same cache, idle or busy cache, idle threads. ++* Same node, other CPU, idle cache, idle threads. ++* Same node, other CPU, busy cache, idle threads. ++* Same core, busy threads. ++* Other core, same cache, busy threads. ++* Same node, other CPU, busy threads. ++* Other node, other CPU, idle cache, idle threads. ++* Other node, other CPU, busy cache, idle threads. ++* Other node, other CPU, busy threads. ++ ++This shows the SMT or "hyperthread" awareness in the design as well which will ++choose a real idle core first before a logical SMT sibling which already has ++tasks on the physical CPU. ++ ++Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. ++However this benchmarking was performed on an earlier design that was far less ++scalable than the current one so it's hard to know how scalable it is in terms ++of both CPUs (due to the global runqueue) and heavily loaded machines (due to ++O(n) lookup) at this stage. 
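To make the O(n) part of the lookup concrete, here is a simplified sketch of the earliest-deadline scan described under "Task lookup" above. It is not the BFS implementation: the priority bitmap, CPU affinity checks and the realtime/ISO FIFO queues are omitted, and a plain singly linked list stands in for the global run queue.

#include <stdio.h>

struct toy_task {
    const char *name;
    unsigned long deadline;
    struct toy_task *next;
};

/*
 * O(n) scan: take the first task whose deadline has already expired
 * (they are chosen in FIFO order, as the text says), otherwise the
 * task with the earliest virtual deadline.
 */
static struct toy_task *earliest_deadline_task(struct toy_task *queue,
                                               unsigned long now)
{
    struct toy_task *p, *best = NULL;

    for (p = queue; p; p = p->next) {
        if (p->deadline <= now)
            return p;           /* expired deadline: abort the scan */
        if (!best || p->deadline < best->deadline)
            best = p;
    }
    return best;
}

int main(void)
{
    struct toy_task c = { "C", 1900, NULL };
    struct toy_task b = { "B", 1500, &c };
    struct toy_task a = { "A", 1700, &b };
    struct toy_task *next = earliest_deadline_task(&a, 1000);

    printf("next task: %s\n", next ? next->name : "none");
    return 0;
}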
Note that in terms of scalability, the number of ++_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) ++quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark ++results are very promising indeed, without needing to tweak any knobs, features ++or options. Benchmark contributions are most welcome. ++ ++ ++Features ++ ++As the initial prime target audience for BFS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval ++and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition ++to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is ++support for CGROUPS. The average user should neither need to know what these ++are, nor should they need to be using them to have good desktop behaviour. ++ ++rr_interval ++ ++There is only one "scheduler" tunable, the round robin interval. This can be ++accessed in ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6ms. Valid values ++are from 1 to 1000. Decreasing the value will decrease latencies at the cost of ++decreasing throughput, while increasing it will improve throughput, but at the ++cost of worsening latencies. The accuracy of the rr interval is limited by HZ ++resolution of the kernel configuration. Thus, the worst case latencies are ++usually slightly higher than this actual value. BFS uses "dithering" to try and ++minimise the effect the Hz limitation has. The default value of 6 is not an ++arbitrary one. It is based on the fact that humans can detect jitter at ++approximately 7ms, so aiming for much lower latencies is pointless under most ++circumstances. It is worth noting this fact when comparing the latency ++performance of BFS to other schedulers. Worst case latencies being higher than ++7ms are far worse than average latencies not being in the microsecond range. ++Experimentation has shown that rr intervals being increased up to 300 can ++improve throughput but beyond that, scheduling noise from elsewhere prevents ++further demonstrable throughput. ++ ++Isochronous scheduling. ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of _total CPU_ available across the machine, configurable ++as a percentage in the following "resource handling" tunable (as opposed to a ++scheduler tunable): ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. 
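Both tunables are ordinary procfs files, so they can be inspected (or, with root, written) like any other sysctl. A minimal hedged sketch follows, assuming a kernel with these patches applied; the files simply do not exist otherwise, and the paths are quoted verbatim from the text above.

#include <stdio.h>

static long read_tunable(const char *path)
{
    FILE *f = fopen(path, "r");
    long val = -1;

    if (f) {
        if (fscanf(f, "%ld", &val) != 1)
            val = -1;
        fclose(f);
    }
    return val;
}

int main(void)
{
    printf("rr_interval = %ld ms\n",
           read_tunable("/proc/sys/kernel/rr_interval"));
    printf("iso_cpu     = %ld %%\n",
           read_tunable("/proc/sys/kernel/iso_cpu"));
    return 0;
}

Writing works the same way: echo a new value into the file as root, keeping rr_interval within the 1 to 1000 range stated above.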
It is calculated over a rolling 5 second average ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of BFS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++Because some applications constantly set their policy as well as their nice ++level, there is potential for them to undo the override specified by the user ++on the command line of setting the policy to SCHED_ISO. To counter this, once ++a task has been set to SCHED_ISO policy, it needs superuser privileges to set ++it back to SCHED_NORMAL. This will ensure the task remains ISO and all child ++processes and threads will also inherit the ISO policy. ++ ++Idleprio scheduling. ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start ++a video encode or so on without any slowdown of other tasks. To avoid this ++policy from grabbing shared resources and holding them indefinitely, if it ++detects a state where the task is waiting on I/O, the machine is about to ++suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As ++per the Isochronous task management, once a task has been scheduled as IDLEPRIO, ++it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can ++be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++ schedtool -D -e ./mprime ++ ++Subtick accounting. ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the ++timer tick frequency (HZ) is lowered. It is possible to create an application ++which uses almost 100% CPU, yet by being descheduled at the right time, records ++zero CPU usage. While the main problem with this is that there are possible ++security implications, it is also difficult to determine how much CPU a task ++really does use. BFS tries to use the sub-tick accounting from the TSC clock, ++where possible, to determine real CPU usage. This is not entirely reliable, but ++is far more likely to produce accurate CPU usage data than the existing designs ++and will not show tasks as consuming no CPU usage when they actually are. Thus, ++the amount of CPU reported as being used by BFS will more accurately represent ++how much CPU the task itself is using (as is shown for example by the 'time' ++application), so the reported values may be quite different to other schedulers. ++Values reported as the 'load' are more prone to problems with this design, but ++per process values are closer to real usage. 
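The schedtool invocations above map onto the ordinary sched_setscheduler() syscall. The following hedged userspace sketch requests SCHED_ISO directly; the numeric value 4 is taken from the "#define SCHED_ISO 4" these patches add to include/linux/sched.h (mainline only reserves that slot, so libc headers do not define it), SCHED_IDLEPRIO is simply SCHED_IDLE (5), and the call only succeeds on a kernel carrying the BFS patch.

#include <sched.h>
#include <stdio.h>
#include <unistd.h>

#ifndef SCHED_ISO
#define SCHED_ISO 4     /* value added to sched.h by the BFS patch */
#endif

int main(void)
{
    struct sched_param sp = { .sched_priority = 0 };

    if (sched_setscheduler(0, SCHED_ISO, &sp) != 0) {
        perror("sched_setscheduler(SCHED_ISO)");
        return 1;
    }
    printf("pid %ld is now SCHED_ISO\n", (long)getpid());
    /* latency-sensitive work would run here */
    return 0;
}

As noted above, once a task is SCHED_ISO it takes superuser privileges to set it back to SCHED_NORMAL, so for unprivileged processes this is effectively a one-way switch.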
When comparing throughput of BFS ++to other designs, it is important to compare the actual completed work in terms ++of total wall clock time taken and total work done, rather than the reported ++"cpu usage". ++ ++ ++Con Kolivas Tue, 5 Apr 2011 +Index: linux-3.2-ck1/Documentation/sysctl/kernel.txt +=================================================================== +--- linux-3.2-ck1.orig/Documentation/sysctl/kernel.txt 2012-01-16 10:07:27.895097268 +1100 ++++ linux-3.2-ck1/Documentation/sysctl/kernel.txt 2012-01-16 10:07:31.336097029 +1100 +@@ -33,6 +33,7 @@ show up in /proc/sys/kernel: + - domainname + - hostname + - hotplug ++- iso_cpu + - kptr_restrict + - kstack_depth_to_print [ X86 only ] + - l2cr [ PPC only ] +@@ -58,6 +59,7 @@ show up in /proc/sys/kernel: + - randomize_va_space + - real-root-dev ==> Documentation/initrd.txt + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sem +@@ -300,6 +302,16 @@ kernel stack. + + ============================================================== + ++iso_cpu: (BFS CPU scheduler only). ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling five ++seconds over the -whole- system, meaning all cpus. ++ ++Set to 70 (percent) by default. ++ ++============================================================== ++ + l2cr: (PPC only) + + This flag controls the L2 cache of G3 processor boards. If +@@ -495,6 +507,20 @@ rebooting. ??? + + ============================================================== + ++rr_interval: (BFS CPU scheduler only) ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. Conversely decreasing it will decrease average and maximum ++latencies but at the expense of throughput. This value is in ++milliseconds and the default value chosen depends on the number of ++cpus available at scheduler initialisation with a minimum of 6. ++ ++Valid values are from 1-1000. ++ ++============================================================== ++ + rtsig-max & rtsig-nr: + + The file rtsig-max can be used to tune the maximum number +Index: linux-3.2-ck1/fs/proc/base.c +=================================================================== +--- linux-3.2-ck1.orig/fs/proc/base.c 2012-01-16 10:07:27.896097267 +1100 ++++ linux-3.2-ck1/fs/proc/base.c 2012-01-16 10:07:31.337097029 +1100 +@@ -411,7 +411,7 @@ static int proc_pid_stack(struct seq_fil + static int proc_pid_schedstat(struct task_struct *task, char *buffer) + { + return sprintf(buffer, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + } +Index: linux-3.2-ck1/include/linux/init_task.h +=================================================================== +--- linux-3.2-ck1.orig/include/linux/init_task.h 2012-01-16 10:07:27.896097267 +1100 ++++ linux-3.2-ck1/include/linux/init_task.h 2012-01-16 10:07:31.337097029 +1100 +@@ -126,12 +126,70 @@ extern struct cred init_cred; + # define INIT_PERF_EVENTS(tsk) + #endif + +-#define INIT_TASK_COMM "swapper" +- + /* + * INIT_TASK is used to set up the first task table, touch at + * your own risk!. 
Base=0, limit=0x1fffff (=2MB) + */ ++#ifdef CONFIG_SCHED_BFS ++#define INIT_TASK_COMM "BFS" ++#define INIT_TASK(tsk) \ ++{ \ ++ .state = 0, \ ++ .stack = &init_thread_info, \ ++ .usage = ATOMIC_INIT(2), \ ++ .flags = PF_KTHREAD, \ ++ .prio = NORMAL_PRIO, \ ++ .static_prio = MAX_PRIO-20, \ ++ .normal_prio = NORMAL_PRIO, \ ++ .deadline = 0, \ ++ .policy = SCHED_NORMAL, \ ++ .cpus_allowed = CPU_MASK_ALL, \ ++ .mm = NULL, \ ++ .active_mm = &init_mm, \ ++ .run_list = LIST_HEAD_INIT(tsk.run_list), \ ++ .time_slice = HZ, \ ++ .tasks = LIST_HEAD_INIT(tsk.tasks), \ ++ INIT_PUSHABLE_TASKS(tsk) \ ++ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ ++ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ ++ .real_parent = &tsk, \ ++ .parent = &tsk, \ ++ .children = LIST_HEAD_INIT(tsk.children), \ ++ .sibling = LIST_HEAD_INIT(tsk.sibling), \ ++ .group_leader = &tsk, \ ++ RCU_INIT_POINTER(.real_cred, &init_cred), \ ++ RCU_INIT_POINTER(.cred, &init_cred), \ ++ .comm = INIT_TASK_COMM, \ ++ .thread = INIT_THREAD, \ ++ .fs = &init_fs, \ ++ .files = &init_files, \ ++ .signal = &init_signals, \ ++ .sighand = &init_sighand, \ ++ .nsproxy = &init_nsproxy, \ ++ .pending = { \ ++ .list = LIST_HEAD_INIT(tsk.pending.list), \ ++ .signal = {{0}}}, \ ++ .blocked = {{0}}, \ ++ .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ ++ .journal_info = NULL, \ ++ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ ++ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ ++ .timer_slack_ns = 50000, /* 50 usec default slack */ \ ++ .pids = { \ ++ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ ++ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ ++ [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ ++ }, \ ++ INIT_IDS \ ++ INIT_PERF_EVENTS(tsk) \ ++ INIT_TRACE_IRQFLAGS \ ++ INIT_LOCKDEP \ ++ INIT_FTRACE_GRAPH \ ++ INIT_TRACE_RECURSION \ ++ INIT_TASK_RCU_PREEMPT(tsk) \ ++} ++#else /* CONFIG_SCHED_BFS */ ++#define INIT_TASK_COMM "swapper" + #define INIT_TASK(tsk) \ + { \ + .state = 0, \ +@@ -194,7 +252,7 @@ extern struct cred init_cred; + INIT_TRACE_RECURSION \ + INIT_TASK_RCU_PREEMPT(tsk) \ + } +- ++#endif /* CONFIG_SCHED_BFS */ + + #define INIT_CPU_TIMERS(cpu_timers) \ + { \ +Index: linux-3.2-ck1/include/linux/ioprio.h +=================================================================== +--- linux-3.2-ck1.orig/include/linux/ioprio.h 2012-01-16 10:07:27.896097267 +1100 ++++ linux-3.2-ck1/include/linux/ioprio.h 2012-01-16 10:07:31.338097029 +1100 +@@ -64,6 +64,8 @@ static inline int task_ioprio_class(stru + + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (iso_task(task)) ++ return 0; + return (task_nice(task) + 20) / 5; + } + +Index: linux-3.2-ck1/include/linux/sched.h +=================================================================== +--- linux-3.2-ck1.orig/include/linux/sched.h 2012-01-16 10:07:27.896097267 +1100 ++++ linux-3.2-ck1/include/linux/sched.h 2012-01-16 10:07:32.577096941 +1100 +@@ -37,8 +37,15 @@ + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented on BFS only */ + #define SCHED_IDLE 5 ++#define SCHED_IDLEPRIO SCHED_IDLE ++#ifdef CONFIG_SCHED_BFS ++#define SCHED_ISO 4 ++#define SCHED_MAX (SCHED_IDLEPRIO) ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++#endif ++ + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ + #define SCHED_RESET_ON_FORK 0x40000000 + +@@ -269,8 +276,6 @@ extern asmlinkage void schedule_tail(str + extern void init_idle(struct task_struct *idle, int 
cpu); + extern void init_idle_bootup_task(struct task_struct *idle); + +-extern int runqueue_is_locked(int cpu); +- + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) + extern void select_nohz_load_balancer(int stop_tick); + extern int get_nohz_timer_target(void); +@@ -1226,15 +1231,31 @@ struct task_struct { + + #ifdef CONFIG_SMP + struct llist_node wake_entry; +- int on_cpu; + #endif +- int on_rq; ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_BFS) ++ bool on_cpu; ++#endif ++#ifndef CONFIG_SCHED_BFS ++ bool on_rq; ++#endif + + int prio, static_prio, normal_prio; + unsigned int rt_priority; ++#ifdef CONFIG_SCHED_BFS ++ int time_slice; ++ u64 deadline; ++ struct list_head run_list; ++ u64 last_ran; ++ u64 sched_time; /* sched_clock time spent running */ ++#ifdef CONFIG_SMP ++ bool sticky; /* Soft affined flag */ ++#endif ++ unsigned long rt_timeout; ++#else /* CONFIG_SCHED_BFS */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#endif + + #ifdef CONFIG_PREEMPT_NOTIFIERS + /* list of struct preempt_notifier: */ +@@ -1341,6 +1362,9 @@ struct task_struct { + int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ + + cputime_t utime, stime, utimescaled, stimescaled; ++#ifdef CONFIG_SCHED_BFS ++ unsigned long utime_pc, stime_pc; ++#endif + cputime_t gtime; + #ifndef CONFIG_VIRT_CPU_ACCOUNTING + cputime_t prev_utime, prev_stime; +@@ -1574,6 +1598,67 @@ struct task_struct { + #endif + }; + ++#ifdef CONFIG_SCHED_BFS ++bool grunqueue_is_locked(void); ++void grq_unlock_wait(void); ++void cpu_scaling(int cpu); ++void cpu_nonscaling(int cpu); ++int above_background_load(void); ++#define tsk_seruntime(t) ((t)->sched_time) ++#define tsk_rttimeout(t) ((t)->rt_timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++} ++ ++static inline int runqueue_is_locked(int cpu) ++{ ++ return grunqueue_is_locked(); ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO"BFS CPU scheduler v0.416 by Con Kolivas.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return (p->policy == SCHED_ISO); ++} ++#else /* CFS */ ++extern int runqueue_is_locked(int cpu); ++static inline void cpu_scaling(int cpu) ++{ ++} ++ ++static inline void cpu_nonscaling(int cpu) ++{ ++} ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++ p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO"CFS CPU scheduler.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return false; ++} ++ ++/* Anyone feel like implementing this? */ ++static inline int above_background_load(void) ++{ ++ return 1; ++} ++#endif /* CONFIG_SCHED_BFS */ ++ + /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ + #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) + +@@ -1591,10 +1676,20 @@ struct task_struct { + */ + + #define MAX_USER_RT_PRIO 100 +-#define MAX_RT_PRIO MAX_USER_RT_PRIO ++#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) ++#define DEFAULT_PRIO (MAX_RT_PRIO + 20) + ++#ifdef CONFIG_SCHED_BFS ++#define PRIO_RANGE (40) ++#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) ++#define ISO_PRIO (MAX_RT_PRIO) ++#define NORMAL_PRIO (MAX_RT_PRIO + 1) ++#define IDLE_PRIO (MAX_RT_PRIO + 2) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* CONFIG_SCHED_BFS */ + #define MAX_PRIO (MAX_RT_PRIO + 40) +-#define DEFAULT_PRIO (MAX_RT_PRIO + 20) ++#define NORMAL_PRIO DEFAULT_PRIO ++#endif /* CONFIG_SCHED_BFS */ + + static inline int rt_prio(int prio) + { +@@ -1961,7 +2056,7 @@ extern unsigned long long + task_sched_runtime(struct task_struct *task); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BFS) + extern void sched_exec(void); + #else + #define sched_exec() {} +@@ -2606,7 +2701,7 @@ static inline unsigned int task_cpu(cons + return 0; + } + +-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) ++static inline void set_task_cpu(struct task_struct *p, int cpu) + { + } + +Index: linux-3.2-ck1/init/Kconfig +=================================================================== +--- linux-3.2-ck1.orig/init/Kconfig 2012-01-16 10:07:27.897097267 +1100 ++++ linux-3.2-ck1/init/Kconfig 2012-01-16 10:07:31.338097029 +1100 +@@ -29,6 +29,19 @@ config IRQ_WORK + + menu "General setup" + ++config SCHED_BFS ++ bool "BFS cpu scheduler" ++ ---help--- ++ The Brain Fuck CPU Scheduler for excellent interactivity and ++ responsiveness on the desktop and solid scalability on normal ++ hardware. Not recommended for 4096 CPUs. ++ ++ Currently incompatible with the Group CPU scheduler, and RCU TORTURE ++ TEST so these options are disabled. ++ ++ Say Y here. ++ default y ++ + config EXPERIMENTAL + bool "Prompt for development and/or incomplete code/drivers" + ---help--- +@@ -626,6 +639,7 @@ config PROC_PID_CPUSET + + config CGROUP_CPUACCT + bool "Simple CPU accounting cgroup subsystem" ++ depends on !SCHED_BFS + help + Provides a simple Resource Controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
+@@ -702,7 +716,7 @@ config CGROUP_PERF + + menuconfig CGROUP_SCHED + bool "Group CPU scheduler" +- depends on EXPERIMENTAL ++ depends on EXPERIMENTAL && !SCHED_BFS + default n + help + This feature lets CPU scheduler recognize task groups and control CPU +@@ -828,6 +842,7 @@ endif # NAMESPACES + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_BFS + select EVENTFD + select CGROUPS + select CGROUP_SCHED +Index: linux-3.2-ck1/init/main.c +=================================================================== +--- linux-3.2-ck1.orig/init/main.c 2012-01-16 10:07:27.897097267 +1100 ++++ linux-3.2-ck1/init/main.c 2012-01-16 10:07:31.339097029 +1100 +@@ -763,6 +763,7 @@ static noinline int init_post(void) + system_state = SYSTEM_RUNNING; + numa_default_policy(); + ++ print_scheduler_version(); + + current->signal->flags |= SIGNAL_UNKILLABLE; + +Index: linux-3.2-ck1/kernel/delayacct.c +=================================================================== +--- linux-3.2-ck1.orig/kernel/delayacct.c 2012-01-16 10:07:27.897097267 +1100 ++++ linux-3.2-ck1/kernel/delayacct.c 2012-01-16 10:07:31.339097029 +1100 +@@ -130,7 +130,7 @@ int __delayacct_add_tsk(struct taskstats + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +Index: linux-3.2-ck1/kernel/exit.c +=================================================================== +--- linux-3.2-ck1.orig/kernel/exit.c 2012-01-16 10:07:27.897097267 +1100 ++++ linux-3.2-ck1/kernel/exit.c 2012-01-16 10:07:31.339097029 +1100 +@@ -131,7 +131,7 @@ static void __exit_signal(struct task_st + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + } + + sig->nr_threads--; +Index: linux-3.2-ck1/kernel/posix-cpu-timers.c +=================================================================== +--- linux-3.2-ck1.orig/kernel/posix-cpu-timers.c 2012-01-16 10:07:27.897097267 +1100 ++++ linux-3.2-ck1/kernel/posix-cpu-timers.c 2012-01-16 10:07:31.340097028 +1100 +@@ -512,7 +512,7 @@ static void cleanup_timers(struct list_h + void posix_cpu_timers_exit(struct task_struct *tsk) + { + cleanup_timers(tsk->cpu_timers, +- tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); ++ tsk->utime, tsk->stime, tsk_seruntime(tsk)); + + } + void posix_cpu_timers_exit_group(struct task_struct *tsk) +@@ -522,7 +522,7 @@ void posix_cpu_timers_exit_group(struct + cleanup_timers(tsk->signal->cpu_timers, + cputime_add(tsk->utime, sig->utime), + cputime_add(tsk->stime, sig->stime), +- tsk->se.sum_exec_runtime + sig->sum_sched_runtime); ++ tsk_seruntime(tsk) + sig->sum_sched_runtime); + } + + static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) +@@ -953,7 +953,7 @@ static void check_thread_timers(struct t + struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list, + entry); +- if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { ++ if (!--maxfire || tsk_seruntime(tsk) < t->expires.sched) { + tsk->cputime_expires.sched_exp = t->expires.sched; + break; + } +@@ -970,7 +970,7 @@ static void check_thread_timers(struct t + ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); + + if (hard != RLIM_INFINITY && +- tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { ++ tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + /* + * At 
the hard limit, we just die. + * No need to calculate anything else now. +@@ -978,7 +978,7 @@ static void check_thread_timers(struct t + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } +- if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { ++ if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + /* + * At the soft limit, send a SIGXCPU every second. + */ +@@ -1280,7 +1280,7 @@ static inline int fastpath_timer_check(s + struct task_cputime task_sample = { + .utime = tsk->utime, + .stime = tsk->stime, +- .sum_exec_runtime = tsk->se.sum_exec_runtime ++ .sum_exec_runtime = tsk_seruntime(tsk) + }; + + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) +Index: linux-3.2-ck1/kernel/sched_bfs.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-3.2-ck1/kernel/sched_bfs.c 2012-01-16 10:07:31.343097028 +1100 +@@ -0,0 +1,7197 @@ ++/* ++ * kernel/sched_bfs.c, was sched.c ++ * ++ * Kernel scheduler and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and ++ * make semaphores SMP safe ++ * 1998-11-19 Implemented schedule_timeout() and related stuff ++ * by Andrea Arcangeli ++ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: ++ * hybrid priority-list and round-robin design with ++ * an array-switch method of distributing timeslices ++ * and per-CPU runqueues. Cleanups and useful suggestions ++ * by Davide Libenzi, preemptible kernel bits by Robert Love. ++ * 2003-09-03 Interactivity tuning by Con Kolivas. ++ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-04-15 Work begun on replacing all interactivity tuning with a ++ * fair scheduling design by Con Kolivas. ++ * 2007-05-05 Load balancing (smp-nice) and other improvements ++ * by Peter Williams ++ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith ++ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri ++ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, ++ * Thomas Gleixner, Mike Kravetz ++ * now Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#ifdef CONFIG_PARAVIRT ++#include ++#endif ++ ++#include "sched_cpupri.h" ++#include "workqueue_sched.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_queue(rq) rt_prio((rq)->rq_prio) ++#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) ++#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) ++#define iso_task(p) unlikely((p)->policy == SCHED_ISO) ++#define iso_queue(rq) unlikely((rq)->rq_policy == SCHED_ISO) ++#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) ++ ++/* ++ * Convert user-nice values [ -20 ... 0 ... 
19 ] ++ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], ++ * and back. ++ */ ++#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) ++#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) ++#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) ++ ++/* ++ * 'User priority' is the nice value converted to something we ++ * can work with better when scaling various scheduler parameters, ++ * it's a [ 0 ... 39 ] range. ++ */ ++#define USER_PRIO(p) ((p) - MAX_RT_PRIO) ++#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) ++#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) ++#define SCHED_PRIO(p) ((p) + MAX_RT_PRIO) ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) ++#define JIFFY_NS (1000000000 / HZ) ++#define HALF_JIFFY_NS (1000000000 / HZ / 2) ++#define HALF_JIFFY_US (1000000 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run five seconds as real time tasks. This is the total over ++ * all online cpus. ++ */ ++int sched_iso_cpu __read_mostly = 70; ++ ++/* ++ * The relative length of deadline for each priority(nice) level. ++ */ ++static int prio_ratios[PRIO_RANGE] __read_mostly; ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++/* ++ * The global runqueue data that all CPUs work off. Data is protected either ++ * by the global grq lock, or the discrete lock that precedes the data in this ++ * struct. ++ */ ++struct global_rq { ++ raw_spinlock_t lock; ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ unsigned long long nr_switches; ++ struct list_head queue[PRIO_LIMIT]; ++ DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1); ++#ifdef CONFIG_SMP ++ unsigned long qnr; /* queued not running */ ++ cpumask_t cpu_idle_map; ++ bool idle_cpus; ++#endif ++ int noc; /* num_online_cpus stored and updated when it changes */ ++ u64 niffies; /* Nanosecond jiffies */ ++ unsigned long last_jiffy; /* Last jiffy we updated niffies */ ++ ++ raw_spinlock_t iso_lock; ++ int iso_ticks; ++ int iso_refractory; ++}; ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * We add the notion of a root-domain which will be used to define per-domain ++ * variables. Each exclusive cpuset essentially defines an island domain by ++ * fully partitioning the member cpus from any other cpuset. Whenever a new ++ * exclusive cpuset is created, we also create and attach a new root-domain ++ * object. ++ * ++ */ ++struct root_domain { ++ atomic_t refcount; ++ atomic_t rto_count; ++ struct rcu_head rcu; ++ cpumask_var_t span; ++ cpumask_var_t online; ++ ++ /* ++ * The "RT overload" flag: it gets set if a CPU has more than ++ * one runnable RT task. 
++ */ ++ cpumask_var_t rto_mask; ++ struct cpupri cpupri; ++}; ++ ++/* ++ * By default the system creates a single root-domain with all cpus as ++ * members (mimicking the global state we have today). ++ */ ++static struct root_domain def_root_domain; ++ ++#endif /* CONFIG_SMP */ ++ ++/* There can be only one */ ++static struct global_rq grq; ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ ++ u64 nohz_stamp; ++ unsigned char in_nohz_recently; ++#endif ++#endif ++ ++ struct task_struct *curr, *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ /* Stored data about rq->curr to work outside grq lock */ ++ u64 rq_deadline; ++ unsigned int rq_policy; ++ int rq_time_slice; ++ u64 rq_last_ran; ++ int rq_prio; ++ bool rq_running; /* There is a task running */ ++ ++ /* Accurate timekeeping data */ ++ u64 timekeep_clock; ++ unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc, ++ iowait_pc, idle_pc; ++ long account_pc; ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ bool scaling; /* This CPU is managed by a scaling CPU freq governor */ ++ struct task_struct *sticky_task; ++ ++ struct root_domain *rd; ++ struct sched_domain *sd; ++ int *cpu_locality; /* CPU relative cache distance */ ++#ifdef CONFIG_SCHED_SMT ++ bool (*siblings_idle)(int cpu); ++ /* See if all smt siblings are idle */ ++ cpumask_t smt_siblings; ++#endif ++#ifdef CONFIG_SCHED_MC ++ bool (*cache_idle)(int cpu); ++ /* See if all cache siblings are idle */ ++ cpumask_t cache_siblings; ++#endif ++ u64 last_niffy; /* Last time this RQ updated grq.niffies */ ++#endif ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif ++ ++ u64 clock, old_clock, last_tick; ++ u64 clock_task; ++ bool dither; ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif ++}; ++ ++static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++#ifdef CONFIG_SMP ++/* ++ * sched_domains_mutex serialises calls to init_sched_domains, ++ * detach_destroy_domains and partition_sched_domains. ++ */ ++static DEFINE_MUTEX(sched_domains_mutex); ++ ++/* ++ * By default the system creates a single root-domain with all cpus as ++ * members (mimicking the global state we have today). ++ */ ++static struct root_domain def_root_domain; ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#endif ++ ++#define rcu_dereference_check_sched_domain(p) \ ++ rcu_dereference_check((p), \ ++ lockdep_is_held(&sched_domains_mutex)) ++ ++/* ++ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. ++ * See detach_destroy_domains: synchronize_sched for details. ++ * ++ * The domain tree of any CPU may only be accessed from within ++ * preempt-disabled sections. 
++ */ ++#define for_each_domain(cpu, __sd) \ ++ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) ++ ++static inline void update_rq_clock(struct rq *rq); ++ ++/* ++ * Sanity check should sched_clock return bogus values. We make sure it does ++ * not appear to go backwards, and use jiffies to determine the maximum and ++ * minimum it could possibly have increased, and round down to the nearest ++ * jiffy when it falls outside this. ++ */ ++static inline void niffy_diff(s64 *niff_diff, int jiff_diff) ++{ ++ unsigned long min_diff, max_diff; ++ ++ if (jiff_diff > 1) ++ min_diff = JIFFIES_TO_NS(jiff_diff - 1); ++ else ++ min_diff = 1; ++ /* Round up to the nearest tick for maximum */ ++ max_diff = JIFFIES_TO_NS(jiff_diff + 1); ++ ++ if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff)) ++ *niff_diff = min_diff; ++} ++ ++#ifdef CONFIG_SMP ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() (&__get_cpu_var(runqueues)) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++ ++/* ++ * Niffies are a globally increasing nanosecond counter. Whenever a runqueue ++ * clock is updated with the grq.lock held, it is an opportunity to update the ++ * niffies value. Any CPU can update it by adding how much its clock has ++ * increased since it last updated niffies, minus any added niffies by other ++ * CPUs. ++ */ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff; ++ long jdiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ /* old_clock is only updated when we are updating niffies */ ++ rq->old_clock = rq->clock; ++ ndiff -= grq.niffies - rq->last_niffy; ++ jdiff = jiffies - grq.last_jiffy; ++ niffy_diff(&ndiff, jdiff); ++ grq.last_jiffy += jdiff; ++ grq.niffies += ndiff; ++ rq->last_niffy = grq.niffies; ++} ++#else /* CONFIG_SMP */ ++static struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff; ++ long jdiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ rq->old_clock = rq->clock; ++ jdiff = jiffies - grq.last_jiffy; ++ niffy_diff(&ndiff, jdiff); ++ grq.last_jiffy += jdiff; ++ grq.niffies += ndiff; ++} ++#endif ++#define raw_rq() (&__raw_get_cpu_var(runqueues)) ++ ++#include "sched_stats.h" ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_switch ++# define finish_arch_switch(prev) do { } while (0) ++#endif ++ ++/* ++ * All common locking functions performed on grq.lock. rq->clock is local to ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under grq.lock to be safe. 
++ */ ++static void update_rq_clock_task(struct rq *rq, s64 delta); ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++static inline void grq_lock(void) ++ __acquires(grq.lock) ++{ ++ raw_spin_lock(&grq.lock); ++} ++ ++static inline void grq_unlock(void) ++ __releases(grq.lock) ++{ ++ raw_spin_unlock(&grq.lock); ++} ++ ++static inline void grq_lock_irq(void) ++ __acquires(grq.lock) ++{ ++ raw_spin_lock_irq(&grq.lock); ++} ++ ++static inline void time_lock_grq(struct rq *rq) ++ __acquires(grq.lock) ++{ ++ grq_lock(); ++ update_clocks(rq); ++} ++ ++static inline void grq_unlock_irq(void) ++ __releases(grq.lock) ++{ ++ raw_spin_unlock_irq(&grq.lock); ++} ++ ++static inline void grq_lock_irqsave(unsigned long *flags) ++ __acquires(grq.lock) ++{ ++ raw_spin_lock_irqsave(&grq.lock, *flags); ++} ++ ++static inline void grq_unlock_irqrestore(unsigned long *flags) ++ __releases(grq.lock) ++{ ++ raw_spin_unlock_irqrestore(&grq.lock, *flags); ++} ++ ++static inline struct rq ++*task_grq_lock(struct task_struct *p, unsigned long *flags) ++ __acquires(grq.lock) ++{ ++ grq_lock_irqsave(flags); ++ return task_rq(p); ++} ++ ++static inline struct rq ++*time_task_grq_lock(struct task_struct *p, unsigned long *flags) ++ __acquires(grq.lock) ++{ ++ struct rq *rq = task_grq_lock(p, flags); ++ update_clocks(rq); ++ return rq; ++} ++ ++static inline struct rq *task_grq_lock_irq(struct task_struct *p) ++ __acquires(grq.lock) ++{ ++ grq_lock_irq(); ++ return task_rq(p); ++} ++ ++static inline void time_task_grq_lock_irq(struct task_struct *p) ++ __acquires(grq.lock) ++{ ++ struct rq *rq = task_grq_lock_irq(p); ++ update_clocks(rq); ++} ++ ++static inline void task_grq_unlock_irq(void) ++ __releases(grq.lock) ++{ ++ grq_unlock_irq(); ++} ++ ++static inline void task_grq_unlock(unsigned long *flags) ++ __releases(grq.lock) ++{ ++ grq_unlock_irqrestore(flags); ++} ++ ++/** ++ * grunqueue_is_locked ++ * ++ * Returns true if the global runqueue is locked. ++ * This interface allows printk to be called with the runqueue lock ++ * held and know whether or not it is OK to wake up the klogd. ++ */ ++bool grunqueue_is_locked(void) ++{ ++ return raw_spin_is_locked(&grq.lock); ++} ++ ++void grq_unlock_wait(void) ++ __releases(grq.lock) ++{ ++ smp_mb(); /* spin-unlock-wait is not a full memory barrier */ ++ raw_spin_unlock_wait(&grq.lock); ++} ++ ++static inline void time_grq_lock(struct rq *rq, unsigned long *flags) ++ __acquires(grq.lock) ++{ ++ local_irq_save(*flags); ++ time_lock_grq(rq); ++} ++ ++static inline struct rq *__task_grq_lock(struct task_struct *p) ++ __acquires(grq.lock) ++{ ++ grq_lock(); ++ return task_rq(p); ++} ++ ++static inline void __task_grq_unlock(void) ++ __releases(grq.lock) ++{ ++ grq_unlock(); ++} ++ ++/* ++ * Look for any tasks *anywhere* that are running nice 0 or better. We do ++ * this lockless for overhead reasons since the occasional wrong result ++ * is harmless. 
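++ * "Nice 0 or better" is checked as PRIO_TO_NICE(static_prio) < 1, i.e. any
++ * currently running task that is not positively niced counts as foreground
++ * load here.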
++ */ ++int above_background_load(void) ++{ ++ struct task_struct *cpu_curr; ++ unsigned long cpu; ++ ++ for_each_online_cpu(cpu) { ++ cpu_curr = cpu_rq(cpu)->curr; ++ if (unlikely(!cpu_curr)) ++ continue; ++ if (PRIO_TO_NICE(cpu_curr->static_prio) < 1) ++ return 1; ++ } ++ return 0; ++} ++ ++#ifndef __ARCH_WANT_UNLOCKED_CTXSW ++static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ grq.lock.owner = current; ++#endif ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_); ++ ++ grq_unlock_irq(); ++} ++ ++#else /* __ARCH_WANT_UNLOCKED_CTXSW */ ++ ++static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW ++ grq_unlock_irq(); ++#else ++ grq_unlock(); ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++ smp_wmb(); ++#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW ++ local_irq_enable(); ++#endif ++} ++#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ ++ ++static inline bool deadline_before(u64 deadline, u64 time) ++{ ++ return (deadline < time); ++} ++ ++static inline bool deadline_after(u64 deadline, u64 time) ++{ ++ return (deadline > time); ++} ++ ++/* ++ * A task that is queued but not running will be on the grq run list. ++ * A task that is not running or queued will not be on the grq run list. ++ * A task that is currently running will have ->on_cpu set but not on the ++ * grq run list. ++ */ ++static inline bool task_queued(struct task_struct *p) ++{ ++ return (!list_empty(&p->run_list)); ++} ++ ++/* ++ * Removing from the global runqueue. Enter with grq locked. ++ */ ++static void dequeue_task(struct task_struct *p) ++{ ++ list_del_init(&p->run_list); ++ if (list_empty(grq.queue + p->prio)) ++ __clear_bit(p->prio, grq.prio_bitmap); ++} ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as ++ * an idle task, we ensure none of the following conditions are met. ++ */ ++static bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!freezing(p) && !signal_pending(p) && ++ !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); ++} ++ ++/* ++ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check ++ * that the iso_refractory flag is not set. ++ */ ++static bool isoprio_suitable(void) ++{ ++ return !grq.iso_refractory; ++} ++ ++/* ++ * Adding to the global runqueue. Enter with grq locked. 
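++ * Non-rt tasks have their priority re-evaluated here so that SCHED_ISO and
++ * SCHED_IDLEPRIO tasks which are no longer suitable (per the helpers above)
++ * are queued at NORMAL_PRIO rather than at their special priorities.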
++ */ ++static void enqueue_task(struct task_struct *p) ++{ ++ if (!rt_task(p)) { ++ /* Check it hasn't gotten rt from PI */ ++ if ((idleprio_task(p) && idleprio_suitable(p)) || ++ (iso_task(p) && isoprio_suitable())) ++ p->prio = p->normal_prio; ++ else ++ p->prio = NORMAL_PRIO; ++ } ++ __set_bit(p->prio, grq.prio_bitmap); ++ list_add_tail(&p->run_list, grq.queue + p->prio); ++ sched_info_queued(p); ++} ++ ++/* Only idle task does this as a real time task*/ ++static inline void enqueue_task_head(struct task_struct *p) ++{ ++ __set_bit(p->prio, grq.prio_bitmap); ++ list_add(&p->run_list, grq.queue + p->prio); ++ sched_info_queued(p); ++} ++ ++static inline void requeue_task(struct task_struct *p) ++{ ++ sched_info_queued(p); ++} ++ ++/* ++ * Returns the relative length of deadline all compared to the shortest ++ * deadline which is that of nice -20. ++ */ ++static inline int task_prio_ratio(struct task_struct *p) ++{ ++ return prio_ratios[TASK_USER_PRIO(p)]; ++} ++ ++/* ++ * task_timeslice - all tasks of all priorities get the exact same timeslice ++ * length. CPU distribution is handled by giving different deadlines to ++ * tasks of different priorities. Use 128 as the base value for fast shifts. ++ */ ++static inline int task_timeslice(struct task_struct *p) ++{ ++ return (rr_interval * task_prio_ratio(p) / 128); ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * qnr is the "queued but not running" count which is the total number of ++ * tasks on the global runqueue list waiting for cpu time but not actually ++ * currently running on a cpu. ++ */ ++static inline void inc_qnr(void) ++{ ++ grq.qnr++; ++} ++ ++static inline void dec_qnr(void) ++{ ++ grq.qnr--; ++} ++ ++static inline int queued_notrunning(void) ++{ ++ return grq.qnr; ++} ++ ++/* ++ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to ++ * allow easy lookup of whether any suitable idle CPUs are available. ++ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the ++ * idle_cpus variable than to do a full bitmask check when we are busy. ++ */ ++static inline void set_cpuidle_map(int cpu) ++{ ++ if (likely(cpu_online(cpu))) { ++ cpu_set(cpu, grq.cpu_idle_map); ++ grq.idle_cpus = true; ++ } ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++ cpu_clear(cpu, grq.cpu_idle_map); ++ if (cpus_empty(grq.cpu_idle_map)) ++ grq.idle_cpus = false; ++} ++ ++static bool suitable_idle_cpus(struct task_struct *p) ++{ ++ if (!grq.idle_cpus) ++ return false; ++ return (cpus_intersects(p->cpus_allowed, grq.cpu_idle_map)); ++} ++ ++#define CPUIDLE_DIFF_THREAD (1) ++#define CPUIDLE_DIFF_CORE (2) ++#define CPUIDLE_CACHE_BUSY (4) ++#define CPUIDLE_DIFF_CPU (8) ++#define CPUIDLE_THREAD_BUSY (16) ++#define CPUIDLE_DIFF_NODE (32) ++ ++static void resched_task(struct task_struct *p); ++ ++/* ++ * The best idle CPU is chosen according to the CPUIDLE ranking above where the ++ * lowest value would give the most suitable CPU to schedule p onto next. The ++ * order works out to be the following: ++ * ++ * Same core, idle or busy cache, idle threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Same core, busy threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. 
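++ *
++ * As an illustration of how the flag values combine (plain arithmetic on the
++ * CPUIDLE_* defines above): an SMT sibling on the same core whose threads
++ * are busy ranks CPUIDLE_DIFF_THREAD | CPUIDLE_THREAD_BUSY = 17, while a
++ * fully idle CPU on another NUMA node ranks at least CPUIDLE_DIFF_NODE = 32,
++ * so the local sibling is still preferred despite its busy threads.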
++ */ ++static void ++resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask) ++{ ++ unsigned int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | ++ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | ++ CPUIDLE_DIFF_THREAD; ++ int cpu_tmp; ++ ++ if (cpu_isset(best_cpu, *tmpmask)) ++ goto out; ++ ++ for_each_cpu_mask(cpu_tmp, *tmpmask) { ++ unsigned int ranking; ++ struct rq *tmp_rq; ++ ++ ranking = 0; ++ tmp_rq = cpu_rq(cpu_tmp); ++ ++#ifdef CONFIG_NUMA ++ if (rq->cpu_locality[cpu_tmp] > 3) ++ ranking |= CPUIDLE_DIFF_NODE; ++ else ++#endif ++ if (rq->cpu_locality[cpu_tmp] > 2) ++ ranking |= CPUIDLE_DIFF_CPU; ++#ifdef CONFIG_SCHED_MC ++ if (rq->cpu_locality[cpu_tmp] == 2) ++ ranking |= CPUIDLE_DIFF_CORE; ++ if (!(tmp_rq->cache_idle(cpu_tmp))) ++ ranking |= CPUIDLE_CACHE_BUSY; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (rq->cpu_locality[cpu_tmp] == 1) ++ ranking |= CPUIDLE_DIFF_THREAD; ++ if (!(tmp_rq->siblings_idle(cpu_tmp))) ++ ranking |= CPUIDLE_THREAD_BUSY; ++#endif ++ if (ranking < best_ranking) { ++ best_cpu = cpu_tmp; ++ best_ranking = ranking; ++ } ++ } ++out: ++ resched_task(cpu_rq(best_cpu)->curr); ++} ++ ++static void resched_best_idle(struct task_struct *p) ++{ ++ cpumask_t tmpmask; ++ ++ cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); ++ resched_best_mask(task_cpu(p), task_rq(p), &tmpmask); ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p); ++} ++/* ++ * Flags to tell us whether this CPU is running a CPU frequency governor that ++ * has slowed its speed or not. No locking required as the very rare wrongly ++ * read value would be harmless. ++ */ ++void cpu_scaling(int cpu) ++{ ++ cpu_rq(cpu)->scaling = true; ++} ++ ++void cpu_nonscaling(int cpu) ++{ ++ cpu_rq(cpu)->scaling = false; ++} ++ ++static inline bool scaling_rq(struct rq *rq) ++{ ++ return rq->scaling; ++} ++#else /* CONFIG_SMP */ ++static inline void inc_qnr(void) ++{ ++} ++ ++static inline void dec_qnr(void) ++{ ++} ++ ++static inline int queued_notrunning(void) ++{ ++ return grq.nr_running; ++} ++ ++static inline void set_cpuidle_map(int cpu) ++{ ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++} ++ ++static inline bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return uprq->curr == uprq->idle; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++} ++ ++void cpu_scaling(int __unused) ++{ ++} ++ ++void cpu_nonscaling(int __unused) ++{ ++} ++ ++/* ++ * Although CPUs can scale in UP, there is nowhere else for tasks to go so this ++ * always returns 0. ++ */ ++static inline bool scaling_rq(struct rq *rq) ++{ ++ return false; ++} ++#endif /* CONFIG_SMP */ ++EXPORT_SYMBOL_GPL(cpu_scaling); ++EXPORT_SYMBOL_GPL(cpu_nonscaling); ++ ++/* ++ * activate_idle_task - move idle task to the _front_ of runqueue. ++ */ ++static inline void activate_idle_task(struct task_struct *p) ++{ ++ enqueue_task_head(p); ++ grq.nr_running++; ++ inc_qnr(); ++} ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ if (idleprio_task(p)) ++ return IDLE_PRIO; ++ if (iso_task(p)) ++ return ISO_PRIO; ++ return NORMAL_PRIO; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. 
++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. Enter with grq locked. ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ update_clocks(rq); ++ ++ /* ++ * Sleep time is in units of nanosecs, so shift by 20 to get a ++ * milliseconds-range estimation of the amount of time that the task ++ * spent sleeping: ++ */ ++ if (unlikely(prof_on == SLEEP_PROFILING)) { ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), ++ (rq->clock - p->last_ran) >> 20); ++ } ++ ++ p->prio = effective_prio(p); ++ if (task_contributes_to_load(p)) ++ grq.nr_uninterruptible--; ++ enqueue_task(p); ++ grq.nr_running++; ++ inc_qnr(); ++} ++ ++static inline void clear_sticky(struct task_struct *p); ++ ++/* ++ * deactivate_task - If it's running, it's not on the grq and we can just ++ * decrement the nr_running. Enter with grq locked. ++ */ ++static inline void deactivate_task(struct task_struct *p) ++{ ++ if (task_contributes_to_load(p)) ++ grq.nr_uninterruptible++; ++ grq.nr_running--; ++ clear_sticky(p); ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold grq lock. ++ */ ++ WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock)); ++#endif ++ trace_sched_migrate_task(p, cpu); ++ if (task_cpu(p) != cpu) ++ perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); ++ ++ /* ++ * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ task_thread_info(p)->cpu = cpu; ++} ++ ++static inline void clear_sticky(struct task_struct *p) ++{ ++ p->sticky = false; ++} ++ ++static inline bool task_sticky(struct task_struct *p) ++{ ++ return p->sticky; ++} ++ ++/* Reschedule the best idle CPU that is not this one. */ ++static void ++resched_closest_idle(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ cpumask_t tmpmask; ++ ++ cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); ++ cpu_clear(cpu, tmpmask); ++ if (cpus_empty(tmpmask)) ++ return; ++ resched_best_mask(cpu, rq, &tmpmask); ++} ++ ++/* ++ * We set the sticky flag on a task that is descheduled involuntarily meaning ++ * it is awaiting further CPU time. If the last sticky task is still sticky ++ * but unlucky enough to not be the next task scheduled, we unstick it and try ++ * to find it an idle CPU. Realtime tasks do not stick to minimise their ++ * latency at all times. 
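++ * In swap_sticky() below this means a non-rt task simply becomes this rq's
++ * sticky task, while an rt task has the nearest other idle CPU rescheduled
++ * on its behalf and the sticky slot cleared.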
++ */ ++static inline void ++swap_sticky(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ if (rq->sticky_task) { ++ if (rq->sticky_task == p) { ++ p->sticky = true; ++ return; ++ } ++ if (task_sticky(rq->sticky_task)) { ++ clear_sticky(rq->sticky_task); ++ resched_closest_idle(rq, cpu, rq->sticky_task); ++ } ++ } ++ if (!rt_task(p)) { ++ p->sticky = true; ++ rq->sticky_task = p; ++ } else { ++ resched_closest_idle(rq, cpu, p); ++ rq->sticky_task = NULL; ++ } ++} ++ ++static inline void unstick_task(struct rq *rq, struct task_struct *p) ++{ ++ rq->sticky_task = NULL; ++ clear_sticky(p); ++} ++#else ++static inline void clear_sticky(struct task_struct *p) ++{ ++} ++ ++static inline bool task_sticky(struct task_struct *p) ++{ ++ return false; ++} ++ ++static inline void ++swap_sticky(struct rq *rq, int cpu, struct task_struct *p) ++{ ++} ++ ++static inline void unstick_task(struct rq *rq, struct task_struct *p) ++{ ++} ++#endif ++ ++/* ++ * Move a task off the global queue and take it to a cpu for it will ++ * become the running task. ++ */ ++static inline void take_task(int cpu, struct task_struct *p) ++{ ++ set_task_cpu(p, cpu); ++ dequeue_task(p); ++ clear_sticky(p); ++ dec_qnr(); ++} ++ ++/* ++ * Returns a descheduling task to the grq runqueue unless it is being ++ * deactivated. ++ */ ++static inline void return_task(struct task_struct *p, int deactivate) ++{ ++ if (deactivate) ++ deactivate_task(p); ++ else { ++ inc_qnr(); ++ enqueue_task(p); ++ } ++} ++ ++/* ++ * resched_task - mark a task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++#ifdef CONFIG_SMP ++ ++#ifndef tsk_is_polling ++#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) ++#endif ++ ++static void resched_task(struct task_struct *p) ++{ ++ int cpu; ++ ++ assert_raw_spin_locked(&grq.lock); ++ ++ if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) ++ return; ++ ++ set_tsk_thread_flag(p, TIF_NEED_RESCHED); ++ ++ cpu = task_cpu(p); ++ if (cpu == smp_processor_id()) ++ return; ++ ++ /* NEED_RESCHED must be visible before we test polling */ ++ smp_mb(); ++ if (!tsk_is_polling(p)) ++ smp_send_reschedule(cpu); ++} ++ ++#else ++static inline void resched_task(struct task_struct *p) ++{ ++ assert_raw_spin_locked(&grq.lock); ++ set_tsk_need_resched(p); ++} ++#endif ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++struct migration_req { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. 
This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ ++ for (;;) { ++ /* ++ * We do the initial early heuristics without holding ++ * any task-queue locks at all. We'll only try to get ++ * the runqueue lock when things look like they will ++ * work out! In the unlikely event rq is dereferenced ++ * since we're lockless, grab it again. ++ */ ++#ifdef CONFIG_SMP ++retry_rq: ++ rq = task_rq(p); ++ if (unlikely(!rq)) ++ goto retry_rq; ++#else /* CONFIG_SMP */ ++ rq = task_rq(p); ++#endif ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the grq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ rq = task_grq_lock(p, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = task_queued(p); ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_grq_unlock(&flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ); ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. 
++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++#endif ++ ++#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) ++ ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. ++ */ ++static inline bool ++can_preempt(struct task_struct *p, int prio, u64 deadline) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return true; ++ if (p->prio > prio) ++ return false; ++ /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return false; ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Check to see if there is a task that is affined only to offline CPUs but ++ * still wants runtime. This happens to kernel threads during suspend/halt and ++ * disabling of CPUs. ++ */ ++static inline bool online_cpus(struct task_struct *p) ++{ ++ return (likely(cpus_intersects(cpu_online_map, p->cpus_allowed))); ++} ++#else /* CONFIG_HOTPLUG_CPU */ ++/* All available CPUs are always online without hotplug. */ ++static inline bool online_cpus(struct task_struct *p) ++{ ++ return true; ++} ++#endif ++ ++/* ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. ++ */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) ++ return true; ++ return false; ++} ++ ++/* ++ * When all else is equal, still prefer this_rq. ++ */ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ struct rq *highest_prio_rq; ++ int cpu, highest_prio; ++ u64 latest_deadline; ++ cpumask_t tmp; ++ ++ /* ++ * We clear the sticky flag here because for a task to have called ++ * try_preempt with the sticky flag enabled means some complicated ++ * re-scheduling has occurred and we should ignore the sticky flag. 
++ */ ++ clear_sticky(p); ++ ++ if (suitable_idle_cpus(p)) { ++ resched_best_idle(p); ++ return; ++ } ++ ++ /* IDLEPRIO tasks never preempt anything */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ ++ if (likely(online_cpus(p))) ++ cpus_and(tmp, cpu_online_map, p->cpus_allowed); ++ else ++ return; ++ ++ highest_prio = p->prio; ++ highest_prio_rq = this_rq; ++ latest_deadline = this_rq->rq_deadline; ++ ++ for_each_cpu_mask(cpu, tmp) { ++ struct rq *rq; ++ int rq_prio; ++ ++ rq = cpu_rq(cpu); ++ rq_prio = rq->rq_prio; ++ if (rq_prio < highest_prio) ++ continue; ++ ++ if (rq_prio > highest_prio || ++ deadline_after(rq->rq_deadline, latest_deadline)) { ++ latest_deadline = rq->rq_deadline; ++ highest_prio = rq_prio; ++ highest_prio_rq = rq; ++ } ++ } ++ ++ if (!can_preempt(p, highest_prio, highest_prio_rq->rq_deadline)) ++ return; ++ ++ resched_task(highest_prio_rq->curr); ++} ++#else /* CONFIG_SMP */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return false; ++} ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) ++ resched_task(uprq->curr); ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++#ifdef CONFIG_SCHEDSTATS ++ struct rq *rq = this_rq(); ++ ++#ifdef CONFIG_SMP ++ int this_cpu = smp_processor_id(); ++ ++ if (cpu == this_cpu) ++ schedstat_inc(rq, ttwu_local); ++ else { ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(this_cpu, sd) { ++ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ schedstat_inc(sd, ttwu_wake_remote); ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ schedstat_inc(rq, ttwu_count); ++#endif /* CONFIG_SCHEDSTATS */ ++} ++ ++static inline void ttwu_activate(struct task_struct *p, struct rq *rq, ++ bool is_sync) ++{ ++ activate_task(p, rq); ++ ++ /* ++ * Sync wakeups (i.e. those types of wakeups where the waker ++ * has indicated that it will leave the CPU in short order) ++ * don't trigger a preemption if there are no idle cpus, ++ * instead waiting for current to deschedule. ++ */ ++ if (!is_sync || suitable_idle_cpus(p)) ++ try_preempt(p, rq); ++} ++ ++static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, ++ bool success) ++{ ++ trace_sched_wakeup(p, success); ++ p->state = TASK_RUNNING; ++ ++ /* ++ * if a worker is waking up, notify workqueue. Note that on BFS, we ++ * don't really know what cpu it will be, so we fake it for ++ * wq_worker_waking_up :/ ++ */ ++ if ((p->flags & PF_WQ_WORKER) && success) ++ wq_worker_waking_up(p, cpu_of(rq)); ++} ++ ++#ifdef CONFIG_SMP ++void scheduler_ipi(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Returns %true if @p was woken up, %false if it was already running ++ * or @state didn't match @p's state. 
++ */
++static bool try_to_wake_up(struct task_struct *p, unsigned int state,
++ int wake_flags)
++{
++ bool success = false;
++ unsigned long flags;
++ struct rq *rq;
++ int cpu;
++
++ get_cpu();
++
++ /* This barrier is undocumented, probably for p->state? */
++ smp_wmb();
++
++ /*
++ * No need to do time_lock_grq as we only need to update the rq clock
++ * if we activate the task
++ */
++ rq = task_grq_lock(p, &flags);
++ cpu = task_cpu(p);
++
++ /* state is a volatile long, why that is I don't know */
++ if (!((unsigned int)p->state & state))
++ goto out_unlock;
++
++ if (task_queued(p) || task_running(p))
++ goto out_running;
++
++ ttwu_activate(p, rq, wake_flags & WF_SYNC);
++ success = true;
++
++out_running:
++ ttwu_post_activation(p, rq, success);
++out_unlock:
++ task_grq_unlock(&flags);
++
++ ttwu_stat(p, cpu, wake_flags);
++
++ put_cpu();
++
++ return success;
++}
++
++/**
++ * try_to_wake_up_local - try to wake up a local task with grq lock held
++ * @p: the thread to be awakened
++ *
++ * Put @p on the run-queue if it's not already there. The caller must
++ * ensure that grq is locked and @p is not the current task.
++ * grq stays locked over invocation.
++ */
++static void try_to_wake_up_local(struct task_struct *p)
++{
++ struct rq *rq = task_rq(p);
++ bool success = false;
++
++ lockdep_assert_held(&grq.lock);
++
++ if (!(p->state & TASK_NORMAL))
++ return;
++
++ if (!task_queued(p)) {
++ if (likely(!task_running(p))) {
++ schedstat_inc(rq, ttwu_count);
++ schedstat_inc(rq, ttwu_local);
++ }
++ ttwu_activate(p, rq, false);
++ ttwu_stat(p, smp_processor_id(), 0);
++ success = true;
++ }
++ ttwu_post_activation(p, rq, success);
++}
++
++/**
++ * wake_up_process - Wake up a specific process
++ * @p: The process to be woken up.
++ *
++ * Attempt to wake up the nominated process and move it to the set of runnable
++ * processes. Returns 1 if the process was woken up, 0 if it was already
++ * running.
++ *
++ * It may be assumed that this function implies a write memory barrier before
++ * changing the task state if and only if any tasks are woken up.
++ */
++int wake_up_process(struct task_struct *p)
++{
++ return try_to_wake_up(p, TASK_ALL, 0);
++}
++EXPORT_SYMBOL(wake_up_process);
++
++int wake_up_state(struct task_struct *p, unsigned int state)
++{
++ return try_to_wake_up(p, state, 0);
++}
++
++static void time_slice_expired(struct task_struct *p);
++
++/*
++ * Perform scheduler related setup for a newly forked process p.
++ * p is forked by current.
++ */
++void sched_fork(struct task_struct *p)
++{
++ struct task_struct *curr;
++ int cpu = get_cpu();
++ struct rq *rq;
++
++#ifdef CONFIG_PREEMPT_NOTIFIERS
++ INIT_HLIST_HEAD(&p->preempt_notifiers);
++#endif
++ /*
++ * We mark the process as running here. This guarantees that
++ * nobody will actually run it, and a signal or other external
++ * event cannot wake it up and insert it on the runqueue either.
++ */
++ p->state = TASK_RUNNING;
++ set_task_cpu(p, cpu);
++
++ /* Should be reset in fork.c but done here for ease of bfs patching */
++ p->sched_time = p->stime_pc = p->utime_pc = 0;
++
++ /*
++ * Revert to default priority/policy on fork if requested.
++ */
++ if (unlikely(p->sched_reset_on_fork)) {
++ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
++ p->policy = SCHED_NORMAL;
++ p->normal_prio = normal_prio(p);
++ }
++
++ if (PRIO_TO_NICE(p->static_prio) < 0) {
++ p->static_prio = NICE_TO_PRIO(0);
++ p->normal_prio = p->static_prio;
++ }
++
++ /*
++ * We don't need the reset flag anymore after the fork.
It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ curr = current; ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = curr->normal_prio; ++ ++ INIT_LIST_HEAD(&p->run_list); ++#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ ++ p->on_cpu = false; ++ clear_sticky(p); ++ ++#ifdef CONFIG_PREEMPT_COUNT ++ /* Want to start with kernel preemption disabled. */ ++ task_thread_info(p)->preempt_count = 1; ++#endif ++ if (unlikely(p->policy == SCHED_FIFO)) ++ goto out; ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. If it's negative, it won't ++ * matter since that's the same as being 0. current's time_slice is ++ * actually in rq_time_slice when it's running, as is its last_ran ++ * value. rq->rq_deadline is only modified within schedule() so it ++ * is always equal to current->deadline. ++ */ ++ rq = task_grq_lock_irq(curr); ++ if (likely(rq->rq_time_slice >= RESCHED_US * 2)) { ++ rq->rq_time_slice /= 2; ++ p->time_slice = rq->rq_time_slice; ++ } else { ++ /* ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. ++ */ ++ rq->rq_time_slice = 0; ++ set_tsk_need_resched(curr); ++ time_slice_expired(p); ++ } ++ p->last_ran = rq->rq_last_ran; ++ task_grq_unlock_irq(); ++out: ++ put_cpu(); ++} ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ struct task_struct *parent; ++ unsigned long flags; ++ struct rq *rq; ++ ++ rq = task_grq_lock(p, &flags); ++ p->state = TASK_RUNNING; ++ parent = p->parent; ++ /* Unnecessary but small chance that the parent changed CPU */ ++ set_task_cpu(p, task_cpu(parent)); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p, 1); ++ if (rq->curr == parent && !suitable_idle_cpus(p)) { ++ /* ++ * The VM isn't cloned, so we're in a good position to ++ * do child-runs-first in anticipation of an exec. This ++ * usually avoids a lot of COW overhead. ++ */ ++ resched_task(parent); ++ } else ++ try_preempt(p, rq); ++ task_grq_unlock(&flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is safe to call from within a preemption notifier. 
++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ struct hlist_node *node; ++ ++ hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ struct hlist_node *node; ++ ++ hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ sched_info_switch(prev, next); ++ perf_event_task_sched_out(prev, next); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_lock_switch(rq, next); ++ prepare_arch_switch(next); ++ trace_sched_switch(prev, next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ */ ++static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) ++ __releases(grq.lock) ++{ ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * The test for TASK_DEAD must occur while the runqueue locks are ++ * still held, otherwise prev could be scheduled on another cpu, die ++ * there before we look at prev->state, and then the reference would ++ * be dropped twice. 
++ * Manfred Spraul ++ */ ++ prev_state = prev->state; ++ finish_arch_switch(prev); ++#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW ++ local_irq_disable(); ++#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ ++ perf_event_task_sched_in(prev, current); ++#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW ++ local_irq_enable(); ++#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ ++ finish_lock_switch(rq, prev); ++ ++ fire_sched_in_preempt_notifiers(current); ++ if (mm) ++ mmdrop(mm); ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ put_task_struct(prev); ++ } ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage void schedule_tail(struct task_struct *prev) ++ __releases(grq.lock) ++{ ++ struct rq *rq = this_rq(); ++ ++ finish_task_switch(rq, prev); ++#ifdef __ARCH_WANT_UNLOCKED_CTXSW ++ /* In this case, finish_task_switch does not reenable preemption */ ++ preempt_enable(); ++#endif ++ if (current->set_child_tid) ++ put_user(current->pid, current->set_child_tid); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new ++ * thread's register state. ++ */ ++static inline void ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ struct mm_struct *mm, *oldmm; ++ ++ prepare_task_switch(rq, prev, next); ++ ++ mm = next->mm; ++ oldmm = prev->active_mm; ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ if (!mm) { ++ next->active_mm = oldmm; ++ atomic_inc(&oldmm->mm_count); ++ enter_lazy_tlb(oldmm, next); ++ } else ++ switch_mm(oldmm, mm, next); ++ ++ if (!prev->mm) { ++ prev->active_mm = NULL; ++ rq->prev_mm = oldmm; ++ } ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++#ifndef __ARCH_WANT_UNLOCKED_CTXSW ++ spin_release(&grq.lock.dep_map, 1, _THIS_IP_); ++#endif ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ ++ barrier(); ++ /* ++ * this_rq must be evaluated again because prev may have moved ++ * CPUs since it called schedule(), thus the 'rq' on its stack ++ * frame will be invalid. ++ */ ++ finish_task_switch(this_rq(), prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, current number of uninterruptible-sleeping threads, total ++ * number of context switches performed since bootup. All are measured ++ * without grabbing the grq lock but the occasional inaccurate result ++ * doesn't matter so long as it's positive. 
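++ * The helpers below therefore just clamp a transiently negative reading
++ * before returning it rather than taking the lock for an exact figure.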
++ */ ++unsigned long nr_running(void) ++{ ++ long nr = grq.nr_running; ++ ++ if (unlikely(nr < 0)) ++ nr = 0; ++ return (unsigned long)nr; ++} ++ ++unsigned long nr_uninterruptible(void) ++{ ++ long nu = grq.nr_uninterruptible; ++ ++ if (unlikely(nu < 0)) ++ nu = 0; ++ return nu; ++} ++ ++unsigned long long nr_context_switches(void) ++{ ++ long long ns = grq.nr_switches; ++ ++ /* This is of course impossible */ ++ if (unlikely(ns < 0)) ++ ns = 1; ++ return (unsigned long long)ns; ++} ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += atomic_read(&cpu_rq(i)->nr_iowait); ++ ++ return sum; ++} ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ struct rq *this = cpu_rq(cpu); ++ return atomic_read(&this->nr_iowait); ++} ++ ++unsigned long nr_active(void) ++{ ++ return nr_running() + nr_uninterruptible(); ++} ++ ++/* Beyond a task running on this CPU, load is equal everywhere on BFS */ ++unsigned long this_cpu_load(void) ++{ ++ return this_rq()->rq_running + ++ ((queued_notrunning() + nr_uninterruptible()) / grq.noc); ++} ++ ++/* Variables and functions for calc_load */ ++static unsigned long calc_load_update; ++unsigned long avenrun[3]; ++EXPORT_SYMBOL(avenrun); ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. ++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} ++ ++static unsigned long ++calc_load(unsigned long load, unsigned long exp, unsigned long active) ++{ ++ load *= exp; ++ load += active * (FIXED_1 - exp); ++ return load >> FSHIFT; ++} ++ ++/* ++ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. ++ */ ++void calc_global_load(unsigned long ticks) ++{ ++ long active; ++ ++ if (time_before(jiffies, calc_load_update)) ++ return; ++ active = nr_active() * FIXED_1; ++ ++ avenrun[0] = calc_load(avenrun[0], EXP_1, active); ++ avenrun[1] = calc_load(avenrun[1], EXP_5, active); ++ avenrun[2] = calc_load(avenrun[2], EXP_15, active); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ ++/* ++ * There are no locks covering percpu hardirq/softirq time. ++ * They are only modified in account_system_vtime, on corresponding CPU ++ * with interrupts disabled. So, writes are safe. ++ * They are read and saved off onto struct rq in update_rq_clock(). ++ * This may result in other CPU reading this CPU's irq time and can ++ * race with irq/account_system_vtime on this CPU. We would either get old ++ * or new value with a side effect of accounting a slice of irq time to wrong ++ * task when irq is in progress while we read rq->clock. That is a worthy ++ * compromise in place of having locks on each irq in account_system_time. 
++ */ ++static DEFINE_PER_CPU(u64, cpu_hardirq_time); ++static DEFINE_PER_CPU(u64, cpu_softirq_time); ++ ++static DEFINE_PER_CPU(u64, irq_start_time); ++static int sched_clock_irqtime; ++ ++void enable_sched_clock_irqtime(void) ++{ ++ sched_clock_irqtime = 1; ++} ++ ++void disable_sched_clock_irqtime(void) ++{ ++ sched_clock_irqtime = 0; ++} ++ ++#ifndef CONFIG_64BIT ++static DEFINE_PER_CPU(seqcount_t, irq_time_seq); ++ ++static inline void irq_time_write_begin(void) ++{ ++ __this_cpu_inc(irq_time_seq.sequence); ++ smp_wmb(); ++} ++ ++static inline void irq_time_write_end(void) ++{ ++ smp_wmb(); ++ __this_cpu_inc(irq_time_seq.sequence); ++} ++ ++static inline u64 irq_time_read(int cpu) ++{ ++ u64 irq_time; ++ unsigned seq; ++ ++ do { ++ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); ++ irq_time = per_cpu(cpu_softirq_time, cpu) + ++ per_cpu(cpu_hardirq_time, cpu); ++ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); ++ ++ return irq_time; ++} ++#else /* CONFIG_64BIT */ ++static inline void irq_time_write_begin(void) ++{ ++} ++ ++static inline void irq_time_write_end(void) ++{ ++} ++ ++static inline u64 irq_time_read(int cpu) ++{ ++ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); ++} ++#endif /* CONFIG_64BIT */ ++ ++/* ++ * Called before incrementing preempt_count on {soft,}irq_enter ++ * and before decrementing preempt_count on {soft,}irq_exit. ++ */ ++void account_system_vtime(struct task_struct *curr) ++{ ++ unsigned long flags; ++ s64 delta; ++ int cpu; ++ ++ if (!sched_clock_irqtime) ++ return; ++ ++ local_irq_save(flags); ++ ++ cpu = smp_processor_id(); ++ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); ++ __this_cpu_add(irq_start_time, delta); ++ ++ irq_time_write_begin(); ++ /* ++ * We do not account for softirq time from ksoftirqd here. ++ * We want to continue accounting softirq time to ksoftirqd thread ++ * in that case, so as not to confuse scheduler with a special task ++ * that do not consume any time, but still wants to run. ++ */ ++ if (hardirq_count()) ++ __this_cpu_add(cpu_hardirq_time, delta); ++ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) ++ __this_cpu_add(cpu_softirq_time, delta); ++ ++ irq_time_write_end(); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL_GPL(account_system_vtime); ++ ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_PARAVIRT ++static inline u64 steal_ticks(u64 steal) ++{ ++ if (unlikely(steal > NSEC_PER_SEC)) ++ return div_u64(steal, TICK_NSEC); ++ ++ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); ++} ++#endif ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
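++ *
++ * Hence the clamp at the top of the code below: irq_delta is capped at
++ * delta so the remaining delta added to clock_task can never be negative.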
++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_branch((¶virt_steal_rq_enabled))) { ++ u64 st, steal = paravirt_steal_clock(cpu_of(rq)); ++ ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ st = steal_ticks(steal); ++ steal = st * TICK_NSEC; ++ ++ rq->prev_steal_time_rq += steal; ++ ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++} ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++static void irqtime_account_hi_si(void) ++{ ++ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; ++ u64 latest_ns; ++ s64 ns_diff; ++ ++ latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)); ++ ns_diff = latest_ns - cpustat->irq; ++ if (ns_diff > 0) ++ cpustat->irq = cputime64_add(cpustat->irq, ns_diff); ++ ++ latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)); ++ ns_diff = latest_ns - cpustat->softirq; ++ if (ns_diff > 0) ++ cpustat->softirq = cputime64_add(cpustat->softirq, ns_diff); ++} ++#else /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#define sched_clock_irqtime (0) ++ ++static inline void irqtime_account_hi_si(void) ++{ ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++static __always_inline bool steal_account_process_tick(void) ++{ ++#ifdef CONFIG_PARAVIRT ++ if (static_branch(¶virt_steal_enabled)) { ++ u64 steal, st = 0; ++ ++ steal = paravirt_steal_clock(smp_processor_id()); ++ steal -= this_rq()->prev_steal_time; ++ ++ st = steal_ticks(steal); ++ this_rq()->prev_steal_time += st * TICK_NSEC; ++ ++ account_steal_time(st); ++ return st; ++ } ++#endif ++ return false; ++} ++ ++/* ++ * On each tick, see what percentage of that tick was attributed to each ++ * component and add the percentage to the _pc values. Once a _pc value has ++ * accumulated one tick's worth, account for that. This means the total ++ * percentage of load components will always be 128 (pseudo 100) per tick. 
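++ *
++ * For example, a tick spent roughly one quarter in userspace and the rest
++ * idle adds 32 to user_pc and 96 to idle_pc; whenever a counter reaches 128
++ * a whole jiffy is credited to the matching cpustat field and only the
++ * remainder is kept (the "%= 128" in the helpers below).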
++ */ ++static void pc_idle_time(struct rq *rq, unsigned long pc) ++{ ++ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; ++ cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); ++ ++ if (atomic_read(&rq->nr_iowait) > 0) { ++ rq->iowait_pc += pc; ++ if (rq->iowait_pc >= 128) { ++ rq->iowait_pc %= 128; ++ cpustat->iowait = cputime64_add(cpustat->iowait, tmp); ++ } ++ } else { ++ rq->idle_pc += pc; ++ if (rq->idle_pc >= 128) { ++ rq->idle_pc %= 128; ++ cpustat->idle = cputime64_add(cpustat->idle, tmp); ++ } ++ } ++} ++ ++static void ++pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset, ++ unsigned long pc, unsigned long ns) ++{ ++ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; ++ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); ++ cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); ++ ++ p->stime_pc += pc; ++ if (p->stime_pc >= 128) { ++ p->stime_pc %= 128; ++ p->stime = cputime_add(p->stime, cputime_one_jiffy); ++ p->stimescaled = cputime_add(p->stimescaled, one_jiffy_scaled); ++ account_group_system_time(p, cputime_one_jiffy); ++ acct_update_integrals(p); ++ } ++ p->sched_time += ns; ++ ++ if (hardirq_count() - hardirq_offset) { ++ rq->irq_pc += pc; ++ if (rq->irq_pc >= 128) { ++ rq->irq_pc %= 128; ++ cpustat->irq = cputime64_add(cpustat->irq, tmp); ++ } ++ } else if (in_serving_softirq()) { ++ rq->softirq_pc += pc; ++ if (rq->softirq_pc >= 128) { ++ rq->softirq_pc %= 128; ++ cpustat->softirq = cputime64_add(cpustat->softirq, tmp); ++ } ++ } else { ++ rq->system_pc += pc; ++ if (rq->system_pc >= 128) { ++ rq->system_pc %= 128; ++ cpustat->system = cputime64_add(cpustat->system, tmp); ++ } ++ } ++} ++ ++static void pc_user_time(struct rq *rq, struct task_struct *p, ++ unsigned long pc, unsigned long ns) ++{ ++ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; ++ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); ++ cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); ++ ++ p->utime_pc += pc; ++ if (p->utime_pc >= 128) { ++ p->utime_pc %= 128; ++ p->utime = cputime_add(p->utime, cputime_one_jiffy); ++ p->utimescaled = cputime_add(p->utimescaled, one_jiffy_scaled); ++ account_group_user_time(p, cputime_one_jiffy); ++ acct_update_integrals(p); ++ } ++ p->sched_time += ns; ++ ++ if (this_cpu_ksoftirqd() == p) { ++ /* ++ * ksoftirqd time do not get accounted in cpu_softirq_time. ++ * So, we have to handle it separately here. ++ */ ++ rq->softirq_pc += pc; ++ if (rq->softirq_pc >= 128) { ++ rq->softirq_pc %= 128; ++ cpustat->softirq = cputime64_add(cpustat->softirq, tmp); ++ } ++ } ++ ++ if (TASK_NICE(p) > 0 || idleprio_task(p)) { ++ rq->nice_pc += pc; ++ if (rq->nice_pc >= 128) { ++ rq->nice_pc %= 128; ++ cpustat->nice = cputime64_add(cpustat->nice, tmp); ++ } ++ } else { ++ rq->user_pc += pc; ++ if (rq->user_pc >= 128) { ++ rq->user_pc %= 128; ++ cpustat->user = cputime64_add(cpustat->user, tmp); ++ } ++ } ++} ++ ++/* ++ * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast ++ * shifts instead of 100 ++ */ ++#define NS_TO_PC(NS) (NS * 128 / JIFFY_NS) ++ ++/* ++ * This is called on clock ticks and on context switches. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. 
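++ *
++ * As a worked example of NS_TO_PC() above, assuming HZ=250 (a 4ms jiffy):
++ * 1,000,000ns of elapsed time converts to 1,000,000 * 128 / 4,000,000 = 32
++ * pseudo-percent, i.e. one quarter of the 128 that make up a full tick.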
++ */ ++static void ++update_cpu_clock(struct rq *rq, struct task_struct *p, int tick) ++{ ++ long account_ns = rq->clock - rq->timekeep_clock; ++ struct task_struct *idle = rq->idle; ++ unsigned long account_pc; ++ ++ if (unlikely(account_ns < 0)) ++ account_ns = 0; ++ ++ account_pc = NS_TO_PC(account_ns); ++ ++ if (tick) { ++ int user_tick; ++ ++ /* Accurate tick timekeeping */ ++ rq->account_pc += account_pc - 128; ++ if (rq->account_pc < 0) { ++ /* ++ * Small errors in micro accounting may not make the ++ * accounting add up to 128 each tick so we keep track ++ * of the percentage and round it up when less than 128 ++ */ ++ account_pc += -rq->account_pc; ++ rq->account_pc = 0; ++ } ++ if (steal_account_process_tick()) ++ goto ts_account; ++ ++ user_tick = user_mode(get_irq_regs()); ++ ++ if (user_tick) ++ pc_user_time(rq, p, account_pc, account_ns); ++ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) ++ pc_system_time(rq, p, HARDIRQ_OFFSET, ++ account_pc, account_ns); ++ else ++ pc_idle_time(rq, account_pc); ++ ++ if (sched_clock_irqtime) ++ irqtime_account_hi_si(); ++ } else { ++ /* Accurate subtick timekeeping */ ++ rq->account_pc += account_pc; ++ if (p == idle) ++ pc_idle_time(rq, account_pc); ++ else ++ pc_user_time(rq, p, account_pc, account_ns); ++ } ++ ++ts_account: ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (rq->rq_policy != SCHED_FIFO && p != idle) { ++ s64 time_diff = rq->clock - rq->rq_last_ran; ++ ++ niffy_diff(&time_diff, 1); ++ rq->rq_time_slice -= NS_TO_US(time_diff); ++ } ++ rq->rq_last_ran = rq->timekeep_clock = rq->clock; ++} ++ ++/* ++ * Return any ns on the sched_clock that have not yet been accounted in ++ * @p in case that task is currently running. ++ * ++ * Called with task_grq_lock() held. ++ */ ++static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) ++{ ++ u64 ns = 0; ++ ++ if (p == rq->curr) { ++ update_clocks(rq); ++ ns = rq->clock_task - rq->rq_last_ran; ++ if (unlikely((s64)ns < 0)) ++ ns = 0; ++ } ++ ++ return ns; ++} ++ ++unsigned long long task_delta_exec(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ u64 ns; ++ ++ rq = task_grq_lock(p, &flags); ++ ns = do_task_delta_exec(p, rq); ++ task_grq_unlock(&flags); ++ ++ return ns; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * In case the task is currently running, return the runtime plus current's ++ * pending runtime that have not been accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ u64 ns; ++ ++ rq = task_grq_lock(p, &flags); ++ ns = p->sched_time + do_task_delta_exec(p, rq); ++ task_grq_unlock(&flags); ++ ++ return ns; ++} ++ ++/* Compatibility crap for removal */ ++void account_user_time(struct task_struct *p, cputime_t cputime, ++ cputime_t cputime_scaled) ++{ ++} ++ ++void account_idle_time(cputime_t cputime) ++{ ++} ++ ++/* ++ * Account guest cpu time to a process. ++ * @p: the process that the cpu time gets accounted to ++ * @cputime: the cpu time spent in virtual machine since the last update ++ * @cputime_scaled: cputime scaled by cpu frequency ++ */ ++static void account_guest_time(struct task_struct *p, cputime_t cputime, ++ cputime_t cputime_scaled) ++{ ++ cputime64_t tmp; ++ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; ++ ++ tmp = cputime_to_cputime64(cputime); ++ ++ /* Add guest time to process. 
*/ ++ p->utime = cputime_add(p->utime, cputime); ++ p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); ++ account_group_user_time(p, cputime); ++ p->gtime = cputime_add(p->gtime, cputime); ++ ++ /* Add guest time to cpustat. */ ++ if (TASK_NICE(p) > 0) { ++ cpustat->nice = cputime64_add(cpustat->nice, tmp); ++ cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); ++ } else { ++ cpustat->user = cputime64_add(cpustat->user, tmp); ++ cpustat->guest = cputime64_add(cpustat->guest, tmp); ++ } ++} ++ ++/* ++ * Account system cpu time to a process and desired cpustat field ++ * @p: the process that the cpu time gets accounted to ++ * @cputime: the cpu time spent in kernel space since the last update ++ * @cputime_scaled: cputime scaled by cpu frequency ++ * @target_cputime64: pointer to cpustat field that has to be updated ++ */ ++static inline ++void __account_system_time(struct task_struct *p, cputime_t cputime, ++ cputime_t cputime_scaled, cputime64_t *target_cputime64) ++{ ++ cputime64_t tmp = cputime_to_cputime64(cputime); ++ ++ /* Add system time to process. */ ++ p->stime = cputime_add(p->stime, cputime); ++ p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); ++ account_group_system_time(p, cputime); ++ ++ /* Add system time to cpustat. */ ++ *target_cputime64 = cputime64_add(*target_cputime64, tmp); ++ ++ /* Account for system time used */ ++ acct_update_integrals(p); ++} ++ ++/* ++ * Account system cpu time to a process. ++ * @p: the process that the cpu time gets accounted to ++ * @hardirq_offset: the offset to subtract from hardirq_count() ++ * @cputime: the cpu time spent in kernel space since the last update ++ * @cputime_scaled: cputime scaled by cpu frequency ++ * This is for guest only now. ++ */ ++void account_system_time(struct task_struct *p, int hardirq_offset, ++ cputime_t cputime, cputime_t cputime_scaled) ++{ ++ ++ if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) ++ account_guest_time(p, cputime, cputime_scaled); ++} ++ ++/* ++ * Account for involuntary wait time. ++ * @steal: the cpu time spent in involuntary wait ++ */ ++void account_steal_time(cputime_t cputime) ++{ ++ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; ++ cputime64_t cputime64 = cputime_to_cputime64(cputime); ++ ++ cpustat->steal = cputime64_add(cpustat->steal, cputime64); ++} ++ ++/* ++ * Account for idle time. ++ * @cputime: the cpu time spent in idle wait ++ */ ++static void account_idle_times(cputime_t cputime) ++{ ++ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; ++ cputime64_t cputime64 = cputime_to_cputime64(cputime); ++ struct rq *rq = this_rq(); ++ ++ if (atomic_read(&rq->nr_iowait) > 0) ++ cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); ++ else ++ cpustat->idle = cputime64_add(cpustat->idle, cputime64); ++} ++ ++#ifndef CONFIG_VIRT_CPU_ACCOUNTING ++ ++void account_process_tick(struct task_struct *p, int user_tick) ++{ ++} ++ ++/* ++ * Account multiple ticks of steal time. ++ * @p: the process from which the cpu time has been stolen ++ * @ticks: number of stolen ticks ++ */ ++void account_steal_ticks(unsigned long ticks) ++{ ++ account_steal_time(jiffies_to_cputime(ticks)); ++} ++ ++/* ++ * Account multiple ticks of idle time. 
++ * @ticks: number of stolen ticks ++ */ ++void account_idle_ticks(unsigned long ticks) ++{ ++ account_idle_times(jiffies_to_cputime(ticks)); ++} ++#endif ++ ++static inline void grq_iso_lock(void) ++ __acquires(grq.iso_lock) ++{ ++ raw_spin_lock(&grq.iso_lock); ++} ++ ++static inline void grq_iso_unlock(void) ++ __releases(grq.iso_lock) ++{ ++ raw_spin_unlock(&grq.iso_lock); ++} ++ ++/* ++ * Functions to test for when SCHED_ISO tasks have used their allocated ++ * quota as real time scheduling and convert them back to SCHED_NORMAL. ++ * Where possible, the data is tested lockless, to avoid grabbing iso_lock ++ * because the occasional inaccurate result won't matter. However the ++ * tick data is only ever modified under lock. iso_refractory is only simply ++ * set to 0 or 1 so it's not worth grabbing the lock yet again for that. ++ */ ++static void set_iso_refractory(void) ++{ ++ grq.iso_refractory = 1; ++} ++ ++static void clear_iso_refractory(void) ++{ ++ grq.iso_refractory = 0; ++} ++ ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a ++ * slow division. ++ */ ++static unsigned int test_ret_isorefractory(struct rq *rq) ++{ ++ if (likely(!grq.iso_refractory)) { ++ if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu) ++ set_iso_refractory(); ++ } else { ++ if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) ++ clear_iso_refractory(); ++ } ++ return grq.iso_refractory; ++} ++ ++static void iso_tick(void) ++{ ++ grq_iso_lock(); ++ grq.iso_ticks += 100; ++ grq_iso_unlock(); ++} ++ ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(void) ++{ ++ if (grq.iso_ticks) { ++ grq_iso_lock(); ++ grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1; ++ if (unlikely(grq.iso_refractory && grq.iso_ticks < ++ ISO_PERIOD * (sched_iso_cpu * 115 / 128))) ++ clear_iso_refractory(); ++ grq_iso_unlock(); ++ } ++} ++ ++static bool rq_running_iso(struct rq *rq) ++{ ++ return rq->rq_prio == ISO_PRIO; ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq) ++{ ++ struct task_struct *p; ++ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) { ++ if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128) ++ iso_tick(); ++ } else ++ no_iso_tick(); ++ ++ if (iso_queue(rq)) { ++ if (unlikely(test_ret_isorefractory(rq))) { ++ if (rq_running_iso(rq)) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Force it to reschedule as ++ * SCHED_NORMAL by zeroing its time_slice ++ */ ++ rq->rq_time_slice = 0; ++ } ++ } ++ } ++ ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (rq->rq_policy == SCHED_FIFO) ++ return; ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. 
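++ *
++ * (Editorial note, not part of the original patch: assuming the usual
++ * meaning of the names, a jiffy is 1/HZ seconds, so HALF_JIFFY_US is
++ * roughly 500 us at HZ=1000 and 5000 us at HZ=100; the actual values of
++ * HALF_JIFFY_US and RESCHED_US are defined earlier in this patch and are
++ * only assumed here for illustration.)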
++ */ ++ if (rq->dither) { ++ if (rq->rq_time_slice > HALF_JIFFY_US) ++ return; ++ else ++ rq->rq_time_slice = 0; ++ } else if (rq->rq_time_slice >= RESCHED_US) ++ return; ++ ++ /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */ ++ p = rq->curr; ++ requeue_task(p); ++ grq_lock(); ++ set_tsk_need_resched(p); ++ grq_unlock(); ++} ++ ++void wake_up_idle_cpu(int cpu); ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. The data modified is all ++ * local to struct rq so we don't need to grab grq lock. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ /* grq lock not grabbed, so only update rq clock */ ++ update_rq_clock(rq); ++ update_cpu_clock(rq, rq->curr, 1); ++ if (!rq_idle(rq)) ++ task_running_tick(rq); ++ else ++ no_iso_tick(); ++ rq->last_tick = rq->clock; ++ perf_event_task_tick(); ++} ++ ++notrace unsigned long get_parent_ip(unsigned long addr) ++{ ++ if (in_lock_functions(addr)) { ++ addr = CALLER_ADDR2; ++ if (in_lock_functions(addr)) ++ addr = CALLER_ADDR3; ++ } ++ return addr; ++} ++ ++#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++void __kprobes add_preempt_count(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ preempt_count() += val; ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ if (preempt_count() == val) ++ trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); ++} ++EXPORT_SYMBOL(add_preempt_count); ++ ++void __kprobes sub_preempt_count(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); ++ preempt_count() -= val; ++} ++EXPORT_SYMBOL(sub_preempt_count); ++#endif ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes cpu fairly amongst tasks of the ++ * same nice value, it proportions cpu according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 prio_deadline_diff(int user_prio) ++{ ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); ++} ++ ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ return prio_deadline_diff(TASK_USER_PRIO(p)); ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return prio_deadline_diff(USER_PRIO(static_prio)); ++} ++ ++static inline int longest_deadline_diff(void) ++{ ++ return prio_deadline_diff(39); ++} ++ ++static inline int ms_longest_deadline_diff(void) ++{ ++ return NS_TO_MS(longest_deadline_diff()); ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline. 
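++ *
++ * (Editorial note, not part of the original patch: the deadline assigned
++ * below is grq.niffies plus task_deadline_diff(p), which per
++ * prio_deadline_diff() above works out to rr_interval milliseconds,
++ * expressed in nanoseconds, scaled by prio_ratios[TASK_USER_PRIO(p)]/128.
++ * If the base entry of prio_ratios[] is 128, as assumed here, the most
++ * favourable nice level gets an offset of exactly rr_interval ms.)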
++ */ ++static void time_slice_expired(struct task_struct *p) ++{ ++ p->time_slice = timeslice(); ++ p->deadline = grq.niffies + task_deadline_diff(p); ++} ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p) ++{ ++ if (p->time_slice < RESCHED_US || batch_task(p)) ++ time_slice_expired(p); ++} ++ ++/* ++ * O(n) lookup of all tasks in the global runqueue. The real brainfuck ++ * of lock contention and O(n). It's not really O(n) as only the queued, ++ * but not running tasks are scanned, and is O(n) queued in the worst case ++ * scenario only because the right task can be found before scanning all of ++ * them. ++ * Tasks are selected in this order: ++ * Real time tasks are selected purely by their static priority and in the ++ * order they were queued, so the lowest value idx, and the first queued task ++ * of that priority value is chosen. ++ * If no real time tasks are found, the SCHED_ISO priority is checked, and ++ * all SCHED_ISO tasks have the same priority value, so they're selected by ++ * the earliest deadline value. ++ * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the ++ * earliest deadline. ++ * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are ++ * selected by the earliest deadline. ++ */ ++static inline struct ++task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ u64 dl, uninitialized_var(earliest_deadline); ++ struct task_struct *p, *edt = idle; ++ struct list_head *queue; ++ int idx = 0; ++ ++retry: ++ idx = find_next_bit(grq.prio_bitmap, PRIO_LIMIT, idx); ++ if (idx >= PRIO_LIMIT) ++ goto out; ++ queue = grq.queue + idx; ++ ++ if (idx < MAX_RT_PRIO) { ++ /* We found an rt task */ ++ list_for_each_entry(p, queue, run_list) { ++ /* Make sure cpu affinity is ok */ ++ if (needs_other_cpu(p, cpu)) ++ continue; ++ edt = p; ++ goto out_take; ++ } ++ /* None of the RT tasks at this priority can run on this cpu */ ++ ++idx; ++ goto retry; ++ } ++ ++ list_for_each_entry(p, queue, run_list) { ++ /* Make sure cpu affinity is ok */ ++ if (needs_other_cpu(p, cpu)) ++ continue; ++ ++ /* ++ * Soft affinity happens here by not scheduling a task with ++ * its sticky flag set that ran on a different CPU last when ++ * the CPU is scaling, or by greatly biasing against its ++ * deadline when not. ++ */ ++ if (task_rq(p) != rq && task_sticky(p)) { ++ if (scaling_rq(rq)) ++ continue; ++ else ++ dl = p->deadline + longest_deadline_diff(); ++ } else ++ dl = p->deadline; ++ ++ /* ++ * No rt tasks. Find the earliest deadline task. Now we're in ++ * O(n) territory. This is what we silenced the compiler for ++ * with uninitialized_var(): edt will always start as idle. 
++ */ ++ if (edt == idle || ++ deadline_before(dl, earliest_deadline)) { ++ earliest_deadline = dl; ++ edt = p; ++ } ++ } ++ if (edt == idle) { ++ if (++idx < PRIO_LIMIT) ++ goto retry; ++ goto out; ++ } ++out_take: ++ take_task(cpu, edt); ++out: ++ return edt; ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ struct pt_regs *regs = get_irq_regs(); ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ ++ if (regs) ++ show_regs(regs); ++ else ++ dump_stack(); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev) ++{ ++ /* ++ * Test if we are atomic. Since do_exit() needs to call into ++ * schedule() atomically, we ignore that path for now. ++ * Otherwise, whine if we are scheduling when we should not be. ++ */ ++ if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) ++ __schedule_bug(prev); ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq(), sched_count); ++} ++ ++/* ++ * The currently running task's information is all stored in rq local data ++ * which is only modified by the local CPU, thereby allowing the data to be ++ * changed without grabbing the grq lock. ++ */ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ rq->rq_time_slice = p->time_slice; ++ rq->rq_deadline = p->deadline; ++ rq->rq_last_ran = p->last_ran = rq->clock; ++ rq->rq_policy = p->policy; ++ rq->rq_prio = p->prio; ++ if (p != rq->idle) ++ rq->rq_running = true; ++ else ++ rq->rq_running = false; ++} ++ ++static void reset_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ rq->rq_policy = p->policy; ++ rq->rq_prio = p->prio; ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ */ ++asmlinkage void __sched schedule(void) ++{ ++ struct task_struct *prev, *next, *idle; ++ unsigned long *switch_count; ++ int deactivate, cpu; ++ struct rq *rq; ++ ++need_resched: ++ preempt_disable(); ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ idle = rq->idle; ++ rcu_note_context_switch(cpu); ++ prev = rq->curr; ++ ++ deactivate = 0; ++ schedule_debug(prev); ++ ++ grq_lock_irq(); ++ ++ switch_count = &prev->nivcsw; ++ if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { ++ if (unlikely(signal_pending_state(prev->state, prev))) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate = 1; ++ /* ++ * If a worker is going to sleep, notify and ++ * ask workqueue whether it wants to wake up a ++ * task to maintain concurrency. If so, wake ++ * up the task. ++ */ ++ if (prev->flags & PF_WQ_WORKER) { ++ struct task_struct *to_wakeup; ++ ++ to_wakeup = wq_worker_sleeping(prev, cpu); ++ if (to_wakeup) { ++ /* This shouldn't happen, but does */ ++ if (unlikely(to_wakeup == prev)) ++ deactivate = 0; ++ else ++ try_to_wake_up_local(to_wakeup); ++ } ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, make ++ * sure to submit it to avoid deadlocks. 
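++ *
++ * (Editorial note, not part of the original patch: in that case the code
++ * below drops the grq lock, re-enables preemption, flushes the plugged IO
++ * with blk_schedule_flush_plug() and restarts schedule() from the top via
++ * the need_resched label.)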
++ */ ++ if (unlikely(deactivate && blk_needs_flush_plug(prev))) { ++ grq_unlock_irq(); ++ preempt_enable_no_resched(); ++ blk_schedule_flush_plug(prev); ++ goto need_resched; ++ } ++ ++ update_clocks(rq); ++ update_cpu_clock(rq, prev, 0); ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = false; ++ else ++ rq->dither = true; ++ ++ clear_tsk_need_resched(prev); ++ ++ if (prev != idle) { ++ /* Update all the information stored on struct rq */ ++ prev->time_slice = rq->rq_time_slice; ++ prev->deadline = rq->rq_deadline; ++ check_deadline(prev); ++ prev->last_ran = rq->clock; ++ ++ /* Task changed affinity off this CPU */ ++ if (needs_other_cpu(prev, cpu)) ++ resched_suitable_idle(prev); ++ else if (!deactivate) { ++ if (!queued_notrunning()) { ++ /* ++ * We now know prev is the only thing that is ++ * awaiting CPU so we can bypass rechecking for ++ * the earliest deadline task and just run it ++ * again. ++ */ ++ grq_unlock_irq(); ++ goto rerun_prev_unlocked; ++ } else ++ swap_sticky(rq, cpu, prev); ++ } ++ return_task(prev, deactivate); ++ } ++ ++ if (unlikely(!queued_notrunning())) { ++ /* ++ * This CPU is now truly idle as opposed to when idle is ++ * scheduled as a high priority task in its own right. ++ */ ++ next = idle; ++ schedstat_inc(rq, sched_goidle); ++ set_cpuidle_map(cpu); ++ } else { ++ next = earliest_deadline_task(rq, cpu, idle); ++ if (likely(next->prio != PRIO_LIMIT)) ++ clear_cpuidle_map(cpu); ++ else ++ set_cpuidle_map(cpu); ++ } ++ ++ if (likely(prev != next)) { ++ /* ++ * Don't stick tasks when a real time task is going to run as ++ * they may literally get stuck. ++ */ ++ if (rt_task(next)) ++ unstick_task(rq, prev); ++ set_rq_task(rq, next); ++ grq.nr_switches++; ++ prev->on_cpu = false; ++ next->on_cpu = true; ++ rq->curr = next; ++ ++*switch_count; ++ ++ context_switch(rq, prev, next); /* unlocks the grq */ ++ /* ++ * The context switch have flipped the stack from under us ++ * and restored the local variables which were saved when ++ * this task called schedule() in the past. prev == current ++ * is still correct, but it can be moved to another cpu/rq. ++ */ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ idle = rq->idle; ++ } else ++ grq_unlock_irq(); ++ ++rerun_prev_unlocked: ++ preempt_enable_no_resched(); ++ if (need_resched()) ++ goto need_resched; ++} ++EXPORT_SYMBOL(schedule); ++ ++#ifdef CONFIG_MUTEX_SPIN_ON_OWNER ++ ++static inline bool owner_running(struct mutex *lock, struct task_struct *owner) ++{ ++ if (lock->owner != owner) ++ return false; ++ ++ /* ++ * Ensure we emit the owner->on_cpu, dereference _after_ checking ++ * lock->owner still matches owner, if that fails, owner might ++ * point to free()d memory, if it still matches, the rcu_read_lock() ++ * ensures the memory stays valid. ++ */ ++ barrier(); ++ ++ return owner->on_cpu; ++} ++ ++/* ++ * Look out! "owner" is an entirely speculative pointer ++ * access and not reliable. ++ */ ++int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) ++{ ++ rcu_read_lock(); ++ while (owner_running(lock, owner)) { ++ if (need_resched()) ++ break; ++ ++ arch_mutex_cpu_relax(); ++ } ++ rcu_read_unlock(); ++ ++ /* ++ * We break out the loop above on need_resched() and when the ++ * owner changed, which is a sign for heavy contention. Return ++ * success only when lock->owner is NULL. ++ */ ++ return lock->owner == NULL; ++} ++#endif ++ ++#ifdef CONFIG_PREEMPT ++/* ++ * this is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. 
Kernel preemptions off return from interrupt ++ * occur there and call schedule directly. ++ */ ++asmlinkage void __sched notrace preempt_schedule(void) ++{ ++ struct thread_info *ti = current_thread_info(); ++ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(ti->preempt_count || irqs_disabled())) ++ return; ++ ++ do { ++ add_preempt_count_notrace(PREEMPT_ACTIVE); ++ schedule(); ++ sub_preempt_count_notrace(PREEMPT_ACTIVE); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ barrier(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL(preempt_schedule); ++ ++/* ++ * this is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage void __sched preempt_schedule_irq(void) ++{ ++ struct thread_info *ti = current_thread_info(); ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(ti->preempt_count || !irqs_disabled()); ++ ++ do { ++ add_preempt_count(PREEMPT_ACTIVE); ++ local_irq_enable(); ++ schedule(); ++ local_irq_disable(); ++ sub_preempt_count(PREEMPT_ACTIVE); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ barrier(); ++ } while (need_resched()); ++} ++ ++#endif /* CONFIG_PREEMPT */ ++ ++int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++/* ++ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just ++ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve ++ * number) then we wake all the non-exclusive tasks and one exclusive task. ++ * ++ * There are circumstances in which we can try to wake a task which has already ++ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns ++ * zero in this (rare) case, and we handle it by continuing to scan the queue. ++ */ ++static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, ++ int nr_exclusive, int wake_flags, void *key) ++{ ++ struct list_head *tmp, *next; ++ ++ list_for_each_safe(tmp, next, &q->task_list) { ++ wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); ++ unsigned int flags = curr->flags; ++ ++ if (curr->func(curr, mode, wake_flags, key) && ++ (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) ++ break; ++ } ++} ++ ++/** ++ * __wake_up - wake up threads blocked on a waitqueue. ++ * @q: the waitqueue ++ * @mode: which threads ++ * @nr_exclusive: how many wake-one or wake-many threads to wake up ++ * @key: is directly passed to the wakeup function ++ * ++ * It may be assumed that this function implies a write memory barrier before ++ * changing the task state if and only if any tasks are woken up. ++ */ ++void __wake_up(wait_queue_head_t *q, unsigned int mode, ++ int nr_exclusive, void *key) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&q->lock, flags); ++ __wake_up_common(q, mode, nr_exclusive, 0, key); ++ spin_unlock_irqrestore(&q->lock, flags); ++} ++EXPORT_SYMBOL(__wake_up); ++ ++/* ++ * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 
++ */ ++void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) ++{ ++ __wake_up_common(q, mode, 1, 0, NULL); ++} ++EXPORT_SYMBOL_GPL(__wake_up_locked); ++ ++void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) ++{ ++ __wake_up_common(q, mode, 1, 0, key); ++} ++EXPORT_SYMBOL_GPL(__wake_up_locked_key); ++ ++/** ++ * __wake_up_sync_key - wake up threads blocked on a waitqueue. ++ * @q: the waitqueue ++ * @mode: which threads ++ * @nr_exclusive: how many wake-one or wake-many threads to wake up ++ * @key: opaque value to be passed to wakeup targets ++ * ++ * The sync wakeup differs that the waker knows that it will schedule ++ * away soon, so while the target thread will be woken up, it will not ++ * be migrated to another CPU - ie. the two threads are 'synchronised' ++ * with each other. This can prevent needless bouncing between CPUs. ++ * ++ * On UP it can prevent extra preemption. ++ * ++ * It may be assumed that this function implies a write memory barrier before ++ * changing the task state if and only if any tasks are woken up. ++ */ ++void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, ++ int nr_exclusive, void *key) ++{ ++ unsigned long flags; ++ int wake_flags = WF_SYNC; ++ ++ if (unlikely(!q)) ++ return; ++ ++ if (unlikely(!nr_exclusive)) ++ wake_flags = 0; ++ ++ spin_lock_irqsave(&q->lock, flags); ++ __wake_up_common(q, mode, nr_exclusive, wake_flags, key); ++ spin_unlock_irqrestore(&q->lock, flags); ++} ++EXPORT_SYMBOL_GPL(__wake_up_sync_key); ++ ++/** ++ * __wake_up_sync - wake up threads blocked on a waitqueue. ++ * @q: the waitqueue ++ * @mode: which threads ++ * @nr_exclusive: how many wake-one or wake-many threads to wake up ++ * ++ * The sync wakeup differs that the waker knows that it will schedule ++ * away soon, so while the target thread will be woken up, it will not ++ * be migrated to another CPU - ie. the two threads are 'synchronised' ++ * with each other. This can prevent needless bouncing between CPUs. ++ * ++ * On UP it can prevent extra preemption. ++ */ ++void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) ++{ ++ unsigned long flags; ++ int sync = 1; ++ ++ if (unlikely(!q)) ++ return; ++ ++ if (unlikely(!nr_exclusive)) ++ sync = 0; ++ ++ spin_lock_irqsave(&q->lock, flags); ++ __wake_up_common(q, mode, nr_exclusive, sync, NULL); ++ spin_unlock_irqrestore(&q->lock, flags); ++} ++EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ ++ ++/** ++ * complete: - signals a single thread waiting on this completion ++ * @x: holds the state of this particular completion ++ * ++ * This will wake up a single thread waiting on this completion. Threads will be ++ * awakened in the same order in which they were queued. ++ * ++ * See also complete_all(), wait_for_completion() and related routines. ++ * ++ * It may be assumed that this function implies a write memory barrier before ++ * changing the task state if and only if any tasks are woken up. ++ */ ++void complete(struct completion *x) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&x->wait.lock, flags); ++ x->done++; ++ __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); ++ spin_unlock_irqrestore(&x->wait.lock, flags); ++} ++EXPORT_SYMBOL(complete); ++ ++/** ++ * complete_all: - signals all threads waiting on this completion ++ * @x: holds the state of this particular completion ++ * ++ * This will wake up all threads waiting on this particular completion event. 
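++ *
++ * (Editorial note, not part of the original patch: the implementation
++ * below adds UINT_MAX/2 to x->done and wakes with nr_exclusive == 0, so
++ * every waiter currently queued is woken and later wait_for_completion()
++ * callers also see the completion as already done.)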
++ * ++ * It may be assumed that this function implies a write memory barrier before ++ * changing the task state if and only if any tasks are woken up. ++ */ ++void complete_all(struct completion *x) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&x->wait.lock, flags); ++ x->done += UINT_MAX/2; ++ __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); ++ spin_unlock_irqrestore(&x->wait.lock, flags); ++} ++EXPORT_SYMBOL(complete_all); ++ ++static inline long __sched ++do_wait_for_common(struct completion *x, long timeout, int state) ++{ ++ if (!x->done) { ++ DECLARE_WAITQUEUE(wait, current); ++ ++ __add_wait_queue_tail_exclusive(&x->wait, &wait); ++ do { ++ if (signal_pending_state(state, current)) { ++ timeout = -ERESTARTSYS; ++ break; ++ } ++ __set_current_state(state); ++ spin_unlock_irq(&x->wait.lock); ++ timeout = schedule_timeout(timeout); ++ spin_lock_irq(&x->wait.lock); ++ } while (!x->done && timeout); ++ __remove_wait_queue(&x->wait, &wait); ++ if (!x->done) ++ return timeout; ++ } ++ x->done--; ++ return timeout ?: 1; ++} ++ ++static long __sched ++wait_for_common(struct completion *x, long timeout, int state) ++{ ++ might_sleep(); ++ ++ spin_lock_irq(&x->wait.lock); ++ timeout = do_wait_for_common(x, timeout, state); ++ spin_unlock_irq(&x->wait.lock); ++ return timeout; ++} ++ ++/** ++ * wait_for_completion: - waits for completion of a task ++ * @x: holds the state of this particular completion ++ * ++ * This waits to be signaled for completion of a specific task. It is NOT ++ * interruptible and there is no timeout. ++ * ++ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout ++ * and interrupt capability. Also see complete(). ++ */ ++void __sched wait_for_completion(struct completion *x) ++{ ++ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); ++} ++EXPORT_SYMBOL(wait_for_completion); ++ ++/** ++ * wait_for_completion_timeout: - waits for completion of a task (w/timeout) ++ * @x: holds the state of this particular completion ++ * @timeout: timeout value in jiffies ++ * ++ * This waits for either a completion of a specific task to be signaled or for a ++ * specified timeout to expire. The timeout is in jiffies. It is not ++ * interruptible. ++ * ++ * The return value is 0 if timed out, and positive (at least 1, or number of ++ * jiffies left till timeout) if completed. ++ */ ++unsigned long __sched ++wait_for_completion_timeout(struct completion *x, unsigned long timeout) ++{ ++ return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); ++} ++EXPORT_SYMBOL(wait_for_completion_timeout); ++ ++/** ++ * wait_for_completion_interruptible: - waits for completion of a task (w/intr) ++ * @x: holds the state of this particular completion ++ * ++ * This waits for completion of a specific task to be signaled. It is ++ * interruptible. ++ * ++ * The return value is -ERESTARTSYS if interrupted, 0 if completed. ++ */ ++int __sched wait_for_completion_interruptible(struct completion *x) ++{ ++ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); ++ if (t == -ERESTARTSYS) ++ return t; ++ return 0; ++} ++EXPORT_SYMBOL(wait_for_completion_interruptible); ++ ++/** ++ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) ++ * @x: holds the state of this particular completion ++ * @timeout: timeout value in jiffies ++ * ++ * This waits for either a completion of a specific task to be signaled or for a ++ * specified timeout to expire. It is interruptible. The timeout is in jiffies. 
++ * ++ * The return value is -ERESTARTSYS if interrupted, 0 if timed out, ++ * positive (at least 1, or number of jiffies left till timeout) if completed. ++ */ ++long __sched ++wait_for_completion_interruptible_timeout(struct completion *x, ++ unsigned long timeout) ++{ ++ return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); ++} ++EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); ++ ++/** ++ * wait_for_completion_killable: - waits for completion of a task (killable) ++ * @x: holds the state of this particular completion ++ * ++ * This waits to be signaled for completion of a specific task. It can be ++ * interrupted by a kill signal. ++ * ++ * The return value is -ERESTARTSYS if interrupted, 0 if timed out, ++ * positive (at least 1, or number of jiffies left till timeout) if completed. ++ */ ++int __sched wait_for_completion_killable(struct completion *x) ++{ ++ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); ++ if (t == -ERESTARTSYS) ++ return t; ++ return 0; ++} ++EXPORT_SYMBOL(wait_for_completion_killable); ++ ++/** ++ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) ++ * @x: holds the state of this particular completion ++ * @timeout: timeout value in jiffies ++ * ++ * This waits for either a completion of a specific task to be ++ * signaled or for a specified timeout to expire. It can be ++ * interrupted by a kill signal. The timeout is in jiffies. ++ */ ++long __sched ++wait_for_completion_killable_timeout(struct completion *x, ++ unsigned long timeout) ++{ ++ return wait_for_common(x, timeout, TASK_KILLABLE); ++} ++EXPORT_SYMBOL(wait_for_completion_killable_timeout); ++ ++/** ++ * try_wait_for_completion - try to decrement a completion without blocking ++ * @x: completion structure ++ * ++ * Returns: 0 if a decrement cannot be done without blocking ++ * 1 if a decrement succeeded. ++ * ++ * If a completion is being used as a counting completion, ++ * attempt to decrement the counter without blocking. This ++ * enables us to avoid waiting if the resource the completion ++ * is protecting is not available. ++ */ ++bool try_wait_for_completion(struct completion *x) ++{ ++ unsigned long flags; ++ int ret = 1; ++ ++ spin_lock_irqsave(&x->wait.lock, flags); ++ if (!x->done) ++ ret = 0; ++ else ++ x->done--; ++ spin_unlock_irqrestore(&x->wait.lock, flags); ++ return ret; ++} ++EXPORT_SYMBOL(try_wait_for_completion); ++ ++/** ++ * completion_done - Test to see if a completion has any waiters ++ * @x: completion structure ++ * ++ * Returns: 0 if there are waiters (wait_for_completion() in progress) ++ * 1 if there are no waiters. 
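++ *
++ * (Editorial note, not part of the original patch: unlike
++ * try_wait_for_completion() above, this helper only inspects x->done
++ * under the wait-queue lock and never decrements it.)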
++ * ++ */ ++bool completion_done(struct completion *x) ++{ ++ unsigned long flags; ++ int ret = 1; ++ ++ spin_lock_irqsave(&x->wait.lock, flags); ++ if (!x->done) ++ ret = 0; ++ spin_unlock_irqrestore(&x->wait.lock, flags); ++ return ret; ++} ++EXPORT_SYMBOL(completion_done); ++ ++static long __sched ++sleep_on_common(wait_queue_head_t *q, int state, long timeout) ++{ ++ unsigned long flags; ++ wait_queue_t wait; ++ ++ init_waitqueue_entry(&wait, current); ++ ++ __set_current_state(state); ++ ++ spin_lock_irqsave(&q->lock, flags); ++ __add_wait_queue(q, &wait); ++ spin_unlock(&q->lock); ++ timeout = schedule_timeout(timeout); ++ spin_lock_irq(&q->lock); ++ __remove_wait_queue(q, &wait); ++ spin_unlock_irqrestore(&q->lock, flags); ++ ++ return timeout; ++} ++ ++void __sched interruptible_sleep_on(wait_queue_head_t *q) ++{ ++ sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); ++} ++EXPORT_SYMBOL(interruptible_sleep_on); ++ ++long __sched ++interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) ++{ ++ return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); ++} ++EXPORT_SYMBOL(interruptible_sleep_on_timeout); ++ ++void __sched sleep_on(wait_queue_head_t *q) ++{ ++ sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); ++} ++EXPORT_SYMBOL(sleep_on); ++ ++long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) ++{ ++ return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); ++} ++EXPORT_SYMBOL(sleep_on_timeout); ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task ++ * @prio: prio value (kernel-internal form) ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance logic. ++ */ ++void rt_mutex_setprio(struct task_struct *p, int prio) ++{ ++ unsigned long flags; ++ int queued, oldprio; ++ struct rq *rq; ++ ++ BUG_ON(prio < 0 || prio > MAX_PRIO); ++ ++ rq = task_grq_lock(p, &flags); ++ ++ trace_sched_pi_setprio(p, prio); ++ oldprio = p->prio; ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p); ++ p->prio = prio; ++ if (task_running(p) && prio > oldprio) ++ resched_task(p); ++ if (queued) { ++ enqueue_task(p); ++ try_preempt(p, rq); ++ } ++ ++ task_grq_unlock(&flags); ++} ++ ++#endif ++ ++/* ++ * Adjust the deadline for when the priority is to change, before it's ++ * changed. ++ */ ++static inline void adjust_deadline(struct task_struct *p, int new_prio) ++{ ++ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); ++} ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int queued, new_static, old_static; ++ unsigned long flags; ++ struct rq *rq; ++ ++ if (TASK_NICE(p) == nice || nice < -20 || nice > 19) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. 
++ */ ++ rq = time_task_grq_lock(p, &flags); ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (has_rt_policy(p)) { ++ p->static_prio = new_static; ++ goto out_unlock; ++ } ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p); ++ ++ adjust_deadline(p, new_static); ++ old_static = p->static_prio; ++ p->static_prio = new_static; ++ p->prio = effective_prio(p); ++ ++ if (queued) { ++ enqueue_task(p); ++ if (new_static < old_static) ++ try_preempt(p, rq); ++ } else if (task_running(p)) { ++ reset_rq_task(rq, p); ++ if (old_static < new_static) ++ resched_task(p); ++ } ++out_unlock: ++ task_grq_unlock(&flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = 20 - nice; ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. ++ */ ++ if (increment < -40) ++ increment = -40; ++ if (increment > 40) ++ increment = 40; ++ ++ nice = TASK_NICE(current) + increment; ++ if (nice < -20) ++ nice = -20; ++ if (nice > 19) ++ nice = 19; ++ ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * This is the priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int delta, prio = p->prio - MAX_RT_PRIO; ++ ++ /* rt tasks and iso tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - grq.niffies); ++ delta = delta * 40 / ms_longest_deadline_diff(); ++ if (delta > 0 && delta <= 80) ++ prio += delta; ++ if (idleprio_task(p)) ++ prio += 40; ++out: ++ return prio; ++} ++ ++/** ++ * task_nice - return the nice value of a given task. ++ * @p: the task in question. ++ */ ++int task_nice(const struct task_struct *p) ++{ ++ return TASK_NICE(p); ++} ++EXPORT_SYMBOL_GPL(task_nice); ++ ++/** ++ * idle_cpu - is a given cpu idle currently? ++ * @cpu: the processor in question. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * idle_task - return the idle task for a given cpu. ++ * @cpu: the processor in question. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. 
++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* Actually do priority change: must hold grq lock. */ ++static void ++__setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio) ++{ ++ int oldrtprio, oldprio; ++ ++ p->policy = policy; ++ oldrtprio = p->rt_priority; ++ p->rt_priority = prio; ++ p->normal_prio = normal_prio(p); ++ oldprio = p->prio; ++ /* we are holding p->pi_lock already */ ++ p->prio = rt_mutex_getprio(p); ++ if (task_running(p)) { ++ reset_rq_task(rq, p); ++ /* Resched only if we might now be preempted */ ++ if (p->prio > oldprio || p->rt_priority > oldrtprio) ++ resched_task(p); ++ } ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ if (cred->user->user_ns == pcred->user->user_ns) ++ match = (cred->euid == pcred->euid || ++ cred->euid == pcred->uid); ++ else ++ match = false; ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool user) ++{ ++ struct sched_param zero_param = { .sched_priority = 0 }; ++ int queued, retval, oldpolicy = -1; ++ unsigned long flags, rlim_rtprio = 0; ++ int reset_on_fork; ++ struct rq *rq; ++ ++ /* may grab non-irq protected spin_locks */ ++ BUG_ON(in_interrupt()); ++ ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ param = &zero_param; ++ } ++recheck: ++ /* double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); ++ policy &= ~SCHED_RESET_ON_FORK; ++ ++ if (!SCHED_RANGE(policy)) ++ return -EINVAL; ++ } ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH is 0. 
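++ *
++ * (Editorial note, not part of the original patch: with the usual mainline
++ * value of MAX_USER_RT_PRIO == 100, assumed here, this means a task with a
++ * user mm may request rt priorities 1..99 for SCHED_FIFO/SCHED_RR.)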
++ */ ++ if (param->sched_priority < 0 || ++ (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && param->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if (is_rt_policy(policy) != (param->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (is_rt_policy(policy)) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* can't increase priority */ ++ if (param->sched_priority > p->rt_priority && ++ param->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy == SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } ++ } ++ ++ /* can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ /* ++ * make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * To be able to change p->policy safely, the grunqueue lock must be ++ * held. ++ */ ++ rq = __task_grq_lock(p); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ __task_grq_unlock(); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ return -EINVAL; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || ++ param->sched_priority == p->rt_priority))) { ++ ++ __task_grq_unlock(); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ return 0; ++ } ++ ++ /* recheck policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_grq_unlock(); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ goto recheck; ++ } ++ update_clocks(rq); ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p); ++ __setscheduler(p, rq, policy, param->sched_priority); ++ if (queued) { ++ enqueue_task(p); ++ try_preempt(p, rq); ++ } ++ __task_grq_unlock(); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ rt_mutex_adjust_pi(p); ++out: ++ return 0; ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return __sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. 
++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return __sched_setscheduler(p, policy, param, false); ++} ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setscheduler(p, policy, &lparam); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ */ ++asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, ++ struct sched_param __user *param) ++{ ++ /* negative values for policy are not valid */ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, -1, param); ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? 
-EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) ++ goto out_unlock; ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = set_cpus_allowed_ptr(p, new_mask); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ cpumask_t *new_mask) ++{ ++ if (len < sizeof(cpumask_t)) { ++ memset(new_mask, 0, sizeof(cpumask_t)); ++ } else if (len > sizeof(cpumask_t)) { ++ len = sizeof(cpumask_t); ++ } ++ return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; ++} ++ ++ ++/** ++ * sys_sched_setaffinity - set the cpu affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new cpu mask ++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ unsigned long flags; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ grq_lock_irqsave(&flags); ++ cpumask_and(mask, tsk_cpus_allowed(p), cpu_online_mask); ++ grq_unlock_irqrestore(&flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ put_online_cpus(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the cpu affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current cpu mask ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ size_t retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ */ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ struct task_struct *p; ++ ++ p = current; ++ grq_lock_irq(); ++ schedstat_inc(task_rq(p), yld_count); ++ requeue_task(p); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ __release(grq.lock); ++ spin_release(&grq.lock.dep_map, 1, _THIS_IP_); ++ do_raw_spin_unlock(&grq.lock); ++ preempt_enable_no_resched(); ++ ++ schedule(); ++ ++ return 0; ++} ++ ++static inline bool should_resched(void) ++{ ++ return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); ++} ++ ++static void __cond_resched(void) ++{ ++ /* NOT a real fix but will make voluntary preempt work. 馬鹿ãªäº‹ */ ++ if (unlikely(system_state != SYSTEM_RUNNING)) ++ return; ++ ++ add_preempt_count(PREEMPT_ACTIVE); ++ schedule(); ++ sub_preempt_count(PREEMPT_ACTIVE); ++} ++ ++int __sched _cond_resched(void) ++{ ++ if (should_resched()) { ++ __cond_resched(); ++ return 1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. 
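++ *
++ * (Editorial note, not part of the original patch: as implemented below it
++ * returns 1 whenever the lock was dropped, either because a reschedule was
++ * pending or because spin_needbreak() reported contention, and 0 when
++ * nothing was done.)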
++ * ++ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ __cond_resched(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++int __sched __cond_resched_softirq(void) ++{ ++ BUG_ON(!in_softirq()); ++ ++ if (should_resched()) { ++ local_bh_enable(); ++ __cond_resched(); ++ local_bh_disable(); ++ return 1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(__cond_resched_softirq); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * This is a shortcut for kernel-space yielding - it marks the ++ * thread runnable and calls sys_sched_yield(). ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ sys_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * Returns true if we indeed boosted the target task. ++ */ ++bool __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ unsigned long flags; ++ bool yielded = 0; ++ struct rq *rq; ++ ++ rq = this_rq(); ++ grq_lock_irqsave(&flags); ++ if (task_running(p) || p->state) ++ goto out_unlock; ++ yielded = 1; ++ if (p->deadline > rq->rq_deadline) ++ p->deadline = rq->rq_deadline; ++ p->time_slice += rq->rq_time_slice; ++ rq->rq_time_slice = 0; ++ if (p->time_slice > timeslice()) ++ p->time_slice = timeslice(); ++ set_tsk_need_resched(rq->curr); ++out_unlock: ++ grq_unlock_irqrestore(&flags); ++ ++ if (yielded) ++ schedule(); ++ return yielded; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++void __sched io_schedule(void) ++{ ++ struct rq *rq = raw_rq(); ++ ++ delayacct_blkio_start(); ++ atomic_inc(&rq->nr_iowait); ++ blk_flush_plug(current); ++ current->in_iowait = 1; ++ schedule(); ++ current->in_iowait = 0; ++ atomic_dec(&rq->nr_iowait); ++ delayacct_blkio_end(); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ struct rq *rq = raw_rq(); ++ long ret; ++ ++ delayacct_blkio_start(); ++ atomic_inc(&rq->nr_iowait); ++ blk_flush_plug(current); ++ current->in_iowait = 1; ++ ret = schedule_timeout(timeout); ++ current->in_iowait = 0; ++ atomic_dec(&rq->nr_iowait); ++ delayacct_blkio_end(); ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * this syscall returns the maximum rt_priority that can be used ++ * by a given scheduling class. 
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * this syscall returns the minimum rt_priority that can be used ++ * by a given scheduling class. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * this syscall writes the default timeslice value of a given process ++ * into the user-space timespec buffer. A value of '0' means infinity. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct timespec __user *, interval) ++{ ++ struct task_struct *p; ++ unsigned int time_slice; ++ unsigned long flags; ++ int retval; ++ struct timespec t; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ grq_lock_irqsave(&flags); ++ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); ++ grq_unlock_irqrestore(&flags); ++ ++ rcu_read_unlock(); ++ t = ns_to_timespec(time_slice); ++ retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ unsigned state; ++ ++ state = p->state ? __ffs(p->state) + 1 : 0; ++ printk(KERN_INFO "%-15.15s %c", p->comm, ++ state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); ++#if BITS_PER_LONG == 32 ++ if (state == TASK_RUNNING) ++ printk(KERN_CONT " running "); ++ else ++ printk(KERN_CONT " %08lx ", thread_saved_pc(p)); ++#else ++ if (state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++ else ++ printk(KERN_CONT " %016lx ", thread_saved_pc(p)); ++#endif ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), task_pid_nr(p->real_parent), ++ (unsigned long)task_thread_info(p)->flags); ++ ++ show_stack(p, NULL); ++} ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ do_each_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ */ ++ touch_nmi_watchdog(); ++ if (!state_filter || (p->state & state_filter)) ++ sched_show_task(p); ++ } while_each_thread(g, p); ++ ++ touch_all_softlockup_watchdogs(); ++ ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++#ifdef CONFIG_SMP ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(tsk_cpus_allowed(p), new_mask); ++} ++#endif ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ time_grq_lock(rq, &flags); ++ idle->last_ran = rq->clock; ++ idle->state = TASK_RUNNING; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ set_rq_task(rq, idle); ++ do_set_cpus_allowed(idle, &cpumask_of_cpu(cpu)); ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ rq->curr = rq->idle = idle; ++ idle->on_cpu = 1; ++ grq_unlock_irqrestore(&flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ task_thread_info(idle)->preempt_count = 0; ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++#if defined(CONFIG_SMP) ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) ++/** ++ * lowest_flag_domain - Return lowest sched_domain containing flag. ++ * @cpu: The cpu whose lowest level of sched domain is to ++ * be returned. ++ * @flag: The flag to check for the lowest sched_domain ++ * for the given cpu. ++ * ++ * Returns the lowest sched_domain of a cpu which contains the given flag. ++ */ ++static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) ++ if (sd && (sd->flags & flag)) ++ break; ++ ++ return sd; ++} ++ ++/** ++ * for_each_flag_domain - Iterates over sched_domains containing the flag. ++ * @cpu: The cpu whose domains we're iterating over. ++ * @sd: variable holding the value of the power_savings_sd ++ * for cpu. ++ * @flag: The flag to filter the sched_domains to be iterated. 
++ * ++ * Iterates over all the scheduler domains for a given cpu that has the 'flag' ++ * set, starting from the lowest sched_domain to the highest. ++ */ ++#define for_each_flag_domain(cpu, sd, flag) \ ++ for (sd = lowest_flag_domain(cpu, flag); \ ++ (sd && (sd->flags & flag)); sd = sd->parent) ++ ++#endif /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ ++ ++static inline void resched_cpu(int cpu) ++{ ++ unsigned long flags; ++ ++ grq_lock_irqsave(&flags); ++ resched_task(cpu_curr(cpu)); ++ grq_unlock_irqrestore(&flags); ++} ++ ++/* ++ * In the semi idle case, use the nearest busy cpu for migrating timers ++ * from an idle cpu. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle cpu will add more delays to the timers than intended ++ * (as that cpu's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int cpu = smp_processor_id(); ++ int i; ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(cpu, sd) { ++ for_each_cpu(i, sched_domain_span(sd)) { ++ if (!idle_cpu(i)) ++ cpu = i; ++ goto unlock; ++ } ++ } ++unlock: ++ rcu_read_unlock(); ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ struct task_struct *idle; ++ struct rq *rq; ++ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ rq = cpu_rq(cpu); ++ idle = rq->idle; ++ ++ /* ++ * This is safe, as this function is called with the timer ++ * wheel base lock of (cpu) held. When the CPU is on the way ++ * to idle and has not yet set rq->curr to idle then it will ++ * be serialised on the timer wheel base lock and take the new ++ * timer into account automatically. ++ */ ++ if (unlikely(rq->curr != idle)) ++ return; ++ ++ /* ++ * We can set TIF_RESCHED on the idle task of the other CPU ++ * lockless. The worst case is that the other CPU runs the ++ * idle task through an additional NOOP schedule() ++ */ ++ set_tsk_need_resched(idle); ++ ++ /* NEED_RESCHED must be visible before we test polling */ ++ smp_mb(); ++ if (!tsk_is_polling(idle)) ++ smp_send_reschedule(cpu); ++} ++ ++#endif /* CONFIG_NO_HZ */ ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. 
++ */ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ unsigned long flags; ++ int running_wrong = 0; ++ int queued = 0; ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = task_grq_lock(p, &flags); ++ ++ if (cpumask_equal(tsk_cpus_allowed(p), new_mask)) ++ goto out; ++ ++ if (!cpumask_intersects(new_mask, cpu_active_mask)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ queued = task_queued(p); ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p)) { ++ /* Task is running on the wrong cpu now, reschedule it. */ ++ if (rq == this_rq()) { ++ set_tsk_need_resched(p); ++ running_wrong = 1; ++ } else ++ resched_task(p); ++ } else ++ set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask)); ++ ++out: ++ if (queued) ++ try_preempt(p, rq); ++ task_grq_unlock(&flags); ++ ++ if (running_wrong) ++ _cond_resched(); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* Run through task list and find tasks affined to just the dead cpu, then ++ * allocate a new affinity */ ++static void break_sole_affinity(int src_cpu, struct task_struct *idle) ++{ ++ struct task_struct *p, *t; ++ ++ do_each_thread(t, p) { ++ if (p != idle && !online_cpus(p)) { ++ cpumask_copy(tsk_cpus_allowed(p), cpu_possible_mask); ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. ++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk(KERN_INFO "process %d (%s) no " ++ "longer affine to cpu %d\n", ++ task_pid_nr(p), p->comm, src_cpu); ++ } ++ } ++ clear_sticky(p); ++ } while_each_thread(t, p); ++} ++ ++/* ++ * Schedules idle task to be the next runnable task on current CPU. ++ * It does so by boosting its priority to highest possible. ++ * Used by CPU offline code. ++ */ ++void sched_idle_next(struct rq *rq, int this_cpu, struct task_struct *idle) ++{ ++ /* cpu has to be offline */ ++ BUG_ON(cpu_online(this_cpu)); ++ ++ __setscheduler(idle, rq, SCHED_FIFO, STOP_PRIO); ++ ++ activate_idle_task(idle); ++ set_tsk_need_resched(rq->curr); ++} ++ ++/* ++ * Ensures that the idle task is using init_mm right before its cpu goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) ++ switch_mm(mm, &init_mm, current); ++ mmdrop(mm); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = MAX_USER_RT_PRIO - 1 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal rt scheduling prio so that ++ * it can die in pieces. 
++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_FIFO, &start_param); ++ } ++} ++ ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++ ++static struct ctl_table sd_ctl_dir[] = { ++ { ++ .procname = "sched_domain", ++ .mode = 0555, ++ }, ++ {} ++}; ++ ++static struct ctl_table sd_ctl_root[] = { ++ { ++ .procname = "kernel", ++ .mode = 0555, ++ .child = sd_ctl_dir, ++ }, ++ {} ++}; ++ ++static struct ctl_table *sd_alloc_ctl_entry(int n) ++{ ++ struct ctl_table *entry = ++ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); ++ ++ return entry; ++} ++ ++static void sd_free_ctl_entry(struct ctl_table **tablep) ++{ ++ struct ctl_table *entry; ++ ++ /* ++ * In the intermediate directories, both the child directory and ++ * procname are dynamically allocated and could fail but the mode ++ * will always be set. In the lowest directory the names are ++ * static strings and all have proc handlers. ++ */ ++ for (entry = *tablep; entry->mode; entry++) { ++ if (entry->child) ++ sd_free_ctl_entry(&entry->child); ++ if (entry->proc_handler == NULL) ++ kfree(entry->procname); ++ } ++ ++ kfree(*tablep); ++ *tablep = NULL; ++} ++ ++static void ++set_table_entry(struct ctl_table *entry, ++ const char *procname, void *data, int maxlen, ++ mode_t mode, proc_handler *proc_handler) ++{ ++ entry->procname = procname; ++ entry->data = data; ++ entry->maxlen = maxlen; ++ entry->mode = mode; ++ entry->proc_handler = proc_handler; ++} ++ ++static struct ctl_table * ++sd_alloc_ctl_domain_table(struct sched_domain *sd) ++{ ++ struct ctl_table *table = sd_alloc_ctl_entry(13); ++ ++ if (table == NULL) ++ return NULL; ++ ++ set_table_entry(&table[0], "min_interval", &sd->min_interval, ++ sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[1], "max_interval", &sd->max_interval, ++ sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[2], "busy_idx", &sd->busy_idx, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[3], "idle_idx", &sd->idle_idx, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[5], "wake_idx", &sd->wake_idx, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[7], "busy_factor", &sd->busy_factor, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[9], "cache_nice_tries", ++ &sd->cache_nice_tries, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[10], "flags", &sd->flags, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[11], "name", sd->name, ++ CORENAME_MAX_SIZE, 0444, proc_dostring); ++ /* &table[12] is terminator */ ++ ++ return table; ++} ++ ++static ctl_table *sd_alloc_ctl_cpu_table(int cpu) ++{ ++ struct ctl_table *entry, *table; ++ struct sched_domain *sd; ++ int domain_num = 0, i; ++ char buf[32]; ++ ++ for_each_domain(cpu, sd) ++ domain_num++; ++ entry = table = sd_alloc_ctl_entry(domain_num + 1); ++ if (table == NULL) ++ return NULL; ++ ++ i = 0; ++ for_each_domain(cpu, sd) { ++ snprintf(buf, 32, "domain%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_domain_table(sd); ++ entry++; ++ i++; ++ } ++ return table; ++} ++ ++static struct 
ctl_table_header *sd_sysctl_header; ++static void register_sched_domain_sysctl(void) ++{ ++ int i, cpu_num = num_possible_cpus(); ++ struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); ++ char buf[32]; ++ ++ WARN_ON(sd_ctl_dir[0].child); ++ sd_ctl_dir[0].child = entry; ++ ++ if (entry == NULL) ++ return; ++ ++ for_each_possible_cpu(i) { ++ snprintf(buf, 32, "cpu%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_cpu_table(i); ++ entry++; ++ } ++ ++ WARN_ON(sd_sysctl_header); ++ sd_sysctl_header = register_sysctl_table(sd_ctl_root); ++} ++ ++/* may be called multiple times per register */ ++static void unregister_sched_domain_sysctl(void) ++{ ++ if (sd_sysctl_header) ++ unregister_sysctl_table(sd_sysctl_header); ++ sd_sysctl_header = NULL; ++ if (sd_ctl_dir[0].child) ++ sd_free_ctl_entry(&sd_ctl_dir[0].child); ++} ++#else ++static void register_sched_domain_sysctl(void) ++{ ++} ++static void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) { ++ cpumask_set_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = true; ++ } ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) { ++ cpumask_clear_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = false; ++ } ++} ++ ++/* ++ * migration_call - callback that gets triggered when a CPU is added. ++ */ ++static int __cpuinit ++migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) ++{ ++ int cpu = (long)hcpu; ++ unsigned long flags; ++ struct rq *rq = cpu_rq(cpu); ++#ifdef CONFIG_HOTPLUG_CPU ++ struct task_struct *idle = rq->idle; ++#endif ++ ++ switch (action & ~CPU_TASKS_FROZEN) { ++ ++ case CPU_UP_PREPARE: ++ break; ++ ++ case CPU_ONLINE: ++ /* Update our root-domain */ ++ grq_lock_irqsave(&flags); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ ++ set_rq_online(rq); ++ } ++ grq.noc = num_online_cpus(); ++ grq_unlock_irqrestore(&flags); ++ break; ++ ++#ifdef CONFIG_HOTPLUG_CPU ++ case CPU_DEAD: ++ /* Idle task back to normal (off runqueue, low prio) */ ++ grq_lock_irq(); ++ return_task(idle, 1); ++ idle->static_prio = MAX_PRIO; ++ __setscheduler(idle, rq, SCHED_NORMAL, 0); ++ idle->prio = PRIO_LIMIT; ++ set_rq_task(rq, idle); ++ update_clocks(rq); ++ grq_unlock_irq(); ++ break; ++ ++ case CPU_DYING: ++ /* Update our root-domain */ ++ grq_lock_irqsave(&flags); ++ sched_idle_next(rq, cpu, idle); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ break_sole_affinity(cpu, idle); ++ grq.noc = num_online_cpus(); ++ grq_unlock_irqrestore(&flags); ++ break; ++#endif ++ } ++ return NOTIFY_OK; ++} ++ ++/* ++ * Register at high priority so that task migration (migrate_all_tasks) ++ * happens before everything else. This has to be lower priority than ++ * the notifier in the perf_counter subsystem, though. 
++ */ ++static struct notifier_block __cpuinitdata migration_notifier = { ++ .notifier_call = migration_call, ++ .priority = CPU_PRI_MIGRATION, ++}; ++ ++static int __cpuinit sched_cpu_active(struct notifier_block *nfb, ++ unsigned long action, void *hcpu) ++{ ++ switch (action & ~CPU_TASKS_FROZEN) { ++ case CPU_ONLINE: ++ case CPU_DOWN_FAILED: ++ set_cpu_active((long)hcpu, true); ++ return NOTIFY_OK; ++ default: ++ return NOTIFY_DONE; ++ } ++} ++ ++static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, ++ unsigned long action, void *hcpu) ++{ ++ switch (action & ~CPU_TASKS_FROZEN) { ++ case CPU_DOWN_PREPARE: ++ set_cpu_active((long)hcpu, false); ++ return NOTIFY_OK; ++ default: ++ return NOTIFY_DONE; ++ } ++} ++ ++int __init migration_init(void) ++{ ++ void *cpu = (void *)(long)smp_processor_id(); ++ int err; ++ ++ /* Initialise migration for the boot CPU */ ++ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); ++ BUG_ON(err == NOTIFY_BAD); ++ migration_call(&migration_notifier, CPU_ONLINE, cpu); ++ register_cpu_notifier(&migration_notifier); ++ ++ /* Register cpu active notifiers */ ++ cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); ++ cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); ++ ++ return 0; ++} ++early_initcall(migration_init); ++#endif ++ ++#ifdef CONFIG_SMP ++ ++static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++ ++static __read_mostly int sched_domain_debug_enabled; ++ ++static int __init sched_domain_debug_setup(char *str) ++{ ++ sched_domain_debug_enabled = 1; ++ ++ return 0; ++} ++early_param("sched_debug", sched_domain_debug_setup); ++ ++static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, ++ struct cpumask *groupmask) ++{ ++ struct sched_group *group = sd->groups; ++ char str[256]; ++ ++ cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); ++ cpumask_clear(groupmask); ++ ++ printk(KERN_DEBUG "%*s domain %d: ", level, "", level); ++ ++ if (!(sd->flags & SD_LOAD_BALANCE)) { ++ printk("does not load-balance\n"); ++ if (sd->parent) ++ printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" ++ " has parent"); ++ return -1; ++ } ++ ++ printk(KERN_CONT "span %s level %s\n", str, sd->name); ++ ++ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ printk(KERN_ERR "ERROR: domain->span does not contain " ++ "CPU%d\n", cpu); ++ } ++ if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { ++ printk(KERN_ERR "ERROR: domain->groups does not contain" ++ " CPU%d\n", cpu); ++ } ++ ++ printk(KERN_DEBUG "%*s groups:", level + 1, ""); ++ do { ++ if (!group) { ++ printk("\n"); ++ printk(KERN_ERR "ERROR: group is NULL\n"); ++ break; ++ } ++ ++ if (!group->sgp->power) { ++ printk(KERN_CONT "\n"); ++ printk(KERN_ERR "ERROR: domain->cpu_power not " ++ "set\n"); ++ break; ++ } ++ ++ if (!cpumask_weight(sched_group_cpus(group))) { ++ printk(KERN_CONT "\n"); ++ printk(KERN_ERR "ERROR: empty group\n"); ++ break; ++ } ++ ++ if (cpumask_intersects(groupmask, sched_group_cpus(group))) { ++ printk(KERN_CONT "\n"); ++ printk(KERN_ERR "ERROR: repeated CPUs\n"); ++ break; ++ } ++ ++ cpumask_or(groupmask, groupmask, sched_group_cpus(group)); ++ ++ cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); ++ ++ printk(KERN_CONT " %s", str); ++ if (group->sgp->power != SCHED_POWER_SCALE) { ++ printk(KERN_CONT " (cpu_power = %d)", ++ group->sgp->power); ++ } ++ ++ group = group->next; ++ } while (group != sd->groups); ++ printk(KERN_CONT "\n"); ++ ++ if 
(!cpumask_equal(sched_domain_span(sd), groupmask)) ++ printk(KERN_ERR "ERROR: groups don't span domain->span\n"); ++ ++ if (sd->parent && ++ !cpumask_subset(groupmask, sched_domain_span(sd->parent))) ++ printk(KERN_ERR "ERROR: parent span is not a superset " ++ "of domain->span\n"); ++ return 0; ++} ++ ++static void sched_domain_debug(struct sched_domain *sd, int cpu) ++{ ++ int level = 0; ++ ++ if (!sched_domain_debug_enabled) ++ return; ++ ++ if (!sd) { ++ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); ++ return; ++ } ++ ++ printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); ++ ++ for (;;) { ++ if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) ++ break; ++ level++; ++ sd = sd->parent; ++ if (!sd) ++ break; ++ } ++} ++#else /* !CONFIG_SCHED_DEBUG */ ++# define sched_domain_debug(sd, cpu) do { } while (0) ++#endif /* CONFIG_SCHED_DEBUG */ ++ ++static int sd_degenerate(struct sched_domain *sd) ++{ ++ if (cpumask_weight(sched_domain_span(sd)) == 1) ++ return 1; ++ ++ /* Following flags need at least 2 groups */ ++ if (sd->flags & (SD_LOAD_BALANCE | ++ SD_BALANCE_NEWIDLE | ++ SD_BALANCE_FORK | ++ SD_BALANCE_EXEC | ++ SD_SHARE_CPUPOWER | ++ SD_SHARE_PKG_RESOURCES)) { ++ if (sd->groups != sd->groups->next) ++ return 0; ++ } ++ ++ /* Following flags don't use groups */ ++ if (sd->flags & (SD_WAKE_AFFINE)) ++ return 0; ++ ++ return 1; ++} ++ ++static int ++sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) ++{ ++ unsigned long cflags = sd->flags, pflags = parent->flags; ++ ++ if (sd_degenerate(parent)) ++ return 1; ++ ++ if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) ++ return 0; ++ ++ /* Flags needing groups don't count if only 1 group in parent */ ++ if (parent->groups == parent->groups->next) { ++ pflags &= ~(SD_LOAD_BALANCE | ++ SD_BALANCE_NEWIDLE | ++ SD_BALANCE_FORK | ++ SD_BALANCE_EXEC | ++ SD_SHARE_CPUPOWER | ++ SD_SHARE_PKG_RESOURCES); ++ if (nr_node_ids == 1) ++ pflags &= ~SD_SERIALIZE; ++ } ++ if (~cflags & pflags) ++ return 0; ++ ++ return 1; ++} ++ ++static void free_rootdomain(struct rcu_head *rcu) ++{ ++ struct root_domain *rd = container_of(rcu, struct root_domain, rcu); ++ ++ cpupri_cleanup(&rd->cpupri); ++ free_cpumask_var(rd->rto_mask); ++ free_cpumask_var(rd->online); ++ free_cpumask_var(rd->span); ++ kfree(rd); ++} ++ ++static void rq_attach_root(struct rq *rq, struct root_domain *rd) ++{ ++ struct root_domain *old_rd = NULL; ++ unsigned long flags; ++ ++ grq_lock_irqsave(&flags); ++ ++ if (rq->rd) { ++ old_rd = rq->rd; ++ ++ if (cpumask_test_cpu(rq->cpu, old_rd->online)) ++ set_rq_offline(rq); ++ ++ cpumask_clear_cpu(rq->cpu, old_rd->span); ++ ++ /* ++ * If we dont want to free the old_rt yet then ++ * set old_rd to NULL to skip the freeing later ++ * in this function: ++ */ ++ if (!atomic_dec_and_test(&old_rd->refcount)) ++ old_rd = NULL; ++ } ++ ++ atomic_inc(&rd->refcount); ++ rq->rd = rd; ++ ++ cpumask_set_cpu(rq->cpu, rd->span); ++ if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) ++ set_rq_online(rq); ++ ++ grq_unlock_irqrestore(&flags); ++ ++ if (old_rd) ++ call_rcu_sched(&old_rd->rcu, free_rootdomain); ++} ++ ++static int init_rootdomain(struct root_domain *rd) ++{ ++ memset(rd, 0, sizeof(*rd)); ++ ++ if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) ++ goto out; ++ if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) ++ goto free_span; ++ if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) ++ goto free_online; ++ ++ if (cpupri_init(&rd->cpupri) != 0) ++ goto free_rto_mask; ++ return 0; ++ 
++free_rto_mask: ++ free_cpumask_var(rd->rto_mask); ++free_online: ++ free_cpumask_var(rd->online); ++free_span: ++ free_cpumask_var(rd->span); ++out: ++ return -ENOMEM; ++} ++ ++static void init_defrootdomain(void) ++{ ++ init_rootdomain(&def_root_domain); ++ ++ atomic_set(&def_root_domain.refcount, 1); ++} ++ ++static struct root_domain *alloc_rootdomain(void) ++{ ++ struct root_domain *rd; ++ ++ rd = kmalloc(sizeof(*rd), GFP_KERNEL); ++ if (!rd) ++ return NULL; ++ ++ if (init_rootdomain(rd) != 0) { ++ kfree(rd); ++ return NULL; ++ } ++ ++ return rd; ++} ++ ++static void free_sched_groups(struct sched_group *sg, int free_sgp) ++{ ++ struct sched_group *tmp, *first; ++ ++ if (!sg) ++ return; ++ ++ first = sg; ++ do { ++ tmp = sg->next; ++ ++ if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) ++ kfree(sg->sgp); ++ ++ kfree(sg); ++ sg = tmp; ++ } while (sg != first); ++} ++ ++static void free_sched_domain(struct rcu_head *rcu) ++{ ++ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); ++ ++ /* ++ * If its an overlapping domain it has private groups, iterate and ++ * nuke them all. ++ */ ++ if (sd->flags & SD_OVERLAP) { ++ free_sched_groups(sd->groups, 1); ++ } else if (atomic_dec_and_test(&sd->groups->ref)) { ++ kfree(sd->groups->sgp); ++ kfree(sd->groups); ++ } ++ kfree(sd); ++} ++ ++static void destroy_sched_domain(struct sched_domain *sd, int cpu) ++{ ++ call_rcu(&sd->rcu, free_sched_domain); ++} ++ ++static void destroy_sched_domains(struct sched_domain *sd, int cpu) ++{ ++ for (; sd; sd = sd->parent) ++ destroy_sched_domain(sd, cpu); ++} ++ ++/* ++ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must ++ * hold the hotplug lock. ++ */ ++static void ++cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct sched_domain *tmp; ++ ++ /* Remove the sched domains which do not contribute to scheduling. */ ++ for (tmp = sd; tmp; ) { ++ struct sched_domain *parent = tmp->parent; ++ if (!parent) ++ break; ++ ++ if (sd_parent_degenerate(tmp, parent)) { ++ tmp->parent = parent->parent; ++ if (parent->parent) ++ parent->parent->child = tmp; ++ destroy_sched_domain(parent, cpu); ++ } else ++ tmp = tmp->parent; ++ } ++ ++ if (sd && sd_degenerate(sd)) { ++ tmp = sd; ++ sd = sd->parent; ++ destroy_sched_domain(tmp, cpu); ++ if (sd) ++ sd->child = NULL; ++ } ++ ++ sched_domain_debug(sd, cpu); ++ ++ rq_attach_root(rq, rd); ++ tmp = rq->sd; ++ rcu_assign_pointer(rq->sd, sd); ++ destroy_sched_domains(tmp, cpu); ++} ++ ++/* cpus with isolated domains */ ++static cpumask_var_t cpu_isolated_map; ++ ++/* Setup the mask of cpus configured for isolated domains */ ++static int __init isolated_cpu_setup(char *str) ++{ ++ alloc_bootmem_cpumask_var(&cpu_isolated_map); ++ cpulist_parse(str, cpu_isolated_map); ++ return 1; ++} ++ ++__setup("isolcpus=", isolated_cpu_setup); ++ ++#define SD_NODES_PER_DOMAIN 16 ++ ++#ifdef CONFIG_NUMA ++ ++/** ++ * find_next_best_node - find the next node to include in a sched_domain ++ * @node: node whose sched_domain we're building ++ * @used_nodes: nodes already in the sched_domain ++ * ++ * Find the next node to include in a given scheduling domain. Simply ++ * finds the closest node not already in the @used_nodes map. ++ * ++ * Should use nodemask_t. 
++ */ ++static int find_next_best_node(int node, nodemask_t *used_nodes) ++{ ++ int i, n, val, min_val, best_node = -1; ++ ++ min_val = INT_MAX; ++ ++ for (i = 0; i < nr_node_ids; i++) { ++ /* Start at @node */ ++ n = (node + i) % nr_node_ids; ++ ++ if (!nr_cpus_node(n)) ++ continue; ++ ++ /* Skip already used nodes */ ++ if (node_isset(n, *used_nodes)) ++ continue; ++ ++ /* Simple min distance search */ ++ val = node_distance(node, n); ++ ++ if (val < min_val) { ++ min_val = val; ++ best_node = n; ++ } ++ } ++ ++ if (best_node != -1) ++ node_set(best_node, *used_nodes); ++ return best_node; ++} ++ ++/** ++ * sched_domain_node_span - get a cpumask for a node's sched_domain ++ * @node: node whose cpumask we're constructing ++ * @span: resulting cpumask ++ * ++ * Given a node, construct a good cpumask for its sched_domain to span. It ++ * should be one that prevents unnecessary balancing, but also spreads tasks ++ * out optimally. ++ */ ++static void sched_domain_node_span(int node, struct cpumask *span) ++{ ++ nodemask_t used_nodes; ++ int i; ++ ++ cpumask_clear(span); ++ nodes_clear(used_nodes); ++ ++ cpumask_or(span, span, cpumask_of_node(node)); ++ node_set(node, used_nodes); ++ ++ for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { ++ int next_node = find_next_best_node(node, &used_nodes); ++ if (next_node < 0) ++ break; ++ cpumask_or(span, span, cpumask_of_node(next_node)); ++ } ++} ++ ++static const struct cpumask *cpu_node_mask(int cpu) ++{ ++ lockdep_assert_held(&sched_domains_mutex); ++ ++ sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); ++ ++ return sched_domains_tmpmask; ++} ++ ++static const struct cpumask *cpu_allnodes_mask(int cpu) ++{ ++ return cpu_possible_mask; ++} ++#endif /* CONFIG_NUMA */ ++ ++static const struct cpumask *cpu_cpu_mask(int cpu) ++{ ++ return cpumask_of_node(cpu_to_node(cpu)); ++} ++ ++int sched_smt_power_savings = 0, sched_mc_power_savings = 0; ++ ++struct sd_data { ++ struct sched_domain **__percpu sd; ++ struct sched_group **__percpu sg; ++ struct sched_group_power **__percpu sgp; ++}; ++ ++struct s_data { ++ struct sched_domain ** __percpu sd; ++ struct root_domain *rd; ++}; ++ ++enum s_alloc { ++ sa_rootdomain, ++ sa_sd, ++ sa_sd_storage, ++ sa_none, ++}; ++ ++struct sched_domain_topology_level; ++ ++typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); ++typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); ++ ++#define SDTL_OVERLAP 0x01 ++ ++struct sched_domain_topology_level { ++ sched_domain_init_f init; ++ sched_domain_mask_f mask; ++ int flags; ++ struct sd_data data; ++}; ++ ++static int ++build_overlap_sched_groups(struct sched_domain *sd, int cpu) ++{ ++ struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; ++ const struct cpumask *span = sched_domain_span(sd); ++ struct cpumask *covered = sched_domains_tmpmask; ++ struct sd_data *sdd = sd->private; ++ struct sched_domain *child; ++ int i; ++ ++ cpumask_clear(covered); ++ ++ for_each_cpu(i, span) { ++ struct cpumask *sg_span; ++ ++ if (cpumask_test_cpu(i, covered)) ++ continue; ++ ++ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), ++ GFP_KERNEL, cpu_to_node(i)); ++ ++ if (!sg) ++ goto fail; ++ ++ sg_span = sched_group_cpus(sg); ++ ++ child = *per_cpu_ptr(sdd->sd, i); ++ if (child->child) { ++ child = child->child; ++ cpumask_copy(sg_span, sched_domain_span(child)); ++ } else ++ cpumask_set_cpu(i, sg_span); ++ ++ cpumask_or(covered, covered, sg_span); ++ ++ sg->sgp = *per_cpu_ptr(sdd->sgp, 
cpumask_first(sg_span)); ++ atomic_inc(&sg->sgp->ref); ++ ++ if (cpumask_test_cpu(cpu, sg_span)) ++ groups = sg; ++ ++ if (!first) ++ first = sg; ++ if (last) ++ last->next = sg; ++ last = sg; ++ last->next = first; ++ } ++ sd->groups = groups; ++ ++ return 0; ++ ++fail: ++ free_sched_groups(first, 0); ++ ++ return -ENOMEM; ++} ++ ++static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) ++{ ++ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); ++ struct sched_domain *child = sd->child; ++ ++ if (child) ++ cpu = cpumask_first(sched_domain_span(child)); ++ ++ if (sg) { ++ *sg = *per_cpu_ptr(sdd->sg, cpu); ++ (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); ++ atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ ++ } ++ ++ return cpu; ++} ++ ++/* ++ * build_sched_groups will build a circular linked list of the groups ++ * covered by the given span, and will set each group's ->cpumask correctly, ++ * and ->cpu_power to 0. ++ * ++ * Assumes the sched_domain tree is fully constructed ++ */ ++static int ++build_sched_groups(struct sched_domain *sd, int cpu) ++{ ++ struct sched_group *first = NULL, *last = NULL; ++ struct sd_data *sdd = sd->private; ++ const struct cpumask *span = sched_domain_span(sd); ++ struct cpumask *covered; ++ int i; ++ ++ get_group(cpu, sdd, &sd->groups); ++ atomic_inc(&sd->groups->ref); ++ ++ if (cpu != cpumask_first(sched_domain_span(sd))) ++ return 0; ++ ++ lockdep_assert_held(&sched_domains_mutex); ++ covered = sched_domains_tmpmask; ++ ++ cpumask_clear(covered); ++ ++ for_each_cpu(i, span) { ++ struct sched_group *sg; ++ int group = get_group(i, sdd, &sg); ++ int j; ++ ++ if (cpumask_test_cpu(i, covered)) ++ continue; ++ ++ cpumask_clear(sched_group_cpus(sg)); ++ sg->sgp->power = 0; ++ ++ for_each_cpu(j, span) { ++ if (get_group(j, sdd, NULL) != group) ++ continue; ++ ++ cpumask_set_cpu(j, covered); ++ cpumask_set_cpu(j, sched_group_cpus(sg)); ++ } ++ ++ if (!first) ++ first = sg; ++ if (last) ++ last->next = sg; ++ last = sg; ++ } ++ last->next = first; ++ ++ return 0; ++} ++ ++/* ++ * Initializers for schedule domains ++ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() ++ */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SD_INIT_NAME(sd, type) sd->name = #type ++#else ++# define SD_INIT_NAME(sd, type) do { } while (0) ++#endif ++ ++#define SD_INIT_FUNC(type) \ ++static noinline struct sched_domain * \ ++sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ ++{ \ ++ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ ++ *sd = SD_##type##_INIT; \ ++ SD_INIT_NAME(sd, type); \ ++ sd->private = &tl->data; \ ++ return sd; \ ++} ++ ++SD_INIT_FUNC(CPU) ++#ifdef CONFIG_NUMA ++ SD_INIT_FUNC(ALLNODES) ++ SD_INIT_FUNC(NODE) ++#endif ++#ifdef CONFIG_SCHED_SMT ++ SD_INIT_FUNC(SIBLING) ++#endif ++#ifdef CONFIG_SCHED_MC ++ SD_INIT_FUNC(MC) ++#endif ++#ifdef CONFIG_SCHED_BOOK ++ SD_INIT_FUNC(BOOK) ++#endif ++ ++static int default_relax_domain_level = -1; ++int sched_domain_level_max; ++ ++static int __init setup_relax_domain_level(char *str) ++{ ++ unsigned long val; ++ ++ val = simple_strtoul(str, NULL, 0); ++ if (val < sched_domain_level_max) ++ default_relax_domain_level = val; ++ ++ return 1; ++} ++__setup("relax_domain_level=", setup_relax_domain_level); ++ ++static void set_domain_attribute(struct sched_domain *sd, ++ struct sched_domain_attr *attr) ++{ ++ int request; ++ ++ if (!attr || attr->relax_domain_level < 0) { ++ if (default_relax_domain_level < 0) ++ return; ++ else ++ request = 
default_relax_domain_level; ++ } else ++ request = attr->relax_domain_level; ++ if (request < sd->level) { ++ /* turn off idle balance on this domain */ ++ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); ++ } else { ++ /* turn on idle balance on this domain */ ++ sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); ++ } ++} ++ ++static void __sdt_free(const struct cpumask *cpu_map); ++static int __sdt_alloc(const struct cpumask *cpu_map); ++ ++static void __free_domain_allocs(struct s_data *d, enum s_alloc what, ++ const struct cpumask *cpu_map) ++{ ++ switch (what) { ++ case sa_rootdomain: ++ if (!atomic_read(&d->rd->refcount)) ++ free_rootdomain(&d->rd->rcu); /* fall through */ ++ case sa_sd: ++ free_percpu(d->sd); /* fall through */ ++ case sa_sd_storage: ++ __sdt_free(cpu_map); /* fall through */ ++ case sa_none: ++ break; ++ } ++} ++ ++static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, ++ const struct cpumask *cpu_map) ++{ ++ memset(d, 0, sizeof(*d)); ++ ++ if (__sdt_alloc(cpu_map)) ++ return sa_sd_storage; ++ d->sd = alloc_percpu(struct sched_domain *); ++ if (!d->sd) ++ return sa_sd_storage; ++ d->rd = alloc_rootdomain(); ++ if (!d->rd) ++ return sa_sd; ++ return sa_rootdomain; ++} ++ ++/* ++ * NULL the sd_data elements we've used to build the sched_domain and ++ * sched_group structure so that the subsequent __free_domain_allocs() ++ * will not free the data we're using. ++ */ ++static void claim_allocations(int cpu, struct sched_domain *sd) ++{ ++ struct sd_data *sdd = sd->private; ++ ++ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); ++ *per_cpu_ptr(sdd->sd, cpu) = NULL; ++ ++ if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) ++ *per_cpu_ptr(sdd->sg, cpu) = NULL; ++ ++ if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) ++ *per_cpu_ptr(sdd->sgp, cpu) = NULL; ++} ++ ++#ifdef CONFIG_SCHED_SMT ++static const struct cpumask *cpu_smt_mask(int cpu) ++{ ++ return topology_thread_cpumask(cpu); ++} ++#endif ++ ++/* ++ * Topology list, bottom-up. 
++ */ ++static struct sched_domain_topology_level default_topology[] = { ++#ifdef CONFIG_SCHED_SMT ++ { sd_init_SIBLING, cpu_smt_mask, }, ++#endif ++#ifdef CONFIG_SCHED_MC ++ { sd_init_MC, cpu_coregroup_mask, }, ++#endif ++#ifdef CONFIG_SCHED_BOOK ++ { sd_init_BOOK, cpu_book_mask, }, ++#endif ++ { sd_init_CPU, cpu_cpu_mask, }, ++#ifdef CONFIG_NUMA ++ { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, ++ { sd_init_ALLNODES, cpu_allnodes_mask, }, ++#endif ++ { NULL, }, ++}; ++ ++static struct sched_domain_topology_level *sched_domain_topology = default_topology; ++ ++static int __sdt_alloc(const struct cpumask *cpu_map) ++{ ++ struct sched_domain_topology_level *tl; ++ int j; ++ ++ for (tl = sched_domain_topology; tl->init; tl++) { ++ struct sd_data *sdd = &tl->data; ++ ++ sdd->sd = alloc_percpu(struct sched_domain *); ++ if (!sdd->sd) ++ return -ENOMEM; ++ ++ sdd->sg = alloc_percpu(struct sched_group *); ++ if (!sdd->sg) ++ return -ENOMEM; ++ ++ sdd->sgp = alloc_percpu(struct sched_group_power *); ++ if (!sdd->sgp) ++ return -ENOMEM; ++ ++ for_each_cpu(j, cpu_map) { ++ struct sched_domain *sd; ++ struct sched_group *sg; ++ struct sched_group_power *sgp; ++ ++ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), ++ GFP_KERNEL, cpu_to_node(j)); ++ if (!sd) ++ return -ENOMEM; ++ ++ *per_cpu_ptr(sdd->sd, j) = sd; ++ ++ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), ++ GFP_KERNEL, cpu_to_node(j)); ++ if (!sg) ++ return -ENOMEM; ++ ++ *per_cpu_ptr(sdd->sg, j) = sg; ++ ++ sgp = kzalloc_node(sizeof(struct sched_group_power), ++ GFP_KERNEL, cpu_to_node(j)); ++ if (!sgp) ++ return -ENOMEM; ++ ++ *per_cpu_ptr(sdd->sgp, j) = sgp; ++ } ++ } ++ ++ return 0; ++} ++ ++static void __sdt_free(const struct cpumask *cpu_map) ++{ ++ struct sched_domain_topology_level *tl; ++ int j; ++ ++ for (tl = sched_domain_topology; tl->init; tl++) { ++ struct sd_data *sdd = &tl->data; ++ ++ for_each_cpu(j, cpu_map) { ++ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); ++ if (sd && (sd->flags & SD_OVERLAP)) ++ free_sched_groups(sd->groups, 0); ++ kfree(*per_cpu_ptr(sdd->sd, j)); ++ kfree(*per_cpu_ptr(sdd->sg, j)); ++ kfree(*per_cpu_ptr(sdd->sgp, j)); ++ } ++ free_percpu(sdd->sd); ++ free_percpu(sdd->sg); ++ free_percpu(sdd->sgp); ++ } ++} ++ ++struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, ++ struct s_data *d, const struct cpumask *cpu_map, ++ struct sched_domain_attr *attr, struct sched_domain *child, ++ int cpu) ++{ ++ struct sched_domain *sd = tl->init(tl, cpu); ++ if (!sd) ++ return child; ++ ++ set_domain_attribute(sd, attr); ++ cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); ++ if (child) { ++ sd->level = child->level + 1; ++ sched_domain_level_max = max(sched_domain_level_max, sd->level); ++ child->parent = sd; ++ } ++ sd->child = child; ++ ++ return sd; ++} ++ ++/* ++ * Build sched domains for a given set of cpus and attach the sched domains ++ * to the individual cpus ++ */ ++static int build_sched_domains(const struct cpumask *cpu_map, ++ struct sched_domain_attr *attr) ++{ ++ enum s_alloc alloc_state = sa_none; ++ struct sched_domain *sd; ++ struct s_data d; ++ int i, ret = -ENOMEM; ++ ++ alloc_state = __visit_domain_allocation_hell(&d, cpu_map); ++ if (alloc_state != sa_rootdomain) ++ goto error; ++ ++ /* Set up domains for cpus specified by the cpu_map. 
*/ ++ for_each_cpu(i, cpu_map) { ++ struct sched_domain_topology_level *tl; ++ ++ sd = NULL; ++ for (tl = sched_domain_topology; tl->init; tl++) { ++ sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); ++ if (tl->flags & SDTL_OVERLAP) ++ sd->flags |= SD_OVERLAP; ++ if (cpumask_equal(cpu_map, sched_domain_span(sd))) ++ break; ++ } ++ ++ while (sd->child) ++ sd = sd->child; ++ ++ *per_cpu_ptr(d.sd, i) = sd; ++ } ++ ++ /* Build the groups for the domains */ ++ for_each_cpu(i, cpu_map) { ++ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { ++ sd->span_weight = cpumask_weight(sched_domain_span(sd)); ++ if (sd->flags & SD_OVERLAP) { ++ if (build_overlap_sched_groups(sd, i)) ++ goto error; ++ } else { ++ if (build_sched_groups(sd, i)) ++ goto error; ++ } ++ } ++ } ++ ++ /* Calculate CPU power for physical packages and nodes */ ++ for (i = nr_cpumask_bits-1; i >= 0; i--) { ++ if (!cpumask_test_cpu(i, cpu_map)) ++ continue; ++ ++ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { ++ claim_allocations(i, sd); ++ } ++ } ++ ++ /* Attach the domains */ ++ rcu_read_lock(); ++ for_each_cpu(i, cpu_map) { ++ sd = *per_cpu_ptr(d.sd, i); ++ cpu_attach_domain(sd, d.rd, i); ++ } ++ rcu_read_unlock(); ++ ++ ret = 0; ++error: ++ __free_domain_allocs(&d, alloc_state, cpu_map); ++ return ret; ++} ++ ++static cpumask_var_t *doms_cur; /* current sched domains */ ++static int ndoms_cur; /* number of sched domains in 'doms_cur' */ ++static struct sched_domain_attr *dattr_cur; ++ /* attribues of custom domains in 'doms_cur' */ ++ ++/* ++ * Special case: If a kmalloc of a doms_cur partition (array of ++ * cpumask) fails, then fallback to a single sched domain, ++ * as determined by the single cpumask fallback_doms. ++ */ ++static cpumask_var_t fallback_doms; ++ ++/* ++ * arch_update_cpu_topology lets virtualized architectures update the ++ * cpu core maps. It is supposed to return 1 if the topology changed ++ * or 0 if it stayed the same. ++ */ ++int __attribute__((weak)) arch_update_cpu_topology(void) ++{ ++ return 0; ++} ++ ++cpumask_var_t *alloc_sched_domains(unsigned int ndoms) ++{ ++ int i; ++ cpumask_var_t *doms; ++ ++ doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); ++ if (!doms) ++ return NULL; ++ for (i = 0; i < ndoms; i++) { ++ if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { ++ free_sched_domains(doms, i); ++ return NULL; ++ } ++ } ++ return doms; ++} ++ ++void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) ++{ ++ unsigned int i; ++ for (i = 0; i < ndoms; i++) ++ free_cpumask_var(doms[i]); ++ kfree(doms); ++} ++ ++/* ++ * Set up scheduler domains and groups. Callers must hold the hotplug lock. ++ * For now this just excludes isolated cpus, but could be used to ++ * exclude other special cases in the future. 
++ */ ++static int init_sched_domains(const struct cpumask *cpu_map) ++{ ++ int err; ++ ++ arch_update_cpu_topology(); ++ ndoms_cur = 1; ++ doms_cur = alloc_sched_domains(ndoms_cur); ++ if (!doms_cur) ++ doms_cur = &fallback_doms; ++ cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); ++ dattr_cur = NULL; ++ err = build_sched_domains(doms_cur[0], NULL); ++ register_sched_domain_sysctl(); ++ ++ return err; ++} ++ ++/* ++ * Detach sched domains from a group of cpus specified in cpu_map ++ * These cpus will now be attached to the NULL domain ++ */ ++static void detach_destroy_domains(const struct cpumask *cpu_map) ++{ ++ int i; ++ ++ rcu_read_lock(); ++ for_each_cpu(i, cpu_map) ++ cpu_attach_domain(NULL, &def_root_domain, i); ++ rcu_read_unlock(); ++} ++ ++/* handle null as "default" */ ++static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, ++ struct sched_domain_attr *new, int idx_new) ++{ ++ struct sched_domain_attr tmp; ++ ++ /* fast path */ ++ if (!new && !cur) ++ return 1; ++ ++ tmp = SD_ATTR_INIT; ++ return !memcmp(cur ? (cur + idx_cur) : &tmp, ++ new ? (new + idx_new) : &tmp, ++ sizeof(struct sched_domain_attr)); ++} ++ ++/* ++ * Partition sched domains as specified by the 'ndoms_new' ++ * cpumasks in the array doms_new[] of cpumasks. This compares ++ * doms_new[] to the current sched domain partitioning, doms_cur[]. ++ * It destroys each deleted domain and builds each new domain. ++ * ++ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. ++ * The masks don't intersect (don't overlap.) We should setup one ++ * sched domain for each mask. CPUs not in any of the cpumasks will ++ * not be load balanced. If the same cpumask appears both in the ++ * current 'doms_cur' domains and in the new 'doms_new', we can leave ++ * it as it is. ++ * ++ * The passed in 'doms_new' should be allocated using ++ * alloc_sched_domains. This routine takes ownership of it and will ++ * free_sched_domains it when done with it. If the caller failed the ++ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, ++ * and partition_sched_domains() will fallback to the single partition ++ * 'fallback_doms', it also forces the domains to be rebuilt. ++ * ++ * If doms_new == NULL it will be replaced with cpu_online_mask. ++ * ndoms_new == 0 is a special case for destroying existing domains, ++ * and it will not create the default domain. ++ * ++ * Call with hotplug lock held ++ */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{ ++ int i, j, n; ++ int new_topology; ++ ++ mutex_lock(&sched_domains_mutex); ++ ++ /* always unregister in case we don't destroy any domains */ ++ unregister_sched_domain_sysctl(); ++ ++ /* Let architecture update cpu core mappings. */ ++ new_topology = arch_update_cpu_topology(); ++ ++ n = doms_new ? 
ndoms_new : 0; ++ ++ /* Destroy deleted domains */ ++ for (i = 0; i < ndoms_cur; i++) { ++ for (j = 0; j < n && !new_topology; j++) { ++ if (cpumask_equal(doms_cur[i], doms_new[j]) ++ && dattrs_equal(dattr_cur, i, dattr_new, j)) ++ goto match1; ++ } ++ /* no match - a current sched domain not in new doms_new[] */ ++ detach_destroy_domains(doms_cur[i]); ++match1: ++ ; ++ } ++ ++ if (doms_new == NULL) { ++ ndoms_cur = 0; ++ doms_new = &fallback_doms; ++ cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); ++ WARN_ON_ONCE(dattr_new); ++ } ++ ++ /* Build new domains */ ++ for (i = 0; i < ndoms_new; i++) { ++ for (j = 0; j < ndoms_cur && !new_topology; j++) { ++ if (cpumask_equal(doms_new[i], doms_cur[j]) ++ && dattrs_equal(dattr_new, i, dattr_cur, j)) ++ goto match2; ++ } ++ /* no match - add a new doms_new */ ++ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); ++match2: ++ ; ++ } ++ ++ /* Remember the new sched domains */ ++ if (doms_cur != &fallback_doms) ++ free_sched_domains(doms_cur, ndoms_cur); ++ kfree(dattr_cur); /* kfree(NULL) is safe */ ++ doms_cur = doms_new; ++ dattr_cur = dattr_new; ++ ndoms_cur = ndoms_new; ++ ++ register_sched_domain_sysctl(); ++ ++ mutex_unlock(&sched_domains_mutex); ++} ++ ++#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) ++static void reinit_sched_domains(void) ++{ ++ get_online_cpus(); ++ ++ /* Destroy domains first to force the rebuild */ ++ partition_sched_domains(0, NULL, NULL); ++ ++ rebuild_sched_domains(); ++ put_online_cpus(); ++} ++ ++static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) ++{ ++ unsigned int level = 0; ++ ++ if (sscanf(buf, "%u", &level) != 1) ++ return -EINVAL; ++ ++ /* ++ * level is always be positive so don't check for ++ * level < POWERSAVINGS_BALANCE_NONE which is 0 ++ * What happens on 0 or 1 byte write, ++ * need to check for count as well? 
++ */ ++ ++ if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) ++ return -EINVAL; ++ ++ if (smt) ++ sched_smt_power_savings = level; ++ else ++ sched_mc_power_savings = level; ++ ++ reinit_sched_domains(); ++ ++ return count; ++} ++ ++#ifdef CONFIG_SCHED_MC ++static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, ++ struct sysdev_class_attribute *attr, ++ char *page) ++{ ++ return sprintf(page, "%u\n", sched_mc_power_savings); ++} ++static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, ++ struct sysdev_class_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return sched_power_savings_store(buf, count, 0); ++} ++static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, ++ sched_mc_power_savings_show, ++ sched_mc_power_savings_store); ++#endif ++ ++#ifdef CONFIG_SCHED_SMT ++static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, ++ struct sysdev_class_attribute *attr, ++ char *page) ++{ ++ return sprintf(page, "%u\n", sched_smt_power_savings); ++} ++static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, ++ struct sysdev_class_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return sched_power_savings_store(buf, count, 1); ++} ++static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, ++ sched_smt_power_savings_show, ++ sched_smt_power_savings_store); ++#endif ++ ++int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) ++{ ++ int err = 0; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (smt_capable()) ++ err = sysfs_create_file(&cls->kset.kobj, ++ &attr_sched_smt_power_savings.attr); ++#endif ++#ifdef CONFIG_SCHED_MC ++ if (!err && mc_capable()) ++ err = sysfs_create_file(&cls->kset.kobj, ++ &attr_sched_mc_power_savings.attr); ++#endif ++ return err; ++} ++#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ */ ++static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, ++ void *hcpu) ++{ ++ switch (action & ~CPU_TASKS_FROZEN) { ++ case CPU_ONLINE: ++ case CPU_DOWN_FAILED: ++ cpuset_update_active_cpus(); ++ return NOTIFY_OK; ++ default: ++ return NOTIFY_DONE; ++ } ++} ++ ++static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, ++ void *hcpu) ++{ ++ switch (action & ~CPU_TASKS_FROZEN) { ++ case CPU_DOWN_PREPARE: ++ cpuset_update_active_cpus(); ++ return NOTIFY_OK; ++ default: ++ return NOTIFY_DONE; ++ } ++} ++ ++#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) ++/* ++ * Cheaper version of the below functions in case support for SMT and MC is ++ * compiled in but CPUs have no siblings. 
++ */ ++static bool sole_cpu_idle(int cpu) ++{ ++ return rq_idle(cpu_rq(cpu)); ++} ++#endif ++#ifdef CONFIG_SCHED_SMT ++/* All this CPU's SMT siblings are idle */ ++static bool siblings_cpu_idle(int cpu) ++{ ++ return cpumask_subset(&(cpu_rq(cpu)->smt_siblings), ++ &grq.cpu_idle_map); ++} ++#endif ++#ifdef CONFIG_SCHED_MC ++/* All this CPU's shared cache siblings are idle */ ++static bool cache_cpu_idle(int cpu) ++{ ++ return cpumask_subset(&(cpu_rq(cpu)->cache_siblings), ++ &grq.cpu_idle_map); ++} ++#endif ++ ++enum sched_domain_level { ++ SD_LV_NONE = 0, ++ SD_LV_SIBLING, ++ SD_LV_MC, ++ SD_LV_BOOK, ++ SD_LV_CPU, ++ SD_LV_NODE, ++ SD_LV_ALLNODES, ++ SD_LV_MAX ++}; ++ ++void __init sched_init_smp(void) ++{ ++ struct sched_domain *sd; ++ int cpu; ++ ++ cpumask_var_t non_isolated_cpus; ++ ++ alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); ++ alloc_cpumask_var(&fallback_doms, GFP_KERNEL); ++ ++ get_online_cpus(); ++ mutex_lock(&sched_domains_mutex); ++ init_sched_domains(cpu_active_mask); ++ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); ++ if (cpumask_empty(non_isolated_cpus)) ++ cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); ++ mutex_unlock(&sched_domains_mutex); ++ put_online_cpus(); ++ ++ hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); ++ hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); ++ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) ++ BUG(); ++ free_cpumask_var(non_isolated_cpus); ++ ++ grq_lock_irq(); ++ /* ++ * Set up the relative cache distance of each online cpu from each ++ * other in a simple array for quick lookup. Locality is determined ++ * by the closest sched_domain that CPUs are separated by. CPUs with ++ * shared cache in SMT and MC are treated as local. Separate CPUs ++ * (within the same package or physically) within the same node are ++ * treated as not local. CPUs not even in the same domain (different ++ * nodes) are treated as very distant. ++ */ ++ for_each_online_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ for_each_domain(cpu, sd) { ++ int locality, other_cpu; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sd->level == SD_LV_SIBLING) { ++ for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) ++ cpumask_set_cpu(other_cpu, &rq->smt_siblings); ++ } ++#endif ++#ifdef CONFIG_SCHED_MC ++ if (sd->level == SD_LV_MC) { ++ for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) ++ cpumask_set_cpu(other_cpu, &rq->cache_siblings); ++ } ++#endif ++ if (sd->level <= SD_LV_SIBLING) ++ locality = 1; ++ else if (sd->level <= SD_LV_MC) ++ locality = 2; ++ else if (sd->level <= SD_LV_NODE) ++ locality = 3; ++ else ++ continue; ++ ++ for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) { ++ if (locality < rq->cpu_locality[other_cpu]) ++ rq->cpu_locality[other_cpu] = locality; ++ } ++ } ++ ++/* ++ * Each runqueue has its own function in case it doesn't have ++ * siblings of its own allowing mixed topologies. 
++ */ ++#ifdef CONFIG_SCHED_SMT ++ if (cpus_weight(rq->smt_siblings) > 1) ++ rq->siblings_idle = siblings_cpu_idle; ++#endif ++#ifdef CONFIG_SCHED_MC ++ if (cpus_weight(rq->cache_siblings) > 1) ++ rq->cache_idle = cache_cpu_idle; ++#endif ++ } ++ grq_unlock_irq(); ++} ++#else ++void __init sched_init_smp(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++unsigned int sysctl_timer_migration = 1; ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ prio_ratios[0] = 128; ++ for (i = 1 ; i < PRIO_RANGE ; i++) ++ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; ++ ++ raw_spin_lock_init(&grq.lock); ++ grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; ++ grq.niffies = 0; ++ grq.last_jiffy = jiffies; ++ raw_spin_lock_init(&grq.iso_lock); ++ grq.iso_ticks = grq.iso_refractory = 0; ++ grq.noc = 1; ++#ifdef CONFIG_SMP ++ init_defrootdomain(); ++ grq.qnr = grq.idle_cpus = 0; ++ cpumask_clear(&grq.cpu_idle_map); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = ++ rq->iowait_pc = rq->idle_pc = 0; ++ rq->dither = false; ++#ifdef CONFIG_SMP ++ rq->sticky_task = NULL; ++ rq->last_niffy = 0; ++ rq->sd = NULL; ++ rq->rd = NULL; ++ rq->online = false; ++ rq->cpu = i; ++ rq_attach_root(rq, &def_root_domain); ++#endif ++ atomic_set(&rq->nr_iowait, 0); ++ } ++ ++#ifdef CONFIG_SMP ++ nr_cpu_ids = i; ++ /* ++ * Set the base locality for cpu cache distance calculation to ++ * "distant" (3). Make sure the distance from a CPU to itself is 0. ++ */ ++ for_each_possible_cpu(i) { ++ int j; ++ ++ rq = cpu_rq(i); ++#ifdef CONFIG_SCHED_SMT ++ cpumask_clear(&rq->smt_siblings); ++ cpumask_set_cpu(i, &rq->smt_siblings); ++ rq->siblings_idle = sole_cpu_idle; ++ cpumask_set_cpu(i, &rq->smt_siblings); ++#endif ++#ifdef CONFIG_SCHED_MC ++ cpumask_clear(&rq->cache_siblings); ++ cpumask_set_cpu(i, &rq->cache_siblings); ++ rq->cache_idle = sole_cpu_idle; ++ cpumask_set_cpu(i, &rq->cache_siblings); ++#endif ++ rq->cpu_locality = kmalloc(nr_cpu_ids * sizeof(int *), GFP_ATOMIC); ++ for_each_possible_cpu(j) { ++ if (i == j) ++ rq->cpu_locality[j] = 0; ++ else ++ rq->cpu_locality[j] = 4; ++ } ++ } ++#endif ++ ++ for (i = 0; i < PRIO_LIMIT; i++) ++ INIT_LIST_HEAD(grq.queue + i); ++ /* delimiter for bitsearch */ ++ __set_bit(PRIO_LIMIT, grq.prio_bitmap); ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&init_task.preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_RT_MUTEXES ++ plist_head_init(&init_task.pi_waiters); ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ atomic_inc(&init_mm.mm_count); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". 
++ */ ++ init_idle(current, smp_processor_id()); ++ ++#ifdef CONFIG_SMP ++ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); ++ /* May be allocated at isolcpus cmdline parse time */ ++ if (cpu_isolated_map == NULL) ++ zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); ++#endif /* SMP */ ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; /* ratelimiting */ ++ ++ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || ++ system_state != SYSTEM_RUNNING || oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ dump_stack(); ++} ++EXPORT_SYMBOL(__might_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ unsigned long flags; ++ struct rq *rq; ++ int queued; ++ ++ read_lock_irq(&tasklist_lock); ++ ++ do_each_thread(g, p) { ++ if (!rt_task(p) && !iso_task(p)) ++ continue; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_grq_lock(p); ++ ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p); ++ __setscheduler(p, rq, SCHED_NORMAL, 0); ++ if (queued) { ++ enqueue_task(p); ++ try_preempt(p, rq); ++ } ++ ++ __task_grq_unlock(); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ } while_each_thread(g, p); ++ ++ read_unlock_irq(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given cpu. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * set_curr_task - set the current task for a given cpu. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a cpu in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
++ */ ++void set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++/* ++ * Use precise platform statistics if available: ++ */ ++#ifdef CONFIG_VIRT_CPU_ACCOUNTING ++void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) ++{ ++ *ut = p->utime; ++ *st = p->stime; ++} ++ ++void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) ++{ ++ struct task_cputime cputime; ++ ++ thread_group_cputime(p, &cputime); ++ ++ *ut = cputime.utime; ++ *st = cputime.stime; ++} ++#else ++ ++#ifndef nsecs_to_cputime ++# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) ++#endif ++ ++void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) ++{ ++ cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); ++ ++ rtime = nsecs_to_cputime(p->sched_time); ++ ++ if (total) { ++ u64 temp; ++ ++ temp = (u64)(rtime * utime); ++ do_div(temp, total); ++ utime = (cputime_t)temp; ++ } else ++ utime = rtime; ++ ++ /* ++ * Compare with previous values, to keep monotonicity: ++ */ ++ p->prev_utime = max(p->prev_utime, utime); ++ p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); ++ ++ *ut = p->prev_utime; ++ *st = p->prev_stime; ++} ++ ++/* ++ * Must be called with siglock held. ++ */ ++void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) ++{ ++ struct signal_struct *sig = p->signal; ++ struct task_cputime cputime; ++ cputime_t rtime, utime, total; ++ ++ thread_group_cputime(p, &cputime); ++ ++ total = cputime_add(cputime.utime, cputime.stime); ++ rtime = nsecs_to_cputime(cputime.sum_exec_runtime); ++ ++ if (total) { ++ u64 temp; ++ ++ temp = (u64)(rtime * cputime.utime); ++ do_div(temp, total); ++ utime = (cputime_t)temp; ++ } else ++ utime = rtime; ++ ++ sig->prev_utime = max(sig->prev_utime, utime); ++ sig->prev_stime = max(sig->prev_stime, ++ cputime_sub(rtime, sig->prev_utime)); ++ ++ *ut = sig->prev_utime; ++ *st = sig->prev_stime; ++} ++#endif ++ ++inline cputime_t task_gtime(struct task_struct *p) ++{ ++ return p->gtime; ++} ++ ++void __cpuinit init_idle_bootup_task(struct task_struct *idle) ++{} ++ ++#ifdef CONFIG_SCHED_DEBUG ++void proc_sched_show_task(struct task_struct *p, struct seq_file *m) ++{} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_SMP ++unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) ++{ ++ return SCHED_LOAD_SCALE; ++} ++ ++unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) ++{ ++ unsigned long weight = cpumask_weight(sched_domain_span(sd)); ++ unsigned long smt_gain = sd->smt_gain; ++ ++ smt_gain /= weight; ++ ++ return smt_gain; ++} ++#endif +Index: linux-3.2-ck1/kernel/sched.c +=================================================================== +--- linux-3.2-ck1.orig/kernel/sched.c 2012-01-16 10:07:27.897097267 +1100 ++++ linux-3.2-ck1/kernel/sched.c 2012-01-16 10:07:31.344097027 +1100 +@@ -1,3 +1,6 @@ ++#ifdef CONFIG_SCHED_BFS ++#include "sched_bfs.c" ++#else + /* + * kernel/sched.c + * +@@ -9783,3 +9786,4 @@ struct cgroup_subsys cpuacct_subsys = { + .subsys_id = cpuacct_subsys_id, + }; + #endif /* CONFIG_CGROUP_CPUACCT */ ++#endif /* CONFIG_SCHED_BFS */ +Index: linux-3.2-ck1/kernel/sysctl.c +=================================================================== +--- linux-3.2-ck1.orig/kernel/sysctl.c 2012-01-16 10:07:27.897097267 +1100 ++++ linux-3.2-ck1/kernel/sysctl.c 2012-01-16 10:07:31.345097026 +1100 +@@ -121,7 +121,12 @@ static int __maybe_unused one = 1; + static 
int __maybe_unused two = 2; + static int __maybe_unused three = 3; + static unsigned long one_ul = 1; +-static int one_hundred = 100; ++static int __maybe_unused one_hundred = 100; ++#ifdef CONFIG_SCHED_BFS ++extern int rr_interval; ++extern int sched_iso_cpu; ++static int __read_mostly one_thousand = 1000; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -251,7 +256,7 @@ static struct ctl_table root_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -266,6 +271,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_BFS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -383,6 +389,7 @@ static struct ctl_table kern_table[] = { + .extra1 = &one, + }, + #endif ++#endif /* !CONFIG_SCHED_BFS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -841,6 +848,26 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_BFS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +Index: linux-3.2-ck1/lib/Kconfig.debug +=================================================================== +--- linux-3.2-ck1.orig/lib/Kconfig.debug 2012-01-16 10:07:27.895097268 +1100 ++++ linux-3.2-ck1/lib/Kconfig.debug 2012-01-16 10:07:31.345097026 +1100 +@@ -875,7 +875,7 @@ config BOOT_PRINTK_DELAY + + config RCU_TORTURE_TEST + tristate "torture tests for RCU" +- depends on DEBUG_KERNEL ++ depends on DEBUG_KERNEL && !SCHED_BFS + default n + help + This option provides a kernel module that runs torture tests +Index: linux-3.2-ck1/include/linux/jiffies.h +=================================================================== +--- linux-3.2-ck1.orig/include/linux/jiffies.h 2012-01-16 10:07:27.896097267 +1100 ++++ linux-3.2-ck1/include/linux/jiffies.h 2012-01-16 10:07:31.345097026 +1100 +@@ -164,7 +164,7 @@ static inline u64 get_jiffies_64(void) + * Have the 32 bit jiffies value wrap 5 minutes after boot + * so jiffies wrap bugs show up earlier. 
+ */ +-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) ++#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) + + /* + * Change timeval to jiffies, trying to avoid the +Index: linux-3.2-ck1/drivers/cpufreq/cpufreq.c +=================================================================== +--- linux-3.2-ck1.orig/drivers/cpufreq/cpufreq.c 2012-01-16 10:07:27.894097269 +1100 ++++ linux-3.2-ck1/drivers/cpufreq/cpufreq.c 2012-01-16 10:07:31.346097026 +1100 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -1444,6 +1445,12 @@ int __cpufreq_driver_target(struct cpufr + target_freq, relation); + if (cpu_online(policy->cpu) && cpufreq_driver->target) + retval = cpufreq_driver->target(policy, target_freq, relation); ++ if (likely(retval != -EINVAL)) { ++ if (target_freq == policy->max) ++ cpu_nonscaling(policy->cpu); ++ else ++ cpu_scaling(policy->cpu); ++ } + + return retval; + } +Index: linux-3.2-ck1/drivers/cpufreq/cpufreq_ondemand.c +=================================================================== +--- linux-3.2-ck1.orig/drivers/cpufreq/cpufreq_ondemand.c 2012-01-16 10:07:27.894097269 +1100 ++++ linux-3.2-ck1/drivers/cpufreq/cpufreq_ondemand.c 2012-01-16 10:07:31.346097026 +1100 +@@ -28,8 +28,8 @@ + * It helps to keep variable names smaller, simpler + */ + +-#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (10) +-#define DEF_FREQUENCY_UP_THRESHOLD (80) ++#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (26) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (100000) + #define MICRO_FREQUENCY_DOWN_DIFFERENTIAL (3) +@@ -417,10 +417,10 @@ static void dbs_check_cpu(struct cpu_dbs + + /* + * Every sampling_rate, we check, if current idle time is less +- * than 20% (default), then we try to increase frequency ++ * than 37% (default), then we try to increase frequency + * Every sampling_rate, we look for a the lowest + * frequency which can sustain the load while keeping idle time over +- * 30%. If such a frequency exist, we try to decrease to this frequency. ++ * 63%. If such a frequency exist, we try to decrease to this frequency. + * + * Any frequency increase takes it to the maximum frequency. + * Frequency reduction happens at minimum steps of +Index: linux-3.2-ck1/drivers/cpufreq/cpufreq_conservative.c +=================================================================== +--- linux-3.2-ck1.orig/drivers/cpufreq/cpufreq_conservative.c 2012-01-16 10:07:27.894097269 +1100 ++++ linux-3.2-ck1/drivers/cpufreq/cpufreq_conservative.c 2012-01-16 10:07:31.346097026 +1100 +@@ -29,8 +29,8 @@ + * It helps to keep variable names smaller, simpler + */ + +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + + /* + * The polling frequency of this governor depends on the capability of +Index: linux-3.2-ck1/mm/vmscan.c +=================================================================== +--- linux-3.2-ck1.orig/mm/vmscan.c 2012-01-16 10:07:27.813097272 +1100 ++++ linux-3.2-ck1/mm/vmscan.c 2012-01-16 10:07:32.578096942 +1100 +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -146,7 +147,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. 
+ */ +-int vm_swappiness = 60; ++int vm_swappiness = 10; + long vm_total_pages; /* The total number of pages which the VM controls */ + + static LIST_HEAD(shrinker_list); +@@ -986,7 +987,7 @@ cull_mlocked: + + activate_locked: + /* Not a candidate for swapping, so reclaim swap space. */ +- if (PageSwapCache(page) && vm_swap_full()) ++ if (PageSwapCache(page)) + try_to_free_swap(page); + VM_BUG_ON(PageActive(page)); + SetPageActive(page); +@@ -2089,6 +2090,35 @@ restart: + } + + /* ++ * Helper functions to adjust nice level of kswapd, based on the priority of ++ * the task (p) that called it. If it is already higher priority we do not ++ * demote its nice level since it is still working on behalf of a higher ++ * priority task. With kernel threads we leave it at nice 0. ++ * ++ * We don't ever run kswapd real time, so if a real time task calls kswapd we ++ * set it to highest SCHED_NORMAL priority. ++ */ ++static inline int effective_sc_prio(struct task_struct *p) ++{ ++ if (likely(p->mm)) { ++ if (rt_task(p)) ++ return -20; ++ if (p->policy == SCHED_IDLEPRIO) ++ return 19; ++ return task_nice(p); ++ } ++ return 0; ++} ++ ++static void set_kswapd_nice(struct task_struct *kswapd, int active) ++{ ++ long nice = effective_sc_prio(current); ++ ++ if (task_nice(kswapd) > nice || !active) ++ set_user_nice(kswapd, nice); ++} ++ ++/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. +@@ -2835,6 +2865,8 @@ static void kswapd_try_to_sleep(pg_data_ + finish_wait(&pgdat->kswapd_wait, &wait); + } + ++#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */ ++ + /* + * The background pageout daemon, started as a kernel thread + * from the init process. 
+@@ -2890,6 +2922,9 @@ static int kswapd(void *p) + for ( ; ; ) { + int ret; + ++ /* kswapd has been busy so delay watermark_timer */ ++ mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY); ++ + /* + * If the last balance_pgdat was unsuccessful it's unlikely a + * new request of a similar or harder type will succeed soon +@@ -2945,6 +2980,7 @@ static int kswapd(void *p) + void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) + { + pg_data_t *pgdat; ++ int active; + + if (!populated_zone(zone)) + return; +@@ -2956,7 +2992,9 @@ void wakeup_kswapd(struct zone *zone, in + pgdat->kswapd_max_order = order; + pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); + } +- if (!waitqueue_active(&pgdat->kswapd_wait)) ++ active = waitqueue_active(&pgdat->kswapd_wait); ++ set_kswapd_nice(pgdat->kswapd, active); ++ if (!active) + return; + if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + return; +@@ -3068,20 +3106,57 @@ static int __devinit cpu_callback(struct + } + + /* ++ * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots ++ */ ++static void watermark_wakeup(unsigned long data) ++{ ++ pg_data_t *pgdat = (pg_data_t *)data; ++ struct timer_list *wt = &pgdat->watermark_timer; ++ int i; ++ ++ if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load()) ++ goto out; ++ for (i = pgdat->nr_zones - 1; i >= 0; i--) { ++ struct zone *z = pgdat->node_zones + i; ++ ++ if (!populated_zone(z) || is_highmem(z)) { ++ /* We are better off leaving highmem full */ ++ continue; ++ } ++ if (!zone_watermark_ok(z, 0, lots_wmark_pages(z), 0, 0)) { ++ wake_up_interruptible(&pgdat->kswapd_wait); ++ goto out; ++ } ++ } ++out: ++ mod_timer(wt, jiffies + WT_EXPIRY); ++ return; ++} ++ ++/* + * This kswapd start function will be called by init and node-hot-add. + * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. + */ + int kswapd_run(int nid) + { + pg_data_t *pgdat = NODE_DATA(nid); ++ struct timer_list *wt; + int ret = 0; + + if (pgdat->kswapd) + return 0; + ++ wt = &pgdat->watermark_timer; ++ init_timer(wt); ++ wt->data = (unsigned long)pgdat; ++ wt->function = watermark_wakeup; ++ wt->expires = jiffies + WT_EXPIRY; ++ add_timer(wt); ++ + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ ++ del_timer(wt); + BUG_ON(system_state == SYSTEM_BOOTING); + printk("Failed to start kswapd on node %d\n",nid); + ret = -1; +Index: linux-3.2-ck1/include/linux/swap.h +=================================================================== +--- linux-3.2-ck1.orig/include/linux/swap.h 2012-01-16 10:07:27.777097278 +1100 ++++ linux-3.2-ck1/include/linux/swap.h 2012-01-16 10:07:32.751096930 +1100 +@@ -201,7 +201,7 @@ struct swap_list_t { + int next; /* swapfile to be used next */ + }; + +-/* Swap 50% full? Release swapcache more aggressively.. */ ++/* Swap 50% full? 
*/ + #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) + + /* linux/mm/page_alloc.c */ +@@ -215,6 +215,7 @@ extern unsigned int nr_free_pagecache_pa + + + /* linux/mm/swap.c */ ++extern void ____lru_cache_add(struct page *, enum lru_list lru, bool tail); + extern void __lru_cache_add(struct page *, enum lru_list lru); + extern void lru_cache_add_lru(struct page *, enum lru_list lru); + extern void lru_add_page_tail(struct zone* zone, +@@ -238,9 +239,14 @@ static inline void lru_cache_add_anon(st + __lru_cache_add(page, LRU_INACTIVE_ANON); + } + ++static inline void lru_cache_add_file_tail(struct page *page, bool tail) ++{ ++ ____lru_cache_add(page, LRU_INACTIVE_FILE, tail); ++} ++ + static inline void lru_cache_add_file(struct page *page) + { +- __lru_cache_add(page, LRU_INACTIVE_FILE); ++ ____lru_cache_add(page, LRU_INACTIVE_FILE, false); + } + + /* linux/mm/vmscan.c */ +@@ -350,9 +356,10 @@ extern void grab_swap_token(struct mm_st + extern void __put_swap_token(struct mm_struct *); + extern void disable_swap_token(struct mem_cgroup *memcg); + ++/* Only allow swap token to have effect if swap is full */ + static inline int has_swap_token(struct mm_struct *mm) + { +- return (mm == swap_token_mm); ++ return (mm == swap_token_mm && vm_swap_full()); + } + + static inline void put_swap_token(struct mm_struct *mm) +Index: linux-3.2-ck1/mm/memory.c +=================================================================== +--- linux-3.2-ck1.orig/mm/memory.c 2012-01-16 10:07:27.745097280 +1100 ++++ linux-3.2-ck1/mm/memory.c 2012-01-16 10:07:32.052096979 +1100 +@@ -2984,7 +2984,7 @@ static int do_swap_page(struct mm_struct + mem_cgroup_commit_charge_swapin(page, ptr); + + swap_free(entry); +- if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) ++ if ((vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + try_to_free_swap(page); + unlock_page(page); + if (swapcache) { +Index: linux-3.2-ck1/mm/swapfile.c +=================================================================== +--- linux-3.2-ck1.orig/mm/swapfile.c 2012-01-16 10:07:27.745097280 +1100 ++++ linux-3.2-ck1/mm/swapfile.c 2012-01-16 10:07:32.053096979 +1100 +@@ -288,7 +288,7 @@ checks: + scan_base = offset = si->lowest_bit; + + /* reuse swap entry of cache-only swap if not busy. */ +- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { ++ if (si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; + spin_unlock(&swap_lock); + swap_was_freed = __try_to_reclaim_swap(si, offset); +@@ -377,7 +377,7 @@ scan: + spin_lock(&swap_lock); + goto checks; + } +- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { ++ if (si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&swap_lock); + goto checks; + } +@@ -392,7 +392,7 @@ scan: + spin_lock(&swap_lock); + goto checks; + } +- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { ++ if (si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&swap_lock); + goto checks; + } +@@ -706,8 +706,7 @@ int free_swap_and_cache(swp_entry_t entr + * Not mapped elsewhere, or swap space full? Free it! + * Also recheck PageSwapCache now page is locked (above). 
+ */ +- if (PageSwapCache(page) && !PageWriteback(page) && +- (!page_mapped(page) || vm_swap_full())) { ++ if (PageSwapCache(page) && !PageWriteback(page)) { + delete_from_swap_cache(page); + SetPageDirty(page); + } +Index: linux-3.2-ck1/include/linux/mmzone.h +=================================================================== +--- linux-3.2-ck1.orig/include/linux/mmzone.h 2012-01-16 10:07:27.669097282 +1100 ++++ linux-3.2-ck1/include/linux/mmzone.h 2012-01-16 10:07:32.405096951 +1100 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -181,12 +182,14 @@ enum zone_watermarks { + WMARK_MIN, + WMARK_LOW, + WMARK_HIGH, ++ WMARK_LOTS, + NR_WMARK + }; + + #define min_wmark_pages(z) (z->watermark[WMARK_MIN]) + #define low_wmark_pages(z) (z->watermark[WMARK_LOW]) + #define high_wmark_pages(z) (z->watermark[WMARK_HIGH]) ++#define lots_wmark_pages(z) (z->watermark[WMARK_LOTS]) + + struct per_cpu_pages { + int count; /* number of pages in the list */ +@@ -358,7 +361,7 @@ struct zone { + ZONE_PADDING(_pad1_) + + /* Fields commonly accessed by the page reclaim scanner */ +- spinlock_t lru_lock; ++ spinlock_t lru_lock; + struct zone_lru { + struct list_head list; + } lru[NR_LRU_LISTS]; +@@ -654,6 +657,7 @@ typedef struct pglist_data { + wait_queue_head_t kswapd_wait; + struct task_struct *kswapd; + int kswapd_max_order; ++ struct timer_list watermark_timer; + enum zone_type classzone_idx; + } pg_data_t; + +Index: linux-3.2-ck1/include/linux/mm_inline.h +=================================================================== +--- linux-3.2-ck1.orig/include/linux/mm_inline.h 2012-01-16 10:07:27.614097289 +1100 ++++ linux-3.2-ck1/include/linux/mm_inline.h 2012-01-16 10:07:32.751096930 +1100 +@@ -23,9 +23,12 @@ static inline int page_is_file_cache(str + + static inline void + __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, +- struct list_head *head) ++ struct list_head *head, bool tail) + { +- list_add(&page->lru, head); ++ if (tail) ++ list_add_tail(&page->lru, head); ++ else ++ list_add(&page->lru, head); + __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); + mem_cgroup_add_lru_list(page, l); + } +@@ -33,7 +36,13 @@ __add_page_to_lru_list(struct zone *zone + static inline void + add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) + { +- __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); ++ __add_page_to_lru_list(zone, page, l, &zone->lru[l].list, false); ++} ++ ++static inline void ++add_page_to_lru_list_tail(struct zone *zone, struct page *page, enum lru_list l) ++{ ++ __add_page_to_lru_list(zone, page, l, &zone->lru[l].list, 1); + } + + static inline void +Index: linux-3.2-ck1/mm/filemap.c +=================================================================== +--- linux-3.2-ck1.orig/mm/filemap.c 2012-01-16 10:07:27.615097289 +1100 ++++ linux-3.2-ck1/mm/filemap.c 2012-01-16 10:07:32.752096930 +1100 +@@ -495,16 +495,22 @@ out: + } + EXPORT_SYMBOL(add_to_page_cache_locked); + +-int add_to_page_cache_lru(struct page *page, struct address_space *mapping, +- pgoff_t offset, gfp_t gfp_mask) ++int __add_to_page_cache_lru(struct page *page, struct address_space *mapping, ++ pgoff_t offset, gfp_t gfp_mask, bool tail) + { + int ret; + + ret = add_to_page_cache(page, mapping, offset, gfp_mask); + if (ret == 0) +- lru_cache_add_file(page); ++ lru_cache_add_file_tail(page, tail); + return ret; + } ++ ++int add_to_page_cache_lru(struct page *page, struct address_space *mapping, ++ pgoff_t offset, 
gfp_t gfp_mask) ++{ ++ return __add_to_page_cache_lru(page, mapping, offset, gfp_mask, false); ++} + EXPORT_SYMBOL_GPL(add_to_page_cache_lru); + + #ifdef CONFIG_NUMA +Index: linux-3.2-ck1/mm/swap.c +=================================================================== +--- linux-3.2-ck1.orig/mm/swap.c 2012-01-16 10:07:27.615097289 +1100 ++++ linux-3.2-ck1/mm/swap.c 2012-01-16 10:07:32.753096930 +1100 +@@ -371,15 +371,23 @@ void mark_page_accessed(struct page *pag + + EXPORT_SYMBOL(mark_page_accessed); + +-void __lru_cache_add(struct page *page, enum lru_list lru) ++void ______pagevec_lru_add(struct pagevec *pvec, enum lru_list lru, bool tail); ++ ++void ____lru_cache_add(struct page *page, enum lru_list lru, bool tail) + { + struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; + + page_cache_get(page); + if (!pagevec_add(pvec, page)) +- ____pagevec_lru_add(pvec, lru); ++ ______pagevec_lru_add(pvec, lru, tail); + put_cpu_var(lru_add_pvecs); + } ++EXPORT_SYMBOL(____lru_cache_add); ++ ++void __lru_cache_add(struct page *page, enum lru_list lru) ++{ ++ ____lru_cache_add(page, lru, false); ++} + EXPORT_SYMBOL(__lru_cache_add); + + /** +@@ -387,7 +395,7 @@ EXPORT_SYMBOL(__lru_cache_add); + * @page: the page to be added to the LRU. + * @lru: the LRU list to which the page is added. + */ +-void lru_cache_add_lru(struct page *page, enum lru_list lru) ++void __lru_cache_add_lru(struct page *page, enum lru_list lru, bool tail) + { + if (PageActive(page)) { + VM_BUG_ON(PageUnevictable(page)); +@@ -398,7 +406,12 @@ void lru_cache_add_lru(struct page *page + } + + VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); +- __lru_cache_add(page, lru); ++ ____lru_cache_add(page, lru, tail); ++} ++ ++void lru_cache_add_lru(struct page *page, enum lru_list lru) ++{ ++ __lru_cache_add_lru(page, lru, false); + } + + /** +@@ -685,7 +698,7 @@ void lru_add_page_tail(struct zone* zone + head = page->lru.prev; + else + head = &zone->lru[lru].list; +- __add_page_to_lru_list(zone, page_tail, lru, head); ++ __add_page_to_lru_list(zone, page_tail, lru, head, false); + } else { + SetPageUnevictable(page_tail); + add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); +@@ -714,13 +727,18 @@ static void ____pagevec_lru_add_fn(struc + * Add the passed pages to the LRU, then drop the caller's refcount + * on them. Reinitialises the caller's pagevec. + */ +-void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) ++void ______pagevec_lru_add(struct pagevec *pvec, enum lru_list lru, bool tail) + { + VM_BUG_ON(is_unevictable_lru(lru)); + + pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); + } + ++void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) ++{ ++ ______pagevec_lru_add(pvec, lru, false); ++} ++ + EXPORT_SYMBOL(____pagevec_lru_add); + + /* +Index: linux-3.2-ck1/mm/readahead.c +=================================================================== +--- linux-3.2-ck1.orig/mm/readahead.c 2012-01-16 10:07:27.615097289 +1100 ++++ linux-3.2-ck1/mm/readahead.c 2012-01-16 10:07:32.753096930 +1100 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + + /* + * Initialise a struct file's readahead state. 
Assumes that the caller has +@@ -107,7 +108,7 @@ int read_cache_pages(struct address_spac + EXPORT_SYMBOL(read_cache_pages); + + static int read_pages(struct address_space *mapping, struct file *filp, +- struct list_head *pages, unsigned nr_pages) ++ struct list_head *pages, unsigned nr_pages, bool tail) + { + struct blk_plug plug; + unsigned page_idx; +@@ -125,8 +126,8 @@ static int read_pages(struct address_spa + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_to_page(pages); + list_del(&page->lru); +- if (!add_to_page_cache_lru(page, mapping, +- page->index, GFP_KERNEL)) { ++ if (!__add_to_page_cache_lru(page, mapping, ++ page->index, GFP_KERNEL, tail)) { + mapping->a_ops->readpage(filp, page); + } + page_cache_release(page); +@@ -139,6 +140,28 @@ out: + return ret; + } + ++static inline int nr_mapped(void) ++{ ++ return global_page_state(NR_FILE_MAPPED) + ++ global_page_state(NR_ANON_PAGES); ++} ++ ++/* ++ * This examines how large in pages a file size is and returns 1 if it is ++ * more than half the unmapped ram. Avoid doing read_page_state which is ++ * expensive unless we already know it is likely to be large enough. ++ */ ++static int large_isize(unsigned long nr_pages) ++{ ++ if (nr_pages * 6 > vm_total_pages) { ++ unsigned long unmapped_ram = vm_total_pages - nr_mapped(); ++ ++ if (nr_pages * 2 > unmapped_ram) ++ return 1; ++ } ++ return 0; ++} ++ + /* + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all + * the pages first, then submits them all for I/O. This avoids the very bad +@@ -196,7 +219,8 @@ __do_page_cache_readahead(struct address + * will then handle the error. + */ + if (ret) +- read_pages(mapping, filp, &page_pool, ret); ++ read_pages(mapping, filp, &page_pool, ret, ++ large_isize(end_index)); + BUG_ON(!list_empty(&page_pool)); + out: + return ret; +Index: linux-3.2-ck1/include/linux/pagemap.h +=================================================================== +--- linux-3.2-ck1.orig/include/linux/pagemap.h 2012-01-16 10:07:27.615097289 +1100 ++++ linux-3.2-ck1/include/linux/pagemap.h 2012-01-16 10:07:32.754096930 +1100 +@@ -456,6 +456,8 @@ int add_to_page_cache_locked(struct page + pgoff_t index, gfp_t gfp_mask); + int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t index, gfp_t gfp_mask); ++int __add_to_page_cache_lru(struct page *page, struct address_space *mapping, ++ pgoff_t offset, gfp_t gfp_mask, bool tail); + extern void delete_from_page_cache(struct page *page); + extern void __delete_from_page_cache(struct page *page); + int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); +Index: linux-3.2-ck1/mm/page-writeback.c +=================================================================== +--- linux-3.2-ck1.orig/mm/page-writeback.c 2012-01-16 10:07:27.594097290 +1100 ++++ linux-3.2-ck1/mm/page-writeback.c 2012-01-16 10:07:32.967096915 +1100 +@@ -59,7 +59,7 @@ static long ratelimit_pages = 32; + /* + * Start background writeback (via writeback threads) at this percentage + */ +-int dirty_background_ratio = 10; ++int dirty_background_ratio = 1; + + /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of +@@ -76,7 +76,7 @@ int vm_highmem_is_dirtyable; + /* + * The generator of dirty data starts writeback at this percentage + */ +-int vm_dirty_ratio = 20; ++int vm_dirty_ratio = 1; + + /* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of +Index: linux-3.2-ck1/arch/x86/Kconfig 
+=================================================================== +--- linux-3.2-ck1.orig/arch/x86/Kconfig 2012-01-16 10:07:27.563097292 +1100 ++++ linux-3.2-ck1/arch/x86/Kconfig 2012-01-16 10:07:33.128096904 +1100 +@@ -1076,7 +1076,7 @@ endchoice + + choice + depends on EXPERIMENTAL +- prompt "Memory split" if EXPERT ++ prompt "Memory split" + default VMSPLIT_3G + depends on X86_32 + ---help--- +@@ -1096,17 +1096,17 @@ choice + option alone! + + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !X86_PAE +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !X86_PAE +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +Index: linux-3.2-ck1/kernel/Kconfig.hz +=================================================================== +--- linux-3.2-ck1.orig/kernel/Kconfig.hz 2012-01-16 10:07:27.544097294 +1100 ++++ linux-3.2-ck1/kernel/Kconfig.hz 2012-01-16 10:07:33.619096868 +1100 +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_1000 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -23,13 +23,14 @@ choice + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEFAULT + bool "250 HZ" + help +- 250 Hz is a good compromise choice allowing server performance +- while also showing good interactive responsiveness even +- on SMP and NUMA systems. If you are going to be using NTSC video +- or multimedia, selected 300Hz instead. ++ 250 HZ is a lousy compromise choice allowing server interactivity ++ while also showing desktop throughput and no extra power saving on ++ laptops. No good for anything. ++ ++ Recommend 100 or 1000 instead. + + config HZ_300 + bool "300 HZ" +@@ -43,16 +44,82 @@ choice + bool "1000 HZ" + help + 1000 Hz is the preferred choice for desktop systems and other +- systems requiring fast interactive responses to events. ++ systems requiring fast interactive responses to events. Laptops ++ can also benefit from this choice without sacrificing battery life ++ if dynticks is also enabled. ++ ++ config HZ_1500 ++ bool "1500 HZ" ++ help ++ 1500 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_2000 ++ bool "2000 HZ" ++ help ++ 2000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_3000 ++ bool "3000 HZ" ++ help ++ 3000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_4000 ++ bool "4000 HZ" ++ help ++ 4000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_5000 ++ bool "5000 HZ" ++ help ++ 5000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. 
++ ++ config HZ_7500 ++ bool "7500 HZ" ++ help ++ 7500 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_10000 ++ bool "10000 HZ" ++ help ++ 10000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ + + endchoice + + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 ++ default 250 if HZ_250_NODEFAULT + default 300 if HZ_300 + default 1000 if HZ_1000 ++ default 1500 if HZ_1500 ++ default 2000 if HZ_2000 ++ default 3000 if HZ_3000 ++ default 4000 if HZ_4000 ++ default 5000 if HZ_5000 ++ default 7500 if HZ_7500 ++ default 10000 if HZ_10000 + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) +Index: linux-3.2-ck1/arch/x86/kernel/cpu/proc.c +=================================================================== +--- linux-3.2-ck1.orig/arch/x86/kernel/cpu/proc.c 2012-01-16 10:07:27.477097298 +1100 ++++ linux-3.2-ck1/arch/x86/kernel/cpu/proc.c 2012-01-16 10:07:33.618096869 +1100 +@@ -111,7 +111,7 @@ static int show_cpuinfo(struct seq_file + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10 /(50000/HZ)) % 100); + + #ifdef CONFIG_X86_64 + if (c->x86_tlbsize > 0) +Index: linux-3.2-ck1/arch/x86/kernel/smpboot.c +=================================================================== +--- linux-3.2-ck1.orig/arch/x86/kernel/smpboot.c 2012-01-16 10:07:27.477097298 +1100 ++++ linux-3.2-ck1/arch/x86/kernel/smpboot.c 2012-01-16 10:07:33.619096868 +1100 +@@ -430,7 +430,7 @@ static void impress_friends(void) + "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + num_online_cpus(), + bogosum/(500000/HZ), +- (bogosum/(5000/HZ))%100); ++ (bogosum * 10/(50000/HZ))%100); + + pr_debug("Before bogocount - setting activated=1.\n"); + } +Index: linux-3.2-ck1/include/linux/nfsd/stats.h +=================================================================== +--- linux-3.2-ck1.orig/include/linux/nfsd/stats.h 2012-01-16 10:07:27.477097298 +1100 ++++ linux-3.2-ck1/include/linux/nfsd/stats.h 2012-01-16 10:07:33.619096868 +1100 +@@ -11,8 +11,8 @@ + + #include + +-/* thread usage wraps very million seconds (approx one fortnight) */ +-#define NFSD_USAGE_WRAP (HZ*1000000) ++/* thread usage wraps every one hundred thousand seconds (approx one day) */ ++#define NFSD_USAGE_WRAP (HZ*100000) + + #ifdef __KERNEL__ + +Index: linux-3.2-ck1/include/net/inet_timewait_sock.h +=================================================================== +--- linux-3.2-ck1.orig/include/net/inet_timewait_sock.h 2012-01-16 10:07:27.477097298 +1100 ++++ linux-3.2-ck1/include/net/inet_timewait_sock.h 2012-01-16 10:07:33.619096868 +1100 +@@ -38,8 +38,8 @@ struct inet_hashinfo; + * If time > 4sec, it is "slow" path, no recycling is required, + * so that we select tick to get range about 4 seconds. 
+ */ +-#if HZ <= 16 || HZ > 4096 +-# error Unsupported: HZ <= 16 or HZ > 4096 ++#if HZ <= 16 || HZ > 16384 ++# error Unsupported: HZ <= 16 or HZ > 16384 + #elif HZ <= 32 + # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 64 +@@ -54,8 +54,12 @@ struct inet_hashinfo; + # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 2048 + # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +-#else ++#elif HZ <= 4096 + # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#elif HZ <= 8192 ++# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#else ++# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #endif + + /* TIME_WAIT reaping mechanism. */ +Index: linux-3.2-ck1/init/calibrate.c +=================================================================== +--- linux-3.2-ck1.orig/init/calibrate.c 2012-01-16 10:07:27.477097298 +1100 ++++ linux-3.2-ck1/init/calibrate.c 2012-01-16 10:07:33.619096868 +1100 +@@ -278,7 +278,7 @@ void __cpuinit calibrate_delay(void) + if (!printed) + pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n", + lpj/(500000/HZ), +- (lpj/(5000/HZ)) % 100, lpj); ++ (lpj * 10 /(50000 / HZ)) % 100, lpj); + + loops_per_jiffy = lpj; + printed = true; +Index: linux-3.2-ck1/kernel/Kconfig.preempt +=================================================================== +--- linux-3.2-ck1.orig/kernel/Kconfig.preempt 2012-01-16 10:07:27.461097300 +1100 ++++ linux-3.2-ck1/kernel/Kconfig.preempt 2012-01-16 10:07:33.836096853 +1100 +@@ -1,7 +1,7 @@ + + choice + prompt "Preemption Model" +- default PREEMPT_NONE ++ default PREEMPT + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" +@@ -17,7 +17,7 @@ config PREEMPT_NONE + latencies. + + config PREEMPT_VOLUNTARY +- bool "Voluntary Kernel Preemption (Desktop)" ++ bool "Voluntary Kernel Preemption (Nothing)" + help + This option reduces the latency of the kernel by adding more + "explicit preemption points" to the kernel code. These new +@@ -31,7 +31,8 @@ config PREEMPT_VOLUNTARY + applications to run more 'smoothly' even when the system is + under load. + +- Select this if you are building a kernel for a desktop system. ++ Select this for no system in particular (choose Preemptible ++ instead on a desktop if you know what's good for you). + + config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" +Index: linux-3.2-ck1/Makefile +=================================================================== +--- linux-3.2-ck1.orig/Makefile 2012-01-16 10:07:27.444097302 +1100 ++++ linux-3.2-ck1/Makefile 2012-01-16 10:07:33.997096843 +1100 +@@ -10,6 +10,10 @@ NAME = Saber-toothed Squirrel + # Comments in this file are targeted only to the developer, do not + # expect to learn how to build the kernel reading this file. 
+ ++CKVERSION = -ck1 ++CKNAME = BFS Powered ++EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) ++ + # Do not: + # o use make's built-in rules and variables + # (this increases performance and avoids hard-to-debug behaviour); diff --git a/3.2.34/3rd-3rdparty-1.0-tree.patch b/3.2.34/3rd-3rdparty-1.0-tree.patch new file mode 100644 index 0000000..2a6ed72 --- /dev/null +++ b/3.2.34/3rd-3rdparty-1.0-tree.patch @@ -0,0 +1,181 @@ + + 3rdparty/mkbuild.pl | 92 +++++++++++++++++++++++++++++++++++++++++++++ + Documentation/3rdparty.txt | 76 +++++++++++++++++++++++++++++++++++++ + 2 files changed, 168 insertions(+) + +diff -Nurp linux-2.6.37/3rdparty/mkbuild.pl 3rdparty/mkbuild.pl +--- linux-2.6.37/3rdparty/mkbuild.pl 1970-01-01 02:00:00.000000000 +0200 ++++ 3rdparty/mkbuild.pl 2004-04-23 14:59:03.000000000 +0300 +@@ -0,0 +1,92 @@ ++#!/usr/bin/perl -w ++# ++# Version 1.0 ++# ++# Copyright 2001 Jeff Garzik ++# Copyright 2002 Juan Quintela ++# Copyright 2003 Nicolas Planel ++# ++# This software may be used and distributed according to the terms ++# of the GNU General Public License, incorporated herein by reference. ++# ++# ++# Run "mkbuild.pl" ++# ++# This program generates the following files ++# Makefile ++# Makefile.drivers ++# Config.in ++# using the information in the subdirs of this directory. ++# ++# subdirs need to have: ++# a Config.in file ++# a Makefile with a O_TARGET/L_TARGET targets ++# The config.in should set a CONFIG_ to m/y. ++ ++use strict; ++ ++opendir(THISDIR, "."); ++# get dirs without . and .. garbage ++my (@modules) = grep(!/\.\.?$/,grep(-d, readdir(THISDIR))); ++closedir(THISDIR); ++ ++generate_kconfig(@modules); ++generate_makefile(@modules); ++exit(0); ++ ++########################################################################## ++ ++sub generate_makefile { ++ my (@modules) = @_; ++ ++ local *F; ++ open F, "> Makefile" or die "Cannot create new Makefile: $!\n"; ++ print F <<'EOM'; ++# ++# THIS IS AN AUTOMATICALLY GENERATED FILE. DO NOT EDIT. ++# ++ ++EOM ++ printf F "obj- := 3rdparty.o # Dummy rule to force built-in.o to be made\n"; ++ printf F "obj-\$(%s) += %s\n", to_CONFIG($_), $_ . '/' foreach @modules; ++} ++ ++sub generate_kconfig { ++ my (@modules) = @_; ++ ++ local *F; ++ open F, "> Kconfig" or die "Cannot create Kconfig: $!\n"; ++ print F <<"EOM"; ++# ++# THIS IS AN AUTOMATICALLY GENERATED FILE. DO NOT EDIT. ++# ++ ++menu "Unofficial 3rd party kernel additions" ++ ++EOM ++ ++ foreach (@modules) { ++ die "No Kconfig in $_.\n" if ! -r "$_/Kconfig"; ++ print F "source 3rdparty/$_/Kconfig\n"; ++ } ++ print F "\n\nendmenu\n"; ++} ++ ++sub to_CONFIG { ++ local $_ = $_[0]; ++ tr/a-z/A-Z/; ++ s/[\-\. ]/_/g; ++ "CONFIG_$_"; ++} ++ ++sub find_target { ++ my ($module_dir) = @_; ++ ++ local *F; ++ open(F, "$module_dir/Makefile") or die "$module_dir/Makefile: $!\n"; ++ while () { ++ chomp; ++ return $1 if (/[LO]_TARGET.*:=\s+(\S+)/); ++ } ++} ++ +diff -Nurp linux-2.6.37/Documentation/3rdparty.txt Documentation/3rdparty.txt +--- linux-2.6.37/Documentation/3rdparty.txt 1970-01-01 02:00:00.000000000 +0200 ++++ Documentation/3rdparty.txt 2003-11-22 01:07:26.000000000 +0200 +@@ -0,0 +1,76 @@ ++ ++Third-Party Kernel Source Module Support, or ++an easy way to add modules to your kernel build. ++ ++ ++ ++Vendors quite often add additional drivers and features to the kernel ++which require nothing more than modifying Kconfig, Makefile, and ++adding one or more files to a sub-directory. As a single discrete task, ++this is not a problem. 
However, using patches to add modules to the ++kernel very often results in patch conflicts, resulting in needless time ++wastage as developers regenerate an otherwise working kernel patch. ++ ++This is designed as a solution to these problems. It is NOT designed as ++a replacement for the kernel build system, but merely as a tool for ++vendors and system administrators to ease the pain of patch management. ++ ++The key feature of this system is the distinct lack of patches. Drivers ++are installed via unpacking a tarball. ++ ++ ++ ++Adding a directory to the build (usually from a tarball) ++-------------------------------------------------------- ++If a directory exists inside the 3rdparty sub-directory that contains a ++proper Makefile, it can be added to the build. It also needs a ++Kconfig file. ++ ++ cd /usr/src/linux-2.4.3/3rdparty ++ bzcat /tmp/my-driver2.tar.bz2 | tar xf - # creates "my2" dir ++ ++ ++Limitations ++----------- ++There are some limitations to this system. This system is only ++designed to support a very common case. If you find yourself running ++into limitations (kernel build experts can spot them right off), ++then you should probably be patching the kernel instead of using ++mkbuild.pl for that particular module. ++ ++FIXME: actually list the limitations ++ ++ ++ ++Other notes ++----------- ++Link order is controlled by the order of mkbuild.pl executions. ++ ++"make mrproper" will erase Makefile.meta, and empty Kconfig, Makefile, ++and Makefile.drivers. ++ ++IMPORTANT NOTE: Because this feature modifies the kernel's makefiles and ++configuration system, you MUST complete all mkbuild.pl runs before ++running any "make" command. ++ ++Building in the 3rdparty dir ++---------------------------- ++ ++If you use modules that: ++ - are contained in one subdir with the name of the module ++ - has a Makefile ++ - has a Kconfig file ++ ++The system calls the ./mkbuild.pl script. It will search for ++subdirectories, and will try to build each of them as a module. ++Things to note: ++ ++ The dependencies will be done in a module called: ++ ++ 3rdparty// ++ ++depending of CONFIG_. ++ ++ is the value of O_TARGET/L_TARGET. 
++ ++ diff --git a/3.2.34/3rd-3rdparty-button_hotplug-0.4.1.patch b/3.2.34/3rd-3rdparty-button_hotplug-0.4.1.patch new file mode 100644 index 0000000..a7b4a15 --- /dev/null +++ b/3.2.34/3rd-3rdparty-button_hotplug-0.4.1.patch @@ -0,0 +1,372 @@ +Submitted By: Mario Fetka (mario dot fetka at gmail dot com) +Date: 2012-11-18 +Initial Package Version: 3.2.33 +Origin: openwtr.org packages/system/button-hotplug +Upstream Status: unknown +Description: create uevents from button usage + +diff -Naur linux-3.2.33-go.orig/3rdparty/button_hotplug/Kconfig 3rdparty/button_hotplug/Kconfig +--- linux-3.2.33-go.orig/3rdparty/button_hotplug/Kconfig 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/button_hotplug/Kconfig 2012-11-18 14:45:26.000000000 +0000 +@@ -0,0 +1,2 @@ ++config BUTTON_HOTPLUG ++ tristate "Button Hotplug driver" +diff -Naur linux-3.2.33-go.orig/3rdparty/button_hotplug/Makefile 3rdparty/button_hotplug/Makefile +--- linux-3.2.33-go.orig/3rdparty/button_hotplug/Makefile 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/button_hotplug/Makefile 2012-11-18 14:45:26.000000000 +0000 +@@ -0,0 +1 @@ ++obj-${CONFIG_BUTTON_HOTPLUG} += button-hotplug.o +\ No newline at end of file +diff -Naur linux-3.2.33-go.orig/3rdparty/button_hotplug/button-hotplug.c 3rdparty/button_hotplug/button-hotplug.c +--- linux-3.2.33-go.orig/3rdparty/button_hotplug/button-hotplug.c 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/button_hotplug/button-hotplug.c 2012-11-18 14:45:26.000000000 +0000 +@@ -0,0 +1,349 @@ ++/* ++ * Button Hotplug driver ++ * ++ * Copyright (C) 2008-2010 Gabor Juhos ++ * ++ * Based on the diag.c - GPIO interface driver for Broadcom boards ++ * Copyright (C) 2006 Mike Baker , ++ * Copyright (C) 2006-2007 Felix Fietkau ++ * Copyright (C) 2008 Andy Boyett ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published ++ * by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#define DRV_NAME "button-hotplug" ++#define DRV_VERSION "0.4.1" ++#define DRV_DESC "Button Hotplug driver" ++ ++#define BH_SKB_SIZE 2048 ++ ++#define PFX DRV_NAME ": " ++ ++#undef BH_DEBUG ++ ++#ifdef BH_DEBUG ++#define BH_DBG(fmt, args...) printk(KERN_DEBUG "%s: " fmt, DRV_NAME, ##args ) ++#else ++#define BH_DBG(fmt, args...) do {} while (0) ++#endif ++ ++#define BH_ERR(fmt, args...) 
printk(KERN_ERR "%s: " fmt, DRV_NAME, ##args ) ++ ++#ifndef BIT_MASK ++#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) ++#endif ++ ++struct bh_priv { ++ unsigned long *seen; ++ struct input_handle handle; ++}; ++ ++struct bh_event { ++ const char *name; ++ char *action; ++ unsigned long seen; ++ ++ struct sk_buff *skb; ++ struct work_struct work; ++}; ++ ++struct bh_map { ++ unsigned int code; ++ const char *name; ++}; ++ ++extern u64 uevent_next_seqnum(void); ++ ++#define BH_MAP(_code, _name) \ ++ { \ ++ .code = (_code), \ ++ .name = (_name), \ ++ } ++ ++static struct bh_map button_map[] = { ++ BH_MAP(BTN_0, "BTN_0"), ++ BH_MAP(BTN_1, "BTN_1"), ++ BH_MAP(BTN_2, "BTN_2"), ++ BH_MAP(BTN_3, "BTN_3"), ++ BH_MAP(BTN_4, "BTN_4"), ++ BH_MAP(BTN_5, "BTN_5"), ++ BH_MAP(BTN_6, "BTN_6"), ++ BH_MAP(BTN_7, "BTN_7"), ++ BH_MAP(BTN_8, "BTN_8"), ++ BH_MAP(BTN_9, "BTN_9"), ++ BH_MAP(KEY_RESTART, "reset"), ++#ifdef KEY_WPS_BUTTON ++ BH_MAP(KEY_WPS_BUTTON, "wps"), ++#endif /* KEY_WPS_BUTTON */ ++}; ++ ++/* -------------------------------------------------------------------------*/ ++ ++static int bh_event_add_var(struct bh_event *event, int argv, ++ const char *format, ...) ++{ ++ static char buf[128]; ++ char *s; ++ va_list args; ++ int len; ++ ++ if (argv) ++ return 0; ++ ++ va_start(args, format); ++ len = vsnprintf(buf, sizeof(buf), format, args); ++ va_end(args); ++ ++ if (len >= sizeof(buf)) { ++ BH_ERR("buffer size too small\n"); ++ WARN_ON(1); ++ return -ENOMEM; ++ } ++ ++ s = skb_put(event->skb, len + 1); ++ strcpy(s, buf); ++ ++ BH_DBG("added variable '%s'\n", s); ++ ++ return 0; ++} ++ ++static int button_hotplug_fill_event(struct bh_event *event) ++{ ++ int ret; ++ ++ ret = bh_event_add_var(event, 0, "HOME=%s", "/"); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "PATH=%s", ++ "/sbin:/bin:/usr/sbin:/usr/bin"); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "SUBSYSTEM=%s", "button"); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "ACTION=%s", event->action); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "BUTTON=%s", event->name); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "SEEN=%ld", event->seen); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "SEQNUM=%llu", uevent_next_seqnum()); ++ ++ return ret; ++} ++ ++static void button_hotplug_work(struct work_struct *work) ++{ ++ struct bh_event *event = container_of(work, struct bh_event, work); ++ int ret = 0; ++ ++ event->skb = alloc_skb(BH_SKB_SIZE, GFP_KERNEL); ++ if (!event->skb) ++ goto out_free_event; ++ ++ ret = bh_event_add_var(event, 0, "%s@", event->action); ++ if (ret) ++ goto out_free_skb; ++ ++ ret = button_hotplug_fill_event(event); ++ if (ret) ++ goto out_free_skb; ++ ++ NETLINK_CB(event->skb).dst_group = 1; ++ broadcast_uevent(event->skb, 0, 1, GFP_KERNEL); ++ ++ out_free_skb: ++ if (ret) { ++ BH_ERR("work error %d\n", ret); ++ kfree_skb(event->skb); ++ } ++ out_free_event: ++ kfree(event); ++} ++ ++static int button_hotplug_create_event(const char *name, unsigned long seen, ++ int pressed) ++{ ++ struct bh_event *event; ++ ++ BH_DBG("create event, name=%s, seen=%lu, pressed=%d\n", ++ name, seen, pressed); ++ ++ event = kzalloc(sizeof(*event), GFP_KERNEL); ++ if (!event) ++ return -ENOMEM; ++ ++ event->name = name; ++ event->seen = seen; ++ event->action = pressed ? 
"pressed" : "released"; ++ ++ INIT_WORK(&event->work, (void *)(void *)button_hotplug_work); ++ schedule_work(&event->work); ++ ++ return 0; ++} ++ ++/* -------------------------------------------------------------------------*/ ++ ++#ifdef CONFIG_HOTPLUG ++static int button_get_index(unsigned int code) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(button_map); i++) ++ if (button_map[i].code == code) ++ return i; ++ ++ return -1; ++} ++static void button_hotplug_event(struct input_handle *handle, ++ unsigned int type, unsigned int code, int value) ++{ ++ struct bh_priv *priv = handle->private; ++ unsigned long seen = jiffies; ++ int btn; ++ ++ BH_DBG("event type=%u, code=%u, value=%d\n", type, code, value); ++ ++ if (type != EV_KEY) ++ return; ++ ++ btn = button_get_index(code); ++ if (btn < 0) ++ return; ++ ++ button_hotplug_create_event(button_map[btn].name, ++ (seen - priv->seen[btn]) / HZ, value); ++ priv->seen[btn] = seen; ++} ++#else ++static void button_hotplug_event(struct input_handle *handle, ++ unsigned int type, unsigned int code, int value) ++{ ++} ++#endif /* CONFIG_HOTPLUG */ ++ ++static int button_hotplug_connect(struct input_handler *handler, ++ struct input_dev *dev, const struct input_device_id *id) ++{ ++ struct bh_priv *priv; ++ int ret; ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(button_map); i++) ++ if (test_bit(button_map[i].code, dev->keybit)) ++ break; ++ ++ if (i == ARRAY_SIZE(button_map)) ++ return -ENODEV; ++ ++ priv = kzalloc(sizeof(*priv) + ++ (sizeof(unsigned long) * ARRAY_SIZE(button_map)), ++ GFP_KERNEL); ++ if (!priv) ++ return -ENOMEM; ++ ++ priv->seen = (unsigned long *) &priv[1]; ++ priv->handle.private = priv; ++ priv->handle.dev = dev; ++ priv->handle.handler = handler; ++ priv->handle.name = DRV_NAME; ++ ++ ret = input_register_handle(&priv->handle); ++ if (ret) ++ goto err_free_priv; ++ ++ ret = input_open_device(&priv->handle); ++ if (ret) ++ goto err_unregister_handle; ++ ++ BH_DBG("connected to %s\n", dev->name); ++ ++ return 0; ++ ++ err_unregister_handle: ++ input_unregister_handle(&priv->handle); ++ ++ err_free_priv: ++ kfree(priv); ++ return ret; ++} ++ ++static void button_hotplug_disconnect(struct input_handle *handle) ++{ ++ struct bh_priv *priv = handle->private; ++ ++ input_close_device(handle); ++ input_unregister_handle(handle); ++ ++ kfree(priv); ++} ++ ++static const struct input_device_id button_hotplug_ids[] = { ++ { ++ .flags = INPUT_DEVICE_ID_MATCH_EVBIT, ++ .evbit = { BIT_MASK(EV_KEY) }, ++ }, ++ { ++ /* Terminating entry */ ++ }, ++}; ++ ++MODULE_DEVICE_TABLE(input, button_hotplug_ids); ++ ++static struct input_handler button_hotplug_handler = { ++ .event = button_hotplug_event, ++ .connect = button_hotplug_connect, ++ .disconnect = button_hotplug_disconnect, ++ .name = DRV_NAME, ++ .id_table = button_hotplug_ids, ++}; ++ ++/* -------------------------------------------------------------------------*/ ++ ++static int __init button_hotplug_init(void) ++{ ++ int ret; ++ ++ printk(KERN_INFO DRV_DESC " version " DRV_VERSION "\n"); ++ ret = input_register_handler(&button_hotplug_handler); ++ if (ret) ++ BH_ERR("unable to register input handler\n"); ++ ++ return ret; ++} ++module_init(button_hotplug_init); ++ ++static void __exit button_hotplug_exit(void) ++{ ++ input_unregister_handler(&button_hotplug_handler); ++} ++module_exit(button_hotplug_exit); ++ ++MODULE_DESCRIPTION(DRV_DESC); ++MODULE_VERSION(DRV_VERSION); ++MODULE_AUTHOR("Gabor Juhos "); ++MODULE_LICENSE("GPL v2"); ++ diff --git 
a/3.2.34/3rd-3rdparty-gpio_button_hotplug-0.1.patch b/3.2.34/3rd-3rdparty-gpio_button_hotplug-0.1.patch new file mode 100644 index 0000000..6b2e78e --- /dev/null +++ b/3.2.34/3rd-3rdparty-gpio_button_hotplug-0.1.patch @@ -0,0 +1,472 @@ +Submitted By: Mario Fetka (mario dot fetka at gmail dot com) +Date: 2012-11-18 +Initial Package Version: 3.2.33 +Origin: openwtr.org packages/system/gpio-button-hotplug +Upstream Status: unknown +Description: gpio button uevent + +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_button_hotplug/Kconfig 3rdparty/gpio_button_hotplug/Kconfig +--- linux-3.2.33-go.orig/3rdparty/gpio_button_hotplug/Kconfig 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_button_hotplug/Kconfig 2012-11-18 18:41:43.048939468 +0000 +@@ -0,0 +1,2 @@ ++config GPIO_BUTTON_HOTPLUG ++ tristate "GPIO Button Hotplug driver" +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_button_hotplug/Makefile 3rdparty/gpio_button_hotplug/Makefile +--- linux-3.2.33-go.orig/3rdparty/gpio_button_hotplug/Makefile 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_button_hotplug/Makefile 2012-11-18 14:45:26.000000000 +0000 +@@ -0,0 +1 @@ ++obj-${CONFIG_GPIO_BUTTON_HOTPLUG} += gpio-button-hotplug.o +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_button_hotplug/gpio-button-hotplug.c 3rdparty/gpio_button_hotplug/gpio-button-hotplug.c +--- linux-3.2.33-go.orig/3rdparty/gpio_button_hotplug/gpio-button-hotplug.c 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_button_hotplug/gpio-button-hotplug.c 2012-11-18 14:45:26.000000000 +0000 +@@ -0,0 +1,450 @@ ++/* ++ * GPIO Button Hotplug driver ++ * ++ * Copyright (C) 2012 Felix Fietkau ++ * Copyright (C) 2008-2010 Gabor Juhos ++ * ++ * Based on the diag.c - GPIO interface driver for Broadcom boards ++ * Copyright (C) 2006 Mike Baker , ++ * Copyright (C) 2006-2007 Felix Fietkau ++ * Copyright (C) 2008 Andy Boyett ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published ++ * by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define DRV_NAME "gpio-keys-polled" ++ ++#define BH_SKB_SIZE 2048 ++ ++#define PFX DRV_NAME ": " ++ ++#undef BH_DEBUG ++ ++#ifdef BH_DEBUG ++#define BH_DBG(fmt, args...) printk(KERN_DEBUG "%s: " fmt, DRV_NAME, ##args ) ++#else ++#define BH_DBG(fmt, args...) do {} while (0) ++#endif ++ ++#define BH_ERR(fmt, args...) 
printk(KERN_ERR "%s: " fmt, DRV_NAME, ##args ) ++ ++struct bh_priv { ++ unsigned long seen; ++}; ++ ++struct bh_event { ++ const char *name; ++ char *action; ++ unsigned long seen; ++ ++ struct sk_buff *skb; ++ struct work_struct work; ++}; ++ ++struct bh_map { ++ unsigned int code; ++ const char *name; ++}; ++ ++struct gpio_keys_button_data { ++ struct delayed_work work; ++ struct bh_priv bh; ++ int last_state; ++ int count; ++ int threshold; ++ int can_sleep; ++}; ++ ++extern u64 uevent_next_seqnum(void); ++ ++#define BH_MAP(_code, _name) \ ++ { \ ++ .code = (_code), \ ++ .name = (_name), \ ++ } ++ ++static struct bh_map button_map[] = { ++ BH_MAP(BTN_0, "BTN_0"), ++ BH_MAP(BTN_1, "BTN_1"), ++ BH_MAP(BTN_2, "BTN_2"), ++ BH_MAP(BTN_3, "BTN_3"), ++ BH_MAP(BTN_4, "BTN_4"), ++ BH_MAP(BTN_5, "BTN_5"), ++ BH_MAP(BTN_6, "BTN_6"), ++ BH_MAP(BTN_7, "BTN_7"), ++ BH_MAP(BTN_8, "BTN_8"), ++ BH_MAP(BTN_9, "BTN_9"), ++ BH_MAP(KEY_RESTART, "reset"), ++#ifdef KEY_WPS_BUTTON ++ BH_MAP(KEY_WPS_BUTTON, "wps"), ++#endif /* KEY_WPS_BUTTON */ ++}; ++ ++/* -------------------------------------------------------------------------*/ ++ ++static int bh_event_add_var(struct bh_event *event, int argv, ++ const char *format, ...) ++{ ++ static char buf[128]; ++ char *s; ++ va_list args; ++ int len; ++ ++ if (argv) ++ return 0; ++ ++ va_start(args, format); ++ len = vsnprintf(buf, sizeof(buf), format, args); ++ va_end(args); ++ ++ if (len >= sizeof(buf)) { ++ BH_ERR("buffer size too small\n"); ++ WARN_ON(1); ++ return -ENOMEM; ++ } ++ ++ s = skb_put(event->skb, len + 1); ++ strcpy(s, buf); ++ ++ BH_DBG("added variable '%s'\n", s); ++ ++ return 0; ++} ++ ++static int button_hotplug_fill_event(struct bh_event *event) ++{ ++ int ret; ++ ++ ret = bh_event_add_var(event, 0, "HOME=%s", "/"); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "PATH=%s", ++ "/sbin:/bin:/usr/sbin:/usr/bin"); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "SUBSYSTEM=%s", "button"); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "ACTION=%s", event->action); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "BUTTON=%s", event->name); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "SEEN=%ld", event->seen); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "SEQNUM=%llu", uevent_next_seqnum()); ++ ++ return ret; ++} ++ ++static void button_hotplug_work(struct work_struct *work) ++{ ++ struct bh_event *event = container_of(work, struct bh_event, work); ++ int ret = 0; ++ ++ event->skb = alloc_skb(BH_SKB_SIZE, GFP_KERNEL); ++ if (!event->skb) ++ goto out_free_event; ++ ++ ret = bh_event_add_var(event, 0, "%s@", event->action); ++ if (ret) ++ goto out_free_skb; ++ ++ ret = button_hotplug_fill_event(event); ++ if (ret) ++ goto out_free_skb; ++ ++ NETLINK_CB(event->skb).dst_group = 1; ++ broadcast_uevent(event->skb, 0, 1, GFP_KERNEL); ++ ++ out_free_skb: ++ if (ret) { ++ BH_ERR("work error %d\n", ret); ++ kfree_skb(event->skb); ++ } ++ out_free_event: ++ kfree(event); ++} ++ ++static int button_hotplug_create_event(const char *name, unsigned long seen, ++ int pressed) ++{ ++ struct bh_event *event; ++ ++ BH_DBG("create event, name=%s, seen=%lu, pressed=%d\n", ++ name, seen, pressed); ++ ++ event = kzalloc(sizeof(*event), GFP_KERNEL); ++ if (!event) ++ return -ENOMEM; ++ ++ event->name = name; ++ event->seen = seen; ++ event->action = pressed ? 
"pressed" : "released"; ++ ++ INIT_WORK(&event->work, (void *)(void *)button_hotplug_work); ++ schedule_work(&event->work); ++ ++ return 0; ++} ++ ++/* -------------------------------------------------------------------------*/ ++ ++#ifdef CONFIG_HOTPLUG ++static int button_get_index(unsigned int code) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(button_map); i++) ++ if (button_map[i].code == code) ++ return i; ++ ++ return -1; ++} ++static void button_hotplug_event(struct gpio_keys_button_data *data, ++ unsigned int type, unsigned int code, int value) ++{ ++ struct bh_priv *priv = &data->bh; ++ unsigned long seen = jiffies; ++ int btn; ++ ++ BH_DBG("event type=%u, code=%u, value=%d\n", type, code, value); ++ ++ if (type != EV_KEY) ++ return; ++ ++ btn = button_get_index(code); ++ if (btn < 0) ++ return; ++ ++ button_hotplug_create_event(button_map[btn].name, ++ (seen - priv->seen) / HZ, value); ++ priv->seen = seen; ++} ++#else ++static void button_hotplug_event(struct gpio_keys_button_data *data, ++ unsigned int type, unsigned int code, int value) ++{ ++} ++#endif /* CONFIG_HOTPLUG */ ++ ++struct gpio_keys_polled_dev { ++ struct delayed_work work; ++ ++ struct device *dev; ++ struct gpio_keys_platform_data *pdata; ++ struct gpio_keys_button_data data[0]; ++}; ++ ++static void gpio_keys_polled_check_state(struct gpio_keys_button *button, ++ struct gpio_keys_button_data *bdata) ++{ ++ int state; ++ ++ if (bdata->can_sleep) ++ state = !!gpio_get_value_cansleep(button->gpio); ++ else ++ state = !!gpio_get_value(button->gpio); ++ ++ state = !!(state ^ button->active_low); ++ if (state != bdata->last_state) { ++ unsigned int type = button->type ?: EV_KEY; ++ ++ button_hotplug_event(bdata, type, button->code, state); ++ bdata->count = 0; ++ bdata->last_state = state; ++ } ++} ++ ++static void gpio_keys_polled_queue_work(struct gpio_keys_polled_dev *bdev) ++{ ++ struct gpio_keys_platform_data *pdata = bdev->pdata; ++ unsigned long delay = msecs_to_jiffies(pdata->poll_interval); ++ ++ if (delay >= HZ) ++ delay = round_jiffies_relative(delay); ++ schedule_delayed_work(&bdev->work, delay); ++} ++ ++static void gpio_keys_polled_poll(struct work_struct *work) ++{ ++ struct gpio_keys_polled_dev *bdev = ++ container_of(work, struct gpio_keys_polled_dev, work.work); ++ struct gpio_keys_platform_data *pdata = bdev->pdata; ++ int i; ++ ++ for (i = 0; i < bdev->pdata->nbuttons; i++) { ++ struct gpio_keys_button_data *bdata = &bdev->data[i]; ++ ++ if (bdata->count < bdata->threshold) ++ bdata->count++; ++ else ++ gpio_keys_polled_check_state(&pdata->buttons[i], bdata); ++ } ++ gpio_keys_polled_queue_work(bdev); ++} ++ ++static void __devinit gpio_keys_polled_open(struct gpio_keys_polled_dev *bdev) ++{ ++ struct gpio_keys_platform_data *pdata = bdev->pdata; ++ int i; ++ ++ if (pdata->enable) ++ pdata->enable(bdev->dev); ++ ++ /* report initial state of the buttons */ ++ for (i = 0; i < pdata->nbuttons; i++) ++ gpio_keys_polled_check_state(&pdata->buttons[i], &bdev->data[i]); ++ ++ gpio_keys_polled_queue_work(bdev); ++} ++ ++static void __devexit gpio_keys_polled_close(struct gpio_keys_polled_dev *bdev) ++{ ++ struct gpio_keys_platform_data *pdata = bdev->pdata; ++ ++ cancel_delayed_work_sync(&bdev->work); ++ ++ if (pdata->disable) ++ pdata->disable(bdev->dev); ++} ++ ++static int __devinit gpio_keys_polled_probe(struct platform_device *pdev) ++{ ++ struct gpio_keys_platform_data *pdata = pdev->dev.platform_data; ++ struct device *dev = &pdev->dev; ++ struct gpio_keys_polled_dev *bdev; ++ int error; ++ int 
i; ++ ++ if (!pdata || !pdata->poll_interval) ++ return -EINVAL; ++ ++ bdev = kzalloc(sizeof(struct gpio_keys_polled_dev) + ++ pdata->nbuttons * sizeof(struct gpio_keys_button_data), ++ GFP_KERNEL); ++ if (!bdev) { ++ dev_err(dev, "no memory for private data\n"); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < pdata->nbuttons; i++) { ++ struct gpio_keys_button *button = &pdata->buttons[i]; ++ struct gpio_keys_button_data *bdata = &bdev->data[i]; ++ unsigned int gpio = button->gpio; ++ ++ if (button->wakeup) { ++ dev_err(dev, DRV_NAME " does not support wakeup\n"); ++ error = -EINVAL; ++ goto err_free_gpio; ++ } ++ ++ error = gpio_request(gpio, ++ button->desc ? button->desc : DRV_NAME); ++ if (error) { ++ dev_err(dev, "unable to claim gpio %u, err=%d\n", ++ gpio, error); ++ goto err_free_gpio; ++ } ++ ++ error = gpio_direction_input(gpio); ++ if (error) { ++ dev_err(dev, ++ "unable to set direction on gpio %u, err=%d\n", ++ gpio, error); ++ goto err_free_gpio; ++ } ++ ++ bdata->can_sleep = gpio_cansleep(gpio); ++ bdata->last_state = 0; ++ bdata->threshold = DIV_ROUND_UP(button->debounce_interval, ++ pdata->poll_interval); ++ } ++ ++ bdev->dev = &pdev->dev; ++ bdev->pdata = pdata; ++ platform_set_drvdata(pdev, bdev); ++ ++ INIT_DELAYED_WORK(&bdev->work, gpio_keys_polled_poll); ++ ++ gpio_keys_polled_open(bdev); ++ ++ return 0; ++ ++err_free_gpio: ++ while (--i >= 0) ++ gpio_free(pdata->buttons[i].gpio); ++ ++ kfree(bdev); ++ platform_set_drvdata(pdev, NULL); ++ ++ return error; ++} ++ ++static int __devexit gpio_keys_polled_remove(struct platform_device *pdev) ++{ ++ struct gpio_keys_polled_dev *bdev = platform_get_drvdata(pdev); ++ struct gpio_keys_platform_data *pdata = bdev->pdata; ++ int i = pdata->nbuttons; ++ ++ gpio_keys_polled_close(bdev); ++ ++ while (--i >= 0) ++ gpio_free(pdata->buttons[i].gpio); ++ ++ kfree(bdev); ++ platform_set_drvdata(pdev, NULL); ++ ++ return 0; ++} ++ ++static struct platform_driver gpio_keys_polled_driver = { ++ .probe = gpio_keys_polled_probe, ++ .remove = __devexit_p(gpio_keys_polled_remove), ++ .driver = { ++ .name = DRV_NAME, ++ .owner = THIS_MODULE, ++ }, ++}; ++ ++static int __init gpio_keys_polled_init(void) ++{ ++ return platform_driver_register(&gpio_keys_polled_driver); ++} ++ ++static void __exit gpio_keys_polled_exit(void) ++{ ++ platform_driver_unregister(&gpio_keys_polled_driver); ++} ++ ++module_init(gpio_keys_polled_init); ++module_exit(gpio_keys_polled_exit); ++ ++MODULE_AUTHOR("Gabor Juhos "); ++MODULE_AUTHOR("Felix Fietkau "); ++MODULE_DESCRIPTION("Polled GPIO Buttons hotplug driver"); ++MODULE_LICENSE("GPL v2"); ++MODULE_ALIAS("platform:" DRV_NAME); diff --git a/3.2.34/3rd-3rdparty-gpio_event_drv-0.1.patch b/3.2.34/3rd-3rdparty-gpio_event_drv-0.1.patch new file mode 100644 index 0000000..3a75e28 --- /dev/null +++ b/3.2.34/3rd-3rdparty-gpio_event_drv-0.1.patch @@ -0,0 +1,1354 @@ +Submitted By: Mario Fetka (mario dot fetka at gmail dot com) +Date: 2012-11-18 +Initial Package Version: 3.2.33 +Origin: http://wiki.gumstix.org/index.php?title=GPIO_Event_Driver +Upstream Status: unknown +Description: The gpio-event driver consists of a loadable kernel module, +which registers an interrupt handler, along with an example user-mode program, +which allows the settings to be manipulated and changes to be reported. 
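The description above mentions an example user-mode program that manipulates the settings and reports changes. As a rough illustration of the interface this patch adds (the GPIO_EVENT_IOCTL_MONITOR_GPIO ioctl and the default ASCII read mode, both defined further down in gpio-event-drv.h), a minimal reader could look like the sketch below. The device path /dev/gpio-event follows from the device name the driver registers; the pin number and debounce time are placeholders, not values taken from the patch.

/* Minimal sketch of a gpio-event consumer; pin 17 and 20 ms debounce are
 * example values only. Assumes gpio-event-drv.h is on the include path and
 * that udev created /dev/gpio-event for the driver's device class. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#include "gpio-event-drv.h"

int main(void)
{
    GPIO_EventMonitor_t mon;
    char line[64];
    ssize_t n;

    int fd = open("/dev/gpio-event", O_RDONLY);
    if (fd < 0) {
        perror("open /dev/gpio-event");
        return 1;
    }

    memset(&mon, 0, sizeof(mon));
    mon.gpio             = 17;                  /* pin number: example only */
    mon.onOff            = 1;                   /* 1 = start monitoring */
    mon.edgeType         = GPIO_EventBothEdges;
    mon.debounceMilliSec = 20;

    if (ioctl(fd, GPIO_EVENT_IOCTL_MONITOR_GPIO, &mon) < 0) {
        perror("GPIO_EVENT_IOCTL_MONITOR_GPIO");
        return 1;
    }

    /* Default read mode is ASCII; each line is "nn E ssssssss.uuuuuu"
     * (GPIO number, R/F edge, timestamp), as produced by gpio_event_read(). */
    while ((n = read(fd, line, sizeof(line) - 1)) > 0) {
        line[n] = '\0';
        fputs(line, stdout);
    }

    close(fd);
    return 0;
}

Setting mon.onOff to 0 and issuing the same ioctl again stops monitoring of that pin; a read mode of GPIO_EventReadModeBinary can be selected with GPIO_EVENT_IOCTL_SET_READ_MODE to receive GPIO_Event_t structures instead of text.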
+ +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_event_drv/Kconfig 3rdparty/gpio_event_drv/Kconfig +--- linux-3.2.33-go.orig/3rdparty/gpio_event_drv/Kconfig 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_event_drv/Kconfig 2012-11-18 19:03:08.020733547 +0000 +@@ -0,0 +1,2 @@ ++config GPIO_EVENT_DRV ++ tristate "GPIO Event Driver (requires userspace app)" +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_event_drv/Makefile 3rdparty/gpio_event_drv/Makefile +--- linux-3.2.33-go.orig/3rdparty/gpio_event_drv/Makefile 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_event_drv/Makefile 2012-11-18 19:02:20.409297191 +0000 +@@ -0,0 +1 @@ ++obj-${CONFIG_GPIO_EVENT_DRV} += gpio-event-drv.o +\ No newline at end of file +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_event_drv/gpio-event-drv.c 3rdparty/gpio_event_drv/gpio-event-drv.c +--- linux-3.2.33-go.orig/3rdparty/gpio_event_drv/gpio-event-drv.c 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_event_drv/gpio-event-drv.c 2012-11-18 10:24:14.000000000 +0000 +@@ -0,0 +1,1210 @@ ++/**************************************************************************** ++* ++* Copyright (c) 2006 Dave Hylands ++* ++* This program is free software; you can redistribute it and/or modify ++* it under the terms of the GNU General Public License version 2 as ++* published by the Free Software Foundation. ++* ++* Alternatively, this software may be distributed under the terms of BSD ++* license. ++* ++* See README and COPYING for more details. ++* ++**************************************************************************** ++* ++* This driver allows multiple GPIO pins to be monitored and allows a user ++* mode program to be notified when the pin changes. ++* ++****************************************************************************/ ++ ++/* ---- Include Files ---------------------------------------------------- */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++ ++#include "gpio-event-drv.h" ++ ++/* ---- Public Variables ------------------------------------------------- */ ++/* ---- Private Constants and Types -------------------------------------- */ ++ ++#define GPIO_EVENT_DEV_NAME "gpio-event" ++ ++#define DEBUG_ENABLED 1 ++ ++#if DEBUG_ENABLED ++# define DEBUG( flag, fmt, args... ) do { if ( gDebug ## flag ) printk( "%s: " fmt, __FUNCTION__ , ## args ); } while (0) ++#else ++# define DEBUG( flag, fmt, args... 
) ++#endif ++ ++/* ---- Private Variables ------------------------------------------------ */ ++ ++static char gBanner[] __initdata = KERN_INFO "GPIO Event Monitor 0.1 Compiled: " __DATE__ " at " __TIME__ "\n"; ++ ++static int gDebugTrace = 0; ++static int gDebugIoctl = 0; ++static int gDebugError = 1; ++static int gLostEvents = 0; ++ ++static struct ctl_table_header *gSysCtlHeader; ++ ++#if ( LINUX_VERSION_CODE >= KERNEL_VERSION( 2, 6, 33 )) ++#define CTL_NAME(x) ++#else ++#define CTL_NAME(x) .ctl_name = x, ++#endif ++ ++static struct ctl_table gSysCtlSample[] = ++{ ++ { ++ CTL_NAME(1) ++ .procname = "lost-events", ++ .data = &gLostEvents, ++ .maxlen = sizeof( int ), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { ++ CTL_NAME(101) ++ .procname = "debug-trace", ++ .data = &gDebugTrace, ++ .maxlen = sizeof( int ), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { ++ CTL_NAME(102) ++ .procname = "debug-ioctl", ++ .data = &gDebugIoctl, ++ .maxlen = sizeof( int ), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { ++ CTL_NAME(103) ++ .procname = "debug-error", ++ .data = &gDebugError, ++ .maxlen = sizeof( int ), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { 0 } ++}; ++ ++static struct ctl_table gSysCtl[] = ++{ ++ { ++ CTL_NAME(CTL_GPIO_EVENT) ++ .procname = "gpio-event", ++ .mode = 0555, ++ .child = gSysCtlSample ++ }, ++ { 0 } ++}; ++ ++/* ++ * An instance of GPIO_FileData_t is maintained for file open ++ */ ++ ++#define GPIO_EVENT_QUEUE_LEN 20 ++ ++// GPIO_EVENT_BUFFER_SIZE needs to be big enough to hold the ASCII version ++// of the GPIO_Event_t as well as the binary version of the GPIO_Event_t ++ ++#define GPIO_EVENT_BUFFER_SIZE 32 ++ ++typedef struct ++{ ++ struct list_head list; ++ wait_queue_head_t waitQueue; ++ ++ spinlock_t queueLock; ++ GPIO_Event_t queueData[ GPIO_EVENT_QUEUE_LEN ]; ++ volatile int getIndex; ++ volatile int putIndex; ++ volatile int numEvents; ++ ++ GPIO_EventReadMode_t readMode; ++ ++ char buffer[ GPIO_EVENT_BUFFER_SIZE ]; ++ int bufBytes; ++ ++} GPIO_FileData_t; ++ ++/* ++ * An instance of GPIO_PinData_t is maintained for each GPIO line which is ++ * monitored, ++ */ ++ ++typedef enum ++{ ++ PIN_LOW = 0, // Matches level of GPIO line ++ PIN_HIGH = 1, ++ PIN_BOUNCING_LOW, ++ PIN_BOUNCING_HIGH, ++} PinState_t; ++ ++typedef struct ++{ ++ struct list_head list; // list of all pins ++ ++ int gpio; // The gpio line being monitored ++ ++ // We maintain two lists, a global list of pins, and a list associated with each open ++ ++ ++ struct timer_list debounceTimer; // Timer to wake u up after an edge ++ uint8_t debounceMilliSec; // debounce time in milliseconds ++ char devName[ 16 ]; // gpio xx event ++ ++ GPIO_EventEdgeType_t edgeType; // Type of edge(s) we're looking for. ++ ++ PinState_t pinState; // Was the GPIO line low or high? 
++ ++} GPIO_PinData_t; ++ ++static volatile int gReportLostEvents = 1; ++ ++static struct class *gGpioEventClass = NULL; ++static struct cdev gGpioEventCDev; ++static dev_t gGpioEventDevNum = 0; ++ ++static DEFINE_SPINLOCK( gFileListLock ); ++static DEFINE_SPINLOCK( gPinListLock ); ++ ++static LIST_HEAD( gFileList ); ++static LIST_HEAD( gPinList ); ++ ++static struct proc_dir_entry *gProcGpioEvent; ++static struct proc_dir_entry *gProcPins; ++ ++ ++/* ---- Private Function Prototypes -------------------------------------- */ ++/* ---- Functions -------------------------------------------------------- */ ++ ++typedef struct ++{ ++ unsigned long flags; ++ struct list_head *list; ++ ++} pin_seq_t; ++ ++/**************************************************************************** ++* ++* pin_seq_start ++* ++* seq_file iterator which goes through the pins being monitored ++* ++****************************************************************************/ ++ ++static void *pin_seq_start( struct seq_file *s, loff_t *pos ) ++{ ++ pin_seq_t *ps; ++ loff_t i; ++ ++ s->private = NULL; ++ ++ if (( ps = kcalloc( 1, sizeof( pin_seq_t ), GFP_KERNEL )) == NULL ) ++ { ++ return ERR_PTR( -ENOMEM ); ++ } ++ s->private = ps; ++ ++ spin_lock_irqsave( &gPinListLock, ps->flags ); ++ ++ if ( list_empty( &gPinList )) ++ { ++ DEBUG( Trace, "list_empty\n" ); ++ return NULL; ++ } ++ ps->list = gPinList.next; ++ ++ for ( i = 0; i < *pos; i++ ) ++ { ++ if ( list_is_last( ps->list, &gPinList )) ++ { ++ DEBUG( Trace, "No item @ %llu\n", i + 1 ); ++ return NULL; ++ } ++ ps->list = ps->list->next; ++ } ++ ++ ++ DEBUG( Trace, "ps->list = 0x%08lx, *pos = %llu\n", (long)ps->list, *pos ); ++ ++ return ps->list; ++ ++} // pin_seq_start ++ ++/**************************************************************************** ++* ++* pin_seq_show ++* ++* seq_file iterator which goes through the pins being monitored ++* ++****************************************************************************/ ++ ++static int pin_seq_show( struct seq_file *s, void *v ) ++{ ++ GPIO_PinData_t *pin = list_entry( v, GPIO_PinData_t, list ); ++ char *edgeTypeStr; ++ ++ DEBUG( Trace, "v = 0x%08lx\n", (long)v ); ++ ++ switch ( pin->edgeType ) ++ { ++ case GPIO_EventRisingEdge: edgeTypeStr = "Rising "; break; ++ case GPIO_EventFallingEdge: edgeTypeStr = "Falling"; break; ++ case GPIO_EventBothEdges: edgeTypeStr = "Both "; break; ++ default: edgeTypeStr = "Unknown"; break; ++ } ++ ++ seq_printf( s, "GPIO: %3d Edge: %s Debounce: %d msec\n", pin->gpio, edgeTypeStr, pin->debounceMilliSec ); ++ ++ return 0; ++ ++} // pin_seq_show ++ ++/**************************************************************************** ++* ++* pin_seq_next ++* ++* seq_file iterator which goes through the pins being monitored ++* ++****************************************************************************/ ++ ++static void *pin_seq_next( struct seq_file *s, void *v, loff_t *pos ) ++{ ++ pin_seq_t *ps = s->private; ++ ++ DEBUG( Trace, "v = 0x%08lx *pos = %llu\n", (long)v, *pos ); ++ ++ if ( list_is_last( ps->list, &gPinList )) ++ { ++ DEBUG( Trace, "ps->list = 0x%08lx (end of list)\n", (long)ps->list ); ++ ++ return NULL; ++ } ++ (*pos)++; ++ ps->list = ps->list->next; ++ ++ DEBUG( Trace, "ps->list = 0x%08lx\n", (long)ps->list ); ++ ++ return ps->list; ++ ++} // pin_seq_next ++ ++/**************************************************************************** ++* ++* pin_seq_stop ++* ++* seq_file iterator which goes through the pins being monitored ++* 
++****************************************************************************/ ++ ++static void pin_seq_stop( struct seq_file *s, void *v ) ++{ ++ pin_seq_t *ps = s->private; ++ ++ DEBUG( Trace, "v = 0x%08lx\n", (long)v ); ++ ++ if ( ps != NULL ) ++ { ++ spin_unlock_irqrestore( &gPinListLock, ps->flags ); ++ kfree( ps ); ++ } ++ ++} // pin_seq_stop ++ ++/**************************************************************************** ++* ++* pin_seq_ops ++* ++* Ties all of the pin_seq_xxx routines together. ++* ++****************************************************************************/ ++ ++static struct seq_operations pin_seq_ops = ++{ ++ .start = pin_seq_start, ++ .next = pin_seq_next, ++ .stop = pin_seq_stop, ++ .show = pin_seq_show ++}; ++ ++/**************************************************************************** ++* ++* pins_proc_open ++* ++* Open method for /proc/gpio-event/pin ++* ++****************************************************************************/ ++ ++static int pins_proc_open( struct inode *inode, struct file *file ) ++{ ++ DEBUG( Trace, "called\n" ); ++ ++ return seq_open( file, &pin_seq_ops ); ++} ++ ++/**************************************************************************** ++* ++* pin_proc_ops ++* ++* File operations for our /proc/gpio-event/pins file ++* ++****************************************************************************/ ++ ++static struct file_operations pins_proc_ops = ++{ ++ .owner = THIS_MODULE, ++ .open = pins_proc_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release ++}; ++ ++ ++ ++/**************************************************************************** ++* ++* find_pin ++* ++* Searches the list to see if 'gpio' is currently being monitored. ++* ++****************************************************************************/ ++ ++static GPIO_PinData_t *find_pin( int gpio ) ++{ ++ struct list_head *pin; ++ ++ assert_spin_locked( &gPinListLock ); ++ ++ list_for_each( pin, &gPinList ) ++ { ++ GPIO_PinData_t *pinData = list_entry( pin, GPIO_PinData_t, list ); ++ ++ if ( pinData->gpio == gpio ) ++ { ++ return pinData; ++ } ++ } ++ ++ return NULL; ++ ++} // find_pin ++ ++/**************************************************************************** ++* ++* gpio_event_queue_event ++* ++* Queues an sample event from the bottom half to the top half. This ++* function queues up the event on every file that's open. ++* ++****************************************************************************/ ++ ++static void gpio_event_queue_event( const GPIO_Event_t *gpioEvent ) ++{ ++ unsigned long flags; ++ struct list_head *file; ++ ++ DEBUG( Trace, "gpio %d:%c@%ld.%06ld\n", ++ gpioEvent->gpio, ++ gpioEvent->edgeType == GPIO_EventRisingEdge ? 'R' : 'F', ++ gpioEvent->time.tv_sec, ++ gpioEvent->time.tv_usec ); ++ ++ // Queue up the event on all of the open files ++ // ++ // This function is only called from the ISR, with interrupts already ++ // disabled. 
++ ++ spin_lock_irqsave( &gFileListLock, flags ); ++ ++ list_for_each( file, &gFileList ) ++ { ++ GPIO_FileData_t *fileData = list_entry( file, GPIO_FileData_t, list ); ++ ++ spin_lock( &fileData->queueLock ); ++ { ++ if ( fileData->numEvents >= GPIO_EVENT_QUEUE_LEN ) ++ { ++ // Queue is full - Only report first event lost ++ ++ if ( gReportLostEvents ) ++ { ++ printk( KERN_ERR "GPIO Event: event lost due to queue full\n" ); ++ gReportLostEvents = 0; ++ } ++ gLostEvents++; ++ } ++ else ++ { ++ fileData->queueData[ fileData->putIndex++ ] = *gpioEvent; ++ if ( fileData->putIndex >= GPIO_EVENT_QUEUE_LEN ) ++ { ++ fileData->putIndex = 0; ++ } ++ fileData->numEvents++; ++ } ++ } ++ spin_unlock( &fileData->queueLock ); ++ ++ wake_up_interruptible( &fileData->waitQueue ); ++ } ++ spin_unlock_irqrestore( &gFileListLock, flags ); ++ ++} // gpio_event_queue_event ++ ++/**************************************************************************** ++* ++* gpio_event_dequeue_event ++* ++* Removes an event from the queue ++* ++****************************************************************************/ ++ ++static int gpio_event_dequeue_event( GPIO_FileData_t *fileData, GPIO_Event_t *gpioEvent ) ++{ ++ unsigned long flags; ++ int eventAvailable = 0; ++ ++ spin_lock_irqsave( &fileData->queueLock, flags ); ++ { ++ if ( fileData->numEvents > 0 ) ++ { ++ *gpioEvent = fileData->queueData[ fileData->getIndex++ ]; ++ if ( fileData->getIndex >= GPIO_EVENT_QUEUE_LEN ) ++ { ++ fileData->getIndex = 0; ++ } ++ fileData->numEvents--; ++ ++ eventAvailable = 1; ++ ++ if ( fileData->numEvents == 0 ) ++ { ++ // Since somebody is reading the queue now, indicate that we ++ // can report lost events again ++ ++ gReportLostEvents = 1; ++ } ++ } ++ } ++ spin_unlock_irqrestore( &fileData->queueLock, flags ); ++ ++ DEBUG( Trace, "gpio %d:%c@%ld.%06ld\n", ++ gpioEvent->gpio, ++ gpioEvent->edgeType == GPIO_EventRisingEdge ? 'R' : 'F', ++ gpioEvent->time.tv_sec, ++ gpioEvent->time.tv_usec ); ++ ++ return eventAvailable; ++ ++} // gpio_event_dequeue_event ++ ++/**************************************************************************** ++* ++* gpio_event_irq ++* ++****************************************************************************/ ++ ++static irqreturn_t gpio_event_irq( int irq, void *dev_id ) ++{ ++ GPIO_PinData_t *pinData = (GPIO_PinData_t *)dev_id; ++ GPIO_Event_t gpioEvent; ++ int currLevel = gpio_get_value( pinData->gpio ); ++ ++ // We're called with interrupts disabled. ++ ++ (void)irq; ++ ++ do_gettimeofday( &gpioEvent.time ); ++ gpioEvent.gpio = pinData->gpio; ++ ++ if ( pinData->debounceMilliSec == 0 ) ++ { ++ // We assume that this is a clean signal ++ ++ pinData->pinState = (PinState_t)currLevel; ++ ++ if ( pinData->edgeType == GPIO_EventBothEdges ) ++ { ++ // There's no register to tell which edge just occurred. So we ++ // assume that it just changed into its current level. ++ ++ if ( currLevel ) ++ { ++ // Pin is currently high, so this must be a rising edge ++ ++ gpioEvent.edgeType = GPIO_EventRisingEdge; ++ } ++ else ++ { ++ // Pin is currently low, so this must be a falling edge ++ ++ gpioEvent.edgeType = GPIO_EventFallingEdge; ++ } ++ } ++ else ++ { ++ // If we're only monitoring one type of edge, then that's the one ++ // that happened. ++ ++ gpioEvent.edgeType = pinData->edgeType; ++ } ++ gpio_event_queue_event( &gpioEvent ); ++ } ++ else ++ { ++ gpioEvent.edgeType = 0; ++ ++ // If we need to debounce, then we need to monitor both edges, and ++ // use the debounce timer to figure out the real state. 
So we don't ++ // actually know which edge we just got. We use a state machine ++ // to track things. ++ ++ switch ( pinData->pinState ) ++ { ++ case PIN_LOW: ++ { ++ pinData->pinState = PIN_BOUNCING_HIGH; ++ gpioEvent.edgeType = GPIO_EventRisingEdge; ++ break; ++ } ++ ++ case PIN_HIGH: ++ { ++ pinData->pinState = PIN_BOUNCING_LOW; ++ gpioEvent.edgeType = GPIO_EventFallingEdge; ++ break; ++ } ++ ++ default: ++ { ++ break; ++ } ++ } ++ ++ if (( pinData->edgeType & gpioEvent.edgeType ) != 0 ) ++ { ++ // This is an edge that the user is interested in - send it along. ++ ++ gpio_event_queue_event( &gpioEvent ); ++ } ++ ++ // Disable interrupts for our gpio to allow debounce to occur. The ++ // timer will re-enable the interrupt. ++ ++ disable_irq_nosync( irq ); ++ ++ // Since we have no idea when in the current jiffy that the edge ++ // occurred, we add 1 to the calculation to guarantee at least one ++ // whole jiffy. ++ ++ mod_timer( &pinData->debounceTimer, jiffies + msecs_to_jiffies( pinData->debounceMilliSec ) + 1 ); ++ } ++ ++ return IRQ_HANDLED; ++ ++} // gpio_event_irq ++ ++/**************************************************************************** ++* ++* gpio_event_timer ++* ++****************************************************************************/ ++ ++void gpio_event_timer( unsigned long data ) ++{ ++ GPIO_PinData_t *pinData = (GPIO_PinData_t *)data; ++ ++ // This function is called when the debounce timer for a gpio expires. ++ // We record the state of the pin so that we can figure out what the ++ // next edge will be. ++ ++ pinData->pinState = ( gpio_get_value( pinData->gpio ) != 0 ); ++ ++ // Turn interrupts back on so we can catch the next edge ++ ++ enable_irq( gpio_to_irq( pinData->gpio )); ++ ++} // gpio_event_timer ++ ++/**************************************************************************** ++* ++* gpio_event_monitor ++* ++****************************************************************************/ ++ ++static int gpio_event_monitor( GPIO_EventMonitor_t *monitor ) ++{ ++ int rc = 0; ++ unsigned long flags; ++ GPIO_PinData_t *pinData; ++ unsigned long irqFlags; ++ ++ spin_lock_irqsave( &gPinListLock, flags ); ++ ++ if ( monitor->onOff ) ++ { ++ // Check to make sure we aren't already monitoring the gpio ++ ++ if (( pinData = find_pin( monitor->gpio )) != NULL ) ++ { ++ // We are already monitoring the pin. Unmonitor the pin and then ++ // proceed. ++ ++ monitor->onOff = 0; ++ ++ spin_unlock_irqrestore( &gPinListLock, flags ); ++ gpio_event_monitor( monitor ); ++ spin_lock_irqsave( &gPinListLock, flags ); ++ } ++ ++ if (( pinData = kcalloc( 1, sizeof( *pinData ), GFP_KERNEL )) == NULL ) ++ { ++ DEBUG( Error, "GPIO %d: Out of memory\n", monitor->gpio ); ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ INIT_LIST_HEAD( &pinData->list ); ++ ++ snprintf( pinData->devName, sizeof( pinData->devName ), "gpio %d event", monitor->gpio ); ++ ++ // Note: ++ // Calling request_irq will automatically set the pin to be an input. ++ ++ irqFlags = 0; ++ ++ if ( monitor->debounceMilliSec == 0 ) ++ { ++ // A clean signal is being presented, so we can just look for ++ // a particular edge ++ ++ if (( monitor->edgeType & GPIO_EventRisingEdge ) != 0 ) ++ { ++ irqFlags |= IRQF_TRIGGER_RISING; ++ } ++ if (( monitor->edgeType & GPIO_EventFallingEdge ) != 0 ) ++ { ++ irqFlags |= IRQF_TRIGGER_FALLING; ++ } ++ } ++ else ++ { ++ // Since we need to debounce, we need to look for both types of ++ // edges, since we get both types of edges whenever a bounce ++ // happens. 
++ ++ irqFlags |= IRQF_TRIGGER_RISING; ++ irqFlags |= IRQF_TRIGGER_FALLING; ++ } ++ ++ if (( rc = request_irq( gpio_to_irq( monitor->gpio ), gpio_event_irq, irqFlags, pinData->devName, pinData )) != 0 ) ++ { ++ DEBUG( Error, "Unable to register irq for GPIO %d\n", monitor->gpio ); ++ kfree( pinData ); ++ goto out; ++ } ++ ++ pinData->gpio = monitor->gpio; ++ pinData->edgeType = monitor->edgeType; ++ pinData->debounceMilliSec = monitor->debounceMilliSec; ++ ++ init_timer( &pinData->debounceTimer ); ++ ++ pinData->debounceTimer.data = (unsigned long)pinData; ++ pinData->debounceTimer.function = gpio_event_timer; ++ ++ list_add_tail( &pinData->list, &gPinList ); ++ ++ if ( gpio_get_value( pinData->gpio ) == 0 ) ++ { ++ pinData->pinState = PIN_LOW; ++ } ++ else ++ { ++ pinData->pinState = PIN_HIGH; ++ } ++ } ++ else ++ { ++ if (( pinData = find_pin( monitor->gpio )) == NULL ) ++ { ++ DEBUG( Error, "GPIO %d isn't being monitored\n", monitor->gpio ); ++ rc = -ENXIO; ++ goto out; ++ } ++ ++ // We've found the gpio being monitored - turn things off. ++ ++ free_irq( gpio_to_irq( pinData->gpio ), pinData ); ++ ++ del_timer_sync( &pinData->debounceTimer ); ++ list_del( &pinData->list ); ++ ++ kfree( pinData ); ++ } ++ ++out: ++ ++ spin_unlock_irqrestore( &gPinListLock, flags ); ++ ++ return rc; ++ ++} // gpio_event_monitor ++ ++/**************************************************************************** ++* ++* gpio_event_ioctl ++* ++* Called to process ioctl requests ++* ++*****************************************************************************/ ++ ++long gpio_event_ioctl( struct file *file, unsigned int cmd, unsigned long arg ) ++{ ++ GPIO_FileData_t *fileData; ++ ++ DEBUG( Trace, "type: '%c' cmd: 0x%x\n", _IOC_TYPE( cmd ), _IOC_NR( cmd )); ++ ++ fileData = file->private_data; ++ ++ switch ( cmd ) ++ { ++ case GPIO_EVENT_IOCTL_MONITOR_GPIO: ++ { ++ GPIO_EventMonitor_t monitor; ++ ++ if ( copy_from_user( &monitor, (void *)arg, sizeof( monitor )) != 0 ) ++ { ++ return -EFAULT; ++ } ++ return gpio_event_monitor( &monitor ); ++ } ++ ++ case GPIO_EVENT_IOCTL_SET_READ_MODE: ++ { ++ fileData->readMode = (GPIO_EventReadMode_t)arg; ++ break; ++ } ++ ++ case TCGETS: ++ { ++ // When cat opens this device, we get this ioctl ++ return -ENOTTY; ++ } ++ ++ default: ++ { ++ DEBUG( Error, "Unrecognized ioctl: '0x%x'\n", cmd ); ++ return -ENOTTY; ++ } ++ } ++ ++ return 0; ++ ++} // gpio_event_ioctl ++ ++/**************************************************************************** ++* ++* gpio_event_open ++* ++****************************************************************************/ ++ ++static int gpio_event_open( struct inode *inode, struct file *file ) ++{ ++ unsigned long flags; ++ GPIO_FileData_t *fileData; ++ ++ DEBUG( Trace, "gpio_event_open called, major = %d, minor = %d\n", MAJOR( inode->i_rdev ), MINOR( inode->i_rdev )); ++ ++ // Allocate a per-open data structure ++ ++ if (( fileData = kcalloc( 1, sizeof( *fileData ), GFP_KERNEL )) == NULL ) ++ { ++ return -ENOMEM; ++ } ++ ++ INIT_LIST_HEAD( &fileData->list ); ++ ++ init_waitqueue_head( &fileData->waitQueue ); ++ ++ spin_lock_init( &fileData->queueLock ); ++ ++ fileData->getIndex = 0; ++ fileData->putIndex = 0; ++ fileData->numEvents = 0; ++ fileData->bufBytes = 0; ++ ++ fileData->readMode = GPIO_EventReadModeAscii; ++ ++ file->private_data = fileData; ++ ++ spin_lock_irqsave( &gFileListLock, flags ); ++ { ++ list_add_tail( &fileData->list, &gFileList ); ++ } ++ spin_unlock_irqrestore( &gFileListLock, flags ); ++ ++ return 0; ++ ++} // 
gpio_event_open ++ ++/**************************************************************************** ++* ++* gpio_event_read ++* ++****************************************************************************/ ++ ++static ssize_t gpio_event_read( struct file *file, char *buffer, size_t spaceRemaining, loff_t *ppos ) ++{ ++ int rc; ++ ssize_t bytesCopied = 0; ++ ssize_t bytesToCopy; ++ GPIO_FileData_t *fileData = file->private_data; ++ ++ DEBUG( Trace, "gpio_event_read called, major = %d, minor = %d\n", MAJOR( file->f_dentry->d_inode->i_rdev ), MINOR( file->f_dentry->d_inode->i_rdev )); ++ ++ if ( spaceRemaining == 0 ) ++ { ++ return 0; ++ } ++ ++ // First of all, return any unread data from the previous call ++ ++ if ( fileData->bufBytes > 0 ) ++ { ++ if ( spaceRemaining < fileData->bufBytes ) ++ { ++ bytesCopied = spaceRemaining; ++ } ++ else ++ { ++ bytesCopied = fileData->bufBytes; ++ } ++ ++ if ( copy_to_user( &buffer[0], &fileData->buffer[0], bytesCopied ) != 0 ) ++ { ++ return -EFAULT; ++ } ++ if ( fileData->bufBytes > bytesCopied ) ++ { ++ memmove( &fileData->buffer[ 0 ], &fileData->buffer[ bytesCopied ], fileData->bufBytes - bytesCopied ); ++ } ++ fileData->bufBytes -= bytesCopied; ++ ++ if ( fileData->bufBytes > 0 ) ++ { ++ // We copied some data, but not all of it. Return early. ++ ++ return bytesCopied; ++ } ++ } ++ ++ do ++ { ++ if ((( file->f_flags & O_NONBLOCK ) != 0 ) && ( fileData->numEvents == 0 )) ++ { ++ // File was opened non-blocking and no more data is available ++ // We don't want to wait for an event, so exit from the loop ++ ++ break; ++ } ++ ++ rc = wait_event_interruptible( fileData->waitQueue, ( fileData->numEvents > 0 )); ++ if ( rc != 0 ) ++ { ++ return rc; ++ } ++ ++ if ( fileData->readMode == GPIO_EventReadModeBinary ) ++ { ++ gpio_event_dequeue_event( fileData, (GPIO_Event_t *)&fileData->buffer[0] ); ++ ++ fileData->bufBytes = sizeof( GPIO_Event_t ); ++ ++ } ++ else ++ { ++ GPIO_Event_t gpioEvent; ++ ++ gpio_event_dequeue_event( fileData, &gpioEvent ); ++ ++ // ASCII Mode output: ++ // ++ // nn E tttttttt.tttttt ++ // ++ // Where nn is the base-10 GPIO number ++ // E is R or F (for rising or falling edge) ++ // tttttttt.tttttt is the timestamp with microsecond resolution ++ ++ fileData->bufBytes = snprintf( fileData->buffer, sizeof( fileData->buffer ), ++ "%2d %c %ld.%06ld\n", ++ gpioEvent.gpio, ++ (( gpioEvent.edgeType == GPIO_EventRisingEdge ) ? 'R' : 'F' ), ++ gpioEvent.time.tv_sec, ++ gpioEvent.time.tv_usec ); ++ } ++ ++ if ( spaceRemaining >= fileData->bufBytes ) ++ { ++ bytesToCopy = fileData->bufBytes; ++ } ++ else ++ { ++ bytesToCopy = spaceRemaining; ++ } ++ ++ if ( copy_to_user( &buffer[ bytesCopied ], &fileData->buffer[0], bytesToCopy ) != 0 ) ++ { ++ return -EFAULT; ++ } ++ spaceRemaining -= bytesToCopy; ++ bytesCopied += bytesToCopy; ++ fileData->bufBytes -= bytesToCopy; ++ ++ if ( fileData->bufBytes > 0 ) ++ { ++ // We couldn't copy all of the data out of the buffer. Move the ++ // remaining data to the beginning of the buffer and exit. ++ ++ memmove( &fileData->buffer[ 0 ], &fileData->buffer[ bytesToCopy ], fileData->bufBytes ); ++ return bytesCopied; ++ } ++ } while (( fileData->numEvents > 0 ) && ( spaceRemaining > 0 )); ++ ++ if ((( file->f_flags & O_NONBLOCK ) != 0 ) && ( bytesCopied == 0 )) ++ { ++ // File was opened non-blocking and we didn't copy any data. 
++ ++ return -EAGAIN; ++ } ++ ++ return bytesCopied; ++ ++} // gpio_event_read ++ ++/**************************************************************************** ++* ++* gpio_event_poll - used by select & poll ++* ++****************************************************************************/ ++ ++static unsigned int gpio_event_poll(struct file *file, poll_table *wait) ++{ ++ unsigned long flags; ++ GPIO_FileData_t *fileData = file->private_data; ++ unsigned int mask = 0; ++ ++ poll_wait( file, &fileData->waitQueue, wait ); ++ ++ spin_lock_irqsave( &fileData->queueLock, flags ); ++ { ++ if (( fileData->bufBytes > 0 ) || ( fileData->numEvents > 0 )) ++ { ++ mask |= POLLIN | POLLRDNORM; // readable ++ } ++ } ++ spin_unlock_irqrestore( &fileData->queueLock, flags ); ++ ++ return mask; ++ ++} // gpio_event_poll ++ ++/**************************************************************************** ++* ++* gpio_event_release ++* ++****************************************************************************/ ++ ++static int gpio_event_release( struct inode *inode, struct file *file ) ++{ ++ unsigned long flags; ++ GPIO_FileData_t *fileData = file->private_data; ++ ++ DEBUG( Trace, "gpio_event_release called\n" ); ++ ++ spin_lock_irqsave( &gFileListLock, flags ); ++ { ++ list_del( &fileData->list ); ++ } ++ spin_unlock_irqrestore( &gFileListLock, flags ); ++ ++ kfree( fileData ); ++ ++ return 0; ++ ++} // gpio_event_release ++ ++/**************************************************************************** ++* ++* File Operations (these are the device driver entry points) ++* ++****************************************************************************/ ++ ++struct file_operations gpio_event_fops = ++{ ++ owner: THIS_MODULE, ++ unlocked_ioctl: gpio_event_ioctl, ++ open: gpio_event_open, ++ poll: gpio_event_poll, ++ release: gpio_event_release, ++ read: gpio_event_read, ++}; ++ ++/**************************************************************************** ++* ++* gpio_event_init ++* ++* Called to perform module initialization when the module is loaded ++* ++****************************************************************************/ ++ ++static int __init gpio_event_init( void ) ++{ ++ int rc; ++ ++ DEBUG( Trace, "called\n" ); ++ ++ printk( gBanner ); ++ ++ // Get a major number ++ ++ if (( rc = alloc_chrdev_region( &gGpioEventDevNum, 0, 1, GPIO_EVENT_DEV_NAME )) < 0 ) ++ { ++ printk( KERN_WARNING "sample: Unable to allocate major, err: %d\n", rc ); ++ return rc; ++ } ++ DEBUG( Trace, "allocated major:%d minor:%d\n", MAJOR( gGpioEventDevNum ), MINOR( gGpioEventDevNum )); ++ ++ // Register our proc entries. ++ ++ gProcGpioEvent = create_proc_entry( "gpio-event", S_IFDIR | S_IRUGO | S_IXUGO, NULL ); ++ if ( gProcGpioEvent == NULL ) ++ { ++ return -ENOMEM; ++ } ++ gProcPins = create_proc_entry( "pins", 0444, gProcGpioEvent ); ++ if ( gProcPins != NULL ) ++ { ++ gProcPins->proc_fops = &pins_proc_ops; ++ } ++ ++#if ( LINUX_VERSION_CODE <= KERNEL_VERSION( 2, 6, 20 )) ++ gSysCtlHeader = register_sysctl_table( gSysCtl, 0 ); ++ if ( gSysCtlHeader != NULL ) ++ { ++ gSysCtlHeader->ctl_table->child->de->owner = THIS_MODULE; ++ } ++#else ++ gSysCtlHeader = register_sysctl_table( gSysCtl ); ++#endif ++ ++ // Register our device. The device becomes "active" as soon as cdev_add ++ // is called. 
++ ++ cdev_init( &gGpioEventCDev, &gpio_event_fops ); ++ gGpioEventCDev.owner = THIS_MODULE; ++ ++ if (( rc = cdev_add( &gGpioEventCDev, gGpioEventDevNum, 1 )) != 0 ) ++ { ++ printk( KERN_WARNING "sample: cdev_add failed: %d\n", rc ); ++ return rc; ++ } ++ ++ // Create a class, so that udev will make the /dev entry ++ ++ gGpioEventClass = class_create( THIS_MODULE, GPIO_EVENT_DEV_NAME ); ++ if ( IS_ERR( gGpioEventClass )) ++ { ++ printk( KERN_WARNING "sample: Unable to create class\n" ); ++ return -1; ++ } ++ ++ device_create( gGpioEventClass, NULL, gGpioEventDevNum, NULL, GPIO_EVENT_DEV_NAME ); ++ ++ return 0; ++ ++} // gpio_event_init ++ ++/**************************************************************************** ++* ++* gpio_event_exit ++* ++* Called to perform module cleanup when the module is unloaded. ++* ++****************************************************************************/ ++ ++static void __exit gpio_event_exit( void ) ++{ ++ struct list_head *next; ++ struct list_head *pin; ++ GPIO_EventMonitor_t monitor; ++ ++ DEBUG( Trace, "called\n" ); ++ ++ // If there are any pins which are currently being monitored, then we ++ // need to unmonitor them. ++ ++ memset( &monitor, 0, sizeof( monitor )); ++ ++ list_for_each_safe( pin, next, &gPinList ) ++ { ++ GPIO_PinData_t *pinData = list_entry( pin, GPIO_PinData_t, list ); ++ ++ monitor.gpio = pinData->gpio; ++ ++ gpio_event_monitor( &monitor ); ++ } ++ ++ // Deregister our driver ++ ++ device_destroy( gGpioEventClass, gGpioEventDevNum ); ++ class_destroy( gGpioEventClass ); ++ ++ cdev_del( &gGpioEventCDev ); ++ ++ if ( gSysCtlHeader != NULL ) ++ { ++ unregister_sysctl_table( gSysCtlHeader ); ++ } ++ remove_proc_entry( "pins", gProcGpioEvent ); ++ remove_proc_entry( "gpio-event", NULL ); ++ ++ unregister_chrdev_region( gGpioEventDevNum, 1 ); ++ ++} // gpio_event_exit ++ ++/****************************************************************************/ ++ ++module_init(gpio_event_init); ++module_exit(gpio_event_exit); ++ ++MODULE_AUTHOR("Dave Hylands"); ++MODULE_DESCRIPTION("GPIO Event Driver"); ++MODULE_LICENSE("Dual BSD/GPL"); ++ +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_event_drv/gpio-event-drv.h 3rdparty/gpio_event_drv/gpio-event-drv.h +--- linux-3.2.33-go.orig/3rdparty/gpio_event_drv/gpio-event-drv.h 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_event_drv/gpio-event-drv.h 2012-11-18 10:24:14.000000000 +0000 +@@ -0,0 +1,115 @@ ++/**************************************************************************** ++* ++* Copyright (c) 2006 Dave Hylands ++* ++* This program is free software; you can redistribute it and/or modify ++* it under the terms of the GNU General Public License version 2 as ++* published by the Free Software Foundation. ++* ++* Alternatively, this software may be distributed under the terms of BSD ++* license. ++* ++* See README and COPYING for more details. ++* ++**************************************************************************** ++* ++* This driver allows multiple GPIO pins to be monitored and allows a user ++* mode program to be notified when the pin changes. 
++* ++****************************************************************************/ ++ ++#if !defined( GPIO_EVENT_DRV_H ) ++#define GPIO_EVENT_DRV_H ++ ++/* ---- Include Files ----------------------------------------------------- */ ++ ++#if defined( __KERNEL__ ) ++# include ++# include ++# include ++#else ++# include ++# include ++# include ++#endif ++ ++ ++/* ---- Constants and Types ----------------------------------------------- */ ++ ++// The ioctl "magic" is just some character value which is used to help ++// detect when incorrect ioctl values are sent down to a driver. ++ ++#define GPIO_EVENT_IOCTL_MAGIC 'G' ++ ++/** ++ * Deefines for each of the ioctl commands. Note that since we want to reduce ++ * the possibility that a user mode program gets out of sync with a given ++ * driver, we explicitly assign a value to each enumeration. This makes ++ * it more difficult to stick new ioctl's in the middle of the list. ++ */ ++ ++typedef enum ++{ ++ GPIO_EVENT_CMD_FIRST = 0x80, ++ ++ GPIO_EVENT_CMD_MONITOR_GPIO = 0x80, ++ GPIO_EVENT_CMD_SET_READ_MODE = 0x81, ++ ++ /* Insert new ioctls here */ ++ ++ GPIO_EVENT_CMD_LAST, ++ ++} GPIO_EVENT_CMD; ++ ++typedef enum ++{ ++ GPIO_EventRisingEdge = 0x01, ++ GPIO_EventFallingEdge = 0x02, ++ GPIO_EventBothEdges = GPIO_EventRisingEdge | GPIO_EventFallingEdge, ++ ++} GPIO_EventEdgeType_t; ++ ++typedef struct ++{ ++ uint8_t gpio; // gpio to monitor ++ uint8_t onOff; // 0 = stop monitoring, 1 = start monitoring ++ GPIO_EventEdgeType_t edgeType; // Monitor rising/falling/both edges? ++ uint8_t debounceMilliSec; // debounce time in milliseconds ++ ++} GPIO_EventMonitor_t; ++ ++typedef enum ++{ ++ GPIO_EventReadModeAscii = 0x00, // Reads return ASCII data (default) ++ GPIO_EventReadModeBinary = 0x01, // Reads return Binary data ++ ++} GPIO_EventReadMode_t; ++ ++/* ++ * Definitions for the actual ioctl commands ++ */ ++ ++#define GPIO_EVENT_IOCTL_MONITOR_GPIO _IOW( GPIO_EVENT_IOCTL_MAGIC, GPIO_EVENT_CMD_MONITOR_GPIO, GPIO_EventMonitor_t ) // arg is GPIO_EventMonitor * ++#define GPIO_EVENT_IOCTL_SET_READ_MODE _IO( GPIO_EVENT_IOCTL_MAGIC, GPIO_EVENT_CMD_SET_READ_MODE ) // arg is int ++ ++/* ++ * Definitions for sysctl. The top level define has to be unique system wide. 
++ * The kernel defines values 1 thru about 10 (see include/linunx/sysctl.h) ++ */ ++ ++#define CTL_GPIO_EVENT 0x47504576 // 'GPEv' in hex form ++ ++/* ++ * Reads return GPIO_Event_t structures ++ */ ++ ++typedef struct ++{ ++ uint8_t gpio; // GPIO that this event is for ++ GPIO_EventEdgeType_t edgeType; // Type of edge detected ++ struct timeval time; // Time the event occurred ++ ++} GPIO_Event_t; ++ ++#endif // GPIO_EVENT_DRV_H ++ diff --git a/3.2.34/3rd-3rdparty-merge.patch b/3.2.34/3rd-3rdparty-merge.patch new file mode 100644 index 0000000..dff4679 --- /dev/null +++ b/3.2.34/3rd-3rdparty-merge.patch @@ -0,0 +1,156 @@ +diff -uNr linux-3.2.33-go.orig/arch/alpha/Kconfig linux-3.2.33-go/arch/alpha/Kconfig +--- linux-3.2.33-go.orig/arch/alpha/Kconfig 2012-11-15 22:08:02.768806792 +0100 ++++ linux-3.2.33-go/arch/alpha/Kconfig 2012-11-15 22:08:29.937483632 +0100 +@@ -673,3 +673,4 @@ + + source "lib/Kconfig" + ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/arch/arm/Kconfig linux-3.2.33-go/arch/arm/Kconfig +--- linux-3.2.33-go.orig/arch/arm/Kconfig 2012-11-15 22:07:59.952839378 +0100 ++++ linux-3.2.33-go/arch/arm/Kconfig 2012-11-15 22:14:01.950566716 +0100 +@@ -2259,3 +2259,5 @@ + source "crypto/Kconfig" + + source "lib/Kconfig" ++ ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/arch/ia64/Kconfig linux-3.2.33-go/arch/ia64/Kconfig +--- linux-3.2.33-go.orig/arch/ia64/Kconfig 2012-11-15 22:08:00.893828523 +0100 ++++ linux-3.2.33-go/arch/ia64/Kconfig 2012-11-15 22:08:29.938483621 +0100 +@@ -669,3 +669,5 @@ + + config IOMMU_HELPER + def_bool (IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB || IA64_GENERIC || SWIOTLB) ++ ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/arch/mips/Kconfig linux-3.2.33-go/arch/mips/Kconfig +--- linux-3.2.33-go.orig/arch/mips/Kconfig 2012-11-15 22:08:02.698807597 +0100 ++++ linux-3.2.33-go/arch/mips/Kconfig 2012-11-15 22:08:29.939483610 +0100 +@@ -2485,3 +2485,5 @@ + source "crypto/Kconfig" + + source "lib/Kconfig" ++ ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/arch/powerpc/Kconfig linux-3.2.33-go/arch/powerpc/Kconfig +--- linux-3.2.33-go.orig/arch/powerpc/Kconfig 2012-11-15 22:08:01.893816938 +0100 ++++ linux-3.2.33-go/arch/powerpc/Kconfig 2012-11-15 22:08:29.940483598 +0100 +@@ -980,3 +980,5 @@ + bool + + source "arch/powerpc/kvm/Kconfig" ++ ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/arch/sparc/Kconfig linux-3.2.33-go/arch/sparc/Kconfig +--- linux-3.2.33-go.orig/arch/sparc/Kconfig 2012-11-15 22:08:00.130837331 +0100 ++++ linux-3.2.33-go/arch/sparc/Kconfig 2012-11-15 22:08:29.941483586 +0100 +@@ -605,3 +605,5 @@ + source "crypto/Kconfig" + + source "lib/Kconfig" ++ ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/arch/x86/Kconfig linux-3.2.33-go/arch/x86/Kconfig +--- linux-3.2.33-go.orig/arch/x86/Kconfig 2012-11-15 22:08:00.435833823 +0100 ++++ linux-3.2.33-go/arch/x86/Kconfig 2012-11-15 22:08:29.945483540 +0100 +@@ -2179,3 +2179,5 @@ + source "arch/x86/kvm/Kconfig" + + source "lib/Kconfig" ++ ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/Makefile linux-3.2.33-go/Makefile +--- linux-3.2.33-go.orig/Makefile 2012-11-15 22:08:03.435799123 +0100 ++++ linux-3.2.33-go/Makefile 2012-11-15 22:08:29.946483529 +0100 +@@ -507,7 +507,7 @@ + + # Objects we will link into vmlinux / subdirs we need to visit + init-y := init/ +-drivers-y := drivers/ sound/ firmware/ ++drivers-y := drivers/ sound/ firmware/ 3rdparty/ + net-y := net/ + libs-y := lib/ + core-y := usr/ +diff -uNr 
linux-3.2.33-go.orig/scripts/kconfig/Makefile linux-3.2.33-go/scripts/kconfig/Makefile +--- linux-3.2.33-go.orig/scripts/kconfig/Makefile 2012-11-15 22:07:58.064861094 +0100 ++++ linux-3.2.33-go/scripts/kconfig/Makefile 2012-11-15 22:08:55.603180188 +0100 +@@ -11,29 +11,29 @@ + Kconfig := Kconfig + endif + +-xconfig: $(obj)/qconf ++xconfig: $(obj)/qconf 3rdparty/Makefile + $< $(Kconfig) + +-gconfig: $(obj)/gconf ++gconfig: $(obj)/gconf 3rdparty/Makefile + $< $(Kconfig) + +-menuconfig: $(obj)/mconf ++menuconfig: $(obj)/mconf 3rdparty/Makefile + $< $(Kconfig) + +-config: $(obj)/conf ++config: $(obj)/conf 3rdparty/Makefile + $< --oldaskconfig $(Kconfig) + +-nconfig: $(obj)/nconf ++nconfig: $(obj)/nconf 3rdparty/Makefile + $< $(Kconfig) + +-oldconfig: $(obj)/conf ++oldconfig: $(obj)/conf 3rdparty/Makefile + $< --$@ $(Kconfig) + +-silentoldconfig: $(obj)/conf ++silentoldconfig: $(obj)/conf 3rdparty/Makefile + $(Q)mkdir -p include/generated + $< --$@ $(Kconfig) + +-localyesconfig localmodconfig: $(obj)/streamline_config.pl $(obj)/conf ++localyesconfig localmodconfig: $(obj)/streamline_config.pl $(obj)/conf 3rdparty/Makefile + $(Q)mkdir -p include/generated + $(Q)perl $< --$@ $(srctree) $(Kconfig) > .tmp.config + $(Q)if [ -f .config ]; then \ +@@ -90,18 +90,18 @@ + *) cat $(CLONECONFIG) > .config.running ;; \ + esac && \ + echo -e "Cloning configuration file $(CLONECONFIG)\n" +- $(Q)$< --defconfig=.config.running arch/$(SRCARCH)/Kconfig ++ $(Q)$< --defconfig=.config.running arch/$(SRCARCH)/Kconfig 3rdparty/Makefile + + + PHONY += listnewconfig oldnoconfig savedefconfig defconfig + +-listnewconfig oldnoconfig: $(obj)/conf ++listnewconfig oldnoconfig: $(obj)/conf 3rdparty/Makefile + $< --$@ $(Kconfig) + +-savedefconfig: $(obj)/conf ++savedefconfig: $(obj)/conf 3rdparty/Makefile + $< --$@=defconfig $(Kconfig) + +-defconfig: $(obj)/conf ++defconfig: $(obj)/conf 3rdparty/Makefile + ifeq ($(KBUILD_DEFCONFIG),) + $< --defconfig $(Kconfig) + else +@@ -109,7 +109,7 @@ + $(Q)$< --defconfig=arch/$(SRCARCH)/configs/$(KBUILD_DEFCONFIG) $(Kconfig) + endif + +-%_defconfig: $(obj)/conf ++%_defconfig: $(obj)/conf 3rdparty/Makefile + $(Q)$< --defconfig=arch/$(SRCARCH)/configs/$@ $(Kconfig) + + # Help text used by make help +@@ -186,6 +186,8 @@ + gconf-target := 1 + endif + ++3rdparty/Makefile: ++ pushd $(srctree)/3rdparty ; $(PERL) ./mkbuild.pl ; popd + + ifeq ($(qconf-target),1) + hostprogs-y += qconf diff --git a/3.2.34/3rd-3rdparty-netatop-0.1.1.patch b/3.2.34/3rd-3rdparty-netatop-0.1.1.patch new file mode 100644 index 0000000..a06a77d --- /dev/null +++ b/3.2.34/3rd-3rdparty-netatop-0.1.1.patch @@ -0,0 +1,1769 @@ +diff -uNr linux-3.2.33-go.orig/3rdparty/netatop/Kconfig 3rdparty/netatop/Kconfig +--- linux-3.2.33-go.orig/3rdparty/netatop/Kconfig 1970-01-01 01:00:00.000000000 +0100 ++++ 3rdparty/netatop/Kconfig 2012-11-15 22:48:00.753390796 +0100 +@@ -0,0 +1,8 @@ ++config NETATOP ++ tristate "Netatop kernel module" ++ help ++ The optional kernel module netatop can be loaded to gather statistics ++ about the TCP and UDP packets that have been transmitted/received ++ per process and per thread ++ ++ If unsure, see you again in six months. +diff -uNr linux-3.2.33-go.orig/3rdparty/netatop/Makefile 3rdparty/netatop/Makefile +--- linux-3.2.33-go.orig/3rdparty/netatop/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ 3rdparty/netatop/Makefile 2012-11-15 22:50:01.332957868 +0100 +@@ -0,0 +1,5 @@ ++# ++# THIS IS AN AUTOMATICALLY GENERATED FILE. DO NOT EDIT. 
++# ++ ++obj-$(CONFIG_NETATOP) += netatop.o +diff -uNr linux-3.2.33-go.orig/3rdparty/netatop/netatop.c 3rdparty/netatop/netatop.c +--- linux-3.2.33-go.orig/3rdparty/netatop/netatop.c 1970-01-01 01:00:00.000000000 +0100 ++++ 3rdparty/netatop/netatop.c 2012-11-15 22:57:52.989419565 +0100 +@@ -0,0 +1,1687 @@ ++/* ++** This module uses the netfilter interface to maintain statistics ++** about the network traffic per task, on level of thread group ++** and individual thread. ++** ++** General setup ++** ------------- ++** Once the module is active, it is called for every packet that is ++** transmitted by a local process and every packet that is received ++** from an interface. Not only the packets that contain the user data ++** are passed but also the TCP related protocol packets (SYN, ACK, ...). ++** ++** When the module discovers a packet for a connection (TCP) or local ++** port (UDP) that is new, it creates a sockinfo structure. As soon as ++** possible the sockinfo struct will be connected to a taskinfo struct ++** that represents the proces or thread that is related to the socket. ++** However, the task can only be determined when a packet is transmitted, ++** i.e. the module is called during system call handling in the context ++** of the transmitting process. At that moment the tgid (process) and ++** pid (thread) can be obtained from the process administration to ++** be stored in the module's own taskinfo structs (one for the process, ++** one for the thread). ++** For the time that the sockinfo struct can not be related to a taskinfo ++** struct (e.g. when only packets are received), counters are maintained ++** temporarily in the sockinfo struct. As soon as a related taskinfo struct ++** is discovered when the task transmits, counters will be maintained in ++** the taskinfo struct itself. ++** When packets are only received for a socket (e.g. another machine is ++** sending UDP packets to the local machine) while the local task ++** never responds, no match to a process can be made and the packets ++** remain unidentified by the netatop module. At least one packet should ++** have been sent by a local process to be able to match packets for such ++** socket. ++** In the file /proc/netatop counters can be found that show the total ++** number of packets sent/received and how many of these packets were ++** unidentified (i.e. not accounted to a process/thread). ++** ++** Garbage collection ++** ------------------ ++** The module uses a garbage collector to cleanup the unused sockinfo ++** structs if connections do not exist any more (TCP) or have not been ++** used for some time (TCP/UDP). ++** Furthermore, the garbage collector checks if the taskinfo structs ++** still represent existing processes or threads. If not, the taskinfo struct ++** is destroyed (in case of a thread) or it is moved to a separate list of ++** finished processes (in case of a process). Analysis programs can read ++** the taskinfo of such finished process. When the taskinfo of a finished ++** process is not read within 15 seconds, the taskinfo will be destroyed. ++** ++** A garbage collector cycle can be triggered by issueing a getsockopt ++** call from an analysis program (e.g. atop). Apart from that, a time-based ++** garbage collector cycle is issued anyhow every 15 seconds. ++** ++** Interface with user mode ++** ------------------------ ++** Programs can open an IP socket and use the getsockopt() system call ++** to issue commands to this module. 
With the command ATOP_GETCNT_TGID ++** the current counters can be obtained on process level (thread group) ++** and with the command ATOP_GETCNT_PID the counters on thread level. ++** For both commands, the tgid/pid has to be passed of the required thread ++** (group). When the required thread (group) does not exist, an errno ESRCH ++** is given. ++** ++** The command ATOP_GETCNT_EXIT can be issued to obtain the counters of ++** an exited process. As stated above, such command has to be issued ++** within 15 seconds after a process has been declared 'finished' by ++** the garbage collector. Whenever this command is issued and no exited ++** process is in the exitlist, the requesting process is blocked until ++** an exited process is available. ++** ++** The command NETATOP_FORCE_GC activates the garbage collector of the ++** netatop module to determine if sockinfo's of old connections/ports ++** can be destroyed and if taskinfo's of exited processes can be ++** The command NETATOP_EMPTY_EXIT can be issued to wait until the exitlist ++** with the taskinfo's of exited processes is empty. ++** ---------------------------------------------------------------------- ++** Copyright (C) 2012 Gerlof Langeveld (gerlof.langeveld@atoptool.nl) ++** ++** This program is free software; you can redistribute it and/or modify ++** it under the terms of the GNU General Public License version 2 as ++** published by the Free Software Foundation. ++*/ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "netatop.h" ++#include "netatopversion.h" ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Gerlof Langeveld "); ++MODULE_DESCRIPTION("Per-task network statistics"); ++MODULE_VERSION(NETATOPVERSION); ++ ++#define GCINTERVAL (HZ*15) // interval garbage collector (jiffies) ++#define GCMAXUDP (HZ*16) // max inactivity for UDP (jiffies) ++#define GCMAXTCP (HZ*1800) // max inactivity for TCP (jiffies) ++#define GCMAXUNREF (HZ*60) // max time without taskref (jiffies) ++ ++#define SILIMIT (2048*1024) // maximum memory for sockinfo structs ++#define TILIMIT (1024*1024) // maximum memory for taskinfo structs ++ ++#define NF_IP_PRE_ROUTING 0 ++#define NF_IP_LOCAL_IN 1 ++#define NF_IP_FORWARD 2 ++#define NF_IP_LOCAL_OUT 3 ++#define NF_IP_POST_ROUTING 4 ++ ++/* ++** struct that maintains statistics about the network ++** traffic caused per thread or thread group ++*/ ++struct chainer { ++ void *next; ++ void *prev; ++}; ++ ++struct taskinfobucket; ++ ++struct taskinfo { ++ struct chainer ch; ++ ++ pid_t id; // tgid or pid ++ char type; // 'g' (thread group) or ++ // 't' (thread) ++ unsigned char state; // see below ++ char command[COMLEN]; ++ unsigned long btime; // start time of process ++ unsigned long long exittime; // time inserted in exitlist ++ ++ struct taskcount tc; ++}; ++ ++// state values above ++#define CHECKED 1 // verified that task still exists ++#define INDELETE 2 // task exited but still in hash list ++#define FINISHED 3 // task on exit list ++ ++/* ++** hash tables to find a particular thread group or thread ++*/ ++#define TBUCKS 1024 // must be multiple of 2! 
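The "Interface with user mode" notes earlier in this file describe how an analysis program retrieves the per-task counters through getsockopt() on an ordinary IPv4 socket. The sketch below illustrates that call for one thread group. It is illustrative only: the command constant (spelled ATOP_GETCNT_TGID in the comments above, assumed here to be exported as NETATOP_GETCNT_TGID) and the netpertask record with its embedded struct taskcount come from netatop.h, which is not reproduced in this hunk.

/* Hedged sketch, not the reference client from atop: constant and field
 * names below are assumed to match netatop.h. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

#include "netatop.h"   /* assumed to define NETATOP_GETCNT_TGID and struct netpertask */

int main(int argc, char *argv[])
{
    struct netpertask npt;          /* assumed layout: id, btime, command[], struct taskcount tc */
    socklen_t len = sizeof(npt);
    int sock;

    if (argc != 2) {
        fprintf(stderr, "usage: %s <tgid>\n", argv[0]);
        return 1;
    }

    /* Any IPv4 socket will do; the module registers a getsockopt handler for
     * option numbers NETATOP_BASE_CTL .. NETATOP_BASE_CTL+6 at the IP level. */
    sock = socket(PF_INET, SOCK_DGRAM, 0);
    if (sock < 0) {
        perror("socket");
        return 1;
    }

    memset(&npt, 0, sizeof(npt));
    npt.id = atoi(argv[1]);         /* thread group whose counters are wanted */

    if (getsockopt(sock, SOL_IP, NETATOP_GETCNT_TGID, &npt, &len) < 0) {
        perror("NETATOP_GETCNT_TGID");  /* ESRCH: thread group unknown to the module */
        close(sock);
        return 1;
    }

    /* The traffic counters themselves live in npt.tc (struct taskcount). */
    printf("command %s, tgid %d: counters retrieved (%u bytes)\n",
           npt.command, (int)npt.id, (unsigned)len);

    close(sock);
    return 0;
}

The counters of exited processes (ATOP_GETCNT_EXIT) and the NETATOP_FORCE_GC / NETATOP_EMPTY_EXIT commands described above are issued through the same getsockopt() mechanism.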
++#define THASH(x, t) (((x)+t)&(TBUCKS-1)) ++ ++struct taskinfobucket { ++ struct chainer ch; ++ spinlock_t lock; ++} thash[TBUCKS]; ++ ++static unsigned long nrt; // current number of taskinfo allocated ++static unsigned long nrt_ovf; // no taskinfo allocated due to overflow ++static DEFINE_SPINLOCK(nrtlock); ++ ++ ++static struct taskinfo *exithead; // linked list of exited processes ++static struct taskinfo *exittail; ++static DEFINE_SPINLOCK(exitlock); ++ ++static DECLARE_WAIT_QUEUE_HEAD(exitlist_filled); ++static DECLARE_WAIT_QUEUE_HEAD(exitlist_empty); ++ ++static unsigned long nre; // current number of taskinfo on exitlist ++ ++/* ++** structs that uniquely identify a TCP connection (host endian format) ++*/ ++struct tcpv4_ident { ++ uint32_t laddr; /* local IP address */ ++ uint32_t raddr; /* remote IP address */ ++ uint16_t lport; /* local port number */ ++ uint16_t rport; /* remote port number */ ++}; ++ ++struct tcpv6_ident { ++ struct in6_addr laddr; /* local IP address */ ++ struct in6_addr raddr; /* remote IP address */ ++ uint16_t lport; /* local port number */ ++ uint16_t rport; /* remote port number */ ++}; ++ ++/* ++** struct to maintain the reference from a socket ++** to a thread and thread-group ++*/ ++struct sockinfo { ++ struct chainer ch; ++ ++ unsigned char last_state; // last known state of socket ++ uint8_t proto; // protocol ++ ++ union keydef { ++ uint16_t udp; // UDP ident (only portnumber) ++ struct tcpv4_ident tcp4; // TCP connection ident IPv4 ++ struct tcpv6_ident tcp6; // TCP connection ident IPv6 ++ } key; ++ ++ struct taskinfo *tgp; // ref to thread group ++ struct taskinfo *thp; // ref to thread (or NULL) ++ ++ short tgh; // hash number of thread group ++ short thh; // hash number of thread ++ ++ unsigned long sndpacks; // temporary counters in case ++ unsigned long sndbytes; // no relation to process is ++ unsigned long rcvpacks; // known yet ++ unsigned long rcvbytes; ++ ++ unsigned long long lastact; // last updated (jiffies) ++}; ++ ++/* ++** hash table to find a socket reference ++*/ ++#define SBUCKS 1024 // must be multiple of 2! 
++#define SHASHTCP4(x) (((x).raddr+(x).lport+(x).rport)&(SBUCKS-1)) ++#define SHASHUDP(x) ((x)&(SBUCKS-1)) ++ ++struct { ++ struct chainer ch; ++ spinlock_t lock; ++} shash[SBUCKS]; ++ ++static unsigned long nrs; // current number sockinfo allocated ++static unsigned long nrs_ovf; // no sockinfo allocated due to overflow ++static DEFINE_SPINLOCK(nrslock); ++ ++/* ++** various static counters ++*/ ++static unsigned long icmpsndbytes; ++static unsigned long icmpsndpacks; ++static unsigned long icmprcvbytes; ++static unsigned long icmprcvpacks; ++ ++static unsigned long tcpsndpacks; ++static unsigned long tcprcvpacks; ++static unsigned long udpsndpacks; ++static unsigned long udprcvpacks; ++static unsigned long unidentudpsndpacks; ++static unsigned long unidentudprcvpacks; ++static unsigned long unidenttcpsndpacks; ++static unsigned long unidenttcprcvpacks; ++ ++static unsigned long unknownproto; ++ ++static struct timer_list timer; ++static DEFINE_SPINLOCK(gclock); ++static unsigned long long gclast; // last garbage collection (jiffies) ++ ++static struct timespec boottime; ++ ++/* ++** function prototypes ++*/ ++static void analyze_tcpv4_packet(struct sk_buff *, ++ const struct net_device *, int, char, ++ struct iphdr *, void *); ++ ++static void analyze_udp_packet(struct sk_buff *, ++ const struct net_device *, int, char, ++ struct iphdr *, void *); ++ ++static int sock2task(char, struct sockinfo *, ++ struct taskinfo **, short *, ++ struct sk_buff *, const struct net_device *, ++ int, char); ++ ++static void update_taskcounters(struct sk_buff *, ++ const struct net_device *, ++ struct taskinfo *, char); ++ ++static void update_sockcounters(struct sk_buff *, ++ const struct net_device *, ++ struct sockinfo *, char); ++ ++static void sock2task_sync(struct sk_buff *, ++ struct sockinfo *, struct taskinfo *); ++ ++static void register_unident(struct sockinfo *); ++ ++static int calc_reallen(struct sk_buff *, ++ const struct net_device *); ++ ++static void get_tcpv4_ident(struct iphdr *, void *, ++ char, union keydef *); ++ ++static struct sockinfo *find_sockinfo(int, union keydef *, int, int); ++static struct sockinfo *make_sockinfo(int, union keydef *, int, int); ++ ++static void wipesockinfo(void); ++static void wipetaskinfo(void); ++static void wipetaskexit(void); ++ ++static void garbage_collector(void); ++static void gcperiodic(unsigned long unused); ++static void gctaskexit(void); ++static void gcsockinfo(void); ++static void gctaskinfo(void); ++ ++static void move_taskinfo(struct taskinfo *); ++static void delete_taskinfo(struct taskinfo *); ++static void delete_sockinfo(struct sockinfo *); ++ ++static struct taskinfo *get_taskinfo(pid_t, char); ++ ++static int getsockopt(struct sock *, int, void *, int *); ++ ++/* ++** hook definitions ++*/ ++static struct nf_hook_ops hookin_ipv4; ++static struct nf_hook_ops hookout_ipv4; ++ ++/* ++** getsockopt definitions for communication with user space ++*/ ++static struct nf_sockopt_ops sockopts = { ++ .pf = PF_INET, ++ .get_optmin = NETATOP_BASE_CTL, ++ .get_optmax = NETATOP_BASE_CTL+6, ++ .get = getsockopt, ++ .owner = THIS_MODULE, ++}; ++ ++/* ++** hook function to be called for every incoming local packet ++*/ ++static unsigned int ++ipv4_hookin(unsigned int hooknum, ++ struct sk_buff *skb, ++ const struct net_device *in, ++ const struct net_device *out, ++ int (*okfn)(struct sk_buff *)) ++{ ++ struct iphdr *iph; ++ void *trh; ++ ++ if (skb == NULL) // useless socket buffer? 
++ return NF_ACCEPT; ++ ++ /* ++ ** get pointer to IP header and transport header ++ */ ++ iph = (struct iphdr *)skb_network_header(skb); ++ trh = ((char *)iph + (iph->ihl * 4)); ++ ++ /* ++ ** react on protocol number ++ */ ++ switch (iph->protocol) { ++ case IPPROTO_TCP: ++ tcprcvpacks++; ++ analyze_tcpv4_packet(skb, in, 0, 'i', iph, trh); ++ break; ++ ++ case IPPROTO_UDP: ++ udprcvpacks++; ++ analyze_udp_packet(skb, in, 0, 'i', iph, trh); ++ break; ++ ++ case IPPROTO_ICMP: ++ icmprcvpacks++; ++ icmprcvbytes += skb->len + in->hard_header_len + 4; ++ break; ++ ++ default: ++ unknownproto++; ++ } ++ ++ // accept every packet after stats gathering ++ return NF_ACCEPT; ++} ++ ++/* ++** hook function to be called for every outgoing local packet ++*/ ++static unsigned int ++ipv4_hookout(unsigned int hooknum, ++ struct sk_buff *skb, ++ const struct net_device *in, ++ const struct net_device *out, ++ int (*okfn)(struct sk_buff *)) ++{ ++ int in_syscall = !in_interrupt(); ++ struct iphdr *iph; ++ void *trh; ++ ++ if (skb == NULL) // useless socket buffer? ++ return NF_ACCEPT; ++ ++ /* ++ ** get pointer to IP header and transport header ++ */ ++ iph = (struct iphdr *)skb_network_header(skb); ++ trh = skb_transport_header(skb); ++ ++ /* ++ ** react on protocol number ++ */ ++ switch (iph->protocol) { ++ case IPPROTO_TCP: ++ tcpsndpacks++; ++ analyze_tcpv4_packet(skb, out, in_syscall, 'o', iph, trh); ++ break; ++ ++ case IPPROTO_UDP: ++ udpsndpacks++; ++ analyze_udp_packet(skb, out, in_syscall, 'o', iph, trh); ++ break; ++ ++ case IPPROTO_ICMP: ++ icmpsndpacks++; ++ icmpsndbytes += skb->len + out->hard_header_len + 4; ++ break; ++ ++ default: ++ unknownproto++; ++ } ++ ++ // accept every packet after stats gathering ++ return NF_ACCEPT; ++} ++ ++/* ++** generic function (for input and output) to analyze the current packet ++*/ ++static void ++analyze_tcpv4_packet(struct sk_buff *skb, ++ const struct net_device *ndev, // interface description ++ int in_syscall, // called during system call? 
++ char direction, // incoming ('i') or outgoing ('o') ++ struct iphdr *iph, void *trh) ++{ ++ union keydef key; ++ struct sockinfo *sip; ++ int bs; // hash bucket for sockinfo ++ unsigned long sflags; ++ ++ /* ++ ** determine tcpv4_ident that identifies this TCP packet ++ ** and calculate hash bucket in sockinfo hash ++ */ ++ get_tcpv4_ident(iph, trh, direction, &key); ++ ++ /* ++ ** check if we have seen this tcpv4_ident before with a ++ ** corresponding thread and thread group ++ */ ++ bs = SHASHTCP4(key.tcp4); ++ ++ spin_lock_irqsave(&shash[bs].lock, sflags); ++ ++ if ( (sip = find_sockinfo(IPPROTO_TCP, &key, sizeof key.tcp4, bs)) ++ == NULL) { ++ // no sockinfo yet: create one ++ if ( (sip = make_sockinfo(IPPROTO_TCP, &key, ++ sizeof key.tcp4, bs)) == NULL) { ++ if (direction == 'i') ++ unidenttcprcvpacks++; ++ else ++ unidenttcpsndpacks++; ++ goto unlocks; ++ } ++ } ++ ++ if (skb->sk) ++ sip->last_state = skb->sk->sk_state; ++ ++ /* ++ ** if needed (re)connect the sockinfo to a taskinfo and update ++ ** the counters ++ */ ++ ++ // connect to thread group and update ++ if (sock2task('g', sip, &sip->tgp, &sip->tgh, ++ skb, ndev, in_syscall, direction)) { ++ // connect to thread and update ++ (void) sock2task('t', sip, &sip->thp, &sip->thh, ++ skb, ndev, in_syscall, direction); ++ } ++ ++unlocks: ++ spin_unlock_irqrestore(&shash[bs].lock, sflags); ++} ++ ++ ++/* ++** generic function (for input and output) to analyze the current packet ++*/ ++static void ++analyze_udp_packet(struct sk_buff *skb, ++ const struct net_device *ndev, // interface description ++ int in_syscall, // called during system call? ++ char direction, // incoming ('i') or outgoing ('o') ++ struct iphdr *iph, void *trh) ++{ ++ struct udphdr *udph = (struct udphdr *)trh; ++ uint16_t udplocal = (direction == 'i' ? 
++ ntohs(udph->dest) : ntohs(udph->source)); ++ int bs; // hash bucket for sockinfo ++ ++ union keydef key; ++ struct sockinfo *sip; ++ unsigned long sflags; ++ ++ /* ++ ** check if we have seen this local UDP port before with a ++ ** corresponding thread and thread group ++ */ ++ key.udp = udplocal; ++ bs = SHASHUDP(udplocal); ++ ++ spin_lock_irqsave(&shash[bs].lock, sflags); ++ ++ if ( (sip = find_sockinfo(IPPROTO_UDP, &key, sizeof key.udp, bs)) ++ == NULL) { ++ // no sockinfo yet: create one ++ if ( (sip = make_sockinfo(IPPROTO_UDP, &key, ++ sizeof key.udp, bs)) == NULL) { ++ if (direction == 'i') ++ unidentudprcvpacks++; ++ else ++ unidentudpsndpacks++; ++ goto unlocks; ++ } ++ } ++ ++ /* ++ ** if needed (re)connect the sockinfo to a taskinfo and update ++ ** the counters ++ */ ++ ++ // connect to thread group and update ++ if (sock2task('g', sip, &sip->tgp, &sip->tgh, ++ skb, ndev, in_syscall, direction)) { ++ // connect to thread and update ++ (void) sock2task('t', sip, &sip->thp, &sip->thh, ++ skb, ndev, in_syscall, direction); ++ } ++ ++unlocks: ++ spin_unlock_irqrestore(&shash[bs].lock, sflags); ++} ++ ++/* ++** connect the sockinfo to the correct taskinfo and update the counters ++*/ ++static int ++sock2task(char idtype, struct sockinfo *sip, struct taskinfo **tipp, ++ short *hash, struct sk_buff *skb, const struct net_device *ndev, ++ int in_syscall, char direction) ++{ ++ pid_t curid; ++ unsigned long tflags; ++ ++ if (*tipp == NULL) { ++ /* ++ ** no taskinfo connected yet for this reference from ++ ** sockinfo; to connect to a taskinfo, we must ++ ** be in system call handling now --> verify ++ */ ++ if (!in_syscall) { ++ if (idtype == 'g') ++ update_sockcounters(skb, ndev, sip, direction); ++ ++ return 0; // failed ++ } ++ ++ /* ++ ** try to find existing taskinfo or create new taskinfo ++ */ ++ curid = (idtype == 'g' ? current->tgid : current->pid); ++ ++ *hash = THASH(curid, idtype); // calc hashQ ++ ++ spin_lock_irqsave(&thash[*hash].lock, tflags); ++ ++ if ( (*tipp = get_taskinfo(curid, idtype)) == NULL) { ++ /* ++ ** not possible to connect ++ */ ++ spin_unlock_irqrestore(&thash[*hash].lock, tflags); ++ ++ if (idtype == 'g') ++ update_sockcounters(skb, ndev, sip, direction); ++ ++ return 0; // failed ++ } ++ ++ /* ++ ** new connection made: ++ ** update task counters with sock counters ++ */ ++ sock2task_sync(skb, sip, *tipp); ++ } else { ++ /* ++ ** already related to thread group or thread ++ ** lock existing task ++ */ ++ spin_lock_irqsave(&thash[*hash].lock, tflags); ++ ++ /* ++ ** check if socket has been passed to another process in the ++ ** meantime, like programs as xinetd use to do ++ ** if so, connect sockinfo to the new task ++ */ ++ if (in_syscall) { ++ curid = (idtype == 'g' ? 
current->tgid : current->pid); ++ ++ if ((*tipp)->id != curid) { ++ spin_unlock_irqrestore(&thash[*hash].lock, ++ tflags); ++ *hash = THASH(curid, idtype); ++ ++ spin_lock_irqsave(&thash[*hash].lock, tflags); ++ ++ if ( (*tipp = get_taskinfo(curid, idtype)) ++ == NULL) { ++ spin_unlock_irqrestore( ++ &thash[*hash].lock, tflags); ++ return 0; ++ } ++ } ++ } ++ } ++ ++ update_taskcounters(skb, ndev, *tipp, direction); ++ ++ spin_unlock_irqrestore(&thash[*hash].lock, tflags); ++ ++ return 1; ++} ++ ++/* ++** update the statistics of a particular thread group or thread ++*/ ++static void ++update_taskcounters(struct sk_buff *skb, const struct net_device *ndev, ++ struct taskinfo *tip, char direction) ++{ ++ struct iphdr *iph = (struct iphdr *)skb_network_header(skb); ++ int reallen = calc_reallen(skb, ndev); ++ ++ switch (iph->protocol) { ++ case IPPROTO_TCP: ++ if (direction == 'i') { ++ tip->tc.tcprcvpacks++; ++ tip->tc.tcprcvbytes += reallen; ++ } else { ++ tip->tc.tcpsndpacks++; ++ tip->tc.tcpsndbytes += reallen; ++ } ++ break; ++ ++ case IPPROTO_UDP: ++ if (direction == 'i') { ++ tip->tc.udprcvpacks++; ++ tip->tc.udprcvbytes += reallen; ++ } else { ++ tip->tc.udpsndpacks++; ++ tip->tc.udpsndbytes += reallen; ++ } ++ } ++} ++ ++/* ++** update the statistics of a sockinfo without a connected task ++*/ ++static void ++update_sockcounters(struct sk_buff *skb, const struct net_device *ndev, ++ struct sockinfo *sip, char direction) ++{ ++ int reallen = calc_reallen(skb, ndev); ++ ++ if (direction == 'i') { ++ sip->rcvpacks++; ++ sip->rcvbytes += reallen; ++ } else { ++ sip->sndpacks++; ++ sip->sndbytes += reallen; ++ } ++} ++ ++/* ++** add the temporary counters in the sockinfo to the new connected task ++*/ ++static void ++sock2task_sync(struct sk_buff *skb, struct sockinfo *sip, struct taskinfo *tip) ++{ ++ struct iphdr *iph = (struct iphdr *)skb_network_header(skb); ++ ++ switch (iph->protocol) { ++ case IPPROTO_TCP: ++ tip->tc.tcprcvpacks += sip->rcvpacks; ++ tip->tc.tcprcvbytes += sip->rcvbytes; ++ tip->tc.tcpsndpacks += sip->sndpacks; ++ tip->tc.tcpsndbytes += sip->sndbytes; ++ break; ++ ++ case IPPROTO_UDP: ++ tip->tc.udprcvpacks += sip->rcvpacks; ++ tip->tc.udprcvbytes += sip->rcvbytes; ++ tip->tc.udpsndpacks += sip->sndpacks; ++ tip->tc.udpsndbytes += sip->sndbytes; ++ } ++} ++ ++static void ++register_unident(struct sockinfo *sip) ++{ ++ switch (sip->proto) { ++ case IPPROTO_TCP: ++ unidenttcprcvpacks += sip->rcvpacks; ++ unidenttcpsndpacks += sip->sndpacks; ++ break; ++ ++ case IPPROTO_UDP: ++ unidentudprcvpacks += sip->rcvpacks; ++ unidentudpsndpacks += sip->sndpacks; ++ } ++} ++ ++/* ++** calculate the number of bytes that are really sent or received ++*/ ++static int ++calc_reallen(struct sk_buff *skb, const struct net_device *ndev) ++{ ++ /* ++ ** calculate the real load of this packet on the network: ++ ** ++ ** - length of IP header, TCP/UDP header and data (skb->len) ++ ** ++ ** since packet assembly/disassembly is done by the IP layer ++ ** (we get an input packet that has been assembled already and ++ ** an output packet that still has to be assembled), additional ++ ** IP headers/interface headers and interface headers have ++ ** to be calculated for packets that are larger than the mtu ++ ** ++ ** - interface header length + 4 bytes crc ++ */ ++ int reallen = skb->len; ++ ++ if (reallen > ndev->mtu) ++ reallen += (reallen / ndev->mtu) * ++ (sizeof(struct iphdr) + ndev->hard_header_len + 4); ++ ++ reallen += ndev->hard_header_len + 4; ++ ++ return reallen; ++} ++ ++/* ++** 
find the tcpv4_ident for the current packet, represented by ++** the skb_buff ++*/ ++static void ++get_tcpv4_ident(struct iphdr *iph, void *trh, char direction, union keydef *key) ++{ ++ struct tcphdr *tcph = (struct tcphdr *)trh; ++ ++ memset(key, 0, sizeof *key); // important for memcmp later on ++ ++ /* ++ ** determine local/remote IP address and ++ ** determine local/remote port number ++ */ ++ switch (direction) { ++ case 'i': // incoming packet ++ key->tcp4.laddr = ntohl(iph->daddr); ++ key->tcp4.raddr = ntohl(iph->saddr); ++ key->tcp4.lport = ntohs(tcph->dest); ++ key->tcp4.rport = ntohs(tcph->source); ++ break; ++ ++ case 'o': // outgoing packet ++ key->tcp4.laddr = ntohl(iph->saddr); ++ key->tcp4.raddr = ntohl(iph->daddr); ++ key->tcp4.lport = ntohs(tcph->source); ++ key->tcp4.rport = ntohs(tcph->dest); ++ } ++} ++ ++/* ++** search for the sockinfo holding the given address info ++** the appropriate hash bucket must have been locked before calling ++*/ ++static struct sockinfo * ++find_sockinfo(int proto, union keydef *identp, int identsz, int hash) ++{ ++ struct sockinfo *sip = shash[hash].ch.next; ++ ++ /* ++ ** search for appropriate struct ++ */ ++ while (sip != (void *)&shash[hash].ch) { ++ if ( memcmp(&sip->key, identp, identsz) == 0 && ++ sip->proto == proto) { ++ sip->lastact = jiffies_64; ++ return sip; ++ } ++ ++ sip = sip->ch.next; ++ } ++ ++ return NULL; // not existing ++} ++ ++/* ++** create a new sockinfo and fill ++** the appropriate hash bucket must have been locked before calling ++*/ ++static struct sockinfo * ++make_sockinfo(int proto, union keydef *identp, int identsz, int hash) ++{ ++ struct sockinfo *sip; ++ unsigned long flags; ++ ++ /* ++ ** check if the threshold of memory used for sockinfo structs ++ ** is reached to avoid that a fork bomb of processes opening ++ ** a socket leads to memory overload ++ */ ++ if ( (nrs+1) * sizeof(struct sockinfo) > SILIMIT) { ++ spin_lock_irqsave(&nrslock, flags); ++ nrs_ovf++; ++ spin_unlock_irqrestore(&nrslock, flags); ++ return NULL; ++ } ++ ++ if ( (sip = kzalloc(sizeof *sip, GFP_ATOMIC)) == NULL) ++ return NULL; ++ ++ spin_lock_irqsave(&nrslock, flags); ++ nrs++; ++ spin_unlock_irqrestore(&nrslock, flags); ++ ++ /* ++ ** insert new struct in doubly linked list ++ */ ++ sip->ch.next = &shash[hash].ch; ++ sip->ch.prev = shash[hash].ch.prev; ++ ((struct sockinfo *)shash[hash].ch.prev)->ch.next = sip; ++ shash[hash].ch.prev = sip; ++ ++ sip->proto = proto; ++ sip->lastact = jiffies_64; ++ sip->key = *identp; ++ ++ return sip; ++} ++ ++/* ++** search the taskinfo structure holding the info about the given id/type ++** if such taskinfo is not yet present, create a new one ++*/ ++static struct taskinfo * ++get_taskinfo(pid_t id, char type) ++{ ++ int bt = THASH(id, type); ++ struct taskinfo *tip = thash[bt].ch.next; ++ unsigned long tflags; ++ ++ /* ++ ** search if id exists already ++ */ ++ while (tip != (void *)&thash[bt].ch) { ++ if (tip->id == id && tip->type == type) ++ return tip; ++ ++ tip = tip->ch.next; ++ } ++ ++ /* ++ ** check if the threshold of memory used for taskinfo structs ++ ** is reached to avoid that a fork bomb of processes opening ++ ** a socket lead to memory overload ++ */ ++ if ( (nre+nrt+1) * sizeof(struct taskinfo) > TILIMIT) { ++ spin_lock_irqsave(&nrtlock, tflags); ++ nrt_ovf++; ++ spin_unlock_irqrestore(&nrtlock, tflags); ++ return NULL; ++ } ++ ++ /* ++ ** id not known yet ++ ** add new entry to hash list ++ */ ++ if ( (tip = kzalloc(sizeof *tip, GFP_ATOMIC)) == NULL) ++ return NULL; ++ ++ 
spin_lock_irqsave(&nrtlock, tflags); ++ nrt++; ++ spin_unlock_irqrestore(&nrtlock, tflags); ++ ++ /* ++ ** insert new struct in doubly linked list ++ ** and fill values ++ */ ++ tip->ch.next = &thash[bt].ch; ++ tip->ch.prev = thash[bt].ch.prev; ++ ((struct taskinfo *)thash[bt].ch.prev)->ch.next = tip; ++ thash[bt].ch.prev = tip; ++ ++ tip->id = id; ++ tip->type = type; ++ ++ tip->btime = current->real_start_time.tv_sec + boottime.tv_sec; ++ ++ if (current->real_start_time.tv_nsec + boottime.tv_nsec > NSEC_PER_SEC) ++ tip->btime++; ++ ++ strncpy(tip->command, current->comm, COMLEN); ++ ++ return tip; ++} ++ ++/* ++** function that runs every second to see if a ++** time-based garbage collection cycle has to be ++** forced (i.e. if no process forces it) ++*/ ++static void ++gcperiodic(unsigned long unused) ++{ ++ if (jiffies_64 >= gclast + GCINTERVAL) ++ garbage_collector(); ++ ++ /* ++ ** set timer for next second ++ */ ++ timer.expires = jiffies_64 + HZ; ++ timer.function = gcperiodic; ++ add_timer(&timer); ++} ++ ++/* ++** garbage collector that removes: ++** - exited tasks that are not by user mode programs ++** - sockinfo's that are not used any more ++** - taskinfo's that do not exist any more ++** ++** a lock avoids that the garbage collector runs several times in parallel ++*/ ++static void ++garbage_collector(void) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&gclock, flags); ++ ++ if (jiffies_64 < gclast + (HZ/2)) { // maximum 2 GC cycles per second ++ spin_unlock_irqrestore(&gclock, flags); ++ return; ++ } ++ ++ gctaskexit(); // remove remaining taskinfo structs from exit list ++ ++ gcsockinfo(); // clean up sockinfo structs in shash list ++ ++ gctaskinfo(); // clean up taskinfo structs in thash list ++ ++ gclast = jiffies_64; ++ ++ spin_unlock_irqrestore(&gclock, flags); ++} ++ ++/* ++** tasks in the exitlist can be read by a user mode process for a limited ++** amount of time; this function removes all taskinfo structures that have ++** not been read within that period of time ++** notice that exited processes are chained to the tail, so the oldest ++** can be found at the head ++*/ ++static void ++gctaskexit() ++{ ++ unsigned long flags; ++ struct taskinfo *tip; ++ ++ spin_lock_irqsave(&exitlock, flags); ++ ++ for (tip=exithead; tip;) { ++ if (jiffies_64 < tip->exittime + GCINTERVAL) ++ break; ++ ++ // remove taskinfo from exitlist ++ exithead = tip->ch.next; ++ kfree(tip); ++ nre--; ++ tip = exithead; ++ } ++ ++ /* ++ ** if list empty now, then exithead and exittail both NULL ++ ** wakeup waiters for emptylist ++ */ ++ if (nre == 0) { ++ exittail = NULL; ++ wake_up_interruptible(&exitlist_empty); ++ } ++ ++ spin_unlock_irqrestore(&exitlock, flags); ++} ++ ++/* ++** cleanup sockinfo structures that are connected to finished processes ++*/ ++static void ++gcsockinfo() ++{ ++ int i; ++ struct sockinfo *sip, *sipsave; ++ unsigned long sflags, tflags; ++ ++ /* ++ ** go through all sockinfo hash buckets ++ */ ++ for (i=0; i < SBUCKS; i++) { ++ if (shash[i].ch.next == (void *)&shash[i].ch) ++ continue; // quick return without lock ++ ++ spin_lock_irqsave(&shash[i].lock, sflags); ++ ++ sip = shash[i].ch.next; ++ ++ /* ++ ** search all sockinfo structs chained in one bucket ++ */ ++ while (sip != (void *)&shash[i].ch) { ++ /* ++ ** TCP connections that were not in ++ ** state ESTABLISHED or LISTEN can be ++ ** eliminated ++ */ ++ if (sip->proto == IPPROTO_TCP) { ++ switch (sip->last_state) { ++ case TCP_ESTABLISHED: ++ case TCP_LISTEN: ++ break; ++ ++ default: ++ sipsave = 
sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ continue; ++ } ++ } ++ ++ /* ++ ** check if this sockinfo has no relation ++ ** for a while with a thread group ++ ** if so, delete the sockinfo ++ */ ++ if (sip->tgp == NULL) { ++ if (sip->lastact + GCMAXUNREF < jiffies_64) { ++ register_unident(sip); ++ sipsave = sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ } else { ++ sip = sip->ch.next; ++ } ++ continue; ++ } ++ ++ /* ++ ** check if referred thread group is ++ ** already marked as 'indelete' during this ++ ** sockinfo search ++ ** if so, delete this sockinfo ++ */ ++ spin_lock_irqsave(&thash[sip->tgh].lock, tflags); ++ ++ if (sip->tgp->state == INDELETE) { ++ spin_unlock_irqrestore(&thash[sip->tgh].lock, ++ tflags); ++ sipsave = sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ continue; ++ } ++ ++ /* ++ ** check if referred thread group still exists; ++ ** this step will be skipped if we already verified ++ ** the existance of the thread group earlier during ++ ** this garbage collection cycle ++ */ ++ if (sip->tgp->state != CHECKED) { ++ /* ++ ** connected thread group not yet verified ++ ** during this cycle, so check if it still ++ ** exists ++ ** if not, mark the thread group as 'indelete' ++ ** (it can not be deleted right now because ++ ** we might find other sockinfo's referring ++ ** to this thread group during the current ++ ** cycle) and delete this sockinfo ++ ** if the thread group exists, just mark ++ ** it as 'checked' for this cycle ++ */ ++ if (find_vpid(sip->tgp->id) == NULL) { ++ sip->tgp->state = INDELETE; ++ spin_unlock_irqrestore( ++ &thash[sip->tgh].lock, tflags); ++ ++ sipsave = sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ continue; ++ } else { ++ sip->tgp->state = CHECKED; ++ } ++ } ++ ++ spin_unlock_irqrestore(&thash[sip->tgh].lock, tflags); ++ ++ /* ++ ** check if this sockinfo has a relation with a thread ++ ** if not, skip further handling of this sockinfo ++ */ ++ if (sip->thp == NULL) { ++ sip = sip->ch.next; ++ continue; ++ } ++ ++ /* ++ ** check if referred thread is already marked ++ ** as 'indelete' during this sockinfo search ++ ** if so, break connection ++ */ ++ spin_lock_irqsave(&thash[sip->thh].lock, tflags); ++ ++ if (sip->thp->state == INDELETE) { ++ spin_unlock_irqrestore(&thash[sip->thh].lock, ++ tflags); ++ sip->thp = NULL; ++ sip = sip->ch.next; ++ continue; ++ } ++ ++ /* ++ ** check if referred thread is already checked ++ ** during this sockinfo search ++ */ ++ if (sip->thp->state == CHECKED) { ++ spin_unlock_irqrestore(&thash[sip->thh].lock, ++ tflags); ++ sip = sip->ch.next; ++ continue; ++ } ++ ++ /* ++ ** connected thread not yet verified ++ ** check if it still exists ++ ** if not, mark it as 'indelete' and break connection ++ ** if thread exists, mark it 'checked' ++ */ ++ if (find_vpid(sip->thp->id) == NULL) { ++ sip->thp->state = INDELETE; ++ sip->thp = NULL; ++ } else { ++ sip->thp->state = CHECKED; ++ } ++ ++ spin_unlock_irqrestore(&thash[sip->thh].lock, tflags); ++ ++ /* ++ ** check if a TCP port has not been used ++ ** for some time --> destroy even if the thread ++ ** (group) is still there ++ */ ++ if (sip->proto == IPPROTO_TCP && ++ sip->lastact + GCMAXTCP < jiffies_64) { ++ sipsave = sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ continue; ++ } ++ ++ /* ++ ** check if a UDP port has not been used ++ ** for some time --> destroy even if the thread ++ ** (group) is still there ++ ** e.g. 
outgoing DNS requests (to remote port 53) are ++ ** issued every time with another source port being ++ ** a new object that should not be kept too long; ++ ** local well-known ports are useful to keep ++ */ ++ if (sip->proto == IPPROTO_UDP && ++ sip->lastact + GCMAXUDP < jiffies_64 && ++ sip->key.udp > 1024) { ++ sipsave = sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ continue; ++ } ++ ++ sip = sip->ch.next; ++ } ++ ++ spin_unlock_irqrestore(&shash[i].lock, sflags); ++ } ++} ++ ++/* ++** remove taskinfo structures of finished tasks from hash list ++*/ ++static void ++gctaskinfo() ++{ ++ int i; ++ struct taskinfo *tip, *tipsave; ++ unsigned long tflags; ++ ++ /* ++ ** go through all taskinfo hash buckets ++ */ ++ for (i=0; i < TBUCKS; i++) { ++ if (thash[i].ch.next == (void *)&thash[i].ch) ++ continue; // quick return without lock ++ ++ spin_lock_irqsave(&thash[i].lock, tflags); ++ ++ tip = thash[i].ch.next; ++ ++ /* ++ ** check all taskinfo structs chained to this bucket ++ */ ++ while (tip != (void *)&thash[i].ch) { ++ switch (tip->state) { ++ /* ++ ** remove INDELETE tasks from the hash buckets ++ ** -- move thread group to exitlist ++ ** -- destroy thread right away ++ */ ++ case INDELETE: ++ tipsave = tip->ch.next; ++ ++ if (tip->type == 'g') ++ move_taskinfo(tip); // thread group ++ else ++ delete_taskinfo(tip); // thread ++ ++ tip = tipsave; ++ break; ++ ++ case CHECKED: ++ tip->state = 0; ++ tip = tip->ch.next; ++ break; ++ ++ default: // not checked yet ++ if (find_vpid(tip->id) == NULL) { ++ tipsave = tip->ch.next; ++ ++ if (tip->type == 'g') ++ move_taskinfo(tip); ++ else ++ delete_taskinfo(tip); ++ ++ tip = tipsave; ++ } else { ++ tip = tip->ch.next; ++ } ++ } ++ } ++ ++ spin_unlock_irqrestore(&thash[i].lock, tflags); ++ } ++} ++ ++ ++/* ++** remove all sockinfo structs ++*/ ++static void ++wipesockinfo() ++{ ++ struct sockinfo *sip, *sipsave; ++ int i; ++ unsigned long sflags; ++ ++ for (i=0; i < SBUCKS; i++) { ++ spin_lock_irqsave(&shash[i].lock, sflags); ++ ++ sip = shash[i].ch.next; ++ ++ /* ++ ** free all structs chained in one bucket ++ */ ++ while (sip != (void *)&shash[i].ch) { ++ sipsave = sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ } ++ ++ spin_unlock_irqrestore(&shash[i].lock, sflags); ++ } ++} ++ ++/* ++** remove all taskinfo structs from hash list ++*/ ++static void ++wipetaskinfo() ++{ ++ struct taskinfo *tip, *tipsave; ++ int i; ++ unsigned long tflags; ++ ++ for (i=0; i < TBUCKS; i++) { ++ spin_lock_irqsave(&thash[i].lock, tflags); ++ ++ tip = thash[i].ch.next; ++ ++ /* ++ ** free all structs chained in one bucket ++ */ ++ while (tip != (void *)&thash[i].ch) { ++ tipsave = tip->ch.next; ++ delete_taskinfo(tip); ++ tip = tipsave; ++ } ++ ++ spin_unlock_irqrestore(&thash[i].lock, tflags); ++ } ++} ++ ++/* ++** remove all taskinfo structs from exit list ++*/ ++static void ++wipetaskexit() ++{ ++ gctaskexit(); ++} ++ ++/* ++** move one taskinfo struct from hash bucket to exitlist ++*/ ++static void ++move_taskinfo(struct taskinfo *tip) ++{ ++ unsigned long flags; ++ ++ /* ++ ** remove from hash list ++ */ ++ ((struct taskinfo *)tip->ch.next)->ch.prev = tip->ch.prev; ++ ((struct taskinfo *)tip->ch.prev)->ch.next = tip->ch.next; ++ ++ spin_lock_irqsave(&nrtlock, flags); ++ nrt--; ++ spin_unlock_irqrestore(&nrtlock, flags); ++ ++ /* ++ ** add to exitlist ++ */ ++ tip->ch.next = NULL; ++ tip->state = FINISHED; ++ tip->exittime = jiffies_64; ++ ++ spin_lock_irqsave(&exitlock, flags); ++ ++ if (exittail) { // list filled? 
++ exittail->ch.next = tip; ++ exittail = tip; ++ } else { // list empty ++ exithead = exittail = tip; ++ } ++ ++ nre++; ++ ++ wake_up_interruptible(&exitlist_filled); ++ ++ spin_unlock_irqrestore(&exitlock, flags); ++} ++ ++/* ++** remove one taskinfo struct for the hash bucket chain ++*/ ++static void ++delete_taskinfo(struct taskinfo *tip) ++{ ++ unsigned long flags; ++ ++ ((struct taskinfo *)tip->ch.next)->ch.prev = tip->ch.prev; ++ ((struct taskinfo *)tip->ch.prev)->ch.next = tip->ch.next; ++ ++ kfree(tip); ++ ++ spin_lock_irqsave(&nrtlock, flags); ++ nrt--; ++ spin_unlock_irqrestore(&nrtlock, flags); ++} ++ ++/* ++** remove one sockinfo struct for the hash bucket chain ++*/ ++static void ++delete_sockinfo(struct sockinfo *sip) ++{ ++ unsigned long flags; ++ ++ ((struct sockinfo *)sip->ch.next)->ch.prev = sip->ch.prev; ++ ((struct sockinfo *)sip->ch.prev)->ch.next = sip->ch.next; ++ ++ kfree(sip); ++ ++ spin_lock_irqsave(&nrslock, flags); ++ nrs--; ++ spin_unlock_irqrestore(&nrslock, flags); ++} ++ ++/* ++** read function for /proc/netatop ++*/ ++static int ++netatop_read_proc(char *buf, char **start, off_t offset, ++ int count, int *eof, void *data) ++{ ++ return sprintf(buf, "tcpsndpacks: %9lu (unident: %9lu)\n" ++ "tcprcvpacks: %9lu (unident: %9lu)\n" ++ "udpsndpacks: %9lu (unident: %9lu)\n" ++ "udprcvpacks: %9lu (unident: %9lu)\n\n" ++ "icmpsndpacks: %9lu\n" ++ "icmprcvpacks: %9lu\n\n" ++ "#sockinfo: %9lu (overflow: %8lu)\n" ++ "#taskinfo: %9lu (overflow: %8lu)\n" ++ "#taskexit: %9lu\n", ++ tcpsndpacks, unidenttcpsndpacks, ++ tcprcvpacks, unidenttcprcvpacks, ++ udpsndpacks, unidentudpsndpacks, ++ udprcvpacks, unidentudprcvpacks, ++ icmpsndpacks, icmprcvpacks, ++ nrs, nrs_ovf, ++ nrt, nrt_ovf, ++ nre); ++} ++ ++/* ++** called when user spce issues system call getsockopt() ++*/ ++static int ++getsockopt(struct sock *sk, int cmd, void __user *user, int *len) ++{ ++ int bt; ++ struct taskinfo *tip; ++ char tasktype = 't'; ++ struct netpertask npt; ++ unsigned long tflags; ++ ++ /* ++ ** verify the proper privileges ++ */ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ /* ++ ** react on command ++ */ ++ switch (cmd) { ++ case NETATOP_PROBE: ++ break; ++ ++ case NETATOP_FORCE_GC: ++ garbage_collector(); ++ break; ++ ++ case NETATOP_EMPTY_EXIT: ++ while (nre > 0) { ++ if (wait_event_interruptible(exitlist_empty, nre == 0)) ++ return -ERESTARTSYS; ++ } ++ break; ++ ++ case NETATOP_GETCNT_EXIT: ++ if (nre == 0) ++ wake_up_interruptible(&exitlist_empty); ++ ++ if (*len < sizeof(pid_t)) ++ return -EINVAL; ++ ++ if (*len > sizeof npt) ++ *len = sizeof npt; ++ ++ spin_lock_irqsave(&exitlock, tflags); ++ ++ /* ++ ** check if an exited process is present ++ ** if not, wait for it... 
++ */ ++ while (nre == 0) { ++ spin_unlock_irqrestore(&exitlock, tflags); ++ ++ if ( wait_event_interruptible(exitlist_filled, nre > 0)) ++ return -ERESTARTSYS; ++ ++ spin_lock_irqsave(&exitlock, tflags); ++ } ++ ++ /* ++ ** get first eprocess from exitlist and remove it from there ++ */ ++ tip = exithead; ++ ++ if ( (exithead = tip->ch.next) == NULL) ++ exittail = NULL; ++ ++ nre--; ++ ++ spin_unlock_irqrestore(&exitlock, tflags); ++ ++ /* ++ ** pass relevant info to user mode ++ ** and free taskinfo struct ++ */ ++ npt.id = tip->id; ++ npt.tc = tip->tc; ++ npt.btime = tip->btime; ++ memcpy(npt.command, tip->command, COMLEN); ++ ++ if (copy_to_user(user, &npt, *len) != 0) ++ return -EFAULT; ++ ++ kfree(tip); ++ ++ return 0; ++ ++ case NETATOP_GETCNT_TGID: ++ tasktype = 'g'; ++ ++ case NETATOP_GETCNT_PID: ++ if (*len < sizeof(pid_t)) ++ return -EINVAL; ++ ++ if (*len > sizeof npt) ++ *len = sizeof npt; ++ ++ if (copy_from_user(&npt, user, *len) != 0) ++ return -EFAULT; ++ ++ /* ++ ** search requested id in taskinfo hash ++ */ ++ bt = THASH(npt.id, tasktype); // calculate hash ++ ++ if (thash[bt].ch.next == (void *)&thash[bt].ch) ++ return -ESRCH; // quick return without lock ++ ++ spin_lock_irqsave(&thash[bt].lock, tflags); ++ ++ tip = thash[bt].ch.next; ++ ++ while (tip != (void *)&thash[bt].ch) { ++ // is this the one? ++ if (tip->id == npt.id && tip->type == tasktype) { ++ /* ++ ** found: copy results to user space ++ */ ++ memcpy(npt.command, tip->command, COMLEN); ++ npt.tc = tip->tc; ++ npt.btime = tip->btime; ++ ++ spin_unlock_irqrestore(&thash[bt].lock, tflags); ++ ++ if (copy_to_user(user, &npt, *len) != 0) ++ return -EFAULT; ++ else ++ return 0; ++ } ++ ++ tip = tip->ch.next; ++ } ++ ++ spin_unlock_irqrestore(&thash[bt].lock, tflags); ++ return -ESRCH; ++ ++ default: ++ printk(KERN_INFO "unknown getsockopt command %d\n", cmd); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++/* ++** called when module loaded ++*/ ++int ++init_module() ++{ ++ int i; ++ ++ /* ++ ** initialize various admi ++ */ ++ for (i=0; i < TBUCKS; i++) { ++ thash[i].ch.next = &thash[i].ch; ++ thash[i].ch.prev = &thash[i].ch; ++ spin_lock_init(&thash[i].lock); ++ } ++ ++ for (i=0; i < SBUCKS; i++) { ++ shash[i].ch.next = &shash[i].ch; ++ shash[i].ch.prev = &shash[i].ch; ++ spin_lock_init(&shash[i].lock); ++ } ++ ++ getboottime(&boottime); ++ ++ /* ++ ** register getsockopt for user space communication ++ */ ++ if (nf_register_sockopt(&sockopts) < 0) ++ return -1; ++ ++ /* ++ ** prepare hooks and register ++ */ ++ hookin_ipv4.hooknum = NF_IP_LOCAL_IN; // input packs ++ hookin_ipv4.hook = ipv4_hookin; // func to call ++ hookin_ipv4.pf = PF_INET; // IPV4 packets ++ hookin_ipv4.priority = NF_IP_PRI_FIRST; // highest prio ++ ++ hookout_ipv4.hooknum = NF_IP_LOCAL_OUT; // output packs ++ hookout_ipv4.hook = ipv4_hookout; // func to call ++ hookout_ipv4.pf = PF_INET; // IPV4 packets ++ hookout_ipv4.priority = NF_IP_PRI_FIRST; // highest prio ++ ++ nf_register_hook(&hookin_ipv4); // register hook ++ nf_register_hook(&hookout_ipv4); // register hook ++ ++ /* ++ ** create a /proc-entry to produce status-info on request ++ */ ++ create_proc_read_entry("netatop", 0444, NULL, netatop_read_proc, NULL); ++ ++ /* ++ ** activate timer for periodic call of garbage collector ++ */ ++ init_timer(&timer); ++ ++ timer.expires = jiffies_64 + HZ; ++ timer.function = gcperiodic; ++ add_timer(&timer); ++ ++ return 0; // return success ++} ++ ++/* ++** called when module unloaded ++*/ ++void ++cleanup_module() ++{ ++ 
nf_unregister_hook(&hookin_ipv4); ++ nf_unregister_hook(&hookout_ipv4); ++ ++ remove_proc_entry("netatop", NULL); ++ ++ del_timer(&timer); ++ ++ nf_unregister_sockopt(&sockopts); ++ ++ /* ++ ** destroy allocated stats ++ */ ++ wipesockinfo(); ++ wipetaskinfo(); ++ wipetaskexit(); ++} +diff -uNr linux-3.2.33-go.orig/3rdparty/netatop/netatop.h 3rdparty/netatop/netatop.h +--- linux-3.2.33-go.orig/3rdparty/netatop/netatop.h 1970-01-01 01:00:00.000000000 +0100 ++++ 3rdparty/netatop/netatop.h 2012-11-12 18:08:29.000000000 +0100 +@@ -0,0 +1,47 @@ ++#define COMLEN 16 ++ ++struct taskcount { ++ unsigned long long tcpsndpacks; ++ unsigned long long tcpsndbytes; ++ unsigned long long tcprcvpacks; ++ unsigned long long tcprcvbytes; ++ ++ unsigned long long udpsndpacks; ++ unsigned long long udpsndbytes; ++ unsigned long long udprcvpacks; ++ unsigned long long udprcvbytes; ++ ++ /* space for future extensions */ ++}; ++ ++struct netpertask { ++ pid_t id; // tgid or tid (depending on command) ++ unsigned long btime; ++ char command[COMLEN]; ++ ++ struct taskcount tc; ++}; ++ ++ ++/* ++** getsocktop commands ++*/ ++#define NETATOP_BASE_CTL 15661 ++ ++// just probe if the netatop module is active ++#define NETATOP_PROBE (NETATOP_BASE_CTL) ++ ++// force garbage collection to make finished processes available ++#define NETATOP_FORCE_GC (NETATOP_BASE_CTL+1) ++ ++// wait until all finished processes are read (blocks until done) ++#define NETATOP_EMPTY_EXIT (NETATOP_BASE_CTL+2) ++ ++// get info for finished process (blocks until available) ++#define NETATOP_GETCNT_EXIT (NETATOP_BASE_CTL+3) ++ ++// get counters for thread group (i.e. process): input is 'id' (pid) ++#define NETATOP_GETCNT_TGID (NETATOP_BASE_CTL+4) ++ ++// get counters for thread: input is 'id' (tid) ++#define NETATOP_GETCNT_PID (NETATOP_BASE_CTL+5) +diff -uNr linux-3.2.33-go.orig/3rdparty/netatop/netatopversion.h 3rdparty/netatop/netatopversion.h +--- linux-3.2.33-go.orig/3rdparty/netatop/netatopversion.h 1970-01-01 01:00:00.000000000 +0100 ++++ 3rdparty/netatop/netatopversion.h 2012-11-12 18:08:29.000000000 +0100 +@@ -0,0 +1,2 @@ ++#define NETATOPVERSION "0.1.1" ++#define NETATOPDATE "2012/11/12 18:08:23" diff --git a/3.2.34/910-kobject_uevent.patch b/3.2.34/910-kobject_uevent.patch new file mode 100644 index 0000000..aa9a40f --- /dev/null +++ b/3.2.34/910-kobject_uevent.patch @@ -0,0 +1,21 @@ +--- a/lib/kobject_uevent.c ++++ b/lib/kobject_uevent.c +@@ -50,6 +50,18 @@ static const char *kobject_actions[] = { + [KOBJ_OFFLINE] = "offline", + }; + ++u64 uevent_next_seqnum(void) ++{ ++ u64 seq; ++ ++ mutex_lock(&uevent_sock_mutex); ++ seq = ++uevent_seqnum; ++ mutex_unlock(&uevent_sock_mutex); ++ ++ return seq; ++} ++EXPORT_SYMBOL_GPL(uevent_next_seqnum); ++ + /** + * kobject_action_type - translate action string to numeric type + * diff --git a/3.2.34/911-kobject_add_broadcast_uevent.patch b/3.2.34/911-kobject_add_broadcast_uevent.patch new file mode 100644 index 0000000..104df13 --- /dev/null +++ b/3.2.34/911-kobject_add_broadcast_uevent.patch @@ -0,0 +1,85 @@ +--- a/include/linux/kobject.h ++++ b/include/linux/kobject.h +@@ -31,6 +31,8 @@ + #define UEVENT_NUM_ENVP 32 /* number of env pointers */ + #define UEVENT_BUFFER_SIZE 2048 /* buffer for the variables */ + ++struct sk_buff; ++ + /* path to the userspace helper executed on an event */ + extern char uevent_helper[]; + +@@ -213,6 +215,10 @@ int add_uevent_var(struct kobj_uevent_en + + int kobject_action_type(const char *buf, size_t count, + enum kobject_action *type); ++ ++int 
broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group, ++ gfp_t allocation); ++ + #else + static inline int kobject_uevent(struct kobject *kobj, + enum kobject_action action) +@@ -229,6 +235,16 @@ int add_uevent_var(struct kobj_uevent_en + static inline int kobject_action_type(const char *buf, size_t count, + enum kobject_action *type) + { return -EINVAL; } ++ ++void kfree_skb(struct sk_buff *); ++ ++static inline int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group, ++ gfp_t allocation) ++{ ++ kfree_skb(skb); ++ return 0; ++} ++ + #endif + + #endif /* _KOBJECT_H_ */ +--- a/lib/kobject_uevent.c ++++ b/lib/kobject_uevent.c +@@ -381,6 +381,43 @@ int add_uevent_var(struct kobj_uevent_en + EXPORT_SYMBOL_GPL(add_uevent_var); + + #if defined(CONFIG_NET) ++int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group, ++ gfp_t allocation) ++{ ++ struct uevent_sock *ue_sk; ++ int err = 0; ++ ++ /* send netlink message */ ++ mutex_lock(&uevent_sock_mutex); ++ list_for_each_entry(ue_sk, &uevent_sock_list, list) { ++ struct sock *uevent_sock = ue_sk->sk; ++ struct sk_buff *skb2; ++ ++ skb2 = skb_clone(skb, allocation); ++ if (!skb2) ++ break; ++ ++ err = netlink_broadcast(uevent_sock, skb2, pid, group, ++ allocation); ++ if (err) ++ break; ++ } ++ mutex_unlock(&uevent_sock_mutex); ++ ++ kfree_skb(skb); ++ return err; ++} ++#else ++int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group, ++ gfp_t allocation) ++{ ++ kfree_skb(skb); ++ return 0; ++} ++#endif ++EXPORT_SYMBOL_GPL(broadcast_uevent); ++ ++#if defined(CONFIG_NET) + static int uevent_net_init(struct net *net) + { + struct uevent_sock *ue_sk; diff --git a/3.2.34/Add_CONFIG_VFAT_FS_DUALNAMES_option.patch b/3.2.34/Add_CONFIG_VFAT_FS_DUALNAMES_option.patch new file mode 100644 index 0000000..5e3cfe1 --- /dev/null +++ b/3.2.34/Add_CONFIG_VFAT_FS_DUALNAMES_option.patch @@ -0,0 +1,145 @@ +diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig +index 182f9ff..907a5de 100644 +--- a/fs/fat/Kconfig ++++ b/fs/fat/Kconfig +@@ -74,6 +74,26 @@ config VFAT_FS + To compile this as a module, choose M here: the module will be called + vfat. + ++config VFAT_FS_DUALNAMES ++ bool "VFAT dual names support" ++ depends on VFAT_FS ++ help ++ This option provides support for dual filenames on VFAT filesystems. ++ If this option is disabled then file creation will either put ++ a short (8.3) name or a long name on the file, but never both. ++ The field where a shortname would normally go is filled with ++ invalid characters such that it cannot be considered a valid ++ short filename. ++ ++ That means that long filenames created with this option ++ disabled will not be accessible at all to operating systems ++ that do not understand the VFAT extensions. ++ ++ Users considering enabling this option should consider the implications ++ of any patents that may exist on dual filenames in VFAT. 
++ ++ If unsure, say N ++ + config FAT_DEFAULT_CODEPAGE + int "Default codepage for FAT" + depends on MSDOS_FS || VFAT_FS +diff --git a/fs/fat/dir.c b/fs/fat/dir.c +index 38ff75a..cd5d3ec 100644 +--- a/fs/fat/dir.c ++++ b/fs/fat/dir.c +@@ -415,14 +415,13 @@ + } + i += chl; + } +- if (!last_u) +- continue; +- +- /* Compare shortname */ +- bufuname[last_u] = 0x0000; +- len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); +- if (fat_name_match(sbi, name, name_len, bufname, len)) +- goto found; ++ if (last_u) { ++ /* Compare shortname */ ++ bufuname[last_u] = 0x0000; ++ len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); ++ if (fat_name_match(sbi, name, name_len, bufname, len)) ++ goto found; ++ } + + if (nr_slots) { + void *longname = unicode + FAT_MAX_UNI_CHARS; +diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c +index 73471b7..894f44d 100644 +--- a/fs/fat/namei_vfat.c ++++ b/fs/fat/namei_vfat.c +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include "fat.h" + + /* +@@ -586,6 +587,59 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, + return 0; + } + ++#ifndef CONFIG_VFAT_FS_DUALNAMES ++/* ++ * build a 11 byte 8.3 buffer which is not a short filename. We want 11 ++ * bytes which: ++ * - will be seen as a constant string to all APIs on Linux and Windows ++ * - cannot be matched with wildcard patterns ++ * - cannot be used to access the file ++ * - has a low probability of collision within a directory ++ * - has an invalid 3 byte extension ++ * - contains at least one non-space and non-nul byte ++ */ ++static void vfat_build_dummy_83_buffer(struct inode *dir, char *msdos_name) ++{ ++ u32 rand_num = random32() & 0x3FFFFFFF; ++ int i; ++ ++ /* a value of zero would leave us with only nul and spaces, ++ * which would not work with older linux systems ++ */ ++ if (rand_num == 0) ++ rand_num = 1; ++ ++ /* we start with a space followed by nul as spaces at the ++ * start of an entry are trimmed in FAT, which means that ++ * starting the 11 bytes with 0x20 0x00 gives us a value which ++ * cannot be used to access the file. It also means that the ++ * value as seen from all Windows and Linux APIs is a constant ++ */ ++ msdos_name[0] = ' '; ++ msdos_name[1] = 0; ++ ++ /* we use / and 2 nul bytes for the extension. 
These are ++ * invalid in FAT and mean that utilities that show the ++ * directory show no extension, but still work via the long ++ * name for old Linux kernels ++ */ ++ msdos_name[8] = '/'; ++ msdos_name[9] = 0; ++ msdos_name[10] = 0; ++ ++ /* ++ * fill the remaining 6 bytes with random invalid values ++ * This gives us a low collision rate, which means a low ++ * chance of problems with chkdsk.exe and WindowsXP ++ */ ++ for (i = 2; i < 8; i++) { ++ msdos_name[i] = rand_num & 0x1F; ++ rand_num >>= 5; ++ } ++} ++#endif ++ ++ + static int vfat_build_slots(struct inode *dir, const unsigned char *name, + int len, int is_dir, int cluster, + struct timespec *ts, +@@ -628,6 +682,11 @@ static int vfat_build_slots(struct inode *dir, const unsigned char *name, + goto shortname; + } + ++#ifndef CONFIG_VFAT_FS_DUALNAMES ++ vfat_build_dummy_83_buffer(dir, msdos_name); ++ lcase = 0; ++#endif ++ + /* build the entry of long file name */ + cksum = fat_checksum(msdos_name); + +-- +1.6.0.4 + + diff --git a/3.2.34/accessfs-3.2-0.26.patch b/3.2.34/accessfs-3.2-0.26.patch new file mode 100644 index 0000000..f36e634 --- /dev/null +++ b/3.2.34/accessfs-3.2-0.26.patch @@ -0,0 +1,1036 @@ +diff --git a/Documentation/filesystems/accessfs.txt b/Documentation/filesystems/accessfs.txt +new file mode 100644 +index 0000000..bf135b5 +--- /dev/null ++++ b/Documentation/filesystems/accessfs.txt +@@ -0,0 +1,41 @@ ++Accessfs is a permission managing filesystem. It allows to control access to ++system resources, based on file permissions. The recommended mount point for ++this file-system is /proc/access, which will appear automatically in the ++/proc filesystem. ++ ++Currently there are two modules using accessfs, userports and usercaps. ++ ++With userports, you will be able to control access to IP ports based ++on user-/groupid. ++ ++There's no need anymore to run internet daemons as root. You can ++individually configure which user/program can bind to protected ports ++(by default, below 1024). ++ ++For example, you can say, user www is allowed to bind to port 80 or ++user mail is allowed to bind to port 25. Then, you can run apache as ++user www and sendmail as user mail. Now, you don't have to rely on ++apache or sendmail giving up superuser rights to enhance security. ++ ++To use this option, you need to mount the access file system ++and do a chown on the appropriate ports: ++ ++# mount -t accessfs none /proc/access ++# chown www /proc/access/net/ip/bind/80 ++# chown mail /proc/access/net/ip/bind/25 ++ ++You can grant access to a group for individual ports as well. Just say: ++ ++# chgrp lp /proc/access/net/ip/bind/515 ++# chown g+x /proc/access/net/ip/bind/515 ++ ++With usercaps, you will be able to grant capabilities based on ++user-/groupid (root by default). 
++ ++For example you can create a group raw and change the capability ++net_raw to this group: ++ ++# chgrp raw /proc/access/capabilities/net_raw ++# chmod ug+x /proc/access/capabilities/net_raw ++# chgrp raw /sbin/ping ++# chmod u-s /sbin/ping; chmod g+s /sbin/ping +diff --git a/fs/Kconfig b/fs/Kconfig +index 5f4c45d..24f7348 100644 +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -210,6 +210,7 @@ + # UBIFS File system configuration + source "fs/ubifs/Kconfig" + source "fs/logfs/Kconfig" ++source "fs/accessfs/Kconfig" + source "fs/cramfs/Kconfig" + source "fs/squashfs/Kconfig" + source "fs/freevxfs/Kconfig" +diff --git a/fs/Makefile b/fs/Makefile +index d2c3353..fea1cfc 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -121,5 +121,6 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ + obj-$(CONFIG_BTRFS_FS) += btrfs/ + obj-$(CONFIG_GFS2_FS) += gfs2/ + obj-y += exofs/ # Multiple modules ++obj-$(CONFIG_ACCESS_FS) += accessfs/ + obj-$(CONFIG_CEPH_FS) += ceph/ + obj-$(CONFIG_PSTORE) += pstore/ +diff --git a/fs/accessfs/Kconfig b/fs/accessfs/Kconfig +new file mode 100644 +index 0000000..539d6e9 +--- /dev/null ++++ b/fs/accessfs/Kconfig +@@ -0,0 +1,61 @@ ++config ACCESS_FS ++ tristate "Accessfs support (Experimental)" ++ depends on EXPERIMENTAL ++ default n ++ help ++ This is a new file system to manage permissions. It is not very ++ useful on its own. You need to enable other options below. ++ ++ If you're unsure, say N. ++ ++config ACCESSFS_USER_PORTS ++ tristate "User permission based IP ports" ++ depends on ACCESS_FS && INET ++ select NET_HOOKS ++ default n ++ help ++ If you say Y here, you will be able to control access to IP ports ++ based on user-/groupid. ++ ++ If you're unsure, say N. ++ ++config ACCESSFS_PROT_SOCK ++ int "Range of protected ports (1024-65536)" ++ depends on ACCESSFS_USER_PORTS ++ default 1024 ++ help ++ Here you can extend the range of protected ports. This is ++ from 1-1023 inclusive on normal unix systems. One use for this ++ could be to reserve ports for X11 (port 6000) or database ++ servers (port 3306 for mysql), so nobody else could grab this port. ++ The default permission for extended ports is --x--x--x. ++ ++ If you build this as a module, you can specify the range of ++ protected ports at module load time (max_prot_sock). ++ ++ If you're unsure, say 1024. ++ ++config ACCESSFS_IGNORE_NET_BIND_SERVICE ++ bool "Ignore CAP_NET_BIND_SERVICE capability" ++ depends on ACCESSFS_USER_PORTS ++ default n ++ help ++ This option lets you decide, wether a user with ++ CAP_NET_BIND_SERVICE capability is able to override ++ your userport configuration. ++ ++ If you build this as a module, you can specify this ++ option at module load time (ignore_net_bind_service). ++ ++ If you're unsure, say n. ++ ++config ACCESSFS_USER_CAPABILITIES ++ bool "User permission based capabilities" ++ depends on ACCESS_FS = y ++ select SECURITY ++ default n ++ help ++ If you say Y here, you will be able to grant capabilities based on ++ user-/groupid (root by default). ++ ++ If you're unsure, say N. +diff --git a/fs/accessfs/Makefile b/fs/accessfs/Makefile +new file mode 100644 +index 0000000..63a5647 +--- /dev/null ++++ b/fs/accessfs/Makefile +@@ -0,0 +1,11 @@ ++# ++# Makefile for the linux accessfs routines. 
++# ++ ++obj-$(CONFIG_ACCESS_FS) += accessfs.o ++obj-$(CONFIG_ACCESSFS_USER_CAPABILITIES) += usercaps.o ++obj-$(CONFIG_ACCESSFS_USER_PORTS) += userports.o ++ ++accessfs-objs := inode.o ++usercaps-objs := capabilities.o ++userports-objs := ip.o +diff --git a/fs/accessfs/capabilities.c b/fs/accessfs/capabilities.c +new file mode 100644 +index 0000000..1c43f36 +--- /dev/null ++++ b/fs/accessfs/capabilities.c +@@ -0,0 +1,109 @@ ++/* Copyright (c) 2002-2006 Olaf Dietsche ++ * ++ * User based capabilities for Linux. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++/* perl -n -e 'print "\"", lc($1), "\",\n" if (m/^#define\s+CAP_(.+?)\s+\d+$/);' include/linux/capability.h */ ++static const char *names[] = { ++ "chown", ++ "dac_override", ++ "dac_read_search", ++ "fowner", ++ "fsetid", ++ "kill", ++ "setgid", ++ "setuid", ++ "setpcap", ++ "linux_immutable", ++ "net_bind_service", ++ "net_broadcast", ++ "net_admin", ++ "net_raw", ++ "ipc_lock", ++ "ipc_owner", ++ "sys_module", ++ "sys_rawio", ++ "sys_chroot", ++ "sys_ptrace", ++ "sys_pacct", ++ "sys_admin", ++ "sys_boot", ++ "sys_nice", ++ "sys_resource", ++ "sys_time", ++ "sys_tty_config", ++ "mknod", ++ "lease", ++ "audit_write", ++ "audit_control", ++ "setfcap", ++ "mac_override", ++ "mac_admin", ++ "syslog", ++ "wake_alarm", ++}; ++ ++static struct access_attr caps[ARRAY_SIZE(names)]; ++ ++static int accessfs_capable(struct task_struct *tsk, const struct cred *cred, struct user_namespace *ns, int cap, int audit) ++{ ++ if (accessfs_permitted(&caps[cap], MAY_EXEC)) { ++ /* capability granted */ ++ return 0; ++ } ++ ++ /* capability denied */ ++ return -EPERM; ++} ++ ++static struct security_operations accessfs_security_ops = { ++ .name = "usercaps", ++ .capable = accessfs_capable, ++}; ++ ++static void unregister_capabilities(struct accessfs_direntry *dir, int n) ++{ ++ int i; ++ for (i = 0; i < n; ++i) ++ accessfs_unregister(dir, names[i]); ++} ++ ++static int __init init_capabilities(void) ++{ ++ struct accessfs_direntry *dir; ++ int i, err; ++ dir = accessfs_make_dirpath("capabilities"); ++ if (dir == 0) ++ return -ENOTDIR; ++ ++ for (i = 0; i < ARRAY_SIZE(caps); ++i) { ++ caps[i].uid = 0; ++ caps[i].gid = 0; ++ caps[i].mode = S_IXUSR; ++ err = accessfs_register(dir, names[i], &caps[i]); ++ if (err) { ++ unregister_capabilities(dir, i); ++ return err; ++ } ++ } ++ ++ if (!security_module_enable(&accessfs_security_ops)) ++ return -EAGAIN; ++ ++ err = register_security(&accessfs_security_ops); ++ if (err != 0) ++ unregister_capabilities(dir, ARRAY_SIZE(names)); ++ ++ return err; ++} ++ ++security_initcall(init_capabilities); ++ ++MODULE_AUTHOR("Olaf Dietsche"); ++MODULE_DESCRIPTION("User based capabilities"); ++MODULE_LICENSE("GPL v2"); +diff --git a/fs/accessfs/inode.c b/fs/accessfs/inode.c +new file mode 100644 +index 0000000..a2247e2 +--- /dev/null ++++ b/fs/accessfs/inode.c +@@ -0,0 +1,431 @@ ++/* Copyright (c) 2001-2006 Olaf Dietsche ++ * ++ * Access permission filesystem for Linux. ++ * ++ * 2002 Ben Clifford, create mount point at /proc/access ++ * 2002 Ben Clifford, trying to make it work under 2.5.5-dj2 ++ * (see comments: BENC255 for reminders and todos) ++ * ++ * ++ * BENC255: the kernel doesn't lock BKL for us when entering methods ++ * (see Documentation/fs/porting.txt) ++ * Need to look at code here and see if we need either the BKL ++ * or our own lock - I think probably not. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define ACCESSFS_MAGIC 0x3c1d36e7 ++ ++static struct proc_dir_entry *mountdir = NULL; ++ ++static DEFINE_MUTEX(accessfs_sem); ++ ++static struct inode_operations accessfs_inode_operations; ++static struct file_operations accessfs_dir_file_operations; ++static struct inode_operations accessfs_dir_inode_operations; ++ ++static inline void accessfs_readdir_aux(struct file *filp, ++ struct accessfs_direntry *dir, ++ int start, void *dirent, ++ filldir_t filldir) ++{ ++ struct list_head *list; ++ int i = 2; ++ list_for_each(list, &dir->children) { ++ struct accessfs_entry *de; ++ if (i++ < start) ++ continue; ++ ++ de = list_entry(list, struct accessfs_entry, siblings); ++ if (filldir(dirent, de->name, strlen(de->name), filp->f_pos, ++ de->ino, DT_UNKNOWN) < 0) ++ break; ++ ++ ++filp->f_pos; ++ } ++} ++ ++static int accessfs_readdir(struct file *filp, void *dirent, filldir_t filldir) ++{ ++ int i; ++ struct dentry *dentry = filp->f_dentry; ++ struct accessfs_direntry *dir; ++ ++ i = filp->f_pos; ++ switch (i) { ++ case 0: ++ if (filldir(dirent, ".", 1, i, dentry->d_inode->i_ino, ++ DT_DIR) < 0) ++ break; ++ ++ ++i; ++ ++filp->f_pos; ++ /* NO break; */ ++ case 1: ++ if (filldir(dirent, "..", 2, i, ++ dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) ++ break; ++ ++ ++i; ++ ++filp->f_pos; ++ /* NO break; */ ++ default: ++ mutex_lock(&accessfs_sem); ++ dir = dentry->d_inode->i_private; ++ accessfs_readdir_aux(filp, dir, i, dirent, filldir); ++ mutex_unlock(&accessfs_sem); ++ break; ++ } ++ ++ return 0; ++} ++ ++static struct accessfs_entry *accessfs_lookup_entry(struct accessfs_entry *pe, ++ const char *name, int len) ++{ ++ struct list_head *list; ++ struct accessfs_direntry *dir; ++ if (!S_ISDIR(pe->attr->mode)) ++ return NULL; ++ ++ dir = (struct accessfs_direntry *) pe; ++ list_for_each(list, &dir->children) { ++ struct accessfs_entry *de = list_entry(list, struct accessfs_entry, siblings); ++ if (strncmp(de->name, name, len) == 0 && de->name[len] == 0) ++ return de; ++ } ++ ++ return NULL; ++} ++ ++static struct accessfs_direntry accessfs_rootdir = { ++ { "/", ++ LIST_HEAD_INIT(accessfs_rootdir.node.hash), ++ LIST_HEAD_INIT(accessfs_rootdir.node.siblings), ++ 1, &accessfs_rootdir.attr }, ++ NULL, LIST_HEAD_INIT(accessfs_rootdir.children), ++ { 0, 0, S_IFDIR | 0755 } ++}; ++ ++static void accessfs_init_inode(struct inode *inode, struct accessfs_entry *pe) ++{ ++ static const struct timespec epoch = {0, 0}; ++ inode->i_private = pe; ++ inode->i_uid = pe->attr->uid; ++ inode->i_gid = pe->attr->gid; ++ inode->i_mode = pe->attr->mode; ++/* ++ inode->i_blksize = PAGE_CACHE_SIZE; ++ inode->i_blocks = 0; ++ inode->i_rdev = NODEV; ++*/ ++ inode->i_atime = inode->i_mtime = inode->i_ctime = epoch; ++ switch (inode->i_mode & S_IFMT) { ++ case S_IFREG: ++ inode->i_op = &accessfs_inode_operations; ++ break; ++ case S_IFDIR: ++ inode->i_op = &accessfs_dir_inode_operations; ++ inode->i_fop = &accessfs_dir_file_operations; ++ break; ++ default: ++ BUG(); ++ break; ++ } ++} ++ ++static struct inode *accessfs_get_root_inode(struct super_block *sb) ++{ ++ struct inode *inode = new_inode(sb); ++ if (inode) { ++ mutex_lock(&accessfs_sem); ++/* inode->i_ino = accessfs_rootdir.node.ino; */ ++ accessfs_init_inode(inode, &accessfs_rootdir.node); ++ accessfs_rootdir.node.ino = inode->i_ino; ++ mutex_unlock(&accessfs_sem); ++ } ++ ++ return inode; ++} ++ ++static LIST_HEAD(hash); ++ 
++static int accessfs_node_init(struct accessfs_direntry *parent, ++ struct accessfs_entry *de, const char *name, ++ size_t len, struct access_attr *attr, mode_t mode) ++{ ++ static unsigned long ino = 1; ++ de->name = kmalloc(len + 1, GFP_KERNEL); ++ if (de->name == NULL) ++ return -ENOMEM; ++ ++ strncpy(de->name, name, len); ++ de->name[len] = 0; ++ de->ino = ++ino; ++ de->attr = attr; ++ de->attr->uid = 0; ++ de->attr->gid = 0; ++ de->attr->mode = mode; ++ ++ list_add_tail(&de->hash, &hash); ++ list_add_tail(&de->siblings, &parent->children); ++ return 0; ++} ++ ++static int accessfs_mknod(struct accessfs_direntry *dir, const char *name, ++ struct access_attr *attr) ++{ ++ struct accessfs_entry *pe; ++ pe = kmalloc(sizeof(struct accessfs_entry), GFP_KERNEL); ++ if (pe == NULL) ++ return -ENOMEM; ++ ++ accessfs_node_init(dir, pe, name, strlen(name), attr, ++ S_IFREG | attr->mode); ++ return 0; ++} ++ ++static struct accessfs_direntry *accessfs_mkdir(struct accessfs_direntry *parent, ++ const char *name, size_t len) ++{ ++ int err; ++ struct accessfs_direntry *dir; ++ dir = kmalloc(sizeof(struct accessfs_direntry), GFP_KERNEL); ++ if (dir == NULL) ++ return NULL; ++ ++ dir->parent = parent; ++ INIT_LIST_HEAD(&dir->children); ++ err = accessfs_node_init(parent, &dir->node, name, len, &dir->attr, ++ S_IFDIR | 0755); ++ if (err) { ++ kfree(dir); ++ dir = 0; ++ } ++ ++ return dir; ++} ++ ++struct accessfs_direntry *accessfs_make_dirpath(const char *name) ++{ ++ struct accessfs_direntry *dir = &accessfs_rootdir; ++ const char *slash; ++ mutex_lock(&accessfs_sem); ++ do { ++ struct accessfs_entry *de; ++ size_t len; ++ while (*name == '/') ++ ++name; ++ ++ slash = strchr(name, '/'); ++ len = slash ? slash - name : strlen(name); ++ de = accessfs_lookup_entry(&dir->node, name, len); ++ if (de == NULL) { ++ dir = accessfs_mkdir(dir, name, len); ++ } else if (S_ISDIR(de->attr->mode)) { ++ dir = (struct accessfs_direntry *) de; ++ } else { ++ dir = NULL; ++ } ++ ++ if (dir == NULL) ++ break; ++ ++ name = slash + 1; ++ } while (slash != NULL); ++ ++ mutex_unlock(&accessfs_sem); ++ return dir; ++} ++ ++static void accessfs_unlink(struct accessfs_entry *pe) ++{ ++ list_del_init(&pe->hash); ++ list_del_init(&pe->siblings); ++ kfree(pe->name); ++ kfree(pe); ++} ++ ++static int accessfs_notify_change(struct dentry *dentry, struct iattr *iattr) ++{ ++ struct accessfs_entry *pe; ++ struct inode *i = dentry->d_inode; ++ int err; ++ err = inode_change_ok(i, iattr); ++ if (err) ++ return err; ++ ++ setattr_copy(i, iattr); ++ ++ pe = (struct accessfs_entry *) i->i_private; ++ pe->attr->uid = i->i_uid; ++ pe->attr->gid = i->i_gid; ++ pe->attr->mode = i->i_mode; ++ return 0; ++} ++ ++static struct inode *accessfs_iget(struct super_block *sb, unsigned long ino) ++{ ++ struct list_head *list; ++ struct inode *inode = iget_locked(sb, ino); ++ if (!inode) ++ return ERR_PTR(-ENOMEM); ++ ++ if (!(inode->i_state & I_NEW)) ++ return inode; ++ ++ mutex_lock(&accessfs_sem); ++ list_for_each(list, &hash) { ++ struct accessfs_entry *pe; ++ pe = list_entry(list, struct accessfs_entry, hash); ++ if (pe->ino == ino) { ++ accessfs_init_inode(inode, pe); ++ break; ++ } ++ } ++ ++ mutex_unlock(&accessfs_sem); ++ return inode; ++} ++ ++static struct dentry *accessfs_lookup(struct inode *dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode = NULL; ++ struct accessfs_entry *pe; ++ mutex_lock(&accessfs_sem); ++ pe = accessfs_lookup_entry(dir->i_private, dentry->d_name.name, ++ dentry->d_name.len); ++ 
mutex_unlock(&accessfs_sem); ++ if (pe) ++ inode = accessfs_iget(dir->i_sb, pe->ino); ++ ++ d_add(dentry, inode); ++ return NULL; ++} ++ ++static struct inode_operations accessfs_inode_operations = { ++ .setattr = accessfs_notify_change, ++}; ++ ++static struct inode_operations accessfs_dir_inode_operations = { ++ .lookup = accessfs_lookup, ++ .setattr = accessfs_notify_change, ++}; ++ ++static struct file_operations accessfs_dir_file_operations = { ++ .readdir = accessfs_readdir, ++}; ++ ++static struct super_operations accessfs_ops = { ++ .statfs = simple_statfs, ++}; ++ ++static int accessfs_fill_super(struct super_block *sb, void *data, int silent) ++{ ++ struct inode *inode; ++ struct dentry *root; ++ ++ sb->s_blocksize = PAGE_CACHE_SIZE; ++ sb->s_blocksize_bits = PAGE_CACHE_SHIFT; ++ sb->s_magic = ACCESSFS_MAGIC; ++ sb->s_op = &accessfs_ops; ++ inode = accessfs_get_root_inode(sb); ++ if (!inode) ++ return -ENOMEM; ++ ++ root = d_alloc_root(inode); ++ if (!root) { ++ iput(inode); ++ return -ENOMEM; ++ } ++ ++ sb->s_root = root; ++ return 0; ++} ++ ++static struct dentry *accessfs_mount(struct file_system_type *fs_type, ++ int flags, const char *dev_name, void *data) ++{ ++ return mount_single(fs_type, flags, data, accessfs_fill_super); ++} ++ ++int accessfs_permitted(struct access_attr *p, int mask) ++{ ++ mode_t mode = p->mode; ++ if (current_fsuid() == p->uid) ++ mode >>= 6; ++ else if (in_group_p(p->gid)) ++ mode >>= 3; ++ ++ return (mode & mask) == mask; ++} ++ ++int accessfs_register(struct accessfs_direntry *dir, const char *name, ++ struct access_attr *attr) ++{ ++ int err; ++ if (dir == 0) ++ return -EINVAL; ++ ++ mutex_lock(&accessfs_sem); ++ err = accessfs_mknod(dir, name, attr); ++ mutex_unlock(&accessfs_sem); ++ return err; ++} ++ ++void accessfs_unregister(struct accessfs_direntry *dir, const char *name) ++{ ++ struct accessfs_entry *pe; ++ mutex_lock(&accessfs_sem); ++ pe = accessfs_lookup_entry(&dir->node, name, strlen(name)); ++ if (pe) ++ accessfs_unlink(pe); ++ ++ mutex_unlock(&accessfs_sem); ++} ++ ++static struct file_system_type accessfs_fs_type = { ++ .owner = THIS_MODULE, ++ .name = "accessfs", ++ .mount = accessfs_mount, ++ .kill_sb = kill_anon_super, ++}; ++ ++static int __init init_accessfs_fs(void) ++{ ++ ++ /* create mount point for accessfs */ ++ mountdir = proc_mkdir("access", NULL); ++ return register_filesystem(&accessfs_fs_type); ++} ++ ++static void __exit exit_accessfs_fs(void) ++{ ++ unregister_filesystem(&accessfs_fs_type); ++ remove_proc_entry("access", NULL); ++} ++ ++module_init(init_accessfs_fs) ++module_exit(exit_accessfs_fs) ++ ++MODULE_AUTHOR("Olaf Dietsche"); ++MODULE_DESCRIPTION("Access Filesystem"); ++MODULE_LICENSE("GPL v2"); ++ ++EXPORT_SYMBOL(accessfs_permitted); ++EXPORT_SYMBOL(accessfs_make_dirpath); ++EXPORT_SYMBOL(accessfs_register); ++EXPORT_SYMBOL(accessfs_unregister); +diff --git a/fs/accessfs/ip.c b/fs/accessfs/ip.c +new file mode 100644 +index 0000000..bddd2f0 +--- /dev/null ++++ b/fs/accessfs/ip.c +@@ -0,0 +1,101 @@ ++/* Copyright (c) 2002-2006 Olaf Dietsche ++ * ++ * User permission based port access for Linux. 
++ */ ++ ++#include ++#include ++#include ++#include ++ ++static int max_prot_sock = CONFIG_ACCESSFS_PROT_SOCK; ++#ifndef CONFIG_ACCESSFS_IGNORE_NET_BIND_SERVICE ++#define CONFIG_ACCESSFS_IGNORE_NET_BIND_SERVICE 0 ++#endif ++static int ignore_net_bind_service = CONFIG_ACCESSFS_IGNORE_NET_BIND_SERVICE; ++static struct access_attr *bind_to_port; ++ ++static int accessfs_ip_prot_sock(struct socket *sock, ++ struct sockaddr *uaddr, int addr_len) ++{ ++ struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; ++ unsigned short snum = ntohs(addr->sin_port); ++ if (snum && snum < max_prot_sock ++ && !accessfs_permitted(&bind_to_port[snum], MAY_EXEC) ++ && (ignore_net_bind_service || !capable(CAP_NET_BIND_SERVICE))) ++ return -EACCES; ++ ++ return 0; ++} ++ ++static int accessfs_ip6_prot_sock(struct socket *sock, ++ struct sockaddr *uaddr, int addr_len) ++{ ++ struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; ++ unsigned short snum = ntohs(addr->sin6_port); ++ if (snum && snum < max_prot_sock ++ && !accessfs_permitted(&bind_to_port[snum], MAY_EXEC) ++ && !capable(CAP_NET_BIND_SERVICE)) ++ return -EACCES; ++ ++ return 0; ++} ++ ++static struct net_hook_operations ip_net_ops = { ++ .ip_prot_sock = accessfs_ip_prot_sock, ++ .ip6_prot_sock = accessfs_ip6_prot_sock, ++}; ++ ++static int __init init_ip(void) ++{ ++ struct accessfs_direntry *dir = accessfs_make_dirpath("net/ip/bind"); ++ int i; ++ ++ if (max_prot_sock < PROT_SOCK) ++ max_prot_sock = PROT_SOCK; ++ else if (max_prot_sock > 65536) ++ max_prot_sock = 65536; ++ ++ bind_to_port = kmalloc(max_prot_sock * sizeof(*bind_to_port), ++ GFP_KERNEL); ++ if (bind_to_port == 0) ++ return -ENOMEM; ++ ++ for (i = 1; i < max_prot_sock; ++i) { ++ char buf[sizeof("65536")]; ++ bind_to_port[i].uid = 0; ++ bind_to_port[i].gid = 0; ++ bind_to_port[i].mode = i < PROT_SOCK ? S_IXUSR : S_IXUGO; ++ sprintf(buf, "%d", i); ++ accessfs_register(dir, buf, &bind_to_port[i]); ++ } ++ ++ net_hooks_register(&ip_net_ops); ++ return 0; ++} ++ ++static void __exit exit_ip(void) ++{ ++ struct accessfs_direntry *dir = accessfs_make_dirpath("net/ip/bind"); ++ int i; ++ net_hooks_unregister(&ip_net_ops); ++ for (i = 1; i < max_prot_sock; ++i) { ++ char buf[sizeof("65536")]; ++ sprintf(buf, "%d", i); ++ accessfs_unregister(dir, buf); ++ } ++ ++ if (bind_to_port != NULL) ++ kfree(bind_to_port); ++} ++ ++module_init(init_ip) ++module_exit(exit_ip) ++ ++MODULE_AUTHOR("Olaf Dietsche"); ++MODULE_DESCRIPTION("User based IP ports permission"); ++MODULE_LICENSE("GPL v2"); ++module_param(max_prot_sock, int, 0444); ++MODULE_PARM_DESC(max_prot_sock, "Number of protected ports"); ++module_param(ignore_net_bind_service, bool, 0644); ++MODULE_PARM_DESC(ignore_net_bind_service, "Ignore CAP_NET_BIND_SERVICE capability"); +diff --git a/include/linux/accessfs_fs.h b/include/linux/accessfs_fs.h +new file mode 100644 +index 0000000..ecd914e +--- /dev/null ++++ b/include/linux/accessfs_fs.h +@@ -0,0 +1,42 @@ ++/* -*- mode: c -*- */ ++#ifndef __accessfs_fs_h_included__ ++#define __accessfs_fs_h_included__ 1 ++ ++/* Copyright (c) 2001 Olaf Dietsche ++ * ++ * Access permission filesystem for Linux. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++struct access_attr { ++ uid_t uid; ++ gid_t gid; ++ mode_t mode; ++}; ++ ++struct accessfs_entry { ++ char *name; ++ struct list_head hash; ++ struct list_head siblings; ++ ino_t ino; ++ struct access_attr *attr; ++}; ++ ++struct accessfs_direntry { ++ struct accessfs_entry node; ++ struct accessfs_direntry *parent; ++ struct list_head children; ++ struct access_attr attr; ++}; ++ ++extern int accessfs_permitted(struct access_attr *p, int mask); ++extern struct accessfs_direntry *accessfs_make_dirpath(const char *name); ++extern int accessfs_register(struct accessfs_direntry *dir, const char *name, struct access_attr *attr); ++extern void accessfs_unregister(struct accessfs_direntry *dir, const char *name); ++ ++#endif +diff --git a/include/net/sock.h b/include/net/sock.h +index 32e3937..5fa9348 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -1860,4 +1860,47 @@ extern int sysctl_optmem_max; + extern __u32 sysctl_wmem_default; + extern __u32 sysctl_rmem_default; + ++/* Networking hooks */ ++extern int default_ip_prot_sock(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len); ++extern int default_ip6_prot_sock(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len); ++#ifdef CONFIG_NET_HOOKS ++struct net_hook_operations { ++ int (*ip_prot_sock)(struct socket *sock, ++ struct sockaddr *uaddr, int addr_len); ++ int (*ip6_prot_sock)(struct socket *sock, ++ struct sockaddr *uaddr, int addr_len); ++}; ++ ++extern struct net_hook_operations *net_ops; ++ ++extern void net_hooks_register(struct net_hook_operations *ops); ++extern void net_hooks_unregister(struct net_hook_operations *ops); ++ ++static inline int ip_prot_sock(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len) ++{ ++ return net_ops->ip_prot_sock(sock, uaddr, addr_len); ++} ++ ++static inline int ip6_prot_sock(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len) ++{ ++ return net_ops->ip6_prot_sock(sock, uaddr, addr_len); ++} ++#else ++static inline int ip_prot_sock(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len) ++{ ++ return default_ip_prot_sock(sock, uaddr, addr_len); ++} ++ ++static inline int ip6_prot_sock(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len) ++{ ++ return default_ip6_prot_sock(sock, uaddr, addr_len); ++} ++#endif ++ + #endif /* _SOCK_H */ +diff --git a/net/Kconfig b/net/Kconfig +index a073148..bb5fb42 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -75,6 +75,18 @@ config INET + if INET + source "net/ipv4/Kconfig" + source "net/ipv6/Kconfig" ++ ++config NET_HOOKS ++ bool "IP: Networking hooks (Experimental)" ++ depends on INET && EXPERIMENTAL ++ default n ++ help ++ This option enables other kernel parts or modules to hook into the ++ networking area and provide fine grained control over the access to ++ IP ports. ++ ++ If you're unsure, say N. ++ + source "net/netlabel/Kconfig" + + endif # if INET +diff --git a/net/Makefile b/net/Makefile +index acdde49..4e5dc79 100644 +--- a/net/Makefile ++++ b/net/Makefile +@@ -61,6 +61,7 @@ + obj-$(CONFIG_IEEE802154) += ieee802154/ + obj-$(CONFIG_MAC802154) += mac802154/ + ++obj-$(CONFIG_NET) += hooks.o + ifeq ($(CONFIG_NET),y) + obj-$(CONFIG_SYSCTL) += sysctl_net.o + endif +diff --git a/net/hooks.c b/net/hooks.c +new file mode 100644 +index 0000000..33100e6 +--- /dev/null ++++ b/net/hooks.c +@@ -0,0 +1,55 @@ ++/* Copyright (c) 2002 Olaf Dietsche ++ * ++ * Networking hooks. Currently for IPv4 and IPv6 only. 
++ */ ++ ++#include ++#include ++#include ++#include ++ ++int default_ip_prot_sock(struct socket *sock, struct sockaddr *uaddr, int addr_len) ++{ ++ struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; ++ unsigned short snum = ntohs(addr->sin_port); ++ if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) ++ return -EACCES; ++ ++ return 0; ++} ++ ++int default_ip6_prot_sock(struct socket *sock, struct sockaddr *uaddr, int addr_len) ++{ ++ struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; ++ unsigned short snum = ntohs(addr->sin6_port); ++ if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) ++ return -EACCES; ++ ++ return 0; ++} ++ ++EXPORT_SYMBOL(default_ip_prot_sock); ++EXPORT_SYMBOL(default_ip6_prot_sock); ++ ++#ifdef CONFIG_NET_HOOKS ++static struct net_hook_operations default_net_ops = { ++ .ip_prot_sock = default_ip_prot_sock, ++ .ip6_prot_sock = default_ip6_prot_sock, ++}; ++ ++struct net_hook_operations *net_ops = &default_net_ops; ++ ++void net_hooks_register(struct net_hook_operations *ops) ++{ ++ net_ops = ops; ++} ++ ++void net_hooks_unregister(struct net_hook_operations *ops) ++{ ++ net_ops = &default_net_ops; ++} ++ ++EXPORT_SYMBOL(net_ops); ++EXPORT_SYMBOL(net_hooks_register); ++EXPORT_SYMBOL(net_hooks_unregister); ++#endif +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c +index 1b5096a..9460a3c 100644 +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -495,7 +495,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) + + snum = ntohs(addr->sin_port); + err = -EACCES; +- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) ++ if (ip_prot_sock(sock, uaddr, addr_len)) + goto out; + + /* We keep a pair of addresses. rcv_saddr is the one +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c +index d27c797..154b1ec 100644 +--- a/net/ipv6/af_inet6.c ++++ b/net/ipv6/af_inet6.c +@@ -281,7 +281,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) + return -EINVAL; + + snum = ntohs(addr->sin6_port); +- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) ++ if (ip6_prot_sock(sock, uaddr, addr_len)) + return -EACCES; + + lock_sock(sk); + diff --git a/3.2.34/ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch b/3.2.34/ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch new file mode 100644 index 0000000..7af90e4 --- /dev/null +++ b/3.2.34/ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch @@ -0,0 +1,36 @@ +>From 9f04e51293b130474504216a477bb2a73cbf59e1 Mon Sep 17 00:00:00 2001 +From: Anssi Hannula +Date: Thu, 22 Mar 2012 22:29:11 +0200 +Subject: [PATCH] ata: prefer ata drivers over ide drivers when both are built + +Currently the old IDE drivers are preferred over ATA drivers when both +are built, since ide/ is listed first in drivers/Makefile and therefore +the IDE drivers end up before ATA drivers in modules.order which is used +by depmod/modprobe for module ordering. + +Change it so that ATA drivers are preferred over IDE driver by moving +the ide/ entry under ata/ in drivers/Makefile. 
+ +Signed-off-by: Anssi Hannula +--- + drivers/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/Makefile b/drivers/Makefile +index 932e8bf..e8df3d0 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -47,9 +47,9 @@ obj-$(CONFIG_PARPORT) += parport/ + obj-y += base/ block/ misc/ mfd/ nfc/ + obj-$(CONFIG_NUBUS) += nubus/ + obj-y += macintosh/ +-obj-$(CONFIG_IDE) += ide/ + obj-$(CONFIG_SCSI) += scsi/ + obj-$(CONFIG_ATA) += ata/ ++obj-$(CONFIG_IDE) += ide/ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ +-- +1.7.9.3 + diff --git a/3.2.34/aufs3-standalone-3.2.patch b/3.2.34/aufs3-standalone-3.2.patch new file mode 100644 index 0000000..9a3c1db --- /dev/null +++ b/3.2.34/aufs3-standalone-3.2.patch @@ -0,0 +1,30657 @@ +diff -uNr linux-3.2.0-gentoo-r1.orig//Documentation/ABI/testing/debugfs-aufs linux-3.2.0-gentoo-r1/Documentation/ABI/testing/debugfs-aufs +--- linux-3.2.0-gentoo-r1.orig//Documentation/ABI/testing/debugfs-aufs 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/Documentation/ABI/testing/debugfs-aufs 2012-01-17 12:11:16.226894357 +0100 +@@ -0,0 +1,37 @@ ++What: /debug/aufs/si_/ ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ Under /debug/aufs, a directory named si_ is created ++ per aufs mount, where is a unique id generated ++ internally. ++ ++What: /debug/aufs/si_/xib ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the consumed blocks by xib (External Inode Number ++ Bitmap), its block size and file size. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see the aufs manual. ++ ++What: /debug/aufs/si_/xino0, xino1 ... xinoN ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the consumed blocks by xino (External Inode Number ++ Translation Table), its link count, block size and file ++ size. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see the aufs manual. ++ ++What: /debug/aufs/si_/xigen ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the consumed blocks by xigen (External Inode ++ Generation Table), its block size and file size. ++ If CONFIG_AUFS_EXPORT is disabled, this entry will not ++ be created. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see the aufs manual. +diff -uNr linux-3.2.0-gentoo-r1.orig//Documentation/ABI/testing/sysfs-aufs linux-3.2.0-gentoo-r1/Documentation/ABI/testing/sysfs-aufs +--- linux-3.2.0-gentoo-r1.orig//Documentation/ABI/testing/sysfs-aufs 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/Documentation/ABI/testing/sysfs-aufs 2012-01-17 12:11:16.226894357 +0100 +@@ -0,0 +1,24 @@ ++What: /sys/fs/aufs/si_/ ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ Under /sys/fs/aufs, a directory named si_ is created ++ per aufs mount, where is a unique id generated ++ internally. ++ ++What: /sys/fs/aufs/si_/br0, br1 ... brN ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the abolute path of a member directory (which ++ is called branch) in aufs, and its permission. ++ ++What: /sys/fs/aufs/si_/xi_path ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the abolute path of XINO (External Inode Number ++ Bitmap, Translation Table and Generation Table) file ++ even if it is the default path. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. 
About XINO files, see the aufs manual. +diff -uNr linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/01intro.txt linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/01intro.txt +--- linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/01intro.txt 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/01intro.txt 2012-01-17 12:11:16.263931727 +0100 +@@ -0,0 +1,162 @@ ++ ++# Copyright (C) 2005-2011 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write to the Free Software ++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ ++Introduction ++---------------------------------------- ++ ++aufs [ei ju: ef es] | [a u f s] ++1. abbrev. for "advanced multi-layered unification filesystem". ++2. abbrev. for "another unionfs". ++3. abbrev. for "auf das" in German which means "on the" in English. ++ Ex. "Butter aufs Brot"(G) means "butter onto bread"(E). ++ But "Filesystem aufs Filesystem" is hard to understand. ++ ++AUFS is a filesystem with features: ++- multi layered stackable unification filesystem, the member directory ++ is called as a branch. ++- branch permission and attribute, 'readonly', 'real-readonly', ++ 'readwrite', 'whiteout-able', 'link-able whiteout' and their ++ combination. ++- internal "file copy-on-write". ++- logical deletion, whiteout. ++- dynamic branch manipulation, adding, deleting and changing permission. ++- allow bypassing aufs, user's direct branch access. ++- external inode number translation table and bitmap which maintains the ++ persistent aufs inode number. ++- seekable directory, including NFS readdir. ++- file mapping, mmap and sharing pages. ++- pseudo-link, hardlink over branches. ++- loopback mounted filesystem as a branch. ++- several policies to select one among multiple writable branches. ++- revert a single systemcall when an error occurs in aufs. ++- and more... ++ ++ ++Multi Layered Stackable Unification Filesystem ++---------------------------------------------------------------------- ++Most people already knows what it is. ++It is a filesystem which unifies several directories and provides a ++merged single directory. When users access a file, the access will be ++passed/re-directed/converted (sorry, I am not sure which English word is ++correct) to the real file on the member filesystem. The member ++filesystem is called 'lower filesystem' or 'branch' and has a mode ++'readonly' and 'readwrite.' And the deletion for a file on the lower ++readonly branch is handled by creating 'whiteout' on the upper writable ++branch. ++ ++On LKML, there have been discussions about UnionMount (Jan Blunck, ++Bharata B Rao and Valerie Aurora) and Unionfs (Erez Zadok). They took ++different approaches to implement the merged-view. ++The former tries putting it into VFS, and the latter implements as a ++separate filesystem. 
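As a rough illustration of the merged-view lookup described above (a hypothetical sketch, not aufs code and not part of any patch in this series), the top-down scan over branches can be pictured like this, with a whiteout on an upper branch hiding everything below it:

enum lookup_state { ENT_NONE, ENT_FILE, ENT_WHITEOUT };

struct branch {
	const char *path;				/* mount point of the member dir */
	enum lookup_state (*lookup)(const char *name);	/* how this branch answers a name */
};

/* Top-down scan: the first real entry wins, a whiteout on an upper
 * branch stops the scan and hides every lower branch. */
static int union_find(const struct branch *br, int nbr, const char *name)
{
	int i;

	for (i = 0; i < nbr; i++) {
		switch (br[i].lookup(name)) {
		case ENT_FILE:
			return i;	/* serve the file from branch i */
		case ENT_WHITEOUT:
			return -1;	/* logically deleted, stop here */
		case ENT_NONE:
			break;		/* keep digging down */
		}
	}
	return -1;			/* not found on any branch */
}

The struct and the lookup callback are purely illustrative stand-ins for the per-branch dentry arrays that the design documents below describe.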
++(If I misunderstand about these implementations, please let me know and ++I shall correct it. Because it is a long time ago when I read their ++source files last time). ++ ++UnionMount's approach will be able to small, but may be hard to share ++branches between several UnionMount since the whiteout in it is ++implemented in the inode on branch filesystem and always ++shared. According to Bharata's post, readdir does not seems to be ++finished yet. ++There are several missing features known in this implementations such as ++- for users, the inode number may change silently. eg. copy-up. ++- link(2) may break by copy-up. ++- read(2) may get an obsoleted filedata (fstat(2) too). ++- fcntl(F_SETLK) may be broken by copy-up. ++- unnecessary copy-up may happen, for example mmap(MAP_PRIVATE) after ++ open(O_RDWR). ++ ++Unionfs has a longer history. When I started implementing a stacking filesystem ++(Aug 2005), it already existed. It has virtual super_block, inode, ++dentry and file objects and they have an array pointing lower same kind ++objects. After contributing many patches for Unionfs, I re-started my ++project AUFS (Jun 2006). ++ ++In AUFS, the structure of filesystem resembles to Unionfs, but I ++implemented my own ideas, approaches and enhancements and it became ++totally different one. ++ ++Comparing DM snapshot and fs based implementation ++- the number of bytes to be copied between devices is much smaller. ++- the type of filesystem must be one and only. ++- the fs must be writable, no readonly fs, even for the lower original ++ device. so the compression fs will not be usable. but if we use ++ loopback mount, we may address this issue. ++ for instance, ++ mount /cdrom/squashfs.img /sq ++ losetup /sq/ext2.img ++ losetup /somewhere/cow ++ dmsetup "snapshot /dev/loop0 /dev/loop1 ..." ++- it will be difficult (or needs more operations) to extract the ++ difference between the original device and COW. ++- DM snapshot-merge may help a lot when users try merging. in the ++ fs-layer union, users will use rsync(1). ++ ++ ++Several characters/aspects of aufs ++---------------------------------------------------------------------- ++ ++Aufs has several characters or aspects. ++1. a filesystem, callee of VFS helper ++2. sub-VFS, caller of VFS helper for branches ++3. a virtual filesystem which maintains persistent inode number ++4. reader/writer of files on branches such like an application ++ ++1. Callee of VFS Helper ++As an ordinary linux filesystem, aufs is a callee of VFS. For instance, ++unlink(2) from an application reaches sys_unlink() kernel function and ++then vfs_unlink() is called. vfs_unlink() is one of VFS helper and it ++calls filesystem specific unlink operation. Actually aufs implements the ++unlink operation but it behaves like a redirector. ++ ++2. Caller of VFS Helper for Branches ++aufs_unlink() passes the unlink request to the branch filesystem as if ++it were called from VFS. So the called unlink operation of the branch ++filesystem acts as usual. As a caller of VFS helper, aufs should handle ++every necessary pre/post operation for the branch filesystem. ++- acquire the lock for the parent dir on a branch ++- lookup in a branch ++- revalidate dentry on a branch ++- mnt_want_write() for a branch ++- vfs_unlink() for a branch ++- mnt_drop_write() for a branch ++- release the lock on a branch ++ ++3. Persistent Inode Number ++One of the most important issue for a filesystem is to maintain inode ++numbers. 
This is particularly important to support exporting a ++filesystem via NFS. Aufs is a virtual filesystem which doesn't have a ++backend block device for its own. But some storage is necessary to ++maintain inode number. It may be a large space and may not suit to keep ++in memory. Aufs rents some space from its first writable branch ++filesystem (by default) and creates file(s) on it. These files are ++created by aufs internally and removed soon (currently) keeping opened. ++Note: Because these files are removed, they are totally gone after ++ unmounting aufs. It means the inode numbers are not persistent ++ across unmount or reboot. I have a plan to make them really ++ persistent which will be important for aufs on NFS server. ++ ++4. Read/Write Files Internally (copy-on-write) ++Because a branch can be readonly, when you write a file on it, aufs will ++"copy-up" it to the upper writable branch internally. And then write the ++originally requested thing to the file. Generally kernel doesn't ++open/read/write file actively. In aufs, even a single write may cause a ++internal "file copy". This behaviour is very similar to cp(1) command. ++ ++Some people may think it is better to pass such work to user space ++helper, instead of doing in kernel space. Actually I am still thinking ++about it. But currently I have implemented it in kernel space. +diff -uNr linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/02struct.txt linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/02struct.txt +--- linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/02struct.txt 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/02struct.txt 2012-01-17 12:11:16.280135577 +0100 +@@ -0,0 +1,226 @@ ++ ++# Copyright (C) 2005-2011 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write to the Free Software ++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ ++Basic Aufs Internal Structure ++ ++Superblock/Inode/Dentry/File Objects ++---------------------------------------------------------------------- ++As like an ordinary filesystem, aufs has its own ++superblock/inode/dentry/file objects. All these objects have a ++dynamically allocated array and store the same kind of pointers to the ++lower filesystem, branch. ++For example, when you build a union with one readwrite branch and one ++readonly, mounted /au, /rw and /ro respectively. ++- /au = /rw + /ro ++- /ro/fileA exists but /rw/fileA ++ ++Aufs lookup operation finds /ro/fileA and gets dentry for that. These ++pointers are stored in a aufs dentry. The array in aufs dentry will be, ++- [0] = NULL ++- [1] = /ro/fileA ++ ++This style of an array is essentially same to the aufs ++superblock/inode/dentry/file objects. ++ ++Because aufs supports manipulating branches, ie. add/delete/change ++dynamically, these objects has its own generation. 
When branches are ++changed, the generation in aufs superblock is incremented. And a ++generation in other object are compared when it is accessed. ++When a generation in other objects are obsoleted, aufs refreshes the ++internal array. ++ ++ ++Superblock ++---------------------------------------------------------------------- ++Additionally aufs superblock has some data for policies to select one ++among multiple writable branches, XIB files, pseudo-links and kobject. ++See below in detail. ++About the policies which supports copy-down a directory, see policy.txt ++too. ++ ++ ++Branch and XINO(External Inode Number Translation Table) ++---------------------------------------------------------------------- ++Every branch has its own xino (external inode number translation table) ++file. The xino file is created and unlinked by aufs internally. When two ++members of a union exist on the same filesystem, they share the single ++xino file. ++The struct of a xino file is simple, just a sequence of aufs inode ++numbers which is indexed by the lower inode number. ++In the above sample, assume the inode number of /ro/fileA is i111 and ++aufs assigns the inode number i999 for fileA. Then aufs writes 999 as ++4(8) bytes at 111 * 4(8) bytes offset in the xino file. ++ ++When the inode numbers are not contiguous, the xino file will be sparse ++which has a hole in it and doesn't consume as much disk space as it ++might appear. If your branch filesystem consumes disk space for such ++holes, then you should specify 'xino=' option at mounting aufs. ++ ++Also a writable branch has three kinds of "whiteout bases". All these ++are existed when the branch is joined to aufs and the names are ++whiteout-ed doubly, so that users will never see their names in aufs ++hierarchy. ++1. a regular file which will be linked to all whiteouts. ++2. a directory to store a pseudo-link. ++3. a directory to store an "orphan-ed" file temporary. ++ ++1. Whiteout Base ++ When you remove a file on a readonly branch, aufs handles it as a ++ logical deletion and creates a whiteout on the upper writable branch ++ as a hardlink of this file in order not to consume inode on the ++ writable branch. ++2. Pseudo-link Dir ++ See below, Pseudo-link. ++3. Step-Parent Dir ++ When "fileC" exists on the lower readonly branch only and it is ++ opened and removed with its parent dir, and then user writes ++ something into it, then aufs copies-up fileC to this ++ directory. Because there is no other dir to store fileC. After ++ creating a file under this dir, the file is unlinked. ++ ++Because aufs supports manipulating branches, ie. add/delete/change ++dynamically, a branch has its own id. When the branch order changes, aufs ++finds the new index by searching the branch id. ++ ++ ++Pseudo-link ++---------------------------------------------------------------------- ++Assume "fileA" exists on the lower readonly branch only and it is ++hardlinked to "fileB" on the branch. When you write something to fileA, ++aufs copies-up it to the upper writable branch. Additionally aufs ++creates a hardlink under the Pseudo-link Directory of the writable ++branch. The inode of a pseudo-link is kept in aufs super_block as a ++simple list. If fileB is read after unlinking fileA, aufs returns ++filedata from the pseudo-link instead of the lower readonly ++branch. Because the pseudo-link is based upon the inode, to keep the ++inode number by xino (see above) is important. 
++ ++All the hardlinks under the Pseudo-link Directory of the writable branch ++should be restored in a proper location later. Aufs provides a utility ++to do this. The userspace helpers executed at remounting and unmounting ++aufs by default. ++During this utility is running, it puts aufs into the pseudo-link ++maintenance mode. In this mode, only the process which began the ++maintenance mode (and its child processes) is allowed to operate in ++aufs. Some other processes which are not related to the pseudo-link will ++be allowed to run too, but the rest have to return an error or wait ++until the maintenance mode ends. If a process already acquires an inode ++mutex (in VFS), it has to return an error. ++ ++ ++XIB(external inode number bitmap) ++---------------------------------------------------------------------- ++Addition to the xino file per a branch, aufs has an external inode number ++bitmap in a superblock object. It is also a file such like a xino file. ++It is a simple bitmap to mark whether the aufs inode number is in-use or ++not. ++To reduce the file I/O, aufs prepares a single memory page to cache xib. ++ ++Aufs implements a feature to truncate/refresh both of xino and xib to ++reduce the number of consumed disk blocks for these files. ++ ++ ++Virtual or Vertical Dir, and Readdir in Userspace ++---------------------------------------------------------------------- ++In order to support multiple layers (branches), aufs readdir operation ++constructs a virtual dir block on memory. For readdir, aufs calls ++vfs_readdir() internally for each dir on branches, merges their entries ++with eliminating the whiteout-ed ones, and sets it to file (dir) ++object. So the file object has its entry list until it is closed. The ++entry list will be updated when the file position is zero and becomes ++old. This decision is made in aufs automatically. ++ ++The dynamically allocated memory block for the name of entries has a ++unit of 512 bytes (by default) and stores the names contiguously (no ++padding). Another block for each entry is handled by kmem_cache too. ++During building dir blocks, aufs creates hash list and judging whether ++the entry is whiteouted by its upper branch or already listed. ++The merged result is cached in the corresponding inode object and ++maintained by a customizable life-time option. ++ ++Some people may call it can be a security hole or invite DoS attack ++since the opened and once readdir-ed dir (file object) holds its entry ++list and becomes a pressure for system memory. But I'd say it is similar ++to files under /proc or /sys. The virtual files in them also holds a ++memory page (generally) while they are opened. When an idea to reduce ++memory for them is introduced, it will be applied to aufs too. ++For those who really hate this situation, I've developed readdir(3) ++library which operates this merging in userspace. You just need to set ++LD_PRELOAD environment variable, and aufs will not consume no memory in ++kernel space for readdir(3). ++ ++ ++Workqueue ++---------------------------------------------------------------------- ++Aufs sometimes requires privilege access to a branch. For instance, ++in copy-up/down operation. When a user process is going to make changes ++to a file which exists in the lower readonly branch only, and the mode ++of one of ancestor directories may not be writable by a user ++process. Here aufs copy-up the file with its ancestors and they may ++require privilege to set its owner/group/mode/etc. 
++This is a typical case of a application character of aufs (see ++Introduction). ++ ++Aufs uses workqueue synchronously for this case. It creates its own ++workqueue. The workqueue is a kernel thread and has privilege. Aufs ++passes the request to call mkdir or write (for example), and wait for ++its completion. This approach solves a problem of a signal handler ++simply. ++If aufs didn't adopt the workqueue and changed the privilege of the ++process, and if the mkdir/write call arises SIGXFSZ or other signal, ++then the user process might gain a privilege or the generated core file ++was owned by a superuser. ++ ++Also aufs uses the system global workqueue ("events" kernel thread) too ++for asynchronous tasks, such like handling inotify/fsnotify, re-creating a ++whiteout base and etc. This is unrelated to a privilege. ++Most of aufs operation tries acquiring a rw_semaphore for aufs ++superblock at the beginning, at the same time waits for the completion ++of all queued asynchronous tasks. ++ ++ ++Whiteout ++---------------------------------------------------------------------- ++The whiteout in aufs is very similar to Unionfs's. That is represented ++by its filename. UnionMount takes an approach of a file mode, but I am ++afraid several utilities (find(1) or something) will have to support it. ++ ++Basically the whiteout represents "logical deletion" which stops aufs to ++lookup further, but also it represents "dir is opaque" which also stop ++lookup. ++ ++In aufs, rmdir(2) and rename(2) for dir uses whiteout alternatively. ++In order to make several functions in a single systemcall to be ++revertible, aufs adopts an approach to rename a directory to a temporary ++unique whiteouted name. ++For example, in rename(2) dir where the target dir already existed, aufs ++renames the target dir to a temporary unique whiteouted name before the ++actual rename on a branch and then handles other actions (make it opaque, ++update the attributes, etc). If an error happens in these actions, aufs ++simply renames the whiteouted name back and returns an error. If all are ++succeeded, aufs registers a function to remove the whiteouted unique ++temporary name completely and asynchronously to the system global ++workqueue. ++ ++ ++Copy-up ++---------------------------------------------------------------------- ++It is a well-known feature or concept. ++When user modifies a file on a readonly branch, aufs operate "copy-up" ++internally and makes change to the new file on the upper writable branch. ++When the trigger systemcall does not update the timestamps of the parent ++dir, aufs reverts it after copy-up. +diff -uNr linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/03lookup.txt linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/03lookup.txt +--- linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/03lookup.txt 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/03lookup.txt 2012-01-17 12:11:16.294024590 +0100 +@@ -0,0 +1,106 @@ ++ ++# Copyright (C) 2005-2011 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. 
++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write to the Free Software ++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ ++Lookup in a Branch ++---------------------------------------------------------------------- ++Since aufs has a character of sub-VFS (see Introduction), it operates ++lookup for branches as VFS does. It may be a heavy work. Generally ++speaking struct nameidata is a bigger structure and includes many ++information. But almost all lookup operation in aufs is the simplest ++case, ie. lookup only an entry directly connected to its parent. Digging ++down the directory hierarchy is unnecessary. ++ ++VFS has a function lookup_one_len() for that use, but it is not usable ++for a branch filesystem which requires struct nameidata. So aufs ++implements a simple lookup wrapper function. When a branch filesystem ++allows NULL as nameidata, it calls lookup_one_len(). Otherwise it builds ++a simplest nameidata and calls lookup_hash(). ++Here aufs applies "a principle in NFSD", ie. if the filesystem supports ++NFS-export, then it has to support NULL as a nameidata parameter for ++->create(), ->lookup() and ->d_revalidate(). So the lookup wrapper in ++aufs tests if ->s_export_op in the branch is NULL or not. ++ ++When a branch is a remote filesystem, aufs basically trusts its ++->d_revalidate(), also aufs forces the hardest revalidate tests for ++them. ++For d_revalidate, aufs implements three levels of revalidate tests. See ++"Revalidate Dentry and UDBA" in detail. ++ ++ ++Loopback Mount ++---------------------------------------------------------------------- ++Basically aufs supports any type of filesystem and block device for a ++branch (actually there are some exceptions). But it is prohibited to add ++a loopback mounted one whose backend file exists in a filesystem which is ++already added to aufs. The reason is to protect aufs from a recursive ++lookup. If it was allowed, the aufs lookup operation might re-enter a ++lookup for the loopback mounted branch in the same context, and will ++cause a deadlock. ++ ++ ++Revalidate Dentry and UDBA (User's Direct Branch Access) ++---------------------------------------------------------------------- ++Generally VFS helpers re-validate a dentry as a part of lookup. ++0. digging down the directory hierarchy. ++1. lock the parent dir by its i_mutex. ++2. lookup the final (child) entry. ++3. revalidate it. ++4. call the actual operation (create, unlink, etc.) ++5. unlock the parent dir ++ ++If the filesystem implements its ->d_revalidate() (step 3), then it is ++called. Actually aufs implements it and checks the dentry on a branch is ++still valid. ++But it is not enough. Because aufs has to release the lock for the ++parent dir on a branch at the end of ->lookup() (step 2) and ++->d_revalidate() (step 3) while the i_mutex of the aufs dir is still ++held by VFS. ++If the file on a branch is changed directly, eg. bypassing aufs, after ++aufs released the lock, then the subsequent operation may cause ++something unpleasant result. ++ ++This situation is a result of VFS architecture, ->lookup() and ++->d_revalidate() is separated. But I never say it is wrong. 
It is a good ++design from VFS's point of view. It is just not suitable for sub-VFS ++character in aufs. ++ ++Aufs supports such case by three level of revalidation which is ++selectable by user. ++1. Simple Revalidate ++ Addition to the native flow in VFS's, confirm the child-parent ++ relationship on the branch just after locking the parent dir on the ++ branch in the "actual operation" (step 4). When this validation ++ fails, aufs returns EBUSY. ->d_revalidate() (step 3) in aufs still ++ checks the validation of the dentry on branches. ++2. Monitor Changes Internally by Inotify/Fsnotify ++ Addition to above, in the "actual operation" (step 4) aufs re-lookup ++ the dentry on the branch, and returns EBUSY if it finds different ++ dentry. ++ Additionally, aufs sets the inotify/fsnotify watch for every dir on branches ++ during it is in cache. When the event is notified, aufs registers a ++ function to kernel 'events' thread by schedule_work(). And the ++ function sets some special status to the cached aufs dentry and inode ++ private data. If they are not cached, then aufs has nothing to ++ do. When the same file is accessed through aufs (step 0-3) later, ++ aufs will detect the status and refresh all necessary data. ++ In this mode, aufs has to ignore the event which is fired by aufs ++ itself. ++3. No Extra Validation ++ This is the simplest test and doesn't add any additional revalidation ++ test, and skip therevalidatin in step 4. It is useful and improves ++ aufs performance when system surely hide the aufs branches from user, ++ by over-mounting something (or another method). +diff -uNr linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/04branch.txt linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/04branch.txt +--- linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/04branch.txt 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/04branch.txt 2012-01-17 12:11:16.310228440 +0100 +@@ -0,0 +1,76 @@ ++ ++# Copyright (C) 2005-2011 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write to the Free Software ++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ ++Branch Manipulation ++ ++Since aufs supports dynamic branch manipulation, ie. add/remove a branch ++and changing its permission/attribute, there are a lot of works to do. ++ ++ ++Add a Branch ++---------------------------------------------------------------------- ++o Confirm the adding dir exists outside of aufs, including loopback ++ mount. ++- and other various attributes... ++o Initialize the xino file and whiteout bases if necessary. ++ See struct.txt. ++ ++o Check the owner/group/mode of the directory ++ When the owner/group/mode of the adding directory differs from the ++ existing branch, aufs issues a warning because it may impose a ++ security risk. 
++ For example, when a upper writable branch has a world writable empty ++ top directory, a malicious user can create any files on the writable ++ branch directly, like copy-up and modify manually. If something like ++ /etc/{passwd,shadow} exists on the lower readonly branch but the upper ++ writable branch, and the writable branch is world-writable, then a ++ malicious guy may create /etc/passwd on the writable branch directly ++ and the infected file will be valid in aufs. ++ I am afraid it can be a security issue, but nothing to do except ++ producing a warning. ++ ++ ++Delete a Branch ++---------------------------------------------------------------------- ++o Confirm the deleting branch is not busy ++ To be general, there is one merit to adopt "remount" interface to ++ manipulate branches. It is to discard caches. At deleting a branch, ++ aufs checks the still cached (and connected) dentries and inodes. If ++ there are any, then they are all in-use. An inode without its ++ corresponding dentry can be alive alone (for example, inotify/fsnotify case). ++ ++ For the cached one, aufs checks whether the same named entry exists on ++ other branches. ++ If the cached one is a directory, because aufs provides a merged view ++ to users, as long as one dir is left on any branch aufs can show the ++ dir to users. In this case, the branch can be removed from aufs. ++ Otherwise aufs rejects deleting the branch. ++ ++ If any file on the deleting branch is opened by aufs, then aufs ++ rejects deleting. ++ ++ ++Modify the Permission of a Branch ++---------------------------------------------------------------------- ++o Re-initialize or remove the xino file and whiteout bases if necessary. ++ See struct.txt. ++ ++o rw --> ro: Confirm the modifying branch is not busy ++ Aufs rejects the request if any of these conditions are true. ++ - a file on the branch is mmap-ed. ++ - a regular file on the branch is opened for write and there is no ++ same named entry on the upper branch. +diff -uNr linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/05wbr_policy.txt linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/05wbr_policy.txt +--- linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/05wbr_policy.txt 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/05wbr_policy.txt 2012-01-17 12:11:16.333376797 +0100 +@@ -0,0 +1,65 @@ ++ ++# Copyright (C) 2005-2011 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write to the Free Software ++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ ++Policies to Select One among Multiple Writable Branches ++---------------------------------------------------------------------- ++When the number of writable branch is more than one, aufs has to decide ++the target branch for file creation or copy-up. 
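The creation policies named in the rest of this section (top-down-parent by default, plus round-robin and most-free-space on request) can be pictured with a deliberately simplified sketch; the structure and helpers below are hypothetical, not aufs code:

#include <stdint.h>

struct wbranch {
	int		writable;	/* branch permission allows creation */
	uint64_t	free_bytes;	/* refreshed at a user-set interval */
};

/* round-robin: rotate over the writable branches for each new file */
static int wbr_create_rr(const struct wbranch *br, int nbr, int *last)
{
	int i, idx;

	for (i = 1; i <= nbr; i++) {
		idx = (*last + i) % nbr;
		if (br[idx].writable) {
			*last = idx;
			return idx;
		}
	}
	return -1;
}

/* most-free-space: pick the writable branch with the most room left */
static int wbr_create_mfs(const struct wbranch *br, int nbr)
{
	int i, best = -1;

	for (i = 0; i < nbr; i++)
		if (br[i].writable &&
		    (best < 0 || br[i].free_bytes > br[best].free_bytes))
			best = i;
	return best;
}

The rules and exceptions listed at the end of this section (opaque parents, whiteouts, link(2)/rename(2)) are applied on top of whichever branch such a policy returns.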
By default, the highest ++writable branch which has the parent (or ancestor) dir of the target ++file is chosen (top-down-parent policy). ++By user's request, aufs implements some other policies to select the ++writable branch, for file creation two policies, round-robin and ++most-free-space policies. For copy-up three policies, top-down-parent, ++bottom-up-parent and bottom-up policies. ++ ++As expected, the round-robin policy selects the branch in circular. When ++you have two writable branches and creates 10 new files, 5 files will be ++created for each branch. mkdir(2) systemcall is an exception. When you ++create 10 new directories, all will be created on the same branch. ++And the most-free-space policy selects the one which has most free ++space among the writable branches. The amount of free space will be ++checked by aufs internally, and users can specify its time interval. ++ ++The policies for copy-up is more simple, ++top-down-parent is equivalent to the same named on in create policy, ++bottom-up-parent selects the writable branch where the parent dir ++exists and the nearest upper one from the copyup-source, ++bottom-up selects the nearest upper writable branch from the ++copyup-source, regardless the existence of the parent dir. ++ ++There are some rules or exceptions to apply these policies. ++- If there is a readonly branch above the policy-selected branch and ++ the parent dir is marked as opaque (a variation of whiteout), or the ++ target (creating) file is whiteout-ed on the upper readonly branch, ++ then the result of the policy is ignored and the target file will be ++ created on the nearest upper writable branch than the readonly branch. ++- If there is a writable branch above the policy-selected branch and ++ the parent dir is marked as opaque or the target file is whiteouted ++ on the branch, then the result of the policy is ignored and the target ++ file will be created on the highest one among the upper writable ++ branches who has diropq or whiteout. In case of whiteout, aufs removes ++ it as usual. ++- link(2) and rename(2) systemcalls are exceptions in every policy. ++ They try selecting the branch where the source exists as possible ++ since copyup a large file will take long time. If it can't be, ++ ie. the branch where the source exists is readonly, then they will ++ follow the copyup policy. ++- There is an exception for rename(2) when the target exists. ++ If the rename target exists, aufs compares the index of the branches ++ where the source and the target exists and selects the higher ++ one. If the selected branch is readonly, then aufs follows the ++ copyup policy. +diff -uNr linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/06mmap.txt linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/06mmap.txt +--- linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/06mmap.txt 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/06mmap.txt 2012-01-17 12:11:16.333376797 +0100 +@@ -0,0 +1,47 @@ ++ ++# Copyright (C) 2005-2011 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. 
++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write to the Free Software ++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ ++mmap(2) -- File Memory Mapping ++---------------------------------------------------------------------- ++In aufs, the file-mapped pages are handled by a branch fs directly, no ++interaction with aufs. It means aufs_mmap() calls the branch fs's ++->mmap(). ++This approach is simple and good, but there is one problem. ++Under /proc, several entries show the mmap-ped files by its path (with ++device and inode number), and the printed path will be the path on the ++branch fs's instead of virtual aufs's. ++This is not a problem in most cases, but some utilities lsof(1) (and its ++user) may expect the path on aufs. ++ ++To address this issue, aufs adds a new member called vm_prfile in struct ++vm_area_struct (and struct vm_region). The original vm_file points to ++the file on the branch fs in order to handle everything correctly as ++usual. The new vm_prfile points to a virtual file in aufs, and the ++show-functions in procfs refers to vm_prfile if it is set. ++Also we need to maintain several other places where touching vm_file ++such like ++- fork()/clone() copies vma and the reference count of vm_file is ++ incremented. ++- merging vma maintains the ref count too. ++ ++This is not a good approach. It just faking the printed path. But it ++leaves all behaviour around f_mapping unchanged. This is surely an ++advantage. ++Actually aufs had adopted another complicated approach which calls ++generic_file_mmap() and handles struct vm_operations_struct. In this ++approach, aufs met a hard problem and I could not solve it without ++switching the approach. +diff -uNr linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/07export.txt linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/07export.txt +--- linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/07export.txt 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/07export.txt 2012-01-17 12:11:16.338006469 +0100 +@@ -0,0 +1,59 @@ ++ ++# Copyright (C) 2005-2011 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write to the Free Software ++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ ++Export Aufs via NFS ++---------------------------------------------------------------------- ++Here is an approach. ++- like xino/xib, add a new file 'xigen' which stores aufs inode ++ generation. ++- iget_locked(): initialize aufs inode generation for a new inode, and ++ store it in xigen file. 
++- destroy_inode(): increment aufs inode generation and store it in xigen ++ file. it is necessary even if it is not unlinked, because any data of ++ inode may be changed by UDBA. ++- encode_fh(): for a root dir, simply return FILEID_ROOT. otherwise ++ build file handle by ++ + branch id (4 bytes) ++ + superblock generation (4 bytes) ++ + inode number (4 or 8 bytes) ++ + parent dir inode number (4 or 8 bytes) ++ + inode generation (4 bytes)) ++ + return value of exportfs_encode_fh() for the parent on a branch (4 ++ bytes) ++ + file handle for a branch (by exportfs_encode_fh()) ++- fh_to_dentry(): ++ + find the index of a branch from its id in handle, and check it is ++ still exist in aufs. ++ + 1st level: get the inode number from handle and search it in cache. ++ + 2nd level: if not found, get the parent inode number from handle and ++ search it in cache. and then open the parent dir, find the matching ++ inode number by vfs_readdir() and get its name, and call ++ lookup_one_len() for the target dentry. ++ + 3rd level: if the parent dir is not cached, call ++ exportfs_decode_fh() for a branch and get the parent on a branch, ++ build a pathname of it, convert it a pathname in aufs, call ++ path_lookup(). now aufs gets a parent dir dentry, then handle it as ++ the 2nd level. ++ + to open the dir, aufs needs struct vfsmount. aufs keeps vfsmount ++ for every branch, but not itself. to get this, (currently) aufs ++ searches in current->nsproxy->mnt_ns list. it may not be a good ++ idea, but I didn't get other approach. ++ + test the generation of the gotten inode. ++- every inode operation: they may get EBUSY due to UDBA. in this case, ++ convert it into ESTALE for NFSD. ++- readdir(): call lockdep_on/off() because filldir in NFSD calls ++ lookup_one_len(), vfs_getattr(), encode_fh() and others. +diff -uNr linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/08shwh.txt linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/08shwh.txt +--- linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/08shwh.txt 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/08shwh.txt 2012-01-17 12:11:16.338006469 +0100 +@@ -0,0 +1,53 @@ ++ ++# Copyright (C) 2005-2011 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write to the Free Software ++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ ++Show Whiteout Mode (shwh) ++---------------------------------------------------------------------- ++Generally aufs hides the name of whiteouts. But in some cases, to show ++them is very useful for users. For instance, creating a new middle layer ++(branch) by merging existing layers. 
++ ++(borrowing aufs1 HOW-TO from a user, Michael Towers) ++When you have three branches, ++- Bottom: 'system', squashfs (underlying base system), read-only ++- Middle: 'mods', squashfs, read-only ++- Top: 'overlay', ram (tmpfs), read-write ++ ++The top layer is loaded at boot time and saved at shutdown, to preserve ++the changes made to the system during the session. ++When larger changes have been made, or smaller changes have accumulated, ++the size of the saved top layer data grows. At this point, it would be ++nice to be able to merge the two overlay branches ('mods' and 'overlay') ++and rewrite the 'mods' squashfs, clearing the top layer and thus ++restoring save and load speed. ++ ++This merging is simplified by the use of another aufs mount, of just the ++two overlay branches using the 'shwh' option. ++# mount -t aufs -o ro,shwh,br:/livesys/overlay=ro+wh:/livesys/mods=rr+wh \ ++ aufs /livesys/merge_union ++ ++A merged view of these two branches is then available at ++/livesys/merge_union, and the new feature is that the whiteouts are ++visible! ++Note that in 'shwh' mode the aufs mount must be 'ro', which will disable ++writing to all branches. Also the default mode for all branches is 'ro'. ++It is now possible to save the combined contents of the two overlay ++branches to a new squashfs, e.g.: ++# mksquashfs /livesys/merge_union /path/to/newmods.squash ++ ++This new squashfs archive can be stored on the boot device and the ++initramfs will use it to replace the old one at the next boot. +diff -uNr linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/10dynop.txt linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/10dynop.txt +--- linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/10dynop.txt 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/10dynop.txt 2012-01-17 12:11:16.340321305 +0100 +@@ -0,0 +1,47 @@ ++ ++# Copyright (C) 2010-2011 Junjiro R. Okajima ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write to the Free Software ++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ ++Dynamically customizable FS operations ++---------------------------------------------------------------------- ++Generally FS operations (struct inode_operations, struct ++address_space_operations, struct file_operations, etc.) are defined as ++"static const", but it never means that FS have only one set of ++operation. Some FS have multiple sets of them. For instance, ext2 has ++three sets, one for XIP, for NOBH, and for normal. ++Since aufs overrides and redirects these operations, sometimes aufs has ++to change its behaviour according to the branch FS type. More imporantly ++VFS acts differently if a function (member in the struct) is set or ++not. It means aufs should have several sets of operations and select one ++among them according to the branch FS definition. 
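A condensed sketch of that selective assignment, which the following paragraphs describe in prose (hypothetical wrapper names, kernel-style; not the actual aufs implementation):

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/slab.h>

/* Build a file_operations set at run time, copying a member only when the
 * branch filesystem provides it, so e.g. io_submit(2) still returns an error
 * when the branch has no aio_read. */
static struct file_operations *dyop_build(const struct file_operations *h_fop)
{
	struct file_operations *op;

	op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op)
		return NULL;

	op->owner  = THIS_MODULE;
	op->llseek = my_llseek;		/* always-present wrappers (hypothetical) */
	op->read   = my_read;
	op->write  = my_write;
	if (h_fop->aio_read)		/* optional, branch-dependent members */
		op->aio_read = my_aio_read;
	if (h_fop->aio_write)
		op->aio_write = my_aio_write;
	if (h_fop->mmap)
		op->mmap = my_mmap;

	return op;			/* freed when the owning branch goes away */
}

The my_* wrappers stand in for aufs's own redirecting operations; the point is only that a member stays NULL when the branch cannot support it, so VFS behaviour is unchanged.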
++
++In order to solve this problem and not to affect the behaviour of VFS,
++aufs defines these operations dynamically. For instance, aufs defines an
++aio_read function for struct file_operations, but it may not be set in
++the file_operations. When the branch FS doesn't have it, aufs doesn't
++set it in its file_operations while the function definition itself is
++still alive. So the behaviour of io_submit(2) will not change, and it
++will return an error when aio_read is not defined.
++
++The lifetime of these dynamically generated operation objects is
++maintained by the aufs branch object. When the branch is removed from
++aufs, the reference counter of the object is decremented. When it
++reaches zero, the dynamically generated operation object will be freed.
++
++This approach is designed mainly to support AIO (io_submit), Direct I/O
++and XIP.
++Currently this approach is applied to file_operations and
++vm_operations_struct for regular files only.
+diff -uNr linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/99plan.txt linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/99plan.txt
+--- linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/design/99plan.txt 1970-01-01 01:00:00.000000000 +0100
++++ linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/design/99plan.txt 2012-01-17 12:11:16.356525154 +0100
+@@ -0,0 +1,96 @@
++
++# Copyright (C) 2005-2011 Junjiro R. Okajima
++#
++# This program is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 2 of the License, or
++# (at your option) any later version.
++#
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# GNU General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with this program; if not, write to the Free Software
++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
++
++Plan
++
++Restoring some features which were implemented in aufs1.
++They were dropped in aufs2 in order to make the source files simpler and
++easier to review.
++
++
++Test Only the Highest One for the Directory Permission (dirperm1 option)
++----------------------------------------------------------------------
++Let's try a case study.
++- aufs has two branches, upper readwrite and lower readonly.
++  /au = /rw + /ro
++- "dirA" exists under /ro, but not under /rw, and its mode is 0700.
++- user invoked "chmod a+rx /au/dirA"
++- then, does "dirA" become world readable?
++
++In this case, /ro/dirA is still 0700 since it exists in the readonly
++branch, or it may be a natively readonly filesystem. If aufs respects
++the lower branch, it should not respond to readdir requests from other
++users. But the user allowed it by chmod. Should aufs really reject
++showing the entries under /ro/dirA?
++
++To be honest, I don't have a perfect solution for this case. So I
++implemented the 'dirperm1' and 'nodirperm1' options in aufs1, and left
++the choice to users.
++When dirperm1 is specified, aufs checks only the highest (topmost)
++branch for the directory permission, and shows the entries. Otherwise,
++as usual, it checks every dir existing on all branches and rejects the
++request.
++
++As a side effect, the dirperm1 option improves the performance of aufs
++because the number of permission checks is reduced.
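++
++A minimal sketch of the idea in C (hypothetical code, not part of
++aufs3; it only assumes the au_dbstart()/au_h_dptr() helpers used in the
++aufs sources and the kernel's inode_permission()):
++
++	/* dirperm1: ask only the topmost branch about a dir's permission */
++	static int dirperm1_check(struct dentry *dentry, int mask)
++	{
++		struct dentry *h_dentry;
++
++		/* the highest (topmost) branch which has this dentry */
++		h_dentry = au_h_dptr(dentry, au_dbstart(dentry));
++		return inode_permission(h_dentry->d_inode, mask);
++	}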
++
++
++Being Another Aufs's Readonly Branch (robr)
++----------------------------------------------------------------------
++Aufs1 allows aufs to be another aufs's readonly branch.
++This feature was developed at a user's request. But it may not be used
++currently.
++
++
++Copy-up on Open (coo=)
++----------------------------------------------------------------------
++By default the internal copy-up is executed when it is really necessary.
++It is not done when a file is opened for writing, but when write(2) is
++done. Users who have many (over 100) branches want to know and analyse
++when and which file is copied up. Inserting a new upper branch which
++contains only such files may improve the performance of aufs.
++
++Aufs1 implemented the "coo=none | leaf | all" option.
++
++
++Refresh the Opened File (refrof)
++----------------------------------------------------------------------
++This option was implemented in aufs1 but was incomplete.
++
++When a user reads from a file, he generally expects to get its latest
++filedata. If the file is removed and a new file with the same name is
++created, the content he gets is unchanged, i.e. the unlinked filedata.
++
++Let's try a case study again.
++- aufs has two branches.
++  /au = /rw + /ro
++- "fileA" exists under /ro, but not under /rw.
++- user opened "/au/fileA".
++- he or someone else inserts a branch (/new) between /rw and /ro.
++  /au = /rw + /new + /ro
++- the new branch has "fileA".
++- user reads from the opened "fileA"
++- which filedata should aufs return, from /ro or /new?
++
++Some people say it has to be "from /ro" and that this is the semantics
++of Unix. Others say it should be "from /new" because the file is not
++removed and it is equivalent to the case where someone else modifies
++the file.
++
++Here again I don't have a final answer. My idea is to implement the
++'refrof' and 'norefrof' options. When 'refrof' (REFResh the Opened File)
++is specified (by default), aufs returns the filedata from /new.
++Otherwise, from /ro.
+diff -uNr linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/README linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/README
+--- linux-3.2.0-gentoo-r1.orig//Documentation/filesystems/aufs/README 1970-01-01 01:00:00.000000000 +0100
++++ linux-3.2.0-gentoo-r1/Documentation/filesystems/aufs/README 2012-01-17 12:11:16.229209192 +0100
+@@ -0,0 +1,328 @@
++
++Aufs3 -- advanced multi layered unification filesystem version 3.x
++http://aufs.sf.net
++Junjiro R. Okajima
++
++
++0. Introduction
++----------------------------------------
++In the early days, aufs was an entire re-design and re-implementation
++of the Unionfs Version 1.x series. After many original ideas, approaches,
++improvements and implementations, it became totally different from
++Unionfs while keeping the basic features.
++Recently, the Unionfs Version 2.x series began taking some of the same
++approaches as aufs1.
++Unionfs is being developed by Professor Erez Zadok at Stony Brook
++University and his team.
++
++Aufs3 supports linux-3.0 and later.
++If you want older kernel version support, try the aufs2-2.6.git or
++aufs2-standalone.git repository, or aufs1 from CVS on SourceForge.
++
++Note: it has become clear that "Aufs was rejected. Let's give it up."
++According to Christoph Hellwig, linux rejects all union-type filesystems
++but UnionMount.
++
++
++
++1. Features
++----------------------------------------
++- unite several directories into a single virtual filesystem. The member
++  directory is called a branch.
++- you can specify the permission flags to the branch, which are 'readonly', ++ 'readwrite' and 'whiteout-able.' ++- by upper writable branch, internal copyup and whiteout, files/dirs on ++ readonly branch are modifiable logically. ++- dynamic branch manipulation, add, del. ++- etc... ++ ++Also there are many enhancements in aufs1, such as: ++- readdir(3) in userspace. ++- keep inode number by external inode number table ++- keep the timestamps of file/dir in internal copyup operation ++- seekable directory, supporting NFS readdir. ++- whiteout is hardlinked in order to reduce the consumption of inodes ++ on branch ++- do not copyup, nor create a whiteout when it is unnecessary ++- revert a single systemcall when an error occurs in aufs ++- remount interface instead of ioctl ++- maintain /etc/mtab by an external command, /sbin/mount.aufs. ++- loopback mounted filesystem as a branch ++- kernel thread for removing the dir who has a plenty of whiteouts ++- support copyup sparse file (a file which has a 'hole' in it) ++- default permission flags for branches ++- selectable permission flags for ro branch, whether whiteout can ++ exist or not ++- export via NFS. ++- support /fs/aufs and /aufs. ++- support multiple writable branches, some policies to select one ++ among multiple writable branches. ++- a new semantics for link(2) and rename(2) to support multiple ++ writable branches. ++- no glibc changes are required. ++- pseudo hardlink (hardlink over branches) ++- allow a direct access manually to a file on branch, e.g. bypassing aufs. ++ including NFS or remote filesystem branch. ++- userspace wrapper for pathconf(3)/fpathconf(3) with _PC_LINK_MAX. ++- and more... ++ ++Currently these features are dropped temporary from aufs3. ++See design/08plan.txt in detail. ++- test only the highest one for the directory permission (dirperm1) ++- copyup on open (coo=) ++- nested mount, i.e. aufs as readonly no-whiteout branch of another aufs ++ (robr) ++- statistics of aufs thread (/sys/fs/aufs/stat) ++- delegation mode (dlgt) ++ a delegation of the internal branch access to support task I/O ++ accounting, which also supports Linux Security Modules (LSM) mainly ++ for Suse AppArmor. ++- intent.open/create (file open in a single lookup) ++ ++Features or just an idea in the future (see also design/*.txt), ++- reorder the branch index without del/re-add. ++- permanent xino files for NFSD ++- an option for refreshing the opened files after add/del branches ++- 'move' policy for copy-up between two writable branches, after ++ checking free space. ++- light version, without branch manipulation. (unnecessary?) ++- copyup in userspace ++- inotify in userspace ++- readv/writev ++- xattr, acl ++ ++ ++2. Download ++---------------------------------------- ++There were three GIT trees for aufs3, aufs3-linux.git, ++aufs3-standalone.git, and aufs-util.git. Note that there is no "3" in ++"aufs-util.git." ++While the aufs-util is always necessary, you need either of aufs3-linux ++or aufs3-standalone. ++ ++The aufs3-linux tree includes the whole linux mainline GIT tree, ++git://git.kernel.org/.../torvalds/linux.git. ++And you cannot select CONFIG_AUFS_FS=m for this version, eg. you cannot ++build aufs3 as an externel kernel module. ++ ++On the other hand, the aufs3-standalone tree has only aufs source files ++and necessary patches, and you can select CONFIG_AUFS_FS=m. ++ ++You will find GIT branches whose name is in form of "aufs3.x" where "x" ++represents the linux kernel version, "linux-3.x". 
For instance, ++"aufs3.0" is for linux-3.0. For latest "linux-3.x-rcN", use ++"aufs3.x-rcN" branch. ++ ++o aufs3-linux tree ++$ git clone --reference /your/linux/git/tree \ ++ git://aufs.git.sourceforge.net/gitroot/aufs/aufs3-linux.git \ ++ aufs3-linux.git ++- if you don't have linux GIT tree, then remove "--reference ..." ++$ cd aufs3-linux.git ++$ git checkout origin/aufs3.0 ++ ++o aufs3-standalone tree ++$ git clone git://aufs.git.sourceforge.net/gitroot/aufs/aufs3-standalone.git \ ++ aufs3-standalone.git ++$ cd aufs3-standalone.git ++$ git checkout origin/aufs3.0 ++ ++o aufs-util tree ++$ git clone git://aufs.git.sourceforge.net/gitroot/aufs/aufs-util.git \ ++ aufs-util.git ++$ cd aufs-util.git ++$ git checkout origin/aufs3.0 ++ ++Note: The 3.x-rcN branch is to be used with `rc' kernel versions ONLY. ++The minor version number, 'x' in '3.x', of aufs may not always ++follow the minor version number of the kernel. ++Because changes in the kernel that cause the use of a new ++minor version number do not always require changes to aufs-util. ++ ++Since aufs-util has its own minor version number, you may not be ++able to find a GIT branch in aufs-util for your kernel's ++exact minor version number. ++In this case, you should git-checkout the branch for the ++nearest lower number. ++ ++For (an unreleased) example: ++If you are using "linux-3.10" and the "aufs3.10" branch ++does not exit in aufs-util repository, then "aufs3.9", "aufs3.8" ++or something numerically smaller is the branch for your kernel. ++ ++Also you can view all branches by ++ $ git branch -a ++ ++ ++3. Configuration and Compilation ++---------------------------------------- ++Make sure you have git-checkout'ed the correct branch. ++ ++For aufs3-linux tree, ++- enable CONFIG_EXPERIMENTAL and CONFIG_AUFS_FS. ++- set other aufs configurations if necessary. ++ ++For aufs3-standalone tree, ++There are several ways to build. ++ ++1. ++- apply ./aufs3-kbuild.patch to your kernel source files. ++- apply ./aufs3-base.patch too. ++- apply ./aufs3-proc_map.patch too, if you want to make /proc/PID/maps (and ++ others including lsof(1)) show the file path on aufs instead of the ++ path on the branch fs. ++- apply ./aufs3-standalone.patch too, if you have a plan to set ++ CONFIG_AUFS_FS=m. otherwise you don't need ./aufs3-standalone.patch. ++- copy ./{Documentation,fs,include/linux/aufs_type.h} files to your ++ kernel source tree. Never copy ./include/linux/Kbuild. ++- enable CONFIG_EXPERIMENTAL and CONFIG_AUFS_FS, you can select either ++ =m or =y. ++- and build your kernel as usual. ++- install the built kernel. ++- install the header files too by "make headers_install". ++- and reboot your system. ++ ++2. ++- module only (CONFIG_AUFS_FS=m). ++- apply ./aufs3-base.patch to your kernel source files. ++- apply ./aufs3-proc_map.patch too to your kernel source files, ++ if you want to make /proc/PID/maps (and others including lsof(1)) show ++ the file path on aufs instead of the path on the branch fs. ++- apply ./aufs3-standalone.patch too. ++- build your kernel, don't forget "make headers_install", and reboot. ++- edit ./config.mk and set other aufs configurations if necessary. ++ Note: You should read ./fs/aufs/Kconfig carefully which describes ++ every aufs configurations. ++- build the module by simple "make". ++- you can specify ${KDIR} make variable which points to your kernel ++ source tree. ++- install the files ++ + run "make install" to install the aufs module, or copy the built ++ ./aufs.ko to /lib/modules/... 
and run depmod -a (or simply reboot).
++  + run "make headers_install" to install the aufs header file (you can
++    specify DESTDIR), or copy ./usr/include/linux/aufs_type.h to
++    /usr/include/linux or wherever you like.
++- no need to apply aufs3-kbuild.patch, nor to copy source files into
++  your kernel source tree.
++
++Note: The header file aufs_type.h is necessary to build aufs-util
++      as well as "make headers_install" in the kernel source tree.
++      headers_install tends to be forgotten, but it is essentially
++      necessary, not only for building aufs-util.
++      You may not hit problems without headers_install in some older
++      versions, though.
++
++And then,
++- read README in aufs-util, build and install it
++- note that your distribution may contain an obsolete version of
++  aufs_type.h in /usr/include/linux or something. When you build the
++  aufs utilities, make sure that your compiler refers to the correct
++  aufs header file which is built by "make headers_install."
++- if you want to use readdir(3) in userspace or the pathconf(3) wrapper,
++  then run "make install_ulib" too. And refer to the aufs manual for
++  details.
++
++
++4. Usage
++----------------------------------------
++First, make sure aufs-util is installed, and please read the aufs
++manual, aufs.5 in the aufs-util.git tree.
++$ man -l aufs.5
++
++And then,
++$ mkdir /tmp/rw /tmp/aufs
++# mount -t aufs -o br=/tmp/rw:${HOME} none /tmp/aufs
++
++Here is another example. The result is equivalent.
++# mount -t aufs -o br=/tmp/rw=rw:${HOME}=ro none /tmp/aufs
++  Or
++# mount -t aufs -o br:/tmp/rw none /tmp/aufs
++# mount -o remount,append:${HOME} /tmp/aufs
++
++Then, you can see the whole tree of your home dir through /tmp/aufs. If
++you modify a file under /tmp/aufs, the one in your home directory is
++not affected; instead a file with the same name will be newly created
++under /tmp/rw. And all of your modifications to that file will be
++applied to the one under /tmp/rw. This is called the file-based Copy on
++Write (COW) method.
++Aufs mount options are described in aufs.5.
++If you run chroot or something and make your aufs the root directory,
++then you need to customize the shutdown script. See the aufs manual for
++details.
++
++Additionally, there are some sample usages of aufs, such as a
++diskless system with network booting, and a LiveCD over NFS.
++See the sample dir in the CVS tree on SourceForge.
++
++
++5. Contact
++----------------------------------------
++When you have any problems or strange behaviour in aufs, please let me
++know with:
++- /proc/mounts (instead of the output of mount(8))
++- /sys/module/aufs/*
++- /sys/fs/aufs/* (if you have them)
++- /debug/aufs/* (if you have them)
++- linux kernel version
++  if your kernel is not plain, for example modified by a distributor,
++  the url where I can download its source is necessary too.
++- the aufs version which was printed when loading the module or booting
++  the system, instead of the date you downloaded it.
++- configuration (define/undefine CONFIG_AUFS_xxx)
++- kernel configuration or /proc/config.gz (if you have it)
++- the behaviour which you think is incorrect
++- the actual operation; a reproducible one is better
++- mailto: aufs-users at lists.sourceforge.net
++
++Usually, I don't watch the Public Areas (Bugs, Support Requests, Patches,
++and Feature Requests) on SourceForge. Please join and write to the
++aufs-users ML.
++
++
++6. Acknowledgements
++----------------------------------------
++Thanks to everyone who has tried and is using aufs, and to whoever has
++reported a bug or given any feedback.
++ ++Especially donators: ++Tomas Matejicek(slax.org) made a donation (much more than once). ++ Since Apr 2010, Tomas M (the author of Slax and Linux Live ++ scripts) is making "doubling" donations. ++ Unfortunately I cannot list all of the donators, but I really ++ appriciate. ++ It ends Aug 2010, but the ordinary donation URL is still available. ++ ++Dai Itasaka made a donation (2007/8). ++Chuck Smith made a donation (2008/4, 10 and 12). ++Henk Schoneveld made a donation (2008/9). ++Chih-Wei Huang, ASUS, CTC donated Eee PC 4G (2008/10). ++Francois Dupoux made a donation (2008/11). ++Bruno Cesar Ribas and Luis Carlos Erpen de Bona, C3SL serves public ++ aufs2 GIT tree (2009/2). ++William Grant made a donation (2009/3). ++Patrick Lane made a donation (2009/4). ++The Mail Archive (mail-archive.com) made donations (2009/5). ++Nippy Networks (Ed Wildgoose) made a donation (2009/7). ++New Dream Network, LLC (www.dreamhost.com) made a donation (2009/11). ++Pavel Pronskiy made a donation (2011/2). ++Iridium and Inmarsat satellite phone retailer (www.mailasail.com), Nippy ++ Networks (Ed Wildgoose) made a donation for hardware (2011/3). ++Max Lekomcev (DOM-TV project) made a donation (2011/7 and 12). ++Sam Liddicott made a donation (2011/9). ++ ++Thank you very much. ++Donations are always, including future donations, very important and ++helpful for me to keep on developing aufs. ++ ++ ++7. ++---------------------------------------- ++If you are an experienced user, no explanation is needed. Aufs is ++just a linux filesystem. ++ ++ ++Enjoy! ++ ++# Local variables: ; ++# mode: text; ++# End: ; +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/aufs.h linux-3.2.0-gentoo-r1/fs/aufs/aufs.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/aufs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/aufs.h 2012-01-17 12:11:24.486228052 +0100 +@@ -0,0 +1,60 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * all header files ++ */ ++ ++#ifndef __AUFS_H__ ++#define __AUFS_H__ ++ ++#ifdef __KERNEL__ ++ ++#define AuStub(type, name, body, ...) \ ++ static inline type name(__VA_ARGS__) { body; } ++ ++#define AuStubVoid(name, ...) \ ++ AuStub(void, name, , __VA_ARGS__) ++#define AuStubInt0(name, ...) 
\ ++ AuStub(int, name, return 0, __VA_ARGS__) ++ ++#include "debug.h" ++ ++#include "branch.h" ++#include "cpup.h" ++#include "dcsub.h" ++#include "dbgaufs.h" ++#include "dentry.h" ++#include "dir.h" ++#include "dynop.h" ++#include "file.h" ++#include "fstype.h" ++#include "inode.h" ++#include "loop.h" ++#include "module.h" ++#include "opts.h" ++#include "rwsem.h" ++#include "spl.h" ++#include "super.h" ++#include "sysaufs.h" ++#include "vfsub.h" ++#include "whout.h" ++#include "wkq.h" ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/branch.c linux-3.2.0-gentoo-r1/fs/aufs/branch.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/branch.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/branch.c 2012-01-17 12:11:24.486228052 +0100 +@@ -0,0 +1,1169 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * branch management ++ */ ++ ++#include ++#include ++#include "aufs.h" ++ ++/* ++ * free a single branch ++ */ ++static void au_br_do_free(struct au_branch *br) ++{ ++ int i; ++ struct au_wbr *wbr; ++ struct au_dykey **key; ++ ++ au_hnotify_fin_br(br); ++ ++ if (br->br_xino.xi_file) ++ fput(br->br_xino.xi_file); ++ mutex_destroy(&br->br_xino.xi_nondir_mtx); ++ ++ AuDebugOn(atomic_read(&br->br_count)); ++ ++ wbr = br->br_wbr; ++ if (wbr) { ++ for (i = 0; i < AuBrWh_Last; i++) ++ dput(wbr->wbr_wh[i]); ++ AuDebugOn(atomic_read(&wbr->wbr_wh_running)); ++ AuRwDestroy(&wbr->wbr_wh_rwsem); ++ } ++ ++ key = br->br_dykey; ++ for (i = 0; i < AuBrDynOp; i++, key++) ++ if (*key) ++ au_dy_put(*key); ++ else ++ break; ++ ++ mntput(br->br_mnt); ++ kfree(wbr); ++ kfree(br); ++} ++ ++/* ++ * frees all branches ++ */ ++void au_br_free(struct au_sbinfo *sbinfo) ++{ ++ aufs_bindex_t bmax; ++ struct au_branch **br; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ bmax = sbinfo->si_bend + 1; ++ br = sbinfo->si_branch; ++ while (bmax--) ++ au_br_do_free(*br++); ++} ++ ++/* ++ * find the index of a branch which is specified by @br_id. ++ */ ++int au_br_index(struct super_block *sb, aufs_bindex_t br_id) ++{ ++ aufs_bindex_t bindex, bend; ++ ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) ++ if (au_sbr_id(sb, bindex) == br_id) ++ return bindex; ++ return -1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * add a branch ++ */ ++ ++static int test_overlap(struct super_block *sb, struct dentry *h_adding, ++ struct dentry *h_root) ++{ ++ if (unlikely(h_adding == h_root ++ || au_test_loopback_overlap(sb, h_adding))) ++ return 1; ++ if (h_adding->d_sb != h_root->d_sb) ++ return 0; ++ return au_test_subdir(h_adding, h_root) ++ || au_test_subdir(h_root, h_adding); ++} ++ ++/* ++ * returns a newly allocated branch. 
@new_nbranch is a number of branches ++ * after adding a branch. ++ */ ++static struct au_branch *au_br_alloc(struct super_block *sb, int new_nbranch, ++ int perm) ++{ ++ struct au_branch *add_branch; ++ struct dentry *root; ++ int err; ++ ++ err = -ENOMEM; ++ root = sb->s_root; ++ add_branch = kmalloc(sizeof(*add_branch), GFP_NOFS); ++ if (unlikely(!add_branch)) ++ goto out; ++ ++ err = au_hnotify_init_br(add_branch, perm); ++ if (unlikely(err)) ++ goto out_br; ++ ++ add_branch->br_wbr = NULL; ++ if (au_br_writable(perm)) { ++ /* may be freed separately at changing the branch permission */ ++ add_branch->br_wbr = kmalloc(sizeof(*add_branch->br_wbr), ++ GFP_NOFS); ++ if (unlikely(!add_branch->br_wbr)) ++ goto out_hnotify; ++ } ++ ++ err = au_sbr_realloc(au_sbi(sb), new_nbranch); ++ if (!err) ++ err = au_di_realloc(au_di(root), new_nbranch); ++ if (!err) ++ err = au_ii_realloc(au_ii(root->d_inode), new_nbranch); ++ if (!err) ++ return add_branch; /* success */ ++ ++ kfree(add_branch->br_wbr); ++ ++out_hnotify: ++ au_hnotify_fin_br(add_branch); ++out_br: ++ kfree(add_branch); ++out: ++ return ERR_PTR(err); ++} ++ ++/* ++ * test if the branch permission is legal or not. ++ */ ++static int test_br(struct inode *inode, int brperm, char *path) ++{ ++ int err; ++ ++ err = (au_br_writable(brperm) && IS_RDONLY(inode)); ++ if (!err) ++ goto out; ++ ++ err = -EINVAL; ++ pr_err("write permission for readonly mount or inode, %s\n", path); ++ ++out: ++ return err; ++} ++ ++/* ++ * returns: ++ * 0: success, the caller will add it ++ * plus: success, it is already unified, the caller should ignore it ++ * minus: error ++ */ ++static int test_add(struct super_block *sb, struct au_opt_add *add, int remount) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct dentry *root; ++ struct inode *inode, *h_inode; ++ ++ root = sb->s_root; ++ bend = au_sbend(sb); ++ if (unlikely(bend >= 0 ++ && au_find_dbindex(root, add->path.dentry) >= 0)) { ++ err = 1; ++ if (!remount) { ++ err = -EINVAL; ++ pr_err("%s duplicated\n", add->pathname); ++ } ++ goto out; ++ } ++ ++ err = -ENOSPC; /* -E2BIG; */ ++ if (unlikely(AUFS_BRANCH_MAX <= add->bindex ++ || AUFS_BRANCH_MAX - 1 <= bend)) { ++ pr_err("number of branches exceeded %s\n", add->pathname); ++ goto out; ++ } ++ ++ err = -EDOM; ++ if (unlikely(add->bindex < 0 || bend + 1 < add->bindex)) { ++ pr_err("bad index %d\n", add->bindex); ++ goto out; ++ } ++ ++ inode = add->path.dentry->d_inode; ++ err = -ENOENT; ++ if (unlikely(!inode->i_nlink)) { ++ pr_err("no existence %s\n", add->pathname); ++ goto out; ++ } ++ ++ err = -EINVAL; ++ if (unlikely(inode->i_sb == sb)) { ++ pr_err("%s must be outside\n", add->pathname); ++ goto out; ++ } ++ ++ if (unlikely(au_test_fs_unsuppoted(inode->i_sb))) { ++ pr_err("unsupported filesystem, %s (%s)\n", ++ add->pathname, au_sbtype(inode->i_sb)); ++ goto out; ++ } ++ ++ err = test_br(add->path.dentry->d_inode, add->perm, add->pathname); ++ if (unlikely(err)) ++ goto out; ++ ++ if (bend < 0) ++ return 0; /* success */ ++ ++ err = -EINVAL; ++ for (bindex = 0; bindex <= bend; bindex++) ++ if (unlikely(test_overlap(sb, add->path.dentry, ++ au_h_dptr(root, bindex)))) { ++ pr_err("%s is overlapped\n", add->pathname); ++ goto out; ++ } ++ ++ err = 0; ++ if (au_opt_test(au_mntflags(sb), WARN_PERM)) { ++ h_inode = au_h_dptr(root, 0)->d_inode; ++ if ((h_inode->i_mode & S_IALLUGO) != (inode->i_mode & S_IALLUGO) ++ || h_inode->i_uid != inode->i_uid ++ || h_inode->i_gid != inode->i_gid) ++ pr_warning("uid/gid/perm %s %u/%u/0%o, %u/%u/0%o\n", ++ 
add->pathname, ++ inode->i_uid, inode->i_gid, ++ (inode->i_mode & S_IALLUGO), ++ h_inode->i_uid, h_inode->i_gid, ++ (h_inode->i_mode & S_IALLUGO)); ++ } ++ ++out: ++ return err; ++} ++ ++/* ++ * initialize or clean the whiteouts for an adding branch ++ */ ++static int au_br_init_wh(struct super_block *sb, struct au_branch *br, ++ int new_perm, struct dentry *h_root) ++{ ++ int err, old_perm; ++ aufs_bindex_t bindex; ++ struct mutex *h_mtx; ++ struct au_wbr *wbr; ++ struct au_hinode *hdir; ++ ++ wbr = br->br_wbr; ++ old_perm = br->br_perm; ++ br->br_perm = new_perm; ++ hdir = NULL; ++ h_mtx = NULL; ++ bindex = au_br_index(sb, br->br_id); ++ if (0 <= bindex) { ++ hdir = au_hi(sb->s_root->d_inode, bindex); ++ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ } else { ++ h_mtx = &h_root->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_PARENT); ++ } ++ if (!wbr) ++ err = au_wh_init(h_root, br, sb); ++ else { ++ wbr_wh_write_lock(wbr); ++ err = au_wh_init(h_root, br, sb); ++ wbr_wh_write_unlock(wbr); ++ } ++ if (hdir) ++ au_hn_imtx_unlock(hdir); ++ else ++ mutex_unlock(h_mtx); ++ br->br_perm = old_perm; ++ ++ if (!err && wbr && !au_br_writable(new_perm)) { ++ kfree(wbr); ++ br->br_wbr = NULL; ++ } ++ ++ return err; ++} ++ ++static int au_wbr_init(struct au_branch *br, struct super_block *sb, ++ int perm, struct path *path) ++{ ++ int err; ++ struct kstatfs kst; ++ struct au_wbr *wbr; ++ struct dentry *h_dentry; ++ ++ wbr = br->br_wbr; ++ au_rw_init(&wbr->wbr_wh_rwsem); ++ memset(wbr->wbr_wh, 0, sizeof(wbr->wbr_wh)); ++ atomic_set(&wbr->wbr_wh_running, 0); ++ wbr->wbr_bytes = 0; ++ ++ /* ++ * a limit for rmdir/rename a dir ++ * cf. AUFS_MAX_NAMELEN in include/linux/aufs_type.h ++ */ ++ err = vfs_statfs(path, &kst); ++ if (unlikely(err)) ++ goto out; ++ err = -EINVAL; ++ h_dentry = path->dentry; ++ if (kst.f_namelen >= NAME_MAX) ++ err = au_br_init_wh(sb, br, perm, h_dentry); ++ else ++ pr_err("%.*s(%s), unsupported namelen %ld\n", ++ AuDLNPair(h_dentry), au_sbtype(h_dentry->d_sb), ++ kst.f_namelen); ++ ++out: ++ return err; ++} ++ ++/* intialize a new branch */ ++static int au_br_init(struct au_branch *br, struct super_block *sb, ++ struct au_opt_add *add) ++{ ++ int err; ++ ++ err = 0; ++ memset(&br->br_xino, 0, sizeof(br->br_xino)); ++ mutex_init(&br->br_xino.xi_nondir_mtx); ++ br->br_perm = add->perm; ++ br->br_mnt = add->path.mnt; /* set first, mntget() later */ ++ spin_lock_init(&br->br_dykey_lock); ++ memset(br->br_dykey, 0, sizeof(br->br_dykey)); ++ atomic_set(&br->br_count, 0); ++ br->br_xino_upper = AUFS_XINO_TRUNC_INIT; ++ atomic_set(&br->br_xino_running, 0); ++ br->br_id = au_new_br_id(sb); ++ AuDebugOn(br->br_id < 0); ++ ++ if (au_br_writable(add->perm)) { ++ err = au_wbr_init(br, sb, add->perm, &add->path); ++ if (unlikely(err)) ++ goto out_err; ++ } ++ ++ if (au_opt_test(au_mntflags(sb), XINO)) { ++ err = au_xino_br(sb, br, add->path.dentry->d_inode->i_ino, ++ au_sbr(sb, 0)->br_xino.xi_file, /*do_test*/1); ++ if (unlikely(err)) { ++ AuDebugOn(br->br_xino.xi_file); ++ goto out_err; ++ } ++ } ++ ++ sysaufs_br_init(br); ++ mntget(add->path.mnt); ++ goto out; /* success */ ++ ++out_err: ++ br->br_mnt = NULL; ++out: ++ return err; ++} ++ ++static void au_br_do_add_brp(struct au_sbinfo *sbinfo, aufs_bindex_t bindex, ++ struct au_branch *br, aufs_bindex_t bend, ++ aufs_bindex_t amount) ++{ ++ struct au_branch **brp; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ brp = sbinfo->si_branch + bindex; ++ memmove(brp + 1, brp, sizeof(*brp) * amount); ++ *brp = br; ++ sbinfo->si_bend++; ++ 
if (unlikely(bend < 0)) ++ sbinfo->si_bend = 0; ++} ++ ++static void au_br_do_add_hdp(struct au_dinfo *dinfo, aufs_bindex_t bindex, ++ aufs_bindex_t bend, aufs_bindex_t amount) ++{ ++ struct au_hdentry *hdp; ++ ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ hdp = dinfo->di_hdentry + bindex; ++ memmove(hdp + 1, hdp, sizeof(*hdp) * amount); ++ au_h_dentry_init(hdp); ++ dinfo->di_bend++; ++ if (unlikely(bend < 0)) ++ dinfo->di_bstart = 0; ++} ++ ++static void au_br_do_add_hip(struct au_iinfo *iinfo, aufs_bindex_t bindex, ++ aufs_bindex_t bend, aufs_bindex_t amount) ++{ ++ struct au_hinode *hip; ++ ++ AuRwMustWriteLock(&iinfo->ii_rwsem); ++ ++ hip = iinfo->ii_hinode + bindex; ++ memmove(hip + 1, hip, sizeof(*hip) * amount); ++ hip->hi_inode = NULL; ++ au_hn_init(hip); ++ iinfo->ii_bend++; ++ if (unlikely(bend < 0)) ++ iinfo->ii_bstart = 0; ++} ++ ++static void au_br_do_add(struct super_block *sb, struct dentry *h_dentry, ++ struct au_branch *br, aufs_bindex_t bindex) ++{ ++ struct dentry *root; ++ struct inode *root_inode; ++ aufs_bindex_t bend, amount; ++ ++ root = sb->s_root; ++ root_inode = root->d_inode; ++ bend = au_sbend(sb); ++ amount = bend + 1 - bindex; ++ au_sbilist_lock(); ++ au_br_do_add_brp(au_sbi(sb), bindex, br, bend, amount); ++ au_br_do_add_hdp(au_di(root), bindex, bend, amount); ++ au_br_do_add_hip(au_ii(root_inode), bindex, bend, amount); ++ au_set_h_dptr(root, bindex, dget(h_dentry)); ++ au_set_h_iptr(root_inode, bindex, au_igrab(h_dentry->d_inode), ++ /*flags*/0); ++ au_sbilist_unlock(); ++} ++ ++int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount) ++{ ++ int err; ++ aufs_bindex_t bend, add_bindex; ++ struct dentry *root, *h_dentry; ++ struct inode *root_inode; ++ struct au_branch *add_branch; ++ ++ root = sb->s_root; ++ root_inode = root->d_inode; ++ IMustLock(root_inode); ++ err = test_add(sb, add, remount); ++ if (unlikely(err < 0)) ++ goto out; ++ if (err) { ++ err = 0; ++ goto out; /* success */ ++ } ++ ++ bend = au_sbend(sb); ++ add_branch = au_br_alloc(sb, bend + 2, add->perm); ++ err = PTR_ERR(add_branch); ++ if (IS_ERR(add_branch)) ++ goto out; ++ ++ err = au_br_init(add_branch, sb, add); ++ if (unlikely(err)) { ++ au_br_do_free(add_branch); ++ goto out; ++ } ++ ++ add_bindex = add->bindex; ++ h_dentry = add->path.dentry; ++ if (!remount) ++ au_br_do_add(sb, h_dentry, add_branch, add_bindex); ++ else { ++ sysaufs_brs_del(sb, add_bindex); ++ au_br_do_add(sb, h_dentry, add_branch, add_bindex); ++ sysaufs_brs_add(sb, add_bindex); ++ } ++ ++ if (!add_bindex) { ++ au_cpup_attr_all(root_inode, /*force*/1); ++ sb->s_maxbytes = h_dentry->d_sb->s_maxbytes; ++ } else ++ au_add_nlink(root_inode, h_dentry->d_inode); ++ ++ /* ++ * this test/set prevents aufs from handling unnecesary notify events ++ * of xino files, in case of re-adding a writable branch which was ++ * once detached from aufs. ++ */ ++ if (au_xino_brid(sb) < 0 ++ && au_br_writable(add_branch->br_perm) ++ && !au_test_fs_bad_xino(h_dentry->d_sb) ++ && add_branch->br_xino.xi_file ++ && add_branch->br_xino.xi_file->f_dentry->d_parent == h_dentry) ++ au_xino_brid_set(sb, add_branch->br_id); ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * delete a branch ++ */ ++ ++/* to show the line number, do not make it inlined function */ ++#define AuVerbose(do_info, fmt, ...) 
do { \ ++ if (do_info) \ ++ pr_info(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++static int au_test_ibusy(struct inode *inode, aufs_bindex_t bstart, ++ aufs_bindex_t bend) ++{ ++ return (inode && !S_ISDIR(inode->i_mode)) || bstart == bend; ++} ++ ++static int au_test_dbusy(struct dentry *dentry, aufs_bindex_t bstart, ++ aufs_bindex_t bend) ++{ ++ return au_test_ibusy(dentry->d_inode, bstart, bend); ++} ++ ++/* ++ * test if the branch is deletable or not. ++ */ ++static int test_dentry_busy(struct dentry *root, aufs_bindex_t bindex, ++ unsigned int sigen, const unsigned int verbose) ++{ ++ int err, i, j, ndentry; ++ aufs_bindex_t bstart, bend; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry *d; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, root, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ for (i = 0; !err && i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ ndentry = dpage->ndentry; ++ for (j = 0; !err && j < ndentry; j++) { ++ d = dpage->dentries[j]; ++ AuDebugOn(!d->d_count); ++ if (!au_digen_test(d, sigen)) { ++ di_read_lock_child(d, AuLock_IR); ++ if (unlikely(au_dbrange_test(d))) { ++ di_read_unlock(d, AuLock_IR); ++ continue; ++ } ++ } else { ++ di_write_lock_child(d); ++ if (unlikely(au_dbrange_test(d))) { ++ di_write_unlock(d); ++ continue; ++ } ++ err = au_reval_dpath(d, sigen); ++ if (!err) ++ di_downgrade_lock(d, AuLock_IR); ++ else { ++ di_write_unlock(d); ++ break; ++ } ++ } ++ ++ /* AuDbgDentry(d); */ ++ bstart = au_dbstart(d); ++ bend = au_dbend(d); ++ if (bstart <= bindex ++ && bindex <= bend ++ && au_h_dptr(d, bindex) ++ && au_test_dbusy(d, bstart, bend)) { ++ err = -EBUSY; ++ AuVerbose(verbose, "busy %.*s\n", AuDLNPair(d)); ++ AuDbgDentry(d); ++ } ++ di_read_unlock(d, AuLock_IR); ++ } ++ } ++ ++out_dpages: ++ au_dpages_free(&dpages); ++out: ++ return err; ++} ++ ++static int test_inode_busy(struct super_block *sb, aufs_bindex_t bindex, ++ unsigned int sigen, const unsigned int verbose) ++{ ++ int err; ++ unsigned long long max, ull; ++ struct inode *i, **array; ++ aufs_bindex_t bstart, bend; ++ ++ array = au_iarray_alloc(sb, &max); ++ err = PTR_ERR(array); ++ if (IS_ERR(array)) ++ goto out; ++ ++ err = 0; ++ AuDbg("b%d\n", bindex); ++ for (ull = 0; !err && ull < max; ull++) { ++ i = array[ull]; ++ if (i->i_ino == AUFS_ROOT_INO) ++ continue; ++ ++ /* AuDbgInode(i); */ ++ if (au_iigen(i) == sigen) ++ ii_read_lock_child(i); ++ else { ++ ii_write_lock_child(i); ++ err = au_refresh_hinode_self(i); ++ au_iigen_dec(i); ++ if (!err) ++ ii_downgrade_lock(i); ++ else { ++ ii_write_unlock(i); ++ break; ++ } ++ } ++ ++ bstart = au_ibstart(i); ++ bend = au_ibend(i); ++ if (bstart <= bindex ++ && bindex <= bend ++ && au_h_iptr(i, bindex) ++ && au_test_ibusy(i, bstart, bend)) { ++ err = -EBUSY; ++ AuVerbose(verbose, "busy i%lu\n", i->i_ino); ++ AuDbgInode(i); ++ } ++ ii_read_unlock(i); ++ } ++ au_iarray_free(array, max); ++ ++out: ++ return err; ++} ++ ++static int test_children_busy(struct dentry *root, aufs_bindex_t bindex, ++ const unsigned int verbose) ++{ ++ int err; ++ unsigned int sigen; ++ ++ sigen = au_sigen(root->d_sb); ++ DiMustNoWaiters(root); ++ IiMustNoWaiters(root->d_inode); ++ di_write_unlock(root); ++ err = test_dentry_busy(root, bindex, sigen, verbose); ++ if (!err) ++ err = test_inode_busy(root->d_sb, bindex, sigen, verbose); ++ di_write_lock_child(root); /* aufs_write_lock() calls ..._child() */ ++ ++ return err; ++} ++ ++static void 
au_br_do_del_brp(struct au_sbinfo *sbinfo, ++ const aufs_bindex_t bindex, ++ const aufs_bindex_t bend) ++{ ++ struct au_branch **brp, **p; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ brp = sbinfo->si_branch + bindex; ++ if (bindex < bend) ++ memmove(brp, brp + 1, sizeof(*brp) * (bend - bindex)); ++ sbinfo->si_branch[0 + bend] = NULL; ++ sbinfo->si_bend--; ++ ++ p = krealloc(sbinfo->si_branch, sizeof(*p) * bend, AuGFP_SBILIST); ++ if (p) ++ sbinfo->si_branch = p; ++ /* harmless error */ ++} ++ ++static void au_br_do_del_hdp(struct au_dinfo *dinfo, const aufs_bindex_t bindex, ++ const aufs_bindex_t bend) ++{ ++ struct au_hdentry *hdp, *p; ++ ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ hdp = dinfo->di_hdentry; ++ if (bindex < bend) ++ memmove(hdp + bindex, hdp + bindex + 1, ++ sizeof(*hdp) * (bend - bindex)); ++ hdp[0 + bend].hd_dentry = NULL; ++ dinfo->di_bend--; ++ ++ p = krealloc(hdp, sizeof(*p) * bend, AuGFP_SBILIST); ++ if (p) ++ dinfo->di_hdentry = p; ++ /* harmless error */ ++} ++ ++static void au_br_do_del_hip(struct au_iinfo *iinfo, const aufs_bindex_t bindex, ++ const aufs_bindex_t bend) ++{ ++ struct au_hinode *hip, *p; ++ ++ AuRwMustWriteLock(&iinfo->ii_rwsem); ++ ++ hip = iinfo->ii_hinode + bindex; ++ if (bindex < bend) ++ memmove(hip, hip + 1, sizeof(*hip) * (bend - bindex)); ++ iinfo->ii_hinode[0 + bend].hi_inode = NULL; ++ au_hn_init(iinfo->ii_hinode + bend); ++ iinfo->ii_bend--; ++ ++ p = krealloc(iinfo->ii_hinode, sizeof(*p) * bend, AuGFP_SBILIST); ++ if (p) ++ iinfo->ii_hinode = p; ++ /* harmless error */ ++} ++ ++static void au_br_do_del(struct super_block *sb, aufs_bindex_t bindex, ++ struct au_branch *br) ++{ ++ aufs_bindex_t bend; ++ struct au_sbinfo *sbinfo; ++ struct dentry *root, *h_root; ++ struct inode *inode, *h_inode; ++ struct au_hinode *hinode; ++ ++ SiMustWriteLock(sb); ++ ++ root = sb->s_root; ++ inode = root->d_inode; ++ sbinfo = au_sbi(sb); ++ bend = sbinfo->si_bend; ++ ++ h_root = au_h_dptr(root, bindex); ++ hinode = au_hi(inode, bindex); ++ h_inode = au_igrab(hinode->hi_inode); ++ au_hiput(hinode); ++ ++ au_sbilist_lock(); ++ au_br_do_del_brp(sbinfo, bindex, bend); ++ au_br_do_del_hdp(au_di(root), bindex, bend); ++ au_br_do_del_hip(au_ii(inode), bindex, bend); ++ au_sbilist_unlock(); ++ ++ dput(h_root); ++ iput(h_inode); ++ au_br_do_free(br); ++} ++ ++int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount) ++{ ++ int err, rerr, i; ++ unsigned int mnt_flags; ++ aufs_bindex_t bindex, bend, br_id; ++ unsigned char do_wh, verbose; ++ struct au_branch *br; ++ struct au_wbr *wbr; ++ ++ err = 0; ++ bindex = au_find_dbindex(sb->s_root, del->h_path.dentry); ++ if (bindex < 0) { ++ if (remount) ++ goto out; /* success */ ++ err = -ENOENT; ++ pr_err("%s no such branch\n", del->pathname); ++ goto out; ++ } ++ AuDbg("bindex b%d\n", bindex); ++ ++ err = -EBUSY; ++ mnt_flags = au_mntflags(sb); ++ verbose = !!au_opt_test(mnt_flags, VERBOSE); ++ bend = au_sbend(sb); ++ if (unlikely(!bend)) { ++ AuVerbose(verbose, "no more branches left\n"); ++ goto out; ++ } ++ br = au_sbr(sb, bindex); ++ i = atomic_read(&br->br_count); ++ if (unlikely(i)) { ++ AuVerbose(verbose, "%d file(s) opened\n", i); ++ goto out; ++ } ++ ++ wbr = br->br_wbr; ++ do_wh = wbr && (wbr->wbr_whbase || wbr->wbr_plink || wbr->wbr_orph); ++ if (do_wh) { ++ /* instead of WbrWhMustWriteLock(wbr) */ ++ SiMustWriteLock(sb); ++ for (i = 0; i < AuBrWh_Last; i++) { ++ dput(wbr->wbr_wh[i]); ++ wbr->wbr_wh[i] = NULL; ++ } ++ } ++ ++ err = test_children_busy(sb->s_root, bindex, verbose); ++ 
if (unlikely(err)) { ++ if (do_wh) ++ goto out_wh; ++ goto out; ++ } ++ ++ err = 0; ++ br_id = br->br_id; ++ if (!remount) ++ au_br_do_del(sb, bindex, br); ++ else { ++ sysaufs_brs_del(sb, bindex); ++ au_br_do_del(sb, bindex, br); ++ sysaufs_brs_add(sb, bindex); ++ } ++ ++ if (!bindex) { ++ au_cpup_attr_all(sb->s_root->d_inode, /*force*/1); ++ sb->s_maxbytes = au_sbr_sb(sb, 0)->s_maxbytes; ++ } else ++ au_sub_nlink(sb->s_root->d_inode, del->h_path.dentry->d_inode); ++ if (au_opt_test(mnt_flags, PLINK)) ++ au_plink_half_refresh(sb, br_id); ++ ++ if (au_xino_brid(sb) == br_id) ++ au_xino_brid_set(sb, -1); ++ goto out; /* success */ ++ ++out_wh: ++ /* revert */ ++ rerr = au_br_init_wh(sb, br, br->br_perm, del->h_path.dentry); ++ if (rerr) ++ pr_warning("failed re-creating base whiteout, %s. (%d)\n", ++ del->pathname, rerr); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_ibusy(struct super_block *sb, struct aufs_ibusy __user *arg) ++{ ++ int err; ++ aufs_bindex_t bstart, bend; ++ struct aufs_ibusy ibusy; ++ struct inode *inode, *h_inode; ++ ++ err = -EPERM; ++ if (unlikely(!capable(CAP_SYS_ADMIN))) ++ goto out; ++ ++ err = copy_from_user(&ibusy, arg, sizeof(ibusy)); ++ if (!err) ++ err = !access_ok(VERIFY_WRITE, &arg->h_ino, sizeof(arg->h_ino)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ ++ err = -EINVAL; ++ si_read_lock(sb, AuLock_FLUSH); ++ if (unlikely(ibusy.bindex < 0 || ibusy.bindex > au_sbend(sb))) ++ goto out_unlock; ++ ++ err = 0; ++ ibusy.h_ino = 0; /* invalid */ ++ inode = ilookup(sb, ibusy.ino); ++ if (!inode ++ || inode->i_ino == AUFS_ROOT_INO ++ || is_bad_inode(inode)) ++ goto out_unlock; ++ ++ ii_read_lock_child(inode); ++ bstart = au_ibstart(inode); ++ bend = au_ibend(inode); ++ if (bstart <= ibusy.bindex && ibusy.bindex <= bend) { ++ h_inode = au_h_iptr(inode, ibusy.bindex); ++ if (h_inode && au_test_ibusy(inode, bstart, bend)) ++ ibusy.h_ino = h_inode->i_ino; ++ } ++ ii_read_unlock(inode); ++ iput(inode); ++ ++out_unlock: ++ si_read_unlock(sb); ++ if (!err) { ++ err = __put_user(ibusy.h_ino, &arg->h_ino); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ } ++ } ++out: ++ return err; ++} ++ ++long au_ibusy_ioctl(struct file *file, unsigned long arg) ++{ ++ return au_ibusy(file->f_dentry->d_sb, (void __user *)arg); ++} ++ ++#ifdef CONFIG_COMPAT ++long au_ibusy_compat_ioctl(struct file *file, unsigned long arg) ++{ ++ return au_ibusy(file->f_dentry->d_sb, compat_ptr(arg)); ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * change a branch permission ++ */ ++ ++static void au_warn_ima(void) ++{ ++#ifdef CONFIG_IMA ++ /* since it doesn't support mark_files_ro() */ ++ AuWarn1("RW -> RO makes IMA to produce wrong message\n"); ++#endif ++} ++ ++static int do_need_sigen_inc(int a, int b) ++{ ++ return au_br_whable(a) && !au_br_whable(b); ++} ++ ++static int need_sigen_inc(int old, int new) ++{ ++ return do_need_sigen_inc(old, new) ++ || do_need_sigen_inc(new, old); ++} ++ ++static unsigned long long au_farray_cb(void *a, ++ unsigned long long max __maybe_unused, ++ void *arg) ++{ ++ unsigned long long n; ++ struct file **p, *f; ++ struct super_block *sb = arg; ++ ++ n = 0; ++ p = a; ++ lg_global_lock(files_lglock); ++ do_file_list_for_each_entry(sb, f) { ++ if (au_fi(f) ++ && file_count(f) ++ && !special_file(f->f_dentry->d_inode->i_mode)) { ++ get_file(f); ++ *p++ = f; ++ n++; ++ AuDebugOn(n > 
max); ++ } ++ } while_file_list_for_each_entry; ++ lg_global_unlock(files_lglock); ++ ++ return n; ++} ++ ++static struct file **au_farray_alloc(struct super_block *sb, ++ unsigned long long *max) ++{ ++ *max = atomic_long_read(&au_sbi(sb)->si_nfiles); ++ return au_array_alloc(max, au_farray_cb, sb); ++} ++ ++static void au_farray_free(struct file **a, unsigned long long max) ++{ ++ unsigned long long ull; ++ ++ for (ull = 0; ull < max; ull++) ++ if (a[ull]) ++ fput(a[ull]); ++ au_array_free(a); ++} ++ ++static int au_br_mod_files_ro(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ int err, do_warn; ++ unsigned int mnt_flags; ++ unsigned long long ull, max; ++ aufs_bindex_t br_id; ++ unsigned char verbose; ++ struct file *file, *hf, **array; ++ struct inode *inode; ++ struct au_hfile *hfile; ++ ++ mnt_flags = au_mntflags(sb); ++ verbose = !!au_opt_test(mnt_flags, VERBOSE); ++ ++ array = au_farray_alloc(sb, &max); ++ err = PTR_ERR(array); ++ if (IS_ERR(array)) ++ goto out; ++ ++ do_warn = 0; ++ br_id = au_sbr_id(sb, bindex); ++ for (ull = 0; ull < max; ull++) { ++ file = array[ull]; ++ ++ /* AuDbg("%.*s\n", AuDLNPair(file->f_dentry)); */ ++ fi_read_lock(file); ++ if (unlikely(au_test_mmapped(file))) { ++ err = -EBUSY; ++ AuVerbose(verbose, "mmapped %.*s\n", ++ AuDLNPair(file->f_dentry)); ++ AuDbgFile(file); ++ FiMustNoWaiters(file); ++ fi_read_unlock(file); ++ goto out_array; ++ } ++ ++ inode = file->f_dentry->d_inode; ++ hfile = &au_fi(file)->fi_htop; ++ hf = hfile->hf_file; ++ if (!S_ISREG(inode->i_mode) ++ || !(file->f_mode & FMODE_WRITE) ++ || hfile->hf_br->br_id != br_id ++ || !(hf->f_mode & FMODE_WRITE)) ++ array[ull] = NULL; ++ else { ++ do_warn = 1; ++ get_file(file); ++ } ++ ++ FiMustNoWaiters(file); ++ fi_read_unlock(file); ++ fput(file); ++ } ++ ++ err = 0; ++ if (do_warn) ++ au_warn_ima(); ++ ++ for (ull = 0; ull < max; ull++) { ++ file = array[ull]; ++ if (!file) ++ continue; ++ ++ /* todo: already flushed? */ ++ /* cf. 
fs/super.c:mark_files_ro() */ ++ /* fi_read_lock(file); */ ++ hfile = &au_fi(file)->fi_htop; ++ hf = hfile->hf_file; ++ /* fi_read_unlock(file); */ ++ spin_lock(&hf->f_lock); ++ hf->f_mode &= ~FMODE_WRITE; ++ spin_unlock(&hf->f_lock); ++ if (!file_check_writeable(hf)) { ++ file_release_write(hf); ++ mnt_drop_write(hf->f_vfsmnt); ++ } ++ } ++ ++out_array: ++ au_farray_free(array, max); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, ++ int *do_refresh) ++{ ++ int err, rerr; ++ aufs_bindex_t bindex; ++ struct path path; ++ struct dentry *root; ++ struct au_branch *br; ++ ++ root = sb->s_root; ++ bindex = au_find_dbindex(root, mod->h_root); ++ if (bindex < 0) { ++ if (remount) ++ return 0; /* success */ ++ err = -ENOENT; ++ pr_err("%s no such branch\n", mod->path); ++ goto out; ++ } ++ AuDbg("bindex b%d\n", bindex); ++ ++ err = test_br(mod->h_root->d_inode, mod->perm, mod->path); ++ if (unlikely(err)) ++ goto out; ++ ++ br = au_sbr(sb, bindex); ++ if (br->br_perm == mod->perm) ++ return 0; /* success */ ++ ++ if (au_br_writable(br->br_perm)) { ++ /* remove whiteout base */ ++ err = au_br_init_wh(sb, br, mod->perm, mod->h_root); ++ if (unlikely(err)) ++ goto out; ++ ++ if (!au_br_writable(mod->perm)) { ++ /* rw --> ro, file might be mmapped */ ++ DiMustNoWaiters(root); ++ IiMustNoWaiters(root->d_inode); ++ di_write_unlock(root); ++ err = au_br_mod_files_ro(sb, bindex); ++ /* aufs_write_lock() calls ..._child() */ ++ di_write_lock_child(root); ++ ++ if (unlikely(err)) { ++ rerr = -ENOMEM; ++ br->br_wbr = kmalloc(sizeof(*br->br_wbr), ++ GFP_NOFS); ++ if (br->br_wbr) { ++ path.mnt = br->br_mnt; ++ path.dentry = mod->h_root; ++ rerr = au_wbr_init(br, sb, br->br_perm, ++ &path); ++ } ++ if (unlikely(rerr)) { ++ AuIOErr("nested error %d (%d)\n", ++ rerr, err); ++ br->br_perm = mod->perm; ++ } ++ } ++ } ++ } else if (au_br_writable(mod->perm)) { ++ /* ro --> rw */ ++ err = -ENOMEM; ++ br->br_wbr = kmalloc(sizeof(*br->br_wbr), GFP_NOFS); ++ if (br->br_wbr) { ++ path.mnt = br->br_mnt; ++ path.dentry = mod->h_root; ++ err = au_wbr_init(br, sb, mod->perm, &path); ++ if (unlikely(err)) { ++ kfree(br->br_wbr); ++ br->br_wbr = NULL; ++ } ++ } ++ } ++ ++ if (!err) { ++ *do_refresh |= need_sigen_inc(br->br_perm, mod->perm); ++ br->br_perm = mod->perm; ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/branch.h linux-3.2.0-gentoo-r1/fs/aufs/branch.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/branch.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/branch.h 2012-01-17 12:11:24.488542888 +0100 +@@ -0,0 +1,230 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * branch filesystems and xino for them ++ */ ++ ++#ifndef __AUFS_BRANCH_H__ ++#define __AUFS_BRANCH_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include "dynop.h" ++#include "rwsem.h" ++#include "super.h" ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* a xino file */ ++struct au_xino_file { ++ struct file *xi_file; ++ struct mutex xi_nondir_mtx; ++ ++ /* todo: make xino files an array to support huge inode number */ ++ ++#ifdef CONFIG_DEBUG_FS ++ struct dentry *xi_dbgaufs; ++#endif ++}; ++ ++/* members for writable branch only */ ++enum {AuBrWh_BASE, AuBrWh_PLINK, AuBrWh_ORPH, AuBrWh_Last}; ++struct au_wbr { ++ struct au_rwsem wbr_wh_rwsem; ++ struct dentry *wbr_wh[AuBrWh_Last]; ++ atomic_t wbr_wh_running; ++#define wbr_whbase wbr_wh[AuBrWh_BASE] /* whiteout base */ ++#define wbr_plink wbr_wh[AuBrWh_PLINK] /* pseudo-link dir */ ++#define wbr_orph wbr_wh[AuBrWh_ORPH] /* dir for orphans */ ++ ++ /* mfs mode */ ++ unsigned long long wbr_bytes; ++}; ++ ++/* ext2 has 3 types of operations at least, ext3 has 4 */ ++#define AuBrDynOp (AuDyLast * 4) ++ ++/* protected by superblock rwsem */ ++struct au_branch { ++ struct au_xino_file br_xino; ++ ++ aufs_bindex_t br_id; ++ ++ int br_perm; ++ struct vfsmount *br_mnt; ++ spinlock_t br_dykey_lock; ++ struct au_dykey *br_dykey[AuBrDynOp]; ++ atomic_t br_count; ++ ++ struct au_wbr *br_wbr; ++ ++ /* xino truncation */ ++ blkcnt_t br_xino_upper; /* watermark in blocks */ ++ atomic_t br_xino_running; ++ ++#ifdef CONFIG_AUFS_HFSNOTIFY ++ struct fsnotify_group *br_hfsn_group; ++ struct fsnotify_ops br_hfsn_ops; ++#endif ++ ++#ifdef CONFIG_SYSFS ++ /* an entry under sysfs per mount-point */ ++ char br_name[8]; ++ struct attribute br_attr; ++#endif ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* branch permissions and attributes */ ++#define AuBrPerm_RW 1 /* writable, hardlinkable wh */ ++#define AuBrPerm_RO (1 << 1) /* readonly */ ++#define AuBrPerm_RR (1 << 2) /* natively readonly */ ++#define AuBrPerm_Mask (AuBrPerm_RW | AuBrPerm_RO | AuBrPerm_RR) ++ ++#define AuBrRAttr_WH (1 << 3) /* whiteout-able */ ++ ++#define AuBrWAttr_NoLinkWH (1 << 4) /* un-hardlinkable whiteouts */ ++ ++static inline int au_br_writable(int brperm) ++{ ++ return brperm & AuBrPerm_RW; ++} ++ ++static inline int au_br_whable(int brperm) ++{ ++ return brperm & (AuBrPerm_RW | AuBrRAttr_WH); ++} ++ ++static inline int au_br_wh_linkable(int brperm) ++{ ++ return !(brperm & AuBrWAttr_NoLinkWH); ++} ++ ++static inline int au_br_rdonly(struct au_branch *br) ++{ ++ return ((br->br_mnt->mnt_sb->s_flags & MS_RDONLY) ++ || !au_br_writable(br->br_perm)) ++ ? 
-EROFS : 0; ++} ++ ++static inline int au_br_hnotifyable(int brperm __maybe_unused) ++{ ++#ifdef CONFIG_AUFS_HNOTIFY ++ return !(brperm & AuBrPerm_RR); ++#else ++ return 0; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* branch.c */ ++struct au_sbinfo; ++void au_br_free(struct au_sbinfo *sinfo); ++int au_br_index(struct super_block *sb, aufs_bindex_t br_id); ++struct au_opt_add; ++int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount); ++struct au_opt_del; ++int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount); ++long au_ibusy_ioctl(struct file *file, unsigned long arg); ++#ifdef CONFIG_COMPAT ++long au_ibusy_compat_ioctl(struct file *file, unsigned long arg); ++#endif ++struct au_opt_mod; ++int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, ++ int *do_refresh); ++ ++/* xino.c */ ++static const loff_t au_loff_max = LLONG_MAX; ++ ++int au_xib_trunc(struct super_block *sb); ++ssize_t xino_fread(au_readf_t func, struct file *file, void *buf, size_t size, ++ loff_t *pos); ++ssize_t xino_fwrite(au_writef_t func, struct file *file, void *buf, size_t size, ++ loff_t *pos); ++struct file *au_xino_create2(struct file *base_file, struct file *copy_src); ++struct file *au_xino_create(struct super_block *sb, char *fname, int silent); ++ino_t au_xino_new_ino(struct super_block *sb); ++void au_xino_delete_inode(struct inode *inode, const int unlinked); ++int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t ino); ++int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t *ino); ++int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t hino, ++ struct file *base_file, int do_test); ++int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex); ++ ++struct au_opt_xino; ++int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount); ++void au_xino_clr(struct super_block *sb); ++struct file *au_xino_def(struct super_block *sb); ++int au_xino_path(struct seq_file *seq, struct file *file); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* Superblock to branch */ ++static inline ++aufs_bindex_t au_sbr_id(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr(sb, bindex)->br_id; ++} ++ ++static inline ++struct vfsmount *au_sbr_mnt(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr(sb, bindex)->br_mnt; ++} ++ ++static inline ++struct super_block *au_sbr_sb(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr_mnt(sb, bindex)->mnt_sb; ++} ++ ++static inline void au_sbr_put(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ atomic_dec(&au_sbr(sb, bindex)->br_count); ++} ++ ++static inline int au_sbr_perm(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr(sb, bindex)->br_perm; ++} ++ ++static inline int au_sbr_whable(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_br_whable(au_sbr_perm(sb, bindex)); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * wbr_wh_read_lock, wbr_wh_write_lock ++ * wbr_wh_read_unlock, wbr_wh_write_unlock, wbr_wh_downgrade_lock ++ */ ++AuSimpleRwsemFuncs(wbr_wh, struct au_wbr *wbr, &wbr->wbr_wh_rwsem); ++ ++#define WbrWhMustNoWaiters(wbr) AuRwMustNoWaiters(&wbr->wbr_wh_rwsem) ++#define WbrWhMustAnyLock(wbr) AuRwMustAnyLock(&wbr->wbr_wh_rwsem) ++#define WbrWhMustWriteLock(wbr) AuRwMustWriteLock(&wbr->wbr_wh_rwsem) ++ 
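++/*
++ * A usage sketch of the wbr_wh_* helpers generated by
++ * AuSimpleRwsemFuncs() above, as they appear in au_br_init_wh() in
++ * branch.c:
++ *	wbr_wh_write_lock(wbr);
++ *	err = au_wh_init(h_root, br, sb);
++ *	wbr_wh_write_unlock(wbr);
++ */
++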
++#endif /* __KERNEL__ */ ++#endif /* __AUFS_BRANCH_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/conf.mk linux-3.2.0-gentoo-r1/fs/aufs/conf.mk +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/conf.mk 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/conf.mk 2012-01-17 12:11:24.488542888 +0100 +@@ -0,0 +1,38 @@ ++ ++AuConfStr = CONFIG_AUFS_FS=${CONFIG_AUFS_FS} ++ ++define AuConf ++ifdef ${1} ++AuConfStr += ${1}=${${1}} ++endif ++endef ++ ++AuConfAll = BRANCH_MAX_127 BRANCH_MAX_511 BRANCH_MAX_1023 BRANCH_MAX_32767 \ ++ SBILIST \ ++ HNOTIFY HFSNOTIFY \ ++ EXPORT INO_T_64 \ ++ RDU \ ++ PROC_MAP \ ++ SP_IATTR \ ++ SHWH \ ++ BR_RAMFS \ ++ BR_FUSE POLL \ ++ BR_HFSPLUS \ ++ BDEV_LOOP \ ++ DEBUG MAGIC_SYSRQ ++$(foreach i, ${AuConfAll}, \ ++ $(eval $(call AuConf,CONFIG_AUFS_${i}))) ++ ++AuConfName = ${obj}/conf.str ++${AuConfName}.tmp: FORCE ++ @echo ${AuConfStr} | tr ' ' '\n' | sed -e 's/^/"/' -e 's/$$/\\n"/' > $@ ++${AuConfName}: ${AuConfName}.tmp ++ @diff -q $< $@ > /dev/null 2>&1 || { \ ++ echo ' GEN ' $@; \ ++ cp -p $< $@; \ ++ } ++FORCE: ++clean-files += ${AuConfName} ${AuConfName}.tmp ++${obj}/sysfs.o: ${AuConfName} ++ ++-include ${srctree}/${src}/conf_priv.mk +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/cpup.c linux-3.2.0-gentoo-r1/fs/aufs/cpup.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/cpup.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/cpup.c 2012-01-17 12:11:24.511691245 +0100 +@@ -0,0 +1,1079 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * copy-up functions, see wbr_policy.c for copy-down ++ */ ++ ++#include ++#include ++#include "aufs.h" ++ ++void au_cpup_attr_flags(struct inode *dst, struct inode *src) ++{ ++ const unsigned int mask = S_DEAD | S_SWAPFILE | S_PRIVATE ++ | S_NOATIME | S_NOCMTIME; ++ ++ dst->i_flags |= src->i_flags & ~mask; ++ if (au_test_fs_notime(dst->i_sb)) ++ dst->i_flags |= S_NOATIME | S_NOCMTIME; ++} ++ ++void au_cpup_attr_timesizes(struct inode *inode) ++{ ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ fsstack_copy_attr_times(inode, h_inode); ++ fsstack_copy_inode_size(inode, h_inode); ++} ++ ++void au_cpup_attr_nlink(struct inode *inode, int force) ++{ ++ struct inode *h_inode; ++ struct super_block *sb; ++ aufs_bindex_t bindex, bend; ++ ++ sb = inode->i_sb; ++ bindex = au_ibstart(inode); ++ h_inode = au_h_iptr(inode, bindex); ++ if (!force ++ && !S_ISDIR(h_inode->i_mode) ++ && au_opt_test(au_mntflags(sb), PLINK) ++ && au_plink_test(inode)) ++ return; ++ ++ set_nlink(inode, h_inode->i_nlink); ++ ++ /* ++ * fewer nlink makes find(1) noisy, but larger nlink doesn't. ++ * it may includes whplink directory. 
++ */ ++ if (S_ISDIR(h_inode->i_mode)) { ++ bend = au_ibend(inode); ++ for (bindex++; bindex <= bend; bindex++) { ++ h_inode = au_h_iptr(inode, bindex); ++ if (h_inode) ++ au_add_nlink(inode, h_inode); ++ } ++ } ++} ++ ++void au_cpup_attr_changeable(struct inode *inode) ++{ ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ inode->i_mode = h_inode->i_mode; ++ inode->i_uid = h_inode->i_uid; ++ inode->i_gid = h_inode->i_gid; ++ au_cpup_attr_timesizes(inode); ++ au_cpup_attr_flags(inode, h_inode); ++} ++ ++void au_cpup_igen(struct inode *inode, struct inode *h_inode) ++{ ++ struct au_iinfo *iinfo = au_ii(inode); ++ ++ IiMustWriteLock(inode); ++ ++ iinfo->ii_higen = h_inode->i_generation; ++ iinfo->ii_hsb1 = h_inode->i_sb; ++} ++ ++void au_cpup_attr_all(struct inode *inode, int force) ++{ ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ au_cpup_attr_changeable(inode); ++ if (inode->i_nlink > 0) ++ au_cpup_attr_nlink(inode, force); ++ inode->i_rdev = h_inode->i_rdev; ++ inode->i_blkbits = h_inode->i_blkbits; ++ au_cpup_igen(inode, h_inode); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* Note: dt_dentry and dt_h_dentry are not dget/dput-ed */ ++ ++/* keep the timestamps of the parent dir when cpup */ ++void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, ++ struct path *h_path) ++{ ++ struct inode *h_inode; ++ ++ dt->dt_dentry = dentry; ++ dt->dt_h_path = *h_path; ++ h_inode = h_path->dentry->d_inode; ++ dt->dt_atime = h_inode->i_atime; ++ dt->dt_mtime = h_inode->i_mtime; ++ /* smp_mb(); */ ++} ++ ++void au_dtime_revert(struct au_dtime *dt) ++{ ++ struct iattr attr; ++ int err; ++ ++ attr.ia_atime = dt->dt_atime; ++ attr.ia_mtime = dt->dt_mtime; ++ attr.ia_valid = ATTR_FORCE | ATTR_MTIME | ATTR_MTIME_SET ++ | ATTR_ATIME | ATTR_ATIME_SET; ++ ++ err = vfsub_notify_change(&dt->dt_h_path, &attr); ++ if (unlikely(err)) ++ pr_warning("restoring timestamps failed(%d). ignored\n", err); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static noinline_for_stack ++int cpup_iattr(struct dentry *dst, aufs_bindex_t bindex, struct dentry *h_src) ++{ ++ int err, sbits; ++ struct iattr ia; ++ struct path h_path; ++ struct inode *h_isrc, *h_idst; ++ ++ h_path.dentry = au_h_dptr(dst, bindex); ++ h_idst = h_path.dentry->d_inode; ++ h_path.mnt = au_sbr_mnt(dst->d_sb, bindex); ++ h_isrc = h_src->d_inode; ++ ia.ia_valid = ATTR_FORCE | ATTR_UID | ATTR_GID ++ | ATTR_ATIME | ATTR_MTIME ++ | ATTR_ATIME_SET | ATTR_MTIME_SET; ++ ia.ia_uid = h_isrc->i_uid; ++ ia.ia_gid = h_isrc->i_gid; ++ ia.ia_atime = h_isrc->i_atime; ++ ia.ia_mtime = h_isrc->i_mtime; ++ if (h_idst->i_mode != h_isrc->i_mode ++ && !S_ISLNK(h_idst->i_mode)) { ++ ia.ia_valid |= ATTR_MODE; ++ ia.ia_mode = h_isrc->i_mode; ++ } ++ sbits = !!(h_isrc->i_mode & (S_ISUID | S_ISGID)); ++ au_cpup_attr_flags(h_idst, h_isrc); ++ err = vfsub_notify_change(&h_path, &ia); ++ ++ /* is this nfs only? 
*/ ++ if (!err && sbits && au_test_nfs(h_path.dentry->d_sb)) { ++ ia.ia_valid = ATTR_FORCE | ATTR_MODE; ++ ia.ia_mode = h_isrc->i_mode; ++ err = vfsub_notify_change(&h_path, &ia); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_copy_file(struct file *dst, struct file *src, loff_t len, ++ char *buf, unsigned long blksize) ++{ ++ int err; ++ size_t sz, rbytes, wbytes; ++ unsigned char all_zero; ++ char *p, *zp; ++ struct mutex *h_mtx; ++ /* reduce stack usage */ ++ struct iattr *ia; ++ ++ zp = page_address(ZERO_PAGE(0)); ++ if (unlikely(!zp)) ++ return -ENOMEM; /* possible? */ ++ ++ err = 0; ++ all_zero = 0; ++ while (len) { ++ AuDbg("len %lld\n", len); ++ sz = blksize; ++ if (len < blksize) ++ sz = len; ++ ++ rbytes = 0; ++ /* todo: signal_pending? */ ++ while (!rbytes || err == -EAGAIN || err == -EINTR) { ++ rbytes = vfsub_read_k(src, buf, sz, &src->f_pos); ++ err = rbytes; ++ } ++ if (unlikely(err < 0)) ++ break; ++ ++ all_zero = 0; ++ if (len >= rbytes && rbytes == blksize) ++ all_zero = !memcmp(buf, zp, rbytes); ++ if (!all_zero) { ++ wbytes = rbytes; ++ p = buf; ++ while (wbytes) { ++ size_t b; ++ ++ b = vfsub_write_k(dst, p, wbytes, &dst->f_pos); ++ err = b; ++ /* todo: signal_pending? */ ++ if (unlikely(err == -EAGAIN || err == -EINTR)) ++ continue; ++ if (unlikely(err < 0)) ++ break; ++ wbytes -= b; ++ p += b; ++ } ++ } else { ++ loff_t res; ++ ++ AuLabel(hole); ++ res = vfsub_llseek(dst, rbytes, SEEK_CUR); ++ err = res; ++ if (unlikely(res < 0)) ++ break; ++ } ++ len -= rbytes; ++ err = 0; ++ } ++ ++ /* the last block may be a hole */ ++ if (!err && all_zero) { ++ AuLabel(last hole); ++ ++ err = 1; ++ if (au_test_nfs(dst->f_dentry->d_sb)) { ++ /* nfs requires this step to make last hole */ ++ /* is this only nfs? */ ++ do { ++ /* todo: signal_pending? */ ++ err = vfsub_write_k(dst, "\0", 1, &dst->f_pos); ++ } while (err == -EAGAIN || err == -EINTR); ++ if (err == 1) ++ dst->f_pos--; ++ } ++ ++ if (err == 1) { ++ ia = (void *)buf; ++ ia->ia_size = dst->f_pos; ++ ia->ia_valid = ATTR_SIZE | ATTR_FILE; ++ ia->ia_file = dst; ++ h_mtx = &dst->f_dentry->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD2); ++ err = vfsub_notify_change(&dst->f_path, ia); ++ mutex_unlock(h_mtx); ++ } ++ } ++ ++ return err; ++} ++ ++int au_copy_file(struct file *dst, struct file *src, loff_t len) ++{ ++ int err; ++ unsigned long blksize; ++ unsigned char do_kfree; ++ char *buf; ++ ++ err = -ENOMEM; ++ blksize = dst->f_dentry->d_sb->s_blocksize; ++ if (!blksize || PAGE_SIZE < blksize) ++ blksize = PAGE_SIZE; ++ AuDbg("blksize %lu\n", blksize); ++ do_kfree = (blksize != PAGE_SIZE && blksize >= sizeof(struct iattr *)); ++ if (do_kfree) ++ buf = kmalloc(blksize, GFP_NOFS); ++ else ++ buf = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!buf)) ++ goto out; ++ ++ if (len > (1 << 22)) ++ AuDbg("copying a large file %lld\n", (long long)len); ++ ++ src->f_pos = 0; ++ dst->f_pos = 0; ++ err = au_do_copy_file(dst, src, len, buf, blksize); ++ if (do_kfree) ++ kfree(buf); ++ else ++ free_page((unsigned long)buf); ++ ++out: ++ return err; ++} ++ ++/* ++ * to support a sparse file which is opened with O_APPEND, ++ * we need to close the file. 
++ */ ++static int au_cp_regular(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len) ++{ ++ int err, i; ++ enum { SRC, DST }; ++ struct { ++ aufs_bindex_t bindex; ++ unsigned int flags; ++ struct dentry *dentry; ++ struct file *file; ++ void *label, *label_file; ++ } *f, file[] = { ++ { ++ .bindex = bsrc, ++ .flags = O_RDONLY | O_NOATIME | O_LARGEFILE, ++ .file = NULL, ++ .label = &&out, ++ .label_file = &&out_src ++ }, ++ { ++ .bindex = bdst, ++ .flags = O_WRONLY | O_NOATIME | O_LARGEFILE, ++ .file = NULL, ++ .label = &&out_src, ++ .label_file = &&out_dst ++ } ++ }; ++ struct super_block *sb; ++ ++ /* bsrc branch can be ro/rw. */ ++ sb = dentry->d_sb; ++ f = file; ++ for (i = 0; i < 2; i++, f++) { ++ f->dentry = au_h_dptr(dentry, f->bindex); ++ f->file = au_h_open(dentry, f->bindex, f->flags, /*file*/NULL); ++ err = PTR_ERR(f->file); ++ if (IS_ERR(f->file)) ++ goto *f->label; ++ err = -EINVAL; ++ if (unlikely(!f->file->f_op)) ++ goto *f->label_file; ++ } ++ ++ /* try stopping to update while we copyup */ ++ IMustLock(file[SRC].dentry->d_inode); ++ err = au_copy_file(file[DST].file, file[SRC].file, len); ++ ++out_dst: ++ fput(file[DST].file); ++ au_sbr_put(sb, file[DST].bindex); ++out_src: ++ fput(file[SRC].file); ++ au_sbr_put(sb, file[SRC].bindex); ++out: ++ return err; ++} ++ ++static int au_do_cpup_regular(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, ++ struct inode *h_dir, struct path *h_path) ++{ ++ int err, rerr; ++ loff_t l; ++ ++ err = 0; ++ l = i_size_read(au_h_iptr(dentry->d_inode, bsrc)); ++ if (len == -1 || l < len) ++ len = l; ++ if (len) ++ err = au_cp_regular(dentry, bdst, bsrc, len); ++ if (!err) ++ goto out; /* success */ ++ ++ rerr = vfsub_unlink(h_dir, h_path, /*force*/0); ++ if (rerr) { ++ AuIOErr("failed unlinking cpup-ed %.*s(%d, %d)\n", ++ AuDLNPair(h_path->dentry), err, rerr); ++ err = -EIO; ++ } ++ ++out: ++ return err; ++} ++ ++static int au_do_cpup_symlink(struct path *h_path, struct dentry *h_src, ++ struct inode *h_dir) ++{ ++ int err, symlen; ++ mm_segment_t old_fs; ++ union { ++ char *k; ++ char __user *u; ++ } sym; ++ ++ err = -ENOSYS; ++ if (unlikely(!h_src->d_inode->i_op->readlink)) ++ goto out; ++ ++ err = -ENOMEM; ++ sym.k = __getname_gfp(GFP_NOFS); ++ if (unlikely(!sym.k)) ++ goto out; ++ ++ /* unnecessary to support mmap_sem since symlink is not mmap-able */ ++ old_fs = get_fs(); ++ set_fs(KERNEL_DS); ++ symlen = h_src->d_inode->i_op->readlink(h_src, sym.u, PATH_MAX); ++ err = symlen; ++ set_fs(old_fs); ++ ++ if (symlen > 0) { ++ sym.k[symlen] = 0; ++ err = vfsub_symlink(h_dir, h_path, sym.k); ++ } ++ __putname(sym.k); ++ ++out: ++ return err; ++} ++ ++/* return with the lower dst inode is locked */ ++static noinline_for_stack ++int cpup_entry(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, unsigned int flags, ++ struct dentry *dst_parent) ++{ ++ int err; ++ umode_t mode; ++ unsigned int mnt_flags; ++ unsigned char isdir; ++ const unsigned char do_dt = !!au_ftest_cpup(flags, DTIME); ++ struct au_dtime dt; ++ struct path h_path; ++ struct dentry *h_src, *h_dst, *h_parent; ++ struct inode *h_inode, *h_dir; ++ struct super_block *sb; ++ ++ /* bsrc branch can be ro/rw. 
*/ ++ h_src = au_h_dptr(dentry, bsrc); ++ h_inode = h_src->d_inode; ++ AuDebugOn(h_inode != au_h_iptr(dentry->d_inode, bsrc)); ++ ++ /* try stopping to be referenced while we are creating */ ++ h_dst = au_h_dptr(dentry, bdst); ++ h_parent = h_dst->d_parent; /* dir inode is locked */ ++ h_dir = h_parent->d_inode; ++ IMustLock(h_dir); ++ AuDebugOn(h_parent != h_dst->d_parent); ++ ++ sb = dentry->d_sb; ++ h_path.mnt = au_sbr_mnt(sb, bdst); ++ if (do_dt) { ++ h_path.dentry = h_parent; ++ au_dtime_store(&dt, dst_parent, &h_path); ++ } ++ h_path.dentry = h_dst; ++ ++ isdir = 0; ++ mode = h_inode->i_mode; ++ switch (mode & S_IFMT) { ++ case S_IFREG: ++ /* try stopping to update while we are referencing */ ++ IMustLock(h_inode); ++ err = vfsub_create(h_dir, &h_path, mode | S_IWUSR); ++ if (!err) ++ err = au_do_cpup_regular ++ (dentry, bdst, bsrc, len, ++ au_h_iptr(dst_parent->d_inode, bdst), &h_path); ++ break; ++ case S_IFDIR: ++ isdir = 1; ++ err = vfsub_mkdir(h_dir, &h_path, mode); ++ if (!err) { ++ /* ++ * strange behaviour from the user's view, ++ * particularly the setattr case ++ */ ++ if (au_ibstart(dst_parent->d_inode) == bdst) ++ au_cpup_attr_nlink(dst_parent->d_inode, ++ /*force*/1); ++ au_cpup_attr_nlink(dentry->d_inode, /*force*/1); ++ } ++ break; ++ case S_IFLNK: ++ err = au_do_cpup_symlink(&h_path, h_src, h_dir); ++ break; ++ case S_IFCHR: ++ case S_IFBLK: ++ AuDebugOn(!capable(CAP_MKNOD)); ++ /*FALLTHROUGH*/ ++ case S_IFIFO: ++ case S_IFSOCK: ++ err = vfsub_mknod(h_dir, &h_path, mode, h_inode->i_rdev); ++ break; ++ default: ++ AuIOErr("Unknown inode type 0%o\n", mode); ++ err = -EIO; ++ } ++ ++ mnt_flags = au_mntflags(sb); ++ if (!au_opt_test(mnt_flags, UDBA_NONE) ++ && !isdir ++ && au_opt_test(mnt_flags, XINO) ++ && h_inode->i_nlink == 1 ++ /* todo: unnecessary? */ ++ /* && dentry->d_inode->i_nlink == 1 */ ++ && bdst < bsrc ++ && !au_ftest_cpup(flags, KEEPLINO)) ++ au_xino_write(sb, bsrc, h_inode->i_ino, /*ino*/0); ++ /* ignore this error */ ++ ++ if (do_dt) ++ au_dtime_revert(&dt); ++ return err; ++} ++ ++/* ++ * copyup the @dentry from @bsrc to @bdst. ++ * the caller must set both of the lower dentries. ++ * @len is for truncating; when it is -1, copyup the entire file. ++ * in link/rename cases, @dst_parent may be different from the real one. 
++ */ ++static int au_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, unsigned int flags, ++ struct dentry *dst_parent) ++{ ++ int err, rerr; ++ aufs_bindex_t old_ibstart; ++ unsigned char isdir, plink; ++ struct au_dtime dt; ++ struct path h_path; ++ struct dentry *h_src, *h_dst, *h_parent; ++ struct inode *dst_inode, *h_dir, *inode; ++ struct super_block *sb; ++ ++ AuDebugOn(bsrc <= bdst); ++ ++ sb = dentry->d_sb; ++ h_path.mnt = au_sbr_mnt(sb, bdst); ++ h_dst = au_h_dptr(dentry, bdst); ++ h_parent = h_dst->d_parent; /* dir inode is locked */ ++ h_dir = h_parent->d_inode; ++ IMustLock(h_dir); ++ ++ h_src = au_h_dptr(dentry, bsrc); ++ inode = dentry->d_inode; ++ ++ if (!dst_parent) ++ dst_parent = dget_parent(dentry); ++ else ++ dget(dst_parent); ++ ++ plink = !!au_opt_test(au_mntflags(sb), PLINK); ++ dst_inode = au_h_iptr(inode, bdst); ++ if (dst_inode) { ++ if (unlikely(!plink)) { ++ err = -EIO; ++ AuIOErr("hi%lu(i%lu) exists on b%d " ++ "but plink is disabled\n", ++ dst_inode->i_ino, inode->i_ino, bdst); ++ goto out; ++ } ++ ++ if (dst_inode->i_nlink) { ++ const int do_dt = au_ftest_cpup(flags, DTIME); ++ ++ h_src = au_plink_lkup(inode, bdst); ++ err = PTR_ERR(h_src); ++ if (IS_ERR(h_src)) ++ goto out; ++ if (unlikely(!h_src->d_inode)) { ++ err = -EIO; ++ AuIOErr("i%lu exists on a upper branch " ++ "but not pseudo-linked\n", ++ inode->i_ino); ++ dput(h_src); ++ goto out; ++ } ++ ++ if (do_dt) { ++ h_path.dentry = h_parent; ++ au_dtime_store(&dt, dst_parent, &h_path); ++ } ++ h_path.dentry = h_dst; ++ err = vfsub_link(h_src, h_dir, &h_path); ++ if (do_dt) ++ au_dtime_revert(&dt); ++ dput(h_src); ++ goto out; ++ } else ++ /* todo: cpup_wh_file? */ ++ /* udba work */ ++ au_update_ibrange(inode, /*do_put_zero*/1); ++ } ++ ++ old_ibstart = au_ibstart(inode); ++ err = cpup_entry(dentry, bdst, bsrc, len, flags, dst_parent); ++ if (unlikely(err)) ++ goto out; ++ dst_inode = h_dst->d_inode; ++ mutex_lock_nested(&dst_inode->i_mutex, AuLsc_I_CHILD2); ++ ++ err = cpup_iattr(dentry, bdst, h_src); ++ isdir = S_ISDIR(dst_inode->i_mode); ++ if (!err) { ++ if (bdst < old_ibstart) { ++ if (S_ISREG(inode->i_mode)) { ++ err = au_dy_iaop(inode, bdst, dst_inode); ++ if (unlikely(err)) ++ goto out_rev; ++ } ++ au_set_ibstart(inode, bdst); ++ } ++ au_set_h_iptr(inode, bdst, au_igrab(dst_inode), ++ au_hi_flags(inode, isdir)); ++ mutex_unlock(&dst_inode->i_mutex); ++ if (!isdir ++ && h_src->d_inode->i_nlink > 1 ++ && plink) ++ au_plink_append(inode, bdst, h_dst); ++ goto out; /* success */ ++ } ++ ++ /* revert */ ++out_rev: ++ h_path.dentry = h_parent; ++ mutex_unlock(&dst_inode->i_mutex); ++ au_dtime_store(&dt, dst_parent, &h_path); ++ h_path.dentry = h_dst; ++ if (!isdir) ++ rerr = vfsub_unlink(h_dir, &h_path, /*force*/0); ++ else ++ rerr = vfsub_rmdir(h_dir, &h_path); ++ au_dtime_revert(&dt); ++ if (rerr) { ++ AuIOErr("failed removing broken entry(%d, %d)\n", err, rerr); ++ err = -EIO; ++ } ++ ++out: ++ dput(dst_parent); ++ return err; ++} ++ ++struct au_cpup_single_args { ++ int *errp; ++ struct dentry *dentry; ++ aufs_bindex_t bdst, bsrc; ++ loff_t len; ++ unsigned int flags; ++ struct dentry *dst_parent; ++}; ++ ++static void au_call_cpup_single(void *args) ++{ ++ struct au_cpup_single_args *a = args; ++ *a->errp = au_cpup_single(a->dentry, a->bdst, a->bsrc, a->len, ++ a->flags, a->dst_parent); ++} ++ ++/* ++ * prevent SIGXFSZ in copy-up. ++ * testing CAP_MKNOD is for generic fs, ++ * but CAP_FSETID is for xfs only, currently. 
++ */ ++static int au_cpup_sio_test(struct super_block *sb, umode_t mode) ++{ ++ int do_sio; ++ ++ do_sio = 0; ++ if (!au_wkq_test() ++ && (!au_sbi(sb)->si_plink_maint_pid ++ || au_plink_maint(sb, AuLock_NOPLM))) { ++ switch (mode & S_IFMT) { ++ case S_IFREG: ++ /* no condition about RLIMIT_FSIZE and the file size */ ++ do_sio = 1; ++ break; ++ case S_IFCHR: ++ case S_IFBLK: ++ do_sio = !capable(CAP_MKNOD); ++ break; ++ } ++ if (!do_sio) ++ do_sio = ((mode & (S_ISUID | S_ISGID)) ++ && !capable(CAP_FSETID)); ++ } ++ ++ return do_sio; ++} ++ ++int au_sio_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, unsigned int flags, ++ struct dentry *dst_parent) ++{ ++ int err, wkq_err; ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bsrc); ++ if (!au_cpup_sio_test(dentry->d_sb, h_dentry->d_inode->i_mode)) ++ err = au_cpup_single(dentry, bdst, bsrc, len, flags, ++ dst_parent); ++ else { ++ struct au_cpup_single_args args = { ++ .errp = &err, ++ .dentry = dentry, ++ .bdst = bdst, ++ .bsrc = bsrc, ++ .len = len, ++ .flags = flags, ++ .dst_parent = dst_parent ++ }; ++ wkq_err = au_wkq_wait(au_call_cpup_single, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++ ++/* ++ * copyup the @dentry from the first active lower branch to @bdst, ++ * using au_cpup_single(). ++ */ ++static int au_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ unsigned int flags) ++{ ++ int err; ++ aufs_bindex_t bsrc, bend; ++ ++ bend = au_dbend(dentry); ++ for (bsrc = bdst + 1; bsrc <= bend; bsrc++) ++ if (au_h_dptr(dentry, bsrc)) ++ break; ++ ++ err = au_lkup_neg(dentry, bdst); ++ if (!err) { ++ err = au_cpup_single(dentry, bdst, bsrc, len, flags, NULL); ++ if (!err) ++ return 0; /* success */ ++ ++ /* revert */ ++ au_set_h_dptr(dentry, bdst, NULL); ++ au_set_dbstart(dentry, bsrc); ++ } ++ ++ return err; ++} ++ ++struct au_cpup_simple_args { ++ int *errp; ++ struct dentry *dentry; ++ aufs_bindex_t bdst; ++ loff_t len; ++ unsigned int flags; ++}; ++ ++static void au_call_cpup_simple(void *args) ++{ ++ struct au_cpup_simple_args *a = args; ++ *a->errp = au_cpup_simple(a->dentry, a->bdst, a->len, a->flags); ++} ++ ++int au_sio_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ unsigned int flags) ++{ ++ int err, wkq_err; ++ struct dentry *parent; ++ struct inode *h_dir; ++ ++ parent = dget_parent(dentry); ++ h_dir = au_h_iptr(parent->d_inode, bdst); ++ if (!au_test_h_perm_sio(h_dir, MAY_EXEC | MAY_WRITE) ++ && !au_cpup_sio_test(dentry->d_sb, dentry->d_inode->i_mode)) ++ err = au_cpup_simple(dentry, bdst, len, flags); ++ else { ++ struct au_cpup_simple_args args = { ++ .errp = &err, ++ .dentry = dentry, ++ .bdst = bdst, ++ .len = len, ++ .flags = flags ++ }; ++ wkq_err = au_wkq_wait(au_call_cpup_simple, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * copyup the deleted file for writing. 
++ */ ++static int au_do_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *wh_dentry, struct file *file, ++ loff_t len) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ struct au_dinfo *dinfo; ++ struct dentry *h_d_dst, *h_d_start; ++ struct au_hdentry *hdp; ++ ++ dinfo = au_di(dentry); ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ bstart = dinfo->di_bstart; ++ hdp = dinfo->di_hdentry; ++ h_d_dst = hdp[0 + bdst].hd_dentry; ++ dinfo->di_bstart = bdst; ++ hdp[0 + bdst].hd_dentry = wh_dentry; ++ if (file) { ++ h_d_start = hdp[0 + bstart].hd_dentry; ++ hdp[0 + bstart].hd_dentry = au_hf_top(file)->f_dentry; ++ } ++ err = au_cpup_single(dentry, bdst, bstart, len, !AuCpup_DTIME, ++ /*h_parent*/NULL); ++ if (file) { ++ if (!err) ++ err = au_reopen_nondir(file); ++ hdp[0 + bstart].hd_dentry = h_d_start; ++ } ++ hdp[0 + bdst].hd_dentry = h_d_dst; ++ dinfo->di_bstart = bstart; ++ ++ return err; ++} ++ ++static int au_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ struct file *file) ++{ ++ int err; ++ struct au_dtime dt; ++ struct dentry *parent, *h_parent, *wh_dentry; ++ struct au_branch *br; ++ struct path h_path; ++ ++ br = au_sbr(dentry->d_sb, bdst); ++ parent = dget_parent(dentry); ++ h_parent = au_h_dptr(parent, bdst); ++ wh_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out; ++ ++ h_path.dentry = h_parent; ++ h_path.mnt = br->br_mnt; ++ au_dtime_store(&dt, parent, &h_path); ++ err = au_do_cpup_wh(dentry, bdst, wh_dentry, file, len); ++ if (unlikely(err)) ++ goto out_wh; ++ ++ dget(wh_dentry); ++ h_path.dentry = wh_dentry; ++ if (!S_ISDIR(wh_dentry->d_inode->i_mode)) ++ err = vfsub_unlink(h_parent->d_inode, &h_path, /*force*/0); ++ else ++ err = vfsub_rmdir(h_parent->d_inode, &h_path); ++ if (unlikely(err)) { ++ AuIOErr("failed remove copied-up tmp file %.*s(%d)\n", ++ AuDLNPair(wh_dentry), err); ++ err = -EIO; ++ } ++ au_dtime_revert(&dt); ++ au_set_hi_wh(dentry->d_inode, bdst, wh_dentry); ++ ++out_wh: ++ dput(wh_dentry); ++out: ++ dput(parent); ++ return err; ++} ++ ++struct au_cpup_wh_args { ++ int *errp; ++ struct dentry *dentry; ++ aufs_bindex_t bdst; ++ loff_t len; ++ struct file *file; ++}; ++ ++static void au_call_cpup_wh(void *args) ++{ ++ struct au_cpup_wh_args *a = args; ++ *a->errp = au_cpup_wh(a->dentry, a->bdst, a->len, a->file); ++} ++ ++int au_sio_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ struct file *file) ++{ ++ int err, wkq_err; ++ struct dentry *parent, *h_orph, *h_parent, *h_dentry; ++ struct inode *dir, *h_dir, *h_tmpdir, *h_inode; ++ struct au_wbr *wbr; ++ ++ parent = dget_parent(dentry); ++ dir = parent->d_inode; ++ h_orph = NULL; ++ h_parent = NULL; ++ h_dir = au_igrab(au_h_iptr(dir, bdst)); ++ h_tmpdir = h_dir; ++ if (!h_dir->i_nlink) { ++ wbr = au_sbr(dentry->d_sb, bdst)->br_wbr; ++ h_orph = wbr->wbr_orph; ++ ++ h_parent = dget(au_h_dptr(parent, bdst)); ++ au_set_h_dptr(parent, bdst, dget(h_orph)); ++ h_tmpdir = h_orph->d_inode; ++ au_set_h_iptr(dir, bdst, au_igrab(h_tmpdir), /*flags*/0); ++ ++ /* this temporary unlock is safe */ ++ if (file) ++ h_dentry = au_hf_top(file)->f_dentry; ++ else ++ h_dentry = au_h_dptr(dentry, au_dbstart(dentry)); ++ h_inode = h_dentry->d_inode; ++ IMustLock(h_inode); ++ mutex_unlock(&h_inode->i_mutex); ++ mutex_lock_nested(&h_tmpdir->i_mutex, AuLsc_I_PARENT3); ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ /* todo: au_h_open_pre()? 
*/ ++ } ++ ++ if (!au_test_h_perm_sio(h_tmpdir, MAY_EXEC | MAY_WRITE) ++ && !au_cpup_sio_test(dentry->d_sb, dentry->d_inode->i_mode)) ++ err = au_cpup_wh(dentry, bdst, len, file); ++ else { ++ struct au_cpup_wh_args args = { ++ .errp = &err, ++ .dentry = dentry, ++ .bdst = bdst, ++ .len = len, ++ .file = file ++ }; ++ wkq_err = au_wkq_wait(au_call_cpup_wh, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ if (h_orph) { ++ mutex_unlock(&h_tmpdir->i_mutex); ++ /* todo: au_h_open_post()? */ ++ au_set_h_iptr(dir, bdst, au_igrab(h_dir), /*flags*/0); ++ au_set_h_dptr(parent, bdst, h_parent); ++ } ++ iput(h_dir); ++ dput(parent); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * generic routine for both of copy-up and copy-down. ++ */ ++/* cf. revalidate function in file.c */ ++int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, ++ int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *h_parent, void *arg), ++ void *arg) ++{ ++ int err; ++ struct au_pin pin; ++ struct dentry *d, *parent, *h_parent, *real_parent; ++ ++ err = 0; ++ parent = dget_parent(dentry); ++ if (IS_ROOT(parent)) ++ goto out; ++ ++ au_pin_init(&pin, dentry, bdst, AuLsc_DI_PARENT2, AuLsc_I_PARENT2, ++ au_opt_udba(dentry->d_sb), AuPin_MNT_WRITE); ++ ++ /* do not use au_dpage */ ++ real_parent = parent; ++ while (1) { ++ dput(parent); ++ parent = dget_parent(dentry); ++ h_parent = au_h_dptr(parent, bdst); ++ if (h_parent) ++ goto out; /* success */ ++ ++ /* find top dir which is necessary to cpup */ ++ do { ++ d = parent; ++ dput(parent); ++ parent = dget_parent(d); ++ di_read_lock_parent3(parent, !AuLock_IR); ++ h_parent = au_h_dptr(parent, bdst); ++ di_read_unlock(parent, !AuLock_IR); ++ } while (!h_parent); ++ ++ if (d != real_parent) ++ di_write_lock_child3(d); ++ ++ /* somebody else might create while we were sleeping */ ++ if (!au_h_dptr(d, bdst) || !au_h_dptr(d, bdst)->d_inode) { ++ if (au_h_dptr(d, bdst)) ++ au_update_dbstart(d); ++ ++ au_pin_set_dentry(&pin, d); ++ err = au_do_pin(&pin); ++ if (!err) { ++ err = cp(d, bdst, h_parent, arg); ++ au_unpin(&pin); ++ } ++ } ++ ++ if (d != real_parent) ++ di_write_unlock(d); ++ if (unlikely(err)) ++ break; ++ } ++ ++out: ++ dput(parent); ++ return err; ++} ++ ++static int au_cpup_dir(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *h_parent __maybe_unused , ++ void *arg __maybe_unused) ++{ ++ return au_sio_cpup_simple(dentry, bdst, -1, AuCpup_DTIME); ++} ++ ++int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) ++{ ++ return au_cp_dirs(dentry, bdst, au_cpup_dir, NULL); ++} ++ ++int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) ++{ ++ int err; ++ struct dentry *parent; ++ struct inode *dir; ++ ++ parent = dget_parent(dentry); ++ dir = parent->d_inode; ++ err = 0; ++ if (au_h_iptr(dir, bdst)) ++ goto out; ++ ++ di_read_unlock(parent, AuLock_IR); ++ di_write_lock_parent(parent); ++ /* someone else might change our inode while we were sleeping */ ++ if (!au_h_iptr(dir, bdst)) ++ err = au_cpup_dirs(dentry, bdst); ++ di_downgrade_lock(parent, AuLock_IR); ++ ++out: ++ dput(parent); ++ return err; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/cpup.h linux-3.2.0-gentoo-r1/fs/aufs/cpup.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/cpup.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/cpup.h 2012-01-17 12:11:24.532524766 +0100 +@@ -0,0 +1,81 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * copy-up/down functions ++ */ ++ ++#ifndef __AUFS_CPUP_H__ ++#define __AUFS_CPUP_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++struct inode; ++struct file; ++ ++void au_cpup_attr_flags(struct inode *dst, struct inode *src); ++void au_cpup_attr_timesizes(struct inode *inode); ++void au_cpup_attr_nlink(struct inode *inode, int force); ++void au_cpup_attr_changeable(struct inode *inode); ++void au_cpup_igen(struct inode *inode, struct inode *h_inode); ++void au_cpup_attr_all(struct inode *inode, int force); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* cpup flags */ ++#define AuCpup_DTIME 1 /* do dtime_store/revert */ ++#define AuCpup_KEEPLINO (1 << 1) /* do not clear the lower xino, ++ for link(2) */ ++#define au_ftest_cpup(flags, name) ((flags) & AuCpup_##name) ++#define au_fset_cpup(flags, name) \ ++ do { (flags) |= AuCpup_##name; } while (0) ++#define au_fclr_cpup(flags, name) \ ++ do { (flags) &= ~AuCpup_##name; } while (0) ++ ++int au_copy_file(struct file *dst, struct file *src, loff_t len); ++int au_sio_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, unsigned int flags, ++ struct dentry *dst_parent); ++int au_sio_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ unsigned int flags); ++int au_sio_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ struct file *file); ++ ++int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, ++ int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *h_parent, void *arg), ++ void *arg); ++int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); ++int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* keep timestamps when copyup */ ++struct au_dtime { ++ struct dentry *dt_dentry; ++ struct path dt_h_path; ++ struct timespec dt_atime, dt_mtime; ++}; ++void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, ++ struct path *h_path); ++void au_dtime_revert(struct au_dtime *dt); ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_CPUP_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/dbgaufs.c linux-3.2.0-gentoo-r1/fs/aufs/dbgaufs.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/dbgaufs.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/dbgaufs.c 2012-01-17 12:11:24.532524766 +0100 +@@ -0,0 +1,334 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * debugfs interface ++ */ ++ ++#include ++#include "aufs.h" ++ ++#ifndef CONFIG_SYSFS ++#error DEBUG_FS depends upon SYSFS ++#endif ++ ++static struct dentry *dbgaufs; ++static const mode_t dbgaufs_mode = S_IRUSR | S_IRGRP | S_IROTH; ++ ++/* 20 is max digits length of ulong 64 */ ++struct dbgaufs_arg { ++ int n; ++ char a[20 * 4]; ++}; ++ ++/* ++ * common function for all XINO files ++ */ ++static int dbgaufs_xi_release(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static int dbgaufs_xi_open(struct file *xf, struct file *file, int do_fcnt) ++{ ++ int err; ++ struct kstat st; ++ struct dbgaufs_arg *p; ++ ++ err = -ENOMEM; ++ p = kmalloc(sizeof(*p), GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ ++ err = 0; ++ p->n = 0; ++ file->private_data = p; ++ if (!xf) ++ goto out; ++ ++ err = vfs_getattr(xf->f_vfsmnt, xf->f_dentry, &st); ++ if (!err) { ++ if (do_fcnt) ++ p->n = snprintf ++ (p->a, sizeof(p->a), "%ld, %llux%lu %lld\n", ++ (long)file_count(xf), st.blocks, st.blksize, ++ (long long)st.size); ++ else ++ p->n = snprintf(p->a, sizeof(p->a), "%llux%lu %lld\n", ++ st.blocks, st.blksize, ++ (long long)st.size); ++ AuDebugOn(p->n >= sizeof(p->a)); ++ } else { ++ p->n = snprintf(p->a, sizeof(p->a), "err %d\n", err); ++ err = 0; ++ } ++ ++out: ++ return err; ++ ++} ++ ++static ssize_t dbgaufs_xi_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct dbgaufs_arg *p; ++ ++ p = file->private_data; ++ return simple_read_from_buffer(buf, count, ppos, p->a, p->n); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int dbgaufs_xib_open(struct inode *inode, struct file *file) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ ++ sbinfo = inode->i_private; ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ err = dbgaufs_xi_open(sbinfo->si_xib, file, /*do_fcnt*/0); ++ si_read_unlock(sb); ++ return err; ++} ++ ++static const struct file_operations dbgaufs_xib_fop = { ++ .owner = THIS_MODULE, ++ .open = dbgaufs_xib_open, ++ .release = dbgaufs_xi_release, ++ .read = dbgaufs_xi_read ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define DbgaufsXi_PREFIX "xi" ++ ++static int dbgaufs_xino_open(struct inode *inode, struct file *file) ++{ ++ int err; ++ long l; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ struct file *xf; ++ struct qstr *name; ++ ++ err = -ENOENT; ++ xf = NULL; ++ name = &file->f_dentry->d_name; ++ if (unlikely(name->len < sizeof(DbgaufsXi_PREFIX) ++ || memcmp(name->name, DbgaufsXi_PREFIX, ++ sizeof(DbgaufsXi_PREFIX) - 1))) ++ goto out; ++ err = kstrtol(name->name + sizeof(DbgaufsXi_PREFIX) - 1, 10, &l); ++ if (unlikely(err)) ++ goto out; ++ ++ sbinfo = inode->i_private; ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ if (l <= au_sbend(sb)) { ++ xf = au_sbr(sb, (aufs_bindex_t)l)->br_xino.xi_file; ++ err = dbgaufs_xi_open(xf, file, /*do_fcnt*/1); ++ } else ++ err = -ENOENT; ++ 
si_read_unlock(sb); ++ ++out: ++ return err; ++} ++ ++static const struct file_operations dbgaufs_xino_fop = { ++ .owner = THIS_MODULE, ++ .open = dbgaufs_xino_open, ++ .release = dbgaufs_xi_release, ++ .read = dbgaufs_xi_read ++}; ++ ++void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ aufs_bindex_t bend; ++ struct au_branch *br; ++ struct au_xino_file *xi; ++ ++ if (!au_sbi(sb)->si_dbgaufs) ++ return; ++ ++ bend = au_sbend(sb); ++ for (; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ xi = &br->br_xino; ++ if (xi->xi_dbgaufs) { ++ debugfs_remove(xi->xi_dbgaufs); ++ xi->xi_dbgaufs = NULL; ++ } ++ } ++} ++ ++void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ struct au_sbinfo *sbinfo; ++ struct dentry *parent; ++ struct au_branch *br; ++ struct au_xino_file *xi; ++ aufs_bindex_t bend; ++ char name[sizeof(DbgaufsXi_PREFIX) + 5]; /* "xi" bindex NULL */ ++ ++ sbinfo = au_sbi(sb); ++ parent = sbinfo->si_dbgaufs; ++ if (!parent) ++ return; ++ ++ bend = au_sbend(sb); ++ for (; bindex <= bend; bindex++) { ++ snprintf(name, sizeof(name), DbgaufsXi_PREFIX "%d", bindex); ++ br = au_sbr(sb, bindex); ++ xi = &br->br_xino; ++ AuDebugOn(xi->xi_dbgaufs); ++ xi->xi_dbgaufs = debugfs_create_file(name, dbgaufs_mode, parent, ++ sbinfo, &dbgaufs_xino_fop); ++ /* ignore an error */ ++ if (unlikely(!xi->xi_dbgaufs)) ++ AuWarn1("failed %s under debugfs\n", name); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_EXPORT ++static int dbgaufs_xigen_open(struct inode *inode, struct file *file) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ ++ sbinfo = inode->i_private; ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ err = dbgaufs_xi_open(sbinfo->si_xigen, file, /*do_fcnt*/0); ++ si_read_unlock(sb); ++ return err; ++} ++ ++static const struct file_operations dbgaufs_xigen_fop = { ++ .owner = THIS_MODULE, ++ .open = dbgaufs_xigen_open, ++ .release = dbgaufs_xi_release, ++ .read = dbgaufs_xi_read ++}; ++ ++static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) ++{ ++ int err; ++ ++ /* ++ * This function is a dynamic '__init' function actually, ++ * so the tiny check for si_rwsem is unnecessary. ++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++ ++ err = -EIO; ++ sbinfo->si_dbgaufs_xigen = debugfs_create_file ++ ("xigen", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, ++ &dbgaufs_xigen_fop); ++ if (sbinfo->si_dbgaufs_xigen) ++ err = 0; ++ ++ return err; ++} ++#else ++static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) ++{ ++ return 0; ++} ++#endif /* CONFIG_AUFS_EXPORT */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++void dbgaufs_si_fin(struct au_sbinfo *sbinfo) ++{ ++ /* ++ * This function is a dynamic '__init' function actually, ++ * so the tiny check for si_rwsem is unnecessary. ++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++ ++ debugfs_remove_recursive(sbinfo->si_dbgaufs); ++ sbinfo->si_dbgaufs = NULL; ++ kobject_put(&sbinfo->si_kobj); ++} ++ ++int dbgaufs_si_init(struct au_sbinfo *sbinfo) ++{ ++ int err; ++ char name[SysaufsSiNameLen]; ++ ++ /* ++ * This function is a dynamic '__init' function actually, ++ * so the tiny check for si_rwsem is unnecessary. 
++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++ ++ err = -ENOENT; ++ if (!dbgaufs) { ++ AuErr1("/debug/aufs is uninitialized\n"); ++ goto out; ++ } ++ ++ err = -EIO; ++ sysaufs_name(sbinfo, name); ++ sbinfo->si_dbgaufs = debugfs_create_dir(name, dbgaufs); ++ if (unlikely(!sbinfo->si_dbgaufs)) ++ goto out; ++ kobject_get(&sbinfo->si_kobj); ++ ++ sbinfo->si_dbgaufs_xib = debugfs_create_file ++ ("xib", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, ++ &dbgaufs_xib_fop); ++ if (unlikely(!sbinfo->si_dbgaufs_xib)) ++ goto out_dir; ++ ++ err = dbgaufs_xigen_init(sbinfo); ++ if (!err) ++ goto out; /* success */ ++ ++out_dir: ++ dbgaufs_si_fin(sbinfo); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void dbgaufs_fin(void) ++{ ++ debugfs_remove(dbgaufs); ++} ++ ++int __init dbgaufs_init(void) ++{ ++ int err; ++ ++ err = -EIO; ++ dbgaufs = debugfs_create_dir(AUFS_NAME, NULL); ++ if (dbgaufs) ++ err = 0; ++ return err; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/dbgaufs.h linux-3.2.0-gentoo-r1/fs/aufs/dbgaufs.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/dbgaufs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/dbgaufs.h 2012-01-17 12:11:24.532524766 +0100 +@@ -0,0 +1,49 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * debugfs interface ++ */ ++ ++#ifndef __DBGAUFS_H__ ++#define __DBGAUFS_H__ ++ ++#ifdef __KERNEL__ ++ ++struct super_block; ++struct au_sbinfo; ++ ++#ifdef CONFIG_DEBUG_FS ++/* dbgaufs.c */ ++void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); ++void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); ++void dbgaufs_si_fin(struct au_sbinfo *sbinfo); ++int dbgaufs_si_init(struct au_sbinfo *sbinfo); ++void dbgaufs_fin(void); ++int __init dbgaufs_init(void); ++#else ++AuStubVoid(dbgaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex) ++AuStubVoid(dbgaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex) ++AuStubVoid(dbgaufs_si_fin, struct au_sbinfo *sbinfo) ++AuStubInt0(dbgaufs_si_init, struct au_sbinfo *sbinfo) ++AuStubVoid(dbgaufs_fin, void) ++AuStubInt0(__init dbgaufs_init, void) ++#endif /* CONFIG_DEBUG_FS */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __DBGAUFS_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/dcsub.c linux-3.2.0-gentoo-r1/fs/aufs/dcsub.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/dcsub.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/dcsub.c 2012-01-17 12:11:24.534839602 +0100 +@@ -0,0 +1,243 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sub-routines for dentry cache ++ */ ++ ++#include "aufs.h" ++ ++static void au_dpage_free(struct au_dpage *dpage) ++{ ++ int i; ++ struct dentry **p; ++ ++ p = dpage->dentries; ++ for (i = 0; i < dpage->ndentry; i++) ++ dput(*p++); ++ free_page((unsigned long)dpage->dentries); ++} ++ ++int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp) ++{ ++ int err; ++ void *p; ++ ++ err = -ENOMEM; ++ dpages->dpages = kmalloc(sizeof(*dpages->dpages), gfp); ++ if (unlikely(!dpages->dpages)) ++ goto out; ++ ++ p = (void *)__get_free_page(gfp); ++ if (unlikely(!p)) ++ goto out_dpages; ++ ++ dpages->dpages[0].ndentry = 0; ++ dpages->dpages[0].dentries = p; ++ dpages->ndpage = 1; ++ return 0; /* success */ ++ ++out_dpages: ++ kfree(dpages->dpages); ++out: ++ return err; ++} ++ ++void au_dpages_free(struct au_dcsub_pages *dpages) ++{ ++ int i; ++ struct au_dpage *p; ++ ++ p = dpages->dpages; ++ for (i = 0; i < dpages->ndpage; i++) ++ au_dpage_free(p++); ++ kfree(dpages->dpages); ++} ++ ++static int au_dpages_append(struct au_dcsub_pages *dpages, ++ struct dentry *dentry, gfp_t gfp) ++{ ++ int err, sz; ++ struct au_dpage *dpage; ++ void *p; ++ ++ dpage = dpages->dpages + dpages->ndpage - 1; ++ sz = PAGE_SIZE / sizeof(dentry); ++ if (unlikely(dpage->ndentry >= sz)) { ++ AuLabel(new dpage); ++ err = -ENOMEM; ++ sz = dpages->ndpage * sizeof(*dpages->dpages); ++ p = au_kzrealloc(dpages->dpages, sz, ++ sz + sizeof(*dpages->dpages), gfp); ++ if (unlikely(!p)) ++ goto out; ++ ++ dpages->dpages = p; ++ dpage = dpages->dpages + dpages->ndpage; ++ p = (void *)__get_free_page(gfp); ++ if (unlikely(!p)) ++ goto out; ++ ++ dpage->ndentry = 0; ++ dpage->dentries = p; ++ dpages->ndpage++; ++ } ++ ++ AuDebugOn(!dentry->d_count); ++ dpage->dentries[dpage->ndentry++] = dget_dlock(dentry); ++ return 0; /* success */ ++ ++out: ++ return err; ++} ++ ++int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, ++ au_dpages_test test, void *arg) ++{ ++ int err; ++ struct dentry *this_parent; ++ struct list_head *next; ++ struct super_block *sb = root->d_sb; ++ ++ err = 0; ++ write_seqlock(&rename_lock); ++ this_parent = root; ++ spin_lock(&this_parent->d_lock); ++repeat: ++ next = this_parent->d_subdirs.next; ++resume: ++ if (this_parent->d_sb == sb ++ && !IS_ROOT(this_parent) ++ && au_di(this_parent) ++ && this_parent->d_count ++ && (!test || test(this_parent, arg))) { ++ err = au_dpages_append(dpages, this_parent, GFP_ATOMIC); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ while (next != &this_parent->d_subdirs) { ++ struct list_head *tmp = next; ++ struct dentry *dentry = list_entry(tmp, struct dentry, ++ d_u.d_child); ++ ++ next = tmp->next; ++ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ++ if (dentry->d_count) { ++ if 
(!list_empty(&dentry->d_subdirs)) { ++ spin_unlock(&this_parent->d_lock); ++ spin_release(&dentry->d_lock.dep_map, 1, ++ _RET_IP_); ++ this_parent = dentry; ++ spin_acquire(&this_parent->d_lock.dep_map, 0, 1, ++ _RET_IP_); ++ goto repeat; ++ } ++ if (dentry->d_sb == sb ++ && au_di(dentry) ++ && (!test || test(dentry, arg))) ++ err = au_dpages_append(dpages, dentry, ++ GFP_ATOMIC); ++ } ++ spin_unlock(&dentry->d_lock); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ if (this_parent != root) { ++ struct dentry *tmp; ++ struct dentry *child; ++ ++ tmp = this_parent->d_parent; ++ rcu_read_lock(); ++ spin_unlock(&this_parent->d_lock); ++ child = this_parent; ++ this_parent = tmp; ++ spin_lock(&this_parent->d_lock); ++ rcu_read_unlock(); ++ next = child->d_u.d_child.next; ++ goto resume; ++ } ++ ++out: ++ spin_unlock(&this_parent->d_lock); ++ write_sequnlock(&rename_lock); ++ return err; ++} ++ ++int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, ++ int do_include, au_dpages_test test, void *arg) ++{ ++ int err; ++ ++ err = 0; ++ write_seqlock(&rename_lock); ++ spin_lock(&dentry->d_lock); ++ if (do_include ++ && dentry->d_count ++ && (!test || test(dentry, arg))) ++ err = au_dpages_append(dpages, dentry, GFP_ATOMIC); ++ spin_unlock(&dentry->d_lock); ++ if (unlikely(err)) ++ goto out; ++ ++ /* ++ * vfsmount_lock is unnecessary since this is a traverse in a single ++ * mount ++ */ ++ while (!IS_ROOT(dentry)) { ++ dentry = dentry->d_parent; /* rename_lock is locked */ ++ spin_lock(&dentry->d_lock); ++ if (dentry->d_count ++ && (!test || test(dentry, arg))) ++ err = au_dpages_append(dpages, dentry, GFP_ATOMIC); ++ spin_unlock(&dentry->d_lock); ++ if (unlikely(err)) ++ break; ++ } ++ ++out: ++ write_sequnlock(&rename_lock); ++ return err; ++} ++ ++static inline int au_dcsub_dpages_aufs(struct dentry *dentry, void *arg) ++{ ++ return au_di(dentry) && dentry->d_sb == arg; ++} ++ ++int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages, ++ struct dentry *dentry, int do_include) ++{ ++ return au_dcsub_pages_rev(dpages, dentry, do_include, ++ au_dcsub_dpages_aufs, dentry->d_sb); ++} ++ ++int au_test_subdir(struct dentry *d1, struct dentry *d2) ++{ ++ struct path path[2] = { ++ { ++ .dentry = d1 ++ }, ++ { ++ .dentry = d2 ++ } ++ }; ++ ++ return path_is_under(path + 0, path + 1); ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/dcsub.h linux-3.2.0-gentoo-r1/fs/aufs/dcsub.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/dcsub.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/dcsub.h 2012-01-17 12:11:24.534839602 +0100 +@@ -0,0 +1,94 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sub-routines for dentry cache ++ */ ++ ++#ifndef __AUFS_DCSUB_H__ ++#define __AUFS_DCSUB_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++ ++struct dentry; ++ ++struct au_dpage { ++ int ndentry; ++ struct dentry **dentries; ++}; ++ ++struct au_dcsub_pages { ++ int ndpage; ++ struct au_dpage *dpages; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dcsub.c */ ++int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp); ++void au_dpages_free(struct au_dcsub_pages *dpages); ++typedef int (*au_dpages_test)(struct dentry *dentry, void *arg); ++int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, ++ au_dpages_test test, void *arg); ++int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, ++ int do_include, au_dpages_test test, void *arg); ++int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages, ++ struct dentry *dentry, int do_include); ++int au_test_subdir(struct dentry *d1, struct dentry *d2); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline int au_d_hashed_positive(struct dentry *d) ++{ ++ int err; ++ struct inode *inode = d->d_inode; ++ err = 0; ++ if (unlikely(d_unhashed(d) || !inode || !inode->i_nlink)) ++ err = -ENOENT; ++ return err; ++} ++ ++static inline int au_d_alive(struct dentry *d) ++{ ++ int err; ++ struct inode *inode; ++ err = 0; ++ if (!IS_ROOT(d)) ++ err = au_d_hashed_positive(d); ++ else { ++ inode = d->d_inode; ++ if (unlikely(d_unlinked(d) || !inode || !inode->i_nlink)) ++ err = -ENOENT; ++ } ++ return err; ++} ++ ++static inline int au_alive_dir(struct dentry *d) ++{ ++ int err; ++ err = au_d_alive(d); ++ if (unlikely(err || IS_DEADDIR(d->d_inode))) ++ err = -ENOENT; ++ return err; ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DCSUB_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/debug.c linux-3.2.0-gentoo-r1/fs/aufs/debug.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/debug.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/debug.c 2012-01-17 12:11:24.553358287 +0100 +@@ -0,0 +1,489 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * debug print functions ++ */ ++ ++#include ++#include "aufs.h" ++ ++int aufs_debug; ++MODULE_PARM_DESC(debug, "debug print"); ++module_param_named(debug, aufs_debug, int, S_IRUGO | S_IWUSR | S_IWGRP); ++ ++char *au_plevel = KERN_DEBUG; ++#define dpri(fmt, ...) 
do { \ ++ if ((au_plevel \ ++ && strcmp(au_plevel, KERN_DEBUG)) \ ++ || au_debug_test()) \ ++ printk("%s" fmt, au_plevel, ##__VA_ARGS__); \ ++} while (0) ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_dpri_whlist(struct au_nhash *whlist) ++{ ++ unsigned long ul, n; ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos; ++ ++ n = whlist->nh_num; ++ head = whlist->nh_head; ++ for (ul = 0; ul < n; ul++) { ++ hlist_for_each_entry(tpos, pos, head, wh_hash) ++ dpri("b%d, %.*s, %d\n", ++ tpos->wh_bindex, ++ tpos->wh_str.len, tpos->wh_str.name, ++ tpos->wh_str.len); ++ head++; ++ } ++} ++ ++void au_dpri_vdir(struct au_vdir *vdir) ++{ ++ unsigned long ul; ++ union au_vdir_deblk_p p; ++ unsigned char *o; ++ ++ if (!vdir || IS_ERR(vdir)) { ++ dpri("err %ld\n", PTR_ERR(vdir)); ++ return; ++ } ++ ++ dpri("deblk %u, nblk %lu, deblk %p, last{%lu, %p}, ver %lu\n", ++ vdir->vd_deblk_sz, vdir->vd_nblk, vdir->vd_deblk, ++ vdir->vd_last.ul, vdir->vd_last.p.deblk, vdir->vd_version); ++ for (ul = 0; ul < vdir->vd_nblk; ul++) { ++ p.deblk = vdir->vd_deblk[ul]; ++ o = p.deblk; ++ dpri("[%lu]: %p\n", ul, o); ++ } ++} ++ ++static int do_pri_inode(aufs_bindex_t bindex, struct inode *inode, int hn, ++ struct dentry *wh) ++{ ++ char *n = NULL; ++ int l = 0; ++ ++ if (!inode || IS_ERR(inode)) { ++ dpri("i%d: err %ld\n", bindex, PTR_ERR(inode)); ++ return -1; ++ } ++ ++ /* the type of i_blocks depends upon CONFIG_LSF */ ++ BUILD_BUG_ON(sizeof(inode->i_blocks) != sizeof(unsigned long) ++ && sizeof(inode->i_blocks) != sizeof(u64)); ++ if (wh) { ++ n = (void *)wh->d_name.name; ++ l = wh->d_name.len; ++ } ++ ++ dpri("i%d: %p, i%lu, %s, cnt %d, nl %u, 0%o, sz %llu, blk %llu," ++ " hn %d, ct %lld, np %lu, st 0x%lx, f 0x%x, v %llu, g %x%s%.*s\n", ++ bindex, inode, ++ inode->i_ino, inode->i_sb ? au_sbtype(inode->i_sb) : "??", ++ atomic_read(&inode->i_count), inode->i_nlink, inode->i_mode, ++ i_size_read(inode), (unsigned long long)inode->i_blocks, ++ hn, (long long)timespec_to_ns(&inode->i_ctime) & 0x0ffff, ++ inode->i_mapping ? inode->i_mapping->nrpages : 0, ++ inode->i_state, inode->i_flags, inode->i_version, ++ inode->i_generation, ++ l ? 
", wh " : "", l, n); ++ return 0; ++} ++ ++void au_dpri_inode(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ aufs_bindex_t bindex; ++ int err, hn; ++ ++ err = do_pri_inode(-1, inode, -1, NULL); ++ if (err || !au_test_aufs(inode->i_sb)) ++ return; ++ ++ iinfo = au_ii(inode); ++ if (!iinfo) ++ return; ++ dpri("i-1: bstart %d, bend %d, gen %d\n", ++ iinfo->ii_bstart, iinfo->ii_bend, au_iigen(inode)); ++ if (iinfo->ii_bstart < 0) ++ return; ++ hn = 0; ++ for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; bindex++) { ++ hn = !!au_hn(iinfo->ii_hinode + bindex); ++ do_pri_inode(bindex, iinfo->ii_hinode[0 + bindex].hi_inode, hn, ++ iinfo->ii_hinode[0 + bindex].hi_whdentry); ++ } ++} ++ ++void au_dpri_dalias(struct inode *inode) ++{ ++ struct dentry *d; ++ ++ spin_lock(&inode->i_lock); ++ list_for_each_entry(d, &inode->i_dentry, d_alias) ++ au_dpri_dentry(d); ++ spin_unlock(&inode->i_lock); ++} ++ ++static int do_pri_dentry(aufs_bindex_t bindex, struct dentry *dentry) ++{ ++ struct dentry *wh = NULL; ++ int hn; ++ ++ if (!dentry || IS_ERR(dentry)) { ++ dpri("d%d: err %ld\n", bindex, PTR_ERR(dentry)); ++ return -1; ++ } ++ /* do not call dget_parent() here */ ++ /* note: access d_xxx without d_lock */ ++ dpri("d%d: %.*s?/%.*s, %s, cnt %d, flags 0x%x\n", ++ bindex, ++ AuDLNPair(dentry->d_parent), AuDLNPair(dentry), ++ dentry->d_sb ? au_sbtype(dentry->d_sb) : "??", ++ dentry->d_count, dentry->d_flags); ++ hn = -1; ++ if (bindex >= 0 && dentry->d_inode && au_test_aufs(dentry->d_sb)) { ++ struct au_iinfo *iinfo = au_ii(dentry->d_inode); ++ if (iinfo) { ++ hn = !!au_hn(iinfo->ii_hinode + bindex); ++ wh = iinfo->ii_hinode[0 + bindex].hi_whdentry; ++ } ++ } ++ do_pri_inode(bindex, dentry->d_inode, hn, wh); ++ return 0; ++} ++ ++void au_dpri_dentry(struct dentry *dentry) ++{ ++ struct au_dinfo *dinfo; ++ aufs_bindex_t bindex; ++ int err; ++ struct au_hdentry *hdp; ++ ++ err = do_pri_dentry(-1, dentry); ++ if (err || !au_test_aufs(dentry->d_sb)) ++ return; ++ ++ dinfo = au_di(dentry); ++ if (!dinfo) ++ return; ++ dpri("d-1: bstart %d, bend %d, bwh %d, bdiropq %d, gen %d\n", ++ dinfo->di_bstart, dinfo->di_bend, ++ dinfo->di_bwh, dinfo->di_bdiropq, au_digen(dentry)); ++ if (dinfo->di_bstart < 0) ++ return; ++ hdp = dinfo->di_hdentry; ++ for (bindex = dinfo->di_bstart; bindex <= dinfo->di_bend; bindex++) ++ do_pri_dentry(bindex, hdp[0 + bindex].hd_dentry); ++} ++ ++static int do_pri_file(aufs_bindex_t bindex, struct file *file) ++{ ++ char a[32]; ++ ++ if (!file || IS_ERR(file)) { ++ dpri("f%d: err %ld\n", bindex, PTR_ERR(file)); ++ return -1; ++ } ++ a[0] = 0; ++ if (bindex < 0 ++ && file->f_dentry ++ && au_test_aufs(file->f_dentry->d_sb) ++ && au_fi(file)) ++ snprintf(a, sizeof(a), ", gen %d, mmapped %d", ++ au_figen(file), atomic_read(&au_fi(file)->fi_mmapped)); ++ dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, v %llu, pos %llu%s\n", ++ bindex, file->f_mode, file->f_flags, (long)file_count(file), ++ file->f_version, file->f_pos, a); ++ if (file->f_dentry) ++ do_pri_dentry(bindex, file->f_dentry); ++ return 0; ++} ++ ++void au_dpri_file(struct file *file) ++{ ++ struct au_finfo *finfo; ++ struct au_fidir *fidir; ++ struct au_hfile *hfile; ++ aufs_bindex_t bindex; ++ int err; ++ ++ err = do_pri_file(-1, file); ++ if (err || !file->f_dentry || !au_test_aufs(file->f_dentry->d_sb)) ++ return; ++ ++ finfo = au_fi(file); ++ if (!finfo) ++ return; ++ if (finfo->fi_btop < 0) ++ return; ++ fidir = finfo->fi_hdir; ++ if (!fidir) ++ do_pri_file(finfo->fi_btop, finfo->fi_htop.hf_file); ++ else ++ for (bindex = 
finfo->fi_btop; ++ bindex >= 0 && bindex <= fidir->fd_bbot; ++ bindex++) { ++ hfile = fidir->fd_hfile + bindex; ++ do_pri_file(bindex, hfile ? hfile->hf_file : NULL); ++ } ++} ++ ++static int do_pri_br(aufs_bindex_t bindex, struct au_branch *br) ++{ ++ struct vfsmount *mnt; ++ struct super_block *sb; ++ ++ if (!br || IS_ERR(br)) ++ goto out; ++ mnt = br->br_mnt; ++ if (!mnt || IS_ERR(mnt)) ++ goto out; ++ sb = mnt->mnt_sb; ++ if (!sb || IS_ERR(sb)) ++ goto out; ++ ++ dpri("s%d: {perm 0x%x, id %d, cnt %d, wbr %p}, " ++ "%s, dev 0x%02x%02x, flags 0x%lx, cnt %d, active %d, " ++ "xino %d\n", ++ bindex, br->br_perm, br->br_id, atomic_read(&br->br_count), ++ br->br_wbr, au_sbtype(sb), MAJOR(sb->s_dev), MINOR(sb->s_dev), ++ sb->s_flags, sb->s_count, ++ atomic_read(&sb->s_active), !!br->br_xino.xi_file); ++ return 0; ++ ++out: ++ dpri("s%d: err %ld\n", bindex, PTR_ERR(br)); ++ return -1; ++} ++ ++void au_dpri_sb(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ aufs_bindex_t bindex; ++ int err; ++ /* to reuduce stack size */ ++ struct { ++ struct vfsmount mnt; ++ struct au_branch fake; ++ } *a; ++ ++ /* this function can be called from magic sysrq */ ++ a = kzalloc(sizeof(*a), GFP_ATOMIC); ++ if (unlikely(!a)) { ++ dpri("no memory\n"); ++ return; ++ } ++ ++ a->mnt.mnt_sb = sb; ++ a->fake.br_perm = 0; ++ a->fake.br_mnt = &a->mnt; ++ a->fake.br_xino.xi_file = NULL; ++ atomic_set(&a->fake.br_count, 0); ++ smp_mb(); /* atomic_set */ ++ err = do_pri_br(-1, &a->fake); ++ kfree(a); ++ dpri("dev 0x%x\n", sb->s_dev); ++ if (err || !au_test_aufs(sb)) ++ return; ++ ++ sbinfo = au_sbi(sb); ++ if (!sbinfo) ++ return; ++ dpri("nw %d, gen %u, kobj %d\n", ++ atomic_read(&sbinfo->si_nowait.nw_len), sbinfo->si_generation, ++ atomic_read(&sbinfo->si_kobj.kref.refcount)); ++ for (bindex = 0; bindex <= sbinfo->si_bend; bindex++) ++ do_pri_br(bindex, sbinfo->si_branch[0 + bindex]); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_dbg_sleep_jiffy(int jiffy) ++{ ++ while (jiffy) ++ jiffy = schedule_timeout_uninterruptible(jiffy); ++} ++ ++void au_dbg_iattr(struct iattr *ia) ++{ ++#define AuBit(name) if (ia->ia_valid & ATTR_ ## name) \ ++ dpri(#name "\n") ++ AuBit(MODE); ++ AuBit(UID); ++ AuBit(GID); ++ AuBit(SIZE); ++ AuBit(ATIME); ++ AuBit(MTIME); ++ AuBit(CTIME); ++ AuBit(ATIME_SET); ++ AuBit(MTIME_SET); ++ AuBit(FORCE); ++ AuBit(ATTR_FLAG); ++ AuBit(KILL_SUID); ++ AuBit(KILL_SGID); ++ AuBit(FILE); ++ AuBit(KILL_PRIV); ++ AuBit(OPEN); ++ AuBit(TIMES_SET); ++#undef AuBit ++ dpri("ia_file %p\n", ia->ia_file); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line) ++{ ++ struct inode *h_inode, *inode = dentry->d_inode; ++ struct dentry *h_dentry; ++ aufs_bindex_t bindex, bend, bi; ++ ++ if (!inode /* || au_di(dentry)->di_lsc == AuLsc_DI_TMP */) ++ return; ++ ++ bend = au_dbend(dentry); ++ bi = au_ibend(inode); ++ if (bi < bend) ++ bend = bi; ++ bindex = au_dbstart(dentry); ++ bi = au_ibstart(inode); ++ if (bi > bindex) ++ bindex = bi; ++ ++ for (; bindex <= bend; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ h_inode = au_h_iptr(inode, bindex); ++ if (unlikely(h_inode != h_dentry->d_inode)) { ++ int old = au_debug_test(); ++ if (!old) ++ au_debug(1); ++ AuDbg("b%d, %s:%d\n", bindex, func, line); ++ AuDbgDentry(dentry); ++ AuDbgInode(inode); ++ if (!old) ++ au_debug(0); ++ BUG(); ++ } ++ } ++} ++ ++void 
au_dbg_verify_dir_parent(struct dentry *dentry, unsigned int sigen) ++{ ++ struct dentry *parent; ++ ++ parent = dget_parent(dentry); ++ AuDebugOn(!S_ISDIR(dentry->d_inode->i_mode)); ++ AuDebugOn(IS_ROOT(dentry)); ++ AuDebugOn(au_digen_test(parent, sigen)); ++ dput(parent); ++} ++ ++void au_dbg_verify_nondir_parent(struct dentry *dentry, unsigned int sigen) ++{ ++ struct dentry *parent; ++ struct inode *inode; ++ ++ parent = dget_parent(dentry); ++ inode = dentry->d_inode; ++ AuDebugOn(inode && S_ISDIR(dentry->d_inode->i_mode)); ++ AuDebugOn(au_digen_test(parent, sigen)); ++ dput(parent); ++} ++ ++void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen) ++{ ++ int err, i, j; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ AuDebugOn(err); ++ err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/1); ++ AuDebugOn(err); ++ for (i = dpages.ndpage - 1; !err && i >= 0; i--) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ for (j = dpage->ndentry - 1; !err && j >= 0; j--) ++ AuDebugOn(au_digen_test(dentries[j], sigen)); ++ } ++ au_dpages_free(&dpages); ++} ++ ++void au_dbg_verify_kthread(void) ++{ ++ if (au_wkq_test()) { ++ au_dbg_blocked(); ++ /* ++ * It may be recursive, but udba=notify between two aufs mounts, ++ * where a single ro branch is shared, is not a problem. ++ */ ++ /* WARN_ON(1); */ ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_debug_sbinfo_init(struct au_sbinfo *sbinfo __maybe_unused) ++{ ++#ifdef AuForceNoPlink ++ au_opt_clr(sbinfo->si_mntflags, PLINK); ++#endif ++#ifdef AuForceNoXino ++ au_opt_clr(sbinfo->si_mntflags, XINO); ++#endif ++#ifdef AuForceNoRefrof ++ au_opt_clr(sbinfo->si_mntflags, REFROF); ++#endif ++#ifdef AuForceHnotify ++ au_opt_set_udba(sbinfo->si_mntflags, UDBA_HNOTIFY); ++#endif ++#ifdef AuForceRd0 ++ sbinfo->si_rdblk = 0; ++ sbinfo->si_rdhash = 0; ++#endif ++} ++ ++int __init au_debug_init(void) ++{ ++ aufs_bindex_t bindex; ++ struct au_vdir_destr destr; ++ ++ bindex = -1; ++ AuDebugOn(bindex >= 0); ++ ++ destr.len = -1; ++ AuDebugOn(destr.len < NAME_MAX); ++ ++#ifdef CONFIG_4KSTACKS ++ pr_warning("CONFIG_4KSTACKS is defined.\n"); ++#endif ++ ++#ifdef AuForceNoBrs ++ sysaufs_brs = 0; ++#endif ++ ++ return 0; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/debug.h linux-3.2.0-gentoo-r1/fs/aufs/debug.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/debug.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/debug.h 2012-01-17 12:11:24.553358287 +0100 +@@ -0,0 +1,243 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * debug print functions ++ */ ++ ++#ifndef __AUFS_DEBUG_H__ ++#define __AUFS_DEBUG_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_AUFS_DEBUG ++#define AuDebugOn(a) BUG_ON(a) ++ ++/* module parameter */ ++extern int aufs_debug; ++static inline void au_debug(int n) ++{ ++ aufs_debug = n; ++ smp_mb(); ++} ++ ++static inline int au_debug_test(void) ++{ ++ return aufs_debug; ++} ++#else ++#define AuDebugOn(a) do {} while (0) ++AuStubVoid(au_debug, int n) ++AuStubInt0(au_debug_test, void) ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* debug print */ ++ ++#define AuDbg(fmt, ...) do { \ ++ if (au_debug_test()) \ ++ pr_debug("DEBUG: " fmt, ##__VA_ARGS__); \ ++} while (0) ++#define AuLabel(l) AuDbg(#l "\n") ++#define AuIOErr(fmt, ...) pr_err("I/O Error, " fmt, ##__VA_ARGS__) ++#define AuWarn1(fmt, ...) do { \ ++ static unsigned char _c; \ ++ if (!_c++) \ ++ pr_warning(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define AuErr1(fmt, ...) do { \ ++ static unsigned char _c; \ ++ if (!_c++) \ ++ pr_err(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define AuIOErr1(fmt, ...) do { \ ++ static unsigned char _c; \ ++ if (!_c++) \ ++ AuIOErr(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define AuUnsupportMsg "This operation is not supported." \ ++ " Please report this application to aufs-users ML." ++#define AuUnsupport(fmt, ...) do { \ ++ pr_err(AuUnsupportMsg "\n" fmt, ##__VA_ARGS__); \ ++ dump_stack(); \ ++} while (0) ++ ++#define AuTraceErr(e) do { \ ++ if (unlikely((e) < 0)) \ ++ AuDbg("err %d\n", (int)(e)); \ ++} while (0) ++ ++#define AuTraceErrPtr(p) do { \ ++ if (IS_ERR(p)) \ ++ AuDbg("err %ld\n", PTR_ERR(p)); \ ++} while (0) ++ ++/* dirty macros for debug print, use with "%.*s" and caution */ ++#define AuLNPair(qstr) (qstr)->len, (qstr)->name ++#define AuDLNPair(d) AuLNPair(&(d)->d_name) ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_sbinfo; ++struct au_finfo; ++struct dentry; ++#ifdef CONFIG_AUFS_DEBUG ++extern char *au_plevel; ++struct au_nhash; ++void au_dpri_whlist(struct au_nhash *whlist); ++struct au_vdir; ++void au_dpri_vdir(struct au_vdir *vdir); ++struct inode; ++void au_dpri_inode(struct inode *inode); ++void au_dpri_dalias(struct inode *inode); ++void au_dpri_dentry(struct dentry *dentry); ++struct file; ++void au_dpri_file(struct file *filp); ++struct super_block; ++void au_dpri_sb(struct super_block *sb); ++ ++void au_dbg_sleep_jiffy(int jiffy); ++struct iattr; ++void au_dbg_iattr(struct iattr *ia); ++ ++#define au_dbg_verify_dinode(d) __au_dbg_verify_dinode(d, __func__, __LINE__) ++void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line); ++void au_dbg_verify_dir_parent(struct dentry *dentry, unsigned int sigen); ++void au_dbg_verify_nondir_parent(struct dentry *dentry, unsigned int sigen); ++void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen); ++void au_dbg_verify_kthread(void); ++ ++int __init au_debug_init(void); ++void au_debug_sbinfo_init(struct au_sbinfo *sbinfo); ++#define AuDbgWhlist(w) do { \ ++ AuDbg(#w "\n"); \ ++ au_dpri_whlist(w); \ ++} while (0) ++ ++#define AuDbgVdir(v) do { \ ++ AuDbg(#v "\n"); \ ++ au_dpri_vdir(v); \ ++} while (0) ++ 
++#define AuDbgInode(i) do { \ ++ AuDbg(#i "\n"); \ ++ au_dpri_inode(i); \ ++} while (0) ++ ++#define AuDbgDAlias(i) do { \ ++ AuDbg(#i "\n"); \ ++ au_dpri_dalias(i); \ ++} while (0) ++ ++#define AuDbgDentry(d) do { \ ++ AuDbg(#d "\n"); \ ++ au_dpri_dentry(d); \ ++} while (0) ++ ++#define AuDbgFile(f) do { \ ++ AuDbg(#f "\n"); \ ++ au_dpri_file(f); \ ++} while (0) ++ ++#define AuDbgSb(sb) do { \ ++ AuDbg(#sb "\n"); \ ++ au_dpri_sb(sb); \ ++} while (0) ++ ++#define AuDbgSleep(sec) do { \ ++ AuDbg("sleep %d sec\n", sec); \ ++ ssleep(sec); \ ++} while (0) ++ ++#define AuDbgSleepJiffy(jiffy) do { \ ++ AuDbg("sleep %d jiffies\n", jiffy); \ ++ au_dbg_sleep_jiffy(jiffy); \ ++} while (0) ++ ++#define AuDbgIAttr(ia) do { \ ++ AuDbg("ia_valid 0x%x\n", (ia)->ia_valid); \ ++ au_dbg_iattr(ia); \ ++} while (0) ++ ++#define AuDbgSym(addr) do { \ ++ char sym[KSYM_SYMBOL_LEN]; \ ++ sprint_symbol(sym, (unsigned long)addr); \ ++ AuDbg("%s\n", sym); \ ++} while (0) ++ ++#define AuInfoSym(addr) do { \ ++ char sym[KSYM_SYMBOL_LEN]; \ ++ sprint_symbol(sym, (unsigned long)addr); \ ++ AuInfo("%s\n", sym); \ ++} while (0) ++#else ++AuStubVoid(au_dbg_verify_dinode, struct dentry *dentry) ++AuStubVoid(au_dbg_verify_dir_parent, struct dentry *dentry, unsigned int sigen) ++AuStubVoid(au_dbg_verify_nondir_parent, struct dentry *dentry, ++ unsigned int sigen) ++AuStubVoid(au_dbg_verify_gen, struct dentry *parent, unsigned int sigen) ++AuStubVoid(au_dbg_verify_kthread, void) ++AuStubInt0(__init au_debug_init, void) ++AuStubVoid(au_debug_sbinfo_init, struct au_sbinfo *sbinfo) ++ ++#define AuDbgWhlist(w) do {} while (0) ++#define AuDbgVdir(v) do {} while (0) ++#define AuDbgInode(i) do {} while (0) ++#define AuDbgDAlias(i) do {} while (0) ++#define AuDbgDentry(d) do {} while (0) ++#define AuDbgFile(f) do {} while (0) ++#define AuDbgSb(sb) do {} while (0) ++#define AuDbgSleep(sec) do {} while (0) ++#define AuDbgSleepJiffy(jiffy) do {} while (0) ++#define AuDbgIAttr(ia) do {} while (0) ++#define AuDbgSym(addr) do {} while (0) ++#define AuInfoSym(addr) do {} while (0) ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_MAGIC_SYSRQ ++int __init au_sysrq_init(void); ++void au_sysrq_fin(void); ++ ++#ifdef CONFIG_HW_CONSOLE ++#define au_dbg_blocked() do { \ ++ WARN_ON(1); \ ++ handle_sysrq('w'); \ ++} while (0) ++#else ++AuStubVoid(au_dbg_blocked, void) ++#endif ++ ++#else ++AuStubInt0(__init au_sysrq_init, void) ++AuStubVoid(au_sysrq_fin, void) ++AuStubVoid(au_dbg_blocked, void) ++#endif /* CONFIG_AUFS_MAGIC_SYSRQ */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DEBUG_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/dentry.c linux-3.2.0-gentoo-r1/fs/aufs/dentry.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/dentry.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/dentry.c 2012-01-17 12:11:24.562617629 +0100 +@@ -0,0 +1,1140 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * lookup and dentry operations ++ */ ++ ++#include ++#include "aufs.h" ++ ++static void au_h_nd(struct nameidata *h_nd, struct nameidata *nd) ++{ ++ if (nd) { ++ *h_nd = *nd; ++ ++ /* ++ * gave up supporting LOOKUP_CREATE/OPEN for lower fs, ++ * due to whiteout and branch permission. ++ */ ++ h_nd->flags &= ~(/*LOOKUP_PARENT |*/ LOOKUP_OPEN | LOOKUP_CREATE ++ | LOOKUP_FOLLOW | LOOKUP_EXCL); ++ /* unnecessary? */ ++ h_nd->intent.open.file = NULL; ++ } else ++ memset(h_nd, 0, sizeof(*h_nd)); ++} ++ ++struct au_lkup_one_args { ++ struct dentry **errp; ++ struct qstr *name; ++ struct dentry *h_parent; ++ struct au_branch *br; ++ struct nameidata *nd; ++}; ++ ++struct dentry *au_lkup_one(struct qstr *name, struct dentry *h_parent, ++ struct au_branch *br, struct nameidata *nd) ++{ ++ struct dentry *h_dentry; ++ int err; ++ struct nameidata h_nd; ++ ++ if (au_test_fs_null_nd(h_parent->d_sb)) ++ return vfsub_lookup_one_len(name->name, h_parent, name->len); ++ ++ au_h_nd(&h_nd, nd); ++ h_nd.path.dentry = h_parent; ++ h_nd.path.mnt = br->br_mnt; ++ ++ err = vfsub_name_hash(name->name, &h_nd.last, name->len); ++ h_dentry = ERR_PTR(err); ++ if (!err) { ++ path_get(&h_nd.path); ++ h_dentry = vfsub_lookup_hash(&h_nd); ++ path_put(&h_nd.path); ++ } ++ ++ AuTraceErrPtr(h_dentry); ++ return h_dentry; ++} ++ ++static void au_call_lkup_one(void *args) ++{ ++ struct au_lkup_one_args *a = args; ++ *a->errp = au_lkup_one(a->name, a->h_parent, a->br, a->nd); ++} ++ ++#define AuLkup_ALLOW_NEG 1 ++#define au_ftest_lkup(flags, name) ((flags) & AuLkup_##name) ++#define au_fset_lkup(flags, name) \ ++ do { (flags) |= AuLkup_##name; } while (0) ++#define au_fclr_lkup(flags, name) \ ++ do { (flags) &= ~AuLkup_##name; } while (0) ++ ++struct au_do_lookup_args { ++ unsigned int flags; ++ mode_t type; ++ struct nameidata *nd; ++}; ++ ++/* ++ * returns positive/negative dentry, NULL or an error. ++ * NULL means whiteout-ed or not-found. 
++ */ ++static struct dentry* ++au_do_lookup(struct dentry *h_parent, struct dentry *dentry, ++ aufs_bindex_t bindex, struct qstr *wh_name, ++ struct au_do_lookup_args *args) ++{ ++ struct dentry *h_dentry; ++ struct inode *h_inode, *inode; ++ struct au_branch *br; ++ int wh_found, opq; ++ unsigned char wh_able; ++ const unsigned char allow_neg = !!au_ftest_lkup(args->flags, ALLOW_NEG); ++ ++ wh_found = 0; ++ br = au_sbr(dentry->d_sb, bindex); ++ wh_able = !!au_br_whable(br->br_perm); ++ if (wh_able) ++ wh_found = au_wh_test(h_parent, wh_name, br, /*try_sio*/0); ++ h_dentry = ERR_PTR(wh_found); ++ if (!wh_found) ++ goto real_lookup; ++ if (unlikely(wh_found < 0)) ++ goto out; ++ ++ /* We found a whiteout */ ++ /* au_set_dbend(dentry, bindex); */ ++ au_set_dbwh(dentry, bindex); ++ if (!allow_neg) ++ return NULL; /* success */ ++ ++real_lookup: ++ h_dentry = au_lkup_one(&dentry->d_name, h_parent, br, args->nd); ++ if (IS_ERR(h_dentry)) ++ goto out; ++ ++ h_inode = h_dentry->d_inode; ++ if (!h_inode) { ++ if (!allow_neg) ++ goto out_neg; ++ } else if (wh_found ++ || (args->type && args->type != (h_inode->i_mode & S_IFMT))) ++ goto out_neg; ++ ++ if (au_dbend(dentry) <= bindex) ++ au_set_dbend(dentry, bindex); ++ if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry)) ++ au_set_dbstart(dentry, bindex); ++ au_set_h_dptr(dentry, bindex, h_dentry); ++ ++ inode = dentry->d_inode; ++ if (!h_inode || !S_ISDIR(h_inode->i_mode) || !wh_able ++ || (inode && !S_ISDIR(inode->i_mode))) ++ goto out; /* success */ ++ ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ opq = au_diropq_test(h_dentry, br); ++ mutex_unlock(&h_inode->i_mutex); ++ if (opq > 0) ++ au_set_dbdiropq(dentry, bindex); ++ else if (unlikely(opq < 0)) { ++ au_set_h_dptr(dentry, bindex, NULL); ++ h_dentry = ERR_PTR(opq); ++ } ++ goto out; ++ ++out_neg: ++ dput(h_dentry); ++ h_dentry = NULL; ++out: ++ return h_dentry; ++} ++ ++static int au_test_shwh(struct super_block *sb, const struct qstr *name) ++{ ++ if (unlikely(!au_opt_test(au_mntflags(sb), SHWH) ++ && !strncmp(name->name, AUFS_WH_PFX, AUFS_WH_PFX_LEN))) ++ return -EPERM; ++ return 0; ++} ++ ++/* ++ * returns the number of lower positive dentries, ++ * otherwise an error. ++ * can be called at unlinking with @type is zero. 
++ */ ++int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type, ++ struct nameidata *nd) ++{ ++ int npositive, err; ++ aufs_bindex_t bindex, btail, bdiropq; ++ unsigned char isdir; ++ struct qstr whname; ++ struct au_do_lookup_args args = { ++ .flags = 0, ++ .type = type, ++ .nd = nd ++ }; ++ const struct qstr *name = &dentry->d_name; ++ struct dentry *parent; ++ struct inode *inode; ++ ++ err = au_test_shwh(dentry->d_sb, name); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_wh_name_alloc(&whname, name); ++ if (unlikely(err)) ++ goto out; ++ ++ inode = dentry->d_inode; ++ isdir = !!(inode && S_ISDIR(inode->i_mode)); ++ if (!type) ++ au_fset_lkup(args.flags, ALLOW_NEG); ++ ++ npositive = 0; ++ parent = dget_parent(dentry); ++ btail = au_dbtaildir(parent); ++ for (bindex = bstart; bindex <= btail; bindex++) { ++ struct dentry *h_parent, *h_dentry; ++ struct inode *h_inode, *h_dir; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry) { ++ if (h_dentry->d_inode) ++ npositive++; ++ if (type != S_IFDIR) ++ break; ++ continue; ++ } ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent) ++ continue; ++ h_dir = h_parent->d_inode; ++ if (!h_dir || !S_ISDIR(h_dir->i_mode)) ++ continue; ++ ++ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); ++ h_dentry = au_do_lookup(h_parent, dentry, bindex, &whname, ++ &args); ++ mutex_unlock(&h_dir->i_mutex); ++ err = PTR_ERR(h_dentry); ++ if (IS_ERR(h_dentry)) ++ goto out_parent; ++ au_fclr_lkup(args.flags, ALLOW_NEG); ++ ++ if (au_dbwh(dentry) >= 0) ++ break; ++ if (!h_dentry) ++ continue; ++ h_inode = h_dentry->d_inode; ++ if (!h_inode) ++ continue; ++ npositive++; ++ if (!args.type) ++ args.type = h_inode->i_mode & S_IFMT; ++ if (args.type != S_IFDIR) ++ break; ++ else if (isdir) { ++ /* the type of lower may be different */ ++ bdiropq = au_dbdiropq(dentry); ++ if (bdiropq >= 0 && bdiropq <= bindex) ++ break; ++ } ++ } ++ ++ if (npositive) { ++ AuLabel(positive); ++ au_update_dbstart(dentry); ++ } ++ err = npositive; ++ if (unlikely(!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE) ++ && au_dbstart(dentry) < 0)) { ++ err = -EIO; ++ AuIOErr("both of real entry and whiteout found, %.*s, err %d\n", ++ AuDLNPair(dentry), err); ++ } ++ ++out_parent: ++ dput(parent); ++ kfree(whname.name); ++out: ++ return err; ++} ++ ++struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent, ++ struct au_branch *br) ++{ ++ struct dentry *dentry; ++ int wkq_err; ++ ++ if (!au_test_h_perm_sio(parent->d_inode, MAY_EXEC)) ++ dentry = au_lkup_one(name, parent, br, /*nd*/NULL); ++ else { ++ struct au_lkup_one_args args = { ++ .errp = &dentry, ++ .name = name, ++ .h_parent = parent, ++ .br = br, ++ .nd = NULL ++ }; ++ ++ wkq_err = au_wkq_wait(au_call_lkup_one, &args); ++ if (unlikely(wkq_err)) ++ dentry = ERR_PTR(wkq_err); ++ } ++ ++ return dentry; ++} ++ ++/* ++ * lookup @dentry on @bindex which should be negative. 
++ */ ++int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ int err; ++ struct dentry *parent, *h_parent, *h_dentry; ++ ++ parent = dget_parent(dentry); ++ h_parent = au_h_dptr(parent, bindex); ++ h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent, ++ au_sbr(dentry->d_sb, bindex)); ++ err = PTR_ERR(h_dentry); ++ if (IS_ERR(h_dentry)) ++ goto out; ++ if (unlikely(h_dentry->d_inode)) { ++ err = -EIO; ++ AuIOErr("%.*s should be negative on b%d.\n", ++ AuDLNPair(h_dentry), bindex); ++ dput(h_dentry); ++ goto out; ++ } ++ ++ err = 0; ++ if (bindex < au_dbstart(dentry)) ++ au_set_dbstart(dentry, bindex); ++ if (au_dbend(dentry) < bindex) ++ au_set_dbend(dentry, bindex); ++ au_set_h_dptr(dentry, bindex, h_dentry); ++ ++out: ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* subset of struct inode */ ++struct au_iattr { ++ unsigned long i_ino; ++ /* unsigned int i_nlink; */ ++ uid_t i_uid; ++ gid_t i_gid; ++ u64 i_version; ++/* ++ loff_t i_size; ++ blkcnt_t i_blocks; ++*/ ++ umode_t i_mode; ++}; ++ ++static void au_iattr_save(struct au_iattr *ia, struct inode *h_inode) ++{ ++ ia->i_ino = h_inode->i_ino; ++ /* ia->i_nlink = h_inode->i_nlink; */ ++ ia->i_uid = h_inode->i_uid; ++ ia->i_gid = h_inode->i_gid; ++ ia->i_version = h_inode->i_version; ++/* ++ ia->i_size = h_inode->i_size; ++ ia->i_blocks = h_inode->i_blocks; ++*/ ++ ia->i_mode = (h_inode->i_mode & S_IFMT); ++} ++ ++static int au_iattr_test(struct au_iattr *ia, struct inode *h_inode) ++{ ++ return ia->i_ino != h_inode->i_ino ++ /* || ia->i_nlink != h_inode->i_nlink */ ++ || ia->i_uid != h_inode->i_uid ++ || ia->i_gid != h_inode->i_gid ++ || ia->i_version != h_inode->i_version ++/* ++ || ia->i_size != h_inode->i_size ++ || ia->i_blocks != h_inode->i_blocks ++*/ ++ || ia->i_mode != (h_inode->i_mode & S_IFMT); ++} ++ ++static int au_h_verify_dentry(struct dentry *h_dentry, struct dentry *h_parent, ++ struct au_branch *br) ++{ ++ int err; ++ struct au_iattr ia; ++ struct inode *h_inode; ++ struct dentry *h_d; ++ struct super_block *h_sb; ++ ++ err = 0; ++ memset(&ia, -1, sizeof(ia)); ++ h_sb = h_dentry->d_sb; ++ h_inode = h_dentry->d_inode; ++ if (h_inode) ++ au_iattr_save(&ia, h_inode); ++ else if (au_test_nfs(h_sb) || au_test_fuse(h_sb)) ++ /* nfs d_revalidate may return 0 for negative dentry */ ++ /* fuse d_revalidate always return 0 for negative dentry */ ++ goto out; ++ ++ /* main purpose is namei.c:cached_lookup() and d_revalidate */ ++ h_d = au_lkup_one(&h_dentry->d_name, h_parent, br, /*nd*/NULL); ++ err = PTR_ERR(h_d); ++ if (IS_ERR(h_d)) ++ goto out; ++ ++ err = 0; ++ if (unlikely(h_d != h_dentry ++ || h_d->d_inode != h_inode ++ || (h_inode && au_iattr_test(&ia, h_inode)))) ++ err = au_busy_or_stale(); ++ dput(h_d); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, ++ struct dentry *h_parent, struct au_branch *br) ++{ ++ int err; ++ ++ err = 0; ++ if (udba == AuOpt_UDBA_REVAL ++ && !au_test_fs_remote(h_dentry->d_sb)) { ++ IMustLock(h_dir); ++ err = (h_dentry->d_parent->d_inode != h_dir); ++ } else if (udba != AuOpt_UDBA_NONE) ++ err = au_h_verify_dentry(h_dentry, h_parent, br); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_refresh_hdentry(struct dentry *dentry, struct dentry *parent) ++{ ++ int err; ++ aufs_bindex_t new_bindex, bindex, bend, bwh, bdiropq; ++ struct au_hdentry tmp, 
*p, *q; ++ struct au_dinfo *dinfo; ++ struct super_block *sb; ++ ++ DiMustWriteLock(dentry); ++ ++ sb = dentry->d_sb; ++ dinfo = au_di(dentry); ++ bend = dinfo->di_bend; ++ bwh = dinfo->di_bwh; ++ bdiropq = dinfo->di_bdiropq; ++ p = dinfo->di_hdentry + dinfo->di_bstart; ++ for (bindex = dinfo->di_bstart; bindex <= bend; bindex++, p++) { ++ if (!p->hd_dentry) ++ continue; ++ ++ new_bindex = au_br_index(sb, p->hd_id); ++ if (new_bindex == bindex) ++ continue; ++ ++ if (dinfo->di_bwh == bindex) ++ bwh = new_bindex; ++ if (dinfo->di_bdiropq == bindex) ++ bdiropq = new_bindex; ++ if (new_bindex < 0) { ++ au_hdput(p); ++ p->hd_dentry = NULL; ++ continue; ++ } ++ ++ /* swap two lower dentries, and loop again */ ++ q = dinfo->di_hdentry + new_bindex; ++ tmp = *q; ++ *q = *p; ++ *p = tmp; ++ if (tmp.hd_dentry) { ++ bindex--; ++ p--; ++ } ++ } ++ ++ dinfo->di_bwh = -1; ++ if (bwh >= 0 && bwh <= au_sbend(sb) && au_sbr_whable(sb, bwh)) ++ dinfo->di_bwh = bwh; ++ ++ dinfo->di_bdiropq = -1; ++ if (bdiropq >= 0 ++ && bdiropq <= au_sbend(sb) ++ && au_sbr_whable(sb, bdiropq)) ++ dinfo->di_bdiropq = bdiropq; ++ ++ err = -EIO; ++ dinfo->di_bstart = -1; ++ dinfo->di_bend = -1; ++ bend = au_dbend(parent); ++ p = dinfo->di_hdentry; ++ for (bindex = 0; bindex <= bend; bindex++, p++) ++ if (p->hd_dentry) { ++ dinfo->di_bstart = bindex; ++ break; ++ } ++ ++ if (dinfo->di_bstart >= 0) { ++ p = dinfo->di_hdentry + bend; ++ for (bindex = bend; bindex >= 0; bindex--, p--) ++ if (p->hd_dentry) { ++ dinfo->di_bend = bindex; ++ err = 0; ++ break; ++ } ++ } ++ ++ return err; ++} ++ ++static void au_do_hide(struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ inode = dentry->d_inode; ++ if (inode) { ++ if (!S_ISDIR(inode->i_mode)) { ++ if (inode->i_nlink && !d_unhashed(dentry)) ++ drop_nlink(inode); ++ } else { ++ clear_nlink(inode); ++ /* stop next lookup */ ++ inode->i_flags |= S_DEAD; ++ } ++ smp_mb(); /* necessary? */ ++ } ++ d_drop(dentry); ++} ++ ++static int au_hide_children(struct dentry *parent) ++{ ++ int err, i, j, ndentry; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry *dentry; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, parent, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ /* in reverse order */ ++ for (i = dpages.ndpage - 1; i >= 0; i--) { ++ dpage = dpages.dpages + i; ++ ndentry = dpage->ndentry; ++ for (j = ndentry - 1; j >= 0; j--) { ++ dentry = dpage->dentries[j]; ++ if (dentry != parent) ++ au_do_hide(dentry); ++ } ++ } ++ ++out_dpages: ++ au_dpages_free(&dpages); ++out: ++ return err; ++} ++ ++static void au_hide(struct dentry *dentry) ++{ ++ int err; ++ struct inode *inode; ++ ++ AuDbgDentry(dentry); ++ inode = dentry->d_inode; ++ if (inode && S_ISDIR(inode->i_mode)) { ++ /* shrink_dcache_parent(dentry); */ ++ err = au_hide_children(dentry); ++ if (unlikely(err)) ++ AuIOErr("%.*s, failed hiding children, ignored %d\n", ++ AuDLNPair(dentry), err); ++ } ++ au_do_hide(dentry); ++} ++ ++/* ++ * By adding a dirty branch, a cached dentry may be affected in various ways. 
++ * ++ * a dirty branch is added ++ * - on the top of layers ++ * - in the middle of layers ++ * - to the bottom of layers ++ * ++ * on the added branch there exists ++ * - a whiteout ++ * - a diropq ++ * - a same named entry ++ * + exist ++ * * negative --> positive ++ * * positive --> positive ++ * - type is unchanged ++ * - type is changed ++ * + doesn't exist ++ * * negative --> negative ++ * * positive --> negative (rejected by au_br_del() for non-dir case) ++ * - none ++ */ ++static int au_refresh_by_dinfo(struct dentry *dentry, struct au_dinfo *dinfo, ++ struct au_dinfo *tmp) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ struct { ++ struct dentry *dentry; ++ struct inode *inode; ++ mode_t mode; ++ } orig_h, tmp_h; ++ struct au_hdentry *hd; ++ struct inode *inode, *h_inode; ++ struct dentry *h_dentry; ++ ++ err = 0; ++ AuDebugOn(dinfo->di_bstart < 0); ++ orig_h.dentry = dinfo->di_hdentry[dinfo->di_bstart].hd_dentry; ++ orig_h.inode = orig_h.dentry->d_inode; ++ orig_h.mode = 0; ++ if (orig_h.inode) ++ orig_h.mode = orig_h.inode->i_mode & S_IFMT; ++ memset(&tmp_h, 0, sizeof(tmp_h)); ++ if (tmp->di_bstart >= 0) { ++ tmp_h.dentry = tmp->di_hdentry[tmp->di_bstart].hd_dentry; ++ tmp_h.inode = tmp_h.dentry->d_inode; ++ if (tmp_h.inode) ++ tmp_h.mode = tmp_h.inode->i_mode & S_IFMT; ++ } ++ ++ inode = dentry->d_inode; ++ if (!orig_h.inode) { ++ AuDbg("nagative originally\n"); ++ if (inode) { ++ au_hide(dentry); ++ goto out; ++ } ++ AuDebugOn(inode); ++ AuDebugOn(dinfo->di_bstart != dinfo->di_bend); ++ AuDebugOn(dinfo->di_bdiropq != -1); ++ ++ if (!tmp_h.inode) { ++ AuDbg("negative --> negative\n"); ++ /* should have only one negative lower */ ++ if (tmp->di_bstart >= 0 ++ && tmp->di_bstart < dinfo->di_bstart) { ++ AuDebugOn(tmp->di_bstart != tmp->di_bend); ++ AuDebugOn(dinfo->di_bstart != dinfo->di_bend); ++ au_set_h_dptr(dentry, dinfo->di_bstart, NULL); ++ au_di_cp(dinfo, tmp); ++ hd = tmp->di_hdentry + tmp->di_bstart; ++ au_set_h_dptr(dentry, tmp->di_bstart, ++ dget(hd->hd_dentry)); ++ } ++ au_dbg_verify_dinode(dentry); ++ } else { ++ AuDbg("negative --> positive\n"); ++ /* ++ * similar to the behaviour of creating with bypassing ++ * aufs. ++ * unhash it in order to force an error in the ++ * succeeding create operation. ++ * we should not set S_DEAD here. ++ */ ++ d_drop(dentry); ++ /* au_di_swap(tmp, dinfo); */ ++ au_dbg_verify_dinode(dentry); ++ } ++ } else { ++ AuDbg("positive originally\n"); ++ /* inode may be NULL */ ++ AuDebugOn(inode && (inode->i_mode & S_IFMT) != orig_h.mode); ++ if (!tmp_h.inode) { ++ AuDbg("positive --> negative\n"); ++ /* or bypassing aufs */ ++ au_hide(dentry); ++ if (tmp->di_bwh >= 0 && tmp->di_bwh <= dinfo->di_bstart) ++ dinfo->di_bwh = tmp->di_bwh; ++ if (inode) ++ err = au_refresh_hinode_self(inode); ++ au_dbg_verify_dinode(dentry); ++ } else if (orig_h.mode == tmp_h.mode) { ++ AuDbg("positive --> positive, same type\n"); ++ if (!S_ISDIR(orig_h.mode) ++ && dinfo->di_bstart > tmp->di_bstart) { ++ /* ++ * similar to the behaviour of removing and ++ * creating. 
++ */ ++ au_hide(dentry); ++ if (inode) ++ err = au_refresh_hinode_self(inode); ++ au_dbg_verify_dinode(dentry); ++ } else { ++ /* fill empty slots */ ++ if (dinfo->di_bstart > tmp->di_bstart) ++ dinfo->di_bstart = tmp->di_bstart; ++ if (dinfo->di_bend < tmp->di_bend) ++ dinfo->di_bend = tmp->di_bend; ++ dinfo->di_bwh = tmp->di_bwh; ++ dinfo->di_bdiropq = tmp->di_bdiropq; ++ hd = tmp->di_hdentry; ++ bend = dinfo->di_bend; ++ for (bindex = tmp->di_bstart; bindex <= bend; ++ bindex++) { ++ if (au_h_dptr(dentry, bindex)) ++ continue; ++ h_dentry = hd[bindex].hd_dentry; ++ if (!h_dentry) ++ continue; ++ h_inode = h_dentry->d_inode; ++ AuDebugOn(!h_inode); ++ AuDebugOn(orig_h.mode ++ != (h_inode->i_mode ++ & S_IFMT)); ++ au_set_h_dptr(dentry, bindex, ++ dget(h_dentry)); ++ } ++ err = au_refresh_hinode(inode, dentry); ++ au_dbg_verify_dinode(dentry); ++ } ++ } else { ++ AuDbg("positive --> positive, different type\n"); ++ /* similar to the behaviour of removing and creating */ ++ au_hide(dentry); ++ if (inode) ++ err = au_refresh_hinode_self(inode); ++ au_dbg_verify_dinode(dentry); ++ } ++ } ++ ++out: ++ return err; ++} ++ ++int au_refresh_dentry(struct dentry *dentry, struct dentry *parent) ++{ ++ int err, ebrange; ++ unsigned int sigen; ++ struct au_dinfo *dinfo, *tmp; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ DiMustWriteLock(dentry); ++ AuDebugOn(IS_ROOT(dentry)); ++ AuDebugOn(!parent->d_inode); ++ ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ sigen = au_sigen(sb); ++ err = au_digen_test(parent, sigen); ++ if (unlikely(err)) ++ goto out; ++ ++ dinfo = au_di(dentry); ++ err = au_di_realloc(dinfo, au_sbend(sb) + 1); ++ if (unlikely(err)) ++ goto out; ++ ebrange = au_dbrange_test(dentry); ++ if (!ebrange) ++ ebrange = au_do_refresh_hdentry(dentry, parent); ++ ++ if (d_unhashed(dentry) || ebrange) { ++ AuDebugOn(au_dbstart(dentry) < 0 && au_dbend(dentry) >= 0); ++ if (inode) ++ err = au_refresh_hinode_self(inode); ++ au_dbg_verify_dinode(dentry); ++ if (!err) ++ goto out_dgen; /* success */ ++ goto out; ++ } ++ ++ /* temporary dinfo */ ++ AuDbgDentry(dentry); ++ err = -ENOMEM; ++ tmp = au_di_alloc(sb, AuLsc_DI_TMP); ++ if (unlikely(!tmp)) ++ goto out; ++ au_di_swap(tmp, dinfo); ++ /* returns the number of positive dentries */ ++ /* ++ * if current working dir is removed, it returns an error. ++ * but the dentry is legal. 
++ */ ++ err = au_lkup_dentry(dentry, /*bstart*/0, /*type*/0, /*nd*/NULL); ++ AuDbgDentry(dentry); ++ au_di_swap(tmp, dinfo); ++ if (err == -ENOENT) ++ err = 0; ++ if (err >= 0) { ++ /* compare/refresh by dinfo */ ++ AuDbgDentry(dentry); ++ err = au_refresh_by_dinfo(dentry, dinfo, tmp); ++ au_dbg_verify_dinode(dentry); ++ AuTraceErr(err); ++ } ++ au_rw_write_unlock(&tmp->di_rwsem); ++ au_di_free(tmp); ++ if (unlikely(err)) ++ goto out; ++ ++out_dgen: ++ au_update_digen(dentry); ++out: ++ if (unlikely(err && !(dentry->d_flags & DCACHE_NFSFS_RENAMED))) { ++ AuIOErr("failed refreshing %.*s, %d\n", ++ AuDLNPair(dentry), err); ++ AuDbgDentry(dentry); ++ } ++ AuTraceErr(err); ++ return err; ++} ++ ++static noinline_for_stack ++int au_do_h_d_reval(struct dentry *h_dentry, struct nameidata *nd, ++ struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ int err, valid; ++ int (*reval)(struct dentry *, struct nameidata *); ++ ++ err = 0; ++ if (!(h_dentry->d_flags & DCACHE_OP_REVALIDATE)) ++ goto out; ++ reval = h_dentry->d_op->d_revalidate; ++ ++ AuDbg("b%d\n", bindex); ++ if (au_test_fs_null_nd(h_dentry->d_sb)) ++ /* it may return tri-state */ ++ valid = reval(h_dentry, NULL); ++ else { ++ struct nameidata h_nd; ++ int locked; ++ struct dentry *parent; ++ ++ au_h_nd(&h_nd, nd); ++ parent = nd->path.dentry; ++ locked = (nd && nd->path.dentry != dentry); ++ if (locked) ++ di_read_lock_parent(parent, AuLock_IR); ++ BUG_ON(bindex > au_dbend(parent)); ++ h_nd.path.dentry = au_h_dptr(parent, bindex); ++ BUG_ON(!h_nd.path.dentry); ++ h_nd.path.mnt = au_sbr(parent->d_sb, bindex)->br_mnt; ++ path_get(&h_nd.path); ++ valid = reval(h_dentry, &h_nd); ++ path_put(&h_nd.path); ++ if (locked) ++ di_read_unlock(parent, AuLock_IR); ++ } ++ ++ if (unlikely(valid < 0)) ++ err = valid; ++ else if (!valid) ++ err = -EINVAL; ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* todo: remove this */ ++static int h_d_revalidate(struct dentry *dentry, struct inode *inode, ++ struct nameidata *nd, int do_udba) ++{ ++ int err; ++ umode_t mode, h_mode; ++ aufs_bindex_t bindex, btail, bstart, ibs, ibe; ++ unsigned char plus, unhashed, is_root, h_plus; ++ struct inode *h_inode, *h_cached_inode; ++ struct dentry *h_dentry; ++ struct qstr *name, *h_name; ++ ++ err = 0; ++ plus = 0; ++ mode = 0; ++ ibs = -1; ++ ibe = -1; ++ unhashed = !!d_unhashed(dentry); ++ is_root = !!IS_ROOT(dentry); ++ name = &dentry->d_name; ++ ++ /* ++ * Theoretically, REVAL test should be unnecessary in case of ++ * {FS,I}NOTIFY. ++ * But {fs,i}notify doesn't fire some necessary events, ++ * IN_ATTRIB for atime/nlink/pageio ++ * IN_DELETE for NFS dentry ++ * Let's do REVAL test too. 
++ */ ++ if (do_udba && inode) { ++ mode = (inode->i_mode & S_IFMT); ++ plus = (inode->i_nlink > 0); ++ ibs = au_ibstart(inode); ++ ibe = au_ibend(inode); ++ } ++ ++ bstart = au_dbstart(dentry); ++ btail = bstart; ++ if (inode && S_ISDIR(inode->i_mode)) ++ btail = au_dbtaildir(dentry); ++ for (bindex = bstart; bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ ++ AuDbg("b%d, %.*s\n", bindex, AuDLNPair(h_dentry)); ++ spin_lock(&h_dentry->d_lock); ++ h_name = &h_dentry->d_name; ++ if (unlikely(do_udba ++ && !is_root ++ && (unhashed != !!d_unhashed(h_dentry) ++ || name->len != h_name->len ++ || memcmp(name->name, h_name->name, name->len)) ++ )) { ++ AuDbg("unhash 0x%x 0x%x, %.*s %.*s\n", ++ unhashed, d_unhashed(h_dentry), ++ AuDLNPair(dentry), AuDLNPair(h_dentry)); ++ spin_unlock(&h_dentry->d_lock); ++ goto err; ++ } ++ spin_unlock(&h_dentry->d_lock); ++ ++ err = au_do_h_d_reval(h_dentry, nd, dentry, bindex); ++ if (unlikely(err)) ++ /* do not goto err, to keep the errno */ ++ break; ++ ++ /* todo: plink too? */ ++ if (!do_udba) ++ continue; ++ ++ /* UDBA tests */ ++ h_inode = h_dentry->d_inode; ++ if (unlikely(!!inode != !!h_inode)) ++ goto err; ++ ++ h_plus = plus; ++ h_mode = mode; ++ h_cached_inode = h_inode; ++ if (h_inode) { ++ h_mode = (h_inode->i_mode & S_IFMT); ++ h_plus = (h_inode->i_nlink > 0); ++ } ++ if (inode && ibs <= bindex && bindex <= ibe) ++ h_cached_inode = au_h_iptr(inode, bindex); ++ ++ if (unlikely(plus != h_plus ++ || mode != h_mode ++ || h_cached_inode != h_inode)) ++ goto err; ++ continue; ++ ++ err: ++ err = -EINVAL; ++ break; ++ } ++ ++ return err; ++} ++ ++/* todo: consolidate with do_refresh() and au_reval_for_attr() */ ++static int simple_reval_dpath(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ struct dentry *parent; ++ ++ if (!au_digen_test(dentry, sigen)) ++ return 0; ++ ++ parent = dget_parent(dentry); ++ di_read_lock_parent(parent, AuLock_IR); ++ AuDebugOn(au_digen_test(parent, sigen)); ++ au_dbg_verify_gen(parent, sigen); ++ err = au_refresh_dentry(dentry, parent); ++ di_read_unlock(parent, AuLock_IR); ++ dput(parent); ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_reval_dpath(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ struct dentry *d, *parent; ++ struct inode *inode; ++ ++ if (!au_ftest_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR)) ++ return simple_reval_dpath(dentry, sigen); ++ ++ /* slow loop, keep it simple and stupid */ ++ /* cf: au_cpup_dirs() */ ++ err = 0; ++ parent = NULL; ++ while (au_digen_test(dentry, sigen)) { ++ d = dentry; ++ while (1) { ++ dput(parent); ++ parent = dget_parent(d); ++ if (!au_digen_test(parent, sigen)) ++ break; ++ d = parent; ++ } ++ ++ inode = d->d_inode; ++ if (d != dentry) ++ di_write_lock_child2(d); ++ ++ /* someone might update our dentry while we were sleeping */ ++ if (au_digen_test(d, sigen)) { ++ /* ++ * todo: consolidate with simple_reval_dpath(), ++ * do_refresh() and au_reval_for_attr(). ++ */ ++ di_read_lock_parent(parent, AuLock_IR); ++ err = au_refresh_dentry(d, parent); ++ di_read_unlock(parent, AuLock_IR); ++ } ++ ++ if (d != dentry) ++ di_write_unlock(d); ++ dput(parent); ++ if (unlikely(err)) ++ break; ++ } ++ ++ return err; ++} ++ ++/* ++ * if valid returns 1, otherwise 0. ++ */ ++static int aufs_d_revalidate(struct dentry *dentry, struct nameidata *nd) ++{ ++ int valid, err; ++ unsigned int sigen; ++ unsigned char do_udba; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ /* todo: support rcu-walk? 
*/ ++ if (nd && (nd->flags & LOOKUP_RCU)) ++ return -ECHILD; ++ ++ valid = 0; ++ if (unlikely(!au_di(dentry))) ++ goto out; ++ ++ inode = dentry->d_inode; ++ if (inode && is_bad_inode(inode)) ++ goto out; ++ ++ valid = 1; ++ sb = dentry->d_sb; ++ /* ++ * todo: very ugly ++ * i_mutex of parent dir may be held, ++ * but we should not return 'invalid' due to busy. ++ */ ++ err = aufs_read_lock(dentry, AuLock_FLUSH | AuLock_DW | AuLock_NOPLM); ++ if (unlikely(err)) { ++ valid = err; ++ AuTraceErr(err); ++ goto out; ++ } ++ if (unlikely(au_dbrange_test(dentry))) { ++ err = -EINVAL; ++ AuTraceErr(err); ++ goto out_dgrade; ++ } ++ ++ sigen = au_sigen(sb); ++ if (au_digen_test(dentry, sigen)) { ++ AuDebugOn(IS_ROOT(dentry)); ++ err = au_reval_dpath(dentry, sigen); ++ if (unlikely(err)) { ++ AuTraceErr(err); ++ goto out_dgrade; ++ } ++ } ++ di_downgrade_lock(dentry, AuLock_IR); ++ ++ err = -EINVAL; ++ if (inode && (IS_DEADDIR(inode) || !inode->i_nlink)) ++ goto out_inval; ++ ++ do_udba = !au_opt_test(au_mntflags(sb), UDBA_NONE); ++ if (do_udba && inode) { ++ aufs_bindex_t bstart = au_ibstart(inode); ++ struct inode *h_inode; ++ ++ if (bstart >= 0) { ++ h_inode = au_h_iptr(inode, bstart); ++ if (h_inode && au_test_higen(inode, h_inode)) ++ goto out_inval; ++ } ++ } ++ ++ err = h_d_revalidate(dentry, inode, nd, do_udba); ++ if (unlikely(!err && do_udba && au_dbstart(dentry) < 0)) { ++ err = -EIO; ++ AuDbg("both of real entry and whiteout found, %.*s, err %d\n", ++ AuDLNPair(dentry), err); ++ } ++ goto out_inval; ++ ++out_dgrade: ++ di_downgrade_lock(dentry, AuLock_IR); ++out_inval: ++ aufs_read_unlock(dentry, AuLock_IR); ++ AuTraceErr(err); ++ valid = !err; ++out: ++ if (!valid) { ++ AuDbg("%.*s invalid, %d\n", AuDLNPair(dentry), valid); ++ d_drop(dentry); ++ } ++ return valid; ++} ++ ++static void aufs_d_release(struct dentry *dentry) ++{ ++ if (au_di(dentry)) { ++ au_di_fin(dentry); ++ au_hn_di_reinit(dentry); ++ } ++} ++ ++const struct dentry_operations aufs_dop = { ++ .d_revalidate = aufs_d_revalidate, ++ .d_release = aufs_d_release ++}; +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/dentry.h linux-3.2.0-gentoo-r1/fs/aufs/dentry.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/dentry.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/dentry.h 2012-01-17 12:11:24.576506644 +0100 +@@ -0,0 +1,237 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * lookup and dentry operations ++ */ ++ ++#ifndef __AUFS_DENTRY_H__ ++#define __AUFS_DENTRY_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include "rwsem.h" ++ ++struct au_hdentry { ++ struct dentry *hd_dentry; ++ aufs_bindex_t hd_id; ++}; ++ ++struct au_dinfo { ++ atomic_t di_generation; ++ ++ struct au_rwsem di_rwsem; ++ aufs_bindex_t di_bstart, di_bend, di_bwh, di_bdiropq; ++ struct au_hdentry *di_hdentry; ++} ____cacheline_aligned_in_smp; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dentry.c */ ++extern const struct dentry_operations aufs_dop; ++struct au_branch; ++struct dentry *au_lkup_one(struct qstr *name, struct dentry *h_parent, ++ struct au_branch *br, struct nameidata *nd); ++struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent, ++ struct au_branch *br); ++int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, ++ struct dentry *h_parent, struct au_branch *br); ++ ++int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type, ++ struct nameidata *nd); ++int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex); ++int au_refresh_dentry(struct dentry *dentry, struct dentry *parent); ++int au_reval_dpath(struct dentry *dentry, unsigned int sigen); ++ ++/* dinfo.c */ ++void au_di_init_once(void *_di); ++struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc); ++void au_di_free(struct au_dinfo *dinfo); ++void au_di_swap(struct au_dinfo *a, struct au_dinfo *b); ++void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src); ++int au_di_init(struct dentry *dentry); ++void au_di_fin(struct dentry *dentry); ++int au_di_realloc(struct au_dinfo *dinfo, int nbr); ++ ++void di_read_lock(struct dentry *d, int flags, unsigned int lsc); ++void di_read_unlock(struct dentry *d, int flags); ++void di_downgrade_lock(struct dentry *d, int flags); ++void di_write_lock(struct dentry *d, unsigned int lsc); ++void di_write_unlock(struct dentry *d); ++void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir); ++void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir); ++void di_write_unlock2(struct dentry *d1, struct dentry *d2); ++ ++struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex); ++struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex); ++aufs_bindex_t au_dbtail(struct dentry *dentry); ++aufs_bindex_t au_dbtaildir(struct dentry *dentry); ++ ++void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_dentry); ++int au_digen_test(struct dentry *dentry, unsigned int sigen); ++int au_dbrange_test(struct dentry *dentry); ++void au_update_digen(struct dentry *dentry); ++void au_update_dbrange(struct dentry *dentry, int do_put_zero); ++void au_update_dbstart(struct dentry *dentry); ++void au_update_dbend(struct dentry *dentry); ++int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_dinfo *au_di(struct dentry *dentry) ++{ ++ return dentry->d_fsdata; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock subclass for dinfo */ ++enum { ++ AuLsc_DI_CHILD, /* child first */ ++ AuLsc_DI_CHILD2, /* 
rename(2), link(2), and cpup at hnotify */ ++ AuLsc_DI_CHILD3, /* copyup dirs */ ++ AuLsc_DI_PARENT, ++ AuLsc_DI_PARENT2, ++ AuLsc_DI_PARENT3, ++ AuLsc_DI_TMP /* temp for replacing dinfo */ ++}; ++ ++/* ++ * di_read_lock_child, di_write_lock_child, ++ * di_read_lock_child2, di_write_lock_child2, ++ * di_read_lock_child3, di_write_lock_child3, ++ * di_read_lock_parent, di_write_lock_parent, ++ * di_read_lock_parent2, di_write_lock_parent2, ++ * di_read_lock_parent3, di_write_lock_parent3, ++ */ ++#define AuReadLockFunc(name, lsc) \ ++static inline void di_read_lock_##name(struct dentry *d, int flags) \ ++{ di_read_lock(d, flags, AuLsc_DI_##lsc); } ++ ++#define AuWriteLockFunc(name, lsc) \ ++static inline void di_write_lock_##name(struct dentry *d) \ ++{ di_write_lock(d, AuLsc_DI_##lsc); } ++ ++#define AuRWLockFuncs(name, lsc) \ ++ AuReadLockFunc(name, lsc) \ ++ AuWriteLockFunc(name, lsc) ++ ++AuRWLockFuncs(child, CHILD); ++AuRWLockFuncs(child2, CHILD2); ++AuRWLockFuncs(child3, CHILD3); ++AuRWLockFuncs(parent, PARENT); ++AuRWLockFuncs(parent2, PARENT2); ++AuRWLockFuncs(parent3, PARENT3); ++ ++#undef AuReadLockFunc ++#undef AuWriteLockFunc ++#undef AuRWLockFuncs ++ ++#define DiMustNoWaiters(d) AuRwMustNoWaiters(&au_di(d)->di_rwsem) ++#define DiMustAnyLock(d) AuRwMustAnyLock(&au_di(d)->di_rwsem) ++#define DiMustWriteLock(d) AuRwMustWriteLock(&au_di(d)->di_rwsem) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: memory barrier? */ ++static inline unsigned int au_digen(struct dentry *d) ++{ ++ return atomic_read(&au_di(d)->di_generation); ++} ++ ++static inline void au_h_dentry_init(struct au_hdentry *hdentry) ++{ ++ hdentry->hd_dentry = NULL; ++} ++ ++static inline void au_hdput(struct au_hdentry *hd) ++{ ++ if (hd) ++ dput(hd->hd_dentry); ++} ++ ++static inline aufs_bindex_t au_dbstart(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bstart; ++} ++ ++static inline aufs_bindex_t au_dbend(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bend; ++} ++ ++static inline aufs_bindex_t au_dbwh(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bwh; ++} ++ ++static inline aufs_bindex_t au_dbdiropq(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bdiropq; ++} ++ ++/* todo: hard/soft set? 
*/ ++static inline void au_set_dbstart(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ au_di(dentry)->di_bstart = bindex; ++} ++ ++static inline void au_set_dbend(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ au_di(dentry)->di_bend = bindex; ++} ++ ++static inline void au_set_dbwh(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ /* dbwh can be outside of bstart - bend range */ ++ au_di(dentry)->di_bwh = bindex; ++} ++ ++static inline void au_set_dbdiropq(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ au_di(dentry)->di_bdiropq = bindex; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_HNOTIFY ++static inline void au_digen_dec(struct dentry *d) ++{ ++ atomic_dec(&au_di(d)->di_generation); ++} ++ ++static inline void au_hn_di_reinit(struct dentry *dentry) ++{ ++ dentry->d_fsdata = NULL; ++} ++#else ++AuStubVoid(au_hn_di_reinit, struct dentry *dentry __maybe_unused) ++#endif /* CONFIG_AUFS_HNOTIFY */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DENTRY_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/dinfo.c linux-3.2.0-gentoo-r1/fs/aufs/dinfo.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/dinfo.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/dinfo.c 2012-01-17 12:11:24.576506644 +0100 +@@ -0,0 +1,543 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * dentry private data ++ */ ++ ++#include "aufs.h" ++ ++void au_di_init_once(void *_dinfo) ++{ ++ struct au_dinfo *dinfo = _dinfo; ++ static struct lock_class_key aufs_di; ++ ++ au_rw_init(&dinfo->di_rwsem); ++ au_rw_class(&dinfo->di_rwsem, &aufs_di); ++} ++ ++struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc) ++{ ++ struct au_dinfo *dinfo; ++ int nbr, i; ++ ++ dinfo = au_cache_alloc_dinfo(); ++ if (unlikely(!dinfo)) ++ goto out; ++ ++ nbr = au_sbend(sb) + 1; ++ if (nbr <= 0) ++ nbr = 1; ++ dinfo->di_hdentry = kcalloc(nbr, sizeof(*dinfo->di_hdentry), GFP_NOFS); ++ if (dinfo->di_hdentry) { ++ au_rw_write_lock_nested(&dinfo->di_rwsem, lsc); ++ dinfo->di_bstart = -1; ++ dinfo->di_bend = -1; ++ dinfo->di_bwh = -1; ++ dinfo->di_bdiropq = -1; ++ for (i = 0; i < nbr; i++) ++ dinfo->di_hdentry[i].hd_id = -1; ++ goto out; ++ } ++ ++ au_cache_free_dinfo(dinfo); ++ dinfo = NULL; ++ ++out: ++ return dinfo; ++} ++ ++void au_di_free(struct au_dinfo *dinfo) ++{ ++ struct au_hdentry *p; ++ aufs_bindex_t bend, bindex; ++ ++ /* dentry may not be revalidated */ ++ bindex = dinfo->di_bstart; ++ if (bindex >= 0) { ++ bend = dinfo->di_bend; ++ p = dinfo->di_hdentry + bindex; ++ while (bindex++ <= bend) ++ au_hdput(p++); ++ } ++ kfree(dinfo->di_hdentry); ++ au_cache_free_dinfo(dinfo); ++} ++ ++void au_di_swap(struct au_dinfo *a, struct au_dinfo *b) ++{ ++ struct au_hdentry *p; ++ aufs_bindex_t bi; ++ ++ AuRwMustWriteLock(&a->di_rwsem); ++ AuRwMustWriteLock(&b->di_rwsem); ++ ++#define DiSwap(v, name) \ ++ do { \ ++ v = a->di_##name; \ ++ a->di_##name = b->di_##name; \ ++ b->di_##name = v; \ ++ } while (0) ++ ++ DiSwap(p, hdentry); ++ DiSwap(bi, bstart); ++ DiSwap(bi, bend); ++ DiSwap(bi, bwh); ++ DiSwap(bi, bdiropq); ++ /* smp_mb(); */ ++ ++#undef DiSwap ++} ++ ++void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src) ++{ ++ AuRwMustWriteLock(&dst->di_rwsem); ++ AuRwMustWriteLock(&src->di_rwsem); ++ ++ dst->di_bstart = src->di_bstart; ++ dst->di_bend = src->di_bend; ++ dst->di_bwh = src->di_bwh; ++ dst->di_bdiropq = src->di_bdiropq; ++ /* smp_mb(); */ ++} ++ ++int au_di_init(struct dentry *dentry) ++{ ++ int err; ++ struct super_block *sb; ++ struct au_dinfo *dinfo; ++ ++ err = 0; ++ sb = dentry->d_sb; ++ dinfo = au_di_alloc(sb, AuLsc_DI_CHILD); ++ if (dinfo) { ++ atomic_set(&dinfo->di_generation, au_sigen(sb)); ++ /* smp_mb(); */ /* atomic_set */ ++ dentry->d_fsdata = dinfo; ++ } else ++ err = -ENOMEM; ++ ++ return err; ++} ++ ++void au_di_fin(struct dentry *dentry) ++{ ++ struct au_dinfo *dinfo; ++ ++ dinfo = au_di(dentry); ++ AuRwDestroy(&dinfo->di_rwsem); ++ au_di_free(dinfo); ++} ++ ++int au_di_realloc(struct au_dinfo *dinfo, int nbr) ++{ ++ int err, sz; ++ struct au_hdentry *hdp; ++ ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ err = -ENOMEM; ++ sz = sizeof(*hdp) * (dinfo->di_bend + 1); ++ if (!sz) ++ sz = sizeof(*hdp); ++ hdp = au_kzrealloc(dinfo->di_hdentry, sz, sizeof(*hdp) * nbr, GFP_NOFS); ++ if (hdp) { ++ dinfo->di_hdentry = hdp; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void do_ii_write_lock(struct inode *inode, unsigned int lsc) ++{ ++ switch (lsc) { ++ case AuLsc_DI_CHILD: ++ ii_write_lock_child(inode); ++ break; ++ case AuLsc_DI_CHILD2: ++ 
ii_write_lock_child2(inode); ++ break; ++ case AuLsc_DI_CHILD3: ++ ii_write_lock_child3(inode); ++ break; ++ case AuLsc_DI_PARENT: ++ ii_write_lock_parent(inode); ++ break; ++ case AuLsc_DI_PARENT2: ++ ii_write_lock_parent2(inode); ++ break; ++ case AuLsc_DI_PARENT3: ++ ii_write_lock_parent3(inode); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static void do_ii_read_lock(struct inode *inode, unsigned int lsc) ++{ ++ switch (lsc) { ++ case AuLsc_DI_CHILD: ++ ii_read_lock_child(inode); ++ break; ++ case AuLsc_DI_CHILD2: ++ ii_read_lock_child2(inode); ++ break; ++ case AuLsc_DI_CHILD3: ++ ii_read_lock_child3(inode); ++ break; ++ case AuLsc_DI_PARENT: ++ ii_read_lock_parent(inode); ++ break; ++ case AuLsc_DI_PARENT2: ++ ii_read_lock_parent2(inode); ++ break; ++ case AuLsc_DI_PARENT3: ++ ii_read_lock_parent3(inode); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++void di_read_lock(struct dentry *d, int flags, unsigned int lsc) ++{ ++ au_rw_read_lock_nested(&au_di(d)->di_rwsem, lsc); ++ if (d->d_inode) { ++ if (au_ftest_lock(flags, IW)) ++ do_ii_write_lock(d->d_inode, lsc); ++ else if (au_ftest_lock(flags, IR)) ++ do_ii_read_lock(d->d_inode, lsc); ++ } ++} ++ ++void di_read_unlock(struct dentry *d, int flags) ++{ ++ if (d->d_inode) { ++ if (au_ftest_lock(flags, IW)) { ++ au_dbg_verify_dinode(d); ++ ii_write_unlock(d->d_inode); ++ } else if (au_ftest_lock(flags, IR)) { ++ au_dbg_verify_dinode(d); ++ ii_read_unlock(d->d_inode); ++ } ++ } ++ au_rw_read_unlock(&au_di(d)->di_rwsem); ++} ++ ++void di_downgrade_lock(struct dentry *d, int flags) ++{ ++ if (d->d_inode && au_ftest_lock(flags, IR)) ++ ii_downgrade_lock(d->d_inode); ++ au_rw_dgrade_lock(&au_di(d)->di_rwsem); ++} ++ ++void di_write_lock(struct dentry *d, unsigned int lsc) ++{ ++ au_rw_write_lock_nested(&au_di(d)->di_rwsem, lsc); ++ if (d->d_inode) ++ do_ii_write_lock(d->d_inode, lsc); ++} ++ ++void di_write_unlock(struct dentry *d) ++{ ++ au_dbg_verify_dinode(d); ++ if (d->d_inode) ++ ii_write_unlock(d->d_inode); ++ au_rw_write_unlock(&au_di(d)->di_rwsem); ++} ++ ++void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir) ++{ ++ AuDebugOn(d1 == d2 ++ || d1->d_inode == d2->d_inode ++ || d1->d_sb != d2->d_sb); ++ ++ if (isdir && au_test_subdir(d1, d2)) { ++ di_write_lock_child(d1); ++ di_write_lock_child2(d2); ++ } else { ++ /* there should be no races */ ++ di_write_lock_child(d2); ++ di_write_lock_child2(d1); ++ } ++} ++ ++void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir) ++{ ++ AuDebugOn(d1 == d2 ++ || d1->d_inode == d2->d_inode ++ || d1->d_sb != d2->d_sb); ++ ++ if (isdir && au_test_subdir(d1, d2)) { ++ di_write_lock_parent(d1); ++ di_write_lock_parent2(d2); ++ } else { ++ /* there should be no races */ ++ di_write_lock_parent(d2); ++ di_write_lock_parent2(d1); ++ } ++} ++ ++void di_write_unlock2(struct dentry *d1, struct dentry *d2) ++{ ++ di_write_unlock(d1); ++ if (d1->d_inode == d2->d_inode) ++ au_rw_write_unlock(&au_di(d2)->di_rwsem); ++ else ++ di_write_unlock(d2); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ struct dentry *d; ++ ++ DiMustAnyLock(dentry); ++ ++ if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry)) ++ return NULL; ++ AuDebugOn(bindex < 0); ++ d = au_di(dentry)->di_hdentry[0 + bindex].hd_dentry; ++ AuDebugOn(d && d->d_count <= 0); ++ return d; ++} ++ ++/* ++ * extended version of au_h_dptr(). 
++ * returns a hashed and positive h_dentry in bindex, NULL, or error. ++ */ ++struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ struct dentry *h_dentry; ++ struct inode *inode, *h_inode; ++ ++ inode = dentry->d_inode; ++ AuDebugOn(!inode); ++ ++ h_dentry = NULL; ++ if (au_dbstart(dentry) <= bindex ++ && bindex <= au_dbend(dentry)) ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && !au_d_hashed_positive(h_dentry)) { ++ dget(h_dentry); ++ goto out; /* success */ ++ } ++ ++ AuDebugOn(bindex < au_ibstart(inode)); ++ AuDebugOn(au_ibend(inode) < bindex); ++ h_inode = au_h_iptr(inode, bindex); ++ h_dentry = d_find_alias(h_inode); ++ if (h_dentry) { ++ if (!IS_ERR(h_dentry)) { ++ if (!au_d_hashed_positive(h_dentry)) ++ goto out; /* success */ ++ dput(h_dentry); ++ } else ++ goto out; ++ } ++ ++ if (au_opt_test(au_mntflags(dentry->d_sb), PLINK)) { ++ h_dentry = au_plink_lkup(inode, bindex); ++ AuDebugOn(!h_dentry); ++ if (!IS_ERR(h_dentry)) { ++ if (!au_d_hashed_positive(h_dentry)) ++ goto out; /* success */ ++ dput(h_dentry); ++ h_dentry = NULL; ++ } ++ } ++ ++out: ++ AuDbgDentry(h_dentry); ++ return h_dentry; ++} ++ ++aufs_bindex_t au_dbtail(struct dentry *dentry) ++{ ++ aufs_bindex_t bend, bwh; ++ ++ bend = au_dbend(dentry); ++ if (0 <= bend) { ++ bwh = au_dbwh(dentry); ++ if (!bwh) ++ return bwh; ++ if (0 < bwh && bwh < bend) ++ return bwh - 1; ++ } ++ return bend; ++} ++ ++aufs_bindex_t au_dbtaildir(struct dentry *dentry) ++{ ++ aufs_bindex_t bend, bopq; ++ ++ bend = au_dbtail(dentry); ++ if (0 <= bend) { ++ bopq = au_dbdiropq(dentry); ++ if (0 <= bopq && bopq < bend) ++ bend = bopq; ++ } ++ return bend; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_dentry) ++{ ++ struct au_hdentry *hd = au_di(dentry)->di_hdentry + bindex; ++ struct au_branch *br; ++ ++ DiMustWriteLock(dentry); ++ ++ au_hdput(hd); ++ hd->hd_dentry = h_dentry; ++ if (h_dentry) { ++ br = au_sbr(dentry->d_sb, bindex); ++ hd->hd_id = br->br_id; ++ } ++} ++ ++int au_dbrange_test(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bstart, bend; ++ ++ err = 0; ++ bstart = au_dbstart(dentry); ++ bend = au_dbend(dentry); ++ if (bstart >= 0) ++ AuDebugOn(bend < 0 && bstart > bend); ++ else { ++ err = -EIO; ++ AuDebugOn(bend >= 0); ++ } ++ ++ return err; ++} ++ ++int au_digen_test(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ ++ err = 0; ++ if (unlikely(au_digen(dentry) != sigen ++ || au_iigen_test(dentry->d_inode, sigen))) ++ err = -EIO; ++ ++ return err; ++} ++ ++void au_update_digen(struct dentry *dentry) ++{ ++ atomic_set(&au_di(dentry)->di_generation, au_sigen(dentry->d_sb)); ++ /* smp_mb(); */ /* atomic_set */ ++} ++ ++void au_update_dbrange(struct dentry *dentry, int do_put_zero) ++{ ++ struct au_dinfo *dinfo; ++ struct dentry *h_d; ++ struct au_hdentry *hdp; ++ ++ DiMustWriteLock(dentry); ++ ++ dinfo = au_di(dentry); ++ if (!dinfo || dinfo->di_bstart < 0) ++ return; ++ ++ hdp = dinfo->di_hdentry; ++ if (do_put_zero) { ++ aufs_bindex_t bindex, bend; ++ ++ bend = dinfo->di_bend; ++ for (bindex = dinfo->di_bstart; bindex <= bend; bindex++) { ++ h_d = hdp[0 + bindex].hd_dentry; ++ if (h_d && !h_d->d_inode) ++ au_set_h_dptr(dentry, bindex, NULL); ++ } ++ } ++ ++ dinfo->di_bstart = -1; ++ while (++dinfo->di_bstart <= dinfo->di_bend) ++ if (hdp[0 + dinfo->di_bstart].hd_dentry) ++ break; ++ if (dinfo->di_bstart > dinfo->di_bend) { ++ dinfo->di_bstart = 
-1; ++ dinfo->di_bend = -1; ++ return; ++ } ++ ++ dinfo->di_bend++; ++ while (0 <= --dinfo->di_bend) ++ if (hdp[0 + dinfo->di_bend].hd_dentry) ++ break; ++ AuDebugOn(dinfo->di_bstart > dinfo->di_bend || dinfo->di_bend < 0); ++} ++ ++void au_update_dbstart(struct dentry *dentry) ++{ ++ aufs_bindex_t bindex, bend; ++ struct dentry *h_dentry; ++ ++ bend = au_dbend(dentry); ++ for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ if (h_dentry->d_inode) { ++ au_set_dbstart(dentry, bindex); ++ return; ++ } ++ au_set_h_dptr(dentry, bindex, NULL); ++ } ++} ++ ++void au_update_dbend(struct dentry *dentry) ++{ ++ aufs_bindex_t bindex, bstart; ++ struct dentry *h_dentry; ++ ++ bstart = au_dbstart(dentry); ++ for (bindex = au_dbend(dentry); bindex >= bstart; bindex--) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ if (h_dentry->d_inode) { ++ au_set_dbend(dentry, bindex); ++ return; ++ } ++ au_set_h_dptr(dentry, bindex, NULL); ++ } ++} ++ ++int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry) ++{ ++ aufs_bindex_t bindex, bend; ++ ++ bend = au_dbend(dentry); ++ for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) ++ if (au_h_dptr(dentry, bindex) == h_dentry) ++ return bindex; ++ return -1; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/dir.c linux-3.2.0-gentoo-r1/fs/aufs/dir.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/dir.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/dir.c 2012-01-17 12:11:24.583451150 +0100 +@@ -0,0 +1,634 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * directory operations ++ */ ++ ++#include ++#include "aufs.h" ++ ++void au_add_nlink(struct inode *dir, struct inode *h_dir) ++{ ++ unsigned int nlink; ++ ++ AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); ++ ++ nlink = dir->i_nlink; ++ nlink += h_dir->i_nlink - 2; ++ if (h_dir->i_nlink < 2) ++ nlink += 2; ++ set_nlink(dir, nlink); ++} ++ ++void au_sub_nlink(struct inode *dir, struct inode *h_dir) ++{ ++ unsigned int nlink; ++ ++ AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); ++ ++ nlink = dir->i_nlink; ++ nlink -= h_dir->i_nlink - 2; ++ if (h_dir->i_nlink < 2) ++ nlink -= 2; ++ set_nlink(dir, nlink); ++} ++ ++loff_t au_dir_size(struct file *file, struct dentry *dentry) ++{ ++ loff_t sz; ++ aufs_bindex_t bindex, bend; ++ struct file *h_file; ++ struct dentry *h_dentry; ++ ++ sz = 0; ++ if (file) { ++ AuDebugOn(!file->f_dentry); ++ AuDebugOn(!file->f_dentry->d_inode); ++ AuDebugOn(!S_ISDIR(file->f_dentry->d_inode->i_mode)); ++ ++ bend = au_fbend_dir(file); ++ for (bindex = au_fbstart(file); ++ bindex <= bend && sz < KMALLOC_MAX_SIZE; ++ bindex++) { ++ h_file = au_hf_dir(file, bindex); ++ if (h_file ++ && h_file->f_dentry ++ && h_file->f_dentry->d_inode) ++ sz += i_size_read(h_file->f_dentry->d_inode); ++ } ++ } else { ++ AuDebugOn(!dentry); ++ AuDebugOn(!dentry->d_inode); ++ AuDebugOn(!S_ISDIR(dentry->d_inode->i_mode)); ++ ++ bend = au_dbtaildir(dentry); ++ for (bindex = au_dbstart(dentry); ++ bindex <= bend && sz < KMALLOC_MAX_SIZE; ++ bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && h_dentry->d_inode) ++ sz += i_size_read(h_dentry->d_inode); ++ } ++ } ++ if (sz < KMALLOC_MAX_SIZE) ++ sz = roundup_pow_of_two(sz); ++ if (sz > KMALLOC_MAX_SIZE) ++ sz = KMALLOC_MAX_SIZE; ++ else if (sz < NAME_MAX) { ++ BUILD_BUG_ON(AUFS_RDBLK_DEF < NAME_MAX); ++ sz = AUFS_RDBLK_DEF; ++ } ++ return sz; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int reopen_dir(struct file *file) ++{ ++ int err; ++ unsigned int flags; ++ aufs_bindex_t bindex, btail, bstart; ++ struct dentry *dentry, *h_dentry; ++ struct file *h_file; ++ ++ /* open all lower dirs */ ++ dentry = file->f_dentry; ++ bstart = au_dbstart(dentry); ++ for (bindex = au_fbstart(file); bindex < bstart; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ au_set_fbstart(file, bstart); ++ ++ btail = au_dbtaildir(dentry); ++ for (bindex = au_fbend_dir(file); btail < bindex; bindex--) ++ au_set_h_fptr(file, bindex, NULL); ++ au_set_fbend_dir(file, btail); ++ ++ flags = vfsub_file_flags(file); ++ for (bindex = bstart; bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ h_file = au_hf_dir(file, bindex); ++ if (h_file) ++ continue; ++ ++ h_file = au_h_open(dentry, bindex, flags, file); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; /* close all? */ ++ au_set_h_fptr(file, bindex, h_file); ++ } ++ au_update_figen(file); ++ /* todo: necessary? 
*/ ++ /* file->f_ra = h_file->f_ra; */ ++ err = 0; ++ ++out: ++ return err; ++} ++ ++static int do_open_dir(struct file *file, int flags) ++{ ++ int err; ++ aufs_bindex_t bindex, btail; ++ struct dentry *dentry, *h_dentry; ++ struct file *h_file; ++ ++ FiMustWriteLock(file); ++ ++ dentry = file->f_dentry; ++ err = au_alive_dir(dentry); ++ if (unlikely(err)) ++ goto out; ++ ++ file->f_version = dentry->d_inode->i_version; ++ bindex = au_dbstart(dentry); ++ au_set_fbstart(file, bindex); ++ btail = au_dbtaildir(dentry); ++ au_set_fbend_dir(file, btail); ++ for (; !err && bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ ++ h_file = au_h_open(dentry, bindex, flags, file); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ break; ++ } ++ au_set_h_fptr(file, bindex, h_file); ++ } ++ au_update_figen(file); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ if (!err) ++ return 0; /* success */ ++ ++ /* close all */ ++ for (bindex = au_fbstart(file); bindex <= btail; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ au_set_fbstart(file, -1); ++ au_set_fbend_dir(file, -1); ++ ++out: ++ return err; ++} ++ ++static int aufs_open_dir(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ int err; ++ struct super_block *sb; ++ struct au_fidir *fidir; ++ ++ err = -ENOMEM; ++ sb = file->f_dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ fidir = au_fidir_alloc(sb); ++ if (fidir) { ++ err = au_do_open(file, do_open_dir, fidir); ++ if (unlikely(err)) ++ kfree(fidir); ++ } ++ si_read_unlock(sb); ++ return err; ++} ++ ++static int aufs_release_dir(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ struct au_vdir *vdir_cache; ++ struct au_finfo *finfo; ++ struct au_fidir *fidir; ++ aufs_bindex_t bindex, bend; ++ ++ finfo = au_fi(file); ++ fidir = finfo->fi_hdir; ++ if (fidir) { ++ /* remove me from sb->s_files */ ++ file_sb_list_del(file); ++ ++ vdir_cache = fidir->fd_vdir_cache; /* lock-free */ ++ if (vdir_cache) ++ au_vdir_free(vdir_cache); ++ ++ bindex = finfo->fi_btop; ++ if (bindex >= 0) { ++ /* ++ * calls fput() instead of filp_close(), ++ * since no dnotify or lock for the lower file. 
++ */ ++ bend = fidir->fd_bbot; ++ for (; bindex <= bend; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ } ++ kfree(fidir); ++ finfo->fi_hdir = NULL; ++ } ++ au_finfo_fin(file); ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_flush_dir(struct file *file, fl_owner_t id) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ struct file *h_file; ++ ++ err = 0; ++ bend = au_fbend_dir(file); ++ for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { ++ h_file = au_hf_dir(file, bindex); ++ if (h_file) ++ err = vfsub_flush(h_file, id); ++ } ++ return err; ++} ++ ++static int aufs_flush_dir(struct file *file, fl_owner_t id) ++{ ++ return au_do_flush(file, id, au_do_flush_dir); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_fsync_dir_no_file(struct dentry *dentry, int datasync) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct inode *inode; ++ struct super_block *sb; ++ ++ err = 0; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ IMustLock(inode); ++ bend = au_dbend(dentry); ++ for (bindex = au_dbstart(dentry); !err && bindex <= bend; bindex++) { ++ struct path h_path; ++ ++ if (au_test_ro(sb, bindex, inode)) ++ continue; ++ h_path.dentry = au_h_dptr(dentry, bindex); ++ if (!h_path.dentry) ++ continue; ++ ++ h_path.mnt = au_sbr_mnt(sb, bindex); ++ err = vfsub_fsync(NULL, &h_path, datasync); ++ } ++ ++ return err; ++} ++ ++static int au_do_fsync_dir(struct file *file, int datasync) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct file *h_file; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ sb = file->f_dentry->d_sb; ++ inode = file->f_dentry->d_inode; ++ bend = au_fbend_dir(file); ++ for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { ++ h_file = au_hf_dir(file, bindex); ++ if (!h_file || au_test_ro(sb, bindex, inode)) ++ continue; ++ ++ err = vfsub_fsync(h_file, &h_file->f_path, datasync); ++ } ++ ++out: ++ return err; ++} ++ ++/* ++ * @file may be NULL ++ */ ++static int aufs_fsync_dir(struct file *file, loff_t start, loff_t end, ++ int datasync) ++{ ++ int err; ++ struct dentry *dentry; ++ struct super_block *sb; ++ struct mutex *mtx; ++ ++ err = 0; ++ dentry = file->f_dentry; ++ mtx = &dentry->d_inode->i_mutex; ++ mutex_lock(mtx); ++ sb = dentry->d_sb; ++ si_noflush_read_lock(sb); ++ if (file) ++ err = au_do_fsync_dir(file, datasync); ++ else { ++ di_write_lock_child(dentry); ++ err = au_do_fsync_dir_no_file(dentry, datasync); ++ } ++ au_cpup_attr_timesizes(dentry->d_inode); ++ di_write_unlock(dentry); ++ if (file) ++ fi_write_unlock(file); ++ ++ si_read_unlock(sb); ++ mutex_unlock(mtx); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_readdir(struct file *file, void *dirent, filldir_t filldir) ++{ ++ int err; ++ struct dentry *dentry; ++ struct inode *inode, *h_inode; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++ IMustLock(inode); ++ ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ err = au_alive_dir(dentry); ++ if (!err) ++ err = au_vdir_init(file); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ if 
(!au_test_nfsd()) { ++ err = au_vdir_fill_de(file, dirent, filldir); ++ fsstack_copy_attr_atime(inode, h_inode); ++ } else { ++ /* ++ * nfsd filldir may call lookup_one_len(), vfs_getattr(), ++ * encode_fh() and others. ++ */ ++ atomic_inc(&h_inode->i_count); ++ di_read_unlock(dentry, AuLock_IR); ++ si_read_unlock(sb); ++ err = au_vdir_fill_de(file, dirent, filldir); ++ fsstack_copy_attr_atime(inode, h_inode); ++ fi_write_unlock(file); ++ iput(h_inode); ++ ++ AuTraceErr(err); ++ return err; ++ } ++ ++out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AuTestEmpty_WHONLY 1 ++#define AuTestEmpty_CALLED (1 << 1) ++#define AuTestEmpty_SHWH (1 << 2) ++#define au_ftest_testempty(flags, name) ((flags) & AuTestEmpty_##name) ++#define au_fset_testempty(flags, name) \ ++ do { (flags) |= AuTestEmpty_##name; } while (0) ++#define au_fclr_testempty(flags, name) \ ++ do { (flags) &= ~AuTestEmpty_##name; } while (0) ++ ++#ifndef CONFIG_AUFS_SHWH ++#undef AuTestEmpty_SHWH ++#define AuTestEmpty_SHWH 0 ++#endif ++ ++struct test_empty_arg { ++ struct au_nhash *whlist; ++ unsigned int flags; ++ int err; ++ aufs_bindex_t bindex; ++}; ++ ++static int test_empty_cb(void *__arg, const char *__name, int namelen, ++ loff_t offset __maybe_unused, u64 ino, ++ unsigned int d_type) ++{ ++ struct test_empty_arg *arg = __arg; ++ char *name = (void *)__name; ++ ++ arg->err = 0; ++ au_fset_testempty(arg->flags, CALLED); ++ /* smp_mb(); */ ++ if (name[0] == '.' ++ && (namelen == 1 || (name[1] == '.' && namelen == 2))) ++ goto out; /* success */ ++ ++ if (namelen <= AUFS_WH_PFX_LEN ++ || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { ++ if (au_ftest_testempty(arg->flags, WHONLY) ++ && !au_nhash_test_known_wh(arg->whlist, name, namelen)) ++ arg->err = -ENOTEMPTY; ++ goto out; ++ } ++ ++ name += AUFS_WH_PFX_LEN; ++ namelen -= AUFS_WH_PFX_LEN; ++ if (!au_nhash_test_known_wh(arg->whlist, name, namelen)) ++ arg->err = au_nhash_append_wh ++ (arg->whlist, name, namelen, ino, d_type, arg->bindex, ++ au_ftest_testempty(arg->flags, SHWH)); ++ ++out: ++ /* smp_mb(); */ ++ AuTraceErr(arg->err); ++ return arg->err; ++} ++ ++static int do_test_empty(struct dentry *dentry, struct test_empty_arg *arg) ++{ ++ int err; ++ struct file *h_file; ++ ++ h_file = au_h_open(dentry, arg->bindex, ++ O_RDONLY | O_NONBLOCK | O_DIRECTORY | O_LARGEFILE, ++ /*file*/NULL); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ err = 0; ++ if (!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE) ++ && !h_file->f_dentry->d_inode->i_nlink) ++ goto out_put; ++ ++ do { ++ arg->err = 0; ++ au_fclr_testempty(arg->flags, CALLED); ++ /* smp_mb(); */ ++ err = vfsub_readdir(h_file, test_empty_cb, arg); ++ if (err >= 0) ++ err = arg->err; ++ } while (!err && au_ftest_testempty(arg->flags, CALLED)); ++ ++out_put: ++ fput(h_file); ++ au_sbr_put(dentry->d_sb, arg->bindex); ++out: ++ return err; ++} ++ ++struct do_test_empty_args { ++ int *errp; ++ struct dentry *dentry; ++ struct test_empty_arg *arg; ++}; ++ ++static void call_do_test_empty(void *args) ++{ ++ struct do_test_empty_args *a = args; ++ *a->errp = do_test_empty(a->dentry, a->arg); ++} ++ ++static int sio_test_empty(struct dentry *dentry, struct test_empty_arg *arg) ++{ ++ int err, wkq_err; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ ++ h_dentry = au_h_dptr(dentry, arg->bindex); ++ h_inode = h_dentry->d_inode; ++ /* todo: 
i_mode changes anytime? */ ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ err = au_test_h_perm_sio(h_inode, MAY_EXEC | MAY_READ); ++ mutex_unlock(&h_inode->i_mutex); ++ if (!err) ++ err = do_test_empty(dentry, arg); ++ else { ++ struct do_test_empty_args args = { ++ .errp = &err, ++ .dentry = dentry, ++ .arg = arg ++ }; ++ unsigned int flags = arg->flags; ++ ++ wkq_err = au_wkq_wait(call_do_test_empty, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ arg->flags = flags; ++ } ++ ++ return err; ++} ++ ++int au_test_empty_lower(struct dentry *dentry) ++{ ++ int err; ++ unsigned int rdhash; ++ aufs_bindex_t bindex, bstart, btail; ++ struct au_nhash whlist; ++ struct test_empty_arg arg; ++ ++ SiMustAnyLock(dentry->d_sb); ++ ++ rdhash = au_sbi(dentry->d_sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, dentry)); ++ err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ ++ arg.flags = 0; ++ arg.whlist = &whlist; ++ bstart = au_dbstart(dentry); ++ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) ++ au_fset_testempty(arg.flags, SHWH); ++ arg.bindex = bstart; ++ err = do_test_empty(dentry, &arg); ++ if (unlikely(err)) ++ goto out_whlist; ++ ++ au_fset_testempty(arg.flags, WHONLY); ++ btail = au_dbtaildir(dentry); ++ for (bindex = bstart + 1; !err && bindex <= btail; bindex++) { ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && h_dentry->d_inode) { ++ arg.bindex = bindex; ++ err = do_test_empty(dentry, &arg); ++ } ++ } ++ ++out_whlist: ++ au_nhash_wh_free(&whlist); ++out: ++ return err; ++} ++ ++int au_test_empty(struct dentry *dentry, struct au_nhash *whlist) ++{ ++ int err; ++ struct test_empty_arg arg; ++ aufs_bindex_t bindex, btail; ++ ++ err = 0; ++ arg.whlist = whlist; ++ arg.flags = AuTestEmpty_WHONLY; ++ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) ++ au_fset_testempty(arg.flags, SHWH); ++ btail = au_dbtaildir(dentry); ++ for (bindex = au_dbstart(dentry); !err && bindex <= btail; bindex++) { ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && h_dentry->d_inode) { ++ arg.bindex = bindex; ++ err = sio_test_empty(dentry, &arg); ++ } ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++const struct file_operations aufs_dir_fop = { ++ .owner = THIS_MODULE, ++ .llseek = default_llseek, ++ .read = generic_read_dir, ++ .readdir = aufs_readdir, ++ .unlocked_ioctl = aufs_ioctl_dir, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = aufs_compat_ioctl_dir, ++#endif ++ .open = aufs_open_dir, ++ .release = aufs_release_dir, ++ .flush = aufs_flush_dir, ++ .fsync = aufs_fsync_dir ++}; +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/dir.h linux-3.2.0-gentoo-r1/fs/aufs/dir.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/dir.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/dir.h 2012-01-17 12:11:24.604284671 +0100 +@@ -0,0 +1,137 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * directory operations ++ */ ++ ++#ifndef __AUFS_DIR_H__ ++#define __AUFS_DIR_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* need to be faster and smaller */ ++ ++struct au_nhash { ++ unsigned int nh_num; ++ struct hlist_head *nh_head; ++}; ++ ++struct au_vdir_destr { ++ unsigned char len; ++ unsigned char name[0]; ++} __packed; ++ ++struct au_vdir_dehstr { ++ struct hlist_node hash; ++ struct au_vdir_destr *str; ++} ____cacheline_aligned_in_smp; ++ ++struct au_vdir_de { ++ ino_t de_ino; ++ unsigned char de_type; ++ /* caution: packed */ ++ struct au_vdir_destr de_str; ++} __packed; ++ ++struct au_vdir_wh { ++ struct hlist_node wh_hash; ++#ifdef CONFIG_AUFS_SHWH ++ ino_t wh_ino; ++ aufs_bindex_t wh_bindex; ++ unsigned char wh_type; ++#else ++ aufs_bindex_t wh_bindex; ++#endif ++ /* caution: packed */ ++ struct au_vdir_destr wh_str; ++} __packed; ++ ++union au_vdir_deblk_p { ++ unsigned char *deblk; ++ struct au_vdir_de *de; ++}; ++ ++struct au_vdir { ++ unsigned char **vd_deblk; ++ unsigned long vd_nblk; ++ struct { ++ unsigned long ul; ++ union au_vdir_deblk_p p; ++ } vd_last; ++ ++ unsigned long vd_version; ++ unsigned int vd_deblk_sz; ++ unsigned long vd_jiffy; ++} ____cacheline_aligned_in_smp; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dir.c */ ++extern const struct file_operations aufs_dir_fop; ++void au_add_nlink(struct inode *dir, struct inode *h_dir); ++void au_sub_nlink(struct inode *dir, struct inode *h_dir); ++loff_t au_dir_size(struct file *file, struct dentry *dentry); ++int au_test_empty_lower(struct dentry *dentry); ++int au_test_empty(struct dentry *dentry, struct au_nhash *whlist); ++ ++/* vdir.c */ ++unsigned int au_rdhash_est(loff_t sz); ++int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp); ++void au_nhash_wh_free(struct au_nhash *whlist); ++int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, ++ int limit); ++int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen); ++int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, ++ unsigned int d_type, aufs_bindex_t bindex, ++ unsigned char shwh); ++void au_vdir_free(struct au_vdir *vdir); ++int au_vdir_init(struct file *file); ++int au_vdir_fill_de(struct file *file, void *dirent, filldir_t filldir); ++ ++/* ioctl.c */ ++long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg); ++ ++#ifdef CONFIG_AUFS_RDU ++/* rdu.c */ ++long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg); ++#ifdef CONFIG_COMPAT ++long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg); ++#endif ++#else ++static inline long au_rdu_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ return -EINVAL; ++} ++#ifdef CONFIG_COMPAT ++static inline long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ return -EINVAL; ++} ++#endif ++#endif ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DIR_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/dynop.c linux-3.2.0-gentoo-r1/fs/aufs/dynop.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/dynop.c 
1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/dynop.c 2012-01-17 12:11:24.613544015 +0100 +@@ -0,0 +1,377 @@ ++/* ++ * Copyright (C) 2010-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * dynamically customizable operations for regular files ++ */ ++ ++#include "aufs.h" ++ ++#define DyPrSym(key) AuDbgSym(key->dk_op.dy_hop) ++ ++/* ++ * How large will these lists be? ++ * Usually just a few elements, 20-30 at most for each, I guess. ++ */ ++static struct au_splhead dynop[AuDyLast]; ++ ++static struct au_dykey *dy_gfind_get(struct au_splhead *spl, const void *h_op) ++{ ++ struct au_dykey *key, *tmp; ++ struct list_head *head; ++ ++ key = NULL; ++ head = &spl->head; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(tmp, head, dk_list) ++ if (tmp->dk_op.dy_hop == h_op) { ++ key = tmp; ++ kref_get(&key->dk_kref); ++ break; ++ } ++ rcu_read_unlock(); ++ ++ return key; ++} ++ ++static struct au_dykey *dy_bradd(struct au_branch *br, struct au_dykey *key) ++{ ++ struct au_dykey **k, *found; ++ const void *h_op = key->dk_op.dy_hop; ++ int i; ++ ++ found = NULL; ++ k = br->br_dykey; ++ for (i = 0; i < AuBrDynOp; i++) ++ if (k[i]) { ++ if (k[i]->dk_op.dy_hop == h_op) { ++ found = k[i]; ++ break; ++ } ++ } else ++ break; ++ if (!found) { ++ spin_lock(&br->br_dykey_lock); ++ for (; i < AuBrDynOp; i++) ++ if (k[i]) { ++ if (k[i]->dk_op.dy_hop == h_op) { ++ found = k[i]; ++ break; ++ } ++ } else { ++ k[i] = key; ++ break; ++ } ++ spin_unlock(&br->br_dykey_lock); ++ BUG_ON(i == AuBrDynOp); /* expand the array */ ++ } ++ ++ return found; ++} ++ ++/* kref_get() if @key is already added */ ++static struct au_dykey *dy_gadd(struct au_splhead *spl, struct au_dykey *key) ++{ ++ struct au_dykey *tmp, *found; ++ struct list_head *head; ++ const void *h_op = key->dk_op.dy_hop; ++ ++ found = NULL; ++ head = &spl->head; ++ spin_lock(&spl->spin); ++ list_for_each_entry(tmp, head, dk_list) ++ if (tmp->dk_op.dy_hop == h_op) { ++ kref_get(&tmp->dk_kref); ++ found = tmp; ++ break; ++ } ++ if (!found) ++ list_add_rcu(&key->dk_list, head); ++ spin_unlock(&spl->spin); ++ ++ if (!found) ++ DyPrSym(key); ++ return found; ++} ++ ++static void dy_free_rcu(struct rcu_head *rcu) ++{ ++ struct au_dykey *key; ++ ++ key = container_of(rcu, struct au_dykey, dk_rcu); ++ DyPrSym(key); ++ kfree(key); ++} ++ ++static void dy_free(struct kref *kref) ++{ ++ struct au_dykey *key; ++ struct au_splhead *spl; ++ ++ key = container_of(kref, struct au_dykey, dk_kref); ++ spl = dynop + key->dk_op.dy_type; ++ au_spl_del_rcu(&key->dk_list, spl); ++ call_rcu(&key->dk_rcu, dy_free_rcu); ++} ++ ++void au_dy_put(struct au_dykey *key) ++{ ++ kref_put(&key->dk_kref, dy_free); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define DyDbgSize(cnt, op) AuDebugOn(cnt 
!= sizeof(op)/sizeof(void *))
++
++#ifdef CONFIG_AUFS_DEBUG
++#define DyDbgDeclare(cnt) unsigned int cnt = 0
++#define DyDbgInc(cnt) do { cnt++; } while (0)
++#else
++#define DyDbgDeclare(cnt) do {} while (0)
++#define DyDbgInc(cnt) do {} while (0)
++#endif
++
++#define DySet(func, dst, src, h_op, h_sb) do { \
++ DyDbgInc(cnt); \
++ if (h_op->func) { \
++ if (src.func) \
++ dst.func = src.func; \
++ else \
++ AuDbg("%s %s\n", au_sbtype(h_sb), #func); \
++ } \
++} while (0)
++
++#define DySetForce(func, dst, src) do { \
++ AuDebugOn(!src.func); \
++ DyDbgInc(cnt); \
++ dst.func = src.func; \
++} while (0)
++
++#define DySetAop(func) \
++ DySet(func, dyaop->da_op, aufs_aop, h_aop, h_sb)
++#define DySetAopForce(func) \
++ DySetForce(func, dyaop->da_op, aufs_aop)
++
++static void dy_aop(struct au_dykey *key, const void *h_op,
++ struct super_block *h_sb __maybe_unused)
++{
++ struct au_dyaop *dyaop = (void *)key;
++ const struct address_space_operations *h_aop = h_op;
++ DyDbgDeclare(cnt);
++
++ AuDbg("%s\n", au_sbtype(h_sb));
++
++ DySetAop(writepage);
++ DySetAopForce(readpage); /* force */
++ DySetAop(writepages);
++ DySetAop(set_page_dirty);
++ DySetAop(readpages);
++ DySetAop(write_begin);
++ DySetAop(write_end);
++ DySetAop(bmap);
++ DySetAop(invalidatepage);
++ DySetAop(releasepage);
++ DySetAop(freepage);
++ /* these two will be changed according to an aufs mount option */
++ DySetAop(direct_IO);
++ DySetAop(get_xip_mem);
++ DySetAop(migratepage);
++ DySetAop(launder_page);
++ DySetAop(is_partially_uptodate);
++ DySetAop(error_remove_page);
++
++ DyDbgSize(cnt, *h_aop);
++ dyaop->da_get_xip_mem = h_aop->get_xip_mem;
++}
++
++/* ---------------------------------------------------------------------- */
++
++static void dy_bug(struct kref *kref)
++{
++ BUG();
++}
++
++static struct au_dykey *dy_get(struct au_dynop *op, struct au_branch *br)
++{
++ struct au_dykey *key, *old;
++ struct au_splhead *spl;
++ struct op {
++ unsigned int sz;
++ void (*set)(struct au_dykey *key, const void *h_op,
++ struct super_block *h_sb __maybe_unused);
++ };
++ static const struct op a[] = {
++ [AuDy_AOP] = {
++ .sz = sizeof(struct au_dyaop),
++ .set = dy_aop
++ }
++ };
++ const struct op *p;
++
++ spl = dynop + op->dy_type;
++ key = dy_gfind_get(spl, op->dy_hop);
++ if (key)
++ goto out_add; /* success */
++
++ p = a + op->dy_type;
++ key = kzalloc(p->sz, GFP_NOFS);
++ if (unlikely(!key)) {
++ key = ERR_PTR(-ENOMEM);
++ goto out;
++ }
++
++ key->dk_op.dy_hop = op->dy_hop;
++ kref_init(&key->dk_kref);
++ p->set(key, op->dy_hop, br->br_mnt->mnt_sb);
++ old = dy_gadd(spl, key);
++ if (old) {
++ kfree(key);
++ key = old;
++ }
++
++out_add:
++ old = dy_bradd(br, key);
++ if (old)
++ /* its ref-count should never be zero here */
++ kref_put(&key->dk_kref, dy_bug);
++out:
++ return key;
++}
++
++/* ---------------------------------------------------------------------- */
++/*
++ * Aufs prohibits O_DIRECT by default even if the branch supports it.
++ * This behaviour is necessary to return an error from open(O_DIRECT) instead
++ * of the succeeding I/O. The dio mount option enables O_DIRECT and makes
++ * open(O_DIRECT) always succeed, but the succeeding I/O may return an error.
++ * See the aufs manual for details.
++ *
++ * To keep this behaviour, aufs has to set NULL to ->get_xip_mem too, and the
++ * performance of fadvise() and madvise() may be affected.
++ */
++static void dy_adx(struct au_dyaop *dyaop, int do_dx)
++{
++ if (!do_dx) {
++ dyaop->da_op.direct_IO = NULL;
++ dyaop->da_op.get_xip_mem = NULL;
++ } else {
++ dyaop->da_op.direct_IO = aufs_aop.direct_IO;
++ dyaop->da_op.get_xip_mem = aufs_aop.get_xip_mem;
++ if (!dyaop->da_get_xip_mem)
++ dyaop->da_op.get_xip_mem = NULL;
++ }
++}
++
++static struct au_dyaop *dy_aget(struct au_branch *br,
++ const struct address_space_operations *h_aop,
++ int do_dx)
++{
++ struct au_dyaop *dyaop;
++ struct au_dynop op;
++
++ op.dy_type = AuDy_AOP;
++ op.dy_haop = h_aop;
++ dyaop = (void *)dy_get(&op, br);
++ if (IS_ERR(dyaop))
++ goto out;
++ dy_adx(dyaop, do_dx);
++
++out:
++ return dyaop;
++}
++
++int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex,
++ struct inode *h_inode)
++{
++ int err, do_dx;
++ struct super_block *sb;
++ struct au_branch *br;
++ struct au_dyaop *dyaop;
++
++ AuDebugOn(!S_ISREG(h_inode->i_mode));
++ IiMustWriteLock(inode);
++
++ sb = inode->i_sb;
++ br = au_sbr(sb, bindex);
++ do_dx = !!au_opt_test(au_mntflags(sb), DIO);
++ dyaop = dy_aget(br, h_inode->i_mapping->a_ops, do_dx);
++ err = PTR_ERR(dyaop);
++ if (IS_ERR(dyaop))
++ /* unnecessary to call dy_fput() */
++ goto out;
++
++ err = 0;
++ inode->i_mapping->a_ops = &dyaop->da_op;
++
++out:
++ return err;
++}
++
++/*
++ * Is it safe to replace a_ops while the inode/file is in operation?
++ * Yes, I hope so.
++ */
++int au_dy_irefresh(struct inode *inode)
++{
++ int err;
++ aufs_bindex_t bstart;
++ struct inode *h_inode;
++
++ err = 0;
++ if (S_ISREG(inode->i_mode)) {
++ bstart = au_ibstart(inode);
++ h_inode = au_h_iptr(inode, bstart);
++ err = au_dy_iaop(inode, bstart, h_inode);
++ }
++ return err;
++}
++
++void au_dy_arefresh(int do_dx)
++{
++ struct au_splhead *spl;
++ struct list_head *head;
++ struct au_dykey *key;
++
++ spl = dynop + AuDy_AOP;
++ head = &spl->head;
++ spin_lock(&spl->spin);
++ list_for_each_entry(key, head, dk_list)
++ dy_adx((void *)key, do_dx);
++ spin_unlock(&spl->spin);
++}
++
++/* ---------------------------------------------------------------------- */
++
++void __init au_dy_init(void)
++{
++ int i;
++
++ /* make sure that 'struct au_dykey *' can be any type */
++ BUILD_BUG_ON(offsetof(struct au_dyaop, da_key));
++
++ for (i = 0; i < AuDyLast; i++)
++ au_spl_init(dynop + i);
++}
++
++void au_dy_fin(void)
++{
++ int i;
++
++ for (i = 0; i < AuDyLast; i++)
++ WARN_ON(!list_empty(&dynop[i].head));
++}
+diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/dynop.h linux-3.2.0-gentoo-r1/fs/aufs/dynop.h
+--- linux-3.2.0-gentoo-r1.orig//fs/aufs/dynop.h 1970-01-01 01:00:00.000000000 +0100
++++ linux-3.2.0-gentoo-r1/fs/aufs/dynop.h 2012-01-17 12:11:24.613544015 +0100
+@@ -0,0 +1,76 @@
++/*
++ * Copyright (C) 2010-2012 Junjiro R. Okajima
++ *
++ * This program, aufs is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/*
++ * dynamically customizable operations (for regular files only)
++ */
++
++#ifndef __AUFS_DYNOP_H__
++#define __AUFS_DYNOP_H__
++
++#ifdef __KERNEL__
++
++#include "inode.h"
++
++enum {AuDy_AOP, AuDyLast};
++
++struct au_dynop {
++ int dy_type;
++ union {
++ const void *dy_hop;
++ const struct address_space_operations *dy_haop;
++ };
++};
++
++struct au_dykey {
++ union {
++ struct list_head dk_list;
++ struct rcu_head dk_rcu;
++ };
++ struct au_dynop dk_op;
++
++ /*
++ * while this key is in the branch's local array, its kref is held;
++ * when the branch is removed, the kref is put.
++ */
++ struct kref dk_kref;
++};
++
++/* stop unioning since their sizes are very different from each other */
++struct au_dyaop {
++ struct au_dykey da_key;
++ struct address_space_operations da_op; /* not const */
++ int (*da_get_xip_mem)(struct address_space *, pgoff_t, int,
++ void **, unsigned long *);
++};
++
++/* ---------------------------------------------------------------------- */
++
++/* dynop.c */
++struct au_branch;
++void au_dy_put(struct au_dykey *key);
++int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex,
++ struct inode *h_inode);
++int au_dy_irefresh(struct inode *inode);
++void au_dy_arefresh(int do_dio);
++
++void __init au_dy_init(void);
++void au_dy_fin(void);
++
++#endif /* __KERNEL__ */
++#endif /* __AUFS_DYNOP_H__ */
+diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/export.c linux-3.2.0-gentoo-r1/fs/aufs/export.c
+--- linux-3.2.0-gentoo-r1.orig//fs/aufs/export.c 1970-01-01 01:00:00.000000000 +0100
++++ linux-3.2.0-gentoo-r1/fs/aufs/export.c 2012-01-17 12:11:24.629747864 +0100
+@@ -0,0 +1,804 @@
++/*
++ * Copyright (C) 2005-2012 Junjiro R. Okajima
++ *
++ * This program, aufs is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * export via nfs ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "aufs.h" ++ ++union conv { ++#ifdef CONFIG_AUFS_INO_T_64 ++ __u32 a[2]; ++#else ++ __u32 a[1]; ++#endif ++ ino_t ino; ++}; ++ ++static ino_t decode_ino(__u32 *a) ++{ ++ union conv u; ++ ++ BUILD_BUG_ON(sizeof(u.ino) != sizeof(u.a)); ++ u.a[0] = a[0]; ++#ifdef CONFIG_AUFS_INO_T_64 ++ u.a[1] = a[1]; ++#endif ++ return u.ino; ++} ++ ++static void encode_ino(__u32 *a, ino_t ino) ++{ ++ union conv u; ++ ++ u.ino = ino; ++ a[0] = u.a[0]; ++#ifdef CONFIG_AUFS_INO_T_64 ++ a[1] = u.a[1]; ++#endif ++} ++ ++/* NFS file handle */ ++enum { ++ Fh_br_id, ++ Fh_sigen, ++#ifdef CONFIG_AUFS_INO_T_64 ++ /* support 64bit inode number */ ++ Fh_ino1, ++ Fh_ino2, ++ Fh_dir_ino1, ++ Fh_dir_ino2, ++#else ++ Fh_ino1, ++ Fh_dir_ino1, ++#endif ++ Fh_igen, ++ Fh_h_type, ++ Fh_tail, ++ ++ Fh_ino = Fh_ino1, ++ Fh_dir_ino = Fh_dir_ino1 ++}; ++ ++static int au_test_anon(struct dentry *dentry) ++{ ++ /* note: read d_flags without d_lock */ ++ return !!(dentry->d_flags & DCACHE_DISCONNECTED); ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* inode generation external table */ ++ ++void au_xigen_inc(struct inode *inode) ++{ ++ loff_t pos; ++ ssize_t sz; ++ __u32 igen; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++ sb = inode->i_sb; ++ AuDebugOn(!au_opt_test(au_mntflags(sb), XINO)); ++ ++ sbinfo = au_sbi(sb); ++ pos = inode->i_ino; ++ pos *= sizeof(igen); ++ igen = inode->i_generation + 1; ++ sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xigen, &igen, ++ sizeof(igen), &pos); ++ if (sz == sizeof(igen)) ++ return; /* success */ ++ ++ if (unlikely(sz >= 0)) ++ AuIOErr("xigen error (%zd)\n", sz); ++} ++ ++int au_xigen_new(struct inode *inode) ++{ ++ int err; ++ loff_t pos; ++ ssize_t sz; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ err = 0; ++ /* todo: dirty, at mount time */ ++ if (inode->i_ino == AUFS_ROOT_INO) ++ goto out; ++ sb = inode->i_sb; ++ SiMustAnyLock(sb); ++ if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) ++ goto out; ++ ++ err = -EFBIG; ++ pos = inode->i_ino; ++ if (unlikely(au_loff_max / sizeof(inode->i_generation) - 1 < pos)) { ++ AuIOErr1("too large i%lld\n", pos); ++ goto out; ++ } ++ pos *= sizeof(inode->i_generation); ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ file = sbinfo->si_xigen; ++ BUG_ON(!file); ++ ++ if (i_size_read(file->f_dentry->d_inode) ++ < pos + sizeof(inode->i_generation)) { ++ inode->i_generation = atomic_inc_return(&sbinfo->si_xigen_next); ++ sz = xino_fwrite(sbinfo->si_xwrite, file, &inode->i_generation, ++ sizeof(inode->i_generation), &pos); ++ } else ++ sz = xino_fread(sbinfo->si_xread, file, &inode->i_generation, ++ sizeof(inode->i_generation), &pos); ++ if (sz == sizeof(inode->i_generation)) ++ goto out; /* success */ ++ ++ err = sz; ++ if (unlikely(sz >= 0)) { ++ err = -EIO; ++ AuIOErr("xigen error (%zd)\n", sz); ++ } ++ ++out: ++ return err; ++} ++ ++int au_xigen_set(struct super_block *sb, struct file *base) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ file = au_xino_create2(base, sbinfo->si_xigen); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ err = 0; ++ if 
(sbinfo->si_xigen) ++ fput(sbinfo->si_xigen); ++ sbinfo->si_xigen = file; ++ ++out: ++ return err; ++} ++ ++void au_xigen_clr(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ if (sbinfo->si_xigen) { ++ fput(sbinfo->si_xigen); ++ sbinfo->si_xigen = NULL; ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *decode_by_ino(struct super_block *sb, ino_t ino, ++ ino_t dir_ino) ++{ ++ struct dentry *dentry, *d; ++ struct inode *inode; ++ unsigned int sigen; ++ ++ dentry = NULL; ++ inode = ilookup(sb, ino); ++ if (!inode) ++ goto out; ++ ++ dentry = ERR_PTR(-ESTALE); ++ sigen = au_sigen(sb); ++ if (unlikely(is_bad_inode(inode) ++ || IS_DEADDIR(inode) ++ || sigen != au_iigen(inode))) ++ goto out_iput; ++ ++ dentry = NULL; ++ if (!dir_ino || S_ISDIR(inode->i_mode)) ++ dentry = d_find_alias(inode); ++ else { ++ spin_lock(&inode->i_lock); ++ list_for_each_entry(d, &inode->i_dentry, d_alias) { ++ spin_lock(&d->d_lock); ++ if (!au_test_anon(d) ++ && d->d_parent->d_inode->i_ino == dir_ino) { ++ dentry = dget_dlock(d); ++ spin_unlock(&d->d_lock); ++ break; ++ } ++ spin_unlock(&d->d_lock); ++ } ++ spin_unlock(&inode->i_lock); ++ } ++ if (unlikely(dentry && au_digen_test(dentry, sigen))) { ++ /* need to refresh */ ++ dput(dentry); ++ dentry = NULL; ++ } ++ ++out_iput: ++ iput(inode); ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: dirty? */ ++/* if exportfs_decode_fh() passed vfsmount*, we could be happy */ ++ ++struct au_compare_mnt_args { ++ /* input */ ++ struct super_block *sb; ++ ++ /* output */ ++ struct vfsmount *mnt; ++}; ++ ++static int au_compare_mnt(struct vfsmount *mnt, void *arg) ++{ ++ struct au_compare_mnt_args *a = arg; ++ ++ if (mnt->mnt_sb != a->sb) ++ return 0; ++ a->mnt = mntget(mnt); ++ return 1; ++} ++ ++static struct vfsmount *au_mnt_get(struct super_block *sb) ++{ ++ int err; ++ struct au_compare_mnt_args args = { ++ .sb = sb ++ }; ++ struct mnt_namespace *ns; ++ ++ br_read_lock(vfsmount_lock); ++ /* no get/put ?? 
*/ ++ AuDebugOn(!current->nsproxy); ++ ns = current->nsproxy->mnt_ns; ++ AuDebugOn(!ns); ++ err = iterate_mounts(au_compare_mnt, &args, ns->root); ++ br_read_unlock(vfsmount_lock); ++ AuDebugOn(!err); ++ AuDebugOn(!args.mnt); ++ return args.mnt; ++} ++ ++struct au_nfsd_si_lock { ++ unsigned int sigen; ++ aufs_bindex_t bindex, br_id; ++ unsigned char force_lock; ++}; ++ ++static int si_nfsd_read_lock(struct super_block *sb, ++ struct au_nfsd_si_lock *nsi_lock) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ ++ si_read_lock(sb, AuLock_FLUSH); ++ ++ /* branch id may be wrapped around */ ++ err = 0; ++ bindex = au_br_index(sb, nsi_lock->br_id); ++ if (bindex >= 0 && nsi_lock->sigen + AUFS_BRANCH_MAX > au_sigen(sb)) ++ goto out; /* success */ ++ ++ err = -ESTALE; ++ bindex = -1; ++ if (!nsi_lock->force_lock) ++ si_read_unlock(sb); ++ ++out: ++ nsi_lock->bindex = bindex; ++ return err; ++} ++ ++struct find_name_by_ino { ++ int called, found; ++ ino_t ino; ++ char *name; ++ int namelen; ++}; ++ ++static int ++find_name_by_ino(void *arg, const char *name, int namelen, loff_t offset, ++ u64 ino, unsigned int d_type) ++{ ++ struct find_name_by_ino *a = arg; ++ ++ a->called++; ++ if (a->ino != ino) ++ return 0; ++ ++ memcpy(a->name, name, namelen); ++ a->namelen = namelen; ++ a->found = 1; ++ return 1; ++} ++ ++static struct dentry *au_lkup_by_ino(struct path *path, ino_t ino, ++ struct au_nfsd_si_lock *nsi_lock) ++{ ++ struct dentry *dentry, *parent; ++ struct file *file; ++ struct inode *dir; ++ struct find_name_by_ino arg; ++ int err; ++ ++ parent = path->dentry; ++ if (nsi_lock) ++ si_read_unlock(parent->d_sb); ++ file = vfsub_dentry_open(path, au_dir_roflags); ++ dentry = (void *)file; ++ if (IS_ERR(file)) ++ goto out; ++ ++ dentry = ERR_PTR(-ENOMEM); ++ arg.name = __getname_gfp(GFP_NOFS); ++ if (unlikely(!arg.name)) ++ goto out_file; ++ arg.ino = ino; ++ arg.found = 0; ++ do { ++ arg.called = 0; ++ /* smp_mb(); */ ++ err = vfsub_readdir(file, find_name_by_ino, &arg); ++ } while (!err && !arg.found && arg.called); ++ dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_name; ++ dentry = ERR_PTR(-ENOENT); ++ if (!arg.found) ++ goto out_name; ++ ++ /* do not call au_lkup_one() */ ++ dir = parent->d_inode; ++ mutex_lock(&dir->i_mutex); ++ dentry = vfsub_lookup_one_len(arg.name, parent, arg.namelen); ++ mutex_unlock(&dir->i_mutex); ++ AuTraceErrPtr(dentry); ++ if (IS_ERR(dentry)) ++ goto out_name; ++ AuDebugOn(au_test_anon(dentry)); ++ if (unlikely(!dentry->d_inode)) { ++ dput(dentry); ++ dentry = ERR_PTR(-ENOENT); ++ } ++ ++out_name: ++ __putname(arg.name); ++out_file: ++ fput(file); ++out: ++ if (unlikely(nsi_lock ++ && si_nfsd_read_lock(parent->d_sb, nsi_lock) < 0)) ++ if (!IS_ERR(dentry)) { ++ dput(dentry); ++ dentry = ERR_PTR(-ESTALE); ++ } ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++static struct dentry *decode_by_dir_ino(struct super_block *sb, ino_t ino, ++ ino_t dir_ino, ++ struct au_nfsd_si_lock *nsi_lock) ++{ ++ struct dentry *dentry; ++ struct path path; ++ ++ if (dir_ino != AUFS_ROOT_INO) { ++ path.dentry = decode_by_ino(sb, dir_ino, 0); ++ dentry = path.dentry; ++ if (!path.dentry || IS_ERR(path.dentry)) ++ goto out; ++ AuDebugOn(au_test_anon(path.dentry)); ++ } else ++ path.dentry = dget(sb->s_root); ++ ++ path.mnt = au_mnt_get(sb); ++ dentry = au_lkup_by_ino(&path, ino, nsi_lock); ++ path_put(&path); ++ ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int h_acceptable(void 
*expv, struct dentry *dentry) ++{ ++ return 1; ++} ++ ++static char *au_build_path(struct dentry *h_parent, struct path *h_rootpath, ++ char *buf, int len, struct super_block *sb) ++{ ++ char *p; ++ int n; ++ struct path path; ++ ++ p = d_path(h_rootpath, buf, len); ++ if (IS_ERR(p)) ++ goto out; ++ n = strlen(p); ++ ++ path.mnt = h_rootpath->mnt; ++ path.dentry = h_parent; ++ p = d_path(&path, buf, len); ++ if (IS_ERR(p)) ++ goto out; ++ if (n != 1) ++ p += n; ++ ++ path.mnt = au_mnt_get(sb); ++ path.dentry = sb->s_root; ++ p = d_path(&path, buf, len - strlen(p)); ++ mntput(path.mnt); ++ if (IS_ERR(p)) ++ goto out; ++ if (n != 1) ++ p[strlen(p)] = '/'; ++ ++out: ++ AuTraceErrPtr(p); ++ return p; ++} ++ ++static ++struct dentry *decode_by_path(struct super_block *sb, ino_t ino, __u32 *fh, ++ int fh_len, struct au_nfsd_si_lock *nsi_lock) ++{ ++ struct dentry *dentry, *h_parent, *root; ++ struct super_block *h_sb; ++ char *pathname, *p; ++ struct vfsmount *h_mnt; ++ struct au_branch *br; ++ int err; ++ struct path path; ++ ++ br = au_sbr(sb, nsi_lock->bindex); ++ h_mnt = br->br_mnt; ++ h_sb = h_mnt->mnt_sb; ++ /* todo: call lower fh_to_dentry()? fh_to_parent()? */ ++ h_parent = exportfs_decode_fh(h_mnt, (void *)(fh + Fh_tail), ++ fh_len - Fh_tail, fh[Fh_h_type], ++ h_acceptable, /*context*/NULL); ++ dentry = h_parent; ++ if (unlikely(!h_parent || IS_ERR(h_parent))) { ++ AuWarn1("%s decode_fh failed, %ld\n", ++ au_sbtype(h_sb), PTR_ERR(h_parent)); ++ goto out; ++ } ++ dentry = NULL; ++ if (unlikely(au_test_anon(h_parent))) { ++ AuWarn1("%s decode_fh returned a disconnected dentry\n", ++ au_sbtype(h_sb)); ++ goto out_h_parent; ++ } ++ ++ dentry = ERR_PTR(-ENOMEM); ++ pathname = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!pathname)) ++ goto out_h_parent; ++ ++ root = sb->s_root; ++ path.mnt = h_mnt; ++ di_read_lock_parent(root, !AuLock_IR); ++ path.dentry = au_h_dptr(root, nsi_lock->bindex); ++ di_read_unlock(root, !AuLock_IR); ++ p = au_build_path(h_parent, &path, pathname, PAGE_SIZE, sb); ++ dentry = (void *)p; ++ if (IS_ERR(p)) ++ goto out_pathname; ++ ++ si_read_unlock(sb); ++ err = vfsub_kern_path(p, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); ++ dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_relock; ++ ++ dentry = ERR_PTR(-ENOENT); ++ AuDebugOn(au_test_anon(path.dentry)); ++ if (unlikely(!path.dentry->d_inode)) ++ goto out_path; ++ ++ if (ino != path.dentry->d_inode->i_ino) ++ dentry = au_lkup_by_ino(&path, ino, /*nsi_lock*/NULL); ++ else ++ dentry = dget(path.dentry); ++ ++out_path: ++ path_put(&path); ++out_relock: ++ if (unlikely(si_nfsd_read_lock(sb, nsi_lock) < 0)) ++ if (!IS_ERR(dentry)) { ++ dput(dentry); ++ dentry = ERR_PTR(-ESTALE); ++ } ++out_pathname: ++ free_page((unsigned long)pathname); ++out_h_parent: ++ dput(h_parent); ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry * ++aufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, ++ int fh_type) ++{ ++ struct dentry *dentry; ++ __u32 *fh = fid->raw; ++ struct au_branch *br; ++ ino_t ino, dir_ino; ++ struct au_nfsd_si_lock nsi_lock = { ++ .force_lock = 0 ++ }; ++ ++ dentry = ERR_PTR(-ESTALE); ++ /* it should never happen, but the file handle is unreliable */ ++ if (unlikely(fh_len < Fh_tail)) ++ goto out; ++ nsi_lock.sigen = fh[Fh_sigen]; ++ nsi_lock.br_id = fh[Fh_br_id]; ++ ++ /* branch id may be wrapped around */ ++ br = NULL; ++ if (unlikely(si_nfsd_read_lock(sb, &nsi_lock))) ++ 
goto out; ++ nsi_lock.force_lock = 1; ++ ++ /* is this inode still cached? */ ++ ino = decode_ino(fh + Fh_ino); ++ /* it should never happen */ ++ if (unlikely(ino == AUFS_ROOT_INO)) ++ goto out; ++ ++ dir_ino = decode_ino(fh + Fh_dir_ino); ++ dentry = decode_by_ino(sb, ino, dir_ino); ++ if (IS_ERR(dentry)) ++ goto out_unlock; ++ if (dentry) ++ goto accept; ++ ++ /* is the parent dir cached? */ ++ br = au_sbr(sb, nsi_lock.bindex); ++ atomic_inc(&br->br_count); ++ dentry = decode_by_dir_ino(sb, ino, dir_ino, &nsi_lock); ++ if (IS_ERR(dentry)) ++ goto out_unlock; ++ if (dentry) ++ goto accept; ++ ++ /* lookup path */ ++ dentry = decode_by_path(sb, ino, fh, fh_len, &nsi_lock); ++ if (IS_ERR(dentry)) ++ goto out_unlock; ++ if (unlikely(!dentry)) ++ /* todo?: make it ESTALE */ ++ goto out_unlock; ++ ++accept: ++ if (!au_digen_test(dentry, au_sigen(sb)) ++ && dentry->d_inode->i_generation == fh[Fh_igen]) ++ goto out_unlock; /* success */ ++ ++ dput(dentry); ++ dentry = ERR_PTR(-ESTALE); ++out_unlock: ++ if (br) ++ atomic_dec(&br->br_count); ++ si_read_unlock(sb); ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++#if 0 /* reserved for future use */ ++/* support subtreecheck option */ ++static struct dentry *aufs_fh_to_parent(struct super_block *sb, struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ struct dentry *parent; ++ __u32 *fh = fid->raw; ++ ino_t dir_ino; ++ ++ dir_ino = decode_ino(fh + Fh_dir_ino); ++ parent = decode_by_ino(sb, dir_ino, 0); ++ if (IS_ERR(parent)) ++ goto out; ++ if (!parent) ++ parent = decode_by_path(sb, au_br_index(sb, fh[Fh_br_id]), ++ dir_ino, fh, fh_len); ++ ++out: ++ AuTraceErrPtr(parent); ++ return parent; ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, ++ int connectable) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ struct super_block *sb, *h_sb; ++ struct inode *inode; ++ struct dentry *parent, *h_parent; ++ struct au_branch *br; ++ ++ AuDebugOn(au_test_anon(dentry)); ++ ++ parent = NULL; ++ err = -ENOSPC; ++ if (unlikely(*max_len <= Fh_tail)) { ++ AuWarn1("NFSv2 client (max_len %d)?\n", *max_len); ++ goto out; ++ } ++ ++ err = FILEID_ROOT; ++ if (IS_ROOT(dentry)) { ++ AuDebugOn(dentry->d_inode->i_ino != AUFS_ROOT_INO); ++ goto out; ++ } ++ ++ h_parent = NULL; ++ err = aufs_read_lock(dentry, AuLock_FLUSH | AuLock_IR | AuLock_GEN); ++ if (unlikely(err)) ++ goto out; ++ ++ inode = dentry->d_inode; ++ AuDebugOn(!inode); ++ sb = dentry->d_sb; ++#ifdef CONFIG_AUFS_DEBUG ++ if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) ++ AuWarn1("NFS-exporting requires xino\n"); ++#endif ++ err = -EIO; ++ parent = dget_parent(dentry); ++ di_read_lock_parent(parent, !AuLock_IR); ++ bend = au_dbtaildir(parent); ++ for (bindex = au_dbstart(parent); bindex <= bend; bindex++) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (h_parent) { ++ dget(h_parent); ++ break; ++ } ++ } ++ if (unlikely(!h_parent)) ++ goto out_unlock; ++ ++ err = -EPERM; ++ br = au_sbr(sb, bindex); ++ h_sb = br->br_mnt->mnt_sb; ++ if (unlikely(!h_sb->s_export_op)) { ++ AuErr1("%s branch is not exportable\n", au_sbtype(h_sb)); ++ goto out_dput; ++ } ++ ++ fh[Fh_br_id] = br->br_id; ++ fh[Fh_sigen] = au_sigen(sb); ++ encode_ino(fh + Fh_ino, inode->i_ino); ++ encode_ino(fh + Fh_dir_ino, parent->d_inode->i_ino); ++ fh[Fh_igen] = inode->i_generation; ++ ++ *max_len -= Fh_tail; ++ fh[Fh_h_type] = exportfs_encode_fh(h_parent, (void *)(fh + Fh_tail), ++ max_len, ++ /*connectable or 
subtreecheck*/0); ++ err = fh[Fh_h_type]; ++ *max_len += Fh_tail; ++ /* todo: macros? */ ++ if (err != 255) ++ err = 99; ++ else ++ AuWarn1("%s encode_fh failed\n", au_sbtype(h_sb)); ++ ++out_dput: ++ dput(h_parent); ++out_unlock: ++ di_read_unlock(parent, !AuLock_IR); ++ dput(parent); ++ aufs_read_unlock(dentry, AuLock_IR); ++out: ++ if (unlikely(err < 0)) ++ err = 255; ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_commit_metadata(struct inode *inode) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct super_block *sb; ++ struct inode *h_inode; ++ int (*f)(struct inode *inode); ++ ++ sb = inode->i_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ ii_write_lock_child(inode); ++ bindex = au_ibstart(inode); ++ AuDebugOn(bindex < 0); ++ h_inode = au_h_iptr(inode, bindex); ++ ++ f = h_inode->i_sb->s_export_op->commit_metadata; ++ if (f) ++ err = f(h_inode); ++ else { ++ struct writeback_control wbc = { ++ .sync_mode = WB_SYNC_ALL, ++ .nr_to_write = 0 /* metadata only */ ++ }; ++ ++ err = sync_inode(h_inode, &wbc); ++ } ++ ++ au_cpup_attr_timesizes(inode); ++ ii_write_unlock(inode); ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct export_operations aufs_export_op = { ++ .fh_to_dentry = aufs_fh_to_dentry, ++ /* .fh_to_parent = aufs_fh_to_parent, */ ++ .encode_fh = aufs_encode_fh, ++ .commit_metadata = aufs_commit_metadata ++}; ++ ++void au_export_init(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ __u32 u; ++ ++ sb->s_export_op = &aufs_export_op; ++ sbinfo = au_sbi(sb); ++ sbinfo->si_xigen = NULL; ++ get_random_bytes(&u, sizeof(u)); ++ BUILD_BUG_ON(sizeof(u) != sizeof(int)); ++ atomic_set(&sbinfo->si_xigen_next, u); ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/file.c linux-3.2.0-gentoo-r1/fs/aufs/file.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/file.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/file.c 2012-01-17 12:11:24.655211057 +0100 +@@ -0,0 +1,673 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * handling file/dir, and address_space operation ++ */ ++ ++#include ++#include "aufs.h" ++ ++/* drop flags for writing */ ++unsigned int au_file_roflags(unsigned int flags) ++{ ++ flags &= ~(O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC); ++ flags |= O_RDONLY | O_NOATIME; ++ return flags; ++} ++ ++/* common functions to regular file and dir */ ++struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, ++ struct file *file) ++{ ++ struct file *h_file; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct path h_path; ++ int err, exec_flag; ++ ++ /* a race condition can happen between open and unlink/rmdir */ ++ h_file = ERR_PTR(-ENOENT); ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (au_test_nfsd() && !h_dentry) ++ goto out; ++ h_inode = h_dentry->d_inode; ++ if (au_test_nfsd() && !h_inode) ++ goto out; ++ spin_lock(&h_dentry->d_lock); ++ err = (!d_unhashed(dentry) && d_unlinked(h_dentry)) ++ || !h_inode ++ /* || !dentry->d_inode->i_nlink */ ++ ; ++ spin_unlock(&h_dentry->d_lock); ++ if (unlikely(err)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ br = au_sbr(sb, bindex); ++ h_file = ERR_PTR(-EACCES); ++ exec_flag = flags & __FMODE_EXEC; ++ if (exec_flag && (br->br_mnt->mnt_flags & MNT_NOEXEC)) ++ goto out; ++ ++ /* drop flags for writing */ ++ if (au_test_ro(sb, bindex, dentry->d_inode)) ++ flags = au_file_roflags(flags); ++ flags &= ~O_CREAT; ++ atomic_inc(&br->br_count); ++ h_path.dentry = h_dentry; ++ h_path.mnt = br->br_mnt; ++ if (!au_special_file(h_inode->i_mode)) ++ h_file = vfsub_dentry_open(&h_path, flags); ++ else { ++ /* this block depends upon the configuration */ ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ si_read_unlock(sb); ++ h_file = vfsub_dentry_open(&h_path, flags); ++ si_noflush_read_lock(sb); ++ fi_write_lock(file); ++ di_read_lock_child(dentry, AuLock_IR); ++ } ++ if (IS_ERR(h_file)) ++ goto out_br; ++ ++ if (exec_flag) { ++ err = deny_write_access(h_file); ++ if (unlikely(err)) { ++ fput(h_file); ++ h_file = ERR_PTR(err); ++ goto out_br; ++ } ++ } ++ fsnotify_open(h_file); ++ goto out; /* success */ ++ ++out_br: ++ atomic_dec(&br->br_count); ++out: ++ return h_file; ++} ++ ++int au_do_open(struct file *file, int (*open)(struct file *file, int flags), ++ struct au_fidir *fidir) ++{ ++ int err; ++ struct dentry *dentry; ++ ++ err = au_finfo_init(file, fidir); ++ if (unlikely(err)) ++ goto out; ++ ++ dentry = file->f_dentry; ++ di_read_lock_child(dentry, AuLock_IR); ++ err = open(file, vfsub_file_flags(file)); ++ di_read_unlock(dentry, AuLock_IR); ++ ++ fi_write_unlock(file); ++ if (unlikely(err)) { ++ au_fi(file)->fi_hdir = NULL; ++ au_finfo_fin(file); ++ } ++ ++out: ++ return err; ++} ++ ++int au_reopen_nondir(struct file *file) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ struct dentry *dentry; ++ struct file *h_file, *h_file_tmp; ++ ++ dentry = file->f_dentry; ++ AuDebugOn(au_special_file(dentry->d_inode->i_mode)); ++ bstart = au_dbstart(dentry); ++ h_file_tmp = NULL; ++ if (au_fbstart(file) == bstart) { ++ h_file = au_hf_top(file); ++ if (file->f_mode == h_file->f_mode) ++ return 0; /* success */ ++ h_file_tmp = h_file; ++ get_file(h_file_tmp); ++ au_set_h_fptr(file, bstart, NULL); ++ } ++ AuDebugOn(au_fi(file)->fi_hdir); ++ 
AuDebugOn(au_fbstart(file) < bstart); ++ ++ h_file = au_h_open(dentry, bstart, vfsub_file_flags(file) & ~O_TRUNC, ++ file); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; /* todo: close all? */ ++ ++ err = 0; ++ au_set_fbstart(file, bstart); ++ au_set_h_fptr(file, bstart, h_file); ++ au_update_figen(file); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ ++out: ++ if (h_file_tmp) ++ fput(h_file_tmp); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_reopen_wh(struct file *file, aufs_bindex_t btgt, ++ struct dentry *hi_wh) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ struct au_dinfo *dinfo; ++ struct dentry *h_dentry; ++ struct au_hdentry *hdp; ++ ++ dinfo = au_di(file->f_dentry); ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ bstart = dinfo->di_bstart; ++ dinfo->di_bstart = btgt; ++ hdp = dinfo->di_hdentry; ++ h_dentry = hdp[0 + btgt].hd_dentry; ++ hdp[0 + btgt].hd_dentry = hi_wh; ++ err = au_reopen_nondir(file); ++ hdp[0 + btgt].hd_dentry = h_dentry; ++ dinfo->di_bstart = bstart; ++ ++ return err; ++} ++ ++static int au_ready_to_write_wh(struct file *file, loff_t len, ++ aufs_bindex_t bcpup) ++{ ++ int err; ++ struct inode *inode, *h_inode; ++ struct dentry *dentry, *h_dentry, *hi_wh; ++ ++ dentry = file->f_dentry; ++ au_update_dbstart(dentry); ++ inode = dentry->d_inode; ++ h_inode = NULL; ++ if (au_dbstart(dentry) <= bcpup && au_dbend(dentry) >= bcpup) { ++ h_dentry = au_h_dptr(dentry, bcpup); ++ if (h_dentry) ++ h_inode = h_dentry->d_inode; ++ } ++ hi_wh = au_hi_wh(inode, bcpup); ++ if (!hi_wh && !h_inode) ++ err = au_sio_cpup_wh(dentry, bcpup, len, file); ++ else ++ /* already copied-up after unlink */ ++ err = au_reopen_wh(file, bcpup, hi_wh); ++ ++ if (!err ++ && inode->i_nlink > 1 ++ && au_opt_test(au_mntflags(dentry->d_sb), PLINK)) ++ au_plink_append(inode, bcpup, au_h_dptr(dentry, bcpup)); ++ ++ return err; ++} ++ ++/* ++ * prepare the @file for writing. 
++ */ ++int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin) ++{ ++ int err; ++ aufs_bindex_t bstart, bcpup, dbstart; ++ struct dentry *dentry, *parent, *h_dentry; ++ struct inode *h_inode, *inode; ++ struct super_block *sb; ++ struct file *h_file; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ AuDebugOn(au_special_file(inode->i_mode)); ++ bstart = au_fbstart(file); ++ err = au_test_ro(sb, bstart, inode); ++ if (!err && (au_hf_top(file)->f_mode & FMODE_WRITE)) { ++ err = au_pin(pin, dentry, bstart, AuOpt_UDBA_NONE, /*flags*/0); ++ goto out; ++ } ++ ++ /* need to cpup or reopen */ ++ parent = dget_parent(dentry); ++ di_write_lock_parent(parent); ++ err = AuWbrCopyup(au_sbi(sb), dentry); ++ bcpup = err; ++ if (unlikely(err < 0)) ++ goto out_dgrade; ++ err = 0; ++ ++ if (!d_unhashed(dentry) && !au_h_dptr(parent, bcpup)) { ++ err = au_cpup_dirs(dentry, bcpup); ++ if (unlikely(err)) ++ goto out_dgrade; ++ } ++ ++ err = au_pin(pin, dentry, bcpup, AuOpt_UDBA_NONE, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out_dgrade; ++ ++ h_dentry = au_hf_top(file)->f_dentry; ++ h_inode = h_dentry->d_inode; ++ dbstart = au_dbstart(dentry); ++ if (dbstart <= bcpup) { ++ h_dentry = au_h_dptr(dentry, bcpup); ++ AuDebugOn(!h_dentry); ++ h_inode = h_dentry->d_inode; ++ AuDebugOn(!h_inode); ++ bstart = bcpup; ++ } ++ ++ if (dbstart <= bcpup /* just reopen */ ++ || !d_unhashed(dentry) /* copyup and reopen */ ++ ) { ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ h_file = au_h_open_pre(dentry, bstart); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ h_file = NULL; ++ } else { ++ di_downgrade_lock(parent, AuLock_IR); ++ if (dbstart > bcpup) ++ err = au_sio_cpup_simple(dentry, bcpup, len, ++ AuCpup_DTIME); ++ if (!err) ++ err = au_reopen_nondir(file); ++ } ++ mutex_unlock(&h_inode->i_mutex); ++ au_h_open_post(dentry, bstart, h_file); ++ } else { /* copyup as wh and reopen */ ++ /* ++ * since writable hfsplus branch is not supported, ++ * h_open_pre/post() are unnecessary. 
++ */ ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ err = au_ready_to_write_wh(file, len, bcpup); ++ di_downgrade_lock(parent, AuLock_IR); ++ mutex_unlock(&h_inode->i_mutex); ++ } ++ ++ if (!err) { ++ au_pin_set_parent_lflag(pin, /*lflag*/0); ++ goto out_dput; /* success */ ++ } ++ au_unpin(pin); ++ goto out_unlock; ++ ++out_dgrade: ++ di_downgrade_lock(parent, AuLock_IR); ++out_unlock: ++ di_read_unlock(parent, AuLock_IR); ++out_dput: ++ dput(parent); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_do_flush(struct file *file, fl_owner_t id, ++ int (*flush)(struct file *file, fl_owner_t id)) ++{ ++ int err; ++ struct dentry *dentry; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ si_noflush_read_lock(sb); ++ fi_read_lock(file); ++ ii_read_lock_child(inode); ++ ++ err = flush(file, id); ++ au_cpup_attr_timesizes(inode); ++ ++ ii_read_unlock(inode); ++ fi_read_unlock(file); ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_file_refresh_by_inode(struct file *file, int *need_reopen) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ struct au_pin pin; ++ struct au_finfo *finfo; ++ struct dentry *dentry, *parent, *hi_wh; ++ struct inode *inode; ++ struct super_block *sb; ++ ++ FiMustWriteLock(file); ++ ++ err = 0; ++ finfo = au_fi(file); ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ bstart = au_ibstart(inode); ++ if (bstart == finfo->fi_btop || IS_ROOT(dentry)) ++ goto out; ++ ++ parent = dget_parent(dentry); ++ if (au_test_ro(sb, bstart, inode)) { ++ di_read_lock_parent(parent, !AuLock_IR); ++ err = AuWbrCopyup(au_sbi(sb), dentry); ++ bstart = err; ++ di_read_unlock(parent, !AuLock_IR); ++ if (unlikely(err < 0)) ++ goto out_parent; ++ err = 0; ++ } ++ ++ di_read_lock_parent(parent, AuLock_IR); ++ hi_wh = au_hi_wh(inode, bstart); ++ if (!S_ISDIR(inode->i_mode) ++ && au_opt_test(au_mntflags(sb), PLINK) ++ && au_plink_test(inode) ++ && !d_unhashed(dentry)) { ++ err = au_test_and_cpup_dirs(dentry, bstart); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ /* always superio. 
*/ ++ err = au_pin(&pin, dentry, bstart, AuOpt_UDBA_NONE, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (!err) ++ err = au_sio_cpup_simple(dentry, bstart, -1, ++ AuCpup_DTIME); ++ au_unpin(&pin); ++ } else if (hi_wh) { ++ /* already copied-up after unlink */ ++ err = au_reopen_wh(file, bstart, hi_wh); ++ *need_reopen = 0; ++ } ++ ++out_unlock: ++ di_read_unlock(parent, AuLock_IR); ++out_parent: ++ dput(parent); ++out: ++ return err; ++} ++ ++static void au_do_refresh_dir(struct file *file) ++{ ++ aufs_bindex_t bindex, bend, new_bindex, brid; ++ struct au_hfile *p, tmp, *q; ++ struct au_finfo *finfo; ++ struct super_block *sb; ++ struct au_fidir *fidir; ++ ++ FiMustWriteLock(file); ++ ++ sb = file->f_dentry->d_sb; ++ finfo = au_fi(file); ++ fidir = finfo->fi_hdir; ++ AuDebugOn(!fidir); ++ p = fidir->fd_hfile + finfo->fi_btop; ++ brid = p->hf_br->br_id; ++ bend = fidir->fd_bbot; ++ for (bindex = finfo->fi_btop; bindex <= bend; bindex++, p++) { ++ if (!p->hf_file) ++ continue; ++ ++ new_bindex = au_br_index(sb, p->hf_br->br_id); ++ if (new_bindex == bindex) ++ continue; ++ if (new_bindex < 0) { ++ au_set_h_fptr(file, bindex, NULL); ++ continue; ++ } ++ ++ /* swap two lower inode, and loop again */ ++ q = fidir->fd_hfile + new_bindex; ++ tmp = *q; ++ *q = *p; ++ *p = tmp; ++ if (tmp.hf_file) { ++ bindex--; ++ p--; ++ } ++ } ++ ++ p = fidir->fd_hfile; ++ if (!au_test_mmapped(file) && !d_unlinked(file->f_dentry)) { ++ bend = au_sbend(sb); ++ for (finfo->fi_btop = 0; finfo->fi_btop <= bend; ++ finfo->fi_btop++, p++) ++ if (p->hf_file) { ++ if (p->hf_file->f_dentry ++ && p->hf_file->f_dentry->d_inode) ++ break; ++ else ++ au_hfput(p, file); ++ } ++ } else { ++ bend = au_br_index(sb, brid); ++ for (finfo->fi_btop = 0; finfo->fi_btop < bend; ++ finfo->fi_btop++, p++) ++ if (p->hf_file) ++ au_hfput(p, file); ++ bend = au_sbend(sb); ++ } ++ ++ p = fidir->fd_hfile + bend; ++ for (fidir->fd_bbot = bend; fidir->fd_bbot >= finfo->fi_btop; ++ fidir->fd_bbot--, p--) ++ if (p->hf_file) { ++ if (p->hf_file->f_dentry ++ && p->hf_file->f_dentry->d_inode) ++ break; ++ else ++ au_hfput(p, file); ++ } ++ AuDebugOn(fidir->fd_bbot < finfo->fi_btop); ++} ++ ++/* ++ * after branch manipulating, refresh the file. 
++ */ ++static int refresh_file(struct file *file, int (*reopen)(struct file *file)) ++{ ++ int err, need_reopen; ++ aufs_bindex_t bend, bindex; ++ struct dentry *dentry; ++ struct au_finfo *finfo; ++ struct au_hfile *hfile; ++ ++ dentry = file->f_dentry; ++ finfo = au_fi(file); ++ if (!finfo->fi_hdir) { ++ hfile = &finfo->fi_htop; ++ AuDebugOn(!hfile->hf_file); ++ bindex = au_br_index(dentry->d_sb, hfile->hf_br->br_id); ++ AuDebugOn(bindex < 0); ++ if (bindex != finfo->fi_btop) ++ au_set_fbstart(file, bindex); ++ } else { ++ err = au_fidir_realloc(finfo, au_sbend(dentry->d_sb) + 1); ++ if (unlikely(err)) ++ goto out; ++ au_do_refresh_dir(file); ++ } ++ ++ err = 0; ++ need_reopen = 1; ++ if (!au_test_mmapped(file)) ++ err = au_file_refresh_by_inode(file, &need_reopen); ++ if (!err && need_reopen && !d_unlinked(dentry)) ++ err = reopen(file); ++ if (!err) { ++ au_update_figen(file); ++ goto out; /* success */ ++ } ++ ++ /* error, close all lower files */ ++ if (finfo->fi_hdir) { ++ bend = au_fbend_dir(file); ++ for (bindex = au_fbstart(file); bindex <= bend; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ } ++ ++out: ++ return err; ++} ++ ++/* common function to regular file and dir */ ++int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), ++ int wlock) ++{ ++ int err; ++ unsigned int sigen, figen; ++ aufs_bindex_t bstart; ++ unsigned char pseudo_link; ++ struct dentry *dentry; ++ struct inode *inode; ++ ++ err = 0; ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++ AuDebugOn(au_special_file(inode->i_mode)); ++ sigen = au_sigen(dentry->d_sb); ++ fi_write_lock(file); ++ figen = au_figen(file); ++ di_write_lock_child(dentry); ++ bstart = au_dbstart(dentry); ++ pseudo_link = (bstart != au_ibstart(inode)); ++ if (sigen == figen && !pseudo_link && au_fbstart(file) == bstart) { ++ if (!wlock) { ++ di_downgrade_lock(dentry, AuLock_IR); ++ fi_downgrade_lock(file); ++ } ++ goto out; /* success */ ++ } ++ ++ AuDbg("sigen %d, figen %d\n", sigen, figen); ++ if (au_digen_test(dentry, sigen)) { ++ err = au_reval_dpath(dentry, sigen); ++ AuDebugOn(!err && au_digen_test(dentry, sigen)); ++ } ++ ++ if (!err) ++ err = refresh_file(file, reopen); ++ if (!err) { ++ if (!wlock) { ++ di_downgrade_lock(dentry, AuLock_IR); ++ fi_downgrade_lock(file); ++ } ++ } else { ++ di_write_unlock(dentry); ++ fi_write_unlock(file); ++ } ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* cf. aufs_nopage() */ ++/* for madvise(2) */ ++static int aufs_readpage(struct file *file __maybe_unused, struct page *page) ++{ ++ unlock_page(page); ++ return 0; ++} ++ ++/* it will never be called, but necessary to support O_DIRECT */ ++static ssize_t aufs_direct_IO(int rw, struct kiocb *iocb, ++ const struct iovec *iov, loff_t offset, ++ unsigned long nr_segs) ++{ BUG(); return 0; } ++ ++/* ++ * it will never be called, but madvise and fadvise behaves differently ++ * when get_xip_mem is defined ++ */ ++static int aufs_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, ++ int create, void **kmem, unsigned long *pfn) ++{ BUG(); return 0; } ++ ++/* they will never be called. 
*/ ++#ifdef CONFIG_AUFS_DEBUG ++static int aufs_write_begin(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned flags, ++ struct page **pagep, void **fsdata) ++{ AuUnsupport(); return 0; } ++static int aufs_write_end(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned copied, ++ struct page *page, void *fsdata) ++{ AuUnsupport(); return 0; } ++static int aufs_writepage(struct page *page, struct writeback_control *wbc) ++{ AuUnsupport(); return 0; } ++ ++static int aufs_set_page_dirty(struct page *page) ++{ AuUnsupport(); return 0; } ++static void aufs_invalidatepage(struct page *page, unsigned long offset) ++{ AuUnsupport(); } ++static int aufs_releasepage(struct page *page, gfp_t gfp) ++{ AuUnsupport(); return 0; } ++static int aufs_migratepage(struct address_space *mapping, struct page *newpage, ++ struct page *page) ++{ AuUnsupport(); return 0; } ++static int aufs_launder_page(struct page *page) ++{ AuUnsupport(); return 0; } ++static int aufs_is_partially_uptodate(struct page *page, ++ read_descriptor_t *desc, ++ unsigned long from) ++{ AuUnsupport(); return 0; } ++static int aufs_error_remove_page(struct address_space *mapping, ++ struct page *page) ++{ AuUnsupport(); return 0; } ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++const struct address_space_operations aufs_aop = { ++ .readpage = aufs_readpage, ++ .direct_IO = aufs_direct_IO, ++ .get_xip_mem = aufs_get_xip_mem, ++#ifdef CONFIG_AUFS_DEBUG ++ .writepage = aufs_writepage, ++ /* no writepages, because of writepage */ ++ .set_page_dirty = aufs_set_page_dirty, ++ /* no readpages, because of readpage */ ++ .write_begin = aufs_write_begin, ++ .write_end = aufs_write_end, ++ /* no bmap, no block device */ ++ .invalidatepage = aufs_invalidatepage, ++ .releasepage = aufs_releasepage, ++ .migratepage = aufs_migratepage, ++ .launder_page = aufs_launder_page, ++ .is_partially_uptodate = aufs_is_partially_uptodate, ++ .error_remove_page = aufs_error_remove_page ++#endif /* CONFIG_AUFS_DEBUG */ ++}; +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/file.h linux-3.2.0-gentoo-r1/fs/aufs/file.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/file.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/file.h 2012-01-17 12:11:24.664470399 +0100 +@@ -0,0 +1,298 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * file operations ++ */ ++ ++#ifndef __AUFS_FILE_H__ ++#define __AUFS_FILE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++#include "rwsem.h" ++ ++struct au_branch; ++struct au_hfile { ++ struct file *hf_file; ++ struct au_branch *hf_br; ++}; ++ ++struct au_vdir; ++struct au_fidir { ++ aufs_bindex_t fd_bbot; ++ aufs_bindex_t fd_nent; ++ struct au_vdir *fd_vdir_cache; ++ struct au_hfile fd_hfile[]; ++}; ++ ++static inline int au_fidir_sz(int nent) ++{ ++ AuDebugOn(nent < 0); ++ return sizeof(struct au_fidir) + sizeof(struct au_hfile) * nent; ++} ++ ++struct au_finfo { ++ atomic_t fi_generation; ++ ++ struct au_rwsem fi_rwsem; ++ aufs_bindex_t fi_btop; ++ ++ /* do not union them */ ++ struct { /* for non-dir */ ++ struct au_hfile fi_htop; ++ atomic_t fi_mmapped; ++ }; ++ struct au_fidir *fi_hdir; /* for dir only */ ++} ____cacheline_aligned_in_smp; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* file.c */ ++extern const struct address_space_operations aufs_aop; ++unsigned int au_file_roflags(unsigned int flags); ++struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, ++ struct file *file); ++int au_do_open(struct file *file, int (*open)(struct file *file, int flags), ++ struct au_fidir *fidir); ++int au_reopen_nondir(struct file *file); ++struct au_pin; ++int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin); ++int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), ++ int wlock); ++int au_do_flush(struct file *file, fl_owner_t id, ++ int (*flush)(struct file *file, fl_owner_t id)); ++ ++/* poll.c */ ++#ifdef CONFIG_AUFS_POLL ++unsigned int aufs_poll(struct file *file, poll_table *wait); ++#endif ++ ++#ifdef CONFIG_AUFS_BR_HFSPLUS ++/* hfsplus.c */ ++struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex); ++void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex, ++ struct file *h_file); ++#else ++static inline ++struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ return NULL; ++} ++ ++AuStubVoid(au_h_open_post, struct dentry *dentry, aufs_bindex_t bindex, ++ struct file *h_file); ++#endif ++ ++/* f_op.c */ ++extern const struct file_operations aufs_file_fop; ++int au_do_open_nondir(struct file *file, int flags); ++int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file); ++ ++#ifdef CONFIG_AUFS_SP_IATTR ++/* f_op_sp.c */ ++int au_special_file(umode_t mode); ++void au_init_special_fop(struct inode *inode, umode_t mode, dev_t rdev); ++#else ++AuStubInt0(au_special_file, umode_t mode) ++static inline void au_init_special_fop(struct inode *inode, umode_t mode, ++ dev_t rdev) ++{ ++ init_special_inode(inode, mode, rdev); ++} ++#endif ++ ++/* finfo.c */ ++void au_hfput(struct au_hfile *hf, struct file *file); ++void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, ++ struct file *h_file); ++ ++void au_update_figen(struct file *file); ++struct au_fidir *au_fidir_alloc(struct super_block *sb); ++int au_fidir_realloc(struct au_finfo *finfo, int nbr); ++ ++void au_fi_init_once(void *_fi); ++void au_finfo_fin(struct file *file); ++int au_finfo_init(struct file *file, struct au_fidir *fidir); ++ ++/* ioctl.c */ ++long aufs_ioctl_nondir(struct file *file, unsigned int 
cmd, unsigned long arg); ++#ifdef CONFIG_COMPAT ++long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd, ++ unsigned long arg); ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_finfo *au_fi(struct file *file) ++{ ++ return file->private_data; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * fi_read_lock, fi_write_lock, ++ * fi_read_unlock, fi_write_unlock, fi_downgrade_lock ++ */ ++AuSimpleRwsemFuncs(fi, struct file *f, &au_fi(f)->fi_rwsem); ++ ++#define FiMustNoWaiters(f) AuRwMustNoWaiters(&au_fi(f)->fi_rwsem) ++#define FiMustAnyLock(f) AuRwMustAnyLock(&au_fi(f)->fi_rwsem) ++#define FiMustWriteLock(f) AuRwMustWriteLock(&au_fi(f)->fi_rwsem) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: hard/soft set? */ ++static inline aufs_bindex_t au_fbstart(struct file *file) ++{ ++ FiMustAnyLock(file); ++ return au_fi(file)->fi_btop; ++} ++ ++static inline aufs_bindex_t au_fbend_dir(struct file *file) ++{ ++ FiMustAnyLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ return au_fi(file)->fi_hdir->fd_bbot; ++} ++ ++static inline struct au_vdir *au_fvdir_cache(struct file *file) ++{ ++ FiMustAnyLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ return au_fi(file)->fi_hdir->fd_vdir_cache; ++} ++ ++static inline void au_set_fbstart(struct file *file, aufs_bindex_t bindex) ++{ ++ FiMustWriteLock(file); ++ au_fi(file)->fi_btop = bindex; ++} ++ ++static inline void au_set_fbend_dir(struct file *file, aufs_bindex_t bindex) ++{ ++ FiMustWriteLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ au_fi(file)->fi_hdir->fd_bbot = bindex; ++} ++ ++static inline void au_set_fvdir_cache(struct file *file, ++ struct au_vdir *vdir_cache) ++{ ++ FiMustWriteLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ au_fi(file)->fi_hdir->fd_vdir_cache = vdir_cache; ++} ++ ++static inline struct file *au_hf_top(struct file *file) ++{ ++ FiMustAnyLock(file); ++ AuDebugOn(au_fi(file)->fi_hdir); ++ return au_fi(file)->fi_htop.hf_file; ++} ++ ++static inline struct file *au_hf_dir(struct file *file, aufs_bindex_t bindex) ++{ ++ FiMustAnyLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ return au_fi(file)->fi_hdir->fd_hfile[0 + bindex].hf_file; ++} ++ ++/* todo: memory barrier? 
*/ ++static inline unsigned int au_figen(struct file *f) ++{ ++ return atomic_read(&au_fi(f)->fi_generation); ++} ++ ++static inline void au_set_mmapped(struct file *f) ++{ ++ if (atomic_inc_return(&au_fi(f)->fi_mmapped)) ++ return; ++ pr_warning("fi_mmapped wrapped around\n"); ++ while (!atomic_inc_return(&au_fi(f)->fi_mmapped)) ++ ; ++} ++ ++static inline void au_unset_mmapped(struct file *f) ++{ ++ atomic_dec(&au_fi(f)->fi_mmapped); ++} ++ ++static inline int au_test_mmapped(struct file *f) ++{ ++ return atomic_read(&au_fi(f)->fi_mmapped); ++} ++ ++/* customize vma->vm_file */ ++ ++static inline void au_do_vm_file_reset(struct vm_area_struct *vma, ++ struct file *file) ++{ ++ struct file *f; ++ ++ f = vma->vm_file; ++ get_file(file); ++ vma->vm_file = file; ++ fput(f); ++} ++ ++#ifdef CONFIG_MMU ++#define AuDbgVmRegion(file, vma) do {} while (0) ++ ++static inline void au_vm_file_reset(struct vm_area_struct *vma, ++ struct file *file) ++{ ++ au_do_vm_file_reset(vma, file); ++} ++#else ++#define AuDbgVmRegion(file, vma) \ ++ AuDebugOn((vma)->vm_region && (vma)->vm_region->vm_file != (file)) ++ ++static inline void au_vm_file_reset(struct vm_area_struct *vma, ++ struct file *file) ++{ ++ struct file *f; ++ ++ au_do_vm_file_reset(vma, file); ++ f = vma->vm_region->vm_file; ++ get_file(file); ++ vma->vm_region->vm_file = file; ++ fput(f); ++} ++#endif /* CONFIG_MMU */ ++ ++/* handle vma->vm_prfile */ ++static inline void au_vm_prfile_set(struct vm_area_struct *vma, ++ struct file *file) ++{ ++#ifdef CONFIG_AUFS_PROC_MAP ++ get_file(file); ++ vma->vm_prfile = file; ++#ifndef CONFIG_MMU ++ get_file(file); ++ vma->vm_region->vm_prfile = file; ++#endif ++#endif ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_FILE_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/finfo.c linux-3.2.0-gentoo-r1/fs/aufs/finfo.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/finfo.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/finfo.c 2012-01-17 12:11:24.689933591 +0100 +@@ -0,0 +1,156 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * file private data ++ */ ++ ++#include "aufs.h" ++ ++void au_hfput(struct au_hfile *hf, struct file *file) ++{ ++ /* todo: direct access f_flags */ ++ if (vfsub_file_flags(file) & __FMODE_EXEC) ++ allow_write_access(hf->hf_file); ++ fput(hf->hf_file); ++ hf->hf_file = NULL; ++ atomic_dec(&hf->hf_br->br_count); ++ hf->hf_br = NULL; ++} ++ ++void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, struct file *val) ++{ ++ struct au_finfo *finfo = au_fi(file); ++ struct au_hfile *hf; ++ struct au_fidir *fidir; ++ ++ fidir = finfo->fi_hdir; ++ if (!fidir) { ++ AuDebugOn(finfo->fi_btop != bindex); ++ hf = &finfo->fi_htop; ++ } else ++ hf = fidir->fd_hfile + bindex; ++ ++ if (hf && hf->hf_file) ++ au_hfput(hf, file); ++ if (val) { ++ FiMustWriteLock(file); ++ hf->hf_file = val; ++ hf->hf_br = au_sbr(file->f_dentry->d_sb, bindex); ++ } ++} ++ ++void au_update_figen(struct file *file) ++{ ++ atomic_set(&au_fi(file)->fi_generation, au_digen(file->f_dentry)); ++ /* smp_mb(); */ /* atomic_set */ ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_fidir *au_fidir_alloc(struct super_block *sb) ++{ ++ struct au_fidir *fidir; ++ int nbr; ++ ++ nbr = au_sbend(sb) + 1; ++ if (nbr < 2) ++ nbr = 2; /* initial allocate for 2 branches */ ++ fidir = kzalloc(au_fidir_sz(nbr), GFP_NOFS); ++ if (fidir) { ++ fidir->fd_bbot = -1; ++ fidir->fd_nent = nbr; ++ fidir->fd_vdir_cache = NULL; ++ } ++ ++ return fidir; ++} ++ ++int au_fidir_realloc(struct au_finfo *finfo, int nbr) ++{ ++ int err; ++ struct au_fidir *fidir, *p; ++ ++ AuRwMustWriteLock(&finfo->fi_rwsem); ++ fidir = finfo->fi_hdir; ++ AuDebugOn(!fidir); ++ ++ err = -ENOMEM; ++ p = au_kzrealloc(fidir, au_fidir_sz(fidir->fd_nent), au_fidir_sz(nbr), ++ GFP_NOFS); ++ if (p) { ++ p->fd_nent = nbr; ++ finfo->fi_hdir = p; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_finfo_fin(struct file *file) ++{ ++ struct au_finfo *finfo; ++ ++ au_nfiles_dec(file->f_dentry->d_sb); ++ ++ finfo = au_fi(file); ++ AuDebugOn(finfo->fi_hdir); ++ AuRwDestroy(&finfo->fi_rwsem); ++ au_cache_free_finfo(finfo); ++} ++ ++void au_fi_init_once(void *_finfo) ++{ ++ struct au_finfo *finfo = _finfo; ++ static struct lock_class_key aufs_fi; ++ ++ au_rw_init(&finfo->fi_rwsem); ++ au_rw_class(&finfo->fi_rwsem, &aufs_fi); ++} ++ ++int au_finfo_init(struct file *file, struct au_fidir *fidir) ++{ ++ int err, lc_idx; ++ struct au_finfo *finfo; ++ struct dentry *dentry; ++ ++ err = -ENOMEM; ++ dentry = file->f_dentry; ++ finfo = au_cache_alloc_finfo(); ++ if (unlikely(!finfo)) ++ goto out; ++ ++ err = 0; ++ au_nfiles_inc(dentry->d_sb); ++ lc_idx = AuLcNonDir_FIINFO; ++ if (fidir) ++ lc_idx = AuLcDir_FIINFO; ++ au_rw_class(&finfo->fi_rwsem, au_lc_key + lc_idx); ++ au_rw_write_lock(&finfo->fi_rwsem); ++ finfo->fi_btop = -1; ++ finfo->fi_hdir = fidir; ++ atomic_set(&finfo->fi_generation, au_digen(dentry)); ++ /* smp_mb(); */ /* atomic_set */ ++ ++ file->private_data = finfo; ++ ++out: ++ return err; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/f_op.c linux-3.2.0-gentoo-r1/fs/aufs/f_op.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/f_op.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/f_op.c 2012-01-17 
12:11:24.643636878 +0100 +@@ -0,0 +1,729 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * file and vm operations ++ */ ++ ++#include ++#include ++#include ++#include "aufs.h" ++ ++int au_do_open_nondir(struct file *file, int flags) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct file *h_file; ++ struct dentry *dentry; ++ struct au_finfo *finfo; ++ ++ FiMustWriteLock(file); ++ ++ dentry = file->f_dentry; ++ err = au_d_alive(dentry); ++ if (unlikely(err)) ++ goto out; ++ ++ finfo = au_fi(file); ++ memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop)); ++ atomic_set(&finfo->fi_mmapped, 0); ++ bindex = au_dbstart(dentry); ++ h_file = au_h_open(dentry, bindex, flags, file); ++ if (IS_ERR(h_file)) ++ err = PTR_ERR(h_file); ++ else { ++ au_set_fbstart(file, bindex); ++ au_set_h_fptr(file, bindex, h_file); ++ au_update_figen(file); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ } ++ ++out: ++ return err; ++} ++ ++static int aufs_open_nondir(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ int err; ++ struct super_block *sb; ++ ++ AuDbg("%.*s, f_flags 0x%x, f_mode 0x%x\n", ++ AuDLNPair(file->f_dentry), vfsub_file_flags(file), ++ file->f_mode); ++ ++ sb = file->f_dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_do_open(file, au_do_open_nondir, /*fidir*/NULL); ++ si_read_unlock(sb); ++ return err; ++} ++ ++int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file) ++{ ++ struct au_finfo *finfo; ++ aufs_bindex_t bindex; ++ ++ finfo = au_fi(file); ++ bindex = finfo->fi_btop; ++ if (bindex >= 0) { ++ /* remove me from sb->s_files */ ++ file_sb_list_del(file); ++ au_set_h_fptr(file, bindex, NULL); ++ } ++ ++ au_finfo_fin(file); ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_flush_nondir(struct file *file, fl_owner_t id) ++{ ++ int err; ++ struct file *h_file; ++ ++ err = 0; ++ h_file = au_hf_top(file); ++ if (h_file) ++ err = vfsub_flush(h_file, id); ++ return err; ++} ++ ++static int aufs_flush_nondir(struct file *file, fl_owner_t id) ++{ ++ return au_do_flush(file, id, au_do_flush_nondir); ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * read and write functions acquire [fdi]_rwsem once, but release before ++ * mmap_sem. This is to avoid a race condition with mmap(2). ++ * Releasing these aufs-rwsem should be safe, no branch-management (by keeping ++ * si_rwsem), no harmful copy-up should happen. Actually copy-up may happen in ++ * read functions after [fdi]_rwsem are released, but it should be harmless.
++ */ ++ ++static ssize_t aufs_read(struct file *file, char __user *buf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ struct dentry *dentry; ++ struct file *h_file; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ ++ /* filedata may be obsoleted by concurrent copyup, but no problem */ ++ err = vfsub_read_u(h_file, buf, count, ppos); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ /* update without lock, I don't think it a problem */ ++ fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); ++ fput(h_file); ++ ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ++ * todo: very ugly ++ * it locks both of i_mutex and si_rwsem for read in safe. ++ * if the plink maintenance mode continues forever (that is the problem), ++ * may loop forever. ++ */ ++static void au_mtx_and_read_lock(struct inode *inode) ++{ ++ int err; ++ struct super_block *sb = inode->i_sb; ++ ++ while (1) { ++ mutex_lock(&inode->i_mutex); ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (!err) ++ break; ++ mutex_unlock(&inode->i_mutex); ++ si_read_lock(sb, AuLock_NOPLMW); ++ si_read_unlock(sb); ++ } ++} ++ ++static ssize_t aufs_write(struct file *file, const char __user *ubuf, ++ size_t count, loff_t *ppos) ++{ ++ ssize_t err; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct super_block *sb; ++ struct inode *inode; ++ struct file *h_file; ++ char __user *buf = (char __user *)ubuf; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ au_mtx_and_read_lock(inode); ++ ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) { ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ goto out; ++ } ++ ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ au_unpin(&pin); ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ ++ err = vfsub_write_u(h_file, buf, count, ppos); ++ ii_write_lock_child(inode); ++ au_cpup_attr_timesizes(inode); ++ inode->i_mode = h_file->f_dentry->d_inode->i_mode; ++ ii_write_unlock(inode); ++ fput(h_file); ++ ++out: ++ si_read_unlock(sb); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++ ++static ssize_t au_do_aio(struct file *h_file, int rw, struct kiocb *kio, ++ const struct iovec *iov, unsigned long nv, loff_t pos) ++{ ++ ssize_t err; ++ struct file *file; ++ ssize_t (*func)(struct kiocb *, const struct iovec *, unsigned long, ++ loff_t); ++ ++ err = security_file_permission(h_file, rw); ++ if (unlikely(err)) ++ goto out; ++ ++ err = -ENOSYS; ++ func = NULL; ++ if (rw == MAY_READ) ++ func = h_file->f_op->aio_read; ++ else if (rw == MAY_WRITE) ++ func = h_file->f_op->aio_write; ++ if (func) { ++ file = kio->ki_filp; ++ kio->ki_filp = h_file; ++ lockdep_off(); ++ err = func(kio, iov, nv, pos); ++ lockdep_on(); ++ kio->ki_filp = file; ++ } else ++ /* currently there is no such fs */ ++ WARN_ON_ONCE(1); ++ ++out: ++ return err; ++} ++ ++static ssize_t aufs_aio_read(struct kiocb *kio, const struct iovec *iov, ++ unsigned long nv, loff_t pos) ++{ ++ ssize_t err; ++ struct file *file, *h_file; ++ struct dentry *dentry; ++ struct 
super_block *sb; ++ ++ file = kio->ki_filp; ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ ++ err = au_do_aio(h_file, MAY_READ, kio, iov, nv, pos); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ /* update without lock, I don't think it a problem */ ++ fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); ++ fput(h_file); ++ ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++static ssize_t aufs_aio_write(struct kiocb *kio, const struct iovec *iov, ++ unsigned long nv, loff_t pos) ++{ ++ ssize_t err; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct file *file, *h_file; ++ struct super_block *sb; ++ ++ file = kio->ki_filp; ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ au_mtx_and_read_lock(inode); ++ ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) { ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ goto out; ++ } ++ ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ au_unpin(&pin); ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ ++ err = au_do_aio(h_file, MAY_WRITE, kio, iov, nv, pos); ++ ii_write_lock_child(inode); ++ au_cpup_attr_timesizes(inode); ++ inode->i_mode = h_file->f_dentry->d_inode->i_mode; ++ ii_write_unlock(inode); ++ fput(h_file); ++ ++out: ++ si_read_unlock(sb); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++ ++static ssize_t aufs_splice_read(struct file *file, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) ++{ ++ ssize_t err; ++ struct file *h_file; ++ struct dentry *dentry; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ err = -EINVAL; ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ if (au_test_loopback_kthread()) { ++ au_warn_loopback(h_file->f_dentry->d_sb); ++ if (file->f_mapping != h_file->f_mapping) { ++ file->f_mapping = h_file->f_mapping; ++ smp_mb(); /* unnecessary? */ ++ } ++ } ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ ++ err = vfsub_splice_to(h_file, ppos, pipe, len, flags); ++ /* todo: necessasry? 
*/ ++ /* file->f_ra = h_file->f_ra; */ ++ /* update without lock, I don't think it a problem */ ++ fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); ++ fput(h_file); ++ ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++static ssize_t ++aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos, ++ size_t len, unsigned int flags) ++{ ++ ssize_t err; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct file *h_file; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ au_mtx_and_read_lock(inode); ++ ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) { ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ goto out; ++ } ++ ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ au_unpin(&pin); ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ ++ err = vfsub_splice_from(pipe, h_file, ppos, len, flags); ++ ii_write_lock_child(inode); ++ au_cpup_attr_timesizes(inode); ++ inode->i_mode = h_file->f_dentry->d_inode->i_mode; ++ ii_write_unlock(inode); ++ fput(h_file); ++ ++out: ++ si_read_unlock(sb); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * The locking order around current->mmap_sem. ++ * - in most and regular cases ++ * file I/O syscall -- aufs_read() or something ++ * -- si_rwsem for read -- mmap_sem ++ * (Note that [fdi]i_rwsem are released before mmap_sem). ++ * - in mmap case ++ * mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem ++ * This AB-BA order is definitely bad, but is not a problem since "si_rwsem for ++ * read" allows multiple processes to acquire it and [fdi]i_rwsem are not held in ++ * file I/O. Aufs needs to stop lockdep in aufs_mmap() though. ++ * It means that when aufs acquires si_rwsem for write, the process should never ++ * acquire mmap_sem. ++ * ++ * Actually aufs_readdir() holds [fdi]i_rwsem before mmap_sem, but this is not a ++ * problem either since any directory is not able to be mmap-ed. ++ * A similar scenario applies to aufs_readlink() too. ++ */ ++ ++/* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */ ++#define AuConv_VM_PROT(f, b) _calc_vm_trans(f, VM_##b, PROT_##b) ++ ++static unsigned long au_arch_prot_conv(unsigned long flags) ++{ ++ /* currently ppc64 only */ ++#ifdef CONFIG_PPC64 ++ /* cf. linux/arch/powerpc/include/asm/mman.h */ ++ AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO); ++ return AuConv_VM_PROT(flags, SAO); ++#else ++ AuDebugOn(arch_calc_vm_prot_bits(-1)); ++ return 0; ++#endif ++} ++ ++static unsigned long au_prot_conv(unsigned long flags) ++{ ++ return AuConv_VM_PROT(flags, READ) ++ | AuConv_VM_PROT(flags, WRITE) ++ | AuConv_VM_PROT(flags, EXEC) ++ | au_arch_prot_conv(flags); ++} ++ ++/* cf.
linux/include/linux/mman.h: calc_vm_flag_bits() */ ++#define AuConv_VM_MAP(f, b) _calc_vm_trans(f, VM_##b, MAP_##b) ++ ++static unsigned long au_flag_conv(unsigned long flags) ++{ ++ return AuConv_VM_MAP(flags, GROWSDOWN) ++ | AuConv_VM_MAP(flags, DENYWRITE) ++ | AuConv_VM_MAP(flags, EXECUTABLE) ++ | AuConv_VM_MAP(flags, LOCKED); ++} ++ ++static int aufs_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ int err; ++ unsigned long prot; ++ aufs_bindex_t bstart; ++ const unsigned char wlock ++ = (file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED); ++ struct dentry *dentry; ++ struct super_block *sb; ++ struct file *h_file; ++ struct au_branch *br; ++ struct au_pin pin; ++ ++ AuDbgVmRegion(file, vma); ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ lockdep_off(); ++ si_read_lock(sb, AuLock_NOPLMW); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ if (wlock) { ++ err = au_ready_to_write(file, -1, &pin); ++ di_write_unlock(dentry); ++ if (unlikely(err)) { ++ fi_write_unlock(file); ++ goto out; ++ } ++ au_unpin(&pin); ++ } else ++ di_write_unlock(dentry); ++ ++ bstart = au_fbstart(file); ++ br = au_sbr(sb, bstart); ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ au_set_mmapped(file); ++ fi_write_unlock(file); ++ lockdep_on(); ++ ++ au_vm_file_reset(vma, h_file); ++ prot = au_prot_conv(vma->vm_flags); ++ err = security_file_mmap(h_file, /*reqprot*/prot, prot, ++ au_flag_conv(vma->vm_flags), vma->vm_start, 0); ++ if (!err) ++ err = h_file->f_op->mmap(h_file, vma); ++ if (unlikely(err)) ++ goto out_reset; ++ ++ au_vm_prfile_set(vma, file); ++ /* update without lock, I don't think it a problem */ ++ fsstack_copy_attr_atime(file->f_dentry->d_inode, ++ h_file->f_dentry->d_inode); ++ goto out_fput; /* success */ ++ ++out_reset: ++ au_unset_mmapped(file); ++ au_vm_file_reset(vma, file); ++out_fput: ++ fput(h_file); ++ lockdep_off(); ++out: ++ si_read_unlock(sb); ++ lockdep_on(); ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_fsync_nondir(struct file *file, loff_t start, loff_t end, ++ int datasync) ++{ ++ int err; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct file *h_file; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++ sb = dentry->d_sb; ++ mutex_lock(&inode->i_mutex); ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out; ++ ++ err = 0; /* -EBADF; */ /* posix? 
*/ ++ if (unlikely(!(file->f_mode & FMODE_WRITE))) ++ goto out_si; ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out_si; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ au_unpin(&pin); ++ ++ err = -EINVAL; ++ h_file = au_hf_top(file); ++ err = vfsub_fsync(h_file, &h_file->f_path, datasync); ++ au_cpup_attr_timesizes(inode); ++ ++out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++out_si: ++ si_read_unlock(sb); ++out: ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++ ++/* no one supports this operation, currently */ ++#if 0 ++static int aufs_aio_fsync_nondir(struct kiocb *kio, int datasync) ++{ ++ int err; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct file *file, *h_file; ++ ++ file = kio->ki_filp; ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++ au_mtx_and_read_lock(inode); ++ ++ err = 0; /* -EBADF; */ /* posix? */ ++ if (unlikely(!(file->f_mode & FMODE_WRITE))) ++ goto out; ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ au_unpin(&pin); ++ ++ err = -ENOSYS; ++ h_file = au_hf_top(file); ++ if (h_file->f_op && h_file->f_op->aio_fsync) { ++ struct dentry *h_d; ++ struct mutex *h_mtx; ++ ++ h_d = h_file->f_dentry; ++ h_mtx = &h_d->d_inode->i_mutex; ++ if (!is_sync_kiocb(kio)) { ++ get_file(h_file); ++ fput(file); ++ } ++ kio->ki_filp = h_file; ++ err = h_file->f_op->aio_fsync(kio, datasync); ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ if (!err) ++ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); ++ /*ignore*/ ++ au_cpup_attr_timesizes(inode); ++ mutex_unlock(h_mtx); ++ } ++ ++out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++out: ++ si_read_unlock(inode->sb); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++#endif ++ ++static int aufs_fasync(int fd, struct file *file, int flag) ++{ ++ int err; ++ struct file *h_file; ++ struct dentry *dentry; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ h_file = au_hf_top(file); ++ if (h_file->f_op && h_file->f_op->fasync) ++ err = h_file->f_op->fasync(fd, h_file, flag); ++ ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* no one supports this operation, currently */ ++#if 0 ++static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset, ++ size_t len, loff_t *pos , int more) ++{ ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++const struct file_operations aufs_file_fop = { ++ .owner = THIS_MODULE, ++ ++ .llseek = default_llseek, ++ ++ .read = aufs_read, ++ .write = aufs_write, ++ .aio_read = aufs_aio_read, ++ .aio_write = aufs_aio_write, ++#ifdef CONFIG_AUFS_POLL ++ .poll = aufs_poll, ++#endif ++ .unlocked_ioctl = aufs_ioctl_nondir, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = aufs_ioctl_nondir, /* same */ ++#endif ++ .mmap = aufs_mmap, ++ .open = aufs_open_nondir, ++ .flush = aufs_flush_nondir, ++ .release = 
aufs_release_nondir, ++ .fsync = aufs_fsync_nondir, ++ /* .aio_fsync = aufs_aio_fsync_nondir, */ ++ .fasync = aufs_fasync, ++ /* .sendpage = aufs_sendpage, */ ++ .splice_write = aufs_splice_write, ++ .splice_read = aufs_splice_read, ++#if 0 ++ .aio_splice_write = aufs_aio_splice_write, ++ .aio_splice_read = aufs_aio_splice_read ++#endif ++}; +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/f_op_sp.c linux-3.2.0-gentoo-r1/fs/aufs/f_op_sp.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/f_op_sp.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/f_op_sp.c 2012-01-17 12:11:24.652896221 +0100 +@@ -0,0 +1,298 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * file operations for special files. ++ * while they exist in aufs virtually, ++ * their file I/O is handled out of aufs. ++ */ ++ ++#include "aufs.h" ++ ++static ssize_t aufs_aio_read_sp(struct kiocb *kio, const struct iovec *iov, ++ unsigned long nv, loff_t pos) ++{ ++ ssize_t err; ++ aufs_bindex_t bstart; ++ unsigned char wbr; ++ struct file *file, *h_file; ++ struct super_block *sb; ++ ++ file = kio->ki_filp; ++ sb = file->f_dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ fi_read_lock(file); ++ bstart = au_fbstart(file); ++ h_file = au_hf_top(file); ++ fi_read_unlock(file); ++ wbr = !!au_br_writable(au_sbr(sb, bstart)->br_perm); ++ si_read_unlock(sb); ++ ++ /* do not change the file in kio */ ++ AuDebugOn(!h_file->f_op || !h_file->f_op->aio_read); ++ err = h_file->f_op->aio_read(kio, iov, nv, pos); ++ if (err > 0 && wbr) ++ file_accessed(h_file); ++ ++ return err; ++} ++ ++static ssize_t aufs_aio_write_sp(struct kiocb *kio, const struct iovec *iov, ++ unsigned long nv, loff_t pos) ++{ ++ ssize_t err; ++ aufs_bindex_t bstart; ++ unsigned char wbr; ++ struct super_block *sb; ++ struct file *file, *h_file; ++ ++ file = kio->ki_filp; ++ sb = file->f_dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ fi_read_lock(file); ++ bstart = au_fbstart(file); ++ h_file = au_hf_top(file); ++ fi_read_unlock(file); ++ wbr = !!au_br_writable(au_sbr(sb, bstart)->br_perm); ++ si_read_unlock(sb); ++ ++ /* do not change the file in kio */ ++ AuDebugOn(!h_file->f_op || !h_file->f_op->aio_write); ++ err = h_file->f_op->aio_write(kio, iov, nv, pos); ++ if (err > 0 && wbr) ++ file_update_time(h_file); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_release_sp(struct inode *inode, struct file *file) ++{ ++ int err; ++ struct file *h_file; ++ ++ fi_read_lock(file); ++ h_file = au_hf_top(file); ++ fi_read_unlock(file); ++ /* close this fifo in aufs */ ++ err = h_file->f_op->release(inode, file); /* ignore */ ++ aufs_release_nondir(inode, file); /* ignore */ ++ return err; ++} ++ ++/* 
---------------------------------------------------------------------- */ ++ ++/* currently, support only FIFO */ ++enum { ++ AuSp_FIFO, AuSp_FIFO_R, AuSp_FIFO_W, AuSp_FIFO_RW, ++ /* AuSp_SOCK, AuSp_CHR, AuSp_BLK, */ ++ AuSp_Last ++}; ++static int aufs_open_sp(struct inode *inode, struct file *file); ++static struct au_sp_fop { ++ int done; ++ struct file_operations fop; /* not 'const' */ ++ spinlock_t spin; ++} au_sp_fop[AuSp_Last] = { ++ [AuSp_FIFO] = { ++ .fop = { ++ .owner = THIS_MODULE, ++ .open = aufs_open_sp ++ } ++ } ++}; ++ ++static void au_init_fop_sp(struct file *file) ++{ ++ struct au_sp_fop *p; ++ int i; ++ struct file *h_file; ++ ++ p = au_sp_fop; ++ if (unlikely(!p->done)) { ++ /* initialize first time only */ ++ static DEFINE_SPINLOCK(spin); ++ ++ spin_lock(&spin); ++ if (!p->done) { ++ BUILD_BUG_ON(sizeof(au_sp_fop)/sizeof(*au_sp_fop) ++ != AuSp_Last); ++ for (i = 0; i < AuSp_Last; i++) ++ spin_lock_init(&p[i].spin); ++ p->done = 1; ++ } ++ spin_unlock(&spin); ++ } ++ ++ switch (file->f_mode & (FMODE_READ | FMODE_WRITE)) { ++ case FMODE_READ: ++ i = AuSp_FIFO_R; ++ break; ++ case FMODE_WRITE: ++ i = AuSp_FIFO_W; ++ break; ++ case FMODE_READ | FMODE_WRITE: ++ i = AuSp_FIFO_RW; ++ break; ++ default: ++ BUG(); ++ } ++ ++ p += i; ++ if (unlikely(!p->done)) { ++ /* initialize first time only */ ++ h_file = au_hf_top(file); ++ spin_lock(&p->spin); ++ if (!p->done) { ++ p->fop = *h_file->f_op; ++ p->fop.owner = THIS_MODULE; ++ if (p->fop.aio_read) ++ p->fop.aio_read = aufs_aio_read_sp; ++ if (p->fop.aio_write) ++ p->fop.aio_write = aufs_aio_write_sp; ++ p->fop.release = aufs_release_sp; ++ p->done = 1; ++ } ++ spin_unlock(&p->spin); ++ } ++ file->f_op = &p->fop; ++} ++ ++static int au_cpup_sp(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bcpup; ++ struct au_pin pin; ++ struct au_wr_dir_args wr_dir_args = { ++ .force_btgt = -1, ++ .flags = 0 ++ }; ++ ++ AuDbg("%.*s\n", AuDLNPair(dentry)); ++ ++ di_read_unlock(dentry, AuLock_IR); ++ di_write_lock_child(dentry); ++ err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); ++ if (unlikely(err < 0)) ++ goto out; ++ bcpup = err; ++ err = 0; ++ if (bcpup == au_dbstart(dentry)) ++ goto out; /* success */ ++ ++ err = au_pin(&pin, dentry, bcpup, au_opt_udba(dentry->d_sb), ++ AuPin_MNT_WRITE); ++ if (!err) { ++ err = au_sio_cpup_simple(dentry, bcpup, -1, AuCpup_DTIME); ++ au_unpin(&pin); ++ } ++ ++out: ++ di_downgrade_lock(dentry, AuLock_IR); ++ return err; ++} ++ ++static int au_do_open_sp(struct file *file, int flags) ++{ ++ int err; ++ struct dentry *dentry; ++ struct super_block *sb; ++ struct file *h_file; ++ struct inode *h_inode; ++ ++ dentry = file->f_dentry; ++ AuDbg("%.*s\n", AuDLNPair(dentry)); ++ ++ /* ++ * try copying-up. ++ * operate on the ro branch is not an error. 
++ */ ++ au_cpup_sp(dentry); /* ignore */ ++ ++ /* prepare h_file */ ++ err = au_do_open_nondir(file, vfsub_file_flags(file)); ++ if (unlikely(err)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ h_file = au_hf_top(file); ++ h_inode = h_file->f_dentry->d_inode; ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ si_read_unlock(sb); ++ /* open this fifo in aufs */ ++ err = h_inode->i_fop->open(file->f_dentry->d_inode, file); ++ si_noflush_read_lock(sb); ++ fi_write_lock(file); ++ di_read_lock_child(dentry, AuLock_IR); ++ if (!err) ++ au_init_fop_sp(file); ++ ++out: ++ return err; ++} ++ ++static int aufs_open_sp(struct inode *inode, struct file *file) ++{ ++ int err; ++ struct super_block *sb; ++ ++ sb = file->f_dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_do_open(file, au_do_open_sp, /*fidir*/NULL); ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_init_special_fop(struct inode *inode, umode_t mode, dev_t rdev) ++{ ++ init_special_inode(inode, mode, rdev); ++ ++ switch (mode & S_IFMT) { ++ case S_IFIFO: ++ inode->i_fop = &au_sp_fop[AuSp_FIFO].fop; ++ /*FALLTHROUGH*/ ++ case S_IFCHR: ++ case S_IFBLK: ++ case S_IFSOCK: ++ break; ++ default: ++ AuDebugOn(1); ++ } ++} ++ ++int au_special_file(umode_t mode) ++{ ++ int ret; ++ ++ ret = 0; ++ switch (mode & S_IFMT) { ++ case S_IFIFO: ++#if 0 ++ case S_IFCHR: ++ case S_IFBLK: ++ case S_IFSOCK: ++#endif ++ ret = 1; ++ } ++ ++ return ret; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/fstype.h linux-3.2.0-gentoo-r1/fs/aufs/fstype.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/fstype.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/fstype.h 2012-01-17 12:11:24.701507771 +0100 +@@ -0,0 +1,496 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * judging filesystem type ++ */ ++ ++#ifndef __AUFS_FSTYPE_H__ ++#define __AUFS_FSTYPE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/fs.h> ++#include <linux/magic.h> ++#include <linux/romfs_fs.h> ++ ++static inline int au_test_aufs(struct super_block *sb) ++{ ++ return sb->s_magic == AUFS_SUPER_MAGIC; ++} ++ ++static inline const char *au_sbtype(struct super_block *sb) ++{ ++ return sb->s_type->name; ++} ++ ++static inline int au_test_iso9660(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_ISO9660_FS) || defined(CONFIG_ISO9660_FS_MODULE) ++ return sb->s_magic == ISOFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_romfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_ROMFS_FS) || defined(CONFIG_ROMFS_FS_MODULE) ++ return sb->s_magic == ROMFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_cramfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_CRAMFS) || defined(CONFIG_CRAMFS_MODULE) ++ return sb->s_magic == CRAMFS_MAGIC; ++#endif ++ return 0; ++} ++ ++static inline int au_test_nfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE) ++ return sb->s_magic == NFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_fuse(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE) ++ return sb->s_magic == FUSE_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_xfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_XFS_FS) || defined(CONFIG_XFS_FS_MODULE) ++ return sb->s_magic == XFS_SB_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_tmpfs(struct super_block *sb __maybe_unused) ++{ ++#ifdef CONFIG_TMPFS ++ return sb->s_magic == TMPFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_ecryptfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_ECRYPT_FS) || defined(CONFIG_ECRYPT_FS_MODULE) ++ return !strcmp(au_sbtype(sb), "ecryptfs"); ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_smbfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_SMB_FS) || defined(CONFIG_SMB_FS_MODULE) ++ return sb->s_magic == SMB_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_ocfs2(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_OCFS2_FS) || defined(CONFIG_OCFS2_FS_MODULE) ++ return sb->s_magic == OCFS2_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_ocfs2_dlmfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_OCFS2_FS_O2CB) || defined(CONFIG_OCFS2_FS_O2CB_MODULE) ++ return sb->s_magic == DLMFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_coda(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_CODA_FS) || defined(CONFIG_CODA_FS_MODULE) ++ return sb->s_magic == CODA_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_v9fs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_9P_FS) || defined(CONFIG_9P_FS_MODULE) ++ return sb->s_magic == V9FS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_ext4(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_EXT4DEV_FS) || 
defined(CONFIG_EXT4DEV_FS_MODULE) ++ return sb->s_magic == EXT4_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_sysv(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_SYSV_FS) || defined(CONFIG_SYSV_FS_MODULE) ++ return !strcmp(au_sbtype(sb), "sysv"); ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_ramfs(struct super_block *sb) ++{ ++ return sb->s_magic == RAMFS_MAGIC; ++} ++ ++static inline int au_test_ubifs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_UBIFS_FS) || defined(CONFIG_UBIFS_FS_MODULE) ++ return sb->s_magic == UBIFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_procfs(struct super_block *sb __maybe_unused) ++{ ++#ifdef CONFIG_PROC_FS ++ return sb->s_magic == PROC_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_sysfs(struct super_block *sb __maybe_unused) ++{ ++#ifdef CONFIG_SYSFS ++ return sb->s_magic == SYSFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_configfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_CONFIGFS_FS) || defined(CONFIG_CONFIGFS_FS_MODULE) ++ return sb->s_magic == CONFIGFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_minix(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_MINIX_FS) || defined(CONFIG_MINIX_FS_MODULE) ++ return sb->s_magic == MINIX3_SUPER_MAGIC ++ || sb->s_magic == MINIX2_SUPER_MAGIC ++ || sb->s_magic == MINIX2_SUPER_MAGIC2 ++ || sb->s_magic == MINIX_SUPER_MAGIC ++ || sb->s_magic == MINIX_SUPER_MAGIC2; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_cifs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_CIFS_FS) || defined(CONFIGCIFS_FS_MODULE) ++ return sb->s_magic == CIFS_MAGIC_NUMBER; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_fat(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_FAT_FS) || defined(CONFIG_FAT_FS_MODULE) ++ return sb->s_magic == MSDOS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_msdos(struct super_block *sb) ++{ ++ return au_test_fat(sb); ++} ++ ++static inline int au_test_vfat(struct super_block *sb) ++{ ++ return au_test_fat(sb); ++} ++ ++static inline int au_test_securityfs(struct super_block *sb __maybe_unused) ++{ ++#ifdef CONFIG_SECURITYFS ++ return sb->s_magic == SECURITYFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_squashfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_SQUASHFS) || defined(CONFIG_SQUASHFS_MODULE) ++ return sb->s_magic == SQUASHFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_btrfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_BTRFS_FS) || defined(CONFIG_BTRFS_FS_MODULE) ++ return sb->s_magic == BTRFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_xenfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_XENFS) || defined(CONFIG_XENFS_MODULE) ++ return sb->s_magic == XENFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_debugfs(struct super_block *sb __maybe_unused) ++{ ++#ifdef CONFIG_DEBUG_FS ++ return sb->s_magic == DEBUGFS_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int au_test_nilfs(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_NILFS) || defined(CONFIG_NILFS_MODULE) ++ return sb->s_magic == NILFS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++static inline int 
au_test_hfsplus(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_HFSPLUS_FS) || defined(CONFIG_HFSPLUS_FS_MODULE) ++ return sb->s_magic == HFSPLUS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * they can't be an aufs branch. ++ */ ++static inline int au_test_fs_unsuppoted(struct super_block *sb) ++{ ++ return ++#ifndef CONFIG_AUFS_BR_RAMFS ++ au_test_ramfs(sb) || ++#endif ++ au_test_procfs(sb) ++ || au_test_sysfs(sb) ++ || au_test_configfs(sb) ++ || au_test_debugfs(sb) ++ || au_test_securityfs(sb) ++ || au_test_xenfs(sb) ++ || au_test_ecryptfs(sb) ++ /* || !strcmp(au_sbtype(sb), "unionfs") */ ++ || au_test_aufs(sb); /* will be supported in next version */ ++} ++ ++/* ++ * If the filesystem supports NFS-export, then it has to support NULL as ++ * a nameidata parameter for ->create(), ->lookup() and ->d_revalidate(). ++ * We can apply this principle when we handle a lower filesystem. ++ */ ++static inline int au_test_fs_null_nd(struct super_block *sb) ++{ ++ return !!sb->s_export_op; ++} ++ ++static inline int au_test_fs_remote(struct super_block *sb) ++{ ++ return !au_test_tmpfs(sb) ++#ifdef CONFIG_AUFS_BR_RAMFS ++ && !au_test_ramfs(sb) ++#endif ++ && !(sb->s_type->fs_flags & FS_REQUIRES_DEV); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * Note: these functions (below) are created after reading ->getattr() in all ++ * filesystems under linux/fs. it means we have to do so in every update... ++ */ ++ ++/* ++ * some filesystems require getattr to refresh the inode attributes before ++ * referencing. ++ * in most cases, we can rely on the inode attribute in NFS (or every remote fs) ++ * and leave the work for d_revalidate() ++ */ ++static inline int au_test_fs_refresh_iattr(struct super_block *sb) ++{ ++ return au_test_nfs(sb) ++ || au_test_fuse(sb) ++ /* || au_test_smbfs(sb) */ /* untested */ ++ /* || au_test_ocfs2(sb) */ /* untested */ ++ /* || au_test_btrfs(sb) */ /* untested */ ++ /* || au_test_coda(sb) */ /* untested */ ++ /* || au_test_v9fs(sb) */ /* untested */ ++ ; ++} ++ ++/* ++ * filesystems which don't maintain i_size or i_blocks. ++ */ ++static inline int au_test_fs_bad_iattr_size(struct super_block *sb) ++{ ++ return au_test_xfs(sb) ++ || au_test_btrfs(sb) ++ || au_test_ubifs(sb) ++ || au_test_hfsplus(sb) /* maintained, but incorrect */ ++ /* || au_test_ext4(sb) */ /* untested */ ++ /* || au_test_ocfs2(sb) */ /* untested */ ++ /* || au_test_ocfs2_dlmfs(sb) */ /* untested */ ++ /* || au_test_sysv(sb) */ /* untested */ ++ /* || au_test_minix(sb) */ /* untested */ ++ ; ++} ++ ++/* ++ * filesystems which don't store the correct value in some of their inode ++ * attributes. ++ */ ++static inline int au_test_fs_bad_iattr(struct super_block *sb) ++{ ++ return au_test_fs_bad_iattr_size(sb) ++ /* || au_test_cifs(sb) */ /* untested */ ++ || au_test_fat(sb) ++ || au_test_msdos(sb) ++ || au_test_vfat(sb); ++} ++ ++/* they don't check i_nlink in link(2) */ ++static inline int au_test_fs_no_limit_nlink(struct super_block *sb) ++{ ++ return au_test_tmpfs(sb) ++#ifdef CONFIG_AUFS_BR_RAMFS ++ || au_test_ramfs(sb) ++#endif ++ || au_test_ubifs(sb) ++ || au_test_btrfs(sb) ++ || au_test_hfsplus(sb); ++} ++ ++/* ++ * filesystems which sets S_NOATIME and S_NOCMTIME. 
++ */ ++static inline int au_test_fs_notime(struct super_block *sb) ++{ ++ return au_test_nfs(sb) ++ || au_test_fuse(sb) ++ || au_test_ubifs(sb) ++ /* || au_test_cifs(sb) */ /* untested */ ++ ; ++} ++ ++/* ++ * filesystems which requires replacing i_mapping. ++ */ ++static inline int au_test_fs_bad_mapping(struct super_block *sb) ++{ ++ return au_test_fuse(sb) ++ || au_test_ubifs(sb); ++} ++ ++/* temporary support for i#1 in cramfs */ ++static inline int au_test_fs_unique_ino(struct inode *inode) ++{ ++ if (au_test_cramfs(inode->i_sb)) ++ return inode->i_ino != 1; ++ return 1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * the filesystem where the xino files placed must support i/o after unlink and ++ * maintain i_size and i_blocks. ++ */ ++static inline int au_test_fs_bad_xino(struct super_block *sb) ++{ ++ return au_test_fs_remote(sb) ++ || au_test_fs_bad_iattr_size(sb) ++#ifdef CONFIG_AUFS_BR_RAMFS ++ || !(au_test_ramfs(sb) || au_test_fs_null_nd(sb)) ++#else ++ || !au_test_fs_null_nd(sb) /* to keep xino code simple */ ++#endif ++ /* don't want unnecessary work for xino */ ++ || au_test_aufs(sb) ++ || au_test_ecryptfs(sb) ++ || au_test_nilfs(sb); ++} ++ ++static inline int au_test_fs_trunc_xino(struct super_block *sb) ++{ ++ return au_test_tmpfs(sb) ++ || au_test_ramfs(sb); ++} ++ ++/* ++ * test if the @sb is real-readonly. ++ */ ++static inline int au_test_fs_rr(struct super_block *sb) ++{ ++ return au_test_squashfs(sb) ++ || au_test_iso9660(sb) ++ || au_test_cramfs(sb) ++ || au_test_romfs(sb); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_FSTYPE_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/hfsnotify.c linux-3.2.0-gentoo-r1/fs/aufs/hfsnotify.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/hfsnotify.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/hfsnotify.c 2012-01-17 12:11:24.729285799 +0100 +@@ -0,0 +1,247 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * fsnotify for the lower directories ++ */ ++ ++#include "aufs.h" ++ ++/* FS_IN_IGNORED is unnecessary */ ++static const __u32 AuHfsnMask = (FS_MOVED_TO | FS_MOVED_FROM | FS_DELETE ++ | FS_CREATE | FS_EVENT_ON_CHILD); ++static DECLARE_WAIT_QUEUE_HEAD(au_hfsn_wq); ++ ++static void au_hfsn_free_mark(struct fsnotify_mark *mark) ++{ ++ struct au_hnotify *hn = container_of(mark, struct au_hnotify, ++ hn_mark); ++ AuDbg("here\n"); ++ hn->hn_mark_dead = 1; ++ smp_mb(); ++ wake_up_all(&au_hfsn_wq); ++} ++ ++static int au_hfsn_alloc(struct au_hinode *hinode) ++{ ++ struct au_hnotify *hn; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct fsnotify_mark *mark; ++ aufs_bindex_t bindex; ++ ++ hn = hinode->hi_notify; ++ sb = hn->hn_aufs_inode->i_sb; ++ bindex = au_br_index(sb, hinode->hi_id); ++ br = au_sbr(sb, bindex); ++ hn->hn_mark_dead = 0; ++ mark = &hn->hn_mark; ++ fsnotify_init_mark(mark, au_hfsn_free_mark); ++ mark->mask = AuHfsnMask; ++ /* ++ * by udba rename or rmdir, aufs assign a new inode to the known ++ * h_inode, so specify 1 to allow dups. ++ */ ++ return fsnotify_add_mark(mark, br->br_hfsn_group, hinode->hi_inode, ++ /*mnt*/NULL, /*allow_dups*/1); ++} ++ ++static void au_hfsn_free(struct au_hinode *hinode) ++{ ++ struct au_hnotify *hn; ++ struct fsnotify_mark *mark; ++ ++ hn = hinode->hi_notify; ++ mark = &hn->hn_mark; ++ fsnotify_destroy_mark(mark); ++ fsnotify_put_mark(mark); ++ ++ /* TODO: bad approach */ ++ wait_event(au_hfsn_wq, hn->hn_mark_dead); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_hfsn_ctl(struct au_hinode *hinode, int do_set) ++{ ++ struct fsnotify_mark *mark; ++ ++ mark = &hinode->hi_notify->hn_mark; ++ spin_lock(&mark->lock); ++ if (do_set) { ++ AuDebugOn(mark->mask & AuHfsnMask); ++ mark->mask |= AuHfsnMask; ++ } else { ++ AuDebugOn(!(mark->mask & AuHfsnMask)); ++ mark->mask &= ~AuHfsnMask; ++ } ++ spin_unlock(&mark->lock); ++ /* fsnotify_recalc_inode_mask(hinode->hi_inode); */ ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* #define AuDbgHnotify */ ++#ifdef AuDbgHnotify ++static char *au_hfsn_name(u32 mask) ++{ ++#ifdef CONFIG_AUFS_DEBUG ++#define test_ret(flag) if (mask & flag) \ ++ return #flag; ++ test_ret(FS_ACCESS); ++ test_ret(FS_MODIFY); ++ test_ret(FS_ATTRIB); ++ test_ret(FS_CLOSE_WRITE); ++ test_ret(FS_CLOSE_NOWRITE); ++ test_ret(FS_OPEN); ++ test_ret(FS_MOVED_FROM); ++ test_ret(FS_MOVED_TO); ++ test_ret(FS_CREATE); ++ test_ret(FS_DELETE); ++ test_ret(FS_DELETE_SELF); ++ test_ret(FS_MOVE_SELF); ++ test_ret(FS_UNMOUNT); ++ test_ret(FS_Q_OVERFLOW); ++ test_ret(FS_IN_IGNORED); ++ test_ret(FS_IN_ISDIR); ++ test_ret(FS_IN_ONESHOT); ++ test_ret(FS_EVENT_ON_CHILD); ++ return ""; ++#undef test_ret ++#else ++ return "??"; ++#endif ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_hfsn_handle_event(struct fsnotify_group *group, ++ struct fsnotify_mark *inode_mark, ++ struct fsnotify_mark *vfsmount_mark, ++ struct fsnotify_event *event) ++{ ++ int err; ++ struct au_hnotify *hnotify; ++ struct inode *h_dir, *h_inode; ++ __u32 mask; ++ struct qstr h_child_qstr = { ++ .name = event->file_name, ++ .len = event->name_len ++ }; ++ ++ AuDebugOn(event->data_type != 
FSNOTIFY_EVENT_INODE); ++ ++ err = 0; ++ /* if FS_UNMOUNT happens, there must be another bug */ ++ mask = event->mask; ++ AuDebugOn(mask & FS_UNMOUNT); ++ if (mask & (FS_IN_IGNORED | FS_UNMOUNT)) ++ goto out; ++ ++ h_dir = event->to_tell; ++ h_inode = event->inode; ++#ifdef AuDbgHnotify ++ au_debug(1); ++ if (1 || h_child_qstr.len != sizeof(AUFS_XINO_FNAME) - 1 ++ || strncmp(h_child_qstr.name, AUFS_XINO_FNAME, h_child_qstr.len)) { ++ AuDbg("i%lu, mask 0x%x %s, hcname %.*s, hi%lu\n", ++ h_dir->i_ino, mask, au_hfsn_name(mask), ++ AuLNPair(&h_child_qstr), h_inode ? h_inode->i_ino : 0); ++ /* WARN_ON(1); */ ++ } ++ au_debug(0); ++#endif ++ ++ AuDebugOn(!inode_mark); ++ hnotify = container_of(inode_mark, struct au_hnotify, hn_mark); ++ err = au_hnotify(h_dir, hnotify, mask, &h_child_qstr, h_inode); ++ ++out: ++ return err; ++} ++ ++/* isn't it waste to ask every registered 'group'? */ ++/* copied from linux/fs/notify/inotify/inotify_fsnotiry.c */ ++/* it should be exported to modules */ ++static bool au_hfsn_should_send_event(struct fsnotify_group *group, ++ struct inode *h_inode, ++ struct fsnotify_mark *inode_mark, ++ struct fsnotify_mark *vfsmount_mark, ++ __u32 mask, void *data, int data_type) ++{ ++ mask = (mask & ~FS_EVENT_ON_CHILD); ++ return inode_mark->mask & mask; ++} ++ ++static struct fsnotify_ops au_hfsn_ops = { ++ .should_send_event = au_hfsn_should_send_event, ++ .handle_event = au_hfsn_handle_event ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_hfsn_fin_br(struct au_branch *br) ++{ ++ if (br->br_hfsn_group) ++ fsnotify_put_group(br->br_hfsn_group); ++} ++ ++static int au_hfsn_init_br(struct au_branch *br, int perm) ++{ ++ br->br_hfsn_group = NULL; ++ br->br_hfsn_ops = au_hfsn_ops; ++ return 0; ++} ++ ++static int au_hfsn_reset_br(unsigned int udba, struct au_branch *br, int perm) ++{ ++ int err; ++ ++ err = 0; ++ if (udba != AuOpt_UDBA_HNOTIFY ++ || !au_br_hnotifyable(perm)) { ++ au_hfsn_fin_br(br); ++ br->br_hfsn_group = NULL; ++ goto out; ++ } ++ ++ if (br->br_hfsn_group) ++ goto out; ++ ++ br->br_hfsn_group = fsnotify_alloc_group(&br->br_hfsn_ops); ++ if (IS_ERR(br->br_hfsn_group)) { ++ err = PTR_ERR(br->br_hfsn_group); ++ pr_err("fsnotify_alloc_group() failed, %d\n", err); ++ br->br_hfsn_group = NULL; ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++const struct au_hnotify_op au_hnotify_op = { ++ .ctl = au_hfsn_ctl, ++ .alloc = au_hfsn_alloc, ++ .free = au_hfsn_free, ++ ++ .reset_br = au_hfsn_reset_br, ++ .fin_br = au_hfsn_fin_br, ++ .init_br = au_hfsn_init_br ++}; +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/hfsplus.c linux-3.2.0-gentoo-r1/fs/aufs/hfsplus.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/hfsplus.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/hfsplus.c 2012-01-17 12:11:24.729285799 +0100 +@@ -0,0 +1,57 @@ ++/* ++ * Copyright (C) 2010-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * special support for filesystems which acquire an inode mutex ++ * at the final close of a file, e.g. hfsplus. ++ * ++ * This trick is very simple and stupid, just open the file before the really ++ * necessary open to tell hfsplus that this is not the final closing. ++ * The caller should call au_h_open_pre() after acquiring the inode mutex, ++ * and au_h_open_post() after releasing it. ++ */ ++ ++#include "aufs.h" ++ ++struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ struct file *h_file; ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ AuDebugOn(!h_dentry); ++ AuDebugOn(!h_dentry->d_inode); ++ IMustLock(h_dentry->d_inode); ++ ++ h_file = NULL; ++ if (au_test_hfsplus(h_dentry->d_sb) ++ && S_ISREG(h_dentry->d_inode->i_mode)) ++ h_file = au_h_open(dentry, bindex, ++ O_RDONLY | O_NOATIME | O_LARGEFILE, ++ /*file*/NULL); ++ return h_file; ++} ++ ++void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex, ++ struct file *h_file) ++{ ++ if (h_file) { ++ fput(h_file); ++ au_sbr_put(dentry->d_sb, bindex); ++ } ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/hnotify.c linux-3.2.0-gentoo-r1/fs/aufs/hnotify.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/hnotify.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/hnotify.c 2012-01-17 12:11:24.736230305 +0100 +@@ -0,0 +1,712 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * abstraction to notify the direct changes on lower directories ++ */ ++ ++#include "aufs.h" ++ ++int au_hn_alloc(struct au_hinode *hinode, struct inode *inode) ++{ ++ int err; ++ struct au_hnotify *hn; ++ ++ err = -ENOMEM; ++ hn = au_cache_alloc_hnotify(); ++ if (hn) { ++ hn->hn_aufs_inode = inode; ++ hinode->hi_notify = hn; ++ err = au_hnotify_op.alloc(hinode); ++ AuTraceErr(err); ++ if (unlikely(err)) { ++ hinode->hi_notify = NULL; ++ au_cache_free_hnotify(hn); ++ /* ++ * The upper dir was removed by udba, but the same named ++ * dir is left. In this case, aufs assigns a new inode ++ * number and sets the monitor again. ++ * For the lower dir, the old monitor is still left. 
++ */ ++ if (err == -EEXIST) ++ err = 0; ++ } ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++void au_hn_free(struct au_hinode *hinode) ++{ ++ struct au_hnotify *hn; ++ ++ hn = hinode->hi_notify; ++ if (hn) { ++ au_hnotify_op.free(hinode); ++ au_cache_free_hnotify(hn); ++ hinode->hi_notify = NULL; ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_hn_ctl(struct au_hinode *hinode, int do_set) ++{ ++ if (hinode->hi_notify) ++ au_hnotify_op.ctl(hinode, do_set); ++} ++ ++void au_hn_reset(struct inode *inode, unsigned int flags) ++{ ++ aufs_bindex_t bindex, bend; ++ struct inode *hi; ++ struct dentry *iwhdentry; ++ ++ bend = au_ibend(inode); ++ for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { ++ hi = au_h_iptr(inode, bindex); ++ if (!hi) ++ continue; ++ ++ /* mutex_lock_nested(&hi->i_mutex, AuLsc_I_CHILD); */ ++ iwhdentry = au_hi_wh(inode, bindex); ++ if (iwhdentry) ++ dget(iwhdentry); ++ au_igrab(hi); ++ au_set_h_iptr(inode, bindex, NULL, 0); ++ au_set_h_iptr(inode, bindex, au_igrab(hi), ++ flags & ~AuHi_XINO); ++ iput(hi); ++ dput(iwhdentry); ++ /* mutex_unlock(&hi->i_mutex); */ ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int hn_xino(struct inode *inode, struct inode *h_inode) ++{ ++ int err; ++ aufs_bindex_t bindex, bend, bfound, bstart; ++ struct inode *h_i; ++ ++ err = 0; ++ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { ++ pr_warning("branch root dir was changed\n"); ++ goto out; ++ } ++ ++ bfound = -1; ++ bend = au_ibend(inode); ++ bstart = au_ibstart(inode); ++#if 0 /* reserved for future use */ ++ if (bindex == bend) { ++ /* keep this ino in rename case */ ++ goto out; ++ } ++#endif ++ for (bindex = bstart; bindex <= bend; bindex++) ++ if (au_h_iptr(inode, bindex) == h_inode) { ++ bfound = bindex; ++ break; ++ } ++ if (bfound < 0) ++ goto out; ++ ++ for (bindex = bstart; bindex <= bend; bindex++) { ++ h_i = au_h_iptr(inode, bindex); ++ if (!h_i) ++ continue; ++ ++ err = au_xino_write(inode->i_sb, bindex, h_i->i_ino, /*ino*/0); ++ /* ignore this error */ ++ /* bad action? */ ++ } ++ ++ /* children inode number will be broken */ ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int hn_gen_tree(struct dentry *dentry) ++{ ++ int err, i, j, ndentry; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, dentry, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) { ++ struct dentry *d; ++ ++ d = dentries[j]; ++ if (IS_ROOT(d)) ++ continue; ++ ++ au_digen_dec(d); ++ if (d->d_inode) ++ /* todo: reset children xino? ++ cached children only? */ ++ au_iigen_dec(d->d_inode); ++ } ++ } ++ ++out_dpages: ++ au_dpages_free(&dpages); ++ ++#if 0 ++ /* discard children */ ++ dentry_unhash(dentry); ++ dput(dentry); ++#endif ++out: ++ return err; ++} ++ ++/* ++ * return 0 if processed. 
++ */ ++static int hn_gen_by_inode(char *name, unsigned int nlen, struct inode *inode, ++ const unsigned int isdir) ++{ ++ int err; ++ struct dentry *d; ++ struct qstr *dname; ++ ++ err = 1; ++ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { ++ pr_warning("branch root dir was changed\n"); ++ err = 0; ++ goto out; ++ } ++ ++ if (!isdir) { ++ AuDebugOn(!name); ++ au_iigen_dec(inode); ++ spin_lock(&inode->i_lock); ++ list_for_each_entry(d, &inode->i_dentry, d_alias) { ++ spin_lock(&d->d_lock); ++ dname = &d->d_name; ++ if (dname->len != nlen ++ && memcmp(dname->name, name, nlen)) { ++ spin_unlock(&d->d_lock); ++ continue; ++ } ++ err = 0; ++ au_digen_dec(d); ++ spin_unlock(&d->d_lock); ++ break; ++ } ++ spin_unlock(&inode->i_lock); ++ } else { ++ au_fset_si(au_sbi(inode->i_sb), FAILED_REFRESH_DIR); ++ d = d_find_alias(inode); ++ if (!d) { ++ au_iigen_dec(inode); ++ goto out; ++ } ++ ++ spin_lock(&d->d_lock); ++ dname = &d->d_name; ++ if (dname->len == nlen && !memcmp(dname->name, name, nlen)) { ++ spin_unlock(&d->d_lock); ++ err = hn_gen_tree(d); ++ spin_lock(&d->d_lock); ++ } ++ spin_unlock(&d->d_lock); ++ dput(d); ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int hn_gen_by_name(struct dentry *dentry, const unsigned int isdir) ++{ ++ int err; ++ struct inode *inode; ++ ++ inode = dentry->d_inode; ++ if (IS_ROOT(dentry) ++ /* || (inode && inode->i_ino == AUFS_ROOT_INO) */ ++ ) { ++ pr_warning("branch root dir was changed\n"); ++ return 0; ++ } ++ ++ err = 0; ++ if (!isdir) { ++ au_digen_dec(dentry); ++ if (inode) ++ au_iigen_dec(inode); ++ } else { ++ au_fset_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR); ++ if (inode) ++ err = hn_gen_tree(dentry); ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* hnotify job flags */ ++#define AuHnJob_XINO0 1 ++#define AuHnJob_GEN (1 << 1) ++#define AuHnJob_DIRENT (1 << 2) ++#define AuHnJob_ISDIR (1 << 3) ++#define AuHnJob_TRYXINO0 (1 << 4) ++#define AuHnJob_MNTPNT (1 << 5) ++#define au_ftest_hnjob(flags, name) ((flags) & AuHnJob_##name) ++#define au_fset_hnjob(flags, name) \ ++ do { (flags) |= AuHnJob_##name; } while (0) ++#define au_fclr_hnjob(flags, name) \ ++ do { (flags) &= ~AuHnJob_##name; } while (0) ++ ++enum { ++ AuHn_CHILD, ++ AuHn_PARENT, ++ AuHnLast ++}; ++ ++struct au_hnotify_args { ++ struct inode *h_dir, *dir, *h_child_inode; ++ u32 mask; ++ unsigned int flags[AuHnLast]; ++ unsigned int h_child_nlen; ++ char h_child_name[]; ++}; ++ ++struct hn_job_args { ++ unsigned int flags; ++ struct inode *inode, *h_inode, *dir, *h_dir; ++ struct dentry *dentry; ++ char *h_name; ++ int h_nlen; ++}; ++ ++static int hn_job(struct hn_job_args *a) ++{ ++ const unsigned int isdir = au_ftest_hnjob(a->flags, ISDIR); ++ ++ /* reset xino */ ++ if (au_ftest_hnjob(a->flags, XINO0) && a->inode) ++ hn_xino(a->inode, a->h_inode); /* ignore this error */ ++ ++ if (au_ftest_hnjob(a->flags, TRYXINO0) ++ && a->inode ++ && a->h_inode) { ++ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); ++ if (!a->h_inode->i_nlink) ++ hn_xino(a->inode, a->h_inode); /* ignore this error */ ++ mutex_unlock(&a->h_inode->i_mutex); ++ } ++ ++ /* make the generation obsolete */ ++ if (au_ftest_hnjob(a->flags, GEN)) { ++ int err = -1; ++ if (a->inode) ++ err = hn_gen_by_inode(a->h_name, a->h_nlen, a->inode, ++ isdir); ++ if (err && a->dentry) ++ hn_gen_by_name(a->dentry, isdir); ++ /* ignore this error */ ++ } ++ ++ /* make dir entries obsolete */ ++ if (au_ftest_hnjob(a->flags, DIRENT) 
&& a->inode) { ++ struct au_vdir *vdir; ++ ++ vdir = au_ivdir(a->inode); ++ if (vdir) ++ vdir->vd_jiffy = 0; ++ /* IMustLock(a->inode); */ ++ /* a->inode->i_version++; */ ++ } ++ ++ /* can do nothing but warn */ ++ if (au_ftest_hnjob(a->flags, MNTPNT) ++ && a->dentry ++ && d_mountpoint(a->dentry)) ++ pr_warning("mount-point %.*s is removed or renamed\n", ++ AuDLNPair(a->dentry)); ++ ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *lookup_wlock_by_name(char *name, unsigned int nlen, ++ struct inode *dir) ++{ ++ struct dentry *dentry, *d, *parent; ++ struct qstr *dname; ++ ++ parent = d_find_alias(dir); ++ if (!parent) ++ return NULL; ++ ++ dentry = NULL; ++ spin_lock(&parent->d_lock); ++ list_for_each_entry(d, &parent->d_subdirs, d_u.d_child) { ++ /* AuDbg("%.*s\n", AuDLNPair(d)); */ ++ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); ++ dname = &d->d_name; ++ if (dname->len != nlen || memcmp(dname->name, name, nlen)) ++ goto cont_unlock; ++ if (au_di(d)) ++ au_digen_dec(d); ++ else ++ goto cont_unlock; ++ if (d->d_count) { ++ dentry = dget_dlock(d); ++ spin_unlock(&d->d_lock); ++ break; ++ } ++ ++ cont_unlock: ++ spin_unlock(&d->d_lock); ++ } ++ spin_unlock(&parent->d_lock); ++ dput(parent); ++ ++ if (dentry) ++ di_write_lock_child(dentry); ++ ++ return dentry; ++} ++ ++static struct inode *lookup_wlock_by_ino(struct super_block *sb, ++ aufs_bindex_t bindex, ino_t h_ino) ++{ ++ struct inode *inode; ++ ino_t ino; ++ int err; ++ ++ inode = NULL; ++ err = au_xino_read(sb, bindex, h_ino, &ino); ++ if (!err && ino) ++ inode = ilookup(sb, ino); ++ if (!inode) ++ goto out; ++ ++ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { ++ pr_warning("wrong root branch\n"); ++ iput(inode); ++ inode = NULL; ++ goto out; ++ } ++ ++ ii_write_lock_child(inode); ++ ++out: ++ return inode; ++} ++ ++static void au_hn_bh(void *_args) ++{ ++ struct au_hnotify_args *a = _args; ++ struct super_block *sb; ++ aufs_bindex_t bindex, bend, bfound; ++ unsigned char xino, try_iput; ++ int err; ++ struct inode *inode; ++ ino_t h_ino; ++ struct hn_job_args args; ++ struct dentry *dentry; ++ struct au_sbinfo *sbinfo; ++ ++ AuDebugOn(!_args); ++ AuDebugOn(!a->h_dir); ++ AuDebugOn(!a->dir); ++ AuDebugOn(!a->mask); ++ AuDbg("mask 0x%x, i%lu, hi%lu, hci%lu\n", ++ a->mask, a->dir->i_ino, a->h_dir->i_ino, ++ a->h_child_inode ? a->h_child_inode->i_ino : 0); ++ ++ inode = NULL; ++ dentry = NULL; ++ /* ++ * do not lock a->dir->i_mutex here ++ * because of d_revalidate() may cause a deadlock. 
++ */ ++ sb = a->dir->i_sb; ++ AuDebugOn(!sb); ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!sbinfo); ++ si_write_lock(sb, AuLock_NOPLMW); ++ ++ ii_read_lock_parent(a->dir); ++ bfound = -1; ++ bend = au_ibend(a->dir); ++ for (bindex = au_ibstart(a->dir); bindex <= bend; bindex++) ++ if (au_h_iptr(a->dir, bindex) == a->h_dir) { ++ bfound = bindex; ++ break; ++ } ++ ii_read_unlock(a->dir); ++ if (unlikely(bfound < 0)) ++ goto out; ++ ++ xino = !!au_opt_test(au_mntflags(sb), XINO); ++ h_ino = 0; ++ if (a->h_child_inode) ++ h_ino = a->h_child_inode->i_ino; ++ ++ if (a->h_child_nlen ++ && (au_ftest_hnjob(a->flags[AuHn_CHILD], GEN) ++ || au_ftest_hnjob(a->flags[AuHn_CHILD], MNTPNT))) ++ dentry = lookup_wlock_by_name(a->h_child_name, a->h_child_nlen, ++ a->dir); ++ try_iput = 0; ++ if (dentry) ++ inode = dentry->d_inode; ++ if (xino && !inode && h_ino ++ && (au_ftest_hnjob(a->flags[AuHn_CHILD], XINO0) ++ || au_ftest_hnjob(a->flags[AuHn_CHILD], TRYXINO0) ++ || au_ftest_hnjob(a->flags[AuHn_CHILD], GEN))) { ++ inode = lookup_wlock_by_ino(sb, bfound, h_ino); ++ try_iput = 1; ++ } ++ ++ args.flags = a->flags[AuHn_CHILD]; ++ args.dentry = dentry; ++ args.inode = inode; ++ args.h_inode = a->h_child_inode; ++ args.dir = a->dir; ++ args.h_dir = a->h_dir; ++ args.h_name = a->h_child_name; ++ args.h_nlen = a->h_child_nlen; ++ err = hn_job(&args); ++ if (dentry) { ++ if (au_di(dentry)) ++ di_write_unlock(dentry); ++ dput(dentry); ++ } ++ if (inode && try_iput) { ++ ii_write_unlock(inode); ++ iput(inode); ++ } ++ ++ ii_write_lock_parent(a->dir); ++ args.flags = a->flags[AuHn_PARENT]; ++ args.dentry = NULL; ++ args.inode = a->dir; ++ args.h_inode = a->h_dir; ++ args.dir = NULL; ++ args.h_dir = NULL; ++ args.h_name = NULL; ++ args.h_nlen = 0; ++ err = hn_job(&args); ++ ii_write_unlock(a->dir); ++ ++out: ++ iput(a->h_child_inode); ++ iput(a->h_dir); ++ iput(a->dir); ++ si_write_unlock(sb); ++ au_nwt_done(&sbinfo->si_nowait); ++ kfree(a); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask, ++ struct qstr *h_child_qstr, struct inode *h_child_inode) ++{ ++ int err, len; ++ unsigned int flags[AuHnLast], f; ++ unsigned char isdir, isroot, wh; ++ struct inode *dir; ++ struct au_hnotify_args *args; ++ char *p, *h_child_name; ++ ++ err = 0; ++ AuDebugOn(!hnotify || !hnotify->hn_aufs_inode); ++ dir = igrab(hnotify->hn_aufs_inode); ++ if (!dir) ++ goto out; ++ ++ isroot = (dir->i_ino == AUFS_ROOT_INO); ++ wh = 0; ++ h_child_name = (void *)h_child_qstr->name; ++ len = h_child_qstr->len; ++ if (h_child_name) { ++ if (len > AUFS_WH_PFX_LEN ++ && !memcmp(h_child_name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { ++ h_child_name += AUFS_WH_PFX_LEN; ++ len -= AUFS_WH_PFX_LEN; ++ wh = 1; ++ } ++ } ++ ++ isdir = 0; ++ if (h_child_inode) ++ isdir = !!S_ISDIR(h_child_inode->i_mode); ++ flags[AuHn_PARENT] = AuHnJob_ISDIR; ++ flags[AuHn_CHILD] = 0; ++ if (isdir) ++ flags[AuHn_CHILD] = AuHnJob_ISDIR; ++ au_fset_hnjob(flags[AuHn_PARENT], DIRENT); ++ au_fset_hnjob(flags[AuHn_CHILD], GEN); ++ switch (mask & FS_EVENTS_POSS_ON_CHILD) { ++ case FS_MOVED_FROM: ++ case FS_MOVED_TO: ++ au_fset_hnjob(flags[AuHn_CHILD], XINO0); ++ au_fset_hnjob(flags[AuHn_CHILD], MNTPNT); ++ /*FALLTHROUGH*/ ++ case FS_CREATE: ++ AuDebugOn(!h_child_name || !h_child_inode); ++ break; ++ ++ case FS_DELETE: ++ /* ++ * aufs never be able to get this child inode. ++ * revalidation should be in d_revalidate() ++ * by checking i_nlink, i_generation or d_unhashed(). 
++ */ ++ AuDebugOn(!h_child_name); ++ au_fset_hnjob(flags[AuHn_CHILD], TRYXINO0); ++ au_fset_hnjob(flags[AuHn_CHILD], MNTPNT); ++ break; ++ ++ default: ++ AuDebugOn(1); ++ } ++ ++ if (wh) ++ h_child_inode = NULL; ++ ++ err = -ENOMEM; ++ /* iput() and kfree() will be called in au_hnotify() */ ++ args = kmalloc(sizeof(*args) + len + 1, GFP_NOFS); ++ if (unlikely(!args)) { ++ AuErr1("no memory\n"); ++ iput(dir); ++ goto out; ++ } ++ args->flags[AuHn_PARENT] = flags[AuHn_PARENT]; ++ args->flags[AuHn_CHILD] = flags[AuHn_CHILD]; ++ args->mask = mask; ++ args->dir = dir; ++ args->h_dir = igrab(h_dir); ++ if (h_child_inode) ++ h_child_inode = igrab(h_child_inode); /* can be NULL */ ++ args->h_child_inode = h_child_inode; ++ args->h_child_nlen = len; ++ if (len) { ++ p = (void *)args; ++ p += sizeof(*args); ++ memcpy(p, h_child_name, len); ++ p[len] = 0; ++ } ++ ++ f = 0; ++ if (!dir->i_nlink) ++ f = AuWkq_NEST; ++ err = au_wkq_nowait(au_hn_bh, args, dir->i_sb, f); ++ if (unlikely(err)) { ++ pr_err("wkq %d\n", err); ++ iput(args->h_child_inode); ++ iput(args->h_dir); ++ iput(args->dir); ++ kfree(args); ++ } ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm) ++{ ++ int err; ++ ++ AuDebugOn(!(udba & AuOptMask_UDBA)); ++ ++ err = 0; ++ if (au_hnotify_op.reset_br) ++ err = au_hnotify_op.reset_br(udba, br, perm); ++ ++ return err; ++} ++ ++int au_hnotify_init_br(struct au_branch *br, int perm) ++{ ++ int err; ++ ++ err = 0; ++ if (au_hnotify_op.init_br) ++ err = au_hnotify_op.init_br(br, perm); ++ ++ return err; ++} ++ ++void au_hnotify_fin_br(struct au_branch *br) ++{ ++ if (au_hnotify_op.fin_br) ++ au_hnotify_op.fin_br(br); ++} ++ ++static void au_hn_destroy_cache(void) ++{ ++ kmem_cache_destroy(au_cachep[AuCache_HNOTIFY]); ++ au_cachep[AuCache_HNOTIFY] = NULL; ++} ++ ++int __init au_hnotify_init(void) ++{ ++ int err; ++ ++ err = -ENOMEM; ++ au_cachep[AuCache_HNOTIFY] = AuCache(au_hnotify); ++ if (au_cachep[AuCache_HNOTIFY]) { ++ err = 0; ++ if (au_hnotify_op.init) ++ err = au_hnotify_op.init(); ++ if (unlikely(err)) ++ au_hn_destroy_cache(); ++ } ++ AuTraceErr(err); ++ return err; ++} ++ ++void au_hnotify_fin(void) ++{ ++ if (au_hnotify_op.fin) ++ au_hnotify_op.fin(); ++ /* cf. au_cache_fin() */ ++ if (au_cachep[AuCache_HNOTIFY]) ++ au_hn_destroy_cache(); ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/iinfo.c linux-3.2.0-gentoo-r1/fs/aufs/iinfo.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/iinfo.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/iinfo.c 2012-01-17 12:11:24.780212183 +0100 +@@ -0,0 +1,264 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode private data ++ */ ++ ++#include "aufs.h" ++ ++struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex) ++{ ++ struct inode *h_inode; ++ ++ IiMustAnyLock(inode); ++ ++ h_inode = au_ii(inode)->ii_hinode[0 + bindex].hi_inode; ++ AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); ++ return h_inode; ++} ++ ++/* todo: hard/soft set? */ ++void au_hiput(struct au_hinode *hinode) ++{ ++ au_hn_free(hinode); ++ dput(hinode->hi_whdentry); ++ iput(hinode->hi_inode); ++} ++ ++unsigned int au_hi_flags(struct inode *inode, int isdir) ++{ ++ unsigned int flags; ++ const unsigned int mnt_flags = au_mntflags(inode->i_sb); ++ ++ flags = 0; ++ if (au_opt_test(mnt_flags, XINO)) ++ au_fset_hi(flags, XINO); ++ if (isdir && au_opt_test(mnt_flags, UDBA_HNOTIFY)) ++ au_fset_hi(flags, HNOTIFY); ++ return flags; ++} ++ ++void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, ++ struct inode *h_inode, unsigned int flags) ++{ ++ struct au_hinode *hinode; ++ struct inode *hi; ++ struct au_iinfo *iinfo = au_ii(inode); ++ ++ IiMustWriteLock(inode); ++ ++ hinode = iinfo->ii_hinode + bindex; ++ hi = hinode->hi_inode; ++ AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); ++ ++ if (hi) ++ au_hiput(hinode); ++ hinode->hi_inode = h_inode; ++ if (h_inode) { ++ int err; ++ struct super_block *sb = inode->i_sb; ++ struct au_branch *br; ++ ++ AuDebugOn(inode->i_mode ++ && (h_inode->i_mode & S_IFMT) ++ != (inode->i_mode & S_IFMT)); ++ if (bindex == iinfo->ii_bstart) ++ au_cpup_igen(inode, h_inode); ++ br = au_sbr(sb, bindex); ++ hinode->hi_id = br->br_id; ++ if (au_ftest_hi(flags, XINO)) { ++ err = au_xino_write(sb, bindex, h_inode->i_ino, ++ inode->i_ino); ++ if (unlikely(err)) ++ AuIOErr1("failed au_xino_write() %d\n", err); ++ } ++ ++ if (au_ftest_hi(flags, HNOTIFY) ++ && au_br_hnotifyable(br->br_perm)) { ++ err = au_hn_alloc(hinode, inode); ++ if (unlikely(err)) ++ AuIOErr1("au_hn_alloc() %d\n", err); ++ } ++ } ++} ++ ++void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_wh) ++{ ++ struct au_hinode *hinode; ++ ++ IiMustWriteLock(inode); ++ ++ hinode = au_ii(inode)->ii_hinode + bindex; ++ AuDebugOn(hinode->hi_whdentry); ++ hinode->hi_whdentry = h_wh; ++} ++ ++void au_update_iigen(struct inode *inode) ++{ ++ atomic_set(&au_ii(inode)->ii_generation, au_sigen(inode->i_sb)); ++ /* smp_mb(); */ /* atomic_set */ ++} ++ ++/* it may be called at remount time, too */ ++void au_update_ibrange(struct inode *inode, int do_put_zero) ++{ ++ struct au_iinfo *iinfo; ++ aufs_bindex_t bindex, bend; ++ ++ iinfo = au_ii(inode); ++ if (!iinfo) ++ return; ++ ++ IiMustWriteLock(inode); ++ ++ if (do_put_zero && iinfo->ii_bstart >= 0) { ++ for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; ++ bindex++) { ++ struct inode *h_i; ++ ++ h_i = iinfo->ii_hinode[0 + bindex].hi_inode; ++ if (h_i && !h_i->i_nlink) ++ au_set_h_iptr(inode, bindex, NULL, 0); ++ } ++ } ++ ++ iinfo->ii_bstart = -1; ++ iinfo->ii_bend = -1; ++ bend = au_sbend(inode->i_sb); ++ for (bindex = 0; bindex <= bend; bindex++) ++ if (iinfo->ii_hinode[0 + bindex].hi_inode) { ++ iinfo->ii_bstart = bindex; ++ break; ++ } ++ if (iinfo->ii_bstart >= 0) ++ for (bindex = bend; bindex >= iinfo->ii_bstart; bindex--) ++ if (iinfo->ii_hinode[0 + bindex].hi_inode) { ++ iinfo->ii_bend = bindex; ++ 
break; ++ } ++ AuDebugOn(iinfo->ii_bstart > iinfo->ii_bend); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_icntnr_init_once(void *_c) ++{ ++ struct au_icntnr *c = _c; ++ struct au_iinfo *iinfo = &c->iinfo; ++ static struct lock_class_key aufs_ii; ++ ++ au_rw_init(&iinfo->ii_rwsem); ++ au_rw_class(&iinfo->ii_rwsem, &aufs_ii); ++ inode_init_once(&c->vfs_inode); ++} ++ ++int au_iinfo_init(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ struct super_block *sb; ++ int nbr, i; ++ ++ sb = inode->i_sb; ++ iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); ++ nbr = au_sbend(sb) + 1; ++ if (unlikely(nbr <= 0)) ++ nbr = 1; ++ iinfo->ii_hinode = kcalloc(nbr, sizeof(*iinfo->ii_hinode), GFP_NOFS); ++ if (iinfo->ii_hinode) { ++ au_ninodes_inc(sb); ++ for (i = 0; i < nbr; i++) ++ iinfo->ii_hinode[i].hi_id = -1; ++ ++ atomic_set(&iinfo->ii_generation, au_sigen(sb)); ++ /* smp_mb(); */ /* atomic_set */ ++ iinfo->ii_bstart = -1; ++ iinfo->ii_bend = -1; ++ iinfo->ii_vdir = NULL; ++ return 0; ++ } ++ return -ENOMEM; ++} ++ ++int au_ii_realloc(struct au_iinfo *iinfo, int nbr) ++{ ++ int err, sz; ++ struct au_hinode *hip; ++ ++ AuRwMustWriteLock(&iinfo->ii_rwsem); ++ ++ err = -ENOMEM; ++ sz = sizeof(*hip) * (iinfo->ii_bend + 1); ++ if (!sz) ++ sz = sizeof(*hip); ++ hip = au_kzrealloc(iinfo->ii_hinode, sz, sizeof(*hip) * nbr, GFP_NOFS); ++ if (hip) { ++ iinfo->ii_hinode = hip; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++void au_iinfo_fin(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ struct au_hinode *hi; ++ struct super_block *sb; ++ aufs_bindex_t bindex, bend; ++ const unsigned char unlinked = !inode->i_nlink; ++ ++ iinfo = au_ii(inode); ++ /* bad_inode case */ ++ if (!iinfo) ++ return; ++ ++ sb = inode->i_sb; ++ au_ninodes_dec(sb); ++ if (si_pid_test(sb)) ++ au_xino_delete_inode(inode, unlinked); ++ else { ++ /* ++ * it is safe to hide the dependency between sbinfo and ++ * sb->s_umount. ++ */ ++ lockdep_off(); ++ si_noflush_read_lock(sb); ++ au_xino_delete_inode(inode, unlinked); ++ si_read_unlock(sb); ++ lockdep_on(); ++ } ++ ++ if (iinfo->ii_vdir) ++ au_vdir_free(iinfo->ii_vdir); ++ ++ bindex = iinfo->ii_bstart; ++ if (bindex >= 0) { ++ hi = iinfo->ii_hinode + bindex; ++ bend = iinfo->ii_bend; ++ while (bindex++ <= bend) { ++ if (hi->hi_inode) ++ au_hiput(hi); ++ hi++; ++ } ++ } ++ kfree(iinfo->ii_hinode); ++ iinfo->ii_hinode = NULL; ++ AuRwDestroy(&iinfo->ii_rwsem); ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/inode.c linux-3.2.0-gentoo-r1/fs/aufs/inode.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/inode.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/inode.c 2012-01-17 12:11:24.791786361 +0100 +@@ -0,0 +1,471 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode functions ++ */ ++ ++#include "aufs.h" ++ ++struct inode *au_igrab(struct inode *inode) ++{ ++ if (inode) { ++ AuDebugOn(!atomic_read(&inode->i_count)); ++ ihold(inode); ++ } ++ return inode; ++} ++ ++static void au_refresh_hinode_attr(struct inode *inode, int do_version) ++{ ++ au_cpup_attr_all(inode, /*force*/0); ++ au_update_iigen(inode); ++ if (do_version) ++ inode->i_version++; ++} ++ ++static int au_ii_refresh(struct inode *inode, int *update) ++{ ++ int err, e; ++ umode_t type; ++ aufs_bindex_t bindex, new_bindex; ++ struct super_block *sb; ++ struct au_iinfo *iinfo; ++ struct au_hinode *p, *q, tmp; ++ ++ IiMustWriteLock(inode); ++ ++ *update = 0; ++ sb = inode->i_sb; ++ type = inode->i_mode & S_IFMT; ++ iinfo = au_ii(inode); ++ err = au_ii_realloc(iinfo, au_sbend(sb) + 1); ++ if (unlikely(err)) ++ goto out; ++ ++ AuDebugOn(iinfo->ii_bstart < 0); ++ p = iinfo->ii_hinode + iinfo->ii_bstart; ++ for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; ++ bindex++, p++) { ++ if (!p->hi_inode) ++ continue; ++ ++ AuDebugOn(type != (p->hi_inode->i_mode & S_IFMT)); ++ new_bindex = au_br_index(sb, p->hi_id); ++ if (new_bindex == bindex) ++ continue; ++ ++ if (new_bindex < 0) { ++ *update = 1; ++ au_hiput(p); ++ p->hi_inode = NULL; ++ continue; ++ } ++ ++ if (new_bindex < iinfo->ii_bstart) ++ iinfo->ii_bstart = new_bindex; ++ if (iinfo->ii_bend < new_bindex) ++ iinfo->ii_bend = new_bindex; ++ /* swap two lower inode, and loop again */ ++ q = iinfo->ii_hinode + new_bindex; ++ tmp = *q; ++ *q = *p; ++ *p = tmp; ++ if (tmp.hi_inode) { ++ bindex--; ++ p--; ++ } ++ } ++ au_update_ibrange(inode, /*do_put_zero*/0); ++ e = au_dy_irefresh(inode); ++ if (unlikely(e && !err)) ++ err = e; ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_refresh_hinode_self(struct inode *inode) ++{ ++ int err, update; ++ ++ err = au_ii_refresh(inode, &update); ++ if (!err) ++ au_refresh_hinode_attr(inode, update && S_ISDIR(inode->i_mode)); ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_refresh_hinode(struct inode *inode, struct dentry *dentry) ++{ ++ int err, e, update; ++ unsigned int flags; ++ umode_t mode; ++ aufs_bindex_t bindex, bend; ++ unsigned char isdir; ++ struct au_hinode *p; ++ struct au_iinfo *iinfo; ++ ++ err = au_ii_refresh(inode, &update); ++ if (unlikely(err)) ++ goto out; ++ ++ update = 0; ++ iinfo = au_ii(inode); ++ p = iinfo->ii_hinode + iinfo->ii_bstart; ++ mode = (inode->i_mode & S_IFMT); ++ isdir = S_ISDIR(mode); ++ flags = au_hi_flags(inode, isdir); ++ bend = au_dbend(dentry); ++ for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) { ++ struct inode *h_i; ++ struct dentry *h_d; ++ ++ h_d = au_h_dptr(dentry, bindex); ++ if (!h_d || !h_d->d_inode) ++ continue; ++ ++ AuDebugOn(mode != (h_d->d_inode->i_mode & S_IFMT)); ++ if (iinfo->ii_bstart <= bindex && bindex <= iinfo->ii_bend) { ++ h_i = au_h_iptr(inode, bindex); ++ if (h_i) { ++ if (h_i == h_d->d_inode) ++ continue; ++ err = -EIO; ++ break; ++ } ++ } ++ if (bindex < iinfo->ii_bstart) ++ iinfo->ii_bstart = bindex; ++ if (iinfo->ii_bend < bindex) ++ iinfo->ii_bend = bindex; ++ au_set_h_iptr(inode, bindex, au_igrab(h_d->d_inode), flags); ++ update = 1; ++ } ++ au_update_ibrange(inode, /*do_put_zero*/0); ++ e = au_dy_irefresh(inode); ++ if (unlikely(e && !err)) ++ err = e; ++ if (!err) 
++ au_refresh_hinode_attr(inode, update && isdir); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int set_inode(struct inode *inode, struct dentry *dentry) ++{ ++ int err; ++ unsigned int flags; ++ umode_t mode; ++ aufs_bindex_t bindex, bstart, btail; ++ unsigned char isdir; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ struct au_iinfo *iinfo; ++ ++ IiMustWriteLock(inode); ++ ++ err = 0; ++ isdir = 0; ++ bstart = au_dbstart(dentry); ++ h_inode = au_h_dptr(dentry, bstart)->d_inode; ++ mode = h_inode->i_mode; ++ switch (mode & S_IFMT) { ++ case S_IFREG: ++ btail = au_dbtail(dentry); ++ inode->i_op = &aufs_iop; ++ inode->i_fop = &aufs_file_fop; ++ err = au_dy_iaop(inode, bstart, h_inode); ++ if (unlikely(err)) ++ goto out; ++ break; ++ case S_IFDIR: ++ isdir = 1; ++ btail = au_dbtaildir(dentry); ++ inode->i_op = &aufs_dir_iop; ++ inode->i_fop = &aufs_dir_fop; ++ break; ++ case S_IFLNK: ++ btail = au_dbtail(dentry); ++ inode->i_op = &aufs_symlink_iop; ++ break; ++ case S_IFBLK: ++ case S_IFCHR: ++ case S_IFIFO: ++ case S_IFSOCK: ++ btail = au_dbtail(dentry); ++ inode->i_op = &aufs_iop; ++ au_init_special_fop(inode, mode, h_inode->i_rdev); ++ break; ++ default: ++ AuIOErr("Unknown file type 0%o\n", mode); ++ err = -EIO; ++ goto out; ++ } ++ ++ /* do not set hnotify for whiteouted dirs (SHWH mode) */ ++ flags = au_hi_flags(inode, isdir); ++ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH) ++ && au_ftest_hi(flags, HNOTIFY) ++ && dentry->d_name.len > AUFS_WH_PFX_LEN ++ && !memcmp(dentry->d_name.name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) ++ au_fclr_hi(flags, HNOTIFY); ++ iinfo = au_ii(inode); ++ iinfo->ii_bstart = bstart; ++ iinfo->ii_bend = btail; ++ for (bindex = bstart; bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry) ++ au_set_h_iptr(inode, bindex, ++ au_igrab(h_dentry->d_inode), flags); ++ } ++ au_cpup_attr_all(inode, /*force*/1); ++ ++out: ++ return err; ++} ++ ++/* ++ * successful returns with iinfo write_locked ++ * minus: errno ++ * zero: success, matched ++ * plus: no error, but unmatched ++ */ ++static int reval_inode(struct inode *inode, struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ struct inode *h_inode, *h_dinode; ++ ++ /* ++ * before this function, if aufs got any iinfo lock, it must be only ++ * one, the parent dir. ++ * it can happen by UDBA and the obsoleted inode number. 
++ */ ++ err = -EIO; ++ if (unlikely(inode->i_ino == parent_ino(dentry))) ++ goto out; ++ ++ err = 1; ++ ii_write_lock_new_child(inode); ++ h_dinode = au_h_dptr(dentry, au_dbstart(dentry))->d_inode; ++ bend = au_ibend(inode); ++ for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { ++ h_inode = au_h_iptr(inode, bindex); ++ if (h_inode && h_inode == h_dinode) { ++ err = 0; ++ if (au_iigen_test(inode, au_digen(dentry))) ++ err = au_refresh_hinode(inode, dentry); ++ break; ++ } ++ } ++ ++ if (unlikely(err)) ++ ii_write_unlock(inode); ++out: ++ return err; ++} ++ ++int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ unsigned int d_type, ino_t *ino) ++{ ++ int err; ++ struct mutex *mtx; ++ ++ /* prevent hardlinked inode number from race condition */ ++ mtx = NULL; ++ if (d_type != DT_DIR) { ++ mtx = &au_sbr(sb, bindex)->br_xino.xi_nondir_mtx; ++ mutex_lock(mtx); ++ } ++ err = au_xino_read(sb, bindex, h_ino, ino); ++ if (unlikely(err)) ++ goto out; ++ ++ if (!*ino) { ++ err = -EIO; ++ *ino = au_xino_new_ino(sb); ++ if (unlikely(!*ino)) ++ goto out; ++ err = au_xino_write(sb, bindex, h_ino, *ino); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++out: ++ if (mtx) ++ mutex_unlock(mtx); ++ return err; ++} ++ ++/* successful returns with iinfo write_locked */ ++/* todo: return with unlocked? */ ++struct inode *au_new_inode(struct dentry *dentry, int must_new) ++{ ++ struct inode *inode, *h_inode; ++ struct dentry *h_dentry; ++ struct super_block *sb; ++ struct mutex *mtx; ++ ino_t h_ino, ino; ++ int err; ++ aufs_bindex_t bstart; ++ ++ sb = dentry->d_sb; ++ bstart = au_dbstart(dentry); ++ h_dentry = au_h_dptr(dentry, bstart); ++ h_inode = h_dentry->d_inode; ++ h_ino = h_inode->i_ino; ++ ++ /* ++ * stop 'race'-ing between hardlinks under different ++ * parents. ++ */ ++ mtx = NULL; ++ if (!S_ISDIR(h_inode->i_mode)) ++ mtx = &au_sbr(sb, bstart)->br_xino.xi_nondir_mtx; ++ ++new_ino: ++ if (mtx) ++ mutex_lock(mtx); ++ err = au_xino_read(sb, bstart, h_ino, &ino); ++ inode = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ if (!ino) { ++ ino = au_xino_new_ino(sb); ++ if (unlikely(!ino)) { ++ inode = ERR_PTR(-EIO); ++ goto out; ++ } ++ } ++ ++ AuDbg("i%lu\n", (unsigned long)ino); ++ inode = au_iget_locked(sb, ino); ++ err = PTR_ERR(inode); ++ if (IS_ERR(inode)) ++ goto out; ++ ++ AuDbg("%lx, new %d\n", inode->i_state, !!(inode->i_state & I_NEW)); ++ if (inode->i_state & I_NEW) { ++ ii_write_lock_new_child(inode); ++ err = set_inode(inode, dentry); ++ if (!err) { ++ unlock_new_inode(inode); ++ goto out; /* success */ ++ } ++ ++ /* ++ * iget_failed() calls iput(), but we need to call ++ * ii_write_unlock() after iget_failed(). so dirty hack for ++ * i_count. ++ */ ++ atomic_inc(&inode->i_count); ++ iget_failed(inode); ++ ii_write_unlock(inode); ++ au_xino_write(sb, bstart, h_ino, /*ino*/0); ++ /* ignore this error */ ++ goto out_iput; ++ } else if (!must_new && !IS_DEADDIR(inode) && inode->i_nlink) { ++ /* ++ * horrible race condition between lookup, readdir and copyup ++ * (or something). 
++ */ ++ if (mtx) ++ mutex_unlock(mtx); ++ err = reval_inode(inode, dentry); ++ if (unlikely(err < 0)) { ++ mtx = NULL; ++ goto out_iput; ++ } ++ ++ if (!err) { ++ mtx = NULL; ++ goto out; /* success */ ++ } else if (mtx) ++ mutex_lock(mtx); ++ } ++ ++ if (unlikely(au_test_fs_unique_ino(h_dentry->d_inode))) ++ AuWarn1("Warning: Un-notified UDBA or repeatedly renamed dir," ++ " b%d, %s, %.*s, hi%lu, i%lu.\n", ++ bstart, au_sbtype(h_dentry->d_sb), AuDLNPair(dentry), ++ (unsigned long)h_ino, (unsigned long)ino); ++ ino = 0; ++ err = au_xino_write(sb, bstart, h_ino, /*ino*/0); ++ if (!err) { ++ iput(inode); ++ if (mtx) ++ mutex_unlock(mtx); ++ goto new_ino; ++ } ++ ++out_iput: ++ iput(inode); ++ inode = ERR_PTR(err); ++out: ++ if (mtx) ++ mutex_unlock(mtx); ++ return inode; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, ++ struct inode *inode) ++{ ++ int err; ++ ++ err = au_br_rdonly(au_sbr(sb, bindex)); ++ ++ /* pseudo-link after flushed may happen out of bounds */ ++ if (!err ++ && inode ++ && au_ibstart(inode) <= bindex ++ && bindex <= au_ibend(inode)) { ++ /* ++ * permission check is unnecessary since vfsub routine ++ * will be called later ++ */ ++ struct inode *hi = au_h_iptr(inode, bindex); ++ if (hi) ++ err = IS_IMMUTABLE(hi) ? -EROFS : 0; ++ } ++ ++ return err; ++} ++ ++int au_test_h_perm(struct inode *h_inode, int mask) ++{ ++ if (!current_fsuid()) ++ return 0; ++ return inode_permission(h_inode, mask); ++} ++ ++int au_test_h_perm_sio(struct inode *h_inode, int mask) ++{ ++ if (au_test_nfs(h_inode->i_sb) ++ && (mask & MAY_WRITE) ++ && S_ISDIR(h_inode->i_mode)) ++ mask |= MAY_READ; /* force permission check */ ++ return au_test_h_perm(h_inode, mask); ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/inode.h linux-3.2.0-gentoo-r1/fs/aufs/inode.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/inode.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/inode.h 2012-01-17 12:11:24.803360540 +0100 +@@ -0,0 +1,554 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operations ++ */ ++ ++#ifndef __AUFS_INODE_H__ ++#define __AUFS_INODE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/fsnotify_backend.h> ++#include "rwsem.h" ++ ++struct vfsmount; ++ ++struct au_hnotify { ++#ifdef CONFIG_AUFS_HNOTIFY ++#ifdef CONFIG_AUFS_HFSNOTIFY ++ /* never use fsnotify_add_vfsmount_mark() */ ++ struct fsnotify_mark hn_mark; ++ int hn_mark_dead; ++#endif ++ struct inode *hn_aufs_inode; /* no get/put */ ++#endif ++} ____cacheline_aligned_in_smp; ++ ++struct au_hinode { ++ struct inode *hi_inode; ++ aufs_bindex_t hi_id; ++#ifdef CONFIG_AUFS_HNOTIFY ++ struct au_hnotify *hi_notify; ++#endif ++ ++ /* reference to the copied-up whiteout with get/put */ ++ struct dentry *hi_whdentry; ++}; ++ ++struct au_vdir; ++struct au_iinfo { ++ atomic_t ii_generation; ++ struct super_block *ii_hsb1; /* no get/put */ ++ ++ struct au_rwsem ii_rwsem; ++ aufs_bindex_t ii_bstart, ii_bend; ++ __u32 ii_higen; ++ struct au_hinode *ii_hinode; ++ struct au_vdir *ii_vdir; ++}; ++ ++struct au_icntnr { ++ struct au_iinfo iinfo; ++ struct inode vfs_inode; ++} ____cacheline_aligned_in_smp; ++ ++/* au_pin flags */ ++#define AuPin_DI_LOCKED 1 ++#define AuPin_MNT_WRITE (1 << 1) ++#define au_ftest_pin(flags, name) ((flags) & AuPin_##name) ++#define au_fset_pin(flags, name) \ ++ do { (flags) |= AuPin_##name; } while (0) ++#define au_fclr_pin(flags, name) \ ++ do { (flags) &= ~AuPin_##name; } while (0) ++ ++struct au_pin { ++ /* input */ ++ struct dentry *dentry; ++ unsigned int udba; ++ unsigned char lsc_di, lsc_hi, flags; ++ aufs_bindex_t bindex; ++ ++ /* output */ ++ struct dentry *parent; ++ struct au_hinode *hdir; ++ struct vfsmount *h_mnt; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_iinfo *au_ii(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ ++ iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); ++ if (iinfo->ii_hinode) ++ return iinfo; ++ return NULL; /* debugging bad_inode case */ ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* inode.c */ ++struct inode *au_igrab(struct inode *inode); ++int au_refresh_hinode_self(struct inode *inode); ++int au_refresh_hinode(struct inode *inode, struct dentry *dentry); ++int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ unsigned int d_type, ino_t *ino); ++struct inode *au_new_inode(struct dentry *dentry, int must_new); ++int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, ++ struct inode *inode); ++int au_test_h_perm(struct inode *h_inode, int mask); ++int au_test_h_perm_sio(struct inode *h_inode, int mask); ++ ++static inline int au_wh_ino(struct super_block *sb, aufs_bindex_t bindex, ++ ino_t h_ino, unsigned int d_type, ino_t *ino) ++{ ++#ifdef CONFIG_AUFS_SHWH ++ return au_ino(sb, bindex, h_ino, d_type, ino); ++#else ++ return 0; ++#endif ++} ++ ++/* i_op.c */ ++extern struct inode_operations aufs_iop, aufs_symlink_iop, aufs_dir_iop; ++ ++/* au_wr_dir flags */ ++#define AuWrDir_ADD_ENTRY 1 ++#define AuWrDir_ISDIR (1 << 1) ++#define au_ftest_wrdir(flags, name) ((flags) & AuWrDir_##name) ++#define au_fset_wrdir(flags, name) \ ++ do { (flags) |= AuWrDir_##name; } while (0) ++#define au_fclr_wrdir(flags, name) \ ++ do { (flags) &= ~AuWrDir_##name; } while (0) ++ ++struct au_wr_dir_args
{ ++ aufs_bindex_t force_btgt; ++ unsigned char flags; ++}; ++int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, ++ struct au_wr_dir_args *args); ++ ++struct dentry *au_pinned_h_parent(struct au_pin *pin); ++void au_pin_init(struct au_pin *pin, struct dentry *dentry, ++ aufs_bindex_t bindex, int lsc_di, int lsc_hi, ++ unsigned int udba, unsigned char flags); ++int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int udba, unsigned char flags) __must_check; ++int au_do_pin(struct au_pin *pin) __must_check; ++void au_unpin(struct au_pin *pin); ++ ++/* i_op_add.c */ ++int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir); ++int aufs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev); ++int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); ++int aufs_create(struct inode *dir, struct dentry *dentry, int mode, ++ struct nameidata *nd); ++int aufs_link(struct dentry *src_dentry, struct inode *dir, ++ struct dentry *dentry); ++int aufs_mkdir(struct inode *dir, struct dentry *dentry, int mode); ++ ++/* i_op_del.c */ ++int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup); ++int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir); ++int aufs_unlink(struct inode *dir, struct dentry *dentry); ++int aufs_rmdir(struct inode *dir, struct dentry *dentry); ++ ++/* i_op_ren.c */ ++int au_wbr(struct dentry *dentry, aufs_bindex_t btgt); ++int aufs_rename(struct inode *src_dir, struct dentry *src_dentry, ++ struct inode *dir, struct dentry *dentry); ++ ++/* iinfo.c */ ++struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex); ++void au_hiput(struct au_hinode *hinode); ++void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_wh); ++unsigned int au_hi_flags(struct inode *inode, int isdir); ++ ++/* hinode flags */ ++#define AuHi_XINO 1 ++#define AuHi_HNOTIFY (1 << 1) ++#define au_ftest_hi(flags, name) ((flags) & AuHi_##name) ++#define au_fset_hi(flags, name) \ ++ do { (flags) |= AuHi_##name; } while (0) ++#define au_fclr_hi(flags, name) \ ++ do { (flags) &= ~AuHi_##name; } while (0) ++ ++#ifndef CONFIG_AUFS_HNOTIFY ++#undef AuHi_HNOTIFY ++#define AuHi_HNOTIFY 0 ++#endif ++ ++void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, ++ struct inode *h_inode, unsigned int flags); ++ ++void au_update_iigen(struct inode *inode); ++void au_update_ibrange(struct inode *inode, int do_put_zero); ++ ++void au_icntnr_init_once(void *_c); ++int au_iinfo_init(struct inode *inode); ++void au_iinfo_fin(struct inode *inode); ++int au_ii_realloc(struct au_iinfo *iinfo, int nbr); ++ ++#ifdef CONFIG_PROC_FS ++/* plink.c */ ++int au_plink_maint(struct super_block *sb, int flags); ++void au_plink_maint_leave(struct au_sbinfo *sbinfo); ++int au_plink_maint_enter(struct super_block *sb); ++#ifdef CONFIG_AUFS_DEBUG ++void au_plink_list(struct super_block *sb); ++#else ++AuStubVoid(au_plink_list, struct super_block *sb) ++#endif ++int au_plink_test(struct inode *inode); ++struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex); ++void au_plink_append(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_dentry); ++void au_plink_put(struct super_block *sb, int verbose); ++void au_plink_clean(struct super_block *sb, int verbose); ++void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id); ++#else ++AuStubInt0(au_plink_maint, struct super_block *sb, int 
flags); ++AuStubVoid(au_plink_maint_leave, struct au_sbinfo *sbinfo); ++AuStubInt0(au_plink_maint_enter, struct super_block *sb); ++AuStubVoid(au_plink_list, struct super_block *sb); ++AuStubInt0(au_plink_test, struct inode *inode); ++AuStub(struct dentry *, au_plink_lkup, return NULL, ++ struct inode *inode, aufs_bindex_t bindex); ++AuStubVoid(au_plink_append, struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_dentry); ++AuStubVoid(au_plink_put, struct super_block *sb, int verbose); ++AuStubVoid(au_plink_clean, struct super_block *sb, int verbose); ++AuStubVoid(au_plink_half_refresh, struct super_block *sb, aufs_bindex_t br_id); ++#endif /* CONFIG_PROC_FS */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock subclass for iinfo */ ++enum { ++ AuLsc_II_CHILD, /* child first */ ++ AuLsc_II_CHILD2, /* rename(2), link(2), and cpup at hnotify */ ++ AuLsc_II_CHILD3, /* copyup dirs */ ++ AuLsc_II_PARENT, /* see AuLsc_I_PARENT in vfsub.h */ ++ AuLsc_II_PARENT2, ++ AuLsc_II_PARENT3, /* copyup dirs */ ++ AuLsc_II_NEW_CHILD ++}; ++ ++/* ++ * ii_read_lock_child, ii_write_lock_child, ++ * ii_read_lock_child2, ii_write_lock_child2, ++ * ii_read_lock_child3, ii_write_lock_child3, ++ * ii_read_lock_parent, ii_write_lock_parent, ++ * ii_read_lock_parent2, ii_write_lock_parent2, ++ * ii_read_lock_parent3, ii_write_lock_parent3, ++ * ii_read_lock_new_child, ii_write_lock_new_child, ++ */ ++#define AuReadLockFunc(name, lsc) \ ++static inline void ii_read_lock_##name(struct inode *i) \ ++{ \ ++ au_rw_read_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ ++} ++ ++#define AuWriteLockFunc(name, lsc) \ ++static inline void ii_write_lock_##name(struct inode *i) \ ++{ \ ++ au_rw_write_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ ++} ++ ++#define AuRWLockFuncs(name, lsc) \ ++ AuReadLockFunc(name, lsc) \ ++ AuWriteLockFunc(name, lsc) ++ ++AuRWLockFuncs(child, CHILD); ++AuRWLockFuncs(child2, CHILD2); ++AuRWLockFuncs(child3, CHILD3); ++AuRWLockFuncs(parent, PARENT); ++AuRWLockFuncs(parent2, PARENT2); ++AuRWLockFuncs(parent3, PARENT3); ++AuRWLockFuncs(new_child, NEW_CHILD); ++ ++#undef AuReadLockFunc ++#undef AuWriteLockFunc ++#undef AuRWLockFuncs ++ ++/* ++ * ii_read_unlock, ii_write_unlock, ii_downgrade_lock ++ */ ++AuSimpleUnlockRwsemFuncs(ii, struct inode *i, &au_ii(i)->ii_rwsem); ++ ++#define IiMustNoWaiters(i) AuRwMustNoWaiters(&au_ii(i)->ii_rwsem) ++#define IiMustAnyLock(i) AuRwMustAnyLock(&au_ii(i)->ii_rwsem) ++#define IiMustWriteLock(i) AuRwMustWriteLock(&au_ii(i)->ii_rwsem) ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline void au_icntnr_init(struct au_icntnr *c) ++{ ++#ifdef CONFIG_AUFS_DEBUG ++ c->vfs_inode.i_mode = 0; ++#endif ++} ++ ++static inline unsigned int au_iigen(struct inode *inode) ++{ ++ return atomic_read(&au_ii(inode)->ii_generation); ++} ++ ++/* tiny test for inode number */ ++/* tmpfs generation is too rough */ ++static inline int au_test_higen(struct inode *inode, struct inode *h_inode) ++{ ++ struct au_iinfo *iinfo; ++ ++ iinfo = au_ii(inode); ++ AuRwMustAnyLock(&iinfo->ii_rwsem); ++ return !(iinfo->ii_hsb1 == h_inode->i_sb ++ && iinfo->ii_higen == h_inode->i_generation); ++} ++ ++static inline void au_iigen_dec(struct inode *inode) ++{ ++ atomic_dec(&au_ii(inode)->ii_generation); ++} ++ ++static inline int au_iigen_test(struct inode *inode, unsigned int sigen) ++{ ++ int err; ++ ++ err = 0; ++ if (unlikely(inode && au_iigen(inode) != sigen)) ++ err = -EIO; ++ ++ return err; 
++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline aufs_bindex_t au_ii_br_id(struct inode *inode, ++ aufs_bindex_t bindex) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_hinode[0 + bindex].hi_id; ++} ++ ++static inline aufs_bindex_t au_ibstart(struct inode *inode) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_bstart; ++} ++ ++static inline aufs_bindex_t au_ibend(struct inode *inode) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_bend; ++} ++ ++static inline struct au_vdir *au_ivdir(struct inode *inode) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_vdir; ++} ++ ++static inline struct dentry *au_hi_wh(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_hinode[0 + bindex].hi_whdentry; ++} ++ ++static inline void au_set_ibstart(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustWriteLock(inode); ++ au_ii(inode)->ii_bstart = bindex; ++} ++ ++static inline void au_set_ibend(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustWriteLock(inode); ++ au_ii(inode)->ii_bend = bindex; ++} ++ ++static inline void au_set_ivdir(struct inode *inode, struct au_vdir *vdir) ++{ ++ IiMustWriteLock(inode); ++ au_ii(inode)->ii_vdir = vdir; ++} ++ ++static inline struct au_hinode *au_hi(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_hinode + bindex; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct dentry *au_pinned_parent(struct au_pin *pin) ++{ ++ if (pin) ++ return pin->parent; ++ return NULL; ++} ++ ++static inline struct inode *au_pinned_h_dir(struct au_pin *pin) ++{ ++ if (pin && pin->hdir) ++ return pin->hdir->hi_inode; ++ return NULL; ++} ++ ++static inline struct au_hinode *au_pinned_hdir(struct au_pin *pin) ++{ ++ if (pin) ++ return pin->hdir; ++ return NULL; ++} ++ ++static inline void au_pin_set_dentry(struct au_pin *pin, struct dentry *dentry) ++{ ++ if (pin) ++ pin->dentry = dentry; ++} ++ ++static inline void au_pin_set_parent_lflag(struct au_pin *pin, ++ unsigned char lflag) ++{ ++ if (pin) { ++ if (lflag) ++ au_fset_pin(pin->flags, DI_LOCKED); ++ else ++ au_fclr_pin(pin->flags, DI_LOCKED); ++ } ++} ++ ++static inline void au_pin_set_parent(struct au_pin *pin, struct dentry *parent) ++{ ++ if (pin) { ++ dput(pin->parent); ++ pin->parent = dget(parent); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_branch; ++#ifdef CONFIG_AUFS_HNOTIFY ++struct au_hnotify_op { ++ void (*ctl)(struct au_hinode *hinode, int do_set); ++ int (*alloc)(struct au_hinode *hinode); ++ void (*free)(struct au_hinode *hinode); ++ ++ void (*fin)(void); ++ int (*init)(void); ++ ++ int (*reset_br)(unsigned int udba, struct au_branch *br, int perm); ++ void (*fin_br)(struct au_branch *br); ++ int (*init_br)(struct au_branch *br, int perm); ++}; ++ ++/* hnotify.c */ ++int au_hn_alloc(struct au_hinode *hinode, struct inode *inode); ++void au_hn_free(struct au_hinode *hinode); ++void au_hn_ctl(struct au_hinode *hinode, int do_set); ++void au_hn_reset(struct inode *inode, unsigned int flags); ++int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask, ++ struct qstr *h_child_qstr, struct inode *h_child_inode); ++int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm); ++int au_hnotify_init_br(struct au_branch *br, int perm); ++void au_hnotify_fin_br(struct au_branch *br); ++int __init 
au_hnotify_init(void); ++void au_hnotify_fin(void); ++ ++/* hfsnotify.c */ ++extern const struct au_hnotify_op au_hnotify_op; ++ ++static inline ++void au_hn_init(struct au_hinode *hinode) ++{ ++ hinode->hi_notify = NULL; ++} ++ ++static inline struct au_hnotify *au_hn(struct au_hinode *hinode) ++{ ++ return hinode->hi_notify; ++} ++ ++#else ++static inline ++int au_hn_alloc(struct au_hinode *hinode __maybe_unused, ++ struct inode *inode __maybe_unused) ++{ ++ return -EOPNOTSUPP; ++} ++ ++static inline struct au_hnotify *au_hn(struct au_hinode *hinode) ++{ ++ return NULL; ++} ++ ++AuStubVoid(au_hn_free, struct au_hinode *hinode __maybe_unused) ++AuStubVoid(au_hn_ctl, struct au_hinode *hinode __maybe_unused, ++ int do_set __maybe_unused) ++AuStubVoid(au_hn_reset, struct inode *inode __maybe_unused, ++ unsigned int flags __maybe_unused) ++AuStubInt0(au_hnotify_reset_br, unsigned int udba __maybe_unused, ++ struct au_branch *br __maybe_unused, ++ int perm __maybe_unused) ++AuStubInt0(au_hnotify_init_br, struct au_branch *br __maybe_unused, ++ int perm __maybe_unused) ++AuStubVoid(au_hnotify_fin_br, struct au_branch *br __maybe_unused) ++AuStubInt0(__init au_hnotify_init, void) ++AuStubVoid(au_hnotify_fin, void) ++AuStubVoid(au_hn_init, struct au_hinode *hinode __maybe_unused) ++#endif /* CONFIG_AUFS_HNOTIFY */ ++ ++static inline void au_hn_suspend(struct au_hinode *hdir) ++{ ++ au_hn_ctl(hdir, /*do_set*/0); ++} ++ ++static inline void au_hn_resume(struct au_hinode *hdir) ++{ ++ au_hn_ctl(hdir, /*do_set*/1); ++} ++ ++static inline void au_hn_imtx_lock(struct au_hinode *hdir) ++{ ++ mutex_lock(&hdir->hi_inode->i_mutex); ++ au_hn_suspend(hdir); ++} ++ ++static inline void au_hn_imtx_lock_nested(struct au_hinode *hdir, ++ unsigned int sc __maybe_unused) ++{ ++ mutex_lock_nested(&hdir->hi_inode->i_mutex, sc); ++ au_hn_suspend(hdir); ++} ++ ++static inline void au_hn_imtx_unlock(struct au_hinode *hdir) ++{ ++ au_hn_resume(hdir); ++ mutex_unlock(&hdir->hi_inode->i_mutex); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_INODE_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/ioctl.c linux-3.2.0-gentoo-r1/fs/aufs/ioctl.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/ioctl.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/ioctl.c 2012-01-17 12:11:24.803360540 +0100 +@@ -0,0 +1,196 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * ioctl ++ * plink-management and readdir in userspace. ++ * assist the pathconf(3) wrapper library. 
++ */ ++ ++#include "aufs.h" ++ ++static int au_wbr_fd(struct path *path, struct aufs_wbr_fd __user *arg) ++{ ++ int err, fd; ++ aufs_bindex_t wbi, bindex, bend; ++ struct file *h_file; ++ struct super_block *sb; ++ struct dentry *root; ++ struct au_branch *br; ++ struct aufs_wbr_fd wbrfd = { ++ .oflags = au_dir_roflags, ++ .brid = -1 ++ }; ++ const int valid = O_RDONLY | O_NONBLOCK | O_LARGEFILE | O_DIRECTORY ++ | O_NOATIME | O_CLOEXEC; ++ ++ AuDebugOn(wbrfd.oflags & ~valid); ++ ++ if (arg) { ++ err = copy_from_user(&wbrfd, arg, sizeof(wbrfd)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ goto out; ++ } ++ ++ err = -EINVAL; ++ AuDbg("wbrfd{0%o, %d}\n", wbrfd.oflags, wbrfd.brid); ++ wbrfd.oflags |= au_dir_roflags; ++ AuDbg("0%o\n", wbrfd.oflags); ++ if (unlikely(wbrfd.oflags & ~valid)) ++ goto out; ++ } ++ ++ fd = get_unused_fd(); ++ err = fd; ++ if (unlikely(fd < 0)) ++ goto out; ++ ++ h_file = ERR_PTR(-EINVAL); ++ wbi = 0; ++ br = NULL; ++ sb = path->dentry->d_sb; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_IR); ++ bend = au_sbend(sb); ++ if (wbrfd.brid >= 0) { ++ wbi = au_br_index(sb, wbrfd.brid); ++ if (unlikely(wbi < 0 || wbi > bend)) ++ goto out_unlock; ++ } ++ ++ h_file = ERR_PTR(-ENOENT); ++ br = au_sbr(sb, wbi); ++ if (!au_br_writable(br->br_perm)) { ++ if (arg) ++ goto out_unlock; ++ ++ bindex = wbi + 1; ++ wbi = -1; ++ for (; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (au_br_writable(br->br_perm)) { ++ wbi = bindex; ++ br = au_sbr(sb, wbi); ++ break; ++ } ++ } ++ } ++ AuDbg("wbi %d\n", wbi); ++ if (wbi >= 0) ++ h_file = au_h_open(root, wbi, wbrfd.oflags, NULL); ++ ++out_unlock: ++ aufs_read_unlock(root, AuLock_IR); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out_fd; ++ ++ atomic_dec(&br->br_count); /* cf. 
au_h_open() */ ++ fd_install(fd, h_file); ++ err = fd; ++ goto out; /* success */ ++ ++out_fd: ++ put_unused_fd(fd); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err; ++ ++ switch (cmd) { ++ case AUFS_CTL_RDU: ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_ioctl(file, cmd, arg); ++ break; ++ ++ case AUFS_CTL_WBR_FD: ++ err = au_wbr_fd(&file->f_path, (void __user *)arg); ++ break; ++ ++ case AUFS_CTL_IBUSY: ++ err = au_ibusy_ioctl(file, arg); ++ break; ++ ++ default: ++ /* do not call the lower */ ++ AuDbg("0x%x\n", cmd); ++ err = -ENOTTY; ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err; ++ ++ switch (cmd) { ++ case AUFS_CTL_WBR_FD: ++ err = au_wbr_fd(&file->f_path, (void __user *)arg); ++ break; ++ ++ default: ++ /* do not call the lower */ ++ AuDbg("0x%x\n", cmd); ++ err = -ENOTTY; ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++#ifdef CONFIG_COMPAT ++long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ long err; ++ ++ switch (cmd) { ++ case AUFS_CTL_RDU: ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_compat_ioctl(file, cmd, arg); ++ break; ++ ++ case AUFS_CTL_IBUSY: ++ err = au_ibusy_compat_ioctl(file, arg); ++ break; ++ ++ default: ++ err = aufs_ioctl_dir(file, cmd, arg); ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++#if 0 /* unused yet */ ++long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ return aufs_ioctl_nondir(file, cmd, (unsigned long)compat_ptr(arg)); ++} ++#endif ++#endif +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/i_op_add.c linux-3.2.0-gentoo-r1/fs/aufs/i_op_add.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/i_op_add.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/i_op_add.c 2012-01-17 12:11:24.757063826 +0100 +@@ -0,0 +1,711 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operations (add entry) ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * final procedure of adding a new entry, except link(2). ++ * remove whiteout, instantiate, copyup the parent dir's times and size ++ * and update version. ++ * if it failed, re-create the removed whiteout. 
++ */ ++static int epilog(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct dentry *dentry) ++{ ++ int err, rerr; ++ aufs_bindex_t bwh; ++ struct path h_path; ++ struct inode *inode, *h_dir; ++ struct dentry *wh; ++ ++ bwh = -1; ++ if (wh_dentry) { ++ h_dir = wh_dentry->d_parent->d_inode; /* dir inode is locked */ ++ IMustLock(h_dir); ++ AuDebugOn(au_h_iptr(dir, bindex) != h_dir); ++ bwh = au_dbwh(dentry); ++ h_path.dentry = wh_dentry; ++ h_path.mnt = au_sbr_mnt(dir->i_sb, bindex); ++ err = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, ++ dentry); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ inode = au_new_inode(dentry, /*must_new*/1); ++ if (!IS_ERR(inode)) { ++ d_instantiate(dentry, inode); ++ dir = dentry->d_parent->d_inode; /* dir inode is locked */ ++ IMustLock(dir); ++ if (au_ibstart(dir) == au_dbstart(dentry)) ++ au_cpup_attr_timesizes(dir); ++ dir->i_version++; ++ return 0; /* success */ ++ } ++ ++ err = PTR_ERR(inode); ++ if (!wh_dentry) ++ goto out; ++ ++ /* revert */ ++ /* dir inode is locked */ ++ wh = au_wh_create(dentry, bwh, wh_dentry->d_parent); ++ rerr = PTR_ERR(wh); ++ if (IS_ERR(wh)) { ++ AuIOErr("%.*s reverting whiteout failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } else ++ dput(wh); ++ ++out: ++ return err; ++} ++ ++static int au_d_may_add(struct dentry *dentry) ++{ ++ int err; ++ ++ err = 0; ++ if (unlikely(d_unhashed(dentry))) ++ err = -ENOENT; ++ if (unlikely(dentry->d_inode)) ++ err = -EEXIST; ++ return err; ++} ++ ++/* ++ * simple tests for the adding inode operations. ++ * following the checks in vfs, plus the parent-child relationship. ++ */ ++int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir) ++{ ++ int err; ++ umode_t h_mode; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ ++ err = -ENAMETOOLONG; ++ if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) ++ goto out; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ h_inode = h_dentry->d_inode; ++ if (!dentry->d_inode) { ++ err = -EEXIST; ++ if (unlikely(h_inode)) ++ goto out; ++ } else { ++ /* rename(2) case */ ++ err = -EIO; ++ if (unlikely(!h_inode || !h_inode->i_nlink)) ++ goto out; ++ ++ h_mode = h_inode->i_mode; ++ if (!isdir) { ++ err = -EISDIR; ++ if (unlikely(S_ISDIR(h_mode))) ++ goto out; ++ } else if (unlikely(!S_ISDIR(h_mode))) { ++ err = -ENOTDIR; ++ goto out; ++ } ++ } ++ ++ err = 0; ++ /* expected parent dir is locked */ ++ if (unlikely(h_parent != h_dentry->d_parent)) ++ err = -EIO; ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ++ * initial procedure of adding a new entry. ++ * prepare writable branch and the parent dir, lock it, ++ * and lookup whiteout for the new entry. 
++ */ ++static struct dentry* ++lock_hdir_lkup_wh(struct dentry *dentry, struct au_dtime *dt, ++ struct dentry *src_dentry, struct au_pin *pin, ++ struct au_wr_dir_args *wr_dir_args) ++{ ++ struct dentry *wh_dentry, *h_parent; ++ struct super_block *sb; ++ struct au_branch *br; ++ int err; ++ unsigned int udba; ++ aufs_bindex_t bcpup; ++ ++ AuDbg("%.*s\n", AuDLNPair(dentry)); ++ ++ err = au_wr_dir(dentry, src_dentry, wr_dir_args); ++ bcpup = err; ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err < 0)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ udba = au_opt_udba(sb); ++ err = au_pin(pin, dentry, bcpup, udba, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ h_parent = au_pinned_h_parent(pin); ++ if (udba != AuOpt_UDBA_NONE ++ && au_dbstart(dentry) == bcpup) ++ err = au_may_add(dentry, bcpup, h_parent, ++ au_ftest_wrdir(wr_dir_args->flags, ISDIR)); ++ else if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) ++ err = -ENAMETOOLONG; ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_unpin; ++ ++ br = au_sbr(sb, bcpup); ++ if (dt) { ++ struct path tmp = { ++ .dentry = h_parent, ++ .mnt = br->br_mnt ++ }; ++ au_dtime_store(dt, au_pinned_parent(pin), &tmp); ++ } ++ ++ wh_dentry = NULL; ++ if (bcpup != au_dbwh(dentry)) ++ goto out; /* success */ ++ ++ wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, br); ++ ++out_unpin: ++ if (IS_ERR(wh_dentry)) ++ au_unpin(pin); ++out: ++ return wh_dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++enum { Mknod, Symlink, Creat }; ++struct simple_arg { ++ int type; ++ union { ++ struct { ++ int mode; ++ struct nameidata *nd; ++ } c; ++ struct { ++ const char *symname; ++ } s; ++ struct { ++ int mode; ++ dev_t dev; ++ } m; ++ } u; ++}; ++ ++static int add_simple(struct inode *dir, struct dentry *dentry, ++ struct simple_arg *arg) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ unsigned char created; ++ struct au_dtime dt; ++ struct au_pin pin; ++ struct path h_path; ++ struct dentry *wh_dentry, *parent; ++ struct inode *h_dir; ++ struct au_wr_dir_args wr_dir_args = { ++ .force_btgt = -1, ++ .flags = AuWrDir_ADD_ENTRY ++ }; ++ ++ AuDbg("%.*s\n", AuDLNPair(dentry)); ++ IMustLock(dir); ++ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); ++ if (unlikely(err)) ++ goto out; ++ err = au_d_may_add(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ di_write_lock_parent(parent); ++ wh_dentry = lock_hdir_lkup_wh(dentry, &dt, /*src_dentry*/NULL, &pin, ++ &wr_dir_args); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ bstart = au_dbstart(dentry); ++ h_path.dentry = au_h_dptr(dentry, bstart); ++ h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart); ++ h_dir = au_pinned_h_dir(&pin); ++ switch (arg->type) { ++ case Creat: ++ err = vfsub_create(h_dir, &h_path, arg->u.c.mode); ++ break; ++ case Symlink: ++ err = vfsub_symlink(h_dir, &h_path, arg->u.s.symname); ++ break; ++ case Mknod: ++ err = vfsub_mknod(h_dir, &h_path, arg->u.m.mode, arg->u.m.dev); ++ break; ++ default: ++ BUG(); ++ } ++ created = !err; ++ if (!err) ++ err = epilog(dir, bstart, wh_dentry, dentry); ++ ++ /* revert */ ++ if (unlikely(created && err && h_path.dentry->d_inode)) { ++ int rerr; ++ rerr = vfsub_unlink(h_dir, &h_path, /*force*/0); ++ if (rerr) { ++ AuIOErr("%.*s revert failure(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } ++ au_dtime_revert(&dt); ++ } ++ ++ au_unpin(&pin); ++ 
dput(wh_dentry); ++ ++out_parent: ++ di_write_unlock(parent); ++out_unlock: ++ if (unlikely(err)) { ++ au_update_dbstart(dentry); ++ d_drop(dentry); ++ } ++ aufs_read_unlock(dentry, AuLock_DW); ++out: ++ return err; ++} ++ ++int aufs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) ++{ ++ struct simple_arg arg = { ++ .type = Mknod, ++ .u.m = { ++ .mode = mode, ++ .dev = dev ++ } ++ }; ++ return add_simple(dir, dentry, &arg); ++} ++ ++int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) ++{ ++ struct simple_arg arg = { ++ .type = Symlink, ++ .u.s.symname = symname ++ }; ++ return add_simple(dir, dentry, &arg); ++} ++ ++int aufs_create(struct inode *dir, struct dentry *dentry, int mode, ++ struct nameidata *nd) ++{ ++ struct simple_arg arg = { ++ .type = Creat, ++ .u.c = { ++ .mode = mode, ++ .nd = nd ++ } ++ }; ++ return add_simple(dir, dentry, &arg); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_link_args { ++ aufs_bindex_t bdst, bsrc; ++ struct au_pin pin; ++ struct path h_path; ++ struct dentry *src_parent, *parent; ++}; ++ ++static int au_cpup_before_link(struct dentry *src_dentry, ++ struct au_link_args *a) ++{ ++ int err; ++ struct dentry *h_src_dentry; ++ struct mutex *h_mtx; ++ struct file *h_file; ++ ++ di_read_lock_parent(a->src_parent, AuLock_IR); ++ err = au_test_and_cpup_dirs(src_dentry, a->bdst); ++ if (unlikely(err)) ++ goto out; ++ ++ h_src_dentry = au_h_dptr(src_dentry, a->bsrc); ++ h_mtx = &h_src_dentry->d_inode->i_mutex; ++ err = au_pin(&a->pin, src_dentry, a->bdst, ++ au_opt_udba(src_dentry->d_sb), ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ h_file = au_h_open_pre(src_dentry, a->bsrc); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ h_file = NULL; ++ } else ++ err = au_sio_cpup_simple(src_dentry, a->bdst, -1, ++ AuCpup_DTIME /* | AuCpup_KEEPLINO */); ++ mutex_unlock(h_mtx); ++ au_h_open_post(src_dentry, a->bsrc, h_file); ++ au_unpin(&a->pin); ++ ++out: ++ di_read_unlock(a->src_parent, AuLock_IR); ++ return err; ++} ++ ++static int au_cpup_or_link(struct dentry *src_dentry, struct au_link_args *a) ++{ ++ int err; ++ unsigned char plink; ++ struct inode *h_inode, *inode; ++ struct dentry *h_src_dentry; ++ struct super_block *sb; ++ struct file *h_file; ++ ++ plink = 0; ++ h_inode = NULL; ++ sb = src_dentry->d_sb; ++ inode = src_dentry->d_inode; ++ if (au_ibstart(inode) <= a->bdst) ++ h_inode = au_h_iptr(inode, a->bdst); ++ if (!h_inode || !h_inode->i_nlink) { ++ /* copyup src_dentry as the name of dentry. 
*/ ++ au_set_dbstart(src_dentry, a->bdst); ++ au_set_h_dptr(src_dentry, a->bdst, dget(a->h_path.dentry)); ++ h_inode = au_h_dptr(src_dentry, a->bsrc)->d_inode; ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ h_file = au_h_open_pre(src_dentry, a->bsrc); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ h_file = NULL; ++ } else ++ err = au_sio_cpup_single(src_dentry, a->bdst, a->bsrc, ++ -1, AuCpup_KEEPLINO, ++ a->parent); ++ mutex_unlock(&h_inode->i_mutex); ++ au_h_open_post(src_dentry, a->bsrc, h_file); ++ au_set_h_dptr(src_dentry, a->bdst, NULL); ++ au_set_dbstart(src_dentry, a->bsrc); ++ } else { ++ /* the inode of src_dentry already exists on a.bdst branch */ ++ h_src_dentry = d_find_alias(h_inode); ++ if (!h_src_dentry && au_plink_test(inode)) { ++ plink = 1; ++ h_src_dentry = au_plink_lkup(inode, a->bdst); ++ err = PTR_ERR(h_src_dentry); ++ if (IS_ERR(h_src_dentry)) ++ goto out; ++ ++ if (unlikely(!h_src_dentry->d_inode)) { ++ dput(h_src_dentry); ++ h_src_dentry = NULL; ++ } ++ ++ } ++ if (h_src_dentry) { ++ err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), ++ &a->h_path); ++ dput(h_src_dentry); ++ } else { ++ AuIOErr("no dentry found for hi%lu on b%d\n", ++ h_inode->i_ino, a->bdst); ++ err = -EIO; ++ } ++ } ++ ++ if (!err && !plink) ++ au_plink_append(inode, a->bdst, a->h_path.dentry); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int aufs_link(struct dentry *src_dentry, struct inode *dir, ++ struct dentry *dentry) ++{ ++ int err, rerr; ++ struct au_dtime dt; ++ struct au_link_args *a; ++ struct dentry *wh_dentry, *h_src_dentry; ++ struct inode *inode; ++ struct super_block *sb; ++ struct au_wr_dir_args wr_dir_args = { ++ /* .force_btgt = -1, */ ++ .flags = AuWrDir_ADD_ENTRY ++ }; ++ ++ IMustLock(dir); ++ inode = src_dentry->d_inode; ++ IMustLock(inode); ++ ++ err = -ENOMEM; ++ a = kzalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ a->parent = dentry->d_parent; /* dir inode is locked */ ++ err = aufs_read_and_write_lock2(dentry, src_dentry, ++ AuLock_NOPLM | AuLock_GEN); ++ if (unlikely(err)) ++ goto out_kfree; ++ err = au_d_hashed_positive(src_dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ err = au_d_may_add(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ a->src_parent = dget_parent(src_dentry); ++ wr_dir_args.force_btgt = au_ibstart(inode); ++ ++ di_write_lock_parent(a->parent); ++ wr_dir_args.force_btgt = au_wbr(dentry, wr_dir_args.force_btgt); ++ wh_dentry = lock_hdir_lkup_wh(dentry, &dt, src_dentry, &a->pin, ++ &wr_dir_args); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ err = 0; ++ sb = dentry->d_sb; ++ a->bdst = au_dbstart(dentry); ++ a->h_path.dentry = au_h_dptr(dentry, a->bdst); ++ a->h_path.mnt = au_sbr_mnt(sb, a->bdst); ++ a->bsrc = au_ibstart(inode); ++ h_src_dentry = au_h_d_alias(src_dentry, a->bsrc); ++ if (!h_src_dentry) { ++ a->bsrc = au_dbstart(src_dentry); ++ h_src_dentry = au_h_d_alias(src_dentry, a->bsrc); ++ AuDebugOn(!h_src_dentry); ++ } else if (IS_ERR(h_src_dentry)) ++ goto out_parent; ++ ++ if (au_opt_test(au_mntflags(sb), PLINK)) { ++ if (a->bdst < a->bsrc ++ /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) ++ err = au_cpup_or_link(src_dentry, a); ++ else ++ err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), ++ &a->h_path); ++ dput(h_src_dentry); ++ } else { ++ /* ++ * copyup src_dentry to the branch we process, ++ * and then link(2) to it. 
++ */ ++ dput(h_src_dentry); ++ if (a->bdst < a->bsrc ++ /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) { ++ au_unpin(&a->pin); ++ di_write_unlock(a->parent); ++ err = au_cpup_before_link(src_dentry, a); ++ di_write_lock_parent(a->parent); ++ if (!err) ++ err = au_pin(&a->pin, dentry, a->bdst, ++ au_opt_udba(sb), ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out_wh; ++ } ++ if (!err) { ++ h_src_dentry = au_h_dptr(src_dentry, a->bdst); ++ err = -ENOENT; ++ if (h_src_dentry && h_src_dentry->d_inode) ++ err = vfsub_link(h_src_dentry, ++ au_pinned_h_dir(&a->pin), ++ &a->h_path); ++ } ++ } ++ if (unlikely(err)) ++ goto out_unpin; ++ ++ if (wh_dentry) { ++ a->h_path.dentry = wh_dentry; ++ err = au_wh_unlink_dentry(au_pinned_h_dir(&a->pin), &a->h_path, ++ dentry); ++ if (unlikely(err)) ++ goto out_revert; ++ } ++ ++ dir->i_version++; ++ if (au_ibstart(dir) == au_dbstart(dentry)) ++ au_cpup_attr_timesizes(dir); ++ inc_nlink(inode); ++ inode->i_ctime = dir->i_ctime; ++ d_instantiate(dentry, au_igrab(inode)); ++ if (d_unhashed(a->h_path.dentry)) ++ /* some filesystem calls d_drop() */ ++ d_drop(dentry); ++ goto out_unpin; /* success */ ++ ++out_revert: ++ rerr = vfsub_unlink(au_pinned_h_dir(&a->pin), &a->h_path, /*force*/0); ++ if (unlikely(rerr)) { ++ AuIOErr("%.*s reverting failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } ++ au_dtime_revert(&dt); ++out_unpin: ++ au_unpin(&a->pin); ++out_wh: ++ dput(wh_dentry); ++out_parent: ++ di_write_unlock(a->parent); ++ dput(a->src_parent); ++out_unlock: ++ if (unlikely(err)) { ++ au_update_dbstart(dentry); ++ d_drop(dentry); ++ } ++ aufs_read_and_write_unlock2(dentry, src_dentry); ++out_kfree: ++ kfree(a); ++out: ++ return err; ++} ++ ++int aufs_mkdir(struct inode *dir, struct dentry *dentry, int mode) ++{ ++ int err, rerr; ++ aufs_bindex_t bindex; ++ unsigned char diropq; ++ struct path h_path; ++ struct dentry *wh_dentry, *parent, *opq_dentry; ++ struct mutex *h_mtx; ++ struct super_block *sb; ++ struct { ++ struct au_pin pin; ++ struct au_dtime dt; ++ } *a; /* reduce the stack usage */ ++ struct au_wr_dir_args wr_dir_args = { ++ .force_btgt = -1, ++ .flags = AuWrDir_ADD_ENTRY | AuWrDir_ISDIR ++ }; ++ ++ IMustLock(dir); ++ ++ err = -ENOMEM; ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); ++ if (unlikely(err)) ++ goto out_free; ++ err = au_d_may_add(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_write_lock_parent(parent); ++ wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL, ++ &a->pin, &wr_dir_args); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ sb = dentry->d_sb; ++ bindex = au_dbstart(dentry); ++ h_path.dentry = au_h_dptr(dentry, bindex); ++ h_path.mnt = au_sbr_mnt(sb, bindex); ++ err = vfsub_mkdir(au_pinned_h_dir(&a->pin), &h_path, mode); ++ if (unlikely(err)) ++ goto out_unpin; ++ ++ /* make the dir opaque */ ++ diropq = 0; ++ h_mtx = &h_path.dentry->d_inode->i_mutex; ++ if (wh_dentry ++ || au_opt_test(au_mntflags(sb), ALWAYS_DIROPQ)) { ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ opq_dentry = au_diropq_create(dentry, bindex); ++ mutex_unlock(h_mtx); ++ err = PTR_ERR(opq_dentry); ++ if (IS_ERR(opq_dentry)) ++ goto out_dir; ++ dput(opq_dentry); ++ diropq = 1; ++ } ++ ++ err = epilog(dir, bindex, wh_dentry, dentry); ++ if (!err) { ++ inc_nlink(dir); ++ goto out_unpin; /* success */ ++ } ++ ++ /* 
revert */ ++ if (diropq) { ++ AuLabel(revert opq); ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ rerr = au_diropq_remove(dentry, bindex); ++ mutex_unlock(h_mtx); ++ if (rerr) { ++ AuIOErr("%.*s reverting diropq failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } ++ } ++ ++out_dir: ++ AuLabel(revert dir); ++ rerr = vfsub_rmdir(au_pinned_h_dir(&a->pin), &h_path); ++ if (rerr) { ++ AuIOErr("%.*s reverting dir failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } ++ au_dtime_revert(&a->dt); ++out_unpin: ++ au_unpin(&a->pin); ++ dput(wh_dentry); ++out_parent: ++ di_write_unlock(parent); ++out_unlock: ++ if (unlikely(err)) { ++ au_update_dbstart(dentry); ++ d_drop(dentry); ++ } ++ aufs_read_unlock(dentry, AuLock_DW); ++out_free: ++ kfree(a); ++out: ++ return err; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/i_op.c linux-3.2.0-gentoo-r1/fs/aufs/i_op.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/i_op.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/i_op.c 2012-01-17 12:11:24.736230305 +0100 +@@ -0,0 +1,992 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operations (except add/del/rename) ++ */ ++ ++#include ++#include ++#include ++#include ++#include "aufs.h" ++ ++static int h_permission(struct inode *h_inode, int mask, ++ struct vfsmount *h_mnt, int brperm) ++{ ++ int err; ++ const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); ++ ++ err = -EACCES; ++ if ((write_mask && IS_IMMUTABLE(h_inode)) ++ || ((mask & MAY_EXEC) ++ && S_ISREG(h_inode->i_mode) ++ && ((h_mnt->mnt_flags & MNT_NOEXEC) ++ || !(h_inode->i_mode & S_IXUGO)))) ++ goto out; ++ ++ /* ++ * - skip the lower fs test in the case of write to ro branch. ++ * - nfs dir permission write check is optimized, but a policy for ++ * link/rename requires a real check. ++ */ ++ if ((write_mask && !au_br_writable(brperm)) ++ || (au_test_nfs(h_inode->i_sb) && S_ISDIR(h_inode->i_mode) ++ && write_mask && !(mask & MAY_READ)) ++ || !h_inode->i_op->permission) { ++ /* AuLabel(generic_permission); */ ++ err = generic_permission(h_inode, mask); ++ } else { ++ /* AuLabel(h_inode->permission); */ ++ err = h_inode->i_op->permission(h_inode, mask); ++ AuTraceErr(err); ++ } ++ ++ if (!err) ++ err = devcgroup_inode_permission(h_inode, mask); ++ if (!err) ++ err = security_inode_permission(h_inode, mask); ++ ++#if 0 ++ if (!err) { ++ /* todo: do we need to call ima_path_check()? 
*/ ++ struct path h_path = { ++ .dentry = ++ .mnt = h_mnt ++ }; ++ err = ima_path_check(&h_path, ++ mask & (MAY_READ | MAY_WRITE | MAY_EXEC), ++ IMA_COUNT_LEAVE); ++ } ++#endif ++ ++out: ++ return err; ++} ++ ++static int aufs_permission(struct inode *inode, int mask) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ const unsigned char isdir = !!S_ISDIR(inode->i_mode), ++ write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); ++ struct inode *h_inode; ++ struct super_block *sb; ++ struct au_branch *br; ++ ++ /* todo: support rcu-walk? */ ++ if (mask & MAY_NOT_BLOCK) ++ return -ECHILD; ++ ++ sb = inode->i_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ ii_read_lock_child(inode); ++#if 0 ++ err = au_iigen_test(inode, au_sigen(sb)); ++ if (unlikely(err)) ++ goto out; ++#endif ++ ++ if (!isdir || write_mask) { ++ err = au_busy_or_stale(); ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ if (unlikely(!h_inode ++ || (h_inode->i_mode & S_IFMT) ++ != (inode->i_mode & S_IFMT))) ++ goto out; ++ ++ err = 0; ++ bindex = au_ibstart(inode); ++ br = au_sbr(sb, bindex); ++ err = h_permission(h_inode, mask, br->br_mnt, br->br_perm); ++ if (write_mask ++ && !err ++ && !special_file(h_inode->i_mode)) { ++ /* test whether the upper writable branch exists */ ++ err = -EROFS; ++ for (; bindex >= 0; bindex--) ++ if (!au_br_rdonly(au_sbr(sb, bindex))) { ++ err = 0; ++ break; ++ } ++ } ++ goto out; ++ } ++ ++ /* non-write to dir */ ++ err = 0; ++ bend = au_ibend(inode); ++ for (bindex = au_ibstart(inode); !err && bindex <= bend; bindex++) { ++ h_inode = au_h_iptr(inode, bindex); ++ if (h_inode) { ++ err = au_busy_or_stale(); ++ if (unlikely(!S_ISDIR(h_inode->i_mode))) ++ break; ++ ++ br = au_sbr(sb, bindex); ++ err = h_permission(h_inode, mask, br->br_mnt, ++ br->br_perm); ++ } ++ } ++ ++out: ++ ii_read_unlock(inode); ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct dentry *ret, *parent; ++ struct inode *inode; ++ struct super_block *sb; ++ int err, npositive, lc_idx; ++ ++ IMustLock(dir); ++ ++ sb = dir->i_sb; ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ ret = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ ret = ERR_PTR(-ENAMETOOLONG); ++ if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) ++ goto out_si; ++ err = au_di_init(dentry); ++ ret = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_si; ++ ++ inode = NULL; ++ npositive = 0; /* suppress a warning */ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_read_lock_parent(parent, AuLock_IR); ++ err = au_alive_dir(parent); ++ if (!err) ++ err = au_digen_test(parent, au_sigen(sb)); ++ if (!err) { ++ npositive = au_lkup_dentry(dentry, au_dbstart(parent), ++ /*type*/0, nd); ++ err = npositive; ++ } ++ di_read_unlock(parent, AuLock_IR); ++ ret = ERR_PTR(err); ++ if (unlikely(err < 0)) ++ goto out_unlock; ++ ++ if (npositive) { ++ inode = au_new_inode(dentry, /*must_new*/0); ++ ret = (void *)inode; ++ } ++ if (IS_ERR(inode)) { ++ inode = NULL; ++ goto out_unlock; ++ } ++ ++ ret = d_splice_alias(inode, dentry); ++ if (unlikely(IS_ERR(ret) && inode)) { ++ ii_write_unlock(inode); ++ lc_idx = AuLcNonDir_IIINFO; ++ if (S_ISLNK(inode->i_mode)) ++ lc_idx = AuLcSymlink_IIINFO; ++ else if (S_ISDIR(inode->i_mode)) ++ lc_idx = AuLcDir_IIINFO; ++ au_rw_class(&au_ii(inode)->ii_rwsem, au_lc_key + lc_idx); ++ iput(inode); ++ } ++ ++out_unlock: ++ 
di_write_unlock(dentry); ++ if (unlikely(IS_ERR(ret) && inode)) { ++ lc_idx = AuLcNonDir_DIINFO; ++ if (S_ISLNK(inode->i_mode)) ++ lc_idx = AuLcSymlink_DIINFO; ++ else if (S_ISDIR(inode->i_mode)) ++ lc_idx = AuLcDir_DIINFO; ++ au_rw_class(&au_di(dentry)->di_rwsem, au_lc_key + lc_idx); ++ } ++out_si: ++ si_read_unlock(sb); ++out: ++ return ret; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_wr_dir_cpup(struct dentry *dentry, struct dentry *parent, ++ const unsigned char add_entry, aufs_bindex_t bcpup, ++ aufs_bindex_t bstart) ++{ ++ int err; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ ++ if (add_entry) ++ IMustLock(parent->d_inode); ++ else ++ di_write_lock_parent(parent); ++ ++ err = 0; ++ if (!au_h_dptr(parent, bcpup)) { ++ if (bstart < bcpup) ++ err = au_cpdown_dirs(dentry, bcpup); ++ else ++ err = au_cpup_dirs(dentry, bcpup); ++ } ++ if (!err && add_entry) { ++ h_parent = au_h_dptr(parent, bcpup); ++ h_dir = h_parent->d_inode; ++ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); ++ err = au_lkup_neg(dentry, bcpup); ++ /* todo: no unlock here */ ++ mutex_unlock(&h_dir->i_mutex); ++ ++ AuDbg("bcpup %d\n", bcpup); ++ if (!err) { ++ if (!dentry->d_inode) ++ au_set_h_dptr(dentry, bstart, NULL); ++ au_update_dbrange(dentry, /*do_put_zero*/0); ++ } ++ } ++ ++ if (!add_entry) ++ di_write_unlock(parent); ++ if (!err) ++ err = bcpup; /* success */ ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ++ * decide the branch and the parent dir where we will create a new entry. ++ * returns new bindex or an error. ++ * copyup the parent dir if needed. ++ */ ++int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, ++ struct au_wr_dir_args *args) ++{ ++ int err; ++ aufs_bindex_t bcpup, bstart, src_bstart; ++ const unsigned char add_entry = !!au_ftest_wrdir(args->flags, ++ ADD_ENTRY); ++ struct super_block *sb; ++ struct dentry *parent; ++ struct au_sbinfo *sbinfo; ++ ++ sb = dentry->d_sb; ++ sbinfo = au_sbi(sb); ++ parent = dget_parent(dentry); ++ bstart = au_dbstart(dentry); ++ bcpup = bstart; ++ if (args->force_btgt < 0) { ++ if (src_dentry) { ++ src_bstart = au_dbstart(src_dentry); ++ if (src_bstart < bstart) ++ bcpup = src_bstart; ++ } else if (add_entry) { ++ err = AuWbrCreate(sbinfo, dentry, ++ au_ftest_wrdir(args->flags, ISDIR)); ++ bcpup = err; ++ } ++ ++ if (bcpup < 0 || au_test_ro(sb, bcpup, dentry->d_inode)) { ++ if (add_entry) ++ err = AuWbrCopyup(sbinfo, dentry); ++ else { ++ if (!IS_ROOT(dentry)) { ++ di_read_lock_parent(parent, !AuLock_IR); ++ err = AuWbrCopyup(sbinfo, dentry); ++ di_read_unlock(parent, !AuLock_IR); ++ } else ++ err = AuWbrCopyup(sbinfo, dentry); ++ } ++ bcpup = err; ++ if (unlikely(err < 0)) ++ goto out; ++ } ++ } else { ++ bcpup = args->force_btgt; ++ AuDebugOn(au_test_ro(sb, bcpup, dentry->d_inode)); ++ } ++ ++ AuDbg("bstart %d, bcpup %d\n", bstart, bcpup); ++ err = bcpup; ++ if (bcpup == bstart) ++ goto out; /* success */ ++ ++ /* copyup the new parent into the branch we process */ ++ err = au_wr_dir_cpup(dentry, parent, add_entry, bcpup, bstart); ++ if (err >= 0) { ++ if (!dentry->d_inode) { ++ au_set_h_dptr(dentry, bstart, NULL); ++ au_set_dbstart(dentry, bcpup); ++ au_set_dbend(dentry, bcpup); ++ } ++ AuDebugOn(add_entry && !au_h_dptr(dentry, bcpup)); ++ } ++ ++out: ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct dentry *au_pinned_h_parent(struct au_pin *pin) ++{ ++ if (pin && pin->parent) ++ return 
au_h_dptr(pin->parent, pin->bindex); ++ return NULL; ++} ++ ++void au_unpin(struct au_pin *p) ++{ ++ if (p->h_mnt && au_ftest_pin(p->flags, MNT_WRITE)) ++ mnt_drop_write(p->h_mnt); ++ if (!p->hdir) ++ return; ++ ++ au_hn_imtx_unlock(p->hdir); ++ if (!au_ftest_pin(p->flags, DI_LOCKED)) ++ di_read_unlock(p->parent, AuLock_IR); ++ iput(p->hdir->hi_inode); ++ dput(p->parent); ++ p->parent = NULL; ++ p->hdir = NULL; ++ p->h_mnt = NULL; ++} ++ ++int au_do_pin(struct au_pin *p) ++{ ++ int err; ++ struct super_block *sb; ++ struct dentry *h_dentry, *h_parent; ++ struct au_branch *br; ++ struct inode *h_dir; ++ ++ err = 0; ++ sb = p->dentry->d_sb; ++ br = au_sbr(sb, p->bindex); ++ if (IS_ROOT(p->dentry)) { ++ if (au_ftest_pin(p->flags, MNT_WRITE)) { ++ p->h_mnt = br->br_mnt; ++ err = mnt_want_write(p->h_mnt); ++ if (unlikely(err)) { ++ au_fclr_pin(p->flags, MNT_WRITE); ++ goto out_err; ++ } ++ } ++ goto out; ++ } ++ ++ h_dentry = NULL; ++ if (p->bindex <= au_dbend(p->dentry)) ++ h_dentry = au_h_dptr(p->dentry, p->bindex); ++ ++ p->parent = dget_parent(p->dentry); ++ if (!au_ftest_pin(p->flags, DI_LOCKED)) ++ di_read_lock(p->parent, AuLock_IR, p->lsc_di); ++ ++ h_dir = NULL; ++ h_parent = au_h_dptr(p->parent, p->bindex); ++ p->hdir = au_hi(p->parent->d_inode, p->bindex); ++ if (p->hdir) ++ h_dir = p->hdir->hi_inode; ++ ++ /* ++ * udba case, or ++ * if DI_LOCKED is not set, then p->parent may be different ++ * and h_parent can be NULL. ++ */ ++ if (unlikely(!p->hdir || !h_dir || !h_parent)) { ++ err = -EBUSY; ++ if (!au_ftest_pin(p->flags, DI_LOCKED)) ++ di_read_unlock(p->parent, AuLock_IR); ++ dput(p->parent); ++ p->parent = NULL; ++ goto out_err; ++ } ++ ++ au_igrab(h_dir); ++ au_hn_imtx_lock_nested(p->hdir, p->lsc_hi); ++ ++ if (unlikely(p->hdir->hi_inode != h_parent->d_inode)) { ++ err = -EBUSY; ++ goto out_unpin; ++ } ++ if (h_dentry) { ++ err = au_h_verify(h_dentry, p->udba, h_dir, h_parent, br); ++ if (unlikely(err)) { ++ au_fclr_pin(p->flags, MNT_WRITE); ++ goto out_unpin; ++ } ++ } ++ ++ if (au_ftest_pin(p->flags, MNT_WRITE)) { ++ p->h_mnt = br->br_mnt; ++ err = mnt_want_write(p->h_mnt); ++ if (unlikely(err)) { ++ au_fclr_pin(p->flags, MNT_WRITE); ++ goto out_unpin; ++ } ++ } ++ goto out; /* success */ ++ ++out_unpin: ++ au_unpin(p); ++out_err: ++ pr_err("err %d\n", err); ++ err = au_busy_or_stale(); ++out: ++ return err; ++} ++ ++void au_pin_init(struct au_pin *p, struct dentry *dentry, ++ aufs_bindex_t bindex, int lsc_di, int lsc_hi, ++ unsigned int udba, unsigned char flags) ++{ ++ p->dentry = dentry; ++ p->udba = udba; ++ p->lsc_di = lsc_di; ++ p->lsc_hi = lsc_hi; ++ p->flags = flags; ++ p->bindex = bindex; ++ ++ p->parent = NULL; ++ p->hdir = NULL; ++ p->h_mnt = NULL; ++} ++ ++int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int udba, unsigned char flags) ++{ ++ au_pin_init(pin, dentry, bindex, AuLsc_DI_PARENT, AuLsc_I_PARENT2, ++ udba, flags); ++ return au_do_pin(pin); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * ->setattr() and ->getattr() are called in various cases. ++ * chmod, stat: dentry is revalidated. ++ * fchmod, fstat: file and dentry are not revalidated, additionally they may be ++ * unhashed. ++ * for ->setattr(), ia->ia_file is passed from ftruncate only. 
++ */ ++/* todo: consolidate with do_refresh() and simple_reval_dpath() */ ++static int au_reval_for_attr(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ struct inode *inode; ++ struct dentry *parent; ++ ++ err = 0; ++ inode = dentry->d_inode; ++ if (au_digen_test(dentry, sigen)) { ++ parent = dget_parent(dentry); ++ di_read_lock_parent(parent, AuLock_IR); ++ err = au_refresh_dentry(dentry, parent); ++ di_read_unlock(parent, AuLock_IR); ++ dput(parent); ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++#define AuIcpup_DID_CPUP 1 ++#define au_ftest_icpup(flags, name) ((flags) & AuIcpup_##name) ++#define au_fset_icpup(flags, name) \ ++ do { (flags) |= AuIcpup_##name; } while (0) ++#define au_fclr_icpup(flags, name) \ ++ do { (flags) &= ~AuIcpup_##name; } while (0) ++ ++struct au_icpup_args { ++ unsigned char flags; ++ unsigned char pin_flags; ++ aufs_bindex_t btgt; ++ unsigned int udba; ++ struct au_pin pin; ++ struct path h_path; ++ struct inode *h_inode; ++}; ++ ++static int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia, ++ struct au_icpup_args *a) ++{ ++ int err; ++ loff_t sz; ++ aufs_bindex_t bstart, ibstart; ++ struct dentry *hi_wh, *parent; ++ struct inode *inode; ++ struct file *h_file; ++ struct au_wr_dir_args wr_dir_args = { ++ .force_btgt = -1, ++ .flags = 0 ++ }; ++ ++ bstart = au_dbstart(dentry); ++ inode = dentry->d_inode; ++ if (S_ISDIR(inode->i_mode)) ++ au_fset_wrdir(wr_dir_args.flags, ISDIR); ++ /* plink or hi_wh() case */ ++ ibstart = au_ibstart(inode); ++ if (bstart != ibstart && !au_test_ro(inode->i_sb, ibstart, inode)) ++ wr_dir_args.force_btgt = ibstart; ++ err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); ++ if (unlikely(err < 0)) ++ goto out; ++ a->btgt = err; ++ if (err != bstart) ++ au_fset_icpup(a->flags, DID_CPUP); ++ ++ err = 0; ++ a->pin_flags = AuPin_MNT_WRITE; ++ parent = NULL; ++ if (!IS_ROOT(dentry)) { ++ au_fset_pin(a->pin_flags, DI_LOCKED); ++ parent = dget_parent(dentry); ++ di_write_lock_parent(parent); ++ } ++ ++ err = au_pin(&a->pin, dentry, a->btgt, a->udba, a->pin_flags); ++ if (unlikely(err)) ++ goto out_parent; ++ ++ a->h_path.dentry = au_h_dptr(dentry, bstart); ++ a->h_inode = a->h_path.dentry->d_inode; ++ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); ++ sz = -1; ++ if ((ia->ia_valid & ATTR_SIZE) && ia->ia_size < i_size_read(a->h_inode)) ++ sz = ia->ia_size; ++ ++ h_file = NULL; ++ hi_wh = NULL; ++ if (au_ftest_icpup(a->flags, DID_CPUP) && d_unlinked(dentry)) { ++ hi_wh = au_hi_wh(inode, a->btgt); ++ if (!hi_wh) { ++ err = au_sio_cpup_wh(dentry, a->btgt, sz, /*file*/NULL); ++ if (unlikely(err)) ++ goto out_unlock; ++ hi_wh = au_hi_wh(inode, a->btgt); ++ /* todo: revalidate hi_wh? 
*/ ++ } ++ } ++ ++ if (parent) { ++ au_pin_set_parent_lflag(&a->pin, /*lflag*/0); ++ di_downgrade_lock(parent, AuLock_IR); ++ dput(parent); ++ parent = NULL; ++ } ++ if (!au_ftest_icpup(a->flags, DID_CPUP)) ++ goto out; /* success */ ++ ++ if (!d_unhashed(dentry)) { ++ h_file = au_h_open_pre(dentry, bstart); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ h_file = NULL; ++ } else ++ err = au_sio_cpup_simple(dentry, a->btgt, sz, ++ AuCpup_DTIME); ++ if (!err) ++ a->h_path.dentry = au_h_dptr(dentry, a->btgt); ++ } else if (!hi_wh) ++ a->h_path.dentry = au_h_dptr(dentry, a->btgt); ++ else ++ a->h_path.dentry = hi_wh; /* do not dget here */ ++ ++out_unlock: ++ mutex_unlock(&a->h_inode->i_mutex); ++ au_h_open_post(dentry, bstart, h_file); ++ a->h_inode = a->h_path.dentry->d_inode; ++ if (!err) { ++ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); ++ goto out; /* success */ ++ } ++ ++ au_unpin(&a->pin); ++out_parent: ++ if (parent) { ++ di_write_unlock(parent); ++ dput(parent); ++ } ++out: ++ return err; ++} ++ ++static int aufs_setattr(struct dentry *dentry, struct iattr *ia) ++{ ++ int err; ++ struct inode *inode; ++ struct super_block *sb; ++ struct file *file; ++ struct au_icpup_args *a; ++ ++ inode = dentry->d_inode; ++ IMustLock(inode); ++ ++ err = -ENOMEM; ++ a = kzalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) ++ ia->ia_valid &= ~ATTR_MODE; ++ ++ file = NULL; ++ sb = dentry->d_sb; ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out_kfree; ++ ++ if (ia->ia_valid & ATTR_FILE) { ++ /* currently ftruncate(2) only */ ++ AuDebugOn(!S_ISREG(inode->i_mode)); ++ file = ia->ia_file; ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out_si; ++ ia->ia_file = au_hf_top(file); ++ a->udba = AuOpt_UDBA_NONE; ++ } else { ++ /* fchmod() doesn't pass ia_file */ ++ a->udba = au_opt_udba(sb); ++ di_write_lock_child(dentry); ++ /* no d_unlinked(), to set UDBA_NONE for root */ ++ if (d_unhashed(dentry)) ++ a->udba = AuOpt_UDBA_NONE; ++ if (a->udba != AuOpt_UDBA_NONE) { ++ AuDebugOn(IS_ROOT(dentry)); ++ err = au_reval_for_attr(dentry, au_sigen(sb)); ++ if (unlikely(err)) ++ goto out_dentry; ++ } ++ } ++ ++ err = au_pin_and_icpup(dentry, ia, a); ++ if (unlikely(err < 0)) ++ goto out_dentry; ++ if (au_ftest_icpup(a->flags, DID_CPUP)) { ++ ia->ia_file = NULL; ++ ia->ia_valid &= ~ATTR_FILE; ++ } ++ ++ a->h_path.mnt = au_sbr_mnt(sb, a->btgt); ++ if ((ia->ia_valid & (ATTR_MODE | ATTR_CTIME)) ++ == (ATTR_MODE | ATTR_CTIME)) { ++ err = security_path_chmod(a->h_path.dentry, a->h_path.mnt, ++ ia->ia_mode); ++ if (unlikely(err)) ++ goto out_unlock; ++ } else if ((ia->ia_valid & (ATTR_UID | ATTR_GID)) ++ && (ia->ia_valid & ATTR_CTIME)) { ++ err = security_path_chown(&a->h_path, ia->ia_uid, ia->ia_gid); ++ if (unlikely(err)) ++ goto out_unlock; ++ } ++ ++ if (ia->ia_valid & ATTR_SIZE) { ++ struct file *f; ++ ++ if (ia->ia_size < i_size_read(inode)) ++ /* unmap only */ ++ truncate_setsize(inode, ia->ia_size); ++ ++ f = NULL; ++ if (ia->ia_valid & ATTR_FILE) ++ f = ia->ia_file; ++ mutex_unlock(&a->h_inode->i_mutex); ++ err = vfsub_trunc(&a->h_path, ia->ia_size, ia->ia_valid, f); ++ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); ++ } else ++ err = vfsub_notify_change(&a->h_path, ia); ++ if (!err) ++ au_cpup_attr_changeable(inode); ++ ++out_unlock: ++ mutex_unlock(&a->h_inode->i_mutex); ++ au_unpin(&a->pin); ++ if (unlikely(err)) ++ 
au_update_dbstart(dentry); ++out_dentry: ++ di_write_unlock(dentry); ++ if (file) { ++ fi_write_unlock(file); ++ ia->ia_file = file; ++ ia->ia_valid |= ATTR_FILE; ++ } ++out_si: ++ si_read_unlock(sb); ++out_kfree: ++ kfree(a); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static void au_refresh_iattr(struct inode *inode, struct kstat *st, ++ unsigned int nlink) ++{ ++ unsigned int n; ++ ++ inode->i_mode = st->mode; ++ inode->i_uid = st->uid; ++ inode->i_gid = st->gid; ++ inode->i_atime = st->atime; ++ inode->i_mtime = st->mtime; ++ inode->i_ctime = st->ctime; ++ ++ au_cpup_attr_nlink(inode, /*force*/0); ++ if (S_ISDIR(inode->i_mode)) { ++ n = inode->i_nlink; ++ n -= nlink; ++ n += st->nlink; ++ set_nlink(inode, n); ++ } ++ ++ spin_lock(&inode->i_lock); ++ inode->i_blocks = st->blocks; ++ i_size_write(inode, st->size); ++ spin_unlock(&inode->i_lock); ++} ++ ++static int aufs_getattr(struct vfsmount *mnt __maybe_unused, ++ struct dentry *dentry, struct kstat *st) ++{ ++ int err; ++ unsigned int mnt_flags; ++ aufs_bindex_t bindex; ++ unsigned char udba_none, positive; ++ struct super_block *sb, *h_sb; ++ struct inode *inode; ++ struct vfsmount *h_mnt; ++ struct dentry *h_dentry; ++ ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out; ++ mnt_flags = au_mntflags(sb); ++ udba_none = !!au_opt_test(mnt_flags, UDBA_NONE); ++ ++ /* support fstat(2) */ ++ if (!d_unlinked(dentry) && !udba_none) { ++ unsigned int sigen = au_sigen(sb); ++ err = au_digen_test(dentry, sigen); ++ if (!err) { ++ di_read_lock_child(dentry, AuLock_IR); ++ err = au_dbrange_test(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ } else { ++ AuDebugOn(IS_ROOT(dentry)); ++ di_write_lock_child(dentry); ++ err = au_dbrange_test(dentry); ++ if (!err) ++ err = au_reval_for_attr(dentry, sigen); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ } ++ } else ++ di_read_lock_child(dentry, AuLock_IR); ++ ++ bindex = au_ibstart(inode); ++ h_mnt = au_sbr_mnt(sb, bindex); ++ h_sb = h_mnt->mnt_sb; ++ if (!au_test_fs_bad_iattr(h_sb) && udba_none) ++ goto out_fill; /* success */ ++ ++ h_dentry = NULL; ++ if (au_dbstart(dentry) == bindex) ++ h_dentry = dget(au_h_dptr(dentry, bindex)); ++ else if (au_opt_test(mnt_flags, PLINK) && au_plink_test(inode)) { ++ h_dentry = au_plink_lkup(inode, bindex); ++ if (IS_ERR(h_dentry)) ++ goto out_fill; /* pretending success */ ++ } ++ /* illegally overlapped or something */ ++ if (unlikely(!h_dentry)) ++ goto out_fill; /* pretending success */ ++ ++ positive = !!h_dentry->d_inode; ++ if (positive) ++ err = vfs_getattr(h_mnt, h_dentry, st); ++ dput(h_dentry); ++ if (!err) { ++ if (positive) ++ au_refresh_iattr(inode, st, h_dentry->d_inode->i_nlink); ++ goto out_fill; /* success */ ++ } ++ AuTraceErr(err); ++ goto out_unlock; ++ ++out_fill: ++ generic_fillattr(inode, st); ++out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ si_read_unlock(sb); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int h_readlink(struct dentry *dentry, int bindex, char __user *buf, ++ int bufsiz) ++{ ++ int err; ++ struct super_block *sb; ++ struct dentry *h_dentry; ++ ++ err = -EINVAL; ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (unlikely(!h_dentry->d_inode->i_op->readlink)) ++ goto out; ++ ++ err = security_inode_readlink(h_dentry); ++ if (unlikely(err)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ if 
(!au_test_ro(sb, bindex, dentry->d_inode)) { ++ vfsub_touch_atime(au_sbr_mnt(sb, bindex), h_dentry); ++ fsstack_copy_attr_atime(dentry->d_inode, h_dentry->d_inode); ++ } ++ err = h_dentry->d_inode->i_op->readlink(h_dentry, buf, bufsiz); ++ ++out: ++ return err; ++} ++ ++static int aufs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) ++{ ++ int err; ++ ++ err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN); ++ if (unlikely(err)) ++ goto out; ++ err = au_d_hashed_positive(dentry); ++ if (!err) ++ err = h_readlink(dentry, au_dbstart(dentry), buf, bufsiz); ++ aufs_read_unlock(dentry, AuLock_IR); ++ ++out: ++ return err; ++} ++ ++static void *aufs_follow_link(struct dentry *dentry, struct nameidata *nd) ++{ ++ int err; ++ mm_segment_t old_fs; ++ union { ++ char *k; ++ char __user *u; ++ } buf; ++ ++ err = -ENOMEM; ++ buf.k = __getname_gfp(GFP_NOFS); ++ if (unlikely(!buf.k)) ++ goto out; ++ ++ err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN); ++ if (unlikely(err)) ++ goto out_name; ++ ++ err = au_d_hashed_positive(dentry); ++ if (!err) { ++ old_fs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = h_readlink(dentry, au_dbstart(dentry), buf.u, PATH_MAX); ++ set_fs(old_fs); ++ } ++ aufs_read_unlock(dentry, AuLock_IR); ++ ++ if (err >= 0) { ++ buf.k[err] = 0; ++ /* will be freed by put_link */ ++ nd_set_link(nd, buf.k); ++ return NULL; /* success */ ++ } ++ ++out_name: ++ __putname(buf.k); ++out: ++ path_put(&nd->path); ++ AuTraceErr(err); ++ return ERR_PTR(err); ++} ++ ++static void aufs_put_link(struct dentry *dentry __maybe_unused, ++ struct nameidata *nd, void *cookie __maybe_unused) ++{ ++ __putname(nd_get_link(nd)); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void aufs_truncate_range(struct inode *inode __maybe_unused, ++ loff_t start __maybe_unused, ++ loff_t end __maybe_unused) ++{ ++ AuUnsupport(); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct inode_operations aufs_symlink_iop = { ++ .permission = aufs_permission, ++ .setattr = aufs_setattr, ++ .getattr = aufs_getattr, ++ .readlink = aufs_readlink, ++ .follow_link = aufs_follow_link, ++ .put_link = aufs_put_link ++}; ++ ++struct inode_operations aufs_dir_iop = { ++ .create = aufs_create, ++ .lookup = aufs_lookup, ++ .link = aufs_link, ++ .unlink = aufs_unlink, ++ .symlink = aufs_symlink, ++ .mkdir = aufs_mkdir, ++ .rmdir = aufs_rmdir, ++ .mknod = aufs_mknod, ++ .rename = aufs_rename, ++ ++ .permission = aufs_permission, ++ .setattr = aufs_setattr, ++ .getattr = aufs_getattr ++}; ++ ++struct inode_operations aufs_iop = { ++ .permission = aufs_permission, ++ .setattr = aufs_setattr, ++ .getattr = aufs_getattr, ++ .truncate_range = aufs_truncate_range ++}; +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/i_op_del.c linux-3.2.0-gentoo-r1/fs/aufs/i_op_del.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/i_op_del.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/i_op_del.c 2012-01-17 12:11:24.759378661 +0100 +@@ -0,0 +1,478 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operations (del entry) ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * decide if a new whiteout for @dentry is necessary or not. ++ * when it is necessary, prepare the parent dir for the upper branch whose ++ * branch index is @bcpup for creation. the actual creation of the whiteout will ++ * be done by caller. ++ * return value: ++ * 0: wh is unnecessary ++ * plus: wh is necessary ++ * minus: error ++ */ ++int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup) ++{ ++ int need_wh, err; ++ aufs_bindex_t bstart; ++ struct super_block *sb; ++ ++ sb = dentry->d_sb; ++ bstart = au_dbstart(dentry); ++ if (*bcpup < 0) { ++ *bcpup = bstart; ++ if (au_test_ro(sb, bstart, dentry->d_inode)) { ++ err = AuWbrCopyup(au_sbi(sb), dentry); ++ *bcpup = err; ++ if (unlikely(err < 0)) ++ goto out; ++ } ++ } else ++ AuDebugOn(bstart < *bcpup ++ || au_test_ro(sb, *bcpup, dentry->d_inode)); ++ AuDbg("bcpup %d, bstart %d\n", *bcpup, bstart); ++ ++ if (*bcpup != bstart) { ++ err = au_cpup_dirs(dentry, *bcpup); ++ if (unlikely(err)) ++ goto out; ++ need_wh = 1; ++ } else { ++ struct au_dinfo *dinfo, *tmp; ++ ++ need_wh = -ENOMEM; ++ dinfo = au_di(dentry); ++ tmp = au_di_alloc(sb, AuLsc_DI_TMP); ++ if (tmp) { ++ au_di_cp(tmp, dinfo); ++ au_di_swap(tmp, dinfo); ++ /* returns the number of positive dentries */ ++ need_wh = au_lkup_dentry(dentry, bstart + 1, /*type*/0, ++ /*nd*/NULL); ++ au_di_swap(tmp, dinfo); ++ au_rw_write_unlock(&tmp->di_rwsem); ++ au_di_free(tmp); ++ } ++ } ++ AuDbg("need_wh %d\n", need_wh); ++ err = need_wh; ++ ++out: ++ return err; ++} ++ ++/* ++ * simple tests for the del-entry operations. ++ * following the checks in vfs, plus the parent-child relationship. ++ */ ++int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir) ++{ ++ int err; ++ umode_t h_mode; ++ struct dentry *h_dentry, *h_latest; ++ struct inode *h_inode; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ h_inode = h_dentry->d_inode; ++ if (dentry->d_inode) { ++ err = -ENOENT; ++ if (unlikely(!h_inode || !h_inode->i_nlink)) ++ goto out; ++ ++ h_mode = h_inode->i_mode; ++ if (!isdir) { ++ err = -EISDIR; ++ if (unlikely(S_ISDIR(h_mode))) ++ goto out; ++ } else if (unlikely(!S_ISDIR(h_mode))) { ++ err = -ENOTDIR; ++ goto out; ++ } ++ } else { ++ /* rename(2) case */ ++ err = -EIO; ++ if (unlikely(h_inode)) ++ goto out; ++ } ++ ++ err = -ENOENT; ++ /* expected parent dir is locked */ ++ if (unlikely(h_parent != h_dentry->d_parent)) ++ goto out; ++ err = 0; ++ ++ /* ++ * rmdir a dir may break the consistency on some filesystem. ++ * let's try heavy test. ++ */ ++ err = -EACCES; ++ if (unlikely(au_test_h_perm(h_parent->d_inode, MAY_EXEC | MAY_WRITE))) ++ goto out; ++ ++ h_latest = au_sio_lkup_one(&dentry->d_name, h_parent, ++ au_sbr(dentry->d_sb, bindex)); ++ err = -EIO; ++ if (IS_ERR(h_latest)) ++ goto out; ++ if (h_latest == h_dentry) ++ err = 0; ++ dput(h_latest); ++ ++out: ++ return err; ++} ++ ++/* ++ * decide the branch where we operate for @dentry. 
the branch index will be set ++ * @rbcpup. after diciding it, 'pin' it and store the timestamps of the parent ++ * dir for reverting. ++ * when a new whiteout is necessary, create it. ++ */ ++static struct dentry* ++lock_hdir_create_wh(struct dentry *dentry, int isdir, aufs_bindex_t *rbcpup, ++ struct au_dtime *dt, struct au_pin *pin) ++{ ++ struct dentry *wh_dentry; ++ struct super_block *sb; ++ struct path h_path; ++ int err, need_wh; ++ unsigned int udba; ++ aufs_bindex_t bcpup; ++ ++ need_wh = au_wr_dir_need_wh(dentry, isdir, rbcpup); ++ wh_dentry = ERR_PTR(need_wh); ++ if (unlikely(need_wh < 0)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ udba = au_opt_udba(sb); ++ bcpup = *rbcpup; ++ err = au_pin(pin, dentry, bcpup, udba, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ h_path.dentry = au_pinned_h_parent(pin); ++ if (udba != AuOpt_UDBA_NONE ++ && au_dbstart(dentry) == bcpup) { ++ err = au_may_del(dentry, bcpup, h_path.dentry, isdir); ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_unpin; ++ } ++ ++ h_path.mnt = au_sbr_mnt(sb, bcpup); ++ au_dtime_store(dt, au_pinned_parent(pin), &h_path); ++ wh_dentry = NULL; ++ if (!need_wh) ++ goto out; /* success, no need to create whiteout */ ++ ++ wh_dentry = au_wh_create(dentry, bcpup, h_path.dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_unpin; ++ ++ /* returns with the parent is locked and wh_dentry is dget-ed */ ++ goto out; /* success */ ++ ++out_unpin: ++ au_unpin(pin); ++out: ++ return wh_dentry; ++} ++ ++/* ++ * when removing a dir, rename it to a unique temporary whiteout-ed name first ++ * in order to be revertible and save time for removing many child whiteouts ++ * under the dir. ++ * returns 1 when there are too many child whiteout and caller should remove ++ * them asynchronously. returns 0 when the number of children is enough small to ++ * remove now or the branch fs is a remote fs. ++ * otherwise return an error. ++ */ ++static int renwh_and_rmdir(struct dentry *dentry, aufs_bindex_t bindex, ++ struct au_nhash *whlist, struct inode *dir) ++{ ++ int rmdir_later, err, dirwh; ++ struct dentry *h_dentry; ++ struct super_block *sb; ++ ++ sb = dentry->d_sb; ++ SiMustAnyLock(sb); ++ h_dentry = au_h_dptr(dentry, bindex); ++ err = au_whtmp_ren(h_dentry, au_sbr(sb, bindex)); ++ if (unlikely(err)) ++ goto out; ++ ++ /* stop monitoring */ ++ au_hn_free(au_hi(dentry->d_inode, bindex)); ++ ++ if (!au_test_fs_remote(h_dentry->d_sb)) { ++ dirwh = au_sbi(sb)->si_dirwh; ++ rmdir_later = (dirwh <= 1); ++ if (!rmdir_later) ++ rmdir_later = au_nhash_test_longer_wh(whlist, bindex, ++ dirwh); ++ if (rmdir_later) ++ return rmdir_later; ++ } ++ ++ err = au_whtmp_rmdir(dir, bindex, h_dentry, whlist); ++ if (unlikely(err)) { ++ AuIOErr("rmdir %.*s, b%d failed, %d. ignored\n", ++ AuDLNPair(h_dentry), bindex, err); ++ err = 0; ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ++ * final procedure for deleting a entry. ++ * maintain dentry and iattr. ++ */ ++static void epilog(struct inode *dir, struct dentry *dentry, ++ aufs_bindex_t bindex) ++{ ++ struct inode *inode; ++ ++ inode = dentry->d_inode; ++ d_drop(dentry); ++ inode->i_ctime = dir->i_ctime; ++ ++ if (au_ibstart(dir) == bindex) ++ au_cpup_attr_timesizes(dir); ++ dir->i_version++; ++} ++ ++/* ++ * when an error happened, remove the created whiteout and revert everything. 
++ */ ++static int do_revert(int err, struct inode *dir, aufs_bindex_t bindex, ++ aufs_bindex_t bwh, struct dentry *wh_dentry, ++ struct dentry *dentry, struct au_dtime *dt) ++{ ++ int rerr; ++ struct path h_path = { ++ .dentry = wh_dentry, ++ .mnt = au_sbr_mnt(dir->i_sb, bindex) ++ }; ++ ++ rerr = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, dentry); ++ if (!rerr) { ++ au_set_dbwh(dentry, bwh); ++ au_dtime_revert(dt); ++ return 0; ++ } ++ ++ AuIOErr("%.*s reverting whiteout failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ return -EIO; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int aufs_unlink(struct inode *dir, struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bwh, bindex, bstart; ++ struct au_dtime dt; ++ struct au_pin pin; ++ struct path h_path; ++ struct inode *inode, *h_dir; ++ struct dentry *parent, *wh_dentry; ++ ++ IMustLock(dir); ++ ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); ++ if (unlikely(err)) ++ goto out; ++ err = au_d_hashed_positive(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ inode = dentry->d_inode; ++ IMustLock(inode); ++ err = -EISDIR; ++ if (unlikely(S_ISDIR(inode->i_mode))) ++ goto out_unlock; /* possible? */ ++ ++ bstart = au_dbstart(dentry); ++ bwh = au_dbwh(dentry); ++ bindex = -1; ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_write_lock_parent(parent); ++ wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/0, &bindex, &dt, &pin); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart); ++ h_path.dentry = au_h_dptr(dentry, bstart); ++ dget(h_path.dentry); ++ if (bindex == bstart) { ++ h_dir = au_pinned_h_dir(&pin); ++ err = vfsub_unlink(h_dir, &h_path, /*force*/0); ++ } else { ++ /* dir inode is locked */ ++ h_dir = wh_dentry->d_parent->d_inode; ++ IMustLock(h_dir); ++ err = 0; ++ } ++ ++ if (!err) { ++ vfsub_drop_nlink(inode); ++ epilog(dir, dentry, bindex); ++ ++ /* update target timestamps */ ++ if (bindex == bstart) { ++ vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ ++ inode->i_ctime = h_path.dentry->d_inode->i_ctime; ++ } else ++ /* todo: this timestamp may be reverted later */ ++ inode->i_ctime = h_dir->i_ctime; ++ goto out_unpin; /* success */ ++ } ++ ++ /* revert */ ++ if (wh_dentry) { ++ int rerr; ++ ++ rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry, &dt); ++ if (rerr) ++ err = rerr; ++ } ++ ++out_unpin: ++ au_unpin(&pin); ++ dput(wh_dentry); ++ dput(h_path.dentry); ++out_parent: ++ di_write_unlock(parent); ++out_unlock: ++ aufs_read_unlock(dentry, AuLock_DW); ++out: ++ return err; ++} ++ ++int aufs_rmdir(struct inode *dir, struct dentry *dentry) ++{ ++ int err, rmdir_later; ++ aufs_bindex_t bwh, bindex, bstart; ++ struct au_dtime dt; ++ struct au_pin pin; ++ struct inode *inode; ++ struct dentry *parent, *wh_dentry, *h_dentry; ++ struct au_whtmp_rmdir *args; ++ ++ IMustLock(dir); ++ ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN); ++ if (unlikely(err)) ++ goto out; ++ err = au_alive_dir(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ inode = dentry->d_inode; ++ IMustLock(inode); ++ err = -ENOTDIR; ++ if (unlikely(!S_ISDIR(inode->i_mode))) ++ goto out_unlock; /* possible? 
*/ ++ ++ err = -ENOMEM; ++ args = au_whtmp_rmdir_alloc(dir->i_sb, GFP_NOFS); ++ if (unlikely(!args)) ++ goto out_unlock; ++ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_write_lock_parent(parent); ++ err = au_test_empty(dentry, &args->whlist); ++ if (unlikely(err)) ++ goto out_parent; ++ ++ bstart = au_dbstart(dentry); ++ bwh = au_dbwh(dentry); ++ bindex = -1; ++ wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/1, &bindex, &dt, &pin); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ h_dentry = au_h_dptr(dentry, bstart); ++ dget(h_dentry); ++ rmdir_later = 0; ++ if (bindex == bstart) { ++ err = renwh_and_rmdir(dentry, bstart, &args->whlist, dir); ++ if (err > 0) { ++ rmdir_later = err; ++ err = 0; ++ } ++ } else { ++ /* stop monitoring */ ++ au_hn_free(au_hi(inode, bstart)); ++ ++ /* dir inode is locked */ ++ IMustLock(wh_dentry->d_parent->d_inode); ++ err = 0; ++ } ++ ++ if (!err) { ++ vfsub_dead_dir(inode); ++ au_set_dbdiropq(dentry, -1); ++ epilog(dir, dentry, bindex); ++ ++ if (rmdir_later) { ++ au_whtmp_kick_rmdir(dir, bstart, h_dentry, args); ++ args = NULL; ++ } ++ ++ goto out_unpin; /* success */ ++ } ++ ++ /* revert */ ++ AuLabel(revert); ++ if (wh_dentry) { ++ int rerr; ++ ++ rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry, &dt); ++ if (rerr) ++ err = rerr; ++ } ++ ++out_unpin: ++ au_unpin(&pin); ++ dput(wh_dentry); ++ dput(h_dentry); ++out_parent: ++ di_write_unlock(parent); ++ if (args) ++ au_whtmp_rmdir_free(args); ++out_unlock: ++ aufs_read_unlock(dentry, AuLock_DW); ++out: ++ AuTraceErr(err); ++ return err; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/i_op_ren.c linux-3.2.0-gentoo-r1/fs/aufs/i_op_ren.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/i_op_ren.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/i_op_ren.c 2012-01-17 12:11:24.777897347 +0100 +@@ -0,0 +1,1017 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operation (rename entry) ++ * todo: this is crazy monster ++ */ ++ ++#include "aufs.h" ++ ++enum { AuSRC, AuDST, AuSrcDst }; ++enum { AuPARENT, AuCHILD, AuParentChild }; ++ ++#define AuRen_ISDIR 1 ++#define AuRen_ISSAMEDIR (1 << 1) ++#define AuRen_WHSRC (1 << 2) ++#define AuRen_WHDST (1 << 3) ++#define AuRen_MNT_WRITE (1 << 4) ++#define AuRen_DT_DSTDIR (1 << 5) ++#define AuRen_DIROPQ (1 << 6) ++#define AuRen_CPUP (1 << 7) ++#define au_ftest_ren(flags, name) ((flags) & AuRen_##name) ++#define au_fset_ren(flags, name) \ ++ do { (flags) |= AuRen_##name; } while (0) ++#define au_fclr_ren(flags, name) \ ++ do { (flags) &= ~AuRen_##name; } while (0) ++ ++struct au_ren_args { ++ struct { ++ struct dentry *dentry, *h_dentry, *parent, *h_parent, ++ *wh_dentry; ++ struct inode *dir, *inode; ++ struct au_hinode *hdir; ++ struct au_dtime dt[AuParentChild]; ++ aufs_bindex_t bstart; ++ } sd[AuSrcDst]; ++ ++#define src_dentry sd[AuSRC].dentry ++#define src_dir sd[AuSRC].dir ++#define src_inode sd[AuSRC].inode ++#define src_h_dentry sd[AuSRC].h_dentry ++#define src_parent sd[AuSRC].parent ++#define src_h_parent sd[AuSRC].h_parent ++#define src_wh_dentry sd[AuSRC].wh_dentry ++#define src_hdir sd[AuSRC].hdir ++#define src_h_dir sd[AuSRC].hdir->hi_inode ++#define src_dt sd[AuSRC].dt ++#define src_bstart sd[AuSRC].bstart ++ ++#define dst_dentry sd[AuDST].dentry ++#define dst_dir sd[AuDST].dir ++#define dst_inode sd[AuDST].inode ++#define dst_h_dentry sd[AuDST].h_dentry ++#define dst_parent sd[AuDST].parent ++#define dst_h_parent sd[AuDST].h_parent ++#define dst_wh_dentry sd[AuDST].wh_dentry ++#define dst_hdir sd[AuDST].hdir ++#define dst_h_dir sd[AuDST].hdir->hi_inode ++#define dst_dt sd[AuDST].dt ++#define dst_bstart sd[AuDST].bstart ++ ++ struct dentry *h_trap; ++ struct au_branch *br; ++ struct au_hinode *src_hinode; ++ struct path h_path; ++ struct au_nhash whlist; ++ aufs_bindex_t btgt, src_bwh, src_bdiropq; ++ ++ unsigned int flags; ++ ++ struct au_whtmp_rmdir *thargs; ++ struct dentry *h_dst; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * functions for reverting. ++ * when an error happened in a single rename systemcall, we should revert ++ * everything as if nothing happend. ++ * we don't need to revert the copied-up/down the parent dir since they are ++ * harmless. ++ */ ++ ++#define RevertFailure(fmt, ...) 
do { \ ++ AuIOErr("revert failure: " fmt " (%d, %d)\n", \ ++ ##__VA_ARGS__, err, rerr); \ ++ err = -EIO; \ ++} while (0) ++ ++static void au_ren_rev_diropq(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); ++ rerr = au_diropq_remove(a->src_dentry, a->btgt); ++ au_hn_imtx_unlock(a->src_hinode); ++ au_set_dbdiropq(a->src_dentry, a->src_bdiropq); ++ if (rerr) ++ RevertFailure("remove diropq %.*s", AuDLNPair(a->src_dentry)); ++} ++ ++static void au_ren_rev_rename(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ a->h_path.dentry = au_lkup_one(&a->src_dentry->d_name, a->src_h_parent, ++ a->br, /*nd*/NULL); ++ rerr = PTR_ERR(a->h_path.dentry); ++ if (IS_ERR(a->h_path.dentry)) { ++ RevertFailure("au_lkup_one %.*s", AuDLNPair(a->src_dentry)); ++ return; ++ } ++ ++ rerr = vfsub_rename(a->dst_h_dir, ++ au_h_dptr(a->src_dentry, a->btgt), ++ a->src_h_dir, &a->h_path); ++ d_drop(a->h_path.dentry); ++ dput(a->h_path.dentry); ++ /* au_set_h_dptr(a->src_dentry, a->btgt, NULL); */ ++ if (rerr) ++ RevertFailure("rename %.*s", AuDLNPair(a->src_dentry)); ++} ++ ++static void au_ren_rev_cpup(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ a->h_path.dentry = a->dst_h_dentry; ++ rerr = vfsub_unlink(a->dst_h_dir, &a->h_path, /*force*/0); ++ au_set_h_dptr(a->src_dentry, a->btgt, NULL); ++ au_set_dbstart(a->src_dentry, a->src_bstart); ++ if (rerr) ++ RevertFailure("unlink %.*s", AuDLNPair(a->dst_h_dentry)); ++} ++ ++static void au_ren_rev_whtmp(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ a->h_path.dentry = au_lkup_one(&a->dst_dentry->d_name, a->dst_h_parent, ++ a->br, /*nd*/NULL); ++ rerr = PTR_ERR(a->h_path.dentry); ++ if (IS_ERR(a->h_path.dentry)) { ++ RevertFailure("lookup %.*s", AuDLNPair(a->dst_dentry)); ++ return; ++ } ++ if (a->h_path.dentry->d_inode) { ++ d_drop(a->h_path.dentry); ++ dput(a->h_path.dentry); ++ return; ++ } ++ ++ rerr = vfsub_rename(a->dst_h_dir, a->h_dst, a->dst_h_dir, &a->h_path); ++ d_drop(a->h_path.dentry); ++ dput(a->h_path.dentry); ++ if (!rerr) ++ au_set_h_dptr(a->dst_dentry, a->btgt, dget(a->h_dst)); ++ else ++ RevertFailure("rename %.*s", AuDLNPair(a->h_dst)); ++} ++ ++static void au_ren_rev_whsrc(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ a->h_path.dentry = a->src_wh_dentry; ++ rerr = au_wh_unlink_dentry(a->src_h_dir, &a->h_path, a->src_dentry); ++ au_set_dbwh(a->src_dentry, a->src_bwh); ++ if (rerr) ++ RevertFailure("unlink %.*s", AuDLNPair(a->src_wh_dentry)); ++} ++#undef RevertFailure ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * when we have to copyup the renaming entry, do it with the rename-target name ++ * in order to minimize the cost (the later actual rename is unnecessary). ++ * otherwise rename it on the target branch. 
++ */ ++static int au_ren_or_cpup(struct au_ren_args *a) ++{ ++ int err; ++ struct dentry *d; ++ ++ d = a->src_dentry; ++ if (au_dbstart(d) == a->btgt) { ++ a->h_path.dentry = a->dst_h_dentry; ++ if (au_ftest_ren(a->flags, DIROPQ) ++ && au_dbdiropq(d) == a->btgt) ++ au_fclr_ren(a->flags, DIROPQ); ++ AuDebugOn(au_dbstart(d) != a->btgt); ++ err = vfsub_rename(a->src_h_dir, au_h_dptr(d, a->btgt), ++ a->dst_h_dir, &a->h_path); ++ } else { ++ struct mutex *h_mtx = &a->src_h_dentry->d_inode->i_mutex; ++ struct file *h_file; ++ ++ au_fset_ren(a->flags, CPUP); ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ au_set_dbstart(d, a->btgt); ++ au_set_h_dptr(d, a->btgt, dget(a->dst_h_dentry)); ++ h_file = au_h_open_pre(d, a->src_bstart); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ h_file = NULL; ++ } else ++ err = au_sio_cpup_single(d, a->btgt, a->src_bstart, -1, ++ !AuCpup_DTIME, a->dst_parent); ++ mutex_unlock(h_mtx); ++ au_h_open_post(d, a->src_bstart, h_file); ++ if (!err) { ++ d = a->dst_dentry; ++ au_set_h_dptr(d, a->btgt, NULL); ++ au_update_dbstart(d); ++ } else { ++ au_set_h_dptr(d, a->btgt, NULL); ++ au_set_dbstart(d, a->src_bstart); ++ } ++ } ++ if (!err && a->h_dst) ++ /* it will be set to dinfo later */ ++ dget(a->h_dst); ++ ++ return err; ++} ++ ++/* cf. aufs_rmdir() */ ++static int au_ren_del_whtmp(struct au_ren_args *a) ++{ ++ int err; ++ struct inode *dir; ++ ++ dir = a->dst_dir; ++ SiMustAnyLock(dir->i_sb); ++ if (!au_nhash_test_longer_wh(&a->whlist, a->btgt, ++ au_sbi(dir->i_sb)->si_dirwh) ++ || au_test_fs_remote(a->h_dst->d_sb)) { ++ err = au_whtmp_rmdir(dir, a->btgt, a->h_dst, &a->whlist); ++ if (unlikely(err)) ++ pr_warning("failed removing whtmp dir %.*s (%d), " ++ "ignored.\n", AuDLNPair(a->h_dst), err); ++ } else { ++ au_nhash_wh_free(&a->thargs->whlist); ++ a->thargs->whlist = a->whlist; ++ a->whlist.nh_num = 0; ++ au_whtmp_kick_rmdir(dir, a->btgt, a->h_dst, a->thargs); ++ dput(a->h_dst); ++ a->thargs = NULL; ++ } ++ ++ return 0; ++} ++ ++/* make it 'opaque' dir. 
*/ ++static int au_ren_diropq(struct au_ren_args *a) ++{ ++ int err; ++ struct dentry *diropq; ++ ++ err = 0; ++ a->src_bdiropq = au_dbdiropq(a->src_dentry); ++ a->src_hinode = au_hi(a->src_inode, a->btgt); ++ au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); ++ diropq = au_diropq_create(a->src_dentry, a->btgt); ++ au_hn_imtx_unlock(a->src_hinode); ++ if (IS_ERR(diropq)) ++ err = PTR_ERR(diropq); ++ dput(diropq); ++ ++ return err; ++} ++ ++static int do_rename(struct au_ren_args *a) ++{ ++ int err; ++ struct dentry *d, *h_d; ++ ++ /* prepare workqueue args for asynchronous rmdir */ ++ h_d = a->dst_h_dentry; ++ if (au_ftest_ren(a->flags, ISDIR) && h_d->d_inode) { ++ err = -ENOMEM; ++ a->thargs = au_whtmp_rmdir_alloc(a->src_dentry->d_sb, GFP_NOFS); ++ if (unlikely(!a->thargs)) ++ goto out; ++ a->h_dst = dget(h_d); ++ } ++ ++ /* create whiteout for src_dentry */ ++ if (au_ftest_ren(a->flags, WHSRC)) { ++ a->src_bwh = au_dbwh(a->src_dentry); ++ AuDebugOn(a->src_bwh >= 0); ++ a->src_wh_dentry ++ = au_wh_create(a->src_dentry, a->btgt, a->src_h_parent); ++ err = PTR_ERR(a->src_wh_dentry); ++ if (IS_ERR(a->src_wh_dentry)) ++ goto out_thargs; ++ } ++ ++ /* lookup whiteout for dentry */ ++ if (au_ftest_ren(a->flags, WHDST)) { ++ h_d = au_wh_lkup(a->dst_h_parent, &a->dst_dentry->d_name, ++ a->br); ++ err = PTR_ERR(h_d); ++ if (IS_ERR(h_d)) ++ goto out_whsrc; ++ if (!h_d->d_inode) ++ dput(h_d); ++ else ++ a->dst_wh_dentry = h_d; ++ } ++ ++ /* rename dentry to tmpwh */ ++ if (a->thargs) { ++ err = au_whtmp_ren(a->dst_h_dentry, a->br); ++ if (unlikely(err)) ++ goto out_whdst; ++ ++ d = a->dst_dentry; ++ au_set_h_dptr(d, a->btgt, NULL); ++ err = au_lkup_neg(d, a->btgt); ++ if (unlikely(err)) ++ goto out_whtmp; ++ a->dst_h_dentry = au_h_dptr(d, a->btgt); ++ } ++ ++ /* cpup src */ ++ if (a->dst_h_dentry->d_inode && a->src_bstart != a->btgt) { ++ struct mutex *h_mtx = &a->src_h_dentry->d_inode->i_mutex; ++ struct file *h_file; ++ ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ AuDebugOn(au_dbstart(a->src_dentry) != a->src_bstart); ++ h_file = au_h_open_pre(a->src_dentry, a->src_bstart); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ h_file = NULL; ++ } else ++ err = au_sio_cpup_simple(a->src_dentry, a->btgt, -1, ++ !AuCpup_DTIME); ++ mutex_unlock(h_mtx); ++ au_h_open_post(a->src_dentry, a->src_bstart, h_file); ++ if (unlikely(err)) ++ goto out_whtmp; ++ } ++ ++ /* rename by vfs_rename or cpup */ ++ d = a->dst_dentry; ++ if (au_ftest_ren(a->flags, ISDIR) ++ && (a->dst_wh_dentry ++ || au_dbdiropq(d) == a->btgt ++ /* hide the lower to keep xino */ ++ || a->btgt < au_dbend(d) ++ || au_opt_test(au_mntflags(d->d_sb), ALWAYS_DIROPQ))) ++ au_fset_ren(a->flags, DIROPQ); ++ err = au_ren_or_cpup(a); ++ if (unlikely(err)) ++ /* leave the copied-up one */ ++ goto out_whtmp; ++ ++ /* make dir opaque */ ++ if (au_ftest_ren(a->flags, DIROPQ)) { ++ err = au_ren_diropq(a); ++ if (unlikely(err)) ++ goto out_rename; ++ } ++ ++ /* update target timestamps */ ++ AuDebugOn(au_dbstart(a->src_dentry) != a->btgt); ++ a->h_path.dentry = au_h_dptr(a->src_dentry, a->btgt); ++ vfsub_update_h_iattr(&a->h_path, /*did*/NULL); /*ignore*/ ++ a->src_inode->i_ctime = a->h_path.dentry->d_inode->i_ctime; ++ ++ /* remove whiteout for dentry */ ++ if (a->dst_wh_dentry) { ++ a->h_path.dentry = a->dst_wh_dentry; ++ err = au_wh_unlink_dentry(a->dst_h_dir, &a->h_path, ++ a->dst_dentry); ++ if (unlikely(err)) ++ goto out_diropq; ++ } ++ ++ /* remove whtmp */ ++ if (a->thargs) ++ au_ren_del_whtmp(a); /* ignore this error */ ++ ++ err = 0; ++ 
goto out_success; ++ ++out_diropq: ++ if (au_ftest_ren(a->flags, DIROPQ)) ++ au_ren_rev_diropq(err, a); ++out_rename: ++ if (!au_ftest_ren(a->flags, CPUP)) ++ au_ren_rev_rename(err, a); ++ else ++ au_ren_rev_cpup(err, a); ++ dput(a->h_dst); ++out_whtmp: ++ if (a->thargs) ++ au_ren_rev_whtmp(err, a); ++out_whdst: ++ dput(a->dst_wh_dentry); ++ a->dst_wh_dentry = NULL; ++out_whsrc: ++ if (a->src_wh_dentry) ++ au_ren_rev_whsrc(err, a); ++out_success: ++ dput(a->src_wh_dentry); ++ dput(a->dst_wh_dentry); ++out_thargs: ++ if (a->thargs) { ++ dput(a->h_dst); ++ au_whtmp_rmdir_free(a->thargs); ++ a->thargs = NULL; ++ } ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * test if @dentry dir can be rename destination or not. ++ * success means, it is a logically empty dir. ++ */ ++static int may_rename_dstdir(struct dentry *dentry, struct au_nhash *whlist) ++{ ++ return au_test_empty(dentry, whlist); ++} ++ ++/* ++ * test if @dentry dir can be rename source or not. ++ * if it can, return 0 and @children is filled. ++ * success means, ++ * - it is a logically empty dir. ++ * - or, it exists on writable branch and has no children including whiteouts ++ * on the lower branch. ++ */ ++static int may_rename_srcdir(struct dentry *dentry, aufs_bindex_t btgt) ++{ ++ int err; ++ unsigned int rdhash; ++ aufs_bindex_t bstart; ++ ++ bstart = au_dbstart(dentry); ++ if (bstart != btgt) { ++ struct au_nhash whlist; ++ ++ SiMustAnyLock(dentry->d_sb); ++ rdhash = au_sbi(dentry->d_sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, ++ dentry)); ++ err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_test_empty(dentry, &whlist); ++ au_nhash_wh_free(&whlist); ++ goto out; ++ } ++ ++ if (bstart == au_dbtaildir(dentry)) ++ return 0; /* success */ ++ ++ err = au_test_empty_lower(dentry); ++ ++out: ++ if (err == -ENOTEMPTY) { ++ AuWarn1("renaming dir who has child(ren) on multiple branches," ++ " is not supported\n"); ++ err = -EXDEV; ++ } ++ return err; ++} ++ ++/* side effect: sets whlist and h_dentry */ ++static int au_ren_may_dir(struct au_ren_args *a) ++{ ++ int err; ++ unsigned int rdhash; ++ struct dentry *d; ++ ++ d = a->dst_dentry; ++ SiMustAnyLock(d->d_sb); ++ ++ err = 0; ++ if (au_ftest_ren(a->flags, ISDIR) && a->dst_inode) { ++ rdhash = au_sbi(d->d_sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, d)); ++ err = au_nhash_alloc(&a->whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ ++ au_set_dbstart(d, a->dst_bstart); ++ err = may_rename_dstdir(d, &a->whlist); ++ au_set_dbstart(d, a->btgt); ++ } ++ a->dst_h_dentry = au_h_dptr(d, au_dbstart(d)); ++ if (unlikely(err)) ++ goto out; ++ ++ d = a->src_dentry; ++ a->src_h_dentry = au_h_dptr(d, au_dbstart(d)); ++ if (au_ftest_ren(a->flags, ISDIR)) { ++ err = may_rename_srcdir(d, a->btgt); ++ if (unlikely(err)) { ++ au_nhash_wh_free(&a->whlist); ++ a->whlist.nh_num = 0; ++ } ++ } ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * simple tests for rename. ++ * following the checks in vfs, plus the parent-child relationship. 
++ */ ++static int au_may_ren(struct au_ren_args *a) ++{ ++ int err, isdir; ++ struct inode *h_inode; ++ ++ if (a->src_bstart == a->btgt) { ++ err = au_may_del(a->src_dentry, a->btgt, a->src_h_parent, ++ au_ftest_ren(a->flags, ISDIR)); ++ if (unlikely(err)) ++ goto out; ++ err = -EINVAL; ++ if (unlikely(a->src_h_dentry == a->h_trap)) ++ goto out; ++ } ++ ++ err = 0; ++ if (a->dst_bstart != a->btgt) ++ goto out; ++ ++ err = -ENOTEMPTY; ++ if (unlikely(a->dst_h_dentry == a->h_trap)) ++ goto out; ++ ++ err = -EIO; ++ h_inode = a->dst_h_dentry->d_inode; ++ isdir = !!au_ftest_ren(a->flags, ISDIR); ++ if (!a->dst_dentry->d_inode) { ++ if (unlikely(h_inode)) ++ goto out; ++ err = au_may_add(a->dst_dentry, a->btgt, a->dst_h_parent, ++ isdir); ++ } else { ++ if (unlikely(!h_inode || !h_inode->i_nlink)) ++ goto out; ++ err = au_may_del(a->dst_dentry, a->btgt, a->dst_h_parent, ++ isdir); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++out: ++ if (unlikely(err == -ENOENT || err == -EEXIST)) ++ err = -EIO; ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * locking order ++ * (VFS) ++ * - src_dir and dir by lock_rename() ++ * - inode if exitsts ++ * (aufs) ++ * - lock all ++ * + src_dentry and dentry by aufs_read_and_write_lock2() which calls, ++ * + si_read_lock ++ * + di_write_lock2_child() ++ * + di_write_lock_child() ++ * + ii_write_lock_child() ++ * + di_write_lock_child2() ++ * + ii_write_lock_child2() ++ * + src_parent and parent ++ * + di_write_lock_parent() ++ * + ii_write_lock_parent() ++ * + di_write_lock_parent2() ++ * + ii_write_lock_parent2() ++ * + lower src_dir and dir by vfsub_lock_rename() ++ * + verify the every relationships between child and parent. if any ++ * of them failed, unlock all and return -EBUSY. 
++ */ ++static void au_ren_unlock(struct au_ren_args *a) ++{ ++ struct super_block *sb; ++ ++ sb = a->dst_dentry->d_sb; ++ if (au_ftest_ren(a->flags, MNT_WRITE)) ++ mnt_drop_write(a->br->br_mnt); ++ vfsub_unlock_rename(a->src_h_parent, a->src_hdir, ++ a->dst_h_parent, a->dst_hdir); ++} ++ ++static int au_ren_lock(struct au_ren_args *a) ++{ ++ int err; ++ unsigned int udba; ++ ++ err = 0; ++ a->src_h_parent = au_h_dptr(a->src_parent, a->btgt); ++ a->src_hdir = au_hi(a->src_dir, a->btgt); ++ a->dst_h_parent = au_h_dptr(a->dst_parent, a->btgt); ++ a->dst_hdir = au_hi(a->dst_dir, a->btgt); ++ a->h_trap = vfsub_lock_rename(a->src_h_parent, a->src_hdir, ++ a->dst_h_parent, a->dst_hdir); ++ udba = au_opt_udba(a->src_dentry->d_sb); ++ if (unlikely(a->src_hdir->hi_inode != a->src_h_parent->d_inode ++ || a->dst_hdir->hi_inode != a->dst_h_parent->d_inode)) ++ err = au_busy_or_stale(); ++ if (!err && au_dbstart(a->src_dentry) == a->btgt) ++ err = au_h_verify(a->src_h_dentry, udba, ++ a->src_h_parent->d_inode, a->src_h_parent, ++ a->br); ++ if (!err && au_dbstart(a->dst_dentry) == a->btgt) ++ err = au_h_verify(a->dst_h_dentry, udba, ++ a->dst_h_parent->d_inode, a->dst_h_parent, ++ a->br); ++ if (!err) { ++ err = mnt_want_write(a->br->br_mnt); ++ if (unlikely(err)) ++ goto out_unlock; ++ au_fset_ren(a->flags, MNT_WRITE); ++ goto out; /* success */ ++ } ++ ++ err = au_busy_or_stale(); ++ ++out_unlock: ++ au_ren_unlock(a); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_ren_refresh_dir(struct au_ren_args *a) ++{ ++ struct inode *dir; ++ ++ dir = a->dst_dir; ++ dir->i_version++; ++ if (au_ftest_ren(a->flags, ISDIR)) { ++ /* is this updating defined in POSIX? */ ++ au_cpup_attr_timesizes(a->src_inode); ++ au_cpup_attr_nlink(dir, /*force*/1); ++ } ++ ++ if (au_ibstart(dir) == a->btgt) ++ au_cpup_attr_timesizes(dir); ++ ++ if (au_ftest_ren(a->flags, ISSAMEDIR)) ++ return; ++ ++ dir = a->src_dir; ++ dir->i_version++; ++ if (au_ftest_ren(a->flags, ISDIR)) ++ au_cpup_attr_nlink(dir, /*force*/1); ++ if (au_ibstart(dir) == a->btgt) ++ au_cpup_attr_timesizes(dir); ++} ++ ++static void au_ren_refresh(struct au_ren_args *a) ++{ ++ aufs_bindex_t bend, bindex; ++ struct dentry *d, *h_d; ++ struct inode *i, *h_i; ++ struct super_block *sb; ++ ++ d = a->dst_dentry; ++ d_drop(d); ++ if (a->h_dst) ++ /* already dget-ed by au_ren_or_cpup() */ ++ au_set_h_dptr(d, a->btgt, a->h_dst); ++ ++ i = a->dst_inode; ++ if (i) { ++ if (!au_ftest_ren(a->flags, ISDIR)) ++ vfsub_drop_nlink(i); ++ else { ++ vfsub_dead_dir(i); ++ au_cpup_attr_timesizes(i); ++ } ++ au_update_dbrange(d, /*do_put_zero*/1); ++ } else { ++ bend = a->btgt; ++ for (bindex = au_dbstart(d); bindex < bend; bindex++) ++ au_set_h_dptr(d, bindex, NULL); ++ bend = au_dbend(d); ++ for (bindex = a->btgt + 1; bindex <= bend; bindex++) ++ au_set_h_dptr(d, bindex, NULL); ++ au_update_dbrange(d, /*do_put_zero*/0); ++ } ++ ++ d = a->src_dentry; ++ au_set_dbwh(d, -1); ++ bend = au_dbend(d); ++ for (bindex = a->btgt + 1; bindex <= bend; bindex++) { ++ h_d = au_h_dptr(d, bindex); ++ if (h_d) ++ au_set_h_dptr(d, bindex, NULL); ++ } ++ au_set_dbend(d, a->btgt); ++ ++ sb = d->d_sb; ++ i = a->src_inode; ++ if (au_opt_test(au_mntflags(sb), PLINK) && au_plink_test(i)) ++ return; /* success */ ++ ++ bend = au_ibend(i); ++ for (bindex = a->btgt + 1; bindex <= bend; bindex++) { ++ h_i = au_h_iptr(i, bindex); ++ if (h_i) { ++ au_xino_write(sb, bindex, h_i->i_ino, /*ino*/0); ++ /* ignore this error */ ++ 
au_set_h_iptr(i, bindex, NULL, 0); ++ } ++ } ++ au_set_ibend(i, a->btgt); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* mainly for link(2) and rename(2) */ ++int au_wbr(struct dentry *dentry, aufs_bindex_t btgt) ++{ ++ aufs_bindex_t bdiropq, bwh; ++ struct dentry *parent; ++ struct au_branch *br; ++ ++ parent = dentry->d_parent; ++ IMustLock(parent->d_inode); /* dir is locked */ ++ ++ bdiropq = au_dbdiropq(parent); ++ bwh = au_dbwh(dentry); ++ br = au_sbr(dentry->d_sb, btgt); ++ if (au_br_rdonly(br) ++ || (0 <= bdiropq && bdiropq < btgt) ++ || (0 <= bwh && bwh < btgt)) ++ btgt = -1; ++ ++ AuDbg("btgt %d\n", btgt); ++ return btgt; ++} ++ ++/* sets src_bstart, dst_bstart and btgt */ ++static int au_ren_wbr(struct au_ren_args *a) ++{ ++ int err; ++ struct au_wr_dir_args wr_dir_args = { ++ /* .force_btgt = -1, */ ++ .flags = AuWrDir_ADD_ENTRY ++ }; ++ ++ a->src_bstart = au_dbstart(a->src_dentry); ++ a->dst_bstart = au_dbstart(a->dst_dentry); ++ if (au_ftest_ren(a->flags, ISDIR)) ++ au_fset_wrdir(wr_dir_args.flags, ISDIR); ++ wr_dir_args.force_btgt = a->src_bstart; ++ if (a->dst_inode && a->dst_bstart < a->src_bstart) ++ wr_dir_args.force_btgt = a->dst_bstart; ++ wr_dir_args.force_btgt = au_wbr(a->dst_dentry, wr_dir_args.force_btgt); ++ err = au_wr_dir(a->dst_dentry, a->src_dentry, &wr_dir_args); ++ a->btgt = err; ++ ++ return err; ++} ++ ++static void au_ren_dt(struct au_ren_args *a) ++{ ++ a->h_path.dentry = a->src_h_parent; ++ au_dtime_store(a->src_dt + AuPARENT, a->src_parent, &a->h_path); ++ if (!au_ftest_ren(a->flags, ISSAMEDIR)) { ++ a->h_path.dentry = a->dst_h_parent; ++ au_dtime_store(a->dst_dt + AuPARENT, a->dst_parent, &a->h_path); ++ } ++ ++ au_fclr_ren(a->flags, DT_DSTDIR); ++ if (!au_ftest_ren(a->flags, ISDIR)) ++ return; ++ ++ a->h_path.dentry = a->src_h_dentry; ++ au_dtime_store(a->src_dt + AuCHILD, a->src_dentry, &a->h_path); ++ if (a->dst_h_dentry->d_inode) { ++ au_fset_ren(a->flags, DT_DSTDIR); ++ a->h_path.dentry = a->dst_h_dentry; ++ au_dtime_store(a->dst_dt + AuCHILD, a->dst_dentry, &a->h_path); ++ } ++} ++ ++static void au_ren_rev_dt(int err, struct au_ren_args *a) ++{ ++ struct dentry *h_d; ++ struct mutex *h_mtx; ++ ++ au_dtime_revert(a->src_dt + AuPARENT); ++ if (!au_ftest_ren(a->flags, ISSAMEDIR)) ++ au_dtime_revert(a->dst_dt + AuPARENT); ++ ++ if (au_ftest_ren(a->flags, ISDIR) && err != -EIO) { ++ h_d = a->src_dt[AuCHILD].dt_h_path.dentry; ++ h_mtx = &h_d->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ au_dtime_revert(a->src_dt + AuCHILD); ++ mutex_unlock(h_mtx); ++ ++ if (au_ftest_ren(a->flags, DT_DSTDIR)) { ++ h_d = a->dst_dt[AuCHILD].dt_h_path.dentry; ++ h_mtx = &h_d->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ au_dtime_revert(a->dst_dt + AuCHILD); ++ mutex_unlock(h_mtx); ++ } ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int aufs_rename(struct inode *_src_dir, struct dentry *_src_dentry, ++ struct inode *_dst_dir, struct dentry *_dst_dentry) ++{ ++ int err, flags; ++ /* reduce stack space */ ++ struct au_ren_args *a; ++ ++ AuDbg("%.*s, %.*s\n", AuDLNPair(_src_dentry), AuDLNPair(_dst_dentry)); ++ IMustLock(_src_dir); ++ IMustLock(_dst_dir); ++ ++ err = -ENOMEM; ++ BUILD_BUG_ON(sizeof(*a) > PAGE_SIZE); ++ a = kzalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ a->src_dir = _src_dir; ++ a->src_dentry = _src_dentry; ++ a->src_inode = a->src_dentry->d_inode; ++ a->src_parent = a->src_dentry->d_parent; /* dir inode 
is locked */ ++ a->dst_dir = _dst_dir; ++ a->dst_dentry = _dst_dentry; ++ a->dst_inode = a->dst_dentry->d_inode; ++ a->dst_parent = a->dst_dentry->d_parent; /* dir inode is locked */ ++ if (a->dst_inode) { ++ IMustLock(a->dst_inode); ++ au_igrab(a->dst_inode); ++ } ++ ++ err = -ENOTDIR; ++ flags = AuLock_FLUSH | AuLock_NOPLM | AuLock_GEN; ++ if (S_ISDIR(a->src_inode->i_mode)) { ++ au_fset_ren(a->flags, ISDIR); ++ if (unlikely(a->dst_inode && !S_ISDIR(a->dst_inode->i_mode))) ++ goto out_free; ++ err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, ++ AuLock_DIR | flags); ++ } else ++ err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, ++ flags); ++ if (unlikely(err)) ++ goto out_free; ++ ++ err = au_d_hashed_positive(a->src_dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ err = -ENOENT; ++ if (a->dst_inode) { ++ /* ++ * If it is a dir, VFS unhash dst_dentry before this ++ * function. It means we cannot rely upon d_unhashed(). ++ */ ++ if (unlikely(!a->dst_inode->i_nlink)) ++ goto out_unlock; ++ if (!S_ISDIR(a->dst_inode->i_mode)) { ++ err = au_d_hashed_positive(a->dst_dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ } else if (unlikely(IS_DEADDIR(a->dst_inode))) ++ goto out_unlock; ++ } else if (unlikely(d_unhashed(a->dst_dentry))) ++ goto out_unlock; ++ ++ au_fset_ren(a->flags, ISSAMEDIR); /* temporary */ ++ di_write_lock_parent(a->dst_parent); ++ ++ /* which branch we process */ ++ err = au_ren_wbr(a); ++ if (unlikely(err < 0)) ++ goto out_parent; ++ a->br = au_sbr(a->dst_dentry->d_sb, a->btgt); ++ a->h_path.mnt = a->br->br_mnt; ++ ++ /* are they available to be renamed */ ++ err = au_ren_may_dir(a); ++ if (unlikely(err)) ++ goto out_children; ++ ++ /* prepare the writable parent dir on the same branch */ ++ if (a->dst_bstart == a->btgt) { ++ au_fset_ren(a->flags, WHDST); ++ } else { ++ err = au_cpup_dirs(a->dst_dentry, a->btgt); ++ if (unlikely(err)) ++ goto out_children; ++ } ++ ++ if (a->src_dir != a->dst_dir) { ++ /* ++ * this temporary unlock is safe, ++ * because both dir->i_mutex are locked. 
++ */ ++ di_write_unlock(a->dst_parent); ++ di_write_lock_parent(a->src_parent); ++ err = au_wr_dir_need_wh(a->src_dentry, ++ au_ftest_ren(a->flags, ISDIR), ++ &a->btgt); ++ di_write_unlock(a->src_parent); ++ di_write_lock2_parent(a->src_parent, a->dst_parent, /*isdir*/1); ++ au_fclr_ren(a->flags, ISSAMEDIR); ++ } else ++ err = au_wr_dir_need_wh(a->src_dentry, ++ au_ftest_ren(a->flags, ISDIR), ++ &a->btgt); ++ if (unlikely(err < 0)) ++ goto out_children; ++ if (err) ++ au_fset_ren(a->flags, WHSRC); ++ ++ /* lock them all */ ++ err = au_ren_lock(a); ++ if (unlikely(err)) ++ goto out_children; ++ ++ if (!au_opt_test(au_mntflags(a->dst_dir->i_sb), UDBA_NONE)) ++ err = au_may_ren(a); ++ else if (unlikely(a->dst_dentry->d_name.len > AUFS_MAX_NAMELEN)) ++ err = -ENAMETOOLONG; ++ if (unlikely(err)) ++ goto out_hdir; ++ ++ /* store timestamps to be revertible */ ++ au_ren_dt(a); ++ ++ /* here we go */ ++ err = do_rename(a); ++ if (unlikely(err)) ++ goto out_dt; ++ ++ /* update dir attributes */ ++ au_ren_refresh_dir(a); ++ ++ /* dput/iput all lower dentries */ ++ au_ren_refresh(a); ++ ++ goto out_hdir; /* success */ ++ ++out_dt: ++ au_ren_rev_dt(err, a); ++out_hdir: ++ au_ren_unlock(a); ++out_children: ++ au_nhash_wh_free(&a->whlist); ++ if (err && a->dst_inode && a->dst_bstart != a->btgt) { ++ AuDbg("bstart %d, btgt %d\n", a->dst_bstart, a->btgt); ++ au_set_h_dptr(a->dst_dentry, a->btgt, NULL); ++ au_set_dbstart(a->dst_dentry, a->dst_bstart); ++ } ++out_parent: ++ if (!err) ++ d_move(a->src_dentry, a->dst_dentry); ++ else { ++ au_update_dbstart(a->dst_dentry); ++ if (!a->dst_inode) ++ d_drop(a->dst_dentry); ++ } ++ if (au_ftest_ren(a->flags, ISSAMEDIR)) ++ di_write_unlock(a->dst_parent); ++ else ++ di_write_unlock2(a->src_parent, a->dst_parent); ++out_unlock: ++ aufs_read_and_write_unlock2(a->dst_dentry, a->src_dentry); ++out_free: ++ iput(a->dst_inode); ++ if (a->thargs) ++ au_whtmp_rmdir_free(a->thargs); ++ kfree(a); ++out: ++ AuTraceErr(err); ++ return err; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/Kconfig linux-3.2.0-gentoo-r1/fs/aufs/Kconfig +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/Kconfig 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/Kconfig 2012-01-17 12:11:24.463079695 +0100 +@@ -0,0 +1,203 @@ ++config AUFS_FS ++ tristate "Aufs (Advanced multi layered unification filesystem) support" ++ depends on EXPERIMENTAL ++ help ++ Aufs is a stackable unification filesystem such as Unionfs, ++ which unifies several directories and provides a merged single ++ directory. ++ In the early days, aufs was entirely re-designed and ++ re-implemented Unionfs Version 1.x series. Introducing many ++ original ideas, approaches and improvements, it becomes totally ++ different from Unionfs while keeping the basic features. ++ ++if AUFS_FS ++choice ++ prompt "Maximum number of branches" ++ default AUFS_BRANCH_MAX_127 ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact to performance. ++config AUFS_BRANCH_MAX_127 ++ bool "127" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact to performance. ++config AUFS_BRANCH_MAX_511 ++ bool "511" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact to performance. 
++config AUFS_BRANCH_MAX_1023 ++ bool "1023" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact to performance. ++config AUFS_BRANCH_MAX_32767 ++ bool "32767" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact to performance. ++endchoice ++ ++config AUFS_SBILIST ++ bool ++ depends on AUFS_MAGIC_SYSRQ || PROC_FS ++ default y ++ help ++ Automatic configuration for internal use. ++ When aufs supports Magic SysRq or /proc, enabled automatically. ++ ++config AUFS_HNOTIFY ++ bool "Detect direct branch access (bypassing aufs)" ++ help ++ If you want to modify files on branches directly, eg. bypassing aufs, ++ and want aufs to detect the changes of them fully, then enable this ++ option and use 'udba=notify' mount option. ++ Currently there is only one available configuration, "fsnotify". ++ It will have a negative impact to the performance. ++ See detail in aufs.5. ++ ++choice ++ prompt "method" if AUFS_HNOTIFY ++ default AUFS_HFSNOTIFY ++config AUFS_HFSNOTIFY ++ bool "fsnotify" ++ select FSNOTIFY ++endchoice ++ ++config AUFS_EXPORT ++ bool "NFS-exportable aufs" ++ depends on EXPORTFS ++ help ++ If you want to export your mounted aufs via NFS, then enable this ++ option. There are several requirements for this configuration. ++ See detail in aufs.5. ++ ++config AUFS_INO_T_64 ++ bool ++ depends on AUFS_EXPORT ++ depends on 64BIT && !(ALPHA || S390) ++ default y ++ help ++ Automatic configuration for internal use. ++ /* typedef unsigned long/int __kernel_ino_t */ ++ /* alpha and s390x are int */ ++ ++config AUFS_RDU ++ bool "Readdir in userspace" ++ help ++ Aufs has two methods to provide a merged view for a directory, ++ by a user-space library and by kernel-space natively. The latter ++ is always enabled but sometimes large and slow. ++ If you enable this option, install the library in aufs2-util ++ package, and set some environment variables for your readdir(3), ++ then the work will be handled in user-space which generally ++ shows better performance in most cases. ++ See detail in aufs.5. ++ ++config AUFS_PROC_MAP ++ bool "support for /proc/maps and lsof(1)" ++ depends on PROC_FS ++ help ++ When you issue mmap(2) in aufs, it is actually a direct mmap(2) ++ call to the file on the branch fs since the file in aufs is ++ purely virtual. And the file path printed in /proc/maps (and ++ others) will be the path on the branch fs. In most cases, it ++ does no harm. But some utilities like lsof(1) may confuse since ++ the utility or user may expect the file path in aufs to be ++ printed. ++ To address this issue, aufs provides a patch which introduces a ++ new member called vm_prfile into struct vm_are_struct. The patch ++ is meaningless without enabling this configuration since nobody ++ sets the new vm_prfile member. ++ If you don't apply the patch, then enabling this configuration ++ will cause a compile error. ++ This approach is fragile since if someone else make some changes ++ around vm_file, then vm_prfile may not work anymore. As a ++ workaround such case, aufs provides this configuration. If you ++ disable it, then lsof(1) may produce incorrect result but the ++ problem will be gone even if the aufs patch is applied (I hope). 
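For reference, a minimal sketch of how the options defined above might be selected and exercised together; the branch paths and the exact option mix are illustrative assumptions, not part of the patch itself:

  # .config fragment: build aufs as a module with hnotify/fsnotify support
  CONFIG_AUFS_FS=m
  CONFIG_AUFS_BRANCH_MAX_127=y
  CONFIG_AUFS_HNOTIFY=y
  CONFIG_AUFS_HFSNOTIFY=y
  CONFIG_AUFS_RDU=y

  # union a writable branch over a read-only one, enabling 'udba=notify'
  # (the mount option the AUFS_HNOTIFY help text refers to)
  mount -t aufs -o br=/data/rw=rw:/data/ro=ro,udba=notify none /mnt/union
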
++ ++config AUFS_SP_IATTR ++ bool "Respect the attributes (mtime/ctime mainly) of special files" ++ help ++ When you write something to a special file, some attributes of it ++ (mtime/ctime mainly) may be updated. Generally such updates are ++ less important (actually some device drivers and NFS ignore ++ it). But some applications (such like test program) requires ++ such updates. If you need these updates, then enable this ++ configuration which introduces some overhead. ++ Currently this configuration handles FIFO only. ++ ++config AUFS_SHWH ++ bool "Show whiteouts" ++ help ++ If you want to make the whiteouts in aufs visible, then enable ++ this option and specify 'shwh' mount option. Although it may ++ sounds like philosophy or something, but in technically it ++ simply shows the name of whiteout with keeping its behaviour. ++ ++config AUFS_BR_RAMFS ++ bool "Ramfs (initramfs/rootfs) as an aufs branch" ++ help ++ If you want to use ramfs as an aufs branch fs, then enable this ++ option. Generally tmpfs is recommended. ++ Aufs prohibited them to be a branch fs by default, because ++ initramfs becomes unusable after switch_root or something ++ generally. If you sets initramfs as an aufs branch and boot your ++ system by switch_root, you will meet a problem easily since the ++ files in initramfs may be inaccessible. ++ Unless you are going to use ramfs as an aufs branch fs without ++ switch_root or something, leave it N. ++ ++config AUFS_BR_FUSE ++ bool "Fuse fs as an aufs branch" ++ depends on FUSE_FS ++ select AUFS_POLL ++ help ++ If you want to use fuse-based userspace filesystem as an aufs ++ branch fs, then enable this option. ++ It implements the internal poll(2) operation which is ++ implemented by fuse only (curretnly). ++ ++config AUFS_POLL ++ bool ++ help ++ Automatic configuration for internal use. ++ ++config AUFS_BR_HFSPLUS ++ bool "Hfsplus as an aufs branch" ++ depends on HFSPLUS_FS ++ default y ++ help ++ If you want to use hfsplus fs as an aufs branch fs, then enable ++ this option. This option introduces a small overhead at ++ copying-up a file on hfsplus. ++ ++config AUFS_BDEV_LOOP ++ bool ++ depends on BLK_DEV_LOOP ++ default y ++ help ++ Automatic configuration for internal use. ++ Convert =[ym] into =y. ++ ++config AUFS_DEBUG ++ bool "Debug aufs" ++ help ++ Enable this to compile aufs internal debug code. ++ It will have a negative impact to the performance. ++ ++config AUFS_MAGIC_SYSRQ ++ bool ++ depends on AUFS_DEBUG && MAGIC_SYSRQ ++ default y ++ help ++ Automatic configuration for internal use. ++ When aufs supports Magic SysRq, enabled automatically. ++endif +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/loop.c linux-3.2.0-gentoo-r1/fs/aufs/loop.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/loop.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/loop.c 2012-01-17 12:11:24.826508897 +0100 +@@ -0,0 +1,133 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * support for loopback block device as a branch ++ */ ++ ++#include ++#include "aufs.h" ++ ++/* ++ * test if two lower dentries have overlapping branches. ++ */ ++int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding) ++{ ++ struct super_block *h_sb; ++ struct loop_device *l; ++ ++ h_sb = h_adding->d_sb; ++ if (MAJOR(h_sb->s_dev) != LOOP_MAJOR) ++ return 0; ++ ++ l = h_sb->s_bdev->bd_disk->private_data; ++ h_adding = l->lo_backing_file->f_dentry; ++ /* ++ * h_adding can be local NFS. ++ * in this case aufs cannot detect the loop. ++ */ ++ if (unlikely(h_adding->d_sb == sb)) ++ return 1; ++ return !!au_test_subdir(h_adding, sb->s_root); ++} ++ ++/* true if a kernel thread named 'loop[0-9].*' accesses a file */ ++int au_test_loopback_kthread(void) ++{ ++ int ret; ++ struct task_struct *tsk = current; ++ ++ ret = 0; ++ if (tsk->flags & PF_KTHREAD) { ++ const char c = tsk->comm[4]; ++ ret = ('0' <= c && c <= '9' ++ && !strncmp(tsk->comm, "loop", 4)); ++ } ++ ++ return ret; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define au_warn_loopback_step 16 ++static int au_warn_loopback_nelem = au_warn_loopback_step; ++static unsigned long *au_warn_loopback_array; ++ ++void au_warn_loopback(struct super_block *h_sb) ++{ ++ int i, new_nelem; ++ unsigned long *a, magic; ++ static DEFINE_SPINLOCK(spin); ++ ++ magic = h_sb->s_magic; ++ spin_lock(&spin); ++ a = au_warn_loopback_array; ++ for (i = 0; i < au_warn_loopback_nelem && *a; i++) ++ if (a[i] == magic) { ++ spin_unlock(&spin); ++ return; ++ } ++ ++ /* h_sb is new to us, print it */ ++ if (i < au_warn_loopback_nelem) { ++ a[i] = magic; ++ goto pr; ++ } ++ ++ /* expand the array */ ++ new_nelem = au_warn_loopback_nelem + au_warn_loopback_step; ++ a = au_kzrealloc(au_warn_loopback_array, ++ au_warn_loopback_nelem * sizeof(unsigned long), ++ new_nelem * sizeof(unsigned long), GFP_ATOMIC); ++ if (a) { ++ au_warn_loopback_nelem = new_nelem; ++ au_warn_loopback_array = a; ++ a[i] = magic; ++ goto pr; ++ } ++ ++ spin_unlock(&spin); ++ AuWarn1("realloc failed, ignored\n"); ++ return; ++ ++pr: ++ spin_unlock(&spin); ++ pr_warning("you may want to try another patch for loopback file " ++ "on %s(0x%lx) branch\n", au_sbtype(h_sb), magic); ++} ++ ++int au_loopback_init(void) ++{ ++ int err; ++ struct super_block *sb __maybe_unused; ++ ++ AuDebugOn(sizeof(sb->s_magic) != sizeof(unsigned long)); ++ ++ err = 0; ++ au_warn_loopback_array = kcalloc(au_warn_loopback_step, ++ sizeof(unsigned long), GFP_NOFS); ++ if (unlikely(!au_warn_loopback_array)) ++ err = -ENOMEM; ++ ++ return err; ++} ++ ++void au_loopback_fin(void) ++{ ++ kfree(au_warn_loopback_array); ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/loop.h linux-3.2.0-gentoo-r1/fs/aufs/loop.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/loop.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/loop.h 2012-01-17 12:11:24.828823733 +0100 +@@ -0,0 +1,50 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * support for loopback mount as a branch ++ */ ++ ++#ifndef __AUFS_LOOP_H__ ++#define __AUFS_LOOP_H__ ++ ++#ifdef __KERNEL__ ++ ++struct dentry; ++struct super_block; ++ ++#ifdef CONFIG_AUFS_BDEV_LOOP ++/* loop.c */ ++int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding); ++int au_test_loopback_kthread(void); ++void au_warn_loopback(struct super_block *h_sb); ++ ++int au_loopback_init(void); ++void au_loopback_fin(void); ++#else ++AuStubInt0(au_test_loopback_overlap, struct super_block *sb, ++ struct dentry *h_adding) ++AuStubInt0(au_test_loopback_kthread, void) ++AuStubVoid(au_warn_loopback, struct super_block *h_sb) ++ ++AuStubInt0(au_loopback_init, void) ++AuStubVoid(au_loopback_fin, void) ++#endif /* BLK_DEV_LOOP */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_LOOP_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/magic.mk linux-3.2.0-gentoo-r1/fs/aufs/magic.mk +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/magic.mk 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/magic.mk 2012-01-17 12:11:24.828823733 +0100 +@@ -0,0 +1,54 @@ ++ ++# defined in ${srctree}/fs/fuse/inode.c ++# tristate ++ifdef CONFIG_FUSE_FS ++ccflags-y += -DFUSE_SUPER_MAGIC=0x65735546 ++endif ++ ++# defined in ${srctree}/fs/ocfs2/ocfs2_fs.h ++# tristate ++ifdef CONFIG_OCFS2_FS ++ccflags-y += -DOCFS2_SUPER_MAGIC=0x7461636f ++endif ++ ++# defined in ${srctree}/fs/ocfs2/dlm/userdlm.h ++# tristate ++ifdef CONFIG_OCFS2_FS_O2CB ++ccflags-y += -DDLMFS_MAGIC=0x76a9f425 ++endif ++ ++# defined in ${srctree}/fs/cifs/cifsfs.c ++# tristate ++ifdef CONFIG_CIFS_FS ++ccflags-y += -DCIFS_MAGIC_NUMBER=0xFF534D42 ++endif ++ ++# defined in ${srctree}/fs/xfs/xfs_sb.h ++# tristate ++ifdef CONFIG_XFS_FS ++ccflags-y += -DXFS_SB_MAGIC=0x58465342 ++endif ++ ++# defined in ${srctree}/fs/configfs/mount.c ++# tristate ++ifdef CONFIG_CONFIGFS_FS ++ccflags-y += -DCONFIGFS_MAGIC=0x62656570 ++endif ++ ++# defined in ${srctree}/fs/9p/v9fs.h ++# tristate ++ifdef CONFIG_9P_FS ++ccflags-y += -DV9FS_MAGIC=0x01021997 ++endif ++ ++# defined in ${srctree}/fs/ubifs/ubifs.h ++# tristate ++ifdef CONFIG_UBIFS_FS ++ccflags-y += -DUBIFS_SUPER_MAGIC=0x24051905 ++endif ++ ++# defined in ${srctree}/fs/hfsplus/hfsplus_raw.h ++# tristate ++ifdef CONFIG_HFSPLUS_FS ++ccflags-y += -DHFSPLUS_SUPER_MAGIC=0x482b ++endif +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/Makefile linux-3.2.0-gentoo-r1/fs/aufs/Makefile +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/Makefile 2012-01-17 12:11:24.465394530 +0100 +@@ -0,0 +1,42 @@ ++ ++include ${src}/magic.mk ++ifeq (${CONFIG_AUFS_FS},m) ++include ${src}/conf.mk ++endif ++-include ${src}/priv_def.mk ++ ++# cf. 
include/linux/kernel.h ++# enable pr_debug ++ccflags-y += -DDEBUG ++# sparse requires the full pathname ++ifdef M ++ccflags-y += -include ${M}/../../include/linux/aufs_type.h ++else ++ccflags-y += -include ${srctree}/include/linux/aufs_type.h ++endif ++ ++obj-$(CONFIG_AUFS_FS) += aufs.o ++aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \ ++ wkq.o vfsub.o dcsub.o \ ++ cpup.o whout.o wbr_policy.o \ ++ dinfo.o dentry.o \ ++ dynop.o \ ++ finfo.o file.o f_op.o \ ++ dir.o vdir.o \ ++ iinfo.o inode.o i_op.o i_op_add.o i_op_del.o i_op_ren.o \ ++ ioctl.o ++ ++# all are boolean ++aufs-$(CONFIG_PROC_FS) += procfs.o plink.o ++aufs-$(CONFIG_SYSFS) += sysfs.o ++aufs-$(CONFIG_DEBUG_FS) += dbgaufs.o ++aufs-$(CONFIG_AUFS_BDEV_LOOP) += loop.o ++aufs-$(CONFIG_AUFS_HNOTIFY) += hnotify.o ++aufs-$(CONFIG_AUFS_HFSNOTIFY) += hfsnotify.o ++aufs-$(CONFIG_AUFS_EXPORT) += export.o ++aufs-$(CONFIG_AUFS_POLL) += poll.o ++aufs-$(CONFIG_AUFS_RDU) += rdu.o ++aufs-$(CONFIG_AUFS_SP_IATTR) += f_op_sp.o ++aufs-$(CONFIG_AUFS_BR_HFSPLUS) += hfsplus.o ++aufs-$(CONFIG_AUFS_DEBUG) += debug.o ++aufs-$(CONFIG_AUFS_MAGIC_SYSRQ) += sysrq.o +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/module.c linux-3.2.0-gentoo-r1/fs/aufs/module.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/module.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/module.c 2012-01-17 12:11:24.828823733 +0100 +@@ -0,0 +1,195 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * module global variables and operations ++ */ ++ ++#include ++#include ++#include "aufs.h" ++ ++void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp) ++{ ++ if (new_sz <= nused) ++ return p; ++ ++ p = krealloc(p, new_sz, gfp); ++ if (p) ++ memset(p + nused, 0, new_sz - nused); ++ return p; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * aufs caches ++ */ ++struct kmem_cache *au_cachep[AuCache_Last]; ++static int __init au_cache_init(void) ++{ ++ au_cachep[AuCache_DINFO] = AuCacheCtor(au_dinfo, au_di_init_once); ++ if (au_cachep[AuCache_DINFO]) ++ /* SLAB_DESTROY_BY_RCU */ ++ au_cachep[AuCache_ICNTNR] = AuCacheCtor(au_icntnr, ++ au_icntnr_init_once); ++ if (au_cachep[AuCache_ICNTNR]) ++ au_cachep[AuCache_FINFO] = AuCacheCtor(au_finfo, ++ au_fi_init_once); ++ if (au_cachep[AuCache_FINFO]) ++ au_cachep[AuCache_VDIR] = AuCache(au_vdir); ++ if (au_cachep[AuCache_VDIR]) ++ au_cachep[AuCache_DEHSTR] = AuCache(au_vdir_dehstr); ++ if (au_cachep[AuCache_DEHSTR]) ++ return 0; ++ ++ return -ENOMEM; ++} ++ ++static void au_cache_fin(void) ++{ ++ int i; ++ ++ /* including AuCache_HNOTIFY */ ++ for (i = 0; i < AuCache_Last; i++) ++ if (au_cachep[i]) { ++ kmem_cache_destroy(au_cachep[i]); ++ au_cachep[i] = NULL; ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_dir_roflags; ++ ++#ifdef CONFIG_AUFS_SBILIST ++/* ++ * iterate_supers_type() doesn't protect us from ++ * remounting (branch management) ++ */ ++struct au_splhead au_sbilist; ++#endif ++ ++struct lock_class_key au_lc_key[AuLcKey_Last]; ++ ++/* ++ * functions for module interface. ++ */ ++MODULE_LICENSE("GPL"); ++/* MODULE_LICENSE("GPL v2"); */ ++MODULE_AUTHOR("Junjiro R. 
Okajima "); ++MODULE_DESCRIPTION(AUFS_NAME ++ " -- Advanced multi layered unification filesystem"); ++MODULE_VERSION(AUFS_VERSION); ++ ++/* this module parameter has no meaning when SYSFS is disabled */ ++int sysaufs_brs = 1; ++MODULE_PARM_DESC(brs, "use /fs/aufs/si_*/brN"); ++module_param_named(brs, sysaufs_brs, int, S_IRUGO); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static char au_esc_chars[0x20 + 3]; /* 0x01-0x20, backslash, del, and NULL */ ++ ++int au_seq_path(struct seq_file *seq, struct path *path) ++{ ++ return seq_path(seq, path, au_esc_chars); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int __init aufs_init(void) ++{ ++ int err, i; ++ char *p; ++ ++ p = au_esc_chars; ++ for (i = 1; i <= ' '; i++) ++ *p++ = i; ++ *p++ = '\\'; ++ *p++ = '\x7f'; ++ *p = 0; ++ ++ au_dir_roflags = au_file_roflags(O_DIRECTORY | O_LARGEFILE); ++ ++ au_sbilist_init(); ++ sysaufs_brs_init(); ++ au_debug_init(); ++ au_dy_init(); ++ err = sysaufs_init(); ++ if (unlikely(err)) ++ goto out; ++ err = au_procfs_init(); ++ if (unlikely(err)) ++ goto out_sysaufs; ++ err = au_wkq_init(); ++ if (unlikely(err)) ++ goto out_procfs; ++ err = au_loopback_init(); ++ if (unlikely(err)) ++ goto out_wkq; ++ err = au_hnotify_init(); ++ if (unlikely(err)) ++ goto out_loopback; ++ err = au_sysrq_init(); ++ if (unlikely(err)) ++ goto out_hin; ++ err = au_cache_init(); ++ if (unlikely(err)) ++ goto out_sysrq; ++ err = register_filesystem(&aufs_fs_type); ++ if (unlikely(err)) ++ goto out_cache; ++ /* since we define pr_fmt, call printk directly */ ++ printk(KERN_INFO AUFS_NAME " " AUFS_VERSION "\n"); ++ goto out; /* success */ ++ ++out_cache: ++ au_cache_fin(); ++out_sysrq: ++ au_sysrq_fin(); ++out_hin: ++ au_hnotify_fin(); ++out_loopback: ++ au_loopback_fin(); ++out_wkq: ++ au_wkq_fin(); ++out_procfs: ++ au_procfs_fin(); ++out_sysaufs: ++ sysaufs_fin(); ++ au_dy_fin(); ++out: ++ return err; ++} ++ ++static void __exit aufs_exit(void) ++{ ++ unregister_filesystem(&aufs_fs_type); ++ au_cache_fin(); ++ au_sysrq_fin(); ++ au_hnotify_fin(); ++ au_loopback_fin(); ++ au_wkq_fin(); ++ au_procfs_fin(); ++ sysaufs_fin(); ++ au_dy_fin(); ++} ++ ++module_init(aufs_init); ++module_exit(aufs_exit); +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/module.h linux-3.2.0-gentoo-r1/fs/aufs/module.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/module.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/module.h 2012-01-17 12:11:24.840397910 +0100 +@@ -0,0 +1,107 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * module initialization and module-global ++ */ ++ ++#ifndef __AUFS_MODULE_H__ ++#define __AUFS_MODULE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++struct path; ++struct seq_file; ++ ++/* module parameters */ ++extern int sysaufs_brs; ++ ++/* ---------------------------------------------------------------------- */ ++ ++extern int au_dir_roflags; ++ ++enum { ++ AuLcNonDir_FIINFO, ++ AuLcNonDir_DIINFO, ++ AuLcNonDir_IIINFO, ++ ++ AuLcDir_FIINFO, ++ AuLcDir_DIINFO, ++ AuLcDir_IIINFO, ++ ++ AuLcSymlink_DIINFO, ++ AuLcSymlink_IIINFO, ++ ++ AuLcKey_Last ++}; ++extern struct lock_class_key au_lc_key[AuLcKey_Last]; ++ ++void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp); ++int au_seq_path(struct seq_file *seq, struct path *path); ++ ++#ifdef CONFIG_PROC_FS ++/* procfs.c */ ++int __init au_procfs_init(void); ++void au_procfs_fin(void); ++#else ++AuStubInt0(au_procfs_init, void); ++AuStubVoid(au_procfs_fin, void); ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* kmem cache */ ++enum { ++ AuCache_DINFO, ++ AuCache_ICNTNR, ++ AuCache_FINFO, ++ AuCache_VDIR, ++ AuCache_DEHSTR, ++#ifdef CONFIG_AUFS_HNOTIFY ++ AuCache_HNOTIFY, ++#endif ++ AuCache_Last ++}; ++ ++#define AuCacheFlags (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD) ++#define AuCache(type) KMEM_CACHE(type, AuCacheFlags) ++#define AuCacheCtor(type, ctor) \ ++ kmem_cache_create(#type, sizeof(struct type), \ ++ __alignof__(struct type), AuCacheFlags, ctor) ++ ++extern struct kmem_cache *au_cachep[]; ++ ++#define AuCacheFuncs(name, index) \ ++static inline struct au_##name *au_cache_alloc_##name(void) \ ++{ return kmem_cache_alloc(au_cachep[AuCache_##index], GFP_NOFS); } \ ++static inline void au_cache_free_##name(struct au_##name *p) \ ++{ kmem_cache_free(au_cachep[AuCache_##index], p); } ++ ++AuCacheFuncs(dinfo, DINFO); ++AuCacheFuncs(icntnr, ICNTNR); ++AuCacheFuncs(finfo, FINFO); ++AuCacheFuncs(vdir, VDIR); ++AuCacheFuncs(vdir_dehstr, DEHSTR); ++#ifdef CONFIG_AUFS_HNOTIFY ++AuCacheFuncs(hnotify, HNOTIFY); ++#endif ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_MODULE_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/opts.c linux-3.2.0-gentoo-r1/fs/aufs/opts.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/opts.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/opts.c 2012-01-17 12:11:24.868175939 +0100 +@@ -0,0 +1,1677 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * mount options/flags ++ */ ++ ++#include ++#include /* a distribution requires */ ++#include ++#include "aufs.h" ++ ++/* ---------------------------------------------------------------------- */ ++ ++enum { ++ Opt_br, ++ Opt_add, Opt_del, Opt_mod, Opt_reorder, Opt_append, Opt_prepend, ++ Opt_idel, Opt_imod, Opt_ireorder, ++ Opt_dirwh, Opt_rdcache, Opt_rdblk, Opt_rdhash, Opt_rendir, ++ Opt_rdblk_def, Opt_rdhash_def, ++ Opt_xino, Opt_zxino, Opt_noxino, ++ Opt_trunc_xino, Opt_trunc_xino_v, Opt_notrunc_xino, ++ Opt_trunc_xino_path, Opt_itrunc_xino, ++ Opt_trunc_xib, Opt_notrunc_xib, ++ Opt_shwh, Opt_noshwh, ++ Opt_plink, Opt_noplink, Opt_list_plink, ++ Opt_udba, ++ Opt_dio, Opt_nodio, ++ /* Opt_lock, Opt_unlock, */ ++ Opt_cmd, Opt_cmd_args, ++ Opt_diropq_a, Opt_diropq_w, ++ Opt_warn_perm, Opt_nowarn_perm, ++ Opt_wbr_copyup, Opt_wbr_create, ++ Opt_refrof, Opt_norefrof, ++ Opt_verbose, Opt_noverbose, ++ Opt_sum, Opt_nosum, Opt_wsum, ++ Opt_tail, Opt_ignore, Opt_ignore_silent, Opt_err ++}; ++ ++static match_table_t options = { ++ {Opt_br, "br=%s"}, ++ {Opt_br, "br:%s"}, ++ ++ {Opt_add, "add=%d:%s"}, ++ {Opt_add, "add:%d:%s"}, ++ {Opt_add, "ins=%d:%s"}, ++ {Opt_add, "ins:%d:%s"}, ++ {Opt_append, "append=%s"}, ++ {Opt_append, "append:%s"}, ++ {Opt_prepend, "prepend=%s"}, ++ {Opt_prepend, "prepend:%s"}, ++ ++ {Opt_del, "del=%s"}, ++ {Opt_del, "del:%s"}, ++ /* {Opt_idel, "idel:%d"}, */ ++ {Opt_mod, "mod=%s"}, ++ {Opt_mod, "mod:%s"}, ++ /* {Opt_imod, "imod:%d:%s"}, */ ++ ++ {Opt_dirwh, "dirwh=%d"}, ++ ++ {Opt_xino, "xino=%s"}, ++ {Opt_noxino, "noxino"}, ++ {Opt_trunc_xino, "trunc_xino"}, ++ {Opt_trunc_xino_v, "trunc_xino_v=%d:%d"}, ++ {Opt_notrunc_xino, "notrunc_xino"}, ++ {Opt_trunc_xino_path, "trunc_xino=%s"}, ++ {Opt_itrunc_xino, "itrunc_xino=%d"}, ++ /* {Opt_zxino, "zxino=%s"}, */ ++ {Opt_trunc_xib, "trunc_xib"}, ++ {Opt_notrunc_xib, "notrunc_xib"}, ++ ++#ifdef CONFIG_PROC_FS ++ {Opt_plink, "plink"}, ++#else ++ {Opt_ignore_silent, "plink"}, ++#endif ++ ++ {Opt_noplink, "noplink"}, ++ ++#ifdef CONFIG_AUFS_DEBUG ++ {Opt_list_plink, "list_plink"}, ++#endif ++ ++ {Opt_udba, "udba=%s"}, ++ ++ {Opt_dio, "dio"}, ++ {Opt_nodio, "nodio"}, ++ ++ {Opt_diropq_a, "diropq=always"}, ++ {Opt_diropq_a, "diropq=a"}, ++ {Opt_diropq_w, "diropq=whiteouted"}, ++ {Opt_diropq_w, "diropq=w"}, ++ ++ {Opt_warn_perm, "warn_perm"}, ++ {Opt_nowarn_perm, "nowarn_perm"}, ++ ++ /* keep them temporary */ ++ {Opt_ignore_silent, "coo=%s"}, ++ {Opt_ignore_silent, "nodlgt"}, ++ {Opt_ignore_silent, "nodirperm1"}, ++ {Opt_ignore_silent, "clean_plink"}, ++ ++#ifdef CONFIG_AUFS_SHWH ++ {Opt_shwh, "shwh"}, ++#endif ++ {Opt_noshwh, "noshwh"}, ++ ++ {Opt_rendir, "rendir=%d"}, ++ ++ {Opt_refrof, "refrof"}, ++ {Opt_norefrof, "norefrof"}, ++ ++ {Opt_verbose, "verbose"}, ++ {Opt_verbose, "v"}, ++ {Opt_noverbose, "noverbose"}, ++ {Opt_noverbose, "quiet"}, ++ {Opt_noverbose, "q"}, ++ {Opt_noverbose, "silent"}, ++ ++ {Opt_sum, "sum"}, ++ {Opt_nosum, "nosum"}, ++ {Opt_wsum, "wsum"}, ++ ++ {Opt_rdcache, "rdcache=%d"}, ++ {Opt_rdblk, "rdblk=%d"}, ++ {Opt_rdblk_def, "rdblk=def"}, ++ {Opt_rdhash, "rdhash=%d"}, ++ {Opt_rdhash_def, "rdhash=def"}, ++ ++ {Opt_wbr_create, "create=%s"}, ++ {Opt_wbr_create, "create_policy=%s"}, ++ {Opt_wbr_copyup, "cpup=%s"}, ++ {Opt_wbr_copyup, "copyup=%s"}, ++ {Opt_wbr_copyup, 
"copyup_policy=%s"}, ++ ++ /* internal use for the scripts */ ++ {Opt_ignore_silent, "si=%s"}, ++ ++ {Opt_br, "dirs=%s"}, ++ {Opt_ignore, "debug=%d"}, ++ {Opt_ignore, "delete=whiteout"}, ++ {Opt_ignore, "delete=all"}, ++ {Opt_ignore, "imap=%s"}, ++ ++ /* temporary workaround, due to old mount(8)? */ ++ {Opt_ignore_silent, "relatime"}, ++ ++ {Opt_err, NULL} ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static const char *au_parser_pattern(int val, struct match_token *token) ++{ ++ while (token->pattern) { ++ if (token->token == val) ++ return token->pattern; ++ token++; ++ } ++ BUG(); ++ return "??"; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static match_table_t brperm = { ++ {AuBrPerm_RO, AUFS_BRPERM_RO}, ++ {AuBrPerm_RR, AUFS_BRPERM_RR}, ++ {AuBrPerm_RW, AUFS_BRPERM_RW}, ++ {0, NULL} ++}; ++ ++static match_table_t brrattr = { ++ {AuBrRAttr_WH, AUFS_BRRATTR_WH}, ++ {0, NULL} ++}; ++ ++static match_table_t brwattr = { ++ {AuBrWAttr_NoLinkWH, AUFS_BRWATTR_NLWH}, ++ {0, NULL} ++}; ++ ++#define AuBrStr_LONGEST AUFS_BRPERM_RW "+" AUFS_BRWATTR_NLWH ++ ++static int br_attr_val(char *str, match_table_t table, substring_t args[]) ++{ ++ int attr, v; ++ char *p; ++ ++ attr = 0; ++ do { ++ p = strchr(str, '+'); ++ if (p) ++ *p = 0; ++ v = match_token(str, table, args); ++ if (v) ++ attr |= v; ++ else { ++ if (p) ++ *p = '+'; ++ pr_warning("ignored branch attribute %s\n", str); ++ break; ++ } ++ if (p) ++ str = p + 1; ++ } while (p); ++ ++ return attr; ++} ++ ++static int noinline_for_stack br_perm_val(char *perm) ++{ ++ int val; ++ char *p; ++ substring_t args[MAX_OPT_ARGS]; ++ ++ p = strchr(perm, '+'); ++ if (p) ++ *p = 0; ++ val = match_token(perm, brperm, args); ++ if (!val) { ++ if (p) ++ *p = '+'; ++ pr_warning("ignored branch permission %s\n", perm); ++ val = AuBrPerm_RO; ++ goto out; ++ } ++ if (!p) ++ goto out; ++ ++ switch (val) { ++ case AuBrPerm_RO: ++ case AuBrPerm_RR: ++ val |= br_attr_val(p + 1, brrattr, args); ++ break; ++ case AuBrPerm_RW: ++ val |= br_attr_val(p + 1, brwattr, args); ++ break; ++ } ++ ++out: ++ return val; ++} ++ ++/* Caller should free the return value */ ++char *au_optstr_br_perm(int brperm) ++{ ++ char *p, a[sizeof(AuBrStr_LONGEST)]; ++ int sz; ++ ++#define SetPerm(str) do { \ ++ sz = sizeof(str); \ ++ memcpy(a, str, sz); \ ++ p = a + sz - 1; \ ++ } while (0) ++ ++#define AppendAttr(flag, str) do { \ ++ if (brperm & flag) { \ ++ sz = sizeof(str); \ ++ *p++ = '+'; \ ++ memcpy(p, str, sz); \ ++ p += sz - 1; \ ++ } \ ++ } while (0) ++ ++ switch (brperm & AuBrPerm_Mask) { ++ case AuBrPerm_RO: ++ SetPerm(AUFS_BRPERM_RO); ++ break; ++ case AuBrPerm_RR: ++ SetPerm(AUFS_BRPERM_RR); ++ break; ++ case AuBrPerm_RW: ++ SetPerm(AUFS_BRPERM_RW); ++ break; ++ default: ++ AuDebugOn(1); ++ } ++ ++ AppendAttr(AuBrRAttr_WH, AUFS_BRRATTR_WH); ++ AppendAttr(AuBrWAttr_NoLinkWH, AUFS_BRWATTR_NLWH); ++ ++ AuDebugOn(strlen(a) >= sizeof(a)); ++ return kstrdup(a, GFP_NOFS); ++#undef SetPerm ++#undef AppendAttr ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static match_table_t udbalevel = { ++ {AuOpt_UDBA_REVAL, "reval"}, ++ {AuOpt_UDBA_NONE, "none"}, ++#ifdef CONFIG_AUFS_HNOTIFY ++ {AuOpt_UDBA_HNOTIFY, "notify"}, /* abstraction */ ++#ifdef CONFIG_AUFS_HFSNOTIFY ++ {AuOpt_UDBA_HNOTIFY, "fsnotify"}, ++#endif ++#endif ++ {-1, NULL} ++}; ++ ++static int noinline_for_stack udba_val(char *str) ++{ ++ substring_t args[MAX_OPT_ARGS]; ++ ++ return 
match_token(str, udbalevel, args); ++} ++ ++const char *au_optstr_udba(int udba) ++{ ++ return au_parser_pattern(udba, (void *)udbalevel); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static match_table_t au_wbr_create_policy = { ++ {AuWbrCreate_TDP, "tdp"}, ++ {AuWbrCreate_TDP, "top-down-parent"}, ++ {AuWbrCreate_RR, "rr"}, ++ {AuWbrCreate_RR, "round-robin"}, ++ {AuWbrCreate_MFS, "mfs"}, ++ {AuWbrCreate_MFS, "most-free-space"}, ++ {AuWbrCreate_MFSV, "mfs:%d"}, ++ {AuWbrCreate_MFSV, "most-free-space:%d"}, ++ ++ {AuWbrCreate_MFSRR, "mfsrr:%d"}, ++ {AuWbrCreate_MFSRRV, "mfsrr:%d:%d"}, ++ {AuWbrCreate_PMFS, "pmfs"}, ++ {AuWbrCreate_PMFSV, "pmfs:%d"}, ++ ++ {-1, NULL} ++}; ++ ++/* ++ * cf. linux/lib/parser.c and cmdline.c ++ * gave up calling memparse() since it uses simple_strtoull() instead of ++ * kstrto...(). ++ */ ++static int noinline_for_stack ++au_match_ull(substring_t *s, unsigned long long *result) ++{ ++ int err; ++ unsigned int len; ++ char a[32]; ++ ++ err = -ERANGE; ++ len = s->to - s->from; ++ if (len + 1 <= sizeof(a)) { ++ memcpy(a, s->from, len); ++ a[len] = '\0'; ++ err = kstrtoull(a, 0, result); ++ } ++ return err; ++} ++ ++static int au_wbr_mfs_wmark(substring_t *arg, char *str, ++ struct au_opt_wbr_create *create) ++{ ++ int err; ++ unsigned long long ull; ++ ++ err = 0; ++ if (!au_match_ull(arg, &ull)) ++ create->mfsrr_watermark = ull; ++ else { ++ pr_err("bad integer in %s\n", str); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++static int au_wbr_mfs_sec(substring_t *arg, char *str, ++ struct au_opt_wbr_create *create) ++{ ++ int n, err; ++ ++ err = 0; ++ if (!match_int(arg, &n) && 0 <= n && n <= AUFS_MFS_MAX_SEC) ++ create->mfs_second = n; ++ else { ++ pr_err("bad integer in %s\n", str); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++static int noinline_for_stack ++au_wbr_create_val(char *str, struct au_opt_wbr_create *create) ++{ ++ int err, e; ++ substring_t args[MAX_OPT_ARGS]; ++ ++ err = match_token(str, au_wbr_create_policy, args); ++ create->wbr_create = err; ++ switch (err) { ++ case AuWbrCreate_MFSRRV: ++ e = au_wbr_mfs_wmark(&args[0], str, create); ++ if (!e) ++ e = au_wbr_mfs_sec(&args[1], str, create); ++ if (unlikely(e)) ++ err = e; ++ break; ++ case AuWbrCreate_MFSRR: ++ e = au_wbr_mfs_wmark(&args[0], str, create); ++ if (unlikely(e)) { ++ err = e; ++ break; ++ } ++ /*FALLTHROUGH*/ ++ case AuWbrCreate_MFS: ++ case AuWbrCreate_PMFS: ++ create->mfs_second = AUFS_MFS_DEF_SEC; ++ break; ++ case AuWbrCreate_MFSV: ++ case AuWbrCreate_PMFSV: ++ e = au_wbr_mfs_sec(&args[0], str, create); ++ if (unlikely(e)) ++ err = e; ++ break; ++ } ++ ++ return err; ++} ++ ++const char *au_optstr_wbr_create(int wbr_create) ++{ ++ return au_parser_pattern(wbr_create, (void *)au_wbr_create_policy); ++} ++ ++static match_table_t au_wbr_copyup_policy = { ++ {AuWbrCopyup_TDP, "tdp"}, ++ {AuWbrCopyup_TDP, "top-down-parent"}, ++ {AuWbrCopyup_BUP, "bup"}, ++ {AuWbrCopyup_BUP, "bottom-up-parent"}, ++ {AuWbrCopyup_BU, "bu"}, ++ {AuWbrCopyup_BU, "bottom-up"}, ++ {-1, NULL} ++}; ++ ++static int noinline_for_stack au_wbr_copyup_val(char *str) ++{ ++ substring_t args[MAX_OPT_ARGS]; ++ ++ return match_token(str, au_wbr_copyup_policy, args); ++} ++ ++const char *au_optstr_wbr_copyup(int wbr_copyup) ++{ ++ return au_parser_pattern(wbr_copyup, (void *)au_wbr_copyup_policy); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static const int lkup_dirflags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; ++ ++static void 
dump_opts(struct au_opts *opts) ++{ ++#ifdef CONFIG_AUFS_DEBUG ++ /* reduce stack space */ ++ union { ++ struct au_opt_add *add; ++ struct au_opt_del *del; ++ struct au_opt_mod *mod; ++ struct au_opt_xino *xino; ++ struct au_opt_xino_itrunc *xino_itrunc; ++ struct au_opt_wbr_create *create; ++ } u; ++ struct au_opt *opt; ++ ++ opt = opts->opt; ++ while (opt->type != Opt_tail) { ++ switch (opt->type) { ++ case Opt_add: ++ u.add = &opt->add; ++ AuDbg("add {b%d, %s, 0x%x, %p}\n", ++ u.add->bindex, u.add->pathname, u.add->perm, ++ u.add->path.dentry); ++ break; ++ case Opt_del: ++ case Opt_idel: ++ u.del = &opt->del; ++ AuDbg("del {%s, %p}\n", ++ u.del->pathname, u.del->h_path.dentry); ++ break; ++ case Opt_mod: ++ case Opt_imod: ++ u.mod = &opt->mod; ++ AuDbg("mod {%s, 0x%x, %p}\n", ++ u.mod->path, u.mod->perm, u.mod->h_root); ++ break; ++ case Opt_append: ++ u.add = &opt->add; ++ AuDbg("append {b%d, %s, 0x%x, %p}\n", ++ u.add->bindex, u.add->pathname, u.add->perm, ++ u.add->path.dentry); ++ break; ++ case Opt_prepend: ++ u.add = &opt->add; ++ AuDbg("prepend {b%d, %s, 0x%x, %p}\n", ++ u.add->bindex, u.add->pathname, u.add->perm, ++ u.add->path.dentry); ++ break; ++ case Opt_dirwh: ++ AuDbg("dirwh %d\n", opt->dirwh); ++ break; ++ case Opt_rdcache: ++ AuDbg("rdcache %d\n", opt->rdcache); ++ break; ++ case Opt_rdblk: ++ AuDbg("rdblk %u\n", opt->rdblk); ++ break; ++ case Opt_rdblk_def: ++ AuDbg("rdblk_def\n"); ++ break; ++ case Opt_rdhash: ++ AuDbg("rdhash %u\n", opt->rdhash); ++ break; ++ case Opt_rdhash_def: ++ AuDbg("rdhash_def\n"); ++ break; ++ case Opt_xino: ++ u.xino = &opt->xino; ++ AuDbg("xino {%s %.*s}\n", ++ u.xino->path, ++ AuDLNPair(u.xino->file->f_dentry)); ++ break; ++ case Opt_trunc_xino: ++ AuLabel(trunc_xino); ++ break; ++ case Opt_notrunc_xino: ++ AuLabel(notrunc_xino); ++ break; ++ case Opt_trunc_xino_path: ++ case Opt_itrunc_xino: ++ u.xino_itrunc = &opt->xino_itrunc; ++ AuDbg("trunc_xino %d\n", u.xino_itrunc->bindex); ++ break; ++ ++ case Opt_noxino: ++ AuLabel(noxino); ++ break; ++ case Opt_trunc_xib: ++ AuLabel(trunc_xib); ++ break; ++ case Opt_notrunc_xib: ++ AuLabel(notrunc_xib); ++ break; ++ case Opt_shwh: ++ AuLabel(shwh); ++ break; ++ case Opt_noshwh: ++ AuLabel(noshwh); ++ break; ++ case Opt_plink: ++ AuLabel(plink); ++ break; ++ case Opt_noplink: ++ AuLabel(noplink); ++ break; ++ case Opt_list_plink: ++ AuLabel(list_plink); ++ break; ++ case Opt_udba: ++ AuDbg("udba %d, %s\n", ++ opt->udba, au_optstr_udba(opt->udba)); ++ break; ++ case Opt_dio: ++ AuLabel(dio); ++ break; ++ case Opt_nodio: ++ AuLabel(nodio); ++ break; ++ case Opt_diropq_a: ++ AuLabel(diropq_a); ++ break; ++ case Opt_diropq_w: ++ AuLabel(diropq_w); ++ break; ++ case Opt_warn_perm: ++ AuLabel(warn_perm); ++ break; ++ case Opt_nowarn_perm: ++ AuLabel(nowarn_perm); ++ break; ++ case Opt_refrof: ++ AuLabel(refrof); ++ break; ++ case Opt_norefrof: ++ AuLabel(norefrof); ++ break; ++ case Opt_verbose: ++ AuLabel(verbose); ++ break; ++ case Opt_noverbose: ++ AuLabel(noverbose); ++ break; ++ case Opt_sum: ++ AuLabel(sum); ++ break; ++ case Opt_nosum: ++ AuLabel(nosum); ++ break; ++ case Opt_wsum: ++ AuLabel(wsum); ++ break; ++ case Opt_wbr_create: ++ u.create = &opt->wbr_create; ++ AuDbg("create %d, %s\n", u.create->wbr_create, ++ au_optstr_wbr_create(u.create->wbr_create)); ++ switch (u.create->wbr_create) { ++ case AuWbrCreate_MFSV: ++ case AuWbrCreate_PMFSV: ++ AuDbg("%d sec\n", u.create->mfs_second); ++ break; ++ case AuWbrCreate_MFSRR: ++ AuDbg("%llu watermark\n", ++ u.create->mfsrr_watermark); ++ break; 
++ case AuWbrCreate_MFSRRV: ++ AuDbg("%llu watermark, %d sec\n", ++ u.create->mfsrr_watermark, ++ u.create->mfs_second); ++ break; ++ } ++ break; ++ case Opt_wbr_copyup: ++ AuDbg("copyup %d, %s\n", opt->wbr_copyup, ++ au_optstr_wbr_copyup(opt->wbr_copyup)); ++ break; ++ default: ++ BUG(); ++ } ++ opt++; ++ } ++#endif ++} ++ ++void au_opts_free(struct au_opts *opts) ++{ ++ struct au_opt *opt; ++ ++ opt = opts->opt; ++ while (opt->type != Opt_tail) { ++ switch (opt->type) { ++ case Opt_add: ++ case Opt_append: ++ case Opt_prepend: ++ path_put(&opt->add.path); ++ break; ++ case Opt_del: ++ case Opt_idel: ++ path_put(&opt->del.h_path); ++ break; ++ case Opt_mod: ++ case Opt_imod: ++ dput(opt->mod.h_root); ++ break; ++ case Opt_xino: ++ fput(opt->xino.file); ++ break; ++ } ++ opt++; ++ } ++} ++ ++static int opt_add(struct au_opt *opt, char *opt_str, unsigned long sb_flags, ++ aufs_bindex_t bindex) ++{ ++ int err; ++ struct au_opt_add *add = &opt->add; ++ char *p; ++ ++ add->bindex = bindex; ++ add->perm = AuBrPerm_RO; ++ add->pathname = opt_str; ++ p = strchr(opt_str, '='); ++ if (p) { ++ *p++ = 0; ++ if (*p) ++ add->perm = br_perm_val(p); ++ } ++ ++ err = vfsub_kern_path(add->pathname, lkup_dirflags, &add->path); ++ if (!err) { ++ if (!p) { ++ add->perm = AuBrPerm_RO; ++ if (au_test_fs_rr(add->path.dentry->d_sb)) ++ add->perm = AuBrPerm_RR; ++ else if (!bindex && !(sb_flags & MS_RDONLY)) ++ add->perm = AuBrPerm_RW; ++ } ++ opt->type = Opt_add; ++ goto out; ++ } ++ pr_err("lookup failed %s (%d)\n", add->pathname, err); ++ err = -EINVAL; ++ ++out: ++ return err; ++} ++ ++static int au_opts_parse_del(struct au_opt_del *del, substring_t args[]) ++{ ++ int err; ++ ++ del->pathname = args[0].from; ++ AuDbg("del path %s\n", del->pathname); ++ ++ err = vfsub_kern_path(del->pathname, lkup_dirflags, &del->h_path); ++ if (unlikely(err)) ++ pr_err("lookup failed %s (%d)\n", del->pathname, err); ++ ++ return err; ++} ++ ++#if 0 /* reserved for future use */ ++static int au_opts_parse_idel(struct super_block *sb, aufs_bindex_t bindex, ++ struct au_opt_del *del, substring_t args[]) ++{ ++ int err; ++ struct dentry *root; ++ ++ err = -EINVAL; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_FLUSH); ++ if (bindex < 0 || au_sbend(sb) < bindex) { ++ pr_err("out of bounds, %d\n", bindex); ++ goto out; ++ } ++ ++ err = 0; ++ del->h_path.dentry = dget(au_h_dptr(root, bindex)); ++ del->h_path.mnt = mntget(au_sbr_mnt(sb, bindex)); ++ ++out: ++ aufs_read_unlock(root, !AuLock_IR); ++ return err; ++} ++#endif ++ ++static int noinline_for_stack ++au_opts_parse_mod(struct au_opt_mod *mod, substring_t args[]) ++{ ++ int err; ++ struct path path; ++ char *p; ++ ++ err = -EINVAL; ++ mod->path = args[0].from; ++ p = strchr(mod->path, '='); ++ if (unlikely(!p)) { ++ pr_err("no permssion %s\n", args[0].from); ++ goto out; ++ } ++ ++ *p++ = 0; ++ err = vfsub_kern_path(mod->path, lkup_dirflags, &path); ++ if (unlikely(err)) { ++ pr_err("lookup failed %s (%d)\n", mod->path, err); ++ goto out; ++ } ++ ++ mod->perm = br_perm_val(p); ++ AuDbg("mod path %s, perm 0x%x, %s\n", mod->path, mod->perm, p); ++ mod->h_root = dget(path.dentry); ++ path_put(&path); ++ ++out: ++ return err; ++} ++ ++#if 0 /* reserved for future use */ ++static int au_opts_parse_imod(struct super_block *sb, aufs_bindex_t bindex, ++ struct au_opt_mod *mod, substring_t args[]) ++{ ++ int err; ++ struct dentry *root; ++ ++ err = -EINVAL; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_FLUSH); ++ if (bindex < 0 || au_sbend(sb) < bindex) { ++ pr_err("out of 
bounds, %d\n", bindex); ++ goto out; ++ } ++ ++ err = 0; ++ mod->perm = br_perm_val(args[1].from); ++ AuDbg("mod path %s, perm 0x%x, %s\n", ++ mod->path, mod->perm, args[1].from); ++ mod->h_root = dget(au_h_dptr(root, bindex)); ++ ++out: ++ aufs_read_unlock(root, !AuLock_IR); ++ return err; ++} ++#endif ++ ++static int au_opts_parse_xino(struct super_block *sb, struct au_opt_xino *xino, ++ substring_t args[]) ++{ ++ int err; ++ struct file *file; ++ ++ file = au_xino_create(sb, args[0].from, /*silent*/0); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ ++ err = -EINVAL; ++ if (unlikely(file->f_dentry->d_sb == sb)) { ++ fput(file); ++ pr_err("%s must be outside\n", args[0].from); ++ goto out; ++ } ++ ++ err = 0; ++ xino->file = file; ++ xino->path = args[0].from; ++ ++out: ++ return err; ++} ++ ++static int noinline_for_stack ++au_opts_parse_xino_itrunc_path(struct super_block *sb, ++ struct au_opt_xino_itrunc *xino_itrunc, ++ substring_t args[]) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct path path; ++ struct dentry *root; ++ ++ err = vfsub_kern_path(args[0].from, lkup_dirflags, &path); ++ if (unlikely(err)) { ++ pr_err("lookup failed %s (%d)\n", args[0].from, err); ++ goto out; ++ } ++ ++ xino_itrunc->bindex = -1; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_FLUSH); ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ if (au_h_dptr(root, bindex) == path.dentry) { ++ xino_itrunc->bindex = bindex; ++ break; ++ } ++ } ++ aufs_read_unlock(root, !AuLock_IR); ++ path_put(&path); ++ ++ if (unlikely(xino_itrunc->bindex < 0)) { ++ pr_err("no such branch %s\n", args[0].from); ++ err = -EINVAL; ++ } ++ ++out: ++ return err; ++} ++ ++/* called without aufs lock */ ++int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts) ++{ ++ int err, n, token; ++ aufs_bindex_t bindex; ++ unsigned char skipped; ++ struct dentry *root; ++ struct au_opt *opt, *opt_tail; ++ char *opt_str; ++ /* reduce the stack space */ ++ union { ++ struct au_opt_xino_itrunc *xino_itrunc; ++ struct au_opt_wbr_create *create; ++ } u; ++ struct { ++ substring_t args[MAX_OPT_ARGS]; ++ } *a; ++ ++ err = -ENOMEM; ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ root = sb->s_root; ++ err = 0; ++ bindex = 0; ++ opt = opts->opt; ++ opt_tail = opt + opts->max_opt - 1; ++ opt->type = Opt_tail; ++ while (!err && (opt_str = strsep(&str, ",")) && *opt_str) { ++ err = -EINVAL; ++ skipped = 0; ++ token = match_token(opt_str, options, a->args); ++ switch (token) { ++ case Opt_br: ++ err = 0; ++ while (!err && (opt_str = strsep(&a->args[0].from, ":")) ++ && *opt_str) { ++ err = opt_add(opt, opt_str, opts->sb_flags, ++ bindex++); ++ if (unlikely(!err && ++opt > opt_tail)) { ++ err = -E2BIG; ++ break; ++ } ++ opt->type = Opt_tail; ++ skipped = 1; ++ } ++ break; ++ case Opt_add: ++ if (unlikely(match_int(&a->args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ bindex = n; ++ err = opt_add(opt, a->args[1].from, opts->sb_flags, ++ bindex); ++ if (!err) ++ opt->type = token; ++ break; ++ case Opt_append: ++ err = opt_add(opt, a->args[0].from, opts->sb_flags, ++ /*dummy bindex*/1); ++ if (!err) ++ opt->type = token; ++ break; ++ case Opt_prepend: ++ err = opt_add(opt, a->args[0].from, opts->sb_flags, ++ /*bindex*/0); ++ if (!err) ++ opt->type = token; ++ break; ++ case Opt_del: ++ err = au_opts_parse_del(&opt->del, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#if 0 /* reserved for future use */ ++ case Opt_idel: ++ 
del->pathname = "(indexed)"; ++ if (unlikely(match_int(&args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ err = au_opts_parse_idel(sb, n, &opt->del, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#endif ++ case Opt_mod: ++ err = au_opts_parse_mod(&opt->mod, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#ifdef IMOD /* reserved for future use */ ++ case Opt_imod: ++ u.mod->path = "(indexed)"; ++ if (unlikely(match_int(&a->args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ err = au_opts_parse_imod(sb, n, &opt->mod, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#endif ++ case Opt_xino: ++ err = au_opts_parse_xino(sb, &opt->xino, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++ ++ case Opt_trunc_xino_path: ++ err = au_opts_parse_xino_itrunc_path ++ (sb, &opt->xino_itrunc, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++ ++ case Opt_itrunc_xino: ++ u.xino_itrunc = &opt->xino_itrunc; ++ if (unlikely(match_int(&a->args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ u.xino_itrunc->bindex = n; ++ aufs_read_lock(root, AuLock_FLUSH); ++ if (n < 0 || au_sbend(sb) < n) { ++ pr_err("out of bounds, %d\n", n); ++ aufs_read_unlock(root, !AuLock_IR); ++ break; ++ } ++ aufs_read_unlock(root, !AuLock_IR); ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_dirwh: ++ if (unlikely(match_int(&a->args[0], &opt->dirwh))) ++ break; ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_rdcache: ++ if (unlikely(match_int(&a->args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ if (unlikely(n > AUFS_RDCACHE_MAX)) { ++ pr_err("rdcache must be smaller than %d\n", ++ AUFS_RDCACHE_MAX); ++ break; ++ } ++ opt->rdcache = n; ++ err = 0; ++ opt->type = token; ++ break; ++ case Opt_rdblk: ++ if (unlikely(match_int(&a->args[0], &n) ++ || n < 0 ++ || n > KMALLOC_MAX_SIZE)) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ if (unlikely(n && n < NAME_MAX)) { ++ pr_err("rdblk must be larger than %d\n", ++ NAME_MAX); ++ break; ++ } ++ opt->rdblk = n; ++ err = 0; ++ opt->type = token; ++ break; ++ case Opt_rdhash: ++ if (unlikely(match_int(&a->args[0], &n) ++ || n < 0 ++ || n * sizeof(struct hlist_head) ++ > KMALLOC_MAX_SIZE)) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ opt->rdhash = n; ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_trunc_xino: ++ case Opt_notrunc_xino: ++ case Opt_noxino: ++ case Opt_trunc_xib: ++ case Opt_notrunc_xib: ++ case Opt_shwh: ++ case Opt_noshwh: ++ case Opt_plink: ++ case Opt_noplink: ++ case Opt_list_plink: ++ case Opt_dio: ++ case Opt_nodio: ++ case Opt_diropq_a: ++ case Opt_diropq_w: ++ case Opt_warn_perm: ++ case Opt_nowarn_perm: ++ case Opt_refrof: ++ case Opt_norefrof: ++ case Opt_verbose: ++ case Opt_noverbose: ++ case Opt_sum: ++ case Opt_nosum: ++ case Opt_wsum: ++ case Opt_rdblk_def: ++ case Opt_rdhash_def: ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_udba: ++ opt->udba = udba_val(a->args[0].from); ++ if (opt->udba >= 0) { ++ err = 0; ++ opt->type = token; ++ } else ++ pr_err("wrong value, %s\n", opt_str); ++ break; ++ ++ case Opt_wbr_create: ++ u.create = &opt->wbr_create; ++ u.create->wbr_create ++ = au_wbr_create_val(a->args[0].from, u.create); ++ if (u.create->wbr_create >= 0) { ++ err = 0; ++ opt->type = token; ++ } else ++ pr_err("wrong value, %s\n", opt_str); ++ break; ++ case Opt_wbr_copyup: ++ opt->wbr_copyup = au_wbr_copyup_val(a->args[0].from); ++ if 
(opt->wbr_copyup >= 0) { ++ err = 0; ++ opt->type = token; ++ } else ++ pr_err("wrong value, %s\n", opt_str); ++ break; ++ ++ case Opt_ignore: ++ pr_warning("ignored %s\n", opt_str); ++ /*FALLTHROUGH*/ ++ case Opt_ignore_silent: ++ skipped = 1; ++ err = 0; ++ break; ++ case Opt_err: ++ pr_err("unknown option %s\n", opt_str); ++ break; ++ } ++ ++ if (!err && !skipped) { ++ if (unlikely(++opt > opt_tail)) { ++ err = -E2BIG; ++ opt--; ++ opt->type = Opt_tail; ++ break; ++ } ++ opt->type = Opt_tail; ++ } ++ } ++ ++ kfree(a); ++ dump_opts(opts); ++ if (unlikely(err)) ++ au_opts_free(opts); ++ ++out: ++ return err; ++} ++ ++static int au_opt_wbr_create(struct super_block *sb, ++ struct au_opt_wbr_create *create) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 1; /* handled */ ++ sbinfo = au_sbi(sb); ++ if (sbinfo->si_wbr_create_ops->fin) { ++ err = sbinfo->si_wbr_create_ops->fin(sb); ++ if (!err) ++ err = 1; ++ } ++ ++ sbinfo->si_wbr_create = create->wbr_create; ++ sbinfo->si_wbr_create_ops = au_wbr_create_ops + create->wbr_create; ++ switch (create->wbr_create) { ++ case AuWbrCreate_MFSRRV: ++ case AuWbrCreate_MFSRR: ++ sbinfo->si_wbr_mfs.mfsrr_watermark = create->mfsrr_watermark; ++ /*FALLTHROUGH*/ ++ case AuWbrCreate_MFS: ++ case AuWbrCreate_MFSV: ++ case AuWbrCreate_PMFS: ++ case AuWbrCreate_PMFSV: ++ sbinfo->si_wbr_mfs.mfs_expire ++ = msecs_to_jiffies(create->mfs_second * MSEC_PER_SEC); ++ break; ++ } ++ ++ if (sbinfo->si_wbr_create_ops->init) ++ sbinfo->si_wbr_create_ops->init(sb); /* ignore */ ++ ++ return err; ++} ++ ++/* ++ * returns, ++ * plus: processed without an error ++ * zero: unprocessed ++ */ ++static int au_opt_simple(struct super_block *sb, struct au_opt *opt, ++ struct au_opts *opts) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 1; /* handled */ ++ sbinfo = au_sbi(sb); ++ switch (opt->type) { ++ case Opt_udba: ++ sbinfo->si_mntflags &= ~AuOptMask_UDBA; ++ sbinfo->si_mntflags |= opt->udba; ++ opts->given_udba |= opt->udba; ++ break; ++ ++ case Opt_plink: ++ au_opt_set(sbinfo->si_mntflags, PLINK); ++ break; ++ case Opt_noplink: ++ if (au_opt_test(sbinfo->si_mntflags, PLINK)) ++ au_plink_put(sb, /*verbose*/1); ++ au_opt_clr(sbinfo->si_mntflags, PLINK); ++ break; ++ case Opt_list_plink: ++ if (au_opt_test(sbinfo->si_mntflags, PLINK)) ++ au_plink_list(sb); ++ break; ++ ++ case Opt_dio: ++ au_opt_set(sbinfo->si_mntflags, DIO); ++ au_fset_opts(opts->flags, REFRESH_DYAOP); ++ break; ++ case Opt_nodio: ++ au_opt_clr(sbinfo->si_mntflags, DIO); ++ au_fset_opts(opts->flags, REFRESH_DYAOP); ++ break; ++ ++ case Opt_diropq_a: ++ au_opt_set(sbinfo->si_mntflags, ALWAYS_DIROPQ); ++ break; ++ case Opt_diropq_w: ++ au_opt_clr(sbinfo->si_mntflags, ALWAYS_DIROPQ); ++ break; ++ ++ case Opt_warn_perm: ++ au_opt_set(sbinfo->si_mntflags, WARN_PERM); ++ break; ++ case Opt_nowarn_perm: ++ au_opt_clr(sbinfo->si_mntflags, WARN_PERM); ++ break; ++ ++ case Opt_refrof: ++ au_opt_set(sbinfo->si_mntflags, REFROF); ++ break; ++ case Opt_norefrof: ++ au_opt_clr(sbinfo->si_mntflags, REFROF); ++ break; ++ ++ case Opt_verbose: ++ au_opt_set(sbinfo->si_mntflags, VERBOSE); ++ break; ++ case Opt_noverbose: ++ au_opt_clr(sbinfo->si_mntflags, VERBOSE); ++ break; ++ ++ case Opt_sum: ++ au_opt_set(sbinfo->si_mntflags, SUM); ++ break; ++ case Opt_wsum: ++ au_opt_clr(sbinfo->si_mntflags, SUM); ++ au_opt_set(sbinfo->si_mntflags, SUM_W); ++ case Opt_nosum: ++ au_opt_clr(sbinfo->si_mntflags, SUM); ++ au_opt_clr(sbinfo->si_mntflags, SUM_W); ++ break; ++ 
++ case Opt_wbr_create: ++ err = au_opt_wbr_create(sb, &opt->wbr_create); ++ break; ++ case Opt_wbr_copyup: ++ sbinfo->si_wbr_copyup = opt->wbr_copyup; ++ sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + opt->wbr_copyup; ++ break; ++ ++ case Opt_dirwh: ++ sbinfo->si_dirwh = opt->dirwh; ++ break; ++ ++ case Opt_rdcache: ++ sbinfo->si_rdcache ++ = msecs_to_jiffies(opt->rdcache * MSEC_PER_SEC); ++ break; ++ case Opt_rdblk: ++ sbinfo->si_rdblk = opt->rdblk; ++ break; ++ case Opt_rdblk_def: ++ sbinfo->si_rdblk = AUFS_RDBLK_DEF; ++ break; ++ case Opt_rdhash: ++ sbinfo->si_rdhash = opt->rdhash; ++ break; ++ case Opt_rdhash_def: ++ sbinfo->si_rdhash = AUFS_RDHASH_DEF; ++ break; ++ ++ case Opt_shwh: ++ au_opt_set(sbinfo->si_mntflags, SHWH); ++ break; ++ case Opt_noshwh: ++ au_opt_clr(sbinfo->si_mntflags, SHWH); ++ break; ++ ++ case Opt_trunc_xino: ++ au_opt_set(sbinfo->si_mntflags, TRUNC_XINO); ++ break; ++ case Opt_notrunc_xino: ++ au_opt_clr(sbinfo->si_mntflags, TRUNC_XINO); ++ break; ++ ++ case Opt_trunc_xino_path: ++ case Opt_itrunc_xino: ++ err = au_xino_trunc(sb, opt->xino_itrunc.bindex); ++ if (!err) ++ err = 1; ++ break; ++ ++ case Opt_trunc_xib: ++ au_fset_opts(opts->flags, TRUNC_XIB); ++ break; ++ case Opt_notrunc_xib: ++ au_fclr_opts(opts->flags, TRUNC_XIB); ++ break; ++ ++ default: ++ err = 0; ++ break; ++ } ++ ++ return err; ++} ++ ++/* ++ * returns tri-state. ++ * plus: processed without an error ++ * zero: unprocessed ++ * minus: error ++ */ ++static int au_opt_br(struct super_block *sb, struct au_opt *opt, ++ struct au_opts *opts) ++{ ++ int err, do_refresh; ++ ++ err = 0; ++ switch (opt->type) { ++ case Opt_append: ++ opt->add.bindex = au_sbend(sb) + 1; ++ if (opt->add.bindex < 0) ++ opt->add.bindex = 0; ++ goto add; ++ case Opt_prepend: ++ opt->add.bindex = 0; ++ add: ++ case Opt_add: ++ err = au_br_add(sb, &opt->add, ++ au_ftest_opts(opts->flags, REMOUNT)); ++ if (!err) { ++ err = 1; ++ au_fset_opts(opts->flags, REFRESH); ++ } ++ break; ++ ++ case Opt_del: ++ case Opt_idel: ++ err = au_br_del(sb, &opt->del, ++ au_ftest_opts(opts->flags, REMOUNT)); ++ if (!err) { ++ err = 1; ++ au_fset_opts(opts->flags, TRUNC_XIB); ++ au_fset_opts(opts->flags, REFRESH); ++ } ++ break; ++ ++ case Opt_mod: ++ case Opt_imod: ++ err = au_br_mod(sb, &opt->mod, ++ au_ftest_opts(opts->flags, REMOUNT), ++ &do_refresh); ++ if (!err) { ++ err = 1; ++ if (do_refresh) ++ au_fset_opts(opts->flags, REFRESH); ++ } ++ break; ++ } ++ ++ return err; ++} ++ ++static int au_opt_xino(struct super_block *sb, struct au_opt *opt, ++ struct au_opt_xino **opt_xino, ++ struct au_opts *opts) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct dentry *root, *parent, *h_root; ++ ++ err = 0; ++ switch (opt->type) { ++ case Opt_xino: ++ err = au_xino_set(sb, &opt->xino, ++ !!au_ftest_opts(opts->flags, REMOUNT)); ++ if (unlikely(err)) ++ break; ++ ++ *opt_xino = &opt->xino; ++ au_xino_brid_set(sb, -1); ++ ++ /* safe d_parent access */ ++ parent = opt->xino.file->f_dentry->d_parent; ++ root = sb->s_root; ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ h_root = au_h_dptr(root, bindex); ++ if (h_root == parent) { ++ au_xino_brid_set(sb, au_sbr_id(sb, bindex)); ++ break; ++ } ++ } ++ break; ++ ++ case Opt_noxino: ++ au_xino_clr(sb); ++ au_xino_brid_set(sb, -1); ++ *opt_xino = (void *)-1; ++ break; ++ } ++ ++ return err; ++} ++ ++int au_opts_verify(struct super_block *sb, unsigned long sb_flags, ++ unsigned int pending) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ unsigned char do_plink, skip, 
do_free; ++ struct au_branch *br; ++ struct au_wbr *wbr; ++ struct dentry *root; ++ struct inode *dir, *h_dir; ++ struct au_sbinfo *sbinfo; ++ struct au_hinode *hdir; ++ ++ SiMustAnyLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!(sbinfo->si_mntflags & AuOptMask_UDBA)); ++ ++ if (!(sb_flags & MS_RDONLY)) { ++ if (unlikely(!au_br_writable(au_sbr_perm(sb, 0)))) ++ pr_warning("first branch should be rw\n"); ++ if (unlikely(au_opt_test(sbinfo->si_mntflags, SHWH))) ++ pr_warning("shwh should be used with ro\n"); ++ } ++ ++ if (au_opt_test((sbinfo->si_mntflags | pending), UDBA_HNOTIFY) ++ && !au_opt_test(sbinfo->si_mntflags, XINO)) ++ pr_warning("udba=*notify requires xino\n"); ++ ++ err = 0; ++ root = sb->s_root; ++ dir = root->d_inode; ++ do_plink = !!au_opt_test(sbinfo->si_mntflags, PLINK); ++ bend = au_sbend(sb); ++ for (bindex = 0; !err && bindex <= bend; bindex++) { ++ skip = 0; ++ h_dir = au_h_iptr(dir, bindex); ++ br = au_sbr(sb, bindex); ++ do_free = 0; ++ ++ wbr = br->br_wbr; ++ if (wbr) ++ wbr_wh_read_lock(wbr); ++ ++ if (!au_br_writable(br->br_perm)) { ++ do_free = !!wbr; ++ skip = (!wbr ++ || (!wbr->wbr_whbase ++ && !wbr->wbr_plink ++ && !wbr->wbr_orph)); ++ } else if (!au_br_wh_linkable(br->br_perm)) { ++ /* skip = (!br->br_whbase && !br->br_orph); */ ++ skip = (!wbr || !wbr->wbr_whbase); ++ if (skip && wbr) { ++ if (do_plink) ++ skip = !!wbr->wbr_plink; ++ else ++ skip = !wbr->wbr_plink; ++ } ++ } else { ++ /* skip = (br->br_whbase && br->br_ohph); */ ++ skip = (wbr && wbr->wbr_whbase); ++ if (skip) { ++ if (do_plink) ++ skip = !!wbr->wbr_plink; ++ else ++ skip = !wbr->wbr_plink; ++ } ++ } ++ if (wbr) ++ wbr_wh_read_unlock(wbr); ++ ++ if (skip) ++ continue; ++ ++ hdir = au_hi(dir, bindex); ++ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ if (wbr) ++ wbr_wh_write_lock(wbr); ++ err = au_wh_init(au_h_dptr(root, bindex), br, sb); ++ if (wbr) ++ wbr_wh_write_unlock(wbr); ++ au_hn_imtx_unlock(hdir); ++ ++ if (!err && do_free) { ++ kfree(wbr); ++ br->br_wbr = NULL; ++ } ++ } ++ ++ return err; ++} ++ ++int au_opts_mount(struct super_block *sb, struct au_opts *opts) ++{ ++ int err; ++ unsigned int tmp; ++ aufs_bindex_t bindex, bend; ++ struct au_opt *opt; ++ struct au_opt_xino *opt_xino, xino; ++ struct au_sbinfo *sbinfo; ++ struct au_branch *br; ++ ++ SiMustWriteLock(sb); ++ ++ err = 0; ++ opt_xino = NULL; ++ opt = opts->opt; ++ while (err >= 0 && opt->type != Opt_tail) ++ err = au_opt_simple(sb, opt++, opts); ++ if (err > 0) ++ err = 0; ++ else if (unlikely(err < 0)) ++ goto out; ++ ++ /* disable xino and udba temporary */ ++ sbinfo = au_sbi(sb); ++ tmp = sbinfo->si_mntflags; ++ au_opt_clr(sbinfo->si_mntflags, XINO); ++ au_opt_set_udba(sbinfo->si_mntflags, UDBA_REVAL); ++ ++ opt = opts->opt; ++ while (err >= 0 && opt->type != Opt_tail) ++ err = au_opt_br(sb, opt++, opts); ++ if (err > 0) ++ err = 0; ++ else if (unlikely(err < 0)) ++ goto out; ++ ++ bend = au_sbend(sb); ++ if (unlikely(bend < 0)) { ++ err = -EINVAL; ++ pr_err("no branches\n"); ++ goto out; ++ } ++ ++ if (au_opt_test(tmp, XINO)) ++ au_opt_set(sbinfo->si_mntflags, XINO); ++ opt = opts->opt; ++ while (!err && opt->type != Opt_tail) ++ err = au_opt_xino(sb, opt++, &opt_xino, opts); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_opts_verify(sb, sb->s_flags, tmp); ++ if (unlikely(err)) ++ goto out; ++ ++ /* restore xino */ ++ if (au_opt_test(tmp, XINO) && !opt_xino) { ++ xino.file = au_xino_def(sb); ++ err = PTR_ERR(xino.file); ++ if (IS_ERR(xino.file)) ++ goto out; ++ ++ err = au_xino_set(sb, &xino, /*remount*/0); ++ 
fput(xino.file); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ /* restore udba */ ++ tmp &= AuOptMask_UDBA; ++ sbinfo->si_mntflags &= ~AuOptMask_UDBA; ++ sbinfo->si_mntflags |= tmp; ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ err = au_hnotify_reset_br(tmp, br, br->br_perm); ++ if (unlikely(err)) ++ AuIOErr("hnotify failed on br %d, %d, ignored\n", ++ bindex, err); ++ /* go on even if err */ ++ } ++ if (au_opt_test(tmp, UDBA_HNOTIFY)) { ++ struct inode *dir = sb->s_root->d_inode; ++ au_hn_reset(dir, au_hi_flags(dir, /*isdir*/1) & ~AuHi_XINO); ++ } ++ ++out: ++ return err; ++} ++ ++int au_opts_remount(struct super_block *sb, struct au_opts *opts) ++{ ++ int err, rerr; ++ struct inode *dir; ++ struct au_opt_xino *opt_xino; ++ struct au_opt *opt; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ dir = sb->s_root->d_inode; ++ sbinfo = au_sbi(sb); ++ err = 0; ++ opt_xino = NULL; ++ opt = opts->opt; ++ while (err >= 0 && opt->type != Opt_tail) { ++ err = au_opt_simple(sb, opt, opts); ++ if (!err) ++ err = au_opt_br(sb, opt, opts); ++ if (!err) ++ err = au_opt_xino(sb, opt, &opt_xino, opts); ++ opt++; ++ } ++ if (err > 0) ++ err = 0; ++ AuTraceErr(err); ++ /* go on even err */ ++ ++ rerr = au_opts_verify(sb, opts->sb_flags, /*pending*/0); ++ if (unlikely(rerr && !err)) ++ err = rerr; ++ ++ if (au_ftest_opts(opts->flags, TRUNC_XIB)) { ++ rerr = au_xib_trunc(sb); ++ if (unlikely(rerr && !err)) ++ err = rerr; ++ } ++ ++ /* will be handled by the caller */ ++ if (!au_ftest_opts(opts->flags, REFRESH) ++ && (opts->given_udba || au_opt_test(sbinfo->si_mntflags, XINO))) ++ au_fset_opts(opts->flags, REFRESH); ++ ++ AuDbg("status 0x%x\n", opts->flags); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++unsigned int au_opt_udba(struct super_block *sb) ++{ ++ return au_mntflags(sb) & AuOptMask_UDBA; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/opts.h linux-3.2.0-gentoo-r1/fs/aufs/opts.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/opts.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/opts.h 2012-01-17 12:11:24.868175939 +0100 +@@ -0,0 +1,209 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * mount options/flags ++ */ ++ ++#ifndef __AUFS_OPTS_H__ ++#define __AUFS_OPTS_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++struct file; ++struct super_block; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* mount flags */ ++#define AuOpt_XINO 1 /* external inode number bitmap ++ and translation table */ ++#define AuOpt_TRUNC_XINO (1 << 1) /* truncate xino files */ ++#define AuOpt_UDBA_NONE (1 << 2) /* users direct branch access */ ++#define AuOpt_UDBA_REVAL (1 << 3) ++#define AuOpt_UDBA_HNOTIFY (1 << 4) ++#define AuOpt_SHWH (1 << 5) /* show whiteout */ ++#define AuOpt_PLINK (1 << 6) /* pseudo-link */ ++#define AuOpt_DIRPERM1 (1 << 7) /* unimplemented */ ++#define AuOpt_REFROF (1 << 8) /* unimplemented */ ++#define AuOpt_ALWAYS_DIROPQ (1 << 9) /* policy to creating diropq */ ++#define AuOpt_SUM (1 << 10) /* summation for statfs(2) */ ++#define AuOpt_SUM_W (1 << 11) /* unimplemented */ ++#define AuOpt_WARN_PERM (1 << 12) /* warn when add-branch */ ++#define AuOpt_VERBOSE (1 << 13) /* busy inode when del-branch */ ++#define AuOpt_DIO (1 << 14) /* direct io */ ++ ++#ifndef CONFIG_AUFS_HNOTIFY ++#undef AuOpt_UDBA_HNOTIFY ++#define AuOpt_UDBA_HNOTIFY 0 ++#endif ++#ifndef CONFIG_AUFS_SHWH ++#undef AuOpt_SHWH ++#define AuOpt_SHWH 0 ++#endif ++ ++#define AuOpt_Def (AuOpt_XINO \ ++ | AuOpt_UDBA_REVAL \ ++ | AuOpt_PLINK \ ++ /* | AuOpt_DIRPERM1 */ \ ++ | AuOpt_WARN_PERM) ++#define AuOptMask_UDBA (AuOpt_UDBA_NONE \ ++ | AuOpt_UDBA_REVAL \ ++ | AuOpt_UDBA_HNOTIFY) ++ ++#define au_opt_test(flags, name) (flags & AuOpt_##name) ++#define au_opt_set(flags, name) do { \ ++ BUILD_BUG_ON(AuOpt_##name & AuOptMask_UDBA); \ ++ ((flags) |= AuOpt_##name); \ ++} while (0) ++#define au_opt_set_udba(flags, name) do { \ ++ (flags) &= ~AuOptMask_UDBA; \ ++ ((flags) |= AuOpt_##name); \ ++} while (0) ++#define au_opt_clr(flags, name) do { \ ++ ((flags) &= ~AuOpt_##name); \ ++} while (0) ++ ++static inline unsigned int au_opts_plink(unsigned int mntflags) ++{ ++#ifdef CONFIG_PROC_FS ++ return mntflags; ++#else ++ return mntflags & ~AuOpt_PLINK; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policies to select one among multiple writable branches */ ++enum { ++ AuWbrCreate_TDP, /* top down parent */ ++ AuWbrCreate_RR, /* round robin */ ++ AuWbrCreate_MFS, /* most free space */ ++ AuWbrCreate_MFSV, /* mfs with seconds */ ++ AuWbrCreate_MFSRR, /* mfs then rr */ ++ AuWbrCreate_MFSRRV, /* mfs then rr with seconds */ ++ AuWbrCreate_PMFS, /* parent and mfs */ ++ AuWbrCreate_PMFSV, /* parent and mfs with seconds */ ++ ++ AuWbrCreate_Def = AuWbrCreate_TDP ++}; ++ ++enum { ++ AuWbrCopyup_TDP, /* top down parent */ ++ AuWbrCopyup_BUP, /* bottom up parent */ ++ AuWbrCopyup_BU, /* bottom up */ ++ ++ AuWbrCopyup_Def = AuWbrCopyup_TDP ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_opt_add { ++ aufs_bindex_t bindex; ++ char *pathname; ++ int perm; ++ struct path path; ++}; ++ ++struct au_opt_del { ++ char *pathname; ++ struct path h_path; ++}; ++ ++struct au_opt_mod { ++ char *path; ++ int perm; ++ struct dentry *h_root; ++}; ++ ++struct au_opt_xino { ++ char *path; ++ struct file *file; ++}; ++ ++struct au_opt_xino_itrunc { ++ aufs_bindex_t bindex; ++}; 
++ ++struct au_opt_wbr_create { ++ int wbr_create; ++ int mfs_second; ++ unsigned long long mfsrr_watermark; ++}; ++ ++struct au_opt { ++ int type; ++ union { ++ struct au_opt_xino xino; ++ struct au_opt_xino_itrunc xino_itrunc; ++ struct au_opt_add add; ++ struct au_opt_del del; ++ struct au_opt_mod mod; ++ int dirwh; ++ int rdcache; ++ unsigned int rdblk; ++ unsigned int rdhash; ++ int udba; ++ struct au_opt_wbr_create wbr_create; ++ int wbr_copyup; ++ }; ++}; ++ ++/* opts flags */ ++#define AuOpts_REMOUNT 1 ++#define AuOpts_REFRESH (1 << 1) ++#define AuOpts_TRUNC_XIB (1 << 2) ++#define AuOpts_REFRESH_DYAOP (1 << 3) ++#define au_ftest_opts(flags, name) ((flags) & AuOpts_##name) ++#define au_fset_opts(flags, name) \ ++ do { (flags) |= AuOpts_##name; } while (0) ++#define au_fclr_opts(flags, name) \ ++ do { (flags) &= ~AuOpts_##name; } while (0) ++ ++struct au_opts { ++ struct au_opt *opt; ++ int max_opt; ++ ++ unsigned int given_udba; ++ unsigned int flags; ++ unsigned long sb_flags; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++char *au_optstr_br_perm(int brperm); ++const char *au_optstr_udba(int udba); ++const char *au_optstr_wbr_copyup(int wbr_copyup); ++const char *au_optstr_wbr_create(int wbr_create); ++ ++void au_opts_free(struct au_opts *opts); ++int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts); ++int au_opts_verify(struct super_block *sb, unsigned long sb_flags, ++ unsigned int pending); ++int au_opts_mount(struct super_block *sb, struct au_opts *opts); ++int au_opts_remount(struct super_block *sb, struct au_opts *opts); ++ ++unsigned int au_opt_udba(struct super_block *sb); ++ ++/* ---------------------------------------------------------------------- */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_OPTS_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/plink.c linux-3.2.0-gentoo-r1/fs/aufs/plink.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/plink.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/plink.c 2012-01-17 12:11:24.870490775 +0100 +@@ -0,0 +1,515 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * pseudo-link ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * the pseudo-link maintenance mode. ++ * during a user process maintains the pseudo-links, ++ * prohibit adding a new plink and branch manipulation. ++ * ++ * Flags ++ * NOPLM: ++ * For entry functions which will handle plink, and i_mutex is already held ++ * in VFS. ++ * They cannot wait and should return an error at once. ++ * Callers has to check the error. ++ * NOPLMW: ++ * For entry functions which will handle plink, but i_mutex is not held ++ * in VFS. ++ * They can wait the plink maintenance mode to finish. ++ * ++ * They behave like F_SETLK and F_SETLKW. 
++ * If the caller never handle plink, then both flags are unnecessary. ++ */ ++ ++int au_plink_maint(struct super_block *sb, int flags) ++{ ++ int err; ++ pid_t pid, ppid; ++ struct au_sbinfo *sbi; ++ ++ SiMustAnyLock(sb); ++ ++ err = 0; ++ if (!au_opt_test(au_mntflags(sb), PLINK)) ++ goto out; ++ ++ sbi = au_sbi(sb); ++ pid = sbi->si_plink_maint_pid; ++ if (!pid || pid == current->pid) ++ goto out; ++ ++ /* todo: it highly depends upon /sbin/mount.aufs */ ++ rcu_read_lock(); ++ ppid = task_pid_vnr(rcu_dereference(current->real_parent)); ++ rcu_read_unlock(); ++ if (pid == ppid) ++ goto out; ++ ++ if (au_ftest_lock(flags, NOPLMW)) { ++ /* if there is no i_mutex lock in VFS, we don't need to wait */ ++ /* AuDebugOn(!lockdep_depth(current)); */ ++ while (sbi->si_plink_maint_pid) { ++ si_read_unlock(sb); ++ /* gave up wake_up_bit() */ ++ wait_event(sbi->si_plink_wq, !sbi->si_plink_maint_pid); ++ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&sbi->si_nowait); ++ si_noflush_read_lock(sb); ++ } ++ } else if (au_ftest_lock(flags, NOPLM)) { ++ AuDbg("ppid %d, pid %d\n", ppid, pid); ++ err = -EAGAIN; ++ } ++ ++out: ++ return err; ++} ++ ++void au_plink_maint_leave(struct au_sbinfo *sbinfo) ++{ ++ spin_lock(&sbinfo->si_plink_maint_lock); ++ sbinfo->si_plink_maint_pid = 0; ++ spin_unlock(&sbinfo->si_plink_maint_lock); ++ wake_up_all(&sbinfo->si_plink_wq); ++} ++ ++int au_plink_maint_enter(struct super_block *sb) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ /* make sure i am the only one in this fs */ ++ si_write_lock(sb, AuLock_FLUSH); ++ if (au_opt_test(au_mntflags(sb), PLINK)) { ++ spin_lock(&sbinfo->si_plink_maint_lock); ++ if (!sbinfo->si_plink_maint_pid) ++ sbinfo->si_plink_maint_pid = current->pid; ++ else ++ err = -EBUSY; ++ spin_unlock(&sbinfo->si_plink_maint_lock); ++ } ++ si_write_unlock(sb); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct pseudo_link { ++ union { ++ struct list_head list; ++ struct rcu_head rcu; ++ }; ++ struct inode *inode; ++}; ++ ++#ifdef CONFIG_AUFS_DEBUG ++void au_plink_list(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink; ++ ++ SiMustAnyLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); ++ ++ plink_list = &sbinfo->si_plink.head; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(plink, plink_list, list) ++ AuDbg("%lu\n", plink->inode->i_ino); ++ rcu_read_unlock(); ++} ++#endif ++ ++/* is the inode pseudo-linked? */ ++int au_plink_test(struct inode *inode) ++{ ++ int found; ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink; ++ ++ sbinfo = au_sbi(inode->i_sb); ++ AuRwMustAnyLock(&sbinfo->si_rwsem); ++ AuDebugOn(!au_opt_test(au_mntflags(inode->i_sb), PLINK)); ++ AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM)); ++ ++ found = 0; ++ plink_list = &sbinfo->si_plink.head; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(plink, plink_list, list) ++ if (plink->inode == inode) { ++ found = 1; ++ break; ++ } ++ rcu_read_unlock(); ++ return found; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * generate a name for plink. ++ * the file will be stored under AUFS_WH_PLINKDIR. 
++ */ ++/* 20 is max digits length of ulong 64 */ ++#define PLINK_NAME_LEN ((20 + 1) * 2) ++ ++static int plink_name(char *name, int len, struct inode *inode, ++ aufs_bindex_t bindex) ++{ ++ int rlen; ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, bindex); ++ rlen = snprintf(name, len, "%lu.%lu", inode->i_ino, h_inode->i_ino); ++ return rlen; ++} ++ ++struct au_do_plink_lkup_args { ++ struct dentry **errp; ++ struct qstr *tgtname; ++ struct dentry *h_parent; ++ struct au_branch *br; ++}; ++ ++static struct dentry *au_do_plink_lkup(struct qstr *tgtname, ++ struct dentry *h_parent, ++ struct au_branch *br) ++{ ++ struct dentry *h_dentry; ++ struct mutex *h_mtx; ++ ++ h_mtx = &h_parent->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD2); ++ h_dentry = au_lkup_one(tgtname, h_parent, br, /*nd*/NULL); ++ mutex_unlock(h_mtx); ++ return h_dentry; ++} ++ ++static void au_call_do_plink_lkup(void *args) ++{ ++ struct au_do_plink_lkup_args *a = args; ++ *a->errp = au_do_plink_lkup(a->tgtname, a->h_parent, a->br); ++} ++ ++/* lookup the plink-ed @inode under the branch at @bindex */ ++struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex) ++{ ++ struct dentry *h_dentry, *h_parent; ++ struct au_branch *br; ++ struct inode *h_dir; ++ int wkq_err; ++ char a[PLINK_NAME_LEN]; ++ struct qstr tgtname = { ++ .name = a ++ }; ++ ++ AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM)); ++ ++ br = au_sbr(inode->i_sb, bindex); ++ h_parent = br->br_wbr->wbr_plink; ++ h_dir = h_parent->d_inode; ++ tgtname.len = plink_name(a, sizeof(a), inode, bindex); ++ ++ if (current_fsuid()) { ++ struct au_do_plink_lkup_args args = { ++ .errp = &h_dentry, ++ .tgtname = &tgtname, ++ .h_parent = h_parent, ++ .br = br ++ }; ++ ++ wkq_err = au_wkq_wait(au_call_do_plink_lkup, &args); ++ if (unlikely(wkq_err)) ++ h_dentry = ERR_PTR(wkq_err); ++ } else ++ h_dentry = au_do_plink_lkup(&tgtname, h_parent, br); ++ ++ return h_dentry; ++} ++ ++/* create a pseudo-link */ ++static int do_whplink(struct qstr *tgt, struct dentry *h_parent, ++ struct dentry *h_dentry, struct au_branch *br) ++{ ++ int err; ++ struct path h_path = { ++ .mnt = br->br_mnt ++ }; ++ struct inode *h_dir; ++ ++ h_dir = h_parent->d_inode; ++ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_CHILD2); ++again: ++ h_path.dentry = au_lkup_one(tgt, h_parent, br, /*nd*/NULL); ++ err = PTR_ERR(h_path.dentry); ++ if (IS_ERR(h_path.dentry)) ++ goto out; ++ ++ err = 0; ++ /* wh.plink dir is not monitored */ ++ /* todo: is it really safe? 
*/ ++ if (h_path.dentry->d_inode ++ && h_path.dentry->d_inode != h_dentry->d_inode) { ++ err = vfsub_unlink(h_dir, &h_path, /*force*/0); ++ dput(h_path.dentry); ++ h_path.dentry = NULL; ++ if (!err) ++ goto again; ++ } ++ if (!err && !h_path.dentry->d_inode) ++ err = vfsub_link(h_dentry, h_dir, &h_path); ++ dput(h_path.dentry); ++ ++out: ++ mutex_unlock(&h_dir->i_mutex); ++ return err; ++} ++ ++struct do_whplink_args { ++ int *errp; ++ struct qstr *tgt; ++ struct dentry *h_parent; ++ struct dentry *h_dentry; ++ struct au_branch *br; ++}; ++ ++static void call_do_whplink(void *args) ++{ ++ struct do_whplink_args *a = args; ++ *a->errp = do_whplink(a->tgt, a->h_parent, a->h_dentry, a->br); ++} ++ ++static int whplink(struct dentry *h_dentry, struct inode *inode, ++ aufs_bindex_t bindex, struct au_branch *br) ++{ ++ int err, wkq_err; ++ struct au_wbr *wbr; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ char a[PLINK_NAME_LEN]; ++ struct qstr tgtname = { ++ .name = a ++ }; ++ ++ wbr = au_sbr(inode->i_sb, bindex)->br_wbr; ++ h_parent = wbr->wbr_plink; ++ h_dir = h_parent->d_inode; ++ tgtname.len = plink_name(a, sizeof(a), inode, bindex); ++ ++ /* always superio. */ ++ if (current_fsuid()) { ++ struct do_whplink_args args = { ++ .errp = &err, ++ .tgt = &tgtname, ++ .h_parent = h_parent, ++ .h_dentry = h_dentry, ++ .br = br ++ }; ++ wkq_err = au_wkq_wait(call_do_whplink, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } else ++ err = do_whplink(&tgtname, h_parent, h_dentry, br); ++ ++ return err; ++} ++ ++/* free a single plink */ ++static void do_put_plink(struct pseudo_link *plink, int do_del) ++{ ++ if (do_del) ++ list_del(&plink->list); ++ iput(plink->inode); ++ kfree(plink); ++} ++ ++static void do_put_plink_rcu(struct rcu_head *rcu) ++{ ++ struct pseudo_link *plink; ++ ++ plink = container_of(rcu, struct pseudo_link, rcu); ++ iput(plink->inode); ++ kfree(plink); ++} ++ ++/* ++ * create a new pseudo-link for @h_dentry on @bindex. ++ * the linked inode is held in aufs @inode. 
++ */ ++void au_plink_append(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_dentry) ++{ ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink, *tmp; ++ int found, err, cnt; ++ ++ sb = inode->i_sb; ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); ++ ++ cnt = 0; ++ found = 0; ++ plink_list = &sbinfo->si_plink.head; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(plink, plink_list, list) { ++ cnt++; ++ if (plink->inode == inode) { ++ found = 1; ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ if (found) ++ return; ++ ++ tmp = kmalloc(sizeof(*plink), GFP_NOFS); ++ if (tmp) ++ tmp->inode = au_igrab(inode); ++ else { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ spin_lock(&sbinfo->si_plink.spin); ++ list_for_each_entry(plink, plink_list, list) { ++ if (plink->inode == inode) { ++ found = 1; ++ break; ++ } ++ } ++ if (!found) ++ list_add_rcu(&tmp->list, plink_list); ++ spin_unlock(&sbinfo->si_plink.spin); ++ if (!found) { ++ cnt++; ++ WARN_ONCE(cnt > AUFS_PLINK_WARN, ++ "unexpectedly many pseudo links, %d\n", cnt); ++ err = whplink(h_dentry, inode, bindex, au_sbr(sb, bindex)); ++ } else { ++ do_put_plink(tmp, 0); ++ return; ++ } ++ ++out: ++ if (unlikely(err)) { ++ pr_warning("err %d, damaged pseudo link.\n", err); ++ if (tmp) { ++ au_spl_del_rcu(&tmp->list, &sbinfo->si_plink); ++ call_rcu(&tmp->rcu, do_put_plink_rcu); ++ } ++ } ++} ++ ++/* free all plinks */ ++void au_plink_put(struct super_block *sb, int verbose) ++{ ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink, *tmp; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); ++ ++ plink_list = &sbinfo->si_plink.head; ++ /* no spin_lock since sbinfo is write-locked */ ++ WARN(verbose && !list_empty(plink_list), "pseudo-link is not flushed"); ++ list_for_each_entry_safe(plink, tmp, plink_list, list) ++ do_put_plink(plink, 0); ++ INIT_LIST_HEAD(plink_list); ++} ++ ++void au_plink_clean(struct super_block *sb, int verbose) ++{ ++ struct dentry *root; ++ ++ root = sb->s_root; ++ aufs_write_lock(root); ++ if (au_opt_test(au_mntflags(sb), PLINK)) ++ au_plink_put(sb, verbose); ++ aufs_write_unlock(root); ++} ++ ++/* free the plinks on a branch specified by @br_id */ ++void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id) ++{ ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink, *tmp; ++ struct inode *inode; ++ aufs_bindex_t bstart, bend, bindex; ++ unsigned char do_put; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); ++ ++ plink_list = &sbinfo->si_plink.head; ++ /* no spin_lock since sbinfo is write-locked */ ++ list_for_each_entry_safe(plink, tmp, plink_list, list) { ++ do_put = 0; ++ inode = au_igrab(plink->inode); ++ ii_write_lock_child(inode); ++ bstart = au_ibstart(inode); ++ bend = au_ibend(inode); ++ if (bstart >= 0) { ++ for (bindex = bstart; bindex <= bend; bindex++) { ++ if (!au_h_iptr(inode, bindex) ++ || au_ii_br_id(inode, bindex) != br_id) ++ continue; ++ au_set_h_iptr(inode, bindex, NULL, 0); ++ do_put = 1; ++ break; ++ } ++ } else ++ do_put_plink(plink, 1); ++ ++ if (do_put) { ++ for (bindex = bstart; bindex <= bend; bindex++) ++ if (au_h_iptr(inode, bindex)) { ++ do_put = 0; ++ break; ++ } ++ if 
(do_put) ++ do_put_plink(plink, 1); ++ } ++ ii_write_unlock(inode); ++ iput(inode); ++ } ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/poll.c linux-3.2.0-gentoo-r1/fs/aufs/poll.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/poll.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/poll.c 2012-01-17 12:11:24.870490775 +0100 +@@ -0,0 +1,56 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * poll operation ++ * There is only one filesystem which implements ->poll operation, currently. ++ */ ++ ++#include "aufs.h" ++ ++unsigned int aufs_poll(struct file *file, poll_table *wait) ++{ ++ unsigned int mask; ++ int err; ++ struct file *h_file; ++ struct dentry *dentry; ++ struct super_block *sb; ++ ++ /* We should pretend an error happened. */ ++ mask = POLLERR /* | POLLIN | POLLOUT */; ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ /* it is not an error if h_file has no operation */ ++ mask = DEFAULT_POLLMASK; ++ h_file = au_hf_top(file); ++ if (h_file->f_op && h_file->f_op->poll) ++ mask = h_file->f_op->poll(h_file, wait); ++ ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ ++out: ++ si_read_unlock(sb); ++ AuTraceErr((int)mask); ++ return mask; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/procfs.c linux-3.2.0-gentoo-r1/fs/aufs/procfs.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/procfs.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/procfs.c 2012-01-17 12:11:24.893639131 +0100 +@@ -0,0 +1,170 @@ ++/* ++ * Copyright (C) 2010-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * procfs interfaces ++ */ ++ ++#include ++#include "aufs.h" ++ ++static int au_procfs_plm_release(struct inode *inode, struct file *file) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ sbinfo = file->private_data; ++ if (sbinfo) { ++ au_plink_maint_leave(sbinfo); ++ kobject_put(&sbinfo->si_kobj); ++ } ++ ++ return 0; ++} ++ ++static void au_procfs_plm_write_clean(struct file *file) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ sbinfo = file->private_data; ++ if (sbinfo) ++ au_plink_clean(sbinfo->si_sb, /*verbose*/0); ++} ++ ++static int au_procfs_plm_write_si(struct file *file, unsigned long id) ++{ ++ int err; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++ err = -EBUSY; ++ if (unlikely(file->private_data)) ++ goto out; ++ ++ sb = NULL; ++ /* don't use au_sbilist_lock() here */ ++ spin_lock(&au_sbilist.spin); ++ list_for_each_entry(sbinfo, &au_sbilist.head, si_list) ++ if (id == sysaufs_si_id(sbinfo)) { ++ kobject_get(&sbinfo->si_kobj); ++ sb = sbinfo->si_sb; ++ break; ++ } ++ spin_unlock(&au_sbilist.spin); ++ ++ err = -EINVAL; ++ if (unlikely(!sb)) ++ goto out; ++ ++ err = au_plink_maint_enter(sb); ++ if (!err) ++ /* keep kobject_get() */ ++ file->private_data = sbinfo; ++ else ++ kobject_put(&sbinfo->si_kobj); ++out: ++ return err; ++} ++ ++/* ++ * Accept a valid "si=xxxx" only. ++ * Once it is accepted successfully, accept "clean" too. ++ */ ++static ssize_t au_procfs_plm_write(struct file *file, const char __user *ubuf, ++ size_t count, loff_t *ppos) ++{ ++ ssize_t err; ++ unsigned long id; ++ /* last newline is allowed */ ++ char buf[3 + sizeof(unsigned long) * 2 + 1]; ++ ++ err = -EACCES; ++ if (unlikely(!capable(CAP_SYS_ADMIN))) ++ goto out; ++ ++ err = -EINVAL; ++ if (unlikely(count > sizeof(buf))) ++ goto out; ++ ++ err = copy_from_user(buf, ubuf, count); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ goto out; ++ } ++ buf[count] = 0; ++ ++ err = -EINVAL; ++ if (!strcmp("clean", buf)) { ++ au_procfs_plm_write_clean(file); ++ goto out_success; ++ } else if (unlikely(strncmp("si=", buf, 3))) ++ goto out; ++ ++ err = kstrtoul(buf + 3, 16, &id); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_procfs_plm_write_si(file, id); ++ if (unlikely(err)) ++ goto out; ++ ++out_success: ++ err = count; /* success */ ++out: ++ return err; ++} ++ ++static const struct file_operations au_procfs_plm_fop = { ++ .write = au_procfs_plm_write, ++ .release = au_procfs_plm_release, ++ .owner = THIS_MODULE ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct proc_dir_entry *au_procfs_dir; ++ ++void au_procfs_fin(void) ++{ ++ remove_proc_entry(AUFS_PLINK_MAINT_NAME, au_procfs_dir); ++ remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL); ++} ++ ++int __init au_procfs_init(void) ++{ ++ int err; ++ struct proc_dir_entry *entry; ++ ++ err = -ENOMEM; ++ au_procfs_dir = proc_mkdir(AUFS_PLINK_MAINT_DIR, NULL); ++ if (unlikely(!au_procfs_dir)) ++ goto out; ++ ++ entry = proc_create(AUFS_PLINK_MAINT_NAME, S_IFREG | S_IWUSR, ++ au_procfs_dir, &au_procfs_plm_fop); ++ if (unlikely(!entry)) ++ goto out_dir; ++ ++ err = 0; ++ goto out; /* success */ ++ ++ ++out_dir: ++ remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL); ++out: ++ return err; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/rdu.c linux-3.2.0-gentoo-r1/fs/aufs/rdu.c +--- 
linux-3.2.0-gentoo-r1.orig//fs/aufs/rdu.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/rdu.c 2012-01-17 12:11:24.893639131 +0100 +@@ -0,0 +1,383 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * readdir in userspace. ++ */ ++ ++#include ++#include ++#include ++#include "aufs.h" ++ ++/* bits for struct aufs_rdu.flags */ ++#define AuRdu_CALLED 1 ++#define AuRdu_CONT (1 << 1) ++#define AuRdu_FULL (1 << 2) ++#define au_ftest_rdu(flags, name) ((flags) & AuRdu_##name) ++#define au_fset_rdu(flags, name) \ ++ do { (flags) |= AuRdu_##name; } while (0) ++#define au_fclr_rdu(flags, name) \ ++ do { (flags) &= ~AuRdu_##name; } while (0) ++ ++struct au_rdu_arg { ++ struct aufs_rdu *rdu; ++ union au_rdu_ent_ul ent; ++ unsigned long end; ++ ++ struct super_block *sb; ++ int err; ++}; ++ ++static int au_rdu_fill(void *__arg, const char *name, int nlen, ++ loff_t offset, u64 h_ino, unsigned int d_type) ++{ ++ int err, len; ++ struct au_rdu_arg *arg = __arg; ++ struct aufs_rdu *rdu = arg->rdu; ++ struct au_rdu_ent ent; ++ ++ err = 0; ++ arg->err = 0; ++ au_fset_rdu(rdu->cookie.flags, CALLED); ++ len = au_rdu_len(nlen); ++ if (arg->ent.ul + len < arg->end) { ++ ent.ino = h_ino; ++ ent.bindex = rdu->cookie.bindex; ++ ent.type = d_type; ++ ent.nlen = nlen; ++ if (unlikely(nlen > AUFS_MAX_NAMELEN)) ++ ent.type = DT_UNKNOWN; ++ ++ /* unnecessary to support mmap_sem since this is a dir */ ++ err = -EFAULT; ++ if (copy_to_user(arg->ent.e, &ent, sizeof(ent))) ++ goto out; ++ if (copy_to_user(arg->ent.e->name, name, nlen)) ++ goto out; ++ /* the terminating NULL */ ++ if (__put_user(0, arg->ent.e->name + nlen)) ++ goto out; ++ err = 0; ++ /* AuDbg("%p, %.*s\n", arg->ent.p, nlen, name); */ ++ arg->ent.ul += len; ++ rdu->rent++; ++ } else { ++ err = -EFAULT; ++ au_fset_rdu(rdu->cookie.flags, FULL); ++ rdu->full = 1; ++ rdu->tail = arg->ent; ++ } ++ ++out: ++ /* AuTraceErr(err); */ ++ return err; ++} ++ ++static int au_rdu_do(struct file *h_file, struct au_rdu_arg *arg) ++{ ++ int err; ++ loff_t offset; ++ struct au_rdu_cookie *cookie = &arg->rdu->cookie; ++ ++ offset = vfsub_llseek(h_file, cookie->h_pos, SEEK_SET); ++ err = offset; ++ if (unlikely(offset != cookie->h_pos)) ++ goto out; ++ ++ err = 0; ++ do { ++ arg->err = 0; ++ au_fclr_rdu(cookie->flags, CALLED); ++ /* smp_mb(); */ ++ err = vfsub_readdir(h_file, au_rdu_fill, arg); ++ if (err >= 0) ++ err = arg->err; ++ } while (!err ++ && au_ftest_rdu(cookie->flags, CALLED) ++ && !au_ftest_rdu(cookie->flags, FULL)); ++ cookie->h_pos = h_file->f_pos; ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_rdu(struct file *file, struct aufs_rdu *rdu) ++{ ++ int err; ++ aufs_bindex_t bend; ++ struct au_rdu_arg arg; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct 
file *h_file; ++ struct au_rdu_cookie *cookie = &rdu->cookie; ++ ++ err = !access_ok(VERIFY_WRITE, rdu->ent.e, rdu->sz); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ rdu->rent = 0; ++ rdu->tail = rdu->ent; ++ rdu->full = 0; ++ arg.rdu = rdu; ++ arg.ent = rdu->ent; ++ arg.end = arg.ent.ul; ++ arg.end += rdu->sz; ++ ++ err = -ENOTDIR; ++ if (unlikely(!file->f_op || !file->f_op->readdir)) ++ goto out; ++ ++ err = security_file_permission(file, MAY_READ); ++ AuTraceErr(err); ++ if (unlikely(err)) ++ goto out; ++ ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++#if 1 ++ mutex_lock(&inode->i_mutex); ++#else ++ err = mutex_lock_killable(&inode->i_mutex); ++ AuTraceErr(err); ++ if (unlikely(err)) ++ goto out; ++#endif ++ ++ arg.sb = inode->i_sb; ++ err = si_read_lock(arg.sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out_mtx; ++ err = au_alive_dir(dentry); ++ if (unlikely(err)) ++ goto out_si; ++ /* todo: reval? */ ++ fi_read_lock(file); ++ ++ err = -EAGAIN; ++ if (unlikely(au_ftest_rdu(cookie->flags, CONT) ++ && cookie->generation != au_figen(file))) ++ goto out_unlock; ++ ++ err = 0; ++ if (!rdu->blk) { ++ rdu->blk = au_sbi(arg.sb)->si_rdblk; ++ if (!rdu->blk) ++ rdu->blk = au_dir_size(file, /*dentry*/NULL); ++ } ++ bend = au_fbstart(file); ++ if (cookie->bindex < bend) ++ cookie->bindex = bend; ++ bend = au_fbend_dir(file); ++ /* AuDbg("b%d, b%d\n", cookie->bindex, bend); */ ++ for (; !err && cookie->bindex <= bend; ++ cookie->bindex++, cookie->h_pos = 0) { ++ h_file = au_hf_dir(file, cookie->bindex); ++ if (!h_file) ++ continue; ++ ++ au_fclr_rdu(cookie->flags, FULL); ++ err = au_rdu_do(h_file, &arg); ++ AuTraceErr(err); ++ if (unlikely(au_ftest_rdu(cookie->flags, FULL) || err)) ++ break; ++ } ++ AuDbg("rent %llu\n", rdu->rent); ++ ++ if (!err && !au_ftest_rdu(cookie->flags, CONT)) { ++ rdu->shwh = !!au_opt_test(au_sbi(arg.sb)->si_mntflags, SHWH); ++ au_fset_rdu(cookie->flags, CONT); ++ cookie->generation = au_figen(file); ++ } ++ ++ ii_read_lock_child(inode); ++ fsstack_copy_attr_atime(inode, au_h_iptr(inode, au_ibstart(inode))); ++ ii_read_unlock(inode); ++ ++out_unlock: ++ fi_read_unlock(file); ++out_si: ++ si_read_unlock(arg.sb); ++out_mtx: ++ mutex_unlock(&inode->i_mutex); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_rdu_ino(struct file *file, struct aufs_rdu *rdu) ++{ ++ int err; ++ ino_t ino; ++ unsigned long long nent; ++ union au_rdu_ent_ul *u; ++ struct au_rdu_ent ent; ++ struct super_block *sb; ++ ++ err = 0; ++ nent = rdu->nent; ++ u = &rdu->ent; ++ sb = file->f_dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ while (nent-- > 0) { ++ /* unnecessary to support mmap_sem since this is a dir */ ++ err = copy_from_user(&ent, u->e, sizeof(ent)); ++ if (!err) ++ err = !access_ok(VERIFY_WRITE, &u->e->ino, sizeof(ino)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ break; ++ } ++ ++ /* AuDbg("b%d, i%llu\n", ent.bindex, ent.ino); */ ++ if (!ent.wh) ++ err = au_ino(sb, ent.bindex, ent.ino, ent.type, &ino); ++ else ++ err = au_wh_ino(sb, ent.bindex, ent.ino, ent.type, ++ &ino); ++ if (unlikely(err)) { ++ AuTraceErr(err); ++ break; ++ } ++ ++ err = __put_user(ino, &u->e->ino); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ break; ++ } ++ u->ul += au_rdu_len(ent.nlen); ++ } ++ si_read_unlock(sb); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_rdu_verify(struct aufs_rdu *rdu) ++{ ++ AuDbg("rdu{%llu, 
%p, %u | %u | %llu, %u, %u | " ++ "%llu, b%d, 0x%x, g%u}\n", ++ rdu->sz, rdu->ent.e, rdu->verify[AufsCtlRduV_SZ], ++ rdu->blk, ++ rdu->rent, rdu->shwh, rdu->full, ++ rdu->cookie.h_pos, rdu->cookie.bindex, rdu->cookie.flags, ++ rdu->cookie.generation); ++ ++ if (rdu->verify[AufsCtlRduV_SZ] == sizeof(*rdu)) ++ return 0; ++ ++ AuDbg("%u:%u\n", ++ rdu->verify[AufsCtlRduV_SZ], (unsigned int)sizeof(*rdu)); ++ return -EINVAL; ++} ++ ++long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err, e; ++ struct aufs_rdu rdu; ++ void __user *p = (void __user *)arg; ++ ++ err = copy_from_user(&rdu, p, sizeof(rdu)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ err = au_rdu_verify(&rdu); ++ if (unlikely(err)) ++ goto out; ++ ++ switch (cmd) { ++ case AUFS_CTL_RDU: ++ err = au_rdu(file, &rdu); ++ if (unlikely(err)) ++ break; ++ ++ e = copy_to_user(p, &rdu, sizeof(rdu)); ++ if (unlikely(e)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ } ++ break; ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_ino(file, &rdu); ++ break; ++ ++ default: ++ /* err = -ENOTTY; */ ++ err = -EINVAL; ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++#ifdef CONFIG_COMPAT ++long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err, e; ++ struct aufs_rdu rdu; ++ void __user *p = compat_ptr(arg); ++ ++ /* todo: get_user()? */ ++ err = copy_from_user(&rdu, p, sizeof(rdu)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ rdu.ent.e = compat_ptr(rdu.ent.ul); ++ err = au_rdu_verify(&rdu); ++ if (unlikely(err)) ++ goto out; ++ ++ switch (cmd) { ++ case AUFS_CTL_RDU: ++ err = au_rdu(file, &rdu); ++ if (unlikely(err)) ++ break; ++ ++ rdu.ent.ul = ptr_to_compat(rdu.ent.e); ++ rdu.tail.ul = ptr_to_compat(rdu.tail.e); ++ e = copy_to_user(p, &rdu, sizeof(rdu)); ++ if (unlikely(e)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ } ++ break; ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_ino(file, &rdu); ++ break; ++ ++ default: ++ /* err = -ENOTTY; */ ++ err = -EINVAL; ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++#endif +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/rwsem.h linux-3.2.0-gentoo-r1/fs/aufs/rwsem.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/rwsem.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/rwsem.h 2012-01-17 12:11:24.916787487 +0100 +@@ -0,0 +1,188 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * simple read-write semaphore wrappers ++ */ ++ ++#ifndef __AUFS_RWSEM_H__ ++#define __AUFS_RWSEM_H__ ++ ++#ifdef __KERNEL__ ++ ++#include "debug.h" ++ ++struct au_rwsem { ++ struct rw_semaphore rwsem; ++#ifdef CONFIG_AUFS_DEBUG ++ /* just for debugging, not almighty counter */ ++ atomic_t rcnt, wcnt; ++#endif ++}; ++ ++#ifdef CONFIG_AUFS_DEBUG ++#define AuDbgCntInit(rw) do { \ ++ atomic_set(&(rw)->rcnt, 0); \ ++ atomic_set(&(rw)->wcnt, 0); \ ++ smp_mb(); /* atomic set */ \ ++} while (0) ++ ++#define AuDbgRcntInc(rw) atomic_inc(&(rw)->rcnt) ++#define AuDbgRcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->rcnt) < 0) ++#define AuDbgWcntInc(rw) atomic_inc(&(rw)->wcnt) ++#define AuDbgWcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->wcnt) < 0) ++#else ++#define AuDbgCntInit(rw) do {} while (0) ++#define AuDbgRcntInc(rw) do {} while (0) ++#define AuDbgRcntDec(rw) do {} while (0) ++#define AuDbgWcntInc(rw) do {} while (0) ++#define AuDbgWcntDec(rw) do {} while (0) ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++/* to debug easier, do not make them inlined functions */ ++#define AuRwMustNoWaiters(rw) AuDebugOn(!list_empty(&(rw)->rwsem.wait_list)) ++/* rwsem_is_locked() is unusable */ ++#define AuRwMustReadLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0) ++#define AuRwMustWriteLock(rw) AuDebugOn(atomic_read(&(rw)->wcnt) <= 0) ++#define AuRwMustAnyLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0 \ ++ && atomic_read(&(rw)->wcnt) <= 0) ++#define AuRwDestroy(rw) AuDebugOn(atomic_read(&(rw)->rcnt) \ ++ || atomic_read(&(rw)->wcnt)) ++ ++#define au_rw_class(rw, key) lockdep_set_class(&(rw)->rwsem, key) ++ ++static inline void au_rw_init(struct au_rwsem *rw) ++{ ++ AuDbgCntInit(rw); ++ init_rwsem(&rw->rwsem); ++} ++ ++static inline void au_rw_init_wlock(struct au_rwsem *rw) ++{ ++ au_rw_init(rw); ++ down_write(&rw->rwsem); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_init_wlock_nested(struct au_rwsem *rw, ++ unsigned int lsc) ++{ ++ au_rw_init(rw); ++ down_write_nested(&rw->rwsem, lsc); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_read_lock(struct au_rwsem *rw) ++{ ++ down_read(&rw->rwsem); ++ AuDbgRcntInc(rw); ++} ++ ++static inline void au_rw_read_lock_nested(struct au_rwsem *rw, unsigned int lsc) ++{ ++ down_read_nested(&rw->rwsem, lsc); ++ AuDbgRcntInc(rw); ++} ++ ++static inline void au_rw_read_unlock(struct au_rwsem *rw) ++{ ++ AuRwMustReadLock(rw); ++ AuDbgRcntDec(rw); ++ up_read(&rw->rwsem); ++} ++ ++static inline void au_rw_dgrade_lock(struct au_rwsem *rw) ++{ ++ AuRwMustWriteLock(rw); ++ AuDbgRcntInc(rw); ++ AuDbgWcntDec(rw); ++ downgrade_write(&rw->rwsem); ++} ++ ++static inline void au_rw_write_lock(struct au_rwsem *rw) ++{ ++ down_write(&rw->rwsem); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_write_lock_nested(struct au_rwsem *rw, ++ unsigned int lsc) ++{ ++ down_write_nested(&rw->rwsem, lsc); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_write_unlock(struct au_rwsem *rw) ++{ ++ AuRwMustWriteLock(rw); ++ AuDbgWcntDec(rw); ++ up_write(&rw->rwsem); ++} ++ ++/* why is not _nested version defined */ ++static inline int au_rw_read_trylock(struct au_rwsem *rw) ++{ ++ int ret = down_read_trylock(&rw->rwsem); ++ if (ret) ++ AuDbgRcntInc(rw); ++ return ret; ++} ++ ++static inline int au_rw_write_trylock(struct au_rwsem *rw) ++{ ++ int ret = 
down_write_trylock(&rw->rwsem); ++ if (ret) ++ AuDbgWcntInc(rw); ++ return ret; ++} ++ ++#undef AuDbgCntInit ++#undef AuDbgRcntInc ++#undef AuDbgRcntDec ++#undef AuDbgWcntInc ++#undef AuDbgWcntDec ++ ++#define AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ ++static inline void prefix##_read_lock(param) \ ++{ au_rw_read_lock(rwsem); } \ ++static inline void prefix##_write_lock(param) \ ++{ au_rw_write_lock(rwsem); } \ ++static inline int prefix##_read_trylock(param) \ ++{ return au_rw_read_trylock(rwsem); } \ ++static inline int prefix##_write_trylock(param) \ ++{ return au_rw_write_trylock(rwsem); } ++/* why is not _nested version defined */ ++/* static inline void prefix##_read_trylock_nested(param, lsc) ++{ au_rw_read_trylock_nested(rwsem, lsc)); } ++static inline void prefix##_write_trylock_nestd(param, lsc) ++{ au_rw_write_trylock_nested(rwsem, lsc); } */ ++ ++#define AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) \ ++static inline void prefix##_read_unlock(param) \ ++{ au_rw_read_unlock(rwsem); } \ ++static inline void prefix##_write_unlock(param) \ ++{ au_rw_write_unlock(rwsem); } \ ++static inline void prefix##_downgrade_lock(param) \ ++{ au_rw_dgrade_lock(rwsem); } ++ ++#define AuSimpleRwsemFuncs(prefix, param, rwsem) \ ++ AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ ++ AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_RWSEM_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/sbinfo.c linux-3.2.0-gentoo-r1/fs/aufs/sbinfo.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/sbinfo.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/sbinfo.c 2012-01-17 12:11:24.916787487 +0100 +@@ -0,0 +1,343 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * superblock private data ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * they are necessary regardless sysfs is disabled. 
++ */ ++void au_si_free(struct kobject *kobj) ++{ ++ struct au_sbinfo *sbinfo; ++ char *locked __maybe_unused; /* debug only */ ++ ++ sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); ++ AuDebugOn(!list_empty(&sbinfo->si_plink.head)); ++ AuDebugOn(atomic_read(&sbinfo->si_nowait.nw_len)); ++ ++ au_rw_write_lock(&sbinfo->si_rwsem); ++ au_br_free(sbinfo); ++ au_rw_write_unlock(&sbinfo->si_rwsem); ++ ++ AuDebugOn(radix_tree_gang_lookup ++ (&sbinfo->au_si_pid.tree, (void **)&locked, ++ /*first_index*/PID_MAX_DEFAULT - 1, ++ /*max_items*/sizeof(locked)/sizeof(*locked))); ++ ++ kfree(sbinfo->si_branch); ++ kfree(sbinfo->au_si_pid.bitmap); ++ mutex_destroy(&sbinfo->si_xib_mtx); ++ AuRwDestroy(&sbinfo->si_rwsem); ++ ++ kfree(sbinfo); ++} ++ ++int au_si_alloc(struct super_block *sb) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ static struct lock_class_key aufs_si; ++ ++ err = -ENOMEM; ++ sbinfo = kzalloc(sizeof(*sbinfo), GFP_NOFS); ++ if (unlikely(!sbinfo)) ++ goto out; ++ ++ BUILD_BUG_ON(sizeof(unsigned long) != ++ sizeof(*sbinfo->au_si_pid.bitmap)); ++ sbinfo->au_si_pid.bitmap = kcalloc(BITS_TO_LONGS(PID_MAX_DEFAULT), ++ sizeof(*sbinfo->au_si_pid.bitmap), ++ GFP_NOFS); ++ if (unlikely(!sbinfo->au_si_pid.bitmap)) ++ goto out_sbinfo; ++ ++ /* will be reallocated separately */ ++ sbinfo->si_branch = kzalloc(sizeof(*sbinfo->si_branch), GFP_NOFS); ++ if (unlikely(!sbinfo->si_branch)) ++ goto out_pidmap; ++ ++ err = sysaufs_si_init(sbinfo); ++ if (unlikely(err)) ++ goto out_br; ++ ++ au_nwt_init(&sbinfo->si_nowait); ++ au_rw_init_wlock(&sbinfo->si_rwsem); ++ au_rw_class(&sbinfo->si_rwsem, &aufs_si); ++ spin_lock_init(&sbinfo->au_si_pid.tree_lock); ++ INIT_RADIX_TREE(&sbinfo->au_si_pid.tree, GFP_ATOMIC | __GFP_NOFAIL); ++ ++ atomic_long_set(&sbinfo->si_ninodes, 0); ++ atomic_long_set(&sbinfo->si_nfiles, 0); ++ ++ sbinfo->si_bend = -1; ++ ++ sbinfo->si_wbr_copyup = AuWbrCopyup_Def; ++ sbinfo->si_wbr_create = AuWbrCreate_Def; ++ sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + sbinfo->si_wbr_copyup; ++ sbinfo->si_wbr_create_ops = au_wbr_create_ops + sbinfo->si_wbr_create; ++ ++ sbinfo->si_mntflags = au_opts_plink(AuOpt_Def); ++ ++ mutex_init(&sbinfo->si_xib_mtx); ++ sbinfo->si_xino_brid = -1; ++ /* leave si_xib_last_pindex and si_xib_next_bit */ ++ ++ sbinfo->si_rdcache = msecs_to_jiffies(AUFS_RDCACHE_DEF * MSEC_PER_SEC); ++ sbinfo->si_rdblk = AUFS_RDBLK_DEF; ++ sbinfo->si_rdhash = AUFS_RDHASH_DEF; ++ sbinfo->si_dirwh = AUFS_DIRWH_DEF; ++ ++ au_spl_init(&sbinfo->si_plink); ++ init_waitqueue_head(&sbinfo->si_plink_wq); ++ spin_lock_init(&sbinfo->si_plink_maint_lock); ++ ++ /* leave other members for sysaufs and si_mnt. 
*/ ++ sbinfo->si_sb = sb; ++ sb->s_fs_info = sbinfo; ++ si_pid_set(sb); ++ au_debug_sbinfo_init(sbinfo); ++ return 0; /* success */ ++ ++out_br: ++ kfree(sbinfo->si_branch); ++out_pidmap: ++ kfree(sbinfo->au_si_pid.bitmap); ++out_sbinfo: ++ kfree(sbinfo); ++out: ++ return err; ++} ++ ++int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr) ++{ ++ int err, sz; ++ struct au_branch **brp; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ err = -ENOMEM; ++ sz = sizeof(*brp) * (sbinfo->si_bend + 1); ++ if (unlikely(!sz)) ++ sz = sizeof(*brp); ++ brp = au_kzrealloc(sbinfo->si_branch, sz, sizeof(*brp) * nbr, GFP_NOFS); ++ if (brp) { ++ sbinfo->si_branch = brp; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++unsigned int au_sigen_inc(struct super_block *sb) ++{ ++ unsigned int gen; ++ ++ SiMustWriteLock(sb); ++ ++ gen = ++au_sbi(sb)->si_generation; ++ au_update_digen(sb->s_root); ++ au_update_iigen(sb->s_root->d_inode); ++ sb->s_root->d_inode->i_version++; ++ return gen; ++} ++ ++aufs_bindex_t au_new_br_id(struct super_block *sb) ++{ ++ aufs_bindex_t br_id; ++ int i; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ for (i = 0; i <= AUFS_BRANCH_MAX; i++) { ++ br_id = ++sbinfo->si_last_br_id; ++ AuDebugOn(br_id < 0); ++ if (br_id && au_br_index(sb, br_id) < 0) ++ return br_id; ++ } ++ ++ return -1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* it is ok that new 'nwt' tasks are appended while we are sleeping */ ++int si_read_lock(struct super_block *sb, int flags) ++{ ++ int err; ++ ++ err = 0; ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ ++ si_noflush_read_lock(sb); ++ err = au_plink_maint(sb, flags); ++ if (unlikely(err)) ++ si_read_unlock(sb); ++ ++ return err; ++} ++ ++int si_write_lock(struct super_block *sb, int flags) ++{ ++ int err; ++ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ ++ si_noflush_write_lock(sb); ++ err = au_plink_maint(sb, flags); ++ if (unlikely(err)) ++ si_write_unlock(sb); ++ ++ return err; ++} ++ ++/* dentry and super_block lock. 
call at entry point */ ++int aufs_read_lock(struct dentry *dentry, int flags) ++{ ++ int err; ++ struct super_block *sb; ++ ++ sb = dentry->d_sb; ++ err = si_read_lock(sb, flags); ++ if (unlikely(err)) ++ goto out; ++ ++ if (au_ftest_lock(flags, DW)) ++ di_write_lock_child(dentry); ++ else ++ di_read_lock_child(dentry, flags); ++ ++ if (au_ftest_lock(flags, GEN)) { ++ err = au_digen_test(dentry, au_sigen(sb)); ++ AuDebugOn(!err && au_dbrange_test(dentry)); ++ if (unlikely(err)) ++ aufs_read_unlock(dentry, flags); ++ } ++ ++out: ++ return err; ++} ++ ++void aufs_read_unlock(struct dentry *dentry, int flags) ++{ ++ if (au_ftest_lock(flags, DW)) ++ di_write_unlock(dentry); ++ else ++ di_read_unlock(dentry, flags); ++ si_read_unlock(dentry->d_sb); ++} ++ ++void aufs_write_lock(struct dentry *dentry) ++{ ++ si_write_lock(dentry->d_sb, AuLock_FLUSH | AuLock_NOPLMW); ++ di_write_lock_child(dentry); ++} ++ ++void aufs_write_unlock(struct dentry *dentry) ++{ ++ di_write_unlock(dentry); ++ si_write_unlock(dentry->d_sb); ++} ++ ++int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags) ++{ ++ int err; ++ unsigned int sigen; ++ struct super_block *sb; ++ ++ sb = d1->d_sb; ++ err = si_read_lock(sb, flags); ++ if (unlikely(err)) ++ goto out; ++ ++ di_write_lock2_child(d1, d2, au_ftest_lock(flags, DIR)); ++ ++ if (au_ftest_lock(flags, GEN)) { ++ sigen = au_sigen(sb); ++ err = au_digen_test(d1, sigen); ++ AuDebugOn(!err && au_dbrange_test(d1)); ++ if (!err) { ++ err = au_digen_test(d2, sigen); ++ AuDebugOn(!err && au_dbrange_test(d2)); ++ } ++ if (unlikely(err)) ++ aufs_read_and_write_unlock2(d1, d2); ++ } ++ ++out: ++ return err; ++} ++ ++void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2) ++{ ++ di_write_unlock2(d1, d2); ++ si_read_unlock(d1->d_sb); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int si_pid_test_slow(struct super_block *sb) ++{ ++ void *p; ++ ++ rcu_read_lock(); ++ p = radix_tree_lookup(&au_sbi(sb)->au_si_pid.tree, current->pid); ++ rcu_read_unlock(); ++ ++ return (long)!!p; ++} ++ ++void si_pid_set_slow(struct super_block *sb) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ AuDebugOn(si_pid_test_slow(sb)); ++ ++ sbinfo = au_sbi(sb); ++ err = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); ++ AuDebugOn(err); ++ spin_lock(&sbinfo->au_si_pid.tree_lock); ++ err = radix_tree_insert(&sbinfo->au_si_pid.tree, current->pid, ++ /*any valid ptr*/sb); ++ spin_unlock(&sbinfo->au_si_pid.tree_lock); ++ AuDebugOn(err); ++ radix_tree_preload_end(); ++} ++ ++void si_pid_clr_slow(struct super_block *sb) ++{ ++ void *p; ++ struct au_sbinfo *sbinfo; ++ ++ AuDebugOn(!si_pid_test_slow(sb)); ++ ++ sbinfo = au_sbi(sb); ++ spin_lock(&sbinfo->au_si_pid.tree_lock); ++ p = radix_tree_delete(&sbinfo->au_si_pid.tree, current->pid); ++ spin_unlock(&sbinfo->au_si_pid.tree_lock); ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/spl.h linux-3.2.0-gentoo-r1/fs/aufs/spl.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/spl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/spl.h 2012-01-17 12:11:24.916787487 +0100 +@@ -0,0 +1,62 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * simple list protected by a spinlock ++ */ ++ ++#ifndef __AUFS_SPL_H__ ++#define __AUFS_SPL_H__ ++ ++#ifdef __KERNEL__ ++ ++struct au_splhead { ++ spinlock_t spin; ++ struct list_head head; ++}; ++ ++static inline void au_spl_init(struct au_splhead *spl) ++{ ++ spin_lock_init(&spl->spin); ++ INIT_LIST_HEAD(&spl->head); ++} ++ ++static inline void au_spl_add(struct list_head *list, struct au_splhead *spl) ++{ ++ spin_lock(&spl->spin); ++ list_add(list, &spl->head); ++ spin_unlock(&spl->spin); ++} ++ ++static inline void au_spl_del(struct list_head *list, struct au_splhead *spl) ++{ ++ spin_lock(&spl->spin); ++ list_del(list); ++ spin_unlock(&spl->spin); ++} ++ ++static inline void au_spl_del_rcu(struct list_head *list, ++ struct au_splhead *spl) ++{ ++ spin_lock(&spl->spin); ++ list_del_rcu(list); ++ spin_unlock(&spl->spin); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_SPL_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/super.c linux-3.2.0-gentoo-r1/fs/aufs/super.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/super.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/super.c 2012-01-17 12:11:24.930676503 +0100 +@@ -0,0 +1,938 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * mount and super_block operations ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "aufs.h" ++ ++/* ++ * super_operations ++ */ ++static struct inode *aufs_alloc_inode(struct super_block *sb __maybe_unused) ++{ ++ struct au_icntnr *c; ++ ++ c = au_cache_alloc_icntnr(); ++ if (c) { ++ au_icntnr_init(c); ++ c->vfs_inode.i_version = 1; /* sigen(sb); */ ++ c->iinfo.ii_hinode = NULL; ++ return &c->vfs_inode; ++ } ++ return NULL; ++} ++ ++static void aufs_destroy_inode_cb(struct rcu_head *head) ++{ ++ struct inode *inode = container_of(head, struct inode, i_rcu); ++ ++ INIT_LIST_HEAD(&inode->i_dentry); ++ au_cache_free_icntnr(container_of(inode, struct au_icntnr, vfs_inode)); ++} ++ ++static void aufs_destroy_inode(struct inode *inode) ++{ ++ au_iinfo_fin(inode); ++ call_rcu(&inode->i_rcu, aufs_destroy_inode_cb); ++} ++ ++struct inode *au_iget_locked(struct super_block *sb, ino_t ino) ++{ ++ struct inode *inode; ++ int err; ++ ++ inode = iget_locked(sb, ino); ++ if (unlikely(!inode)) { ++ inode = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ if (!(inode->i_state & I_NEW)) ++ goto out; ++ ++ err = au_xigen_new(inode); ++ if (!err) ++ err = au_iinfo_init(inode); ++ if (!err) ++ inode->i_version++; ++ else { ++ iget_failed(inode); ++ inode = ERR_PTR(err); ++ } ++ ++out: ++ /* never return NULL */ ++ AuDebugOn(!inode); ++ AuTraceErrPtr(inode); ++ return inode; ++} ++ ++/* lock free root dinfo */ ++static int au_show_brs(struct seq_file *seq, struct super_block *sb) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ struct path path; ++ struct au_hdentry *hdp; ++ struct au_branch *br; ++ char *perm; ++ ++ err = 0; ++ bend = au_sbend(sb); ++ hdp = au_di(sb->s_root)->di_hdentry; ++ for (bindex = 0; !err && bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ path.mnt = br->br_mnt; ++ path.dentry = hdp[bindex].hd_dentry; ++ err = au_seq_path(seq, &path); ++ if (err > 0) { ++ perm = au_optstr_br_perm(br->br_perm); ++ if (perm) { ++ err = seq_printf(seq, "=%s", perm); ++ kfree(perm); ++ if (err == -1) ++ err = -E2BIG; ++ } else ++ err = -ENOMEM; ++ } ++ if (!err && bindex != bend) ++ err = seq_putc(seq, ':'); ++ } ++ ++ return err; ++} ++ ++static void au_show_wbr_create(struct seq_file *m, int v, ++ struct au_sbinfo *sbinfo) ++{ ++ const char *pat; ++ ++ AuRwMustAnyLock(&sbinfo->si_rwsem); ++ ++ seq_printf(m, ",create="); ++ pat = au_optstr_wbr_create(v); ++ switch (v) { ++ case AuWbrCreate_TDP: ++ case AuWbrCreate_RR: ++ case AuWbrCreate_MFS: ++ case AuWbrCreate_PMFS: ++ seq_printf(m, pat); ++ break; ++ case AuWbrCreate_MFSV: ++ seq_printf(m, /*pat*/"mfs:%lu", ++ jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) ++ / MSEC_PER_SEC); ++ break; ++ case AuWbrCreate_PMFSV: ++ seq_printf(m, /*pat*/"pmfs:%lu", ++ jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) ++ / MSEC_PER_SEC); ++ break; ++ case AuWbrCreate_MFSRR: ++ seq_printf(m, /*pat*/"mfsrr:%llu", ++ sbinfo->si_wbr_mfs.mfsrr_watermark); ++ break; ++ case AuWbrCreate_MFSRRV: ++ seq_printf(m, /*pat*/"mfsrr:%llu:%lu", ++ sbinfo->si_wbr_mfs.mfsrr_watermark, ++ jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) ++ / MSEC_PER_SEC); ++ break; ++ } ++} ++ ++static int au_show_xino(struct seq_file *seq, struct vfsmount *mnt) ++{ ++#ifdef CONFIG_SYSFS ++ return 0; ++#else ++ int err; ++ const int len = 
sizeof(AUFS_XINO_FNAME) - 1; ++ aufs_bindex_t bindex, brid; ++ struct super_block *sb; ++ struct qstr *name; ++ struct file *f; ++ struct dentry *d, *h_root; ++ struct au_hdentry *hdp; ++ ++ AuRwMustAnyLock(&sbinfo->si_rwsem); ++ ++ err = 0; ++ sb = mnt->mnt_sb; ++ f = au_sbi(sb)->si_xib; ++ if (!f) ++ goto out; ++ ++ /* stop printing the default xino path on the first writable branch */ ++ h_root = NULL; ++ brid = au_xino_brid(sb); ++ if (brid >= 0) { ++ bindex = au_br_index(sb, brid); ++ hdp = au_di(sb->s_root)->di_hdentry; ++ h_root = hdp[0 + bindex].hd_dentry; ++ } ++ d = f->f_dentry; ++ name = &d->d_name; ++ /* safe ->d_parent because the file is unlinked */ ++ if (d->d_parent == h_root ++ && name->len == len ++ && !memcmp(name->name, AUFS_XINO_FNAME, len)) ++ goto out; ++ ++ seq_puts(seq, ",xino="); ++ err = au_xino_path(seq, f); ++ ++out: ++ return err; ++#endif ++} ++ ++/* seq_file will re-call me in case of too long string */ ++static int aufs_show_options(struct seq_file *m, struct vfsmount *mnt) ++{ ++ int err; ++ unsigned int mnt_flags, v; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++#define AuBool(name, str) do { \ ++ v = au_opt_test(mnt_flags, name); \ ++ if (v != au_opt_test(AuOpt_Def, name)) \ ++ seq_printf(m, ",%s" #str, v ? "" : "no"); \ ++} while (0) ++ ++#define AuStr(name, str) do { \ ++ v = mnt_flags & AuOptMask_##name; \ ++ if (v != (AuOpt_Def & AuOptMask_##name)) \ ++ seq_printf(m, "," #str "=%s", au_optstr_##str(v)); \ ++} while (0) ++ ++#define AuUInt(name, str, val) do { \ ++ if (val != AUFS_##name##_DEF) \ ++ seq_printf(m, "," #str "=%u", val); \ ++} while (0) ++ ++ /* lock free root dinfo */ ++ sb = mnt->mnt_sb; ++ si_noflush_read_lock(sb); ++ sbinfo = au_sbi(sb); ++ seq_printf(m, ",si=%lx", sysaufs_si_id(sbinfo)); ++ ++ mnt_flags = au_mntflags(sb); ++ if (au_opt_test(mnt_flags, XINO)) { ++ err = au_show_xino(m, mnt); ++ if (unlikely(err)) ++ goto out; ++ } else ++ seq_puts(m, ",noxino"); ++ ++ AuBool(TRUNC_XINO, trunc_xino); ++ AuStr(UDBA, udba); ++ AuBool(SHWH, shwh); ++ AuBool(PLINK, plink); ++ AuBool(DIO, dio); ++ /* AuBool(DIRPERM1, dirperm1); */ ++ /* AuBool(REFROF, refrof); */ ++ ++ v = sbinfo->si_wbr_create; ++ if (v != AuWbrCreate_Def) ++ au_show_wbr_create(m, v, sbinfo); ++ ++ v = sbinfo->si_wbr_copyup; ++ if (v != AuWbrCopyup_Def) ++ seq_printf(m, ",cpup=%s", au_optstr_wbr_copyup(v)); ++ ++ v = au_opt_test(mnt_flags, ALWAYS_DIROPQ); ++ if (v != au_opt_test(AuOpt_Def, ALWAYS_DIROPQ)) ++ seq_printf(m, ",diropq=%c", v ? 
'a' : 'w'); ++ ++ AuUInt(DIRWH, dirwh, sbinfo->si_dirwh); ++ ++ v = jiffies_to_msecs(sbinfo->si_rdcache) / MSEC_PER_SEC; ++ AuUInt(RDCACHE, rdcache, v); ++ ++ AuUInt(RDBLK, rdblk, sbinfo->si_rdblk); ++ AuUInt(RDHASH, rdhash, sbinfo->si_rdhash); ++ ++ AuBool(SUM, sum); ++ /* AuBool(SUM_W, wsum); */ ++ AuBool(WARN_PERM, warn_perm); ++ AuBool(VERBOSE, verbose); ++ ++out: ++ /* be sure to print "br:" last */ ++ if (!sysaufs_brs) { ++ seq_puts(m, ",br:"); ++ au_show_brs(m, sb); ++ } ++ si_read_unlock(sb); ++ return 0; ++ ++#undef AuBool ++#undef AuStr ++#undef AuUInt ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* sum mode which returns the summation for statfs(2) */ ++ ++static u64 au_add_till_max(u64 a, u64 b) ++{ ++ u64 old; ++ ++ old = a; ++ a += b; ++ if (old < a) ++ return a; ++ return ULLONG_MAX; ++} ++ ++static int au_statfs_sum(struct super_block *sb, struct kstatfs *buf) ++{ ++ int err; ++ u64 blocks, bfree, bavail, files, ffree; ++ aufs_bindex_t bend, bindex, i; ++ unsigned char shared; ++ struct path h_path; ++ struct super_block *h_sb; ++ ++ blocks = 0; ++ bfree = 0; ++ bavail = 0; ++ files = 0; ++ ffree = 0; ++ ++ err = 0; ++ bend = au_sbend(sb); ++ for (bindex = bend; bindex >= 0; bindex--) { ++ h_path.mnt = au_sbr_mnt(sb, bindex); ++ h_sb = h_path.mnt->mnt_sb; ++ shared = 0; ++ for (i = bindex + 1; !shared && i <= bend; i++) ++ shared = (au_sbr_sb(sb, i) == h_sb); ++ if (shared) ++ continue; ++ ++ /* sb->s_root for NFS is unreliable */ ++ h_path.dentry = h_path.mnt->mnt_root; ++ err = vfs_statfs(&h_path, buf); ++ if (unlikely(err)) ++ goto out; ++ ++ blocks = au_add_till_max(blocks, buf->f_blocks); ++ bfree = au_add_till_max(bfree, buf->f_bfree); ++ bavail = au_add_till_max(bavail, buf->f_bavail); ++ files = au_add_till_max(files, buf->f_files); ++ ffree = au_add_till_max(ffree, buf->f_ffree); ++ } ++ ++ buf->f_blocks = blocks; ++ buf->f_bfree = bfree; ++ buf->f_bavail = bavail; ++ buf->f_files = files; ++ buf->f_ffree = ffree; ++ ++out: ++ return err; ++} ++ ++static int aufs_statfs(struct dentry *dentry, struct kstatfs *buf) ++{ ++ int err; ++ struct path h_path; ++ struct super_block *sb; ++ ++ /* lock free root dinfo */ ++ sb = dentry->d_sb; ++ si_noflush_read_lock(sb); ++ if (!au_opt_test(au_mntflags(sb), SUM)) { ++ /* sb->s_root for NFS is unreliable */ ++ h_path.mnt = au_sbr_mnt(sb, 0); ++ h_path.dentry = h_path.mnt->mnt_root; ++ err = vfs_statfs(&h_path, buf); ++ } else ++ err = au_statfs_sum(sb, buf); ++ si_read_unlock(sb); ++ ++ if (!err) { ++ buf->f_type = AUFS_SUPER_MAGIC; ++ buf->f_namelen = AUFS_MAX_NAMELEN; ++ memset(&buf->f_fsid, 0, sizeof(buf->f_fsid)); ++ } ++ /* buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; */ ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* final actions when unmounting a file system */ ++static void aufs_put_super(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ sbinfo = au_sbi(sb); ++ if (!sbinfo) ++ return; ++ ++ dbgaufs_si_fin(sbinfo); ++ kobject_put(&sbinfo->si_kobj); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_array_free(void *array) ++{ ++ if (array) { ++ if (!is_vmalloc_addr(array)) ++ kfree(array); ++ else ++ vfree(array); ++ } ++} ++ ++void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, void *arg) ++{ ++ void *array; ++ unsigned long long n; ++ ++ array = NULL; ++ n = 0; ++ if (!*hint) ++ goto out; ++ ++ if (*hint > ULLONG_MAX 
/ sizeof(array)) { ++ array = ERR_PTR(-EMFILE); ++ pr_err("hint %llu\n", *hint); ++ goto out; ++ } ++ ++ array = kmalloc(sizeof(array) * *hint, GFP_NOFS); ++ if (unlikely(!array)) ++ array = vmalloc(sizeof(array) * *hint); ++ if (unlikely(!array)) { ++ array = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ n = cb(array, *hint, arg); ++ AuDebugOn(n > *hint); ++ ++out: ++ *hint = n; ++ return array; ++} ++ ++static unsigned long long au_iarray_cb(void *a, ++ unsigned long long max __maybe_unused, ++ void *arg) ++{ ++ unsigned long long n; ++ struct inode **p, *inode; ++ struct list_head *head; ++ ++ n = 0; ++ p = a; ++ head = arg; ++ spin_lock(&inode_sb_list_lock); ++ list_for_each_entry(inode, head, i_sb_list) { ++ if (!is_bad_inode(inode) ++ && au_ii(inode)->ii_bstart >= 0) { ++ spin_lock(&inode->i_lock); ++ if (atomic_read(&inode->i_count)) { ++ au_igrab(inode); ++ *p++ = inode; ++ n++; ++ AuDebugOn(n > max); ++ } ++ spin_unlock(&inode->i_lock); ++ } ++ } ++ spin_unlock(&inode_sb_list_lock); ++ ++ return n; ++} ++ ++struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max) ++{ ++ *max = atomic_long_read(&au_sbi(sb)->si_ninodes); ++ return au_array_alloc(max, au_iarray_cb, &sb->s_inodes); ++} ++ ++void au_iarray_free(struct inode **a, unsigned long long max) ++{ ++ unsigned long long ull; ++ ++ for (ull = 0; ull < max; ull++) ++ iput(a[ull]); ++ au_array_free(a); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * refresh dentry and inode at remount time. ++ */ ++/* todo: consolidate with simple_reval_dpath() and au_reval_for_attr() */ ++static int au_do_refresh(struct dentry *dentry, unsigned int dir_flags, ++ struct dentry *parent) ++{ ++ int err; ++ ++ di_write_lock_child(dentry); ++ di_read_lock_parent(parent, AuLock_IR); ++ err = au_refresh_dentry(dentry, parent); ++ if (!err && dir_flags) ++ au_hn_reset(dentry->d_inode, dir_flags); ++ di_read_unlock(parent, AuLock_IR); ++ di_write_unlock(dentry); ++ ++ return err; ++} ++ ++static int au_do_refresh_d(struct dentry *dentry, unsigned int sigen, ++ struct au_sbinfo *sbinfo, ++ const unsigned int dir_flags) ++{ ++ int err; ++ struct dentry *parent; ++ struct inode *inode; ++ ++ err = 0; ++ parent = dget_parent(dentry); ++ if (!au_digen_test(parent, sigen) && au_digen_test(dentry, sigen)) { ++ inode = dentry->d_inode; ++ if (inode) { ++ if (!S_ISDIR(inode->i_mode)) ++ err = au_do_refresh(dentry, /*dir_flags*/0, ++ parent); ++ else { ++ err = au_do_refresh(dentry, dir_flags, parent); ++ if (unlikely(err)) ++ au_fset_si(sbinfo, FAILED_REFRESH_DIR); ++ } ++ } else ++ err = au_do_refresh(dentry, /*dir_flags*/0, parent); ++ AuDbgDentry(dentry); ++ } ++ dput(parent); ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_refresh_d(struct super_block *sb) ++{ ++ int err, i, j, ndentry, e; ++ unsigned int sigen; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries, *d; ++ struct au_sbinfo *sbinfo; ++ struct dentry *root = sb->s_root; ++ const unsigned int dir_flags = au_hi_flags(root->d_inode, /*isdir*/1); ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, root, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ sigen = au_sigen(sb); ++ sbinfo = au_sbi(sb); ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) { ++ d = dentries[j]; ++ e = au_do_refresh_d(d, sigen, 
sbinfo, dir_flags); ++ if (unlikely(e && !err)) ++ err = e; ++ /* go on even err */ ++ } ++ } ++ ++out_dpages: ++ au_dpages_free(&dpages); ++out: ++ return err; ++} ++ ++static int au_refresh_i(struct super_block *sb) ++{ ++ int err, e; ++ unsigned int sigen; ++ unsigned long long max, ull; ++ struct inode *inode, **array; ++ ++ array = au_iarray_alloc(sb, &max); ++ err = PTR_ERR(array); ++ if (IS_ERR(array)) ++ goto out; ++ ++ err = 0; ++ sigen = au_sigen(sb); ++ for (ull = 0; ull < max; ull++) { ++ inode = array[ull]; ++ if (au_iigen(inode) != sigen) { ++ ii_write_lock_child(inode); ++ e = au_refresh_hinode_self(inode); ++ ii_write_unlock(inode); ++ if (unlikely(e)) { ++ pr_err("error %d, i%lu\n", e, inode->i_ino); ++ if (!err) ++ err = e; ++ /* go on even if err */ ++ } ++ } ++ } ++ ++ au_iarray_free(array, max); ++ ++out: ++ return err; ++} ++ ++static void au_remount_refresh(struct super_block *sb) ++{ ++ int err, e; ++ unsigned int udba; ++ aufs_bindex_t bindex, bend; ++ struct dentry *root; ++ struct inode *inode; ++ struct au_branch *br; ++ ++ au_sigen_inc(sb); ++ au_fclr_si(au_sbi(sb), FAILED_REFRESH_DIR); ++ ++ root = sb->s_root; ++ DiMustNoWaiters(root); ++ inode = root->d_inode; ++ IiMustNoWaiters(inode); ++ ++ udba = au_opt_udba(sb); ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ err = au_hnotify_reset_br(udba, br, br->br_perm); ++ if (unlikely(err)) ++ AuIOErr("hnotify failed on br %d, %d, ignored\n", ++ bindex, err); ++ /* go on even if err */ ++ } ++ au_hn_reset(inode, au_hi_flags(inode, /*isdir*/1)); ++ ++ di_write_unlock(root); ++ err = au_refresh_d(sb); ++ e = au_refresh_i(sb); ++ if (unlikely(e && !err)) ++ err = e; ++ /* aufs_write_lock() calls ..._child() */ ++ di_write_lock_child(root); ++ ++ au_cpup_attr_all(inode, /*force*/1); ++ ++ if (unlikely(err)) ++ AuIOErr("refresh failed, ignored, %d\n", err); ++} ++ ++/* stop extra interpretation of errno in mount(8), and strange error messages */ ++static int cvt_err(int err) ++{ ++ AuTraceErr(err); ++ ++ switch (err) { ++ case -ENOENT: ++ case -ENOTDIR: ++ case -EEXIST: ++ case -EIO: ++ err = -EINVAL; ++ } ++ return err; ++} ++ ++static int aufs_remount_fs(struct super_block *sb, int *flags, char *data) ++{ ++ int err, do_dx; ++ unsigned int mntflags; ++ struct au_opts opts; ++ struct dentry *root; ++ struct inode *inode; ++ struct au_sbinfo *sbinfo; ++ ++ err = 0; ++ root = sb->s_root; ++ if (!data || !*data) { ++ err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (!err) { ++ di_write_lock_child(root); ++ err = au_opts_verify(sb, *flags, /*pending*/0); ++ aufs_write_unlock(root); ++ } ++ goto out; ++ } ++ ++ err = -ENOMEM; ++ memset(&opts, 0, sizeof(opts)); ++ opts.opt = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!opts.opt)) ++ goto out; ++ opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); ++ opts.flags = AuOpts_REMOUNT; ++ opts.sb_flags = *flags; ++ ++ /* parse it before aufs lock */ ++ err = au_opts_parse(sb, data, &opts); ++ if (unlikely(err)) ++ goto out_opts; ++ ++ sbinfo = au_sbi(sb); ++ inode = root->d_inode; ++ mutex_lock(&inode->i_mutex); ++ err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out_mtx; ++ di_write_lock_child(root); ++ ++ /* au_opts_remount() may return an error */ ++ err = au_opts_remount(sb, &opts); ++ au_opts_free(&opts); ++ ++ if (au_ftest_opts(opts.flags, REFRESH)) ++ au_remount_refresh(sb); ++ ++ if (au_ftest_opts(opts.flags, REFRESH_DYAOP)) { ++ mntflags = au_mntflags(sb); ++ do_dx = 
!!au_opt_test(mntflags, DIO); ++ au_dy_arefresh(do_dx); ++ } ++ ++ aufs_write_unlock(root); ++ ++out_mtx: ++ mutex_unlock(&inode->i_mutex); ++out_opts: ++ free_page((unsigned long)opts.opt); ++out: ++ err = cvt_err(err); ++ AuTraceErr(err); ++ return err; ++} ++ ++static const struct super_operations aufs_sop = { ++ .alloc_inode = aufs_alloc_inode, ++ .destroy_inode = aufs_destroy_inode, ++ /* always deleting, no clearing */ ++ .drop_inode = generic_delete_inode, ++ .show_options = aufs_show_options, ++ .statfs = aufs_statfs, ++ .put_super = aufs_put_super, ++ .remount_fs = aufs_remount_fs ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int alloc_root(struct super_block *sb) ++{ ++ int err; ++ struct inode *inode; ++ struct dentry *root; ++ ++ err = -ENOMEM; ++ inode = au_iget_locked(sb, AUFS_ROOT_INO); ++ err = PTR_ERR(inode); ++ if (IS_ERR(inode)) ++ goto out; ++ ++ inode->i_op = &aufs_dir_iop; ++ inode->i_fop = &aufs_dir_fop; ++ inode->i_mode = S_IFDIR; ++ set_nlink(inode, 2); ++ unlock_new_inode(inode); ++ ++ root = d_alloc_root(inode); ++ if (unlikely(!root)) ++ goto out_iput; ++ err = PTR_ERR(root); ++ if (IS_ERR(root)) ++ goto out_iput; ++ ++ err = au_di_init(root); ++ if (!err) { ++ sb->s_root = root; ++ return 0; /* success */ ++ } ++ dput(root); ++ goto out; /* do not iput */ ++ ++out_iput: ++ iget_failed(inode); ++out: ++ return err; ++ ++} ++ ++static int aufs_fill_super(struct super_block *sb, void *raw_data, ++ int silent __maybe_unused) ++{ ++ int err; ++ struct au_opts opts; ++ struct dentry *root; ++ struct inode *inode; ++ char *arg = raw_data; ++ ++ if (unlikely(!arg || !*arg)) { ++ err = -EINVAL; ++ pr_err("no arg\n"); ++ goto out; ++ } ++ ++ err = -ENOMEM; ++ memset(&opts, 0, sizeof(opts)); ++ opts.opt = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!opts.opt)) ++ goto out; ++ opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); ++ opts.sb_flags = sb->s_flags; ++ ++ err = au_si_alloc(sb); ++ if (unlikely(err)) ++ goto out_opts; ++ ++ /* all timestamps always follow the ones on the branch */ ++ sb->s_flags |= MS_NOATIME | MS_NODIRATIME; ++ sb->s_op = &aufs_sop; ++ sb->s_d_op = &aufs_dop; ++ sb->s_magic = AUFS_SUPER_MAGIC; ++ sb->s_maxbytes = 0; ++ au_export_init(sb); ++ ++ err = alloc_root(sb); ++ if (unlikely(err)) { ++ si_write_unlock(sb); ++ goto out_info; ++ } ++ root = sb->s_root; ++ inode = root->d_inode; ++ ++ /* ++ * actually we can parse options regardless aufs lock here. ++ * but at remount time, parsing must be done before aufs lock. ++ * so we follow the same rule. ++ */ ++ ii_write_lock_parent(inode); ++ aufs_write_unlock(root); ++ err = au_opts_parse(sb, arg, &opts); ++ if (unlikely(err)) ++ goto out_root; ++ ++ /* lock vfs_inode first, then aufs. 
*/ ++ mutex_lock(&inode->i_mutex); ++ aufs_write_lock(root); ++ err = au_opts_mount(sb, &opts); ++ au_opts_free(&opts); ++ aufs_write_unlock(root); ++ mutex_unlock(&inode->i_mutex); ++ if (!err) ++ goto out_opts; /* success */ ++ ++out_root: ++ dput(root); ++ sb->s_root = NULL; ++out_info: ++ dbgaufs_si_fin(au_sbi(sb)); ++ kobject_put(&au_sbi(sb)->si_kobj); ++ sb->s_fs_info = NULL; ++out_opts: ++ free_page((unsigned long)opts.opt); ++out: ++ AuTraceErr(err); ++ err = cvt_err(err); ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *aufs_mount(struct file_system_type *fs_type, int flags, ++ const char *dev_name __maybe_unused, ++ void *raw_data) ++{ ++ struct dentry *root; ++ struct super_block *sb; ++ ++ /* all timestamps always follow the ones on the branch */ ++ /* mnt->mnt_flags |= MNT_NOATIME | MNT_NODIRATIME; */ ++ root = mount_nodev(fs_type, flags, raw_data, aufs_fill_super); ++ if (IS_ERR(root)) ++ goto out; ++ ++ sb = root->d_sb; ++ si_write_lock(sb, !AuLock_FLUSH); ++ sysaufs_brs_add(sb, 0); ++ si_write_unlock(sb); ++ au_sbilist_add(sb); ++ ++out: ++ return root; ++} ++ ++static void aufs_kill_sb(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ sbinfo = au_sbi(sb); ++ if (sbinfo) { ++ au_sbilist_del(sb); ++ aufs_write_lock(sb->s_root); ++ if (sbinfo->si_wbr_create_ops->fin) ++ sbinfo->si_wbr_create_ops->fin(sb); ++ if (au_opt_test(sbinfo->si_mntflags, UDBA_HNOTIFY)) { ++ au_opt_set_udba(sbinfo->si_mntflags, UDBA_NONE); ++ au_remount_refresh(sb); ++ } ++ if (au_opt_test(sbinfo->si_mntflags, PLINK)) ++ au_plink_put(sb, /*verbose*/1); ++ au_xino_clr(sb); ++ sbinfo->si_sb = NULL; ++ aufs_write_unlock(sb->s_root); ++ au_nwt_flush(&sbinfo->si_nowait); ++ } ++ generic_shutdown_super(sb); ++} ++ ++struct file_system_type aufs_fs_type = { ++ .name = AUFS_FSTYPE, ++ .fs_flags = ++ FS_RENAME_DOES_D_MOVE /* a race between rename and others */ ++ | FS_REVAL_DOT, /* for NFS branch and udba */ ++ .mount = aufs_mount, ++ .kill_sb = aufs_kill_sb, ++ /* no need to __module_get() and module_put(). */ ++ .owner = THIS_MODULE, ++}; +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/super.h linux-3.2.0-gentoo-r1/fs/aufs/super.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/super.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/super.h 2012-01-17 12:11:24.937621009 +0100 +@@ -0,0 +1,546 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * super_block operations ++ */ ++ ++#ifndef __AUFS_SUPER_H__ ++#define __AUFS_SUPER_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include "rwsem.h" ++#include "spl.h" ++#include "wkq.h" ++ ++typedef ssize_t (*au_readf_t)(struct file *, char __user *, size_t, loff_t *); ++typedef ssize_t (*au_writef_t)(struct file *, const char __user *, size_t, ++ loff_t *); ++ ++/* policies to select one among multiple writable branches */ ++struct au_wbr_copyup_operations { ++ int (*copyup)(struct dentry *dentry); ++}; ++ ++struct au_wbr_create_operations { ++ int (*create)(struct dentry *dentry, int isdir); ++ int (*init)(struct super_block *sb); ++ int (*fin)(struct super_block *sb); ++}; ++ ++struct au_wbr_mfs { ++ struct mutex mfs_lock; /* protect this structure */ ++ unsigned long mfs_jiffy; ++ unsigned long mfs_expire; ++ aufs_bindex_t mfs_bindex; ++ ++ unsigned long long mfsrr_bytes; ++ unsigned long long mfsrr_watermark; ++}; ++ ++struct au_branch; ++struct au_sbinfo { ++ /* nowait tasks in the system-wide workqueue */ ++ struct au_nowait_tasks si_nowait; ++ ++ /* ++ * tried sb->s_umount, but failed due to the dependecy between i_mutex. ++ * rwsem for au_sbinfo is necessary. ++ */ ++ struct au_rwsem si_rwsem; ++ ++ /* prevent recursive locking in deleting inode */ ++ struct { ++ unsigned long *bitmap; ++ spinlock_t tree_lock; ++ struct radix_tree_root tree; ++ } au_si_pid; ++ ++ /* ++ * dirty approach to protect sb->sb_inodes and ->s_files from remount. ++ */ ++ atomic_long_t si_ninodes, si_nfiles; ++ ++ /* branch management */ ++ unsigned int si_generation; ++ ++ /* see above flags */ ++ unsigned char au_si_status; ++ ++ aufs_bindex_t si_bend; ++ ++ /* dirty trick to keep br_id plus */ ++ unsigned int si_last_br_id : ++ sizeof(aufs_bindex_t) * BITS_PER_BYTE - 1; ++ struct au_branch **si_branch; ++ ++ /* policy to select a writable branch */ ++ unsigned char si_wbr_copyup; ++ unsigned char si_wbr_create; ++ struct au_wbr_copyup_operations *si_wbr_copyup_ops; ++ struct au_wbr_create_operations *si_wbr_create_ops; ++ ++ /* round robin */ ++ atomic_t si_wbr_rr_next; ++ ++ /* most free space */ ++ struct au_wbr_mfs si_wbr_mfs; ++ ++ /* mount flags */ ++ /* include/asm-ia64/siginfo.h defines a macro named si_flags */ ++ unsigned int si_mntflags; ++ ++ /* external inode number (bitmap and translation table) */ ++ au_readf_t si_xread; ++ au_writef_t si_xwrite; ++ struct file *si_xib; ++ struct mutex si_xib_mtx; /* protect xib members */ ++ unsigned long *si_xib_buf; ++ unsigned long si_xib_last_pindex; ++ int si_xib_next_bit; ++ aufs_bindex_t si_xino_brid; ++ /* reserved for future use */ ++ /* unsigned long long si_xib_limit; */ /* Max xib file size */ ++ ++#ifdef CONFIG_AUFS_EXPORT ++ /* i_generation */ ++ struct file *si_xigen; ++ atomic_t si_xigen_next; ++#endif ++ ++ /* vdir parameters */ ++ unsigned long si_rdcache; /* max cache time in jiffies */ ++ unsigned int si_rdblk; /* deblk size */ ++ unsigned int si_rdhash; /* hash size */ ++ ++ /* ++ * If the number of whiteouts are larger than si_dirwh, leave all of ++ * them after au_whtmp_ren to reduce the cost of rmdir(2). ++ * future fsck.aufs or kernel thread will remove them later. ++ * Otherwise, remove all whiteouts and the dir in rmdir(2). 
++	 */
++	unsigned int		si_dirwh;
++
++	/*
++	 * rename(2) a directory with all children.
++	 */
++	/* reserved for future use */
++	/* int			si_rendir; */
++
++	/* pseudo_link list */
++	struct au_splhead	si_plink;
++	wait_queue_head_t	si_plink_wq;
++	spinlock_t		si_plink_maint_lock;
++	pid_t			si_plink_maint_pid;
++
++	/*
++	 * sysfs and lifetime management.
++	 * this is not a small structure and it may be a waste of memory
++	 * when sysfs is disabled, particularly when many aufs-es are mounted.
++	 * but using sysfs is the majority case.
++	 */
++	struct kobject		si_kobj;
++#ifdef CONFIG_DEBUG_FS
++	struct dentry		*si_dbgaufs, *si_dbgaufs_xib;
++#ifdef CONFIG_AUFS_EXPORT
++	struct dentry		*si_dbgaufs_xigen;
++#endif
++#endif
++
++#ifdef CONFIG_AUFS_SBILIST
++	struct list_head	si_list;
++#endif
++
++	/* dirty, necessary for unmounting, sysfs and sysrq */
++	struct super_block	*si_sb;
++};
++
++/* sbinfo status flags */
++/*
++ * set true when refresh_dirs() failed at remount time.
++ * then try refreshing dirs at access time again.
++ * if it is false, refreshing dirs at access time is unnecessary
++ */
++#define AuSi_FAILED_REFRESH_DIR	1
++static inline unsigned char au_do_ftest_si(struct au_sbinfo *sbi,
++					   unsigned int flag)
++{
++	AuRwMustAnyLock(&sbi->si_rwsem);
++	return sbi->au_si_status & flag;
++}
++#define au_ftest_si(sbinfo, name)	au_do_ftest_si(sbinfo, AuSi_##name)
++#define au_fset_si(sbinfo, name) do { \
++	AuRwMustWriteLock(&(sbinfo)->si_rwsem); \
++	(sbinfo)->au_si_status |= AuSi_##name; \
++} while (0)
++#define au_fclr_si(sbinfo, name) do { \
++	AuRwMustWriteLock(&(sbinfo)->si_rwsem); \
++	(sbinfo)->au_si_status &= ~AuSi_##name; \
++} while (0)
++
++/* ---------------------------------------------------------------------- */
++
++/* policy to select one among writable branches */
++#define AuWbrCopyup(sbinfo, ...) \
++	((sbinfo)->si_wbr_copyup_ops->copyup(__VA_ARGS__))
++#define AuWbrCreate(sbinfo, ...)
\ ++ ((sbinfo)->si_wbr_create_ops->create(__VA_ARGS__)) ++ ++/* flags for si_read_lock()/aufs_read_lock()/di_read_lock() */ ++#define AuLock_DW 1 /* write-lock dentry */ ++#define AuLock_IR (1 << 1) /* read-lock inode */ ++#define AuLock_IW (1 << 2) /* write-lock inode */ ++#define AuLock_FLUSH (1 << 3) /* wait for 'nowait' tasks */ ++#define AuLock_DIR (1 << 4) /* target is a dir */ ++#define AuLock_NOPLM (1 << 5) /* return err in plm mode */ ++#define AuLock_NOPLMW (1 << 6) /* wait for plm mode ends */ ++#define AuLock_GEN (1 << 7) /* test digen/iigen */ ++#define au_ftest_lock(flags, name) ((flags) & AuLock_##name) ++#define au_fset_lock(flags, name) \ ++ do { (flags) |= AuLock_##name; } while (0) ++#define au_fclr_lock(flags, name) \ ++ do { (flags) &= ~AuLock_##name; } while (0) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* super.c */ ++extern struct file_system_type aufs_fs_type; ++struct inode *au_iget_locked(struct super_block *sb, ino_t ino); ++typedef unsigned long long (*au_arraycb_t)(void *array, unsigned long long max, ++ void *arg); ++void au_array_free(void *array); ++void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, void *arg); ++struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max); ++void au_iarray_free(struct inode **a, unsigned long long max); ++ ++/* sbinfo.c */ ++void au_si_free(struct kobject *kobj); ++int au_si_alloc(struct super_block *sb); ++int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr); ++ ++unsigned int au_sigen_inc(struct super_block *sb); ++aufs_bindex_t au_new_br_id(struct super_block *sb); ++ ++int si_read_lock(struct super_block *sb, int flags); ++int si_write_lock(struct super_block *sb, int flags); ++int aufs_read_lock(struct dentry *dentry, int flags); ++void aufs_read_unlock(struct dentry *dentry, int flags); ++void aufs_write_lock(struct dentry *dentry); ++void aufs_write_unlock(struct dentry *dentry); ++int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags); ++void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2); ++ ++int si_pid_test_slow(struct super_block *sb); ++void si_pid_set_slow(struct super_block *sb); ++void si_pid_clr_slow(struct super_block *sb); ++ ++/* wbr_policy.c */ ++extern struct au_wbr_copyup_operations au_wbr_copyup_ops[]; ++extern struct au_wbr_create_operations au_wbr_create_ops[]; ++int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_sbinfo *au_sbi(struct super_block *sb) ++{ ++ return sb->s_fs_info; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_EXPORT ++void au_export_init(struct super_block *sb); ++ ++static inline int au_test_nfsd(void) ++{ ++ struct task_struct *tsk = current; ++ ++ return (tsk->flags & PF_KTHREAD) ++ && !strcmp(tsk->comm, "nfsd"); ++} ++ ++void au_xigen_inc(struct inode *inode); ++int au_xigen_new(struct inode *inode); ++int au_xigen_set(struct super_block *sb, struct file *base); ++void au_xigen_clr(struct super_block *sb); ++ ++static inline int au_busy_or_stale(void) ++{ ++ if (!au_test_nfsd()) ++ return -EBUSY; ++ return -ESTALE; ++} ++#else ++AuStubVoid(au_export_init, struct super_block *sb) ++AuStubInt0(au_test_nfsd, void) ++AuStubVoid(au_xigen_inc, struct inode *inode) ++AuStubInt0(au_xigen_new, struct inode *inode) ++AuStubInt0(au_xigen_set, struct super_block *sb, struct file *base) 
++AuStubVoid(au_xigen_clr, struct super_block *sb)
++static inline int au_busy_or_stale(void)
++{
++	return -EBUSY;
++}
++#endif /* CONFIG_AUFS_EXPORT */
++
++/* ---------------------------------------------------------------------- */
++
++#ifdef CONFIG_AUFS_SBILIST
++/* module.c */
++extern struct au_splhead au_sbilist;
++
++static inline void au_sbilist_init(void)
++{
++	au_spl_init(&au_sbilist);
++}
++
++static inline void au_sbilist_add(struct super_block *sb)
++{
++	au_spl_add(&au_sbi(sb)->si_list, &au_sbilist);
++}
++
++static inline void au_sbilist_del(struct super_block *sb)
++{
++	au_spl_del(&au_sbi(sb)->si_list, &au_sbilist);
++}
++
++#ifdef CONFIG_AUFS_MAGIC_SYSRQ
++static inline void au_sbilist_lock(void)
++{
++	spin_lock(&au_sbilist.spin);
++}
++
++static inline void au_sbilist_unlock(void)
++{
++	spin_unlock(&au_sbilist.spin);
++}
++#define AuGFP_SBILIST	GFP_ATOMIC
++#else
++AuStubVoid(au_sbilist_lock, void)
++AuStubVoid(au_sbilist_unlock, void)
++#define AuGFP_SBILIST	GFP_NOFS
++#endif /* CONFIG_AUFS_MAGIC_SYSRQ */
++#else
++AuStubVoid(au_sbilist_init, void)
++AuStubVoid(au_sbilist_add, struct super_block*)
++AuStubVoid(au_sbilist_del, struct super_block*)
++AuStubVoid(au_sbilist_lock, void)
++AuStubVoid(au_sbilist_unlock, void)
++#define AuGFP_SBILIST	GFP_NOFS
++#endif
++
++/* ---------------------------------------------------------------------- */
++
++static inline void dbgaufs_si_null(struct au_sbinfo *sbinfo)
++{
++	/*
++	 * This function is a dynamic '__init' function actually,
++	 * so the tiny check for si_rwsem is unnecessary.
++	 */
++	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
++#ifdef CONFIG_DEBUG_FS
++	sbinfo->si_dbgaufs = NULL;
++	sbinfo->si_dbgaufs_xib = NULL;
++#ifdef CONFIG_AUFS_EXPORT
++	sbinfo->si_dbgaufs_xigen = NULL;
++#endif
++#endif
++}
++
++/* ---------------------------------------------------------------------- */
++
++static inline pid_t si_pid_bit(void)
++{
++	/* the origin of pid is 1, but the bitmap's is 0 */
++	return current->pid - 1;
++}
++
++static inline int si_pid_test(struct super_block *sb)
++{
++	pid_t bit = si_pid_bit();
++	if (bit < PID_MAX_DEFAULT)
++		return test_bit(bit, au_sbi(sb)->au_si_pid.bitmap);
++	else
++		return si_pid_test_slow(sb);
++}
++
++static inline void si_pid_set(struct super_block *sb)
++{
++	pid_t bit = si_pid_bit();
++	if (bit < PID_MAX_DEFAULT) {
++		AuDebugOn(test_bit(bit, au_sbi(sb)->au_si_pid.bitmap));
++		set_bit(bit, au_sbi(sb)->au_si_pid.bitmap);
++		/* smp_mb(); */
++	} else
++		si_pid_set_slow(sb);
++}
++
++static inline void si_pid_clr(struct super_block *sb)
++{
++	pid_t bit = si_pid_bit();
++	if (bit < PID_MAX_DEFAULT) {
++		AuDebugOn(!test_bit(bit, au_sbi(sb)->au_si_pid.bitmap));
++		clear_bit(bit, au_sbi(sb)->au_si_pid.bitmap);
++		/* smp_mb(); */
++	} else
++		si_pid_clr_slow(sb);
++}
++
++/* ---------------------------------------------------------------------- */
++
++/* lock superblock.
mainly for entry point functions */ ++/* ++ * __si_read_lock, __si_write_lock, ++ * __si_read_unlock, __si_write_unlock, __si_downgrade_lock ++ */ ++AuSimpleRwsemFuncs(__si, struct super_block *sb, &au_sbi(sb)->si_rwsem); ++ ++#define SiMustNoWaiters(sb) AuRwMustNoWaiters(&au_sbi(sb)->si_rwsem) ++#define SiMustAnyLock(sb) AuRwMustAnyLock(&au_sbi(sb)->si_rwsem) ++#define SiMustWriteLock(sb) AuRwMustWriteLock(&au_sbi(sb)->si_rwsem) ++ ++static inline void si_noflush_read_lock(struct super_block *sb) ++{ ++ __si_read_lock(sb); ++ si_pid_set(sb); ++} ++ ++static inline int si_noflush_read_trylock(struct super_block *sb) ++{ ++ int locked = __si_read_trylock(sb); ++ if (locked) ++ si_pid_set(sb); ++ return locked; ++} ++ ++static inline void si_noflush_write_lock(struct super_block *sb) ++{ ++ __si_write_lock(sb); ++ si_pid_set(sb); ++} ++ ++static inline int si_noflush_write_trylock(struct super_block *sb) ++{ ++ int locked = __si_write_trylock(sb); ++ if (locked) ++ si_pid_set(sb); ++ return locked; ++} ++ ++#if 0 /* unused */ ++static inline int si_read_trylock(struct super_block *sb, int flags) ++{ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ return si_noflush_read_trylock(sb); ++} ++#endif ++ ++static inline void si_read_unlock(struct super_block *sb) ++{ ++ si_pid_clr(sb); ++ __si_read_unlock(sb); ++} ++ ++#if 0 /* unused */ ++static inline int si_write_trylock(struct super_block *sb, int flags) ++{ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ return si_noflush_write_trylock(sb); ++} ++#endif ++ ++static inline void si_write_unlock(struct super_block *sb) ++{ ++ si_pid_clr(sb); ++ __si_write_unlock(sb); ++} ++ ++#if 0 /* unused */ ++static inline void si_downgrade_lock(struct super_block *sb) ++{ ++ __si_downgrade_lock(sb); ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline aufs_bindex_t au_sbend(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_bend; ++} ++ ++static inline unsigned int au_mntflags(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_mntflags; ++} ++ ++static inline unsigned int au_sigen(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_generation; ++} ++ ++static inline void au_ninodes_inc(struct super_block *sb) ++{ ++ atomic_long_inc(&au_sbi(sb)->si_ninodes); ++} ++ ++static inline void au_ninodes_dec(struct super_block *sb) ++{ ++ AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_ninodes)); ++ atomic_long_dec(&au_sbi(sb)->si_ninodes); ++} ++ ++static inline void au_nfiles_inc(struct super_block *sb) ++{ ++ atomic_long_inc(&au_sbi(sb)->si_nfiles); ++} ++ ++static inline void au_nfiles_dec(struct super_block *sb) ++{ ++ AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_nfiles)); ++ atomic_long_dec(&au_sbi(sb)->si_nfiles); ++} ++ ++static inline struct au_branch *au_sbr(struct super_block *sb, ++ aufs_bindex_t bindex) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_branch[0 + bindex]; ++} ++ ++static inline void au_xino_brid_set(struct super_block *sb, aufs_bindex_t brid) ++{ ++ SiMustWriteLock(sb); ++ au_sbi(sb)->si_xino_brid = brid; ++} ++ ++static inline aufs_bindex_t au_xino_brid(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_xino_brid; ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_SUPER_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/sysaufs.c linux-3.2.0-gentoo-r1/fs/aufs/sysaufs.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/sysaufs.c 
1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/sysaufs.c 2012-01-17 12:11:24.958454530 +0100 +@@ -0,0 +1,105 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sysfs interface and lifetime management ++ * they are necessary regardless sysfs is disabled. ++ */ ++ ++#include ++#include "aufs.h" ++ ++unsigned long sysaufs_si_mask; ++struct kset *sysaufs_kset; ++ ++#define AuSiAttr(_name) { \ ++ .attr = { .name = __stringify(_name), .mode = 0444 }, \ ++ .show = sysaufs_si_##_name, \ ++} ++ ++static struct sysaufs_si_attr sysaufs_si_attr_xi_path = AuSiAttr(xi_path); ++struct attribute *sysaufs_si_attrs[] = { ++ &sysaufs_si_attr_xi_path.attr, ++ NULL, ++}; ++ ++static const struct sysfs_ops au_sbi_ops = { ++ .show = sysaufs_si_show ++}; ++ ++static struct kobj_type au_sbi_ktype = { ++ .release = au_si_free, ++ .sysfs_ops = &au_sbi_ops, ++ .default_attrs = sysaufs_si_attrs ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++int sysaufs_si_init(struct au_sbinfo *sbinfo) ++{ ++ int err; ++ ++ sbinfo->si_kobj.kset = sysaufs_kset; ++ /* cf. sysaufs_name() */ ++ err = kobject_init_and_add ++ (&sbinfo->si_kobj, &au_sbi_ktype, /*&sysaufs_kset->kobj*/NULL, ++ SysaufsSiNamePrefix "%lx", sysaufs_si_id(sbinfo)); ++ ++ dbgaufs_si_null(sbinfo); ++ if (!err) { ++ err = dbgaufs_si_init(sbinfo); ++ if (unlikely(err)) ++ kobject_put(&sbinfo->si_kobj); ++ } ++ return err; ++} ++ ++void sysaufs_fin(void) ++{ ++ dbgaufs_fin(); ++ sysfs_remove_group(&sysaufs_kset->kobj, sysaufs_attr_group); ++ kset_unregister(sysaufs_kset); ++} ++ ++int __init sysaufs_init(void) ++{ ++ int err; ++ ++ do { ++ get_random_bytes(&sysaufs_si_mask, sizeof(sysaufs_si_mask)); ++ } while (!sysaufs_si_mask); ++ ++ err = -EINVAL; ++ sysaufs_kset = kset_create_and_add(AUFS_NAME, NULL, fs_kobj); ++ if (unlikely(!sysaufs_kset)) ++ goto out; ++ err = PTR_ERR(sysaufs_kset); ++ if (IS_ERR(sysaufs_kset)) ++ goto out; ++ err = sysfs_create_group(&sysaufs_kset->kobj, sysaufs_attr_group); ++ if (unlikely(err)) { ++ kset_unregister(sysaufs_kset); ++ goto out; ++ } ++ ++ err = dbgaufs_init(); ++ if (unlikely(err)) ++ sysaufs_fin(); ++out: ++ return err; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/sysaufs.h linux-3.2.0-gentoo-r1/fs/aufs/sysaufs.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/sysaufs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/sysaufs.h 2012-01-17 12:11:24.963084201 +0100 +@@ -0,0 +1,104 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sysfs interface and mount lifetime management ++ */ ++ ++#ifndef __SYSAUFS_H__ ++#define __SYSAUFS_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include "module.h" ++ ++struct super_block; ++struct au_sbinfo; ++ ++struct sysaufs_si_attr { ++ struct attribute attr; ++ int (*show)(struct seq_file *seq, struct super_block *sb); ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* sysaufs.c */ ++extern unsigned long sysaufs_si_mask; ++extern struct kset *sysaufs_kset; ++extern struct attribute *sysaufs_si_attrs[]; ++int sysaufs_si_init(struct au_sbinfo *sbinfo); ++int __init sysaufs_init(void); ++void sysaufs_fin(void); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* some people doesn't like to show a pointer in kernel */ ++static inline unsigned long sysaufs_si_id(struct au_sbinfo *sbinfo) ++{ ++ return sysaufs_si_mask ^ (unsigned long)sbinfo; ++} ++ ++#define SysaufsSiNamePrefix "si_" ++#define SysaufsSiNameLen (sizeof(SysaufsSiNamePrefix) + 16) ++static inline void sysaufs_name(struct au_sbinfo *sbinfo, char *name) ++{ ++ snprintf(name, SysaufsSiNameLen, SysaufsSiNamePrefix "%lx", ++ sysaufs_si_id(sbinfo)); ++} ++ ++struct au_branch; ++#ifdef CONFIG_SYSFS ++/* sysfs.c */ ++extern struct attribute_group *sysaufs_attr_group; ++ ++int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb); ++ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, ++ char *buf); ++ ++void sysaufs_br_init(struct au_branch *br); ++void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); ++void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); ++ ++#define sysaufs_brs_init() do {} while (0) ++ ++#else ++#define sysaufs_attr_group NULL ++ ++AuStubInt0(sysaufs_si_xi_path, struct seq_file *seq, struct super_block *sb) ++ ++static inline ++ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, ++ char *buf) ++{ ++ return 0; ++} ++ ++AuStubVoid(sysaufs_br_init, struct au_branch *br) ++AuStubVoid(sysaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex) ++AuStubVoid(sysaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex) ++ ++static inline void sysaufs_brs_init(void) ++{ ++ sysaufs_brs = 0; ++} ++ ++#endif /* CONFIG_SYSFS */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __SYSAUFS_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/sysfs.c linux-3.2.0-gentoo-r1/fs/aufs/sysfs.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/sysfs.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/sysfs.c 2012-01-17 12:11:24.983917722 +0100 +@@ -0,0 +1,257 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sysfs interface ++ */ ++ ++#include ++#include "aufs.h" ++ ++#ifdef CONFIG_AUFS_FS_MODULE ++/* this entry violates the "one line per file" policy of sysfs */ ++static ssize_t config_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ ssize_t err; ++ static char *conf = ++/* this file is generated at compiling */ ++#include "conf.str" ++ ; ++ ++ err = snprintf(buf, PAGE_SIZE, conf); ++ if (unlikely(err >= PAGE_SIZE)) ++ err = -EFBIG; ++ return err; ++} ++ ++static struct kobj_attribute au_config_attr = __ATTR_RO(config); ++#endif ++ ++static struct attribute *au_attr[] = { ++#ifdef CONFIG_AUFS_FS_MODULE ++ &au_config_attr.attr, ++#endif ++ NULL, /* need to NULL terminate the list of attributes */ ++}; ++ ++static struct attribute_group sysaufs_attr_group_body = { ++ .attrs = au_attr ++}; ++ ++struct attribute_group *sysaufs_attr_group = &sysaufs_attr_group_body; ++ ++/* ---------------------------------------------------------------------- */ ++ ++int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb) ++{ ++ int err; ++ ++ SiMustAnyLock(sb); ++ ++ err = 0; ++ if (au_opt_test(au_mntflags(sb), XINO)) { ++ err = au_xino_path(seq, au_sbi(sb)->si_xib); ++ seq_putc(seq, '\n'); ++ } ++ return err; ++} ++ ++/* ++ * the lifetime of branch is independent from the entry under sysfs. ++ * sysfs handles the lifetime of the entry, and never call ->show() after it is ++ * unlinked. 
++ */ ++static int sysaufs_si_br(struct seq_file *seq, struct super_block *sb, ++ aufs_bindex_t bindex) ++{ ++ int err; ++ struct path path; ++ struct dentry *root; ++ struct au_branch *br; ++ char *perm; ++ ++ AuDbg("b%d\n", bindex); ++ ++ err = 0; ++ root = sb->s_root; ++ di_read_lock_parent(root, !AuLock_IR); ++ br = au_sbr(sb, bindex); ++ path.mnt = br->br_mnt; ++ path.dentry = au_h_dptr(root, bindex); ++ au_seq_path(seq, &path); ++ di_read_unlock(root, !AuLock_IR); ++ perm = au_optstr_br_perm(br->br_perm); ++ if (perm) { ++ err = seq_printf(seq, "=%s\n", perm); ++ kfree(perm); ++ if (err == -1) ++ err = -E2BIG; ++ } else ++ err = -ENOMEM; ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct seq_file *au_seq(char *p, ssize_t len) ++{ ++ struct seq_file *seq; ++ ++ seq = kzalloc(sizeof(*seq), GFP_NOFS); ++ if (seq) { ++ /* mutex_init(&seq.lock); */ ++ seq->buf = p; ++ seq->size = len; ++ return seq; /* success */ ++ } ++ ++ seq = ERR_PTR(-ENOMEM); ++ return seq; ++} ++ ++#define SysaufsBr_PREFIX "br" ++ ++/* todo: file size may exceed PAGE_SIZE */ ++ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, ++ char *buf) ++{ ++ ssize_t err; ++ long l; ++ aufs_bindex_t bend; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ struct seq_file *seq; ++ char *name; ++ struct attribute **cattr; ++ ++ sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); ++ sb = sbinfo->si_sb; ++ ++ /* ++ * prevent a race condition between sysfs and aufs. ++ * for instance, sysfs_file_read() calls sysfs_get_active_two() which ++ * prohibits maintaining the sysfs entries. ++ * hew we acquire read lock after sysfs_get_active_two(). ++ * on the other hand, the remount process may maintain the sysfs/aufs ++ * entries after acquiring write lock. ++ * it can cause a deadlock. ++ * simply we gave up processing read here. 
++ */ ++ err = -EBUSY; ++ if (unlikely(!si_noflush_read_trylock(sb))) ++ goto out; ++ ++ seq = au_seq(buf, PAGE_SIZE); ++ err = PTR_ERR(seq); ++ if (IS_ERR(seq)) ++ goto out_unlock; ++ ++ name = (void *)attr->name; ++ cattr = sysaufs_si_attrs; ++ while (*cattr) { ++ if (!strcmp(name, (*cattr)->name)) { ++ err = container_of(*cattr, struct sysaufs_si_attr, attr) ++ ->show(seq, sb); ++ goto out_seq; ++ } ++ cattr++; ++ } ++ ++ bend = au_sbend(sb); ++ if (!strncmp(name, SysaufsBr_PREFIX, sizeof(SysaufsBr_PREFIX) - 1)) { ++ name += sizeof(SysaufsBr_PREFIX) - 1; ++ err = kstrtol(name, 10, &l); ++ if (!err) { ++ if (l <= bend) ++ err = sysaufs_si_br(seq, sb, (aufs_bindex_t)l); ++ else ++ err = -ENOENT; ++ } ++ goto out_seq; ++ } ++ BUG(); ++ ++out_seq: ++ if (!err) { ++ err = seq->count; ++ /* sysfs limit */ ++ if (unlikely(err == PAGE_SIZE)) ++ err = -EFBIG; ++ } ++ kfree(seq); ++out_unlock: ++ si_read_unlock(sb); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void sysaufs_br_init(struct au_branch *br) ++{ ++ struct attribute *attr = &br->br_attr; ++ ++ sysfs_attr_init(attr); ++ attr->name = br->br_name; ++ attr->mode = S_IRUGO; ++} ++ ++void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ struct au_branch *br; ++ struct kobject *kobj; ++ aufs_bindex_t bend; ++ ++ dbgaufs_brs_del(sb, bindex); ++ ++ if (!sysaufs_brs) ++ return; ++ ++ kobj = &au_sbi(sb)->si_kobj; ++ bend = au_sbend(sb); ++ for (; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ sysfs_remove_file(kobj, &br->br_attr); ++ } ++} ++ ++void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ int err; ++ aufs_bindex_t bend; ++ struct kobject *kobj; ++ struct au_branch *br; ++ ++ dbgaufs_brs_add(sb, bindex); ++ ++ if (!sysaufs_brs) ++ return; ++ ++ kobj = &au_sbi(sb)->si_kobj; ++ bend = au_sbend(sb); ++ for (; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ snprintf(br->br_name, sizeof(br->br_name), SysaufsBr_PREFIX ++ "%d", bindex); ++ err = sysfs_create_file(kobj, &br->br_attr); ++ if (unlikely(err)) ++ pr_warning("failed %s under sysfs(%d)\n", ++ br->br_name, err); ++ } ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/sysrq.c linux-3.2.0-gentoo-r1/fs/aufs/sysrq.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/sysrq.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/sysrq.c 2012-01-17 12:11:25.009380916 +0100 +@@ -0,0 +1,148 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * magic sysrq hanlder ++ */ ++ ++/* #include */ ++#include ++#include "aufs.h" ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void sysrq_sb(struct super_block *sb) ++{ ++ char *plevel; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ plevel = au_plevel; ++ au_plevel = KERN_WARNING; ++ ++ sbinfo = au_sbi(sb); ++ /* since we define pr_fmt, call printk directly */ ++ printk(KERN_WARNING "si=%lx\n", sysaufs_si_id(sbinfo)); ++ printk(KERN_WARNING AUFS_NAME ": superblock\n"); ++ au_dpri_sb(sb); ++ ++#if 0 ++ printk(KERN_WARNING AUFS_NAME ": root dentry\n"); ++ au_dpri_dentry(sb->s_root); ++ printk(KERN_WARNING AUFS_NAME ": root inode\n"); ++ au_dpri_inode(sb->s_root->d_inode); ++#endif ++ ++#if 0 ++ do { ++ int err, i, j, ndentry; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ ++ err = au_dpages_init(&dpages, GFP_ATOMIC); ++ if (unlikely(err)) ++ break; ++ err = au_dcsub_pages(&dpages, sb->s_root, NULL, NULL); ++ if (!err) ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) ++ au_dpri_dentry(dpage->dentries[j]); ++ } ++ au_dpages_free(&dpages); ++ } while (0); ++#endif ++ ++#if 1 ++ { ++ struct inode *i; ++ printk(KERN_WARNING AUFS_NAME ": isolated inode\n"); ++ spin_lock(&inode_sb_list_lock); ++ list_for_each_entry(i, &sb->s_inodes, i_sb_list) { ++ spin_lock(&i->i_lock); ++ if (1 || list_empty(&i->i_dentry)) ++ au_dpri_inode(i); ++ spin_unlock(&i->i_lock); ++ } ++ spin_unlock(&inode_sb_list_lock); ++ } ++#endif ++ printk(KERN_WARNING AUFS_NAME ": files\n"); ++ lg_global_lock(files_lglock); ++ do_file_list_for_each_entry(sb, file) { ++ umode_t mode; ++ mode = file->f_dentry->d_inode->i_mode; ++ if (!special_file(mode) || au_special_file(mode)) ++ au_dpri_file(file); ++ } while_file_list_for_each_entry; ++ lg_global_unlock(files_lglock); ++ printk(KERN_WARNING AUFS_NAME ": done\n"); ++ ++ au_plevel = plevel; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* module parameter */ ++static char *aufs_sysrq_key = "a"; ++module_param_named(sysrq, aufs_sysrq_key, charp, S_IRUGO); ++MODULE_PARM_DESC(sysrq, "MagicSysRq key for " AUFS_NAME); ++ ++static void au_sysrq(int key __maybe_unused) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ lockdep_off(); ++ au_sbilist_lock(); ++ list_for_each_entry(sbinfo, &au_sbilist.head, si_list) ++ sysrq_sb(sbinfo->si_sb); ++ au_sbilist_unlock(); ++ lockdep_on(); ++} ++ ++static struct sysrq_key_op au_sysrq_op = { ++ .handler = au_sysrq, ++ .help_msg = "Aufs", ++ .action_msg = "Aufs", ++ .enable_mask = SYSRQ_ENABLE_DUMP ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++int __init au_sysrq_init(void) ++{ ++ int err; ++ char key; ++ ++ err = -1; ++ key = *aufs_sysrq_key; ++ if ('a' <= key && key <= 'z') ++ err = register_sysrq_key(key, &au_sysrq_op); ++ if (unlikely(err)) ++ pr_err("err %d, sysrq=%c\n", err, key); ++ return err; ++} ++ ++void au_sysrq_fin(void) ++{ ++ int err; ++ err = unregister_sysrq_key(*aufs_sysrq_key, &au_sysrq_op); ++ if (unlikely(err)) ++ pr_err("err %d (ignored)\n", err); ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/vdir.c linux-3.2.0-gentoo-r1/fs/aufs/vdir.c +--- 
linux-3.2.0-gentoo-r1.orig//fs/aufs/vdir.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/vdir.c 2012-01-17 12:11:25.027899601 +0100 +@@ -0,0 +1,885 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * virtual or vertical directory ++ */ ++ ++#include "aufs.h" ++ ++static unsigned int calc_size(int nlen) ++{ ++ return ALIGN(sizeof(struct au_vdir_de) + nlen, sizeof(ino_t)); ++} ++ ++static int set_deblk_end(union au_vdir_deblk_p *p, ++ union au_vdir_deblk_p *deblk_end) ++{ ++ if (calc_size(0) <= deblk_end->deblk - p->deblk) { ++ p->de->de_str.len = 0; ++ /* smp_mb(); */ ++ return 0; ++ } ++ return -1; /* error */ ++} ++ ++/* returns true or false */ ++static int is_deblk_end(union au_vdir_deblk_p *p, ++ union au_vdir_deblk_p *deblk_end) ++{ ++ if (calc_size(0) <= deblk_end->deblk - p->deblk) ++ return !p->de->de_str.len; ++ return 1; ++} ++ ++static unsigned char *last_deblk(struct au_vdir *vdir) ++{ ++ return vdir->vd_deblk[vdir->vd_nblk - 1]; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* estimate the apropriate size for name hash table */ ++unsigned int au_rdhash_est(loff_t sz) ++{ ++ unsigned int n; ++ ++ n = UINT_MAX; ++ sz >>= 10; ++ if (sz < n) ++ n = sz; ++ if (sz < AUFS_RDHASH_DEF) ++ n = AUFS_RDHASH_DEF; ++ /* pr_info("n %u\n", n); */ ++ return n; ++} ++ ++/* ++ * the allocated memory has to be freed by ++ * au_nhash_wh_free() or au_nhash_de_free(). 
++ */ ++int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp) ++{ ++ struct hlist_head *head; ++ unsigned int u; ++ ++ head = kmalloc(sizeof(*nhash->nh_head) * num_hash, gfp); ++ if (head) { ++ nhash->nh_num = num_hash; ++ nhash->nh_head = head; ++ for (u = 0; u < num_hash; u++) ++ INIT_HLIST_HEAD(head++); ++ return 0; /* success */ ++ } ++ ++ return -ENOMEM; ++} ++ ++static void nhash_count(struct hlist_head *head) ++{ ++#if 0 ++ unsigned long n; ++ struct hlist_node *pos; ++ ++ n = 0; ++ hlist_for_each(pos, head) ++ n++; ++ pr_info("%lu\n", n); ++#endif ++} ++ ++static void au_nhash_wh_do_free(struct hlist_head *head) ++{ ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos, *node; ++ ++ hlist_for_each_entry_safe(tpos, pos, node, head, wh_hash) { ++ /* hlist_del(pos); */ ++ kfree(tpos); ++ } ++} ++ ++static void au_nhash_de_do_free(struct hlist_head *head) ++{ ++ struct au_vdir_dehstr *tpos; ++ struct hlist_node *pos, *node; ++ ++ hlist_for_each_entry_safe(tpos, pos, node, head, hash) { ++ /* hlist_del(pos); */ ++ au_cache_free_vdir_dehstr(tpos); ++ } ++} ++ ++static void au_nhash_do_free(struct au_nhash *nhash, ++ void (*free)(struct hlist_head *head)) ++{ ++ unsigned int n; ++ struct hlist_head *head; ++ ++ n = nhash->nh_num; ++ if (!n) ++ return; ++ ++ head = nhash->nh_head; ++ while (n-- > 0) { ++ nhash_count(head); ++ free(head++); ++ } ++ kfree(nhash->nh_head); ++} ++ ++void au_nhash_wh_free(struct au_nhash *whlist) ++{ ++ au_nhash_do_free(whlist, au_nhash_wh_do_free); ++} ++ ++static void au_nhash_de_free(struct au_nhash *delist) ++{ ++ au_nhash_do_free(delist, au_nhash_de_do_free); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, ++ int limit) ++{ ++ int num; ++ unsigned int u, n; ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos; ++ ++ num = 0; ++ n = whlist->nh_num; ++ head = whlist->nh_head; ++ for (u = 0; u < n; u++, head++) ++ hlist_for_each_entry(tpos, pos, head, wh_hash) ++ if (tpos->wh_bindex == btgt && ++num > limit) ++ return 1; ++ return 0; ++} ++ ++static struct hlist_head *au_name_hash(struct au_nhash *nhash, ++ unsigned char *name, ++ unsigned int len) ++{ ++ unsigned int v; ++ /* const unsigned int magic_bit = 12; */ ++ ++ AuDebugOn(!nhash->nh_num || !nhash->nh_head); ++ ++ v = 0; ++ while (len--) ++ v += *name++; ++ /* v = hash_long(v, magic_bit); */ ++ v %= nhash->nh_num; ++ return nhash->nh_head + v; ++} ++ ++static int au_nhash_test_name(struct au_vdir_destr *str, const char *name, ++ int nlen) ++{ ++ return str->len == nlen && !memcmp(str->name, name, nlen); ++} ++ ++/* returns found or not */ ++int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen) ++{ ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos; ++ struct au_vdir_destr *str; ++ ++ head = au_name_hash(whlist, name, nlen); ++ hlist_for_each_entry(tpos, pos, head, wh_hash) { ++ str = &tpos->wh_str; ++ AuDbg("%.*s\n", str->len, str->name); ++ if (au_nhash_test_name(str, name, nlen)) ++ return 1; ++ } ++ return 0; ++} ++ ++/* returns found(true) or not */ ++static int test_known(struct au_nhash *delist, char *name, int nlen) ++{ ++ struct hlist_head *head; ++ struct au_vdir_dehstr *tpos; ++ struct hlist_node *pos; ++ struct au_vdir_destr *str; ++ ++ head = au_name_hash(delist, name, nlen); ++ hlist_for_each_entry(tpos, pos, head, hash) { ++ str = tpos->str; ++ AuDbg("%.*s\n", str->len, 
str->name); ++ if (au_nhash_test_name(str, name, nlen)) ++ return 1; ++ } ++ return 0; ++} ++ ++static void au_shwh_init_wh(struct au_vdir_wh *wh, ino_t ino, ++ unsigned char d_type) ++{ ++#ifdef CONFIG_AUFS_SHWH ++ wh->wh_ino = ino; ++ wh->wh_type = d_type; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, ++ unsigned int d_type, aufs_bindex_t bindex, ++ unsigned char shwh) ++{ ++ int err; ++ struct au_vdir_destr *str; ++ struct au_vdir_wh *wh; ++ ++ AuDbg("%.*s\n", nlen, name); ++ AuDebugOn(!whlist->nh_num || !whlist->nh_head); ++ ++ err = -ENOMEM; ++ wh = kmalloc(sizeof(*wh) + nlen, GFP_NOFS); ++ if (unlikely(!wh)) ++ goto out; ++ ++ err = 0; ++ wh->wh_bindex = bindex; ++ if (shwh) ++ au_shwh_init_wh(wh, ino, d_type); ++ str = &wh->wh_str; ++ str->len = nlen; ++ memcpy(str->name, name, nlen); ++ hlist_add_head(&wh->wh_hash, au_name_hash(whlist, name, nlen)); ++ /* smp_mb(); */ ++ ++out: ++ return err; ++} ++ ++static int append_deblk(struct au_vdir *vdir) ++{ ++ int err; ++ unsigned long ul; ++ const unsigned int deblk_sz = vdir->vd_deblk_sz; ++ union au_vdir_deblk_p p, deblk_end; ++ unsigned char **o; ++ ++ err = -ENOMEM; ++ o = krealloc(vdir->vd_deblk, sizeof(*o) * (vdir->vd_nblk + 1), ++ GFP_NOFS); ++ if (unlikely(!o)) ++ goto out; ++ ++ vdir->vd_deblk = o; ++ p.deblk = kmalloc(deblk_sz, GFP_NOFS); ++ if (p.deblk) { ++ ul = vdir->vd_nblk++; ++ vdir->vd_deblk[ul] = p.deblk; ++ vdir->vd_last.ul = ul; ++ vdir->vd_last.p.deblk = p.deblk; ++ deblk_end.deblk = p.deblk + deblk_sz; ++ err = set_deblk_end(&p, &deblk_end); ++ } ++ ++out: ++ return err; ++} ++ ++static int append_de(struct au_vdir *vdir, char *name, int nlen, ino_t ino, ++ unsigned int d_type, struct au_nhash *delist) ++{ ++ int err; ++ unsigned int sz; ++ const unsigned int deblk_sz = vdir->vd_deblk_sz; ++ union au_vdir_deblk_p p, *room, deblk_end; ++ struct au_vdir_dehstr *dehstr; ++ ++ p.deblk = last_deblk(vdir); ++ deblk_end.deblk = p.deblk + deblk_sz; ++ room = &vdir->vd_last.p; ++ AuDebugOn(room->deblk < p.deblk || deblk_end.deblk <= room->deblk ++ || !is_deblk_end(room, &deblk_end)); ++ ++ sz = calc_size(nlen); ++ if (unlikely(sz > deblk_end.deblk - room->deblk)) { ++ err = append_deblk(vdir); ++ if (unlikely(err)) ++ goto out; ++ ++ p.deblk = last_deblk(vdir); ++ deblk_end.deblk = p.deblk + deblk_sz; ++ /* smp_mb(); */ ++ AuDebugOn(room->deblk != p.deblk); ++ } ++ ++ err = -ENOMEM; ++ dehstr = au_cache_alloc_vdir_dehstr(); ++ if (unlikely(!dehstr)) ++ goto out; ++ ++ dehstr->str = &room->de->de_str; ++ hlist_add_head(&dehstr->hash, au_name_hash(delist, name, nlen)); ++ room->de->de_ino = ino; ++ room->de->de_type = d_type; ++ room->de->de_str.len = nlen; ++ memcpy(room->de->de_str.name, name, nlen); ++ ++ err = 0; ++ room->deblk += sz; ++ if (unlikely(set_deblk_end(room, &deblk_end))) ++ err = append_deblk(vdir); ++ /* smp_mb(); */ ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_vdir_free(struct au_vdir *vdir) ++{ ++ unsigned char **deblk; ++ ++ deblk = vdir->vd_deblk; ++ while (vdir->vd_nblk--) ++ kfree(*deblk++); ++ kfree(vdir->vd_deblk); ++ au_cache_free_vdir(vdir); ++} ++ ++static struct au_vdir *alloc_vdir(struct file *file) ++{ ++ struct au_vdir *vdir; ++ struct super_block *sb; ++ int err; ++ ++ sb = file->f_dentry->d_sb; ++ SiMustAnyLock(sb); ++ ++ err = -ENOMEM; ++ vdir = au_cache_alloc_vdir(); ++ if 
(unlikely(!vdir)) ++ goto out; ++ ++ vdir->vd_deblk = kzalloc(sizeof(*vdir->vd_deblk), GFP_NOFS); ++ if (unlikely(!vdir->vd_deblk)) ++ goto out_free; ++ ++ vdir->vd_deblk_sz = au_sbi(sb)->si_rdblk; ++ if (!vdir->vd_deblk_sz) { ++ /* estimate the apropriate size for deblk */ ++ vdir->vd_deblk_sz = au_dir_size(file, /*dentry*/NULL); ++ /* pr_info("vd_deblk_sz %u\n", vdir->vd_deblk_sz); */ ++ } ++ vdir->vd_nblk = 0; ++ vdir->vd_version = 0; ++ vdir->vd_jiffy = 0; ++ err = append_deblk(vdir); ++ if (!err) ++ return vdir; /* success */ ++ ++ kfree(vdir->vd_deblk); ++ ++out_free: ++ au_cache_free_vdir(vdir); ++out: ++ vdir = ERR_PTR(err); ++ return vdir; ++} ++ ++static int reinit_vdir(struct au_vdir *vdir) ++{ ++ int err; ++ union au_vdir_deblk_p p, deblk_end; ++ ++ while (vdir->vd_nblk > 1) { ++ kfree(vdir->vd_deblk[vdir->vd_nblk - 1]); ++ /* vdir->vd_deblk[vdir->vd_nblk - 1] = NULL; */ ++ vdir->vd_nblk--; ++ } ++ p.deblk = vdir->vd_deblk[0]; ++ deblk_end.deblk = p.deblk + vdir->vd_deblk_sz; ++ err = set_deblk_end(&p, &deblk_end); ++ /* keep vd_dblk_sz */ ++ vdir->vd_last.ul = 0; ++ vdir->vd_last.p.deblk = vdir->vd_deblk[0]; ++ vdir->vd_version = 0; ++ vdir->vd_jiffy = 0; ++ /* smp_mb(); */ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AuFillVdir_CALLED 1 ++#define AuFillVdir_WHABLE (1 << 1) ++#define AuFillVdir_SHWH (1 << 2) ++#define au_ftest_fillvdir(flags, name) ((flags) & AuFillVdir_##name) ++#define au_fset_fillvdir(flags, name) \ ++ do { (flags) |= AuFillVdir_##name; } while (0) ++#define au_fclr_fillvdir(flags, name) \ ++ do { (flags) &= ~AuFillVdir_##name; } while (0) ++ ++#ifndef CONFIG_AUFS_SHWH ++#undef AuFillVdir_SHWH ++#define AuFillVdir_SHWH 0 ++#endif ++ ++struct fillvdir_arg { ++ struct file *file; ++ struct au_vdir *vdir; ++ struct au_nhash delist; ++ struct au_nhash whlist; ++ aufs_bindex_t bindex; ++ unsigned int flags; ++ int err; ++}; ++ ++static int fillvdir(void *__arg, const char *__name, int nlen, ++ loff_t offset __maybe_unused, u64 h_ino, ++ unsigned int d_type) ++{ ++ struct fillvdir_arg *arg = __arg; ++ char *name = (void *)__name; ++ struct super_block *sb; ++ ino_t ino; ++ const unsigned char shwh = !!au_ftest_fillvdir(arg->flags, SHWH); ++ ++ arg->err = 0; ++ sb = arg->file->f_dentry->d_sb; ++ au_fset_fillvdir(arg->flags, CALLED); ++ /* smp_mb(); */ ++ if (nlen <= AUFS_WH_PFX_LEN ++ || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { ++ if (test_known(&arg->delist, name, nlen) ++ || au_nhash_test_known_wh(&arg->whlist, name, nlen)) ++ goto out; /* already exists or whiteouted */ ++ ++ sb = arg->file->f_dentry->d_sb; ++ arg->err = au_ino(sb, arg->bindex, h_ino, d_type, &ino); ++ if (!arg->err) { ++ if (unlikely(nlen > AUFS_MAX_NAMELEN)) ++ d_type = DT_UNKNOWN; ++ arg->err = append_de(arg->vdir, name, nlen, ino, ++ d_type, &arg->delist); ++ } ++ } else if (au_ftest_fillvdir(arg->flags, WHABLE)) { ++ name += AUFS_WH_PFX_LEN; ++ nlen -= AUFS_WH_PFX_LEN; ++ if (au_nhash_test_known_wh(&arg->whlist, name, nlen)) ++ goto out; /* already whiteouted */ ++ ++ if (shwh) ++ arg->err = au_wh_ino(sb, arg->bindex, h_ino, d_type, ++ &ino); ++ if (!arg->err) { ++ if (nlen <= AUFS_MAX_NAMELEN + AUFS_WH_PFX_LEN) ++ d_type = DT_UNKNOWN; ++ arg->err = au_nhash_append_wh ++ (&arg->whlist, name, nlen, ino, d_type, ++ arg->bindex, shwh); ++ } ++ } ++ ++out: ++ if (!arg->err) ++ arg->vdir->vd_jiffy = jiffies; ++ /* smp_mb(); */ ++ AuTraceErr(arg->err); ++ return arg->err; ++} ++ ++static int au_handle_shwh(struct 
super_block *sb, struct au_vdir *vdir, ++ struct au_nhash *whlist, struct au_nhash *delist) ++{ ++#ifdef CONFIG_AUFS_SHWH ++ int err; ++ unsigned int nh, u; ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos, *n; ++ char *p, *o; ++ struct au_vdir_destr *destr; ++ ++ AuDebugOn(!au_opt_test(au_mntflags(sb), SHWH)); ++ ++ err = -ENOMEM; ++ o = p = __getname_gfp(GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ ++ err = 0; ++ nh = whlist->nh_num; ++ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); ++ p += AUFS_WH_PFX_LEN; ++ for (u = 0; u < nh; u++) { ++ head = whlist->nh_head + u; ++ hlist_for_each_entry_safe(tpos, pos, n, head, wh_hash) { ++ destr = &tpos->wh_str; ++ memcpy(p, destr->name, destr->len); ++ err = append_de(vdir, o, destr->len + AUFS_WH_PFX_LEN, ++ tpos->wh_ino, tpos->wh_type, delist); ++ if (unlikely(err)) ++ break; ++ } ++ } ++ ++ __putname(o); ++ ++out: ++ AuTraceErr(err); ++ return err; ++#else ++ return 0; ++#endif ++} ++ ++static int au_do_read_vdir(struct fillvdir_arg *arg) ++{ ++ int err; ++ unsigned int rdhash; ++ loff_t offset; ++ aufs_bindex_t bend, bindex, bstart; ++ unsigned char shwh; ++ struct file *hf, *file; ++ struct super_block *sb; ++ ++ file = arg->file; ++ sb = file->f_dentry->d_sb; ++ SiMustAnyLock(sb); ++ ++ rdhash = au_sbi(sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(file, /*dentry*/NULL)); ++ err = au_nhash_alloc(&arg->delist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_nhash_alloc(&arg->whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out_delist; ++ ++ err = 0; ++ arg->flags = 0; ++ shwh = 0; ++ if (au_opt_test(au_mntflags(sb), SHWH)) { ++ shwh = 1; ++ au_fset_fillvdir(arg->flags, SHWH); ++ } ++ bstart = au_fbstart(file); ++ bend = au_fbend_dir(file); ++ for (bindex = bstart; !err && bindex <= bend; bindex++) { ++ hf = au_hf_dir(file, bindex); ++ if (!hf) ++ continue; ++ ++ offset = vfsub_llseek(hf, 0, SEEK_SET); ++ err = offset; ++ if (unlikely(offset)) ++ break; ++ ++ arg->bindex = bindex; ++ au_fclr_fillvdir(arg->flags, WHABLE); ++ if (shwh ++ || (bindex != bend ++ && au_br_whable(au_sbr_perm(sb, bindex)))) ++ au_fset_fillvdir(arg->flags, WHABLE); ++ do { ++ arg->err = 0; ++ au_fclr_fillvdir(arg->flags, CALLED); ++ /* smp_mb(); */ ++ err = vfsub_readdir(hf, fillvdir, arg); ++ if (err >= 0) ++ err = arg->err; ++ } while (!err && au_ftest_fillvdir(arg->flags, CALLED)); ++ } ++ ++ if (!err && shwh) ++ err = au_handle_shwh(sb, arg->vdir, &arg->whlist, &arg->delist); ++ ++ au_nhash_wh_free(&arg->whlist); ++ ++out_delist: ++ au_nhash_de_free(&arg->delist); ++out: ++ return err; ++} ++ ++static int read_vdir(struct file *file, int may_read) ++{ ++ int err; ++ unsigned long expire; ++ unsigned char do_read; ++ struct fillvdir_arg arg; ++ struct inode *inode; ++ struct au_vdir *vdir, *allocated; ++ ++ err = 0; ++ inode = file->f_dentry->d_inode; ++ IMustLock(inode); ++ SiMustAnyLock(inode->i_sb); ++ ++ allocated = NULL; ++ do_read = 0; ++ expire = au_sbi(inode->i_sb)->si_rdcache; ++ vdir = au_ivdir(inode); ++ if (!vdir) { ++ do_read = 1; ++ vdir = alloc_vdir(file); ++ err = PTR_ERR(vdir); ++ if (IS_ERR(vdir)) ++ goto out; ++ err = 0; ++ allocated = vdir; ++ } else if (may_read ++ && (inode->i_version != vdir->vd_version ++ || time_after(jiffies, vdir->vd_jiffy + expire))) { ++ do_read = 1; ++ err = reinit_vdir(vdir); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ if (!do_read) ++ return 0; /* success */ ++ ++ arg.file = file; ++ arg.vdir = vdir; ++ err = au_do_read_vdir(&arg); ++ if 
(!err) { ++ /* file->f_pos = 0; */ ++ vdir->vd_version = inode->i_version; ++ vdir->vd_last.ul = 0; ++ vdir->vd_last.p.deblk = vdir->vd_deblk[0]; ++ if (allocated) ++ au_set_ivdir(inode, allocated); ++ } else if (allocated) ++ au_vdir_free(allocated); ++ ++out: ++ return err; ++} ++ ++static int copy_vdir(struct au_vdir *tgt, struct au_vdir *src) ++{ ++ int err, rerr; ++ unsigned long ul, n; ++ const unsigned int deblk_sz = src->vd_deblk_sz; ++ ++ AuDebugOn(tgt->vd_nblk != 1); ++ ++ err = -ENOMEM; ++ if (tgt->vd_nblk < src->vd_nblk) { ++ unsigned char **p; ++ ++ p = krealloc(tgt->vd_deblk, sizeof(*p) * src->vd_nblk, ++ GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ tgt->vd_deblk = p; ++ } ++ ++ if (tgt->vd_deblk_sz != deblk_sz) { ++ unsigned char *p; ++ ++ tgt->vd_deblk_sz = deblk_sz; ++ p = krealloc(tgt->vd_deblk[0], deblk_sz, GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ tgt->vd_deblk[0] = p; ++ } ++ memcpy(tgt->vd_deblk[0], src->vd_deblk[0], deblk_sz); ++ tgt->vd_version = src->vd_version; ++ tgt->vd_jiffy = src->vd_jiffy; ++ ++ n = src->vd_nblk; ++ for (ul = 1; ul < n; ul++) { ++ tgt->vd_deblk[ul] = kmemdup(src->vd_deblk[ul], deblk_sz, ++ GFP_NOFS); ++ if (unlikely(!tgt->vd_deblk[ul])) ++ goto out; ++ tgt->vd_nblk++; ++ } ++ tgt->vd_nblk = n; ++ tgt->vd_last.ul = tgt->vd_last.ul; ++ tgt->vd_last.p.deblk = tgt->vd_deblk[tgt->vd_last.ul]; ++ tgt->vd_last.p.deblk += src->vd_last.p.deblk ++ - src->vd_deblk[src->vd_last.ul]; ++ /* smp_mb(); */ ++ return 0; /* success */ ++ ++out: ++ rerr = reinit_vdir(tgt); ++ BUG_ON(rerr); ++ return err; ++} ++ ++int au_vdir_init(struct file *file) ++{ ++ int err; ++ struct inode *inode; ++ struct au_vdir *vdir_cache, *allocated; ++ ++ err = read_vdir(file, !file->f_pos); ++ if (unlikely(err)) ++ goto out; ++ ++ allocated = NULL; ++ vdir_cache = au_fvdir_cache(file); ++ if (!vdir_cache) { ++ vdir_cache = alloc_vdir(file); ++ err = PTR_ERR(vdir_cache); ++ if (IS_ERR(vdir_cache)) ++ goto out; ++ allocated = vdir_cache; ++ } else if (!file->f_pos && vdir_cache->vd_version != file->f_version) { ++ err = reinit_vdir(vdir_cache); ++ if (unlikely(err)) ++ goto out; ++ } else ++ return 0; /* success */ ++ ++ inode = file->f_dentry->d_inode; ++ err = copy_vdir(vdir_cache, au_ivdir(inode)); ++ if (!err) { ++ file->f_version = inode->i_version; ++ if (allocated) ++ au_set_fvdir_cache(file, allocated); ++ } else if (allocated) ++ au_vdir_free(allocated); ++ ++out: ++ return err; ++} ++ ++static loff_t calc_offset(struct au_vdir *vdir) ++{ ++ loff_t offset; ++ union au_vdir_deblk_p p; ++ ++ p.deblk = vdir->vd_deblk[vdir->vd_last.ul]; ++ offset = vdir->vd_last.p.deblk - p.deblk; ++ offset += vdir->vd_deblk_sz * vdir->vd_last.ul; ++ return offset; ++} ++ ++/* returns true or false */ ++static int seek_vdir(struct file *file) ++{ ++ int valid; ++ unsigned int deblk_sz; ++ unsigned long ul, n; ++ loff_t offset; ++ union au_vdir_deblk_p p, deblk_end; ++ struct au_vdir *vdir_cache; ++ ++ valid = 1; ++ vdir_cache = au_fvdir_cache(file); ++ offset = calc_offset(vdir_cache); ++ AuDbg("offset %lld\n", offset); ++ if (file->f_pos == offset) ++ goto out; ++ ++ vdir_cache->vd_last.ul = 0; ++ vdir_cache->vd_last.p.deblk = vdir_cache->vd_deblk[0]; ++ if (!file->f_pos) ++ goto out; ++ ++ valid = 0; ++ deblk_sz = vdir_cache->vd_deblk_sz; ++ ul = div64_u64(file->f_pos, deblk_sz); ++ AuDbg("ul %lu\n", ul); ++ if (ul >= vdir_cache->vd_nblk) ++ goto out; ++ ++ n = vdir_cache->vd_nblk; ++ for (; ul < n; ul++) { ++ p.deblk = vdir_cache->vd_deblk[ul]; ++ deblk_end.deblk = p.deblk + 
deblk_sz; ++ offset = ul; ++ offset *= deblk_sz; ++ while (!is_deblk_end(&p, &deblk_end) && offset < file->f_pos) { ++ unsigned int l; ++ ++ l = calc_size(p.de->de_str.len); ++ offset += l; ++ p.deblk += l; ++ } ++ if (!is_deblk_end(&p, &deblk_end)) { ++ valid = 1; ++ vdir_cache->vd_last.ul = ul; ++ vdir_cache->vd_last.p = p; ++ break; ++ } ++ } ++ ++out: ++ /* smp_mb(); */ ++ AuTraceErr(!valid); ++ return valid; ++} ++ ++int au_vdir_fill_de(struct file *file, void *dirent, filldir_t filldir) ++{ ++ int err; ++ unsigned int l, deblk_sz; ++ union au_vdir_deblk_p deblk_end; ++ struct au_vdir *vdir_cache; ++ struct au_vdir_de *de; ++ ++ vdir_cache = au_fvdir_cache(file); ++ if (!seek_vdir(file)) ++ return 0; ++ ++ deblk_sz = vdir_cache->vd_deblk_sz; ++ while (1) { ++ deblk_end.deblk = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; ++ deblk_end.deblk += deblk_sz; ++ while (!is_deblk_end(&vdir_cache->vd_last.p, &deblk_end)) { ++ de = vdir_cache->vd_last.p.de; ++ AuDbg("%.*s, off%lld, i%lu, dt%d\n", ++ de->de_str.len, de->de_str.name, file->f_pos, ++ (unsigned long)de->de_ino, de->de_type); ++ err = filldir(dirent, de->de_str.name, de->de_str.len, ++ file->f_pos, de->de_ino, de->de_type); ++ if (unlikely(err)) { ++ AuTraceErr(err); ++ /* todo: ignore the error caused by udba? */ ++ /* return err; */ ++ return 0; ++ } ++ ++ l = calc_size(de->de_str.len); ++ vdir_cache->vd_last.p.deblk += l; ++ file->f_pos += l; ++ } ++ if (vdir_cache->vd_last.ul < vdir_cache->vd_nblk - 1) { ++ vdir_cache->vd_last.ul++; ++ vdir_cache->vd_last.p.deblk ++ = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; ++ file->f_pos = deblk_sz * vdir_cache->vd_last.ul; ++ continue; ++ } ++ break; ++ } ++ ++ /* smp_mb(); */ ++ return 0; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/vfsub.c linux-3.2.0-gentoo-r1/fs/aufs/vfsub.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/vfsub.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/vfsub.c 2012-01-17 12:11:25.032529271 +0100 +@@ -0,0 +1,835 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sub-routines for VFS ++ */ ++ ++#include ++#include ++#include ++#include ++#include "aufs.h" ++ ++int vfsub_update_h_iattr(struct path *h_path, int *did) ++{ ++ int err; ++ struct kstat st; ++ struct super_block *h_sb; ++ ++ /* for remote fs, leave work for its getattr or d_revalidate */ ++ /* for bad i_attr fs, handle them in aufs_getattr() */ ++ /* still some fs may acquire i_mutex. 
we need to skip them */ ++ err = 0; ++ if (!did) ++ did = &err; ++ h_sb = h_path->dentry->d_sb; ++ *did = (!au_test_fs_remote(h_sb) && au_test_fs_refresh_iattr(h_sb)); ++ if (*did) ++ err = vfs_getattr(h_path->mnt, h_path->dentry, &st); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct file *vfsub_dentry_open(struct path *path, int flags) ++{ ++ struct file *file; ++ ++ path_get(path); ++ file = dentry_open(path->dentry, path->mnt, ++ flags /* | __FMODE_NONOTIFY */, ++ current_cred()); ++ if (!IS_ERR_OR_NULL(file) ++ && (file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) ++ i_readcount_inc(path->dentry->d_inode); ++ ++ return file; ++} ++ ++struct file *vfsub_filp_open(const char *path, int oflags, int mode) ++{ ++ struct file *file; ++ ++ lockdep_off(); ++ file = filp_open(path, ++ oflags /* | __FMODE_NONOTIFY */, ++ mode); ++ lockdep_on(); ++ if (IS_ERR(file)) ++ goto out; ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ ++out: ++ return file; ++} ++ ++int vfsub_kern_path(const char *name, unsigned int flags, struct path *path) ++{ ++ int err; ++ ++ err = kern_path(name, flags, path); ++ if (!err && path->dentry->d_inode) ++ vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, ++ int len) ++{ ++ struct path path = { ++ .mnt = NULL ++ }; ++ ++ /* VFS checks it too, but by WARN_ON_ONCE() */ ++ IMustLock(parent->d_inode); ++ ++ path.dentry = lookup_one_len(name, parent, len); ++ if (IS_ERR(path.dentry)) ++ goto out; ++ if (path.dentry->d_inode) ++ vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ ++ ++out: ++ AuTraceErrPtr(path.dentry); ++ return path.dentry; ++} ++ ++struct dentry *vfsub_lookup_hash(struct nameidata *nd) ++{ ++ struct path path = { ++ .mnt = nd->path.mnt ++ }; ++ ++ IMustLock(nd->path.dentry->d_inode); ++ ++ path.dentry = lookup_hash(nd); ++ if (IS_ERR(path.dentry)) ++ goto out; ++ if (path.dentry->d_inode) ++ vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ ++ ++out: ++ AuTraceErrPtr(path.dentry); ++ return path.dentry; ++} ++ ++/* ++ * this is "VFS:__lookup_one_len()" which was removed and merged into ++ * VFS:lookup_one_len() by the commit. ++ * 6a96ba5 2011-03-14 kill __lookup_one_len() ++ * this function should always be equivalent to the corresponding part in ++ * VFS:lookup_one_len(). 
++ */ ++int vfsub_name_hash(const char *name, struct qstr *this, int len) ++{ ++ unsigned long hash; ++ unsigned int c; ++ ++ this->name = name; ++ this->len = len; ++ if (!len) ++ return -EACCES; ++ ++ hash = init_name_hash(); ++ while (len--) { ++ c = *(const unsigned char *)name++; ++ if (c == '/' || c == '\0') ++ return -EACCES; ++ hash = partial_name_hash(c, hash); ++ } ++ this->hash = end_name_hash(hash); ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2) ++{ ++ struct dentry *d; ++ ++ lockdep_off(); ++ d = lock_rename(d1, d2); ++ lockdep_on(); ++ au_hn_suspend(hdir1); ++ if (hdir1 != hdir2) ++ au_hn_suspend(hdir2); ++ ++ return d; ++} ++ ++void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2) ++{ ++ au_hn_resume(hdir1); ++ if (hdir1 != hdir2) ++ au_hn_resume(hdir2); ++ lockdep_off(); ++ unlock_rename(d1, d2); ++ lockdep_on(); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int vfsub_create(struct inode *dir, struct path *path, int mode) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_mknod(path, d, mode, 0); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ if (au_test_fs_null_nd(dir->i_sb)) ++ err = vfs_create(dir, path->dentry, mode, NULL); ++ else { ++ struct nameidata h_nd; ++ ++ memset(&h_nd, 0, sizeof(h_nd)); ++ h_nd.flags = LOOKUP_CREATE; ++ h_nd.intent.open.flags = O_CREAT ++ | vfsub_fmode_to_uint(FMODE_READ); ++ h_nd.intent.open.create_mode = mode; ++ h_nd.path.dentry = path->dentry->d_parent; ++ h_nd.path.mnt = path->mnt; ++ path_get(&h_nd.path); ++ err = vfs_create(dir, path->dentry, mode, &h_nd); ++ path_put(&h_nd.path); ++ } ++ ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_symlink(struct inode *dir, struct path *path, const char *symname) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_symlink(path, d, symname); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ err = vfs_symlink(dir, path->dentry, symname); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_mknod(path, d, mode, new_encode_dev(dev)); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ err = vfs_mknod(dir, path->dentry, mode, dev); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++static int au_test_nlink(struct inode *inode) ++{ ++ const unsigned int link_max = UINT_MAX >> 1; /* rough margin */ ++ ++ if 
(!au_test_fs_no_limit_nlink(inode->i_sb) ++ || inode->i_nlink < link_max) ++ return 0; ++ return -EMLINK; ++} ++ ++int vfsub_link(struct dentry *src_dentry, struct inode *dir, struct path *path) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ err = au_test_nlink(src_dentry->d_inode); ++ if (unlikely(err)) ++ return err; ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_link(src_dentry, path, d); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_link(src_dentry, dir, path->dentry); ++ lockdep_on(); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ /* fuse has different memory inode for the same inumber */ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ tmp.dentry = src_dentry; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_rename(struct inode *src_dir, struct dentry *src_dentry, ++ struct inode *dir, struct path *path) ++{ ++ int err; ++ struct path tmp = { ++ .mnt = path->mnt ++ }; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ IMustLock(src_dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ tmp.dentry = src_dentry->d_parent; ++ err = security_path_rename(&tmp, src_dentry, path, d); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_rename(src_dir, src_dentry, dir, path->dentry); ++ lockdep_on(); ++ if (!err) { ++ int did; ++ ++ tmp.dentry = d->d_parent; ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = src_dentry; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ tmp.dentry = src_dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_mkdir(struct inode *dir, struct path *path, int mode) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_mkdir(path, d, mode); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ err = vfs_mkdir(dir, path->dentry, mode); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_rmdir(struct inode *dir, struct path *path) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_rmdir(path, d); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_rmdir(dir, path->dentry); ++ lockdep_on(); ++ if (!err) { ++ struct path tmp = { ++ .dentry = path->dentry->d_parent, ++ .mnt = path->mnt ++ }; ++ ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: support mmap_sem? */ ++ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ ++ lockdep_off(); ++ err = vfs_read(file, ubuf, count, ppos); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++/* todo: kernel_read()? 
*/ ++ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ union { ++ void *k; ++ char __user *u; ++ } buf; ++ ++ buf.k = kbuf; ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = vfsub_read_u(file, buf.u, count, ppos); ++ set_fs(oldfs); ++ return err; ++} ++ ++ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ ++ lockdep_off(); ++ err = vfs_write(file, ubuf, count, ppos); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, loff_t *ppos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ union { ++ void *k; ++ const char __user *u; ++ } buf; ++ ++ buf.k = kbuf; ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = vfsub_write_u(file, buf.u, count, ppos); ++ set_fs(oldfs); ++ return err; ++} ++ ++int vfsub_flush(struct file *file, fl_owner_t id) ++{ ++ int err; ++ ++ err = 0; ++ if (file->f_op && file->f_op->flush) { ++ if (!au_test_nfs(file->f_dentry->d_sb)) ++ err = file->f_op->flush(file, id); ++ else { ++ lockdep_off(); ++ err = file->f_op->flush(file, id); ++ lockdep_on(); ++ } ++ if (!err) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); ++ /*ignore*/ ++ } ++ return err; ++} ++ ++int vfsub_readdir(struct file *file, filldir_t filldir, void *arg) ++{ ++ int err; ++ ++ lockdep_off(); ++ err = vfs_readdir(file, filldir, arg); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++long vfsub_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) ++{ ++ long err; ++ ++ lockdep_off(); ++ err = do_splice_to(in, ppos, pipe, len, flags); ++ lockdep_on(); ++ file_accessed(in); ++ if (err >= 0) ++ vfsub_update_h_iattr(&in->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags) ++{ ++ long err; ++ ++ lockdep_off(); ++ err = do_splice_from(pipe, out, ppos, len, flags); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&out->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++int vfsub_fsync(struct file *file, struct path *path, int datasync) ++{ ++ int err; ++ ++ /* file can be NULL */ ++ lockdep_off(); ++ err = vfs_fsync(file, datasync); ++ lockdep_on(); ++ if (!err) { ++ if (!path) { ++ AuDebugOn(!file); ++ path = &file->f_path; ++ } ++ vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ ++ } ++ return err; ++} ++ ++/* cf. 
open.c:do_sys_truncate() and do_sys_ftruncate() */ ++int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, ++ struct file *h_file) ++{ ++ int err; ++ struct inode *h_inode; ++ ++ h_inode = h_path->dentry->d_inode; ++ if (!h_file) { ++ err = mnt_want_write(h_path->mnt); ++ if (err) ++ goto out; ++ err = inode_permission(h_inode, MAY_WRITE); ++ if (err) ++ goto out_mnt; ++ err = get_write_access(h_inode); ++ if (err) ++ goto out_mnt; ++ err = break_lease(h_inode, O_WRONLY); ++ if (err) ++ goto out_inode; ++ } ++ ++ err = locks_verify_truncate(h_inode, h_file, length); ++ if (!err) ++ err = security_path_truncate(h_path); ++ if (!err) { ++ lockdep_off(); ++ err = do_truncate(h_path->dentry, length, attr, h_file); ++ lockdep_on(); ++ } ++ ++out_inode: ++ if (!h_file) ++ put_write_access(h_inode); ++out_mnt: ++ if (!h_file) ++ mnt_drop_write(h_path->mnt); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_vfsub_mkdir_args { ++ int *errp; ++ struct inode *dir; ++ struct path *path; ++ int mode; ++}; ++ ++static void au_call_vfsub_mkdir(void *args) ++{ ++ struct au_vfsub_mkdir_args *a = args; ++ *a->errp = vfsub_mkdir(a->dir, a->path, a->mode); ++} ++ ++int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode) ++{ ++ int err, do_sio, wkq_err; ++ ++ do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); ++ if (!do_sio) ++ err = vfsub_mkdir(dir, path, mode); ++ else { ++ struct au_vfsub_mkdir_args args = { ++ .errp = &err, ++ .dir = dir, ++ .path = path, ++ .mode = mode ++ }; ++ wkq_err = au_wkq_wait(au_call_vfsub_mkdir, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++ ++struct au_vfsub_rmdir_args { ++ int *errp; ++ struct inode *dir; ++ struct path *path; ++}; ++ ++static void au_call_vfsub_rmdir(void *args) ++{ ++ struct au_vfsub_rmdir_args *a = args; ++ *a->errp = vfsub_rmdir(a->dir, a->path); ++} ++ ++int vfsub_sio_rmdir(struct inode *dir, struct path *path) ++{ ++ int err, do_sio, wkq_err; ++ ++ do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); ++ if (!do_sio) ++ err = vfsub_rmdir(dir, path); ++ else { ++ struct au_vfsub_rmdir_args args = { ++ .errp = &err, ++ .dir = dir, ++ .path = path ++ }; ++ wkq_err = au_wkq_wait(au_call_vfsub_rmdir, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct notify_change_args { ++ int *errp; ++ struct path *path; ++ struct iattr *ia; ++}; ++ ++static void call_notify_change(void *args) ++{ ++ struct notify_change_args *a = args; ++ struct inode *h_inode; ++ ++ h_inode = a->path->dentry->d_inode; ++ IMustLock(h_inode); ++ ++ *a->errp = -EPERM; ++ if (!IS_IMMUTABLE(h_inode) && !IS_APPEND(h_inode)) { ++ *a->errp = notify_change(a->path->dentry, a->ia); ++ if (!*a->errp) ++ vfsub_update_h_iattr(a->path, /*did*/NULL); /*ignore*/ ++ } ++ AuTraceErr(*a->errp); ++} ++ ++int vfsub_notify_change(struct path *path, struct iattr *ia) ++{ ++ int err; ++ struct notify_change_args args = { ++ .errp = &err, ++ .path = path, ++ .ia = ia ++ }; ++ ++ call_notify_change(&args); ++ ++ return err; ++} ++ ++int vfsub_sio_notify_change(struct path *path, struct iattr *ia) ++{ ++ int err, wkq_err; ++ struct notify_change_args args = { ++ .errp = &err, ++ .path = path, ++ .ia = ia ++ }; ++ ++ wkq_err = au_wkq_wait(call_notify_change, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ ++ return err; ++} ++ ++/* 
---------------------------------------------------------------------- */ ++ ++struct unlink_args { ++ int *errp; ++ struct inode *dir; ++ struct path *path; ++}; ++ ++static void call_unlink(void *args) ++{ ++ struct unlink_args *a = args; ++ struct dentry *d = a->path->dentry; ++ struct inode *h_inode; ++ const int stop_sillyrename = (au_test_nfs(d->d_sb) ++ && d->d_count == 1); ++ ++ IMustLock(a->dir); ++ ++ a->path->dentry = d->d_parent; ++ *a->errp = security_path_unlink(a->path, d); ++ a->path->dentry = d; ++ if (unlikely(*a->errp)) ++ return; ++ ++ if (!stop_sillyrename) ++ dget(d); ++ h_inode = d->d_inode; ++ if (h_inode) ++ ihold(h_inode); ++ ++ lockdep_off(); ++ *a->errp = vfs_unlink(a->dir, d); ++ lockdep_on(); ++ if (!*a->errp) { ++ struct path tmp = { ++ .dentry = d->d_parent, ++ .mnt = a->path->mnt ++ }; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ ++ } ++ ++ if (!stop_sillyrename) ++ dput(d); ++ if (h_inode) ++ iput(h_inode); ++ ++ AuTraceErr(*a->errp); ++} ++ ++/* ++ * @dir: must be locked. ++ * @dentry: target dentry. ++ */ ++int vfsub_unlink(struct inode *dir, struct path *path, int force) ++{ ++ int err; ++ struct unlink_args args = { ++ .errp = &err, ++ .dir = dir, ++ .path = path ++ }; ++ ++ if (!force) ++ call_unlink(&args); ++ else { ++ int wkq_err; ++ ++ wkq_err = au_wkq_wait(call_unlink, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/vfsub.h linux-3.2.0-gentoo-r1/fs/aufs/vfsub.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/vfsub.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/vfsub.h 2012-01-17 12:11:25.044103451 +0100 +@@ -0,0 +1,232 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sub-routines for VFS ++ */ ++ ++#ifndef __AUFS_VFSUB_H__ ++#define __AUFS_VFSUB_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include "debug.h" ++ ++/* copied from linux/fs/internal.h */ ++/* todo: BAD approach!! */ ++DECLARE_BRLOCK(vfsmount_lock); ++extern void file_sb_list_del(struct file *f); ++extern spinlock_t inode_sb_list_lock; ++ ++/* copied from linux/fs/file_table.c */ ++DECLARE_LGLOCK(files_lglock); ++#ifdef CONFIG_SMP ++/* ++ * These macros iterate all files on all CPUs for a given superblock. ++ * files_lglock must be held globally. 
++ */ ++#define do_file_list_for_each_entry(__sb, __file) \ ++{ \ ++ int i; \ ++ for_each_possible_cpu(i) { \ ++ struct list_head *list; \ ++ list = per_cpu_ptr((__sb)->s_files, i); \ ++ list_for_each_entry((__file), list, f_u.fu_list) ++ ++#define while_file_list_for_each_entry \ ++ } \ ++} ++ ++#else ++ ++#define do_file_list_for_each_entry(__sb, __file) \ ++{ \ ++ struct list_head *list; \ ++ list = &(sb)->s_files; \ ++ list_for_each_entry((__file), list, f_u.fu_list) ++ ++#define while_file_list_for_each_entry \ ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock subclass for lower inode */ ++/* default MAX_LOCKDEP_SUBCLASSES(8) is not enough */ ++/* reduce? gave up. */ ++enum { ++ AuLsc_I_Begin = I_MUTEX_QUOTA, /* 4 */ ++ AuLsc_I_PARENT, /* lower inode, parent first */ ++ AuLsc_I_PARENT2, /* copyup dirs */ ++ AuLsc_I_PARENT3, /* copyup wh */ ++ AuLsc_I_CHILD, ++ AuLsc_I_CHILD2, ++ AuLsc_I_End ++}; ++ ++/* to debug easier, do not make them inlined functions */ ++#define MtxMustLock(mtx) AuDebugOn(!mutex_is_locked(mtx)) ++#define IMustLock(i) MtxMustLock(&(i)->i_mutex) ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline void vfsub_drop_nlink(struct inode *inode) ++{ ++ AuDebugOn(!inode->i_nlink); ++ drop_nlink(inode); ++} ++ ++static inline void vfsub_dead_dir(struct inode *inode) ++{ ++ AuDebugOn(!S_ISDIR(inode->i_mode)); ++ inode->i_flags |= S_DEAD; ++ clear_nlink(inode); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int vfsub_update_h_iattr(struct path *h_path, int *did); ++struct file *vfsub_dentry_open(struct path *path, int flags); ++struct file *vfsub_filp_open(const char *path, int oflags, int mode); ++int vfsub_kern_path(const char *name, unsigned int flags, struct path *path); ++struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, ++ int len); ++struct dentry *vfsub_lookup_hash(struct nameidata *nd); ++int vfsub_name_hash(const char *name, struct qstr *this, int len); ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_hinode; ++struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2); ++void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2); ++ ++int vfsub_create(struct inode *dir, struct path *path, int mode); ++int vfsub_symlink(struct inode *dir, struct path *path, ++ const char *symname); ++int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev); ++int vfsub_link(struct dentry *src_dentry, struct inode *dir, ++ struct path *path); ++int vfsub_rename(struct inode *src_hdir, struct dentry *src_dentry, ++ struct inode *hdir, struct path *path); ++int vfsub_mkdir(struct inode *dir, struct path *path, int mode); ++int vfsub_rmdir(struct inode *dir, struct path *path); ++ ++/* ---------------------------------------------------------------------- */ ++ ++ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, ++ loff_t *ppos); ++ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, ++ loff_t *ppos); ++ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, ++ loff_t *ppos); ++ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, ++ loff_t *ppos); ++int vfsub_flush(struct file *file, fl_owner_t id); ++int vfsub_readdir(struct file *file, filldir_t 
filldir, void *arg); ++ ++static inline unsigned int vfsub_file_flags(struct file *file) ++{ ++ unsigned int flags; ++ ++ spin_lock(&file->f_lock); ++ flags = file->f_flags; ++ spin_unlock(&file->f_lock); ++ ++ return flags; ++} ++ ++static inline void vfsub_file_accessed(struct file *h_file) ++{ ++ file_accessed(h_file); ++ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); /*ignore*/ ++} ++ ++static inline void vfsub_touch_atime(struct vfsmount *h_mnt, ++ struct dentry *h_dentry) ++{ ++ struct path h_path = { ++ .dentry = h_dentry, ++ .mnt = h_mnt ++ }; ++ touch_atime(h_mnt, h_dentry); ++ vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ ++} ++ ++long vfsub_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags); ++long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags); ++int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, ++ struct file *h_file); ++int vfsub_fsync(struct file *file, struct path *path, int datasync); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline loff_t vfsub_llseek(struct file *file, loff_t offset, int origin) ++{ ++ loff_t err; ++ ++ lockdep_off(); ++ err = vfs_llseek(file, offset, origin); ++ lockdep_on(); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dirty workaround for strict type of fmode_t */ ++union vfsub_fmu { ++ fmode_t fm; ++ unsigned int ui; ++}; ++ ++static inline unsigned int vfsub_fmode_to_uint(fmode_t fm) ++{ ++ union vfsub_fmu u = { ++ .fm = fm ++ }; ++ ++ BUILD_BUG_ON(sizeof(u.fm) != sizeof(u.ui)); ++ ++ return u.ui; ++} ++ ++static inline fmode_t vfsub_uint_to_fmode(unsigned int ui) ++{ ++ union vfsub_fmu u = { ++ .ui = ui ++ }; ++ ++ return u.fm; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode); ++int vfsub_sio_rmdir(struct inode *dir, struct path *path); ++int vfsub_sio_notify_change(struct path *path, struct iattr *ia); ++int vfsub_notify_change(struct path *path, struct iattr *ia); ++int vfsub_unlink(struct inode *dir, struct path *path, int force); ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_VFSUB_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/wbr_policy.c linux-3.2.0-gentoo-r1/fs/aufs/wbr_policy.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/wbr_policy.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/wbr_policy.c 2012-01-17 12:11:25.044103451 +0100 +@@ -0,0 +1,700 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * policies for selecting one among multiple writable branches ++ */ ++ ++#include ++#include "aufs.h" ++ ++/* subset of cpup_attr() */ ++static noinline_for_stack ++int au_cpdown_attr(struct path *h_path, struct dentry *h_src) ++{ ++ int err, sbits; ++ struct iattr ia; ++ struct inode *h_isrc; ++ ++ h_isrc = h_src->d_inode; ++ ia.ia_valid = ATTR_FORCE | ATTR_MODE | ATTR_UID | ATTR_GID; ++ ia.ia_mode = h_isrc->i_mode; ++ ia.ia_uid = h_isrc->i_uid; ++ ia.ia_gid = h_isrc->i_gid; ++ sbits = !!(ia.ia_mode & (S_ISUID | S_ISGID)); ++ au_cpup_attr_flags(h_path->dentry->d_inode, h_isrc); ++ err = vfsub_sio_notify_change(h_path, &ia); ++ ++ /* is this nfs only? */ ++ if (!err && sbits && au_test_nfs(h_path->dentry->d_sb)) { ++ ia.ia_valid = ATTR_FORCE | ATTR_MODE; ++ ia.ia_mode = h_isrc->i_mode; ++ err = vfsub_sio_notify_change(h_path, &ia); ++ } ++ ++ return err; ++} ++ ++#define AuCpdown_PARENT_OPQ 1 ++#define AuCpdown_WHED (1 << 1) ++#define AuCpdown_MADE_DIR (1 << 2) ++#define AuCpdown_DIROPQ (1 << 3) ++#define au_ftest_cpdown(flags, name) ((flags) & AuCpdown_##name) ++#define au_fset_cpdown(flags, name) \ ++ do { (flags) |= AuCpdown_##name; } while (0) ++#define au_fclr_cpdown(flags, name) \ ++ do { (flags) &= ~AuCpdown_##name; } while (0) ++ ++struct au_cpdown_dir_args { ++ struct dentry *parent; ++ unsigned int flags; ++}; ++ ++static int au_cpdown_dir_opq(struct dentry *dentry, aufs_bindex_t bdst, ++ struct au_cpdown_dir_args *a) ++{ ++ int err; ++ struct dentry *opq_dentry; ++ ++ opq_dentry = au_diropq_create(dentry, bdst); ++ err = PTR_ERR(opq_dentry); ++ if (IS_ERR(opq_dentry)) ++ goto out; ++ dput(opq_dentry); ++ au_fset_cpdown(a->flags, DIROPQ); ++ ++out: ++ return err; ++} ++ ++static int au_cpdown_dir_wh(struct dentry *dentry, struct dentry *h_parent, ++ struct inode *dir, aufs_bindex_t bdst) ++{ ++ int err; ++ struct path h_path; ++ struct au_branch *br; ++ ++ br = au_sbr(dentry->d_sb, bdst); ++ h_path.dentry = au_wh_lkup(h_parent, &dentry->d_name, br); ++ err = PTR_ERR(h_path.dentry); ++ if (IS_ERR(h_path.dentry)) ++ goto out; ++ ++ err = 0; ++ if (h_path.dentry->d_inode) { ++ h_path.mnt = br->br_mnt; ++ err = au_wh_unlink_dentry(au_h_iptr(dir, bdst), &h_path, ++ dentry); ++ } ++ dput(h_path.dentry); ++ ++out: ++ return err; ++} ++ ++static int au_cpdown_dir(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *h_parent, void *arg) ++{ ++ int err, rerr; ++ aufs_bindex_t bopq, bstart; ++ struct path h_path; ++ struct dentry *parent; ++ struct inode *h_dir, *h_inode, *inode, *dir; ++ struct au_cpdown_dir_args *args = arg; ++ ++ bstart = au_dbstart(dentry); ++ /* dentry is di-locked */ ++ parent = dget_parent(dentry); ++ dir = parent->d_inode; ++ h_dir = h_parent->d_inode; ++ AuDebugOn(h_dir != au_h_iptr(dir, bdst)); ++ IMustLock(h_dir); ++ ++ err = au_lkup_neg(dentry, bdst); ++ if (unlikely(err < 0)) ++ goto out; ++ h_path.dentry = au_h_dptr(dentry, bdst); ++ h_path.mnt = au_sbr_mnt(dentry->d_sb, bdst); ++ err = vfsub_sio_mkdir(au_h_iptr(dir, bdst), &h_path, ++ S_IRWXU | S_IRUGO | S_IXUGO); ++ if (unlikely(err)) ++ goto out_put; ++ au_fset_cpdown(args->flags, MADE_DIR); ++ ++ bopq = au_dbdiropq(dentry); ++ au_fclr_cpdown(args->flags, WHED); ++ au_fclr_cpdown(args->flags, DIROPQ); ++ if (au_dbwh(dentry) == bdst) ++ au_fset_cpdown(args->flags, 
WHED); ++ if (!au_ftest_cpdown(args->flags, PARENT_OPQ) && bopq <= bdst) ++ au_fset_cpdown(args->flags, PARENT_OPQ); ++ h_inode = h_path.dentry->d_inode; ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ if (au_ftest_cpdown(args->flags, WHED)) { ++ err = au_cpdown_dir_opq(dentry, bdst, args); ++ if (unlikely(err)) { ++ mutex_unlock(&h_inode->i_mutex); ++ goto out_dir; ++ } ++ } ++ ++ err = au_cpdown_attr(&h_path, au_h_dptr(dentry, bstart)); ++ mutex_unlock(&h_inode->i_mutex); ++ if (unlikely(err)) ++ goto out_opq; ++ ++ if (au_ftest_cpdown(args->flags, WHED)) { ++ err = au_cpdown_dir_wh(dentry, h_parent, dir, bdst); ++ if (unlikely(err)) ++ goto out_opq; ++ } ++ ++ inode = dentry->d_inode; ++ if (au_ibend(inode) < bdst) ++ au_set_ibend(inode, bdst); ++ au_set_h_iptr(inode, bdst, au_igrab(h_inode), ++ au_hi_flags(inode, /*isdir*/1)); ++ goto out; /* success */ ++ ++ /* revert */ ++out_opq: ++ if (au_ftest_cpdown(args->flags, DIROPQ)) { ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ rerr = au_diropq_remove(dentry, bdst); ++ mutex_unlock(&h_inode->i_mutex); ++ if (unlikely(rerr)) { ++ AuIOErr("failed removing diropq for %.*s b%d (%d)\n", ++ AuDLNPair(dentry), bdst, rerr); ++ err = -EIO; ++ goto out; ++ } ++ } ++out_dir: ++ if (au_ftest_cpdown(args->flags, MADE_DIR)) { ++ rerr = vfsub_sio_rmdir(au_h_iptr(dir, bdst), &h_path); ++ if (unlikely(rerr)) { ++ AuIOErr("failed removing %.*s b%d (%d)\n", ++ AuDLNPair(dentry), bdst, rerr); ++ err = -EIO; ++ } ++ } ++out_put: ++ au_set_h_dptr(dentry, bdst, NULL); ++ if (au_dbend(dentry) == bdst) ++ au_update_dbend(dentry); ++out: ++ dput(parent); ++ return err; ++} ++ ++int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst) ++{ ++ int err; ++ struct au_cpdown_dir_args args = { ++ .parent = dget_parent(dentry), ++ .flags = 0 ++ }; ++ ++ err = au_cp_dirs(dentry, bdst, au_cpdown_dir, &args); ++ dput(args.parent); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policies for create */ ++ ++static int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ int err, i, j, ndentry; ++ aufs_bindex_t bopq; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries, *parent, *d; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ parent = dget_parent(dentry); ++ err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/0); ++ if (unlikely(err)) ++ goto out_free; ++ ++ err = bindex; ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) { ++ d = dentries[j]; ++ di_read_lock_parent2(d, !AuLock_IR); ++ bopq = au_dbdiropq(d); ++ di_read_unlock(d, !AuLock_IR); ++ if (bopq >= 0 && bopq < err) ++ err = bopq; ++ } ++ } ++ ++out_free: ++ dput(parent); ++ au_dpages_free(&dpages); ++out: ++ return err; ++} ++ ++static int au_wbr_bu(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ for (; bindex >= 0; bindex--) ++ if (!au_br_rdonly(au_sbr(sb, bindex))) ++ return bindex; ++ return -EROFS; ++} ++ ++/* top down parent */ ++static int au_wbr_create_tdp(struct dentry *dentry, int isdir __maybe_unused) ++{ ++ int err; ++ aufs_bindex_t bstart, bindex; ++ struct super_block *sb; ++ struct dentry *parent, *h_parent; ++ ++ sb = dentry->d_sb; ++ bstart = au_dbstart(dentry); ++ err = bstart; ++ if (!au_br_rdonly(au_sbr(sb, bstart))) ++ goto out; ++ ++ err = -EROFS; ++ parent = dget_parent(dentry); ++ for (bindex = 
au_dbstart(parent); bindex < bstart; bindex++) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || !h_parent->d_inode) ++ continue; ++ ++ if (!au_br_rdonly(au_sbr(sb, bindex))) { ++ err = bindex; ++ break; ++ } ++ } ++ dput(parent); ++ ++ /* bottom up here */ ++ if (unlikely(err < 0)) { ++ err = au_wbr_bu(sb, bstart - 1); ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ } ++ ++out: ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* an exception for the policy other than tdp */ ++static int au_wbr_create_exp(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bwh, bdiropq; ++ struct dentry *parent; ++ ++ err = -1; ++ bwh = au_dbwh(dentry); ++ parent = dget_parent(dentry); ++ bdiropq = au_dbdiropq(parent); ++ if (bwh >= 0) { ++ if (bdiropq >= 0) ++ err = min(bdiropq, bwh); ++ else ++ err = bwh; ++ AuDbg("%d\n", err); ++ } else if (bdiropq >= 0) { ++ err = bdiropq; ++ AuDbg("%d\n", err); ++ } ++ dput(parent); ++ ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ ++ if (err >= 0 && au_br_rdonly(au_sbr(dentry->d_sb, err))) ++ err = -1; ++ ++ AuDbg("%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* round robin */ ++static int au_wbr_create_init_rr(struct super_block *sb) ++{ ++ int err; ++ ++ err = au_wbr_bu(sb, au_sbend(sb)); ++ atomic_set(&au_sbi(sb)->si_wbr_rr_next, -err); /* less important */ ++ /* smp_mb(); */ ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++static int au_wbr_create_rr(struct dentry *dentry, int isdir) ++{ ++ int err, nbr; ++ unsigned int u; ++ aufs_bindex_t bindex, bend; ++ struct super_block *sb; ++ atomic_t *next; ++ ++ err = au_wbr_create_exp(dentry); ++ if (err >= 0) ++ goto out; ++ ++ sb = dentry->d_sb; ++ next = &au_sbi(sb)->si_wbr_rr_next; ++ bend = au_sbend(sb); ++ nbr = bend + 1; ++ for (bindex = 0; bindex <= bend; bindex++) { ++ if (!isdir) { ++ err = atomic_dec_return(next) + 1; ++ /* modulo for 0 is meaningless */ ++ if (unlikely(!err)) ++ err = atomic_dec_return(next) + 1; ++ } else ++ err = atomic_read(next); ++ AuDbg("%d\n", err); ++ u = err; ++ err = u % nbr; ++ AuDbg("%d\n", err); ++ if (!au_br_rdonly(au_sbr(sb, err))) ++ break; ++ err = -EROFS; ++ } ++ ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ ++out: ++ AuDbg("%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* most free space */ ++static void au_mfs(struct dentry *dentry) ++{ ++ struct super_block *sb; ++ struct au_branch *br; ++ struct au_wbr_mfs *mfs; ++ aufs_bindex_t bindex, bend; ++ int err; ++ unsigned long long b, bavail; ++ struct path h_path; ++ /* reduce the stack usage */ ++ struct kstatfs *st; ++ ++ st = kmalloc(sizeof(*st), GFP_NOFS); ++ if (unlikely(!st)) { ++ AuWarn1("failed updating mfs(%d), ignored\n", -ENOMEM); ++ return; ++ } ++ ++ bavail = 0; ++ sb = dentry->d_sb; ++ mfs = &au_sbi(sb)->si_wbr_mfs; ++ MtxMustLock(&mfs->mfs_lock); ++ mfs->mfs_bindex = -EROFS; ++ mfs->mfsrr_bytes = 0; ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (au_br_rdonly(br)) ++ continue; ++ ++ /* sb->s_root for NFS is unreliable */ ++ h_path.mnt = br->br_mnt; ++ h_path.dentry = h_path.mnt->mnt_root; ++ err = vfs_statfs(&h_path, st); ++ if (unlikely(err)) { ++ AuWarn1("failed statfs, b%d, %d\n", bindex, err); ++ continue; ++ } ++ ++ /* when the available size is equal, select the lower one */ 
++ BUILD_BUG_ON(sizeof(b) < sizeof(st->f_bavail) ++ || sizeof(b) < sizeof(st->f_bsize)); ++ b = st->f_bavail * st->f_bsize; ++ br->br_wbr->wbr_bytes = b; ++ if (b >= bavail) { ++ bavail = b; ++ mfs->mfs_bindex = bindex; ++ mfs->mfs_jiffy = jiffies; ++ } ++ } ++ ++ mfs->mfsrr_bytes = bavail; ++ AuDbg("b%d\n", mfs->mfs_bindex); ++ kfree(st); ++} ++ ++static int au_wbr_create_mfs(struct dentry *dentry, int isdir __maybe_unused) ++{ ++ int err; ++ struct super_block *sb; ++ struct au_wbr_mfs *mfs; ++ ++ err = au_wbr_create_exp(dentry); ++ if (err >= 0) ++ goto out; ++ ++ sb = dentry->d_sb; ++ mfs = &au_sbi(sb)->si_wbr_mfs; ++ mutex_lock(&mfs->mfs_lock); ++ if (time_after(jiffies, mfs->mfs_jiffy + mfs->mfs_expire) ++ || mfs->mfs_bindex < 0 ++ || au_br_rdonly(au_sbr(sb, mfs->mfs_bindex))) ++ au_mfs(dentry); ++ mutex_unlock(&mfs->mfs_lock); ++ err = mfs->mfs_bindex; ++ ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ ++out: ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++static int au_wbr_create_init_mfs(struct super_block *sb) ++{ ++ struct au_wbr_mfs *mfs; ++ ++ mfs = &au_sbi(sb)->si_wbr_mfs; ++ mutex_init(&mfs->mfs_lock); ++ mfs->mfs_jiffy = 0; ++ mfs->mfs_bindex = -EROFS; ++ ++ return 0; ++} ++ ++static int au_wbr_create_fin_mfs(struct super_block *sb __maybe_unused) ++{ ++ mutex_destroy(&au_sbi(sb)->si_wbr_mfs.mfs_lock); ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* most free space and then round robin */ ++static int au_wbr_create_mfsrr(struct dentry *dentry, int isdir) ++{ ++ int err; ++ struct au_wbr_mfs *mfs; ++ ++ err = au_wbr_create_mfs(dentry, isdir); ++ if (err >= 0) { ++ mfs = &au_sbi(dentry->d_sb)->si_wbr_mfs; ++ mutex_lock(&mfs->mfs_lock); ++ if (mfs->mfsrr_bytes < mfs->mfsrr_watermark) ++ err = au_wbr_create_rr(dentry, isdir); ++ mutex_unlock(&mfs->mfs_lock); ++ } ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++static int au_wbr_create_init_mfsrr(struct super_block *sb) ++{ ++ int err; ++ ++ au_wbr_create_init_mfs(sb); /* ignore */ ++ err = au_wbr_create_init_rr(sb); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* top down parent and most free space */ ++static int au_wbr_create_pmfs(struct dentry *dentry, int isdir) ++{ ++ int err, e2; ++ unsigned long long b; ++ aufs_bindex_t bindex, bstart, bend; ++ struct super_block *sb; ++ struct dentry *parent, *h_parent; ++ struct au_branch *br; ++ ++ err = au_wbr_create_tdp(dentry, isdir); ++ if (unlikely(err < 0)) ++ goto out; ++ parent = dget_parent(dentry); ++ bstart = au_dbstart(parent); ++ bend = au_dbtaildir(parent); ++ if (bstart == bend) ++ goto out_parent; /* success */ ++ ++ e2 = au_wbr_create_mfs(dentry, isdir); ++ if (e2 < 0) ++ goto out_parent; /* success */ ++ ++ /* when the available size is equal, select upper one */ ++ sb = dentry->d_sb; ++ br = au_sbr(sb, err); ++ b = br->br_wbr->wbr_bytes; ++ AuDbg("b%d, %llu\n", err, b); ++ ++ for (bindex = bstart; bindex <= bend; bindex++) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || !h_parent->d_inode) ++ continue; ++ ++ br = au_sbr(sb, bindex); ++ if (!au_br_rdonly(br) && br->br_wbr->wbr_bytes > b) { ++ b = br->br_wbr->wbr_bytes; ++ err = bindex; ++ AuDbg("b%d, %llu\n", err, b); ++ } ++ } ++ ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ ++out_parent: ++ dput(parent); ++out: ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policies for 
copyup */ ++ ++/* top down parent */ ++static int au_wbr_copyup_tdp(struct dentry *dentry) ++{ ++ return au_wbr_create_tdp(dentry, /*isdir, anything is ok*/0); ++} ++ ++/* bottom up parent */ ++static int au_wbr_copyup_bup(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bindex, bstart; ++ struct dentry *parent, *h_parent; ++ struct super_block *sb; ++ ++ err = -EROFS; ++ sb = dentry->d_sb; ++ parent = dget_parent(dentry); ++ bstart = au_dbstart(parent); ++ for (bindex = au_dbstart(dentry); bindex >= bstart; bindex--) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || !h_parent->d_inode) ++ continue; ++ ++ if (!au_br_rdonly(au_sbr(sb, bindex))) { ++ err = bindex; ++ break; ++ } ++ } ++ dput(parent); ++ ++ /* bottom up here */ ++ if (unlikely(err < 0)) ++ err = au_wbr_bu(sb, bstart - 1); ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* bottom up */ ++static int au_wbr_copyup_bu(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ ++ bstart = au_dbstart(dentry); ++ err = au_wbr_bu(dentry->d_sb, bstart); ++ AuDbg("b%d\n", err); ++ if (err > bstart) ++ err = au_wbr_nonopq(dentry, err); ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_wbr_copyup_operations au_wbr_copyup_ops[] = { ++ [AuWbrCopyup_TDP] = { ++ .copyup = au_wbr_copyup_tdp ++ }, ++ [AuWbrCopyup_BUP] = { ++ .copyup = au_wbr_copyup_bup ++ }, ++ [AuWbrCopyup_BU] = { ++ .copyup = au_wbr_copyup_bu ++ } ++}; ++ ++struct au_wbr_create_operations au_wbr_create_ops[] = { ++ [AuWbrCreate_TDP] = { ++ .create = au_wbr_create_tdp ++ }, ++ [AuWbrCreate_RR] = { ++ .create = au_wbr_create_rr, ++ .init = au_wbr_create_init_rr ++ }, ++ [AuWbrCreate_MFS] = { ++ .create = au_wbr_create_mfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_MFSV] = { ++ .create = au_wbr_create_mfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_MFSRR] = { ++ .create = au_wbr_create_mfsrr, ++ .init = au_wbr_create_init_mfsrr, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_MFSRRV] = { ++ .create = au_wbr_create_mfsrr, ++ .init = au_wbr_create_init_mfsrr, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_PMFS] = { ++ .create = au_wbr_create_pmfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_PMFSV] = { ++ .create = au_wbr_create_pmfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ } ++}; +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/whout.c linux-3.2.0-gentoo-r1/fs/aufs/whout.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/whout.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/whout.c 2012-01-17 12:11:25.057992464 +0100 +@@ -0,0 +1,1049 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * whiteout for logical deletion and opaque directory ++ */ ++ ++#include "aufs.h" ++ ++#define WH_MASK S_IRUGO ++ ++/* ++ * If a directory contains this file, then it is opaque. We start with the ++ * .wh. flag so that it is blocked by lookup. ++ */ ++static struct qstr diropq_name = { ++ .name = AUFS_WH_DIROPQ, ++ .len = sizeof(AUFS_WH_DIROPQ) - 1 ++}; ++ ++/* ++ * generate whiteout name, which is NOT terminated by NULL. ++ * @name: original d_name.name ++ * @len: original d_name.len ++ * @wh: whiteout qstr ++ * returns zero when succeeds, otherwise error. ++ * succeeded value as wh->name should be freed by kfree(). ++ */ ++int au_wh_name_alloc(struct qstr *wh, const struct qstr *name) ++{ ++ char *p; ++ ++ if (unlikely(name->len > PATH_MAX - AUFS_WH_PFX_LEN)) ++ return -ENAMETOOLONG; ++ ++ wh->len = name->len + AUFS_WH_PFX_LEN; ++ p = kmalloc(wh->len, GFP_NOFS); ++ wh->name = p; ++ if (p) { ++ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); ++ memcpy(p + AUFS_WH_PFX_LEN, name->name, name->len); ++ /* smp_mb(); */ ++ return 0; ++ } ++ return -ENOMEM; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * test if the @wh_name exists under @h_parent. ++ * @try_sio specifies the necessary of super-io. ++ */ ++int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, ++ struct au_branch *br, int try_sio) ++{ ++ int err; ++ struct dentry *wh_dentry; ++ ++ if (!try_sio) ++ wh_dentry = au_lkup_one(wh_name, h_parent, br, /*nd*/NULL); ++ else ++ wh_dentry = au_sio_lkup_one(wh_name, h_parent, br); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out; ++ ++ err = 0; ++ if (!wh_dentry->d_inode) ++ goto out_wh; /* success */ ++ ++ err = 1; ++ if (S_ISREG(wh_dentry->d_inode->i_mode)) ++ goto out_wh; /* success */ ++ ++ err = -EIO; ++ AuIOErr("%.*s Invalid whiteout entry type 0%o.\n", ++ AuDLNPair(wh_dentry), wh_dentry->d_inode->i_mode); ++ ++out_wh: ++ dput(wh_dentry); ++out: ++ return err; ++} ++ ++/* ++ * test if the @h_dentry sets opaque or not. ++ */ ++int au_diropq_test(struct dentry *h_dentry, struct au_branch *br) ++{ ++ int err; ++ struct inode *h_dir; ++ ++ h_dir = h_dentry->d_inode; ++ err = au_wh_test(h_dentry, &diropq_name, br, ++ au_test_h_perm_sio(h_dir, MAY_EXEC)); ++ return err; ++} ++ ++/* ++ * returns a negative dentry whose name is unique and temporary. 
++ */ ++struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, ++ struct qstr *prefix) ++{ ++ struct dentry *dentry; ++ int i; ++ char defname[NAME_MAX - AUFS_MAX_NAMELEN + DNAME_INLINE_LEN + 1], ++ *name, *p; ++ /* strict atomic_t is unnecessary here */ ++ static unsigned short cnt; ++ struct qstr qs; ++ ++ BUILD_BUG_ON(sizeof(cnt) * 2 > AUFS_WH_TMP_LEN); ++ ++ name = defname; ++ qs.len = sizeof(defname) - DNAME_INLINE_LEN + prefix->len - 1; ++ if (unlikely(prefix->len > DNAME_INLINE_LEN)) { ++ dentry = ERR_PTR(-ENAMETOOLONG); ++ if (unlikely(qs.len > NAME_MAX)) ++ goto out; ++ dentry = ERR_PTR(-ENOMEM); ++ name = kmalloc(qs.len + 1, GFP_NOFS); ++ if (unlikely(!name)) ++ goto out; ++ } ++ ++ /* doubly whiteout-ed */ ++ memcpy(name, AUFS_WH_PFX AUFS_WH_PFX, AUFS_WH_PFX_LEN * 2); ++ p = name + AUFS_WH_PFX_LEN * 2; ++ memcpy(p, prefix->name, prefix->len); ++ p += prefix->len; ++ *p++ = '.'; ++ AuDebugOn(name + qs.len + 1 - p <= AUFS_WH_TMP_LEN); ++ ++ qs.name = name; ++ for (i = 0; i < 3; i++) { ++ sprintf(p, "%.*x", AUFS_WH_TMP_LEN, cnt++); ++ dentry = au_sio_lkup_one(&qs, h_parent, br); ++ if (IS_ERR(dentry) || !dentry->d_inode) ++ goto out_name; ++ dput(dentry); ++ } ++ /* pr_warning("could not get random name\n"); */ ++ dentry = ERR_PTR(-EEXIST); ++ AuDbg("%.*s\n", AuLNPair(&qs)); ++ BUG(); ++ ++out_name: ++ if (name != defname) ++ kfree(name); ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ++ * rename the @h_dentry on @br to the whiteouted temporary name. ++ */ ++int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br) ++{ ++ int err; ++ struct path h_path = { ++ .mnt = br->br_mnt ++ }; ++ struct inode *h_dir; ++ struct dentry *h_parent; ++ ++ h_parent = h_dentry->d_parent; /* dir inode is locked */ ++ h_dir = h_parent->d_inode; ++ IMustLock(h_dir); ++ ++ h_path.dentry = au_whtmp_lkup(h_parent, br, &h_dentry->d_name); ++ err = PTR_ERR(h_path.dentry); ++ if (IS_ERR(h_path.dentry)) ++ goto out; ++ ++ /* under the same dir, no need to lock_rename() */ ++ err = vfsub_rename(h_dir, h_dentry, h_dir, &h_path); ++ AuTraceErr(err); ++ dput(h_path.dentry); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * functions for removing a whiteout ++ */ ++ ++static int do_unlink_wh(struct inode *h_dir, struct path *h_path) ++{ ++ int force; ++ ++ /* ++ * forces superio when the dir has a sticky bit. ++ * this may be a violation of unix fs semantics. 
++ */ ++ force = (h_dir->i_mode & S_ISVTX) ++ && h_path->dentry->d_inode->i_uid != current_fsuid(); ++ return vfsub_unlink(h_dir, h_path, force); ++} ++ ++int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, ++ struct dentry *dentry) ++{ ++ int err; ++ ++ err = do_unlink_wh(h_dir, h_path); ++ if (!err && dentry) ++ au_set_dbwh(dentry, -1); ++ ++ return err; ++} ++ ++static int unlink_wh_name(struct dentry *h_parent, struct qstr *wh, ++ struct au_branch *br) ++{ ++ int err; ++ struct path h_path = { ++ .mnt = br->br_mnt ++ }; ++ ++ err = 0; ++ h_path.dentry = au_lkup_one(wh, h_parent, br, /*nd*/NULL); ++ if (IS_ERR(h_path.dentry)) ++ err = PTR_ERR(h_path.dentry); ++ else { ++ if (h_path.dentry->d_inode ++ && S_ISREG(h_path.dentry->d_inode->i_mode)) ++ err = do_unlink_wh(h_parent->d_inode, &h_path); ++ dput(h_path.dentry); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * initialize/clean whiteout for a branch ++ */ ++ ++static void au_wh_clean(struct inode *h_dir, struct path *whpath, ++ const int isdir) ++{ ++ int err; ++ ++ if (!whpath->dentry->d_inode) ++ return; ++ ++ err = mnt_want_write(whpath->mnt); ++ if (!err) { ++ if (isdir) ++ err = vfsub_rmdir(h_dir, whpath); ++ else ++ err = vfsub_unlink(h_dir, whpath, /*force*/0); ++ mnt_drop_write(whpath->mnt); ++ } ++ if (unlikely(err)) ++ pr_warning("failed removing %.*s (%d), ignored.\n", ++ AuDLNPair(whpath->dentry), err); ++} ++ ++static int test_linkable(struct dentry *h_root) ++{ ++ struct inode *h_dir = h_root->d_inode; ++ ++ if (h_dir->i_op->link) ++ return 0; ++ ++ pr_err("%.*s (%s) doesn't support link(2), use noplink and rw+nolwh\n", ++ AuDLNPair(h_root), au_sbtype(h_root->d_sb)); ++ return -ENOSYS; ++} ++ ++/* todo: should this mkdir be done in /sbin/mount.aufs helper? 
*/ ++static int au_whdir(struct inode *h_dir, struct path *path) ++{ ++ int err; ++ ++ err = -EEXIST; ++ if (!path->dentry->d_inode) { ++ int mode = S_IRWXU; ++ ++ if (au_test_nfs(path->dentry->d_sb)) ++ mode |= S_IXUGO; ++ err = mnt_want_write(path->mnt); ++ if (!err) { ++ err = vfsub_mkdir(h_dir, path, mode); ++ mnt_drop_write(path->mnt); ++ } ++ } else if (S_ISDIR(path->dentry->d_inode->i_mode)) ++ err = 0; ++ else ++ pr_err("unknown %.*s exists\n", AuDLNPair(path->dentry)); ++ ++ return err; ++} ++ ++struct au_wh_base { ++ const struct qstr *name; ++ struct dentry *dentry; ++}; ++ ++static void au_wh_init_ro(struct inode *h_dir, struct au_wh_base base[], ++ struct path *h_path) ++{ ++ h_path->dentry = base[AuBrWh_BASE].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/0); ++ h_path->dentry = base[AuBrWh_PLINK].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++ h_path->dentry = base[AuBrWh_ORPH].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++} ++ ++/* ++ * returns tri-state, ++ * minus: error, caller should print the mesage ++ * zero: succuess ++ * plus: error, caller should NOT print the mesage ++ */ ++static int au_wh_init_rw_nolink(struct dentry *h_root, struct au_wbr *wbr, ++ int do_plink, struct au_wh_base base[], ++ struct path *h_path) ++{ ++ int err; ++ struct inode *h_dir; ++ ++ h_dir = h_root->d_inode; ++ h_path->dentry = base[AuBrWh_BASE].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/0); ++ h_path->dentry = base[AuBrWh_PLINK].dentry; ++ if (do_plink) { ++ err = test_linkable(h_root); ++ if (unlikely(err)) { ++ err = 1; ++ goto out; ++ } ++ ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); ++ } else ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++ h_path->dentry = base[AuBrWh_ORPH].dentry; ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); ++ ++out: ++ return err; ++} ++ ++/* ++ * for the moment, aufs supports the branch filesystem which does not support ++ * link(2). testing on FAT which does not support i_op->setattr() fully either, ++ * copyup failed. finally, such filesystem will not be used as the writable ++ * branch. ++ * ++ * returns tri-state, see above. ++ */ ++static int au_wh_init_rw(struct dentry *h_root, struct au_wbr *wbr, ++ int do_plink, struct au_wh_base base[], ++ struct path *h_path) ++{ ++ int err; ++ struct inode *h_dir; ++ ++ WbrWhMustWriteLock(wbr); ++ ++ err = test_linkable(h_root); ++ if (unlikely(err)) { ++ err = 1; ++ goto out; ++ } ++ ++ /* ++ * todo: should this create be done in /sbin/mount.aufs helper? 
++ */ ++ err = -EEXIST; ++ h_dir = h_root->d_inode; ++ if (!base[AuBrWh_BASE].dentry->d_inode) { ++ err = mnt_want_write(h_path->mnt); ++ if (!err) { ++ h_path->dentry = base[AuBrWh_BASE].dentry; ++ err = vfsub_create(h_dir, h_path, WH_MASK); ++ mnt_drop_write(h_path->mnt); ++ } ++ } else if (S_ISREG(base[AuBrWh_BASE].dentry->d_inode->i_mode)) ++ err = 0; ++ else ++ pr_err("unknown %.*s/%.*s exists\n", ++ AuDLNPair(h_root), AuDLNPair(base[AuBrWh_BASE].dentry)); ++ if (unlikely(err)) ++ goto out; ++ ++ h_path->dentry = base[AuBrWh_PLINK].dentry; ++ if (do_plink) { ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); ++ } else ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++ wbr->wbr_whbase = dget(base[AuBrWh_BASE].dentry); ++ ++ h_path->dentry = base[AuBrWh_ORPH].dentry; ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); ++ ++out: ++ return err; ++} ++ ++/* ++ * initialize the whiteout base file/dir for @br. ++ */ ++int au_wh_init(struct dentry *h_root, struct au_branch *br, ++ struct super_block *sb) ++{ ++ int err, i; ++ const unsigned char do_plink ++ = !!au_opt_test(au_mntflags(sb), PLINK); ++ struct path path = { ++ .mnt = br->br_mnt ++ }; ++ struct inode *h_dir; ++ struct au_wbr *wbr = br->br_wbr; ++ static const struct qstr base_name[] = { ++ [AuBrWh_BASE] = { ++ .name = AUFS_BASE_NAME, ++ .len = sizeof(AUFS_BASE_NAME) - 1 ++ }, ++ [AuBrWh_PLINK] = { ++ .name = AUFS_PLINKDIR_NAME, ++ .len = sizeof(AUFS_PLINKDIR_NAME) - 1 ++ }, ++ [AuBrWh_ORPH] = { ++ .name = AUFS_ORPHDIR_NAME, ++ .len = sizeof(AUFS_ORPHDIR_NAME) - 1 ++ } ++ }; ++ struct au_wh_base base[] = { ++ [AuBrWh_BASE] = { ++ .name = base_name + AuBrWh_BASE, ++ .dentry = NULL ++ }, ++ [AuBrWh_PLINK] = { ++ .name = base_name + AuBrWh_PLINK, ++ .dentry = NULL ++ }, ++ [AuBrWh_ORPH] = { ++ .name = base_name + AuBrWh_ORPH, ++ .dentry = NULL ++ } ++ }; ++ ++ if (wbr) ++ WbrWhMustWriteLock(wbr); ++ ++ for (i = 0; i < AuBrWh_Last; i++) { ++ /* doubly whiteouted */ ++ struct dentry *d; ++ ++ d = au_wh_lkup(h_root, (void *)base[i].name, br); ++ err = PTR_ERR(d); ++ if (IS_ERR(d)) ++ goto out; ++ ++ base[i].dentry = d; ++ AuDebugOn(wbr ++ && wbr->wbr_wh[i] ++ && wbr->wbr_wh[i] != base[i].dentry); ++ } ++ ++ if (wbr) ++ for (i = 0; i < AuBrWh_Last; i++) { ++ dput(wbr->wbr_wh[i]); ++ wbr->wbr_wh[i] = NULL; ++ } ++ ++ err = 0; ++ if (!au_br_writable(br->br_perm)) { ++ h_dir = h_root->d_inode; ++ au_wh_init_ro(h_dir, base, &path); ++ } else if (!au_br_wh_linkable(br->br_perm)) { ++ err = au_wh_init_rw_nolink(h_root, wbr, do_plink, base, &path); ++ if (err > 0) ++ goto out; ++ else if (err) ++ goto out_err; ++ } else { ++ err = au_wh_init_rw(h_root, wbr, do_plink, base, &path); ++ if (err > 0) ++ goto out; ++ else if (err) ++ goto out_err; ++ } ++ goto out; /* success */ ++ ++out_err: ++ pr_err("an error(%d) on the writable branch %.*s(%s)\n", ++ err, AuDLNPair(h_root), au_sbtype(h_root->d_sb)); ++out: ++ for (i = 0; i < AuBrWh_Last; i++) ++ dput(base[i].dentry); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * whiteouts are all hard-linked usually. ++ * when its link count reaches a ceiling, we create a new whiteout base ++ * asynchronously. 
++ */ ++ ++struct reinit_br_wh { ++ struct super_block *sb; ++ struct au_branch *br; ++}; ++ ++static void reinit_br_wh(void *arg) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct path h_path; ++ struct reinit_br_wh *a = arg; ++ struct au_wbr *wbr; ++ struct inode *dir; ++ struct dentry *h_root; ++ struct au_hinode *hdir; ++ ++ err = 0; ++ wbr = a->br->br_wbr; ++ /* big aufs lock */ ++ si_noflush_write_lock(a->sb); ++ if (!au_br_writable(a->br->br_perm)) ++ goto out; ++ bindex = au_br_index(a->sb, a->br->br_id); ++ if (unlikely(bindex < 0)) ++ goto out; ++ ++ di_read_lock_parent(a->sb->s_root, AuLock_IR); ++ dir = a->sb->s_root->d_inode; ++ hdir = au_hi(dir, bindex); ++ h_root = au_h_dptr(a->sb->s_root, bindex); ++ ++ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ wbr_wh_write_lock(wbr); ++ err = au_h_verify(wbr->wbr_whbase, au_opt_udba(a->sb), hdir->hi_inode, ++ h_root, a->br); ++ if (!err) { ++ err = mnt_want_write(a->br->br_mnt); ++ if (!err) { ++ h_path.dentry = wbr->wbr_whbase; ++ h_path.mnt = a->br->br_mnt; ++ err = vfsub_unlink(hdir->hi_inode, &h_path, /*force*/0); ++ mnt_drop_write(a->br->br_mnt); ++ } ++ } else { ++ pr_warning("%.*s is moved, ignored\n", ++ AuDLNPair(wbr->wbr_whbase)); ++ err = 0; ++ } ++ dput(wbr->wbr_whbase); ++ wbr->wbr_whbase = NULL; ++ if (!err) ++ err = au_wh_init(h_root, a->br, a->sb); ++ wbr_wh_write_unlock(wbr); ++ au_hn_imtx_unlock(hdir); ++ di_read_unlock(a->sb->s_root, AuLock_IR); ++ ++out: ++ if (wbr) ++ atomic_dec(&wbr->wbr_wh_running); ++ atomic_dec(&a->br->br_count); ++ si_write_unlock(a->sb); ++ au_nwt_done(&au_sbi(a->sb)->si_nowait); ++ kfree(arg); ++ if (unlikely(err)) ++ AuIOErr("err %d\n", err); ++} ++ ++static void kick_reinit_br_wh(struct super_block *sb, struct au_branch *br) ++{ ++ int do_dec, wkq_err; ++ struct reinit_br_wh *arg; ++ ++ do_dec = 1; ++ if (atomic_inc_return(&br->br_wbr->wbr_wh_running) != 1) ++ goto out; ++ ++ /* ignore ENOMEM */ ++ arg = kmalloc(sizeof(*arg), GFP_NOFS); ++ if (arg) { ++ /* ++ * dec(wh_running), kfree(arg) and dec(br_count) ++ * in reinit function ++ */ ++ arg->sb = sb; ++ arg->br = br; ++ atomic_inc(&br->br_count); ++ wkq_err = au_wkq_nowait(reinit_br_wh, arg, sb, /*flags*/0); ++ if (unlikely(wkq_err)) { ++ atomic_dec(&br->br_wbr->wbr_wh_running); ++ atomic_dec(&br->br_count); ++ kfree(arg); ++ } ++ do_dec = 0; ++ } ++ ++out: ++ if (do_dec) ++ atomic_dec(&br->br_wbr->wbr_wh_running); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create the whiteout @wh. ++ */ ++static int link_or_create_wh(struct super_block *sb, aufs_bindex_t bindex, ++ struct dentry *wh) ++{ ++ int err; ++ struct path h_path = { ++ .dentry = wh ++ }; ++ struct au_branch *br; ++ struct au_wbr *wbr; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ ++ h_parent = wh->d_parent; /* dir inode is locked */ ++ h_dir = h_parent->d_inode; ++ IMustLock(h_dir); ++ ++ br = au_sbr(sb, bindex); ++ h_path.mnt = br->br_mnt; ++ wbr = br->br_wbr; ++ wbr_wh_read_lock(wbr); ++ if (wbr->wbr_whbase) { ++ err = vfsub_link(wbr->wbr_whbase, h_dir, &h_path); ++ if (!err || err != -EMLINK) ++ goto out; ++ ++ /* link count full. re-initialize br_whbase. */ ++ kick_reinit_br_wh(sb, br); ++ } ++ ++ /* return this error in this context */ ++ err = vfsub_create(h_dir, &h_path, WH_MASK); ++ ++out: ++ wbr_wh_read_unlock(wbr); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create or remove the diropq. 
++ */ ++static struct dentry *do_diropq(struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int flags) ++{ ++ struct dentry *opq_dentry, *h_dentry; ++ struct super_block *sb; ++ struct au_branch *br; ++ int err; ++ ++ sb = dentry->d_sb; ++ br = au_sbr(sb, bindex); ++ h_dentry = au_h_dptr(dentry, bindex); ++ opq_dentry = au_lkup_one(&diropq_name, h_dentry, br, /*nd*/NULL); ++ if (IS_ERR(opq_dentry)) ++ goto out; ++ ++ if (au_ftest_diropq(flags, CREATE)) { ++ err = link_or_create_wh(sb, bindex, opq_dentry); ++ if (!err) { ++ au_set_dbdiropq(dentry, bindex); ++ goto out; /* success */ ++ } ++ } else { ++ struct path tmp = { ++ .dentry = opq_dentry, ++ .mnt = br->br_mnt ++ }; ++ err = do_unlink_wh(au_h_iptr(dentry->d_inode, bindex), &tmp); ++ if (!err) ++ au_set_dbdiropq(dentry, -1); ++ } ++ dput(opq_dentry); ++ opq_dentry = ERR_PTR(err); ++ ++out: ++ return opq_dentry; ++} ++ ++struct do_diropq_args { ++ struct dentry **errp; ++ struct dentry *dentry; ++ aufs_bindex_t bindex; ++ unsigned int flags; ++}; ++ ++static void call_do_diropq(void *args) ++{ ++ struct do_diropq_args *a = args; ++ *a->errp = do_diropq(a->dentry, a->bindex, a->flags); ++} ++ ++struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int flags) ++{ ++ struct dentry *diropq, *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!au_test_h_perm_sio(h_dentry->d_inode, MAY_EXEC | MAY_WRITE)) ++ diropq = do_diropq(dentry, bindex, flags); ++ else { ++ int wkq_err; ++ struct do_diropq_args args = { ++ .errp = &diropq, ++ .dentry = dentry, ++ .bindex = bindex, ++ .flags = flags ++ }; ++ ++ wkq_err = au_wkq_wait(call_do_diropq, &args); ++ if (unlikely(wkq_err)) ++ diropq = ERR_PTR(wkq_err); ++ } ++ ++ return diropq; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * lookup whiteout dentry. ++ * @h_parent: lower parent dentry which must exist and be locked ++ * @base_name: name of dentry which will be whiteouted ++ * returns dentry for whiteout. ++ */ ++struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, ++ struct au_branch *br) ++{ ++ int err; ++ struct qstr wh_name; ++ struct dentry *wh_dentry; ++ ++ err = au_wh_name_alloc(&wh_name, base_name); ++ wh_dentry = ERR_PTR(err); ++ if (!err) { ++ wh_dentry = au_lkup_one(&wh_name, h_parent, br, /*nd*/NULL); ++ kfree(wh_name.name); ++ } ++ return wh_dentry; ++} ++ ++/* ++ * link/create a whiteout for @dentry on @bindex. ++ */ ++struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent) ++{ ++ struct dentry *wh_dentry; ++ struct super_block *sb; ++ int err; ++ ++ sb = dentry->d_sb; ++ wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, au_sbr(sb, bindex)); ++ if (!IS_ERR(wh_dentry) && !wh_dentry->d_inode) { ++ err = link_or_create_wh(sb, bindex, wh_dentry); ++ if (!err) ++ au_set_dbwh(dentry, bindex); ++ else { ++ dput(wh_dentry); ++ wh_dentry = ERR_PTR(err); ++ } ++ } ++ ++ return wh_dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* Delete all whiteouts in this directory on branch bindex. 
*/ ++static int del_wh_children(struct dentry *h_dentry, struct au_nhash *whlist, ++ aufs_bindex_t bindex, struct au_branch *br) ++{ ++ int err; ++ unsigned long ul, n; ++ struct qstr wh_name; ++ char *p; ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos; ++ struct au_vdir_destr *str; ++ ++ err = -ENOMEM; ++ p = __getname_gfp(GFP_NOFS); ++ wh_name.name = p; ++ if (unlikely(!wh_name.name)) ++ goto out; ++ ++ err = 0; ++ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); ++ p += AUFS_WH_PFX_LEN; ++ n = whlist->nh_num; ++ head = whlist->nh_head; ++ for (ul = 0; !err && ul < n; ul++, head++) { ++ hlist_for_each_entry(tpos, pos, head, wh_hash) { ++ if (tpos->wh_bindex != bindex) ++ continue; ++ ++ str = &tpos->wh_str; ++ if (str->len + AUFS_WH_PFX_LEN <= PATH_MAX) { ++ memcpy(p, str->name, str->len); ++ wh_name.len = AUFS_WH_PFX_LEN + str->len; ++ err = unlink_wh_name(h_dentry, &wh_name, br); ++ if (!err) ++ continue; ++ break; ++ } ++ AuIOErr("whiteout name too long %.*s\n", ++ str->len, str->name); ++ err = -EIO; ++ break; ++ } ++ } ++ __putname(wh_name.name); ++ ++out: ++ return err; ++} ++ ++struct del_wh_children_args { ++ int *errp; ++ struct dentry *h_dentry; ++ struct au_nhash *whlist; ++ aufs_bindex_t bindex; ++ struct au_branch *br; ++}; ++ ++static void call_del_wh_children(void *args) ++{ ++ struct del_wh_children_args *a = args; ++ *a->errp = del_wh_children(a->h_dentry, a->whlist, a->bindex, a->br); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp) ++{ ++ struct au_whtmp_rmdir *whtmp; ++ int err; ++ unsigned int rdhash; ++ ++ SiMustAnyLock(sb); ++ ++ whtmp = kmalloc(sizeof(*whtmp), gfp); ++ if (unlikely(!whtmp)) { ++ whtmp = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ whtmp->dir = NULL; ++ whtmp->br = NULL; ++ whtmp->wh_dentry = NULL; ++ /* no estimation for dir size */ ++ rdhash = au_sbi(sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = AUFS_RDHASH_DEF; ++ err = au_nhash_alloc(&whtmp->whlist, rdhash, gfp); ++ if (unlikely(err)) { ++ kfree(whtmp); ++ whtmp = ERR_PTR(err); ++ } ++ ++out: ++ return whtmp; ++} ++ ++void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp) ++{ ++ if (whtmp->br) ++ atomic_dec(&whtmp->br->br_count); ++ dput(whtmp->wh_dentry); ++ iput(whtmp->dir); ++ au_nhash_wh_free(&whtmp->whlist); ++ kfree(whtmp); ++} ++ ++/* ++ * rmdir the whiteouted temporary named dir @h_dentry. ++ * @whlist: whiteouted children. ++ */ ++int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_nhash *whlist) ++{ ++ int err; ++ struct path h_tmp; ++ struct inode *wh_inode, *h_dir; ++ struct au_branch *br; ++ ++ h_dir = wh_dentry->d_parent->d_inode; /* dir inode is locked */ ++ IMustLock(h_dir); ++ ++ br = au_sbr(dir->i_sb, bindex); ++ wh_inode = wh_dentry->d_inode; ++ mutex_lock_nested(&wh_inode->i_mutex, AuLsc_I_CHILD); ++ ++ /* ++ * someone else might change some whiteouts while we were sleeping. ++ * it means this whlist may have an obsoleted entry. 
++ */ ++ if (!au_test_h_perm_sio(wh_inode, MAY_EXEC | MAY_WRITE)) ++ err = del_wh_children(wh_dentry, whlist, bindex, br); ++ else { ++ int wkq_err; ++ struct del_wh_children_args args = { ++ .errp = &err, ++ .h_dentry = wh_dentry, ++ .whlist = whlist, ++ .bindex = bindex, ++ .br = br ++ }; ++ ++ wkq_err = au_wkq_wait(call_del_wh_children, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ mutex_unlock(&wh_inode->i_mutex); ++ ++ if (!err) { ++ h_tmp.dentry = wh_dentry; ++ h_tmp.mnt = br->br_mnt; ++ err = vfsub_rmdir(h_dir, &h_tmp); ++ } ++ ++ if (!err) { ++ if (au_ibstart(dir) == bindex) { ++ /* todo: dir->i_mutex is necessary */ ++ au_cpup_attr_timesizes(dir); ++ vfsub_drop_nlink(dir); ++ } ++ return 0; /* success */ ++ } ++ ++ pr_warning("failed removing %.*s(%d), ignored\n", ++ AuDLNPair(wh_dentry), err); ++ return err; ++} ++ ++static void call_rmdir_whtmp(void *args) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct au_whtmp_rmdir *a = args; ++ struct super_block *sb; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ struct au_hinode *hdir; ++ ++ /* rmdir by nfsd may cause deadlock with this i_mutex */ ++ /* mutex_lock(&a->dir->i_mutex); */ ++ err = -EROFS; ++ sb = a->dir->i_sb; ++ si_read_lock(sb, !AuLock_FLUSH); ++ if (!au_br_writable(a->br->br_perm)) ++ goto out; ++ bindex = au_br_index(sb, a->br->br_id); ++ if (unlikely(bindex < 0)) ++ goto out; ++ ++ err = -EIO; ++ ii_write_lock_parent(a->dir); ++ h_parent = dget_parent(a->wh_dentry); ++ h_dir = h_parent->d_inode; ++ hdir = au_hi(a->dir, bindex); ++ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ err = au_h_verify(a->wh_dentry, au_opt_udba(sb), h_dir, h_parent, ++ a->br); ++ if (!err) { ++ err = mnt_want_write(a->br->br_mnt); ++ if (!err) { ++ err = au_whtmp_rmdir(a->dir, bindex, a->wh_dentry, ++ &a->whlist); ++ mnt_drop_write(a->br->br_mnt); ++ } ++ } ++ au_hn_imtx_unlock(hdir); ++ dput(h_parent); ++ ii_write_unlock(a->dir); ++ ++out: ++ /* mutex_unlock(&a->dir->i_mutex); */ ++ au_whtmp_rmdir_free(a); ++ si_read_unlock(sb); ++ au_nwt_done(&au_sbi(sb)->si_nowait); ++ if (unlikely(err)) ++ AuIOErr("err %d\n", err); ++} ++ ++void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_whtmp_rmdir *args) ++{ ++ int wkq_err; ++ struct super_block *sb; ++ ++ IMustLock(dir); ++ ++ /* all post-process will be done in do_rmdir_whtmp(). */ ++ sb = dir->i_sb; ++ args->dir = au_igrab(dir); ++ args->br = au_sbr(sb, bindex); ++ atomic_inc(&args->br->br_count); ++ args->wh_dentry = dget(wh_dentry); ++ wkq_err = au_wkq_nowait(call_rmdir_whtmp, args, sb, /*flags*/0); ++ if (unlikely(wkq_err)) { ++ pr_warning("rmdir error %.*s (%d), ignored\n", ++ AuDLNPair(wh_dentry), wkq_err); ++ au_whtmp_rmdir_free(args); ++ } ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/whout.h linux-3.2.0-gentoo-r1/fs/aufs/whout.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/whout.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/whout.h 2012-01-17 12:11:25.078825986 +0100 +@@ -0,0 +1,88 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * whiteout for logical deletion and opaque directory ++ */ ++ ++#ifndef __AUFS_WHOUT_H__ ++#define __AUFS_WHOUT_H__ ++ ++#ifdef __KERNEL__ ++ ++#include "dir.h" ++ ++/* whout.c */ ++int au_wh_name_alloc(struct qstr *wh, const struct qstr *name); ++struct au_branch; ++int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, ++ struct au_branch *br, int try_sio); ++int au_diropq_test(struct dentry *h_dentry, struct au_branch *br); ++struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, ++ struct qstr *prefix); ++int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br); ++int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, ++ struct dentry *dentry); ++int au_wh_init(struct dentry *h_parent, struct au_branch *br, ++ struct super_block *sb); ++ ++/* diropq flags */ ++#define AuDiropq_CREATE 1 ++#define au_ftest_diropq(flags, name) ((flags) & AuDiropq_##name) ++#define au_fset_diropq(flags, name) \ ++ do { (flags) |= AuDiropq_##name; } while (0) ++#define au_fclr_diropq(flags, name) \ ++ do { (flags) &= ~AuDiropq_##name; } while (0) ++ ++struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int flags); ++struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, ++ struct au_branch *br); ++struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent); ++ ++/* real rmdir for the whiteout-ed dir */ ++struct au_whtmp_rmdir { ++ struct inode *dir; ++ struct au_branch *br; ++ struct dentry *wh_dentry; ++ struct au_nhash whlist; ++}; ++ ++struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp); ++void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp); ++int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_nhash *whlist); ++void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_whtmp_rmdir *args); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct dentry *au_diropq_create(struct dentry *dentry, ++ aufs_bindex_t bindex) ++{ ++ return au_diropq_sio(dentry, bindex, AuDiropq_CREATE); ++} ++ ++static inline int au_diropq_remove(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ return PTR_ERR(au_diropq_sio(dentry, bindex, !AuDiropq_CREATE)); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_WHOUT_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/wkq.c linux-3.2.0-gentoo-r1/fs/aufs/wkq.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/wkq.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/wkq.c 2012-01-17 12:11:25.088085327 +0100 +@@ -0,0 +1,214 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/*
++ * workqueue for asynchronous/super-io operations
++ * todo: try new credential scheme
++ */
++
++#include <linux/module.h>
++#include "aufs.h"
++
++/* internal workqueue named AUFS_WKQ_NAME */
++
++static struct workqueue_struct *au_wkq;
++
++struct au_wkinfo {
++ struct work_struct wk;
++ struct kobject *kobj;
++
++ unsigned int flags; /* see wkq.h */
++
++ au_wkq_func_t func;
++ void *args;
++
++ struct completion *comp;
++};
++
++/* ---------------------------------------------------------------------- */
++
++static void wkq_func(struct work_struct *wk)
++{
++ struct au_wkinfo *wkinfo = container_of(wk, struct au_wkinfo, wk);
++
++ AuDebugOn(current_fsuid());
++ AuDebugOn(rlimit(RLIMIT_FSIZE) != RLIM_INFINITY);
++
++ wkinfo->func(wkinfo->args);
++ if (au_ftest_wkq(wkinfo->flags, WAIT))
++ complete(wkinfo->comp);
++ else {
++ kobject_put(wkinfo->kobj);
++ module_put(THIS_MODULE); /* todo: ?? */
++ kfree(wkinfo);
++ }
++}
++
++/*
++ * Since struct completion is large, try allocating it dynamically.
++ */
++#if defined(CONFIG_4KSTACKS) || defined(AuTest4KSTACKS)
++#define AuWkqCompDeclare(name) struct completion *comp = NULL
++
++static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp)
++{
++ *comp = kmalloc(sizeof(**comp), GFP_NOFS);
++ if (*comp) {
++ init_completion(*comp);
++ wkinfo->comp = *comp;
++ return 0;
++ }
++ return -ENOMEM;
++}
++
++static void au_wkq_comp_free(struct completion *comp)
++{
++ kfree(comp);
++}
++
++#else
++
++/* no braces */
++#define AuWkqCompDeclare(name) \
++ DECLARE_COMPLETION_ONSTACK(_ ## name); \
++ struct completion *comp = &_ ## name
++
++static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp)
++{
++ wkinfo->comp = *comp;
++ return 0;
++}
++
++static void au_wkq_comp_free(struct completion *comp __maybe_unused)
++{
++ /* empty */
++}
++#endif /* 4KSTACKS */
++
++static void au_wkq_run(struct au_wkinfo *wkinfo)
++{
++ if (au_ftest_wkq(wkinfo->flags, NEST)) {
++ if (au_wkq_test()) {
++ AuWarn1("wkq from wkq, due to a dead dir by UDBA?\n");
++ AuDebugOn(au_ftest_wkq(wkinfo->flags, WAIT));
++ }
++ } else
++ au_dbg_verify_kthread();
++
++ if (au_ftest_wkq(wkinfo->flags, WAIT)) {
++ INIT_WORK_ONSTACK(&wkinfo->wk, wkq_func);
++ queue_work(au_wkq, &wkinfo->wk);
++ } else {
++ INIT_WORK(&wkinfo->wk, wkq_func);
++ schedule_work(&wkinfo->wk);
++ }
++}
++
++/*
++ * Be careful. It is easy to make deadlock happen.
++ * processA: lock, wkq and wait
++ * processB: wkq and wait, lock in wkq
++ * --> deadlock
++ */
++int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args)
++{
++ int err;
++ AuWkqCompDeclare(comp);
++ struct au_wkinfo wkinfo = {
++ .flags = flags,
++ .func = func,
++ .args = args
++ };
++
++ err = au_wkq_comp_alloc(&wkinfo, &comp);
++ if (!err) {
++ au_wkq_run(&wkinfo);
++ /* no timeout, no interrupt */
++ wait_for_completion(wkinfo.comp);
++ au_wkq_comp_free(comp);
++ destroy_work_on_stack(&wkinfo.wk);
++ }
++
++ return err;
++
++}
++
++/*
++ * Note: dget/dput() in func for aufs dentries are not supported.
It will be a ++ * problem in a concurrent umounting. ++ */ ++int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb, ++ unsigned int flags) ++{ ++ int err; ++ struct au_wkinfo *wkinfo; ++ ++ atomic_inc(&au_sbi(sb)->si_nowait.nw_len); ++ ++ /* ++ * wkq_func() must free this wkinfo. ++ * it highly depends upon the implementation of workqueue. ++ */ ++ err = 0; ++ wkinfo = kmalloc(sizeof(*wkinfo), GFP_NOFS); ++ if (wkinfo) { ++ wkinfo->kobj = &au_sbi(sb)->si_kobj; ++ wkinfo->flags = flags & ~AuWkq_WAIT; ++ wkinfo->func = func; ++ wkinfo->args = args; ++ wkinfo->comp = NULL; ++ kobject_get(wkinfo->kobj); ++ __module_get(THIS_MODULE); /* todo: ?? */ ++ ++ au_wkq_run(wkinfo); ++ } else { ++ err = -ENOMEM; ++ au_nwt_done(&au_sbi(sb)->si_nowait); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_nwt_init(struct au_nowait_tasks *nwt) ++{ ++ atomic_set(&nwt->nw_len, 0); ++ /* smp_mb(); */ /* atomic_set */ ++ init_waitqueue_head(&nwt->nw_wq); ++} ++ ++void au_wkq_fin(void) ++{ ++ destroy_workqueue(au_wkq); ++} ++ ++int __init au_wkq_init(void) ++{ ++ int err; ++ ++ err = 0; ++ BUILD_BUG_ON(!WQ_RESCUER); ++ au_wkq = alloc_workqueue(AUFS_WKQ_NAME, !WQ_RESCUER, WQ_DFL_ACTIVE); ++ if (IS_ERR(au_wkq)) ++ err = PTR_ERR(au_wkq); ++ else if (!au_wkq) ++ err = -ENOMEM; ++ ++ return err; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/wkq.h linux-3.2.0-gentoo-r1/fs/aufs/wkq.h +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/wkq.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/wkq.h 2012-01-17 12:11:25.104289179 +0100 +@@ -0,0 +1,92 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * workqueue for asynchronous/super-io operations ++ * todo: try new credentials management scheme ++ */ ++ ++#ifndef __AUFS_WKQ_H__ ++#define __AUFS_WKQ_H__ ++ ++#ifdef __KERNEL__ ++ ++struct super_block; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * in the next operation, wait for the 'nowait' tasks in system-wide workqueue ++ */ ++struct au_nowait_tasks { ++ atomic_t nw_len; ++ wait_queue_head_t nw_wq; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++typedef void (*au_wkq_func_t)(void *args); ++ ++/* wkq flags */ ++#define AuWkq_WAIT 1 ++#define AuWkq_NEST (1 << 1) ++#define au_ftest_wkq(flags, name) ((flags) & AuWkq_##name) ++#define au_fset_wkq(flags, name) \ ++ do { (flags) |= AuWkq_##name; } while (0) ++#define au_fclr_wkq(flags, name) \ ++ do { (flags) &= ~AuWkq_##name; } while (0) ++ ++#ifndef CONFIG_AUFS_HNOTIFY ++#undef AuWkq_NEST ++#define AuWkq_NEST 0 ++#endif ++ ++/* wkq.c */ ++int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args); ++int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb, ++ unsigned int flags); ++void au_nwt_init(struct au_nowait_tasks *nwt); ++int __init au_wkq_init(void); ++void au_wkq_fin(void); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline int au_wkq_test(void) ++{ ++ return current->flags & PF_WQ_WORKER; ++} ++ ++static inline int au_wkq_wait(au_wkq_func_t func, void *args) ++{ ++ return au_wkq_do_wait(AuWkq_WAIT, func, args); ++} ++ ++static inline void au_nwt_done(struct au_nowait_tasks *nwt) ++{ ++ if (atomic_dec_and_test(&nwt->nw_len)) ++ wake_up_all(&nwt->nw_wq); ++} ++ ++static inline int au_nwt_flush(struct au_nowait_tasks *nwt) ++{ ++ wait_event(nwt->nw_wq, !atomic_read(&nwt->nw_len)); ++ return 0; ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_WKQ_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/aufs/xino.c linux-3.2.0-gentoo-r1/fs/aufs/xino.c +--- linux-3.2.0-gentoo-r1.orig//fs/aufs/xino.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/fs/aufs/xino.c 2012-01-17 12:11:25.129752370 +0100 +@@ -0,0 +1,1264 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * external inode number translation table and bitmap ++ */ ++ ++#include ++#include "aufs.h" ++ ++/* todo: unnecessary to support mmap_sem since kernel-space? 
*/ ++ssize_t xino_fread(au_readf_t func, struct file *file, void *kbuf, size_t size, ++ loff_t *pos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ union { ++ void *k; ++ char __user *u; ++ } buf; ++ ++ buf.k = kbuf; ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ do { ++ /* todo: signal_pending? */ ++ err = func(file, buf.u, size, pos); ++ } while (err == -EAGAIN || err == -EINTR); ++ set_fs(oldfs); ++ ++#if 0 /* reserved for future use */ ++ if (err > 0) ++ fsnotify_access(file->f_dentry); ++#endif ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static ssize_t do_xino_fwrite(au_writef_t func, struct file *file, void *kbuf, ++ size_t size, loff_t *pos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ union { ++ void *k; ++ const char __user *u; ++ } buf; ++ ++ buf.k = kbuf; ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ do { ++ /* todo: signal_pending? */ ++ err = func(file, buf.u, size, pos); ++ } while (err == -EAGAIN || err == -EINTR); ++ set_fs(oldfs); ++ ++#if 0 /* reserved for future use */ ++ if (err > 0) ++ fsnotify_modify(file->f_dentry); ++#endif ++ ++ return err; ++} ++ ++struct do_xino_fwrite_args { ++ ssize_t *errp; ++ au_writef_t func; ++ struct file *file; ++ void *buf; ++ size_t size; ++ loff_t *pos; ++}; ++ ++static void call_do_xino_fwrite(void *args) ++{ ++ struct do_xino_fwrite_args *a = args; ++ *a->errp = do_xino_fwrite(a->func, a->file, a->buf, a->size, a->pos); ++} ++ ++ssize_t xino_fwrite(au_writef_t func, struct file *file, void *buf, size_t size, ++ loff_t *pos) ++{ ++ ssize_t err; ++ ++ /* todo: signal block and no wkq? */ ++ if (rlimit(RLIMIT_FSIZE) == RLIM_INFINITY) { ++ lockdep_off(); ++ err = do_xino_fwrite(func, file, buf, size, pos); ++ lockdep_on(); ++ } else { ++ /* ++ * it breaks RLIMIT_FSIZE and normal user's limit, ++ * users should care about quota and real 'filesystem full.' ++ */ ++ int wkq_err; ++ struct do_xino_fwrite_args args = { ++ .errp = &err, ++ .func = func, ++ .file = file, ++ .buf = buf, ++ .size = size, ++ .pos = pos ++ }; ++ ++ wkq_err = au_wkq_wait(call_do_xino_fwrite, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create a new xinofile at the same place/path as @base_file. 
++ */ ++struct file *au_xino_create2(struct file *base_file, struct file *copy_src) ++{ ++ struct file *file; ++ struct dentry *base, *parent; ++ struct inode *dir; ++ struct qstr *name; ++ struct path path; ++ int err; ++ ++ base = base_file->f_dentry; ++ parent = base->d_parent; /* dir inode is locked */ ++ dir = parent->d_inode; ++ IMustLock(dir); ++ ++ file = ERR_PTR(-EINVAL); ++ name = &base->d_name; ++ path.dentry = vfsub_lookup_one_len(name->name, parent, name->len); ++ if (IS_ERR(path.dentry)) { ++ file = (void *)path.dentry; ++ pr_err("%.*s lookup err %ld\n", ++ AuLNPair(name), PTR_ERR(path.dentry)); ++ goto out; ++ } ++ ++ /* no need to mnt_want_write() since we call dentry_open() later */ ++ err = vfs_create(dir, path.dentry, S_IRUGO | S_IWUGO, NULL); ++ if (unlikely(err)) { ++ file = ERR_PTR(err); ++ pr_err("%.*s create err %d\n", AuLNPair(name), err); ++ goto out_dput; ++ } ++ ++ path.mnt = base_file->f_vfsmnt; ++ file = vfsub_dentry_open(&path, ++ O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE ++ /* | __FMODE_NONOTIFY */); ++ if (IS_ERR(file)) { ++ pr_err("%.*s open err %ld\n", AuLNPair(name), PTR_ERR(file)); ++ goto out_dput; ++ } ++ ++ err = vfsub_unlink(dir, &file->f_path, /*force*/0); ++ if (unlikely(err)) { ++ pr_err("%.*s unlink err %d\n", AuLNPair(name), err); ++ goto out_fput; ++ } ++ ++ if (copy_src) { ++ /* no one can touch copy_src xino */ ++ err = au_copy_file(file, copy_src, ++ i_size_read(copy_src->f_dentry->d_inode)); ++ if (unlikely(err)) { ++ pr_err("%.*s copy err %d\n", AuLNPair(name), err); ++ goto out_fput; ++ } ++ } ++ goto out_dput; /* success */ ++ ++out_fput: ++ fput(file); ++ file = ERR_PTR(err); ++out_dput: ++ dput(path.dentry); ++out: ++ return file; ++} ++ ++struct au_xino_lock_dir { ++ struct au_hinode *hdir; ++ struct dentry *parent; ++ struct mutex *mtx; ++}; ++ ++static void au_xino_lock_dir(struct super_block *sb, struct file *xino, ++ struct au_xino_lock_dir *ldir) ++{ ++ aufs_bindex_t brid, bindex; ++ ++ ldir->hdir = NULL; ++ bindex = -1; ++ brid = au_xino_brid(sb); ++ if (brid >= 0) ++ bindex = au_br_index(sb, brid); ++ if (bindex >= 0) { ++ ldir->hdir = au_hi(sb->s_root->d_inode, bindex); ++ au_hn_imtx_lock_nested(ldir->hdir, AuLsc_I_PARENT); ++ } else { ++ ldir->parent = dget_parent(xino->f_dentry); ++ ldir->mtx = &ldir->parent->d_inode->i_mutex; ++ mutex_lock_nested(ldir->mtx, AuLsc_I_PARENT); ++ } ++} ++ ++static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir) ++{ ++ if (ldir->hdir) ++ au_hn_imtx_unlock(ldir->hdir); ++ else { ++ mutex_unlock(ldir->mtx); ++ dput(ldir->parent); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* trucate xino files asynchronously */ ++ ++int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ int err; ++ aufs_bindex_t bi, bend; ++ struct au_branch *br; ++ struct file *new_xino, *file; ++ struct super_block *h_sb; ++ struct au_xino_lock_dir ldir; ++ ++ err = -EINVAL; ++ bend = au_sbend(sb); ++ if (unlikely(bindex < 0 || bend < bindex)) ++ goto out; ++ br = au_sbr(sb, bindex); ++ file = br->br_xino.xi_file; ++ if (!file) ++ goto out; ++ ++ au_xino_lock_dir(sb, file, &ldir); ++ /* mnt_want_write() is unnecessary here */ ++ new_xino = au_xino_create2(file, file); ++ au_xino_unlock_dir(&ldir); ++ err = PTR_ERR(new_xino); ++ if (IS_ERR(new_xino)) ++ goto out; ++ err = 0; ++ fput(file); ++ br->br_xino.xi_file = new_xino; ++ ++ h_sb = br->br_mnt->mnt_sb; ++ for (bi = 0; bi <= bend; bi++) { ++ if (unlikely(bi == bindex)) ++ continue; ++ br = au_sbr(sb, 
bi); ++ if (br->br_mnt->mnt_sb != h_sb) ++ continue; ++ ++ fput(br->br_xino.xi_file); ++ br->br_xino.xi_file = new_xino; ++ get_file(new_xino); ++ } ++ ++out: ++ return err; ++} ++ ++struct xino_do_trunc_args { ++ struct super_block *sb; ++ struct au_branch *br; ++}; ++ ++static void xino_do_trunc(void *_args) ++{ ++ struct xino_do_trunc_args *args = _args; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct inode *dir; ++ int err; ++ aufs_bindex_t bindex; ++ ++ err = 0; ++ sb = args->sb; ++ dir = sb->s_root->d_inode; ++ br = args->br; ++ ++ si_noflush_write_lock(sb); ++ ii_read_lock_parent(dir); ++ bindex = au_br_index(sb, br->br_id); ++ err = au_xino_trunc(sb, bindex); ++ if (!err ++ && br->br_xino.xi_file->f_dentry->d_inode->i_blocks ++ >= br->br_xino_upper) ++ br->br_xino_upper += AUFS_XINO_TRUNC_STEP; ++ ++ ii_read_unlock(dir); ++ if (unlikely(err)) ++ pr_warning("err b%d, (%d)\n", bindex, err); ++ atomic_dec(&br->br_xino_running); ++ atomic_dec(&br->br_count); ++ si_write_unlock(sb); ++ au_nwt_done(&au_sbi(sb)->si_nowait); ++ kfree(args); ++} ++ ++static void xino_try_trunc(struct super_block *sb, struct au_branch *br) ++{ ++ struct xino_do_trunc_args *args; ++ int wkq_err; ++ ++ if (br->br_xino.xi_file->f_dentry->d_inode->i_blocks ++ < br->br_xino_upper) ++ return; ++ ++ if (atomic_inc_return(&br->br_xino_running) > 1) ++ goto out; ++ ++ /* lock and kfree() will be called in trunc_xino() */ ++ args = kmalloc(sizeof(*args), GFP_NOFS); ++ if (unlikely(!args)) { ++ AuErr1("no memory\n"); ++ goto out_args; ++ } ++ ++ atomic_inc(&br->br_count); ++ args->sb = sb; ++ args->br = br; ++ wkq_err = au_wkq_nowait(xino_do_trunc, args, sb, /*flags*/0); ++ if (!wkq_err) ++ return; /* success */ ++ ++ pr_err("wkq %d\n", wkq_err); ++ atomic_dec(&br->br_count); ++ ++out_args: ++ kfree(args); ++out: ++ atomic_dec(&br->br_xino_running); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_xino_do_write(au_writef_t write, struct file *file, ++ ino_t h_ino, ino_t ino) ++{ ++ loff_t pos; ++ ssize_t sz; ++ ++ pos = h_ino; ++ if (unlikely(au_loff_max / sizeof(ino) - 1 < pos)) { ++ AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); ++ return -EFBIG; ++ } ++ pos *= sizeof(ino); ++ sz = xino_fwrite(write, file, &ino, sizeof(ino), &pos); ++ if (sz == sizeof(ino)) ++ return 0; /* success */ ++ ++ AuIOErr("write failed (%zd)\n", sz); ++ return -EIO; ++} ++ ++/* ++ * write @ino to the xinofile for the specified branch{@sb, @bindex} ++ * at the position of @h_ino. ++ * even if @ino is zero, it is written to the xinofile and means no entry. ++ * if the size of the xino file on a specific filesystem exceeds the watermark, ++ * try truncating it. 
++ */ ++int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t ino) ++{ ++ int err; ++ unsigned int mnt_flags; ++ struct au_branch *br; ++ ++ BUILD_BUG_ON(sizeof(long long) != sizeof(au_loff_max) ++ || ((loff_t)-1) > 0); ++ SiMustAnyLock(sb); ++ ++ mnt_flags = au_mntflags(sb); ++ if (!au_opt_test(mnt_flags, XINO)) ++ return 0; ++ ++ br = au_sbr(sb, bindex); ++ err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, ++ h_ino, ino); ++ if (!err) { ++ if (au_opt_test(mnt_flags, TRUNC_XINO) ++ && au_test_fs_trunc_xino(br->br_mnt->mnt_sb)) ++ xino_try_trunc(sb, br); ++ return 0; /* success */ ++ } ++ ++ AuIOErr("write failed (%d)\n", err); ++ return -EIO; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* aufs inode number bitmap */ ++ ++static const int page_bits = (int)PAGE_SIZE * BITS_PER_BYTE; ++static ino_t xib_calc_ino(unsigned long pindex, int bit) ++{ ++ ino_t ino; ++ ++ AuDebugOn(bit < 0 || page_bits <= bit); ++ ino = AUFS_FIRST_INO + pindex * page_bits + bit; ++ return ino; ++} ++ ++static void xib_calc_bit(ino_t ino, unsigned long *pindex, int *bit) ++{ ++ AuDebugOn(ino < AUFS_FIRST_INO); ++ ino -= AUFS_FIRST_INO; ++ *pindex = ino / page_bits; ++ *bit = ino % page_bits; ++} ++ ++static int xib_pindex(struct super_block *sb, unsigned long pindex) ++{ ++ int err; ++ loff_t pos; ++ ssize_t sz; ++ struct au_sbinfo *sbinfo; ++ struct file *xib; ++ unsigned long *p; ++ ++ sbinfo = au_sbi(sb); ++ MtxMustLock(&sbinfo->si_xib_mtx); ++ AuDebugOn(pindex > ULONG_MAX / PAGE_SIZE ++ || !au_opt_test(sbinfo->si_mntflags, XINO)); ++ ++ if (pindex == sbinfo->si_xib_last_pindex) ++ return 0; ++ ++ xib = sbinfo->si_xib; ++ p = sbinfo->si_xib_buf; ++ pos = sbinfo->si_xib_last_pindex; ++ pos *= PAGE_SIZE; ++ sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); ++ if (unlikely(sz != PAGE_SIZE)) ++ goto out; ++ ++ pos = pindex; ++ pos *= PAGE_SIZE; ++ if (i_size_read(xib->f_dentry->d_inode) >= pos + PAGE_SIZE) ++ sz = xino_fread(sbinfo->si_xread, xib, p, PAGE_SIZE, &pos); ++ else { ++ memset(p, 0, PAGE_SIZE); ++ sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); ++ } ++ if (sz == PAGE_SIZE) { ++ sbinfo->si_xib_last_pindex = pindex; ++ return 0; /* success */ ++ } ++ ++out: ++ AuIOErr1("write failed (%zd)\n", sz); ++ err = sz; ++ if (sz >= 0) ++ err = -EIO; ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_xib_clear_bit(struct inode *inode) ++{ ++ int err, bit; ++ unsigned long pindex; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++ AuDebugOn(inode->i_nlink); ++ ++ sb = inode->i_sb; ++ xib_calc_bit(inode->i_ino, &pindex, &bit); ++ AuDebugOn(page_bits <= bit); ++ sbinfo = au_sbi(sb); ++ mutex_lock(&sbinfo->si_xib_mtx); ++ err = xib_pindex(sb, pindex); ++ if (!err) { ++ clear_bit(bit, sbinfo->si_xib_buf); ++ sbinfo->si_xib_next_bit = bit; ++ } ++ mutex_unlock(&sbinfo->si_xib_mtx); ++} ++ ++/* for s_op->delete_inode() */ ++void au_xino_delete_inode(struct inode *inode, const int unlinked) ++{ ++ int err; ++ unsigned int mnt_flags; ++ aufs_bindex_t bindex, bend, bi; ++ unsigned char try_trunc; ++ struct au_iinfo *iinfo; ++ struct super_block *sb; ++ struct au_hinode *hi; ++ struct inode *h_inode; ++ struct au_branch *br; ++ au_writef_t xwrite; ++ ++ sb = inode->i_sb; ++ mnt_flags = au_mntflags(sb); ++ if (!au_opt_test(mnt_flags, XINO) ++ || inode->i_ino == AUFS_ROOT_INO) ++ return; ++ ++ if (unlinked) { ++ 
au_xigen_inc(inode); ++ au_xib_clear_bit(inode); ++ } ++ ++ iinfo = au_ii(inode); ++ if (!iinfo) ++ return; ++ ++ bindex = iinfo->ii_bstart; ++ if (bindex < 0) ++ return; ++ ++ xwrite = au_sbi(sb)->si_xwrite; ++ try_trunc = !!au_opt_test(mnt_flags, TRUNC_XINO); ++ hi = iinfo->ii_hinode + bindex; ++ bend = iinfo->ii_bend; ++ for (; bindex <= bend; bindex++, hi++) { ++ h_inode = hi->hi_inode; ++ if (!h_inode ++ || (!unlinked && h_inode->i_nlink)) ++ continue; ++ ++ /* inode may not be revalidated */ ++ bi = au_br_index(sb, hi->hi_id); ++ if (bi < 0) ++ continue; ++ ++ br = au_sbr(sb, bi); ++ err = au_xino_do_write(xwrite, br->br_xino.xi_file, ++ h_inode->i_ino, /*ino*/0); ++ if (!err && try_trunc ++ && au_test_fs_trunc_xino(br->br_mnt->mnt_sb)) ++ xino_try_trunc(sb, br); ++ } ++} ++ ++/* get an unused inode number from bitmap */ ++ino_t au_xino_new_ino(struct super_block *sb) ++{ ++ ino_t ino; ++ unsigned long *p, pindex, ul, pend; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ int free_bit, err; ++ ++ if (!au_opt_test(au_mntflags(sb), XINO)) ++ return iunique(sb, AUFS_FIRST_INO); ++ ++ sbinfo = au_sbi(sb); ++ mutex_lock(&sbinfo->si_xib_mtx); ++ p = sbinfo->si_xib_buf; ++ free_bit = sbinfo->si_xib_next_bit; ++ if (free_bit < page_bits && !test_bit(free_bit, p)) ++ goto out; /* success */ ++ free_bit = find_first_zero_bit(p, page_bits); ++ if (free_bit < page_bits) ++ goto out; /* success */ ++ ++ pindex = sbinfo->si_xib_last_pindex; ++ for (ul = pindex - 1; ul < ULONG_MAX; ul--) { ++ err = xib_pindex(sb, ul); ++ if (unlikely(err)) ++ goto out_err; ++ free_bit = find_first_zero_bit(p, page_bits); ++ if (free_bit < page_bits) ++ goto out; /* success */ ++ } ++ ++ file = sbinfo->si_xib; ++ pend = i_size_read(file->f_dentry->d_inode) / PAGE_SIZE; ++ for (ul = pindex + 1; ul <= pend; ul++) { ++ err = xib_pindex(sb, ul); ++ if (unlikely(err)) ++ goto out_err; ++ free_bit = find_first_zero_bit(p, page_bits); ++ if (free_bit < page_bits) ++ goto out; /* success */ ++ } ++ BUG(); ++ ++out: ++ set_bit(free_bit, p); ++ sbinfo->si_xib_next_bit = free_bit + 1; ++ pindex = sbinfo->si_xib_last_pindex; ++ mutex_unlock(&sbinfo->si_xib_mtx); ++ ino = xib_calc_ino(pindex, free_bit); ++ AuDbg("i%lu\n", (unsigned long)ino); ++ return ino; ++out_err: ++ mutex_unlock(&sbinfo->si_xib_mtx); ++ AuDbg("i0\n"); ++ return 0; ++} ++ ++/* ++ * read @ino from xinofile for the specified branch{@sb, @bindex} ++ * at the position of @h_ino. ++ * if @ino does not exist and @do_new is true, get new one. 
++ */ ++int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t *ino) ++{ ++ int err; ++ ssize_t sz; ++ loff_t pos; ++ struct file *file; ++ struct au_sbinfo *sbinfo; ++ ++ *ino = 0; ++ if (!au_opt_test(au_mntflags(sb), XINO)) ++ return 0; /* no xino */ ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ pos = h_ino; ++ if (unlikely(au_loff_max / sizeof(*ino) - 1 < pos)) { ++ AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); ++ return -EFBIG; ++ } ++ pos *= sizeof(*ino); ++ ++ file = au_sbr(sb, bindex)->br_xino.xi_file; ++ if (i_size_read(file->f_dentry->d_inode) < pos + sizeof(*ino)) ++ return 0; /* no ino */ ++ ++ sz = xino_fread(sbinfo->si_xread, file, ino, sizeof(*ino), &pos); ++ if (sz == sizeof(*ino)) ++ return 0; /* success */ ++ ++ err = sz; ++ if (unlikely(sz >= 0)) { ++ err = -EIO; ++ AuIOErr("xino read error (%zd)\n", sz); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* create and set a new xino file */ ++ ++struct file *au_xino_create(struct super_block *sb, char *fname, int silent) ++{ ++ struct file *file; ++ struct dentry *h_parent, *d; ++ struct inode *h_dir; ++ int err; ++ ++ /* ++ * at mount-time, and the xino file is the default path, ++ * hnotify is disabled so we have no notify events to ignore. ++ * when a user specified the xino, we cannot get au_hdir to be ignored. ++ */ ++ file = vfsub_filp_open(fname, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE ++ /* | __FMODE_NONOTIFY */, ++ S_IRUGO | S_IWUGO); ++ if (IS_ERR(file)) { ++ if (!silent) ++ pr_err("open %s(%ld)\n", fname, PTR_ERR(file)); ++ return file; ++ } ++ ++ /* keep file count */ ++ h_parent = dget_parent(file->f_dentry); ++ h_dir = h_parent->d_inode; ++ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); ++ /* mnt_want_write() is unnecessary here */ ++ err = vfsub_unlink(h_dir, &file->f_path, /*force*/0); ++ mutex_unlock(&h_dir->i_mutex); ++ dput(h_parent); ++ if (unlikely(err)) { ++ if (!silent) ++ pr_err("unlink %s(%d)\n", fname, err); ++ goto out; ++ } ++ ++ err = -EINVAL; ++ d = file->f_dentry; ++ if (unlikely(sb == d->d_sb)) { ++ if (!silent) ++ pr_err("%s must be outside\n", fname); ++ goto out; ++ } ++ if (unlikely(au_test_fs_bad_xino(d->d_sb))) { ++ if (!silent) ++ pr_err("xino doesn't support %s(%s)\n", ++ fname, au_sbtype(d->d_sb)); ++ goto out; ++ } ++ return file; /* success */ ++ ++out: ++ fput(file); ++ file = ERR_PTR(err); ++ return file; ++} ++ ++/* ++ * find another branch who is on the same filesystem of the specified ++ * branch{@btgt}. search until @bend. ++ */ ++static int is_sb_shared(struct super_block *sb, aufs_bindex_t btgt, ++ aufs_bindex_t bend) ++{ ++ aufs_bindex_t bindex; ++ struct super_block *tgt_sb = au_sbr_sb(sb, btgt); ++ ++ for (bindex = 0; bindex < btgt; bindex++) ++ if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) ++ return bindex; ++ for (bindex++; bindex <= bend; bindex++) ++ if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) ++ return bindex; ++ return -1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * initialize the xinofile for the specified branch @br ++ * at the place/path where @base_file indicates. ++ * test whether another branch is on the same filesystem or not, ++ * if @do_test is true. 
++ */ ++int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t h_ino, ++ struct file *base_file, int do_test) ++{ ++ int err; ++ ino_t ino; ++ aufs_bindex_t bend, bindex; ++ struct au_branch *shared_br, *b; ++ struct file *file; ++ struct super_block *tgt_sb; ++ ++ shared_br = NULL; ++ bend = au_sbend(sb); ++ if (do_test) { ++ tgt_sb = br->br_mnt->mnt_sb; ++ for (bindex = 0; bindex <= bend; bindex++) { ++ b = au_sbr(sb, bindex); ++ if (tgt_sb == b->br_mnt->mnt_sb) { ++ shared_br = b; ++ break; ++ } ++ } ++ } ++ ++ if (!shared_br || !shared_br->br_xino.xi_file) { ++ struct au_xino_lock_dir ldir; ++ ++ au_xino_lock_dir(sb, base_file, &ldir); ++ /* mnt_want_write() is unnecessary here */ ++ file = au_xino_create2(base_file, NULL); ++ au_xino_unlock_dir(&ldir); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ br->br_xino.xi_file = file; ++ } else { ++ br->br_xino.xi_file = shared_br->br_xino.xi_file; ++ get_file(br->br_xino.xi_file); ++ } ++ ++ ino = AUFS_ROOT_INO; ++ err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, ++ h_ino, ino); ++ if (unlikely(err)) { ++ fput(br->br_xino.xi_file); ++ br->br_xino.xi_file = NULL; ++ } ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* trucate a xino bitmap file */ ++ ++/* todo: slow */ ++static int do_xib_restore(struct super_block *sb, struct file *file, void *page) ++{ ++ int err, bit; ++ ssize_t sz; ++ unsigned long pindex; ++ loff_t pos, pend; ++ struct au_sbinfo *sbinfo; ++ au_readf_t func; ++ ino_t *ino; ++ unsigned long *p; ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ MtxMustLock(&sbinfo->si_xib_mtx); ++ p = sbinfo->si_xib_buf; ++ func = sbinfo->si_xread; ++ pend = i_size_read(file->f_dentry->d_inode); ++ pos = 0; ++ while (pos < pend) { ++ sz = xino_fread(func, file, page, PAGE_SIZE, &pos); ++ err = sz; ++ if (unlikely(sz <= 0)) ++ goto out; ++ ++ err = 0; ++ for (ino = page; sz > 0; ino++, sz -= sizeof(ino)) { ++ if (unlikely(*ino < AUFS_FIRST_INO)) ++ continue; ++ ++ xib_calc_bit(*ino, &pindex, &bit); ++ AuDebugOn(page_bits <= bit); ++ err = xib_pindex(sb, pindex); ++ if (!err) ++ set_bit(bit, p); ++ else ++ goto out; ++ } ++ } ++ ++out: ++ return err; ++} ++ ++static int xib_restore(struct super_block *sb) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ void *page; ++ ++ err = -ENOMEM; ++ page = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!page)) ++ goto out; ++ ++ err = 0; ++ bend = au_sbend(sb); ++ for (bindex = 0; !err && bindex <= bend; bindex++) ++ if (!bindex || is_sb_shared(sb, bindex, bindex - 1) < 0) ++ err = do_xib_restore ++ (sb, au_sbr(sb, bindex)->br_xino.xi_file, page); ++ else ++ AuDbg("b%d\n", bindex); ++ free_page((unsigned long)page); ++ ++out: ++ return err; ++} ++ ++int au_xib_trunc(struct super_block *sb) ++{ ++ int err; ++ ssize_t sz; ++ loff_t pos; ++ struct au_xino_lock_dir ldir; ++ struct au_sbinfo *sbinfo; ++ unsigned long *p; ++ struct file *file; ++ ++ SiMustWriteLock(sb); ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ if (!au_opt_test(sbinfo->si_mntflags, XINO)) ++ goto out; ++ ++ file = sbinfo->si_xib; ++ if (i_size_read(file->f_dentry->d_inode) <= PAGE_SIZE) ++ goto out; ++ ++ au_xino_lock_dir(sb, file, &ldir); ++ /* mnt_want_write() is unnecessary here */ ++ file = au_xino_create2(sbinfo->si_xib, NULL); ++ au_xino_unlock_dir(&ldir); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = file; ++ ++ p = sbinfo->si_xib_buf; ++ memset(p, 0, PAGE_SIZE); ++ pos = 
0; ++ sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xib, p, PAGE_SIZE, &pos); ++ if (unlikely(sz != PAGE_SIZE)) { ++ err = sz; ++ AuIOErr("err %d\n", err); ++ if (sz >= 0) ++ err = -EIO; ++ goto out; ++ } ++ ++ mutex_lock(&sbinfo->si_xib_mtx); ++ /* mnt_want_write() is unnecessary here */ ++ err = xib_restore(sb); ++ mutex_unlock(&sbinfo->si_xib_mtx); ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * xino mount option handlers ++ */ ++static au_readf_t find_readf(struct file *h_file) ++{ ++ const struct file_operations *fop = h_file->f_op; ++ ++ if (fop) { ++ if (fop->read) ++ return fop->read; ++ if (fop->aio_read) ++ return do_sync_read; ++ } ++ return ERR_PTR(-ENOSYS); ++} ++ ++static au_writef_t find_writef(struct file *h_file) ++{ ++ const struct file_operations *fop = h_file->f_op; ++ ++ if (fop) { ++ if (fop->write) ++ return fop->write; ++ if (fop->aio_write) ++ return do_sync_write; ++ } ++ return ERR_PTR(-ENOSYS); ++} ++ ++/* xino bitmap */ ++static void xino_clear_xib(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ sbinfo->si_xread = NULL; ++ sbinfo->si_xwrite = NULL; ++ if (sbinfo->si_xib) ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = NULL; ++ free_page((unsigned long)sbinfo->si_xib_buf); ++ sbinfo->si_xib_buf = NULL; ++} ++ ++static int au_xino_set_xib(struct super_block *sb, struct file *base) ++{ ++ int err; ++ loff_t pos; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ file = au_xino_create2(base, sbinfo->si_xib); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ if (sbinfo->si_xib) ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = file; ++ sbinfo->si_xread = find_readf(file); ++ sbinfo->si_xwrite = find_writef(file); ++ ++ err = -ENOMEM; ++ if (!sbinfo->si_xib_buf) ++ sbinfo->si_xib_buf = (void *)get_zeroed_page(GFP_NOFS); ++ if (unlikely(!sbinfo->si_xib_buf)) ++ goto out_unset; ++ ++ sbinfo->si_xib_last_pindex = 0; ++ sbinfo->si_xib_next_bit = 0; ++ if (i_size_read(file->f_dentry->d_inode) < PAGE_SIZE) { ++ pos = 0; ++ err = xino_fwrite(sbinfo->si_xwrite, file, sbinfo->si_xib_buf, ++ PAGE_SIZE, &pos); ++ if (unlikely(err != PAGE_SIZE)) ++ goto out_free; ++ } ++ err = 0; ++ goto out; /* success */ ++ ++out_free: ++ free_page((unsigned long)sbinfo->si_xib_buf); ++ sbinfo->si_xib_buf = NULL; ++ if (err >= 0) ++ err = -EIO; ++out_unset: ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = NULL; ++ sbinfo->si_xread = NULL; ++ sbinfo->si_xwrite = NULL; ++out: ++ return err; ++} ++ ++/* xino for each branch */ ++static void xino_clear_br(struct super_block *sb) ++{ ++ aufs_bindex_t bindex, bend; ++ struct au_branch *br; ++ ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (!br || !br->br_xino.xi_file) ++ continue; ++ ++ fput(br->br_xino.xi_file); ++ br->br_xino.xi_file = NULL; ++ } ++} ++ ++static int au_xino_set_br(struct super_block *sb, struct file *base) ++{ ++ int err; ++ ino_t ino; ++ aufs_bindex_t bindex, bend, bshared; ++ struct { ++ struct file *old, *new; ++ } *fpair, *p; ++ struct au_branch *br; ++ struct inode *inode; ++ au_writef_t writef; ++ ++ SiMustWriteLock(sb); ++ ++ err = -ENOMEM; ++ bend = au_sbend(sb); ++ fpair = kcalloc(bend + 1, sizeof(*fpair), GFP_NOFS); ++ if (unlikely(!fpair)) ++ goto out; ++ ++ inode = sb->s_root->d_inode; ++ ino = AUFS_ROOT_INO; ++ writef = au_sbi(sb)->si_xwrite; ++ for (bindex = 0, p 
= fpair; bindex <= bend; bindex++, p++) { ++ br = au_sbr(sb, bindex); ++ bshared = is_sb_shared(sb, bindex, bindex - 1); ++ if (bshared >= 0) { ++ /* shared xino */ ++ *p = fpair[bshared]; ++ get_file(p->new); ++ } ++ ++ if (!p->new) { ++ /* new xino */ ++ p->old = br->br_xino.xi_file; ++ p->new = au_xino_create2(base, br->br_xino.xi_file); ++ err = PTR_ERR(p->new); ++ if (IS_ERR(p->new)) { ++ p->new = NULL; ++ goto out_pair; ++ } ++ } ++ ++ err = au_xino_do_write(writef, p->new, ++ au_h_iptr(inode, bindex)->i_ino, ino); ++ if (unlikely(err)) ++ goto out_pair; ++ } ++ ++ for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) { ++ br = au_sbr(sb, bindex); ++ if (br->br_xino.xi_file) ++ fput(br->br_xino.xi_file); ++ get_file(p->new); ++ br->br_xino.xi_file = p->new; ++ } ++ ++out_pair: ++ for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) ++ if (p->new) ++ fput(p->new); ++ else ++ break; ++ kfree(fpair); ++out: ++ return err; ++} ++ ++void au_xino_clr(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ au_xigen_clr(sb); ++ xino_clear_xib(sb); ++ xino_clear_br(sb); ++ sbinfo = au_sbi(sb); ++ /* lvalue, do not call au_mntflags() */ ++ au_opt_clr(sbinfo->si_mntflags, XINO); ++} ++ ++int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount) ++{ ++ int err, skip; ++ struct dentry *parent, *cur_parent; ++ struct qstr *dname, *cur_name; ++ struct file *cur_xino; ++ struct inode *dir; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ parent = dget_parent(xino->file->f_dentry); ++ if (remount) { ++ skip = 0; ++ dname = &xino->file->f_dentry->d_name; ++ cur_xino = sbinfo->si_xib; ++ if (cur_xino) { ++ cur_parent = dget_parent(cur_xino->f_dentry); ++ cur_name = &cur_xino->f_dentry->d_name; ++ skip = (cur_parent == parent ++ && dname->len == cur_name->len ++ && !memcmp(dname->name, cur_name->name, ++ dname->len)); ++ dput(cur_parent); ++ } ++ if (skip) ++ goto out; ++ } ++ ++ au_opt_set(sbinfo->si_mntflags, XINO); ++ dir = parent->d_inode; ++ mutex_lock_nested(&dir->i_mutex, AuLsc_I_PARENT); ++ /* mnt_want_write() is unnecessary here */ ++ err = au_xino_set_xib(sb, xino->file); ++ if (!err) ++ err = au_xigen_set(sb, xino->file); ++ if (!err) ++ err = au_xino_set_br(sb, xino->file); ++ mutex_unlock(&dir->i_mutex); ++ if (!err) ++ goto out; /* success */ ++ ++ /* reset all */ ++ AuIOErr("failed creating xino(%d).\n", err); ++ ++out: ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create a xinofile at the default place/path. 
++ */ ++struct file *au_xino_def(struct super_block *sb) ++{ ++ struct file *file; ++ char *page, *p; ++ struct au_branch *br; ++ struct super_block *h_sb; ++ struct path path; ++ aufs_bindex_t bend, bindex, bwr; ++ ++ br = NULL; ++ bend = au_sbend(sb); ++ bwr = -1; ++ for (bindex = 0; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (au_br_writable(br->br_perm) ++ && !au_test_fs_bad_xino(br->br_mnt->mnt_sb)) { ++ bwr = bindex; ++ break; ++ } ++ } ++ ++ if (bwr >= 0) { ++ file = ERR_PTR(-ENOMEM); ++ page = __getname_gfp(GFP_NOFS); ++ if (unlikely(!page)) ++ goto out; ++ path.mnt = br->br_mnt; ++ path.dentry = au_h_dptr(sb->s_root, bwr); ++ p = d_path(&path, page, PATH_MAX - sizeof(AUFS_XINO_FNAME)); ++ file = (void *)p; ++ if (!IS_ERR(p)) { ++ strcat(p, "/" AUFS_XINO_FNAME); ++ AuDbg("%s\n", p); ++ file = au_xino_create(sb, p, /*silent*/0); ++ if (!IS_ERR(file)) ++ au_xino_brid_set(sb, br->br_id); ++ } ++ __putname(page); ++ } else { ++ file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0); ++ if (IS_ERR(file)) ++ goto out; ++ h_sb = file->f_dentry->d_sb; ++ if (unlikely(au_test_fs_bad_xino(h_sb))) { ++ pr_err("xino doesn't support %s(%s)\n", ++ AUFS_XINO_DEFPATH, au_sbtype(h_sb)); ++ fput(file); ++ file = ERR_PTR(-EINVAL); ++ } ++ if (!IS_ERR(file)) ++ au_xino_brid_set(sb, -1); ++ } ++ ++out: ++ return file; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_xino_path(struct seq_file *seq, struct file *file) ++{ ++ int err; ++ ++ err = au_seq_path(seq, &file->f_path); ++ if (unlikely(err < 0)) ++ goto out; ++ ++ err = 0; ++#define Deleted "\\040(deleted)" ++ seq->count -= sizeof(Deleted) - 1; ++ AuDebugOn(memcmp(seq->buf + seq->count, Deleted, ++ sizeof(Deleted) - 1)); ++#undef Deleted ++ ++out: ++ return err; ++} +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/file_table.c linux-3.2.0-gentoo-r1/fs/file_table.c +--- linux-3.2.0-gentoo-r1.orig//fs/file_table.c 2012-01-17 11:56:05.850634078 +0100 ++++ linux-3.2.0-gentoo-r1/fs/file_table.c 2012-01-17 12:09:55.948393073 +0100 +@@ -443,6 +443,8 @@ + } + } + ++EXPORT_SYMBOL(file_sb_list_del); ++ + #ifdef CONFIG_SMP + + /* +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/inode.c linux-3.2.0-gentoo-r1/fs/inode.c +--- linux-3.2.0-gentoo-r1.orig//fs/inode.c 2012-01-17 11:56:10.684010972 +0100 ++++ linux-3.2.0-gentoo-r1/fs/inode.c 2012-01-17 12:09:55.969226594 +0100 +@@ -65,6 +65,7 @@ + static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); + + __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); ++EXPORT_SYMBOL(inode_sb_list_lock); + + /* + * Empty aops. 
Can be used for the cases where the user does not +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/Kconfig linux-3.2.0-gentoo-r1/fs/Kconfig +--- linux-3.2.0-gentoo-r1.orig//fs/Kconfig 2012-01-17 11:56:10.732622521 +0100 ++++ linux-3.2.0-gentoo-r1/fs/Kconfig 2012-01-17 12:09:24.806908708 +0100 +@@ -215,6 +215,7 @@ + source "fs/sysv/Kconfig" + source "fs/ufs/Kconfig" + source "fs/exofs/Kconfig" ++source "fs/aufs/Kconfig" + + endif # MISC_FILESYSTEMS + +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/Makefile linux-3.2.0-gentoo-r1/fs/Makefile +--- linux-3.2.0-gentoo-r1.orig//fs/Makefile 2012-01-17 11:56:14.626176130 +0100 ++++ linux-3.2.0-gentoo-r1/fs/Makefile 2012-01-17 12:09:24.924965327 +0100 +@@ -123,3 +123,4 @@ + obj-y += exofs/ # Multiple modules + obj-$(CONFIG_CEPH_FS) += ceph/ + obj-$(CONFIG_PSTORE) += pstore/ ++obj-$(CONFIG_AUFS_FS) += aufs/ +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/namei.c linux-3.2.0-gentoo-r1/fs/namei.c +--- linux-3.2.0-gentoo-r1.orig//fs/namei.c 2012-01-17 11:56:13.778946273 +0100 ++++ linux-3.2.0-gentoo-r1/fs/namei.c 2012-01-17 12:09:55.971541430 +0100 +@@ -1753,10 +1753,11 @@ + * needs parent already locked. Doesn't follow mounts. + * SMP-safe. + */ +-static struct dentry *lookup_hash(struct nameidata *nd) ++struct dentry *lookup_hash(struct nameidata *nd) + { + return __lookup_hash(&nd->last, nd->path.dentry, nd); + } ++EXPORT_SYMBOL(lookup_hash); + + /** + * lookup_one_len - filesystem helper to lookup single pathname component +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/namespace.c linux-3.2.0-gentoo-r1/fs/namespace.c +--- linux-3.2.0-gentoo-r1.orig//fs/namespace.c 2012-01-17 11:56:14.563675567 +0100 ++++ linux-3.2.0-gentoo-r1/fs/namespace.c 2012-01-17 12:09:55.973856265 +0100 +@@ -1506,6 +1506,7 @@ + } + return 0; + } ++EXPORT_SYMBOL(iterate_mounts); + + static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) + { +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/notify/group.c linux-3.2.0-gentoo-r1/fs/notify/group.c +--- linux-3.2.0-gentoo-r1.orig//fs/notify/group.c 2012-01-17 11:56:07.028885437 +0100 ++++ linux-3.2.0-gentoo-r1/fs/notify/group.c 2012-01-17 12:09:56.087283212 +0100 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + + #include + #include "fsnotify.h" +@@ -70,6 +71,7 @@ + if (atomic_dec_and_test(&group->refcnt)) + fsnotify_destroy_group(group); + } ++EXPORT_SYMBOL(fsnotify_put_group); + + /* + * Create a new fsnotify_group and hold a reference for the group returned. +@@ -102,3 +104,4 @@ + + return group; + } ++EXPORT_SYMBOL(fsnotify_alloc_group); +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/notify/mark.c linux-3.2.0-gentoo-r1/fs/notify/mark.c +--- linux-3.2.0-gentoo-r1.orig//fs/notify/mark.c 2012-01-17 11:56:07.014996423 +0100 ++++ linux-3.2.0-gentoo-r1/fs/notify/mark.c 2012-01-17 12:09:56.128950255 +0100 +@@ -112,6 +112,7 @@ + if (atomic_dec_and_test(&mark->refcnt)) + mark->free_mark(mark); + } ++EXPORT_SYMBOL(fsnotify_put_mark); + + /* + * Any time a mark is getting freed we end up here. 
+@@ -189,6 +190,7 @@ + if (unlikely(atomic_dec_and_test(&group->num_marks))) + fsnotify_final_destroy_group(group); + } ++EXPORT_SYMBOL(fsnotify_destroy_mark); + + void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) + { +@@ -276,6 +278,7 @@ + + return ret; + } ++EXPORT_SYMBOL(fsnotify_add_mark); + + /* + * clear any marks in a group in which mark->flags & flags is true +@@ -331,6 +334,7 @@ + atomic_set(&mark->refcnt, 1); + mark->free_mark = free_mark; + } ++EXPORT_SYMBOL(fsnotify_init_mark); + + static int fsnotify_mark_destroy(void *ignored) + { +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/open.c linux-3.2.0-gentoo-r1/fs/open.c +--- linux-3.2.0-gentoo-r1.orig//fs/open.c 2012-01-17 11:56:14.554416224 +0100 ++++ linux-3.2.0-gentoo-r1/fs/open.c 2012-01-17 12:09:56.149783776 +0100 +@@ -60,6 +60,7 @@ + mutex_unlock(&dentry->d_inode->i_mutex); + return ret; + } ++EXPORT_SYMBOL(do_truncate); + + static long do_sys_truncate(const char __user *pathname, loff_t length) + { +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/proc/nommu.c linux-3.2.0-gentoo-r1/fs/proc/nommu.c +--- linux-3.2.0-gentoo-r1.orig//fs/proc/nommu.c 2012-01-17 11:56:13.538203362 +0100 ++++ linux-3.2.0-gentoo-r1/fs/proc/nommu.c 2012-01-17 12:09:43.920506893 +0100 +@@ -46,6 +46,10 @@ + + if (file) { + struct inode *inode = region->vm_file->f_path.dentry->d_inode; ++ if (region->vm_prfile) { ++ file = region->vm_prfile; ++ inode = file->f_path.dentry->d_inode; ++ } + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + } +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/proc/task_mmu.c linux-3.2.0-gentoo-r1/fs/proc/task_mmu.c +--- linux-3.2.0-gentoo-r1.orig//fs/proc/task_mmu.c 2012-01-17 11:56:13.517369840 +0100 ++++ linux-3.2.0-gentoo-r1/fs/proc/task_mmu.c 2012-01-17 12:09:43.955229428 +0100 +@@ -222,6 +222,10 @@ + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; ++ if (vma->vm_prfile) { ++ file = vma->vm_prfile; ++ inode = file->f_path.dentry->d_inode; ++ } + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; +@@ -1033,6 +1037,8 @@ + + if (file) { + seq_printf(m, " file="); ++ if (vma->vm_prfile) ++ file = vma->vm_prfile; + seq_path(m, &file->f_path, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/proc/task_nommu.c linux-3.2.0-gentoo-r1/fs/proc/task_nommu.c +--- linux-3.2.0-gentoo-r1.orig//fs/proc/task_nommu.c 2012-01-17 11:56:13.491906648 +0100 ++++ linux-3.2.0-gentoo-r1/fs/proc/task_nommu.c 2012-01-17 12:09:43.957544264 +0100 +@@ -148,6 +148,10 @@ + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; ++ if (vma->vm_prfile) { ++ file = vma->vm_prfile; ++ inode = file->f_path.dentry->d_inode; ++ } + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; +diff -uNr linux-3.2.0-gentoo-r1.orig//fs/splice.c linux-3.2.0-gentoo-r1/fs/splice.c +--- linux-3.2.0-gentoo-r1.orig//fs/splice.c 2012-01-17 11:56:07.149256893 +0100 ++++ linux-3.2.0-gentoo-r1/fs/splice.c 2012-01-17 12:09:56.154413448 +0100 +@@ -1085,8 +1085,8 @@ + /* + * Attempt to initiate a splice from pipe to file. 
+ */ +-static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, +- loff_t *ppos, size_t len, unsigned int flags) ++long do_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags) + { + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); +@@ -1109,13 +1109,14 @@ + + return splice_write(pipe, out, ppos, len, flags); + } ++EXPORT_SYMBOL(do_splice_from); + + /* + * Attempt to initiate a splice from a file to a pipe. + */ +-static long do_splice_to(struct file *in, loff_t *ppos, +- struct pipe_inode_info *pipe, size_t len, +- unsigned int flags) ++long do_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) + { + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); +@@ -1135,6 +1136,7 @@ + + return splice_read(in, ppos, pipe, len, flags); + } ++EXPORT_SYMBOL(do_splice_to); + + /** + * splice_direct_to_actor - splices data directly between two non-pipes +diff -uNr linux-3.2.0-gentoo-r1.orig//include/linux/aufs_type.h linux-3.2.0-gentoo-r1/include/linux/aufs_type.h +--- linux-3.2.0-gentoo-r1.orig//include/linux/aufs_type.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-gentoo-r1/include/linux/aufs_type.h 2012-01-17 12:11:46.699391210 +0100 +@@ -0,0 +1,233 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef __AUFS_TYPE_H__ ++#define __AUFS_TYPE_H__ ++ ++#define AUFS_NAME "aufs" ++ ++#ifdef __KERNEL__ ++/* ++ * define it before including all other headers. ++ * sched.h may use pr_* macros before defining "current", so define the ++ * no-current version first, and re-define later. ++ */ ++#define pr_fmt(fmt) AUFS_NAME " %s:%d: " fmt, __func__, __LINE__ ++#include ++#undef pr_fmt ++#define pr_fmt(fmt) AUFS_NAME " %s:%d:%s[%d]: " fmt, \ ++ __func__, __LINE__, current->comm, current->pid ++#else ++#include ++#include ++#endif /* __KERNEL__ */ ++ ++#include ++ ++#define AUFS_VERSION "3.2-20120109" ++ ++/* todo? 
move this to linux-2.6.19/include/magic.h */ ++#define AUFS_SUPER_MAGIC ('a' << 24 | 'u' << 16 | 'f' << 8 | 's') ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_BRANCH_MAX_127 ++typedef int8_t aufs_bindex_t; ++#define AUFS_BRANCH_MAX 127 ++#else ++typedef int16_t aufs_bindex_t; ++#ifdef CONFIG_AUFS_BRANCH_MAX_511 ++#define AUFS_BRANCH_MAX 511 ++#elif defined(CONFIG_AUFS_BRANCH_MAX_1023) ++#define AUFS_BRANCH_MAX 1023 ++#elif defined(CONFIG_AUFS_BRANCH_MAX_32767) ++#define AUFS_BRANCH_MAX 32767 ++#endif ++#endif ++ ++#ifdef __KERNEL__ ++#ifndef AUFS_BRANCH_MAX ++#error unknown CONFIG_AUFS_BRANCH_MAX value ++#endif ++#endif /* __KERNEL__ */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AUFS_FSTYPE AUFS_NAME ++ ++#define AUFS_ROOT_INO 2 ++#define AUFS_FIRST_INO 11 ++ ++#define AUFS_WH_PFX ".wh." ++#define AUFS_WH_PFX_LEN ((int)sizeof(AUFS_WH_PFX) - 1) ++#define AUFS_WH_TMP_LEN 4 ++/* a limit for rmdir/rename a dir */ ++#define AUFS_MAX_NAMELEN (NAME_MAX \ ++ - AUFS_WH_PFX_LEN * 2 /* doubly whiteouted */\ ++ - 1 /* dot */\ ++ - AUFS_WH_TMP_LEN) /* hex */ ++#define AUFS_XINO_FNAME "." AUFS_NAME ".xino" ++#define AUFS_XINO_DEFPATH "/tmp/" AUFS_XINO_FNAME ++#define AUFS_XINO_TRUNC_INIT 64 /* blocks */ ++#define AUFS_XINO_TRUNC_STEP 4 /* blocks */ ++#define AUFS_DIRWH_DEF 3 ++#define AUFS_RDCACHE_DEF 10 /* seconds */ ++#define AUFS_RDCACHE_MAX 3600 /* seconds */ ++#define AUFS_RDBLK_DEF 512 /* bytes */ ++#define AUFS_RDHASH_DEF 32 ++#define AUFS_WKQ_NAME AUFS_NAME "d" ++#define AUFS_MFS_DEF_SEC 30 /* seconds */ ++#define AUFS_MFS_MAX_SEC 3600 /* seconds */ ++#define AUFS_PLINK_WARN 100 /* number of plinks */ ++ ++/* pseudo-link maintenace under /proc */ ++#define AUFS_PLINK_MAINT_NAME "plink_maint" ++#define AUFS_PLINK_MAINT_DIR "fs/" AUFS_NAME ++#define AUFS_PLINK_MAINT_PATH AUFS_PLINK_MAINT_DIR "/" AUFS_PLINK_MAINT_NAME ++ ++#define AUFS_DIROPQ_NAME AUFS_WH_PFX ".opq" /* whiteouted doubly */ ++#define AUFS_WH_DIROPQ AUFS_WH_PFX AUFS_DIROPQ_NAME ++ ++#define AUFS_BASE_NAME AUFS_WH_PFX AUFS_NAME ++#define AUFS_PLINKDIR_NAME AUFS_WH_PFX "plnk" ++#define AUFS_ORPHDIR_NAME AUFS_WH_PFX "orph" ++ ++/* doubly whiteouted */ ++#define AUFS_WH_BASE AUFS_WH_PFX AUFS_BASE_NAME ++#define AUFS_WH_PLINKDIR AUFS_WH_PFX AUFS_PLINKDIR_NAME ++#define AUFS_WH_ORPHDIR AUFS_WH_PFX AUFS_ORPHDIR_NAME ++ ++/* branch permissions and attributes */ ++#define AUFS_BRPERM_RW "rw" ++#define AUFS_BRPERM_RO "ro" ++#define AUFS_BRPERM_RR "rr" ++#define AUFS_BRRATTR_WH "wh" ++#define AUFS_BRWATTR_NLWH "nolwh" ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ioctl */ ++enum { ++ /* readdir in userspace */ ++ AuCtl_RDU, ++ AuCtl_RDU_INO, ++ ++ /* pathconf wrapper */ ++ AuCtl_WBR_FD, ++ ++ /* busy inode */ ++ AuCtl_IBUSY ++}; ++ ++/* borrowed from linux/include/linux/kernel.h */ ++#ifndef ALIGN ++#define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) ++#define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) ++#endif ++ ++/* borrowed from linux/include/linux/compiler-gcc3.h */ ++#ifndef __aligned ++#define __aligned(x) __attribute__((aligned(x))) ++#endif ++ ++#ifdef __KERNEL__ ++#ifndef __packed ++#define __packed __attribute__((packed)) ++#endif ++#endif ++ ++struct au_rdu_cookie { ++ uint64_t h_pos; ++ int16_t bindex; ++ uint8_t flags; ++ uint8_t pad; ++ uint32_t generation; ++} __aligned(8); ++ ++struct au_rdu_ent { ++ uint64_t ino; ++ int16_t bindex; ++ uint8_t type; ++ uint8_t nlen; ++ 
uint8_t wh; ++ char name[0]; ++} __aligned(8); ++ ++static inline int au_rdu_len(int nlen) ++{ ++ /* include the terminating NULL */ ++ return ALIGN(sizeof(struct au_rdu_ent) + nlen + 1, ++ sizeof(uint64_t)); ++} ++ ++union au_rdu_ent_ul { ++ struct au_rdu_ent __user *e; ++ uint64_t ul; ++}; ++ ++enum { ++ AufsCtlRduV_SZ, ++ AufsCtlRduV_End ++}; ++ ++struct aufs_rdu { ++ /* input */ ++ union { ++ uint64_t sz; /* AuCtl_RDU */ ++ uint64_t nent; /* AuCtl_RDU_INO */ ++ }; ++ union au_rdu_ent_ul ent; ++ uint16_t verify[AufsCtlRduV_End]; ++ ++ /* input/output */ ++ uint32_t blk; ++ ++ /* output */ ++ union au_rdu_ent_ul tail; ++ /* number of entries which were added in a single call */ ++ uint64_t rent; ++ uint8_t full; ++ uint8_t shwh; ++ ++ struct au_rdu_cookie cookie; ++} __aligned(8); ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct aufs_wbr_fd { ++ uint32_t oflags; ++ int16_t brid; ++} __aligned(8); ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct aufs_ibusy { ++ uint64_t ino, h_ino; ++ int16_t bindex; ++} __aligned(8); ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AuCtlType 'A' ++#define AUFS_CTL_RDU _IOWR(AuCtlType, AuCtl_RDU, struct aufs_rdu) ++#define AUFS_CTL_RDU_INO _IOWR(AuCtlType, AuCtl_RDU_INO, struct aufs_rdu) ++#define AUFS_CTL_WBR_FD _IOW(AuCtlType, AuCtl_WBR_FD, \ ++ struct aufs_wbr_fd) ++#define AUFS_CTL_IBUSY _IOWR(AuCtlType, AuCtl_IBUSY, struct aufs_ibusy) ++ ++#endif /* __AUFS_TYPE_H__ */ +diff -uNr linux-3.2.0-gentoo-r1.orig//include/linux/Kbuild linux-3.2.0-gentoo-r1/include/linux/Kbuild +--- linux-3.2.0-gentoo-r1.orig//include/linux/Kbuild 2012-01-17 11:58:53.002603489 +0100 ++++ linux-3.2.0-gentoo-r1/include/linux/Kbuild 2012-01-17 12:09:24.941169177 +0100 +@@ -65,6 +65,7 @@ + header-y += atmsap.h + header-y += atmsvc.h + header-y += audit.h ++header-y += aufs_type.h + header-y += auto_fs.h + header-y += auto_fs4.h + header-y += auxvec.h +diff -uNr linux-3.2.0-gentoo-r1.orig//include/linux/mm_types.h linux-3.2.0-gentoo-r1/include/linux/mm_types.h +--- linux-3.2.0-gentoo-r1.orig//include/linux/mm_types.h 2012-01-17 11:58:48.143763404 +0100 ++++ linux-3.2.0-gentoo-r1/include/linux/mm_types.h 2012-01-17 12:09:43.966803608 +0100 +@@ -186,6 +186,7 @@ + unsigned long vm_top; /* region allocated to here */ + unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ + struct file *vm_file; /* the backing file or NULL */ ++ struct file *vm_prfile; /* the virtual backing file or NULL */ + + int vm_usage; /* region usage count (access under nommu_region_sem) */ + bool vm_icache_flushed : 1; /* true if the icache has been flushed for +@@ -245,6 +246,7 @@ + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units, *not* PAGE_CACHE_SIZE */ + struct file * vm_file; /* File we map to (can be NULL). 
*/ ++ struct file *vm_prfile; /* shadow of vm_file */ + void * vm_private_data; /* was vm_pte (shared mem) */ + + #ifndef CONFIG_MMU +diff -uNr linux-3.2.0-gentoo-r1.orig//include/linux/namei.h linux-3.2.0-gentoo-r1/include/linux/namei.h +--- linux-3.2.0-gentoo-r1.orig//include/linux/namei.h 2012-01-17 11:58:55.565126585 +0100 ++++ linux-3.2.0-gentoo-r1/include/linux/namei.h 2012-01-17 12:09:34.642645500 +0100 +@@ -85,6 +85,7 @@ + extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, + int (*open)(struct inode *, struct file *)); + ++extern struct dentry *lookup_hash(struct nameidata *nd); + extern struct dentry *lookup_one_len(const char *, struct dentry *, int); + + extern int follow_down_one(struct path *); +diff -uNr linux-3.2.0-gentoo-r1.orig//include/linux/splice.h linux-3.2.0-gentoo-r1/include/linux/splice.h +--- linux-3.2.0-gentoo-r1.orig//include/linux/splice.h 2012-01-17 11:58:47.639129226 +0100 ++++ linux-3.2.0-gentoo-r1/include/linux/splice.h 2012-01-17 12:09:34.658849350 +0100 +@@ -91,4 +91,10 @@ + extern void spd_release_page(struct splice_pipe_desc *, unsigned int); + + extern const struct pipe_buf_operations page_cache_pipe_buf_ops; ++ ++extern long do_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags); ++extern long do_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags); + #endif +diff -uNr linux-3.2.0-gentoo-r1.orig//kernel/fork.c linux-3.2.0-gentoo-r1/kernel/fork.c +--- linux-3.2.0-gentoo-r1.orig//kernel/fork.c 2012-01-17 11:56:03.646910513 +0100 ++++ linux-3.2.0-gentoo-r1/kernel/fork.c 2012-01-17 12:09:43.978377785 +0100 +@@ -376,6 +376,8 @@ + struct address_space *mapping = file->f_mapping; + + get_file(file); ++ if (tmp->vm_prfile) ++ get_file(tmp->vm_prfile); + if (tmp->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + mutex_lock(&mapping->i_mmap_mutex); +diff -uNr linux-3.2.0-gentoo-r1.orig//mm/memory.c linux-3.2.0-gentoo-r1/mm/memory.c +--- linux-3.2.0-gentoo-r1.orig//mm/memory.c 2012-01-17 11:56:05.519612577 +0100 ++++ linux-3.2.0-gentoo-r1/mm/memory.c 2012-01-17 12:09:44.008470650 +0100 +@@ -2622,6 +2622,8 @@ + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); ++ if (vma->vm_prfile) ++ file_update_time(vma->vm_prfile); + + return ret; + } +@@ -3307,6 +3309,8 @@ + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); ++ if (vma->vm_prfile) ++ file_update_time(vma->vm_prfile); + } else { + unlock_page(vmf.page); + if (anon) +diff -uNr linux-3.2.0-gentoo-r1.orig//mm/mmap.c linux-3.2.0-gentoo-r1/mm/mmap.c +--- linux-3.2.0-gentoo-r1.orig//mm/mmap.c 2012-01-17 11:56:05.480260369 +0100 ++++ linux-3.2.0-gentoo-r1/mm/mmap.c 2012-01-17 12:09:44.010785486 +0100 +@@ -232,6 +232,8 @@ + vma->vm_ops->close(vma); + if (vma->vm_file) { + fput(vma->vm_file); ++ if (vma->vm_prfile) ++ fput(vma->vm_prfile); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(vma->vm_mm); + } +@@ -619,6 +621,8 @@ + if (remove_next) { + if (file) { + fput(file); ++ if (vma->vm_prfile) ++ fput(vma->vm_prfile); + if (next->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } +@@ -1965,6 +1969,8 @@ + + if (new->vm_file) { + get_file(new->vm_file); ++ if (new->vm_prfile) ++ get_file(new->vm_prfile); + if (vma->vm_flags & VM_EXECUTABLE) + added_exe_file_vma(mm); + } +@@ -1989,6 +1995,8 @@ + if (vma->vm_flags & VM_EXECUTABLE) + 
removed_exe_file_vma(mm); + fput(new->vm_file); ++ if (new->vm_prfile) ++ fput(new->vm_prfile); + } + unlink_anon_vmas(new); + out_free_mpol: +@@ -2356,6 +2364,8 @@ + new_vma->vm_pgoff = pgoff; + if (new_vma->vm_file) { + get_file(new_vma->vm_file); ++ if (new_vma->vm_prfile) ++ get_file(new_vma->vm_prfile); + if (vma->vm_flags & VM_EXECUTABLE) + added_exe_file_vma(mm); + } +diff -uNr linux-3.2.0-gentoo-r1.orig//mm/nommu.c linux-3.2.0-gentoo-r1/mm/nommu.c +--- linux-3.2.0-gentoo-r1.orig//mm/nommu.c 2012-01-17 11:56:05.369148257 +0100 ++++ linux-3.2.0-gentoo-r1/mm/nommu.c 2012-01-17 12:09:44.020044827 +0100 +@@ -633,6 +633,8 @@ + + if (region->vm_file) + fput(region->vm_file); ++ if (region->vm_prfile) ++ fput(region->vm_prfile); + + /* IO memory and memory shared directly out of the pagecache + * from ramfs/tmpfs mustn't be released here */ +@@ -789,6 +791,8 @@ + vma->vm_ops->close(vma); + if (vma->vm_file) { + fput(vma->vm_file); ++ if (vma->vm_prfile) ++ fput(vma->vm_prfile); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } +@@ -1362,6 +1366,8 @@ + } + } + fput(region->vm_file); ++ if (region->vm_prfile) ++ fput(region->vm_prfile); + kmem_cache_free(vm_region_jar, region); + region = pregion; + result = start; +@@ -1438,9 +1444,13 @@ + error: + if (region->vm_file) + fput(region->vm_file); ++ if (region->vm_prfile) ++ fput(region->vm_prfile); + kmem_cache_free(vm_region_jar, region); + if (vma->vm_file) + fput(vma->vm_file); ++ if (vma->vm_prfile) ++ fput(vma->vm_prfile); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(vma->vm_mm); + kmem_cache_free(vm_area_cachep, vma); +diff -uNr linux-3.2.0-gentoo-r1.orig//security/commoncap.c linux-3.2.0-gentoo-r1/security/commoncap.c +--- linux-3.2.0-gentoo-r1.orig//security/commoncap.c 2012-01-17 11:57:45.874683687 +0100 ++++ linux-3.2.0-gentoo-r1/security/commoncap.c 2012-01-17 12:09:56.182191475 +0100 +@@ -975,3 +975,4 @@ + } + return ret; + } ++EXPORT_SYMBOL(cap_file_mmap); +diff -uNr linux-3.2.0-gentoo-r1.orig//security/device_cgroup.c linux-3.2.0-gentoo-r1/security/device_cgroup.c +--- linux-3.2.0-gentoo-r1.orig//security/device_cgroup.c 2012-01-17 11:57:45.874683687 +0100 ++++ linux-3.2.0-gentoo-r1/security/device_cgroup.c 2012-01-17 12:09:56.221543683 +0100 +@@ -7,6 +7,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -500,6 +501,7 @@ + + return -EPERM; + } ++EXPORT_SYMBOL(__devcgroup_inode_permission); + + int devcgroup_inode_mknod(int mode, dev_t dev) + { +diff -uNr linux-3.2.0-gentoo-r1.orig//security/security.c linux-3.2.0-gentoo-r1/security/security.c +--- linux-3.2.0-gentoo-r1.orig//security/security.c 2012-01-17 11:57:45.592273735 +0100 ++++ linux-3.2.0-gentoo-r1/security/security.c 2012-01-17 12:09:56.235432696 +0100 +@@ -411,6 +411,7 @@ + return 0; + return security_ops->path_rmdir(dir, dentry); + } ++EXPORT_SYMBOL(security_path_rmdir); + + int security_path_unlink(struct path *dir, struct dentry *dentry) + { +@@ -427,6 +428,7 @@ + return 0; + return security_ops->path_symlink(dir, dentry, old_name); + } ++EXPORT_SYMBOL(security_path_symlink); + + int security_path_link(struct dentry *old_dentry, struct path *new_dir, + struct dentry *new_dentry) +@@ -435,6 +437,7 @@ + return 0; + return security_ops->path_link(old_dentry, new_dir, new_dentry); + } ++EXPORT_SYMBOL(security_path_link); + + int security_path_rename(struct path *old_dir, struct dentry *old_dentry, + struct path *new_dir, struct dentry *new_dentry) +@@ -453,6 +456,7 @@ + return 0; + return 
security_ops->path_truncate(path); + } ++EXPORT_SYMBOL(security_path_truncate); + + int security_path_chmod(struct dentry *dentry, struct vfsmount *mnt, + mode_t mode) +@@ -461,6 +465,7 @@ + return 0; + return security_ops->path_chmod(dentry, mnt, mode); + } ++EXPORT_SYMBOL(security_path_chmod); + + int security_path_chown(struct path *path, uid_t uid, gid_t gid) + { +@@ -468,6 +473,7 @@ + return 0; + return security_ops->path_chown(path, uid, gid); + } ++EXPORT_SYMBOL(security_path_chown); + + int security_path_chroot(struct path *path) + { +@@ -544,6 +550,7 @@ + return 0; + return security_ops->inode_readlink(dentry); + } ++EXPORT_SYMBOL(security_inode_readlink); + + int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd) + { +@@ -558,6 +565,7 @@ + return 0; + return security_ops->inode_permission(inode, mask); + } ++EXPORT_SYMBOL(security_inode_permission); + + int security_inode_setattr(struct dentry *dentry, struct iattr *attr) + { +@@ -673,6 +681,7 @@ + + return fsnotify_perm(file, mask); + } ++EXPORT_SYMBOL(security_file_permission); + + int security_file_alloc(struct file *file) + { +@@ -700,6 +709,7 @@ + return ret; + return ima_file_mmap(file, prot); + } ++EXPORT_SYMBOL(security_file_mmap); + + int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, + unsigned long prot) diff --git a/3.2.34/bump/1021_linux-3.2.22.patch b/3.2.34/bump/1021_linux-3.2.22.patch new file mode 100644 index 0000000..e6ad93a --- /dev/null +++ b/3.2.34/bump/1021_linux-3.2.22.patch @@ -0,0 +1,1245 @@ +diff --git a/Documentation/stable_kernel_rules.txt b/Documentation/stable_kernel_rules.txt +index 21fd05c..e1f856b 100644 +--- a/Documentation/stable_kernel_rules.txt ++++ b/Documentation/stable_kernel_rules.txt +@@ -12,6 +12,12 @@ Rules on what kind of patches are accepted, and which ones are not, into the + marked CONFIG_BROKEN), an oops, a hang, data corruption, a real + security issue, or some "oh, that's not good" issue. In short, something + critical. ++ - Serious issues as reported by a user of a distribution kernel may also ++ be considered if they fix a notable performance or interactivity issue. ++ As these fixes are not as obvious and have a higher risk of a subtle ++ regression they should only be submitted by a distribution kernel ++ maintainer and include an addendum linking to a bugzilla entry if it ++ exists and additional information on the user-visible impact. + - New device IDs and quirks are also accepted. + - No "theoretical race condition" issues, unless an explanation of how the + race can be exploited is also provided. 
+diff --git a/Makefile b/Makefile +index 7eb465e..9a7d921 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 2 +-SUBLEVEL = 21 ++SUBLEVEL = 22 + EXTRAVERSION = + NAME = Saber-toothed Squirrel + +diff --git a/arch/arm/plat-samsung/include/plat/map-s3c.h b/arch/arm/plat-samsung/include/plat/map-s3c.h +index 7d04875..c0c70a8 100644 +--- a/arch/arm/plat-samsung/include/plat/map-s3c.h ++++ b/arch/arm/plat-samsung/include/plat/map-s3c.h +@@ -22,7 +22,7 @@ + #define S3C24XX_VA_WATCHDOG S3C_VA_WATCHDOG + + #define S3C2412_VA_SSMC S3C_ADDR_CPU(0x00000000) +-#define S3C2412_VA_EBI S3C_ADDR_CPU(0x00010000) ++#define S3C2412_VA_EBI S3C_ADDR_CPU(0x00100000) + + #define S3C2410_PA_UART (0x50000000) + #define S3C24XX_PA_UART S3C2410_PA_UART +diff --git a/arch/arm/plat-samsung/include/plat/watchdog-reset.h b/arch/arm/plat-samsung/include/plat/watchdog-reset.h +index 40dbb2b..11b19ea 100644 +--- a/arch/arm/plat-samsung/include/plat/watchdog-reset.h ++++ b/arch/arm/plat-samsung/include/plat/watchdog-reset.h +@@ -24,7 +24,7 @@ static inline void arch_wdt_reset(void) + + __raw_writel(0, S3C2410_WTCON); /* disable watchdog, to be safe */ + +- if (s3c2410_wdtclk) ++ if (!IS_ERR(s3c2410_wdtclk)) + clk_enable(s3c2410_wdtclk); + + /* put initial values into count and data */ +diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h +index f3444f7..0c3b775 100644 +--- a/arch/x86/include/asm/cpufeature.h ++++ b/arch/x86/include/asm/cpufeature.h +@@ -175,7 +175,7 @@ + #define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ + #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ + #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ +-#define X86_FEATURE_DTS (7*32+ 7) /* Digital Thermal Sensor */ ++#define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */ + + /* Virtualization flags: Linux defined, word 8 */ + #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ +diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h +index effff47..cb00ccc 100644 +--- a/arch/x86/include/asm/pgtable-3level.h ++++ b/arch/x86/include/asm/pgtable-3level.h +@@ -31,6 +31,60 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) + ptep->pte_low = pte.pte_low; + } + ++#define pmd_read_atomic pmd_read_atomic ++/* ++ * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with ++ * a "*pmdp" dereference done by gcc. Problem is, in certain places ++ * where pte_offset_map_lock is called, concurrent page faults are ++ * allowed, if the mmap_sem is hold for reading. An example is mincore ++ * vs page faults vs MADV_DONTNEED. On the page fault side ++ * pmd_populate rightfully does a set_64bit, but if we're reading the ++ * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen ++ * because gcc will not read the 64bit of the pmd atomically. To fix ++ * this all places running pmd_offset_map_lock() while holding the ++ * mmap_sem in read mode, shall read the pmdp pointer using this ++ * function to know if the pmd is null nor not, and in turn to know if ++ * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd ++ * operations. ++ * ++ * Without THP if the mmap_sem is hold for reading, the pmd can only ++ * transition from null to not null while pmd_read_atomic runs. So ++ * we can always return atomic pmd values with this function. 
++ * ++ * With THP if the mmap_sem is hold for reading, the pmd can become ++ * trans_huge or none or point to a pte (and in turn become "stable") ++ * at any time under pmd_read_atomic. We could read it really ++ * atomically here with a atomic64_read for the THP enabled case (and ++ * it would be a whole lot simpler), but to avoid using cmpxchg8b we ++ * only return an atomic pmdval if the low part of the pmdval is later ++ * found stable (i.e. pointing to a pte). And we're returning a none ++ * pmdval if the low part of the pmd is none. In some cases the high ++ * and low part of the pmdval returned may not be consistent if THP is ++ * enabled (the low part may point to previously mapped hugepage, ++ * while the high part may point to a more recently mapped hugepage), ++ * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part ++ * of the pmd to be read atomically to decide if the pmd is unstable ++ * or not, with the only exception of when the low part of the pmd is ++ * zero in which case we return a none pmd. ++ */ ++static inline pmd_t pmd_read_atomic(pmd_t *pmdp) ++{ ++ pmdval_t ret; ++ u32 *tmp = (u32 *)pmdp; ++ ++ ret = (pmdval_t) (*tmp); ++ if (ret) { ++ /* ++ * If the low part is null, we must not read the high part ++ * or we can end up with a partial pmd. ++ */ ++ smp_rmb(); ++ ret |= ((pmdval_t)*(tmp + 1)) << 32; ++ } ++ ++ return (pmd_t) { ret }; ++} ++ + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) + { + set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); +diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c +index c7f64e6..ea6106c 100644 +--- a/arch/x86/kernel/cpu/scattered.c ++++ b/arch/x86/kernel/cpu/scattered.c +@@ -31,7 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) + const struct cpuid_bit *cb; + + static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { +- { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 }, ++ { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 }, + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, + { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, +diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c +index a43fa1a..1502c502 100644 +--- a/drivers/acpi/acpi_pad.c ++++ b/drivers/acpi/acpi_pad.c +@@ -36,6 +36,7 @@ + #define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator" + #define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80 + static DEFINE_MUTEX(isolated_cpus_lock); ++static DEFINE_MUTEX(round_robin_lock); + + static unsigned long power_saving_mwait_eax; + +@@ -107,7 +108,7 @@ static void round_robin_cpu(unsigned int tsk_index) + if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) + return; + +- mutex_lock(&isolated_cpus_lock); ++ mutex_lock(&round_robin_lock); + cpumask_clear(tmp); + for_each_cpu(cpu, pad_busy_cpus) + cpumask_or(tmp, tmp, topology_thread_cpumask(cpu)); +@@ -116,7 +117,7 @@ static void round_robin_cpu(unsigned int tsk_index) + if (cpumask_empty(tmp)) + cpumask_andnot(tmp, cpu_online_mask, pad_busy_cpus); + if (cpumask_empty(tmp)) { +- mutex_unlock(&isolated_cpus_lock); ++ mutex_unlock(&round_robin_lock); + return; + } + for_each_cpu(cpu, tmp) { +@@ -131,7 +132,7 @@ static void round_robin_cpu(unsigned int tsk_index) + tsk_in_cpu[tsk_index] = preferred_cpu; + cpumask_set_cpu(preferred_cpu, pad_busy_cpus); + cpu_weight[preferred_cpu]++; +- mutex_unlock(&isolated_cpus_lock); ++ mutex_unlock(&round_robin_lock); + + set_cpus_allowed_ptr(current, cpumask_of(preferred_cpu)); + } +diff --git 
a/drivers/base/power/main.c b/drivers/base/power/main.c +index c3d2dfc..b96544a 100644 +--- a/drivers/base/power/main.c ++++ b/drivers/base/power/main.c +@@ -869,7 +869,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) + dpm_wait_for_children(dev, async); + + if (async_error) +- return 0; ++ goto Complete; + + pm_runtime_get_noresume(dev); + if (pm_runtime_barrier(dev) && device_may_wakeup(dev)) +@@ -878,7 +878,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) + if (pm_wakeup_pending()) { + pm_runtime_put_sync(dev); + async_error = -EBUSY; +- return 0; ++ goto Complete; + } + + device_lock(dev); +@@ -926,6 +926,8 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) + } + + device_unlock(dev); ++ ++ Complete: + complete_all(&dev->power.completion); + + if (error) { +diff --git a/drivers/char/hw_random/atmel-rng.c b/drivers/char/hw_random/atmel-rng.c +index 0477982..1b5675b 100644 +--- a/drivers/char/hw_random/atmel-rng.c ++++ b/drivers/char/hw_random/atmel-rng.c +@@ -34,7 +34,7 @@ static int atmel_trng_read(struct hwrng *rng, void *buf, size_t max, + u32 *data = buf; + + /* data ready? */ +- if (readl(trng->base + TRNG_ODATA) & 1) { ++ if (readl(trng->base + TRNG_ISR) & 1) { + *data = readl(trng->base + TRNG_ODATA); + /* + ensure data ready is only set again AFTER the next data +diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c +index 70ad892..b3ccefa 100644 +--- a/drivers/edac/i7core_edac.c ++++ b/drivers/edac/i7core_edac.c +@@ -1932,12 +1932,6 @@ static int i7core_mce_check_error(struct notifier_block *nb, unsigned long val, + if (mce->bank != 8) + return NOTIFY_DONE; + +-#ifdef CONFIG_SMP +- /* Only handle if it is the right mc controller */ +- if (mce->socketid != pvt->i7core_dev->socket) +- return NOTIFY_DONE; +-#endif +- + smp_rmb(); + if ((pvt->mce_out + 1) % MCE_LOG_LEN == pvt->mce_in) { + smp_wmb(); +@@ -2234,8 +2228,6 @@ static void i7core_unregister_mci(struct i7core_dev *i7core_dev) + if (pvt->enable_scrub) + disable_sdram_scrub_setting(mci); + +- atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &i7_mce_dec); +- + /* Disable EDAC polling */ + i7core_pci_ctl_release(pvt); + +@@ -2336,8 +2328,6 @@ static int i7core_register_mci(struct i7core_dev *i7core_dev) + /* DCLK for scrub rate setting */ + pvt->dclk_freq = get_dclk_freq(); + +- atomic_notifier_chain_register(&x86_mce_decoder_chain, &i7_mce_dec); +- + return 0; + + fail0: +@@ -2481,8 +2471,10 @@ static int __init i7core_init(void) + + pci_rc = pci_register_driver(&i7core_driver); + +- if (pci_rc >= 0) ++ if (pci_rc >= 0) { ++ atomic_notifier_chain_register(&x86_mce_decoder_chain, &i7_mce_dec); + return 0; ++ } + + i7core_printk(KERN_ERR, "Failed to register device with error %d.\n", + pci_rc); +@@ -2498,6 +2490,7 @@ static void __exit i7core_exit(void) + { + debugf2("MC: " __FILE__ ": %s()\n", __func__); + pci_unregister_driver(&i7core_driver); ++ atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &i7_mce_dec); + } + + module_init(i7core_init); +diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c +index 7a402bf..18a1293 100644 +--- a/drivers/edac/sb_edac.c ++++ b/drivers/edac/sb_edac.c +@@ -1661,9 +1661,6 @@ static void sbridge_unregister_mci(struct sbridge_dev *sbridge_dev) + debugf0("MC: " __FILE__ ": %s(): mci = %p, dev = %p\n", + __func__, mci, &sbridge_dev->pdev[0]->dev); + +- atomic_notifier_chain_unregister(&x86_mce_decoder_chain, +- &sbridge_mce_dec); +- + /* Remove MC sysfs 
nodes */ + edac_mc_del_mc(mci->dev); + +@@ -1731,8 +1728,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev) + goto fail0; + } + +- atomic_notifier_chain_register(&x86_mce_decoder_chain, +- &sbridge_mce_dec); + return 0; + + fail0: +@@ -1861,8 +1856,10 @@ static int __init sbridge_init(void) + + pci_rc = pci_register_driver(&sbridge_driver); + +- if (pci_rc >= 0) ++ if (pci_rc >= 0) { ++ atomic_notifier_chain_register(&x86_mce_decoder_chain, &sbridge_mce_dec); + return 0; ++ } + + sbridge_printk(KERN_ERR, "Failed to register device with error %d.\n", + pci_rc); +@@ -1878,6 +1875,7 @@ static void __exit sbridge_exit(void) + { + debugf2("MC: " __FILE__ ": %s()\n", __func__); + pci_unregister_driver(&sbridge_driver); ++ atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &sbridge_mce_dec); + } + + module_init(sbridge_init); +diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c +index 3e927ce..a1ee634 100644 +--- a/drivers/gpu/drm/drm_edid.c ++++ b/drivers/gpu/drm/drm_edid.c +@@ -585,7 +585,7 @@ static bool + drm_monitor_supports_rb(struct edid *edid) + { + if (edid->revision >= 4) { +- bool ret; ++ bool ret = false; + drm_for_each_detailed_block((u8 *)edid, is_rb, &ret); + return ret; + } +diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c +index 3e7c478..3e2edc6 100644 +--- a/drivers/gpu/drm/i915/i915_gem.c ++++ b/drivers/gpu/drm/i915/i915_gem.c +@@ -3312,6 +3312,10 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file) + + if (ret == 0 && atomic_read(&dev_priv->mm.wedged)) + ret = -EIO; ++ } else if (wait_for(i915_seqno_passed(ring->get_seqno(ring), ++ seqno) || ++ atomic_read(&dev_priv->mm.wedged), 3000)) { ++ ret = -EBUSY; + } + } + +diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c +index d3820c2..578ddfc 100644 +--- a/drivers/gpu/drm/i915/i915_irq.c ++++ b/drivers/gpu/drm/i915/i915_irq.c +@@ -424,6 +424,30 @@ static void gen6_pm_rps_work(struct work_struct *work) + mutex_unlock(&dev_priv->dev->struct_mutex); + } + ++static void gen6_queue_rps_work(struct drm_i915_private *dev_priv, ++ u32 pm_iir) ++{ ++ unsigned long flags; ++ ++ /* ++ * IIR bits should never already be set because IMR should ++ * prevent an interrupt from being shown in IIR. The warning ++ * displays a case where we've unsafely cleared ++ * dev_priv->pm_iir. Although missing an interrupt of the same ++ * type is not a problem, it displays a problem in the logic. ++ * ++ * The mask bit in IMR is cleared by rps_work. 
++ */ ++ ++ spin_lock_irqsave(&dev_priv->rps_lock, flags); ++ dev_priv->pm_iir |= pm_iir; ++ I915_WRITE(GEN6_PMIMR, dev_priv->pm_iir); ++ POSTING_READ(GEN6_PMIMR); ++ spin_unlock_irqrestore(&dev_priv->rps_lock, flags); ++ ++ queue_work(dev_priv->wq, &dev_priv->rps_work); ++} ++ + static void pch_irq_handler(struct drm_device *dev, u32 pch_iir) + { + drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private; +@@ -529,16 +553,8 @@ static irqreturn_t ivybridge_irq_handler(DRM_IRQ_ARGS) + pch_irq_handler(dev, pch_iir); + } + +- if (pm_iir & GEN6_PM_DEFERRED_EVENTS) { +- unsigned long flags; +- spin_lock_irqsave(&dev_priv->rps_lock, flags); +- WARN(dev_priv->pm_iir & pm_iir, "Missed a PM interrupt\n"); +- dev_priv->pm_iir |= pm_iir; +- I915_WRITE(GEN6_PMIMR, dev_priv->pm_iir); +- POSTING_READ(GEN6_PMIMR); +- spin_unlock_irqrestore(&dev_priv->rps_lock, flags); +- queue_work(dev_priv->wq, &dev_priv->rps_work); +- } ++ if (pm_iir & GEN6_PM_DEFERRED_EVENTS) ++ gen6_queue_rps_work(dev_priv, pm_iir); + + /* should clear PCH hotplug event before clear CPU irq */ + I915_WRITE(SDEIIR, pch_iir); +@@ -634,25 +650,8 @@ static irqreturn_t ironlake_irq_handler(DRM_IRQ_ARGS) + i915_handle_rps_change(dev); + } + +- if (IS_GEN6(dev) && pm_iir & GEN6_PM_DEFERRED_EVENTS) { +- /* +- * IIR bits should never already be set because IMR should +- * prevent an interrupt from being shown in IIR. The warning +- * displays a case where we've unsafely cleared +- * dev_priv->pm_iir. Although missing an interrupt of the same +- * type is not a problem, it displays a problem in the logic. +- * +- * The mask bit in IMR is cleared by rps_work. +- */ +- unsigned long flags; +- spin_lock_irqsave(&dev_priv->rps_lock, flags); +- WARN(dev_priv->pm_iir & pm_iir, "Missed a PM interrupt\n"); +- dev_priv->pm_iir |= pm_iir; +- I915_WRITE(GEN6_PMIMR, dev_priv->pm_iir); +- POSTING_READ(GEN6_PMIMR); +- spin_unlock_irqrestore(&dev_priv->rps_lock, flags); +- queue_work(dev_priv->wq, &dev_priv->rps_work); +- } ++ if (IS_GEN6(dev) && pm_iir & GEN6_PM_DEFERRED_EVENTS) ++ gen6_queue_rps_work(dev_priv, pm_iir); + + /* should clear PCH hotplug event before clear CPU irq */ + I915_WRITE(SDEIIR, pch_iir); +diff --git a/drivers/gpu/drm/i915/i915_suspend.c b/drivers/gpu/drm/i915/i915_suspend.c +index a1eb83d..f38d196 100644 +--- a/drivers/gpu/drm/i915/i915_suspend.c ++++ b/drivers/gpu/drm/i915/i915_suspend.c +@@ -739,8 +739,11 @@ static void i915_restore_display(struct drm_device *dev) + if (HAS_PCH_SPLIT(dev)) { + I915_WRITE(BLC_PWM_PCH_CTL1, dev_priv->saveBLC_PWM_CTL); + I915_WRITE(BLC_PWM_PCH_CTL2, dev_priv->saveBLC_PWM_CTL2); +- I915_WRITE(BLC_PWM_CPU_CTL, dev_priv->saveBLC_CPU_PWM_CTL); ++ /* NOTE: BLC_PWM_CPU_CTL must be written after BLC_PWM_CPU_CTL2; ++ * otherwise we get blank eDP screen after S3 on some machines ++ */ + I915_WRITE(BLC_PWM_CPU_CTL2, dev_priv->saveBLC_CPU_PWM_CTL2); ++ I915_WRITE(BLC_PWM_CPU_CTL, dev_priv->saveBLC_CPU_PWM_CTL); + I915_WRITE(PCH_PP_ON_DELAYS, dev_priv->savePP_ON_DELAYS); + I915_WRITE(PCH_PP_OFF_DELAYS, dev_priv->savePP_OFF_DELAYS); + I915_WRITE(PCH_PP_DIVISOR, dev_priv->savePP_DIVISOR); +diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c +index 5c1cdb8..6aa7716 100644 +--- a/drivers/gpu/drm/i915/intel_display.c ++++ b/drivers/gpu/drm/i915/intel_display.c +@@ -2187,6 +2187,33 @@ intel_pipe_set_base_atomic(struct drm_crtc *crtc, struct drm_framebuffer *fb, + } + + static int ++intel_finish_fb(struct drm_framebuffer *old_fb) ++{ ++ struct drm_i915_gem_object *obj = 
to_intel_framebuffer(old_fb)->obj; ++ struct drm_i915_private *dev_priv = obj->base.dev->dev_private; ++ bool was_interruptible = dev_priv->mm.interruptible; ++ int ret; ++ ++ wait_event(dev_priv->pending_flip_queue, ++ atomic_read(&dev_priv->mm.wedged) || ++ atomic_read(&obj->pending_flip) == 0); ++ ++ /* Big Hammer, we also need to ensure that any pending ++ * MI_WAIT_FOR_EVENT inside a user batch buffer on the ++ * current scanout is retired before unpinning the old ++ * framebuffer. ++ * ++ * This should only fail upon a hung GPU, in which case we ++ * can safely continue. ++ */ ++ dev_priv->mm.interruptible = false; ++ ret = i915_gem_object_finish_gpu(obj); ++ dev_priv->mm.interruptible = was_interruptible; ++ ++ return ret; ++} ++ ++static int + intel_pipe_set_base(struct drm_crtc *crtc, int x, int y, + struct drm_framebuffer *old_fb) + { +@@ -2224,25 +2251,8 @@ intel_pipe_set_base(struct drm_crtc *crtc, int x, int y, + return ret; + } + +- if (old_fb) { +- struct drm_i915_private *dev_priv = dev->dev_private; +- struct drm_i915_gem_object *obj = to_intel_framebuffer(old_fb)->obj; +- +- wait_event(dev_priv->pending_flip_queue, +- atomic_read(&dev_priv->mm.wedged) || +- atomic_read(&obj->pending_flip) == 0); +- +- /* Big Hammer, we also need to ensure that any pending +- * MI_WAIT_FOR_EVENT inside a user batch buffer on the +- * current scanout is retired before unpinning the old +- * framebuffer. +- * +- * This should only fail upon a hung GPU, in which case we +- * can safely continue. +- */ +- ret = i915_gem_object_finish_gpu(obj); +- (void) ret; +- } ++ if (old_fb) ++ intel_finish_fb(old_fb); + + ret = intel_pipe_set_base_atomic(crtc, crtc->fb, x, y, + LEAVE_ATOMIC_MODE_SET); +@@ -3312,6 +3322,23 @@ static void intel_crtc_disable(struct drm_crtc *crtc) + struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; + struct drm_device *dev = crtc->dev; + ++ /* Flush any pending WAITs before we disable the pipe. Note that ++ * we need to drop the struct_mutex in order to acquire it again ++ * during the lowlevel dpms routines around a couple of the ++ * operations. It does not look trivial nor desirable to move ++ * that locking higher. So instead we leave a window for the ++ * submission of further commands on the fb before we can actually ++ * disable it. This race with userspace exists anyway, and we can ++ * only rely on the pipe being disabled by userspace after it ++ * receives the hotplug notification and has flushed any pending ++ * batches. 
++ */ ++ if (crtc->fb) { ++ mutex_lock(&dev->struct_mutex); ++ intel_finish_fb(crtc->fb); ++ mutex_unlock(&dev->struct_mutex); ++ } ++ + crtc_funcs->dpms(crtc, DRM_MODE_DPMS_OFF); + + if (crtc->fb) { +diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c +index 933e66b..f6613dc 100644 +--- a/drivers/gpu/drm/i915/intel_ringbuffer.c ++++ b/drivers/gpu/drm/i915/intel_ringbuffer.c +@@ -306,7 +306,7 @@ static int init_ring_common(struct intel_ring_buffer *ring) + + I915_WRITE_CTL(ring, + ((ring->size - PAGE_SIZE) & RING_NR_PAGES) +- | RING_REPORT_64K | RING_VALID); ++ | RING_VALID); + + /* If the head is still not zero, the ring is dead */ + if ((I915_READ_CTL(ring) & RING_VALID) == 0 || +@@ -1157,18 +1157,6 @@ int intel_wait_ring_buffer(struct intel_ring_buffer *ring, int n) + struct drm_device *dev = ring->dev; + struct drm_i915_private *dev_priv = dev->dev_private; + unsigned long end; +- u32 head; +- +- /* If the reported head position has wrapped or hasn't advanced, +- * fallback to the slow and accurate path. +- */ +- head = intel_read_status_page(ring, 4); +- if (head > ring->head) { +- ring->head = head; +- ring->space = ring_space(ring); +- if (ring->space >= n) +- return 0; +- } + + trace_i915_ring_wait_begin(ring); + end = jiffies + 3 * HZ; +diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c b/drivers/gpu/drm/nouveau/nouveau_fbcon.c +index 3a4cc32..cc0801d 100644 +--- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c ++++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c +@@ -499,7 +499,7 @@ int nouveau_fbcon_init(struct drm_device *dev) + nfbdev->helper.funcs = &nouveau_fbcon_helper_funcs; + + ret = drm_fb_helper_init(dev, &nfbdev->helper, +- nv_two_heads(dev) ? 2 : 1, 4); ++ dev->mode_config.num_crtc, 4); + if (ret) { + kfree(nfbdev); + return ret; +diff --git a/drivers/hwmon/applesmc.c b/drivers/hwmon/applesmc.c +index 4c07436..d99aa84 100644 +--- a/drivers/hwmon/applesmc.c ++++ b/drivers/hwmon/applesmc.c +@@ -215,7 +215,7 @@ static int read_smc(u8 cmd, const char *key, u8 *buffer, u8 len) + int i; + + if (send_command(cmd) || send_argument(key)) { +- pr_warn("%s: read arg fail\n", key); ++ pr_warn("%.4s: read arg fail\n", key); + return -EIO; + } + +@@ -223,7 +223,7 @@ static int read_smc(u8 cmd, const char *key, u8 *buffer, u8 len) + + for (i = 0; i < len; i++) { + if (__wait_status(0x05)) { +- pr_warn("%s: read data fail\n", key); ++ pr_warn("%.4s: read data fail\n", key); + return -EIO; + } + buffer[i] = inb(APPLESMC_DATA_PORT); +diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c +index 427468f..0790c98 100644 +--- a/drivers/hwmon/coretemp.c ++++ b/drivers/hwmon/coretemp.c +@@ -660,7 +660,7 @@ static void __cpuinit get_core_online(unsigned int cpu) + * sensors. We check this bit only, all the early CPUs + * without thermal sensors will be filtered out. 
+ */ +- if (!cpu_has(c, X86_FEATURE_DTS)) ++ if (!cpu_has(c, X86_FEATURE_DTHERM)) + return; + + if (!pdev) { +diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c +index da2f021..532a902 100644 +--- a/drivers/md/dm-thin.c ++++ b/drivers/md/dm-thin.c +@@ -288,8 +288,10 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates) + + hlist_del(&cell->list); + +- bio_list_add(inmates, cell->holder); +- bio_list_merge(inmates, &cell->bios); ++ if (inmates) { ++ bio_list_add(inmates, cell->holder); ++ bio_list_merge(inmates, &cell->bios); ++ } + + mempool_free(cell, prison->cell_pool); + } +@@ -312,9 +314,10 @@ static void cell_release(struct cell *cell, struct bio_list *bios) + */ + static void __cell_release_singleton(struct cell *cell, struct bio *bio) + { +- hlist_del(&cell->list); + BUG_ON(cell->holder != bio); + BUG_ON(!bio_list_empty(&cell->bios)); ++ ++ __cell_release(cell, NULL); + } + + static void cell_release_singleton(struct cell *cell, struct bio *bio) +diff --git a/drivers/media/dvb/siano/smsusb.c b/drivers/media/dvb/siano/smsusb.c +index b7d1e3e..fb68805 100644 +--- a/drivers/media/dvb/siano/smsusb.c ++++ b/drivers/media/dvb/siano/smsusb.c +@@ -544,6 +544,8 @@ static const struct usb_device_id smsusb_id_table[] __devinitconst = { + .driver_info = SMS1XXX_BOARD_HAUPPAUGE_WINDHAM }, + { USB_DEVICE(0x2040, 0xc0a0), + .driver_info = SMS1XXX_BOARD_HAUPPAUGE_WINDHAM }, ++ { USB_DEVICE(0x2040, 0xf5a0), ++ .driver_info = SMS1XXX_BOARD_HAUPPAUGE_WINDHAM }, + { } /* Terminating entry */ + }; + +diff --git a/drivers/media/video/gspca/gspca.c b/drivers/media/video/gspca/gspca.c +index 2ca10df..981501f 100644 +--- a/drivers/media/video/gspca/gspca.c ++++ b/drivers/media/video/gspca/gspca.c +@@ -1697,7 +1697,7 @@ static int vidioc_streamoff(struct file *file, void *priv, + enum v4l2_buf_type buf_type) + { + struct gspca_dev *gspca_dev = priv; +- int ret; ++ int i, ret; + + if (buf_type != V4L2_BUF_TYPE_VIDEO_CAPTURE) + return -EINVAL; +@@ -1728,6 +1728,8 @@ static int vidioc_streamoff(struct file *file, void *priv, + wake_up_interruptible(&gspca_dev->wq); + + /* empty the transfer queues */ ++ for (i = 0; i < gspca_dev->nframes; i++) ++ gspca_dev->frame[i].v4l2_buf.flags &= ~BUF_ALL_FLAGS; + atomic_set(&gspca_dev->fr_q, 0); + atomic_set(&gspca_dev->fr_i, 0); + gspca_dev->fr_o = 0; +diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c +index 8dc84d6..86cd532 100644 +--- a/drivers/net/can/c_can/c_can.c ++++ b/drivers/net/can/c_can/c_can.c +@@ -590,8 +590,8 @@ static void c_can_chip_config(struct net_device *dev) + priv->write_reg(priv, &priv->regs->control, + CONTROL_ENABLE_AR); + +- if (priv->can.ctrlmode & (CAN_CTRLMODE_LISTENONLY & +- CAN_CTRLMODE_LOOPBACK)) { ++ if ((priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) && ++ (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK)) { + /* loopback + silent mode : useful for hot self-test */ + priv->write_reg(priv, &priv->regs->control, CONTROL_EIE | + CONTROL_SIE | CONTROL_IE | CONTROL_TEST); +diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c +index e023379..e59d006 100644 +--- a/drivers/net/can/flexcan.c ++++ b/drivers/net/can/flexcan.c +@@ -933,12 +933,12 @@ static int __devinit flexcan_probe(struct platform_device *pdev) + u32 clock_freq = 0; + + if (pdev->dev.of_node) { +- const u32 *clock_freq_p; ++ const __be32 *clock_freq_p; + + clock_freq_p = of_get_property(pdev->dev.of_node, + "clock-frequency", NULL); + if (clock_freq_p) +- clock_freq = *clock_freq_p; ++ clock_freq = 
be32_to_cpup(clock_freq_p); + } + + if (!clock_freq) { +diff --git a/drivers/net/ethernet/intel/e1000e/82571.c b/drivers/net/ethernet/intel/e1000e/82571.c +index a3e65fd..e556fc3 100644 +--- a/drivers/net/ethernet/intel/e1000e/82571.c ++++ b/drivers/net/ethernet/intel/e1000e/82571.c +@@ -2080,8 +2080,9 @@ const struct e1000_info e1000_82574_info = { + | FLAG_HAS_SMART_POWER_DOWN + | FLAG_HAS_AMT + | FLAG_HAS_CTRLEXT_ON_LOAD, +- .flags2 = FLAG2_CHECK_PHY_HANG ++ .flags2 = FLAG2_CHECK_PHY_HANG + | FLAG2_DISABLE_ASPM_L0S ++ | FLAG2_DISABLE_ASPM_L1 + | FLAG2_NO_DISABLE_RX, + .pba = 32, + .max_hw_frame_size = DEFAULT_JUMBO, +diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c +index 4e933d1..64d3f98 100644 +--- a/drivers/net/ethernet/intel/e1000e/netdev.c ++++ b/drivers/net/ethernet/intel/e1000e/netdev.c +@@ -5132,14 +5132,6 @@ static int e1000_change_mtu(struct net_device *netdev, int new_mtu) + return -EINVAL; + } + +- /* 82573 Errata 17 */ +- if (((adapter->hw.mac.type == e1000_82573) || +- (adapter->hw.mac.type == e1000_82574)) && +- (max_frame > ETH_FRAME_LEN + ETH_FCS_LEN)) { +- adapter->flags2 |= FLAG2_DISABLE_ASPM_L1; +- e1000e_disable_aspm(adapter->pdev, PCIE_LINK_STATE_L1); +- } +- + while (test_and_set_bit(__E1000_RESETTING, &adapter->state)) + usleep_range(1000, 2000); + /* e1000e_down -> e1000e_reset dependent on max_frame_size & mtu */ +diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c +index 8b0c2ca..6973620 100644 +--- a/drivers/net/wireless/ath/ath9k/hw.c ++++ b/drivers/net/wireless/ath/ath9k/hw.c +@@ -718,13 +718,25 @@ static void ath9k_hw_init_qos(struct ath_hw *ah) + + u32 ar9003_get_pll_sqsum_dvc(struct ath_hw *ah) + { ++ struct ath_common *common = ath9k_hw_common(ah); ++ int i = 0; ++ + REG_CLR_BIT(ah, PLL3, PLL3_DO_MEAS_MASK); + udelay(100); + REG_SET_BIT(ah, PLL3, PLL3_DO_MEAS_MASK); + +- while ((REG_READ(ah, PLL4) & PLL4_MEAS_DONE) == 0) ++ while ((REG_READ(ah, PLL4) & PLL4_MEAS_DONE) == 0) { ++ + udelay(100); + ++ if (WARN_ON_ONCE(i >= 100)) { ++ ath_err(common, "PLL4 meaurement not done\n"); ++ break; ++ } ++ ++ i++; ++ } ++ + return (REG_READ(ah, PLL3) & SQSUM_DVC_MASK) >> 3; + } + EXPORT_SYMBOL(ar9003_get_pll_sqsum_dvc); +diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c +index f76a814..95437fc 100644 +--- a/drivers/net/wireless/ath/ath9k/main.c ++++ b/drivers/net/wireless/ath/ath9k/main.c +@@ -1042,6 +1042,15 @@ void ath_hw_pll_work(struct work_struct *work) + hw_pll_work.work); + u32 pll_sqsum; + ++ /* ++ * ensure that the PLL WAR is executed only ++ * after the STA is associated (or) if the ++ * beaconing had started in interfaces that ++ * uses beacons. 
++ */ ++ if (!(sc->sc_flags & SC_OP_BEACONS)) ++ return; ++ + if (AR_SREV_9485(sc->sc_ah)) { + + ath9k_ps_wakeup(sc); +@@ -1486,15 +1495,6 @@ static int ath9k_add_interface(struct ieee80211_hw *hw, + } + } + +- if ((ah->opmode == NL80211_IFTYPE_ADHOC) || +- ((vif->type == NL80211_IFTYPE_ADHOC) && +- sc->nvifs > 0)) { +- ath_err(common, "Cannot create ADHOC interface when other" +- " interfaces already exist.\n"); +- ret = -EINVAL; +- goto out; +- } +- + ath_dbg(common, ATH_DBG_CONFIG, + "Attach a VIF of type: %d\n", vif->type); + +diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c +index 76fd277..c59c592 100644 +--- a/drivers/net/wireless/ath/ath9k/xmit.c ++++ b/drivers/net/wireless/ath/ath9k/xmit.c +@@ -936,13 +936,13 @@ static void ath_buf_set_rate(struct ath_softc *sc, struct ath_buf *bf, + } + + /* legacy rates */ ++ rate = &sc->sbands[tx_info->band].bitrates[rates[i].idx]; + if ((tx_info->band == IEEE80211_BAND_2GHZ) && + !(rate->flags & IEEE80211_RATE_ERP_G)) + phy = WLAN_RC_PHY_CCK; + else + phy = WLAN_RC_PHY_OFDM; + +- rate = &sc->sbands[tx_info->band].bitrates[rates[i].idx]; + info->rates[i].Rate = rate->hw_value; + if (rate->hw_value_short) { + if (rates[i].flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE) +diff --git a/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c b/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c +index 5815cf5..4661a64 100644 +--- a/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c ++++ b/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c +@@ -1777,6 +1777,7 @@ static ssize_t iwl_dbgfs_rx_queue_read(struct file *file, + return simple_read_from_buffer(user_buf, count, ppos, buf, pos); + } + ++#ifdef CONFIG_IWLWIFI_DEBUG + static ssize_t iwl_dbgfs_log_event_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +@@ -1814,6 +1815,7 @@ static ssize_t iwl_dbgfs_log_event_write(struct file *file, + + return count; + } ++#endif + + static ssize_t iwl_dbgfs_interrupt_read(struct file *file, + char __user *user_buf, +@@ -1941,7 +1943,9 @@ static ssize_t iwl_dbgfs_fh_reg_read(struct file *file, + return ret; + } + ++#ifdef CONFIG_IWLWIFI_DEBUG + DEBUGFS_READ_WRITE_FILE_OPS(log_event); ++#endif + DEBUGFS_READ_WRITE_FILE_OPS(interrupt); + DEBUGFS_READ_FILE_OPS(fh_reg); + DEBUGFS_READ_FILE_OPS(rx_queue); +@@ -1957,7 +1961,9 @@ static int iwl_trans_pcie_dbgfs_register(struct iwl_trans *trans, + { + DEBUGFS_ADD_FILE(rx_queue, dir, S_IRUSR); + DEBUGFS_ADD_FILE(tx_queue, dir, S_IRUSR); ++#ifdef CONFIG_IWLWIFI_DEBUG + DEBUGFS_ADD_FILE(log_event, dir, S_IWUSR | S_IRUSR); ++#endif + DEBUGFS_ADD_FILE(interrupt, dir, S_IWUSR | S_IRUSR); + DEBUGFS_ADD_FILE(csr, dir, S_IWUSR); + DEBUGFS_ADD_FILE(fh_reg, dir, S_IRUSR); +diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c +index 226faab..fc35308 100644 +--- a/drivers/net/xen-netfront.c ++++ b/drivers/net/xen-netfront.c +@@ -1922,14 +1922,14 @@ static int __devexit xennet_remove(struct xenbus_device *dev) + + dev_dbg(&dev->dev, "%s\n", dev->nodename); + +- unregister_netdev(info->netdev); +- + xennet_disconnect_backend(info); + +- del_timer_sync(&info->rx_refill_timer); +- + xennet_sysfs_delif(info->netdev); + ++ unregister_netdev(info->netdev); ++ ++ del_timer_sync(&info->rx_refill_timer); ++ + free_percpu(info->stats); + + free_netdev(info->netdev); +diff --git a/drivers/oprofile/oprofile_perf.c b/drivers/oprofile/oprofile_perf.c +index da14432..efc4b7f 100644 +--- a/drivers/oprofile/oprofile_perf.c ++++ b/drivers/oprofile/oprofile_perf.c +@@ -25,7 +25,7 @@ static int 
oprofile_perf_enabled; + static DEFINE_MUTEX(oprofile_perf_mutex); + + static struct op_counter_config *counter_config; +-static struct perf_event **perf_events[nr_cpumask_bits]; ++static struct perf_event **perf_events[NR_CPUS]; + static int num_counters; + + /* +diff --git a/drivers/staging/iio/adc/ad7606_core.c b/drivers/staging/iio/adc/ad7606_core.c +index 54423ab..2ee187f 100644 +--- a/drivers/staging/iio/adc/ad7606_core.c ++++ b/drivers/staging/iio/adc/ad7606_core.c +@@ -241,6 +241,7 @@ static const struct attribute_group ad7606_attribute_group = { + .indexed = 1, \ + .channel = num, \ + .address = num, \ ++ .info_mask = (1 << IIO_CHAN_INFO_SCALE_SHARED), \ + .scan_index = num, \ + .scan_type = IIO_ST('s', 16, 16, 0), \ + } +diff --git a/drivers/staging/rtl8712/usb_intf.c b/drivers/staging/rtl8712/usb_intf.c +index ec41d38..f4b738f 100644 +--- a/drivers/staging/rtl8712/usb_intf.c ++++ b/drivers/staging/rtl8712/usb_intf.c +@@ -102,6 +102,8 @@ static struct usb_device_id rtl871x_usb_id_tbl[] = { + /* - */ + {USB_DEVICE(0x20F4, 0x646B)}, + {USB_DEVICE(0x083A, 0xC512)}, ++ {USB_DEVICE(0x25D4, 0x4CA1)}, ++ {USB_DEVICE(0x25D4, 0x4CAB)}, + + /* RTL8191SU */ + /* Realtek */ +diff --git a/drivers/staging/rts_pstor/rtsx_transport.c b/drivers/staging/rts_pstor/rtsx_transport.c +index 4e3d2c1..9b2e5c9 100644 +--- a/drivers/staging/rts_pstor/rtsx_transport.c ++++ b/drivers/staging/rts_pstor/rtsx_transport.c +@@ -335,6 +335,7 @@ static int rtsx_transfer_sglist_adma_partial(struct rtsx_chip *chip, u8 card, + int sg_cnt, i, resid; + int err = 0; + long timeleft; ++ struct scatterlist *sg_ptr; + u32 val = TRIG_DMA; + + if ((sg == NULL) || (num_sg <= 0) || !offset || !index) +@@ -371,7 +372,7 @@ static int rtsx_transfer_sglist_adma_partial(struct rtsx_chip *chip, u8 card, + sg_cnt = dma_map_sg(&(rtsx->pci->dev), sg, num_sg, dma_dir); + + resid = size; +- ++ sg_ptr = sg; + chip->sgi = 0; + /* Usually the next entry will be @sg@ + 1, but if this sg element + * is part of a chained scatterlist, it could jump to the start of +@@ -379,14 +380,14 @@ static int rtsx_transfer_sglist_adma_partial(struct rtsx_chip *chip, u8 card, + * the proper sg + */ + for (i = 0; i < *index; i++) +- sg = sg_next(sg); ++ sg_ptr = sg_next(sg_ptr); + for (i = *index; i < sg_cnt; i++) { + dma_addr_t addr; + unsigned int len; + u8 option; + +- addr = sg_dma_address(sg); +- len = sg_dma_len(sg); ++ addr = sg_dma_address(sg_ptr); ++ len = sg_dma_len(sg_ptr); + + RTSX_DEBUGP("DMA addr: 0x%x, Len: 0x%x\n", + (unsigned int)addr, len); +@@ -415,7 +416,7 @@ static int rtsx_transfer_sglist_adma_partial(struct rtsx_chip *chip, u8 card, + if (!resid) + break; + +- sg = sg_next(sg); ++ sg_ptr = sg_next(sg_ptr); + } + + RTSX_DEBUGP("SG table count = %d\n", chip->sgi); +diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c +index aa0c43f..35e6b5f 100644 +--- a/drivers/usb/serial/cp210x.c ++++ b/drivers/usb/serial/cp210x.c +@@ -93,6 +93,7 @@ static const struct usb_device_id id_table[] = { + { USB_DEVICE(0x10C4, 0x814B) }, /* West Mountain Radio RIGtalk */ + { USB_DEVICE(0x10C4, 0x8156) }, /* B&G H3000 link cable */ + { USB_DEVICE(0x10C4, 0x815E) }, /* Helicomm IP-Link 1220-DVM */ ++ { USB_DEVICE(0x10C4, 0x815F) }, /* Timewave HamLinkUSB */ + { USB_DEVICE(0x10C4, 0x818B) }, /* AVIT Research USB to TTL */ + { USB_DEVICE(0x10C4, 0x819F) }, /* MJS USB Toslink Switcher */ + { USB_DEVICE(0x10C4, 0x81A6) }, /* ThinkOptics WavIt */ +@@ -134,7 +135,13 @@ static const struct usb_device_id id_table[] = { + { USB_DEVICE(0x10CE, 0xEA6A) 
}, /* Silicon Labs MobiData GPRS USB Modem 100EU */ + { USB_DEVICE(0x13AD, 0x9999) }, /* Baltech card reader */ + { USB_DEVICE(0x1555, 0x0004) }, /* Owen AC4 USB-RS485 Converter */ ++ { USB_DEVICE(0x166A, 0x0201) }, /* Clipsal 5500PACA C-Bus Pascal Automation Controller */ ++ { USB_DEVICE(0x166A, 0x0301) }, /* Clipsal 5800PC C-Bus Wireless PC Interface */ + { USB_DEVICE(0x166A, 0x0303) }, /* Clipsal 5500PCU C-Bus USB interface */ ++ { USB_DEVICE(0x166A, 0x0304) }, /* Clipsal 5000CT2 C-Bus Black and White Touchscreen */ ++ { USB_DEVICE(0x166A, 0x0305) }, /* Clipsal C-5000CT2 C-Bus Spectrum Colour Touchscreen */ ++ { USB_DEVICE(0x166A, 0x0401) }, /* Clipsal L51xx C-Bus Architectural Dimmer */ ++ { USB_DEVICE(0x166A, 0x0101) }, /* Clipsal 5560884 C-Bus Multi-room Audio Matrix Switcher */ + { USB_DEVICE(0x16D6, 0x0001) }, /* Jablotron serial interface */ + { USB_DEVICE(0x16DC, 0x0010) }, /* W-IE-NE-R Plein & Baus GmbH PL512 Power Supply */ + { USB_DEVICE(0x16DC, 0x0011) }, /* W-IE-NE-R Plein & Baus GmbH RCM Remote Control for MARATON Power Supply */ +@@ -146,7 +153,11 @@ static const struct usb_device_id id_table[] = { + { USB_DEVICE(0x1843, 0x0200) }, /* Vaisala USB Instrument Cable */ + { USB_DEVICE(0x18EF, 0xE00F) }, /* ELV USB-I2C-Interface */ + { USB_DEVICE(0x1BE3, 0x07A6) }, /* WAGO 750-923 USB Service Cable */ ++ { USB_DEVICE(0x1E29, 0x0102) }, /* Festo CPX-USB */ ++ { USB_DEVICE(0x1E29, 0x0501) }, /* Festo CMSP */ + { USB_DEVICE(0x3195, 0xF190) }, /* Link Instruments MSO-19 */ ++ { USB_DEVICE(0x3195, 0xF280) }, /* Link Instruments MSO-28 */ ++ { USB_DEVICE(0x3195, 0xF281) }, /* Link Instruments MSO-28 */ + { USB_DEVICE(0x413C, 0x9500) }, /* DW700 GPS USB interface */ + { } /* Terminating Entry */ + }; +diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c +index 61d6c31..21a4734 100644 +--- a/drivers/usb/serial/option.c ++++ b/drivers/usb/serial/option.c +@@ -235,6 +235,7 @@ static void option_instat_callback(struct urb *urb); + #define NOVATELWIRELESS_PRODUCT_G1 0xA001 + #define NOVATELWIRELESS_PRODUCT_G1_M 0xA002 + #define NOVATELWIRELESS_PRODUCT_G2 0xA010 ++#define NOVATELWIRELESS_PRODUCT_MC551 0xB001 + + /* AMOI PRODUCTS */ + #define AMOI_VENDOR_ID 0x1614 +@@ -496,6 +497,10 @@ static void option_instat_callback(struct urb *urb); + /* MediaTek products */ + #define MEDIATEK_VENDOR_ID 0x0e8d + ++/* Cellient products */ ++#define CELLIENT_VENDOR_ID 0x2692 ++#define CELLIENT_PRODUCT_MEN200 0x9005 ++ + /* some devices interfaces need special handling due to a number of reasons */ + enum option_blacklist_reason { + OPTION_BLACKLIST_NONE = 0, +@@ -730,6 +735,8 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_G1) }, + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_G1_M) }, + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_G2) }, ++ /* Novatel Ovation MC551 a.k.a. 
Verizon USB551L */ ++ { USB_DEVICE_AND_INTERFACE_INFO(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_MC551, 0xff, 0xff, 0xff) }, + + { USB_DEVICE(AMOI_VENDOR_ID, AMOI_PRODUCT_H01) }, + { USB_DEVICE(AMOI_VENDOR_ID, AMOI_PRODUCT_H01A) }, +@@ -1227,6 +1234,7 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a1, 0xff, 0x02, 0x01) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a2, 0xff, 0x00, 0x00) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a2, 0xff, 0x02, 0x01) }, /* MediaTek MT6276M modem & app port */ ++ { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MEN200) }, + { } /* Terminating entry */ + }; + MODULE_DEVICE_TABLE(usb, option_ids); +diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c +index 08a07a2..57ceaf3 100644 +--- a/fs/nilfs2/gcinode.c ++++ b/fs/nilfs2/gcinode.c +@@ -191,6 +191,8 @@ void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs) + while (!list_empty(head)) { + ii = list_first_entry(head, struct nilfs_inode_info, i_dirty); + list_del_init(&ii->i_dirty); ++ truncate_inode_pages(&ii->vfs_inode.i_data, 0); ++ nilfs_btnode_cache_clear(&ii->i_btnode_cache); + iput(&ii->vfs_inode); + } + } +diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c +index bb24ab6..6f24e67 100644 +--- a/fs/nilfs2/segment.c ++++ b/fs/nilfs2/segment.c +@@ -2309,6 +2309,8 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) + if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) + continue; + list_del_init(&ii->i_dirty); ++ truncate_inode_pages(&ii->vfs_inode.i_data, 0); ++ nilfs_btnode_cache_clear(&ii->i_btnode_cache); + iput(&ii->vfs_inode); + } + } +diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h +index a03c098..bc00876 100644 +--- a/include/asm-generic/pgtable.h ++++ b/include/asm-generic/pgtable.h +@@ -445,6 +445,18 @@ static inline int pmd_write(pmd_t pmd) + #endif /* __HAVE_ARCH_PMD_WRITE */ + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + ++#ifndef pmd_read_atomic ++static inline pmd_t pmd_read_atomic(pmd_t *pmdp) ++{ ++ /* ++ * Depend on compiler for an atomic pmd read. NOTE: this is ++ * only going to work, if the pmdval_t isn't larger than ++ * an unsigned long. ++ */ ++ return *pmdp; ++} ++#endif ++ + /* + * This function is meant to be used by sites walking pagetables with + * the mmap_sem hold in read mode to protect against MADV_DONTNEED and +@@ -458,14 +470,30 @@ static inline int pmd_write(pmd_t pmd) + * undefined so behaving like if the pmd was none is safe (because it + * can return none anyway). The compiler level barrier() is critically + * important to compute the two checks atomically on the same pmdval. ++ * ++ * For 32bit kernels with a 64bit large pmd_t this automatically takes ++ * care of reading the pmd atomically to avoid SMP race conditions ++ * against pmd_populate() when the mmap_sem is hold for reading by the ++ * caller (a special atomic read not done by "gcc" as in the generic ++ * version above, is also needed when THP is disabled because the page ++ * fault can populate the pmd from under us). + */ + static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) + { +- /* depend on compiler for an atomic pmd read */ +- pmd_t pmdval = *pmd; ++ pmd_t pmdval = pmd_read_atomic(pmd); + /* + * The barrier will stabilize the pmdval in a register or on + * the stack so that it will stop changing under the code. 
++ * ++ * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE, ++ * pmd_read_atomic is allowed to return a not atomic pmdval ++ * (for example pointing to an hugepage that has never been ++ * mapped in the pmd). The below checks will only care about ++ * the low part of the pmd with 32bit PAE x86 anyway, with the ++ * exception of pmd_none(). So the important thing is that if ++ * the low part of the pmd is found null, the high part will ++ * be also null or the pmd_none() check below would be ++ * confused. + */ + #ifdef CONFIG_TRANSPARENT_HUGEPAGE + barrier(); +diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c +index f961cc5..da587ad 100644 +--- a/net/batman-adv/routing.c ++++ b/net/batman-adv/routing.c +@@ -619,6 +619,8 @@ int recv_tt_query(struct sk_buff *skb, struct hard_iface *recv_if) + /* packet needs to be linearized to access the TT changes */ + if (skb_linearize(skb) < 0) + goto out; ++ /* skb_linearize() possibly changed skb->data */ ++ tt_query = (struct tt_query_packet *)skb->data; + + if (is_my_mac(tt_query->dst)) + handle_tt_response(bat_priv, tt_query); +diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c +index 5f09a57..088af45 100644 +--- a/net/batman-adv/translation-table.c ++++ b/net/batman-adv/translation-table.c +@@ -1816,10 +1816,10 @@ bool is_ap_isolated(struct bat_priv *bat_priv, uint8_t *src, uint8_t *dst) + { + struct tt_local_entry *tt_local_entry = NULL; + struct tt_global_entry *tt_global_entry = NULL; +- bool ret = true; ++ bool ret = false; + + if (!atomic_read(&bat_priv->ap_isolation)) +- return false; ++ goto out; + + tt_local_entry = tt_local_hash_find(bat_priv, dst); + if (!tt_local_entry) +@@ -1829,10 +1829,10 @@ bool is_ap_isolated(struct bat_priv *bat_priv, uint8_t *src, uint8_t *dst) + if (!tt_global_entry) + goto out; + +- if (_is_ap_isolated(tt_local_entry, tt_global_entry)) ++ if (!_is_ap_isolated(tt_local_entry, tt_global_entry)) + goto out; + +- ret = false; ++ ret = true; + + out: + if (tt_global_entry) +diff --git a/net/wireless/reg.c b/net/wireless/reg.c +index c1c99dd..d57d05b 100644 +--- a/net/wireless/reg.c ++++ b/net/wireless/reg.c +@@ -1369,7 +1369,7 @@ static void reg_set_request_processed(void) + spin_unlock(®_requests_lock); + + if (last_request->initiator == NL80211_REGDOM_SET_BY_USER) +- cancel_delayed_work_sync(®_timeout); ++ cancel_delayed_work(®_timeout); + + if (need_more_processing) + schedule_work(®_work); +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index 0005bde..5f096a5 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -5988,6 +5988,7 @@ static const struct hda_codec_preset snd_hda_preset_realtek[] = { + { .id = 0x10ec0272, .name = "ALC272", .patch = patch_alc662 }, + { .id = 0x10ec0275, .name = "ALC275", .patch = patch_alc269 }, + { .id = 0x10ec0276, .name = "ALC276", .patch = patch_alc269 }, ++ { .id = 0x10ec0280, .name = "ALC280", .patch = patch_alc269 }, + { .id = 0x10ec0861, .rev = 0x100340, .name = "ALC660", + .patch = patch_alc861 }, + { .id = 0x10ec0660, .name = "ALC660-VD", .patch = patch_alc861vd }, +diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c +index 11224ed..323d4d9 100644 +--- a/tools/hv/hv_kvp_daemon.c ++++ b/tools/hv/hv_kvp_daemon.c +@@ -384,14 +384,18 @@ int main(void) + pfd.fd = fd; + + while (1) { ++ struct sockaddr *addr_p = (struct sockaddr *) &addr; ++ socklen_t addr_l = sizeof(addr); + pfd.events = POLLIN; + pfd.revents = 0; + poll(&pfd, 1, -1); + +- len = recv(fd, 
kvp_recv_buffer, sizeof(kvp_recv_buffer), 0); ++ len = recvfrom(fd, kvp_recv_buffer, sizeof(kvp_recv_buffer), 0, ++ addr_p, &addr_l); + +- if (len < 0) { +- syslog(LOG_ERR, "recv failed; error:%d", len); ++ if (len < 0 || addr.nl_pid) { ++ syslog(LOG_ERR, "recvfrom failed; pid:%u error:%d %s", ++ addr.nl_pid, errno, strerror(errno)); + close(fd); + return -1; + } diff --git a/3.2.34/bump/1022_linux-3.2.23.patch b/3.2.34/bump/1022_linux-3.2.23.patch new file mode 100644 index 0000000..3d796d0 --- /dev/null +++ b/3.2.34/bump/1022_linux-3.2.23.patch @@ -0,0 +1,1862 @@ +diff --git a/Makefile b/Makefile +index 9a7d921..40d1e3b 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 2 +-SUBLEVEL = 22 ++SUBLEVEL = 23 + EXTRAVERSION = + NAME = Saber-toothed Squirrel + +diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c +index e10e59a..1d1710e 100644 +--- a/arch/arm/kernel/smp.c ++++ b/arch/arm/kernel/smp.c +@@ -471,9 +471,7 @@ static DEFINE_PER_CPU(struct clock_event_device, percpu_clockevent); + static void ipi_timer(void) + { + struct clock_event_device *evt = &__get_cpu_var(percpu_clockevent); +- irq_enter(); + evt->event_handler(evt); +- irq_exit(); + } + + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +@@ -572,7 +570,9 @@ void handle_IPI(int ipinr, struct pt_regs *regs) + + switch (ipinr) { + case IPI_TIMER: ++ irq_enter(); + ipi_timer(); ++ irq_exit(); + break; + + case IPI_RESCHEDULE: +@@ -580,15 +580,21 @@ void handle_IPI(int ipinr, struct pt_regs *regs) + break; + + case IPI_CALL_FUNC: ++ irq_enter(); + generic_smp_call_function_interrupt(); ++ irq_exit(); + break; + + case IPI_CALL_FUNC_SINGLE: ++ irq_enter(); + generic_smp_call_function_single_interrupt(); ++ irq_exit(); + break; + + case IPI_CPU_STOP: ++ irq_enter(); + ipi_cpu_stop(cpu); ++ irq_exit(); + break; + + default: +diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S +index 44d8829..5e8dc08 100644 +--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S ++++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S +@@ -763,7 +763,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + lwz r3,VCORE_NAPPING_THREADS(r5) + lwz r4,VCPU_PTID(r9) + li r0,1 +- sldi r0,r0,r4 ++ sld r0,r0,r4 + andc. 
r3,r3,r0 /* no sense IPI'ing ourselves */ + beq 43f + mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ +diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c +index 03a217a..b7e63d8 100644 +--- a/arch/powerpc/xmon/xmon.c ++++ b/arch/powerpc/xmon/xmon.c +@@ -975,7 +975,7 @@ static int cpu_cmd(void) + /* print cpus waiting or in xmon */ + printf("cpus stopped:"); + count = 0; +- for (cpu = 0; cpu < NR_CPUS; ++cpu) { ++ for_each_possible_cpu(cpu) { + if (cpumask_test_cpu(cpu, &cpus_in_xmon)) { + if (count == 0) + printf(" %x", cpu); +diff --git a/drivers/block/umem.c b/drivers/block/umem.c +index aa27120..9a72277 100644 +--- a/drivers/block/umem.c ++++ b/drivers/block/umem.c +@@ -513,6 +513,44 @@ static void process_page(unsigned long data) + } + } + ++struct mm_plug_cb { ++ struct blk_plug_cb cb; ++ struct cardinfo *card; ++}; ++ ++static void mm_unplug(struct blk_plug_cb *cb) ++{ ++ struct mm_plug_cb *mmcb = container_of(cb, struct mm_plug_cb, cb); ++ ++ spin_lock_irq(&mmcb->card->lock); ++ activate(mmcb->card); ++ spin_unlock_irq(&mmcb->card->lock); ++ kfree(mmcb); ++} ++ ++static int mm_check_plugged(struct cardinfo *card) ++{ ++ struct blk_plug *plug = current->plug; ++ struct mm_plug_cb *mmcb; ++ ++ if (!plug) ++ return 0; ++ ++ list_for_each_entry(mmcb, &plug->cb_list, cb.list) { ++ if (mmcb->cb.callback == mm_unplug && mmcb->card == card) ++ return 1; ++ } ++ /* Not currently on the callback list */ ++ mmcb = kmalloc(sizeof(*mmcb), GFP_ATOMIC); ++ if (!mmcb) ++ return 0; ++ ++ mmcb->card = card; ++ mmcb->cb.callback = mm_unplug; ++ list_add(&mmcb->cb.list, &plug->cb_list); ++ return 1; ++} ++ + static void mm_make_request(struct request_queue *q, struct bio *bio) + { + struct cardinfo *card = q->queuedata; +@@ -523,6 +561,8 @@ static void mm_make_request(struct request_queue *q, struct bio *bio) + *card->biotail = bio; + bio->bi_next = NULL; + card->biotail = &bio->bi_next; ++ if (bio->bi_rw & REQ_SYNC || !mm_check_plugged(card)) ++ activate(card); + spin_unlock_irq(&card->lock); + + return; +diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c +index c4da951..ca67338 100644 +--- a/drivers/gpu/drm/i915/i915_dma.c ++++ b/drivers/gpu/drm/i915/i915_dma.c +@@ -1890,6 +1890,27 @@ ips_ping_for_i915_load(void) + } + } + ++static void i915_kick_out_firmware_fb(struct drm_i915_private *dev_priv) ++{ ++ struct apertures_struct *ap; ++ struct pci_dev *pdev = dev_priv->dev->pdev; ++ bool primary; ++ ++ ap = alloc_apertures(1); ++ if (!ap) ++ return; ++ ++ ap->ranges[0].base = dev_priv->dev->agp->base; ++ ap->ranges[0].size = ++ dev_priv->mm.gtt->gtt_mappable_entries << PAGE_SHIFT; ++ primary = ++ pdev->resource[PCI_ROM_RESOURCE].flags & IORESOURCE_ROM_SHADOW; ++ ++ remove_conflicting_framebuffers(ap, "inteldrmfb", primary); ++ ++ kfree(ap); ++} ++ + /** + * i915_driver_load - setup chip and create an initial config + * @dev: DRM device +@@ -1927,6 +1948,15 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags) + goto free_priv; + } + ++ dev_priv->mm.gtt = intel_gtt_get(); ++ if (!dev_priv->mm.gtt) { ++ DRM_ERROR("Failed to initialize GTT\n"); ++ ret = -ENODEV; ++ goto put_bridge; ++ } ++ ++ i915_kick_out_firmware_fb(dev_priv); ++ + /* overlay on gen2 is broken and can't address above 1G */ + if (IS_GEN2(dev)) + dma_set_coherent_mask(&dev->pdev->dev, DMA_BIT_MASK(30)); +@@ -1950,13 +1980,6 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags) + goto put_bridge; + } + +- dev_priv->mm.gtt = intel_gtt_get(); +- if (!dev_priv->mm.gtt) 
{ +- DRM_ERROR("Failed to initialize GTT\n"); +- ret = -ENODEV; +- goto out_rmmap; +- } +- + agp_size = dev_priv->mm.gtt->gtt_mappable_entries << PAGE_SHIFT; + + dev_priv->mm.gtt_mapping = +diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c +index 50ed53b..fc90c11 100644 +--- a/drivers/md/persistent-data/dm-space-map-checker.c ++++ b/drivers/md/persistent-data/dm-space-map-checker.c +@@ -8,6 +8,7 @@ + + #include + #include ++#include + + #ifdef CONFIG_DM_DEBUG_SPACE_MAPS + +@@ -89,13 +90,23 @@ static int ca_create(struct count_array *ca, struct dm_space_map *sm) + + ca->nr = nr_blocks; + ca->nr_free = nr_blocks; +- ca->counts = kzalloc(sizeof(*ca->counts) * nr_blocks, GFP_KERNEL); +- if (!ca->counts) +- return -ENOMEM; ++ ++ if (!nr_blocks) ++ ca->counts = NULL; ++ else { ++ ca->counts = vzalloc(sizeof(*ca->counts) * nr_blocks); ++ if (!ca->counts) ++ return -ENOMEM; ++ } + + return 0; + } + ++static void ca_destroy(struct count_array *ca) ++{ ++ vfree(ca->counts); ++} ++ + static int ca_load(struct count_array *ca, struct dm_space_map *sm) + { + int r; +@@ -126,12 +137,14 @@ static int ca_load(struct count_array *ca, struct dm_space_map *sm) + static int ca_extend(struct count_array *ca, dm_block_t extra_blocks) + { + dm_block_t nr_blocks = ca->nr + extra_blocks; +- uint32_t *counts = kzalloc(sizeof(*counts) * nr_blocks, GFP_KERNEL); ++ uint32_t *counts = vzalloc(sizeof(*counts) * nr_blocks); + if (!counts) + return -ENOMEM; + +- memcpy(counts, ca->counts, sizeof(*counts) * ca->nr); +- kfree(ca->counts); ++ if (ca->counts) { ++ memcpy(counts, ca->counts, sizeof(*counts) * ca->nr); ++ ca_destroy(ca); ++ } + ca->nr = nr_blocks; + ca->nr_free += extra_blocks; + ca->counts = counts; +@@ -151,11 +164,6 @@ static int ca_commit(struct count_array *old, struct count_array *new) + return 0; + } + +-static void ca_destroy(struct count_array *ca) +-{ +- kfree(ca->counts); +-} +- + /*----------------------------------------------------------------*/ + + struct sm_checker { +@@ -343,25 +351,25 @@ struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) + int r; + struct sm_checker *smc; + +- if (!sm) +- return NULL; ++ if (IS_ERR_OR_NULL(sm)) ++ return ERR_PTR(-EINVAL); + + smc = kmalloc(sizeof(*smc), GFP_KERNEL); + if (!smc) +- return NULL; ++ return ERR_PTR(-ENOMEM); + + memcpy(&smc->sm, &ops_, sizeof(smc->sm)); + r = ca_create(&smc->old_counts, sm); + if (r) { + kfree(smc); +- return NULL; ++ return ERR_PTR(r); + } + + r = ca_create(&smc->counts, sm); + if (r) { + ca_destroy(&smc->old_counts); + kfree(smc); +- return NULL; ++ return ERR_PTR(r); + } + + smc->real_sm = sm; +@@ -371,7 +379,7 @@ struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) + ca_destroy(&smc->counts); + ca_destroy(&smc->old_counts); + kfree(smc); +- return NULL; ++ return ERR_PTR(r); + } + + r = ca_commit(&smc->old_counts, &smc->counts); +@@ -379,7 +387,7 @@ struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) + ca_destroy(&smc->counts); + ca_destroy(&smc->old_counts); + kfree(smc); +- return NULL; ++ return ERR_PTR(r); + } + + return &smc->sm; +@@ -391,25 +399,25 @@ struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) + int r; + struct sm_checker *smc; + +- if (!sm) +- return NULL; ++ if (IS_ERR_OR_NULL(sm)) ++ return ERR_PTR(-EINVAL); + + smc = kmalloc(sizeof(*smc), GFP_KERNEL); + if (!smc) +- return NULL; ++ return ERR_PTR(-ENOMEM); + + memcpy(&smc->sm, &ops_, sizeof(smc->sm)); + r = 
ca_create(&smc->old_counts, sm); + if (r) { + kfree(smc); +- return NULL; ++ return ERR_PTR(r); + } + + r = ca_create(&smc->counts, sm); + if (r) { + ca_destroy(&smc->old_counts); + kfree(smc); +- return NULL; ++ return ERR_PTR(r); + } + + smc->real_sm = sm; +diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c +index fc469ba..3d0ed53 100644 +--- a/drivers/md/persistent-data/dm-space-map-disk.c ++++ b/drivers/md/persistent-data/dm-space-map-disk.c +@@ -290,7 +290,16 @@ struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, + dm_block_t nr_blocks) + { + struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks); +- return dm_sm_checker_create_fresh(sm); ++ struct dm_space_map *smc; ++ ++ if (IS_ERR_OR_NULL(sm)) ++ return sm; ++ ++ smc = dm_sm_checker_create_fresh(sm); ++ if (IS_ERR(smc)) ++ dm_sm_destroy(sm); ++ ++ return smc; + } + EXPORT_SYMBOL_GPL(dm_sm_disk_create); + +diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c +index 6f8d387..ba54aac 100644 +--- a/drivers/md/persistent-data/dm-transaction-manager.c ++++ b/drivers/md/persistent-data/dm-transaction-manager.c +@@ -138,6 +138,9 @@ EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone); + + void dm_tm_destroy(struct dm_transaction_manager *tm) + { ++ if (!tm->is_clone) ++ wipe_shadow_table(tm); ++ + kfree(tm); + } + EXPORT_SYMBOL_GPL(dm_tm_destroy); +@@ -342,8 +345,10 @@ static int dm_tm_create_internal(struct dm_block_manager *bm, + } + + *sm = dm_sm_checker_create(inner); +- if (!*sm) ++ if (IS_ERR(*sm)) { ++ r = PTR_ERR(*sm); + goto bad2; ++ } + + } else { + r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location, +@@ -362,8 +367,10 @@ static int dm_tm_create_internal(struct dm_block_manager *bm, + } + + *sm = dm_sm_checker_create(inner); +- if (!*sm) ++ if (IS_ERR(*sm)) { ++ r = PTR_ERR(*sm); + goto bad2; ++ } + } + + return 0; +diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c +index b219449..7a9eef6 100644 +--- a/drivers/md/raid10.c ++++ b/drivers/md/raid10.c +@@ -1919,7 +1919,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 + if (r10_sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, +- s<<9, conf->tmppage, WRITE) ++ s, conf->tmppage, WRITE) + == 0) { + /* Well, this device is dead */ + printk(KERN_NOTICE +@@ -1956,7 +1956,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 + switch (r10_sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, +- s<<9, conf->tmppage, ++ s, conf->tmppage, + READ)) { + case 0: + /* Well, this device is dead */ +@@ -2119,7 +2119,7 @@ read_more: + rdev = conf->mirrors[mirror].rdev; + printk_ratelimited( + KERN_ERR +- "md/raid10:%s: %s: redirecting" ++ "md/raid10:%s: %s: redirecting " + "sector %llu to another mirror\n", + mdname(mddev), + bdevname(rdev->bdev, b), +@@ -2436,6 +2436,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, + /* want to reconstruct this device */ + rb2 = r10_bio; + sect = raid10_find_virt(conf, sector_nr, i); ++ if (sect >= mddev->resync_max_sectors) { ++ /* last stripe is not complete - don't ++ * try to recover this sector. 
++ */ ++ continue; ++ } + /* Unless we are doing a full sync, we only need + * to recover the block if it is set in the bitmap + */ +diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c +index 858fdbb..6ba4954 100644 +--- a/drivers/md/raid5.c ++++ b/drivers/md/raid5.c +@@ -542,6 +542,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) + * a chance*/ + md_check_recovery(conf->mddev); + } ++ /* ++ * Because md_wait_for_blocked_rdev ++ * will dec nr_pending, we must ++ * increment it first. ++ */ ++ atomic_inc(&rdev->nr_pending); + md_wait_for_blocked_rdev(rdev, conf->mddev); + } else { + /* Acknowledged bad block - skip the write */ +@@ -3621,7 +3627,6 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) + raid_bio->bi_next = (void*)rdev; + align_bi->bi_bdev = rdev->bdev; + align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); +- align_bi->bi_sector += rdev->data_offset; + + if (!bio_fits_rdev(align_bi) || + is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, +@@ -3632,6 +3637,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) + return 0; + } + ++ /* No reshape active, so we can trust rdev->data_offset */ ++ align_bi->bi_sector += rdev->data_offset; ++ + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_stripe, + conf->quiesce == 0, +diff --git a/drivers/mtd/nand/cafe_nand.c b/drivers/mtd/nand/cafe_nand.c +index 72d3f23..68ecf48 100644 +--- a/drivers/mtd/nand/cafe_nand.c ++++ b/drivers/mtd/nand/cafe_nand.c +@@ -102,7 +102,7 @@ static const char *part_probes[] = { "cmdlinepart", "RedBoot", NULL }; + static int cafe_device_ready(struct mtd_info *mtd) + { + struct cafe_priv *cafe = mtd->priv; +- int result = !!(cafe_readl(cafe, NAND_STATUS) | 0x40000000); ++ int result = !!(cafe_readl(cafe, NAND_STATUS) & 0x40000000); + uint32_t irqs = cafe_readl(cafe, NAND_IRQ); + + cafe_writel(cafe, irqs, NAND_IRQ); +diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c +index f65e0b9..1a88e38 100644 +--- a/drivers/net/bonding/bond_main.c ++++ b/drivers/net/bonding/bond_main.c +@@ -77,6 +77,7 @@ + #include + #include + #include ++#include + #include "bonding.h" + #include "bond_3ad.h" + #include "bond_alb.h" +@@ -382,8 +383,6 @@ struct vlan_entry *bond_next_vlan(struct bonding *bond, struct vlan_entry *curr) + return next; + } + +-#define bond_queue_mapping(skb) (*(u16 *)((skb)->cb)) +- + /** + * bond_dev_queue_xmit - Prepare skb for xmit. 
+ * +@@ -396,7 +395,9 @@ int bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, + { + skb->dev = slave_dev; + +- skb->queue_mapping = bond_queue_mapping(skb); ++ BUILD_BUG_ON(sizeof(skb->queue_mapping) != ++ sizeof(qdisc_skb_cb(skb)->bond_queue_mapping)); ++ skb->queue_mapping = qdisc_skb_cb(skb)->bond_queue_mapping; + + if (unlikely(netpoll_tx_running(slave_dev))) + bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb); +@@ -4151,7 +4152,7 @@ static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb) + /* + * Save the original txq to restore before passing to the driver + */ +- bond_queue_mapping(skb) = skb->queue_mapping; ++ qdisc_skb_cb(skb)->bond_queue_mapping = skb->queue_mapping; + + if (unlikely(txq >= dev->real_num_tx_queues)) { + do { +diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c +index eeac9ca..68fe73c 100644 +--- a/drivers/net/dummy.c ++++ b/drivers/net/dummy.c +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + static int numdummies = 1; + +@@ -186,8 +187,10 @@ static int __init dummy_init_module(void) + rtnl_lock(); + err = __rtnl_link_register(&dummy_link_ops); + +- for (i = 0; i < numdummies && !err; i++) ++ for (i = 0; i < numdummies && !err; i++) { + err = dummy_init_one(); ++ cond_resched(); ++ } + if (err < 0) + __rtnl_link_unregister(&dummy_link_ops); + rtnl_unlock(); +diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c +index bf266a0..36c7c4e 100644 +--- a/drivers/net/ethernet/emulex/benet/be_main.c ++++ b/drivers/net/ethernet/emulex/benet/be_main.c +@@ -696,6 +696,8 @@ static netdev_tx_t be_xmit(struct sk_buff *skb, + + copied = make_tx_wrbs(adapter, txq, skb, wrb_cnt, dummy_wrb); + if (copied) { ++ int gso_segs = skb_shinfo(skb)->gso_segs; ++ + /* record the sent skb in the sent_skb table */ + BUG_ON(txo->sent_skb_list[start]); + txo->sent_skb_list[start] = skb; +@@ -713,8 +715,7 @@ static netdev_tx_t be_xmit(struct sk_buff *skb, + + be_txq_notify(adapter, txq->id, wrb_cnt); + +- be_tx_stats_update(txo, wrb_cnt, copied, +- skb_shinfo(skb)->gso_segs, stopped); ++ be_tx_stats_update(txo, wrb_cnt, copied, gso_segs, stopped); + } else { + txq->head = start; + dev_kfree_skb_any(skb); +diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c +index 65c51ff..11ddd838 100644 +--- a/drivers/net/ethernet/marvell/sky2.c ++++ b/drivers/net/ethernet/marvell/sky2.c +@@ -4361,10 +4361,12 @@ static int sky2_set_features(struct net_device *dev, u32 features) + struct sky2_port *sky2 = netdev_priv(dev); + u32 changed = dev->features ^ features; + +- if (changed & NETIF_F_RXCSUM) { +- u32 on = features & NETIF_F_RXCSUM; +- sky2_write32(sky2->hw, Q_ADDR(rxqaddr[sky2->port], Q_CSR), +- on ? BMU_ENA_RX_CHKSUM : BMU_DIS_RX_CHKSUM); ++ if ((changed & NETIF_F_RXCSUM) && ++ !(sky2->hw->flags & SKY2_HW_NEW_LE)) { ++ sky2_write32(sky2->hw, ++ Q_ADDR(rxqaddr[sky2->port], Q_CSR), ++ (features & NETIF_F_RXCSUM) ++ ? 
BMU_ENA_RX_CHKSUM : BMU_DIS_RX_CHKSUM); + } + + if (changed & NETIF_F_RXHASH) +diff --git a/drivers/net/wireless/ath/ath.h b/drivers/net/wireless/ath/ath.h +index 0f9ee46..4cc4a8b 100644 +--- a/drivers/net/wireless/ath/ath.h ++++ b/drivers/net/wireless/ath/ath.h +@@ -143,6 +143,7 @@ struct ath_common { + u32 keymax; + DECLARE_BITMAP(keymap, ATH_KEYMAX); + DECLARE_BITMAP(tkip_keymap, ATH_KEYMAX); ++ DECLARE_BITMAP(ccmp_keymap, ATH_KEYMAX); + enum ath_crypt_caps crypt_caps; + + unsigned int clockrate; +diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c +index 6973620..7f97164 100644 +--- a/drivers/net/wireless/ath/ath9k/hw.c ++++ b/drivers/net/wireless/ath/ath9k/hw.c +@@ -557,7 +557,7 @@ static int __ath9k_hw_init(struct ath_hw *ah) + + if (ah->config.serialize_regmode == SER_REG_MODE_AUTO) { + if (ah->hw_version.macVersion == AR_SREV_VERSION_5416_PCI || +- ((AR_SREV_9160(ah) || AR_SREV_9280(ah)) && ++ ((AR_SREV_9160(ah) || AR_SREV_9280(ah) || AR_SREV_9287(ah)) && + !ah->is_pciexpress)) { + ah->config.serialize_regmode = + SER_REG_MODE_ON; +diff --git a/drivers/net/wireless/ath/ath9k/recv.c b/drivers/net/wireless/ath/ath9k/recv.c +index 2f3aeac..e6d791c 100644 +--- a/drivers/net/wireless/ath/ath9k/recv.c ++++ b/drivers/net/wireless/ath/ath9k/recv.c +@@ -829,7 +829,8 @@ static bool ath9k_rx_accept(struct ath_common *common, + * descriptor does contain a valid key index. This has been observed + * mostly with CCMP encryption. + */ +- if (rx_stats->rs_keyix == ATH9K_RXKEYIX_INVALID) ++ if (rx_stats->rs_keyix == ATH9K_RXKEYIX_INVALID || ++ !test_bit(rx_stats->rs_keyix, common->ccmp_keymap)) + rx_stats->rs_status &= ~ATH9K_RXERR_KEYMISS; + + if (!rx_stats->rs_datalen) +diff --git a/drivers/net/wireless/ath/key.c b/drivers/net/wireless/ath/key.c +index 4cf7c5e..1ec3fa5 100644 +--- a/drivers/net/wireless/ath/key.c ++++ b/drivers/net/wireless/ath/key.c +@@ -556,6 +556,9 @@ int ath_key_config(struct ath_common *common, + return -EIO; + + set_bit(idx, common->keymap); ++ if (key->cipher == WLAN_CIPHER_SUITE_CCMP) ++ set_bit(idx, common->ccmp_keymap); ++ + if (key->cipher == WLAN_CIPHER_SUITE_TKIP) { + set_bit(idx + 64, common->keymap); + set_bit(idx, common->tkip_keymap); +@@ -582,6 +585,7 @@ void ath_key_delete(struct ath_common *common, struct ieee80211_key_conf *key) + return; + + clear_bit(key->hw_key_idx, common->keymap); ++ clear_bit(key->hw_key_idx, common->ccmp_keymap); + if (key->cipher != WLAN_CIPHER_SUITE_TKIP) + return; + +diff --git a/drivers/net/wireless/mwifiex/11n_rxreorder.c b/drivers/net/wireless/mwifiex/11n_rxreorder.c +index 7aa9aa0..39fd4d5 100644 +--- a/drivers/net/wireless/mwifiex/11n_rxreorder.c ++++ b/drivers/net/wireless/mwifiex/11n_rxreorder.c +@@ -267,7 +267,8 @@ mwifiex_11n_create_rx_reorder_tbl(struct mwifiex_private *priv, u8 *ta, + else + last_seq = priv->rx_seq[tid]; + +- if (last_seq >= new_node->start_win) ++ if (last_seq != MWIFIEX_DEF_11N_RX_SEQ_NUM && ++ last_seq >= new_node->start_win) + new_node->start_win = last_seq + 1; + + new_node->win_size = win_size; +@@ -611,5 +612,5 @@ void mwifiex_11n_cleanup_reorder_tbl(struct mwifiex_private *priv) + spin_unlock_irqrestore(&priv->rx_reorder_tbl_lock, flags); + + INIT_LIST_HEAD(&priv->rx_reorder_tbl_ptr); +- memset(priv->rx_seq, 0, sizeof(priv->rx_seq)); ++ mwifiex_reset_11n_rx_seq_num(priv); + } +diff --git a/drivers/net/wireless/mwifiex/11n_rxreorder.h b/drivers/net/wireless/mwifiex/11n_rxreorder.h +index 033c8ad..7128baa 100644 +--- a/drivers/net/wireless/mwifiex/11n_rxreorder.h 
++++ b/drivers/net/wireless/mwifiex/11n_rxreorder.h +@@ -37,6 +37,13 @@ + + #define ADDBA_RSP_STATUS_ACCEPT 0 + ++#define MWIFIEX_DEF_11N_RX_SEQ_NUM 0xffff ++ ++static inline void mwifiex_reset_11n_rx_seq_num(struct mwifiex_private *priv) ++{ ++ memset(priv->rx_seq, 0xff, sizeof(priv->rx_seq)); ++} ++ + int mwifiex_11n_rx_reorder_pkt(struct mwifiex_private *, + u16 seqNum, + u16 tid, u8 *ta, +diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c +index 462c710..01dcb1a 100644 +--- a/drivers/net/wireless/mwifiex/cfg80211.c ++++ b/drivers/net/wireless/mwifiex/cfg80211.c +@@ -1177,11 +1177,11 @@ struct net_device *mwifiex_add_virtual_intf(struct wiphy *wiphy, + void *mdev_priv; + + if (!priv) +- return NULL; ++ return ERR_PTR(-EFAULT); + + adapter = priv->adapter; + if (!adapter) +- return NULL; ++ return ERR_PTR(-EFAULT); + + switch (type) { + case NL80211_IFTYPE_UNSPECIFIED: +@@ -1190,7 +1190,7 @@ struct net_device *mwifiex_add_virtual_intf(struct wiphy *wiphy, + if (priv->bss_mode) { + wiphy_err(wiphy, "cannot create multiple" + " station/adhoc interfaces\n"); +- return NULL; ++ return ERR_PTR(-EINVAL); + } + + if (type == NL80211_IFTYPE_UNSPECIFIED) +@@ -1208,14 +1208,15 @@ struct net_device *mwifiex_add_virtual_intf(struct wiphy *wiphy, + break; + default: + wiphy_err(wiphy, "type not supported\n"); +- return NULL; ++ return ERR_PTR(-EINVAL); + } + + dev = alloc_netdev_mq(sizeof(struct mwifiex_private *), name, + ether_setup, 1); + if (!dev) { + wiphy_err(wiphy, "no memory available for netdevice\n"); +- goto error; ++ priv->bss_mode = NL80211_IFTYPE_UNSPECIFIED; ++ return ERR_PTR(-ENOMEM); + } + + dev_net_set(dev, wiphy_net(wiphy)); +@@ -1240,7 +1241,9 @@ struct net_device *mwifiex_add_virtual_intf(struct wiphy *wiphy, + /* Register network device */ + if (register_netdevice(dev)) { + wiphy_err(wiphy, "cannot register virtual network device\n"); +- goto error; ++ free_netdev(dev); ++ priv->bss_mode = NL80211_IFTYPE_UNSPECIFIED; ++ return ERR_PTR(-EFAULT); + } + + sema_init(&priv->async_sem, 1); +@@ -1252,12 +1255,6 @@ struct net_device *mwifiex_add_virtual_intf(struct wiphy *wiphy, + mwifiex_dev_debugfs_init(priv); + #endif + return dev; +-error: +- if (dev && (dev->reg_state == NETREG_UNREGISTERED)) +- free_netdev(dev); +- priv->bss_mode = NL80211_IFTYPE_UNSPECIFIED; +- +- return NULL; + } + EXPORT_SYMBOL_GPL(mwifiex_add_virtual_intf); + +diff --git a/drivers/net/wireless/mwifiex/wmm.c b/drivers/net/wireless/mwifiex/wmm.c +index 6c239c3..06fcf1e 100644 +--- a/drivers/net/wireless/mwifiex/wmm.c ++++ b/drivers/net/wireless/mwifiex/wmm.c +@@ -406,6 +406,8 @@ mwifiex_wmm_init(struct mwifiex_adapter *adapter) + priv->add_ba_param.tx_win_size = MWIFIEX_AMPDU_DEF_TXWINSIZE; + priv->add_ba_param.rx_win_size = MWIFIEX_AMPDU_DEF_RXWINSIZE; + ++ mwifiex_reset_11n_rx_seq_num(priv); ++ + atomic_set(&priv->wmm.tx_pkts_queued, 0); + atomic_set(&priv->wmm.highest_queued_prio, HIGH_PRIO_TID); + } +@@ -1209,10 +1211,12 @@ mwifiex_dequeue_tx_packet(struct mwifiex_adapter *adapter) + return 0; + } + +- if (!ptr->is_11n_enabled || mwifiex_is_ba_stream_setup(priv, ptr, tid) +- || ((priv->sec_info.wpa_enabled +- || priv->sec_info.wpa2_enabled) && !priv->wpa_is_gtk_set) +- ) { ++ if (!ptr->is_11n_enabled || ++ mwifiex_is_ba_stream_setup(priv, ptr, tid) || ++ priv->wps.session_enable || ++ ((priv->sec_info.wpa_enabled || ++ priv->sec_info.wpa2_enabled) && ++ !priv->wpa_is_gtk_set)) { + mwifiex_send_single_packet(priv, ptr, ptr_index, flags); + /* ra_list_spinlock has 
been freed in + mwifiex_send_single_packet() */ +diff --git a/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c b/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c +index 94a3e17..0302148 100644 +--- a/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c ++++ b/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c +@@ -311,9 +311,11 @@ static struct usb_device_id rtl8192c_usb_ids[] = { + {RTL_USB_DEVICE(0x07b8, 0x8188, rtl92cu_hal_cfg)}, /*Abocom - Abocom*/ + {RTL_USB_DEVICE(0x07b8, 0x8189, rtl92cu_hal_cfg)}, /*Funai - Abocom*/ + {RTL_USB_DEVICE(0x0846, 0x9041, rtl92cu_hal_cfg)}, /*NetGear WNA1000M*/ ++ {RTL_USB_DEVICE(0x0bda, 0x5088, rtl92cu_hal_cfg)}, /*Thinkware-CC&C*/ + {RTL_USB_DEVICE(0x0df6, 0x0052, rtl92cu_hal_cfg)}, /*Sitecom - Edimax*/ + {RTL_USB_DEVICE(0x0df6, 0x005c, rtl92cu_hal_cfg)}, /*Sitecom - Edimax*/ + {RTL_USB_DEVICE(0x0eb0, 0x9071, rtl92cu_hal_cfg)}, /*NO Brand - Etop*/ ++ {RTL_USB_DEVICE(0x4856, 0x0091, rtl92cu_hal_cfg)}, /*NetweeN - Feixun*/ + /* HP - Lite-On ,8188CUS Slim Combo */ + {RTL_USB_DEVICE(0x103c, 0x1629, rtl92cu_hal_cfg)}, + {RTL_USB_DEVICE(0x13d3, 0x3357, rtl92cu_hal_cfg)}, /* AzureWave */ +@@ -355,6 +357,7 @@ static struct usb_device_id rtl8192c_usb_ids[] = { + {RTL_USB_DEVICE(0x07b8, 0x8178, rtl92cu_hal_cfg)}, /*Funai -Abocom*/ + {RTL_USB_DEVICE(0x0846, 0x9021, rtl92cu_hal_cfg)}, /*Netgear-Sercomm*/ + {RTL_USB_DEVICE(0x0b05, 0x17ab, rtl92cu_hal_cfg)}, /*ASUS-Edimax*/ ++ {RTL_USB_DEVICE(0x0bda, 0x8186, rtl92cu_hal_cfg)}, /*Realtek 92CE-VAU*/ + {RTL_USB_DEVICE(0x0df6, 0x0061, rtl92cu_hal_cfg)}, /*Sitecom-Edimax*/ + {RTL_USB_DEVICE(0x0e66, 0x0019, rtl92cu_hal_cfg)}, /*Hawking-Edimax*/ + {RTL_USB_DEVICE(0x2001, 0x3307, rtl92cu_hal_cfg)}, /*D-Link-Cameo*/ +diff --git a/drivers/target/tcm_fc/tfc_sess.c b/drivers/target/tcm_fc/tfc_sess.c +index 3269213..64ddb63 100644 +--- a/drivers/target/tcm_fc/tfc_sess.c ++++ b/drivers/target/tcm_fc/tfc_sess.c +@@ -61,7 +61,8 @@ static struct ft_tport *ft_tport_create(struct fc_lport *lport) + struct ft_tport *tport; + int i; + +- tport = rcu_dereference(lport->prov[FC_TYPE_FCP]); ++ tport = rcu_dereference_protected(lport->prov[FC_TYPE_FCP], ++ lockdep_is_held(&ft_lport_lock)); + if (tport && tport->tpg) + return tport; + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 3568374..19b127c 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -692,6 +692,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, + kfree(name); + + iput(inode); ++ ++ btrfs_run_delayed_items(trans, root); + return ret; + } + +@@ -897,6 +899,7 @@ again: + ret = btrfs_unlink_inode(trans, root, dir, + inode, victim_name, + victim_name_len); ++ btrfs_run_delayed_items(trans, root); + } + kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; +@@ -1477,6 +1480,9 @@ again: + ret = btrfs_unlink_inode(trans, root, dir, inode, + name, name_len); + BUG_ON(ret); ++ ++ btrfs_run_delayed_items(trans, root); ++ + kfree(name); + iput(inode); + +diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c +index 9e0675a..b21670c 100644 +--- a/fs/cifs/connect.c ++++ b/fs/cifs/connect.c +@@ -2975,18 +2975,15 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) + * MS-CIFS indicates that servers are only limited by the client's + * bufsize for reads, testing against win98se shows that it throws + * INVALID_PARAMETER errors if you try to request too large a read. ++ * OS/2 just sends back short reads. 
+ * +- * If the server advertises a MaxBufferSize of less than one page, +- * assume that it also can't satisfy reads larger than that either. +- * +- * FIXME: Is there a better heuristic for this? ++ * If the server doesn't advertise CAP_LARGE_READ_X, then assume that ++ * it can't handle a read request larger than its MaxBufferSize either. + */ + if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP)) + defsize = CIFS_DEFAULT_IOSIZE; + else if (server->capabilities & CAP_LARGE_READ_X) + defsize = CIFS_DEFAULT_NON_POSIX_RSIZE; +- else if (server->maxBuf >= PAGE_CACHE_SIZE) +- defsize = CIFSMaxBufSize; + else + defsize = server->maxBuf - sizeof(READ_RSP); + +diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c +index 6e39668..07ee5b4 100644 +--- a/fs/ocfs2/file.c ++++ b/fs/ocfs2/file.c +@@ -2422,8 +2422,10 @@ out_dio: + unaligned_dio = 0; + } + +- if (unaligned_dio) ++ if (unaligned_dio) { ++ ocfs2_iocb_clear_unaligned_aio(iocb); + atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); ++ } + + out: + if (rw_level != -1) +diff --git a/fs/open.c b/fs/open.c +index 22c41b5..e2b5d51 100644 +--- a/fs/open.c ++++ b/fs/open.c +@@ -396,10 +396,10 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd) + { + struct file *file; + struct inode *inode; +- int error; ++ int error, fput_needed; + + error = -EBADF; +- file = fget(fd); ++ file = fget_raw_light(fd, &fput_needed); + if (!file) + goto out; + +@@ -413,7 +413,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd) + if (!error) + set_fs_pwd(current->fs, &file->f_path); + out_putf: +- fput(file); ++ fput_light(file, fput_needed); + out: + return error; + } +diff --git a/fs/splice.c b/fs/splice.c +index 6d0dfb8..014fcb4 100644 +--- a/fs/splice.c ++++ b/fs/splice.c +@@ -274,13 +274,16 @@ void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) + * Check if we need to grow the arrays holding pages and partial page + * descriptions. 
+ */ +-int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) ++int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) + { +- if (pipe->buffers <= PIPE_DEF_BUFFERS) ++ unsigned int buffers = ACCESS_ONCE(pipe->buffers); ++ ++ spd->nr_pages_max = buffers; ++ if (buffers <= PIPE_DEF_BUFFERS) + return 0; + +- spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL); +- spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL); ++ spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL); ++ spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL); + + if (spd->pages && spd->partial) + return 0; +@@ -290,10 +293,9 @@ int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) + return -ENOMEM; + } + +-void splice_shrink_spd(struct pipe_inode_info *pipe, +- struct splice_pipe_desc *spd) ++void splice_shrink_spd(struct splice_pipe_desc *spd) + { +- if (pipe->buffers <= PIPE_DEF_BUFFERS) ++ if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) + return; + + kfree(spd->pages); +@@ -316,6 +318,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, ++ .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + .spd_release = spd_release_page, +@@ -327,7 +330,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, + index = *ppos >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +- nr_pages = min(req_pages, pipe->buffers); ++ nr_pages = min(req_pages, spd.nr_pages_max); + + /* + * Lookup the (hopefully) full range of pages we need. +@@ -498,7 +501,7 @@ fill_it: + if (spd.nr_pages) + error = splice_to_pipe(pipe, &spd); + +- splice_shrink_spd(pipe, &spd); ++ splice_shrink_spd(&spd); + return error; + } + +@@ -599,6 +602,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, ++ .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &default_pipe_buf_ops, + .spd_release = spd_release_page, +@@ -609,8 +613,8 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + + res = -ENOMEM; + vec = __vec; +- if (pipe->buffers > PIPE_DEF_BUFFERS) { +- vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL); ++ if (spd.nr_pages_max > PIPE_DEF_BUFFERS) { ++ vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL); + if (!vec) + goto shrink_ret; + } +@@ -618,7 +622,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + offset = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + +- for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) { ++ for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) { + struct page *page; + + page = alloc_page(GFP_USER); +@@ -666,7 +670,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + shrink_ret: + if (vec != __vec) + kfree(vec); +- splice_shrink_spd(pipe, &spd); ++ splice_shrink_spd(&spd); + return res; + + err: +@@ -1616,6 +1620,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, ++ .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &user_page_pipe_buf_ops, + .spd_release = spd_release_page, +@@ -1631,13 +1636,13 @@ static long 
vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, + spd.partial, flags & SPLICE_F_GIFT, +- pipe->buffers); ++ spd.nr_pages_max); + if (spd.nr_pages <= 0) + ret = spd.nr_pages; + else + ret = splice_to_pipe(pipe, &spd); + +- splice_shrink_spd(pipe, &spd); ++ splice_shrink_spd(&spd); + return ret; + } + +diff --git a/fs/udf/super.c b/fs/udf/super.c +index 87cb24a..270e135 100644 +--- a/fs/udf/super.c ++++ b/fs/udf/super.c +@@ -56,6 +56,7 @@ + #include + #include + #include ++#include + #include + + #include "udf_sb.h" +@@ -1217,16 +1218,65 @@ out_bh: + return ret; + } + ++static int udf_load_sparable_map(struct super_block *sb, ++ struct udf_part_map *map, ++ struct sparablePartitionMap *spm) ++{ ++ uint32_t loc; ++ uint16_t ident; ++ struct sparingTable *st; ++ struct udf_sparing_data *sdata = &map->s_type_specific.s_sparing; ++ int i; ++ struct buffer_head *bh; ++ ++ map->s_partition_type = UDF_SPARABLE_MAP15; ++ sdata->s_packet_len = le16_to_cpu(spm->packetLength); ++ if (!is_power_of_2(sdata->s_packet_len)) { ++ udf_err(sb, "error loading logical volume descriptor: " ++ "Invalid packet length %u\n", ++ (unsigned)sdata->s_packet_len); ++ return -EIO; ++ } ++ if (spm->numSparingTables > 4) { ++ udf_err(sb, "error loading logical volume descriptor: " ++ "Too many sparing tables (%d)\n", ++ (int)spm->numSparingTables); ++ return -EIO; ++ } ++ ++ for (i = 0; i < spm->numSparingTables; i++) { ++ loc = le32_to_cpu(spm->locSparingTable[i]); ++ bh = udf_read_tagged(sb, loc, loc, &ident); ++ if (!bh) ++ continue; ++ ++ st = (struct sparingTable *)bh->b_data; ++ if (ident != 0 || ++ strncmp(st->sparingIdent.ident, UDF_ID_SPARING, ++ strlen(UDF_ID_SPARING)) || ++ sizeof(*st) + le16_to_cpu(st->reallocationTableLen) > ++ sb->s_blocksize) { ++ brelse(bh); ++ continue; ++ } ++ ++ sdata->s_spar_map[i] = bh; ++ } ++ map->s_partition_func = udf_get_pblock_spar15; ++ return 0; ++} ++ + static int udf_load_logicalvol(struct super_block *sb, sector_t block, + struct kernel_lb_addr *fileset) + { + struct logicalVolDesc *lvd; +- int i, j, offset; ++ int i, offset; + uint8_t type; + struct udf_sb_info *sbi = UDF_SB(sb); + struct genericPartitionMap *gpm; + uint16_t ident; + struct buffer_head *bh; ++ unsigned int table_len; + int ret = 0; + + bh = udf_read_tagged(sb, block, block, &ident); +@@ -1234,15 +1284,20 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, + return 1; + BUG_ON(ident != TAG_IDENT_LVD); + lvd = (struct logicalVolDesc *)bh->b_data; +- +- i = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps)); +- if (i != 0) { +- ret = i; ++ table_len = le32_to_cpu(lvd->mapTableLength); ++ if (sizeof(*lvd) + table_len > sb->s_blocksize) { ++ udf_err(sb, "error loading logical volume descriptor: " ++ "Partition table too long (%u > %lu)\n", table_len, ++ sb->s_blocksize - sizeof(*lvd)); + goto out_bh; + } + ++ ret = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps)); ++ if (ret) ++ goto out_bh; ++ + for (i = 0, offset = 0; +- i < sbi->s_partitions && offset < le32_to_cpu(lvd->mapTableLength); ++ i < sbi->s_partitions && offset < table_len; + i++, offset += gpm->partitionMapLength) { + struct udf_part_map *map = &sbi->s_partmaps[i]; + gpm = (struct genericPartitionMap *) +@@ -1277,38 +1332,9 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, + } else if (!strncmp(upm2->partIdent.ident, + UDF_ID_SPARABLE, + strlen(UDF_ID_SPARABLE))) { +- uint32_t loc; +- 
struct sparingTable *st; +- struct sparablePartitionMap *spm = +- (struct sparablePartitionMap *)gpm; +- +- map->s_partition_type = UDF_SPARABLE_MAP15; +- map->s_type_specific.s_sparing.s_packet_len = +- le16_to_cpu(spm->packetLength); +- for (j = 0; j < spm->numSparingTables; j++) { +- struct buffer_head *bh2; +- +- loc = le32_to_cpu( +- spm->locSparingTable[j]); +- bh2 = udf_read_tagged(sb, loc, loc, +- &ident); +- map->s_type_specific.s_sparing. +- s_spar_map[j] = bh2; +- +- if (bh2 == NULL) +- continue; +- +- st = (struct sparingTable *)bh2->b_data; +- if (ident != 0 || strncmp( +- st->sparingIdent.ident, +- UDF_ID_SPARING, +- strlen(UDF_ID_SPARING))) { +- brelse(bh2); +- map->s_type_specific.s_sparing. +- s_spar_map[j] = NULL; +- } +- } +- map->s_partition_func = udf_get_pblock_spar15; ++ if (udf_load_sparable_map(sb, map, ++ (struct sparablePartitionMap *)gpm) < 0) ++ goto out_bh; + } else if (!strncmp(upm2->partIdent.ident, + UDF_ID_METADATA, + strlen(UDF_ID_METADATA))) { +diff --git a/include/linux/aio.h b/include/linux/aio.h +index 2314ad8..b1a520e 100644 +--- a/include/linux/aio.h ++++ b/include/linux/aio.h +@@ -140,6 +140,7 @@ struct kiocb { + (x)->ki_dtor = NULL; \ + (x)->ki_obj.tsk = tsk; \ + (x)->ki_user_data = 0; \ ++ (x)->private = NULL; \ + } while (0) + + #define AIO_RING_MAGIC 0xa10a10a1 +diff --git a/include/linux/splice.h b/include/linux/splice.h +index 26e5b61..09a545a 100644 +--- a/include/linux/splice.h ++++ b/include/linux/splice.h +@@ -51,7 +51,8 @@ struct partial_page { + struct splice_pipe_desc { + struct page **pages; /* page map */ + struct partial_page *partial; /* pages[] may not be contig */ +- int nr_pages; /* number of pages in map */ ++ int nr_pages; /* number of populated pages in map */ ++ unsigned int nr_pages_max; /* pages[] & partial[] arrays size */ + unsigned int flags; /* splice flags */ + const struct pipe_buf_operations *ops;/* ops associated with output pipe */ + void (*spd_release)(struct splice_pipe_desc *, unsigned int); +@@ -85,9 +86,8 @@ extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, + /* + * for dynamic pipe sizing + */ +-extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *); +-extern void splice_shrink_spd(struct pipe_inode_info *, +- struct splice_pipe_desc *); ++extern int splice_grow_spd(const struct pipe_inode_info *, struct splice_pipe_desc *); ++extern void splice_shrink_spd(struct splice_pipe_desc *); + extern void spd_release_page(struct splice_pipe_desc *, unsigned int); + + extern const struct pipe_buf_operations page_cache_pipe_buf_ops; +diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h +index 9808877..a7a683e 100644 +--- a/include/net/cipso_ipv4.h ++++ b/include/net/cipso_ipv4.h +@@ -42,6 +42,7 @@ + #include + #include + #include ++#include + + /* known doi values */ + #define CIPSO_V4_DOI_UNKNOWN 0x00000000 +@@ -285,7 +286,33 @@ static inline int cipso_v4_skbuff_getattr(const struct sk_buff *skb, + static inline int cipso_v4_validate(const struct sk_buff *skb, + unsigned char **option) + { +- return -ENOSYS; ++ unsigned char *opt = *option; ++ unsigned char err_offset = 0; ++ u8 opt_len = opt[1]; ++ u8 opt_iter; ++ ++ if (opt_len < 8) { ++ err_offset = 1; ++ goto out; ++ } ++ ++ if (get_unaligned_be32(&opt[2]) == 0) { ++ err_offset = 2; ++ goto out; ++ } ++ ++ for (opt_iter = 6; opt_iter < opt_len;) { ++ if (opt[opt_iter + 1] > (opt_len - opt_iter)) { ++ err_offset = opt_iter + 1; ++ goto out; ++ } ++ opt_iter += opt[opt_iter + 1]; ++ } ++ ++out: ++ 
*option = opt + err_offset; ++ return err_offset; ++ + } + #endif /* CONFIG_NETLABEL */ + +diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h +index 55ce96b..9d7d54a 100644 +--- a/include/net/sch_generic.h ++++ b/include/net/sch_generic.h +@@ -220,13 +220,16 @@ struct tcf_proto { + + struct qdisc_skb_cb { + unsigned int pkt_len; +- unsigned char data[24]; ++ u16 bond_queue_mapping; ++ u16 _pad; ++ unsigned char data[20]; + }; + + static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) + { + struct qdisc_skb_cb *qcb; +- BUILD_BUG_ON(sizeof(skb->cb) < sizeof(unsigned int) + sz); ++ ++ BUILD_BUG_ON(sizeof(skb->cb) < offsetof(struct qdisc_skb_cb, data) + sz); + BUILD_BUG_ON(sizeof(qcb->data) < sz); + } + +diff --git a/kernel/relay.c b/kernel/relay.c +index b6f803a..a535fc9 100644 +--- a/kernel/relay.c ++++ b/kernel/relay.c +@@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in, + struct splice_pipe_desc spd = { + .pages = pages, + .nr_pages = 0, ++ .nr_pages_max = PIPE_DEF_BUFFERS, + .partial = partial, + .flags = flags, + .ops = &relay_pipe_buf_ops, +@@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in, + ret += padding; + + out: +- splice_shrink_spd(pipe, &spd); +- return ret; ++ splice_shrink_spd(&spd); ++ return ret; + } + + static ssize_t relay_file_splice_read(struct file *in, +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 697e49d..5638104 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -2541,10 +2541,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, + if (cpumask_test_cpu(cpu, tracing_cpumask) && + !cpumask_test_cpu(cpu, tracing_cpumask_new)) { + atomic_inc(&global_trace.data[cpu]->disabled); ++ ring_buffer_record_disable_cpu(global_trace.buffer, cpu); + } + if (!cpumask_test_cpu(cpu, tracing_cpumask) && + cpumask_test_cpu(cpu, tracing_cpumask_new)) { + atomic_dec(&global_trace.data[cpu]->disabled); ++ ring_buffer_record_enable_cpu(global_trace.buffer, cpu); + } + } + arch_spin_unlock(&ftrace_max_lock); +@@ -3456,6 +3458,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, + .pages = pages_def, + .partial = partial_def, + .nr_pages = 0, /* This gets updated below. 
*/ ++ .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &tracing_pipe_buf_ops, + .spd_release = tracing_spd_release_pipe, +@@ -3527,7 +3530,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, + + ret = splice_to_pipe(pipe, &spd); + out: +- splice_shrink_spd(pipe, &spd); ++ splice_shrink_spd(&spd); + return ret; + + out_err: +@@ -4017,6 +4020,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct splice_pipe_desc spd = { + .pages = pages_def, + .partial = partial_def, ++ .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &buffer_pipe_buf_ops, + .spd_release = buffer_spd_release, +@@ -4104,7 +4108,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, + } + + ret = splice_to_pipe(pipe, &spd); +- splice_shrink_spd(pipe, &spd); ++ splice_shrink_spd(&spd); + out: + return ret; + } +diff --git a/mm/madvise.c b/mm/madvise.c +index 74bf193..23d3a6b 100644 +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + + /* + * Any behaviour which results in changes to the vma->vm_flags needs to +@@ -197,14 +198,16 @@ static long madvise_remove(struct vm_area_struct *vma, + struct address_space *mapping; + loff_t offset, endoff; + int error; ++ struct file *f; + + *prev = NULL; /* tell sys_madvise we drop mmap_sem */ + + if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + return -EINVAL; + +- if (!vma->vm_file || !vma->vm_file->f_mapping +- || !vma->vm_file->f_mapping->host) { ++ f = vma->vm_file; ++ ++ if (!f || !f->f_mapping || !f->f_mapping->host) { + return -EINVAL; + } + +@@ -218,9 +221,16 @@ static long madvise_remove(struct vm_area_struct *vma, + endoff = (loff_t)(end - vma->vm_start - 1) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + +- /* vmtruncate_range needs to take i_mutex */ ++ /* ++ * vmtruncate_range may need to take i_mutex. We need to ++ * explicitly grab a reference because the vma (and hence the ++ * vma's reference to the file) can go away as soon as we drop ++ * mmap_sem. 
++ */ ++ get_file(f); + up_read(&current->mm->mmap_sem); + error = vmtruncate_range(mapping->host, offset, endoff); ++ fput(f); + down_read(&current->mm->mmap_sem); + return error; + } +diff --git a/mm/shmem.c b/mm/shmem.c +index 6c253f7..7a82174 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -1359,6 +1359,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, ++ .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + .spd_release = spd_release_page, +@@ -1447,7 +1448,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, + if (spd.nr_pages) + error = splice_to_pipe(pipe, &spd); + +- splice_shrink_spd(pipe, &spd); ++ splice_shrink_spd(&spd); + + if (error > 0) { + *ppos += error; +diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c +index f603e5b..f3f75ad 100644 +--- a/net/bridge/br_if.c ++++ b/net/bridge/br_if.c +@@ -240,6 +240,7 @@ int br_add_bridge(struct net *net, const char *name) + return -ENOMEM; + + dev_net_set(dev, net); ++ dev->rtnl_link_ops = &br_link_ops; + + res = register_netdev(dev); + if (res) +diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c +index a1daf82..cbf9ccd 100644 +--- a/net/bridge/br_netlink.c ++++ b/net/bridge/br_netlink.c +@@ -211,7 +211,7 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[]) + return 0; + } + +-static struct rtnl_link_ops br_link_ops __read_mostly = { ++struct rtnl_link_ops br_link_ops __read_mostly = { + .kind = "bridge", + .priv_size = sizeof(struct net_bridge), + .setup = br_dev_setup, +diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h +index 93264df..b9bba8f 100644 +--- a/net/bridge/br_private.h ++++ b/net/bridge/br_private.h +@@ -536,6 +536,7 @@ extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) + #endif + + /* br_netlink.c */ ++extern struct rtnl_link_ops br_link_ops; + extern int br_netlink_init(void); + extern void br_netlink_fini(void); + extern void br_ifinfo_notify(int event, struct net_bridge_port *port); +diff --git a/net/core/ethtool.c b/net/core/ethtool.c +index 2b587ec..2367246 100644 +--- a/net/core/ethtool.c ++++ b/net/core/ethtool.c +@@ -1672,6 +1672,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) + case ETHTOOL_GRXCSUM: + case ETHTOOL_GTXCSUM: + case ETHTOOL_GSG: ++ case ETHTOOL_GSSET_INFO: + case ETHTOOL_GSTRINGS: + case ETHTOOL_GTSO: + case ETHTOOL_GPERMADDR: +diff --git a/net/core/netpoll.c b/net/core/netpoll.c +index ab0633f..db4bb7a 100644 +--- a/net/core/netpoll.c ++++ b/net/core/netpoll.c +@@ -351,22 +351,23 @@ EXPORT_SYMBOL(netpoll_send_skb_on_dev); + + void netpoll_send_udp(struct netpoll *np, const char *msg, int len) + { +- int total_len, eth_len, ip_len, udp_len; ++ int total_len, ip_len, udp_len; + struct sk_buff *skb; + struct udphdr *udph; + struct iphdr *iph; + struct ethhdr *eth; + + udp_len = len + sizeof(*udph); +- ip_len = eth_len = udp_len + sizeof(*iph); +- total_len = eth_len + ETH_HLEN + NET_IP_ALIGN; ++ ip_len = udp_len + sizeof(*iph); ++ total_len = ip_len + LL_RESERVED_SPACE(np->dev); + +- skb = find_skb(np, total_len, total_len - len); ++ skb = find_skb(np, total_len + np->dev->needed_tailroom, ++ total_len - len); + if (!skb) + return; + + skb_copy_to_linear_data(skb, msg, len); +- skb->len += len; ++ skb_put(skb, len); + + skb_push(skb, sizeof(*udph)); + skb_reset_transport_header(skb); +diff --git a/net/core/skbuff.c b/net/core/skbuff.c +index 2ec200de..af9c3c6 100644 +--- 
a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -1663,6 +1663,7 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, ++ .nr_pages_max = MAX_SKB_FRAGS, + .flags = flags, + .ops = &sock_pipe_buf_ops, + .spd_release = sock_spd_release, +@@ -1709,7 +1710,7 @@ done: + lock_sock(sk); + } + +- splice_shrink_spd(pipe, &spd); ++ splice_shrink_spd(&spd); + return ret; + } + +diff --git a/net/core/sock.c b/net/core/sock.c +index b23f174..8d095b9 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -1497,6 +1497,11 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, + gfp_t gfp_mask; + long timeo; + int err; ++ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; ++ ++ err = -EMSGSIZE; ++ if (npages > MAX_SKB_FRAGS) ++ goto failure; + + gfp_mask = sk->sk_allocation; + if (gfp_mask & __GFP_WAIT) +@@ -1515,14 +1520,12 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, + if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + skb = alloc_skb(header_len, gfp_mask); + if (skb) { +- int npages; + int i; + + /* No pages, we're done... */ + if (!data_len) + break; + +- npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + skb->truesize += data_len; + skb_shinfo(skb)->nr_frags = npages; + for (i = 0; i < npages; i++) { +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index 059b9d9..2e21751 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -2881,10 +2881,6 @@ static int __net_init ip6_route_net_init(struct net *net) + net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; + net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; + +-#ifdef CONFIG_PROC_FS +- proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops); +- proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops); +-#endif + net->ipv6.ip6_rt_gc_expire = 30*HZ; + + ret = 0; +@@ -2905,10 +2901,6 @@ out_ip6_dst_ops: + + static void __net_exit ip6_route_net_exit(struct net *net) + { +-#ifdef CONFIG_PROC_FS +- proc_net_remove(net, "ipv6_route"); +- proc_net_remove(net, "rt6_stats"); +-#endif + kfree(net->ipv6.ip6_null_entry); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES + kfree(net->ipv6.ip6_prohibit_entry); +@@ -2917,11 +2909,33 @@ static void __net_exit ip6_route_net_exit(struct net *net) + dst_entries_destroy(&net->ipv6.ip6_dst_ops); + } + ++static int __net_init ip6_route_net_init_late(struct net *net) ++{ ++#ifdef CONFIG_PROC_FS ++ proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops); ++ proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops); ++#endif ++ return 0; ++} ++ ++static void __net_exit ip6_route_net_exit_late(struct net *net) ++{ ++#ifdef CONFIG_PROC_FS ++ proc_net_remove(net, "ipv6_route"); ++ proc_net_remove(net, "rt6_stats"); ++#endif ++} ++ + static struct pernet_operations ip6_route_net_ops = { + .init = ip6_route_net_init, + .exit = ip6_route_net_exit, + }; + ++static struct pernet_operations ip6_route_net_late_ops = { ++ .init = ip6_route_net_init_late, ++ .exit = ip6_route_net_exit_late, ++}; ++ + static struct notifier_block ip6_route_dev_notifier = { + .notifier_call = ip6_route_dev_notify, + .priority = 0, +@@ -2971,19 +2985,25 @@ int __init ip6_route_init(void) + if (ret) + goto xfrm6_init; + ++ ret = register_pernet_subsys(&ip6_route_net_late_ops); ++ if (ret) ++ goto fib6_rules_init; ++ + ret = -ENOBUFS; + if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) || + __rtnl_register(PF_INET6, 
RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) || + __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL)) +- goto fib6_rules_init; ++ goto out_register_late_subsys; + + ret = register_netdevice_notifier(&ip6_route_dev_notifier); + if (ret) +- goto fib6_rules_init; ++ goto out_register_late_subsys; + + out: + return ret; + ++out_register_late_subsys: ++ unregister_pernet_subsys(&ip6_route_net_late_ops); + fib6_rules_init: + fib6_rules_cleanup(); + xfrm6_init: +@@ -3002,6 +3022,7 @@ out_kmem_cache: + void ip6_route_cleanup(void) + { + unregister_netdevice_notifier(&ip6_route_dev_notifier); ++ unregister_pernet_subsys(&ip6_route_net_late_ops); + fib6_rules_cleanup(); + xfrm6_fini(); + fib6_gc_cleanup(); +diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c +index d2726a7..3c55f63 100644 +--- a/net/l2tp/l2tp_eth.c ++++ b/net/l2tp/l2tp_eth.c +@@ -167,6 +167,7 @@ static void l2tp_eth_delete(struct l2tp_session *session) + if (dev) { + unregister_netdev(dev); + spriv->dev = NULL; ++ module_put(THIS_MODULE); + } + } + } +@@ -254,6 +255,7 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p + if (rc < 0) + goto out_del_dev; + ++ __module_get(THIS_MODULE); + /* Must be done after register_netdev() */ + strlcpy(session->ifname, dev->name, IFNAMSIZ); + +diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c +index 2fbbe1f..6c7e609 100644 +--- a/net/l2tp/l2tp_ip.c ++++ b/net/l2tp/l2tp_ip.c +@@ -515,10 +515,12 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m + sk->sk_bound_dev_if); + if (IS_ERR(rt)) + goto no_route; +- if (connected) ++ if (connected) { + sk_setup_caps(sk, &rt->dst); +- else +- dst_release(&rt->dst); /* safe since we hold rcu_read_lock */ ++ } else { ++ skb_dst_set(skb, &rt->dst); ++ goto xmit; ++ } + } + + /* We dont need to clone dst here, it is guaranteed to not disappear. +@@ -526,6 +528,7 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m + */ + skb_dst_set_noref(skb, &rt->dst); + ++xmit: + /* Queue the packet to IP for output */ + rc = ip_queue_xmit(skb, &inet->cork.fl); + rcu_read_unlock(); +diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c +index 064d20f..cda4875 100644 +--- a/net/mac80211/rx.c ++++ b/net/mac80211/rx.c +@@ -2389,7 +2389,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) + * frames that we didn't handle, including returning unknown + * ones. For all other modes we will return them to the sender, + * setting the 0x80 bit in the action category, as required by +- * 802.11-2007 7.3.1.11. ++ * 802.11-2012 9.24.4. + * Newer versions of hostapd shall also use the management frame + * registration mechanisms, but older ones still use cooked + * monitor interfaces so push all frames there. 
+@@ -2399,6 +2399,9 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) + sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) + return RX_DROP_MONITOR; + ++ if (is_multicast_ether_addr(mgmt->da)) ++ return RX_DROP_MONITOR; ++ + /* do not return rejected action frames */ + if (mgmt->u.action.category & 0x80) + return RX_DROP_UNUSABLE; +diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c +index 96633f5..12b6a80 100644 +--- a/net/nfc/nci/ntf.c ++++ b/net/nfc/nci/ntf.c +@@ -86,7 +86,7 @@ static int nci_rf_activate_nfca_passive_poll(struct nci_dev *ndev, + nfca_poll->sens_res = __le16_to_cpu(*((__u16 *)data)); + data += 2; + +- nfca_poll->nfcid1_len = *data++; ++ nfca_poll->nfcid1_len = min_t(__u8, *data++, sizeof(nfca_poll->nfcid1)); + + nfc_dbg("sens_res 0x%x, nfcid1_len %d", + nfca_poll->sens_res, +@@ -111,7 +111,7 @@ static int nci_rf_activate_nfca_passive_poll(struct nci_dev *ndev, + + switch (ntf->rf_interface_type) { + case NCI_RF_INTERFACE_ISO_DEP: +- nfca_poll_iso_dep->rats_res_len = *data++; ++ nfca_poll_iso_dep->rats_res_len = min_t(__u8, *data++, 20); + if (nfca_poll_iso_dep->rats_res_len > 0) { + memcpy(nfca_poll_iso_dep->rats_res, + data, +diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c +index ee7b2b3..7a167fc 100644 +--- a/net/nfc/rawsock.c ++++ b/net/nfc/rawsock.c +@@ -52,7 +52,10 @@ static int rawsock_release(struct socket *sock) + { + struct sock *sk = sock->sk; + +- nfc_dbg("sock=%p", sock); ++ nfc_dbg("sock=%p sk=%p", sock, sk); ++ ++ if (!sk) ++ return 0; + + sock_orphan(sk); + sock_put(sk); +diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c +index 7b7a516..2b973f5 100644 +--- a/sound/pci/hda/patch_sigmatel.c ++++ b/sound/pci/hda/patch_sigmatel.c +@@ -4457,7 +4457,7 @@ static int stac92xx_init(struct hda_codec *codec) + AC_PINCTL_IN_EN); + for (i = 0; i < spec->num_pwrs; i++) { + hda_nid_t nid = spec->pwr_nids[i]; +- int pinctl, def_conf; ++ unsigned int pinctl, def_conf; + + /* power on when no jack detection is available */ + /* or when the VREF is used for controlling LED */ +@@ -4484,7 +4484,7 @@ static int stac92xx_init(struct hda_codec *codec) + def_conf = get_defcfg_connect(def_conf); + /* skip any ports that don't have jacks since presence + * detection is useless */ +- if (def_conf != AC_JACK_PORT_NONE && ++ if (def_conf != AC_JACK_PORT_COMPLEX || + !is_jack_detectable(codec, nid)) { + stac_toggle_power_map(codec, nid, 1); + continue; +diff --git a/sound/soc/codecs/tlv320aic3x.c b/sound/soc/codecs/tlv320aic3x.c +index 87d5ef1..8b48801 100644 +--- a/sound/soc/codecs/tlv320aic3x.c ++++ b/sound/soc/codecs/tlv320aic3x.c +@@ -963,9 +963,7 @@ static int aic3x_hw_params(struct snd_pcm_substream *substream, + } + + found: +- data = snd_soc_read(codec, AIC3X_PLL_PROGA_REG); +- snd_soc_write(codec, AIC3X_PLL_PROGA_REG, +- data | (pll_p << PLLP_SHIFT)); ++ snd_soc_update_bits(codec, AIC3X_PLL_PROGA_REG, PLLP_MASK, pll_p); + snd_soc_write(codec, AIC3X_OVRF_STATUS_AND_PLLR_REG, + pll_r << PLLR_SHIFT); + snd_soc_write(codec, AIC3X_PLL_PROGB_REG, pll_j << PLLJ_SHIFT); +diff --git a/sound/soc/codecs/tlv320aic3x.h b/sound/soc/codecs/tlv320aic3x.h +index 06a1978..16d9999 100644 +--- a/sound/soc/codecs/tlv320aic3x.h ++++ b/sound/soc/codecs/tlv320aic3x.h +@@ -166,6 +166,7 @@ + + /* PLL registers bitfields */ + #define PLLP_SHIFT 0 ++#define PLLP_MASK 7 + #define PLLQ_SHIFT 3 + #define PLLR_SHIFT 0 + #define PLLJ_SHIFT 2 diff --git a/3.2.34/bump/1023_linux-3.2.24.patch b/3.2.34/bump/1023_linux-3.2.24.patch new file mode 100644 index 0000000..4692eb4 --- 
/dev/null +++ b/3.2.34/bump/1023_linux-3.2.24.patch @@ -0,0 +1,4684 @@ +diff --git a/Makefile b/Makefile +index 40d1e3b..80bb4fd 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 2 +-SUBLEVEL = 23 ++SUBLEVEL = 24 + EXTRAVERSION = + NAME = Saber-toothed Squirrel + +diff --git a/arch/arm/plat-samsung/adc.c b/arch/arm/plat-samsung/adc.c +index 33ecd0c..b1e05cc 100644 +--- a/arch/arm/plat-samsung/adc.c ++++ b/arch/arm/plat-samsung/adc.c +@@ -157,11 +157,13 @@ int s3c_adc_start(struct s3c_adc_client *client, + return -EINVAL; + } + +- if (client->is_ts && adc->ts_pend) +- return -EAGAIN; +- + spin_lock_irqsave(&adc->lock, flags); + ++ if (client->is_ts && adc->ts_pend) { ++ spin_unlock_irqrestore(&adc->lock, flags); ++ return -EAGAIN; ++ } ++ + client->channel = channel; + client->nr_samples = nr_samples; + +diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h +index 97f8bf6..adda036 100644 +--- a/arch/mips/include/asm/thread_info.h ++++ b/arch/mips/include/asm/thread_info.h +@@ -60,6 +60,8 @@ struct thread_info { + register struct thread_info *__current_thread_info __asm__("$28"); + #define current_thread_info() __current_thread_info + ++#endif /* !__ASSEMBLY__ */ ++ + /* thread information allocation */ + #if defined(CONFIG_PAGE_SIZE_4KB) && defined(CONFIG_32BIT) + #define THREAD_SIZE_ORDER (1) +@@ -97,8 +99,6 @@ register struct thread_info *__current_thread_info __asm__("$28"); + + #define free_thread_info(info) kfree(info) + +-#endif /* !__ASSEMBLY__ */ +- + #define PREEMPT_ACTIVE 0x10000000 + + /* +diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S +index a81176f..be281c6 100644 +--- a/arch/mips/kernel/vmlinux.lds.S ++++ b/arch/mips/kernel/vmlinux.lds.S +@@ -1,5 +1,6 @@ + #include + #include ++#include + #include + + #undef mips +@@ -73,7 +74,7 @@ SECTIONS + .data : { /* Data */ + . = . + DATAOFFSET; /* for CONFIG_MAPPED_KERNEL */ + +- INIT_TASK_DATA(PAGE_SIZE) ++ INIT_TASK_DATA(THREAD_SIZE) + NOSAVE_DATA + CACHELINE_ALIGNED_DATA(1 << CONFIG_MIPS_L1_CACHE_SHIFT) + READ_MOSTLY_DATA(1 << CONFIG_MIPS_L1_CACHE_SHIFT) +diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h +index 98b7c4b..fa3f921 100644 +--- a/arch/powerpc/include/asm/cputime.h ++++ b/arch/powerpc/include/asm/cputime.h +@@ -126,11 +126,11 @@ static inline u64 cputime64_to_jiffies64(const cputime_t ct) + /* + * Convert cputime <-> microseconds + */ +-extern u64 __cputime_msec_factor; ++extern u64 __cputime_usec_factor; + + static inline unsigned long cputime_to_usecs(const cputime_t ct) + { +- return mulhdu(ct, __cputime_msec_factor) * USEC_PER_MSEC; ++ return mulhdu(ct, __cputime_usec_factor); + } + + static inline cputime_t usecs_to_cputime(const unsigned long us) +@@ -143,7 +143,7 @@ static inline cputime_t usecs_to_cputime(const unsigned long us) + sec = us / 1000000; + if (ct) { + ct *= tb_ticks_per_sec; +- do_div(ct, 1000); ++ do_div(ct, 1000000); + } + if (sec) + ct += (cputime_t) sec * tb_ticks_per_sec; +diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c +index 5db163c..ec8affe 100644 +--- a/arch/powerpc/kernel/time.c ++++ b/arch/powerpc/kernel/time.c +@@ -168,13 +168,13 @@ EXPORT_SYMBOL_GPL(ppc_tb_freq); + #ifdef CONFIG_VIRT_CPU_ACCOUNTING + /* + * Factors for converting from cputime_t (timebase ticks) to +- * jiffies, milliseconds, seconds, and clock_t (1/USER_HZ seconds). ++ * jiffies, microseconds, seconds, and clock_t (1/USER_HZ seconds). 
+ * These are all stored as 0.64 fixed-point binary fractions. + */ + u64 __cputime_jiffies_factor; + EXPORT_SYMBOL(__cputime_jiffies_factor); +-u64 __cputime_msec_factor; +-EXPORT_SYMBOL(__cputime_msec_factor); ++u64 __cputime_usec_factor; ++EXPORT_SYMBOL(__cputime_usec_factor); + u64 __cputime_sec_factor; + EXPORT_SYMBOL(__cputime_sec_factor); + u64 __cputime_clockt_factor; +@@ -192,8 +192,8 @@ static void calc_cputime_factors(void) + + div128_by_32(HZ, 0, tb_ticks_per_sec, &res); + __cputime_jiffies_factor = res.result_low; +- div128_by_32(1000, 0, tb_ticks_per_sec, &res); +- __cputime_msec_factor = res.result_low; ++ div128_by_32(1000000, 0, tb_ticks_per_sec, &res); ++ __cputime_usec_factor = res.result_low; + div128_by_32(1, 0, tb_ticks_per_sec, &res); + __cputime_sec_factor = res.result_low; + div128_by_32(USER_HZ, 0, tb_ticks_per_sec, &res); +diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c +index 4558f0d..479d03c 100644 +--- a/arch/x86/kernel/acpi/boot.c ++++ b/arch/x86/kernel/acpi/boot.c +@@ -416,12 +416,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, + return 0; + } + +- if (intsrc->source_irq == 0 && intsrc->global_irq == 2) { ++ if (intsrc->source_irq == 0) { + if (acpi_skip_timer_override) { +- printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); ++ printk(PREFIX "BIOS IRQ0 override ignored.\n"); + return 0; + } +- if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { ++ ++ if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity ++ && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { + intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; + printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); + } +@@ -1327,17 +1329,12 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) + } + + /* +- * Force ignoring BIOS IRQ0 pin2 override ++ * Force ignoring BIOS IRQ0 override + */ + static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) + { +- /* +- * The ati_ixp4x0_rev() early PCI quirk should have set +- * the acpi_skip_timer_override flag already: +- */ + if (!acpi_skip_timer_override) { +- WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); +- pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", ++ pr_notice("%s detected: Ignoring BIOS IRQ0 override\n", + d->ident); + acpi_skip_timer_override = 1; + } +@@ -1431,7 +1428,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { + * is enabled. This input is incorrectly designated the + * ISA IRQ 0 via an interrupt source override even though + * it is wired to the output of the master 8259A and INTIN0 +- * is not connected at all. Force ignoring BIOS IRQ0 pin2 ++ * is not connected at all. Force ignoring BIOS IRQ0 + * override in that cases. + */ + { +@@ -1466,6 +1463,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), + }, + }, ++ { ++ .callback = dmi_ignore_irq0_timer_override, ++ .ident = "FUJITSU SIEMENS", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), ++ DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"), ++ }, ++ }, + {} + }; + +diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c +index 37a458b..e61f79c 100644 +--- a/arch/x86/kernel/reboot.c ++++ b/arch/x86/kernel/reboot.c +@@ -460,6 +460,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + }, + }, ++ { /* Handle problems with rebooting on the Precision M6600. 
*/ ++ .callback = set_pci_reboot, ++ .ident = "Dell OptiPlex 990", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), ++ DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), ++ }, ++ }, + { } + }; + +diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c +index 688be8a..9e76a32 100644 +--- a/block/scsi_ioctl.c ++++ b/block/scsi_ioctl.c +@@ -721,11 +721,14 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) + break; + } + ++ if (capable(CAP_SYS_RAWIO)) ++ return 0; ++ + /* In particular, rule out all resets and host-specific ioctls. */ + printk_ratelimited(KERN_WARNING + "%s: sending ioctl %x to a partition!\n", current->comm, cmd); + +- return capable(CAP_SYS_RAWIO) ? 0 : -ENOTTY; ++ return -ENOTTY; + } + EXPORT_SYMBOL(scsi_verify_blk_ioctl); + +diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c +index c850de4..eff7222 100644 +--- a/drivers/acpi/processor_core.c ++++ b/drivers/acpi/processor_core.c +@@ -189,10 +189,12 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id) + * Processor (CPU3, 0x03, 0x00000410, 0x06) {} + * } + * +- * Ignores apic_id and always return 0 for CPU0's handle. ++ * Ignores apic_id and always returns 0 for the processor ++ * handle with acpi id 0 if nr_cpu_ids is 1. ++ * This should be the case if SMP tables are not found. + * Return -1 for other CPU's handle. + */ +- if (acpi_id == 0) ++ if (nr_cpu_ids <= 1 && acpi_id == 0) + return acpi_id; + else + return apic_id; +diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c +index ca191ff..ed6bc52 100644 +--- a/drivers/acpi/sleep.c ++++ b/drivers/acpi/sleep.c +@@ -702,8 +702,8 @@ int acpi_pm_device_sleep_state(struct device *dev, int *d_min_p) + * can wake the system. _S0W may be valid, too. + */ + if (acpi_target_sleep_state == ACPI_STATE_S0 || +- (device_may_wakeup(dev) && +- adev->wakeup.sleep_state <= acpi_target_sleep_state)) { ++ (device_may_wakeup(dev) && adev->wakeup.flags.valid && ++ adev->wakeup.sleep_state >= acpi_target_sleep_state)) { + acpi_status status; + + acpi_method[3] = 'W'; +diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c +index 9f66181..240a244 100644 +--- a/drivers/acpi/sysfs.c ++++ b/drivers/acpi/sysfs.c +@@ -173,7 +173,7 @@ static int param_set_trace_state(const char *val, struct kernel_param *kp) + { + int result = 0; + +- if (!strncmp(val, "enable", strlen("enable") - 1)) { ++ if (!strncmp(val, "enable", strlen("enable"))) { + result = acpi_debug_trace(trace_method_name, trace_debug_level, + trace_debug_layer, 0); + if (result) +@@ -181,7 +181,7 @@ static int param_set_trace_state(const char *val, struct kernel_param *kp) + goto exit; + } + +- if (!strncmp(val, "disable", strlen("disable") - 1)) { ++ if (!strncmp(val, "disable", strlen("disable"))) { + int name = 0; + result = acpi_debug_trace((char *)&name, trace_debug_level, + trace_debug_layer, 0); +diff --git a/drivers/gpio/gpio-wm8994.c b/drivers/gpio/gpio-wm8994.c +index 96198f3..a2da8f2 100644 +--- a/drivers/gpio/gpio-wm8994.c ++++ b/drivers/gpio/gpio-wm8994.c +@@ -89,8 +89,11 @@ static int wm8994_gpio_direction_out(struct gpio_chip *chip, + struct wm8994_gpio *wm8994_gpio = to_wm8994_gpio(chip); + struct wm8994 *wm8994 = wm8994_gpio->wm8994; + ++ if (value) ++ value = WM8994_GPN_LVL; ++ + return wm8994_set_bits(wm8994, WM8994_GPIO_1 + offset, +- WM8994_GPN_DIR, 0); ++ WM8994_GPN_DIR | WM8994_GPN_LVL, value); + } + + static void wm8994_gpio_set(struct gpio_chip *chip, unsigned offset, int value) +diff --git a/drivers/gpu/drm/i915/intel_display.c 
b/drivers/gpu/drm/i915/intel_display.c +index 6aa7716..cc75c4b 100644 +--- a/drivers/gpu/drm/i915/intel_display.c ++++ b/drivers/gpu/drm/i915/intel_display.c +@@ -8043,8 +8043,8 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv) + I915_WRITE(GEN6_RC6pp_THRESHOLD, 64000); /* unused */ + + if (intel_enable_rc6(dev_priv->dev)) +- rc6_mask = GEN6_RC_CTL_RC6p_ENABLE | +- GEN6_RC_CTL_RC6_ENABLE; ++ rc6_mask = GEN6_RC_CTL_RC6_ENABLE | ++ ((IS_GEN7(dev_priv->dev)) ? GEN6_RC_CTL_RC6p_ENABLE : 0); + + I915_WRITE(GEN6_RC_CONTROL, + rc6_mask | +diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c +index 299d238..899c712 100644 +--- a/drivers/hid/hid-apple.c ++++ b/drivers/hid/hid-apple.c +@@ -514,6 +514,12 @@ static const struct hid_device_id apple_devices[] = { + .driver_data = APPLE_HAS_FN | APPLE_ISO_KEYBOARD }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING5A_JIS), + .driver_data = APPLE_HAS_FN | APPLE_RDESC_JIS }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING7_ANSI), ++ .driver_data = APPLE_HAS_FN }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING7_ISO), ++ .driver_data = APPLE_HAS_FN | APPLE_ISO_KEYBOARD }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING7_JIS), ++ .driver_data = APPLE_HAS_FN | APPLE_RDESC_JIS }, + { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_ANSI), + .driver_data = APPLE_NUMLOCK_EMULATION | APPLE_HAS_FN }, + { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_ISO), +diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c +index c27b402..95430a0 100644 +--- a/drivers/hid/hid-core.c ++++ b/drivers/hid/hid-core.c +@@ -1374,6 +1374,9 @@ static const struct hid_device_id hid_have_special_driver[] = { + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING6A_ANSI) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING6A_ISO) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING6A_JIS) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING7_ANSI) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING7_ISO) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING7_JIS) }, + { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_ANSI) }, + { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_ISO) }, + { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_JIS) }, +@@ -1884,6 +1887,7 @@ static const struct hid_device_id hid_ignore_list[] = { + { HID_USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MCT) }, + { HID_USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_HYBRID) }, + { HID_USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_HEATCONTROL) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_MADCATZ, USB_DEVICE_ID_MADCATZ_BEATPAD) }, + { HID_USB_DEVICE(USB_VENDOR_ID_MCC, USB_DEVICE_ID_MCC_PMD1024LS) }, + { HID_USB_DEVICE(USB_VENDOR_ID_MCC, USB_DEVICE_ID_MCC_PMD1208LS) }, + { HID_USB_DEVICE(USB_VENDOR_ID_MICROCHIP, USB_DEVICE_ID_PICKIT1) }, +@@ -1968,6 +1972,9 @@ static const struct hid_device_id hid_mouse_ignore_list[] = { + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING6A_ANSI) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING6A_ISO) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING6A_JIS) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, 
USB_DEVICE_ID_APPLE_WELLSPRING7_ANSI) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING7_ISO) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_WELLSPRING7_JIS) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_FOUNTAIN_TP_ONLY) }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_GEYSER1_TP_ONLY) }, + { } +diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h +index fba3fc4..7db934d 100644 +--- a/drivers/hid/hid-ids.h ++++ b/drivers/hid/hid-ids.h +@@ -125,6 +125,9 @@ + #define USB_DEVICE_ID_APPLE_WELLSPRING6_ANSI 0x024c + #define USB_DEVICE_ID_APPLE_WELLSPRING6_ISO 0x024d + #define USB_DEVICE_ID_APPLE_WELLSPRING6_JIS 0x024e ++#define USB_DEVICE_ID_APPLE_WELLSPRING7_ANSI 0x0262 ++#define USB_DEVICE_ID_APPLE_WELLSPRING7_ISO 0x0263 ++#define USB_DEVICE_ID_APPLE_WELLSPRING7_JIS 0x0264 + #define USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_ANSI 0x0239 + #define USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_ISO 0x023a + #define USB_DEVICE_ID_APPLE_ALU_WIRELESS_2009_JIS 0x023b +@@ -491,6 +494,9 @@ + #define USB_DEVICE_ID_CRYSTALTOUCH 0x0006 + #define USB_DEVICE_ID_CRYSTALTOUCH_DUAL 0x0007 + ++#define USB_VENDOR_ID_MADCATZ 0x0738 ++#define USB_DEVICE_ID_MADCATZ_BEATPAD 0x4540 ++ + #define USB_VENDOR_ID_MCC 0x09db + #define USB_DEVICE_ID_MCC_PMD1024LS 0x0076 + #define USB_DEVICE_ID_MCC_PMD1208LS 0x007a +diff --git a/drivers/hwmon/it87.c b/drivers/hwmon/it87.c +index d912649..1ba7af2 100644 +--- a/drivers/hwmon/it87.c ++++ b/drivers/hwmon/it87.c +@@ -2086,7 +2086,7 @@ static void __devinit it87_init_device(struct platform_device *pdev) + + /* Start monitoring */ + it87_write_value(data, IT87_REG_CONFIG, +- (it87_read_value(data, IT87_REG_CONFIG) & 0x36) ++ (it87_read_value(data, IT87_REG_CONFIG) & 0x3e) + | (update_vbat ? 
0x41 : 0x01)); + } + +diff --git a/drivers/hwspinlock/hwspinlock_core.c b/drivers/hwspinlock/hwspinlock_core.c +index 61c9cf1..1201a15 100644 +--- a/drivers/hwspinlock/hwspinlock_core.c ++++ b/drivers/hwspinlock/hwspinlock_core.c +@@ -345,7 +345,7 @@ int hwspin_lock_register(struct hwspinlock_device *bank, struct device *dev, + spin_lock_init(&hwlock->lock); + hwlock->bank = bank; + +- ret = hwspin_lock_register_single(hwlock, i); ++ ret = hwspin_lock_register_single(hwlock, base_id + i); + if (ret) + goto reg_failed; + } +@@ -354,7 +354,7 @@ int hwspin_lock_register(struct hwspinlock_device *bank, struct device *dev, + + reg_failed: + while (--i >= 0) +- hwspin_lock_unregister_single(i); ++ hwspin_lock_unregister_single(base_id + i); + return ret; + } + EXPORT_SYMBOL_GPL(hwspin_lock_register); +diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c +index d728875..2189cbf 100644 +--- a/drivers/input/joystick/xpad.c ++++ b/drivers/input/joystick/xpad.c +@@ -142,6 +142,7 @@ static const struct xpad_device { + { 0x0c12, 0x880a, "Pelican Eclipse PL-2023", 0, XTYPE_XBOX }, + { 0x0c12, 0x8810, "Zeroplus Xbox Controller", 0, XTYPE_XBOX }, + { 0x0c12, 0x9902, "HAMA VibraX - *FAULTY HARDWARE*", 0, XTYPE_XBOX }, ++ { 0x0d2f, 0x0002, "Andamiro Pump It Up pad", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX }, + { 0x0e4c, 0x1097, "Radica Gamester Controller", 0, XTYPE_XBOX }, + { 0x0e4c, 0x2390, "Radica Games Jtech Controller", 0, XTYPE_XBOX }, + { 0x0e6f, 0x0003, "Logic3 Freebird wireless Controller", 0, XTYPE_XBOX }, +@@ -164,6 +165,7 @@ static const struct xpad_device { + { 0x1bad, 0x0003, "Harmonix Rock Band Drumkit", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360 }, + { 0x0f0d, 0x0016, "Hori Real Arcade Pro.EX", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX360 }, + { 0x0f0d, 0x000d, "Hori Fighting Stick EX2", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX360 }, ++ { 0x1689, 0xfd00, "Razer Onza Tournament Edition", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360 }, + { 0xffff, 0xffff, "Chinese-made Xbox Controller", 0, XTYPE_XBOX }, + { 0x0000, 0x0000, "Generic X-Box pad", 0, XTYPE_UNKNOWN } + }; +@@ -238,12 +240,14 @@ static struct usb_device_id xpad_table [] = { + XPAD_XBOX360_VENDOR(0x045e), /* Microsoft X-Box 360 controllers */ + XPAD_XBOX360_VENDOR(0x046d), /* Logitech X-Box 360 style controllers */ + XPAD_XBOX360_VENDOR(0x0738), /* Mad Catz X-Box 360 controllers */ ++ { USB_DEVICE(0x0738, 0x4540) }, /* Mad Catz Beat Pad */ + XPAD_XBOX360_VENDOR(0x0e6f), /* 0x0e6f X-Box 360 controllers */ + XPAD_XBOX360_VENDOR(0x12ab), /* X-Box 360 dance pads */ + XPAD_XBOX360_VENDOR(0x1430), /* RedOctane X-Box 360 controllers */ + XPAD_XBOX360_VENDOR(0x146b), /* BigBen Interactive Controllers */ + XPAD_XBOX360_VENDOR(0x1bad), /* Harminix Rock Band Guitar and Drums */ +- XPAD_XBOX360_VENDOR(0x0f0d), /* Hori Controllers */ ++ XPAD_XBOX360_VENDOR(0x0f0d), /* Hori Controllers */ ++ XPAD_XBOX360_VENDOR(0x1689), /* Razer Onza */ + { } + }; + +diff --git a/drivers/input/mouse/bcm5974.c b/drivers/input/mouse/bcm5974.c +index 5ec617e..ec58f48 100644 +--- a/drivers/input/mouse/bcm5974.c ++++ b/drivers/input/mouse/bcm5974.c +@@ -79,6 +79,10 @@ + #define USB_DEVICE_ID_APPLE_WELLSPRING5A_ANSI 0x0252 + #define USB_DEVICE_ID_APPLE_WELLSPRING5A_ISO 0x0253 + #define USB_DEVICE_ID_APPLE_WELLSPRING5A_JIS 0x0254 ++/* MacbookPro10,1 (unibody, June 2012) */ ++#define USB_DEVICE_ID_APPLE_WELLSPRING7_ANSI 0x0262 ++#define USB_DEVICE_ID_APPLE_WELLSPRING7_ISO 0x0263 ++#define USB_DEVICE_ID_APPLE_WELLSPRING7_JIS 0x0264 + + #define BCM5974_DEVICE(prod) { \ + .match_flags = 
(USB_DEVICE_ID_MATCH_DEVICE | \ +@@ -128,6 +132,10 @@ static const struct usb_device_id bcm5974_table[] = { + BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING5A_ANSI), + BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING5A_ISO), + BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING5A_JIS), ++ /* MacbookPro10,1 */ ++ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING7_ANSI), ++ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING7_ISO), ++ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING7_JIS), + /* Terminating entry */ + {} + }; +@@ -354,6 +362,18 @@ static const struct bcm5974_config bcm5974_config_table[] = { + { DIM_X, DIM_X / SN_COORD, -4620, 5140 }, + { DIM_Y, DIM_Y / SN_COORD, -150, 6600 } + }, ++ { ++ USB_DEVICE_ID_APPLE_WELLSPRING7_ANSI, ++ USB_DEVICE_ID_APPLE_WELLSPRING7_ISO, ++ USB_DEVICE_ID_APPLE_WELLSPRING7_JIS, ++ HAS_INTEGRATED_BUTTON, ++ 0x84, sizeof(struct bt_data), ++ 0x81, TYPE2, FINGER_TYPE2, FINGER_TYPE2 + SIZEOF_ALL_FINGERS, ++ { DIM_PRESSURE, DIM_PRESSURE / SN_PRESSURE, 0, 300 }, ++ { DIM_WIDTH, DIM_WIDTH / SN_WIDTH, 0, 2048 }, ++ { DIM_X, DIM_X / SN_COORD, -4750, 5280 }, ++ { DIM_Y, DIM_Y / SN_COORD, -150, 6730 } ++ }, + {} + }; + +diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c +index f1d5408..a1b8caa 100644 +--- a/drivers/iommu/amd_iommu.c ++++ b/drivers/iommu/amd_iommu.c +@@ -59,6 +59,8 @@ static struct protection_domain *pt_domain; + + static struct iommu_ops amd_iommu_ops; + ++static struct dma_map_ops amd_iommu_dma_ops; ++ + /* + * general struct to manage commands send to an IOMMU + */ +@@ -1878,6 +1880,11 @@ static int device_change_notifier(struct notifier_block *nb, + list_add_tail(&dma_domain->list, &iommu_pd_list); + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + ++ if (!iommu_pass_through) ++ dev->archdata.dma_ops = &amd_iommu_dma_ops; ++ else ++ dev->archdata.dma_ops = &nommu_dma_ops; ++ + break; + case BUS_NOTIFY_DEL_DEVICE: + +diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c +index 6269eb0..ef2d493 100644 +--- a/drivers/iommu/amd_iommu_init.c ++++ b/drivers/iommu/amd_iommu_init.c +@@ -1468,6 +1468,8 @@ static int __init amd_iommu_init(void) + + register_syscore_ops(&amd_iommu_syscore_ops); + ++ x86_platform.iommu_shutdown = disable_iommus; ++ + if (iommu_pass_through) + goto out; + +@@ -1476,7 +1478,6 @@ static int __init amd_iommu_init(void) + else + printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); + +- x86_platform.iommu_shutdown = disable_iommus; + out: + return ret; + +diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c +index 9bfd057..dae2b7a 100644 +--- a/drivers/md/dm-raid1.c ++++ b/drivers/md/dm-raid1.c +@@ -1080,6 +1080,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) + ti->split_io = dm_rh_get_region_size(ms->rh); + ti->num_flush_requests = 1; + ti->num_discard_requests = 1; ++ ti->discard_zeroes_data_unsupported = 1; + + ms->kmirrord_wq = alloc_workqueue("kmirrord", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); +@@ -1210,7 +1211,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, + * We need to dec pending if this was a write. 
+ */ + if (rw == WRITE) { +- if (!(bio->bi_rw & REQ_FLUSH)) ++ if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) + dm_rh_dec(ms->rh, map_context->ll); + return error; + } +diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c +index 7771ed2..69732e0 100644 +--- a/drivers/md/dm-region-hash.c ++++ b/drivers/md/dm-region-hash.c +@@ -404,6 +404,9 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) + return; + } + ++ if (bio->bi_rw & REQ_DISCARD) ++ return; ++ + /* We must inform the log that the sync count has changed. */ + log->type->set_region_sync(log, region, 0); + +@@ -524,7 +527,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) + struct bio *bio; + + for (bio = bios->head; bio; bio = bio->bi_next) { +- if (bio->bi_rw & REQ_FLUSH) ++ if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)) + continue; + rh_inc(rh, dm_rh_bio_to_region(rh, bio)); + } +diff --git a/drivers/md/md.c b/drivers/md/md.c +index 700ecae..d8646d7 100644 +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -3700,8 +3700,8 @@ array_state_show(struct mddev *mddev, char *page) + return sprintf(page, "%s\n", array_states[st]); + } + +-static int do_md_stop(struct mddev * mddev, int ro, int is_open); +-static int md_set_readonly(struct mddev * mddev, int is_open); ++static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev); ++static int md_set_readonly(struct mddev * mddev, struct block_device *bdev); + static int do_md_run(struct mddev * mddev); + static int restart_array(struct mddev *mddev); + +@@ -3717,14 +3717,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) + /* stopping an active array */ + if (atomic_read(&mddev->openers) > 0) + return -EBUSY; +- err = do_md_stop(mddev, 0, 0); ++ err = do_md_stop(mddev, 0, NULL); + break; + case inactive: + /* stopping an active array */ + if (mddev->pers) { + if (atomic_read(&mddev->openers) > 0) + return -EBUSY; +- err = do_md_stop(mddev, 2, 0); ++ err = do_md_stop(mddev, 2, NULL); + } else + err = 0; /* already inactive */ + break; +@@ -3732,7 +3732,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) + break; /* not supported yet */ + case readonly: + if (mddev->pers) +- err = md_set_readonly(mddev, 0); ++ err = md_set_readonly(mddev, NULL); + else { + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); +@@ -3742,7 +3742,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) + case read_auto: + if (mddev->pers) { + if (mddev->ro == 0) +- err = md_set_readonly(mddev, 0); ++ err = md_set_readonly(mddev, NULL); + else if (mddev->ro == 1) + err = restart_array(mddev); + if (err == 0) { +@@ -5078,15 +5078,17 @@ void md_stop(struct mddev *mddev) + } + EXPORT_SYMBOL_GPL(md_stop); + +-static int md_set_readonly(struct mddev *mddev, int is_open) ++static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) + { + int err = 0; + mutex_lock(&mddev->open_mutex); +- if (atomic_read(&mddev->openers) > is_open) { ++ if (atomic_read(&mddev->openers) > !!bdev) { + printk("md: %s still in use.\n",mdname(mddev)); + err = -EBUSY; + goto out; + } ++ if (bdev) ++ sync_blockdev(bdev); + if (mddev->pers) { + __md_stop_writes(mddev); + +@@ -5108,18 +5110,26 @@ out: + * 0 - completely stop and dis-assemble array + * 2 - stop but do not disassemble array + */ +-static int do_md_stop(struct mddev * mddev, int mode, int is_open) ++static int do_md_stop(struct mddev * mddev, int mode, ++ struct block_device *bdev) + { + struct gendisk *disk = mddev->gendisk; + struct 
md_rdev *rdev; + + mutex_lock(&mddev->open_mutex); +- if (atomic_read(&mddev->openers) > is_open || ++ if (atomic_read(&mddev->openers) > !!bdev || + mddev->sysfs_active) { + printk("md: %s still in use.\n",mdname(mddev)); + mutex_unlock(&mddev->open_mutex); + return -EBUSY; + } ++ if (bdev) ++ /* It is possible IO was issued on some other ++ * open file which was closed before we took ->open_mutex. ++ * As that was not the last close __blkdev_put will not ++ * have called sync_blockdev, so we must. ++ */ ++ sync_blockdev(bdev); + + if (mddev->pers) { + if (mddev->ro) +@@ -5193,7 +5203,7 @@ static void autorun_array(struct mddev *mddev) + err = do_md_run(mddev); + if (err) { + printk(KERN_WARNING "md: do_md_run() returned %d\n", err); +- do_md_stop(mddev, 0, 0); ++ do_md_stop(mddev, 0, NULL); + } + } + +@@ -6184,11 +6194,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, + goto done_unlock; + + case STOP_ARRAY: +- err = do_md_stop(mddev, 0, 1); ++ err = do_md_stop(mddev, 0, bdev); + goto done_unlock; + + case STOP_ARRAY_RO: +- err = md_set_readonly(mddev, 1); ++ err = md_set_readonly(mddev, bdev); + goto done_unlock; + + case BLKROSET: +diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c +index 7af60ec..2d97bf0 100644 +--- a/drivers/md/raid1.c ++++ b/drivers/md/raid1.c +@@ -1713,8 +1713,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) + + if (atomic_dec_and_test(&r1_bio->remaining)) { + /* if we're here, all write(s) have completed, so clean up */ +- md_done_sync(mddev, r1_bio->sectors, 1); +- put_buf(r1_bio); ++ int s = r1_bio->sectors; ++ if (test_bit(R1BIO_MadeGood, &r1_bio->state) || ++ test_bit(R1BIO_WriteError, &r1_bio->state)) ++ reschedule_retry(r1_bio); ++ else { ++ put_buf(r1_bio); ++ md_done_sync(mddev, s, 1); ++ } + } + } + +@@ -2378,9 +2384,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp + */ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + atomic_set(&r1_bio->remaining, read_targets); +- for (i=0; iraid_disks; i++) { ++ for (i = 0; i < conf->raid_disks && read_targets; i++) { + bio = r1_bio->bios[i]; + if (bio->bi_end_io == end_sync_read) { ++ read_targets--; + md_sync_acct(bio->bi_bdev, nr_sectors); + generic_make_request(bio); + } +diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c +index 6ba4954..26ef63a 100644 +--- a/drivers/md/raid5.c ++++ b/drivers/md/raid5.c +@@ -196,12 +196,14 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) + BUG_ON(!list_empty(&sh->lru)); + BUG_ON(atomic_read(&conf->active_stripes)==0); + if (test_bit(STRIPE_HANDLE, &sh->state)) { +- if (test_bit(STRIPE_DELAYED, &sh->state)) ++ if (test_bit(STRIPE_DELAYED, &sh->state) && ++ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + list_add_tail(&sh->lru, &conf->delayed_list); + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + sh->bm_seq - conf->seq_write > 0) + list_add_tail(&sh->lru, &conf->bitmap_list); + else { ++ clear_bit(STRIPE_DELAYED, &sh->state); + clear_bit(STRIPE_BIT_DELAY, &sh->state); + list_add_tail(&sh->lru, &conf->handle_list); + } +diff --git a/drivers/media/dvb/dvb-core/dvbdev.c b/drivers/media/dvb/dvb-core/dvbdev.c +index f732877..d5cda35 100644 +--- a/drivers/media/dvb/dvb-core/dvbdev.c ++++ b/drivers/media/dvb/dvb-core/dvbdev.c +@@ -243,6 +243,7 @@ int dvb_register_device(struct dvb_adapter *adap, struct dvb_device **pdvbdev, + if (minor == MAX_DVB_MINORS) { + kfree(dvbdevfops); + kfree(dvbdev); ++ up_write(&minor_rwsem); + 
mutex_unlock(&dvbdev_register_lock); + return -EINVAL; + } +diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c +index 34c03be..83e8e1b 100644 +--- a/drivers/mtd/nand/nandsim.c ++++ b/drivers/mtd/nand/nandsim.c +@@ -28,7 +28,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + #include +@@ -547,12 +547,6 @@ static char *get_partition_name(int i) + return kstrdup(buf, GFP_KERNEL); + } + +-static uint64_t divide(uint64_t n, uint32_t d) +-{ +- do_div(n, d); +- return n; +-} +- + /* + * Initialize the nandsim structure. + * +@@ -581,7 +575,7 @@ static int init_nandsim(struct mtd_info *mtd) + ns->geom.oobsz = mtd->oobsize; + ns->geom.secsz = mtd->erasesize; + ns->geom.pgszoob = ns->geom.pgsz + ns->geom.oobsz; +- ns->geom.pgnum = divide(ns->geom.totsz, ns->geom.pgsz); ++ ns->geom.pgnum = div_u64(ns->geom.totsz, ns->geom.pgsz); + ns->geom.totszoob = ns->geom.totsz + (uint64_t)ns->geom.pgnum * ns->geom.oobsz; + ns->geom.secshift = ffs(ns->geom.secsz) - 1; + ns->geom.pgshift = chip->page_shift; +@@ -924,7 +918,7 @@ static int setup_wear_reporting(struct mtd_info *mtd) + + if (!rptwear) + return 0; +- wear_eb_count = divide(mtd->size, mtd->erasesize); ++ wear_eb_count = div_u64(mtd->size, mtd->erasesize); + mem = wear_eb_count * sizeof(unsigned long); + if (mem / sizeof(unsigned long) != wear_eb_count) { + NS_ERR("Too many erase blocks for wear reporting\n"); +diff --git a/drivers/net/bonding/bond_debugfs.c b/drivers/net/bonding/bond_debugfs.c +index 3680aa2..2cf084e 100644 +--- a/drivers/net/bonding/bond_debugfs.c ++++ b/drivers/net/bonding/bond_debugfs.c +@@ -6,7 +6,7 @@ + #include "bonding.h" + #include "bond_alb.h" + +-#ifdef CONFIG_DEBUG_FS ++#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_NET_NS) + + #include + #include +diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c +index 1a88e38..6c284d1 100644 +--- a/drivers/net/bonding/bond_main.c ++++ b/drivers/net/bonding/bond_main.c +@@ -3184,6 +3184,12 @@ static int bond_master_netdev_event(unsigned long event, + switch (event) { + case NETDEV_CHANGENAME: + return bond_event_changename(event_bond); ++ case NETDEV_UNREGISTER: ++ bond_remove_proc_entry(event_bond); ++ break; ++ case NETDEV_REGISTER: ++ bond_create_proc_entry(event_bond); ++ break; + default: + break; + } +@@ -4391,8 +4397,6 @@ static void bond_uninit(struct net_device *bond_dev) + + bond_work_cancel_all(bond); + +- bond_remove_proc_entry(bond); +- + bond_debug_unregister(bond); + + __hw_addr_flush(&bond->mc_list); +@@ -4794,7 +4798,6 @@ static int bond_init(struct net_device *bond_dev) + + bond_set_lockdep_class(bond_dev); + +- bond_create_proc_entry(bond); + list_add_tail(&bond->bond_list, &bn->dev_list); + + bond_prepare_sysfs_group(bond); +diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c +index eccdcff..5ae7df7 100644 +--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c ++++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c +@@ -267,7 +267,6 @@ static void atl1c_check_link_status(struct atl1c_adapter *adapter) + dev_warn(&pdev->dev, "stop mac failed\n"); + atl1c_set_aspm(hw, false); + netif_carrier_off(netdev); +- netif_stop_queue(netdev); + atl1c_phy_reset(hw); + atl1c_phy_init(&adapter->hw); + } else { +diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h +index aec7212..8dda46a 100644 +--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h ++++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h +@@ 
-723,21 +723,6 @@ struct bnx2x_fastpath { + + #define ETH_RX_ERROR_FALGS ETH_FAST_PATH_RX_CQE_PHY_DECODE_ERR_FLG + +-#define BNX2X_IP_CSUM_ERR(cqe) \ +- (!((cqe)->fast_path_cqe.status_flags & \ +- ETH_FAST_PATH_RX_CQE_IP_XSUM_NO_VALIDATION_FLG) && \ +- ((cqe)->fast_path_cqe.type_error_flags & \ +- ETH_FAST_PATH_RX_CQE_IP_BAD_XSUM_FLG)) +- +-#define BNX2X_L4_CSUM_ERR(cqe) \ +- (!((cqe)->fast_path_cqe.status_flags & \ +- ETH_FAST_PATH_RX_CQE_L4_XSUM_NO_VALIDATION_FLG) && \ +- ((cqe)->fast_path_cqe.type_error_flags & \ +- ETH_FAST_PATH_RX_CQE_L4_BAD_XSUM_FLG)) +- +-#define BNX2X_RX_CSUM_OK(cqe) \ +- (!(BNX2X_L4_CSUM_ERR(cqe) || BNX2X_IP_CSUM_ERR(cqe))) +- + #define BNX2X_PRS_FLAG_OVERETH_IPV4(flags) \ + (((le16_to_cpu(flags) & \ + PARSING_FLAGS_OVER_ETHERNET_PROTOCOL) >> \ +diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +index 580b44e..2c1a5c0 100644 +--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c ++++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +@@ -220,7 +220,7 @@ int bnx2x_tx_int(struct bnx2x *bp, struct bnx2x_fp_txdata *txdata) + + if ((netif_tx_queue_stopped(txq)) && + (bp->state == BNX2X_STATE_OPEN) && +- (bnx2x_tx_avail(bp, txdata) >= MAX_SKB_FRAGS + 3)) ++ (bnx2x_tx_avail(bp, txdata) >= MAX_SKB_FRAGS + 4)) + netif_tx_wake_queue(txq); + + __netif_tx_unlock(txq); +@@ -551,6 +551,26 @@ static inline void bnx2x_set_skb_rxhash(struct bnx2x *bp, union eth_rx_cqe *cqe, + le32_to_cpu(cqe->fast_path_cqe.rss_hash_result); + } + ++static void bnx2x_csum_validate(struct sk_buff *skb, union eth_rx_cqe *cqe, ++ struct bnx2x_fastpath *fp) ++{ ++ /* Do nothing if no IP/L4 csum validation was done */ ++ ++ if (cqe->fast_path_cqe.status_flags & ++ (ETH_FAST_PATH_RX_CQE_IP_XSUM_NO_VALIDATION_FLG | ++ ETH_FAST_PATH_RX_CQE_L4_XSUM_NO_VALIDATION_FLG)) ++ return; ++ ++ /* If both IP/L4 validation were done, check if an error was found. 
*/ ++ ++ if (cqe->fast_path_cqe.type_error_flags & ++ (ETH_FAST_PATH_RX_CQE_IP_BAD_XSUM_FLG | ++ ETH_FAST_PATH_RX_CQE_L4_BAD_XSUM_FLG)) ++ fp->eth_q_stats.hw_csum_err++; ++ else ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++} ++ + int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget) + { + struct bnx2x *bp = fp->bp; +@@ -746,13 +766,9 @@ reuse_rx: + + skb_checksum_none_assert(skb); + +- if (bp->dev->features & NETIF_F_RXCSUM) { ++ if (bp->dev->features & NETIF_F_RXCSUM) ++ bnx2x_csum_validate(skb, cqe, fp); + +- if (likely(BNX2X_RX_CSUM_OK(cqe))) +- skb->ip_summed = CHECKSUM_UNNECESSARY; +- else +- fp->eth_q_stats.hw_csum_err++; +- } + } + + skb_record_rx_queue(skb, fp->index); +@@ -2238,8 +2254,6 @@ int bnx2x_poll(struct napi_struct *napi, int budget) + /* we split the first BD into headers and data BDs + * to ease the pain of our fellow microcode engineers + * we use one mapping for both BDs +- * So far this has only been observed to happen +- * in Other Operating Systems(TM) + */ + static noinline u16 bnx2x_tx_split(struct bnx2x *bp, + struct bnx2x_fp_txdata *txdata, +@@ -2890,7 +2904,7 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev) + + txdata->tx_bd_prod += nbd; + +- if (unlikely(bnx2x_tx_avail(bp, txdata) < MAX_SKB_FRAGS + 3)) { ++ if (unlikely(bnx2x_tx_avail(bp, txdata) < MAX_SKB_FRAGS + 4)) { + netif_tx_stop_queue(txq); + + /* paired memory barrier is in bnx2x_tx_int(), we have to keep +@@ -2899,7 +2913,7 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev) + smp_mb(); + + fp->eth_q_stats.driver_xoff++; +- if (bnx2x_tx_avail(bp, txdata) >= MAX_SKB_FRAGS + 3) ++ if (bnx2x_tx_avail(bp, txdata) >= MAX_SKB_FRAGS + 4) + netif_tx_wake_queue(txq); + } + txdata->tx_pkt++; +diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c +index 2dcac28..6b258d9 100644 +--- a/drivers/net/ethernet/broadcom/tg3.c ++++ b/drivers/net/ethernet/broadcom/tg3.c +@@ -14046,7 +14046,8 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) + } + } + +- if (tg3_flag(tp, 5755_PLUS)) ++ if (tg3_flag(tp, 5755_PLUS) || ++ GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5906) + tg3_flag_set(tp, SHORT_DMA_BUG); + + if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5719) +diff --git a/drivers/net/ethernet/intel/e1000e/82571.c b/drivers/net/ethernet/intel/e1000e/82571.c +index e556fc3..3072d35 100644 +--- a/drivers/net/ethernet/intel/e1000e/82571.c ++++ b/drivers/net/ethernet/intel/e1000e/82571.c +@@ -1571,6 +1571,9 @@ static s32 e1000_check_for_serdes_link_82571(struct e1000_hw *hw) + ctrl = er32(CTRL); + status = er32(STATUS); + rxcw = er32(RXCW); ++ /* SYNCH bit and IV bit are sticky */ ++ udelay(10); ++ rxcw = er32(RXCW); + + if ((rxcw & E1000_RXCW_SYNCH) && !(rxcw & E1000_RXCW_IV)) { + +diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c +index cc2565c..9e61d6b 100644 +--- a/drivers/net/ethernet/realtek/r8169.c ++++ b/drivers/net/ethernet/realtek/r8169.c +@@ -4185,6 +4185,7 @@ out: + return rc; + + err_out_msi_4: ++ netif_napi_del(&tp->napi); + rtl_disable_msi(pdev, tp); + iounmap(ioaddr); + err_out_free_res_3: +@@ -4210,6 +4211,8 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev) + + cancel_delayed_work_sync(&tp->task); + ++ netif_napi_del(&tp->napi); ++ + unregister_netdev(dev); + + rtl_release_firmware(tp); +diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +index 72cd190..d4d2bc1 100644 +--- 
a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c ++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +@@ -1174,6 +1174,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) + priv->hw->desc->prepare_tx_desc(desc, 0, len, csum_insertion); + wmb(); + priv->hw->desc->set_tx_owner(desc); ++ wmb(); + } + + /* Interrupt on completition only for the latest segment */ +@@ -1189,6 +1190,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) + + /* To avoid raise condition */ + priv->hw->desc->set_tx_owner(first); ++ wmb(); + + priv->cur_tx++; + +@@ -1252,6 +1254,7 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv) + } + wmb(); + priv->hw->desc->set_rx_owner(p + entry); ++ wmb(); + } + } + +diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c +index 1b7082d..26106c0 100644 +--- a/drivers/net/macvtap.c ++++ b/drivers/net/macvtap.c +@@ -504,10 +504,11 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, + if (copy > size) { + ++from; + --count; +- } ++ offset = 0; ++ } else ++ offset += size; + copy -= size; + offset1 += size; +- offset = 0; + } + + if (len == offset1) +@@ -517,24 +518,29 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, + struct page *page[MAX_SKB_FRAGS]; + int num_pages; + unsigned long base; ++ unsigned long truesize; + +- len = from->iov_len - offset1; ++ len = from->iov_len - offset; + if (!len) { +- offset1 = 0; ++ offset = 0; + ++from; + continue; + } +- base = (unsigned long)from->iov_base + offset1; ++ base = (unsigned long)from->iov_base + offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; ++ if (i + size > MAX_SKB_FRAGS) ++ return -EMSGSIZE; + num_pages = get_user_pages_fast(base, size, 0, &page[i]); +- if ((num_pages != size) || +- (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) +- /* put_page is in skb free */ ++ if (num_pages != size) { ++ for (i = 0; i < num_pages; i++) ++ put_page(page[i]); + return -EFAULT; ++ } ++ truesize = size * PAGE_SIZE; + skb->data_len += len; + skb->len += len; +- skb->truesize += len; +- atomic_add(len, &skb->sk->sk_wmem_alloc); ++ skb->truesize += truesize; ++ atomic_add(truesize, &skb->sk->sk_wmem_alloc); + while (len) { + int off = base & ~PAGE_MASK; + int size = min_t(int, len, PAGE_SIZE - off); +@@ -545,7 +551,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, + len -= size; + i++; + } +- offset1 = 0; ++ offset = 0; + ++from; + } + return 0; +@@ -645,7 +651,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, + int err; + struct virtio_net_hdr vnet_hdr = { 0 }; + int vnet_hdr_len = 0; +- int copylen; ++ int copylen = 0; + bool zerocopy = false; + + if (q->flags & IFF_VNET_HDR) { +@@ -674,15 +680,31 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, + if (unlikely(len < ETH_HLEN)) + goto err; + ++ err = -EMSGSIZE; ++ if (unlikely(count > UIO_MAXIOV)) ++ goto err; ++ + if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) + zerocopy = true; + + if (zerocopy) { ++ /* Userspace may produce vectors with count greater than ++ * MAX_SKB_FRAGS, so we need to linearize parts of the skb ++ * to let the rest of data to be fit in the frags. 
++ */ ++ if (count > MAX_SKB_FRAGS) { ++ copylen = iov_length(iv, count - MAX_SKB_FRAGS); ++ if (copylen < vnet_hdr_len) ++ copylen = 0; ++ else ++ copylen -= vnet_hdr_len; ++ } + /* There are 256 bytes to be copied in skb, so there is enough + * room for skb expand head in case it is used. + * The rest buffer is mapped from userspace. + */ +- copylen = vnet_hdr.hdr_len; ++ if (copylen < vnet_hdr.hdr_len) ++ copylen = vnet_hdr.hdr_len; + if (!copylen) + copylen = GOODCOPY_LEN; + } else +@@ -693,10 +715,9 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, + if (!skb) + goto err; + +- if (zerocopy) { ++ if (zerocopy) + err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count); +- skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; +- } else ++ else + err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, + len); + if (err) +@@ -715,8 +736,10 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, + rcu_read_lock_bh(); + vlan = rcu_dereference_bh(q->vlan); + /* copy skb_ubuf_info for callback when skb has no error */ +- if (zerocopy) ++ if (zerocopy) { + skb_shinfo(skb)->destructor_arg = m->msg_control; ++ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; ++ } + if (vlan) + macvlan_start_xmit(skb, vlan->dev); + else +diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c +index ad96164..00ed9c1 100644 +--- a/drivers/net/usb/ipheth.c ++++ b/drivers/net/usb/ipheth.c +@@ -59,6 +59,7 @@ + #define USB_PRODUCT_IPHONE_3G 0x1292 + #define USB_PRODUCT_IPHONE_3GS 0x1294 + #define USB_PRODUCT_IPHONE_4 0x1297 ++#define USB_PRODUCT_IPAD 0x129a + #define USB_PRODUCT_IPHONE_4_VZW 0x129c + #define USB_PRODUCT_IPHONE_4S 0x12a0 + +@@ -101,6 +102,10 @@ static struct usb_device_id ipheth_table[] = { + IPHETH_USBINTF_CLASS, IPHETH_USBINTF_SUBCLASS, + IPHETH_USBINTF_PROTO) }, + { USB_DEVICE_AND_INTERFACE_INFO( ++ USB_VENDOR_APPLE, USB_PRODUCT_IPAD, ++ IPHETH_USBINTF_CLASS, IPHETH_USBINTF_SUBCLASS, ++ IPHETH_USBINTF_PROTO) }, ++ { USB_DEVICE_AND_INTERFACE_INFO( + USB_VENDOR_APPLE, USB_PRODUCT_IPHONE_4_VZW, + IPHETH_USBINTF_CLASS, IPHETH_USBINTF_SUBCLASS, + IPHETH_USBINTF_PROTO) }, +diff --git a/drivers/net/wireless/brcm80211/brcmsmac/main.c b/drivers/net/wireless/brcm80211/brcmsmac/main.c +index 833cbef..8a40ff9 100644 +--- a/drivers/net/wireless/brcm80211/brcmsmac/main.c ++++ b/drivers/net/wireless/brcm80211/brcmsmac/main.c +@@ -900,8 +900,7 @@ brcms_c_dotxstatus(struct brcms_c_info *wlc, struct tx_status *txs) + */ + if (!(txs->status & TX_STATUS_AMPDU) + && (txs->status & TX_STATUS_INTERMEDIATE)) { +- wiphy_err(wlc->wiphy, "%s: INTERMEDIATE but not AMPDU\n", +- __func__); ++ BCMMSG(wlc->wiphy, "INTERMEDIATE but not AMPDU\n"); + return false; + } + +diff --git a/drivers/net/wireless/ipw2x00/ipw.h b/drivers/net/wireless/ipw2x00/ipw.h +new file mode 100644 +index 0000000..4007bf5 +--- /dev/null ++++ b/drivers/net/wireless/ipw2x00/ipw.h +@@ -0,0 +1,23 @@ ++/* ++ * Intel Pro/Wireless 2100, 2200BG, 2915ABG network connection driver ++ * ++ * Copyright 2012 Stanislav Yakovlev ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. 
++ */ ++ ++#ifndef __IPW_H__ ++#define __IPW_H__ ++ ++#include ++ ++static const u32 ipw_cipher_suites[] = { ++ WLAN_CIPHER_SUITE_WEP40, ++ WLAN_CIPHER_SUITE_WEP104, ++ WLAN_CIPHER_SUITE_TKIP, ++ WLAN_CIPHER_SUITE_CCMP, ++}; ++ ++#endif +diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c +index 127e9c6..10862d4 100644 +--- a/drivers/net/wireless/ipw2x00/ipw2100.c ++++ b/drivers/net/wireless/ipw2x00/ipw2100.c +@@ -166,6 +166,7 @@ that only one external action is invoked at a time. + #include + + #include "ipw2100.h" ++#include "ipw.h" + + #define IPW2100_VERSION "git-1.2.2" + +@@ -1955,6 +1956,9 @@ static int ipw2100_wdev_init(struct net_device *dev) + wdev->wiphy->bands[IEEE80211_BAND_2GHZ] = bg_band; + } + ++ wdev->wiphy->cipher_suites = ipw_cipher_suites; ++ wdev->wiphy->n_cipher_suites = ARRAY_SIZE(ipw_cipher_suites); ++ + set_wiphy_dev(wdev->wiphy, &priv->pci_dev->dev); + if (wiphy_register(wdev->wiphy)) { + ipw2100_down(priv); +diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c +index 827889b..56bd370 100644 +--- a/drivers/net/wireless/ipw2x00/ipw2200.c ++++ b/drivers/net/wireless/ipw2x00/ipw2200.c +@@ -34,6 +34,7 @@ + #include + #include + #include "ipw2200.h" ++#include "ipw.h" + + + #ifndef KBUILD_EXTMOD +@@ -11535,6 +11536,9 @@ static int ipw_wdev_init(struct net_device *dev) + wdev->wiphy->bands[IEEE80211_BAND_5GHZ] = a_band; + } + ++ wdev->wiphy->cipher_suites = ipw_cipher_suites; ++ wdev->wiphy->n_cipher_suites = ARRAY_SIZE(ipw_cipher_suites); ++ + set_wiphy_dev(wdev->wiphy, &priv->pci_dev->dev); + + /* With that information in place, we can now register the wiphy... */ +diff --git a/drivers/net/wireless/iwlegacy/iwl-4965-sta.c b/drivers/net/wireless/iwlegacy/iwl-4965-sta.c +index a262c23..0116ca8 100644 +--- a/drivers/net/wireless/iwlegacy/iwl-4965-sta.c ++++ b/drivers/net/wireless/iwlegacy/iwl-4965-sta.c +@@ -466,7 +466,7 @@ int iwl4965_remove_dynamic_key(struct iwl_priv *priv, + return 0; + } + +- if (priv->stations[sta_id].sta.key.key_offset == WEP_INVALID_OFFSET) { ++ if (priv->stations[sta_id].sta.key.key_flags & STA_KEY_FLG_INVALID) { + IWL_WARN(priv, "Removing wrong key %d 0x%x\n", + keyconf->keyidx, key_flags); + spin_unlock_irqrestore(&priv->sta_lock, flags); +@@ -483,7 +483,7 @@ int iwl4965_remove_dynamic_key(struct iwl_priv *priv, + sizeof(struct iwl4965_keyinfo)); + priv->stations[sta_id].sta.key.key_flags = + STA_KEY_FLG_NO_ENC | STA_KEY_FLG_INVALID; +- priv->stations[sta_id].sta.key.key_offset = WEP_INVALID_OFFSET; ++ priv->stations[sta_id].sta.key.key_offset = keyconf->hw_key_idx; + priv->stations[sta_id].sta.sta.modify_mask = STA_MODIFY_KEY_MASK; + priv->stations[sta_id].sta.mode = STA_CONTROL_MODIFY_MSK; + +diff --git a/drivers/net/wireless/iwlegacy/iwl-core.c b/drivers/net/wireless/iwlegacy/iwl-core.c +index 2bd5659..1bb64c9 100644 +--- a/drivers/net/wireless/iwlegacy/iwl-core.c ++++ b/drivers/net/wireless/iwlegacy/iwl-core.c +@@ -1884,14 +1884,12 @@ void iwl_legacy_bg_watchdog(unsigned long data) + return; + + /* monitor and check for other stuck queues */ +- if (iwl_legacy_is_any_associated(priv)) { +- for (cnt = 0; cnt < priv->hw_params.max_txq_num; cnt++) { +- /* skip as we already checked the command queue */ +- if (cnt == priv->cmd_queue) +- continue; +- if (iwl_legacy_check_stuck_queue(priv, cnt)) +- return; +- } ++ for (cnt = 0; cnt < priv->hw_params.max_txq_num; cnt++) { ++ /* skip as we already checked the command queue */ ++ if (cnt == priv->cmd_queue) ++ continue; 
++ if (iwl_legacy_check_stuck_queue(priv, cnt)) ++ return; + } + + mod_timer(&priv->watchdog, jiffies + +diff --git a/drivers/net/wireless/rt2x00/rt2x00usb.c b/drivers/net/wireless/rt2x00/rt2x00usb.c +index 1e31050..ba28807 100644 +--- a/drivers/net/wireless/rt2x00/rt2x00usb.c ++++ b/drivers/net/wireless/rt2x00/rt2x00usb.c +@@ -426,8 +426,8 @@ void rt2x00usb_kick_queue(struct data_queue *queue) + case QID_RX: + if (!rt2x00queue_full(queue)) + rt2x00queue_for_each_entry(queue, +- Q_INDEX_DONE, + Q_INDEX, ++ Q_INDEX_DONE, + NULL, + rt2x00usb_kick_rx_entry); + break; +diff --git a/drivers/net/wireless/rtl818x/rtl8187/leds.c b/drivers/net/wireless/rtl818x/rtl8187/leds.c +index 2e0de2f..c2d5b49 100644 +--- a/drivers/net/wireless/rtl818x/rtl8187/leds.c ++++ b/drivers/net/wireless/rtl818x/rtl8187/leds.c +@@ -117,7 +117,7 @@ static void rtl8187_led_brightness_set(struct led_classdev *led_dev, + radio_on = true; + } else if (radio_on) { + radio_on = false; +- cancel_delayed_work_sync(&priv->led_on); ++ cancel_delayed_work(&priv->led_on); + ieee80211_queue_delayed_work(hw, &priv->led_off, 0); + } + } else if (radio_on) { +diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c +index 12d1e81..d024f83 100644 +--- a/drivers/pci/pci-driver.c ++++ b/drivers/pci/pci-driver.c +@@ -742,6 +742,18 @@ static int pci_pm_suspend_noirq(struct device *dev) + + pci_pm_set_unknown_state(pci_dev); + ++ /* ++ * Some BIOSes from ASUS have a bug: If a USB EHCI host controller's ++ * PCI COMMAND register isn't 0, the BIOS assumes that the controller ++ * hasn't been quiesced and tries to turn it off. If the controller ++ * is already in D3, this can hang or cause memory corruption. ++ * ++ * Since the value of the COMMAND register doesn't matter once the ++ * device has been suspended, we can safely set it to 0 here. ++ */ ++ if (pci_dev->class == PCI_CLASS_SERIAL_USB_EHCI) ++ pci_write_config_word(pci_dev, PCI_COMMAND, 0); ++ + return 0; + } + +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index e5b75eb..6d4a531 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -1689,11 +1689,6 @@ int pci_prepare_to_sleep(struct pci_dev *dev) + if (target_state == PCI_POWER_ERROR) + return -EIO; + +- /* Some devices mustn't be in D3 during system sleep */ +- if (target_state == PCI_D3hot && +- (dev->dev_flags & PCI_DEV_FLAGS_NO_D3_DURING_SLEEP)) +- return 0; +- + pci_enable_wake(dev, target_state, device_may_wakeup(&dev->dev)); + + error = pci_set_power_state(dev, target_state); +diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c +index 3c56fec..78fda9c 100644 +--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -2940,32 +2940,6 @@ static void __devinit disable_igfx_irq(struct pci_dev *dev) + DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0102, disable_igfx_irq); + DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x010a, disable_igfx_irq); + +-/* +- * The Intel 6 Series/C200 Series chipset's EHCI controllers on many +- * ASUS motherboards will cause memory corruption or a system crash +- * if they are in D3 while the system is put into S3 sleep. 
+- */ +-static void __devinit asus_ehci_no_d3(struct pci_dev *dev) +-{ +- const char *sys_info; +- static const char good_Asus_board[] = "P8Z68-V"; +- +- if (dev->dev_flags & PCI_DEV_FLAGS_NO_D3_DURING_SLEEP) +- return; +- if (dev->subsystem_vendor != PCI_VENDOR_ID_ASUSTEK) +- return; +- sys_info = dmi_get_system_info(DMI_BOARD_NAME); +- if (sys_info && memcmp(sys_info, good_Asus_board, +- sizeof(good_Asus_board) - 1) == 0) +- return; +- +- dev_info(&dev->dev, "broken D3 during system sleep on ASUS\n"); +- dev->dev_flags |= PCI_DEV_FLAGS_NO_D3_DURING_SLEEP; +- device_set_wakeup_capable(&dev->dev, false); +-} +-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1c26, asus_ehci_no_d3); +-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1c2d, asus_ehci_no_d3); +- + static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f, + struct pci_fixup *end) + { +diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c +index 809a3ae..b46ec11 100644 +--- a/drivers/platform/x86/intel_ips.c ++++ b/drivers/platform/x86/intel_ips.c +@@ -72,6 +72,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1505,6 +1506,24 @@ static DEFINE_PCI_DEVICE_TABLE(ips_id_table) = { + + MODULE_DEVICE_TABLE(pci, ips_id_table); + ++static int ips_blacklist_callback(const struct dmi_system_id *id) ++{ ++ pr_info("Blacklisted intel_ips for %s\n", id->ident); ++ return 1; ++} ++ ++static const struct dmi_system_id ips_blacklist[] = { ++ { ++ .callback = ips_blacklist_callback, ++ .ident = "HP ProBook", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), ++ DMI_MATCH(DMI_PRODUCT_NAME, "HP ProBook"), ++ }, ++ }, ++ { } /* terminating entry */ ++}; ++ + static int ips_probe(struct pci_dev *dev, const struct pci_device_id *id) + { + u64 platform_info; +@@ -1514,6 +1533,9 @@ static int ips_probe(struct pci_dev *dev, const struct pci_device_id *id) + u16 htshi, trc, trc_required_mask; + u8 tse; + ++ if (dmi_check_system(ips_blacklist)) ++ return -ENODEV; ++ + ips = kzalloc(sizeof(struct ips_driver), GFP_KERNEL); + if (!ips) + return -ENOMEM; +diff --git a/drivers/platform/x86/samsung-laptop.c b/drivers/platform/x86/samsung-laptop.c +index 09e26bf..af1e296 100644 +--- a/drivers/platform/x86/samsung-laptop.c ++++ b/drivers/platform/x86/samsung-laptop.c +@@ -540,245 +540,34 @@ static DEVICE_ATTR(performance_level, S_IWUSR | S_IRUGO, + get_performance_level, set_performance_level); + + +-static int __init dmi_check_cb(const struct dmi_system_id *id) +-{ +- pr_info("found laptop model '%s'\n", +- id->ident); +- return 1; +-} +- + static struct dmi_system_id __initdata samsung_dmi_table[] = { + { +- .ident = "N128", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, +- "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "N128"), +- DMI_MATCH(DMI_BOARD_NAME, "N128"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "N130", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, + "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "N130"), +- DMI_MATCH(DMI_BOARD_NAME, "N130"), ++ DMI_MATCH(DMI_CHASSIS_TYPE, "8"), /* Portable */ + }, +- .callback = dmi_check_cb, + }, + { +- .ident = "N510", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, + "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "N510"), +- DMI_MATCH(DMI_BOARD_NAME, "N510"), ++ DMI_MATCH(DMI_CHASSIS_TYPE, "9"), /* Laptop */ + }, +- .callback = dmi_check_cb, + }, + { +- .ident = "X125", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, + "SAMSUNG ELECTRONICS CO., LTD."), +- 
DMI_MATCH(DMI_PRODUCT_NAME, "X125"), +- DMI_MATCH(DMI_BOARD_NAME, "X125"), ++ DMI_MATCH(DMI_CHASSIS_TYPE, "10"), /* Notebook */ + }, +- .callback = dmi_check_cb, + }, + { +- .ident = "X120/X170", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, + "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "X120/X170"), +- DMI_MATCH(DMI_BOARD_NAME, "X120/X170"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "NC10", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, +- "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "NC10"), +- DMI_MATCH(DMI_BOARD_NAME, "NC10"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "NP-Q45", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, +- "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "SQ45S70S"), +- DMI_MATCH(DMI_BOARD_NAME, "SQ45S70S"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "X360", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, +- "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "X360"), +- DMI_MATCH(DMI_BOARD_NAME, "X360"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "R410 Plus", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, +- "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "R410P"), +- DMI_MATCH(DMI_BOARD_NAME, "R460"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "R518", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, +- "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "R518"), +- DMI_MATCH(DMI_BOARD_NAME, "R518"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "R519/R719", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, +- "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "R519/R719"), +- DMI_MATCH(DMI_BOARD_NAME, "R519/R719"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "N150/N210/N220", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, +- "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "N150/N210/N220"), +- DMI_MATCH(DMI_BOARD_NAME, "N150/N210/N220"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "N220", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, +- "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "N220"), +- DMI_MATCH(DMI_BOARD_NAME, "N220"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "N150/N210/N220/N230", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, +- "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "N150/N210/N220/N230"), +- DMI_MATCH(DMI_BOARD_NAME, "N150/N210/N220/N230"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "N150P/N210P/N220P", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, +- "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "N150P/N210P/N220P"), +- DMI_MATCH(DMI_BOARD_NAME, "N150P/N210P/N220P"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "R700", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "SR700"), +- DMI_MATCH(DMI_BOARD_NAME, "SR700"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "R530/R730", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "R530/R730"), +- DMI_MATCH(DMI_BOARD_NAME, "R530/R730"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "NF110/NF210/NF310", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "NF110/NF210/NF310"), +- DMI_MATCH(DMI_BOARD_NAME, "NF110/NF210/NF310"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = 
"N145P/N250P/N260P", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "N145P/N250P/N260P"), +- DMI_MATCH(DMI_BOARD_NAME, "N145P/N250P/N260P"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "R70/R71", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, +- "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "R70/R71"), +- DMI_MATCH(DMI_BOARD_NAME, "R70/R71"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "P460", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "P460"), +- DMI_MATCH(DMI_BOARD_NAME, "P460"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "R528/R728", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "R528/R728"), +- DMI_MATCH(DMI_BOARD_NAME, "R528/R728"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "NC210/NC110", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "NC210/NC110"), +- DMI_MATCH(DMI_BOARD_NAME, "NC210/NC110"), +- }, +- .callback = dmi_check_cb, +- }, +- { +- .ident = "X520", +- .matches = { +- DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."), +- DMI_MATCH(DMI_PRODUCT_NAME, "X520"), +- DMI_MATCH(DMI_BOARD_NAME, "X520"), ++ DMI_MATCH(DMI_CHASSIS_TYPE, "14"), /* Sub-Notebook */ + }, +- .callback = dmi_check_cb, + }, + { }, + }; +@@ -819,7 +608,8 @@ static int __init samsung_init(void) + + f0000_segment = ioremap_nocache(0xf0000, 0xffff); + if (!f0000_segment) { +- pr_err("Can't map the segment at 0xf0000\n"); ++ if (debug || force) ++ pr_err("Can't map the segment at 0xf0000\n"); + return -EINVAL; + } + +@@ -832,7 +622,8 @@ static int __init samsung_init(void) + } + + if (loca == 0xffff) { +- pr_err("This computer does not support SABI\n"); ++ if (debug || force) ++ pr_err("This computer does not support SABI\n"); + goto error_no_signature; + } + +diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c +index 39e41fb..5160354 100644 +--- a/drivers/rtc/rtc-mxc.c ++++ b/drivers/rtc/rtc-mxc.c +@@ -191,10 +191,11 @@ static irqreturn_t mxc_rtc_interrupt(int irq, void *dev_id) + struct platform_device *pdev = dev_id; + struct rtc_plat_data *pdata = platform_get_drvdata(pdev); + void __iomem *ioaddr = pdata->ioaddr; ++ unsigned long flags; + u32 status; + u32 events = 0; + +- spin_lock_irq(&pdata->rtc->irq_lock); ++ spin_lock_irqsave(&pdata->rtc->irq_lock, flags); + status = readw(ioaddr + RTC_RTCISR) & readw(ioaddr + RTC_RTCIENR); + /* clear interrupt sources */ + writew(status, ioaddr + RTC_RTCISR); +@@ -217,7 +218,7 @@ static irqreturn_t mxc_rtc_interrupt(int irq, void *dev_id) + rtc_update_alarm(&pdev->dev, &pdata->g_rtc_alarm); + + rtc_update_irq(pdata->rtc, 1, events); +- spin_unlock_irq(&pdata->rtc->irq_lock); ++ spin_unlock_irqrestore(&pdata->rtc->irq_lock, flags); + + return IRQ_HANDLED; + } +diff --git a/drivers/scsi/aic94xx/aic94xx_task.c b/drivers/scsi/aic94xx/aic94xx_task.c +index 532d212..393e7ce 100644 +--- a/drivers/scsi/aic94xx/aic94xx_task.c ++++ b/drivers/scsi/aic94xx/aic94xx_task.c +@@ -201,7 +201,7 @@ static void asd_get_response_tasklet(struct asd_ascb *ascb, + + if (SAS_STATUS_BUF_SIZE >= sizeof(*resp)) { + resp->frame_len = le16_to_cpu(*(__le16 *)(r+6)); +- memcpy(&resp->ending_fis[0], r+16, 24); ++ memcpy(&resp->ending_fis[0], r+16, ATA_RESP_FIS_SIZE); + ts->buf_valid_size = sizeof(*resp); + } + } +diff --git a/drivers/scsi/libsas/sas_ata.c 
b/drivers/scsi/libsas/sas_ata.c +index db9238f..4868fc9 100644 +--- a/drivers/scsi/libsas/sas_ata.c ++++ b/drivers/scsi/libsas/sas_ata.c +@@ -112,12 +112,12 @@ static void sas_ata_task_done(struct sas_task *task) + if (stat->stat == SAS_PROTO_RESPONSE || stat->stat == SAM_STAT_GOOD || + ((stat->stat == SAM_STAT_CHECK_CONDITION && + dev->sata_dev.command_set == ATAPI_COMMAND_SET))) { +- ata_tf_from_fis(resp->ending_fis, &dev->sata_dev.tf); ++ memcpy(dev->sata_dev.fis, resp->ending_fis, ATA_RESP_FIS_SIZE); + + if (!link->sactive) { +- qc->err_mask |= ac_err_mask(dev->sata_dev.tf.command); ++ qc->err_mask |= ac_err_mask(dev->sata_dev.fis[2]); + } else { +- link->eh_info.err_mask |= ac_err_mask(dev->sata_dev.tf.command); ++ link->eh_info.err_mask |= ac_err_mask(dev->sata_dev.fis[2]); + if (unlikely(link->eh_info.err_mask)) + qc->flags |= ATA_QCFLAG_FAILED; + } +@@ -138,8 +138,8 @@ static void sas_ata_task_done(struct sas_task *task) + qc->flags |= ATA_QCFLAG_FAILED; + } + +- dev->sata_dev.tf.feature = 0x04; /* status err */ +- dev->sata_dev.tf.command = ATA_ERR; ++ dev->sata_dev.fis[3] = 0x04; /* status err */ ++ dev->sata_dev.fis[2] = ATA_ERR; + } + } + +@@ -252,7 +252,7 @@ static bool sas_ata_qc_fill_rtf(struct ata_queued_cmd *qc) + { + struct domain_device *dev = qc->ap->private_data; + +- memcpy(&qc->result_tf, &dev->sata_dev.tf, sizeof(qc->result_tf)); ++ ata_tf_from_fis(dev->sata_dev.fis, &qc->result_tf); + return true; + } + +diff --git a/drivers/target/target_core_cdb.c b/drivers/target/target_core_cdb.c +index 65ea65a..93b9406 100644 +--- a/drivers/target/target_core_cdb.c ++++ b/drivers/target/target_core_cdb.c +@@ -1199,7 +1199,7 @@ int target_emulate_write_same(struct se_task *task) + if (num_blocks != 0) + range = num_blocks; + else +- range = (dev->transport->get_blocks(dev) - lba); ++ range = (dev->transport->get_blocks(dev) - lba) + 1; + + pr_debug("WRITE_SAME UNMAP: LBA: %llu Range: %llu\n", + (unsigned long long)lba, (unsigned long long)range); +diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c +index b75bc92..9145141 100644 +--- a/drivers/target/target_core_pr.c ++++ b/drivers/target/target_core_pr.c +@@ -2042,7 +2042,7 @@ static int __core_scsi3_write_aptpl_to_file( + if (IS_ERR(file) || !file || !file->f_dentry) { + pr_err("filp_open(%s) for APTPL metadata" + " failed\n", path); +- return (PTR_ERR(file) < 0 ? PTR_ERR(file) : -ENOENT); ++ return IS_ERR(file) ? 
PTR_ERR(file) : -ENOENT; + } + + iov[0].iov_base = &buf[0]; +@@ -3853,7 +3853,7 @@ int target_scsi3_emulate_pr_out(struct se_task *task) + " SPC-2 reservation is held, returning" + " RESERVATION_CONFLICT\n"); + cmd->scsi_sense_reason = TCM_RESERVATION_CONFLICT; +- ret = EINVAL; ++ ret = -EINVAL; + goto out; + } + +@@ -3863,7 +3863,8 @@ int target_scsi3_emulate_pr_out(struct se_task *task) + */ + if (!cmd->se_sess) { + cmd->scsi_sense_reason = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; +- return -EINVAL; ++ ret = -EINVAL; ++ goto out; + } + + if (cmd->data_length < 24) { +diff --git a/drivers/target/tcm_fc/tfc_cmd.c b/drivers/target/tcm_fc/tfc_cmd.c +index d95cfe2..278819c 100644 +--- a/drivers/target/tcm_fc/tfc_cmd.c ++++ b/drivers/target/tcm_fc/tfc_cmd.c +@@ -249,6 +249,8 @@ u32 ft_get_task_tag(struct se_cmd *se_cmd) + { + struct ft_cmd *cmd = container_of(se_cmd, struct ft_cmd, se_cmd); + ++ if (cmd->aborted) ++ return ~0; + return fc_seq_exch(cmd->seq)->rxid; + } + +diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c +index 19fb5fa..9aaed0d 100644 +--- a/drivers/usb/class/cdc-wdm.c ++++ b/drivers/usb/class/cdc-wdm.c +@@ -473,6 +473,8 @@ retry: + goto retry; + } + if (!desc->reslength) { /* zero length read */ ++ dev_dbg(&desc->intf->dev, "%s: zero length - clearing WDM_READ\n", __func__); ++ clear_bit(WDM_READ, &desc->flags); + spin_unlock_irq(&desc->iuspin); + goto retry; + } +diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c +index 52d27ed..175b6bb 100644 +--- a/drivers/usb/core/hub.c ++++ b/drivers/usb/core/hub.c +@@ -2039,12 +2039,16 @@ static unsigned hub_is_wusb(struct usb_hub *hub) + static int hub_port_reset(struct usb_hub *hub, int port1, + struct usb_device *udev, unsigned int delay, bool warm); + +-/* Is a USB 3.0 port in the Inactive state? */ +-static bool hub_port_inactive(struct usb_hub *hub, u16 portstatus) ++/* Is a USB 3.0 port in the Inactive or Complinance Mode state? ++ * Port worm reset is required to recover ++ */ ++static bool hub_port_warm_reset_required(struct usb_hub *hub, u16 portstatus) + { + return hub_is_superspeed(hub->hdev) && +- (portstatus & USB_PORT_STAT_LINK_STATE) == +- USB_SS_PORT_LS_SS_INACTIVE; ++ (((portstatus & USB_PORT_STAT_LINK_STATE) == ++ USB_SS_PORT_LS_SS_INACTIVE) || ++ ((portstatus & USB_PORT_STAT_LINK_STATE) == ++ USB_SS_PORT_LS_COMP_MOD)) ; + } + + static int hub_port_wait_reset(struct usb_hub *hub, int port1, +@@ -2080,7 +2084,7 @@ static int hub_port_wait_reset(struct usb_hub *hub, int port1, + * + * See https://bugzilla.kernel.org/show_bug.cgi?id=41752 + */ +- if (hub_port_inactive(hub, portstatus)) { ++ if (hub_port_warm_reset_required(hub, portstatus)) { + int ret; + + if ((portchange & USB_PORT_STAT_C_CONNECTION)) +@@ -3646,9 +3650,7 @@ static void hub_events(void) + /* Warm reset a USB3 protocol port if it's in + * SS.Inactive state. 
+ */ +- if (hub_is_superspeed(hub->hdev) && +- (portstatus & USB_PORT_STAT_LINK_STATE) +- == USB_SS_PORT_LS_SS_INACTIVE) { ++ if (hub_port_warm_reset_required(hub, portstatus)) { + dev_dbg(hub_dev, "warm reset port %d\n", i); + hub_port_reset(hub, i, NULL, + HUB_BH_RESET_TIME, true); +diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c +index a8b2980..fd8a2c2 100644 +--- a/drivers/usb/host/xhci-hub.c ++++ b/drivers/usb/host/xhci-hub.c +@@ -438,6 +438,42 @@ void xhci_test_and_clear_bit(struct xhci_hcd *xhci, __le32 __iomem **port_array, + } + } + ++/* Updates Link Status for super Speed port */ ++static void xhci_hub_report_link_state(u32 *status, u32 status_reg) ++{ ++ u32 pls = status_reg & PORT_PLS_MASK; ++ ++ /* resume state is a xHCI internal state. ++ * Do not report it to usb core. ++ */ ++ if (pls == XDEV_RESUME) ++ return; ++ ++ /* When the CAS bit is set then warm reset ++ * should be performed on port ++ */ ++ if (status_reg & PORT_CAS) { ++ /* The CAS bit can be set while the port is ++ * in any link state. ++ * Only roothubs have CAS bit, so we ++ * pretend to be in compliance mode ++ * unless we're already in compliance ++ * or the inactive state. ++ */ ++ if (pls != USB_SS_PORT_LS_COMP_MOD && ++ pls != USB_SS_PORT_LS_SS_INACTIVE) { ++ pls = USB_SS_PORT_LS_COMP_MOD; ++ } ++ /* Return also connection bit - ++ * hub state machine resets port ++ * when this bit is set. ++ */ ++ pls |= USB_PORT_STAT_CONNECTION; ++ } ++ /* update status field */ ++ *status |= pls; ++} ++ + int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, + u16 wIndex, char *buf, u16 wLength) + { +@@ -579,13 +615,9 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, + else + status |= USB_PORT_STAT_POWER; + } +- /* Port Link State */ ++ /* Update Port Link State for super speed ports*/ + if (hcd->speed == HCD_USB3) { +- /* resume state is a xHCI internal state. +- * Do not report it to usb core. +- */ +- if ((temp & PORT_PLS_MASK) != XDEV_RESUME) +- status |= (temp & PORT_PLS_MASK); ++ xhci_hub_report_link_state(&status, temp); + } + if (bus_state->port_c_suspend & (1 << wIndex)) + status |= 1 << USB_PORT_FEAT_C_SUSPEND; +diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h +index 363b141..7a56805 100644 +--- a/drivers/usb/host/xhci.h ++++ b/drivers/usb/host/xhci.h +@@ -341,7 +341,11 @@ struct xhci_op_regs { + #define PORT_PLC (1 << 22) + /* port configure error change - port failed to configure its link partner */ + #define PORT_CEC (1 << 23) +-/* bit 24 reserved */ ++/* Cold Attach Status - xHC can set this bit to report device attached during ++ * Sx state. Warm port reset should be perfomed to clear this bit and move port ++ * to connected state. 
++ */ ++#define PORT_CAS (1 << 24) + /* wake on connect (enable) */ + #define PORT_WKCONN_E (1 << 25) + /* wake on disconnect (enable) */ +diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c +index 21a4734..5971c95 100644 +--- a/drivers/usb/serial/option.c ++++ b/drivers/usb/serial/option.c +@@ -496,6 +496,15 @@ static void option_instat_callback(struct urb *urb); + + /* MediaTek products */ + #define MEDIATEK_VENDOR_ID 0x0e8d ++#define MEDIATEK_PRODUCT_DC_1COM 0x00a0 ++#define MEDIATEK_PRODUCT_DC_4COM 0x00a5 ++#define MEDIATEK_PRODUCT_DC_5COM 0x00a4 ++#define MEDIATEK_PRODUCT_7208_1COM 0x7101 ++#define MEDIATEK_PRODUCT_7208_2COM 0x7102 ++#define MEDIATEK_PRODUCT_FP_1COM 0x0003 ++#define MEDIATEK_PRODUCT_FP_2COM 0x0023 ++#define MEDIATEK_PRODUCT_FPDC_1COM 0x0043 ++#define MEDIATEK_PRODUCT_FPDC_2COM 0x0033 + + /* Cellient products */ + #define CELLIENT_VENDOR_ID 0x2692 +@@ -553,6 +562,10 @@ static const struct option_blacklist_info net_intf1_blacklist = { + .reserved = BIT(1), + }; + ++static const struct option_blacklist_info net_intf2_blacklist = { ++ .reserved = BIT(2), ++}; ++ + static const struct option_blacklist_info net_intf3_blacklist = { + .reserved = BIT(3), + }; +@@ -1093,6 +1106,8 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1298, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1299, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1300, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1402, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf2_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x2002, 0xff, + 0xff, 0xff), .driver_info = (kernel_ulong_t)&zte_k3765_z_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x2003, 0xff, 0xff, 0xff) }, +@@ -1234,6 +1249,17 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a1, 0xff, 0x02, 0x01) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a2, 0xff, 0x00, 0x00) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x00a2, 0xff, 0x02, 0x01) }, /* MediaTek MT6276M modem & app port */ ++ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_1COM, 0x0a, 0x00, 0x00) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_5COM, 0xff, 0x02, 0x01) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_5COM, 0xff, 0x00, 0x00) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_4COM, 0xff, 0x02, 0x01) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_4COM, 0xff, 0x00, 0x00) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_7208_1COM, 0x02, 0x00, 0x00) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_7208_2COM, 0x02, 0x02, 0x01) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FP_1COM, 0x0a, 0x00, 0x00) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FP_2COM, 0x0a, 0x00, 0x00) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FPDC_1COM, 0x0a, 0x00, 0x00) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_FPDC_2COM, 0x0a, 0x00, 0x00) }, + { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MEN200) }, + { } /* Terminating entry */ + }; +diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c +index c14c42b..ae66278 100644 +--- 
a/drivers/vhost/vhost.c ++++ b/drivers/vhost/vhost.c +@@ -222,6 +222,8 @@ static int vhost_worker(void *data) + if (work) { + __set_current_state(TASK_RUNNING); + work->fn(work); ++ if (need_resched()) ++ schedule(); + } else + schedule(); + +diff --git a/fs/buffer.c b/fs/buffer.c +index c807931..4115eca 100644 +--- a/fs/buffer.c ++++ b/fs/buffer.c +@@ -1087,6 +1087,9 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) + static struct buffer_head * + __getblk_slow(struct block_device *bdev, sector_t block, int size) + { ++ int ret; ++ struct buffer_head *bh; ++ + /* Size must be multiple of hard sectorsize */ + if (unlikely(size & (bdev_logical_block_size(bdev)-1) || + (size < 512 || size > PAGE_SIZE))) { +@@ -1099,20 +1102,21 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) + return NULL; + } + +- for (;;) { +- struct buffer_head * bh; +- int ret; ++retry: ++ bh = __find_get_block(bdev, block, size); ++ if (bh) ++ return bh; + ++ ret = grow_buffers(bdev, block, size); ++ if (ret == 0) { ++ free_more_memory(); ++ goto retry; ++ } else if (ret > 0) { + bh = __find_get_block(bdev, block, size); + if (bh) + return bh; +- +- ret = grow_buffers(bdev, block, size); +- if (ret < 0) +- return NULL; +- if (ret == 0) +- free_more_memory(); + } ++ return NULL; + } + + /* +diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c +index b21670c..56c152d 100644 +--- a/fs/cifs/connect.c ++++ b/fs/cifs/connect.c +@@ -2925,6 +2925,18 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, + #define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024) + #define CIFS_DEFAULT_NON_POSIX_WSIZE (65536) + ++/* ++ * On hosts with high memory, we can't currently support wsize/rsize that are ++ * larger than we can kmap at once. Cap the rsize/wsize at ++ * LAST_PKMAP * PAGE_SIZE. We'll never be able to fill a read or write request ++ * larger than that anyway. ++ */ ++#ifdef CONFIG_HIGHMEM ++#define CIFS_KMAP_SIZE_LIMIT (LAST_PKMAP * PAGE_CACHE_SIZE) ++#else /* CONFIG_HIGHMEM */ ++#define CIFS_KMAP_SIZE_LIMIT (1<<24) ++#endif /* CONFIG_HIGHMEM */ ++ + static unsigned int + cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) + { +@@ -2955,6 +2967,9 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) + wsize = min_t(unsigned int, wsize, + server->maxBuf - sizeof(WRITE_REQ) + 4); + ++ /* limit to the amount that we can kmap at once */ ++ wsize = min_t(unsigned int, wsize, CIFS_KMAP_SIZE_LIMIT); ++ + /* hard limit of CIFS_MAX_WSIZE */ + wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE); + +@@ -2996,6 +3011,9 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) + if (!(server->capabilities & CAP_LARGE_READ_X)) + rsize = min_t(unsigned int, CIFSMaxBufSize, rsize); + ++ /* limit to the amount that we can kmap at once */ ++ rsize = min_t(unsigned int, rsize, CIFS_KMAP_SIZE_LIMIT); ++ + /* hard limit of CIFS_MAX_RSIZE */ + rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE); + +diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c +index db4a138..4c37ed4 100644 +--- a/fs/cifs/readdir.c ++++ b/fs/cifs/readdir.c +@@ -86,9 +86,12 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, + + dentry = d_lookup(parent, name); + if (dentry) { +- /* FIXME: check for inode number changes? 
*/ +- if (dentry->d_inode != NULL) ++ inode = dentry->d_inode; ++ /* update inode in place if i_ino didn't change */ ++ if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { ++ cifs_fattr_to_inode(inode, fattr); + return dentry; ++ } + d_drop(dentry); + dput(dentry); + } +diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c +index 69f994a..0dbe58a 100644 +--- a/fs/ecryptfs/kthread.c ++++ b/fs/ecryptfs/kthread.c +@@ -149,7 +149,7 @@ int ecryptfs_privileged_open(struct file **lower_file, + (*lower_file) = dentry_open(lower_dentry, lower_mnt, flags, cred); + if (!IS_ERR(*lower_file)) + goto out; +- if (flags & O_RDONLY) { ++ if ((flags & O_ACCMODE) == O_RDONLY) { + rc = PTR_ERR((*lower_file)); + goto out; + } +diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c +index 0dc5a3d..de42310 100644 +--- a/fs/ecryptfs/miscdev.c ++++ b/fs/ecryptfs/miscdev.c +@@ -49,7 +49,10 @@ ecryptfs_miscdev_poll(struct file *file, poll_table *pt) + mutex_lock(&ecryptfs_daemon_hash_mux); + /* TODO: Just use file->private_data? */ + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); +- BUG_ON(rc || !daemon); ++ if (rc || !daemon) { ++ mutex_unlock(&ecryptfs_daemon_hash_mux); ++ return -EINVAL; ++ } + mutex_lock(&daemon->mux); + mutex_unlock(&ecryptfs_daemon_hash_mux); + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { +@@ -122,6 +125,7 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file) + goto out_unlock_daemon; + } + daemon->flags |= ECRYPTFS_DAEMON_MISCDEV_OPEN; ++ file->private_data = daemon; + atomic_inc(&ecryptfs_num_miscdev_opens); + out_unlock_daemon: + mutex_unlock(&daemon->mux); +@@ -152,9 +156,9 @@ ecryptfs_miscdev_release(struct inode *inode, struct file *file) + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); +- BUG_ON(rc || !daemon); ++ if (rc || !daemon) ++ daemon = file->private_data; + mutex_lock(&daemon->mux); +- BUG_ON(daemon->pid != task_pid(current)); + BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN)); + daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN; + atomic_dec(&ecryptfs_num_miscdev_opens); +@@ -191,31 +195,32 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct ecryptfs_daemon *daemon) + { +- int rc = 0; ++ struct ecryptfs_message *msg; + +- mutex_lock(&msg_ctx->mux); +- msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), +- GFP_KERNEL); +- if (!msg_ctx->msg) { +- rc = -ENOMEM; ++ msg = kmalloc((sizeof(*msg) + data_size), GFP_KERNEL); ++ if (!msg) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc(%zd, GFP_KERNEL)\n", __func__, +- (sizeof(*msg_ctx->msg) + data_size)); +- goto out_unlock; ++ (sizeof(*msg) + data_size)); ++ return -ENOMEM; + } ++ ++ mutex_lock(&msg_ctx->mux); ++ msg_ctx->msg = msg; + msg_ctx->msg->index = msg_ctx->index; + msg_ctx->msg->data_len = data_size; + msg_ctx->type = msg_type; + memcpy(msg_ctx->msg->data, data, data_size); + msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size); +- mutex_lock(&daemon->mux); + list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue); ++ mutex_unlock(&msg_ctx->mux); ++ ++ mutex_lock(&daemon->mux); + daemon->num_queued_msg_ctx++; + wake_up_interruptible(&daemon->wait); + mutex_unlock(&daemon->mux); +-out_unlock: +- mutex_unlock(&msg_ctx->mux); +- return rc; ++ ++ return 0; + } + + /** +@@ -246,8 +251,16 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count, + 
mutex_lock(&ecryptfs_daemon_hash_mux); + /* TODO: Just use file->private_data? */ + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); +- BUG_ON(rc || !daemon); ++ if (rc || !daemon) { ++ mutex_unlock(&ecryptfs_daemon_hash_mux); ++ return -EINVAL; ++ } + mutex_lock(&daemon->mux); ++ if (task_pid(current) != daemon->pid) { ++ mutex_unlock(&daemon->mux); ++ mutex_unlock(&ecryptfs_daemon_hash_mux); ++ return -EPERM; ++ } + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + rc = 0; + mutex_unlock(&ecryptfs_daemon_hash_mux); +@@ -284,9 +297,6 @@ check_list: + * message from the queue; try again */ + goto check_list; + } +- BUG_ON(euid != daemon->euid); +- BUG_ON(current_user_ns() != daemon->user_ns); +- BUG_ON(task_pid(current) != daemon->pid); + msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue, + struct ecryptfs_msg_ctx, daemon_out_list); + BUG_ON(!msg_ctx); +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index 4d9d3a4..a6f3763 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -1629,8 +1629,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + if (op == EPOLL_CTL_ADD) { + if (is_file_epoll(tfile)) { + error = -ELOOP; +- if (ep_loop_check(ep, tfile) != 0) ++ if (ep_loop_check(ep, tfile) != 0) { ++ clear_tfile_check_list(); + goto error_tgt_fput; ++ } + } else + list_add(&tfile->f_tfile_llink, &tfile_check_list); + } +diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c +index 49cf230..24a49d4 100644 +--- a/fs/exofs/ore.c ++++ b/fs/exofs/ore.c +@@ -735,13 +735,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) + out: + ios->numdevs = devs_in_group; + ios->pages_consumed = cur_pg; +- if (unlikely(ret)) { +- if (length == ios->length) +- return ret; +- else +- ios->length -= length; +- } +- return 0; ++ return ret; + } + + int ore_create(struct ore_io_state *ios) +diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c +index d222c77..fff2070 100644 +--- a/fs/exofs/ore_raid.c ++++ b/fs/exofs/ore_raid.c +@@ -461,16 +461,12 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) + * ios->sp2d[p][*], xor is calculated the same way. 
These pages are + * allocated/freed and don't go through cache + */ +-static int _read_4_write(struct ore_io_state *ios) ++static int _read_4_write_first_stripe(struct ore_io_state *ios) + { +- struct ore_io_state *ios_read; + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset = ios->si.first_stripe_start; +- u64 last_stripe_end; +- unsigned bytes_in_stripe = ios->si.bytes_in_stripe; +- unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; +- int ret; ++ unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; + + if (offset == ios->offset) /* Go to start collect $200 */ + goto read_last_stripe; +@@ -478,6 +474,9 @@ static int _read_4_write(struct ore_io_state *ios) + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + ++ ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n", ++ offset, ios->offset, min_p, max_p); ++ + for (c = 0; ; c++) { + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + read_si.obj_offset += min_p * PAGE_SIZE; +@@ -512,6 +511,18 @@ static int _read_4_write(struct ore_io_state *ios) + } + + read_last_stripe: ++ return 0; ++} ++ ++static int _read_4_write_last_stripe(struct ore_io_state *ios) ++{ ++ struct ore_striping_info read_si; ++ struct __stripe_pages_2d *sp2d = ios->sp2d; ++ u64 offset; ++ u64 last_stripe_end; ++ unsigned bytes_in_stripe = ios->si.bytes_in_stripe; ++ unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; ++ + offset = ios->offset + ios->length; + if (offset % PAGE_SIZE) + _add_to_r4w_last_page(ios, &offset); +@@ -527,15 +538,15 @@ read_last_stripe: + c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, + ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); + +- BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); +- /* unaligned IO must be within a single stripe */ +- + if (min_p == sp2d->pages_in_unit) { + /* Didn't do it yet */ + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + } + ++ ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n", ++ offset, last_stripe_end, min_p, max_p); ++ + while (offset < last_stripe_end) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + +@@ -568,6 +579,15 @@ read_last_stripe: + } + + read_it: ++ return 0; ++} ++ ++static int _read_4_write_execute(struct ore_io_state *ios) ++{ ++ struct ore_io_state *ios_read; ++ unsigned i; ++ int ret; ++ + ios_read = ios->ios_read_4_write; + if (!ios_read) + return 0; +@@ -591,6 +611,8 @@ read_it: + } + + _mark_read4write_pages_uptodate(ios_read, ret); ++ ore_put_io_state(ios_read); ++ ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */ + return 0; + } + +@@ -626,8 +648,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios, + /* If first stripe, Read in all read4write pages + * (if needed) before we calculate the first parity. 
+ */ +- _read_4_write(ios); ++ _read_4_write_first_stripe(ios); + } ++ if (!cur_len) /* If last stripe r4w pages of last stripe */ ++ _read_4_write_last_stripe(ios); ++ _read_4_write_execute(ios); + + for (i = 0; i < num_pages; i++) { + pages[i] = _raid_page_alloc(); +@@ -654,34 +679,14 @@ int _ore_add_parity_unit(struct ore_io_state *ios, + + int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) + { +- struct ore_layout *layout = ios->layout; +- + if (ios->parity_pages) { ++ struct ore_layout *layout = ios->layout; + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; +- unsigned stripe_size = ios->si.bytes_in_stripe; +- u64 last_stripe, first_stripe; + + if (_sp2d_alloc(pages_in_unit, layout->group_width, + layout->parity, &ios->sp2d)) { + return -ENOMEM; + } +- +- /* Round io down to last full strip */ +- first_stripe = div_u64(ios->offset, stripe_size); +- last_stripe = div_u64(ios->offset + ios->length, stripe_size); +- +- /* If an IO spans more then a single stripe it must end at +- * a stripe boundary. The reminder at the end is pushed into the +- * next IO. +- */ +- if (last_stripe != first_stripe) { +- ios->length = last_stripe * stripe_size - ios->offset; +- +- BUG_ON(!ios->length); +- ios->nr_pages = (ios->length + PAGE_SIZE - 1) / +- PAGE_SIZE; +- ios->si.length = ios->length; /*make it consistent */ +- } + } + return 0; + } +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index ab7aa3f..a93486e 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1097,7 +1097,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) + } + if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { + seq_printf(seq, ",max_batch_time=%u", +- (unsigned) sbi->s_min_batch_time); ++ (unsigned) sbi->s_max_batch_time); + } + + /* +diff --git a/fs/fifo.c b/fs/fifo.c +index b1a524d..cf6f434 100644 +--- a/fs/fifo.c ++++ b/fs/fifo.c +@@ -14,7 +14,7 @@ + #include + #include + +-static void wait_for_partner(struct inode* inode, unsigned int *cnt) ++static int wait_for_partner(struct inode* inode, unsigned int *cnt) + { + int cur = *cnt; + +@@ -23,6 +23,7 @@ static void wait_for_partner(struct inode* inode, unsigned int *cnt) + if (signal_pending(current)) + break; + } ++ return cur == *cnt ? 
-ERESTARTSYS : 0; + } + + static void wake_up_partner(struct inode* inode) +@@ -67,8 +68,7 @@ static int fifo_open(struct inode *inode, struct file *filp) + * seen a writer */ + filp->f_version = pipe->w_counter; + } else { +- wait_for_partner(inode, &pipe->w_counter); +- if(signal_pending(current)) ++ if (wait_for_partner(inode, &pipe->w_counter)) + goto err_rd; + } + } +@@ -90,8 +90,7 @@ static int fifo_open(struct inode *inode, struct file *filp) + wake_up_partner(inode); + + if (!pipe->readers) { +- wait_for_partner(inode, &pipe->r_counter); +- if (signal_pending(current)) ++ if (wait_for_partner(inode, &pipe->r_counter)) + goto err_wr; + } + break; +diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c +index 2d0ca24..ebc2f4d 100644 +--- a/fs/hugetlbfs/inode.c ++++ b/fs/hugetlbfs/inode.c +@@ -592,9 +592,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) + spin_lock(&sbinfo->stat_lock); + /* If no limits set, just report 0 for max/free/used + * blocks, like simple_statfs() */ +- if (sbinfo->max_blocks >= 0) { +- buf->f_blocks = sbinfo->max_blocks; +- buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; ++ if (sbinfo->spool) { ++ long free_pages; ++ ++ spin_lock(&sbinfo->spool->lock); ++ buf->f_blocks = sbinfo->spool->max_hpages; ++ free_pages = sbinfo->spool->max_hpages ++ - sbinfo->spool->used_hpages; ++ buf->f_bavail = buf->f_bfree = free_pages; ++ spin_unlock(&sbinfo->spool->lock); + buf->f_files = sbinfo->max_inodes; + buf->f_ffree = sbinfo->free_inodes; + } +@@ -610,6 +616,10 @@ static void hugetlbfs_put_super(struct super_block *sb) + + if (sbi) { + sb->s_fs_info = NULL; ++ ++ if (sbi->spool) ++ hugepage_put_subpool(sbi->spool); ++ + kfree(sbi); + } + } +@@ -841,10 +851,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) + sb->s_fs_info = sbinfo; + sbinfo->hstate = config.hstate; + spin_lock_init(&sbinfo->stat_lock); +- sbinfo->max_blocks = config.nr_blocks; +- sbinfo->free_blocks = config.nr_blocks; + sbinfo->max_inodes = config.nr_inodes; + sbinfo->free_inodes = config.nr_inodes; ++ sbinfo->spool = NULL; ++ if (config.nr_blocks != -1) { ++ sbinfo->spool = hugepage_new_subpool(config.nr_blocks); ++ if (!sbinfo->spool) ++ goto out_free; ++ } + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = huge_page_size(config.hstate); + sb->s_blocksize_bits = huge_page_shift(config.hstate); +@@ -864,38 +878,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) + sb->s_root = root; + return 0; + out_free: ++ if (sbinfo->spool) ++ kfree(sbinfo->spool); + kfree(sbinfo); + return -ENOMEM; + } + +-int hugetlb_get_quota(struct address_space *mapping, long delta) +-{ +- int ret = 0; +- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); +- +- if (sbinfo->free_blocks > -1) { +- spin_lock(&sbinfo->stat_lock); +- if (sbinfo->free_blocks - delta >= 0) +- sbinfo->free_blocks -= delta; +- else +- ret = -ENOMEM; +- spin_unlock(&sbinfo->stat_lock); +- } +- +- return ret; +-} +- +-void hugetlb_put_quota(struct address_space *mapping, long delta) +-{ +- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); +- +- if (sbinfo->free_blocks > -1) { +- spin_lock(&sbinfo->stat_lock); +- sbinfo->free_blocks += delta; +- spin_unlock(&sbinfo->stat_lock); +- } +-} +- + static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) + { +diff --git a/fs/locks.c b/fs/locks.c +index 0d68f1f..6a64f15 100644 +--- a/fs/locks.c ++++ b/fs/locks.c +@@ -1465,7 
+1465,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) + case F_WRLCK: + return generic_add_lease(filp, arg, flp); + default: +- BUG(); ++ return -EINVAL; + } + } + EXPORT_SYMBOL(generic_setlease); +diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c +index 47d1c6f..b122af8 100644 +--- a/fs/nfs/idmap.c ++++ b/fs/nfs/idmap.c +@@ -318,12 +318,12 @@ struct idmap_hashent { + unsigned long ih_expires; + __u32 ih_id; + size_t ih_namelen; +- char ih_name[IDMAP_NAMESZ]; ++ const char *ih_name; + }; + + struct idmap_hashtable { + __u8 h_type; +- struct idmap_hashent h_entries[IDMAP_HASH_SZ]; ++ struct idmap_hashent *h_entries; + }; + + struct idmap { +@@ -378,6 +378,28 @@ nfs_idmap_new(struct nfs_client *clp) + return 0; + } + ++static void ++idmap_alloc_hashtable(struct idmap_hashtable *h) ++{ ++ if (h->h_entries != NULL) ++ return; ++ h->h_entries = kcalloc(IDMAP_HASH_SZ, ++ sizeof(*h->h_entries), ++ GFP_KERNEL); ++} ++ ++static void ++idmap_free_hashtable(struct idmap_hashtable *h) ++{ ++ int i; ++ ++ if (h->h_entries == NULL) ++ return; ++ for (i = 0; i < IDMAP_HASH_SZ; i++) ++ kfree(h->h_entries[i].ih_name); ++ kfree(h->h_entries); ++} ++ + void + nfs_idmap_delete(struct nfs_client *clp) + { +@@ -387,6 +409,8 @@ nfs_idmap_delete(struct nfs_client *clp) + return; + rpc_unlink(idmap->idmap_dentry); + clp->cl_idmap = NULL; ++ idmap_free_hashtable(&idmap->idmap_user_hash); ++ idmap_free_hashtable(&idmap->idmap_group_hash); + kfree(idmap); + } + +@@ -396,6 +420,8 @@ nfs_idmap_delete(struct nfs_client *clp) + static inline struct idmap_hashent * + idmap_name_hash(struct idmap_hashtable* h, const char *name, size_t len) + { ++ if (h->h_entries == NULL) ++ return NULL; + return &h->h_entries[fnvhash32(name, len) % IDMAP_HASH_SZ]; + } + +@@ -404,6 +430,8 @@ idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len) + { + struct idmap_hashent *he = idmap_name_hash(h, name, len); + ++ if (he == NULL) ++ return NULL; + if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0) + return NULL; + if (time_after(jiffies, he->ih_expires)) +@@ -414,6 +442,8 @@ idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len) + static inline struct idmap_hashent * + idmap_id_hash(struct idmap_hashtable* h, __u32 id) + { ++ if (h->h_entries == NULL) ++ return NULL; + return &h->h_entries[fnvhash32(&id, sizeof(id)) % IDMAP_HASH_SZ]; + } + +@@ -421,6 +451,9 @@ static struct idmap_hashent * + idmap_lookup_id(struct idmap_hashtable *h, __u32 id) + { + struct idmap_hashent *he = idmap_id_hash(h, id); ++ ++ if (he == NULL) ++ return NULL; + if (he->ih_id != id || he->ih_namelen == 0) + return NULL; + if (time_after(jiffies, he->ih_expires)) +@@ -436,12 +469,14 @@ idmap_lookup_id(struct idmap_hashtable *h, __u32 id) + static inline struct idmap_hashent * + idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len) + { ++ idmap_alloc_hashtable(h); + return idmap_name_hash(h, name, len); + } + + static inline struct idmap_hashent * + idmap_alloc_id(struct idmap_hashtable *h, __u32 id) + { ++ idmap_alloc_hashtable(h); + return idmap_id_hash(h, id); + } + +@@ -449,9 +484,14 @@ static void + idmap_update_entry(struct idmap_hashent *he, const char *name, + size_t namelen, __u32 id) + { ++ char *str = kmalloc(namelen + 1, GFP_KERNEL); ++ if (str == NULL) ++ return; ++ kfree(he->ih_name); + he->ih_id = id; +- memcpy(he->ih_name, name, namelen); +- he->ih_name[namelen] = '\0'; ++ memcpy(str, name, namelen); ++ str[namelen] = '\0'; ++ he->ih_name = str; + 
he->ih_namelen = namelen; + he->ih_expires = jiffies + nfs_idmap_cache_timeout; + } +diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c +index 66020ac..07354b7 100644 +--- a/fs/nfs/nfs4state.c ++++ b/fs/nfs/nfs4state.c +@@ -1186,8 +1186,9 @@ restart: + spin_lock(&state->state_lock); + list_for_each_entry(lock, &state->lock_states, ls_locks) { + if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) +- printk("%s: Lock reclaim failed!\n", +- __func__); ++ pr_warn_ratelimited("NFS: " ++ "%s: Lock reclaim " ++ "failed!\n", __func__); + } + spin_unlock(&state->state_lock); + nfs4_put_open_state(state); +diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c +index 55d0128..a03ee52 100644 +--- a/fs/nfs/objlayout/objio_osd.c ++++ b/fs/nfs/objlayout/objio_osd.c +@@ -433,7 +433,10 @@ int objio_read_pagelist(struct nfs_read_data *rdata) + objios->ios->done = _read_done; + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + rdata->args.offset, rdata->args.count); +- return ore_read(objios->ios); ++ ret = ore_read(objios->ios); ++ if (unlikely(ret)) ++ objio_free_result(&objios->oir); ++ return ret; + } + + /* +@@ -464,8 +467,16 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) + struct objio_state *objios = priv; + struct nfs_write_data *wdata = objios->oir.rpcdata; + pgoff_t index = offset / PAGE_SIZE; +- struct page *page = find_get_page(wdata->inode->i_mapping, index); ++ struct page *page; ++ loff_t i_size = i_size_read(wdata->inode); ++ ++ if (offset >= i_size) { ++ *uptodate = true; ++ dprintk("%s: g_zero_page index=0x%lx\n", __func__, index); ++ return ZERO_PAGE(0); ++ } + ++ page = find_get_page(wdata->inode->i_mapping, index); + if (!page) { + page = find_or_create_page(wdata->inode->i_mapping, + index, GFP_NOFS); +@@ -486,8 +497,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) + + static void __r4w_put_page(void *priv, struct page *page) + { +- dprintk("%s: index=0x%lx\n", __func__, page->index); +- page_cache_release(page); ++ dprintk("%s: index=0x%lx\n", __func__, ++ (page == ZERO_PAGE(0)) ? 
-1UL : page->index); ++ if (ZERO_PAGE(0) != page) ++ page_cache_release(page); + return; + } + +@@ -517,8 +530,10 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how) + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + wdata->args.offset, wdata->args.count); + ret = ore_write(objios->ios); +- if (unlikely(ret)) ++ if (unlikely(ret)) { ++ objio_free_result(&objios->oir); + return ret; ++ } + + if (objios->sync) + _write_done(objios->ios, objios); +diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c +index 07ee5b4..1c7d45e 100644 +--- a/fs/ocfs2/file.c ++++ b/fs/ocfs2/file.c +@@ -1950,7 +1950,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, + if (ret < 0) + mlog_errno(ret); + +- if (file->f_flags & O_SYNC) ++ if (file && (file->f_flags & O_SYNC)) + handle->h_sync = 1; + + ocfs2_commit_trans(osb, handle); +diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c +index fbb0b47..d5378d0 100644 +--- a/fs/ramfs/file-nommu.c ++++ b/fs/ramfs/file-nommu.c +@@ -110,6 +110,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) + + /* prevent the page from being discarded on memory pressure */ + SetPageDirty(page); ++ SetPageUptodate(page); + + unlock_page(page); + put_page(page); +diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c +index 6094c5a..b73ecd8 100644 +--- a/fs/ubifs/sb.c ++++ b/fs/ubifs/sb.c +@@ -715,8 +715,12 @@ static int fixup_free_space(struct ubifs_info *c) + lnum = ubifs_next_log_lnum(c, lnum); + } + +- /* Fixup the current log head */ +- err = fixup_leb(c, c->lhead_lnum, c->lhead_offs); ++ /* ++ * Fixup the log head which contains the only a CS node at the ++ * beginning. ++ */ ++ err = fixup_leb(c, c->lhead_lnum, ++ ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size)); + if (err) + goto out; + +diff --git a/include/linux/Kbuild b/include/linux/Kbuild +index bd21ecd..a3ce901 100644 +--- a/include/linux/Kbuild ++++ b/include/linux/Kbuild +@@ -268,6 +268,7 @@ header-y += netfilter_ipv4.h + header-y += netfilter_ipv6.h + header-y += netlink.h + header-y += netrom.h ++header-y += nfc.h + header-y += nfs.h + header-y += nfs2.h + header-y += nfs3.h +diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h +index fd0dc30..cc07d27 100644 +--- a/include/linux/hrtimer.h ++++ b/include/linux/hrtimer.h +@@ -165,6 +165,7 @@ enum hrtimer_base_type { + * @lock: lock protecting the base and associated clock bases + * and timers + * @active_bases: Bitfield to mark bases with active timers ++ * @clock_was_set: Indicates that clock was set from irq context. 
+ * @expires_next: absolute time of the next event which was scheduled + * via clock_set_next_event() + * @hres_active: State of high resolution mode +@@ -177,7 +178,8 @@ enum hrtimer_base_type { + */ + struct hrtimer_cpu_base { + raw_spinlock_t lock; +- unsigned long active_bases; ++ unsigned int active_bases; ++ unsigned int clock_was_set; + #ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires_next; + int hres_active; +@@ -286,6 +288,8 @@ extern void hrtimer_peek_ahead_timers(void); + # define MONOTONIC_RES_NSEC HIGH_RES_NSEC + # define KTIME_MONOTONIC_RES KTIME_HIGH_RES + ++extern void clock_was_set_delayed(void); ++ + #else + + # define MONOTONIC_RES_NSEC LOW_RES_NSEC +@@ -306,6 +310,9 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) + { + return 0; + } ++ ++static inline void clock_was_set_delayed(void) { } ++ + #endif + + extern void clock_was_set(void); +@@ -320,6 +327,7 @@ extern ktime_t ktime_get(void); + extern ktime_t ktime_get_real(void); + extern ktime_t ktime_get_boottime(void); + extern ktime_t ktime_get_monotonic_offset(void); ++extern ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot); + + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); + +diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h +index d9d6c86..c5ed2f1 100644 +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -14,6 +14,15 @@ struct user_struct; + #include + #include + ++struct hugepage_subpool { ++ spinlock_t lock; ++ long count; ++ long max_hpages, used_hpages; ++}; ++ ++struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); ++void hugepage_put_subpool(struct hugepage_subpool *spool); ++ + int PageHuge(struct page *page); + + void reset_vma_resv_huge_pages(struct vm_area_struct *vma); +@@ -138,12 +147,11 @@ struct hugetlbfs_config { + }; + + struct hugetlbfs_sb_info { +- long max_blocks; /* blocks allowed */ +- long free_blocks; /* blocks free */ + long max_inodes; /* inodes allowed */ + long free_inodes; /* inodes free */ + spinlock_t stat_lock; + struct hstate *hstate; ++ struct hugepage_subpool *spool; + }; + + +@@ -166,8 +174,6 @@ extern const struct file_operations hugetlbfs_file_operations; + extern const struct vm_operations_struct hugetlb_vm_ops; + struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, + struct user_struct **user, int creat_flags); +-int hugetlb_get_quota(struct address_space *mapping, long delta); +-void hugetlb_put_quota(struct address_space *mapping, long delta); + + static inline int is_file_hugepages(struct file *file) + { +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 188cb2f..905b1e1 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -652,7 +652,7 @@ typedef struct pglist_data { + range, including holes */ + int node_id; + wait_queue_head_t kswapd_wait; +- struct task_struct *kswapd; ++ struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ + int kswapd_max_order; + enum zone_type classzone_idx; + } pg_data_t; +diff --git a/include/linux/pci.h b/include/linux/pci.h +index c0cfa0d..7cda65b 100644 +--- a/include/linux/pci.h ++++ b/include/linux/pci.h +@@ -176,8 +176,6 @@ enum pci_dev_flags { + PCI_DEV_FLAGS_NO_D3 = (__force pci_dev_flags_t) 2, + /* Provide indication device is assigned by a Virtual Machine Manager */ + PCI_DEV_FLAGS_ASSIGNED = (__force pci_dev_flags_t) 4, +- /* Device causes system crash if in D3 during S3 sleep */ +- PCI_DEV_FLAGS_NO_D3_DURING_SLEEP = (__force pci_dev_flags_t) 8, + }; + + enum 
pci_irq_reroute_variant { +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 1c4f3e9..5afa2a3 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1892,6 +1892,14 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, + } + #endif + ++#ifdef CONFIG_NO_HZ ++void calc_load_enter_idle(void); ++void calc_load_exit_idle(void); ++#else ++static inline void calc_load_enter_idle(void) { } ++static inline void calc_load_exit_idle(void) { } ++#endif /* CONFIG_NO_HZ */ ++ + #ifndef CONFIG_CPUMASK_OFFSTACK + static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) + { +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index bdb4590..53dc7e7 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -213,11 +213,8 @@ enum { + /* device driver is going to provide hardware time stamp */ + SKBTX_IN_PROGRESS = 1 << 2, + +- /* ensure the originating sk reference is available on driver level */ +- SKBTX_DRV_NEEDS_SK_REF = 1 << 3, +- + /* device driver supports TX zero-copy buffers */ +- SKBTX_DEV_ZEROCOPY = 1 << 4, ++ SKBTX_DEV_ZEROCOPY = 1 << 3, + }; + + /* +diff --git a/include/linux/timex.h b/include/linux/timex.h +index aa60fe7..08e90fb 100644 +--- a/include/linux/timex.h ++++ b/include/linux/timex.h +@@ -266,7 +266,7 @@ static inline int ntp_synced(void) + /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ + extern u64 tick_length; + +-extern void second_overflow(void); ++extern int second_overflow(unsigned long secs); + extern void update_ntp_one_tick(void); + extern int do_adjtimex(struct timex *); + extern void hardpps(const struct timespec *, const struct timespec *); +diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h +index 6a308d4..1e100c6 100644 +--- a/include/scsi/libsas.h ++++ b/include/scsi/libsas.h +@@ -159,6 +159,8 @@ enum ata_command_set { + ATAPI_COMMAND_SET = 1, + }; + ++#define ATA_RESP_FIS_SIZE 24 ++ + struct sata_device { + enum ata_command_set command_set; + struct smp_resp rps_resp; /* report_phy_sata_resp */ +@@ -170,7 +172,7 @@ struct sata_device { + + struct ata_port *ap; + struct ata_host ata_host; +- struct ata_taskfile tf; ++ u8 fis[ATA_RESP_FIS_SIZE]; + u32 sstatus; + u32 serror; + u32 scontrol; +@@ -486,7 +488,7 @@ enum exec_status { + */ + struct ata_task_resp { + u16 frame_len; +- u8 ending_fis[24]; /* dev to host or data-in */ ++ u8 ending_fis[ATA_RESP_FIS_SIZE]; /* dev to host or data-in */ + u32 sstatus; + u32 serror; + u32 scontrol; +diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c +index ae34bf5..6db7a5e 100644 +--- a/kernel/hrtimer.c ++++ b/kernel/hrtimer.c +@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + return 0; + } + ++static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) ++{ ++ ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ++ ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ++ ++ return ktime_get_update_offsets(offs_real, offs_boot); ++} ++ + /* + * Retrigger next event is called after clock was set + * +@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + static void retrigger_next_event(void *arg) + { + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); +- struct timespec realtime_offset, xtim, wtm, sleep; + + if (!hrtimer_hres_active()) + return; + +- /* Optimized out for !HIGH_RES */ +- get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); +- 
set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); +- +- /* Adjust CLOCK_REALTIME offset */ + raw_spin_lock(&base->lock); +- base->clock_base[HRTIMER_BASE_REALTIME].offset = +- timespec_to_ktime(realtime_offset); +- base->clock_base[HRTIMER_BASE_BOOTTIME].offset = +- timespec_to_ktime(sleep); +- ++ hrtimer_update_base(base); + hrtimer_force_reprogram(base, 0); + raw_spin_unlock(&base->lock); + } +@@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void) + base->clock_base[i].resolution = KTIME_HIGH_RES; + + tick_setup_sched_timer(); +- + /* "Retrigger" the interrupt to get things going */ + retrigger_next_event(NULL); + local_irq_restore(flags); + return 1; + } + ++/* ++ * Called from timekeeping code to reprogramm the hrtimer interrupt ++ * device. If called from the timer interrupt context we defer it to ++ * softirq context. ++ */ ++void clock_was_set_delayed(void) ++{ ++ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); ++ ++ cpu_base->clock_was_set = 1; ++ __raise_softirq_irqoff(HRTIMER_SOFTIRQ); ++} ++ + #else + + static inline int hrtimer_hres_active(void) { return 0; } +@@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) + cpu_base->nr_events++; + dev->next_event.tv64 = KTIME_MAX; + +- entry_time = now = ktime_get(); ++ raw_spin_lock(&cpu_base->lock); ++ entry_time = now = hrtimer_update_base(cpu_base); + retry: + expires_next.tv64 = KTIME_MAX; +- +- raw_spin_lock(&cpu_base->lock); + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via +@@ -1330,8 +1339,12 @@ retry: + * We need to prevent that we loop forever in the hrtimer + * interrupt routine. We give it 3 attempts to avoid + * overreacting on some spurious event. ++ * ++ * Acquire base lock for updating the offsets and retrieving ++ * the current time. + */ +- now = ktime_get(); ++ raw_spin_lock(&cpu_base->lock); ++ now = hrtimer_update_base(cpu_base); + cpu_base->nr_retries++; + if (++retries < 3) + goto retry; +@@ -1343,6 +1356,7 @@ retry: + */ + cpu_base->nr_hangs++; + cpu_base->hang_detected = 1; ++ raw_spin_unlock(&cpu_base->lock); + delta = ktime_sub(now, entry_time); + if (delta.tv64 > cpu_base->max_hang_time.tv64) + cpu_base->max_hang_time = delta; +@@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void) + + static void run_hrtimer_softirq(struct softirq_action *h) + { ++ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); ++ ++ if (cpu_base->clock_was_set) { ++ cpu_base->clock_was_set = 0; ++ clock_was_set(); ++ } ++ + hrtimer_peek_ahead_timers(); + } + +diff --git a/kernel/power/swap.c b/kernel/power/swap.c +index b313086..64f8f97 100644 +--- a/kernel/power/swap.c ++++ b/kernel/power/swap.c +@@ -6,7 +6,7 @@ + * + * Copyright (C) 1998,2001-2005 Pavel Machek + * Copyright (C) 2006 Rafael J. Wysocki +- * Copyright (C) 2010 Bojan Smojver ++ * Copyright (C) 2010-2012 Bojan Smojver + * + * This file is released under the GPLv2. 
+ * +@@ -283,14 +283,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) + return -ENOSPC; + + if (bio_chain) { +- src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); ++ src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | ++ __GFP_NORETRY); + if (src) { + copy_page(src, buf); + } else { + ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ + if (ret) + return ret; +- src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); ++ src = (void *)__get_free_page(__GFP_WAIT | ++ __GFP_NOWARN | ++ __GFP_NORETRY); + if (src) { + copy_page(src, buf); + } else { +@@ -368,12 +371,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, + clear_page(handle->cur); + handle->cur_swap = offset; + handle->k = 0; +- } +- if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { +- error = hib_wait_on_bio_chain(bio_chain); +- if (error) +- goto out; +- handle->reqd_free_pages = reqd_free_pages(); ++ ++ if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { ++ error = hib_wait_on_bio_chain(bio_chain); ++ if (error) ++ goto out; ++ /* ++ * Recalculate the number of required free pages, to ++ * make sure we never take more than half. ++ */ ++ handle->reqd_free_pages = reqd_free_pages(); ++ } + } + out: + return error; +@@ -420,8 +428,9 @@ static int swap_writer_finish(struct swap_map_handle *handle, + /* Maximum number of threads for compression/decompression. */ + #define LZO_THREADS 3 + +-/* Maximum number of pages for read buffering. */ +-#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) ++/* Minimum/maximum number of pages for read buffering. */ ++#define LZO_MIN_RD_PAGES 1024 ++#define LZO_MAX_RD_PAGES 8192 + + + /** +@@ -632,12 +641,6 @@ static int save_image_lzo(struct swap_map_handle *handle, + } + + /* +- * Adjust number of free pages after all allocations have been done. +- * We don't want to run out of pages when writing. +- */ +- handle->reqd_free_pages = reqd_free_pages(); +- +- /* + * Start the CRC32 thread. + */ + init_waitqueue_head(&crc->go); +@@ -658,6 +661,12 @@ static int save_image_lzo(struct swap_map_handle *handle, + goto out_clean; + } + ++ /* ++ * Adjust the number of required free pages after all allocations have ++ * been done. We don't want to run out of pages when writing. ++ */ ++ handle->reqd_free_pages = reqd_free_pages(); ++ + printk(KERN_INFO + "PM: Using %u thread(s) for compression.\n" + "PM: Compressing and saving image data (%u pages) ... ", +@@ -1067,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle, + unsigned i, thr, run_threads, nr_threads; + unsigned ring = 0, pg = 0, ring_size = 0, + have = 0, want, need, asked = 0; +- unsigned long read_pages; ++ unsigned long read_pages = 0; + unsigned char **page = NULL; + struct dec_data *data = NULL; + struct crc_data *crc = NULL; +@@ -1079,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle, + nr_threads = num_online_cpus() - 1; + nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + +- page = vmalloc(sizeof(*page) * LZO_READ_PAGES); ++ page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + ret = -ENOMEM; +@@ -1144,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle, + } + + /* +- * Adjust number of pages for read buffering, in case we are short. ++ * Set the number of pages for read buffering. 
++ * This is complete guesswork, because we'll only know the real ++ * picture once prepare_image() is called, which is much later on ++ * during the image load phase. We'll assume the worst case and ++ * say that none of the image pages are from high memory. + */ +- read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; +- read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); ++ if (low_free_pages() > snapshot_get_image_size()) ++ read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; ++ read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); + + for (i = 0; i < read_pages; i++) { + page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? + __GFP_WAIT | __GFP_HIGH : +- __GFP_WAIT); ++ __GFP_WAIT | __GFP_NOWARN | ++ __GFP_NORETRY); ++ + if (!page[i]) { + if (i < LZO_CMP_PAGES) { + ring_size = i; +diff --git a/kernel/sched.c b/kernel/sched.c +index 576a27f..52ac69b 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -1885,7 +1885,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + + #endif + +-static void calc_load_account_idle(struct rq *this_rq); + static void update_sysctl(void); + static int get_update_sysctl_factor(void); + static void update_cpu_load(struct rq *this_rq); +@@ -3401,11 +3400,73 @@ unsigned long this_cpu_load(void) + } + + ++/* ++ * Global load-average calculations ++ * ++ * We take a distributed and async approach to calculating the global load-avg ++ * in order to minimize overhead. ++ * ++ * The global load average is an exponentially decaying average of nr_running + ++ * nr_uninterruptible. ++ * ++ * Once every LOAD_FREQ: ++ * ++ * nr_active = 0; ++ * for_each_possible_cpu(cpu) ++ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; ++ * ++ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) ++ * ++ * Due to a number of reasons the above turns in the mess below: ++ * ++ * - for_each_possible_cpu() is prohibitively expensive on machines with ++ * serious number of cpus, therefore we need to take a distributed approach ++ * to calculating nr_active. ++ * ++ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 ++ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } ++ * ++ * So assuming nr_active := 0 when we start out -- true per definition, we ++ * can simply take per-cpu deltas and fold those into a global accumulate ++ * to obtain the same result. See calc_load_fold_active(). ++ * ++ * Furthermore, in order to avoid synchronizing all per-cpu delta folding ++ * across the machine, we assume 10 ticks is sufficient time for every ++ * cpu to have completed this task. ++ * ++ * This places an upper-bound on the IRQ-off latency of the machine. Then ++ * again, being late doesn't loose the delta, just wrecks the sample. ++ * ++ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because ++ * this would add another cross-cpu cacheline miss and atomic operation ++ * to the wakeup path. Instead we increment on whatever cpu the task ran ++ * when it went into uninterruptible state and decrement on whatever cpu ++ * did the wakeup. This means that only the sum of nr_uninterruptible over ++ * all cpus yields the correct result. ++ * ++ * This covers the NO_HZ=n code, for extra head-aches, see the comment below. 
++ */ ++ + /* Variables and functions for calc_load */ + static atomic_long_t calc_load_tasks; + static unsigned long calc_load_update; + unsigned long avenrun[3]; +-EXPORT_SYMBOL(avenrun); ++EXPORT_SYMBOL(avenrun); /* should be removed */ ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. ++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} + + static long calc_load_fold_active(struct rq *this_rq) + { +@@ -3422,6 +3483,9 @@ static long calc_load_fold_active(struct rq *this_rq) + return delta; + } + ++/* ++ * a1 = a0 * e + a * (1 - e) ++ */ + static unsigned long + calc_load(unsigned long load, unsigned long exp, unsigned long active) + { +@@ -3433,30 +3497,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) + + #ifdef CONFIG_NO_HZ + /* +- * For NO_HZ we delay the active fold to the next LOAD_FREQ update. ++ * Handle NO_HZ for the global load-average. ++ * ++ * Since the above described distributed algorithm to compute the global ++ * load-average relies on per-cpu sampling from the tick, it is affected by ++ * NO_HZ. ++ * ++ * The basic idea is to fold the nr_active delta into a global idle-delta upon ++ * entering NO_HZ state such that we can include this as an 'extra' cpu delta ++ * when we read the global state. ++ * ++ * Obviously reality has to ruin such a delightfully simple scheme: ++ * ++ * - When we go NO_HZ idle during the window, we can negate our sample ++ * contribution, causing under-accounting. ++ * ++ * We avoid this by keeping two idle-delta counters and flipping them ++ * when the window starts, thus separating old and new NO_HZ load. ++ * ++ * The only trick is the slight shift in index flip for read vs write. ++ * ++ * 0s 5s 10s 15s ++ * +10 +10 +10 +10 ++ * |-|-----------|-|-----------|-|-----------|-| ++ * r:0 0 1 1 0 0 1 1 0 ++ * w:0 1 1 0 0 1 1 0 0 ++ * ++ * This ensures we'll fold the old idle contribution in this window while ++ * accumlating the new one. ++ * ++ * - When we wake up from NO_HZ idle during the window, we push up our ++ * contribution, since we effectively move our sample point to a known ++ * busy state. ++ * ++ * This is solved by pushing the window forward, and thus skipping the ++ * sample, for this cpu (effectively using the idle-delta for this cpu which ++ * was in effect at the time the window opened). This also solves the issue ++ * of having to deal with a cpu having been in NOHZ idle for multiple ++ * LOAD_FREQ intervals. + * + * When making the ILB scale, we should try to pull this in as well. + */ +-static atomic_long_t calc_load_tasks_idle; ++static atomic_long_t calc_load_idle[2]; ++static int calc_load_idx; + +-static void calc_load_account_idle(struct rq *this_rq) ++static inline int calc_load_write_idx(void) + { ++ int idx = calc_load_idx; ++ ++ /* ++ * See calc_global_nohz(), if we observe the new index, we also ++ * need to observe the new update time. ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the folding window started, make sure we start writing in the ++ * next idle-delta. 
++ */ ++ if (!time_before(jiffies, calc_load_update)) ++ idx++; ++ ++ return idx & 1; ++} ++ ++static inline int calc_load_read_idx(void) ++{ ++ return calc_load_idx & 1; ++} ++ ++void calc_load_enter_idle(void) ++{ ++ struct rq *this_rq = this_rq(); + long delta; + ++ /* ++ * We're going into NOHZ mode, if there's any pending delta, fold it ++ * into the pending idle delta. ++ */ + delta = calc_load_fold_active(this_rq); +- if (delta) +- atomic_long_add(delta, &calc_load_tasks_idle); ++ if (delta) { ++ int idx = calc_load_write_idx(); ++ atomic_long_add(delta, &calc_load_idle[idx]); ++ } + } + +-static long calc_load_fold_idle(void) ++void calc_load_exit_idle(void) + { +- long delta = 0; ++ struct rq *this_rq = this_rq(); ++ ++ /* ++ * If we're still before the sample window, we're done. ++ */ ++ if (time_before(jiffies, this_rq->calc_load_update)) ++ return; + + /* +- * Its got a race, we don't care... ++ * We woke inside or after the sample window, this means we're already ++ * accounted through the nohz accounting, so skip the entire deal and ++ * sync up for the next window. + */ +- if (atomic_long_read(&calc_load_tasks_idle)) +- delta = atomic_long_xchg(&calc_load_tasks_idle, 0); ++ this_rq->calc_load_update = calc_load_update; ++ if (time_before(jiffies, this_rq->calc_load_update + 10)) ++ this_rq->calc_load_update += LOAD_FREQ; ++} ++ ++static long calc_load_fold_idle(void) ++{ ++ int idx = calc_load_read_idx(); ++ long delta = 0; ++ ++ if (atomic_long_read(&calc_load_idle[idx])) ++ delta = atomic_long_xchg(&calc_load_idle[idx], 0); + + return delta; + } +@@ -3542,66 +3694,39 @@ static void calc_global_nohz(void) + { + long delta, active, n; + +- /* +- * If we crossed a calc_load_update boundary, make sure to fold +- * any pending idle changes, the respective CPUs might have +- * missed the tick driven calc_load_account_active() update +- * due to NO_HZ. +- */ +- delta = calc_load_fold_idle(); +- if (delta) +- atomic_long_add(delta, &calc_load_tasks); +- +- /* +- * It could be the one fold was all it took, we done! +- */ +- if (time_before(jiffies, calc_load_update + 10)) +- return; +- +- /* +- * Catch-up, fold however many we are behind still +- */ +- delta = jiffies - calc_load_update - 10; +- n = 1 + (delta / LOAD_FREQ); ++ if (!time_before(jiffies, calc_load_update + 10)) { ++ /* ++ * Catch-up, fold however many we are behind still ++ */ ++ delta = jiffies - calc_load_update - 10; ++ n = 1 + (delta / LOAD_FREQ); + +- active = atomic_long_read(&calc_load_tasks); +- active = active > 0 ? active * FIXED_1 : 0; ++ active = atomic_long_read(&calc_load_tasks); ++ active = active > 0 ? active * FIXED_1 : 0; + +- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); +- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); +- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); ++ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); ++ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); ++ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + +- calc_load_update += n * LOAD_FREQ; +-} +-#else +-static void calc_load_account_idle(struct rq *this_rq) +-{ +-} ++ calc_load_update += n * LOAD_FREQ; ++ } + +-static inline long calc_load_fold_idle(void) +-{ +- return 0; ++ /* ++ * Flip the idle index... ++ * ++ * Make sure we first write the new time then flip the index, so that ++ * calc_load_write_idx() will see the new time when it reads the new ++ * index, this avoids a double flip messing things up. 
++ */ ++ smp_wmb(); ++ calc_load_idx++; + } ++#else /* !CONFIG_NO_HZ */ + +-static void calc_global_nohz(void) +-{ +-} +-#endif ++static inline long calc_load_fold_idle(void) { return 0; } ++static inline void calc_global_nohz(void) { } + +-/** +- * get_avenrun - get the load average array +- * @loads: pointer to dest load array +- * @offset: offset to add +- * @shift: shift count to shift the result left +- * +- * These values are estimates at best, so no need for locking. +- */ +-void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +-{ +- loads[0] = (avenrun[0] + offset) << shift; +- loads[1] = (avenrun[1] + offset) << shift; +- loads[2] = (avenrun[2] + offset) << shift; +-} ++#endif /* CONFIG_NO_HZ */ + + /* + * calc_load - update the avenrun load estimates 10 ticks after the +@@ -3609,11 +3734,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) + */ + void calc_global_load(unsigned long ticks) + { +- long active; ++ long active, delta; + + if (time_before(jiffies, calc_load_update + 10)) + return; + ++ /* ++ * Fold the 'old' idle-delta to include all NO_HZ cpus. ++ */ ++ delta = calc_load_fold_idle(); ++ if (delta) ++ atomic_long_add(delta, &calc_load_tasks); ++ + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + +@@ -3624,12 +3756,7 @@ void calc_global_load(unsigned long ticks) + calc_load_update += LOAD_FREQ; + + /* +- * Account one period with whatever state we found before +- * folding in the nohz state and ageing the entire idle period. +- * +- * This avoids loosing a sample when we go idle between +- * calc_load_account_active() (10 ticks ago) and now and thus +- * under-accounting. ++ * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + */ + calc_global_nohz(); + } +@@ -3646,7 +3773,6 @@ static void calc_load_account_active(struct rq *this_rq) + return; + + delta = calc_load_fold_active(this_rq); +- delta += calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + +@@ -3654,6 +3780,10 @@ static void calc_load_account_active(struct rq *this_rq) + } + + /* ++ * End of global load-average stuff ++ */ ++ ++/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * +diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c +index 0a51882..be92bfe 100644 +--- a/kernel/sched_idletask.c ++++ b/kernel/sched_idletask.c +@@ -23,7 +23,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl + static struct task_struct *pick_next_task_idle(struct rq *rq) + { + schedstat_inc(rq, sched_goidle); +- calc_load_account_idle(rq); + return rq->idle; + } + +diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c +index 4b85a7a..f1eb182 100644 +--- a/kernel/time/ntp.c ++++ b/kernel/time/ntp.c +@@ -31,8 +31,6 @@ unsigned long tick_nsec; + u64 tick_length; + static u64 tick_length_base; + +-static struct hrtimer leap_timer; +- + #define MAX_TICKADJ 500LL /* usecs */ + #define MAX_TICKADJ_SCALED \ + (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) +@@ -350,60 +348,60 @@ void ntp_clear(void) + } + + /* +- * Leap second processing. If in leap-insert state at the end of the +- * day, the system clock is set back one second; if in leap-delete +- * state, the system clock is set ahead one second. 
++ * this routine handles the overflow of the microsecond field ++ * ++ * The tricky bits of code to handle the accurate clock support ++ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. ++ * They were originally developed for SUN and DEC kernels. ++ * All the kudos should go to Dave for this stuff. ++ * ++ * Also handles leap second processing, and returns leap offset + */ +-static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) ++int second_overflow(unsigned long secs) + { +- enum hrtimer_restart res = HRTIMER_NORESTART; +- +- write_seqlock(&xtime_lock); ++ int leap = 0; ++ s64 delta; + ++ /* ++ * Leap second processing. If in leap-insert state at the end of the ++ * day, the system clock is set back one second; if in leap-delete ++ * state, the system clock is set ahead one second. ++ */ + switch (time_state) { + case TIME_OK: ++ if (time_status & STA_INS) ++ time_state = TIME_INS; ++ else if (time_status & STA_DEL) ++ time_state = TIME_DEL; + break; + case TIME_INS: +- timekeeping_leap_insert(-1); +- time_state = TIME_OOP; +- printk(KERN_NOTICE +- "Clock: inserting leap second 23:59:60 UTC\n"); +- hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); +- res = HRTIMER_RESTART; ++ if (secs % 86400 == 0) { ++ leap = -1; ++ time_state = TIME_OOP; ++ time_tai++; ++ printk(KERN_NOTICE ++ "Clock: inserting leap second 23:59:60 UTC\n"); ++ } + break; + case TIME_DEL: +- timekeeping_leap_insert(1); +- time_tai--; +- time_state = TIME_WAIT; +- printk(KERN_NOTICE +- "Clock: deleting leap second 23:59:59 UTC\n"); ++ if ((secs + 1) % 86400 == 0) { ++ leap = 1; ++ time_tai--; ++ time_state = TIME_WAIT; ++ printk(KERN_NOTICE ++ "Clock: deleting leap second 23:59:59 UTC\n"); ++ } + break; + case TIME_OOP: +- time_tai++; + time_state = TIME_WAIT; +- /* fall through */ ++ break; ++ + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + break; + } + +- write_sequnlock(&xtime_lock); +- +- return res; +-} +- +-/* +- * this routine handles the overflow of the microsecond field +- * +- * The tricky bits of code to handle the accurate clock support +- * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. +- * They were originally developed for SUN and DEC kernels. +- * All the kudos should go to Dave for this stuff. 
+- */ +-void second_overflow(void) +-{ +- s64 delta; + + /* Bump the maxerror field */ + time_maxerror += MAXFREQ / NSEC_PER_USEC; +@@ -423,23 +421,25 @@ void second_overflow(void) + pps_dec_valid(); + + if (!time_adjust) +- return; ++ goto out; + + if (time_adjust > MAX_TICKADJ) { + time_adjust -= MAX_TICKADJ; + tick_length += MAX_TICKADJ_SCALED; +- return; ++ goto out; + } + + if (time_adjust < -MAX_TICKADJ) { + time_adjust += MAX_TICKADJ; + tick_length -= MAX_TICKADJ_SCALED; +- return; ++ goto out; + } + + tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) + << NTP_SCALE_SHIFT; + time_adjust = 0; ++out: ++ return leap; + } + + #ifdef CONFIG_GENERIC_CMOS_UPDATE +@@ -501,27 +501,6 @@ static void notify_cmos_timer(void) + static inline void notify_cmos_timer(void) { } + #endif + +-/* +- * Start the leap seconds timer: +- */ +-static inline void ntp_start_leap_timer(struct timespec *ts) +-{ +- long now = ts->tv_sec; +- +- if (time_status & STA_INS) { +- time_state = TIME_INS; +- now += 86400 - now % 86400; +- hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); +- +- return; +- } +- +- if (time_status & STA_DEL) { +- time_state = TIME_DEL; +- now += 86400 - (now + 1) % 86400; +- hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); +- } +-} + + /* + * Propagate a new txc->status value into the NTP state: +@@ -546,22 +525,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) + time_status &= STA_RONLY; + time_status |= txc->status & ~STA_RONLY; + +- switch (time_state) { +- case TIME_OK: +- ntp_start_leap_timer(ts); +- break; +- case TIME_INS: +- case TIME_DEL: +- time_state = TIME_OK; +- ntp_start_leap_timer(ts); +- case TIME_WAIT: +- if (!(time_status & (STA_INS | STA_DEL))) +- time_state = TIME_OK; +- break; +- case TIME_OOP: +- hrtimer_restart(&leap_timer); +- break; +- } + } + /* + * Called with the xtime lock held, so we can access and modify +@@ -643,9 +606,6 @@ int do_adjtimex(struct timex *txc) + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; +- +- if (txc->modes & ADJ_STATUS && time_state != TIME_OK) +- hrtimer_cancel(&leap_timer); + } + + if (txc->modes & ADJ_SETOFFSET) { +@@ -967,6 +927,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup); + void __init ntp_init(void) + { + ntp_clear(); +- hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); +- leap_timer.function = ntp_leap_second; + } +diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c +index c923640..9955ebd 100644 +--- a/kernel/time/tick-sched.c ++++ b/kernel/time/tick-sched.c +@@ -430,6 +430,7 @@ void tick_nohz_stop_sched_tick(int inidle) + */ + if (!ts->tick_stopped) { + select_nohz_load_balancer(1); ++ calc_load_enter_idle(); + + ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); + ts->tick_stopped = 1; +@@ -563,6 +564,7 @@ void tick_nohz_restart_sched_tick(void) + account_idle_ticks(ticks); + #endif + ++ calc_load_exit_idle(); + touch_softlockup_watchdog(); + /* + * Cancel the scheduled timer and restore the tick +diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c +index 2378413..03e67d4 100644 +--- a/kernel/time/timekeeping.c ++++ b/kernel/time/timekeeping.c +@@ -161,23 +161,43 @@ static struct timespec xtime __attribute__ ((aligned (16))); + static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); + static struct timespec total_sleep_time; + ++/* Offset clock monotonic -> clock realtime */ ++static ktime_t offs_real; ++ ++/* Offset clock monotonic -> clock 
boottime */ ++static ktime_t offs_boot; ++ + /* + * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. + */ + static struct timespec raw_time; + +-/* flag for if timekeeping is suspended */ +-int __read_mostly timekeeping_suspended; ++/* must hold write on xtime_lock */ ++static void update_rt_offset(void) ++{ ++ struct timespec tmp, *wtm = &wall_to_monotonic; + +-/* must hold xtime_lock */ +-void timekeeping_leap_insert(int leapsecond) ++ set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); ++ offs_real = timespec_to_ktime(tmp); ++} ++ ++/* must hold write on xtime_lock */ ++static void timekeeping_update(bool clearntp) + { +- xtime.tv_sec += leapsecond; +- wall_to_monotonic.tv_sec -= leapsecond; +- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, +- timekeeper.mult); ++ if (clearntp) { ++ timekeeper.ntp_error = 0; ++ ntp_clear(); ++ } ++ update_rt_offset(); ++ update_vsyscall(&xtime, &wall_to_monotonic, ++ timekeeper.clock, timekeeper.mult); + } + ++ ++ ++/* flag for if timekeeping is suspended */ ++int __read_mostly timekeeping_suspended; ++ + /** + * timekeeping_forward_now - update clock to the current time + * +@@ -375,11 +395,7 @@ int do_settimeofday(const struct timespec *tv) + + xtime = *tv; + +- timekeeper.ntp_error = 0; +- ntp_clear(); +- +- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, +- timekeeper.mult); ++ timekeeping_update(true); + + write_sequnlock_irqrestore(&xtime_lock, flags); + +@@ -412,11 +428,7 @@ int timekeeping_inject_offset(struct timespec *ts) + xtime = timespec_add(xtime, *ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); + +- timekeeper.ntp_error = 0; +- ntp_clear(); +- +- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, +- timekeeper.mult); ++ timekeeping_update(true); + + write_sequnlock_irqrestore(&xtime_lock, flags); + +@@ -591,6 +603,7 @@ void __init timekeeping_init(void) + } + set_normalized_timespec(&wall_to_monotonic, + -boot.tv_sec, -boot.tv_nsec); ++ update_rt_offset(); + total_sleep_time.tv_sec = 0; + total_sleep_time.tv_nsec = 0; + write_sequnlock_irqrestore(&xtime_lock, flags); +@@ -599,6 +612,12 @@ void __init timekeeping_init(void) + /* time in seconds when suspend began */ + static struct timespec timekeeping_suspend_time; + ++static void update_sleep_time(struct timespec t) ++{ ++ total_sleep_time = t; ++ offs_boot = timespec_to_ktime(t); ++} ++ + /** + * __timekeeping_inject_sleeptime - Internal function to add sleep interval + * @delta: pointer to a timespec delta value +@@ -616,7 +635,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) + + xtime = timespec_add(xtime, *delta); + wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); +- total_sleep_time = timespec_add(total_sleep_time, *delta); ++ update_sleep_time(timespec_add(total_sleep_time, *delta)); + } + + +@@ -645,10 +664,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) + + __timekeeping_inject_sleeptime(delta); + +- timekeeper.ntp_error = 0; +- ntp_clear(); +- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, +- timekeeper.mult); ++ timekeeping_update(true); + + write_sequnlock_irqrestore(&xtime_lock, flags); + +@@ -683,6 +699,7 @@ static void timekeeping_resume(void) + timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); + timekeeper.ntp_error = 0; + timekeeping_suspended = 0; ++ timekeeping_update(false); + write_sequnlock_irqrestore(&xtime_lock, flags); + + touch_softlockup_watchdog(); +@@ -942,9 +959,14 @@ static cycle_t 
logarithmic_accumulation(cycle_t offset, int shift) + + timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; + while (timekeeper.xtime_nsec >= nsecps) { ++ int leap; + timekeeper.xtime_nsec -= nsecps; + xtime.tv_sec++; +- second_overflow(); ++ leap = second_overflow(xtime.tv_sec); ++ xtime.tv_sec += leap; ++ wall_to_monotonic.tv_sec -= leap; ++ if (leap) ++ clock_was_set_delayed(); + } + + /* Accumulate raw time */ +@@ -1050,14 +1072,17 @@ static void update_wall_time(void) + * xtime.tv_nsec isn't larger then NSEC_PER_SEC + */ + if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { ++ int leap; + xtime.tv_nsec -= NSEC_PER_SEC; + xtime.tv_sec++; +- second_overflow(); ++ leap = second_overflow(xtime.tv_sec); ++ xtime.tv_sec += leap; ++ wall_to_monotonic.tv_sec -= leap; ++ if (leap) ++ clock_was_set_delayed(); + } + +- /* check to see if there is a new clocksource to use */ +- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, +- timekeeper.mult); ++ timekeeping_update(false); + } + + /** +@@ -1216,6 +1241,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, + } while (read_seqretry(&xtime_lock, seq)); + } + ++#ifdef CONFIG_HIGH_RES_TIMERS ++/** ++ * ktime_get_update_offsets - hrtimer helper ++ * @real: pointer to storage for monotonic -> realtime offset ++ * @_boot: pointer to storage for monotonic -> boottime offset ++ * ++ * Returns current monotonic time and updates the offsets ++ * Called from hrtimer_interupt() or retrigger_next_event() ++ */ ++ktime_t ktime_get_update_offsets(ktime_t *real, ktime_t *boot) ++{ ++ ktime_t now; ++ unsigned int seq; ++ u64 secs, nsecs; ++ ++ do { ++ seq = read_seqbegin(&xtime_lock); ++ ++ secs = xtime.tv_sec; ++ nsecs = xtime.tv_nsec; ++ nsecs += timekeeping_get_ns(); ++ /* If arch requires, add in gettimeoffset() */ ++ nsecs += arch_gettimeoffset(); ++ ++ *real = offs_real; ++ *boot = offs_boot; ++ } while (read_seqretry(&xtime_lock, seq)); ++ ++ now = ktime_add_ns(ktime_set(secs, 0), nsecs); ++ now = ktime_sub(now, *real); ++ return now; ++} ++#endif ++ + /** + * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format + */ +diff --git a/mm/compaction.c b/mm/compaction.c +index 8fb8a40..50f1c60 100644 +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -592,8 +592,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) + if (err) { + putback_lru_pages(&cc->migratepages); + cc->nr_migratepages = 0; ++ if (err == -ENOMEM) { ++ ret = COMPACT_PARTIAL; ++ goto out; ++ } + } +- + } + + out: +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 5f5c545..7c535b0 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size; + */ + static DEFINE_SPINLOCK(hugetlb_lock); + ++static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) ++{ ++ bool free = (spool->count == 0) && (spool->used_hpages == 0); ++ ++ spin_unlock(&spool->lock); ++ ++ /* If no pages are used, and no other handles to the subpool ++ * remain, free the subpool the subpool remain */ ++ if (free) ++ kfree(spool); ++} ++ ++struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) ++{ ++ struct hugepage_subpool *spool; ++ ++ spool = kmalloc(sizeof(*spool), GFP_KERNEL); ++ if (!spool) ++ return NULL; ++ ++ spin_lock_init(&spool->lock); ++ spool->count = 1; ++ spool->max_hpages = nr_blocks; ++ spool->used_hpages = 0; ++ ++ return spool; ++} ++ ++void hugepage_put_subpool(struct hugepage_subpool *spool) ++{ ++ spin_lock(&spool->lock); ++ BUG_ON(!spool->count); ++ 
spool->count--; ++ unlock_or_release_subpool(spool); ++} ++ ++static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, ++ long delta) ++{ ++ int ret = 0; ++ ++ if (!spool) ++ return 0; ++ ++ spin_lock(&spool->lock); ++ if ((spool->used_hpages + delta) <= spool->max_hpages) { ++ spool->used_hpages += delta; ++ } else { ++ ret = -ENOMEM; ++ } ++ spin_unlock(&spool->lock); ++ ++ return ret; ++} ++ ++static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, ++ long delta) ++{ ++ if (!spool) ++ return; ++ ++ spin_lock(&spool->lock); ++ spool->used_hpages -= delta; ++ /* If hugetlbfs_put_super couldn't free spool due to ++ * an outstanding quota reference, free it now. */ ++ unlock_or_release_subpool(spool); ++} ++ ++static inline struct hugepage_subpool *subpool_inode(struct inode *inode) ++{ ++ return HUGETLBFS_SB(inode->i_sb)->spool; ++} ++ ++static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) ++{ ++ return subpool_inode(vma->vm_file->f_dentry->d_inode); ++} ++ + /* + * Region tracking -- allows tracking of reservations and instantiated pages + * across the pages in a mapping. +@@ -533,9 +611,9 @@ static void free_huge_page(struct page *page) + */ + struct hstate *h = page_hstate(page); + int nid = page_to_nid(page); +- struct address_space *mapping; ++ struct hugepage_subpool *spool = ++ (struct hugepage_subpool *)page_private(page); + +- mapping = (struct address_space *) page_private(page); + set_page_private(page, 0); + page->mapping = NULL; + BUG_ON(page_count(page)); +@@ -551,8 +629,7 @@ static void free_huge_page(struct page *page) + enqueue_huge_page(h, page); + } + spin_unlock(&hugetlb_lock); +- if (mapping) +- hugetlb_put_quota(mapping, 1); ++ hugepage_subpool_put_pages(spool, 1); + } + + static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +@@ -966,11 +1043,12 @@ static void return_unused_surplus_pages(struct hstate *h, + /* + * Determine if the huge page at addr within the vma has an associated + * reservation. Where it does not we will need to logically increase +- * reservation and actually increase quota before an allocation can occur. +- * Where any new reservation would be required the reservation change is +- * prepared, but not committed. Once the page has been quota'd allocated +- * an instantiated the change should be committed via vma_commit_reservation. +- * No action is required on failure. ++ * reservation and actually increase subpool usage before an allocation ++ * can occur. Where any new reservation would be required the ++ * reservation change is prepared, but not committed. Once the page ++ * has been allocated from the subpool and instantiated the change should ++ * be committed via vma_commit_reservation. No action is required on ++ * failure. + */ + static long vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +@@ -1019,24 +1097,24 @@ static void vma_commit_reservation(struct hstate *h, + static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) + { ++ struct hugepage_subpool *spool = subpool_vma(vma); + struct hstate *h = hstate_vma(vma); + struct page *page; +- struct address_space *mapping = vma->vm_file->f_mapping; +- struct inode *inode = mapping->host; + long chg; + + /* +- * Processes that did not create the mapping will have no reserves and +- * will not have accounted against quota. 
Check that the quota can be +- * made before satisfying the allocation +- * MAP_NORESERVE mappings may also need pages and quota allocated +- * if no reserve mapping overlaps. ++ * Processes that did not create the mapping will have no ++ * reserves and will not have accounted against subpool ++ * limit. Check that the subpool limit can be made before ++ * satisfying the allocation MAP_NORESERVE mappings may also ++ * need pages and subpool limit allocated allocated if no reserve ++ * mapping overlaps. + */ + chg = vma_needs_reservation(h, vma, addr); + if (chg < 0) + return ERR_PTR(-VM_FAULT_OOM); + if (chg) +- if (hugetlb_get_quota(inode->i_mapping, chg)) ++ if (hugepage_subpool_get_pages(spool, chg)) + return ERR_PTR(-VM_FAULT_SIGBUS); + + spin_lock(&hugetlb_lock); +@@ -1046,12 +1124,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, + if (!page) { + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + if (!page) { +- hugetlb_put_quota(inode->i_mapping, chg); ++ hugepage_subpool_put_pages(spool, chg); + return ERR_PTR(-VM_FAULT_SIGBUS); + } + } + +- set_page_private(page, (unsigned long) mapping); ++ set_page_private(page, (unsigned long)spool); + + vma_commit_reservation(h, vma, addr); + +@@ -2081,6 +2159,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) + { + struct hstate *h = hstate_vma(vma); + struct resv_map *reservations = vma_resv_map(vma); ++ struct hugepage_subpool *spool = subpool_vma(vma); + unsigned long reserve; + unsigned long start; + unsigned long end; +@@ -2096,7 +2175,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) + + if (reserve) { + hugetlb_acct_memory(h, -reserve); +- hugetlb_put_quota(vma->vm_file->f_mapping, reserve); ++ hugepage_subpool_put_pages(spool, reserve); + } + } + } +@@ -2326,7 +2405,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + address = address & huge_page_mask(h); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + (vma->vm_pgoff >> PAGE_SHIFT); +- mapping = (struct address_space *)page_private(page); ++ mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + + /* + * Take the mapping lock for the duration of the table walk. As +@@ -2865,11 +2944,12 @@ int hugetlb_reserve_pages(struct inode *inode, + { + long ret, chg; + struct hstate *h = hstate_inode(inode); ++ struct hugepage_subpool *spool = subpool_inode(inode); + + /* + * Only apply hugepage reservation if asked. At fault time, an + * attempt will be made for VM_NORESERVE to allocate a page +- * and filesystem quota without using reserves ++ * without using reserves + */ + if (vm_flags & VM_NORESERVE) + return 0; +@@ -2898,19 +2978,19 @@ int hugetlb_reserve_pages(struct inode *inode, + goto out_err; + } + +- /* There must be enough filesystem quota for the mapping */ +- if (hugetlb_get_quota(inode->i_mapping, chg)) { ++ /* There must be enough pages in the subpool for the mapping */ ++ if (hugepage_subpool_get_pages(spool, chg)) { + ret = -ENOSPC; + goto out_err; + } + + /* + * Check enough hugepages are available for the reservation. 
+- * Hand back the quota if there are not ++ * Hand the pages back to the subpool if there are not + */ + ret = hugetlb_acct_memory(h, chg); + if (ret < 0) { +- hugetlb_put_quota(inode->i_mapping, chg); ++ hugepage_subpool_put_pages(spool, chg); + goto out_err; + } + +@@ -2938,12 +3018,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) + { + struct hstate *h = hstate_inode(inode); + long chg = region_truncate(&inode->i_mapping->private_list, offset); ++ struct hugepage_subpool *spool = subpool_inode(inode); + + spin_lock(&inode->i_lock); + inode->i_blocks -= (blocks_per_huge_page(h) * freed); + spin_unlock(&inode->i_lock); + +- hugetlb_put_quota(inode->i_mapping, (chg - freed)); ++ hugepage_subpool_put_pages(spool, (chg - freed)); + hugetlb_acct_memory(h, -(chg - freed)); + } + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index fbe2d2c..8342119 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2824,7 +2824,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) + * them before going back to sleep. + */ + set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); +- schedule(); ++ ++ if (!kthread_should_stop()) ++ schedule(); ++ + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); + } else { + if (remaining) +@@ -3090,14 +3093,17 @@ int kswapd_run(int nid) + } + + /* +- * Called by memory hotplug when all memory in a node is offlined. ++ * Called by memory hotplug when all memory in a node is offlined. Caller must ++ * hold lock_memory_hotplug(). + */ + void kswapd_stop(int nid) + { + struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + +- if (kswapd) ++ if (kswapd) { + kthread_stop(kswapd); ++ NODE_DATA(nid)->kswapd = NULL; ++ } + } + + static int __init kswapd_init(void) +diff --git a/net/can/raw.c b/net/can/raw.c +index cde1b4a..46cca3a 100644 +--- a/net/can/raw.c ++++ b/net/can/raw.c +@@ -681,9 +681,6 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock, + if (err < 0) + goto free_skb; + +- /* to be able to check the received tx sock reference in raw_rcv() */ +- skb_shinfo(skb)->tx_flags |= SKBTX_DRV_NEEDS_SK_REF; +- + skb->dev = dev; + skb->sk = sk; + +diff --git a/net/core/dev.c b/net/core/dev.c +index 1cbddc9..5738654 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -2079,25 +2079,6 @@ static int dev_gso_segment(struct sk_buff *skb, int features) + return 0; + } + +-/* +- * Try to orphan skb early, right before transmission by the device. +- * We cannot orphan skb if tx timestamp is requested or the sk-reference +- * is needed on driver level for other reasons, e.g. see net/can/raw.c +- */ +-static inline void skb_orphan_try(struct sk_buff *skb) +-{ +- struct sock *sk = skb->sk; +- +- if (sk && !skb_shinfo(skb)->tx_flags) { +- /* skb_tx_hash() wont be able to get sk. 
+- * We copy sk_hash into skb->rxhash +- */ +- if (!skb->rxhash) +- skb->rxhash = sk->sk_hash; +- skb_orphan(skb); +- } +-} +- + static bool can_checksum_protocol(unsigned long features, __be16 protocol) + { + return ((features & NETIF_F_GEN_CSUM) || +@@ -2182,8 +2163,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); + +- skb_orphan_try(skb); +- + features = netif_skb_features(skb); + + if (vlan_tx_tag_present(skb) && +@@ -2293,7 +2272,7 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else +- hash = (__force u16) skb->protocol ^ skb->rxhash; ++ hash = (__force u16) skb->protocol; + hash = jhash_1word(hash, hashrnd); + + return (u16) (((u64) hash * qcount) >> 32) + qoffset; +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 9726927..32e6ca2 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5836,6 +5836,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + goto discard; + + if (th->syn) { ++ if (th->fin) ++ goto discard; + if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) + return 1; + +diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c +index 274d150..cf98d62 100644 +--- a/net/iucv/af_iucv.c ++++ b/net/iucv/af_iucv.c +@@ -380,7 +380,6 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, + skb_trim(skb, skb->dev->mtu); + } + skb->protocol = ETH_P_AF_IUCV; +- skb_shinfo(skb)->tx_flags |= SKBTX_DRV_NEEDS_SK_REF; + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; +diff --git a/net/wireless/util.c b/net/wireless/util.c +index d38815d..74d5292 100644 +--- a/net/wireless/util.c ++++ b/net/wireless/util.c +@@ -813,7 +813,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, + ntype == NL80211_IFTYPE_P2P_CLIENT)) + return -EBUSY; + +- if (ntype != otype) { ++ if (ntype != otype && netif_running(dev)) { + err = cfg80211_can_change_interface(rdev, dev->ieee80211_ptr, + ntype); + if (err) +diff --git a/scripts/depmod.sh b/scripts/depmod.sh +index a272356..2ae4817 100755 +--- a/scripts/depmod.sh ++++ b/scripts/depmod.sh +@@ -9,12 +9,6 @@ fi + DEPMOD=$1 + KERNELRELEASE=$2 + +-if ! "$DEPMOD" -V 2>/dev/null | grep -q module-init-tools; then +- echo "Warning: you may need to install module-init-tools" >&2 +- echo "See http://www.codemonkey.org.uk/docs/post-halloween-2.6.txt" >&2 +- sleep 1 +-fi +- + if ! 
test -r System.map -a -x "$DEPMOD"; then + exit 0 + fi +diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c +index 9f614b4..272407c 100644 +--- a/virt/kvm/irq_comm.c ++++ b/virt/kvm/irq_comm.c +@@ -318,6 +318,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, + */ + hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link) + if (ei->type == KVM_IRQ_ROUTING_MSI || ++ ue->type == KVM_IRQ_ROUTING_MSI || + ue->u.irqchip.irqchip == ei->irqchip.irqchip) + return r; + diff --git a/3.2.34/bump/1024_linux-3.2.25.patch b/3.2.34/bump/1024_linux-3.2.25.patch new file mode 100644 index 0000000..e95c213 --- /dev/null +++ b/3.2.34/bump/1024_linux-3.2.25.patch @@ -0,0 +1,4503 @@ +diff --git a/Makefile b/Makefile +index 80bb4fd..e13e4e7 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 2 +-SUBLEVEL = 24 ++SUBLEVEL = 25 + EXTRAVERSION = + NAME = Saber-toothed Squirrel + +diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h +index 559da19..578e5a0 100644 +--- a/arch/powerpc/include/asm/reg.h ++++ b/arch/powerpc/include/asm/reg.h +@@ -1016,7 +1016,8 @@ + /* Macros for setting and retrieving special purpose registers */ + #ifndef __ASSEMBLY__ + #define mfmsr() ({unsigned long rval; \ +- asm volatile("mfmsr %0" : "=r" (rval)); rval;}) ++ asm volatile("mfmsr %0" : "=r" (rval) : \ ++ : "memory"); rval;}) + #ifdef CONFIG_PPC_BOOK3S_64 + #define __mtmsrd(v, l) asm volatile("mtmsrd %0," __stringify(l) \ + : : "r" (v) : "memory") +diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c +index bf99cfa..6324008 100644 +--- a/arch/powerpc/kernel/ftrace.c ++++ b/arch/powerpc/kernel/ftrace.c +@@ -245,9 +245,9 @@ __ftrace_make_nop(struct module *mod, + + /* + * On PPC32 the trampoline looks like: +- * 0x3d, 0x60, 0x00, 0x00 lis r11,sym@ha +- * 0x39, 0x6b, 0x00, 0x00 addi r11,r11,sym@l +- * 0x7d, 0x69, 0x03, 0xa6 mtctr r11 ++ * 0x3d, 0x80, 0x00, 0x00 lis r12,sym@ha ++ * 0x39, 0x8c, 0x00, 0x00 addi r12,r12,sym@l ++ * 0x7d, 0x89, 0x03, 0xa6 mtctr r12 + * 0x4e, 0x80, 0x04, 0x20 bctr + */ + +@@ -262,9 +262,9 @@ __ftrace_make_nop(struct module *mod, + pr_devel(" %08x %08x ", jmp[0], jmp[1]); + + /* verify that this is what we expect it to be */ +- if (((jmp[0] & 0xffff0000) != 0x3d600000) || +- ((jmp[1] & 0xffff0000) != 0x396b0000) || +- (jmp[2] != 0x7d6903a6) || ++ if (((jmp[0] & 0xffff0000) != 0x3d800000) || ++ ((jmp[1] & 0xffff0000) != 0x398c0000) || ++ (jmp[2] != 0x7d8903a6) || + (jmp[3] != 0x4e800420)) { + printk(KERN_ERR "Not a trampoline\n"); + return -EINVAL; +diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c +index 6e0073e..07c7bf4 100644 +--- a/arch/s390/kernel/processor.c ++++ b/arch/s390/kernel/processor.c +@@ -26,12 +26,14 @@ static DEFINE_PER_CPU(struct cpuid, cpu_id); + void __cpuinit cpu_init(void) + { + struct cpuid *id = &per_cpu(cpu_id, smp_processor_id()); ++ struct s390_idle_data *idle = &__get_cpu_var(s390_idle); + + get_cpu_id(id); + atomic_inc(&init_mm.mm_count); + current->active_mm = &init_mm; + BUG_ON(current->mm); + enter_lazy_tlb(&init_mm, current); ++ memset(idle, 0, sizeof(*idle)); + } + + /* +diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c +index 3ea8728..1df64a8 100644 +--- a/arch/s390/kernel/smp.c ++++ b/arch/s390/kernel/smp.c +@@ -1020,14 +1020,11 @@ static int __cpuinit smp_cpu_notify(struct notifier_block *self, + unsigned int cpu = (unsigned int)(long)hcpu; + struct cpu *c = &per_cpu(cpu_devices, cpu); + struct sys_device *s = &c->sysdev; +- struct s390_idle_data 
*idle; + int err = 0; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: +- idle = &per_cpu(s390_idle, cpu); +- memset(idle, 0, sizeof(struct s390_idle_data)); + err = sysfs_create_group(&s->kobj, &cpu_online_attr_group); + break; + case CPU_DEAD: +diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c +index 563a09d..29c95d7 100644 +--- a/arch/x86/kernel/microcode_core.c ++++ b/arch/x86/kernel/microcode_core.c +@@ -297,20 +297,31 @@ static ssize_t reload_store(struct sys_device *dev, + const char *buf, size_t size) + { + unsigned long val; +- int cpu = dev->id; +- int ret = 0; +- char *end; ++ int cpu; ++ ssize_t ret = 0, tmp_ret; + +- val = simple_strtoul(buf, &end, 0); +- if (end == buf) ++ /* allow reload only from the BSP */ ++ if (boot_cpu_data.cpu_index != dev->id) + return -EINVAL; + +- if (val == 1) { +- get_online_cpus(); +- if (cpu_online(cpu)) +- ret = reload_for_cpu(cpu); +- put_online_cpus(); ++ ret = kstrtoul(buf, 0, &val); ++ if (ret) ++ return ret; ++ ++ if (val != 1) ++ return size; ++ ++ get_online_cpus(); ++ for_each_online_cpu(cpu) { ++ tmp_ret = reload_for_cpu(cpu); ++ if (tmp_ret != 0) ++ pr_warn("Error reloading microcode on CPU %d\n", cpu); ++ ++ /* save retval of the first encountered reload error */ ++ if (!ret) ++ ret = tmp_ret; + } ++ put_online_cpus(); + + if (!ret) + ret = size; +diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c +index 6dd8955..0951b81 100644 +--- a/arch/x86/pci/fixup.c ++++ b/arch/x86/pci/fixup.c +@@ -521,3 +521,20 @@ static void sb600_disable_hpet_bar(struct pci_dev *dev) + } + } + DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar); ++ ++/* ++ * Twinhead H12Y needs us to block out a region otherwise we map devices ++ * there and any access kills the box. 
++ * ++ * See: https://bugzilla.kernel.org/show_bug.cgi?id=10231 ++ * ++ * Match off the LPC and svid/sdid (older kernels lose the bridge subvendor) ++ */ ++static void __devinit twinhead_reserve_killing_zone(struct pci_dev *dev) ++{ ++ if (dev->subsystem_vendor == 0x14FF && dev->subsystem_device == 0xA003) { ++ pr_info("Reserving memory on Twinhead H12Y\n"); ++ request_mem_region(0xFFB00000, 0x100000, "twinhead"); ++ } ++} ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone); +diff --git a/block/blk-core.c b/block/blk-core.c +index 15de223..49d9e91 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -607,7 +607,7 @@ EXPORT_SYMBOL(blk_init_allocated_queue); + + int blk_get_queue(struct request_queue *q) + { +- if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { ++ if (likely(!blk_queue_dead(q))) { + kobject_get(&q->kobj); + return 0; + } +@@ -754,7 +754,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, + const bool is_sync = rw_is_sync(rw_flags) != 0; + int may_queue; + +- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) ++ if (unlikely(blk_queue_dead(q))) + return NULL; + + may_queue = elv_may_queue(q, rw_flags); +@@ -874,7 +874,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, + struct io_context *ioc; + struct request_list *rl = &q->rq; + +- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) ++ if (unlikely(blk_queue_dead(q))) + return NULL; + + prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, +diff --git a/block/blk-exec.c b/block/blk-exec.c +index a1ebceb..6053285 100644 +--- a/block/blk-exec.c ++++ b/block/blk-exec.c +@@ -50,7 +50,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, + { + int where = at_head ? 
ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; + +- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { ++ if (unlikely(blk_queue_dead(q))) { + rq->errors = -ENXIO; + if (rq->end_io) + rq->end_io(rq, rq->errors); +diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c +index e7f9f65..f0b2ca8 100644 +--- a/block/blk-sysfs.c ++++ b/block/blk-sysfs.c +@@ -425,7 +425,7 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) + if (!entry->show) + return -EIO; + mutex_lock(&q->sysfs_lock); +- if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { ++ if (blk_queue_dead(q)) { + mutex_unlock(&q->sysfs_lock); + return -ENOENT; + } +@@ -447,7 +447,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, + + q = container_of(kobj, struct request_queue, kobj); + mutex_lock(&q->sysfs_lock); +- if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { ++ if (blk_queue_dead(q)) { + mutex_unlock(&q->sysfs_lock); + return -ENOENT; + } +diff --git a/block/blk-throttle.c b/block/blk-throttle.c +index 4553245..5eed6a7 100644 +--- a/block/blk-throttle.c ++++ b/block/blk-throttle.c +@@ -310,7 +310,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) + struct request_queue *q = td->queue; + + /* no throttling for dead queue */ +- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) ++ if (unlikely(blk_queue_dead(q))) + return NULL; + + rcu_read_lock(); +@@ -335,7 +335,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td) + spin_lock_irq(q->queue_lock); + + /* Make sure @q is still alive */ +- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { ++ if (unlikely(blk_queue_dead(q))) { + kfree(tg); + return NULL; + } +diff --git a/block/blk.h b/block/blk.h +index 3f6551b..e38691d 100644 +--- a/block/blk.h ++++ b/block/blk.h +@@ -85,7 +85,7 @@ static inline struct request *__elv_next_request(struct request_queue *q) + q->flush_queue_delayed = 1; + return NULL; + } +- if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) || ++ if (unlikely(blk_queue_dead(q)) || + !q->elevator->ops->elevator_dispatch_fn(q, 0)) + return NULL; + } +diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c +index 6512b20..d1fcbc0 100644 +--- a/drivers/acpi/ac.c ++++ b/drivers/acpi/ac.c +@@ -292,7 +292,9 @@ static int acpi_ac_add(struct acpi_device *device) + ac->charger.properties = ac_props; + ac->charger.num_properties = ARRAY_SIZE(ac_props); + ac->charger.get_property = get_ac_property; +- power_supply_register(&ac->device->dev, &ac->charger); ++ result = power_supply_register(&ac->device->dev, &ac->charger); ++ if (result) ++ goto end; + + printk(KERN_INFO PREFIX "%s [%s] (%s)\n", + acpi_device_name(device), acpi_device_bid(device), +diff --git a/drivers/gpu/drm/nouveau/nva3_copy.fuc b/drivers/gpu/drm/nouveau/nva3_copy.fuc +index eaf35f8..d894731 100644 +--- a/drivers/gpu/drm/nouveau/nva3_copy.fuc ++++ b/drivers/gpu/drm/nouveau/nva3_copy.fuc +@@ -118,9 +118,9 @@ dispatch_dma: + // mthd 0x030c-0x0340, various stuff + .b16 0xc3 14 + .b32 ctx_src_address_high ~0x000000ff +-.b32 ctx_src_address_low ~0xfffffff0 ++.b32 ctx_src_address_low ~0xffffffff + .b32 ctx_dst_address_high ~0x000000ff +-.b32 ctx_dst_address_low ~0xfffffff0 ++.b32 ctx_dst_address_low ~0xffffffff + .b32 ctx_src_pitch ~0x0007ffff + .b32 ctx_dst_pitch ~0x0007ffff + .b32 ctx_xcnt ~0x0000ffff +diff --git a/drivers/gpu/drm/nouveau/nva3_copy.fuc.h b/drivers/gpu/drm/nouveau/nva3_copy.fuc.h +index 2731de2..e2a0e88 100644 +--- a/drivers/gpu/drm/nouveau/nva3_copy.fuc.h ++++ b/drivers/gpu/drm/nouveau/nva3_copy.fuc.h +@@ -1,37 
+1,72 @@ +-uint32_t nva3_pcopy_data[] = { ++u32 nva3_pcopy_data[] = { ++/* 0x0000: ctx_object */ + 0x00000000, ++/* 0x0004: ctx_dma */ ++/* 0x0004: ctx_dma_query */ + 0x00000000, ++/* 0x0008: ctx_dma_src */ + 0x00000000, ++/* 0x000c: ctx_dma_dst */ + 0x00000000, ++/* 0x0010: ctx_query_address_high */ + 0x00000000, ++/* 0x0014: ctx_query_address_low */ + 0x00000000, ++/* 0x0018: ctx_query_counter */ + 0x00000000, ++/* 0x001c: ctx_src_address_high */ + 0x00000000, ++/* 0x0020: ctx_src_address_low */ + 0x00000000, ++/* 0x0024: ctx_src_pitch */ + 0x00000000, ++/* 0x0028: ctx_src_tile_mode */ + 0x00000000, ++/* 0x002c: ctx_src_xsize */ + 0x00000000, ++/* 0x0030: ctx_src_ysize */ + 0x00000000, ++/* 0x0034: ctx_src_zsize */ + 0x00000000, ++/* 0x0038: ctx_src_zoff */ + 0x00000000, ++/* 0x003c: ctx_src_xoff */ + 0x00000000, ++/* 0x0040: ctx_src_yoff */ + 0x00000000, ++/* 0x0044: ctx_src_cpp */ + 0x00000000, ++/* 0x0048: ctx_dst_address_high */ + 0x00000000, ++/* 0x004c: ctx_dst_address_low */ + 0x00000000, ++/* 0x0050: ctx_dst_pitch */ + 0x00000000, ++/* 0x0054: ctx_dst_tile_mode */ + 0x00000000, ++/* 0x0058: ctx_dst_xsize */ + 0x00000000, ++/* 0x005c: ctx_dst_ysize */ + 0x00000000, ++/* 0x0060: ctx_dst_zsize */ + 0x00000000, ++/* 0x0064: ctx_dst_zoff */ + 0x00000000, ++/* 0x0068: ctx_dst_xoff */ + 0x00000000, ++/* 0x006c: ctx_dst_yoff */ + 0x00000000, ++/* 0x0070: ctx_dst_cpp */ + 0x00000000, ++/* 0x0074: ctx_format */ + 0x00000000, ++/* 0x0078: ctx_swz_const0 */ + 0x00000000, ++/* 0x007c: ctx_swz_const1 */ + 0x00000000, ++/* 0x0080: ctx_xcnt */ + 0x00000000, ++/* 0x0084: ctx_ycnt */ + 0x00000000, + 0x00000000, + 0x00000000, +@@ -63,6 +98,7 @@ uint32_t nva3_pcopy_data[] = { + 0x00000000, + 0x00000000, + 0x00000000, ++/* 0x0100: dispatch_table */ + 0x00010000, + 0x00000000, + 0x00000000, +@@ -73,6 +109,7 @@ uint32_t nva3_pcopy_data[] = { + 0x00010162, + 0x00000000, + 0x00030060, ++/* 0x0128: dispatch_dma */ + 0x00010170, + 0x00000000, + 0x00010170, +@@ -118,11 +155,11 @@ uint32_t nva3_pcopy_data[] = { + 0x0000001c, + 0xffffff00, + 0x00000020, +- 0x0000000f, ++ 0x00000000, + 0x00000048, + 0xffffff00, + 0x0000004c, +- 0x0000000f, ++ 0x00000000, + 0x00000024, + 0xfff80000, + 0x00000050, +@@ -146,7 +183,8 @@ uint32_t nva3_pcopy_data[] = { + 0x00000800, + }; + +-uint32_t nva3_pcopy_code[] = { ++u32 nva3_pcopy_code[] = { ++/* 0x0000: main */ + 0x04fe04bd, + 0x3517f000, + 0xf10010fe, +@@ -158,23 +196,31 @@ uint32_t nva3_pcopy_code[] = { + 0x17f11031, + 0x27f01200, + 0x0012d003, ++/* 0x002f: spin */ + 0xf40031f4, + 0x0ef40028, ++/* 0x0035: ih */ + 0x8001cffd, + 0xf40812c4, + 0x21f4060b, ++/* 0x0041: ih_no_chsw */ + 0x0412c472, + 0xf4060bf4, ++/* 0x004a: ih_no_cmd */ + 0x11c4c321, + 0x4001d00c, ++/* 0x0052: swctx */ + 0x47f101f8, + 0x4bfe7700, + 0x0007fe00, + 0xf00204b9, + 0x01f40643, + 0x0604fa09, ++/* 0x006b: swctx_load */ + 0xfa060ef4, ++/* 0x006e: swctx_done */ + 0x03f80504, ++/* 0x0072: chsw */ + 0x27f100f8, + 0x23cf1400, + 0x1e3fc800, +@@ -183,18 +229,22 @@ uint32_t nva3_pcopy_code[] = { + 0x1e3af052, + 0xf00023d0, + 0x24d00147, ++/* 0x0093: chsw_no_unload */ + 0xcf00f880, + 0x3dc84023, + 0x220bf41e, + 0xf40131f4, + 0x57f05221, + 0x0367f004, ++/* 0x00a8: chsw_load_ctx_dma */ + 0xa07856bc, + 0xb6018068, + 0x87d00884, + 0x0162b600, ++/* 0x00bb: chsw_finish_load */ + 0xf0f018f4, + 0x23d00237, ++/* 0x00c3: dispatch */ + 0xf100f880, + 0xcf190037, + 0x33cf4032, +@@ -202,6 +252,7 @@ uint32_t nva3_pcopy_code[] = { + 0x1024b607, + 0x010057f1, + 0x74bd64bd, ++/* 0x00dc: dispatch_loop */ + 0x58005658, + 
0x50b60157, + 0x0446b804, +@@ -211,6 +262,7 @@ uint32_t nva3_pcopy_code[] = { + 0xb60276bb, + 0x57bb0374, + 0xdf0ef400, ++/* 0x0100: dispatch_valid_mthd */ + 0xb60246bb, + 0x45bb0344, + 0x01459800, +@@ -220,31 +272,41 @@ uint32_t nva3_pcopy_code[] = { + 0xb0014658, + 0x1bf40064, + 0x00538009, ++/* 0x0127: dispatch_cmd */ + 0xf4300ef4, + 0x55f90132, + 0xf40c01f4, ++/* 0x0132: dispatch_invalid_bitfield */ + 0x25f0250e, ++/* 0x0135: dispatch_illegal_mthd */ + 0x0125f002, ++/* 0x0138: dispatch_error */ + 0x100047f1, + 0xd00042d0, + 0x27f04043, + 0x0002d040, ++/* 0x0148: hostirq_wait */ + 0xf08002cf, + 0x24b04024, + 0xf71bf400, ++/* 0x0154: dispatch_done */ + 0x1d0027f1, + 0xd00137f0, + 0x00f80023, ++/* 0x0160: cmd_nop */ ++/* 0x0162: cmd_pm_trigger */ + 0x27f100f8, + 0x34bd2200, + 0xd00233f0, + 0x00f80023, ++/* 0x0170: cmd_dma */ + 0x012842b7, + 0xf00145b6, + 0x43801e39, + 0x0040b701, + 0x0644b606, + 0xf80043d0, ++/* 0x0189: cmd_exec_set_format */ + 0xf030f400, + 0xb00001b0, + 0x01b00101, +@@ -256,20 +318,26 @@ uint32_t nva3_pcopy_code[] = { + 0x70b63847, + 0x0232f401, + 0x94bd84bd, ++/* 0x01b4: ncomp_loop */ + 0xb60f4ac4, + 0xb4bd0445, ++/* 0x01bc: bpc_loop */ + 0xf404a430, + 0xa5ff0f18, + 0x00cbbbc0, + 0xf40231f4, ++/* 0x01ce: cmp_c0 */ + 0x1bf4220e, + 0x10c7f00c, + 0xf400cbbb, ++/* 0x01da: cmp_c1 */ + 0xa430160e, + 0x0c18f406, + 0xbb14c7f0, + 0x0ef400cb, ++/* 0x01e9: cmp_zero */ + 0x80c7f107, ++/* 0x01ed: bpc_next */ + 0x01c83800, + 0xb60180b6, + 0xb5b801b0, +@@ -280,6 +348,7 @@ uint32_t nva3_pcopy_code[] = { + 0x98110680, + 0x68fd2008, + 0x0502f400, ++/* 0x0216: dst_xcnt */ + 0x75fd64bd, + 0x1c078000, + 0xf10078fd, +@@ -304,6 +373,7 @@ uint32_t nva3_pcopy_code[] = { + 0x980056d0, + 0x56d01f06, + 0x1030f440, ++/* 0x0276: cmd_exec_set_surface_tiled */ + 0x579800f8, + 0x6879c70a, + 0xb66478c7, +@@ -311,9 +381,11 @@ uint32_t nva3_pcopy_code[] = { + 0x0e76b060, + 0xf0091bf4, + 0x0ef40477, ++/* 0x0291: xtile64 */ + 0x027cf00f, + 0xfd1170b6, + 0x77f00947, ++/* 0x029d: xtileok */ + 0x0f5a9806, + 0xfd115b98, + 0xb7f000ab, +@@ -371,6 +443,7 @@ uint32_t nva3_pcopy_code[] = { + 0x67d00600, + 0x0060b700, + 0x0068d004, ++/* 0x0382: cmd_exec_set_surface_linear */ + 0x6cf000f8, + 0x0260b702, + 0x0864b602, +@@ -381,13 +454,16 @@ uint32_t nva3_pcopy_code[] = { + 0xb70067d0, + 0x98040060, + 0x67d00957, ++/* 0x03ab: cmd_exec_wait */ + 0xf900f800, + 0xf110f900, + 0xb6080007, ++/* 0x03b6: loop */ + 0x01cf0604, + 0x0114f000, + 0xfcfa1bf4, + 0xf800fc10, ++/* 0x03c5: cmd_exec_query */ + 0x0d34c800, + 0xf5701bf4, + 0xf103ab21, +@@ -417,6 +493,7 @@ uint32_t nva3_pcopy_code[] = { + 0x47f10153, + 0x44b60800, + 0x0045d006, ++/* 0x0438: query_counter */ + 0x03ab21f5, + 0x080c47f1, + 0x980644b6, +@@ -439,11 +516,13 @@ uint32_t nva3_pcopy_code[] = { + 0x47f10153, + 0x44b60800, + 0x0045d006, ++/* 0x0492: cmd_exec */ + 0x21f500f8, + 0x3fc803ab, + 0x0e0bf400, + 0x018921f5, + 0x020047f1, ++/* 0x04a7: cmd_exec_no_format */ + 0xf11e0ef4, + 0xb6081067, + 0x77f00664, +@@ -451,19 +530,24 @@ uint32_t nva3_pcopy_code[] = { + 0x981c0780, + 0x67d02007, + 0x4067d000, ++/* 0x04c2: cmd_exec_init_src_surface */ + 0x32f444bd, + 0xc854bd02, + 0x0bf4043f, + 0x8221f50a, + 0x0a0ef403, ++/* 0x04d4: src_tiled */ + 0x027621f5, ++/* 0x04db: cmd_exec_init_dst_surface */ + 0xf40749f0, + 0x57f00231, + 0x083fc82c, + 0xf50a0bf4, + 0xf4038221, ++/* 0x04ee: dst_tiled */ + 0x21f50a0e, + 0x49f00276, ++/* 0x04f5: cmd_exec_kick */ + 0x0057f108, + 0x0654b608, + 0xd0210698, +@@ -473,6 +557,8 @@ uint32_t nva3_pcopy_code[] = { + 0xc80054d0, + 0x0bf40c3f, + 
0xc521f507, ++/* 0x0519: cmd_exec_done */ ++/* 0x051b: cmd_wrcache_flush */ + 0xf100f803, + 0xbd220027, + 0x0133f034, +diff --git a/drivers/gpu/drm/nouveau/nvc0_copy.fuc.h b/drivers/gpu/drm/nouveau/nvc0_copy.fuc.h +index 4199038..9e87036 100644 +--- a/drivers/gpu/drm/nouveau/nvc0_copy.fuc.h ++++ b/drivers/gpu/drm/nouveau/nvc0_copy.fuc.h +@@ -1,34 +1,65 @@ +-uint32_t nvc0_pcopy_data[] = { ++u32 nvc0_pcopy_data[] = { ++/* 0x0000: ctx_object */ + 0x00000000, ++/* 0x0004: ctx_query_address_high */ + 0x00000000, ++/* 0x0008: ctx_query_address_low */ + 0x00000000, ++/* 0x000c: ctx_query_counter */ + 0x00000000, ++/* 0x0010: ctx_src_address_high */ + 0x00000000, ++/* 0x0014: ctx_src_address_low */ + 0x00000000, ++/* 0x0018: ctx_src_pitch */ + 0x00000000, ++/* 0x001c: ctx_src_tile_mode */ + 0x00000000, ++/* 0x0020: ctx_src_xsize */ + 0x00000000, ++/* 0x0024: ctx_src_ysize */ + 0x00000000, ++/* 0x0028: ctx_src_zsize */ + 0x00000000, ++/* 0x002c: ctx_src_zoff */ + 0x00000000, ++/* 0x0030: ctx_src_xoff */ + 0x00000000, ++/* 0x0034: ctx_src_yoff */ + 0x00000000, ++/* 0x0038: ctx_src_cpp */ + 0x00000000, ++/* 0x003c: ctx_dst_address_high */ + 0x00000000, ++/* 0x0040: ctx_dst_address_low */ + 0x00000000, ++/* 0x0044: ctx_dst_pitch */ + 0x00000000, ++/* 0x0048: ctx_dst_tile_mode */ + 0x00000000, ++/* 0x004c: ctx_dst_xsize */ + 0x00000000, ++/* 0x0050: ctx_dst_ysize */ + 0x00000000, ++/* 0x0054: ctx_dst_zsize */ + 0x00000000, ++/* 0x0058: ctx_dst_zoff */ + 0x00000000, ++/* 0x005c: ctx_dst_xoff */ + 0x00000000, ++/* 0x0060: ctx_dst_yoff */ + 0x00000000, ++/* 0x0064: ctx_dst_cpp */ + 0x00000000, ++/* 0x0068: ctx_format */ + 0x00000000, ++/* 0x006c: ctx_swz_const0 */ + 0x00000000, ++/* 0x0070: ctx_swz_const1 */ + 0x00000000, ++/* 0x0074: ctx_xcnt */ + 0x00000000, ++/* 0x0078: ctx_ycnt */ + 0x00000000, + 0x00000000, + 0x00000000, +@@ -63,6 +94,7 @@ uint32_t nvc0_pcopy_data[] = { + 0x00000000, + 0x00000000, + 0x00000000, ++/* 0x0100: dispatch_table */ + 0x00010000, + 0x00000000, + 0x00000000, +@@ -111,11 +143,11 @@ uint32_t nvc0_pcopy_data[] = { + 0x00000010, + 0xffffff00, + 0x00000014, +- 0x0000000f, ++ 0x00000000, + 0x0000003c, + 0xffffff00, + 0x00000040, +- 0x0000000f, ++ 0x00000000, + 0x00000018, + 0xfff80000, + 0x00000044, +@@ -139,7 +171,8 @@ uint32_t nvc0_pcopy_data[] = { + 0x00000800, + }; + +-uint32_t nvc0_pcopy_code[] = { ++u32 nvc0_pcopy_code[] = { ++/* 0x0000: main */ + 0x04fe04bd, + 0x3517f000, + 0xf10010fe, +@@ -151,15 +184,20 @@ uint32_t nvc0_pcopy_code[] = { + 0x17f11031, + 0x27f01200, + 0x0012d003, ++/* 0x002f: spin */ + 0xf40031f4, + 0x0ef40028, ++/* 0x0035: ih */ + 0x8001cffd, + 0xf40812c4, + 0x21f4060b, ++/* 0x0041: ih_no_chsw */ + 0x0412c4ca, + 0xf5070bf4, ++/* 0x004b: ih_no_cmd */ + 0xc4010221, + 0x01d00c11, ++/* 0x0053: swctx */ + 0xf101f840, + 0xfe770047, + 0x47f1004b, +@@ -188,8 +226,11 @@ uint32_t nvc0_pcopy_code[] = { + 0xf00204b9, + 0x01f40643, + 0x0604fa09, ++/* 0x00c3: swctx_load */ + 0xfa060ef4, ++/* 0x00c6: swctx_done */ + 0x03f80504, ++/* 0x00ca: chsw */ + 0x27f100f8, + 0x23cf1400, + 0x1e3fc800, +@@ -198,18 +239,22 @@ uint32_t nvc0_pcopy_code[] = { + 0x1e3af053, + 0xf00023d0, + 0x24d00147, ++/* 0x00eb: chsw_no_unload */ + 0xcf00f880, + 0x3dc84023, + 0x090bf41e, + 0xf40131f4, ++/* 0x00fa: chsw_finish_load */ + 0x37f05321, + 0x8023d002, ++/* 0x0102: dispatch */ + 0x37f100f8, + 0x32cf1900, + 0x0033cf40, + 0x07ff24e4, + 0xf11024b6, + 0xbd010057, ++/* 0x011b: dispatch_loop */ + 0x5874bd64, + 0x57580056, + 0x0450b601, +@@ -219,6 +264,7 @@ uint32_t nvc0_pcopy_code[] = { + 0xbb0f08f4, 
+ 0x74b60276, + 0x0057bb03, ++/* 0x013f: dispatch_valid_mthd */ + 0xbbdf0ef4, + 0x44b60246, + 0x0045bb03, +@@ -229,24 +275,33 @@ uint32_t nvc0_pcopy_code[] = { + 0x64b00146, + 0x091bf400, + 0xf4005380, ++/* 0x0166: dispatch_cmd */ + 0x32f4300e, + 0xf455f901, + 0x0ef40c01, ++/* 0x0171: dispatch_invalid_bitfield */ + 0x0225f025, ++/* 0x0174: dispatch_illegal_mthd */ ++/* 0x0177: dispatch_error */ + 0xf10125f0, + 0xd0100047, + 0x43d00042, + 0x4027f040, ++/* 0x0187: hostirq_wait */ + 0xcf0002d0, + 0x24f08002, + 0x0024b040, ++/* 0x0193: dispatch_done */ + 0xf1f71bf4, + 0xf01d0027, + 0x23d00137, ++/* 0x019f: cmd_nop */ + 0xf800f800, ++/* 0x01a1: cmd_pm_trigger */ + 0x0027f100, + 0xf034bd22, + 0x23d00233, ++/* 0x01af: cmd_exec_set_format */ + 0xf400f800, + 0x01b0f030, + 0x0101b000, +@@ -258,20 +313,26 @@ uint32_t nvc0_pcopy_code[] = { + 0x3847c701, + 0xf40170b6, + 0x84bd0232, ++/* 0x01da: ncomp_loop */ + 0x4ac494bd, + 0x0445b60f, ++/* 0x01e2: bpc_loop */ + 0xa430b4bd, + 0x0f18f404, + 0xbbc0a5ff, + 0x31f400cb, + 0x220ef402, ++/* 0x01f4: cmp_c0 */ + 0xf00c1bf4, + 0xcbbb10c7, + 0x160ef400, ++/* 0x0200: cmp_c1 */ + 0xf406a430, + 0xc7f00c18, + 0x00cbbb14, ++/* 0x020f: cmp_zero */ + 0xf1070ef4, ++/* 0x0213: bpc_next */ + 0x380080c7, + 0x80b601c8, + 0x01b0b601, +@@ -283,6 +344,7 @@ uint32_t nvc0_pcopy_code[] = { + 0x1d08980e, + 0xf40068fd, + 0x64bd0502, ++/* 0x023c: dst_xcnt */ + 0x800075fd, + 0x78fd1907, + 0x1057f100, +@@ -307,15 +369,18 @@ uint32_t nvc0_pcopy_code[] = { + 0x1c069800, + 0xf44056d0, + 0x00f81030, ++/* 0x029c: cmd_exec_set_surface_tiled */ + 0xc7075798, + 0x78c76879, + 0x0380b664, + 0xb06077c7, + 0x1bf40e76, + 0x0477f009, ++/* 0x02b7: xtile64 */ + 0xf00f0ef4, + 0x70b6027c, + 0x0947fd11, ++/* 0x02c3: xtileok */ + 0x980677f0, + 0x5b980c5a, + 0x00abfd0e, +@@ -374,6 +439,7 @@ uint32_t nvc0_pcopy_code[] = { + 0xb70067d0, + 0xd0040060, + 0x00f80068, ++/* 0x03a8: cmd_exec_set_surface_linear */ + 0xb7026cf0, + 0xb6020260, + 0x57980864, +@@ -384,12 +450,15 @@ uint32_t nvc0_pcopy_code[] = { + 0x0060b700, + 0x06579804, + 0xf80067d0, ++/* 0x03d1: cmd_exec_wait */ + 0xf900f900, + 0x0007f110, + 0x0604b608, ++/* 0x03dc: loop */ + 0xf00001cf, + 0x1bf40114, + 0xfc10fcfa, ++/* 0x03eb: cmd_exec_query */ + 0xc800f800, + 0x1bf40d34, + 0xd121f570, +@@ -419,6 +488,7 @@ uint32_t nvc0_pcopy_code[] = { + 0x0153f026, + 0x080047f1, + 0xd00644b6, ++/* 0x045e: query_counter */ + 0x21f50045, + 0x47f103d1, + 0x44b6080c, +@@ -442,11 +512,13 @@ uint32_t nvc0_pcopy_code[] = { + 0x080047f1, + 0xd00644b6, + 0x00f80045, ++/* 0x04b8: cmd_exec */ + 0x03d121f5, + 0xf4003fc8, + 0x21f50e0b, + 0x47f101af, + 0x0ef40200, ++/* 0x04cd: cmd_exec_no_format */ + 0x1067f11e, + 0x0664b608, + 0x800177f0, +@@ -454,18 +526,23 @@ uint32_t nvc0_pcopy_code[] = { + 0x1d079819, + 0xd00067d0, + 0x44bd4067, ++/* 0x04e8: cmd_exec_init_src_surface */ + 0xbd0232f4, + 0x043fc854, + 0xf50a0bf4, + 0xf403a821, ++/* 0x04fa: src_tiled */ + 0x21f50a0e, + 0x49f0029c, ++/* 0x0501: cmd_exec_init_dst_surface */ + 0x0231f407, + 0xc82c57f0, + 0x0bf4083f, + 0xa821f50a, + 0x0a0ef403, ++/* 0x0514: dst_tiled */ + 0x029c21f5, ++/* 0x051b: cmd_exec_kick */ + 0xf10849f0, + 0xb6080057, + 0x06980654, +@@ -475,7 +552,9 @@ uint32_t nvc0_pcopy_code[] = { + 0x54d00546, + 0x0c3fc800, + 0xf5070bf4, ++/* 0x053f: cmd_exec_done */ + 0xf803eb21, ++/* 0x0541: cmd_wrcache_flush */ + 0x0027f100, + 0xf034bd22, + 0x23d00133, +diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c +index 552b436..3254d51 100644 +--- a/drivers/gpu/drm/radeon/atombios_dp.c 
++++ b/drivers/gpu/drm/radeon/atombios_dp.c +@@ -22,6 +22,7 @@ + * + * Authors: Dave Airlie + * Alex Deucher ++ * Jerome Glisse + */ + #include "drmP.h" + #include "radeon_drm.h" +@@ -634,7 +635,6 @@ static bool radeon_dp_get_link_status(struct radeon_connector *radeon_connector, + ret = radeon_dp_aux_native_read(radeon_connector, DP_LANE0_1_STATUS, + link_status, DP_LINK_STATUS_SIZE, 100); + if (ret <= 0) { +- DRM_ERROR("displayport link status failed\n"); + return false; + } + +@@ -812,8 +812,10 @@ static int radeon_dp_link_train_cr(struct radeon_dp_link_train_info *dp_info) + else + mdelay(dp_info->rd_interval * 4); + +- if (!radeon_dp_get_link_status(dp_info->radeon_connector, dp_info->link_status)) ++ if (!radeon_dp_get_link_status(dp_info->radeon_connector, dp_info->link_status)) { ++ DRM_ERROR("displayport link status failed\n"); + break; ++ } + + if (dp_clock_recovery_ok(dp_info->link_status, dp_info->dp_lane_count)) { + clock_recovery = true; +@@ -875,8 +877,10 @@ static int radeon_dp_link_train_ce(struct radeon_dp_link_train_info *dp_info) + else + mdelay(dp_info->rd_interval * 4); + +- if (!radeon_dp_get_link_status(dp_info->radeon_connector, dp_info->link_status)) ++ if (!radeon_dp_get_link_status(dp_info->radeon_connector, dp_info->link_status)) { ++ DRM_ERROR("displayport link status failed\n"); + break; ++ } + + if (dp_channel_eq_ok(dp_info->link_status, dp_info->dp_lane_count)) { + channel_eq = true; +diff --git a/drivers/gpu/drm/radeon/radeon_connectors.c b/drivers/gpu/drm/radeon/radeon_connectors.c +index 4a4493f..87d494d 100644 +--- a/drivers/gpu/drm/radeon/radeon_connectors.c ++++ b/drivers/gpu/drm/radeon/radeon_connectors.c +@@ -64,14 +64,33 @@ void radeon_connector_hotplug(struct drm_connector *connector) + + /* just deal with DP (not eDP) here. */ + if (connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort) { +- int saved_dpms = connector->dpms; +- +- /* Only turn off the display it it's physically disconnected */ +- if (!radeon_hpd_sense(rdev, radeon_connector->hpd.hpd)) +- drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); +- else if (radeon_dp_needs_link_train(radeon_connector)) +- drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON); +- connector->dpms = saved_dpms; ++ struct radeon_connector_atom_dig *dig_connector = ++ radeon_connector->con_priv; ++ ++ /* if existing sink type was not DP no need to retrain */ ++ if (dig_connector->dp_sink_type != CONNECTOR_OBJECT_ID_DISPLAYPORT) ++ return; ++ ++ /* first get sink type as it may be reset after (un)plug */ ++ dig_connector->dp_sink_type = radeon_dp_getsinktype(radeon_connector); ++ /* don't do anything if sink is not display port, i.e., ++ * passive dp->(dvi|hdmi) adaptor ++ */ ++ if (dig_connector->dp_sink_type == CONNECTOR_OBJECT_ID_DISPLAYPORT) { ++ int saved_dpms = connector->dpms; ++ /* Only turn off the display if it's physically disconnected */ ++ if (!radeon_hpd_sense(rdev, radeon_connector->hpd.hpd)) { ++ drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); ++ } else if (radeon_dp_needs_link_train(radeon_connector)) { ++ /* set it to OFF so that drm_helper_connector_dpms() ++ * won't return immediately since the current state ++ * is ON at this point. 
++ */ ++ connector->dpms = DRM_MODE_DPMS_OFF; ++ drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON); ++ } ++ connector->dpms = saved_dpms; ++ } + } + } + +diff --git a/drivers/gpu/drm/radeon/radeon_cursor.c b/drivers/gpu/drm/radeon/radeon_cursor.c +index 986d608..2132109 100644 +--- a/drivers/gpu/drm/radeon/radeon_cursor.c ++++ b/drivers/gpu/drm/radeon/radeon_cursor.c +@@ -257,8 +257,14 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc, + if (!(cursor_end & 0x7f)) + w--; + } +- if (w <= 0) ++ if (w <= 0) { + w = 1; ++ cursor_end = x - xorigin + w; ++ if (!(cursor_end & 0x7f)) { ++ x--; ++ WARN_ON_ONCE(x < 0); ++ } ++ } + } + } + +diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c +index f3ae607..39497c7 100644 +--- a/drivers/gpu/drm/radeon/radeon_object.c ++++ b/drivers/gpu/drm/radeon/radeon_object.c +@@ -117,7 +117,6 @@ int radeon_bo_create(struct radeon_device *rdev, + return -ENOMEM; + } + +-retry: + bo = kzalloc(sizeof(struct radeon_bo), GFP_KERNEL); + if (bo == NULL) + return -ENOMEM; +@@ -130,6 +129,8 @@ retry: + bo->gem_base.driver_private = NULL; + bo->surface_reg = -1; + INIT_LIST_HEAD(&bo->list); ++ ++retry: + radeon_ttm_placement_from_domain(bo, domain); + /* Kernel allocation are uninterruptible */ + mutex_lock(&rdev->vram_mutex); +diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c +index a1b8caa..0f074e0 100644 +--- a/drivers/iommu/amd_iommu.c ++++ b/drivers/iommu/amd_iommu.c +@@ -1865,6 +1865,11 @@ static int device_change_notifier(struct notifier_block *nb, + + iommu_init_device(dev); + ++ if (iommu_pass_through) { ++ attach_device(dev, pt_domain); ++ break; ++ } ++ + domain = domain_for_device(dev); + + /* allocate a protection domain if a device is added */ +@@ -1880,10 +1885,7 @@ static int device_change_notifier(struct notifier_block *nb, + list_add_tail(&dma_domain->list, &iommu_pd_list); + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + +- if (!iommu_pass_through) +- dev->archdata.dma_ops = &amd_iommu_dma_ops; +- else +- dev->archdata.dma_ops = &nommu_dma_ops; ++ dev->archdata.dma_ops = &amd_iommu_dma_ops; + + break; + case BUS_NOTIFY_DEL_DEVICE: +diff --git a/drivers/media/video/cx25821/cx25821-core.c b/drivers/media/video/cx25821/cx25821-core.c +index a7fa38f..e572ce5 100644 +--- a/drivers/media/video/cx25821/cx25821-core.c ++++ b/drivers/media/video/cx25821/cx25821-core.c +@@ -914,9 +914,6 @@ static int cx25821_dev_setup(struct cx25821_dev *dev) + list_add_tail(&dev->devlist, &cx25821_devlist); + mutex_unlock(&cx25821_devlist_mutex); + +- strcpy(cx25821_boards[UNKNOWN_BOARD].name, "unknown"); +- strcpy(cx25821_boards[CX25821_BOARD].name, "cx25821"); +- + if (dev->pci->device != 0x8210) { + pr_info("%s(): Exiting. 
Incorrect Hardware device = 0x%02x\n", + __func__, dev->pci->device); +diff --git a/drivers/media/video/cx25821/cx25821.h b/drivers/media/video/cx25821/cx25821.h +index 2d2d009..bf54360 100644 +--- a/drivers/media/video/cx25821/cx25821.h ++++ b/drivers/media/video/cx25821/cx25821.h +@@ -187,7 +187,7 @@ enum port { + }; + + struct cx25821_board { +- char *name; ++ const char *name; + enum port porta; + enum port portb; + enum port portc; +diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c +index 6878a94..83b51b5 100644 +--- a/drivers/mmc/host/sdhci-pci.c ++++ b/drivers/mmc/host/sdhci-pci.c +@@ -148,6 +148,7 @@ static const struct sdhci_pci_fixes sdhci_ene_714 = { + static const struct sdhci_pci_fixes sdhci_cafe = { + .quirks = SDHCI_QUIRK_NO_SIMULT_VDD_AND_POWER | + SDHCI_QUIRK_NO_BUSY_IRQ | ++ SDHCI_QUIRK_BROKEN_CARD_DETECTION | + SDHCI_QUIRK_BROKEN_TIMEOUT_VAL, + }; + +diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c +index 9e61d6b..ed1be8a 100644 +--- a/drivers/net/ethernet/realtek/r8169.c ++++ b/drivers/net/ethernet/realtek/r8169.c +@@ -3770,6 +3770,7 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp) + case RTL_GIGA_MAC_VER_22: + case RTL_GIGA_MAC_VER_23: + case RTL_GIGA_MAC_VER_24: ++ case RTL_GIGA_MAC_VER_34: + RTL_W32(RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST); + break; + default: +diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c +index 01dcb1a..727c129 100644 +--- a/drivers/net/wireless/mwifiex/cfg80211.c ++++ b/drivers/net/wireless/mwifiex/cfg80211.c +@@ -545,9 +545,9 @@ mwifiex_dump_station_info(struct mwifiex_private *priv, + + /* + * Bit 0 in tx_htinfo indicates that current Tx rate is 11n rate. Valid +- * MCS index values for us are 0 to 7. ++ * MCS index values for us are 0 to 15. 
+ */ +- if ((priv->tx_htinfo & BIT(0)) && (priv->tx_rate < 8)) { ++ if ((priv->tx_htinfo & BIT(0)) && (priv->tx_rate < 16)) { + sinfo->txrate.mcs = priv->tx_rate; + sinfo->txrate.flags |= RATE_INFO_FLAGS_MCS; + /* 40MHz rate */ +diff --git a/drivers/net/wireless/rt2x00/rt2800usb.c b/drivers/net/wireless/rt2x00/rt2800usb.c +index 0ffa111..bdf960b 100644 +--- a/drivers/net/wireless/rt2x00/rt2800usb.c ++++ b/drivers/net/wireless/rt2x00/rt2800usb.c +@@ -876,6 +876,7 @@ static struct usb_device_id rt2800usb_device_table[] = { + { USB_DEVICE(0x1482, 0x3c09) }, + /* AirTies */ + { USB_DEVICE(0x1eda, 0x2012) }, ++ { USB_DEVICE(0x1eda, 0x2210) }, + { USB_DEVICE(0x1eda, 0x2310) }, + /* Allwin */ + { USB_DEVICE(0x8516, 0x2070) }, +@@ -945,6 +946,7 @@ static struct usb_device_id rt2800usb_device_table[] = { + /* DVICO */ + { USB_DEVICE(0x0fe9, 0xb307) }, + /* Edimax */ ++ { USB_DEVICE(0x7392, 0x4085) }, + { USB_DEVICE(0x7392, 0x7711) }, + { USB_DEVICE(0x7392, 0x7717) }, + { USB_DEVICE(0x7392, 0x7718) }, +@@ -1020,6 +1022,7 @@ static struct usb_device_id rt2800usb_device_table[] = { + /* Philips */ + { USB_DEVICE(0x0471, 0x200f) }, + /* Planex */ ++ { USB_DEVICE(0x2019, 0x5201) }, + { USB_DEVICE(0x2019, 0xab25) }, + { USB_DEVICE(0x2019, 0xed06) }, + /* Quanta */ +@@ -1088,6 +1091,12 @@ static struct usb_device_id rt2800usb_device_table[] = { + #ifdef CONFIG_RT2800USB_RT33XX + /* Belkin */ + { USB_DEVICE(0x050d, 0x945b) }, ++ /* D-Link */ ++ { USB_DEVICE(0x2001, 0x3c17) }, ++ /* Panasonic */ ++ { USB_DEVICE(0x083a, 0xb511) }, ++ /* Philips */ ++ { USB_DEVICE(0x0471, 0x20dd) }, + /* Ralink */ + { USB_DEVICE(0x148f, 0x3370) }, + { USB_DEVICE(0x148f, 0x8070) }, +@@ -1099,6 +1108,8 @@ static struct usb_device_id rt2800usb_device_table[] = { + { USB_DEVICE(0x8516, 0x3572) }, + /* Askey */ + { USB_DEVICE(0x1690, 0x0744) }, ++ { USB_DEVICE(0x1690, 0x0761) }, ++ { USB_DEVICE(0x1690, 0x0764) }, + /* Cisco */ + { USB_DEVICE(0x167b, 0x4001) }, + /* EnGenius */ +@@ -1113,6 +1124,9 @@ static struct usb_device_id rt2800usb_device_table[] = { + /* Sitecom */ + { USB_DEVICE(0x0df6, 0x0041) }, + { USB_DEVICE(0x0df6, 0x0062) }, ++ { USB_DEVICE(0x0df6, 0x0065) }, ++ { USB_DEVICE(0x0df6, 0x0066) }, ++ { USB_DEVICE(0x0df6, 0x0068) }, + /* Toshiba */ + { USB_DEVICE(0x0930, 0x0a07) }, + /* Zinwell */ +@@ -1122,6 +1136,9 @@ static struct usb_device_id rt2800usb_device_table[] = { + /* Azurewave */ + { USB_DEVICE(0x13d3, 0x3329) }, + { USB_DEVICE(0x13d3, 0x3365) }, ++ /* D-Link */ ++ { USB_DEVICE(0x2001, 0x3c1c) }, ++ { USB_DEVICE(0x2001, 0x3c1d) }, + /* Ralink */ + { USB_DEVICE(0x148f, 0x5370) }, + { USB_DEVICE(0x148f, 0x5372) }, +@@ -1163,13 +1180,8 @@ static struct usb_device_id rt2800usb_device_table[] = { + /* D-Link */ + { USB_DEVICE(0x07d1, 0x3c0b) }, + { USB_DEVICE(0x07d1, 0x3c17) }, +- { USB_DEVICE(0x2001, 0x3c17) }, +- /* Edimax */ +- { USB_DEVICE(0x7392, 0x4085) }, + /* Encore */ + { USB_DEVICE(0x203d, 0x14a1) }, +- /* Fujitsu Stylistic 550 */ +- { USB_DEVICE(0x1690, 0x0761) }, + /* Gemtek */ + { USB_DEVICE(0x15a9, 0x0010) }, + /* Gigabyte */ +@@ -1190,7 +1202,6 @@ static struct usb_device_id rt2800usb_device_table[] = { + { USB_DEVICE(0x05a6, 0x0101) }, + { USB_DEVICE(0x1d4d, 0x0010) }, + /* Planex */ +- { USB_DEVICE(0x2019, 0x5201) }, + { USB_DEVICE(0x2019, 0xab24) }, + /* Qcom */ + { USB_DEVICE(0x18e8, 0x6259) }, +diff --git a/drivers/net/wireless/rtlwifi/rtl8192de/phy.c b/drivers/net/wireless/rtlwifi/rtl8192de/phy.c +index 2cf4c5f..de9faa9 100644 +--- a/drivers/net/wireless/rtlwifi/rtl8192de/phy.c ++++ 
b/drivers/net/wireless/rtlwifi/rtl8192de/phy.c +@@ -3462,21 +3462,21 @@ void rtl92d_phy_config_macphymode_info(struct ieee80211_hw *hw) + switch (rtlhal->macphymode) { + case DUALMAC_SINGLEPHY: + rtlphy->rf_type = RF_2T2R; +- rtlhal->version |= CHIP_92D_SINGLEPHY; ++ rtlhal->version |= RF_TYPE_2T2R; + rtlhal->bandset = BAND_ON_BOTH; + rtlhal->current_bandtype = BAND_ON_2_4G; + break; + + case SINGLEMAC_SINGLEPHY: + rtlphy->rf_type = RF_2T2R; +- rtlhal->version |= CHIP_92D_SINGLEPHY; ++ rtlhal->version |= RF_TYPE_2T2R; + rtlhal->bandset = BAND_ON_BOTH; + rtlhal->current_bandtype = BAND_ON_2_4G; + break; + + case DUALMAC_DUALPHY: + rtlphy->rf_type = RF_1T1R; +- rtlhal->version &= (~CHIP_92D_SINGLEPHY); ++ rtlhal->version &= RF_TYPE_1T1R; + /* Now we let MAC0 run on 5G band. */ + if (rtlhal->interfaceindex == 0) { + rtlhal->bandset = BAND_ON_5G; +diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c +index 351dc0b..ee77a58 100644 +--- a/drivers/scsi/hosts.c ++++ b/drivers/scsi/hosts.c +@@ -287,6 +287,7 @@ static void scsi_host_dev_release(struct device *dev) + struct Scsi_Host *shost = dev_to_shost(dev); + struct device *parent = dev->parent; + struct request_queue *q; ++ void *queuedata; + + scsi_proc_hostdir_rm(shost->hostt); + +@@ -296,9 +297,9 @@ static void scsi_host_dev_release(struct device *dev) + destroy_workqueue(shost->work_q); + q = shost->uspace_req_q; + if (q) { +- kfree(q->queuedata); +- q->queuedata = NULL; +- scsi_free_queue(q); ++ queuedata = q->queuedata; ++ blk_cleanup_queue(q); ++ kfree(queuedata); + } + + scsi_destroy_command_freelist(shost); +diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c +index e48ba4b..dbe3568 100644 +--- a/drivers/scsi/libsas/sas_expander.c ++++ b/drivers/scsi/libsas/sas_expander.c +@@ -774,7 +774,7 @@ static struct domain_device *sas_ex_discover_end_dev( + } + + /* See if this phy is part of a wide port */ +-static int sas_ex_join_wide_port(struct domain_device *parent, int phy_id) ++static bool sas_ex_join_wide_port(struct domain_device *parent, int phy_id) + { + struct ex_phy *phy = &parent->ex_dev.ex_phy[phy_id]; + int i; +@@ -790,11 +790,11 @@ static int sas_ex_join_wide_port(struct domain_device *parent, int phy_id) + sas_port_add_phy(ephy->port, phy->phy); + phy->port = ephy->port; + phy->phy_state = PHY_DEVICE_DISCOVERED; +- return 0; ++ return true; + } + } + +- return -ENODEV; ++ return false; + } + + static struct domain_device *sas_ex_discover_expander( +@@ -932,8 +932,7 @@ static int sas_ex_discover_dev(struct domain_device *dev, int phy_id) + return res; + } + +- res = sas_ex_join_wide_port(dev, phy_id); +- if (!res) { ++ if (sas_ex_join_wide_port(dev, phy_id)) { + SAS_DPRINTK("Attaching ex phy%d to wide port %016llx\n", + phy_id, SAS_ADDR(ex_phy->attached_sas_addr)); + return res; +@@ -978,8 +977,7 @@ static int sas_ex_discover_dev(struct domain_device *dev, int phy_id) + if (SAS_ADDR(ex->ex_phy[i].attached_sas_addr) == + SAS_ADDR(child->sas_addr)) { + ex->ex_phy[i].phy_state= PHY_DEVICE_DISCOVERED; +- res = sas_ex_join_wide_port(dev, i); +- if (!res) ++ if (sas_ex_join_wide_port(dev, i)) + SAS_DPRINTK("Attaching ex phy%d to wide port %016llx\n", + i, SAS_ADDR(ex->ex_phy[i].attached_sas_addr)); + +@@ -1849,32 +1847,20 @@ static int sas_discover_new(struct domain_device *dev, int phy_id) + { + struct ex_phy *ex_phy = &dev->ex_dev.ex_phy[phy_id]; + struct domain_device *child; +- bool found = false; +- int res, i; ++ int res; + + SAS_DPRINTK("ex %016llx phy%d new device attached\n", + 
SAS_ADDR(dev->sas_addr), phy_id); + res = sas_ex_phy_discover(dev, phy_id); + if (res) +- goto out; +- /* to support the wide port inserted */ +- for (i = 0; i < dev->ex_dev.num_phys; i++) { +- struct ex_phy *ex_phy_temp = &dev->ex_dev.ex_phy[i]; +- if (i == phy_id) +- continue; +- if (SAS_ADDR(ex_phy_temp->attached_sas_addr) == +- SAS_ADDR(ex_phy->attached_sas_addr)) { +- found = true; +- break; +- } +- } +- if (found) { +- sas_ex_join_wide_port(dev, phy_id); ++ return res; ++ ++ if (sas_ex_join_wide_port(dev, phy_id)) + return 0; +- } ++ + res = sas_ex_discover_devices(dev, phy_id); +- if (!res) +- goto out; ++ if (res) ++ return res; + list_for_each_entry(child, &dev->ex_dev.children, siblings) { + if (SAS_ADDR(child->sas_addr) == + SAS_ADDR(ex_phy->attached_sas_addr)) { +@@ -1884,7 +1870,6 @@ static int sas_discover_new(struct domain_device *dev, int phy_id) + break; + } + } +-out: + return res; + } + +@@ -1983,9 +1968,7 @@ int sas_ex_revalidate_domain(struct domain_device *port_dev) + struct domain_device *dev = NULL; + + res = sas_find_bcast_dev(port_dev, &dev); +- if (res) +- goto out; +- if (dev) { ++ while (res == 0 && dev) { + struct expander_device *ex = &dev->ex_dev; + int i = 0, phy_id; + +@@ -1997,8 +1980,10 @@ int sas_ex_revalidate_domain(struct domain_device *port_dev) + res = sas_rediscover(dev, phy_id); + i = phy_id + 1; + } while (i < ex->num_phys); ++ ++ dev = NULL; ++ res = sas_find_bcast_dev(port_dev, &dev); + } +-out: + return res; + } + +diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c +index 2aeb2e9..831db24 100644 +--- a/drivers/scsi/scsi.c ++++ b/drivers/scsi/scsi.c +@@ -785,7 +785,13 @@ static void scsi_done(struct scsi_cmnd *cmd) + /* Move this to a header if it becomes more generally useful */ + static struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) + { +- return *(struct scsi_driver **)cmd->request->rq_disk->private_data; ++ struct scsi_driver **sdp; ++ ++ sdp = (struct scsi_driver **)cmd->request->rq_disk->private_data; ++ if (!sdp) ++ return NULL; ++ ++ return *sdp; + } + + /** +diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c +index dc6131e..456b131 100644 +--- a/drivers/scsi/scsi_error.c ++++ b/drivers/scsi/scsi_error.c +@@ -1673,6 +1673,20 @@ static void scsi_restart_operations(struct Scsi_Host *shost) + * requests are started. + */ + scsi_run_host_queues(shost); ++ ++ /* ++ * if eh is active and host_eh_scheduled is pending we need to re-run ++ * recovery. we do this check after scsi_run_host_queues() to allow ++ * everything pent up since the last eh run a chance to make forward ++ * progress before we sync again. Either we'll immediately re-run ++ * recovery or scsi_device_unbusy() will wake us again when these ++ * pending commands complete. 
++ */ ++ spin_lock_irqsave(shost->host_lock, flags); ++ if (shost->host_eh_scheduled) ++ if (scsi_host_set_state(shost, SHOST_RECOVERY)) ++ WARN_ON(scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY)); ++ spin_unlock_irqrestore(shost->host_lock, flags); + } + + /** +diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c +index f0ab58e..6c4b620 100644 +--- a/drivers/scsi/scsi_lib.c ++++ b/drivers/scsi/scsi_lib.c +@@ -406,10 +406,6 @@ static void scsi_run_queue(struct request_queue *q) + LIST_HEAD(starved_list); + unsigned long flags; + +- /* if the device is dead, sdev will be NULL, so no queue to run */ +- if (!sdev) +- return; +- + shost = sdev->host; + if (scsi_target(sdev)->single_lun) + scsi_single_lun_run(sdev); +@@ -483,15 +479,26 @@ void scsi_requeue_run_queue(struct work_struct *work) + */ + static void scsi_requeue_command(struct request_queue *q, struct scsi_cmnd *cmd) + { ++ struct scsi_device *sdev = cmd->device; + struct request *req = cmd->request; + unsigned long flags; + ++ /* ++ * We need to hold a reference on the device to avoid the queue being ++ * killed after the unlock and before scsi_run_queue is invoked which ++ * may happen because scsi_unprep_request() puts the command which ++ * releases its reference on the device. ++ */ ++ get_device(&sdev->sdev_gendev); ++ + spin_lock_irqsave(q->queue_lock, flags); + scsi_unprep_request(req); + blk_requeue_request(q, req); + spin_unlock_irqrestore(q->queue_lock, flags); + + scsi_run_queue(q); ++ ++ put_device(&sdev->sdev_gendev); + } + + void scsi_next_command(struct scsi_cmnd *cmd) +@@ -1374,16 +1381,16 @@ static inline int scsi_host_queue_ready(struct request_queue *q, + * may be changed after request stacking drivers call the function, + * regardless of taking lock or not. + * +- * When scsi can't dispatch I/Os anymore and needs to kill I/Os +- * (e.g. !sdev), scsi needs to return 'not busy'. +- * Otherwise, request stacking drivers may hold requests forever. ++ * When scsi can't dispatch I/Os anymore and needs to kill I/Os scsi ++ * needs to return 'not busy'. Otherwise, request stacking drivers ++ * may hold requests forever. 
+ */ + static int scsi_lld_busy(struct request_queue *q) + { + struct scsi_device *sdev = q->queuedata; + struct Scsi_Host *shost; + +- if (!sdev) ++ if (blk_queue_dead(q)) + return 0; + + shost = sdev->host; +@@ -1494,12 +1501,6 @@ static void scsi_request_fn(struct request_queue *q) + struct scsi_cmnd *cmd; + struct request *req; + +- if (!sdev) { +- while ((req = blk_peek_request(q)) != NULL) +- scsi_kill_request(req, q); +- return; +- } +- + if(!get_device(&sdev->sdev_gendev)) + /* We must be tearing the block queue down already */ + return; +@@ -1701,20 +1702,6 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev) + return q; + } + +-void scsi_free_queue(struct request_queue *q) +-{ +- unsigned long flags; +- +- WARN_ON(q->queuedata); +- +- /* cause scsi_request_fn() to kill all non-finished requests */ +- spin_lock_irqsave(q->queue_lock, flags); +- q->request_fn(q); +- spin_unlock_irqrestore(q->queue_lock, flags); +- +- blk_cleanup_queue(q); +-} +- + /* + * Function: scsi_block_requests() + * +diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h +index 5b475d0..d58adca 100644 +--- a/drivers/scsi/scsi_priv.h ++++ b/drivers/scsi/scsi_priv.h +@@ -85,7 +85,6 @@ extern void scsi_next_command(struct scsi_cmnd *cmd); + extern void scsi_io_completion(struct scsi_cmnd *, unsigned int); + extern void scsi_run_host_queues(struct Scsi_Host *shost); + extern struct request_queue *scsi_alloc_queue(struct scsi_device *sdev); +-extern void scsi_free_queue(struct request_queue *q); + extern int scsi_init_queue(void); + extern void scsi_exit_queue(void); + struct request_queue; +diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c +index 6e7ea4a..a48b59c 100644 +--- a/drivers/scsi/scsi_scan.c ++++ b/drivers/scsi/scsi_scan.c +@@ -1710,6 +1710,9 @@ static void scsi_sysfs_add_devices(struct Scsi_Host *shost) + { + struct scsi_device *sdev; + shost_for_each_device(sdev, shost) { ++ /* target removed before the device could be added */ ++ if (sdev->sdev_state == SDEV_DEL) ++ continue; + if (!scsi_host_scan_allowed(shost) || + scsi_sysfs_add_sdev(sdev) != 0) + __scsi_remove_device(sdev); +diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c +index 04c2a27..bb7c482 100644 +--- a/drivers/scsi/scsi_sysfs.c ++++ b/drivers/scsi/scsi_sysfs.c +@@ -971,11 +971,8 @@ void __scsi_remove_device(struct scsi_device *sdev) + sdev->host->hostt->slave_destroy(sdev); + transport_destroy_device(dev); + +- /* cause the request function to reject all I/O requests */ +- sdev->request_queue->queuedata = NULL; +- + /* Freeing the queue signals to block that we're done */ +- scsi_free_queue(sdev->request_queue); ++ blk_cleanup_queue(sdev->request_queue); + put_device(dev); + } + +@@ -1000,7 +997,6 @@ static void __scsi_remove_target(struct scsi_target *starget) + struct scsi_device *sdev; + + spin_lock_irqsave(shost->host_lock, flags); +- starget->reap_ref++; + restart: + list_for_each_entry(sdev, &shost->__devices, siblings) { + if (sdev->channel != starget->channel || +@@ -1014,14 +1010,6 @@ static void __scsi_remove_target(struct scsi_target *starget) + goto restart; + } + spin_unlock_irqrestore(shost->host_lock, flags); +- scsi_target_reap(starget); +-} +- +-static int __remove_child (struct device * dev, void * data) +-{ +- if (scsi_is_target_device(dev)) +- __scsi_remove_target(to_scsi_target(dev)); +- return 0; + } + + /** +@@ -1034,14 +1022,34 @@ static int __remove_child (struct device * dev, void * data) + */ + void scsi_remove_target(struct device *dev) + { +- if 
(scsi_is_target_device(dev)) { +- __scsi_remove_target(to_scsi_target(dev)); +- return; ++ struct Scsi_Host *shost = dev_to_shost(dev->parent); ++ struct scsi_target *starget, *found; ++ unsigned long flags; ++ ++ restart: ++ found = NULL; ++ spin_lock_irqsave(shost->host_lock, flags); ++ list_for_each_entry(starget, &shost->__targets, siblings) { ++ if (starget->state == STARGET_DEL) ++ continue; ++ if (starget->dev.parent == dev || &starget->dev == dev) { ++ found = starget; ++ found->reap_ref++; ++ break; ++ } + } ++ spin_unlock_irqrestore(shost->host_lock, flags); + +- get_device(dev); +- device_for_each_child(dev, NULL, __remove_child); +- put_device(dev); ++ if (found) { ++ __scsi_remove_target(found); ++ scsi_target_reap(found); ++ /* in the case where @dev has multiple starget children, ++ * continue removing. ++ * ++ * FIXME: does such a case exist? ++ */ ++ goto restart; ++ } + } + EXPORT_SYMBOL(scsi_remove_target); + +diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c +index 0842cc7..2ff1255 100644 +--- a/drivers/target/iscsi/iscsi_target.c ++++ b/drivers/target/iscsi/iscsi_target.c +@@ -427,19 +427,8 @@ int iscsit_reset_np_thread( + + int iscsit_del_np_comm(struct iscsi_np *np) + { +- if (!np->np_socket) +- return 0; +- +- /* +- * Some network transports allocate their own struct sock->file, +- * see if we need to free any additional allocated resources. +- */ +- if (np->np_flags & NPF_SCTP_STRUCT_FILE) { +- kfree(np->np_socket->file); +- np->np_socket->file = NULL; +- } +- +- sock_release(np->np_socket); ++ if (np->np_socket) ++ sock_release(np->np_socket); + return 0; + } + +@@ -4105,13 +4094,8 @@ int iscsit_close_connection( + kfree(conn->conn_ops); + conn->conn_ops = NULL; + +- if (conn->sock) { +- if (conn->conn_flags & CONNFLAG_SCTP_STRUCT_FILE) { +- kfree(conn->sock->file); +- conn->sock->file = NULL; +- } ++ if (conn->sock) + sock_release(conn->sock); +- } + conn->thread_set = NULL; + + pr_debug("Moving to TARG_CONN_STATE_FREE.\n"); +diff --git a/drivers/target/iscsi/iscsi_target_core.h b/drivers/target/iscsi/iscsi_target_core.h +index 7da2d6a..0f68197 100644 +--- a/drivers/target/iscsi/iscsi_target_core.h ++++ b/drivers/target/iscsi/iscsi_target_core.h +@@ -224,7 +224,6 @@ enum iscsi_timer_flags_table { + /* Used for struct iscsi_np->np_flags */ + enum np_flags_table { + NPF_IP_NETWORK = 0x00, +- NPF_SCTP_STRUCT_FILE = 0x01 /* Bugfix */ + }; + + /* Used for struct iscsi_np->np_thread_state */ +@@ -511,7 +510,6 @@ struct iscsi_conn { + u16 local_port; + int net_size; + u32 auth_id; +-#define CONNFLAG_SCTP_STRUCT_FILE 0x01 + u32 conn_flags; + /* Used for iscsi_tx_login_rsp() */ + u32 login_itt; +diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c +index bd2adec..2ec5339 100644 +--- a/drivers/target/iscsi/iscsi_target_login.c ++++ b/drivers/target/iscsi/iscsi_target_login.c +@@ -793,22 +793,6 @@ int iscsi_target_setup_login_socket( + } + np->np_socket = sock; + /* +- * The SCTP stack needs struct socket->file. +- */ +- if ((np->np_network_transport == ISCSI_SCTP_TCP) || +- (np->np_network_transport == ISCSI_SCTP_UDP)) { +- if (!sock->file) { +- sock->file = kzalloc(sizeof(struct file), GFP_KERNEL); +- if (!sock->file) { +- pr_err("Unable to allocate struct" +- " file for SCTP\n"); +- ret = -ENOMEM; +- goto fail; +- } +- np->np_flags |= NPF_SCTP_STRUCT_FILE; +- } +- } +- /* + * Setup the np->np_sockaddr from the passed sockaddr setup + * in iscsi_target_configfs.c code.. 
+ */ +@@ -857,21 +841,15 @@ int iscsi_target_setup_login_socket( + + fail: + np->np_socket = NULL; +- if (sock) { +- if (np->np_flags & NPF_SCTP_STRUCT_FILE) { +- kfree(sock->file); +- sock->file = NULL; +- } +- ++ if (sock) + sock_release(sock); +- } + return ret; + } + + static int __iscsi_target_login_thread(struct iscsi_np *np) + { + u8 buffer[ISCSI_HDR_LEN], iscsi_opcode, zero_tsih = 0; +- int err, ret = 0, ip_proto, sock_type, set_sctp_conn_flag, stop; ++ int err, ret = 0, ip_proto, sock_type, stop; + struct iscsi_conn *conn = NULL; + struct iscsi_login *login; + struct iscsi_portal_group *tpg = NULL; +@@ -882,7 +860,6 @@ static int __iscsi_target_login_thread(struct iscsi_np *np) + struct sockaddr_in6 sock_in6; + + flush_signals(current); +- set_sctp_conn_flag = 0; + sock = np->np_socket; + ip_proto = np->np_ip_proto; + sock_type = np->np_sock_type; +@@ -907,35 +884,12 @@ static int __iscsi_target_login_thread(struct iscsi_np *np) + spin_unlock_bh(&np->np_thread_lock); + goto out; + } +- /* +- * The SCTP stack needs struct socket->file. +- */ +- if ((np->np_network_transport == ISCSI_SCTP_TCP) || +- (np->np_network_transport == ISCSI_SCTP_UDP)) { +- if (!new_sock->file) { +- new_sock->file = kzalloc( +- sizeof(struct file), GFP_KERNEL); +- if (!new_sock->file) { +- pr_err("Unable to allocate struct" +- " file for SCTP\n"); +- sock_release(new_sock); +- /* Get another socket */ +- return 1; +- } +- set_sctp_conn_flag = 1; +- } +- } +- + iscsi_start_login_thread_timer(np); + + conn = kzalloc(sizeof(struct iscsi_conn), GFP_KERNEL); + if (!conn) { + pr_err("Could not allocate memory for" + " new connection\n"); +- if (set_sctp_conn_flag) { +- kfree(new_sock->file); +- new_sock->file = NULL; +- } + sock_release(new_sock); + /* Get another socket */ + return 1; +@@ -945,9 +899,6 @@ static int __iscsi_target_login_thread(struct iscsi_np *np) + conn->conn_state = TARG_CONN_STATE_FREE; + conn->sock = new_sock; + +- if (set_sctp_conn_flag) +- conn->conn_flags |= CONNFLAG_SCTP_STRUCT_FILE; +- + pr_debug("Moving to TARG_CONN_STATE_XPT_UP.\n"); + conn->conn_state = TARG_CONN_STATE_XPT_UP; + +@@ -1195,13 +1146,8 @@ old_sess_out: + iscsi_release_param_list(conn->param_list); + conn->param_list = NULL; + } +- if (conn->sock) { +- if (conn->conn_flags & CONNFLAG_SCTP_STRUCT_FILE) { +- kfree(conn->sock->file); +- conn->sock->file = NULL; +- } ++ if (conn->sock) + sock_release(conn->sock); +- } + kfree(conn); + + if (tpg) { +diff --git a/drivers/target/target_core_cdb.c b/drivers/target/target_core_cdb.c +index 93b9406..717a8d4 100644 +--- a/drivers/target/target_core_cdb.c ++++ b/drivers/target/target_core_cdb.c +@@ -1114,11 +1114,11 @@ int target_emulate_unmap(struct se_task *task) + struct se_cmd *cmd = task->task_se_cmd; + struct se_device *dev = cmd->se_dev; + unsigned char *buf, *ptr = NULL; +- unsigned char *cdb = &cmd->t_task_cdb[0]; + sector_t lba; +- unsigned int size = cmd->data_length, range; +- int ret = 0, offset; +- unsigned short dl, bd_dl; ++ int size = cmd->data_length; ++ u32 range; ++ int ret = 0; ++ int dl, bd_dl; + + if (!dev->transport->do_discard) { + pr_err("UNMAP emulation not supported for: %s\n", +@@ -1127,24 +1127,41 @@ int target_emulate_unmap(struct se_task *task) + return -ENOSYS; + } + +- /* First UNMAP block descriptor starts at 8 byte offset */ +- offset = 8; +- size -= 8; +- dl = get_unaligned_be16(&cdb[0]); +- bd_dl = get_unaligned_be16(&cdb[2]); +- + buf = transport_kmap_data_sg(cmd); + +- ptr = &buf[offset]; +- pr_debug("UNMAP: Sub: %s Using dl: %hu bd_dl: %hu 
size: %hu" ++ dl = get_unaligned_be16(&buf[0]); ++ bd_dl = get_unaligned_be16(&buf[2]); ++ ++ size = min(size - 8, bd_dl); ++ if (size / 16 > dev->se_sub_dev->se_dev_attrib.max_unmap_block_desc_count) { ++ cmd->scsi_sense_reason = TCM_INVALID_PARAMETER_LIST; ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ /* First UNMAP block descriptor starts at 8 byte offset */ ++ ptr = &buf[8]; ++ pr_debug("UNMAP: Sub: %s Using dl: %u bd_dl: %u size: %u" + " ptr: %p\n", dev->transport->name, dl, bd_dl, size, ptr); + +- while (size) { ++ while (size >= 16) { + lba = get_unaligned_be64(&ptr[0]); + range = get_unaligned_be32(&ptr[8]); + pr_debug("UNMAP: Using lba: %llu and range: %u\n", + (unsigned long long)lba, range); + ++ if (range > dev->se_sub_dev->se_dev_attrib.max_unmap_lba_count) { ++ cmd->scsi_sense_reason = TCM_INVALID_PARAMETER_LIST; ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ if (lba + range > dev->transport->get_blocks(dev) + 1) { ++ cmd->scsi_sense_reason = TCM_ADDRESS_OUT_OF_RANGE; ++ ret = -EINVAL; ++ goto err; ++ } ++ + ret = dev->transport->do_discard(dev, lba, range); + if (ret < 0) { + pr_err("blkdev_issue_discard() failed: %d\n", +diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c +index 5660916..94c03d2 100644 +--- a/drivers/target/target_core_transport.c ++++ b/drivers/target/target_core_transport.c +@@ -1820,6 +1820,7 @@ static void transport_generic_request_failure(struct se_cmd *cmd) + case TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE: + case TCM_UNKNOWN_MODE_PAGE: + case TCM_WRITE_PROTECTED: ++ case TCM_ADDRESS_OUT_OF_RANGE: + case TCM_CHECK_CONDITION_ABORT_CMD: + case TCM_CHECK_CONDITION_UNIT_ATTENTION: + case TCM_CHECK_CONDITION_NOT_READY: +@@ -4496,6 +4497,15 @@ int transport_send_check_condition_and_sense( + /* WRITE PROTECTED */ + buffer[offset+SPC_ASC_KEY_OFFSET] = 0x27; + break; ++ case TCM_ADDRESS_OUT_OF_RANGE: ++ /* CURRENT ERROR */ ++ buffer[offset] = 0x70; ++ buffer[offset+SPC_ADD_SENSE_LEN_OFFSET] = 10; ++ /* ILLEGAL REQUEST */ ++ buffer[offset+SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST; ++ /* LOGICAL BLOCK ADDRESS OUT OF RANGE */ ++ buffer[offset+SPC_ASC_KEY_OFFSET] = 0x21; ++ break; + case TCM_CHECK_CONDITION_UNIT_ATTENTION: + /* CURRENT ERROR */ + buffer[offset] = 0x70; +diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c +index f6ff837..a9df218 100644 +--- a/drivers/usb/core/devio.c ++++ b/drivers/usb/core/devio.c +@@ -1555,10 +1555,14 @@ static int processcompl_compat(struct async *as, void __user * __user *arg) + void __user *addr = as->userurb; + unsigned int i; + +- if (as->userbuffer && urb->actual_length) +- if (copy_to_user(as->userbuffer, urb->transfer_buffer, +- urb->actual_length)) ++ if (as->userbuffer && urb->actual_length) { ++ if (urb->number_of_packets > 0) /* Isochronous */ ++ i = urb->transfer_buffer_length; ++ else /* Non-Isoc */ ++ i = urb->actual_length; ++ if (copy_to_user(as->userbuffer, urb->transfer_buffer, i)) + return -EFAULT; ++ } + if (put_user(as->status, &userurb->status)) + return -EFAULT; + if (put_user(urb->actual_length, &userurb->actual_length)) +diff --git a/drivers/usb/gadget/u_ether.c b/drivers/usb/gadget/u_ether.c +index 29c854b..4e1f0aa 100644 +--- a/drivers/usb/gadget/u_ether.c ++++ b/drivers/usb/gadget/u_ether.c +@@ -796,12 +796,6 @@ int gether_setup(struct usb_gadget *g, u8 ethaddr[ETH_ALEN]) + + SET_ETHTOOL_OPS(net, &ops); + +- /* two kinds of host-initiated state changes: +- * - iff DATA transfer is active, carrier is "on" +- * - tx queueing enabled if open *and* carrier is "on" +- 
*/ +- netif_carrier_off(net); +- + dev->gadget = g; + SET_NETDEV_DEV(net, &g->dev); + SET_NETDEV_DEVTYPE(net, &gadget_type); +@@ -815,6 +809,12 @@ int gether_setup(struct usb_gadget *g, u8 ethaddr[ETH_ALEN]) + INFO(dev, "HOST MAC %pM\n", dev->host_mac); + + the_dev = dev; ++ ++ /* two kinds of host-initiated state changes: ++ * - iff DATA transfer is active, carrier is "on" ++ * - tx queueing enabled if open *and* carrier is "on" ++ */ ++ netif_carrier_off(net); + } + + return status; +diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c +index 5971c95..d89aac1 100644 +--- a/drivers/usb/serial/option.c ++++ b/drivers/usb/serial/option.c +@@ -932,8 +932,12 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0165, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0167, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1008, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1010, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0326, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1008, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1010, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1012, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1057, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1058, 0xff, 0xff, 0xff) }, +diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c +index 0b39458..03321e5 100644 +--- a/fs/btrfs/async-thread.c ++++ b/fs/btrfs/async-thread.c +@@ -206,10 +206,17 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, + + work->ordered_func(work); + +- /* now take the lock again and call the freeing code */ ++ /* now take the lock again and drop our item from the list */ + spin_lock(&workers->order_lock); + list_del(&work->order_list); ++ spin_unlock(&workers->order_lock); ++ ++ /* ++ * we don't want to call the ordered free functions ++ * with the lock held though ++ */ + work->ordered_free(work); ++ spin_lock(&workers->order_lock); + } + + spin_unlock(&workers->order_lock); +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index f44b392..6b2a724 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -872,7 +872,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + + #ifdef CONFIG_MIGRATION + static int btree_migratepage(struct address_space *mapping, +- struct page *newpage, struct page *page) ++ struct page *newpage, struct page *page, ++ enum migrate_mode mode) + { + /* + * we can't safely write a btree page from here, +@@ -887,7 +888,7 @@ static int btree_migratepage(struct address_space *mapping, + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; +- return migrate_page(mapping, newpage, page); ++ return migrate_page(mapping, newpage, page, mode); + } + #endif + +diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c +index 6aa7457..c858a29 100644 +--- a/fs/cifs/cifssmb.c ++++ b/fs/cifs/cifssmb.c +@@ -89,6 +89,32 @@ static struct { + /* Forward declarations */ + static void cifs_readv_complete(struct work_struct *work); + ++#ifdef CONFIG_HIGHMEM ++/* ++ * On 
arches that have high memory, kmap address space is limited. By ++ * serializing the kmap operations on those arches, we ensure that we don't ++ * end up with a bunch of threads in writeback with partially mapped page ++ * arrays, stuck waiting for kmap to come back. That situation prevents ++ * progress and can deadlock. ++ */ ++static DEFINE_MUTEX(cifs_kmap_mutex); ++ ++static inline void ++cifs_kmap_lock(void) ++{ ++ mutex_lock(&cifs_kmap_mutex); ++} ++ ++static inline void ++cifs_kmap_unlock(void) ++{ ++ mutex_unlock(&cifs_kmap_mutex); ++} ++#else /* !CONFIG_HIGHMEM */ ++#define cifs_kmap_lock() do { ; } while(0) ++#define cifs_kmap_unlock() do { ; } while(0) ++#endif /* CONFIG_HIGHMEM */ ++ + /* Mark as invalid, all open files on tree connections since they + were closed when session to server was lost */ + static void mark_open_files_invalid(struct cifs_tcon *pTcon) +@@ -1540,6 +1566,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) + eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; + cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index); + ++ cifs_kmap_lock(); + list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { + if (remaining >= PAGE_CACHE_SIZE) { + /* enough data to fill the page */ +@@ -1589,6 +1616,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) + page_cache_release(page); + } + } ++ cifs_kmap_unlock(); + + /* issue the read if we have any iovecs left to fill */ + if (rdata->nr_iov > 1) { +@@ -2171,6 +2199,7 @@ cifs_async_writev(struct cifs_writedata *wdata) + iov[0].iov_base = smb; + + /* marshal up the pages into iov array */ ++ cifs_kmap_lock(); + wdata->bytes = 0; + for (i = 0; i < wdata->nr_pages; i++) { + iov[i + 1].iov_len = min(inode->i_size - +@@ -2179,6 +2208,7 @@ cifs_async_writev(struct cifs_writedata *wdata) + iov[i + 1].iov_base = kmap(wdata->pages[i]); + wdata->bytes += iov[i + 1].iov_len; + } ++ cifs_kmap_unlock(); + + cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes); + +diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c +index 914bf9e..d6970f7 100644 +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -557,7 +557,8 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) + if (bitmap_bh == NULL) + continue; + +- x = ext4_count_free(bitmap_bh, sb->s_blocksize); ++ x = ext4_count_free(bitmap_bh->b_data, ++ EXT4_BLOCKS_PER_GROUP(sb) / 8); + printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", + i, ext4_free_group_clusters(sb, gdp), x); + bitmap_count += x; +diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c +index fa3af81..bbde5d5 100644 +--- a/fs/ext4/bitmap.c ++++ b/fs/ext4/bitmap.c +@@ -11,21 +11,15 @@ + #include + #include "ext4.h" + +-#ifdef EXT4FS_DEBUG +- + static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + +-unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) ++unsigned int ext4_count_free(char *bitmap, unsigned int numchars) + { + unsigned int i, sum = 0; + +- if (!map) +- return 0; + for (i = 0; i < numchars; i++) +- sum += nibblemap[map->b_data[i] & 0xf] + +- nibblemap[(map->b_data[i] >> 4) & 0xf]; ++ sum += nibblemap[bitmap[i] & 0xf] + ++ nibblemap[(bitmap[i] >> 4) & 0xf]; + return sum; + } + +-#endif /* EXT4FS_DEBUG */ +- +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 7b1cd5c..8cb184c 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1123,8 +1123,7 @@ struct ext4_sb_info { + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* 
Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ +- unsigned long s_overhead_last; /* Last calculated overhead */ +- unsigned long s_blocks_last; /* Last seen block count */ ++ unsigned long s_overhead; /* # of fs overhead clusters */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ +@@ -1757,7 +1756,7 @@ struct mmpd_data { + # define NORET_AND noreturn, + + /* bitmap.c */ +-extern unsigned int ext4_count_free(struct buffer_head *, unsigned); ++extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); + + /* balloc.c */ + extern unsigned int ext4_block_group(struct super_block *sb, +@@ -1925,6 +1924,7 @@ extern int ext4_group_extend(struct super_block *sb, + ext4_fsblk_t n_blocks_count); + + /* super.c */ ++extern int ext4_calculate_overhead(struct super_block *sb); + extern void *ext4_kvmalloc(size_t size, gfp_t flags); + extern void *ext4_kvzalloc(size_t size, gfp_t flags); + extern void ext4_kvfree(void *ptr); +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index 8fb6844..6266799 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -1057,7 +1057,8 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) + if (!bitmap_bh) + continue; + +- x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ++ x = ext4_count_free(bitmap_bh->b_data, ++ EXT4_INODES_PER_GROUP(sb) / 8); + printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", + (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); + bitmap_count += x; +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 3ce7613..8b01f9f 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -277,6 +277,15 @@ void ext4_da_update_reserve_space(struct inode *inode, + used = ei->i_reserved_data_blocks; + } + ++ if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) { ++ ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d " ++ "with only %d reserved metadata blocks\n", __func__, ++ inode->i_ino, ei->i_allocated_meta_blocks, ++ ei->i_reserved_meta_blocks); ++ WARN_ON(1); ++ ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks; ++ } ++ + /* Update per-inode reservations */ + ei->i_reserved_data_blocks -= used; + ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; +@@ -1102,6 +1111,17 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int md_needed; + int ret; ++ ext4_lblk_t save_last_lblock; ++ int save_len; ++ ++ /* ++ * We will charge metadata quota at writeout time; this saves ++ * us from metadata over-estimation, though we may go over by ++ * a small amount in the end. Here we just reserve for data. ++ */ ++ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); ++ if (ret) ++ return ret; + + /* + * recalculate the amount of metadata blocks to reserve +@@ -1110,32 +1130,31 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) + */ + repeat: + spin_lock(&ei->i_block_reservation_lock); ++ /* ++ * ext4_calc_metadata_amount() has side effects, which we have ++ * to be prepared undo if we fail to claim space. 
++ */ ++ save_len = ei->i_da_metadata_calc_len; ++ save_last_lblock = ei->i_da_metadata_calc_last_lblock; + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); + trace_ext4_da_reserve_space(inode, md_needed); +- spin_unlock(&ei->i_block_reservation_lock); + + /* +- * We will charge metadata quota at writeout time; this saves +- * us from metadata over-estimation, though we may go over by +- * a small amount in the end. Here we just reserve for data. +- */ +- ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); +- if (ret) +- return ret; +- /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. + */ + if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) { +- dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); ++ ei->i_da_metadata_calc_len = save_len; ++ ei->i_da_metadata_calc_last_lblock = save_last_lblock; ++ spin_unlock(&ei->i_block_reservation_lock); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + yield(); + goto repeat; + } ++ dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); + return -ENOSPC; + } +- spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); +diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c +index 996780a..4eac337 100644 +--- a/fs/ext4/resize.c ++++ b/fs/ext4/resize.c +@@ -952,6 +952,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) + &sbi->s_flex_groups[flex_group].free_inodes); + } + ++ /* ++ * Update the fs overhead information ++ */ ++ ext4_calculate_overhead(sb); ++ + ext4_handle_dirty_super(handle, sb); + + exit_journal: +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index a93486e..a071348 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -3083,6 +3083,114 @@ static void ext4_destroy_lazyinit_thread(void) + kthread_stop(ext4_lazyinit_task); + } + ++/* ++ * Note: calculating the overhead so we can be compatible with ++ * historical BSD practice is quite difficult in the face of ++ * clusters/bigalloc. This is because multiple metadata blocks from ++ * different block group can end up in the same allocation cluster. ++ * Calculating the exact overhead in the face of clustered allocation ++ * requires either O(all block bitmaps) in memory or O(number of block ++ * groups**2) in time. We will still calculate the superblock for ++ * older file systems --- and if we come across with a bigalloc file ++ * system with zero in s_overhead_clusters the estimate will be close to ++ * correct especially for very large cluster sizes --- but for newer ++ * file systems, it's better to calculate this figure once at mkfs ++ * time, and store it in the superblock. If the superblock value is ++ * present (even for non-bigalloc file systems), we will use it. 
++ */ ++static int count_overhead(struct super_block *sb, ext4_group_t grp, ++ char *buf) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ struct ext4_group_desc *gdp; ++ ext4_fsblk_t first_block, last_block, b; ++ ext4_group_t i, ngroups = ext4_get_groups_count(sb); ++ int s, j, count = 0; ++ ++ first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + ++ (grp * EXT4_BLOCKS_PER_GROUP(sb)); ++ last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; ++ for (i = 0; i < ngroups; i++) { ++ gdp = ext4_get_group_desc(sb, i, NULL); ++ b = ext4_block_bitmap(sb, gdp); ++ if (b >= first_block && b <= last_block) { ++ ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); ++ count++; ++ } ++ b = ext4_inode_bitmap(sb, gdp); ++ if (b >= first_block && b <= last_block) { ++ ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); ++ count++; ++ } ++ b = ext4_inode_table(sb, gdp); ++ if (b >= first_block && b + sbi->s_itb_per_group <= last_block) ++ for (j = 0; j < sbi->s_itb_per_group; j++, b++) { ++ int c = EXT4_B2C(sbi, b - first_block); ++ ext4_set_bit(c, buf); ++ count++; ++ } ++ if (i != grp) ++ continue; ++ s = 0; ++ if (ext4_bg_has_super(sb, grp)) { ++ ext4_set_bit(s++, buf); ++ count++; ++ } ++ for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { ++ ext4_set_bit(EXT4_B2C(sbi, s++), buf); ++ count++; ++ } ++ } ++ if (!count) ++ return 0; ++ return EXT4_CLUSTERS_PER_GROUP(sb) - ++ ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); ++} ++ ++/* ++ * Compute the overhead and stash it in sbi->s_overhead ++ */ ++int ext4_calculate_overhead(struct super_block *sb) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ struct ext4_super_block *es = sbi->s_es; ++ ext4_group_t i, ngroups = ext4_get_groups_count(sb); ++ ext4_fsblk_t overhead = 0; ++ char *buf = (char *) get_zeroed_page(GFP_KERNEL); ++ ++ memset(buf, 0, PAGE_SIZE); ++ if (!buf) ++ return -ENOMEM; ++ ++ /* ++ * Compute the overhead (FS structures). This is constant ++ * for a given filesystem unless the number of block groups ++ * changes so we cache the previous value until it does. ++ */ ++ ++ /* ++ * All of the blocks before first_data_block are overhead ++ */ ++ overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); ++ ++ /* ++ * Add the overhead found in each block group ++ */ ++ for (i = 0; i < ngroups; i++) { ++ int blks; ++ ++ blks = count_overhead(sb, i, buf); ++ overhead += blks; ++ if (blks) ++ memset(buf, 0, PAGE_SIZE); ++ cond_resched(); ++ } ++ sbi->s_overhead = overhead; ++ smp_wmb(); ++ free_page((unsigned long) buf); ++ return 0; ++} ++ + static int ext4_fill_super(struct super_block *sb, void *data, int silent) + { + char *orig_data = kstrdup(data, GFP_KERNEL); +@@ -3695,6 +3803,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) + + no_journal: + /* ++ * Get the # of file system overhead blocks from the ++ * superblock if present. ++ */ ++ if (es->s_overhead_clusters) ++ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); ++ else { ++ ret = ext4_calculate_overhead(sb); ++ if (ret) ++ goto failed_mount_wq; ++ } ++ ++ /* + * The maximum number of concurrent works can be high and + * concurrency isn't really necessary. Limit it to 1. + */ +@@ -4568,67 +4688,21 @@ restore_opts: + return err; + } + +-/* +- * Note: calculating the overhead so we can be compatible with +- * historical BSD practice is quite difficult in the face of +- * clusters/bigalloc. This is because multiple metadata blocks from +- * different block group can end up in the same allocation cluster. 
+- * Calculating the exact overhead in the face of clustered allocation +- * requires either O(all block bitmaps) in memory or O(number of block +- * groups**2) in time. We will still calculate the superblock for +- * older file systems --- and if we come across with a bigalloc file +- * system with zero in s_overhead_clusters the estimate will be close to +- * correct especially for very large cluster sizes --- but for newer +- * file systems, it's better to calculate this figure once at mkfs +- * time, and store it in the superblock. If the superblock value is +- * present (even for non-bigalloc file systems), we will use it. +- */ + static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) + { + struct super_block *sb = dentry->d_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; +- struct ext4_group_desc *gdp; ++ ext4_fsblk_t overhead = 0; + u64 fsid; + s64 bfree; + +- if (test_opt(sb, MINIX_DF)) { +- sbi->s_overhead_last = 0; +- } else if (es->s_overhead_clusters) { +- sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters); +- } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { +- ext4_group_t i, ngroups = ext4_get_groups_count(sb); +- ext4_fsblk_t overhead = 0; +- +- /* +- * Compute the overhead (FS structures). This is constant +- * for a given filesystem unless the number of block groups +- * changes so we cache the previous value until it does. +- */ +- +- /* +- * All of the blocks before first_data_block are +- * overhead +- */ +- overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); +- +- /* +- * Add the overhead found in each block group +- */ +- for (i = 0; i < ngroups; i++) { +- gdp = ext4_get_group_desc(sb, i, NULL); +- overhead += ext4_num_overhead_clusters(sb, i, gdp); +- cond_resched(); +- } +- sbi->s_overhead_last = overhead; +- smp_wmb(); +- sbi->s_blocks_last = ext4_blocks_count(es); +- } ++ if (!test_opt(sb, MINIX_DF)) ++ overhead = sbi->s_overhead; + + buf->f_type = EXT4_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; +- buf->f_blocks = (ext4_blocks_count(es) - +- EXT4_C2B(sbi, sbi->s_overhead_last)); ++ buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, sbi->s_overhead); + bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); + /* prevent underflow in case that few free space is available */ +diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c +index ebc2f4d..0aa424a 100644 +--- a/fs/hugetlbfs/inode.c ++++ b/fs/hugetlbfs/inode.c +@@ -569,7 +569,8 @@ static int hugetlbfs_set_page_dirty(struct page *page) + } + + static int hugetlbfs_migrate_page(struct address_space *mapping, +- struct page *newpage, struct page *page) ++ struct page *newpage, struct page *page, ++ enum migrate_mode mode) + { + int rc; + +diff --git a/fs/locks.c b/fs/locks.c +index 6a64f15..fcc50ab 100644 +--- a/fs/locks.c ++++ b/fs/locks.c +@@ -308,7 +308,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock, + return 0; + } + +-static int assign_type(struct file_lock *fl, int type) ++static int assign_type(struct file_lock *fl, long type) + { + switch (type) { + case F_RDLCK: +@@ -445,7 +445,7 @@ static const struct lock_manager_operations lease_manager_ops = { + /* + * Initialize a lease, use the default lock manager operations + */ +-static int lease_init(struct file *filp, int type, struct file_lock *fl) ++static int lease_init(struct file *filp, long type, struct file_lock *fl) + { + if (assign_type(fl, type) != 0) + return 
-EINVAL; +@@ -463,7 +463,7 @@ static int lease_init(struct file *filp, int type, struct file_lock *fl) + } + + /* Allocate a file_lock initialised to this type of lease */ +-static struct file_lock *lease_alloc(struct file *filp, int type) ++static struct file_lock *lease_alloc(struct file *filp, long type) + { + struct file_lock *fl = locks_alloc_lock(); + int error = -ENOMEM; +diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h +index 3f4d957..68b3f20 100644 +--- a/fs/nfs/internal.h ++++ b/fs/nfs/internal.h +@@ -330,7 +330,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data); + + #ifdef CONFIG_MIGRATION + extern int nfs_migrate_page(struct address_space *, +- struct page *, struct page *); ++ struct page *, struct page *, enum migrate_mode); + #else + #define nfs_migrate_page NULL + #endif +diff --git a/fs/nfs/write.c b/fs/nfs/write.c +index 4efd421..c6e523a 100644 +--- a/fs/nfs/write.c ++++ b/fs/nfs/write.c +@@ -1711,7 +1711,7 @@ out_error: + + #ifdef CONFIG_MIGRATION + int nfs_migrate_page(struct address_space *mapping, struct page *newpage, +- struct page *page) ++ struct page *page, enum migrate_mode mode) + { + /* + * If PagePrivate is set, then the page is currently associated with +@@ -1726,7 +1726,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, + + nfs_fscache_release_page(page, GFP_KERNEL); + +- return migrate_page(mapping, newpage, page); ++ return migrate_page(mapping, newpage, page, mode); + } + #endif + +diff --git a/fs/udf/super.c b/fs/udf/super.c +index 270e135..516b7f0 100644 +--- a/fs/udf/super.c ++++ b/fs/udf/super.c +@@ -1285,7 +1285,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, + BUG_ON(ident != TAG_IDENT_LVD); + lvd = (struct logicalVolDesc *)bh->b_data; + table_len = le32_to_cpu(lvd->mapTableLength); +- if (sizeof(*lvd) + table_len > sb->s_blocksize) { ++ if (table_len > sb->s_blocksize - sizeof(*lvd)) { + udf_err(sb, "error loading logical volume descriptor: " + "Partition table too long (%u > %lu)\n", table_len, + sb->s_blocksize - sizeof(*lvd)); +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 0ed1eb0..ff039f0 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -481,6 +481,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) + + #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) + #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) ++#define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) + #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) + #define blk_queue_noxmerges(q) \ + test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) +diff --git a/include/linux/cpu.h b/include/linux/cpu.h +index 6cb60fd..c692acc 100644 +--- a/include/linux/cpu.h ++++ b/include/linux/cpu.h +@@ -66,8 +66,9 @@ enum { + /* migration should happen before other stuff but after perf */ + CPU_PRI_PERF = 20, + CPU_PRI_MIGRATION = 10, +- /* prepare workqueues for other notifiers */ +- CPU_PRI_WORKQUEUE = 5, ++ /* bring up workqueues before normal notifiers and down after */ ++ CPU_PRI_WORKQUEUE_UP = 5, ++ CPU_PRI_WORKQUEUE_DOWN = -5, + }; + + #define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ +diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h +index e9eaec5..7a7e5fd 100644 +--- a/include/linux/cpuset.h ++++ b/include/linux/cpuset.h +@@ -89,42 +89,33 @@ extern void rebuild_sched_domains(void); + extern void cpuset_print_task_mems_allowed(struct 
task_struct *p); + + /* +- * reading current mems_allowed and mempolicy in the fastpath must protected +- * by get_mems_allowed() ++ * get_mems_allowed is required when making decisions involving mems_allowed ++ * such as during page allocation. mems_allowed can be updated in parallel ++ * and depending on the new value an operation can fail potentially causing ++ * process failure. A retry loop with get_mems_allowed and put_mems_allowed ++ * prevents these artificial failures. + */ +-static inline void get_mems_allowed(void) ++static inline unsigned int get_mems_allowed(void) + { +- current->mems_allowed_change_disable++; +- +- /* +- * ensure that reading mems_allowed and mempolicy happens after the +- * update of ->mems_allowed_change_disable. +- * +- * the write-side task finds ->mems_allowed_change_disable is not 0, +- * and knows the read-side task is reading mems_allowed or mempolicy, +- * so it will clear old bits lazily. +- */ +- smp_mb(); ++ return read_seqcount_begin(¤t->mems_allowed_seq); + } + +-static inline void put_mems_allowed(void) ++/* ++ * If this returns false, the operation that took place after get_mems_allowed ++ * may have failed. It is up to the caller to retry the operation if ++ * appropriate. ++ */ ++static inline bool put_mems_allowed(unsigned int seq) + { +- /* +- * ensure that reading mems_allowed and mempolicy before reducing +- * mems_allowed_change_disable. +- * +- * the write-side task will know that the read-side task is still +- * reading mems_allowed or mempolicy, don't clears old bits in the +- * nodemask. +- */ +- smp_mb(); +- --ACCESS_ONCE(current->mems_allowed_change_disable); ++ return !read_seqcount_retry(¤t->mems_allowed_seq, seq); + } + + static inline void set_mems_allowed(nodemask_t nodemask) + { + task_lock(current); ++ write_seqcount_begin(¤t->mems_allowed_seq); + current->mems_allowed = nodemask; ++ write_seqcount_end(¤t->mems_allowed_seq); + task_unlock(current); + } + +@@ -234,12 +225,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) + { + } + +-static inline void get_mems_allowed(void) ++static inline unsigned int get_mems_allowed(void) + { ++ return 0; + } + +-static inline void put_mems_allowed(void) ++static inline bool put_mems_allowed(unsigned int seq) + { ++ return true; + } + + #endif /* !CONFIG_CPUSETS */ +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 43d36b7..29b6353 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -525,6 +525,7 @@ enum positive_aop_returns { + struct page; + struct address_space; + struct writeback_control; ++enum migrate_mode; + + struct iov_iter { + const struct iovec *iov; +@@ -609,9 +610,12 @@ struct address_space_operations { + loff_t offset, unsigned long nr_segs); + int (*get_xip_mem)(struct address_space *, pgoff_t, int, + void **, unsigned long *); +- /* migrate the contents of a page to the specified target */ ++ /* ++ * migrate the contents of a page to the specified target. If sync ++ * is false, it must not block. 
++ */ + int (*migratepage) (struct address_space *, +- struct page *, struct page *); ++ struct page *, struct page *, enum migrate_mode); + int (*launder_page) (struct page *); + int (*is_partially_uptodate) (struct page *, read_descriptor_t *, + unsigned long); +@@ -2586,7 +2590,8 @@ extern int generic_check_addressable(unsigned, u64); + + #ifdef CONFIG_MIGRATION + extern int buffer_migrate_page(struct address_space *, +- struct page *, struct page *); ++ struct page *, struct page *, ++ enum migrate_mode); + #else + #define buffer_migrate_page NULL + #endif +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 32574ee..df53fdf 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -30,6 +30,13 @@ extern struct fs_struct init_fs; + #define INIT_THREADGROUP_FORK_LOCK(sig) + #endif + ++#ifdef CONFIG_CPUSETS ++#define INIT_CPUSET_SEQ \ ++ .mems_allowed_seq = SEQCNT_ZERO, ++#else ++#define INIT_CPUSET_SEQ ++#endif ++ + #define INIT_SIGNALS(sig) { \ + .nr_threads = 1, \ + .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ +@@ -193,6 +200,7 @@ extern struct cred init_cred; + INIT_FTRACE_GRAPH \ + INIT_TRACE_RECURSION \ + INIT_TASK_RCU_PREEMPT(tsk) \ ++ INIT_CPUSET_SEQ \ + } + + +diff --git a/include/linux/migrate.h b/include/linux/migrate.h +index e39aeec..eaf8674 100644 +--- a/include/linux/migrate.h ++++ b/include/linux/migrate.h +@@ -6,18 +6,31 @@ + + typedef struct page *new_page_t(struct page *, unsigned long private, int **); + ++/* ++ * MIGRATE_ASYNC means never block ++ * MIGRATE_SYNC_LIGHT in the current implementation means to allow blocking ++ * on most operations but not ->writepage as the potential stall time ++ * is too significant ++ * MIGRATE_SYNC will block when migrating pages ++ */ ++enum migrate_mode { ++ MIGRATE_ASYNC, ++ MIGRATE_SYNC_LIGHT, ++ MIGRATE_SYNC, ++}; ++ + #ifdef CONFIG_MIGRATION + #define PAGE_MIGRATION 1 + + extern void putback_lru_pages(struct list_head *l); + extern int migrate_page(struct address_space *, +- struct page *, struct page *); ++ struct page *, struct page *, enum migrate_mode); + extern int migrate_pages(struct list_head *l, new_page_t x, + unsigned long private, bool offlining, +- bool sync); ++ enum migrate_mode mode); + extern int migrate_huge_pages(struct list_head *l, new_page_t x, + unsigned long private, bool offlining, +- bool sync); ++ enum migrate_mode mode); + + extern int fail_migrate_page(struct address_space *, + struct page *, struct page *); +@@ -36,10 +49,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, + static inline void putback_lru_pages(struct list_head *l) {} + static inline int migrate_pages(struct list_head *l, new_page_t x, + unsigned long private, bool offlining, +- bool sync) { return -ENOSYS; } ++ enum migrate_mode mode) { return -ENOSYS; } + static inline int migrate_huge_pages(struct list_head *l, new_page_t x, + unsigned long private, bool offlining, +- bool sync) { return -ENOSYS; } ++ enum migrate_mode mode) { return -ENOSYS; } + + static inline int migrate_prep(void) { return -ENOSYS; } + static inline int migrate_prep_local(void) { return -ENOSYS; } +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 905b1e1..25842b6 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -173,6 +173,8 @@ static inline int is_unevictable_lru(enum lru_list l) + #define ISOLATE_CLEAN ((__force isolate_mode_t)0x4) + /* Isolate unmapped file */ + #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x8) ++/* 
Isolate for asynchronous migration */ ++#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x10) + + /* LRU Isolation modes. */ + typedef unsigned __bitwise__ isolate_mode_t; +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 5afa2a3..d336c35 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -145,6 +145,7 @@ extern unsigned long this_cpu_load(void); + + + extern void calc_global_load(unsigned long ticks); ++extern void update_cpu_load_nohz(void); + + extern unsigned long get_parent_ip(unsigned long addr); + +@@ -1481,7 +1482,7 @@ struct task_struct { + #endif + #ifdef CONFIG_CPUSETS + nodemask_t mems_allowed; /* Protected by alloc_lock */ +- int mems_allowed_change_disable; ++ seqcount_t mems_allowed_seq; /* Seqence no to catch updates */ + int cpuset_mem_spread_rotor; + int cpuset_slab_spread_rotor; + #endif +diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h +index 94bbec3..6ee550e 100644 +--- a/include/target/target_core_base.h ++++ b/include/target/target_core_base.h +@@ -157,6 +157,7 @@ enum tcm_sense_reason_table { + TCM_CHECK_CONDITION_UNIT_ATTENTION = 0x0e, + TCM_CHECK_CONDITION_NOT_READY = 0x0f, + TCM_RESERVATION_CONFLICT = 0x10, ++ TCM_ADDRESS_OUT_OF_RANGE = 0x11, + }; + + struct se_obj { +diff --git a/kernel/cpuset.c b/kernel/cpuset.c +index 0b1712d..46a1d3c 100644 +--- a/kernel/cpuset.c ++++ b/kernel/cpuset.c +@@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, + { + bool need_loop; + +-repeat: + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. +@@ -983,45 +982,19 @@ repeat: + */ + need_loop = task_has_mempolicy(tsk) || + !nodes_intersects(*newmems, tsk->mems_allowed); +- nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); +- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); + +- /* +- * ensure checking ->mems_allowed_change_disable after setting all new +- * allowed nodes. +- * +- * the read-side task can see an nodemask with new allowed nodes and +- * old allowed nodes. and if it allocates page when cpuset clears newly +- * disallowed ones continuous, it can see the new allowed bits. +- * +- * And if setting all new allowed nodes is after the checking, setting +- * all new allowed nodes and clearing newly disallowed ones will be done +- * continuous, and the read-side task may find no node to alloc page. +- */ +- smp_mb(); ++ if (need_loop) ++ write_seqcount_begin(&tsk->mems_allowed_seq); + +- /* +- * Allocation of memory is very fast, we needn't sleep when waiting +- * for the read-side. +- */ +- while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { +- task_unlock(tsk); +- if (!task_curr(tsk)) +- yield(); +- goto repeat; +- } +- +- /* +- * ensure checking ->mems_allowed_change_disable before clearing all new +- * disallowed nodes. +- * +- * if clearing newly disallowed bits before the checking, the read-side +- * task may find no node to alloc page. 
+- */ +- smp_mb(); ++ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); ++ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); + + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); + tsk->mems_allowed = *newmems; ++ ++ if (need_loop) ++ write_seqcount_end(&tsk->mems_allowed_seq); ++ + task_unlock(tsk); + } + +diff --git a/kernel/fork.c b/kernel/fork.c +index 79ee71f..222457a 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -979,6 +979,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) + #ifdef CONFIG_CGROUPS + init_rwsem(&sig->threadgroup_fork_lock); + #endif ++#ifdef CONFIG_CPUSETS ++ seqcount_init(&tsk->mems_allowed_seq); ++#endif + + sig->oom_adj = current->signal->oom_adj; + sig->oom_score_adj = current->signal->oom_score_adj; +diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c +index 7c0d578..013bd2e 100644 +--- a/kernel/power/hibernate.c ++++ b/kernel/power/hibernate.c +@@ -367,6 +367,7 @@ int hibernation_snapshot(int platform_mode) + } + + suspend_console(); ++ ftrace_stop(); + pm_restrict_gfp_mask(); + error = dpm_suspend(PMSG_FREEZE); + if (error) +@@ -392,6 +393,7 @@ int hibernation_snapshot(int platform_mode) + if (error || !in_suspend) + pm_restore_gfp_mask(); + ++ ftrace_start(); + resume_console(); + dpm_complete(msg); + +@@ -496,6 +498,7 @@ int hibernation_restore(int platform_mode) + + pm_prepare_console(); + suspend_console(); ++ ftrace_stop(); + pm_restrict_gfp_mask(); + error = dpm_suspend_start(PMSG_QUIESCE); + if (!error) { +@@ -503,6 +506,7 @@ int hibernation_restore(int platform_mode) + dpm_resume_end(PMSG_RECOVER); + } + pm_restore_gfp_mask(); ++ ftrace_start(); + resume_console(); + pm_restore_console(); + return error; +@@ -529,6 +533,7 @@ int hibernation_platform_enter(void) + + entering_platform_hibernation = true; + suspend_console(); ++ ftrace_stop(); + error = dpm_suspend_start(PMSG_HIBERNATE); + if (error) { + if (hibernation_ops->recover) +@@ -572,6 +577,7 @@ int hibernation_platform_enter(void) + Resume_devices: + entering_platform_hibernation = false; + dpm_resume_end(PMSG_RESTORE); ++ ftrace_start(); + resume_console(); + + Close: +diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c +index 4953dc0..af48faa 100644 +--- a/kernel/power/suspend.c ++++ b/kernel/power/suspend.c +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + #include + + #include "power.h" +@@ -220,6 +221,7 @@ int suspend_devices_and_enter(suspend_state_t state) + goto Close; + } + suspend_console(); ++ ftrace_stop(); + suspend_test_start(); + error = dpm_suspend_start(PMSG_SUSPEND); + if (error) { +@@ -239,6 +241,7 @@ int suspend_devices_and_enter(suspend_state_t state) + suspend_test_start(); + dpm_resume_end(PMSG_RESUME); + suspend_test_finish("resume devices"); ++ ftrace_start(); + resume_console(); + Close: + if (suspend_ops->end) +diff --git a/kernel/sched.c b/kernel/sched.c +index 52ac69b..9cd8ca7 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -1887,7 +1887,7 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + + static void update_sysctl(void); + static int get_update_sysctl_factor(void); +-static void update_cpu_load(struct rq *this_rq); ++static void update_idle_cpu_load(struct rq *this_rq); + + static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) + { +@@ -3855,22 +3855,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. 
We fix it up based on jiffies. + */ +-static void update_cpu_load(struct rq *this_rq) ++static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, ++ unsigned long pending_updates) + { +- unsigned long this_load = this_rq->load.weight; +- unsigned long curr_jiffies = jiffies; +- unsigned long pending_updates; + int i, scale; + + this_rq->nr_load_updates++; + +- /* Avoid repeated calls on same jiffy, when moving in and out of idle */ +- if (curr_jiffies == this_rq->last_load_update_tick) +- return; +- +- pending_updates = curr_jiffies - this_rq->last_load_update_tick; +- this_rq->last_load_update_tick = curr_jiffies; +- + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { +@@ -3895,9 +3886,78 @@ static void update_cpu_load(struct rq *this_rq) + sched_avg_update(this_rq); + } + ++#ifdef CONFIG_NO_HZ ++/* ++ * There is no sane way to deal with nohz on smp when using jiffies because the ++ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading ++ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. ++ * ++ * Therefore we cannot use the delta approach from the regular tick since that ++ * would seriously skew the load calculation. However we'll make do for those ++ * updates happening while idle (nohz_idle_balance) or coming out of idle ++ * (tick_nohz_idle_exit). ++ * ++ * This means we might still be one tick off for nohz periods. ++ */ ++ ++/* ++ * Called from nohz_idle_balance() to update the load ratings before doing the ++ * idle balance. ++ */ ++static void update_idle_cpu_load(struct rq *this_rq) ++{ ++ unsigned long curr_jiffies = ACCESS_ONCE(jiffies); ++ unsigned long load = this_rq->load.weight; ++ unsigned long pending_updates; ++ ++ /* ++ * bail if there's load or we're actually up-to-date. ++ */ ++ if (load || curr_jiffies == this_rq->last_load_update_tick) ++ return; ++ ++ pending_updates = curr_jiffies - this_rq->last_load_update_tick; ++ this_rq->last_load_update_tick = curr_jiffies; ++ ++ __update_cpu_load(this_rq, load, pending_updates); ++} ++ ++/* ++ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. ++ */ ++void update_cpu_load_nohz(void) ++{ ++ struct rq *this_rq = this_rq(); ++ unsigned long curr_jiffies = ACCESS_ONCE(jiffies); ++ unsigned long pending_updates; ++ ++ if (curr_jiffies == this_rq->last_load_update_tick) ++ return; ++ ++ raw_spin_lock(&this_rq->lock); ++ pending_updates = curr_jiffies - this_rq->last_load_update_tick; ++ if (pending_updates) { ++ this_rq->last_load_update_tick = curr_jiffies; ++ /* ++ * We were idle, this means load 0, the current load might be ++ * !0 due to remote wakeups and the sort. ++ */ ++ __update_cpu_load(this_rq, 0, pending_updates); ++ } ++ raw_spin_unlock(&this_rq->lock); ++} ++#endif /* CONFIG_NO_HZ */ ++ ++/* ++ * Called from scheduler_tick() ++ */ + static void update_cpu_load_active(struct rq *this_rq) + { +- update_cpu_load(this_rq); ++ /* ++ * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 
++ */ ++ this_rq->last_load_update_tick = jiffies; ++ __update_cpu_load(this_rq, this_rq->load.weight, 1); + + calc_load_account_active(this_rq); + } +diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c +index 8a39fa3..66e4576 100644 +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -4735,7 +4735,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) + + raw_spin_lock_irq(&this_rq->lock); + update_rq_clock(this_rq); +- update_cpu_load(this_rq); ++ update_idle_cpu_load(this_rq); + raw_spin_unlock_irq(&this_rq->lock); + + rebalance_domains(balance_cpu, CPU_IDLE); +diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c +index 9955ebd..793548c 100644 +--- a/kernel/time/tick-sched.c ++++ b/kernel/time/tick-sched.c +@@ -549,6 +549,7 @@ void tick_nohz_restart_sched_tick(void) + /* Update jiffies first */ + select_nohz_load_balancer(0); + tick_do_update_jiffies64(now); ++ update_cpu_load_nohz(); + + #ifndef CONFIG_VIRT_CPU_ACCOUNTING + /* +diff --git a/kernel/workqueue.c b/kernel/workqueue.c +index 7947e16..a650bee 100644 +--- a/kernel/workqueue.c ++++ b/kernel/workqueue.c +@@ -3586,6 +3586,41 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, + return notifier_from_errno(0); + } + ++/* ++ * Workqueues should be brought up before normal priority CPU notifiers. ++ * This will be registered high priority CPU notifier. ++ */ ++static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, ++ unsigned long action, ++ void *hcpu) ++{ ++ switch (action & ~CPU_TASKS_FROZEN) { ++ case CPU_UP_PREPARE: ++ case CPU_UP_CANCELED: ++ case CPU_DOWN_FAILED: ++ case CPU_ONLINE: ++ return workqueue_cpu_callback(nfb, action, hcpu); ++ } ++ return NOTIFY_OK; ++} ++ ++/* ++ * Workqueues should be brought down after normal priority CPU notifiers. ++ * This will be registered as low priority CPU notifier. ++ */ ++static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, ++ unsigned long action, ++ void *hcpu) ++{ ++ switch (action & ~CPU_TASKS_FROZEN) { ++ case CPU_DOWN_PREPARE: ++ case CPU_DYING: ++ case CPU_POST_DEAD: ++ return workqueue_cpu_callback(nfb, action, hcpu); ++ } ++ return NOTIFY_OK; ++} ++ + #ifdef CONFIG_SMP + + struct work_for_cpu { +@@ -3779,7 +3814,8 @@ static int __init init_workqueues(void) + unsigned int cpu; + int i; + +- cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); ++ cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); ++ cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); + + /* initialize gcwqs */ + for_each_gcwq_cpu(cpu) { +diff --git a/mm/compaction.c b/mm/compaction.c +index 50f1c60..46973fb 100644 +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -372,7 +372,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, + } + + if (!cc->sync) +- mode |= ISOLATE_CLEAN; ++ mode |= ISOLATE_ASYNC_MIGRATE; + + /* Try isolate the page */ + if (__isolate_lru_page(page, mode, 0) != 0) +@@ -577,7 +577,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) + nr_migrate = cc->nr_migratepages; + err = migrate_pages(&cc->migratepages, compaction_alloc, + (unsigned long)cc, false, +- cc->sync); ++ cc->sync ? 
MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); + update_nr_listpages(cc); + nr_remaining = cc->nr_migratepages; + +diff --git a/mm/filemap.c b/mm/filemap.c +index 03c5b0e..556858c 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -500,10 +500,13 @@ struct page *__page_cache_alloc(gfp_t gfp) + struct page *page; + + if (cpuset_do_page_mem_spread()) { +- get_mems_allowed(); +- n = cpuset_mem_spread_node(); +- page = alloc_pages_exact_node(n, gfp, 0); +- put_mems_allowed(); ++ unsigned int cpuset_mems_cookie; ++ do { ++ cpuset_mems_cookie = get_mems_allowed(); ++ n = cpuset_mem_spread_node(); ++ page = alloc_pages_exact_node(n, gfp, 0); ++ } while (!put_mems_allowed(cpuset_mems_cookie) && !page); ++ + return page; + } + return alloc_pages(gfp, 0); +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 7c535b0..b1e1bad 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -538,8 +538,10 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, + struct zonelist *zonelist; + struct zone *zone; + struct zoneref *z; ++ unsigned int cpuset_mems_cookie; + +- get_mems_allowed(); ++retry_cpuset: ++ cpuset_mems_cookie = get_mems_allowed(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask, &mpol, &nodemask); + /* +@@ -566,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, + } + } + } +-err: ++ + mpol_cond_put(mpol); +- put_mems_allowed(); ++ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ goto retry_cpuset; + return page; ++ ++err: ++ mpol_cond_put(mpol); ++ return NULL; + } + + static void update_and_free_page(struct hstate *h, struct page *page) +diff --git a/mm/memory-failure.c b/mm/memory-failure.c +index 06d3479..5bd5bb1 100644 +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -1427,8 +1427,8 @@ static int soft_offline_huge_page(struct page *page, int flags) + /* Keep page count to indicate a given hugepage is isolated. 
*/ + + list_add(&hpage->lru, &pagelist); +- ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, +- true); ++ ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false, ++ MIGRATE_SYNC); + if (ret) { + struct page *page1, *page2; + list_for_each_entry_safe(page1, page2, &pagelist, lru) +@@ -1557,7 +1557,7 @@ int soft_offline_page(struct page *page, int flags) + page_is_file_cache(page)); + list_add(&page->lru, &pagelist); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, +- 0, true); ++ false, MIGRATE_SYNC); + if (ret) { + putback_lru_pages(&pagelist); + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", +diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c +index 2168489..6629faf 100644 +--- a/mm/memory_hotplug.c ++++ b/mm/memory_hotplug.c +@@ -809,7 +809,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) + } + /* this function returns # of failed pages */ + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, +- true, true); ++ true, MIGRATE_SYNC); + if (ret) + putback_lru_pages(&source); + } +diff --git a/mm/mempolicy.c b/mm/mempolicy.c +index b26aae2..c0007f9 100644 +--- a/mm/mempolicy.c ++++ b/mm/mempolicy.c +@@ -942,7 +942,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, + + if (!list_empty(&pagelist)) { + err = migrate_pages(&pagelist, new_node_page, dest, +- false, true); ++ false, MIGRATE_SYNC); + if (err) + putback_lru_pages(&pagelist); + } +@@ -1843,18 +1843,24 @@ struct page * + alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr, int node) + { +- struct mempolicy *pol = get_vma_policy(current, vma, addr); ++ struct mempolicy *pol; + struct zonelist *zl; + struct page *page; ++ unsigned int cpuset_mems_cookie; ++ ++retry_cpuset: ++ pol = get_vma_policy(current, vma, addr); ++ cpuset_mems_cookie = get_mems_allowed(); + +- get_mems_allowed(); + if (unlikely(pol->mode == MPOL_INTERLEAVE)) { + unsigned nid; + + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); + mpol_cond_put(pol); + page = alloc_page_interleave(gfp, order, nid); +- put_mems_allowed(); ++ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ goto retry_cpuset; ++ + return page; + } + zl = policy_zonelist(gfp, pol, node); +@@ -1865,7 +1871,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + struct page *page = __alloc_pages_nodemask(gfp, order, + zl, policy_nodemask(gfp, pol)); + __mpol_put(pol); +- put_mems_allowed(); ++ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ goto retry_cpuset; + return page; + } + /* +@@ -1873,7 +1880,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + */ + page = __alloc_pages_nodemask(gfp, order, zl, + policy_nodemask(gfp, pol)); +- put_mems_allowed(); ++ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ goto retry_cpuset; + return page; + } + +@@ -1900,11 +1908,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) + { + struct mempolicy *pol = current->mempolicy; + struct page *page; ++ unsigned int cpuset_mems_cookie; + + if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) + pol = &default_policy; + +- get_mems_allowed(); ++retry_cpuset: ++ cpuset_mems_cookie = get_mems_allowed(); ++ + /* + * No reference counting needed for current->mempolicy + * nor system default_policy +@@ -1915,7 +1926,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) + page = __alloc_pages_nodemask(gfp, order, + policy_zonelist(gfp, pol, numa_node_id()), + 
policy_nodemask(gfp, pol)); +- put_mems_allowed(); ++ ++ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ goto retry_cpuset; ++ + return page; + } + EXPORT_SYMBOL(alloc_pages_current); +diff --git a/mm/migrate.c b/mm/migrate.c +index 177aca4..180d97f 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -220,6 +220,56 @@ out: + pte_unmap_unlock(ptep, ptl); + } + ++#ifdef CONFIG_BLOCK ++/* Returns true if all buffers are successfully locked */ ++static bool buffer_migrate_lock_buffers(struct buffer_head *head, ++ enum migrate_mode mode) ++{ ++ struct buffer_head *bh = head; ++ ++ /* Simple case, sync compaction */ ++ if (mode != MIGRATE_ASYNC) { ++ do { ++ get_bh(bh); ++ lock_buffer(bh); ++ bh = bh->b_this_page; ++ ++ } while (bh != head); ++ ++ return true; ++ } ++ ++ /* async case, we cannot block on lock_buffer so use trylock_buffer */ ++ do { ++ get_bh(bh); ++ if (!trylock_buffer(bh)) { ++ /* ++ * We failed to lock the buffer and cannot stall in ++ * async migration. Release the taken locks ++ */ ++ struct buffer_head *failed_bh = bh; ++ put_bh(failed_bh); ++ bh = head; ++ while (bh != failed_bh) { ++ unlock_buffer(bh); ++ put_bh(bh); ++ bh = bh->b_this_page; ++ } ++ return false; ++ } ++ ++ bh = bh->b_this_page; ++ } while (bh != head); ++ return true; ++} ++#else ++static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, ++ enum migrate_mode mode) ++{ ++ return true; ++} ++#endif /* CONFIG_BLOCK */ ++ + /* + * Replace the page in the mapping. + * +@@ -229,7 +279,8 @@ out: + * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. + */ + static int migrate_page_move_mapping(struct address_space *mapping, +- struct page *newpage, struct page *page) ++ struct page *newpage, struct page *page, ++ struct buffer_head *head, enum migrate_mode mode) + { + int expected_count; + void **pslot; +@@ -259,6 +310,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, + } + + /* ++ * In the async migration case of moving a page with buffers, lock the ++ * buffers using trylock before the mapping is moved. If the mapping ++ * was moved, we later failed to lock the buffers and could not move ++ * the mapping back due to an elevated page count, we would have to ++ * block waiting on other references to be dropped. ++ */ ++ if (mode == MIGRATE_ASYNC && head && ++ !buffer_migrate_lock_buffers(head, mode)) { ++ page_unfreeze_refs(page, expected_count); ++ spin_unlock_irq(&mapping->tree_lock); ++ return -EAGAIN; ++ } ++ ++ /* + * Now we know that no one else is looking at the page. + */ + get_page(newpage); /* add cache reference */ +@@ -415,13 +480,14 @@ EXPORT_SYMBOL(fail_migrate_page); + * Pages are locked upon entry and exit. + */ + int migrate_page(struct address_space *mapping, +- struct page *newpage, struct page *page) ++ struct page *newpage, struct page *page, ++ enum migrate_mode mode) + { + int rc; + + BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + +- rc = migrate_page_move_mapping(mapping, newpage, page); ++ rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); + + if (rc) + return rc; +@@ -438,28 +504,28 @@ EXPORT_SYMBOL(migrate_page); + * exist. 
+ */ + int buffer_migrate_page(struct address_space *mapping, +- struct page *newpage, struct page *page) ++ struct page *newpage, struct page *page, enum migrate_mode mode) + { + struct buffer_head *bh, *head; + int rc; + + if (!page_has_buffers(page)) +- return migrate_page(mapping, newpage, page); ++ return migrate_page(mapping, newpage, page, mode); + + head = page_buffers(page); + +- rc = migrate_page_move_mapping(mapping, newpage, page); ++ rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); + + if (rc) + return rc; + +- bh = head; +- do { +- get_bh(bh); +- lock_buffer(bh); +- bh = bh->b_this_page; +- +- } while (bh != head); ++ /* ++ * In the async case, migrate_page_move_mapping locked the buffers ++ * with an IRQ-safe spinlock held. In the sync case, the buffers ++ * need to be locked now ++ */ ++ if (mode != MIGRATE_ASYNC) ++ BUG_ON(!buffer_migrate_lock_buffers(head, mode)); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); +@@ -536,10 +602,14 @@ static int writeout(struct address_space *mapping, struct page *page) + * Default handling if a filesystem does not provide a migration function. + */ + static int fallback_migrate_page(struct address_space *mapping, +- struct page *newpage, struct page *page) ++ struct page *newpage, struct page *page, enum migrate_mode mode) + { +- if (PageDirty(page)) ++ if (PageDirty(page)) { ++ /* Only writeback pages in full synchronous migration */ ++ if (mode != MIGRATE_SYNC) ++ return -EBUSY; + return writeout(mapping, page); ++ } + + /* + * Buffers may be managed in a filesystem specific way. +@@ -549,7 +619,7 @@ static int fallback_migrate_page(struct address_space *mapping, + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; + +- return migrate_page(mapping, newpage, page); ++ return migrate_page(mapping, newpage, page, mode); + } + + /* +@@ -564,7 +634,7 @@ static int fallback_migrate_page(struct address_space *mapping, + * == 0 - success + */ + static int move_to_new_page(struct page *newpage, struct page *page, +- int remap_swapcache, bool sync) ++ int remap_swapcache, enum migrate_mode mode) + { + struct address_space *mapping; + int rc; +@@ -585,29 +655,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, + + mapping = page_mapping(page); + if (!mapping) +- rc = migrate_page(mapping, newpage, page); +- else { ++ rc = migrate_page(mapping, newpage, page, mode); ++ else if (mapping->a_ops->migratepage) + /* +- * Do not writeback pages if !sync and migratepage is +- * not pointing to migrate_page() which is nonblocking +- * (swapcache/tmpfs uses migratepage = migrate_page). ++ * Most pages have a mapping and most filesystems provide a ++ * migratepage callback. Anonymous pages are part of swap ++ * space which also has its own migratepage callback. This ++ * is the most common path for page migration. + */ +- if (PageDirty(page) && !sync && +- mapping->a_ops->migratepage != migrate_page) +- rc = -EBUSY; +- else if (mapping->a_ops->migratepage) +- /* +- * Most pages have a mapping and most filesystems +- * should provide a migration function. Anonymous +- * pages are part of swap space which also has its +- * own migration function. This is the most common +- * path for page migration. 
+- */ +- rc = mapping->a_ops->migratepage(mapping, +- newpage, page); +- else +- rc = fallback_migrate_page(mapping, newpage, page); +- } ++ rc = mapping->a_ops->migratepage(mapping, ++ newpage, page, mode); ++ else ++ rc = fallback_migrate_page(mapping, newpage, page, mode); + + if (rc) { + newpage->mapping = NULL; +@@ -622,7 +681,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, + } + + static int __unmap_and_move(struct page *page, struct page *newpage, +- int force, bool offlining, bool sync) ++ int force, bool offlining, enum migrate_mode mode) + { + int rc = -EAGAIN; + int remap_swapcache = 1; +@@ -631,7 +690,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, + struct anon_vma *anon_vma = NULL; + + if (!trylock_page(page)) { +- if (!force || !sync) ++ if (!force || mode == MIGRATE_ASYNC) + goto out; + + /* +@@ -677,10 +736,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, + + if (PageWriteback(page)) { + /* +- * For !sync, there is no point retrying as the retry loop +- * is expected to be too short for PageWriteback to be cleared ++ * Only in the case of a full syncronous migration is it ++ * necessary to wait for PageWriteback. In the async case, ++ * the retry loop is too short and in the sync-light case, ++ * the overhead of stalling is too much + */ +- if (!sync) { ++ if (mode != MIGRATE_SYNC) { + rc = -EBUSY; + goto uncharge; + } +@@ -751,7 +812,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, + + skip_unmap: + if (!page_mapped(page)) +- rc = move_to_new_page(newpage, page, remap_swapcache, sync); ++ rc = move_to_new_page(newpage, page, remap_swapcache, mode); + + if (rc && remap_swapcache) + remove_migration_ptes(page, page); +@@ -774,7 +835,8 @@ out: + * to the newly allocated page in newpage. 
+ */ + static int unmap_and_move(new_page_t get_new_page, unsigned long private, +- struct page *page, int force, bool offlining, bool sync) ++ struct page *page, int force, bool offlining, ++ enum migrate_mode mode) + { + int rc = 0; + int *result = NULL; +@@ -792,7 +854,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, + if (unlikely(split_huge_page(page))) + goto out; + +- rc = __unmap_and_move(page, newpage, force, offlining, sync); ++ rc = __unmap_and_move(page, newpage, force, offlining, mode); + out: + if (rc != -EAGAIN) { + /* +@@ -840,7 +902,8 @@ out: + */ + static int unmap_and_move_huge_page(new_page_t get_new_page, + unsigned long private, struct page *hpage, +- int force, bool offlining, bool sync) ++ int force, bool offlining, ++ enum migrate_mode mode) + { + int rc = 0; + int *result = NULL; +@@ -853,7 +916,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, + rc = -EAGAIN; + + if (!trylock_page(hpage)) { +- if (!force || !sync) ++ if (!force || mode != MIGRATE_SYNC) + goto out; + lock_page(hpage); + } +@@ -864,7 +927,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, + try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + + if (!page_mapped(hpage)) +- rc = move_to_new_page(new_hpage, hpage, 1, sync); ++ rc = move_to_new_page(new_hpage, hpage, 1, mode); + + if (rc) + remove_migration_ptes(hpage, hpage); +@@ -907,7 +970,7 @@ out: + */ + int migrate_pages(struct list_head *from, + new_page_t get_new_page, unsigned long private, bool offlining, +- bool sync) ++ enum migrate_mode mode) + { + int retry = 1; + int nr_failed = 0; +@@ -928,7 +991,7 @@ int migrate_pages(struct list_head *from, + + rc = unmap_and_move(get_new_page, private, + page, pass > 2, offlining, +- sync); ++ mode); + + switch(rc) { + case -ENOMEM: +@@ -958,7 +1021,7 @@ out: + + int migrate_huge_pages(struct list_head *from, + new_page_t get_new_page, unsigned long private, bool offlining, +- bool sync) ++ enum migrate_mode mode) + { + int retry = 1; + int nr_failed = 0; +@@ -975,7 +1038,7 @@ int migrate_huge_pages(struct list_head *from, + + rc = unmap_and_move_huge_page(get_new_page, + private, page, pass > 2, offlining, +- sync); ++ mode); + + switch(rc) { + case -ENOMEM: +@@ -1104,7 +1167,7 @@ set_status: + err = 0; + if (!list_empty(&pagelist)) { + err = migrate_pages(&pagelist, new_page_node, +- (unsigned long)pm, 0, true); ++ (unsigned long)pm, 0, MIGRATE_SYNC); + if (err) + putback_lru_pages(&pagelist); + } +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 485be89..065dbe8 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1886,14 +1886,20 @@ static struct page * + __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, +- int migratetype, unsigned long *did_some_progress, +- bool sync_migration) ++ int migratetype, bool sync_migration, ++ bool *deferred_compaction, ++ unsigned long *did_some_progress) + { + struct page *page; + +- if (!order || compaction_deferred(preferred_zone)) ++ if (!order) + return NULL; + ++ if (compaction_deferred(preferred_zone)) { ++ *deferred_compaction = true; ++ return NULL; ++ } ++ + current->flags |= PF_MEMALLOC; + *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, + nodemask, sync_migration); +@@ -1921,7 +1927,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + * but not enough to satisfy watermarks. 
+ */ + count_vm_event(COMPACTFAIL); +- defer_compaction(preferred_zone); ++ ++ /* ++ * As async compaction considers a subset of pageblocks, only ++ * defer if the failure was a sync compaction failure. ++ */ ++ if (sync_migration) ++ defer_compaction(preferred_zone); + + cond_resched(); + } +@@ -1933,8 +1945,9 @@ static inline struct page * + __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, +- int migratetype, unsigned long *did_some_progress, +- bool sync_migration) ++ int migratetype, bool sync_migration, ++ bool *deferred_compaction, ++ unsigned long *did_some_progress) + { + return NULL; + } +@@ -2084,6 +2097,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + unsigned long pages_reclaimed = 0; + unsigned long did_some_progress; + bool sync_migration = false; ++ bool deferred_compaction = false; + + /* + * In the slowpath, we sanity check order to avoid ever trying to +@@ -2164,12 +2178,22 @@ rebalance: + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, +- migratetype, &did_some_progress, +- sync_migration); ++ migratetype, sync_migration, ++ &deferred_compaction, ++ &did_some_progress); + if (page) + goto got_pg; + sync_migration = true; + ++ /* ++ * If compaction is deferred for high-order allocations, it is because ++ * sync compaction recently failed. In this is the case and the caller ++ * has requested the system not be heavily disrupted, fail the ++ * allocation now instead of entering direct reclaim ++ */ ++ if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) ++ goto nopage; ++ + /* Try direct reclaim and then allocating */ + page = __alloc_pages_direct_reclaim(gfp_mask, order, + zonelist, high_zoneidx, +@@ -2232,8 +2256,9 @@ rebalance: + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, +- migratetype, &did_some_progress, +- sync_migration); ++ migratetype, sync_migration, ++ &deferred_compaction, ++ &did_some_progress); + if (page) + goto got_pg; + } +@@ -2257,8 +2282,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + { + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct zone *preferred_zone; +- struct page *page; ++ struct page *page = NULL; + int migratetype = allocflags_to_migratetype(gfp_mask); ++ unsigned int cpuset_mems_cookie; + + gfp_mask &= gfp_allowed_mask; + +@@ -2277,15 +2303,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + if (unlikely(!zonelist->_zonerefs->zone)) + return NULL; + +- get_mems_allowed(); ++retry_cpuset: ++ cpuset_mems_cookie = get_mems_allowed(); ++ + /* The preferred zone is used for statistics later */ + first_zones_zonelist(zonelist, high_zoneidx, + nodemask ? : &cpuset_current_mems_allowed, + &preferred_zone); +- if (!preferred_zone) { +- put_mems_allowed(); +- return NULL; +- } ++ if (!preferred_zone) ++ goto out; + + /* First allocation attempt */ + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, +@@ -2295,9 +2321,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + page = __alloc_pages_slowpath(gfp_mask, order, + zonelist, high_zoneidx, nodemask, + preferred_zone, migratetype); +- put_mems_allowed(); + + trace_mm_page_alloc(page, order, gfp_mask, migratetype); ++ ++out: ++ /* ++ * When updating a task's mems_allowed, it is possible to race with ++ * parallel threads in such a way that an allocation can fail while ++ * the mask is being updated. 
If a page allocation is about to fail, ++ * check if the cpuset changed during allocation and if so, retry. ++ */ ++ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ goto retry_cpuset; ++ + return page; + } + EXPORT_SYMBOL(__alloc_pages_nodemask); +@@ -2521,13 +2557,15 @@ void si_meminfo_node(struct sysinfo *val, int nid) + bool skip_free_areas_node(unsigned int flags, int nid) + { + bool ret = false; ++ unsigned int cpuset_mems_cookie; + + if (!(flags & SHOW_MEM_FILTER_NODES)) + goto out; + +- get_mems_allowed(); +- ret = !node_isset(nid, cpuset_current_mems_allowed); +- put_mems_allowed(); ++ do { ++ cpuset_mems_cookie = get_mems_allowed(); ++ ret = !node_isset(nid, cpuset_current_mems_allowed); ++ } while (!put_mems_allowed(cpuset_mems_cookie)); + out: + return ret; + } +@@ -3407,25 +3445,33 @@ static void setup_zone_migrate_reserve(struct zone *zone) + if (page_to_nid(page) != zone_to_nid(zone)) + continue; + +- /* Blocks with reserved pages will never free, skip them. */ +- block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); +- if (pageblock_is_reserved(pfn, block_end_pfn)) +- continue; +- + block_migratetype = get_pageblock_migratetype(page); + +- /* If this block is reserved, account for it */ +- if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { +- reserve--; +- continue; +- } ++ /* Only test what is necessary when the reserves are not met */ ++ if (reserve > 0) { ++ /* ++ * Blocks with reserved pages will never free, skip ++ * them. ++ */ ++ block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); ++ if (pageblock_is_reserved(pfn, block_end_pfn)) ++ continue; + +- /* Suitable for reserving if this block is movable */ +- if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { +- set_pageblock_migratetype(page, MIGRATE_RESERVE); +- move_freepages_block(zone, page, MIGRATE_RESERVE); +- reserve--; +- continue; ++ /* If this block is reserved, account for it */ ++ if (block_migratetype == MIGRATE_RESERVE) { ++ reserve--; ++ continue; ++ } ++ ++ /* Suitable for reserving if this block is movable */ ++ if (block_migratetype == MIGRATE_MOVABLE) { ++ set_pageblock_migratetype(page, ++ MIGRATE_RESERVE); ++ move_freepages_block(zone, page, ++ MIGRATE_RESERVE); ++ reserve--; ++ continue; ++ } + } + + /* +diff --git a/mm/slab.c b/mm/slab.c +index 83311c9a..cd3ab93 100644 +--- a/mm/slab.c ++++ b/mm/slab.c +@@ -3267,12 +3267,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) + if (in_interrupt() || (flags & __GFP_THISNODE)) + return NULL; + nid_alloc = nid_here = numa_mem_id(); +- get_mems_allowed(); + if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) + nid_alloc = cpuset_slab_spread_node(); + else if (current->mempolicy) + nid_alloc = slab_node(current->mempolicy); +- put_mems_allowed(); + if (nid_alloc != nid_here) + return ____cache_alloc_node(cachep, flags, nid_alloc); + return NULL; +@@ -3295,14 +3293,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) + enum zone_type high_zoneidx = gfp_zone(flags); + void *obj = NULL; + int nid; ++ unsigned int cpuset_mems_cookie; + + if (flags & __GFP_THISNODE) + return NULL; + +- get_mems_allowed(); +- zonelist = node_zonelist(slab_node(current->mempolicy), flags); + local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); + ++retry_cpuset: ++ cpuset_mems_cookie = get_mems_allowed(); ++ zonelist = node_zonelist(slab_node(current->mempolicy), flags); ++ + retry: + /* + * Look through allowed nodes for objects available +@@ -3355,7 +3356,9 @@ retry: + 
} + } + } +- put_mems_allowed(); ++ ++ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) ++ goto retry_cpuset; + return obj; + } + +diff --git a/mm/slub.c b/mm/slub.c +index af47188..5710788 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -1582,6 +1582,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(flags); + void *object; ++ unsigned int cpuset_mems_cookie; + + /* + * The defrag ratio allows a configuration of the tradeoffs between +@@ -1605,23 +1606,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, + get_cycles() % 1024 > s->remote_node_defrag_ratio) + return NULL; + +- get_mems_allowed(); +- zonelist = node_zonelist(slab_node(current->mempolicy), flags); +- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { +- struct kmem_cache_node *n; +- +- n = get_node(s, zone_to_nid(zone)); +- +- if (n && cpuset_zone_allowed_hardwall(zone, flags) && +- n->nr_partial > s->min_partial) { +- object = get_partial_node(s, n, c); +- if (object) { +- put_mems_allowed(); +- return object; ++ do { ++ cpuset_mems_cookie = get_mems_allowed(); ++ zonelist = node_zonelist(slab_node(current->mempolicy), flags); ++ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { ++ struct kmem_cache_node *n; ++ ++ n = get_node(s, zone_to_nid(zone)); ++ ++ if (n && cpuset_zone_allowed_hardwall(zone, flags) && ++ n->nr_partial > s->min_partial) { ++ object = get_partial_node(s, n, c); ++ if (object) { ++ /* ++ * Return the object even if ++ * put_mems_allowed indicated that ++ * the cpuset mems_allowed was ++ * updated in parallel. It's a ++ * harmless race between the alloc ++ * and the cpuset update. ++ */ ++ put_mems_allowed(cpuset_mems_cookie); ++ return object; ++ } + } + } +- } +- put_mems_allowed(); ++ } while (!put_mems_allowed(cpuset_mems_cookie)); + #endif + return NULL; + } +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 8342119..48febd7 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -715,7 +715,13 @@ static enum page_references page_check_references(struct page *page, + */ + SetPageReferenced(page); + +- if (referenced_page) ++ if (referenced_page || referenced_ptes > 1) ++ return PAGEREF_ACTIVATE; ++ ++ /* ++ * Activate file-backed executable pages after first usage. ++ */ ++ if (vm_flags & VM_EXEC) + return PAGEREF_ACTIVATE; + + return PAGEREF_KEEP; +@@ -1061,8 +1067,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) + + ret = -EBUSY; + +- if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) +- return ret; ++ /* ++ * To minimise LRU disruption, the caller can indicate that it only ++ * wants to isolate pages it will be able to operate on without ++ * blocking - clean pages for the most part. ++ * ++ * ISOLATE_CLEAN means that only clean pages should be isolated. 
This ++ * is used by reclaim when it is cannot write to backing storage ++ * ++ * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages ++ * that it is possible to migrate without blocking ++ */ ++ if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { ++ /* All the caller can do on PageWriteback is block */ ++ if (PageWriteback(page)) ++ return ret; ++ ++ if (PageDirty(page)) { ++ struct address_space *mapping; ++ ++ /* ISOLATE_CLEAN means only clean pages */ ++ if (mode & ISOLATE_CLEAN) ++ return ret; ++ ++ /* ++ * Only pages without mappings or that have a ++ * ->migratepage callback are possible to migrate ++ * without blocking ++ */ ++ mapping = page_mapping(page); ++ if (mapping && !mapping->a_ops->migratepage) ++ return ret; ++ } ++ } + + if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) + return ret; +@@ -1178,7 +1215,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, + * anon page which don't already have a swap slot is + * pointless. + */ +- if (nr_swap_pages <= 0 && PageAnon(cursor_page) && ++ if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && + !PageSwapCache(cursor_page)) + break; + +@@ -1874,7 +1911,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, + * latencies, so it's better to scan a minimum amount there as + * well. + */ +- if (scanning_global_lru(sc) && current_is_kswapd()) ++ if (scanning_global_lru(sc) && current_is_kswapd() && ++ zone->all_unreclaimable) + force_scan = true; + if (!scanning_global_lru(sc)) + force_scan = true; +@@ -2012,8 +2050,9 @@ static inline bool should_continue_reclaim(struct zone *zone, + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = (2UL << sc->order); +- inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + +- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); ++ inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); ++ if (nr_swap_pages > 0) ++ inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + if (sc->nr_reclaimed < pages_for_compaction && + inactive_lru_pages > pages_for_compaction) + return true; +@@ -2088,6 +2127,42 @@ restart: + throttle_vm_writeout(sc->gfp_mask); + } + ++/* Returns true if compaction should go ahead for a high-order request */ ++static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) ++{ ++ unsigned long balance_gap, watermark; ++ bool watermark_ok; ++ ++ /* Do not consider compaction for orders reclaim is meant to satisfy */ ++ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) ++ return false; ++ ++ /* ++ * Compaction takes time to run and there are potentially other ++ * callers using the pages just freed. 
Continue reclaiming until ++ * there is a buffer of free pages available to give compaction ++ * a reasonable chance of completing and allocating the page ++ */ ++ balance_gap = min(low_wmark_pages(zone), ++ (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / ++ KSWAPD_ZONE_BALANCE_GAP_RATIO); ++ watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); ++ watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); ++ ++ /* ++ * If compaction is deferred, reclaim up to a point where ++ * compaction will have a chance of success when re-enabled ++ */ ++ if (compaction_deferred(zone)) ++ return watermark_ok; ++ ++ /* If compaction is not ready to start, keep reclaiming */ ++ if (!compaction_suitable(zone, sc->order)) ++ return false; ++ ++ return watermark_ok; ++} ++ + /* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation +@@ -2105,8 +2180,9 @@ restart: + * scan then give up on it. + * + * This function returns true if a zone is being reclaimed for a costly +- * high-order allocation and compaction is either ready to begin or deferred. +- * This indicates to the caller that it should retry the allocation or fail. ++ * high-order allocation and compaction is ready to begin. This indicates to ++ * the caller that it should consider retrying the allocation instead of ++ * further reclaim. + */ + static bool shrink_zones(int priority, struct zonelist *zonelist, + struct scan_control *sc) +@@ -2115,7 +2191,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, + struct zone *zone; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; +- bool should_abort_reclaim = false; ++ bool aborted_reclaim = false; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { +@@ -2140,10 +2216,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, + * noticable problem, like transparent huge page + * allocations. + */ +- if (sc->order > PAGE_ALLOC_COSTLY_ORDER && +- (compaction_suitable(zone, sc->order) || +- compaction_deferred(zone))) { +- should_abort_reclaim = true; ++ if (compaction_ready(zone, sc)) { ++ aborted_reclaim = true; + continue; + } + } +@@ -2165,7 +2239,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, + shrink_zone(priority, zone, sc); + } + +- return should_abort_reclaim; ++ return aborted_reclaim; + } + + static bool zone_reclaimable(struct zone *zone) +@@ -2219,8 +2293,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + struct zoneref *z; + struct zone *zone; + unsigned long writeback_threshold; ++ bool aborted_reclaim; + +- get_mems_allowed(); + delayacct_freepages_start(); + + if (scanning_global_lru(sc)) +@@ -2230,8 +2304,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + sc->nr_scanned = 0; + if (!priority) + disable_swap_token(sc->mem_cgroup); +- if (shrink_zones(priority, zonelist, sc)) +- break; ++ aborted_reclaim = shrink_zones(priority, zonelist, sc); + + /* + * Don't shrink slabs when reclaiming memory from +@@ -2285,7 +2358,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + + out: + delayacct_freepages_end(); +- put_mems_allowed(); + + if (sc->nr_reclaimed) + return sc->nr_reclaimed; +@@ -2298,6 +2370,10 @@ out: + if (oom_killer_disabled) + return 0; + ++ /* Aborted reclaim to try compaction? 
don't OOM, then */ ++ if (aborted_reclaim) ++ return 1; ++ + /* top priority shrink_zones still had more to do? don't OOM, then */ + if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) + return 1; +diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c +index c505fd5..c119f33 100644 +--- a/sound/pci/hda/patch_hdmi.c ++++ b/sound/pci/hda/patch_hdmi.c +@@ -868,7 +868,6 @@ static int hdmi_pcm_open(struct hda_pcm_stream *hinfo, + struct hdmi_spec_per_pin *per_pin; + struct hdmi_eld *eld; + struct hdmi_spec_per_cvt *per_cvt = NULL; +- int pinctl; + + /* Validate hinfo */ + pin_idx = hinfo_to_pin_index(spec, hinfo); +@@ -904,11 +903,6 @@ static int hdmi_pcm_open(struct hda_pcm_stream *hinfo, + snd_hda_codec_write(codec, per_pin->pin_nid, 0, + AC_VERB_SET_CONNECT_SEL, + mux_idx); +- pinctl = snd_hda_codec_read(codec, per_pin->pin_nid, 0, +- AC_VERB_GET_PIN_WIDGET_CONTROL, 0); +- snd_hda_codec_write(codec, per_pin->pin_nid, 0, +- AC_VERB_SET_PIN_WIDGET_CONTROL, +- pinctl | PIN_OUT); + snd_hda_spdif_ctls_assign(codec, pin_idx, per_cvt->cvt_nid); + + /* Initially set the converter's capabilities */ +@@ -1147,11 +1141,17 @@ static int generic_hdmi_playback_pcm_prepare(struct hda_pcm_stream *hinfo, + struct hdmi_spec *spec = codec->spec; + int pin_idx = hinfo_to_pin_index(spec, hinfo); + hda_nid_t pin_nid = spec->pins[pin_idx].pin_nid; ++ int pinctl; + + hdmi_set_channel_count(codec, cvt_nid, substream->runtime->channels); + + hdmi_setup_audio_infoframe(codec, pin_idx, substream); + ++ pinctl = snd_hda_codec_read(codec, pin_nid, 0, ++ AC_VERB_GET_PIN_WIDGET_CONTROL, 0); ++ snd_hda_codec_write(codec, pin_nid, 0, ++ AC_VERB_SET_PIN_WIDGET_CONTROL, pinctl | PIN_OUT); ++ + return hdmi_setup_stream(codec, cvt_nid, pin_nid, stream_tag, format); + } + +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index 5f096a5..191fd78 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -5989,6 +5989,7 @@ static const struct hda_codec_preset snd_hda_preset_realtek[] = { + { .id = 0x10ec0275, .name = "ALC275", .patch = patch_alc269 }, + { .id = 0x10ec0276, .name = "ALC276", .patch = patch_alc269 }, + { .id = 0x10ec0280, .name = "ALC280", .patch = patch_alc269 }, ++ { .id = 0x10ec0282, .name = "ALC282", .patch = patch_alc269 }, + { .id = 0x10ec0861, .rev = 0x100340, .name = "ALC660", + .patch = patch_alc861 }, + { .id = 0x10ec0660, .name = "ALC660-VD", .patch = patch_alc861vd }, +diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c +index 90e93bf..0dc441c 100644 +--- a/sound/soc/soc-dapm.c ++++ b/sound/soc/soc-dapm.c +@@ -1381,7 +1381,15 @@ static int dapm_power_widgets(struct snd_soc_dapm_context *dapm, int event) + } + + list_for_each_entry(w, &card->widgets, list) { +- list_del_init(&w->dirty); ++ switch (w->id) { ++ case snd_soc_dapm_pre: ++ case snd_soc_dapm_post: ++ /* These widgets always need to be powered */ ++ break; ++ default: ++ list_del_init(&w->dirty); ++ break; ++ } + + if (w->power) { + d = w->dapm; diff --git a/3.2.34/bump/1025_linux-3.2.26.patch b/3.2.34/bump/1025_linux-3.2.26.patch new file mode 100644 index 0000000..44065b9 --- /dev/null +++ b/3.2.34/bump/1025_linux-3.2.26.patch @@ -0,0 +1,238 @@ +diff --git a/Makefile b/Makefile +index e13e4e7..fa5acc83 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 2 +-SUBLEVEL = 25 ++SUBLEVEL = 26 + EXTRAVERSION = + NAME = Saber-toothed Squirrel + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 
bb3ee36..f7c89e2 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -99,7 +99,6 @@ struct cpuinfo_x86 { + u16 apicid; + u16 initial_apicid; + u16 x86_clflush_size; +-#ifdef CONFIG_SMP + /* number of cores as seen by the OS: */ + u16 booted_cores; + /* Physical processor id: */ +@@ -110,7 +109,6 @@ struct cpuinfo_x86 { + u8 compute_unit_id; + /* Index into per_cpu list: */ + u16 cpu_index; +-#endif + u32 microcode; + } __attribute__((__aligned__(SMP_CACHE_BYTES))); + +diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c +index bae1efe..be16854 100644 +--- a/arch/x86/kernel/amd_nb.c ++++ b/arch/x86/kernel/amd_nb.c +@@ -154,16 +154,14 @@ int amd_get_subcaches(int cpu) + { + struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; + unsigned int mask; +- int cuid = 0; ++ int cuid; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return 0; + + pci_read_config_dword(link, 0x1d4, &mask); + +-#ifdef CONFIG_SMP + cuid = cpu_data(cpu).compute_unit_id; +-#endif + return (mask >> (4 * cuid)) & 0xf; + } + +@@ -172,7 +170,7 @@ int amd_set_subcaches(int cpu, int mask) + static unsigned int reset, ban; + struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); + unsigned int reg; +- int cuid = 0; ++ int cuid; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) + return -EINVAL; +@@ -190,9 +188,7 @@ int amd_set_subcaches(int cpu, int mask) + pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); + } + +-#ifdef CONFIG_SMP + cuid = cpu_data(cpu).compute_unit_id; +-#endif + mask <<= 4 * cuid; + mask |= (0xf ^ (1 << cuid)) << 26; + +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index 3524e1f..ff8557e 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -148,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) + + static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) + { +-#ifdef CONFIG_SMP + /* calling is from identify_secondary_cpu() ? */ + if (!c->cpu_index) + return; +@@ -192,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) + + valid_k7: + ; +-#endif + } + + static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index aa003b1..ca93cc7 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -676,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); + +-#ifdef CONFIG_SMP + c->cpu_index = 0; +-#endif + filter_cpuid_features(c, false); + + setup_smep(c); +@@ -764,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) + c->apicid = c->initial_apicid; + # endif + #endif +- +-#ifdef CONFIG_X86_HT + c->phys_proc_id = c->initial_apicid; +-#endif + } + + setup_smep(c); +diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c +index 5231312..3e6ff6c 100644 +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -181,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void) + + static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) + { +-#ifdef CONFIG_SMP + /* calling is from identify_secondary_cpu() ? 
*/ + if (!c->cpu_index) + return; +@@ -198,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) + WARN_ONCE(1, "WARNING: SMP operation may be unreliable" + "with B stepping processors.\n"); + } +-#endif + } + + static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) +diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c +index b0f1271..3b67877 100644 +--- a/arch/x86/kernel/cpu/mcheck/mce.c ++++ b/arch/x86/kernel/cpu/mcheck/mce.c +@@ -119,9 +119,7 @@ void mce_setup(struct mce *m) + m->time = get_seconds(); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->cpuid = cpuid_eax(1); +-#ifdef CONFIG_SMP + m->socketid = cpu_data(m->extcpu).phys_proc_id; +-#endif + m->apicid = cpu_data(m->extcpu).initial_apicid; + rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); + } +diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c +index 445a61c..d4444be 100644 +--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c ++++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c +@@ -65,11 +65,9 @@ struct threshold_bank { + }; + static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); + +-#ifdef CONFIG_SMP + static unsigned char shared_bank[NR_BANKS] = { + 0, 0, 0, 0, 1 + }; +-#endif + + static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ + +@@ -227,10 +225,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) + + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); +-#ifdef CONFIG_SMP ++ + if (shared_bank[bank] && c->cpu_core_id) + break; +-#endif + + memset(&b, 0, sizeof(b)); + b.cpu = cpu; +diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c +index 14b2314..8022c66 100644 +--- a/arch/x86/kernel/cpu/proc.c ++++ b/arch/x86/kernel/cpu/proc.c +@@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) + static int show_cpuinfo(struct seq_file *m, void *v) + { + struct cpuinfo_x86 *c = v; +- unsigned int cpu = 0; ++ unsigned int cpu; + int i; + +-#ifdef CONFIG_SMP + cpu = c->cpu_index; +-#endif + seq_printf(m, "processor\t: %u\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" +diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c +index 18a1293..0db57b5 100644 +--- a/drivers/edac/sb_edac.c ++++ b/drivers/edac/sb_edac.c +@@ -1609,11 +1609,9 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, + mce->cpuvendor, mce->cpuid, mce->time, + mce->socketid, mce->apicid); + +-#ifdef CONFIG_SMP + /* Only handle if it is the right mc controller */ + if (cpu_data(mce->cpu).phys_proc_id != pvt->sbridge_dev->mc) + return NOTIFY_DONE; +-#endif + + smp_rmb(); + if ((pvt->mce_out + 1) % MCE_LOG_LEN == pvt->mce_in) { +diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c +index 0790c98..19b4412 100644 +--- a/drivers/hwmon/coretemp.c ++++ b/drivers/hwmon/coretemp.c +@@ -57,16 +57,15 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius"); + #define TOTAL_ATTRS (MAX_CORE_ATTRS + 1) + #define MAX_CORE_DATA (NUM_REAL_CORES + BASE_SYSFS_ATTR_NO) + +-#ifdef CONFIG_SMP + #define TO_PHYS_ID(cpu) cpu_data(cpu).phys_proc_id + #define TO_CORE_ID(cpu) cpu_data(cpu).cpu_core_id ++#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) ++ ++#ifdef CONFIG_SMP + #define for_each_sibling(i, cpu) for_each_cpu(i, cpu_sibling_mask(cpu)) + #else +-#define TO_PHYS_ID(cpu) (cpu) +-#define TO_CORE_ID(cpu) (cpu) + #define for_each_sibling(i, cpu) for (i = 0; false; ) + #endif +-#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) + + /* + * Per-Core 
Temperature Data diff --git a/3.2.34/bump/1026_linux-3.2.27.patch b/3.2.34/bump/1026_linux-3.2.27.patch new file mode 100644 index 0000000..5878eb4 --- /dev/null +++ b/3.2.34/bump/1026_linux-3.2.27.patch @@ -0,0 +1,3188 @@ +diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt +index edad99a..69820b2 100644 +--- a/Documentation/sound/alsa/HD-Audio-Models.txt ++++ b/Documentation/sound/alsa/HD-Audio-Models.txt +@@ -60,10 +60,11 @@ ALC267/268 + ========== + N/A + +-ALC269 ++ALC269/270/275/276/280/282 + ====== + laptop-amic Laptops with analog-mic input + laptop-dmic Laptops with digital-mic input ++ lenovo-dock Enables docking station I/O for some Lenovos + + ALC662/663/272 + ============== +diff --git a/Documentation/stable_kernel_rules.txt b/Documentation/stable_kernel_rules.txt +index e1f856b..22bf11b 100644 +--- a/Documentation/stable_kernel_rules.txt ++++ b/Documentation/stable_kernel_rules.txt +@@ -1,4 +1,4 @@ +-Everything you ever wanted to know about Linux 2.6 -stable releases. ++Everything you ever wanted to know about Linux -stable releases. + + Rules on what kind of patches are accepted, and which ones are not, into the + "-stable" tree: +@@ -41,10 +41,10 @@ Procedure for submitting patches to the -stable tree: + cherry-picked than this can be specified in the following format in + the sign-off area: + +- Cc: # .32.x: a1f84a3: sched: Check for idle +- Cc: # .32.x: 1b9508f: sched: Rate-limit newidle +- Cc: # .32.x: fd21073: sched: Fix affinity logic +- Cc: # .32.x ++ Cc: # 3.3.x: a1f84a3: sched: Check for idle ++ Cc: # 3.3.x: 1b9508f: sched: Rate-limit newidle ++ Cc: # 3.3.x: fd21073: sched: Fix affinity logic ++ Cc: # 3.3.x + Signed-off-by: Ingo Molnar + + The tag sequence has the meaning of: +@@ -78,6 +78,15 @@ Review cycle: + security kernel team, and not go through the normal review cycle. + Contact the kernel security team for more details on this procedure. + ++Trees: ++ ++ - The queues of patches, for both completed versions and in progress ++ versions can be found at: ++ http://git.kernel.org/?p=linux/kernel/git/stable/stable-queue.git ++ - The finalized and tagged releases of all stable kernels can be found ++ in separate branches per version at: ++ http://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git ++ + + Review committee: + +diff --git a/Makefile b/Makefile +index fa5acc83..bdf851f 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 2 +-SUBLEVEL = 26 ++SUBLEVEL = 27 + EXTRAVERSION = + NAME = Saber-toothed Squirrel + +diff --git a/arch/arm/include/asm/mutex.h b/arch/arm/include/asm/mutex.h +index 93226cf..b1479fd 100644 +--- a/arch/arm/include/asm/mutex.h ++++ b/arch/arm/include/asm/mutex.h +@@ -7,121 +7,10 @@ + */ + #ifndef _ASM_MUTEX_H + #define _ASM_MUTEX_H +- +-#if __LINUX_ARM_ARCH__ < 6 +-/* On pre-ARMv6 hardware the swp based implementation is the most efficient. */ +-# include +-#else +- + /* +- * Attempting to lock a mutex on ARMv6+ can be done with a bastardized +- * atomic decrement (it is not a reliable atomic decrement but it satisfies +- * the defined semantics for our purpose, while being smaller and faster +- * than a real atomic decrement or atomic swap. The idea is to attempt +- * decrementing the lock value only once. If once decremented it isn't zero, +- * or if its store-back fails due to a dispute on the exclusive store, we +- * simply bail out immediately through the slow path where the lock will be +- * reattempted until it succeeds. 
++ * On pre-ARMv6 hardware this results in a swp-based implementation, ++ * which is the most efficient. For ARMv6+, we emit a pair of exclusive ++ * accesses instead. + */ +-static inline void +-__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *)) +-{ +- int __ex_flag, __res; +- +- __asm__ ( +- +- "ldrex %0, [%2] \n\t" +- "sub %0, %0, #1 \n\t" +- "strex %1, %0, [%2] " +- +- : "=&r" (__res), "=&r" (__ex_flag) +- : "r" (&(count)->counter) +- : "cc","memory" ); +- +- __res |= __ex_flag; +- if (unlikely(__res != 0)) +- fail_fn(count); +-} +- +-static inline int +-__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *)) +-{ +- int __ex_flag, __res; +- +- __asm__ ( +- +- "ldrex %0, [%2] \n\t" +- "sub %0, %0, #1 \n\t" +- "strex %1, %0, [%2] " +- +- : "=&r" (__res), "=&r" (__ex_flag) +- : "r" (&(count)->counter) +- : "cc","memory" ); +- +- __res |= __ex_flag; +- if (unlikely(__res != 0)) +- __res = fail_fn(count); +- return __res; +-} +- +-/* +- * Same trick is used for the unlock fast path. However the original value, +- * rather than the result, is used to test for success in order to have +- * better generated assembly. +- */ +-static inline void +-__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) +-{ +- int __ex_flag, __res, __orig; +- +- __asm__ ( +- +- "ldrex %0, [%3] \n\t" +- "add %1, %0, #1 \n\t" +- "strex %2, %1, [%3] " +- +- : "=&r" (__orig), "=&r" (__res), "=&r" (__ex_flag) +- : "r" (&(count)->counter) +- : "cc","memory" ); +- +- __orig |= __ex_flag; +- if (unlikely(__orig != 0)) +- fail_fn(count); +-} +- +-/* +- * If the unlock was done on a contended lock, or if the unlock simply fails +- * then the mutex remains locked. +- */ +-#define __mutex_slowpath_needs_to_unlock() 1 +- +-/* +- * For __mutex_fastpath_trylock we use another construct which could be +- * described as a "single value cmpxchg". +- * +- * This provides the needed trylock semantics like cmpxchg would, but it is +- * lighter and less generic than a true cmpxchg implementation. +- */ +-static inline int +-__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) +-{ +- int __ex_flag, __res, __orig; +- +- __asm__ ( +- +- "1: ldrex %0, [%3] \n\t" +- "subs %1, %0, #1 \n\t" +- "strexeq %2, %1, [%3] \n\t" +- "movlt %0, #0 \n\t" +- "cmpeq %2, #0 \n\t" +- "bgt 1b " +- +- : "=&r" (__orig), "=&r" (__res), "=&r" (__ex_flag) +- : "r" (&count->counter) +- : "cc", "memory" ); +- +- return __orig; +-} +- +-#endif ++#include + #endif +diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S +index b145f16..ece0996 100644 +--- a/arch/arm/kernel/entry-armv.S ++++ b/arch/arm/kernel/entry-armv.S +@@ -242,6 +242,19 @@ svc_preempt: + b 1b + #endif + ++__und_fault: ++ @ Correct the PC such that it is pointing at the instruction ++ @ which caused the fault. If the faulting instruction was ARM ++ @ the PC will be pointing at the next instruction, and have to ++ @ subtract 4. Otherwise, it is Thumb, and the PC will be ++ @ pointing at the second half of the Thumb instruction. We ++ @ have to subtract 2. 
++ ldr r2, [r0, #S_PC] ++ sub r2, r2, r1 ++ str r2, [r0, #S_PC] ++ b do_undefinstr ++ENDPROC(__und_fault) ++ + .align 5 + __und_svc: + #ifdef CONFIG_KPROBES +@@ -259,25 +272,32 @@ __und_svc: + @ + @ r0 - instruction + @ +-#ifndef CONFIG_THUMB2_KERNEL ++#ifndef CONFIG_THUMB2_KERNEL + ldr r0, [r4, #-4] + #else ++ mov r1, #2 + ldrh r0, [r4, #-2] @ Thumb instruction at LR - 2 + cmp r0, #0xe800 @ 32-bit instruction if xx >= 0 +- ldrhhs r9, [r4] @ bottom 16 bits +- orrhs r0, r9, r0, lsl #16 ++ blo __und_svc_fault ++ ldrh r9, [r4] @ bottom 16 bits ++ add r4, r4, #2 ++ str r4, [sp, #S_PC] ++ orr r0, r9, r0, lsl #16 + #endif +- adr r9, BSYM(1f) ++ adr r9, BSYM(__und_svc_finish) + mov r2, r4 + bl call_fpe + ++ mov r1, #4 @ PC correction to apply ++__und_svc_fault: + mov r0, sp @ struct pt_regs *regs +- bl do_undefinstr ++ bl __und_fault + + @ + @ IRQs off again before pulling preserved data off the stack + @ +-1: disable_irq_notrace ++__und_svc_finish: ++ disable_irq_notrace + + @ + @ restore SPSR and restart the instruction +@@ -421,25 +441,33 @@ __und_usr: + mov r2, r4 + mov r3, r5 + ++ @ r2 = regs->ARM_pc, which is either 2 or 4 bytes ahead of the ++ @ faulting instruction depending on Thumb mode. ++ @ r3 = regs->ARM_cpsr + @ +- @ fall through to the emulation code, which returns using r9 if +- @ it has emulated the instruction, or the more conventional lr +- @ if we are to treat this as a real undefined instruction +- @ +- @ r0 - instruction ++ @ The emulation code returns using r9 if it has emulated the ++ @ instruction, or the more conventional lr if we are to treat ++ @ this as a real undefined instruction + @ + adr r9, BSYM(ret_from_exception) +- adr lr, BSYM(__und_usr_unknown) ++ + tst r3, #PSR_T_BIT @ Thumb mode? +- itet eq @ explicit IT needed for the 1f label +- subeq r4, r2, #4 @ ARM instr at LR - 4 +- subne r4, r2, #2 @ Thumb instr at LR - 2 +-1: ldreqt r0, [r4] ++ bne __und_usr_thumb ++ sub r4, r2, #4 @ ARM instr at LR - 4 ++1: ldrt r0, [r4] + #ifdef CONFIG_CPU_ENDIAN_BE8 +- reveq r0, r0 @ little endian instruction ++ rev r0, r0 @ little endian instruction + #endif +- beq call_fpe ++ @ r0 = 32-bit ARM instruction which caused the exception ++ @ r2 = PC value for the following instruction (:= regs->ARM_pc) ++ @ r4 = PC value for the faulting instruction ++ @ lr = 32-bit undefined instruction function ++ adr lr, BSYM(__und_usr_fault_32) ++ b call_fpe ++ ++__und_usr_thumb: + @ Thumb instruction ++ sub r4, r2, #2 @ First half of thumb instr at LR - 2 + #if CONFIG_ARM_THUMB && __LINUX_ARM_ARCH__ >= 6 && CONFIG_CPU_V7 + /* + * Thumb-2 instruction handling. Note that because pre-v6 and >= v6 platforms +@@ -453,7 +481,7 @@ __und_usr: + ldr r5, .LCcpu_architecture + ldr r5, [r5] + cmp r5, #CPU_ARCH_ARMv7 +- blo __und_usr_unknown ++ blo __und_usr_fault_16 @ 16bit undefined instruction + /* + * The following code won't get run unless the running CPU really is v7, so + * coding round the lack of ldrht on older arches is pointless. 
Temporarily +@@ -461,15 +489,18 @@ __und_usr: + */ + .arch armv6t2 + #endif +-2: +- ARM( ldrht r5, [r4], #2 ) +- THUMB( ldrht r5, [r4] ) +- THUMB( add r4, r4, #2 ) ++2: ldrht r5, [r4] + cmp r5, #0xe800 @ 32bit instruction if xx != 0 +- blo __und_usr_unknown +-3: ldrht r0, [r4] ++ blo __und_usr_fault_16 @ 16bit undefined instruction ++3: ldrht r0, [r2] + add r2, r2, #2 @ r2 is PC + 2, make it PC + 4 ++ str r2, [sp, #S_PC] @ it's a 2x16bit instr, update + orr r0, r0, r5, lsl #16 ++ adr lr, BSYM(__und_usr_fault_32) ++ @ r0 = the two 16-bit Thumb instructions which caused the exception ++ @ r2 = PC value for the following Thumb instruction (:= regs->ARM_pc) ++ @ r4 = PC value for the first 16-bit Thumb instruction ++ @ lr = 32bit undefined instruction function + + #if __LINUX_ARM_ARCH__ < 7 + /* If the target arch was overridden, change it back: */ +@@ -480,17 +511,13 @@ __und_usr: + #endif + #endif /* __LINUX_ARM_ARCH__ < 7 */ + #else /* !(CONFIG_ARM_THUMB && __LINUX_ARM_ARCH__ >= 6 && CONFIG_CPU_V7) */ +- b __und_usr_unknown ++ b __und_usr_fault_16 + #endif +- UNWIND(.fnend ) ++ UNWIND(.fnend) + ENDPROC(__und_usr) + +- @ +- @ fallthrough to call_fpe +- @ +- + /* +- * The out of line fixup for the ldrt above. ++ * The out of line fixup for the ldrt instructions above. + */ + .pushsection .fixup, "ax" + 4: mov pc, r9 +@@ -521,11 +548,12 @@ ENDPROC(__und_usr) + * NEON handler code. + * + * Emulators may wish to make use of the following registers: +- * r0 = instruction opcode. +- * r2 = PC+4 ++ * r0 = instruction opcode (32-bit ARM or two 16-bit Thumb) ++ * r2 = PC value to resume execution after successful emulation + * r9 = normal "successful" return address +- * r10 = this threads thread_info structure. ++ * r10 = this threads thread_info structure + * lr = unrecognised instruction return address ++ * IRQs disabled, FIQs enabled. + */ + @ + @ Fall-through from Thumb-2 __und_usr +@@ -660,12 +688,17 @@ ENTRY(no_fp) + mov pc, lr + ENDPROC(no_fp) + +-__und_usr_unknown: +- enable_irq ++__und_usr_fault_32: ++ mov r1, #4 ++ b 1f ++__und_usr_fault_16: ++ mov r1, #2 ++1: enable_irq + mov r0, sp + adr lr, BSYM(ret_from_exception) +- b do_undefinstr +-ENDPROC(__und_usr_unknown) ++ b __und_fault ++ENDPROC(__und_usr_fault_32) ++ENDPROC(__und_usr_fault_16) + + .align 5 + __pabt_usr: +diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c +index 3d0c6fb..e68d251 100644 +--- a/arch/arm/kernel/process.c ++++ b/arch/arm/kernel/process.c +@@ -125,6 +125,7 @@ void arm_machine_restart(char mode, const char *cmd) + */ + mdelay(1000); + printk("Reboot failed -- System halted\n"); ++ local_irq_disable(); + while (1); + } + +@@ -240,6 +241,7 @@ void machine_shutdown(void) + void machine_halt(void) + { + machine_shutdown(); ++ local_irq_disable(); + while (1); + } + +diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c +index 160cb16..8380bd1 100644 +--- a/arch/arm/kernel/traps.c ++++ b/arch/arm/kernel/traps.c +@@ -362,18 +362,10 @@ static int call_undef_hook(struct pt_regs *regs, unsigned int instr) + + asmlinkage void __exception do_undefinstr(struct pt_regs *regs) + { +- unsigned int correction = thumb_mode(regs) ? 2 : 4; + unsigned int instr; + siginfo_t info; + void __user *pc; + +- /* +- * According to the ARM ARM, PC is 2 or 4 bytes ahead, +- * depending whether we're in Thumb mode or not. +- * Correct this offset. 
+- */ +- regs->ARM_pc -= correction; +- + pc = (void __user *)instruction_pointer(regs); + + if (processor_mode(regs) == SVC_MODE) { +diff --git a/arch/arm/mm/tlb-v7.S b/arch/arm/mm/tlb-v7.S +index 845f461..c202113 100644 +--- a/arch/arm/mm/tlb-v7.S ++++ b/arch/arm/mm/tlb-v7.S +@@ -38,11 +38,19 @@ ENTRY(v7wbi_flush_user_tlb_range) + dsb + mov r0, r0, lsr #PAGE_SHIFT @ align address + mov r1, r1, lsr #PAGE_SHIFT ++#ifdef CONFIG_ARM_ERRATA_720789 ++ mov r3, #0 ++#else + asid r3, r3 @ mask ASID ++#endif + orr r0, r3, r0, lsl #PAGE_SHIFT @ Create initial MVA + mov r1, r1, lsl #PAGE_SHIFT + 1: ++#ifdef CONFIG_ARM_ERRATA_720789 ++ ALT_SMP(mcr p15, 0, r0, c8, c3, 3) @ TLB invalidate U MVA all ASID (shareable) ++#else + ALT_SMP(mcr p15, 0, r0, c8, c3, 1) @ TLB invalidate U MVA (shareable) ++#endif + ALT_UP(mcr p15, 0, r0, c8, c7, 1) @ TLB invalidate U MVA + + add r0, r0, #PAGE_SZ +@@ -67,7 +75,11 @@ ENTRY(v7wbi_flush_kern_tlb_range) + mov r0, r0, lsl #PAGE_SHIFT + mov r1, r1, lsl #PAGE_SHIFT + 1: ++#ifdef CONFIG_ARM_ERRATA_720789 ++ ALT_SMP(mcr p15, 0, r0, c8, c3, 3) @ TLB invalidate U MVA all ASID (shareable) ++#else + ALT_SMP(mcr p15, 0, r0, c8, c3, 1) @ TLB invalidate U MVA (shareable) ++#endif + ALT_UP(mcr p15, 0, r0, c8, c7, 1) @ TLB invalidate U MVA + add r0, r0, #PAGE_SZ + cmp r0, r1 +diff --git a/arch/arm/vfp/entry.S b/arch/arm/vfp/entry.S +index 4fa9903..cc926c9 100644 +--- a/arch/arm/vfp/entry.S ++++ b/arch/arm/vfp/entry.S +@@ -7,18 +7,20 @@ + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +- * +- * Basic entry code, called from the kernel's undefined instruction trap. +- * r0 = faulted instruction +- * r5 = faulted PC+4 +- * r9 = successful return +- * r10 = thread_info structure +- * lr = failure return + */ + #include + #include + #include "../kernel/entry-header.S" + ++@ VFP entry point. ++@ ++@ r0 = instruction opcode (32-bit ARM or two 16-bit Thumb) ++@ r2 = PC value to resume execution after successful emulation ++@ r9 = normal "successful" return address ++@ r10 = this threads thread_info structure ++@ lr = unrecognised instruction return address ++@ IRQs disabled. ++@ + ENTRY(do_vfp) + #ifdef CONFIG_PREEMPT + ldr r4, [r10, #TI_PREEMPT] @ get preempt count +diff --git a/arch/arm/vfp/vfphw.S b/arch/arm/vfp/vfphw.S +index 2d30c7f..3a0efaa 100644 +--- a/arch/arm/vfp/vfphw.S ++++ b/arch/arm/vfp/vfphw.S +@@ -61,13 +61,13 @@ + + @ VFP hardware support entry point. + @ +-@ r0 = faulted instruction +-@ r2 = faulted PC+4 +-@ r9 = successful return ++@ r0 = instruction opcode (32-bit ARM or two 16-bit Thumb) ++@ r2 = PC value to resume execution after successful emulation ++@ r9 = normal "successful" return address + @ r10 = vfp_state union + @ r11 = CPU number +-@ lr = failure return +- ++@ lr = unrecognised instruction return address ++@ IRQs enabled. + ENTRY(vfp_support_entry) + DBGSTR3 "instr %08x pc %08x state %p", r0, r2, r10 + +@@ -161,9 +161,12 @@ vfp_hw_state_valid: + @ exception before retrying branch + @ out before setting an FPEXC that + @ stops us reading stuff +- VFPFMXR FPEXC, r1 @ restore FPEXC last +- sub r2, r2, #4 +- str r2, [sp, #S_PC] @ retry the instruction ++ VFPFMXR FPEXC, r1 @ Restore FPEXC last ++ sub r2, r2, #4 @ Retry current instruction - if Thumb ++ str r2, [sp, #S_PC] @ mode it's two 16-bit instructions, ++ @ else it's one 32-bit instruction, so ++ @ always subtract 4 from the following ++ @ instruction address. 
+ #ifdef CONFIG_PREEMPT + get_thread_info r10 + ldr r4, [r10, #TI_PREEMPT] @ get preempt count +diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c +index 8ea07e4..ad83dad 100644 +--- a/arch/arm/vfp/vfpmodule.c ++++ b/arch/arm/vfp/vfpmodule.c +@@ -453,10 +453,16 @@ static int vfp_pm_suspend(void) + + /* disable, just in case */ + fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN); ++ } else if (vfp_current_hw_state[ti->cpu]) { ++#ifndef CONFIG_SMP ++ fmxr(FPEXC, fpexc | FPEXC_EN); ++ vfp_save_state(vfp_current_hw_state[ti->cpu], fpexc); ++ fmxr(FPEXC, fpexc); ++#endif + } + + /* clear any information we had about last context state */ +- memset(vfp_current_hw_state, 0, sizeof(vfp_current_hw_state)); ++ vfp_current_hw_state[ti->cpu] = NULL; + + return 0; + } +diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h +index 3fad89e..2fc214b 100644 +--- a/arch/ia64/include/asm/atomic.h ++++ b/arch/ia64/include/asm/atomic.h +@@ -18,8 +18,8 @@ + #include + + +-#define ATOMIC_INIT(i) ((atomic_t) { (i) }) +-#define ATOMIC64_INIT(i) ((atomic64_t) { (i) }) ++#define ATOMIC_INIT(i) { (i) } ++#define ATOMIC64_INIT(i) { (i) } + + #define atomic_read(v) (*(volatile int *)&(v)->counter) + #define atomic64_read(v) (*(volatile long *)&(v)->counter) +diff --git a/arch/m68k/include/asm/entry.h b/arch/m68k/include/asm/entry.h +index c3c5a86..8798ebc 100644 +--- a/arch/m68k/include/asm/entry.h ++++ b/arch/m68k/include/asm/entry.h +@@ -33,8 +33,8 @@ + + /* the following macro is used when enabling interrupts */ + #if defined(MACH_ATARI_ONLY) +- /* block out HSYNC on the atari */ +-#define ALLOWINT (~0x400) ++ /* block out HSYNC = ipl 2 on the atari */ ++#define ALLOWINT (~0x500) + #define MAX_NOINT_IPL 3 + #else + /* portable version */ +diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c +index 8623f8d..9a5932e 100644 +--- a/arch/m68k/kernel/sys_m68k.c ++++ b/arch/m68k/kernel/sys_m68k.c +@@ -479,9 +479,13 @@ sys_atomic_cmpxchg_32(unsigned long newval, int oldval, int d3, int d4, int d5, + goto bad_access; + } + +- mem_value = *mem; ++ /* ++ * No need to check for EFAULT; we know that the page is ++ * present and writable. 
++ */ ++ __get_user(mem_value, mem); + if (mem_value == oldval) +- *mem = newval; ++ __put_user(newval, mem); + + pte_unmap_unlock(pte, ptl); + up_read(&mm->mmap_sem); +diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h +index 5682f16..20f0e01 100644 +--- a/arch/s390/include/asm/mmu_context.h ++++ b/arch/s390/include/asm/mmu_context.h +@@ -12,7 +12,6 @@ + #include + #include + #include +-#include + + static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) +@@ -92,4 +91,17 @@ static inline void activate_mm(struct mm_struct *prev, + switch_mm(prev, next, current); + } + ++static inline void arch_dup_mmap(struct mm_struct *oldmm, ++ struct mm_struct *mm) ++{ ++#ifdef CONFIG_64BIT ++ if (oldmm->context.asce_limit < mm->context.asce_limit) ++ crst_table_downgrade(mm, oldmm->context.asce_limit); ++#endif ++} ++ ++static inline void arch_exit_mmap(struct mm_struct *mm) ++{ ++} ++ + #endif /* __S390_MMU_CONTEXT_H */ +diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h +index 5f33d37..172550d 100644 +--- a/arch/s390/include/asm/processor.h ++++ b/arch/s390/include/asm/processor.h +@@ -130,7 +130,9 @@ struct stack_frame { + regs->psw.mask = psw_user_bits | PSW_MASK_BA; \ + regs->psw.addr = new_psw | PSW_ADDR_AMODE; \ + regs->gprs[15] = new_stackp; \ ++ __tlb_flush_mm(current->mm); \ + crst_table_downgrade(current->mm, 1UL << 31); \ ++ update_mm(current->mm, current); \ + } while (0) + + /* Forward declaration, a strange C thing */ +diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c +index b28aaa4..0fc0a7e 100644 +--- a/arch/s390/mm/fault.c ++++ b/arch/s390/mm/fault.c +@@ -453,6 +453,7 @@ int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write) + struct pt_regs regs; + int access, fault; + ++ /* Emulate a uaccess fault from kernel mode. */ + regs.psw.mask = psw_kernel_bits | PSW_MASK_DAT | PSW_MASK_MCHECK; + if (!irqs_disabled()) + regs.psw.mask |= PSW_MASK_IO | PSW_MASK_EXT; +@@ -461,12 +462,12 @@ int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write) + uaddr &= PAGE_MASK; + access = write ? VM_WRITE : VM_READ; + fault = do_exception(®s, access, uaddr | 2); +- if (unlikely(fault)) { +- if (fault & VM_FAULT_OOM) +- return -EFAULT; +- else if (fault & VM_FAULT_SIGBUS) +- do_sigbus(®s, pgm_int_code, uaddr); +- } ++ /* ++ * Since the fault happened in kernel mode while performing a uaccess ++ * all we need to do now is emulating a fixup in case "fault" is not ++ * zero. ++ * For the calling uaccess functions this results always in -EFAULT. ++ */ + return fault ? 
-EFAULT : 0; + } + +diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c +index a0155c0..c70b3d8 100644 +--- a/arch/s390/mm/mmap.c ++++ b/arch/s390/mm/mmap.c +@@ -106,9 +106,15 @@ EXPORT_SYMBOL_GPL(arch_pick_mmap_layout); + + int s390_mmap_check(unsigned long addr, unsigned long len) + { ++ int rc; ++ + if (!is_compat_task() && +- len >= TASK_SIZE && TASK_SIZE < (1UL << 53)) +- return crst_table_upgrade(current->mm, 1UL << 53); ++ len >= TASK_SIZE && TASK_SIZE < (1UL << 53)) { ++ rc = crst_table_upgrade(current->mm, 1UL << 53); ++ if (rc) ++ return rc; ++ update_mm(current->mm, current); ++ } + return 0; + } + +@@ -128,6 +134,7 @@ s390_get_unmapped_area(struct file *filp, unsigned long addr, + rc = crst_table_upgrade(mm, 1UL << 53); + if (rc) + return (unsigned long) rc; ++ update_mm(mm, current); + area = arch_get_unmapped_area(filp, addr, len, pgoff, flags); + } + return area; +@@ -150,6 +157,7 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr, + rc = crst_table_upgrade(mm, 1UL << 53); + if (rc) + return (unsigned long) rc; ++ update_mm(mm, current); + area = arch_get_unmapped_area_topdown(filp, addr, len, + pgoff, flags); + } +diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c +index f8ceac4..f8e92f8 100644 +--- a/arch/s390/mm/pgtable.c ++++ b/arch/s390/mm/pgtable.c +@@ -97,7 +97,6 @@ repeat: + crst_table_free(mm, table); + if (mm->context.asce_limit < limit) + goto repeat; +- update_mm(mm, current); + return 0; + } + +@@ -105,9 +104,6 @@ void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) + { + pgd_t *pgd; + +- if (mm->context.asce_limit <= limit) +- return; +- __tlb_flush_mm(mm); + while (mm->context.asce_limit > limit) { + pgd = mm->pgd; + switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { +@@ -130,7 +126,6 @@ void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) + mm->task_size = mm->context.asce_limit; + crst_table_free(mm, (unsigned long *) pgd); + } +- update_mm(mm, current); + } + #endif + +diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c +index 1f84794..73ef56c 100644 +--- a/arch/x86/kernel/alternative.c ++++ b/arch/x86/kernel/alternative.c +@@ -219,7 +219,7 @@ void __init arch_init_ideal_nops(void) + ideal_nops = intel_nops; + #endif + } +- ++ break; + default: + #ifdef CONFIG_X86_64 + ideal_nops = k8_nops; +diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c +index 1b267e7..00a0385 100644 +--- a/arch/x86/xen/p2m.c ++++ b/arch/x86/xen/p2m.c +@@ -686,6 +686,7 @@ int m2p_add_override(unsigned long mfn, struct page *page, + unsigned long uninitialized_var(address); + unsigned level; + pte_t *ptep = NULL; ++ int ret = 0; + + pfn = page_to_pfn(page); + if (!PageHighMem(page)) { +@@ -721,6 +722,24 @@ int m2p_add_override(unsigned long mfn, struct page *page, + list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); + spin_unlock_irqrestore(&m2p_override_lock, flags); + ++ /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in ++ * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other ++ * pfn so that the following mfn_to_pfn(mfn) calls will return the ++ * pfn from the m2p_override (the backend pfn) instead. 
++ * We need to do this because the pages shared by the frontend ++ * (xen-blkfront) can be already locked (lock_page, called by ++ * do_read_cache_page); when the userspace backend tries to use them ++ * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so ++ * do_blockdev_direct_IO is going to try to lock the same pages ++ * again resulting in a deadlock. ++ * As a side effect get_user_pages_fast might not be safe on the ++ * frontend pages while they are being shared with the backend, ++ * because mfn_to_pfn (that ends up being called by GUPF) will ++ * return the backend pfn rather than the frontend pfn. */ ++ ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); ++ if (ret == 0 && get_phys_to_machine(pfn) == mfn) ++ set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); ++ + return 0; + } + EXPORT_SYMBOL_GPL(m2p_add_override); +@@ -732,6 +751,7 @@ int m2p_remove_override(struct page *page, bool clear_pte) + unsigned long uninitialized_var(address); + unsigned level; + pte_t *ptep = NULL; ++ int ret = 0; + + pfn = page_to_pfn(page); + mfn = get_phys_to_machine(pfn); +@@ -801,6 +821,22 @@ int m2p_remove_override(struct page *page, bool clear_pte) + } else + set_phys_to_machine(pfn, page->index); + ++ /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present ++ * somewhere in this domain, even before being added to the ++ * m2p_override (see comment above in m2p_add_override). ++ * If there are no other entries in the m2p_override corresponding ++ * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for ++ * the original pfn (the one shared by the frontend): the backend ++ * cannot do any IO on this page anymore because it has been ++ * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of ++ * the original pfn causes mfn_to_pfn(mfn) to return the frontend ++ * pfn again. */ ++ mfn &= ~FOREIGN_FRAME_BIT; ++ ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); ++ if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && ++ m2p_find_override(mfn) == NULL) ++ set_phys_to_machine(pfn, mfn); ++ + return 0; + } + EXPORT_SYMBOL_GPL(m2p_remove_override); +diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c +index 9955a53..c864add 100644 +--- a/drivers/block/floppy.c ++++ b/drivers/block/floppy.c +@@ -4369,8 +4369,14 @@ out_unreg_blkdev: + out_put_disk: + while (dr--) { + del_timer_sync(&motor_off_timer[dr]); +- if (disks[dr]->queue) ++ if (disks[dr]->queue) { + blk_cleanup_queue(disks[dr]->queue); ++ /* ++ * put_disk() is not paired with add_disk() and ++ * will put queue reference one extra time. fix it. ++ */ ++ disks[dr]->queue = NULL; ++ } + put_disk(disks[dr]); + } + return err; +diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c +index e46f2f7..650a308 100644 +--- a/drivers/block/virtio_blk.c ++++ b/drivers/block/virtio_blk.c +@@ -20,8 +20,6 @@ struct workqueue_struct *virtblk_wq; + + struct virtio_blk + { +- spinlock_t lock; +- + struct virtio_device *vdev; + struct virtqueue *vq; + +@@ -62,7 +60,7 @@ static void blk_done(struct virtqueue *vq) + unsigned int len; + unsigned long flags; + +- spin_lock_irqsave(&vblk->lock, flags); ++ spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); + while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { + int error; + +@@ -97,7 +95,7 @@ static void blk_done(struct virtqueue *vq) + } + /* In case queue is stopped waiting for more buffers. 
*/ + blk_start_queue(vblk->disk->queue); +- spin_unlock_irqrestore(&vblk->lock, flags); ++ spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags); + } + + static bool do_req(struct request_queue *q, struct virtio_blk *vblk, +@@ -384,7 +382,6 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) + } + + INIT_LIST_HEAD(&vblk->reqs); +- spin_lock_init(&vblk->lock); + vblk->vdev = vdev; + vblk->sg_elems = sg_elems; + sg_init_table(vblk->sg, vblk->sg_elems); +@@ -410,7 +407,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) + goto out_mempool; + } + +- q = vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); ++ q = vblk->disk->queue = blk_init_queue(do_virtblk_request, NULL); + if (!q) { + err = -ENOMEM; + goto out_put_disk; +diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c +index 5c0d96a..b12ffea 100644 +--- a/drivers/char/mspec.c ++++ b/drivers/char/mspec.c +@@ -284,7 +284,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, + vdata->flags = flags; + vdata->type = type; + spin_lock_init(&vdata->lock); +- vdata->refcnt = ATOMIC_INIT(1); ++ atomic_set(&vdata->refcnt, 1); + vma->vm_private_data = vdata; + + vma->vm_flags |= (VM_IO | VM_RESERVED | VM_PFNMAP | VM_DONTEXPAND); +diff --git a/drivers/char/random.c b/drivers/char/random.c +index 6035ab8..631d4f6 100644 +--- a/drivers/char/random.c ++++ b/drivers/char/random.c +@@ -125,21 +125,26 @@ + * The current exported interfaces for gathering environmental noise + * from the devices are: + * ++ * void add_device_randomness(const void *buf, unsigned int size); + * void add_input_randomness(unsigned int type, unsigned int code, + * unsigned int value); +- * void add_interrupt_randomness(int irq); ++ * void add_interrupt_randomness(int irq, int irq_flags); + * void add_disk_randomness(struct gendisk *disk); + * ++ * add_device_randomness() is for adding data to the random pool that ++ * is likely to differ between two devices (or possibly even per boot). ++ * This would be things like MAC addresses or serial numbers, or the ++ * read-out of the RTC. This does *not* add any actual entropy to the ++ * pool, but it initializes the pool to different values for devices ++ * that might otherwise be identical and have very little entropy ++ * available to them (particularly common in the embedded world). ++ * + * add_input_randomness() uses the input layer interrupt timing, as well as + * the event type information from the hardware. + * +- * add_interrupt_randomness() uses the inter-interrupt timing as random +- * inputs to the entropy pool. Note that not all interrupts are good +- * sources of randomness! For example, the timer interrupts is not a +- * good choice, because the periodicity of the interrupts is too +- * regular, and hence predictable to an attacker. Network Interface +- * Controller interrupts are a better measure, since the timing of the +- * NIC interrupts are more unpredictable. ++ * add_interrupt_randomness() uses the interrupt timing as random ++ * inputs to the entropy pool. Using the cycle counters and the irq source ++ * as inputs, it feeds the randomness roughly once a second. 
+ * + * add_disk_randomness() uses what amounts to the seek time of block + * layer request events, on a per-disk_devt basis, as input to the +@@ -248,6 +253,8 @@ + #include + #include + #include ++#include ++#include + + #ifdef CONFIG_GENERIC_HARDIRQS + # include +@@ -256,6 +263,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -266,6 +274,8 @@ + #define SEC_XFER_SIZE 512 + #define EXTRACT_SIZE 10 + ++#define LONGS(x) (((x) + sizeof(unsigned long) - 1)/sizeof(unsigned long)) ++ + /* + * The minimum number of bits of entropy before we wake up a read on + * /dev/random. Should be enough to do a significant reseed. +@@ -420,8 +430,10 @@ struct entropy_store { + /* read-write data: */ + spinlock_t lock; + unsigned add_ptr; ++ unsigned input_rotate; + int entropy_count; +- int input_rotate; ++ int entropy_total; ++ unsigned int initialized:1; + __u8 last_data[EXTRACT_SIZE]; + }; + +@@ -454,6 +466,10 @@ static struct entropy_store nonblocking_pool = { + .pool = nonblocking_pool_data + }; + ++static __u32 const twist_table[8] = { ++ 0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158, ++ 0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 }; ++ + /* + * This function adds bytes into the entropy "pool". It does not + * update the entropy estimate. The caller should call +@@ -464,29 +480,24 @@ static struct entropy_store nonblocking_pool = { + * it's cheap to do so and helps slightly in the expected case where + * the entropy is concentrated in the low-order bits. + */ +-static void mix_pool_bytes_extract(struct entropy_store *r, const void *in, +- int nbytes, __u8 out[64]) ++static void __mix_pool_bytes(struct entropy_store *r, const void *in, ++ int nbytes, __u8 out[64]) + { +- static __u32 const twist_table[8] = { +- 0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158, +- 0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 }; + unsigned long i, j, tap1, tap2, tap3, tap4, tap5; + int input_rotate; + int wordmask = r->poolinfo->poolwords - 1; + const char *bytes = in; + __u32 w; +- unsigned long flags; + +- /* Taps are constant, so we can load them without holding r->lock. */ + tap1 = r->poolinfo->tap1; + tap2 = r->poolinfo->tap2; + tap3 = r->poolinfo->tap3; + tap4 = r->poolinfo->tap4; + tap5 = r->poolinfo->tap5; + +- spin_lock_irqsave(&r->lock, flags); +- input_rotate = r->input_rotate; +- i = r->add_ptr; ++ smp_rmb(); ++ input_rotate = ACCESS_ONCE(r->input_rotate); ++ i = ACCESS_ONCE(r->add_ptr); + + /* mix one byte at a time to simplify size handling and churn faster */ + while (nbytes--) { +@@ -513,19 +524,53 @@ static void mix_pool_bytes_extract(struct entropy_store *r, const void *in, + input_rotate += i ? 7 : 14; + } + +- r->input_rotate = input_rotate; +- r->add_ptr = i; ++ ACCESS_ONCE(r->input_rotate) = input_rotate; ++ ACCESS_ONCE(r->add_ptr) = i; ++ smp_wmb(); + + if (out) + for (j = 0; j < 16; j++) + ((__u32 *)out)[j] = r->pool[(i - j) & wordmask]; ++} ++ ++static void mix_pool_bytes(struct entropy_store *r, const void *in, ++ int nbytes, __u8 out[64]) ++{ ++ unsigned long flags; + ++ spin_lock_irqsave(&r->lock, flags); ++ __mix_pool_bytes(r, in, nbytes, out); + spin_unlock_irqrestore(&r->lock, flags); + } + +-static void mix_pool_bytes(struct entropy_store *r, const void *in, int bytes) ++struct fast_pool { ++ __u32 pool[4]; ++ unsigned long last; ++ unsigned short count; ++ unsigned char rotate; ++ unsigned char last_timer_intr; ++}; ++ ++/* ++ * This is a fast mixing routine used by the interrupt randomness ++ * collector. 
It's hardcoded for an 128 bit pool and assumes that any ++ * locks that might be needed are taken by the caller. ++ */ ++static void fast_mix(struct fast_pool *f, const void *in, int nbytes) + { +- mix_pool_bytes_extract(r, in, bytes, NULL); ++ const char *bytes = in; ++ __u32 w; ++ unsigned i = f->count; ++ unsigned input_rotate = f->rotate; ++ ++ while (nbytes--) { ++ w = rol32(*bytes++, input_rotate & 31) ^ f->pool[i & 3] ^ ++ f->pool[(i + 1) & 3]; ++ f->pool[i & 3] = (w >> 3) ^ twist_table[w & 7]; ++ input_rotate += (i++ & 3) ? 7 : 14; ++ } ++ f->count = i; ++ f->rotate = input_rotate; + } + + /* +@@ -533,30 +578,34 @@ static void mix_pool_bytes(struct entropy_store *r, const void *in, int bytes) + */ + static void credit_entropy_bits(struct entropy_store *r, int nbits) + { +- unsigned long flags; +- int entropy_count; ++ int entropy_count, orig; + + if (!nbits) + return; + +- spin_lock_irqsave(&r->lock, flags); +- + DEBUG_ENT("added %d entropy credits to %s\n", nbits, r->name); +- entropy_count = r->entropy_count; ++retry: ++ entropy_count = orig = ACCESS_ONCE(r->entropy_count); + entropy_count += nbits; + if (entropy_count < 0) { + DEBUG_ENT("negative entropy/overflow\n"); + entropy_count = 0; + } else if (entropy_count > r->poolinfo->POOLBITS) + entropy_count = r->poolinfo->POOLBITS; +- r->entropy_count = entropy_count; ++ if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig) ++ goto retry; ++ ++ if (!r->initialized && nbits > 0) { ++ r->entropy_total += nbits; ++ if (r->entropy_total > 128) ++ r->initialized = 1; ++ } + + /* should we wake readers? */ + if (r == &input_pool && entropy_count >= random_read_wakeup_thresh) { + wake_up_interruptible(&random_read_wait); + kill_fasync(&fasync, SIGIO, POLL_IN); + } +- spin_unlock_irqrestore(&r->lock, flags); + } + + /********************************************************************* +@@ -609,6 +658,25 @@ static void set_timer_rand_state(unsigned int irq, + } + #endif + ++/* ++ * Add device- or boot-specific data to the input and nonblocking ++ * pools to help initialize them to unique values. ++ * ++ * None of this adds any entropy, it is meant to avoid the ++ * problem of the nonblocking pool having similar initial state ++ * across largely identical devices. ++ */ ++void add_device_randomness(const void *buf, unsigned int size) ++{ ++ unsigned long time = get_cycles() ^ jiffies; ++ ++ mix_pool_bytes(&input_pool, buf, size, NULL); ++ mix_pool_bytes(&input_pool, &time, sizeof(time), NULL); ++ mix_pool_bytes(&nonblocking_pool, buf, size, NULL); ++ mix_pool_bytes(&nonblocking_pool, &time, sizeof(time), NULL); ++} ++EXPORT_SYMBOL(add_device_randomness); ++ + static struct timer_rand_state input_timer_state; + + /* +@@ -624,8 +692,8 @@ static struct timer_rand_state input_timer_state; + static void add_timer_randomness(struct timer_rand_state *state, unsigned num) + { + struct { +- cycles_t cycles; + long jiffies; ++ unsigned cycles; + unsigned num; + } sample; + long delta, delta2, delta3; +@@ -639,7 +707,7 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) + sample.jiffies = jiffies; + sample.cycles = get_cycles(); + sample.num = num; +- mix_pool_bytes(&input_pool, &sample, sizeof(sample)); ++ mix_pool_bytes(&input_pool, &sample, sizeof(sample), NULL); + + /* + * Calculate number of bits of randomness we probably added. 
+@@ -696,17 +764,48 @@ void add_input_randomness(unsigned int type, unsigned int code, + } + EXPORT_SYMBOL_GPL(add_input_randomness); + +-void add_interrupt_randomness(int irq) ++static DEFINE_PER_CPU(struct fast_pool, irq_randomness); ++ ++void add_interrupt_randomness(int irq, int irq_flags) + { +- struct timer_rand_state *state; ++ struct entropy_store *r; ++ struct fast_pool *fast_pool = &__get_cpu_var(irq_randomness); ++ struct pt_regs *regs = get_irq_regs(); ++ unsigned long now = jiffies; ++ __u32 input[4], cycles = get_cycles(); ++ ++ input[0] = cycles ^ jiffies; ++ input[1] = irq; ++ if (regs) { ++ __u64 ip = instruction_pointer(regs); ++ input[2] = ip; ++ input[3] = ip >> 32; ++ } + +- state = get_timer_rand_state(irq); ++ fast_mix(fast_pool, input, sizeof(input)); + +- if (state == NULL) ++ if ((fast_pool->count & 1023) && ++ !time_after(now, fast_pool->last + HZ)) + return; + +- DEBUG_ENT("irq event %d\n", irq); +- add_timer_randomness(state, 0x100 + irq); ++ fast_pool->last = now; ++ ++ r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool; ++ __mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool), NULL); ++ /* ++ * If we don't have a valid cycle counter, and we see ++ * back-to-back timer interrupts, then skip giving credit for ++ * any entropy. ++ */ ++ if (cycles == 0) { ++ if (irq_flags & __IRQF_TIMER) { ++ if (fast_pool->last_timer_intr) ++ return; ++ fast_pool->last_timer_intr = 1; ++ } else ++ fast_pool->last_timer_intr = 0; ++ } ++ credit_entropy_bits(r, 1); + } + + #ifdef CONFIG_BLOCK +@@ -738,7 +837,7 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, + */ + static void xfer_secondary_pool(struct entropy_store *r, size_t nbytes) + { +- __u32 tmp[OUTPUT_POOL_WORDS]; ++ __u32 tmp[OUTPUT_POOL_WORDS]; + + if (r->pull && r->entropy_count < nbytes * 8 && + r->entropy_count < r->poolinfo->POOLBITS) { +@@ -757,7 +856,7 @@ static void xfer_secondary_pool(struct entropy_store *r, size_t nbytes) + + bytes = extract_entropy(r->pull, tmp, bytes, + random_read_wakeup_thresh / 8, rsvd); +- mix_pool_bytes(r, tmp, bytes); ++ mix_pool_bytes(r, tmp, bytes, NULL); + credit_entropy_bits(r, bytes*8); + } + } +@@ -816,13 +915,19 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min, + static void extract_buf(struct entropy_store *r, __u8 *out) + { + int i; +- __u32 hash[5], workspace[SHA_WORKSPACE_WORDS]; ++ union { ++ __u32 w[5]; ++ unsigned long l[LONGS(EXTRACT_SIZE)]; ++ } hash; ++ __u32 workspace[SHA_WORKSPACE_WORDS]; + __u8 extract[64]; ++ unsigned long flags; + + /* Generate a hash across the pool, 16 words (512 bits) at a time */ +- sha_init(hash); ++ sha_init(hash.w); ++ spin_lock_irqsave(&r->lock, flags); + for (i = 0; i < r->poolinfo->poolwords; i += 16) +- sha_transform(hash, (__u8 *)(r->pool + i), workspace); ++ sha_transform(hash.w, (__u8 *)(r->pool + i), workspace); + + /* + * We mix the hash back into the pool to prevent backtracking +@@ -833,13 +938,14 @@ static void extract_buf(struct entropy_store *r, __u8 *out) + * brute-forcing the feedback as hard as brute-forcing the + * hash. + */ +- mix_pool_bytes_extract(r, hash, sizeof(hash), extract); ++ __mix_pool_bytes(r, hash.w, sizeof(hash.w), extract); ++ spin_unlock_irqrestore(&r->lock, flags); + + /* + * To avoid duplicates, we atomically extract a portion of the + * pool while mixing, and hash one final time. 
+ */ +- sha_transform(hash, extract, workspace); ++ sha_transform(hash.w, extract, workspace); + memset(extract, 0, sizeof(extract)); + memset(workspace, 0, sizeof(workspace)); + +@@ -848,19 +954,30 @@ static void extract_buf(struct entropy_store *r, __u8 *out) + * pattern, we fold it in half. Thus, we always feed back + * twice as much data as we output. + */ +- hash[0] ^= hash[3]; +- hash[1] ^= hash[4]; +- hash[2] ^= rol32(hash[2], 16); +- memcpy(out, hash, EXTRACT_SIZE); +- memset(hash, 0, sizeof(hash)); ++ hash.w[0] ^= hash.w[3]; ++ hash.w[1] ^= hash.w[4]; ++ hash.w[2] ^= rol32(hash.w[2], 16); ++ ++ /* ++ * If we have a architectural hardware random number ++ * generator, mix that in, too. ++ */ ++ for (i = 0; i < LONGS(EXTRACT_SIZE); i++) { ++ unsigned long v; ++ if (!arch_get_random_long(&v)) ++ break; ++ hash.l[i] ^= v; ++ } ++ ++ memcpy(out, &hash, EXTRACT_SIZE); ++ memset(&hash, 0, sizeof(hash)); + } + + static ssize_t extract_entropy(struct entropy_store *r, void *buf, +- size_t nbytes, int min, int reserved) ++ size_t nbytes, int min, int reserved) + { + ssize_t ret = 0, i; + __u8 tmp[EXTRACT_SIZE]; +- unsigned long flags; + + xfer_secondary_pool(r, nbytes); + nbytes = account(r, nbytes, min, reserved); +@@ -869,6 +986,8 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, + extract_buf(r, tmp); + + if (fips_enabled) { ++ unsigned long flags; ++ + spin_lock_irqsave(&r->lock, flags); + if (!memcmp(tmp, r->last_data, EXTRACT_SIZE)) + panic("Hardware RNG duplicated output!\n"); +@@ -927,17 +1046,34 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, + + /* + * This function is the exported kernel interface. It returns some +- * number of good random numbers, suitable for seeding TCP sequence +- * numbers, etc. ++ * number of good random numbers, suitable for key generation, seeding ++ * TCP sequence numbers, etc. It does not use the hw random number ++ * generator, if available; use get_random_bytes_arch() for that. + */ + void get_random_bytes(void *buf, int nbytes) + { ++ extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0); ++} ++EXPORT_SYMBOL(get_random_bytes); ++ ++/* ++ * This function will use the architecture-specific hardware random ++ * number generator if it is available. The arch-specific hw RNG will ++ * almost certainly be faster than what we can do in software, but it ++ * is impossible to verify that it is implemented securely (as ++ * opposed, to, say, the AES encryption of a sequence number using a ++ * key known by the NSA). So it's useful if we need the speed, but ++ * only if we're willing to trust the hardware manufacturer not to ++ * have put in a back door. 
++ */ ++void get_random_bytes_arch(void *buf, int nbytes) ++{ + char *p = buf; + + while (nbytes) { + unsigned long v; + int chunk = min(nbytes, (int)sizeof(unsigned long)); +- ++ + if (!arch_get_random_long(&v)) + break; + +@@ -946,9 +1082,11 @@ void get_random_bytes(void *buf, int nbytes) + nbytes -= chunk; + } + +- extract_entropy(&nonblocking_pool, p, nbytes, 0, 0); ++ if (nbytes) ++ extract_entropy(&nonblocking_pool, p, nbytes, 0, 0); + } +-EXPORT_SYMBOL(get_random_bytes); ++EXPORT_SYMBOL(get_random_bytes_arch); ++ + + /* + * init_std_data - initialize pool with system data +@@ -961,16 +1099,19 @@ EXPORT_SYMBOL(get_random_bytes); + */ + static void init_std_data(struct entropy_store *r) + { +- ktime_t now; +- unsigned long flags; ++ int i; ++ ktime_t now = ktime_get_real(); ++ unsigned long rv; + +- spin_lock_irqsave(&r->lock, flags); + r->entropy_count = 0; +- spin_unlock_irqrestore(&r->lock, flags); +- +- now = ktime_get_real(); +- mix_pool_bytes(r, &now, sizeof(now)); +- mix_pool_bytes(r, utsname(), sizeof(*(utsname()))); ++ r->entropy_total = 0; ++ mix_pool_bytes(r, &now, sizeof(now), NULL); ++ for (i = r->poolinfo->POOLBYTES; i > 0; i -= sizeof(rv)) { ++ if (!arch_get_random_long(&rv)) ++ break; ++ mix_pool_bytes(r, &rv, sizeof(rv), NULL); ++ } ++ mix_pool_bytes(r, utsname(), sizeof(*(utsname())), NULL); + } + + static int rand_initialize(void) +@@ -1107,7 +1248,7 @@ write_pool(struct entropy_store *r, const char __user *buffer, size_t count) + count -= bytes; + p += bytes; + +- mix_pool_bytes(r, buf, bytes); ++ mix_pool_bytes(r, buf, bytes, NULL); + cond_resched(); + } + +diff --git a/drivers/firmware/pcdp.c b/drivers/firmware/pcdp.c +index 51e0e2d..a330492 100644 +--- a/drivers/firmware/pcdp.c ++++ b/drivers/firmware/pcdp.c +@@ -95,7 +95,7 @@ efi_setup_pcdp_console(char *cmdline) + if (efi.hcdp == EFI_INVALID_TABLE_ADDR) + return -ENODEV; + +- pcdp = ioremap(efi.hcdp, 4096); ++ pcdp = early_ioremap(efi.hcdp, 4096); + printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, efi.hcdp); + + if (strstr(cmdline, "console=hcdp")) { +@@ -131,6 +131,6 @@ efi_setup_pcdp_console(char *cmdline) + } + + out: +- iounmap(pcdp); ++ early_iounmap(pcdp, 4096); + return rc; + } +diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c +index d4c4937..fae2050 100644 +--- a/drivers/gpu/drm/i915/intel_dp.c ++++ b/drivers/gpu/drm/i915/intel_dp.c +@@ -708,8 +708,8 @@ intel_dp_mode_fixup(struct drm_encoder *encoder, struct drm_display_mode *mode, + + bpp = adjusted_mode->private_flags & INTEL_MODE_DP_FORCE_6BPC ? 18 : 24; + +- for (lane_count = 1; lane_count <= max_lane_count; lane_count <<= 1) { +- for (clock = 0; clock <= max_clock; clock++) { ++ for (clock = 0; clock <= max_clock; clock++) { ++ for (lane_count = 1; lane_count <= max_lane_count; lane_count <<= 1) { + int link_avail = intel_dp_max_data_rate(intel_dp_link_clock(bws[clock]), lane_count); + + if (intel_dp_link_required(mode->clock, bpp) +diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c +index a6dcd18..96532bc 100644 +--- a/drivers/input/mouse/synaptics.c ++++ b/drivers/input/mouse/synaptics.c +@@ -40,11 +40,28 @@ + * Note that newer firmware allows querying device for maximum useable + * coordinates. 
+ */ ++#define XMIN 0 ++#define XMAX 6143 ++#define YMIN 0 ++#define YMAX 6143 + #define XMIN_NOMINAL 1472 + #define XMAX_NOMINAL 5472 + #define YMIN_NOMINAL 1408 + #define YMAX_NOMINAL 4448 + ++/* Size in bits of absolute position values reported by the hardware */ ++#define ABS_POS_BITS 13 ++ ++/* ++ * Any position values from the hardware above the following limits are ++ * treated as "wrapped around negative" values that have been truncated to ++ * the 13-bit reporting range of the hardware. These are just reasonable ++ * guesses and can be adjusted if hardware is found that operates outside ++ * of these parameters. ++ */ ++#define X_MAX_POSITIVE (((1 << ABS_POS_BITS) + XMAX) / 2) ++#define Y_MAX_POSITIVE (((1 << ABS_POS_BITS) + YMAX) / 2) ++ + /* + * Synaptics touchpads report the y coordinate from bottom to top, which is + * opposite from what userspace expects. +@@ -544,6 +561,12 @@ static int synaptics_parse_hw_state(const unsigned char buf[], + hw->right = (buf[0] & 0x02) ? 1 : 0; + } + ++ /* Convert wrap-around values to negative */ ++ if (hw->x > X_MAX_POSITIVE) ++ hw->x -= 1 << ABS_POS_BITS; ++ if (hw->y > Y_MAX_POSITIVE) ++ hw->y -= 1 << ABS_POS_BITS; ++ + return 0; + } + +diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c +index 532a902..d432032 100644 +--- a/drivers/md/dm-thin.c ++++ b/drivers/md/dm-thin.c +@@ -19,7 +19,7 @@ + /* + * Tunable constants + */ +-#define ENDIO_HOOK_POOL_SIZE 10240 ++#define ENDIO_HOOK_POOL_SIZE 1024 + #define DEFERRED_SET_SIZE 64 + #define MAPPING_POOL_SIZE 1024 + #define PRISON_CELLS 1024 +@@ -857,7 +857,7 @@ static void process_prepared_mapping(struct new_mapping *m) + + if (m->err) { + cell_error(m->cell); +- return; ++ goto out; + } + + /* +@@ -869,7 +869,7 @@ static void process_prepared_mapping(struct new_mapping *m) + if (r) { + DMERR("dm_thin_insert_block() failed"); + cell_error(m->cell); +- return; ++ goto out; + } + + /* +@@ -884,6 +884,7 @@ static void process_prepared_mapping(struct new_mapping *m) + } else + cell_defer(tc, m->cell, m->data_block); + ++out: + list_del(&m->list); + mempool_free(m, tc->pool->mapping_pool); + } +diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c +index 2d97bf0..62306e5 100644 +--- a/drivers/md/raid1.c ++++ b/drivers/md/raid1.c +@@ -2321,7 +2321,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp + /* There is nowhere to write, so all non-sync + * drives must be failed - so we are finished + */ +- sector_t rv = max_sector - sector_nr; ++ sector_t rv; ++ if (min_bad > 0) ++ max_sector = sector_nr + min_bad; ++ rv = max_sector - sector_nr; + *skipped = 1; + put_buf(r1_bio); + return rv; +diff --git a/drivers/media/rc/ene_ir.c b/drivers/media/rc/ene_ir.c +index ed77c6d..5327061 100644 +--- a/drivers/media/rc/ene_ir.c ++++ b/drivers/media/rc/ene_ir.c +@@ -1018,6 +1018,8 @@ static int ene_probe(struct pnp_dev *pnp_dev, const struct pnp_device_id *id) + + spin_lock_init(&dev->hw_lock); + ++ dev->hw_io = pnp_port_start(pnp_dev, 0); ++ + pnp_set_drvdata(pnp_dev, dev); + dev->pnp_dev = pnp_dev; + +@@ -1072,7 +1074,6 @@ static int ene_probe(struct pnp_dev *pnp_dev, const struct pnp_device_id *id) + + /* claim the resources */ + error = -EBUSY; +- dev->hw_io = pnp_port_start(pnp_dev, 0); + if (!request_region(dev->hw_io, ENE_IO_SIZE, ENE_DRIVER_NAME)) { + dev->hw_io = -1; + dev->irq = -1; +diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c +index 60107ee..4eec7b7 100644 +--- a/drivers/mfd/ab3100-core.c ++++ b/drivers/mfd/ab3100-core.c +@@ -409,8 +409,6 @@ 
static irqreturn_t ab3100_irq_handler(int irq, void *data) + u32 fatevent; + int err; + +- add_interrupt_randomness(irq); +- + err = ab3100_get_register_page_interruptible(ab3100, AB3100_EVENTA1, + event_regs, 3); + if (err) +diff --git a/drivers/mfd/wm831x-otp.c b/drivers/mfd/wm831x-otp.c +index f742745..b90f3e0 100644 +--- a/drivers/mfd/wm831x-otp.c ++++ b/drivers/mfd/wm831x-otp.c +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -66,6 +67,7 @@ static DEVICE_ATTR(unique_id, 0444, wm831x_unique_id_show, NULL); + + int wm831x_otp_init(struct wm831x *wm831x) + { ++ char uuid[WM831X_UNIQUE_ID_LEN]; + int ret; + + ret = device_create_file(wm831x->dev, &dev_attr_unique_id); +@@ -73,6 +75,12 @@ int wm831x_otp_init(struct wm831x *wm831x) + dev_err(wm831x->dev, "Unique ID attribute not created: %d\n", + ret); + ++ ret = wm831x_unique_id_read(wm831x, uuid); ++ if (ret == 0) ++ add_device_randomness(uuid, sizeof(uuid)); ++ else ++ dev_err(wm831x->dev, "Failed to read UUID: %d\n", ret); ++ + return ret; + } + +diff --git a/drivers/net/wireless/rt2x00/rt2800usb.c b/drivers/net/wireless/rt2x00/rt2800usb.c +index bdf960b..ae7528b 100644 +--- a/drivers/net/wireless/rt2x00/rt2800usb.c ++++ b/drivers/net/wireless/rt2x00/rt2800usb.c +@@ -925,6 +925,7 @@ static struct usb_device_id rt2800usb_device_table[] = { + { USB_DEVICE(0x0411, 0x015d) }, + { USB_DEVICE(0x0411, 0x016f) }, + { USB_DEVICE(0x0411, 0x01a2) }, ++ { USB_DEVICE(0x0411, 0x01ee) }, + /* Corega */ + { USB_DEVICE(0x07aa, 0x002f) }, + { USB_DEVICE(0x07aa, 0x003c) }, +diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c +index d1049ee..26fba2d 100644 +--- a/drivers/platform/x86/asus-wmi.c ++++ b/drivers/platform/x86/asus-wmi.c +@@ -1431,14 +1431,9 @@ static int asus_wmi_platform_init(struct asus_wmi *asus) + */ + if (!asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, 0, 0, NULL)) + asus->dsts_id = ASUS_WMI_METHODID_DSTS; +- else if (!asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS2, 0, 0, NULL)) ++ else + asus->dsts_id = ASUS_WMI_METHODID_DSTS2; + +- if (!asus->dsts_id) { +- pr_err("Can't find DSTS"); +- return -ENODEV; +- } +- + /* CWAP allow to define the behavior of the Fn+F2 key, + * this method doesn't seems to be present on Eee PCs */ + if (asus->driver->wapf >= 0) +diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c +index bdc909b..f3c2110 100644 +--- a/drivers/rtc/rtc-wm831x.c ++++ b/drivers/rtc/rtc-wm831x.c +@@ -24,7 +24,7 @@ + #include + #include + #include +- ++#include + + /* + * R16416 (0x4020) - RTC Write Counter +@@ -96,6 +96,26 @@ struct wm831x_rtc { + unsigned int alarm_enabled:1; + }; + ++static void wm831x_rtc_add_randomness(struct wm831x *wm831x) ++{ ++ int ret; ++ u16 reg; ++ ++ /* ++ * The write counter contains a pseudo-random number which is ++ * regenerated every time we set the RTC so it should be a ++ * useful per-system source of entropy. 
++ */ ++ ret = wm831x_reg_read(wm831x, WM831X_RTC_WRITE_COUNTER); ++ if (ret >= 0) { ++ reg = ret; ++ add_device_randomness(®, sizeof(reg)); ++ } else { ++ dev_warn(wm831x->dev, "Failed to read RTC write counter: %d\n", ++ ret); ++ } ++} ++ + /* + * Read current time and date in RTC + */ +@@ -449,6 +469,8 @@ static int wm831x_rtc_probe(struct platform_device *pdev) + alm_irq, ret); + } + ++ wm831x_rtc_add_randomness(wm831x); ++ + return 0; + + err: +diff --git a/drivers/staging/media/lirc/lirc_sir.c b/drivers/staging/media/lirc/lirc_sir.c +index 6903d39..90e9e32 100644 +--- a/drivers/staging/media/lirc/lirc_sir.c ++++ b/drivers/staging/media/lirc/lirc_sir.c +@@ -53,6 +53,7 @@ + #include + #include + #include ++#include + #ifdef LIRC_ON_SA1100 + #include + #ifdef CONFIG_SA1100_COLLIE +@@ -488,9 +489,11 @@ static struct lirc_driver driver = { + .owner = THIS_MODULE, + }; + ++static struct platform_device *lirc_sir_dev; + + static int init_chrdev(void) + { ++ driver.dev = &lirc_sir_dev->dev; + driver.minor = lirc_register_driver(&driver); + if (driver.minor < 0) { + printk(KERN_ERR LIRC_DRIVER_NAME ": init_chrdev() failed.\n"); +@@ -1216,20 +1219,71 @@ static int init_lirc_sir(void) + return 0; + } + ++static int __devinit lirc_sir_probe(struct platform_device *dev) ++{ ++ return 0; ++} ++ ++static int __devexit lirc_sir_remove(struct platform_device *dev) ++{ ++ return 0; ++} ++ ++static struct platform_driver lirc_sir_driver = { ++ .probe = lirc_sir_probe, ++ .remove = __devexit_p(lirc_sir_remove), ++ .driver = { ++ .name = "lirc_sir", ++ .owner = THIS_MODULE, ++ }, ++}; + + static int __init lirc_sir_init(void) + { + int retval; + ++ retval = platform_driver_register(&lirc_sir_driver); ++ if (retval) { ++ printk(KERN_ERR LIRC_DRIVER_NAME ": Platform driver register " ++ "failed!\n"); ++ return -ENODEV; ++ } ++ ++ lirc_sir_dev = platform_device_alloc("lirc_dev", 0); ++ if (!lirc_sir_dev) { ++ printk(KERN_ERR LIRC_DRIVER_NAME ": Platform device alloc " ++ "failed!\n"); ++ retval = -ENOMEM; ++ goto pdev_alloc_fail; ++ } ++ ++ retval = platform_device_add(lirc_sir_dev); ++ if (retval) { ++ printk(KERN_ERR LIRC_DRIVER_NAME ": Platform device add " ++ "failed!\n"); ++ retval = -ENODEV; ++ goto pdev_add_fail; ++ } ++ + retval = init_chrdev(); + if (retval < 0) +- return retval; ++ goto fail; ++ + retval = init_lirc_sir(); + if (retval) { + drop_chrdev(); +- return retval; ++ goto fail; + } ++ + return 0; ++ ++fail: ++ platform_device_del(lirc_sir_dev); ++pdev_add_fail: ++ platform_device_put(lirc_sir_dev); ++pdev_alloc_fail: ++ platform_driver_unregister(&lirc_sir_driver); ++ return retval; + } + + static void __exit lirc_sir_exit(void) +@@ -1237,6 +1291,8 @@ static void __exit lirc_sir_exit(void) + drop_hardware(); + drop_chrdev(); + drop_port(); ++ platform_device_unregister(lirc_sir_dev); ++ platform_driver_unregister(&lirc_sir_driver); + printk(KERN_INFO LIRC_DRIVER_NAME ": Uninstalled.\n"); + } + +diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c +index a4b192d..08b92a6 100644 +--- a/drivers/tty/serial/pch_uart.c ++++ b/drivers/tty/serial/pch_uart.c +@@ -660,7 +660,8 @@ static void pch_dma_rx_complete(void *arg) + tty_flip_buffer_push(tty); + tty_kref_put(tty); + async_tx_ack(priv->desc_rx); +- pch_uart_hal_enable_interrupt(priv, PCH_UART_HAL_RX_INT); ++ pch_uart_hal_enable_interrupt(priv, PCH_UART_HAL_RX_INT | ++ PCH_UART_HAL_RX_ERR_INT); + } + + static void pch_dma_tx_complete(void *arg) +@@ -715,7 +716,8 @@ static int handle_rx_to(struct eg20t_port *priv) + int 
rx_size; + int ret; + if (!priv->start_rx) { +- pch_uart_hal_disable_interrupt(priv, PCH_UART_HAL_RX_INT); ++ pch_uart_hal_disable_interrupt(priv, PCH_UART_HAL_RX_INT | ++ PCH_UART_HAL_RX_ERR_INT); + return 0; + } + buf = &priv->rxbuf; +@@ -977,11 +979,13 @@ static irqreturn_t pch_uart_interrupt(int irq, void *dev_id) + case PCH_UART_IID_RDR: /* Received Data Ready */ + if (priv->use_dma) { + pch_uart_hal_disable_interrupt(priv, +- PCH_UART_HAL_RX_INT); ++ PCH_UART_HAL_RX_INT | ++ PCH_UART_HAL_RX_ERR_INT); + ret = dma_handle_rx(priv); + if (!ret) + pch_uart_hal_enable_interrupt(priv, +- PCH_UART_HAL_RX_INT); ++ PCH_UART_HAL_RX_INT | ++ PCH_UART_HAL_RX_ERR_INT); + } else { + ret = handle_rx(priv); + } +@@ -1107,7 +1111,8 @@ static void pch_uart_stop_rx(struct uart_port *port) + struct eg20t_port *priv; + priv = container_of(port, struct eg20t_port, port); + priv->start_rx = 0; +- pch_uart_hal_disable_interrupt(priv, PCH_UART_HAL_RX_INT); ++ pch_uart_hal_disable_interrupt(priv, PCH_UART_HAL_RX_INT | ++ PCH_UART_HAL_RX_ERR_INT); + priv->int_dis_flag = 1; + } + +@@ -1163,6 +1168,7 @@ static int pch_uart_startup(struct uart_port *port) + break; + case 16: + fifo_size = PCH_UART_HAL_FIFO16; ++ break; + case 1: + default: + fifo_size = PCH_UART_HAL_FIFO_DIS; +@@ -1200,7 +1206,8 @@ static int pch_uart_startup(struct uart_port *port) + pch_request_dma(port); + + priv->start_rx = 1; +- pch_uart_hal_enable_interrupt(priv, PCH_UART_HAL_RX_INT); ++ pch_uart_hal_enable_interrupt(priv, PCH_UART_HAL_RX_INT | ++ PCH_UART_HAL_RX_ERR_INT); + uart_update_timeout(port, CS8, default_baud); + + return 0; +@@ -1258,7 +1265,7 @@ static void pch_uart_set_termios(struct uart_port *port, + stb = PCH_UART_HAL_STB1; + + if (termios->c_cflag & PARENB) { +- if (!(termios->c_cflag & PARODD)) ++ if (termios->c_cflag & PARODD) + parity = PCH_UART_HAL_PARITY_ODD; + else + parity = PCH_UART_HAL_PARITY_EVEN; +diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c +index 175b6bb..52340cc 100644 +--- a/drivers/usb/core/hub.c ++++ b/drivers/usb/core/hub.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1897,6 +1898,14 @@ int usb_new_device(struct usb_device *udev) + /* Tell the world! */ + announce_device(udev); + ++ if (udev->serial) ++ add_device_randomness(udev->serial, strlen(udev->serial)); ++ if (udev->product) ++ add_device_randomness(udev->product, strlen(udev->product)); ++ if (udev->manufacturer) ++ add_device_randomness(udev->manufacturer, ++ strlen(udev->manufacturer)); ++ + device_enable_async_suspend(&udev->dev); + /* Register the device. 
The device driver is responsible + * for configuring the device and invoking the add-device +diff --git a/drivers/usb/early/ehci-dbgp.c b/drivers/usb/early/ehci-dbgp.c +index 1fc8f12..347bb05 100644 +--- a/drivers/usb/early/ehci-dbgp.c ++++ b/drivers/usb/early/ehci-dbgp.c +@@ -450,7 +450,7 @@ static int dbgp_ehci_startup(void) + writel(FLAG_CF, &ehci_regs->configured_flag); + + /* Wait until the controller is no longer halted */ +- loop = 10; ++ loop = 1000; + do { + status = readl(&ehci_regs->status); + if (!(status & STS_HALT)) +diff --git a/drivers/video/smscufx.c b/drivers/video/smscufx.c +index aaccffa..dd9533a 100644 +--- a/drivers/video/smscufx.c ++++ b/drivers/video/smscufx.c +@@ -904,7 +904,7 @@ static ssize_t ufx_ops_write(struct fb_info *info, const char __user *buf, + result = fb_sys_write(info, buf, count, ppos); + + if (result > 0) { +- int start = max((int)(offset / info->fix.line_length) - 1, 0); ++ int start = max((int)(offset / info->fix.line_length), 0); + int lines = min((u32)((result / info->fix.line_length) + 1), + (u32)info->var.yres); + +diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c +index 24a49d4..1585db1 100644 +--- a/fs/exofs/ore.c ++++ b/fs/exofs/ore.c +@@ -837,11 +837,11 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) + bio->bi_rw |= REQ_WRITE; + } + +- osd_req_write(or, _ios_obj(ios, dev), per_dev->offset, +- bio, per_dev->length); ++ osd_req_write(or, _ios_obj(ios, cur_comp), ++ per_dev->offset, bio, per_dev->length); + ORE_DBGMSG("write(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", +- _LLU(_ios_obj(ios, dev)->id), ++ _LLU(_ios_obj(ios, cur_comp)->id), + _LLU(per_dev->offset), + _LLU(per_dev->length), dev); + } else if (ios->kern_buff) { +@@ -853,20 +853,20 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) + (ios->si.unit_off + ios->length > + ios->layout->stripe_unit)); + +- ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), ++ ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp), + per_dev->offset, + ios->kern_buff, ios->length); + if (unlikely(ret)) + goto out; + ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", +- _LLU(_ios_obj(ios, dev)->id), ++ _LLU(_ios_obj(ios, cur_comp)->id), + _LLU(per_dev->offset), + _LLU(ios->length), per_dev->dev); + } else { +- osd_req_set_attributes(or, _ios_obj(ios, dev)); ++ osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); + ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", +- _LLU(_ios_obj(ios, dev)->id), ++ _LLU(_ios_obj(ios, cur_comp)->id), + ios->out_attr_len, dev); + } + +diff --git a/fs/nfs/file.c b/fs/nfs/file.c +index c43a452..961e562 100644 +--- a/fs/nfs/file.c ++++ b/fs/nfs/file.c +@@ -452,8 +452,11 @@ static int nfs_release_page(struct page *page, gfp_t gfp) + + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); + +- /* Only do I/O if gfp is a superset of GFP_KERNEL */ +- if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) { ++ /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not ++ * doing this memory reclaim for a fs-related allocation. 
++ */ ++ if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && ++ !(current->flags & PF_FSTRANS)) { + int how = FLUSH_SYNC; + + /* Don't let kswapd deadlock waiting for OOM RPC calls */ +diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c +index 9cfa60a..87a1746 100644 +--- a/fs/nfsd/nfs4xdr.c ++++ b/fs/nfsd/nfs4xdr.c +@@ -2236,7 +2236,7 @@ out_acl: + if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { + if ((buflen -= 4) < 0) + goto out_resource; +- WRITE32(1); ++ WRITE32(0); + } + if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { + if ((buflen -= 4) < 0) +diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c +index ac258be..c598cfb 100644 +--- a/fs/nilfs2/ioctl.c ++++ b/fs/nilfs2/ioctl.c +@@ -182,7 +182,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, + if (copy_from_user(&cpmode, argp, sizeof(cpmode))) + goto out; + +- down_read(&inode->i_sb->s_umount); ++ mutex_lock(&nilfs->ns_snapshot_mount_mutex); + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_change_cpmode( +@@ -192,7 +192,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, + else + nilfs_transaction_commit(inode->i_sb); /* never fails */ + +- up_read(&inode->i_sb->s_umount); ++ mutex_unlock(&nilfs->ns_snapshot_mount_mutex); + out: + mnt_drop_write(filp->f_path.mnt); + return ret; +diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c +index 8351c44..97bfbdd 100644 +--- a/fs/nilfs2/super.c ++++ b/fs/nilfs2/super.c +@@ -951,6 +951,8 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, + struct nilfs_root *root; + int ret; + ++ mutex_lock(&nilfs->ns_snapshot_mount_mutex); ++ + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno); + up_read(&nilfs->ns_segctor_sem); +@@ -975,6 +977,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, + ret = nilfs_get_root_dentry(s, root, root_dentry); + nilfs_put_root(root); + out: ++ mutex_unlock(&nilfs->ns_snapshot_mount_mutex); + return ret; + } + +diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c +index 35a8970..1c98f53 100644 +--- a/fs/nilfs2/the_nilfs.c ++++ b/fs/nilfs2/the_nilfs.c +@@ -76,6 +76,7 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) + nilfs->ns_bdev = bdev; + atomic_set(&nilfs->ns_ndirtyblks, 0); + init_rwsem(&nilfs->ns_sem); ++ mutex_init(&nilfs->ns_snapshot_mount_mutex); + INIT_LIST_HEAD(&nilfs->ns_dirty_files); + INIT_LIST_HEAD(&nilfs->ns_gc_inodes); + spin_lock_init(&nilfs->ns_inode_lock); +diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h +index 9992b11..de7435f 100644 +--- a/fs/nilfs2/the_nilfs.h ++++ b/fs/nilfs2/the_nilfs.h +@@ -47,6 +47,7 @@ enum { + * @ns_flags: flags + * @ns_bdev: block device + * @ns_sem: semaphore for shared states ++ * @ns_snapshot_mount_mutex: mutex to protect snapshot mounts + * @ns_sbh: buffer heads of on-disk super blocks + * @ns_sbp: pointers to super block data + * @ns_sbwtime: previous write time of super block +@@ -99,6 +100,7 @@ struct the_nilfs { + + struct block_device *ns_bdev; + struct rw_semaphore ns_sem; ++ struct mutex ns_snapshot_mount_mutex; + + /* + * used for +diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h +index c5ed2f1..a2227f7 100644 +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -41,6 +41,9 @@ int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, + unsigned long *, int *, int, unsigned int flags); + void unmap_hugepage_range(struct vm_area_struct *, + unsigned long, unsigned long, struct page *); ++void 
__unmap_hugepage_range_final(struct vm_area_struct *vma, ++ unsigned long start, unsigned long end, ++ struct page *ref_page); + void __unmap_hugepage_range(struct vm_area_struct *, + unsigned long, unsigned long, struct page *); + int hugetlb_prefault(struct address_space *, struct vm_area_struct *); +@@ -99,6 +102,13 @@ static inline unsigned long hugetlb_total_pages(void) + #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) + #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) + #define unmap_hugepage_range(vma, start, end, page) BUG() ++static inline void __unmap_hugepage_range_final(struct vm_area_struct *vma, ++ unsigned long start, unsigned long end, ++ struct page *ref_page) ++{ ++ BUG(); ++} ++ + static inline void hugetlb_report_meminfo(struct seq_file *m) + { + } +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index df53fdf..cdde2b3 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -124,8 +124,17 @@ extern struct group_info init_groups; + + extern struct cred init_cred; + ++extern struct task_group root_task_group; ++ ++#ifdef CONFIG_CGROUP_SCHED ++# define INIT_CGROUP_SCHED(tsk) \ ++ .sched_task_group = &root_task_group, ++#else ++# define INIT_CGROUP_SCHED(tsk) ++#endif ++ + #ifdef CONFIG_PERF_EVENTS +-# define INIT_PERF_EVENTS(tsk) \ ++# define INIT_PERF_EVENTS(tsk) \ + .perf_event_mutex = \ + __MUTEX_INITIALIZER(tsk.perf_event_mutex), \ + .perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list), +@@ -162,6 +171,7 @@ extern struct cred init_cred; + }, \ + .tasks = LIST_HEAD_INIT(tsk.tasks), \ + INIT_PUSHABLE_TASKS(tsk) \ ++ INIT_CGROUP_SCHED(tsk) \ + .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ + .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ + .real_parent = &tsk, \ +diff --git a/include/linux/random.h b/include/linux/random.h +index 8f74538..29e217a 100644 +--- a/include/linux/random.h ++++ b/include/linux/random.h +@@ -50,11 +50,13 @@ struct rnd_state { + + extern void rand_initialize_irq(int irq); + ++extern void add_device_randomness(const void *, unsigned int); + extern void add_input_randomness(unsigned int type, unsigned int code, + unsigned int value); +-extern void add_interrupt_randomness(int irq); ++extern void add_interrupt_randomness(int irq, int irq_flags); + + extern void get_random_bytes(void *buf, int nbytes); ++extern void get_random_bytes_arch(void *buf, int nbytes); + void generate_random_uuid(unsigned char uuid_out[16]); + + #ifndef MODULE +diff --git a/include/linux/sched.h b/include/linux/sched.h +index d336c35..1e86bb4 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1236,6 +1236,9 @@ struct task_struct { + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#ifdef CONFIG_CGROUP_SCHED ++ struct task_group *sched_task_group; ++#endif + + #ifdef CONFIG_PREEMPT_NOTIFIERS + /* list of struct preempt_notifier: */ +@@ -2646,7 +2649,7 @@ extern int sched_group_set_rt_period(struct task_group *tg, + extern long sched_group_rt_period(struct task_group *tg); + extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); + #endif +-#endif ++#endif /* CONFIG_CGROUP_SCHED */ + + extern int task_can_switch_user(struct user_struct *up, + struct task_struct *tsk); +diff --git a/kernel/futex.c b/kernel/futex.c +index 866c9d5..80fb1c6 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, + * @uaddr2: the pi futex we will take prior to 
returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to +- * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and +- * complete the acquisition of the rt_mutex prior to returning to userspace. +- * This ensures the rt_mutex maintains an owner when it has waiters; without +- * one, the pi logic wouldn't know which task to boost/deboost, if there was a +- * need to. ++ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake ++ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to ++ * userspace. This ensures the rt_mutex maintains an owner when it has waiters; ++ * without one, the pi logic would not know which task to boost/deboost, if ++ * there was a need to. + * + * We call schedule in futex_wait_queue_me() when we enqueue and return there + * via the following: +@@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + struct futex_q q = futex_q_init; + int res, ret; + ++ if (uaddr == uaddr2) ++ return -EINVAL; ++ + if (!bitset) + return -EINVAL; + +@@ -2343,7 +2346,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + * signal. futex_unlock_pi() will not destroy the lock_ptr nor + * the pi_state. + */ +- WARN_ON(!&q.pi_state); ++ WARN_ON(!q.pi_state); + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); + debug_rt_mutex_free_waiter(&rt_waiter); +@@ -2370,7 +2373,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + * fault, unlock the rt_mutex and return the fault to userspace. + */ + if (ret == -EFAULT) { +- if (rt_mutex_owner(pi_mutex) == current) ++ if (pi_mutex && rt_mutex_owner(pi_mutex) == current) + rt_mutex_unlock(pi_mutex); + } else if (ret == -EINTR) { + /* +diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c +index 470d08c..10e0772 100644 +--- a/kernel/irq/handle.c ++++ b/kernel/irq/handle.c +@@ -117,7 +117,7 @@ irqreturn_t + handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) + { + irqreturn_t retval = IRQ_NONE; +- unsigned int random = 0, irq = desc->irq_data.irq; ++ unsigned int flags = 0, irq = desc->irq_data.irq; + + do { + irqreturn_t res; +@@ -145,7 +145,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) + + /* Fall through to add to randomness */ + case IRQ_HANDLED: +- random |= action->flags; ++ flags |= action->flags; + break; + + default: +@@ -156,8 +156,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) + action = action->next; + } while (action); + +- if (random & IRQF_SAMPLE_RANDOM) +- add_interrupt_randomness(irq); ++ add_interrupt_randomness(irq, flags); + + if (!noirqdebug) + note_interrupt(irq, desc, retval); +diff --git a/kernel/sched.c b/kernel/sched.c +index 9cd8ca7..e0431c4 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -746,22 +746,19 @@ static inline int cpu_of(struct rq *rq) + /* + * Return the group to which this tasks belongs. + * +- * We use task_subsys_state_check() and extend the RCU verification with +- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each +- * task it moves into the cgroup. Therefore by holding either of those locks, +- * we pin the task to the current cgroup. 
++ * We cannot use task_subsys_state() and friends because the cgroup ++ * subsystem changes that value before the cgroup_subsys::attach() method ++ * is called, therefore we cannot pin it and might observe the wrong value. ++ * ++ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup ++ * core changes this before calling sched_move_task(). ++ * ++ * Instead we use a 'copy' which is updated from sched_move_task() while ++ * holding both task_struct::pi_lock and rq::lock. + */ + static inline struct task_group *task_group(struct task_struct *p) + { +- struct task_group *tg; +- struct cgroup_subsys_state *css; +- +- css = task_subsys_state_check(p, cpu_cgroup_subsys_id, +- lockdep_is_held(&p->pi_lock) || +- lockdep_is_held(&task_rq(p)->lock)); +- tg = container_of(css, struct task_group, css); +- +- return autogroup_task_group(p, tg); ++ return p->sched_task_group; + } + + /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +@@ -2372,7 +2369,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. + * + * sched_move_task() holds both and thus holding either pins the cgroup, +- * see set_task_rq(). ++ * see task_group(). + * + * Furthermore, all task_rq users should acquire both locks, see + * task_rq_lock(). +@@ -8952,6 +8949,7 @@ void sched_destroy_group(struct task_group *tg) + */ + void sched_move_task(struct task_struct *tsk) + { ++ struct task_group *tg; + int on_rq, running; + unsigned long flags; + struct rq *rq; +@@ -8966,6 +8964,12 @@ void sched_move_task(struct task_struct *tsk) + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); + ++ tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, ++ lockdep_is_held(&tsk->sighand->siglock)), ++ struct task_group, css); ++ tg = autogroup_task_group(tsk, tg); ++ tsk->sched_task_group = tg; ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + if (tsk->sched_class->task_move_group) + tsk->sched_class->task_move_group(tsk, on_rq); +diff --git a/lib/vsprintf.c b/lib/vsprintf.c +index 993599e..d74c317 100644 +--- a/lib/vsprintf.c ++++ b/lib/vsprintf.c +@@ -886,7 +886,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, + * %pK cannot be used in IRQ context because its test + * for CAP_SYSLOG would be meaningless. + */ +- if (in_irq() || in_serving_softirq() || in_nmi()) { ++ if (kptr_restrict && (in_irq() || in_serving_softirq() || ++ in_nmi())) { + if (spec.field_width == -1) + spec.field_width = 2 * sizeof(void *); + return string(buf, end, "pK-error", spec); +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index b1e1bad..0f897b8 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -2382,6 +2382,25 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, + } + } + ++void __unmap_hugepage_range_final(struct vm_area_struct *vma, ++ unsigned long start, unsigned long end, ++ struct page *ref_page) ++{ ++ __unmap_hugepage_range(vma, start, end, ref_page); ++ ++ /* ++ * Clear this flag so that x86's huge_pmd_share page_table_shareable ++ * test will fail on a vma being torn down, and not grab a page table ++ * on its way out. We're lucky that the flag has such an appropriate ++ * name, and can in fact be safely cleared here. We could clear it ++ * before the __unmap_hugepage_range above, but all that's necessary ++ * is to clear it before releasing the i_mmap_mutex. 
This works ++ * because in the context this is called, the VMA is about to be ++ * destroyed and the i_mmap_mutex is held. ++ */ ++ vma->vm_flags &= ~VM_MAYSHARE; ++} ++ + void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct page *ref_page) + { +@@ -2939,9 +2958,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, + } + } + spin_unlock(&mm->page_table_lock); +- mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); +- ++ /* ++ * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare ++ * may have cleared our pud entry and done put_page on the page table: ++ * once we release i_mmap_mutex, another task can do the final put_page ++ * and that page table be reused and filled with junk. ++ */ + flush_tlb_range(vma, start, end); ++ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + } + + int hugetlb_reserve_pages(struct inode *inode, +diff --git a/mm/internal.h b/mm/internal.h +index 2189af4..0c26b5e 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -309,3 +309,5 @@ extern u64 hwpoison_filter_flags_mask; + extern u64 hwpoison_filter_flags_value; + extern u64 hwpoison_filter_memcg; + extern u32 hwpoison_filter_enable; ++ ++extern void set_pageblock_order(void); +diff --git a/mm/memory.c b/mm/memory.c +index 1b1ca17..70f5daf 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1358,8 +1358,11 @@ unsigned long unmap_vmas(struct mmu_gather *tlb, + * Since no pte has actually been setup, it is + * safe to do nothing in this case. + */ +- if (vma->vm_file) +- unmap_hugepage_range(vma, start, end, NULL); ++ if (vma->vm_file) { ++ mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); ++ __unmap_hugepage_range_final(vma, start, end, NULL); ++ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); ++ } + + start = end; + } else +diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c +index 9a611d3..862b608 100644 +--- a/mm/mmu_notifier.c ++++ b/mm/mmu_notifier.c +@@ -33,6 +33,24 @@ + void __mmu_notifier_release(struct mm_struct *mm) + { + struct mmu_notifier *mn; ++ struct hlist_node *n; ++ ++ /* ++ * RCU here will block mmu_notifier_unregister until ++ * ->release returns. ++ */ ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) ++ /* ++ * if ->release runs before mmu_notifier_unregister it ++ * must be handled as it's the only way for the driver ++ * to flush all existing sptes and stop the driver ++ * from establishing any more sptes before all the ++ * pages in the mm are freed. ++ */ ++ if (mn->ops->release) ++ mn->ops->release(mn, mm); ++ rcu_read_unlock(); + + spin_lock(&mm->mmu_notifier_mm->lock); + while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { +@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm) + * mmu_notifier_unregister to return. + */ + hlist_del_init_rcu(&mn->hlist); +- /* +- * RCU here will block mmu_notifier_unregister until +- * ->release returns. +- */ +- rcu_read_lock(); +- spin_unlock(&mm->mmu_notifier_mm->lock); +- /* +- * if ->release runs before mmu_notifier_unregister it +- * must be handled as it's the only way for the driver +- * to flush all existing sptes and stop the driver +- * from establishing any more sptes before all the +- * pages in the mm are freed. 
+- */ +- if (mn->ops->release) +- mn->ops->release(mn, mm); +- rcu_read_unlock(); +- spin_lock(&mm->mmu_notifier_mm->lock); + } + spin_unlock(&mm->mmu_notifier_mm->lock); + +@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) + { + BUG_ON(atomic_read(&mm->mm_count) <= 0); + +- spin_lock(&mm->mmu_notifier_mm->lock); + if (!hlist_unhashed(&mn->hlist)) { +- hlist_del_rcu(&mn->hlist); +- + /* + * RCU here will force exit_mmap to wait ->release to finish + * before freeing the pages. + */ + rcu_read_lock(); +- spin_unlock(&mm->mmu_notifier_mm->lock); ++ + /* + * exit_mmap will block in mmu_notifier_release to + * guarantee ->release is called before freeing the +@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) + if (mn->ops->release) + mn->ops->release(mn, mm); + rcu_read_unlock(); +- } else ++ ++ spin_lock(&mm->mmu_notifier_mm->lock); ++ hlist_del_rcu(&mn->hlist); + spin_unlock(&mm->mmu_notifier_mm->lock); ++ } + + /* + * Wait any running method to finish, of course including +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 065dbe8..6e51bf0 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -4281,25 +4281,24 @@ static inline void setup_usemap(struct pglist_data *pgdat, + + #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE + +-/* Return a sensible default order for the pageblock size. */ +-static inline int pageblock_default_order(void) +-{ +- if (HPAGE_SHIFT > PAGE_SHIFT) +- return HUGETLB_PAGE_ORDER; +- +- return MAX_ORDER-1; +-} +- + /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ +-static inline void __init set_pageblock_order(unsigned int order) ++void __init set_pageblock_order(void) + { ++ unsigned int order; ++ + /* Check that pageblock_nr_pages has not already been setup */ + if (pageblock_order) + return; + ++ if (HPAGE_SHIFT > PAGE_SHIFT) ++ order = HUGETLB_PAGE_ORDER; ++ else ++ order = MAX_ORDER - 1; ++ + /* + * Assume the largest contiguous order of interest is a huge page. +- * This value may be variable depending on boot parameters on IA64 ++ * This value may be variable depending on boot parameters on IA64 and ++ * powerpc. + */ + pageblock_order = order; + } +@@ -4307,15 +4306,13 @@ static inline void __init set_pageblock_order(unsigned int order) + + /* + * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() +- * and pageblock_default_order() are unused as pageblock_order is set +- * at compile-time. See include/linux/pageblock-flags.h for the values of +- * pageblock_order based on the kernel config ++ * is unused as pageblock_order is set at compile-time. 
See ++ * include/linux/pageblock-flags.h for the values of pageblock_order based on ++ * the kernel config + */ +-static inline int pageblock_default_order(unsigned int order) ++void __init set_pageblock_order(void) + { +- return MAX_ORDER-1; + } +-#define set_pageblock_order(x) do {} while (0) + + #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ + +@@ -4403,7 +4400,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, + if (!size) + continue; + +- set_pageblock_order(pageblock_default_order()); ++ set_pageblock_order(); + setup_usemap(pgdat, zone, size); + ret = init_currently_empty_zone(zone, zone_start_pfn, + size, MEMMAP_EARLY); +diff --git a/mm/sparse.c b/mm/sparse.c +index a8bc7d3..bf7d3cc 100644 +--- a/mm/sparse.c ++++ b/mm/sparse.c +@@ -486,6 +486,9 @@ void __init sparse_init(void) + struct page **map_map; + #endif + ++ /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ ++ set_pageblock_order(); ++ + /* + * map is using big page (aka 2M in x86 64 bit) + * usemap is less one page (aka 24 bytes) +diff --git a/net/core/dev.c b/net/core/dev.c +index 5738654..4b18703 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -1177,6 +1177,7 @@ static int __dev_open(struct net_device *dev) + net_dmaengine_get(); + dev_set_rx_mode(dev); + dev_activate(dev); ++ add_device_randomness(dev->dev_addr, dev->addr_len); + } + + return ret; +@@ -4841,6 +4842,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) + err = ops->ndo_set_mac_address(dev, sa); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); ++ add_device_randomness(dev->dev_addr, dev->addr_len); + return err; + } + EXPORT_SYMBOL(dev_set_mac_address); +@@ -5621,6 +5623,7 @@ int register_netdevice(struct net_device *dev) + dev_init_scheduler(dev); + dev_hold(dev); + list_netdevice(dev); ++ add_device_randomness(dev->dev_addr, dev->addr_len); + + /* Notify protocols, that a new device appeared. 
*/ + ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); +diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c +index 7f36b38..b856f87 100644 +--- a/net/core/drop_monitor.c ++++ b/net/core/drop_monitor.c +@@ -33,22 +33,19 @@ + #define TRACE_ON 1 + #define TRACE_OFF 0 + +-static void send_dm_alert(struct work_struct *unused); +- +- + /* + * Globals, our netlink socket pointer + * and the work handle that will send up + * netlink alerts + */ + static int trace_state = TRACE_OFF; +-static DEFINE_SPINLOCK(trace_state_lock); ++static DEFINE_MUTEX(trace_state_mutex); + + struct per_cpu_dm_data { +- struct work_struct dm_alert_work; +- struct sk_buff *skb; +- atomic_t dm_hit_count; +- struct timer_list send_timer; ++ spinlock_t lock; ++ struct sk_buff *skb; ++ struct work_struct dm_alert_work; ++ struct timer_list send_timer; + }; + + struct dm_hw_stat_delta { +@@ -74,56 +71,59 @@ static int dm_delay = 1; + static unsigned long dm_hw_check_delta = 2*HZ; + static LIST_HEAD(hw_stats_list); + +-static void reset_per_cpu_data(struct per_cpu_dm_data *data) ++static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) + { + size_t al; + struct net_dm_alert_msg *msg; + struct nlattr *nla; ++ struct sk_buff *skb; ++ unsigned long flags; + + al = sizeof(struct net_dm_alert_msg); + al += dm_hit_limit * sizeof(struct net_dm_drop_point); + al += sizeof(struct nlattr); + +- data->skb = genlmsg_new(al, GFP_KERNEL); +- genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family, +- 0, NET_DM_CMD_ALERT); +- nla = nla_reserve(data->skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg)); +- msg = nla_data(nla); +- memset(msg, 0, al); +- atomic_set(&data->dm_hit_count, dm_hit_limit); ++ skb = genlmsg_new(al, GFP_KERNEL); ++ ++ if (skb) { ++ genlmsg_put(skb, 0, 0, &net_drop_monitor_family, ++ 0, NET_DM_CMD_ALERT); ++ nla = nla_reserve(skb, NLA_UNSPEC, ++ sizeof(struct net_dm_alert_msg)); ++ msg = nla_data(nla); ++ memset(msg, 0, al); ++ } else { ++ mod_timer(&data->send_timer, jiffies + HZ / 10); ++ } ++ ++ spin_lock_irqsave(&data->lock, flags); ++ swap(data->skb, skb); ++ spin_unlock_irqrestore(&data->lock, flags); ++ ++ return skb; + } + +-static void send_dm_alert(struct work_struct *unused) ++static void send_dm_alert(struct work_struct *work) + { + struct sk_buff *skb; +- struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); ++ struct per_cpu_dm_data *data; + +- /* +- * Grab the skb we're about to send +- */ +- skb = data->skb; ++ data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + +- /* +- * Replace it with a new one +- */ +- reset_per_cpu_data(data); +- +- /* +- * Ship it! +- */ +- genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); ++ skb = reset_per_cpu_data(data); + ++ if (skb) ++ genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); + } + + /* + * This is the timer function to delay the sending of an alert + * in the event that more drops will arrive during the +- * hysteresis period. Note that it operates under the timer interrupt +- * so we don't need to disable preemption here ++ * hysteresis period. 
+ */ +-static void sched_send_work(unsigned long unused) ++static void sched_send_work(unsigned long _data) + { +- struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); ++ struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data; + + schedule_work(&data->dm_alert_work); + } +@@ -134,17 +134,19 @@ static void trace_drop_common(struct sk_buff *skb, void *location) + struct nlmsghdr *nlh; + struct nlattr *nla; + int i; +- struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); ++ struct sk_buff *dskb; ++ struct per_cpu_dm_data *data; ++ unsigned long flags; + ++ local_irq_save(flags); ++ data = &__get_cpu_var(dm_cpu_data); ++ spin_lock(&data->lock); ++ dskb = data->skb; + +- if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) { +- /* +- * we're already at zero, discard this hit +- */ ++ if (!dskb) + goto out; +- } + +- nlh = (struct nlmsghdr *)data->skb->data; ++ nlh = (struct nlmsghdr *)dskb->data; + nla = genlmsg_data(nlmsg_data(nlh)); + msg = nla_data(nla); + for (i = 0; i < msg->entries; i++) { +@@ -153,11 +155,12 @@ static void trace_drop_common(struct sk_buff *skb, void *location) + goto out; + } + } +- ++ if (msg->entries == dm_hit_limit) ++ goto out; + /* + * We need to create a new entry + */ +- __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point)); ++ __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point)); + nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); + memcpy(msg->points[msg->entries].pc, &location, sizeof(void *)); + msg->points[msg->entries].count = 1; +@@ -165,11 +168,11 @@ static void trace_drop_common(struct sk_buff *skb, void *location) + + if (!timer_pending(&data->send_timer)) { + data->send_timer.expires = jiffies + dm_delay * HZ; +- add_timer_on(&data->send_timer, smp_processor_id()); ++ add_timer(&data->send_timer); + } + + out: +- return; ++ spin_unlock_irqrestore(&data->lock, flags); + } + + static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location) +@@ -213,7 +216,7 @@ static int set_all_monitor_traces(int state) + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *temp; + +- spin_lock(&trace_state_lock); ++ mutex_lock(&trace_state_mutex); + + if (state == trace_state) { + rc = -EAGAIN; +@@ -252,7 +255,7 @@ static int set_all_monitor_traces(int state) + rc = -EINPROGRESS; + + out_unlock: +- spin_unlock(&trace_state_lock); ++ mutex_unlock(&trace_state_mutex); + + return rc; + } +@@ -295,12 +298,12 @@ static int dropmon_net_event(struct notifier_block *ev_block, + + new_stat->dev = dev; + new_stat->last_rx = jiffies; +- spin_lock(&trace_state_lock); ++ mutex_lock(&trace_state_mutex); + list_add_rcu(&new_stat->list, &hw_stats_list); +- spin_unlock(&trace_state_lock); ++ mutex_unlock(&trace_state_mutex); + break; + case NETDEV_UNREGISTER: +- spin_lock(&trace_state_lock); ++ mutex_lock(&trace_state_mutex); + list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { + if (new_stat->dev == dev) { + new_stat->dev = NULL; +@@ -311,7 +314,7 @@ static int dropmon_net_event(struct notifier_block *ev_block, + } + } + } +- spin_unlock(&trace_state_lock); ++ mutex_unlock(&trace_state_mutex); + break; + } + out: +@@ -367,13 +370,15 @@ static int __init init_net_drop_monitor(void) + + for_each_present_cpu(cpu) { + data = &per_cpu(dm_cpu_data, cpu); +- reset_per_cpu_data(data); + INIT_WORK(&data->dm_alert_work, send_dm_alert); + init_timer(&data->send_timer); +- data->send_timer.data = cpu; ++ data->send_timer.data = (unsigned long)data; + data->send_timer.function = sched_send_work; ++ 
spin_lock_init(&data->lock); ++ reset_per_cpu_data(data); + } + ++ + goto out; + + out_unreg: +diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c +index 2ef859a..05842ab 100644 +--- a/net/core/rtnetlink.c ++++ b/net/core/rtnetlink.c +@@ -1354,6 +1354,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, + goto errout; + send_addr_notify = 1; + modified = 1; ++ add_device_randomness(dev->dev_addr, dev->addr_len); + } + + if (tb[IFLA_MTU]) { +diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c +index 8761bf8..337c68b 100644 +--- a/net/sunrpc/rpcb_clnt.c ++++ b/net/sunrpc/rpcb_clnt.c +@@ -246,7 +246,7 @@ static int rpcb_create_local_unix(void) + if (IS_ERR(clnt)) { + dprintk("RPC: failed to create AF_LOCAL rpcbind " + "client (errno %ld).\n", PTR_ERR(clnt)); +- result = -PTR_ERR(clnt); ++ result = PTR_ERR(clnt); + goto out; + } + +@@ -293,7 +293,7 @@ static int rpcb_create_local_net(void) + if (IS_ERR(clnt)) { + dprintk("RPC: failed to create local rpcbind " + "client (errno %ld).\n", PTR_ERR(clnt)); +- result = -PTR_ERR(clnt); ++ result = PTR_ERR(clnt); + goto out; + } + +diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c +index 4e2b3b4..c90b832 100644 +--- a/net/sunrpc/sched.c ++++ b/net/sunrpc/sched.c +@@ -755,7 +755,9 @@ void rpc_execute(struct rpc_task *task) + + static void rpc_async_schedule(struct work_struct *work) + { ++ current->flags |= PF_FSTRANS; + __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); ++ current->flags &= ~PF_FSTRANS; + } + + /** +diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c +index b446e10..06cdbff 100644 +--- a/net/sunrpc/xprtrdma/transport.c ++++ b/net/sunrpc/xprtrdma/transport.c +@@ -200,6 +200,7 @@ xprt_rdma_connect_worker(struct work_struct *work) + int rc = 0; + + if (!xprt->shutdown) { ++ current->flags |= PF_FSTRANS; + xprt_clear_connected(xprt); + + dprintk("RPC: %s: %sconnect\n", __func__, +@@ -212,10 +213,10 @@ xprt_rdma_connect_worker(struct work_struct *work) + + out: + xprt_wake_pending_tasks(xprt, rc); +- + out_clear: + dprintk("RPC: %s: exit\n", __func__); + xprt_clear_connecting(xprt); ++ current->flags &= ~PF_FSTRANS; + } + + /* +diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c +index 55472c4..1a6edc7 100644 +--- a/net/sunrpc/xprtsock.c ++++ b/net/sunrpc/xprtsock.c +@@ -1895,6 +1895,8 @@ static void xs_local_setup_socket(struct work_struct *work) + if (xprt->shutdown) + goto out; + ++ current->flags |= PF_FSTRANS; ++ + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + status = __sock_create(xprt->xprt_net, AF_LOCAL, + SOCK_STREAM, 0, &sock, 1); +@@ -1928,6 +1930,7 @@ static void xs_local_setup_socket(struct work_struct *work) + out: + xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); ++ current->flags &= ~PF_FSTRANS; + } + + static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) +@@ -1970,6 +1973,8 @@ static void xs_udp_setup_socket(struct work_struct *work) + if (xprt->shutdown) + goto out; + ++ current->flags |= PF_FSTRANS; ++ + /* Start by resetting any existing state */ + xs_reset_transport(transport); + sock = xs_create_sock(xprt, transport, +@@ -1988,6 +1993,7 @@ static void xs_udp_setup_socket(struct work_struct *work) + out: + xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); ++ current->flags &= ~PF_FSTRANS; + } + + /* +@@ -2113,6 +2119,8 @@ static void xs_tcp_setup_socket(struct work_struct *work) + if (xprt->shutdown) + goto out; + ++ current->flags |= PF_FSTRANS; ++ + if (!sock) 
{ + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + sock = xs_create_sock(xprt, transport, +@@ -2162,6 +2170,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) + case -EINPROGRESS: + case -EALREADY: + xprt_clear_connecting(xprt); ++ current->flags &= ~PF_FSTRANS; + return; + case -EINVAL: + /* Happens, for instance, if the user specified a link +@@ -2174,6 +2183,7 @@ out_eagain: + out: + xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); ++ current->flags &= ~PF_FSTRANS; + } + + /** +diff --git a/net/wireless/util.c b/net/wireless/util.c +index 74d5292..b5e4c1c 100644 +--- a/net/wireless/util.c ++++ b/net/wireless/util.c +@@ -981,6 +981,9 @@ int cfg80211_can_change_interface(struct cfg80211_registered_device *rdev, + } + mutex_unlock(&rdev->devlist_mtx); + ++ if (total == 1) ++ return 0; ++ + for (i = 0; i < rdev->wiphy.n_iface_combinations; i++) { + const struct ieee80211_iface_combination *c; + struct ieee80211_iface_limit *limits; +diff --git a/sound/drivers/mpu401/mpu401_uart.c b/sound/drivers/mpu401/mpu401_uart.c +index 1cff331..4608c2c 100644 +--- a/sound/drivers/mpu401/mpu401_uart.c ++++ b/sound/drivers/mpu401/mpu401_uart.c +@@ -554,6 +554,7 @@ int snd_mpu401_uart_new(struct snd_card *card, int device, + spin_lock_init(&mpu->output_lock); + spin_lock_init(&mpu->timer_lock); + mpu->hardware = hardware; ++ mpu->irq = -1; + if (! (info_flags & MPU401_INFO_INTEGRATED)) { + int res_size = hardware == MPU401_HW_PC98II ? 4 : 2; + mpu->res = request_region(port, res_size, "MPU401 UART"); +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index 191fd78..2e2eb93 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -4809,6 +4809,15 @@ static int alc269_resume(struct hda_codec *codec) + } + #endif /* CONFIG_PM */ + ++static void alc269_fixup_pincfg_no_hp_to_lineout(struct hda_codec *codec, ++ const struct alc_fixup *fix, int action) ++{ ++ struct alc_spec *spec = codec->spec; ++ ++ if (action == ALC_FIXUP_ACT_PRE_PROBE) ++ spec->parse_flags = HDA_PINCFG_NO_HP_FIXUP; ++} ++ + static void alc269_fixup_hweq(struct hda_codec *codec, + const struct alc_fixup *fix, int action) + { +@@ -4909,6 +4918,8 @@ enum { + ALC269_FIXUP_DMIC, + ALC269VB_FIXUP_AMIC, + ALC269VB_FIXUP_DMIC, ++ ALC269_FIXUP_LENOVO_DOCK, ++ ALC269_FIXUP_PINCFG_NO_HP_TO_LINEOUT, + }; + + static const struct alc_fixup alc269_fixups[] = { +@@ -5029,6 +5040,20 @@ static const struct alc_fixup alc269_fixups[] = { + { } + }, + }, ++ [ALC269_FIXUP_LENOVO_DOCK] = { ++ .type = ALC_FIXUP_PINS, ++ .v.pins = (const struct alc_pincfg[]) { ++ { 0x19, 0x23a11040 }, /* dock mic */ ++ { 0x1b, 0x2121103f }, /* dock headphone */ ++ { } ++ }, ++ .chained = true, ++ .chain_id = ALC269_FIXUP_PINCFG_NO_HP_TO_LINEOUT ++ }, ++ [ALC269_FIXUP_PINCFG_NO_HP_TO_LINEOUT] = { ++ .type = ALC_FIXUP_FUNC, ++ .v.func = alc269_fixup_pincfg_no_hp_to_lineout, ++ }, + }; + + static const struct snd_pci_quirk alc269_fixup_tbl[] = { +@@ -5051,6 +5076,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x17aa, 0x21b8, "Thinkpad Edge 14", ALC269_FIXUP_SKU_IGNORE), + SND_PCI_QUIRK(0x17aa, 0x21ca, "Thinkpad L412", ALC269_FIXUP_SKU_IGNORE), + SND_PCI_QUIRK(0x17aa, 0x21e9, "Thinkpad Edge 15", ALC269_FIXUP_SKU_IGNORE), ++ SND_PCI_QUIRK(0x17aa, 0x21f6, "Thinkpad T530", ALC269_FIXUP_LENOVO_DOCK), ++ SND_PCI_QUIRK(0x17aa, 0x2203, "Thinkpad X230 Tablet", ALC269_FIXUP_LENOVO_DOCK), + SND_PCI_QUIRK(0x17aa, 0x3bf8, "Quanta FL1", ALC269_FIXUP_QUANTA_MUTE), + 
SND_PCI_QUIRK(0x17aa, 0x3bf8, "Lenovo Ideapd", ALC269_FIXUP_PCM_44K), + SND_PCI_QUIRK(0x17aa, 0x9e54, "LENOVO NB", ALC269_FIXUP_LENOVO_EAPD), +@@ -5109,6 +5136,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + static const struct alc_model_fixup alc269_fixup_models[] = { + {.id = ALC269_FIXUP_AMIC, .name = "laptop-amic"}, + {.id = ALC269_FIXUP_DMIC, .name = "laptop-dmic"}, ++ {.id = ALC269_FIXUP_LENOVO_DOCK, .name = "lenovo-dock"}, + {} + }; + +diff --git a/sound/pci/hda/patch_via.c b/sound/pci/hda/patch_via.c +index 1fe1308..7160ff2 100644 +--- a/sound/pci/hda/patch_via.c ++++ b/sound/pci/hda/patch_via.c +@@ -3227,7 +3227,7 @@ static void set_widgets_power_state_vt1718S(struct hda_codec *codec) + { + struct via_spec *spec = codec->spec; + int imux_is_smixer; +- unsigned int parm; ++ unsigned int parm, parm2; + /* MUX6 (1eh) = stereo mixer */ + imux_is_smixer = + snd_hda_codec_read(codec, 0x1e, 0, AC_VERB_GET_CONNECT_SEL, 0x00) == 5; +@@ -3250,7 +3250,7 @@ static void set_widgets_power_state_vt1718S(struct hda_codec *codec) + parm = AC_PWRST_D3; + set_pin_power_state(codec, 0x27, &parm); + snd_hda_codec_write(codec, 0x1a, 0, AC_VERB_SET_POWER_STATE, parm); +- snd_hda_codec_write(codec, 0xb, 0, AC_VERB_SET_POWER_STATE, parm); ++ parm2 = parm; /* for pin 0x0b */ + + /* PW2 (26h), AOW2 (ah) */ + parm = AC_PWRST_D3; +@@ -3265,6 +3265,9 @@ static void set_widgets_power_state_vt1718S(struct hda_codec *codec) + if (!spec->hp_independent_mode) /* check for redirected HP */ + set_pin_power_state(codec, 0x28, &parm); + snd_hda_codec_write(codec, 0x8, 0, AC_VERB_SET_POWER_STATE, parm); ++ if (!spec->hp_independent_mode && parm2 != AC_PWRST_D3) ++ parm = parm2; ++ snd_hda_codec_write(codec, 0xb, 0, AC_VERB_SET_POWER_STATE, parm); + /* MW9 (21h), Mw2 (1ah), AOW0 (8h) */ + snd_hda_codec_write(codec, 0x21, 0, AC_VERB_SET_POWER_STATE, + imux_is_smixer ? 
AC_PWRST_D0 : parm); +diff --git a/sound/soc/codecs/wm8962.c b/sound/soc/codecs/wm8962.c +index 07dd7eb..e97df24 100644 +--- a/sound/soc/codecs/wm8962.c ++++ b/sound/soc/codecs/wm8962.c +@@ -3105,6 +3105,9 @@ static int wm8962_set_bias_level(struct snd_soc_codec *codec, + /* VMID 2*250k */ + snd_soc_update_bits(codec, WM8962_PWR_MGMT_1, + WM8962_VMID_SEL_MASK, 0x100); ++ ++ if (codec->dapm.bias_level == SND_SOC_BIAS_OFF) ++ msleep(100); + break; + + case SND_SOC_BIAS_OFF: +diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c +index de61b8a..98c5774 100644 +--- a/sound/soc/codecs/wm8994.c ++++ b/sound/soc/codecs/wm8994.c +@@ -2508,7 +2508,7 @@ static int wm8994_hw_params(struct snd_pcm_substream *substream, + return -EINVAL; + } + +- bclk_rate = params_rate(params) * 2; ++ bclk_rate = params_rate(params) * 4; + switch (params_format(params)) { + case SNDRV_PCM_FORMAT_S16_LE: + bclk_rate *= 16; +diff --git a/sound/usb/clock.c b/sound/usb/clock.c +index 379baad..5e634a2 100644 +--- a/sound/usb/clock.c ++++ b/sound/usb/clock.c +@@ -111,7 +111,8 @@ static bool uac_clock_source_is_valid(struct snd_usb_audio *chip, int source_id) + return 0; + + /* If a clock source can't tell us whether it's valid, we assume it is */ +- if (!uac2_control_is_readable(cs_desc->bmControls, UAC2_CS_CONTROL_CLOCK_VALID)) ++ if (!uac2_control_is_readable(cs_desc->bmControls, ++ UAC2_CS_CONTROL_CLOCK_VALID - 1)) + return 1; + + err = snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), UAC2_CS_CUR, diff --git a/3.2.34/bump/1027_linux-3.2.28.patch b/3.2.34/bump/1027_linux-3.2.28.patch new file mode 100644 index 0000000..4dbba4b --- /dev/null +++ b/3.2.34/bump/1027_linux-3.2.28.patch @@ -0,0 +1,1114 @@ +diff --git a/Makefile b/Makefile +index bdf851f..5368961 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 2 +-SUBLEVEL = 27 ++SUBLEVEL = 28 + EXTRAVERSION = + NAME = Saber-toothed Squirrel + +diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig +index 6ee781b..3ee3e84 100644 +--- a/arch/arm/configs/mxs_defconfig ++++ b/arch/arm/configs/mxs_defconfig +@@ -32,7 +32,6 @@ CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_AEABI=y +-CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 + CONFIG_AUTO_ZRELADDR=y + CONFIG_FPE_NWFPE=y + CONFIG_NET=y +diff --git a/arch/arm/mach-pxa/raumfeld.c b/arch/arm/mach-pxa/raumfeld.c +index f0c05f4..ae7786d 100644 +--- a/arch/arm/mach-pxa/raumfeld.c ++++ b/arch/arm/mach-pxa/raumfeld.c +@@ -951,12 +951,12 @@ static struct i2c_board_info raumfeld_connector_i2c_board_info __initdata = { + + static struct eeti_ts_platform_data eeti_ts_pdata = { + .irq_active_high = 1, ++ .irq_gpio = GPIO_TOUCH_IRQ, + }; + + static struct i2c_board_info raumfeld_controller_i2c_board_info __initdata = { + .type = "eeti_ts", + .addr = 0x0a, +- .irq = gpio_to_irq(GPIO_TOUCH_IRQ), + .platform_data = &eeti_ts_pdata, + }; + +diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c +index 84a9828..38c6645 100644 +--- a/arch/s390/kernel/compat_linux.c ++++ b/arch/s390/kernel/compat_linux.c +@@ -615,7 +615,6 @@ asmlinkage unsigned long old32_mmap(struct mmap_arg_struct_emu31 __user *arg) + return -EFAULT; + if (a.offset & ~PAGE_MASK) + return -EINVAL; +- a.addr = (unsigned long) compat_ptr(a.addr); + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); + } +@@ -626,7 +625,6 @@ asmlinkage long sys32_mmap2(struct mmap_arg_struct_emu31 __user *arg) + + if (copy_from_user(&a, arg, sizeof(a))) + 
return -EFAULT; +- a.addr = (unsigned long) compat_ptr(a.addr); + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); + } + +diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S +index 18c51df..25408d3 100644 +--- a/arch/s390/kernel/compat_wrapper.S ++++ b/arch/s390/kernel/compat_wrapper.S +@@ -1636,7 +1636,7 @@ ENTRY(compat_sys_process_vm_readv_wrapper) + llgfr %r6,%r6 # unsigned long + llgf %r0,164(%r15) # unsigned long + stg %r0,160(%r15) +- jg sys_process_vm_readv ++ jg compat_sys_process_vm_readv + + ENTRY(compat_sys_process_vm_writev_wrapper) + lgfr %r2,%r2 # compat_pid_t +@@ -1646,4 +1646,4 @@ ENTRY(compat_sys_process_vm_writev_wrapper) + llgfr %r6,%r6 # unsigned long + llgf %r0,164(%r15) # unsigned long + stg %r0,160(%r15) +- jg sys_process_vm_writev ++ jg compat_sys_process_vm_writev +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 7315488..407789b 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -1956,6 +1956,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) + #endif + CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | + CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | ++ CPU_BASED_RDPMC_EXITING | + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + /* + * We can allow some features even when not supported by the +diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h +index d62c731..c364358 100644 +--- a/drivers/gpu/drm/i915/i915_drv.h ++++ b/drivers/gpu/drm/i915/i915_drv.h +@@ -1170,12 +1170,7 @@ i915_seqno_passed(uint32_t seq1, uint32_t seq2) + return (int32_t)(seq1 - seq2) >= 0; + } + +-static inline u32 +-i915_gem_next_request_seqno(struct intel_ring_buffer *ring) +-{ +- drm_i915_private_t *dev_priv = ring->dev->dev_private; +- return ring->outstanding_lazy_request = dev_priv->next_seqno; +-} ++u32 i915_gem_next_request_seqno(struct intel_ring_buffer *ring); + + int __must_check i915_gem_object_get_fence(struct drm_i915_gem_object *obj, + struct intel_ring_buffer *pipelined); +diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c +index 3e2edc6..548a400 100644 +--- a/drivers/gpu/drm/i915/i915_gem.c ++++ b/drivers/gpu/drm/i915/i915_gem.c +@@ -1647,6 +1647,28 @@ i915_gem_process_flushing_list(struct intel_ring_buffer *ring, + } + } + ++static u32 ++i915_gem_get_seqno(struct drm_device *dev) ++{ ++ drm_i915_private_t *dev_priv = dev->dev_private; ++ u32 seqno = dev_priv->next_seqno; ++ ++ /* reserve 0 for non-seqno */ ++ if (++dev_priv->next_seqno == 0) ++ dev_priv->next_seqno = 1; ++ ++ return seqno; ++} ++ ++u32 ++i915_gem_next_request_seqno(struct intel_ring_buffer *ring) ++{ ++ if (ring->outstanding_lazy_request == 0) ++ ring->outstanding_lazy_request = i915_gem_get_seqno(ring->dev); ++ ++ return ring->outstanding_lazy_request; ++} ++ + int + i915_add_request(struct intel_ring_buffer *ring, + struct drm_file *file, +@@ -1658,6 +1680,7 @@ i915_add_request(struct intel_ring_buffer *ring, + int ret; + + BUG_ON(request == NULL); ++ seqno = i915_gem_next_request_seqno(ring); + + ret = ring->add_request(ring, &seqno); + if (ret) +diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c +index f6613dc..19085c0 100644 +--- a/drivers/gpu/drm/i915/intel_ringbuffer.c ++++ b/drivers/gpu/drm/i915/intel_ringbuffer.c +@@ -52,20 +52,6 @@ static inline int ring_space(struct intel_ring_buffer *ring) + return space; + } + +-static u32 i915_gem_get_seqno(struct drm_device *dev) +-{ +- drm_i915_private_t *dev_priv = 
dev->dev_private; +- u32 seqno; +- +- seqno = dev_priv->next_seqno; +- +- /* reserve 0 for non-seqno */ +- if (++dev_priv->next_seqno == 0) +- dev_priv->next_seqno = 1; +- +- return seqno; +-} +- + static int + render_ring_flush(struct intel_ring_buffer *ring, + u32 invalidate_domains, +@@ -277,8 +263,6 @@ static int init_ring_common(struct intel_ring_buffer *ring) + I915_WRITE_HEAD(ring, 0); + ring->write_tail(ring, 0); + +- /* Initialize the ring. */ +- I915_WRITE_START(ring, obj->gtt_offset); + head = I915_READ_HEAD(ring) & HEAD_ADDR; + + /* G45 ring initialization fails to reset head to zero */ +@@ -304,14 +288,19 @@ static int init_ring_common(struct intel_ring_buffer *ring) + } + } + ++ /* Initialize the ring. This must happen _after_ we've cleared the ring ++ * registers with the above sequence (the readback of the HEAD registers ++ * also enforces ordering), otherwise the hw might lose the new ring ++ * register values. */ ++ I915_WRITE_START(ring, obj->gtt_offset); + I915_WRITE_CTL(ring, + ((ring->size - PAGE_SIZE) & RING_NR_PAGES) + | RING_VALID); + + /* If the head is still not zero, the ring is dead */ +- if ((I915_READ_CTL(ring) & RING_VALID) == 0 || +- I915_READ_START(ring) != obj->gtt_offset || +- (I915_READ_HEAD(ring) & HEAD_ADDR) != 0) { ++ if (wait_for((I915_READ_CTL(ring) & RING_VALID) != 0 && ++ I915_READ_START(ring) == obj->gtt_offset && ++ (I915_READ_HEAD(ring) & HEAD_ADDR) == 0, 50)) { + DRM_ERROR("%s initialization failed " + "ctl %08x head %08x tail %08x start %08x\n", + ring->name, +@@ -488,7 +477,7 @@ gen6_add_request(struct intel_ring_buffer *ring, + mbox1_reg = ring->signal_mbox[0]; + mbox2_reg = ring->signal_mbox[1]; + +- *seqno = i915_gem_get_seqno(ring->dev); ++ *seqno = i915_gem_next_request_seqno(ring); + + update_mboxes(ring, *seqno, mbox1_reg); + update_mboxes(ring, *seqno, mbox2_reg); +@@ -586,8 +575,7 @@ static int + pc_render_add_request(struct intel_ring_buffer *ring, + u32 *result) + { +- struct drm_device *dev = ring->dev; +- u32 seqno = i915_gem_get_seqno(dev); ++ u32 seqno = i915_gem_next_request_seqno(ring); + struct pipe_control *pc = ring->private; + u32 scratch_addr = pc->gtt_offset + 128; + int ret; +@@ -638,8 +626,7 @@ static int + render_ring_add_request(struct intel_ring_buffer *ring, + u32 *result) + { +- struct drm_device *dev = ring->dev; +- u32 seqno = i915_gem_get_seqno(dev); ++ u32 seqno = i915_gem_next_request_seqno(ring); + int ret; + + ret = intel_ring_begin(ring, 4); +@@ -813,7 +800,7 @@ ring_add_request(struct intel_ring_buffer *ring, + if (ret) + return ret; + +- seqno = i915_gem_get_seqno(ring->dev); ++ seqno = i915_gem_next_request_seqno(ring); + + intel_ring_emit(ring, MI_STORE_DWORD_INDEX); + intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT); +diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c +index 931f4df..fc0633c 100644 +--- a/drivers/gpu/drm/radeon/evergreen.c ++++ b/drivers/gpu/drm/radeon/evergreen.c +@@ -1065,24 +1065,8 @@ void evergreen_agp_enable(struct radeon_device *rdev) + + void evergreen_mc_stop(struct radeon_device *rdev, struct evergreen_mc_save *save) + { +- save->vga_control[0] = RREG32(D1VGA_CONTROL); +- save->vga_control[1] = RREG32(D2VGA_CONTROL); + save->vga_render_control = RREG32(VGA_RENDER_CONTROL); + save->vga_hdp_control = RREG32(VGA_HDP_CONTROL); +- save->crtc_control[0] = RREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC0_REGISTER_OFFSET); +- save->crtc_control[1] = RREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC1_REGISTER_OFFSET); +- if 
(rdev->num_crtc >= 4) { +- save->vga_control[2] = RREG32(EVERGREEN_D3VGA_CONTROL); +- save->vga_control[3] = RREG32(EVERGREEN_D4VGA_CONTROL); +- save->crtc_control[2] = RREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC2_REGISTER_OFFSET); +- save->crtc_control[3] = RREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC3_REGISTER_OFFSET); +- } +- if (rdev->num_crtc >= 6) { +- save->vga_control[4] = RREG32(EVERGREEN_D5VGA_CONTROL); +- save->vga_control[5] = RREG32(EVERGREEN_D6VGA_CONTROL); +- save->crtc_control[4] = RREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC4_REGISTER_OFFSET); +- save->crtc_control[5] = RREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC5_REGISTER_OFFSET); +- } + + /* Stop all video */ + WREG32(VGA_RENDER_CONTROL, 0); +@@ -1193,47 +1177,6 @@ void evergreen_mc_resume(struct radeon_device *rdev, struct evergreen_mc_save *s + /* Unlock host access */ + WREG32(VGA_HDP_CONTROL, save->vga_hdp_control); + mdelay(1); +- /* Restore video state */ +- WREG32(D1VGA_CONTROL, save->vga_control[0]); +- WREG32(D2VGA_CONTROL, save->vga_control[1]); +- if (rdev->num_crtc >= 4) { +- WREG32(EVERGREEN_D3VGA_CONTROL, save->vga_control[2]); +- WREG32(EVERGREEN_D4VGA_CONTROL, save->vga_control[3]); +- } +- if (rdev->num_crtc >= 6) { +- WREG32(EVERGREEN_D5VGA_CONTROL, save->vga_control[4]); +- WREG32(EVERGREEN_D6VGA_CONTROL, save->vga_control[5]); +- } +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC0_REGISTER_OFFSET, 1); +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC1_REGISTER_OFFSET, 1); +- if (rdev->num_crtc >= 4) { +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC2_REGISTER_OFFSET, 1); +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC3_REGISTER_OFFSET, 1); +- } +- if (rdev->num_crtc >= 6) { +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC4_REGISTER_OFFSET, 1); +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC5_REGISTER_OFFSET, 1); +- } +- WREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC0_REGISTER_OFFSET, save->crtc_control[0]); +- WREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC1_REGISTER_OFFSET, save->crtc_control[1]); +- if (rdev->num_crtc >= 4) { +- WREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC2_REGISTER_OFFSET, save->crtc_control[2]); +- WREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC3_REGISTER_OFFSET, save->crtc_control[3]); +- } +- if (rdev->num_crtc >= 6) { +- WREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC4_REGISTER_OFFSET, save->crtc_control[4]); +- WREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC5_REGISTER_OFFSET, save->crtc_control[5]); +- } +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC0_REGISTER_OFFSET, 0); +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC1_REGISTER_OFFSET, 0); +- if (rdev->num_crtc >= 4) { +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC2_REGISTER_OFFSET, 0); +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC3_REGISTER_OFFSET, 0); +- } +- if (rdev->num_crtc >= 6) { +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC4_REGISTER_OFFSET, 0); +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC5_REGISTER_OFFSET, 0); +- } + WREG32(VGA_RENDER_CONTROL, save->vga_render_control); + } + +@@ -2080,10 +2023,18 @@ static void evergreen_gpu_init(struct radeon_device *rdev) + if (rdev->flags & RADEON_IS_IGP) + rdev->config.evergreen.tile_config |= 1 << 4; + else { +- if ((mc_arb_ramcfg & NOOFBANK_MASK) >> NOOFBANK_SHIFT) +- rdev->config.evergreen.tile_config |= 1 << 4; +- else ++ switch ((mc_arb_ramcfg & NOOFBANK_MASK) >> NOOFBANK_SHIFT) { ++ case 0: /* four banks */ + rdev->config.evergreen.tile_config |= 0 << 4; ++ break; ++ case 1: /* eight banks 
*/ ++ rdev->config.evergreen.tile_config |= 1 << 4; ++ break; ++ case 2: /* sixteen banks */ ++ default: ++ rdev->config.evergreen.tile_config |= 2 << 4; ++ break; ++ } + } + rdev->config.evergreen.tile_config |= + ((mc_arb_ramcfg & BURSTLENGTH_MASK) >> BURSTLENGTH_SHIFT) << 8; +diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c +index 9e50814..636255b 100644 +--- a/drivers/gpu/drm/radeon/ni.c ++++ b/drivers/gpu/drm/radeon/ni.c +@@ -804,10 +804,18 @@ static void cayman_gpu_init(struct radeon_device *rdev) + rdev->config.cayman.tile_config |= (3 << 0); + break; + } +- if ((mc_arb_ramcfg & NOOFBANK_MASK) >> NOOFBANK_SHIFT) +- rdev->config.cayman.tile_config |= 1 << 4; +- else ++ switch ((mc_arb_ramcfg & NOOFBANK_MASK) >> NOOFBANK_SHIFT) { ++ case 0: /* four banks */ + rdev->config.cayman.tile_config |= 0 << 4; ++ break; ++ case 1: /* eight banks */ ++ rdev->config.cayman.tile_config |= 1 << 4; ++ break; ++ case 2: /* sixteen banks */ ++ default: ++ rdev->config.cayman.tile_config |= 2 << 4; ++ break; ++ } + rdev->config.cayman.tile_config |= + ((gb_addr_config & PIPE_INTERLEAVE_SIZE_MASK) >> PIPE_INTERLEAVE_SIZE_SHIFT) << 8; + rdev->config.cayman.tile_config |= +diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h +index 5991484..5ce9402 100644 +--- a/drivers/gpu/drm/radeon/radeon_asic.h ++++ b/drivers/gpu/drm/radeon/radeon_asic.h +@@ -253,13 +253,10 @@ void rs690_line_buffer_adjust(struct radeon_device *rdev, + * rv515 + */ + struct rv515_mc_save { +- u32 d1vga_control; +- u32 d2vga_control; + u32 vga_render_control; + u32 vga_hdp_control; +- u32 d1crtc_control; +- u32 d2crtc_control; + }; ++ + int rv515_init(struct radeon_device *rdev); + void rv515_fini(struct radeon_device *rdev); + uint32_t rv515_mc_rreg(struct radeon_device *rdev, uint32_t reg); +@@ -387,11 +384,10 @@ void r700_cp_fini(struct radeon_device *rdev); + * evergreen + */ + struct evergreen_mc_save { +- u32 vga_control[6]; + u32 vga_render_control; + u32 vga_hdp_control; +- u32 crtc_control[6]; + }; ++ + void evergreen_pcie_gart_tlb_flush(struct radeon_device *rdev); + int evergreen_init(struct radeon_device *rdev); + void evergreen_fini(struct radeon_device *rdev); +diff --git a/drivers/gpu/drm/radeon/rv515.c b/drivers/gpu/drm/radeon/rv515.c +index 6613ee9..d5f45b4 100644 +--- a/drivers/gpu/drm/radeon/rv515.c ++++ b/drivers/gpu/drm/radeon/rv515.c +@@ -281,12 +281,8 @@ int rv515_debugfs_ga_info_init(struct radeon_device *rdev) + + void rv515_mc_stop(struct radeon_device *rdev, struct rv515_mc_save *save) + { +- save->d1vga_control = RREG32(R_000330_D1VGA_CONTROL); +- save->d2vga_control = RREG32(R_000338_D2VGA_CONTROL); + save->vga_render_control = RREG32(R_000300_VGA_RENDER_CONTROL); + save->vga_hdp_control = RREG32(R_000328_VGA_HDP_CONTROL); +- save->d1crtc_control = RREG32(R_006080_D1CRTC_CONTROL); +- save->d2crtc_control = RREG32(R_006880_D2CRTC_CONTROL); + + /* Stop all video */ + WREG32(R_0068E8_D2CRTC_UPDATE_LOCK, 0); +@@ -311,15 +307,6 @@ void rv515_mc_resume(struct radeon_device *rdev, struct rv515_mc_save *save) + /* Unlock host access */ + WREG32(R_000328_VGA_HDP_CONTROL, save->vga_hdp_control); + mdelay(1); +- /* Restore video state */ +- WREG32(R_000330_D1VGA_CONTROL, save->d1vga_control); +- WREG32(R_000338_D2VGA_CONTROL, save->d2vga_control); +- WREG32(R_0060E8_D1CRTC_UPDATE_LOCK, 1); +- WREG32(R_0068E8_D2CRTC_UPDATE_LOCK, 1); +- WREG32(R_006080_D1CRTC_CONTROL, save->d1crtc_control); +- WREG32(R_006880_D2CRTC_CONTROL, save->d2crtc_control); +- 
WREG32(R_0060E8_D1CRTC_UPDATE_LOCK, 0); +- WREG32(R_0068E8_D2CRTC_UPDATE_LOCK, 0); + WREG32(R_000300_VGA_RENDER_CONTROL, save->vga_render_control); + } + +diff --git a/drivers/input/touchscreen/eeti_ts.c b/drivers/input/touchscreen/eeti_ts.c +index 7f8f538..4f938bb 100644 +--- a/drivers/input/touchscreen/eeti_ts.c ++++ b/drivers/input/touchscreen/eeti_ts.c +@@ -48,7 +48,7 @@ struct eeti_ts_priv { + struct input_dev *input; + struct work_struct work; + struct mutex mutex; +- int irq, irq_active_high; ++ int irq_gpio, irq, irq_active_high; + }; + + #define EETI_TS_BITDEPTH (11) +@@ -62,7 +62,7 @@ struct eeti_ts_priv { + + static inline int eeti_ts_irq_active(struct eeti_ts_priv *priv) + { +- return gpio_get_value(irq_to_gpio(priv->irq)) == priv->irq_active_high; ++ return gpio_get_value(priv->irq_gpio) == priv->irq_active_high; + } + + static void eeti_ts_read(struct work_struct *work) +@@ -157,7 +157,7 @@ static void eeti_ts_close(struct input_dev *dev) + static int __devinit eeti_ts_probe(struct i2c_client *client, + const struct i2c_device_id *idp) + { +- struct eeti_ts_platform_data *pdata; ++ struct eeti_ts_platform_data *pdata = client->dev.platform_data; + struct eeti_ts_priv *priv; + struct input_dev *input; + unsigned int irq_flags; +@@ -199,9 +199,12 @@ static int __devinit eeti_ts_probe(struct i2c_client *client, + + priv->client = client; + priv->input = input; +- priv->irq = client->irq; ++ priv->irq_gpio = pdata->irq_gpio; ++ priv->irq = gpio_to_irq(pdata->irq_gpio); + +- pdata = client->dev.platform_data; ++ err = gpio_request_one(pdata->irq_gpio, GPIOF_IN, client->name); ++ if (err < 0) ++ goto err1; + + if (pdata) + priv->irq_active_high = pdata->irq_active_high; +@@ -215,13 +218,13 @@ static int __devinit eeti_ts_probe(struct i2c_client *client, + + err = input_register_device(input); + if (err) +- goto err1; ++ goto err2; + + err = request_irq(priv->irq, eeti_ts_isr, irq_flags, + client->name, priv); + if (err) { + dev_err(&client->dev, "Unable to request touchscreen IRQ.\n"); +- goto err2; ++ goto err3; + } + + /* +@@ -233,9 +236,11 @@ static int __devinit eeti_ts_probe(struct i2c_client *client, + device_init_wakeup(&client->dev, 0); + return 0; + +-err2: ++err3: + input_unregister_device(input); + input = NULL; /* so we dont try to free it below */ ++err2: ++ gpio_free(pdata->irq_gpio); + err1: + input_free_device(input); + kfree(priv); +diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c +index 43a76c4..db662e2 100644 +--- a/drivers/mfd/ezx-pcap.c ++++ b/drivers/mfd/ezx-pcap.c +@@ -202,7 +202,7 @@ static void pcap_isr_work(struct work_struct *work) + } + local_irq_enable(); + ezx_pcap_write(pcap, PCAP_REG_MSR, pcap->msr); +- } while (gpio_get_value(irq_to_gpio(pcap->spi->irq))); ++ } while (gpio_get_value(pdata->gpio)); + } + + static void pcap_irq_handler(unsigned int irq, struct irq_desc *desc) +diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c +index 23406e6..ae286a9 100644 +--- a/drivers/net/caif/caif_serial.c ++++ b/drivers/net/caif/caif_serial.c +@@ -325,6 +325,9 @@ static int ldisc_open(struct tty_struct *tty) + + sprintf(name, "cf%s", tty->name); + dev = alloc_netdev(sizeof(*ser), name, caifdev_setup); ++ if (!dev) ++ return -ENOMEM; ++ + ser = netdev_priv(dev); + ser->tty = tty_kref_get(tty); + ser->dev = dev; +diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c +index 965c723..721adfd 100644 +--- a/drivers/net/ethernet/broadcom/bnx2.c ++++ b/drivers/net/ethernet/broadcom/bnx2.c +@@ 
-5378,7 +5378,7 @@ bnx2_free_tx_skbs(struct bnx2 *bp) + int k, last; + + if (skb == NULL) { +- j++; ++ j = NEXT_TX_BD(j); + continue; + } + +@@ -5390,8 +5390,8 @@ bnx2_free_tx_skbs(struct bnx2 *bp) + tx_buf->skb = NULL; + + last = tx_buf->nr_frags; +- j++; +- for (k = 0; k < last; k++, j++) { ++ j = NEXT_TX_BD(j); ++ for (k = 0; k < last; k++, j = NEXT_TX_BD(j)) { + tx_buf = &txr->tx_buf_ring[TX_RING_IDX(j)]; + dma_unmap_page(&bp->pdev->dev, + dma_unmap_addr(tx_buf, mapping), +diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c +index de00805..0549261 100644 +--- a/drivers/net/ethernet/intel/e1000/e1000_main.c ++++ b/drivers/net/ethernet/intel/e1000/e1000_main.c +@@ -4743,12 +4743,14 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake) + e1000_setup_rctl(adapter); + e1000_set_rx_mode(netdev); + ++ rctl = er32(RCTL); ++ + /* turn on all-multi mode if wake on multicast is enabled */ +- if (wufc & E1000_WUFC_MC) { +- rctl = er32(RCTL); ++ if (wufc & E1000_WUFC_MC) + rctl |= E1000_RCTL_MPE; +- ew32(RCTL, rctl); +- } ++ ++ /* enable receives in the hardware */ ++ ew32(RCTL, rctl | E1000_RCTL_EN); + + if (hw->mac_type >= e1000_82540) { + ctrl = er32(CTRL); +diff --git a/drivers/net/ethernet/intel/e1000e/82571.c b/drivers/net/ethernet/intel/e1000e/82571.c +index 3072d35..4f4d52a 100644 +--- a/drivers/net/ethernet/intel/e1000e/82571.c ++++ b/drivers/net/ethernet/intel/e1000e/82571.c +@@ -1600,10 +1600,8 @@ static s32 e1000_check_for_serdes_link_82571(struct e1000_hw *hw) + * auto-negotiation in the TXCW register and disable + * forced link in the Device Control register in an + * attempt to auto-negotiate with our link partner. +- * If the partner code word is null, stop forcing +- * and restart auto negotiation. + */ +- if ((rxcw & E1000_RXCW_C) || !(rxcw & E1000_RXCW_CW)) { ++ if (rxcw & E1000_RXCW_C) { + /* Enable autoneg, and unforce link up */ + ew32(TXCW, mac->txcw); + ew32(CTRL, (ctrl & ~E1000_CTRL_SLU)); +diff --git a/drivers/net/tun.c b/drivers/net/tun.c +index 7bea9c6..a12c9bf 100644 +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -1243,10 +1243,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, + int vnet_hdr_sz; + int ret; + +- if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) ++ if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) { + if (copy_from_user(&ifr, argp, ifreq_len)) + return -EFAULT; +- ++ } else { ++ memset(&ifr, 0, sizeof(ifr)); ++ } + if (cmd == TUNGETFEATURES) { + /* Currently this just means: "what IFF flags are valid?". 
+ * This is needed because we never checked for invalid flags on +diff --git a/drivers/net/usb/kaweth.c b/drivers/net/usb/kaweth.c +index 582ca2d..c4c6a73 100644 +--- a/drivers/net/usb/kaweth.c ++++ b/drivers/net/usb/kaweth.c +@@ -1308,7 +1308,7 @@ static int kaweth_internal_control_msg(struct usb_device *usb_dev, + int retv; + int length = 0; /* shut up GCC */ + +- urb = usb_alloc_urb(0, GFP_NOIO); ++ urb = usb_alloc_urb(0, GFP_ATOMIC); + if (!urb) + return -ENOMEM; + +diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c +index 7f97164..2b8e957 100644 +--- a/drivers/net/wireless/ath/ath9k/hw.c ++++ b/drivers/net/wireless/ath/ath9k/hw.c +@@ -674,6 +674,7 @@ int ath9k_hw_init(struct ath_hw *ah) + case AR9300_DEVID_AR9340: + case AR9300_DEVID_AR9580: + case AR9300_DEVID_AR9462: ++ case AR9485_DEVID_AR1111: + break; + default: + if (common->bus_ops->ath_bus_type == ATH_USB) +diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h +index 1bd8edf..a5c4ba8 100644 +--- a/drivers/net/wireless/ath/ath9k/hw.h ++++ b/drivers/net/wireless/ath/ath9k/hw.h +@@ -48,6 +48,7 @@ + #define AR9300_DEVID_AR9580 0x0033 + #define AR9300_DEVID_AR9462 0x0034 + #define AR9300_DEVID_AR9330 0x0035 ++#define AR9485_DEVID_AR1111 0x0037 + + #define AR5416_AR9100_DEVID 0x000b + +diff --git a/drivers/net/wireless/ath/ath9k/pci.c b/drivers/net/wireless/ath/ath9k/pci.c +index 2dcdf63..1883d39 100644 +--- a/drivers/net/wireless/ath/ath9k/pci.c ++++ b/drivers/net/wireless/ath/ath9k/pci.c +@@ -35,6 +35,7 @@ static DEFINE_PCI_DEVICE_TABLE(ath_pci_id_table) = { + { PCI_VDEVICE(ATHEROS, 0x0032) }, /* PCI-E AR9485 */ + { PCI_VDEVICE(ATHEROS, 0x0033) }, /* PCI-E AR9580 */ + { PCI_VDEVICE(ATHEROS, 0x0034) }, /* PCI-E AR9462 */ ++ { PCI_VDEVICE(ATHEROS, 0x0037) }, /* PCI-E AR1111/AR9485 */ + { 0 } + }; + +diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c +index 9ba2c1b..3395025 100644 +--- a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c ++++ b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c +@@ -708,11 +708,14 @@ static int rs_toggle_antenna(u32 valid_ant, u32 *rate_n_flags, + */ + static bool rs_use_green(struct ieee80211_sta *sta) + { +- struct iwl_station_priv *sta_priv = (void *)sta->drv_priv; +- struct iwl_rxon_context *ctx = sta_priv->ctx; +- +- return (sta->ht_cap.cap & IEEE80211_HT_CAP_GRN_FLD) && +- !(ctx->ht.non_gf_sta_present); ++ /* ++ * There's a bug somewhere in this code that causes the ++ * scaling to get stuck because GF+SGI can't be combined ++ * in SISO rates. Until we find that bug, disable GF, it ++ * has only limited benefit and we still interoperate with ++ * GF APs since we can always receive GF transmissions. 
++ */ ++ return false; + } + + /** +diff --git a/drivers/net/wireless/rt2x00/rt61pci.c b/drivers/net/wireless/rt2x00/rt61pci.c +index bf55b4a..d69f88c 100644 +--- a/drivers/net/wireless/rt2x00/rt61pci.c ++++ b/drivers/net/wireless/rt2x00/rt61pci.c +@@ -2243,8 +2243,7 @@ static void rt61pci_txdone(struct rt2x00_dev *rt2x00dev) + + static void rt61pci_wakeup(struct rt2x00_dev *rt2x00dev) + { +- struct ieee80211_conf conf = { .flags = 0 }; +- struct rt2x00lib_conf libconf = { .conf = &conf }; ++ struct rt2x00lib_conf libconf = { .conf = &rt2x00dev->hw->conf }; + + rt61pci_config(rt2x00dev, &libconf, IEEE80211_CONF_CHANGE_PS); + } +diff --git a/drivers/net/wireless/rtlwifi/usb.c b/drivers/net/wireless/rtlwifi/usb.c +index db34db6..a49e848 100644 +--- a/drivers/net/wireless/rtlwifi/usb.c ++++ b/drivers/net/wireless/rtlwifi/usb.c +@@ -120,15 +120,19 @@ static u32 _usb_read_sync(struct rtl_priv *rtlpriv, u32 addr, u16 len) + u8 request; + u16 wvalue; + u16 index; +- __le32 *data = &rtlpriv->usb_data[rtlpriv->usb_data_index]; ++ __le32 *data; ++ unsigned long flags; + ++ spin_lock_irqsave(&rtlpriv->locks.usb_lock, flags); ++ if (++rtlpriv->usb_data_index >= RTL_USB_MAX_RX_COUNT) ++ rtlpriv->usb_data_index = 0; ++ data = &rtlpriv->usb_data[rtlpriv->usb_data_index]; ++ spin_unlock_irqrestore(&rtlpriv->locks.usb_lock, flags); + request = REALTEK_USB_VENQT_CMD_REQ; + index = REALTEK_USB_VENQT_CMD_IDX; /* n/a */ + + wvalue = (u16)addr; + _usbctrl_vendorreq_sync_read(udev, request, wvalue, index, data, len); +- if (++rtlpriv->usb_data_index >= RTL_USB_MAX_RX_COUNT) +- rtlpriv->usb_data_index = 0; + return le32_to_cpu(*data); + } + +@@ -909,6 +913,10 @@ int __devinit rtl_usb_probe(struct usb_interface *intf, + GFP_KERNEL); + if (!rtlpriv->usb_data) + return -ENOMEM; ++ ++ /* this spin lock must be initialized early */ ++ spin_lock_init(&rtlpriv->locks.usb_lock); ++ + rtlpriv->usb_data_index = 0; + SET_IEEE80211_DEV(hw, &intf->dev); + udev = interface_to_usbdev(intf); +diff --git a/drivers/net/wireless/rtlwifi/wifi.h b/drivers/net/wireless/rtlwifi/wifi.h +index b1e9deb..deb87e9 100644 +--- a/drivers/net/wireless/rtlwifi/wifi.h ++++ b/drivers/net/wireless/rtlwifi/wifi.h +@@ -1550,6 +1550,7 @@ struct rtl_locks { + spinlock_t rf_lock; + spinlock_t lps_lock; + spinlock_t waitq_lock; ++ spinlock_t usb_lock; + + /*Dual mac*/ + spinlock_t cck_and_rw_pagea_lock; +diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c +index 7daf4b8..90effcc 100644 +--- a/fs/hfsplus/wrapper.c ++++ b/fs/hfsplus/wrapper.c +@@ -56,7 +56,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + DECLARE_COMPLETION_ONSTACK(wait); + struct bio *bio; + int ret = 0; +- unsigned int io_size; ++ u64 io_size; + loff_t start; + int offset; + +diff --git a/include/linux/input/eeti_ts.h b/include/linux/input/eeti_ts.h +index f875b31..16625d7 100644 +--- a/include/linux/input/eeti_ts.h ++++ b/include/linux/input/eeti_ts.h +@@ -2,6 +2,7 @@ + #define LINUX_INPUT_EETI_TS_H + + struct eeti_ts_platform_data { ++ int irq_gpio; + unsigned int irq_active_high; + }; + +diff --git a/include/linux/mfd/ezx-pcap.h b/include/linux/mfd/ezx-pcap.h +index 40c37216..32a1b5c 100644 +--- a/include/linux/mfd/ezx-pcap.h ++++ b/include/linux/mfd/ezx-pcap.h +@@ -16,6 +16,7 @@ struct pcap_subdev { + struct pcap_platform_data { + unsigned int irq_base; + unsigned int config; ++ int gpio; + void (*init) (void *); /* board specific init */ + int num_subdevs; + struct pcap_subdev *subdevs; +diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c +index 
68223e4..4e9115d 100644 +--- a/net/caif/caif_dev.c ++++ b/net/caif/caif_dev.c +@@ -428,9 +428,9 @@ static int __init caif_device_init(void) + + static void __exit caif_device_exit(void) + { +- unregister_pernet_subsys(&caif_net_ops); + unregister_netdevice_notifier(&caif_device_notifier); + dev_remove_pack(&caif_packet_type); ++ unregister_pernet_subsys(&caif_net_ops); + } + + module_init(caif_device_init); +diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c +index 05842ab..0cf604b 100644 +--- a/net/core/rtnetlink.c ++++ b/net/core/rtnetlink.c +@@ -670,6 +670,12 @@ static void set_operstate(struct net_device *dev, unsigned char transition) + } + } + ++static unsigned int rtnl_dev_get_flags(const struct net_device *dev) ++{ ++ return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) | ++ (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); ++} ++ + static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, + const struct ifinfomsg *ifm) + { +@@ -678,7 +684,7 @@ static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, + /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ + if (ifm->ifi_change) + flags = (flags & ifm->ifi_change) | +- (dev->flags & ~ifm->ifi_change); ++ (rtnl_dev_get_flags(dev) & ~ifm->ifi_change); + + return flags; + } +diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c +index 86f3b88..afaa735 100644 +--- a/net/ipv4/cipso_ipv4.c ++++ b/net/ipv4/cipso_ipv4.c +@@ -1725,8 +1725,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) + case CIPSO_V4_TAG_LOCAL: + /* This is a non-standard tag that we only allow for + * local connections, so if the incoming interface is +- * not the loopback device drop the packet. */ +- if (!(skb->dev->flags & IFF_LOOPBACK)) { ++ * not the loopback device drop the packet. Further, ++ * there is no legitimate reason for setting this from ++ * userspace so reject it if skb is NULL. */ ++ if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) { + err_offset = opt_iter; + goto validate_return_locked; + } +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 11ba922..ad466a7 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -2391,7 +2391,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, + /* Cap the max timeout in ms TCP will retry/retrans + * before giving up and aborting (ETIMEDOUT) a connection. 
+ */ +- icsk->icsk_user_timeout = msecs_to_jiffies(val); ++ if (val < 0) ++ err = -EINVAL; ++ else ++ icsk->icsk_user_timeout = msecs_to_jiffies(val); + break; + default: + err = -ENOPROTOOPT; +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 32e6ca2..a08a621 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5415,7 +5415,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + if (tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len) { + #ifdef CONFIG_NET_DMA +- if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { ++ if (tp->ucopy.task == current && ++ sock_owned_by_user(sk) && ++ tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { + copied_early = 1; + eaten = 1; + } +diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c +index a7078fd..f85de8e 100644 +--- a/net/mac80211/mesh.c ++++ b/net/mac80211/mesh.c +@@ -543,6 +543,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) + + del_timer_sync(&sdata->u.mesh.housekeeping_timer); + del_timer_sync(&sdata->u.mesh.mesh_path_root_timer); ++ del_timer_sync(&sdata->u.mesh.mesh_path_timer); + /* + * If the timer fired while we waited for it, it will have + * requeued the work. Now the work will be running again +diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c +index 17859ea..351a69b 100644 +--- a/net/sched/sch_sfb.c ++++ b/net/sched/sch_sfb.c +@@ -559,6 +559,8 @@ static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) + + sch->qstats.backlog = q->qdisc->qstats.backlog; + opts = nla_nest_start(skb, TCA_OPTIONS); ++ if (opts == NULL) ++ goto nla_put_failure; + NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt); + return nla_nest_end(skb, opts); + +diff --git a/net/sctp/input.c b/net/sctp/input.c +index b7692aa..0fc18c7 100644 +--- a/net/sctp/input.c ++++ b/net/sctp/input.c +@@ -736,15 +736,12 @@ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) + + epb = &ep->base; + +- if (hlist_unhashed(&epb->node)) +- return; +- + epb->hashent = sctp_ep_hashfn(epb->bind_addr.port); + + head = &sctp_ep_hashtable[epb->hashent]; + + sctp_write_lock(&head->lock); +- __hlist_del(&epb->node); ++ hlist_del_init(&epb->node); + sctp_write_unlock(&head->lock); + } + +@@ -825,7 +822,7 @@ static void __sctp_unhash_established(struct sctp_association *asoc) + head = &sctp_assoc_hashtable[epb->hashent]; + + sctp_write_lock(&head->lock); +- __hlist_del(&epb->node); ++ hlist_del_init(&epb->node); + sctp_write_unlock(&head->lock); + } + +diff --git a/net/sctp/socket.c b/net/sctp/socket.c +index 0075554..8e49d76 100644 +--- a/net/sctp/socket.c ++++ b/net/sctp/socket.c +@@ -1231,8 +1231,14 @@ out_free: + SCTP_DEBUG_PRINTK("About to exit __sctp_connect() free asoc: %p" + " kaddrs: %p err: %d\n", + asoc, kaddrs, err); +- if (asoc) ++ if (asoc) { ++ /* sctp_primitive_ASSOCIATE may have added this association ++ * To the hash table, try to unhash it, just in case, its a noop ++ * if it wasn't hashed so we're safe ++ */ ++ sctp_unhash_established(asoc); + sctp_association_free(asoc); ++ } + return err; + } + +@@ -1942,8 +1948,10 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, + goto out_unlock; + + out_free: +- if (new_asoc) ++ if (new_asoc) { ++ sctp_unhash_established(asoc); + sctp_association_free(asoc); ++ } + out_unlock: + sctp_release_sock(sk); + +diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c +index 788a12c..2ab7850 100644 +--- a/net/wanrouter/wanmain.c ++++ b/net/wanrouter/wanmain.c +@@ -602,36 +602,31 @@ static int wanrouter_device_new_if(struct 
wan_device *wandev, + * successfully, add it to the interface list. + */ + +- if (dev->name == NULL) { +- err = -EINVAL; +- } else { ++#ifdef WANDEBUG ++ printk(KERN_INFO "%s: registering interface %s...\n", ++ wanrouter_modname, dev->name); ++#endif + +- #ifdef WANDEBUG +- printk(KERN_INFO "%s: registering interface %s...\n", +- wanrouter_modname, dev->name); +- #endif +- +- err = register_netdev(dev); +- if (!err) { +- struct net_device *slave = NULL; +- unsigned long smp_flags=0; +- +- lock_adapter_irq(&wandev->lock, &smp_flags); +- +- if (wandev->dev == NULL) { +- wandev->dev = dev; +- } else { +- for (slave=wandev->dev; +- DEV_TO_SLAVE(slave); +- slave = DEV_TO_SLAVE(slave)) +- DEV_TO_SLAVE(slave) = dev; +- } +- ++wandev->ndev; +- +- unlock_adapter_irq(&wandev->lock, &smp_flags); +- err = 0; /* done !!! */ +- goto out; ++ err = register_netdev(dev); ++ if (!err) { ++ struct net_device *slave = NULL; ++ unsigned long smp_flags=0; ++ ++ lock_adapter_irq(&wandev->lock, &smp_flags); ++ ++ if (wandev->dev == NULL) { ++ wandev->dev = dev; ++ } else { ++ for (slave=wandev->dev; ++ DEV_TO_SLAVE(slave); ++ slave = DEV_TO_SLAVE(slave)) ++ DEV_TO_SLAVE(slave) = dev; + } ++ ++wandev->ndev; ++ ++ unlock_adapter_irq(&wandev->lock, &smp_flags); ++ err = 0; /* done !!! */ ++ goto out; + } + if (wandev->del_if) + wandev->del_if(wandev, dev); +diff --git a/net/wireless/core.c b/net/wireless/core.c +index 220f3bd..8f5042d 100644 +--- a/net/wireless/core.c ++++ b/net/wireless/core.c +@@ -971,6 +971,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb, + */ + synchronize_rcu(); + INIT_LIST_HEAD(&wdev->list); ++ /* ++ * Ensure that all events have been processed and ++ * freed. ++ */ ++ cfg80211_process_wdev_events(wdev); + break; + case NETDEV_PRE_UP: + if (!(wdev->wiphy->interface_modes & BIT(wdev->iftype))) +diff --git a/net/wireless/core.h b/net/wireless/core.h +index b9ec306..02c3be3 100644 +--- a/net/wireless/core.h ++++ b/net/wireless/core.h +@@ -426,6 +426,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, + struct net_device *dev, enum nl80211_iftype ntype, + u32 *flags, struct vif_params *params); + void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); ++void cfg80211_process_wdev_events(struct wireless_dev *wdev); + + int cfg80211_can_change_interface(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, +diff --git a/net/wireless/util.c b/net/wireless/util.c +index b5e4c1c..22fb802 100644 +--- a/net/wireless/util.c ++++ b/net/wireless/util.c +@@ -725,7 +725,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) + wdev->connect_keys = NULL; + } + +-static void cfg80211_process_wdev_events(struct wireless_dev *wdev) ++void cfg80211_process_wdev_events(struct wireless_dev *wdev) + { + struct cfg80211_event *ev; + unsigned long flags; +diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c +index 51a1afc..402f330 100644 +--- a/sound/pci/hda/patch_conexant.c ++++ b/sound/pci/hda/patch_conexant.c +@@ -3059,7 +3059,6 @@ static const struct snd_pci_quirk cxt5066_cfg_tbl[] = { + SND_PCI_QUIRK(0x1028, 0x02d8, "Dell Vostro", CXT5066_DELL_VOSTRO), + SND_PCI_QUIRK(0x1028, 0x02f5, "Dell Vostro 320", CXT5066_IDEAPAD), + SND_PCI_QUIRK(0x1028, 0x0401, "Dell Vostro 1014", CXT5066_DELL_VOSTRO), +- SND_PCI_QUIRK(0x1028, 0x0402, "Dell Vostro", CXT5066_DELL_VOSTRO), + SND_PCI_QUIRK(0x1028, 0x0408, "Dell Inspiron One 19T", CXT5066_IDEAPAD), + SND_PCI_QUIRK(0x1028, 0x050f, "Dell Inspiron", CXT5066_IDEAPAD), 
+ SND_PCI_QUIRK(0x1028, 0x0510, "Dell Vostro", CXT5066_IDEAPAD), +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index 2e2eb93..32c8169 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -4981,6 +4981,8 @@ static const struct alc_fixup alc269_fixups[] = { + [ALC269_FIXUP_PCM_44K] = { + .type = ALC_FIXUP_FUNC, + .v.func = alc269_fixup_pcm_44k, ++ .chained = true, ++ .chain_id = ALC269_FIXUP_QUANTA_MUTE + }, + [ALC269_FIXUP_STEREO_DMIC] = { + .type = ALC_FIXUP_FUNC, +@@ -5077,9 +5079,10 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x17aa, 0x21ca, "Thinkpad L412", ALC269_FIXUP_SKU_IGNORE), + SND_PCI_QUIRK(0x17aa, 0x21e9, "Thinkpad Edge 15", ALC269_FIXUP_SKU_IGNORE), + SND_PCI_QUIRK(0x17aa, 0x21f6, "Thinkpad T530", ALC269_FIXUP_LENOVO_DOCK), ++ SND_PCI_QUIRK(0x17aa, 0x21fa, "Thinkpad X230", ALC269_FIXUP_LENOVO_DOCK), ++ SND_PCI_QUIRK(0x17aa, 0x21fb, "Thinkpad T430s", ALC269_FIXUP_LENOVO_DOCK), + SND_PCI_QUIRK(0x17aa, 0x2203, "Thinkpad X230 Tablet", ALC269_FIXUP_LENOVO_DOCK), +- SND_PCI_QUIRK(0x17aa, 0x3bf8, "Quanta FL1", ALC269_FIXUP_QUANTA_MUTE), +- SND_PCI_QUIRK(0x17aa, 0x3bf8, "Lenovo Ideapd", ALC269_FIXUP_PCM_44K), ++ SND_PCI_QUIRK(0x17aa, 0x3bf8, "Quanta FL1", ALC269_FIXUP_PCM_44K), + SND_PCI_QUIRK(0x17aa, 0x9e54, "LENOVO NB", ALC269_FIXUP_LENOVO_EAPD), + + #if 1 diff --git a/3.2.34/bump/1028_linux-3.2.29.patch b/3.2.34/bump/1028_linux-3.2.29.patch new file mode 100644 index 0000000..3c65179 --- /dev/null +++ b/3.2.34/bump/1028_linux-3.2.29.patch @@ -0,0 +1,4279 @@ +diff --git a/MAINTAINERS b/MAINTAINERS +index f986e7d..82d7fa6 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -5452,7 +5452,7 @@ F: Documentation/blockdev/ramdisk.txt + F: drivers/block/brd.c + + RANDOM NUMBER DRIVER +-M: Matt Mackall ++M: Theodore Ts'o" + S: Maintained + F: drivers/char/random.c + +diff --git a/Makefile b/Makefile +index 5368961..d96fc2a 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 2 +-SUBLEVEL = 28 ++SUBLEVEL = 29 + EXTRAVERSION = + NAME = Saber-toothed Squirrel + +diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h +index 640f909..6f1aca7 100644 +--- a/arch/alpha/include/asm/atomic.h ++++ b/arch/alpha/include/asm/atomic.h +@@ -14,8 +14,8 @@ + */ + + +-#define ATOMIC_INIT(i) ( (atomic_t) { (i) } ) +-#define ATOMIC64_INIT(i) ( (atomic64_t) { (i) } ) ++#define ATOMIC_INIT(i) { (i) } ++#define ATOMIC64_INIT(i) { (i) } + + #define atomic_read(v) (*(volatile int *)&(v)->counter) + #define atomic64_read(v) (*(volatile long *)&(v)->counter) +diff --git a/arch/alpha/include/asm/socket.h b/arch/alpha/include/asm/socket.h +index 06edfef..3eeb47c 100644 +--- a/arch/alpha/include/asm/socket.h ++++ b/arch/alpha/include/asm/socket.h +@@ -69,9 +69,11 @@ + + #define SO_RXQ_OVFL 40 + ++#ifdef __KERNEL__ + /* O_NONBLOCK clashes with the bits used for socket types. Therefore we + * have to define SOCK_NONBLOCK to a different value here. 
+ */ + #define SOCK_NONBLOCK 0x40000000 ++#endif /* __KERNEL__ */ + + #endif /* _ASM_SOCKET_H */ +diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h +index 9451dce..8512475 100644 +--- a/arch/arm/include/asm/pgtable.h ++++ b/arch/arm/include/asm/pgtable.h +@@ -288,13 +288,13 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 +- * <--------------- offset --------------------> <- type --> 0 0 0 ++ * <--------------- offset ----------------------> < type -> 0 0 0 + * +- * This gives us up to 63 swap files and 32GB per swap file. Note that ++ * This gives us up to 31 swap files and 64GB per swap file. Note that + * the offset field is always non-zero. + */ + #define __SWP_TYPE_SHIFT 3 +-#define __SWP_TYPE_BITS 6 ++#define __SWP_TYPE_BITS 5 + #define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) + #define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) + +diff --git a/arch/arm/mm/tlb-v7.S b/arch/arm/mm/tlb-v7.S +index c202113..ea94765 100644 +--- a/arch/arm/mm/tlb-v7.S ++++ b/arch/arm/mm/tlb-v7.S +@@ -38,10 +38,10 @@ ENTRY(v7wbi_flush_user_tlb_range) + dsb + mov r0, r0, lsr #PAGE_SHIFT @ align address + mov r1, r1, lsr #PAGE_SHIFT +-#ifdef CONFIG_ARM_ERRATA_720789 +- mov r3, #0 +-#else + asid r3, r3 @ mask ASID ++#ifdef CONFIG_ARM_ERRATA_720789 ++ ALT_SMP(W(mov) r3, #0 ) ++ ALT_UP(W(nop) ) + #endif + orr r0, r3, r0, lsl #PAGE_SHIFT @ Create initial MVA + mov r1, r1, lsl #PAGE_SHIFT +diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c +index ad83dad..f0702f3 100644 +--- a/arch/arm/vfp/vfpmodule.c ++++ b/arch/arm/vfp/vfpmodule.c +@@ -628,8 +628,10 @@ static int __init vfp_init(void) + if ((fmrx(MVFR1) & 0x000fff00) == 0x00011100) + elf_hwcap |= HWCAP_NEON; + #endif ++#ifdef CONFIG_VFPv3 + if ((fmrx(MVFR1) & 0xf0000000) == 0x10000000) + elf_hwcap |= HWCAP_VFPv4; ++#endif + } + } + return 0; +diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c +index f581a18..df7d12c 100644 +--- a/arch/x86/mm/hugetlbpage.c ++++ b/arch/x86/mm/hugetlbpage.c +@@ -56,9 +56,16 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) + } + + /* +- * search for a shareable pmd page for hugetlb. ++ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() ++ * and returns the corresponding pte. While this is not necessary for the ++ * !shared pmd case because we can allocate the pmd later as well, it makes the ++ * code much cleaner. pmd allocation is essential for the shared case because ++ * pud has to be populated inside the same i_mmap_mutex section - otherwise ++ * racing tasks could either miss the sharing (see huge_pte_offset) or select a ++ * bad pmd for sharing. 
+ */ +-static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) ++static pte_t * ++huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) + { + struct vm_area_struct *vma = find_vma(mm, addr); + struct address_space *mapping = vma->vm_file->f_mapping; +@@ -68,9 +75,10 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) + struct vm_area_struct *svma; + unsigned long saddr; + pte_t *spte = NULL; ++ pte_t *pte; + + if (!vma_shareable(vma, addr)) +- return; ++ return (pte_t *)pmd_alloc(mm, pud, addr); + + mutex_lock(&mapping->i_mmap_mutex); + vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { +@@ -97,7 +105,9 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) + put_page(virt_to_page(spte)); + spin_unlock(&mm->page_table_lock); + out: ++ pte = (pte_t *)pmd_alloc(mm, pud, addr); + mutex_unlock(&mapping->i_mmap_mutex); ++ return pte; + } + + /* +@@ -142,8 +152,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, + } else { + BUG_ON(sz != PMD_SIZE); + if (pud_none(*pud)) +- huge_pmd_share(mm, addr, pud); +- pte = (pte_t *) pmd_alloc(mm, pud, addr); ++ pte = huge_pmd_share(mm, addr, pud); ++ else ++ pte = (pte_t *)pmd_alloc(mm, pud, addr); + } + } + BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); +diff --git a/drivers/acpi/acpica/tbxface.c b/drivers/acpi/acpica/tbxface.c +index e7d13f5..d05f2fe 100644 +--- a/drivers/acpi/acpica/tbxface.c ++++ b/drivers/acpi/acpica/tbxface.c +@@ -436,6 +436,7 @@ acpi_get_table_with_size(char *signature, + + return (AE_NOT_FOUND); + } ++ACPI_EXPORT_SYMBOL(acpi_get_table_with_size) + + acpi_status + acpi_get_table(char *signature, +diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c +index 8c78443..3790c80 100644 +--- a/drivers/base/power/runtime.c ++++ b/drivers/base/power/runtime.c +@@ -385,7 +385,6 @@ static int rpm_suspend(struct device *dev, int rpmflags) + goto repeat; + } + +- dev->power.deferred_resume = false; + if (dev->power.no_callbacks) + goto no_callback; /* Assume success. */ + +@@ -446,6 +445,7 @@ static int rpm_suspend(struct device *dev, int rpmflags) + wake_up_all(&dev->power.wait_queue); + + if (dev->power.deferred_resume) { ++ dev->power.deferred_resume = false; + rpm_resume(dev, 0); + retval = -EAGAIN; + goto out; +@@ -568,6 +568,7 @@ static int rpm_resume(struct device *dev, int rpmflags) + || dev->parent->power.runtime_status == RPM_ACTIVE) { + atomic_inc(&dev->parent->power.child_count); + spin_unlock(&dev->parent->power.lock); ++ retval = 1; + goto no_callback; /* Assume success. */ + } + spin_unlock(&dev->parent->power.lock); +@@ -645,7 +646,7 @@ static int rpm_resume(struct device *dev, int rpmflags) + } + wake_up_all(&dev->power.wait_queue); + +- if (!retval) ++ if (retval >= 0) + rpm_idle(dev, RPM_ASYNC); + + out: +diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c +index acda773..38aa6dd 100644 +--- a/drivers/block/cciss_scsi.c ++++ b/drivers/block/cciss_scsi.c +@@ -763,16 +763,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout, + { + case CMD_TARGET_STATUS: + /* Pass it up to the upper layers... */ +- if( ei->ScsiStatus) +- { +-#if 0 +- printk(KERN_WARNING "cciss: cmd %p " +- "has SCSI Status = %x\n", +- c, ei->ScsiStatus); +-#endif +- cmd->result |= (ei->ScsiStatus << 1); +- } +- else { /* scsi status is zero??? How??? 
*/ ++ if (!ei->ScsiStatus) { + + /* Ordinarily, this case should never happen, but there is a bug + in some released firmware revisions that allows it to happen +diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c +index 650a308..de9c800 100644 +--- a/drivers/block/virtio_blk.c ++++ b/drivers/block/virtio_blk.c +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -26,14 +27,17 @@ struct virtio_blk + /* The disk structure for the kernel. */ + struct gendisk *disk; + +- /* Request tracking. */ +- struct list_head reqs; +- + mempool_t *pool; + + /* Process context for config space updates */ + struct work_struct config_work; + ++ /* Lock for config space updates */ ++ struct mutex config_lock; ++ ++ /* enable config space updates */ ++ bool config_enable; ++ + /* What host tells us, plus 2 for header & tailer. */ + unsigned int sg_elems; + +@@ -46,7 +50,6 @@ struct virtio_blk + + struct virtblk_req + { +- struct list_head list; + struct request *req; + struct virtio_blk_outhdr out_hdr; + struct virtio_scsi_inhdr in_hdr; +@@ -90,7 +93,6 @@ static void blk_done(struct virtqueue *vq) + } + + __blk_end_request_all(vbr->req, error); +- list_del(&vbr->list); + mempool_free(vbr, vblk->pool); + } + /* In case queue is stopped waiting for more buffers. */ +@@ -175,7 +177,6 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, + return false; + } + +- list_add_tail(&vbr->list, &vblk->reqs); + return true; + } + +@@ -316,6 +317,10 @@ static void virtblk_config_changed_work(struct work_struct *work) + char cap_str_2[10], cap_str_10[10]; + u64 capacity, size; + ++ mutex_lock(&vblk->config_lock); ++ if (!vblk->config_enable) ++ goto done; ++ + /* Host must always specify the capacity. */ + vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity), + &capacity, sizeof(capacity)); +@@ -338,6 +343,8 @@ static void virtblk_config_changed_work(struct work_struct *work) + cap_str_10, cap_str_2); + + set_capacity(vblk->disk, capacity); ++done: ++ mutex_unlock(&vblk->config_lock); + } + + static void virtblk_config_changed(struct virtio_device *vdev) +@@ -381,11 +388,12 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) + goto out_free_index; + } + +- INIT_LIST_HEAD(&vblk->reqs); + vblk->vdev = vdev; + vblk->sg_elems = sg_elems; + sg_init_table(vblk->sg, vblk->sg_elems); ++ mutex_init(&vblk->config_lock); + INIT_WORK(&vblk->config_work, virtblk_config_changed_work); ++ vblk->config_enable = true; + + /* We expect one virtqueue, for output. */ + vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests"); +@@ -539,16 +547,19 @@ static void __devexit virtblk_remove(struct virtio_device *vdev) + struct virtio_blk *vblk = vdev->priv; + int index = vblk->index; + +- flush_work(&vblk->config_work); ++ /* Prevent config work handler from accessing the device. */ ++ mutex_lock(&vblk->config_lock); ++ vblk->config_enable = false; ++ mutex_unlock(&vblk->config_lock); + +- /* Nothing should be pending. */ +- BUG_ON(!list_empty(&vblk->reqs)); ++ del_gendisk(vblk->disk); ++ blk_cleanup_queue(vblk->disk->queue); + + /* Stop all the virtqueues. 
*/ + vdev->config->reset(vdev); + +- del_gendisk(vblk->disk); +- blk_cleanup_queue(vblk->disk->queue); ++ flush_work(&vblk->config_work); ++ + put_disk(vblk->disk); + mempool_destroy(vblk->pool); + vdev->config->del_vqs(vdev); +diff --git a/drivers/char/random.c b/drivers/char/random.c +index 631d4f6..8ae9235 100644 +--- a/drivers/char/random.c ++++ b/drivers/char/random.c +@@ -1114,6 +1114,16 @@ static void init_std_data(struct entropy_store *r) + mix_pool_bytes(r, utsname(), sizeof(*(utsname())), NULL); + } + ++/* ++ * Note that setup_arch() may call add_device_randomness() ++ * long before we get here. This allows seeding of the pools ++ * with some platform dependent data very early in the boot ++ * process. But it limits our options here. We must use ++ * statically allocated structures that already have all ++ * initializations complete at compile time. We should also ++ * take care not to overwrite the precious per platform data ++ * we were given. ++ */ + static int rand_initialize(void) + { + init_std_data(&input_pool); +@@ -1391,10 +1401,15 @@ static int proc_do_uuid(ctl_table *table, int write, + uuid = table->data; + if (!uuid) { + uuid = tmp_uuid; +- uuid[8] = 0; +- } +- if (uuid[8] == 0) + generate_random_uuid(uuid); ++ } else { ++ static DEFINE_SPINLOCK(bootid_spinlock); ++ ++ spin_lock(&bootid_spinlock); ++ if (!uuid[8]) ++ generate_random_uuid(uuid); ++ spin_unlock(&bootid_spinlock); ++ } + + sprintf(buf, "%pU", uuid); + +diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c +index 153980b..b298158 100644 +--- a/drivers/firmware/dmi_scan.c ++++ b/drivers/firmware/dmi_scan.c +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -111,6 +112,8 @@ static int __init dmi_walk_early(void (*decode)(const struct dmi_header *, + + dmi_table(buf, dmi_len, dmi_num, decode, NULL); + ++ add_device_randomness(buf, dmi_len); ++ + dmi_iounmap(buf, dmi_len); + return 0; + } +diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c +index cc75c4b..3eed270 100644 +--- a/drivers/gpu/drm/i915/intel_display.c ++++ b/drivers/gpu/drm/i915/intel_display.c +@@ -4748,17 +4748,6 @@ static bool intel_choose_pipe_bpp_dither(struct drm_crtc *crtc, + continue; + } + +- if (intel_encoder->type == INTEL_OUTPUT_EDP) { +- /* Use VBT settings if we have an eDP panel */ +- unsigned int edp_bpc = dev_priv->edp.bpp / 3; +- +- if (edp_bpc < display_bpc) { +- DRM_DEBUG_KMS("clamping display bpc (was %d) to eDP (%d)\n", display_bpc, edp_bpc); +- display_bpc = edp_bpc; +- } +- continue; +- } +- + /* Not one of the known troublemakers, check the EDID */ + list_for_each_entry(connector, &dev->mode_config.connector_list, + head) { +diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c +index fae2050..c8ecaab 100644 +--- a/drivers/gpu/drm/i915/intel_dp.c ++++ b/drivers/gpu/drm/i915/intel_dp.c +@@ -1152,10 +1152,14 @@ static void ironlake_edp_panel_off(struct intel_dp *intel_dp) + WARN(!intel_dp->want_panel_vdd, "Need VDD to turn off panel\n"); + + pp = ironlake_get_pp_control(dev_priv); +- pp &= ~(POWER_TARGET_ON | PANEL_POWER_RESET | EDP_BLC_ENABLE); ++ /* We need to switch off panel power _and_ force vdd, for otherwise some ++ * panels get very unhappy and cease to work. 
*/ ++ pp &= ~(POWER_TARGET_ON | EDP_FORCE_VDD | PANEL_POWER_RESET | EDP_BLC_ENABLE); + I915_WRITE(PCH_PP_CONTROL, pp); + POSTING_READ(PCH_PP_CONTROL); + ++ intel_dp->want_panel_vdd = false; ++ + ironlake_wait_panel_off(intel_dp); + } + +@@ -1265,11 +1269,9 @@ static void intel_dp_prepare(struct drm_encoder *encoder) + * ensure that we have vdd while we switch off the panel. */ + ironlake_edp_panel_vdd_on(intel_dp); + ironlake_edp_backlight_off(intel_dp); +- ironlake_edp_panel_off(intel_dp); +- + intel_dp_sink_dpms(intel_dp, DRM_MODE_DPMS_ON); ++ ironlake_edp_panel_off(intel_dp); + intel_dp_link_down(intel_dp); +- ironlake_edp_panel_vdd_off(intel_dp, false); + } + + static void intel_dp_commit(struct drm_encoder *encoder) +@@ -1304,11 +1306,9 @@ intel_dp_dpms(struct drm_encoder *encoder, int mode) + /* Switching the panel off requires vdd. */ + ironlake_edp_panel_vdd_on(intel_dp); + ironlake_edp_backlight_off(intel_dp); +- ironlake_edp_panel_off(intel_dp); +- + intel_dp_sink_dpms(intel_dp, mode); ++ ironlake_edp_panel_off(intel_dp); + intel_dp_link_down(intel_dp); +- ironlake_edp_panel_vdd_off(intel_dp, false); + + if (is_cpu_edp(intel_dp)) + ironlake_edp_pll_off(encoder); +diff --git a/drivers/gpu/drm/nouveau/nvd0_display.c b/drivers/gpu/drm/nouveau/nvd0_display.c +index cb006a7..3002d82 100644 +--- a/drivers/gpu/drm/nouveau/nvd0_display.c ++++ b/drivers/gpu/drm/nouveau/nvd0_display.c +@@ -472,7 +472,7 @@ static int + nvd0_crtc_cursor_move(struct drm_crtc *crtc, int x, int y) + { + struct nouveau_crtc *nv_crtc = nouveau_crtc(crtc); +- const u32 data = (y << 16) | x; ++ const u32 data = (y << 16) | (x & 0xffff); + + nv_wr32(crtc->dev, 0x64d084 + (nv_crtc->index * 0x1000), data); + nv_wr32(crtc->dev, 0x64d080 + (nv_crtc->index * 0x1000), 0x00000000); +diff --git a/drivers/gpu/drm/radeon/atombios.h b/drivers/gpu/drm/radeon/atombios.h +index 1b50ad8..4760466 100644 +--- a/drivers/gpu/drm/radeon/atombios.h ++++ b/drivers/gpu/drm/radeon/atombios.h +@@ -101,6 +101,7 @@ + #define ATOM_LCD_SELFTEST_START (ATOM_DISABLE+5) + #define ATOM_LCD_SELFTEST_STOP (ATOM_ENABLE+5) + #define ATOM_ENCODER_INIT (ATOM_DISABLE+7) ++#define ATOM_INIT (ATOM_DISABLE+7) + #define ATOM_GET_STATUS (ATOM_DISABLE+8) + + #define ATOM_BLANKING 1 +@@ -251,25 +252,25 @@ typedef struct _ATOM_MASTER_LIST_OF_COMMAND_TABLES{ + USHORT SetEngineClock; //Function Table,directly used by various SW components,latest version 1.1 + USHORT SetMemoryClock; //Function Table,directly used by various SW components,latest version 1.1 + USHORT SetPixelClock; //Function Table,directly used by various SW components,latest version 1.2 +- USHORT DynamicClockGating; //Atomic Table, indirectly used by various SW components,called from ASIC_Init ++ USHORT EnableDispPowerGating; //Atomic Table, indirectly used by various SW components,called from ASIC_Init + USHORT ResetMemoryDLL; //Atomic Table, indirectly used by various SW components,called from SetMemoryClock + USHORT ResetMemoryDevice; //Atomic Table, indirectly used by various SW components,called from SetMemoryClock +- USHORT MemoryPLLInit; +- USHORT AdjustDisplayPll; //only used by Bios ++ USHORT MemoryPLLInit; //Atomic Table, used only by Bios ++ USHORT AdjustDisplayPll; //Atomic Table, used by various SW componentes. 
+ USHORT AdjustMemoryController; //Atomic Table, indirectly used by various SW components,called from SetMemoryClock + USHORT EnableASIC_StaticPwrMgt; //Atomic Table, only used by Bios + USHORT ASIC_StaticPwrMgtStatusChange; //Obsolete , only used by Bios + USHORT DAC_LoadDetection; //Atomic Table, directly used by various SW components,latest version 1.2 + USHORT LVTMAEncoderControl; //Atomic Table,directly used by various SW components,latest version 1.3 +- USHORT LCD1OutputControl; //Atomic Table, directly used by various SW components,latest version 1.1 ++ USHORT HW_Misc_Operation; //Atomic Table, directly used by various SW components,latest version 1.1 + USHORT DAC1EncoderControl; //Atomic Table, directly used by various SW components,latest version 1.1 + USHORT DAC2EncoderControl; //Atomic Table, directly used by various SW components,latest version 1.1 + USHORT DVOOutputControl; //Atomic Table, directly used by various SW components,latest version 1.1 + USHORT CV1OutputControl; //Atomic Table, Atomic Table, Obsolete from Ry6xx, use DAC2 Output instead +- USHORT GetConditionalGoldenSetting; //only used by Bios ++ USHORT GetConditionalGoldenSetting; //Only used by Bios + USHORT TVEncoderControl; //Function Table,directly used by various SW components,latest version 1.1 +- USHORT TMDSAEncoderControl; //Atomic Table, directly used by various SW components,latest version 1.3 +- USHORT LVDSEncoderControl; //Atomic Table, directly used by various SW components,latest version 1.3 ++ USHORT PatchMCSetting; //only used by BIOS ++ USHORT MC_SEQ_Control; //only used by BIOS + USHORT TV1OutputControl; //Atomic Table, Obsolete from Ry6xx, use DAC2 Output instead + USHORT EnableScaler; //Atomic Table, used only by Bios + USHORT BlankCRTC; //Atomic Table, directly used by various SW components,latest version 1.1 +@@ -282,7 +283,7 @@ typedef struct _ATOM_MASTER_LIST_OF_COMMAND_TABLES{ + USHORT SetCRTC_Replication; //Atomic Table, used only by Bios + USHORT SelectCRTC_Source; //Atomic Table, directly used by various SW components,latest version 1.1 + USHORT EnableGraphSurfaces; //Atomic Table, used only by Bios +- USHORT UpdateCRTC_DoubleBufferRegisters; ++ USHORT UpdateCRTC_DoubleBufferRegisters; //Atomic Table, used only by Bios + USHORT LUT_AutoFill; //Atomic Table, only used by Bios + USHORT EnableHW_IconCursor; //Atomic Table, only used by Bios + USHORT GetMemoryClock; //Atomic Table, directly used by various SW components,latest version 1.1 +@@ -308,27 +309,36 @@ typedef struct _ATOM_MASTER_LIST_OF_COMMAND_TABLES{ + USHORT SetVoltage; //Function Table,directly and/or indirectly used by various SW components,latest version 1.1 + USHORT DAC1OutputControl; //Atomic Table, directly used by various SW components,latest version 1.1 + USHORT DAC2OutputControl; //Atomic Table, directly used by various SW components,latest version 1.1 +- USHORT SetupHWAssistedI2CStatus; //Function Table,only used by Bios, obsolete soon.Switch to use "ReadEDIDFromHWAssistedI2C" ++ USHORT ComputeMemoryClockParam; //Function Table,only used by Bios, obsolete soon.Switch to use "ReadEDIDFromHWAssistedI2C" + USHORT ClockSource; //Atomic Table, indirectly used by various SW components,called from ASIC_Init + USHORT MemoryDeviceInit; //Atomic Table, indirectly used by various SW components,called from SetMemoryClock +- USHORT EnableYUV; //Atomic Table, indirectly used by various SW components,called from EnableVGARender ++ USHORT GetDispObjectInfo; //Atomic Table, indirectly used by various SW components,called from 
EnableVGARender + USHORT DIG1EncoderControl; //Atomic Table,directly used by various SW components,latest version 1.1 + USHORT DIG2EncoderControl; //Atomic Table,directly used by various SW components,latest version 1.1 + USHORT DIG1TransmitterControl; //Atomic Table,directly used by various SW components,latest version 1.1 + USHORT DIG2TransmitterControl; //Atomic Table,directly used by various SW components,latest version 1.1 + USHORT ProcessAuxChannelTransaction; //Function Table,only used by Bios + USHORT DPEncoderService; //Function Table,only used by Bios ++ USHORT GetVoltageInfo; //Function Table,only used by Bios since SI + }ATOM_MASTER_LIST_OF_COMMAND_TABLES; + + // For backward compatible + #define ReadEDIDFromHWAssistedI2C ProcessI2cChannelTransaction +-#define UNIPHYTransmitterControl DIG1TransmitterControl +-#define LVTMATransmitterControl DIG2TransmitterControl ++#define DPTranslatorControl DIG2EncoderControl ++#define UNIPHYTransmitterControl DIG1TransmitterControl ++#define LVTMATransmitterControl DIG2TransmitterControl + #define SetCRTC_DPM_State GetConditionalGoldenSetting + #define SetUniphyInstance ASIC_StaticPwrMgtStatusChange + #define HPDInterruptService ReadHWAssistedI2CStatus + #define EnableVGA_Access GetSCLKOverMCLKRatio +-#define GetDispObjectInfo EnableYUV ++#define EnableYUV GetDispObjectInfo ++#define DynamicClockGating EnableDispPowerGating ++#define SetupHWAssistedI2CStatus ComputeMemoryClockParam ++ ++#define TMDSAEncoderControl PatchMCSetting ++#define LVDSEncoderControl MC_SEQ_Control ++#define LCD1OutputControl HW_Misc_Operation ++ + + typedef struct _ATOM_MASTER_COMMAND_TABLE + { +@@ -495,6 +505,34 @@ typedef struct _COMPUTE_MEMORY_ENGINE_PLL_PARAMETERS_V5 + // ucInputFlag + #define ATOM_PLL_INPUT_FLAG_PLL_STROBE_MODE_EN 1 // 1-StrobeMode, 0-PerformanceMode + ++// use for ComputeMemoryClockParamTable ++typedef struct _COMPUTE_MEMORY_CLOCK_PARAM_PARAMETERS_V2_1 ++{ ++ union ++ { ++ ULONG ulClock; ++ ATOM_S_MPLL_FB_DIVIDER ulFbDiv; //Output:UPPER_WORD=FB_DIV_INTEGER, LOWER_WORD=FB_DIV_FRAC shl (16-FB_FRACTION_BITS) ++ }; ++ UCHAR ucDllSpeed; //Output ++ UCHAR ucPostDiv; //Output ++ union{ ++ UCHAR ucInputFlag; //Input : ATOM_PLL_INPUT_FLAG_PLL_STROBE_MODE_EN: 1-StrobeMode, 0-PerformanceMode ++ UCHAR ucPllCntlFlag; //Output: ++ }; ++ UCHAR ucBWCntl; ++}COMPUTE_MEMORY_CLOCK_PARAM_PARAMETERS_V2_1; ++ ++// definition of ucInputFlag ++#define MPLL_INPUT_FLAG_STROBE_MODE_EN 0x01 ++// definition of ucPllCntlFlag ++#define MPLL_CNTL_FLAG_VCO_MODE_MASK 0x03 ++#define MPLL_CNTL_FLAG_BYPASS_DQ_PLL 0x04 ++#define MPLL_CNTL_FLAG_QDR_ENABLE 0x08 ++#define MPLL_CNTL_FLAG_AD_HALF_RATE 0x10 ++ ++//MPLL_CNTL_FLAG_BYPASS_AD_PLL has a wrong name, should be BYPASS_DQ_PLL ++#define MPLL_CNTL_FLAG_BYPASS_AD_PLL 0x04 ++ + typedef struct _DYNAMICE_MEMORY_SETTINGS_PARAMETER + { + ATOM_COMPUTE_CLOCK_FREQ ulClock; +@@ -562,6 +600,16 @@ typedef struct _DYNAMIC_CLOCK_GATING_PARAMETERS + #define DYNAMIC_CLOCK_GATING_PS_ALLOCATION DYNAMIC_CLOCK_GATING_PARAMETERS + + /****************************************************************************/ ++// Structure used by EnableDispPowerGatingTable.ctb ++/****************************************************************************/ ++typedef struct _ENABLE_DISP_POWER_GATING_PARAMETERS_V2_1 ++{ ++ UCHAR ucDispPipeId; // ATOM_CRTC1, ATOM_CRTC2, ... 
++ UCHAR ucEnable; // ATOM_ENABLE or ATOM_DISABLE ++ UCHAR ucPadding[2]; ++}ENABLE_DISP_POWER_GATING_PARAMETERS_V2_1; ++ ++/****************************************************************************/ + // Structure used by EnableASIC_StaticPwrMgtTable.ctb + /****************************************************************************/ + typedef struct _ENABLE_ASIC_STATIC_PWR_MGT_PARAMETERS +@@ -807,6 +855,7 @@ typedef struct _ATOM_DIG_ENCODER_CONFIG_V4 + #define ATOM_ENCODER_CONFIG_V4_DPLINKRATE_1_62GHZ 0x00 + #define ATOM_ENCODER_CONFIG_V4_DPLINKRATE_2_70GHZ 0x01 + #define ATOM_ENCODER_CONFIG_V4_DPLINKRATE_5_40GHZ 0x02 ++#define ATOM_ENCODER_CONFIG_V4_DPLINKRATE_3_24GHZ 0x03 + #define ATOM_ENCODER_CONFIG_V4_ENCODER_SEL 0x70 + #define ATOM_ENCODER_CONFIG_V4_DIG0_ENCODER 0x00 + #define ATOM_ENCODER_CONFIG_V4_DIG1_ENCODER 0x10 +@@ -814,6 +863,7 @@ typedef struct _ATOM_DIG_ENCODER_CONFIG_V4 + #define ATOM_ENCODER_CONFIG_V4_DIG3_ENCODER 0x30 + #define ATOM_ENCODER_CONFIG_V4_DIG4_ENCODER 0x40 + #define ATOM_ENCODER_CONFIG_V4_DIG5_ENCODER 0x50 ++#define ATOM_ENCODER_CONFIG_V4_DIG6_ENCODER 0x60 + + typedef struct _DIG_ENCODER_CONTROL_PARAMETERS_V4 + { +@@ -1171,6 +1221,106 @@ typedef struct _DIG_TRANSMITTER_CONTROL_PARAMETERS_V4 + #define ATOM_TRANSMITTER_CONFIG_V4_TRANSMITTER3 0x80 //EF + + ++typedef struct _ATOM_DIG_TRANSMITTER_CONFIG_V5 ++{ ++#if ATOM_BIG_ENDIAN ++ UCHAR ucReservd1:1; ++ UCHAR ucHPDSel:3; ++ UCHAR ucPhyClkSrcId:2; ++ UCHAR ucCoherentMode:1; ++ UCHAR ucReserved:1; ++#else ++ UCHAR ucReserved:1; ++ UCHAR ucCoherentMode:1; ++ UCHAR ucPhyClkSrcId:2; ++ UCHAR ucHPDSel:3; ++ UCHAR ucReservd1:1; ++#endif ++}ATOM_DIG_TRANSMITTER_CONFIG_V5; ++ ++typedef struct _DIG_TRANSMITTER_CONTROL_PARAMETERS_V1_5 ++{ ++ USHORT usSymClock; // Encoder Clock in 10kHz,(DP mode)= linkclock/10, (TMDS/LVDS/HDMI)= pixel clock, (HDMI deep color), =pixel clock * deep_color_ratio ++ UCHAR ucPhyId; // 0=UNIPHYA, 1=UNIPHYB, 2=UNIPHYC, 3=UNIPHYD, 4= UNIPHYE 5=UNIPHYF ++ UCHAR ucAction; // define as ATOM_TRANSMITER_ACTION_xxx ++ UCHAR ucLaneNum; // indicate lane number 1-8 ++ UCHAR ucConnObjId; // Connector Object Id defined in ObjectId.h ++ UCHAR ucDigMode; // indicate DIG mode ++ union{ ++ ATOM_DIG_TRANSMITTER_CONFIG_V5 asConfig; ++ UCHAR ucConfig; ++ }; ++ UCHAR ucDigEncoderSel; // indicate DIG front end encoder ++ UCHAR ucDPLaneSet; ++ UCHAR ucReserved; ++ UCHAR ucReserved1; ++}DIG_TRANSMITTER_CONTROL_PARAMETERS_V1_5; ++ ++//ucPhyId ++#define ATOM_PHY_ID_UNIPHYA 0 ++#define ATOM_PHY_ID_UNIPHYB 1 ++#define ATOM_PHY_ID_UNIPHYC 2 ++#define ATOM_PHY_ID_UNIPHYD 3 ++#define ATOM_PHY_ID_UNIPHYE 4 ++#define ATOM_PHY_ID_UNIPHYF 5 ++#define ATOM_PHY_ID_UNIPHYG 6 ++ ++// ucDigEncoderSel ++#define ATOM_TRANMSITTER_V5__DIGA_SEL 0x01 ++#define ATOM_TRANMSITTER_V5__DIGB_SEL 0x02 ++#define ATOM_TRANMSITTER_V5__DIGC_SEL 0x04 ++#define ATOM_TRANMSITTER_V5__DIGD_SEL 0x08 ++#define ATOM_TRANMSITTER_V5__DIGE_SEL 0x10 ++#define ATOM_TRANMSITTER_V5__DIGF_SEL 0x20 ++#define ATOM_TRANMSITTER_V5__DIGG_SEL 0x40 ++ ++// ucDigMode ++#define ATOM_TRANSMITTER_DIGMODE_V5_DP 0 ++#define ATOM_TRANSMITTER_DIGMODE_V5_LVDS 1 ++#define ATOM_TRANSMITTER_DIGMODE_V5_DVI 2 ++#define ATOM_TRANSMITTER_DIGMODE_V5_HDMI 3 ++#define ATOM_TRANSMITTER_DIGMODE_V5_SDVO 4 ++#define ATOM_TRANSMITTER_DIGMODE_V5_DP_MST 5 ++ ++// ucDPLaneSet ++#define DP_LANE_SET__0DB_0_4V 0x00 ++#define DP_LANE_SET__0DB_0_6V 0x01 ++#define DP_LANE_SET__0DB_0_8V 0x02 ++#define DP_LANE_SET__0DB_1_2V 0x03 ++#define DP_LANE_SET__3_5DB_0_4V 0x08 ++#define DP_LANE_SET__3_5DB_0_6V 0x09 
++#define DP_LANE_SET__3_5DB_0_8V 0x0a ++#define DP_LANE_SET__6DB_0_4V 0x10 ++#define DP_LANE_SET__6DB_0_6V 0x11 ++#define DP_LANE_SET__9_5DB_0_4V 0x18 ++ ++// ATOM_DIG_TRANSMITTER_CONFIG_V5 asConfig; ++// Bit1 ++#define ATOM_TRANSMITTER_CONFIG_V5_COHERENT 0x02 ++ ++// Bit3:2 ++#define ATOM_TRANSMITTER_CONFIG_V5_REFCLK_SEL_MASK 0x0c ++#define ATOM_TRANSMITTER_CONFIG_V5_REFCLK_SEL_SHIFT 0x02 ++ ++#define ATOM_TRANSMITTER_CONFIG_V5_P1PLL 0x00 ++#define ATOM_TRANSMITTER_CONFIG_V5_P2PLL 0x04 ++#define ATOM_TRANSMITTER_CONFIG_V5_P0PLL 0x08 ++#define ATOM_TRANSMITTER_CONFIG_V5_REFCLK_SRC_EXT 0x0c ++// Bit6:4 ++#define ATOM_TRANSMITTER_CONFIG_V5_HPD_SEL_MASK 0x70 ++#define ATOM_TRANSMITTER_CONFIG_V5_HPD_SEL_SHIFT 0x04 ++ ++#define ATOM_TRANSMITTER_CONFIG_V5_NO_HPD_SEL 0x00 ++#define ATOM_TRANSMITTER_CONFIG_V5_HPD1_SEL 0x10 ++#define ATOM_TRANSMITTER_CONFIG_V5_HPD2_SEL 0x20 ++#define ATOM_TRANSMITTER_CONFIG_V5_HPD3_SEL 0x30 ++#define ATOM_TRANSMITTER_CONFIG_V5_HPD4_SEL 0x40 ++#define ATOM_TRANSMITTER_CONFIG_V5_HPD5_SEL 0x50 ++#define ATOM_TRANSMITTER_CONFIG_V5_HPD6_SEL 0x60 ++ ++#define DIG_TRANSMITTER_CONTROL_PS_ALLOCATION_V1_5 DIG_TRANSMITTER_CONTROL_PARAMETERS_V1_5 ++ ++ + /****************************************************************************/ + // Structures used by ExternalEncoderControlTable V1.3 + // ASIC Families: Evergreen, Llano, NI +@@ -1793,6 +1943,7 @@ typedef struct _ENABLE_SPREAD_SPECTRUM_ON_PPLL_V2 + #define ATOM_PPLL_SS_TYPE_V3_P1PLL 0x00 + #define ATOM_PPLL_SS_TYPE_V3_P2PLL 0x04 + #define ATOM_PPLL_SS_TYPE_V3_DCPLL 0x08 ++#define ATOM_PPLL_SS_TYPE_V3_P0PLL ATOM_PPLL_SS_TYPE_V3_DCPLL + #define ATOM_PPLL_SS_AMOUNT_V3_FBDIV_MASK 0x00FF + #define ATOM_PPLL_SS_AMOUNT_V3_FBDIV_SHIFT 0 + #define ATOM_PPLL_SS_AMOUNT_V3_NFRAC_MASK 0x0F00 +@@ -2030,12 +2181,77 @@ typedef struct _SET_VOLTAGE_PARAMETERS_V2 + USHORT usVoltageLevel; // real voltage level + }SET_VOLTAGE_PARAMETERS_V2; + ++ ++typedef struct _SET_VOLTAGE_PARAMETERS_V1_3 ++{ ++ UCHAR ucVoltageType; // To tell which voltage to set up, VDDC/MVDDC/MVDDQ/VDDCI ++ UCHAR ucVoltageMode; // Indicate action: Set voltage level ++ USHORT usVoltageLevel; // real voltage level in unit of mv or Voltage Phase (0, 1, 2, .. ) ++}SET_VOLTAGE_PARAMETERS_V1_3; ++ ++//ucVoltageType ++#define VOLTAGE_TYPE_VDDC 1 ++#define VOLTAGE_TYPE_MVDDC 2 ++#define VOLTAGE_TYPE_MVDDQ 3 ++#define VOLTAGE_TYPE_VDDCI 4 ++ ++//SET_VOLTAGE_PARAMETERS_V3.ucVoltageMode ++#define ATOM_SET_VOLTAGE 0 //Set voltage Level ++#define ATOM_INIT_VOLTAGE_REGULATOR 3 //Init Regulator ++#define ATOM_SET_VOLTAGE_PHASE 4 //Set Vregulator Phase ++#define ATOM_GET_MAX_VOLTAGE 6 //Get Max Voltage, not used in SetVoltageTable v1.3 ++#define ATOM_GET_VOLTAGE_LEVEL 6 //Get Voltage level from vitual voltage ID ++ ++// define vitual voltage id in usVoltageLevel ++#define ATOM_VIRTUAL_VOLTAGE_ID0 0xff01 ++#define ATOM_VIRTUAL_VOLTAGE_ID1 0xff02 ++#define ATOM_VIRTUAL_VOLTAGE_ID2 0xff03 ++#define ATOM_VIRTUAL_VOLTAGE_ID3 0xff04 ++ + typedef struct _SET_VOLTAGE_PS_ALLOCATION + { + SET_VOLTAGE_PARAMETERS sASICSetVoltage; + WRITE_ONE_BYTE_HW_I2C_DATA_PS_ALLOCATION sReserved; + }SET_VOLTAGE_PS_ALLOCATION; + ++// New Added from SI for GetVoltageInfoTable, input parameter structure ++typedef struct _GET_VOLTAGE_INFO_INPUT_PARAMETER_V1_1 ++{ ++ UCHAR ucVoltageType; // Input: To tell which voltage to set up, VDDC/MVDDC/MVDDQ/VDDCI ++ UCHAR ucVoltageMode; // Input: Indicate action: Get voltage info ++ USHORT usVoltageLevel; // Input: real voltage level in unit of mv or Voltage Phase (0, 1, 2, .. 
) or Leakage Id ++ ULONG ulReserved; ++}GET_VOLTAGE_INFO_INPUT_PARAMETER_V1_1; ++ ++// New Added from SI for GetVoltageInfoTable, output parameter structure when ucVotlageMode == ATOM_GET_VOLTAGE_VID ++typedef struct _GET_VOLTAGE_INFO_OUTPUT_PARAMETER_V1_1 ++{ ++ ULONG ulVotlageGpioState; ++ ULONG ulVoltageGPioMask; ++}GET_VOLTAGE_INFO_OUTPUT_PARAMETER_V1_1; ++ ++// New Added from SI for GetVoltageInfoTable, output parameter structure when ucVotlageMode == ATOM_GET_VOLTAGE_STATEx_LEAKAGE_VID ++typedef struct _GET_LEAKAGE_VOLTAGE_INFO_OUTPUT_PARAMETER_V1_1 ++{ ++ USHORT usVoltageLevel; ++ USHORT usVoltageId; // Voltage Id programmed in Voltage Regulator ++ ULONG ulReseved; ++}GET_LEAKAGE_VOLTAGE_INFO_OUTPUT_PARAMETER_V1_1; ++ ++ ++// GetVoltageInfo v1.1 ucVoltageMode ++#define ATOM_GET_VOLTAGE_VID 0x00 ++#define ATOM_GET_VOTLAGE_INIT_SEQ 0x03 ++#define ATOM_GET_VOLTTAGE_PHASE_PHASE_VID 0x04 ++// for SI, this state map to 0xff02 voltage state in Power Play table, which is power boost state ++#define ATOM_GET_VOLTAGE_STATE0_LEAKAGE_VID 0x10 ++ ++// for SI, this state map to 0xff01 voltage state in Power Play table, which is performance state ++#define ATOM_GET_VOLTAGE_STATE1_LEAKAGE_VID 0x11 ++// undefined power state ++#define ATOM_GET_VOLTAGE_STATE2_LEAKAGE_VID 0x12 ++#define ATOM_GET_VOLTAGE_STATE3_LEAKAGE_VID 0x13 ++ + /****************************************************************************/ + // Structures used by TVEncoderControlTable + /****************************************************************************/ +@@ -2065,9 +2281,9 @@ typedef struct _ATOM_MASTER_LIST_OF_DATA_TABLES + USHORT MultimediaConfigInfo; // Only used by MM Lib,latest version 2.1, not configuable from Bios, need to include the table to build Bios + USHORT StandardVESA_Timing; // Only used by Bios + USHORT FirmwareInfo; // Shared by various SW components,latest version 1.4 +- USHORT DAC_Info; // Will be obsolete from R600 ++ USHORT PaletteData; // Only used by BIOS + USHORT LCD_Info; // Shared by various SW components,latest version 1.3, was called LVDS_Info +- USHORT TMDS_Info; // Will be obsolete from R600 ++ USHORT DIGTransmitterInfo; // Internal used by VBIOS only version 3.1 + USHORT AnalogTV_Info; // Shared by various SW components,latest version 1.1 + USHORT SupportedDevicesInfo; // Will be obsolete from R600 + USHORT GPIO_I2C_Info; // Shared by various SW components,latest version 1.2 will be used from R600 +@@ -2096,15 +2312,16 @@ typedef struct _ATOM_MASTER_LIST_OF_DATA_TABLES + USHORT PowerSourceInfo; // Shared by various SW components, latest versoin 1.1 + }ATOM_MASTER_LIST_OF_DATA_TABLES; + +-// For backward compatible +-#define LVDS_Info LCD_Info +- + typedef struct _ATOM_MASTER_DATA_TABLE + { + ATOM_COMMON_TABLE_HEADER sHeader; + ATOM_MASTER_LIST_OF_DATA_TABLES ListOfDataTables; + }ATOM_MASTER_DATA_TABLE; + ++// For backward compatible ++#define LVDS_Info LCD_Info ++#define DAC_Info PaletteData ++#define TMDS_Info DIGTransmitterInfo + + /****************************************************************************/ + // Structure used in MultimediaCapabilityInfoTable +@@ -2171,7 +2388,9 @@ typedef struct _ATOM_MULTIMEDIA_CONFIG_INFO + typedef struct _ATOM_FIRMWARE_CAPABILITY + { + #if ATOM_BIG_ENDIAN +- USHORT Reserved:3; ++ USHORT Reserved:1; ++ USHORT SCL2Redefined:1; ++ USHORT PostWithoutModeSet:1; + USHORT HyperMemory_Size:4; + USHORT HyperMemory_Support:1; + USHORT PPMode_Assigned:1; +@@ -2193,7 +2412,9 @@ typedef struct _ATOM_FIRMWARE_CAPABILITY + USHORT PPMode_Assigned:1; + USHORT 
HyperMemory_Support:1; + USHORT HyperMemory_Size:4; +- USHORT Reserved:3; ++ USHORT PostWithoutModeSet:1; ++ USHORT SCL2Redefined:1; ++ USHORT Reserved:1; + #endif + }ATOM_FIRMWARE_CAPABILITY; + +@@ -2418,7 +2639,8 @@ typedef struct _ATOM_FIRMWARE_INFO_V2_2 + USHORT usLcdMaxPixelClockPLL_Output; // In MHz unit + ULONG ulReserved4; //Was ulAsicMaximumVoltage + ULONG ulMinPixelClockPLL_Output; //In 10Khz unit +- ULONG ulReserved5; //Was usMinEngineClockPLL_Input and usMaxEngineClockPLL_Input ++ UCHAR ucRemoteDisplayConfig; ++ UCHAR ucReserved5[3]; //Was usMinEngineClockPLL_Input and usMaxEngineClockPLL_Input + ULONG ulReserved6; //Was usMinEngineClockPLL_Output and usMinMemoryClockPLL_Input + ULONG ulReserved7; //Was usMaxMemoryClockPLL_Input and usMinMemoryClockPLL_Output + USHORT usReserved11; //Was usMaxPixelClock; //In 10Khz unit, Max. Pclk used only for DAC +@@ -2438,6 +2660,11 @@ typedef struct _ATOM_FIRMWARE_INFO_V2_2 + + #define ATOM_FIRMWARE_INFO_LAST ATOM_FIRMWARE_INFO_V2_2 + ++ ++// definition of ucRemoteDisplayConfig ++#define REMOTE_DISPLAY_DISABLE 0x00 ++#define REMOTE_DISPLAY_ENABLE 0x01 ++ + /****************************************************************************/ + // Structures used in IntegratedSystemInfoTable + /****************************************************************************/ +@@ -2660,8 +2887,9 @@ usMinDownStreamHTLinkWidth: same as above. + #define INTEGRATED_SYSTEM_INFO__AMD_CPU__GREYHOUND 2 + #define INTEGRATED_SYSTEM_INFO__AMD_CPU__K8 3 + #define INTEGRATED_SYSTEM_INFO__AMD_CPU__PHARAOH 4 ++#define INTEGRATED_SYSTEM_INFO__AMD_CPU__OROCHI 5 + +-#define INTEGRATED_SYSTEM_INFO__AMD_CPU__MAX_CODE INTEGRATED_SYSTEM_INFO__AMD_CPU__PHARAOH // this deff reflects max defined CPU code ++#define INTEGRATED_SYSTEM_INFO__AMD_CPU__MAX_CODE INTEGRATED_SYSTEM_INFO__AMD_CPU__OROCHI // this deff reflects max defined CPU code + + #define SYSTEM_CONFIG_POWEREXPRESS_ENABLE 0x00000001 + #define SYSTEM_CONFIG_RUN_AT_OVERDRIVE_ENGINE 0x00000002 +@@ -2753,6 +2981,7 @@ typedef struct _ATOM_INTEGRATED_SYSTEM_INFO_V5 + #define ASIC_INT_DIG4_ENCODER_ID 0x0b + #define ASIC_INT_DIG5_ENCODER_ID 0x0c + #define ASIC_INT_DIG6_ENCODER_ID 0x0d ++#define ASIC_INT_DIG7_ENCODER_ID 0x0e + + //define Encoder attribute + #define ATOM_ANALOG_ENCODER 0 +@@ -3226,15 +3455,23 @@ typedef struct _ATOM_LCD_INFO_V13 + + UCHAR ucPowerSequenceDIGONtoDE_in4Ms; + UCHAR ucPowerSequenceDEtoVARY_BL_in4Ms; +- UCHAR ucPowerSequenceDEtoDIGON_in4Ms; + UCHAR ucPowerSequenceVARY_BLtoDE_in4Ms; ++ UCHAR ucPowerSequenceDEtoDIGON_in4Ms; + + UCHAR ucOffDelay_in4Ms; + UCHAR ucPowerSequenceVARY_BLtoBLON_in4Ms; + UCHAR ucPowerSequenceBLONtoVARY_BL_in4Ms; + UCHAR ucReserved1; + +- ULONG ulReserved[4]; ++ UCHAR ucDPCD_eDP_CONFIGURATION_CAP; // dpcd 0dh ++ UCHAR ucDPCD_MAX_LINK_RATE; // dpcd 01h ++ UCHAR ucDPCD_MAX_LANE_COUNT; // dpcd 02h ++ UCHAR ucDPCD_MAX_DOWNSPREAD; // dpcd 03h ++ ++ USHORT usMaxPclkFreqInSingleLink; // Max PixelClock frequency in single link mode. ++ UCHAR uceDPToLVDSRxId; ++ UCHAR ucLcdReservd; ++ ULONG ulReserved[2]; + }ATOM_LCD_INFO_V13; + + #define ATOM_LCD_INFO_LAST ATOM_LCD_INFO_V13 +@@ -3273,6 +3510,11 @@ typedef struct _ATOM_LCD_INFO_V13 + //Use this cap bit for a quick reference whether an embadded panel (LCD1 ) is LVDS or eDP. 
+ #define LCDPANEL_CAP_V13_eDP 0x4 // = LCDPANEL_CAP_eDP no change comparing to previous version + ++//uceDPToLVDSRxId ++#define eDP_TO_LVDS_RX_DISABLE 0x00 // no eDP->LVDS translator chip ++#define eDP_TO_LVDS_COMMON_ID 0x01 // common eDP->LVDS translator chip without AMD SW init ++#define eDP_TO_LVDS_RT_ID 0x02 // RT tanslator which require AMD SW init ++ + typedef struct _ATOM_PATCH_RECORD_MODE + { + UCHAR ucRecordType; +@@ -3317,6 +3559,7 @@ typedef struct _ATOM_PANEL_RESOLUTION_PATCH_RECORD + #define LCD_CAP_RECORD_TYPE 3 + #define LCD_FAKE_EDID_PATCH_RECORD_TYPE 4 + #define LCD_PANEL_RESOLUTION_RECORD_TYPE 5 ++#define LCD_EDID_OFFSET_PATCH_RECORD_TYPE 6 + #define ATOM_RECORD_END_TYPE 0xFF + + /****************************Spread Spectrum Info Table Definitions **********************/ +@@ -3528,6 +3771,7 @@ else //Non VGA case + + CAIL needs to claim an reserved area defined by FBAccessAreaOffset and usFBUsedbyDrvInKB in non VGA case.*/ + ++/***********************************************************************************/ + #define ATOM_MAX_FIRMWARE_VRAM_USAGE_INFO 1 + + typedef struct _ATOM_FIRMWARE_VRAM_RESERVE_INFO +@@ -3818,13 +4062,17 @@ typedef struct _EXT_DISPLAY_PATH + ATOM_DP_CONN_CHANNEL_MAPPING asDPMapping; + ATOM_DVI_CONN_CHANNEL_MAPPING asDVIMapping; + }; +- UCHAR ucReserved; +- USHORT usReserved[2]; ++ UCHAR ucChPNInvert; // bit vector for up to 8 lanes, =0: P and N is not invert, =1 P and N is inverted ++ USHORT usCaps; ++ USHORT usReserved; + }EXT_DISPLAY_PATH; + + #define NUMBER_OF_UCHAR_FOR_GUID 16 + #define MAX_NUMBER_OF_EXT_DISPLAY_PATH 7 + ++//usCaps ++#define EXT_DISPLAY_PATH_CAPS__HBR2_DISABLE 0x01 ++ + typedef struct _ATOM_EXTERNAL_DISPLAY_CONNECTION_INFO + { + ATOM_COMMON_TABLE_HEADER sHeader; +@@ -3832,7 +4080,9 @@ typedef struct _ATOM_EXTERNAL_DISPLAY_CONNECTION_INFO + EXT_DISPLAY_PATH sPath[MAX_NUMBER_OF_EXT_DISPLAY_PATH]; // total of fixed 7 entries. + UCHAR ucChecksum; // a simple Checksum of the sum of whole structure equal to 0x0. + UCHAR uc3DStereoPinId; // use for eDP panel +- UCHAR Reserved [6]; // for potential expansion ++ UCHAR ucRemoteDisplayConfig; ++ UCHAR uceDPToLVDSRxId; ++ UCHAR Reserved[4]; // for potential expansion + }ATOM_EXTERNAL_DISPLAY_CONNECTION_INFO; + + //Related definitions, all records are different but they have a commond header +@@ -3977,6 +4227,7 @@ typedef struct _ATOM_OBJECT_GPIO_CNTL_RECORD + #define GPIO_PIN_STATE_ACTIVE_HIGH 0x1 + + // Indexes to GPIO array in GLSync record ++// GLSync record is for Frame Lock/Gen Lock feature. 
+ #define ATOM_GPIO_INDEX_GLSYNC_REFCLK 0 + #define ATOM_GPIO_INDEX_GLSYNC_HSYNC 1 + #define ATOM_GPIO_INDEX_GLSYNC_VSYNC 2 +@@ -3984,7 +4235,9 @@ typedef struct _ATOM_OBJECT_GPIO_CNTL_RECORD + #define ATOM_GPIO_INDEX_GLSYNC_SWAP_GNT 4 + #define ATOM_GPIO_INDEX_GLSYNC_INTERRUPT 5 + #define ATOM_GPIO_INDEX_GLSYNC_V_RESET 6 +-#define ATOM_GPIO_INDEX_GLSYNC_MAX 7 ++#define ATOM_GPIO_INDEX_GLSYNC_SWAP_CNTL 7 ++#define ATOM_GPIO_INDEX_GLSYNC_SWAP_SEL 8 ++#define ATOM_GPIO_INDEX_GLSYNC_MAX 9 + + typedef struct _ATOM_ENCODER_DVO_CF_RECORD + { +@@ -3994,7 +4247,8 @@ typedef struct _ATOM_ENCODER_DVO_CF_RECORD + }ATOM_ENCODER_DVO_CF_RECORD; + + // Bit maps for ATOM_ENCODER_CAP_RECORD.ucEncoderCap +-#define ATOM_ENCODER_CAP_RECORD_HBR2 0x01 // DP1.2 HBR2 is supported by this path ++#define ATOM_ENCODER_CAP_RECORD_HBR2 0x01 // DP1.2 HBR2 is supported by HW encoder ++#define ATOM_ENCODER_CAP_RECORD_HBR2_EN 0x02 // DP1.2 HBR2 setting is qualified and HBR2 can be enabled + + typedef struct _ATOM_ENCODER_CAP_RECORD + { +@@ -4003,11 +4257,13 @@ typedef struct _ATOM_ENCODER_CAP_RECORD + USHORT usEncoderCap; + struct { + #if ATOM_BIG_ENDIAN +- USHORT usReserved:15; // Bit1-15 may be defined for other capability in future ++ USHORT usReserved:14; // Bit1-15 may be defined for other capability in future ++ USHORT usHBR2En:1; // Bit1 is for DP1.2 HBR2 enable + USHORT usHBR2Cap:1; // Bit0 is for DP1.2 HBR2 capability. + #else + USHORT usHBR2Cap:1; // Bit0 is for DP1.2 HBR2 capability. +- USHORT usReserved:15; // Bit1-15 may be defined for other capability in future ++ USHORT usHBR2En:1; // Bit1 is for DP1.2 HBR2 enable ++ USHORT usReserved:14; // Bit1-15 may be defined for other capability in future + #endif + }; + }; +@@ -4157,6 +4413,7 @@ typedef struct _ATOM_VOLTAGE_CONTROL + #define VOLTAGE_CONTROL_ID_VT1556M 0x07 + #define VOLTAGE_CONTROL_ID_CHL822x 0x08 + #define VOLTAGE_CONTROL_ID_VT1586M 0x09 ++#define VOLTAGE_CONTROL_ID_UP1637 0x0A + + typedef struct _ATOM_VOLTAGE_OBJECT + { +@@ -4193,6 +4450,69 @@ typedef struct _ATOM_LEAKID_VOLTAGE + USHORT usVoltage; + }ATOM_LEAKID_VOLTAGE; + ++typedef struct _ATOM_VOLTAGE_OBJECT_HEADER_V3{ ++ UCHAR ucVoltageType; //Indicate Voltage Source: VDDC, MVDDC, MVDDQ or MVDDCI ++ UCHAR ucVoltageMode; //Indicate voltage control mode: Init/Set/Leakage/Set phase ++ USHORT usSize; //Size of Object ++}ATOM_VOLTAGE_OBJECT_HEADER_V3; ++ ++typedef struct _VOLTAGE_LUT_ENTRY_V2 ++{ ++ ULONG ulVoltageId; // The Voltage ID which is used to program GPIO register ++ USHORT usVoltageValue; // The corresponding Voltage Value, in mV ++}VOLTAGE_LUT_ENTRY_V2; ++ ++typedef struct _LEAKAGE_VOLTAGE_LUT_ENTRY_V2 ++{ ++ USHORT usVoltageLevel; // The Voltage ID which is used to program GPIO register ++ USHORT usVoltageId; ++ USHORT usLeakageId; // The corresponding Voltage Value, in mV ++}LEAKAGE_VOLTAGE_LUT_ENTRY_V2; ++ ++typedef struct _ATOM_I2C_VOLTAGE_OBJECT_V3 ++{ ++ ATOM_VOLTAGE_OBJECT_HEADER_V3 sHeader; ++ UCHAR ucVoltageRegulatorId; //Indicate Voltage Regulator Id ++ UCHAR ucVoltageControlI2cLine; ++ UCHAR ucVoltageControlAddress; ++ UCHAR ucVoltageControlOffset; ++ ULONG ulReserved; ++ VOLTAGE_LUT_ENTRY asVolI2cLut[1]; // end with 0xff ++}ATOM_I2C_VOLTAGE_OBJECT_V3; ++ ++typedef struct _ATOM_GPIO_VOLTAGE_OBJECT_V3 ++{ ++ ATOM_VOLTAGE_OBJECT_HEADER_V3 sHeader; ++ UCHAR ucVoltageGpioCntlId; // default is 0 which indicate control through CG VID mode ++ UCHAR ucGpioEntryNum; // indiate the entry numbers of Votlage/Gpio value Look up table ++ UCHAR ucPhaseDelay; // phase delay in unit of micro 
second ++ UCHAR ucReserved; ++ ULONG ulGpioMaskVal; // GPIO Mask value ++ VOLTAGE_LUT_ENTRY_V2 asVolGpioLut[1]; ++}ATOM_GPIO_VOLTAGE_OBJECT_V3; ++ ++typedef struct _ATOM_LEAKAGE_VOLTAGE_OBJECT_V3 ++{ ++ ATOM_VOLTAGE_OBJECT_HEADER_V3 sHeader; ++ UCHAR ucLeakageCntlId; // default is 0 ++ UCHAR ucLeakageEntryNum; // indicate the entry number of LeakageId/Voltage Lut table ++ UCHAR ucReserved[2]; ++ ULONG ulMaxVoltageLevel; ++ LEAKAGE_VOLTAGE_LUT_ENTRY_V2 asLeakageIdLut[1]; ++}ATOM_LEAKAGE_VOLTAGE_OBJECT_V3; ++ ++typedef union _ATOM_VOLTAGE_OBJECT_V3{ ++ ATOM_GPIO_VOLTAGE_OBJECT_V3 asGpioVoltageObj; ++ ATOM_I2C_VOLTAGE_OBJECT_V3 asI2cVoltageObj; ++ ATOM_LEAKAGE_VOLTAGE_OBJECT_V3 asLeakageObj; ++}ATOM_VOLTAGE_OBJECT_V3; ++ ++typedef struct _ATOM_VOLTAGE_OBJECT_INFO_V3_1 ++{ ++ ATOM_COMMON_TABLE_HEADER sHeader; ++ ATOM_VOLTAGE_OBJECT_V3 asVoltageObj[3]; //Info for Voltage control ++}ATOM_VOLTAGE_OBJECT_INFO_V3_1; ++ + typedef struct _ATOM_ASIC_PROFILE_VOLTAGE + { + UCHAR ucProfileId; +@@ -4305,7 +4625,18 @@ typedef struct _ATOM_INTEGRATED_SYSTEM_INFO_V6 + USHORT usHDMISSpreadRateIn10Hz; + USHORT usDVISSPercentage; + USHORT usDVISSpreadRateIn10Hz; +- ULONG ulReserved3[21]; ++ ULONG SclkDpmBoostMargin; ++ ULONG SclkDpmThrottleMargin; ++ USHORT SclkDpmTdpLimitPG; ++ USHORT SclkDpmTdpLimitBoost; ++ ULONG ulBoostEngineCLock; ++ UCHAR ulBoostVid_2bit; ++ UCHAR EnableBoost; ++ USHORT GnbTdpLimit; ++ USHORT usMaxLVDSPclkFreqInSingleLink; ++ UCHAR ucLvdsMisc; ++ UCHAR ucLVDSReserved; ++ ULONG ulReserved3[15]; + ATOM_EXTERNAL_DISPLAY_CONNECTION_INFO sExtDispConnInfo; + }ATOM_INTEGRATED_SYSTEM_INFO_V6; + +@@ -4313,9 +4644,16 @@ typedef struct _ATOM_INTEGRATED_SYSTEM_INFO_V6 + #define INTEGRATED_SYSTEM_INFO_V6_GPUCAPINFO__TMDSHDMI_COHERENT_SINGLEPLL_MODE 0x01 + #define INTEGRATED_SYSTEM_INFO_V6_GPUCAPINFO__DISABLE_AUX_HW_MODE_DETECTION 0x08 + +-// ulOtherDisplayMisc +-#define INTEGRATED_SYSTEM_INFO__GET_EDID_CALLBACK_FUNC_SUPPORT 0x01 ++//ucLVDSMisc: ++#define SYS_INFO_LVDSMISC__888_FPDI_MODE 0x01 ++#define SYS_INFO_LVDSMISC__DL_CH_SWAP 0x02 ++#define SYS_INFO_LVDSMISC__888_BPC 0x04 ++#define SYS_INFO_LVDSMISC__OVERRIDE_EN 0x08 ++#define SYS_INFO_LVDSMISC__BLON_ACTIVE_LOW 0x10 + ++// not used any more ++#define SYS_INFO_LVDSMISC__VSYNC_ACTIVE_LOW 0x04 ++#define SYS_INFO_LVDSMISC__HSYNC_ACTIVE_LOW 0x08 + + /********************************************************************************************************************** + ATOM_INTEGRATED_SYSTEM_INFO_V6 Description +@@ -4384,7 +4722,208 @@ ucUMAChannelNumber: System memory channel numbers. + ulCSR_M3_ARB_CNTL_DEFAULT[10]: Arrays with values for CSR M3 arbiter for default + ulCSR_M3_ARB_CNTL_UVD[10]: Arrays with values for CSR M3 arbiter for UVD playback. + ulCSR_M3_ARB_CNTL_FS3D[10]: Arrays with values for CSR M3 arbiter for Full Screen 3D applications. +-sAvail_SCLK[5]: Arrays to provide available list of SLCK and corresponding voltage, order from low to high ++sAvail_SCLK[5]: Arrays to provide availabe list of SLCK and corresponding voltage, order from low to high ++ulGMCRestoreResetTime: GMC power restore and GMC reset time to calculate data reconnection latency. Unit in ns. ++ulMinimumNClk: Minimum NCLK speed among all NB-Pstates to calcualte data reconnection latency. Unit in 10kHz. ++ulIdleNClk: NCLK speed while memory runs in self-refresh state. Unit in 10kHz. ++ulDDR_DLL_PowerUpTime: DDR PHY DLL power up time. Unit in ns. ++ulDDR_PLL_PowerUpTime: DDR PHY PLL power up time. Unit in ns. 
++usPCIEClkSSPercentage: PCIE Clock Spred Spectrum Percentage in unit 0.01%; 100 mean 1%. ++usPCIEClkSSType: PCIE Clock Spred Spectrum Type. 0 for Down spread(default); 1 for Center spread. ++usLvdsSSPercentage: LVDS panel ( not include eDP ) Spread Spectrum Percentage in unit of 0.01%, =0, use VBIOS default setting. ++usLvdsSSpreadRateIn10Hz: LVDS panel ( not include eDP ) Spread Spectrum frequency in unit of 10Hz, =0, use VBIOS default setting. ++usHDMISSPercentage: HDMI Spread Spectrum Percentage in unit 0.01%; 100 mean 1%, =0, use VBIOS default setting. ++usHDMISSpreadRateIn10Hz: HDMI Spread Spectrum frequency in unit of 10Hz, =0, use VBIOS default setting. ++usDVISSPercentage: DVI Spread Spectrum Percentage in unit 0.01%; 100 mean 1%, =0, use VBIOS default setting. ++usDVISSpreadRateIn10Hz: DVI Spread Spectrum frequency in unit of 10Hz, =0, use VBIOS default setting. ++usMaxLVDSPclkFreqInSingleLink: Max pixel clock LVDS panel single link, if=0 means VBIOS use default threhold, right now it is 85Mhz ++ucLVDSMisc: [bit0] LVDS 888bit panel mode =0: LVDS 888 panel in LDI mode, =1: LVDS 888 panel in FPDI mode ++ [bit1] LVDS panel lower and upper link mapping =0: lower link and upper link not swap, =1: lower link and upper link are swapped ++ [bit2] LVDS 888bit per color mode =0: 666 bit per color =1:888 bit per color ++ [bit3] LVDS parameter override enable =0: ucLvdsMisc parameter are not used =1: ucLvdsMisc parameter should be used ++ [bit4] Polarity of signal sent to digital BLON output pin. =0: not inverted(active high) =1: inverted ( active low ) ++**********************************************************************************************************************/ ++ ++// this Table is used for Liano/Ontario APU ++typedef struct _ATOM_FUSION_SYSTEM_INFO_V1 ++{ ++ ATOM_INTEGRATED_SYSTEM_INFO_V6 sIntegratedSysInfo; ++ ULONG ulPowerplayTable[128]; ++}ATOM_FUSION_SYSTEM_INFO_V1; ++/********************************************************************************************************************** ++ ATOM_FUSION_SYSTEM_INFO_V1 Description ++sIntegratedSysInfo: refer to ATOM_INTEGRATED_SYSTEM_INFO_V6 definition. 
++ulPowerplayTable[128]: This 512 bytes memory is used to save ATOM_PPLIB_POWERPLAYTABLE3, starting form ulPowerplayTable[0] ++**********************************************************************************************************************/ ++ ++// this IntegrateSystemInfoTable is used for Trinity APU ++typedef struct _ATOM_INTEGRATED_SYSTEM_INFO_V1_7 ++{ ++ ATOM_COMMON_TABLE_HEADER sHeader; ++ ULONG ulBootUpEngineClock; ++ ULONG ulDentistVCOFreq; ++ ULONG ulBootUpUMAClock; ++ ATOM_CLK_VOLT_CAPABILITY sDISPCLK_Voltage[4]; ++ ULONG ulBootUpReqDisplayVector; ++ ULONG ulOtherDisplayMisc; ++ ULONG ulGPUCapInfo; ++ ULONG ulSB_MMIO_Base_Addr; ++ USHORT usRequestedPWMFreqInHz; ++ UCHAR ucHtcTmpLmt; ++ UCHAR ucHtcHystLmt; ++ ULONG ulMinEngineClock; ++ ULONG ulSystemConfig; ++ ULONG ulCPUCapInfo; ++ USHORT usNBP0Voltage; ++ USHORT usNBP1Voltage; ++ USHORT usBootUpNBVoltage; ++ USHORT usExtDispConnInfoOffset; ++ USHORT usPanelRefreshRateRange; ++ UCHAR ucMemoryType; ++ UCHAR ucUMAChannelNumber; ++ UCHAR strVBIOSMsg[40]; ++ ULONG ulReserved[20]; ++ ATOM_AVAILABLE_SCLK_LIST sAvail_SCLK[5]; ++ ULONG ulGMCRestoreResetTime; ++ ULONG ulMinimumNClk; ++ ULONG ulIdleNClk; ++ ULONG ulDDR_DLL_PowerUpTime; ++ ULONG ulDDR_PLL_PowerUpTime; ++ USHORT usPCIEClkSSPercentage; ++ USHORT usPCIEClkSSType; ++ USHORT usLvdsSSPercentage; ++ USHORT usLvdsSSpreadRateIn10Hz; ++ USHORT usHDMISSPercentage; ++ USHORT usHDMISSpreadRateIn10Hz; ++ USHORT usDVISSPercentage; ++ USHORT usDVISSpreadRateIn10Hz; ++ ULONG SclkDpmBoostMargin; ++ ULONG SclkDpmThrottleMargin; ++ USHORT SclkDpmTdpLimitPG; ++ USHORT SclkDpmTdpLimitBoost; ++ ULONG ulBoostEngineCLock; ++ UCHAR ulBoostVid_2bit; ++ UCHAR EnableBoost; ++ USHORT GnbTdpLimit; ++ USHORT usMaxLVDSPclkFreqInSingleLink; ++ UCHAR ucLvdsMisc; ++ UCHAR ucLVDSReserved; ++ UCHAR ucLVDSPwrOnSeqDIGONtoDE_in4Ms; ++ UCHAR ucLVDSPwrOnSeqDEtoVARY_BL_in4Ms; ++ UCHAR ucLVDSPwrOffSeqVARY_BLtoDE_in4Ms; ++ UCHAR ucLVDSPwrOffSeqDEtoDIGON_in4Ms; ++ UCHAR ucLVDSOffToOnDelay_in4Ms; ++ UCHAR ucLVDSPwrOnSeqVARY_BLtoBLON_in4Ms; ++ UCHAR ucLVDSPwrOffSeqBLONtoVARY_BL_in4Ms; ++ UCHAR ucLVDSReserved1; ++ ULONG ulLCDBitDepthControlVal; ++ ULONG ulNbpStateMemclkFreq[4]; ++ USHORT usNBP2Voltage; ++ USHORT usNBP3Voltage; ++ ULONG ulNbpStateNClkFreq[4]; ++ UCHAR ucNBDPMEnable; ++ UCHAR ucReserved[3]; ++ UCHAR ucDPMState0VclkFid; ++ UCHAR ucDPMState0DclkFid; ++ UCHAR ucDPMState1VclkFid; ++ UCHAR ucDPMState1DclkFid; ++ UCHAR ucDPMState2VclkFid; ++ UCHAR ucDPMState2DclkFid; ++ UCHAR ucDPMState3VclkFid; ++ UCHAR ucDPMState3DclkFid; ++ ATOM_EXTERNAL_DISPLAY_CONNECTION_INFO sExtDispConnInfo; ++}ATOM_INTEGRATED_SYSTEM_INFO_V1_7; ++ ++// ulOtherDisplayMisc ++#define INTEGRATED_SYSTEM_INFO__GET_EDID_CALLBACK_FUNC_SUPPORT 0x01 ++#define INTEGRATED_SYSTEM_INFO__GET_BOOTUP_DISPLAY_CALLBACK_FUNC_SUPPORT 0x02 ++#define INTEGRATED_SYSTEM_INFO__GET_EXPANSION_CALLBACK_FUNC_SUPPORT 0x04 ++#define INTEGRATED_SYSTEM_INFO__FAST_BOOT_SUPPORT 0x08 ++ ++// ulGPUCapInfo ++#define SYS_INFO_GPUCAPS__TMDSHDMI_COHERENT_SINGLEPLL_MODE 0x01 ++#define SYS_INFO_GPUCAPS__DP_SINGLEPLL_MODE 0x02 ++#define SYS_INFO_GPUCAPS__DISABLE_AUX_MODE_DETECT 0x08 ++ ++/********************************************************************************************************************** ++ ATOM_INTEGRATED_SYSTEM_INFO_V1_7 Description ++ulBootUpEngineClock: VBIOS bootup Engine clock frequency, in 10kHz unit. if it is equal 0, then VBIOS use pre-defined bootup engine clock ++ulDentistVCOFreq: Dentist VCO clock in 10kHz unit. 
++ulBootUpUMAClock: System memory boot up clock frequency in 10Khz unit. ++sDISPCLK_Voltage: Report Display clock voltage requirement. ++ ++ulBootUpReqDisplayVector: VBIOS boot up display IDs, following are supported devices in Trinity projects: ++ ATOM_DEVICE_CRT1_SUPPORT 0x0001 ++ ATOM_DEVICE_DFP1_SUPPORT 0x0008 ++ ATOM_DEVICE_DFP6_SUPPORT 0x0040 ++ ATOM_DEVICE_DFP2_SUPPORT 0x0080 ++ ATOM_DEVICE_DFP3_SUPPORT 0x0200 ++ ATOM_DEVICE_DFP4_SUPPORT 0x0400 ++ ATOM_DEVICE_DFP5_SUPPORT 0x0800 ++ ATOM_DEVICE_LCD1_SUPPORT 0x0002 ++ulOtherDisplayMisc: bit[0]=0: INT15 callback function Get LCD EDID ( ax=4e08, bl=1b ) is not supported by SBIOS. ++ =1: INT15 callback function Get LCD EDID ( ax=4e08, bl=1b ) is supported by SBIOS. ++ bit[1]=0: INT15 callback function Get boot display( ax=4e08, bl=01h) is not supported by SBIOS ++ =1: INT15 callback function Get boot display( ax=4e08, bl=01h) is supported by SBIOS ++ bit[2]=0: INT15 callback function Get panel Expansion ( ax=4e08, bl=02h) is not supported by SBIOS ++ =1: INT15 callback function Get panel Expansion ( ax=4e08, bl=02h) is supported by SBIOS ++ bit[3]=0: VBIOS fast boot is disable ++ =1: VBIOS fast boot is enable. ( VBIOS skip display device detection in every set mode if LCD panel is connect and LID is open) ++ulGPUCapInfo: bit[0]=0: TMDS/HDMI Coherent Mode use cascade PLL mode. ++ =1: TMDS/HDMI Coherent Mode use signel PLL mode. ++ bit[1]=0: DP mode use cascade PLL mode ( New for Trinity ) ++ =1: DP mode use single PLL mode ++ bit[3]=0: Enable AUX HW mode detection logic ++ =1: Disable AUX HW mode detection logic ++ ++ulSB_MMIO_Base_Addr: Physical Base address to SB MMIO space. Driver needs to initialize it for SMU usage. ++ ++usRequestedPWMFreqInHz: When it's set to 0x0 by SBIOS: the LCD BackLight is not controlled by GPU(SW). ++ Any attempt to change BL using VBIOS function or enable VariBri from PP table is not effective since ATOM_BIOS_INFO_BL_CONTROLLED_BY_GPU==0; ++ ++ When it's set to a non-zero frequency, the BackLight is controlled by GPU (SW) in one of two ways below: ++ 1. SW uses the GPU BL PWM output to control the BL, in chis case, this non-zero frequency determines what freq GPU should use; ++ VBIOS will set up proper PWM frequency and ATOM_BIOS_INFO_BL_CONTROLLED_BY_GPU==1,as the result, ++ Changing BL using VBIOS function is functional in both driver and non-driver present environment; ++ and enabling VariBri under the driver environment from PP table is optional. ++ ++ 2. SW uses other means to control BL (like DPCD),this non-zero frequency serves as a flag only indicating ++ that BL control from GPU is expected. ++ VBIOS will NOT set up PWM frequency but make ATOM_BIOS_INFO_BL_CONTROLLED_BY_GPU==1 ++ Changing BL using VBIOS function could be functional in both driver and non-driver present environment,but ++ it's per platform ++ and enabling VariBri under the driver environment from PP table is optional. ++ ++ucHtcTmpLmt: Refer to D18F3x64 bit[22:16], HtcTmpLmt. ++ Threshold on value to enter HTC_active state. ++ucHtcHystLmt: Refer to D18F3x64 bit[27:24], HtcHystLmt. ++ To calculate threshold off value to exit HTC_active state, which is Threshold on vlaue minus ucHtcHystLmt. ++ulMinEngineClock: Minimum SCLK allowed in 10kHz unit. This is calculated based on WRCK Fuse settings. ++ulSystemConfig: Bit[0]=0: PCIE Power Gating Disabled ++ =1: PCIE Power Gating Enabled ++ Bit[1]=0: DDR-DLL shut-down feature disabled. ++ 1: DDR-DLL shut-down feature enabled. ++ Bit[2]=0: DDR-PLL Power down feature disabled. 
++ 1: DDR-PLL Power down feature enabled. ++ulCPUCapInfo: TBD ++usNBP0Voltage: VID for voltage on NB P0 State ++usNBP1Voltage: VID for voltage on NB P1 State ++usNBP2Voltage: VID for voltage on NB P2 State ++usNBP3Voltage: VID for voltage on NB P3 State ++usBootUpNBVoltage: Voltage Index of GNB voltage configured by SBIOS, which is suffcient to support VBIOS DISPCLK requirement. ++usExtDispConnInfoOffset: Offset to sExtDispConnInfo inside the structure ++usPanelRefreshRateRange: Bit vector for LCD supported refresh rate range. If DRR is requestd by the platform, at least two bits need to be set ++ to indicate a range. ++ SUPPORTED_LCD_REFRESHRATE_30Hz 0x0004 ++ SUPPORTED_LCD_REFRESHRATE_40Hz 0x0008 ++ SUPPORTED_LCD_REFRESHRATE_50Hz 0x0010 ++ SUPPORTED_LCD_REFRESHRATE_60Hz 0x0020 ++ucMemoryType: [3:0]=1:DDR1;=2:DDR2;=3:DDR3.[7:4] is reserved. ++ucUMAChannelNumber: System memory channel numbers. ++ulCSR_M3_ARB_CNTL_DEFAULT[10]: Arrays with values for CSR M3 arbiter for default ++ulCSR_M3_ARB_CNTL_UVD[10]: Arrays with values for CSR M3 arbiter for UVD playback. ++ulCSR_M3_ARB_CNTL_FS3D[10]: Arrays with values for CSR M3 arbiter for Full Screen 3D applications. ++sAvail_SCLK[5]: Arrays to provide availabe list of SLCK and corresponding voltage, order from low to high + ulGMCRestoreResetTime: GMC power restore and GMC reset time to calculate data reconnection latency. Unit in ns. + ulMinimumNClk: Minimum NCLK speed among all NB-Pstates to calcualte data reconnection latency. Unit in 10kHz. + ulIdleNClk: NCLK speed while memory runs in self-refresh state. Unit in 10kHz. +@@ -4398,6 +4937,41 @@ usHDMISSPercentage: HDMI Spread Spectrum Percentage in unit 0.01%; + usHDMISSpreadRateIn10Hz: HDMI Spread Spectrum frequency in unit of 10Hz, =0, use VBIOS default setting. + usDVISSPercentage: DVI Spread Spectrum Percentage in unit 0.01%; 100 mean 1%, =0, use VBIOS default setting. + usDVISSpreadRateIn10Hz: DVI Spread Spectrum frequency in unit of 10Hz, =0, use VBIOS default setting. ++usMaxLVDSPclkFreqInSingleLink: Max pixel clock LVDS panel single link, if=0 means VBIOS use default threhold, right now it is 85Mhz ++ucLVDSMisc: [bit0] LVDS 888bit panel mode =0: LVDS 888 panel in LDI mode, =1: LVDS 888 panel in FPDI mode ++ [bit1] LVDS panel lower and upper link mapping =0: lower link and upper link not swap, =1: lower link and upper link are swapped ++ [bit2] LVDS 888bit per color mode =0: 666 bit per color =1:888 bit per color ++ [bit3] LVDS parameter override enable =0: ucLvdsMisc parameter are not used =1: ucLvdsMisc parameter should be used ++ [bit4] Polarity of signal sent to digital BLON output pin. =0: not inverted(active high) =1: inverted ( active low ) ++ucLVDSPwrOnSeqDIGONtoDE_in4Ms: LVDS power up sequence time in unit of 4ms, time delay from DIGON signal active to data enable signal active( DE ). ++ =0 mean use VBIOS default which is 8 ( 32ms ). The LVDS power up sequence is as following: DIGON->DE->VARY_BL->BLON. ++ This parameter is used by VBIOS only. VBIOS will patch LVDS_InfoTable. ++ucLVDSPwrOnDEtoVARY_BL_in4Ms: LVDS power up sequence time in unit of 4ms., time delay from DE( data enable ) active to Vary Brightness enable signal active( VARY_BL ). ++ =0 mean use VBIOS default which is 90 ( 360ms ). The LVDS power up sequence is as following: DIGON->DE->VARY_BL->BLON. ++ This parameter is used by VBIOS only. VBIOS will patch LVDS_InfoTable. 
++ ++ucLVDSPwrOffVARY_BLtoDE_in4Ms: LVDS power down sequence time in unit of 4ms, time delay from data enable ( DE ) signal off to LCDVCC (DIGON) off. ++ =0 mean use VBIOS default delay which is 8 ( 32ms ). The LVDS power down sequence is as following: BLON->VARY_BL->DE->DIGON ++ This parameter is used by VBIOS only. VBIOS will patch LVDS_InfoTable. ++ ++ucLVDSPwrOffDEtoDIGON_in4Ms: LVDS power down sequence time in unit of 4ms, time delay from vary brightness enable signal( VARY_BL) off to data enable ( DE ) signal off. ++ =0 mean use VBIOS default which is 90 ( 360ms ). The LVDS power down sequence is as following: BLON->VARY_BL->DE->DIGON ++ This parameter is used by VBIOS only. VBIOS will patch LVDS_InfoTable. ++ ++ucLVDSOffToOnDelay_in4Ms: LVDS power down sequence time in unit of 4ms. Time delay from DIGON signal off to DIGON signal active. ++ =0 means to use VBIOS default delay which is 125 ( 500ms ). ++ This parameter is used by VBIOS only. VBIOS will patch LVDS_InfoTable. ++ ++ucLVDSPwrOnVARY_BLtoBLON_in4Ms: LVDS power up sequence time in unit of 4ms. Time delay from VARY_BL signal on to DLON signal active. ++ =0 means to use VBIOS default delay which is 0 ( 0ms ). ++ This parameter is used by VBIOS only. VBIOS will patch LVDS_InfoTable. ++ ++ucLVDSPwrOffBLONtoVARY_BL_in4Ms: LVDS power down sequence time in unit of 4ms. Time delay from BLON signal off to VARY_BL signal off. ++ =0 means to use VBIOS default delay which is 0 ( 0ms ). ++ This parameter is used by VBIOS only. VBIOS will patch LVDS_InfoTable. ++ ++ulNbpStateMemclkFreq[4]: system memory clock frequncey in unit of 10Khz in different NB pstate. ++ + **********************************************************************************************************************/ + + /**************************************************************************/ +@@ -4459,6 +5033,7 @@ typedef struct _ATOM_ASIC_SS_ASSIGNMENT + #define ASIC_INTERNAL_SS_ON_DP 7 + #define ASIC_INTERNAL_SS_ON_DCPLL 8 + #define ASIC_EXTERNAL_SS_ON_DP_CLOCK 9 ++#define ASIC_INTERNAL_VCE_SS 10 + + typedef struct _ATOM_ASIC_SS_ASSIGNMENT_V2 + { +@@ -4520,7 +5095,7 @@ typedef struct _ATOM_ASIC_INTERNAL_SS_INFO_V3 + #define ATOM_DOS_MODE_INFO_DEF 7 + #define ATOM_I2C_CHANNEL_STATUS_DEF 8 + #define ATOM_I2C_CHANNEL_STATUS1_DEF 9 +- ++#define ATOM_INTERNAL_TIMER_DEF 10 + + // BIOS_0_SCRATCH Definition + #define ATOM_S0_CRT1_MONO 0x00000001L +@@ -4648,6 +5223,7 @@ typedef struct _ATOM_ASIC_INTERNAL_SS_INFO_V3 + #define ATOM_S2_DEVICE_DPMS_MASKw1 0x3FF + #define ATOM_S2_FORCEDLOWPWRMODE_STATE_MASKb3 0x0C + #define ATOM_S2_FORCEDLOWPWRMODE_STATE_CHANGEb3 0x10 ++#define ATOM_S2_TMDS_COHERENT_MODEb3 0x10 // used by VBIOS code only, use coherent mode for TMDS/HDMI mode + #define ATOM_S2_VRI_BRIGHT_ENABLEb3 0x20 + #define ATOM_S2_ROTATION_STATE_MASKb3 0xC0 + +@@ -5038,6 +5614,23 @@ typedef struct _ENABLE_GRAPH_SURFACE_PARAMETERS_V1_3 + USHORT usDeviceId; // Active Device Id for this surface. If no device, set to 0. 
+ }ENABLE_GRAPH_SURFACE_PARAMETERS_V1_3; + ++typedef struct _ENABLE_GRAPH_SURFACE_PARAMETERS_V1_4 ++{ ++ USHORT usHight; // Image Hight ++ USHORT usWidth; // Image Width ++ USHORT usGraphPitch; ++ UCHAR ucColorDepth; ++ UCHAR ucPixelFormat; ++ UCHAR ucSurface; // Surface 1 or 2 ++ UCHAR ucEnable; // ATOM_ENABLE or ATOM_DISABLE ++ UCHAR ucModeType; ++ UCHAR ucReserved; ++}ENABLE_GRAPH_SURFACE_PARAMETERS_V1_4; ++ ++// ucEnable ++#define ATOM_GRAPH_CONTROL_SET_PITCH 0x0f ++#define ATOM_GRAPH_CONTROL_SET_DISP_START 0x10 ++ + typedef struct _ENABLE_GRAPH_SURFACE_PS_ALLOCATION + { + ENABLE_GRAPH_SURFACE_PARAMETERS sSetSurface; +@@ -5057,6 +5650,58 @@ typedef struct _GET_DISPLAY_SURFACE_SIZE_PARAMETERS + USHORT usY_Size; + }GET_DISPLAY_SURFACE_SIZE_PARAMETERS; + ++typedef struct _GET_DISPLAY_SURFACE_SIZE_PARAMETERS_V2 ++{ ++ union{ ++ USHORT usX_Size; //When use as input parameter, usX_Size indicates which CRTC ++ USHORT usSurface; ++ }; ++ USHORT usY_Size; ++ USHORT usDispXStart; ++ USHORT usDispYStart; ++}GET_DISPLAY_SURFACE_SIZE_PARAMETERS_V2; ++ ++ ++typedef struct _PALETTE_DATA_CONTROL_PARAMETERS_V3 ++{ ++ UCHAR ucLutId; ++ UCHAR ucAction; ++ USHORT usLutStartIndex; ++ USHORT usLutLength; ++ USHORT usLutOffsetInVram; ++}PALETTE_DATA_CONTROL_PARAMETERS_V3; ++ ++// ucAction: ++#define PALETTE_DATA_AUTO_FILL 1 ++#define PALETTE_DATA_READ 2 ++#define PALETTE_DATA_WRITE 3 ++ ++ ++typedef struct _INTERRUPT_SERVICE_PARAMETERS_V2 ++{ ++ UCHAR ucInterruptId; ++ UCHAR ucServiceId; ++ UCHAR ucStatus; ++ UCHAR ucReserved; ++}INTERRUPT_SERVICE_PARAMETER_V2; ++ ++// ucInterruptId ++#define HDP1_INTERRUPT_ID 1 ++#define HDP2_INTERRUPT_ID 2 ++#define HDP3_INTERRUPT_ID 3 ++#define HDP4_INTERRUPT_ID 4 ++#define HDP5_INTERRUPT_ID 5 ++#define HDP6_INTERRUPT_ID 6 ++#define SW_INTERRUPT_ID 11 ++ ++// ucAction ++#define INTERRUPT_SERVICE_GEN_SW_INT 1 ++#define INTERRUPT_SERVICE_GET_STATUS 2 ++ ++ // ucStatus ++#define INTERRUPT_STATUS__INT_TRIGGER 1 ++#define INTERRUPT_STATUS__HPD_HIGH 2 ++ + typedef struct _INDIRECT_IO_ACCESS + { + ATOM_COMMON_TABLE_HEADER sHeader; +@@ -5189,7 +5834,7 @@ typedef struct _ATOM_INIT_REG_BLOCK{ + + #define END_OF_REG_INDEX_BLOCK 0x0ffff + #define END_OF_REG_DATA_BLOCK 0x00000000 +-#define ATOM_INIT_REG_MASK_FLAG 0x80 ++#define ATOM_INIT_REG_MASK_FLAG 0x80 //Not used in BIOS + #define CLOCK_RANGE_HIGHEST 0x00ffffff + + #define VALUE_DWORD SIZEOF ULONG +@@ -5229,6 +5874,7 @@ typedef struct _ATOM_MC_INIT_PARAM_TABLE + #define _128Mx8 0x51 + #define _128Mx16 0x52 + #define _256Mx8 0x61 ++#define _256Mx16 0x62 + + #define SAMSUNG 0x1 + #define INFINEON 0x2 +@@ -5585,7 +6231,7 @@ typedef struct _ATOM_VRAM_MODULE_V7 + ULONG ulChannelMapCfg; // mmMC_SHARED_CHREMAP + USHORT usModuleSize; // Size of ATOM_VRAM_MODULE_V7 + USHORT usPrivateReserved; // MC_ARB_RAMCFG (includes NOOFBANK,NOOFRANKS,NOOFROWS,NOOFCOLS) +- USHORT usReserved; ++ USHORT usEnableChannels; // bit vector which indicate which channels are enabled + UCHAR ucExtMemoryID; // Current memory module ID + UCHAR ucMemoryType; // MEM_TYPE_DDR2/DDR3/GDDR3/GDDR5 + UCHAR ucChannelNum; // Number of mem. channels supported in this module +@@ -5597,7 +6243,8 @@ typedef struct _ATOM_VRAM_MODULE_V7 + UCHAR ucNPL_RT; // Round trip delay (MC_SEQ_CAS_TIMING [28:24]:TCL=CL+NPL_RT-2). Always 2. 
+ UCHAR ucPreamble; // [7:4] Write Preamble, [3:0] Read Preamble + UCHAR ucMemorySize; // Total memory size in unit of 16MB for CONFIG_MEMSIZE - bit[23:0] zeros +- UCHAR ucReserved[3]; ++ USHORT usSEQSettingOffset; ++ UCHAR ucReserved; + // Memory Module specific values + USHORT usEMRS2Value; // EMRS2/MR2 Value. + USHORT usEMRS3Value; // EMRS3/MR3 Value. +@@ -5633,10 +6280,10 @@ typedef struct _ATOM_VRAM_INFO_V3 + typedef struct _ATOM_VRAM_INFO_V4 + { + ATOM_COMMON_TABLE_HEADER sHeader; +- USHORT usMemAdjustTblOffset; // offset of ATOM_INIT_REG_BLOCK structure for memory vendor specific MC adjust setting +- USHORT usMemClkPatchTblOffset; // offset of ATOM_INIT_REG_BLOCK structure for memory clock specific MC setting +- USHORT usRerseved; +- UCHAR ucMemDQ7_0ByteRemap; // DQ line byte remap, =0: Memory Data line BYTE0, =1: BYTE1, =2: BYTE2, =3: BYTE3 ++ USHORT usMemAdjustTblOffset; // offset of ATOM_INIT_REG_BLOCK structure for memory vendor specific MC adjust setting ++ USHORT usMemClkPatchTblOffset; // offset of ATOM_INIT_REG_BLOCK structure for memory clock specific MC setting ++ USHORT usRerseved; ++ UCHAR ucMemDQ7_0ByteRemap; // DQ line byte remap, =0: Memory Data line BYTE0, =1: BYTE1, =2: BYTE2, =3: BYTE3 + ULONG ulMemDQ7_0BitRemap; // each DQ line ( 7~0) use 3bits, like: DQ0=Bit[2:0], DQ1:[5:3], ... DQ7:[23:21] + UCHAR ucReservde[4]; + UCHAR ucNumOfVRAMModule; +@@ -5648,9 +6295,10 @@ typedef struct _ATOM_VRAM_INFO_V4 + typedef struct _ATOM_VRAM_INFO_HEADER_V2_1 + { + ATOM_COMMON_TABLE_HEADER sHeader; +- USHORT usMemAdjustTblOffset; // offset of ATOM_INIT_REG_BLOCK structure for memory vendor specific MC adjust setting +- USHORT usMemClkPatchTblOffset; // offset of ATOM_INIT_REG_BLOCK structure for memory clock specific MC setting +- USHORT usReserved[4]; ++ USHORT usMemAdjustTblOffset; // offset of ATOM_INIT_REG_BLOCK structure for memory vendor specific MC adjust setting ++ USHORT usMemClkPatchTblOffset; // offset of ATOM_INIT_REG_BLOCK structure for memory clock specific MC setting ++ USHORT usPerBytePresetOffset; // offset of ATOM_INIT_REG_BLOCK structure for Per Byte Offset Preset Settings ++ USHORT usReserved[3]; + UCHAR ucNumOfVRAMModule; // indicate number of VRAM module + UCHAR ucMemoryClkPatchTblVer; // version of memory AC timing register list + UCHAR ucVramModuleVer; // indicate ATOM_VRAM_MODUE version +@@ -5935,6 +6583,52 @@ typedef struct _ATOM_DISP_OUT_INFO_V2 + ASIC_ENCODER_INFO asEncoderInfo[1]; + }ATOM_DISP_OUT_INFO_V2; + ++ ++typedef struct _ATOM_DISP_CLOCK_ID { ++ UCHAR ucPpllId; ++ UCHAR ucPpllAttribute; ++}ATOM_DISP_CLOCK_ID; ++ ++// ucPpllAttribute ++#define CLOCK_SOURCE_SHAREABLE 0x01 ++#define CLOCK_SOURCE_DP_MODE 0x02 ++#define CLOCK_SOURCE_NONE_DP_MODE 0x04 ++ ++//DispOutInfoTable ++typedef struct _ASIC_TRANSMITTER_INFO_V2 ++{ ++ USHORT usTransmitterObjId; ++ USHORT usDispClkIdOffset; // point to clock source id list supported by Encoder Object ++ UCHAR ucTransmitterCmdTblId; ++ UCHAR ucConfig; ++ UCHAR ucEncoderID; // available 1st encoder ( default ) ++ UCHAR ucOptionEncoderID; // available 2nd encoder ( optional ) ++ UCHAR uc2ndEncoderID; ++ UCHAR ucReserved; ++}ASIC_TRANSMITTER_INFO_V2; ++ ++typedef struct _ATOM_DISP_OUT_INFO_V3 ++{ ++ ATOM_COMMON_TABLE_HEADER sHeader; ++ USHORT ptrTransmitterInfo; ++ USHORT ptrEncoderInfo; ++ USHORT ptrMainCallParserFar; // direct address of main parser call in VBIOS binary. 
++ USHORT usReserved; ++ UCHAR ucDCERevision; ++ UCHAR ucMaxDispEngineNum; ++ UCHAR ucMaxActiveDispEngineNum; ++ UCHAR ucMaxPPLLNum; ++ UCHAR ucCoreRefClkSource; // value of CORE_REF_CLK_SOURCE ++ UCHAR ucReserved[3]; ++ ASIC_TRANSMITTER_INFO_V2 asTransmitterInfo[1]; // for alligment only ++}ATOM_DISP_OUT_INFO_V3; ++ ++typedef enum CORE_REF_CLK_SOURCE{ ++ CLOCK_SRC_XTALIN=0, ++ CLOCK_SRC_XO_IN=1, ++ CLOCK_SRC_XO_IN2=2, ++}CORE_REF_CLK_SOURCE; ++ + // DispDevicePriorityInfo + typedef struct _ATOM_DISPLAY_DEVICE_PRIORITY_INFO + { +@@ -6070,6 +6764,39 @@ typedef struct _PROCESS_I2C_CHANNEL_TRANSACTION_PARAMETERS + #define HW_I2C_READ 0 + #define I2C_2BYTE_ADDR 0x02 + ++/****************************************************************************/ ++// Structures used by HW_Misc_OperationTable ++/****************************************************************************/ ++typedef struct _ATOM_HW_MISC_OPERATION_INPUT_PARAMETER_V1_1 ++{ ++ UCHAR ucCmd; // Input: To tell which action to take ++ UCHAR ucReserved[3]; ++ ULONG ulReserved; ++}ATOM_HW_MISC_OPERATION_INPUT_PARAMETER_V1_1; ++ ++typedef struct _ATOM_HW_MISC_OPERATION_OUTPUT_PARAMETER_V1_1 ++{ ++ UCHAR ucReturnCode; // Output: Return value base on action was taken ++ UCHAR ucReserved[3]; ++ ULONG ulReserved; ++}ATOM_HW_MISC_OPERATION_OUTPUT_PARAMETER_V1_1; ++ ++// Actions code ++#define ATOM_GET_SDI_SUPPORT 0xF0 ++ ++// Return code ++#define ATOM_UNKNOWN_CMD 0 ++#define ATOM_FEATURE_NOT_SUPPORTED 1 ++#define ATOM_FEATURE_SUPPORTED 2 ++ ++typedef struct _ATOM_HW_MISC_OPERATION_PS_ALLOCATION ++{ ++ ATOM_HW_MISC_OPERATION_INPUT_PARAMETER_V1_1 sInput_Output; ++ PROCESS_I2C_CHANNEL_TRANSACTION_PARAMETERS sReserved; ++}ATOM_HW_MISC_OPERATION_PS_ALLOCATION; ++ ++/****************************************************************************/ ++ + typedef struct _SET_HWBLOCK_INSTANCE_PARAMETER_V2 + { + UCHAR ucHWBlkInst; // HW block instance, 0, 1, 2, ... 
+@@ -6090,6 +6817,52 @@ typedef struct _SET_HWBLOCK_INSTANCE_PARAMETER_V2 + #define SELECT_CRTC_PIXEL_RATE 7 + #define SELECT_VGA_BLK 8 + ++// DIGTransmitterInfoTable structure used to program UNIPHY settings ++typedef struct _DIG_TRANSMITTER_INFO_HEADER_V3_1{ ++ ATOM_COMMON_TABLE_HEADER sHeader; ++ USHORT usDPVsPreEmphSettingOffset; // offset of PHY_ANALOG_SETTING_INFO * with DP Voltage Swing and Pre-Emphasis for each Link clock ++ USHORT usPhyAnalogRegListOffset; // offset of CLOCK_CONDITION_REGESTER_INFO* with None-DP mode Analog Setting's register Info ++ USHORT usPhyAnalogSettingOffset; // offset of CLOCK_CONDITION_SETTING_ENTRY* with None-DP mode Analog Setting for each link clock range ++ USHORT usPhyPllRegListOffset; // offset of CLOCK_CONDITION_REGESTER_INFO* with Phy Pll register Info ++ USHORT usPhyPllSettingOffset; // offset of CLOCK_CONDITION_SETTING_ENTRY* with Phy Pll Settings ++}DIG_TRANSMITTER_INFO_HEADER_V3_1; ++ ++typedef struct _CLOCK_CONDITION_REGESTER_INFO{ ++ USHORT usRegisterIndex; ++ UCHAR ucStartBit; ++ UCHAR ucEndBit; ++}CLOCK_CONDITION_REGESTER_INFO; ++ ++typedef struct _CLOCK_CONDITION_SETTING_ENTRY{ ++ USHORT usMaxClockFreq; ++ UCHAR ucEncodeMode; ++ UCHAR ucPhySel; ++ ULONG ulAnalogSetting[1]; ++}CLOCK_CONDITION_SETTING_ENTRY; ++ ++typedef struct _CLOCK_CONDITION_SETTING_INFO{ ++ USHORT usEntrySize; ++ CLOCK_CONDITION_SETTING_ENTRY asClkCondSettingEntry[1]; ++}CLOCK_CONDITION_SETTING_INFO; ++ ++typedef struct _PHY_CONDITION_REG_VAL{ ++ ULONG ulCondition; ++ ULONG ulRegVal; ++}PHY_CONDITION_REG_VAL; ++ ++typedef struct _PHY_CONDITION_REG_INFO{ ++ USHORT usRegIndex; ++ USHORT usSize; ++ PHY_CONDITION_REG_VAL asRegVal[1]; ++}PHY_CONDITION_REG_INFO; ++ ++typedef struct _PHY_ANALOG_SETTING_INFO{ ++ UCHAR ucEncodeMode; ++ UCHAR ucPhySel; ++ USHORT usSize; ++ PHY_CONDITION_REG_INFO asAnalogSetting[1]; ++}PHY_ANALOG_SETTING_INFO; ++ + /****************************************************************************/ + //Portion VI: Definitinos for vbios MC scratch registers that driver used + /****************************************************************************/ +@@ -7020,4 +7793,68 @@ typedef struct _ATOM_PPLIB_Clock_Voltage_Limit_Table + + #pragma pack() // BIOS data must use byte aligment + ++// ++// AMD ACPI Table ++// ++#pragma pack(1) ++ ++typedef struct { ++ ULONG Signature; ++ ULONG TableLength; //Length ++ UCHAR Revision; ++ UCHAR Checksum; ++ UCHAR OemId[6]; ++ UCHAR OemTableId[8]; //UINT64 OemTableId; ++ ULONG OemRevision; ++ ULONG CreatorId; ++ ULONG CreatorRevision; ++} AMD_ACPI_DESCRIPTION_HEADER; ++/* ++//EFI_ACPI_DESCRIPTION_HEADER from AcpiCommon.h ++typedef struct { ++ UINT32 Signature; //0x0 ++ UINT32 Length; //0x4 ++ UINT8 Revision; //0x8 ++ UINT8 Checksum; //0x9 ++ UINT8 OemId[6]; //0xA ++ UINT64 OemTableId; //0x10 ++ UINT32 OemRevision; //0x18 ++ UINT32 CreatorId; //0x1C ++ UINT32 CreatorRevision; //0x20 ++}EFI_ACPI_DESCRIPTION_HEADER; ++*/ ++typedef struct { ++ AMD_ACPI_DESCRIPTION_HEADER SHeader; ++ UCHAR TableUUID[16]; //0x24 ++ ULONG VBIOSImageOffset; //0x34. Offset to the first GOP_VBIOS_CONTENT block from the beginning of the stucture. ++ ULONG Lib1ImageOffset; //0x38. Offset to the first GOP_LIB1_CONTENT block from the beginning of the stucture. 
++ ULONG Reserved[4]; //0x3C ++}UEFI_ACPI_VFCT; ++ ++typedef struct { ++ ULONG PCIBus; //0x4C ++ ULONG PCIDevice; //0x50 ++ ULONG PCIFunction; //0x54 ++ USHORT VendorID; //0x58 ++ USHORT DeviceID; //0x5A ++ USHORT SSVID; //0x5C ++ USHORT SSID; //0x5E ++ ULONG Revision; //0x60 ++ ULONG ImageLength; //0x64 ++}VFCT_IMAGE_HEADER; ++ ++ ++typedef struct { ++ VFCT_IMAGE_HEADER VbiosHeader; ++ UCHAR VbiosContent[1]; ++}GOP_VBIOS_CONTENT; ++ ++typedef struct { ++ VFCT_IMAGE_HEADER Lib1Header; ++ UCHAR Lib1Content[1]; ++}GOP_LIB1_CONTENT; ++ ++#pragma pack() ++ ++ + #endif /* _ATOMBIOS_H */ +diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h +index 8227e76..28e69e9 100644 +--- a/drivers/gpu/drm/radeon/radeon.h ++++ b/drivers/gpu/drm/radeon/radeon.h +@@ -123,21 +123,6 @@ struct radeon_device; + /* + * BIOS. + */ +-#define ATRM_BIOS_PAGE 4096 +- +-#if defined(CONFIG_VGA_SWITCHEROO) +-bool radeon_atrm_supported(struct pci_dev *pdev); +-int radeon_atrm_get_bios_chunk(uint8_t *bios, int offset, int len); +-#else +-static inline bool radeon_atrm_supported(struct pci_dev *pdev) +-{ +- return false; +-} +- +-static inline int radeon_atrm_get_bios_chunk(uint8_t *bios, int offset, int len){ +- return -EINVAL; +-} +-#endif + bool radeon_get_bios(struct radeon_device *rdev); + + +diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c +index 9d2c369..38585c5 100644 +--- a/drivers/gpu/drm/radeon/radeon_atombios.c ++++ b/drivers/gpu/drm/radeon/radeon_atombios.c +@@ -446,7 +446,7 @@ static bool radeon_atom_apply_quirks(struct drm_device *dev, + } + + /* Fujitsu D3003-S2 board lists DVI-I as DVI-D and VGA */ +- if ((dev->pdev->device == 0x9802) && ++ if (((dev->pdev->device == 0x9802) || (dev->pdev->device == 0x9806)) && + (dev->pdev->subsystem_vendor == 0x1734) && + (dev->pdev->subsystem_device == 0x11bd)) { + if (*connector_type == DRM_MODE_CONNECTOR_VGA) { +diff --git a/drivers/gpu/drm/radeon/radeon_atpx_handler.c b/drivers/gpu/drm/radeon/radeon_atpx_handler.c +index 9d95792..2a2cf0b 100644 +--- a/drivers/gpu/drm/radeon/radeon_atpx_handler.c ++++ b/drivers/gpu/drm/radeon/radeon_atpx_handler.c +@@ -30,56 +30,8 @@ static struct radeon_atpx_priv { + /* handle for device - and atpx */ + acpi_handle dhandle; + acpi_handle atpx_handle; +- acpi_handle atrm_handle; + } radeon_atpx_priv; + +-/* retrieve the ROM in 4k blocks */ +-static int radeon_atrm_call(acpi_handle atrm_handle, uint8_t *bios, +- int offset, int len) +-{ +- acpi_status status; +- union acpi_object atrm_arg_elements[2], *obj; +- struct acpi_object_list atrm_arg; +- struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL}; +- +- atrm_arg.count = 2; +- atrm_arg.pointer = &atrm_arg_elements[0]; +- +- atrm_arg_elements[0].type = ACPI_TYPE_INTEGER; +- atrm_arg_elements[0].integer.value = offset; +- +- atrm_arg_elements[1].type = ACPI_TYPE_INTEGER; +- atrm_arg_elements[1].integer.value = len; +- +- status = acpi_evaluate_object(atrm_handle, NULL, &atrm_arg, &buffer); +- if (ACPI_FAILURE(status)) { +- printk("failed to evaluate ATRM got %s\n", acpi_format_exception(status)); +- return -ENODEV; +- } +- +- obj = (union acpi_object *)buffer.pointer; +- memcpy(bios+offset, obj->buffer.pointer, len); +- kfree(buffer.pointer); +- return len; +-} +- +-bool radeon_atrm_supported(struct pci_dev *pdev) +-{ +- /* get the discrete ROM only via ATRM */ +- if (!radeon_atpx_priv.atpx_detected) +- return false; +- +- if (radeon_atpx_priv.dhandle == DEVICE_ACPI_HANDLE(&pdev->dev)) +- return false; +- return 
true; +-} +- +- +-int radeon_atrm_get_bios_chunk(uint8_t *bios, int offset, int len) +-{ +- return radeon_atrm_call(radeon_atpx_priv.atrm_handle, bios, offset, len); +-} +- + static int radeon_atpx_get_version(acpi_handle handle) + { + acpi_status status; +@@ -197,7 +149,7 @@ static int radeon_atpx_power_state(enum vga_switcheroo_client_id id, + + static bool radeon_atpx_pci_probe_handle(struct pci_dev *pdev) + { +- acpi_handle dhandle, atpx_handle, atrm_handle; ++ acpi_handle dhandle, atpx_handle; + acpi_status status; + + dhandle = DEVICE_ACPI_HANDLE(&pdev->dev); +@@ -208,13 +160,8 @@ static bool radeon_atpx_pci_probe_handle(struct pci_dev *pdev) + if (ACPI_FAILURE(status)) + return false; + +- status = acpi_get_handle(dhandle, "ATRM", &atrm_handle); +- if (ACPI_FAILURE(status)) +- return false; +- + radeon_atpx_priv.dhandle = dhandle; + radeon_atpx_priv.atpx_handle = atpx_handle; +- radeon_atpx_priv.atrm_handle = atrm_handle; + return true; + } + +diff --git a/drivers/gpu/drm/radeon/radeon_bios.c b/drivers/gpu/drm/radeon/radeon_bios.c +index 229a20f..d306cc8 100644 +--- a/drivers/gpu/drm/radeon/radeon_bios.c ++++ b/drivers/gpu/drm/radeon/radeon_bios.c +@@ -32,6 +32,7 @@ + + #include + #include ++#include + /* + * BIOS. + */ +@@ -98,16 +99,81 @@ static bool radeon_read_bios(struct radeon_device *rdev) + return true; + } + ++#ifdef CONFIG_ACPI + /* ATRM is used to get the BIOS on the discrete cards in + * dual-gpu systems. + */ ++/* retrieve the ROM in 4k blocks */ ++#define ATRM_BIOS_PAGE 4096 ++/** ++ * radeon_atrm_call - fetch a chunk of the vbios ++ * ++ * @atrm_handle: acpi ATRM handle ++ * @bios: vbios image pointer ++ * @offset: offset of vbios image data to fetch ++ * @len: length of vbios image data to fetch ++ * ++ * Executes ATRM to fetch a chunk of the discrete ++ * vbios image on PX systems (all asics). ++ * Returns the length of the buffer fetched. 
++ */ ++static int radeon_atrm_call(acpi_handle atrm_handle, uint8_t *bios, ++ int offset, int len) ++{ ++ acpi_status status; ++ union acpi_object atrm_arg_elements[2], *obj; ++ struct acpi_object_list atrm_arg; ++ struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL}; ++ ++ atrm_arg.count = 2; ++ atrm_arg.pointer = &atrm_arg_elements[0]; ++ ++ atrm_arg_elements[0].type = ACPI_TYPE_INTEGER; ++ atrm_arg_elements[0].integer.value = offset; ++ ++ atrm_arg_elements[1].type = ACPI_TYPE_INTEGER; ++ atrm_arg_elements[1].integer.value = len; ++ ++ status = acpi_evaluate_object(atrm_handle, NULL, &atrm_arg, &buffer); ++ if (ACPI_FAILURE(status)) { ++ printk("failed to evaluate ATRM got %s\n", acpi_format_exception(status)); ++ return -ENODEV; ++ } ++ ++ obj = (union acpi_object *)buffer.pointer; ++ memcpy(bios+offset, obj->buffer.pointer, obj->buffer.length); ++ len = obj->buffer.length; ++ kfree(buffer.pointer); ++ return len; ++} ++ + static bool radeon_atrm_get_bios(struct radeon_device *rdev) + { + int ret; + int size = 256 * 1024; + int i; ++ struct pci_dev *pdev = NULL; ++ acpi_handle dhandle, atrm_handle; ++ acpi_status status; ++ bool found = false; ++ ++ /* ATRM is for the discrete card only */ ++ if (rdev->flags & RADEON_IS_IGP) ++ return false; ++ ++ while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pdev)) != NULL) { ++ dhandle = DEVICE_ACPI_HANDLE(&pdev->dev); ++ if (!dhandle) ++ continue; ++ ++ status = acpi_get_handle(dhandle, "ATRM", &atrm_handle); ++ if (!ACPI_FAILURE(status)) { ++ found = true; ++ break; ++ } ++ } + +- if (!radeon_atrm_supported(rdev->pdev)) ++ if (!found) + return false; + + rdev->bios = kmalloc(size, GFP_KERNEL); +@@ -117,10 +183,11 @@ static bool radeon_atrm_get_bios(struct radeon_device *rdev) + } + + for (i = 0; i < size / ATRM_BIOS_PAGE; i++) { +- ret = radeon_atrm_get_bios_chunk(rdev->bios, +- (i * ATRM_BIOS_PAGE), +- ATRM_BIOS_PAGE); +- if (ret <= 0) ++ ret = radeon_atrm_call(atrm_handle, ++ rdev->bios, ++ (i * ATRM_BIOS_PAGE), ++ ATRM_BIOS_PAGE); ++ if (ret < ATRM_BIOS_PAGE) + break; + } + +@@ -130,6 +197,12 @@ static bool radeon_atrm_get_bios(struct radeon_device *rdev) + } + return true; + } ++#else ++static inline bool radeon_atrm_get_bios(struct radeon_device *rdev) ++{ ++ return false; ++} ++#endif + + static bool ni_read_disabled_bios(struct radeon_device *rdev) + { +@@ -476,6 +549,61 @@ static bool radeon_read_disabled_bios(struct radeon_device *rdev) + return legacy_read_disabled_bios(rdev); + } + ++#ifdef CONFIG_ACPI ++static bool radeon_acpi_vfct_bios(struct radeon_device *rdev) ++{ ++ bool ret = false; ++ struct acpi_table_header *hdr; ++ acpi_size tbl_size; ++ UEFI_ACPI_VFCT *vfct; ++ GOP_VBIOS_CONTENT *vbios; ++ VFCT_IMAGE_HEADER *vhdr; ++ ++ if (!ACPI_SUCCESS(acpi_get_table_with_size("VFCT", 1, &hdr, &tbl_size))) ++ return false; ++ if (tbl_size < sizeof(UEFI_ACPI_VFCT)) { ++ DRM_ERROR("ACPI VFCT table present but broken (too short #1)\n"); ++ goto out_unmap; ++ } ++ ++ vfct = (UEFI_ACPI_VFCT *)hdr; ++ if (vfct->VBIOSImageOffset + sizeof(VFCT_IMAGE_HEADER) > tbl_size) { ++ DRM_ERROR("ACPI VFCT table present but broken (too short #2)\n"); ++ goto out_unmap; ++ } ++ ++ vbios = (GOP_VBIOS_CONTENT *)((char *)hdr + vfct->VBIOSImageOffset); ++ vhdr = &vbios->VbiosHeader; ++ DRM_INFO("ACPI VFCT contains a BIOS for %02x:%02x.%d %04x:%04x, size %d\n", ++ vhdr->PCIBus, vhdr->PCIDevice, vhdr->PCIFunction, ++ vhdr->VendorID, vhdr->DeviceID, vhdr->ImageLength); ++ ++ if (vhdr->PCIBus != rdev->pdev->bus->number || ++ vhdr->PCIDevice != 
PCI_SLOT(rdev->pdev->devfn) || ++ vhdr->PCIFunction != PCI_FUNC(rdev->pdev->devfn) || ++ vhdr->VendorID != rdev->pdev->vendor || ++ vhdr->DeviceID != rdev->pdev->device) { ++ DRM_INFO("ACPI VFCT table is not for this card\n"); ++ goto out_unmap; ++ }; ++ ++ if (vfct->VBIOSImageOffset + sizeof(VFCT_IMAGE_HEADER) + vhdr->ImageLength > tbl_size) { ++ DRM_ERROR("ACPI VFCT image truncated\n"); ++ goto out_unmap; ++ } ++ ++ rdev->bios = kmemdup(&vbios->VbiosContent, vhdr->ImageLength, GFP_KERNEL); ++ ret = !!rdev->bios; ++ ++out_unmap: ++ return ret; ++} ++#else ++static inline bool radeon_acpi_vfct_bios(struct radeon_device *rdev) ++{ ++ return false; ++} ++#endif + + bool radeon_get_bios(struct radeon_device *rdev) + { +@@ -484,6 +612,8 @@ bool radeon_get_bios(struct radeon_device *rdev) + + r = radeon_atrm_get_bios(rdev); + if (r == false) ++ r = radeon_acpi_vfct_bios(rdev); ++ if (r == false) + r = igp_read_bios_from_vram(rdev); + if (r == false) + r = radeon_read_bios(rdev); +diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c +index 39497c7..f3ae607 100644 +--- a/drivers/gpu/drm/radeon/radeon_object.c ++++ b/drivers/gpu/drm/radeon/radeon_object.c +@@ -117,6 +117,7 @@ int radeon_bo_create(struct radeon_device *rdev, + return -ENOMEM; + } + ++retry: + bo = kzalloc(sizeof(struct radeon_bo), GFP_KERNEL); + if (bo == NULL) + return -ENOMEM; +@@ -129,8 +130,6 @@ int radeon_bo_create(struct radeon_device *rdev, + bo->gem_base.driver_private = NULL; + bo->surface_reg = -1; + INIT_LIST_HEAD(&bo->list); +- +-retry: + radeon_ttm_placement_from_domain(bo, domain); + /* Kernel allocation are uninterruptible */ + mutex_lock(&rdev->vram_mutex); +diff --git a/drivers/hid/hid-chicony.c b/drivers/hid/hid-chicony.c +index b99af34..a2abb8e 100644 +--- a/drivers/hid/hid-chicony.c ++++ b/drivers/hid/hid-chicony.c +@@ -60,6 +60,7 @@ static int ch_input_mapping(struct hid_device *hdev, struct hid_input *hi, + static const struct hid_device_id ch_devices[] = { + { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_TACTICAL_PAD) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_WIRELESS2) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_AK1D) }, + { } + }; + MODULE_DEVICE_TABLE(hid, ch_devices); +diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c +index 95430a0..5cc029f 100644 +--- a/drivers/hid/hid-core.c ++++ b/drivers/hid/hid-core.c +@@ -1398,12 +1398,14 @@ static const struct hid_device_id hid_have_special_driver[] = { + { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_TACTICAL_PAD) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_WIRELESS) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_WIRELESS2) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_AK1D) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CHUNGHWAT, USB_DEVICE_ID_CHUNGHWAT_MULTITOUCH) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CREATIVELABS, USB_DEVICE_ID_PRODIKEYS_PCMIDI) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CVTOUCH, USB_DEVICE_ID_CVTOUCH_SCREEN) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_BARCODE_1) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_BARCODE_2) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_BARCODE_3) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_BARCODE_4) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_MOUSE) }, + { HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_TRUETOUCH) }, + 
{ HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, 0x0006) }, +diff --git a/drivers/hid/hid-cypress.c b/drivers/hid/hid-cypress.c +index 2f0be4c..9e43aac 100644 +--- a/drivers/hid/hid-cypress.c ++++ b/drivers/hid/hid-cypress.c +@@ -129,6 +129,8 @@ static const struct hid_device_id cp_devices[] = { + .driver_data = CP_RDESC_SWAPPED_MIN_MAX }, + { HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_BARCODE_3), + .driver_data = CP_RDESC_SWAPPED_MIN_MAX }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_BARCODE_4), ++ .driver_data = CP_RDESC_SWAPPED_MIN_MAX }, + { HID_USB_DEVICE(USB_VENDOR_ID_CYPRESS, USB_DEVICE_ID_CYPRESS_MOUSE), + .driver_data = CP_2WHEEL_MOUSE_HACK }, + { } +diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h +index 7db934d..e4317a2 100644 +--- a/drivers/hid/hid-ids.h ++++ b/drivers/hid/hid-ids.h +@@ -196,6 +196,7 @@ + #define USB_DEVICE_ID_CHICONY_MULTI_TOUCH 0xb19d + #define USB_DEVICE_ID_CHICONY_WIRELESS 0x0618 + #define USB_DEVICE_ID_CHICONY_WIRELESS2 0x1123 ++#define USB_DEVICE_ID_CHICONY_AK1D 0x1125 + + #define USB_VENDOR_ID_CHUNGHWAT 0x2247 + #define USB_DEVICE_ID_CHUNGHWAT_MULTITOUCH 0x0001 +@@ -225,6 +226,7 @@ + #define USB_DEVICE_ID_CYPRESS_BARCODE_1 0xde61 + #define USB_DEVICE_ID_CYPRESS_BARCODE_2 0xde64 + #define USB_DEVICE_ID_CYPRESS_BARCODE_3 0xbca1 ++#define USB_DEVICE_ID_CYPRESS_BARCODE_4 0xed81 + #define USB_DEVICE_ID_CYPRESS_TRUETOUCH 0xc001 + + #define USB_VENDOR_ID_DEALEXTREAME 0x10c5 +diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c +index 0bfa545..c76b051 100644 +--- a/drivers/infiniband/ulp/srp/ib_srp.c ++++ b/drivers/infiniband/ulp/srp/ib_srp.c +@@ -568,24 +568,62 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, + scmnd->sc_data_direction); + } + +-static void srp_remove_req(struct srp_target_port *target, +- struct srp_request *req, s32 req_lim_delta) ++/** ++ * srp_claim_req - Take ownership of the scmnd associated with a request. ++ * @target: SRP target port. ++ * @req: SRP request. ++ * @scmnd: If NULL, take ownership of @req->scmnd. If not NULL, only take ++ * ownership of @req->scmnd if it equals @scmnd. ++ * ++ * Return value: ++ * Either NULL or a pointer to the SCSI command the caller became owner of. ++ */ ++static struct scsi_cmnd *srp_claim_req(struct srp_target_port *target, ++ struct srp_request *req, ++ struct scsi_cmnd *scmnd) + { + unsigned long flags; + +- srp_unmap_data(req->scmnd, target, req); ++ spin_lock_irqsave(&target->lock, flags); ++ if (!scmnd) { ++ scmnd = req->scmnd; ++ req->scmnd = NULL; ++ } else if (req->scmnd == scmnd) { ++ req->scmnd = NULL; ++ } else { ++ scmnd = NULL; ++ } ++ spin_unlock_irqrestore(&target->lock, flags); ++ ++ return scmnd; ++} ++ ++/** ++ * srp_free_req() - Unmap data and add request to the free request list. 
++ */ ++static void srp_free_req(struct srp_target_port *target, ++ struct srp_request *req, struct scsi_cmnd *scmnd, ++ s32 req_lim_delta) ++{ ++ unsigned long flags; ++ ++ srp_unmap_data(scmnd, target, req); ++ + spin_lock_irqsave(&target->lock, flags); + target->req_lim += req_lim_delta; +- req->scmnd = NULL; + list_add_tail(&req->list, &target->free_reqs); + spin_unlock_irqrestore(&target->lock, flags); + } + + static void srp_reset_req(struct srp_target_port *target, struct srp_request *req) + { +- req->scmnd->result = DID_RESET << 16; +- req->scmnd->scsi_done(req->scmnd); +- srp_remove_req(target, req, 0); ++ struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL); ++ ++ if (scmnd) { ++ scmnd->result = DID_RESET << 16; ++ scmnd->scsi_done(scmnd); ++ srp_free_req(target, req, scmnd, 0); ++ } + } + + static int srp_reconnect_target(struct srp_target_port *target) +@@ -1055,11 +1093,18 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) + complete(&target->tsk_mgmt_done); + } else { + req = &target->req_ring[rsp->tag]; +- scmnd = req->scmnd; +- if (!scmnd) ++ scmnd = srp_claim_req(target, req, NULL); ++ if (!scmnd) { + shost_printk(KERN_ERR, target->scsi_host, + "Null scmnd for RSP w/tag %016llx\n", + (unsigned long long) rsp->tag); ++ ++ spin_lock_irqsave(&target->lock, flags); ++ target->req_lim += be32_to_cpu(rsp->req_lim_delta); ++ spin_unlock_irqrestore(&target->lock, flags); ++ ++ return; ++ } + scmnd->result = rsp->status; + + if (rsp->flags & SRP_RSP_FLAG_SNSVALID) { +@@ -1074,7 +1119,9 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) + else if (rsp->flags & (SRP_RSP_FLAG_DIOVER | SRP_RSP_FLAG_DIUNDER)) + scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt)); + +- srp_remove_req(target, req, be32_to_cpu(rsp->req_lim_delta)); ++ srp_free_req(target, req, scmnd, ++ be32_to_cpu(rsp->req_lim_delta)); ++ + scmnd->host_scribble = NULL; + scmnd->scsi_done(scmnd); + } +@@ -1613,25 +1660,17 @@ static int srp_abort(struct scsi_cmnd *scmnd) + { + struct srp_target_port *target = host_to_target(scmnd->device->host); + struct srp_request *req = (struct srp_request *) scmnd->host_scribble; +- int ret = SUCCESS; + + shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); + +- if (!req || target->qp_in_error) +- return FAILED; +- if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, +- SRP_TSK_ABORT_TASK)) ++ if (!req || target->qp_in_error || !srp_claim_req(target, req, scmnd)) + return FAILED; ++ srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, ++ SRP_TSK_ABORT_TASK); ++ srp_free_req(target, req, scmnd, 0); ++ scmnd->result = DID_ABORT << 16; + +- if (req->scmnd) { +- if (!target->tsk_mgmt_status) { +- srp_remove_req(target, req, 0); +- scmnd->result = DID_ABORT << 16; +- } else +- ret = FAILED; +- } +- +- return ret; ++ return SUCCESS; + } + + static int srp_reset_device(struct scsi_cmnd *scmnd) +diff --git a/drivers/md/md.c b/drivers/md/md.c +index d8646d7..2887f22 100644 +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -1144,8 +1144,11 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor + ret = 0; + } + rdev->sectors = rdev->sb_start; +- /* Limit to 4TB as metadata cannot record more than that */ +- if (rdev->sectors >= (2ULL << 32)) ++ /* Limit to 4TB as metadata cannot record more than that. 
++ * (not needed for Linear and RAID0 as metadata doesn't ++ * record this size) ++ */ ++ if (rdev->sectors >= (2ULL << 32) && sb->level >= 1) + rdev->sectors = (2ULL << 32) - 2; + + if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) +@@ -1427,7 +1430,7 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) + /* Limit to 4TB as metadata cannot record more than that. + * 4TB == 2^32 KB, or 2*2^32 sectors. + */ +- if (num_sectors >= (2ULL << 32)) ++ if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) + num_sectors = (2ULL << 32) - 2; + md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, + rdev->sb_page); +diff --git a/drivers/media/dvb/siano/smsusb.c b/drivers/media/dvb/siano/smsusb.c +index fb68805..027550d 100644 +--- a/drivers/media/dvb/siano/smsusb.c ++++ b/drivers/media/dvb/siano/smsusb.c +@@ -481,7 +481,7 @@ static int smsusb_resume(struct usb_interface *intf) + return 0; + } + +-static const struct usb_device_id smsusb_id_table[] __devinitconst = { ++static const struct usb_device_id smsusb_id_table[] = { + { USB_DEVICE(0x187f, 0x0010), + .driver_info = SMS1XXX_BOARD_SIANO_STELLAR }, + { USB_DEVICE(0x187f, 0x0100), +diff --git a/drivers/media/video/gspca/spca506.c b/drivers/media/video/gspca/spca506.c +index 89fec4c..731cd16 100644 +--- a/drivers/media/video/gspca/spca506.c ++++ b/drivers/media/video/gspca/spca506.c +@@ -685,7 +685,7 @@ static const struct sd_desc sd_desc = { + }; + + /* -- module initialisation -- */ +-static const struct usb_device_id device_table[] __devinitconst = { ++static const struct usb_device_id device_table[] = { + {USB_DEVICE(0x06e1, 0xa190)}, + /*fixme: may be IntelPCCameraPro BRIDGE_SPCA505 + {USB_DEVICE(0x0733, 0x0430)}, */ +diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c +index 17bbacb..cc2ae7e 100644 +--- a/drivers/misc/sgi-xp/xpc_uv.c ++++ b/drivers/misc/sgi-xp/xpc_uv.c +@@ -18,6 +18,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -59,6 +61,8 @@ static struct xpc_heartbeat_uv *xpc_heartbeat_uv; + XPC_NOTIFY_MSG_SIZE_UV) + #define XPC_NOTIFY_IRQ_NAME "xpc_notify" + ++static int xpc_mq_node = -1; ++ + static struct xpc_gru_mq_uv *xpc_activate_mq_uv; + static struct xpc_gru_mq_uv *xpc_notify_mq_uv; + +@@ -109,11 +113,8 @@ xpc_get_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq, int cpu, char *irq_name) + #if defined CONFIG_X86_64 + mq->irq = uv_setup_irq(irq_name, cpu, mq->mmr_blade, mq->mmr_offset, + UV_AFFINITY_CPU); +- if (mq->irq < 0) { +- dev_err(xpc_part, "uv_setup_irq() returned error=%d\n", +- -mq->irq); ++ if (mq->irq < 0) + return mq->irq; +- } + + mq->mmr_value = uv_read_global_mmr64(mmr_pnode, mq->mmr_offset); + +@@ -238,8 +239,9 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name, + mq->mmr_blade = uv_cpu_to_blade_id(cpu); + + nid = cpu_to_node(cpu); +- page = alloc_pages_exact_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, +- pg_order); ++ page = alloc_pages_exact_node(nid, ++ GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, ++ pg_order); + if (page == NULL) { + dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d " + "bytes of memory on nid=%d for GRU mq\n", mq_size, nid); +@@ -1731,9 +1733,50 @@ static struct xpc_arch_operations xpc_arch_ops_uv = { + .notify_senders_of_disconnect = xpc_notify_senders_of_disconnect_uv, + }; + ++static int ++xpc_init_mq_node(int nid) ++{ ++ int cpu; ++ ++ get_online_cpus(); ++ ++ for_each_cpu(cpu, cpumask_of_node(nid)) { ++ xpc_activate_mq_uv = ++ 
xpc_create_gru_mq_uv(XPC_ACTIVATE_MQ_SIZE_UV, nid, ++ XPC_ACTIVATE_IRQ_NAME, ++ xpc_handle_activate_IRQ_uv); ++ if (!IS_ERR(xpc_activate_mq_uv)) ++ break; ++ } ++ if (IS_ERR(xpc_activate_mq_uv)) { ++ put_online_cpus(); ++ return PTR_ERR(xpc_activate_mq_uv); ++ } ++ ++ for_each_cpu(cpu, cpumask_of_node(nid)) { ++ xpc_notify_mq_uv = ++ xpc_create_gru_mq_uv(XPC_NOTIFY_MQ_SIZE_UV, nid, ++ XPC_NOTIFY_IRQ_NAME, ++ xpc_handle_notify_IRQ_uv); ++ if (!IS_ERR(xpc_notify_mq_uv)) ++ break; ++ } ++ if (IS_ERR(xpc_notify_mq_uv)) { ++ xpc_destroy_gru_mq_uv(xpc_activate_mq_uv); ++ put_online_cpus(); ++ return PTR_ERR(xpc_notify_mq_uv); ++ } ++ ++ put_online_cpus(); ++ return 0; ++} ++ + int + xpc_init_uv(void) + { ++ int nid; ++ int ret = 0; ++ + xpc_arch_ops = xpc_arch_ops_uv; + + if (sizeof(struct xpc_notify_mq_msghdr_uv) > XPC_MSG_HDR_MAX_SIZE) { +@@ -1742,21 +1785,21 @@ xpc_init_uv(void) + return -E2BIG; + } + +- xpc_activate_mq_uv = xpc_create_gru_mq_uv(XPC_ACTIVATE_MQ_SIZE_UV, 0, +- XPC_ACTIVATE_IRQ_NAME, +- xpc_handle_activate_IRQ_uv); +- if (IS_ERR(xpc_activate_mq_uv)) +- return PTR_ERR(xpc_activate_mq_uv); ++ if (xpc_mq_node < 0) ++ for_each_online_node(nid) { ++ ret = xpc_init_mq_node(nid); + +- xpc_notify_mq_uv = xpc_create_gru_mq_uv(XPC_NOTIFY_MQ_SIZE_UV, 0, +- XPC_NOTIFY_IRQ_NAME, +- xpc_handle_notify_IRQ_uv); +- if (IS_ERR(xpc_notify_mq_uv)) { +- xpc_destroy_gru_mq_uv(xpc_activate_mq_uv); +- return PTR_ERR(xpc_notify_mq_uv); +- } ++ if (!ret) ++ break; ++ } ++ else ++ ret = xpc_init_mq_node(xpc_mq_node); + +- return 0; ++ if (ret < 0) ++ dev_err(xpc_part, "xpc_init_mq_node() returned error=%d\n", ++ -ret); ++ ++ return ret; + } + + void +@@ -1765,3 +1808,6 @@ xpc_exit_uv(void) + xpc_destroy_gru_mq_uv(xpc_notify_mq_uv); + xpc_destroy_gru_mq_uv(xpc_activate_mq_uv); + } ++ ++module_param(xpc_mq_node, int, 0); ++MODULE_PARM_DESC(xpc_mq_node, "Node number on which to allocate message queues."); +diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c +index e888202..01b104e 100644 +--- a/drivers/net/netconsole.c ++++ b/drivers/net/netconsole.c +@@ -652,7 +652,6 @@ static int netconsole_netdev_event(struct notifier_block *this, + flags); + dev_put(nt->np.dev); + nt->np.dev = NULL; +- netconsole_target_put(nt); + } + nt->enabled = 0; + stopped = true; +diff --git a/drivers/net/wireless/ath/ath9k/recv.c b/drivers/net/wireless/ath/ath9k/recv.c +index e6d791c..b4cbc82 100644 +--- a/drivers/net/wireless/ath/ath9k/recv.c ++++ b/drivers/net/wireless/ath/ath9k/recv.c +@@ -1782,7 +1782,6 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, bool hp) + struct ieee80211_hw *hw = sc->hw; + struct ieee80211_hdr *hdr; + int retval; +- bool decrypt_error = false; + struct ath_rx_status rs; + enum ath9k_rx_qtype qtype; + bool edma = !!(ah->caps.hw_caps & ATH9K_HW_CAP_EDMA); +@@ -1804,6 +1803,7 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, bool hp) + tsf_lower = tsf & 0xffffffff; + + do { ++ bool decrypt_error = false; + /* If handling rx interrupt and flush is in progress => exit */ + if ((sc->sc_flags & SC_OP_RXFLUSH) && (flush == 0)) + break; +diff --git a/drivers/net/wireless/p54/p54usb.c b/drivers/net/wireless/p54/p54usb.c +index 9b60968..8a009bc 100644 +--- a/drivers/net/wireless/p54/p54usb.c ++++ b/drivers/net/wireless/p54/p54usb.c +@@ -42,7 +42,7 @@ MODULE_FIRMWARE("isl3887usb"); + * whenever you add a new device. 
+ */ + +-static struct usb_device_id p54u_table[] __devinitdata = { ++static struct usb_device_id p54u_table[] = { + /* Version 1 devices (pci chip + net2280) */ + {USB_DEVICE(0x0411, 0x0050)}, /* Buffalo WLI2-USB2-G54 */ + {USB_DEVICE(0x045e, 0x00c2)}, /* Microsoft MN-710 */ +diff --git a/drivers/net/wireless/rtl818x/rtl8187/dev.c b/drivers/net/wireless/rtl818x/rtl8187/dev.c +index 4a78f9e..4e98c39 100644 +--- a/drivers/net/wireless/rtl818x/rtl8187/dev.c ++++ b/drivers/net/wireless/rtl818x/rtl8187/dev.c +@@ -44,7 +44,7 @@ MODULE_AUTHOR("Larry Finger "); + MODULE_DESCRIPTION("RTL8187/RTL8187B USB wireless driver"); + MODULE_LICENSE("GPL"); + +-static struct usb_device_id rtl8187_table[] __devinitdata = { ++static struct usb_device_id rtl8187_table[] = { + /* Asus */ + {USB_DEVICE(0x0b05, 0x171d), .driver_info = DEVICE_RTL8187}, + /* Belkin */ +diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c +index d024f83..68af94c 100644 +--- a/drivers/pci/pci-driver.c ++++ b/drivers/pci/pci-driver.c +@@ -952,6 +952,13 @@ static int pci_pm_poweroff_noirq(struct device *dev) + if (!pci_dev->state_saved && !pci_is_bridge(pci_dev)) + pci_prepare_to_sleep(pci_dev); + ++ /* ++ * The reason for doing this here is the same as for the analogous code ++ * in pci_pm_suspend_noirq(). ++ */ ++ if (pci_dev->class == PCI_CLASS_SERIAL_USB_EHCI) ++ pci_write_config_word(pci_dev, PCI_COMMAND, 0); ++ + return 0; + } + +diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c +index b0859d4..ec5b17f 100644 +--- a/drivers/platform/x86/asus-nb-wmi.c ++++ b/drivers/platform/x86/asus-nb-wmi.c +@@ -86,6 +86,10 @@ static const struct key_entry asus_nb_wmi_keymap[] = { + { KE_KEY, 0x8A, { KEY_PROG1 } }, + { KE_KEY, 0x95, { KEY_MEDIA } }, + { KE_KEY, 0x99, { KEY_PHONE } }, ++ { KE_KEY, 0xA0, { KEY_SWITCHVIDEOMODE } }, /* SDSP HDMI only */ ++ { KE_KEY, 0xA1, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + HDMI */ ++ { KE_KEY, 0xA2, { KEY_SWITCHVIDEOMODE } }, /* SDSP CRT + HDMI */ ++ { KE_KEY, 0xA3, { KEY_SWITCHVIDEOMODE } }, /* SDSP TV + HDMI */ + { KE_KEY, 0xb5, { KEY_CALC } }, + { KE_KEY, 0xc4, { KEY_KBDILLUMUP } }, + { KE_KEY, 0xc5, { KEY_KBDILLUMDOWN } }, +diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c +index 30d2072..33471e1 100644 +--- a/drivers/rapidio/devices/tsi721.c ++++ b/drivers/rapidio/devices/tsi721.c +@@ -439,6 +439,9 @@ static void tsi721_db_dpc(struct work_struct *work) + " info %4.4x\n", DBELL_SID(idb.bytes), + DBELL_TID(idb.bytes), DBELL_INF(idb.bytes)); + } ++ ++ wr_ptr = ioread32(priv->regs + ++ TSI721_IDQ_WP(IDB_QUEUE)) % IDB_QSIZE; + } + + iowrite32(rd_ptr & (IDB_QSIZE - 1), +@@ -449,6 +452,10 @@ static void tsi721_db_dpc(struct work_struct *work) + regval |= TSI721_SR_CHINT_IDBQRCV; + iowrite32(regval, + priv->regs + TSI721_SR_CHINTE(IDB_QUEUE)); ++ ++ wr_ptr = ioread32(priv->regs + TSI721_IDQ_WP(IDB_QUEUE)) % IDB_QSIZE; ++ if (wr_ptr != rd_ptr) ++ schedule_work(&priv->idb_work); + } + + /** +@@ -2155,7 +2162,7 @@ static int __devinit tsi721_probe(struct pci_dev *pdev, + const struct pci_device_id *id) + { + struct tsi721_device *priv; +- int i, cap; ++ int cap; + int err; + u32 regval; + +@@ -2175,12 +2182,15 @@ static int __devinit tsi721_probe(struct pci_dev *pdev, + priv->pdev = pdev; + + #ifdef DEBUG ++ { ++ int i; + for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + dev_dbg(&pdev->dev, "res[%d] @ 0x%llx (0x%lx, 0x%lx)\n", + i, (unsigned long long)pci_resource_start(pdev, i), + (unsigned long)pci_resource_len(pdev, i), + 
pci_resource_flags(pdev, i)); + } ++ } + #endif + /* + * Verify BAR configuration +diff --git a/drivers/rtc/rtc-rs5c348.c b/drivers/rtc/rtc-rs5c348.c +index 971bc8e..11bcb20 100644 +--- a/drivers/rtc/rtc-rs5c348.c ++++ b/drivers/rtc/rtc-rs5c348.c +@@ -122,9 +122,12 @@ rs5c348_rtc_read_time(struct device *dev, struct rtc_time *tm) + tm->tm_min = bcd2bin(rxbuf[RS5C348_REG_MINS] & RS5C348_MINS_MASK); + tm->tm_hour = bcd2bin(rxbuf[RS5C348_REG_HOURS] & RS5C348_HOURS_MASK); + if (!pdata->rtc_24h) { +- tm->tm_hour %= 12; +- if (rxbuf[RS5C348_REG_HOURS] & RS5C348_BIT_PM) ++ if (rxbuf[RS5C348_REG_HOURS] & RS5C348_BIT_PM) { ++ tm->tm_hour -= 20; ++ tm->tm_hour %= 12; + tm->tm_hour += 12; ++ } else ++ tm->tm_hour %= 12; + } + tm->tm_wday = bcd2bin(rxbuf[RS5C348_REG_WDAY] & RS5C348_WDAY_MASK); + tm->tm_mday = bcd2bin(rxbuf[RS5C348_REG_DAY] & RS5C348_DAY_MASK); +diff --git a/drivers/staging/speakup/main.c b/drivers/staging/speakup/main.c +index 8be5604..0d70f68 100644 +--- a/drivers/staging/speakup/main.c ++++ b/drivers/staging/speakup/main.c +@@ -1854,7 +1854,7 @@ static void speakup_bits(struct vc_data *vc) + + static int handle_goto(struct vc_data *vc, u_char type, u_char ch, u_short key) + { +- static u_char *goto_buf = "\0\0\0\0\0\0"; ++ static u_char goto_buf[8]; + static int num; + int maxlen, go_pos; + char *cp; +diff --git a/drivers/staging/vt6656/main_usb.c b/drivers/staging/vt6656/main_usb.c +index 27521b6..ae62d57 100644 +--- a/drivers/staging/vt6656/main_usb.c ++++ b/drivers/staging/vt6656/main_usb.c +@@ -222,7 +222,7 @@ DEVICE_PARAM(b80211hEnable, "802.11h mode"); + // Static vars definitions + // + +-static struct usb_device_id vt6656_table[] __devinitdata = { ++static struct usb_device_id vt6656_table[] = { + {USB_DEVICE(VNT_USB_VENDOR_ID, VNT_USB_PRODUCT_ID)}, + {} + }; +diff --git a/drivers/staging/winbond/wbusb.c b/drivers/staging/winbond/wbusb.c +index f958eb4..3f0ce2b 100644 +--- a/drivers/staging/winbond/wbusb.c ++++ b/drivers/staging/winbond/wbusb.c +@@ -25,7 +25,7 @@ MODULE_DESCRIPTION("IS89C35 802.11bg WLAN USB Driver"); + MODULE_LICENSE("GPL"); + MODULE_VERSION("0.1"); + +-static const struct usb_device_id wb35_table[] __devinitconst = { ++static const struct usb_device_id wb35_table[] = { + { USB_DEVICE(0x0416, 0x0035) }, + { USB_DEVICE(0x18E8, 0x6201) }, + { USB_DEVICE(0x18E8, 0x6206) }, +diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c +index 94c03d2..597fb9b 100644 +--- a/drivers/target/target_core_transport.c ++++ b/drivers/target/target_core_transport.c +@@ -3509,9 +3509,9 @@ transport_generic_get_mem(struct se_cmd *cmd) + return 0; + + out: +- while (i >= 0) { +- __free_page(sg_page(&cmd->t_data_sg[i])); ++ while (i > 0) { + i--; ++ __free_page(sg_page(&cmd->t_data_sg[i])); + } + kfree(cmd->t_data_sg); + cmd->t_data_sg = NULL; +diff --git a/drivers/tty/serial/pmac_zilog.c b/drivers/tty/serial/pmac_zilog.c +index 5acd24a..086f7fe 100644 +--- a/drivers/tty/serial/pmac_zilog.c ++++ b/drivers/tty/serial/pmac_zilog.c +@@ -1407,10 +1407,16 @@ static int pmz_verify_port(struct uart_port *port, struct serial_struct *ser) + static int pmz_poll_get_char(struct uart_port *port) + { + struct uart_pmac_port *uap = (struct uart_pmac_port *)port; ++ int tries = 2; + +- while ((read_zsreg(uap, R0) & Rx_CH_AV) == 0) +- udelay(5); +- return read_zsdata(uap); ++ while (tries) { ++ if ((read_zsreg(uap, R0) & Rx_CH_AV) != 0) ++ return read_zsdata(uap); ++ if (tries--) ++ udelay(5); ++ } ++ ++ return NO_POLL_CHAR; + } + + static void 
pmz_poll_put_char(struct uart_port *port, unsigned char c) +diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c +index 1094469..dbf7d20 100644 +--- a/drivers/usb/class/cdc-acm.c ++++ b/drivers/usb/class/cdc-acm.c +@@ -1043,7 +1043,8 @@ skip_normal_probe: + } + + +- if (data_interface->cur_altsetting->desc.bNumEndpoints < 2) ++ if (data_interface->cur_altsetting->desc.bNumEndpoints < 2 || ++ control_interface->cur_altsetting->desc.bNumEndpoints == 0) + return -EINVAL; + + epctrl = &control_interface->cur_altsetting->endpoint[0].desc; +diff --git a/drivers/usb/gadget/u_ether.c b/drivers/usb/gadget/u_ether.c +index 4e1f0aa..9a2a1ae 100644 +--- a/drivers/usb/gadget/u_ether.c ++++ b/drivers/usb/gadget/u_ether.c +@@ -669,6 +669,8 @@ static int eth_stop(struct net_device *net) + spin_lock_irqsave(&dev->lock, flags); + if (dev->port_usb) { + struct gether *link = dev->port_usb; ++ const struct usb_endpoint_descriptor *in; ++ const struct usb_endpoint_descriptor *out; + + if (link->close) + link->close(link); +@@ -682,10 +684,14 @@ static int eth_stop(struct net_device *net) + * their own pace; the network stack can handle old packets. + * For the moment we leave this here, since it works. + */ ++ in = link->in_ep->desc; ++ out = link->out_ep->desc; + usb_ep_disable(link->in_ep); + usb_ep_disable(link->out_ep); + if (netif_carrier_ok(net)) { + DBG(dev, "host still using in/out endpoints\n"); ++ link->in_ep->desc = in; ++ link->out_ep->desc = out; + usb_ep_enable(link->in_ep); + usb_ep_enable(link->out_ep); + } +diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c +index daf5754..07c72a4 100644 +--- a/drivers/usb/host/xhci-pci.c ++++ b/drivers/usb/host/xhci-pci.c +@@ -95,6 +95,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) + pdev->device == PCI_DEVICE_ID_ASROCK_P67) { + xhci->quirks |= XHCI_RESET_ON_RESUME; + xhci_dbg(xhci, "QUIRK: Resetting on resume\n"); ++ xhci->quirks |= XHCI_TRUST_TX_LENGTH; + } + if (pdev->vendor == PCI_VENDOR_ID_VIA) + xhci->quirks |= XHCI_RESET_ON_RESUME; +diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c +index 05f82e9..f7c0a2a 100644 +--- a/drivers/usb/host/xhci.c ++++ b/drivers/usb/host/xhci.c +@@ -163,7 +163,7 @@ int xhci_reset(struct xhci_hcd *xhci) + xhci_writel(xhci, command, &xhci->op_regs->command); + + ret = handshake(xhci, &xhci->op_regs->command, +- CMD_RESET, 0, 250 * 1000); ++ CMD_RESET, 0, 10 * 1000 * 1000); + if (ret) + return ret; + +@@ -172,7 +172,8 @@ int xhci_reset(struct xhci_hcd *xhci) + * xHCI cannot write to any doorbells or operational registers other + * than status until the "Controller Not Ready" flag is cleared. 
+ */ +- return handshake(xhci, &xhci->op_regs->status, STS_CNR, 0, 250 * 1000); ++ return handshake(xhci, &xhci->op_regs->status, ++ STS_CNR, 0, 10 * 1000 * 1000); + } + + #ifdef CONFIG_PCI +diff --git a/drivers/usb/misc/emi62.c b/drivers/usb/misc/emi62.c +index fc15ad4..723e833 100644 +--- a/drivers/usb/misc/emi62.c ++++ b/drivers/usb/misc/emi62.c +@@ -259,7 +259,7 @@ wraperr: + return err; + } + +-static const struct usb_device_id id_table[] __devinitconst = { ++static const struct usb_device_id id_table[] = { + { USB_DEVICE(EMI62_VENDOR_ID, EMI62_PRODUCT_ID) }, + { } /* Terminating entry */ + }; +diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c +index 4045e39..b3182bb 100644 +--- a/drivers/usb/serial/ftdi_sio.c ++++ b/drivers/usb/serial/ftdi_sio.c +@@ -811,6 +811,7 @@ static struct usb_device_id id_table_combined [] = { + { USB_DEVICE(LARSENBRUSGAARD_VID, LB_ALTITRACK_PID) }, + { USB_DEVICE(GN_OTOMETRICS_VID, AURICAL_USB_PID) }, + { USB_DEVICE(PI_VID, PI_E861_PID) }, ++ { USB_DEVICE(KONDO_VID, KONDO_USB_SERIAL_PID) }, + { USB_DEVICE(BAYER_VID, BAYER_CONTOUR_CABLE_PID) }, + { USB_DEVICE(FTDI_VID, MARVELL_OPENRD_PID), + .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, +diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h +index d27d7d7..54b4258 100644 +--- a/drivers/usb/serial/ftdi_sio_ids.h ++++ b/drivers/usb/serial/ftdi_sio_ids.h +@@ -795,6 +795,13 @@ + #define PI_E861_PID 0x1008 /* E-861 piezo controller USB connection */ + + /* ++ * Kondo Kagaku Co.Ltd. ++ * http://www.kondo-robot.com/EN ++ */ ++#define KONDO_VID 0x165c ++#define KONDO_USB_SERIAL_PID 0x0002 ++ ++/* + * Bayer Ascensia Contour blood glucose meter USB-converter cable. + * http://winglucofacts.com/cables/ + */ +diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c +index 5c7d654..b150ed9 100644 +--- a/drivers/usb/serial/mos7840.c ++++ b/drivers/usb/serial/mos7840.c +@@ -1191,9 +1191,12 @@ static int mos7840_chars_in_buffer(struct tty_struct *tty) + } + + spin_lock_irqsave(&mos7840_port->pool_lock, flags); +- for (i = 0; i < NUM_URBS; ++i) +- if (mos7840_port->busy[i]) +- chars += URB_TRANSFER_BUFFER_SIZE; ++ for (i = 0; i < NUM_URBS; ++i) { ++ if (mos7840_port->busy[i]) { ++ struct urb *urb = mos7840_port->write_urb_pool[i]; ++ chars += urb->transfer_buffer_length; ++ } ++ } + spin_unlock_irqrestore(&mos7840_port->pool_lock, flags); + dbg("%s - returns %d", __func__, chars); + return chars; +diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c +index d89aac1..113560d 100644 +--- a/drivers/usb/serial/option.c ++++ b/drivers/usb/serial/option.c +@@ -80,84 +80,9 @@ static void option_instat_callback(struct urb *urb); + #define OPTION_PRODUCT_GTM380_MODEM 0x7201 + + #define HUAWEI_VENDOR_ID 0x12D1 +-#define HUAWEI_PRODUCT_E600 0x1001 +-#define HUAWEI_PRODUCT_E220 0x1003 +-#define HUAWEI_PRODUCT_E220BIS 0x1004 +-#define HUAWEI_PRODUCT_E1401 0x1401 +-#define HUAWEI_PRODUCT_E1402 0x1402 +-#define HUAWEI_PRODUCT_E1403 0x1403 +-#define HUAWEI_PRODUCT_E1404 0x1404 +-#define HUAWEI_PRODUCT_E1405 0x1405 +-#define HUAWEI_PRODUCT_E1406 0x1406 +-#define HUAWEI_PRODUCT_E1407 0x1407 +-#define HUAWEI_PRODUCT_E1408 0x1408 +-#define HUAWEI_PRODUCT_E1409 0x1409 +-#define HUAWEI_PRODUCT_E140A 0x140A +-#define HUAWEI_PRODUCT_E140B 0x140B +-#define HUAWEI_PRODUCT_E140C 0x140C +-#define HUAWEI_PRODUCT_E140D 0x140D +-#define HUAWEI_PRODUCT_E140E 0x140E +-#define HUAWEI_PRODUCT_E140F 0x140F +-#define HUAWEI_PRODUCT_E1410 0x1410 +-#define 
HUAWEI_PRODUCT_E1411 0x1411 +-#define HUAWEI_PRODUCT_E1412 0x1412 +-#define HUAWEI_PRODUCT_E1413 0x1413 +-#define HUAWEI_PRODUCT_E1414 0x1414 +-#define HUAWEI_PRODUCT_E1415 0x1415 +-#define HUAWEI_PRODUCT_E1416 0x1416 +-#define HUAWEI_PRODUCT_E1417 0x1417 +-#define HUAWEI_PRODUCT_E1418 0x1418 +-#define HUAWEI_PRODUCT_E1419 0x1419 +-#define HUAWEI_PRODUCT_E141A 0x141A +-#define HUAWEI_PRODUCT_E141B 0x141B +-#define HUAWEI_PRODUCT_E141C 0x141C +-#define HUAWEI_PRODUCT_E141D 0x141D +-#define HUAWEI_PRODUCT_E141E 0x141E +-#define HUAWEI_PRODUCT_E141F 0x141F +-#define HUAWEI_PRODUCT_E1420 0x1420 +-#define HUAWEI_PRODUCT_E1421 0x1421 +-#define HUAWEI_PRODUCT_E1422 0x1422 +-#define HUAWEI_PRODUCT_E1423 0x1423 +-#define HUAWEI_PRODUCT_E1424 0x1424 +-#define HUAWEI_PRODUCT_E1425 0x1425 +-#define HUAWEI_PRODUCT_E1426 0x1426 +-#define HUAWEI_PRODUCT_E1427 0x1427 +-#define HUAWEI_PRODUCT_E1428 0x1428 +-#define HUAWEI_PRODUCT_E1429 0x1429 +-#define HUAWEI_PRODUCT_E142A 0x142A +-#define HUAWEI_PRODUCT_E142B 0x142B +-#define HUAWEI_PRODUCT_E142C 0x142C +-#define HUAWEI_PRODUCT_E142D 0x142D +-#define HUAWEI_PRODUCT_E142E 0x142E +-#define HUAWEI_PRODUCT_E142F 0x142F +-#define HUAWEI_PRODUCT_E1430 0x1430 +-#define HUAWEI_PRODUCT_E1431 0x1431 +-#define HUAWEI_PRODUCT_E1432 0x1432 +-#define HUAWEI_PRODUCT_E1433 0x1433 +-#define HUAWEI_PRODUCT_E1434 0x1434 +-#define HUAWEI_PRODUCT_E1435 0x1435 +-#define HUAWEI_PRODUCT_E1436 0x1436 +-#define HUAWEI_PRODUCT_E1437 0x1437 +-#define HUAWEI_PRODUCT_E1438 0x1438 +-#define HUAWEI_PRODUCT_E1439 0x1439 +-#define HUAWEI_PRODUCT_E143A 0x143A +-#define HUAWEI_PRODUCT_E143B 0x143B +-#define HUAWEI_PRODUCT_E143C 0x143C +-#define HUAWEI_PRODUCT_E143D 0x143D +-#define HUAWEI_PRODUCT_E143E 0x143E +-#define HUAWEI_PRODUCT_E143F 0x143F + #define HUAWEI_PRODUCT_K4505 0x1464 + #define HUAWEI_PRODUCT_K3765 0x1465 +-#define HUAWEI_PRODUCT_E14AC 0x14AC +-#define HUAWEI_PRODUCT_K3806 0x14AE + #define HUAWEI_PRODUCT_K4605 0x14C6 +-#define HUAWEI_PRODUCT_K3770 0x14C9 +-#define HUAWEI_PRODUCT_K3771 0x14CA +-#define HUAWEI_PRODUCT_K4510 0x14CB +-#define HUAWEI_PRODUCT_K4511 0x14CC +-#define HUAWEI_PRODUCT_ETS1220 0x1803 +-#define HUAWEI_PRODUCT_E353 0x1506 +-#define HUAWEI_PRODUCT_E173S 0x1C05 + + #define QUANTA_VENDOR_ID 0x0408 + #define QUANTA_PRODUCT_Q101 0xEA02 +@@ -614,101 +539,123 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE(QUANTA_VENDOR_ID, QUANTA_PRODUCT_GLX) }, + { USB_DEVICE(QUANTA_VENDOR_ID, QUANTA_PRODUCT_GKE) }, + { USB_DEVICE(QUANTA_VENDOR_ID, QUANTA_PRODUCT_GLE) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E600, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E220, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E220BIS, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1401, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1402, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1403, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1404, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1405, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1406, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1407, 0xff, 0xff, 0xff) }, +- { 
USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1408, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1409, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E140A, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E140B, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E140C, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E140D, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E140E, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E140F, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1410, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1411, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1412, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1413, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1414, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1415, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1416, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1417, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1418, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1419, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E141A, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E141B, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E141C, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E141D, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E141E, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E141F, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1420, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1421, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1422, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1423, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1424, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1425, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1426, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1427, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1428, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1429, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E142A, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E142B, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E142C, 0xff, 0xff, 0xff) }, +- { 
USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E142D, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E142E, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E142F, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1430, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1431, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1432, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1433, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1434, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1435, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1436, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1437, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1438, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E1439, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E143A, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E143B, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E143C, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E143D, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E143E, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E143F, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E173S, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K4505, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t) &huawei_cdc12_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K3765, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t) &huawei_cdc12_blacklist }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_ETS1220, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E14AC, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K3806, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K4605, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t) &huawei_cdc12_blacklist }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K4605, 0xff, 0x01, 0x31) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K4605, 0xff, 0x01, 0x32) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K3770, 0xff, 0x02, 0x31) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K3770, 0xff, 0x02, 0x32) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K3771, 0xff, 0x02, 0x31) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K3771, 0xff, 0x02, 0x32) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K4510, 0xff, 0x01, 0x31) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K4510, 0xff, 0x01, 0x32) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K4511, 0xff, 0x01, 0x31) }, +- { 
USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K4511, 0xff, 0x01, 0x32) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E353, 0xff, 0x01, 0x01) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E353, 0xff, 0x01, 0x02) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E353, 0xff, 0x01, 0x03) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E353, 0xff, 0x01, 0x10) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E353, 0xff, 0x01, 0x12) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E353, 0xff, 0x01, 0x13) }, +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E353, 0xff, 0x02, 0x01) }, /* E398 3G Modem */ +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E353, 0xff, 0x02, 0x02) }, /* E398 3G PC UI Interface */ +- { USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E353, 0xff, 0x02, 0x03) }, /* E398 3G Application Interface */ ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0xff, 0xff) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x01) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x02) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x03) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x04) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x05) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x06) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x0A) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x0B) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x0D) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x0E) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x0F) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x10) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x12) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x13) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x14) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x15) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x17) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x18) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x19) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x1A) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x1B) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x1C) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x31) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x32) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x33) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x34) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x35) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x36) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x3A) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x3B) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x3D) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x3E) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x3F) }, ++ { 
USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x48) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x49) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x4A) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x4B) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x4C) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x61) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x62) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x63) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x64) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x65) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x66) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x6A) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x6B) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x6D) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x6E) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x6F) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x78) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x79) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x7A) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x7B) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x01, 0x7C) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x01) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x02) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x03) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x04) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x05) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x06) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x0A) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x0B) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x0D) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x0E) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x0F) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x10) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x12) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x13) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x14) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x15) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x17) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x18) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x19) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x1A) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x1B) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x1C) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x31) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x32) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x33) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x34) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x35) }, ++ { 
USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x36) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x3A) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x3B) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x3D) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x3E) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x3F) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x48) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x49) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x4A) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x4B) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x4C) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x61) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x62) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x63) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x64) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x65) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x66) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x6A) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x6B) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x6D) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x6E) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x6F) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x78) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x79) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x7A) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x7B) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, 0xff, 0x02, 0x7C) }, ++ ++ + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_V640) }, + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_V620) }, + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_V740) }, +diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c +index 8745637..bf9a9b7 100644 +--- a/drivers/video/console/fbcon.c ++++ b/drivers/video/console/fbcon.c +@@ -373,8 +373,15 @@ static void fb_flashcursor(struct work_struct *work) + struct vc_data *vc = NULL; + int c; + int mode; ++ int ret; ++ ++ /* FIXME: we should sort out the unbind locking instead */ ++ /* instead we just fail to flash the cursor if we can't get ++ * the lock instead of blocking fbcon deinit */ ++ ret = console_trylock(); ++ if (ret == 0) ++ return; + +- console_lock(); + if (ops && ops->currcon != -1) + vc = vc_cons[ops->currcon].d; + +diff --git a/fs/buffer.c b/fs/buffer.c +index 4115eca..19a4f0b 100644 +--- a/fs/buffer.c ++++ b/fs/buffer.c +@@ -964,7 +964,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) + /* + * Initialise the state of a blockdev page's buffers. + */ +-static void ++static sector_t + init_page_buffers(struct page *page, struct block_device *bdev, + sector_t block, int size) + { +@@ -986,33 +986,41 @@ init_page_buffers(struct page *page, struct block_device *bdev, + block++; + bh = bh->b_this_page; + } while (bh != head); ++ ++ /* ++ * Caller needs to validate requested block against end of device. 
++ */ ++ return end_block; + } + + /* + * Create the page-cache page that contains the requested block. + * +- * This is user purely for blockdev mappings. ++ * This is used purely for blockdev mappings. + */ +-static struct page * ++static int + grow_dev_page(struct block_device *bdev, sector_t block, +- pgoff_t index, int size) ++ pgoff_t index, int size, int sizebits) + { + struct inode *inode = bdev->bd_inode; + struct page *page; + struct buffer_head *bh; ++ sector_t end_block; ++ int ret = 0; /* Will call free_more_memory() */ + + page = find_or_create_page(inode->i_mapping, index, + (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); + if (!page) +- return NULL; ++ return ret; + + BUG_ON(!PageLocked(page)); + + if (page_has_buffers(page)) { + bh = page_buffers(page); + if (bh->b_size == size) { +- init_page_buffers(page, bdev, block, size); +- return page; ++ end_block = init_page_buffers(page, bdev, ++ index << sizebits, size); ++ goto done; + } + if (!try_to_free_buffers(page)) + goto failed; +@@ -1032,15 +1040,14 @@ grow_dev_page(struct block_device *bdev, sector_t block, + */ + spin_lock(&inode->i_mapping->private_lock); + link_dev_buffers(page, bh); +- init_page_buffers(page, bdev, block, size); ++ end_block = init_page_buffers(page, bdev, index << sizebits, size); + spin_unlock(&inode->i_mapping->private_lock); +- return page; +- ++done: ++ ret = (block < end_block) ? 1 : -ENXIO; + failed: +- BUG(); + unlock_page(page); + page_cache_release(page); +- return NULL; ++ return ret; + } + + /* +@@ -1050,7 +1057,6 @@ failed: + static int + grow_buffers(struct block_device *bdev, sector_t block, int size) + { +- struct page *page; + pgoff_t index; + int sizebits; + +@@ -1074,22 +1080,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) + bdevname(bdev, b)); + return -EIO; + } +- block = index << sizebits; ++ + /* Create a page with the proper size buffers.. */ +- page = grow_dev_page(bdev, block, index, size); +- if (!page) +- return 0; +- unlock_page(page); +- page_cache_release(page); +- return 1; ++ return grow_dev_page(bdev, block, index, size, sizebits); + } + + static struct buffer_head * + __getblk_slow(struct block_device *bdev, sector_t block, int size) + { +- int ret; +- struct buffer_head *bh; +- + /* Size must be multiple of hard sectorsize */ + if (unlikely(size & (bdev_logical_block_size(bdev)-1) || + (size < 512 || size > PAGE_SIZE))) { +@@ -1102,21 +1100,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) + return NULL; + } + +-retry: +- bh = __find_get_block(bdev, block, size); +- if (bh) +- return bh; ++ for (;;) { ++ struct buffer_head *bh; ++ int ret; + +- ret = grow_buffers(bdev, block, size); +- if (ret == 0) { +- free_more_memory(); +- goto retry; +- } else if (ret > 0) { + bh = __find_get_block(bdev, block, size); + if (bh) + return bh; ++ ++ ret = grow_buffers(bdev, block, size); ++ if (ret < 0) ++ return NULL; ++ if (ret == 0) ++ free_more_memory(); + } +- return NULL; + } + + /* +@@ -1372,10 +1369,6 @@ EXPORT_SYMBOL(__find_get_block); + * which corresponds to the passed block_device, block and size. The + * returned buffer has its reference count incremented. + * +- * __getblk() cannot fail - it just keeps trying. If you pass it an +- * illegal block number, __getblk() will happily return a buffer_head +- * which represents the non-existent block. Very weird. +- * + * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() + * attempt is failing. FIXME, perhaps? 
+ */ +diff --git a/fs/compat.c b/fs/compat.c +index c987875..e07a3d3 100644 +--- a/fs/compat.c ++++ b/fs/compat.c +@@ -1174,11 +1174,14 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, + struct file *file; + int fput_needed; + ssize_t ret; ++ loff_t pos; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; +- ret = compat_readv(file, vec, vlen, &file->f_pos); ++ pos = file->f_pos; ++ ret = compat_readv(file, vec, vlen, &pos); ++ file->f_pos = pos; + fput_light(file, fput_needed); + return ret; + } +@@ -1233,11 +1236,14 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, + struct file *file; + int fput_needed; + ssize_t ret; ++ loff_t pos; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; +- ret = compat_writev(file, vec, vlen, &file->f_pos); ++ pos = file->f_pos; ++ ret = compat_writev(file, vec, vlen, &pos); ++ file->f_pos = pos; + fput_light(file, fput_needed); + return ret; + } +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index a071348..f8d5fce 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -904,6 +904,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; ++ ei->i_da_metadata_calc_last_lblock = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); + #ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +@@ -3107,6 +3108,10 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + int s, j, count = 0; + ++ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) ++ return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + ++ sbi->s_itb_per_group + 2); ++ + first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + + (grp * EXT4_BLOCKS_PER_GROUP(sb)); + last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; +diff --git a/fs/fuse/file.c b/fs/fuse/file.c +index 0c84100..5242006 100644 +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -1687,7 +1687,7 @@ static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) + size_t n; + u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + +- for (n = 0; n < count; n++) { ++ for (n = 0; n < count; n++, iov++) { + if (iov->iov_len > (size_t) max) + return -ENOMEM; + max -= iov->iov_len; +diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c +index 3db6b82..d774309 100644 +--- a/fs/nfs/blocklayout/blocklayout.c ++++ b/fs/nfs/blocklayout/blocklayout.c +@@ -38,6 +38,8 @@ + #include /* various write calls */ + #include + ++#include "../pnfs.h" ++#include "../internal.h" + #include "blocklayout.h" + + #define NFSDBG_FACILITY NFSDBG_PNFS_LD +@@ -814,7 +816,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; +- max_pages = max_resp_sz >> PAGE_SHIFT; ++ max_pages = nfs_page_array_len(0, max_resp_sz); + dprintk("%s max_resp_sz %u max_pages %d\n", + __func__, max_resp_sz, max_pages); + +diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c +index c69682a..4e2ee99 100644 +--- a/fs/nfs/blocklayout/extents.c ++++ b/fs/nfs/blocklayout/extents.c +@@ -153,7 +153,7 @@ static int _preload_range(struct pnfs_inval_markings *marks, + count = (int)(end - start) / (int)tree->mtt_step_size; + + /* Pre-malloc what memory we might need */ +- storage = kmalloc(sizeof(*storage) * count, GFP_NOFS); ++ 
storage = kcalloc(count, sizeof(*storage), GFP_NOFS); + if (!storage) + return -ENOMEM; + for (i = 0; i < count; i++) { +diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c +index ac28990..756f4df 100644 +--- a/fs/nfs/dir.c ++++ b/fs/nfs/dir.c +@@ -1103,7 +1103,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) + struct nfs_fattr *fattr = NULL; + int error; + +- if (nd->flags & LOOKUP_RCU) ++ if (nd && (nd->flags & LOOKUP_RCU)) + return -ECHILD; + + parent = dget_parent(dentry); +@@ -1508,7 +1508,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) + struct nfs_open_context *ctx; + int openflags, ret = 0; + +- if (nd->flags & LOOKUP_RCU) ++ if (nd && (nd->flags & LOOKUP_RCU)) + return -ECHILD; + + inode = dentry->d_inode; +diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c +index d4bc9ed9..5195fd6 100644 +--- a/fs/nfs/nfs3proc.c ++++ b/fs/nfs/nfs3proc.c +@@ -68,7 +68,7 @@ do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle, + nfs_fattr_init(info->fattr); + status = rpc_call_sync(client, &msg, 0); + dprintk("%s: reply fsinfo: %d\n", __func__, status); +- if (!(info->fattr->valid & NFS_ATTR_FATTR)) { ++ if (status == 0 && !(info->fattr->valid & NFS_ATTR_FATTR)) { + msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; + msg.rpc_resp = info->fattr; + status = rpc_call_sync(client, &msg, 0); +diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c +index ed388aa..bd5d9cf 100644 +--- a/fs/nfs/nfs4filelayoutdev.c ++++ b/fs/nfs/nfs4filelayoutdev.c +@@ -721,7 +721,7 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_fla + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; +- max_pages = max_resp_sz >> PAGE_SHIFT; ++ max_pages = nfs_page_array_len(0, max_resp_sz); + dprintk("%s inode %p max_resp_sz %u max_pages %d\n", + __func__, inode, max_resp_sz, max_pages); + +diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c +index 8000459..d20221d 100644 +--- a/fs/nfs/nfs4proc.c ++++ b/fs/nfs/nfs4proc.c +@@ -5769,11 +5769,58 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) + dprintk("<-- %s\n", __func__); + } + ++static size_t max_response_pages(struct nfs_server *server) ++{ ++ u32 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ return nfs_page_array_len(0, max_resp_sz); ++} ++ ++static void nfs4_free_pages(struct page **pages, size_t size) ++{ ++ int i; ++ ++ if (!pages) ++ return; ++ ++ for (i = 0; i < size; i++) { ++ if (!pages[i]) ++ break; ++ __free_page(pages[i]); ++ } ++ kfree(pages); ++} ++ ++static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) ++{ ++ struct page **pages; ++ int i; ++ ++ pages = kcalloc(size, sizeof(struct page *), gfp_flags); ++ if (!pages) { ++ dprintk("%s: can't alloc array of %zu pages\n", __func__, size); ++ return NULL; ++ } ++ ++ for (i = 0; i < size; i++) { ++ pages[i] = alloc_page(gfp_flags); ++ if (!pages[i]) { ++ dprintk("%s: failed to allocate page\n", __func__); ++ nfs4_free_pages(pages, size); ++ return NULL; ++ } ++ } ++ ++ return pages; ++} ++ + static void nfs4_layoutget_release(void *calldata) + { + struct nfs4_layoutget *lgp = calldata; ++ struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ size_t max_pages = max_response_pages(server); + + dprintk("--> %s\n", __func__); ++ nfs4_free_pages(lgp->args.layout.pages, max_pages); + put_nfs_open_context(lgp->args.ctx); + kfree(calldata); + dprintk("<-- %s\n", __func__); +@@ -5785,9 +5832,10 @@ static 
const struct rpc_call_ops nfs4_layoutget_call_ops = { + .rpc_release = nfs4_layoutget_release, + }; + +-int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) ++int nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) + { + struct nfs_server *server = NFS_SERVER(lgp->args.inode); ++ size_t max_pages = max_response_pages(server); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], +@@ -5805,6 +5853,13 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) + + dprintk("--> %s\n", __func__); + ++ lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); ++ if (!lgp->args.layout.pages) { ++ nfs4_layoutget_release(lgp); ++ return -ENOMEM; ++ } ++ lgp->args.layout.pglen = max_pages * PAGE_SIZE; ++ + lgp->res.layoutp = &lgp->args.layout; + lgp->res.seq_res.sr_slot = NULL; + task = rpc_run_task(&task_setup_data); +diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c +index f881a63..3ad6595 100644 +--- a/fs/nfs/pnfs.c ++++ b/fs/nfs/pnfs.c +@@ -575,9 +575,6 @@ send_layoutget(struct pnfs_layout_hdr *lo, + struct nfs_server *server = NFS_SERVER(ino); + struct nfs4_layoutget *lgp; + struct pnfs_layout_segment *lseg = NULL; +- struct page **pages = NULL; +- int i; +- u32 max_resp_sz, max_pages; + + dprintk("--> %s\n", __func__); + +@@ -586,20 +583,6 @@ send_layoutget(struct pnfs_layout_hdr *lo, + if (lgp == NULL) + return NULL; + +- /* allocate pages for xdr post processing */ +- max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; +- max_pages = max_resp_sz >> PAGE_SHIFT; +- +- pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); +- if (!pages) +- goto out_err_free; +- +- for (i = 0; i < max_pages; i++) { +- pages[i] = alloc_page(gfp_flags); +- if (!pages[i]) +- goto out_err_free; +- } +- + lgp->args.minlength = PAGE_CACHE_SIZE; + if (lgp->args.minlength > range->length) + lgp->args.minlength = range->length; +@@ -608,39 +591,19 @@ send_layoutget(struct pnfs_layout_hdr *lo, + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); +- lgp->args.layout.pages = pages; +- lgp->args.layout.pglen = max_pages * PAGE_SIZE; + lgp->lsegpp = &lseg; + lgp->gfp_flags = gfp_flags; + + /* Synchronously retrieve layout information from server and + * store in lseg. 
+ */ +- nfs4_proc_layoutget(lgp); ++ nfs4_proc_layoutget(lgp, gfp_flags); + if (!lseg) { + /* remember that LAYOUTGET failed and suspend trying */ + set_bit(lo_fail_bit(range->iomode), &lo->plh_flags); + } + +- /* free xdr pages */ +- for (i = 0; i < max_pages; i++) +- __free_page(pages[i]); +- kfree(pages); +- + return lseg; +- +-out_err_free: +- /* free any allocated xdr pages, lgp as it's not used */ +- if (pages) { +- for (i = 0; i < max_pages; i++) { +- if (!pages[i]) +- break; +- __free_page(pages[i]); +- } +- kfree(pages); +- } +- kfree(lgp); +- return NULL; + } + + /* Initiates a LAYOUTRETURN(FILE) */ +diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h +index 53d593a..c946b1b 100644 +--- a/fs/nfs/pnfs.h ++++ b/fs/nfs/pnfs.h +@@ -162,7 +162,7 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server, + struct pnfs_devicelist *devlist); + extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *dev); +-extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); ++extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); + extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); + + /* pnfs.c */ +diff --git a/fs/nfs/super.c b/fs/nfs/super.c +index 376cd65..6e85ec6 100644 +--- a/fs/nfs/super.c ++++ b/fs/nfs/super.c +@@ -3087,4 +3087,6 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, + return res; + } + ++MODULE_ALIAS("nfs4"); ++ + #endif /* CONFIG_NFS_V4 */ +diff --git a/fs/nfs/write.c b/fs/nfs/write.c +index c6e523a..301391a 100644 +--- a/fs/nfs/write.c ++++ b/fs/nfs/write.c +@@ -1742,12 +1742,12 @@ int __init nfs_init_writepagecache(void) + nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE, + nfs_wdata_cachep); + if (nfs_wdata_mempool == NULL) +- return -ENOMEM; ++ goto out_destroy_write_cache; + + nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, + nfs_wdata_cachep); + if (nfs_commit_mempool == NULL) +- return -ENOMEM; ++ goto out_destroy_write_mempool; + + /* + * NFS congestion size, scale with available memory. 
+@@ -1770,6 +1770,12 @@ int __init nfs_init_writepagecache(void) + nfs_congestion_kb = 256*1024; + + return 0; ++ ++out_destroy_write_mempool: ++ mempool_destroy(nfs_wdata_mempool); ++out_destroy_write_cache: ++ kmem_cache_destroy(nfs_wdata_cachep); ++ return -ENOMEM; + } + + void nfs_destroy_writepagecache(void) +diff --git a/fs/open.c b/fs/open.c +index e2b5d51..b8485d3 100644 +--- a/fs/open.c ++++ b/fs/open.c +@@ -882,9 +882,10 @@ static inline int build_open_flags(int flags, int mode, struct open_flags *op) + int lookup_flags = 0; + int acc_mode; + +- if (!(flags & O_CREAT)) +- mode = 0; +- op->mode = mode; ++ if (flags & O_CREAT) ++ op->mode = (mode & S_IALLUGO) | S_IFREG; ++ else ++ op->mode = 0; + + /* Must never be set by userspace */ + flags &= ~FMODE_NONOTIFY; +diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c +index 2da1715..4619247 100644 +--- a/fs/squashfs/super.c ++++ b/fs/squashfs/super.c +@@ -290,7 +290,7 @@ handle_fragments: + + check_directory_table: + /* Sanity check directory_table */ +- if (msblk->directory_table >= next_table) { ++ if (msblk->directory_table > next_table) { + err = -EINVAL; + goto failed_mount; + } +diff --git a/include/asm-generic/mutex-xchg.h b/include/asm-generic/mutex-xchg.h +index 580a6d3..c04e0db 100644 +--- a/include/asm-generic/mutex-xchg.h ++++ b/include/asm-generic/mutex-xchg.h +@@ -26,7 +26,13 @@ static inline void + __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *)) + { + if (unlikely(atomic_xchg(count, 0) != 1)) +- fail_fn(count); ++ /* ++ * We failed to acquire the lock, so mark it contended ++ * to ensure that any waiting tasks are woken up by the ++ * unlock slow path. ++ */ ++ if (likely(atomic_xchg(count, -1) != 1)) ++ fail_fn(count); + } + + /** +@@ -43,7 +49,8 @@ static inline int + __mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *)) + { + if (unlikely(atomic_xchg(count, 0) != 1)) +- return fail_fn(count); ++ if (likely(atomic_xchg(count, -1) != 1)) ++ return fail_fn(count); + return 0; + } + +diff --git a/include/linux/usb.h b/include/linux/usb.h +index 4269c3f..93629fc 100644 +--- a/include/linux/usb.h ++++ b/include/linux/usb.h +@@ -775,6 +775,27 @@ static inline int usb_make_path(struct usb_device *dev, char *buf, size_t size) + .bInterfaceSubClass = (sc), \ + .bInterfaceProtocol = (pr) + ++/** ++ * USB_VENDOR_AND_INTERFACE_INFO - describe a specific usb vendor with a class of usb interfaces ++ * @vend: the 16 bit USB Vendor ID ++ * @cl: bInterfaceClass value ++ * @sc: bInterfaceSubClass value ++ * @pr: bInterfaceProtocol value ++ * ++ * This macro is used to create a struct usb_device_id that matches a ++ * specific vendor with a specific class of interfaces. ++ * ++ * This is especially useful when explicitly matching devices that have ++ * vendor specific bDeviceClass values, but standards-compliant interfaces. 
++ */ ++#define USB_VENDOR_AND_INTERFACE_INFO(vend, cl, sc, pr) \ ++ .match_flags = USB_DEVICE_ID_MATCH_INT_INFO \ ++ | USB_DEVICE_ID_MATCH_VENDOR, \ ++ .idVendor = (vend), \ ++ .bInterfaceClass = (cl), \ ++ .bInterfaceSubClass = (sc), \ ++ .bInterfaceProtocol = (pr) ++ + /* ----------------------------------------------------------------------- */ + + /* Stuff for dynamic usb ids */ +diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c +index 5bf0790..31fdc48 100644 +--- a/kernel/audit_tree.c ++++ b/kernel/audit_tree.c +@@ -250,7 +250,6 @@ static void untag_chunk(struct node *p) + spin_unlock(&hash_lock); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); +- fsnotify_put_mark(entry); + goto out; + } + +@@ -259,7 +258,7 @@ static void untag_chunk(struct node *p) + + fsnotify_duplicate_mark(&new->mark, entry); + if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { +- free_chunk(new); ++ fsnotify_put_mark(&new->mark); + goto Fallback; + } + +@@ -293,7 +292,6 @@ static void untag_chunk(struct node *p) + spin_unlock(&hash_lock); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); +- fsnotify_put_mark(entry); + goto out; + + Fallback: +@@ -322,7 +320,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) + + entry = &chunk->mark; + if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { +- free_chunk(chunk); ++ fsnotify_put_mark(entry); + return -ENOSPC; + } + +@@ -332,6 +330,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) + spin_unlock(&hash_lock); + chunk->dead = 1; + spin_unlock(&entry->lock); ++ fsnotify_get_mark(entry); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); + return 0; +@@ -396,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) + fsnotify_duplicate_mark(chunk_entry, old_entry); + if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { + spin_unlock(&old_entry->lock); +- free_chunk(chunk); ++ fsnotify_put_mark(chunk_entry); + fsnotify_put_mark(old_entry); + return -ENOSPC; + } +@@ -412,6 +411,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) + spin_unlock(&chunk_entry->lock); + spin_unlock(&old_entry->lock); + ++ fsnotify_get_mark(chunk_entry); + fsnotify_destroy_mark(chunk_entry); + + fsnotify_put_mark(chunk_entry); +@@ -445,7 +445,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) + spin_unlock(&old_entry->lock); + fsnotify_destroy_mark(old_entry); + fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ +- fsnotify_put_mark(old_entry); /* and kill it */ + return 0; + } + +diff --git a/kernel/sched.c b/kernel/sched.c +index e0431c4..910db7d 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -4355,6 +4355,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) + # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) + #endif + ++static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) ++{ ++ u64 temp = (__force u64) rtime; ++ ++ temp *= (__force u64) utime; ++ ++ if (sizeof(cputime_t) == 4) ++ temp = div_u64(temp, (__force u32) total); ++ else ++ temp = div64_u64(temp, (__force u64) total); ++ ++ return (__force cputime_t) temp; ++} ++ + void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) + { + cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); +@@ -4364,13 +4378,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) + */ + rtime = 
nsecs_to_cputime(p->se.sum_exec_runtime); + +- if (total) { +- u64 temp = rtime; +- +- temp *= utime; +- do_div(temp, total); +- utime = (cputime_t)temp; +- } else ++ if (total) ++ utime = scale_utime(utime, rtime, total); ++ else + utime = rtime; + + /* +@@ -4397,13 +4407,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) + total = cputime_add(cputime.utime, cputime.stime); + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); + +- if (total) { +- u64 temp = rtime; +- +- temp *= cputime.utime; +- do_div(temp, total); +- utime = (cputime_t)temp; +- } else ++ if (total) ++ utime = scale_utime(cputime.utime, rtime, total); ++ else + utime = rtime; + + sig->prev_utime = max(sig->prev_utime, utime); +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 48febd7..86eb848 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1977,10 +1977,10 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, + * proportional to the fraction of recently scanned pages on + * each list that were recently referenced and in active use. + */ +- ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); ++ ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); + ap /= reclaim_stat->recent_rotated[0] + 1; + +- fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); ++ fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); + fp /= reclaim_stat->recent_rotated[1] + 1; + spin_unlock_irq(&zone->lru_lock); + +@@ -1993,7 +1993,7 @@ out: + unsigned long scan; + + scan = zone_nr_lru_pages(zone, sc, l); +- if (priority || noswap) { ++ if (priority || noswap || !vmscan_swappiness(sc)) { + scan >>= priority; + if (!scan && force_scan) + scan = SWAP_CLUSTER_MAX; +diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c +index 643a41b..6033f02 100644 +--- a/net/bluetooth/hci_event.c ++++ b/net/bluetooth/hci_event.c +@@ -1411,7 +1411,13 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s + if (conn->type == ACL_LINK) { + conn->state = BT_CONFIG; + hci_conn_hold(conn); +- conn->disc_timeout = HCI_DISCONN_TIMEOUT; ++ ++ if (!conn->out && ++ !(conn->ssp_mode && conn->hdev->ssp_mode) && ++ !hci_find_link_key(hdev, &ev->bdaddr)) ++ conn->disc_timeout = HCI_PAIRING_TIMEOUT; ++ else ++ conn->disc_timeout = HCI_DISCONN_TIMEOUT; + mgmt_connected(hdev->id, &ev->bdaddr, conn->type); + } else + conn->state = BT_CONNECTED; +diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c +index 17b5b1c..dd76177 100644 +--- a/net/bluetooth/l2cap_core.c ++++ b/net/bluetooth/l2cap_core.c +@@ -862,6 +862,7 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) + write_lock_bh(&conn->chan_lock); + + hci_conn_hold(conn->hcon); ++ conn->hcon->disc_timeout = HCI_DISCONN_TIMEOUT; + + bacpy(&bt_sk(sk)->src, conn->src); + bacpy(&bt_sk(sk)->dst, conn->dst); +@@ -2263,12 +2264,14 @@ static void l2cap_conf_rfc_get(struct l2cap_chan *chan, void *rsp, int len) + while (len >= L2CAP_CONF_OPT_SIZE) { + len -= l2cap_get_conf_opt(&rsp, &type, &olen, &val); + +- switch (type) { +- case L2CAP_CONF_RFC: +- if (olen == sizeof(rfc)) +- memcpy(&rfc, (void *)val, olen); +- goto done; +- } ++ if (type != L2CAP_CONF_RFC) ++ continue; ++ ++ if (olen != sizeof(rfc)) ++ break; ++ ++ memcpy(&rfc, (void *)val, olen); ++ goto done; + } + + /* Use sane default values in case a misbehaving remote device +diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h +index 75c3582..fb85d37 100644 +--- a/net/dccp/ccid.h ++++ b/net/dccp/ccid.h +@@ -246,7 +246,7 @@ static inline int 
ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk, + u32 __user *optval, int __user *optlen) + { + int rc = -ENOPROTOOPT; +- if (ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) ++ if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) + rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, + optval, optlen); + return rc; +@@ -257,7 +257,7 @@ static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk, + u32 __user *optval, int __user *optlen) + { + int rc = -ENOPROTOOPT; +- if (ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) ++ if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) + rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, + optval, optlen); + return rc; +diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c +index 9ed2cd0..3282453 100644 +--- a/net/sunrpc/svc_xprt.c ++++ b/net/sunrpc/svc_xprt.c +@@ -315,7 +315,6 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) + */ + void svc_xprt_enqueue(struct svc_xprt *xprt) + { +- struct svc_serv *serv = xprt->xpt_server; + struct svc_pool *pool; + struct svc_rqst *rqstp; + int cpu; +@@ -361,8 +360,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) + rqstp, rqstp->rq_xprt); + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); +- rqstp->rq_reserved = serv->sv_max_mesg; +- atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + pool->sp_stats.threads_woken++; + wake_up(&rqstp->rq_wait); + } else { +@@ -642,8 +639,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) + if (xprt) { + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); +- rqstp->rq_reserved = serv->sv_max_mesg; +- atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + + /* As there is a shortage of threads and this request + * had to be queued, don't allow the thread to wait so +@@ -740,6 +735,8 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) + else + len = xprt->xpt_ops->xpo_recvfrom(rqstp); + dprintk("svc: got len=%d\n", len); ++ rqstp->rq_reserved = serv->sv_max_mesg; ++ atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + } + svc_xprt_received(xprt); + +@@ -796,7 +793,8 @@ int svc_send(struct svc_rqst *rqstp) + + /* Grab mutex to serialize outgoing data. 
*/ + mutex_lock(&xprt->xpt_mutex); +- if (test_bit(XPT_DEAD, &xprt->xpt_flags)) ++ if (test_bit(XPT_DEAD, &xprt->xpt_flags) ++ || test_bit(XPT_CLOSE, &xprt->xpt_flags)) + len = -ENOTCONN; + else + len = xprt->xpt_ops->xpo_sendto(rqstp); +diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c +index 71bed1c..296192c 100644 +--- a/net/sunrpc/svcsock.c ++++ b/net/sunrpc/svcsock.c +@@ -1136,9 +1136,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) + if (len >= 0) + svsk->sk_tcplen += len; + if (len != want) { ++ svc_tcp_save_pages(svsk, rqstp); + if (len < 0 && len != -EAGAIN) + goto err_other; +- svc_tcp_save_pages(svsk, rqstp); + dprintk("svc: incomplete TCP record (%d of %d)\n", + svsk->sk_tcplen, svsk->sk_reclen); + goto err_noclose; +diff --git a/sound/pci/hda/hda_proc.c b/sound/pci/hda/hda_proc.c +index 254ab52..2210b83 100644 +--- a/sound/pci/hda/hda_proc.c ++++ b/sound/pci/hda/hda_proc.c +@@ -412,7 +412,7 @@ static void print_digital_conv(struct snd_info_buffer *buffer, + if (digi1 & AC_DIG1_EMPHASIS) + snd_iprintf(buffer, " Preemphasis"); + if (digi1 & AC_DIG1_COPYRIGHT) +- snd_iprintf(buffer, " Copyright"); ++ snd_iprintf(buffer, " Non-Copyright"); + if (digi1 & AC_DIG1_NONAUDIO) + snd_iprintf(buffer, " Non-Audio"); + if (digi1 & AC_DIG1_PROFESSIONAL) +diff --git a/sound/pci/hda/patch_ca0132.c b/sound/pci/hda/patch_ca0132.c +index 35abe3c..b22989e 100644 +--- a/sound/pci/hda/patch_ca0132.c ++++ b/sound/pci/hda/patch_ca0132.c +@@ -276,6 +276,10 @@ static int _add_switch(struct hda_codec *codec, hda_nid_t nid, const char *pfx, + int type = dir ? HDA_INPUT : HDA_OUTPUT; + struct snd_kcontrol_new knew = + HDA_CODEC_MUTE_MONO(namestr, nid, chan, 0, type); ++ if ((query_amp_caps(codec, nid, type) & AC_AMPCAP_MUTE) == 0) { ++ snd_printdd("Skipping '%s %s Switch' (no mute on node 0x%x)\n", pfx, dirstr[dir], nid); ++ return 0; ++ } + sprintf(namestr, "%s %s Switch", pfx, dirstr[dir]); + return snd_hda_ctl_add(codec, nid, snd_ctl_new1(&knew, codec)); + } +@@ -287,6 +291,10 @@ static int _add_volume(struct hda_codec *codec, hda_nid_t nid, const char *pfx, + int type = dir ? 
HDA_INPUT : HDA_OUTPUT; + struct snd_kcontrol_new knew = + HDA_CODEC_VOLUME_MONO(namestr, nid, chan, 0, type); ++ if ((query_amp_caps(codec, nid, type) & AC_AMPCAP_NUM_STEPS) == 0) { ++ snd_printdd("Skipping '%s %s Volume' (no amp on node 0x%x)\n", pfx, dirstr[dir], nid); ++ return 0; ++ } + sprintf(namestr, "%s %s Volume", pfx, dirstr[dir]); + return snd_hda_ctl_add(codec, nid, snd_ctl_new1(&knew, codec)); + } +diff --git a/sound/soc/codecs/wm9712.c b/sound/soc/codecs/wm9712.c +index 90117f8..90e5005 100644 +--- a/sound/soc/codecs/wm9712.c ++++ b/sound/soc/codecs/wm9712.c +@@ -270,7 +270,7 @@ SOC_DAPM_ENUM("Route", wm9712_enum[9]); + + /* Mic select */ + static const struct snd_kcontrol_new wm9712_mic_src_controls = +-SOC_DAPM_ENUM("Route", wm9712_enum[7]); ++SOC_DAPM_ENUM("Mic Source Select", wm9712_enum[7]); + + /* diff select */ + static const struct snd_kcontrol_new wm9712_diff_sel_controls = +@@ -289,7 +289,9 @@ SND_SOC_DAPM_MUX("Left Capture Select", SND_SOC_NOPM, 0, 0, + &wm9712_capture_selectl_controls), + SND_SOC_DAPM_MUX("Right Capture Select", SND_SOC_NOPM, 0, 0, + &wm9712_capture_selectr_controls), +-SND_SOC_DAPM_MUX("Mic Select Source", SND_SOC_NOPM, 0, 0, ++SND_SOC_DAPM_MUX("Left Mic Select Source", SND_SOC_NOPM, 0, 0, ++ &wm9712_mic_src_controls), ++SND_SOC_DAPM_MUX("Right Mic Select Source", SND_SOC_NOPM, 0, 0, + &wm9712_mic_src_controls), + SND_SOC_DAPM_MUX("Differential Source", SND_SOC_NOPM, 0, 0, + &wm9712_diff_sel_controls), +@@ -317,6 +319,7 @@ SND_SOC_DAPM_PGA("Out 3 PGA", AC97_INT_PAGING, 5, 1, NULL, 0), + SND_SOC_DAPM_PGA("Line PGA", AC97_INT_PAGING, 2, 1, NULL, 0), + SND_SOC_DAPM_PGA("Phone PGA", AC97_INT_PAGING, 1, 1, NULL, 0), + SND_SOC_DAPM_PGA("Mic PGA", AC97_INT_PAGING, 0, 1, NULL, 0), ++SND_SOC_DAPM_PGA("Differential Mic", SND_SOC_NOPM, 0, 0, NULL, 0), + SND_SOC_DAPM_MICBIAS("Mic Bias", AC97_INT_PAGING, 10, 1), + SND_SOC_DAPM_OUTPUT("MONOOUT"), + SND_SOC_DAPM_OUTPUT("HPOUTL"), +@@ -377,6 +380,18 @@ static const struct snd_soc_dapm_route wm9712_audio_map[] = { + {"Mic PGA", NULL, "MIC1"}, + {"Mic PGA", NULL, "MIC2"}, + ++ /* microphones */ ++ {"Differential Mic", NULL, "MIC1"}, ++ {"Differential Mic", NULL, "MIC2"}, ++ {"Left Mic Select Source", "Mic 1", "MIC1"}, ++ {"Left Mic Select Source", "Mic 2", "MIC2"}, ++ {"Left Mic Select Source", "Stereo", "MIC1"}, ++ {"Left Mic Select Source", "Differential", "Differential Mic"}, ++ {"Right Mic Select Source", "Mic 1", "MIC1"}, ++ {"Right Mic Select Source", "Mic 2", "MIC2"}, ++ {"Right Mic Select Source", "Stereo", "MIC2"}, ++ {"Right Mic Select Source", "Differential", "Differential Mic"}, ++ + /* left capture selector */ + {"Left Capture Select", "Mic", "MIC1"}, + {"Left Capture Select", "Speaker Mixer", "Speaker Mixer"}, diff --git a/3.2.34/bump/1029_linux-3.2.30.patch b/3.2.34/bump/1029_linux-3.2.30.patch new file mode 100644 index 0000000..86aea4b --- /dev/null +++ b/3.2.34/bump/1029_linux-3.2.30.patch @@ -0,0 +1,5552 @@ +diff --git a/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.txt b/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.txt +index ab22fe6..e39a0c0 100644 +--- a/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.txt ++++ b/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.txt +@@ -10,8 +10,8 @@ Required properties: + + Optional properties: + - fsl,card-wired : Indicate the card is wired to host permanently +-- fsl,cd-internal : Indicate to use controller internal card detection +-- fsl,wp-internal : Indicate to use controller internal write protection ++- fsl,cd-controller : Indicate 
to use controller internal card detection ++- fsl,wp-controller : Indicate to use controller internal write protection + - cd-gpios : Specify GPIOs for card detection + - wp-gpios : Specify GPIOs for write protection + +@@ -21,8 +21,8 @@ esdhc@70004000 { + compatible = "fsl,imx51-esdhc"; + reg = <0x70004000 0x4000>; + interrupts = <1>; +- fsl,cd-internal; +- fsl,wp-internal; ++ fsl,cd-controller; ++ fsl,wp-controller; + }; + + esdhc@70008000 { +diff --git a/Documentation/i2c/busses/i2c-i801 b/Documentation/i2c/busses/i2c-i801 +index 2871fd5..99d4e44 100644 +--- a/Documentation/i2c/busses/i2c-i801 ++++ b/Documentation/i2c/busses/i2c-i801 +@@ -20,6 +20,8 @@ Supported adapters: + * Intel Patsburg (PCH) + * Intel DH89xxCC (PCH) + * Intel Panther Point (PCH) ++ * Intel Lynx Point (PCH) ++ * Intel Lynx Point-LP (PCH) + Datasheets: Publicly available at the Intel website + + On Intel Patsburg and later chipsets, both the normal host SMBus controller +diff --git a/Makefile b/Makefile +index d96fc2a..9fd7e60 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 2 +-SUBLEVEL = 29 ++SUBLEVEL = 30 + EXTRAVERSION = + NAME = Saber-toothed Squirrel + +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index 987c72d..9fdc151 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -2065,6 +2065,7 @@ source "drivers/cpufreq/Kconfig" + config CPU_FREQ_IMX + tristate "CPUfreq driver for i.MX CPUs" + depends on ARCH_MXC && CPU_FREQ ++ select CPU_FREQ_TABLE + help + This enables the CPUfreq driver for i.MX CPUs. + +diff --git a/arch/arm/Makefile b/arch/arm/Makefile +index dfcf3b0..362c7ca 100644 +--- a/arch/arm/Makefile ++++ b/arch/arm/Makefile +@@ -284,10 +284,10 @@ zImage Image xipImage bootpImage uImage: vmlinux + zinstall uinstall install: vmlinux + $(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $@ + +-%.dtb: ++%.dtb: scripts + $(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $(boot)/$@ + +-dtbs: ++dtbs: scripts + $(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $(boot)/$@ + + # We use MRPROPER_FILES and CLEAN_FILES now +diff --git a/arch/arm/boot/dts/imx51-babbage.dts b/arch/arm/boot/dts/imx51-babbage.dts +index f8766af..4790df2 100644 +--- a/arch/arm/boot/dts/imx51-babbage.dts ++++ b/arch/arm/boot/dts/imx51-babbage.dts +@@ -29,8 +29,8 @@ + aips@70000000 { /* aips-1 */ + spba@70000000 { + esdhc@70004000 { /* ESDHC1 */ +- fsl,cd-internal; +- fsl,wp-internal; ++ fsl,cd-controller; ++ fsl,wp-controller; + status = "okay"; + }; + +diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h +index 8512475..9b419ab 100644 +--- a/arch/arm/include/asm/pgtable.h ++++ b/arch/arm/include/asm/pgtable.h +@@ -232,6 +232,18 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd) + #define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext) + #define pte_clear(mm,addr,ptep) set_pte_ext(ptep, __pte(0), 0) + ++#define pte_none(pte) (!pte_val(pte)) ++#define pte_present(pte) (pte_val(pte) & L_PTE_PRESENT) ++#define pte_write(pte) (!(pte_val(pte) & L_PTE_RDONLY)) ++#define pte_dirty(pte) (pte_val(pte) & L_PTE_DIRTY) ++#define pte_young(pte) (pte_val(pte) & L_PTE_YOUNG) ++#define pte_exec(pte) (!(pte_val(pte) & L_PTE_XN)) ++#define pte_special(pte) (0) ++ ++#define pte_present_user(pte) \ ++ ((pte_val(pte) & (L_PTE_PRESENT | L_PTE_USER)) == \ ++ (L_PTE_PRESENT | L_PTE_USER)) ++ + #if __LINUX_ARM_ARCH__ < 6 + static inline void __sync_icache_dcache(pte_t pteval) + { +@@ -243,25 +255,15 @@ extern void __sync_icache_dcache(pte_t pteval); + static inline void set_pte_at(struct 
mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) + { +- if (addr >= TASK_SIZE) +- set_pte_ext(ptep, pteval, 0); +- else { ++ unsigned long ext = 0; ++ ++ if (addr < TASK_SIZE && pte_present_user(pteval)) { + __sync_icache_dcache(pteval); +- set_pte_ext(ptep, pteval, PTE_EXT_NG); ++ ext |= PTE_EXT_NG; + } +-} + +-#define pte_none(pte) (!pte_val(pte)) +-#define pte_present(pte) (pte_val(pte) & L_PTE_PRESENT) +-#define pte_write(pte) (!(pte_val(pte) & L_PTE_RDONLY)) +-#define pte_dirty(pte) (pte_val(pte) & L_PTE_DIRTY) +-#define pte_young(pte) (pte_val(pte) & L_PTE_YOUNG) +-#define pte_exec(pte) (!(pte_val(pte) & L_PTE_XN)) +-#define pte_special(pte) (0) +- +-#define pte_present_user(pte) \ +- ((pte_val(pte) & (L_PTE_PRESENT | L_PTE_USER)) == \ +- (L_PTE_PRESENT | L_PTE_USER)) ++ set_pte_ext(ptep, pteval, ext); ++} + + #define PTE_BIT_FUNC(fn,op) \ + static inline pte_t pte_##fn(pte_t pte) { pte_val(pte) op; return pte; } +diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c +index 814a52a9..2bc1a8e 100644 +--- a/arch/arm/kernel/hw_breakpoint.c ++++ b/arch/arm/kernel/hw_breakpoint.c +@@ -160,6 +160,12 @@ static int debug_arch_supported(void) + arch >= ARM_DEBUG_ARCH_V7_1; + } + ++/* Can we determine the watchpoint access type from the fsr? */ ++static int debug_exception_updates_fsr(void) ++{ ++ return 0; ++} ++ + /* Determine number of WRP registers available. */ + static int get_num_wrp_resources(void) + { +@@ -620,18 +626,35 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) + info->address &= ~alignment_mask; + info->ctrl.len <<= offset; + +- /* +- * Currently we rely on an overflow handler to take +- * care of single-stepping the breakpoint when it fires. +- * In the case of userspace breakpoints on a core with V7 debug, +- * we can use the mismatch feature as a poor-man's hardware +- * single-step, but this only works for per-task breakpoints. +- */ +- if (!bp->overflow_handler && (arch_check_bp_in_kernelspace(bp) || +- !core_has_mismatch_brps() || !bp->hw.bp_target)) { +- pr_warning("overflow handler required but none found\n"); +- ret = -EINVAL; ++ if (!bp->overflow_handler) { ++ /* ++ * Mismatch breakpoints are required for single-stepping ++ * breakpoints. ++ */ ++ if (!core_has_mismatch_brps()) ++ return -EINVAL; ++ ++ /* We don't allow mismatch breakpoints in kernel space. */ ++ if (arch_check_bp_in_kernelspace(bp)) ++ return -EPERM; ++ ++ /* ++ * Per-cpu breakpoints are not supported by our stepping ++ * mechanism. ++ */ ++ if (!bp->hw.bp_target) ++ return -EINVAL; ++ ++ /* ++ * We only support specific access types if the fsr ++ * reports them. ++ */ ++ if (!debug_exception_updates_fsr() && ++ (info->ctrl.type == ARM_BREAKPOINT_LOAD || ++ info->ctrl.type == ARM_BREAKPOINT_STORE)) ++ return -EINVAL; + } ++ + out: + return ret; + } +@@ -707,10 +730,12 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr, + goto unlock; + + /* Check that the access type matches. */ +- access = (fsr & ARM_FSR_ACCESS_MASK) ? HW_BREAKPOINT_W : +- HW_BREAKPOINT_R; +- if (!(access & hw_breakpoint_type(wp))) +- goto unlock; ++ if (debug_exception_updates_fsr()) { ++ access = (fsr & ARM_FSR_ACCESS_MASK) ? ++ HW_BREAKPOINT_W : HW_BREAKPOINT_R; ++ if (!(access & hw_breakpoint_type(wp))) ++ goto unlock; ++ } + + /* We have a winner. 
*/ + info->trigger = addr; +diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c +index 8380bd1..7ac5dfd 100644 +--- a/arch/arm/kernel/traps.c ++++ b/arch/arm/kernel/traps.c +@@ -380,20 +380,23 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs) + #endif + instr = *(u32 *) pc; + } else if (thumb_mode(regs)) { +- get_user(instr, (u16 __user *)pc); ++ if (get_user(instr, (u16 __user *)pc)) ++ goto die_sig; + if (is_wide_instruction(instr)) { + unsigned int instr2; +- get_user(instr2, (u16 __user *)pc+1); ++ if (get_user(instr2, (u16 __user *)pc+1)) ++ goto die_sig; + instr <<= 16; + instr |= instr2; + } +- } else { +- get_user(instr, (u32 __user *)pc); ++ } else if (get_user(instr, (u32 __user *)pc)) { ++ goto die_sig; + } + + if (call_undef_hook(regs, instr) == 0) + return; + ++die_sig: + #ifdef CONFIG_DEBUG_USER + if (user_debug & UDBG_UNDEFINED) { + printk(KERN_INFO "%s (%d): undefined instruction: pc=%p\n", +diff --git a/arch/arm/mach-dove/common.c b/arch/arm/mach-dove/common.c +index 1620b15..cb105bf8 100644 +--- a/arch/arm/mach-dove/common.c ++++ b/arch/arm/mach-dove/common.c +@@ -92,7 +92,7 @@ void __init dove_ge00_init(struct mv643xx_eth_platform_data *eth_data) + { + orion_ge00_init(eth_data, &dove_mbus_dram_info, + DOVE_GE00_PHYS_BASE, IRQ_DOVE_GE00_SUM, +- 0, get_tclk()); ++ 0, get_tclk(), 1600); + } + + /***************************************************************************** +diff --git a/arch/arm/mach-imx/hotplug.c b/arch/arm/mach-imx/hotplug.c +index 20ed2d5..f8f7437 100644 +--- a/arch/arm/mach-imx/hotplug.c ++++ b/arch/arm/mach-imx/hotplug.c +@@ -42,22 +42,6 @@ static inline void cpu_enter_lowpower(void) + : "cc"); + } + +-static inline void cpu_leave_lowpower(void) +-{ +- unsigned int v; +- +- asm volatile( +- "mrc p15, 0, %0, c1, c0, 0\n" +- " orr %0, %0, %1\n" +- " mcr p15, 0, %0, c1, c0, 0\n" +- " mrc p15, 0, %0, c1, c0, 1\n" +- " orr %0, %0, %2\n" +- " mcr p15, 0, %0, c1, c0, 1\n" +- : "=&r" (v) +- : "Ir" (CR_C), "Ir" (0x40) +- : "cc"); +-} +- + /* + * platform-specific code to shutdown a CPU + * +@@ -67,11 +51,10 @@ void platform_cpu_die(unsigned int cpu) + { + cpu_enter_lowpower(); + imx_enable_cpu(cpu, false); +- cpu_do_idle(); +- cpu_leave_lowpower(); + +- /* We should never return from idle */ +- panic("cpu %d unexpectedly exit from shutdown\n", cpu); ++ /* spin here until hardware takes it down */ ++ while (1) ++ ; + } + + int platform_cpu_disable(unsigned int cpu) +diff --git a/arch/arm/mach-kirkwood/common.c b/arch/arm/mach-kirkwood/common.c +index c5dbbb3..06faa97 100644 +--- a/arch/arm/mach-kirkwood/common.c ++++ b/arch/arm/mach-kirkwood/common.c +@@ -88,7 +88,7 @@ void __init kirkwood_ge00_init(struct mv643xx_eth_platform_data *eth_data) + + orion_ge00_init(eth_data, &kirkwood_mbus_dram_info, + GE00_PHYS_BASE, IRQ_KIRKWOOD_GE00_SUM, +- IRQ_KIRKWOOD_GE00_ERR, kirkwood_tclk); ++ IRQ_KIRKWOOD_GE00_ERR, kirkwood_tclk, 1600); + } + + +@@ -102,7 +102,7 @@ void __init kirkwood_ge01_init(struct mv643xx_eth_platform_data *eth_data) + + orion_ge01_init(eth_data, &kirkwood_mbus_dram_info, + GE01_PHYS_BASE, IRQ_KIRKWOOD_GE01_SUM, +- IRQ_KIRKWOOD_GE01_ERR, kirkwood_tclk); ++ IRQ_KIRKWOOD_GE01_ERR, kirkwood_tclk, 1600); + } + + +diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c +index d90e244..570ee4d 100644 +--- a/arch/arm/mach-mv78xx0/common.c ++++ b/arch/arm/mach-mv78xx0/common.c +@@ -202,7 +202,8 @@ void __init mv78xx0_ge00_init(struct mv643xx_eth_platform_data *eth_data) + { + orion_ge00_init(eth_data, 
&mv78xx0_mbus_dram_info, + GE00_PHYS_BASE, IRQ_MV78XX0_GE00_SUM, +- IRQ_MV78XX0_GE_ERR, get_tclk()); ++ IRQ_MV78XX0_GE_ERR, get_tclk(), ++ MV643XX_TX_CSUM_DEFAULT_LIMIT); + } + + +@@ -213,7 +214,8 @@ void __init mv78xx0_ge01_init(struct mv643xx_eth_platform_data *eth_data) + { + orion_ge01_init(eth_data, &mv78xx0_mbus_dram_info, + GE01_PHYS_BASE, IRQ_MV78XX0_GE01_SUM, +- NO_IRQ, get_tclk()); ++ NO_IRQ, get_tclk(), ++ MV643XX_TX_CSUM_DEFAULT_LIMIT); + } + + +diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c +index 53b68b8..20260db 100644 +--- a/arch/arm/mach-orion5x/common.c ++++ b/arch/arm/mach-orion5x/common.c +@@ -95,7 +95,8 @@ void __init orion5x_eth_init(struct mv643xx_eth_platform_data *eth_data) + { + orion_ge00_init(eth_data, &orion5x_mbus_dram_info, + ORION5X_ETH_PHYS_BASE, IRQ_ORION5X_ETH_SUM, +- IRQ_ORION5X_ETH_ERR, orion5x_tclk); ++ IRQ_ORION5X_ETH_ERR, orion5x_tclk, ++ MV643XX_TX_CSUM_DEFAULT_LIMIT); + } + + +diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c +index 1a8d4aa..8fda9f7 100644 +--- a/arch/arm/mm/flush.c ++++ b/arch/arm/mm/flush.c +@@ -236,8 +236,6 @@ void __sync_icache_dcache(pte_t pteval) + struct page *page; + struct address_space *mapping; + +- if (!pte_present_user(pteval)) +- return; + if (cache_is_vipt_nonaliasing() && !pte_exec(pteval)) + /* only flush non-aliasing VIPT caches for exec mappings */ + return; +diff --git a/arch/arm/plat-omap/dmtimer.c b/arch/arm/plat-omap/dmtimer.c +index af3b92b..f9adbbb 100644 +--- a/arch/arm/plat-omap/dmtimer.c ++++ b/arch/arm/plat-omap/dmtimer.c +@@ -236,7 +236,7 @@ EXPORT_SYMBOL_GPL(omap_dm_timer_enable); + + void omap_dm_timer_disable(struct omap_dm_timer *timer) + { +- pm_runtime_put(&timer->pdev->dev); ++ pm_runtime_put_sync(&timer->pdev->dev); + } + EXPORT_SYMBOL_GPL(omap_dm_timer_disable); + +diff --git a/arch/arm/plat-orion/common.c b/arch/arm/plat-orion/common.c +index 11dce87..8a6886a 100644 +--- a/arch/arm/plat-orion/common.c ++++ b/arch/arm/plat-orion/common.c +@@ -263,10 +263,12 @@ void __init orion_ge00_init(struct mv643xx_eth_platform_data *eth_data, + unsigned long mapbase, + unsigned long irq, + unsigned long irq_err, +- int tclk) ++ int tclk, ++ unsigned int tx_csum_limit) + { + fill_resources(&orion_ge00_shared, orion_ge00_shared_resources, + mapbase + 0x2000, SZ_16K - 1, irq_err); ++ orion_ge00_shared_data.tx_csum_limit = tx_csum_limit; + ge_complete(&orion_ge00_shared_data, mbus_dram_info, tclk, + orion_ge00_resources, irq, &orion_ge00_shared, + eth_data, &orion_ge00); +@@ -317,10 +319,12 @@ void __init orion_ge01_init(struct mv643xx_eth_platform_data *eth_data, + unsigned long mapbase, + unsigned long irq, + unsigned long irq_err, +- int tclk) ++ int tclk, ++ unsigned int tx_csum_limit) + { + fill_resources(&orion_ge01_shared, orion_ge01_shared_resources, + mapbase + 0x2000, SZ_16K - 1, irq_err); ++ orion_ge01_shared_data.tx_csum_limit = tx_csum_limit; + ge_complete(&orion_ge01_shared_data, mbus_dram_info, tclk, + orion_ge01_resources, irq, &orion_ge01_shared, + eth_data, &orion_ge01); +diff --git a/arch/arm/plat-orion/include/plat/common.h b/arch/arm/plat-orion/include/plat/common.h +index a2c0e31..b637dae 100644 +--- a/arch/arm/plat-orion/include/plat/common.h ++++ b/arch/arm/plat-orion/include/plat/common.h +@@ -41,14 +41,16 @@ void __init orion_ge00_init(struct mv643xx_eth_platform_data *eth_data, + unsigned long mapbase, + unsigned long irq, + unsigned long irq_err, +- int tclk); ++ int tclk, ++ unsigned int tx_csum_limit); + + void __init orion_ge01_init(struct 
mv643xx_eth_platform_data *eth_data, + struct mbus_dram_target_info *mbus_dram_info, + unsigned long mapbase, + unsigned long irq, + unsigned long irq_err, +- int tclk); ++ int tclk, ++ unsigned int tx_csum_limit); + + void __init orion_ge10_init(struct mv643xx_eth_platform_data *eth_data, + struct mbus_dram_target_info *mbus_dram_info, +diff --git a/arch/arm/plat-s3c24xx/dma.c b/arch/arm/plat-s3c24xx/dma.c +index 8a90b6a..1eedf8d 100644 +--- a/arch/arm/plat-s3c24xx/dma.c ++++ b/arch/arm/plat-s3c24xx/dma.c +@@ -431,7 +431,7 @@ s3c2410_dma_canload(struct s3c2410_dma_chan *chan) + * when necessary. + */ + +-int s3c2410_dma_enqueue(unsigned int channel, void *id, ++int s3c2410_dma_enqueue(enum dma_ch channel, void *id, + dma_addr_t data, int size) + { + struct s3c2410_dma_chan *chan = s3c_dma_lookup_channel(channel); +diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h +index 4054b31..c4b779b 100644 +--- a/arch/parisc/include/asm/atomic.h ++++ b/arch/parisc/include/asm/atomic.h +@@ -247,7 +247,7 @@ static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) + + #define atomic_sub_and_test(i,v) (atomic_sub_return((i),(v)) == 0) + +-#define ATOMIC_INIT(i) ((atomic_t) { (i) }) ++#define ATOMIC_INIT(i) { (i) } + + #define smp_mb__before_atomic_dec() smp_mb() + #define smp_mb__after_atomic_dec() smp_mb() +@@ -256,7 +256,7 @@ static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u) + + #ifdef CONFIG_64BIT + +-#define ATOMIC64_INIT(i) ((atomic64_t) { (i) }) ++#define ATOMIC64_INIT(i) { (i) } + + static __inline__ s64 + __atomic64_add_return(s64 i, atomic64_t *v) +diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c +index 7c5324f..cc20b0a 100644 +--- a/arch/powerpc/kernel/asm-offsets.c ++++ b/arch/powerpc/kernel/asm-offsets.c +@@ -79,6 +79,7 @@ int main(void) + DEFINE(SIGSEGV, SIGSEGV); + DEFINE(NMI_MASK, NMI_MASK); + DEFINE(THREAD_DSCR, offsetof(struct thread_struct, dscr)); ++ DEFINE(THREAD_DSCR_INHERIT, offsetof(struct thread_struct, dscr_inherit)); + #else + DEFINE(THREAD_INFO, offsetof(struct task_struct, stack)); + #endif /* CONFIG_PPC64 */ +diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c +index 2cc451a..6856062 100644 +--- a/arch/powerpc/kernel/dbell.c ++++ b/arch/powerpc/kernel/dbell.c +@@ -28,6 +28,8 @@ void doorbell_setup_this_cpu(void) + + void doorbell_cause_ipi(int cpu, unsigned long data) + { ++ /* Order previous accesses vs. msgsnd, which is treated as a store */ ++ mb(); + ppc_msgsnd(PPC_DBELL, 0, data); + } + +diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S +index d834425..654fc53 100644 +--- a/arch/powerpc/kernel/entry_64.S ++++ b/arch/powerpc/kernel/entry_64.S +@@ -380,6 +380,12 @@ _GLOBAL(ret_from_fork) + li r3,0 + b syscall_exit + ++ .section ".toc","aw" ++DSCR_DEFAULT: ++ .tc dscr_default[TC],dscr_default ++ ++ .section ".text" ++ + /* + * This routine switches between two different tasks. The process + * state of one is saved on its kernel stack. 
Then the state +@@ -519,9 +525,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) + mr r1,r8 /* start using new stack pointer */ + std r7,PACAKSAVE(r13) + +- ld r6,_CCR(r1) +- mtcrf 0xFF,r6 +- + #ifdef CONFIG_ALTIVEC + BEGIN_FTR_SECTION + ld r0,THREAD_VRSAVE(r4) +@@ -530,14 +533,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) + #endif /* CONFIG_ALTIVEC */ + #ifdef CONFIG_PPC64 + BEGIN_FTR_SECTION ++ lwz r6,THREAD_DSCR_INHERIT(r4) ++ ld r7,DSCR_DEFAULT@toc(2) + ld r0,THREAD_DSCR(r4) +- cmpd r0,r25 +- beq 1f ++ cmpwi r6,0 ++ bne 1f ++ ld r0,0(r7) ++1: cmpd r0,r25 ++ beq 2f + mtspr SPRN_DSCR,r0 +-1: ++2: + END_FTR_SECTION_IFSET(CPU_FTR_DSCR) + #endif + ++ ld r6,_CCR(r1) ++ mtcrf 0xFF,r6 ++ + /* r3-r13 are destroyed -- Cort */ + REST_8GPRS(14, r1) + REST_10GPRS(22, r1) +diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c +index 6457574..d687e3f 100644 +--- a/arch/powerpc/kernel/process.c ++++ b/arch/powerpc/kernel/process.c +@@ -778,16 +778,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, + #endif /* CONFIG_PPC_STD_MMU_64 */ + #ifdef CONFIG_PPC64 + if (cpu_has_feature(CPU_FTR_DSCR)) { +- if (current->thread.dscr_inherit) { +- p->thread.dscr_inherit = 1; +- p->thread.dscr = current->thread.dscr; +- } else if (0 != dscr_default) { +- p->thread.dscr_inherit = 1; +- p->thread.dscr = dscr_default; +- } else { +- p->thread.dscr_inherit = 0; +- p->thread.dscr = 0; +- } ++ p->thread.dscr_inherit = current->thread.dscr_inherit; ++ p->thread.dscr = current->thread.dscr; + } + #endif + +diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c +index 6df7090..fe04b4a 100644 +--- a/arch/powerpc/kernel/smp.c ++++ b/arch/powerpc/kernel/smp.c +@@ -214,8 +214,15 @@ void smp_muxed_ipi_message_pass(int cpu, int msg) + struct cpu_messages *info = &per_cpu(ipi_message, cpu); + char *message = (char *)&info->messages; + ++ /* ++ * Order previous accesses before accesses in the IPI handler. ++ */ ++ smp_mb(); + message[msg] = 1; +- mb(); ++ /* ++ * cause_ipi functions are required to include a full barrier ++ * before doing whatever causes the IPI. 
++ */ + smp_ops->cause_ipi(cpu, info->data); + } + +@@ -227,7 +234,7 @@ irqreturn_t smp_ipi_demux(void) + mb(); /* order any irq clear */ + + do { +- all = xchg_local(&info->messages, 0); ++ all = xchg(&info->messages, 0); + + #ifdef __BIG_ENDIAN + if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNCTION))) +diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c +index ce035c1..55be64d 100644 +--- a/arch/powerpc/kernel/sysfs.c ++++ b/arch/powerpc/kernel/sysfs.c +@@ -192,6 +192,14 @@ static ssize_t show_dscr_default(struct sysdev_class *class, + return sprintf(buf, "%lx\n", dscr_default); + } + ++static void update_dscr(void *dummy) ++{ ++ if (!current->thread.dscr_inherit) { ++ current->thread.dscr = dscr_default; ++ mtspr(SPRN_DSCR, dscr_default); ++ } ++} ++ + static ssize_t __used store_dscr_default(struct sysdev_class *class, + struct sysdev_class_attribute *attr, const char *buf, + size_t count) +@@ -204,6 +212,8 @@ static ssize_t __used store_dscr_default(struct sysdev_class *class, + return -EINVAL; + dscr_default = val; + ++ on_each_cpu(update_dscr, NULL, 1); ++ + return count; + } + +diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c +index 5459d14..82dcd4d 100644 +--- a/arch/powerpc/kernel/traps.c ++++ b/arch/powerpc/kernel/traps.c +@@ -942,8 +942,9 @@ static int emulate_instruction(struct pt_regs *regs) + cpu_has_feature(CPU_FTR_DSCR)) { + PPC_WARN_EMULATED(mtdscr, regs); + rd = (instword >> 21) & 0x1f; +- mtspr(SPRN_DSCR, regs->gpr[rd]); ++ current->thread.dscr = regs->gpr[rd]; + current->thread.dscr_inherit = 1; ++ mtspr(SPRN_DSCR, current->thread.dscr); + return 0; + } + #endif +diff --git a/arch/powerpc/sysdev/xics/icp-hv.c b/arch/powerpc/sysdev/xics/icp-hv.c +index 9518d36..5c76bf7 100644 +--- a/arch/powerpc/sysdev/xics/icp-hv.c ++++ b/arch/powerpc/sysdev/xics/icp-hv.c +@@ -27,33 +27,53 @@ static inline unsigned int icp_hv_get_xirr(unsigned char cppr) + { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; ++ unsigned int ret = XICS_IRQ_SPURIOUS; + + rc = plpar_hcall(H_XIRR, retbuf, cppr); +- if (rc != H_SUCCESS) +- panic(" bad return code xirr - rc = %lx\n", rc); +- return (unsigned int)retbuf[0]; ++ if (rc == H_SUCCESS) { ++ ret = (unsigned int)retbuf[0]; ++ } else { ++ pr_err("%s: bad return code xirr cppr=0x%x returned %ld\n", ++ __func__, cppr, rc); ++ WARN_ON_ONCE(1); ++ } ++ ++ return ret; + } + + static inline void icp_hv_set_xirr(unsigned int value) + { + long rc = plpar_hcall_norets(H_EOI, value); +- if (rc != H_SUCCESS) +- panic("bad return code EOI - rc = %ld, value=%x\n", rc, value); ++ if (rc != H_SUCCESS) { ++ pr_err("%s: bad return code eoi xirr=0x%x returned %ld\n", ++ __func__, value, rc); ++ WARN_ON_ONCE(1); ++ } + } + + static inline void icp_hv_set_cppr(u8 value) + { + long rc = plpar_hcall_norets(H_CPPR, value); +- if (rc != H_SUCCESS) +- panic("bad return code cppr - rc = %lx\n", rc); ++ if (rc != H_SUCCESS) { ++ pr_err("%s: bad return code cppr cppr=0x%x returned %ld\n", ++ __func__, value, rc); ++ WARN_ON_ONCE(1); ++ } + } + + static inline void icp_hv_set_qirr(int n_cpu , u8 value) + { +- long rc = plpar_hcall_norets(H_IPI, get_hard_smp_processor_id(n_cpu), +- value); +- if (rc != H_SUCCESS) +- panic("bad return code qirr - rc = %lx\n", rc); ++ int hw_cpu = get_hard_smp_processor_id(n_cpu); ++ long rc; ++ ++ /* Make sure all previous accesses are ordered before IPI sending */ ++ mb(); ++ rc = plpar_hcall_norets(H_IPI, hw_cpu, value); ++ if (rc != H_SUCCESS) { ++ pr_err("%s: bad return code qirr cpu=%d hw_cpu=%d 
mfrr=0x%x " ++ "returned %ld\n", __func__, n_cpu, hw_cpu, value, rc); ++ WARN_ON_ONCE(1); ++ } + } + + static void icp_hv_eoi(struct irq_data *d) +diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c +index b2c7179..bb104b4 100644 +--- a/arch/x86/xen/setup.c ++++ b/arch/x86/xen/setup.c +@@ -78,9 +78,16 @@ static void __init xen_add_extra_mem(u64 start, u64 size) + memblock_x86_reserve_range(start, start + size, "XEN EXTRA"); + + xen_max_p2m_pfn = PFN_DOWN(start + size); ++ for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { ++ unsigned long mfn = pfn_to_mfn(pfn); ++ ++ if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) ++ continue; ++ WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", ++ pfn, mfn); + +- for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++) + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); ++ } + } + + static unsigned long __init xen_release_chunk(unsigned long start, +diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c +index fb65915..608257a 100644 +--- a/drivers/ata/ahci.c ++++ b/drivers/ata/ahci.c +@@ -386,6 +386,8 @@ static const struct pci_device_id ahci_pci_tbl[] = { + .driver_data = board_ahci_yes_fbs }, /* 88se9125 */ + { PCI_DEVICE(0x1b4b, 0x917a), + .driver_data = board_ahci_yes_fbs }, /* 88se9172 */ ++ { PCI_DEVICE(0x1b4b, 0x9192), ++ .driver_data = board_ahci_yes_fbs }, /* 88se9172 on some Gigabyte */ + { PCI_DEVICE(0x1b4b, 0x91a3), + .driver_data = board_ahci_yes_fbs }, + +diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c +index 8323fc3..3f1799b 100644 +--- a/drivers/gpu/drm/drm_crtc.c ++++ b/drivers/gpu/drm/drm_crtc.c +@@ -1625,10 +1625,8 @@ int drm_mode_cursor_ioctl(struct drm_device *dev, + if (!drm_core_check_feature(dev, DRIVER_MODESET)) + return -EINVAL; + +- if (!req->flags) { +- DRM_ERROR("no operation set\n"); ++ if (!req->flags || (~DRM_MODE_CURSOR_FLAGS & req->flags)) + return -EINVAL; +- } + + mutex_lock(&dev->mode_config.mutex); + obj = drm_mode_object_find(dev, req->crtc_id, DRM_MODE_OBJECT_CRTC); +@@ -1641,7 +1639,6 @@ int drm_mode_cursor_ioctl(struct drm_device *dev, + + if (req->flags & DRM_MODE_CURSOR_BO) { + if (!crtc->funcs->cursor_set) { +- DRM_ERROR("crtc does not support cursor\n"); + ret = -ENXIO; + goto out; + } +@@ -1654,7 +1651,6 @@ int drm_mode_cursor_ioctl(struct drm_device *dev, + if (crtc->funcs->cursor_move) { + ret = crtc->funcs->cursor_move(crtc, req->x, req->y); + } else { +- DRM_ERROR("crtc does not support cursor\n"); + ret = -EFAULT; + goto out; + } +@@ -1692,14 +1688,11 @@ int drm_mode_addfb(struct drm_device *dev, + if (!drm_core_check_feature(dev, DRIVER_MODESET)) + return -EINVAL; + +- if ((config->min_width > r->width) || (r->width > config->max_width)) { +- DRM_ERROR("mode new framebuffer width not within limits\n"); ++ if ((config->min_width > r->width) || (r->width > config->max_width)) + return -EINVAL; +- } +- if ((config->min_height > r->height) || (r->height > config->max_height)) { +- DRM_ERROR("mode new framebuffer height not within limits\n"); ++ ++ if ((config->min_height > r->height) || (r->height > config->max_height)) + return -EINVAL; +- } + + mutex_lock(&dev->mode_config.mutex); + +@@ -1756,7 +1749,6 @@ int drm_mode_rmfb(struct drm_device *dev, + obj = drm_mode_object_find(dev, *id, DRM_MODE_OBJECT_FB); + /* TODO check that we really get a framebuffer back. 
*/ + if (!obj) { +- DRM_ERROR("mode invalid framebuffer id\n"); + ret = -EINVAL; + goto out; + } +@@ -1767,7 +1759,6 @@ int drm_mode_rmfb(struct drm_device *dev, + found = 1; + + if (!found) { +- DRM_ERROR("tried to remove a fb that we didn't own\n"); + ret = -EINVAL; + goto out; + } +@@ -1814,7 +1805,6 @@ int drm_mode_getfb(struct drm_device *dev, + mutex_lock(&dev->mode_config.mutex); + obj = drm_mode_object_find(dev, r->fb_id, DRM_MODE_OBJECT_FB); + if (!obj) { +- DRM_ERROR("invalid framebuffer id\n"); + ret = -EINVAL; + goto out; + } +@@ -1850,7 +1840,6 @@ int drm_mode_dirtyfb_ioctl(struct drm_device *dev, + mutex_lock(&dev->mode_config.mutex); + obj = drm_mode_object_find(dev, r->fb_id, DRM_MODE_OBJECT_FB); + if (!obj) { +- DRM_ERROR("invalid framebuffer id\n"); + ret = -EINVAL; + goto out_err1; + } +diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c +index a1ee634..0c1a99b 100644 +--- a/drivers/gpu/drm/drm_edid.c ++++ b/drivers/gpu/drm/drm_edid.c +@@ -66,6 +66,8 @@ + #define EDID_QUIRK_FIRST_DETAILED_PREFERRED (1 << 5) + /* use +hsync +vsync for detailed mode */ + #define EDID_QUIRK_DETAILED_SYNC_PP (1 << 6) ++/* Force reduced-blanking timings for detailed modes */ ++#define EDID_QUIRK_FORCE_REDUCED_BLANKING (1 << 7) + + struct detailed_mode_closure { + struct drm_connector *connector; +@@ -85,6 +87,9 @@ static struct edid_quirk { + int product_id; + u32 quirks; + } edid_quirk_list[] = { ++ /* ASUS VW222S */ ++ { "ACI", 0x22a2, EDID_QUIRK_FORCE_REDUCED_BLANKING }, ++ + /* Acer AL1706 */ + { "ACR", 44358, EDID_QUIRK_PREFER_LARGE_60 }, + /* Acer F51 */ +@@ -120,6 +125,9 @@ static struct edid_quirk { + /* Samsung SyncMaster 22[5-6]BW */ + { "SAM", 596, EDID_QUIRK_PREFER_LARGE_60 }, + { "SAM", 638, EDID_QUIRK_PREFER_LARGE_60 }, ++ ++ /* ViewSonic VA2026w */ ++ { "VSC", 5020, EDID_QUIRK_FORCE_REDUCED_BLANKING }, + }; + + /*** DDC fetch and block validation ***/ +@@ -863,12 +871,19 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, + "Wrong Hsync/Vsync pulse width\n"); + return NULL; + } ++ ++ if (quirks & EDID_QUIRK_FORCE_REDUCED_BLANKING) { ++ mode = drm_cvt_mode(dev, hactive, vactive, 60, true, false, false); ++ if (!mode) ++ return NULL; ++ ++ goto set_size; ++ } ++ + mode = drm_mode_create(dev); + if (!mode) + return NULL; + +- mode->type = DRM_MODE_TYPE_DRIVER; +- + if (quirks & EDID_QUIRK_135_CLOCK_TOO_HIGH) + timing->pixel_clock = cpu_to_le16(1088); + +@@ -892,8 +907,6 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, + + drm_mode_do_interlace_quirk(mode, pt); + +- drm_mode_set_name(mode); +- + if (quirks & EDID_QUIRK_DETAILED_SYNC_PP) { + pt->misc |= DRM_EDID_PT_HSYNC_POSITIVE | DRM_EDID_PT_VSYNC_POSITIVE; + } +@@ -903,6 +916,7 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, + mode->flags |= (pt->misc & DRM_EDID_PT_VSYNC_POSITIVE) ? 
+ DRM_MODE_FLAG_PVSYNC : DRM_MODE_FLAG_NVSYNC; + ++set_size: + mode->width_mm = pt->width_mm_lo | (pt->width_height_mm_hi & 0xf0) << 4; + mode->height_mm = pt->height_mm_lo | (pt->width_height_mm_hi & 0xf) << 8; + +@@ -916,6 +930,9 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev, + mode->height_mm = edid->height_cm * 10; + } + ++ mode->type = DRM_MODE_TYPE_DRIVER; ++ drm_mode_set_name(mode); ++ + return mode; + } + +diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c +index 578ddfc..c8b5bc1 100644 +--- a/drivers/gpu/drm/i915/i915_irq.c ++++ b/drivers/gpu/drm/i915/i915_irq.c +@@ -2006,10 +2006,22 @@ static int i915_driver_irq_postinstall(struct drm_device *dev) + hotplug_en |= HDMIC_HOTPLUG_INT_EN; + if (dev_priv->hotplug_supported_mask & HDMID_HOTPLUG_INT_STATUS) + hotplug_en |= HDMID_HOTPLUG_INT_EN; +- if (dev_priv->hotplug_supported_mask & SDVOC_HOTPLUG_INT_STATUS) +- hotplug_en |= SDVOC_HOTPLUG_INT_EN; +- if (dev_priv->hotplug_supported_mask & SDVOB_HOTPLUG_INT_STATUS) +- hotplug_en |= SDVOB_HOTPLUG_INT_EN; ++ if (IS_G4X(dev)) { ++ if (dev_priv->hotplug_supported_mask & SDVOC_HOTPLUG_INT_STATUS_G4X) ++ hotplug_en |= SDVOC_HOTPLUG_INT_EN; ++ if (dev_priv->hotplug_supported_mask & SDVOB_HOTPLUG_INT_STATUS_G4X) ++ hotplug_en |= SDVOB_HOTPLUG_INT_EN; ++ } else if (IS_GEN4(dev)) { ++ if (dev_priv->hotplug_supported_mask & SDVOC_HOTPLUG_INT_STATUS_I965) ++ hotplug_en |= SDVOC_HOTPLUG_INT_EN; ++ if (dev_priv->hotplug_supported_mask & SDVOB_HOTPLUG_INT_STATUS_I965) ++ hotplug_en |= SDVOB_HOTPLUG_INT_EN; ++ } else { ++ if (dev_priv->hotplug_supported_mask & SDVOC_HOTPLUG_INT_STATUS_I915) ++ hotplug_en |= SDVOC_HOTPLUG_INT_EN; ++ if (dev_priv->hotplug_supported_mask & SDVOB_HOTPLUG_INT_STATUS_I915) ++ hotplug_en |= SDVOB_HOTPLUG_INT_EN; ++ } + if (dev_priv->hotplug_supported_mask & CRT_HOTPLUG_INT_STATUS) { + hotplug_en |= CRT_HOTPLUG_INT_EN; + +diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h +index fd53122..4a5e662 100644 +--- a/drivers/gpu/drm/i915/i915_reg.h ++++ b/drivers/gpu/drm/i915/i915_reg.h +@@ -1419,14 +1419,20 @@ + #define DPC_HOTPLUG_INT_STATUS (1 << 28) + #define HDMID_HOTPLUG_INT_STATUS (1 << 27) + #define DPD_HOTPLUG_INT_STATUS (1 << 27) ++/* CRT/TV common between gen3+ */ + #define CRT_HOTPLUG_INT_STATUS (1 << 11) + #define TV_HOTPLUG_INT_STATUS (1 << 10) + #define CRT_HOTPLUG_MONITOR_MASK (3 << 8) + #define CRT_HOTPLUG_MONITOR_COLOR (3 << 8) + #define CRT_HOTPLUG_MONITOR_MONO (2 << 8) + #define CRT_HOTPLUG_MONITOR_NONE (0 << 8) +-#define SDVOC_HOTPLUG_INT_STATUS (1 << 7) +-#define SDVOB_HOTPLUG_INT_STATUS (1 << 6) ++/* SDVO is different across gen3/4 */ ++#define SDVOC_HOTPLUG_INT_STATUS_G4X (1 << 3) ++#define SDVOB_HOTPLUG_INT_STATUS_G4X (1 << 2) ++#define SDVOC_HOTPLUG_INT_STATUS_I965 (3 << 4) ++#define SDVOB_HOTPLUG_INT_STATUS_I965 (3 << 2) ++#define SDVOC_HOTPLUG_INT_STATUS_I915 (1 << 7) ++#define SDVOB_HOTPLUG_INT_STATUS_I915 (1 << 6) + + /* SDVO port control */ + #define SDVOB 0x61140 +diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c +index 3eed270..6c3fb44 100644 +--- a/drivers/gpu/drm/i915/intel_display.c ++++ b/drivers/gpu/drm/i915/intel_display.c +@@ -1072,8 +1072,8 @@ static void assert_pch_hdmi_disabled(struct drm_i915_private *dev_priv, + enum pipe pipe, int reg) + { + u32 val = I915_READ(reg); +- WARN(hdmi_pipe_enabled(dev_priv, val, pipe), +- "PCH DP (0x%08x) enabled on transcoder %c, should be disabled\n", ++ 
WARN(hdmi_pipe_enabled(dev_priv, pipe, val), ++ "PCH HDMI (0x%08x) enabled on transcoder %c, should be disabled\n", + reg, pipe_name(pipe)); + } + +@@ -1089,13 +1089,13 @@ static void assert_pch_ports_disabled(struct drm_i915_private *dev_priv, + + reg = PCH_ADPA; + val = I915_READ(reg); +- WARN(adpa_pipe_enabled(dev_priv, val, pipe), ++ WARN(adpa_pipe_enabled(dev_priv, pipe, val), + "PCH VGA enabled on transcoder %c, should be disabled\n", + pipe_name(pipe)); + + reg = PCH_LVDS; + val = I915_READ(reg); +- WARN(lvds_pipe_enabled(dev_priv, val, pipe), ++ WARN(lvds_pipe_enabled(dev_priv, pipe, val), + "PCH LVDS enabled on transcoder %c, should be disabled\n", + pipe_name(pipe)); + +@@ -1437,7 +1437,7 @@ static void disable_pch_hdmi(struct drm_i915_private *dev_priv, + enum pipe pipe, int reg) + { + u32 val = I915_READ(reg); +- if (hdmi_pipe_enabled(dev_priv, val, pipe)) { ++ if (hdmi_pipe_enabled(dev_priv, pipe, val)) { + DRM_DEBUG_KMS("Disabling pch HDMI %x on pipe %d\n", + reg, pipe); + I915_WRITE(reg, val & ~PORT_ENABLE); +@@ -1459,12 +1459,12 @@ static void intel_disable_pch_ports(struct drm_i915_private *dev_priv, + + reg = PCH_ADPA; + val = I915_READ(reg); +- if (adpa_pipe_enabled(dev_priv, val, pipe)) ++ if (adpa_pipe_enabled(dev_priv, pipe, val)) + I915_WRITE(reg, val & ~ADPA_DAC_ENABLE); + + reg = PCH_LVDS; + val = I915_READ(reg); +- if (lvds_pipe_enabled(dev_priv, val, pipe)) { ++ if (lvds_pipe_enabled(dev_priv, pipe, val)) { + DRM_DEBUG_KMS("disable lvds on pipe %d val 0x%08x\n", pipe, val); + I915_WRITE(reg, val & ~LVDS_PORT_EN); + POSTING_READ(reg); +@@ -2852,16 +2852,14 @@ static void intel_clear_scanline_wait(struct drm_device *dev) + + static void intel_crtc_wait_for_pending_flips(struct drm_crtc *crtc) + { +- struct drm_i915_gem_object *obj; +- struct drm_i915_private *dev_priv; ++ struct drm_device *dev = crtc->dev; + + if (crtc->fb == NULL) + return; + +- obj = to_intel_framebuffer(crtc->fb)->obj; +- dev_priv = crtc->dev->dev_private; +- wait_event(dev_priv->pending_flip_queue, +- atomic_read(&obj->pending_flip) == 0); ++ mutex_lock(&dev->struct_mutex); ++ intel_finish_fb(crtc->fb); ++ mutex_unlock(&dev->struct_mutex); + } + + static bool intel_crtc_driving_pch(struct drm_crtc *crtc) +@@ -3322,23 +3320,6 @@ static void intel_crtc_disable(struct drm_crtc *crtc) + struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; + struct drm_device *dev = crtc->dev; + +- /* Flush any pending WAITs before we disable the pipe. Note that +- * we need to drop the struct_mutex in order to acquire it again +- * during the lowlevel dpms routines around a couple of the +- * operations. It does not look trivial nor desirable to move +- * that locking higher. So instead we leave a window for the +- * submission of further commands on the fb before we can actually +- * disable it. This race with userspace exists anyway, and we can +- * only rely on the pipe being disabled by userspace after it +- * receives the hotplug notification and has flushed any pending +- * batches. 
+- */ +- if (crtc->fb) { +- mutex_lock(&dev->struct_mutex); +- intel_finish_fb(crtc->fb); +- mutex_unlock(&dev->struct_mutex); +- } +- + crtc_funcs->dpms(crtc, DRM_MODE_DPMS_OFF); + + if (crtc->fb) { +diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c +index ceec71b..f07bde2 100644 +--- a/drivers/gpu/drm/i915/intel_lvds.c ++++ b/drivers/gpu/drm/i915/intel_lvds.c +@@ -752,7 +752,7 @@ static const struct dmi_system_id intel_no_lvds[] = { + .ident = "Hewlett-Packard t5745", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), +- DMI_MATCH(DMI_BOARD_NAME, "hp t5745"), ++ DMI_MATCH(DMI_PRODUCT_NAME, "hp t5745"), + }, + }, + { +@@ -760,7 +760,7 @@ static const struct dmi_system_id intel_no_lvds[] = { + .ident = "Hewlett-Packard st5747", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), +- DMI_MATCH(DMI_BOARD_NAME, "hp st5747"), ++ DMI_MATCH(DMI_PRODUCT_NAME, "hp st5747"), + }, + }, + { +diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c +index a8d8ee5..bbf247c 100644 +--- a/drivers/gpu/drm/i915/intel_sdvo.c ++++ b/drivers/gpu/drm/i915/intel_sdvo.c +@@ -2514,6 +2514,7 @@ bool intel_sdvo_init(struct drm_device *dev, int sdvo_reg) + struct drm_i915_private *dev_priv = dev->dev_private; + struct intel_encoder *intel_encoder; + struct intel_sdvo *intel_sdvo; ++ u32 hotplug_mask; + int i; + + intel_sdvo = kzalloc(sizeof(struct intel_sdvo), GFP_KERNEL); +@@ -2544,10 +2545,17 @@ bool intel_sdvo_init(struct drm_device *dev, int sdvo_reg) + } + } + +- if (IS_SDVOB(sdvo_reg)) +- dev_priv->hotplug_supported_mask |= SDVOB_HOTPLUG_INT_STATUS; +- else +- dev_priv->hotplug_supported_mask |= SDVOC_HOTPLUG_INT_STATUS; ++ hotplug_mask = 0; ++ if (IS_G4X(dev)) { ++ hotplug_mask = IS_SDVOB(sdvo_reg) ? ++ SDVOB_HOTPLUG_INT_STATUS_G4X : SDVOC_HOTPLUG_INT_STATUS_G4X; ++ } else if (IS_GEN4(dev)) { ++ hotplug_mask = IS_SDVOB(sdvo_reg) ? ++ SDVOB_HOTPLUG_INT_STATUS_I965 : SDVOC_HOTPLUG_INT_STATUS_I965; ++ } else { ++ hotplug_mask = IS_SDVOB(sdvo_reg) ? ++ SDVOB_HOTPLUG_INT_STATUS_I915 : SDVOC_HOTPLUG_INT_STATUS_I915; ++ } + + drm_encoder_helper_add(&intel_encoder->base, &intel_sdvo_helper_funcs); + +@@ -2555,14 +2563,6 @@ bool intel_sdvo_init(struct drm_device *dev, int sdvo_reg) + if (!intel_sdvo_get_capabilities(intel_sdvo, &intel_sdvo->caps)) + goto err; + +- /* Set up hotplug command - note paranoia about contents of reply. +- * We assume that the hardware is in a sane state, and only touch +- * the bits we think we understand. +- */ +- intel_sdvo_get_value(intel_sdvo, SDVO_CMD_GET_ACTIVE_HOT_PLUG, +- &intel_sdvo->hotplug_active, 2); +- intel_sdvo->hotplug_active[0] &= ~0x3; +- + if (intel_sdvo_output_setup(intel_sdvo, + intel_sdvo->caps.output_flags) != true) { + DRM_DEBUG_KMS("SDVO output failed to setup on SDVO%c\n", +@@ -2570,6 +2570,12 @@ bool intel_sdvo_init(struct drm_device *dev, int sdvo_reg) + goto err; + } + ++ /* Only enable the hotplug irq if we need it, to work around noisy ++ * hotplug lines. ++ */ ++ if (intel_sdvo->hotplug_active[0]) ++ dev_priv->hotplug_supported_mask |= hotplug_mask; ++ + intel_sdvo_select_ddc_bus(dev_priv, intel_sdvo, sdvo_reg); + + /* Set the input timing to the screen. Assume always input 0. 
*/ +diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c +index b12fd2c..6adef06 100644 +--- a/drivers/gpu/drm/nouveau/nouveau_display.c ++++ b/drivers/gpu/drm/nouveau/nouveau_display.c +@@ -381,7 +381,7 @@ nouveau_display_dumb_create(struct drm_file *file_priv, struct drm_device *dev, + args->size = args->pitch * args->height; + args->size = roundup(args->size, PAGE_SIZE); + +- ret = nouveau_gem_new(dev, args->size, 0, TTM_PL_FLAG_VRAM, 0, 0, &bo); ++ ret = nouveau_gem_new(dev, args->size, 0, NOUVEAU_GEM_DOMAIN_VRAM, 0, 0, &bo); + if (ret) + return ret; + +diff --git a/drivers/gpu/drm/radeon/atombios_crtc.c b/drivers/gpu/drm/radeon/atombios_crtc.c +index 757c549..ceffd20 100644 +--- a/drivers/gpu/drm/radeon/atombios_crtc.c ++++ b/drivers/gpu/drm/radeon/atombios_crtc.c +@@ -1446,14 +1446,98 @@ static void radeon_legacy_atom_fixup(struct drm_crtc *crtc) + } + } + ++/** ++ * radeon_get_pll_use_mask - look up a mask of which pplls are in use ++ * ++ * @crtc: drm crtc ++ * ++ * Returns the mask of which PPLLs (Pixel PLLs) are in use. ++ */ ++static u32 radeon_get_pll_use_mask(struct drm_crtc *crtc) ++{ ++ struct drm_device *dev = crtc->dev; ++ struct drm_crtc *test_crtc; ++ struct radeon_crtc *radeon_test_crtc; ++ u32 pll_in_use = 0; ++ ++ list_for_each_entry(test_crtc, &dev->mode_config.crtc_list, head) { ++ if (crtc == test_crtc) ++ continue; ++ ++ radeon_test_crtc = to_radeon_crtc(test_crtc); ++ if (radeon_test_crtc->pll_id != ATOM_PPLL_INVALID) ++ pll_in_use |= (1 << radeon_test_crtc->pll_id); ++ } ++ return pll_in_use; ++} ++ ++/** ++ * radeon_get_shared_dp_ppll - return the PPLL used by another crtc for DP ++ * ++ * @crtc: drm crtc ++ * ++ * Returns the PPLL (Pixel PLL) used by another crtc/encoder which is ++ * also in DP mode. For DP, a single PPLL can be used for all DP ++ * crtcs/encoders. ++ */ ++static int radeon_get_shared_dp_ppll(struct drm_crtc *crtc) ++{ ++ struct drm_device *dev = crtc->dev; ++ struct drm_encoder *test_encoder; ++ struct radeon_crtc *radeon_test_crtc; ++ ++ list_for_each_entry(test_encoder, &dev->mode_config.encoder_list, head) { ++ if (test_encoder->crtc && (test_encoder->crtc != crtc)) { ++ if (ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_encoder))) { ++ /* for DP use the same PLL for all */ ++ radeon_test_crtc = to_radeon_crtc(test_encoder->crtc); ++ if (radeon_test_crtc->pll_id != ATOM_PPLL_INVALID) ++ return radeon_test_crtc->pll_id; ++ } ++ } ++ } ++ return ATOM_PPLL_INVALID; ++} ++ ++/** ++ * radeon_atom_pick_pll - Allocate a PPLL for use by the crtc. ++ * ++ * @crtc: drm crtc ++ * ++ * Returns the PPLL (Pixel PLL) to be used by the crtc. For DP monitors ++ * a single PPLL can be used for all DP crtcs/encoders. For non-DP ++ * monitors a dedicated PPLL must be used. If a particular board has ++ * an external DP PLL, return ATOM_PPLL_INVALID to skip PLL programming ++ * as there is no need to program the PLL itself. If we are not able to ++ * allocate a PLL, return ATOM_PPLL_INVALID to skip PLL programming to ++ * avoid messing up an existing monitor. 
++ * ++ * Asic specific PLL information ++ * ++ * DCE 6.1 ++ * - PPLL2 is only available to UNIPHYA (both DP and non-DP) ++ * - PPLL0, PPLL1 are available for UNIPHYB/C/D/E/F (both DP and non-DP) ++ * ++ * DCE 6.0 ++ * - PPLL0 is available to all UNIPHY (DP only) ++ * - PPLL1, PPLL2 are available for all UNIPHY (both DP and non-DP) and DAC ++ * ++ * DCE 5.0 ++ * - DCPLL is available to all UNIPHY (DP only) ++ * - PPLL1, PPLL2 are available for all UNIPHY (both DP and non-DP) and DAC ++ * ++ * DCE 3.0/4.0/4.1 ++ * - PPLL1, PPLL2 are available for all UNIPHY (both DP and non-DP) and DAC ++ * ++ */ + static int radeon_atom_pick_pll(struct drm_crtc *crtc) + { + struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc); + struct drm_device *dev = crtc->dev; + struct radeon_device *rdev = dev->dev_private; + struct drm_encoder *test_encoder; +- struct drm_crtc *test_crtc; +- uint32_t pll_in_use = 0; ++ u32 pll_in_use; ++ int pll; + + if (ASIC_IS_DCE4(rdev)) { + list_for_each_entry(test_encoder, &dev->mode_config.encoder_list, head) { +@@ -1461,35 +1545,39 @@ static int radeon_atom_pick_pll(struct drm_crtc *crtc) + /* in DP mode, the DP ref clock can come from PPLL, DCPLL, or ext clock, + * depending on the asic: + * DCE4: PPLL or ext clock +- * DCE5: DCPLL or ext clock ++ * DCE5: PPLL, DCPLL, or ext clock + * + * Setting ATOM_PPLL_INVALID will cause SetPixelClock to skip + * PPLL/DCPLL programming and only program the DP DTO for the + * crtc virtual pixel clock. + */ + if (ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_encoder))) { +- if (ASIC_IS_DCE5(rdev) || rdev->clock.dp_extclk) ++ if (rdev->clock.dp_extclk) ++ /* skip PPLL programming if using ext clock */ + return ATOM_PPLL_INVALID; ++ else if (ASIC_IS_DCE5(rdev)) ++ /* use DCPLL for all DP */ ++ return ATOM_DCPLL; ++ else { ++ /* use the same PPLL for all DP monitors */ ++ pll = radeon_get_shared_dp_ppll(crtc); ++ if (pll != ATOM_PPLL_INVALID) ++ return pll; ++ } + } ++ break; + } + } +- +- /* otherwise, pick one of the plls */ +- list_for_each_entry(test_crtc, &dev->mode_config.crtc_list, head) { +- struct radeon_crtc *radeon_test_crtc; +- +- if (crtc == test_crtc) +- continue; +- +- radeon_test_crtc = to_radeon_crtc(test_crtc); +- if ((radeon_test_crtc->pll_id >= ATOM_PPLL1) && +- (radeon_test_crtc->pll_id <= ATOM_PPLL2)) +- pll_in_use |= (1 << radeon_test_crtc->pll_id); +- } +- if (!(pll_in_use & 1)) ++ /* all other cases */ ++ pll_in_use = radeon_get_pll_use_mask(crtc); ++ if (!(pll_in_use & (1 << ATOM_PPLL2))) ++ return ATOM_PPLL2; ++ if (!(pll_in_use & (1 << ATOM_PPLL1))) + return ATOM_PPLL1; +- return ATOM_PPLL2; ++ DRM_ERROR("unable to allocate a PPLL\n"); ++ return ATOM_PPLL_INVALID; + } else ++ /* use PPLL1 or PPLL2 */ + return radeon_crtc->crtc_id; + + } +@@ -1578,10 +1666,25 @@ static void atombios_crtc_commit(struct drm_crtc *crtc) + static void atombios_crtc_disable(struct drm_crtc *crtc) + { + struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc); ++ struct drm_device *dev = crtc->dev; ++ struct radeon_device *rdev = dev->dev_private; + struct radeon_atom_ss ss; ++ int i; + + atombios_crtc_dpms(crtc, DRM_MODE_DPMS_OFF); + ++ for (i = 0; i < rdev->num_crtc; i++) { ++ if (rdev->mode_info.crtcs[i] && ++ rdev->mode_info.crtcs[i]->enabled && ++ i != radeon_crtc->crtc_id && ++ radeon_crtc->pll_id == rdev->mode_info.crtcs[i]->pll_id) { ++ /* one other crtc is using this pll don't turn ++ * off the pll ++ */ ++ goto done; ++ } ++ } ++ + switch (radeon_crtc->pll_id) { + case ATOM_PPLL1: + case ATOM_PPLL2: +@@ -1592,7 +1695,8 @@ 
static void atombios_crtc_disable(struct drm_crtc *crtc) + default: + break; + } +- radeon_crtc->pll_id = -1; ++done: ++ radeon_crtc->pll_id = ATOM_PPLL_INVALID; + } + + static const struct drm_crtc_helper_funcs atombios_helper_funcs = { +@@ -1641,6 +1745,6 @@ void radeon_atombios_init_crtc(struct drm_device *dev, + else + radeon_crtc->crtc_offset = 0; + } +- radeon_crtc->pll_id = -1; ++ radeon_crtc->pll_id = ATOM_PPLL_INVALID; + drm_crtc_helper_add(&radeon_crtc->base, &atombios_helper_funcs); + } +diff --git a/drivers/gpu/drm/radeon/atombios_encoders.c b/drivers/gpu/drm/radeon/atombios_encoders.c +index 5351ee1..382e141 100644 +--- a/drivers/gpu/drm/radeon/atombios_encoders.c ++++ b/drivers/gpu/drm/radeon/atombios_encoders.c +@@ -1344,6 +1344,8 @@ radeon_atom_encoder_dpms_dig(struct drm_encoder *encoder, int mode) + struct drm_device *dev = encoder->dev; + struct radeon_device *rdev = dev->dev_private; + struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder); ++ struct drm_encoder *ext_encoder = radeon_get_external_encoder(encoder); ++ struct radeon_encoder_atom_dig *dig = radeon_encoder->enc_priv; + struct drm_connector *connector = radeon_get_connector_for_encoder(encoder); + struct radeon_connector *radeon_connector = NULL; + struct radeon_connector_atom_dig *radeon_dig_connector = NULL; +@@ -1355,12 +1357,38 @@ radeon_atom_encoder_dpms_dig(struct drm_encoder *encoder, int mode) + + switch (mode) { + case DRM_MODE_DPMS_ON: +- /* some early dce3.2 boards have a bug in their transmitter control table */ +- if ((rdev->family == CHIP_RV710) || (rdev->family == CHIP_RV730) || +- ASIC_IS_DCE41(rdev) || ASIC_IS_DCE5(rdev)) ++ if (ASIC_IS_DCE41(rdev) || ASIC_IS_DCE5(rdev)) { ++ if (!connector) ++ dig->panel_mode = DP_PANEL_MODE_EXTERNAL_DP_MODE; ++ else ++ dig->panel_mode = radeon_dp_get_panel_mode(encoder, connector); ++ ++ /* setup and enable the encoder */ ++ atombios_dig_encoder_setup(encoder, ATOM_ENCODER_CMD_SETUP, 0); ++ atombios_dig_encoder_setup(encoder, ++ ATOM_ENCODER_CMD_SETUP_PANEL_MODE, ++ dig->panel_mode); ++ if (ext_encoder) { ++ if (ASIC_IS_DCE41(rdev)) ++ atombios_external_encoder_setup(encoder, ext_encoder, ++ EXTERNAL_ENCODER_ACTION_V3_ENCODER_SETUP); ++ } ++ atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_ENABLE, 0, 0); ++ } else if (ASIC_IS_DCE4(rdev)) { ++ /* setup and enable the encoder */ ++ atombios_dig_encoder_setup(encoder, ATOM_ENCODER_CMD_SETUP, 0); ++ /* enable the transmitter */ + atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_ENABLE, 0, 0); +- else + atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_ENABLE_OUTPUT, 0, 0); ++ } else { ++ /* setup and enable the encoder and transmitter */ ++ atombios_dig_encoder_setup(encoder, ATOM_ENABLE, 0); ++ atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_SETUP, 0, 0); ++ atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_ENABLE, 0, 0); ++ /* some early dce3.2 boards have a bug in their transmitter control table */ ++ if ((rdev->family != CHIP_RV710) || (rdev->family != CHIP_RV730)) ++ atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_ENABLE_OUTPUT, 0, 0); ++ } + if (ENCODER_MODE_IS_DP(atombios_get_encoder_mode(encoder)) && connector) { + if (connector->connector_type == DRM_MODE_CONNECTOR_eDP) { + atombios_set_edp_panel_power(connector, +@@ -1377,10 +1405,19 @@ radeon_atom_encoder_dpms_dig(struct drm_encoder *encoder, int mode) + case DRM_MODE_DPMS_STANDBY: + case DRM_MODE_DPMS_SUSPEND: + case DRM_MODE_DPMS_OFF: +- if 
(ASIC_IS_DCE41(rdev) || ASIC_IS_DCE5(rdev)) ++ if (ASIC_IS_DCE41(rdev) || ASIC_IS_DCE5(rdev)) { ++ /* disable the transmitter */ + atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_DISABLE, 0, 0); +- else ++ } else if (ASIC_IS_DCE4(rdev)) { ++ /* disable the transmitter */ + atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_DISABLE_OUTPUT, 0, 0); ++ atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_DISABLE, 0, 0); ++ } else { ++ /* disable the encoder and transmitter */ ++ atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_DISABLE_OUTPUT, 0, 0); ++ atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_DISABLE, 0, 0); ++ atombios_dig_encoder_setup(encoder, ATOM_DISABLE, 0); ++ } + if (ENCODER_MODE_IS_DP(atombios_get_encoder_mode(encoder)) && connector) { + if (ASIC_IS_DCE4(rdev)) + atombios_dig_encoder_setup(encoder, ATOM_ENCODER_CMD_DP_VIDEO_OFF, 0); +@@ -1805,10 +1842,12 @@ radeon_atom_encoder_mode_set(struct drm_encoder *encoder, + struct drm_device *dev = encoder->dev; + struct radeon_device *rdev = dev->dev_private; + struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder); +- struct drm_encoder *ext_encoder = radeon_get_external_encoder(encoder); + + radeon_encoder->pixel_clock = adjusted_mode->clock; + ++ /* need to call this here rather than in prepare() since we need some crtc info */ ++ radeon_atom_encoder_dpms(encoder, DRM_MODE_DPMS_OFF); ++ + if (ASIC_IS_AVIVO(rdev) && !ASIC_IS_DCE4(rdev)) { + if (radeon_encoder->active_device & (ATOM_DEVICE_CV_SUPPORT | ATOM_DEVICE_TV_SUPPORT)) + atombios_yuv_setup(encoder, true); +@@ -1827,38 +1866,7 @@ radeon_atom_encoder_mode_set(struct drm_encoder *encoder, + case ENCODER_OBJECT_ID_INTERNAL_UNIPHY1: + case ENCODER_OBJECT_ID_INTERNAL_UNIPHY2: + case ENCODER_OBJECT_ID_INTERNAL_KLDSCP_LVTMA: +- if (ASIC_IS_DCE41(rdev) || ASIC_IS_DCE5(rdev)) { +- struct drm_connector *connector = radeon_get_connector_for_encoder(encoder); +- struct radeon_encoder_atom_dig *dig = radeon_encoder->enc_priv; +- +- if (!connector) +- dig->panel_mode = DP_PANEL_MODE_EXTERNAL_DP_MODE; +- else +- dig->panel_mode = radeon_dp_get_panel_mode(encoder, connector); +- +- /* setup and enable the encoder */ +- atombios_dig_encoder_setup(encoder, ATOM_ENCODER_CMD_SETUP, 0); +- atombios_dig_encoder_setup(encoder, +- ATOM_ENCODER_CMD_SETUP_PANEL_MODE, +- dig->panel_mode); +- } else if (ASIC_IS_DCE4(rdev)) { +- /* disable the transmitter */ +- atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_DISABLE, 0, 0); +- /* setup and enable the encoder */ +- atombios_dig_encoder_setup(encoder, ATOM_ENCODER_CMD_SETUP, 0); +- +- /* enable the transmitter */ +- atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_ENABLE, 0, 0); +- } else { +- /* disable the encoder and transmitter */ +- atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_DISABLE, 0, 0); +- atombios_dig_encoder_setup(encoder, ATOM_DISABLE, 0); +- +- /* setup and enable the encoder and transmitter */ +- atombios_dig_encoder_setup(encoder, ATOM_ENABLE, 0); +- atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_SETUP, 0, 0); +- atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_ENABLE, 0, 0); +- } ++ /* handled in dpms */ + break; + case ENCODER_OBJECT_ID_INTERNAL_DDI: + case ENCODER_OBJECT_ID_INTERNAL_DVO1: +@@ -1879,14 +1887,6 @@ radeon_atom_encoder_mode_set(struct drm_encoder *encoder, + break; + } + +- if (ext_encoder) { +- if (ASIC_IS_DCE41(rdev)) +- atombios_external_encoder_setup(encoder, 
ext_encoder, +- EXTERNAL_ENCODER_ACTION_V3_ENCODER_SETUP); +- else +- atombios_external_encoder_setup(encoder, ext_encoder, ATOM_ENABLE); +- } +- + atombios_apply_encoder_quirks(encoder, adjusted_mode); + + if (atombios_get_encoder_mode(encoder) == ATOM_ENCODER_MODE_HDMI) { +@@ -2059,7 +2059,6 @@ static void radeon_atom_encoder_prepare(struct drm_encoder *encoder) + } + + radeon_atom_output_lock(encoder, true); +- radeon_atom_encoder_dpms(encoder, DRM_MODE_DPMS_OFF); + + if (connector) { + struct radeon_connector *radeon_connector = to_radeon_connector(connector); +@@ -2080,6 +2079,7 @@ static void radeon_atom_encoder_prepare(struct drm_encoder *encoder) + + static void radeon_atom_encoder_commit(struct drm_encoder *encoder) + { ++ /* need to call this here as we need the crtc set up */ + radeon_atom_encoder_dpms(encoder, DRM_MODE_DPMS_ON); + radeon_atom_output_lock(encoder, false); + } +@@ -2120,14 +2120,7 @@ static void radeon_atom_encoder_disable(struct drm_encoder *encoder) + case ENCODER_OBJECT_ID_INTERNAL_UNIPHY1: + case ENCODER_OBJECT_ID_INTERNAL_UNIPHY2: + case ENCODER_OBJECT_ID_INTERNAL_KLDSCP_LVTMA: +- if (ASIC_IS_DCE4(rdev)) +- /* disable the transmitter */ +- atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_DISABLE, 0, 0); +- else { +- /* disable the encoder and transmitter */ +- atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_DISABLE, 0, 0); +- atombios_dig_encoder_setup(encoder, ATOM_DISABLE, 0); +- } ++ /* handled in dpms */ + break; + case ENCODER_OBJECT_ID_INTERNAL_DDI: + case ENCODER_OBJECT_ID_INTERNAL_DVO1: +diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c +index 9231564..c5762e3 100644 +--- a/drivers/gpu/drm/radeon/radeon_device.c ++++ b/drivers/gpu/drm/radeon/radeon_device.c +@@ -761,7 +761,7 @@ int radeon_device_init(struct radeon_device *rdev, + if (rdev->flags & RADEON_IS_AGP) + rdev->need_dma32 = true; + if ((rdev->flags & RADEON_IS_PCI) && +- (rdev->family < CHIP_RS400)) ++ (rdev->family <= CHIP_RS740)) + rdev->need_dma32 = true; + + dma_bits = rdev->need_dma32 ? 
32 : 40; +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +index dff8fc7..033fc96 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +@@ -178,6 +178,7 @@ static struct pci_device_id vmw_pci_id_list[] = { + {0x15ad, 0x0405, PCI_ANY_ID, PCI_ANY_ID, 0, 0, VMWGFX_CHIP_SVGAII}, + {0, 0, 0} + }; ++MODULE_DEVICE_TABLE(pci, vmw_pci_id_list); + + static int enable_fbdev; + +@@ -1088,6 +1089,11 @@ static struct drm_driver driver = { + .master_drop = vmw_master_drop, + .open = vmw_driver_open, + .postclose = vmw_postclose, ++ ++ .dumb_create = vmw_dumb_create, ++ .dumb_map_offset = vmw_dumb_map_offset, ++ .dumb_destroy = vmw_dumb_destroy, ++ + .fops = { + .owner = THIS_MODULE, + .open = drm_open, +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +index dc27970..0e3fa7d 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +@@ -641,6 +641,16 @@ int vmw_kms_readback(struct vmw_private *dev_priv, + int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv); + ++int vmw_dumb_create(struct drm_file *file_priv, ++ struct drm_device *dev, ++ struct drm_mode_create_dumb *args); ++ ++int vmw_dumb_map_offset(struct drm_file *file_priv, ++ struct drm_device *dev, uint32_t handle, ++ uint64_t *offset); ++int vmw_dumb_destroy(struct drm_file *file_priv, ++ struct drm_device *dev, ++ uint32_t handle); + /** + * Overlay control - vmwgfx_overlay.c + */ +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c +index 1c7f09e..0795d17 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c +@@ -1950,3 +1950,76 @@ err_ref: + vmw_resource_unreference(&res); + return ret; + } ++ ++ ++int vmw_dumb_create(struct drm_file *file_priv, ++ struct drm_device *dev, ++ struct drm_mode_create_dumb *args) ++{ ++ struct vmw_private *dev_priv = vmw_priv(dev); ++ struct vmw_master *vmaster = vmw_master(file_priv->master); ++ struct vmw_user_dma_buffer *vmw_user_bo; ++ struct ttm_buffer_object *tmp; ++ int ret; ++ ++ args->pitch = args->width * ((args->bpp + 7) / 8); ++ args->size = args->pitch * args->height; ++ ++ vmw_user_bo = kzalloc(sizeof(*vmw_user_bo), GFP_KERNEL); ++ if (vmw_user_bo == NULL) ++ return -ENOMEM; ++ ++ ret = ttm_read_lock(&vmaster->lock, true); ++ if (ret != 0) { ++ kfree(vmw_user_bo); ++ return ret; ++ } ++ ++ ret = vmw_dmabuf_init(dev_priv, &vmw_user_bo->dma, args->size, ++ &vmw_vram_sys_placement, true, ++ &vmw_user_dmabuf_destroy); ++ if (ret != 0) ++ goto out_no_dmabuf; ++ ++ tmp = ttm_bo_reference(&vmw_user_bo->dma.base); ++ ret = ttm_base_object_init(vmw_fpriv(file_priv)->tfile, ++ &vmw_user_bo->base, ++ false, ++ ttm_buffer_type, ++ &vmw_user_dmabuf_release, NULL); ++ if (unlikely(ret != 0)) ++ goto out_no_base_object; ++ ++ args->handle = vmw_user_bo->base.hash.key; ++ ++out_no_base_object: ++ ttm_bo_unref(&tmp); ++out_no_dmabuf: ++ ttm_read_unlock(&vmaster->lock); ++ return ret; ++} ++ ++int vmw_dumb_map_offset(struct drm_file *file_priv, ++ struct drm_device *dev, uint32_t handle, ++ uint64_t *offset) ++{ ++ struct ttm_object_file *tfile = vmw_fpriv(file_priv)->tfile; ++ struct vmw_dma_buffer *out_buf; ++ int ret; ++ ++ ret = vmw_user_dmabuf_lookup(tfile, handle, &out_buf); ++ if (ret != 0) ++ return -EINVAL; ++ ++ *offset = out_buf->base.addr_space_offset; ++ vmw_dmabuf_unreference(&out_buf); ++ return 0; ++} ++ ++int 
vmw_dumb_destroy(struct drm_file *file_priv, ++ struct drm_device *dev, ++ uint32_t handle) ++{ ++ return ttm_ref_object_base_unref(vmw_fpriv(file_priv)->tfile, ++ handle, TTM_REF_USAGE); ++} +diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig +index d21f6d0..b5cc078 100644 +--- a/drivers/hid/Kconfig ++++ b/drivers/hid/Kconfig +@@ -350,6 +350,7 @@ config HID_MULTITOUCH + - Lumio CrystalTouch panels + - MosArt dual-touch panels + - PenMount dual touch panels ++ - PixArt optical touch screen + - Pixcir dual touch panels + - eGalax dual-touch panels, including the Joojoo and Wetab tablets + - Stantum multitouch panels +diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c +index 5cc029f..0c8bea9 100644 +--- a/drivers/hid/hid-core.c ++++ b/drivers/hid/hid-core.c +@@ -1507,6 +1507,9 @@ static const struct hid_device_id hid_have_special_driver[] = { + { HID_USB_DEVICE(USB_VENDOR_ID_ORTEK, USB_DEVICE_ID_ORTEK_WKB2000) }, + { HID_USB_DEVICE(USB_VENDOR_ID_PENMOUNT, USB_DEVICE_ID_PENMOUNT_PCI) }, + { HID_USB_DEVICE(USB_VENDOR_ID_PETALYNX, USB_DEVICE_ID_PETALYNX_MAXTER_REMOTE) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_PIXART, USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_PIXART, USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN1) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_PIXART, USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN2) }, + { HID_USB_DEVICE(USB_VENDOR_ID_PRIMAX, USB_DEVICE_ID_PRIMAX_KEYBOARD) }, + { HID_USB_DEVICE(USB_VENDOR_ID_QUANTA, USB_DEVICE_ID_QUANTA_OPTICAL_TOUCH) }, + { HID_USB_DEVICE(USB_VENDOR_ID_QUANTA, USB_DEVICE_ID_PIXART_IMAGING_INC_OPTICAL_TOUCH_SCREEN) }, +diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h +index e4317a2..ab75a4e 100644 +--- a/drivers/hid/hid-ids.h ++++ b/drivers/hid/hid-ids.h +@@ -593,6 +593,11 @@ + #define USB_VENDOR_ID_PI_ENGINEERING 0x05f3 + #define USB_DEVICE_ID_PI_ENGINEERING_VEC_USB_FOOTPEDAL 0xff + ++#define USB_VENDOR_ID_PIXART 0x093a ++#define USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN 0x8001 ++#define USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN1 0x8002 ++#define USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN2 0x8003 ++ + #define USB_VENDOR_ID_PLAYDOTCOM 0x0b43 + #define USB_DEVICE_ID_PLAYDOTCOM_EMS_USBII 0x0003 + +diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c +index 995fc4c..13af0f1 100644 +--- a/drivers/hid/hid-multitouch.c ++++ b/drivers/hid/hid-multitouch.c +@@ -93,6 +93,7 @@ struct mt_class { + #define MT_CLS_DUAL_INRANGE_CONTACTID 0x0005 + #define MT_CLS_DUAL_INRANGE_CONTACTNUMBER 0x0006 + #define MT_CLS_DUAL_NSMU_CONTACTID 0x0007 ++#define MT_CLS_INRANGE_CONTACTNUMBER 0x0009 + + /* vendor specific classes */ + #define MT_CLS_3M 0x0101 +@@ -155,6 +156,9 @@ struct mt_class mt_classes[] = { + .quirks = MT_QUIRK_NOT_SEEN_MEANS_UP | + MT_QUIRK_SLOT_IS_CONTACTID, + .maxcontacts = 2 }, ++ { .name = MT_CLS_INRANGE_CONTACTNUMBER, ++ .quirks = MT_QUIRK_VALID_IS_INRANGE | ++ MT_QUIRK_SLOT_IS_CONTACTNUMBER }, + + /* + * vendor specific classes +@@ -744,6 +748,17 @@ static const struct hid_device_id mt_devices[] = { + HID_USB_DEVICE(USB_VENDOR_ID_PENMOUNT, + USB_DEVICE_ID_PENMOUNT_PCI) }, + ++ /* PixArt optical touch screen */ ++ { .driver_data = MT_CLS_INRANGE_CONTACTNUMBER, ++ HID_USB_DEVICE(USB_VENDOR_ID_PIXART, ++ USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN) }, ++ { .driver_data = MT_CLS_INRANGE_CONTACTNUMBER, ++ HID_USB_DEVICE(USB_VENDOR_ID_PIXART, ++ USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN1) }, ++ { .driver_data = MT_CLS_INRANGE_CONTACTNUMBER, ++ HID_USB_DEVICE(USB_VENDOR_ID_PIXART, ++ 
USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN2) }, ++ + /* PixCir-based panels */ + { .driver_data = MT_CLS_DUAL_INRANGE_CONTACTID, + HID_USB_DEVICE(USB_VENDOR_ID_HANVON, +diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c +index 1fe6b80..afb73af 100644 +--- a/drivers/hid/usbhid/hid-quirks.c ++++ b/drivers/hid/usbhid/hid-quirks.c +@@ -68,6 +68,10 @@ static const struct hid_blacklist { + { USB_VENDOR_ID_CH, USB_DEVICE_ID_CH_AXIS_295, HID_QUIRK_NOGET }, + { USB_VENDOR_ID_DMI, USB_DEVICE_ID_DMI_ENC, HID_QUIRK_NOGET }, + { USB_VENDOR_ID_ELO, USB_DEVICE_ID_ELO_TS2700, HID_QUIRK_NOGET }, ++ { USB_VENDOR_ID_MGE, USB_DEVICE_ID_MGE_UPS, HID_QUIRK_NOGET }, ++ { USB_VENDOR_ID_PIXART, USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN, HID_QUIRK_NO_INIT_REPORTS }, ++ { USB_VENDOR_ID_PIXART, USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN1, HID_QUIRK_NO_INIT_REPORTS }, ++ { USB_VENDOR_ID_PIXART, USB_DEVICE_ID_PIXART_OPTICAL_TOUCH_SCREEN2, HID_QUIRK_NO_INIT_REPORTS }, + { USB_VENDOR_ID_PRODIGE, USB_DEVICE_ID_PRODIGE_CORDLESS, HID_QUIRK_NOGET }, + { USB_VENDOR_ID_QUANTA, USB_DEVICE_ID_PIXART_IMAGING_INC_OPTICAL_TOUCH_SCREEN, HID_QUIRK_NOGET }, + { USB_VENDOR_ID_SUN, USB_DEVICE_ID_RARITAN_KVM_DONGLE, HID_QUIRK_NOGET }, +diff --git a/drivers/hwmon/asus_atk0110.c b/drivers/hwmon/asus_atk0110.c +index 00e9851..83d2fbd6 100644 +--- a/drivers/hwmon/asus_atk0110.c ++++ b/drivers/hwmon/asus_atk0110.c +@@ -34,6 +34,12 @@ static const struct dmi_system_id __initconst atk_force_new_if[] = { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "SABERTOOTH X58") + } ++ }, { ++ /* Old interface reads the same sensor for fan0 and fan1 */ ++ .ident = "Asus M5A78L", ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "M5A78L") ++ } + }, + { } + }; +diff --git a/drivers/hwmon/twl4030-madc-hwmon.c b/drivers/hwmon/twl4030-madc-hwmon.c +index 0018c7d..1a174f0 100644 +--- a/drivers/hwmon/twl4030-madc-hwmon.c ++++ b/drivers/hwmon/twl4030-madc-hwmon.c +@@ -44,12 +44,13 @@ static ssize_t madc_read(struct device *dev, + struct device_attribute *devattr, char *buf) + { + struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); +- struct twl4030_madc_request req; ++ struct twl4030_madc_request req = { ++ .channels = 1 << attr->index, ++ .method = TWL4030_MADC_SW2, ++ .type = TWL4030_MADC_WAIT, ++ }; + long val; + +- req.channels = (1 << attr->index); +- req.method = TWL4030_MADC_SW2; +- req.func_cb = NULL; + val = twl4030_madc_conversion(&req); + if (val < 0) + return val; +diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig +index a3afac4..60f593c 100644 +--- a/drivers/i2c/busses/Kconfig ++++ b/drivers/i2c/busses/Kconfig +@@ -103,6 +103,8 @@ config I2C_I801 + Patsburg (PCH) + DH89xxCC (PCH) + Panther Point (PCH) ++ Lynx Point (PCH) ++ Lynx Point-LP (PCH) + + This driver can also be built as a module. If so, the module + will be called i2c-i801. +@@ -349,9 +351,13 @@ config I2C_DAVINCI + devices such as DaVinci NIC. + For details please see http://www.ti.com/davinci + ++config I2C_DESIGNWARE_CORE ++ tristate ++ + config I2C_DESIGNWARE_PLATFORM + tristate "Synopsys DesignWare Platfrom" + depends on HAVE_CLK ++ select I2C_DESIGNWARE_CORE + help + If you say yes to this option, support will be included for the + Synopsys DesignWare I2C adapter. Only master mode is supported. 
+@@ -362,6 +368,7 @@ config I2C_DESIGNWARE_PLATFORM + config I2C_DESIGNWARE_PCI + tristate "Synopsys DesignWare PCI" + depends on PCI ++ select I2C_DESIGNWARE_CORE + help + If you say yes to this option, support will be included for the + Synopsys DesignWare I2C adapter. Only master mode is supported. +diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile +index fba6da6..d6b8779 100644 +--- a/drivers/i2c/busses/Makefile ++++ b/drivers/i2c/busses/Makefile +@@ -33,10 +33,11 @@ obj-$(CONFIG_I2C_AU1550) += i2c-au1550.o + obj-$(CONFIG_I2C_BLACKFIN_TWI) += i2c-bfin-twi.o + obj-$(CONFIG_I2C_CPM) += i2c-cpm.o + obj-$(CONFIG_I2C_DAVINCI) += i2c-davinci.o ++obj-$(CONFIG_I2C_DESIGNWARE_CORE) += i2c-designware-core.o + obj-$(CONFIG_I2C_DESIGNWARE_PLATFORM) += i2c-designware-platform.o +-i2c-designware-platform-objs := i2c-designware-platdrv.o i2c-designware-core.o ++i2c-designware-platform-objs := i2c-designware-platdrv.o + obj-$(CONFIG_I2C_DESIGNWARE_PCI) += i2c-designware-pci.o +-i2c-designware-pci-objs := i2c-designware-pcidrv.o i2c-designware-core.o ++i2c-designware-pci-objs := i2c-designware-pcidrv.o + obj-$(CONFIG_I2C_GPIO) += i2c-gpio.o + obj-$(CONFIG_I2C_HIGHLANDER) += i2c-highlander.o + obj-$(CONFIG_I2C_IBM_IIC) += i2c-ibm_iic.o +diff --git a/drivers/i2c/busses/i2c-designware-core.c b/drivers/i2c/busses/i2c-designware-core.c +index df87992..6193349 100644 +--- a/drivers/i2c/busses/i2c-designware-core.c ++++ b/drivers/i2c/busses/i2c-designware-core.c +@@ -25,6 +25,7 @@ + * ---------------------------------------------------------------------------- + * + */ ++#include + #include + #include + #include +@@ -305,6 +306,7 @@ int i2c_dw_init(struct dw_i2c_dev *dev) + dw_writel(dev, dev->master_cfg , DW_IC_CON); + return 0; + } ++EXPORT_SYMBOL_GPL(i2c_dw_init); + + /* + * Waiting for bus not busy +@@ -557,12 +559,14 @@ done: + + return ret; + } ++EXPORT_SYMBOL_GPL(i2c_dw_xfer); + + u32 i2c_dw_func(struct i2c_adapter *adap) + { + struct dw_i2c_dev *dev = i2c_get_adapdata(adap); + return dev->functionality; + } ++EXPORT_SYMBOL_GPL(i2c_dw_func); + + static u32 i2c_dw_read_clear_intrbits(struct dw_i2c_dev *dev) + { +@@ -667,17 +671,20 @@ tx_aborted: + + return IRQ_HANDLED; + } ++EXPORT_SYMBOL_GPL(i2c_dw_isr); + + void i2c_dw_enable(struct dw_i2c_dev *dev) + { + /* Enable the adapter */ + dw_writel(dev, 1, DW_IC_ENABLE); + } ++EXPORT_SYMBOL_GPL(i2c_dw_enable); + + u32 i2c_dw_is_enabled(struct dw_i2c_dev *dev) + { + return dw_readl(dev, DW_IC_ENABLE); + } ++EXPORT_SYMBOL_GPL(i2c_dw_is_enabled); + + void i2c_dw_disable(struct dw_i2c_dev *dev) + { +@@ -688,18 +695,22 @@ void i2c_dw_disable(struct dw_i2c_dev *dev) + dw_writel(dev, 0, DW_IC_INTR_MASK); + dw_readl(dev, DW_IC_CLR_INTR); + } ++EXPORT_SYMBOL_GPL(i2c_dw_disable); + + void i2c_dw_clear_int(struct dw_i2c_dev *dev) + { + dw_readl(dev, DW_IC_CLR_INTR); + } ++EXPORT_SYMBOL_GPL(i2c_dw_clear_int); + + void i2c_dw_disable_int(struct dw_i2c_dev *dev) + { + dw_writel(dev, 0, DW_IC_INTR_MASK); + } ++EXPORT_SYMBOL_GPL(i2c_dw_disable_int); + + u32 i2c_dw_read_comp_param(struct dw_i2c_dev *dev) + { + return dw_readl(dev, DW_IC_COMP_PARAM_1); + } ++EXPORT_SYMBOL_GPL(i2c_dw_read_comp_param); +diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c +index ab26840d..817d025 100644 +--- a/drivers/i2c/busses/i2c-i801.c ++++ b/drivers/i2c/busses/i2c-i801.c +@@ -51,6 +51,8 @@ + Patsburg (PCH) IDF 0x1d72 32 hard yes yes yes + DH89xxCC (PCH) 0x2330 32 hard yes yes yes + Panther Point (PCH) 0x1e22 32 hard yes yes yes ++ Lynx Point (PCH) 
0x8c22 32 hard yes yes yes ++ Lynx Point-LP (PCH) 0x9c22 32 hard yes yes yes + + Features supported by this driver: + Software PEC no +@@ -145,6 +147,8 @@ + #define PCI_DEVICE_ID_INTEL_PANTHERPOINT_SMBUS 0x1e22 + #define PCI_DEVICE_ID_INTEL_DH89XXCC_SMBUS 0x2330 + #define PCI_DEVICE_ID_INTEL_5_3400_SERIES_SMBUS 0x3b30 ++#define PCI_DEVICE_ID_INTEL_LYNXPOINT_SMBUS 0x8c22 ++#define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_SMBUS 0x9c22 + + struct i801_priv { + struct i2c_adapter adapter; +@@ -633,6 +637,8 @@ static const struct pci_device_id i801_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_PATSBURG_SMBUS_IDF2) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_DH89XXCC_SMBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_PANTHERPOINT_SMBUS) }, ++ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_LYNXPOINT_SMBUS) }, ++ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_SMBUS) }, + { 0, } + }; + +diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h +index b4cfc6c..d4ec371 100644 +--- a/drivers/input/serio/i8042-x86ia64io.h ++++ b/drivers/input/serio/i8042-x86ia64io.h +@@ -177,6 +177,20 @@ static const struct dmi_system_id __initconst i8042_dmi_noloop_table[] = { + }, + }, + { ++ /* Gigabyte T1005 - defines wrong chassis type ("Other") */ ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "GIGABYTE"), ++ DMI_MATCH(DMI_PRODUCT_NAME, "T1005"), ++ }, ++ }, ++ { ++ /* Gigabyte T1005M/P - defines wrong chassis type ("Other") */ ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "GIGABYTE"), ++ DMI_MATCH(DMI_PRODUCT_NAME, "T1005M/P"), ++ }, ++ }, ++ { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion dv9700"), +diff --git a/drivers/isdn/isdnloop/isdnloop.c b/drivers/isdn/isdnloop/isdnloop.c +index d497db0..509135f 100644 +--- a/drivers/isdn/isdnloop/isdnloop.c ++++ b/drivers/isdn/isdnloop/isdnloop.c +@@ -16,7 +16,6 @@ + #include + #include "isdnloop.h" + +-static char *revision = "$Revision: 1.11.6.7 $"; + static char *isdnloop_id = "loop0"; + + MODULE_DESCRIPTION("ISDN4Linux: Pseudo Driver that simulates an ISDN card"); +@@ -1494,17 +1493,6 @@ isdnloop_addcard(char *id1) + static int __init + isdnloop_init(void) + { +- char *p; +- char rev[10]; +- +- if ((p = strchr(revision, ':'))) { +- strcpy(rev, p + 1); +- p = strchr(rev, '$'); +- *p = 0; +- } else +- strcpy(rev, " ??? 
"); +- printk(KERN_NOTICE "isdnloop-ISDN-driver Rev%s\n", rev); +- + if (isdnloop_id) + return (isdnloop_addcard(isdnloop_id)); + +diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c +index 34416d4..74793af 100644 +--- a/drivers/mmc/card/block.c ++++ b/drivers/mmc/card/block.c +@@ -1339,7 +1339,8 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) + /* complete ongoing async transfer before issuing discard */ + if (card->host->areq) + mmc_blk_issue_rw_rq(mq, NULL); +- if (req->cmd_flags & REQ_SECURE) ++ if (req->cmd_flags & REQ_SECURE && ++ !(card->quirks & MMC_QUIRK_SEC_ERASE_TRIM_BROKEN)) + ret = mmc_blk_issue_secdiscard_rq(mq, req); + else + ret = mmc_blk_issue_discard_rq(mq, req); +@@ -1614,6 +1615,8 @@ static int mmc_add_disk(struct mmc_blk_data *md) + return ret; + } + ++#define CID_MANFID_SAMSUNG 0x15 ++ + static const struct mmc_fixup blk_fixups[] = + { + MMC_FIXUP("SEM02G", 0x2, 0x100, add_quirk, MMC_QUIRK_INAND_CMD38), +@@ -1644,6 +1647,28 @@ static const struct mmc_fixup blk_fixups[] = + MMC_FIXUP(CID_NAME_ANY, 0x13, 0x200, add_quirk_mmc, + MMC_QUIRK_LONG_READ_TIME), + ++ /* ++ * On these Samsung MoviNAND parts, performing secure erase or ++ * secure trim can result in unrecoverable corruption due to a ++ * firmware bug. ++ */ ++ MMC_FIXUP("M8G2FA", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, ++ MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), ++ MMC_FIXUP("MAG4FA", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, ++ MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), ++ MMC_FIXUP("MBG8FA", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, ++ MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), ++ MMC_FIXUP("MCGAFA", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, ++ MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), ++ MMC_FIXUP("VAL00M", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, ++ MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), ++ MMC_FIXUP("VYL00M", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, ++ MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), ++ MMC_FIXUP("KYL00M", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, ++ MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), ++ MMC_FIXUP("VZL00M", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, ++ MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), ++ + END_FIXUP + }; + +diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c +index 99b449d..f201bed 100644 +--- a/drivers/mmc/host/mxs-mmc.c ++++ b/drivers/mmc/host/mxs-mmc.c +@@ -279,11 +279,11 @@ static irqreturn_t mxs_mmc_irq_handler(int irq, void *dev_id) + writel(stat & MXS_MMC_IRQ_BITS, + host->base + HW_SSP_CTRL1 + MXS_CLR_ADDR); + ++ spin_unlock(&host->lock); ++ + if ((stat & BM_SSP_CTRL1_SDIO_IRQ) && (stat & BM_SSP_CTRL1_SDIO_IRQ_EN)) + mmc_signal_sdio_irq(host->mmc); + +- spin_unlock(&host->lock); +- + if (stat & BM_SSP_CTRL1_RESP_TIMEOUT_IRQ) + cmd->error = -ETIMEDOUT; + else if (stat & BM_SSP_CTRL1_RESP_ERR_IRQ) +@@ -628,10 +628,6 @@ static void mxs_mmc_enable_sdio_irq(struct mmc_host *mmc, int enable) + host->base + HW_SSP_CTRL0 + MXS_SET_ADDR); + writel(BM_SSP_CTRL1_SDIO_IRQ_EN, + host->base + HW_SSP_CTRL1 + MXS_SET_ADDR); +- +- if (readl(host->base + HW_SSP_STATUS) & BM_SSP_STATUS_SDIO_IRQ) +- mmc_signal_sdio_irq(host->mmc); +- + } else { + writel(BM_SSP_CTRL0_SDIO_IRQ_CHECK, + host->base + HW_SSP_CTRL0 + MXS_CLR_ADDR); +@@ -640,6 +636,10 @@ static void mxs_mmc_enable_sdio_irq(struct mmc_host *mmc, int enable) + } + + spin_unlock_irqrestore(&host->lock, flags); ++ ++ if (enable && readl(host->base + HW_SSP_STATUS) & BM_SSP_STATUS_SDIO_IRQ) ++ mmc_signal_sdio_irq(host->mmc); ++ + } + + static const struct mmc_host_ops mxs_mmc_ops 
= { +diff --git a/drivers/mmc/host/sdhci-esdhc.h b/drivers/mmc/host/sdhci-esdhc.h +index c3b08f1..62ca03a 100644 +--- a/drivers/mmc/host/sdhci-esdhc.h ++++ b/drivers/mmc/host/sdhci-esdhc.h +@@ -48,14 +48,14 @@ static inline void esdhc_set_clock(struct sdhci_host *host, unsigned int clock) + int div = 1; + u32 temp; + ++ if (clock == 0) ++ goto out; ++ + temp = sdhci_readl(host, ESDHC_SYSTEM_CONTROL); + temp &= ~(ESDHC_CLOCK_IPGEN | ESDHC_CLOCK_HCKEN | ESDHC_CLOCK_PEREN + | ESDHC_CLOCK_MASK); + sdhci_writel(host, temp, ESDHC_SYSTEM_CONTROL); + +- if (clock == 0) +- goto out; +- + while (host->max_clk / pre_div / 16 > clock && pre_div < 256) + pre_div *= 2; + +diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c +index 890754c..95b29f5 100644 +--- a/drivers/mtd/ubi/vtbl.c ++++ b/drivers/mtd/ubi/vtbl.c +@@ -346,7 +346,7 @@ retry: + */ + err = ubi_scan_add_used(ubi, si, new_seb->pnum, new_seb->ec, + vid_hdr, 0); +- kfree(new_seb); ++ kmem_cache_free(si->scan_leb_slab, new_seb); + ubi_free_vid_hdr(ubi, vid_hdr); + return err; + +@@ -359,7 +359,7 @@ write_error: + list_add(&new_seb->u.list, &si->erase); + goto retry; + } +- kfree(new_seb); ++ kmem_cache_free(si->scan_leb_slab, new_seb); + out_free: + ubi_free_vid_hdr(ubi, vid_hdr); + return err; +diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c +index 330140e..9bcc39a 100644 +--- a/drivers/net/can/mcp251x.c ++++ b/drivers/net/can/mcp251x.c +@@ -83,6 +83,11 @@ + #define INSTRUCTION_LOAD_TXB(n) (0x40 + 2 * (n)) + #define INSTRUCTION_READ_RXB(n) (((n) == 0) ? 0x90 : 0x94) + #define INSTRUCTION_RESET 0xC0 ++#define RTS_TXB0 0x01 ++#define RTS_TXB1 0x02 ++#define RTS_TXB2 0x04 ++#define INSTRUCTION_RTS(n) (0x80 | ((n) & 0x07)) ++ + + /* MPC251x registers */ + #define CANSTAT 0x0e +@@ -397,6 +402,7 @@ static void mcp251x_hw_tx_frame(struct spi_device *spi, u8 *buf, + static void mcp251x_hw_tx(struct spi_device *spi, struct can_frame *frame, + int tx_buf_idx) + { ++ struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev); + u32 sid, eid, exide, rtr; + u8 buf[SPI_TRANSFER_BUF_LEN]; + +@@ -418,7 +424,10 @@ static void mcp251x_hw_tx(struct spi_device *spi, struct can_frame *frame, + buf[TXBDLC_OFF] = (rtr << DLC_RTR_SHIFT) | frame->can_dlc; + memcpy(buf + TXBDAT_OFF, frame->data, frame->can_dlc); + mcp251x_hw_tx_frame(spi, buf, frame->can_dlc, tx_buf_idx); +- mcp251x_write_reg(spi, TXBCTRL(tx_buf_idx), TXBCTRL_TXREQ); ++ ++ /* use INSTRUCTION_RTS, to avoid "repeated frame problem" */ ++ priv->spi_tx_buf[0] = INSTRUCTION_RTS(1 << tx_buf_idx); ++ mcp251x_spi_trans(priv->spi, 1); + } + + static void mcp251x_hw_rx_frame(struct spi_device *spi, u8 *buf, +diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c +index 83199fd..d0722a7 100644 +--- a/drivers/net/ethernet/freescale/gianfar.c ++++ b/drivers/net/ethernet/freescale/gianfar.c +@@ -1041,7 +1041,7 @@ static int gfar_probe(struct platform_device *ofdev) + + if (priv->device_flags & FSL_GIANFAR_DEV_HAS_VLAN) { + dev->hw_features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX; +- dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX; ++ dev->features |= NETIF_F_HW_VLAN_RX; + } + + if (priv->device_flags & FSL_GIANFAR_DEV_HAS_EXTENDED_HASH) { +diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c +index b1cd41b..021463b 100644 +--- a/drivers/net/ethernet/ibm/ibmveth.c ++++ b/drivers/net/ethernet/ibm/ibmveth.c +@@ -472,14 +472,9 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter) + } + + if 
(adapter->rx_queue.queue_addr != NULL) { +- if (!dma_mapping_error(dev, adapter->rx_queue.queue_dma)) { +- dma_unmap_single(dev, +- adapter->rx_queue.queue_dma, +- adapter->rx_queue.queue_len, +- DMA_BIDIRECTIONAL); +- adapter->rx_queue.queue_dma = DMA_ERROR_CODE; +- } +- kfree(adapter->rx_queue.queue_addr); ++ dma_free_coherent(dev, adapter->rx_queue.queue_len, ++ adapter->rx_queue.queue_addr, ++ adapter->rx_queue.queue_dma); + adapter->rx_queue.queue_addr = NULL; + } + +@@ -556,10 +551,13 @@ static int ibmveth_open(struct net_device *netdev) + goto err_out; + } + ++ dev = &adapter->vdev->dev; ++ + adapter->rx_queue.queue_len = sizeof(struct ibmveth_rx_q_entry) * + rxq_entries; +- adapter->rx_queue.queue_addr = kmalloc(adapter->rx_queue.queue_len, +- GFP_KERNEL); ++ adapter->rx_queue.queue_addr = ++ dma_alloc_coherent(dev, adapter->rx_queue.queue_len, ++ &adapter->rx_queue.queue_dma, GFP_KERNEL); + + if (!adapter->rx_queue.queue_addr) { + netdev_err(netdev, "unable to allocate rx queue pages\n"); +@@ -567,19 +565,13 @@ static int ibmveth_open(struct net_device *netdev) + goto err_out; + } + +- dev = &adapter->vdev->dev; +- + adapter->buffer_list_dma = dma_map_single(dev, + adapter->buffer_list_addr, 4096, DMA_BIDIRECTIONAL); + adapter->filter_list_dma = dma_map_single(dev, + adapter->filter_list_addr, 4096, DMA_BIDIRECTIONAL); +- adapter->rx_queue.queue_dma = dma_map_single(dev, +- adapter->rx_queue.queue_addr, +- adapter->rx_queue.queue_len, DMA_BIDIRECTIONAL); + + if ((dma_mapping_error(dev, adapter->buffer_list_dma)) || +- (dma_mapping_error(dev, adapter->filter_list_dma)) || +- (dma_mapping_error(dev, adapter->rx_queue.queue_dma))) { ++ (dma_mapping_error(dev, adapter->filter_list_dma))) { + netdev_err(netdev, "unable to map filter or buffer list " + "pages\n"); + rc = -ENOMEM; +diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h +index f478a22..8e362bb 100644 +--- a/drivers/net/ethernet/intel/e1000e/e1000.h ++++ b/drivers/net/ethernet/intel/e1000e/e1000.h +@@ -302,6 +302,7 @@ struct e1000_adapter { + */ + struct e1000_ring *tx_ring /* One per active queue */ + ____cacheline_aligned_in_smp; ++ u32 tx_fifo_limit; + + struct napi_struct napi; + +diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c +index 64d3f98..0182649 100644 +--- a/drivers/net/ethernet/intel/e1000e/netdev.c ++++ b/drivers/net/ethernet/intel/e1000e/netdev.c +@@ -3386,6 +3386,15 @@ void e1000e_reset(struct e1000_adapter *adapter) + } + + /* ++ * Alignment of Tx data is on an arbitrary byte boundary with the ++ * maximum size per Tx descriptor limited only to the transmit ++ * allocation of the packet buffer minus 96 bytes with an upper ++ * limit of 24KB due to receive synchronization limitations. ++ */ ++ adapter->tx_fifo_limit = min_t(u32, ((er32(PBA) >> 16) << 10) - 96, ++ 24 << 10); ++ ++ /* + * Disable Adaptive Interrupt Moderation if 2 full packets cannot + * fit in receive buffer and early-receive not supported. 
+ */ +@@ -4647,13 +4656,9 @@ static bool e1000_tx_csum(struct e1000_adapter *adapter, struct sk_buff *skb) + return 1; + } + +-#define E1000_MAX_PER_TXD 8192 +-#define E1000_MAX_TXD_PWR 12 +- + static int e1000_tx_map(struct e1000_adapter *adapter, + struct sk_buff *skb, unsigned int first, +- unsigned int max_per_txd, unsigned int nr_frags, +- unsigned int mss) ++ unsigned int max_per_txd, unsigned int nr_frags) + { + struct e1000_ring *tx_ring = adapter->tx_ring; + struct pci_dev *pdev = adapter->pdev; +@@ -4882,20 +4887,19 @@ static int e1000_maybe_stop_tx(struct net_device *netdev, int size) + { + struct e1000_adapter *adapter = netdev_priv(netdev); + ++ BUG_ON(size > adapter->tx_ring->count); ++ + if (e1000_desc_unused(adapter->tx_ring) >= size) + return 0; + return __e1000_maybe_stop_tx(netdev, size); + } + +-#define TXD_USE_COUNT(S, X) (((S) >> (X)) + 1 ) + static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb, + struct net_device *netdev) + { + struct e1000_adapter *adapter = netdev_priv(netdev); + struct e1000_ring *tx_ring = adapter->tx_ring; + unsigned int first; +- unsigned int max_per_txd = E1000_MAX_PER_TXD; +- unsigned int max_txd_pwr = E1000_MAX_TXD_PWR; + unsigned int tx_flags = 0; + unsigned int len = skb_headlen(skb); + unsigned int nr_frags; +@@ -4915,18 +4919,8 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb, + } + + mss = skb_shinfo(skb)->gso_size; +- /* +- * The controller does a simple calculation to +- * make sure there is enough room in the FIFO before +- * initiating the DMA for each buffer. The calc is: +- * 4 = ceil(buffer len/mss). To make sure we don't +- * overrun the FIFO, adjust the max buffer len if mss +- * drops. +- */ + if (mss) { + u8 hdr_len; +- max_per_txd = min(mss << 2, max_per_txd); +- max_txd_pwr = fls(max_per_txd) - 1; + + /* + * TSO Workaround for 82571/2/3 Controllers -- if skb->data +@@ -4956,12 +4950,12 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb, + count++; + count++; + +- count += TXD_USE_COUNT(len, max_txd_pwr); ++ count += DIV_ROUND_UP(len, adapter->tx_fifo_limit); + + nr_frags = skb_shinfo(skb)->nr_frags; + for (f = 0; f < nr_frags; f++) +- count += TXD_USE_COUNT(skb_frag_size(&skb_shinfo(skb)->frags[f]), +- max_txd_pwr); ++ count += DIV_ROUND_UP(skb_frag_size(&skb_shinfo(skb)->frags[f]), ++ adapter->tx_fifo_limit); + + if (adapter->hw.mac.tx_pkt_filtering) + e1000_transfer_dhcp_info(adapter, skb); +@@ -5000,12 +4994,15 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb, + tx_flags |= E1000_TX_FLAGS_IPV4; + + /* if count is 0 then mapping error has occurred */ +- count = e1000_tx_map(adapter, skb, first, max_per_txd, nr_frags, mss); ++ count = e1000_tx_map(adapter, skb, first, adapter->tx_fifo_limit, ++ nr_frags); + if (count) { + e1000_tx_queue(adapter, tx_flags, count); + /* Make sure there is space in the ring for the next send. 
*/ +- e1000_maybe_stop_tx(netdev, MAX_SKB_FRAGS + 2); +- ++ e1000_maybe_stop_tx(netdev, ++ (MAX_SKB_FRAGS * ++ DIV_ROUND_UP(PAGE_SIZE, ++ adapter->tx_fifo_limit) + 2)); + } else { + dev_kfree_skb_any(skb); + tx_ring->buffer_info[first].time_stamp = 0; +@@ -6150,8 +6147,8 @@ static int __devinit e1000_probe(struct pci_dev *pdev, + adapter->hw.phy.autoneg_advertised = 0x2f; + + /* ring size defaults */ +- adapter->rx_ring->count = 256; +- adapter->tx_ring->count = 256; ++ adapter->rx_ring->count = E1000_DEFAULT_RXD; ++ adapter->tx_ring->count = E1000_DEFAULT_TXD; + + /* + * Initial Wake on LAN setting - If APM wake is enabled in +diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c +index d5731f1..a6611f1 100644 +--- a/drivers/net/ethernet/sfc/efx.c ++++ b/drivers/net/ethernet/sfc/efx.c +@@ -1383,6 +1383,11 @@ static int efx_probe_all(struct efx_nic *efx) + goto fail2; + } + ++ BUILD_BUG_ON(EFX_DEFAULT_DMAQ_SIZE < EFX_RXQ_MIN_ENT); ++ if (WARN_ON(EFX_DEFAULT_DMAQ_SIZE < EFX_TXQ_MIN_ENT(efx))) { ++ rc = -EINVAL; ++ goto fail3; ++ } + efx->rxq_entries = efx->txq_entries = EFX_DEFAULT_DMAQ_SIZE; + rc = efx_probe_channels(efx); + if (rc) +@@ -1973,6 +1978,7 @@ static int efx_register_netdev(struct efx_nic *efx) + net_dev->irq = efx->pci_dev->irq; + net_dev->netdev_ops = &efx_netdev_ops; + SET_ETHTOOL_OPS(net_dev, &efx_ethtool_ops); ++ net_dev->gso_max_segs = EFX_TSO_MAX_SEGS; + + /* Clear MAC statistics */ + efx->mac_op->update_stats(efx); +diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h +index 4764793..1355245 100644 +--- a/drivers/net/ethernet/sfc/efx.h ++++ b/drivers/net/ethernet/sfc/efx.h +@@ -34,6 +34,7 @@ extern netdev_tx_t + efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb); + extern void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); + extern int efx_setup_tc(struct net_device *net_dev, u8 num_tc); ++extern unsigned int efx_tx_max_skb_descs(struct efx_nic *efx); + + /* RX */ + extern int efx_probe_rx_queue(struct efx_rx_queue *rx_queue); +@@ -56,10 +57,15 @@ extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue); + #define EFX_MAX_EVQ_SIZE 16384UL + #define EFX_MIN_EVQ_SIZE 512UL + +-/* The smallest [rt]xq_entries that the driver supports. Callers of +- * efx_wake_queue() assume that they can subsequently send at least one +- * skb. Falcon/A1 may require up to three descriptors per skb_frag. */ +-#define EFX_MIN_RING_SIZE (roundup_pow_of_two(2 * 3 * MAX_SKB_FRAGS)) ++/* Maximum number of TCP segments we support for soft-TSO */ ++#define EFX_TSO_MAX_SEGS 100 ++ ++/* The smallest [rt]xq_entries that the driver supports. RX minimum ++ * is a bit arbitrary. For TX, we must have space for at least 2 ++ * TSO skbs. 
++ */ ++#define EFX_RXQ_MIN_ENT 128U ++#define EFX_TXQ_MIN_ENT(efx) (2 * efx_tx_max_skb_descs(efx)) + + /* Filters */ + extern int efx_probe_filters(struct efx_nic *efx); +diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c +index f3cd96d..90158c9 100644 +--- a/drivers/net/ethernet/sfc/ethtool.c ++++ b/drivers/net/ethernet/sfc/ethtool.c +@@ -690,21 +690,27 @@ static int efx_ethtool_set_ringparam(struct net_device *net_dev, + struct ethtool_ringparam *ring) + { + struct efx_nic *efx = netdev_priv(net_dev); ++ u32 txq_entries; + + if (ring->rx_mini_pending || ring->rx_jumbo_pending || + ring->rx_pending > EFX_MAX_DMAQ_SIZE || + ring->tx_pending > EFX_MAX_DMAQ_SIZE) + return -EINVAL; + +- if (ring->rx_pending < EFX_MIN_RING_SIZE || +- ring->tx_pending < EFX_MIN_RING_SIZE) { ++ if (ring->rx_pending < EFX_RXQ_MIN_ENT) { + netif_err(efx, drv, efx->net_dev, +- "TX and RX queues cannot be smaller than %ld\n", +- EFX_MIN_RING_SIZE); ++ "RX queues cannot be smaller than %u\n", ++ EFX_RXQ_MIN_ENT); + return -EINVAL; + } + +- return efx_realloc_channels(efx, ring->rx_pending, ring->tx_pending); ++ txq_entries = max(ring->tx_pending, EFX_TXQ_MIN_ENT(efx)); ++ if (txq_entries != ring->tx_pending) ++ netif_warn(efx, drv, efx->net_dev, ++ "increasing TX queue size to minimum of %u\n", ++ txq_entries); ++ ++ return efx_realloc_channels(efx, ring->rx_pending, txq_entries); + } + + static int efx_ethtool_set_pauseparam(struct net_device *net_dev, +diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h +index 5fb24d3..66ece48 100644 +--- a/drivers/net/ethernet/sfc/nic.h ++++ b/drivers/net/ethernet/sfc/nic.h +@@ -65,6 +65,9 @@ enum { + #define FALCON_GMAC_LOOPBACKS \ + (1 << LOOPBACK_GMAC) + ++/* Alignment of PCIe DMA boundaries (4KB) */ ++#define EFX_PAGE_SIZE 4096 ++ + /** + * struct falcon_board_type - board operations and type information + * @id: Board type id, as found in NVRAM +diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c +index df88c543..807d515 100644 +--- a/drivers/net/ethernet/sfc/tx.c ++++ b/drivers/net/ethernet/sfc/tx.c +@@ -115,6 +115,25 @@ efx_max_tx_len(struct efx_nic *efx, dma_addr_t dma_addr) + return len; + } + ++unsigned int efx_tx_max_skb_descs(struct efx_nic *efx) ++{ ++ /* Header and payload descriptor for each output segment, plus ++ * one for every input fragment boundary within a segment ++ */ ++ unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS; ++ ++ /* Possibly one more per segment for the alignment workaround */ ++ if (EFX_WORKAROUND_5391(efx)) ++ max_descs += EFX_TSO_MAX_SEGS; ++ ++ /* Possibly more for PCIe page boundaries within input fragments */ ++ if (PAGE_SIZE > EFX_PAGE_SIZE) ++ max_descs += max_t(unsigned int, MAX_SKB_FRAGS, ++ DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE)); ++ ++ return max_descs; ++} ++ + /* + * Add a socket buffer to a TX queue + * +diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c +index f8a6853..ad6a9d9 100644 +--- a/drivers/net/ppp/pptp.c ++++ b/drivers/net/ppp/pptp.c +@@ -189,7 +189,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) + if (sk_pppox(po)->sk_state & PPPOX_DEAD) + goto tx_error; + +- rt = ip_route_output_ports(&init_net, &fl4, NULL, ++ rt = ip_route_output_ports(sock_net(sk), &fl4, NULL, + opt->dst_addr.sin_addr.s_addr, + opt->src_addr.sin_addr.s_addr, + 0, 0, IPPROTO_GRE, +@@ -468,7 +468,7 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr, + po->chan.private = sk; + po->chan.ops = 
&pptp_chan_ops; + +- rt = ip_route_output_ports(&init_net, &fl4, sk, ++ rt = ip_route_output_ports(sock_net(sk), &fl4, sk, + opt->dst_addr.sin_addr.s_addr, + opt->src_addr.sin_addr.s_addr, + 0, 0, +diff --git a/drivers/net/wireless/iwlwifi/iwl-debugfs.c b/drivers/net/wireless/iwlwifi/iwl-debugfs.c +index a1670e3..93e6179 100644 +--- a/drivers/net/wireless/iwlwifi/iwl-debugfs.c ++++ b/drivers/net/wireless/iwlwifi/iwl-debugfs.c +@@ -232,6 +232,9 @@ static ssize_t iwl_dbgfs_sram_read(struct file *file, + struct iwl_priv *priv = file->private_data; + size_t bufsz; + ++ if (!iwl_is_ready_rf(priv->shrd)) ++ return -EAGAIN; ++ + /* default is to dump the entire data segment */ + if (!priv->dbgfs_sram_offset && !priv->dbgfs_sram_len) { + priv->dbgfs_sram_offset = 0x800000; +diff --git a/drivers/net/wireless/iwlwifi/iwl-trans-pcie-int.h b/drivers/net/wireless/iwlwifi/iwl-trans-pcie-int.h +index 5c29281..8533ba2 100644 +--- a/drivers/net/wireless/iwlwifi/iwl-trans-pcie-int.h ++++ b/drivers/net/wireless/iwlwifi/iwl-trans-pcie-int.h +@@ -303,7 +303,7 @@ int iwl_queue_space(const struct iwl_queue *q); + ******************************************************/ + int iwl_dump_nic_event_log(struct iwl_trans *trans, bool full_log, + char **buf, bool display); +-int iwl_dump_fh(struct iwl_trans *trans, char **buf, bool display); ++int iwl_dump_fh(struct iwl_trans *trans, char **buf); + void iwl_dump_csr(struct iwl_trans *trans); + + /***************************************************** +diff --git a/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c b/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c +index 1daf01e..17fb25d 100644 +--- a/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c ++++ b/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c +@@ -678,7 +678,7 @@ static void iwl_irq_handle_error(struct iwl_trans *trans) + + iwl_dump_nic_error_log(trans); + iwl_dump_csr(trans); +- iwl_dump_fh(trans, NULL, false); ++ iwl_dump_fh(trans, NULL); + iwl_dump_nic_event_log(trans, false, NULL, false); + #ifdef CONFIG_IWLWIFI_DEBUG + if (iwl_get_debug_level(trans->shrd) & IWL_DL_FW_ERRORS) +diff --git a/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c b/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c +index 4661a64..75da4bc 100644 +--- a/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c ++++ b/drivers/net/wireless/iwlwifi/iwl-trans-pcie.c +@@ -1541,13 +1541,9 @@ static const char *get_fh_string(int cmd) + } + } + +-int iwl_dump_fh(struct iwl_trans *trans, char **buf, bool display) ++int iwl_dump_fh(struct iwl_trans *trans, char **buf) + { + int i; +-#ifdef CONFIG_IWLWIFI_DEBUG +- int pos = 0; +- size_t bufsz = 0; +-#endif + static const u32 fh_tbl[] = { + FH_RSCSR_CHNL0_STTS_WPTR_REG, + FH_RSCSR_CHNL0_RBDCB_BASE_REG, +@@ -1559,29 +1555,35 @@ int iwl_dump_fh(struct iwl_trans *trans, char **buf, bool display) + FH_TSSR_TX_STATUS_REG, + FH_TSSR_TX_ERROR_REG + }; +-#ifdef CONFIG_IWLWIFI_DEBUG +- if (display) { +- bufsz = ARRAY_SIZE(fh_tbl) * 48 + 40; ++ ++#ifdef CONFIG_IWLWIFI_DEBUGFS ++ if (buf) { ++ int pos = 0; ++ size_t bufsz = ARRAY_SIZE(fh_tbl) * 48 + 40; ++ + *buf = kmalloc(bufsz, GFP_KERNEL); + if (!*buf) + return -ENOMEM; ++ + pos += scnprintf(*buf + pos, bufsz - pos, + "FH register values:\n"); +- for (i = 0; i < ARRAY_SIZE(fh_tbl); i++) { ++ ++ for (i = 0; i < ARRAY_SIZE(fh_tbl); i++) + pos += scnprintf(*buf + pos, bufsz - pos, + " %34s: 0X%08x\n", + get_fh_string(fh_tbl[i]), + iwl_read_direct32(bus(trans), fh_tbl[i])); +- } ++ + return pos; + } + #endif ++ + IWL_ERR(trans, "FH register values:\n"); +- for (i = 0; i < 
ARRAY_SIZE(fh_tbl); i++) { ++ for (i = 0; i < ARRAY_SIZE(fh_tbl); i++) + IWL_ERR(trans, " %34s: 0X%08x\n", + get_fh_string(fh_tbl[i]), + iwl_read_direct32(bus(trans), fh_tbl[i])); +- } ++ + return 0; + } + +@@ -1929,11 +1931,11 @@ static ssize_t iwl_dbgfs_fh_reg_read(struct file *file, + size_t count, loff_t *ppos) + { + struct iwl_trans *trans = file->private_data; +- char *buf; ++ char *buf = NULL; + int pos = 0; + ssize_t ret = -EFAULT; + +- ret = pos = iwl_dump_fh(trans, &buf, true); ++ ret = pos = iwl_dump_fh(trans, &buf); + if (buf) { + ret = simple_read_from_buffer(user_buf, + count, ppos, buf, pos); +diff --git a/drivers/net/wireless/rt2x00/rt2400pci.c b/drivers/net/wireless/rt2x00/rt2400pci.c +index 3a6b402..0ea85f4 100644 +--- a/drivers/net/wireless/rt2x00/rt2400pci.c ++++ b/drivers/net/wireless/rt2x00/rt2400pci.c +@@ -1611,6 +1611,7 @@ static int rt2400pci_probe_hw_mode(struct rt2x00_dev *rt2x00dev) + static int rt2400pci_probe_hw(struct rt2x00_dev *rt2x00dev) + { + int retval; ++ u32 reg; + + /* + * Allocate eeprom data. +@@ -1624,6 +1625,14 @@ static int rt2400pci_probe_hw(struct rt2x00_dev *rt2x00dev) + return retval; + + /* ++ * Enable rfkill polling by setting GPIO direction of the ++ * rfkill switch GPIO pin correctly. ++ */ ++ rt2x00pci_register_read(rt2x00dev, GPIOCSR, ®); ++ rt2x00_set_field32(®, GPIOCSR_BIT8, 1); ++ rt2x00pci_register_write(rt2x00dev, GPIOCSR, reg); ++ ++ /* + * Initialize hw specifications. + */ + retval = rt2400pci_probe_hw_mode(rt2x00dev); +diff --git a/drivers/net/wireless/rt2x00/rt2400pci.h b/drivers/net/wireless/rt2x00/rt2400pci.h +index d3a4a68..7564ae9 100644 +--- a/drivers/net/wireless/rt2x00/rt2400pci.h ++++ b/drivers/net/wireless/rt2x00/rt2400pci.h +@@ -670,6 +670,7 @@ + #define GPIOCSR_BIT5 FIELD32(0x00000020) + #define GPIOCSR_BIT6 FIELD32(0x00000040) + #define GPIOCSR_BIT7 FIELD32(0x00000080) ++#define GPIOCSR_BIT8 FIELD32(0x00000100) + + /* + * BBPPCSR: BBP Pin control register. +diff --git a/drivers/net/wireless/rt2x00/rt2500pci.c b/drivers/net/wireless/rt2x00/rt2500pci.c +index dcc0e1f..aa10c48 100644 +--- a/drivers/net/wireless/rt2x00/rt2500pci.c ++++ b/drivers/net/wireless/rt2x00/rt2500pci.c +@@ -1929,6 +1929,7 @@ static int rt2500pci_probe_hw_mode(struct rt2x00_dev *rt2x00dev) + static int rt2500pci_probe_hw(struct rt2x00_dev *rt2x00dev) + { + int retval; ++ u32 reg; + + /* + * Allocate eeprom data. +@@ -1942,6 +1943,14 @@ static int rt2500pci_probe_hw(struct rt2x00_dev *rt2x00dev) + return retval; + + /* ++ * Enable rfkill polling by setting GPIO direction of the ++ * rfkill switch GPIO pin correctly. ++ */ ++ rt2x00pci_register_read(rt2x00dev, GPIOCSR, ®); ++ rt2x00_set_field32(®, GPIOCSR_DIR0, 1); ++ rt2x00pci_register_write(rt2x00dev, GPIOCSR, reg); ++ ++ /* + * Initialize hw specifications. 
+ */ + retval = rt2500pci_probe_hw_mode(rt2x00dev); +diff --git a/drivers/net/wireless/rt2x00/rt2500usb.c b/drivers/net/wireless/rt2x00/rt2500usb.c +index 53c5f87..22ed6df 100644 +--- a/drivers/net/wireless/rt2x00/rt2500usb.c ++++ b/drivers/net/wireless/rt2x00/rt2500usb.c +@@ -283,7 +283,7 @@ static int rt2500usb_rfkill_poll(struct rt2x00_dev *rt2x00dev) + u16 reg; + + rt2500usb_register_read(rt2x00dev, MAC_CSR19, ®); +- return rt2x00_get_field32(reg, MAC_CSR19_BIT7); ++ return rt2x00_get_field16(reg, MAC_CSR19_BIT7); + } + + #ifdef CONFIG_RT2X00_LIB_LEDS +@@ -1768,6 +1768,7 @@ static int rt2500usb_probe_hw_mode(struct rt2x00_dev *rt2x00dev) + static int rt2500usb_probe_hw(struct rt2x00_dev *rt2x00dev) + { + int retval; ++ u16 reg; + + /* + * Allocate eeprom data. +@@ -1781,6 +1782,14 @@ static int rt2500usb_probe_hw(struct rt2x00_dev *rt2x00dev) + return retval; + + /* ++ * Enable rfkill polling by setting GPIO direction of the ++ * rfkill switch GPIO pin correctly. ++ */ ++ rt2500usb_register_read(rt2x00dev, MAC_CSR19, ®); ++ rt2x00_set_field16(®, MAC_CSR19_BIT8, 0); ++ rt2500usb_register_write(rt2x00dev, MAC_CSR19, reg); ++ ++ /* + * Initialize hw specifications. + */ + retval = rt2500usb_probe_hw_mode(rt2x00dev); +diff --git a/drivers/net/wireless/rt2x00/rt2500usb.h b/drivers/net/wireless/rt2x00/rt2500usb.h +index b493306..196bd51 100644 +--- a/drivers/net/wireless/rt2x00/rt2500usb.h ++++ b/drivers/net/wireless/rt2x00/rt2500usb.h +@@ -189,14 +189,15 @@ + * MAC_CSR19: GPIO control register. + */ + #define MAC_CSR19 0x0426 +-#define MAC_CSR19_BIT0 FIELD32(0x0001) +-#define MAC_CSR19_BIT1 FIELD32(0x0002) +-#define MAC_CSR19_BIT2 FIELD32(0x0004) +-#define MAC_CSR19_BIT3 FIELD32(0x0008) +-#define MAC_CSR19_BIT4 FIELD32(0x0010) +-#define MAC_CSR19_BIT5 FIELD32(0x0020) +-#define MAC_CSR19_BIT6 FIELD32(0x0040) +-#define MAC_CSR19_BIT7 FIELD32(0x0080) ++#define MAC_CSR19_BIT0 FIELD16(0x0001) ++#define MAC_CSR19_BIT1 FIELD16(0x0002) ++#define MAC_CSR19_BIT2 FIELD16(0x0004) ++#define MAC_CSR19_BIT3 FIELD16(0x0008) ++#define MAC_CSR19_BIT4 FIELD16(0x0010) ++#define MAC_CSR19_BIT5 FIELD16(0x0020) ++#define MAC_CSR19_BIT6 FIELD16(0x0040) ++#define MAC_CSR19_BIT7 FIELD16(0x0080) ++#define MAC_CSR19_BIT8 FIELD16(0x0100) + + /* + * MAC_CSR20: LED control register. +diff --git a/drivers/net/wireless/rt2x00/rt2800pci.c b/drivers/net/wireless/rt2x00/rt2800pci.c +index 837b460..518157d 100644 +--- a/drivers/net/wireless/rt2x00/rt2800pci.c ++++ b/drivers/net/wireless/rt2x00/rt2800pci.c +@@ -935,6 +935,7 @@ static int rt2800pci_validate_eeprom(struct rt2x00_dev *rt2x00dev) + static int rt2800pci_probe_hw(struct rt2x00_dev *rt2x00dev) + { + int retval; ++ u32 reg; + + /* + * Allocate eeprom data. +@@ -948,6 +949,14 @@ static int rt2800pci_probe_hw(struct rt2x00_dev *rt2x00dev) + return retval; + + /* ++ * Enable rfkill polling by setting GPIO direction of the ++ * rfkill switch GPIO pin correctly. ++ */ ++ rt2x00pci_register_read(rt2x00dev, GPIO_CTRL_CFG, ®); ++ rt2x00_set_field32(®, GPIO_CTRL_CFG_GPIOD_BIT2, 1); ++ rt2x00pci_register_write(rt2x00dev, GPIO_CTRL_CFG, reg); ++ ++ /* + * Initialize hw specifications. 
+ */ + retval = rt2800_probe_hw_mode(rt2x00dev); +diff --git a/drivers/net/wireless/rt2x00/rt2800usb.c b/drivers/net/wireless/rt2x00/rt2800usb.c +index ae7528b..b66a61b 100644 +--- a/drivers/net/wireless/rt2x00/rt2800usb.c ++++ b/drivers/net/wireless/rt2x00/rt2800usb.c +@@ -621,8 +621,16 @@ static void rt2800usb_fill_rxdone(struct queue_entry *entry, + skb_pull(entry->skb, RXINFO_DESC_SIZE); + + /* +- * FIXME: we need to check for rx_pkt_len validity ++ * Check for rx_pkt_len validity. Return if invalid, leaving ++ * rxdesc->size zeroed out by the upper level. + */ ++ if (unlikely(rx_pkt_len == 0 || ++ rx_pkt_len > entry->queue->data_size)) { ++ ERROR(entry->queue->rt2x00dev, ++ "Bad frame size %d, forcing to 0\n", rx_pkt_len); ++ return; ++ } ++ + rxd = (__le32 *)(entry->skb->data + rx_pkt_len); + + /* +@@ -690,6 +698,7 @@ static int rt2800usb_validate_eeprom(struct rt2x00_dev *rt2x00dev) + static int rt2800usb_probe_hw(struct rt2x00_dev *rt2x00dev) + { + int retval; ++ u32 reg; + + /* + * Allocate eeprom data. +@@ -703,6 +712,14 @@ static int rt2800usb_probe_hw(struct rt2x00_dev *rt2x00dev) + return retval; + + /* ++ * Enable rfkill polling by setting GPIO direction of the ++ * rfkill switch GPIO pin correctly. ++ */ ++ rt2x00usb_register_read(rt2x00dev, GPIO_CTRL_CFG, ®); ++ rt2x00_set_field32(®, GPIO_CTRL_CFG_GPIOD_BIT2, 1); ++ rt2x00usb_register_write(rt2x00dev, GPIO_CTRL_CFG, reg); ++ ++ /* + * Initialize hw specifications. + */ + retval = rt2800_probe_hw_mode(rt2x00dev); +@@ -1111,6 +1128,8 @@ static struct usb_device_id rt2800usb_device_table[] = { + { USB_DEVICE(0x1690, 0x0744) }, + { USB_DEVICE(0x1690, 0x0761) }, + { USB_DEVICE(0x1690, 0x0764) }, ++ /* ASUS */ ++ { USB_DEVICE(0x0b05, 0x179d) }, + /* Cisco */ + { USB_DEVICE(0x167b, 0x4001) }, + /* EnGenius */ +@@ -1163,7 +1182,6 @@ static struct usb_device_id rt2800usb_device_table[] = { + { USB_DEVICE(0x0b05, 0x1760) }, + { USB_DEVICE(0x0b05, 0x1761) }, + { USB_DEVICE(0x0b05, 0x1790) }, +- { USB_DEVICE(0x0b05, 0x179d) }, + /* AzureWave */ + { USB_DEVICE(0x13d3, 0x3262) }, + { USB_DEVICE(0x13d3, 0x3284) }, +diff --git a/drivers/net/wireless/rt2x00/rt2x00dev.c b/drivers/net/wireless/rt2x00/rt2x00dev.c +index 21b529b..f099b30 100644 +--- a/drivers/net/wireless/rt2x00/rt2x00dev.c ++++ b/drivers/net/wireless/rt2x00/rt2x00dev.c +@@ -624,7 +624,7 @@ void rt2x00lib_rxdone(struct queue_entry *entry) + */ + if (unlikely(rxdesc.size == 0 || + rxdesc.size > entry->queue->data_size)) { +- WARNING(rt2x00dev, "Wrong frame size %d max %d.\n", ++ ERROR(rt2x00dev, "Wrong frame size %d max %d.\n", + rxdesc.size, entry->queue->data_size); + dev_kfree_skb(entry->skb); + goto renew_skb; +diff --git a/drivers/net/wireless/rt2x00/rt61pci.c b/drivers/net/wireless/rt2x00/rt61pci.c +index d69f88c..3e058e5 100644 +--- a/drivers/net/wireless/rt2x00/rt61pci.c ++++ b/drivers/net/wireless/rt2x00/rt61pci.c +@@ -2832,6 +2832,7 @@ static int rt61pci_probe_hw_mode(struct rt2x00_dev *rt2x00dev) + static int rt61pci_probe_hw(struct rt2x00_dev *rt2x00dev) + { + int retval; ++ u32 reg; + + /* + * Disable power saving. +@@ -2850,6 +2851,14 @@ static int rt61pci_probe_hw(struct rt2x00_dev *rt2x00dev) + return retval; + + /* ++ * Enable rfkill polling by setting GPIO direction of the ++ * rfkill switch GPIO pin correctly. ++ */ ++ rt2x00pci_register_read(rt2x00dev, MAC_CSR13, ®); ++ rt2x00_set_field32(®, MAC_CSR13_BIT13, 1); ++ rt2x00pci_register_write(rt2x00dev, MAC_CSR13, reg); ++ ++ /* + * Initialize hw specifications. 
+ */ + retval = rt61pci_probe_hw_mode(rt2x00dev); +diff --git a/drivers/net/wireless/rt2x00/rt61pci.h b/drivers/net/wireless/rt2x00/rt61pci.h +index e3cd6db..8f3da5a 100644 +--- a/drivers/net/wireless/rt2x00/rt61pci.h ++++ b/drivers/net/wireless/rt2x00/rt61pci.h +@@ -372,6 +372,7 @@ struct hw_pairwise_ta_entry { + #define MAC_CSR13_BIT10 FIELD32(0x00000400) + #define MAC_CSR13_BIT11 FIELD32(0x00000800) + #define MAC_CSR13_BIT12 FIELD32(0x00001000) ++#define MAC_CSR13_BIT13 FIELD32(0x00002000) + + /* + * MAC_CSR14: LED control register. +diff --git a/drivers/net/wireless/rt2x00/rt73usb.c b/drivers/net/wireless/rt2x00/rt73usb.c +index cfb19db..2ad468d 100644 +--- a/drivers/net/wireless/rt2x00/rt73usb.c ++++ b/drivers/net/wireless/rt2x00/rt73usb.c +@@ -2177,6 +2177,7 @@ static int rt73usb_probe_hw_mode(struct rt2x00_dev *rt2x00dev) + static int rt73usb_probe_hw(struct rt2x00_dev *rt2x00dev) + { + int retval; ++ u32 reg; + + /* + * Allocate eeprom data. +@@ -2190,6 +2191,14 @@ static int rt73usb_probe_hw(struct rt2x00_dev *rt2x00dev) + return retval; + + /* ++ * Enable rfkill polling by setting GPIO direction of the ++ * rfkill switch GPIO pin correctly. ++ */ ++ rt2x00usb_register_read(rt2x00dev, MAC_CSR13, ®); ++ rt2x00_set_field32(®, MAC_CSR13_BIT15, 0); ++ rt2x00usb_register_write(rt2x00dev, MAC_CSR13, reg); ++ ++ /* + * Initialize hw specifications. + */ + retval = rt73usb_probe_hw_mode(rt2x00dev); +diff --git a/drivers/net/wireless/rt2x00/rt73usb.h b/drivers/net/wireless/rt2x00/rt73usb.h +index 9f6b470..df1cc11 100644 +--- a/drivers/net/wireless/rt2x00/rt73usb.h ++++ b/drivers/net/wireless/rt2x00/rt73usb.h +@@ -282,6 +282,9 @@ struct hw_pairwise_ta_entry { + #define MAC_CSR13_BIT10 FIELD32(0x00000400) + #define MAC_CSR13_BIT11 FIELD32(0x00000800) + #define MAC_CSR13_BIT12 FIELD32(0x00001000) ++#define MAC_CSR13_BIT13 FIELD32(0x00002000) ++#define MAC_CSR13_BIT14 FIELD32(0x00004000) ++#define MAC_CSR13_BIT15 FIELD32(0x00008000) + + /* + * MAC_CSR14: LED control register. +diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c +index 29a994f..7c471eb 100644 +--- a/drivers/scsi/megaraid/megaraid_sas_base.c ++++ b/drivers/scsi/megaraid/megaraid_sas_base.c +@@ -4125,7 +4125,6 @@ megasas_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) + spin_lock_init(&instance->cmd_pool_lock); + spin_lock_init(&instance->hba_lock); + spin_lock_init(&instance->completion_lock); +- spin_lock_init(&poll_aen_lock); + + mutex_init(&instance->aen_mutex); + mutex_init(&instance->reset_mutex); +@@ -5520,6 +5519,8 @@ static int __init megasas_init(void) + printk(KERN_INFO "megasas: %s %s\n", MEGASAS_VERSION, + MEGASAS_EXT_VERSION); + ++ spin_lock_init(&poll_aen_lock); ++ + support_poll_for_event = 2; + support_device_change = 1; + +diff --git a/drivers/scsi/mpt2sas/mpt2sas_base.c b/drivers/scsi/mpt2sas/mpt2sas_base.c +index e903077..98cb5e6 100644 +--- a/drivers/scsi/mpt2sas/mpt2sas_base.c ++++ b/drivers/scsi/mpt2sas/mpt2sas_base.c +@@ -2353,10 +2353,13 @@ _base_allocate_memory_pools(struct MPT2SAS_ADAPTER *ioc, int sleep_flag) + } + + /* command line tunables for max controller queue depth */ +- if (max_queue_depth != -1) +- max_request_credit = (max_queue_depth < facts->RequestCredit) +- ? 
max_queue_depth : facts->RequestCredit; +- else ++ if (max_queue_depth != -1 && max_queue_depth != 0) { ++ max_request_credit = min_t(u16, max_queue_depth + ++ ioc->hi_priority_depth + ioc->internal_depth, ++ facts->RequestCredit); ++ if (max_request_credit > MAX_HBA_QUEUE_DEPTH) ++ max_request_credit = MAX_HBA_QUEUE_DEPTH; ++ } else + max_request_credit = min_t(u16, facts->RequestCredit, + MAX_HBA_QUEUE_DEPTH); + +@@ -2431,7 +2434,7 @@ _base_allocate_memory_pools(struct MPT2SAS_ADAPTER *ioc, int sleep_flag) + /* set the scsi host can_queue depth + * with some internal commands that could be outstanding + */ +- ioc->shost->can_queue = ioc->scsiio_depth - (2); ++ ioc->shost->can_queue = ioc->scsiio_depth; + dinitprintk(ioc, printk(MPT2SAS_INFO_FMT "scsi host: " + "can_queue depth (%d)\n", ioc->name, ioc->shost->can_queue)); + +diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c +index 456b131..c83571e 100644 +--- a/drivers/scsi/scsi_error.c ++++ b/drivers/scsi/scsi_error.c +@@ -41,6 +41,8 @@ + + #include + ++static void scsi_eh_done(struct scsi_cmnd *scmd); ++ + #define SENSE_TIMEOUT (10*HZ) + + /* +@@ -240,6 +242,14 @@ static int scsi_check_sense(struct scsi_cmnd *scmd) + if (! scsi_command_normalize_sense(scmd, &sshdr)) + return FAILED; /* no valid sense data */ + ++ if (scmd->cmnd[0] == TEST_UNIT_READY && scmd->scsi_done != scsi_eh_done) ++ /* ++ * nasty: for mid-layer issued TURs, we need to return the ++ * actual sense data without any recovery attempt. For eh ++ * issued ones, we need to try to recover and interpret ++ */ ++ return SUCCESS; ++ + if (scsi_sense_is_deferred(&sshdr)) + return NEEDS_RETRY; + +diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c +index a48b59c..c6c80c9 100644 +--- a/drivers/scsi/scsi_scan.c ++++ b/drivers/scsi/scsi_scan.c +@@ -776,6 +776,16 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result, + sdev->model = (char *) (sdev->inquiry + 16); + sdev->rev = (char *) (sdev->inquiry + 32); + ++ if (strncmp(sdev->vendor, "ATA ", 8) == 0) { ++ /* ++ * sata emulation layer device. This is a hack to work around ++ * the SATL power management specifications which state that ++ * when the SATL detects the device has gone into standby ++ * mode, it shall respond with NOT READY. 
++ */ ++ sdev->allow_restart = 1; ++ } ++ + if (*bflags & BLIST_ISROM) { + sdev->type = TYPE_ROM; + sdev->removable = 1; +diff --git a/drivers/staging/comedi/drivers/das08.c b/drivers/staging/comedi/drivers/das08.c +index 3141dc8..a48fe88 100644 +--- a/drivers/staging/comedi/drivers/das08.c ++++ b/drivers/staging/comedi/drivers/das08.c +@@ -385,7 +385,7 @@ static const struct das08_board_struct das08_boards[] = { + .ai = das08_ai_rinsn, + .ai_nbits = 16, + .ai_pg = das08_pg_none, +- .ai_encoding = das08_encode12, ++ .ai_encoding = das08_encode16, + .ao = das08jr_ao_winsn, + .ao_nbits = 16, + .di = das08jr_di_rbits, +@@ -655,7 +655,7 @@ static int das08jr_ao_winsn(struct comedi_device *dev, + int chan; + + lsb = data[0] & 0xff; +- msb = (data[0] >> 8) & 0xf; ++ msb = (data[0] >> 8) & 0xff; + + chan = CR_CHAN(insn->chanspec); + +diff --git a/drivers/staging/rtl8712/recv_linux.c b/drivers/staging/rtl8712/recv_linux.c +index 0e26d5f..495ee12 100644 +--- a/drivers/staging/rtl8712/recv_linux.c ++++ b/drivers/staging/rtl8712/recv_linux.c +@@ -117,13 +117,8 @@ void r8712_recv_indicatepkt(struct _adapter *padapter, + if (skb == NULL) + goto _recv_indicatepkt_drop; + skb->data = precv_frame->u.hdr.rx_data; +-#ifdef NET_SKBUFF_DATA_USES_OFFSET +- skb->tail = (sk_buff_data_t)(precv_frame->u.hdr.rx_tail - +- precv_frame->u.hdr.rx_head); +-#else +- skb->tail = (sk_buff_data_t)precv_frame->u.hdr.rx_tail; +-#endif + skb->len = precv_frame->u.hdr.len; ++ skb_set_tail_pointer(skb, skb->len); + if ((pattrib->tcpchk_valid == 1) && (pattrib->tcp_chkrpt == 1)) + skb->ip_summed = CHECKSUM_UNNECESSARY; + else +diff --git a/drivers/staging/vt6656/dpc.c b/drivers/staging/vt6656/dpc.c +index c0edf97..08021f4 100644 +--- a/drivers/staging/vt6656/dpc.c ++++ b/drivers/staging/vt6656/dpc.c +@@ -200,7 +200,7 @@ s_vProcessRxMACHeader ( + } else if (!compare_ether_addr(pbyRxBuffer, &pDevice->abySNAP_RFC1042[0])) { + cbHeaderSize += 6; + pwType = (PWORD) (pbyRxBufferAddr + cbHeaderSize); +- if ((*pwType == cpu_to_le16(ETH_P_IPX)) || ++ if ((*pwType == cpu_to_be16(ETH_P_IPX)) || + (*pwType == cpu_to_le16(0xF380))) { + cbHeaderSize -= 8; + pwType = (PWORD) (pbyRxBufferAddr + cbHeaderSize); +diff --git a/drivers/staging/vt6656/rxtx.c b/drivers/staging/vt6656/rxtx.c +index 9b64b10..fe21868 100644 +--- a/drivers/staging/vt6656/rxtx.c ++++ b/drivers/staging/vt6656/rxtx.c +@@ -1701,7 +1701,7 @@ s_bPacketToWirelessUsb( + // 802.1H + if (ntohs(psEthHeader->wType) > ETH_DATA_LEN) { + if (pDevice->dwDiagRefCount == 0) { +- if ((psEthHeader->wType == cpu_to_le16(ETH_P_IPX)) || ++ if ((psEthHeader->wType == cpu_to_be16(ETH_P_IPX)) || + (psEthHeader->wType == cpu_to_le16(0xF380))) { + memcpy((PBYTE) (pbyPayloadHead), + abySNAP_Bridgetunnel, 6); +@@ -2840,10 +2840,10 @@ int nsDMA_tx_packet(PSDevice pDevice, unsigned int uDMAIdx, struct sk_buff *skb) + Packet_Type = skb->data[ETH_HLEN+1]; + Descriptor_type = skb->data[ETH_HLEN+1+1+2]; + Key_info = (skb->data[ETH_HLEN+1+1+2+1] << 8)|(skb->data[ETH_HLEN+1+1+2+2]); +- if (pDevice->sTxEthHeader.wType == cpu_to_le16(ETH_P_PAE)) { +- /* 802.1x OR eapol-key challenge frame transfer */ +- if (((Protocol_Version == 1) || (Protocol_Version == 2)) && +- (Packet_Type == 3)) { ++ if (pDevice->sTxEthHeader.wType == cpu_to_be16(ETH_P_PAE)) { ++ /* 802.1x OR eapol-key challenge frame transfer */ ++ if (((Protocol_Version == 1) || (Protocol_Version == 2)) && ++ (Packet_Type == 3)) { + bTxeapol_key = TRUE; + if(!(Key_info & BIT3) && //WPA or RSN group-key challenge + (Key_info & BIT8) && (Key_info & BIT9)) 
{ //send 2/2 key +@@ -2989,19 +2989,19 @@ int nsDMA_tx_packet(PSDevice pDevice, unsigned int uDMAIdx, struct sk_buff *skb) + } + } + +- if (pDevice->sTxEthHeader.wType == cpu_to_le16(ETH_P_PAE)) { +- if (pDevice->byBBType != BB_TYPE_11A) { +- pDevice->wCurrentRate = RATE_1M; +- pDevice->byACKRate = RATE_1M; +- pDevice->byTopCCKBasicRate = RATE_1M; +- pDevice->byTopOFDMBasicRate = RATE_6M; +- } else { +- pDevice->wCurrentRate = RATE_6M; +- pDevice->byACKRate = RATE_6M; +- pDevice->byTopCCKBasicRate = RATE_1M; +- pDevice->byTopOFDMBasicRate = RATE_6M; +- } +- } ++ if (pDevice->sTxEthHeader.wType == cpu_to_be16(ETH_P_PAE)) { ++ if (pDevice->byBBType != BB_TYPE_11A) { ++ pDevice->wCurrentRate = RATE_1M; ++ pDevice->byACKRate = RATE_1M; ++ pDevice->byTopCCKBasicRate = RATE_1M; ++ pDevice->byTopOFDMBasicRate = RATE_6M; ++ } else { ++ pDevice->wCurrentRate = RATE_6M; ++ pDevice->byACKRate = RATE_6M; ++ pDevice->byTopCCKBasicRate = RATE_1M; ++ pDevice->byTopOFDMBasicRate = RATE_6M; ++ } ++ } + + DBG_PRT(MSG_LEVEL_DEBUG, + KERN_INFO "dma_tx: pDevice->wCurrentRate = %d\n", +@@ -3017,7 +3017,7 @@ int nsDMA_tx_packet(PSDevice pDevice, unsigned int uDMAIdx, struct sk_buff *skb) + + if (bNeedEncryption == TRUE) { + DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO"ntohs Pkt Type=%04x\n", ntohs(pDevice->sTxEthHeader.wType)); +- if ((pDevice->sTxEthHeader.wType) == cpu_to_le16(ETH_P_PAE)) { ++ if ((pDevice->sTxEthHeader.wType) == cpu_to_be16(ETH_P_PAE)) { + bNeedEncryption = FALSE; + DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO"Pkt Type=%04x\n", (pDevice->sTxEthHeader.wType)); + if ((pMgmt->eCurrMode == WMAC_MODE_ESS_STA) && (pMgmt->eCurrState == WMAC_STATE_ASSOC)) { +diff --git a/drivers/staging/zcache/zcache-main.c b/drivers/staging/zcache/zcache-main.c +index 16ad9fe..4306475 100644 +--- a/drivers/staging/zcache/zcache-main.c ++++ b/drivers/staging/zcache/zcache-main.c +@@ -1223,13 +1223,12 @@ static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw, + void *pampd, struct tmem_pool *pool, + struct tmem_oid *oid, uint32_t index) + { +- int ret = 0; +- + BUG_ON(!is_ephemeral(pool)); +- zbud_decompress((struct page *)(data), pampd); ++ if (zbud_decompress((struct page *)(data), pampd) < 0) ++ return -EINVAL; + zbud_free_and_delist((struct zbud_hdr *)pampd); + atomic_dec(&zcache_curr_eph_pampd_count); +- return ret; ++ return 0; + } + + /* +diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c +index 163fc90..8e68f79 100644 +--- a/drivers/tty/serial/imx.c ++++ b/drivers/tty/serial/imx.c +@@ -130,6 +130,7 @@ + #define UCR4_OREN (1<<1) /* Receiver overrun interrupt enable */ + #define UCR4_DREN (1<<0) /* Recv data ready interrupt enable */ + #define UFCR_RXTL_SHF 0 /* Receiver trigger level shift */ ++#define UFCR_DCEDTE (1<<6) /* DCE/DTE mode select */ + #define UFCR_RFDIV (7<<7) /* Reference freq divider mask */ + #define UFCR_RFDIV_REG(x) (((x) < 7 ? 6 - (x) : 6) << 7) + #define UFCR_TXTL_SHF 10 /* Transmitter trigger level shift */ +@@ -635,22 +636,11 @@ static void imx_break_ctl(struct uart_port *port, int break_state) + static int imx_setup_ufcr(struct imx_port *sport, unsigned int mode) + { + unsigned int val; +- unsigned int ufcr_rfdiv; +- +- /* set receiver / transmitter trigger level. 
+- * RFDIV is set such way to satisfy requested uartclk value +- */ +- val = TXTL << 10 | RXTL; +- ufcr_rfdiv = (clk_get_rate(sport->clk) + sport->port.uartclk / 2) +- / sport->port.uartclk; +- +- if(!ufcr_rfdiv) +- ufcr_rfdiv = 1; +- +- val |= UFCR_RFDIV_REG(ufcr_rfdiv); + ++ /* set receiver / transmitter trigger level */ ++ val = readl(sport->port.membase + UFCR) & (UFCR_RFDIV | UFCR_DCEDTE); ++ val |= TXTL << UFCR_TXTL_SHF | RXTL; + writel(val, sport->port.membase + UFCR); +- + return 0; + } + +@@ -725,6 +715,7 @@ static int imx_startup(struct uart_port *port) + } + } + ++ spin_lock_irqsave(&sport->port.lock, flags); + /* + * Finally, clear and enable interrupts + */ +@@ -778,7 +769,6 @@ static int imx_startup(struct uart_port *port) + /* + * Enable modem status interrupts + */ +- spin_lock_irqsave(&sport->port.lock,flags); + imx_enable_ms(&sport->port); + spin_unlock_irqrestore(&sport->port.lock,flags); + +@@ -808,10 +798,13 @@ static void imx_shutdown(struct uart_port *port) + { + struct imx_port *sport = (struct imx_port *)port; + unsigned long temp; ++ unsigned long flags; + ++ spin_lock_irqsave(&sport->port.lock, flags); + temp = readl(sport->port.membase + UCR2); + temp &= ~(UCR2_TXEN); + writel(temp, sport->port.membase + UCR2); ++ spin_unlock_irqrestore(&sport->port.lock, flags); + + if (USE_IRDA(sport)) { + struct imxuart_platform_data *pdata; +@@ -840,12 +833,14 @@ static void imx_shutdown(struct uart_port *port) + * Disable all interrupts, port and break condition. + */ + ++ spin_lock_irqsave(&sport->port.lock, flags); + temp = readl(sport->port.membase + UCR1); + temp &= ~(UCR1_TXMPTYEN | UCR1_RRDYEN | UCR1_RTSDEN | UCR1_UARTEN); + if (USE_IRDA(sport)) + temp &= ~(UCR1_IREN); + + writel(temp, sport->port.membase + UCR1); ++ spin_unlock_irqrestore(&sport->port.lock, flags); + } + + static void +@@ -1119,6 +1114,9 @@ imx_console_write(struct console *co, const char *s, unsigned int count) + { + struct imx_port *sport = imx_ports[co->index]; + unsigned int old_ucr1, old_ucr2, ucr1; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&sport->port.lock, flags); + + /* + * First, save UCR1/2 and then disable interrupts +@@ -1145,6 +1143,8 @@ imx_console_write(struct console *co, const char *s, unsigned int count) + + writel(old_ucr1, sport->port.membase + UCR1); + writel(old_ucr2, sport->port.membase + UCR2); ++ ++ spin_unlock_irqrestore(&sport->port.lock, flags); + } + + /* +diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c +index 32d3adc..8b2a9d8 100644 +--- a/drivers/usb/core/quirks.c ++++ b/drivers/usb/core/quirks.c +@@ -96,6 +96,10 @@ static const struct usb_device_id usb_quirk_list[] = { + { USB_DEVICE(0x04b4, 0x0526), .driver_info = + USB_QUIRK_CONFIG_INTF_STRINGS }, + ++ /* Microchip Joss Optical infrared touchboard device */ ++ { USB_DEVICE(0x04d8, 0x000c), .driver_info = ++ USB_QUIRK_CONFIG_INTF_STRINGS }, ++ + /* Samsung Android phone modem - ID conflict with SPH-I500 */ + { USB_DEVICE(0x04e8, 0x6601), .driver_info = + USB_QUIRK_CONFIG_INTF_STRINGS }, +diff --git a/drivers/usb/host/ehci-q.c b/drivers/usb/host/ehci-q.c +index fef1db3..2023733 100644 +--- a/drivers/usb/host/ehci-q.c ++++ b/drivers/usb/host/ehci-q.c +@@ -128,9 +128,17 @@ qh_refresh (struct ehci_hcd *ehci, struct ehci_qh *qh) + else { + qtd = list_entry (qh->qtd_list.next, + struct ehci_qtd, qtd_list); +- /* first qtd may already be partially processed */ +- if (cpu_to_hc32(ehci, qtd->qtd_dma) == qh->hw->hw_current) ++ /* ++ * first qtd may already be partially processed. 
++ * If we come here during unlink, the QH overlay region ++ * might have reference to the just unlinked qtd. The ++ * qtd is updated in qh_completions(). Update the QH ++ * overlay here. ++ */ ++ if (cpu_to_hc32(ehci, qtd->qtd_dma) == qh->hw->hw_current) { ++ qh->hw->hw_qtd_next = qtd->hw_next; + qtd = NULL; ++ } + } + + if (qtd) +diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c +index 833b3c6..d0ec2f0 100644 +--- a/drivers/usb/host/pci-quirks.c ++++ b/drivers/usb/host/pci-quirks.c +@@ -75,7 +75,9 @@ + #define NB_PIF0_PWRDOWN_1 0x01100013 + + #define USB_INTEL_XUSB2PR 0xD0 ++#define USB_INTEL_USB2PRM 0xD4 + #define USB_INTEL_USB3_PSSEN 0xD8 ++#define USB_INTEL_USB3PRM 0xDC + + static struct amd_chipset_info { + struct pci_dev *nb_dev; +@@ -772,10 +774,18 @@ void usb_enable_xhci_ports(struct pci_dev *xhci_pdev) + return; + } + +- ports_available = 0xffffffff; ++ /* Read USB3PRM, the USB 3.0 Port Routing Mask Register ++ * Indicate the ports that can be changed from OS. ++ */ ++ pci_read_config_dword(xhci_pdev, USB_INTEL_USB3PRM, ++ &ports_available); ++ ++ dev_dbg(&xhci_pdev->dev, "Configurable ports to enable SuperSpeed: 0x%x\n", ++ ports_available); ++ + /* Write USB3_PSSEN, the USB 3.0 Port SuperSpeed Enable +- * Register, to turn on SuperSpeed terminations for all +- * available ports. ++ * Register, to turn on SuperSpeed terminations for the ++ * switchable ports. + */ + pci_write_config_dword(xhci_pdev, USB_INTEL_USB3_PSSEN, + cpu_to_le32(ports_available)); +@@ -785,7 +795,16 @@ void usb_enable_xhci_ports(struct pci_dev *xhci_pdev) + dev_dbg(&xhci_pdev->dev, "USB 3.0 ports that are now enabled " + "under xHCI: 0x%x\n", ports_available); + +- ports_available = 0xffffffff; ++ /* Read XUSB2PRM, xHCI USB 2.0 Port Routing Mask Register ++ * Indicate the USB 2.0 ports to be controlled by the xHCI host. ++ */ ++ ++ pci_read_config_dword(xhci_pdev, USB_INTEL_USB2PRM, ++ &ports_available); ++ ++ dev_dbg(&xhci_pdev->dev, "Configurable USB 2.0 ports to hand over to xCHI: 0x%x\n", ++ ports_available); ++ + /* Write XUSB2PR, the xHC USB 2.0 Port Routing Register, to + * switch the USB 2.0 power and data lines over to the xHCI + * host. +@@ -800,6 +819,13 @@ void usb_enable_xhci_ports(struct pci_dev *xhci_pdev) + } + EXPORT_SYMBOL_GPL(usb_enable_xhci_ports); + ++void usb_disable_xhci_ports(struct pci_dev *xhci_pdev) ++{ ++ pci_write_config_dword(xhci_pdev, USB_INTEL_USB3_PSSEN, 0x0); ++ pci_write_config_dword(xhci_pdev, USB_INTEL_XUSB2PR, 0x0); ++} ++EXPORT_SYMBOL_GPL(usb_disable_xhci_ports); ++ + /** + * PCI Quirks for xHCI. 
+ * +@@ -815,12 +841,12 @@ static void __devinit quirk_usb_handoff_xhci(struct pci_dev *pdev) + void __iomem *op_reg_base; + u32 val; + int timeout; ++ int len = pci_resource_len(pdev, 0); + + if (!mmio_resource_enabled(pdev, 0)) + return; + +- base = ioremap_nocache(pci_resource_start(pdev, 0), +- pci_resource_len(pdev, 0)); ++ base = ioremap_nocache(pci_resource_start(pdev, 0), len); + if (base == NULL) + return; + +@@ -830,9 +856,17 @@ static void __devinit quirk_usb_handoff_xhci(struct pci_dev *pdev) + */ + ext_cap_offset = xhci_find_next_cap_offset(base, XHCI_HCC_PARAMS_OFFSET); + do { ++ if ((ext_cap_offset + sizeof(val)) > len) { ++ /* We're reading garbage from the controller */ ++ dev_warn(&pdev->dev, ++ "xHCI controller failing to respond"); ++ return; ++ } ++ + if (!ext_cap_offset) + /* We've reached the end of the extended capabilities */ + goto hc_init; ++ + val = readl(base + ext_cap_offset); + if (XHCI_EXT_CAPS_ID(val) == XHCI_EXT_CAPS_LEGACY) + break; +@@ -863,9 +897,10 @@ static void __devinit quirk_usb_handoff_xhci(struct pci_dev *pdev) + /* Disable any BIOS SMIs and clear all SMI events*/ + writel(val, base + ext_cap_offset + XHCI_LEGACY_CONTROL_OFFSET); + ++hc_init: + if (usb_is_intel_switchable_xhci(pdev)) + usb_enable_xhci_ports(pdev); +-hc_init: ++ + op_reg_base = base + XHCI_HC_LENGTH(readl(base)); + + /* Wait for the host controller to be ready before writing any +diff --git a/drivers/usb/host/pci-quirks.h b/drivers/usb/host/pci-quirks.h +index b1002a8..7f69a39 100644 +--- a/drivers/usb/host/pci-quirks.h ++++ b/drivers/usb/host/pci-quirks.h +@@ -10,10 +10,12 @@ void usb_amd_quirk_pll_disable(void); + void usb_amd_quirk_pll_enable(void); + bool usb_is_intel_switchable_xhci(struct pci_dev *pdev); + void usb_enable_xhci_ports(struct pci_dev *xhci_pdev); ++void usb_disable_xhci_ports(struct pci_dev *xhci_pdev); + #else + static inline void usb_amd_quirk_pll_disable(void) {} + static inline void usb_amd_quirk_pll_enable(void) {} + static inline void usb_amd_dev_put(void) {} ++static inline void usb_disable_xhci_ports(struct pci_dev *xhci_pdev) {} + #endif /* CONFIG_PCI */ + + #endif /* __LINUX_USB_PCI_QUIRKS_H */ +diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c +index fd8a2c2..978860b 100644 +--- a/drivers/usb/host/xhci-hub.c ++++ b/drivers/usb/host/xhci-hub.c +@@ -469,11 +469,48 @@ static void xhci_hub_report_link_state(u32 *status, u32 status_reg) + * when this bit is set. + */ + pls |= USB_PORT_STAT_CONNECTION; ++ } else { ++ /* ++ * If CAS bit isn't set but the Port is already at ++ * Compliance Mode, fake a connection so the USB core ++ * notices the Compliance state and resets the port. ++ * This resolves an issue generated by the SN65LVPE502CP ++ * in which sometimes the port enters compliance mode ++ * caused by a delay on the host-device negotiation. ++ */ ++ if (pls == USB_SS_PORT_LS_COMP_MOD) ++ pls |= USB_PORT_STAT_CONNECTION; + } ++ + /* update status field */ + *status |= pls; + } + ++/* ++ * Function for Compliance Mode Quirk. ++ * ++ * This Function verifies if all xhc USB3 ports have entered U0, if so, ++ * the compliance mode timer is deleted. A port won't enter ++ * compliance mode if it has previously entered U0. 
++ */ ++void xhci_del_comp_mod_timer(struct xhci_hcd *xhci, u32 status, u16 wIndex) ++{ ++ u32 all_ports_seen_u0 = ((1 << xhci->num_usb3_ports)-1); ++ bool port_in_u0 = ((status & PORT_PLS_MASK) == XDEV_U0); ++ ++ if (!(xhci->quirks & XHCI_COMP_MODE_QUIRK)) ++ return; ++ ++ if ((xhci->port_status_u0 != all_ports_seen_u0) && port_in_u0) { ++ xhci->port_status_u0 |= 1 << wIndex; ++ if (xhci->port_status_u0 == all_ports_seen_u0) { ++ del_timer_sync(&xhci->comp_mode_recovery_timer); ++ xhci_dbg(xhci, "All USB3 ports have entered U0 already!\n"); ++ xhci_dbg(xhci, "Compliance Mode Recovery Timer Deleted.\n"); ++ } ++ } ++} ++ + int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, + u16 wIndex, char *buf, u16 wLength) + { +@@ -618,6 +655,11 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, + /* Update Port Link State for super speed ports*/ + if (hcd->speed == HCD_USB3) { + xhci_hub_report_link_state(&status, temp); ++ /* ++ * Verify if all USB3 Ports Have entered U0 already. ++ * Delete Compliance Mode Timer if so. ++ */ ++ xhci_del_comp_mod_timer(xhci, temp, wIndex); + } + if (bus_state->port_c_suspend & (1 << wIndex)) + status |= 1 << USB_PORT_FEAT_C_SUSPEND; +diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c +index 07c72a4..bddcbfc 100644 +--- a/drivers/usb/host/xhci-pci.c ++++ b/drivers/usb/host/xhci-pci.c +@@ -90,6 +90,15 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) + xhci->quirks |= XHCI_EP_LIMIT_QUIRK; + xhci->limit_active_eps = 64; + xhci->quirks |= XHCI_SW_BW_CHECKING; ++ /* ++ * PPT desktop boards DH77EB and DH77DF will power back on after ++ * a few seconds of being shutdown. The fix for this is to ++ * switch the ports from xHCI to EHCI on shutdown. We can't use ++ * DMI information to find those particular boards (since each ++ * vendor will change the board name), so we have to key off all ++ * PPT chipsets. ++ */ ++ xhci->quirks |= XHCI_SPURIOUS_REBOOT; + } + if (pdev->vendor == PCI_VENDOR_ID_ETRON && + pdev->device == PCI_DEVICE_ID_ASROCK_P67) { +diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c +index fb0981e..c7c530c 100644 +--- a/drivers/usb/host/xhci-ring.c ++++ b/drivers/usb/host/xhci-ring.c +@@ -145,25 +145,34 @@ static void next_trb(struct xhci_hcd *xhci, + */ + static void inc_deq(struct xhci_hcd *xhci, struct xhci_ring *ring, bool consumer) + { +- union xhci_trb *next = ++(ring->dequeue); + unsigned long long addr; + + ring->deq_updates++; +- /* Update the dequeue pointer further if that was a link TRB or we're at +- * the end of an event ring segment (which doesn't have link TRBS) +- */ +- while (last_trb(xhci, ring, ring->deq_seg, next)) { +- if (consumer && last_trb_on_last_seg(xhci, ring, ring->deq_seg, next)) { +- ring->cycle_state = (ring->cycle_state ? 0 : 1); +- if (!in_interrupt()) +- xhci_dbg(xhci, "Toggle cycle state for ring %p = %i\n", +- ring, +- (unsigned int) ring->cycle_state); ++ ++ do { ++ /* ++ * Update the dequeue pointer further if that was a link TRB or ++ * we're at the end of an event ring segment (which doesn't have ++ * link TRBS) ++ */ ++ if (last_trb(xhci, ring, ring->deq_seg, ring->dequeue)) { ++ if (consumer && last_trb_on_last_seg(xhci, ring, ++ ring->deq_seg, ring->dequeue)) { ++ if (!in_interrupt()) ++ xhci_dbg(xhci, "Toggle cycle state " ++ "for ring %p = %i\n", ++ ring, ++ (unsigned int) ++ ring->cycle_state); ++ ring->cycle_state = (ring->cycle_state ? 
0 : 1); ++ } ++ ring->deq_seg = ring->deq_seg->next; ++ ring->dequeue = ring->deq_seg->trbs; ++ } else { ++ ring->dequeue++; + } +- ring->deq_seg = ring->deq_seg->next; +- ring->dequeue = ring->deq_seg->trbs; +- next = ring->dequeue; +- } ++ } while (last_trb(xhci, ring, ring->deq_seg, ring->dequeue)); ++ + addr = (unsigned long long) xhci_trb_virt_to_dma(ring->deq_seg, ring->dequeue); + } + +diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c +index f7c0a2a..09872ee 100644 +--- a/drivers/usb/host/xhci.c ++++ b/drivers/usb/host/xhci.c +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + + #include "xhci.h" + +@@ -387,6 +388,95 @@ static void xhci_msix_sync_irqs(struct xhci_hcd *xhci) + + #endif + ++static void compliance_mode_recovery(unsigned long arg) ++{ ++ struct xhci_hcd *xhci; ++ struct usb_hcd *hcd; ++ u32 temp; ++ int i; ++ ++ xhci = (struct xhci_hcd *)arg; ++ ++ for (i = 0; i < xhci->num_usb3_ports; i++) { ++ temp = xhci_readl(xhci, xhci->usb3_ports[i]); ++ if ((temp & PORT_PLS_MASK) == USB_SS_PORT_LS_COMP_MOD) { ++ /* ++ * Compliance Mode Detected. Letting USB Core ++ * handle the Warm Reset ++ */ ++ xhci_dbg(xhci, "Compliance Mode Detected->Port %d!\n", ++ i + 1); ++ xhci_dbg(xhci, "Attempting Recovery routine!\n"); ++ hcd = xhci->shared_hcd; ++ ++ if (hcd->state == HC_STATE_SUSPENDED) ++ usb_hcd_resume_root_hub(hcd); ++ ++ usb_hcd_poll_rh_status(hcd); ++ } ++ } ++ ++ if (xhci->port_status_u0 != ((1 << xhci->num_usb3_ports)-1)) ++ mod_timer(&xhci->comp_mode_recovery_timer, ++ jiffies + msecs_to_jiffies(COMP_MODE_RCVRY_MSECS)); ++} ++ ++/* ++ * Quirk to work around issue generated by the SN65LVPE502CP USB3.0 re-driver ++ * that causes ports behind that hardware to enter compliance mode sometimes. ++ * The quirk creates a timer that polls every 2 seconds the link state of ++ * each host controller's port and recovers it by issuing a Warm reset ++ * if Compliance mode is detected, otherwise the port will become "dead" (no ++ * device connections or disconnections will be detected anymore). Becasue no ++ * status event is generated when entering compliance mode (per xhci spec), ++ * this quirk is needed on systems that have the failing hardware installed. ++ */ ++static void compliance_mode_recovery_timer_init(struct xhci_hcd *xhci) ++{ ++ xhci->port_status_u0 = 0; ++ init_timer(&xhci->comp_mode_recovery_timer); ++ ++ xhci->comp_mode_recovery_timer.data = (unsigned long) xhci; ++ xhci->comp_mode_recovery_timer.function = compliance_mode_recovery; ++ xhci->comp_mode_recovery_timer.expires = jiffies + ++ msecs_to_jiffies(COMP_MODE_RCVRY_MSECS); ++ ++ set_timer_slack(&xhci->comp_mode_recovery_timer, ++ msecs_to_jiffies(COMP_MODE_RCVRY_MSECS)); ++ add_timer(&xhci->comp_mode_recovery_timer); ++ xhci_dbg(xhci, "Compliance Mode Recovery Timer Initialized.\n"); ++} ++ ++/* ++ * This function identifies the systems that have installed the SN65LVPE502CP ++ * USB3.0 re-driver and that need the Compliance Mode Quirk. 
++ * Systems: ++ * Vendor: Hewlett-Packard -> System Models: Z420, Z620 and Z820 ++ */ ++static bool compliance_mode_recovery_timer_quirk_check(void) ++{ ++ const char *dmi_product_name, *dmi_sys_vendor; ++ ++ dmi_product_name = dmi_get_system_info(DMI_PRODUCT_NAME); ++ dmi_sys_vendor = dmi_get_system_info(DMI_SYS_VENDOR); ++ ++ if (!(strstr(dmi_sys_vendor, "Hewlett-Packard"))) ++ return false; ++ ++ if (strstr(dmi_product_name, "Z420") || ++ strstr(dmi_product_name, "Z620") || ++ strstr(dmi_product_name, "Z820")) ++ return true; ++ ++ return false; ++} ++ ++static int xhci_all_ports_seen_u0(struct xhci_hcd *xhci) ++{ ++ return (xhci->port_status_u0 == ((1 << xhci->num_usb3_ports)-1)); ++} ++ ++ + /* + * Initialize memory for HCD and xHC (one-time init). + * +@@ -410,6 +500,12 @@ int xhci_init(struct usb_hcd *hcd) + retval = xhci_mem_init(xhci, GFP_KERNEL); + xhci_dbg(xhci, "Finished xhci_init\n"); + ++ /* Initializing Compliance Mode Recovery Data If Needed */ ++ if (compliance_mode_recovery_timer_quirk_check()) { ++ xhci->quirks |= XHCI_COMP_MODE_QUIRK; ++ compliance_mode_recovery_timer_init(xhci); ++ } ++ + return retval; + } + +@@ -618,6 +714,11 @@ void xhci_stop(struct usb_hcd *hcd) + del_timer_sync(&xhci->event_ring_timer); + #endif + ++ /* Deleting Compliance Mode Recovery Timer */ ++ if ((xhci->quirks & XHCI_COMP_MODE_QUIRK) && ++ (!(xhci_all_ports_seen_u0(xhci)))) ++ del_timer_sync(&xhci->comp_mode_recovery_timer); ++ + if (xhci->quirks & XHCI_AMD_PLL_FIX) + usb_amd_dev_put(); + +@@ -648,6 +749,9 @@ void xhci_shutdown(struct usb_hcd *hcd) + { + struct xhci_hcd *xhci = hcd_to_xhci(hcd); + ++ if (xhci->quirks & XHCI_SPURIOUS_REBOOT) ++ usb_disable_xhci_ports(to_pci_dev(hcd->self.controller)); ++ + spin_lock_irq(&xhci->lock); + xhci_halt(xhci); + spin_unlock_irq(&xhci->lock); +@@ -791,6 +895,16 @@ int xhci_suspend(struct xhci_hcd *xhci) + } + spin_unlock_irq(&xhci->lock); + ++ /* ++ * Deleting Compliance Mode Recovery Timer because the xHCI Host ++ * is about to be suspended. ++ */ ++ if ((xhci->quirks & XHCI_COMP_MODE_QUIRK) && ++ (!(xhci_all_ports_seen_u0(xhci)))) { ++ del_timer_sync(&xhci->comp_mode_recovery_timer); ++ xhci_dbg(xhci, "Compliance Mode Recovery Timer Deleted!\n"); ++ } ++ + /* step 5: remove core well power */ + /* synchronize irq when using MSI-X */ + xhci_msix_sync_irqs(xhci); +@@ -923,6 +1037,16 @@ int xhci_resume(struct xhci_hcd *xhci, bool hibernated) + usb_hcd_resume_root_hub(hcd); + usb_hcd_resume_root_hub(xhci->shared_hcd); + } ++ ++ /* ++ * If system is subject to the Quirk, Compliance Mode Timer needs to ++ * be re-initialized Always after a system resume. Ports are subject ++ * to suffer the Compliance Mode issue again. It doesn't matter if ++ * ports have entered previously to U0 before system's suspension. 
++ */ ++ if (xhci->quirks & XHCI_COMP_MODE_QUIRK) ++ compliance_mode_recovery_timer_init(xhci); ++ + return retval; + } + #endif /* CONFIG_PM */ +diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h +index 7a56805..44d518a 100644 +--- a/drivers/usb/host/xhci.h ++++ b/drivers/usb/host/xhci.h +@@ -1471,6 +1471,8 @@ struct xhci_hcd { + #define XHCI_SW_BW_CHECKING (1 << 8) + #define XHCI_AMD_0x96_HOST (1 << 9) + #define XHCI_TRUST_TX_LENGTH (1 << 10) ++#define XHCI_SPURIOUS_REBOOT (1 << 13) ++#define XHCI_COMP_MODE_QUIRK (1 << 14) + unsigned int num_active_eps; + unsigned int limit_active_eps; + /* There are two roothubs to keep track of bus suspend info for */ +@@ -1487,6 +1489,11 @@ struct xhci_hcd { + unsigned sw_lpm_support:1; + /* support xHCI 1.0 spec USB2 hardware LPM */ + unsigned hw_lpm_support:1; ++ /* Compliance Mode Recovery Data */ ++ struct timer_list comp_mode_recovery_timer; ++ u32 port_status_u0; ++/* Compliance Mode Timer Triggered every 2 seconds */ ++#define COMP_MODE_RCVRY_MSECS 2000 + }; + + /* convert between an HCD pointer and the corresponding EHCI_HCD */ +diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c +index b3182bb..7324bea 100644 +--- a/drivers/usb/serial/ftdi_sio.c ++++ b/drivers/usb/serial/ftdi_sio.c +@@ -704,6 +704,7 @@ static struct usb_device_id id_table_combined [] = { + { USB_DEVICE(FTDI_VID, FTDI_PCDJ_DAC2_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_RRCIRKITS_LOCOBUFFER_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_ASK_RDR400_PID) }, ++ { USB_DEVICE(FTDI_VID, FTDI_NZR_SEM_USB_PID) }, + { USB_DEVICE(ICOM_VID, ICOM_ID_1_PID) }, + { USB_DEVICE(ICOM_VID, ICOM_OPC_U_UC_PID) }, + { USB_DEVICE(ICOM_VID, ICOM_ID_RP2C1_PID) }, +@@ -804,13 +805,32 @@ static struct usb_device_id id_table_combined [] = { + .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, + { USB_DEVICE(ADI_VID, ADI_GNICEPLUS_PID), + .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, +- { USB_DEVICE(MICROCHIP_VID, MICROCHIP_USB_BOARD_PID) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(MICROCHIP_VID, MICROCHIP_USB_BOARD_PID, ++ USB_CLASS_VENDOR_SPEC, ++ USB_SUBCLASS_VENDOR_SPEC, 0x00) }, + { USB_DEVICE(JETI_VID, JETI_SPC1201_PID) }, + { USB_DEVICE(MARVELL_VID, MARVELL_SHEEVAPLUG_PID), + .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, + { USB_DEVICE(LARSENBRUSGAARD_VID, LB_ALTITRACK_PID) }, + { USB_DEVICE(GN_OTOMETRICS_VID, AURICAL_USB_PID) }, ++ { USB_DEVICE(FTDI_VID, PI_C865_PID) }, ++ { USB_DEVICE(FTDI_VID, PI_C857_PID) }, ++ { USB_DEVICE(PI_VID, PI_C866_PID) }, ++ { USB_DEVICE(PI_VID, PI_C663_PID) }, ++ { USB_DEVICE(PI_VID, PI_C725_PID) }, ++ { USB_DEVICE(PI_VID, PI_E517_PID) }, ++ { USB_DEVICE(PI_VID, PI_C863_PID) }, + { USB_DEVICE(PI_VID, PI_E861_PID) }, ++ { USB_DEVICE(PI_VID, PI_C867_PID) }, ++ { USB_DEVICE(PI_VID, PI_E609_PID) }, ++ { USB_DEVICE(PI_VID, PI_E709_PID) }, ++ { USB_DEVICE(PI_VID, PI_100F_PID) }, ++ { USB_DEVICE(PI_VID, PI_1011_PID) }, ++ { USB_DEVICE(PI_VID, PI_1012_PID) }, ++ { USB_DEVICE(PI_VID, PI_1013_PID) }, ++ { USB_DEVICE(PI_VID, PI_1014_PID) }, ++ { USB_DEVICE(PI_VID, PI_1015_PID) }, ++ { USB_DEVICE(PI_VID, PI_1016_PID) }, + { USB_DEVICE(KONDO_VID, KONDO_USB_SERIAL_PID) }, + { USB_DEVICE(BAYER_VID, BAYER_CONTOUR_CABLE_PID) }, + { USB_DEVICE(FTDI_VID, MARVELL_OPENRD_PID), +diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h +index 54b4258..06f6fd2 100644 +--- a/drivers/usb/serial/ftdi_sio_ids.h ++++ b/drivers/usb/serial/ftdi_sio_ids.h +@@ -75,6 +75,9 @@ + #define FTDI_OPENDCC_GATEWAY_PID 0xBFDB + #define FTDI_OPENDCC_GBM_PID 
0xBFDC + ++/* NZR SEM 16+ USB (http://www.nzr.de) */ ++#define FTDI_NZR_SEM_USB_PID 0xC1E0 /* NZR SEM-LOG16+ */ ++ + /* + * RR-CirKits LocoBuffer USB (http://www.rr-cirkits.com) + */ +@@ -539,7 +542,10 @@ + /* + * Microchip Technology, Inc. + * +- * MICROCHIP_VID (0x04D8) and MICROCHIP_USB_BOARD_PID (0x000A) are also used by: ++ * MICROCHIP_VID (0x04D8) and MICROCHIP_USB_BOARD_PID (0x000A) are ++ * used by single function CDC ACM class based firmware demo ++ * applications. The VID/PID has also been used in firmware ++ * emulating FTDI serial chips by: + * Hornby Elite - Digital Command Control Console + * http://www.hornby.com/hornby-dcc/controllers/ + */ +@@ -791,8 +797,27 @@ + * Physik Instrumente + * http://www.physikinstrumente.com/en/products/ + */ ++/* These two devices use the VID of FTDI */ ++#define PI_C865_PID 0xe0a0 /* PI C-865 Piezomotor Controller */ ++#define PI_C857_PID 0xe0a1 /* PI Encoder Trigger Box */ ++ + #define PI_VID 0x1a72 /* Vendor ID */ +-#define PI_E861_PID 0x1008 /* E-861 piezo controller USB connection */ ++#define PI_C866_PID 0x1000 /* PI C-866 Piezomotor Controller */ ++#define PI_C663_PID 0x1001 /* PI C-663 Mercury-Step */ ++#define PI_C725_PID 0x1002 /* PI C-725 Piezomotor Controller */ ++#define PI_E517_PID 0x1005 /* PI E-517 Digital Piezo Controller Operation Module */ ++#define PI_C863_PID 0x1007 /* PI C-863 */ ++#define PI_E861_PID 0x1008 /* PI E-861 Piezomotor Controller */ ++#define PI_C867_PID 0x1009 /* PI C-867 Piezomotor Controller */ ++#define PI_E609_PID 0x100D /* PI E-609 Digital Piezo Controller */ ++#define PI_E709_PID 0x100E /* PI E-709 Digital Piezo Controller */ ++#define PI_100F_PID 0x100F /* PI Digital Piezo Controller */ ++#define PI_1011_PID 0x1011 /* PI Digital Piezo Controller */ ++#define PI_1012_PID 0x1012 /* PI Motion Controller */ ++#define PI_1013_PID 0x1013 /* PI Motion Controller */ ++#define PI_1014_PID 0x1014 /* PI Device */ ++#define PI_1015_PID 0x1015 /* PI Device */ ++#define PI_1016_PID 0x1016 /* PI Digital Servo Module */ + + /* + * Kondo Kagaku Co.Ltd. 
+diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c +index 113560d..c068b4d 100644 +--- a/drivers/usb/serial/option.c ++++ b/drivers/usb/serial/option.c +@@ -1090,6 +1090,10 @@ static const struct usb_device_id option_ids[] = { + .driver_info = (kernel_ulong_t)&zte_ad3812_z_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MC2716, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&zte_mc2716_z_blacklist }, ++ { USB_VENDOR_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff, 0x02, 0x01) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff, 0x02, 0x05) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0xff, 0x86, 0x10) }, ++ + { USB_DEVICE(BENQ_VENDOR_ID, BENQ_PRODUCT_H10) }, + { USB_DEVICE(DLINK_VENDOR_ID, DLINK_PRODUCT_DWM_652) }, + { USB_DEVICE(ALINK_VENDOR_ID, DLINK_PRODUCT_DWM_652_U5) }, /* Yes, ALINK_VENDOR_ID */ +diff --git a/drivers/video/omap2/omapfb/omapfb-main.c b/drivers/video/omap2/omapfb/omapfb-main.c +index 70aa47d..f7c1753 100644 +--- a/drivers/video/omap2/omapfb/omapfb-main.c ++++ b/drivers/video/omap2/omapfb/omapfb-main.c +@@ -1183,7 +1183,7 @@ static int _setcolreg(struct fb_info *fbi, u_int regno, u_int red, u_int green, + break; + + if (regno < 16) { +- u16 pal; ++ u32 pal; + pal = ((red >> (16 - var->red.length)) << + var->red.offset) | + ((green >> (16 - var->green.length)) << +diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c +index 284798a..89588e7 100644 +--- a/drivers/xen/swiotlb-xen.c ++++ b/drivers/xen/swiotlb-xen.c +@@ -231,7 +231,7 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, + return ret; + + if (hwdev && hwdev->coherent_dma_mask) +- dma_mask = hwdev->coherent_dma_mask; ++ dma_mask = dma_alloc_coherent_mask(hwdev, flags); + + phys = virt_to_phys(ret); + dev_addr = xen_phys_to_bus(phys); +diff --git a/fs/cifs/file.c b/fs/cifs/file.c +index 0bb785f..51574d4 100644 +--- a/fs/cifs/file.c ++++ b/fs/cifs/file.c +@@ -882,7 +882,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) + if (!buf) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); +- return rc; ++ return -ENOMEM; + } + + for (i = 0; i < 2; i++) { +diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c +index af11098..7c7556b 100644 +--- a/fs/ecryptfs/inode.c ++++ b/fs/ecryptfs/inode.c +@@ -640,6 +640,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct dentry *lower_old_dir_dentry; + struct dentry *lower_new_dir_dentry; + struct dentry *trap = NULL; ++ struct inode *target_inode; + + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); + lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); +@@ -647,6 +648,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, + dget(lower_new_dentry); + lower_old_dir_dentry = dget_parent(lower_old_dentry); + lower_new_dir_dentry = dget_parent(lower_new_dentry); ++ target_inode = new_dentry->d_inode; + trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + /* source should not be ancestor of target */ + if (trap == lower_old_dentry) { +@@ -662,6 +664,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, + lower_new_dir_dentry->d_inode, lower_new_dentry); + if (rc) + goto out_lock; ++ if (target_inode) ++ fsstack_copy_attr_all(target_inode, ++ ecryptfs_inode_to_lower(target_inode)); + fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); + if (new_dir != old_dir) + fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); +diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c +index 5b3f907..71b263f 
100644 +--- a/fs/ext3/inode.c ++++ b/fs/ext3/inode.c +@@ -3072,6 +3072,8 @@ static int ext3_do_update_inode(handle_t *handle, + struct ext3_inode_info *ei = EXT3_I(inode); + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; ++ int need_datasync = 0; ++ __le32 disksize; + + again: + /* we can't allow multiple procs in here at once, its a bit racey */ +@@ -3109,7 +3111,11 @@ again: + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); +- raw_inode->i_size = cpu_to_le32(ei->i_disksize); ++ disksize = cpu_to_le32(ei->i_disksize); ++ if (disksize != raw_inode->i_size) { ++ need_datasync = 1; ++ raw_inode->i_size = disksize; ++ } + raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); +@@ -3125,8 +3131,11 @@ again: + if (!S_ISREG(inode->i_mode)) { + raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); + } else { +- raw_inode->i_size_high = +- cpu_to_le32(ei->i_disksize >> 32); ++ disksize = cpu_to_le32(ei->i_disksize >> 32); ++ if (disksize != raw_inode->i_size_high) { ++ raw_inode->i_size_high = disksize; ++ need_datasync = 1; ++ } + if (ei->i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, +@@ -3179,6 +3188,8 @@ again: + ext3_clear_inode_state(inode, EXT3_STATE_NEW); + + atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); ++ if (need_datasync) ++ atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); + out_brelse: + brelse (bh); + ext3_std_error(inode->i_sb, err); +diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c +index 2aaf3ea..5c029fb 100644 +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -1524,6 +1524,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, + req->pages[req->num_pages] = page; + req->num_pages++; + ++ offset = 0; + num -= this_num; + total_len += this_num; + index++; +diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c +index 50a15fa..b78b5b6 100644 +--- a/fs/nfs/inode.c ++++ b/fs/nfs/inode.c +@@ -150,7 +150,7 @@ static void nfs_zap_caches_locked(struct inode *inode) + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = jiffies; + +- memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); ++ memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; + else +diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c +index 5195fd6..dba87e6 100644 +--- a/fs/nfs/nfs3proc.c ++++ b/fs/nfs/nfs3proc.c +@@ -633,7 +633,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page **pages, unsigned int count, int plus) + { + struct inode *dir = dentry->d_inode; +- __be32 *verf = NFS_COOKIEVERF(dir); ++ __be32 *verf = NFS_I(dir)->cookieverf; + struct nfs3_readdirargs arg = { + .fh = NFS_FH(dir), + .cookie = cookie, +diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c +index d20221d..61796a40 100644 +--- a/fs/nfs/nfs4proc.c ++++ b/fs/nfs/nfs4proc.c +@@ -3025,11 +3025,11 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + dentry->d_parent->d_name.name, + dentry->d_name.name, + (unsigned long long)cookie); +- nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); ++ nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args); + res.pgbase = args.pgbase; + 
status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + if (status >= 0) { +- memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); ++ memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); + status += args.pgbase; + } + +diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c +index bdd5bdc..00818c8 100644 +--- a/fs/nfs/nfs4xdr.c ++++ b/fs/nfs/nfs4xdr.c +@@ -6113,7 +6113,8 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + status = decode_open(xdr, res); + if (status) + goto out; +- if (decode_getfh(xdr, &res->fh) != 0) ++ status = decode_getfh(xdr, &res->fh); ++ if (status) + goto out; + if (decode_getfattr(xdr, res->f_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) +diff --git a/fs/nfs/super.c b/fs/nfs/super.c +index 6e85ec6..e42d6f6 100644 +--- a/fs/nfs/super.c ++++ b/fs/nfs/super.c +@@ -1820,6 +1820,7 @@ static int nfs_validate_mount_data(void *options, + + memcpy(sap, &data->addr, sizeof(data->addr)); + args->nfs_server.addrlen = sizeof(data->addr); ++ args->nfs_server.port = ntohs(data->addr.sin_port); + if (!nfs_verify_server_address(sap)) + goto out_no_address; + +@@ -2538,6 +2539,7 @@ static int nfs4_validate_mount_data(void *options, + return -EFAULT; + if (!nfs_verify_server_address(sap)) + goto out_no_address; ++ args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); + + if (data->auth_flavourlen) { + if (data->auth_flavourlen > 1) +diff --git a/fs/stat.c b/fs/stat.c +index 8806b89..7b21801 100644 +--- a/fs/stat.c ++++ b/fs/stat.c +@@ -57,12 +57,13 @@ EXPORT_SYMBOL(vfs_getattr); + + int vfs_fstat(unsigned int fd, struct kstat *stat) + { +- struct file *f = fget(fd); ++ int fput_needed; ++ struct file *f = fget_raw_light(fd, &fput_needed); + int error = -EBADF; + + if (f) { + error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat); +- fput(f); ++ fput_light(f, fput_needed); + } + return error; + } +diff --git a/fs/udf/file.c b/fs/udf/file.c +index d567b84..874c9e3 100644 +--- a/fs/udf/file.c ++++ b/fs/udf/file.c +@@ -39,20 +39,24 @@ + #include "udf_i.h" + #include "udf_sb.h" + +-static int udf_adinicb_readpage(struct file *file, struct page *page) ++static void __udf_adinicb_readpage(struct page *page) + { + struct inode *inode = page->mapping->host; + char *kaddr; + struct udf_inode_info *iinfo = UDF_I(inode); + +- BUG_ON(!PageLocked(page)); +- + kaddr = kmap(page); +- memset(kaddr, 0, PAGE_CACHE_SIZE); + memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size); ++ memset(kaddr + inode->i_size, 0, PAGE_CACHE_SIZE - inode->i_size); + flush_dcache_page(page); + SetPageUptodate(page); + kunmap(page); ++} ++ ++static int udf_adinicb_readpage(struct file *file, struct page *page) ++{ ++ BUG_ON(!PageLocked(page)); ++ __udf_adinicb_readpage(page); + unlock_page(page); + + return 0; +@@ -77,6 +81,25 @@ static int udf_adinicb_writepage(struct page *page, + return 0; + } + ++static int udf_adinicb_write_begin(struct file *file, ++ struct address_space *mapping, loff_t pos, ++ unsigned len, unsigned flags, struct page **pagep, ++ void **fsdata) ++{ ++ struct page *page; ++ ++ if (WARN_ON_ONCE(pos >= PAGE_CACHE_SIZE)) ++ return -EIO; ++ page = grab_cache_page_write_begin(mapping, 0, flags); ++ if (!page) ++ return -ENOMEM; ++ *pagep = page; ++ ++ if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) ++ __udf_adinicb_readpage(page); ++ return 0; ++} ++ + static int udf_adinicb_write_end(struct file *file, + struct address_space *mapping, + loff_t 
pos, unsigned len, unsigned copied, +@@ -98,8 +121,8 @@ static int udf_adinicb_write_end(struct file *file, + const struct address_space_operations udf_adinicb_aops = { + .readpage = udf_adinicb_readpage, + .writepage = udf_adinicb_writepage, +- .write_begin = simple_write_begin, +- .write_end = udf_adinicb_write_end, ++ .write_begin = udf_adinicb_write_begin, ++ .write_end = udf_adinicb_write_end, + }; + + static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, +diff --git a/include/drm/drm_mode.h b/include/drm/drm_mode.h +index ddd46db..7639f18 100644 +--- a/include/drm/drm_mode.h ++++ b/include/drm/drm_mode.h +@@ -277,8 +277,9 @@ struct drm_mode_mode_cmd { + struct drm_mode_modeinfo mode; + }; + +-#define DRM_MODE_CURSOR_BO (1<<0) +-#define DRM_MODE_CURSOR_MOVE (1<<1) ++#define DRM_MODE_CURSOR_BO 0x01 ++#define DRM_MODE_CURSOR_MOVE 0x02 ++#define DRM_MODE_CURSOR_FLAGS 0x03 + + /* + * depending on the value in flags different members are used. +diff --git a/include/linux/kobject.h b/include/linux/kobject.h +index ad81e1c..445f978 100644 +--- a/include/linux/kobject.h ++++ b/include/linux/kobject.h +@@ -226,7 +226,7 @@ static inline int kobject_uevent_env(struct kobject *kobj, + + static inline __printf(2, 3) + int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) +-{ return 0; } ++{ return -ENOMEM; } + + static inline int kobject_action_type(const char *buf, size_t count, + enum kobject_action *type) +diff --git a/include/linux/ktime.h b/include/linux/ktime.h +index 603bec2..06177ba10 100644 +--- a/include/linux/ktime.h ++++ b/include/linux/ktime.h +@@ -58,13 +58,6 @@ union ktime { + + typedef union ktime ktime_t; /* Kill this */ + +-#define KTIME_MAX ((s64)~((u64)1 << 63)) +-#if (BITS_PER_LONG == 64) +-# define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) +-#else +-# define KTIME_SEC_MAX LONG_MAX +-#endif +- + /* + * ktime_t definitions when using the 64-bit scalar representation: + */ +diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h +index c8ef9bc..87967ee 100644 +--- a/include/linux/mmc/card.h ++++ b/include/linux/mmc/card.h +@@ -219,6 +219,7 @@ struct mmc_card { + #define MMC_QUIRK_BLK_NO_CMD23 (1<<7) /* Avoid CMD23 for regular multiblock */ + #define MMC_QUIRK_BROKEN_BYTE_MODE_512 (1<<8) /* Avoid sending 512 bytes in */ + #define MMC_QUIRK_LONG_READ_TIME (1<<9) /* Data read time > CSD says */ ++#define MMC_QUIRK_SEC_ERASE_TRIM_BROKEN (1<<10) /* Skip secure for erase/trim */ + /* byte mode */ + unsigned int poweroff_notify_state; /* eMMC4.5 notify feature */ + #define MMC_NO_POWER_NOTIFICATION 0 +diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h +index 30b0c4e..43e038a 100644 +--- a/include/linux/mv643xx_eth.h ++++ b/include/linux/mv643xx_eth.h +@@ -15,6 +15,8 @@ + #define MV643XX_ETH_SIZE_REG_4 0x2224 + #define MV643XX_ETH_BASE_ADDR_ENABLE_REG 0x2290 + ++#define MV643XX_TX_CSUM_DEFAULT_LIMIT 0 ++ + struct mv643xx_eth_shared_platform_data { + struct mbus_dram_target_info *dram; + struct platform_device *shared_smi; +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index cb52340..00ca32b 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -1299,6 +1299,8 @@ struct net_device { + /* for setting kernel sock attribute on TCP connection setup */ + #define GSO_MAX_SIZE 65536 + unsigned int gso_max_size; ++#define GSO_MAX_SEGS 65535 ++ u16 gso_max_segs; + + #ifdef CONFIG_DCB + /* Data Center Bridging netlink ops */ +@@ -1511,6 +1513,8 @@ struct packet_type { + struct 
sk_buff **(*gro_receive)(struct sk_buff **head, + struct sk_buff *skb); + int (*gro_complete)(struct sk_buff *skb); ++ bool (*id_match)(struct packet_type *ptype, ++ struct sock *sk); + void *af_packet_priv; + struct list_head list; + }; +diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h +index 92ecf55..33c52a2 100644 +--- a/include/linux/nfs_fs.h ++++ b/include/linux/nfs_fs.h +@@ -261,11 +261,6 @@ static inline const struct nfs_rpc_ops *NFS_PROTO(const struct inode *inode) + return NFS_SERVER(inode)->nfs_client->rpc_ops; + } + +-static inline __be32 *NFS_COOKIEVERF(const struct inode *inode) +-{ +- return NFS_I(inode)->cookieverf; +-} +- + static inline unsigned NFS_MINATTRTIMEO(const struct inode *inode) + { + struct nfs_server *nfss = NFS_SERVER(inode); +diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h +index 2aaee0c..67cc215 100644 +--- a/include/linux/pci_ids.h ++++ b/include/linux/pci_ids.h +@@ -2124,7 +2124,7 @@ + #define PCI_DEVICE_ID_TIGON3_5704S 0x16a8 + #define PCI_DEVICE_ID_NX2_57800_VF 0x16a9 + #define PCI_DEVICE_ID_NX2_5706S 0x16aa +-#define PCI_DEVICE_ID_NX2_57840_MF 0x16ab ++#define PCI_DEVICE_ID_NX2_57840_MF 0x16a4 + #define PCI_DEVICE_ID_NX2_5708S 0x16ac + #define PCI_DEVICE_ID_NX2_57840_VF 0x16ad + #define PCI_DEVICE_ID_NX2_57810_MF 0x16ae +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index b1f8912..b669be6 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -794,7 +794,7 @@ struct perf_event { + struct hw_perf_event hw; + + struct perf_event_context *ctx; +- struct file *filp; ++ atomic_long_t refcount; + + /* + * These accumulate total time (in nanoseconds) that children +diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h +index 15518a1..0a4cd10 100644 +--- a/include/linux/sunrpc/xprt.h ++++ b/include/linux/sunrpc/xprt.h +@@ -114,6 +114,7 @@ struct rpc_xprt_ops { + void (*set_buffer_size)(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize); + int (*reserve_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); + void (*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); ++ void (*alloc_slot)(struct rpc_xprt *xprt, struct rpc_task *task); + void (*rpcbind)(struct rpc_task *task); + void (*set_port)(struct rpc_xprt *xprt, unsigned short port); + void (*connect)(struct rpc_task *task); +@@ -274,6 +275,8 @@ void xprt_connect(struct rpc_task *task); + void xprt_reserve(struct rpc_task *task); + int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task); + int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task); ++void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task); ++void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task); + int xprt_prepare_transmit(struct rpc_task *task); + void xprt_transmit(struct rpc_task *task); + void xprt_end_transmit(struct rpc_task *task); +diff --git a/include/linux/time.h b/include/linux/time.h +index b306178..8c0216e 100644 +--- a/include/linux/time.h ++++ b/include/linux/time.h +@@ -107,11 +107,36 @@ static inline struct timespec timespec_sub(struct timespec lhs, + return ts_delta; + } + ++#define KTIME_MAX ((s64)~((u64)1 << 63)) ++#if (BITS_PER_LONG == 64) ++# define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) ++#else ++# define KTIME_SEC_MAX LONG_MAX ++#endif ++ + /* + * Returns true if the timespec is norm, false if denorm: + */ +-#define timespec_valid(ts) \ +- (((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC)) ++static inline bool timespec_valid(const 
struct timespec *ts) ++{ ++ /* Dates before 1970 are bogus */ ++ if (ts->tv_sec < 0) ++ return false; ++ /* Can't have more nanoseconds then a second */ ++ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) ++ return false; ++ return true; ++} ++ ++static inline bool timespec_valid_strict(const struct timespec *ts) ++{ ++ if (!timespec_valid(ts)) ++ return false; ++ /* Disallow values that could overflow ktime_t */ ++ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) ++ return false; ++ return true; ++} + + extern void read_persistent_clock(struct timespec *ts); + extern void read_boot_clock(struct timespec *ts); +diff --git a/include/net/scm.h b/include/net/scm.h +index d456f4c..0c0017c 100644 +--- a/include/net/scm.h ++++ b/include/net/scm.h +@@ -71,9 +71,11 @@ static __inline__ void scm_destroy(struct scm_cookie *scm) + } + + static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, +- struct scm_cookie *scm) ++ struct scm_cookie *scm, bool forcecreds) + { + memset(scm, 0, sizeof(*scm)); ++ if (forcecreds) ++ scm_set_cred(scm, task_tgid(current), current_cred()); + unix_get_peersec_dgram(sock, scm); + if (msg->msg_controllen <= 0) + return 0; +diff --git a/include/net/sock.h b/include/net/sock.h +index 32e3937..ddf523c 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -194,6 +194,7 @@ struct sock_common { + * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) + * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) + * @sk_gso_max_size: Maximum GSO segment size to build ++ * @sk_gso_max_segs: Maximum number of GSO segments + * @sk_lingertime: %SO_LINGER l_linger setting + * @sk_backlog: always used with the per-socket spinlock held + * @sk_callback_lock: used with the callbacks in the end of this struct +@@ -310,6 +311,7 @@ struct sock { + int sk_route_nocaps; + int sk_gso_type; + unsigned int sk_gso_max_size; ++ u16 sk_gso_max_segs; + int sk_rcvlowat; + unsigned long sk_lingertime; + struct sk_buff_head sk_error_queue; +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 58690af..7d1f05e 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -3011,12 +3011,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); + /* + * Called when the last reference to the file is gone. 
+ */ +-static int perf_release(struct inode *inode, struct file *file) ++static void put_event(struct perf_event *event) + { +- struct perf_event *event = file->private_data; + struct task_struct *owner; + +- file->private_data = NULL; ++ if (!atomic_long_dec_and_test(&event->refcount)) ++ return; + + rcu_read_lock(); + owner = ACCESS_ONCE(event->owner); +@@ -3051,7 +3051,13 @@ static int perf_release(struct inode *inode, struct file *file) + put_task_struct(owner); + } + +- return perf_event_release_kernel(event); ++ perf_event_release_kernel(event); ++} ++ ++static int perf_release(struct inode *inode, struct file *file) ++{ ++ put_event(file->private_data); ++ return 0; + } + + u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) +@@ -3304,7 +3310,7 @@ unlock: + + static const struct file_operations perf_fops; + +-static struct perf_event *perf_fget_light(int fd, int *fput_needed) ++static struct file *perf_fget_light(int fd, int *fput_needed) + { + struct file *file; + +@@ -3318,7 +3324,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed) + return ERR_PTR(-EBADF); + } + +- return file->private_data; ++ return file; + } + + static int perf_event_set_output(struct perf_event *event, +@@ -3350,19 +3356,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) + + case PERF_EVENT_IOC_SET_OUTPUT: + { ++ struct file *output_file = NULL; + struct perf_event *output_event = NULL; + int fput_needed = 0; + int ret; + + if (arg != -1) { +- output_event = perf_fget_light(arg, &fput_needed); +- if (IS_ERR(output_event)) +- return PTR_ERR(output_event); ++ output_file = perf_fget_light(arg, &fput_needed); ++ if (IS_ERR(output_file)) ++ return PTR_ERR(output_file); ++ output_event = output_file->private_data; + } + + ret = perf_event_set_output(event, output_event); + if (output_event) +- fput_light(output_event->filp, fput_needed); ++ fput_light(output_file, fput_needed); + + return ret; + } +@@ -5912,6 +5920,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, + + mutex_init(&event->mmap_mutex); + ++ atomic_long_set(&event->refcount, 1); + event->cpu = cpu; + event->attr = *attr; + event->group_leader = group_leader; +@@ -6182,12 +6191,12 @@ SYSCALL_DEFINE5(perf_event_open, + return event_fd; + + if (group_fd != -1) { +- group_leader = perf_fget_light(group_fd, &fput_needed); +- if (IS_ERR(group_leader)) { +- err = PTR_ERR(group_leader); ++ group_file = perf_fget_light(group_fd, &fput_needed); ++ if (IS_ERR(group_file)) { ++ err = PTR_ERR(group_file); + goto err_fd; + } +- group_file = group_leader->filp; ++ group_leader = group_file->private_data; + if (flags & PERF_FLAG_FD_OUTPUT) + output_event = group_leader; + if (flags & PERF_FLAG_FD_NO_GROUP) +@@ -6322,7 +6331,6 @@ SYSCALL_DEFINE5(perf_event_open, + put_ctx(gctx); + } + +- event->filp = event_file; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + +@@ -6412,7 +6420,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, + goto err_free; + } + +- event->filp = NULL; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); +@@ -6461,7 +6468,7 @@ static void sync_child_event(struct perf_event *child_event, + * Release the parent event, if this was the last + * reference to it. 
+ */ +- fput(parent_event->filp); ++ put_event(parent_event); + } + + static void +@@ -6537,9 +6544,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) + * + * __perf_event_exit_task() + * sync_child_event() +- * fput(parent_event->filp) +- * perf_release() +- * mutex_lock(&ctx->mutex) ++ * put_event() ++ * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ +@@ -6607,7 +6613,7 @@ static void perf_free_event(struct perf_event *event, + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + +- fput(parent->filp); ++ put_event(parent); + + perf_group_detach(event); + list_del_event(event, ctx); +@@ -6687,6 +6693,12 @@ inherit_event(struct perf_event *parent_event, + NULL, NULL); + if (IS_ERR(child_event)) + return child_event; ++ ++ if (!atomic_long_inc_not_zero(&parent_event->refcount)) { ++ free_event(child_event); ++ return NULL; ++ } ++ + get_ctx(child_ctx); + + /* +@@ -6728,14 +6740,6 @@ inherit_event(struct perf_event *parent_event, + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* +- * Get a reference to the parent filp - we will fput it +- * when the child event exits. This is safe to do because +- * we are in the parent and we know that the filp still +- * exists and has a nonzero count: +- */ +- atomic_long_inc(&parent_event->filp->f_count); +- +- /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); +diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c +index 03e67d4..5ee1ac0 100644 +--- a/kernel/time/timekeeping.c ++++ b/kernel/time/timekeeping.c +@@ -382,7 +382,7 @@ int do_settimeofday(const struct timespec *tv) + struct timespec ts_delta; + unsigned long flags; + +- if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) ++ if (!timespec_valid_strict(tv)) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); +@@ -417,6 +417,8 @@ EXPORT_SYMBOL(do_settimeofday); + int timekeeping_inject_offset(struct timespec *ts) + { + unsigned long flags; ++ struct timespec tmp; ++ int ret = 0; + + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; +@@ -425,9 +427,16 @@ int timekeeping_inject_offset(struct timespec *ts) + + timekeeping_forward_now(); + ++ tmp = timespec_add(xtime, *ts); ++ if (!timespec_valid_strict(&tmp)) { ++ ret = -EINVAL; ++ goto error; ++ } ++ + xtime = timespec_add(xtime, *ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); + ++error: /* even if we error out, we forwarded the time, so call update */ + timekeeping_update(true); + + write_sequnlock_irqrestore(&xtime_lock, flags); +@@ -435,7 +444,7 @@ int timekeeping_inject_offset(struct timespec *ts) + /* signal hrtimers about time change */ + clock_was_set(); + +- return 0; ++ return ret; + } + EXPORT_SYMBOL(timekeeping_inject_offset); + +@@ -582,7 +591,20 @@ void __init timekeeping_init(void) + struct timespec now, boot; + + read_persistent_clock(&now); ++ if (!timespec_valid_strict(&now)) { ++ pr_warn("WARNING: Persistent clock returned invalid value!\n" ++ " Check your CMOS/BIOS settings.\n"); ++ now.tv_sec = 0; ++ now.tv_nsec = 0; ++ } ++ + read_boot_clock(&boot); ++ if (!timespec_valid_strict(&boot)) { ++ pr_warn("WARNING: Boot clock returned invalid value!\n" ++ " Check your CMOS/BIOS settings.\n"); ++ boot.tv_sec = 0; ++ boot.tv_nsec = 0; ++ } + + write_seqlock_irqsave(&xtime_lock, flags); + +@@ -627,7 +649,7 @@ static void update_sleep_time(struct timespec t) + */ + static void 
__timekeeping_inject_sleeptime(struct timespec *delta) + { +- if (!timespec_valid(delta)) { ++ if (!timespec_valid_strict(delta)) { + printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " + "sleep delta value!\n"); + return; +@@ -1011,6 +1033,10 @@ static void update_wall_time(void) + #else + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; + #endif ++ /* Check if there's really nothing to do */ ++ if (offset < timekeeper.cycle_interval) ++ return; ++ + timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; + + /* +diff --git a/kernel/workqueue.c b/kernel/workqueue.c +index a650bee..979d4de 100644 +--- a/kernel/workqueue.c ++++ b/kernel/workqueue.c +@@ -3437,14 +3437,17 @@ static int __cpuinit trustee_thread(void *__gcwq) + + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; ++ unsigned long worker_flags = worker->flags; + + /* + * Rebind_work may race with future cpu hotplug + * operations. Use a separate flag to mark that +- * rebinding is scheduled. ++ * rebinding is scheduled. The morphing should ++ * be atomic. + */ +- worker->flags |= WORKER_REBIND; +- worker->flags &= ~WORKER_ROGUE; ++ worker_flags |= WORKER_REBIND; ++ worker_flags &= ~WORKER_ROGUE; ++ ACCESS_ONCE(worker->flags) = worker_flags; + + /* queue rebind_work, wq doesn't matter, use the default one */ + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, +diff --git a/mm/mempolicy.c b/mm/mempolicy.c +index c0007f9..11b8d47 100644 +--- a/mm/mempolicy.c ++++ b/mm/mempolicy.c +@@ -2533,7 +2533,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) + break; + + default: +- BUG(); ++ return -EINVAL; + } + + l = strlen(policy_modes[mode]); +diff --git a/net/atm/common.c b/net/atm/common.c +index 14ff9fe..0ca06e8 100644 +--- a/net/atm/common.c ++++ b/net/atm/common.c +@@ -784,6 +784,7 @@ int vcc_getsockopt(struct socket *sock, int level, int optname, + + if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags)) + return -ENOTCONN; ++ memset(&pvc, 0, sizeof(pvc)); + pvc.sap_family = AF_ATMPVC; + pvc.sap_addr.itf = vcc->dev->number; + pvc.sap_addr.vpi = vcc->vpi; +diff --git a/net/atm/pvc.c b/net/atm/pvc.c +index 3a73491..ae03240 100644 +--- a/net/atm/pvc.c ++++ b/net/atm/pvc.c +@@ -95,6 +95,7 @@ static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr, + return -ENOTCONN; + *sockaddr_len = sizeof(struct sockaddr_atmpvc); + addr = (struct sockaddr_atmpvc *)sockaddr; ++ memset(addr, 0, sizeof(*addr)); + addr->sap_family = AF_ATMPVC; + addr->sap_addr.itf = vcc->dev->number; + addr->sap_addr.vpi = vcc->vpi; +diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c +index f6afe3d..8361ee4 100644 +--- a/net/bluetooth/hci_sock.c ++++ b/net/bluetooth/hci_sock.c +@@ -388,6 +388,7 @@ static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, int *add + *addr_len = sizeof(*haddr); + haddr->hci_family = AF_BLUETOOTH; + haddr->hci_dev = hdev->id; ++ haddr->hci_channel= 0; + + release_sock(sk); + return 0; +@@ -671,6 +672,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char + { + struct hci_filter *f = &hci_pi(sk)->filter; + ++ memset(&uf, 0, sizeof(uf)); + uf.type_mask = f->type_mask; + uf.opcode = f->opcode; + uf.event_mask[0] = *((u32 *) f->event_mask + 0); +diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c +index 5c406d3..6dedd6f 100644 +--- a/net/bluetooth/l2cap_sock.c ++++ b/net/bluetooth/l2cap_sock.c +@@ -293,6 +293,7 @@ static int 
l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *l + + BT_DBG("sock %p, sk %p", sock, sk); + ++ memset(la, 0, sizeof(struct sockaddr_l2)); + addr->sa_family = AF_BLUETOOTH; + *len = sizeof(struct sockaddr_l2); + +diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c +index 5417f61..7ee4ead 100644 +--- a/net/bluetooth/rfcomm/sock.c ++++ b/net/bluetooth/rfcomm/sock.c +@@ -547,6 +547,7 @@ static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int * + + BT_DBG("sock %p, sk %p", sock, sk); + ++ memset(sa, 0, sizeof(*sa)); + sa->rc_family = AF_BLUETOOTH; + sa->rc_channel = rfcomm_pi(sk)->channel; + if (peer) +@@ -835,6 +836,7 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c + } + + sec.level = rfcomm_pi(sk)->sec_level; ++ sec.key_size = 0; + + len = min_t(unsigned int, len, sizeof(sec)); + if (copy_to_user(optval, (char *) &sec, len)) +diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c +index c258796..bc1eb56 100644 +--- a/net/bluetooth/rfcomm/tty.c ++++ b/net/bluetooth/rfcomm/tty.c +@@ -471,7 +471,7 @@ static int rfcomm_get_dev_list(void __user *arg) + + size = sizeof(*dl) + dev_num * sizeof(*di); + +- dl = kmalloc(size, GFP_KERNEL); ++ dl = kzalloc(size, GFP_KERNEL); + if (!dl) + return -ENOMEM; + +diff --git a/net/core/dev.c b/net/core/dev.c +index 4b18703..832ba6d 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -1059,6 +1059,8 @@ rollback: + */ + int dev_set_alias(struct net_device *dev, const char *alias, size_t len) + { ++ char *new_ifalias; ++ + ASSERT_RTNL(); + + if (len >= IFALIASZ) +@@ -1072,9 +1074,10 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) + return 0; + } + +- dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); +- if (!dev->ifalias) ++ new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); ++ if (!new_ifalias) + return -ENOMEM; ++ dev->ifalias = new_ifalias; + + strlcpy(dev->ifalias, alias, len+1); + return len; +@@ -1628,6 +1631,19 @@ static inline int deliver_skb(struct sk_buff *skb, + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + } + ++static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) ++{ ++ if (ptype->af_packet_priv == NULL) ++ return false; ++ ++ if (ptype->id_match) ++ return ptype->id_match(ptype, skb->sk); ++ else if ((struct sock *)ptype->af_packet_priv == skb->sk) ++ return true; ++ ++ return false; ++} ++ + /* + * Support routine. Sends outgoing frames to any network + * taps currently in use. 
+@@ -1645,8 +1661,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) + * they originated from - MvS (miquels@drinkel.ow.org) + */ + if ((ptype->dev == dev || !ptype->dev) && +- (ptype->af_packet_priv == NULL || +- (struct sock *)ptype->af_packet_priv != skb->sk)) { ++ (!skb_loop_sk(ptype, skb))) { + if (pt_prev) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; +@@ -2108,6 +2123,9 @@ u32 netif_skb_features(struct sk_buff *skb) + __be16 protocol = skb->protocol; + u32 features = skb->dev->features; + ++ if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) ++ features &= ~NETIF_F_GSO_MASK; ++ + if (protocol == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + protocol = veh->h_vlan_encapsulated_proto; +@@ -5990,6 +6008,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + dev_net_set(dev, &init_net); + + dev->gso_max_size = GSO_MAX_SIZE; ++ dev->gso_max_segs = GSO_MAX_SEGS; + + INIT_LIST_HEAD(&dev->napi_list); + INIT_LIST_HEAD(&dev->unreg_list); +diff --git a/net/core/sock.c b/net/core/sock.c +index 8d095b9..018fd41 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -1308,6 +1308,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) + } else { + sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; + sk->sk_gso_max_size = dst->dev->gso_max_size; ++ sk->sk_gso_max_segs = dst->dev->gso_max_segs; + } + } + } +diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c +index 3d604e1..4caf63f 100644 +--- a/net/dccp/ccids/ccid3.c ++++ b/net/dccp/ccids/ccid3.c +@@ -532,6 +532,7 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, + case DCCP_SOCKOPT_CCID_TX_INFO: + if (len < sizeof(tfrc)) + return -EINVAL; ++ memset(&tfrc, 0, sizeof(tfrc)); + tfrc.tfrctx_x = hc->tx_x; + tfrc.tfrctx_x_recv = hc->tx_x_recv; + tfrc.tfrctx_x_calc = hc->tx_x_calc; +diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c +index d2aae27..0064394 100644 +--- a/net/ipv4/ipmr.c ++++ b/net/ipv4/ipmr.c +@@ -125,6 +125,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock); + static struct kmem_cache *mrt_cachep __read_mostly; + + static struct mr_table *ipmr_new_table(struct net *net, u32 id); ++static void ipmr_free_table(struct mr_table *mrt); ++ + static int ip_mr_forward(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, struct mfc_cache *cache, + int local); +@@ -132,6 +134,7 @@ static int ipmr_cache_report(struct mr_table *mrt, + struct sk_buff *pkt, vifi_t vifi, int assert); + static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mfc_cache *c, struct rtmsg *rtm); ++static void mroute_clean_tables(struct mr_table *mrt); + static void ipmr_expire_process(unsigned long arg); + + #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES +@@ -272,7 +275,7 @@ static void __net_exit ipmr_rules_exit(struct net *net) + + list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { + list_del(&mrt->list); +- kfree(mrt); ++ ipmr_free_table(mrt); + } + fib_rules_unregister(net->ipv4.mr_rules_ops); + } +@@ -300,7 +303,7 @@ static int __net_init ipmr_rules_init(struct net *net) + + static void __net_exit ipmr_rules_exit(struct net *net) + { +- kfree(net->ipv4.mrt); ++ ipmr_free_table(net->ipv4.mrt); + } + #endif + +@@ -337,6 +340,13 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) + return mrt; + } + ++static void ipmr_free_table(struct mr_table *mrt) ++{ ++ del_timer_sync(&mrt->ipmr_expire_timer); ++ mroute_clean_tables(mrt); ++ kfree(mrt); ++} ++ + /* Service 
routines creating virtual interfaces: DVMRP tunnels and PIMREG */ + + static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index ad466a7..043d49b 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -740,7 +740,9 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, + old_size_goal + mss_now > xmit_size_goal)) { + xmit_size_goal = old_size_goal; + } else { +- tp->xmit_size_goal_segs = xmit_size_goal / mss_now; ++ tp->xmit_size_goal_segs = ++ min_t(u16, xmit_size_goal / mss_now, ++ sk->sk_gso_max_segs); + xmit_size_goal = tp->xmit_size_goal_segs * mss_now; + } + } +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index 850c737..6cebfd2 100644 +--- a/net/ipv4/tcp_cong.c ++++ b/net/ipv4/tcp_cong.c +@@ -290,7 +290,8 @@ int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) + left = tp->snd_cwnd - in_flight; + if (sk_can_gso(sk) && + left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && +- left * tp->mss_cache < sk->sk_gso_max_size) ++ left * tp->mss_cache < sk->sk_gso_max_size && ++ left < sk->sk_gso_max_segs) + return 1; + return left <= tcp_max_burst(tp); + } +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index c51dd5b..921cbac 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -1318,21 +1318,21 @@ static void tcp_cwnd_validate(struct sock *sk) + * when we would be allowed to send the split-due-to-Nagle skb fully. + */ + static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, +- unsigned int mss_now, unsigned int cwnd) ++ unsigned int mss_now, unsigned int max_segs) + { + const struct tcp_sock *tp = tcp_sk(sk); +- u32 needed, window, cwnd_len; ++ u32 needed, window, max_len; + + window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; +- cwnd_len = mss_now * cwnd; ++ max_len = mss_now * max_segs; + +- if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) +- return cwnd_len; ++ if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) ++ return max_len; + + needed = min(skb->len, window); + +- if (cwnd_len <= needed) +- return cwnd_len; ++ if (max_len <= needed) ++ return max_len; + + return needed - needed % mss_now; + } +@@ -1560,7 +1560,8 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) + limit = min(send_win, cong_win); + + /* If a full-sized TSO skb can be sent, do it. */ +- if (limit >= sk->sk_gso_max_size) ++ if (limit >= min_t(unsigned int, sk->sk_gso_max_size, ++ sk->sk_gso_max_segs * tp->mss_cache)) + goto send_now; + + /* Middle in queue won't get any more data, full sendable already? 
*/ +@@ -1786,7 +1787,9 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + limit = mss_now; + if (tso_segs > 1 && !tcp_urg_mode(tp)) + limit = tcp_mss_split_point(sk, skb, mss_now, +- cwnd_quota); ++ min_t(unsigned int, ++ cwnd_quota, ++ sk->sk_gso_max_segs)); + + if (skb->len > limit && + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) +diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c +index a5521c5..aef80d7 100644 +--- a/net/ipv6/addrconf.c ++++ b/net/ipv6/addrconf.c +@@ -493,8 +493,7 @@ static void addrconf_forward_change(struct net *net, __s32 newf) + struct net_device *dev; + struct inet6_dev *idev; + +- rcu_read_lock(); +- for_each_netdev_rcu(net, dev) { ++ for_each_netdev(net, dev) { + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.forwarding) ^ (!newf); +@@ -503,7 +502,6 @@ static void addrconf_forward_change(struct net *net, __s32 newf) + dev_forward_change(idev); + } + } +- rcu_read_unlock(); + } + + static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) +diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c +index 89ff8c6..7501b22 100644 +--- a/net/l2tp/l2tp_core.c ++++ b/net/l2tp/l2tp_core.c +@@ -1253,11 +1253,10 @@ static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) + /* Remove from tunnel list */ + spin_lock_bh(&pn->l2tp_tunnel_list_lock); + list_del_rcu(&tunnel->list); ++ kfree_rcu(tunnel, rcu); + spin_unlock_bh(&pn->l2tp_tunnel_list_lock); +- synchronize_rcu(); + + atomic_dec(&l2tp_tunnel_count); +- kfree(tunnel); + } + + /* Create a socket for the tunnel, if one isn't set up by +diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h +index a16a48e..4393794 100644 +--- a/net/l2tp/l2tp_core.h ++++ b/net/l2tp/l2tp_core.h +@@ -157,6 +157,7 @@ struct l2tp_tunnel_cfg { + + struct l2tp_tunnel { + int magic; /* Should be L2TP_TUNNEL_MAGIC */ ++ struct rcu_head rcu; + rwlock_t hlist_lock; /* protect session_hlist */ + struct hlist_head session_hlist[L2TP_HASH_SIZE]; + /* hashed list of sessions, +diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c +index a18e6c3..99a60d5 100644 +--- a/net/llc/af_llc.c ++++ b/net/llc/af_llc.c +@@ -966,14 +966,13 @@ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, + struct sockaddr_llc sllc; + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); +- int rc = 0; ++ int rc = -EBADF; + + memset(&sllc, 0, sizeof(sllc)); + lock_sock(sk); + if (sock_flag(sk, SOCK_ZAPPED)) + goto out; + *uaddrlen = sizeof(sllc); +- memset(uaddr, 0, *uaddrlen); + if (peer) { + rc = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) +diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c +index e1a66cf..72f4253 100644 +--- a/net/netfilter/ipvs/ip_vs_ctl.c ++++ b/net/netfilter/ipvs/ip_vs_ctl.c +@@ -2713,6 +2713,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) + { + struct ip_vs_timeout_user t; + ++ memset(&t, 0, sizeof(t)); + __ip_vs_get_timeouts(net, &t); + if (copy_to_user(user, &t, sizeof(t)) != 0) + ret = -EFAULT; +diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c +index a99fb41..38b78b9 100644 +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -1333,7 +1333,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, + if (NULL == siocb->scm) + siocb->scm = &scm; + +- err = scm_send(sock, msg, siocb->scm); ++ err = scm_send(sock, msg, siocb->scm, true); + if (err < 0) + return err; + +@@ -1344,7 +1344,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, 
struct socket *sock, + dst_pid = addr->nl_pid; + dst_group = ffs(addr->nl_groups); + err = -EPERM; +- if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) ++ if ((dst_group || dst_pid) && ++ !netlink_capable(sock, NL_NONROOT_SEND)) + goto out; + } else { + dst_pid = nlk->dst_pid; +@@ -2103,6 +2104,7 @@ static void __init netlink_add_usersock_entry(void) + rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); + nl_table[NETLINK_USERSOCK].module = THIS_MODULE; + nl_table[NETLINK_USERSOCK].registered = 1; ++ nl_table[NETLINK_USERSOCK].nl_nonroot = NL_NONROOT_SEND; + + netlink_table_ungrab(); + } +diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c +index d9d4970..85afc13 100644 +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -1281,6 +1281,14 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) + spin_unlock(&f->lock); + } + ++bool match_fanout_group(struct packet_type *ptype, struct sock * sk) ++{ ++ if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout) ++ return true; ++ ++ return false; ++} ++ + static int fanout_add(struct sock *sk, u16 id, u16 type_flags) + { + struct packet_sock *po = pkt_sk(sk); +@@ -1333,6 +1341,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) + match->prot_hook.dev = po->prot_hook.dev; + match->prot_hook.func = packet_rcv_fanout; + match->prot_hook.af_packet_priv = match; ++ match->prot_hook.id_match = match_fanout_group; + dev_add_pack(&match->prot_hook); + list_add(&match->list, &fanout_list); + } +@@ -1931,7 +1940,6 @@ static void tpacket_destruct_skb(struct sk_buff *skb) + + if (likely(po->tx_ring.pg_vec)) { + ph = skb_shinfo(skb)->destructor_arg; +- BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING); + BUG_ON(atomic_read(&po->tx_ring.pending) == 0); + atomic_dec(&po->tx_ring.pending); + __packet_set_status(po, ph, TP_STATUS_AVAILABLE); +diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c +index b77f5a0..bdacd8d 100644 +--- a/net/sched/act_gact.c ++++ b/net/sched/act_gact.c +@@ -67,6 +67,9 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, + struct tcf_common *pc; + int ret = 0; + int err; ++#ifdef CONFIG_GACT_PROB ++ struct tc_gact_p *p_parm = NULL; ++#endif + + if (nla == NULL) + return -EINVAL; +@@ -82,6 +85,12 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, + #ifndef CONFIG_GACT_PROB + if (tb[TCA_GACT_PROB] != NULL) + return -EOPNOTSUPP; ++#else ++ if (tb[TCA_GACT_PROB]) { ++ p_parm = nla_data(tb[TCA_GACT_PROB]); ++ if (p_parm->ptype >= MAX_RAND) ++ return -EINVAL; ++ } + #endif + + pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info); +@@ -103,8 +112,7 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, + spin_lock_bh(&gact->tcf_lock); + gact->tcf_action = parm->action; + #ifdef CONFIG_GACT_PROB +- if (tb[TCA_GACT_PROB] != NULL) { +- struct tc_gact_p *p_parm = nla_data(tb[TCA_GACT_PROB]); ++ if (p_parm) { + gact->tcfg_paction = p_parm->paction; + gact->tcfg_pval = p_parm->pval; + gact->tcfg_ptype = p_parm->ptype; +@@ -133,7 +141,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, + + spin_lock(&gact->tcf_lock); + #ifdef CONFIG_GACT_PROB +- if (gact->tcfg_ptype && gact_rand[gact->tcfg_ptype] != NULL) ++ if (gact->tcfg_ptype) + action = gact_rand[gact->tcfg_ptype](gact); + else + action = gact->tcf_action; +diff --git a/net/socket.c b/net/socket.c +index 273cbce..68879db 100644 +--- a/net/socket.c ++++ b/net/socket.c +@@ -2645,6 +2645,7 @@ static int dev_ifconf(struct net 
*net, struct compat_ifconf __user *uifc32) + if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf))) + return -EFAULT; + ++ memset(&ifc, 0, sizeof(ifc)); + if (ifc32.ifcbuf == 0) { + ifc32.ifc_len = 0; + ifc.ifc_len = 0; +diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c +index 3ac9789..ffba207 100644 +--- a/net/sunrpc/xprt.c ++++ b/net/sunrpc/xprt.c +@@ -962,11 +962,11 @@ static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) + return false; + } + +-static void xprt_alloc_slot(struct rpc_task *task) ++void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) + { +- struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_rqst *req; + ++ spin_lock(&xprt->reserve_lock); + if (!list_empty(&xprt->free)) { + req = list_entry(xprt->free.next, struct rpc_rqst, rq_list); + list_del(&req->rq_list); +@@ -987,12 +987,29 @@ static void xprt_alloc_slot(struct rpc_task *task) + default: + task->tk_status = -EAGAIN; + } ++ spin_unlock(&xprt->reserve_lock); + return; + out_init_req: + task->tk_status = 0; + task->tk_rqstp = req; + xprt_request_init(task, xprt); ++ spin_unlock(&xprt->reserve_lock); ++} ++EXPORT_SYMBOL_GPL(xprt_alloc_slot); ++ ++void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) ++{ ++ /* Note: grabbing the xprt_lock_write() ensures that we throttle ++ * new slot allocation if the transport is congested (i.e. when ++ * reconnecting a stream transport or when out of socket write ++ * buffer space). ++ */ ++ if (xprt_lock_write(xprt, task)) { ++ xprt_alloc_slot(xprt, task); ++ xprt_release_write(xprt, task); ++ } + } ++EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot); + + static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) + { +@@ -1076,20 +1093,9 @@ void xprt_reserve(struct rpc_task *task) + if (task->tk_rqstp != NULL) + return; + +- /* Note: grabbing the xprt_lock_write() here is not strictly needed, +- * but ensures that we throttle new slot allocation if the transport +- * is congested (e.g. if reconnecting or if we're out of socket +- * write buffer space). 
+- */ + task->tk_timeout = 0; + task->tk_status = -EAGAIN; +- if (!xprt_lock_write(xprt, task)) +- return; +- +- spin_lock(&xprt->reserve_lock); +- xprt_alloc_slot(task); +- spin_unlock(&xprt->reserve_lock); +- xprt_release_write(xprt, task); ++ xprt->ops->alloc_slot(xprt, task); + } + + static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) +diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c +index 06cdbff..5d9202d 100644 +--- a/net/sunrpc/xprtrdma/transport.c ++++ b/net/sunrpc/xprtrdma/transport.c +@@ -713,6 +713,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) + static struct rpc_xprt_ops xprt_rdma_procs = { + .reserve_xprt = xprt_rdma_reserve_xprt, + .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ ++ .alloc_slot = xprt_alloc_slot, + .release_request = xprt_release_rqst_cong, /* ditto */ + .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ + .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ +diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c +index 1a6edc7..c5391af 100644 +--- a/net/sunrpc/xprtsock.c ++++ b/net/sunrpc/xprtsock.c +@@ -2422,6 +2422,7 @@ static void bc_destroy(struct rpc_xprt *xprt) + static struct rpc_xprt_ops xs_local_ops = { + .reserve_xprt = xprt_reserve_xprt, + .release_xprt = xs_tcp_release_xprt, ++ .alloc_slot = xprt_alloc_slot, + .rpcbind = xs_local_rpcbind, + .set_port = xs_local_set_port, + .connect = xs_connect, +@@ -2438,6 +2439,7 @@ static struct rpc_xprt_ops xs_udp_ops = { + .set_buffer_size = xs_udp_set_buffer_size, + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, ++ .alloc_slot = xprt_alloc_slot, + .rpcbind = rpcb_getport_async, + .set_port = xs_set_port, + .connect = xs_connect, +@@ -2455,6 +2457,7 @@ static struct rpc_xprt_ops xs_udp_ops = { + static struct rpc_xprt_ops xs_tcp_ops = { + .reserve_xprt = xprt_reserve_xprt, + .release_xprt = xs_tcp_release_xprt, ++ .alloc_slot = xprt_lock_and_alloc_slot, + .rpcbind = rpcb_getport_async, + .set_port = xs_set_port, + .connect = xs_connect, +diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c +index d99678a..317bfe3 100644 +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -1435,7 +1435,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, + if (NULL == siocb->scm) + siocb->scm = &tmp_scm; + wait_for_unix_gc(); +- err = scm_send(sock, msg, siocb->scm); ++ err = scm_send(sock, msg, siocb->scm, false); + if (err < 0) + return err; + +@@ -1596,7 +1596,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, + if (NULL == siocb->scm) + siocb->scm = &tmp_scm; + wait_for_unix_gc(); +- err = scm_send(sock, msg, siocb->scm); ++ err = scm_send(sock, msg, siocb->scm, false); + if (err < 0) + return err; + +diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c +index f3be54e..b0187e7 100644 +--- a/sound/pci/hda/hda_codec.c ++++ b/sound/pci/hda/hda_codec.c +@@ -2312,6 +2312,7 @@ int snd_hda_codec_reset(struct hda_codec *codec) + } + if (codec->patch_ops.free) + codec->patch_ops.free(codec); ++ memset(&codec->patch_ops, 0, sizeof(codec->patch_ops)); + codec->proc_widget_hook = NULL; + codec->spec = NULL; + free_hda_cache(&codec->amp_cache); +@@ -2324,7 +2325,6 @@ int snd_hda_codec_reset(struct hda_codec *codec) + codec->num_pcms = 0; + codec->pcm_info = NULL; + codec->preset = NULL; +- memset(&codec->patch_ops, 0, sizeof(codec->patch_ops)); + codec->slave_dig_outs = NULL; + codec->spdif_status_reset = 0; + 
module_put(codec->owner); +diff --git a/sound/pci/ice1712/prodigy_hifi.c b/sound/pci/ice1712/prodigy_hifi.c +index 764cc93..075d5aa 100644 +--- a/sound/pci/ice1712/prodigy_hifi.c ++++ b/sound/pci/ice1712/prodigy_hifi.c +@@ -297,6 +297,7 @@ static int ak4396_dac_vol_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem + } + + static const DECLARE_TLV_DB_SCALE(db_scale_wm_dac, -12700, 100, 1); ++static const DECLARE_TLV_DB_LINEAR(ak4396_db_scale, TLV_DB_GAIN_MUTE, 0); + + static struct snd_kcontrol_new prodigy_hd2_controls[] __devinitdata = { + { +@@ -307,7 +308,7 @@ static struct snd_kcontrol_new prodigy_hd2_controls[] __devinitdata = { + .info = ak4396_dac_vol_info, + .get = ak4396_dac_vol_get, + .put = ak4396_dac_vol_put, +- .tlv = { .p = db_scale_wm_dac }, ++ .tlv = { .p = ak4396_db_scale }, + }, + }; + diff --git a/3.2.34/bump/1030_linux-3.2.31.patch b/3.2.34/bump/1030_linux-3.2.31.patch new file mode 100644 index 0000000..c6accf5 --- /dev/null +++ b/3.2.34/bump/1030_linux-3.2.31.patch @@ -0,0 +1,3327 @@ +diff --git a/Makefile b/Makefile +index 9fd7e60..fd9c414 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 2 +-SUBLEVEL = 30 ++SUBLEVEL = 31 + EXTRAVERSION = + NAME = Saber-toothed Squirrel + +diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S +index 9c18ebd..d63632f 100644 +--- a/arch/arm/boot/compressed/head.S ++++ b/arch/arm/boot/compressed/head.S +@@ -648,6 +648,7 @@ __armv7_mmu_cache_on: + mcrne p15, 0, r0, c8, c7, 0 @ flush I,D TLBs + #endif + mrc p15, 0, r0, c1, c0, 0 @ read control reg ++ bic r0, r0, #1 << 28 @ clear SCTLR.TRE + orr r0, r0, #0x5000 @ I-cache enable, RR cache replacement + orr r0, r0, #0x003c @ write buffer + #ifdef CONFIG_MMU +diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c +index 73ef56c..bda833c 100644 +--- a/arch/x86/kernel/alternative.c ++++ b/arch/x86/kernel/alternative.c +@@ -160,7 +160,7 @@ static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = + #endif + + #ifdef P6_NOP1 +-static const unsigned char __initconst_or_module p6nops[] = ++static const unsigned char p6nops[] = + { + P6_NOP1, + P6_NOP2, +diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c +index 44d4393..a1e21ae 100644 +--- a/arch/x86/xen/enlighten.c ++++ b/arch/x86/xen/enlighten.c +@@ -1289,6 +1289,10 @@ asmlinkage void __init xen_start_kernel(void) + + /* Make sure ACS will be enabled */ + pci_request_acs(); ++ ++ /* Avoid searching for BIOS MP tables */ ++ x86_init.mpparse.find_smp_config = x86_init_noop; ++ x86_init.mpparse.get_smp_config = x86_init_uint_noop; + } + #ifdef CONFIG_PCI + /* PCI BIOS service won't work from a PV guest. 
*/ +diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c +index bb104b4..6e5a7f1 100644 +--- a/arch/x86/xen/setup.c ++++ b/arch/x86/xen/setup.c +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -431,4 +432,7 @@ void __init xen_arch_setup(void) + boot_option_idle_override = IDLE_HALT; + WARN_ON(set_pm_idle_to_default()); + fiddle_vdso(); ++#ifdef CONFIG_NUMA ++ numa_off = 1; ++#endif + } +diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c +index c04ad68..321e23e 100644 +--- a/drivers/ata/libata-core.c ++++ b/drivers/ata/libata-core.c +@@ -4118,6 +4118,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { + + /* Devices which aren't very happy with higher link speeds */ + { "WD My Book", NULL, ATA_HORKAGE_1_5_GBPS, }, ++ { "Seagate FreeAgent GoFlex", NULL, ATA_HORKAGE_1_5_GBPS, }, + + /* + * Devices which choke on SETXFER. Applies only if both the +diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c +index de0435e..887f68f 100644 +--- a/drivers/block/aoe/aoecmd.c ++++ b/drivers/block/aoe/aoecmd.c +@@ -35,6 +35,7 @@ new_skb(ulong len) + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb->protocol = __constant_htons(ETH_P_AOE); ++ skb_checksum_none_assert(skb); + } + return skb; + } +diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c +index 38aa6dd..da33111 100644 +--- a/drivers/block/cciss_scsi.c ++++ b/drivers/block/cciss_scsi.c +@@ -795,6 +795,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout, + } + break; + case CMD_PROTOCOL_ERR: ++ cmd->result = DID_ERROR << 16; + dev_warn(&h->pdev->dev, + "%p has protocol error\n", c); + break; +diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c +index c3f0ee1..86848c6 100644 +--- a/drivers/block/nbd.c ++++ b/drivers/block/nbd.c +@@ -445,6 +445,14 @@ static void nbd_clear_que(struct nbd_device *lo) + req->errors++; + nbd_end_request(req); + } ++ ++ while (!list_empty(&lo->waiting_queue)) { ++ req = list_entry(lo->waiting_queue.next, struct request, ++ queuelist); ++ list_del_init(&req->queuelist); ++ req->errors++; ++ nbd_end_request(req); ++ } + } + + +@@ -594,6 +602,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, + lo->file = NULL; + nbd_clear_que(lo); + BUG_ON(!list_empty(&lo->queue_head)); ++ BUG_ON(!list_empty(&lo->waiting_queue)); + if (file) + fput(file); + return 0; +diff --git a/drivers/bluetooth/ath3k.c b/drivers/bluetooth/ath3k.c +index f1bd44f..5c6709d 100644 +--- a/drivers/bluetooth/ath3k.c ++++ b/drivers/bluetooth/ath3k.c +@@ -62,6 +62,7 @@ static struct usb_device_id ath3k_table[] = { + + /* Atheros AR3011 with sflash firmware*/ + { USB_DEVICE(0x0CF3, 0x3002) }, ++ { USB_DEVICE(0x0CF3, 0xE019) }, + { USB_DEVICE(0x13d3, 0x3304) }, + { USB_DEVICE(0x0930, 0x0215) }, + { USB_DEVICE(0x0489, 0xE03D) }, +@@ -76,12 +77,15 @@ static struct usb_device_id ath3k_table[] = { + { USB_DEVICE(0x04CA, 0x3005) }, + { USB_DEVICE(0x13d3, 0x3362) }, + { USB_DEVICE(0x0CF3, 0xE004) }, ++ { USB_DEVICE(0x0930, 0x0219) }, ++ { USB_DEVICE(0x0489, 0xe057) }, + + /* Atheros AR5BBU12 with sflash firmware */ + { USB_DEVICE(0x0489, 0xE02C) }, + + /* Atheros AR5BBU22 with sflash firmware */ + { USB_DEVICE(0x0489, 0xE03C) }, ++ { USB_DEVICE(0x0489, 0xE036) }, + + { } /* Terminating entry */ + }; +@@ -100,9 +104,12 @@ static struct usb_device_id ath3k_blist_tbl[] = { + { USB_DEVICE(0x04ca, 0x3005), .driver_info = BTUSB_ATH3012 }, + { USB_DEVICE(0x13d3, 0x3362), .driver_info = BTUSB_ATH3012 
}, + { USB_DEVICE(0x0cf3, 0xe004), .driver_info = BTUSB_ATH3012 }, ++ { USB_DEVICE(0x0930, 0x0219), .driver_info = BTUSB_ATH3012 }, ++ { USB_DEVICE(0x0489, 0xe057), .driver_info = BTUSB_ATH3012 }, + + /* Atheros AR5BBU22 with sflash firmware */ + { USB_DEVICE(0x0489, 0xE03C), .driver_info = BTUSB_ATH3012 }, ++ { USB_DEVICE(0x0489, 0xE036), .driver_info = BTUSB_ATH3012 }, + + { } /* Terminating entry */ + }; +diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c +index fc4bcd6..6f95d98 100644 +--- a/drivers/bluetooth/btusb.c ++++ b/drivers/bluetooth/btusb.c +@@ -60,6 +60,9 @@ static struct usb_device_id btusb_table[] = { + /* Generic Bluetooth USB device */ + { USB_DEVICE_INFO(0xe0, 0x01, 0x01) }, + ++ /* Apple-specific (Broadcom) devices */ ++ { USB_VENDOR_AND_INTERFACE_INFO(0x05ac, 0xff, 0x01, 0x01) }, ++ + /* Broadcom SoftSailing reporting vendor specific */ + { USB_DEVICE(0x0a5c, 0x21e1) }, + +@@ -102,15 +105,14 @@ static struct usb_device_id btusb_table[] = { + + /* Broadcom BCM20702A0 */ + { USB_DEVICE(0x0489, 0xe042) }, +- { USB_DEVICE(0x0a5c, 0x21e3) }, +- { USB_DEVICE(0x0a5c, 0x21e6) }, +- { USB_DEVICE(0x0a5c, 0x21e8) }, +- { USB_DEVICE(0x0a5c, 0x21f3) }, + { USB_DEVICE(0x413c, 0x8197) }, + + /* Foxconn - Hon Hai */ + { USB_DEVICE(0x0489, 0xe033) }, + ++ /*Broadcom devices with vendor specific id */ ++ { USB_VENDOR_AND_INTERFACE_INFO(0x0a5c, 0xff, 0x01, 0x01) }, ++ + { } /* Terminating entry */ + }; + +@@ -125,6 +127,7 @@ static struct usb_device_id blacklist_table[] = { + + /* Atheros 3011 with sflash firmware */ + { USB_DEVICE(0x0cf3, 0x3002), .driver_info = BTUSB_IGNORE }, ++ { USB_DEVICE(0x0cf3, 0xe019), .driver_info = BTUSB_IGNORE }, + { USB_DEVICE(0x13d3, 0x3304), .driver_info = BTUSB_IGNORE }, + { USB_DEVICE(0x0930, 0x0215), .driver_info = BTUSB_IGNORE }, + { USB_DEVICE(0x0489, 0xe03d), .driver_info = BTUSB_IGNORE }, +@@ -139,12 +142,15 @@ static struct usb_device_id blacklist_table[] = { + { USB_DEVICE(0x04ca, 0x3005), .driver_info = BTUSB_ATH3012 }, + { USB_DEVICE(0x13d3, 0x3362), .driver_info = BTUSB_ATH3012 }, + { USB_DEVICE(0x0cf3, 0xe004), .driver_info = BTUSB_ATH3012 }, ++ { USB_DEVICE(0x0930, 0x0219), .driver_info = BTUSB_ATH3012 }, ++ { USB_DEVICE(0x0489, 0xe057), .driver_info = BTUSB_ATH3012 }, + + /* Atheros AR5BBU12 with sflash firmware */ + { USB_DEVICE(0x0489, 0xe02c), .driver_info = BTUSB_IGNORE }, + + /* Atheros AR5BBU12 with sflash firmware */ + { USB_DEVICE(0x0489, 0xe03c), .driver_info = BTUSB_ATH3012 }, ++ { USB_DEVICE(0x0489, 0xe036), .driver_info = BTUSB_ATH3012 }, + + /* Broadcom BCM2035 */ + { USB_DEVICE(0x0a5c, 0x2035), .driver_info = BTUSB_WRONG_SCO_MTU }, +diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c +index ad683ec..b7fe343 100644 +--- a/drivers/cpufreq/powernow-k8.c ++++ b/drivers/cpufreq/powernow-k8.c +@@ -32,7 +32,6 @@ + #include + #include + #include +-#include /* for current / set_cpus_allowed() */ + #include + #include + +@@ -1132,16 +1131,23 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, + return res; + } + +-/* Driver entry point to switch to the target frequency */ +-static int powernowk8_target(struct cpufreq_policy *pol, +- unsigned targfreq, unsigned relation) ++struct powernowk8_target_arg { ++ struct cpufreq_policy *pol; ++ unsigned targfreq; ++ unsigned relation; ++}; ++ ++static long powernowk8_target_fn(void *arg) + { +- cpumask_var_t oldmask; ++ struct powernowk8_target_arg *pta = arg; ++ struct cpufreq_policy *pol = pta->pol; ++ unsigned targfreq = 
pta->targfreq; ++ unsigned relation = pta->relation; + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); + u32 checkfid; + u32 checkvid; + unsigned int newstate; +- int ret = -EIO; ++ int ret; + + if (!data) + return -EINVAL; +@@ -1149,29 +1155,16 @@ static int powernowk8_target(struct cpufreq_policy *pol, + checkfid = data->currfid; + checkvid = data->currvid; + +- /* only run on specific CPU from here on. */ +- /* This is poor form: use a workqueue or smp_call_function_single */ +- if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) +- return -ENOMEM; +- +- cpumask_copy(oldmask, tsk_cpus_allowed(current)); +- set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); +- +- if (smp_processor_id() != pol->cpu) { +- printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); +- goto err_out; +- } +- + if (pending_bit_stuck()) { + printk(KERN_ERR PFX "failing targ, change pending bit set\n"); +- goto err_out; ++ return -EIO; + } + + pr_debug("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", + pol->cpu, targfreq, pol->min, pol->max, relation); + + if (query_current_values_with_pending_wait(data)) +- goto err_out; ++ return -EIO; + + if (cpu_family != CPU_HW_PSTATE) { + pr_debug("targ: curr fid 0x%x, vid 0x%x\n", +@@ -1189,7 +1182,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, + + if (cpufreq_frequency_table_target(pol, data->powernow_table, + targfreq, relation, &newstate)) +- goto err_out; ++ return -EIO; + + mutex_lock(&fidvid_mutex); + +@@ -1202,9 +1195,8 @@ static int powernowk8_target(struct cpufreq_policy *pol, + ret = transition_frequency_fidvid(data, newstate); + if (ret) { + printk(KERN_ERR PFX "transition frequency failed\n"); +- ret = 1; + mutex_unlock(&fidvid_mutex); +- goto err_out; ++ return 1; + } + mutex_unlock(&fidvid_mutex); + +@@ -1213,12 +1205,25 @@ static int powernowk8_target(struct cpufreq_policy *pol, + data->powernow_table[newstate].index); + else + pol->cur = find_khz_freq_from_fid(data->currfid); +- ret = 0; + +-err_out: +- set_cpus_allowed_ptr(current, oldmask); +- free_cpumask_var(oldmask); +- return ret; ++ return 0; ++} ++ ++/* Driver entry point to switch to the target frequency */ ++static int powernowk8_target(struct cpufreq_policy *pol, ++ unsigned targfreq, unsigned relation) ++{ ++ struct powernowk8_target_arg pta = { .pol = pol, .targfreq = targfreq, ++ .relation = relation }; ++ ++ /* ++ * Must run on @pol->cpu. cpufreq core is responsible for ensuring ++ * that we're bound to the current CPU and pol->cpu stays online. 
++ */ ++ if (smp_processor_id() == pol->cpu) ++ return powernowk8_target_fn(&pta); ++ else ++ return work_on_cpu(pol->cpu, powernowk8_target_fn, &pta); + } + + /* Driver entry point to verify the policy and range of frequencies */ +diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c +index 79dcf6e..c60d9c1 100644 +--- a/drivers/dma/at_hdmac.c ++++ b/drivers/dma/at_hdmac.c +@@ -678,7 +678,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, + flags); + + if (unlikely(!atslave || !sg_len)) { +- dev_dbg(chan2dev(chan), "prep_dma_memcpy: length is zero!\n"); ++ dev_dbg(chan2dev(chan), "prep_slave_sg: sg length is zero!\n"); + return NULL; + } + +@@ -706,6 +706,11 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, + + mem = sg_dma_address(sg); + len = sg_dma_len(sg); ++ if (unlikely(!len)) { ++ dev_dbg(chan2dev(chan), ++ "prep_slave_sg: sg(%d) data length is zero\n", i); ++ goto err; ++ } + mem_width = 2; + if (unlikely(mem & 3 || len & 3)) + mem_width = 0; +@@ -740,6 +745,11 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, + + mem = sg_dma_address(sg); + len = sg_dma_len(sg); ++ if (unlikely(!len)) { ++ dev_dbg(chan2dev(chan), ++ "prep_slave_sg: sg(%d) data length is zero\n", i); ++ goto err; ++ } + mem_width = 2; + if (unlikely(mem & 3 || len & 3)) + mem_width = 0; +@@ -773,6 +783,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, + + err_desc_get: + dev_err(chan2dev(chan), "not enough descriptors available\n"); ++err: + atc_desc_put(atchan, first); + return NULL; + } +diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c +index 57104147..e8eedb7 100644 +--- a/drivers/dma/pl330.c ++++ b/drivers/dma/pl330.c +@@ -858,6 +858,11 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) + /* Initialize channel parameters */ + num_chan = max(pdat ? 
pdat->nr_valid_peri : 0, (u8)pi->pcfg.num_chan); + pdmac->peripherals = kzalloc(num_chan * sizeof(*pch), GFP_KERNEL); ++ if (!pdmac->peripherals) { ++ ret = -ENOMEM; ++ dev_err(&adev->dev, "unable to allocate pdmac->peripherals\n"); ++ goto probe_err4; ++ } + + for (i = 0; i < num_chan; i++) { + pch = &pdmac->peripherals[i]; +diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c +index 0db57b5..da71881 100644 +--- a/drivers/edac/sb_edac.c ++++ b/drivers/edac/sb_edac.c +@@ -554,7 +554,8 @@ static int get_dimm_config(const struct mem_ctl_info *mci) + { + struct sbridge_pvt *pvt = mci->pvt_info; + struct csrow_info *csr; +- int i, j, banks, ranks, rows, cols, size, npages; ++ unsigned i, j, banks, ranks, rows, cols, npages; ++ u64 size; + int csrow = 0; + unsigned long last_page = 0; + u32 reg; +@@ -626,10 +627,10 @@ static int get_dimm_config(const struct mem_ctl_info *mci) + cols = numcol(mtr); + + /* DDR3 has 8 I/O banks */ +- size = (rows * cols * banks * ranks) >> (20 - 3); ++ size = ((u64)rows * cols * banks * ranks) >> (20 - 3); + npages = MiB_TO_PAGES(size); + +- debugf0("mc#%d: channel %d, dimm %d, %d Mb (%d pages) bank: %d, rank: %d, row: %#x, col: %#x\n", ++ debugf0("mc#%d: channel %d, dimm %d, %Ld Mb (%d pages) bank: %d, rank: %d, row: %#x, col: %#x\n", + pvt->sbridge_dev->mc, i, j, + size, npages, + banks, ranks, rows, cols); +diff --git a/drivers/gpio/gpio-lpc32xx.c b/drivers/gpio/gpio-lpc32xx.c +index 5b69480..2c40776 100644 +--- a/drivers/gpio/gpio-lpc32xx.c ++++ b/drivers/gpio/gpio-lpc32xx.c +@@ -295,6 +295,7 @@ static int lpc32xx_gpio_dir_output_p012(struct gpio_chip *chip, unsigned pin, + { + struct lpc32xx_gpio_chip *group = to_lpc32xx_gpio(chip); + ++ __set_gpio_level_p012(group, pin, value); + __set_gpio_dir_p012(group, pin, 0); + + return 0; +@@ -305,6 +306,7 @@ static int lpc32xx_gpio_dir_output_p3(struct gpio_chip *chip, unsigned pin, + { + struct lpc32xx_gpio_chip *group = to_lpc32xx_gpio(chip); + ++ __set_gpio_level_p3(group, pin, value); + __set_gpio_dir_p3(group, pin, 0); + + return 0; +@@ -313,6 +315,9 @@ static int lpc32xx_gpio_dir_output_p3(struct gpio_chip *chip, unsigned pin, + static int lpc32xx_gpio_dir_out_always(struct gpio_chip *chip, unsigned pin, + int value) + { ++ struct lpc32xx_gpio_chip *group = to_lpc32xx_gpio(chip); ++ ++ __set_gpo_level_p3(group, pin, value); + return 0; + } + +diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c +index 548a400..e48e01e 100644 +--- a/drivers/gpu/drm/i915/i915_gem.c ++++ b/drivers/gpu/drm/i915/i915_gem.c +@@ -3357,7 +3357,8 @@ i915_gem_object_pin(struct drm_i915_gem_object *obj, + struct drm_i915_private *dev_priv = dev->dev_private; + int ret; + +- BUG_ON(obj->pin_count == DRM_I915_GEM_OBJECT_MAX_PIN_COUNT); ++ if (WARN_ON(obj->pin_count == DRM_I915_GEM_OBJECT_MAX_PIN_COUNT)) ++ return -EBUSY; + WARN_ON(i915_verify_lists(dev)); + + if (obj->gtt_space != NULL) { +diff --git a/drivers/gpu/drm/i915/intel_hdmi.c b/drivers/gpu/drm/i915/intel_hdmi.c +index 9cd81ba..c2a64f4 100644 +--- a/drivers/gpu/drm/i915/intel_hdmi.c ++++ b/drivers/gpu/drm/i915/intel_hdmi.c +@@ -271,7 +271,7 @@ static void intel_hdmi_dpms(struct drm_encoder *encoder, int mode) + u32 temp; + u32 enable_bits = SDVO_ENABLE; + +- if (intel_hdmi->has_audio) ++ if (intel_hdmi->has_audio || mode != DRM_MODE_DPMS_ON) + enable_bits |= SDVO_AUDIO_ENABLE; + + temp = I915_READ(intel_hdmi->sdvox_reg); +diff --git a/drivers/gpu/drm/radeon/atombios_crtc.c b/drivers/gpu/drm/radeon/atombios_crtc.c +index ceffd20..a4011b0 100644 +--- 
a/drivers/gpu/drm/radeon/atombios_crtc.c ++++ b/drivers/gpu/drm/radeon/atombios_crtc.c +@@ -1446,98 +1446,14 @@ static void radeon_legacy_atom_fixup(struct drm_crtc *crtc) + } + } + +-/** +- * radeon_get_pll_use_mask - look up a mask of which pplls are in use +- * +- * @crtc: drm crtc +- * +- * Returns the mask of which PPLLs (Pixel PLLs) are in use. +- */ +-static u32 radeon_get_pll_use_mask(struct drm_crtc *crtc) +-{ +- struct drm_device *dev = crtc->dev; +- struct drm_crtc *test_crtc; +- struct radeon_crtc *radeon_test_crtc; +- u32 pll_in_use = 0; +- +- list_for_each_entry(test_crtc, &dev->mode_config.crtc_list, head) { +- if (crtc == test_crtc) +- continue; +- +- radeon_test_crtc = to_radeon_crtc(test_crtc); +- if (radeon_test_crtc->pll_id != ATOM_PPLL_INVALID) +- pll_in_use |= (1 << radeon_test_crtc->pll_id); +- } +- return pll_in_use; +-} +- +-/** +- * radeon_get_shared_dp_ppll - return the PPLL used by another crtc for DP +- * +- * @crtc: drm crtc +- * +- * Returns the PPLL (Pixel PLL) used by another crtc/encoder which is +- * also in DP mode. For DP, a single PPLL can be used for all DP +- * crtcs/encoders. +- */ +-static int radeon_get_shared_dp_ppll(struct drm_crtc *crtc) +-{ +- struct drm_device *dev = crtc->dev; +- struct drm_encoder *test_encoder; +- struct radeon_crtc *radeon_test_crtc; +- +- list_for_each_entry(test_encoder, &dev->mode_config.encoder_list, head) { +- if (test_encoder->crtc && (test_encoder->crtc != crtc)) { +- if (ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_encoder))) { +- /* for DP use the same PLL for all */ +- radeon_test_crtc = to_radeon_crtc(test_encoder->crtc); +- if (radeon_test_crtc->pll_id != ATOM_PPLL_INVALID) +- return radeon_test_crtc->pll_id; +- } +- } +- } +- return ATOM_PPLL_INVALID; +-} +- +-/** +- * radeon_atom_pick_pll - Allocate a PPLL for use by the crtc. +- * +- * @crtc: drm crtc +- * +- * Returns the PPLL (Pixel PLL) to be used by the crtc. For DP monitors +- * a single PPLL can be used for all DP crtcs/encoders. For non-DP +- * monitors a dedicated PPLL must be used. If a particular board has +- * an external DP PLL, return ATOM_PPLL_INVALID to skip PLL programming +- * as there is no need to program the PLL itself. If we are not able to +- * allocate a PLL, return ATOM_PPLL_INVALID to skip PLL programming to +- * avoid messing up an existing monitor. 
+- * +- * Asic specific PLL information +- * +- * DCE 6.1 +- * - PPLL2 is only available to UNIPHYA (both DP and non-DP) +- * - PPLL0, PPLL1 are available for UNIPHYB/C/D/E/F (both DP and non-DP) +- * +- * DCE 6.0 +- * - PPLL0 is available to all UNIPHY (DP only) +- * - PPLL1, PPLL2 are available for all UNIPHY (both DP and non-DP) and DAC +- * +- * DCE 5.0 +- * - DCPLL is available to all UNIPHY (DP only) +- * - PPLL1, PPLL2 are available for all UNIPHY (both DP and non-DP) and DAC +- * +- * DCE 3.0/4.0/4.1 +- * - PPLL1, PPLL2 are available for all UNIPHY (both DP and non-DP) and DAC +- * +- */ + static int radeon_atom_pick_pll(struct drm_crtc *crtc) + { + struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc); + struct drm_device *dev = crtc->dev; + struct radeon_device *rdev = dev->dev_private; + struct drm_encoder *test_encoder; +- u32 pll_in_use; +- int pll; ++ struct drm_crtc *test_crtc; ++ uint32_t pll_in_use = 0; + + if (ASIC_IS_DCE4(rdev)) { + list_for_each_entry(test_encoder, &dev->mode_config.encoder_list, head) { +@@ -1545,7 +1461,7 @@ static int radeon_atom_pick_pll(struct drm_crtc *crtc) + /* in DP mode, the DP ref clock can come from PPLL, DCPLL, or ext clock, + * depending on the asic: + * DCE4: PPLL or ext clock +- * DCE5: PPLL, DCPLL, or ext clock ++ * DCE5: DCPLL or ext clock + * + * Setting ATOM_PPLL_INVALID will cause SetPixelClock to skip + * PPLL/DCPLL programming and only program the DP DTO for the +@@ -1553,31 +1469,29 @@ static int radeon_atom_pick_pll(struct drm_crtc *crtc) + */ + if (ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_encoder))) { + if (rdev->clock.dp_extclk) +- /* skip PPLL programming if using ext clock */ + return ATOM_PPLL_INVALID; + else if (ASIC_IS_DCE5(rdev)) +- /* use DCPLL for all DP */ + return ATOM_DCPLL; +- else { +- /* use the same PPLL for all DP monitors */ +- pll = radeon_get_shared_dp_ppll(crtc); +- if (pll != ATOM_PPLL_INVALID) +- return pll; +- } + } +- break; + } + } +- /* all other cases */ +- pll_in_use = radeon_get_pll_use_mask(crtc); +- if (!(pll_in_use & (1 << ATOM_PPLL2))) +- return ATOM_PPLL2; +- if (!(pll_in_use & (1 << ATOM_PPLL1))) ++ ++ /* otherwise, pick one of the plls */ ++ list_for_each_entry(test_crtc, &dev->mode_config.crtc_list, head) { ++ struct radeon_crtc *radeon_test_crtc; ++ ++ if (crtc == test_crtc) ++ continue; ++ ++ radeon_test_crtc = to_radeon_crtc(test_crtc); ++ if ((radeon_test_crtc->pll_id >= ATOM_PPLL1) && ++ (radeon_test_crtc->pll_id <= ATOM_PPLL2)) ++ pll_in_use |= (1 << radeon_test_crtc->pll_id); ++ } ++ if (!(pll_in_use & 1)) + return ATOM_PPLL1; +- DRM_ERROR("unable to allocate a PPLL\n"); +- return ATOM_PPLL_INVALID; ++ return ATOM_PPLL2; + } else +- /* use PPLL1 or PPLL2 */ + return radeon_crtc->crtc_id; + + } +@@ -1696,7 +1610,7 @@ static void atombios_crtc_disable(struct drm_crtc *crtc) + break; + } + done: +- radeon_crtc->pll_id = ATOM_PPLL_INVALID; ++ radeon_crtc->pll_id = -1; + } + + static const struct drm_crtc_helper_funcs atombios_helper_funcs = { +@@ -1745,6 +1659,6 @@ void radeon_atombios_init_crtc(struct drm_device *dev, + else + radeon_crtc->crtc_offset = 0; + } +- radeon_crtc->pll_id = ATOM_PPLL_INVALID; ++ radeon_crtc->pll_id = -1; + drm_crtc_helper_add(&radeon_crtc->base, &atombios_helper_funcs); + } +diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c +index 0c8bea9..a21e763 100644 +--- a/drivers/hid/hid-core.c ++++ b/drivers/hid/hid-core.c +@@ -1026,7 +1026,7 @@ static struct hid_report *hid_get_report(struct hid_report_enum *report_enum, + return report; + } + 
+-void hid_report_raw_event(struct hid_device *hid, int type, u8 *data, int size, ++int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, int size, + int interrupt) + { + struct hid_report_enum *report_enum = hid->report_enum + type; +@@ -1034,10 +1034,11 @@ void hid_report_raw_event(struct hid_device *hid, int type, u8 *data, int size, + unsigned int a; + int rsize, csize = size; + u8 *cdata = data; ++ int ret = 0; + + report = hid_get_report(report_enum, data); + if (!report) +- return; ++ goto out; + + if (report_enum->numbered) { + cdata++; +@@ -1057,14 +1058,19 @@ void hid_report_raw_event(struct hid_device *hid, int type, u8 *data, int size, + + if ((hid->claimed & HID_CLAIMED_HIDDEV) && hid->hiddev_report_event) + hid->hiddev_report_event(hid, report); +- if (hid->claimed & HID_CLAIMED_HIDRAW) +- hidraw_report_event(hid, data, size); ++ if (hid->claimed & HID_CLAIMED_HIDRAW) { ++ ret = hidraw_report_event(hid, data, size); ++ if (ret) ++ goto out; ++ } + + for (a = 0; a < report->maxfield; a++) + hid_input_field(hid, report->field[a], cdata, interrupt); + + if (hid->claimed & HID_CLAIMED_INPUT) + hidinput_report_event(hid, report); ++out: ++ return ret; + } + EXPORT_SYMBOL_GPL(hid_report_raw_event); + +@@ -1141,7 +1147,7 @@ nomem: + } + } + +- hid_report_raw_event(hid, type, data, size, interrupt); ++ ret = hid_report_raw_event(hid, type, data, size, interrupt); + + unlock: + up(&hid->driver_lock); +diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c +index 2eac8c5..8821ecc 100644 +--- a/drivers/hid/hid-logitech-dj.c ++++ b/drivers/hid/hid-logitech-dj.c +@@ -185,6 +185,7 @@ static struct hid_ll_driver logi_dj_ll_driver; + static int logi_dj_output_hidraw_report(struct hid_device *hid, u8 * buf, + size_t count, + unsigned char report_type); ++static int logi_dj_recv_query_paired_devices(struct dj_receiver_dev *djrcv_dev); + + static void logi_dj_recv_destroy_djhid_device(struct dj_receiver_dev *djrcv_dev, + struct dj_report *dj_report) +@@ -225,6 +226,7 @@ static void logi_dj_recv_add_djhid_device(struct dj_receiver_dev *djrcv_dev, + if (dj_report->report_params[DEVICE_PAIRED_PARAM_SPFUNCTION] & + SPFUNCTION_DEVICE_LIST_EMPTY) { + dbg_hid("%s: device list is empty\n", __func__); ++ djrcv_dev->querying_devices = false; + return; + } + +@@ -235,6 +237,12 @@ static void logi_dj_recv_add_djhid_device(struct dj_receiver_dev *djrcv_dev, + return; + } + ++ if (djrcv_dev->paired_dj_devices[dj_report->device_index]) { ++ /* The device is already known. No need to reallocate it. */ ++ dbg_hid("%s: device is already known\n", __func__); ++ return; ++ } ++ + dj_hiddev = hid_allocate_device(); + if (IS_ERR(dj_hiddev)) { + dev_err(&djrcv_hdev->dev, "%s: hid_allocate_device failed\n", +@@ -298,6 +306,7 @@ static void delayedwork_callback(struct work_struct *work) + struct dj_report dj_report; + unsigned long flags; + int count; ++ int retval; + + dbg_hid("%s\n", __func__); + +@@ -330,6 +339,25 @@ static void delayedwork_callback(struct work_struct *work) + logi_dj_recv_destroy_djhid_device(djrcv_dev, &dj_report); + break; + default: ++ /* A normal report (i. e. not belonging to a pair/unpair notification) ++ * arriving here, means that the report arrived but we did not have a ++ * paired dj_device associated to the report's device_index, this ++ * means that the original "device paired" notification corresponding ++ * to this dj_device never arrived to this driver. 
The reason is that ++ * hid-core discards all packets coming from a device while probe() is ++ * executing. */ ++ if (!djrcv_dev->paired_dj_devices[dj_report.device_index]) { ++ /* ok, we don't know the device, just re-ask the ++ * receiver for the list of connected devices. */ ++ retval = logi_dj_recv_query_paired_devices(djrcv_dev); ++ if (!retval) { ++ /* everything went fine, so just leave */ ++ break; ++ } ++ dev_err(&djrcv_dev->hdev->dev, ++ "%s:logi_dj_recv_query_paired_devices " ++ "error:%d\n", __func__, retval); ++ } + dbg_hid("%s: unexpected report type\n", __func__); + } + } +@@ -360,6 +388,12 @@ static void logi_dj_recv_forward_null_report(struct dj_receiver_dev *djrcv_dev, + if (!djdev) { + dbg_hid("djrcv_dev->paired_dj_devices[dj_report->device_index]" + " is NULL, index %d\n", dj_report->device_index); ++ kfifo_in(&djrcv_dev->notif_fifo, dj_report, sizeof(struct dj_report)); ++ ++ if (schedule_work(&djrcv_dev->work) == 0) { ++ dbg_hid("%s: did not schedule the work item, was already " ++ "queued\n", __func__); ++ } + return; + } + +@@ -390,6 +424,12 @@ static void logi_dj_recv_forward_report(struct dj_receiver_dev *djrcv_dev, + if (dj_device == NULL) { + dbg_hid("djrcv_dev->paired_dj_devices[dj_report->device_index]" + " is NULL, index %d\n", dj_report->device_index); ++ kfifo_in(&djrcv_dev->notif_fifo, dj_report, sizeof(struct dj_report)); ++ ++ if (schedule_work(&djrcv_dev->work) == 0) { ++ dbg_hid("%s: did not schedule the work item, was already " ++ "queued\n", __func__); ++ } + return; + } + +@@ -428,27 +468,42 @@ static int logi_dj_recv_send_report(struct dj_receiver_dev *djrcv_dev, + + static int logi_dj_recv_query_paired_devices(struct dj_receiver_dev *djrcv_dev) + { +- struct dj_report dj_report; ++ struct dj_report *dj_report; ++ int retval; ++ ++ /* no need to protect djrcv_dev->querying_devices */ ++ if (djrcv_dev->querying_devices) ++ return 0; + +- memset(&dj_report, 0, sizeof(dj_report)); +- dj_report.report_id = REPORT_ID_DJ_SHORT; +- dj_report.device_index = 0xFF; +- dj_report.report_type = REPORT_TYPE_CMD_GET_PAIRED_DEVICES; +- return logi_dj_recv_send_report(djrcv_dev, &dj_report); ++ dj_report = kzalloc(sizeof(struct dj_report), GFP_KERNEL); ++ if (!dj_report) ++ return -ENOMEM; ++ dj_report->report_id = REPORT_ID_DJ_SHORT; ++ dj_report->device_index = 0xFF; ++ dj_report->report_type = REPORT_TYPE_CMD_GET_PAIRED_DEVICES; ++ retval = logi_dj_recv_send_report(djrcv_dev, dj_report); ++ kfree(dj_report); ++ return retval; + } + ++ + static int logi_dj_recv_switch_to_dj_mode(struct dj_receiver_dev *djrcv_dev, + unsigned timeout) + { +- struct dj_report dj_report; ++ struct dj_report *dj_report; ++ int retval; + +- memset(&dj_report, 0, sizeof(dj_report)); +- dj_report.report_id = REPORT_ID_DJ_SHORT; +- dj_report.device_index = 0xFF; +- dj_report.report_type = REPORT_TYPE_CMD_SWITCH; +- dj_report.report_params[CMD_SWITCH_PARAM_DEVBITFIELD] = 0x1F; +- dj_report.report_params[CMD_SWITCH_PARAM_TIMEOUT_SECONDS] = (u8)timeout; +- return logi_dj_recv_send_report(djrcv_dev, &dj_report); ++ dj_report = kzalloc(sizeof(struct dj_report), GFP_KERNEL); ++ if (!dj_report) ++ return -ENOMEM; ++ dj_report->report_id = REPORT_ID_DJ_SHORT; ++ dj_report->device_index = 0xFF; ++ dj_report->report_type = REPORT_TYPE_CMD_SWITCH; ++ dj_report->report_params[CMD_SWITCH_PARAM_DEVBITFIELD] = 0x3F; ++ dj_report->report_params[CMD_SWITCH_PARAM_TIMEOUT_SECONDS] = (u8)timeout; ++ retval = logi_dj_recv_send_report(djrcv_dev, dj_report); ++ kfree(dj_report); ++ return retval; + } + + +diff 
--git a/drivers/hid/hid-logitech-dj.h b/drivers/hid/hid-logitech-dj.h +index fd28a5e..4a40003 100644 +--- a/drivers/hid/hid-logitech-dj.h ++++ b/drivers/hid/hid-logitech-dj.h +@@ -101,6 +101,7 @@ struct dj_receiver_dev { + struct work_struct work; + struct kfifo notif_fifo; + spinlock_t lock; ++ bool querying_devices; + }; + + struct dj_device { +diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c +index cf7d6d5..17d15bb 100644 +--- a/drivers/hid/hidraw.c ++++ b/drivers/hid/hidraw.c +@@ -42,6 +42,7 @@ static struct cdev hidraw_cdev; + static struct class *hidraw_class; + static struct hidraw *hidraw_table[HIDRAW_MAX_DEVICES]; + static DEFINE_MUTEX(minors_lock); ++static void drop_ref(struct hidraw *hid, int exists_bit); + + static ssize_t hidraw_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) + { +@@ -87,13 +88,16 @@ static ssize_t hidraw_read(struct file *file, char __user *buffer, size_t count, + len = list->buffer[list->tail].len > count ? + count : list->buffer[list->tail].len; + +- if (copy_to_user(buffer, list->buffer[list->tail].value, len)) { +- ret = -EFAULT; +- goto out; ++ if (list->buffer[list->tail].value) { ++ if (copy_to_user(buffer, list->buffer[list->tail].value, len)) { ++ ret = -EFAULT; ++ goto out; ++ } ++ ret = len; + } +- ret = len; + + kfree(list->buffer[list->tail].value); ++ list->buffer[list->tail].value = NULL; + list->tail = (list->tail + 1) & (HIDRAW_BUFFER_SIZE - 1); + } + out: +@@ -110,7 +114,7 @@ static ssize_t hidraw_send_report(struct file *file, const char __user *buffer, + __u8 *buf; + int ret = 0; + +- if (!hidraw_table[minor]) { ++ if (!hidraw_table[minor] || !hidraw_table[minor]->exist) { + ret = -ENODEV; + goto out; + } +@@ -258,7 +262,7 @@ static int hidraw_open(struct inode *inode, struct file *file) + } + + mutex_lock(&minors_lock); +- if (!hidraw_table[minor]) { ++ if (!hidraw_table[minor] || !hidraw_table[minor]->exist) { + err = -ENODEV; + goto out_unlock; + } +@@ -295,32 +299,12 @@ out: + static int hidraw_release(struct inode * inode, struct file * file) + { + unsigned int minor = iminor(inode); +- struct hidraw *dev; + struct hidraw_list *list = file->private_data; +- int ret; +- +- mutex_lock(&minors_lock); +- if (!hidraw_table[minor]) { +- ret = -ENODEV; +- goto unlock; +- } + ++ drop_ref(hidraw_table[minor], 0); + list_del(&list->node); +- dev = hidraw_table[minor]; +- if (!--dev->open) { +- if (list->hidraw->exist) { +- hid_hw_power(dev->hid, PM_HINT_NORMAL); +- hid_hw_close(dev->hid); +- } else { +- kfree(list->hidraw); +- } +- } + kfree(list); +- ret = 0; +-unlock: +- mutex_unlock(&minors_lock); +- +- return ret; ++ return 0; + } + + static long hidraw_ioctl(struct file *file, unsigned int cmd, +@@ -437,19 +421,29 @@ static const struct file_operations hidraw_ops = { + .llseek = noop_llseek, + }; + +-void hidraw_report_event(struct hid_device *hid, u8 *data, int len) ++int hidraw_report_event(struct hid_device *hid, u8 *data, int len) + { + struct hidraw *dev = hid->hidraw; + struct hidraw_list *list; ++ int ret = 0; + + list_for_each_entry(list, &dev->list, node) { +- list->buffer[list->head].value = kmemdup(data, len, GFP_ATOMIC); ++ int new_head = (list->head + 1) & (HIDRAW_BUFFER_SIZE - 1); ++ ++ if (new_head == list->tail) ++ continue; ++ ++ if (!(list->buffer[list->head].value = kmemdup(data, len, GFP_ATOMIC))) { ++ ret = -ENOMEM; ++ break; ++ } + list->buffer[list->head].len = len; +- list->head = (list->head + 1) & (HIDRAW_BUFFER_SIZE - 1); ++ list->head = new_head; + kill_fasync(&list->fasync, 
SIGIO, POLL_IN); + } + + wake_up_interruptible(&dev->wait); ++ return ret; + } + EXPORT_SYMBOL_GPL(hidraw_report_event); + +@@ -512,21 +506,7 @@ EXPORT_SYMBOL_GPL(hidraw_connect); + void hidraw_disconnect(struct hid_device *hid) + { + struct hidraw *hidraw = hid->hidraw; +- +- mutex_lock(&minors_lock); +- hidraw->exist = 0; +- +- device_destroy(hidraw_class, MKDEV(hidraw_major, hidraw->minor)); +- +- hidraw_table[hidraw->minor] = NULL; +- +- if (hidraw->open) { +- hid_hw_close(hid); +- wake_up_interruptible(&hidraw->wait); +- } else { +- kfree(hidraw); +- } +- mutex_unlock(&minors_lock); ++ drop_ref(hidraw, 1); + } + EXPORT_SYMBOL_GPL(hidraw_disconnect); + +@@ -542,21 +522,28 @@ int __init hidraw_init(void) + + if (result < 0) { + pr_warn("can't get major number\n"); +- result = 0; + goto out; + } + + hidraw_class = class_create(THIS_MODULE, "hidraw"); + if (IS_ERR(hidraw_class)) { + result = PTR_ERR(hidraw_class); +- unregister_chrdev(hidraw_major, "hidraw"); +- goto out; ++ goto error_cdev; + } + + cdev_init(&hidraw_cdev, &hidraw_ops); +- cdev_add(&hidraw_cdev, dev_id, HIDRAW_MAX_DEVICES); ++ result = cdev_add(&hidraw_cdev, dev_id, HIDRAW_MAX_DEVICES); ++ if (result < 0) ++ goto error_class; ++ + out: + return result; ++ ++error_class: ++ class_destroy(hidraw_class); ++error_cdev: ++ unregister_chrdev_region(dev_id, HIDRAW_MAX_DEVICES); ++ goto out; + } + + void hidraw_exit(void) +@@ -568,3 +555,23 @@ void hidraw_exit(void) + unregister_chrdev_region(dev_id, HIDRAW_MAX_DEVICES); + + } ++ ++static void drop_ref(struct hidraw *hidraw, int exists_bit) ++{ ++ mutex_lock(&minors_lock); ++ if (exists_bit) { ++ hid_hw_close(hidraw->hid); ++ hidraw->exist = 0; ++ if (hidraw->open) ++ wake_up_interruptible(&hidraw->wait); ++ } else { ++ --hidraw->open; ++ } ++ ++ if (!hidraw->open && !hidraw->exist) { ++ device_destroy(hidraw_class, MKDEV(hidraw_major, hidraw->minor)); ++ hidraw_table[hidraw->minor] = NULL; ++ kfree(hidraw); ++ } ++ mutex_unlock(&minors_lock); ++} +diff --git a/drivers/hwmon/ad7314.c b/drivers/hwmon/ad7314.c +index 5d760f3..08e2947 100644 +--- a/drivers/hwmon/ad7314.c ++++ b/drivers/hwmon/ad7314.c +@@ -96,10 +96,18 @@ static ssize_t ad7314_show_temperature(struct device *dev, + } + } + ++static ssize_t ad7314_show_name(struct device *dev, ++ struct device_attribute *devattr, char *buf) ++{ ++ return sprintf(buf, "%s\n", to_spi_device(dev)->modalias); ++} ++ ++static DEVICE_ATTR(name, S_IRUGO, ad7314_show_name, NULL); + static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, + ad7314_show_temperature, NULL, 0); + + static struct attribute *ad7314_attributes[] = { ++ &dev_attr_name.attr, + &sensor_dev_attr_temp1_input.dev_attr.attr, + NULL, + }; +diff --git a/drivers/hwmon/ads7871.c b/drivers/hwmon/ads7871.c +index 04450f8..685aae6 100644 +--- a/drivers/hwmon/ads7871.c ++++ b/drivers/hwmon/ads7871.c +@@ -133,6 +133,12 @@ static ssize_t show_voltage(struct device *dev, + } + } + ++static ssize_t ads7871_show_name(struct device *dev, ++ struct device_attribute *devattr, char *buf) ++{ ++ return sprintf(buf, "%s\n", to_spi_device(dev)->modalias); ++} ++ + static SENSOR_DEVICE_ATTR(in0_input, S_IRUGO, show_voltage, NULL, 0); + static SENSOR_DEVICE_ATTR(in1_input, S_IRUGO, show_voltage, NULL, 1); + static SENSOR_DEVICE_ATTR(in2_input, S_IRUGO, show_voltage, NULL, 2); +@@ -142,6 +148,8 @@ static SENSOR_DEVICE_ATTR(in5_input, S_IRUGO, show_voltage, NULL, 5); + static SENSOR_DEVICE_ATTR(in6_input, S_IRUGO, show_voltage, NULL, 6); + static SENSOR_DEVICE_ATTR(in7_input, S_IRUGO, show_voltage, NULL, 
7); + ++static DEVICE_ATTR(name, S_IRUGO, ads7871_show_name, NULL); ++ + static struct attribute *ads7871_attributes[] = { + &sensor_dev_attr_in0_input.dev_attr.attr, + &sensor_dev_attr_in1_input.dev_attr.attr, +@@ -151,6 +159,7 @@ static struct attribute *ads7871_attributes[] = { + &sensor_dev_attr_in5_input.dev_attr.attr, + &sensor_dev_attr_in6_input.dev_attr.attr, + &sensor_dev_attr_in7_input.dev_attr.attr, ++ &dev_attr_name.attr, + NULL + }; + +diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c +index e8e18ca..ac2d6cb 100644 +--- a/drivers/hwmon/fam15h_power.c ++++ b/drivers/hwmon/fam15h_power.c +@@ -128,12 +128,12 @@ static bool __devinit fam15h_power_is_internal_node0(struct pci_dev *f4) + * counter saturations resulting in bogus power readings. + * We correct this value ourselves to cope with older BIOSes. + */ +-static DEFINE_PCI_DEVICE_TABLE(affected_device) = { ++static const struct pci_device_id affected_device[] = { + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, + { 0 } + }; + +-static void __devinit tweak_runavg_range(struct pci_dev *pdev) ++static void tweak_runavg_range(struct pci_dev *pdev) + { + u32 val; + +@@ -157,6 +157,16 @@ static void __devinit tweak_runavg_range(struct pci_dev *pdev) + REG_TDP_RUNNING_AVERAGE, val); + } + ++#ifdef CONFIG_PM ++static int fam15h_power_resume(struct pci_dev *pdev) ++{ ++ tweak_runavg_range(pdev); ++ return 0; ++} ++#else ++#define fam15h_power_resume NULL ++#endif ++ + static void __devinit fam15h_power_init_data(struct pci_dev *f4, + struct fam15h_power_data *data) + { +@@ -255,6 +265,7 @@ static struct pci_driver fam15h_power_driver = { + .id_table = fam15h_power_id_table, + .probe = fam15h_power_probe, + .remove = __devexit_p(fam15h_power_remove), ++ .resume = fam15h_power_resume, + }; + + static int __init fam15h_power_init(void) +diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h +index d4ec371..cd1a843 100644 +--- a/drivers/input/serio/i8042-x86ia64io.h ++++ b/drivers/input/serio/i8042-x86ia64io.h +@@ -335,6 +335,12 @@ static const struct dmi_system_id __initconst i8042_dmi_nomux_table[] = { + }, + { + .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"), ++ DMI_MATCH(DMI_PRODUCT_NAME, "SATELLITE C850D"), ++ }, ++ }, ++ { ++ .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ALIENWARE"), + DMI_MATCH(DMI_PRODUCT_NAME, "Sentia"), + }, +diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c +index 6777ca0..73ca321 100644 +--- a/drivers/iommu/intr_remapping.c ++++ b/drivers/iommu/intr_remapping.c +@@ -752,6 +752,7 @@ int __init parse_ioapics_under_ir(void) + { + struct dmar_drhd_unit *drhd; + int ir_supported = 0; ++ int ioapic_idx; + + for_each_drhd_unit(drhd) { + struct intel_iommu *iommu = drhd->iommu; +@@ -764,13 +765,20 @@ int __init parse_ioapics_under_ir(void) + } + } + +- if (ir_supported && ir_ioapic_num != nr_ioapics) { +- printk(KERN_WARNING +- "Not all IO-APIC's listed under remapping hardware\n"); +- return -1; ++ if (!ir_supported) ++ return 0; ++ ++ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) { ++ int ioapic_id = mpc_ioapic_id(ioapic_idx); ++ if (!map_ioapic_to_ir(ioapic_id)) { ++ pr_err(FW_BUG "ioapic %d has no mapping iommu, " ++ "interrupt remapping will be disabled\n", ++ ioapic_id); ++ return -1; ++ } + } + +- return ir_supported; ++ return 1; + } + + int __init ir_dev_scope_init(void) +diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c +index 8e91321..52848ab 100644 +--- a/drivers/md/dm-table.c ++++ 
b/drivers/md/dm-table.c +@@ -1350,17 +1350,25 @@ static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, + return q && blk_queue_nonrot(q); + } + +-static bool dm_table_is_nonrot(struct dm_table *t) ++static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, ++ sector_t start, sector_t len, void *data) ++{ ++ struct request_queue *q = bdev_get_queue(dev->bdev); ++ ++ return q && !blk_queue_add_random(q); ++} ++ ++static bool dm_table_all_devices_attribute(struct dm_table *t, ++ iterate_devices_callout_fn func) + { + struct dm_target *ti; + unsigned i = 0; + +- /* Ensure that all underlying device are non-rotational. */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->type->iterate_devices || +- !ti->type->iterate_devices(ti, device_is_nonrot, NULL)) ++ !ti->type->iterate_devices(ti, func, NULL)) + return 0; + } + +@@ -1392,7 +1400,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + if (!dm_table_discard_zeroes_data(t)) + q->limits.discard_zeroes_data = 0; + +- if (dm_table_is_nonrot(t)) ++ /* Ensure that all underlying devices are non-rotational. */ ++ if (dm_table_all_devices_attribute(t, device_is_nonrot)) + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + else + queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q); +@@ -1400,6 +1409,15 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + dm_table_set_integrity(t); + + /* ++ * Determine whether or not this queue's I/O timings contribute ++ * to the entropy pool, Only request-based targets use this. ++ * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not ++ * have it set. ++ */ ++ if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) ++ queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); ++ ++ /* + * QUEUE_FLAG_STACKABLE must be set after all queue settings are + * visible to other CPUs because, once the flag is set, incoming bios + * are processed by request-based dm, which refers to the queue +diff --git a/drivers/md/dm.c b/drivers/md/dm.c +index 4720f68..502dcf7 100644 +--- a/drivers/md/dm.c ++++ b/drivers/md/dm.c +@@ -866,10 +866,14 @@ static void dm_done(struct request *clone, int error, bool mapped) + { + int r = error; + struct dm_rq_target_io *tio = clone->end_io_data; +- dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; ++ dm_request_endio_fn rq_end_io = NULL; + +- if (mapped && rq_end_io) +- r = rq_end_io(tio->ti, clone, error, &tio->info); ++ if (tio->ti) { ++ rq_end_io = tio->ti->type->rq_end_io; ++ ++ if (mapped && rq_end_io) ++ r = rq_end_io(tio->ti, clone, error, &tio->info); ++ } + + if (r <= 0) + /* The target wants to complete the I/O */ +@@ -1566,15 +1570,6 @@ static int map_request(struct dm_target *ti, struct request *clone, + int r, requeued = 0; + struct dm_rq_target_io *tio = clone->end_io_data; + +- /* +- * Hold the md reference here for the in-flight I/O. +- * We can't rely on the reference count by device opener, +- * because the device may be closed during the request completion +- * when all bios are completed. +- * See the comment in rq_completed() too. 
+- */ +- dm_get(md); +- + tio->ti = ti; + r = ti->type->map_rq(ti, clone, &tio->info); + switch (r) { +@@ -1606,6 +1601,26 @@ static int map_request(struct dm_target *ti, struct request *clone, + return requeued; + } + ++static struct request *dm_start_request(struct mapped_device *md, struct request *orig) ++{ ++ struct request *clone; ++ ++ blk_start_request(orig); ++ clone = orig->special; ++ atomic_inc(&md->pending[rq_data_dir(clone)]); ++ ++ /* ++ * Hold the md reference here for the in-flight I/O. ++ * We can't rely on the reference count by device opener, ++ * because the device may be closed during the request completion ++ * when all bios are completed. ++ * See the comment in rq_completed() too. ++ */ ++ dm_get(md); ++ ++ return clone; ++} ++ + /* + * q->request_fn for request-based dm. + * Called with the queue lock held. +@@ -1635,14 +1650,21 @@ static void dm_request_fn(struct request_queue *q) + pos = blk_rq_pos(rq); + + ti = dm_table_find_target(map, pos); +- BUG_ON(!dm_target_is_valid(ti)); ++ if (!dm_target_is_valid(ti)) { ++ /* ++ * Must perform setup, that dm_done() requires, ++ * before calling dm_kill_unmapped_request ++ */ ++ DMERR_LIMIT("request attempted access beyond the end of device"); ++ clone = dm_start_request(md, rq); ++ dm_kill_unmapped_request(clone, -EIO); ++ continue; ++ } + + if (ti->type->busy && ti->type->busy(ti)) + goto delay_and_out; + +- blk_start_request(rq); +- clone = rq->special; +- atomic_inc(&md->pending[rq_data_dir(clone)]); ++ clone = dm_start_request(md, rq); + + spin_unlock(q->queue_lock); + if (map_request(ti, clone, md)) +@@ -1662,8 +1684,6 @@ delay_and_out: + blk_delay_queue(q, HZ / 10); + out: + dm_table_put(map); +- +- return; + } + + int dm_underlying_device_busy(struct request_queue *q) +diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c +index 7a9eef6..0634ee5 100644 +--- a/drivers/md/raid10.c ++++ b/drivers/md/raid10.c +@@ -1226,14 +1226,16 @@ static int enough(struct r10conf *conf, int ignore) + do { + int n = conf->copies; + int cnt = 0; ++ int this = first; + while (n--) { +- if (conf->mirrors[first].rdev && +- first != ignore) ++ if (conf->mirrors[this].rdev && ++ this != ignore) + cnt++; +- first = (first+1) % conf->raid_disks; ++ this = (this+1) % conf->raid_disks; + } + if (cnt == 0) + return 0; ++ first = (first + conf->near_copies) % conf->raid_disks; + } while (first != 0); + return 1; + } +diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c +index 6ce32a7..aaeaff2 100644 +--- a/drivers/mmc/host/sdhci.c ++++ b/drivers/mmc/host/sdhci.c +@@ -2712,8 +2712,9 @@ int sdhci_add_host(struct sdhci_host *host) + mmc_card_is_removable(mmc)) + mmc->caps |= MMC_CAP_NEEDS_POLL; + +- /* UHS-I mode(s) supported by the host controller. */ +- if (host->version >= SDHCI_SPEC_300) ++ /* Any UHS-I mode in caps implies SDR12 and SDR25 support. 
*/ ++ if (caps[1] & (SDHCI_SUPPORT_SDR104 | SDHCI_SUPPORT_SDR50 | ++ SDHCI_SUPPORT_DDR50)) + mmc->caps |= MMC_CAP_UHS_SDR12 | MMC_CAP_UHS_SDR25; + + /* SDR104 supports also implies SDR50 support */ +diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c +index 32778d5..46194bc 100644 +--- a/drivers/net/can/janz-ican3.c ++++ b/drivers/net/can/janz-ican3.c +@@ -1250,7 +1250,6 @@ static irqreturn_t ican3_irq(int irq, void *dev_id) + */ + static int ican3_reset_module(struct ican3_dev *mod) + { +- u8 val = 1 << mod->num; + unsigned long start; + u8 runold, runnew; + +@@ -1264,8 +1263,7 @@ static int ican3_reset_module(struct ican3_dev *mod) + runold = ioread8(mod->dpm + TARGET_RUNNING); + + /* reset the module */ +- iowrite8(val, &mod->ctrl->reset_assert); +- iowrite8(val, &mod->ctrl->reset_deassert); ++ iowrite8(0x00, &mod->dpmctrl->hwreset); + + /* wait until the module has finished resetting and is running */ + start = jiffies; +diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c +index 2adc294..79c70ae 100644 +--- a/drivers/net/can/ti_hecc.c ++++ b/drivers/net/can/ti_hecc.c +@@ -971,12 +971,12 @@ static int __devexit ti_hecc_remove(struct platform_device *pdev) + struct net_device *ndev = platform_get_drvdata(pdev); + struct ti_hecc_priv *priv = netdev_priv(ndev); + ++ unregister_candev(ndev); + clk_disable(priv->clk); + clk_put(priv->clk); + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + iounmap(priv->base); + release_mem_region(res->start, resource_size(res)); +- unregister_candev(ndev); + free_candev(ndev); + platform_set_drvdata(pdev, NULL); + +diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +index 2c1a5c0..4c50ac0 100644 +--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c ++++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +@@ -554,14 +554,16 @@ static inline void bnx2x_set_skb_rxhash(struct bnx2x *bp, union eth_rx_cqe *cqe, + static void bnx2x_csum_validate(struct sk_buff *skb, union eth_rx_cqe *cqe, + struct bnx2x_fastpath *fp) + { +- /* Do nothing if no IP/L4 csum validation was done */ +- ++ /* Do nothing if no L4 csum validation was done. ++ * We do not check whether IP csum was validated. For IPv4 we assume ++ * that if the card got as far as validating the L4 csum, it also ++ * validated the IP csum. IPv6 has no IP csum. ++ */ + if (cqe->fast_path_cqe.status_flags & +- (ETH_FAST_PATH_RX_CQE_IP_XSUM_NO_VALIDATION_FLG | +- ETH_FAST_PATH_RX_CQE_L4_XSUM_NO_VALIDATION_FLG)) ++ ETH_FAST_PATH_RX_CQE_L4_XSUM_NO_VALIDATION_FLG) + return; + +- /* If both IP/L4 validation were done, check if an error was found. */ ++ /* If L4 validation was done, check if an error was found. */ + + if (cqe->fast_path_cqe.type_error_flags & + (ETH_FAST_PATH_RX_CQE_IP_BAD_XSUM_FLG | +diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c +index 6b258d9..01bc102 100644 +--- a/drivers/net/ethernet/broadcom/tg3.c ++++ b/drivers/net/ethernet/broadcom/tg3.c +@@ -14013,9 +14013,13 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) + if (tg3_flag(tp, HW_TSO_1) || + tg3_flag(tp, HW_TSO_2) || + tg3_flag(tp, HW_TSO_3) || +- (tp->fw_needed && !tg3_flag(tp, ENABLE_ASF))) ++ tp->fw_needed) { ++ /* For firmware TSO, assume ASF is disabled. ++ * We'll disable TSO later if we discover ASF ++ * is enabled in tg3_get_eeprom_hw_cfg(). 
++ */ + tg3_flag_set(tp, TSO_CAPABLE); +- else { ++ } else { + tg3_flag_clear(tp, TSO_CAPABLE); + tg3_flag_clear(tp, TSO_BUG); + tp->fw_needed = NULL; +@@ -14290,6 +14294,12 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) + */ + tg3_get_eeprom_hw_cfg(tp); + ++ if (tp->fw_needed && tg3_flag(tp, ENABLE_ASF)) { ++ tg3_flag_clear(tp, TSO_CAPABLE); ++ tg3_flag_clear(tp, TSO_BUG); ++ tp->fw_needed = NULL; ++ } ++ + if (tg3_flag(tp, ENABLE_APE)) { + /* Allow reads and writes to the + * APE register and memory space. +diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +index 8cf3173..da5204d 100644 +--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c ++++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +@@ -1351,6 +1351,10 @@ static void netxen_mask_aer_correctable(struct netxen_adapter *adapter) + struct pci_dev *root = pdev->bus->self; + u32 aer_pos; + ++ /* root bus? */ ++ if (!root) ++ return; ++ + if (adapter->ahw.board_type != NETXEN_BRDTYPE_P3_4_GB_MM && + adapter->ahw.board_type != NETXEN_BRDTYPE_P3_10G_TP) + return; +diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c +index c97d2f5..bfc3b0d 100644 +--- a/drivers/net/ethernet/ti/davinci_cpdma.c ++++ b/drivers/net/ethernet/ti/davinci_cpdma.c +@@ -851,6 +851,7 @@ int cpdma_chan_stop(struct cpdma_chan *chan) + + next_dma = desc_read(desc, hw_next); + chan->head = desc_from_phys(pool, next_dma); ++ chan->count--; + chan->stats.teardown_dequeue++; + + /* issue callback without locks held */ +diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c +index bc9a4bb..1161584 100644 +--- a/drivers/net/ppp/pppoe.c ++++ b/drivers/net/ppp/pppoe.c +@@ -576,7 +576,7 @@ static int pppoe_release(struct socket *sock) + + po = pppox_sk(sk); + +- if (sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)) { ++ if (sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND | PPPOX_ZOMBIE)) { + dev_put(po->pppoe_dev); + po->pppoe_dev = NULL; + } +diff --git a/drivers/net/usb/asix.c b/drivers/net/usb/asix.c +index fc147a5..6729585 100644 +--- a/drivers/net/usb/asix.c ++++ b/drivers/net/usb/asix.c +@@ -1648,6 +1648,10 @@ static const struct usb_device_id products [] = { + USB_DEVICE (0x2001, 0x3c05), + .driver_info = (unsigned long) &ax88772_info, + }, { ++ // DLink DUB-E100 H/W Ver C1 ++ USB_DEVICE (0x2001, 0x1a02), ++ .driver_info = (unsigned long) &ax88772_info, ++}, { + // Linksys USB1000 + USB_DEVICE (0x1737, 0x0039), + .driver_info = (unsigned long) &ax88178_info, +diff --git a/drivers/net/usb/sierra_net.c b/drivers/net/usb/sierra_net.c +index 864448b..e773250 100644 +--- a/drivers/net/usb/sierra_net.c ++++ b/drivers/net/usb/sierra_net.c +@@ -678,7 +678,7 @@ static int sierra_net_get_fw_attr(struct usbnet *dev, u16 *datap) + return -EIO; + } + +- *datap = *attrdata; ++ *datap = le16_to_cpu(*attrdata); + + kfree(attrdata); + return result; +diff --git a/drivers/net/wan/ixp4xx_hss.c b/drivers/net/wan/ixp4xx_hss.c +index aaaca9a..3f575af 100644 +--- a/drivers/net/wan/ixp4xx_hss.c ++++ b/drivers/net/wan/ixp4xx_hss.c +@@ -10,6 +10,7 @@ + + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + ++#include + #include + #include + #include +diff --git a/drivers/net/wireless/brcm80211/brcmfmac/dhd_common.c b/drivers/net/wireless/brcm80211/brcmfmac/dhd_common.c +index 8918261..746202f 100644 +--- a/drivers/net/wireless/brcm80211/brcmfmac/dhd_common.c ++++ b/drivers/net/wireless/brcm80211/brcmfmac/dhd_common.c +@@ -775,8 +775,11 @@ static void 
brcmf_c_arp_offload_set(struct brcmf_pub *drvr, int arp_mode) + { + char iovbuf[32]; + int retcode; ++ __le32 arp_mode_le; + +- brcmf_c_mkiovar("arp_ol", (char *)&arp_mode, 4, iovbuf, sizeof(iovbuf)); ++ arp_mode_le = cpu_to_le32(arp_mode); ++ brcmf_c_mkiovar("arp_ol", (char *)&arp_mode_le, 4, iovbuf, ++ sizeof(iovbuf)); + retcode = brcmf_proto_cdc_set_dcmd(drvr, 0, BRCMF_C_SET_VAR, + iovbuf, sizeof(iovbuf)); + retcode = retcode >= 0 ? 0 : retcode; +@@ -792,8 +795,11 @@ static void brcmf_c_arp_offload_enable(struct brcmf_pub *drvr, int arp_enable) + { + char iovbuf[32]; + int retcode; ++ __le32 arp_enable_le; + +- brcmf_c_mkiovar("arpoe", (char *)&arp_enable, 4, ++ arp_enable_le = cpu_to_le32(arp_enable); ++ ++ brcmf_c_mkiovar("arpoe", (char *)&arp_enable_le, 4, + iovbuf, sizeof(iovbuf)); + retcode = brcmf_proto_cdc_set_dcmd(drvr, 0, BRCMF_C_SET_VAR, + iovbuf, sizeof(iovbuf)); +@@ -814,10 +820,10 @@ int brcmf_c_preinit_dcmds(struct brcmf_pub *drvr) + char buf[128], *ptr; + u32 dongle_align = BRCMF_SDALIGN; + u32 glom = 0; +- u32 roaming = 1; +- uint bcn_timeout = 3; +- int scan_assoc_time = 40; +- int scan_unassoc_time = 40; ++ __le32 roaming_le = cpu_to_le32(1); ++ __le32 bcn_timeout_le = cpu_to_le32(3); ++ __le32 scan_assoc_time_le = cpu_to_le32(40); ++ __le32 scan_unassoc_time_le = cpu_to_le32(40); + int i; + + brcmf_os_proto_block(drvr); +@@ -852,14 +858,14 @@ int brcmf_c_preinit_dcmds(struct brcmf_pub *drvr) + + /* Setup timeout if Beacons are lost and roam is off to report + link down */ +- brcmf_c_mkiovar("bcn_timeout", (char *)&bcn_timeout, 4, iovbuf, ++ brcmf_c_mkiovar("bcn_timeout", (char *)&bcn_timeout_le, 4, iovbuf, + sizeof(iovbuf)); + brcmf_proto_cdc_set_dcmd(drvr, 0, BRCMF_C_SET_VAR, iovbuf, + sizeof(iovbuf)); + + /* Enable/Disable build-in roaming to allowed ext supplicant to take + of romaing */ +- brcmf_c_mkiovar("roam_off", (char *)&roaming, 4, ++ brcmf_c_mkiovar("roam_off", (char *)&roaming_le, 4, + iovbuf, sizeof(iovbuf)); + brcmf_proto_cdc_set_dcmd(drvr, 0, BRCMF_C_SET_VAR, iovbuf, + sizeof(iovbuf)); +@@ -874,9 +880,9 @@ int brcmf_c_preinit_dcmds(struct brcmf_pub *drvr) + sizeof(iovbuf)); + + brcmf_proto_cdc_set_dcmd(drvr, 0, BRCMF_C_SET_SCAN_CHANNEL_TIME, +- (char *)&scan_assoc_time, sizeof(scan_assoc_time)); ++ (char *)&scan_assoc_time_le, sizeof(scan_assoc_time_le)); + brcmf_proto_cdc_set_dcmd(drvr, 0, BRCMF_C_SET_SCAN_UNASSOC_TIME, +- (char *)&scan_unassoc_time, sizeof(scan_unassoc_time)); ++ (char *)&scan_unassoc_time_le, sizeof(scan_unassoc_time_le)); + + /* Set and enable ARP offload feature */ + brcmf_c_arp_offload_set(drvr, BRCMF_ARPOL_MODE); +diff --git a/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c b/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c +index 5eddabe..e4e326a 100644 +--- a/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c ++++ b/drivers/net/wireless/brcm80211/brcmfmac/wl_cfg80211.c +@@ -498,8 +498,10 @@ static void wl_iscan_prep(struct brcmf_scan_params_le *params_le, + params_le->active_time = cpu_to_le32(-1); + params_le->passive_time = cpu_to_le32(-1); + params_le->home_time = cpu_to_le32(-1); +- if (ssid && ssid->SSID_len) +- memcpy(¶ms_le->ssid_le, ssid, sizeof(struct brcmf_ssid)); ++ if (ssid && ssid->SSID_len) { ++ params_le->ssid_le.SSID_len = cpu_to_le32(ssid->SSID_len); ++ memcpy(¶ms_le->ssid_le.SSID, ssid->SSID, ssid->SSID_len); ++ } + } + + static s32 +diff --git a/drivers/net/wireless/rtlwifi/rtl8192ce/def.h b/drivers/net/wireless/rtlwifi/rtl8192ce/def.h +index 9fc804d..7305a47 100644 +--- 
a/drivers/net/wireless/rtlwifi/rtl8192ce/def.h ++++ b/drivers/net/wireless/rtlwifi/rtl8192ce/def.h +@@ -117,6 +117,7 @@ + + #define CHIP_VER_B BIT(4) + #define CHIP_92C_BITMASK BIT(0) ++#define CHIP_UNKNOWN BIT(7) + #define CHIP_92C_1T2R 0x03 + #define CHIP_92C 0x01 + #define CHIP_88C 0x00 +diff --git a/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c b/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c +index a3deaef..cb480d8 100644 +--- a/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c ++++ b/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c +@@ -1001,8 +1001,16 @@ static enum version_8192c _rtl92ce_read_chip_version(struct ieee80211_hw *hw) + version = (value32 & TYPE_ID) ? VERSION_A_CHIP_92C : + VERSION_A_CHIP_88C; + } else { +- version = (value32 & TYPE_ID) ? VERSION_B_CHIP_92C : +- VERSION_B_CHIP_88C; ++ version = (enum version_8192c) (CHIP_VER_B | ++ ((value32 & TYPE_ID) ? CHIP_92C_BITMASK : 0) | ++ ((value32 & VENDOR_ID) ? CHIP_VENDOR_UMC : 0)); ++ if ((!IS_CHIP_VENDOR_UMC(version)) && (value32 & ++ CHIP_VER_RTL_MASK)) { ++ version = (enum version_8192c)(version | ++ ((((value32 & CHIP_VER_RTL_MASK) == BIT(12)) ++ ? CHIP_VENDOR_UMC_B_CUT : CHIP_UNKNOWN) | ++ CHIP_VENDOR_UMC)); ++ } + } + + switch (version) { +diff --git a/drivers/net/wireless/rtlwifi/rtl8192ce/sw.c b/drivers/net/wireless/rtlwifi/rtl8192ce/sw.c +index f2aa33d..df852e8 100644 +--- a/drivers/net/wireless/rtlwifi/rtl8192ce/sw.c ++++ b/drivers/net/wireless/rtlwifi/rtl8192ce/sw.c +@@ -165,12 +165,14 @@ int rtl92c_init_sw_vars(struct ieee80211_hw *hw) + + /* request fw */ + if (IS_VENDOR_UMC_A_CUT(rtlhal->version) && +- !IS_92C_SERIAL(rtlhal->version)) ++ !IS_92C_SERIAL(rtlhal->version)) { + fw_name = "rtlwifi/rtl8192cfwU.bin"; +- else if (IS_81xxC_VENDOR_UMC_B_CUT(rtlhal->version)) ++ } else if (IS_81xxC_VENDOR_UMC_B_CUT(rtlhal->version)) { + fw_name = "rtlwifi/rtl8192cfwU_B.bin"; +- else ++ pr_info("****** This B_CUT device may not work with kernels 3.6 and earlier\n"); ++ } else { + fw_name = rtlpriv->cfg->fw_name; ++ } + err = request_firmware(&firmware, fw_name, rtlpriv->io.dev); + if (err) { + RT_TRACE(rtlpriv, COMP_ERR, DBG_EMERG, +diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c +index 9ddf69e..74d38ca 100644 +--- a/drivers/pci/hotplug/acpiphp_glue.c ++++ b/drivers/pci/hotplug/acpiphp_glue.c +@@ -132,6 +132,15 @@ register_slot(acpi_handle handle, u32 lvl, void *context, void **rv) + if (!acpi_pci_check_ejectable(pbus, handle) && !is_dock_device(handle)) + return AE_OK; + ++ status = acpi_evaluate_integer(handle, "_ADR", NULL, &adr); ++ if (ACPI_FAILURE(status)) { ++ warn("can't evaluate _ADR (%#x)\n", status); ++ return AE_OK; ++ } ++ ++ device = (adr >> 16) & 0xffff; ++ function = adr & 0xffff; ++ + pdev = pbus->self; + if (pdev && pci_is_pcie(pdev)) { + tmp = acpi_find_root_bridge_handle(pdev); +@@ -144,10 +153,6 @@ register_slot(acpi_handle handle, u32 lvl, void *context, void **rv) + } + } + +- acpi_evaluate_integer(handle, "_ADR", NULL, &adr); +- device = (adr >> 16) & 0xffff; +- function = adr & 0xffff; +- + newfunc = kzalloc(sizeof(struct acpiphp_func), GFP_KERNEL); + if (!newfunc) + return AE_NO_MEMORY; +diff --git a/drivers/platform/x86/asus-laptop.c b/drivers/platform/x86/asus-laptop.c +index edaccad..f75a4c8 100644 +--- a/drivers/platform/x86/asus-laptop.c ++++ b/drivers/platform/x86/asus-laptop.c +@@ -823,9 +823,9 @@ static ssize_t show_infos(struct device *dev, + * The significance of others is yet to be found. + * If we don't find the method, we assume the device are present. 
+ */ +- rv = acpi_evaluate_integer(asus->handle, "HRWS", NULL, &temp); ++ rv = acpi_evaluate_integer(asus->handle, "HWRS", NULL, &temp); + if (!ACPI_FAILURE(rv)) +- len += sprintf(page + len, "HRWS value : %#x\n", ++ len += sprintf(page + len, "HWRS value : %#x\n", + (uint) temp); + /* + * Another value for userspace: the ASYM method returns 0x02 for +@@ -1660,9 +1660,9 @@ static int asus_laptop_get_info(struct asus_laptop *asus) + * The significance of others is yet to be found. + */ + status = +- acpi_evaluate_integer(asus->handle, "HRWS", NULL, &hwrs_result); ++ acpi_evaluate_integer(asus->handle, "HWRS", NULL, &hwrs_result); + if (!ACPI_FAILURE(status)) +- pr_notice(" HRWS returned %x", (int)hwrs_result); ++ pr_notice(" HWRS returned %x", (int)hwrs_result); + + if (!acpi_check_handle(asus->handle, METHOD_WL_STATUS, NULL)) + asus->have_rsts = true; +diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c +index 20687d5..a3e98f1 100644 +--- a/drivers/rtc/rtc-twl.c ++++ b/drivers/rtc/rtc-twl.c +@@ -462,6 +462,11 @@ static int __devinit twl_rtc_probe(struct platform_device *pdev) + goto out1; + } + ++ /* ensure interrupts are disabled, bootloaders can be strange */ ++ ret = twl_rtc_write_u8(0, REG_RTC_INTERRUPTS_REG); ++ if (ret < 0) ++ dev_warn(&pdev->dev, "unable to disable interrupt\n"); ++ + /* init cached IRQ enable bits */ + ret = twl_rtc_read_u8(&rtc_irq_bits, REG_RTC_INTERRUPTS_REG); + if (ret < 0) +diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c +index 1ad0b82..1069974 100644 +--- a/drivers/scsi/bnx2i/bnx2i_hwi.c ++++ b/drivers/scsi/bnx2i/bnx2i_hwi.c +@@ -1264,6 +1264,9 @@ int bnx2i_send_fw_iscsi_init_msg(struct bnx2i_hba *hba) + int rc = 0; + u64 mask64; + ++ memset(&iscsi_init, 0x00, sizeof(struct iscsi_kwqe_init1)); ++ memset(&iscsi_init2, 0x00, sizeof(struct iscsi_kwqe_init2)); ++ + bnx2i_adjust_qp_size(hba); + + iscsi_init.flags = +diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c +index b4d2c86..be9aad8 100644 +--- a/drivers/scsi/hpsa.c ++++ b/drivers/scsi/hpsa.c +@@ -1213,8 +1213,9 @@ static void complete_scsi_command(struct CommandList *cp) + } + break; + case CMD_PROTOCOL_ERR: ++ cmd->result = DID_ERROR << 16; + dev_warn(&h->pdev->dev, "cp %p has " +- "protocol error \n", cp); ++ "protocol error\n", cp); + break; + case CMD_HARDWARE_ERR: + cmd->result = DID_ERROR << 16; +diff --git a/drivers/scsi/mpt2sas/mpt2sas_base.c b/drivers/scsi/mpt2sas/mpt2sas_base.c +index 98cb5e6..17de348 100644 +--- a/drivers/scsi/mpt2sas/mpt2sas_base.c ++++ b/drivers/scsi/mpt2sas/mpt2sas_base.c +@@ -1156,6 +1156,13 @@ _base_check_enable_msix(struct MPT2SAS_ADAPTER *ioc) + u16 message_control; + + ++ /* Check whether controller SAS2008 B0 controller, ++ if it is SAS2008 B0 controller use IO-APIC instead of MSIX */ ++ if (ioc->pdev->device == MPI2_MFGPAGE_DEVID_SAS2008 && ++ ioc->pdev->revision == 0x01) { ++ return -EINVAL; ++ } ++ + base = pci_find_capability(ioc->pdev, PCI_CAP_ID_MSIX); + if (!base) { + dfailprintk(ioc, printk(MPT2SAS_INFO_FMT "msix not " +diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c +index 597fb9b..34d114a 100644 +--- a/drivers/target/target_core_transport.c ++++ b/drivers/target/target_core_transport.c +@@ -3039,15 +3039,20 @@ static int transport_generic_cmd_sequencer( + /* Returns CHECK_CONDITION + INVALID_CDB_FIELD */ + goto out_invalid_cdb_field; + } +- ++ /* ++ * For the overflow case keep the existing fabric provided ++ * ->data_length. 
Otherwise for the underflow case, reset ++ * ->data_length to the smaller SCSI expected data transfer ++ * length. ++ */ + if (size > cmd->data_length) { + cmd->se_cmd_flags |= SCF_OVERFLOW_BIT; + cmd->residual_count = (size - cmd->data_length); + } else { + cmd->se_cmd_flags |= SCF_UNDERFLOW_BIT; + cmd->residual_count = (cmd->data_length - size); ++ cmd->data_length = size; + } +- cmd->data_length = size; + } + + /* reject any command that we don't have a handler for */ +diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c +index 08b92a6..8d70fbc 100644 +--- a/drivers/tty/serial/pch_uart.c ++++ b/drivers/tty/serial/pch_uart.c +@@ -236,6 +236,9 @@ struct eg20t_port { + int tx_dma_use; + void *rx_buf_virt; + dma_addr_t rx_buf_dma; ++ ++ /* protect the eg20t_port private structure and io access to membase */ ++ spinlock_t lock; + }; + + /** +@@ -964,7 +967,7 @@ static irqreturn_t pch_uart_interrupt(int irq, void *dev_id) + unsigned int iid; + unsigned long flags; + +- spin_lock_irqsave(&priv->port.lock, flags); ++ spin_lock_irqsave(&priv->lock, flags); + handled = 0; + while ((iid = pch_uart_hal_get_iid(priv)) > 1) { + switch (iid) { +@@ -1017,7 +1020,7 @@ static irqreturn_t pch_uart_interrupt(int irq, void *dev_id) + priv->int_dis_flag = 0; + } + +- spin_unlock_irqrestore(&priv->port.lock, flags); ++ spin_unlock_irqrestore(&priv->lock, flags); + return IRQ_RETVAL(handled); + } + +@@ -1131,9 +1134,9 @@ static void pch_uart_break_ctl(struct uart_port *port, int ctl) + unsigned long flags; + + priv = container_of(port, struct eg20t_port, port); +- spin_lock_irqsave(&port->lock, flags); ++ spin_lock_irqsave(&priv->lock, flags); + pch_uart_hal_set_break(priv, ctl); +- spin_unlock_irqrestore(&port->lock, flags); ++ spin_unlock_irqrestore(&priv->lock, flags); + } + + /* Grab any interrupt resources and initialise any low level driver state. 
*/ +@@ -1284,7 +1287,8 @@ static void pch_uart_set_termios(struct uart_port *port, + + baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / 16); + +- spin_lock_irqsave(&port->lock, flags); ++ spin_lock_irqsave(&priv->lock, flags); ++ spin_lock(&port->lock); + + uart_update_timeout(port, termios->c_cflag, baud); + rtn = pch_uart_hal_set_line(priv, baud, parity, bits, stb); +@@ -1297,7 +1301,8 @@ static void pch_uart_set_termios(struct uart_port *port, + tty_termios_encode_baud_rate(termios, baud, baud); + + out: +- spin_unlock_irqrestore(&port->lock, flags); ++ spin_unlock(&port->lock); ++ spin_unlock_irqrestore(&priv->lock, flags); + } + + static const char *pch_uart_type(struct uart_port *port) +@@ -1449,6 +1454,8 @@ static struct eg20t_port *pch_uart_init_port(struct pci_dev *pdev, + pci_enable_msi(pdev); + pci_set_master(pdev); + ++ spin_lock_init(&priv->lock); ++ + iobase = pci_resource_start(pdev, 0); + mapbase = pci_resource_start(pdev, 1); + priv->mapbase = mapbase; +diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c +index d956965..3440812 100644 +--- a/drivers/usb/core/devices.c ++++ b/drivers/usb/core/devices.c +@@ -624,7 +624,7 @@ static ssize_t usb_device_read(struct file *file, char __user *buf, + /* print devices for all busses */ + list_for_each_entry(bus, &usb_bus_list, bus_list) { + /* recurse through all children of the root hub */ +- if (!bus->root_hub) ++ if (!bus_to_hcd(bus)->rh_registered) + continue; + usb_lock_device(bus->root_hub); + ret = usb_device_dump(&buf, &nbytes, &skip_bytes, ppos, +diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c +index 8cb9304..032e5a6 100644 +--- a/drivers/usb/core/hcd.c ++++ b/drivers/usb/core/hcd.c +@@ -1002,10 +1002,7 @@ static int register_root_hub(struct usb_hcd *hcd) + if (retval) { + dev_err (parent_dev, "can't register root hub for %s, %d\n", + dev_name(&usb_dev->dev), retval); +- } +- mutex_unlock(&usb_bus_list_lock); +- +- if (retval == 0) { ++ } else { + spin_lock_irq (&hcd_root_hub_lock); + hcd->rh_registered = 1; + spin_unlock_irq (&hcd_root_hub_lock); +@@ -1014,6 +1011,7 @@ static int register_root_hub(struct usb_hcd *hcd) + if (HCD_DEAD(hcd)) + usb_hc_died (hcd); /* This time clean up */ + } ++ mutex_unlock(&usb_bus_list_lock); + + return retval; + } +diff --git a/drivers/usb/gadget/dummy_hcd.c b/drivers/usb/gadget/dummy_hcd.c +index 527736e..d584eaf 100644 +--- a/drivers/usb/gadget/dummy_hcd.c ++++ b/drivers/usb/gadget/dummy_hcd.c +@@ -2292,10 +2292,8 @@ static int dummy_hcd_probe(struct platform_device *pdev) + hs_hcd->has_tt = 1; + + retval = usb_add_hcd(hs_hcd, 0, 0); +- if (retval != 0) { +- usb_put_hcd(hs_hcd); +- return retval; +- } ++ if (retval) ++ goto put_usb2_hcd; + + if (mod_data.is_super_speed) { + ss_hcd = usb_create_shared_hcd(&dummy_hcd, &pdev->dev, +@@ -2314,6 +2312,8 @@ static int dummy_hcd_probe(struct platform_device *pdev) + put_usb3_hcd: + usb_put_hcd(ss_hcd); + dealloc_usb2_hcd: ++ usb_remove_hcd(hs_hcd); ++put_usb2_hcd: + usb_put_hcd(hs_hcd); + the_controller.hs_hcd = the_controller.ss_hcd = NULL; + return retval; +diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c +index 3c166d3..f62be89 100644 +--- a/drivers/watchdog/hpwdt.c ++++ b/drivers/watchdog/hpwdt.c +@@ -813,6 +813,9 @@ static int __devinit hpwdt_init_one(struct pci_dev *dev, + hpwdt_timer_reg = pci_mem_addr + 0x70; + hpwdt_timer_con = pci_mem_addr + 0x72; + ++ /* Make sure that timer is disabled until /dev/watchdog is opened */ ++ hpwdt_stop(); ++ + /* Make sure that we have a valid 
soft_margin */ + if (hpwdt_change_timer(soft_margin)) + hpwdt_change_timer(DEFAULT_MARGIN); +diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c +index 1b2e180..667776e 100644 +--- a/fs/cifs/cifs_unicode.c ++++ b/fs/cifs/cifs_unicode.c +@@ -327,6 +327,6 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen, + } + + ctoUCS_out: +- return i; ++ return j; + } + +diff --git a/fs/dcache.c b/fs/dcache.c +index eb723d3..63c0c6b 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -311,7 +311,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) + * Inform try_to_ascend() that we are no longer attached to the + * dentry tree + */ +- dentry->d_flags |= DCACHE_DISCONNECTED; ++ dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (parent) + spin_unlock(&parent->d_lock); + dentry_iput(dentry); +@@ -968,7 +968,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq + * or deletion + */ + if (new != old->d_parent || +- (old->d_flags & DCACHE_DISCONNECTED) || ++ (old->d_flags & DCACHE_DENTRY_KILLED) || + (!locked && read_seqretry(&rename_lock, seq))) { + spin_unlock(&new->d_lock); + new = NULL; +@@ -1054,6 +1054,8 @@ positive: + return 1; + + rename_retry: ++ if (locked) ++ goto again; + locked = 1; + write_seqlock(&rename_lock); + goto again; +@@ -1156,6 +1158,8 @@ out: + rename_retry: + if (found) + return found; ++ if (locked) ++ goto again; + locked = 1; + write_seqlock(&rename_lock); + goto again; +@@ -2922,6 +2926,8 @@ resume: + return; + + rename_retry: ++ if (locked) ++ goto again; + locked = 1; + write_seqlock(&rename_lock); + goto again; +diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c +index 53c3bce..0be1aa4 100644 +--- a/fs/proc/proc_sysctl.c ++++ b/fs/proc/proc_sysctl.c +@@ -123,9 +123,6 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, + + err = ERR_PTR(-ENOMEM); + inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); +- if (h) +- sysctl_head_finish(h); +- + if (!inode) + goto out; + +@@ -134,6 +131,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, + d_add(dentry, inode); + + out: ++ if (h) ++ sysctl_head_finish(h); + sysctl_head_finish(head); + return err; + } +diff --git a/include/linux/dcache.h b/include/linux/dcache.h +index 4eb8c80..1dfe974 100644 +--- a/include/linux/dcache.h ++++ b/include/linux/dcache.h +@@ -219,6 +219,8 @@ struct dentry_operations { + #define DCACHE_MANAGED_DENTRY \ + (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) + ++#define DCACHE_DENTRY_KILLED 0x100000 ++ + extern seqlock_t rename_lock; + + static inline int dname_external(struct dentry *dentry) +diff --git a/include/linux/hid.h b/include/linux/hid.h +index c235e4e..331e2ef 100644 +--- a/include/linux/hid.h ++++ b/include/linux/hid.h +@@ -875,7 +875,7 @@ static inline int hid_hw_power(struct hid_device *hdev, int level) + return hdev->ll_driver->power ? 
hdev->ll_driver->power(hdev, level) : 0; + } + +-void hid_report_raw_event(struct hid_device *hid, int type, u8 *data, int size, ++int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, int size, + int interrupt); + + extern int hid_generic_init(void); +diff --git a/include/linux/hidraw.h b/include/linux/hidraw.h +index 4b88e69..45e9fcb 100644 +--- a/include/linux/hidraw.h ++++ b/include/linux/hidraw.h +@@ -76,13 +76,13 @@ struct hidraw_list { + #ifdef CONFIG_HIDRAW + int hidraw_init(void); + void hidraw_exit(void); +-void hidraw_report_event(struct hid_device *, u8 *, int); ++int hidraw_report_event(struct hid_device *, u8 *, int); + int hidraw_connect(struct hid_device *); + void hidraw_disconnect(struct hid_device *); + #else + static inline int hidraw_init(void) { return 0; } + static inline void hidraw_exit(void) { } +-static inline void hidraw_report_event(struct hid_device *hid, u8 *data, int len) { } ++static inline int hidraw_report_event(struct hid_device *hid, u8 *data, int len) { return 0; } + static inline int hidraw_connect(struct hid_device *hid) { return -1; } + static inline void hidraw_disconnect(struct hid_device *hid) { } + #endif +diff --git a/include/linux/memory.h b/include/linux/memory.h +index 935699b..6bea2c2 100644 +--- a/include/linux/memory.h ++++ b/include/linux/memory.h +@@ -20,7 +20,7 @@ + #include + #include + +-#define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS) ++#define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) + + struct memory_block { + unsigned long start_section_nr; +diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h +index 22e61fd..28e493b 100644 +--- a/include/linux/xfrm.h ++++ b/include/linux/xfrm.h +@@ -84,6 +84,8 @@ struct xfrm_replay_state { + __u32 bitmap; + }; + ++#define XFRMA_REPLAY_ESN_MAX 4096 ++ + struct xfrm_replay_state_esn { + unsigned int bmp_len; + __u32 oseq; +diff --git a/include/net/bluetooth/smp.h b/include/net/bluetooth/smp.h +index 15b97d5..fe810d4 100644 +--- a/include/net/bluetooth/smp.h ++++ b/include/net/bluetooth/smp.h +@@ -131,7 +131,7 @@ struct smp_chan { + }; + + /* SMP Commands */ +-int smp_conn_security(struct l2cap_conn *conn, __u8 sec_level); ++int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); + int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb); + int smp_distribute_keys(struct l2cap_conn *conn, __u8 force); + +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index b203e14..921f627 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -269,6 +269,9 @@ struct xfrm_replay { + int (*check)(struct xfrm_state *x, + struct sk_buff *skb, + __be32 net_seq); ++ int (*recheck)(struct xfrm_state *x, ++ struct sk_buff *skb, ++ __be32 net_seq); + void (*notify)(struct xfrm_state *x, int event); + int (*overflow)(struct xfrm_state *x, struct sk_buff *skb); + }; +diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h +index a9c87ad..a9536da 100644 +--- a/include/trace/events/kmem.h ++++ b/include/trace/events/kmem.h +@@ -214,7 +214,7 @@ TRACE_EVENT(mm_page_alloc, + + TP_printk("page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s", + __entry->page, +- page_to_pfn(__entry->page), ++ __entry->page ? page_to_pfn(__entry->page) : 0, + __entry->order, + __entry->migratetype, + show_gfp_flags(__entry->gfp_flags)) +@@ -240,7 +240,7 @@ DECLARE_EVENT_CLASS(mm_page, + + TP_printk("page=%p pfn=%lu order=%u migratetype=%d percpu_refill=%d", + __entry->page, +- page_to_pfn(__entry->page), ++ __entry->page ? 
page_to_pfn(__entry->page) : 0, + __entry->order, + __entry->migratetype, + __entry->order == 0) +diff --git a/kernel/async.c b/kernel/async.c +index 80b74b8..009f516 100644 +--- a/kernel/async.c ++++ b/kernel/async.c +@@ -88,6 +88,13 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) + { + struct async_entry *entry; + ++ if (!running) { /* just check the entry count */ ++ if (atomic_read(&entry_count)) ++ return 0; /* smaller than any cookie */ ++ else ++ return next_cookie; ++ } ++ + if (!list_empty(running)) { + entry = list_first_entry(running, + struct async_entry, list); +@@ -238,9 +245,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain); + */ + void async_synchronize_full(void) + { +- do { +- async_synchronize_cookie(next_cookie); +- } while (!list_empty(&async_running) || !list_empty(&async_pending)); ++ async_synchronize_cookie_domain(next_cookie, NULL); + } + EXPORT_SYMBOL_GPL(async_synchronize_full); + +@@ -260,7 +265,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); + /** + * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing + * @cookie: async_cookie_t to use as checkpoint +- * @running: running list to synchronize on ++ * @running: running list to synchronize on, NULL indicates all lists + * + * This function waits until all asynchronous function calls for the + * synchronization domain specified by the running list @list submitted +diff --git a/kernel/cpuset.c b/kernel/cpuset.c +index 46a1d3c..84a524b 100644 +--- a/kernel/cpuset.c ++++ b/kernel/cpuset.c +@@ -2080,6 +2080,9 @@ static void scan_for_empty_cpusets(struct cpuset *root) + * (of no affect) on systems that are actively using CPU hotplug + * but making no active use of cpusets. + * ++ * The only exception to this is suspend/resume, where we don't ++ * modify cpusets at all. ++ * + * This routine ensures that top_cpuset.cpus_allowed tracks + * cpu_active_mask on each CPU hotplug (cpuhp) event. + * +diff --git a/kernel/exit.c b/kernel/exit.c +index 5a8a66e..234e152 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -1019,6 +1019,22 @@ NORET_TYPE void do_exit(long code) + + preempt_disable(); + exit_rcu(); ++ ++ /* ++ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed ++ * when the following two conditions become true. ++ * - There is race condition of mmap_sem (It is acquired by ++ * exit_mm()), and ++ * - SMI occurs before setting TASK_RUNINNG. ++ * (or hypervisor of virtual machine switches to other guest) ++ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD ++ * ++ * To avoid it, we have to wait for releasing tsk->pi_lock which ++ * is held by try_to_wake_up() ++ */ ++ smp_mb(); ++ raw_spin_unlock_wait(&tsk->pi_lock); ++ + /* causes final put_task_struct in finish_task_switch(). */ + tsk->state = TASK_DEAD; + schedule(); +diff --git a/kernel/sched.c b/kernel/sched.c +index 910db7d..fcc893f 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -8192,34 +8192,66 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) + } + #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ + ++static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ ++ + /* + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). 
++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. + */ + static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, + void *hcpu) + { +- switch (action & ~CPU_TASKS_FROZEN) { ++ switch (action) { ++ case CPU_ONLINE_FROZEN: ++ case CPU_DOWN_FAILED_FROZEN: ++ ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ num_cpus_frozen--; ++ if (likely(num_cpus_frozen)) { ++ partition_sched_domains(1, NULL, NULL); ++ break; ++ } ++ ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ + case CPU_ONLINE: + case CPU_DOWN_FAILED: + cpuset_update_active_cpus(); +- return NOTIFY_OK; ++ break; + default: + return NOTIFY_DONE; + } ++ return NOTIFY_OK; + } + + static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, + void *hcpu) + { +- switch (action & ~CPU_TASKS_FROZEN) { ++ switch (action) { + case CPU_DOWN_PREPARE: + cpuset_update_active_cpus(); +- return NOTIFY_OK; ++ break; ++ case CPU_DOWN_PREPARE_FROZEN: ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ break; + default: + return NOTIFY_DONE; + } ++ return NOTIFY_OK; + } + + static int update_runtime(struct notifier_block *nfb, +diff --git a/kernel/workqueue.c b/kernel/workqueue.c +index 979d4de..b413138 100644 +--- a/kernel/workqueue.c ++++ b/kernel/workqueue.c +@@ -3627,18 +3627,17 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, + #ifdef CONFIG_SMP + + struct work_for_cpu { +- struct completion completion; ++ struct work_struct work; + long (*fn)(void *); + void *arg; + long ret; + }; + +-static int do_work_for_cpu(void *_wfc) ++static void work_for_cpu_fn(struct work_struct *work) + { +- struct work_for_cpu *wfc = _wfc; ++ struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); ++ + wfc->ret = wfc->fn(wfc->arg); +- complete(&wfc->completion); +- return 0; + } + + /** +@@ -3653,19 +3652,11 @@ static int do_work_for_cpu(void *_wfc) + */ + long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) + { +- struct task_struct *sub_thread; +- struct work_for_cpu wfc = { +- .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion), +- .fn = fn, +- .arg = arg, +- }; ++ struct work_for_cpu wfc = { .fn = fn, .arg = arg }; + +- sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu"); +- if (IS_ERR(sub_thread)) +- return PTR_ERR(sub_thread); +- kthread_bind(sub_thread, cpu); +- wake_up_process(sub_thread); +- wait_for_completion(&wfc.completion); ++ INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); ++ schedule_work_on(cpu, &wfc.work); ++ flush_work(&wfc.work); + return wfc.ret; + } + EXPORT_SYMBOL_GPL(work_on_cpu); +diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c +index 6629faf..9ad7d1e 100644 +--- a/mm/memory_hotplug.c ++++ b/mm/memory_hotplug.c +@@ -127,9 +127,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) + struct mem_section *ms; + struct page *page, *memmap; + +- if (!pfn_valid(start_pfn)) +- return; +- + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + +@@ -188,9 +185,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) + end_pfn = pfn + 
pgdat->node_spanned_pages; + + /* register_section info */ +- for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) +- register_page_bootmem_info_section(pfn); +- ++ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { ++ /* ++ * Some platforms can assign the same pfn to multiple nodes - on ++ * node0 as well as nodeN. To avoid registering a pfn against ++ * multiple nodes we check that this pfn does not already ++ * reside in some other node. ++ */ ++ if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) ++ register_page_bootmem_info_section(pfn); ++ } + } + #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 6e51bf0..a88dded 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -541,7 +541,7 @@ static inline void __free_one_page(struct page *page, + combined_idx = buddy_idx & page_idx; + higher_page = page + (combined_idx - page_idx); + buddy_idx = __find_buddy_index(combined_idx, order + 1); +- higher_buddy = page + (buddy_idx - combined_idx); ++ higher_buddy = higher_page + (buddy_idx - combined_idx); + if (page_is_buddy(higher_page, higher_buddy, order + 1)) { + list_add_tail(&page->lru, + &zone->free_area[order].free_list[migratetype]); +diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c +index f5ffc02..9ddbd4e 100644 +--- a/net/8021q/vlan_core.c ++++ b/net/8021q/vlan_core.c +@@ -106,7 +106,6 @@ static struct sk_buff *vlan_reorder_header(struct sk_buff *skb) + return NULL; + memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); + skb->mac_header += VLAN_HLEN; +- skb_reset_mac_len(skb); + return skb; + } + +@@ -173,6 +172,8 @@ struct sk_buff *vlan_untag(struct sk_buff *skb) + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); ++ skb_reset_mac_len(skb); ++ + return skb; + + err_free: +diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c +index 98bfbd5..1fb1aec 100644 +--- a/net/bluetooth/hci_conn.c ++++ b/net/bluetooth/hci_conn.c +@@ -44,6 +44,7 @@ + + #include + #include ++#include + + static void hci_le_connect(struct hci_conn *conn) + { +@@ -641,6 +642,9 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) + { + BT_DBG("conn %p", conn); + ++ if (conn->type == LE_LINK) ++ return smp_conn_security(conn, sec_level); ++ + /* For sdp we don't need the link key. 
*/ + if (sec_level == BT_SECURITY_SDP) + return 1; +diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c +index dd76177..04175d9 100644 +--- a/net/bluetooth/l2cap_core.c ++++ b/net/bluetooth/l2cap_core.c +@@ -902,14 +902,15 @@ static void l2cap_chan_ready(struct sock *sk) + static void l2cap_conn_ready(struct l2cap_conn *conn) + { + struct l2cap_chan *chan; ++ struct hci_conn *hcon = conn->hcon; + + BT_DBG("conn %p", conn); + +- if (!conn->hcon->out && conn->hcon->type == LE_LINK) ++ if (!hcon->out && hcon->type == LE_LINK) + l2cap_le_conn_ready(conn); + +- if (conn->hcon->out && conn->hcon->type == LE_LINK) +- smp_conn_security(conn, conn->hcon->pending_sec_level); ++ if (hcon->out && hcon->type == LE_LINK) ++ smp_conn_security(hcon, hcon->pending_sec_level); + + read_lock(&conn->chan_lock); + +@@ -918,8 +919,8 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) + + bh_lock_sock(sk); + +- if (conn->hcon->type == LE_LINK) { +- if (smp_conn_security(conn, chan->sec_level)) ++ if (hcon->type == LE_LINK) { ++ if (smp_conn_security(hcon, chan->sec_level)) + l2cap_chan_ready(sk); + + } else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { +diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c +index 6dedd6f..158887a 100644 +--- a/net/bluetooth/l2cap_sock.c ++++ b/net/bluetooth/l2cap_sock.c +@@ -616,7 +616,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch + break; + } + +- if (smp_conn_security(conn, sec.level)) ++ if (smp_conn_security(conn->hcon, sec.level)) + break; + + err = 0; +diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c +index 759b635..c27b4e3 100644 +--- a/net/bluetooth/smp.c ++++ b/net/bluetooth/smp.c +@@ -554,9 +554,9 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) + return 0; + } + +-int smp_conn_security(struct l2cap_conn *conn, __u8 sec_level) ++int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) + { +- struct hci_conn *hcon = conn->hcon; ++ struct l2cap_conn *conn = hcon->l2cap_data; + struct smp_chan *smp = conn->smp_chan; + + BT_DBG("conn %p hcon %p level 0x%2.2x", conn, hcon, sec_level); +diff --git a/net/core/dev.c b/net/core/dev.c +index 832ba6d..abe1147 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -2108,7 +2108,8 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol) + + static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features) + { +- if (!can_checksum_protocol(features, protocol)) { ++ if (skb->ip_summed != CHECKSUM_NONE && ++ !can_checksum_protocol(features, protocol)) { + features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_SG; + } else if (illegal_highdma(skb->dev, skb)) { +@@ -2686,16 +2687,17 @@ ipv6: + nhoff += poff; + if (pskb_may_pull(skb, nhoff + 4)) { + ports.v32 = * (__force u32 *) (skb->data + nhoff); +- if (ports.v16[1] < ports.v16[0]) +- swap(ports.v16[0], ports.v16[1]); + skb->l4_rxhash = 1; + } + } + + /* get a consistent hash (same value on both flow directions) */ +- if (addr2 < addr1) ++ if (addr2 < addr1 || ++ (addr2 == addr1 && ++ ports.v16[1] < ports.v16[0])) { + swap(addr1, addr2); +- ++ swap(ports.v16[0], ports.v16[1]); ++ } + hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); + if (!hash) + hash = 1; +@@ -6387,7 +6389,8 @@ static struct hlist_head *netdev_create_hash(void) + /* Initialize per network namespace state */ + static int __net_init netdev_init(struct net *net) + { +- INIT_LIST_HEAD(&net->dev_base_head); ++ if (net != &init_net) ++ 
INIT_LIST_HEAD(&net->dev_base_head); + + net->dev_name_head = netdev_create_hash(); + if (net->dev_name_head == NULL) +diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c +index 31a5ae5..dd00b71 100644 +--- a/net/core/net_namespace.c ++++ b/net/core/net_namespace.c +@@ -25,7 +25,9 @@ static DEFINE_MUTEX(net_mutex); + LIST_HEAD(net_namespace_list); + EXPORT_SYMBOL_GPL(net_namespace_list); + +-struct net init_net; ++struct net init_net = { ++ .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), ++}; + EXPORT_SYMBOL(init_net); + + #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ +diff --git a/net/core/sock.c b/net/core/sock.c +index 018fd41..1e8a882 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -593,7 +593,8 @@ set_rcvbuf: + + case SO_KEEPALIVE: + #ifdef CONFIG_INET +- if (sk->sk_protocol == IPPROTO_TCP) ++ if (sk->sk_protocol == IPPROTO_TCP && ++ sk->sk_type == SOCK_STREAM) + tcp_set_keepalive(sk, valbool); + #endif + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); +diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c +index 007e2eb..e1d4f30 100644 +--- a/net/ipv4/raw.c ++++ b/net/ipv4/raw.c +@@ -131,18 +131,20 @@ found: + * 0 - deliver + * 1 - block + */ +-static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) ++static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) + { +- int type; ++ struct icmphdr _hdr; ++ const struct icmphdr *hdr; + +- if (!pskb_may_pull(skb, sizeof(struct icmphdr))) ++ hdr = skb_header_pointer(skb, skb_transport_offset(skb), ++ sizeof(_hdr), &_hdr); ++ if (!hdr) + return 1; + +- type = icmp_hdr(skb)->type; +- if (type < 32) { ++ if (hdr->type < 32) { + __u32 data = raw_sk(sk)->filter.data; + +- return ((1 << type) & data) != 0; ++ return ((1U << hdr->type) & data) != 0; + } + + /* Do not block unknown ICMP types */ +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 043d49b..7397ad8 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -1589,8 +1589,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + } + + #ifdef CONFIG_NET_DMA +- if (tp->ucopy.dma_chan) +- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); ++ if (tp->ucopy.dma_chan) { ++ if (tp->rcv_wnd == 0 && ++ !skb_queue_empty(&sk->sk_async_wait_queue)) { ++ tcp_service_net_dma(sk, true); ++ tcp_cleanup_rbuf(sk, copied); ++ } else ++ dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); ++ } + #endif + if (copied >= target) { + /* Do not sleep, just process backlog. 
*/ +diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c +index 43242e6..42853c4 100644 +--- a/net/ipv6/mip6.c ++++ b/net/ipv6/mip6.c +@@ -84,28 +84,30 @@ static int mip6_mh_len(int type) + + static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb) + { +- struct ip6_mh *mh; ++ struct ip6_mh _hdr; ++ const struct ip6_mh *mh; + +- if (!pskb_may_pull(skb, (skb_transport_offset(skb)) + 8) || +- !pskb_may_pull(skb, (skb_transport_offset(skb) + +- ((skb_transport_header(skb)[1] + 1) << 3)))) ++ mh = skb_header_pointer(skb, skb_transport_offset(skb), ++ sizeof(_hdr), &_hdr); ++ if (!mh) + return -1; + +- mh = (struct ip6_mh *)skb_transport_header(skb); ++ if (((mh->ip6mh_hdrlen + 1) << 3) > skb->len) ++ return -1; + + if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) { + LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH message too short: %d vs >=%d\n", + mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type)); +- mip6_param_prob(skb, 0, ((&mh->ip6mh_hdrlen) - +- skb_network_header(skb))); ++ mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_hdrlen) + ++ skb_network_header_len(skb)); + return -1; + } + + if (mh->ip6mh_proto != IPPROTO_NONE) { + LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH invalid payload proto = %d\n", + mh->ip6mh_proto); +- mip6_param_prob(skb, 0, ((&mh->ip6mh_proto) - +- skb_network_header(skb))); ++ mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_proto) + ++ skb_network_header_len(skb)); + return -1; + } + +diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c +index 361ebf3..6e6c2c4 100644 +--- a/net/ipv6/raw.c ++++ b/net/ipv6/raw.c +@@ -107,21 +107,20 @@ found: + * 0 - deliver + * 1 - block + */ +-static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) ++static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb) + { +- struct icmp6hdr *icmph; +- struct raw6_sock *rp = raw6_sk(sk); +- +- if (pskb_may_pull(skb, sizeof(struct icmp6hdr))) { +- __u32 *data = &rp->filter.data[0]; +- int bit_nr; ++ struct icmp6hdr *_hdr; ++ const struct icmp6hdr *hdr; + +- icmph = (struct icmp6hdr *) skb->data; +- bit_nr = icmph->icmp6_type; ++ hdr = skb_header_pointer(skb, skb_transport_offset(skb), ++ sizeof(_hdr), &_hdr); ++ if (hdr) { ++ const __u32 *data = &raw6_sk(sk)->filter.data[0]; ++ unsigned int type = hdr->icmp6_type; + +- return (data[bit_nr >> 5] & (1 << (bit_nr & 31))) != 0; ++ return (data[type >> 5] & (1U << (type & 31))) != 0; + } +- return 0; ++ return 1; + } + + #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index 2e21751..488a1b7 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -1435,17 +1435,18 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) + struct fib6_table *table; + struct net *net = dev_net(rt->rt6i_dev); + +- if (rt == net->ipv6.ip6_null_entry) +- return -ENOENT; ++ if (rt == net->ipv6.ip6_null_entry) { ++ err = -ENOENT; ++ goto out; ++ } + + table = rt->rt6i_table; + write_lock_bh(&table->tb6_lock); +- + err = fib6_del(rt, info); +- dst_release(&rt->dst); +- + write_unlock_bh(&table->tb6_lock); + ++out: ++ dst_release(&rt->dst); + return err; + } + +diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c +index 3c55f63..2cef50b 100644 +--- a/net/l2tp/l2tp_eth.c ++++ b/net/l2tp/l2tp_eth.c +@@ -132,7 +132,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, + printk("\n"); + } + +- if (!pskb_may_pull(skb, sizeof(ETH_HLEN))) ++ if (!pskb_may_pull(skb, ETH_HLEN)) + goto error; + + secpath_reset(skb); +diff --git 
a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c +index 732152f..f156382 100644 +--- a/net/netrom/af_netrom.c ++++ b/net/netrom/af_netrom.c +@@ -1170,7 +1170,12 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, + msg->msg_flags |= MSG_TRUNC; + } + +- skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); ++ er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); ++ if (er < 0) { ++ skb_free_datagram(sk, skb); ++ release_sock(sk); ++ return er; ++ } + + if (sax != NULL) { + sax->sax25_family = AF_NETROM; +diff --git a/net/rds/recv.c b/net/rds/recv.c +index bc3f8cd..fc57d31 100644 +--- a/net/rds/recv.c ++++ b/net/rds/recv.c +@@ -410,6 +410,8 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + + rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo); + ++ msg->msg_namelen = 0; ++ + if (msg_flags & MSG_OOB) + goto out; + +@@ -485,6 +487,7 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + sin->sin_port = inc->i_hdr.h_sport; + sin->sin_addr.s_addr = inc->i_saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); ++ msg->msg_namelen = sizeof(*sin); + } + break; + } +diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c +index 24d94c0..599f67a 100644 +--- a/net/sched/sch_cbq.c ++++ b/net/sched/sch_cbq.c +@@ -250,10 +250,11 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) + else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL) + cl = defmap[TC_PRIO_BESTEFFORT]; + +- if (cl == NULL || cl->level >= head->level) ++ if (cl == NULL) + goto fallback; + } +- ++ if (cl->level >= head->level) ++ goto fallback; + #ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: +diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c +index 7b03254..ca0fb48 100644 +--- a/net/sched/sch_qfq.c ++++ b/net/sched/sch_qfq.c +@@ -829,7 +829,10 @@ static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) + if (mask) { + struct qfq_group *next = qfq_ffs(q, mask); + if (qfq_gt(roundedF, next->F)) { +- cl->S = next->F; ++ if (qfq_gt(limit, next->F)) ++ cl->S = next->F; ++ else /* preserve timestamp correctness */ ++ cl->S = limit; + return; + } + } +diff --git a/net/sctp/output.c b/net/sctp/output.c +index 8fc4dcd..32ba8d0 100644 +--- a/net/sctp/output.c ++++ b/net/sctp/output.c +@@ -334,6 +334,25 @@ finish: + return retval; + } + ++static void sctp_packet_release_owner(struct sk_buff *skb) ++{ ++ sk_free(skb->sk); ++} ++ ++static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk) ++{ ++ skb_orphan(skb); ++ skb->sk = sk; ++ skb->destructor = sctp_packet_release_owner; ++ ++ /* ++ * The data chunks have already been accounted for in sctp_sendmsg(), ++ * therefore only reserve a single byte to keep socket around until ++ * the packet has been transmitted. ++ */ ++ atomic_inc(&sk->sk_wmem_alloc); ++} ++ + /* All packets are sent to the network through this function from + * sctp_outq_tail(). + * +@@ -375,7 +394,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) + /* Set the owning socket so that we know where to get the + * destination IP address. 
+ */ +- skb_set_owner_w(nskb, sk); ++ sctp_packet_set_owner_w(nskb, sk); + + if (!sctp_transport_dst_check(tp)) { + sctp_transport_route(tp, NULL, sctp_sk(sk)); +diff --git a/net/wireless/reg.c b/net/wireless/reg.c +index d57d05b..fa39731 100644 +--- a/net/wireless/reg.c ++++ b/net/wireless/reg.c +@@ -331,6 +331,9 @@ static void reg_regdb_search(struct work_struct *work) + struct reg_regdb_search_request *request; + const struct ieee80211_regdomain *curdom, *regdom; + int i, r; ++ bool set_reg = false; ++ ++ mutex_lock(&cfg80211_mutex); + + mutex_lock(®_regdb_search_mutex); + while (!list_empty(®_regdb_search_list)) { +@@ -346,9 +349,7 @@ static void reg_regdb_search(struct work_struct *work) + r = reg_copy_regd(®dom, curdom); + if (r) + break; +- mutex_lock(&cfg80211_mutex); +- set_regdom(regdom); +- mutex_unlock(&cfg80211_mutex); ++ set_reg = true; + break; + } + } +@@ -356,6 +357,11 @@ static void reg_regdb_search(struct work_struct *work) + kfree(request); + } + mutex_unlock(®_regdb_search_mutex); ++ ++ if (set_reg) ++ set_regdom(regdom); ++ ++ mutex_unlock(&cfg80211_mutex); + } + + static DECLARE_WORK(reg_regdb_work, reg_regdb_search); +diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c +index 54a0dc2..ab2bb42 100644 +--- a/net/xfrm/xfrm_input.c ++++ b/net/xfrm/xfrm_input.c +@@ -212,7 +212,7 @@ resume: + /* only the first xfrm gets the encap type */ + encap_type = 0; + +- if (async && x->repl->check(x, skb, seq)) { ++ if (async && x->repl->recheck(x, skb, seq)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); + goto drop_unlock; + } +diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c +index 0174034..113d20e 100644 +--- a/net/xfrm/xfrm_policy.c ++++ b/net/xfrm/xfrm_policy.c +@@ -1761,7 +1761,7 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family, + + if (!afinfo) { + dst_release(dst_orig); +- ret = ERR_PTR(-EINVAL); ++ return ERR_PTR(-EINVAL); + } else { + ret = afinfo->blackhole_route(net, dst_orig); + } +diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c +index 2f6d11d..3efb07d 100644 +--- a/net/xfrm/xfrm_replay.c ++++ b/net/xfrm/xfrm_replay.c +@@ -420,6 +420,18 @@ err: + return -EINVAL; + } + ++static int xfrm_replay_recheck_esn(struct xfrm_state *x, ++ struct sk_buff *skb, __be32 net_seq) ++{ ++ if (unlikely(XFRM_SKB_CB(skb)->seq.input.hi != ++ htonl(xfrm_replay_seqhi(x, net_seq)))) { ++ x->stats.replay_window++; ++ return -EINVAL; ++ } ++ ++ return xfrm_replay_check_esn(x, skb, net_seq); ++} ++ + static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) + { + unsigned int bitnr, nr, i; +@@ -479,6 +491,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) + static struct xfrm_replay xfrm_replay_legacy = { + .advance = xfrm_replay_advance, + .check = xfrm_replay_check, ++ .recheck = xfrm_replay_check, + .notify = xfrm_replay_notify, + .overflow = xfrm_replay_overflow, + }; +@@ -486,6 +499,7 @@ static struct xfrm_replay xfrm_replay_legacy = { + static struct xfrm_replay xfrm_replay_bmp = { + .advance = xfrm_replay_advance_bmp, + .check = xfrm_replay_check_bmp, ++ .recheck = xfrm_replay_check_bmp, + .notify = xfrm_replay_notify_bmp, + .overflow = xfrm_replay_overflow_bmp, + }; +@@ -493,6 +507,7 @@ static struct xfrm_replay xfrm_replay_bmp = { + static struct xfrm_replay xfrm_replay_esn = { + .advance = xfrm_replay_advance_esn, + .check = xfrm_replay_check_esn, ++ .recheck = xfrm_replay_recheck_esn, + .notify = xfrm_replay_notify_bmp, + .overflow = xfrm_replay_overflow_esn, + }; +diff --git 
a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c +index 7cae73e..ede01a8 100644 +--- a/net/xfrm/xfrm_user.c ++++ b/net/xfrm/xfrm_user.c +@@ -123,9 +123,21 @@ static inline int verify_replay(struct xfrm_usersa_info *p, + struct nlattr **attrs) + { + struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL]; ++ struct xfrm_replay_state_esn *rs; + +- if ((p->flags & XFRM_STATE_ESN) && !rt) +- return -EINVAL; ++ if (p->flags & XFRM_STATE_ESN) { ++ if (!rt) ++ return -EINVAL; ++ ++ rs = nla_data(rt); ++ ++ if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8) ++ return -EINVAL; ++ ++ if (nla_len(rt) < xfrm_replay_state_esn_len(rs) && ++ nla_len(rt) != sizeof(*rs)) ++ return -EINVAL; ++ } + + if (!rt) + return 0; +@@ -370,14 +382,15 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es + struct nlattr *rp) + { + struct xfrm_replay_state_esn *up; ++ int ulen; + + if (!replay_esn || !rp) + return 0; + + up = nla_data(rp); ++ ulen = xfrm_replay_state_esn_len(up); + +- if (xfrm_replay_state_esn_len(replay_esn) != +- xfrm_replay_state_esn_len(up)) ++ if (nla_len(rp) < ulen || xfrm_replay_state_esn_len(replay_esn) != ulen) + return -EINVAL; + + return 0; +@@ -388,22 +401,28 @@ static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn + struct nlattr *rta) + { + struct xfrm_replay_state_esn *p, *pp, *up; ++ int klen, ulen; + + if (!rta) + return 0; + + up = nla_data(rta); ++ klen = xfrm_replay_state_esn_len(up); ++ ulen = nla_len(rta) >= klen ? klen : sizeof(*up); + +- p = kmemdup(up, xfrm_replay_state_esn_len(up), GFP_KERNEL); ++ p = kzalloc(klen, GFP_KERNEL); + if (!p) + return -ENOMEM; + +- pp = kmemdup(up, xfrm_replay_state_esn_len(up), GFP_KERNEL); ++ pp = kzalloc(klen, GFP_KERNEL); + if (!pp) { + kfree(p); + return -ENOMEM; + } + ++ memcpy(p, up, ulen); ++ memcpy(pp, up, ulen); ++ + *replay_esn = p; + *preplay_esn = pp; + +@@ -442,10 +461,11 @@ static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info * + * somehow made shareable and move it to xfrm_state.c - JHS + * + */ +-static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs) ++static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs, ++ int update_esn) + { + struct nlattr *rp = attrs[XFRMA_REPLAY_VAL]; +- struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL]; ++ struct nlattr *re = update_esn ? 
attrs[XFRMA_REPLAY_ESN_VAL] : NULL; + struct nlattr *lt = attrs[XFRMA_LTIME_VAL]; + struct nlattr *et = attrs[XFRMA_ETIMER_THRESH]; + struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH]; +@@ -555,7 +575,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, + goto error; + + /* override default values from above */ +- xfrm_update_ae_params(x, attrs); ++ xfrm_update_ae_params(x, attrs, 0); + + return x; + +@@ -689,6 +709,7 @@ out: + + static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) + { ++ memset(p, 0, sizeof(*p)); + memcpy(&p->id, &x->id, sizeof(p->id)); + memcpy(&p->sel, &x->sel, sizeof(p->sel)); + memcpy(&p->lft, &x->lft, sizeof(p->lft)); +@@ -742,7 +763,7 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) + return -EMSGSIZE; + + algo = nla_data(nla); +- strcpy(algo->alg_name, auth->alg_name); ++ strncpy(algo->alg_name, auth->alg_name, sizeof(algo->alg_name)); + memcpy(algo->alg_key, auth->alg_key, (auth->alg_key_len + 7) / 8); + algo->alg_key_len = auth->alg_key_len; + +@@ -862,6 +883,7 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, + { + struct xfrm_dump_info info; + struct sk_buff *skb; ++ int err; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) +@@ -872,9 +894,10 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, + info.nlmsg_seq = seq; + info.nlmsg_flags = 0; + +- if (dump_one_state(x, 0, &info)) { ++ err = dump_one_state(x, 0, &info); ++ if (err) { + kfree_skb(skb); +- return NULL; ++ return ERR_PTR(err); + } + + return skb; +@@ -1297,6 +1320,7 @@ static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy + + static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p, int dir) + { ++ memset(p, 0, sizeof(*p)); + memcpy(&p->sel, &xp->selector, sizeof(p->sel)); + memcpy(&p->lft, &xp->lft, sizeof(p->lft)); + memcpy(&p->curlft, &xp->curlft, sizeof(p->curlft)); +@@ -1401,6 +1425,7 @@ static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb) + struct xfrm_user_tmpl *up = &vec[i]; + struct xfrm_tmpl *kp = &xp->xfrm_vec[i]; + ++ memset(up, 0, sizeof(*up)); + memcpy(&up->id, &kp->id, sizeof(up->id)); + up->family = kp->encap_family; + memcpy(&up->saddr, &kp->saddr, sizeof(up->saddr)); +@@ -1529,6 +1554,7 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, + { + struct xfrm_dump_info info; + struct sk_buff *skb; ++ int err; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) +@@ -1539,9 +1565,10 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, + info.nlmsg_seq = seq; + info.nlmsg_flags = 0; + +- if (dump_one_policy(xp, dir, 0, &info) < 0) { ++ err = dump_one_policy(xp, dir, 0, &info); ++ if (err) { + kfree_skb(skb); +- return NULL; ++ return ERR_PTR(err); + } + + return skb; +@@ -1794,7 +1821,7 @@ static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh, + goto out; + + spin_lock_bh(&x->lock); +- xfrm_update_ae_params(x, attrs); ++ xfrm_update_ae_params(x, attrs, 1); + spin_unlock_bh(&x->lock); + + c.event = nlh->nlmsg_type; +diff --git a/sound/soc/samsung/dma.c b/sound/soc/samsung/dma.c +index a68b264..a9a593a 100644 +--- a/sound/soc/samsung/dma.c ++++ b/sound/soc/samsung/dma.c +@@ -34,9 +34,7 @@ static const struct snd_pcm_hardware dma_hardware = { + .info = SNDRV_PCM_INFO_INTERLEAVED | + SNDRV_PCM_INFO_BLOCK_TRANSFER | + SNDRV_PCM_INFO_MMAP | +- SNDRV_PCM_INFO_MMAP_VALID | +- SNDRV_PCM_INFO_PAUSE | +- SNDRV_PCM_INFO_RESUME, ++ 
SNDRV_PCM_INFO_MMAP_VALID, + .formats = SNDRV_PCM_FMTBIT_S16_LE | + SNDRV_PCM_FMTBIT_U16_LE | + SNDRV_PCM_FMTBIT_U8 | +@@ -246,15 +244,11 @@ static int dma_trigger(struct snd_pcm_substream *substream, int cmd) + + switch (cmd) { + case SNDRV_PCM_TRIGGER_START: +- case SNDRV_PCM_TRIGGER_RESUME: +- case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: + prtd->state |= ST_RUNNING; + prtd->params->ops->trigger(prtd->params->ch); + break; + + case SNDRV_PCM_TRIGGER_STOP: +- case SNDRV_PCM_TRIGGER_SUSPEND: +- case SNDRV_PCM_TRIGGER_PAUSE_PUSH: + prtd->state &= ~ST_RUNNING; + prtd->params->ops->stop(prtd->params->ch); + break; diff --git a/3.2.34/bump/1031_linux-3.2.32.patch b/3.2.34/bump/1031_linux-3.2.32.patch new file mode 100644 index 0000000..247fc0b --- /dev/null +++ b/3.2.34/bump/1031_linux-3.2.32.patch @@ -0,0 +1,6206 @@ +diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c +index c095d79..288dba6 100644 +--- a/Documentation/virtual/lguest/lguest.c ++++ b/Documentation/virtual/lguest/lguest.c +@@ -1299,6 +1299,7 @@ static struct device *new_device(const char *name, u16 type) + dev->feature_len = 0; + dev->num_vq = 0; + dev->running = false; ++ dev->next = NULL; + + /* + * Append to device list. Prepending to a single-linked list is +diff --git a/Makefile b/Makefile +index fd9c414..b6d8282 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 2 +-SUBLEVEL = 31 ++SUBLEVEL = 32 + EXTRAVERSION = + NAME = Saber-toothed Squirrel + +diff --git a/arch/arm/plat-omap/counter_32k.c b/arch/arm/plat-omap/counter_32k.c +index a6cbb71..04e703a 100644 +--- a/arch/arm/plat-omap/counter_32k.c ++++ b/arch/arm/plat-omap/counter_32k.c +@@ -82,22 +82,29 @@ static void notrace omap_update_sched_clock(void) + * nsecs and adds to a monotonically increasing timespec. + */ + static struct timespec persistent_ts; +-static cycles_t cycles, last_cycles; ++static cycles_t cycles; + static unsigned int persistent_mult, persistent_shift; ++static DEFINE_SPINLOCK(read_persistent_clock_lock); ++ + void read_persistent_clock(struct timespec *ts) + { + unsigned long long nsecs; +- cycles_t delta; +- struct timespec *tsp = &persistent_ts; ++ cycles_t last_cycles; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&read_persistent_clock_lock, flags); + + last_cycles = cycles; + cycles = timer_32k_base ? 
__raw_readl(timer_32k_base) : 0; +- delta = cycles - last_cycles; + +- nsecs = clocksource_cyc2ns(delta, persistent_mult, persistent_shift); ++ nsecs = clocksource_cyc2ns(cycles - last_cycles, ++ persistent_mult, persistent_shift); ++ ++ timespec_add_ns(&persistent_ts, nsecs); ++ ++ *ts = persistent_ts; + +- timespec_add_ns(tsp, nsecs); +- *ts = *tsp; ++ spin_unlock_irqrestore(&read_persistent_clock_lock, flags); + } + + int __init omap_init_clocksource_32k(void) +diff --git a/arch/mips/Makefile b/arch/mips/Makefile +index 0be3186..aaf7444 100644 +--- a/arch/mips/Makefile ++++ b/arch/mips/Makefile +@@ -224,7 +224,7 @@ KBUILD_CPPFLAGS += -D"DATAOFFSET=$(if $(dataoffset-y),$(dataoffset-y),0)" + LDFLAGS += -m $(ld-emul) + + ifdef CONFIG_MIPS +-CHECKFLAGS += $(shell $(CC) $(KBUILD_CFLAGS) -dM -E -xc /dev/null | \ ++CHECKFLAGS += $(shell $(CC) $(KBUILD_CFLAGS) -dM -E -x c /dev/null | \ + egrep -vw '__GNUC_(|MINOR_|PATCHLEVEL_)_' | \ + sed -e "s/^\#define /-D'/" -e "s/ /'='/" -e "s/$$/'/") + ifdef CONFIG_64BIT +diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile +index 1a96618..ce7dd99 100644 +--- a/arch/mips/kernel/Makefile ++++ b/arch/mips/kernel/Makefile +@@ -102,7 +102,7 @@ obj-$(CONFIG_MIPS_MACHINE) += mips_machine.o + + obj-$(CONFIG_OF) += prom.o + +-CFLAGS_cpu-bugs64.o = $(shell if $(CC) $(KBUILD_CFLAGS) -Wa,-mdaddi -c -o /dev/null -xc /dev/null >/dev/null 2>&1; then echo "-DHAVE_AS_SET_DADDI"; fi) ++CFLAGS_cpu-bugs64.o = $(shell if $(CC) $(KBUILD_CFLAGS) -Wa,-mdaddi -c -o /dev/null -x c /dev/null >/dev/null 2>&1; then echo "-DHAVE_AS_SET_DADDI"; fi) + + obj-$(CONFIG_HAVE_STD_PC_SERIAL_PORT) += 8250-platform.o + +diff --git a/arch/mn10300/Makefile b/arch/mn10300/Makefile +index 7120282..3eb4a52 100644 +--- a/arch/mn10300/Makefile ++++ b/arch/mn10300/Makefile +@@ -26,7 +26,7 @@ CHECKFLAGS += + PROCESSOR := unset + UNIT := unset + +-KBUILD_CFLAGS += -mam33 -mmem-funcs -DCPU=AM33 ++KBUILD_CFLAGS += -mam33 -DCPU=AM33 $(call cc-option,-mmem-funcs,) + KBUILD_AFLAGS += -mam33 -DCPU=AM33 + + ifeq ($(CONFIG_MN10300_CURRENT_IN_E2),y) +diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c +index 1b6cb10..a0a4e8a 100644 +--- a/arch/powerpc/platforms/pseries/eeh_driver.c ++++ b/arch/powerpc/platforms/pseries/eeh_driver.c +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -41,6 +42,41 @@ static inline const char * pcid_name (struct pci_dev *pdev) + return ""; + } + ++/** ++ * eeh_pcid_get - Get the PCI device driver ++ * @pdev: PCI device ++ * ++ * The function is used to retrieve the PCI device driver for ++ * the indicated PCI device. Besides, we will increase the reference ++ * of the PCI device driver to prevent that being unloaded on ++ * the fly. Otherwise, kernel crash would be seen. ++ */ ++static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) ++{ ++ if (!pdev || !pdev->driver) ++ return NULL; ++ ++ if (!try_module_get(pdev->driver->driver.owner)) ++ return NULL; ++ ++ return pdev->driver; ++} ++ ++/** ++ * eeh_pcid_put - Dereference on the PCI device driver ++ * @pdev: PCI device ++ * ++ * The function is called to do dereference on the PCI device ++ * driver of the indicated PCI device. 
++ */ ++static inline void eeh_pcid_put(struct pci_dev *pdev) ++{ ++ if (!pdev || !pdev->driver) ++ return; ++ ++ module_put(pdev->driver->driver.owner); ++} ++ + #if 0 + static void print_device_node_tree(struct pci_dn *pdn, int dent) + { +@@ -109,18 +145,20 @@ static void eeh_enable_irq(struct pci_dev *dev) + static int eeh_report_error(struct pci_dev *dev, void *userdata) + { + enum pci_ers_result rc, *res = userdata; +- struct pci_driver *driver = dev->driver; ++ struct pci_driver *driver; + + dev->error_state = pci_channel_io_frozen; + +- if (!driver) +- return 0; ++ driver = eeh_pcid_get(dev); ++ if (!driver) return 0; + + eeh_disable_irq(dev); + + if (!driver->err_handler || +- !driver->err_handler->error_detected) ++ !driver->err_handler->error_detected) { ++ eeh_pcid_put(dev); + return 0; ++ } + + rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen); + +@@ -128,6 +166,7 @@ static int eeh_report_error(struct pci_dev *dev, void *userdata) + if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + if (*res == PCI_ERS_RESULT_NONE) *res = rc; + ++ eeh_pcid_put(dev); + return 0; + } + +@@ -142,12 +181,15 @@ static int eeh_report_error(struct pci_dev *dev, void *userdata) + static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) + { + enum pci_ers_result rc, *res = userdata; +- struct pci_driver *driver = dev->driver; ++ struct pci_driver *driver; + +- if (!driver || +- !driver->err_handler || +- !driver->err_handler->mmio_enabled) ++ driver = eeh_pcid_get(dev); ++ if (!driver) return 0; ++ if (!driver->err_handler || ++ !driver->err_handler->mmio_enabled) { ++ eeh_pcid_put(dev); + return 0; ++ } + + rc = driver->err_handler->mmio_enabled (dev); + +@@ -155,6 +197,7 @@ static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) + if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + if (*res == PCI_ERS_RESULT_NONE) *res = rc; + ++ eeh_pcid_put(dev); + return 0; + } + +@@ -165,18 +208,20 @@ static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) + static int eeh_report_reset(struct pci_dev *dev, void *userdata) + { + enum pci_ers_result rc, *res = userdata; +- struct pci_driver *driver = dev->driver; +- +- if (!driver) +- return 0; ++ struct pci_driver *driver; + + dev->error_state = pci_channel_io_normal; + ++ driver = eeh_pcid_get(dev); ++ if (!driver) return 0; ++ + eeh_enable_irq(dev); + + if (!driver->err_handler || +- !driver->err_handler->slot_reset) ++ !driver->err_handler->slot_reset) { ++ eeh_pcid_put(dev); + return 0; ++ } + + rc = driver->err_handler->slot_reset(dev); + if ((*res == PCI_ERS_RESULT_NONE) || +@@ -184,6 +229,7 @@ static int eeh_report_reset(struct pci_dev *dev, void *userdata) + if (*res == PCI_ERS_RESULT_DISCONNECT && + rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + ++ eeh_pcid_put(dev); + return 0; + } + +@@ -193,21 +239,24 @@ static int eeh_report_reset(struct pci_dev *dev, void *userdata) + + static int eeh_report_resume(struct pci_dev *dev, void *userdata) + { +- struct pci_driver *driver = dev->driver; ++ struct pci_driver *driver; + + dev->error_state = pci_channel_io_normal; + +- if (!driver) +- return 0; ++ driver = eeh_pcid_get(dev); ++ if (!driver) return 0; + + eeh_enable_irq(dev); + + if (!driver->err_handler || +- !driver->err_handler->resume) ++ !driver->err_handler->resume) { ++ eeh_pcid_put(dev); + return 0; ++ } + + driver->err_handler->resume(dev); + ++ eeh_pcid_put(dev); + return 0; + } + +@@ -220,21 +269,24 @@ static int eeh_report_resume(struct pci_dev *dev, void *userdata) + + static int 
eeh_report_failure(struct pci_dev *dev, void *userdata) + { +- struct pci_driver *driver = dev->driver; ++ struct pci_driver *driver; + + dev->error_state = pci_channel_io_perm_failure; + +- if (!driver) +- return 0; ++ driver = eeh_pcid_get(dev); ++ if (!driver) return 0; + + eeh_disable_irq(dev); + + if (!driver->err_handler || +- !driver->err_handler->error_detected) ++ !driver->err_handler->error_detected) { ++ eeh_pcid_put(dev); + return 0; ++ } + + driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); + ++ eeh_pcid_put(dev); + return 0; + } + +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 18601c8..884507e 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -146,8 +146,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd) + + static inline int pmd_large(pmd_t pte) + { +- return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == +- (_PAGE_PSE | _PAGE_PRESENT); ++ return pmd_flags(pte) & _PAGE_PSE; + } + + #ifdef CONFIG_TRANSPARENT_HUGEPAGE +@@ -415,7 +414,13 @@ static inline int pte_hidden(pte_t pte) + + static inline int pmd_present(pmd_t pmd) + { +- return pmd_flags(pmd) & _PAGE_PRESENT; ++ /* ++ * Checking for _PAGE_PSE is needed too because ++ * split_huge_page will temporarily clear the present bit (but ++ * the _PAGE_PSE flag will remain set at all times while the ++ * _PAGE_PRESENT bit is clear). ++ */ ++ return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); + } + + static inline int pmd_none(pmd_t pmd) +diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c +index 37718f0..4d320b2 100644 +--- a/arch/x86/platform/efi/efi.c ++++ b/arch/x86/platform/efi/efi.c +@@ -731,6 +731,7 @@ void __init efi_enter_virtual_mode(void) + * + * Call EFI services through wrapper functions. + */ ++ efi.runtime_version = efi_systab.fw_revision; + efi.get_time = virt_efi_get_time; + efi.set_time = virt_efi_set_time; + efi.get_wakeup_time = virt_efi_get_wakeup_time; +diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c +index 9ecec98..5016de5 100644 +--- a/drivers/acpi/bus.c ++++ b/drivers/acpi/bus.c +@@ -950,8 +950,6 @@ static int __init acpi_bus_init(void) + status = acpi_ec_ecdt_probe(); + /* Ignore result. Not having an ECDT is not fatal. */ + +- acpi_bus_osc_support(); +- + status = acpi_initialize_objects(ACPI_FULL_INITIALIZATION); + if (ACPI_FAILURE(status)) { + printk(KERN_ERR PREFIX "Unable to initialize ACPI objects\n"); +@@ -959,6 +957,12 @@ static int __init acpi_bus_init(void) + } + + /* ++ * _OSC method may exist in module level code, ++ * so it must be run after ACPI_FULL_INITIALIZATION ++ */ ++ acpi_bus_osc_support(); ++ ++ /* + * _PDC control method may load dynamic SSDT tables, + * and we need to install the table handler before that. 
+ */ +diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c +index 6f95d98..1f90dab 100644 +--- a/drivers/bluetooth/btusb.c ++++ b/drivers/bluetooth/btusb.c +@@ -108,7 +108,7 @@ static struct usb_device_id btusb_table[] = { + { USB_DEVICE(0x413c, 0x8197) }, + + /* Foxconn - Hon Hai */ +- { USB_DEVICE(0x0489, 0xe033) }, ++ { USB_VENDOR_AND_INTERFACE_INFO(0x0489, 0xff, 0x01, 0x01) }, + + /*Broadcom devices with vendor specific id */ + { USB_VENDOR_AND_INTERFACE_INFO(0x0a5c, 0xff, 0x01, 0x01) }, +diff --git a/drivers/char/ttyprintk.c b/drivers/char/ttyprintk.c +index eedd547..5936691 100644 +--- a/drivers/char/ttyprintk.c ++++ b/drivers/char/ttyprintk.c +@@ -67,7 +67,7 @@ static int tpk_printk(const unsigned char *buf, int count) + tmp[tpk_curr + 1] = '\0'; + printk(KERN_INFO "%s%s", tpk_tag, tmp); + tpk_curr = 0; +- if (buf[i + 1] == '\n') ++ if ((i + 1) < count && buf[i + 1] == '\n') + i++; + break; + case '\n': +diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c +index b48967b..5991114 100644 +--- a/drivers/dma/dmaengine.c ++++ b/drivers/dma/dmaengine.c +@@ -564,8 +564,8 @@ void dmaengine_get(void) + list_del_rcu(&device->global_node); + break; + } else if (err) +- pr_err("dmaengine: failed to get %s: (%d)\n", +- dma_chan_name(chan), err); ++ pr_debug("%s: failed to get %s: (%d)\n", ++ __func__, dma_chan_name(chan), err); + } + } + +diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c +index 4799393..b97d4f0 100644 +--- a/drivers/firewire/core-cdev.c ++++ b/drivers/firewire/core-cdev.c +@@ -471,8 +471,8 @@ static int ioctl_get_info(struct client *client, union ioctl_arg *arg) + client->bus_reset_closure = a->bus_reset_closure; + if (a->bus_reset != 0) { + fill_bus_reset_event(&bus_reset, client); +- ret = copy_to_user(u64_to_uptr(a->bus_reset), +- &bus_reset, sizeof(bus_reset)); ++ /* unaligned size of bus_reset is 36 bytes */ ++ ret = copy_to_user(u64_to_uptr(a->bus_reset), &bus_reset, 36); + } + if (ret == 0 && list_empty(&client->link)) + list_add_tail(&client->link, &client->device->client_list); +diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c +index 0535c21..3e60e8d 100644 +--- a/drivers/firmware/efivars.c ++++ b/drivers/firmware/efivars.c +@@ -435,12 +435,23 @@ efivar_attr_read(struct efivar_entry *entry, char *buf) + if (status != EFI_SUCCESS) + return -EIO; + +- if (var->Attributes & 0x1) ++ if (var->Attributes & EFI_VARIABLE_NON_VOLATILE) + str += sprintf(str, "EFI_VARIABLE_NON_VOLATILE\n"); +- if (var->Attributes & 0x2) ++ if (var->Attributes & EFI_VARIABLE_BOOTSERVICE_ACCESS) + str += sprintf(str, "EFI_VARIABLE_BOOTSERVICE_ACCESS\n"); +- if (var->Attributes & 0x4) ++ if (var->Attributes & EFI_VARIABLE_RUNTIME_ACCESS) + str += sprintf(str, "EFI_VARIABLE_RUNTIME_ACCESS\n"); ++ if (var->Attributes & EFI_VARIABLE_HARDWARE_ERROR_RECORD) ++ str += sprintf(str, "EFI_VARIABLE_HARDWARE_ERROR_RECORD\n"); ++ if (var->Attributes & EFI_VARIABLE_AUTHENTICATED_WRITE_ACCESS) ++ str += sprintf(str, ++ "EFI_VARIABLE_AUTHENTICATED_WRITE_ACCESS\n"); ++ if (var->Attributes & ++ EFI_VARIABLE_TIME_BASED_AUTHENTICATED_WRITE_ACCESS) ++ str += sprintf(str, ++ "EFI_VARIABLE_TIME_BASED_AUTHENTICATED_WRITE_ACCESS\n"); ++ if (var->Attributes & EFI_VARIABLE_APPEND_WRITE) ++ str += sprintf(str, "EFI_VARIABLE_APPEND_WRITE\n"); + return str - buf; + } + +diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c +index e48e01e..33e1555 100644 +--- a/drivers/gpu/drm/i915/i915_gem.c ++++ b/drivers/gpu/drm/i915/i915_gem.c +@@ 
-1543,16 +1543,19 @@ i915_gem_object_move_to_active(struct drm_i915_gem_object *obj, + list_move_tail(&obj->ring_list, &ring->active_list); + + obj->last_rendering_seqno = seqno; +- if (obj->fenced_gpu_access) { +- struct drm_i915_fence_reg *reg; +- +- BUG_ON(obj->fence_reg == I915_FENCE_REG_NONE); + ++ if (obj->fenced_gpu_access) { + obj->last_fenced_seqno = seqno; + obj->last_fenced_ring = ring; + +- reg = &dev_priv->fence_regs[obj->fence_reg]; +- list_move_tail(®->lru_list, &dev_priv->mm.fence_list); ++ /* Bump MRU to take account of the delayed flush */ ++ if (obj->fence_reg != I915_FENCE_REG_NONE) { ++ struct drm_i915_fence_reg *reg; ++ ++ reg = &dev_priv->fence_regs[obj->fence_reg]; ++ list_move_tail(®->lru_list, ++ &dev_priv->mm.fence_list); ++ } + } + } + +@@ -1561,6 +1564,7 @@ i915_gem_object_move_off_active(struct drm_i915_gem_object *obj) + { + list_del_init(&obj->ring_list); + obj->last_rendering_seqno = 0; ++ obj->last_fenced_seqno = 0; + } + + static void +@@ -1589,6 +1593,7 @@ i915_gem_object_move_to_inactive(struct drm_i915_gem_object *obj) + BUG_ON(!list_empty(&obj->gpu_write_list)); + BUG_ON(!obj->active); + obj->ring = NULL; ++ obj->last_fenced_ring = NULL; + + i915_gem_object_move_off_active(obj); + obj->fenced_gpu_access = false; +diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c +index a6c2f7a..1202198 100644 +--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c ++++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c +@@ -574,7 +574,8 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring, + if (ret) + break; + } +- obj->pending_fenced_gpu_access = need_fence; ++ obj->pending_fenced_gpu_access = ++ !!(entry->flags & EXEC_OBJECT_NEEDS_FENCE); + } + + entry->offset = obj->gtt_offset; +diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c +index 31d334d..861223b 100644 +--- a/drivers/gpu/drm/i915/i915_gem_tiling.c ++++ b/drivers/gpu/drm/i915/i915_gem_tiling.c +@@ -107,10 +107,10 @@ i915_gem_detect_bit_6_swizzle(struct drm_device *dev) + */ + swizzle_x = I915_BIT_6_SWIZZLE_NONE; + swizzle_y = I915_BIT_6_SWIZZLE_NONE; +- } else if (IS_MOBILE(dev)) { ++ } else if (IS_MOBILE(dev) || (IS_GEN3(dev) && !IS_G33(dev))) { + uint32_t dcc; + +- /* On mobile 9xx chipsets, channel interleave by the CPU is ++ /* On 9xx chipsets, channel interleave by the CPU is + * determined by DCC. For single-channel, neither the CPU + * nor the GPU do swizzling. 
For dual channel interleaved, + * the GPU's interleave is bit 9 and 10 for X tiled, and bit +diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c +index c8b5bc1..2812d7b 100644 +--- a/drivers/gpu/drm/i915/i915_irq.c ++++ b/drivers/gpu/drm/i915/i915_irq.c +@@ -530,6 +530,12 @@ static irqreturn_t ivybridge_irq_handler(DRM_IRQ_ARGS) + if (de_iir & DE_GSE_IVB) + intel_opregion_gse_intr(dev); + ++ if (de_iir & DE_PIPEA_VBLANK_IVB) ++ drm_handle_vblank(dev, 0); ++ ++ if (de_iir & DE_PIPEB_VBLANK_IVB) ++ drm_handle_vblank(dev, 1); ++ + if (de_iir & DE_PLANEA_FLIP_DONE_IVB) { + intel_prepare_page_flip(dev, 0); + intel_finish_page_flip_plane(dev, 0); +@@ -540,12 +546,6 @@ static irqreturn_t ivybridge_irq_handler(DRM_IRQ_ARGS) + intel_finish_page_flip_plane(dev, 1); + } + +- if (de_iir & DE_PIPEA_VBLANK_IVB) +- drm_handle_vblank(dev, 0); +- +- if (de_iir & DE_PIPEB_VBLANK_IVB) +- drm_handle_vblank(dev, 1); +- + /* check event from PCH */ + if (de_iir & DE_PCH_EVENT_IVB) { + if (pch_iir & SDE_HOTPLUG_MASK_CPT) +@@ -622,6 +622,12 @@ static irqreturn_t ironlake_irq_handler(DRM_IRQ_ARGS) + if (de_iir & DE_GSE) + intel_opregion_gse_intr(dev); + ++ if (de_iir & DE_PIPEA_VBLANK) ++ drm_handle_vblank(dev, 0); ++ ++ if (de_iir & DE_PIPEB_VBLANK) ++ drm_handle_vblank(dev, 1); ++ + if (de_iir & DE_PLANEA_FLIP_DONE) { + intel_prepare_page_flip(dev, 0); + intel_finish_page_flip_plane(dev, 0); +@@ -632,12 +638,6 @@ static irqreturn_t ironlake_irq_handler(DRM_IRQ_ARGS) + intel_finish_page_flip_plane(dev, 1); + } + +- if (de_iir & DE_PIPEA_VBLANK) +- drm_handle_vblank(dev, 0); +- +- if (de_iir & DE_PIPEB_VBLANK) +- drm_handle_vblank(dev, 1); +- + /* check event from PCH */ + if (de_iir & DE_PCH_EVENT) { + if (pch_iir & hotplug_mask) +diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h +index 4a5e662..a294a32 100644 +--- a/drivers/gpu/drm/i915/i915_reg.h ++++ b/drivers/gpu/drm/i915/i915_reg.h +@@ -401,6 +401,9 @@ + # define VS_TIMER_DISPATCH (1 << 6) + # define MI_FLUSH_ENABLE (1 << 11) + ++#define GEN6_GT_MODE 0x20d0 ++#define GEN6_GT_MODE_HI (1 << 9) ++ + #define GFX_MODE 0x02520 + #define GFX_MODE_GEN7 0x0229c + #define GFX_RUN_LIST_ENABLE (1<<15) +@@ -1557,6 +1560,10 @@ + + /* Video Data Island Packet control */ + #define VIDEO_DIP_DATA 0x61178 ++/* Read the description of VIDEO_DIP_DATA (before Haswel) or VIDEO_DIP_ECC ++ * (Haswell and newer) to see which VIDEO_DIP_DATA byte corresponds to each byte ++ * of the infoframe structure specified by CEA-861. 
*/ ++#define VIDEO_DIP_DATA_SIZE 32 + #define VIDEO_DIP_CTL 0x61170 + #define VIDEO_DIP_ENABLE (1 << 31) + #define VIDEO_DIP_PORT_B (1 << 29) +diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c +index 6c3fb44..adac0dd 100644 +--- a/drivers/gpu/drm/i915/intel_display.c ++++ b/drivers/gpu/drm/i915/intel_display.c +@@ -2850,13 +2850,34 @@ static void intel_clear_scanline_wait(struct drm_device *dev) + I915_WRITE_CTL(ring, tmp); + } + ++static bool intel_crtc_has_pending_flip(struct drm_crtc *crtc) ++{ ++ struct drm_device *dev = crtc->dev; ++ struct drm_i915_private *dev_priv = dev->dev_private; ++ unsigned long flags; ++ bool pending; ++ ++ if (atomic_read(&dev_priv->mm.wedged)) ++ return false; ++ ++ spin_lock_irqsave(&dev->event_lock, flags); ++ pending = to_intel_crtc(crtc)->unpin_work != NULL; ++ spin_unlock_irqrestore(&dev->event_lock, flags); ++ ++ return pending; ++} ++ + static void intel_crtc_wait_for_pending_flips(struct drm_crtc *crtc) + { + struct drm_device *dev = crtc->dev; ++ struct drm_i915_private *dev_priv = dev->dev_private; + + if (crtc->fb == NULL) + return; + ++ wait_event(dev_priv->pending_flip_queue, ++ !intel_crtc_has_pending_flip(crtc)); ++ + mutex_lock(&dev->struct_mutex); + intel_finish_fb(crtc->fb); + mutex_unlock(&dev->struct_mutex); +@@ -5027,7 +5048,7 @@ static int i9xx_crtc_mode_set(struct drm_crtc *crtc, + /* default to 8bpc */ + pipeconf &= ~(PIPECONF_BPP_MASK | PIPECONF_DITHER_EN); + if (is_dp) { +- if (mode->private_flags & INTEL_MODE_DP_FORCE_6BPC) { ++ if (adjusted_mode->private_flags & INTEL_MODE_DP_FORCE_6BPC) { + pipeconf |= PIPECONF_BPP_6 | + PIPECONF_DITHER_EN | + PIPECONF_DITHER_TYPE_SP; +@@ -5495,7 +5516,7 @@ static int ironlake_crtc_mode_set(struct drm_crtc *crtc, + /* determine panel color depth */ + temp = I915_READ(PIPECONF(pipe)); + temp &= ~PIPE_BPC_MASK; +- dither = intel_choose_pipe_bpp_dither(crtc, &pipe_bpp, mode); ++ dither = intel_choose_pipe_bpp_dither(crtc, &pipe_bpp, adjusted_mode); + switch (pipe_bpp) { + case 18: + temp |= PIPE_6BPC; +@@ -6952,9 +6973,8 @@ static void do_intel_finish_page_flip(struct drm_device *dev, + + atomic_clear_mask(1 << intel_crtc->plane, + &obj->pending_flip.counter); +- if (atomic_read(&obj->pending_flip) == 0) +- wake_up(&dev_priv->pending_flip_queue); + ++ wake_up(&dev_priv->pending_flip_queue); + schedule_work(&work->work); + + trace_i915_flip_complete(intel_crtc->plane, work->pending_flip_obj); +@@ -7193,7 +7213,7 @@ static int intel_gen7_queue_flip(struct drm_device *dev, + default: + WARN_ONCE(1, "unknown plane in flip command\n"); + ret = -ENODEV; +- goto err; ++ goto err_unpin; + } + + ret = intel_ring_begin(ring, 4); +@@ -8278,6 +8298,11 @@ static void gen6_init_clock_gating(struct drm_device *dev) + DISPPLANE_TRICKLE_FEED_DISABLE); + intel_flush_display_plane(dev_priv, pipe); + } ++ ++ /* The default value should be 0x200 according to docs, but the two ++ * platforms I checked have a 0 for this. (Maybe BIOS overrides?) 
*/ ++ I915_WRITE(GEN6_GT_MODE, 0xffff << 16); ++ I915_WRITE(GEN6_GT_MODE, GEN6_GT_MODE_HI << 16 | GEN6_GT_MODE_HI); + } + + static void gen7_setup_fixed_func_scheduler(struct drm_i915_private *dev_priv) +diff --git a/drivers/gpu/drm/i915/intel_hdmi.c b/drivers/gpu/drm/i915/intel_hdmi.c +index c2a64f4..497da2a 100644 +--- a/drivers/gpu/drm/i915/intel_hdmi.c ++++ b/drivers/gpu/drm/i915/intel_hdmi.c +@@ -138,14 +138,20 @@ static void i9xx_write_infoframe(struct drm_encoder *encoder, + + I915_WRITE(VIDEO_DIP_CTL, VIDEO_DIP_ENABLE | val | port | flags); + ++ mmiowb(); + for (i = 0; i < len; i += 4) { + I915_WRITE(VIDEO_DIP_DATA, *data); + data++; + } ++ /* Write every possible data byte to force correct ECC calculation. */ ++ for (; i < VIDEO_DIP_DATA_SIZE; i += 4) ++ I915_WRITE(VIDEO_DIP_DATA, 0); ++ mmiowb(); + + flags |= intel_infoframe_flags(frame); + + I915_WRITE(VIDEO_DIP_CTL, VIDEO_DIP_ENABLE | val | port | flags); ++ POSTING_READ(VIDEO_DIP_CTL); + } + + static void ironlake_write_infoframe(struct drm_encoder *encoder, +@@ -168,14 +174,20 @@ static void ironlake_write_infoframe(struct drm_encoder *encoder, + + I915_WRITE(reg, VIDEO_DIP_ENABLE | val | flags); + ++ mmiowb(); + for (i = 0; i < len; i += 4) { + I915_WRITE(TVIDEO_DIP_DATA(intel_crtc->pipe), *data); + data++; + } ++ /* Write every possible data byte to force correct ECC calculation. */ ++ for (; i < VIDEO_DIP_DATA_SIZE; i += 4) ++ I915_WRITE(TVIDEO_DIP_DATA(intel_crtc->pipe), 0); ++ mmiowb(); + + flags |= intel_infoframe_flags(frame); + + I915_WRITE(reg, VIDEO_DIP_ENABLE | val | flags); ++ POSTING_READ(reg); + } + static void intel_set_infoframe(struct drm_encoder *encoder, + struct dip_infoframe *frame) +@@ -546,10 +558,13 @@ void intel_hdmi_init(struct drm_device *dev, int sdvox_reg) + if (!HAS_PCH_SPLIT(dev)) { + intel_hdmi->write_infoframe = i9xx_write_infoframe; + I915_WRITE(VIDEO_DIP_CTL, 0); ++ POSTING_READ(VIDEO_DIP_CTL); + } else { + intel_hdmi->write_infoframe = ironlake_write_infoframe; +- for_each_pipe(i) ++ for_each_pipe(i) { + I915_WRITE(TVIDEO_DIP_CTL(i), 0); ++ POSTING_READ(TVIDEO_DIP_CTL(i)); ++ } + } + + drm_encoder_helper_add(&intel_encoder->base, &intel_hdmi_helper_funcs); +diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c +index fc0633c..b61f490 100644 +--- a/drivers/gpu/drm/radeon/evergreen.c ++++ b/drivers/gpu/drm/radeon/evergreen.c +@@ -37,6 +37,16 @@ + #define EVERGREEN_PFP_UCODE_SIZE 1120 + #define EVERGREEN_PM4_UCODE_SIZE 1376 + ++static const u32 crtc_offsets[6] = ++{ ++ EVERGREEN_CRTC0_REGISTER_OFFSET, ++ EVERGREEN_CRTC1_REGISTER_OFFSET, ++ EVERGREEN_CRTC2_REGISTER_OFFSET, ++ EVERGREEN_CRTC3_REGISTER_OFFSET, ++ EVERGREEN_CRTC4_REGISTER_OFFSET, ++ EVERGREEN_CRTC5_REGISTER_OFFSET ++}; ++ + static void evergreen_gpu_init(struct radeon_device *rdev); + void evergreen_fini(struct radeon_device *rdev); + void evergreen_pcie_gen2_enable(struct radeon_device *rdev); +@@ -66,6 +76,27 @@ void evergreen_fix_pci_max_read_req_size(struct radeon_device *rdev) + } + } + ++void dce4_wait_for_vblank(struct radeon_device *rdev, int crtc) ++{ ++ int i; ++ ++ if (crtc >= rdev->num_crtc) ++ return; ++ ++ if (RREG32(EVERGREEN_CRTC_CONTROL + crtc_offsets[crtc]) & EVERGREEN_CRTC_MASTER_EN) { ++ for (i = 0; i < rdev->usec_timeout; i++) { ++ if (!(RREG32(EVERGREEN_CRTC_STATUS + crtc_offsets[crtc]) & EVERGREEN_CRTC_V_BLANK)) ++ break; ++ udelay(1); ++ } ++ for (i = 0; i < rdev->usec_timeout; i++) { ++ if (RREG32(EVERGREEN_CRTC_STATUS + crtc_offsets[crtc]) & EVERGREEN_CRTC_V_BLANK) ++ break; 
++ udelay(1); ++ } ++ } ++} ++ + void evergreen_pre_page_flip(struct radeon_device *rdev, int crtc) + { + /* enable the pflip int */ +@@ -1065,116 +1096,88 @@ void evergreen_agp_enable(struct radeon_device *rdev) + + void evergreen_mc_stop(struct radeon_device *rdev, struct evergreen_mc_save *save) + { ++ u32 crtc_enabled, tmp, frame_count, blackout; ++ int i, j; ++ + save->vga_render_control = RREG32(VGA_RENDER_CONTROL); + save->vga_hdp_control = RREG32(VGA_HDP_CONTROL); + +- /* Stop all video */ ++ /* disable VGA render */ + WREG32(VGA_RENDER_CONTROL, 0); +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC0_REGISTER_OFFSET, 1); +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC1_REGISTER_OFFSET, 1); +- if (rdev->num_crtc >= 4) { +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC2_REGISTER_OFFSET, 1); +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC3_REGISTER_OFFSET, 1); +- } +- if (rdev->num_crtc >= 6) { +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC4_REGISTER_OFFSET, 1); +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC5_REGISTER_OFFSET, 1); +- } +- WREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC0_REGISTER_OFFSET, 0); +- WREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC1_REGISTER_OFFSET, 0); +- if (rdev->num_crtc >= 4) { +- WREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC2_REGISTER_OFFSET, 0); +- WREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC3_REGISTER_OFFSET, 0); +- } +- if (rdev->num_crtc >= 6) { +- WREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC4_REGISTER_OFFSET, 0); +- WREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC5_REGISTER_OFFSET, 0); +- } +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC0_REGISTER_OFFSET, 0); +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC1_REGISTER_OFFSET, 0); +- if (rdev->num_crtc >= 4) { +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC2_REGISTER_OFFSET, 0); +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC3_REGISTER_OFFSET, 0); +- } +- if (rdev->num_crtc >= 6) { +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC4_REGISTER_OFFSET, 0); +- WREG32(EVERGREEN_CRTC_UPDATE_LOCK + EVERGREEN_CRTC5_REGISTER_OFFSET, 0); ++ /* blank the display controllers */ ++ for (i = 0; i < rdev->num_crtc; i++) { ++ crtc_enabled = RREG32(EVERGREEN_CRTC_CONTROL + crtc_offsets[i]) & EVERGREEN_CRTC_MASTER_EN; ++ if (crtc_enabled) { ++ save->crtc_enabled[i] = true; ++ tmp = RREG32(EVERGREEN_CRTC_CONTROL + crtc_offsets[i]); ++ if (!(tmp & EVERGREEN_CRTC_DISP_READ_REQUEST_DISABLE)) { ++ dce4_wait_for_vblank(rdev, i); ++ tmp |= EVERGREEN_CRTC_DISP_READ_REQUEST_DISABLE; ++ WREG32(EVERGREEN_CRTC_CONTROL + crtc_offsets[i], tmp); ++ } ++ /* wait for the next frame */ ++ frame_count = radeon_get_vblank_counter(rdev, i); ++ for (j = 0; j < rdev->usec_timeout; j++) { ++ if (radeon_get_vblank_counter(rdev, i) != frame_count) ++ break; ++ udelay(1); ++ } ++ } + } + +- WREG32(D1VGA_CONTROL, 0); +- WREG32(D2VGA_CONTROL, 0); +- if (rdev->num_crtc >= 4) { +- WREG32(EVERGREEN_D3VGA_CONTROL, 0); +- WREG32(EVERGREEN_D4VGA_CONTROL, 0); +- } +- if (rdev->num_crtc >= 6) { +- WREG32(EVERGREEN_D5VGA_CONTROL, 0); +- WREG32(EVERGREEN_D6VGA_CONTROL, 0); ++ evergreen_mc_wait_for_idle(rdev); ++ ++ blackout = RREG32(MC_SHARED_BLACKOUT_CNTL); ++ if ((blackout & BLACKOUT_MODE_MASK) != 1) { ++ /* Block CPU access */ ++ WREG32(BIF_FB_EN, 0); ++ /* blackout the MC */ ++ blackout &= ~BLACKOUT_MODE_MASK; ++ WREG32(MC_SHARED_BLACKOUT_CNTL, blackout | 1); + } + } + + void evergreen_mc_resume(struct radeon_device *rdev, struct evergreen_mc_save *save) + { +- 
WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS_HIGH + EVERGREEN_CRTC0_REGISTER_OFFSET, +- upper_32_bits(rdev->mc.vram_start)); +- WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS_HIGH + EVERGREEN_CRTC0_REGISTER_OFFSET, +- upper_32_bits(rdev->mc.vram_start)); +- WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS + EVERGREEN_CRTC0_REGISTER_OFFSET, +- (u32)rdev->mc.vram_start); +- WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS + EVERGREEN_CRTC0_REGISTER_OFFSET, +- (u32)rdev->mc.vram_start); +- +- WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS_HIGH + EVERGREEN_CRTC1_REGISTER_OFFSET, +- upper_32_bits(rdev->mc.vram_start)); +- WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS_HIGH + EVERGREEN_CRTC1_REGISTER_OFFSET, +- upper_32_bits(rdev->mc.vram_start)); +- WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS + EVERGREEN_CRTC1_REGISTER_OFFSET, +- (u32)rdev->mc.vram_start); +- WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS + EVERGREEN_CRTC1_REGISTER_OFFSET, +- (u32)rdev->mc.vram_start); +- +- if (rdev->num_crtc >= 4) { +- WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS_HIGH + EVERGREEN_CRTC2_REGISTER_OFFSET, +- upper_32_bits(rdev->mc.vram_start)); +- WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS_HIGH + EVERGREEN_CRTC2_REGISTER_OFFSET, +- upper_32_bits(rdev->mc.vram_start)); +- WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS + EVERGREEN_CRTC2_REGISTER_OFFSET, +- (u32)rdev->mc.vram_start); +- WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS + EVERGREEN_CRTC2_REGISTER_OFFSET, +- (u32)rdev->mc.vram_start); +- +- WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS_HIGH + EVERGREEN_CRTC3_REGISTER_OFFSET, +- upper_32_bits(rdev->mc.vram_start)); +- WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS_HIGH + EVERGREEN_CRTC3_REGISTER_OFFSET, +- upper_32_bits(rdev->mc.vram_start)); +- WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS + EVERGREEN_CRTC3_REGISTER_OFFSET, +- (u32)rdev->mc.vram_start); +- WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS + EVERGREEN_CRTC3_REGISTER_OFFSET, +- (u32)rdev->mc.vram_start); +- } +- if (rdev->num_crtc >= 6) { +- WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS_HIGH + EVERGREEN_CRTC4_REGISTER_OFFSET, +- upper_32_bits(rdev->mc.vram_start)); +- WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS_HIGH + EVERGREEN_CRTC4_REGISTER_OFFSET, +- upper_32_bits(rdev->mc.vram_start)); +- WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS + EVERGREEN_CRTC4_REGISTER_OFFSET, +- (u32)rdev->mc.vram_start); +- WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS + EVERGREEN_CRTC4_REGISTER_OFFSET, +- (u32)rdev->mc.vram_start); ++ u32 tmp, frame_count; ++ int i, j; + +- WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS_HIGH + EVERGREEN_CRTC5_REGISTER_OFFSET, ++ /* update crtc base addresses */ ++ for (i = 0; i < rdev->num_crtc; i++) { ++ WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS_HIGH + crtc_offsets[i], + upper_32_bits(rdev->mc.vram_start)); +- WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS_HIGH + EVERGREEN_CRTC5_REGISTER_OFFSET, ++ WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS_HIGH + crtc_offsets[i], + upper_32_bits(rdev->mc.vram_start)); +- WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS + EVERGREEN_CRTC5_REGISTER_OFFSET, ++ WREG32(EVERGREEN_GRPH_PRIMARY_SURFACE_ADDRESS + crtc_offsets[i], + (u32)rdev->mc.vram_start); +- WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS + EVERGREEN_CRTC5_REGISTER_OFFSET, ++ WREG32(EVERGREEN_GRPH_SECONDARY_SURFACE_ADDRESS + crtc_offsets[i], + (u32)rdev->mc.vram_start); + } +- + WREG32(EVERGREEN_VGA_MEMORY_BASE_ADDRESS_HIGH, upper_32_bits(rdev->mc.vram_start)); + WREG32(EVERGREEN_VGA_MEMORY_BASE_ADDRESS, 
(u32)rdev->mc.vram_start); +- /* Unlock host access */ ++ ++ /* unblackout the MC */ ++ tmp = RREG32(MC_SHARED_BLACKOUT_CNTL); ++ tmp &= ~BLACKOUT_MODE_MASK; ++ WREG32(MC_SHARED_BLACKOUT_CNTL, tmp); ++ /* allow CPU access */ ++ WREG32(BIF_FB_EN, FB_READ_EN | FB_WRITE_EN); ++ ++ for (i = 0; i < rdev->num_crtc; i++) { ++ if (save->crtc_enabled) { ++ tmp = RREG32(EVERGREEN_CRTC_CONTROL + crtc_offsets[i]); ++ tmp &= ~EVERGREEN_CRTC_DISP_READ_REQUEST_DISABLE; ++ WREG32(EVERGREEN_CRTC_CONTROL + crtc_offsets[i], tmp); ++ /* wait for the next frame */ ++ frame_count = radeon_get_vblank_counter(rdev, i); ++ for (j = 0; j < rdev->usec_timeout; j++) { ++ if (radeon_get_vblank_counter(rdev, i) != frame_count) ++ break; ++ udelay(1); ++ } ++ } ++ } ++ /* Unlock vga access */ + WREG32(VGA_HDP_CONTROL, save->vga_hdp_control); + mdelay(1); + WREG32(VGA_RENDER_CONTROL, save->vga_render_control); +diff --git a/drivers/gpu/drm/radeon/evergreen_reg.h b/drivers/gpu/drm/radeon/evergreen_reg.h +index 7d7f215..e022776 100644 +--- a/drivers/gpu/drm/radeon/evergreen_reg.h ++++ b/drivers/gpu/drm/radeon/evergreen_reg.h +@@ -210,7 +210,10 @@ + #define EVERGREEN_CRTC_CONTROL 0x6e70 + # define EVERGREEN_CRTC_MASTER_EN (1 << 0) + # define EVERGREEN_CRTC_DISP_READ_REQUEST_DISABLE (1 << 24) ++#define EVERGREEN_CRTC_BLANK_CONTROL 0x6e74 ++# define EVERGREEN_CRTC_BLANK_DATA_EN (1 << 8) + #define EVERGREEN_CRTC_STATUS 0x6e8c ++# define EVERGREEN_CRTC_V_BLANK (1 << 0) + #define EVERGREEN_CRTC_STATUS_POSITION 0x6e90 + #define EVERGREEN_MASTER_UPDATE_MODE 0x6ef8 + #define EVERGREEN_CRTC_UPDATE_LOCK 0x6ed4 +diff --git a/drivers/gpu/drm/radeon/evergreend.h b/drivers/gpu/drm/radeon/evergreend.h +index 6ecd23f..fe44a95 100644 +--- a/drivers/gpu/drm/radeon/evergreend.h ++++ b/drivers/gpu/drm/radeon/evergreend.h +@@ -77,6 +77,10 @@ + + #define CONFIG_MEMSIZE 0x5428 + ++#define BIF_FB_EN 0x5490 ++#define FB_READ_EN (1 << 0) ++#define FB_WRITE_EN (1 << 1) ++ + #define CP_ME_CNTL 0x86D8 + #define CP_ME_HALT (1 << 28) + #define CP_PFP_HALT (1 << 26) +@@ -194,6 +198,9 @@ + #define NOOFCHAN_MASK 0x00003000 + #define MC_SHARED_CHREMAP 0x2008 + ++#define MC_SHARED_BLACKOUT_CNTL 0x20ac ++#define BLACKOUT_MODE_MASK 0x00000007 ++ + #define MC_ARB_RAMCFG 0x2760 + #define NOOFBANK_SHIFT 0 + #define NOOFBANK_MASK 0x00000003 +diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h +index 5ce9402..5aa6670 100644 +--- a/drivers/gpu/drm/radeon/radeon_asic.h ++++ b/drivers/gpu/drm/radeon/radeon_asic.h +@@ -386,6 +386,7 @@ void r700_cp_fini(struct radeon_device *rdev); + struct evergreen_mc_save { + u32 vga_render_control; + u32 vga_hdp_control; ++ bool crtc_enabled[RADEON_MAX_CRTCS]; + }; + + void evergreen_pcie_gart_tlb_flush(struct radeon_device *rdev); +diff --git a/drivers/gpu/drm/radeon/radeon_irq_kms.c b/drivers/gpu/drm/radeon/radeon_irq_kms.c +index baa019e..4f9496e 100644 +--- a/drivers/gpu/drm/radeon/radeon_irq_kms.c ++++ b/drivers/gpu/drm/radeon/radeon_irq_kms.c +@@ -143,6 +143,16 @@ static bool radeon_msi_ok(struct radeon_device *rdev) + (rdev->pdev->subsystem_device == 0x01fd)) + return true; + ++ /* Gateway RS690 only seems to work with MSIs. */ ++ if ((rdev->pdev->device == 0x791f) && ++ (rdev->pdev->subsystem_vendor == 0x107b) && ++ (rdev->pdev->subsystem_device == 0x0185)) ++ return true; ++ ++ /* try and enable MSIs by default on all RS690s */ ++ if (rdev->family == CHIP_RS690) ++ return true; ++ + /* RV515 seems to have MSI issues where it loses + * MSI rearms occasionally. 
This leads to lockups and freezes. + * disable it by default. +diff --git a/drivers/gpu/drm/radeon/radeon_pm.c b/drivers/gpu/drm/radeon/radeon_pm.c +index 78a665b..ebd6c51 100644 +--- a/drivers/gpu/drm/radeon/radeon_pm.c ++++ b/drivers/gpu/drm/radeon/radeon_pm.c +@@ -553,7 +553,9 @@ void radeon_pm_suspend(struct radeon_device *rdev) + void radeon_pm_resume(struct radeon_device *rdev) + { + /* set up the default clocks if the MC ucode is loaded */ +- if (ASIC_IS_DCE5(rdev) && rdev->mc_fw) { ++ if ((rdev->family >= CHIP_BARTS) && ++ (rdev->family <= CHIP_CAYMAN) && ++ rdev->mc_fw) { + if (rdev->pm.default_vddc) + radeon_atom_set_voltage(rdev, rdev->pm.default_vddc, + SET_VOLTAGE_TYPE_ASIC_VDDC); +@@ -608,7 +610,9 @@ int radeon_pm_init(struct radeon_device *rdev) + radeon_pm_print_states(rdev); + radeon_pm_init_profile(rdev); + /* set up the default clocks if the MC ucode is loaded */ +- if (ASIC_IS_DCE5(rdev) && rdev->mc_fw) { ++ if ((rdev->family >= CHIP_BARTS) && ++ (rdev->family <= CHIP_CAYMAN) && ++ rdev->mc_fw) { + if (rdev->pm.default_vddc) + radeon_atom_set_voltage(rdev, rdev->pm.default_vddc, + SET_VOLTAGE_TYPE_ASIC_VDDC); +diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c +index fe2fdbb..1740b82 100644 +--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c +@@ -148,7 +148,7 @@ static int ipoib_stop(struct net_device *dev) + + netif_stop_queue(dev); + +- ipoib_ib_dev_down(dev, 0); ++ ipoib_ib_dev_down(dev, 1); + ipoib_ib_dev_stop(dev, 0); + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { +diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +index e5069b4..80799c0 100644 +--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +@@ -190,7 +190,9 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, + + mcast->mcmember = *mcmember; + +- /* Set the cached Q_Key before we attach if it's the broadcast group */ ++ /* Set the multicast MTU and cached Q_Key before we attach if it's ++ * the broadcast group. 
++ */ + if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + sizeof (union ib_gid))) { + spin_lock_irq(&priv->lock); +@@ -198,10 +200,17 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, + spin_unlock_irq(&priv->lock); + return -EAGAIN; + } ++ priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); + spin_unlock_irq(&priv->lock); + priv->tx_wr.wr.ud.remote_qkey = priv->qkey; + set_qkey = 1; ++ ++ if (!ipoib_cm_admin_enabled(dev)) { ++ rtnl_lock(); ++ dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); ++ rtnl_unlock(); ++ } + } + + if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { +@@ -590,14 +599,6 @@ void ipoib_mcast_join_task(struct work_struct *work) + return; + } + +- priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); +- +- if (!ipoib_cm_admin_enabled(dev)) { +- rtnl_lock(); +- dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); +- rtnl_unlock(); +- } +- + ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); + + clear_bit(IPOIB_MCAST_RUN, &priv->flags); +diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c +index c76b051..4ec049d 100644 +--- a/drivers/infiniband/ulp/srp/ib_srp.c ++++ b/drivers/infiniband/ulp/srp/ib_srp.c +@@ -620,9 +620,9 @@ static void srp_reset_req(struct srp_target_port *target, struct srp_request *re + struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL); + + if (scmnd) { ++ srp_free_req(target, req, scmnd, 0); + scmnd->result = DID_RESET << 16; + scmnd->scsi_done(scmnd); +- srp_free_req(target, req, scmnd, 0); + } + } + +@@ -1669,6 +1669,7 @@ static int srp_abort(struct scsi_cmnd *scmnd) + SRP_TSK_ABORT_TASK); + srp_free_req(target, req, scmnd, 0); + scmnd->result = DID_ABORT << 16; ++ scmnd->scsi_done(scmnd); + + return SUCCESS; + } +diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c +index 96532bc..7be5fd9 100644 +--- a/drivers/input/mouse/synaptics.c ++++ b/drivers/input/mouse/synaptics.c +@@ -53,14 +53,19 @@ + #define ABS_POS_BITS 13 + + /* +- * Any position values from the hardware above the following limits are +- * treated as "wrapped around negative" values that have been truncated to +- * the 13-bit reporting range of the hardware. These are just reasonable +- * guesses and can be adjusted if hardware is found that operates outside +- * of these parameters. ++ * These values should represent the absolute maximum value that will ++ * be reported for a positive position value. Some Synaptics firmware ++ * uses this value to indicate a finger near the edge of the touchpad ++ * whose precise position cannot be determined. ++ * ++ * At least one touchpad is known to report positions in excess of this ++ * value which are actually negative values truncated to the 13-bit ++ * reporting range. These values have never been observed to be lower ++ * than 8184 (i.e. -8), so we treat all values greater than 8176 as ++ * negative and any other value as positive. + */ +-#define X_MAX_POSITIVE (((1 << ABS_POS_BITS) + XMAX) / 2) +-#define Y_MAX_POSITIVE (((1 << ABS_POS_BITS) + YMAX) / 2) ++#define X_MAX_POSITIVE 8176 ++#define Y_MAX_POSITIVE 8176 + + /* + * Synaptics touchpads report the y coordinate from bottom to top, which is +@@ -561,11 +566,21 @@ static int synaptics_parse_hw_state(const unsigned char buf[], + hw->right = (buf[0] & 0x02) ? 
1 : 0; + } + +- /* Convert wrap-around values to negative */ ++ /* ++ * Convert wrap-around values to negative. (X|Y)_MAX_POSITIVE ++ * is used by some firmware to indicate a finger at the edge of ++ * the touchpad whose precise position cannot be determined, so ++ * convert these values to the maximum axis value. ++ */ + if (hw->x > X_MAX_POSITIVE) + hw->x -= 1 << ABS_POS_BITS; ++ else if (hw->x == X_MAX_POSITIVE) ++ hw->x = XMAX; ++ + if (hw->y > Y_MAX_POSITIVE) + hw->y -= 1 << ABS_POS_BITS; ++ else if (hw->y == Y_MAX_POSITIVE) ++ hw->y = YMAX; + + return 0; + } +diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c +index ccf347f..b9062c0 100644 +--- a/drivers/iommu/intel-iommu.c ++++ b/drivers/iommu/intel-iommu.c +@@ -563,7 +563,9 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain) + { + int i; + +- domain->iommu_coherency = 1; ++ i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus); ++ ++ domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0; + + for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) { + if (!ecap_coherent(g_iommus[i]->ecap)) { +diff --git a/drivers/media/rc/ite-cir.c b/drivers/media/rc/ite-cir.c +index 0e49c99..c06992e 100644 +--- a/drivers/media/rc/ite-cir.c ++++ b/drivers/media/rc/ite-cir.c +@@ -1473,6 +1473,7 @@ static int ite_probe(struct pnp_dev *pdev, const struct pnp_device_id + rdev = rc_allocate_device(); + if (!rdev) + goto failure; ++ itdev->rdev = rdev; + + ret = -ENODEV; + +@@ -1604,7 +1605,6 @@ static int ite_probe(struct pnp_dev *pdev, const struct pnp_device_id + if (ret) + goto failure; + +- itdev->rdev = rdev; + ite_pr(KERN_NOTICE, "driver has been successfully loaded\n"); + + return 0; +diff --git a/drivers/media/video/gspca/pac7302.c b/drivers/media/video/gspca/pac7302.c +index 1c44f78..6ddc769 100644 +--- a/drivers/media/video/gspca/pac7302.c ++++ b/drivers/media/video/gspca/pac7302.c +@@ -1197,6 +1197,8 @@ static const struct usb_device_id device_table[] = { + {USB_DEVICE(0x093a, 0x2629), .driver_info = FL_VFLIP}, + {USB_DEVICE(0x093a, 0x262a)}, + {USB_DEVICE(0x093a, 0x262c)}, ++ {USB_DEVICE(0x145f, 0x013c)}, ++ {USB_DEVICE(0x1ae7, 0x2001)}, /* SpeedLink Snappy Mic SL-6825-SBK */ + {} + }; + MODULE_DEVICE_TABLE(usb, device_table); +diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c +index d5fe43d..bc27065 100644 +--- a/drivers/mmc/host/omap_hsmmc.c ++++ b/drivers/mmc/host/omap_hsmmc.c +@@ -2188,9 +2188,7 @@ static int omap_hsmmc_suspend(struct device *dev) + } else { + host->suspended = 0; + if (host->pdata->resume) { +- ret = host->pdata->resume(&pdev->dev, +- host->slot_id); +- if (ret) ++ if (host->pdata->resume(&pdev->dev, host->slot_id)) + dev_dbg(mmc_dev(host->mmc), + "Unmask interrupt failed\n"); + } +diff --git a/drivers/mmc/host/sdhci-s3c.c b/drivers/mmc/host/sdhci-s3c.c +index 0d33ff0..06af9e4 100644 +--- a/drivers/mmc/host/sdhci-s3c.c ++++ b/drivers/mmc/host/sdhci-s3c.c +@@ -601,7 +601,7 @@ static int __devexit sdhci_s3c_remove(struct platform_device *pdev) + + sdhci_remove_host(host, 1); + +- for (ptr = 0; ptr < 3; ptr++) { ++ for (ptr = 0; ptr < MAX_BUS_CLK; ptr++) { + if (sc->clk_bus[ptr]) { + clk_disable(sc->clk_bus[ptr]); + clk_put(sc->clk_bus[ptr]); +diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c +index d5505f3..559d30d 100644 +--- a/drivers/mmc/host/sh_mmcif.c ++++ b/drivers/mmc/host/sh_mmcif.c +@@ -1003,6 +1003,10 @@ static irqreturn_t sh_mmcif_intr(int irq, void *dev_id) + host->sd_error = true; + dev_dbg(&host->pd->dev, "int err state 
= %08x\n", state); + } ++ if (host->state == STATE_IDLE) { ++ dev_info(&host->pd->dev, "Spurious IRQ status 0x%x", state); ++ return IRQ_HANDLED; ++ } + if (state & ~(INT_CMD12RBE | INT_CMD12CRE)) + complete(&host->intr_wait); + else +diff --git a/drivers/mtd/maps/autcpu12-nvram.c b/drivers/mtd/maps/autcpu12-nvram.c +index e5bfd0e..0598d52 100644 +--- a/drivers/mtd/maps/autcpu12-nvram.c ++++ b/drivers/mtd/maps/autcpu12-nvram.c +@@ -43,7 +43,8 @@ struct map_info autcpu12_sram_map = { + + static int __init init_autcpu12_sram (void) + { +- int err, save0, save1; ++ map_word tmp, save0, save1; ++ int err; + + autcpu12_sram_map.virt = ioremap(0x12000000, SZ_128K); + if (!autcpu12_sram_map.virt) { +@@ -51,7 +52,7 @@ static int __init init_autcpu12_sram (void) + err = -EIO; + goto out; + } +- simple_map_init(&autcpu_sram_map); ++ simple_map_init(&autcpu12_sram_map); + + /* + * Check for 32K/128K +@@ -61,20 +62,22 @@ static int __init init_autcpu12_sram (void) + * Read and check result on ofs 0x0 + * Restore contents + */ +- save0 = map_read32(&autcpu12_sram_map,0); +- save1 = map_read32(&autcpu12_sram_map,0x10000); +- map_write32(&autcpu12_sram_map,~save0,0x10000); ++ save0 = map_read(&autcpu12_sram_map, 0); ++ save1 = map_read(&autcpu12_sram_map, 0x10000); ++ tmp.x[0] = ~save0.x[0]; ++ map_write(&autcpu12_sram_map, tmp, 0x10000); + /* if we find this pattern on 0x0, we have 32K size + * restore contents and exit + */ +- if ( map_read32(&autcpu12_sram_map,0) != save0) { +- map_write32(&autcpu12_sram_map,save0,0x0); ++ tmp = map_read(&autcpu12_sram_map, 0); ++ if (!map_word_equal(&autcpu12_sram_map, tmp, save0)) { ++ map_write(&autcpu12_sram_map, save0, 0x0); + goto map; + } + /* We have a 128K found, restore 0x10000 and set size + * to 128K + */ +- map_write32(&autcpu12_sram_map,save1,0x10000); ++ map_write(&autcpu12_sram_map, save1, 0x10000); + autcpu12_sram_map.size = SZ_128K; + + map: +diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c +index a0bd2de..198da0a 100644 +--- a/drivers/mtd/mtdpart.c ++++ b/drivers/mtd/mtdpart.c +@@ -748,6 +748,8 @@ static const char *default_mtd_part_types[] = { + * partition parsers, specified in @types. However, if @types is %NULL, then + * the default list of parsers is used. The default list contains only the + * "cmdlinepart" and "ofpart" parsers ATM. ++ * Note: If there are more then one parser in @types, the kernel only takes the ++ * partitions parsed out by the first parser. 
+ * + * This function may return: + * o a negative error code in case of failure +@@ -772,11 +774,12 @@ int parse_mtd_partitions(struct mtd_info *master, const char **types, + if (!parser) + continue; + ret = (*parser->parse_fn)(master, pparts, data); ++ put_partition_parser(parser); + if (ret > 0) { + printk(KERN_NOTICE "%d %s partitions found on MTD device %s\n", + ret, parser->name, master->name); ++ break; + } +- put_partition_parser(parser); + } + return ret; + } +diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c +index f024375..532da04 100644 +--- a/drivers/mtd/nand/nand_bbt.c ++++ b/drivers/mtd/nand/nand_bbt.c +@@ -390,7 +390,7 @@ static int read_abs_bbts(struct mtd_info *mtd, uint8_t *buf, + /* Read the mirror version, if available */ + if (md && (md->options & NAND_BBT_VERSION)) { + scan_read_raw(mtd, buf, (loff_t)md->pages[0] << this->page_shift, +- mtd->writesize, td); ++ mtd->writesize, md); + md->version[0] = buf[bbt_get_ver_offs(mtd, md)]; + pr_info("Bad block table at page %d, version 0x%02X\n", + md->pages[0], md->version[0]); +diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c +index 83e8e1b..ade0da0 100644 +--- a/drivers/mtd/nand/nandsim.c ++++ b/drivers/mtd/nand/nandsim.c +@@ -2355,6 +2355,7 @@ static int __init ns_init_module(void) + uint64_t new_size = (uint64_t)nsmtd->erasesize << overridesize; + if (new_size >> overridesize != nsmtd->erasesize) { + NS_ERR("overridesize is too big\n"); ++ retval = -EINVAL; + goto err_exit; + } + /* N.B. This relies on nand_scan not doing anything with the size before we change it */ +diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c +index f745f00..297c965 100644 +--- a/drivers/mtd/nand/omap2.c ++++ b/drivers/mtd/nand/omap2.c +@@ -1132,7 +1132,8 @@ static int omap_nand_remove(struct platform_device *pdev) + /* Release NAND device, its internal structures and partitions */ + nand_release(&info->mtd); + iounmap(info->nand.IO_ADDR_R); +- kfree(&info->mtd); ++ release_mem_region(info->phys_base, NAND_IO_SIZE); ++ kfree(info); + return 0; + } + +diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c +index 6c3fb5a..1f9c363 100644 +--- a/drivers/mtd/ubi/build.c ++++ b/drivers/mtd/ubi/build.c +@@ -816,6 +816,11 @@ static int autoresize(struct ubi_device *ubi, int vol_id) + struct ubi_volume *vol = ubi->volumes[vol_id]; + int err, old_reserved_pebs = vol->reserved_pebs; + ++ if (ubi->ro_mode) { ++ ubi_warn("skip auto-resize because of R/O mode"); ++ return 0; ++ } ++ + /* + * Clear the auto-resize flag in the volume in-memory copy of the + * volume table, and 'ubi_resize_volume()' will propagate this change +diff --git a/drivers/mtd/ubi/scan.c b/drivers/mtd/ubi/scan.c +index b99318e..b2b62de 100644 +--- a/drivers/mtd/ubi/scan.c ++++ b/drivers/mtd/ubi/scan.c +@@ -997,7 +997,7 @@ static int process_eb(struct ubi_device *ubi, struct ubi_scan_info *si, + return err; + goto adjust_mean_ec; + case UBI_IO_FF: +- if (ec_err) ++ if (ec_err || bitflips) + err = add_to_list(si, pnum, ec, 1, &si->erase); + else + err = add_to_list(si, pnum, ec, 0, &si->free); +diff --git a/drivers/net/can/mscan/mpc5xxx_can.c b/drivers/net/can/mscan/mpc5xxx_can.c +index 5fedc33..d8f2b5b 100644 +--- a/drivers/net/can/mscan/mpc5xxx_can.c ++++ b/drivers/net/can/mscan/mpc5xxx_can.c +@@ -181,7 +181,7 @@ static u32 __devinit mpc512x_can_get_clock(struct platform_device *ofdev, + + if (!clock_name || !strcmp(clock_name, "sys")) { + sys_clk = clk_get(&ofdev->dev, "sys_clk"); +- if (!sys_clk) { ++ if (IS_ERR(sys_clk)) { 
+ dev_err(&ofdev->dev, "couldn't get sys_clk\n"); + goto exit_unmap; + } +@@ -204,7 +204,7 @@ static u32 __devinit mpc512x_can_get_clock(struct platform_device *ofdev, + + if (clocksrc < 0) { + ref_clk = clk_get(&ofdev->dev, "ref_clk"); +- if (!ref_clk) { ++ if (IS_ERR(ref_clk)) { + dev_err(&ofdev->dev, "couldn't get ref_clk\n"); + goto exit_unmap; + } +diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c +index 0549261..c5f6b0e 100644 +--- a/drivers/net/ethernet/intel/e1000/e1000_main.c ++++ b/drivers/net/ethernet/intel/e1000/e1000_main.c +@@ -4720,8 +4720,6 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake) + + netif_device_detach(netdev); + +- mutex_lock(&adapter->mutex); +- + if (netif_running(netdev)) { + WARN_ON(test_bit(__E1000_RESETTING, &adapter->flags)); + e1000_down(adapter); +@@ -4729,10 +4727,8 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake) + + #ifdef CONFIG_PM + retval = pci_save_state(pdev); +- if (retval) { +- mutex_unlock(&adapter->mutex); ++ if (retval) + return retval; +- } + #endif + + status = er32(STATUS); +@@ -4789,8 +4785,6 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake) + if (netif_running(netdev)) + e1000_free_irq(adapter); + +- mutex_unlock(&adapter->mutex); +- + pci_disable_device(pdev); + + return 0; +diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c +index ed1be8a..4b43bc5 100644 +--- a/drivers/net/ethernet/realtek/r8169.c ++++ b/drivers/net/ethernet/realtek/r8169.c +@@ -327,6 +327,8 @@ enum rtl_registers { + Config0 = 0x51, + Config1 = 0x52, + Config2 = 0x53, ++#define PME_SIGNAL (1 << 5) /* 8168c and later */ ++ + Config3 = 0x54, + Config4 = 0x55, + Config5 = 0x56, +@@ -1360,7 +1362,6 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) + u16 reg; + u8 mask; + } cfg[] = { +- { WAKE_ANY, Config1, PMEnable }, + { WAKE_PHY, Config3, LinkUp }, + { WAKE_MAGIC, Config3, MagicPacket }, + { WAKE_UCAST, Config5, UWF }, +@@ -1368,16 +1369,32 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) + { WAKE_MCAST, Config5, MWF }, + { WAKE_ANY, Config5, LanWake } + }; ++ u8 options; + + RTL_W8(Cfg9346, Cfg9346_Unlock); + + for (i = 0; i < ARRAY_SIZE(cfg); i++) { +- u8 options = RTL_R8(cfg[i].reg) & ~cfg[i].mask; ++ options = RTL_R8(cfg[i].reg) & ~cfg[i].mask; + if (wolopts & cfg[i].opt) + options |= cfg[i].mask; + RTL_W8(cfg[i].reg, options); + } + ++ switch (tp->mac_version) { ++ case RTL_GIGA_MAC_VER_01 ... RTL_GIGA_MAC_VER_17: ++ options = RTL_R8(Config1) & ~PMEnable; ++ if (wolopts) ++ options |= PMEnable; ++ RTL_W8(Config1, options); ++ break; ++ default: ++ options = RTL_R8(Config2) & ~PME_SIGNAL; ++ if (wolopts) ++ options |= PME_SIGNAL; ++ RTL_W8(Config2, options); ++ break; ++ } ++ + RTL_W8(Cfg9346, Cfg9346_Lock); + } + +diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c +index 7145714..c0f097b 100644 +--- a/drivers/net/rionet.c ++++ b/drivers/net/rionet.c +@@ -79,6 +79,7 @@ static int rionet_capable = 1; + * on system trade-offs. 
+ */ + static struct rio_dev **rionet_active; ++static int nact; /* total number of active rionet peers */ + + #define is_rionet_capable(src_ops, dst_ops) \ + ((src_ops & RIO_SRC_OPS_DATA_MSG) && \ +@@ -175,6 +176,7 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev) + struct ethhdr *eth = (struct ethhdr *)skb->data; + u16 destid; + unsigned long flags; ++ int add_num = 1; + + local_irq_save(flags); + if (!spin_trylock(&rnet->tx_lock)) { +@@ -182,7 +184,10 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev) + return NETDEV_TX_LOCKED; + } + +- if ((rnet->tx_cnt + 1) > RIONET_TX_RING_SIZE) { ++ if (is_multicast_ether_addr(eth->h_dest)) ++ add_num = nact; ++ ++ if ((rnet->tx_cnt + add_num) > RIONET_TX_RING_SIZE) { + netif_stop_queue(ndev); + spin_unlock_irqrestore(&rnet->tx_lock, flags); + printk(KERN_ERR "%s: BUG! Tx Ring full when queue awake!\n", +@@ -191,11 +196,16 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev) + } + + if (is_multicast_ether_addr(eth->h_dest)) { ++ int count = 0; + for (i = 0; i < RIO_MAX_ROUTE_ENTRIES(rnet->mport->sys_size); + i++) +- if (rionet_active[i]) ++ if (rionet_active[i]) { + rionet_queue_tx_msg(skb, ndev, + rionet_active[i]); ++ if (count) ++ atomic_inc(&skb->users); ++ count++; ++ } + } else if (RIONET_MAC_MATCH(eth->h_dest)) { + destid = RIONET_GET_DESTID(eth->h_dest); + if (rionet_active[destid]) +@@ -220,14 +230,17 @@ static void rionet_dbell_event(struct rio_mport *mport, void *dev_id, u16 sid, u + if (info == RIONET_DOORBELL_JOIN) { + if (!rionet_active[sid]) { + list_for_each_entry(peer, &rionet_peers, node) { +- if (peer->rdev->destid == sid) ++ if (peer->rdev->destid == sid) { + rionet_active[sid] = peer->rdev; ++ nact++; ++ } + } + rio_mport_send_doorbell(mport, sid, + RIONET_DOORBELL_JOIN); + } + } else if (info == RIONET_DOORBELL_LEAVE) { + rionet_active[sid] = NULL; ++ nact--; + } else { + if (netif_msg_intr(rnet)) + printk(KERN_WARNING "%s: unhandled doorbell\n", +@@ -524,6 +537,7 @@ static int rionet_probe(struct rio_dev *rdev, const struct rio_device_id *id) + + rc = rionet_setup_netdev(rdev->net->hport, ndev); + rionet_check = 1; ++ nact = 0; + } + + /* +diff --git a/drivers/net/wireless/ath/ath9k/pci.c b/drivers/net/wireless/ath/ath9k/pci.c +index 1883d39..f7e17a0 100644 +--- a/drivers/net/wireless/ath/ath9k/pci.c ++++ b/drivers/net/wireless/ath/ath9k/pci.c +@@ -122,8 +122,9 @@ static void ath_pci_aspm_init(struct ath_common *common) + if (!parent) + return; + +- if (ah->btcoex_hw.scheme != ATH_BTCOEX_CFG_NONE) { +- /* Bluetooth coexistance requires disabling ASPM. */ ++ if ((ah->btcoex_hw.scheme != ATH_BTCOEX_CFG_NONE) && ++ (AR_SREV_9285(ah))) { ++ /* Bluetooth coexistance requires disabling ASPM for AR9285. 
*/ + pci_read_config_byte(pdev, pos + PCI_EXP_LNKCTL, &aspm); + aspm &= ~(PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1); + pci_write_config_byte(pdev, pos + PCI_EXP_LNKCTL, aspm); +diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c +index dfee1b3..9005380 100644 +--- a/drivers/pci/probe.c ++++ b/drivers/pci/probe.c +@@ -658,8 +658,10 @@ int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, + + /* Check if setup is sensible at all */ + if (!pass && +- (primary != bus->number || secondary <= bus->number)) { +- dev_dbg(&dev->dev, "bus configuration invalid, reconfiguring\n"); ++ (primary != bus->number || secondary <= bus->number || ++ secondary > subordinate)) { ++ dev_info(&dev->dev, "bridge configuration invalid ([bus %02x-%02x]), reconfiguring\n", ++ secondary, subordinate); + broken = 1; + } + +diff --git a/drivers/s390/scsi/zfcp_aux.c b/drivers/s390/scsi/zfcp_aux.c +index 0860181..4f1b10b 100644 +--- a/drivers/s390/scsi/zfcp_aux.c ++++ b/drivers/s390/scsi/zfcp_aux.c +@@ -519,6 +519,7 @@ struct zfcp_port *zfcp_port_enqueue(struct zfcp_adapter *adapter, u64 wwpn, + + rwlock_init(&port->unit_list_lock); + INIT_LIST_HEAD(&port->unit_list); ++ atomic_set(&port->units, 0); + + INIT_WORK(&port->gid_pn_work, zfcp_fc_port_did_lookup); + INIT_WORK(&port->test_link_work, zfcp_fc_link_test_work); +diff --git a/drivers/s390/scsi/zfcp_ccw.c b/drivers/s390/scsi/zfcp_ccw.c +index 96f13ad8..79a6afe 100644 +--- a/drivers/s390/scsi/zfcp_ccw.c ++++ b/drivers/s390/scsi/zfcp_ccw.c +@@ -39,17 +39,23 @@ void zfcp_ccw_adapter_put(struct zfcp_adapter *adapter) + spin_unlock_irqrestore(&zfcp_ccw_adapter_ref_lock, flags); + } + +-static int zfcp_ccw_activate(struct ccw_device *cdev) +- ++/** ++ * zfcp_ccw_activate - activate adapter and wait for it to finish ++ * @cdev: pointer to belonging ccw device ++ * @clear: Status flags to clear. ++ * @tag: s390dbf trace record tag ++ */ ++static int zfcp_ccw_activate(struct ccw_device *cdev, int clear, char *tag) + { + struct zfcp_adapter *adapter = zfcp_ccw_adapter_by_cdev(cdev); + + if (!adapter) + return 0; + ++ zfcp_erp_clear_adapter_status(adapter, clear); + zfcp_erp_set_adapter_status(adapter, ZFCP_STATUS_COMMON_RUNNING); + zfcp_erp_adapter_reopen(adapter, ZFCP_STATUS_COMMON_ERP_FAILED, +- "ccresu2"); ++ tag); + zfcp_erp_wait(adapter); + flush_work(&adapter->scan_work); + +@@ -164,26 +170,29 @@ static int zfcp_ccw_set_online(struct ccw_device *cdev) + BUG_ON(!zfcp_reqlist_isempty(adapter->req_list)); + adapter->req_no = 0; + +- zfcp_ccw_activate(cdev); ++ zfcp_ccw_activate(cdev, 0, "ccsonl1"); + zfcp_ccw_adapter_put(adapter); + return 0; + } + + /** +- * zfcp_ccw_set_offline - set_offline function of zfcp driver ++ * zfcp_ccw_offline_sync - shut down adapter and wait for it to finish + * @cdev: pointer to belonging ccw device ++ * @set: Status flags to set. ++ * @tag: s390dbf trace record tag + * + * This function gets called by the common i/o layer and sets an adapter + * into state offline. 
+ */ +-static int zfcp_ccw_set_offline(struct ccw_device *cdev) ++static int zfcp_ccw_offline_sync(struct ccw_device *cdev, int set, char *tag) + { + struct zfcp_adapter *adapter = zfcp_ccw_adapter_by_cdev(cdev); + + if (!adapter) + return 0; + +- zfcp_erp_adapter_shutdown(adapter, 0, "ccsoff1"); ++ zfcp_erp_set_adapter_status(adapter, set); ++ zfcp_erp_adapter_shutdown(adapter, 0, tag); + zfcp_erp_wait(adapter); + + zfcp_ccw_adapter_put(adapter); +@@ -191,6 +200,18 @@ static int zfcp_ccw_set_offline(struct ccw_device *cdev) + } + + /** ++ * zfcp_ccw_set_offline - set_offline function of zfcp driver ++ * @cdev: pointer to belonging ccw device ++ * ++ * This function gets called by the common i/o layer and sets an adapter ++ * into state offline. ++ */ ++static int zfcp_ccw_set_offline(struct ccw_device *cdev) ++{ ++ return zfcp_ccw_offline_sync(cdev, 0, "ccsoff1"); ++} ++ ++/** + * zfcp_ccw_notify - ccw notify function + * @cdev: pointer to belonging ccw device + * @event: indicates if adapter was detached or attached +@@ -207,6 +228,11 @@ static int zfcp_ccw_notify(struct ccw_device *cdev, int event) + + switch (event) { + case CIO_GONE: ++ if (atomic_read(&adapter->status) & ++ ZFCP_STATUS_ADAPTER_SUSPENDED) { /* notification ignore */ ++ zfcp_dbf_hba_basic("ccnigo1", adapter); ++ break; ++ } + dev_warn(&cdev->dev, "The FCP device has been detached\n"); + zfcp_erp_adapter_shutdown(adapter, 0, "ccnoti1"); + break; +@@ -216,6 +242,11 @@ static int zfcp_ccw_notify(struct ccw_device *cdev, int event) + zfcp_erp_adapter_shutdown(adapter, 0, "ccnoti2"); + break; + case CIO_OPER: ++ if (atomic_read(&adapter->status) & ++ ZFCP_STATUS_ADAPTER_SUSPENDED) { /* notification ignore */ ++ zfcp_dbf_hba_basic("ccniop1", adapter); ++ break; ++ } + dev_info(&cdev->dev, "The FCP device is operational again\n"); + zfcp_erp_set_adapter_status(adapter, + ZFCP_STATUS_COMMON_RUNNING); +@@ -251,6 +282,28 @@ static void zfcp_ccw_shutdown(struct ccw_device *cdev) + zfcp_ccw_adapter_put(adapter); + } + ++static int zfcp_ccw_suspend(struct ccw_device *cdev) ++{ ++ zfcp_ccw_offline_sync(cdev, ZFCP_STATUS_ADAPTER_SUSPENDED, "ccsusp1"); ++ return 0; ++} ++ ++static int zfcp_ccw_thaw(struct ccw_device *cdev) ++{ ++ /* trace records for thaw and final shutdown during suspend ++ can only be found in system dump until the end of suspend ++ but not after resume because it's based on the memory image ++ right after the very first suspend (freeze) callback */ ++ zfcp_ccw_activate(cdev, 0, "ccthaw1"); ++ return 0; ++} ++ ++static int zfcp_ccw_resume(struct ccw_device *cdev) ++{ ++ zfcp_ccw_activate(cdev, ZFCP_STATUS_ADAPTER_SUSPENDED, "ccresu1"); ++ return 0; ++} ++ + struct ccw_driver zfcp_ccw_driver = { + .driver = { + .owner = THIS_MODULE, +@@ -263,7 +316,7 @@ struct ccw_driver zfcp_ccw_driver = { + .set_offline = zfcp_ccw_set_offline, + .notify = zfcp_ccw_notify, + .shutdown = zfcp_ccw_shutdown, +- .freeze = zfcp_ccw_set_offline, +- .thaw = zfcp_ccw_activate, +- .restore = zfcp_ccw_activate, ++ .freeze = zfcp_ccw_suspend, ++ .thaw = zfcp_ccw_thaw, ++ .restore = zfcp_ccw_resume, + }; +diff --git a/drivers/s390/scsi/zfcp_cfdc.c b/drivers/s390/scsi/zfcp_cfdc.c +index fab2c25..8ed63aa 100644 +--- a/drivers/s390/scsi/zfcp_cfdc.c ++++ b/drivers/s390/scsi/zfcp_cfdc.c +@@ -293,7 +293,7 @@ void zfcp_cfdc_adapter_access_changed(struct zfcp_adapter *adapter) + } + read_unlock_irqrestore(&adapter->port_list_lock, flags); + +- shost_for_each_device(sdev, port->adapter->scsi_host) { ++ shost_for_each_device(sdev, adapter->scsi_host) { + 
zfcp_sdev = sdev_to_zfcp(sdev); + status = atomic_read(&zfcp_sdev->status); + if ((status & ZFCP_STATUS_COMMON_ACCESS_DENIED) || +diff --git a/drivers/s390/scsi/zfcp_dbf.c b/drivers/s390/scsi/zfcp_dbf.c +index a9a816e..79b9848 100644 +--- a/drivers/s390/scsi/zfcp_dbf.c ++++ b/drivers/s390/scsi/zfcp_dbf.c +@@ -191,7 +191,7 @@ void zfcp_dbf_hba_def_err(struct zfcp_adapter *adapter, u64 req_id, u16 scount, + length = min((u16)sizeof(struct qdio_buffer), + (u16)ZFCP_DBF_PAY_MAX_REC); + +- while ((char *)pl[payload->counter] && payload->counter < scount) { ++ while (payload->counter < scount && (char *)pl[payload->counter]) { + memcpy(payload->data, (char *)pl[payload->counter], length); + debug_event(dbf->pay, 1, payload, zfcp_dbf_plen(length)); + payload->counter++; +@@ -200,6 +200,26 @@ void zfcp_dbf_hba_def_err(struct zfcp_adapter *adapter, u64 req_id, u16 scount, + spin_unlock_irqrestore(&dbf->pay_lock, flags); + } + ++/** ++ * zfcp_dbf_hba_basic - trace event for basic adapter events ++ * @adapter: pointer to struct zfcp_adapter ++ */ ++void zfcp_dbf_hba_basic(char *tag, struct zfcp_adapter *adapter) ++{ ++ struct zfcp_dbf *dbf = adapter->dbf; ++ struct zfcp_dbf_hba *rec = &dbf->hba_buf; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&dbf->hba_lock, flags); ++ memset(rec, 0, sizeof(*rec)); ++ ++ memcpy(rec->tag, tag, ZFCP_DBF_TAG_LEN); ++ rec->id = ZFCP_DBF_HBA_BASIC; ++ ++ debug_event(dbf->hba, 1, rec, sizeof(*rec)); ++ spin_unlock_irqrestore(&dbf->hba_lock, flags); ++} ++ + static void zfcp_dbf_set_common(struct zfcp_dbf_rec *rec, + struct zfcp_adapter *adapter, + struct zfcp_port *port, +diff --git a/drivers/s390/scsi/zfcp_dbf.h b/drivers/s390/scsi/zfcp_dbf.h +index 714f087..3ac7a4b 100644 +--- a/drivers/s390/scsi/zfcp_dbf.h ++++ b/drivers/s390/scsi/zfcp_dbf.h +@@ -154,6 +154,7 @@ enum zfcp_dbf_hba_id { + ZFCP_DBF_HBA_RES = 1, + ZFCP_DBF_HBA_USS = 2, + ZFCP_DBF_HBA_BIT = 3, ++ ZFCP_DBF_HBA_BASIC = 4, + }; + + /** +diff --git a/drivers/s390/scsi/zfcp_def.h b/drivers/s390/scsi/zfcp_def.h +index ed5d921..f172b84 100644 +--- a/drivers/s390/scsi/zfcp_def.h ++++ b/drivers/s390/scsi/zfcp_def.h +@@ -77,6 +77,7 @@ struct zfcp_reqlist; + #define ZFCP_STATUS_ADAPTER_SIOSL_ISSUED 0x00000004 + #define ZFCP_STATUS_ADAPTER_XCONFIG_OK 0x00000008 + #define ZFCP_STATUS_ADAPTER_HOST_CON_INIT 0x00000010 ++#define ZFCP_STATUS_ADAPTER_SUSPENDED 0x00000040 + #define ZFCP_STATUS_ADAPTER_ERP_PENDING 0x00000100 + #define ZFCP_STATUS_ADAPTER_LINK_UNPLUGGED 0x00000200 + #define ZFCP_STATUS_ADAPTER_DATA_DIV_ENABLED 0x00000400 +@@ -204,6 +205,7 @@ struct zfcp_port { + struct zfcp_adapter *adapter; /* adapter used to access port */ + struct list_head unit_list; /* head of logical unit list */ + rwlock_t unit_list_lock; /* unit list lock */ ++ atomic_t units; /* zfcp_unit count */ + atomic_t status; /* status of this remote port */ + u64 wwnn; /* WWNN if known */ + u64 wwpn; /* WWPN */ +diff --git a/drivers/s390/scsi/zfcp_ext.h b/drivers/s390/scsi/zfcp_ext.h +index 2302e1c..ef9e502 100644 +--- a/drivers/s390/scsi/zfcp_ext.h ++++ b/drivers/s390/scsi/zfcp_ext.h +@@ -54,6 +54,7 @@ extern void zfcp_dbf_hba_fsf_res(char *, struct zfcp_fsf_req *); + extern void zfcp_dbf_hba_bit_err(char *, struct zfcp_fsf_req *); + extern void zfcp_dbf_hba_berr(struct zfcp_dbf *, struct zfcp_fsf_req *); + extern void zfcp_dbf_hba_def_err(struct zfcp_adapter *, u64, u16, void **); ++extern void zfcp_dbf_hba_basic(char *, struct zfcp_adapter *); + extern void zfcp_dbf_san_req(char *, struct zfcp_fsf_req *, u32); + extern void 
zfcp_dbf_san_res(char *, struct zfcp_fsf_req *); + extern void zfcp_dbf_san_in_els(char *, struct zfcp_fsf_req *); +@@ -158,6 +159,7 @@ extern void zfcp_scsi_dif_sense_error(struct scsi_cmnd *, int); + extern struct attribute_group zfcp_sysfs_unit_attrs; + extern struct attribute_group zfcp_sysfs_adapter_attrs; + extern struct attribute_group zfcp_sysfs_port_attrs; ++extern struct mutex zfcp_sysfs_port_units_mutex; + extern struct device_attribute *zfcp_sysfs_sdev_attrs[]; + extern struct device_attribute *zfcp_sysfs_shost_attrs[]; + +diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c +index e9a787e..8c849f0 100644 +--- a/drivers/s390/scsi/zfcp_fsf.c ++++ b/drivers/s390/scsi/zfcp_fsf.c +@@ -219,7 +219,7 @@ static void zfcp_fsf_status_read_handler(struct zfcp_fsf_req *req) + return; + } + +- zfcp_dbf_hba_fsf_uss("fssrh_2", req); ++ zfcp_dbf_hba_fsf_uss("fssrh_4", req); + + switch (sr_buf->status_type) { + case FSF_STATUS_READ_PORT_CLOSED: +@@ -771,12 +771,14 @@ out: + static void zfcp_fsf_abort_fcp_command_handler(struct zfcp_fsf_req *req) + { + struct scsi_device *sdev = req->data; +- struct zfcp_scsi_dev *zfcp_sdev = sdev_to_zfcp(sdev); ++ struct zfcp_scsi_dev *zfcp_sdev; + union fsf_status_qual *fsq = &req->qtcb->header.fsf_status_qual; + + if (req->status & ZFCP_STATUS_FSFREQ_ERROR) + return; + ++ zfcp_sdev = sdev_to_zfcp(sdev); ++ + switch (req->qtcb->header.fsf_status) { + case FSF_PORT_HANDLE_NOT_VALID: + if (fsq->word[0] == fsq->word[1]) { +@@ -885,7 +887,7 @@ static void zfcp_fsf_send_ct_handler(struct zfcp_fsf_req *req) + + switch (header->fsf_status) { + case FSF_GOOD: +- zfcp_dbf_san_res("fsscth1", req); ++ zfcp_dbf_san_res("fsscth2", req); + ct->status = 0; + break; + case FSF_SERVICE_CLASS_NOT_SUPPORTED: +@@ -1739,13 +1741,15 @@ static void zfcp_fsf_open_lun_handler(struct zfcp_fsf_req *req) + { + struct zfcp_adapter *adapter = req->adapter; + struct scsi_device *sdev = req->data; +- struct zfcp_scsi_dev *zfcp_sdev = sdev_to_zfcp(sdev); ++ struct zfcp_scsi_dev *zfcp_sdev; + struct fsf_qtcb_header *header = &req->qtcb->header; + struct fsf_qtcb_bottom_support *bottom = &req->qtcb->bottom.support; + + if (req->status & ZFCP_STATUS_FSFREQ_ERROR) + return; + ++ zfcp_sdev = sdev_to_zfcp(sdev); ++ + atomic_clear_mask(ZFCP_STATUS_COMMON_ACCESS_DENIED | + ZFCP_STATUS_COMMON_ACCESS_BOXED | + ZFCP_STATUS_LUN_SHARED | +@@ -1856,11 +1860,13 @@ out: + static void zfcp_fsf_close_lun_handler(struct zfcp_fsf_req *req) + { + struct scsi_device *sdev = req->data; +- struct zfcp_scsi_dev *zfcp_sdev = sdev_to_zfcp(sdev); ++ struct zfcp_scsi_dev *zfcp_sdev; + + if (req->status & ZFCP_STATUS_FSFREQ_ERROR) + return; + ++ zfcp_sdev = sdev_to_zfcp(sdev); ++ + switch (req->qtcb->header.fsf_status) { + case FSF_PORT_HANDLE_NOT_VALID: + zfcp_erp_adapter_reopen(zfcp_sdev->port->adapter, 0, "fscuh_1"); +@@ -1950,7 +1956,7 @@ static void zfcp_fsf_req_trace(struct zfcp_fsf_req *req, struct scsi_cmnd *scsi) + { + struct fsf_qual_latency_info *lat_in; + struct latency_cont *lat = NULL; +- struct zfcp_scsi_dev *zfcp_sdev = sdev_to_zfcp(scsi->device); ++ struct zfcp_scsi_dev *zfcp_sdev; + struct zfcp_blk_drv_data blktrc; + int ticks = req->adapter->timer_ticks; + +@@ -1965,6 +1971,7 @@ static void zfcp_fsf_req_trace(struct zfcp_fsf_req *req, struct scsi_cmnd *scsi) + + if (req->adapter->adapter_features & FSF_FEATURE_MEASUREMENT_DATA && + !(req->status & ZFCP_STATUS_FSFREQ_ERROR)) { ++ zfcp_sdev = sdev_to_zfcp(scsi->device); + blktrc.flags |= ZFCP_BLK_LAT_VALID; + blktrc.channel_lat = 
lat_in->channel_lat * ticks; + blktrc.fabric_lat = lat_in->fabric_lat * ticks; +@@ -2002,12 +2009,14 @@ static void zfcp_fsf_fcp_handler_common(struct zfcp_fsf_req *req) + { + struct scsi_cmnd *scmnd = req->data; + struct scsi_device *sdev = scmnd->device; +- struct zfcp_scsi_dev *zfcp_sdev = sdev_to_zfcp(sdev); ++ struct zfcp_scsi_dev *zfcp_sdev; + struct fsf_qtcb_header *header = &req->qtcb->header; + + if (unlikely(req->status & ZFCP_STATUS_FSFREQ_ERROR)) + return; + ++ zfcp_sdev = sdev_to_zfcp(sdev); ++ + switch (header->fsf_status) { + case FSF_HANDLE_MISMATCH: + case FSF_PORT_HANDLE_NOT_VALID: +diff --git a/drivers/s390/scsi/zfcp_qdio.c b/drivers/s390/scsi/zfcp_qdio.c +index e14da57..e76d003 100644 +--- a/drivers/s390/scsi/zfcp_qdio.c ++++ b/drivers/s390/scsi/zfcp_qdio.c +@@ -102,18 +102,22 @@ static void zfcp_qdio_int_resp(struct ccw_device *cdev, unsigned int qdio_err, + { + struct zfcp_qdio *qdio = (struct zfcp_qdio *) parm; + struct zfcp_adapter *adapter = qdio->adapter; +- struct qdio_buffer_element *sbale; + int sbal_no, sbal_idx; +- void *pl[ZFCP_QDIO_MAX_SBALS_PER_REQ + 1]; +- u64 req_id; +- u8 scount; + + if (unlikely(qdio_err)) { +- memset(pl, 0, ZFCP_QDIO_MAX_SBALS_PER_REQ * sizeof(void *)); + if (zfcp_adapter_multi_buffer_active(adapter)) { ++ void *pl[ZFCP_QDIO_MAX_SBALS_PER_REQ + 1]; ++ struct qdio_buffer_element *sbale; ++ u64 req_id; ++ u8 scount; ++ ++ memset(pl, 0, ++ ZFCP_QDIO_MAX_SBALS_PER_REQ * sizeof(void *)); + sbale = qdio->res_q[idx]->element; + req_id = (u64) sbale->addr; +- scount = sbale->scount + 1; /* incl. signaling SBAL */ ++ scount = min(sbale->scount + 1, ++ ZFCP_QDIO_MAX_SBALS_PER_REQ + 1); ++ /* incl. signaling SBAL */ + + for (sbal_no = 0; sbal_no < scount; sbal_no++) { + sbal_idx = (idx + sbal_no) % +diff --git a/drivers/s390/scsi/zfcp_sysfs.c b/drivers/s390/scsi/zfcp_sysfs.c +index cdc4ff7..9e62210 100644 +--- a/drivers/s390/scsi/zfcp_sysfs.c ++++ b/drivers/s390/scsi/zfcp_sysfs.c +@@ -227,6 +227,8 @@ static ssize_t zfcp_sysfs_port_rescan_store(struct device *dev, + static ZFCP_DEV_ATTR(adapter, port_rescan, S_IWUSR, NULL, + zfcp_sysfs_port_rescan_store); + ++DEFINE_MUTEX(zfcp_sysfs_port_units_mutex); ++ + static ssize_t zfcp_sysfs_port_remove_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +@@ -249,6 +251,16 @@ static ssize_t zfcp_sysfs_port_remove_store(struct device *dev, + else + retval = 0; + ++ mutex_lock(&zfcp_sysfs_port_units_mutex); ++ if (atomic_read(&port->units) > 0) { ++ retval = -EBUSY; ++ mutex_unlock(&zfcp_sysfs_port_units_mutex); ++ goto out; ++ } ++ /* port is about to be removed, so no more unit_add */ ++ atomic_set(&port->units, -1); ++ mutex_unlock(&zfcp_sysfs_port_units_mutex); ++ + write_lock_irq(&adapter->port_list_lock); + list_del(&port->list); + write_unlock_irq(&adapter->port_list_lock); +@@ -289,12 +301,14 @@ static ssize_t zfcp_sysfs_unit_add_store(struct device *dev, + { + struct zfcp_port *port = container_of(dev, struct zfcp_port, dev); + u64 fcp_lun; ++ int retval; + + if (strict_strtoull(buf, 0, (unsigned long long *) &fcp_lun)) + return -EINVAL; + +- if (zfcp_unit_add(port, fcp_lun)) +- return -EINVAL; ++ retval = zfcp_unit_add(port, fcp_lun); ++ if (retval) ++ return retval; + + return count; + } +diff --git a/drivers/s390/scsi/zfcp_unit.c b/drivers/s390/scsi/zfcp_unit.c +index 20796eb..4e6a535 100644 +--- a/drivers/s390/scsi/zfcp_unit.c ++++ b/drivers/s390/scsi/zfcp_unit.c +@@ -104,7 +104,7 @@ static void zfcp_unit_release(struct device *dev) + { + struct zfcp_unit 
*unit = container_of(dev, struct zfcp_unit, dev); + +- put_device(&unit->port->dev); ++ atomic_dec(&unit->port->units); + kfree(unit); + } + +@@ -119,16 +119,27 @@ static void zfcp_unit_release(struct device *dev) + int zfcp_unit_add(struct zfcp_port *port, u64 fcp_lun) + { + struct zfcp_unit *unit; ++ int retval = 0; ++ ++ mutex_lock(&zfcp_sysfs_port_units_mutex); ++ if (atomic_read(&port->units) == -1) { ++ /* port is already gone */ ++ retval = -ENODEV; ++ goto out; ++ } + + unit = zfcp_unit_find(port, fcp_lun); + if (unit) { + put_device(&unit->dev); +- return -EEXIST; ++ retval = -EEXIST; ++ goto out; + } + + unit = kzalloc(sizeof(struct zfcp_unit), GFP_KERNEL); +- if (!unit) +- return -ENOMEM; ++ if (!unit) { ++ retval = -ENOMEM; ++ goto out; ++ } + + unit->port = port; + unit->fcp_lun = fcp_lun; +@@ -139,28 +150,33 @@ int zfcp_unit_add(struct zfcp_port *port, u64 fcp_lun) + if (dev_set_name(&unit->dev, "0x%016llx", + (unsigned long long) fcp_lun)) { + kfree(unit); +- return -ENOMEM; ++ retval = -ENOMEM; ++ goto out; + } + +- get_device(&port->dev); +- + if (device_register(&unit->dev)) { + put_device(&unit->dev); +- return -ENOMEM; ++ retval = -ENOMEM; ++ goto out; + } + + if (sysfs_create_group(&unit->dev.kobj, &zfcp_sysfs_unit_attrs)) { + device_unregister(&unit->dev); +- return -EINVAL; ++ retval = -EINVAL; ++ goto out; + } + ++ atomic_inc(&port->units); /* under zfcp_sysfs_port_units_mutex ! */ ++ + write_lock_irq(&port->unit_list_lock); + list_add_tail(&unit->list, &port->unit_list); + write_unlock_irq(&port->unit_list_lock); + + zfcp_unit_scsi_scan(unit); + +- return 0; ++out: ++ mutex_unlock(&zfcp_sysfs_port_units_mutex); ++ return retval; + } + + /** +diff --git a/drivers/scsi/atp870u.c b/drivers/scsi/atp870u.c +index 7e6eca4..59fc5a1 100644 +--- a/drivers/scsi/atp870u.c ++++ b/drivers/scsi/atp870u.c +@@ -1174,7 +1174,16 @@ wait_io1: + outw(val, tmport); + outb(2, 0x80); + TCM_SYNC: +- udelay(0x800); ++ /* ++ * The funny division into multiple delays is to accomodate ++ * arches like ARM where udelay() multiplies its argument by ++ * a large number to initialize a loop counter. To avoid ++ * overflow, the maximum supported udelay is 2000 microseconds. ++ * ++ * XXX it would be more polite to find a way to use msleep() ++ */ ++ mdelay(2); ++ udelay(48); + if ((inb(tmport) & 0x80) == 0x00) { /* bsy ? */ + outw(0, tmport--); + outb(0, tmport); +diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c +index 4ef0212..e5a4423 100644 +--- a/drivers/scsi/device_handler/scsi_dh_alua.c ++++ b/drivers/scsi/device_handler/scsi_dh_alua.c +@@ -578,8 +578,7 @@ static int alua_rtpg(struct scsi_device *sdev, struct alua_dh_data *h) + h->state = TPGS_STATE_STANDBY; + break; + case TPGS_STATE_OFFLINE: +- case TPGS_STATE_UNAVAILABLE: +- /* Path unusable for unavailable/offline */ ++ /* Path unusable */ + err = SCSI_DH_DEV_OFFLINED; + break; + default: +diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c +index be9aad8..22523aa 100644 +--- a/drivers/scsi/hpsa.c ++++ b/drivers/scsi/hpsa.c +@@ -532,12 +532,42 @@ static void set_performant_mode(struct ctlr_info *h, struct CommandList *c) + c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1); + } + ++static int is_firmware_flash_cmd(u8 *cdb) ++{ ++ return cdb[0] == BMIC_WRITE && cdb[6] == BMIC_FLASH_FIRMWARE; ++} ++ ++/* ++ * During firmware flash, the heartbeat register may not update as frequently ++ * as it should. So we dial down lockup detection during firmware flash. 
and ++ * dial it back up when firmware flash completes. ++ */ ++#define HEARTBEAT_SAMPLE_INTERVAL_DURING_FLASH (240 * HZ) ++#define HEARTBEAT_SAMPLE_INTERVAL (30 * HZ) ++static void dial_down_lockup_detection_during_fw_flash(struct ctlr_info *h, ++ struct CommandList *c) ++{ ++ if (!is_firmware_flash_cmd(c->Request.CDB)) ++ return; ++ atomic_inc(&h->firmware_flash_in_progress); ++ h->heartbeat_sample_interval = HEARTBEAT_SAMPLE_INTERVAL_DURING_FLASH; ++} ++ ++static void dial_up_lockup_detection_on_fw_flash_complete(struct ctlr_info *h, ++ struct CommandList *c) ++{ ++ if (is_firmware_flash_cmd(c->Request.CDB) && ++ atomic_dec_and_test(&h->firmware_flash_in_progress)) ++ h->heartbeat_sample_interval = HEARTBEAT_SAMPLE_INTERVAL; ++} ++ + static void enqueue_cmd_and_start_io(struct ctlr_info *h, + struct CommandList *c) + { + unsigned long flags; + + set_performant_mode(h, c); ++ dial_down_lockup_detection_during_fw_flash(h, c); + spin_lock_irqsave(&h->lock, flags); + addQ(&h->reqQ, c); + h->Qdepth++; +@@ -2926,7 +2956,7 @@ static void fill_cmd(struct CommandList *c, u8 cmd, struct ctlr_info *h, + c->Request.Timeout = 0; /* Don't time out */ + memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB)); + c->Request.CDB[0] = cmd; +- c->Request.CDB[1] = 0x03; /* Reset target above */ ++ c->Request.CDB[1] = HPSA_RESET_TYPE_LUN; + /* If bytes 4-7 are zero, it means reset the */ + /* LunID device */ + c->Request.CDB[4] = 0x00; +@@ -3032,6 +3062,7 @@ static inline int bad_tag(struct ctlr_info *h, u32 tag_index, + static inline void finish_cmd(struct CommandList *c, u32 raw_tag) + { + removeQ(c); ++ dial_up_lockup_detection_on_fw_flash_complete(c->h, c); + if (likely(c->cmd_type == CMD_SCSI)) + complete_scsi_command(c); + else if (c->cmd_type == CMD_IOCTL_PEND) +@@ -4172,9 +4203,6 @@ static void controller_lockup_detected(struct ctlr_info *h) + spin_unlock_irqrestore(&h->lock, flags); + } + +-#define HEARTBEAT_SAMPLE_INTERVAL (10 * HZ) +-#define HEARTBEAT_CHECK_MINIMUM_INTERVAL (HEARTBEAT_SAMPLE_INTERVAL / 2) +- + static void detect_controller_lockup(struct ctlr_info *h) + { + u64 now; +@@ -4185,7 +4213,7 @@ static void detect_controller_lockup(struct ctlr_info *h) + now = get_jiffies_64(); + /* If we've received an interrupt recently, we're ok. */ + if (time_after64(h->last_intr_timestamp + +- (HEARTBEAT_CHECK_MINIMUM_INTERVAL), now)) ++ (h->heartbeat_sample_interval), now)) + return; + + /* +@@ -4194,7 +4222,7 @@ static void detect_controller_lockup(struct ctlr_info *h) + * otherwise don't care about signals in this thread. + */ + if (time_after64(h->last_heartbeat_timestamp + +- (HEARTBEAT_CHECK_MINIMUM_INTERVAL), now)) ++ (h->heartbeat_sample_interval), now)) + return; + + /* If heartbeat has not changed since we last looked, we're not ok. 
*/ +@@ -4236,6 +4264,7 @@ static void add_ctlr_to_lockup_detector_list(struct ctlr_info *h) + { + unsigned long flags; + ++ h->heartbeat_sample_interval = HEARTBEAT_SAMPLE_INTERVAL; + spin_lock_irqsave(&lockup_detector_lock, flags); + list_add_tail(&h->lockup_list, &hpsa_ctlr_list); + spin_unlock_irqrestore(&lockup_detector_lock, flags); +diff --git a/drivers/scsi/hpsa.h b/drivers/scsi/hpsa.h +index 91edafb..c721509 100644 +--- a/drivers/scsi/hpsa.h ++++ b/drivers/scsi/hpsa.h +@@ -124,6 +124,8 @@ struct ctlr_info { + u64 last_intr_timestamp; + u32 last_heartbeat; + u64 last_heartbeat_timestamp; ++ u32 heartbeat_sample_interval; ++ atomic_t firmware_flash_in_progress; + u32 lockup_detected; + struct list_head lockup_list; + }; +diff --git a/drivers/scsi/hpsa_cmd.h b/drivers/scsi/hpsa_cmd.h +index 3fd4715..e4ea0a3 100644 +--- a/drivers/scsi/hpsa_cmd.h ++++ b/drivers/scsi/hpsa_cmd.h +@@ -163,6 +163,7 @@ struct SenseSubsystem_info { + #define BMIC_WRITE 0x27 + #define BMIC_CACHE_FLUSH 0xc2 + #define HPSA_CACHE_FLUSH 0x01 /* C2 was already being used by HPSA */ ++#define BMIC_FLASH_FIRMWARE 0xF7 + + /* Command List Structure */ + union SCSI3Addr { +diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c +index 3d391dc..36aca4b 100644 +--- a/drivers/scsi/ibmvscsi/ibmvscsi.c ++++ b/drivers/scsi/ibmvscsi/ibmvscsi.c +@@ -1547,6 +1547,9 @@ static int ibmvscsi_do_host_config(struct ibmvscsi_host_data *hostdata, + + host_config = &evt_struct->iu.mad.host_config; + ++ /* The transport length field is only 16-bit */ ++ length = min(0xffff, length); ++ + /* Set up a lun reset SRP command */ + memset(host_config, 0x00, sizeof(*host_config)); + host_config->common.type = VIOSRP_HOST_CONFIG_TYPE; +diff --git a/drivers/scsi/isci/init.c b/drivers/scsi/isci/init.c +index 83d08b6..5c8b0dc 100644 +--- a/drivers/scsi/isci/init.c ++++ b/drivers/scsi/isci/init.c +@@ -469,7 +469,6 @@ static int __devinit isci_pci_probe(struct pci_dev *pdev, const struct pci_devic + if (sci_oem_parameters_validate(&orom->ctrl[i])) { + dev_warn(&pdev->dev, + "[%d]: invalid oem parameters detected, falling back to firmware\n", i); +- devm_kfree(&pdev->dev, orom); + orom = NULL; + break; + } +diff --git a/drivers/scsi/isci/probe_roms.c b/drivers/scsi/isci/probe_roms.c +index b5f4341..7cd637d 100644 +--- a/drivers/scsi/isci/probe_roms.c ++++ b/drivers/scsi/isci/probe_roms.c +@@ -104,7 +104,6 @@ struct isci_orom *isci_request_oprom(struct pci_dev *pdev) + + if (i >= len) { + dev_err(&pdev->dev, "oprom parse error\n"); +- devm_kfree(&pdev->dev, rom); + rom = NULL; + } + pci_unmap_biosrom(oprom); +diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c +index bb7c482..08d48a3 100644 +--- a/drivers/scsi/scsi_sysfs.c ++++ b/drivers/scsi/scsi_sysfs.c +@@ -1023,33 +1023,31 @@ static void __scsi_remove_target(struct scsi_target *starget) + void scsi_remove_target(struct device *dev) + { + struct Scsi_Host *shost = dev_to_shost(dev->parent); +- struct scsi_target *starget, *found; ++ struct scsi_target *starget, *last = NULL; + unsigned long flags; + +- restart: +- found = NULL; ++ /* remove targets being careful to lookup next entry before ++ * deleting the last ++ */ + spin_lock_irqsave(shost->host_lock, flags); + list_for_each_entry(starget, &shost->__targets, siblings) { + if (starget->state == STARGET_DEL) + continue; + if (starget->dev.parent == dev || &starget->dev == dev) { +- found = starget; +- found->reap_ref++; +- break; ++ /* assuming new targets arrive at the end */ ++ starget->reap_ref++; ++ 
spin_unlock_irqrestore(shost->host_lock, flags); ++ if (last) ++ scsi_target_reap(last); ++ last = starget; ++ __scsi_remove_target(starget); ++ spin_lock_irqsave(shost->host_lock, flags); + } + } + spin_unlock_irqrestore(shost->host_lock, flags); + +- if (found) { +- __scsi_remove_target(found); +- scsi_target_reap(found); +- /* in the case where @dev has multiple starget children, +- * continue removing. +- * +- * FIXME: does such a case exist? +- */ +- goto restart; +- } ++ if (last) ++ scsi_target_reap(last); + } + EXPORT_SYMBOL(scsi_remove_target); + +diff --git a/drivers/staging/comedi/comedi_fops.c b/drivers/staging/comedi/comedi_fops.c +index 4ad2c0e..9465bce 100644 +--- a/drivers/staging/comedi/comedi_fops.c ++++ b/drivers/staging/comedi/comedi_fops.c +@@ -843,7 +843,7 @@ static int parse_insn(struct comedi_device *dev, struct comedi_insn *insn, + ret = -EAGAIN; + break; + } +- ret = s->async->inttrig(dev, s, insn->data[0]); ++ ret = s->async->inttrig(dev, s, data[0]); + if (ret >= 0) + ret = 1; + break; +@@ -1088,7 +1088,6 @@ static int do_cmd_ioctl(struct comedi_device *dev, + goto cleanup; + } + +- kfree(async->cmd.chanlist); + async->cmd = user_cmd; + async->cmd.data = NULL; + /* load channel/gain list */ +@@ -1833,6 +1832,8 @@ void do_become_nonbusy(struct comedi_device *dev, struct comedi_subdevice *s) + if (async) { + comedi_reset_async_buf(async); + async->inttrig = NULL; ++ kfree(async->cmd.chanlist); ++ async->cmd.chanlist = NULL; + } else { + printk(KERN_ERR + "BUG: (?) do_become_nonbusy called with async=0\n"); +diff --git a/drivers/staging/comedi/drivers/jr3_pci.c b/drivers/staging/comedi/drivers/jr3_pci.c +index 8d98cf4..c8b7eed 100644 +--- a/drivers/staging/comedi/drivers/jr3_pci.c ++++ b/drivers/staging/comedi/drivers/jr3_pci.c +@@ -913,7 +913,7 @@ static int jr3_pci_attach(struct comedi_device *dev, + } + + /* Reset DSP card */ +- devpriv->iobase->channel[0].reset = 0; ++ writel(0, &devpriv->iobase->channel[0].reset); + + result = comedi_load_firmware(dev, "jr3pci.idm", jr3_download_firmware); + printk("Firmare load %d\n", result); +diff --git a/drivers/staging/comedi/drivers/s626.c b/drivers/staging/comedi/drivers/s626.c +index 23fc64b..c72128f 100644 +--- a/drivers/staging/comedi/drivers/s626.c ++++ b/drivers/staging/comedi/drivers/s626.c +@@ -2370,7 +2370,7 @@ static int s626_enc_insn_config(struct comedi_device *dev, + /* (data==NULL) ? 
(Preloadvalue=0) : (Preloadvalue=data[0]); */ + + k->SetMode(dev, k, Setup, TRUE); +- Preload(dev, k, *(insn->data)); ++ Preload(dev, k, data[0]); + k->PulseIndex(dev, k); + SetLatchSource(dev, k, valueSrclatch); + k->SetEnable(dev, k, (uint16_t) (enab != 0)); +diff --git a/drivers/staging/speakup/speakup_soft.c b/drivers/staging/speakup/speakup_soft.c +index 42cdafe..b5130c8 100644 +--- a/drivers/staging/speakup/speakup_soft.c ++++ b/drivers/staging/speakup/speakup_soft.c +@@ -40,7 +40,7 @@ static int softsynth_is_alive(struct spk_synth *synth); + static unsigned char get_index(void); + + static struct miscdevice synth_device; +-static int initialized; ++static int init_pos; + static int misc_registered; + + static struct var_t vars[] = { +@@ -194,7 +194,7 @@ static int softsynth_close(struct inode *inode, struct file *fp) + unsigned long flags; + spk_lock(flags); + synth_soft.alive = 0; +- initialized = 0; ++ init_pos = 0; + spk_unlock(flags); + /* Make sure we let applications go before leaving */ + speakup_start_ttys(); +@@ -239,13 +239,8 @@ static ssize_t softsynth_read(struct file *fp, char *buf, size_t count, + ch = '\x18'; + } else if (synth_buffer_empty()) { + break; +- } else if (!initialized) { +- if (*init) { +- ch = *init; +- init++; +- } else { +- initialized = 1; +- } ++ } else if (init[init_pos]) { ++ ch = init[init_pos++]; + } else { + ch = synth_buffer_getc(); + } +diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c +index 2ff1255..f35cb10 100644 +--- a/drivers/target/iscsi/iscsi_target.c ++++ b/drivers/target/iscsi/iscsi_target.c +@@ -3204,7 +3204,6 @@ static int iscsit_build_sendtargets_response(struct iscsi_cmd *cmd) + len += 1; + + if ((len + payload_len) > buffer_len) { +- spin_unlock(&tiqn->tiqn_tpg_lock); + end_of_buf = 1; + goto eob; + } +@@ -3357,6 +3356,7 @@ static int iscsit_send_reject( + hdr->opcode = ISCSI_OP_REJECT; + hdr->flags |= ISCSI_FLAG_CMD_FINAL; + hton24(hdr->dlength, ISCSI_HDR_LEN); ++ hdr->ffffffff = 0xffffffff; + cmd->stat_sn = conn->stat_sn++; + hdr->statsn = cpu_to_be32(cmd->stat_sn); + hdr->exp_cmdsn = cpu_to_be32(conn->sess->exp_cmd_sn); +diff --git a/drivers/target/iscsi/iscsi_target_core.h b/drivers/target/iscsi/iscsi_target_core.h +index 0f68197..dae283f 100644 +--- a/drivers/target/iscsi/iscsi_target_core.h ++++ b/drivers/target/iscsi/iscsi_target_core.h +@@ -25,10 +25,10 @@ + #define NA_DATAOUT_TIMEOUT_RETRIES 5 + #define NA_DATAOUT_TIMEOUT_RETRIES_MAX 15 + #define NA_DATAOUT_TIMEOUT_RETRIES_MIN 1 +-#define NA_NOPIN_TIMEOUT 5 ++#define NA_NOPIN_TIMEOUT 15 + #define NA_NOPIN_TIMEOUT_MAX 60 + #define NA_NOPIN_TIMEOUT_MIN 3 +-#define NA_NOPIN_RESPONSE_TIMEOUT 5 ++#define NA_NOPIN_RESPONSE_TIMEOUT 30 + #define NA_NOPIN_RESPONSE_TIMEOUT_MAX 60 + #define NA_NOPIN_RESPONSE_TIMEOUT_MIN 3 + #define NA_RANDOM_DATAIN_PDU_OFFSETS 0 +diff --git a/drivers/target/iscsi/iscsi_target_tpg.c b/drivers/target/iscsi/iscsi_target_tpg.c +index d4cf2cd..309f14c 100644 +--- a/drivers/target/iscsi/iscsi_target_tpg.c ++++ b/drivers/target/iscsi/iscsi_target_tpg.c +@@ -674,6 +674,12 @@ int iscsit_ta_generate_node_acls( + pr_debug("iSCSI_TPG[%hu] - Generate Initiator Portal Group ACLs: %s\n", + tpg->tpgt, (a->generate_node_acls) ? 
"Enabled" : "Disabled"); + ++ if (flag == 1 && a->cache_dynamic_acls == 0) { ++ pr_debug("Explicitly setting cache_dynamic_acls=1 when " ++ "generate_node_acls=1\n"); ++ a->cache_dynamic_acls = 1; ++ } ++ + return 0; + } + +@@ -713,6 +719,12 @@ int iscsit_ta_cache_dynamic_acls( + return -EINVAL; + } + ++ if (a->generate_node_acls == 1 && flag == 0) { ++ pr_debug("Skipping cache_dynamic_acls=0 when" ++ " generate_node_acls=1\n"); ++ return 0; ++ } ++ + a->cache_dynamic_acls = flag; + pr_debug("iSCSI_TPG[%hu] - Cache Dynamic Initiator Portal Group" + " ACLs %s\n", tpg->tpgt, (a->cache_dynamic_acls) ? +diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c +index 93d4f6a..0b01bfc 100644 +--- a/drivers/target/target_core_configfs.c ++++ b/drivers/target/target_core_configfs.c +@@ -3123,6 +3123,7 @@ static int __init target_core_init_configfs(void) + GFP_KERNEL); + if (!target_cg->default_groups) { + pr_err("Unable to allocate target_cg->default_groups\n"); ++ ret = -ENOMEM; + goto out_global; + } + +@@ -3138,6 +3139,7 @@ static int __init target_core_init_configfs(void) + GFP_KERNEL); + if (!hba_cg->default_groups) { + pr_err("Unable to allocate hba_cg->default_groups\n"); ++ ret = -ENOMEM; + goto out_global; + } + config_group_init_type_name(&alua_group, +@@ -3153,6 +3155,7 @@ static int __init target_core_init_configfs(void) + GFP_KERNEL); + if (!alua_cg->default_groups) { + pr_err("Unable to allocate alua_cg->default_groups\n"); ++ ret = -ENOMEM; + goto out_global; + } + +@@ -3164,14 +3167,17 @@ static int __init target_core_init_configfs(void) + * Add core/alua/lu_gps/default_lu_gp + */ + lu_gp = core_alua_allocate_lu_gp("default_lu_gp", 1); +- if (IS_ERR(lu_gp)) ++ if (IS_ERR(lu_gp)) { ++ ret = -ENOMEM; + goto out_global; ++ } + + lu_gp_cg = &alua_lu_gps_group; + lu_gp_cg->default_groups = kzalloc(sizeof(struct config_group) * 2, + GFP_KERNEL); + if (!lu_gp_cg->default_groups) { + pr_err("Unable to allocate lu_gp_cg->default_groups\n"); ++ ret = -ENOMEM; + goto out_global; + } + +diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c +index 455a251..cafa477 100644 +--- a/drivers/target/target_core_file.c ++++ b/drivers/target/target_core_file.c +@@ -139,6 +139,19 @@ static struct se_device *fd_create_virtdevice( + * of pure timestamp updates. + */ + flags = O_RDWR | O_CREAT | O_LARGEFILE | O_DSYNC; ++ /* ++ * Optionally allow fd_buffered_io=1 to be enabled for people ++ * who want use the fs buffer cache as an WriteCache mechanism. ++ * ++ * This means that in event of a hard failure, there is a risk ++ * of silent data-loss if the SCSI client has *not* performed a ++ * forced unit access (FUA) write, or issued SYNCHRONIZE_CACHE ++ * to write-out the entire device cache. 
++ */ ++ if (fd_dev->fbd_flags & FDBD_HAS_BUFFERED_IO_WCE) { ++ pr_debug("FILEIO: Disabling O_DSYNC, using buffered FILEIO\n"); ++ flags &= ~O_DSYNC; ++ } + + file = filp_open(dev_p, flags, 0600); + if (IS_ERR(file)) { +@@ -206,6 +219,12 @@ static struct se_device *fd_create_virtdevice( + if (!dev) + goto fail; + ++ if (fd_dev->fbd_flags & FDBD_HAS_BUFFERED_IO_WCE) { ++ pr_debug("FILEIO: Forcing setting of emulate_write_cache=1" ++ " with FDBD_HAS_BUFFERED_IO_WCE\n"); ++ dev->se_sub_dev->se_dev_attrib.emulate_write_cache = 1; ++ } ++ + fd_dev->fd_dev_id = fd_host->fd_host_dev_id_count++; + fd_dev->fd_queue_depth = dev->queue_depth; + +@@ -450,6 +469,7 @@ enum { + static match_table_t tokens = { + {Opt_fd_dev_name, "fd_dev_name=%s"}, + {Opt_fd_dev_size, "fd_dev_size=%s"}, ++ {Opt_fd_buffered_io, "fd_buffered_io=%d"}, + {Opt_err, NULL} + }; + +@@ -461,7 +481,7 @@ static ssize_t fd_set_configfs_dev_params( + struct fd_dev *fd_dev = se_dev->se_dev_su_ptr; + char *orig, *ptr, *arg_p, *opts; + substring_t args[MAX_OPT_ARGS]; +- int ret = 0, token; ++ int ret = 0, arg, token; + + opts = kstrdup(page, GFP_KERNEL); + if (!opts) +@@ -505,6 +525,19 @@ static ssize_t fd_set_configfs_dev_params( + " bytes\n", fd_dev->fd_dev_size); + fd_dev->fbd_flags |= FBDF_HAS_SIZE; + break; ++ case Opt_fd_buffered_io: ++ match_int(args, &arg); ++ if (arg != 1) { ++ pr_err("bogus fd_buffered_io=%d value\n", arg); ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ pr_debug("FILEIO: Using buffered I/O" ++ " operations for struct fd_dev\n"); ++ ++ fd_dev->fbd_flags |= FDBD_HAS_BUFFERED_IO_WCE; ++ break; + default: + break; + } +@@ -536,8 +569,10 @@ static ssize_t fd_show_configfs_dev_params( + ssize_t bl = 0; + + bl = sprintf(b + bl, "TCM FILEIO ID: %u", fd_dev->fd_dev_id); +- bl += sprintf(b + bl, " File: %s Size: %llu Mode: O_DSYNC\n", +- fd_dev->fd_dev_name, fd_dev->fd_dev_size); ++ bl += sprintf(b + bl, " File: %s Size: %llu Mode: %s\n", ++ fd_dev->fd_dev_name, fd_dev->fd_dev_size, ++ (fd_dev->fbd_flags & FDBD_HAS_BUFFERED_IO_WCE) ? 
++ "Buffered-WCE" : "O_DSYNC"); + return bl; + } + +diff --git a/drivers/target/target_core_file.h b/drivers/target/target_core_file.h +index 53ece69..6b1b6a9 100644 +--- a/drivers/target/target_core_file.h ++++ b/drivers/target/target_core_file.h +@@ -18,6 +18,7 @@ struct fd_request { + + #define FBDF_HAS_PATH 0x01 + #define FBDF_HAS_SIZE 0x02 ++#define FDBD_HAS_BUFFERED_IO_WCE 0x04 + + struct fd_dev { + u32 fbd_flags; +diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c +index fc7bbba..d190269 100644 +--- a/drivers/tty/n_gsm.c ++++ b/drivers/tty/n_gsm.c +@@ -108,7 +108,7 @@ struct gsm_mux_net { + */ + + struct gsm_msg { +- struct gsm_msg *next; ++ struct list_head list; + u8 addr; /* DLCI address + flags */ + u8 ctrl; /* Control byte + flags */ + unsigned int len; /* Length of data block (can be zero) */ +@@ -245,8 +245,7 @@ struct gsm_mux { + unsigned int tx_bytes; /* TX data outstanding */ + #define TX_THRESH_HI 8192 + #define TX_THRESH_LO 2048 +- struct gsm_msg *tx_head; /* Pending data packets */ +- struct gsm_msg *tx_tail; ++ struct list_head tx_list; /* Pending data packets */ + + /* Control messages */ + struct timer_list t2_timer; /* Retransmit timer for commands */ +@@ -663,7 +662,7 @@ static struct gsm_msg *gsm_data_alloc(struct gsm_mux *gsm, u8 addr, int len, + m->len = len; + m->addr = addr; + m->ctrl = ctrl; +- m->next = NULL; ++ INIT_LIST_HEAD(&m->list); + return m; + } + +@@ -673,22 +672,21 @@ static struct gsm_msg *gsm_data_alloc(struct gsm_mux *gsm, u8 addr, int len, + * + * The tty device has called us to indicate that room has appeared in + * the transmit queue. Ram more data into the pipe if we have any ++ * If we have been flow-stopped by a CMD_FCOFF, then we can only ++ * send messages on DLCI0 until CMD_FCON + * + * FIXME: lock against link layer control transmissions + */ + + static void gsm_data_kick(struct gsm_mux *gsm) + { +- struct gsm_msg *msg = gsm->tx_head; ++ struct gsm_msg *msg, *nmsg; + int len; + int skip_sof = 0; + +- /* FIXME: We need to apply this solely to data messages */ +- if (gsm->constipated) +- return; +- +- while (gsm->tx_head != NULL) { +- msg = gsm->tx_head; ++ list_for_each_entry_safe(msg, nmsg, &gsm->tx_list, list) { ++ if (gsm->constipated && msg->addr) ++ continue; + if (gsm->encoding != 0) { + gsm->txframe[0] = GSM1_SOF; + len = gsm_stuff_frame(msg->data, +@@ -711,14 +709,13 @@ static void gsm_data_kick(struct gsm_mux *gsm) + len - skip_sof) < 0) + break; + /* FIXME: Can eliminate one SOF in many more cases */ +- gsm->tx_head = msg->next; +- if (gsm->tx_head == NULL) +- gsm->tx_tail = NULL; + gsm->tx_bytes -= msg->len; +- kfree(msg); + /* For a burst of frames skip the extra SOF within the + burst */ + skip_sof = 1; ++ ++ list_del(&msg->list); ++ kfree(msg); + } + } + +@@ -768,11 +765,7 @@ static void __gsm_data_queue(struct gsm_dlci *dlci, struct gsm_msg *msg) + msg->data = dp; + + /* Add to the actual output queue */ +- if (gsm->tx_tail) +- gsm->tx_tail->next = msg; +- else +- gsm->tx_head = msg; +- gsm->tx_tail = msg; ++ list_add_tail(&msg->list, &gsm->tx_list); + gsm->tx_bytes += msg->len; + gsm_data_kick(gsm); + } +@@ -875,7 +868,7 @@ static int gsm_dlci_data_output_framed(struct gsm_mux *gsm, + + /* dlci->skb is locked by tx_lock */ + if (dlci->skb == NULL) { +- dlci->skb = skb_dequeue(&dlci->skb_list); ++ dlci->skb = skb_dequeue_tail(&dlci->skb_list); + if (dlci->skb == NULL) + return 0; + first = 1; +@@ -886,7 +879,7 @@ static int gsm_dlci_data_output_framed(struct gsm_mux *gsm, + if (len > gsm->mtu) { + if (dlci->adaption 
== 3) { + /* Over long frame, bin it */ +- kfree_skb(dlci->skb); ++ dev_kfree_skb_any(dlci->skb); + dlci->skb = NULL; + return 0; + } +@@ -899,8 +892,11 @@ static int gsm_dlci_data_output_framed(struct gsm_mux *gsm, + + /* FIXME: need a timer or something to kick this so it can't + get stuck with no work outstanding and no buffer free */ +- if (msg == NULL) ++ if (msg == NULL) { ++ skb_queue_tail(&dlci->skb_list, dlci->skb); ++ dlci->skb = NULL; + return -ENOMEM; ++ } + dp = msg->data; + + if (dlci->adaption == 4) { /* Interruptible framed (Packetised Data) */ +@@ -912,7 +908,7 @@ static int gsm_dlci_data_output_framed(struct gsm_mux *gsm, + skb_pull(dlci->skb, len); + __gsm_data_queue(dlci, msg); + if (last) { +- kfree_skb(dlci->skb); ++ dev_kfree_skb_any(dlci->skb); + dlci->skb = NULL; + } + return size; +@@ -971,16 +967,22 @@ static void gsm_dlci_data_sweep(struct gsm_mux *gsm) + static void gsm_dlci_data_kick(struct gsm_dlci *dlci) + { + unsigned long flags; ++ int sweep; ++ ++ if (dlci->constipated) ++ return; + + spin_lock_irqsave(&dlci->gsm->tx_lock, flags); + /* If we have nothing running then we need to fire up */ ++ sweep = (dlci->gsm->tx_bytes < TX_THRESH_LO); + if (dlci->gsm->tx_bytes == 0) { + if (dlci->net) + gsm_dlci_data_output_framed(dlci->gsm, dlci); + else + gsm_dlci_data_output(dlci->gsm, dlci); +- } else if (dlci->gsm->tx_bytes < TX_THRESH_LO) +- gsm_dlci_data_sweep(dlci->gsm); ++ } ++ if (sweep) ++ gsm_dlci_data_sweep(dlci->gsm); + spin_unlock_irqrestore(&dlci->gsm->tx_lock, flags); + } + +@@ -1027,6 +1029,7 @@ static void gsm_process_modem(struct tty_struct *tty, struct gsm_dlci *dlci, + { + int mlines = 0; + u8 brk = 0; ++ int fc; + + /* The modem status command can either contain one octet (v.24 signals) + or two octets (v.24 signals + break signals). 
The length field will +@@ -1038,19 +1041,21 @@ static void gsm_process_modem(struct tty_struct *tty, struct gsm_dlci *dlci, + else { + brk = modem & 0x7f; + modem = (modem >> 7) & 0x7f; +- }; ++ } + + /* Flow control/ready to communicate */ +- if (modem & MDM_FC) { ++ fc = (modem & MDM_FC) || !(modem & MDM_RTR); ++ if (fc && !dlci->constipated) { + /* Need to throttle our output on this device */ + dlci->constipated = 1; +- } +- if (modem & MDM_RTC) { +- mlines |= TIOCM_DSR | TIOCM_DTR; ++ } else if (!fc && dlci->constipated) { + dlci->constipated = 0; + gsm_dlci_data_kick(dlci); + } ++ + /* Map modem bits */ ++ if (modem & MDM_RTC) ++ mlines |= TIOCM_DSR | TIOCM_DTR; + if (modem & MDM_RTR) + mlines |= TIOCM_RTS | TIOCM_CTS; + if (modem & MDM_IC) +@@ -1190,6 +1195,8 @@ static void gsm_control_message(struct gsm_mux *gsm, unsigned int command, + u8 *data, int clen) + { + u8 buf[1]; ++ unsigned long flags; ++ + switch (command) { + case CMD_CLD: { + struct gsm_dlci *dlci = gsm->dlci[0]; +@@ -1206,16 +1213,18 @@ static void gsm_control_message(struct gsm_mux *gsm, unsigned int command, + gsm_control_reply(gsm, CMD_TEST, data, clen); + break; + case CMD_FCON: +- /* Modem wants us to STFU */ +- gsm->constipated = 1; +- gsm_control_reply(gsm, CMD_FCON, NULL, 0); +- break; +- case CMD_FCOFF: + /* Modem can accept data again */ + gsm->constipated = 0; +- gsm_control_reply(gsm, CMD_FCOFF, NULL, 0); ++ gsm_control_reply(gsm, CMD_FCON, NULL, 0); + /* Kick the link in case it is idling */ ++ spin_lock_irqsave(&gsm->tx_lock, flags); + gsm_data_kick(gsm); ++ spin_unlock_irqrestore(&gsm->tx_lock, flags); ++ break; ++ case CMD_FCOFF: ++ /* Modem wants us to STFU */ ++ gsm->constipated = 1; ++ gsm_control_reply(gsm, CMD_FCOFF, NULL, 0); + break; + case CMD_MSC: + /* Out of band modem line change indicator for a DLCI */ +@@ -1668,7 +1677,7 @@ static void gsm_dlci_free(struct kref *ref) + dlci->gsm->dlci[dlci->addr] = NULL; + kfifo_free(dlci->fifo); + while ((dlci->skb = skb_dequeue(&dlci->skb_list))) +- kfree_skb(dlci->skb); ++ dev_kfree_skb(dlci->skb); + kfree(dlci); + } + +@@ -2007,7 +2016,7 @@ void gsm_cleanup_mux(struct gsm_mux *gsm) + { + int i; + struct gsm_dlci *dlci = gsm->dlci[0]; +- struct gsm_msg *txq; ++ struct gsm_msg *txq, *ntxq; + struct gsm_control *gc; + + gsm->dead = 1; +@@ -2042,11 +2051,9 @@ void gsm_cleanup_mux(struct gsm_mux *gsm) + if (gsm->dlci[i]) + gsm_dlci_release(gsm->dlci[i]); + /* Now wipe the queues */ +- for (txq = gsm->tx_head; txq != NULL; txq = gsm->tx_head) { +- gsm->tx_head = txq->next; ++ list_for_each_entry_safe(txq, ntxq, &gsm->tx_list, list) + kfree(txq); +- } +- gsm->tx_tail = NULL; ++ INIT_LIST_HEAD(&gsm->tx_list); + } + EXPORT_SYMBOL_GPL(gsm_cleanup_mux); + +@@ -2157,6 +2164,7 @@ struct gsm_mux *gsm_alloc_mux(void) + } + spin_lock_init(&gsm->lock); + kref_init(&gsm->ref); ++ INIT_LIST_HEAD(&gsm->tx_list); + + gsm->t1 = T1; + gsm->t2 = T2; +@@ -2273,7 +2281,7 @@ static void gsmld_receive_buf(struct tty_struct *tty, const unsigned char *cp, + gsm->error(gsm, *dp, flags); + break; + default: +- WARN_ONCE("%s: unknown flag %d\n", ++ WARN_ONCE(1, "%s: unknown flag %d\n", + tty_name(tty, buf), flags); + break; + } +@@ -2377,12 +2385,12 @@ static void gsmld_write_wakeup(struct tty_struct *tty) + + /* Queue poll */ + clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); ++ spin_lock_irqsave(&gsm->tx_lock, flags); + gsm_data_kick(gsm); + if (gsm->tx_bytes < TX_THRESH_LO) { +- spin_lock_irqsave(&gsm->tx_lock, flags); + gsm_dlci_data_sweep(gsm); +- 
spin_unlock_irqrestore(&gsm->tx_lock, flags); + } ++ spin_unlock_irqrestore(&gsm->tx_lock, flags); + } + + /** +@@ -2889,6 +2897,10 @@ static int gsmtty_open(struct tty_struct *tty, struct file *filp) + gsm = gsm_mux[mux]; + if (gsm->dead) + return -EL2HLT; ++ /* If DLCI 0 is not yet fully open return an error. This is ok from a locking ++ perspective as we don't have to worry about this if DLCI0 is lost */ ++ if (gsm->dlci[0] && gsm->dlci[0]->state != DLCI_OPEN) ++ return -EL2NSYNC; + dlci = gsm->dlci[line]; + if (dlci == NULL) + dlci = gsm_dlci_alloc(gsm, line); +diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c +index 39d6ab6..8481aae 100644 +--- a/drivers/tty/n_tty.c ++++ b/drivers/tty/n_tty.c +@@ -1728,7 +1728,8 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, + + do_it_again: + +- BUG_ON(!tty->read_buf); ++ if (WARN_ON(!tty->read_buf)) ++ return -EAGAIN; + + c = job_control(tty, file); + if (c < 0) +diff --git a/drivers/tty/serial/8250_pci.c b/drivers/tty/serial/8250_pci.c +index 482d51e..e7d82c1 100644 +--- a/drivers/tty/serial/8250_pci.c ++++ b/drivers/tty/serial/8250_pci.c +@@ -1118,6 +1118,8 @@ pci_xr17c154_setup(struct serial_private *priv, + #define PCI_SUBDEVICE_ID_OCTPRO422 0x0208 + #define PCI_SUBDEVICE_ID_POCTAL232 0x0308 + #define PCI_SUBDEVICE_ID_POCTAL422 0x0408 ++#define PCI_SUBDEVICE_ID_SIIG_DUAL_00 0x2500 ++#define PCI_SUBDEVICE_ID_SIIG_DUAL_30 0x2530 + #define PCI_VENDOR_ID_ADVANTECH 0x13fe + #define PCI_DEVICE_ID_INTEL_CE4100_UART 0x2e66 + #define PCI_DEVICE_ID_ADVANTECH_PCI3620 0x3620 +@@ -3168,8 +3170,11 @@ static struct pci_device_id serial_pci_tbl[] = { + * For now just used the hex ID 0x950a. + */ + { PCI_VENDOR_ID_OXSEMI, 0x950a, +- PCI_SUBVENDOR_ID_SIIG, PCI_SUBDEVICE_ID_SIIG_DUAL_SERIAL, 0, 0, +- pbn_b0_2_115200 }, ++ PCI_SUBVENDOR_ID_SIIG, PCI_SUBDEVICE_ID_SIIG_DUAL_00, ++ 0, 0, pbn_b0_2_115200 }, ++ { PCI_VENDOR_ID_OXSEMI, 0x950a, ++ PCI_SUBVENDOR_ID_SIIG, PCI_SUBDEVICE_ID_SIIG_DUAL_30, ++ 0, 0, pbn_b0_2_115200 }, + { PCI_VENDOR_ID_OXSEMI, 0x950a, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b0_2_1130000 }, +diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c +index 6da8cf8..fe9f111 100644 +--- a/drivers/tty/serial/amba-pl011.c ++++ b/drivers/tty/serial/amba-pl011.c +@@ -1627,13 +1627,26 @@ pl011_set_termios(struct uart_port *port, struct ktermios *termios, + old_cr &= ~ST_UART011_CR_OVSFACT; + } + ++ /* ++ * Workaround for the ST Micro oversampling variants to ++ * increase the bitrate slightly, by lowering the divisor, ++ * to avoid delayed sampling of start bit at high speeds, ++ * else we see data corruption. ++ */ ++ if (uap->vendor->oversampling) { ++ if ((baud >= 3000000) && (baud < 3250000) && (quot > 1)) ++ quot -= 1; ++ else if ((baud > 3250000) && (quot > 2)) ++ quot -= 2; ++ } + /* Set baud rate */ + writew(quot & 0x3f, port->membase + UART011_FBRD); + writew(quot >> 6, port->membase + UART011_IBRD); + + /* + * ----------v----------v----------v----------v----- +- * NOTE: MUST BE WRITTEN AFTER UARTLCR_M & UARTLCR_L ++ * NOTE: lcrh_tx and lcrh_rx MUST BE WRITTEN AFTER ++ * UART011_FBRD & UART011_IBRD. 
+ * ----------^----------^----------^----------^----- + */ + writew(lcr_h, port->membase + uap->lcrh_rx); +diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c +index a40ab98..4cddbfc 100644 +--- a/drivers/usb/host/xhci-mem.c ++++ b/drivers/usb/host/xhci-mem.c +@@ -1680,6 +1680,7 @@ void xhci_mem_cleanup(struct xhci_hcd *xhci) + { + struct pci_dev *pdev = to_pci_dev(xhci_to_hcd(xhci)->self.controller); + struct dev_info *dev_info, *next; ++ struct xhci_cd *cur_cd, *next_cd; + unsigned long flags; + int size; + int i, j, num_ports; +@@ -1701,6 +1702,11 @@ void xhci_mem_cleanup(struct xhci_hcd *xhci) + xhci_ring_free(xhci, xhci->cmd_ring); + xhci->cmd_ring = NULL; + xhci_dbg(xhci, "Freed command ring\n"); ++ list_for_each_entry_safe(cur_cd, next_cd, ++ &xhci->cancel_cmd_list, cancel_cmd_list) { ++ list_del(&cur_cd->cancel_cmd_list); ++ kfree(cur_cd); ++ } + + for (i = 1; i < MAX_HC_SLOTS; ++i) + xhci_free_virt_device(xhci, i); +@@ -2246,6 +2252,7 @@ int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags) + xhci->cmd_ring = xhci_ring_alloc(xhci, 1, true, false, flags); + if (!xhci->cmd_ring) + goto fail; ++ INIT_LIST_HEAD(&xhci->cancel_cmd_list); + xhci_dbg(xhci, "Allocated command ring at %p\n", xhci->cmd_ring); + xhci_dbg(xhci, "First segment DMA is 0x%llx\n", + (unsigned long long)xhci->cmd_ring->first_seg->dma); +diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c +index bddcbfc..4ed7572 100644 +--- a/drivers/usb/host/xhci-pci.c ++++ b/drivers/usb/host/xhci-pci.c +@@ -99,6 +99,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) + * PPT chipsets. + */ + xhci->quirks |= XHCI_SPURIOUS_REBOOT; ++ xhci->quirks |= XHCI_AVOID_BEI; + } + if (pdev->vendor == PCI_VENDOR_ID_ETRON && + pdev->device == PCI_DEVICE_ID_ASROCK_P67) { +diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c +index c7c530c..950aef8 100644 +--- a/drivers/usb/host/xhci-ring.c ++++ b/drivers/usb/host/xhci-ring.c +@@ -309,12 +309,123 @@ static int room_on_ring(struct xhci_hcd *xhci, struct xhci_ring *ring, + /* Ring the host controller doorbell after placing a command on the ring */ + void xhci_ring_cmd_db(struct xhci_hcd *xhci) + { ++ if (!(xhci->cmd_ring_state & CMD_RING_STATE_RUNNING)) ++ return; ++ + xhci_dbg(xhci, "// Ding dong!\n"); + xhci_writel(xhci, DB_VALUE_HOST, &xhci->dba->doorbell[0]); + /* Flush PCI posted writes */ + xhci_readl(xhci, &xhci->dba->doorbell[0]); + } + ++static int xhci_abort_cmd_ring(struct xhci_hcd *xhci) ++{ ++ u64 temp_64; ++ int ret; ++ ++ xhci_dbg(xhci, "Abort command ring\n"); ++ ++ if (!(xhci->cmd_ring_state & CMD_RING_STATE_RUNNING)) { ++ xhci_dbg(xhci, "The command ring isn't running, " ++ "Have the command ring been stopped?\n"); ++ return 0; ++ } ++ ++ temp_64 = xhci_read_64(xhci, &xhci->op_regs->cmd_ring); ++ if (!(temp_64 & CMD_RING_RUNNING)) { ++ xhci_dbg(xhci, "Command ring had been stopped\n"); ++ return 0; ++ } ++ xhci->cmd_ring_state = CMD_RING_STATE_ABORTED; ++ xhci_write_64(xhci, temp_64 | CMD_RING_ABORT, ++ &xhci->op_regs->cmd_ring); ++ ++ /* Section 4.6.1.2 of xHCI 1.0 spec says software should ++ * time the completion od all xHCI commands, including ++ * the Command Abort operation. If software doesn't see ++ * CRR negated in a timely manner (e.g. longer than 5 ++ * seconds), then it should assume that the there are ++ * larger problems with the xHC and assert HCRST. 
++ */ ++ ret = handshake(xhci, &xhci->op_regs->cmd_ring, ++ CMD_RING_RUNNING, 0, 5 * 1000 * 1000); ++ if (ret < 0) { ++ xhci_err(xhci, "Stopped the command ring failed, " ++ "maybe the host is dead\n"); ++ xhci->xhc_state |= XHCI_STATE_DYING; ++ xhci_quiesce(xhci); ++ xhci_halt(xhci); ++ return -ESHUTDOWN; ++ } ++ ++ return 0; ++} ++ ++static int xhci_queue_cd(struct xhci_hcd *xhci, ++ struct xhci_command *command, ++ union xhci_trb *cmd_trb) ++{ ++ struct xhci_cd *cd; ++ cd = kzalloc(sizeof(struct xhci_cd), GFP_ATOMIC); ++ if (!cd) ++ return -ENOMEM; ++ INIT_LIST_HEAD(&cd->cancel_cmd_list); ++ ++ cd->command = command; ++ cd->cmd_trb = cmd_trb; ++ list_add_tail(&cd->cancel_cmd_list, &xhci->cancel_cmd_list); ++ ++ return 0; ++} ++ ++/* ++ * Cancel the command which has issue. ++ * ++ * Some commands may hang due to waiting for acknowledgement from ++ * usb device. It is outside of the xHC's ability to control and ++ * will cause the command ring is blocked. When it occurs software ++ * should intervene to recover the command ring. ++ * See Section 4.6.1.1 and 4.6.1.2 ++ */ ++int xhci_cancel_cmd(struct xhci_hcd *xhci, struct xhci_command *command, ++ union xhci_trb *cmd_trb) ++{ ++ int retval = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&xhci->lock, flags); ++ ++ if (xhci->xhc_state & XHCI_STATE_DYING) { ++ xhci_warn(xhci, "Abort the command ring," ++ " but the xHCI is dead.\n"); ++ retval = -ESHUTDOWN; ++ goto fail; ++ } ++ ++ /* queue the cmd desriptor to cancel_cmd_list */ ++ retval = xhci_queue_cd(xhci, command, cmd_trb); ++ if (retval) { ++ xhci_warn(xhci, "Queuing command descriptor failed.\n"); ++ goto fail; ++ } ++ ++ /* abort command ring */ ++ retval = xhci_abort_cmd_ring(xhci); ++ if (retval) { ++ xhci_err(xhci, "Abort command ring failed\n"); ++ if (unlikely(retval == -ESHUTDOWN)) { ++ spin_unlock_irqrestore(&xhci->lock, flags); ++ usb_hc_died(xhci_to_hcd(xhci)->primary_hcd); ++ xhci_dbg(xhci, "xHCI host controller is dead.\n"); ++ return retval; ++ } ++ } ++ ++fail: ++ spin_unlock_irqrestore(&xhci->lock, flags); ++ return retval; ++} ++ + void xhci_ring_ep_doorbell(struct xhci_hcd *xhci, + unsigned int slot_id, + unsigned int ep_index, +@@ -1043,6 +1154,20 @@ static void handle_reset_ep_completion(struct xhci_hcd *xhci, + } + } + ++/* Complete the command and detele it from the devcie's command queue. ++ */ ++static void xhci_complete_cmd_in_cmd_wait_list(struct xhci_hcd *xhci, ++ struct xhci_command *command, u32 status) ++{ ++ command->status = status; ++ list_del(&command->cmd_list); ++ if (command->completion) ++ complete(command->completion); ++ else ++ xhci_free_command(xhci, command); ++} ++ ++ + /* Check to see if a command in the device's command queue matches this one. + * Signal the completion or free the command, and return 1. Return 0 if the + * completed command isn't at the head of the command list. +@@ -1061,15 +1186,144 @@ static int handle_cmd_in_cmd_wait_list(struct xhci_hcd *xhci, + if (xhci->cmd_ring->dequeue != command->command_trb) + return 0; + +- command->status = GET_COMP_CODE(le32_to_cpu(event->status)); +- list_del(&command->cmd_list); +- if (command->completion) +- complete(command->completion); +- else +- xhci_free_command(xhci, command); ++ xhci_complete_cmd_in_cmd_wait_list(xhci, command, ++ GET_COMP_CODE(le32_to_cpu(event->status))); + return 1; + } + ++/* ++ * Finding the command trb need to be cancelled and modifying it to ++ * NO OP command. And if the command is in device's command wait ++ * list, finishing and freeing it. 
++ * ++ * If we can't find the command trb, we think it had already been ++ * executed. ++ */ ++static void xhci_cmd_to_noop(struct xhci_hcd *xhci, struct xhci_cd *cur_cd) ++{ ++ struct xhci_segment *cur_seg; ++ union xhci_trb *cmd_trb; ++ u32 cycle_state; ++ ++ if (xhci->cmd_ring->dequeue == xhci->cmd_ring->enqueue) ++ return; ++ ++ /* find the current segment of command ring */ ++ cur_seg = find_trb_seg(xhci->cmd_ring->first_seg, ++ xhci->cmd_ring->dequeue, &cycle_state); ++ ++ /* find the command trb matched by cd from command ring */ ++ for (cmd_trb = xhci->cmd_ring->dequeue; ++ cmd_trb != xhci->cmd_ring->enqueue; ++ next_trb(xhci, xhci->cmd_ring, &cur_seg, &cmd_trb)) { ++ /* If the trb is link trb, continue */ ++ if (TRB_TYPE_LINK_LE32(cmd_trb->generic.field[3])) ++ continue; ++ ++ if (cur_cd->cmd_trb == cmd_trb) { ++ ++ /* If the command in device's command list, we should ++ * finish it and free the command structure. ++ */ ++ if (cur_cd->command) ++ xhci_complete_cmd_in_cmd_wait_list(xhci, ++ cur_cd->command, COMP_CMD_STOP); ++ ++ /* get cycle state from the origin command trb */ ++ cycle_state = le32_to_cpu(cmd_trb->generic.field[3]) ++ & TRB_CYCLE; ++ ++ /* modify the command trb to NO OP command */ ++ cmd_trb->generic.field[0] = 0; ++ cmd_trb->generic.field[1] = 0; ++ cmd_trb->generic.field[2] = 0; ++ cmd_trb->generic.field[3] = cpu_to_le32( ++ TRB_TYPE(TRB_CMD_NOOP) | cycle_state); ++ break; ++ } ++ } ++} ++ ++static void xhci_cancel_cmd_in_cd_list(struct xhci_hcd *xhci) ++{ ++ struct xhci_cd *cur_cd, *next_cd; ++ ++ if (list_empty(&xhci->cancel_cmd_list)) ++ return; ++ ++ list_for_each_entry_safe(cur_cd, next_cd, ++ &xhci->cancel_cmd_list, cancel_cmd_list) { ++ xhci_cmd_to_noop(xhci, cur_cd); ++ list_del(&cur_cd->cancel_cmd_list); ++ kfree(cur_cd); ++ } ++} ++ ++/* ++ * traversing the cancel_cmd_list. If the command descriptor according ++ * to cmd_trb is found, the function free it and return 1, otherwise ++ * return 0. ++ */ ++static int xhci_search_cmd_trb_in_cd_list(struct xhci_hcd *xhci, ++ union xhci_trb *cmd_trb) ++{ ++ struct xhci_cd *cur_cd, *next_cd; ++ ++ if (list_empty(&xhci->cancel_cmd_list)) ++ return 0; ++ ++ list_for_each_entry_safe(cur_cd, next_cd, ++ &xhci->cancel_cmd_list, cancel_cmd_list) { ++ if (cur_cd->cmd_trb == cmd_trb) { ++ if (cur_cd->command) ++ xhci_complete_cmd_in_cmd_wait_list(xhci, ++ cur_cd->command, COMP_CMD_STOP); ++ list_del(&cur_cd->cancel_cmd_list); ++ kfree(cur_cd); ++ return 1; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * If the cmd_trb_comp_code is COMP_CMD_ABORT, we just check whether the ++ * trb pointed by the command ring dequeue pointer is the trb we want to ++ * cancel or not. And if the cmd_trb_comp_code is COMP_CMD_STOP, we will ++ * traverse the cancel_cmd_list to trun the all of the commands according ++ * to command descriptor to NO-OP trb. ++ */ ++static int handle_stopped_cmd_ring(struct xhci_hcd *xhci, ++ int cmd_trb_comp_code) ++{ ++ int cur_trb_is_good = 0; ++ ++ /* Searching the cmd trb pointed by the command ring dequeue ++ * pointer in command descriptor list. If it is found, free it. 
++ */ ++ cur_trb_is_good = xhci_search_cmd_trb_in_cd_list(xhci, ++ xhci->cmd_ring->dequeue); ++ ++ if (cmd_trb_comp_code == COMP_CMD_ABORT) ++ xhci->cmd_ring_state = CMD_RING_STATE_STOPPED; ++ else if (cmd_trb_comp_code == COMP_CMD_STOP) { ++ /* traversing the cancel_cmd_list and canceling ++ * the command according to command descriptor ++ */ ++ xhci_cancel_cmd_in_cd_list(xhci); ++ ++ xhci->cmd_ring_state = CMD_RING_STATE_RUNNING; ++ /* ++ * ring command ring doorbell again to restart the ++ * command ring ++ */ ++ if (xhci->cmd_ring->dequeue != xhci->cmd_ring->enqueue) ++ xhci_ring_cmd_db(xhci); ++ } ++ return cur_trb_is_good; ++} ++ + static void handle_cmd_completion(struct xhci_hcd *xhci, + struct xhci_event_cmd *event) + { +@@ -1095,6 +1349,22 @@ static void handle_cmd_completion(struct xhci_hcd *xhci, + xhci->error_bitmask |= 1 << 5; + return; + } ++ ++ if ((GET_COMP_CODE(le32_to_cpu(event->status)) == COMP_CMD_ABORT) || ++ (GET_COMP_CODE(le32_to_cpu(event->status)) == COMP_CMD_STOP)) { ++ /* If the return value is 0, we think the trb pointed by ++ * command ring dequeue pointer is a good trb. The good ++ * trb means we don't want to cancel the trb, but it have ++ * been stopped by host. So we should handle it normally. ++ * Otherwise, driver should invoke inc_deq() and return. ++ */ ++ if (handle_stopped_cmd_ring(xhci, ++ GET_COMP_CODE(le32_to_cpu(event->status)))) { ++ inc_deq(xhci, xhci->cmd_ring, false); ++ return; ++ } ++ } ++ + switch (le32_to_cpu(xhci->cmd_ring->dequeue->generic.field[3]) + & TRB_TYPE_BITMASK) { + case TRB_TYPE(TRB_ENABLE_SLOT): +@@ -3356,7 +3626,9 @@ static int xhci_queue_isoc_tx(struct xhci_hcd *xhci, gfp_t mem_flags, + } else { + td->last_trb = ep_ring->enqueue; + field |= TRB_IOC; +- if (xhci->hci_version == 0x100) { ++ if (xhci->hci_version == 0x100 && ++ !(xhci->quirks & ++ XHCI_AVOID_BEI)) { + /* Set BEI bit except for the last td */ + if (i < num_tds - 1) + field |= TRB_BEI; +diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c +index 09872ee..f5c0f38 100644 +--- a/drivers/usb/host/xhci.c ++++ b/drivers/usb/host/xhci.c +@@ -52,7 +52,7 @@ MODULE_PARM_DESC(link_quirk, "Don't clear the chain bit on a link TRB"); + * handshake done). There are two failure modes: "usec" have passed (major + * hardware flakeout), or the register reads as all-ones (hardware removed). 
+ */ +-static int handshake(struct xhci_hcd *xhci, void __iomem *ptr, ++int handshake(struct xhci_hcd *xhci, void __iomem *ptr, + u32 mask, u32 done, int usec) + { + u32 result; +@@ -105,8 +105,12 @@ int xhci_halt(struct xhci_hcd *xhci) + + ret = handshake(xhci, &xhci->op_regs->status, + STS_HALT, STS_HALT, XHCI_MAX_HALT_USEC); +- if (!ret) ++ if (!ret) { + xhci->xhc_state |= XHCI_STATE_HALTED; ++ xhci->cmd_ring_state = CMD_RING_STATE_STOPPED; ++ } else ++ xhci_warn(xhci, "Host not halted after %u microseconds.\n", ++ XHCI_MAX_HALT_USEC); + return ret; + } + +@@ -459,6 +463,8 @@ static bool compliance_mode_recovery_timer_quirk_check(void) + + dmi_product_name = dmi_get_system_info(DMI_PRODUCT_NAME); + dmi_sys_vendor = dmi_get_system_info(DMI_SYS_VENDOR); ++ if (!dmi_product_name || !dmi_sys_vendor) ++ return false; + + if (!(strstr(dmi_sys_vendor, "Hewlett-Packard"))) + return false; +@@ -570,6 +576,7 @@ static int xhci_run_finished(struct xhci_hcd *xhci) + return -ENODEV; + } + xhci->shared_hcd->state = HC_STATE_RUNNING; ++ xhci->cmd_ring_state = CMD_RING_STATE_RUNNING; + + if (xhci->quirks & XHCI_NEC_HOST) + xhci_ring_cmd_db(xhci); +@@ -874,7 +881,7 @@ int xhci_suspend(struct xhci_hcd *xhci) + command &= ~CMD_RUN; + xhci_writel(xhci, command, &xhci->op_regs->command); + if (handshake(xhci, &xhci->op_regs->status, +- STS_HALT, STS_HALT, 100*100)) { ++ STS_HALT, STS_HALT, XHCI_MAX_HALT_USEC)) { + xhci_warn(xhci, "WARN: xHC CMD_RUN timeout\n"); + spin_unlock_irq(&xhci->lock); + return -ETIMEDOUT; +@@ -2506,6 +2513,7 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci, + struct completion *cmd_completion; + u32 *cmd_status; + struct xhci_virt_device *virt_dev; ++ union xhci_trb *cmd_trb; + + spin_lock_irqsave(&xhci->lock, flags); + virt_dev = xhci->devs[udev->slot_id]; +@@ -2551,6 +2559,7 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci, + } + init_completion(cmd_completion); + ++ cmd_trb = xhci->cmd_ring->dequeue; + if (!ctx_change) + ret = xhci_queue_configure_endpoint(xhci, in_ctx->dma, + udev->slot_id, must_succeed); +@@ -2572,14 +2581,17 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci, + /* Wait for the configure endpoint command to complete */ + timeleft = wait_for_completion_interruptible_timeout( + cmd_completion, +- USB_CTRL_SET_TIMEOUT); ++ XHCI_CMD_DEFAULT_TIMEOUT); + if (timeleft <= 0) { + xhci_warn(xhci, "%s while waiting for %s command\n", + timeleft == 0 ? "Timeout" : "Signal", + ctx_change == 0 ? + "configure endpoint" : + "evaluate context"); +- /* FIXME cancel the configure endpoint command */ ++ /* cancel the configure endpoint command */ ++ ret = xhci_cancel_cmd(xhci, command, cmd_trb); ++ if (ret < 0) ++ return ret; + return -ETIME; + } + +@@ -3528,8 +3540,10 @@ int xhci_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev) + unsigned long flags; + int timeleft; + int ret; ++ union xhci_trb *cmd_trb; + + spin_lock_irqsave(&xhci->lock, flags); ++ cmd_trb = xhci->cmd_ring->dequeue; + ret = xhci_queue_slot_control(xhci, TRB_ENABLE_SLOT, 0); + if (ret) { + spin_unlock_irqrestore(&xhci->lock, flags); +@@ -3541,12 +3555,12 @@ int xhci_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev) + + /* XXX: how much time for xHC slot assignment? */ + timeleft = wait_for_completion_interruptible_timeout(&xhci->addr_dev, +- USB_CTRL_SET_TIMEOUT); ++ XHCI_CMD_DEFAULT_TIMEOUT); + if (timeleft <= 0) { + xhci_warn(xhci, "%s while waiting for a slot\n", + timeleft == 0 ? 
"Timeout" : "Signal"); +- /* FIXME cancel the enable slot request */ +- return 0; ++ /* cancel the enable slot request */ ++ return xhci_cancel_cmd(xhci, NULL, cmd_trb); + } + + if (!xhci->slot_id) { +@@ -3607,6 +3621,7 @@ int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev) + struct xhci_slot_ctx *slot_ctx; + struct xhci_input_control_ctx *ctrl_ctx; + u64 temp_64; ++ union xhci_trb *cmd_trb; + + if (!udev->slot_id) { + xhci_dbg(xhci, "Bad Slot ID %d\n", udev->slot_id); +@@ -3645,6 +3660,7 @@ int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev) + xhci_dbg_ctx(xhci, virt_dev->in_ctx, 2); + + spin_lock_irqsave(&xhci->lock, flags); ++ cmd_trb = xhci->cmd_ring->dequeue; + ret = xhci_queue_address_device(xhci, virt_dev->in_ctx->dma, + udev->slot_id); + if (ret) { +@@ -3657,7 +3673,7 @@ int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev) + + /* ctrl tx can take up to 5 sec; XXX: need more time for xHC? */ + timeleft = wait_for_completion_interruptible_timeout(&xhci->addr_dev, +- USB_CTRL_SET_TIMEOUT); ++ XHCI_CMD_DEFAULT_TIMEOUT); + /* FIXME: From section 4.3.4: "Software shall be responsible for timing + * the SetAddress() "recovery interval" required by USB and aborting the + * command on a timeout. +@@ -3665,7 +3681,10 @@ int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev) + if (timeleft <= 0) { + xhci_warn(xhci, "%s while waiting for address device command\n", + timeleft == 0 ? "Timeout" : "Signal"); +- /* FIXME cancel the address device command */ ++ /* cancel the address device command */ ++ ret = xhci_cancel_cmd(xhci, NULL, cmd_trb); ++ if (ret < 0) ++ return ret; + return -ETIME; + } + +diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h +index 44d518a..cc368c2 100644 +--- a/drivers/usb/host/xhci.h ++++ b/drivers/usb/host/xhci.h +@@ -1255,6 +1255,16 @@ struct xhci_td { + union xhci_trb *last_trb; + }; + ++/* xHCI command default timeout value */ ++#define XHCI_CMD_DEFAULT_TIMEOUT (5 * HZ) ++ ++/* command descriptor */ ++struct xhci_cd { ++ struct list_head cancel_cmd_list; ++ struct xhci_command *command; ++ union xhci_trb *cmd_trb; ++}; ++ + struct xhci_dequeue_state { + struct xhci_segment *new_deq_seg; + union xhci_trb *new_deq_ptr; +@@ -1402,6 +1412,11 @@ struct xhci_hcd { + /* data structures */ + struct xhci_device_context_array *dcbaa; + struct xhci_ring *cmd_ring; ++ unsigned int cmd_ring_state; ++#define CMD_RING_STATE_RUNNING (1 << 0) ++#define CMD_RING_STATE_ABORTED (1 << 1) ++#define CMD_RING_STATE_STOPPED (1 << 2) ++ struct list_head cancel_cmd_list; + unsigned int cmd_ring_reserved_trbs; + struct xhci_ring *event_ring; + struct xhci_erst erst; +@@ -1473,6 +1488,7 @@ struct xhci_hcd { + #define XHCI_TRUST_TX_LENGTH (1 << 10) + #define XHCI_SPURIOUS_REBOOT (1 << 13) + #define XHCI_COMP_MODE_QUIRK (1 << 14) ++#define XHCI_AVOID_BEI (1 << 15) + unsigned int num_active_eps; + unsigned int limit_active_eps; + /* There are two roothubs to keep track of bus suspend info for */ +@@ -1666,6 +1682,8 @@ static inline void xhci_unregister_pci(void) {} + + /* xHCI host controller glue */ + typedef void (*xhci_get_quirks_t)(struct device *, struct xhci_hcd *); ++int handshake(struct xhci_hcd *xhci, void __iomem *ptr, ++ u32 mask, u32 done, int usec); + void xhci_quiesce(struct xhci_hcd *xhci); + int xhci_halt(struct xhci_hcd *xhci); + int xhci_reset(struct xhci_hcd *xhci); +@@ -1756,6 +1774,8 @@ void xhci_queue_config_ep_quirk(struct xhci_hcd *xhci, + unsigned int slot_id, unsigned int ep_index, + struct 
xhci_dequeue_state *deq_state); + void xhci_stop_endpoint_command_watchdog(unsigned long arg); ++int xhci_cancel_cmd(struct xhci_hcd *xhci, struct xhci_command *command, ++ union xhci_trb *cmd_trb); + void xhci_ring_ep_doorbell(struct xhci_hcd *xhci, unsigned int slot_id, + unsigned int ep_index, unsigned int stream_id); + +diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c +index 7324bea..e29a664 100644 +--- a/drivers/usb/serial/ftdi_sio.c ++++ b/drivers/usb/serial/ftdi_sio.c +@@ -584,6 +584,8 @@ static struct usb_device_id id_table_combined [] = { + { USB_DEVICE(FTDI_VID, FTDI_IBS_PEDO_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_IBS_PROD_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_TAVIR_STK500_PID) }, ++ { USB_DEVICE(FTDI_VID, FTDI_TIAO_UMPA_PID), ++ .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, + /* + * ELV devices: + */ +diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h +index 06f6fd2..7b5eb74 100644 +--- a/drivers/usb/serial/ftdi_sio_ids.h ++++ b/drivers/usb/serial/ftdi_sio_ids.h +@@ -517,6 +517,11 @@ + */ + #define FTDI_TAVIR_STK500_PID 0xFA33 /* STK500 AVR programmer */ + ++/* ++ * TIAO product ids (FTDI_VID) ++ * http://www.tiaowiki.com/w/Main_Page ++ */ ++#define FTDI_TIAO_UMPA_PID 0x8a98 /* TIAO/DIYGADGET USB Multi-Protocol Adapter */ + + + /********************************/ +diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c +index c068b4d..3fd4e6f 100644 +--- a/drivers/usb/serial/option.c ++++ b/drivers/usb/serial/option.c +@@ -870,7 +870,8 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0153, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0155, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0156, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0157, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0157, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf5_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0158, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0159, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0161, 0xff, 0xff, 0xff) }, +diff --git a/drivers/usb/serial/qcaux.c b/drivers/usb/serial/qcaux.c +index a348198..87271e3 100644 +--- a/drivers/usb/serial/qcaux.c ++++ b/drivers/usb/serial/qcaux.c +@@ -36,8 +36,6 @@ + #define UTSTARCOM_PRODUCT_UM175_V1 0x3712 + #define UTSTARCOM_PRODUCT_UM175_V2 0x3714 + #define UTSTARCOM_PRODUCT_UM175_ALLTEL 0x3715 +-#define PANTECH_PRODUCT_UML190_VZW 0x3716 +-#define PANTECH_PRODUCT_UML290_VZW 0x3718 + + /* CMOTECH devices */ + #define CMOTECH_VENDOR_ID 0x16d8 +@@ -68,11 +66,9 @@ static struct usb_device_id id_table[] = { + { USB_DEVICE_AND_INTERFACE_INFO(LG_VENDOR_ID, LG_PRODUCT_VX4400_6000, 0xff, 0xff, 0x00) }, + { USB_DEVICE_AND_INTERFACE_INFO(SANYO_VENDOR_ID, SANYO_PRODUCT_KATANA_LX, 0xff, 0xff, 0x00) }, + { USB_DEVICE_AND_INTERFACE_INFO(SAMSUNG_VENDOR_ID, SAMSUNG_PRODUCT_U520, 0xff, 0x00, 0x00) }, +- { USB_DEVICE_AND_INTERFACE_INFO(UTSTARCOM_VENDOR_ID, PANTECH_PRODUCT_UML190_VZW, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(UTSTARCOM_VENDOR_ID, PANTECH_PRODUCT_UML190_VZW, 0xff, 0xfe, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(UTSTARCOM_VENDOR_ID, PANTECH_PRODUCT_UML290_VZW, 0xff, 0xfd, 0xff) }, /* NMEA */ +- { USB_DEVICE_AND_INTERFACE_INFO(UTSTARCOM_VENDOR_ID, PANTECH_PRODUCT_UML290_VZW, 0xff, 0xfe, 0xff) }, /* WMC */ +- { 
USB_DEVICE_AND_INTERFACE_INFO(UTSTARCOM_VENDOR_ID, PANTECH_PRODUCT_UML290_VZW, 0xff, 0xff, 0xff) }, /* DIAG */ ++ { USB_VENDOR_AND_INTERFACE_INFO(UTSTARCOM_VENDOR_ID, 0xff, 0xfd, 0xff) }, /* NMEA */ ++ { USB_VENDOR_AND_INTERFACE_INFO(UTSTARCOM_VENDOR_ID, 0xff, 0xfe, 0xff) }, /* WMC */ ++ { USB_VENDOR_AND_INTERFACE_INFO(UTSTARCOM_VENDOR_ID, 0xff, 0xff, 0xff) }, /* DIAG */ + { }, + }; + MODULE_DEVICE_TABLE(usb, id_table); +diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c +index f55ae23..790fa63 100644 +--- a/fs/autofs4/root.c ++++ b/fs/autofs4/root.c +@@ -392,10 +392,12 @@ static struct vfsmount *autofs4_d_automount(struct path *path) + ino->flags |= AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + status = autofs4_mount_wait(dentry); +- if (status) +- return ERR_PTR(status); + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; ++ if (status) { ++ spin_unlock(&sbi->fs_lock); ++ return ERR_PTR(status); ++ } + } + done: + if (!(ino->flags & AUTOFS_INF_EXPIRING)) { +diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c +index 6ff96c6..8dd615c 100644 +--- a/fs/binfmt_elf.c ++++ b/fs/binfmt_elf.c +@@ -1668,30 +1668,19 @@ static int elf_note_info_init(struct elf_note_info *info) + return 0; + info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); + if (!info->psinfo) +- goto notes_free; ++ return 0; + info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); + if (!info->prstatus) +- goto psinfo_free; ++ return 0; + info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); + if (!info->fpu) +- goto prstatus_free; ++ return 0; + #ifdef ELF_CORE_COPY_XFPREGS + info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL); + if (!info->xfpu) +- goto fpu_free; ++ return 0; + #endif + return 1; +-#ifdef ELF_CORE_COPY_XFPREGS +- fpu_free: +- kfree(info->fpu); +-#endif +- prstatus_free: +- kfree(info->prstatus); +- psinfo_free: +- kfree(info->psinfo); +- notes_free: +- kfree(info->notes); +- return 0; + } + + static int fill_note_info(struct elfhdr *elf, int phdrs, +diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h +index a9f29b1..2262a77 100644 +--- a/fs/ecryptfs/ecryptfs_kernel.h ++++ b/fs/ecryptfs/ecryptfs_kernel.h +@@ -559,6 +559,8 @@ struct ecryptfs_open_req { + struct inode *ecryptfs_get_inode(struct inode *lower_inode, + struct super_block *sb); + void ecryptfs_i_size_init(const char *page_virt, struct inode *inode); ++int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, ++ struct inode *ecryptfs_inode); + int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, + size_t *decrypted_name_size, + struct dentry *ecryptfs_dentry, +diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c +index d3f95f9..841f24f 100644 +--- a/fs/ecryptfs/file.c ++++ b/fs/ecryptfs/file.c +@@ -139,29 +139,50 @@ out: + return rc; + } + +-static void ecryptfs_vma_close(struct vm_area_struct *vma) +-{ +- filemap_write_and_wait(vma->vm_file->f_mapping); +-} +- +-static const struct vm_operations_struct ecryptfs_file_vm_ops = { +- .close = ecryptfs_vma_close, +- .fault = filemap_fault, +-}; ++struct kmem_cache *ecryptfs_file_info_cache; + +-static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma) ++static int read_or_initialize_metadata(struct dentry *dentry) + { ++ struct inode *inode = dentry->d_inode; ++ struct ecryptfs_mount_crypt_stat *mount_crypt_stat; ++ struct ecryptfs_crypt_stat *crypt_stat; + int rc; + +- rc = generic_file_mmap(file, vma); ++ crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; ++ mount_crypt_stat = &ecryptfs_superblock_to_private( 
++ inode->i_sb)->mount_crypt_stat; ++ mutex_lock(&crypt_stat->cs_mutex); ++ ++ if (crypt_stat->flags & ECRYPTFS_POLICY_APPLIED && ++ crypt_stat->flags & ECRYPTFS_KEY_VALID) { ++ rc = 0; ++ goto out; ++ } ++ ++ rc = ecryptfs_read_metadata(dentry); + if (!rc) +- vma->vm_ops = &ecryptfs_file_vm_ops; ++ goto out; ++ ++ if (mount_crypt_stat->flags & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED) { ++ crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED ++ | ECRYPTFS_ENCRYPTED); ++ rc = 0; ++ goto out; ++ } ++ ++ if (!(mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED) && ++ !i_size_read(ecryptfs_inode_to_lower(inode))) { ++ rc = ecryptfs_initialize_file(dentry, inode); ++ if (!rc) ++ goto out; ++ } + ++ rc = -EIO; ++out: ++ mutex_unlock(&crypt_stat->cs_mutex); + return rc; + } + +-struct kmem_cache *ecryptfs_file_info_cache; +- + /** + * ecryptfs_open + * @inode: inode speciying file to open +@@ -237,32 +258,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file) + rc = 0; + goto out; + } +- mutex_lock(&crypt_stat->cs_mutex); +- if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED) +- || !(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { +- rc = ecryptfs_read_metadata(ecryptfs_dentry); +- if (rc) { +- ecryptfs_printk(KERN_DEBUG, +- "Valid headers not found\n"); +- if (!(mount_crypt_stat->flags +- & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) { +- rc = -EIO; +- printk(KERN_WARNING "Either the lower file " +- "is not in a valid eCryptfs format, " +- "or the key could not be retrieved. " +- "Plaintext passthrough mode is not " +- "enabled; returning -EIO\n"); +- mutex_unlock(&crypt_stat->cs_mutex); +- goto out_put; +- } +- rc = 0; +- crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED +- | ECRYPTFS_ENCRYPTED); +- mutex_unlock(&crypt_stat->cs_mutex); +- goto out; +- } +- } +- mutex_unlock(&crypt_stat->cs_mutex); ++ rc = read_or_initialize_metadata(ecryptfs_dentry); ++ if (rc) ++ goto out_put; + ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = " + "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino, + (unsigned long long)i_size_read(inode)); +@@ -278,8 +276,14 @@ out: + + static int ecryptfs_flush(struct file *file, fl_owner_t td) + { +- return file->f_mode & FMODE_WRITE +- ? 
filemap_write_and_wait(file->f_mapping) : 0; ++ struct file *lower_file = ecryptfs_file_to_lower(file); ++ ++ if (lower_file->f_op && lower_file->f_op->flush) { ++ filemap_write_and_wait(file->f_mapping); ++ return lower_file->f_op->flush(lower_file, td); ++ } ++ ++ return 0; + } + + static int ecryptfs_release(struct inode *inode, struct file *file) +@@ -293,15 +297,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file) + static int + ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) + { +- int rc = 0; +- +- rc = generic_file_fsync(file, start, end, datasync); +- if (rc) +- goto out; +- rc = vfs_fsync_range(ecryptfs_file_to_lower(file), start, end, +- datasync); +-out: +- return rc; ++ return vfs_fsync(ecryptfs_file_to_lower(file), datasync); + } + + static int ecryptfs_fasync(int fd, struct file *file, int flag) +@@ -370,7 +366,7 @@ const struct file_operations ecryptfs_main_fops = { + #ifdef CONFIG_COMPAT + .compat_ioctl = ecryptfs_compat_ioctl, + #endif +- .mmap = ecryptfs_file_mmap, ++ .mmap = generic_file_mmap, + .open = ecryptfs_open, + .flush = ecryptfs_flush, + .release = ecryptfs_release, +diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c +index 7c7556b..a9be90d 100644 +--- a/fs/ecryptfs/inode.c ++++ b/fs/ecryptfs/inode.c +@@ -161,6 +161,31 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode, + return vfs_create(lower_dir_inode, lower_dentry, mode, NULL); + } + ++static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, ++ struct inode *inode) ++{ ++ struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); ++ struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); ++ struct dentry *lower_dir_dentry; ++ int rc; ++ ++ dget(lower_dentry); ++ lower_dir_dentry = lock_parent(lower_dentry); ++ rc = vfs_unlink(lower_dir_inode, lower_dentry); ++ if (rc) { ++ printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); ++ goto out_unlock; ++ } ++ fsstack_copy_attr_times(dir, lower_dir_inode); ++ set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); ++ inode->i_ctime = dir->i_ctime; ++ d_drop(dentry); ++out_unlock: ++ unlock_dir(lower_dir_dentry); ++ dput(lower_dentry); ++ return rc; ++} ++ + /** + * ecryptfs_do_create + * @directory_inode: inode of the new file's dentry's parent in ecryptfs +@@ -201,8 +226,10 @@ ecryptfs_do_create(struct inode *directory_inode, + } + inode = __ecryptfs_get_inode(lower_dentry->d_inode, + directory_inode->i_sb); +- if (IS_ERR(inode)) ++ if (IS_ERR(inode)) { ++ vfs_unlink(lower_dir_dentry->d_inode, lower_dentry); + goto out_lock; ++ } + fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); + out_lock: +@@ -219,8 +246,8 @@ out: + * + * Returns zero on success + */ +-static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, +- struct inode *ecryptfs_inode) ++int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, ++ struct inode *ecryptfs_inode) + { + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; +@@ -284,7 +311,9 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, + * that this on disk file is prepared to be an ecryptfs file */ + rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode); + if (rc) { +- drop_nlink(ecryptfs_inode); ++ ecryptfs_do_unlink(directory_inode, ecryptfs_dentry, ++ ecryptfs_inode); ++ make_bad_inode(ecryptfs_inode); + unlock_new_inode(ecryptfs_inode); + 
iput(ecryptfs_inode); + goto out; +@@ -496,27 +525,7 @@ out_lock: + + static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) + { +- int rc = 0; +- struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); +- struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); +- struct dentry *lower_dir_dentry; +- +- dget(lower_dentry); +- lower_dir_dentry = lock_parent(lower_dentry); +- rc = vfs_unlink(lower_dir_inode, lower_dentry); +- if (rc) { +- printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); +- goto out_unlock; +- } +- fsstack_copy_attr_times(dir, lower_dir_inode); +- set_nlink(dentry->d_inode, +- ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink); +- dentry->d_inode->i_ctime = dir->i_ctime; +- d_drop(dentry); +-out_unlock: +- unlock_dir(lower_dir_dentry); +- dput(lower_dentry); +- return rc; ++ return ecryptfs_do_unlink(dir, dentry, dentry->d_inode); + } + + static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, +@@ -1026,12 +1035,6 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) + goto out; + } + +- if (S_ISREG(inode->i_mode)) { +- rc = filemap_write_and_wait(inode->i_mapping); +- if (rc) +- goto out; +- fsstack_copy_attr_all(inode, lower_inode); +- } + memcpy(&lower_ia, ia, sizeof(lower_ia)); + if (ia->ia_valid & ATTR_FILE) + lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file); +diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c +index b4a6bef..1cfef9f 100644 +--- a/fs/ecryptfs/main.c ++++ b/fs/ecryptfs/main.c +@@ -162,6 +162,7 @@ void ecryptfs_put_lower_file(struct inode *inode) + inode_info = ecryptfs_inode_to_private(inode); + if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count, + &inode_info->lower_file_mutex)) { ++ filemap_write_and_wait(inode->i_mapping); + fput(inode_info->lower_file); + inode_info->lower_file = NULL; + mutex_unlock(&inode_info->lower_file_mutex); +diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c +index 6a44148..93a998a 100644 +--- a/fs/ecryptfs/mmap.c ++++ b/fs/ecryptfs/mmap.c +@@ -62,18 +62,6 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) + { + int rc; + +- /* +- * Refuse to write the page out if we are called from reclaim context +- * since our writepage() path may potentially allocate memory when +- * calling into the lower fs vfs_write() which may in turn invoke +- * us again. 
+- */ +- if (current->flags & PF_MEMALLOC) { +- redirty_page_for_writepage(wbc, page); +- rc = 0; +- goto out; +- } +- + rc = ecryptfs_encrypt_page(page); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error encrypting " +@@ -498,7 +486,6 @@ static int ecryptfs_write_end(struct file *file, + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + int rc; +- int need_unlock_page = 1; + + ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" + "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); +@@ -519,26 +506,26 @@ static int ecryptfs_write_end(struct file *file, + "zeros in page with index = [0x%.16lx]\n", index); + goto out; + } +- set_page_dirty(page); +- unlock_page(page); +- need_unlock_page = 0; ++ rc = ecryptfs_encrypt_page(page); ++ if (rc) { ++ ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " ++ "index [0x%.16lx])\n", index); ++ goto out; ++ } + if (pos + copied > i_size_read(ecryptfs_inode)) { + i_size_write(ecryptfs_inode, pos + copied); + ecryptfs_printk(KERN_DEBUG, "Expanded file size to " + "[0x%.16llx]\n", + (unsigned long long)i_size_read(ecryptfs_inode)); +- balance_dirty_pages_ratelimited(mapping); +- rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); +- if (rc) { +- printk(KERN_ERR "Error writing inode size to metadata; " +- "rc = [%d]\n", rc); +- goto out; +- } + } +- rc = copied; ++ rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); ++ if (rc) ++ printk(KERN_ERR "Error writing inode size to metadata; " ++ "rc = [%d]\n", rc); ++ else ++ rc = copied; + out: +- if (need_unlock_page) +- unlock_page(page); ++ unlock_page(page); + page_cache_release(page); + return rc; + } +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 8b01f9f..bac2330 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2382,6 +2382,16 @@ static int ext4_nonda_switch(struct super_block *sb) + free_blocks = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); ++ /* ++ * Start pushing delalloc when 1/2 of free blocks are dirty. ++ */ ++ if (dirty_blocks && (free_blocks < 2 * dirty_blocks) && ++ !writeback_in_progress(sb->s_bdi) && ++ down_read_trylock(&sb->s_umount)) { ++ writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); ++ up_read(&sb->s_umount); ++ } ++ + if (2 * free_blocks < 3 * dirty_blocks || + free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { + /* +@@ -2390,13 +2400,6 @@ static int ext4_nonda_switch(struct super_block *sb) + */ + return 1; + } +- /* +- * Even if we don't switch but are nearing capacity, +- * start pushing delalloc when 1/2 of free blocks are dirty. +- */ +- if (free_blocks < 2 * dirty_blocks) +- writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); +- + return 0; + } + +@@ -4004,6 +4007,7 @@ static int ext4_do_update_inode(handle_t *handle, + struct ext4_inode_info *ei = EXT4_I(inode); + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; ++ int need_datasync = 0; + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. 
*/ +@@ -4052,7 +4056,10 @@ static int ext4_do_update_inode(handle_t *handle, + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); +- ext4_isize_set(raw_inode, ei->i_disksize); ++ if (ei->i_disksize != ext4_isize(raw_inode)) { ++ ext4_isize_set(raw_inode, ei->i_disksize); ++ need_datasync = 1; ++ } + if (ei->i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, +@@ -4105,7 +4112,7 @@ static int ext4_do_update_inode(handle_t *handle, + err = rc; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); + +- ext4_update_inode_fsync_trans(handle, inode, 0); ++ ext4_update_inode_fsync_trans(handle, inode, need_datasync); + out_brelse: + brelse(bh); + ext4_std_error(inode->i_sb, err); +diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c +index c5826c6..e2016f3 100644 +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -141,55 +141,21 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + } + + /** +- * mext_check_null_inode - NULL check for two inodes +- * +- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. +- */ +-static int +-mext_check_null_inode(struct inode *inode1, struct inode *inode2, +- const char *function, unsigned int line) +-{ +- int ret = 0; +- +- if (inode1 == NULL) { +- __ext4_error(inode2->i_sb, function, line, +- "Both inodes should not be NULL: " +- "inode1 NULL inode2 %lu", inode2->i_ino); +- ret = -EIO; +- } else if (inode2 == NULL) { +- __ext4_error(inode1->i_sb, function, line, +- "Both inodes should not be NULL: " +- "inode1 %lu inode2 NULL", inode1->i_ino); +- ret = -EIO; +- } +- return ret; +-} +- +-/** + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem + * +- * @orig_inode: original inode structure +- * @donor_inode: donor inode structure +- * Acquire write lock of i_data_sem of the two inodes (orig and donor) by +- * i_ino order. ++ * Acquire write lock of i_data_sem of the two inodes + */ + static void +-double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) ++double_down_write_data_sem(struct inode *first, struct inode *second) + { +- struct inode *first = orig_inode, *second = donor_inode; ++ if (first < second) { ++ down_write(&EXT4_I(first)->i_data_sem); ++ down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); ++ } else { ++ down_write(&EXT4_I(second)->i_data_sem); ++ down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING); + +- /* +- * Use the inode number to provide the stable locking order instead +- * of its address, because the C language doesn't guarantee you can +- * compare pointers that don't come from the same array. 
+- */ +- if (donor_inode->i_ino < orig_inode->i_ino) { +- first = donor_inode; +- second = orig_inode; + } +- +- down_write(&EXT4_I(first)->i_data_sem); +- down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); + } + + /** +@@ -969,14 +935,6 @@ mext_check_arguments(struct inode *orig_inode, + return -EINVAL; + } + +- /* Files should be in the same ext4 FS */ +- if (orig_inode->i_sb != donor_inode->i_sb) { +- ext4_debug("ext4 move extent: The argument files " +- "should be in same FS [ino:orig %lu, donor %lu]\n", +- orig_inode->i_ino, donor_inode->i_ino); +- return -EINVAL; +- } +- + /* Ext4 move extent supports only extent based file */ + if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { + ext4_debug("ext4 move extent: orig file is not extents " +@@ -1072,35 +1030,19 @@ mext_check_arguments(struct inode *orig_inode, + * @inode1: the inode structure + * @inode2: the inode structure + * +- * Lock two inodes' i_mutex by i_ino order. +- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. ++ * Lock two inodes' i_mutex + */ +-static int ++static void + mext_inode_double_lock(struct inode *inode1, struct inode *inode2) + { +- int ret = 0; +- +- BUG_ON(inode1 == NULL && inode2 == NULL); +- +- ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); +- if (ret < 0) +- goto out; +- +- if (inode1 == inode2) { +- mutex_lock(&inode1->i_mutex); +- goto out; +- } +- +- if (inode1->i_ino < inode2->i_ino) { ++ BUG_ON(inode1 == inode2); ++ if (inode1 < inode2) { + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); + } +- +-out: +- return ret; + } + + /** +@@ -1109,28 +1051,13 @@ out: + * @inode1: the inode that is released first + * @inode2: the inode that is released second + * +- * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. 
+ */ + +-static int ++static void + mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) + { +- int ret = 0; +- +- BUG_ON(inode1 == NULL && inode2 == NULL); +- +- ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); +- if (ret < 0) +- goto out; +- +- if (inode1) +- mutex_unlock(&inode1->i_mutex); +- +- if (inode2 && inode2 != inode1) +- mutex_unlock(&inode2->i_mutex); +- +-out: +- return ret; ++ mutex_unlock(&inode1->i_mutex); ++ mutex_unlock(&inode2->i_mutex); + } + + /** +@@ -1187,16 +1114,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, + ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; + ext4_lblk_t rest_blocks; + pgoff_t orig_page_offset = 0, seq_end_page; +- int ret1, ret2, depth, last_extent = 0; ++ int ret, depth, last_extent = 0; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + int data_offset_in_page; + int block_len_in_page; + int uninit; + +- /* orig and donor should be different file */ +- if (orig_inode->i_ino == donor_inode->i_ino) { ++ if (orig_inode->i_sb != donor_inode->i_sb) { ++ ext4_debug("ext4 move extent: The argument files " ++ "should be in same FS [ino:orig %lu, donor %lu]\n", ++ orig_inode->i_ino, donor_inode->i_ino); ++ return -EINVAL; ++ } ++ ++ /* orig and donor should be different inodes */ ++ if (orig_inode == donor_inode) { + ext4_debug("ext4 move extent: The argument files should not " +- "be same file [ino:orig %lu, donor %lu]\n", ++ "be same inode [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } +@@ -1208,18 +1142,21 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } +- ++ /* TODO: This is non obvious task to swap blocks for inodes with full ++ jornaling enabled */ ++ if (ext4_should_journal_data(orig_inode) || ++ ext4_should_journal_data(donor_inode)) { ++ return -EINVAL; ++ } + /* Protect orig and donor inodes against a truncate */ +- ret1 = mext_inode_double_lock(orig_inode, donor_inode); +- if (ret1 < 0) +- return ret1; ++ mext_inode_double_lock(orig_inode, donor_inode); + + /* Protect extent tree against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); + /* Check the filesystem environment whether move_extent can be done */ +- ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, ++ ret = mext_check_arguments(orig_inode, donor_inode, orig_start, + donor_start, &len); +- if (ret1) ++ if (ret) + goto out; + + file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; +@@ -1227,13 +1164,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, + if (file_end < block_end) + len -= block_end - file_end; + +- ret1 = get_ext_path(orig_inode, block_start, &orig_path); +- if (ret1) ++ ret = get_ext_path(orig_inode, block_start, &orig_path); ++ if (ret) + goto out; + + /* Get path structure to check the hole */ +- ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); +- if (ret1) ++ ret = get_ext_path(orig_inode, block_start, &holecheck_path); ++ if (ret) + goto out; + + depth = ext_depth(orig_inode); +@@ -1252,13 +1189,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, + last_extent = mext_next_extent(orig_inode, + holecheck_path, &ext_cur); + if (last_extent < 0) { +- ret1 = last_extent; ++ ret = last_extent; + goto out; + } + last_extent = mext_next_extent(orig_inode, orig_path, + &ext_dummy); + if (last_extent < 0) { +- ret1 = last_extent; ++ ret = last_extent; 
+ goto out; + } + seq_start = le32_to_cpu(ext_cur->ee_block); +@@ -1272,7 +1209,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, + if (le32_to_cpu(ext_cur->ee_block) > block_end) { + ext4_debug("ext4 move extent: The specified range of file " + "may be the hole\n"); +- ret1 = -EINVAL; ++ ret = -EINVAL; + goto out; + } + +@@ -1292,7 +1229,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, + last_extent = mext_next_extent(orig_inode, holecheck_path, + &ext_cur); + if (last_extent < 0) { +- ret1 = last_extent; ++ ret = last_extent; + break; + } + add_blocks = ext4_ext_get_actual_len(ext_cur); +@@ -1349,18 +1286,18 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, + orig_page_offset, + data_offset_in_page, + block_len_in_page, uninit, +- &ret1); ++ &ret); + + /* Count how many blocks we have exchanged */ + *moved_len += block_len_in_page; +- if (ret1 < 0) ++ if (ret < 0) + break; + if (*moved_len > len) { + EXT4_ERROR_INODE(orig_inode, + "We replaced blocks too much! " + "sum of replaced: %llu requested: %llu", + *moved_len, len); +- ret1 = -EIO; ++ ret = -EIO; + break; + } + +@@ -1374,22 +1311,22 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, + } + + double_down_write_data_sem(orig_inode, donor_inode); +- if (ret1 < 0) ++ if (ret < 0) + break; + + /* Decrease buffer counter */ + if (holecheck_path) + ext4_ext_drop_refs(holecheck_path); +- ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); +- if (ret1) ++ ret = get_ext_path(orig_inode, seq_start, &holecheck_path); ++ if (ret) + break; + depth = holecheck_path->p_depth; + + /* Decrease buffer counter */ + if (orig_path) + ext4_ext_drop_refs(orig_path); +- ret1 = get_ext_path(orig_inode, seq_start, &orig_path); +- if (ret1) ++ ret = get_ext_path(orig_inode, seq_start, &orig_path); ++ if (ret) + break; + + ext_cur = holecheck_path[depth].p_ext; +@@ -1412,12 +1349,7 @@ out: + kfree(holecheck_path); + } + double_up_write_data_sem(orig_inode, donor_inode); +- ret2 = mext_inode_double_unlock(orig_inode, donor_inode); +- +- if (ret1) +- return ret1; +- else if (ret2) +- return ret2; ++ mext_inode_double_unlock(orig_inode, donor_inode); + +- return 0; ++ return ret; + } +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 4dd0890..88f97e5 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -1801,9 +1801,7 @@ retry: + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +-#ifdef CONFIG_EXT4_FS_XATTR + inode->i_op = &ext4_special_inode_operations; +-#endif + err = ext4_add_nondir(handle, dentry, inode); + } + ext4_journal_stop(handle); +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index 54f5786..13bfa07 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -63,6 +63,7 @@ int writeback_in_progress(struct backing_dev_info *bdi) + { + return test_bit(BDI_writeback_running, &bdi->state); + } ++EXPORT_SYMBOL(writeback_in_progress); + + static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) + { +diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c +index b09e51d..464cd76 100644 +--- a/fs/jffs2/wbuf.c ++++ b/fs/jffs2/wbuf.c +@@ -1032,11 +1032,11 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, + ops.datbuf = NULL; + + ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); +- if (ret || ops.oobretlen != ops.ooblen) { ++ if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) { + printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" + " bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, 
ops.oobretlen, ret); +- if (!ret) ++ if (!ret || mtd_is_bitflip(ret)) + ret = -EIO; + return ret; + } +@@ -1075,11 +1075,11 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, + ops.datbuf = NULL; + + ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); +- if (ret || ops.oobretlen != ops.ooblen) { ++ if ((ret && !mtd_is_bitflip(ret)) || ops.oobretlen != ops.ooblen) { + printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" + " bytes, read %zd bytes, error %d\n", + jeb->offset, ops.ooblen, ops.oobretlen, ret); +- if (!ret) ++ if (!ret || mtd_is_bitflip(ret)) + ret = -EIO; + return ret; + } +diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c +index 23d7451..df753a1 100644 +--- a/fs/lockd/mon.c ++++ b/fs/lockd/mon.c +@@ -40,6 +40,7 @@ struct nsm_args { + u32 proc; + + char *mon_name; ++ char *nodename; + }; + + struct nsm_res { +@@ -93,6 +94,7 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) + .vers = 3, + .proc = NLMPROC_NSM_NOTIFY, + .mon_name = nsm->sm_mon_name, ++ .nodename = utsname()->nodename, + }; + struct rpc_message msg = { + .rpc_argp = &args, +@@ -429,7 +431,7 @@ static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) + { + __be32 *p; + +- encode_nsm_string(xdr, utsname()->nodename); ++ encode_nsm_string(xdr, argp->nodename); + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(argp->prog); + *p++ = cpu_to_be32(argp->vers); +diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c +index d774309..1aaa0ee 100644 +--- a/fs/nfs/blocklayout/blocklayout.c ++++ b/fs/nfs/blocklayout/blocklayout.c +@@ -164,25 +164,39 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, + return bio; + } + +-static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, ++static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, + sector_t isect, struct page *page, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), +- struct parallel_io *par) ++ struct parallel_io *par, ++ unsigned int offset, int len) + { ++ isect = isect + (offset >> SECTOR_SHIFT); ++ dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, ++ npg, rw, (unsigned long long)isect, offset, len); + retry: + if (!bio) { + bio = bl_alloc_init_bio(npg, isect, be, end_io, par); + if (!bio) + return ERR_PTR(-ENOMEM); + } +- if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { ++ if (bio_add_page(bio, page, len, offset) < len) { + bio = bl_submit_bio(rw, bio); + goto retry; + } + return bio; + } + ++static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, ++ sector_t isect, struct page *page, ++ struct pnfs_block_extent *be, ++ void (*end_io)(struct bio *, int err), ++ struct parallel_io *par) ++{ ++ return do_add_page_to_bio(bio, npg, rw, isect, page, be, ++ end_io, par, 0, PAGE_CACHE_SIZE); ++} ++ + /* This is basically copied from mpage_end_io_read */ + static void bl_end_io_read(struct bio *bio, int err) + { +@@ -446,6 +460,106 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) + return; + } + ++static void ++bl_read_single_end_io(struct bio *bio, int error) ++{ ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ struct page *page = bvec->bv_page; ++ ++ /* Only one page in bvec */ ++ unlock_page(page); ++} ++ ++static int ++bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be, ++ unsigned int offset, unsigned int len) ++{ ++ struct bio *bio; ++ struct page *shadow_page; ++ sector_t isect; ++ 
char *kaddr, *kshadow_addr; ++ int ret = 0; ++ ++ dprintk("%s: offset %u len %u\n", __func__, offset, len); ++ ++ shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); ++ if (shadow_page == NULL) ++ return -ENOMEM; ++ ++ bio = bio_alloc(GFP_NOIO, 1); ++ if (bio == NULL) ++ return -ENOMEM; ++ ++ isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) + ++ (offset / SECTOR_SIZE); ++ ++ bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; ++ bio->bi_bdev = be->be_mdev; ++ bio->bi_end_io = bl_read_single_end_io; ++ ++ lock_page(shadow_page); ++ if (bio_add_page(bio, shadow_page, ++ SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) { ++ unlock_page(shadow_page); ++ bio_put(bio); ++ return -EIO; ++ } ++ ++ submit_bio(READ, bio); ++ wait_on_page_locked(shadow_page); ++ if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) { ++ ret = -EIO; ++ } else { ++ kaddr = kmap_atomic(page); ++ kshadow_addr = kmap_atomic(shadow_page); ++ memcpy(kaddr + offset, kshadow_addr + offset, len); ++ kunmap_atomic(kshadow_addr); ++ kunmap_atomic(kaddr); ++ } ++ __free_page(shadow_page); ++ bio_put(bio); ++ ++ return ret; ++} ++ ++static int ++bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be, ++ unsigned int dirty_offset, unsigned int dirty_len, ++ bool full_page) ++{ ++ int ret = 0; ++ unsigned int start, end; ++ ++ if (full_page) { ++ start = 0; ++ end = PAGE_CACHE_SIZE; ++ } else { ++ start = round_down(dirty_offset, SECTOR_SIZE); ++ end = round_up(dirty_offset + dirty_len, SECTOR_SIZE); ++ } ++ ++ dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len); ++ if (!be) { ++ zero_user_segments(page, start, dirty_offset, ++ dirty_offset + dirty_len, end); ++ if (start == 0 && end == PAGE_CACHE_SIZE && ++ trylock_page(page)) { ++ SetPageUptodate(page); ++ unlock_page(page); ++ } ++ return ret; ++ } ++ ++ if (start != dirty_offset) ++ ret = bl_do_readpage_sync(page, be, start, dirty_offset - start); ++ ++ if (!ret && (dirty_offset + dirty_len < end)) ++ ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len, ++ end - dirty_offset - dirty_len); ++ ++ return ret; ++} ++ + /* Given an unmapped page, zero it or read in page for COW, page is locked + * by caller. 
+ */ +@@ -479,7 +593,6 @@ init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) + SetPageUptodate(page); + + cleanup: +- bl_put_extent(cow_read); + if (bh) + free_buffer_head(bh); + if (ret) { +@@ -501,6 +614,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) + struct parallel_io *par; + loff_t offset = wdata->args.offset; + size_t count = wdata->args.count; ++ unsigned int pg_offset, pg_len, saved_len; + struct page **pages = wdata->args.pages; + struct page *page; + pgoff_t index; +@@ -615,10 +729,11 @@ next_page: + if (!extent_length) { + /* We've used up the previous extent */ + bl_put_extent(be); ++ bl_put_extent(cow_read); + bio = bl_submit_bio(WRITE, bio); + /* Get the next one */ + be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), +- isect, NULL); ++ isect, &cow_read); + if (!be || !is_writable(be, isect)) { + wdata->pnfs_error = -EINVAL; + goto out; +@@ -626,7 +741,26 @@ next_page: + extent_length = be->be_length - + (isect - be->be_f_offset); + } +- if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ ++ dprintk("%s offset %lld count %Zu\n", __func__, offset, count); ++ pg_offset = offset & ~PAGE_CACHE_MASK; ++ if (pg_offset + count > PAGE_CACHE_SIZE) ++ pg_len = PAGE_CACHE_SIZE - pg_offset; ++ else ++ pg_len = count; ++ ++ saved_len = pg_len; ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA && ++ !bl_is_sector_init(be->be_inval, isect)) { ++ ret = bl_read_partial_page_sync(pages[i], cow_read, ++ pg_offset, pg_len, true); ++ if (ret) { ++ dprintk("%s bl_read_partial_page_sync fail %d\n", ++ __func__, ret); ++ wdata->pnfs_error = ret; ++ goto out; ++ } ++ + ret = bl_mark_sectors_init(be->be_inval, isect, + PAGE_CACHE_SECTORS, + NULL); +@@ -636,15 +770,35 @@ next_page: + wdata->pnfs_error = ret; + goto out; + } ++ ++ /* Expand to full page write */ ++ pg_offset = 0; ++ pg_len = PAGE_CACHE_SIZE; ++ } else if ((pg_offset & (SECTOR_SIZE - 1)) || ++ (pg_len & (SECTOR_SIZE - 1))){ ++ /* ahh, nasty case. We have to do sync full sector ++ * read-modify-write cycles. 
++ */ ++ unsigned int saved_offset = pg_offset; ++ ret = bl_read_partial_page_sync(pages[i], be, pg_offset, ++ pg_len, false); ++ pg_offset = round_down(pg_offset, SECTOR_SIZE); ++ pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE) ++ - pg_offset; + } +- bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE, ++ ++ ++ bio = do_add_page_to_bio(bio, wdata->npages - i, WRITE, + isect, pages[i], be, +- bl_end_io_write, par); ++ bl_end_io_write, par, ++ pg_offset, pg_len); + if (IS_ERR(bio)) { + wdata->pnfs_error = PTR_ERR(bio); + bio = NULL; + goto out; + } ++ offset += saved_len; ++ count -= saved_len; + isect += PAGE_CACHE_SECTORS; + last_isect = isect; + extent_length -= PAGE_CACHE_SECTORS; +@@ -662,12 +816,10 @@ next_page: + } + + write_done: +- wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset); +- if (count < wdata->res.count) { +- wdata->res.count = count; +- } ++ wdata->res.count = wdata->args.count; + out: + bl_put_extent(be); ++ bl_put_extent(cow_read); + bl_submit_bio(WRITE, bio); + put_parallel(par); + return PNFS_ATTEMPTED; +diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h +index 42acf7e..519a9de 100644 +--- a/fs/nfs/blocklayout/blocklayout.h ++++ b/fs/nfs/blocklayout/blocklayout.h +@@ -40,6 +40,7 @@ + + #define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) + #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) ++#define SECTOR_SIZE (1 << SECTOR_SHIFT) + + struct block_mount_id { + spinlock_t bm_lock; /* protects list */ +diff --git a/fs/udf/super.c b/fs/udf/super.c +index 516b7f0..f66439e 100644 +--- a/fs/udf/super.c ++++ b/fs/udf/super.c +@@ -1289,6 +1289,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, + udf_err(sb, "error loading logical volume descriptor: " + "Partition table too long (%u > %lu)\n", table_len, + sb->s_blocksize - sizeof(*lvd)); ++ ret = 1; + goto out_bh; + } + +@@ -1333,8 +1334,10 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, + UDF_ID_SPARABLE, + strlen(UDF_ID_SPARABLE))) { + if (udf_load_sparable_map(sb, map, +- (struct sparablePartitionMap *)gpm) < 0) ++ (struct sparablePartitionMap *)gpm) < 0) { ++ ret = 1; + goto out_bh; ++ } + } else if (!strncmp(upm2->partIdent.ident, + UDF_ID_METADATA, + strlen(UDF_ID_METADATA))) { +diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h +index 7978eec..3e8f2f7 100644 +--- a/include/linux/mempolicy.h ++++ b/include/linux/mempolicy.h +@@ -188,7 +188,7 @@ struct sp_node { + + struct shared_policy { + struct rb_root root; +- spinlock_t lock; ++ struct mutex mutex; + }; + + void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); +diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h +index 67cc215..1874c5e 100644 +--- a/include/linux/pci_ids.h ++++ b/include/linux/pci_ids.h +@@ -1823,7 +1823,6 @@ + #define PCI_DEVICE_ID_SIIG_8S_20x_650 0x2081 + #define PCI_DEVICE_ID_SIIG_8S_20x_850 0x2082 + #define PCI_SUBDEVICE_ID_SIIG_QUARTET_SERIAL 0x2050 +-#define PCI_SUBDEVICE_ID_SIIG_DUAL_SERIAL 0x2530 + + #define PCI_VENDOR_ID_RADISYS 0x1331 + +diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h +index e5a7b9a..416dcb0 100644 +--- a/include/net/ip_vs.h ++++ b/include/net/ip_vs.h +@@ -1353,7 +1353,7 @@ static inline void ip_vs_notrack(struct sk_buff *skb) + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (!ct || !nf_ct_is_untracked(ct)) { +- nf_reset(skb); ++ nf_conntrack_put(skb->nfct); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + 
nf_conntrack_get(skb->nfct); +diff --git a/kernel/rcutree.c b/kernel/rcutree.c +index 6b76d81..a122196 100644 +--- a/kernel/rcutree.c ++++ b/kernel/rcutree.c +@@ -292,7 +292,9 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) + static int + cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) + { +- return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); ++ return *rdp->nxttail[RCU_DONE_TAIL + ++ ACCESS_ONCE(rsp->completed) != rdp->completed] && ++ !rcu_gp_in_progress(rsp); + } + + /* +diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c +index 8b44e7f..85e9da2 100644 +--- a/kernel/sched_stoptask.c ++++ b/kernel/sched_stoptask.c +@@ -25,8 +25,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) + { + struct task_struct *stop = rq->stop; + +- if (stop && stop->on_rq) ++ if (stop && stop->on_rq) { ++ stop->se.exec_start = rq->clock_task; + return stop; ++ } + + return NULL; + } +@@ -50,6 +52,21 @@ static void yield_task_stop(struct rq *rq) + + static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) + { ++ struct task_struct *curr = rq->curr; ++ u64 delta_exec; ++ ++ delta_exec = rq->clock_task - curr->se.exec_start; ++ if (unlikely((s64)delta_exec < 0)) ++ delta_exec = 0; ++ ++ schedstat_set(curr->se.statistics.exec_max, ++ max(curr->se.statistics.exec_max, delta_exec)); ++ ++ curr->se.sum_exec_runtime += delta_exec; ++ account_group_exec_runtime(curr, delta_exec); ++ ++ curr->se.exec_start = rq->clock_task; ++ cpuacct_charge(curr, delta_exec); + } + + static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) +@@ -58,6 +75,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) + + static void set_curr_task_stop(struct rq *rq) + { ++ struct task_struct *stop = rq->stop; ++ ++ stop->se.exec_start = rq->clock_task; + } + + static void switched_to_stop(struct rq *rq, struct task_struct *p) +diff --git a/kernel/sys.c b/kernel/sys.c +index 481611f..c504302 100644 +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -365,6 +365,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); + void kernel_restart(char *cmd) + { + kernel_restart_prepare(cmd); ++ disable_nonboot_cpus(); + if (!cmd) + printk(KERN_EMERG "Restarting system.\n"); + else +diff --git a/kernel/workqueue.c b/kernel/workqueue.c +index b413138..43a19c5 100644 +--- a/kernel/workqueue.c ++++ b/kernel/workqueue.c +@@ -1726,10 +1726,9 @@ static void move_linked_works(struct work_struct *work, struct list_head *head, + *nextp = n; + } + +-static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) ++static void cwq_activate_delayed_work(struct work_struct *work) + { +- struct work_struct *work = list_first_entry(&cwq->delayed_works, +- struct work_struct, entry); ++ struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); + + trace_workqueue_activate_work(work); +@@ -1738,6 +1737,14 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) + cwq->nr_active++; + } + ++static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) ++{ ++ struct work_struct *work = list_first_entry(&cwq->delayed_works, ++ struct work_struct, entry); ++ ++ cwq_activate_delayed_work(work); ++} ++ + /** + * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight + * @cwq: cwq of interest +@@ -1869,7 +1876,9 @@ __acquires(&gcwq->lock) + + spin_unlock_irq(&gcwq->lock); + ++ smp_wmb(); /* paired with test_and_set_bit(PENDING) */ + work_clear_pending(work); ++ + 
lock_map_acquire_read(&cwq->wq->lockdep_map); + lock_map_acquire(&lockdep_map); + trace_workqueue_execute_start(work); +@@ -2626,6 +2635,18 @@ static int try_to_grab_pending(struct work_struct *work) + smp_rmb(); + if (gcwq == get_work_gcwq(work)) { + debug_work_deactivate(work); ++ ++ /* ++ * A delayed work item cannot be grabbed directly ++ * because it might have linked NO_COLOR work items ++ * which, if left on the delayed_list, will confuse ++ * cwq->nr_active management later on and cause ++ * stall. Make sure the work item is activated ++ * before grabbing. ++ */ ++ if (*work_data_bits(work) & WORK_STRUCT_DELAYED) ++ cwq_activate_delayed_work(work); ++ + list_del_init(&work->entry); + cwq_dec_nr_in_flight(get_work_cwq(work), + get_work_color(work), +diff --git a/lib/gcd.c b/lib/gcd.c +index f879033..433d89b 100644 +--- a/lib/gcd.c ++++ b/lib/gcd.c +@@ -9,6 +9,9 @@ unsigned long gcd(unsigned long a, unsigned long b) + + if (a < b) + swap(a, b); ++ ++ if (!b) ++ return a; + while ((r = a % b) != 0) { + a = b; + b = r; +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 0f897b8..d6c0fdf 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -2429,8 +2429,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + * from page cache lookup which is in HPAGE_SIZE units. + */ + address = address & huge_page_mask(h); +- pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +- + (vma->vm_pgoff >> PAGE_SHIFT); ++ pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + ++ vma->vm_pgoff; + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + + /* +diff --git a/mm/mempolicy.c b/mm/mempolicy.c +index 11b8d47..4c82c21 100644 +--- a/mm/mempolicy.c ++++ b/mm/mempolicy.c +@@ -607,24 +607,39 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, + return first; + } + +-/* Apply policy to a single VMA */ +-static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) ++/* ++ * Apply policy to a single VMA ++ * This must be called with the mmap_sem held for writing. ++ */ ++static int vma_replace_policy(struct vm_area_struct *vma, ++ struct mempolicy *pol) + { +- int err = 0; +- struct mempolicy *old = vma->vm_policy; ++ int err; ++ struct mempolicy *old; ++ struct mempolicy *new; + + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, + vma->vm_ops ? vma->vm_ops->set_policy : NULL); + +- if (vma->vm_ops && vma->vm_ops->set_policy) ++ new = mpol_dup(pol); ++ if (IS_ERR(new)) ++ return PTR_ERR(new); ++ ++ if (vma->vm_ops && vma->vm_ops->set_policy) { + err = vma->vm_ops->set_policy(vma, new); +- if (!err) { +- mpol_get(new); +- vma->vm_policy = new; +- mpol_put(old); ++ if (err) ++ goto err_out; + } ++ ++ old = vma->vm_policy; ++ vma->vm_policy = new; /* protected by mmap_sem */ ++ mpol_put(old); ++ ++ return 0; ++ err_out: ++ mpol_put(new); + return err; + } + +@@ -675,7 +690,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, + if (err) + goto out; + } +- err = policy_vma(vma, new_pol); ++ err = vma_replace_policy(vma, new_pol); + if (err) + goto out; + } +@@ -1507,8 +1522,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task, + addr); + if (vpol) + pol = vpol; +- } else if (vma->vm_policy) ++ } else if (vma->vm_policy) { + pol = vma->vm_policy; ++ ++ /* ++ * shmem_alloc_page() passes MPOL_F_SHARED policy with ++ * a pseudo vma whose vma->vm_ops=NULL. 
Take a reference ++ * count on these policies which will be dropped by ++ * mpol_cond_put() later ++ */ ++ if (mpol_needs_cond_ref(pol)) ++ mpol_get(pol); ++ } + } + if (!pol) + pol = &default_policy; +@@ -2032,7 +2057,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) + */ + + /* lookup first element intersecting start-end */ +-/* Caller holds sp->lock */ ++/* Caller holds sp->mutex */ + static struct sp_node * + sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) + { +@@ -2096,36 +2121,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) + + if (!sp->root.rb_node) + return NULL; +- spin_lock(&sp->lock); ++ mutex_lock(&sp->mutex); + sn = sp_lookup(sp, idx, idx+1); + if (sn) { + mpol_get(sn->policy); + pol = sn->policy; + } +- spin_unlock(&sp->lock); ++ mutex_unlock(&sp->mutex); + return pol; + } + ++static void sp_free(struct sp_node *n) ++{ ++ mpol_put(n->policy); ++ kmem_cache_free(sn_cache, n); ++} ++ + static void sp_delete(struct shared_policy *sp, struct sp_node *n) + { + pr_debug("deleting %lx-l%lx\n", n->start, n->end); + rb_erase(&n->nd, &sp->root); +- mpol_put(n->policy); +- kmem_cache_free(sn_cache, n); ++ sp_free(n); + } + + static struct sp_node *sp_alloc(unsigned long start, unsigned long end, + struct mempolicy *pol) + { +- struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); ++ struct sp_node *n; ++ struct mempolicy *newpol; + ++ n = kmem_cache_alloc(sn_cache, GFP_KERNEL); + if (!n) + return NULL; ++ ++ newpol = mpol_dup(pol); ++ if (IS_ERR(newpol)) { ++ kmem_cache_free(sn_cache, n); ++ return NULL; ++ } ++ newpol->flags |= MPOL_F_SHARED; ++ + n->start = start; + n->end = end; +- mpol_get(pol); +- pol->flags |= MPOL_F_SHARED; /* for unref */ +- n->policy = pol; ++ n->policy = newpol; ++ + return n; + } + +@@ -2133,10 +2172,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, + static int shared_policy_replace(struct shared_policy *sp, unsigned long start, + unsigned long end, struct sp_node *new) + { +- struct sp_node *n, *new2 = NULL; ++ struct sp_node *n; ++ int ret = 0; + +-restart: +- spin_lock(&sp->lock); ++ mutex_lock(&sp->mutex); + n = sp_lookup(sp, start, end); + /* Take care of old policies in the same range. */ + while (n && n->start < end) { +@@ -2149,16 +2188,14 @@ restart: + } else { + /* Old policy spanning whole new range. 
*/ + if (n->end > end) { ++ struct sp_node *new2; ++ new2 = sp_alloc(end, n->end, n->policy); + if (!new2) { +- spin_unlock(&sp->lock); +- new2 = sp_alloc(end, n->end, n->policy); +- if (!new2) +- return -ENOMEM; +- goto restart; ++ ret = -ENOMEM; ++ goto out; + } + n->end = start; + sp_insert(sp, new2); +- new2 = NULL; + break; + } else + n->end = start; +@@ -2169,12 +2206,9 @@ restart: + } + if (new) + sp_insert(sp, new); +- spin_unlock(&sp->lock); +- if (new2) { +- mpol_put(new2->policy); +- kmem_cache_free(sn_cache, new2); +- } +- return 0; ++out: ++ mutex_unlock(&sp->mutex); ++ return ret; + } + + /** +@@ -2192,7 +2226,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) + int ret; + + sp->root = RB_ROOT; /* empty tree == default mempolicy */ +- spin_lock_init(&sp->lock); ++ mutex_init(&sp->mutex); + + if (mpol) { + struct vm_area_struct pvma; +@@ -2246,7 +2280,7 @@ int mpol_set_shared_policy(struct shared_policy *info, + } + err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); + if (err && new) +- kmem_cache_free(sn_cache, new); ++ sp_free(new); + return err; + } + +@@ -2258,16 +2292,14 @@ void mpol_free_shared_policy(struct shared_policy *p) + + if (!p->root.rb_node) + return; +- spin_lock(&p->lock); ++ mutex_lock(&p->mutex); + next = rb_first(&p->root); + while (next) { + n = rb_entry(next, struct sp_node, nd); + next = rb_next(&n->nd); +- rb_erase(&n->nd, &p->root); +- mpol_put(n->policy); +- kmem_cache_free(sn_cache, n); ++ sp_delete(p, n); + } +- spin_unlock(&p->lock); ++ mutex_unlock(&p->mutex); + } + + /* assumes fs == KERNEL_DS */ +diff --git a/mm/slab.c b/mm/slab.c +index cd3ab93..4c3b671 100644 +--- a/mm/slab.c ++++ b/mm/slab.c +@@ -1669,9 +1669,6 @@ void __init kmem_cache_init_late(void) + + g_cpucache_up = LATE; + +- /* Annotate slab for lockdep -- annotate the malloc caches */ +- init_lock_keys(); +- + /* 6) resize the head arrays to their final sizes */ + mutex_lock(&cache_chain_mutex); + list_for_each_entry(cachep, &cache_chain, next) +@@ -1679,6 +1676,9 @@ void __init kmem_cache_init_late(void) + BUG(); + mutex_unlock(&cache_chain_mutex); + ++ /* Annotate slab for lockdep -- annotate the malloc caches */ ++ init_lock_keys(); ++ + /* Done! 
*/ + g_cpucache_up = FULL; + +diff --git a/mm/truncate.c b/mm/truncate.c +index 632b15e..00fb58a 100644 +--- a/mm/truncate.c ++++ b/mm/truncate.c +@@ -394,11 +394,12 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) + if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) + return 0; + ++ clear_page_mlock(page); ++ + spin_lock_irq(&mapping->tree_lock); + if (PageDirty(page)) + goto failed; + +- clear_page_mlock(page); + BUG_ON(page_has_private(page)); + __delete_from_page_cache(page); + spin_unlock_irq(&mapping->tree_lock); +diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +index de9da21..d7d63f4 100644 +--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c ++++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +@@ -84,6 +84,14 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, + *dataoff = nhoff + (iph->ihl << 2); + *protonum = iph->protocol; + ++ /* Check bogus IP headers */ ++ if (*dataoff > skb->len) { ++ pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: " ++ "nhoff %u, ihl %u, skblen %u\n", ++ nhoff, iph->ihl << 2, skb->len); ++ return -NF_ACCEPT; ++ } ++ + return NF_ACCEPT; + } + +diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c +index 78844d9..6609a84 100644 +--- a/net/ipv4/netfilter/nf_nat_sip.c ++++ b/net/ipv4/netfilter/nf_nat_sip.c +@@ -148,7 +148,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, + if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, + hdr, NULL, &matchoff, &matchlen, + &addr, &port) > 0) { +- unsigned int matchend, poff, plen, buflen, n; ++ unsigned int olen, matchend, poff, plen, buflen, n; + char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; + + /* We're only interested in headers related to this +@@ -163,11 +163,12 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, + goto next; + } + ++ olen = *datalen; + if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, + &addr, port)) + return NF_DROP; + +- matchend = matchoff + matchlen; ++ matchend = matchoff + matchlen + *datalen - olen; + + /* The maddr= parameter (RFC 2361) specifies where to send + * the reply. 
*/ +@@ -501,7 +502,10 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff, + ret = nf_ct_expect_related(rtcp_exp); + if (ret == 0) + break; +- else if (ret != -EBUSY) { ++ else if (ret == -EBUSY) { ++ nf_ct_unexpect_related(rtp_exp); ++ continue; ++ } else if (ret < 0) { + nf_ct_unexpect_related(rtp_exp); + port = 0; + break; +diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c +index 340c80d..7918eb7 100644 +--- a/net/netfilter/nf_conntrack_expect.c ++++ b/net/netfilter/nf_conntrack_expect.c +@@ -366,23 +366,6 @@ static void evict_oldest_expect(struct nf_conn *master, + } + } + +-static inline int refresh_timer(struct nf_conntrack_expect *i) +-{ +- struct nf_conn_help *master_help = nfct_help(i->master); +- const struct nf_conntrack_expect_policy *p; +- +- if (!del_timer(&i->timeout)) +- return 0; +- +- p = &rcu_dereference_protected( +- master_help->helper, +- lockdep_is_held(&nf_conntrack_lock) +- )->expect_policy[i->class]; +- i->timeout.expires = jiffies + p->timeout * HZ; +- add_timer(&i->timeout); +- return 1; +-} +- + static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) + { + const struct nf_conntrack_expect_policy *p; +@@ -390,7 +373,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) + struct nf_conn *master = expect->master; + struct nf_conn_help *master_help = nfct_help(master); + struct net *net = nf_ct_exp_net(expect); +- struct hlist_node *n; ++ struct hlist_node *n, *next; + unsigned int h; + int ret = 1; + +@@ -401,12 +384,12 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) + goto out; + } + h = nf_ct_expect_dst_hash(&expect->tuple); +- hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { ++ hlist_for_each_entry_safe(i, n, next, &net->ct.expect_hash[h], hnode) { + if (expect_matches(i, expect)) { +- /* Refresh timer: if it's dying, ignore.. */ +- if (refresh_timer(i)) { +- ret = 0; +- goto out; ++ if (del_timer(&i->timeout)) { ++ nf_ct_unlink_expect(i); ++ nf_ct_expect_put(i); ++ break; + } + } else if (expect_clash(i, expect)) { + ret = -EBUSY; +diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c +index dfd52ba..8f3f280 100644 +--- a/net/netfilter/xt_hashlimit.c ++++ b/net/netfilter/xt_hashlimit.c +@@ -389,8 +389,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) + #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) + + /* Precision saver. */ +-static inline u_int32_t +-user2credits(u_int32_t user) ++static u32 user2credits(u32 user) + { + /* If multiplying would overflow... 
*/ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) +@@ -400,7 +399,7 @@ user2credits(u_int32_t user) + return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE; + } + +-static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) ++static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) + { + dh->rateinfo.credit += (now - dh->rateinfo.prev) * CREDITS_PER_JIFFY; + if (dh->rateinfo.credit > dh->rateinfo.credit_cap) +@@ -531,8 +530,7 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) + dh->rateinfo.prev = jiffies; + dh->rateinfo.credit = user2credits(hinfo->cfg.avg * + hinfo->cfg.burst); +- dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg * +- hinfo->cfg.burst); ++ dh->rateinfo.credit_cap = dh->rateinfo.credit; + dh->rateinfo.cost = user2credits(hinfo->cfg.avg); + } else { + /* update expiration timeout */ +diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c +index 32b7a57..a4c1e45 100644 +--- a/net/netfilter/xt_limit.c ++++ b/net/netfilter/xt_limit.c +@@ -88,8 +88,7 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par) + } + + /* Precision saver. */ +-static u_int32_t +-user2credits(u_int32_t user) ++static u32 user2credits(u32 user) + { + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) +@@ -118,12 +117,12 @@ static int limit_mt_check(const struct xt_mtchk_param *par) + + /* For SMP, we only want to use one set of state. */ + r->master = priv; ++ /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * ++ 128. */ ++ priv->prev = jiffies; ++ priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ + if (r->cost == 0) { +- /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * +- 128. */ +- priv->prev = jiffies; +- priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ +- r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ ++ r->credit_cap = priv->credit; /* Credits full. 
*/ + r->cost = user2credits(r->avg); + } + return 0; +diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c +index c5391af..10a385b 100644 +--- a/net/sunrpc/xprtsock.c ++++ b/net/sunrpc/xprtsock.c +@@ -1028,6 +1028,16 @@ static void xs_udp_data_ready(struct sock *sk, int len) + read_unlock_bh(&sk->sk_callback_lock); + } + ++/* ++ * Helper function to force a TCP close if the server is sending ++ * junk and/or it has put us in CLOSE_WAIT ++ */ ++static void xs_tcp_force_close(struct rpc_xprt *xprt) ++{ ++ set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); ++ xprt_force_disconnect(xprt); ++} ++ + static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) + { + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); +@@ -1054,7 +1064,7 @@ static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_rea + /* Sanity check of the record length */ + if (unlikely(transport->tcp_reclen < 8)) { + dprintk("RPC: invalid TCP record fragment length\n"); +- xprt_force_disconnect(xprt); ++ xs_tcp_force_close(xprt); + return; + } + dprintk("RPC: reading TCP record fragment of length %d\n", +@@ -1135,7 +1145,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport, + break; + default: + dprintk("RPC: invalid request message type\n"); +- xprt_force_disconnect(&transport->xprt); ++ xs_tcp_force_close(&transport->xprt); + } + xs_tcp_check_fraghdr(transport); + } +@@ -1458,6 +1468,8 @@ static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) + static void xs_sock_mark_closed(struct rpc_xprt *xprt) + { + smp_mb__before_clear_bit(); ++ clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); ++ clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + clear_bit(XPRT_CLOSING, &xprt->state); + smp_mb__after_clear_bit(); +@@ -1515,8 +1527,8 @@ static void xs_tcp_state_change(struct sock *sk) + break; + case TCP_CLOSE_WAIT: + /* The server initiated a shutdown of the socket */ +- xprt_force_disconnect(xprt); + xprt->connect_cookie++; ++ xs_tcp_force_close(xprt); + case TCP_CLOSING: + /* + * If the server closed down the connection, make sure that +@@ -2159,8 +2171,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) + /* We're probably in TIME_WAIT. 
Get rid of existing socket, + * and retry + */ +- set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); +- xprt_force_disconnect(xprt); ++ xs_tcp_force_close(xprt); + break; + case -ECONNREFUSED: + case -ECONNRESET: +diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include +index d897278..978416d 100644 +--- a/scripts/Kbuild.include ++++ b/scripts/Kbuild.include +@@ -98,24 +98,24 @@ try-run = $(shell set -e; \ + # Usage: cflags-y += $(call as-option,-Wa$(comma)-isa=foo,) + + as-option = $(call try-run,\ +- $(CC) $(KBUILD_CFLAGS) $(1) -c -xassembler /dev/null -o "$$TMP",$(1),$(2)) ++ $(CC) $(KBUILD_CFLAGS) $(1) -c -x assembler /dev/null -o "$$TMP",$(1),$(2)) + + # as-instr + # Usage: cflags-y += $(call as-instr,instr,option1,option2) + + as-instr = $(call try-run,\ +- /bin/echo -e "$(1)" | $(CC) $(KBUILD_AFLAGS) -c -xassembler -o "$$TMP" -,$(2),$(3)) ++ printf "%b\n" "$(1)" | $(CC) $(KBUILD_AFLAGS) -c -x assembler -o "$$TMP" -,$(2),$(3)) + + # cc-option + # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) + + cc-option = $(call try-run,\ +- $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -xc /dev/null -o "$$TMP",$(1),$(2)) ++ $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) + + # cc-option-yn + # Usage: flag := $(call cc-option-yn,-march=winchip-c6) + cc-option-yn = $(call try-run,\ +- $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -xc /dev/null -o "$$TMP",y,n) ++ $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",y,n) + + # cc-option-align + # Prefix align with either -falign or -malign +@@ -125,7 +125,7 @@ cc-option-align = $(subst -functions=0,,\ + # cc-disable-warning + # Usage: cflags-y += $(call cc-disable-warning,unused-but-set-variable) + cc-disable-warning = $(call try-run,\ +- $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) -W$(strip $(1)) -c -xc /dev/null -o "$$TMP",-Wno-$(strip $(1))) ++ $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) + + # cc-version + # Usage gcc-ver := $(call cc-version) +@@ -143,7 +143,7 @@ cc-ifversion = $(shell [ $(call cc-version, $(CC)) $(1) $(2) ] && echo $(3)) + # cc-ldoption + # Usage: ldflags += $(call cc-ldoption, -Wl$(comma)--hash-style=both) + cc-ldoption = $(call try-run,\ +- $(CC) $(1) -nostdlib -xc /dev/null -o "$$TMP",$(1),$(2)) ++ $(CC) $(1) -nostdlib -x c /dev/null -o "$$TMP",$(1),$(2)) + + # ld-option + # Usage: LDFLAGS += $(call ld-option, -X) +@@ -209,7 +209,7 @@ endif + # >$< substitution to preserve $ when reloading .cmd file + # note: when using inline perl scripts [perl -e '...$$t=1;...'] + # in $(cmd_xxx) double $$ your perl vars +-make-cmd = $(subst \#,\\\#,$(subst $$,$$$$,$(call escsq,$(cmd_$(1))))) ++make-cmd = $(subst \\,\\\\,$(subst \#,\\\#,$(subst $$,$$$$,$(call escsq,$(cmd_$(1)))))) + + # Find any prerequisites that is newer than target or that does not exist. + # PHONY targets skipped in both cases. 
+diff --git a/scripts/gcc-version.sh b/scripts/gcc-version.sh +index debecb5..7f2126d 100644 +--- a/scripts/gcc-version.sh ++++ b/scripts/gcc-version.sh +@@ -22,10 +22,10 @@ if [ ${#compiler} -eq 0 ]; then + exit 1 + fi + +-MAJOR=$(echo __GNUC__ | $compiler -E -xc - | tail -n 1) +-MINOR=$(echo __GNUC_MINOR__ | $compiler -E -xc - | tail -n 1) ++MAJOR=$(echo __GNUC__ | $compiler -E -x c - | tail -n 1) ++MINOR=$(echo __GNUC_MINOR__ | $compiler -E -x c - | tail -n 1) + if [ "x$with_patchlevel" != "x" ] ; then +- PATCHLEVEL=$(echo __GNUC_PATCHLEVEL__ | $compiler -E -xc - | tail -n 1) ++ PATCHLEVEL=$(echo __GNUC_PATCHLEVEL__ | $compiler -E -x c - | tail -n 1) + printf "%02d%02d%02d\\n" $MAJOR $MINOR $PATCHLEVEL + else + printf "%02d%02d\\n" $MAJOR $MINOR +diff --git a/scripts/gcc-x86_32-has-stack-protector.sh b/scripts/gcc-x86_32-has-stack-protector.sh +index 29493dc..12dbd0b 100644 +--- a/scripts/gcc-x86_32-has-stack-protector.sh ++++ b/scripts/gcc-x86_32-has-stack-protector.sh +@@ -1,6 +1,6 @@ + #!/bin/sh + +-echo "int foo(void) { char X[200]; return 3; }" | $* -S -xc -c -O0 -fstack-protector - -o - 2> /dev/null | grep -q "%gs" ++echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -c -O0 -fstack-protector - -o - 2> /dev/null | grep -q "%gs" + if [ "$?" -eq "0" ] ; then + echo y + else +diff --git a/scripts/gcc-x86_64-has-stack-protector.sh b/scripts/gcc-x86_64-has-stack-protector.sh +index afaec61..973e8c1 100644 +--- a/scripts/gcc-x86_64-has-stack-protector.sh ++++ b/scripts/gcc-x86_64-has-stack-protector.sh +@@ -1,6 +1,6 @@ + #!/bin/sh + +-echo "int foo(void) { char X[200]; return 3; }" | $* -S -xc -c -O0 -mcmodel=kernel -fstack-protector - -o - 2> /dev/null | grep -q "%gs" ++echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -c -O0 -mcmodel=kernel -fstack-protector - -o - 2> /dev/null | grep -q "%gs" + if [ "$?" -eq "0" ] ; then + echo y + else +diff --git a/scripts/kconfig/check.sh b/scripts/kconfig/check.sh +index fa59cbf..854d9c7 100755 +--- a/scripts/kconfig/check.sh ++++ b/scripts/kconfig/check.sh +@@ -1,6 +1,6 @@ + #!/bin/sh + # Needed for systems without gettext +-$* -xc -o /dev/null - > /dev/null 2>&1 << EOF ++$* -x c -o /dev/null - > /dev/null 2>&1 << EOF + #include + int main() + { +diff --git a/scripts/kconfig/lxdialog/check-lxdialog.sh b/scripts/kconfig/lxdialog/check-lxdialog.sh +index 82cc3a8..50df490 100644 +--- a/scripts/kconfig/lxdialog/check-lxdialog.sh ++++ b/scripts/kconfig/lxdialog/check-lxdialog.sh +@@ -38,7 +38,7 @@ trap "rm -f $tmp" 0 1 2 3 15 + + # Check if we can link to ncurses + check() { +- $cc -xc - -o $tmp 2>/dev/null <<'EOF' ++ $cc -x c - -o $tmp 2>/dev/null <<'EOF' + #include CURSES_LOC + main() {} + EOF +diff --git a/scripts/kconfig/streamline_config.pl b/scripts/kconfig/streamline_config.pl +index bccf07dd..3346f42 100644 +--- a/scripts/kconfig/streamline_config.pl ++++ b/scripts/kconfig/streamline_config.pl +@@ -463,6 +463,8 @@ while() { + if (defined($configs{$1})) { + if ($localyesconfig) { + $setconfigs{$1} = 'y'; ++ print "$1=y\n"; ++ next; + } else { + $setconfigs{$1} = $2; + } +diff --git a/scripts/package/buildtar b/scripts/package/buildtar +index 8a7b155..d0d748e 100644 +--- a/scripts/package/buildtar ++++ b/scripts/package/buildtar +@@ -109,7 +109,7 @@ esac + if tar --owner=root --group=root --help >/dev/null 2>&1; then + opts="--owner=root --group=root" + fi +- tar cf - . 
$opts | ${compress} > "${tarball}${file_ext}" ++ tar cf - boot/* lib/* $opts | ${compress} > "${tarball}${file_ext}" + ) + + echo "Tarball successfully created in ${tarball}${file_ext}" +diff --git a/sound/drivers/aloop.c b/sound/drivers/aloop.c +index d83bafc..193ce81 100644 +--- a/sound/drivers/aloop.c ++++ b/sound/drivers/aloop.c +@@ -119,6 +119,7 @@ struct loopback_pcm { + unsigned int period_size_frac; + unsigned long last_jiffies; + struct timer_list timer; ++ spinlock_t timer_lock; + }; + + static struct platform_device *devices[SNDRV_CARDS]; +@@ -169,6 +170,7 @@ static void loopback_timer_start(struct loopback_pcm *dpcm) + unsigned long tick; + unsigned int rate_shift = get_rate_shift(dpcm); + ++ spin_lock(&dpcm->timer_lock); + if (rate_shift != dpcm->pcm_rate_shift) { + dpcm->pcm_rate_shift = rate_shift; + dpcm->period_size_frac = frac_pos(dpcm, dpcm->pcm_period_size); +@@ -181,12 +183,15 @@ static void loopback_timer_start(struct loopback_pcm *dpcm) + tick = (tick + dpcm->pcm_bps - 1) / dpcm->pcm_bps; + dpcm->timer.expires = jiffies + tick; + add_timer(&dpcm->timer); ++ spin_unlock(&dpcm->timer_lock); + } + + static inline void loopback_timer_stop(struct loopback_pcm *dpcm) + { ++ spin_lock(&dpcm->timer_lock); + del_timer(&dpcm->timer); + dpcm->timer.expires = 0; ++ spin_unlock(&dpcm->timer_lock); + } + + #define CABLE_VALID_PLAYBACK (1 << SNDRV_PCM_STREAM_PLAYBACK) +@@ -659,6 +664,7 @@ static int loopback_open(struct snd_pcm_substream *substream) + dpcm->substream = substream; + setup_timer(&dpcm->timer, loopback_timer_function, + (unsigned long)dpcm); ++ spin_lock_init(&dpcm->timer_lock); + + cable = loopback->cables[substream->number][dev]; + if (!cable) { +diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c +index 402f330..94f0c4a 100644 +--- a/sound/pci/hda/patch_conexant.c ++++ b/sound/pci/hda/patch_conexant.c +@@ -139,6 +139,7 @@ struct conexant_spec { + unsigned int asus:1; + unsigned int pin_eapd_ctrls:1; + unsigned int single_adc_amp:1; ++ unsigned int fixup_stereo_dmic:1; + + unsigned int adc_switching:1; + +@@ -4113,9 +4114,9 @@ static int cx_auto_init(struct hda_codec *codec) + + static int cx_auto_add_volume_idx(struct hda_codec *codec, const char *basename, + const char *dir, int cidx, +- hda_nid_t nid, int hda_dir, int amp_idx) ++ hda_nid_t nid, int hda_dir, int amp_idx, int chs) + { +- static char name[32]; ++ static char name[44]; + static struct snd_kcontrol_new knew[] = { + HDA_CODEC_VOLUME(name, 0, 0, 0), + HDA_CODEC_MUTE(name, 0, 0, 0), +@@ -4125,7 +4126,7 @@ static int cx_auto_add_volume_idx(struct hda_codec *codec, const char *basename, + + for (i = 0; i < 2; i++) { + struct snd_kcontrol *kctl; +- knew[i].private_value = HDA_COMPOSE_AMP_VAL(nid, 3, amp_idx, ++ knew[i].private_value = HDA_COMPOSE_AMP_VAL(nid, chs, amp_idx, + hda_dir); + knew[i].subdevice = HDA_SUBDEV_AMP_FLAG; + knew[i].index = cidx; +@@ -4144,7 +4145,7 @@ static int cx_auto_add_volume_idx(struct hda_codec *codec, const char *basename, + } + + #define cx_auto_add_volume(codec, str, dir, cidx, nid, hda_dir) \ +- cx_auto_add_volume_idx(codec, str, dir, cidx, nid, hda_dir, 0) ++ cx_auto_add_volume_idx(codec, str, dir, cidx, nid, hda_dir, 0, 3) + + #define cx_auto_add_pb_volume(codec, nid, str, idx) \ + cx_auto_add_volume(codec, str, " Playback", idx, nid, HDA_OUTPUT) +@@ -4214,6 +4215,36 @@ static int cx_auto_build_output_controls(struct hda_codec *codec) + return 0; + } + ++/* Returns zero if this is a normal stereo channel, and non-zero if it should ++ be split in 
two independent channels. ++ dest_label must be at least 44 characters. */ ++static int cx_auto_get_rightch_label(struct hda_codec *codec, const char *label, ++ char *dest_label, int nid) ++{ ++ struct conexant_spec *spec = codec->spec; ++ int i; ++ ++ if (!spec->fixup_stereo_dmic) ++ return 0; ++ ++ for (i = 0; i < AUTO_CFG_MAX_INS; i++) { ++ int def_conf; ++ if (spec->autocfg.inputs[i].pin != nid) ++ continue; ++ ++ if (spec->autocfg.inputs[i].type != AUTO_PIN_MIC) ++ return 0; ++ def_conf = snd_hda_codec_get_pincfg(codec, nid); ++ if (snd_hda_get_input_pin_attr(def_conf) != INPUT_PIN_ATTR_INT) ++ return 0; ++ ++ /* Finally found the inverted internal mic! */ ++ snprintf(dest_label, 44, "Inverted %s", label); ++ return 1; ++ } ++ return 0; ++} ++ + static int cx_auto_add_capture_volume(struct hda_codec *codec, hda_nid_t nid, + const char *label, const char *pfx, + int cidx) +@@ -4222,14 +4253,25 @@ static int cx_auto_add_capture_volume(struct hda_codec *codec, hda_nid_t nid, + int i; + + for (i = 0; i < spec->num_adc_nids; i++) { ++ char rightch_label[44]; + hda_nid_t adc_nid = spec->adc_nids[i]; + int idx = get_input_connection(codec, adc_nid, nid); + if (idx < 0) + continue; + if (spec->single_adc_amp) + idx = 0; ++ ++ if (cx_auto_get_rightch_label(codec, label, rightch_label, nid)) { ++ /* Make two independent kcontrols for left and right */ ++ int err = cx_auto_add_volume_idx(codec, label, pfx, ++ cidx, adc_nid, HDA_INPUT, idx, 1); ++ if (err < 0) ++ return err; ++ return cx_auto_add_volume_idx(codec, rightch_label, pfx, ++ cidx, adc_nid, HDA_INPUT, idx, 2); ++ } + return cx_auto_add_volume_idx(codec, label, pfx, +- cidx, adc_nid, HDA_INPUT, idx); ++ cidx, adc_nid, HDA_INPUT, idx, 3); + } + return 0; + } +@@ -4242,9 +4284,19 @@ static int cx_auto_add_boost_volume(struct hda_codec *codec, int idx, + int i, con; + + nid = spec->imux_info[idx].pin; +- if (get_wcaps(codec, nid) & AC_WCAP_IN_AMP) ++ if (get_wcaps(codec, nid) & AC_WCAP_IN_AMP) { ++ char rightch_label[44]; ++ if (cx_auto_get_rightch_label(codec, label, rightch_label, nid)) { ++ int err = cx_auto_add_volume_idx(codec, label, " Boost", ++ cidx, nid, HDA_INPUT, 0, 1); ++ if (err < 0) ++ return err; ++ return cx_auto_add_volume_idx(codec, rightch_label, " Boost", ++ cidx, nid, HDA_INPUT, 0, 2); ++ } + return cx_auto_add_volume(codec, label, " Boost", cidx, + nid, HDA_INPUT); ++ } + con = __select_input_connection(codec, spec->imux_info[idx].adc, nid, + &mux, false, 0); + if (con < 0) +@@ -4398,23 +4450,31 @@ static void apply_pincfg(struct hda_codec *codec, const struct cxt_pincfg *cfg) + + } + +-static void apply_pin_fixup(struct hda_codec *codec, ++enum { ++ CXT_PINCFG_LENOVO_X200, ++ CXT_PINCFG_LENOVO_TP410, ++ CXT_FIXUP_STEREO_DMIC, ++}; ++ ++static void apply_fixup(struct hda_codec *codec, + const struct snd_pci_quirk *quirk, + const struct cxt_pincfg **table) + { ++ struct conexant_spec *spec = codec->spec; ++ + quirk = snd_pci_quirk_lookup(codec->bus->pci, quirk); +- if (quirk) { ++ if (quirk && table[quirk->value]) { + snd_printdd(KERN_INFO "hda_codec: applying pincfg for %s\n", + quirk->name); + apply_pincfg(codec, table[quirk->value]); + } ++ if (quirk->value == CXT_FIXUP_STEREO_DMIC) { ++ snd_printdd(KERN_INFO "hda_codec: applying internal mic workaround for %s\n", ++ quirk->name); ++ spec->fixup_stereo_dmic = 1; ++ } + } + +-enum { +- CXT_PINCFG_LENOVO_X200, +- CXT_PINCFG_LENOVO_TP410, +-}; +- + /* ThinkPad X200 & co with cxt5051 */ + static const struct cxt_pincfg cxt_pincfg_lenovo_x200[] = { + { 0x16, 0x042140ff 
}, /* HP (seq# overridden) */ +@@ -4434,6 +4494,7 @@ static const struct cxt_pincfg cxt_pincfg_lenovo_tp410[] = { + static const struct cxt_pincfg *cxt_pincfg_tbl[] = { + [CXT_PINCFG_LENOVO_X200] = cxt_pincfg_lenovo_x200, + [CXT_PINCFG_LENOVO_TP410] = cxt_pincfg_lenovo_tp410, ++ [CXT_FIXUP_STEREO_DMIC] = NULL, + }; + + static const struct snd_pci_quirk cxt5051_fixups[] = { +@@ -4447,6 +4508,9 @@ static const struct snd_pci_quirk cxt5066_fixups[] = { + SND_PCI_QUIRK(0x17aa, 0x215f, "Lenovo T510", CXT_PINCFG_LENOVO_TP410), + SND_PCI_QUIRK(0x17aa, 0x21ce, "Lenovo T420", CXT_PINCFG_LENOVO_TP410), + SND_PCI_QUIRK(0x17aa, 0x21cf, "Lenovo T520", CXT_PINCFG_LENOVO_TP410), ++ SND_PCI_QUIRK(0x17aa, 0x3975, "Lenovo U300s", CXT_FIXUP_STEREO_DMIC), ++ SND_PCI_QUIRK(0x17aa, 0x3977, "Lenovo IdeaPad U310", CXT_FIXUP_STEREO_DMIC), ++ SND_PCI_QUIRK(0x17aa, 0x397b, "Lenovo S205", CXT_FIXUP_STEREO_DMIC), + {} + }; + +@@ -4486,10 +4550,10 @@ static int patch_conexant_auto(struct hda_codec *codec) + break; + case 0x14f15051: + add_cx5051_fake_mutes(codec); +- apply_pin_fixup(codec, cxt5051_fixups, cxt_pincfg_tbl); ++ apply_fixup(codec, cxt5051_fixups, cxt_pincfg_tbl); + break; + default: +- apply_pin_fixup(codec, cxt5066_fixups, cxt_pincfg_tbl); ++ apply_fixup(codec, cxt5066_fixups, cxt_pincfg_tbl); + break; + } + +diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c +index 323d4d9..0961d88 100644 +--- a/tools/hv/hv_kvp_daemon.c ++++ b/tools/hv/hv_kvp_daemon.c +@@ -348,7 +348,7 @@ int main(void) + fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); + if (fd < 0) { + syslog(LOG_ERR, "netlink socket creation failed; error:%d", fd); +- exit(-1); ++ exit(EXIT_FAILURE); + } + addr.nl_family = AF_NETLINK; + addr.nl_pad = 0; +@@ -360,7 +360,7 @@ int main(void) + if (error < 0) { + syslog(LOG_ERR, "bind failed; error:%d", error); + close(fd); +- exit(-1); ++ exit(EXIT_FAILURE); + } + sock_opt = addr.nl_groups; + setsockopt(fd, 270, 1, &sock_opt, sizeof(sock_opt)); +@@ -378,7 +378,7 @@ int main(void) + if (len < 0) { + syslog(LOG_ERR, "netlink_send failed; error:%d", len); + close(fd); +- exit(-1); ++ exit(EXIT_FAILURE); + } + + pfd.fd = fd; +@@ -497,7 +497,7 @@ int main(void) + len = netlink_send(fd, incoming_cn_msg); + if (len < 0) { + syslog(LOG_ERR, "net_link send failed; error:%d", len); +- exit(-1); ++ exit(EXIT_FAILURE); + } + } + +diff --git a/tools/perf/Makefile b/tools/perf/Makefile +index b98e307..e45d2b1 100644 +--- a/tools/perf/Makefile ++++ b/tools/perf/Makefile +@@ -56,7 +56,7 @@ ifeq ($(ARCH),x86_64) + ARCH := x86 + IS_X86_64 := 0 + ifeq (, $(findstring m32,$(EXTRA_CFLAGS))) +- IS_X86_64 := $(shell echo __x86_64__ | ${CC} -E -xc - | tail -n 1) ++ IS_X86_64 := $(shell echo __x86_64__ | ${CC} -E -x c - | tail -n 1) + endif + ifeq (${IS_X86_64}, 1) + RAW_ARCH := x86_64 +diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile +index e8a03ac..7db8da5 100644 +--- a/tools/power/cpupower/Makefile ++++ b/tools/power/cpupower/Makefile +@@ -100,7 +100,7 @@ GMO_FILES = ${shell for HLANG in ${LANGUAGES}; do echo po/$$HLANG.gmo; done;} + export CROSS CC AR STRIP RANLIB CFLAGS LDFLAGS LIB_OBJS + + # check if compiler option is supported +-cc-supports = ${shell if $(CC) ${1} -S -o /dev/null -xc /dev/null > /dev/null 2>&1; then echo "$(1)"; fi;} ++cc-supports = ${shell if $(CC) ${1} -S -o /dev/null -x c /dev/null > /dev/null 2>&1; then echo "$(1)"; fi;} + + # use '-Os' optimization if available, else use -O2 + OPTIMIZATION := $(call cc-supports,-Os,-O2) diff --git 
a/3.2.34/bump/1032_linux-3.2.33.patch b/3.2.34/bump/1032_linux-3.2.33.patch new file mode 100644 index 0000000..c32fb75 --- /dev/null +++ b/3.2.34/bump/1032_linux-3.2.33.patch @@ -0,0 +1,3450 @@ +diff --git a/Makefile b/Makefile +index b6d8282..63ca1ea2 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 2 +-SUBLEVEL = 32 ++SUBLEVEL = 33 + EXTRAVERSION = + NAME = Saber-toothed Squirrel + +diff --git a/arch/arm/include/asm/vfpmacros.h b/arch/arm/include/asm/vfpmacros.h +index 3d5fc41..bf53047 100644 +--- a/arch/arm/include/asm/vfpmacros.h ++++ b/arch/arm/include/asm/vfpmacros.h +@@ -28,7 +28,7 @@ + ldr \tmp, =elf_hwcap @ may not have MVFR regs + ldr \tmp, [\tmp, #0] + tst \tmp, #HWCAP_VFPv3D16 +- ldceq p11, cr0, [\base],#32*4 @ FLDMIAD \base!, {d16-d31} ++ ldceql p11, cr0, [\base],#32*4 @ FLDMIAD \base!, {d16-d31} + addne \base, \base, #32*4 @ step over unused register space + #else + VFPFMRX \tmp, MVFR0 @ Media and VFP Feature Register 0 +@@ -52,7 +52,7 @@ + ldr \tmp, =elf_hwcap @ may not have MVFR regs + ldr \tmp, [\tmp, #0] + tst \tmp, #HWCAP_VFPv3D16 +- stceq p11, cr0, [\base],#32*4 @ FSTMIAD \base!, {d16-d31} ++ stceql p11, cr0, [\base],#32*4 @ FSTMIAD \base!, {d16-d31} + addne \base, \base, #32*4 @ step over unused register space + #else + VFPFMRX \tmp, MVFR0 @ Media and VFP Feature Register 0 +diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c +index 1d1710e..bfa0eeb 100644 +--- a/arch/arm/kernel/smp.c ++++ b/arch/arm/kernel/smp.c +@@ -295,18 +295,24 @@ static void __cpuinit smp_store_cpu_info(unsigned int cpuid) + asmlinkage void __cpuinit secondary_start_kernel(void) + { + struct mm_struct *mm = &init_mm; +- unsigned int cpu = smp_processor_id(); ++ unsigned int cpu; ++ ++ /* ++ * The identity mapping is uncached (strongly ordered), so ++ * switch away from it before attempting any exclusive accesses. ++ */ ++ cpu_switch_mm(mm->pgd, mm); ++ enter_lazy_tlb(mm, current); ++ local_flush_tlb_all(); + + /* + * All kernel threads share the same mm context; grab a + * reference and switch to it. + */ ++ cpu = smp_processor_id(); + atomic_inc(&mm->mm_count); + current->active_mm = mm; + cpumask_set_cpu(cpu, mm_cpumask(mm)); +- cpu_switch_mm(mm->pgd, mm); +- enter_lazy_tlb(mm, current); +- local_flush_tlb_all(); + + printk("CPU%u: Booted secondary processor\n", cpu); + +diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c +index f4546e9..23817a6 100644 +--- a/arch/mips/kernel/kgdb.c ++++ b/arch/mips/kernel/kgdb.c +@@ -283,6 +283,15 @@ static int kgdb_mips_notify(struct notifier_block *self, unsigned long cmd, + struct pt_regs *regs = args->regs; + int trap = (regs->cp0_cause & 0x7c) >> 2; + ++#ifdef CONFIG_KPROBES ++ /* ++ * Return immediately if the kprobes fault notifier has set ++ * DIE_PAGE_FAULT. ++ */ ++ if (cmd == DIE_PAGE_FAULT) ++ return NOTIFY_DONE; ++#endif /* CONFIG_KPROBES */ ++ + /* Userspace events, ignore. 
*/ + if (user_mode(regs)) + return NOTIFY_DONE; +diff --git a/arch/s390/boot/compressed/vmlinux.lds.S b/arch/s390/boot/compressed/vmlinux.lds.S +index d80f79d..8e1fb82 100644 +--- a/arch/s390/boot/compressed/vmlinux.lds.S ++++ b/arch/s390/boot/compressed/vmlinux.lds.S +@@ -5,7 +5,7 @@ OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") + OUTPUT_ARCH(s390:64-bit) + #else + OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") +-OUTPUT_ARCH(s390) ++OUTPUT_ARCH(s390:31-bit) + #endif + + ENTRY(startup) +diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S +index e4c79eb..e43d21e 100644 +--- a/arch/s390/kernel/vmlinux.lds.S ++++ b/arch/s390/kernel/vmlinux.lds.S +@@ -8,7 +8,7 @@ + + #ifndef CONFIG_64BIT + OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") +-OUTPUT_ARCH(s390) ++OUTPUT_ARCH(s390:31-bit) + ENTRY(_start) + jiffies = jiffies_64 + 4; + #else +diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c +index 614da62..3c8f220 100644 +--- a/arch/sparc/kernel/perf_event.c ++++ b/arch/sparc/kernel/perf_event.c +@@ -555,11 +555,13 @@ static u64 nop_for_index(int idx) + + static inline void sparc_pmu_enable_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc, int idx) + { +- u64 val, mask = mask_for_index(idx); ++ u64 enc, val, mask = mask_for_index(idx); ++ ++ enc = perf_event_get_enc(cpuc->events[idx]); + + val = cpuc->pcr; + val &= ~mask; +- val |= hwc->config; ++ val |= event_encoding(enc, idx); + cpuc->pcr = val; + + pcr_ops->write(cpuc->pcr); +@@ -1422,8 +1424,6 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, + { + unsigned long ufp; + +- perf_callchain_store(entry, regs->tpc); +- + ufp = regs->u_regs[UREG_I6] + STACK_BIAS; + do { + struct sparc_stackf *usf, sf; +@@ -1444,8 +1444,6 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, + { + unsigned long ufp; + +- perf_callchain_store(entry, regs->tpc); +- + ufp = regs->u_regs[UREG_I6] & 0xffffffffUL; + do { + struct sparc_stackf32 *usf, sf; +@@ -1464,6 +1462,11 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, + void + perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) + { ++ perf_callchain_store(entry, regs->tpc); ++ ++ if (!current->mm) ++ return; ++ + flushw_user(); + if (test_thread_flag(TIF_32BIT)) + perf_callchain_user_32(entry, regs); +diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c +index 441521a..5e4252b 100644 +--- a/arch/sparc/kernel/sys_sparc_64.c ++++ b/arch/sparc/kernel/sys_sparc_64.c +@@ -519,12 +519,12 @@ SYSCALL_DEFINE1(sparc64_personality, unsigned long, personality) + { + int ret; + +- if (current->personality == PER_LINUX32 && +- personality == PER_LINUX) +- personality = PER_LINUX32; ++ if (personality(current->personality) == PER_LINUX32 && ++ personality(personality) == PER_LINUX) ++ personality |= PER_LINUX32; + ret = sys_personality(personality); +- if (ret == PER_LINUX32) +- ret = PER_LINUX; ++ if (personality(ret) == PER_LINUX32) ++ ret &= ~PER_LINUX32; + + return ret; + } +diff --git a/arch/sparc/kernel/syscalls.S b/arch/sparc/kernel/syscalls.S +index 1d7e274..7f5f65d 100644 +--- a/arch/sparc/kernel/syscalls.S ++++ b/arch/sparc/kernel/syscalls.S +@@ -212,24 +212,20 @@ linux_sparc_syscall: + 3: stx %o0, [%sp + PTREGS_OFF + PT_V9_I0] + ret_sys_call: + ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %g3 +- ldx [%sp + PTREGS_OFF + PT_V9_TNPC], %l1 ! 
pc = npc + sra %o0, 0, %o0 + mov %ulo(TSTATE_XCARRY | TSTATE_ICARRY), %g2 + sllx %g2, 32, %g2 + +- /* Check if force_successful_syscall_return() +- * was invoked. +- */ +- ldub [%g6 + TI_SYS_NOERROR], %l2 +- brnz,a,pn %l2, 80f +- stb %g0, [%g6 + TI_SYS_NOERROR] +- + cmp %o0, -ERESTART_RESTARTBLOCK + bgeu,pn %xcc, 1f +- andcc %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT), %l6 +-80: ++ andcc %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT), %g0 ++ ldx [%sp + PTREGS_OFF + PT_V9_TNPC], %l1 ! pc = npc ++ ++2: ++ stb %g0, [%g6 + TI_SYS_NOERROR] + /* System call success, clear Carry condition code. */ + andn %g3, %g2, %g3 ++3: + stx %g3, [%sp + PTREGS_OFF + PT_V9_TSTATE] + bne,pn %icc, linux_syscall_trace2 + add %l1, 0x4, %l2 ! npc = npc+4 +@@ -238,20 +234,20 @@ ret_sys_call: + stx %l2, [%sp + PTREGS_OFF + PT_V9_TNPC] + + 1: ++ /* Check if force_successful_syscall_return() ++ * was invoked. ++ */ ++ ldub [%g6 + TI_SYS_NOERROR], %l2 ++ brnz,pn %l2, 2b ++ ldx [%sp + PTREGS_OFF + PT_V9_TNPC], %l1 ! pc = npc + /* System call failure, set Carry condition code. + * Also, get abs(errno) to return to the process. + */ +- andcc %l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT|_TIF_SYSCALL_TRACEPOINT), %l6 + sub %g0, %o0, %o0 +- or %g3, %g2, %g3 + stx %o0, [%sp + PTREGS_OFF + PT_V9_I0] +- stx %g3, [%sp + PTREGS_OFF + PT_V9_TSTATE] +- bne,pn %icc, linux_syscall_trace2 +- add %l1, 0x4, %l2 ! npc = npc+4 +- stx %l1, [%sp + PTREGS_OFF + PT_V9_TPC] ++ ba,pt %xcc, 3b ++ or %g3, %g2, %g3 + +- b,pt %xcc, rtrap +- stx %l2, [%sp + PTREGS_OFF + PT_V9_TNPC] + linux_syscall_trace2: + call syscall_trace_leave + add %sp, PTREGS_OFF, %o0 +diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c +index 8e073d8..6ff4d78 100644 +--- a/arch/sparc/mm/init_64.c ++++ b/arch/sparc/mm/init_64.c +@@ -2118,6 +2118,9 @@ EXPORT_SYMBOL(_PAGE_CACHE); + #ifdef CONFIG_SPARSEMEM_VMEMMAP + unsigned long vmemmap_table[VMEMMAP_SIZE]; + ++static long __meminitdata addr_start, addr_end; ++static int __meminitdata node_start; ++ + int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node) + { + unsigned long vstart = (unsigned long) start; +@@ -2148,15 +2151,30 @@ int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node) + + *vmem_pp = pte_base | __pa(block); + +- printk(KERN_INFO "[%p-%p] page_structs=%lu " +- "node=%d entry=%lu/%lu\n", start, block, nr, +- node, +- addr >> VMEMMAP_CHUNK_SHIFT, +- VMEMMAP_SIZE); ++ /* check to see if we have contiguous blocks */ ++ if (addr_end != addr || node_start != node) { ++ if (addr_start) ++ printk(KERN_DEBUG " [%lx-%lx] on node %d\n", ++ addr_start, addr_end-1, node_start); ++ addr_start = addr; ++ node_start = node; ++ } ++ addr_end = addr + VMEMMAP_CHUNK; + } + } + return 0; + } ++ ++void __meminit vmemmap_populate_print_last(void) ++{ ++ if (addr_start) { ++ printk(KERN_DEBUG " [%lx-%lx] on node %d\n", ++ addr_start, addr_end-1, node_start); ++ addr_start = 0; ++ addr_end = 0; ++ node_start = 0; ++ } ++} + #endif /* CONFIG_SPARSEMEM_VMEMMAP */ + + static void prot_init_common(unsigned long page_none, +diff --git a/arch/tile/Makefile b/arch/tile/Makefile +index 17acce7..04c637c 100644 +--- a/arch/tile/Makefile ++++ b/arch/tile/Makefile +@@ -26,6 +26,10 @@ $(error Set TILERA_ROOT or CROSS_COMPILE when building $(ARCH) on $(HOST_ARCH)) + endif + endif + ++# The tile compiler may emit .eh_frame information for backtracing. 
++# In kernel modules, this causes load failures due to unsupported relocations. ++KBUILD_CFLAGS += -fno-asynchronous-unwind-tables ++ + ifneq ($(CONFIG_DEBUG_EXTRA_FLAGS),"") + KBUILD_CFLAGS += $(CONFIG_DEBUG_EXTRA_FLAGS) + endif +diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S +index bcda816..4893d58 100644 +--- a/arch/x86/kernel/entry_32.S ++++ b/arch/x86/kernel/entry_32.S +@@ -1025,7 +1025,7 @@ ENTRY(xen_sysenter_target) + + ENTRY(xen_hypervisor_callback) + CFI_STARTPROC +- pushl_cfi $0 ++ pushl_cfi $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + TRACE_IRQS_OFF + +@@ -1067,14 +1067,16 @@ ENTRY(xen_failsafe_callback) + 2: mov 8(%esp),%es + 3: mov 12(%esp),%fs + 4: mov 16(%esp),%gs ++ /* EAX == 0 => Category 1 (Bad segment) ++ EAX != 0 => Category 2 (Bad IRET) */ + testl %eax,%eax + popl_cfi %eax + lea 16(%esp),%esp + CFI_ADJUST_CFA_OFFSET -16 + jz 5f + addl $16,%esp +- jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) +-5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment) ++ jmp iret_exc ++5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + jmp ret_from_exception + CFI_ENDPROC +diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S +index faf8d5e..6274f5f 100644 +--- a/arch/x86/kernel/entry_64.S ++++ b/arch/x86/kernel/entry_64.S +@@ -1303,7 +1303,7 @@ ENTRY(xen_failsafe_callback) + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 +- pushq_cfi $0 ++ pushq_cfi $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + jmp error_exit + CFI_ENDPROC +diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c +index 75f9528..6bc0899 100644 +--- a/arch/x86/oprofile/nmi_int.c ++++ b/arch/x86/oprofile/nmi_int.c +@@ -55,7 +55,7 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + val |= counter_config->extra; + event &= model->event_mask ? model->event_mask : 0xFF; + val |= event & 0xFF; +- val |= (event & 0x0F00) << 24; ++ val |= (u64)(event & 0x0F00) << 24; + + return val; + } +diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c +index a1e21ae..69b9ef6 100644 +--- a/arch/x86/xen/enlighten.c ++++ b/arch/x86/xen/enlighten.c +@@ -818,7 +818,16 @@ static void xen_write_cr4(unsigned long cr4) + + native_write_cr4(cr4); + } +- ++#ifdef CONFIG_X86_64 ++static inline unsigned long xen_read_cr8(void) ++{ ++ return 0; ++} ++static inline void xen_write_cr8(unsigned long val) ++{ ++ BUG_ON(val); ++} ++#endif + static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) + { + int ret; +@@ -987,6 +996,11 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { + .read_cr4_safe = native_read_cr4_safe, + .write_cr4 = xen_write_cr4, + ++#ifdef CONFIG_X86_64 ++ .read_cr8 = xen_read_cr8, ++ .write_cr8 = xen_write_cr8, ++#endif ++ + .wbinvd = native_wbinvd, + + .read_msr = native_read_msr_safe, +@@ -997,6 +1011,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + ++ .read_tscp = native_read_tscp, ++ + .iret = xen_iret, + .irq_enable_sysexit = xen_sysexit, + #ifdef CONFIG_X86_64 +diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c +index b19a18d..d2519b2 100644 +--- a/drivers/acpi/ec.c ++++ b/drivers/acpi/ec.c +@@ -71,9 +71,6 @@ enum ec_command { + #define ACPI_EC_UDELAY_GLK 1000 /* Wait 1ms max. 
to get global lock */ + #define ACPI_EC_MSI_UDELAY 550 /* Wait 550us for MSI EC */ + +-#define ACPI_EC_STORM_THRESHOLD 8 /* number of false interrupts +- per one transaction */ +- + enum { + EC_FLAGS_QUERY_PENDING, /* Query is pending */ + EC_FLAGS_GPE_STORM, /* GPE storm detected */ +@@ -87,6 +84,15 @@ static unsigned int ec_delay __read_mostly = ACPI_EC_DELAY; + module_param(ec_delay, uint, 0644); + MODULE_PARM_DESC(ec_delay, "Timeout(ms) waited until an EC command completes"); + ++/* ++ * If the number of false interrupts per one transaction exceeds ++ * this threshold, will think there is a GPE storm happened and ++ * will disable the GPE for normal transaction. ++ */ ++static unsigned int ec_storm_threshold __read_mostly = 8; ++module_param(ec_storm_threshold, uint, 0644); ++MODULE_PARM_DESC(ec_storm_threshold, "Maxim false GPE numbers not considered as GPE storm"); ++ + /* If we find an EC via the ECDT, we need to keep a ptr to its context */ + /* External interfaces use first EC only, so remember */ + typedef int (*acpi_ec_query_func) (void *data); +@@ -319,7 +325,7 @@ static int acpi_ec_transaction(struct acpi_ec *ec, struct transaction *t) + msleep(1); + /* It is safe to enable the GPE outside of the transaction. */ + acpi_enable_gpe(NULL, ec->gpe); +- } else if (t->irq_count > ACPI_EC_STORM_THRESHOLD) { ++ } else if (t->irq_count > ec_storm_threshold) { + pr_info(PREFIX "GPE storm detected, " + "transactions will use polling mode\n"); + set_bit(EC_FLAGS_GPE_STORM, &ec->flags); +@@ -914,6 +920,17 @@ static int ec_flag_msi(const struct dmi_system_id *id) + return 0; + } + ++/* ++ * Clevo M720 notebook actually works ok with IRQ mode, if we lifted ++ * the GPE storm threshold back to 20 ++ */ ++static int ec_enlarge_storm_threshold(const struct dmi_system_id *id) ++{ ++ pr_debug("Setting the EC GPE storm threshold to 20\n"); ++ ec_storm_threshold = 20; ++ return 0; ++} ++ + static struct dmi_system_id __initdata ec_dmi_table[] = { + { + ec_skip_dsdt_scan, "Compal JFL92", { +@@ -945,10 +962,13 @@ static struct dmi_system_id __initdata ec_dmi_table[] = { + { + ec_validate_ecdt, "ASUS hardware", { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer Inc.") }, NULL}, ++ { ++ ec_enlarge_storm_threshold, "CLEVO hardware", { ++ DMI_MATCH(DMI_SYS_VENDOR, "CLEVO Co."), ++ DMI_MATCH(DMI_PRODUCT_NAME, "M720T/M730T"),}, NULL}, + {}, + }; + +- + int __init acpi_ec_ecdt_probe(void) + { + acpi_status status; +diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c +index 10f92b3..7a987a7 100644 +--- a/drivers/bcma/main.c ++++ b/drivers/bcma/main.c +@@ -124,9 +124,10 @@ static int bcma_register_cores(struct bcma_bus *bus) + + static void bcma_unregister_cores(struct bcma_bus *bus) + { +- struct bcma_device *core; ++ struct bcma_device *core, *tmp; + +- list_for_each_entry(core, &bus->cores, list) { ++ list_for_each_entry_safe(core, tmp, &bus->cores, list) { ++ list_del(&core->list); + if (core->dev_registered) + device_unregister(&core->dev); + } +diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c +index b366b34..0d91655 100644 +--- a/drivers/char/tpm/tpm.c ++++ b/drivers/char/tpm/tpm.c +@@ -1072,17 +1072,20 @@ ssize_t tpm_write(struct file *file, const char __user *buf, + size_t size, loff_t *off) + { + struct tpm_chip *chip = file->private_data; +- size_t in_size = size, out_size; ++ size_t in_size = size; ++ ssize_t out_size; + + /* cannot perform a write until the read has cleared +- either via tpm_read or a user_read_timer timeout */ +- while (atomic_read(&chip->data_pending) != 0) +- 
msleep(TPM_TIMEOUT); +- +- mutex_lock(&chip->buffer_mutex); ++ either via tpm_read or a user_read_timer timeout. ++ This also prevents splitted buffered writes from blocking here. ++ */ ++ if (atomic_read(&chip->data_pending) != 0) ++ return -EBUSY; + + if (in_size > TPM_BUFSIZE) +- in_size = TPM_BUFSIZE; ++ return -E2BIG; ++ ++ mutex_lock(&chip->buffer_mutex); + + if (copy_from_user + (chip->data_buffer, (void __user *) buf, in_size)) { +@@ -1092,6 +1095,10 @@ ssize_t tpm_write(struct file *file, const char __user *buf, + + /* atomic tpm command send and result receive */ + out_size = tpm_transmit(chip, chip->data_buffer, TPM_BUFSIZE); ++ if (out_size < 0) { ++ mutex_unlock(&chip->buffer_mutex); ++ return out_size; ++ } + + atomic_set(&chip->data_pending, out_size); + mutex_unlock(&chip->buffer_mutex); +diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c +index b7fe343..f6cd315 100644 +--- a/drivers/cpufreq/powernow-k8.c ++++ b/drivers/cpufreq/powernow-k8.c +@@ -1216,14 +1216,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, + struct powernowk8_target_arg pta = { .pol = pol, .targfreq = targfreq, + .relation = relation }; + +- /* +- * Must run on @pol->cpu. cpufreq core is responsible for ensuring +- * that we're bound to the current CPU and pol->cpu stays online. +- */ +- if (smp_processor_id() == pol->cpu) +- return powernowk8_target_fn(&pta); +- else +- return work_on_cpu(pol->cpu, powernowk8_target_fn, &pta); ++ return work_on_cpu(pol->cpu, powernowk8_target_fn, &pta); + } + + /* Driver entry point to verify the policy and range of frequencies */ +diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c +index c9eee6d..a9d5482 100644 +--- a/drivers/edac/amd64_edac.c ++++ b/drivers/edac/amd64_edac.c +@@ -170,8 +170,11 @@ static int __amd64_set_scrub_rate(struct pci_dev *ctl, u32 new_bw, u32 min_rate) + * memory controller and apply to register. Search for the first + * bandwidth entry that is greater or equal than the setting requested + * and program that. If at last entry, turn off DRAM scrubbing. ++ * ++ * If no suitable bandwidth is found, turn off DRAM scrubbing entirely ++ * by falling back to the last element in scrubrates[]. + */ +- for (i = 0; i < ARRAY_SIZE(scrubrates); i++) { ++ for (i = 0; i < ARRAY_SIZE(scrubrates) - 1; i++) { + /* + * skip scrub rates which aren't recommended + * (see F10 BKDG, F3x58) +@@ -181,12 +184,6 @@ static int __amd64_set_scrub_rate(struct pci_dev *ctl, u32 new_bw, u32 min_rate) + + if (scrubrates[i].bandwidth <= new_bw) + break; +- +- /* +- * if no suitable bandwidth found, turn off DRAM scrubbing +- * entirely by falling back to the last element in the +- * scrubrates array. 
+- */ + } + + scrubval = scrubrates[i].scrubval; +diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c +index 33e1555..dbe4dbe 100644 +--- a/drivers/gpu/drm/i915/i915_gem.c ++++ b/drivers/gpu/drm/i915/i915_gem.c +@@ -999,6 +999,7 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, + if (obj->phys_obj) + ret = i915_gem_phys_pwrite(dev, obj, args, file); + else if (obj->gtt_space && ++ obj->tiling_mode == I915_TILING_NONE && + obj->base.write_domain != I915_GEM_DOMAIN_CPU) { + ret = i915_gem_object_pin(obj, 0, true); + if (ret) +diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c +index f07bde2..57152a7 100644 +--- a/drivers/gpu/drm/i915/intel_lvds.c ++++ b/drivers/gpu/drm/i915/intel_lvds.c +@@ -771,6 +771,14 @@ static const struct dmi_system_id intel_no_lvds[] = { + DMI_MATCH(DMI_BOARD_NAME, "MS-7469"), + }, + }, ++ { ++ .callback = intel_no_lvds_dmi_callback, ++ .ident = "ZOTAC ZBOXSD-ID12/ID13", ++ .matches = { ++ DMI_MATCH(DMI_BOARD_VENDOR, "ZOTAC"), ++ DMI_MATCH(DMI_BOARD_NAME, "ZBOXSD-ID12/ID13"), ++ }, ++ }, + + { } /* terminating entry */ + }; +diff --git a/drivers/gpu/drm/radeon/radeon_legacy_encoders.c b/drivers/gpu/drm/radeon/radeon_legacy_encoders.c +index 2f46e0c..3ad3cc6 100644 +--- a/drivers/gpu/drm/radeon/radeon_legacy_encoders.c ++++ b/drivers/gpu/drm/radeon/radeon_legacy_encoders.c +@@ -973,11 +973,7 @@ static void radeon_legacy_tmds_ext_mode_set(struct drm_encoder *encoder, + static void radeon_ext_tmds_enc_destroy(struct drm_encoder *encoder) + { + struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder); +- struct radeon_encoder_ext_tmds *tmds = radeon_encoder->enc_priv; +- if (tmds) { +- if (tmds->i2c_bus) +- radeon_i2c_destroy(tmds->i2c_bus); +- } ++ /* don't destroy the i2c bus record here, this will be done in radeon_i2c_fini */ + kfree(radeon_encoder->enc_priv); + drm_encoder_cleanup(encoder); + kfree(radeon_encoder); +diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c +index 4065374..f4c3d28 100644 +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -146,14 +146,14 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, + + if (ret != 0) { + err = ret; +- goto errorout; ++ goto error0; + } + + ret = hv_ringbuffer_init( + &newchannel->inbound, in, recv_ringbuffer_size); + if (ret != 0) { + err = ret; +- goto errorout; ++ goto error0; + } + + +@@ -168,7 +168,7 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, + + if (ret != 0) { + err = ret; +- goto errorout; ++ goto error0; + } + + /* Create and init the channel open message */ +@@ -177,7 +177,7 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, + GFP_KERNEL); + if (!open_info) { + err = -ENOMEM; +- goto errorout; ++ goto error0; + } + + init_completion(&open_info->waitevent); +@@ -193,7 +193,7 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, + + if (userdatalen > MAX_USER_DEFINED_BYTES) { + err = -EINVAL; +- goto errorout; ++ goto error0; + } + + if (userdatalen) +@@ -208,19 +208,18 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, + sizeof(struct vmbus_channel_open_channel)); + + if (ret != 0) +- goto cleanup; ++ goto error1; + + t = wait_for_completion_timeout(&open_info->waitevent, 5*HZ); + if (t == 0) { + err = -ETIMEDOUT; +- goto errorout; ++ goto error1; + } + + + if (open_info->response.open_result.status) + err = open_info->response.open_result.status; + +-cleanup: + 
spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + list_del(&open_info->msglistentry); + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); +@@ -228,9 +227,12 @@ cleanup: + kfree(open_info); + return err; + +-errorout: +- hv_ringbuffer_cleanup(&newchannel->outbound); +- hv_ringbuffer_cleanup(&newchannel->inbound); ++error1: ++ spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); ++ list_del(&open_info->msglistentry); ++ spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); ++ ++error0: + free_pages((unsigned long)out, + get_order(send_ringbuffer_size + recv_ringbuffer_size)); + kfree(open_info); +diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c +index 0634ee5..8f67c4d 100644 +--- a/drivers/md/raid10.c ++++ b/drivers/md/raid10.c +@@ -2641,7 +2641,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, + else { + bad_sectors -= (sector - first_bad); + if (max_sync > bad_sectors) +- max_sync = max_sync; ++ max_sync = bad_sectors; + continue; + } + } +diff --git a/drivers/media/video/au0828/au0828-video.c b/drivers/media/video/au0828/au0828-video.c +index 0b3e481..eab0641 100644 +--- a/drivers/media/video/au0828/au0828-video.c ++++ b/drivers/media/video/au0828/au0828-video.c +@@ -1692,14 +1692,18 @@ static int vidioc_streamoff(struct file *file, void *priv, + (AUVI_INPUT(i).audio_setup)(dev, 0); + } + +- videobuf_streamoff(&fh->vb_vidq); +- res_free(fh, AU0828_RESOURCE_VIDEO); ++ if (res_check(fh, AU0828_RESOURCE_VIDEO)) { ++ videobuf_streamoff(&fh->vb_vidq); ++ res_free(fh, AU0828_RESOURCE_VIDEO); ++ } + } else if (fh->type == V4L2_BUF_TYPE_VBI_CAPTURE) { + dev->vbi_timeout_running = 0; + del_timer_sync(&dev->vbi_timeout); + +- videobuf_streamoff(&fh->vb_vbiq); +- res_free(fh, AU0828_RESOURCE_VBI); ++ if (res_check(fh, AU0828_RESOURCE_VBI)) { ++ videobuf_streamoff(&fh->vb_vbiq); ++ res_free(fh, AU0828_RESOURCE_VBI); ++ } + } + + return 0; +diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c +index 3ed9c5e..daed698 100644 +--- a/drivers/mtd/nand/nand_base.c ++++ b/drivers/mtd/nand/nand_base.c +@@ -2903,9 +2903,7 @@ static int nand_flash_detect_onfi(struct mtd_info *mtd, struct nand_chip *chip, + if (le16_to_cpu(p->features) & 1) + *busw = NAND_BUSWIDTH_16; + +- chip->options &= ~NAND_CHIPOPTIONS_MSK; +- chip->options |= (NAND_NO_READRDY | +- NAND_NO_AUTOINCR) & NAND_CHIPOPTIONS_MSK; ++ chip->options |= NAND_NO_READRDY | NAND_NO_AUTOINCR; + + return 1; + } +@@ -3069,9 +3067,8 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd, + mtd->erasesize <<= ((id_data[3] & 0x03) << 1); + } + } +- /* Get chip options, preserve non chip based options */ +- chip->options &= ~NAND_CHIPOPTIONS_MSK; +- chip->options |= type->options & NAND_CHIPOPTIONS_MSK; ++ /* Get chip options */ ++ chip->options |= type->options; + + /* + * Check if chip is not a Samsung device. 
Do not clear the +diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c +index c5f6b0e..6546191 100644 +--- a/drivers/net/ethernet/intel/e1000/e1000_main.c ++++ b/drivers/net/ethernet/intel/e1000/e1000_main.c +@@ -168,6 +168,8 @@ static int e1000_82547_fifo_workaround(struct e1000_adapter *adapter, + + static bool e1000_vlan_used(struct e1000_adapter *adapter); + static void e1000_vlan_mode(struct net_device *netdev, u32 features); ++static void e1000_vlan_filter_on_off(struct e1000_adapter *adapter, ++ bool filter_on); + static void e1000_vlan_rx_add_vid(struct net_device *netdev, u16 vid); + static void e1000_vlan_rx_kill_vid(struct net_device *netdev, u16 vid); + static void e1000_restore_vlan(struct e1000_adapter *adapter); +@@ -1219,7 +1221,7 @@ static int __devinit e1000_probe(struct pci_dev *pdev, + if (err) + goto err_register; + +- e1000_vlan_mode(netdev, netdev->features); ++ e1000_vlan_filter_on_off(adapter, false); + + /* print bus type/speed/width info */ + e_info(probe, "(PCI%s:%dMHz:%d-bit) %pM\n", +@@ -4553,6 +4555,21 @@ static bool e1000_vlan_used(struct e1000_adapter *adapter) + return false; + } + ++static void __e1000_vlan_mode(struct e1000_adapter *adapter, u32 features) ++{ ++ struct e1000_hw *hw = &adapter->hw; ++ u32 ctrl; ++ ++ ctrl = er32(CTRL); ++ if (features & NETIF_F_HW_VLAN_RX) { ++ /* enable VLAN tag insert/strip */ ++ ctrl |= E1000_CTRL_VME; ++ } else { ++ /* disable VLAN tag insert/strip */ ++ ctrl &= ~E1000_CTRL_VME; ++ } ++ ew32(CTRL, ctrl); ++} + static void e1000_vlan_filter_on_off(struct e1000_adapter *adapter, + bool filter_on) + { +@@ -4562,6 +4579,7 @@ static void e1000_vlan_filter_on_off(struct e1000_adapter *adapter, + if (!test_bit(__E1000_DOWN, &adapter->flags)) + e1000_irq_disable(adapter); + ++ __e1000_vlan_mode(adapter, adapter->netdev->features); + if (filter_on) { + /* enable VLAN receive filtering */ + rctl = er32(RCTL); +@@ -4584,21 +4602,11 @@ static void e1000_vlan_filter_on_off(struct e1000_adapter *adapter, + static void e1000_vlan_mode(struct net_device *netdev, u32 features) + { + struct e1000_adapter *adapter = netdev_priv(netdev); +- struct e1000_hw *hw = &adapter->hw; +- u32 ctrl; + + if (!test_bit(__E1000_DOWN, &adapter->flags)) + e1000_irq_disable(adapter); + +- ctrl = er32(CTRL); +- if (features & NETIF_F_HW_VLAN_RX) { +- /* enable VLAN tag insert/strip */ +- ctrl |= E1000_CTRL_VME; +- } else { +- /* disable VLAN tag insert/strip */ +- ctrl &= ~E1000_CTRL_VME; +- } +- ew32(CTRL, ctrl); ++ __e1000_vlan_mode(adapter, features); + + if (!test_bit(__E1000_DOWN, &adapter->flags)) + e1000_irq_enable(adapter); +diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c +index dea0cb4..57be855 100644 +--- a/drivers/net/ethernet/marvell/skge.c ++++ b/drivers/net/ethernet/marvell/skge.c +@@ -4143,6 +4143,13 @@ static struct dmi_system_id skge_32bit_dma_boards[] = { + DMI_MATCH(DMI_BOARD_NAME, "nForce"), + }, + }, ++ { ++ .ident = "ASUS P5NSLI", ++ .matches = { ++ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), ++ DMI_MATCH(DMI_BOARD_NAME, "P5NSLI") ++ }, ++ }, + {} + }; + +diff --git a/drivers/net/wireless/ath/ath9k/ar9003_2p2_initvals.h b/drivers/net/wireless/ath/ath9k/ar9003_2p2_initvals.h +index 026f9de..cc54153 100644 +--- a/drivers/net/wireless/ath/ath9k/ar9003_2p2_initvals.h ++++ b/drivers/net/wireless/ath/ath9k/ar9003_2p2_initvals.h +@@ -835,107 +835,107 @@ static const u32 ar9300_2p2_baseband_core[][2] = { + + static const u32 
ar9300Modes_high_power_tx_gain_table_2p2[][5] = { + /* Addr 5G_HT20 5G_HT40 2G_HT40 2G_HT20 */ +- {0x0000a2dc, 0x000cfff0, 0x000cfff0, 0x03aaa352, 0x03aaa352}, +- {0x0000a2e0, 0x000f0000, 0x000f0000, 0x03ccc584, 0x03ccc584}, +- {0x0000a2e4, 0x03f00000, 0x03f00000, 0x03f0f800, 0x03f0f800}, ++ {0x0000a2dc, 0x00033800, 0x00033800, 0x03aaa352, 0x03aaa352}, ++ {0x0000a2e0, 0x0003c000, 0x0003c000, 0x03ccc584, 0x03ccc584}, ++ {0x0000a2e4, 0x03fc0000, 0x03fc0000, 0x03f0f800, 0x03f0f800}, + {0x0000a2e8, 0x00000000, 0x00000000, 0x03ff0000, 0x03ff0000}, + {0x0000a410, 0x000050d9, 0x000050d9, 0x000050d9, 0x000050d9}, + {0x0000a500, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, + {0x0000a504, 0x06000003, 0x06000003, 0x04000002, 0x04000002}, + {0x0000a508, 0x0a000020, 0x0a000020, 0x08000004, 0x08000004}, + {0x0000a50c, 0x10000023, 0x10000023, 0x0b000200, 0x0b000200}, +- {0x0000a510, 0x15000028, 0x15000028, 0x0f000202, 0x0f000202}, +- {0x0000a514, 0x1b00002b, 0x1b00002b, 0x12000400, 0x12000400}, +- {0x0000a518, 0x1f020028, 0x1f020028, 0x16000402, 0x16000402}, +- {0x0000a51c, 0x2502002b, 0x2502002b, 0x19000404, 0x19000404}, +- {0x0000a520, 0x2a04002a, 0x2a04002a, 0x1c000603, 0x1c000603}, +- {0x0000a524, 0x2e06002a, 0x2e06002a, 0x21000a02, 0x21000a02}, +- {0x0000a528, 0x3302202d, 0x3302202d, 0x25000a04, 0x25000a04}, +- {0x0000a52c, 0x3804202c, 0x3804202c, 0x28000a20, 0x28000a20}, +- {0x0000a530, 0x3c06202c, 0x3c06202c, 0x2c000e20, 0x2c000e20}, +- {0x0000a534, 0x4108202d, 0x4108202d, 0x30000e22, 0x30000e22}, +- {0x0000a538, 0x4506402d, 0x4506402d, 0x34000e24, 0x34000e24}, +- {0x0000a53c, 0x4906222d, 0x4906222d, 0x38001640, 0x38001640}, +- {0x0000a540, 0x4d062231, 0x4d062231, 0x3c001660, 0x3c001660}, +- {0x0000a544, 0x50082231, 0x50082231, 0x3f001861, 0x3f001861}, +- {0x0000a548, 0x5608422e, 0x5608422e, 0x43001a81, 0x43001a81}, +- {0x0000a54c, 0x5a08442e, 0x5a08442e, 0x47001a83, 0x47001a83}, +- {0x0000a550, 0x5e0a4431, 0x5e0a4431, 0x4a001c84, 0x4a001c84}, +- {0x0000a554, 0x640a4432, 0x640a4432, 0x4e001ce3, 0x4e001ce3}, +- {0x0000a558, 0x680a4434, 0x680a4434, 0x52001ce5, 0x52001ce5}, +- {0x0000a55c, 0x6c0a6434, 0x6c0a6434, 0x56001ce9, 0x56001ce9}, +- {0x0000a560, 0x6f0a6633, 0x6f0a6633, 0x5a001ceb, 0x5a001ceb}, +- {0x0000a564, 0x730c6634, 0x730c6634, 0x5d001eec, 0x5d001eec}, +- {0x0000a568, 0x730c6634, 0x730c6634, 0x5d001eec, 0x5d001eec}, +- {0x0000a56c, 0x730c6634, 0x730c6634, 0x5d001eec, 0x5d001eec}, +- {0x0000a570, 0x730c6634, 0x730c6634, 0x5d001eec, 0x5d001eec}, +- {0x0000a574, 0x730c6634, 0x730c6634, 0x5d001eec, 0x5d001eec}, +- {0x0000a578, 0x730c6634, 0x730c6634, 0x5d001eec, 0x5d001eec}, +- {0x0000a57c, 0x730c6634, 0x730c6634, 0x5d001eec, 0x5d001eec}, ++ {0x0000a510, 0x16000220, 0x16000220, 0x0f000202, 0x0f000202}, ++ {0x0000a514, 0x1c000223, 0x1c000223, 0x12000400, 0x12000400}, ++ {0x0000a518, 0x21002220, 0x21002220, 0x16000402, 0x16000402}, ++ {0x0000a51c, 0x27002223, 0x27002223, 0x19000404, 0x19000404}, ++ {0x0000a520, 0x2b022220, 0x2b022220, 0x1c000603, 0x1c000603}, ++ {0x0000a524, 0x2f022222, 0x2f022222, 0x21000a02, 0x21000a02}, ++ {0x0000a528, 0x34022225, 0x34022225, 0x25000a04, 0x25000a04}, ++ {0x0000a52c, 0x3a02222a, 0x3a02222a, 0x28000a20, 0x28000a20}, ++ {0x0000a530, 0x3e02222c, 0x3e02222c, 0x2c000e20, 0x2c000e20}, ++ {0x0000a534, 0x4202242a, 0x4202242a, 0x30000e22, 0x30000e22}, ++ {0x0000a538, 0x4702244a, 0x4702244a, 0x34000e24, 0x34000e24}, ++ {0x0000a53c, 0x4b02244c, 0x4b02244c, 0x38001640, 0x38001640}, ++ {0x0000a540, 0x4e02246c, 0x4e02246c, 0x3c001660, 0x3c001660}, ++ {0x0000a544, 
0x52022470, 0x52022470, 0x3f001861, 0x3f001861}, ++ {0x0000a548, 0x55022490, 0x55022490, 0x43001a81, 0x43001a81}, ++ {0x0000a54c, 0x59022492, 0x59022492, 0x47001a83, 0x47001a83}, ++ {0x0000a550, 0x5d022692, 0x5d022692, 0x4a001c84, 0x4a001c84}, ++ {0x0000a554, 0x61022892, 0x61022892, 0x4e001ce3, 0x4e001ce3}, ++ {0x0000a558, 0x65024890, 0x65024890, 0x52001ce5, 0x52001ce5}, ++ {0x0000a55c, 0x69024892, 0x69024892, 0x56001ce9, 0x56001ce9}, ++ {0x0000a560, 0x6e024c92, 0x6e024c92, 0x5a001ceb, 0x5a001ceb}, ++ {0x0000a564, 0x74026e92, 0x74026e92, 0x5d001eec, 0x5d001eec}, ++ {0x0000a568, 0x74026e92, 0x74026e92, 0x5d001eec, 0x5d001eec}, ++ {0x0000a56c, 0x74026e92, 0x74026e92, 0x5d001eec, 0x5d001eec}, ++ {0x0000a570, 0x74026e92, 0x74026e92, 0x5d001eec, 0x5d001eec}, ++ {0x0000a574, 0x74026e92, 0x74026e92, 0x5d001eec, 0x5d001eec}, ++ {0x0000a578, 0x74026e92, 0x74026e92, 0x5d001eec, 0x5d001eec}, ++ {0x0000a57c, 0x74026e92, 0x74026e92, 0x5d001eec, 0x5d001eec}, + {0x0000a580, 0x00800000, 0x00800000, 0x00800000, 0x00800000}, + {0x0000a584, 0x06800003, 0x06800003, 0x04800002, 0x04800002}, + {0x0000a588, 0x0a800020, 0x0a800020, 0x08800004, 0x08800004}, + {0x0000a58c, 0x10800023, 0x10800023, 0x0b800200, 0x0b800200}, +- {0x0000a590, 0x15800028, 0x15800028, 0x0f800202, 0x0f800202}, +- {0x0000a594, 0x1b80002b, 0x1b80002b, 0x12800400, 0x12800400}, +- {0x0000a598, 0x1f820028, 0x1f820028, 0x16800402, 0x16800402}, +- {0x0000a59c, 0x2582002b, 0x2582002b, 0x19800404, 0x19800404}, +- {0x0000a5a0, 0x2a84002a, 0x2a84002a, 0x1c800603, 0x1c800603}, +- {0x0000a5a4, 0x2e86002a, 0x2e86002a, 0x21800a02, 0x21800a02}, +- {0x0000a5a8, 0x3382202d, 0x3382202d, 0x25800a04, 0x25800a04}, +- {0x0000a5ac, 0x3884202c, 0x3884202c, 0x28800a20, 0x28800a20}, +- {0x0000a5b0, 0x3c86202c, 0x3c86202c, 0x2c800e20, 0x2c800e20}, +- {0x0000a5b4, 0x4188202d, 0x4188202d, 0x30800e22, 0x30800e22}, +- {0x0000a5b8, 0x4586402d, 0x4586402d, 0x34800e24, 0x34800e24}, +- {0x0000a5bc, 0x4986222d, 0x4986222d, 0x38801640, 0x38801640}, +- {0x0000a5c0, 0x4d862231, 0x4d862231, 0x3c801660, 0x3c801660}, +- {0x0000a5c4, 0x50882231, 0x50882231, 0x3f801861, 0x3f801861}, +- {0x0000a5c8, 0x5688422e, 0x5688422e, 0x43801a81, 0x43801a81}, +- {0x0000a5cc, 0x5a88442e, 0x5a88442e, 0x47801a83, 0x47801a83}, +- {0x0000a5d0, 0x5e8a4431, 0x5e8a4431, 0x4a801c84, 0x4a801c84}, +- {0x0000a5d4, 0x648a4432, 0x648a4432, 0x4e801ce3, 0x4e801ce3}, +- {0x0000a5d8, 0x688a4434, 0x688a4434, 0x52801ce5, 0x52801ce5}, +- {0x0000a5dc, 0x6c8a6434, 0x6c8a6434, 0x56801ce9, 0x56801ce9}, +- {0x0000a5e0, 0x6f8a6633, 0x6f8a6633, 0x5a801ceb, 0x5a801ceb}, +- {0x0000a5e4, 0x738c6634, 0x738c6634, 0x5d801eec, 0x5d801eec}, +- {0x0000a5e8, 0x738c6634, 0x738c6634, 0x5d801eec, 0x5d801eec}, +- {0x0000a5ec, 0x738c6634, 0x738c6634, 0x5d801eec, 0x5d801eec}, +- {0x0000a5f0, 0x738c6634, 0x738c6634, 0x5d801eec, 0x5d801eec}, +- {0x0000a5f4, 0x738c6634, 0x738c6634, 0x5d801eec, 0x5d801eec}, +- {0x0000a5f8, 0x738c6634, 0x738c6634, 0x5d801eec, 0x5d801eec}, +- {0x0000a5fc, 0x738c6634, 0x738c6634, 0x5d801eec, 0x5d801eec}, ++ {0x0000a590, 0x16800220, 0x16800220, 0x0f800202, 0x0f800202}, ++ {0x0000a594, 0x1c800223, 0x1c800223, 0x12800400, 0x12800400}, ++ {0x0000a598, 0x21802220, 0x21802220, 0x16800402, 0x16800402}, ++ {0x0000a59c, 0x27802223, 0x27802223, 0x19800404, 0x19800404}, ++ {0x0000a5a0, 0x2b822220, 0x2b822220, 0x1c800603, 0x1c800603}, ++ {0x0000a5a4, 0x2f822222, 0x2f822222, 0x21800a02, 0x21800a02}, ++ {0x0000a5a8, 0x34822225, 0x34822225, 0x25800a04, 0x25800a04}, ++ {0x0000a5ac, 0x3a82222a, 0x3a82222a, 0x28800a20, 0x28800a20}, 
++ {0x0000a5b0, 0x3e82222c, 0x3e82222c, 0x2c800e20, 0x2c800e20}, ++ {0x0000a5b4, 0x4282242a, 0x4282242a, 0x30800e22, 0x30800e22}, ++ {0x0000a5b8, 0x4782244a, 0x4782244a, 0x34800e24, 0x34800e24}, ++ {0x0000a5bc, 0x4b82244c, 0x4b82244c, 0x38801640, 0x38801640}, ++ {0x0000a5c0, 0x4e82246c, 0x4e82246c, 0x3c801660, 0x3c801660}, ++ {0x0000a5c4, 0x52822470, 0x52822470, 0x3f801861, 0x3f801861}, ++ {0x0000a5c8, 0x55822490, 0x55822490, 0x43801a81, 0x43801a81}, ++ {0x0000a5cc, 0x59822492, 0x59822492, 0x47801a83, 0x47801a83}, ++ {0x0000a5d0, 0x5d822692, 0x5d822692, 0x4a801c84, 0x4a801c84}, ++ {0x0000a5d4, 0x61822892, 0x61822892, 0x4e801ce3, 0x4e801ce3}, ++ {0x0000a5d8, 0x65824890, 0x65824890, 0x52801ce5, 0x52801ce5}, ++ {0x0000a5dc, 0x69824892, 0x69824892, 0x56801ce9, 0x56801ce9}, ++ {0x0000a5e0, 0x6e824c92, 0x6e824c92, 0x5a801ceb, 0x5a801ceb}, ++ {0x0000a5e4, 0x74826e92, 0x74826e92, 0x5d801eec, 0x5d801eec}, ++ {0x0000a5e8, 0x74826e92, 0x74826e92, 0x5d801eec, 0x5d801eec}, ++ {0x0000a5ec, 0x74826e92, 0x74826e92, 0x5d801eec, 0x5d801eec}, ++ {0x0000a5f0, 0x74826e92, 0x74826e92, 0x5d801eec, 0x5d801eec}, ++ {0x0000a5f4, 0x74826e92, 0x74826e92, 0x5d801eec, 0x5d801eec}, ++ {0x0000a5f8, 0x74826e92, 0x74826e92, 0x5d801eec, 0x5d801eec}, ++ {0x0000a5fc, 0x74826e92, 0x74826e92, 0x5d801eec, 0x5d801eec}, + {0x0000a600, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, + {0x0000a604, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, +- {0x0000a608, 0x01804601, 0x01804601, 0x00000000, 0x00000000}, +- {0x0000a60c, 0x01804601, 0x01804601, 0x00000000, 0x00000000}, +- {0x0000a610, 0x01804601, 0x01804601, 0x00000000, 0x00000000}, +- {0x0000a614, 0x01804601, 0x01804601, 0x01404000, 0x01404000}, +- {0x0000a618, 0x01804601, 0x01804601, 0x01404501, 0x01404501}, +- {0x0000a61c, 0x01804601, 0x01804601, 0x02008501, 0x02008501}, +- {0x0000a620, 0x03408d02, 0x03408d02, 0x0280ca03, 0x0280ca03}, +- {0x0000a624, 0x0300cc03, 0x0300cc03, 0x03010c04, 0x03010c04}, +- {0x0000a628, 0x03410d04, 0x03410d04, 0x04014c04, 0x04014c04}, +- {0x0000a62c, 0x03410d04, 0x03410d04, 0x04015005, 0x04015005}, +- {0x0000a630, 0x03410d04, 0x03410d04, 0x04015005, 0x04015005}, +- {0x0000a634, 0x03410d04, 0x03410d04, 0x04015005, 0x04015005}, +- {0x0000a638, 0x03410d04, 0x03410d04, 0x04015005, 0x04015005}, +- {0x0000a63c, 0x03410d04, 0x03410d04, 0x04015005, 0x04015005}, +- {0x0000b2dc, 0x000cfff0, 0x000cfff0, 0x03aaa352, 0x03aaa352}, +- {0x0000b2e0, 0x000f0000, 0x000f0000, 0x03ccc584, 0x03ccc584}, +- {0x0000b2e4, 0x03f00000, 0x03f00000, 0x03f0f800, 0x03f0f800}, ++ {0x0000a608, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, ++ {0x0000a60c, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, ++ {0x0000a610, 0x00000000, 0x00000000, 0x00000000, 0x00000000}, ++ {0x0000a614, 0x02004000, 0x02004000, 0x01404000, 0x01404000}, ++ {0x0000a618, 0x02004801, 0x02004801, 0x01404501, 0x01404501}, ++ {0x0000a61c, 0x02808a02, 0x02808a02, 0x02008501, 0x02008501}, ++ {0x0000a620, 0x0380ce03, 0x0380ce03, 0x0280ca03, 0x0280ca03}, ++ {0x0000a624, 0x04411104, 0x04411104, 0x03010c04, 0x03010c04}, ++ {0x0000a628, 0x04411104, 0x04411104, 0x04014c04, 0x04014c04}, ++ {0x0000a62c, 0x04411104, 0x04411104, 0x04015005, 0x04015005}, ++ {0x0000a630, 0x04411104, 0x04411104, 0x04015005, 0x04015005}, ++ {0x0000a634, 0x04411104, 0x04411104, 0x04015005, 0x04015005}, ++ {0x0000a638, 0x04411104, 0x04411104, 0x04015005, 0x04015005}, ++ {0x0000a63c, 0x04411104, 0x04411104, 0x04015005, 0x04015005}, ++ {0x0000b2dc, 0x00033800, 0x00033800, 0x03aaa352, 0x03aaa352}, ++ {0x0000b2e0, 0x0003c000, 0x0003c000, 
0x03ccc584, 0x03ccc584}, ++ {0x0000b2e4, 0x03fc0000, 0x03fc0000, 0x03f0f800, 0x03f0f800}, + {0x0000b2e8, 0x00000000, 0x00000000, 0x03ff0000, 0x03ff0000}, +- {0x0000c2dc, 0x000cfff0, 0x000cfff0, 0x03aaa352, 0x03aaa352}, +- {0x0000c2e0, 0x000f0000, 0x000f0000, 0x03ccc584, 0x03ccc584}, +- {0x0000c2e4, 0x03f00000, 0x03f00000, 0x03f0f800, 0x03f0f800}, ++ {0x0000c2dc, 0x00033800, 0x00033800, 0x03aaa352, 0x03aaa352}, ++ {0x0000c2e0, 0x0003c000, 0x0003c000, 0x03ccc584, 0x03ccc584}, ++ {0x0000c2e4, 0x03fc0000, 0x03fc0000, 0x03f0f800, 0x03f0f800}, + {0x0000c2e8, 0x00000000, 0x00000000, 0x03ff0000, 0x03ff0000}, + {0x00016044, 0x012492d4, 0x012492d4, 0x012492d4, 0x012492d4}, +- {0x00016048, 0x61200001, 0x61200001, 0x66480001, 0x66480001}, ++ {0x00016048, 0x66480001, 0x66480001, 0x66480001, 0x66480001}, + {0x00016068, 0x6db6db6c, 0x6db6db6c, 0x6db6db6c, 0x6db6db6c}, + {0x00016444, 0x012492d4, 0x012492d4, 0x012492d4, 0x012492d4}, +- {0x00016448, 0x61200001, 0x61200001, 0x66480001, 0x66480001}, ++ {0x00016448, 0x66480001, 0x66480001, 0x66480001, 0x66480001}, + {0x00016468, 0x6db6db6c, 0x6db6db6c, 0x6db6db6c, 0x6db6db6c}, + {0x00016844, 0x012492d4, 0x012492d4, 0x012492d4, 0x012492d4}, +- {0x00016848, 0x61200001, 0x61200001, 0x66480001, 0x66480001}, ++ {0x00016848, 0x66480001, 0x66480001, 0x66480001, 0x66480001}, + {0x00016868, 0x6db6db6c, 0x6db6db6c, 0x6db6db6c, 0x6db6db6c}, + }; + +diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c +index 56bd370..da567f0 100644 +--- a/drivers/net/wireless/ipw2x00/ipw2200.c ++++ b/drivers/net/wireless/ipw2x00/ipw2200.c +@@ -10463,7 +10463,7 @@ static void ipw_handle_promiscuous_tx(struct ipw_priv *priv, + } else + len = src->len; + +- dst = alloc_skb(len + sizeof(*rt_hdr), GFP_ATOMIC); ++ dst = alloc_skb(len + sizeof(*rt_hdr) + sizeof(u16)*2, GFP_ATOMIC); + if (!dst) + continue; + +diff --git a/drivers/net/wireless/iwlwifi/iwl-6000.c b/drivers/net/wireless/iwlwifi/iwl-6000.c +index 9b6b010..4ac4ef0 100644 +--- a/drivers/net/wireless/iwlwifi/iwl-6000.c ++++ b/drivers/net/wireless/iwlwifi/iwl-6000.c +@@ -193,7 +193,7 @@ static int iwl6000_hw_channel_switch(struct iwl_priv *priv, + * See iwlagn_mac_channel_switch. 
+ */ + struct iwl_rxon_context *ctx = &priv->contexts[IWL_RXON_CTX_BSS]; +- struct iwl6000_channel_switch_cmd cmd; ++ struct iwl6000_channel_switch_cmd *cmd; + const struct iwl_channel_info *ch_info; + u32 switch_time_in_usec, ucode_switch_time; + u16 ch; +@@ -203,18 +203,25 @@ static int iwl6000_hw_channel_switch(struct iwl_priv *priv, + struct ieee80211_vif *vif = ctx->vif; + struct iwl_host_cmd hcmd = { + .id = REPLY_CHANNEL_SWITCH, +- .len = { sizeof(cmd), }, ++ .len = { sizeof(*cmd), }, + .flags = CMD_SYNC, +- .data = { &cmd, }, ++ .dataflags[0] = IWL_HCMD_DFL_NOCOPY, + }; ++ int err; + +- cmd.band = priv->band == IEEE80211_BAND_2GHZ; ++ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); ++ if (!cmd) ++ return -ENOMEM; ++ ++ hcmd.data[0] = cmd; ++ ++ cmd->band = priv->band == IEEE80211_BAND_2GHZ; + ch = ch_switch->channel->hw_value; + IWL_DEBUG_11H(priv, "channel switch from %u to %u\n", + ctx->active.channel, ch); +- cmd.channel = cpu_to_le16(ch); +- cmd.rxon_flags = ctx->staging.flags; +- cmd.rxon_filter_flags = ctx->staging.filter_flags; ++ cmd->channel = cpu_to_le16(ch); ++ cmd->rxon_flags = ctx->staging.flags; ++ cmd->rxon_filter_flags = ctx->staging.filter_flags; + switch_count = ch_switch->count; + tsf_low = ch_switch->timestamp & 0x0ffffffff; + /* +@@ -230,30 +237,32 @@ static int iwl6000_hw_channel_switch(struct iwl_priv *priv, + switch_count = 0; + } + if (switch_count <= 1) +- cmd.switch_time = cpu_to_le32(priv->ucode_beacon_time); ++ cmd->switch_time = cpu_to_le32(priv->ucode_beacon_time); + else { + switch_time_in_usec = + vif->bss_conf.beacon_int * switch_count * TIME_UNIT; + ucode_switch_time = iwl_usecs_to_beacons(priv, + switch_time_in_usec, + beacon_interval); +- cmd.switch_time = iwl_add_beacon_time(priv, +- priv->ucode_beacon_time, +- ucode_switch_time, +- beacon_interval); ++ cmd->switch_time = iwl_add_beacon_time(priv, ++ priv->ucode_beacon_time, ++ ucode_switch_time, ++ beacon_interval); + } + IWL_DEBUG_11H(priv, "uCode time for the switch is 0x%x\n", +- cmd.switch_time); ++ cmd->switch_time); + ch_info = iwl_get_channel_info(priv, priv->band, ch); + if (ch_info) +- cmd.expect_beacon = is_channel_radar(ch_info); ++ cmd->expect_beacon = is_channel_radar(ch_info); + else { + IWL_ERR(priv, "invalid channel switch from %u to %u\n", + ctx->active.channel, ch); + return -EFAULT; + } + +- return iwl_trans_send_cmd(trans(priv), &hcmd); ++ err = iwl_trans_send_cmd(trans(priv), &hcmd); ++ kfree(cmd); ++ return err; + } + + static struct iwl_lib_ops iwl6000_lib = { +diff --git a/drivers/pcmcia/pxa2xx_sharpsl.c b/drivers/pcmcia/pxa2xx_sharpsl.c +index 69ae2fd..b938163 100644 +--- a/drivers/pcmcia/pxa2xx_sharpsl.c ++++ b/drivers/pcmcia/pxa2xx_sharpsl.c +@@ -219,7 +219,7 @@ static void sharpsl_pcmcia_socket_suspend(struct soc_pcmcia_socket *skt) + sharpsl_pcmcia_init_reset(skt); + } + +-static struct pcmcia_low_level sharpsl_pcmcia_ops __initdata = { ++static struct pcmcia_low_level sharpsl_pcmcia_ops = { + .owner = THIS_MODULE, + .hw_init = sharpsl_pcmcia_hw_init, + .hw_shutdown = sharpsl_pcmcia_hw_shutdown, +diff --git a/drivers/platform/x86/samsung-laptop.c b/drivers/platform/x86/samsung-laptop.c +index af1e296..21bc1a7 100644 +--- a/drivers/platform/x86/samsung-laptop.c ++++ b/drivers/platform/x86/samsung-laptop.c +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + /* + * This driver is needed because a number of Samsung laptops do not hook +@@ -226,6 +227,7 @@ static struct backlight_device *backlight_device; + static struct mutex sabi_mutex; + static struct 
platform_device *sdev; + static struct rfkill *rfk; ++static bool handle_backlight; + static bool has_stepping_quirk; + + static int force; +@@ -602,6 +604,13 @@ static int __init samsung_init(void) + int retval; + + mutex_init(&sabi_mutex); ++ handle_backlight = true; ++ ++#ifdef CONFIG_ACPI ++ /* Don't handle backlight here if the acpi video already handle it */ ++ if (acpi_video_backlight_support()) ++ handle_backlight = false; ++#endif + + if (!force && !dmi_check_system(samsung_dmi_table)) + return -ENODEV; +@@ -661,7 +670,8 @@ static int __init samsung_init(void) + printk(KERN_DEBUG "ifaceP = 0x%08x\n", ifaceP); + printk(KERN_DEBUG "sabi_iface = %p\n", sabi_iface); + +- test_backlight(); ++ if (handle_backlight) ++ test_backlight(); + test_wireless(); + + retval = sabi_get_command(sabi_config->commands.get_brightness, +@@ -680,13 +690,23 @@ static int __init samsung_init(void) + } + + /* Check for stepping quirk */ +- check_for_stepping_quirk(); ++ if (handle_backlight) ++ check_for_stepping_quirk(); ++ ++#ifdef CONFIG_ACPI ++ /* Only log that if we are really on a sabi platform */ ++ if (acpi_video_backlight_support()) ++ pr_info("Backlight controlled by ACPI video driver\n"); ++#endif + + /* knock up a platform device to hang stuff off of */ + sdev = platform_device_register_simple("samsung", -1, NULL, 0); + if (IS_ERR(sdev)) + goto error_no_platform; + ++ if (!handle_backlight) ++ goto skip_backlight; ++ + /* create a backlight device to talk to this one */ + memset(&props, 0, sizeof(struct backlight_properties)); + props.type = BACKLIGHT_PLATFORM; +@@ -702,6 +722,7 @@ static int __init samsung_init(void) + backlight_device->props.power = FB_BLANK_UNBLANK; + backlight_update_status(backlight_device); + ++skip_backlight: + retval = init_wireless(sdev); + if (retval) + goto error_no_rfk; +diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c +index d93a960..bc744b4 100644 +--- a/drivers/rtc/rtc-imxdi.c ++++ b/drivers/rtc/rtc-imxdi.c +@@ -392,6 +392,8 @@ static int dryice_rtc_probe(struct platform_device *pdev) + if (imxdi->ioaddr == NULL) + return -ENOMEM; + ++ spin_lock_init(&imxdi->irq_lock); ++ + imxdi->irq = platform_get_irq(pdev, 0); + if (imxdi->irq < 0) + return imxdi->irq; +diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c +index 6888b2c..b3a729c 100644 +--- a/drivers/scsi/scsi_debug.c ++++ b/drivers/scsi/scsi_debug.c +@@ -2045,8 +2045,7 @@ static void unmap_region(sector_t lba, unsigned int len) + block = lba + alignment; + rem = do_div(block, granularity); + +- if (rem == 0 && lba + granularity <= end && +- block < map_size) ++ if (rem == 0 && lba + granularity < end && block < map_size) + clear_bit(block, map_storep); + + lba += granularity - rem; +diff --git a/drivers/staging/comedi/drivers/amplc_pc236.c b/drivers/staging/comedi/drivers/amplc_pc236.c +index 48246cd..b4311bf 100644 +--- a/drivers/staging/comedi/drivers/amplc_pc236.c ++++ b/drivers/staging/comedi/drivers/amplc_pc236.c +@@ -470,7 +470,7 @@ static int pc236_detach(struct comedi_device *dev) + { + printk(KERN_DEBUG "comedi%d: %s: detach\n", dev->minor, + PC236_DRIVER_NAME); +- if (devpriv) ++ if (dev->iobase) + pc236_intr_disable(dev); + + if (dev->irq) +diff --git a/drivers/staging/hv/storvsc_drv.c b/drivers/staging/hv/storvsc_drv.c +index ae8c33e..abc5ac5 100644 +--- a/drivers/staging/hv/storvsc_drv.c ++++ b/drivers/staging/hv/storvsc_drv.c +@@ -1043,7 +1043,12 @@ static int storvsc_host_reset(struct hv_device *device) + /* + * At this point, all outstanding requests in the adapter 
+ * should have been flushed out and return to us ++ * There is a potential race here where the host may be in ++ * the process of responding when we return from here. ++ * Just wait for all in-transit packets to be accounted for ++ * before we return from here. + */ ++ storvsc_wait_to_drain(stor_device); + + cleanup: + return ret; +diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c +index dbf7d20..df7f15d 100644 +--- a/drivers/usb/class/cdc-acm.c ++++ b/drivers/usb/class/cdc-acm.c +@@ -760,10 +760,6 @@ static const __u32 acm_tty_speed[] = { + 2500000, 3000000, 3500000, 4000000 + }; + +-static const __u8 acm_tty_size[] = { +- 5, 6, 7, 8 +-}; +- + static void acm_tty_set_termios(struct tty_struct *tty, + struct ktermios *termios_old) + { +@@ -780,7 +776,21 @@ static void acm_tty_set_termios(struct tty_struct *tty, + newline.bParityType = termios->c_cflag & PARENB ? + (termios->c_cflag & PARODD ? 1 : 2) + + (termios->c_cflag & CMSPAR ? 2 : 0) : 0; +- newline.bDataBits = acm_tty_size[(termios->c_cflag & CSIZE) >> 4]; ++ switch (termios->c_cflag & CSIZE) { ++ case CS5: ++ newline.bDataBits = 5; ++ break; ++ case CS6: ++ newline.bDataBits = 6; ++ break; ++ case CS7: ++ newline.bDataBits = 7; ++ break; ++ case CS8: ++ default: ++ newline.bDataBits = 8; ++ break; ++ } + /* FIXME: Needs to clear unsupported bits in the termios */ + acm->clocal = ((termios->c_cflag & CLOCAL) != 0); + +@@ -1172,7 +1182,7 @@ made_compressed_probe: + + if (usb_endpoint_xfer_int(epwrite)) + usb_fill_int_urb(snd->urb, usb_dev, +- usb_sndbulkpipe(usb_dev, epwrite->bEndpointAddress), ++ usb_sndintpipe(usb_dev, epwrite->bEndpointAddress), + NULL, acm->writesize, acm_write_bulk, snd, epwrite->bInterval); + else + usb_fill_bulk_urb(snd->urb, usb_dev, +@@ -1496,6 +1506,9 @@ static const struct usb_device_id acm_ids[] = { + Maybe we should define a new + quirk for this. 
*/ + }, ++ { USB_DEVICE(0x0572, 0x1340), /* Conexant CX93010-2x UCMxx */ ++ .driver_info = NO_UNION_NORMAL, ++ }, + { USB_DEVICE(0x1bbb, 0x0003), /* Alcatel OT-I650 */ + .driver_info = NO_UNION_NORMAL, /* reports zero length descriptor */ + }, +diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c +index 52340cc..a9a74d2 100644 +--- a/drivers/usb/core/hub.c ++++ b/drivers/usb/core/hub.c +@@ -482,13 +482,16 @@ static void hub_tt_work(struct work_struct *work) + int limit = 100; + + spin_lock_irqsave (&hub->tt.lock, flags); +- while (--limit && !list_empty (&hub->tt.clear_list)) { ++ while (!list_empty(&hub->tt.clear_list)) { + struct list_head *next; + struct usb_tt_clear *clear; + struct usb_device *hdev = hub->hdev; + const struct hc_driver *drv; + int status; + ++ if (!hub->quiescing && --limit < 0) ++ break; ++ + next = hub->tt.clear_list.next; + clear = list_entry (next, struct usb_tt_clear, clear_list); + list_del (&clear->clear_list); +@@ -952,7 +955,7 @@ static void hub_quiesce(struct usb_hub *hub, enum hub_quiescing_type type) + if (hub->has_indicators) + cancel_delayed_work_sync(&hub->leds); + if (hub->tt.hub) +- cancel_work_sync(&hub->tt.clear_work); ++ flush_work_sync(&hub->tt.clear_work); + } + + /* caller has locked the hub device */ +diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c +index d0ec2f0..c2815a5 100644 +--- a/drivers/usb/host/pci-quirks.c ++++ b/drivers/usb/host/pci-quirks.c +@@ -545,7 +545,14 @@ static const struct dmi_system_id __devinitconst ehci_dmi_nohandoff_table[] = { + /* Pegatron Lucid (Ordissimo AIRIS) */ + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "M11JB"), +- DMI_MATCH(DMI_BIOS_VERSION, "Lucid-GE-133"), ++ DMI_MATCH(DMI_BIOS_VERSION, "Lucid-"), ++ }, ++ }, ++ { ++ /* Pegatron Lucid (Ordissimo) */ ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "Ordissimo"), ++ DMI_MATCH(DMI_BIOS_VERSION, "Lucid-"), + }, + }, + { } +diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c +index 950aef8..0c6fb19 100644 +--- a/drivers/usb/host/xhci-ring.c ++++ b/drivers/usb/host/xhci-ring.c +@@ -1212,6 +1212,17 @@ static void xhci_cmd_to_noop(struct xhci_hcd *xhci, struct xhci_cd *cur_cd) + cur_seg = find_trb_seg(xhci->cmd_ring->first_seg, + xhci->cmd_ring->dequeue, &cycle_state); + ++ if (!cur_seg) { ++ xhci_warn(xhci, "Command ring mismatch, dequeue = %p %llx (dma)\n", ++ xhci->cmd_ring->dequeue, ++ (unsigned long long) ++ xhci_trb_virt_to_dma(xhci->cmd_ring->deq_seg, ++ xhci->cmd_ring->dequeue)); ++ xhci_debug_ring(xhci, xhci->cmd_ring); ++ xhci_dbg_ring_ptrs(xhci, xhci->cmd_ring); ++ return; ++ } ++ + /* find the command trb matched by cd from command ring */ + for (cmd_trb = xhci->cmd_ring->dequeue; + cmd_trb != xhci->cmd_ring->enqueue; +diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c +index f5c0f38..5a23f4d 100644 +--- a/drivers/usb/host/xhci.c ++++ b/drivers/usb/host/xhci.c +@@ -471,7 +471,8 @@ static bool compliance_mode_recovery_timer_quirk_check(void) + + if (strstr(dmi_product_name, "Z420") || + strstr(dmi_product_name, "Z620") || +- strstr(dmi_product_name, "Z820")) ++ strstr(dmi_product_name, "Z820") || ++ strstr(dmi_product_name, "Z1")) + return true; + + return false; +diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c +index 42de17b..d3addb2 100644 +--- a/drivers/usb/serial/mct_u232.c ++++ b/drivers/usb/serial/mct_u232.c +@@ -577,12 +577,14 @@ static void mct_u232_close(struct usb_serial_port *port) + { + dbg("%s port %d", __func__, port->number); + +- if (port->serial->dev) 
{ +- /* shutdown our urbs */ +- usb_kill_urb(port->write_urb); +- usb_kill_urb(port->read_urb); +- usb_kill_urb(port->interrupt_in_urb); +- } ++ /* ++ * Must kill the read urb as it is actually an interrupt urb, which ++ * generic close thus fails to kill. ++ */ ++ usb_kill_urb(port->read_urb); ++ usb_kill_urb(port->interrupt_in_urb); ++ ++ usb_serial_generic_close(port); + } /* mct_u232_close */ + + +diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c +index b150ed9..d481f80 100644 +--- a/drivers/usb/serial/mos7840.c ++++ b/drivers/usb/serial/mos7840.c +@@ -235,12 +235,10 @@ struct moschip_port { + int port_num; /*Actual port number in the device(1,2,etc) */ + struct urb *write_urb; /* write URB for this port */ + struct urb *read_urb; /* read URB for this port */ +- struct urb *int_urb; + __u8 shadowLCR; /* last LCR value received */ + __u8 shadowMCR; /* last MCR value received */ + char open; + char open_ports; +- char zombie; + wait_queue_head_t wait_chase; /* for handling sleeping while waiting for chase to finish */ + wait_queue_head_t delta_msr_wait; /* for handling sleeping while waiting for msr change to happen */ + int delta_msr_cond; +@@ -505,7 +503,6 @@ static void mos7840_control_callback(struct urb *urb) + unsigned char *data; + struct moschip_port *mos7840_port; + __u8 regval = 0x0; +- int result = 0; + int status = urb->status; + + mos7840_port = urb->context; +@@ -524,7 +521,7 @@ static void mos7840_control_callback(struct urb *urb) + default: + dbg("%s - nonzero urb status received: %d", __func__, + status); +- goto exit; ++ return; + } + + dbg("%s urb buffer size is %d", __func__, urb->actual_length); +@@ -537,17 +534,6 @@ static void mos7840_control_callback(struct urb *urb) + mos7840_handle_new_msr(mos7840_port, regval); + else if (mos7840_port->MsrLsr == 1) + mos7840_handle_new_lsr(mos7840_port, regval); +- +-exit: +- spin_lock(&mos7840_port->pool_lock); +- if (!mos7840_port->zombie) +- result = usb_submit_urb(mos7840_port->int_urb, GFP_ATOMIC); +- spin_unlock(&mos7840_port->pool_lock); +- if (result) { +- dev_err(&urb->dev->dev, +- "%s - Error %d submitting interrupt urb\n", +- __func__, result); +- } + } + + static int mos7840_get_reg(struct moschip_port *mcs, __u16 Wval, __u16 reg, +@@ -655,14 +641,7 @@ static void mos7840_interrupt_callback(struct urb *urb) + wreg = MODEM_STATUS_REGISTER; + break; + } +- spin_lock(&mos7840_port->pool_lock); +- if (!mos7840_port->zombie) { +- rv = mos7840_get_reg(mos7840_port, wval, wreg, &Data); +- } else { +- spin_unlock(&mos7840_port->pool_lock); +- return; +- } +- spin_unlock(&mos7840_port->pool_lock); ++ rv = mos7840_get_reg(mos7840_port, wval, wreg, &Data); + } + } + } +@@ -2594,7 +2573,6 @@ error: + kfree(mos7840_port->ctrl_buf); + usb_free_urb(mos7840_port->control_urb); + kfree(mos7840_port); +- serial->port[i] = NULL; + } + return status; + } +@@ -2625,9 +2603,6 @@ static void mos7840_disconnect(struct usb_serial *serial) + mos7840_port = mos7840_get_port_private(serial->port[i]); + dbg ("mos7840_port %d = %p", i, mos7840_port); + if (mos7840_port) { +- spin_lock_irqsave(&mos7840_port->pool_lock, flags); +- mos7840_port->zombie = 1; +- spin_unlock_irqrestore(&mos7840_port->pool_lock, flags); + usb_kill_urb(mos7840_port->control_urb); + } + } +@@ -2661,6 +2636,7 @@ static void mos7840_release(struct usb_serial *serial) + mos7840_port = mos7840_get_port_private(serial->port[i]); + dbg("mos7840_port %d = %p", i, mos7840_port); + if (mos7840_port) { ++ usb_free_urb(mos7840_port->control_urb); + 
kfree(mos7840_port->ctrl_buf); + kfree(mos7840_port->dr); + kfree(mos7840_port); +diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c +index c248a91..d6c5ed6 100644 +--- a/drivers/usb/serial/opticon.c ++++ b/drivers/usb/serial/opticon.c +@@ -160,7 +160,11 @@ static int send_control_msg(struct usb_serial_port *port, u8 requesttype, + { + struct usb_serial *serial = port->serial; + int retval; +- u8 buffer[2]; ++ u8 *buffer; ++ ++ buffer = kzalloc(1, GFP_KERNEL); ++ if (!buffer) ++ return -ENOMEM; + + buffer[0] = val; + /* Send the message to the vendor control endpoint +@@ -169,6 +173,7 @@ static int send_control_msg(struct usb_serial_port *port, u8 requesttype, + requesttype, + USB_DIR_OUT|USB_TYPE_VENDOR|USB_RECIP_INTERFACE, + 0, 0, buffer, 1, 0); ++ kfree(buffer); + + return retval; + } +@@ -292,7 +297,7 @@ static int opticon_write(struct tty_struct *tty, struct usb_serial_port *port, + if (!dr) { + dev_err(&port->dev, "out of memory\n"); + count = -ENOMEM; +- goto error; ++ goto error_no_dr; + } + + dr->bRequestType = USB_TYPE_VENDOR | USB_RECIP_INTERFACE | USB_DIR_OUT; +@@ -322,6 +327,8 @@ static int opticon_write(struct tty_struct *tty, struct usb_serial_port *port, + + return count; + error: ++ kfree(dr); ++error_no_dr: + usb_free_urb(urb); + error_no_urb: + kfree(buffer); +diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c +index 3fd4e6f..c334670 100644 +--- a/drivers/usb/serial/option.c ++++ b/drivers/usb/serial/option.c +@@ -503,11 +503,19 @@ static const struct option_blacklist_info net_intf5_blacklist = { + .reserved = BIT(5), + }; + ++static const struct option_blacklist_info net_intf6_blacklist = { ++ .reserved = BIT(6), ++}; ++ + static const struct option_blacklist_info zte_mf626_blacklist = { + .sendsetup = BIT(0) | BIT(1), + .reserved = BIT(4), + }; + ++static const struct option_blacklist_info zte_1255_blacklist = { ++ .reserved = BIT(3) | BIT(4), ++}; ++ + static const struct usb_device_id option_ids[] = { + { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) }, + { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA) }, +@@ -853,13 +861,19 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0113, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&net_intf5_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0117, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0118, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0121, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0118, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf5_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0121, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf5_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0122, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0123, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0124, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0125, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0126, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0123, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0124, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf5_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0125, 
0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf6_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0126, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf5_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0128, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0142, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0143, 0xff, 0xff, 0xff) }, +@@ -872,7 +886,8 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0156, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0157, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&net_intf5_blacklist }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0158, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0158, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf3_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0159, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0161, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0162, 0xff, 0xff, 0xff) }, +@@ -880,13 +895,22 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0165, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0167, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0191, 0xff, 0xff, 0xff), /* ZTE EuFi890 */ ++ .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0199, 0xff, 0xff, 0xff), /* ZTE MF820S */ ++ .driver_info = (kernel_ulong_t)&net_intf1_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0257, 0xff, 0xff, 0xff), /* ZTE MF821 */ ++ .driver_info = (kernel_ulong_t)&net_intf3_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0326, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1008, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1010, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1012, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1012, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1021, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf2_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1057, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1058, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1059, 0xff, 0xff, 0xff) }, +@@ -1002,18 +1026,24 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1169, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1170, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1244, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1245, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1245, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1246, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 
0x1247, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1247, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1248, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1249, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1250, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1251, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1252, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1252, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1253, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1254, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1255, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1256, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1254, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1255, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&zte_1255_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1256, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1257, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1258, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1259, 0xff, 0xff, 0xff) }, +@@ -1058,8 +1088,16 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1298, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1299, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1300, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1401, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf2_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1402, 0xff, 0xff, 0xff), + .driver_info = (kernel_ulong_t)&net_intf2_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1424, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf2_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1425, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf2_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x1426, 0xff, 0xff, 0xff), /* ZTE MF91 */ ++ .driver_info = (kernel_ulong_t)&net_intf2_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x2002, 0xff, + 0xff, 0xff), .driver_info = (kernel_ulong_t)&zte_k3765_z_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x2003, 0xff, 0xff, 0xff) }, +@@ -1071,15 +1109,21 @@ static const struct usb_device_id option_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0070, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0073, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0094, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0130, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0133, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0141, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0130, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf1_blacklist }, ++ { 
USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0133, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf3_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0141, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf5_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0147, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0152, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0168, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0168, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0170, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0176, 0xff, 0xff, 0xff) }, +- { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0178, 0xff, 0xff, 0xff) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0176, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf3_blacklist }, ++ { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0178, 0xff, 0xff, 0xff), ++ .driver_info = (kernel_ulong_t)&net_intf3_blacklist }, + + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_CDMA_TECH, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_AC8710, 0xff, 0xff, 0xff) }, +diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c +index 535d087..e1f1ebd 100644 +--- a/drivers/usb/serial/sierra.c ++++ b/drivers/usb/serial/sierra.c +@@ -171,7 +171,6 @@ static int sierra_probe(struct usb_serial *serial, + { + int result = 0; + struct usb_device *udev; +- struct sierra_intf_private *data; + u8 ifnum; + + udev = serial->dev; +@@ -199,11 +198,6 @@ static int sierra_probe(struct usb_serial *serial, + return -ENODEV; + } + +- data = serial->private = kzalloc(sizeof(struct sierra_intf_private), GFP_KERNEL); +- if (!data) +- return -ENOMEM; +- spin_lock_init(&data->susp_lock); +- + return result; + } + +@@ -915,6 +909,7 @@ static void sierra_dtr_rts(struct usb_serial_port *port, int on) + static int sierra_startup(struct usb_serial *serial) + { + struct usb_serial_port *port; ++ struct sierra_intf_private *intfdata; + struct sierra_port_private *portdata; + struct sierra_iface_info *himemoryp = NULL; + int i; +@@ -922,6 +917,14 @@ static int sierra_startup(struct usb_serial *serial) + + dev_dbg(&serial->dev->dev, "%s\n", __func__); + ++ intfdata = kzalloc(sizeof(*intfdata), GFP_KERNEL); ++ if (!intfdata) ++ return -ENOMEM; ++ ++ spin_lock_init(&intfdata->susp_lock); ++ ++ usb_set_serial_data(serial, intfdata); ++ + /* Set Device mode to D0 */ + sierra_set_power_state(serial->dev, 0x0000); + +@@ -937,7 +940,7 @@ static int sierra_startup(struct usb_serial *serial) + dev_dbg(&port->dev, "%s: kmalloc for " + "sierra_port_private (%d) failed!\n", + __func__, i); +- return -ENOMEM; ++ goto err; + } + spin_lock_init(&portdata->lock); + init_usb_anchor(&portdata->active); +@@ -974,6 +977,14 @@ static int sierra_startup(struct usb_serial *serial) + } + + return 0; ++err: ++ for (--i; i >= 0; --i) { ++ portdata = usb_get_serial_port_data(serial->port[i]); ++ kfree(portdata); ++ } ++ kfree(intfdata); ++ ++ return -ENOMEM; + } + + static void sierra_release(struct usb_serial *serial) +@@ -993,6 +1004,7 @@ static void sierra_release(struct usb_serial *serial) + continue; + kfree(portdata); + } ++ kfree(serial->private); + } + + #ifdef CONFIG_PM +diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c +index 5b073bc..59d646d 100644 +--- 
a/drivers/usb/serial/whiteheat.c ++++ b/drivers/usb/serial/whiteheat.c +@@ -576,6 +576,7 @@ no_firmware: + "%s: please contact support@connecttech.com\n", + serial->type->description); + kfree(result); ++ kfree(command); + return -ENODEV; + + no_command_private: +diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h +index 591f57f..fa8a1b2 100644 +--- a/drivers/usb/storage/unusual_devs.h ++++ b/drivers/usb/storage/unusual_devs.h +@@ -1004,6 +1004,12 @@ UNUSUAL_DEV( 0x07cf, 0x1001, 0x1000, 0x9999, + USB_SC_8070, USB_PR_CB, NULL, + US_FL_NEED_OVERRIDE | US_FL_FIX_INQUIRY ), + ++/* Submitted by Oleksandr Chumachenko */ ++UNUSUAL_DEV( 0x07cf, 0x1167, 0x0100, 0x0100, ++ "Casio", ++ "EX-N1 DigitalCamera", ++ USB_SC_8070, USB_PR_DEVICE, NULL, 0), ++ + /* Submitted by Hartmut Wahl */ + UNUSUAL_DEV( 0x0839, 0x000a, 0x0001, 0x0001, + "Samsung", +diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c +index 882a51f..b76071e 100644 +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -371,7 +371,8 @@ static void handle_rx(struct vhost_net *net) + .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE + }; + size_t total_len = 0; +- int err, headcount, mergeable; ++ int err, mergeable; ++ s16 headcount; + size_t vhost_hlen, sock_hlen; + size_t vhost_len, sock_len; + /* TODO: check that we are running from vhost_worker? */ +diff --git a/drivers/video/udlfb.c b/drivers/video/udlfb.c +index 41746bb..cb5988f 100644 +--- a/drivers/video/udlfb.c ++++ b/drivers/video/udlfb.c +@@ -646,7 +646,7 @@ static ssize_t dlfb_ops_write(struct fb_info *info, const char __user *buf, + result = fb_sys_write(info, buf, count, ppos); + + if (result > 0) { +- int start = max((int)(offset / info->fix.line_length) - 1, 0); ++ int start = max((int)(offset / info->fix.line_length), 0); + int lines = min((u32)((result / info->fix.line_length) + 1), + (u32)info->var.yres); + +diff --git a/drivers/video/via/via_clock.c b/drivers/video/via/via_clock.c +index af8f26b..db1e392 100644 +--- a/drivers/video/via/via_clock.c ++++ b/drivers/video/via/via_clock.c +@@ -25,6 +25,7 @@ + + #include + #include ++#include + #include "via_clock.h" + #include "global.h" + #include "debug.h" +@@ -289,6 +290,10 @@ static void dummy_set_pll(struct via_pll_config config) + printk(KERN_INFO "Using undocumented set PLL.\n%s", via_slap); + } + ++static void noop_set_clock_state(u8 state) ++{ ++} ++ + void via_clock_init(struct via_clock *clock, int gfx_chip) + { + switch (gfx_chip) { +@@ -346,4 +351,18 @@ void via_clock_init(struct via_clock *clock, int gfx_chip) + break; + + } ++ ++ if (machine_is_olpc()) { ++ /* The OLPC XO-1.5 cannot suspend/resume reliably if the ++ * IGA1/IGA2 clocks are set as on or off (memory rot ++ * occasionally happens during suspend under such ++ * configurations). ++ * ++ * The only known stable scenario is to leave this bits as-is, ++ * which in their default states are documented to enable the ++ * clock only when it is needed. ++ */ ++ clock->set_primary_clock_state = noop_set_clock_state; ++ clock->set_secondary_clock_state = noop_set_clock_state; ++ } + } +diff --git a/fs/ceph/export.c b/fs/ceph/export.c +index 9fbcdec..b001030 100644 +--- a/fs/ceph/export.c ++++ b/fs/ceph/export.c +@@ -91,7 +91,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, + * FIXME: we should try harder by querying the mds for the ino. 
+ */ + static struct dentry *__fh_to_dentry(struct super_block *sb, +- struct ceph_nfs_fh *fh) ++ struct ceph_nfs_fh *fh, int fh_len) + { + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct inode *inode; +@@ -99,6 +99,9 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, + struct ceph_vino vino; + int err; + ++ if (fh_len < sizeof(*fh) / 4) ++ return ERR_PTR(-ESTALE); ++ + dout("__fh_to_dentry %llx\n", fh->ino); + vino.ino = fh->ino; + vino.snap = CEPH_NOSNAP; +@@ -142,7 +145,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, + * convert connectable fh to dentry + */ + static struct dentry *__cfh_to_dentry(struct super_block *sb, +- struct ceph_nfs_confh *cfh) ++ struct ceph_nfs_confh *cfh, int fh_len) + { + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct inode *inode; +@@ -150,6 +153,9 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, + struct ceph_vino vino; + int err; + ++ if (fh_len < sizeof(*cfh) / 4) ++ return ERR_PTR(-ESTALE); ++ + dout("__cfh_to_dentry %llx (%llx/%x)\n", + cfh->ino, cfh->parent_ino, cfh->parent_name_hash); + +@@ -199,9 +205,11 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) + { + if (fh_type == 1) +- return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw); ++ return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw, ++ fh_len); + else +- return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw); ++ return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw, ++ fh_len); + } + + /* +@@ -222,6 +230,8 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, + + if (fh_type == 1) + return ERR_PTR(-ESTALE); ++ if (fh_len < sizeof(*cfh) / 4) ++ return ERR_PTR(-ESTALE); + + pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, + cfh->parent_name_hash); +diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c +index 51352de..f854cf9 100644 +--- a/fs/compat_ioctl.c ++++ b/fs/compat_ioctl.c +@@ -210,6 +210,8 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, + + err = get_user(palp, &up->palette); + err |= get_user(length, &up->length); ++ if (err) ++ return -EFAULT; + + up_native = compat_alloc_user_space(sizeof(struct video_spu_palette)); + err = put_user(compat_ptr(palp), &up_native->palette); +diff --git a/fs/exec.c b/fs/exec.c +index 160cd2f..121ccae 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1095,7 +1095,7 @@ int flush_old_exec(struct linux_binprm * bprm) + bprm->mm = NULL; /* We're using it now */ + + set_fs(USER_DS); +- current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD); ++ current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD | PF_NOFREEZE); + flush_thread(); + current->personality &= ~bprm->per_clear; + +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index 54f2bdc..191580a 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -2715,6 +2715,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) + #define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ + #define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ + ++#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ ++#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ ++ + /* + * ext4_split_extent_at() splits an extent at given block. 
+ * +@@ -2750,6 +2753,9 @@ static int ext4_split_extent_at(handle_t *handle, + unsigned int ee_len, depth; + int err = 0; + ++ BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == ++ (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); ++ + ext_debug("ext4_split_extents_at: inode %lu, logical" + "block %llu\n", inode->i_ino, (unsigned long long)split); + +@@ -2808,7 +2814,14 @@ static int ext4_split_extent_at(handle_t *handle, + + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { +- err = ext4_ext_zeroout(inode, &orig_ex); ++ if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { ++ if (split_flag & EXT4_EXT_DATA_VALID1) ++ err = ext4_ext_zeroout(inode, ex2); ++ else ++ err = ext4_ext_zeroout(inode, ex); ++ } else ++ err = ext4_ext_zeroout(inode, &orig_ex); ++ + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ +@@ -2861,12 +2874,13 @@ static int ext4_split_extent(handle_t *handle, + uninitialized = ext4_ext_is_uninitialized(ex); + + if (map->m_lblk + map->m_len < ee_block + ee_len) { +- split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? +- EXT4_EXT_MAY_ZEROOUT : 0; ++ split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; + flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; + if (uninitialized) + split_flag1 |= EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2; ++ if (split_flag & EXT4_EXT_DATA_VALID2) ++ split_flag1 |= EXT4_EXT_DATA_VALID1; + err = ext4_split_extent_at(handle, inode, path, + map->m_lblk + map->m_len, split_flag1, flags1); + if (err) +@@ -2879,8 +2893,8 @@ static int ext4_split_extent(handle_t *handle, + return PTR_ERR(path); + + if (map->m_lblk >= ee_block) { +- split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? +- EXT4_EXT_MAY_ZEROOUT : 0; ++ split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT | ++ EXT4_EXT_DATA_VALID2); + if (uninitialized) + split_flag1 |= EXT4_EXT_MARK_UNINIT1; + if (split_flag & EXT4_EXT_MARK_UNINIT2) +@@ -3158,26 +3172,47 @@ static int ext4_split_unwritten_extents(handle_t *handle, + + split_flag |= ee_block + ee_len <= eof_block ? 
EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= EXT4_EXT_MARK_UNINIT2; +- ++ if (flags & EXT4_GET_BLOCKS_CONVERT) ++ split_flag |= EXT4_EXT_DATA_VALID2; + flags |= EXT4_GET_BLOCKS_PRE_IO; + return ext4_split_extent(handle, inode, path, map, split_flag, flags); + } + + static int ext4_convert_unwritten_extents_endio(handle_t *handle, +- struct inode *inode, +- struct ext4_ext_path *path) ++ struct inode *inode, ++ struct ext4_map_blocks *map, ++ struct ext4_ext_path *path) + { + struct ext4_extent *ex; ++ ext4_lblk_t ee_block; ++ unsigned int ee_len; + int depth; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; ++ ee_block = le32_to_cpu(ex->ee_block); ++ ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, +- (unsigned long long)le32_to_cpu(ex->ee_block), +- ext4_ext_get_actual_len(ex)); ++ (unsigned long long)ee_block, ee_len); ++ ++ /* If extent is larger than requested then split is required */ ++ if (ee_block != map->m_lblk || ee_len > map->m_len) { ++ err = ext4_split_unwritten_extents(handle, inode, map, path, ++ EXT4_GET_BLOCKS_CONVERT); ++ if (err < 0) ++ goto out; ++ ext4_ext_drop_refs(path); ++ path = ext4_ext_find_extent(inode, map->m_lblk, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ goto out; ++ } ++ depth = ext_depth(inode); ++ ex = path[depth].p_ext; ++ } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) +@@ -3479,7 +3514,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, + } + /* IO end_io complete, convert the filled extent to written */ + if ((flags & EXT4_GET_BLOCKS_CONVERT)) { +- ret = ext4_convert_unwritten_extents_endio(handle, inode, ++ ret = ext4_convert_unwritten_extents_endio(handle, inode, map, + path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); +diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c +index fe9945f..5235d6e 100644 +--- a/fs/gfs2/export.c ++++ b/fs/gfs2/export.c +@@ -167,6 +167,8 @@ static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, + case GFS2_SMALL_FH_SIZE: + case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: ++ if (fh_len < GFS2_SMALL_FH_SIZE) ++ return NULL; + this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; + this.no_formal_ino |= be32_to_cpu(fh[1]); + this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; +@@ -186,6 +188,8 @@ static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid, + switch (fh_type) { + case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: ++ if (fh_len < GFS2_LARGE_FH_SIZE) ++ return NULL; + parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; + parent.no_formal_ino |= be32_to_cpu(fh[5]); + parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; +diff --git a/fs/isofs/export.c b/fs/isofs/export.c +index dd4687f..516eb21 100644 +--- a/fs/isofs/export.c ++++ b/fs/isofs/export.c +@@ -179,7 +179,7 @@ static struct dentry *isofs_fh_to_parent(struct super_block *sb, + { + struct isofs_fid *ifid = (struct isofs_fid *)fid; + +- if (fh_type != 2) ++ if (fh_len < 2 || fh_type != 2) + return NULL; + + return isofs_export_iget(sb, +diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c +index 8799207..931bf95 100644 +--- a/fs/jbd/commit.c ++++ b/fs/jbd/commit.c +@@ -86,7 +86,12 @@ nope: + static void release_data_buffer(struct buffer_head *bh) + { + if (buffer_freed(bh)) { ++ WARN_ON_ONCE(buffer_dirty(bh)); + clear_buffer_freed(bh); ++ clear_buffer_mapped(bh); ++ clear_buffer_new(bh); ++ 
clear_buffer_req(bh); ++ bh->b_bdev = NULL; + release_buffer_page(bh); + } else + put_bh(bh); +@@ -847,17 +852,35 @@ restart_loop: + * there's no point in keeping a checkpoint record for + * it. */ + +- /* A buffer which has been freed while still being +- * journaled by a previous transaction may end up still +- * being dirty here, but we want to avoid writing back +- * that buffer in the future after the "add to orphan" +- * operation been committed, That's not only a performance +- * gain, it also stops aliasing problems if the buffer is +- * left behind for writeback and gets reallocated for another +- * use in a different page. */ +- if (buffer_freed(bh) && !jh->b_next_transaction) { +- clear_buffer_freed(bh); +- clear_buffer_jbddirty(bh); ++ /* ++ * A buffer which has been freed while still being journaled by ++ * a previous transaction. ++ */ ++ if (buffer_freed(bh)) { ++ /* ++ * If the running transaction is the one containing ++ * "add to orphan" operation (b_next_transaction != ++ * NULL), we have to wait for that transaction to ++ * commit before we can really get rid of the buffer. ++ * So just clear b_modified to not confuse transaction ++ * credit accounting and refile the buffer to ++ * BJ_Forget of the running transaction. If the just ++ * committed transaction contains "add to orphan" ++ * operation, we can completely invalidate the buffer ++ * now. We are rather throughout in that since the ++ * buffer may be still accessible when blocksize < ++ * pagesize and it is attached to the last partial ++ * page. ++ */ ++ jh->b_modified = 0; ++ if (!jh->b_next_transaction) { ++ clear_buffer_freed(bh); ++ clear_buffer_jbddirty(bh); ++ clear_buffer_mapped(bh); ++ clear_buffer_new(bh); ++ clear_buffer_req(bh); ++ bh->b_bdev = NULL; ++ } + } + + if (buffer_jbddirty(bh)) { +diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c +index 7e59c6e..edac004 100644 +--- a/fs/jbd/transaction.c ++++ b/fs/jbd/transaction.c +@@ -1839,15 +1839,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) + * We're outside-transaction here. Either or both of j_running_transaction + * and j_committing_transaction may be NULL. + */ +-static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) ++static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, ++ int partial_page) + { + transaction_t *transaction; + struct journal_head *jh; + int may_free = 1; +- int ret; + + BUFFER_TRACE(bh, "entry"); + ++retry: + /* + * It is safe to proceed here without the j_list_lock because the + * buffers cannot be stolen by try_to_free_buffers as long as we are +@@ -1875,10 +1876,18 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the +- * buffer can be reallocated and used by a different page. ++ * block can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. ++ * ++ * Also we have to clear buffer_mapped flag of a truncated buffer ++ * because the buffer_head may be attached to the page straddling ++ * i_size (can happen only when blocksize < pagesize) and thus the ++ * buffer_head can be reused when the file is extended again. 
So we end ++ * up keeping around invalidated buffers attached to transactions' ++ * BJ_Forget list just to stop checkpointing code from cleaning up ++ * the transaction this buffer was modified in. + */ + transaction = jh->b_transaction; + if (transaction == NULL) { +@@ -1905,13 +1914,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) + * committed, the buffer won't be needed any + * longer. */ + JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); +- ret = __dispose_buffer(jh, ++ may_free = __dispose_buffer(jh, + journal->j_running_transaction); +- journal_put_journal_head(jh); +- spin_unlock(&journal->j_list_lock); +- jbd_unlock_bh_state(bh); +- spin_unlock(&journal->j_state_lock); +- return ret; ++ goto zap_buffer; + } else { + /* There is no currently-running transaction. So the + * orphan record which we wrote for this file must have +@@ -1919,13 +1924,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) + * the committing transaction, if it exists. */ + if (journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "give to committing trans"); +- ret = __dispose_buffer(jh, ++ may_free = __dispose_buffer(jh, + journal->j_committing_transaction); +- journal_put_journal_head(jh); +- spin_unlock(&journal->j_list_lock); +- jbd_unlock_bh_state(bh); +- spin_unlock(&journal->j_state_lock); +- return ret; ++ goto zap_buffer; + } else { + /* The orphan record's transaction has + * committed. We can cleanse this buffer */ +@@ -1946,10 +1947,24 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) + } + /* + * The buffer is committing, we simply cannot touch +- * it. So we just set j_next_transaction to the +- * running transaction (if there is one) and mark +- * buffer as freed so that commit code knows it should +- * clear dirty bits when it is done with the buffer. ++ * it. If the page is straddling i_size we have to wait ++ * for commit and try again. ++ */ ++ if (partial_page) { ++ tid_t tid = journal->j_committing_transaction->t_tid; ++ ++ journal_put_journal_head(jh); ++ spin_unlock(&journal->j_list_lock); ++ jbd_unlock_bh_state(bh); ++ spin_unlock(&journal->j_state_lock); ++ log_wait_commit(journal, tid); ++ goto retry; ++ } ++ /* ++ * OK, buffer won't be reachable after truncate. We just set ++ * j_next_transaction to the running transaction (if there is ++ * one) and mark buffer as freed so that commit code knows it ++ * should clear dirty bits when it is done with the buffer. + */ + set_buffer_freed(bh); + if (journal->j_running_transaction && buffer_jbddirty(bh)) +@@ -1972,6 +1987,14 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) + } + + zap_buffer: ++ /* ++ * This is tricky. Although the buffer is truncated, it may be reused ++ * if blocksize < pagesize and it is attached to the page straddling ++ * EOF. Since the buffer might have been added to BJ_Forget list of the ++ * running transaction, journal_get_write_access() won't clear ++ * b_modified and credit accounting gets confused. So clear b_modified ++ * here. 
*/ ++ jh->b_modified = 0; + journal_put_journal_head(jh); + zap_buffer_no_jh: + spin_unlock(&journal->j_list_lock); +@@ -2020,7 +2043,8 @@ void journal_invalidatepage(journal_t *journal, + if (offset <= curr_off) { + /* This block is wholly outside the truncation point */ + lock_buffer(bh); +- may_free &= journal_unmap_buffer(journal, bh); ++ may_free &= journal_unmap_buffer(journal, bh, ++ offset > 0); + unlock_buffer(bh); + } + curr_off = next_off; +diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c +index 36057ce..6e2a2d5 100644 +--- a/fs/lockd/clntxdr.c ++++ b/fs/lockd/clntxdr.c +@@ -223,7 +223,7 @@ static void encode_nlm_stat(struct xdr_stream *xdr, + { + __be32 *p; + +- BUG_ON(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD); ++ WARN_ON_ONCE(be32_to_cpu(stat) > NLM_LCK_DENIED_GRACE_PERIOD); + p = xdr_reserve_space(xdr, 4); + *p = stat; + } +diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c +index df753a1..23d7451 100644 +--- a/fs/lockd/mon.c ++++ b/fs/lockd/mon.c +@@ -40,7 +40,6 @@ struct nsm_args { + u32 proc; + + char *mon_name; +- char *nodename; + }; + + struct nsm_res { +@@ -94,7 +93,6 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) + .vers = 3, + .proc = NLMPROC_NSM_NOTIFY, + .mon_name = nsm->sm_mon_name, +- .nodename = utsname()->nodename, + }; + struct rpc_message msg = { + .rpc_argp = &args, +@@ -431,7 +429,7 @@ static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) + { + __be32 *p; + +- encode_nsm_string(xdr, argp->nodename); ++ encode_nsm_string(xdr, utsname()->nodename); + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(argp->prog); + *p++ = cpu_to_be32(argp->vers); +diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c +index d27aab1..d413af3 100644 +--- a/fs/lockd/svcproc.c ++++ b/fs/lockd/svcproc.c +@@ -67,7 +67,8 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, + + /* Obtain file pointer. Not used by FREE_ALL call. 
*/ + if (filp != NULL) { +- if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0) ++ error = cast_status(nlm_lookup_file(rqstp, &file, &lock->fh)); ++ if (error != 0) + goto no_locks; + *filp = file; + +diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c +index 4cfe260..d225b51 100644 +--- a/fs/nfsd/nfs4state.c ++++ b/fs/nfsd/nfs4state.c +@@ -3673,6 +3673,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + + nfsd4_close_open_stateid(stp); ++ release_last_closed_stateid(oo); + oo->oo_last_closed_stid = stp; + + /* place unused nfs4_stateowners on so_close_lru list to be +diff --git a/fs/proc/stat.c b/fs/proc/stat.c +index 0855e6f..4c9a859 100644 +--- a/fs/proc/stat.c ++++ b/fs/proc/stat.c +@@ -24,11 +24,14 @@ + + static cputime64_t get_idle_time(int cpu) + { +- u64 idle_time = get_cpu_idle_time_us(cpu, NULL); ++ u64 idle_time = -1ULL; + cputime64_t idle; + ++ if (cpu_online(cpu)) ++ idle_time = get_cpu_idle_time_us(cpu, NULL); ++ + if (idle_time == -1ULL) { +- /* !NO_HZ so we can rely on cpustat.idle */ ++ /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ + idle = kstat_cpu(cpu).cpustat.idle; + idle = cputime64_add(idle, arch_idle_time(cpu)); + } else +@@ -39,11 +42,14 @@ static cputime64_t get_idle_time(int cpu) + + static cputime64_t get_iowait_time(int cpu) + { +- u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); ++ u64 iowait_time = -1ULL; + cputime64_t iowait; + ++ if (cpu_online(cpu)) ++ iowait_time = get_cpu_iowait_time_us(cpu, NULL); ++ + if (iowait_time == -1ULL) +- /* !NO_HZ so we can rely on cpustat.iowait */ ++ /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ + iowait = kstat_cpu(cpu).cpustat.iowait; + else + iowait = usecs_to_cputime64(iowait_time); +diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c +index 950f13a..5809abb 100644 +--- a/fs/reiserfs/inode.c ++++ b/fs/reiserfs/inode.c +@@ -1573,8 +1573,10 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + reiserfs_warning(sb, "reiserfs-13077", + "nfsd/reiserfs, fhtype=%d, len=%d - odd", + fh_type, fh_len); +- fh_type = 5; ++ fh_type = fh_len; + } ++ if (fh_len < 2) ++ return NULL; + + return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1], + (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0); +@@ -1583,6 +1585,8 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) + { ++ if (fh_type > fh_len) ++ fh_type = fh_len; + if (fh_type < 4) + return NULL; + +diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c +index 7fdf6a7..fabbb81 100644 +--- a/fs/sysfs/dir.c ++++ b/fs/sysfs/dir.c +@@ -430,20 +430,18 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) + /** + * sysfs_pathname - return full path to sysfs dirent + * @sd: sysfs_dirent whose path we want +- * @path: caller allocated buffer ++ * @path: caller allocated buffer of size PATH_MAX + * + * Gives the name "/" to the sysfs_root entry; any path returned + * is relative to wherever sysfs is mounted. 
+- * +- * XXX: does no error checking on @path size + */ + static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) + { + if (sd->s_parent) { + sysfs_pathname(sd->s_parent, path); +- strcat(path, "/"); ++ strlcat(path, "/", PATH_MAX); + } +- strcat(path, sd->s_name); ++ strlcat(path, sd->s_name, PATH_MAX); + return path; + } + +@@ -476,9 +474,11 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) + char *path = kzalloc(PATH_MAX, GFP_KERNEL); + WARN(1, KERN_WARNING + "sysfs: cannot create duplicate filename '%s'\n", +- (path == NULL) ? sd->s_name : +- strcat(strcat(sysfs_pathname(acxt->parent_sd, path), "/"), +- sd->s_name)); ++ (path == NULL) ? sd->s_name ++ : (sysfs_pathname(acxt->parent_sd, path), ++ strlcat(path, "/", PATH_MAX), ++ strlcat(path, sd->s_name, PATH_MAX), ++ path)); + kfree(path); + } + +diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c +index 558910f..5703fb8 100644 +--- a/fs/xfs/xfs_export.c ++++ b/fs/xfs/xfs_export.c +@@ -195,6 +195,9 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid, + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; + struct inode *inode = NULL; + ++ if (fh_len < xfs_fileid_length(fileid_type)) ++ return NULL; ++ + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino, +diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h +index 12d5543..c944c4f 100644 +--- a/include/linux/if_vlan.h ++++ b/include/linux/if_vlan.h +@@ -97,6 +97,8 @@ static inline int is_vlan_dev(struct net_device *dev) + } + + #define vlan_tx_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) ++#define vlan_tx_nonzero_tag_present(__skb) \ ++ (vlan_tx_tag_present(__skb) && ((__skb)->vlan_tci & VLAN_VID_MASK)) + #define vlan_tx_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) + + #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) +@@ -106,7 +108,7 @@ extern struct net_device *__vlan_find_dev_deep(struct net_device *real_dev, + extern struct net_device *vlan_dev_real_dev(const struct net_device *dev); + extern u16 vlan_dev_vlan_id(const struct net_device *dev); + +-extern bool vlan_do_receive(struct sk_buff **skb, bool last_handler); ++extern bool vlan_do_receive(struct sk_buff **skb); + extern struct sk_buff *vlan_untag(struct sk_buff *skb); + + #else +@@ -128,10 +130,8 @@ static inline u16 vlan_dev_vlan_id(const struct net_device *dev) + return 0; + } + +-static inline bool vlan_do_receive(struct sk_buff **skb, bool last_handler) ++static inline bool vlan_do_receive(struct sk_buff **skb) + { +- if (((*skb)->vlan_tci & VLAN_VID_MASK) && last_handler) +- (*skb)->pkt_type = PACKET_OTHERHOST; + return false; + } + +diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h +index 904131b..b25b09b 100644 +--- a/include/linux/mtd/nand.h ++++ b/include/linux/mtd/nand.h +@@ -215,9 +215,6 @@ typedef enum { + #define NAND_SUBPAGE_READ(chip) ((chip->ecc.mode == NAND_ECC_SOFT) \ + && (chip->page_shift > 9)) + +-/* Mask to zero out the chip options, which come from the id table */ +-#define NAND_CHIPOPTIONS_MSK (0x0000ffff & ~NAND_NO_AUTOINCR) +- + /* Non chip related options */ + /* This option skips the bbt scan during initialization. 
*/ + #define NAND_SKIP_BBTSCAN 0x00010000 +diff --git a/include/linux/netfilter/xt_set.h b/include/linux/netfilter/xt_set.h +index c0405ac..e3a9978 100644 +--- a/include/linux/netfilter/xt_set.h ++++ b/include/linux/netfilter/xt_set.h +@@ -58,8 +58,8 @@ struct xt_set_info_target_v1 { + struct xt_set_info_target_v2 { + struct xt_set_info add_set; + struct xt_set_info del_set; +- u32 flags; +- u32 timeout; ++ __u32 flags; ++ __u32 timeout; + }; + + #endif /*_XT_SET_H*/ +diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h +index a88fb69..ea6f8a4 100644 +--- a/include/net/netfilter/nf_conntrack_ecache.h ++++ b/include/net/netfilter/nf_conntrack_ecache.h +@@ -18,6 +18,7 @@ struct nf_conntrack_ecache { + u16 ctmask; /* bitmask of ct events to be delivered */ + u16 expmask; /* bitmask of expect events to be delivered */ + u32 pid; /* netlink pid of destroyer */ ++ struct timer_list timeout; + }; + + static inline struct nf_conntrack_ecache * +diff --git a/kernel/cgroup.c b/kernel/cgroup.c +index cdc0354..6337535 100644 +--- a/kernel/cgroup.c ++++ b/kernel/cgroup.c +@@ -1803,9 +1803,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + * trading it for newcg is protected by cgroup_mutex, we're safe to drop + * it here; it will be freed under RCU. + */ +- put_css_set(oldcg); +- + set_bit(CGRP_RELEASABLE, &oldcgrp->flags); ++ put_css_set(oldcg); + return 0; + } + +diff --git a/kernel/module.c b/kernel/module.c +index 6969ef0..6c8fa34 100644 +--- a/kernel/module.c ++++ b/kernel/module.c +@@ -2659,6 +2659,10 @@ static int check_module_license_and_versions(struct module *mod) + if (strcmp(mod->name, "driverloader") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + ++ /* lve claims to be GPL but upstream won't provide source */ ++ if (strcmp(mod->name, "lve") == 0) ++ add_taint_module(mod, TAINT_PROPRIETARY_MODULE); ++ + #ifdef CONFIG_MODVERSIONS + if ((mod->num_syms && !mod->crcs) + || (mod->num_gpl_syms && !mod->gpl_crcs) +diff --git a/kernel/sys.c b/kernel/sys.c +index c504302..d7c4ab0 100644 +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -1171,15 +1171,16 @@ DECLARE_RWSEM(uts_sem); + * Work around broken programs that cannot handle "Linux 3.0". + * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 + */ +-static int override_release(char __user *release, int len) ++static int override_release(char __user *release, size_t len) + { + int ret = 0; +- char buf[65]; + + if (current->personality & UNAME26) { +- char *rest = UTS_RELEASE; ++ const char *rest = UTS_RELEASE; ++ char buf[65] = { 0 }; + int ndots = 0; + unsigned v; ++ size_t copy; + + while (*rest) { + if (*rest == '.' 
&& ++ndots >= 3) +@@ -1189,8 +1190,9 @@ static int override_release(char __user *release, int len) + rest++; + } + v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; +- snprintf(buf, len, "2.6.%u%s", v, rest); +- ret = copy_to_user(release, buf, len); ++ copy = min(sizeof(buf), max_t(size_t, 1, len)); ++ copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ++ ret = copy_to_user(release, buf, copy + 1); + } + return ret; + } +diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c +index 5ee1ac0..cb7f33e 100644 +--- a/kernel/time/timekeeping.c ++++ b/kernel/time/timekeeping.c +@@ -992,7 +992,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) + } + + /* Accumulate raw time */ +- raw_nsecs = timekeeper.raw_interval << shift; ++ raw_nsecs = (u64)timekeeper.raw_interval << shift; + raw_nsecs += raw_time.tv_nsec; + if (raw_nsecs >= NSEC_PER_SEC) { + u64 raw_secs = raw_nsecs; +diff --git a/kernel/timer.c b/kernel/timer.c +index 9c3c62b..c219db6 100644 +--- a/kernel/timer.c ++++ b/kernel/timer.c +@@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64); + #define TVR_SIZE (1 << TVR_BITS) + #define TVN_MASK (TVN_SIZE - 1) + #define TVR_MASK (TVR_SIZE - 1) ++#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) + + struct tvec { + struct list_head vec[TVN_SIZE]; +@@ -356,11 +357,12 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) + vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); + } else { + int i; +- /* If the timeout is larger than 0xffffffff on 64-bit +- * architectures then we use the maximum timeout: ++ /* If the timeout is larger than MAX_TVAL (on 64-bit ++ * architectures or with CONFIG_BASE_SMALL=1) then we ++ * use the maximum timeout. + */ +- if (idx > 0xffffffffUL) { +- idx = 0xffffffffUL; ++ if (idx > MAX_TVAL) { ++ idx = MAX_TVAL; + expires = idx + base->timer_jiffies; + } + i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; +diff --git a/lib/genalloc.c b/lib/genalloc.c +index f352cc4..716f947 100644 +--- a/lib/genalloc.c ++++ b/lib/genalloc.c +@@ -176,7 +176,7 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy + struct gen_pool_chunk *chunk; + int nbits = size >> pool->min_alloc_order; + int nbytes = sizeof(struct gen_pool_chunk) + +- (nbits + BITS_PER_BYTE - 1) / BITS_PER_BYTE; ++ BITS_TO_LONGS(nbits) * sizeof(long); + + chunk = kmalloc_node(nbytes, GFP_KERNEL | __GFP_ZERO, nid); + if (unlikely(chunk == NULL)) +diff --git a/mm/rmap.c b/mm/rmap.c +index a4fd368..8685697 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -56,6 +56,7 @@ + #include + #include + #include ++#include + + #include + +@@ -935,11 +936,8 @@ int page_mkclean(struct page *page) + + if (page_mapped(page)) { + struct address_space *mapping = page_mapping(page); +- if (mapping) { ++ if (mapping) + ret = page_mkclean_file(mapping, page); +- if (page_test_and_clear_dirty(page_to_pfn(page), 1)) +- ret = 1; +- } + } + + return ret; +@@ -1120,6 +1118,8 @@ void page_add_file_rmap(struct page *page) + */ + void page_remove_rmap(struct page *page) + { ++ struct address_space *mapping = page_mapping(page); ++ + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) + return; +@@ -1130,8 +1130,19 @@ void page_remove_rmap(struct page *page) + * this if the page is anon, so about to be freed; but perhaps + * not if it's in swapcache - there might be another pte slot + * containing the swap entry, but page not yet written to swap. 
++ * ++ * And we can skip it on file pages, so long as the filesystem ++ * participates in dirty tracking; but need to catch shm and tmpfs ++ * and ramfs pages which have been modified since creation by read ++ * fault. ++ * ++ * Note that mapping must be decided above, before decrementing ++ * mapcount (which luckily provides a barrier): once page is unmapped, ++ * it could be truncated and page->mapping reset to NULL at any moment. ++ * Note also that we are relying on page_mapping(page) to set mapping ++ * to &swapper_space when PageSwapCache(page). + */ +- if ((!PageAnon(page) || PageSwapCache(page)) && ++ if (mapping && !mapping_cap_account_dirty(mapping) && + page_test_and_clear_dirty(page_to_pfn(page), 1)) + set_page_dirty(page); + /* +diff --git a/mm/shmem.c b/mm/shmem.c +index 7a82174..126ca35 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -1962,12 +1962,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, + { + struct inode *inode; + struct dentry *dentry = NULL; +- u64 inum = fid->raw[2]; +- inum = (inum << 32) | fid->raw[1]; ++ u64 inum; + + if (fh_len < 3) + return NULL; + ++ inum = fid->raw[2]; ++ inum = (inum << 32) | fid->raw[1]; ++ + inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), + shmem_match, fid->raw); + if (inode) { +diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c +index 9ddbd4e..e860a4f 100644 +--- a/net/8021q/vlan_core.c ++++ b/net/8021q/vlan_core.c +@@ -5,7 +5,7 @@ + #include + #include "vlan.h" + +-bool vlan_do_receive(struct sk_buff **skbp, bool last_handler) ++bool vlan_do_receive(struct sk_buff **skbp) + { + struct sk_buff *skb = *skbp; + u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK; +@@ -13,14 +13,8 @@ bool vlan_do_receive(struct sk_buff **skbp, bool last_handler) + struct vlan_pcpu_stats *rx_stats; + + vlan_dev = vlan_find_dev(skb->dev, vlan_id); +- if (!vlan_dev) { +- /* Only the last call to vlan_do_receive() should change +- * pkt_type to PACKET_OTHERHOST +- */ +- if (vlan_id && last_handler) +- skb->pkt_type = PACKET_OTHERHOST; ++ if (!vlan_dev) + return false; +- } + + skb = *skbp = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) +diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c +index c27b4e3..1849ee0 100644 +--- a/net/bluetooth/smp.c ++++ b/net/bluetooth/smp.c +@@ -30,6 +30,8 @@ + + #define SMP_TIMEOUT 30000 /* 30 seconds */ + ++#define AUTH_REQ_MASK 0x07 ++ + static inline void swap128(u8 src[16], u8 dst[16]) + { + int i; +@@ -206,7 +208,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, + req->max_key_size = SMP_MAX_ENC_KEY_SIZE; + req->init_key_dist = dist_keys; + req->resp_key_dist = dist_keys; +- req->auth_req = authreq; ++ req->auth_req = (authreq & AUTH_REQ_MASK); + return; + } + +@@ -215,7 +217,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn, + rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE; + rsp->init_key_dist = req->init_key_dist & dist_keys; + rsp->resp_key_dist = req->resp_key_dist & dist_keys; +- rsp->auth_req = authreq; ++ rsp->auth_req = (authreq & AUTH_REQ_MASK); + } + + static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) +diff --git a/net/core/dev.c b/net/core/dev.c +index abe1147..f500a69 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -3278,18 +3278,18 @@ another_round: + ncls: + #endif + +- rx_handler = rcu_dereference(skb->dev->rx_handler); + if (vlan_tx_tag_present(skb)) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } +- if (vlan_do_receive(&skb, !rx_handler)) ++ if (vlan_do_receive(&skb)) + goto 
another_round; + else if (unlikely(!skb)) + goto out; + } + ++ rx_handler = rcu_dereference(skb->dev->rx_handler); + if (rx_handler) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); +@@ -3309,6 +3309,9 @@ ncls: + } + } + ++ if (vlan_tx_nonzero_tag_present(skb)) ++ skb->pkt_type = PACKET_OTHERHOST; ++ + /* deliver only exact match when indicated */ + null_or_dev = deliver_exact ? skb->dev : NULL; + +diff --git a/net/core/neighbour.c b/net/core/neighbour.c +index 7aafaed..5b9709f 100644 +--- a/net/core/neighbour.c ++++ b/net/core/neighbour.c +@@ -1254,8 +1254,6 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) + if (!dst) + goto discard; + +- __skb_pull(skb, skb_network_offset(skb)); +- + if (!neigh_event_send(neigh, skb)) { + int err; + struct net_device *dev = neigh->dev; +@@ -1265,6 +1263,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) + neigh_hh_init(neigh, dst); + + do { ++ __skb_pull(skb, skb_network_offset(skb)); + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); +@@ -1295,9 +1294,8 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) + unsigned int seq; + int err; + +- __skb_pull(skb, skb_network_offset(skb)); +- + do { ++ __skb_pull(skb, skb_network_offset(skb)); + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); +diff --git a/net/core/pktgen.c b/net/core/pktgen.c +index df878de..7bc9991 100644 +--- a/net/core/pktgen.c ++++ b/net/core/pktgen.c +@@ -2935,7 +2935,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, + sizeof(struct ipv6hdr) - sizeof(struct udphdr) - + pkt_dev->pkt_overhead; + +- if (datalen < sizeof(struct pktgen_hdr)) { ++ if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) { + datalen = sizeof(struct pktgen_hdr); + if (net_ratelimit()) + pr_info("increased datalen to %d\n", datalen); +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index de69cec..58c09a0 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -651,10 +651,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) + arg.csumoffset = offsetof(struct tcphdr, check) / 2; + arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; + /* When socket is gone, all binding information is lost. +- * routing might fail in this case. using iif for oif to +- * make sure we can deliver it ++ * routing might fail in this case. No choice here, if we choose to force ++ * input interface, we will misroute in case of asymmetric route. + */ +- arg.bound_dev_if = sk ? 
sk->sk_bound_dev_if : inet_iif(skb); ++ if (sk) ++ arg.bound_dev_if = sk->sk_bound_dev_if; + + net = dev_net(skb_dst(skb)->dev); + arg.tos = ip_hdr(skb)->tos; +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c +index 4a56574..ccab3c8 100644 +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -1048,7 +1048,8 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, + __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); + + fl6.flowi6_proto = IPPROTO_TCP; +- fl6.flowi6_oif = inet6_iif(skb); ++ if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL) ++ fl6.flowi6_oif = inet6_iif(skb); + fl6.fl6_dport = t1->dest; + fl6.fl6_sport = t1->source; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); +diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c +index 28a39bb..a582504 100644 +--- a/net/mac80211/wpa.c ++++ b/net/mac80211/wpa.c +@@ -106,7 +106,8 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx) + if (status->flag & RX_FLAG_MMIC_ERROR) + goto mic_fail; + +- if (!(status->flag & RX_FLAG_IV_STRIPPED) && rx->key) ++ if (!(status->flag & RX_FLAG_IV_STRIPPED) && rx->key && ++ rx->key->conf.cipher == WLAN_CIPHER_SUITE_TKIP) + goto update_iv; + + return RX_CONTINUE; +diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c +index 1d15193..7489bd3 100644 +--- a/net/netfilter/nf_conntrack_core.c ++++ b/net/netfilter/nf_conntrack_core.c +@@ -247,12 +247,15 @@ static void death_by_event(unsigned long ul_conntrack) + { + struct nf_conn *ct = (void *)ul_conntrack; + struct net *net = nf_ct_net(ct); ++ struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); ++ ++ BUG_ON(ecache == NULL); + + if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { + /* bad luck, let's retry again */ +- ct->timeout.expires = jiffies + ++ ecache->timeout.expires = jiffies + + (random32() % net->ct.sysctl_events_retry_timeout); +- add_timer(&ct->timeout); ++ add_timer(&ecache->timeout); + return; + } + /* we've got the event delivered, now it's dying */ +@@ -266,6 +269,9 @@ static void death_by_event(unsigned long ul_conntrack) + void nf_ct_insert_dying_list(struct nf_conn *ct) + { + struct net *net = nf_ct_net(ct); ++ struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); ++ ++ BUG_ON(ecache == NULL); + + /* add this conntrack to the dying list */ + spin_lock_bh(&nf_conntrack_lock); +@@ -273,10 +279,10 @@ void nf_ct_insert_dying_list(struct nf_conn *ct) + &net->ct.dying); + spin_unlock_bh(&nf_conntrack_lock); + /* set a new timer to retry event delivery */ +- setup_timer(&ct->timeout, death_by_event, (unsigned long)ct); +- ct->timeout.expires = jiffies + ++ setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); ++ ecache->timeout.expires = jiffies + + (random32() % net->ct.sysctl_events_retry_timeout); +- add_timer(&ct->timeout); ++ add_timer(&ecache->timeout); + } + EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); + +diff --git a/net/rds/send.c b/net/rds/send.c +index 96531d4..88eace5 100644 +--- a/net/rds/send.c ++++ b/net/rds/send.c +@@ -1122,7 +1122,7 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) + rds_stats_inc(s_send_pong); + + if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) +- rds_send_xmit(conn); ++ queue_delayed_work(rds_wq, &conn->c_send_w, 0); + + rds_message_put(rm); + return 0; +diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c +index 4530a91..237a2ee 100644 +--- a/net/sunrpc/cache.c ++++ b/net/sunrpc/cache.c +@@ -1404,11 +1404,11 @@ static ssize_t read_flush(struct file *file, char __user *buf, + size_t count, 
loff_t *ppos, + struct cache_detail *cd) + { +- char tbuf[20]; ++ char tbuf[22]; + unsigned long p = *ppos; + size_t len; + +- sprintf(tbuf, "%lu\n", convert_to_wallclock(cd->flush_time)); ++ snprintf(tbuf, sizeof(tbuf), "%lu\n", convert_to_wallclock(cd->flush_time)); + len = strlen(tbuf); + if (p >= len) + return 0; +diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c +index 10a385b..65fe23b 100644 +--- a/net/sunrpc/xprtsock.c ++++ b/net/sunrpc/xprtsock.c +@@ -254,7 +254,6 @@ struct sock_xprt { + void (*old_data_ready)(struct sock *, int); + void (*old_state_change)(struct sock *); + void (*old_write_space)(struct sock *); +- void (*old_error_report)(struct sock *); + }; + + /* +@@ -737,10 +736,10 @@ static int xs_tcp_send_request(struct rpc_task *task) + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); + case -ECONNRESET: +- case -EPIPE: + xs_tcp_shutdown(xprt); + case -ECONNREFUSED: + case -ENOTCONN: ++ case -EPIPE: + clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + } + +@@ -781,7 +780,6 @@ static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) + transport->old_data_ready = sk->sk_data_ready; + transport->old_state_change = sk->sk_state_change; + transport->old_write_space = sk->sk_write_space; +- transport->old_error_report = sk->sk_error_report; + } + + static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *sk) +@@ -789,7 +787,6 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s + sk->sk_data_ready = transport->old_data_ready; + sk->sk_state_change = transport->old_state_change; + sk->sk_write_space = transport->old_write_space; +- sk->sk_error_report = transport->old_error_report; + } + + static void xs_reset_transport(struct sock_xprt *transport) +@@ -1465,7 +1462,7 @@ static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) + xprt_clear_connecting(xprt); + } + +-static void xs_sock_mark_closed(struct rpc_xprt *xprt) ++static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) + { + smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); +@@ -1473,6 +1470,11 @@ static void xs_sock_mark_closed(struct rpc_xprt *xprt) + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + clear_bit(XPRT_CLOSING, &xprt->state); + smp_mb__after_clear_bit(); ++} ++ ++static void xs_sock_mark_closed(struct rpc_xprt *xprt) ++{ ++ xs_sock_reset_connection_flags(xprt); + /* Mark transport as closed and wake up all pending tasks */ + xprt_disconnect_done(xprt); + } +@@ -1528,6 +1530,7 @@ static void xs_tcp_state_change(struct sock *sk) + case TCP_CLOSE_WAIT: + /* The server initiated a shutdown of the socket */ + xprt->connect_cookie++; ++ clear_bit(XPRT_CONNECTED, &xprt->state); + xs_tcp_force_close(xprt); + case TCP_CLOSING: + /* +@@ -1552,25 +1555,6 @@ static void xs_tcp_state_change(struct sock *sk) + read_unlock_bh(&sk->sk_callback_lock); + } + +-/** +- * xs_error_report - callback mainly for catching socket errors +- * @sk: socket +- */ +-static void xs_error_report(struct sock *sk) +-{ +- struct rpc_xprt *xprt; +- +- read_lock_bh(&sk->sk_callback_lock); +- if (!(xprt = xprt_from_sock(sk))) +- goto out; +- dprintk("RPC: %s client %p...\n" +- "RPC: error %d\n", +- __func__, xprt, sk->sk_err); +- xprt_wake_pending_tasks(xprt, -EAGAIN); +-out: +- read_unlock_bh(&sk->sk_callback_lock); +-} +- + static void xs_write_space(struct sock *sk) + { + struct socket *sock; +@@ -1870,7 +1854,6 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, + sk->sk_user_data 
= xprt; + sk->sk_data_ready = xs_local_data_ready; + sk->sk_write_space = xs_udp_write_space; +- sk->sk_error_report = xs_error_report; + sk->sk_allocation = GFP_ATOMIC; + + xprt_clear_connected(xprt); +@@ -1959,7 +1942,6 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) + sk->sk_user_data = xprt; + sk->sk_data_ready = xs_udp_data_ready; + sk->sk_write_space = xs_udp_write_space; +- sk->sk_error_report = xs_error_report; + sk->sk_no_check = UDP_CSUM_NORCV; + sk->sk_allocation = GFP_ATOMIC; + +@@ -2027,10 +2009,8 @@ static void xs_abort_connection(struct sock_xprt *transport) + any.sa_family = AF_UNSPEC; + result = kernel_connect(transport->sock, &any, sizeof(any), 0); + if (!result) +- xs_sock_mark_closed(&transport->xprt); +- else +- dprintk("RPC: AF_UNSPEC connect return code %d\n", +- result); ++ xs_sock_reset_connection_flags(&transport->xprt); ++ dprintk("RPC: AF_UNSPEC connect return code %d\n", result); + } + + static void xs_tcp_reuse_connection(struct sock_xprt *transport) +@@ -2075,7 +2055,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) + sk->sk_data_ready = xs_tcp_data_ready; + sk->sk_state_change = xs_tcp_state_change; + sk->sk_write_space = xs_tcp_write_space; +- sk->sk_error_report = xs_error_report; + sk->sk_allocation = GFP_ATOMIC; + + /* socket options */ +@@ -2488,6 +2467,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { + static struct rpc_xprt_ops bc_tcp_ops = { + .reserve_xprt = xprt_reserve_xprt, + .release_xprt = xprt_release_xprt, ++ .alloc_slot = xprt_alloc_slot, + .buf_alloc = bc_malloc, + .buf_free = bc_free, + .send_request = bc_send_request, +diff --git a/sound/pci/ac97/ac97_codec.c b/sound/pci/ac97/ac97_codec.c +index fac51ee..1e7cfba 100644 +--- a/sound/pci/ac97/ac97_codec.c ++++ b/sound/pci/ac97/ac97_codec.c +@@ -1271,6 +1271,8 @@ static int snd_ac97_cvol_new(struct snd_card *card, char *name, int reg, unsigne + tmp.index = ac97->num; + kctl = snd_ctl_new1(&tmp, ac97); + } ++ if (!kctl) ++ return -ENOMEM; + if (reg >= AC97_PHONE && reg <= AC97_PCM) + set_tlv_db_scale(kctl, db_scale_5bit_12db_max); + else +diff --git a/sound/pci/emu10k1/emu10k1_main.c b/sound/pci/emu10k1/emu10k1_main.c +index 6a3e567..d37b946 100644 +--- a/sound/pci/emu10k1/emu10k1_main.c ++++ b/sound/pci/emu10k1/emu10k1_main.c +@@ -1416,6 +1416,15 @@ static struct snd_emu_chip_details emu_chip_details[] = { + .ca0108_chip = 1, + .spk71 = 1, + .emu_model = EMU_MODEL_EMU1010B}, /* EMU 1010 new revision */ ++ /* Tested by Maxim Kachur 17th Oct 2012. */ ++ /* This is MAEM8986, 0202 is MAEM8980 */ ++ {.vendor = 0x1102, .device = 0x0008, .subsystem = 0x40071102, ++ .driver = "Audigy2", .name = "E-mu 1010 PCIe [MAEM8986]", ++ .id = "EMU1010", ++ .emu10k2_chip = 1, ++ .ca0108_chip = 1, ++ .spk71 = 1, ++ .emu_model = EMU_MODEL_EMU1010B}, /* EMU 1010 PCIe */ + /* Tested by James@superbug.co.uk 8th July 2005. 
*/ + /* This is MAEM8810, 0202 is MAEM8820 */ + {.vendor = 0x1102, .device = 0x0004, .subsystem = 0x40011102, +diff --git a/sound/pci/hda/patch_cirrus.c b/sound/pci/hda/patch_cirrus.c +index ec0518e..e449278 100644 +--- a/sound/pci/hda/patch_cirrus.c ++++ b/sound/pci/hda/patch_cirrus.c +@@ -1404,7 +1404,7 @@ static int patch_cs420x(struct hda_codec *codec) + return 0; + + error: +- kfree(codec->spec); ++ cs_free(codec); + codec->spec = NULL; + return err; + } +@@ -1949,7 +1949,7 @@ static int patch_cs421x(struct hda_codec *codec) + return 0; + + error: +- kfree(codec->spec); ++ cs_free(codec); + codec->spec = NULL; + return err; + } +diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c +index 94f0c4a..58c287b 100644 +--- a/sound/pci/hda/patch_conexant.c ++++ b/sound/pci/hda/patch_conexant.c +@@ -4463,7 +4463,9 @@ static void apply_fixup(struct hda_codec *codec, + struct conexant_spec *spec = codec->spec; + + quirk = snd_pci_quirk_lookup(codec->bus->pci, quirk); +- if (quirk && table[quirk->value]) { ++ if (!quirk) ++ return; ++ if (table[quirk->value]) { + snd_printdd(KERN_INFO "hda_codec: applying pincfg for %s\n", + quirk->name); + apply_pincfg(codec, table[quirk->value]); +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index 32c8169..c2c7f90 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -620,6 +620,8 @@ static void alc_line_automute(struct hda_codec *codec) + { + struct alc_spec *spec = codec->spec; + ++ if (spec->autocfg.line_out_type == AUTO_PIN_SPEAKER_OUT) ++ return; + /* check LO jack only when it's different from HP */ + if (spec->autocfg.line_out_pins[0] == spec->autocfg.hp_pins[0]) + return; +@@ -2663,8 +2665,10 @@ static const char *alc_get_line_out_pfx(struct alc_spec *spec, int ch, + return "PCM"; + break; + } +- if (snd_BUG_ON(ch >= ARRAY_SIZE(channel_name))) ++ if (ch >= ARRAY_SIZE(channel_name)) { ++ snd_BUG(); + return "PCM"; ++ } + + return channel_name[ch]; + } +@@ -5080,6 +5084,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x17aa, 0x21e9, "Thinkpad Edge 15", ALC269_FIXUP_SKU_IGNORE), + SND_PCI_QUIRK(0x17aa, 0x21f6, "Thinkpad T530", ALC269_FIXUP_LENOVO_DOCK), + SND_PCI_QUIRK(0x17aa, 0x21fa, "Thinkpad X230", ALC269_FIXUP_LENOVO_DOCK), ++ SND_PCI_QUIRK(0x17aa, 0x21f3, "Thinkpad T430", ALC269_FIXUP_LENOVO_DOCK), + SND_PCI_QUIRK(0x17aa, 0x21fb, "Thinkpad T430s", ALC269_FIXUP_LENOVO_DOCK), + SND_PCI_QUIRK(0x17aa, 0x2203, "Thinkpad X230 Tablet", ALC269_FIXUP_LENOVO_DOCK), + SND_PCI_QUIRK(0x17aa, 0x3bf8, "Quanta FL1", ALC269_FIXUP_PCM_44K), +diff --git a/usr/gen_init_cpio.c b/usr/gen_init_cpio.c +index af0f22f..aca6edc 100644 +--- a/usr/gen_init_cpio.c ++++ b/usr/gen_init_cpio.c +@@ -303,7 +303,7 @@ static int cpio_mkfile(const char *name, const char *location, + int retval; + int rc = -1; + int namesize; +- int i; ++ unsigned int i; + + mode |= S_IFREG; + +@@ -381,25 +381,28 @@ error: + + static char *cpio_replace_env(char *new_location) + { +- char expanded[PATH_MAX + 1]; +- char env_var[PATH_MAX + 1]; +- char *start; +- char *end; +- +- for (start = NULL; (start = strstr(new_location, "${")); ) { +- end = strchr(start, '}'); +- if (start < end) { +- *env_var = *expanded = '\0'; +- strncat(env_var, start + 2, end - start - 2); +- strncat(expanded, new_location, start - new_location); +- strncat(expanded, getenv(env_var), PATH_MAX); +- strncat(expanded, end + 1, PATH_MAX); +- strncpy(new_location, expanded, PATH_MAX); +- } else +- break; +- } +- +- 
return new_location; ++ char expanded[PATH_MAX + 1]; ++ char env_var[PATH_MAX + 1]; ++ char *start; ++ char *end; ++ ++ for (start = NULL; (start = strstr(new_location, "${")); ) { ++ end = strchr(start, '}'); ++ if (start < end) { ++ *env_var = *expanded = '\0'; ++ strncat(env_var, start + 2, end - start - 2); ++ strncat(expanded, new_location, start - new_location); ++ strncat(expanded, getenv(env_var), ++ PATH_MAX - strlen(expanded)); ++ strncat(expanded, end + 1, ++ PATH_MAX - strlen(expanded)); ++ strncpy(new_location, expanded, PATH_MAX); ++ new_location[PATH_MAX] = 0; ++ } else ++ break; ++ } ++ ++ return new_location; + } + + diff --git a/3.2.34/bump/1033_linux-3.2.34.patch b/3.2.34/bump/1033_linux-3.2.34.patch new file mode 100644 index 0000000..d647b38 --- /dev/null +++ b/3.2.34/bump/1033_linux-3.2.34.patch @@ -0,0 +1,3678 @@ +diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt +index 3d84912..47c4ec2 100644 +--- a/Documentation/feature-removal-schedule.txt ++++ b/Documentation/feature-removal-schedule.txt +@@ -6,14 +6,6 @@ be removed from this file. + + --------------------------- + +-What: x86 floppy disable_hlt +-When: 2012 +-Why: ancient workaround of dubious utility clutters the +- code used by everybody else. +-Who: Len Brown +- +---------------------------- +- + What: CONFIG_APM_CPU_IDLE, and its ability to call APM BIOS in idle + When: 2012 + Why: This optional sub-feature of APM is of dubious reliability, +diff --git a/Makefile b/Makefile +index 63ca1ea2..14ebacf 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 2 +-SUBLEVEL = 33 ++SUBLEVEL = 34 + EXTRAVERSION = + NAME = Saber-toothed Squirrel + +diff --git a/arch/arm/mach-at91/at91rm9200_devices.c b/arch/arm/mach-at91/at91rm9200_devices.c +index 143eebb..929fd91 100644 +--- a/arch/arm/mach-at91/at91rm9200_devices.c ++++ b/arch/arm/mach-at91/at91rm9200_devices.c +@@ -462,7 +462,7 @@ static struct i2c_gpio_platform_data pdata = { + + static struct platform_device at91rm9200_twi_device = { + .name = "i2c-gpio", +- .id = -1, ++ .id = 0, + .dev.platform_data = &pdata, + }; + +diff --git a/arch/arm/mach-at91/at91sam9260_devices.c b/arch/arm/mach-at91/at91sam9260_devices.c +index 2590988..465e026 100644 +--- a/arch/arm/mach-at91/at91sam9260_devices.c ++++ b/arch/arm/mach-at91/at91sam9260_devices.c +@@ -467,7 +467,7 @@ static struct i2c_gpio_platform_data pdata = { + + static struct platform_device at91sam9260_twi_device = { + .name = "i2c-gpio", +- .id = -1, ++ .id = 0, + .dev.platform_data = &pdata, + }; + +diff --git a/arch/arm/mach-at91/at91sam9261_devices.c b/arch/arm/mach-at91/at91sam9261_devices.c +index daf3e66..d6d1e76 100644 +--- a/arch/arm/mach-at91/at91sam9261_devices.c ++++ b/arch/arm/mach-at91/at91sam9261_devices.c +@@ -284,7 +284,7 @@ static struct i2c_gpio_platform_data pdata = { + + static struct platform_device at91sam9261_twi_device = { + .name = "i2c-gpio", +- .id = -1, ++ .id = 0, + .dev.platform_data = &pdata, + }; + +diff --git a/arch/arm/mach-at91/at91sam9263_devices.c b/arch/arm/mach-at91/at91sam9263_devices.c +index 32a7e43..e051376e 100644 +--- a/arch/arm/mach-at91/at91sam9263_devices.c ++++ b/arch/arm/mach-at91/at91sam9263_devices.c +@@ -540,7 +540,7 @@ static struct i2c_gpio_platform_data pdata = { + + static struct platform_device at91sam9263_twi_device = { + .name = "i2c-gpio", +- .id = -1, ++ .id = 0, + .dev.platform_data = &pdata, + }; + +diff --git a/arch/arm/mach-at91/at91sam9rl_devices.c 
b/arch/arm/mach-at91/at91sam9rl_devices.c +index 628eb56..4862b23 100644 +--- a/arch/arm/mach-at91/at91sam9rl_devices.c ++++ b/arch/arm/mach-at91/at91sam9rl_devices.c +@@ -319,7 +319,7 @@ static struct i2c_gpio_platform_data pdata = { + + static struct platform_device at91sam9rl_twi_device = { + .name = "i2c-gpio", +- .id = -1, ++ .id = 0, + .dev.platform_data = &pdata, + }; + +diff --git a/arch/arm/mach-at91/setup.c b/arch/arm/mach-at91/setup.c +index f5bbe0ef..0d264bf 100644 +--- a/arch/arm/mach-at91/setup.c ++++ b/arch/arm/mach-at91/setup.c +@@ -163,7 +163,7 @@ static void __init soc_detect(u32 dbgu_base) + } + + /* at91sam9g10 */ +- if ((cidr & ~AT91_CIDR_EXT) == ARCH_ID_AT91SAM9G10) { ++ if ((socid & ~AT91_CIDR_EXT) == ARCH_ID_AT91SAM9G10) { + at91_soc_initdata.type = AT91_SOC_SAM9G10; + at91_boot_soc = at91sam9261_soc; + } +diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h +index 2d2f01c..d75adff 100644 +--- a/arch/x86/include/asm/system.h ++++ b/arch/x86/include/asm/system.h +@@ -93,10 +93,6 @@ do { \ + "memory"); \ + } while (0) + +-/* +- * disable hlt during certain critical i/o operations +- */ +-#define HAVE_DISABLE_HLT + #else + + /* frame pointer must be last for get_wchan */ +@@ -392,9 +388,6 @@ static inline void clflush(volatile void *__p) + + #define nop() asm volatile ("nop") + +-void disable_hlt(void); +-void enable_hlt(void); +- + void cpu_idle_wait(void); + + extern unsigned long arch_align_stack(unsigned long sp); +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index ee5d4fb..59b9b37 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -341,34 +341,10 @@ void (*pm_idle)(void); + EXPORT_SYMBOL(pm_idle); + #endif + +-#ifdef CONFIG_X86_32 +-/* +- * This halt magic was a workaround for ancient floppy DMA +- * wreckage. It should be safe to remove. +- */ +-static int hlt_counter; +-void disable_hlt(void) +-{ +- hlt_counter++; +-} +-EXPORT_SYMBOL(disable_hlt); +- +-void enable_hlt(void) +-{ +- hlt_counter--; +-} +-EXPORT_SYMBOL(enable_hlt); +- +-static inline int hlt_use_halt(void) +-{ +- return (!hlt_counter && boot_cpu_data.hlt_works_ok); +-} +-#else + static inline int hlt_use_halt(void) + { + return 1; + } +-#endif + + /* + * We use this if we don't have any better +diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c +index ec3d603..2b8b0de 100644 +--- a/arch/x86/xen/mmu.c ++++ b/arch/x86/xen/mmu.c +@@ -1203,6 +1203,25 @@ unsigned long xen_read_cr2_direct(void) + return percpu_read(xen_vcpu_info.arch.cr2); + } + ++void xen_flush_tlb_all(void) ++{ ++ struct mmuext_op *op; ++ struct multicall_space mcs; ++ ++ trace_xen_mmu_flush_tlb_all(0); ++ ++ preempt_disable(); ++ ++ mcs = xen_mc_entry(sizeof(*op)); ++ ++ op = mcs.args; ++ op->cmd = MMUEXT_TLB_FLUSH_ALL; ++ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); ++ ++ xen_mc_issue(PARAVIRT_LAZY_MMU); ++ ++ preempt_enable(); ++} + static void xen_flush_tlb(void) + { + struct mmuext_op *op; +@@ -2366,7 +2385,7 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, + err = 0; + out: + +- flush_tlb_all(); ++ xen_flush_tlb_all(); + + return err; + } +diff --git a/crypto/cryptd.c b/crypto/cryptd.c +index 671d4d6..7bdd61b 100644 +--- a/crypto/cryptd.c ++++ b/crypto/cryptd.c +@@ -137,13 +137,18 @@ static void cryptd_queue_worker(struct work_struct *work) + struct crypto_async_request *req, *backlog; + + cpu_queue = container_of(work, struct cryptd_cpu_queue, work); +- /* Only handle one request at a time to avoid hogging crypto +- * workqueue. 
preempt_disable/enable is used to prevent +- * being preempted by cryptd_enqueue_request() */ ++ /* ++ * Only handle one request at a time to avoid hogging crypto workqueue. ++ * preempt_disable/enable is used to prevent being preempted by ++ * cryptd_enqueue_request(). local_bh_disable/enable is used to prevent ++ * cryptd_enqueue_request() being accessed from software interrupts. ++ */ ++ local_bh_disable(); + preempt_disable(); + backlog = crypto_get_backlog(&cpu_queue->queue); + req = crypto_dequeue_request(&cpu_queue->queue); + preempt_enable(); ++ local_bh_enable(); + + if (!req) + return; +diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c +index c864add..7a90d4a 100644 +--- a/drivers/block/floppy.c ++++ b/drivers/block/floppy.c +@@ -1032,37 +1032,6 @@ static int fd_wait_for_completion(unsigned long delay, timeout_fn function) + return 0; + } + +-static DEFINE_SPINLOCK(floppy_hlt_lock); +-static int hlt_disabled; +-static void floppy_disable_hlt(void) +-{ +- unsigned long flags; +- +- WARN_ONCE(1, "floppy_disable_hlt() scheduled for removal in 2012"); +- spin_lock_irqsave(&floppy_hlt_lock, flags); +- if (!hlt_disabled) { +- hlt_disabled = 1; +-#ifdef HAVE_DISABLE_HLT +- disable_hlt(); +-#endif +- } +- spin_unlock_irqrestore(&floppy_hlt_lock, flags); +-} +- +-static void floppy_enable_hlt(void) +-{ +- unsigned long flags; +- +- spin_lock_irqsave(&floppy_hlt_lock, flags); +- if (hlt_disabled) { +- hlt_disabled = 0; +-#ifdef HAVE_DISABLE_HLT +- enable_hlt(); +-#endif +- } +- spin_unlock_irqrestore(&floppy_hlt_lock, flags); +-} +- + static void setup_DMA(void) + { + unsigned long f; +@@ -1107,7 +1076,6 @@ static void setup_DMA(void) + fd_enable_dma(); + release_dma_lock(f); + #endif +- floppy_disable_hlt(); + } + + static void show_floppy(void); +@@ -1709,7 +1677,6 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id) + fd_disable_dma(); + release_dma_lock(f); + +- floppy_enable_hlt(); + do_floppy = NULL; + if (fdc >= N_FDC || FDCS->address == -1) { + /* we don't even know which FDC is the culprit */ +@@ -1858,8 +1825,6 @@ static void floppy_shutdown(unsigned long data) + show_floppy(); + cancel_activity(); + +- floppy_enable_hlt(); +- + flags = claim_dma_lock(); + fd_disable_dma(); + release_dma_lock(flags); +@@ -4198,6 +4163,7 @@ static int __init floppy_init(void) + + disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock); + if (!disks[dr]->queue) { ++ put_disk(disks[dr]); + err = -ENOMEM; + goto out_put_disk; + } +@@ -4339,7 +4305,7 @@ static int __init floppy_init(void) + + err = platform_device_register(&floppy_device[drive]); + if (err) +- goto out_flush_work; ++ goto out_remove_drives; + + err = device_create_file(&floppy_device[drive].dev, + &dev_attr_cmos); +@@ -4357,6 +4323,15 @@ static int __init floppy_init(void) + + out_unreg_platform_dev: + platform_device_unregister(&floppy_device[drive]); ++out_remove_drives: ++ while (drive--) { ++ if ((allowed_drive_mask & (1 << drive)) && ++ fdc_state[FDC(drive)].version != FDC_NONE) { ++ del_gendisk(disks[drive]); ++ device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos); ++ platform_device_unregister(&floppy_device[drive]); ++ } ++ } + out_flush_work: + flush_work_sync(&floppy_work); + if (atomic_read(&usage_count)) +@@ -4510,7 +4485,6 @@ static void floppy_release_irq_and_dma(void) + #if N_FDC > 1 + set_dor(1, ~8, 0); + #endif +- floppy_enable_hlt(); + + if (floppy_track_buffer && max_buffer_sectors) { + tmpsize = max_buffer_sectors * 1024; +diff --git a/drivers/gpio/gpio-timberdale.c 
b/drivers/gpio/gpio-timberdale.c +index c593bd4..edff410 100644 +--- a/drivers/gpio/gpio-timberdale.c ++++ b/drivers/gpio/gpio-timberdale.c +@@ -116,7 +116,7 @@ static void timbgpio_irq_disable(struct irq_data *d) + unsigned long flags; + + spin_lock_irqsave(&tgpio->lock, flags); +- tgpio->last_ier &= ~(1 << offset); ++ tgpio->last_ier &= ~(1UL << offset); + iowrite32(tgpio->last_ier, tgpio->membase + TGPIO_IER); + spin_unlock_irqrestore(&tgpio->lock, flags); + } +@@ -128,7 +128,7 @@ static void timbgpio_irq_enable(struct irq_data *d) + unsigned long flags; + + spin_lock_irqsave(&tgpio->lock, flags); +- tgpio->last_ier |= 1 << offset; ++ tgpio->last_ier |= 1UL << offset; + iowrite32(tgpio->last_ier, tgpio->membase + TGPIO_IER); + spin_unlock_irqrestore(&tgpio->lock, flags); + } +diff --git a/drivers/gpu/drm/drm_fops.c b/drivers/gpu/drm/drm_fops.c +index 828bf65..020b103 100644 +--- a/drivers/gpu/drm/drm_fops.c ++++ b/drivers/gpu/drm/drm_fops.c +@@ -136,8 +136,11 @@ int drm_open(struct inode *inode, struct file *filp) + retcode = drm_open_helper(inode, filp, dev); + if (!retcode) { + atomic_inc(&dev->counts[_DRM_STAT_OPENS]); +- if (!dev->open_count++) ++ if (!dev->open_count++) { + retcode = drm_setup(dev); ++ if (retcode) ++ dev->open_count--; ++ } + } + if (!retcode) { + mutex_lock(&dev->struct_mutex); +diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h +index 83e820e..bcadf74 100644 +--- a/drivers/gpu/drm/i915/intel_drv.h ++++ b/drivers/gpu/drm/i915/intel_drv.h +@@ -227,12 +227,12 @@ struct dip_infoframe { + uint16_t bottom_bar_start; + uint16_t left_bar_end; + uint16_t right_bar_start; +- } avi; ++ } __attribute__ ((packed)) avi; + struct { + uint8_t vn[8]; + uint8_t pd[16]; + uint8_t sdi; +- } spd; ++ } __attribute__ ((packed)) spd; + uint8_t payload[27]; + } __attribute__ ((packed)) body; + } __attribute__((packed)); +diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c +index cdf17d4..478b51f 100644 +--- a/drivers/gpu/drm/i915/intel_overlay.c ++++ b/drivers/gpu/drm/i915/intel_overlay.c +@@ -428,9 +428,17 @@ static int intel_overlay_off(struct intel_overlay *overlay) + OUT_RING(flip_addr); + OUT_RING(MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP); + /* turn overlay off */ +- OUT_RING(MI_OVERLAY_FLIP | MI_OVERLAY_OFF); +- OUT_RING(flip_addr); +- OUT_RING(MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP); ++ if (IS_I830(dev)) { ++ /* Workaround: Don't disable the overlay fully, since otherwise ++ * it dies on the next OVERLAY_ON cmd. 
*/ ++ OUT_RING(MI_NOOP); ++ OUT_RING(MI_NOOP); ++ OUT_RING(MI_NOOP); ++ } else { ++ OUT_RING(MI_OVERLAY_FLIP | MI_OVERLAY_OFF); ++ OUT_RING(flip_addr); ++ OUT_RING(MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP); ++ } + ADVANCE_LP_RING(); + + return intel_overlay_do_wait_request(overlay, request, +diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c +index bbf247c..3f4afba 100644 +--- a/drivers/gpu/drm/i915/intel_sdvo.c ++++ b/drivers/gpu/drm/i915/intel_sdvo.c +@@ -868,31 +868,38 @@ static void intel_sdvo_dump_hdmi_buf(struct intel_sdvo *intel_sdvo) + } + #endif + +-static bool intel_sdvo_set_avi_infoframe(struct intel_sdvo *intel_sdvo) ++static bool intel_sdvo_write_infoframe(struct intel_sdvo *intel_sdvo, ++ unsigned if_index, uint8_t tx_rate, ++ uint8_t *data, unsigned length) + { +- struct dip_infoframe avi_if = { +- .type = DIP_TYPE_AVI, +- .ver = DIP_VERSION_AVI, +- .len = DIP_LEN_AVI, +- }; +- uint8_t tx_rate = SDVO_HBUF_TX_VSYNC; +- uint8_t set_buf_index[2] = { 1, 0 }; +- uint64_t *data = (uint64_t *)&avi_if; +- unsigned i; +- +- intel_dip_infoframe_csum(&avi_if); ++ uint8_t set_buf_index[2] = { if_index, 0 }; ++ uint8_t hbuf_size, tmp[8]; ++ int i; + + if (!intel_sdvo_set_value(intel_sdvo, + SDVO_CMD_SET_HBUF_INDEX, + set_buf_index, 2)) + return false; + +- for (i = 0; i < sizeof(avi_if); i += 8) { ++ if (!intel_sdvo_get_value(intel_sdvo, SDVO_CMD_GET_HBUF_INFO, ++ &hbuf_size, 1)) ++ return false; ++ ++ /* Buffer size is 0 based, hooray! */ ++ hbuf_size++; ++ ++ DRM_DEBUG_KMS("writing sdvo hbuf: %i, hbuf_size %i, hbuf_size: %i\n", ++ if_index, length, hbuf_size); ++ ++ for (i = 0; i < hbuf_size; i += 8) { ++ memset(tmp, 0, 8); ++ if (i < length) ++ memcpy(tmp, data + i, min_t(unsigned, 8, length - i)); ++ + if (!intel_sdvo_set_value(intel_sdvo, + SDVO_CMD_SET_HBUF_DATA, +- data, 8)) ++ tmp, 8)) + return false; +- data++; + } + + return intel_sdvo_set_value(intel_sdvo, +@@ -900,6 +907,28 @@ static bool intel_sdvo_set_avi_infoframe(struct intel_sdvo *intel_sdvo) + &tx_rate, 1); + } + ++static bool intel_sdvo_set_avi_infoframe(struct intel_sdvo *intel_sdvo) ++{ ++ struct dip_infoframe avi_if = { ++ .type = DIP_TYPE_AVI, ++ .ver = DIP_VERSION_AVI, ++ .len = DIP_LEN_AVI, ++ }; ++ uint8_t sdvo_data[4 + sizeof(avi_if.body.avi)]; ++ ++ intel_dip_infoframe_csum(&avi_if); ++ ++ /* sdvo spec says that the ecc is handled by the hw, and it looks like ++ * we must not send the ecc field, either. 
*/ ++ memcpy(sdvo_data, &avi_if, 3); ++ sdvo_data[3] = avi_if.checksum; ++ memcpy(&sdvo_data[4], &avi_if.body, sizeof(avi_if.body.avi)); ++ ++ return intel_sdvo_write_infoframe(intel_sdvo, SDVO_HBUF_INDEX_AVI_IF, ++ SDVO_HBUF_TX_VSYNC, ++ sdvo_data, sizeof(sdvo_data)); ++} ++ + static bool intel_sdvo_set_tv_format(struct intel_sdvo *intel_sdvo) + { + struct intel_sdvo_tv_format format; +diff --git a/drivers/gpu/drm/i915/intel_sdvo_regs.h b/drivers/gpu/drm/i915/intel_sdvo_regs.h +index 372f33b..4193c54 100644 +--- a/drivers/gpu/drm/i915/intel_sdvo_regs.h ++++ b/drivers/gpu/drm/i915/intel_sdvo_regs.h +@@ -708,6 +708,8 @@ struct intel_sdvo_enhancements_arg { + #define SDVO_CMD_SET_AUDIO_STAT 0x91 + #define SDVO_CMD_GET_AUDIO_STAT 0x92 + #define SDVO_CMD_SET_HBUF_INDEX 0x93 ++ #define SDVO_HBUF_INDEX_ELD 0 ++ #define SDVO_HBUF_INDEX_AVI_IF 1 + #define SDVO_CMD_GET_HBUF_INDEX 0x94 + #define SDVO_CMD_GET_HBUF_INFO 0x95 + #define SDVO_CMD_SET_HBUF_AV_SPLIT 0x96 +diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c b/drivers/gpu/drm/nouveau/nouveau_drv.c +index 9791d13..8c084c0 100644 +--- a/drivers/gpu/drm/nouveau/nouveau_drv.c ++++ b/drivers/gpu/drm/nouveau/nouveau_drv.c +@@ -178,8 +178,10 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state) + if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) + return 0; + +- NV_INFO(dev, "Disabling fbcon acceleration...\n"); +- nouveau_fbcon_save_disable_accel(dev); ++ if (dev->mode_config.num_crtc) { ++ NV_INFO(dev, "Disabling fbcon acceleration...\n"); ++ nouveau_fbcon_save_disable_accel(dev); ++ } + + NV_INFO(dev, "Unpinning framebuffer(s)...\n"); + list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { +@@ -246,10 +248,12 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state) + pci_set_power_state(pdev, PCI_D3hot); + } + +- console_lock(); +- nouveau_fbcon_set_suspend(dev, 1); +- console_unlock(); +- nouveau_fbcon_restore_accel(dev); ++ if (dev->mode_config.num_crtc) { ++ console_lock(); ++ nouveau_fbcon_set_suspend(dev, 1); ++ console_unlock(); ++ nouveau_fbcon_restore_accel(dev); ++ } + return 0; + + out_abort: +@@ -275,7 +279,8 @@ nouveau_pci_resume(struct pci_dev *pdev) + if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) + return 0; + +- nouveau_fbcon_save_disable_accel(dev); ++ if (dev->mode_config.num_crtc) ++ nouveau_fbcon_save_disable_accel(dev); + + NV_INFO(dev, "We're back, enabling device...\n"); + pci_set_power_state(pdev, PCI_D0); +@@ -376,15 +381,18 @@ nouveau_pci_resume(struct pci_dev *pdev) + nv_crtc->lut.depth = 0; + } + +- console_lock(); +- nouveau_fbcon_set_suspend(dev, 0); +- console_unlock(); ++ if (dev->mode_config.num_crtc) { ++ console_lock(); ++ nouveau_fbcon_set_suspend(dev, 0); ++ console_unlock(); + +- nouveau_fbcon_zfill_all(dev); ++ nouveau_fbcon_zfill_all(dev); ++ } + + drm_helper_resume_force_mode(dev); + +- nouveau_fbcon_restore_accel(dev); ++ if (dev->mode_config.num_crtc) ++ nouveau_fbcon_restore_accel(dev); + return 0; + } + +@@ -466,9 +474,7 @@ static int __init nouveau_init(void) + #ifdef CONFIG_VGA_CONSOLE + if (vgacon_text_force()) + nouveau_modeset = 0; +- else + #endif +- nouveau_modeset = 1; + } + + if (!nouveau_modeset) +diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c b/drivers/gpu/drm/nouveau/nouveau_state.c +index d8831ab..01adcfb 100644 +--- a/drivers/gpu/drm/nouveau/nouveau_state.c ++++ b/drivers/gpu/drm/nouveau/nouveau_state.c +@@ -46,6 +46,7 @@ static int nouveau_init_engine_ptrs(struct drm_device *dev) + { + struct drm_nouveau_private *dev_priv = 
dev->dev_private; + struct nouveau_engine *engine = &dev_priv->engine; ++ u32 pclass = dev->pdev->class >> 8; + + switch (dev_priv->chipset & 0xf0) { + case 0x00: +@@ -481,7 +482,8 @@ static int nouveau_init_engine_ptrs(struct drm_device *dev) + } + + /* headless mode */ +- if (nouveau_modeset == 2) { ++ if (nouveau_modeset == 2 || ++ (nouveau_modeset < 0 && pclass != PCI_CLASS_DISPLAY_VGA)) { + engine->display.early_init = nouveau_stub_init; + engine->display.late_takedown = nouveau_stub_takedown; + engine->display.create = nouveau_stub_init; +diff --git a/drivers/gpu/drm/nouveau/nv04_dac.c b/drivers/gpu/drm/nouveau/nv04_dac.c +index e000455..2d6bfd0 100644 +--- a/drivers/gpu/drm/nouveau/nv04_dac.c ++++ b/drivers/gpu/drm/nouveau/nv04_dac.c +@@ -209,7 +209,7 @@ out: + NVWriteVgaCrtc(dev, 0, NV_CIO_CR_MODE_INDEX, saved_cr_mode); + + if (blue == 0x18) { +- NV_INFO(dev, "Load detected on head A\n"); ++ NV_DEBUG(dev, "Load detected on head A\n"); + return connector_status_connected; + } + +@@ -323,7 +323,7 @@ nv17_dac_detect(struct drm_encoder *encoder, struct drm_connector *connector) + + if (nv17_dac_sample_load(encoder) & + NV_PRAMDAC_TEST_CONTROL_SENSEB_ALLHI) { +- NV_INFO(dev, "Load detected on output %c\n", ++ NV_DEBUG(dev, "Load detected on output %c\n", + '@' + ffs(dcb->or)); + return connector_status_connected; + } else { +@@ -398,7 +398,7 @@ static void nv04_dac_commit(struct drm_encoder *encoder) + + helper->dpms(encoder, DRM_MODE_DPMS_ON); + +- NV_INFO(dev, "Output %s is running on CRTC %d using output %c\n", ++ NV_DEBUG(dev, "Output %s is running on CRTC %d using output %c\n", + drm_get_connector_name(&nouveau_encoder_connector_get(nv_encoder)->base), + nv_crtc->index, '@' + ffs(nv_encoder->dcb->or)); + } +@@ -447,7 +447,7 @@ static void nv04_dac_dpms(struct drm_encoder *encoder, int mode) + return; + nv_encoder->last_dpms = mode; + +- NV_INFO(dev, "Setting dpms mode %d on vga encoder (output %d)\n", ++ NV_DEBUG(dev, "Setting dpms mode %d on vga encoder (output %d)\n", + mode, nv_encoder->dcb->index); + + nv04_dac_update_dacclk(encoder, mode == DRM_MODE_DPMS_ON); +diff --git a/drivers/gpu/drm/nouveau/nv04_dfp.c b/drivers/gpu/drm/nouveau/nv04_dfp.c +index 12098bf..752440c 100644 +--- a/drivers/gpu/drm/nouveau/nv04_dfp.c ++++ b/drivers/gpu/drm/nouveau/nv04_dfp.c +@@ -468,7 +468,7 @@ static void nv04_dfp_commit(struct drm_encoder *encoder) + + helper->dpms(encoder, DRM_MODE_DPMS_ON); + +- NV_INFO(dev, "Output %s is running on CRTC %d using output %c\n", ++ NV_DEBUG(dev, "Output %s is running on CRTC %d using output %c\n", + drm_get_connector_name(&nouveau_encoder_connector_get(nv_encoder)->base), + nv_crtc->index, '@' + ffs(nv_encoder->dcb->or)); + } +@@ -511,7 +511,7 @@ static void nv04_lvds_dpms(struct drm_encoder *encoder, int mode) + return; + nv_encoder->last_dpms = mode; + +- NV_INFO(dev, "Setting dpms mode %d on lvds encoder (output %d)\n", ++ NV_DEBUG(dev, "Setting dpms mode %d on lvds encoder (output %d)\n", + mode, nv_encoder->dcb->index); + + if (was_powersaving && is_powersaving_dpms(mode)) +@@ -556,7 +556,7 @@ static void nv04_tmds_dpms(struct drm_encoder *encoder, int mode) + return; + nv_encoder->last_dpms = mode; + +- NV_INFO(dev, "Setting dpms mode %d on tmds encoder (output %d)\n", ++ NV_DEBUG(dev, "Setting dpms mode %d on tmds encoder (output %d)\n", + mode, nv_encoder->dcb->index); + + nv04_dfp_update_backlight(encoder, mode); +diff --git a/drivers/gpu/drm/nouveau/nv04_tv.c b/drivers/gpu/drm/nouveau/nv04_tv.c +index 3eb605d..4de1fbe 100644 +--- 
a/drivers/gpu/drm/nouveau/nv04_tv.c ++++ b/drivers/gpu/drm/nouveau/nv04_tv.c +@@ -69,7 +69,7 @@ static void nv04_tv_dpms(struct drm_encoder *encoder, int mode) + struct nv04_mode_state *state = &dev_priv->mode_reg; + uint8_t crtc1A; + +- NV_INFO(dev, "Setting dpms mode %d on TV encoder (output %d)\n", ++ NV_DEBUG(dev, "Setting dpms mode %d on TV encoder (output %d)\n", + mode, nv_encoder->dcb->index); + + state->pllsel &= ~(PLLSEL_TV_CRTC1_MASK | PLLSEL_TV_CRTC2_MASK); +@@ -162,7 +162,7 @@ static void nv04_tv_commit(struct drm_encoder *encoder) + + helper->dpms(encoder, DRM_MODE_DPMS_ON); + +- NV_INFO(dev, "Output %s is running on CRTC %d using output %c\n", ++ NV_DEBUG(dev, "Output %s is running on CRTC %d using output %c\n", + drm_get_connector_name(&nouveau_encoder_connector_get(nv_encoder)->base), nv_crtc->index, + '@' + ffs(nv_encoder->dcb->or)); + } +diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c +index b61f490..ca94e23 100644 +--- a/drivers/gpu/drm/radeon/evergreen.c ++++ b/drivers/gpu/drm/radeon/evergreen.c +@@ -1164,7 +1164,7 @@ void evergreen_mc_resume(struct radeon_device *rdev, struct evergreen_mc_save *s + WREG32(BIF_FB_EN, FB_READ_EN | FB_WRITE_EN); + + for (i = 0; i < rdev->num_crtc; i++) { +- if (save->crtc_enabled) { ++ if (save->crtc_enabled[i]) { + tmp = RREG32(EVERGREEN_CRTC_CONTROL + crtc_offsets[i]); + tmp &= ~EVERGREEN_CRTC_DISP_READ_REQUEST_DISABLE; + WREG32(EVERGREEN_CRTC_CONTROL + crtc_offsets[i], tmp); +diff --git a/drivers/gpu/drm/radeon/radeon_legacy_encoders.c b/drivers/gpu/drm/radeon/radeon_legacy_encoders.c +index 3ad3cc6..8165953 100644 +--- a/drivers/gpu/drm/radeon/radeon_legacy_encoders.c ++++ b/drivers/gpu/drm/radeon/radeon_legacy_encoders.c +@@ -650,6 +650,7 @@ static enum drm_connector_status radeon_legacy_primary_dac_detect(struct drm_enc + tmp |= RADEON_DAC_RANGE_CNTL_PS2 | RADEON_DAC_CMP_EN; + WREG32(RADEON_DAC_CNTL, tmp); + ++ tmp = dac_macro_cntl; + tmp &= ~(RADEON_DAC_PDWN_R | + RADEON_DAC_PDWN_G | + RADEON_DAC_PDWN_B); +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_dmabuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_dmabuf.c +index 3fa884d..27151f7 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_dmabuf.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_dmabuf.c +@@ -306,7 +306,7 @@ void vmw_bo_pin(struct ttm_buffer_object *bo, bool pin) + + BUG_ON(!atomic_read(&bo->reserved)); + BUG_ON(old_mem_type != TTM_PL_VRAM && +- old_mem_type != VMW_PL_FLAG_GMR); ++ old_mem_type != VMW_PL_GMR); + + pl_flags = TTM_PL_FLAG_VRAM | VMW_PL_FLAG_GMR | TTM_PL_FLAG_CACHED; + if (pin) +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +index 033fc96..b639536 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +@@ -1048,6 +1048,11 @@ static void vmw_pm_complete(struct device *kdev) + struct drm_device *dev = pci_get_drvdata(pdev); + struct vmw_private *dev_priv = vmw_priv(dev); + ++ mutex_lock(&dev_priv->hw_mutex); ++ vmw_write(dev_priv, SVGA_REG_ID, SVGA_ID_2); ++ (void) vmw_read(dev_priv, SVGA_REG_ID); ++ mutex_unlock(&dev_priv->hw_mutex); ++ + /** + * Reclaim 3d reference held by fbdev and potentially + * start fifo. 
+diff --git a/drivers/hid/hid-microsoft.c b/drivers/hid/hid-microsoft.c +index e5c699b..3899989 100644 +--- a/drivers/hid/hid-microsoft.c ++++ b/drivers/hid/hid-microsoft.c +@@ -29,22 +29,30 @@ + #define MS_RDESC 0x08 + #define MS_NOGET 0x10 + #define MS_DUPLICATE_USAGES 0x20 ++#define MS_RDESC_3K 0x40 + +-/* +- * Microsoft Wireless Desktop Receiver (Model 1028) has +- * 'Usage Min/Max' where it ought to have 'Physical Min/Max' +- */ + static __u8 *ms_report_fixup(struct hid_device *hdev, __u8 *rdesc, + unsigned int *rsize) + { + unsigned long quirks = (unsigned long)hid_get_drvdata(hdev); + ++ /* ++ * Microsoft Wireless Desktop Receiver (Model 1028) has ++ * 'Usage Min/Max' where it ought to have 'Physical Min/Max' ++ */ + if ((quirks & MS_RDESC) && *rsize == 571 && rdesc[557] == 0x19 && + rdesc[559] == 0x29) { + hid_info(hdev, "fixing up Microsoft Wireless Receiver Model 1028 report descriptor\n"); + rdesc[557] = 0x35; + rdesc[559] = 0x45; + } ++ /* the same as above (s/usage/physical/) */ ++ if ((quirks & MS_RDESC_3K) && *rsize == 106 && ++ !memcmp((char []){ 0x19, 0x00, 0x29, 0xff }, ++ &rdesc[94], 4)) { ++ rdesc[94] = 0x35; ++ rdesc[96] = 0x45; ++ } + return rdesc; + } + +@@ -193,7 +201,7 @@ static const struct hid_device_id ms_devices[] = { + { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_PRESENTER_8K_USB), + .driver_data = MS_PRESENTER }, + { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_DIGITAL_MEDIA_3K), +- .driver_data = MS_ERGONOMY }, ++ .driver_data = MS_ERGONOMY | MS_RDESC_3K }, + { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_WIRELESS_OPTICAL_DESKTOP_3_0), + .driver_data = MS_NOGET }, + { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_COMFORT_MOUSE_4500), +diff --git a/drivers/hwmon/w83627ehf.c b/drivers/hwmon/w83627ehf.c +index ceaec92..b6a3ce3 100644 +--- a/drivers/hwmon/w83627ehf.c ++++ b/drivers/hwmon/w83627ehf.c +@@ -2015,6 +2015,7 @@ static int __devinit w83627ehf_probe(struct platform_device *pdev) + mutex_init(&data->lock); + mutex_init(&data->update_lock); + data->name = w83627ehf_device_names[sio_data->kind]; ++ data->bank = 0xff; /* Force initial bank selection */ + platform_set_drvdata(pdev, data); + + /* 627EHG and 627EHF have 10 voltage inputs; 627DHG and 667HG have 9 */ +diff --git a/drivers/input/touchscreen/tsc40.c b/drivers/input/touchscreen/tsc40.c +index 29d5ed4..80d4610 100644 +--- a/drivers/input/touchscreen/tsc40.c ++++ b/drivers/input/touchscreen/tsc40.c +@@ -107,7 +107,6 @@ static int tsc_connect(struct serio *serio, struct serio_driver *drv) + __set_bit(BTN_TOUCH, input_dev->keybit); + input_set_abs_params(ptsc->dev, ABS_X, 0, 0x3ff, 0, 0); + input_set_abs_params(ptsc->dev, ABS_Y, 0, 0x3ff, 0, 0); +- input_set_abs_params(ptsc->dev, ABS_PRESSURE, 0, 0, 0, 0); + + serio_set_drvdata(serio, ptsc); + +diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c +index 11ddd838..69fc888 100644 +--- a/drivers/net/ethernet/marvell/sky2.c ++++ b/drivers/net/ethernet/marvell/sky2.c +@@ -3060,8 +3060,10 @@ static irqreturn_t sky2_intr(int irq, void *dev_id) + + /* Reading this mask interrupts as side effect */ + status = sky2_read32(hw, B0_Y2_SP_ISRC2); +- if (status == 0 || status == ~0) ++ if (status == 0 || status == ~0) { ++ sky2_write32(hw, B0_Y2_SP_ICR, 2); + return IRQ_NONE; ++ } + + prefetch(&hw->st_le[hw->st_idx]); + +diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c +index 4b43bc5..b8db4cd 100644 +--- a/drivers/net/ethernet/realtek/r8169.c ++++ 
b/drivers/net/ethernet/realtek/r8169.c +@@ -77,7 +77,7 @@ static const int multicast_filter_limit = 32; + #define MAC_ADDR_LEN 6 + + #define MAX_READ_REQUEST_SHIFT 12 +-#define TX_DMA_BURST 6 /* Maximum PCI burst, '6' is 1024 */ ++#define TX_DMA_BURST 7 /* Maximum PCI burst, '7' is unlimited */ + #define SafeMtu 0x1c20 /* ... actually life sucks beyond ~7k */ + #define InterFrameGap 0x03 /* 3 means InterFrameGap = the shortest one */ + +@@ -3521,6 +3521,8 @@ static void rtl_wol_suspend_quirk(struct rtl8169_private *tp) + void __iomem *ioaddr = tp->mmio_addr; + + switch (tp->mac_version) { ++ case RTL_GIGA_MAC_VER_25: ++ case RTL_GIGA_MAC_VER_26: + case RTL_GIGA_MAC_VER_29: + case RTL_GIGA_MAC_VER_30: + case RTL_GIGA_MAC_VER_32: +@@ -6064,6 +6066,9 @@ static void rtl_set_rx_mode(struct net_device *dev) + mc_filter[1] = swab32(data); + } + ++ if (tp->mac_version == RTL_GIGA_MAC_VER_35) ++ mc_filter[1] = mc_filter[0] = 0xffffffff; ++ + RTL_W32(MAR0 + 4, mc_filter[1]); + RTL_W32(MAR0 + 0, mc_filter[0]); + +diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c +index b873b5d..dc53a8f 100644 +--- a/drivers/net/usb/usbnet.c ++++ b/drivers/net/usb/usbnet.c +@@ -1156,6 +1156,7 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb, + usb_anchor_urb(urb, &dev->deferred); + /* no use to process more packets */ + netif_stop_queue(net); ++ usb_put_urb(urb); + spin_unlock_irqrestore(&dev->txq.lock, flags); + netdev_dbg(dev->net, "Delaying transmission for resumption\n"); + goto deferred; +@@ -1297,6 +1298,8 @@ void usbnet_disconnect (struct usb_interface *intf) + + cancel_work_sync(&dev->kevent); + ++ usb_scuttle_anchored_urbs(&dev->deferred); ++ + if (dev->driver_info->unbind) + dev->driver_info->unbind (dev, intf); + +diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c +index c59c592..18da100 100644 +--- a/drivers/net/wireless/ath/ath9k/xmit.c ++++ b/drivers/net/wireless/ath/ath9k/xmit.c +@@ -288,6 +288,7 @@ static struct ath_buf *ath_tx_get_buffer(struct ath_softc *sc) + } + + bf = list_first_entry(&sc->tx.txbuf, struct ath_buf, list); ++ bf->bf_next = NULL; + list_del(&bf->list); + + spin_unlock_bh(&sc->tx.txbuflock); +@@ -369,7 +370,7 @@ static void ath_tx_complete_aggr(struct ath_softc *sc, struct ath_txq *txq, + u16 seq_st = 0, acked_cnt = 0, txfail_cnt = 0; + u32 ba[WME_BA_BMP_SIZE >> 5]; + int isaggr, txfail, txpending, sendbar = 0, needreset = 0, nbad = 0; +- bool rc_update = true; ++ bool rc_update = true, isba; + struct ieee80211_tx_rate rates[4]; + struct ath_frame_info *fi; + int nframes; +@@ -407,13 +408,17 @@ static void ath_tx_complete_aggr(struct ath_softc *sc, struct ath_txq *txq, + an = (struct ath_node *)sta->drv_priv; + tidno = ieee80211_get_qos_ctl(hdr)[0] & IEEE80211_QOS_CTL_TID_MASK; + tid = ATH_AN_2_TID(an, tidno); ++ isba = ts->ts_flags & ATH9K_TX_BA; + + /* + * The hardware occasionally sends a tx status for the wrong TID. 
+ * In this case, the BA status cannot be considered valid and all + * subframes need to be retransmitted ++ * ++ * Only BlockAcks have a TID and therefore normal Acks cannot be ++ * checked + */ +- if (tidno != ts->tid) ++ if (isba && tidno != ts->tid) + txok = false; + + isaggr = bf_isaggr(bf); +@@ -1710,6 +1715,7 @@ static void ath_tx_send_normal(struct ath_softc *sc, struct ath_txq *txq, + if (tid) + INCR(tid->seq_start, IEEE80211_SEQ_MAX); + ++ bf->bf_next = NULL; + bf->bf_lastbf = bf; + ath_tx_fill_desc(sc, bf, txq, fi->framelen); + ath_tx_txqaddbuf(sc, txq, &bf_head, false); +diff --git a/drivers/net/wireless/rt2x00/rt2800lib.c b/drivers/net/wireless/rt2x00/rt2800lib.c +index 1ba079d..fb19447 100644 +--- a/drivers/net/wireless/rt2x00/rt2800lib.c ++++ b/drivers/net/wireless/rt2x00/rt2800lib.c +@@ -2141,7 +2141,7 @@ static int rt2800_get_gain_calibration_delta(struct rt2x00_dev *rt2x00dev) + /* + * Check if temperature compensation is supported. + */ +- if (tssi_bounds[4] == 0xff) ++ if (tssi_bounds[4] == 0xff || step == 0xff) + return 0; + + /* +diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c +index f35cb10..6fa7222 100644 +--- a/drivers/target/iscsi/iscsi_target.c ++++ b/drivers/target/iscsi/iscsi_target.c +@@ -3523,7 +3523,9 @@ restart: + */ + iscsit_thread_check_cpumask(conn, current, 1); + +- schedule_timeout_interruptible(MAX_SCHEDULE_TIMEOUT); ++ wait_event_interruptible(conn->queues_wq, ++ !iscsit_conn_all_queues_empty(conn) || ++ ts->status == ISCSI_THREAD_SET_RESET); + + if ((ts->status == ISCSI_THREAD_SET_RESET) || + signal_pending(current)) +diff --git a/drivers/target/iscsi/iscsi_target_core.h b/drivers/target/iscsi/iscsi_target_core.h +index dae283f..bd8ce01 100644 +--- a/drivers/target/iscsi/iscsi_target_core.h ++++ b/drivers/target/iscsi/iscsi_target_core.h +@@ -491,6 +491,7 @@ struct iscsi_tmr_req { + }; + + struct iscsi_conn { ++ wait_queue_head_t queues_wq; + /* Authentication Successful for this connection */ + u8 auth_complete; + /* State connection is currently in */ +diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c +index 2ec5339..eb0c9fe 100644 +--- a/drivers/target/iscsi/iscsi_target_login.c ++++ b/drivers/target/iscsi/iscsi_target_login.c +@@ -44,6 +44,7 @@ extern spinlock_t sess_idr_lock; + + static int iscsi_login_init_conn(struct iscsi_conn *conn) + { ++ init_waitqueue_head(&conn->queues_wq); + INIT_LIST_HEAD(&conn->conn_list); + INIT_LIST_HEAD(&conn->conn_cmd_list); + INIT_LIST_HEAD(&conn->immed_queue_list); +diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c +index 99f2af3..e612722 100644 +--- a/drivers/target/iscsi/iscsi_target_util.c ++++ b/drivers/target/iscsi/iscsi_target_util.c +@@ -659,7 +659,7 @@ void iscsit_add_cmd_to_immediate_queue( + atomic_set(&conn->check_immediate_queue, 1); + spin_unlock_bh(&conn->immed_queue_lock); + +- wake_up_process(conn->thread_set->tx_thread); ++ wake_up(&conn->queues_wq); + } + + struct iscsi_queue_req *iscsit_get_cmd_from_immediate_queue(struct iscsi_conn *conn) +@@ -733,7 +733,7 @@ void iscsit_add_cmd_to_response_queue( + atomic_inc(&cmd->response_queue_count); + spin_unlock_bh(&conn->response_queue_lock); + +- wake_up_process(conn->thread_set->tx_thread); ++ wake_up(&conn->queues_wq); + } + + struct iscsi_queue_req *iscsit_get_cmd_from_response_queue(struct iscsi_conn *conn) +@@ -787,6 +787,24 @@ static void iscsit_remove_cmd_from_response_queue( + } + } + ++bool 
iscsit_conn_all_queues_empty(struct iscsi_conn *conn) ++{ ++ bool empty; ++ ++ spin_lock_bh(&conn->immed_queue_lock); ++ empty = list_empty(&conn->immed_queue_list); ++ spin_unlock_bh(&conn->immed_queue_lock); ++ ++ if (!empty) ++ return empty; ++ ++ spin_lock_bh(&conn->response_queue_lock); ++ empty = list_empty(&conn->response_queue_list); ++ spin_unlock_bh(&conn->response_queue_lock); ++ ++ return empty; ++} ++ + void iscsit_free_queue_reqs_for_conn(struct iscsi_conn *conn) + { + struct iscsi_queue_req *qr, *qr_tmp; +diff --git a/drivers/target/iscsi/iscsi_target_util.h b/drivers/target/iscsi/iscsi_target_util.h +index 835bf7d..cfac698 100644 +--- a/drivers/target/iscsi/iscsi_target_util.h ++++ b/drivers/target/iscsi/iscsi_target_util.h +@@ -28,6 +28,7 @@ extern struct iscsi_queue_req *iscsit_get_cmd_from_immediate_queue(struct iscsi_ + extern void iscsit_add_cmd_to_response_queue(struct iscsi_cmd *, struct iscsi_conn *, u8); + extern struct iscsi_queue_req *iscsit_get_cmd_from_response_queue(struct iscsi_conn *); + extern void iscsit_remove_cmd_from_tx_queues(struct iscsi_cmd *, struct iscsi_conn *); ++extern bool iscsit_conn_all_queues_empty(struct iscsi_conn *); + extern void iscsit_free_queue_reqs_for_conn(struct iscsi_conn *); + extern void iscsit_release_cmd(struct iscsi_cmd *); + extern void iscsit_free_cmd(struct iscsi_cmd *); +diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c +index 0b01bfc..013b133 100644 +--- a/drivers/target/target_core_configfs.c ++++ b/drivers/target/target_core_configfs.c +@@ -3205,7 +3205,8 @@ static int __init target_core_init_configfs(void) + if (ret < 0) + goto out; + +- if (core_dev_setup_virtual_lun0() < 0) ++ ret = core_dev_setup_virtual_lun0(); ++ if (ret < 0) + goto out; + + return 0; +diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c +index f8773ae..a0143a0 100644 +--- a/drivers/target/target_core_device.c ++++ b/drivers/target/target_core_device.c +@@ -835,20 +835,20 @@ int se_dev_check_shutdown(struct se_device *dev) + + u32 se_dev_align_max_sectors(u32 max_sectors, u32 block_size) + { +- u32 tmp, aligned_max_sectors; ++ u32 aligned_max_sectors; ++ u32 alignment; + /* + * Limit max_sectors to a PAGE_SIZE aligned value for modern + * transport_allocate_data_tasks() operation. 
+ */ +- tmp = rounddown((max_sectors * block_size), PAGE_SIZE); +- aligned_max_sectors = (tmp / block_size); +- if (max_sectors != aligned_max_sectors) { +- printk(KERN_INFO "Rounding down aligned max_sectors from %u" +- " to %u\n", max_sectors, aligned_max_sectors); +- return aligned_max_sectors; +- } ++ alignment = max(1ul, PAGE_SIZE / block_size); ++ aligned_max_sectors = rounddown(max_sectors, alignment); ++ ++ if (max_sectors != aligned_max_sectors) ++ pr_info("Rounding down aligned max_sectors from %u to %u\n", ++ max_sectors, aligned_max_sectors); + +- return max_sectors; ++ return aligned_max_sectors; + } + + void se_dev_set_default_attribs( +diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c +index d481f80..43a38aa 100644 +--- a/drivers/usb/serial/mos7840.c ++++ b/drivers/usb/serial/mos7840.c +@@ -2585,7 +2585,6 @@ error: + static void mos7840_disconnect(struct usb_serial *serial) + { + int i; +- unsigned long flags; + struct moschip_port *mos7840_port; + dbg("%s", " disconnect :entering.........."); + +diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c +index 625890c..080b186 100644 +--- a/drivers/xen/gntdev.c ++++ b/drivers/xen/gntdev.c +@@ -105,6 +105,21 @@ static void gntdev_print_maps(struct gntdev_priv *priv, + #endif + } + ++static void gntdev_free_map(struct grant_map *map) ++{ ++ if (map == NULL) ++ return; ++ ++ if (map->pages) ++ free_xenballooned_pages(map->count, map->pages); ++ kfree(map->pages); ++ kfree(map->grants); ++ kfree(map->map_ops); ++ kfree(map->unmap_ops); ++ kfree(map->kmap_ops); ++ kfree(map); ++} ++ + static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) + { + struct grant_map *add; +@@ -142,12 +157,7 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) + return add; + + err: +- kfree(add->pages); +- kfree(add->grants); +- kfree(add->map_ops); +- kfree(add->unmap_ops); +- kfree(add->kmap_ops); +- kfree(add); ++ gntdev_free_map(add); + return NULL; + } + +@@ -196,17 +206,9 @@ static void gntdev_put_map(struct grant_map *map) + if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) + notify_remote_via_evtchn(map->notify.event); + +- if (map->pages) { +- if (!use_ptemod) +- unmap_grant_pages(map, 0, map->count); +- +- free_xenballooned_pages(map->count, map->pages); +- } +- kfree(map->pages); +- kfree(map->grants); +- kfree(map->map_ops); +- kfree(map->unmap_ops); +- kfree(map); ++ if (map->pages && !use_ptemod) ++ unmap_grant_pages(map, 0, map->count); ++ gntdev_free_map(map); + } + + /* ------------------------------------------------------------------ */ +diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c +index 72ddf23..b3522af 100644 +--- a/fs/cifs/cifsacl.c ++++ b/fs/cifs/cifsacl.c +@@ -225,6 +225,13 @@ sid_to_str(struct cifs_sid *sidptr, char *sidstr) + } + + static void ++cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) ++{ ++ memcpy(dst, src, sizeof(*dst)); ++ dst->num_subauth = min_t(u8, src->num_subauth, NUM_SUBAUTHS); ++} ++ ++static void + id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, + struct cifs_sid_id **psidid, char *typestr) + { +@@ -248,7 +255,7 @@ id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, + } + } + +- memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid)); ++ cifs_copy_sid(&(*psidid)->sid, sidptr); + (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); + (*psidid)->refcount = 0; + +@@ -354,7 +361,7 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) + * any fields of the node after a reference is put 
. + */ + if (test_bit(SID_ID_MAPPED, &psidid->state)) { +- memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); ++ cifs_copy_sid(ssid, &psidid->sid); + psidid->time = jiffies; /* update ts for accessing */ + goto id_sid_out; + } +@@ -370,14 +377,14 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) + if (IS_ERR(sidkey)) { + rc = -EINVAL; + cFYI(1, "%s: Can't map and id to a SID", __func__); ++ } else if (sidkey->datalen < sizeof(struct cifs_sid)) { ++ rc = -EIO; ++ cFYI(1, "%s: Downcall contained malformed key " ++ "(datalen=%hu)", __func__, sidkey->datalen); + } else { + lsid = (struct cifs_sid *)sidkey->payload.data; +- memcpy(&psidid->sid, lsid, +- sidkey->datalen < sizeof(struct cifs_sid) ? +- sidkey->datalen : sizeof(struct cifs_sid)); +- memcpy(ssid, &psidid->sid, +- sidkey->datalen < sizeof(struct cifs_sid) ? +- sidkey->datalen : sizeof(struct cifs_sid)); ++ cifs_copy_sid(&psidid->sid, lsid); ++ cifs_copy_sid(ssid, &psidid->sid); + set_bit(SID_ID_MAPPED, &psidid->state); + key_put(sidkey); + kfree(psidid->sidstr); +@@ -396,7 +403,7 @@ id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) + return rc; + } + if (test_bit(SID_ID_MAPPED, &psidid->state)) +- memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); ++ cifs_copy_sid(ssid, &psidid->sid); + else + rc = -EINVAL; + } +@@ -674,8 +681,6 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) + static void copy_sec_desc(const struct cifs_ntsd *pntsd, + struct cifs_ntsd *pnntsd, __u32 sidsoffset) + { +- int i; +- + struct cifs_sid *owner_sid_ptr, *group_sid_ptr; + struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; + +@@ -691,26 +696,14 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd, + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); +- +- nowner_sid_ptr->revision = owner_sid_ptr->revision; +- nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth; +- for (i = 0; i < 6; i++) +- nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i]; +- for (i = 0; i < 5; i++) +- nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i]; ++ cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr); + + /* copy group sid */ + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); + ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + + sizeof(struct cifs_sid)); +- +- ngroup_sid_ptr->revision = group_sid_ptr->revision; +- ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth; +- for (i = 0; i < 6; i++) +- ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i]; +- for (i = 0; i < 5; i++) +- ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i]; ++ cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr); + + return; + } +@@ -1117,8 +1110,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, + kfree(nowner_sid_ptr); + return rc; + } +- memcpy(owner_sid_ptr, nowner_sid_ptr, +- sizeof(struct cifs_sid)); ++ cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr); + kfree(nowner_sid_ptr); + *aclflag = CIFS_ACL_OWNER; + } +@@ -1136,8 +1128,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, + kfree(ngroup_sid_ptr); + return rc; + } +- memcpy(group_sid_ptr, ngroup_sid_ptr, +- sizeof(struct cifs_sid)); ++ cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr); + kfree(ngroup_sid_ptr); + *aclflag = CIFS_ACL_GROUP; + } +diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c +index 1cfef9f..94afdfd 100644 +--- 
a/fs/ecryptfs/main.c ++++ b/fs/ecryptfs/main.c +@@ -280,6 +280,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, + char *fnek_src; + char *cipher_key_bytes_src; + char *fn_cipher_key_bytes_src; ++ u8 cipher_code; + + *check_ruid = 0; + +@@ -421,6 +422,18 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, + && !fn_cipher_key_bytes_set) + mount_crypt_stat->global_default_fn_cipher_key_bytes = + mount_crypt_stat->global_default_cipher_key_size; ++ ++ cipher_code = ecryptfs_code_for_cipher_string( ++ mount_crypt_stat->global_default_cipher_name, ++ mount_crypt_stat->global_default_cipher_key_size); ++ if (!cipher_code) { ++ ecryptfs_printk(KERN_ERR, ++ "eCryptfs doesn't support cipher: %s", ++ mount_crypt_stat->global_default_cipher_name); ++ rc = -EINVAL; ++ goto out; ++ } ++ + mutex_lock(&key_tfm_list_mutex); + if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, + NULL)) { +@@ -506,7 +519,6 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags + goto out; + } + +- s->s_flags = flags; + rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); + if (rc) + goto out1; +@@ -542,6 +554,15 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags + } + + ecryptfs_set_superblock_lower(s, path.dentry->d_sb); ++ ++ /** ++ * Set the POSIX ACL flag based on whether they're enabled in the lower ++ * mount. Force a read-only eCryptfs mount if the lower mount is ro. ++ * Allow a ro eCryptfs mount even when the lower mount is rw. ++ */ ++ s->s_flags = flags & ~MS_POSIXACL; ++ s->s_flags |= path.dentry->d_sb->s_flags & (MS_RDONLY | MS_POSIXACL); ++ + s->s_maxbytes = path.dentry->d_sb->s_maxbytes; + s->s_blocksize = path.dentry->d_sb->s_blocksize; + s->s_magic = ECRYPTFS_SUPER_MAGIC; +diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c +index a6e711a..ee02db5 100644 +--- a/fs/nfs/dns_resolve.c ++++ b/fs/nfs/dns_resolve.c +@@ -213,7 +213,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) + { + char buf1[NFS_DNS_HOSTNAME_MAXLEN+1]; + struct nfs_dns_ent key, *item; +- unsigned long ttl; ++ unsigned int ttl; + ssize_t len; + int ret = -EINVAL; + +@@ -236,7 +236,8 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) + key.namelen = len; + memset(&key.h, 0, sizeof(key.h)); + +- ttl = get_expiry(&buf); ++ if (get_uint(&buf, &ttl) < 0) ++ goto out; + if (ttl == 0) + goto out; + key.h.expiry_time = ttl + seconds_since_boot(); +diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h +index 68b3f20..c5af878 100644 +--- a/fs/nfs/internal.h ++++ b/fs/nfs/internal.h +@@ -274,8 +274,9 @@ extern void nfs_sb_active(struct super_block *sb); + extern void nfs_sb_deactive(struct super_block *sb); + + /* namespace.c */ ++#define NFS_PATH_CANONICAL 1 + extern char *nfs_path(char **p, struct dentry *dentry, +- char *buffer, ssize_t buflen); ++ char *buffer, ssize_t buflen, unsigned flags); + extern struct vfsmount *nfs_d_automount(struct path *path); + #ifdef CONFIG_NFS_V4 + rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); +@@ -364,7 +365,7 @@ static inline char *nfs_devname(struct dentry *dentry, + char *buffer, ssize_t buflen) + { + char *dummy; +- return nfs_path(&dummy, dentry, buffer, buflen); ++ return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL); + } + + /* +diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c +index d4c2d6b..3d93216 100644 +--- a/fs/nfs/mount_clnt.c ++++ 
b/fs/nfs/mount_clnt.c +@@ -181,7 +181,7 @@ int nfs_mount(struct nfs_mount_request *info) + else + msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT]; + +- status = rpc_call_sync(mnt_clnt, &msg, 0); ++ status = rpc_call_sync(mnt_clnt, &msg, RPC_TASK_SOFT|RPC_TASK_TIMEOUT); + rpc_shutdown_client(mnt_clnt); + + if (status < 0) +diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c +index 8102391..a86873e 100644 +--- a/fs/nfs/namespace.c ++++ b/fs/nfs/namespace.c +@@ -37,6 +37,7 @@ static struct vfsmount *nfs_do_submount(struct dentry *dentry, + * @dentry - pointer to dentry + * @buffer - result buffer + * @buflen - length of buffer ++ * @flags - options (see below) + * + * Helper function for constructing the server pathname + * by arbitrary hashed dentry. +@@ -44,8 +45,14 @@ static struct vfsmount *nfs_do_submount(struct dentry *dentry, + * This is mainly for use in figuring out the path on the + * server side when automounting on top of an existing partition + * and in generating /proc/mounts and friends. ++ * ++ * Supported flags: ++ * NFS_PATH_CANONICAL: ensure there is exactly one slash after ++ * the original device (export) name ++ * (if unset, the original name is returned verbatim) + */ +-char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen) ++char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen, ++ unsigned flags) + { + char *end; + int namelen; +@@ -78,7 +85,7 @@ rename_retry: + rcu_read_unlock(); + goto rename_retry; + } +- if (*end != '/') { ++ if ((flags & NFS_PATH_CANONICAL) && *end != '/') { + if (--buflen < 0) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); +@@ -95,9 +102,11 @@ rename_retry: + return end; + } + namelen = strlen(base); +- /* Strip off excess slashes in base string */ +- while (namelen > 0 && base[namelen - 1] == '/') +- namelen--; ++ if (flags & NFS_PATH_CANONICAL) { ++ /* Strip off excess slashes in base string */ ++ while (namelen > 0 && base[namelen - 1] == '/') ++ namelen--; ++ } + buflen -= namelen; + if (buflen < 0) { + spin_unlock(&dentry->d_lock); +diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c +index bb80c49..96f2b67 100644 +--- a/fs/nfs/nfs4namespace.c ++++ b/fs/nfs/nfs4namespace.c +@@ -57,7 +57,8 @@ Elong: + static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen) + { + char *limit; +- char *path = nfs_path(&limit, dentry, buffer, buflen); ++ char *path = nfs_path(&limit, dentry, buffer, buflen, ++ NFS_PATH_CANONICAL); + if (!IS_ERR(path)) { + char *colon = strchr(path, ':'); + if (colon && colon < limit) +diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c +index 61796a40..864b831 100644 +--- a/fs/nfs/nfs4proc.c ++++ b/fs/nfs/nfs4proc.c +@@ -303,8 +303,7 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc + dprintk("%s ERROR: %d Reset session\n", __func__, + errorcode); + nfs4_schedule_session_recovery(clp->cl_session); +- exception->retry = 1; +- break; ++ goto wait_on_recovery; + #endif /* defined(CONFIG_NFS_V4_1) */ + case -NFS4ERR_FILE_OPEN: + if (exception->timeout > HZ) { +@@ -1464,9 +1463,11 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) + data->timestamp = jiffies; + if (nfs4_setup_sequence(data->o_arg.server, + &data->o_arg.seq_args, +- &data->o_res.seq_res, 1, task)) +- return; +- rpc_call_start(task); ++ &data->o_res.seq_res, ++ 1, task) != 0) ++ nfs_release_seqid(data->o_arg.seqid); ++ else ++ rpc_call_start(task); + return; + unlock_no_action: + rcu_read_unlock(); +@@ -2046,9 +2047,10 @@ 
static void nfs4_close_prepare(struct rpc_task *task, void *data) + calldata->timestamp = jiffies; + if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), + &calldata->arg.seq_args, &calldata->res.seq_res, +- 1, task)) +- return; +- rpc_call_start(task); ++ 1, task) != 0) ++ nfs_release_seqid(calldata->arg.seqid); ++ else ++ rpc_call_start(task); + } + + static const struct rpc_call_ops nfs4_close_ops = { +@@ -4148,6 +4150,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) + if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } ++ nfs_release_seqid(calldata->arg.seqid); + } + + static void nfs4_locku_prepare(struct rpc_task *task, void *data) +@@ -4164,9 +4167,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) + calldata->timestamp = jiffies; + if (nfs4_setup_sequence(calldata->server, + &calldata->arg.seq_args, +- &calldata->res.seq_res, 1, task)) +- return; +- rpc_call_start(task); ++ &calldata->res.seq_res, ++ 1, task) != 0) ++ nfs_release_seqid(calldata->arg.seqid); ++ else ++ rpc_call_start(task); + } + + static const struct rpc_call_ops nfs4_locku_ops = { +@@ -4310,7 +4315,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) + /* Do we need to do an open_to_lock_owner? */ + if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { + if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) +- return; ++ goto out_release_lock_seqid; + data->arg.open_stateid = &state->stateid; + data->arg.new_lock_owner = 1; + data->res.open_seqid = data->arg.open_seqid; +@@ -4319,10 +4324,15 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) + data->timestamp = jiffies; + if (nfs4_setup_sequence(data->server, + &data->arg.seq_args, +- &data->res.seq_res, 1, task)) ++ &data->res.seq_res, ++ 1, task) == 0) { ++ rpc_call_start(task); + return; +- rpc_call_start(task); +- dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); ++ } ++ nfs_release_seqid(data->arg.open_seqid); ++out_release_lock_seqid: ++ nfs_release_seqid(data->arg.lock_seqid); ++ dprintk("%s: done!, ret = %d\n", __func__, task->tk_status); + } + + static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata) +diff --git a/fs/nfs/super.c b/fs/nfs/super.c +index e42d6f6..8150344 100644 +--- a/fs/nfs/super.c ++++ b/fs/nfs/super.c +@@ -768,7 +768,7 @@ static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) + int err = 0; + if (!page) + return -ENOMEM; +- devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE); ++ devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE, 0); + if (IS_ERR(devname)) + err = PTR_ERR(devname); + else +diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c +index 5f312ab..a0205fc 100644 +--- a/fs/nfsd/export.c ++++ b/fs/nfsd/export.c +@@ -401,7 +401,7 @@ fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) + int migrated, i, err; + + /* listsize */ +- err = get_int(mesg, &fsloc->locations_count); ++ err = get_uint(mesg, &fsloc->locations_count); + if (err) + return err; + if (fsloc->locations_count > MAX_FS_LOCATIONS) +@@ -459,7 +459,7 @@ static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) + return -EINVAL; + + for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { +- err = get_int(mesg, &f->pseudoflavor); ++ err = get_uint(mesg, &f->pseudoflavor); + if (err) + return err; + /* +@@ -468,7 +468,7 @@ static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) + * problem at export 
time instead of when a client fails + * to authenticate. + */ +- err = get_int(mesg, &f->flags); ++ err = get_uint(mesg, &f->flags); + if (err) + return err; + /* Only some flags are allowed to differ between flavors: */ +diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c +index f35794b..a506360 100644 +--- a/fs/notify/fanotify/fanotify.c ++++ b/fs/notify/fanotify/fanotify.c +@@ -21,6 +21,7 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) + if ((old->path.mnt == new->path.mnt) && + (old->path.dentry == new->path.dentry)) + return true; ++ break; + case (FSNOTIFY_EVENT_NONE): + return true; + default: +diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c +index 4f5d0ce..86ca506 100644 +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -3514,7 +3514,7 @@ xlog_do_recovery_pass( + * - order is important. + */ + error = xlog_bread_offset(log, 0, +- bblks - split_bblks, hbp, ++ bblks - split_bblks, dbp, + offset + BBTOB(split_bblks)); + if (error) + goto bread_err2; +diff --git a/include/linux/if_link.h b/include/linux/if_link.h +index c52d4b5..4b24ff4 100644 +--- a/include/linux/if_link.h ++++ b/include/linux/if_link.h +@@ -137,6 +137,7 @@ enum { + IFLA_AF_SPEC, + IFLA_GROUP, /* Group the device belongs to */ + IFLA_NET_NS_FD, ++ IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ + __IFLA_MAX + }; + +diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h +index 8e872ea..577592e 100644 +--- a/include/linux/rtnetlink.h ++++ b/include/linux/rtnetlink.h +@@ -602,6 +602,9 @@ struct tcamsg { + #define TCA_ACT_TAB 1 /* attr type must be >=1 */ + #define TCAA_MAX 1 + ++/* New extended info filters for IFLA_EXT_MASK */ ++#define RTEXT_FILTER_VF (1 << 0) ++ + /* End of information exported to user level */ + + #ifdef __KERNEL__ +diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h +index 5efd8ce..f0c6ab5 100644 +--- a/include/linux/sunrpc/cache.h ++++ b/include/linux/sunrpc/cache.h +@@ -224,6 +224,22 @@ static inline int get_int(char **bpp, int *anint) + return 0; + } + ++static inline int get_uint(char **bpp, unsigned int *anint) ++{ ++ char buf[50]; ++ int len = qword_get(bpp, buf, sizeof(buf)); ++ ++ if (len < 0) ++ return -EINVAL; ++ if (len == 0) ++ return -ENOENT; ++ ++ if (kstrtouint(buf, 0, anint)) ++ return -EINVAL; ++ ++ return 0; ++} ++ + /* + * timestamps kept in the cache are expressed in seconds + * since boot. This is the best for measuring differences in +diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h +index 95852e3..19d632d 100644 +--- a/include/net/cfg80211.h ++++ b/include/net/cfg80211.h +@@ -2431,6 +2431,15 @@ unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb); + unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc); + + /** ++ * ieee80211_get_mesh_hdrlen - get mesh extension header length ++ * @meshhdr: the mesh extension header, only the flags field ++ * (first byte) will be accessed ++ * Returns the length of the extension header, which is always at ++ * least 6 bytes and at most 18 if address 5 and 6 are present. 
++ */ ++unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr); ++ ++/** + * DOC: Data path helpers + * + * In addition to generic utilities, cfg80211 also offers +diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h +index 678f1ff..3702939 100644 +--- a/include/net/rtnetlink.h ++++ b/include/net/rtnetlink.h +@@ -6,7 +6,7 @@ + + typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, void *); + typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); +-typedef u16 (*rtnl_calcit_func)(struct sk_buff *); ++typedef u16 (*rtnl_calcit_func)(struct sk_buff *, struct nlmsghdr *); + + extern int __rtnl_register(int protocol, int msgtype, + rtnl_doit_func, rtnl_dumpit_func, +diff --git a/include/sound/core.h b/include/sound/core.h +index 3be5ab7..222f11e 100644 +--- a/include/sound/core.h ++++ b/include/sound/core.h +@@ -132,6 +132,7 @@ struct snd_card { + int shutdown; /* this card is going down */ + int free_on_last_close; /* free in context of file_release */ + wait_queue_head_t shutdown_sleep; ++ atomic_t refcount; /* refcount for disconnection */ + struct device *dev; /* device assigned to this card */ + struct device *card_dev; /* cardX object for sysfs */ + +@@ -189,6 +190,7 @@ struct snd_minor { + const struct file_operations *f_ops; /* file operations */ + void *private_data; /* private data for f_ops->open */ + struct device *dev; /* device for sysfs */ ++ struct snd_card *card_ptr; /* assigned card instance */ + }; + + /* return a device pointer linked to each sound device as a parent */ +@@ -295,6 +297,7 @@ int snd_card_info_done(void); + int snd_component_add(struct snd_card *card, const char *component); + int snd_card_file_add(struct snd_card *card, struct file *file); + int snd_card_file_remove(struct snd_card *card, struct file *file); ++void snd_card_unref(struct snd_card *card); + + #define snd_card_set_dev(card, devptr) ((card)->dev = (devptr)) + +diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h +index 92f1a79..348c4fe 100644 +--- a/include/trace/events/xen.h ++++ b/include/trace/events/xen.h +@@ -377,6 +377,14 @@ DECLARE_EVENT_CLASS(xen_mmu_pgd, + DEFINE_XEN_MMU_PGD_EVENT(xen_mmu_pgd_pin); + DEFINE_XEN_MMU_PGD_EVENT(xen_mmu_pgd_unpin); + ++TRACE_EVENT(xen_mmu_flush_tlb_all, ++ TP_PROTO(int x), ++ TP_ARGS(x), ++ TP_STRUCT__entry(__array(char, x, 0)), ++ TP_fast_assign((void)x), ++ TP_printk("%s", "") ++ ); ++ + TRACE_EVENT(xen_mmu_flush_tlb, + TP_PROTO(int x), + TP_ARGS(x), +diff --git a/kernel/module.c b/kernel/module.c +index 6c8fa34..65362d9 100644 +--- a/kernel/module.c ++++ b/kernel/module.c +@@ -2193,15 +2193,17 @@ static void layout_symtab(struct module *mod, struct load_info *info) + + src = (void *)info->hdr + symsect->sh_offset; + nsrc = symsect->sh_size / sizeof(*src); +- for (ndst = i = 1; i < nsrc; ++i, ++src) +- if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { +- unsigned int j = src->st_name; ++ for (ndst = i = 0; i < nsrc; i++) { ++ if (i == 0 || ++ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { ++ unsigned int j = src[i].st_name; + + while (!__test_and_set_bit(j, info->strmap) + && info->strtab[j]) + ++j; + ++ndst; + } ++ } + + /* Append room for core symbols at end of core part. 
*/ + info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); +@@ -2238,14 +2240,14 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) + + mod->core_symtab = dst = mod->module_core + info->symoffs; + src = mod->symtab; +- *dst = *src; +- for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { +- if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) +- continue; +- dst[ndst] = *src; +- dst[ndst].st_name = bitmap_weight(info->strmap, +- dst[ndst].st_name); +- ++ndst; ++ for (ndst = i = 0; i < mod->num_symtab; i++) { ++ if (i == 0 || ++ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { ++ dst[ndst] = src[i]; ++ dst[ndst].st_name = bitmap_weight(info->strmap, ++ dst[ndst].st_name); ++ ++ndst; ++ } + } + mod->core_num_syms = ndst; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 86eb848..313381c 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -3015,6 +3015,8 @@ static int kswapd(void *p) + &balanced_classzone_idx); + } + } ++ ++ current->reclaim_state = NULL; + return 0; + } + +diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c +index 1fb1aec..aa12649 100644 +--- a/net/bluetooth/hci_conn.c ++++ b/net/bluetooth/hci_conn.c +@@ -642,8 +642,10 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) + { + BT_DBG("conn %p", conn); + ++#ifdef CONFIG_BT_L2CAP + if (conn->type == LE_LINK) + return smp_conn_security(conn, sec_level); ++#endif + + /* For sdp we don't need the link key. */ + if (sec_level == BT_SECURITY_SDP) +diff --git a/net/core/dev.c b/net/core/dev.c +index f500a69..480be72 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -1633,7 +1633,7 @@ static inline int deliver_skb(struct sk_buff *skb, + + static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) + { +- if (ptype->af_packet_priv == NULL) ++ if (!ptype->af_packet_priv || !skb->sk) + return false; + + if (ptype->id_match) +diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c +index 0cf604b..5229c7f 100644 +--- a/net/core/rtnetlink.c ++++ b/net/core/rtnetlink.c +@@ -60,7 +60,6 @@ struct rtnl_link { + }; + + static DEFINE_MUTEX(rtnl_mutex); +-static u16 min_ifinfo_dump_size; + + void rtnl_lock(void) + { +@@ -727,10 +726,11 @@ static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) + } + + /* All VF info */ +-static inline int rtnl_vfinfo_size(const struct net_device *dev) ++static inline int rtnl_vfinfo_size(const struct net_device *dev, ++ u32 ext_filter_mask) + { +- if (dev->dev.parent && dev_is_pci(dev->dev.parent)) { +- ++ if (dev->dev.parent && dev_is_pci(dev->dev.parent) && ++ (ext_filter_mask & RTEXT_FILTER_VF)) { + int num_vfs = dev_num_vf(dev->dev.parent); + size_t size = nla_total_size(sizeof(struct nlattr)); + size += nla_total_size(num_vfs * sizeof(struct nlattr)); +@@ -769,7 +769,8 @@ static size_t rtnl_port_size(const struct net_device *dev) + return port_self_size; + } + +-static noinline size_t if_nlmsg_size(const struct net_device *dev) ++static noinline size_t if_nlmsg_size(const struct net_device *dev, ++ u32 ext_filter_mask) + { + return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ +@@ -787,8 +788,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev) + + nla_total_size(4) /* IFLA_MASTER */ + + nla_total_size(1) /* IFLA_OPERSTATE */ + + nla_total_size(1) /* IFLA_LINKMODE */ +- + nla_total_size(4) /* IFLA_NUM_VF */ +- + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ ++ + nla_total_size(ext_filter_mask ++ & 
RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ ++ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ +@@ -871,7 +873,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) + + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, + int type, u32 pid, u32 seq, u32 change, +- unsigned int flags) ++ unsigned int flags, u32 ext_filter_mask) + { + struct ifinfomsg *ifm; + struct nlmsghdr *nlh; +@@ -944,10 +946,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, + goto nla_put_failure; + copy_rtnl_link_stats64(nla_data(attr), stats); + +- if (dev->dev.parent) ++ if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) + NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); + +- if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { ++ if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent ++ && (ext_filter_mask & RTEXT_FILTER_VF)) { + int i; + + struct nlattr *vfinfo, *vf; +@@ -1051,6 +1054,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) + struct net_device *dev; + struct hlist_head *head; + struct hlist_node *node; ++ struct nlattr *tb[IFLA_MAX+1]; ++ u32 ext_filter_mask = 0; + + s_h = cb->args[0]; + s_idx = cb->args[1]; +@@ -1058,6 +1063,13 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) + rcu_read_lock(); + cb->seq = net->dev_base_seq; + ++ if (nlmsg_parse(cb->nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, ++ ifla_policy) >= 0) { ++ ++ if (tb[IFLA_EXT_MASK]) ++ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); ++ } ++ + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; +@@ -1067,7 +1079,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) + if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, 0, +- NLM_F_MULTI) <= 0) ++ NLM_F_MULTI, ++ ext_filter_mask) <= 0) + goto out; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +@@ -1103,6 +1116,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { + [IFLA_VF_PORTS] = { .type = NLA_NESTED }, + [IFLA_PORT_SELF] = { .type = NLA_NESTED }, + [IFLA_AF_SPEC] = { .type = NLA_NESTED }, ++ [IFLA_EXT_MASK] = { .type = NLA_U32 }, + }; + EXPORT_SYMBOL(ifla_policy); + +@@ -1845,6 +1859,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) + struct net_device *dev = NULL; + struct sk_buff *nskb; + int err; ++ u32 ext_filter_mask = 0; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + if (err < 0) +@@ -1853,6 +1868,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + ++ if (tb[IFLA_EXT_MASK]) ++ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); ++ + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); +@@ -1864,12 +1882,12 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) + if (dev == NULL) + return -ENODEV; + +- nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); ++ nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); + if (nskb == NULL) + return -ENOBUFS; + + err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, +- nlh->nlmsg_seq, 0, 0); ++ nlh->nlmsg_seq, 0, 0, 
ext_filter_mask); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_size */ + WARN_ON(err == -EMSGSIZE); +@@ -1880,8 +1898,32 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) + return err; + } + +-static u16 rtnl_calcit(struct sk_buff *skb) ++static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) + { ++ struct net *net = sock_net(skb->sk); ++ struct net_device *dev; ++ struct nlattr *tb[IFLA_MAX+1]; ++ u32 ext_filter_mask = 0; ++ u16 min_ifinfo_dump_size = 0; ++ ++ if (nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, ++ ifla_policy) >= 0) { ++ if (tb[IFLA_EXT_MASK]) ++ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); ++ } ++ ++ if (!ext_filter_mask) ++ return NLMSG_GOODSIZE; ++ /* ++ * traverse the list of net devices and compute the minimum ++ * buffer size based upon the filter mask. ++ */ ++ list_for_each_entry(dev, &net->dev_base_head, dev_list) { ++ min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size, ++ if_nlmsg_size(dev, ++ ext_filter_mask)); ++ } ++ + return min_ifinfo_dump_size; + } + +@@ -1916,13 +1958,11 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) + int err = -ENOBUFS; + size_t if_info_size; + +- skb = nlmsg_new((if_info_size = if_nlmsg_size(dev)), GFP_KERNEL); ++ skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL); + if (skb == NULL) + goto errout; + +- min_ifinfo_dump_size = max_t(u16, if_info_size, min_ifinfo_dump_size); +- +- err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0); ++ err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); +@@ -1980,7 +2020,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) + return -EOPNOTSUPP; + calcit = rtnl_get_calcit(family, type); + if (calcit) +- min_dump_alloc = calcit(skb); ++ min_dump_alloc = calcit(skb, nlh); + + __rtnl_unlock(); + rtnl = net->rtnl; +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 7397ad8..52edbb8 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -481,14 +481,12 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) + !tp->urg_data || + before(tp->urg_seq, tp->copied_seq) || + !before(tp->urg_seq, tp->rcv_nxt)) { +- struct sk_buff *skb; + + answ = tp->rcv_nxt - tp->copied_seq; + +- /* Subtract 1, if FIN is in queue. 
*/ +- skb = skb_peek_tail(&sk->sk_receive_queue); +- if (answ && skb) +- answ -= tcp_hdr(skb)->fin; ++ /* Subtract 1, if FIN was received */ ++ if (answ && sock_flag(sk, SOCK_DONE)) ++ answ--; + } else + answ = tp->urg_seq - tp->copied_seq; + release_sock(sk); +diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c +index 813b43a..834857f 100644 +--- a/net/ipv4/tcp_illinois.c ++++ b/net/ipv4/tcp_illinois.c +@@ -313,11 +313,13 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, + .tcpv_rttcnt = ca->cnt_rtt, + .tcpv_minrtt = ca->base_rtt, + }; +- u64 t = ca->sum_rtt; + +- do_div(t, ca->cnt_rtt); +- info.tcpv_rtt = t; ++ if (info.tcpv_rttcnt > 0) { ++ u64 t = ca->sum_rtt; + ++ do_div(t, info.tcpv_rttcnt); ++ info.tcpv_rtt = t; ++ } + nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + } + } +diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c +index 0cb78d7..9ffc37f 100644 +--- a/net/ipv6/ndisc.c ++++ b/net/ipv6/ndisc.c +@@ -606,7 +606,7 @@ static void ndisc_send_unsol_na(struct net_device *dev) + { + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; +- struct in6_addr mcaddr; ++ struct in6_addr mcaddr = IN6ADDR_LINKLOCAL_ALLNODES_INIT; + + idev = in6_dev_get(dev); + if (!idev) +@@ -614,7 +614,6 @@ static void ndisc_send_unsol_na(struct net_device *dev) + + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { +- addrconf_addr_solict_mult(&ifa->addr, &mcaddr); + ndisc_send_na(dev, NULL, &mcaddr, &ifa->addr, + /*router=*/ !!idev->cnf.forwarding, + /*solicited=*/ false, /*override=*/ true, +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index 488a1b7..19724bd 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -185,7 +185,7 @@ static struct dst_ops ip6_dst_blackhole_ops = { + }; + + static const u32 ip6_template_metrics[RTAX_MAX] = { +- [RTAX_HOPLIMIT - 1] = 255, ++ [RTAX_HOPLIMIT - 1] = 0, + }; + + static struct rt6_info ip6_null_entry_template = { +@@ -1097,7 +1097,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, + ipv6_addr_copy(&rt->rt6i_dst.addr, addr); + rt->rt6i_dst.plen = 128; + rt->rt6i_idev = idev; +- dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255); ++ dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); + + spin_lock_bh(&icmp6_dst_lock); + rt->dst.next = icmp6_dst_gc_list; +diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c +index 2cef50b..64164fb 100644 +--- a/net/l2tp/l2tp_eth.c ++++ b/net/l2tp/l2tp_eth.c +@@ -269,6 +269,7 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p + + out_del_dev: + free_netdev(dev); ++ spriv->dev = NULL; + out_del_session: + l2tp_session_delete(session); + out: +diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c +index 3ece106..8c7364b 100644 +--- a/net/mac80211/ibss.c ++++ b/net/mac80211/ibss.c +@@ -940,7 +940,7 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, + sdata->u.ibss.state = IEEE80211_IBSS_MLME_SEARCH; + sdata->u.ibss.ibss_join_req = jiffies; + +- memcpy(sdata->u.ibss.ssid, params->ssid, IEEE80211_MAX_SSID_LEN); ++ memcpy(sdata->u.ibss.ssid, params->ssid, params->ssid_len); + sdata->u.ibss.ssid_len = params->ssid_len; + + mutex_unlock(&sdata->u.ibss.mtx); +diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c +index cda4875..cd6cbdb 100644 +--- a/net/mac80211/rx.c ++++ b/net/mac80211/rx.c +@@ -515,6 +515,11 @@ ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) + + if (ieee80211_is_action(hdr->frame_control)) { + u8 category; ++ ++ /* make sure category field is present */ ++ if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE) ++ return 
RX_DROP_MONITOR; ++ + mgmt = (struct ieee80211_mgmt *)hdr; + category = mgmt->u.action.category; + if (category != WLAN_CATEGORY_MESH_ACTION && +@@ -854,14 +859,16 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) + (!rx->sta || !test_sta_flag(rx->sta, WLAN_STA_ASSOC)))) { + if (rx->sta && rx->sta->dummy && + ieee80211_is_data_present(hdr->frame_control)) { +- u16 ethertype; +- u8 *payload; +- +- payload = rx->skb->data + +- ieee80211_hdrlen(hdr->frame_control); +- ethertype = (payload[6] << 8) | payload[7]; +- if (cpu_to_be16(ethertype) == +- rx->sdata->control_port_protocol) ++ unsigned int hdrlen; ++ __be16 ethertype; ++ ++ hdrlen = ieee80211_hdrlen(hdr->frame_control); ++ ++ if (rx->skb->len < hdrlen + 8) ++ return RX_DROP_MONITOR; ++ ++ skb_copy_bits(rx->skb, hdrlen + 6, ðertype, 2); ++ if (ethertype == rx->sdata->control_port_protocol) + return RX_CONTINUE; + } + return RX_DROP_MONITOR; +@@ -1449,11 +1456,14 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) + + hdr = (struct ieee80211_hdr *)rx->skb->data; + fc = hdr->frame_control; ++ ++ if (ieee80211_is_ctl(fc)) ++ return RX_CONTINUE; ++ + sc = le16_to_cpu(hdr->seq_ctrl); + frag = sc & IEEE80211_SCTL_FRAG; + + if (likely((!ieee80211_has_morefrags(fc) && frag == 0) || +- (rx->skb)->len < 24 || + is_multicast_ether_addr(hdr->addr1))) { + /* not fragmented */ + goto out; +@@ -1887,6 +1897,20 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) + + hdr = (struct ieee80211_hdr *) skb->data; + hdrlen = ieee80211_hdrlen(hdr->frame_control); ++ ++ /* make sure fixed part of mesh header is there, also checks skb len */ ++ if (!pskb_may_pull(rx->skb, hdrlen + 6)) ++ return RX_DROP_MONITOR; ++ ++ mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen); ++ ++ /* make sure full mesh header is there, also checks skb len */ ++ if (!pskb_may_pull(rx->skb, ++ hdrlen + ieee80211_get_mesh_hdrlen(mesh_hdr))) ++ return RX_DROP_MONITOR; ++ ++ /* reload pointers */ ++ hdr = (struct ieee80211_hdr *) skb->data; + mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen); + + /* frame is in RMC, don't forward */ +@@ -1895,7 +1919,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) + mesh_rmc_check(hdr->addr3, mesh_hdr, rx->sdata)) + return RX_DROP_MONITOR; + +- if (!ieee80211_is_data(hdr->frame_control)) ++ if (!ieee80211_is_data(hdr->frame_control) || ++ !(status->rx_flags & IEEE80211_RX_RA_MATCH)) + return RX_CONTINUE; + + if (!mesh_hdr->ttl) +@@ -1916,9 +1941,12 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) + if (is_multicast_ether_addr(hdr->addr1)) { + mpp_addr = hdr->addr3; + proxied_addr = mesh_hdr->eaddr1; +- } else { ++ } else if (mesh_hdr->flags & MESH_FLAGS_AE_A5_A6) { ++ /* has_a4 already checked in ieee80211_rx_mesh_check */ + mpp_addr = hdr->addr4; + proxied_addr = mesh_hdr->eaddr2; ++ } else { ++ return RX_DROP_MONITOR; + } + + rcu_read_lock(); +@@ -1941,7 +1969,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) + + mesh_hdr->ttl--; + +- if (status->rx_flags & IEEE80211_RX_RA_MATCH) { ++ { + if (!mesh_hdr->ttl) + IEEE80211_IFSTA_MESH_CTR_INC(&rx->sdata->u.mesh, + dropped_frames_ttl); +@@ -2295,6 +2323,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) + } + break; + case WLAN_CATEGORY_SELF_PROTECTED: ++ if (len < (IEEE80211_MIN_ACTION_SIZE + ++ sizeof(mgmt->u.action.u.self_prot.action_code))) ++ break; ++ + switch (mgmt->u.action.u.self_prot.action_code) { + case WLAN_SP_MESH_PEERING_OPEN: + case WLAN_SP_MESH_PEERING_CLOSE: +@@ -2313,6 +2345,10 @@ ieee80211_rx_h_action(struct 
ieee80211_rx_data *rx) + } + break; + case WLAN_CATEGORY_MESH_ACTION: ++ if (len < (IEEE80211_MIN_ACTION_SIZE + ++ sizeof(mgmt->u.action.u.mesh_action.action_code))) ++ break; ++ + if (!ieee80211_vif_is_mesh(&sdata->vif)) + break; + if (mesh_action_is_path_sel(mgmt) && +@@ -2870,10 +2906,15 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, + test_bit(SCAN_OFF_CHANNEL, &local->scanning))) + status->rx_flags |= IEEE80211_RX_IN_SCAN; + +- if (ieee80211_is_mgmt(fc)) +- err = skb_linearize(skb); +- else ++ if (ieee80211_is_mgmt(fc)) { ++ /* drop frame if too short for header */ ++ if (skb->len < ieee80211_hdrlen(fc)) ++ err = -ENOBUFS; ++ else ++ err = skb_linearize(skb); ++ } else { + err = !pskb_may_pull(skb, ieee80211_hdrlen(fc)); ++ } + + if (err) { + dev_kfree_skb(skb); +diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c +index 38b78b9..3d1d55d 100644 +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -137,6 +137,8 @@ static void netlink_destroy_callback(struct netlink_callback *cb); + static DEFINE_RWLOCK(nl_table_lock); + static atomic_t nl_table_users = ATOMIC_INIT(0); + ++#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); ++ + static ATOMIC_NOTIFIER_HEAD(netlink_chain); + + static u32 netlink_group_mask(u32 group) +@@ -331,6 +333,11 @@ netlink_update_listeners(struct sock *sk) + struct hlist_node *node; + unsigned long mask; + unsigned int i; ++ struct listeners *listeners; ++ ++ listeners = nl_deref_protected(tbl->listeners); ++ if (!listeners) ++ return; + + for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { + mask = 0; +@@ -338,7 +345,7 @@ netlink_update_listeners(struct sock *sk) + if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) + mask |= nlk_sk(sk)->groups[i]; + } +- tbl->listeners->masks[i] = mask; ++ listeners->masks[i] = mask; + } + /* this function is only called with the netlink table "grabbed", which + * makes sure updates are visible before bind or setsockopt return. 
*/ +@@ -519,7 +526,11 @@ static int netlink_release(struct socket *sock) + if (netlink_is_kernel(sk)) { + BUG_ON(nl_table[sk->sk_protocol].registered == 0); + if (--nl_table[sk->sk_protocol].registered == 0) { +- kfree(nl_table[sk->sk_protocol].listeners); ++ struct listeners *old; ++ ++ old = nl_deref_protected(nl_table[sk->sk_protocol].listeners); ++ RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL); ++ kfree_rcu(old, rcu); + nl_table[sk->sk_protocol].module = NULL; + nl_table[sk->sk_protocol].registered = 0; + } +@@ -950,7 +961,7 @@ int netlink_has_listeners(struct sock *sk, unsigned int group) + rcu_read_lock(); + listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); + +- if (group - 1 < nl_table[sk->sk_protocol].groups) ++ if (listeners && group - 1 < nl_table[sk->sk_protocol].groups) + res = test_bit(group - 1, listeners->masks); + + rcu_read_unlock(); +@@ -1584,7 +1595,7 @@ int __netlink_change_ngroups(struct sock *sk, unsigned int groups) + new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC); + if (!new) + return -ENOMEM; +- old = rcu_dereference_protected(tbl->listeners, 1); ++ old = nl_deref_protected(tbl->listeners); + memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); + rcu_assign_pointer(tbl->listeners, new); + +diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c +index 76388b0..9032d50 100644 +--- a/net/sctp/sm_sideeffect.c ++++ b/net/sctp/sm_sideeffect.c +@@ -1604,8 +1604,9 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, + asoc->outqueue.outstanding_bytes; + sackh.num_gap_ack_blocks = 0; + sackh.num_dup_tsns = 0; ++ chunk->subh.sack_hdr = &sackh; + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, +- SCTP_SACKH(&sackh)); ++ SCTP_CHUNK(chunk)); + break; + + case SCTP_CMD_DISCARD_PACKET: +diff --git a/net/wireless/core.c b/net/wireless/core.c +index 8f5042d..ea93f4b 100644 +--- a/net/wireless/core.c ++++ b/net/wireless/core.c +@@ -548,8 +548,7 @@ int wiphy_register(struct wiphy *wiphy) + for (i = 0; i < sband->n_channels; i++) { + sband->channels[i].orig_flags = + sband->channels[i].flags; +- sband->channels[i].orig_mag = +- sband->channels[i].max_antenna_gain; ++ sband->channels[i].orig_mag = INT_MAX; + sband->channels[i].orig_mpwr = + sband->channels[i].max_power; + sband->channels[i].band = band; +diff --git a/net/wireless/util.c b/net/wireless/util.c +index 22fb802..5fba039 100644 +--- a/net/wireless/util.c ++++ b/net/wireless/util.c +@@ -301,23 +301,21 @@ unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) + } + EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); + +-static int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) ++unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) + { + int ae = meshhdr->flags & MESH_FLAGS_AE; +- /* 7.1.3.5a.2 */ ++ /* 802.11-2012, 8.2.4.7.3 */ + switch (ae) { ++ default: + case 0: + return 6; + case MESH_FLAGS_AE_A4: + return 12; + case MESH_FLAGS_AE_A5_A6: + return 18; +- case (MESH_FLAGS_AE_A4 | MESH_FLAGS_AE_A5_A6): +- return 24; +- default: +- return 6; + } + } ++EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); + + int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, + enum nl80211_iftype iftype) +@@ -365,6 +363,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, + /* make sure meshdr->flags is on the linear part */ + if (!pskb_may_pull(skb, hdrlen + 1)) + return -1; ++ if (meshdr->flags & MESH_FLAGS_AE_A4) ++ return -1; + if (meshdr->flags & MESH_FLAGS_AE_A5_A6) { + skb_copy_bits(skb, hdrlen + + offsetof(struct 
ieee80211s_hdr, eaddr1), +@@ -389,6 +389,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, + /* make sure meshdr->flags is on the linear part */ + if (!pskb_may_pull(skb, hdrlen + 1)) + return -1; ++ if (meshdr->flags & MESH_FLAGS_AE_A5_A6) ++ return -1; + if (meshdr->flags & MESH_FLAGS_AE_A4) + skb_copy_bits(skb, hdrlen + + offsetof(struct ieee80211s_hdr, eaddr1), +diff --git a/sound/core/control.c b/sound/core/control.c +index 819a5c5..5511307 100644 +--- a/sound/core/control.c ++++ b/sound/core/control.c +@@ -86,6 +86,7 @@ static int snd_ctl_open(struct inode *inode, struct file *file) + write_lock_irqsave(&card->ctl_files_rwlock, flags); + list_add_tail(&ctl->list, &card->ctl_files); + write_unlock_irqrestore(&card->ctl_files_rwlock, flags); ++ snd_card_unref(card); + return 0; + + __error: +@@ -93,6 +94,8 @@ static int snd_ctl_open(struct inode *inode, struct file *file) + __error2: + snd_card_file_remove(card, file); + __error1: ++ if (card) ++ snd_card_unref(card); + return err; + } + +@@ -1433,6 +1436,8 @@ static ssize_t snd_ctl_read(struct file *file, char __user *buffer, + spin_unlock_irq(&ctl->read_lock); + schedule(); + remove_wait_queue(&ctl->change_sleep, &wait); ++ if (ctl->card->shutdown) ++ return -ENODEV; + if (signal_pending(current)) + return -ERESTARTSYS; + spin_lock_irq(&ctl->read_lock); +diff --git a/sound/core/hwdep.c b/sound/core/hwdep.c +index 75ea16f..3f7f662 100644 +--- a/sound/core/hwdep.c ++++ b/sound/core/hwdep.c +@@ -100,8 +100,10 @@ static int snd_hwdep_open(struct inode *inode, struct file * file) + if (hw == NULL) + return -ENODEV; + +- if (!try_module_get(hw->card->module)) ++ if (!try_module_get(hw->card->module)) { ++ snd_card_unref(hw->card); + return -EFAULT; ++ } + + init_waitqueue_entry(&wait, current); + add_wait_queue(&hw->open_wait, &wait); +@@ -129,6 +131,10 @@ static int snd_hwdep_open(struct inode *inode, struct file * file) + mutex_unlock(&hw->open_mutex); + schedule(); + mutex_lock(&hw->open_mutex); ++ if (hw->card->shutdown) { ++ err = -ENODEV; ++ break; ++ } + if (signal_pending(current)) { + err = -ERESTARTSYS; + break; +@@ -148,6 +154,7 @@ static int snd_hwdep_open(struct inode *inode, struct file * file) + mutex_unlock(&hw->open_mutex); + if (err < 0) + module_put(hw->card->module); ++ snd_card_unref(hw->card); + return err; + } + +@@ -459,12 +466,15 @@ static int snd_hwdep_dev_disconnect(struct snd_device *device) + mutex_unlock(®ister_mutex); + return -EINVAL; + } ++ mutex_lock(&hwdep->open_mutex); ++ wake_up(&hwdep->open_wait); + #ifdef CONFIG_SND_OSSEMUL + if (hwdep->ossreg) + snd_unregister_oss_device(hwdep->oss_type, hwdep->card, hwdep->device); + #endif + snd_unregister_device(SNDRV_DEVICE_TYPE_HWDEP, hwdep->card, hwdep->device); + list_del_init(&hwdep->list); ++ mutex_unlock(&hwdep->open_mutex); + mutex_unlock(®ister_mutex); + return 0; + } +diff --git a/sound/core/init.c b/sound/core/init.c +index 3ac49b1..fa0f35b 100644 +--- a/sound/core/init.c ++++ b/sound/core/init.c +@@ -212,6 +212,7 @@ int snd_card_create(int idx, const char *xid, + spin_lock_init(&card->files_lock); + INIT_LIST_HEAD(&card->files_list); + init_waitqueue_head(&card->shutdown_sleep); ++ atomic_set(&card->refcount, 0); + #ifdef CONFIG_PM + mutex_init(&card->power_lock); + init_waitqueue_head(&card->power_sleep); +@@ -445,21 +446,36 @@ static int snd_card_do_free(struct snd_card *card) + return 0; + } + ++/** ++ * snd_card_unref - release the reference counter ++ * @card: the card instance ++ * ++ * Decrements the reference counter. 
When it reaches to zero, wake up ++ * the sleeper and call the destructor if needed. ++ */ ++void snd_card_unref(struct snd_card *card) ++{ ++ if (atomic_dec_and_test(&card->refcount)) { ++ wake_up(&card->shutdown_sleep); ++ if (card->free_on_last_close) ++ snd_card_do_free(card); ++ } ++} ++EXPORT_SYMBOL(snd_card_unref); ++ + int snd_card_free_when_closed(struct snd_card *card) + { +- int free_now = 0; +- int ret = snd_card_disconnect(card); +- if (ret) +- return ret; ++ int ret; + +- spin_lock(&card->files_lock); +- if (list_empty(&card->files_list)) +- free_now = 1; +- else +- card->free_on_last_close = 1; +- spin_unlock(&card->files_lock); ++ atomic_inc(&card->refcount); ++ ret = snd_card_disconnect(card); ++ if (ret) { ++ atomic_dec(&card->refcount); ++ return ret; ++ } + +- if (free_now) ++ card->free_on_last_close = 1; ++ if (atomic_dec_and_test(&card->refcount)) + snd_card_do_free(card); + return 0; + } +@@ -473,7 +489,7 @@ int snd_card_free(struct snd_card *card) + return ret; + + /* wait, until all devices are ready for the free operation */ +- wait_event(card->shutdown_sleep, list_empty(&card->files_list)); ++ wait_event(card->shutdown_sleep, !atomic_read(&card->refcount)); + snd_card_do_free(card); + return 0; + } +@@ -854,6 +870,7 @@ int snd_card_file_add(struct snd_card *card, struct file *file) + return -ENODEV; + } + list_add(&mfile->list, &card->files_list); ++ atomic_inc(&card->refcount); + spin_unlock(&card->files_lock); + return 0; + } +@@ -876,7 +893,6 @@ EXPORT_SYMBOL(snd_card_file_add); + int snd_card_file_remove(struct snd_card *card, struct file *file) + { + struct snd_monitor_file *mfile, *found = NULL; +- int last_close = 0; + + spin_lock(&card->files_lock); + list_for_each_entry(mfile, &card->files_list, list) { +@@ -891,19 +907,13 @@ int snd_card_file_remove(struct snd_card *card, struct file *file) + break; + } + } +- if (list_empty(&card->files_list)) +- last_close = 1; + spin_unlock(&card->files_lock); +- if (last_close) { +- wake_up(&card->shutdown_sleep); +- if (card->free_on_last_close) +- snd_card_do_free(card); +- } + if (!found) { + snd_printk(KERN_ERR "ALSA card file remove problem (%p)\n", file); + return -ENOENT; + } + kfree(found); ++ snd_card_unref(card); + return 0; + } + +diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c +index 18297f7..c353768 100644 +--- a/sound/core/oss/mixer_oss.c ++++ b/sound/core/oss/mixer_oss.c +@@ -52,14 +52,19 @@ static int snd_mixer_oss_open(struct inode *inode, struct file *file) + SNDRV_OSS_DEVICE_TYPE_MIXER); + if (card == NULL) + return -ENODEV; +- if (card->mixer_oss == NULL) ++ if (card->mixer_oss == NULL) { ++ snd_card_unref(card); + return -ENODEV; ++ } + err = snd_card_file_add(card, file); +- if (err < 0) ++ if (err < 0) { ++ snd_card_unref(card); + return err; ++ } + fmixer = kzalloc(sizeof(*fmixer), GFP_KERNEL); + if (fmixer == NULL) { + snd_card_file_remove(card, file); ++ snd_card_unref(card); + return -ENOMEM; + } + fmixer->card = card; +@@ -68,8 +73,10 @@ static int snd_mixer_oss_open(struct inode *inode, struct file *file) + if (!try_module_get(card->module)) { + kfree(fmixer); + snd_card_file_remove(card, file); ++ snd_card_unref(card); + return -EFAULT; + } ++ snd_card_unref(card); + return 0; + } + +diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c +index 3cc4b86..542f69e 100644 +--- a/sound/core/oss/pcm_oss.c ++++ b/sound/core/oss/pcm_oss.c +@@ -2441,6 +2441,10 @@ static int snd_pcm_oss_open(struct inode *inode, struct file *file) + mutex_unlock(&pcm->open_mutex); 
+ schedule(); + mutex_lock(&pcm->open_mutex); ++ if (pcm->card->shutdown) { ++ err = -ENODEV; ++ break; ++ } + if (signal_pending(current)) { + err = -ERESTARTSYS; + break; +@@ -2450,6 +2454,7 @@ static int snd_pcm_oss_open(struct inode *inode, struct file *file) + mutex_unlock(&pcm->open_mutex); + if (err < 0) + goto __error; ++ snd_card_unref(pcm->card); + return err; + + __error: +@@ -2457,6 +2462,8 @@ static int snd_pcm_oss_open(struct inode *inode, struct file *file) + __error2: + snd_card_file_remove(pcm->card, file); + __error1: ++ if (pcm) ++ snd_card_unref(pcm->card); + return err; + } + +diff --git a/sound/core/pcm.c b/sound/core/pcm.c +index 8928ca87..13eaeb3 100644 +--- a/sound/core/pcm.c ++++ b/sound/core/pcm.c +@@ -1046,11 +1046,19 @@ static int snd_pcm_dev_disconnect(struct snd_device *device) + if (list_empty(&pcm->list)) + goto unlock; + ++ mutex_lock(&pcm->open_mutex); ++ wake_up(&pcm->open_wait); + list_del_init(&pcm->list); + for (cidx = 0; cidx < 2; cidx++) +- for (substream = pcm->streams[cidx].substream; substream; substream = substream->next) +- if (substream->runtime) ++ for (substream = pcm->streams[cidx].substream; substream; substream = substream->next) { ++ snd_pcm_stream_lock_irq(substream); ++ if (substream->runtime) { + substream->runtime->status->state = SNDRV_PCM_STATE_DISCONNECTED; ++ wake_up(&substream->runtime->sleep); ++ wake_up(&substream->runtime->tsleep); ++ } ++ snd_pcm_stream_unlock_irq(substream); ++ } + list_for_each_entry(notify, &snd_pcm_notify_list, list) { + notify->n_disconnect(pcm); + } +@@ -1066,6 +1074,7 @@ static int snd_pcm_dev_disconnect(struct snd_device *device) + } + snd_unregister_device(devtype, pcm->card, pcm->device); + } ++ mutex_unlock(&pcm->open_mutex); + unlock: + mutex_unlock(®ister_mutex); + return 0; +diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c +index 25ed9fe..7ada40e 100644 +--- a/sound/core/pcm_native.c ++++ b/sound/core/pcm_native.c +@@ -369,6 +369,14 @@ static int period_to_usecs(struct snd_pcm_runtime *runtime) + return usecs; + } + ++static void snd_pcm_set_state(struct snd_pcm_substream *substream, int state) ++{ ++ snd_pcm_stream_lock_irq(substream); ++ if (substream->runtime->status->state != SNDRV_PCM_STATE_DISCONNECTED) ++ substream->runtime->status->state = state; ++ snd_pcm_stream_unlock_irq(substream); ++} ++ + static int snd_pcm_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params) + { +@@ -452,7 +460,7 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream, + runtime->boundary *= 2; + + snd_pcm_timer_resolution_change(substream); +- runtime->status->state = SNDRV_PCM_STATE_SETUP; ++ snd_pcm_set_state(substream, SNDRV_PCM_STATE_SETUP); + + if (pm_qos_request_active(&substream->latency_pm_qos_req)) + pm_qos_remove_request(&substream->latency_pm_qos_req); +@@ -464,7 +472,7 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream, + /* hardware might be unusable from this time, + so we force application to retry to set + the correct hardware parameter settings */ +- runtime->status->state = SNDRV_PCM_STATE_OPEN; ++ snd_pcm_set_state(substream, SNDRV_PCM_STATE_OPEN); + if (substream->ops->hw_free != NULL) + substream->ops->hw_free(substream); + return err; +@@ -512,7 +520,7 @@ static int snd_pcm_hw_free(struct snd_pcm_substream *substream) + return -EBADFD; + if (substream->ops->hw_free) + result = substream->ops->hw_free(substream); +- runtime->status->state = SNDRV_PCM_STATE_OPEN; ++ snd_pcm_set_state(substream, SNDRV_PCM_STATE_OPEN); + 
pm_qos_remove_request(&substream->latency_pm_qos_req); + return result; + } +@@ -1320,7 +1328,7 @@ static void snd_pcm_post_prepare(struct snd_pcm_substream *substream, int state) + { + struct snd_pcm_runtime *runtime = substream->runtime; + runtime->control->appl_ptr = runtime->status->hw_ptr; +- runtime->status->state = SNDRV_PCM_STATE_PREPARED; ++ snd_pcm_set_state(substream, SNDRV_PCM_STATE_PREPARED); + } + + static struct action_ops snd_pcm_action_prepare = { +@@ -1500,6 +1508,10 @@ static int snd_pcm_drain(struct snd_pcm_substream *substream, + down_read(&snd_pcm_link_rwsem); + snd_pcm_stream_lock_irq(substream); + remove_wait_queue(&to_check->sleep, &wait); ++ if (card->shutdown) { ++ result = -ENODEV; ++ break; ++ } + if (tout == 0) { + if (substream->runtime->status->state == SNDRV_PCM_STATE_SUSPENDED) + result = -ESTRPIPE; +@@ -1620,6 +1632,7 @@ static int snd_pcm_link(struct snd_pcm_substream *substream, int fd) + _end: + write_unlock_irq(&snd_pcm_link_rwlock); + up_write(&snd_pcm_link_rwsem); ++ snd_card_unref(substream1->pcm->card); + fput(file); + return res; + } +@@ -2092,7 +2105,10 @@ static int snd_pcm_playback_open(struct inode *inode, struct file *file) + return err; + pcm = snd_lookup_minor_data(iminor(inode), + SNDRV_DEVICE_TYPE_PCM_PLAYBACK); +- return snd_pcm_open(file, pcm, SNDRV_PCM_STREAM_PLAYBACK); ++ err = snd_pcm_open(file, pcm, SNDRV_PCM_STREAM_PLAYBACK); ++ if (pcm) ++ snd_card_unref(pcm->card); ++ return err; + } + + static int snd_pcm_capture_open(struct inode *inode, struct file *file) +@@ -2103,7 +2119,10 @@ static int snd_pcm_capture_open(struct inode *inode, struct file *file) + return err; + pcm = snd_lookup_minor_data(iminor(inode), + SNDRV_DEVICE_TYPE_PCM_CAPTURE); +- return snd_pcm_open(file, pcm, SNDRV_PCM_STREAM_CAPTURE); ++ err = snd_pcm_open(file, pcm, SNDRV_PCM_STREAM_CAPTURE); ++ if (pcm) ++ snd_card_unref(pcm->card); ++ return err; + } + + static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream) +@@ -2140,6 +2159,10 @@ static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream) + mutex_unlock(&pcm->open_mutex); + schedule(); + mutex_lock(&pcm->open_mutex); ++ if (pcm->card->shutdown) { ++ err = -ENODEV; ++ break; ++ } + if (signal_pending(current)) { + err = -ERESTARTSYS; + break; +diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c +index ebf6e49..1bb95ae 100644 +--- a/sound/core/rawmidi.c ++++ b/sound/core/rawmidi.c +@@ -379,8 +379,10 @@ static int snd_rawmidi_open(struct inode *inode, struct file *file) + if (rmidi == NULL) + return -ENODEV; + +- if (!try_module_get(rmidi->card->module)) ++ if (!try_module_get(rmidi->card->module)) { ++ snd_card_unref(rmidi->card); + return -ENXIO; ++ } + + mutex_lock(&rmidi->open_mutex); + card = rmidi->card; +@@ -422,6 +424,10 @@ static int snd_rawmidi_open(struct inode *inode, struct file *file) + mutex_unlock(&rmidi->open_mutex); + schedule(); + mutex_lock(&rmidi->open_mutex); ++ if (rmidi->card->shutdown) { ++ err = -ENODEV; ++ break; ++ } + if (signal_pending(current)) { + err = -ERESTARTSYS; + break; +@@ -440,6 +446,7 @@ static int snd_rawmidi_open(struct inode *inode, struct file *file) + #endif + file->private_data = rawmidi_file; + mutex_unlock(&rmidi->open_mutex); ++ snd_card_unref(rmidi->card); + return 0; + + __error: +@@ -447,6 +454,7 @@ static int snd_rawmidi_open(struct inode *inode, struct file *file) + __error_card: + mutex_unlock(&rmidi->open_mutex); + module_put(rmidi->card->module); ++ snd_card_unref(rmidi->card); + return err; + } + +@@ -991,6 
+999,8 @@ static ssize_t snd_rawmidi_read(struct file *file, char __user *buf, size_t coun + spin_unlock_irq(&runtime->lock); + schedule(); + remove_wait_queue(&runtime->sleep, &wait); ++ if (rfile->rmidi->card->shutdown) ++ return -ENODEV; + if (signal_pending(current)) + return result > 0 ? result : -ERESTARTSYS; + if (!runtime->avail) +@@ -1234,6 +1244,8 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf, + spin_unlock_irq(&runtime->lock); + timeout = schedule_timeout(30 * HZ); + remove_wait_queue(&runtime->sleep, &wait); ++ if (rfile->rmidi->card->shutdown) ++ return -ENODEV; + if (signal_pending(current)) + return result > 0 ? result : -ERESTARTSYS; + if (!runtime->avail && !timeout) +@@ -1609,9 +1621,20 @@ static int snd_rawmidi_dev_register(struct snd_device *device) + static int snd_rawmidi_dev_disconnect(struct snd_device *device) + { + struct snd_rawmidi *rmidi = device->device_data; ++ int dir; + + mutex_lock(®ister_mutex); ++ mutex_lock(&rmidi->open_mutex); ++ wake_up(&rmidi->open_wait); + list_del_init(&rmidi->list); ++ for (dir = 0; dir < 2; dir++) { ++ struct snd_rawmidi_substream *s; ++ list_for_each_entry(s, &rmidi->streams[dir].substreams, list) { ++ if (s->runtime) ++ wake_up(&s->runtime->sleep); ++ } ++ } ++ + #ifdef CONFIG_SND_OSSEMUL + if (rmidi->ossreg) { + if ((int)rmidi->device == midi_map[rmidi->card->number]) { +@@ -1626,6 +1649,7 @@ static int snd_rawmidi_dev_disconnect(struct snd_device *device) + } + #endif /* CONFIG_SND_OSSEMUL */ + snd_unregister_device(SNDRV_DEVICE_TYPE_RAWMIDI, rmidi->card, rmidi->device); ++ mutex_unlock(&rmidi->open_mutex); + mutex_unlock(®ister_mutex); + return 0; + } +diff --git a/sound/core/sound.c b/sound/core/sound.c +index 828af35..8e17b4d 100644 +--- a/sound/core/sound.c ++++ b/sound/core/sound.c +@@ -99,6 +99,10 @@ static void snd_request_other(int minor) + * + * Checks that a minor device with the specified type is registered, and returns + * its user data pointer. ++ * ++ * This function increments the reference counter of the card instance ++ * if an associated instance with the given minor number and type is found. ++ * The caller must call snd_card_unref() appropriately later. 
+ */ + void *snd_lookup_minor_data(unsigned int minor, int type) + { +@@ -109,9 +113,11 @@ void *snd_lookup_minor_data(unsigned int minor, int type) + return NULL; + mutex_lock(&sound_mutex); + mreg = snd_minors[minor]; +- if (mreg && mreg->type == type) ++ if (mreg && mreg->type == type) { + private_data = mreg->private_data; +- else ++ if (private_data && mreg->card_ptr) ++ atomic_inc(&mreg->card_ptr->refcount); ++ } else + private_data = NULL; + mutex_unlock(&sound_mutex); + return private_data; +@@ -275,6 +281,7 @@ int snd_register_device_for_dev(int type, struct snd_card *card, int dev, + preg->device = dev; + preg->f_ops = f_ops; + preg->private_data = private_data; ++ preg->card_ptr = card; + mutex_lock(&sound_mutex); + #ifdef CONFIG_SND_DYNAMIC_MINORS + minor = snd_find_free_minor(type); +diff --git a/sound/core/sound_oss.c b/sound/core/sound_oss.c +index c700920..ec86009 100644 +--- a/sound/core/sound_oss.c ++++ b/sound/core/sound_oss.c +@@ -40,6 +40,9 @@ + static struct snd_minor *snd_oss_minors[SNDRV_OSS_MINORS]; + static DEFINE_MUTEX(sound_oss_mutex); + ++/* NOTE: This function increments the refcount of the associated card like ++ * snd_lookup_minor_data(); the caller must call snd_card_unref() appropriately ++ */ + void *snd_lookup_oss_minor_data(unsigned int minor, int type) + { + struct snd_minor *mreg; +@@ -49,9 +52,11 @@ void *snd_lookup_oss_minor_data(unsigned int minor, int type) + return NULL; + mutex_lock(&sound_oss_mutex); + mreg = snd_oss_minors[minor]; +- if (mreg && mreg->type == type) ++ if (mreg && mreg->type == type) { + private_data = mreg->private_data; +- else ++ if (private_data && mreg->card_ptr) ++ atomic_inc(&mreg->card_ptr->refcount); ++ } else + private_data = NULL; + mutex_unlock(&sound_oss_mutex); + return private_data; +@@ -123,6 +128,7 @@ int snd_register_oss_device(int type, struct snd_card *card, int dev, + preg->device = dev; + preg->f_ops = f_ops; + preg->private_data = private_data; ++ preg->card_ptr = card; + mutex_lock(&sound_oss_mutex); + snd_oss_minors[minor] = preg; + minor_unit = SNDRV_MINOR_OSS_DEVICE(minor); +diff --git a/sound/pci/hda/patch_analog.c b/sound/pci/hda/patch_analog.c +index bcb3310..b4890f9 100644 +--- a/sound/pci/hda/patch_analog.c ++++ b/sound/pci/hda/patch_analog.c +@@ -573,6 +573,7 @@ static int ad198x_build_pcms(struct hda_codec *codec) + if (spec->multiout.dig_out_nid) { + info++; + codec->num_pcms++; ++ codec->spdif_status_reset = 1; + info->name = "AD198x Digital"; + info->pcm_type = HDA_PCM_TYPE_SPDIF; + info->stream[SNDRV_PCM_STREAM_PLAYBACK] = ad198x_pcm_digital_playback; +diff --git a/sound/pci/hda/patch_cirrus.c b/sound/pci/hda/patch_cirrus.c +index e449278..0ed6867 100644 +--- a/sound/pci/hda/patch_cirrus.c ++++ b/sound/pci/hda/patch_cirrus.c +@@ -93,8 +93,8 @@ enum { + #define CS420X_VENDOR_NID 0x11 + #define CS_DIG_OUT1_PIN_NID 0x10 + #define CS_DIG_OUT2_PIN_NID 0x15 +-#define CS_DMIC1_PIN_NID 0x12 +-#define CS_DMIC2_PIN_NID 0x0e ++#define CS_DMIC1_PIN_NID 0x0e ++#define CS_DMIC2_PIN_NID 0x12 + + /* coef indices */ + #define IDX_SPDIF_STAT 0x0000 +@@ -1088,14 +1088,18 @@ static void init_input(struct hda_codec *codec) + cs_automic(codec); + + coef = 0x000a; /* ADC1/2 - Digital and Analog Soft Ramp */ ++ cs_vendor_coef_set(codec, IDX_ADC_CFG, coef); ++ ++ coef = cs_vendor_coef_get(codec, IDX_BEEP_CFG); + if (is_active_pin(codec, CS_DMIC2_PIN_NID)) +- coef |= 0x0500; /* DMIC2 2 chan on, GPIO1 off */ ++ coef |= 1 << 4; /* DMIC2 2 chan on, GPIO1 off */ + if (is_active_pin(codec, CS_DMIC1_PIN_NID)) +- coef |= 
0x1800; /* DMIC1 2 chan on, GPIO0 off ++ coef |= 1 << 3; /* DMIC1 2 chan on, GPIO0 off + * No effect if SPDIF_OUT2 is + * selected in IDX_SPDIF_CTL. + */ +- cs_vendor_coef_set(codec, IDX_ADC_CFG, coef); ++ ++ cs_vendor_coef_set(codec, IDX_BEEP_CFG, coef); + } + } + +@@ -1109,7 +1113,7 @@ static const struct hda_verb cs_coef_init_verbs[] = { + | 0x0400 /* Disable Coefficient Auto increment */ + )}, + /* Beep */ +- {0x11, AC_VERB_SET_COEF_INDEX, IDX_DAC_CFG}, ++ {0x11, AC_VERB_SET_COEF_INDEX, IDX_BEEP_CFG}, + {0x11, AC_VERB_SET_PROC_COEF, 0x0007}, /* Enable Beep thru DAC1/2/3 */ + + {} /* terminator */ +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index c2c7f90..3ce2da2 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -6039,6 +6039,7 @@ static const struct hda_codec_preset snd_hda_preset_realtek[] = { + .patch = patch_alc662 }, + { .id = 0x10ec0663, .name = "ALC663", .patch = patch_alc662 }, + { .id = 0x10ec0665, .name = "ALC665", .patch = patch_alc662 }, ++ { .id = 0x10ec0668, .name = "ALC668", .patch = patch_alc662 }, + { .id = 0x10ec0670, .name = "ALC670", .patch = patch_alc662 }, + { .id = 0x10ec0680, .name = "ALC680", .patch = patch_alc680 }, + { .id = 0x10ec0880, .name = "ALC880", .patch = patch_alc880 }, +@@ -6056,6 +6057,7 @@ static const struct hda_codec_preset snd_hda_preset_realtek[] = { + { .id = 0x10ec0889, .name = "ALC889", .patch = patch_alc882 }, + { .id = 0x10ec0892, .name = "ALC892", .patch = patch_alc662 }, + { .id = 0x10ec0899, .name = "ALC898", .patch = patch_alc882 }, ++ { .id = 0x10ec0900, .name = "ALC1150", .patch = patch_alc882 }, + {} /* terminator */ + }; + +diff --git a/sound/pci/hda/patch_via.c b/sound/pci/hda/patch_via.c +index 7160ff2..9e0c889 100644 +--- a/sound/pci/hda/patch_via.c ++++ b/sound/pci/hda/patch_via.c +@@ -1856,11 +1856,11 @@ static int via_auto_fill_dac_nids(struct hda_codec *codec) + { + struct via_spec *spec = codec->spec; + const struct auto_pin_cfg *cfg = &spec->autocfg; +- int i, dac_num; ++ int i; + hda_nid_t nid; + ++ spec->multiout.num_dacs = 0; + spec->multiout.dac_nids = spec->private_dac_nids; +- dac_num = 0; + for (i = 0; i < cfg->line_outs; i++) { + hda_nid_t dac = 0; + nid = cfg->line_out_pins[i]; +@@ -1871,16 +1871,13 @@ static int via_auto_fill_dac_nids(struct hda_codec *codec) + if (!i && parse_output_path(codec, nid, dac, 1, + &spec->out_mix_path)) + dac = spec->out_mix_path.path[0]; +- if (dac) { +- spec->private_dac_nids[i] = dac; +- dac_num++; +- } ++ if (dac) ++ spec->private_dac_nids[spec->multiout.num_dacs++] = dac; + } + if (!spec->out_path[0].depth && spec->out_mix_path.depth) { + spec->out_path[0] = spec->out_mix_path; + spec->out_mix_path.depth = 0; + } +- spec->multiout.num_dacs = dac_num; + return 0; + } + +@@ -3689,6 +3686,18 @@ static void set_widgets_power_state_vt2002P(struct hda_codec *codec) + AC_VERB_SET_POWER_STATE, AC_PWRST_D3); + } + ++/* NIDs 0x24 and 0x33 on VT1802 have connections to non-existing NID 0x3e ++ * Replace this with mixer NID 0x1c ++ */ ++static void fix_vt1802_connections(struct hda_codec *codec) ++{ ++ static hda_nid_t conn_24[] = { 0x14, 0x1c }; ++ static hda_nid_t conn_33[] = { 0x1c }; ++ ++ snd_hda_override_conn_list(codec, 0x24, ARRAY_SIZE(conn_24), conn_24); ++ snd_hda_override_conn_list(codec, 0x33, ARRAY_SIZE(conn_33), conn_33); ++} ++ + /* patch for vt2002P */ + static int patch_vt2002P(struct hda_codec *codec) + { +@@ -3703,6 +3712,8 @@ static int patch_vt2002P(struct hda_codec *codec) + spec->aa_mix_nid = 0x21; + 
override_mic_boost(codec, 0x2b, 0, 3, 40); + override_mic_boost(codec, 0x29, 0, 3, 40); ++ if (spec->codec_type == VT1802) ++ fix_vt1802_connections(codec); + add_secret_dac_path(codec); + + /* automatic parse from the BIOS config */ +diff --git a/sound/usb/card.c b/sound/usb/card.c +index 0f6dc0d..566acb3 100644 +--- a/sound/usb/card.c ++++ b/sound/usb/card.c +@@ -336,7 +336,7 @@ static int snd_usb_audio_create(struct usb_device *dev, int idx, + return -ENOMEM; + } + +- mutex_init(&chip->shutdown_mutex); ++ init_rwsem(&chip->shutdown_rwsem); + chip->index = idx; + chip->dev = dev; + chip->card = card; +@@ -555,9 +555,11 @@ static void snd_usb_audio_disconnect(struct usb_device *dev, + return; + + card = chip->card; +- mutex_lock(®ister_mutex); +- mutex_lock(&chip->shutdown_mutex); ++ down_write(&chip->shutdown_rwsem); + chip->shutdown = 1; ++ up_write(&chip->shutdown_rwsem); ++ ++ mutex_lock(®ister_mutex); + chip->num_interfaces--; + if (chip->num_interfaces <= 0) { + snd_card_disconnect(card); +@@ -574,11 +576,9 @@ static void snd_usb_audio_disconnect(struct usb_device *dev, + snd_usb_mixer_disconnect(p); + } + usb_chip[chip->index] = NULL; +- mutex_unlock(&chip->shutdown_mutex); + mutex_unlock(®ister_mutex); + snd_card_free_when_closed(card); + } else { +- mutex_unlock(&chip->shutdown_mutex); + mutex_unlock(®ister_mutex); + } + } +@@ -610,16 +610,20 @@ int snd_usb_autoresume(struct snd_usb_audio *chip) + { + int err = -ENODEV; + ++ down_read(&chip->shutdown_rwsem); + if (!chip->shutdown && !chip->probing) + err = usb_autopm_get_interface(chip->pm_intf); ++ up_read(&chip->shutdown_rwsem); + + return err; + } + + void snd_usb_autosuspend(struct snd_usb_audio *chip) + { ++ down_read(&chip->shutdown_rwsem); + if (!chip->shutdown && !chip->probing) + usb_autopm_put_interface(chip->pm_intf); ++ up_read(&chip->shutdown_rwsem); + } + + static int usb_audio_suspend(struct usb_interface *intf, pm_message_t message) +diff --git a/sound/usb/card.h b/sound/usb/card.h +index a39edcc..665e297 100644 +--- a/sound/usb/card.h ++++ b/sound/usb/card.h +@@ -86,6 +86,7 @@ struct snd_usb_substream { + struct snd_urb_ctx syncurb[SYNC_URBS]; /* sync urb table */ + char *syncbuf; /* sync buffer for all sync URBs */ + dma_addr_t sync_dma; /* DMA address of syncbuf */ ++ unsigned int speed; /* USB_SPEED_XXX */ + + u64 formats; /* format bitmasks (all or'ed) */ + unsigned int num_formats; /* number of supported audio formats (list) */ +diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c +index 08dcce5..24c5114 100644 +--- a/sound/usb/endpoint.c ++++ b/sound/usb/endpoint.c +@@ -148,8 +148,10 @@ void snd_usb_release_substream_urbs(struct snd_usb_substream *subs, int force) + int i; + + /* stop urbs (to be sure) */ +- deactivate_urbs(subs, force, 1); +- wait_clear_urbs(subs); ++ if (!subs->stream->chip->shutdown) { ++ deactivate_urbs(subs, force, 1); ++ wait_clear_urbs(subs); ++ } + + for (i = 0; i < MAX_URBS; i++) + release_urb_ctx(&subs->dataurb[i]); +@@ -895,7 +897,8 @@ void snd_usb_init_substream(struct snd_usb_stream *as, + subs->dev = as->chip->dev; + subs->txfr_quirk = as->chip->txfr_quirk; + subs->ops = audio_urb_ops[stream]; +- if (snd_usb_get_speed(subs->dev) >= USB_SPEED_HIGH) ++ subs->speed = snd_usb_get_speed(subs->dev); ++ if (subs->speed >= USB_SPEED_HIGH) + subs->ops.prepare_sync = prepare_capture_sync_urb_hs; + + snd_usb_set_pcm_ops(as->pcm, stream); +diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c +index ab23869..6730a33 100644 +--- a/sound/usb/mixer.c ++++ b/sound/usb/mixer.c +@@ -287,25 
+287,32 @@ static int get_ctl_value_v1(struct usb_mixer_elem_info *cval, int request, int v + unsigned char buf[2]; + int val_len = cval->val_type >= USB_MIXER_S16 ? 2 : 1; + int timeout = 10; +- int err; ++ int idx = 0, err; + + err = snd_usb_autoresume(cval->mixer->chip); + if (err < 0) + return -EIO; ++ down_read(&chip->shutdown_rwsem); + while (timeout-- > 0) { ++ if (chip->shutdown) ++ break; ++ idx = snd_usb_ctrl_intf(chip) | (cval->id << 8); + if (snd_usb_ctl_msg(chip->dev, usb_rcvctrlpipe(chip->dev, 0), request, + USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_IN, +- validx, snd_usb_ctrl_intf(chip) | (cval->id << 8), +- buf, val_len) >= val_len) { ++ validx, idx, buf, val_len) >= val_len) { + *value_ret = convert_signed_value(cval, snd_usb_combine_bytes(buf, val_len)); +- snd_usb_autosuspend(cval->mixer->chip); +- return 0; ++ err = 0; ++ goto out; + } + } +- snd_usb_autosuspend(cval->mixer->chip); + snd_printdd(KERN_ERR "cannot get ctl value: req = %#x, wValue = %#x, wIndex = %#x, type = %d\n", +- request, validx, snd_usb_ctrl_intf(chip) | (cval->id << 8), cval->val_type); +- return -EINVAL; ++ request, validx, idx, cval->val_type); ++ err = -EINVAL; ++ ++ out: ++ up_read(&chip->shutdown_rwsem); ++ snd_usb_autosuspend(cval->mixer->chip); ++ return err; + } + + static int get_ctl_value_v2(struct usb_mixer_elem_info *cval, int request, int validx, int *value_ret) +@@ -313,7 +320,7 @@ static int get_ctl_value_v2(struct usb_mixer_elem_info *cval, int request, int v + struct snd_usb_audio *chip = cval->mixer->chip; + unsigned char buf[2 + 3*sizeof(__u16)]; /* enough space for one range */ + unsigned char *val; +- int ret, size; ++ int idx = 0, ret, size; + __u8 bRequest; + + if (request == UAC_GET_CUR) { +@@ -330,16 +337,22 @@ static int get_ctl_value_v2(struct usb_mixer_elem_info *cval, int request, int v + if (ret) + goto error; + +- ret = snd_usb_ctl_msg(chip->dev, usb_rcvctrlpipe(chip->dev, 0), bRequest, ++ down_read(&chip->shutdown_rwsem); ++ if (chip->shutdown) ++ ret = -ENODEV; ++ else { ++ idx = snd_usb_ctrl_intf(chip) | (cval->id << 8); ++ ret = snd_usb_ctl_msg(chip->dev, usb_rcvctrlpipe(chip->dev, 0), bRequest, + USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_IN, +- validx, snd_usb_ctrl_intf(chip) | (cval->id << 8), +- buf, size); ++ validx, idx, buf, size); ++ } ++ up_read(&chip->shutdown_rwsem); + snd_usb_autosuspend(chip); + + if (ret < 0) { + error: + snd_printk(KERN_ERR "cannot get ctl value: req = %#x, wValue = %#x, wIndex = %#x, type = %d\n", +- request, validx, snd_usb_ctrl_intf(chip) | (cval->id << 8), cval->val_type); ++ request, validx, idx, cval->val_type); + return ret; + } + +@@ -417,7 +430,7 @@ int snd_usb_mixer_set_ctl_value(struct usb_mixer_elem_info *cval, + { + struct snd_usb_audio *chip = cval->mixer->chip; + unsigned char buf[2]; +- int val_len, err, timeout = 10; ++ int idx = 0, val_len, err, timeout = 10; + + if (cval->mixer->protocol == UAC_VERSION_1) { + val_len = cval->val_type >= USB_MIXER_S16 ? 
2 : 1; +@@ -440,19 +453,27 @@ int snd_usb_mixer_set_ctl_value(struct usb_mixer_elem_info *cval, + err = snd_usb_autoresume(chip); + if (err < 0) + return -EIO; +- while (timeout-- > 0) ++ down_read(&chip->shutdown_rwsem); ++ while (timeout-- > 0) { ++ if (chip->shutdown) ++ break; ++ idx = snd_usb_ctrl_intf(chip) | (cval->id << 8); + if (snd_usb_ctl_msg(chip->dev, + usb_sndctrlpipe(chip->dev, 0), request, + USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_OUT, +- validx, snd_usb_ctrl_intf(chip) | (cval->id << 8), +- buf, val_len) >= 0) { +- snd_usb_autosuspend(chip); +- return 0; ++ validx, idx, buf, val_len) >= 0) { ++ err = 0; ++ goto out; + } +- snd_usb_autosuspend(chip); ++ } + snd_printdd(KERN_ERR "cannot set ctl value: req = %#x, wValue = %#x, wIndex = %#x, type = %d, data = %#x/%#x\n", +- request, validx, snd_usb_ctrl_intf(chip) | (cval->id << 8), cval->val_type, buf[0], buf[1]); +- return -EINVAL; ++ request, validx, idx, cval->val_type, buf[0], buf[1]); ++ err = -EINVAL; ++ ++ out: ++ up_read(&chip->shutdown_rwsem); ++ snd_usb_autosuspend(chip); ++ return err; + } + + static int set_cur_ctl_value(struct usb_mixer_elem_info *cval, int validx, int value) +diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c +index ab125ee..38a607a 100644 +--- a/sound/usb/mixer_quirks.c ++++ b/sound/usb/mixer_quirks.c +@@ -186,6 +186,11 @@ static int snd_audigy2nx_led_put(struct snd_kcontrol *kcontrol, struct snd_ctl_e + if (value > 1) + return -EINVAL; + changed = value != mixer->audigy2nx_leds[index]; ++ down_read(&mixer->chip->shutdown_rwsem); ++ if (mixer->chip->shutdown) { ++ err = -ENODEV; ++ goto out; ++ } + if (mixer->chip->usb_id == USB_ID(0x041e, 0x3042)) + err = snd_usb_ctl_msg(mixer->chip->dev, + usb_sndctrlpipe(mixer->chip->dev, 0), 0x24, +@@ -202,6 +207,8 @@ static int snd_audigy2nx_led_put(struct snd_kcontrol *kcontrol, struct snd_ctl_e + usb_sndctrlpipe(mixer->chip->dev, 0), 0x24, + USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_OTHER, + value, index + 2, NULL, 0); ++ out: ++ up_read(&mixer->chip->shutdown_rwsem); + if (err < 0) + return err; + mixer->audigy2nx_leds[index] = value; +@@ -295,11 +302,16 @@ static void snd_audigy2nx_proc_read(struct snd_info_entry *entry, + + for (i = 0; jacks[i].name; ++i) { + snd_iprintf(buffer, "%s: ", jacks[i].name); +- err = snd_usb_ctl_msg(mixer->chip->dev, ++ down_read(&mixer->chip->shutdown_rwsem); ++ if (mixer->chip->shutdown) ++ err = 0; ++ else ++ err = snd_usb_ctl_msg(mixer->chip->dev, + usb_rcvctrlpipe(mixer->chip->dev, 0), + UAC_GET_MEM, USB_DIR_IN | USB_TYPE_CLASS | + USB_RECIP_INTERFACE, 0, + jacks[i].unitid << 8, buf, 3); ++ up_read(&mixer->chip->shutdown_rwsem); + if (err == 3 && (buf[0] == 3 || buf[0] == 6)) + snd_iprintf(buffer, "%02x %02x\n", buf[1], buf[2]); + else +@@ -329,10 +341,15 @@ static int snd_xonar_u1_switch_put(struct snd_kcontrol *kcontrol, + else + new_status = old_status & ~0x02; + changed = new_status != old_status; +- err = snd_usb_ctl_msg(mixer->chip->dev, ++ down_read(&mixer->chip->shutdown_rwsem); ++ if (mixer->chip->shutdown) ++ err = -ENODEV; ++ else ++ err = snd_usb_ctl_msg(mixer->chip->dev, + usb_sndctrlpipe(mixer->chip->dev, 0), 0x08, + USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_OTHER, + 50, 0, &new_status, 1); ++ up_read(&mixer->chip->shutdown_rwsem); + if (err < 0) + return err; + mixer->xonar_u1_status = new_status; +@@ -371,11 +388,17 @@ static int snd_nativeinstruments_control_get(struct snd_kcontrol *kcontrol, + u8 bRequest = (kcontrol->private_value >> 16) & 0xff; + u16 wIndex = 
kcontrol->private_value & 0xffff; + u8 tmp; ++ int ret; + +- int ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), bRequest, ++ down_read(&mixer->chip->shutdown_rwsem); ++ if (mixer->chip->shutdown) ++ ret = -ENODEV; ++ else ++ ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), bRequest, + USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, + 0, cpu_to_le16(wIndex), + &tmp, sizeof(tmp), 1000); ++ up_read(&mixer->chip->shutdown_rwsem); + + if (ret < 0) { + snd_printk(KERN_ERR +@@ -396,11 +419,17 @@ static int snd_nativeinstruments_control_put(struct snd_kcontrol *kcontrol, + u8 bRequest = (kcontrol->private_value >> 16) & 0xff; + u16 wIndex = kcontrol->private_value & 0xffff; + u16 wValue = ucontrol->value.integer.value[0]; ++ int ret; + +- int ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), bRequest, ++ down_read(&mixer->chip->shutdown_rwsem); ++ if (mixer->chip->shutdown) ++ ret = -ENODEV; ++ else ++ ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), bRequest, + USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, + cpu_to_le16(wValue), cpu_to_le16(wIndex), + NULL, 0, 1000); ++ up_read(&mixer->chip->shutdown_rwsem); + + if (ret < 0) { + snd_printk(KERN_ERR +diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c +index 839165f..983e071 100644 +--- a/sound/usb/pcm.c ++++ b/sound/usb/pcm.c +@@ -67,6 +67,8 @@ static snd_pcm_uframes_t snd_usb_pcm_pointer(struct snd_pcm_substream *substream + unsigned int hwptr_done; + + subs = (struct snd_usb_substream *)substream->runtime->private_data; ++ if (subs->stream->chip->shutdown) ++ return SNDRV_PCM_POS_XRUN; + spin_lock(&subs->lock); + hwptr_done = subs->hwptr_done; + substream->runtime->delay = snd_usb_pcm_delay(subs, +@@ -373,8 +375,14 @@ static int snd_usb_hw_params(struct snd_pcm_substream *substream, + changed = subs->cur_audiofmt != fmt || + subs->period_bytes != params_period_bytes(hw_params) || + subs->cur_rate != rate; ++ ++ down_read(&subs->stream->chip->shutdown_rwsem); ++ if (subs->stream->chip->shutdown) { ++ ret = -ENODEV; ++ goto unlock; ++ } + if ((ret = set_format(subs, fmt)) < 0) +- return ret; ++ goto unlock; + + if (subs->cur_rate != rate) { + struct usb_host_interface *alts; +@@ -383,12 +391,11 @@ static int snd_usb_hw_params(struct snd_pcm_substream *substream, + alts = &iface->altsetting[fmt->altset_idx]; + ret = snd_usb_init_sample_rate(subs->stream->chip, subs->interface, alts, fmt, rate); + if (ret < 0) +- return ret; ++ goto unlock; + subs->cur_rate = rate; + } + + if (changed) { +- mutex_lock(&subs->stream->chip->shutdown_mutex); + /* format changed */ + snd_usb_release_substream_urbs(subs, 0); + /* influenced: period_bytes, channels, rate, format, */ +@@ -396,9 +403,10 @@ static int snd_usb_hw_params(struct snd_pcm_substream *substream, + params_rate(hw_params), + snd_pcm_format_physical_width(params_format(hw_params)) * + params_channels(hw_params)); +- mutex_unlock(&subs->stream->chip->shutdown_mutex); + } + ++unlock: ++ up_read(&subs->stream->chip->shutdown_rwsem); + return ret; + } + +@@ -414,9 +422,9 @@ static int snd_usb_hw_free(struct snd_pcm_substream *substream) + subs->cur_audiofmt = NULL; + subs->cur_rate = 0; + subs->period_bytes = 0; +- mutex_lock(&subs->stream->chip->shutdown_mutex); ++ down_read(&subs->stream->chip->shutdown_rwsem); + snd_usb_release_substream_urbs(subs, 0); +- mutex_unlock(&subs->stream->chip->shutdown_mutex); ++ up_read(&subs->stream->chip->shutdown_rwsem); + return snd_pcm_lib_free_vmalloc_buffer(substream); + } + +@@ -429,12 +437,18 @@ static int snd_usb_pcm_prepare(struct snd_pcm_substream 
*substream) + { + struct snd_pcm_runtime *runtime = substream->runtime; + struct snd_usb_substream *subs = runtime->private_data; ++ int ret = 0; + + if (! subs->cur_audiofmt) { + snd_printk(KERN_ERR "usbaudio: no format is specified!\n"); + return -ENXIO; + } + ++ down_read(&subs->stream->chip->shutdown_rwsem); ++ if (subs->stream->chip->shutdown) { ++ ret = -ENODEV; ++ goto unlock; ++ } + /* some unit conversions in runtime */ + subs->maxframesize = bytes_to_frames(runtime, subs->maxpacksize); + subs->curframesize = bytes_to_frames(runtime, subs->curpacksize); +@@ -447,7 +461,10 @@ static int snd_usb_pcm_prepare(struct snd_pcm_substream *substream) + subs->last_frame_number = 0; + runtime->delay = 0; + +- return snd_usb_substream_prepare(subs, runtime); ++ ret = snd_usb_substream_prepare(subs, runtime); ++ unlock: ++ up_read(&subs->stream->chip->shutdown_rwsem); ++ return ret; + } + + static struct snd_pcm_hardware snd_usb_hardware = +@@ -500,7 +517,7 @@ static int hw_check_valid_format(struct snd_usb_substream *subs, + return 0; + } + /* check whether the period time is >= the data packet interval */ +- if (snd_usb_get_speed(subs->dev) != USB_SPEED_FULL) { ++ if (subs->speed != USB_SPEED_FULL) { + ptime = 125 * (1 << fp->datainterval); + if (ptime > pt->max || (ptime == pt->max && pt->openmax)) { + hwc_debug(" > check: ptime %u > max %u\n", ptime, pt->max); +@@ -776,7 +793,7 @@ static int setup_hw_info(struct snd_pcm_runtime *runtime, struct snd_usb_substre + return err; + + param_period_time_if_needed = SNDRV_PCM_HW_PARAM_PERIOD_TIME; +- if (snd_usb_get_speed(subs->dev) == USB_SPEED_FULL) ++ if (subs->speed == USB_SPEED_FULL) + /* full speed devices have fixed data packet interval */ + ptmin = 1000; + if (ptmin == 1000) +diff --git a/sound/usb/proc.c b/sound/usb/proc.c +index 961c9a2..aef03db 100644 +--- a/sound/usb/proc.c ++++ b/sound/usb/proc.c +@@ -107,7 +107,7 @@ static void proc_dump_substream_formats(struct snd_usb_substream *subs, struct s + } + snd_iprintf(buffer, "\n"); + } +- if (snd_usb_get_speed(subs->dev) != USB_SPEED_FULL) ++ if (subs->speed != USB_SPEED_FULL) + snd_iprintf(buffer, " Data packet interval: %d us\n", + 125 * (1 << fp->datainterval)); + // snd_iprintf(buffer, " Max Packet Size = %d\n", fp->maxpacksize); +@@ -128,7 +128,7 @@ static void proc_dump_substream_status(struct snd_usb_substream *subs, struct sn + snd_iprintf(buffer, "]\n"); + snd_iprintf(buffer, " Packet Size = %d\n", subs->curpacksize); + snd_iprintf(buffer, " Momentary freq = %u Hz (%#x.%04x)\n", +- snd_usb_get_speed(subs->dev) == USB_SPEED_FULL ++ subs->speed == USB_SPEED_FULL + ? 
get_full_speed_hz(subs->freqm) + : get_high_speed_hz(subs->freqm), + subs->freqm >> 16, subs->freqm & 0xffff); +diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h +index 3e2b035..6c805a5 100644 +--- a/sound/usb/usbaudio.h ++++ b/sound/usb/usbaudio.h +@@ -36,7 +36,7 @@ struct snd_usb_audio { + struct snd_card *card; + struct usb_interface *pm_intf; + u32 usb_id; +- struct mutex shutdown_mutex; ++ struct rw_semaphore shutdown_rwsem; + unsigned int shutdown:1; + unsigned int probing:1; + unsigned int autosuspended:1; diff --git a/3.2.34/cloneconfig.patch b/3.2.34/cloneconfig.patch new file mode 100644 index 0000000..4bfb615 --- /dev/null +++ b/3.2.34/cloneconfig.patch @@ -0,0 +1,41 @@ +From: Andreas Gruenbacher +Subject: Add ``cloneconfig'' target +Patch-mainline: Submitted 24 Feb 2011 + +Cloneconfig takes the first configuration it finds which appears +to belong to the running kernel, and configures the kernel sources +to match this configuration as closely as possible. + +Signed-off-by: Andreas Gruenbacher +Signed-off-by: Jeff Mahoney +--- + + scripts/kconfig/Makefile | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +--- a/scripts/kconfig/Makefile ++++ b/scripts/kconfig/Makefile +@@ -99,6 +99,23 @@ PHONY += allnoconfig allyesconfig allmod + + allnoconfig allyesconfig allmodconfig alldefconfig randconfig: $(obj)/conf + $< --$@ $(Kconfig) ++ ++UNAME_RELEASE := $(shell uname -r) ++CLONECONFIG := $(firstword $(wildcard /proc/config.gz \ ++ /lib/modules/$(UNAME_RELEASE)/.config \ ++ /etc/kernel-config \ ++ /boot/config-$(UNAME_RELEASE))) ++cloneconfig: $(obj)/conf ++ $(Q)case "$(CLONECONFIG)" in \ ++ '') echo -e "The configuration of the running" \ ++ "kernel could not be determined\n"; \ ++ false ;; \ ++ *.gz) gzip -cd $(CLONECONFIG) > .config.running ;; \ ++ *) cat $(CLONECONFIG) > .config.running ;; \ ++ esac && \ ++ echo -e "Cloning configuration file $(CLONECONFIG)\n" ++ $(Q)$< --defconfig=.config.running arch/$(SRCARCH)/Kconfig ++ + + PHONY += listnewconfig oldnoconfig savedefconfig defconfig + diff --git a/3.2.34/colored-printk-3.2.33.patch b/3.2.34/colored-printk-3.2.33.patch new file mode 100644 index 0000000..574f706 --- /dev/null +++ b/3.2.34/colored-printk-3.2.33.patch @@ -0,0 +1,337 @@ +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/arch/x86/kernel/early_printk.c linux-2.6.29.3-cprintk/arch/x86/kernel/early_printk.c +--- a/arch/x86/kernel/early_printk.c 2009-03-24 00:12:14.000000000 +0100 ++++ b/arch/x86/kernel/early_printk.c 2009-05-09 16:10:36.000000000 +0200 +@@ -23,7 +23,8 @@ + static int max_ypos = 25, max_xpos = 80; + static int current_ypos = 25, current_xpos; + +-static void early_vga_write(struct console *con, const char *str, unsigned n) ++static void early_vga_write(struct console *con, const char *str, unsigned n, ++ unsigned int loglevel) + { + char c; + int i, k, j; +@@ -93,7 +94,8 @@ static int early_serial_putc(unsigned ch + return timeout ? 
0 : -1; + } + +-static void early_serial_write(struct console *con, const char *s, unsigned n) ++static void early_serial_write(struct console *con, const char *s, unsigned n, ++ unsigned int loglevel) + { + while (*s && n-- > 0) { + if (*s == '\n') +@@ -887,7 +889,7 @@ asmlinkage void early_printk(const char + + va_start(ap, fmt); + n = vscnprintf(buf, sizeof(buf), fmt, ap); +- early_console->write(early_console, buf, n); ++ early_console->write(early_console, buf, n, 0); + va_end(ap); + } + +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/drivers/char/Kconfig linux-2.6.29.3-cprintk/drivers/tty/Kconfig +--- a/drivers/char/Kconfig 2009-03-24 00:12:14.000000000 +0100 ++++ b/drivers/tty/Kconfig 2009-05-09 14:43:48.000000000 +0200 +@@ -66,6 +66,111 @@ config VT_CONSOLE + + If unsure, say Y. + ++menuconfig VT_CKO ++ bool "Colored kernel message output" ++ depends on VT_CONSOLE ++ ---help--- ++ This option enables kernel messages to be emitted in ++ colors other than the default. ++ ++ The color value you need to enter is composed (OR-ed) ++ of a foreground and a background color. ++ ++ Foreground: ++ 0x00 = black, 0x08 = dark gray, ++ 0x01 = red, 0x09 = light red, ++ 0x02 = green, 0x0A = light green, ++ 0x03 = brown, 0x0B = yellow, ++ 0x04 = blue, 0x0C = light blue, ++ 0x05 = magenta, 0x0D = light magenta, ++ 0x06 = cyan, 0x0E = light cyan, ++ 0x07 = gray, 0x0F = white, ++ ++ (Foreground colors 0x08 to 0x0F do not work when a VGA ++ console font with 512 glyphs is used.) ++ ++ Background: ++ 0x00 = black, 0x40 = blue, ++ 0x10 = red, 0x50 = magenta, ++ 0x20 = green, 0x60 = cyan, ++ 0x30 = brown, 0x70 = gray, ++ ++ For example, 0x1F would yield white on red. ++ ++ If unsure, say N. ++ ++config VT_PRINTK_EMERG_COLOR ++ hex "Emergency messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel emergency messages will ++ be printed to the console. ++ ++config VT_PRINTK_ALERT_COLOR ++ hex "Alert messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel alert messages will ++ be printed to the console. ++ ++config VT_PRINTK_CRIT_COLOR ++ hex "Critical messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel critical messages will ++ be printed to the console. ++ ++config VT_PRINTK_ERR_COLOR ++ hex "Error messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel error messages will ++ be printed to the console. ++ ++config VT_PRINTK_WARNING_COLOR ++ hex "Warning messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel warning messages will ++ be printed to the console. ++ ++config VT_PRINTK_NOTICE_COLOR ++ hex "Notice messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel notice messages will ++ be printed to the console. ++ ++config VT_PRINTK_INFO_COLOR ++ hex "Information messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel information messages will ++ be printed to the console. 
++ ++config VT_PRINTK_DEBUG_COLOR ++ hex "Debug messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel debug messages will ++ be printed to the console. ++ + config HW_CONSOLE + bool + depends on VT && !S390 && !UML +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/drivers/char/vt.c linux-2.6.29.3-cprintk/drivers/tty/vt/vt.c +--- a/drivers/char/vt.c 2009-05-09 10:46:57.000000000 +0200 ++++ b/drivers/tty/vt/vt.c 2009-05-09 14:43:48.000000000 +0200 +@@ -73,6 +73,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -2431,17 +2432,45 @@ struct tty_driver *console_driver; + + #ifdef CONFIG_VT_CONSOLE + ++#ifdef CONFIG_VT_CKO ++static unsigned int printk_color[8] __read_mostly = { ++ CONFIG_VT_PRINTK_EMERG_COLOR, /* KERN_EMERG */ ++ CONFIG_VT_PRINTK_ALERT_COLOR, /* KERN_ALERT */ ++ CONFIG_VT_PRINTK_CRIT_COLOR, /* KERN_CRIT */ ++ CONFIG_VT_PRINTK_ERR_COLOR, /* KERN_ERR */ ++ CONFIG_VT_PRINTK_WARNING_COLOR, /* KERN_WARNING */ ++ CONFIG_VT_PRINTK_NOTICE_COLOR, /* KERN_NOTICE */ ++ CONFIG_VT_PRINTK_INFO_COLOR, /* KERN_INFO */ ++ CONFIG_VT_PRINTK_DEBUG_COLOR, /* KERN_DEBUG */ ++}; ++module_param_array(printk_color, uint, NULL, S_IRUGO | S_IWUSR); ++ ++static inline void vc_set_color(struct vc_data *vc, unsigned char color) ++{ ++ vc->vc_color = color_table[color & 0xF] | ++ (color_table[(color >> 4) & 0x7] << 4) | ++ (color & 0x80); ++ update_attr(vc); ++} ++#else ++static unsigned int printk_color[8]; ++static inline void vc_set_color(const struct vc_data *vc, unsigned char c) ++{ ++} ++#endif ++ + /* + * Console on virtual terminal + * + * The console must be locked when we get here. + */ + +-static void vt_console_print(struct console *co, const char *b, unsigned count) ++static void vt_console_print(struct console *co, const char *b, unsigned count, ++ unsigned int loglevel) + { + struct vc_data *vc = vc_cons[fg_console].d; +- unsigned char c; + static DEFINE_SPINLOCK(printing_lock); ++ unsigned char current_color, c; + const ushort *start; + ushort cnt = 0; + ushort myx; +@@ -2474,11 +2503,19 @@ static void vt_console_print(struct cons + + start = (ushort *)vc->vc_pos; + ++ /* ++ * We always get a valid loglevel - <8> and "no level" is transformed ++ * to <4> in the typical kernel. 
++ */ ++ current_color = printk_color[loglevel]; ++ vc_set_color(vc, current_color); ++ + /* Contrived structure to try to emulate original need_wrap behaviour + * Problems caused when we have need_wrap set on '\n' character */ + while (count--) { + c = *b++; + if (c == 10 || c == 13 || c == 8 || vc->vc_need_wrap) { ++ vc_set_color(vc, vc->vc_def_color); + if (cnt > 0) { + if (CON_IS_VISIBLE(vc)) + vc->vc_sw->con_putcs(vc, start, cnt, vc->vc_y, vc->vc_x); +@@ -2491,6 +2528,7 @@ static void vt_console_print(struct cons + bs(vc); + start = (ushort *)vc->vc_pos; + myx = vc->vc_x; ++ vc_set_color(vc, current_color); + continue; + } + if (c != 13) +@@ -2498,6 +2536,7 @@ static void vt_console_print(struct cons + cr(vc); + start = (ushort *)vc->vc_pos; + myx = vc->vc_x; ++ vc_set_color(vc, current_color); + if (c == 10 || c == 13) + continue; + } +@@ -2520,6 +2559,7 @@ static void vt_console_print(struct cons + vc->vc_need_wrap = 1; + } + } ++ vc_set_color(vc, vc->vc_def_color); + set_cursor(vc); + notify_update(vc); + +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/drivers/net/netconsole.c linux-2.6.29.3-cprintk/drivers/net/netconsole.c +--- a/drivers/net/netconsole.c 2009-03-24 00:12:14.000000000 +0100 ++++ b/drivers/net/netconsole.c 2009-05-09 14:43:48.000000000 +0200 +@@ -691,7 +691,8 @@ static struct notifier_block netconsole_ + .notifier_call = netconsole_netdev_event, + }; + +-static void write_msg(struct console *con, const char *msg, unsigned int len) ++static void write_msg(struct console *con, const char *msg, unsigned int len, ++ unsigned int loglevel) + { + int frag, left; + unsigned long flags; +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/drivers/serial/8250.c linux-2.6.29.3-cprintk/drivers/tty/serial/8250.c +--- a/drivers/serial/8250.c 2009-03-24 00:12:14.000000000 +0100 ++++ b/drivers/tty/serial/8250.c 2009-05-09 14:43:48.000000000 +0200 +@@ -2698,7 +2698,8 @@ static void serial8250_console_putchar(s + * The console_lock must be held when we get here. 
+ */ + static void +-serial8250_console_write(struct console *co, const char *s, unsigned int count) ++serial8250_console_write(struct console *co, const char *s, unsigned int count, ++ unsigned int loglevel) + { + struct uart_8250_port *up = &serial8250_ports[co->index]; + unsigned long flags; +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/drivers/serial/8250_early.c linux-2.6.29.3-cprintk/drivers/tty/serial/8250_early.c +--- a/drivers/serial/8250_early.c 2009-03-24 00:12:14.000000000 +0100 ++++ b/drivers/tty/serial/8250_early.c 2009-05-09 14:43:48.000000000 +0200 +@@ -83,7 +83,7 @@ static void __init serial_putc(struct ua + } + + static void __init early_serial8250_write(struct console *console, +- const char *s, unsigned int count) ++ const char *s, unsigned int count, unsigned int loglevel) + { + struct uart_port *port = &early_device.port; + unsigned int ier; +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/include/linux/console.h linux-2.6.29.3-cprintk/include/linux/console.h +--- a/include/linux/console.h 2009-03-24 00:12:14.000000000 +0100 ++++ b/include/linux/console.h 2009-05-09 14:43:48.000000000 +0200 +@@ -95,7 +95,7 @@ void give_up_console(const struct consw + + struct console { + char name[16]; +- void (*write)(struct console *, const char *, unsigned); ++ void (*write)(struct console *, const char *, unsigned, unsigned int); + int (*read)(struct console *, char *, unsigned); + struct tty_driver *(*device)(struct console *, int *); + void (*unblank)(void); +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/kernel/printk.c linux-2.6.29.3-cprintk/kernel/printk.c +--- a/kernel/printk.c 2009-03-24 00:12:14.000000000 +0100 ++++ b/kernel/printk.c 2009-05-09 14:43:48.000000000 +0200 +@@ -389,7 +389,8 @@ SYSCALL_DEFINE3(syslog, int, type, char + /* + * Call the console drivers on a range of log_buf + */ +-static void __call_console_drivers(unsigned start, unsigned end) ++static void __call_console_drivers(unsigned start, unsigned end, ++ unsigned int loglevel) + { + struct console *con; + +@@ -397,7 +398,7 @@ static void __call_console_drivers(unsig + if ((con->flags & CON_ENABLED) && con->write && + (cpu_online(smp_processor_id()) || + (con->flags & CON_ANYTIME))) +- con->write(con, &LOG_BUF(start), end - start); ++ con->write(con, &LOG_BUF(start), end - start, loglevel); + } + } + +@@ -424,10 +425,11 @@ static void _call_console_drivers(unsign + if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { + /* wrapped write */ + __call_console_drivers(start & LOG_BUF_MASK, +- log_buf_len); +- __call_console_drivers(0, end & LOG_BUF_MASK); ++ log_buf_len, msg_log_level); ++ __call_console_drivers(0, end & LOG_BUF_MASK, ++ msg_log_level); + } else { +- __call_console_drivers(start, end); ++ __call_console_drivers(start, end, msg_log_level); + } + } + } diff --git a/3.2.34/hz-432-kconfig-option.patch b/3.2.34/hz-432-kconfig-option.patch new file mode 100644 index 0000000..2fe9a4f --- /dev/null +++ b/3.2.34/hz-432-kconfig-option.patch @@ -0,0 +1,25 @@ +diff -urN oldtree/kernel/Kconfig.hz newtree/kernel/Kconfig.hz +--- oldtree/kernel/Kconfig.hz 2007-03-06 15:00:55.000000000 -0500 ++++ newtree/kernel/Kconfig.hz 2007-03-06 17:52:36.000000000 -0500 +@@ -39,6 +39,14 @@ + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_432 ++ bool "432 HZ" ++ help ++ 432 HZ is the best value for desktop systems. Most responsive ++ out of all the options. This is for Dual Core/Processor systems only. 
++ as timer frequencies * number of processors = actual frequency. ++ Try this if you have a dual-core/dual processor system. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,5 +60,6 @@ + default 100 if HZ_100 + default 250 if HZ_250_NODEFAULT + default 300 if HZ_300 ++ default 432 if HZ_432 + default 1000 if HZ_1000 + diff --git a/3.2.34/hz-864-kconfig-option.patch b/3.2.34/hz-864-kconfig-option.patch new file mode 100644 index 0000000..6bdca04 --- /dev/null +++ b/3.2.34/hz-864-kconfig-option.patch @@ -0,0 +1,25 @@ +diff -urN oldtree/kernel/Kconfig.hz newtree/kernel/Kconfig.hz +--- oldtree/kernel/Kconfig.hz 2007-03-06 15:00:55.000000000 -0500 ++++ newtree/kernel/Kconfig.hz 2007-03-06 17:52:36.000000000 -0500 +@@ -39,6 +39,14 @@ + as timer frequencies * number of processors = actual frequency. + Try this if you have a dual-core/dual processor system. + ++ config HZ_864 ++ bool "864 HZ" ++ help ++ 864 HZ is the best value for desktop systems. Most responsive ++ out of all the options. The only reason it is not default is ++ because it may break few drivers. Give it a try if you have ++ a desktop :). ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,5 +60,6 @@ + default 250 if HZ_250_NODEFAULT + default 300 if HZ_300 + default 432 if HZ_432 ++ default 864 if HZ_864 + default 1000 if HZ_1000 + diff --git a/3.2.34/imqmq-3.2.patch b/3.2.34/imqmq-3.2.patch new file mode 100644 index 0000000..678869b --- /dev/null +++ b/3.2.34/imqmq-3.2.patch @@ -0,0 +1,1603 @@ +diff -uNr linux-3.2.0-go.orig//drivers/net/imq.c linux-3.2.0-go/drivers/net/imq.c +--- linux-3.2.0-go.orig//drivers/net/imq.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-go/drivers/net/imq.c 2012-01-16 18:54:18.592086804 +0100 +@@ -0,0 +1,850 @@ ++/* ++ * Pseudo-driver for the intermediate queue device. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ * ++ * Authors: Patrick McHardy, ++ * ++ * The first version was written by Martin Devera, ++ * ++ * Credits: Jan Rafaj ++ * - Update patch to 2.4.21 ++ * Sebastian Strollo ++ * - Fix "Dead-loop on netdevice imq"-issue ++ * Marcel Sebek ++ * - Update to 2.6.2-rc1 ++ * ++ * After some time of inactivity there is a group taking care ++ * of IMQ again: http://www.linuximq.net ++ * ++ * ++ * 2004/06/30 - New version of IMQ patch to kernels <=2.6.7 ++ * including the following changes: ++ * ++ * - Correction of ipv6 support "+"s issue (Hasso Tepper) ++ * - Correction of imq_init_devs() issue that resulted in ++ * kernel OOPS unloading IMQ as module (Norbert Buchmuller) ++ * - Addition of functionality to choose number of IMQ devices ++ * during kernel config (Andre Correa) ++ * - Addition of functionality to choose how IMQ hooks on ++ * PRE and POSTROUTING (after or before NAT) (Andre Correa) ++ * - Cosmetic corrections (Norbert Buchmuller) (Andre Correa) ++ * ++ * ++ * 2005/12/16 - IMQ versions between 2.6.7 and 2.6.13 were ++ * released with almost no problems. 2.6.14-x was released ++ * with some important changes: nfcache was removed; After ++ * some weeks of trouble we figured out that some IMQ fields ++ * in skb were missing in skbuff.c - skb_clone and copy_skb_header. ++ * These functions are correctly patched by this new patch version. 
++ * ++ * Thanks for all who helped to figure out all the problems with ++ * 2.6.14.x: Patrick McHardy, Rune Kock, VeNoMouS, Max CtRiX, ++ * Kevin Shanahan, Richard Lucassen, Valery Dachev (hopefully ++ * I didn't forget anybody). I apologize again for my lack of time. ++ * ++ * ++ * 2008/06/17 - 2.6.25 - Changed imq.c to use qdisc_run() instead ++ * of qdisc_restart() and moved qdisc_run() to tasklet to avoid ++ * recursive locking. New initialization routines to fix 'rmmod' not ++ * working anymore. Used code from ifb.c. (Jussi Kivilinna) ++ * ++ * 2008/08/06 - 2.6.26 - (JK) ++ * - Replaced tasklet with 'netif_schedule()'. ++ * - Cleaned up and added comments for imq_nf_queue(). ++ * ++ * 2009/04/12 ++ * - Add skb_save_cb/skb_restore_cb helper functions for backuping ++ * control buffer. This is needed because qdisc-layer on kernels ++ * 2.6.27 and newer overwrite control buffer. (Jussi Kivilinna) ++ * - Add better locking for IMQ device. Hopefully this will solve ++ * SMP issues. (Jussi Kivilinna) ++ * - Port to 2.6.27 ++ * - Port to 2.6.28 ++ * - Port to 2.6.29 + fix rmmod not working ++ * ++ * 2009/04/20 - (Jussi Kivilinna) ++ * - Use netdevice feature flags to avoid extra packet handling ++ * by core networking layer and possibly increase performance. ++ * ++ * 2009/09/26 - (Jussi Kivilinna) ++ * - Add imq_nf_reinject_lockless to fix deadlock with ++ * imq_nf_queue/imq_nf_reinject. ++ * ++ * 2009/12/08 - (Jussi Kivilinna) ++ * - Port to 2.6.32 ++ * - Add check for skb->nf_queue_entry==NULL in imq_dev_xmit() ++ * - Also add better error checking for skb->nf_queue_entry usage ++ * ++ * 2010/02/25 - (Jussi Kivilinna) ++ * - Port to 2.6.33 ++ * ++ * 2010/08/15 - (Jussi Kivilinna) ++ * - Port to 2.6.35 ++ * - Simplify hook registration by using nf_register_hooks. ++ * - nf_reinject doesn't need spinlock around it, therefore remove ++ * imq_nf_reinject function. Other nf_reinject users protect ++ * their own data with spinlock. With IMQ however all data is ++ * needed is stored per skbuff, so no locking is needed. ++ * - Changed IMQ to use 'separate' NF_IMQ_QUEUE instead of ++ * NF_QUEUE, this allows working coexistance of IMQ and other ++ * NF_QUEUE users. ++ * - Make IMQ multi-queue. Number of IMQ device queues can be ++ * increased with 'numqueues' module parameters. Default number ++ * of queues is 1, in other words by default IMQ works as ++ * single-queue device. Multi-queue selection is based on ++ * IFB multi-queue patch by Changli Gao . ++ * ++ * 2011/03/18 - (Jussi Kivilinna) ++ * - Port to 2.6.38 ++ * ++ * 2011/07/12 - (syoder89@gmail.com) ++ * - Crash fix that happens when the receiving interface has more ++ * than one queue (add missing skb_set_queue_mapping in ++ * imq_select_queue). ++ * ++ * 2011/07/26 - (Jussi Kivilinna) ++ * - Add queue mapping checks for packets exiting IMQ. ++ * - Port to 3.0 ++ * ++ * 2011/08/16 - (Jussi Kivilinna) ++ * - Clear IFF_TX_SKB_SHARING flag that was added for linux 3.0.2 ++ * ++ * 2011/11/03 - Germano Michel ++ * - Fix IMQ for net namespaces ++ * ++ * 2011/11/04 - Jussi Kivilinna ++ * - Port to 3.1 ++ * - Clean-up, move 'get imq device pointer by imqX name' to ++ * separate function from imq_nf_queue(). ++ * ++ * Also, many thanks to pablo Sebastian Greco for making the initial ++ * patch and to those who helped the testing. 
++ * ++ * More info at: http://www.linuximq.net/ (Andre Correa) ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ #include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num); ++ ++static nf_hookfn imq_nf_hook; ++ ++static struct nf_hook_ops imq_ops[] = { ++ { ++ /* imq_ingress_ipv4 */ ++ .hook = imq_nf_hook, ++ .owner = THIS_MODULE, ++ .pf = PF_INET, ++ .hooknum = NF_INET_PRE_ROUTING, ++#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) ++ .priority = NF_IP_PRI_MANGLE + 1, ++#else ++ .priority = NF_IP_PRI_NAT_DST + 1, ++#endif ++ }, ++ { ++ /* imq_egress_ipv4 */ ++ .hook = imq_nf_hook, ++ .owner = THIS_MODULE, ++ .pf = PF_INET, ++ .hooknum = NF_INET_POST_ROUTING, ++#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA) ++ .priority = NF_IP_PRI_LAST, ++#else ++ .priority = NF_IP_PRI_NAT_SRC - 1, ++#endif ++ }, ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ { ++ /* imq_ingress_ipv6 */ ++ .hook = imq_nf_hook, ++ .owner = THIS_MODULE, ++ .pf = PF_INET6, ++ .hooknum = NF_INET_PRE_ROUTING, ++#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) ++ .priority = NF_IP6_PRI_MANGLE + 1, ++#else ++ .priority = NF_IP6_PRI_NAT_DST + 1, ++#endif ++ }, ++ { ++ /* imq_egress_ipv6 */ ++ .hook = imq_nf_hook, ++ .owner = THIS_MODULE, ++ .pf = PF_INET6, ++ .hooknum = NF_INET_POST_ROUTING, ++#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA) ++ .priority = NF_IP6_PRI_LAST, ++#else ++ .priority = NF_IP6_PRI_NAT_SRC - 1, ++#endif ++ }, ++#endif ++}; ++ ++#if defined(CONFIG_IMQ_NUM_DEVS) ++static int numdevs = CONFIG_IMQ_NUM_DEVS; ++#else ++static int numdevs = IMQ_MAX_DEVS; ++#endif ++ ++static struct net_device *imq_devs_cache[IMQ_MAX_DEVS]; ++ ++#define IMQ_MAX_QUEUES 32 ++static int numqueues = 1; ++static u32 imq_hashrnd; ++ ++static inline __be16 pppoe_proto(const struct sk_buff *skb) ++{ ++ return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + ++ sizeof(struct pppoe_hdr))); ++} ++ ++static u16 imq_hash(struct net_device *dev, struct sk_buff *skb) ++{ ++ unsigned int pull_len; ++ u16 protocol = skb->protocol; ++ u32 addr1, addr2; ++ u32 hash, ihl = 0; ++ union { ++ u16 in16[2]; ++ u32 in32; ++ } ports; ++ u8 ip_proto; ++ ++ pull_len = 0; ++ ++recheck: ++ switch (protocol) { ++ case htons(ETH_P_8021Q): { ++ if (unlikely(skb_pull(skb, VLAN_HLEN) == NULL)) ++ goto other; ++ ++ pull_len += VLAN_HLEN; ++ skb->network_header += VLAN_HLEN; ++ ++ protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; ++ goto recheck; ++ } ++ ++ case htons(ETH_P_PPP_SES): { ++ if (unlikely(skb_pull(skb, PPPOE_SES_HLEN) == NULL)) ++ goto other; ++ ++ pull_len += PPPOE_SES_HLEN; ++ skb->network_header += PPPOE_SES_HLEN; ++ ++ protocol = pppoe_proto(skb); ++ goto recheck; ++ } ++ ++ case htons(ETH_P_IP): { ++ const struct iphdr *iph = ip_hdr(skb); ++ ++ if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr)))) ++ goto other; ++ ++ addr1 = iph->daddr; ++ addr2 = iph->saddr; ++ ++ ip_proto = !(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) ? 
++ iph->protocol : 0; ++ ihl = ip_hdrlen(skb); ++ ++ break; ++ } ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ case htons(ETH_P_IPV6): { ++ const struct ipv6hdr *iph = ipv6_hdr(skb); ++ ++ if (unlikely(!pskb_may_pull(skb, sizeof(struct ipv6hdr)))) ++ goto other; ++ ++ addr1 = iph->daddr.s6_addr32[3]; ++ addr2 = iph->saddr.s6_addr32[3]; ++ ihl = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &ip_proto); ++ if (unlikely(ihl < 0)) ++ goto other; ++ ++ break; ++ } ++#endif ++ default: ++other: ++ if (pull_len != 0) { ++ skb_push(skb, pull_len); ++ skb->network_header -= pull_len; ++ } ++ ++ return (u16)(ntohs(protocol) % dev->real_num_tx_queues); ++ } ++ ++ if (addr1 > addr2) ++ swap(addr1, addr2); ++ ++ switch (ip_proto) { ++ case IPPROTO_TCP: ++ case IPPROTO_UDP: ++ case IPPROTO_DCCP: ++ case IPPROTO_ESP: ++ case IPPROTO_AH: ++ case IPPROTO_SCTP: ++ case IPPROTO_UDPLITE: { ++ if (likely(skb_copy_bits(skb, ihl, &ports.in32, 4) >= 0)) { ++ if (ports.in16[0] > ports.in16[1]) ++ swap(ports.in16[0], ports.in16[1]); ++ break; ++ } ++ /* fall-through */ ++ } ++ default: ++ ports.in32 = 0; ++ break; ++ } ++ ++ if (pull_len != 0) { ++ skb_push(skb, pull_len); ++ skb->network_header -= pull_len; ++ } ++ ++ hash = jhash_3words(addr1, addr2, ports.in32, imq_hashrnd ^ ip_proto); ++ ++ return (u16)(((u64)hash * dev->real_num_tx_queues) >> 32); ++} ++ ++static inline bool sk_tx_queue_recorded(struct sock *sk) ++{ ++ return (sk_tx_queue_get(sk) >= 0); ++} ++ ++static struct netdev_queue *imq_select_queue(struct net_device *dev, ++ struct sk_buff *skb) ++{ ++ u16 queue_index = 0; ++ u32 hash; ++ ++ if (likely(dev->real_num_tx_queues == 1)) ++ goto out; ++ ++ /* IMQ can be receiving ingress or engress packets. */ ++ ++ /* Check first for if rx_queue is set */ ++ if (skb_rx_queue_recorded(skb)) { ++ queue_index = skb_get_rx_queue(skb); ++ goto out; ++ } ++ ++ /* Check if socket has tx_queue set */ ++ if (sk_tx_queue_recorded(skb->sk)) { ++ queue_index = sk_tx_queue_get(skb->sk); ++ goto out; ++ } ++ ++ /* Try use socket hash */ ++ if (skb->sk && skb->sk->sk_hash) { ++ hash = skb->sk->sk_hash; ++ queue_index = ++ (u16)(((u64)hash * dev->real_num_tx_queues) >> 32); ++ goto out; ++ } ++ ++ /* Generate hash from packet data */ ++ queue_index = imq_hash(dev, skb); ++ ++out: ++ if (unlikely(queue_index >= dev->real_num_tx_queues)) ++ queue_index = (u16)((u32)queue_index % dev->real_num_tx_queues); ++ ++ skb_set_queue_mapping(skb, queue_index); ++ return netdev_get_tx_queue(dev, queue_index); ++} ++ ++static struct net_device_stats *imq_get_stats(struct net_device *dev) ++{ ++ return &dev->stats; ++} ++ ++/* called for packets kfree'd in qdiscs at places other than enqueue */ ++static void imq_skb_destructor(struct sk_buff *skb) ++{ ++ struct nf_queue_entry *entry = skb->nf_queue_entry; ++ ++ skb->nf_queue_entry = NULL; ++ ++ if (entry) { ++ nf_queue_entry_release_refs(entry); ++ kfree(entry); ++ } ++ ++ skb_restore_cb(skb); /* kfree backup */ ++} ++ ++static void imq_done_check_queue_mapping(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ unsigned int queue_index; ++ ++ /* Don't let queue_mapping be left too large after exiting IMQ */ ++ if (likely(skb->dev != dev && skb->dev != NULL)) { ++ queue_index = skb_get_queue_mapping(skb); ++ if (unlikely(queue_index >= skb->dev->real_num_tx_queues)) { ++ queue_index = (u16)((u32)queue_index % ++ skb->dev->real_num_tx_queues); ++ skb_set_queue_mapping(skb, queue_index); ++ } ++ } else { ++ /* skb->dev was IMQ device itself or NULL, be on safe side and 
++ * just clear queue mapping. ++ */ ++ skb_set_queue_mapping(skb, 0); ++ } ++} ++ ++static netdev_tx_t imq_dev_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct nf_queue_entry *entry = skb->nf_queue_entry; ++ ++ skb->nf_queue_entry = NULL; ++ dev->trans_start = jiffies; ++ ++ dev->stats.tx_bytes += skb->len; ++ dev->stats.tx_packets++; ++ ++ if (unlikely(entry == NULL)) { ++ /* We don't know what is going on here.. packet is queued for ++ * imq device, but (probably) not by us. ++ * ++ * If this packet was not send here by imq_nf_queue(), then ++ * skb_save_cb() was not used and skb_free() should not show: ++ * WARNING: IMQ: kfree_skb: skb->cb_next:.. ++ * and/or ++ * WARNING: IMQ: kfree_skb: skb->nf_queue_entry... ++ * ++ * However if this message is shown, then IMQ is somehow broken ++ * and you should report this to linuximq.net. ++ */ ++ ++ /* imq_dev_xmit is black hole that eats all packets, report that ++ * we eat this packet happily and increase dropped counters. ++ */ ++ ++ dev->stats.tx_dropped++; ++ dev_kfree_skb(skb); ++ ++ return NETDEV_TX_OK; ++ } ++ ++ skb_restore_cb(skb); /* restore skb->cb */ ++ ++ skb->imq_flags = 0; ++ skb->destructor = NULL; ++ ++ imq_done_check_queue_mapping(skb, dev); ++ ++ nf_reinject(entry, NF_ACCEPT); ++ ++ return NETDEV_TX_OK; ++} ++ ++static struct net_device *get_imq_device_by_index(int index) ++{ ++ struct net_device *dev = NULL; ++ struct net *net; ++ char buf[8]; ++ ++ /* get device by name and cache result */ ++ snprintf(buf, sizeof(buf), "imq%d", index); ++ ++ /* Search device from all namespaces. */ ++ for_each_net(net) { ++ dev = dev_get_by_name(net, buf); ++ if (dev) ++ break; ++ } ++ ++ if (WARN_ON_ONCE(dev == NULL)) { ++ /* IMQ device not found. Exotic config? */ ++ return ERR_PTR(-ENODEV); ++ } ++ ++ imq_devs_cache[index] = dev; ++ dev_put(dev); ++ ++ return dev; ++} ++ ++static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num) ++{ ++ struct net_device *dev; ++ struct sk_buff *skb_orig, *skb, *skb_shared; ++ struct Qdisc *q; ++ struct netdev_queue *txq; ++ spinlock_t *root_lock; ++ int users, index; ++ int retval = -EINVAL; ++ unsigned int orig_queue_index; ++ ++ index = entry->skb->imq_flags & IMQ_F_IFMASK; ++ if (unlikely(index > numdevs - 1)) { ++ if (net_ratelimit()) ++ printk(KERN_WARNING ++ "IMQ: invalid device specified, highest is %u\n", ++ numdevs - 1); ++ retval = -EINVAL; ++ goto out; ++ } ++ ++ /* check for imq device by index from cache */ ++ dev = imq_devs_cache[index]; ++ if (unlikely(!dev)) { ++ dev = get_imq_device_by_index(index); ++ if (IS_ERR(dev)) { ++ retval = PTR_ERR(dev); ++ goto out; ++ } ++ } ++ ++ if (unlikely(!(dev->flags & IFF_UP))) { ++ entry->skb->imq_flags = 0; ++ nf_reinject(entry, NF_ACCEPT); ++ retval = 0; ++ goto out; ++ } ++ dev->last_rx = jiffies; ++ ++ skb = entry->skb; ++ skb_orig = NULL; ++ ++ /* skb has owner? => make clone */ ++ if (unlikely(skb->destructor)) { ++ skb_orig = skb; ++ skb = skb_clone(skb, GFP_ATOMIC); ++ if (unlikely(!skb)) { ++ retval = -ENOMEM; ++ goto out; ++ } ++ entry->skb = skb; ++ } ++ ++ skb->nf_queue_entry = entry; ++ ++ dev->stats.rx_bytes += skb->len; ++ dev->stats.rx_packets++; ++ ++ if (!skb->dev) { ++ /* skb->dev == NULL causes problems, try the find cause. 
*/ ++ if (net_ratelimit()) { ++ dev_warn(&dev->dev, ++ "received packet with skb->dev == NULL\n"); ++ dump_stack(); ++ } ++ ++ skb->dev = dev; ++ } ++ ++ /* Disables softirqs for lock below */ ++ rcu_read_lock_bh(); ++ ++ /* Multi-queue selection */ ++ orig_queue_index = skb_get_queue_mapping(skb); ++ txq = imq_select_queue(dev, skb); ++ ++ q = rcu_dereference(txq->qdisc); ++ if (unlikely(!q->enqueue)) ++ goto packet_not_eaten_by_imq_dev; ++ ++ root_lock = qdisc_lock(q); ++ spin_lock(root_lock); ++ ++ users = atomic_read(&skb->users); ++ ++ skb_shared = skb_get(skb); /* increase reference count by one */ ++ skb_save_cb(skb_shared); /* backup skb->cb, as qdisc layer will ++ overwrite it */ ++ qdisc_enqueue_root(skb_shared, q); /* might kfree_skb */ ++ ++ if (likely(atomic_read(&skb_shared->users) == users + 1)) { ++ kfree_skb(skb_shared); /* decrease reference count by one */ ++ ++ skb->destructor = &imq_skb_destructor; ++ ++ /* cloned? */ ++ if (unlikely(skb_orig)) ++ kfree_skb(skb_orig); /* free original */ ++ ++ spin_unlock(root_lock); ++ rcu_read_unlock_bh(); ++ ++ /* schedule qdisc dequeue */ ++ __netif_schedule(q); ++ ++ retval = 0; ++ goto out; ++ } else { ++ skb_restore_cb(skb_shared); /* restore skb->cb */ ++ skb->nf_queue_entry = NULL; ++ /* qdisc dropped packet and decreased skb reference count of ++ * skb, so we don't really want to and try refree as that would ++ * actually destroy the skb. */ ++ spin_unlock(root_lock); ++ goto packet_not_eaten_by_imq_dev; ++ } ++ ++packet_not_eaten_by_imq_dev: ++ skb_set_queue_mapping(skb, orig_queue_index); ++ rcu_read_unlock_bh(); ++ ++ /* cloned? restore original */ ++ if (unlikely(skb_orig)) { ++ kfree_skb(skb); ++ entry->skb = skb_orig; ++ } ++ retval = -1; ++out: ++ return retval; ++} ++ ++static unsigned int imq_nf_hook(unsigned int hook, struct sk_buff *pskb, ++ const struct net_device *indev, ++ const struct net_device *outdev, ++ int (*okfn)(struct sk_buff *)) ++{ ++ return (pskb->imq_flags & IMQ_F_ENQUEUE) ? NF_IMQ_QUEUE : NF_ACCEPT; ++} ++ ++static int imq_close(struct net_device *dev) ++{ ++ netif_stop_queue(dev); ++ return 0; ++} ++ ++static int imq_open(struct net_device *dev) ++{ ++ netif_start_queue(dev); ++ return 0; ++} ++ ++static const struct net_device_ops imq_netdev_ops = { ++ .ndo_open = imq_open, ++ .ndo_stop = imq_close, ++ .ndo_start_xmit = imq_dev_xmit, ++ .ndo_get_stats = imq_get_stats, ++}; ++ ++static void imq_setup(struct net_device *dev) ++{ ++ dev->netdev_ops = &imq_netdev_ops; ++ dev->type = ARPHRD_VOID; ++ dev->mtu = 16000; /* too small? */ ++ dev->tx_queue_len = 11000; /* too big? 
*/ ++ dev->flags = IFF_NOARP; ++ dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | ++ NETIF_F_GSO | NETIF_F_HW_CSUM | ++ NETIF_F_HIGHDMA; ++ dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | ++ IFF_TX_SKB_SHARING); ++} ++ ++static int imq_validate(struct nlattr *tb[], struct nlattr *data[]) ++{ ++ int ret = 0; ++ ++ if (tb[IFLA_ADDRESS]) { ++ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { ++ ret = -EINVAL; ++ goto end; ++ } ++ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { ++ ret = -EADDRNOTAVAIL; ++ goto end; ++ } ++ } ++ return 0; ++end: ++ printk(KERN_WARNING "IMQ: imq_validate failed (%d)\n", ret); ++ return ret; ++} ++ ++static struct rtnl_link_ops imq_link_ops __read_mostly = { ++ .kind = "imq", ++ .priv_size = 0, ++ .setup = imq_setup, ++ .validate = imq_validate, ++}; ++ ++static const struct nf_queue_handler imq_nfqh = { ++ .name = "imq", ++ .outfn = imq_nf_queue, ++}; ++ ++static int __init imq_init_hooks(void) ++{ ++ int ret; ++ ++ nf_register_queue_imq_handler(&imq_nfqh); ++ ++ ret = nf_register_hooks(imq_ops, ARRAY_SIZE(imq_ops)); ++ if (ret < 0) ++ nf_unregister_queue_imq_handler(); ++ ++ return ret; ++} ++ ++static int __init imq_init_one(int index) ++{ ++ struct net_device *dev; ++ int ret; ++ ++ dev = alloc_netdev_mq(0, "imq%d", imq_setup, numqueues); ++ if (!dev) ++ return -ENOMEM; ++ ++ ret = dev_alloc_name(dev, dev->name); ++ if (ret < 0) ++ goto fail; ++ ++ dev->rtnl_link_ops = &imq_link_ops; ++ ret = register_netdevice(dev); ++ if (ret < 0) ++ goto fail; ++ ++ return 0; ++fail: ++ free_netdev(dev); ++ return ret; ++} ++ ++static int __init imq_init_devs(void) ++{ ++ int err, i; ++ ++ if (numdevs < 1 || numdevs > IMQ_MAX_DEVS) { ++ printk(KERN_ERR "IMQ: numdevs has to be betweed 1 and %u\n", ++ IMQ_MAX_DEVS); ++ return -EINVAL; ++ } ++ ++ if (numqueues < 1 || numqueues > IMQ_MAX_QUEUES) { ++ printk(KERN_ERR "IMQ: numqueues has to be betweed 1 and %u\n", ++ IMQ_MAX_QUEUES); ++ return -EINVAL; ++ } ++ ++ get_random_bytes(&imq_hashrnd, sizeof(imq_hashrnd)); ++ ++ rtnl_lock(); ++ err = __rtnl_link_register(&imq_link_ops); ++ ++ for (i = 0; i < numdevs && !err; i++) ++ err = imq_init_one(i); ++ ++ if (err) { ++ __rtnl_link_unregister(&imq_link_ops); ++ memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); ++ } ++ rtnl_unlock(); ++ ++ return err; ++} ++ ++static int __init imq_init_module(void) ++{ ++ int err; ++ ++#if defined(CONFIG_IMQ_NUM_DEVS) ++ BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS > 16); ++ BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS < 2); ++ BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS - 1 > IMQ_F_IFMASK); ++#endif ++ ++ err = imq_init_devs(); ++ if (err) { ++ printk(KERN_ERR "IMQ: Error trying imq_init_devs(net)\n"); ++ return err; ++ } ++ ++ err = imq_init_hooks(); ++ if (err) { ++ printk(KERN_ERR "IMQ: Error trying imq_init_hooks()\n"); ++ rtnl_link_unregister(&imq_link_ops); ++ memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); ++ return err; ++ } ++ ++ printk(KERN_INFO "IMQ driver loaded successfully. 
" ++ "(numdevs = %d, numqueues = %d)\n", numdevs, numqueues); ++ ++#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) ++ printk(KERN_INFO "\tHooking IMQ before NAT on PREROUTING.\n"); ++#else ++ printk(KERN_INFO "\tHooking IMQ after NAT on PREROUTING.\n"); ++#endif ++#if defined(CONFIG_IMQ_BEHAVIOR_AB) || defined(CONFIG_IMQ_BEHAVIOR_BB) ++ printk(KERN_INFO "\tHooking IMQ before NAT on POSTROUTING.\n"); ++#else ++ printk(KERN_INFO "\tHooking IMQ after NAT on POSTROUTING.\n"); ++#endif ++ ++ return 0; ++} ++ ++static void __exit imq_unhook(void) ++{ ++ nf_unregister_hooks(imq_ops, ARRAY_SIZE(imq_ops)); ++ nf_unregister_queue_imq_handler(); ++} ++ ++static void __exit imq_cleanup_devs(void) ++{ ++ rtnl_link_unregister(&imq_link_ops); ++ memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); ++} ++ ++static void __exit imq_exit_module(void) ++{ ++ imq_unhook(); ++ imq_cleanup_devs(); ++ printk(KERN_INFO "IMQ driver unloaded successfully.\n"); ++} ++ ++module_init(imq_init_module); ++module_exit(imq_exit_module); ++ ++module_param(numdevs, int, 0); ++module_param(numqueues, int, 0); ++MODULE_PARM_DESC(numdevs, "number of IMQ devices (how many imq* devices will " ++ "be created)"); ++MODULE_PARM_DESC(numqueues, "number of queues per IMQ device"); ++MODULE_AUTHOR("http://www.linuximq.net"); ++MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. See " ++ "http://www.linuximq.net/ for more information."); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS_RTNL_LINK("imq"); ++ +diff -uNr linux-3.2.0-go.orig//drivers/net/Kconfig linux-3.2.0-go/drivers/net/Kconfig +--- linux-3.2.0-go.orig//drivers/net/Kconfig 2012-01-16 18:52:00.206580353 +0100 ++++ linux-3.2.0-go/drivers/net/Kconfig 2012-01-16 18:54:18.680050560 +0100 +@@ -193,6 +193,125 @@ + depends on RIONET + default "128" + ++config IMQ ++ tristate "IMQ (intermediate queueing device) support" ++ depends on NETDEVICES && NETFILTER ++ ---help--- ++ The IMQ device(s) is used as placeholder for QoS queueing ++ disciplines. Every packet entering/leaving the IP stack can be ++ directed through the IMQ device where it's enqueued/dequeued to the ++ attached qdisc. This allows you to treat network devices as classes ++ and distribute bandwidth among them. Iptables is used to specify ++ through which IMQ device, if any, packets travel. ++ ++ More information at: http://www.linuximq.net/ ++ ++ To compile this driver as a module, choose M here: the module ++ will be called imq. If unsure, say N. ++ ++choice ++ prompt "IMQ behavior (PRE/POSTROUTING)" ++ depends on IMQ ++ default IMQ_BEHAVIOR_AB ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. ++ ++ IMQ can work in any of the following ways: ++ ++ PREROUTING | POSTROUTING ++ -----------------|------------------- ++ #1 After NAT | After NAT ++ #2 After NAT | Before NAT ++ #3 Before NAT | After NAT ++ #4 Before NAT | Before NAT ++ ++ The default behavior is to hook before NAT on PREROUTING ++ and after NAT on POSTROUTING (#3). ++ ++ This settings are specially usefull when trying to use IMQ ++ to shape NATed clients. ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++config IMQ_BEHAVIOR_AA ++ bool "IMQ AA" ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. 
++ ++ Choosing this option will make IMQ hook like this: ++ ++ PREROUTING: After NAT ++ POSTROUTING: After NAT ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++config IMQ_BEHAVIOR_AB ++ bool "IMQ AB" ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. ++ ++ Choosing this option will make IMQ hook like this: ++ ++ PREROUTING: After NAT ++ POSTROUTING: Before NAT ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++config IMQ_BEHAVIOR_BA ++ bool "IMQ BA" ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. ++ ++ Choosing this option will make IMQ hook like this: ++ ++ PREROUTING: Before NAT ++ POSTROUTING: After NAT ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++config IMQ_BEHAVIOR_BB ++ bool "IMQ BB" ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. ++ ++ Choosing this option will make IMQ hook like this: ++ ++ PREROUTING: Before NAT ++ POSTROUTING: Before NAT ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++endchoice ++ ++config IMQ_NUM_DEVS ++ int "Number of IMQ devices" ++ range 2 16 ++ depends on IMQ ++ default "16" ++ help ++ This setting defines how many IMQ devices will be created. ++ ++ The default value is 16. ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ + config TUN + tristate "Universal TUN/TAP device driver support" + select CRC32 +diff -uNr linux-3.2.0-go.orig//drivers/net/Makefile linux-3.2.0-go/drivers/net/Makefile +--- linux-3.2.0-go.orig//drivers/net/Makefile 2012-01-16 18:52:00.345470492 +0100 ++++ linux-3.2.0-go/drivers/net/Makefile 2012-01-16 18:57:33.577640398 +0100 +@@ -7,6 +7,7 @@ + # + obj-$(CONFIG_BONDING) += bonding/ + obj-$(CONFIG_DUMMY) += dummy.o ++obj-$(CONFIG_IMQ) += imq.o + obj-$(CONFIG_EQUALIZER) += eql.o + obj-$(CONFIG_IFB) += ifb.o + obj-$(CONFIG_MACVLAN) += macvlan.o +diff -uNr linux-3.2.0-go.orig//include/linux/imq.h linux-3.2.0-go/include/linux/imq.h +--- linux-3.2.0-go.orig//include/linux/imq.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-go/include/linux/imq.h 2012-01-16 18:54:18.682365396 +0100 +@@ -0,0 +1,13 @@ ++#ifndef _IMQ_H ++#define _IMQ_H ++ ++/* IFMASK (16 device indexes, 0 to 15) and flag(s) fit in 5 bits */ ++#define IMQ_F_BITS 5 ++ ++#define IMQ_F_IFMASK 0x0f ++#define IMQ_F_ENQUEUE 0x10 ++ ++#define IMQ_MAX_DEVS (IMQ_F_IFMASK + 1) ++ ++#endif /* _IMQ_H */ ++ +diff -uNr linux-3.2.0-go.orig//include/linux/netfilter/xt_IMQ.h linux-3.2.0-go/include/linux/netfilter/xt_IMQ.h +--- linux-3.2.0-go.orig//include/linux/netfilter/xt_IMQ.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-go/include/linux/netfilter/xt_IMQ.h 2012-01-16 18:54:18.682365396 +0100 +@@ -0,0 +1,9 @@ ++#ifndef _XT_IMQ_H ++#define _XT_IMQ_H ++ ++struct xt_imq_info { ++ unsigned int todev; /* target imq device */ ++}; ++ ++#endif /* _XT_IMQ_H */ ++ +diff -uNr linux-3.2.0-go.orig//include/linux/netfilter.h linux-3.2.0-go/include/linux/netfilter.h +--- linux-3.2.0-go.orig//include/linux/netfilter.h 2012-01-16 18:53:45.165859627 +0100 ++++ linux-3.2.0-go/include/linux/netfilter.h 2012-01-16 18:54:18.684680232 +0100 +@@ -22,7 +22,8 @@ + #define NF_QUEUE 3 + #define NF_REPEAT 4 + #define 
NF_STOP 5 +-#define NF_MAX_VERDICT NF_STOP ++#define NF_IMQ_QUEUE 6 ++#define NF_MAX_VERDICT NF_IMQ_QUEUE + + /* we overload the higher bits for encoding auxiliary data such as the queue + * number or errno values. Not nice, but better than additional function +diff -uNr linux-3.2.0-go.orig//include/linux/netfilter_ipv4/ipt_IMQ.h linux-3.2.0-go/include/linux/netfilter_ipv4/ipt_IMQ.h +--- linux-3.2.0-go.orig//include/linux/netfilter_ipv4/ipt_IMQ.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-go/include/linux/netfilter_ipv4/ipt_IMQ.h 2012-01-16 18:54:18.686995068 +0100 +@@ -0,0 +1,10 @@ ++#ifndef _IPT_IMQ_H ++#define _IPT_IMQ_H ++ ++/* Backwards compatibility for old userspace */ ++#include ++ ++#define ipt_imq_info xt_imq_info ++ ++#endif /* _IPT_IMQ_H */ ++ +diff -uNr linux-3.2.0-go.orig//include/linux/netfilter_ipv6/ip6t_IMQ.h linux-3.2.0-go/include/linux/netfilter_ipv6/ip6t_IMQ.h +--- linux-3.2.0-go.orig//include/linux/netfilter_ipv6/ip6t_IMQ.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-go/include/linux/netfilter_ipv6/ip6t_IMQ.h 2012-01-16 18:54:18.686995068 +0100 +@@ -0,0 +1,10 @@ ++#ifndef _IP6T_IMQ_H ++#define _IP6T_IMQ_H ++ ++/* Backwards compatibility for old userspace */ ++#include ++ ++#define ip6t_imq_info xt_imq_info ++ ++#endif /* _IP6T_IMQ_H */ ++ +diff -uNr linux-3.2.0-go.orig//include/linux/skbuff.h linux-3.2.0-go/include/linux/skbuff.h +--- linux-3.2.0-go.orig//include/linux/skbuff.h 2012-01-16 18:53:43.114915216 +0100 ++++ linux-3.2.0-go/include/linux/skbuff.h 2012-01-16 18:59:22.256860605 +0100 +@@ -30,6 +30,9 @@ + #include + #include + #include ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++#include ++#endif + + /* Don't change this without changing skb_csum_unnecessary! */ + #define CHECKSUM_NONE 0 +@@ -386,6 +389,9 @@ + * first. This is owned by whoever has the skb queued ATM. 
+ */ + char cb[48] __aligned(8); ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ void *cb_next; ++#endif + + unsigned long _skb_refdst; + #ifdef CONFIG_XFRM +@@ -424,6 +430,9 @@ + #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED + struct sk_buff *nfct_reasm; + #endif ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ struct nf_queue_entry *nf_queue_entry; ++#endif + #ifdef CONFIG_BRIDGE_NETFILTER + struct nf_bridge_info *nf_bridge; + #endif +@@ -449,6 +458,10 @@ + + /* 0/13 bit hole */ + ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ __u8 imq_flags:IMQ_F_BITS; ++#endif ++ + #ifdef CONFIG_NET_DMA + dma_cookie_t dma_cookie; + #endif +@@ -535,6 +548,12 @@ + return (struct rtable *)skb_dst(skb); + } + ++ ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++extern int skb_save_cb(struct sk_buff *skb); ++extern int skb_restore_cb(struct sk_buff *skb); ++#endif ++ + extern void kfree_skb(struct sk_buff *skb); + extern void consume_skb(struct sk_buff *skb); + extern void __kfree_skb(struct sk_buff *skb); +@@ -2358,6 +2377,10 @@ + dst->nfct_reasm = src->nfct_reasm; + nf_conntrack_get_reasm(src->nfct_reasm); + #endif ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ dst->imq_flags = src->imq_flags; ++ dst->nf_queue_entry = src->nf_queue_entry; ++#endif + #ifdef CONFIG_BRIDGE_NETFILTER + dst->nf_bridge = src->nf_bridge; + nf_bridge_get(src->nf_bridge); +diff -uNr linux-3.2.0-go.orig//include/net/netfilter/nf_queue.h linux-3.2.0-go/include/net/netfilter/nf_queue.h +--- linux-3.2.0-go.orig//include/net/netfilter/nf_queue.h 2012-01-16 18:53:39.024600575 +0100 ++++ linux-3.2.0-go/include/net/netfilter/nf_queue.h 2012-01-16 18:54:18.703198917 +0100 +@@ -30,5 +30,11 @@ + const struct nf_queue_handler *qh); + extern void nf_unregister_queue_handlers(const struct nf_queue_handler *qh); + extern void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); ++extern void nf_queue_entry_release_refs(struct nf_queue_entry *entry); ++ ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++extern void nf_register_queue_imq_handler(const struct nf_queue_handler *qh); ++extern void nf_unregister_queue_imq_handler(void); ++#endif + + #endif /* _NF_QUEUE_H */ +diff -uNr linux-3.2.0-go.orig//net/core/dev.c linux-3.2.0-go/net/core/dev.c +--- linux-3.2.0-go.orig//net/core/dev.c 2012-01-16 18:52:41.130560289 +0100 ++++ linux-3.2.0-go/net/core/dev.c 2012-01-16 18:54:18.707828588 +0100 +@@ -98,6 +98,9 @@ + #include + #include + #include ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++#include ++#endif + #include + #include + #include +@@ -2185,7 +2188,12 @@ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ if (!list_empty(&ptype_all) && ++ !(skb->imq_flags & IMQ_F_ENQUEUE)) ++#else + if (!list_empty(&ptype_all)) ++#endif + dev_queue_xmit_nit(skb, dev); + + skb_orphan_try(skb); +diff -uNr linux-3.2.0-go.orig//net/core/skbuff.c linux-3.2.0-go/net/core/skbuff.c +--- linux-3.2.0-go.orig//net/core/skbuff.c 2012-01-16 18:52:41.146764138 +0100 ++++ linux-3.2.0-go/net/core/skbuff.c 2012-01-16 18:54:18.710143424 +0100 +@@ -73,6 +73,9 @@ + + static struct kmem_cache *skbuff_head_cache __read_mostly; + static struct kmem_cache *skbuff_fclone_cache __read_mostly; ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++static struct kmem_cache *skbuff_cb_store_cache __read_mostly; ++#endif + + static void sock_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +@@ -92,6 +95,82 @@ + 
return 1; + } + ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++/* Control buffer save/restore for IMQ devices */ ++struct skb_cb_table { ++ char cb[48] __aligned(8); ++ void *cb_next; ++ atomic_t refcnt; ++}; ++ ++static DEFINE_SPINLOCK(skb_cb_store_lock); ++ ++int skb_save_cb(struct sk_buff *skb) ++{ ++ struct skb_cb_table *next; ++ ++ next = kmem_cache_alloc(skbuff_cb_store_cache, GFP_ATOMIC); ++ if (!next) ++ return -ENOMEM; ++ ++ BUILD_BUG_ON(sizeof(skb->cb) != sizeof(next->cb)); ++ ++ memcpy(next->cb, skb->cb, sizeof(skb->cb)); ++ next->cb_next = skb->cb_next; ++ ++ atomic_set(&next->refcnt, 1); ++ ++ skb->cb_next = next; ++ return 0; ++} ++EXPORT_SYMBOL(skb_save_cb); ++ ++int skb_restore_cb(struct sk_buff *skb) ++{ ++ struct skb_cb_table *next; ++ ++ if (!skb->cb_next) ++ return 0; ++ ++ next = skb->cb_next; ++ ++ BUILD_BUG_ON(sizeof(skb->cb) != sizeof(next->cb)); ++ ++ memcpy(skb->cb, next->cb, sizeof(skb->cb)); ++ skb->cb_next = next->cb_next; ++ ++ spin_lock(&skb_cb_store_lock); ++ ++ if (atomic_dec_and_test(&next->refcnt)) ++ kmem_cache_free(skbuff_cb_store_cache, next); ++ ++ spin_unlock(&skb_cb_store_lock); ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_restore_cb); ++ ++static void skb_copy_stored_cb(struct sk_buff *new, const struct sk_buff *__old) ++{ ++ struct skb_cb_table *next; ++ struct sk_buff *old; ++ ++ if (!__old->cb_next) { ++ new->cb_next = NULL; ++ return; ++ } ++ ++ spin_lock(&skb_cb_store_lock); ++ ++ old = (struct sk_buff *)__old; ++ ++ next = old->cb_next; ++ atomic_inc(&next->refcnt); ++ new->cb_next = next; ++ ++ spin_unlock(&skb_cb_store_lock); ++} ++#endif + + /* Pipe buffer operations for a socket. */ + static const struct pipe_buf_operations sock_pipe_buf_ops = { +@@ -403,6 +482,26 @@ + WARN_ON(in_irq()); + skb->destructor(skb); + } ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ /* This should not happen. When it does, avoid memleak by restoring ++ the chain of cb-backups. */ ++ while (skb->cb_next != NULL) { ++ if (net_ratelimit()) ++ printk(KERN_WARNING "IMQ: kfree_skb: skb->cb_next: " ++ "%08x\n", (unsigned int)skb->cb_next); ++ ++ skb_restore_cb(skb); ++ } ++ /* This should not happen either, nf_queue_entry is nullified in ++ * imq_dev_xmit(). If we have non-NULL nf_queue_entry then we are ++ * leaking entry pointers, maybe memory. We don't know if this is ++ * pointer to already freed memory, or should this be freed. ++ * If this happens we need to add refcounting, etc for nf_queue_entry. 
++ */ ++ if (skb->nf_queue_entry && net_ratelimit()) ++ printk(KERN_WARNING ++ "IMQ: kfree_skb: skb->nf_queue_entry != NULL"); ++#endif + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put(skb->nfct); + #endif +@@ -547,6 +646,9 @@ + new->sp = secpath_get(old->sp); + #endif + memcpy(new->cb, old->cb, sizeof(old->cb)); ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ skb_copy_stored_cb(new, old); ++#endif + new->csum = old->csum; + new->local_df = old->local_df; + new->pkt_type = old->pkt_type; +@@ -2907,6 +3009,13 @@ + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ skbuff_cb_store_cache = kmem_cache_create("skbuff_cb_store_cache", ++ sizeof(struct skb_cb_table), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++#endif + } + + /** +diff -uNr linux-3.2.0-go.orig//net/ipv6/ip6_output.c linux-3.2.0-go/net/ipv6/ip6_output.c +--- linux-3.2.0-go.orig//net/ipv6/ip6_output.c 2012-01-16 18:52:40.091199069 +0100 ++++ linux-3.2.0-go/net/ipv6/ip6_output.c 2012-01-16 18:54:18.712458260 +0100 +@@ -102,9 +102,6 @@ + struct net_device *dev = dst->dev; + struct neighbour *neigh; + +- skb->protocol = htons(ETH_P_IPV6); +- skb->dev = dev; +- + if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + +@@ -170,6 +167,11 @@ + return 0; + } + ++ /* IMQ-patch: moved setting skb->dev and skb->protocol from ++ * ip6_finish_output2 to fix crashing at netif_skb_features(). */ ++ skb->protocol = htons(ETH_P_IPV6); ++ skb->dev = dev; ++ + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev, + ip6_finish_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); +diff -uNr linux-3.2.0-go.orig//net/netfilter/core.c linux-3.2.0-go/net/netfilter/core.c +--- linux-3.2.0-go.orig//net/netfilter/core.c 2012-01-16 18:52:40.811112965 +0100 ++++ linux-3.2.0-go/net/netfilter/core.c 2012-01-16 19:02:01.429591439 +0100 +@@ -179,9 +179,11 @@ + ret = NF_DROP_GETERR(verdict); + if (ret == 0) + ret = -EPERM; +- } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { ++ } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE || ++ (verdict & NF_VERDICT_MASK) == NF_IMQ_QUEUE) { + int err = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, +- verdict >> NF_VERDICT_QBITS); ++ verdict >> NF_VERDICT_QBITS, ++ verdict & NF_VERDICT_MASK); + if (err < 0) { + if (err == -ECANCELED) + goto next_hook; +diff -uNr linux-3.2.0-go.orig//net/netfilter/Kconfig linux-3.2.0-go/net/netfilter/Kconfig +--- linux-3.2.0-go.orig//net/netfilter/Kconfig 2012-01-16 18:52:40.595833248 +0100 ++++ linux-3.2.0-go/net/netfilter/Kconfig 2012-01-16 18:54:18.714773096 +0100 +@@ -506,6 +506,18 @@ + For more information on the LEDs available on your system, see + Documentation/leds/leds-class.txt + ++config NETFILTER_XT_TARGET_IMQ ++ tristate '"IMQ" target support' ++ depends on NETFILTER_XTABLES ++ depends on IP_NF_MANGLE || IP6_NF_MANGLE ++ select IMQ ++ default m if NETFILTER_ADVANCED=n ++ help ++ This option adds a `IMQ' target which is used to specify if and ++ to which imq device packets should get enqueued/dequeued. ++ ++ To compile it as a module, choose M here. If unsure, say N. 
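Editorial note (illustrative, not part of the patch): the `IMQ' target option added just above is normally driven from userspace with the out-of-tree iptables IMQ extension together with a queueing discipline attached to the imq device. The sketch below shows one such setup; it assumes the iptables IMQ userspace extension is installed, and the device number (0), interface name (eth0) and HTB rate are arbitrary examples rather than values taken from this patch set.

    # send traffic arriving on eth0 through imq0 before routing (ingress shaping)
    iptables -t mangle -A PREROUTING -i eth0 -j IMQ --todev 0
    ip link set imq0 up
    # attach a qdisc to the imq device; packets enqueued there by the IMQ target
    # are shaped and then re-injected via nf_reinject()
    tc qdisc add dev imq0 root handle 1: htb default 10
    tc class add dev imq0 parent 1: classid 1:10 htb rate 8mbit
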
++ + config NETFILTER_XT_TARGET_MARK + tristate '"MARK" target support' + depends on NETFILTER_ADVANCED +diff -uNr linux-3.2.0-go.orig//net/netfilter/Makefile linux-3.2.0-go/net/netfilter/Makefile +--- linux-3.2.0-go.orig//net/netfilter/Makefile 2012-01-16 18:52:40.818057473 +0100 ++++ linux-3.2.0-go/net/netfilter/Makefile 2012-01-16 18:54:18.714773096 +0100 +@@ -56,6 +56,7 @@ + obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o + obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o + obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o ++obj-$(CONFIG_NETFILTER_XT_TARGET_IMQ) += xt_IMQ.o + obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o + obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o + obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o +diff -uNr linux-3.2.0-go.orig//net/netfilter/nf_internals.h linux-3.2.0-go/net/netfilter/nf_internals.h +--- linux-3.2.0-go.orig//net/netfilter/nf_internals.h 2012-01-16 18:52:40.598148084 +0100 ++++ linux-3.2.0-go/net/netfilter/nf_internals.h 2012-01-16 18:54:18.733291780 +0100 +@@ -29,7 +29,7 @@ + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), +- unsigned int queuenum); ++ unsigned int queuenum, unsigned int queuetype); + extern int __init netfilter_queue_init(void); + + /* nf_log.c */ +diff -uNr linux-3.2.0-go.orig//net/netfilter/nf_queue.c linux-3.2.0-go/net/netfilter/nf_queue.c +--- linux-3.2.0-go.orig//net/netfilter/nf_queue.c 2012-01-16 18:52:40.665278317 +0100 ++++ linux-3.2.0-go/net/netfilter/nf_queue.c 2012-01-16 18:54:18.763384644 +0100 +@@ -22,6 +22,26 @@ + + static DEFINE_MUTEX(queue_handler_mutex); + ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++static const struct nf_queue_handler *queue_imq_handler; ++ ++void nf_register_queue_imq_handler(const struct nf_queue_handler *qh) ++{ ++ mutex_lock(&queue_handler_mutex); ++ rcu_assign_pointer(queue_imq_handler, qh); ++ mutex_unlock(&queue_handler_mutex); ++} ++EXPORT_SYMBOL_GPL(nf_register_queue_imq_handler); ++ ++void nf_unregister_queue_imq_handler(void) ++{ ++ mutex_lock(&queue_handler_mutex); ++ rcu_assign_pointer(queue_imq_handler, NULL); ++ mutex_unlock(&queue_handler_mutex); ++} ++EXPORT_SYMBOL_GPL(nf_unregister_queue_imq_handler); ++#endif ++ + /* return EBUSY when somebody else is registered, return EEXIST if the + * same handler is registered, return 0 in case of success. */ + int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) +@@ -92,7 +112,7 @@ + } + EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); + +-static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) ++void nf_queue_entry_release_refs(struct nf_queue_entry *entry) + { + /* Release those devices we held, or Alexey will kill me. */ + if (entry->indev) +@@ -112,6 +132,7 @@ + /* Drop reference to owner of hook which queued us. */ + module_put(entry->elem->owner); + } ++EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); + + /* + * Any packet that leaves via this function must come back +@@ -123,7 +144,8 @@ + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), +- unsigned int queuenum) ++ unsigned int queuenum, ++ unsigned int queuetype) + { + int status = -ENOENT; + struct nf_queue_entry *entry = NULL; +@@ -137,7 +159,17 @@ + /* QUEUE == DROP if no one is waiting, to be safe. 
*/ + rcu_read_lock(); + +- qh = rcu_dereference(queue_handler[pf]); ++ if (queuetype == NF_IMQ_QUEUE) { ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ qh = rcu_dereference(queue_imq_handler); ++#else ++ BUG(); ++ goto err_unlock; ++#endif ++ } else { ++ qh = rcu_dereference(queue_handler[pf]); ++ } ++ + if (!qh) { + status = -ESRCH; + goto err_unlock; +@@ -209,7 +241,8 @@ + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), +- unsigned int queuenum) ++ unsigned int queuenum, ++ unsigned int queuetype) + { + struct sk_buff *segs; + int err; +@@ -217,7 +250,7 @@ + + if (!skb_is_gso(skb)) + return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn, +- queuenum); ++ queuenum, queuetype); + + switch (pf) { + case NFPROTO_IPV4: +@@ -244,7 +277,7 @@ + segs->next = NULL; + if (err == 0) + err = __nf_queue(segs, elem, pf, hook, indev, +- outdev, okfn, queuenum); ++ outdev, okfn, queuenum, queuetype); + if (err == 0) + queued++; + else +@@ -299,9 +332,11 @@ + local_bh_enable(); + break; + case NF_QUEUE: ++ case NF_IMQ_QUEUE: + err = __nf_queue(skb, elem, entry->pf, entry->hook, + entry->indev, entry->outdev, entry->okfn, +- verdict >> NF_VERDICT_QBITS); ++ verdict >> NF_VERDICT_QBITS, ++ verdict & NF_VERDICT_MASK); + if (err < 0) { + if (err == -ECANCELED) + goto next_hook; +diff -uNr linux-3.2.0-go.orig//net/netfilter/xt_IMQ.c linux-3.2.0-go/net/netfilter/xt_IMQ.c +--- linux-3.2.0-go.orig//net/netfilter/xt_IMQ.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.0-go/net/netfilter/xt_IMQ.c 2012-01-16 18:54:18.872181922 +0100 +@@ -0,0 +1,74 @@ ++/* ++ * This target marks packets to be enqueued to an imq device ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++static unsigned int imq_target(struct sk_buff *pskb, ++ const struct xt_action_param *par) ++{ ++ const struct xt_imq_info *mr = par->targinfo; ++ ++ pskb->imq_flags = (mr->todev & IMQ_F_IFMASK) | IMQ_F_ENQUEUE; ++ ++ return XT_CONTINUE; ++} ++ ++static int imq_checkentry(const struct xt_tgchk_param *par) ++{ ++ struct xt_imq_info *mr = par->targinfo; ++ ++ if (mr->todev > IMQ_MAX_DEVS - 1) { ++ printk(KERN_WARNING ++ "IMQ: invalid device specified, highest is %u\n", ++ IMQ_MAX_DEVS - 1); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static struct xt_target xt_imq_reg[] __read_mostly = { ++ { ++ .name = "IMQ", ++ .family = AF_INET, ++ .checkentry = imq_checkentry, ++ .target = imq_target, ++ .targetsize = sizeof(struct xt_imq_info), ++ .table = "mangle", ++ .me = THIS_MODULE ++ }, ++ { ++ .name = "IMQ", ++ .family = AF_INET6, ++ .checkentry = imq_checkentry, ++ .target = imq_target, ++ .targetsize = sizeof(struct xt_imq_info), ++ .table = "mangle", ++ .me = THIS_MODULE ++ }, ++}; ++ ++static int __init imq_init(void) ++{ ++ return xt_register_targets(xt_imq_reg, ARRAY_SIZE(xt_imq_reg)); ++} ++ ++static void __exit imq_fini(void) ++{ ++ xt_unregister_targets(xt_imq_reg, ARRAY_SIZE(xt_imq_reg)); ++} ++ ++module_init(imq_init); ++module_exit(imq_fini); ++ ++MODULE_AUTHOR("http://www.linuximq.net"); ++MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. 
" ++ "See http://www.linuximq.net/ for more information."); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS("ipt_IMQ"); ++MODULE_ALIAS("ip6t_IMQ"); ++ diff --git a/3.2.34/kbuild-compress-kernel-modules-on-installation.patch b/3.2.34/kbuild-compress-kernel-modules-on-installation.patch new file mode 100644 index 0000000..cb3cc7f --- /dev/null +++ b/3.2.34/kbuild-compress-kernel-modules-on-installation.patch @@ -0,0 +1,137 @@ +================================ +Signed-off-by: Steve Brokenshire +[Rediffed for 2.6.31.3, defaulted to y and compress with -9 /Thomas] +Signed-off-by: Thomas Backlund + +diff -Nurp linux-2.6.31/Documentation/kbuild/modules.txt linux-2.6.31.compress/Documentation/kbuild/modules.txt +--- linux-2.6.31/Documentation/kbuild/modules.txt 2009-09-10 01:13:59.000000000 +0300 ++++ linux-2.6.31.compress/Documentation/kbuild/modules.txt 2009-10-09 14:17:49.335619817 +0300 +@@ -123,6 +123,13 @@ executed to make module versioning work. + Install the external module(s). The default location is + /lib/modules//extra/, but a prefix may + be added with INSTALL_MOD_PATH (discussed in section 5). ++ If MODULES_COMPRESS is set when the modules_install target is ++ run then the module is compressed after it has been ++ copied to /lib/modules/. Compressed modules ++ using the default gzip compression format will require ++ module-init-tools installed with --zlib-enabled. ++ Any options set in MODULE_COMPRESS_OPTIONS will be ++ passed to the selected compression format. + + clean + Remove all generated files in the module directory only. +diff -Nurp linux-2.6.31/init/Kconfig linux-2.6.31.compress/init/Kconfig +--- linux-2.6.31/init/Kconfig 2009-09-10 01:13:59.000000000 +0300 ++++ linux-2.6.31.compress/init/Kconfig 2009-10-09 14:19:01.812591181 +0300 +@@ -1161,6 +1161,64 @@ config MODULE_FORCE_UNLOAD + rmmod). This is mainly for kernel developers and desperate users. + If unsure, say N. + ++config MODULE_COMPRESS ++ bool "Compress kernel modules on installation" ++ depends on MODULES ++ default y ++ help ++ This option compresses the kernel modules when 'make ++ modules_install' is run. ++ ++ The modules will be compressed into the selected compression ++ format with gzip being the default compression format. ++ ++ When a kernel module is installed from outside of the main kernel ++ source and uses the Kbuild system for installing modules then that ++ kernel module will also be compressed when it is installed. ++ ++ When running mkinitrd you will find that an error message ++ appears saying that it cannot find a certain kernel module. ++ As a workaround, unset CONFIG_MODULE_COMPRESS, build the modules ++ and install them, run mkinitrd and create the initrd image, place ++ the initrd image in the correct place for booting, set ++ CONFIG_MODULE_COMPRESS and then install the modules again. ++ ++ This option requires the module-init-tools package to be ++ configured with --enable-zlib (if using gzip which is the ++ default compression format). ++ ++ If unsure, say Y. ++ ++config MODULE_COMPRESS_OPTIONS ++ string "Compression format command line options" ++ depends on MODULE_COMPRESS ++ default "-9" ++ help ++ This option specifies the command line options to be used for ++ the selected compression format. ++ ++ Please refer to the selected compression format's documentation ++ on which options should be used. ++ ++ If unsure, leave this option blank. 
++ ++choice ++ prompt "Kernel module compression format" ++ depends on MODULE_COMPRESS ++ default MODULE_COMPRESS_GZIP ++ ++config MODULE_COMPRESS_GZIP ++ bool "gzip compression" ++ help ++ Compresses the kernel modules using the gzip (GNU zip) ++ compression format. ++ ++ This option requires gzip to be installed. ++ ++ If unsure, leave this option selected. ++ ++endchoice ++ + config MODVERSIONS + bool "Module versioning support" + help +diff -Nurp linux-2.6.31/scripts/Makefile.modinst linux-2.6.31.compress/scripts/Makefile.modinst +--- linux-2.6.31/scripts/Makefile.modinst 2009-09-10 01:13:59.000000000 +0300 ++++ linux-2.6.31.compress/scripts/Makefile.modinst 2009-10-09 14:17:49.337619404 +0300 +@@ -5,6 +5,7 @@ + PHONY := __modinst + __modinst: + ++include include/config/auto.conf + include scripts/Kbuild.include + + # +@@ -16,8 +17,21 @@ PHONY += $(modules) + __modinst: $(modules) + @: + +-quiet_cmd_modules_install = INSTALL $@ +- cmd_modules_install = mkdir -p $(2); cp $@ $(2) ; $(mod_strip_cmd) $(2)/$(notdir $@) ++ifeq ($(CONFIG_MODULE_COMPRESS_OPTIONS), "") ++else ++ MODCOMPOPT = $(shell echo -n $(CONFIG_MODULE_COMPRESS_OPTIONS)) ++endif ++ ++quiet_cmd_modules_install = INSTALL $@ ++ cmd_modules_install = mkdir -p $(2); \ ++ cp $@ $(2) ; \ ++ $(mod_strip_cmd) $(2)/$(notdir $@) ++ ++quiet_cmd_modules_compress_gzip = COMPRESS $@ ++ cmd_modules_compress_gzip = gzip $(MODCOMPOPT) -c \ ++ $(2)/$(@F) \ ++ > $(2)/$(@F).gz; \ ++ rm $(2)/$(@F) + + # Modules built outside the kernel source tree go into extra by default + INSTALL_MOD_DIR ?= extra +@@ -26,8 +40,11 @@ ext-mod-dir = $(INSTALL_MOD_DIR)$(subst + modinst_dir = $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D)) + + $(modules): ++ + $(call cmd,modules_install,$(MODLIB)/$(modinst_dir)) + ++ $(if $(CONFIG_MODULE_COMPRESS_GZIP), \ ++ $(call cmd,modules_compress_gzip,$(MODLIB)/$(modinst_dir))) + + # Declare the contents of the .PHONY variable as phony. We keep that + # information in a variable se we can use it in if_changed and friends. diff --git a/3.2.34/kernel-3.2-lsproduo.patch b/3.2.34/kernel-3.2-lsproduo.patch new file mode 100644 index 0000000..d1d5982 --- /dev/null +++ b/3.2.34/kernel-3.2-lsproduo.patch @@ -0,0 +1,569 @@ +diff -uprN linux-3.4-rc7/arch/arm/configs/orion5x_defconfig linux-3.4-rc7-wtgl/arch/arm/configs/orion5x_defconfig +--- linux-3.4-rc7/arch/arm/configs/orion5x_defconfig 2012-05-12 19:37:47.000000000 -0600 ++++ linux-3.4-rc7-wtgl/arch/arm/configs/orion5x_defconfig 2012-08-16 23:41:47.118502384 -0600 +@@ -19,6 +19,7 @@ CONFIG_MACH_TS209=y + CONFIG_MACH_TERASTATION_PRO2=y + CONFIG_MACH_LINKSTATION_PRO=y + CONFIG_MACH_LINKSTATION_MINI=y ++CONFIG_MACH_LINKSTATION_PRODUO=y + CONFIG_MACH_LINKSTATION_LS_HGL=y + CONFIG_MACH_TS409=y + CONFIG_MACH_WRT350N_V2=y +diff -uprN linux-3.4-rc7/arch/arm/mach-orion5x/Kconfig linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/Kconfig +--- linux-3.4-rc7/arch/arm/mach-orion5x/Kconfig 2012-05-12 19:37:47.000000000 -0600 ++++ linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/Kconfig 2012-08-16 23:47:02.334496150 -0600 +@@ -65,13 +65,52 @@ config MACH_LINKSTATION_MINI + Say 'Y' here if you want your kernel to support the + Buffalo Linkstation Mini platform. + ++config MACH_LINKSTATION_PRODUO ++ bool "Buffalo Linkstation Pro Duo" ++ select I2C_BOARDINFO ++ help ++ Say 'Y' here if you want your kernel to support the ++ Buffalo Linkstation Pro Duo platform. ++ ++ LS-W1.0TGL/R1 is the general model number. There ++ is no /R3 models, as /R1 stands for RAID1. ++ There are two hardware revisions of the product. 
++ ++ The first revision has version 1.xx firmware, 64 MB RAM, ++ a single USB port, a power BUTTON, an Auto/Manual ++ power MODE SWITCH, and a RESET button. ++ ++ The second revision has version 3.xx firmware, 128 MB RAM, ++ two USB ports, an Off/On/Auto power SWITCH, and a FUNCTION button. ++ ++ choice ++ prompt "HW model" ++ depends on MACH_LINKSTATION_PRODUO ++ default MACH_LINKSTATION_PRODUO_REV1 ++ default MACH_LINKSTATION_PRODUO_REV2 ++ ++ config MACH_LINKSTATION_PRODUO_REV1 ++ bool "Revision 1" ++ help ++ The first revision has version 1.xx firmware, 64 MB RAM, ++ a single USB port, a power BUTTON, an Auto/Manual ++ power MODE SWITCH, and a RESET button. ++ ++ config MACH_LINKSTATION_PRODUO_REV2 ++ bool "Revision 2" ++ help ++ The second revision has version 3.xx firmware, 128 MB RAM, ++ two USB ports, an Off/On/Auto power SWITCH, and a FUNCTION button. ++ endchoice ++ ++ + config MACH_LINKSTATION_LS_HGL + bool "Buffalo Linkstation LS-HGL" + select I2C_BOARDINFO + help + Say 'Y' here if you want your kernel to support the + Buffalo Linkstation LS-HGL platform. +- ++ + config MACH_TS409 + bool "QNAP TS-409" + help +diff -uprN linux-3.4-rc7/arch/arm/mach-orion5x/Makefile linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/Makefile +--- linux-3.4-rc7/arch/arm/mach-orion5x/Makefile 2012-05-12 19:37:47.000000000 -0600 ++++ linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/Makefile 2012-08-16 23:48:26.182494492 -0600 +@@ -5,6 +5,7 @@ obj-$(CONFIG_MACH_KUROBOX_PRO) += kurobo + obj-$(CONFIG_MACH_TERASTATION_PRO2) += terastation_pro2-setup.o + obj-$(CONFIG_MACH_LINKSTATION_PRO) += kurobox_pro-setup.o + obj-$(CONFIG_MACH_LINKSTATION_MINI) += lsmini-setup.o ++obj-$(CONFIG_MACH_LINKSTATION_PRODUO) += lsproduo-setup.o + obj-$(CONFIG_MACH_LINKSTATION_LS_HGL) += ls_hgl-setup.o + obj-$(CONFIG_MACH_DNS323) += dns323-setup.o + obj-$(CONFIG_MACH_TS209) += ts209-setup.o tsx09-common.o +diff -uprN linux-3.4-rc7/arch/arm/mach-orion5x/lsproduo-setup.c linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/lsproduo-setup.c +--- linux-3.4-rc7/arch/arm/mach-orion5x/lsproduo-setup.c 1969-12-31 17:00:00.000000000 -0700 ++++ linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/lsproduo-setup.c 2012-08-16 23:52:09.630490073 -0600 +@@ -0,0 +1,459 @@ ++/* ++ * arch/arm/mach-orion5x/lsproduo-setup.c ++ * ++ * Source taken from arch/arm/mach-orion5x/lsmini-setup.c - kernel 2.6.30 ++ * Maintainer: Matt Gomboc ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * License version 2. This program is licensed "as is" without any ++ * warranty of any kind, whether express or implied. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "common.h" ++#include "mpp.h" ++#include ++#include ++#include ++ ++/***************************************************************************** ++ * Linkstation Pro Duo Info ++ ****************************************************************************/ ++ ++/* ++ * 256K NOR flash Device bus boot chip select ++ */ ++ ++#define LSPRODUO_NOR_BOOT_BASE 0xf4000000 ++#define LSPRODUO_NOR_BOOT_SIZE SZ_256K ++ ++/***************************************************************************** ++ * 256KB NOR Flash on BOOT Device ++ ****************************************************************************/ ++ ++static struct physmap_flash_data lsproduo_nor_flash_data = { ++ .width = 1, ++}; ++ ++static struct resource lsproduo_nor_flash_resource = { ++ .flags = IORESOURCE_MEM, ++ .start = LSPRODUO_NOR_BOOT_BASE, ++ .end = LSPRODUO_NOR_BOOT_BASE + LSPRODUO_NOR_BOOT_SIZE - 1, ++}; ++ ++static struct platform_device lsproduo_nor_flash = { ++ .name = "physmap-flash", ++ .id = 0, ++ .dev = { ++ .platform_data = &lsproduo_nor_flash_data, ++ }, ++ .num_resources = 1, ++ .resource = &lsproduo_nor_flash_resource, ++}; ++ ++/***************************************************************************** ++ * Ethernet ++ ****************************************************************************/ ++ ++static struct mv643xx_eth_platform_data lsproduo_eth_data = { ++ .phy_addr = 8, ++}; ++ ++/***************************************************************************** ++ * RTC 5C372a on I2C bus ++ ****************************************************************************/ ++ ++static struct i2c_board_info __initdata lsproduo_i2c_rtc = { ++ I2C_BOARD_INFO("rs5c372a", 0x32), ++}; ++ ++/***************************************************************************** ++ * LEDs attached to GPIO ++ ****************************************************************************/ ++ ++#define LSPRODUO_GPIO_LED_ALARM 2 ++#define LSPRODUO_GPIO_LED_INFO 3 ++#define LSPRODUO_GPIO_LED_PWR 0 ++ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++ #define LSPRODUO_GPIO_LED_FUNC 18 ++#endif ++ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV1 ++static struct gpio_led lsproduo_led_pins[] = { ++ { ++ .name = "alarm:red", ++ .gpio = LSPRODUO_GPIO_LED_ALARM, ++ .active_low = 1, ++ }, { ++ .name = "info:amber", ++ .gpio = LSPRODUO_GPIO_LED_INFO, ++ .active_low = 1, ++ }, { ++ .name = "power:greem", ++ .gpio = LSPRODUO_GPIO_LED_PWR, ++ .active_low = 1, ++ }, ++}; ++#endif ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++static struct gpio_led lsproduo_led_pins[] = { ++ { ++ .name = "alarm:red", ++ .gpio = LSPRODUO_GPIO_LED_ALARM, ++ .active_low = 1, ++ }, { ++ .name = "info:amber", ++ .gpio = LSPRODUO_GPIO_LED_INFO, ++ .active_low = 1, ++ }, { ++ .name = "power:green", ++ .gpio = LSPRODUO_GPIO_LED_PWR, ++ .active_low = 1, ++ },{ ++ .name = "func:blue", ++ .gpio = LSPRODUO_GPIO_LED_FUNC, ++ .active_low = 1, ++ }, ++}; ++#endif ++ ++ ++ ++static struct gpio_led_platform_data lsproduo_led_data = { ++ .leds = lsproduo_led_pins, ++ .num_leds = ARRAY_SIZE(lsproduo_led_pins), ++}; ++ ++static struct platform_device lsproduo_leds = { ++ .name = "leds-gpio", ++ .id = -1, ++ .dev = { ++ .platform_data = &lsproduo_led_data, ++ }, ++}; ++ ++/**************************************************************************** ++ * GPIO Attached Keys ++ 
****************************************************************************/ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV1 ++ #define LSPRODUO_GPIO_KEY_POWER 8 ++ #define LSPRODUO_GPIO_KEY_AUTOPOWER 10 ++ ++ #define LSPRODUO_SW_POWER 0x00 ++ #define LSPRODUO_SW_AUTOPOWER 0x01 ++ ++static struct gpio_keys_button lsproduo_buttons[] = { ++ { ++ .type = EV_SW, ++ .code = LSPRODUO_SW_POWER, ++ .gpio = LSPRODUO_GPIO_KEY_POWER, ++ .desc = "Power-on Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSPRODUO_SW_AUTOPOWER, ++ .gpio = LSPRODUO_GPIO_KEY_AUTOPOWER, ++ .desc = "Power-auto Switch", ++ .active_low = 1, ++ }, ++}; ++ ++#endif ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++ #define LSPRODUO_GPIO_KEY_POWER 10 ++ #define LSPRODUO_GPIO_KEY_AUTOPOWER 22 ++ #define LSPRODUO_GPIO_KEY_FUNC 8 ++ ++ #define LSPRODUO_SW_POWER 0x00 ++ #define LSPRODUO_SW_AUTOPOWER 0x01 ++ ++static struct gpio_keys_button lsproduo_buttons[] = { ++ { ++ .code = KEY_OPTION, ++ .gpio = LSPRODUO_GPIO_KEY_FUNC, ++ .desc = "Function Button", ++ .active_low = 1, ++ },{ ++ .type = EV_SW, ++ .code = LSPRODUO_SW_POWER, ++ .gpio = LSPRODUO_GPIO_KEY_POWER, ++ .desc = "Power-on Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSPRODUO_SW_AUTOPOWER, ++ .gpio = LSPRODUO_GPIO_KEY_AUTOPOWER, ++ .desc = "Power-auto Switch", ++ .active_low = 1, ++ }, ++}; ++ ++#endif ++ ++static struct gpio_keys_platform_data lsproduo_button_data = { ++ .buttons = lsproduo_buttons, ++ .nbuttons = ARRAY_SIZE(lsproduo_buttons), ++}; ++ ++static struct platform_device lsproduo_button_device = { ++ .name = "gpio-keys", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lsproduo_button_data, ++ }, ++}; ++ ++/**************************************************************************** ++ * GPIO Attached Fan ++ ****************************************************************************/ ++ ++/* Define max char len */ ++#define MAX_LEN 8 ++ ++#define LSPRODUO_GPIO_FAN_LOW 17 ++#define LSPRODUO_GPIO_FAN_HIGH 14 ++ ++static struct proc_dir_entry *lsproduo_proc_dir_root, *lsproduo_proc_dir_gpio, *lsproduo_fan_proc_file; ++static char lsproduo_fan_state[MAX_LEN]; ++ ++static int lsproduo_fan_get(char *buf, char **start, off_t offset, int count, int *eof, void *data) ++{ ++ int len; ++ ++ len = snprintf(buf, count, "state: %s\n", lsproduo_fan_state); ++ return len; ++} ++ ++static int lsproduo_fan_set( struct file *file, const char *buffer, unsigned long count, void *data ) ++{ ++ int len, ret; ++ char *ptr, tState[MAX_LEN]; ++ ++ if (count > MAX_LEN ) ++ len = MAX_LEN; ++ else ++ len = count; ++ ++ ret = copy_from_user(tState, buffer, len); ++ if(ret < 0) ++ { ++ printk(KERN_ERR "%s: Setting fan speed failed\n", "lsproduo"); ++ return -EFAULT; ++ } ++ ++ ptr = strrchr(tState, '\n'); ++ if(ptr) *ptr = '\0'; ++ ++ if (strcasecmp(tState, "off") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan off\n", "lsproduo"); ++ sprintf(lsproduo_fan_state, "off"); ++ gpio_set_value(LSPRODUO_GPIO_FAN_LOW, 1); ++ gpio_set_value(LSPRODUO_GPIO_FAN_HIGH, 1); ++ } else if (strcasecmp(tState, "slow") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan slow\n", "lsproduo"); ++ sprintf(lsproduo_fan_state, "slow"); ++ gpio_set_value(LSPRODUO_GPIO_FAN_LOW, 1); ++ gpio_set_value(LSPRODUO_GPIO_FAN_HIGH, 0); ++ } else if (strcasecmp(tState, "fast") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan fast\n", "lsproduo"); ++ sprintf(lsproduo_fan_state, "fast"); ++ gpio_set_value(LSPRODUO_GPIO_FAN_LOW, 0); ++ gpio_set_value(LSPRODUO_GPIO_FAN_HIGH, 1); ++ } else if 
(strcasecmp(tState, "full") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan full\n", "lsproduo"); ++ sprintf(lsproduo_fan_state, "full"); ++ gpio_set_value(LSPRODUO_GPIO_FAN_LOW, 0); ++ gpio_set_value(LSPRODUO_GPIO_FAN_HIGH, 0); ++ } else ++ { ++ printk(KERN_ERR "%s: unknown fan speed given\n", "lsproduo"); ++ } ++ ++ lsproduo_fan_state[len] = '\0'; ++ ++ return len; ++} ++ ++/***************************************************************************** ++ * SATA ++ ****************************************************************************/ ++static struct mv_sata_platform_data lsproduo_sata_data = { ++ .n_ports = 2, ++}; ++ ++ ++/***************************************************************************** ++ * Linkstation Pro Duo specific power off method: reboot ++ ****************************************************************************/ ++/* ++ * On the Linkstation Pro Duo, the shutdown process is following: ++ * - Userland monitors key events until the power switch goes to off position ++ * - The board reboots ++ * - U-boot starts and goes into an idle mode waiting for the user ++ * to move the switch to ON position ++ */ ++ ++static void lsproduo_power_off(void) ++{ ++ /* orion5x_restart('h', NULL); */ ++ arm_machine_restart(0, NULL); ++} ++ ++ ++/***************************************************************************** ++ * General Setup ++ ****************************************************************************/ ++#define LSPRODUO_GPIO_HDD_POWER0 1 ++#define LSPRODUO_GPIO_USB_POWER 9 ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV1 ++ #define LSPRODUO_GPIO_POWER 8 ++ #define LSPRODUO_GPIO_AUTO_POWER 10 ++#endif ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++ #define LSPRODUO_GPIO_POWER 10 ++ #define LSPRODUO_GPIO_USB_POWER2 19 ++ #define LSPRODUO_GPIO_AUTO_POWER 22 ++#endif ++ ++static unsigned int lsproduo_mpp_modes[] __initdata = { ++ MPP0_GPIO, /* LED_PWR */ ++ MPP1_GPIO, /* HDD_PWR */ ++ MPP2_GPIO, /* LED_ALARM */ ++ MPP3_GPIO, /* LED_INFO */ ++ MPP4_UNUSED, ++ MPP5_UNUSED, ++ MPP6_GPIO, /* FAN_LCK */ ++ MPP9_GPIO, /* USB_PWR */ ++ MPP11_UNUSED, /* LED_ETH dummy */ ++ MPP12_UNUSED, ++ MPP13_UNUSED, ++ MPP14_GPIO, /* FAN_HIGH */ ++ MPP15_UNUSED, ++ MPP16_UNUSED, ++ MPP17_GPIO, /* FAN_LOW */ ++ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV1 ++ MPP7_GPIO, /* INIT */ ++ MPP8_GPIO, /* POWER */ ++ MPP10_GPIO, /* AUTO_POWER */ ++ MPP18_UNUSED, ++ MPP19_UNUSED, ++#endif ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++ MPP7_UNUSED, ++ MPP8_GPIO, /* FUNC */ ++ MPP10_GPIO, /* POWER */ ++ MPP18_GPIO, /* LED_FUNC*/ ++ MPP19_GPIO, /* USB_PWR2 */ ++ MPP22_GPIO, /* AUTO_POWER */ ++#endif ++ 0, ++}; ++ ++static void __init lsproduo_init(void) ++{ ++ /* ++ * Setup basic Orion functions. Need to be called early. ++ */ ++ orion5x_init(); ++ ++ orion5x_mpp_conf(lsproduo_mpp_modes); ++ ++ /* ++ * Configure peripherals. 
++ */ ++ orion5x_ehci0_init(); ++ orion5x_ehci1_init(); ++ orion5x_eth_init(&lsproduo_eth_data); ++ orion5x_i2c_init(); ++ orion5x_sata_init(&lsproduo_sata_data); ++ orion5x_uart0_init(); ++ orion5x_xor_init(); ++ ++ orion5x_setup_dev_boot_win(LSPRODUO_NOR_BOOT_BASE, ++ LSPRODUO_NOR_BOOT_SIZE); ++ platform_device_register(&lsproduo_nor_flash); ++ ++ platform_device_register(&lsproduo_button_device); ++ ++ platform_device_register(&lsproduo_leds); ++ ++ i2c_register_board_info(0, &lsproduo_i2c_rtc, 1); ++ ++ /* enable USB power */ ++ gpio_set_value(LSPRODUO_GPIO_USB_POWER, 1); ++ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++ gpio_set_value(LSPRODUO_GPIO_USB_POWER2, 1); ++#endif ++ ++ printk(KERN_INFO "Buffalo Linkstation Pro Duo fan driver loaded\n"); ++ sprintf(lsproduo_fan_state, "fast"); ++ gpio_set_value(LSPRODUO_GPIO_FAN_LOW, 1); ++ gpio_set_value(LSPRODUO_GPIO_FAN_HIGH, 0); ++ ++ lsproduo_proc_dir_root = proc_mkdir( "linkstation", NULL ); ++ lsproduo_proc_dir_gpio = proc_mkdir( "gpio", lsproduo_proc_dir_root ); ++ lsproduo_fan_proc_file = create_proc_entry( "fan", S_IRUGO, lsproduo_proc_dir_gpio ); ++ if( lsproduo_fan_proc_file ) { ++ lsproduo_fan_proc_file->read_proc = lsproduo_fan_get; ++ lsproduo_fan_proc_file->write_proc = lsproduo_fan_set; ++ lsproduo_fan_proc_file->data = NULL; ++ } else ++ { ++ printk(KERN_INFO "Registration of fan device failed\n"); ++ } ++ ++ /* register power-off method */ ++ pm_power_off = lsproduo_power_off; ++ ++ pr_info("%s: finished\n", __func__); ++} ++ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV1 ++MACHINE_START(LINKSTATION_PRODUO, "Buffalo Linkstation Pro Duo - Revision 1") ++ .atag_offset = 0x00000100, ++ .init_machine = lsproduo_init, ++ .map_io = orion5x_map_io, ++ .init_early = orion5x_init_early, ++ .init_irq = orion5x_init_irq, ++ .timer = &orion5x_timer, ++ .fixup = tag_fixup_mem32, ++ /* .restart = orion5x_restart, */ ++MACHINE_END ++#endif ++ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++MACHINE_START(LINKSTATION_PRODUO, "Buffalo Linkstation Pro Duo - Revision 2") ++ .atag_offset = 0x00000100, ++ .init_machine = lsproduo_init, ++ .map_io = orion5x_map_io, ++ .init_early = orion5x_init_early, ++ .init_irq = orion5x_init_irq, ++ .timer = &orion5x_timer, ++ .fixup = tag_fixup_mem32, ++ /* .restart = orion5x_restart, */ ++MACHINE_END ++#endif ++ ++ ++ +diff -uprN linux-3.4-rc7/arch/arm/mach-orion5x/mpp.h linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/mpp.h +--- linux-3.4-rc7/arch/arm/mach-orion5x/mpp.h 2012-05-12 19:37:47.000000000 -0600 ++++ linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/mpp.h 2012-08-16 22:15:34.000000000 -0600 +@@ -122,7 +122,10 @@ + #define MPP19_GIGE MPP(19, 0x1, 0, 0, 1, 1, 1) + #define MPP19_UART MPP(19, 0x0, 0, 0, 0, 1, 1) + +-#define MPP_MAX 19 ++#define MPP22_GPIO MPP(22, 0x5, 1, 1, 0, 1, 0) ++ ++ ++#define MPP_MAX 22 + + void orion5x_mpp_conf(unsigned int *mpp_list); + +diff -uprN linux-3.4-rc7/arch/arm/tools/mach-types linux-3.4-rc7-wtgl/arch/arm/tools/mach-types +--- linux-3.4-rc7/arch/arm/tools/mach-types 2012-05-12 19:37:47.000000000 -0600 ++++ linux-3.4-rc7-wtgl/arch/arm/tools/mach-types 2012-08-16 23:43:59.830499760 -0600 +@@ -333,6 +333,8 @@ smdkc100 MACH_SMDKC100 SMDKC100 1826 + tavorevb MACH_TAVOREVB TAVOREVB 1827 + saar MACH_SAAR SAAR 1828 + at91sam9m10g45ek MACH_AT91SAM9M10G45EK AT91SAM9M10G45EK 1830 ++linkstation_produo MACH_LINKSTATION_PRODUO LINKSTATION_PRODUO 1831 ++##see header for btaining a new version, preferred to patching + usb_a9g20 MACH_USB_A9G20 USB_A9G20 1841 + mxlads MACH_MXLADS MXLADS 1851 + 
linkstation_mini MACH_LINKSTATION_MINI LINKSTATION_MINI 1858 diff --git a/3.2.34/kernel-3.2-lsql.patch b/3.2.34/kernel-3.2-lsql.patch new file mode 100644 index 0000000..89c1f91 --- /dev/null +++ b/3.2.34/kernel-3.2-lsql.patch @@ -0,0 +1,439 @@ +diff -uNr linux-3.2.33-go.orig/arch/arm/configs/orion5x_defconfig linux-3.2.33-go/arch/arm/configs/orion5x_defconfig +--- linux-3.2.33-go.orig/arch/arm/configs/orion5x_defconfig 2012-11-11 15:13:23.313493927 +0100 ++++ linux-3.2.33-go/arch/arm/configs/orion5x_defconfig 2012-11-11 15:14:01.321037277 +0100 +@@ -21,6 +21,7 @@ + CONFIG_MACH_LINKSTATION_MINI=y + CONFIG_MACH_LINKSTATION_PRODUO=y + CONFIG_MACH_LINKSTATION_LS_HGL=y ++CONFIG_MACH_LINKSTATION_LSQL=y + CONFIG_MACH_TS409=y + CONFIG_MACH_WRT350N_V2=y + CONFIG_MACH_TS78XX=y +diff -uNr linux-3.2.33-go.orig/arch/arm/mach-orion5x/Kconfig linux-3.2.33-go/arch/arm/mach-orion5x/Kconfig +--- linux-3.2.33-go.orig/arch/arm/mach-orion5x/Kconfig 2012-11-11 15:13:23.518491566 +0100 ++++ linux-3.2.33-go/arch/arm/mach-orion5x/Kconfig 2012-11-11 15:14:01.321037277 +0100 +@@ -111,6 +111,13 @@ + Say 'Y' here if you want your kernel to support the + Buffalo Linkstation LS-HGL platform. + ++config MACH_LINKSTATION_LSQL ++ bool "Buffalo Linkstation LS-QL" ++ select I2C_BOARDINFO ++ help ++ Say 'Y' here if you want your kernel to support the ++ Buffalo Linkstation LS-QL platform. ++ + config MACH_TS409 + bool "QNAP TS-409" + help +diff -uNr linux-3.2.33-go.orig/arch/arm/mach-orion5x/lsql-setup.c linux-3.2.33-go/arch/arm/mach-orion5x/lsql-setup.c +--- linux-3.2.33-go.orig/arch/arm/mach-orion5x/lsql-setup.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/arch/arm/mach-orion5x/lsql-setup.c 2012-11-11 15:14:01.323037254 +0100 +@@ -0,0 +1,388 @@ ++/* ++ * arch/arm/mach-orion5x/lsql-setup.c ++ * ++ * Source based off arch/arm/mach-orion5x/lsproduo-setup.c, which was from lsmini-setup.c ++ * Maintainer: Matt Gomboc ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * License version 2. This program is licensed "as is" without any ++ * warranty of any kind, whether express or implied. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "common.h" ++#include "mpp.h" ++#include ++#include ++#include ++ ++/***************************************************************************** ++ * Linkstation Quad LS-QL/R5 Info ++ ****************************************************************************/ ++ ++/* ++ * 256K NOR flash Device bus boot chip select ++ */ ++ ++#define LSQL_NOR_BOOT_BASE 0xf4000000 ++#define LSQL_NOR_BOOT_SIZE SZ_256K ++ ++/***************************************************************************** ++ * 256KB NOR Flash on BOOT Device ++ ****************************************************************************/ ++ ++static struct physmap_flash_data lsql_nor_flash_data = { ++ .width = 1, ++}; ++ ++static struct resource lsql_nor_flash_resource = { ++ .flags = IORESOURCE_MEM, ++ .start = LSQL_NOR_BOOT_BASE, ++ .end = LSQL_NOR_BOOT_BASE + LSQL_NOR_BOOT_SIZE - 1, ++}; ++ ++static struct platform_device lsql_nor_flash = { ++ .name = "physmap-flash", ++ .id = 0, ++ .dev = { ++ .platform_data = &lsql_nor_flash_data, ++ }, ++ .num_resources = 1, ++ .resource = &lsql_nor_flash_resource, ++}; ++ ++/***************************************************************************** ++ * Ethernet ++ ****************************************************************************/ ++ ++static struct mv643xx_eth_platform_data lsql_eth_data = { ++ .phy_addr = 8, ++}; ++ ++/***************************************************************************** ++ * RTC 5C372a on I2C bus ++ ****************************************************************************/ ++ ++static struct i2c_board_info __initdata lsql_i2c_rtc = { ++ I2C_BOARD_INFO("rs5c372a", 0x32), ++}; ++ ++/***************************************************************************** ++ * LEDs attached to GPIO ++ ****************************************************************************/ ++ ++#define LSQL_GPIO_LED_ALARM 2 /* looks like it should be 2 by the uboot sources, but doesnt successfully trigger the3 top LED*/ ++#define LSQL_GPIO_LED_INFO 3 ++#define LSQL_GPIO_LED_PWR 0 ++#define LSQL_GPIO_LED_FUNC 18 ++ ++ ++static struct gpio_led lsql_led_pins[] = { ++ { ++ .name = "alarm:red", ++ .gpio = LSQL_GPIO_LED_ALARM, ++ .active_low = 1, ++ }, { ++ .name = "info:amber", ++ .gpio = LSQL_GPIO_LED_INFO, ++ .active_low = 1, ++ }, { ++ .name = "power:blue", ++ .gpio = LSQL_GPIO_LED_PWR, ++ .active_low = 1, ++ },{ ++ .name = "func:blue", ++ .gpio = LSQL_GPIO_LED_FUNC, ++ .active_low = 1, ++ }, ++}; ++ ++ ++ ++static struct gpio_led_platform_data lsql_led_data = { ++ .leds = lsql_led_pins, ++ .num_leds = ARRAY_SIZE(lsql_led_pins), ++}; ++ ++ ++static struct platform_device lsql_leds = { ++ .name = "leds-gpio", ++ .id = -1, ++ .dev = { ++ .platform_data = &lsql_led_data, ++ }, ++}; ++ ++ ++/**************************************************************************** ++ * GPIO Attached Keys ++ ****************************************************************************/ ++ ++ #define LSQL_GPIO_KEY_POWER 10 ++ #define LSQL_GPIO_KEY_AUTOPOWER 22 ++ #define LSQL_GPIO_KEY_FUNC 7 ++ ++ #define LSQL_SW_POWER 0x00 ++ #define LSQL_SW_AUTOPOWER 0x01 ++ ++static struct gpio_keys_button lsql_buttons[] = { ++ { ++ .code = KEY_OPTION, ++ .gpio = LSQL_GPIO_KEY_FUNC, ++ .desc = "Function Button", ++ .active_low = 1, ++ },{ ++ .type = EV_SW, ++ .code = LSQL_SW_POWER, ++ .gpio = LSQL_GPIO_KEY_POWER, ++ .desc = "Power-on 
Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSQL_SW_AUTOPOWER, ++ .gpio = LSQL_GPIO_KEY_AUTOPOWER, ++ .desc = "Power-auto Switch", ++ .active_low = 1, ++ }, ++}; ++ ++ ++static struct gpio_keys_platform_data lsql_button_data = { ++ .buttons = lsql_buttons, ++ .nbuttons = ARRAY_SIZE(lsql_buttons), ++}; ++ ++static struct platform_device lsql_button_device = { ++ .name = "gpio-keys", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lsql_button_data, ++ }, ++}; ++ ++/**************************************************************************** ++ * GPIO Attached Fan ++ ****************************************************************************/ ++ ++/* Define max char len */ ++ ++#define MAX_LEN 8 ++ ++#define LSQL_GPIO_FAN_LOW 17 ++#define LSQL_GPIO_FAN_HIGH 14 ++ ++static struct proc_dir_entry *lsql_proc_dir_root, *lsql_proc_dir_gpio, *lsql_fan_proc_file; ++static char lsql_fan_state[MAX_LEN]; ++ ++static int lsql_fan_get(char *buf, char **start, off_t offset, int count, int *eof, void *data) ++{ ++ int len; ++ ++ len = snprintf(buf, count, "state: %s\n", lsql_fan_state); ++ return len; ++} ++ ++static int lsql_fan_set( struct file *file, const char *buffer, unsigned long count, void *data ) ++{ ++ int len, ret; ++ char *ptr, tState[MAX_LEN]; ++ ++ if (count > MAX_LEN ) ++ len = MAX_LEN; ++ else ++ len = count; ++ ++ ret = copy_from_user(tState, buffer, len); ++ if(ret < 0) ++ { ++ printk(KERN_ERR "%s: Setting fan speed failed\n", "lsql"); ++ return -EFAULT; ++ } ++ ++ ptr = strrchr(tState, '\n'); ++ if(ptr) *ptr = '\0'; ++ ++ if (strcasecmp(tState, "off") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan off\n", "lsql"); ++ sprintf(lsql_fan_state, "off"); ++ gpio_set_value(LSQL_GPIO_FAN_LOW, 1); ++ gpio_set_value(LSQL_GPIO_FAN_HIGH, 1); ++ } else if (strcasecmp(tState, "slow") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan slow\n", "lsql"); ++ sprintf(lsql_fan_state, "slow"); ++ gpio_set_value(LSQL_GPIO_FAN_LOW, 1); ++ gpio_set_value(LSQL_GPIO_FAN_HIGH, 0); ++ } else if (strcasecmp(tState, "fast") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan fast\n", "lsql"); ++ sprintf(lsql_fan_state, "fast"); ++ gpio_set_value(LSQL_GPIO_FAN_LOW, 0); ++ gpio_set_value(LSQL_GPIO_FAN_HIGH, 1); ++ } else if (strcasecmp(tState, "full") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan full\n", "lsql"); ++ sprintf(lsql_fan_state, "full"); ++ gpio_set_value(LSQL_GPIO_FAN_LOW, 0); ++ gpio_set_value(LSQL_GPIO_FAN_HIGH, 0); ++ } else ++ { ++ printk(KERN_ERR "%s: unknown fan speed given\n", "lsql"); ++ } ++ ++ lsql_fan_state[len] = '\0'; ++ ++ return len; ++} ++ ++/***************************************************************************** ++ * SATA ++ ****************************************************************************/ ++static struct mv_sata_platform_data lsql_sata_data = { ++ .n_ports = 2, /*maybe this should be 4, but works with 2 */ ++}; ++ ++ ++/***************************************************************************** ++ * Linkstation Quad specific power off method: reboot ++ ****************************************************************************/ ++/* ++ * On Linkstations in general, the shutdown process is following: ++ * - Userland monitors key events until the power switch goes to off position ++ * - The board reboots ++ * - U-boot starts and goes into an idle mode waiting for the user ++ * to move the switch to ON position ++ * ++ * on the Quad however, there is a power button on the upper, front, ++ * a function button on the lower front, ans a 
Auto/Manual power button on the back. ++ * After halting system, uboot waits the power button on the front panel to be pushed ++ * ++ * ++ */ ++ ++static void lsql_power_off(void) ++{ ++ arm_machine_restart(0, NULL); /* orion5x_restart('h', NULL); */ ++} ++ ++ ++/***************************************************************************** ++ * General Setup ++ ****************************************************************************/ ++#define LSQL_GPIO_USB_POWER 9 ++#define LSQL_GPIO_POWER 10 ++#define LSQL_GPIO_USB_POWER2 19 ++#define LSQL_GPIO_AUTO_POWER 22 ++ ++static unsigned int lsql_mpp_modes[] __initdata = { ++ MPP0_GPIO, /* LED_PWR */ ++ MPP1_GPIO, /* for debugging purposes, change to MPP1_UNUSED for final */ ++ MPP2_GPIO, /* LED_ALARM */ /* looks like it should be 2 by the uboot sources, but doesnt successfully trigger the3 top LED*/ ++ MPP3_GPIO, /* LED_INFO */ ++ MPP4_GPIO, ++ MPP5_GPIO, ++ MPP6_GPIO, /* FAN_LCK */ ++ MPP7_GPIO, /* FUNC */ ++ MPP8_GPIO, ++ MPP9_GPIO, /* USB_PWR */ ++ MPP10_GPIO, /* POWER */ ++ MPP11_GPIO, ++ MPP12_GPIO, ++ MPP13_GPIO, ++ MPP14_GPIO, /* FAN_HIGH */ ++ MPP15_GPIO, ++ MPP16_GPIO, ++ MPP17_GPIO, /* FAN_LOW */ ++ MPP18_GPIO, /* LED_FUNC*/ ++ MPP19_GPIO, /* USB_PWR2 */ ++ MPP22_GPIO, /* AUTO_POWER*/ ++ 0, ++}; ++ ++static void __init lsql_init(void) ++{ ++ /* ++ * Setup basic Orion functions. Need to be called early. ++ */ ++ orion5x_init(); ++ ++ orion5x_mpp_conf(lsql_mpp_modes); ++ ++ /* ++ * Configure peripherals. ++ */ ++ orion5x_ehci0_init(); ++ orion5x_ehci1_init(); ++ orion5x_eth_init(&lsql_eth_data); ++ orion5x_i2c_init(); ++ orion5x_sata_init(&lsql_sata_data); ++ orion5x_uart0_init(); ++ orion5x_xor_init(); ++ ++ orion5x_setup_dev_boot_win(LSQL_NOR_BOOT_BASE, ++ LSQL_NOR_BOOT_SIZE); ++ platform_device_register(&lsql_nor_flash); ++ ++ platform_device_register(&lsql_button_device); ++ ++ platform_device_register(&lsql_leds); ++ ++ i2c_register_board_info(0, &lsql_i2c_rtc, 1); ++ ++ /* enable USB power */ ++ gpio_set_value(LSQL_GPIO_USB_POWER, 1); ++ gpio_set_value(LSQL_GPIO_USB_POWER2, 1); ++ ++ ++ printk(KERN_INFO "Buffalo Linkstation fan driver loaded\n"); ++ sprintf(lsql_fan_state, "fast"); ++ gpio_set_value(LSQL_GPIO_FAN_LOW, 0); ++ gpio_set_value(LSQL_GPIO_FAN_HIGH, 1); ++ ++ lsql_proc_dir_root = proc_mkdir( "linkstation", NULL ); ++ lsql_proc_dir_gpio = proc_mkdir( "gpio", lsql_proc_dir_root ); ++ lsql_fan_proc_file = create_proc_entry( "fan", S_IRUGO, lsql_proc_dir_gpio ); ++ if( lsql_fan_proc_file ) { ++ lsql_fan_proc_file->read_proc = lsql_fan_get; ++ lsql_fan_proc_file->write_proc = lsql_fan_set; ++ lsql_fan_proc_file->data = NULL; ++ } else ++ { ++ printk(KERN_INFO "Registration of fan device failed\n"); ++ } ++ ++ /* register power-off method */ ++ pm_power_off = lsql_power_off; ++ ++ pr_info("%s: finished\n", __func__); ++} ++ ++#ifdef CONFIG_MACH_LINKSTATION_LSQL ++MACHINE_START(LINKSTATION_LSQL, "Buffalo Linkstation Quad QL/R5") ++ .atag_offset = 0x00000100, ++ .init_machine = lsql_init, ++ .map_io = orion5x_map_io, ++ .init_early = orion5x_init_early, ++ .init_irq = orion5x_init_irq, ++ .timer = &orion5x_timer, ++ .fixup = tag_fixup_mem32, ++ /* .restart = orion5x_restart, */ ++MACHINE_END ++#endif ++ ++ +diff -uNr linux-3.2.33-go.orig/arch/arm/mach-orion5x/Makefile linux-3.2.33-go/arch/arm/mach-orion5x/Makefile +--- linux-3.2.33-go.orig/arch/arm/mach-orion5x/Makefile 2012-11-11 15:13:23.517491578 +0100 ++++ linux-3.2.33-go/arch/arm/mach-orion5x/Makefile 2012-11-11 15:14:01.323037254 +0100 +@@ -7,6 +7,7 @@ + 
obj-$(CONFIG_MACH_LINKSTATION_MINI) += lsmini-setup.o + obj-$(CONFIG_MACH_LINKSTATION_PRODUO) += lsproduo-setup.o + obj-$(CONFIG_MACH_LINKSTATION_LS_HGL) += ls_hgl-setup.o ++obj-$(CONFIG_MACH_LINKSTATION_LSQL) += lsql-setup.o + obj-$(CONFIG_MACH_DNS323) += dns323-setup.o + obj-$(CONFIG_MACH_TS209) += ts209-setup.o tsx09-common.o + obj-$(CONFIG_MACH_TS409) += ts409-setup.o tsx09-common.o +diff -uNr linux-3.2.33-go.orig/arch/arm/tools/mach-types linux-3.2.33-go/arch/arm/tools/mach-types +--- linux-3.2.33-go.orig/arch/arm/tools/mach-types 2012-11-11 15:13:23.340493615 +0100 ++++ linux-3.2.33-go/arch/arm/tools/mach-types 2012-11-11 15:14:26.618733715 +0100 +@@ -1129,3 +1129,4 @@ + m28evk MACH_M28EVK M28EVK 3613 + smdk4212 MACH_SMDK4212 SMDK4212 3638 + smdk4412 MACH_SMDK4412 SMDK4412 3765 ++linkstation_lsql MACH_LINKSTATION_LSQL LINKSTATION_LSQL 4238 diff --git a/3.2.34/kernel-3.2-lsxhl.patch b/3.2.34/kernel-3.2-lsxhl.patch new file mode 100644 index 0000000..60034e8 --- /dev/null +++ b/3.2.34/kernel-3.2-lsxhl.patch @@ -0,0 +1,387 @@ +Add support for the Buffalo Linkstation XHL. This NAS box is based on a +Marvell Kirkwood chip at 1.2 GHz and features 256 MB RAM, 512kb SPI boot +flash, gigabit ethernet and one SATA port. + +Signed-off-by: Michael Walle +--- + arch/arm/configs/kirkwood_defconfig | 1 + + arch/arm/mach-kirkwood/Kconfig | 6 + + arch/arm/mach-kirkwood/Makefile | 1 + + arch/arm/mach-kirkwood/lsxhl-setup.c | 313 ++++++++++++++++++++++++++++++++++ + arch/arm/tools/mach-types | 1 + + 5 files changed, 322 insertions(+), 0 deletions(-) + create mode 100644 arch/arm/mach-kirkwood/lsxhl-setup.c + +diff --git a/arch/arm/configs/kirkwood_defconfig b/arch/arm/configs/kirkwood_defconfig +index aeb3af5..9f77811 100644 +--- a/arch/arm/configs/kirkwood_defconfig ++++ b/arch/arm/configs/kirkwood_defconfig +@@ -28,6 +28,7 @@ CONFIG_MACH_D2NET_V2=y + CONFIG_MACH_NET2BIG_V2=y + CONFIG_MACH_NET5BIG_V2=y + CONFIG_MACH_T5325=y ++CONFIG_MACH_LSXHL=y + # CONFIG_CPU_FEROCEON_OLD_ID is not set + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +diff --git a/arch/arm/mach-kirkwood/Kconfig b/arch/arm/mach-kirkwood/Kconfig +index 7fc603b..307cc99 100644 +--- a/arch/arm/mach-kirkwood/Kconfig ++++ b/arch/arm/mach-kirkwood/Kconfig +@@ -130,6 +130,12 @@ config MACH_T5325 + Say 'Y' here if you want your kernel to support the + HP t5325 Thin Client. + ++config MACH_LSXHL ++ bool "Buffalo LS-XHL Series" ++ help ++ Say 'Y' here if you want your kernel to support the ++ Buffalo LS-XHL Series. ++ + endmenu + + endif +diff --git a/arch/arm/mach-kirkwood/Makefile b/arch/arm/mach-kirkwood/Makefile +index 5dcaa81..221980b 100644 +--- a/arch/arm/mach-kirkwood/Makefile ++++ b/arch/arm/mach-kirkwood/Makefile +@@ -18,5 +18,6 @@ obj-$(CONFIG_MACH_D2NET_V2) += d2net_v2-setup.o lacie_v2-common.o + obj-$(CONFIG_MACH_NET2BIG_V2) += netxbig_v2-setup.o lacie_v2-common.o + obj-$(CONFIG_MACH_NET5BIG_V2) += netxbig_v2-setup.o lacie_v2-common.o + obj-$(CONFIG_MACH_T5325) += t5325-setup.o ++obj-$(CONFIG_MACH_LSXHL) += lsxhl-setup.o + + obj-$(CONFIG_CPU_IDLE) += cpuidle.o +diff --git a/arch/arm/mach-kirkwood/lsxhl-setup.c b/arch/arm/mach-kirkwood/lsxhl-setup.c +new file mode 100644 +index 0000000..783d257 +--- /dev/null ++++ b/arch/arm/mach-kirkwood/lsxhl-setup.c +@@ -0,0 +1,313 @@ ++/* ++ * arch/arm/mach-kirkwood/lsxhl-setup.c ++ * ++ * Buffalo LS-XHL Series Setup ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * License version 2. 
This program is licensed "as is" without any ++ * warranty of any kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "common.h" ++#include "mpp.h" ++ ++/***************************************************************************** ++ * 512KB SPI Flash on BOOT Device ++ ****************************************************************************/ ++static struct mtd_partition lsxhl_partitions[] = { ++ { ++ .name = "u-boot", ++ .size = 0x70000, ++ .offset = 0x00000, ++ .mask_flags = MTD_WRITEABLE, ++ }, ++ { ++ .name = "u-boot env", ++ .size = 0x10000, ++ .offset = 0x70000, ++ } ++}; ++ ++static struct flash_platform_data lsxhl_spi_slave_data = { ++ .type = "m25p40", ++ .parts = lsxhl_partitions, ++ .nr_parts = ARRAY_SIZE(lsxhl_partitions), ++}; ++ ++static struct spi_board_info __initdata lsxhl_spi_slave_info[] = { ++ { ++ .modalias = "m25p80", ++ .platform_data = &lsxhl_spi_slave_data, ++ .irq = -1, ++ .max_speed_hz = 20000000, ++ .bus_num = 0, ++ .chip_select = 0, ++ } ++}; ++ ++/***************************************************************************** ++ * Ethernet ++ ****************************************************************************/ ++static struct mv643xx_eth_platform_data lsxhl_ge00_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(0), ++}; ++ ++static struct mv643xx_eth_platform_data lsxhl_ge01_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(8), ++}; ++ ++/***************************************************************************** ++ * SATA ++ ****************************************************************************/ ++static struct mv_sata_platform_data lsxhl_sata_data = { ++ .n_ports = 1, ++}; ++ ++/***************************************************************************** ++ * LEDs attached to GPIO ++ ****************************************************************************/ ++#define LSXHL_GPIO_LED_ALARM 37 ++#define LSXHL_GPIO_LED_INFO 38 ++#define LSXHL_GPIO_LED_PWR 39 ++#define LSXHL_GPIO_LED_FUNC_BLUE 36 ++#define LSXHL_GPIO_LED_FUNC_RED 48 ++ ++static struct gpio_led lsxhl_led_pins[] = { ++ { ++ .name = "alarm:red", ++ .gpio = LSXHL_GPIO_LED_ALARM, ++ .active_low = 1, ++ }, ++ { ++ .name = "info:amber", ++ .gpio = LSXHL_GPIO_LED_INFO, ++ .active_low = 1, ++ }, ++ { ++ .name = "power:blue", ++ .default_trigger = "default-on", ++ .gpio = LSXHL_GPIO_LED_PWR, ++ .active_low = 1, ++ }, ++ { ++ .name = "func:blue:bottom", ++ .gpio = LSXHL_GPIO_LED_FUNC_BLUE, ++ .active_low = 1, ++ }, ++ { ++ .name = "func:red:bottom", ++ .gpio = LSXHL_GPIO_LED_FUNC_RED, ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_led_platform_data lsxhl_led_data = { ++ .leds = lsxhl_led_pins, ++ .num_leds = ARRAY_SIZE(lsxhl_led_pins), ++}; ++ ++static struct platform_device lsxhl_leds = { ++ .name = "leds-gpio", ++ .id = -1, ++ .dev = { ++ .platform_data = &lsxhl_led_data, ++ } ++}; ++ ++/***************************************************************************** ++ * General Setup ++ ****************************************************************************/ ++#define LSXHL_GPIO_HDD_POWER 10 ++#define LSXHL_GPIO_USB_POWER 11 ++ ++/***************************************************************************** ++ * GPIO Attached Keys ++ ****************************************************************************/ ++#define LSXHL_GPIO_KEY_FUNC 41 ++#define LSXHL_GPIO_KEY_AUTOPOWER 42 ++#define 
LSXHL_GPIO_KEY_POWER 43 ++#define LSXHL_SW_POWER 0x00 ++#define LSXHL_SW_AUTOPOWER 0x01 ++#define LSXHL_SW_FUNC 0x02 ++ ++static struct gpio_keys_button lsxhl_buttons[] = { ++ { ++ .type = EV_SW, ++ .code = LSXHL_SW_POWER, ++ .gpio = LSXHL_GPIO_KEY_POWER, ++ .desc = "Power-on Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSXHL_SW_AUTOPOWER, ++ .gpio = LSXHL_GPIO_KEY_AUTOPOWER, ++ .desc = "Power-auto Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSXHL_SW_POWER, ++ .gpio = LSXHL_GPIO_KEY_FUNC, ++ .desc = "Function Button", ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_keys_platform_data lsxhl_button_data = { ++ .buttons = lsxhl_buttons, ++ .nbuttons = ARRAY_SIZE(lsxhl_buttons), ++}; ++ ++static struct platform_device lsxhl_button_device = { ++ .name = "gpio-keys", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lsxhl_button_data, ++ }, ++}; ++ ++/***************************************************************************** ++ * GPIO Fan ++ ****************************************************************************/ ++#define LSXHL_GPIO_FAN_HIGH 18 ++#define LSXHL_GPIO_FAN_LOW 19 ++#define LSXHL_GPIO_FAN_LOCK 40 ++ ++static struct gpio_fan_alarm lsxhl_alarm = { ++ .gpio = LSXHL_GPIO_FAN_LOCK, ++}; ++ ++static struct gpio_fan_speed lsxhl_speeds[] = { ++ { ++ .rpm = 0, ++ .ctrl_val = 3, ++ }, { ++ .rpm = 1500, ++ .ctrl_val = 1, ++ }, { ++ .rpm = 3250, ++ .ctrl_val = 2, ++ }, { ++ .rpm = 5000, ++ .ctrl_val = 0, ++ } ++}; ++ ++static int lsxhl_gpio_list[] = { ++ LSXHL_GPIO_FAN_HIGH, LSXHL_GPIO_FAN_LOW, ++}; ++ ++static struct gpio_fan_platform_data lsxhl_fan_data = { ++ .num_ctrl = ARRAY_SIZE(lsxhl_gpio_list), ++ .ctrl = lsxhl_gpio_list, ++ .alarm = &lsxhl_alarm, ++ .num_speed = ARRAY_SIZE(lsxhl_speeds), ++ .speed = lsxhl_speeds, ++}; ++ ++static struct platform_device lsxhl_fan_device = { ++ .name = "gpio-fan", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lsxhl_fan_data, ++ }, ++}; ++ ++/***************************************************************************** ++ * GPIO Data ++ ****************************************************************************/ ++ ++static unsigned int lsxhl_mpp_config[] __initdata = { ++ MPP10_GPO, /* HDD Power Enable */ ++ MPP11_GPIO, /* USB Vbus Enable */ ++ MPP18_GPO, /* FAN High Enable# */ ++ MPP19_GPO, /* FAN Low Enable# */ ++ MPP36_GPIO, /* Function Blue LED */ ++ MPP37_GPIO, /* Alarm LED */ ++ MPP38_GPIO, /* Info LED */ ++ MPP39_GPIO, /* Power LED */ ++ MPP40_GPIO, /* Fan Lock */ ++ MPP41_GPIO, /* Function Button */ ++ MPP42_GPIO, /* Power Switch */ ++ MPP43_GPIO, /* Power Auto Switch */ ++ MPP48_GPIO, /* Function Red LED */ ++ 0 ++}; ++ ++/***************************************************************************** ++ * LS-XHL specific power off method: reboot ++ ****************************************************************************/ ++/* ++ * On the LS-XHL, the shutdown process is following: ++ * - Userland monitors key events until the power switch goes to off position ++ * - The board reboots ++ * - U-boot starts and goes into an idle mode waiting for the user ++ * to move the switch to ON position ++ * ++ */ ++ ++static void lsxhl_power_off(void) ++{ ++ arm_machine_restart('h', NULL); ++} ++ ++static void __init lsxhl_init(void) ++{ ++ /* ++ * Basic setup. Needs to be called early. ++ */ ++ kirkwood_init(); ++ kirkwood_mpp_conf(lsxhl_mpp_config); ++ ++ /* ++ * Configure peripherals. 
++ */ ++ kirkwood_uart0_init(); ++ kirkwood_ehci_init(); ++ kirkwood_ge00_init(&lsxhl_ge00_data); ++ kirkwood_ge01_init(&lsxhl_ge01_data); ++ kirkwood_sata_init(&lsxhl_sata_data); ++ kirkwood_spi_init(); ++ ++ platform_device_register(&lsxhl_leds); ++ platform_device_register(&lsxhl_button_device); ++ platform_device_register(&lsxhl_fan_device); ++ ++ spi_register_board_info(lsxhl_spi_slave_info, ++ ARRAY_SIZE(lsxhl_spi_slave_info)); ++ ++ /* usb power on */ ++ gpio_set_value(LSXHL_GPIO_USB_POWER, 1); ++ ++ /* register power-off method */ ++ pm_power_off = lsxhl_power_off; ++ ++ pr_info("%s: finished\n", __func__); ++} ++ ++MACHINE_START(LSXHL, "Buffalo Linkstation LS-XHL") ++ .atag_offset = 0x100, ++ .init_machine = lsxhl_init, ++ .map_io = kirkwood_map_io, ++ .init_early = kirkwood_init_early, ++ .init_irq = kirkwood_init_irq, ++ .timer = &kirkwood_timer, ++MACHINE_END +diff --git a/arch/arm/tools/mach-types b/arch/arm/tools/mach-types +index 3b3776d..8acc587 100644 +--- a/arch/arm/tools/mach-types ++++ b/arch/arm/tools/mach-types +@@ -448,6 +448,7 @@ mityomapl138 MACH_MITYOMAPL138 MITYOMAPL138 2650 + guruplug MACH_GURUPLUG GURUPLUG 2659 + spear310 MACH_SPEAR310 SPEAR310 2660 + spear320 MACH_SPEAR320 SPEAR320 2661 ++lsxhl MACH_LSXHL LSXHL 2663 + aquila MACH_AQUILA AQUILA 2676 + sheeva_esata MACH_ESATA_SHEEVAPLUG ESATA_SHEEVAPLUG 2678 + msm7x30_surf MACH_MSM7X30_SURF MSM7X30_SURF 2679 +-- +1.7.2.3 diff --git a/3.2.34/kernel-3.4.0-layer7-2.22.patch b/3.2.34/kernel-3.4.0-layer7-2.22.patch new file mode 100644 index 0000000..736adfe --- /dev/null +++ b/3.2.34/kernel-3.4.0-layer7-2.22.patch @@ -0,0 +1,2132 @@ +--- linux-2.6.28-stock/net/netfilter/Kconfig 2009-01-07 16:05:35.000000000 -0600 ++++ linux-2.6.28/net/netfilter/Kconfig 2009-01-07 16:07:31.000000000 -0600 +@@ -795,6 +795,27 @@ config NETFILTER_XT_MATCH_STATE + + To compile it as a module, choose M here. If unsure, say N. + ++config NETFILTER_XT_MATCH_LAYER7 ++ tristate '"layer7" match support' ++ depends on NETFILTER_XTABLES ++ depends on EXPERIMENTAL && (IP_NF_CONNTRACK || NF_CONNTRACK) ++ depends on NF_CT_ACCT ++ help ++ Say Y if you want to be able to classify connections (and their ++ packets) based on regular expression matching of their application ++ layer data. This is one way to classify applications such as ++ peer-to-peer filesharing systems that do not always use the same ++ port. ++ ++ To compile it as a module, choose M here. If unsure, say N. ++ ++config NETFILTER_XT_MATCH_LAYER7_DEBUG ++ bool 'Layer 7 debugging output' ++ depends on NETFILTER_XT_MATCH_LAYER7 ++ help ++ Say Y to get lots of debugging output. 
++ ++ + config NETFILTER_XT_MATCH_STATISTIC + tristate '"statistic" match support' + depends on NETFILTER_ADVANCED +--- linux-2.6.28-stock/net/netfilter/Makefile 2009-01-07 16:05:35.000000000 -0600 ++++ linux-2.6.28/net/netfilter/Makefile 2009-01-07 16:07:31.000000000 -0600 +@@ -84,6 +84,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) + obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o + obj-$(CONFIG_NETFILTER_XT_MATCH_SOCKET) += xt_socket.o + obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o ++obj-$(CONFIG_NETFILTER_XT_MATCH_LAYER7) += xt_layer7.o + obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o + obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o + obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o +--- linux-2.6.28-stock/net/netfilter/xt_layer7.c 1969-12-31 18:00:00.000000000 -0600 ++++ linux-2.6.28/net/netfilter/xt_layer7.c 2009-01-07 20:47:14.000000000 -0600 +@@ -0,0 +1,666 @@ ++/* ++ Kernel module to match application layer (OSI layer 7) data in connections. ++ ++ http://l7-filter.sf.net ++ ++ (C) 2003-2009 Matthew Strait and Ethan Sommer. ++ ++ This program is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public License ++ as published by the Free Software Foundation; either version ++ 2 of the License, or (at your option) any later version. ++ http://www.gnu.org/licenses/gpl.txt ++ ++ Based on ipt_string.c (C) 2000 Emmanuel Roger , ++ xt_helper.c (C) 2002 Harald Welte and cls_layer7.c (C) 2003 Matthew Strait, ++ Ethan Sommer, Justin Levandoski. ++*/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) ++#include ++#include ++#endif ++#include ++#include ++#include ++#include ++ ++#include "regexp/regexp.c" ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Matthew Strait , Ethan Sommer "); ++MODULE_DESCRIPTION("iptables application layer match module"); ++MODULE_ALIAS("ipt_layer7"); ++MODULE_VERSION("2.21"); ++ ++static int maxdatalen = 2048; // this is the default ++module_param(maxdatalen, int, 0444); ++MODULE_PARM_DESC(maxdatalen, "maximum bytes of data looked at by l7-filter"); ++#ifdef CONFIG_NETFILTER_XT_MATCH_LAYER7_DEBUG ++ #define DPRINTK(format,args...) printk(format,##args) ++#else ++ #define DPRINTK(format,args...) ++#endif ++ ++/* Number of packets whose data we look at. ++This can be modified through /proc/net/layer7_numpackets */ ++static int num_packets = 10; ++ ++static struct pattern_cache { ++ char * regex_string; ++ regexp * pattern; ++ struct pattern_cache * next; ++} * first_pattern_cache = NULL; ++ ++DEFINE_SPINLOCK(l7_lock); ++ ++static int total_acct_packets(struct nf_conn *ct) ++{ ++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 26) ++ BUG_ON(ct == NULL); ++ return (ct->counters[IP_CT_DIR_ORIGINAL].packets + ct->counters[IP_CT_DIR_REPLY].packets); ++#else ++ struct nf_conn_counter *acct; ++ ++ BUG_ON(ct == NULL); ++ acct = nf_conn_acct_find(ct); ++ if (!acct) ++ return 0; ++ return (acct[IP_CT_DIR_ORIGINAL].packets + acct[IP_CT_DIR_REPLY].packets); ++#endif ++} ++ ++#ifdef CONFIG_IP_NF_MATCH_LAYER7_DEBUG ++/* Converts an unfriendly string into a friendly one by ++replacing unprintables with periods and all whitespace with " ". 
*/ ++static char * friendly_print(unsigned char * s) ++{ ++ char * f = kmalloc(strlen(s) + 1, GFP_ATOMIC); ++ int i; ++ ++ if(!f) { ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in " ++ "friendly_print, bailing.\n"); ++ return NULL; ++ } ++ ++ for(i = 0; i < strlen(s); i++){ ++ if(isprint(s[i]) && s[i] < 128) f[i] = s[i]; ++ else if(isspace(s[i])) f[i] = ' '; ++ else f[i] = '.'; ++ } ++ f[i] = '\0'; ++ return f; ++} ++ ++static char dec2hex(int i) ++{ ++ switch (i) { ++ case 0 ... 9: ++ return (i + '0'); ++ break; ++ case 10 ... 15: ++ return (i - 10 + 'a'); ++ break; ++ default: ++ if (net_ratelimit()) ++ printk("layer7: Problem in dec2hex\n"); ++ return '\0'; ++ } ++} ++ ++static char * hex_print(unsigned char * s) ++{ ++ char * g = kmalloc(strlen(s)*3 + 1, GFP_ATOMIC); ++ int i; ++ ++ if(!g) { ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in hex_print, " ++ "bailing.\n"); ++ return NULL; ++ } ++ ++ for(i = 0; i < strlen(s); i++) { ++ g[i*3 ] = dec2hex(s[i]/16); ++ g[i*3 + 1] = dec2hex(s[i]%16); ++ g[i*3 + 2] = ' '; ++ } ++ g[i*3] = '\0'; ++ ++ return g; ++} ++#endif // DEBUG ++ ++/* Use instead of regcomp. As we expect to be seeing the same regexps over and ++over again, it make sense to cache the results. */ ++static regexp * compile_and_cache(const char * regex_string, ++ const char * protocol) ++{ ++ struct pattern_cache * node = first_pattern_cache; ++ struct pattern_cache * last_pattern_cache = first_pattern_cache; ++ struct pattern_cache * tmp; ++ unsigned int len; ++ ++ while (node != NULL) { ++ if (!strcmp(node->regex_string, regex_string)) ++ return node->pattern; ++ ++ last_pattern_cache = node;/* points at the last non-NULL node */ ++ node = node->next; ++ } ++ ++ /* If we reach the end of the list, then we have not yet cached ++ the pattern for this regex. Let's do that now. ++ Be paranoid about running out of memory to avoid list corruption. */ ++ tmp = kmalloc(sizeof(struct pattern_cache), GFP_ATOMIC); ++ ++ if(!tmp) { ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in " ++ "compile_and_cache, bailing.\n"); ++ return NULL; ++ } ++ ++ tmp->regex_string = kmalloc(strlen(regex_string) + 1, GFP_ATOMIC); ++ tmp->pattern = kmalloc(sizeof(struct regexp), GFP_ATOMIC); ++ tmp->next = NULL; ++ ++ if(!tmp->regex_string || !tmp->pattern) { ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in " ++ "compile_and_cache, bailing.\n"); ++ kfree(tmp->regex_string); ++ kfree(tmp->pattern); ++ kfree(tmp); ++ return NULL; ++ } ++ ++ /* Ok. The new node is all ready now. */ ++ node = tmp; ++ ++ if(first_pattern_cache == NULL) /* list is empty */ ++ first_pattern_cache = node; /* make node the beginning */ ++ else ++ last_pattern_cache->next = node; /* attach node to the end */ ++ ++ /* copy the string and compile the regex */ ++ len = strlen(regex_string); ++ DPRINTK("About to compile this: \"%s\"\n", regex_string); ++ node->pattern = regcomp((char *)regex_string, &len); ++ if ( !node->pattern ) { ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: Error compiling regexp " ++ "\"%s\" (%s)\n", ++ regex_string, protocol); ++ /* pattern is now cached as NULL, so we won't try again. 
*/ ++ } ++ ++ strcpy(node->regex_string, regex_string); ++ return node->pattern; ++} ++ ++static int can_handle(const struct sk_buff *skb) ++{ ++ if(!ip_hdr(skb)) /* not IP */ ++ return 0; ++ if(ip_hdr(skb)->protocol != IPPROTO_TCP && ++ ip_hdr(skb)->protocol != IPPROTO_UDP && ++ ip_hdr(skb)->protocol != IPPROTO_ICMP) ++ return 0; ++ return 1; ++} ++ ++/* Returns offset the into the skb->data that the application data starts */ ++static int app_data_offset(const struct sk_buff *skb) ++{ ++ /* In case we are ported somewhere (ebtables?) where ip_hdr(skb) ++ isn't set, this can be gotten from 4*(skb->data[0] & 0x0f) as well. */ ++ int ip_hl = 4*ip_hdr(skb)->ihl; ++ ++ if( ip_hdr(skb)->protocol == IPPROTO_TCP ) { ++ /* 12 == offset into TCP header for the header length field. ++ Can't get this with skb->h.th->doff because the tcphdr ++ struct doesn't get set when routing (this is confirmed to be ++ true in Netfilter as well as QoS.) */ ++ int tcp_hl = 4*(skb->data[ip_hl + 12] >> 4); ++ ++ return ip_hl + tcp_hl; ++ } else if( ip_hdr(skb)->protocol == IPPROTO_UDP ) { ++ return ip_hl + 8; /* UDP header is always 8 bytes */ ++ } else if( ip_hdr(skb)->protocol == IPPROTO_ICMP ) { ++ return ip_hl + 8; /* ICMP header is 8 bytes */ ++ } else { ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: tried to handle unknown " ++ "protocol!\n"); ++ return ip_hl + 8; /* something reasonable */ ++ } ++} ++ ++/* handles whether there's a match when we aren't appending data anymore */ ++static int match_no_append(struct nf_conn * conntrack, ++ struct nf_conn * master_conntrack, ++ enum ip_conntrack_info ctinfo, ++ enum ip_conntrack_info master_ctinfo, ++ const struct xt_layer7_info * info) ++{ ++ /* If we're in here, throw the app data away */ ++ if(master_conntrack->layer7.app_data != NULL) { ++ ++ #ifdef CONFIG_IP_NF_MATCH_LAYER7_DEBUG ++ if(!master_conntrack->layer7.app_proto) { ++ char * f = ++ friendly_print(master_conntrack->layer7.app_data); ++ char * g = ++ hex_print(master_conntrack->layer7.app_data); ++ DPRINTK("\nl7-filter gave up after %d bytes " ++ "(%d packets):\n%s\n", ++ strlen(f), total_acct_packets(master_conntrack), f); ++ kfree(f); ++ DPRINTK("In hex: %s\n", g); ++ kfree(g); ++ } ++ #endif ++ ++ kfree(master_conntrack->layer7.app_data); ++ master_conntrack->layer7.app_data = NULL; /* don't free again */ ++ } ++ ++ if(master_conntrack->layer7.app_proto){ ++ /* Here child connections set their .app_proto (for /proc) */ ++ if(!conntrack->layer7.app_proto) { ++ conntrack->layer7.app_proto = ++ kmalloc(strlen(master_conntrack->layer7.app_proto)+1, ++ GFP_ATOMIC); ++ if(!conntrack->layer7.app_proto){ ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory " ++ "in match_no_append, " ++ "bailing.\n"); ++ return 1; ++ } ++ strcpy(conntrack->layer7.app_proto, ++ master_conntrack->layer7.app_proto); ++ } ++ ++ return (!strcmp(master_conntrack->layer7.app_proto, ++ info->protocol)); ++ } ++ else { ++ /* If not classified, set to "unknown" to distinguish from ++ connections that are still being tested. */ ++ master_conntrack->layer7.app_proto = ++ kmalloc(strlen("unknown")+1, GFP_ATOMIC); ++ if(!master_conntrack->layer7.app_proto){ ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in " ++ "match_no_append, bailing.\n"); ++ return 1; ++ } ++ strcpy(master_conntrack->layer7.app_proto, "unknown"); ++ return 0; ++ } ++} ++ ++/* add the new app data to the conntrack. Return number of bytes added. 
*/ ++static int add_data(struct nf_conn * master_conntrack, ++ char * app_data, int appdatalen) ++{ ++ int length = 0, i; ++ int oldlength = master_conntrack->layer7.app_data_len; ++ ++ /* This is a fix for a race condition by Deti Fliegl. However, I'm not ++ clear on whether the race condition exists or whether this really ++ fixes it. I might just be being dense... Anyway, if it's not really ++ a fix, all it does is waste a very small amount of time. */ ++ if(!master_conntrack->layer7.app_data) return 0; ++ ++ /* Strip nulls. Make everything lower case (our regex lib doesn't ++ do case insensitivity). Add it to the end of the current data. */ ++ for(i = 0; i < maxdatalen-oldlength-1 && ++ i < appdatalen; i++) { ++ if(app_data[i] != '\0') { ++ /* the kernel version of tolower mungs 'upper ascii' */ ++ master_conntrack->layer7.app_data[length+oldlength] = ++ isascii(app_data[i])? ++ tolower(app_data[i]) : app_data[i]; ++ length++; ++ } ++ } ++ ++ master_conntrack->layer7.app_data[length+oldlength] = '\0'; ++ master_conntrack->layer7.app_data_len = length + oldlength; ++ ++ return length; ++} ++ ++/* taken from drivers/video/modedb.c */ ++static int my_atoi(const char *s) ++{ ++ int val = 0; ++ ++ for (;; s++) { ++ switch (*s) { ++ case '0'...'9': ++ val = 10*val+(*s-'0'); ++ break; ++ default: ++ return val; ++ } ++ } ++} ++ ++/* write out num_packets to userland. */ ++static int layer7_read_proc(char* page, char ** start, off_t off, int count, ++ int* eof, void * data) ++{ ++ if(num_packets > 99 && net_ratelimit()) ++ printk(KERN_ERR "layer7: NOT REACHED. num_packets too big\n"); ++ ++ page[0] = num_packets/10 + '0'; ++ page[1] = num_packets%10 + '0'; ++ page[2] = '\n'; ++ page[3] = '\0'; ++ ++ *eof=1; ++ ++ return 3; ++} ++ ++/* Read in num_packets from userland */ ++static int layer7_write_proc(struct file* file, const char* buffer, ++ unsigned long count, void *data) ++{ ++ char * foo = kmalloc(count, GFP_ATOMIC); ++ ++ if(!foo){ ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory, bailing. " ++ "num_packets unchanged.\n"); ++ return count; ++ } ++ ++ if(copy_from_user(foo, buffer, count)) { ++ return -EFAULT; ++ } ++ ++ ++ num_packets = my_atoi(foo); ++ kfree (foo); ++ ++ /* This has an arbitrary limit to make the math easier. I'm lazy. ++ But anyway, 99 is a LOT! If you want more, you're doing it wrong! */ ++ if(num_packets > 99) { ++ printk(KERN_WARNING "layer7: num_packets can't be > 99.\n"); ++ num_packets = 99; ++ } else if(num_packets < 1) { ++ printk(KERN_WARNING "layer7: num_packets can't be < 1.\n"); ++ num_packets = 1; ++ } ++ ++ return count; ++} ++ ++static bool ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28) ++match(const struct sk_buff *skbin, const struct xt_match_param *par) ++#else ++match(const struct sk_buff *skbin, ++ const struct net_device *in, ++ const struct net_device *out, ++ const struct xt_match *match, ++ const void *matchinfo, ++ int offset, ++ unsigned int protoff, ++ bool *hotdrop) ++#endif ++{ ++ /* sidestep const without getting a compiler warning... */ ++ struct sk_buff * skb = (struct sk_buff *)skbin; ++ ++ const struct xt_layer7_info * info = ++ #if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28) ++ par->matchinfo; ++ #else ++ matchinfo; ++ #endif ++ ++ enum ip_conntrack_info master_ctinfo, ctinfo; ++ struct nf_conn *master_conntrack, *conntrack; ++ unsigned char * app_data; ++ unsigned int pattern_result, appdatalen; ++ regexp * comppattern; ++ ++ /* Be paranoid/incompetent - lock the entire match function. 
*/ ++ spin_lock_bh(&l7_lock); ++ ++ if(!can_handle(skb)){ ++ DPRINTK("layer7: This is some protocol I can't handle.\n"); ++ spin_unlock_bh(&l7_lock); ++ return info->invert; ++ } ++ ++ /* Treat parent & all its children together as one connection, except ++ for the purpose of setting conntrack->layer7.app_proto in the actual ++ connection. This makes /proc/net/ip_conntrack more satisfying. */ ++ if(!(conntrack = nf_ct_get(skb, &ctinfo)) || ++ !(master_conntrack=nf_ct_get(skb,&master_ctinfo))){ ++ DPRINTK("layer7: couldn't get conntrack.\n"); ++ spin_unlock_bh(&l7_lock); ++ return info->invert; ++ } ++ ++ /* Try to get a master conntrack (and its master etc) for FTP, etc. */ ++ while (master_ct(master_conntrack) != NULL) ++ master_conntrack = master_ct(master_conntrack); ++ ++ /* if we've classified it or seen too many packets */ ++ if(total_acct_packets(master_conntrack) > num_packets || ++ master_conntrack->layer7.app_proto) { ++ ++ pattern_result = match_no_append(conntrack, master_conntrack, ++ ctinfo, master_ctinfo, info); ++ ++ /* skb->cb[0] == seen. Don't do things twice if there are ++ multiple l7 rules. I'm not sure that using cb for this purpose ++ is correct, even though it says "put your private variables ++ there". But it doesn't look like it is being used for anything ++ else in the skbs that make it here. */ ++ skb->cb[0] = 1; /* marking it seen here's probably irrelevant */ ++ ++ spin_unlock_bh(&l7_lock); ++ return (pattern_result ^ info->invert); ++ } ++ ++ if(skb_is_nonlinear(skb)){ ++ if(skb_linearize(skb) != 0){ ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: failed to linearize " ++ "packet, bailing.\n"); ++ spin_unlock_bh(&l7_lock); ++ return info->invert; ++ } ++ } ++ ++ /* now that the skb is linearized, it's safe to set these. */ ++ app_data = skb->data + app_data_offset(skb); ++ appdatalen = skb_tail_pointer(skb) - app_data; ++ ++ /* the return value gets checked later, when we're ready to use it */ ++ comppattern = compile_and_cache(info->pattern, info->protocol); ++ ++ /* On the first packet of a connection, allocate space for app data */ ++ if(total_acct_packets(master_conntrack) == 1 && !skb->cb[0] && ++ !master_conntrack->layer7.app_data){ ++ master_conntrack->layer7.app_data = ++ kmalloc(maxdatalen, GFP_ATOMIC); ++ if(!master_conntrack->layer7.app_data){ ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in " ++ "match, bailing.\n"); ++ spin_unlock_bh(&l7_lock); ++ return info->invert; ++ } ++ ++ master_conntrack->layer7.app_data[0] = '\0'; ++ } ++ ++ /* Can be here, but unallocated, if numpackets is increased near ++ the beginning of a connection */ ++ if(master_conntrack->layer7.app_data == NULL){ ++ spin_unlock_bh(&l7_lock); ++ return info->invert; /* unmatched */ ++ } ++ ++ if(!skb->cb[0]){ ++ int newbytes; ++ newbytes = add_data(master_conntrack, app_data, appdatalen); ++ ++ if(newbytes == 0) { /* didn't add any data */ ++ skb->cb[0] = 1; ++ /* Didn't match before, not going to match now */ ++ spin_unlock_bh(&l7_lock); ++ return info->invert; ++ } ++ } ++ ++ /* If looking for "unknown", then never match. "Unknown" means that ++ we've given up; we're still trying with these packets. */ ++ if(!strcmp(info->protocol, "unknown")) { ++ pattern_result = 0; ++ /* If looking for "unset", then always match. "Unset" means that we ++ haven't yet classified the connection. 
*/ ++ } else if(!strcmp(info->protocol, "unset")) { ++ pattern_result = 2; ++ DPRINTK("layer7: matched unset: not yet classified " ++ "(%d/%d packets)\n", ++ total_acct_packets(master_conntrack), num_packets); ++ /* If the regexp failed to compile, don't bother running it */ ++ } else if(comppattern && ++ regexec(comppattern, master_conntrack->layer7.app_data)){ ++ DPRINTK("layer7: matched %s\n", info->protocol); ++ pattern_result = 1; ++ } else pattern_result = 0; ++ ++ if(pattern_result == 1) { ++ master_conntrack->layer7.app_proto = ++ kmalloc(strlen(info->protocol)+1, GFP_ATOMIC); ++ if(!master_conntrack->layer7.app_proto){ ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in " ++ "match, bailing.\n"); ++ spin_unlock_bh(&l7_lock); ++ return (pattern_result ^ info->invert); ++ } ++ strcpy(master_conntrack->layer7.app_proto, info->protocol); ++ } else if(pattern_result > 1) { /* cleanup from "unset" */ ++ pattern_result = 1; ++ } ++ ++ /* mark the packet seen */ ++ skb->cb[0] = 1; ++ ++ spin_unlock_bh(&l7_lock); ++ return (pattern_result ^ info->invert); ++} ++ ++// load nf_conntrack_ipv4 ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28) ++static bool check(const struct xt_mtchk_param *par) ++{ ++ if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { ++ printk(KERN_WARNING "can't load conntrack support for " ++ "proto=%d\n", par->match->family); ++#else ++static bool check(const char *tablename, const void *inf, ++ const struct xt_match *match, void *matchinfo, ++ unsigned int hook_mask) ++{ ++ if (nf_ct_l3proto_try_module_get(match->family) < 0) { ++ printk(KERN_WARNING "can't load conntrack support for " ++ "proto=%d\n", match->family); ++#endif ++ return 0; ++ } ++ return 1; ++} ++ ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28) ++ static void destroy(const struct xt_mtdtor_param *par) ++ { ++ nf_ct_l3proto_module_put(par->match->family); ++ } ++#else ++ static void destroy(const struct xt_match *match, void *matchinfo) ++ { ++ nf_ct_l3proto_module_put(match->family); ++ } ++#endif ++ ++static struct xt_match xt_layer7_match[] __read_mostly = { ++{ ++ .name = "layer7", ++ .family = AF_INET, ++ .checkentry = check, ++ .match = match, ++ .destroy = destroy, ++ .matchsize = sizeof(struct xt_layer7_info), ++ .me = THIS_MODULE ++} ++}; ++ ++static void layer7_cleanup_proc(void) ++{ ++ remove_proc_entry("layer7_numpackets", init_net.proc_net); ++} ++ ++/* register the proc file */ ++static void layer7_init_proc(void) ++{ ++ struct proc_dir_entry* entry; ++ entry = create_proc_entry("layer7_numpackets", 0644, init_net.proc_net); ++ entry->read_proc = layer7_read_proc; ++ entry->write_proc = layer7_write_proc; ++} ++ ++static int __init xt_layer7_init(void) ++{ ++ need_conntrack(); ++ ++ layer7_init_proc(); ++ if(maxdatalen < 1) { ++ printk(KERN_WARNING "layer7: maxdatalen can't be < 1, " ++ "using 1\n"); ++ maxdatalen = 1; ++ } ++ /* This is not a hard limit. It's just here to prevent people from ++ bringing their slow machines to a grinding halt. 
*/ ++ else if(maxdatalen > 65536) { ++ printk(KERN_WARNING "layer7: maxdatalen can't be > 65536, " ++ "using 65536\n"); ++ maxdatalen = 65536; ++ } ++ return xt_register_matches(xt_layer7_match, ++ ARRAY_SIZE(xt_layer7_match)); ++} ++ ++static void __exit xt_layer7_fini(void) ++{ ++ layer7_cleanup_proc(); ++ xt_unregister_matches(xt_layer7_match, ARRAY_SIZE(xt_layer7_match)); ++} ++ ++module_init(xt_layer7_init); ++module_exit(xt_layer7_fini); +--- linux-2.6.28-stock/net/netfilter/regexp/regexp.c 1969-12-31 18:00:00.000000000 -0600 ++++ linux-2.6.28/net/netfilter/regexp/regexp.c 2009-01-07 16:07:31.000000000 -0600 +@@ -0,0 +1,1197 @@ ++/* ++ * regcomp and regexec -- regsub and regerror are elsewhere ++ * @(#)regexp.c 1.3 of 18 April 87 ++ * ++ * Copyright (c) 1986 by University of Toronto. ++ * Written by Henry Spencer. Not derived from licensed software. ++ * ++ * Permission is granted to anyone to use this software for any ++ * purpose on any computer system, and to redistribute it freely, ++ * subject to the following restrictions: ++ * ++ * 1. The author is not responsible for the consequences of use of ++ * this software, no matter how awful, even if they arise ++ * from defects in it. ++ * ++ * 2. The origin of this software must not be misrepresented, either ++ * by explicit claim or by omission. ++ * ++ * 3. Altered versions must be plainly marked as such, and must not ++ * be misrepresented as being the original software. ++ * ++ * Beware that some of this code is subtly aware of the way operator ++ * precedence is structured in regular expressions. Serious changes in ++ * regular-expression syntax might require a total rethink. ++ * ++ * This code was modified by Ethan Sommer to work within the kernel ++ * (it now uses kmalloc etc..) ++ * ++ * Modified slightly by Matthew Strait to use more modern C. ++ */ ++ ++#include "regexp.h" ++#include "regmagic.h" ++ ++/* added by ethan and matt. Lets it work in both kernel and user space. ++(So iptables can use it, for instance.) Yea, it goes both ways... */ ++#if __KERNEL__ ++ #define malloc(foo) kmalloc(foo,GFP_ATOMIC) ++#else ++ #define printk(format,args...) printf(format,##args) ++#endif ++ ++void regerror(char * s) ++{ ++ printk("<3>Regexp: %s\n", s); ++ /* NOTREACHED */ ++} ++ ++/* ++ * The "internal use only" fields in regexp.h are present to pass info from ++ * compile to execute that permits the execute phase to run lots faster on ++ * simple cases. They are: ++ * ++ * regstart char that must begin a match; '\0' if none obvious ++ * reganch is the match anchored (at beginning-of-line only)? ++ * regmust string (pointer into program) that match must include, or NULL ++ * regmlen length of regmust string ++ * ++ * Regstart and reganch permit very fast decisions on suitable starting points ++ * for a match, cutting down the work a lot. Regmust permits fast rejection ++ * of lines that cannot possibly match. The regmust tests are costly enough ++ * that regcomp() supplies a regmust only if the r.e. contains something ++ * potentially expensive (at present, the only such thing detected is * or + ++ * at the start of the r.e., which can involve a lot of backup). Regmlen is ++ * supplied because the test in regexec() needs it and regcomp() is computing ++ * it anyway. ++ */ ++ ++/* ++ * Structure for regexp "program". This is essentially a linear encoding ++ * of a nondeterministic finite-state machine (aka syntax charts or ++ * "railroad normal form" in parsing technology). 
Each node is an opcode ++ * plus a "next" pointer, possibly plus an operand. "Next" pointers of ++ * all nodes except BRANCH implement concatenation; a "next" pointer with ++ * a BRANCH on both ends of it is connecting two alternatives. (Here we ++ * have one of the subtle syntax dependencies: an individual BRANCH (as ++ * opposed to a collection of them) is never concatenated with anything ++ * because of operator precedence.) The operand of some types of node is ++ * a literal string; for others, it is a node leading into a sub-FSM. In ++ * particular, the operand of a BRANCH node is the first node of the branch. ++ * (NB this is *not* a tree structure: the tail of the branch connects ++ * to the thing following the set of BRANCHes.) The opcodes are: ++ */ ++ ++/* definition number opnd? meaning */ ++#define END 0 /* no End of program. */ ++#define BOL 1 /* no Match "" at beginning of line. */ ++#define EOL 2 /* no Match "" at end of line. */ ++#define ANY 3 /* no Match any one character. */ ++#define ANYOF 4 /* str Match any character in this string. */ ++#define ANYBUT 5 /* str Match any character not in this string. */ ++#define BRANCH 6 /* node Match this alternative, or the next... */ ++#define BACK 7 /* no Match "", "next" ptr points backward. */ ++#define EXACTLY 8 /* str Match this string. */ ++#define NOTHING 9 /* no Match empty string. */ ++#define STAR 10 /* node Match this (simple) thing 0 or more times. */ ++#define PLUS 11 /* node Match this (simple) thing 1 or more times. */ ++#define OPEN 20 /* no Mark this point in input as start of #n. */ ++ /* OPEN+1 is number 1, etc. */ ++#define CLOSE 30 /* no Analogous to OPEN. */ ++ ++/* ++ * Opcode notes: ++ * ++ * BRANCH The set of branches constituting a single choice are hooked ++ * together with their "next" pointers, since precedence prevents ++ * anything being concatenated to any individual branch. The ++ * "next" pointer of the last BRANCH in a choice points to the ++ * thing following the whole choice. This is also where the ++ * final "next" pointer of each individual branch points; each ++ * branch starts with the operand node of a BRANCH node. ++ * ++ * BACK Normal "next" pointers all implicitly point forward; BACK ++ * exists to make loop structures possible. ++ * ++ * STAR,PLUS '?', and complex '*' and '+', are implemented as circular ++ * BRANCH structures using BACK. Simple cases (one character ++ * per match) are implemented with STAR and PLUS for speed ++ * and to minimize recursive plunges. ++ * ++ * OPEN,CLOSE ...are numbered at compile time. ++ */ ++ ++/* ++ * A node is one char of opcode followed by two chars of "next" pointer. ++ * "Next" pointers are stored as two 8-bit pieces, high order first. The ++ * value is a positive offset from the opcode of the node containing it. ++ * An operand, if any, simply follows the node. (Note that much of the ++ * code generation knows about this implicit relationship.) ++ * ++ * Using two bytes for the "next" pointer is vast overkill for most things, ++ * but allows patterns to get big without disasters. ++ */ ++#define OP(p) (*(p)) ++#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) ++#define OPERAND(p) ((p) + 3) ++ ++/* ++ * See regmagic.h for one further detail of program structure. ++ */ ++ ++ ++/* ++ * Utility definitions. 
++ */ ++#ifndef CHARBITS ++#define UCHARAT(p) ((int)*(unsigned char *)(p)) ++#else ++#define UCHARAT(p) ((int)*(p)&CHARBITS) ++#endif ++ ++#define FAIL(m) { regerror(m); return(NULL); } ++#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') ++#define META "^$.[()|?+*\\" ++ ++/* ++ * Flags to be passed up and down. ++ */ ++#define HASWIDTH 01 /* Known never to match null string. */ ++#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ ++#define SPSTART 04 /* Starts with * or +. */ ++#define WORST 0 /* Worst case. */ ++ ++/* ++ * Global work variables for regcomp(). ++ */ ++struct match_globals { ++char *reginput; /* String-input pointer. */ ++char *regbol; /* Beginning of input, for ^ check. */ ++char **regstartp; /* Pointer to startp array. */ ++char **regendp; /* Ditto for endp. */ ++char *regparse; /* Input-scan pointer. */ ++int regnpar; /* () count. */ ++char regdummy; ++char *regcode; /* Code-emit pointer; ®dummy = don't. */ ++long regsize; /* Code size. */ ++}; ++ ++/* ++ * Forward declarations for regcomp()'s friends. ++ */ ++#ifndef STATIC ++#define STATIC static ++#endif ++STATIC char *reg(struct match_globals *g, int paren,int *flagp); ++STATIC char *regbranch(struct match_globals *g, int *flagp); ++STATIC char *regpiece(struct match_globals *g, int *flagp); ++STATIC char *regatom(struct match_globals *g, int *flagp); ++STATIC char *regnode(struct match_globals *g, char op); ++STATIC char *regnext(struct match_globals *g, char *p); ++STATIC void regc(struct match_globals *g, char b); ++STATIC void reginsert(struct match_globals *g, char op, char *opnd); ++STATIC void regtail(struct match_globals *g, char *p, char *val); ++STATIC void regoptail(struct match_globals *g, char *p, char *val); ++ ++ ++__kernel_size_t my_strcspn(const char *s1,const char *s2) ++{ ++ char *scan1; ++ char *scan2; ++ int count; ++ ++ count = 0; ++ for (scan1 = (char *)s1; *scan1 != '\0'; scan1++) { ++ for (scan2 = (char *)s2; *scan2 != '\0';) /* ++ moved down. */ ++ if (*scan1 == *scan2++) ++ return(count); ++ count++; ++ } ++ return(count); ++} ++ ++/* ++ - regcomp - compile a regular expression into internal code ++ * ++ * We can't allocate space until we know how big the compiled form will be, ++ * but we can't compile it (and thus know how big it is) until we've got a ++ * place to put the code. So we cheat: we compile it twice, once with code ++ * generation turned off and size counting turned on, and once "for real". ++ * This also means that we don't allocate space until we are sure that the ++ * thing really will compile successfully, and we never have to move the ++ * code and thus invalidate pointers into it. (Note that it has to be in ++ * one piece because free() must be able to free it all.) ++ * ++ * Beware that the optimization-preparation code in here knows about some ++ * of the structure of the compiled regexp. ++ */ ++regexp * ++regcomp(char *exp,int *patternsize) ++{ ++ register regexp *r; ++ register char *scan; ++ register char *longest; ++ register int len; ++ int flags; ++ struct match_globals g; ++ ++ /* commented out by ethan ++ extern char *malloc(); ++ */ ++ ++ if (exp == NULL) ++ FAIL("NULL argument"); ++ ++ /* First pass: determine size, legality. */ ++ g.regparse = exp; ++ g.regnpar = 1; ++ g.regsize = 0L; ++ g.regcode = &g.regdummy; ++ regc(&g, MAGIC); ++ if (reg(&g, 0, &flags) == NULL) ++ return(NULL); ++ ++ /* Small enough for pointer-storage convention? */ ++ if (g.regsize >= 32767L) /* Probably could be 65535L. 
*/ ++ FAIL("regexp too big"); ++ ++ /* Allocate space. */ ++ *patternsize=sizeof(regexp) + (unsigned)g.regsize; ++ r = (regexp *)malloc(sizeof(regexp) + (unsigned)g.regsize); ++ if (r == NULL) ++ FAIL("out of space"); ++ ++ /* Second pass: emit code. */ ++ g.regparse = exp; ++ g.regnpar = 1; ++ g.regcode = r->program; ++ regc(&g, MAGIC); ++ if (reg(&g, 0, &flags) == NULL) ++ return(NULL); ++ ++ /* Dig out information for optimizations. */ ++ r->regstart = '\0'; /* Worst-case defaults. */ ++ r->reganch = 0; ++ r->regmust = NULL; ++ r->regmlen = 0; ++ scan = r->program+1; /* First BRANCH. */ ++ if (OP(regnext(&g, scan)) == END) { /* Only one top-level choice. */ ++ scan = OPERAND(scan); ++ ++ /* Starting-point info. */ ++ if (OP(scan) == EXACTLY) ++ r->regstart = *OPERAND(scan); ++ else if (OP(scan) == BOL) ++ r->reganch++; ++ ++ /* ++ * If there's something expensive in the r.e., find the ++ * longest literal string that must appear and make it the ++ * regmust. Resolve ties in favor of later strings, since ++ * the regstart check works with the beginning of the r.e. ++ * and avoiding duplication strengthens checking. Not a ++ * strong reason, but sufficient in the absence of others. ++ */ ++ if (flags&SPSTART) { ++ longest = NULL; ++ len = 0; ++ for (; scan != NULL; scan = regnext(&g, scan)) ++ if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { ++ longest = OPERAND(scan); ++ len = strlen(OPERAND(scan)); ++ } ++ r->regmust = longest; ++ r->regmlen = len; ++ } ++ } ++ ++ return(r); ++} ++ ++/* ++ - reg - regular expression, i.e. main body or parenthesized thing ++ * ++ * Caller must absorb opening parenthesis. ++ * ++ * Combining parenthesis handling with the base level of regular expression ++ * is a trifle forced, but the need to tie the tails of the branches to what ++ * follows makes it hard to avoid. ++ */ ++static char * ++reg(struct match_globals *g, int paren, int *flagp /* Parenthesized? */ ) ++{ ++ register char *ret; ++ register char *br; ++ register char *ender; ++ register int parno = 0; /* 0 makes gcc happy */ ++ int flags; ++ ++ *flagp = HASWIDTH; /* Tentatively. */ ++ ++ /* Make an OPEN node, if parenthesized. */ ++ if (paren) { ++ if (g->regnpar >= NSUBEXP) ++ FAIL("too many ()"); ++ parno = g->regnpar; ++ g->regnpar++; ++ ret = regnode(g, OPEN+parno); ++ } else ++ ret = NULL; ++ ++ /* Pick up the branches, linking them together. */ ++ br = regbranch(g, &flags); ++ if (br == NULL) ++ return(NULL); ++ if (ret != NULL) ++ regtail(g, ret, br); /* OPEN -> first. */ ++ else ++ ret = br; ++ if (!(flags&HASWIDTH)) ++ *flagp &= ~HASWIDTH; ++ *flagp |= flags&SPSTART; ++ while (*g->regparse == '|') { ++ g->regparse++; ++ br = regbranch(g, &flags); ++ if (br == NULL) ++ return(NULL); ++ regtail(g, ret, br); /* BRANCH -> BRANCH. */ ++ if (!(flags&HASWIDTH)) ++ *flagp &= ~HASWIDTH; ++ *flagp |= flags&SPSTART; ++ } ++ ++ /* Make a closing node, and hook it on the end. */ ++ ender = regnode(g, (paren) ? CLOSE+parno : END); ++ regtail(g, ret, ender); ++ ++ /* Hook the tails of the branches to the closing node. */ ++ for (br = ret; br != NULL; br = regnext(g, br)) ++ regoptail(g, br, ender); ++ ++ /* Check for proper termination. */ ++ if (paren && *g->regparse++ != ')') { ++ FAIL("unmatched ()"); ++ } else if (!paren && *g->regparse != '\0') { ++ if (*g->regparse == ')') { ++ FAIL("unmatched ()"); ++ } else ++ FAIL("junk on end"); /* "Can't happen". 
*/ ++ /* NOTREACHED */ ++ } ++ ++ return(ret); ++} ++ ++/* ++ - regbranch - one alternative of an | operator ++ * ++ * Implements the concatenation operator. ++ */ ++static char * ++regbranch(struct match_globals *g, int *flagp) ++{ ++ register char *ret; ++ register char *chain; ++ register char *latest; ++ int flags; ++ ++ *flagp = WORST; /* Tentatively. */ ++ ++ ret = regnode(g, BRANCH); ++ chain = NULL; ++ while (*g->regparse != '\0' && *g->regparse != '|' && *g->regparse != ')') { ++ latest = regpiece(g, &flags); ++ if (latest == NULL) ++ return(NULL); ++ *flagp |= flags&HASWIDTH; ++ if (chain == NULL) /* First piece. */ ++ *flagp |= flags&SPSTART; ++ else ++ regtail(g, chain, latest); ++ chain = latest; ++ } ++ if (chain == NULL) /* Loop ran zero times. */ ++ (void) regnode(g, NOTHING); ++ ++ return(ret); ++} ++ ++/* ++ - regpiece - something followed by possible [*+?] ++ * ++ * Note that the branching code sequences used for ? and the general cases ++ * of * and + are somewhat optimized: they use the same NOTHING node as ++ * both the endmarker for their branch list and the body of the last branch. ++ * It might seem that this node could be dispensed with entirely, but the ++ * endmarker role is not redundant. ++ */ ++static char * ++regpiece(struct match_globals *g, int *flagp) ++{ ++ register char *ret; ++ register char op; ++ register char *next; ++ int flags; ++ ++ ret = regatom(g, &flags); ++ if (ret == NULL) ++ return(NULL); ++ ++ op = *g->regparse; ++ if (!ISMULT(op)) { ++ *flagp = flags; ++ return(ret); ++ } ++ ++ if (!(flags&HASWIDTH) && op != '?') ++ FAIL("*+ operand could be empty"); ++ *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); ++ ++ if (op == '*' && (flags&SIMPLE)) ++ reginsert(g, STAR, ret); ++ else if (op == '*') { ++ /* Emit x* as (x&|), where & means "self". */ ++ reginsert(g, BRANCH, ret); /* Either x */ ++ regoptail(g, ret, regnode(g, BACK)); /* and loop */ ++ regoptail(g, ret, ret); /* back */ ++ regtail(g, ret, regnode(g, BRANCH)); /* or */ ++ regtail(g, ret, regnode(g, NOTHING)); /* null. */ ++ } else if (op == '+' && (flags&SIMPLE)) ++ reginsert(g, PLUS, ret); ++ else if (op == '+') { ++ /* Emit x+ as x(&|), where & means "self". */ ++ next = regnode(g, BRANCH); /* Either */ ++ regtail(g, ret, next); ++ regtail(g, regnode(g, BACK), ret); /* loop back */ ++ regtail(g, next, regnode(g, BRANCH)); /* or */ ++ regtail(g, ret, regnode(g, NOTHING)); /* null. */ ++ } else if (op == '?') { ++ /* Emit x? as (x|) */ ++ reginsert(g, BRANCH, ret); /* Either x */ ++ regtail(g, ret, regnode(g, BRANCH)); /* or */ ++ next = regnode(g, NOTHING); /* null. */ ++ regtail(g, ret, next); ++ regoptail(g, ret, next); ++ } ++ g->regparse++; ++ if (ISMULT(*g->regparse)) ++ FAIL("nested *?+"); ++ ++ return(ret); ++} ++ ++/* ++ - regatom - the lowest level ++ * ++ * Optimization: gobbles an entire sequence of ordinary characters so that ++ * it can turn them into a single node, which is smaller to store and ++ * faster to run. Backslashed characters are exceptions, each becoming a ++ * separate node; the code is simpler that way and it's not worth fixing. ++ */ ++static char * ++regatom(struct match_globals *g, int *flagp) ++{ ++ register char *ret; ++ int flags; ++ ++ *flagp = WORST; /* Tentatively. 
*/ ++ ++ switch (*g->regparse++) { ++ case '^': ++ ret = regnode(g, BOL); ++ break; ++ case '$': ++ ret = regnode(g, EOL); ++ break; ++ case '.': ++ ret = regnode(g, ANY); ++ *flagp |= HASWIDTH|SIMPLE; ++ break; ++ case '[': { ++ register int class; ++ register int classend; ++ ++ if (*g->regparse == '^') { /* Complement of range. */ ++ ret = regnode(g, ANYBUT); ++ g->regparse++; ++ } else ++ ret = regnode(g, ANYOF); ++ if (*g->regparse == ']' || *g->regparse == '-') ++ regc(g, *g->regparse++); ++ while (*g->regparse != '\0' && *g->regparse != ']') { ++ if (*g->regparse == '-') { ++ g->regparse++; ++ if (*g->regparse == ']' || *g->regparse == '\0') ++ regc(g, '-'); ++ else { ++ class = UCHARAT(g->regparse-2)+1; ++ classend = UCHARAT(g->regparse); ++ if (class > classend+1) ++ FAIL("invalid [] range"); ++ for (; class <= classend; class++) ++ regc(g, class); ++ g->regparse++; ++ } ++ } else ++ regc(g, *g->regparse++); ++ } ++ regc(g, '\0'); ++ if (*g->regparse != ']') ++ FAIL("unmatched []"); ++ g->regparse++; ++ *flagp |= HASWIDTH|SIMPLE; ++ } ++ break; ++ case '(': ++ ret = reg(g, 1, &flags); ++ if (ret == NULL) ++ return(NULL); ++ *flagp |= flags&(HASWIDTH|SPSTART); ++ break; ++ case '\0': ++ case '|': ++ case ')': ++ FAIL("internal urp"); /* Supposed to be caught earlier. */ ++ break; ++ case '?': ++ case '+': ++ case '*': ++ FAIL("?+* follows nothing"); ++ break; ++ case '\\': ++ if (*g->regparse == '\0') ++ FAIL("trailing \\"); ++ ret = regnode(g, EXACTLY); ++ regc(g, *g->regparse++); ++ regc(g, '\0'); ++ *flagp |= HASWIDTH|SIMPLE; ++ break; ++ default: { ++ register int len; ++ register char ender; ++ ++ g->regparse--; ++ len = my_strcspn((const char *)g->regparse, (const char *)META); ++ if (len <= 0) ++ FAIL("internal disaster"); ++ ender = *(g->regparse+len); ++ if (len > 1 && ISMULT(ender)) ++ len--; /* Back off clear of ?+* operand. */ ++ *flagp |= HASWIDTH; ++ if (len == 1) ++ *flagp |= SIMPLE; ++ ret = regnode(g, EXACTLY); ++ while (len > 0) { ++ regc(g, *g->regparse++); ++ len--; ++ } ++ regc(g, '\0'); ++ } ++ break; ++ } ++ ++ return(ret); ++} ++ ++/* ++ - regnode - emit a node ++ */ ++static char * /* Location. */ ++regnode(struct match_globals *g, char op) ++{ ++ register char *ret; ++ register char *ptr; ++ ++ ret = g->regcode; ++ if (ret == &g->regdummy) { ++ g->regsize += 3; ++ return(ret); ++ } ++ ++ ptr = ret; ++ *ptr++ = op; ++ *ptr++ = '\0'; /* Null "next" pointer. */ ++ *ptr++ = '\0'; ++ g->regcode = ptr; ++ ++ return(ret); ++} ++ ++/* ++ - regc - emit (if appropriate) a byte of code ++ */ ++static void ++regc(struct match_globals *g, char b) ++{ ++ if (g->regcode != &g->regdummy) ++ *g->regcode++ = b; ++ else ++ g->regsize++; ++} ++ ++/* ++ - reginsert - insert an operator in front of already-emitted operand ++ * ++ * Means relocating the operand. ++ */ ++static void ++reginsert(struct match_globals *g, char op, char* opnd) ++{ ++ register char *src; ++ register char *dst; ++ register char *place; ++ ++ if (g->regcode == &g->regdummy) { ++ g->regsize += 3; ++ return; ++ } ++ ++ src = g->regcode; ++ g->regcode += 3; ++ dst = g->regcode; ++ while (src > opnd) ++ *--dst = *--src; ++ ++ place = opnd; /* Op node, where operand used to be. 
*/ ++ *place++ = op; ++ *place++ = '\0'; ++ *place++ = '\0'; ++} ++ ++/* ++ - regtail - set the next-pointer at the end of a node chain ++ */ ++static void ++regtail(struct match_globals *g, char *p, char *val) ++{ ++ register char *scan; ++ register char *temp; ++ register int offset; ++ ++ if (p == &g->regdummy) ++ return; ++ ++ /* Find last node. */ ++ scan = p; ++ for (;;) { ++ temp = regnext(g, scan); ++ if (temp == NULL) ++ break; ++ scan = temp; ++ } ++ ++ if (OP(scan) == BACK) ++ offset = scan - val; ++ else ++ offset = val - scan; ++ *(scan+1) = (offset>>8)&0377; ++ *(scan+2) = offset&0377; ++} ++ ++/* ++ - regoptail - regtail on operand of first argument; nop if operandless ++ */ ++static void ++regoptail(struct match_globals *g, char *p, char *val) ++{ ++ /* "Operandless" and "op != BRANCH" are synonymous in practice. */ ++ if (p == NULL || p == &g->regdummy || OP(p) != BRANCH) ++ return; ++ regtail(g, OPERAND(p), val); ++} ++ ++/* ++ * regexec and friends ++ */ ++ ++ ++/* ++ * Forwards. ++ */ ++STATIC int regtry(struct match_globals *g, regexp *prog, char *string); ++STATIC int regmatch(struct match_globals *g, char *prog); ++STATIC int regrepeat(struct match_globals *g, char *p); ++ ++#ifdef DEBUG ++int regnarrate = 0; ++void regdump(); ++STATIC char *regprop(char *op); ++#endif ++ ++/* ++ - regexec - match a regexp against a string ++ */ ++int ++regexec(regexp *prog, char *string) ++{ ++ register char *s; ++ struct match_globals g; ++ ++ /* Be paranoid... */ ++ if (prog == NULL || string == NULL) { ++ printk("<3>Regexp: NULL parameter\n"); ++ return(0); ++ } ++ ++ /* Check validity of program. */ ++ if (UCHARAT(prog->program) != MAGIC) { ++ printk("<3>Regexp: corrupted program\n"); ++ return(0); ++ } ++ ++ /* If there is a "must appear" string, look for it. */ ++ if (prog->regmust != NULL) { ++ s = string; ++ while ((s = strchr(s, prog->regmust[0])) != NULL) { ++ if (strncmp(s, prog->regmust, prog->regmlen) == 0) ++ break; /* Found it. */ ++ s++; ++ } ++ if (s == NULL) /* Not present. */ ++ return(0); ++ } ++ ++ /* Mark beginning of line for ^ . */ ++ g.regbol = string; ++ ++ /* Simplest case: anchored match need be tried only once. */ ++ if (prog->reganch) ++ return(regtry(&g, prog, string)); ++ ++ /* Messy cases: unanchored match. */ ++ s = string; ++ if (prog->regstart != '\0') ++ /* We know what char it must start with. */ ++ while ((s = strchr(s, prog->regstart)) != NULL) { ++ if (regtry(&g, prog, s)) ++ return(1); ++ s++; ++ } ++ else ++ /* We don't -- general case. */ ++ do { ++ if (regtry(&g, prog, s)) ++ return(1); ++ } while (*s++ != '\0'); ++ ++ /* Failure. */ ++ return(0); ++} ++ ++/* ++ - regtry - try match at specific point ++ */ ++static int /* 0 failure, 1 success */ ++regtry(struct match_globals *g, regexp *prog, char *string) ++{ ++ register int i; ++ register char **sp; ++ register char **ep; ++ ++ g->reginput = string; ++ g->regstartp = prog->startp; ++ g->regendp = prog->endp; ++ ++ sp = prog->startp; ++ ep = prog->endp; ++ for (i = NSUBEXP; i > 0; i--) { ++ *sp++ = NULL; ++ *ep++ = NULL; ++ } ++ if (regmatch(g, prog->program + 1)) { ++ prog->startp[0] = string; ++ prog->endp[0] = g->reginput; ++ return(1); ++ } else ++ return(0); ++} ++ ++/* ++ - regmatch - main matching routine ++ * ++ * Conceptually the strategy is simple: check to see whether the current ++ * node matches, call self recursively to see whether the rest matches, ++ * and then act accordingly. 
In practice we make some effort to avoid ++ * recursion, in particular by going through "ordinary" nodes (that don't ++ * need to know whether the rest of the match failed) by a loop instead of ++ * by recursion. ++ */ ++static int /* 0 failure, 1 success */ ++regmatch(struct match_globals *g, char *prog) ++{ ++ register char *scan = prog; /* Current node. */ ++ char *next; /* Next node. */ ++ ++#ifdef DEBUG ++ if (scan != NULL && regnarrate) ++ fprintf(stderr, "%s(\n", regprop(scan)); ++#endif ++ while (scan != NULL) { ++#ifdef DEBUG ++ if (regnarrate) ++ fprintf(stderr, "%s...\n", regprop(scan)); ++#endif ++ next = regnext(g, scan); ++ ++ switch (OP(scan)) { ++ case BOL: ++ if (g->reginput != g->regbol) ++ return(0); ++ break; ++ case EOL: ++ if (*g->reginput != '\0') ++ return(0); ++ break; ++ case ANY: ++ if (*g->reginput == '\0') ++ return(0); ++ g->reginput++; ++ break; ++ case EXACTLY: { ++ register int len; ++ register char *opnd; ++ ++ opnd = OPERAND(scan); ++ /* Inline the first character, for speed. */ ++ if (*opnd != *g->reginput) ++ return(0); ++ len = strlen(opnd); ++ if (len > 1 && strncmp(opnd, g->reginput, len) != 0) ++ return(0); ++ g->reginput += len; ++ } ++ break; ++ case ANYOF: ++ if (*g->reginput == '\0' || strchr(OPERAND(scan), *g->reginput) == NULL) ++ return(0); ++ g->reginput++; ++ break; ++ case ANYBUT: ++ if (*g->reginput == '\0' || strchr(OPERAND(scan), *g->reginput) != NULL) ++ return(0); ++ g->reginput++; ++ break; ++ case NOTHING: ++ case BACK: ++ break; ++ case OPEN+1: ++ case OPEN+2: ++ case OPEN+3: ++ case OPEN+4: ++ case OPEN+5: ++ case OPEN+6: ++ case OPEN+7: ++ case OPEN+8: ++ case OPEN+9: { ++ register int no; ++ register char *save; ++ ++ no = OP(scan) - OPEN; ++ save = g->reginput; ++ ++ if (regmatch(g, next)) { ++ /* ++ * Don't set startp if some later ++ * invocation of the same parentheses ++ * already has. ++ */ ++ if (g->regstartp[no] == NULL) ++ g->regstartp[no] = save; ++ return(1); ++ } else ++ return(0); ++ } ++ break; ++ case CLOSE+1: ++ case CLOSE+2: ++ case CLOSE+3: ++ case CLOSE+4: ++ case CLOSE+5: ++ case CLOSE+6: ++ case CLOSE+7: ++ case CLOSE+8: ++ case CLOSE+9: ++ { ++ register int no; ++ register char *save; ++ ++ no = OP(scan) - CLOSE; ++ save = g->reginput; ++ ++ if (regmatch(g, next)) { ++ /* ++ * Don't set endp if some later ++ * invocation of the same parentheses ++ * already has. ++ */ ++ if (g->regendp[no] == NULL) ++ g->regendp[no] = save; ++ return(1); ++ } else ++ return(0); ++ } ++ break; ++ case BRANCH: { ++ register char *save; ++ ++ if (OP(next) != BRANCH) /* No choice. */ ++ next = OPERAND(scan); /* Avoid recursion. */ ++ else { ++ do { ++ save = g->reginput; ++ if (regmatch(g, OPERAND(scan))) ++ return(1); ++ g->reginput = save; ++ scan = regnext(g, scan); ++ } while (scan != NULL && OP(scan) == BRANCH); ++ return(0); ++ /* NOTREACHED */ ++ } ++ } ++ break; ++ case STAR: ++ case PLUS: { ++ register char nextch; ++ register int no; ++ register char *save; ++ register int min; ++ ++ /* ++ * Lookahead to avoid useless match attempts ++ * when we know what character comes next. ++ */ ++ nextch = '\0'; ++ if (OP(next) == EXACTLY) ++ nextch = *OPERAND(next); ++ min = (OP(scan) == STAR) ? 0 : 1; ++ save = g->reginput; ++ no = regrepeat(g, OPERAND(scan)); ++ while (no >= min) { ++ /* If it could work, try it. */ ++ if (nextch == '\0' || *g->reginput == nextch) ++ if (regmatch(g, next)) ++ return(1); ++ /* Couldn't or didn't -- back up. 
*/ ++ no--; ++ g->reginput = save + no; ++ } ++ return(0); ++ } ++ break; ++ case END: ++ return(1); /* Success! */ ++ break; ++ default: ++ printk("<3>Regexp: memory corruption\n"); ++ return(0); ++ break; ++ } ++ ++ scan = next; ++ } ++ ++ /* ++ * We get here only if there's trouble -- normally "case END" is ++ * the terminating point. ++ */ ++ printk("<3>Regexp: corrupted pointers\n"); ++ return(0); ++} ++ ++/* ++ - regrepeat - repeatedly match something simple, report how many ++ */ ++static int ++regrepeat(struct match_globals *g, char *p) ++{ ++ register int count = 0; ++ register char *scan; ++ register char *opnd; ++ ++ scan = g->reginput; ++ opnd = OPERAND(p); ++ switch (OP(p)) { ++ case ANY: ++ count = strlen(scan); ++ scan += count; ++ break; ++ case EXACTLY: ++ while (*opnd == *scan) { ++ count++; ++ scan++; ++ } ++ break; ++ case ANYOF: ++ while (*scan != '\0' && strchr(opnd, *scan) != NULL) { ++ count++; ++ scan++; ++ } ++ break; ++ case ANYBUT: ++ while (*scan != '\0' && strchr(opnd, *scan) == NULL) { ++ count++; ++ scan++; ++ } ++ break; ++ default: /* Oh dear. Called inappropriately. */ ++ printk("<3>Regexp: internal foulup\n"); ++ count = 0; /* Best compromise. */ ++ break; ++ } ++ g->reginput = scan; ++ ++ return(count); ++} ++ ++/* ++ - regnext - dig the "next" pointer out of a node ++ */ ++static char* ++regnext(struct match_globals *g, char *p) ++{ ++ register int offset; ++ ++ if (p == &g->regdummy) ++ return(NULL); ++ ++ offset = NEXT(p); ++ if (offset == 0) ++ return(NULL); ++ ++ if (OP(p) == BACK) ++ return(p-offset); ++ else ++ return(p+offset); ++} ++ ++#ifdef DEBUG ++ ++STATIC char *regprop(); ++ ++/* ++ - regdump - dump a regexp onto stdout in vaguely comprehensible form ++ */ ++void ++regdump(regexp *r) ++{ ++ register char *s; ++ register char op = EXACTLY; /* Arbitrary non-END op. */ ++ register char *next; ++ /* extern char *strchr(); */ ++ ++ ++ s = r->program + 1; ++ while (op != END) { /* While that wasn't END last time... */ ++ op = OP(s); ++ printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ ++ next = regnext(s); ++ if (next == NULL) /* Next ptr. */ ++ printf("(0)"); ++ else ++ printf("(%d)", (s-r->program)+(next-s)); ++ s += 3; ++ if (op == ANYOF || op == ANYBUT || op == EXACTLY) { ++ /* Literal string, where present. */ ++ while (*s != '\0') { ++ putchar(*s); ++ s++; ++ } ++ s++; ++ } ++ putchar('\n'); ++ } ++ ++ /* Header fields of interest. 
*/ ++ if (r->regstart != '\0') ++ printf("start `%c' ", r->regstart); ++ if (r->reganch) ++ printf("anchored "); ++ if (r->regmust != NULL) ++ printf("must have \"%s\"", r->regmust); ++ printf("\n"); ++} ++ ++/* ++ - regprop - printable representation of opcode ++ */ ++static char * ++regprop(char *op) ++{ ++#define BUFLEN 50 ++ register char *p; ++ static char buf[BUFLEN]; ++ ++ strcpy(buf, ":"); ++ ++ switch (OP(op)) { ++ case BOL: ++ p = "BOL"; ++ break; ++ case EOL: ++ p = "EOL"; ++ break; ++ case ANY: ++ p = "ANY"; ++ break; ++ case ANYOF: ++ p = "ANYOF"; ++ break; ++ case ANYBUT: ++ p = "ANYBUT"; ++ break; ++ case BRANCH: ++ p = "BRANCH"; ++ break; ++ case EXACTLY: ++ p = "EXACTLY"; ++ break; ++ case NOTHING: ++ p = "NOTHING"; ++ break; ++ case BACK: ++ p = "BACK"; ++ break; ++ case END: ++ p = "END"; ++ break; ++ case OPEN+1: ++ case OPEN+2: ++ case OPEN+3: ++ case OPEN+4: ++ case OPEN+5: ++ case OPEN+6: ++ case OPEN+7: ++ case OPEN+8: ++ case OPEN+9: ++ snprintf(buf+strlen(buf),BUFLEN-strlen(buf), "OPEN%d", OP(op)-OPEN); ++ p = NULL; ++ break; ++ case CLOSE+1: ++ case CLOSE+2: ++ case CLOSE+3: ++ case CLOSE+4: ++ case CLOSE+5: ++ case CLOSE+6: ++ case CLOSE+7: ++ case CLOSE+8: ++ case CLOSE+9: ++ snprintf(buf+strlen(buf),BUFLEN-strlen(buf), "CLOSE%d", OP(op)-CLOSE); ++ p = NULL; ++ break; ++ case STAR: ++ p = "STAR"; ++ break; ++ case PLUS: ++ p = "PLUS"; ++ break; ++ default: ++ printk("<3>Regexp: corrupted opcode\n"); ++ break; ++ } ++ if (p != NULL) ++ strncat(buf, p, BUFLEN-strlen(buf)); ++ return(buf); ++} ++#endif ++ ++ +--- linux-2.6.28-stock/net/netfilter/regexp/regexp.h 1969-12-31 18:00:00.000000000 -0600 ++++ linux-2.6.28/net/netfilter/regexp/regexp.h 2009-01-07 16:07:31.000000000 -0600 +@@ -0,0 +1,41 @@ ++/* ++ * Definitions etc. for regexp(3) routines. ++ * ++ * Caveat: this is V8 regexp(3) [actually, a reimplementation thereof], ++ * not the System V one. ++ */ ++ ++#ifndef REGEXP_H ++#define REGEXP_H ++ ++ ++/* ++http://www.opensource.apple.com/darwinsource/10.3/expect-1/expect/expect.h , ++which contains a version of this library, says: ++ ++ * ++ * NSUBEXP must be at least 10, and no greater than 117 or the parser ++ * will not work properly. ++ * ++ ++However, it looks rather like this library is limited to 10. If you think ++otherwise, let us know. ++*/ ++ ++#define NSUBEXP 10 ++typedef struct regexp { ++ char *startp[NSUBEXP]; ++ char *endp[NSUBEXP]; ++ char regstart; /* Internal use only. */ ++ char reganch; /* Internal use only. */ ++ char *regmust; /* Internal use only. */ ++ int regmlen; /* Internal use only. */ ++ char program[1]; /* Unwarranted chumminess with compiler. */ ++} regexp; ++ ++regexp * regcomp(char *exp, int *patternsize); ++int regexec(regexp *prog, char *string); ++void regsub(regexp *prog, char *source, char *dest); ++void regerror(char *s); ++ ++#endif +--- linux-2.6.28-stock/net/netfilter/regexp/regmagic.h 1969-12-31 18:00:00.000000000 -0600 ++++ linux-2.6.28/net/netfilter/regexp/regmagic.h 2009-01-07 16:07:31.000000000 -0600 +@@ -0,0 +1,5 @@ ++/* ++ * The first byte of the regexp internal "program" is actually this magic ++ * number; the start node begins in the second byte. ++ */ ++#define MAGIC 0234 +--- linux-2.6.28-stock/net/netfilter/regexp/regsub.c 1969-12-31 18:00:00.000000000 -0600 ++++ linux-2.6.28/net/netfilter/regexp/regsub.c 2009-01-07 16:07:31.000000000 -0600 +@@ -0,0 +1,95 @@ ++/* ++ * regsub ++ * @(#)regsub.c 1.3 of 2 April 86 ++ * ++ * Copyright (c) 1986 by University of Toronto. ++ * Written by Henry Spencer. 
Not derived from licensed software. ++ * ++ * Permission is granted to anyone to use this software for any ++ * purpose on any computer system, and to redistribute it freely, ++ * subject to the following restrictions: ++ * ++ * 1. The author is not responsible for the consequences of use of ++ * this software, no matter how awful, even if they arise ++ * from defects in it. ++ * ++ * 2. The origin of this software must not be misrepresented, either ++ * by explicit claim or by omission. ++ * ++ * 3. Altered versions must be plainly marked as such, and must not ++ * be misrepresented as being the original software. ++ * ++ * ++ * This code was modified by Ethan Sommer to work within the kernel ++ * (it now uses kmalloc etc..) ++ * ++ */ ++#include "regexp.h" ++#include "regmagic.h" ++#include ++ ++ ++#ifndef CHARBITS ++#define UCHARAT(p) ((int)*(unsigned char *)(p)) ++#else ++#define UCHARAT(p) ((int)*(p)&CHARBITS) ++#endif ++ ++#if 0 ++//void regerror(char * s) ++//{ ++// printk("regexp(3): %s", s); ++// /* NOTREACHED */ ++//} ++#endif ++ ++/* ++ - regsub - perform substitutions after a regexp match ++ */ ++void ++regsub(regexp * prog, char * source, char * dest) ++{ ++ register char *src; ++ register char *dst; ++ register char c; ++ register int no; ++ register int len; ++ ++ /* Not necessary and gcc doesn't like it -MLS */ ++ /*extern char *strncpy();*/ ++ ++ if (prog == NULL || source == NULL || dest == NULL) { ++ regerror("NULL parm to regsub"); ++ return; ++ } ++ if (UCHARAT(prog->program) != MAGIC) { ++ regerror("damaged regexp fed to regsub"); ++ return; ++ } ++ ++ src = source; ++ dst = dest; ++ while ((c = *src++) != '\0') { ++ if (c == '&') ++ no = 0; ++ else if (c == '\\' && '0' <= *src && *src <= '9') ++ no = *src++ - '0'; ++ else ++ no = -1; ++ ++ if (no < 0) { /* Ordinary character. */ ++ if (c == '\\' && (*src == '\\' || *src == '&')) ++ c = *src++; ++ *dst++ = c; ++ } else if (prog->startp[no] != NULL && prog->endp[no] != NULL) { ++ len = prog->endp[no] - prog->startp[no]; ++ (void) strncpy(dst, prog->startp[no], len); ++ dst += len; ++ if (len != 0 && *(dst-1) == '\0') { /* strncpy hit NUL. */ ++ regerror("damaged match string"); ++ return; ++ } ++ } ++ } ++ *dst++ = '\0'; ++} +--- linux-2.6.28-stock/net/netfilter/nf_conntrack_core.c 2009-01-07 16:05:35.000000000 -0600 ++++ linux-2.6.28/net/netfilter/nf_conntrack_core.c 2009-01-07 16:07:31.000000000 -0600 +@@ -201,6 +201,14 @@ destroy_conntrack(struct nf_conntrack *n + * too. */ + nf_ct_remove_expectations(ct); + ++ #if defined(CONFIG_NETFILTER_XT_MATCH_LAYER7) || defined(CONFIG_NETFILTER_XT_MATCH_LAYER7_MODULE) ++ if(ct->layer7.app_proto) ++ kfree(ct->layer7.app_proto); ++ if(ct->layer7.app_data) ++ kfree(ct->layer7.app_data); ++ #endif ++ ++ + /* We overload first tuple to link into unconfirmed list. 
*/ + if (!nf_ct_is_confirmed(ct)) { + BUG_ON(hlist_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode)); +--- linux-2.6.28-stock/net/netfilter/nf_conntrack_standalone.c 2009-01-07 16:05:35.000000000 -0600 ++++ linux-2.6.28/net/netfilter/nf_conntrack_standalone.c 2009-01-07 16:07:31.000000000 -0600 +@@ -165,6 +165,12 @@ static int ct_seq_show(struct seq_file * + return -ENOSPC; + #endif + ++#if defined(CONFIG_NETFILTER_XT_MATCH_LAYER7) || defined(CONFIG_NETFILTER_XT_MATCH_LAYER7_MODULE) ++ if(ct->layer7.app_proto && ++ seq_printf(s, "l7proto=%s ", ct->layer7.app_proto)) ++ return -ENOSPC; ++#endif ++ + if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) + return -ENOSPC; + +--- linux-2.6.28-stock/include/net/netfilter/nf_conntrack.h 2009-01-07 16:05:30.000000000 -0600 ++++ linux-2.6.28/include/net/netfilter/nf_conntrack.h 2009-01-07 16:07:31.000000000 -0600 +@@ -134,6 +134,22 @@ + struct net *ct_net; + #endif + ++#if defined(CONFIG_NETFILTER_XT_MATCH_LAYER7) || \ ++ defined(CONFIG_NETFILTER_XT_MATCH_LAYER7_MODULE) ++ struct { ++ /* ++ * e.g. "http". NULL before decision. "unknown" after decision ++ * if no match. ++ */ ++ char *app_proto; ++ /* ++ * application layer data so far. NULL after match decision. ++ */ ++ char *app_data; ++ unsigned int app_data_len; ++ } layer7; ++#endif ++ + /* Storage reserved for other modules, must be the last member */ + union nf_conntrack_proto proto; + }; +--- linux-2.6.28-stock/include/linux/netfilter/xt_layer7.h 1969-12-31 18:00:00.000000000 -0600 ++++ linux-2.6.28/include/linux/netfilter/xt_layer7.h 2009-01-07 16:07:31.000000000 -0600 +@@ -0,0 +1,13 @@ ++#ifndef _XT_LAYER7_H ++#define _XT_LAYER7_H ++ ++#define MAX_PATTERN_LEN 8192 ++#define MAX_PROTOCOL_LEN 256 ++ ++struct xt_layer7_info { ++ char protocol[MAX_PROTOCOL_LEN]; ++ char pattern[MAX_PATTERN_LEN]; ++ u_int8_t invert; ++}; ++ ++#endif /* _XT_LAYER7_H */ diff --git a/3.2.34/kirkwood-jumbo-frame.patch b/3.2.34/kirkwood-jumbo-frame.patch new file mode 100644 index 0000000..fdbc5b1 --- /dev/null +++ b/3.2.34/kirkwood-jumbo-frame.patch @@ -0,0 +1,135 @@ +kirkwood and dove have a smaller FIFO than other "orion" SoCs. This +needs to be taken into account otherwise people using things like jumbo frames +will get into some troubles. + +As a side note, this patch is an updated version of a patch sent some years +ago: http://lists.infradead.org/pipermail/linux-arm-kernel/2010-June/017320.html +which seems to have been lost. 
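[Editor's note] For quick reference, the interface change made below is small: orion_ge00_init() and orion_ge01_init() gain a tx_csum_limit argument that is simply stored into the mv643xx_eth platform data (orion_ge00_shared_data.tx_csum_limit). Kirkwood passes 1600 to reflect its smaller FIFO; the other callers (dove, orion5x, mv78xx0) pass 0. A condensed view of the resulting prototype and one updated call site, taken from the hunks that follow:

	/* New fifth argument; the value is handed through to the
	 * mv643xx_eth driver via orion_ge00_shared_data.tx_csum_limit. */
	void __init orion_ge00_init(struct mv643xx_eth_platform_data *eth_data,
				    unsigned long mapbase,
				    unsigned long irq,
				    unsigned long irq_err,
				    unsigned int tx_csum_limit);

	/* Kirkwood call site: 1600 bytes, matching its smaller FIFO. */
	orion_ge00_init(eth_data,
			GE00_PHYS_BASE, IRQ_KIRKWOOD_GE00_SUM,
			IRQ_KIRKWOOD_GE00_ERR, 1600);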
+ +Signed-off-by: Arnaud Patard + +Index: alunn/arch/arm/mach-dove/common.c +=================================================================== +--- alunn.orig/arch/arm/mach-dove/common.c 2012-07-20 09:14:45.000000000 +0200 ++++ alunn/arch/arm/mach-dove/common.c 2012-07-20 17:51:38.872925518 +0200 +@@ -102,7 +102,7 @@ void __init dove_ehci1_init(void) + void __init dove_ge00_init(struct mv643xx_eth_platform_data *eth_data) + { + orion_ge00_init(eth_data, DOVE_GE00_PHYS_BASE, +- IRQ_DOVE_GE00_SUM, IRQ_DOVE_GE00_ERR); ++ IRQ_DOVE_GE00_SUM, IRQ_DOVE_GE00_ERR, 0); + } + + /***************************************************************************** +Index: alunn/arch/arm/mach-kirkwood/common.c +=================================================================== +--- alunn.orig/arch/arm/mach-kirkwood/common.c 2012-07-20 09:14:46.000000000 +0200 ++++ alunn/arch/arm/mach-kirkwood/common.c 2012-07-20 17:51:03.104927094 +0200 +@@ -301,7 +301,7 @@ void __init kirkwood_ge00_init(struct mv + { + orion_ge00_init(eth_data, + GE00_PHYS_BASE, IRQ_KIRKWOOD_GE00_SUM, +- IRQ_KIRKWOOD_GE00_ERR); ++ IRQ_KIRKWOOD_GE00_ERR, 1600); + /* The interface forgets the MAC address assigned by u-boot if + the clock is turned off, so claim the clk now. */ + clk_prepare_enable(ge0); +@@ -315,7 +315,7 @@ void __init kirkwood_ge01_init(struct mv + { + orion_ge01_init(eth_data, + GE01_PHYS_BASE, IRQ_KIRKWOOD_GE01_SUM, +- IRQ_KIRKWOOD_GE01_ERR); ++ IRQ_KIRKWOOD_GE01_ERR, 1600); + clk_prepare_enable(ge1); + } + +Index: alunn/arch/arm/mach-mv78xx0/common.c +=================================================================== +--- alunn.orig/arch/arm/mach-mv78xx0/common.c 2012-07-20 09:14:46.000000000 +0200 ++++ alunn/arch/arm/mach-mv78xx0/common.c 2012-07-20 17:50:26.712928695 +0200 +@@ -213,7 +213,7 @@ void __init mv78xx0_ge00_init(struct mv6 + { + orion_ge00_init(eth_data, + GE00_PHYS_BASE, IRQ_MV78XX0_GE00_SUM, +- IRQ_MV78XX0_GE_ERR); ++ IRQ_MV78XX0_GE_ERR, 0); + } + + +@@ -224,7 +224,7 @@ void __init mv78xx0_ge01_init(struct mv6 + { + orion_ge01_init(eth_data, + GE01_PHYS_BASE, IRQ_MV78XX0_GE01_SUM, +- NO_IRQ); ++ NO_IRQ, 0); + } + + +Index: alunn/arch/arm/mach-orion5x/common.c +=================================================================== +--- alunn.orig/arch/arm/mach-orion5x/common.c 2012-07-20 09:14:46.000000000 +0200 ++++ alunn/arch/arm/mach-orion5x/common.c 2012-07-20 17:50:26.744928692 +0200 +@@ -109,7 +109,7 @@ void __init orion5x_eth_init(struct mv64 + { + orion_ge00_init(eth_data, + ORION5X_ETH_PHYS_BASE, IRQ_ORION5X_ETH_SUM, +- IRQ_ORION5X_ETH_ERR); ++ IRQ_ORION5X_ETH_ERR, 0); + } + + +Index: alunn/arch/arm/plat-orion/common.c +=================================================================== +--- alunn.orig/arch/arm/plat-orion/common.c 2012-07-20 09:14:46.000000000 +0200 ++++ alunn/arch/arm/plat-orion/common.c 2012-07-20 17:50:26.756928690 +0200 +@@ -291,10 +291,12 @@ static struct platform_device orion_ge00 + void __init orion_ge00_init(struct mv643xx_eth_platform_data *eth_data, + unsigned long mapbase, + unsigned long irq, +- unsigned long irq_err) ++ unsigned long irq_err, ++ unsigned int tx_csum_limit) + { + fill_resources(&orion_ge00_shared, orion_ge00_shared_resources, + mapbase + 0x2000, SZ_16K - 1, irq_err); ++ orion_ge00_shared_data.tx_csum_limit = tx_csum_limit; + ge_complete(&orion_ge00_shared_data, + orion_ge00_resources, irq, &orion_ge00_shared, + eth_data, &orion_ge00); +@@ -343,10 +345,12 @@ static struct platform_device orion_ge01 + void __init orion_ge01_init(struct 
mv643xx_eth_platform_data *eth_data, + unsigned long mapbase, + unsigned long irq, +- unsigned long irq_err) ++ unsigned long irq_err, ++ unsigned int tx_csum_limit) + { + fill_resources(&orion_ge01_shared, orion_ge01_shared_resources, + mapbase + 0x2000, SZ_16K - 1, irq_err); ++ orion_ge01_shared_data.tx_csum_limit = tx_csum_limit; + ge_complete(&orion_ge01_shared_data, + orion_ge01_resources, irq, &orion_ge01_shared, + eth_data, &orion_ge01); +Index: alunn/arch/arm/plat-orion/include/plat/common.h +=================================================================== +--- alunn.orig/arch/arm/plat-orion/include/plat/common.h 2012-07-20 09:14:46.000000000 +0200 ++++ alunn/arch/arm/plat-orion/include/plat/common.h 2012-07-20 17:50:26.772928691 +0200 +@@ -39,12 +39,14 @@ void __init orion_rtc_init(unsigned long + void __init orion_ge00_init(struct mv643xx_eth_platform_data *eth_data, + unsigned long mapbase, + unsigned long irq, +- unsigned long irq_err); ++ unsigned long irq_err, ++ unsigned int tx_csum_limit); + + void __init orion_ge01_init(struct mv643xx_eth_platform_data *eth_data, + unsigned long mapbase, + unsigned long irq, +- unsigned long irq_err); ++ unsigned long irq_err, ++ unsigned int tx_csum_limit); + + void __init orion_ge10_init(struct mv643xx_eth_platform_data *eth_data, + unsigned long mapbase, + + diff --git a/3.2.34/linux-2.6-defaults-fat-utf8.patch b/3.2.34/linux-2.6-defaults-fat-utf8.patch new file mode 100644 index 0000000..0d40fd3 --- /dev/null +++ b/3.2.34/linux-2.6-defaults-fat-utf8.patch @@ -0,0 +1,15 @@ + +https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=181963 + +--- linux-2.6.15.noarch/fs/fat/inode.c~ 2006-02-20 23:20:12.000000000 -0500 ++++ linux-2.6.15.noarch/fs/fat/inode.c 2006-02-20 23:21:42.000000000 -0500 +@@ -952,7 +952,8 @@ static int parse_options(char *options, + opts->shortname = 0; + opts->name_check = 'n'; + opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0; +- opts->utf8 = opts->unicode_xlate = 0; ++ opts->utf8 = 1; ++ opts->unicode_xlate = 0; + opts->numtail = 1; + opts->nocase = 0; + *debug = 0; diff --git a/3.2.34/linux-2.6-x86-tune-generic.patch b/3.2.34/linux-2.6-x86-tune-generic.patch new file mode 100644 index 0000000..7a7c76e --- /dev/null +++ b/3.2.34/linux-2.6-x86-tune-generic.patch @@ -0,0 +1,13 @@ +* Optimise for today's CPUs. + +--- linux-2.6/arch/x86/Makefile_32.cpu 2006-01-09 11:39:04.000000000 -0500 ++++ linux-2.6/arch/x86/Makefile_32.cpu 2006-01-09 11:39:36.000000000 -0500 +@@ -15,7 +15,7 @@ cflags-$(CONFIG_M486) += -march=i486 + cflags-$(CONFIG_M586) += -march=i586 + cflags-$(CONFIG_M586TSC) += -march=i586 + cflags-$(CONFIG_M586MMX) += -march=pentium-mmx +-cflags-$(CONFIG_M686) += -march=i686 ++cflags-$(CONFIG_M686) += -march=i686 $(call tune,generic) + cflags-$(CONFIG_MPENTIUMII) += -march=i686 $(call tune,pentium2) + cflags-$(CONFIG_MPENTIUMIII) += -march=i686 $(call tune,pentium3) + cflags-$(CONFIG_MPENTIUMM) += -march=i686 $(call tune,pentium3) diff --git a/3.2.34/linux-3.2-e2c-0.4.58.patch b/3.2.34/linux-3.2-e2c-0.4.58.patch new file mode 100644 index 0000000..9f8b37f --- /dev/null +++ b/3.2.34/linux-3.2-e2c-0.4.58.patch @@ -0,0 +1,7807 @@ +--- linux-3.2-rc5/fs/ext2/ChangeLog.e2compr-26port 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/ext2/ChangeLog.e2compr-26port 2011-12-13 14:22:47.822975235 +0100 +@@ -0,0 +1,439 @@ ++ ++e2compr - Released under the GPL V 2 license. ++ ++ ++Installation: ++============= ++ ++1. gunzip: ++ > gunzip linux-3.1-rc3-e2c-0.4.58.patch.gz ++ ++2. 
change to you kernel directory ++ ++3. make clean: ++ > make clean ++ ++3. patch: ++ > patch -p1 < ../patch/to/patch/linux-3.1-rc3-e2c-0.4.58.patch ++ ++ see if any rejects occured: ++ > find | grep .rej ++ ++ WARNING: All rejects must be fixed manually! ++ ++4. config: ++ > make oldconfig ++ > make menuconfig ++ Now enable at least the ext2-compression feature: ++ Filesystems: ++ <*> Second extended fs support ++ [ ] Ext2 extended attributes ++ [ ] Ext2 execute in place support ++ [*] Ext2 file compression (DANGEROUS) ++ Ext2 file compression options ---> ++ ++5. make: ++ > make ++ ++ ++Building a patch: ++================= ++ ++files.txt: ++ ++fs/ext2/ChangeLog.e2compr-26port ++Documentation/filesystems/e2compress.txt ++fs/ext2/Readme.e2compr ++fs/Kconfig ++include/linux/ext2_fs_c.h ++fs/ext2/Makefile ++fs/ext2/compress.c ++fs/ext2/e2zlib.c ++fs/ext2/adler32.c ++fs/ext2/super.c ++fs/ext2/ialloc.c ++fs/ext2/balloc.c ++fs/ext2/inode.c ++fs/ext2/file.c ++fs/ext2/ioctl.c ++fs/ext2/ext2.h ++include/linux/ext2_fs.h ++fs/fcntl.c ++mm/truncate.c ++mm/swapfile.c ++mm/filemap.c ++mm/page_alloc.c ++ ++ ++cat files.txt | xargs -n1 -I '{}' diff -pruNbB linux-3.1-rc3/'{}' linux-3.1-rc3-e2c/'{}' > ./linux-3.1-e2c-0.4.58.patch ++ ++ ++Changelog: ++========== ++ ++25 August 2011 ++ Matthias Winkler ++ * released version 0.4.58 for kernel 3.1 ++ * file.c: i_alloc_sem was removed. I am not sure if only holding i_mutex ++ will be enough. See http://patchwork.ozlabs.org/patch/101859/. ++ In ext2_file_write() I replaced: ++ ++ mutex_lock(&inode->i_mutex); ++ - down_read(&inode->i_alloc_sem); ++ + atomic_inc(&inode->i_dio_count); ++ ++ - up_read(&inode->i_alloc_sem); ++ + inode_dio_done(inode); ++ mutex_unlock(&inode->i_mutex); ++ ++ The main prupose of i_dio_count is blocking vmtruncate_range() ++ as long as the i_dio_count is greater than 0. In other words, ++ all direct io must be completed before truncating is allowed. ++ ++ * file.c: generic_osync_inode was removed from mm - added functionality to ++ file.c as ex_generic_osync_inode() ++ * file.c: changed: &inode_lock to &inode->i_lock ++ * ext2_warning() replaced by ext2_msg() ++ * compress.c: vfs_dq_init(inode) replaced by dquot_initialize(inode) ++ * compress.c: ext2_truncate(inode) replaced by ++ ext2_truncate_blocks(inode, inode->i_size) which looks like ++ exactly the same! ++ * inode.c: dentry->d_lock now seems to need ++ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED) held. ++ * compress.c, inode.c: added might_schedule() before wait_on_buffer() ++ statements to assure we are not atomic at this point. ++ * truncate.c: removed patch from memory.c and moved it to truncate.c ++ as surrounding kernel code also moved there. vmtruncate() was ++ split in truncate_setsize() and truncate_pagecache() with kernel 3.1 ++ ++ ++10 August 2009 ++ Matthias Winkler ++ * released version 0.4.58 ++ * merged assert.h and debug.h into ext2_fs_c.h ++ * merged NDEBUG into EXT2_COMPR_DEBUG ++ * disabled adler cheksums on "read" if not defined EXT2_COMPR_DEBUG. ++ * merged none.c into compress.c ++ * inserted multiple defines "CONFIG_EXT2_COMPRESS" to allow disabling ++ of ext2compression with patched sources. ++ * re-inserted EXPORT_SYMBOL(__pagevec_free) to support ext2 as module ++ ++05 August 2009 ++ Matthias Winkler ++ * released version 0.4.57 ++ * ported to kernel 2.6.30: ++ inode.c: after fix of generic ext2 ext2_get_blocks() needed to remove bforget. 
++ * integrated SMP from version 0.4.56 ++ * per CPU one separate read and one separate write working area ++ * removed all external compression codecs ++ * removed "verify compression" (never helped to find a bug anyway) ++ * Lindent'ed all source and header files ++ ++01 August 2008 ++ Matthias Winkler ++ * released version 0.4.55 ++ * complete code cleanup ++ * changed policy to ALWAYS_LOCKING pages in do_generic_mapping_read() ++ => completely removed PG_Compr-Flag now! ++ ++31 July 2008 ++ Matthias Winkler ++ * released version 0.4.54 ++ * fixes rare himem bug: only occures if page > cluster in inode.c/readpage() ++ * fixes rare readpage bug in mm/filemap.c/do_generic_mapping_read(): ++ PG_Compr flags dissallow reading a page while de/compressing. ++ Setting and unsetting it requires the page lock, with one exception ++ do_generic_mapping_read() in filemap.c. This is done because of performance ++ reasons. Anyway, a simultaneous call of do_generic_mapping_read() for the SAME ++ page might break the PG_Compr-Mimic. ++ ++ Solutions: Always lock any page before reading OR second(n-th) call of ++ do_generic_mapping_read() busy waits until first is done. ++ Default is busy wait now, ALWAYS_LOCK implemented as option via define. ++ ++25 June 2008 ++ Matthias Winkler ++ * released version 0.4.53 ++ * fixes himem bug: unmapped block in ext2_decompress_cluster() ++ * fixes bdev bug: ext2_get_block() must be called for every block ++ which cause ooops because of bdev == NULL. ext2_get_block() will ++ set the correct bdev and the correct blocknumber of the block. ++ ++ NEVER assign bdev manually, because the blocknumber might be random then: ++ "block->b_bdev = something" (DON'T!) ++ ++ ALWAYS use: ++ if (!buffer_mapped(block)) || (block->b_bdev == NULL) ++ ext2_get_block() ++ ++ Bdev bug is closely related to file holes (empty block in a file). ++ If compressed data will be written to a former hole, then ++ usually ext2_get_block() must be called with create. ++ ext2_get_block( , , , 1 /*create*/). ++ ++ * fixed missing include in xattr.h ++ * EXT2_COMPRBLK might be removed during compression if a cluster ++ doesn't compress. During compression we re-raise EXT2_COMPRBLK ++ flag after every cluster now. ++ * added missing export of __pagevec_free to (mm/page_alloc.c) ++ * deny O_DIRECT access mode after open of a file using fcntl() ++ (in fs/fcntl.c). ++ * file.c: ++ Replaced ext2_filew_write() to use kernels generic ++ do_sync_write(). Writing on compressed files calls ++ ext2_filew_write(): ++ - divide write range into clusters ++ - ext2_decompress_cluster (if needed) ++ - do_sync_write() ++ - ext2_compress_cluster (if needed) ++ * inode.c: ++ ext2_writepage()/ext2_writepages() usually writes back ++ dirty pages of an inode. They reside in the kernels page cache. ++ This pages might e.g. be written/dirtied by a mmap()-ped file. ++ Also generic_file_aio_write() uses ext2_writepage() finally. ++ I don't see how the ext2_writepage() would handle compressed ++ files, so I re-inserted and re-wrote this part of old 2.4 code. ++ Don't know if this code (USE_WRITEPAGE) is needed at all. ++ So I leave it disabled by default. Enabled it might ++ leave compressed files with compression ratio of 100%. ++ Don't use yet! ++ ++17 April 2008 ++ Matthias Winkler ++ * first patch for kernel 2.6.25 released ++ ++20 March 2008 ++ Matthias Winkler ++ * version 0.4.52: EXT2_COMPRESS_WHEN_CLU didn't work. this ++ feature enables compression during file write. 
++ ++15 Oct 2007 ++ Matthias Winkler ++ * First offical Sourceforge release as version 0.4.51 ++ * TODO: figure out what is necessary to enable swap ++ suppport for e2compr again (see mm/swapfile.c). ++ ++27 Sep 2007 ++ Matthias Winkler ++ * System stalled with a lot of I/O during de-compression of ++ USB-Sticks, too. I replaced mark_buffer_dirty ++ with set_buffer_dirty. This achieves that ONLY the buffers ++ and not the pages are marked. Then I write back the ++ buffers with ll_rw_block() at the end of ++ ext2_decompress_cluster() and ext2_decompress_pages(). ++ This should stop flooding the system with dirty pages. ++ Because now every routine waits for its newly dirtied buffers. ++ My system with 128MB of RAM is responding much more better during ++ compression/decompression now. Desompression also seems ++ to be a bit faster. ++ (this change is active with: #ifndef E2C_GENERIC_OSYNC) ++ ++25 Sep 2007 ++ Matthias Winkler ++ * System stalled with a lot of I/O during compression of ++ USB-Sticks. Seems generic_osync_inode() should not be ++ called in ext2_compress_cluster. Therefore I replaced ++ it with ll_rw_block() to write the modified blocks ++ directly back to disk. This gave also a ~100% better ++ performance for compression. ++ ++9 Sep 2007 ++ Matthias Winkler ++ * fixed bdev-bug. this bug appeared primarily when ++ files contained holes. A page with holes, which ++ was dirty caused ext2_get_cluster_blocks [ext2_get_block()] ++ to create ALL blocks of the page, even if there were holes! ++ These allocated hole-blocks weren't set to 0 anywhere and ++ therefore contained invalid data. I changed the ++ code to never allocate these holes. ++ ++ * ext2_truncate() added again to ext2_compress_cluster for ++ uncompressed clusters. Fixes filesize errors reported by ++ "e2fsck -f /dev/..." ++ ++24 Aug 2007 ++ Matthias Winkler ++ ++ Major changes: ++ * completly ported inode->i_mutex ++ ++ * clever CONFIG_GZ_HACK to reject "uncompressable" files ++ (according to their extension) early. The IOCTL in ioctl.c ++ which sets the compression on the file already rejects such ++ extensions now. ++ ++ * new create_empty_buffers_e2c() was necessary, because the ++ "extra"-pages should NOT have a valid i_mapping! Further the ++ buffers needed to be initalized right. ++ ++ * proper block initalization (bdev-bug) in: ++ - create_empty_buffers_e2c() ++ - ext2_get_cluster_blocks ++ ++ * in file.c copied: ++ ...with one single change at ext2_mapping_read in label page_ok: ++ A new Page-Flag (page-flags.h) the so called "PG_compr"-Flag is ++ checked to assure the corresponding page is not under ++ compression/decompression. This was necessary because ++ generic_mapping_read() doesn't lock() the page in ALL cases!!! ++ Otherwise the generic_mapping_read() would have to lock EVERY page ++ in the whole system before returning it.... ++ ++ * Fixed HiMem-Support: Balanced ALL kamp/kunmap calls. Unbalanced ++ functions cause the system to hang at "kmap_himem()" after some ++ time. Can be seen with magic-sysctrl "altgr + prtscr + W". ++ ++ * ext2_decompres_cluster() didn't mark uptodate pages for writeback. ++ Don't know how this method could EVER have worked... ++ ++ * ext2_compress_cluster() caused an always increasing amount of dirty-pages ++ (cat /proc/vmstat) which couldn't be wrote back by sync/umount. ++ I think this was due the ClearPageDirty at the end of ext2_compress_cluster(). 
++ ++ * introduced ext2_get_dcount() to savely determine if a file is really "open" ++ and to abort compression/decompression in such a case. ++ ++ * Removed gzip completely and not working assembler code. Replaced by the ++ kernels built-in zlib, which is pretty the same code... ++ ++ * New kernel configuration interface ++ ++ * Rollback of some unecessary "fixes"... ++ ++ TODO: ++ ++ * HiMem-Support: ++ One might try to use kmap_atomic instead of kamp in ext2_readpage. kmap_atomic ++ doesn't block and might speed up the regular page reading. might. ++ ++20 April 2007 ++ Andreas: ++ ++ * Replaced GZIP with zlib of the kernel because the assembly versions of existing ++ compression modules crashed. ++ ++ * Replaced gzip with the kernel zlib, which is built-in anyway ++ ++ * Initial HiMem-Support. ++ ++ ++06 Mar 2007 ++ ++ Terry Loveall ++ ++ * adapted linux-2.6.10-e2compr-0.4.45-alpha0126.diff to 2.6.18.5 kernel ++ ++ * replaced most instances of down/up(inode->i_sem) with ++ lock/unlock(inode->i_mutex). For exception see file.c, below. ++ ++ * made various printk regularizations to uniquely identify each printk ++ instance. Inserted missing KERN_DEBUG and KERN_WARNING. ++ ++ * compress.c: ++ bug fix: ext2_count_blocks: init head_bh for each iteration. ++ bug fix: ext2_count_blocks: add set clen=ulen for uncompressable clusters. ++ bug fix: ext2_compress_cluster: replacement and inlining of an ++ invalidate_inode_buffers function to keep root filesystem changes ++ uptodate on disk (prevents umounting root file system to update). ++ warning fix: ext2_compress_cluster: various variables initialized. ++ ext2_compress_cluster: removed #ifdef NDEBUG ++ bug fix: ext2_compress_cluster: defined maxclus, calculate and set for: ++ bug fix: ext2_compress_cluster: set filesize for uncompressed clusters. ++ ext2_cleanup_compressed_inode: changed error message to indicate 'Z' ++ flag was caused by trying to un/compress already open file. ++ bug fix: cp to compr dir: Truncate uncompressed files to their ++ uncompressed length, i.e. force kernel to update inode and sb ++ ++ * file.c: ++ removed file->f_error code since f_error no longer in file struct. ++ ext2_file_write: changed down/up i_sem to down_read/up_read i_alloc_sem ++ ++ * inode.c: ++ bug fix: ext2_get_block: restored changed: loop to bforget ++ ++ * ioctl.c: ++ ext2_ioctl: scrubbed 'B' flag on file uncompress. ++ ++ * match[56]86.S: ++ made code dependent on #ifdef CONFIG_REGPARM to compile with either ++ register variable or stack variable parameter passing. ++ ++28 Feb 2005 ++ ++ Yabo Ding , ++ ++ * Corrected page unlocking in inode.c. ++ ++19 Feb 2005 ++ ++ Paul Whittaker ++ ++ * Added corrections le32_to_cpu in critical areas of compress.c ++ * Optimized function exit code in inode.c. ++ ++24 Aug 2004 ++Yabo Ding , ++ ++ compress.c ++* ext2_decompress_pages() ++ The old code cannot reread data from disk to a changed buffers data pointer in 2.6.x. ++ So, I copy memory data(decompressed) to a temporary buffer; ++ Then reread data(compressed) from disk, and copy to head; ++ Then copy back the memory data from temporary buffer. ++ It seems clumsy, but it works well. ++* ext2_compress_cluster() ++ Force write to disk. ++ ++ inode.c ++* ext2_writepage() ++ Delete old code. All directly call block_write_full_page() function. ++ ++* ../Kconfig ++ Change e2compr config as a submenu config ++ ++04 Aug 2004 ++ ++Paul Whittaker ++ ++* compress.c: replaced mark_buffer_dirty(x,y) with mark_buffer_dirty(x). 
I'm ++ still not at all sure that this is sufficient. ++ ++03 Aug 2004 ++ ++Paul Whittaker ++ ++* ../../include/linux/ext2_fs_c.h: added missing prototypes for ext2_iLZRW3A(), ++ ext2_wLZRW3A(), ext2_rLZRW3A(). ++ ++02 Aug 2004 ++ ++Paul Whittaker ++ ++* ../../mm/page_alloc.c: added EXPORT_SYMBOL(__pagevec_free). ++ ++* ../../include/linux/pagemap.h, ../../mm/filemap.c: removed inline from ++ __grab_cache_page() declarations, added EXPORT_SYMBOL(__grab_cache_page). ++ ++* ../../include/linux/mm.h, ../../mm/filemap.c: removed inline from ++ page_waitqueue() declarations, added EXPORT_SYMBOL(page_waitqueue). ++ ++* bzip2/{lib_bzip_d,lib_bzip_e}.c, {gzip,lzo,lzrw3a,lzv1}/e2compr*.c: ++ replaced MOD_INC_USE_COUNT and MOD_DEC_USE_COUNT with try_module_get() ++ and module_put() to avoid deprecation and safety warnings. ++ ++* lzrw3a/lzrw3a.c: added (UBYTE *) casts to avoid compiler warnings. ++ ++* compress.c, inode.c: incorporated Yabo's changes, correcting mistakes in ++ ext2_readpages() in inode.c. ++ ++* removed printks for ext2_discard_prealloc from file.c and inode.c (not ++ needed now that this problem has been resolved). ++ ++2.6.5 -> 2.6.7 updates: ++ ++* ../../mm/filemap.c: rewrote CONFIG_EXT2_COMPRESS hunk for 2.6.7. ++ ++* compress.c, file.c: use mapping_mapped(), since mapping->i_mmap has changed ++ and mapping->i_mmap_shared no longer exists. ++ ++* inode.c: page->count becomes page->_count. +--- linux-3.2-rc5/Documentation/filesystems/e2compress.txt 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2-rc5-e2c/Documentation/filesystems/e2compress.txt 2011-12-13 14:22:47.824975303 +0100 +@@ -0,0 +1,116 @@ ++Transparent compression for ext2 filesystem ++=========================================== ++ ++What this document is. ++---------------------- ++This document is intended to explain how e2compress has been implemented/ported ++in kernel 2.4. It also gives the status of the current work. You need to have e2compress ++knowledge (i.e. to know how e2compress works, from a general point of view). ++ ++What this document is not. ++-------------------------- ++This document is not a full explanation of how e2compress works. For this, ++there are other documents such as the fs/ext2/Readme.e2compr file for the technical ++point of view; a user manual can be found at . ++That site is also a place where you will find much information about e2compress ++development for kernel 2.4, tools, manuals and so on. ++ ++ ++Introduction ++============ ++ ++This is a first adaptation of e2compress for kernel 2.4. The work has been done ++by Alcatel (Alcatel Business Systems - R&D) at Illkirch. It was started ++from the latest patch provided by Peter Moulder for kernel 2.2, ++i.e. e2compr-0.4.39-patch-2.2.18. ++It is fully compatible with the previous version. ++Below you will first find some explanations of the choices made during ++development, and then the status of the current work from a functional point of ++view. ++ ++ ++Development ++=========== ++ ++As with previous patches, the most interesting work happens when reading in ext2_readpage ++and when writing in ext2_writepage and ext2_file_write. ++In the 2.2 kernel, compression occurs on clusters of blocks. So when reading ++or writing a part of a file, we first have to compute the cluster on which the I/O ++occurs, then we have to get every buffer of the cluster, uncompress the data if ++needed, and then reading/writing happens "as for normal files". ++In 2.4 kernels, I/O occurs through the page cache: i.e. when reading/writing to a ++part of the file, first the corresponding page is obtained, then we get the needed ++buffers, which point to the page; this means that to keep the same behaviour as in 2.2, ++we have to use the notion of a cluster of pages. To get every buffer of a cluster, ++we first get every page of the cluster, then get the buffers of every page... ++ ++So, things happen as follows: ++ ++ext2_readpage ++------------- ++If the data corresponding to the page are in a compressed cluster, this function performs ++more work: instead of reading one page, it reads the whole "cluster of pages". ++In any case, we have to read all the compressed buffers. Once we have got all buffers ++of the cluster, uncompressed (at least a part of) the data, and located the part of ++the uncompressed data which corresponds to the requested page, there is not much more ++work in also reading (i.e. doing some memcpy) the other pages belonging to this ++cluster. ++So, the first read of the first page of the cluster takes quite a bit longer, but then ++every page of the cluster is uptodate in the cache. ++ ++ext2_writepage ++-------------- ++An overhead has been added for pages belonging to a compressed cluster. ++In fact, if the cluster is still compressed on the disk, we can't directly write the ++page (which contains uncompressed data) into the middle of a compressed cluster. ++So, we first have to uncompress the whole cluster on the disk, and then we can write the ++new data of the dirty page(s). ++ ++ext2_file_write ++--------------- ++This replaces `generic_file_write' when the e2compress option is activated. ++It is a copy of `generic_file_write'. The main difference is that instead of looping ++page by page as in `generic_file_write', we loop on clusters of pages. ++In each loop: ++ * we compute the cluster to which the beginning of the data (to be written) belongs. ++ * then, we get all pages of the cluster. ++ * If the cluster is a compressed one, we read all pages and uncompress it. ++ Otherwise, we perform a `prepare_write' (as in generic_file_write). ++ * We copy the data into each page from user space, ++ * Call `commit_write' on dirty pages. ++ * When reaching the end of the cluster, we compress it (as in 2.2). ++ ++Note: Another implementation could have been to keep generic_file_write, and add an overhead ++to `ext2_prepare_write' and `ext2_commit_write'; on the first access to a page of a compressed ++cluster, the whole cluster would be uncompressed (i.e. all pages of the cluster would be read and ++uncompressed in `ext2_prepare_write') and when committing the last page of the cluster, ++compression would occur... ++ ++ext2_open_file ++-------------- ++In the 2.4.16 kernel, this function has been added to handle the case of files opened for ++"direct IO". Direct IO is not supported on compressed files, so opening a file this way ++is forbidden. ++ ++Other places in ext2 ++-------------------- ++Other changes occur as in 2.2 for managing the compression flags of files and the specific ++`COMPRESSED_BLK_ADDR' address for compressed blocks. ++So please refer to the existing documentation for 2.2 about this topic. ++ ++Status ++====== ++Today (middle of December 2001), e2compress on kernel 2.4.16 has been tested on the i386 ++architecture, and has been used successfully by tens of people in the department for some weeks. ++It is fully functional on ix86 and fully compatible with the 2.2 version of e2compress. ++It should work on other architectures, but has NOT been tested there. ++Please note the following: ++ * No performance tests have been done. ++ * I don't claim that the code is optimized (it is probably not, but I hope that ++ "gurus" will not find it too bad). ++So, I think I can say that there is no known "big" bug or "blocking" bug. ++ ++Some strange things have been observed in very limiting cases, i.e. when memory is overloaded. ++ ++ ++As usual, this e2compress comes without warranty, use it at your own risk, etc... +--- linux-3.2-rc5/fs/ext2/Readme.e2compr 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/ext2/Readme.e2compr 2011-12-13 14:22:47.825975345 +0100 +@@ -0,0 +1,511 @@ ++ ++ 0. Introduction ++ ~~~~~~~~~~~~~~~ ++ ++This file gives some technical information on e2compr and how it's ++implemented. ++ ++More general information on e2compr can be found at ++http://e2compr.sourceforge.net/. ++ ++The first couple of sections of this document are written for those ++who have no interest in the source code but just want to know enough ++to be able to predict and understand e2compr behaviour and its ++implications. ++ ++Section 3 describes the e2compr-specific ext2 attributes for a file ++(i.e. chattr things). ++ ++Section 4 describes the e2compr ioctls from the point of view of a ++user-mode C programmer. ++ ++Section 5 gives more detail about the file format on disk. ++ ++Section 6 gives details on what's written where, i.e. a map of e2compr ++code in the kernel. ++ ++ ++Authorship: section 2 is written mainly by Antoine; the remainder is ++written by Peter. ++ ++Questions should be sent to the e2compr mailing list, ++e2compr-misc@lists.sourceforge.net, or to the current maintainers, ++bothie@users.sourceforge.net and whitpa@users.sourceforge.net. ++ ++ ++ 1. The idea ++ ~~~~~~~~~~~ ++ ++See section `E2compr implementation' in the main e2compr texinfo ++documentation for an introduction to how e2compr works. (Type ++`info "(e2compr)Implementation"' at the shell prompt.) It was ++originally written as part of the file you're now reading. ++ ++ ++ 2. More details ++ ~~~~~~~~~~~~~~~ ++ ++Every compressed file stores its cluster size in the inode structure ++(in the ext2 attribute flags field). ++This (the cluster size) is the most important information: once we ++know the cluster size, we can convert a block number into a cluster ++number, get the cluster the block belongs to, and then get the block. ++The inode's flags field also keeps the algorithm that is used to compress data ++written to the file. ++ ++(The algorithm that was used to compress a given ++cluster is stored in the cluster head near the beginning of the ++compressed data. This may differ from the current algorithm ++identified in the inode, which is only used to determine which ++algorithm to use at the time clusters are written.) ++ ++The algorithm id and the cluster size are stored in the i_flags field ++(thus reducing the number of possible flags). We also create some new ++flags: the COMPRBLK flag tells if there is at least one compressed ++cluster in the file, and the ECOMPR flag indicates that an error (related ++to compression) occurred while reading from or writing to this file. ++If it is set, the file becomes read-only. (In previous releases, you ++were denied even read access to the file unless you set the NOCOMPR ++flag. There might be some benefit in returning to the old behaviour ++if decompressing erroneous data can cause an OOPS, but I think it ++would be better to correct the decompressors. Others may disagree, ++pointing out that it costs CPU time to check for incorrect data.)
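For a quick way to see these flags from user space, the standard EXT2_IOC_GETFLAGS ioctl (the same one chattr/lsattr use) is enough. Below is a minimal sketch, not part of the patch itself, assuming an e2compr-patched kernel and a <linux/ext2_fs.h> that exposes EXT2_IOC_GETFLAGS and the EXT2_*_FL bits:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ext2_fs.h>   /* assumption: e2compr-patched kernel headers are visible */

int main(int argc, char **argv)
{
        long flags = 0;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s FILE\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || ioctl(fd, EXT2_IOC_GETFLAGS, &flags) < 0) {
                perror(argv[1]);
                return 1;
        }
        /* Letters match the lsattr column in section 3 below. */
        printf("c (compress new writes)      : %s\n", (flags & EXT2_COMPR_FL)    ? "set" : "clear");
        printf("B (has compressed clusters)  : %s\n", (flags & EXT2_COMPRBLK_FL) ? "set" : "clear");
        printf("X (raw access requested)     : %s\n", (flags & EXT2_NOCOMPR_FL)  ? "set" : "clear");
        printf("E (compression error)        : %s\n", (flags & EXT2_ECOMPR_FL)   ? "set" : "clear");
        close(fd);
        return 0;
}

Note that on a kernel with e2compr compiled in, the cluster-size and method bits are masked out of the GETFLAGS result (see section 4), so only the attribute flags are visible this way.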
++ ++Beside the information stored into the inode, each cluster holds some ++data. Here is the cluster_head structure for e2compr-0.4: ++ ++struct ext2_cluster_head { ++ __u16 magic; /* == EXT2_COMPRESS_MAGIC_04X. */ ++ __u8 method; /* compression method id. */ ++ __u8 holemap_nbytes; /* length of holemap[] array */ ++ __u32 checksum; /* adler32 checksum. Checksum covers all fields ++ below this one, and the compressed data. */ ++ __u32 ulen; /* size of uncompressed data */ ++ __u32 clen; /* size of compressed data (excluding cluster head) */ ++ __u8 holemap[0]; /* bitmap describing where to put holes. */ ++}; ++ ++The `magic' field is a magic number. It is used to detect filesystem ++corruption, and can also be used for data recovery purposes. (The ++e2compress program for e2compr-0.3 does this.) ++ ++The `checksum' field contains an Adler-32 checksum on the fields below ++it in the struct and the compressed data. Its purpose is to protect ++us from buffer overruns caused by corrupted data. ++ ++The `ulen' field says how many bytes are stored in the cluster, when ++uncompressed. ++ ++The `clen' field says how many bytes are held in the cluster, when ++compressed. ++ ++The `method' ++field identifies the algorithm that was used to compress the cluster ++(this id will be used to uncompress the cluster, not the one stored ++into the inode that will be used only to compress a new cluster). ++ ++The variable-length `holemap' array says where to put hole blocks when ++decompressing data. The `holemap_nbytes' field gives the length of ++this array. Iff holemap_nbytes is zero then there are no holes (other ++than at the end of the cluster, as determined by ulen versus cluster ++size). ++ ++The compressed data immediately follows the holemap array (with no ++padding before it). ++ ++ ++Compressing a cluster is done in the following way: We first get every ++block in the cluster and compute the bitmap. We then compress the ++non-hole data, and store back the compressed data into the existing ++blocks. Unused blocks are then freed. ++ ++Decompressing a cluster is done in the following way: We get the ++cluster head and retrieve the bitmap. Missing blocks are allocated and ++put where the bitmap says, and then compressed data is decompressed and ++stored back into the blocks. ++ ++ ++Reading from a compressed cluster is really easy: get the blocks, ++decompress them into a working area, and get the bytes we want from ++the working area. Writing to a compressed cluster is done by first ++decompressing the cluster, and then write to it, as if it were a ++normal file. The file is then marked so that the cluster will be ++recompressed later. [pjm: Do we decompress the cluster even if it's ++to be entirely written over?] ++ ++In the current version, compression really occurs only when the inode ++is put (which in turn only occurs when no processes have the file ++open). This may change. ++ ++ ++ 3. Ext2 file attributes ++ ~~~~~~~~~~~~~~~~~~~~~~~ ++ ++Attribute Lsattr Meaning ++~~~~~~~~~ ~~~~~~ ~~~~~~~ ++EXT2_SECRM_FL s Secure deletion (not yet implemented) ++EXT2_UNRM_FL u Undelete-able. (Not yet implemented.) ++EXT2_COMPR_FL c Future writes to this file should be compressed. ++ (Clearing this flag decompresses the file if it ++ is a regular file and there is space to do so; ++ see the e2compr FAQ for details.) ++EXT2_SYNC_FL S Synchronous updates. (As far as I know, this is ++ not yet fully implemented.) ++EXT2_IMMUTABLE_FL i Immutable file. ++EXT2_APPEND_FL a Writes to file may only append. 
++EXT2_NODUMP_FL d Not a candidate for backup with dump(8). ++EXT2_NOATIME_FL A No access time updates. ++EXT2_DIRTY_FL Z De/compression is yet to happen. Read the ++ source for exact meaning. ++EXT2_COMPRBLK_FL B File contains one or more compressed clusters. ++EXT2_NOCOMPR_FL X Access raw compressed data. This isn't really ++ supported at the moment; user-space access is ++ yet to be worked out for 0.4. ++EXT2_ECOMPR_FL E Compression error associated with this file ++EXT2_BTREE_FL I B-tree indexed directory (seemingly not yet implemented) ++EXT2_RESERVED_FL - (reserved for ext2 lib) ++ ++See the chattr(1) man page for more verbose descriptions of the ++non-e2compr flags. ++ ++ ++ 4. Ioctls available ++ ~~~~~~~~~~~~~~~~~~~ ++ ++ In brief ++ ~~~~~~~~ ++ ++Action Ioctl To kernel From kernel ++~~~~~~ ~~~~~ ~~~~~~~~~ ~~~~~~~~~~~ ++Get cluster bit EXT2_IOC_GETCLUSTERBIT Cluster num 1 or 0 (cmp,uncmp) ++Recognize compressed Cluster num - ++ EXT2_IOC_RECOGNIZE_COMPRESSED ++Get algorithm EXT2_IOC_GETCOMPRMETHOD - Id ++Set algorithm EXT2_IOC_SETCOMPRMETHOD Id - ++Get cluster size EXT2_IOC_GETCLUSTERSIZE - Cluster size ++Set cluster size EXT2_IOC_SETCLUSTERSIZE Cluster size - ++Get attributes EXT2_IOC_GETFLAGS - Flags ++Set attributes EXT2_IOC_SETFLAGS Flags - ++Get block size FIGETBSZ - Block size ++ ++#include to use any of these ioctls, except FIGETBSZ, ++which requires . ++ ++To find out what errors can be returned by these ioctls, read ++fs/ext2/ioctl.c (for all of the above ioctls except FIGETBSZ) or ++fs/ioctl.c (for FIGETBSZ). ++ ++ ++ Setting or testing a cluster bit ++ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++[Note: user-space access to compression details are yet to be worked out, ++so this section may not be accurate.] ++ ++EXT2_IOC_GETCLUSTERBIT sets *arg to 1 if the specified cluster (0 for first ++cluster, 1 for second, etc.) is stored in compressed form. ++ ++To make the kernel consider a certain cluster to be compressed (after ++you've done the compression yourself, in user space), use ++EXT2_IOC_RECOGNIZE_COMPRESSED. This ioctl checks the validity of the ++cluster's data, then marks it as compressed (if valid). This ioctl ++requires special priveleges, because if the compressed data is not ++valid then it may be possible to crash the system (due to buffer ++overruns). ++ ++ ++ Setting or getting the compression algorithm ++ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++EXT2_IOC_SETCOMPRMETHOD sets the default compression method (stored in ++the inode). This is the compression method that is used for future ++writes. In the current version of e2compr [accurate at 0.4.36], this ++does not cause a change to how ++existing clusters are stored, except when the compression method ++changes from `none' to something else, in which case the kernel ++attempts to compress ,all currently-uncompressed clusters` using the ++new algorithm. It is an error to use this ioctl on a file without the ++compressed attribute. ++ ++EXT2_IOC_GETCOMPRMETHOD sets *arg to the current compression method. ++ ++In either case, Id is one of: EXT2_DEFER_METH, EXT2_LZV1_METH, ++EXT2_AUTO_METH, EXT2_NEVER_METH, EXT2_BZIP2_METH, EXT2_LZO1X_1_METH, ++EXT2_LZRW3A_METH (deprecated), EXT2_GZIP1_METH, EXT2_GZIP2_METH, ..., ++EXT2_GZIP9_METH. ++ ++ ++ Setting or getting the cluster size ++ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++EXT2_IOC_SETCLUSTERSIZE sets the cluster size to the value of *arg. 
++This ioctl fails if there are already compressed clusters in the file ++(as determined by checking the EXT2_COMPRBLK_FL attribute). ++ ++EXT2_IOC_GETCLUSTERSIZE sets *arg to the current cluster size. ++Surprisingly, this ioctl succeeds even if the EXT2_COMPR_FL attribute ++is clear. (Maybe this will change in future, since the result is ++meaningless.) ++ ++In either case, the size is one of {4, 8, 16, 32}, and represents the ++number of blocks per cluster. To convert to or from a number of ++bytes, use the FIGETBSZ ioctl. ++ ++ ++ Setting or getting the ext2 file attributes ++ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++These ioctls (EXT2_IOC_GETFLAGS and EXT2_IOC_SETFLAGS) are not ++e2compr-specific, but some attributes are e2compr-specific. ++ ++*arg consists of the set of attributes for that file OR'ed together. ++E.g. a value of (EXT2_COMPR_FL | EXT2_COMPRBLK_FL | EXT2_NODUMP_FL) ++for a regular file means that the file contains one or more compressed ++clusters, and should not be backed up when using dump(8). ++ ++See section 3 for a description of the various attributes. ++ ++Note that although the compression method and cluster size are ++physically stored in the flags field on disk this information is ++masked out (i.e. set to zero) for GETFLAGS if the kernel has e2compr compiled in. ++If the kernel does not have e2compr compiled in, then this information ++is not masked out. See section 5 for how the cluster size and ++compression method is stored if you wish to work with ,kernels without ++e2compr`. ++ ++ ++ Getting the block size ++ ~~~~~~~~~~~~~~~~~~~~~~ ++ ++This ioctl (FIGETBSZ) is not e2compr-specific, but is useful in ++interpreting a cluster size (which is specified as a number of blocks ++rather than bytes or kilobytes). ++ ++*arg is set to the block size (in bytes) of the file. For ext2 files, ++this is one of {1024,2048,4096}. It is the same value for all files ++on the same filesystem. ++ ++You must #include to use this ioctl (unlike the rest of ++the ioctls listed here, which require ). ++ ++ ++ 5. File format ++ ~~~~~~~~~~~~~~ ++ ++A note on byte ordering. All current versions of the kernel and ++e2compr write to disk in little-endian format, so the 16-bit number ++`0x8EC7' would be written as a 0xC7 byte followed by a 0x8E byte. ++Unless you want to know the most general rule for byte ordering, you ++can skip to the `Inode' heading. ++ ++In kernel 2.0, the ext2 fs is written to disk in the native byte ++ordering. On x86 machines, this means little endian; most other ++architectures are big-endian (so the same 16-bit number would be ++written as an 0x8E byte followed by 0xC7). ++ ++On kernel 2.1 and later, the ext2 fs (including e2compr data) is ++written in little-endian order regardless of the host architecture. ++ ++ ++ 5.1. Inode ++ ~~~~~~~~~~ ++ ++fs/inode.c controls the reading and writing of inode information ++to/from disk; consult this file (functions ext2_read_inode(), ++ext2_update_inode() and/or ext2_write_inode()) for any detail omitted ++from this section. ++ ++The physical structure of an inode is struct ext2_inode (defined in ++include/linux/ext2_fs.h). ++ ++ ++The i_flags member contains the ext2 file attributes, as well as ++cluster size and compression method. ++ ++The normal flags are stored in the low 23 bits. Only the low 12 bits ++are defined at present, including 4 flags introduced by the e2compr ++patch. See ext2_fs.h for the flag meanings (search for ++EXT2_SECRM_FL). 
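Since all of the e2compr on-disk fields are little-endian (see the byte-ordering note above), a user-space tool that inspects a compressed cluster has to do explicit conversions. The following is only a sketch: the struct mirrors the cluster head shown in section 2, the magic value is assumed to be the EXT2_COMPRESS_MAGIC_04X constant (0x9ec7) from ext2_fs_c.h, and checksum verification is only indicated in a comment.

#include <stdint.h>

/* Mirror of struct ext2_cluster_head (section 2); all fields little-endian on disk. */
struct e2c_head {
        uint16_t magic;          /* EXT2_COMPRESS_MAGIC_04X */
        uint8_t  method;         /* compression method id used for this cluster */
        uint8_t  holemap_nbytes; /* length of the holemap[] array */
        uint32_t checksum;       /* adler32 over the fields below plus the compressed data */
        uint32_t ulen;           /* uncompressed size */
        uint32_t clen;           /* compressed size, cluster head excluded */
};

static uint16_t get_le16(const uint8_t *p) { return p[0] | (p[1] << 8); }
static uint32_t get_le32(const uint8_t *p)
{
        return p[0] | (p[1] << 8) | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/* Hypothetical helper: parse the head found at the start of the first
   non-hole block of a compressed cluster. Returns 0 if it looks sane. */
static int e2c_parse_head(const uint8_t *blk, struct e2c_head *h)
{
        h->magic          = get_le16(blk + 0);
        h->method         = blk[2];
        h->holemap_nbytes = blk[3];
        h->checksum       = get_le32(blk + 4);
        h->ulen           = get_le32(blk + 8);
        h->clen           = get_le32(blk + 12);
        /* holemap[] starts at offset 16, compressed data follows it directly. */

        if (h->magic != 0x9ec7)  /* EXT2_COMPRESS_MAGIC_04X in ext2_fs_c.h */
                return -1;
        /* A real tool would also recompute the adler32 over ulen, clen,
           holemap[] and the clen bytes of compressed data and compare it
           with h->checksum; the exact seed is not restated here, so mirror
           ext2_adler32() in compress.c rather than guessing. */
        return 0;
}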
++ ++Bits 23 through 25 hold the cluster size, or more precisely the log2 of ++the number of filesystem blocks per cluster (excluding the first cluster; ++see ext2_first_cluster_nblocks in include/linux/ext2_fs_c.h). ++ ++Bits 26 through 30 store the compression method. See the definitions ++for EXT2_LZV1_METH etc. in ext2_fs_c.h for the interpretation. ++ ++Bit 31 is reserved for ext2 lib (which means that programs like e2fsck ++store things there during its operation but it isn't used by the ++kernel). ++ ++ ++ Data blocks ++ ~~~~~~~~~~~ ++ ++Uncompressed clusters are stored just as they would be without ++e2compr. So if there are no compressed clusters then the file ++is stored identically to any other file. ++ ++ ++If a cluster is compressed, then the first non-hole block starts with ++a `cluster head', as defined in struct ext2_cluster_head in ext2_fs.h. ++ ++The magic number (i.e. the value of the `magic' field) is 0x8ec7. ++`method' holds one of EXT2_LZV1_ID and the like. `reserved_0' ++contains zero. `ubitmap' describes where the uncompressed data goes. ++(Recall that when we compress a cluster, we only compress the data ++from non-hole blocks, so we need to know where the holes and non-holes ++go when we decompress the data.) A `0' bit means a hole and a `1' bit ++means a data block; bit 0 refers to the first block, b1 the second, ++and so on. ++ ++ ++The block positions within the file where the compressed data is held ++is a subset of where the uncompressed data would be held. Further, if the ++uncompressed data occupies u non-hole blocks and this compresses to c ++blocks, then the compressed data occupies the first c non-hole blocks ++of the file (and the remainder are freed). ++ ++[This paragraph is an expansion of the preceeding: if you understood ++the preceeding paragraph then skip this one.] Consider an array ++cblock[] where cblock[0] holds the block number on disk (or 0 to ++represent a hole) of the first block of a certain cluster of a file, ++cblock[1] the second, and so on. (If you are familiar with the bmap ++array or the format of first-level indirect blocks, then cblock[] is a ++section of that array.) Suppose that the cluster size of this file is ++16 blocks. Suppose too that, when uncompressed, blocks 0, 1, 5 and 6 ++of the cluster are holes but the other 12 blocks (2,3,4,7,8,...,15) ++contain data. (Thus the bitmap is 0x0000ff9c.) Now if we compress this ++cluster to just 5 blocks, then cblock[0], [1], [5] and [6] will continue ++to be holes, ,the positions of the compressed data blocks` are stored in ++cblock[2], cblock[3], [4], [7] and [8], the blocks referenced by ++cblock[9] through cblock[15] are freed, and cblock[9] through cblock[15] ++are set to zero. ++ ++ ++ 6. What's coded where ++ ~~~~~~~~~~~~~~~~~~~~~ ++ ++File names in this section are relative to linux/fs/ext2, except for ++ext2_fs.h which is in linux/include/linux. ++ ++Most of the action happens in compress.c; though note that a few ++small, commonly-used routines are written as inline functions in ++ext2_fs.h. ++ ++ext2_readpage() and ext2_mmap() are in file.c. ext2_file_write() is ++also there. ++ ++Routines to read/write the inode from/to disk are in inode.c. ++ ++super.c contains some e2compr initialisation code (such as allocating ++the e2compr work area). ++ ++All ioctl handling is in ioctl.c. 
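Putting the i_flags layout above into code, the e2compr fields can be recovered like this. This is only an illustrative sketch: the masks are written out by hand, and the patch itself caches the values in EXT2_I(inode)->i_log2_clu_nblocks, i_clu_nblocks and i_compr_method rather than re-deriving them on every use (see `Notes on a few variables' below).

#include <stdint.h>
#include <stdio.h>

/* Illustrative split of an e2compr i_flags word, per the layout above. */
static void e2c_decode_iflags(uint32_t i_flags)
{
        uint32_t attrs      = i_flags & 0x007fffff;   /* low 23 bits: chattr attribute flags */
        unsigned log2_nblks = (i_flags >> 23) & 0x7;  /* bits 23-25: log2(blocks per cluster) */
        unsigned method     = (i_flags >> 26) & 0x1f; /* bits 26-30: method id (index into
                                                         ext2_method_table[], 32 entries)     */

        printf("attribute bits     : %#x\n", attrs);
        printf("blocks per cluster : %u\n", 1u << log2_nblks);
        /* Note: cluster 0 may be shorter; see ext2_first_cluster_nblocks(). */
        printf("method id          : %u\n", method);
}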
++ ++acl.c is where we deny open() access in a couple of situations (if the ++EXT2_NOCOMPR_FL is set and another process has the file open; and we ++deny write access to a file with EXT2_ECOMPR_FL set). ++ ++ialloc.c contains code in ext2_new_inode() for newly-created files to ++inherit compression attributes from the directory in which they're ++created. ++ ++truncate.c handles truncation, i.e. zeroing any part of the cluster ++bitmap that's been truncated, and decompressing the final cluster (but ++marking dirty so that we try to recompress it on file close) if the ++new size is part-way through a compressed cluster, so that zeroing ++over the truncated data works. ++ ++linux/include/linux/ext2_fs_i.h has the definition of the ++ext2-specific parts of the in-memory inode. (The on-disk inode is ++defined in ext2_fs.h.) ++ ++linux/mm/filemap.c is also interesting, though there's no ++e2compr-specific code there. Similarly linux/include/linux/mm.h and ++linux/include/linux/fs.h. ++ ++generic_readpage() is in linux/fs/buffer.c. Also all buffer handling. ++ ++ ++The cleanup scheme ++~~~~~~~~~~~~~~~~~~ ++ ++inode->u.ext2_i.i_compr_flags has only a single bit defined: ++EXT2_CLEANUP_FL. This bit gets set to 1 to indicate that ++ext2_cleanup_compressed_inode() needs to be called. ++ ++There is a related flag stored on disk as well as in memory: ++EXT2_DIRTY_FL of i_flags. If ext2_cleanup_compressed_inode() couldn't ++finish it's job (e.g. due to I/O error) then it clears EXT2_CLEANUP_FL ++of i_compr_flags, but leaves EXT2_DIRTY_FL high. ++ ++In ext2_read_inode(), if EXT2_DIRTY_FL is high then EXT2_CLEANUP_FL is ++raised, in the hope that ,whatever was preventing ++ext2_cleanup_compressed_inode() from finishing` is now past. ++ ++Except for ext2_read_inode() as noted above, everything that raises ++EXT2_CLEANUP_FL (i.e. ext2_write_file(), ext2_ioctl() and ++ext2_truncate()) also raises EXT2_DIRTY_FL. ++ ++Nothing lowers either EXT2_CLEANUP_FL or EXT2_DIRTY_FL except ++ext2_cleanup_compressed_inode() (and one or both of new_inode and ++delete_inode routines). ++ ++ ++One feels that at least one of these cleanup flags ought to ++disappear. The main use of the persistent EXT2_DIRTY_FL is where the ++user does `chattr -c' in order to decompress the file, but there isn't ++enough space on the device to do this. We can get rid of this problem ++by having ext2_ioctl() call ext2_cleanup_compressed_inode() ++try to ++ ++ ++Notes on a few variables ++~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++Don't confuse the inode->i_dirt flag with (inode->u.ext2_i.i_flags & ++EXT2_DIRTY_FL). See section `The cleanup scheme' above for a ++description of EXT2_DIRTY_FL. ++ ++ ++inode->u.ext2_i.i_clu_nblocks, ++inode->u.ext2_i.i_log2_clu_nblocks: ++ ++i_clu_nblocks is always equal to ,1 << i_clu_nblocks` (except during a ++couple of cycles while they're being changed; I haven't consciously ++tried to avoid problems for SMP machines in this respect). ++ ++i_clu_nblocks is the number of blocks per cluster for this inode. ++ ++Old information: these variables were previously called ++`i_cluster_bits' and `i_cluster_size'. They were in an array: ++ ++inode->u.ext2_i.i_cluster_bits[2], ++inode->u.ext2_i.i_cluster_size[2]: ++ ++I believe the reason these were declared as an array was for the case ++where someone changes the cluster size of a file that was already ++compressed. (Reason for this belief: All readers of these fields use ++[0]. 
On creation (ialloc), read_inode, and `chattr +c' (where ++previously uncompressed), both [0] and [1] are updated. On change ++(IOC_SET_CLUSTERSIZE), only [0] is updated.) Since ,changing cluster ++size of an already-compressed file` isn't implemented, I've renamed ++them and made them scalars rather than arrays. ++ ++ ++inode->u.ext2_i.i_flags: When the e2compr patch is applied, this ++variable only holds the low 24 bits of the on-disk i_flags field. ++(Without the e2compr patch applied, all 32 bits are available. An ++interesting side effect of this is that user programs can access the ++compression algorithm and cluster size on kernels without e2compr ++patch by using the EXT2_IOC_GETFLAGS, EXT2_IOC_SETFLAGS ioctls.) ++ ++ ++inode->u.ext2_i.i_compr_method: Holds the compression method ++identifier. Starting from e2compr-0.4.0, this is different from an ++algorithm identifier: an example of a method is gzip9; the ++corresponding algorithm is gzip. See compress.c for where ++ext2_method_table and ext2_algorithm_table are defined. ext2_fs.h has ++some enumerations for addressing these tables (search for ++`EXT2_NONE_METH' and `EXT2_NONE_ALG'). +--- linux-3.2-rc5/fs/Kconfig 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/Kconfig 2011-12-13 14:22:47.826975380 +0100 +@@ -7,6 +7,126 @@ menu "File systems" + if BLOCK + + source "fs/ext2/Kconfig" ++ ++config EXT2_COMPRESS ++ bool "Ext2 file compression (DANGEROUS)" ++ depends on EXT2_FS && EXPERIMENTAL ++ select CRYPTO ++ select CRYPTO_ALGAPI ++ select CRYPTO_DEFLATE ++ select ZLIB_INFLATE ++ select ZLIB_DEFLATE ++ help ++ Ext2 file compression allows transparent compression of files on an ++ ext2 filesystem. Transparent compression means that files are ++ stored on the disk in a compressed format but they are automatically ++ decompressed as they are read in and compressed when written out. ++ The user is in control of how and which files are compressed, using ++ the `chattr' utility (see chattr(1)). For the sake of safety, ++ administrative data (superblock, inodes, directories, etc.) are not ++ compressed. ++ ++ Compression is very useful if you're short on disk space, and ++ provides a better option than having lots of .gz files around. ++ For more information, see . ++ ++ You _need_ to have the special e2compr version of e2fsck to be able ++ to make use of this. ++ ++ If you say Y, you will be asked which compression algorithms you wish ++ to include. Gzip is a good all-round algorithm, as its 1..9 parameter ++ allows a good range of speed/compression trade-off. Other noteworthy ++ algorithms are LZV, which caters better to the faster/less compressing ++ end of the scale, and bzip, which caters slightly better to the more ++ compressing but slower end of the scale. ++ ++ Ext2 compression is still experimental, so unless you know you need ++ it, you'd better say N. ++ ++menu "Ext2 file compression options" ++ depends on EXT2_COMPRESS ++ ++choice ++ #depends on EXT2_DEFAULT_COMPR_METHOD_GZIP ++ prompt "Gzip parameter for default compression method" ++ default EXT2_DEFAULT_COMPR_METHOD_GZIP8 ++ help ++ You have selected `gzip' as your default compression algorithm, but ++ I need to know whether to use `gzip -1', `gzip -9', or somewhere ++ in between. gzip1 is the least compressing but fastest; gzip9 is the ++ most compressing and slowest; and the numbers in between have ++ characteristics in between (though not on a linear scale). ++ If unsure, say `8'. 
++ ++config EXT2_DEFAULT_COMPR_METHOD_GZIP1 ++ bool "1" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP2 ++ bool "2" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP3 ++ bool "3" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP4 ++ bool "4" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP5 ++ bool "5" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP6 ++ bool "6" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP7 ++ bool "7" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP8 ++ bool "8" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP9 ++ bool "9" ++ ++endchoice ++ ++config GZ_HACK ++ bool "Exclude .gz files from automatic compression" ++ depends on EXT2_COMPRESS ++ default y ++ help ++ If you say Y here, then files created with names ending in `.gz' or ++ `.?gz' or `.bz2' don't inherit the `c' ("compress") attribute from ++ their parent directory. (However, you can still do `chattr +c FILE' ++ if you want to try to compress it anyway.) This means that you ++ don't waste CPU time trying to compress a file that probably can't ++ be compressed. See fs/ext2/namei.c if you want to add other rules. ++ If you have any aesthetic sensibilities then you will say N here ++ and try to implement something better. Most people will say Y here. ++ ++ ++choice ++ depends on EXT2_COMPRESS ++ prompt "Default cluster size (in blocks, usually 1KB each)" ++ default EXT2_DEFAULT_CLUSTER_BITS_5 ++ help ++ To make random access to compressed files reasonably fast the files ++ are compressed in clusters. By default, the clusters will be of the ++ size defined here but there is a modified version of the chattr ++ utility that can set the cluster size for each file independently. ++ Large clusters usually result in better compression at the cost of ++ being slower. ++ ++ Note that the answer to this question is specified in filesystem ++ blocks rather than in kilobytes, though most filesystems have 1KB ++ blocks anyway. (If you have a filesystem with large blocks then ++ you should know it, but if you want to check then "tune2fs -l ++ /dev/xxx | grep size".) The default is 32 blocks which is the ++ slowest setting but gives the best compression. ++ ++config EXT2_DEFAULT_CLUSTER_BITS_2 ++ bool "4" ++config EXT2_DEFAULT_CLUSTER_BITS_3 ++ bool "8" ++config EXT2_DEFAULT_CLUSTER_BITS_4 ++ bool "16" ++config EXT2_DEFAULT_CLUSTER_BITS_5 ++ bool "32" ++ ++endchoice ++ ++endmenu ++ ++ + source "fs/ext3/Kconfig" + source "fs/ext4/Kconfig" + +--- linux-3.2-rc5/include/linux/ext2_fs_c.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2-rc5-e2c/include/linux/ext2_fs_c.h 2011-12-13 14:22:47.830975497 +0100 +@@ -0,0 +1,498 @@ ++/* ++ * Copyright (C) 2001 Alcatel Business Systems - R&D Illkirch ++ * (transparent compression code) ++ * Pierre Peiffer (pierre.peiffer@sxb.bsf.alcatel.fr) - Denis Richard (denis.richard@sxb.bsf.alcatel.fr) ++ * Adapted from patch e2compr-0.4.39-patch-2.2.18 . ++ */ ++ ++#ifndef EXT2_FS_C_H ++#define EXT2_FS_C_H ++ ++#include ++#include ++#include ++#include "../../fs/ext2/ext2.h" ++ ++/* EXT2_COMPR_DEBUG enables: ++ * - all assertions ++ * - adler checksum checking ++ */ ++//#undef EXT2_COMPR_DEBUG ++#define EXT2_COMPR_DEBUG ++ ++#ifdef EXT2_COMPR_DEBUG ++# define assert(expr) \ ++ if(unlikely(!(expr))) { \ ++ printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \ ++#expr, __FILE__, __func__, __LINE__); \ ++ } ++#else ++# define assert(expr) do {} while (0) ++#endif ++ ++ ++/* proof get_cpu and put_cpu correctness by calling might_sleep() or mabye schedule(). ++ this will check if we are atomic */ ++#ifdef EXT2_COMPR_DEBUG ++#define CHECK_NOT_ATOMIC assert(! 
in_atomic());//might_sleep(); ++#else ++#define CHECK_NOT_ATOMIC ++#endif ++ ++ ++#undef EXT2_COMPR_REPORT ++//#define EXT2_COMPR_REPORT ++//#define EXT2_COMPR_REPORT_VERBOSE ++//#define EXT2_COMPR_REPORT_PUT ++//# define EXT2_COMPR_REPORT_FILEOPEN ++//#define EXT2_COMPR_REPORT_MUTEX ++ ++#ifdef EXT2_COMPR_REPORT ++//# define EXT2_COMPR_REPORT_PUT ++//# define EXT2_COMPR_REPORT_WA ++//# define EXT2_COMPR_REPORT_MUTEX ++//# define EXT2_COMPR_REPORT_ALLOC /* disk allocation etc. */ ++//# define EXT2_COMPR_REPORT_ALGORITHMS /* Compression algorithms */ ++//# define EXT2_COMPR_REPORT_VERBOSE /* Various things I don't think ++// useful at the moment. */ ++//#define EXT2_COMPR_REPORT_VERBOSE_INODE ++#endif ++ ++ ++#ifdef EXT2_COMPR_DEBUG ++#define E2COMPR_VERSION "ext2-compression: e2c-0.4.58-smp-debug (26 August 2011) for kernel 3.1" ++#else ++#define E2COMPR_VERSION "ext2-compression: e2c-0.4.58-smp-release (26 August 2011) for kernel 3.1" ++#endif ++ ++#define EXT2_IOC_GETCLUSTERSIZE _IOR('c', 0, long) ++#define EXT2_IOC_SETCLUSTERSIZE _IOW('c', 0, long) ++#define EXT2_IOC_GETCOMPRMETHOD _IOR('c', 1, long) ++#define EXT2_IOC_SETCOMPRMETHOD _IOW('c', 1, long) ++#define EXT2_IOC_GETFIRSTCLUSTERSIZE _IOR('c', 2, long) ++#define EXT2_IOC_RECOGNIZE_COMPRESSED _IOW('c', 2, long) ++#define EXT2_IOC_GETCLUSTERBIT _IOR('c', 3, long) ++#define EXT2_IOC_GETCOMPRRATIO _IOR('c', 4, long) ++/* Don't use _IOW('c', {5,6}, long), as these are used by old ++ e2compress binaries as SETCLUSTERBIT and CLRCLUSTERBIT ++ respectively. */ ++ ++/* EXT2_xxxx_ALG is an index into ext2_algorithm_table[] defined in ++ fs/ext2/compress.c. */ ++/* N.B. Don't change these without also changing the table in ++ compress.c. Be careful not to break binary compatibility. ++ (EXT2_NONE_ALG and EXT2_UNDEF_ALG are safe from binary ++ compatibility problems, though, so they can safely be renumbered -- ++ and indeed probably should be if you do add another algorithm.) */ ++#define EXT2_LZV1_ALG 0 ++#define EXT2_LZRW3A_ALG 1 ++#define EXT2_GZIP_ALG 2 ++#define EXT2_BZIP2_ALG 3 ++#define EXT2_LZO_ALG 4 ++#define EXT2_NONE_ALG 5 ++#define EXT2_UNDEF_ALG 6 ++#define EXT2_N_ALGORITHMS 5 /* Count of "real" algorithms. Excludes ++ `none' and `undef'. */ ++ ++/* EXT2_xxxx_METH is an index into ext2_method_table[] defined in ++ fs/ext2/compress.c. */ ++/* N.B. Don't change these without also changing the table in ++ compress.c. */ ++#define EXT2_LZV1_METH 0 ++#define EXT2_AUTO_METH 1 ++#define EXT2_DEFER_METH 2 ++#define EXT2_NEVER_METH 3 ++#define EXT2_BZIP2_METH 4 ++#define EXT2_LZRW3A_METH 8 ++#define EXT2_LZO1X_1_METH 10 ++#define EXT2_GZIP_1_METH 16 ++#define EXT2_GZIP_2_METH 17 ++#define EXT2_GZIP_3_METH 18 ++#define EXT2_GZIP_4_METH 19 ++#define EXT2_GZIP_5_METH 20 ++#define EXT2_GZIP_6_METH 21 ++#define EXT2_GZIP_7_METH 22 ++#define EXT2_GZIP_8_METH 23 ++#define EXT2_GZIP_9_METH 24 ++ ++#define EXT2_N_METHODS 32 /* Don't change this unless you know what ++ you're doing. In particular, it's tied ++ to the width of the algorithm field ++ in i_flags.*/ ++ ++/* Note: EXT2_N_ALGORITHMS can't be increased beyond 16 without ++ changing the width of the s_algorithms_used field in the in-memory ++ superblock. The on-disk s_algorithms_used field is 32 bits long. ++ (This is in a state of flux. Currently (1998-02-05) there is no ++ distinction: we always use the s_es copy. 
*/ ++ ++ ++#define EXT2_MAX_CLUSTER_BYTES (32*1024) ++#define EXT2_LOG2_MAX_CLUSTER_BYTES (5 + 10) ++ ++#define EXT2_COMPRESS_MAGIC_04X 0x9ec7 ++#define EXT2_MAX_CLUSTER_BLOCKS 32 ++#define EXT2_MAX_CLUSTER_PAGES EXT2_MAX_CLUSTER_BYTES >> PAGE_CACHE_SHIFT ++#define EXT2_ECOMPR EIO ++/* A cluster is considered compressed iff the block number for the ++ last block of that cluster is EXT2_COMPRESSED_BLKADDR. If this ++ changes then check if there's anywhere that needs a cpu_to_le32() ++ conversion. */ ++#define EXT2_COMPRESSED_BLKADDR 0xffffffff ++ ++/* I like these names better. */ ++#define EXT2_MAX_CLU_NBYTES EXT2_MAX_CLUSTER_BYTES ++#define EXT2_LOG2_MAX_CLU_NBYTES EXT2_LOG2_MAX_CLUSTER_BYTES ++#define EXT2_MAX_CLU_NBLOCKS EXT2_MAX_CLUSTER_BLOCKS ++ ++ ++#ifndef __KERNEL__ ++ ++/* Cluster head on disk, for e2compr versions before 0.4.0. I'm ++ leaving this here so tht as I may make e2compress able to read ++ old-style e2compr files. */ ++struct ext2_cluster_head_03x { ++ __u16 magic; /* == EXT2_COMPRESS_MAGIC_03X */ ++ __u16 len; /* size of uncompressed data */ ++ __u16 compr_len; /* size of compressed data */ ++ __u8 method; /* compress method */ ++ __u8 reserved_0; ++ __u32 bitmap; /* block bitmap */ ++ __u32 reserved_2; /* 0 or adler32 checksum of ++ _compressed_ data */ ++}; ++# define EXT2_COMPRESS_MAGIC_03X 0x8ec7 /* Head magic number ++ for e2compr versions ++ before 0.4.0. */ ++#endif /* !__KERNEL__ */ ++ ++ ++#ifdef __KERNEL__ ++# ifdef CONFIG_EXT2_COMPRESS ++ ++//mw ++#define CONFIG_EXT2_HAVE_GZIP ++ ++/* If defined, compress each cluster as soon as we get to the end of a ++ whole cluster, when writing. (If undefined, we wait until ++ ext2_release_file() or the like.) */ ++#define EXT2_COMPRESS_WHEN_CLU ++ ++# ifdef CONFIG_EXT2_DEFAULT_COMPR_METHOD_DEFER ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_DEFER_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_BZIP2) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_BZIP2_METH ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_LZO1X_1_ME ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_LZO) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_LZO1X_1_METH ++# ifndef CONFIG_EXT2_HAVE_LZO ++# error "Default algorithm (lzo) is not compiled in." ++# endif ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_LZV1) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_LZV1_METH ++# ifndef CONFIG_EXT2_HAVE_LZV1 ++# error "Default algorithm (lzv1) is not compiled in." ++# endif ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_LZRW3A) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_LZRW3A_METH ++# ifndef CONFIG_EXT2_HAVE_LZRW3A ++# error "Default algorithm (lzrw3a) is not compiled in." 
++# endif ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP1) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_1_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP2) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_2_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP3) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_3_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP4) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_4_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP5) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_5_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP6) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_6_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP7) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_7_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP8) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_8_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP9) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_9_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_BZIP2) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_BZIP2_METH ++# ifndef CONFIG_EXT2_HAVE_BZIP2 ++# error "Default algorithm (bzip2) is not compiled in." ++# endif ++# else ++# error "No default compression algorithm." ++# endif ++# if EXT2_DEFAULT_COMPR_METHOD >= EXT2_GZIP_1_METH && EXT2_DEFAULT_COMPR_METHOD <= EXT2_GZIP_9_METH ++# ifndef CONFIG_EXT2_HAVE_GZIP ++# error "Default algorithm (gzip) is not compiled in." ++# endif ++# endif ++ ++# if defined (CONFIG_EXT2_DEFAULT_CLUSTER_BITS_2) ++# define EXT2_DEFAULT_LOG2_CLU_NBLOCKS 2 ++# elif defined (CONFIG_EXT2_DEFAULT_CLUSTER_BITS_3) ++# define EXT2_DEFAULT_LOG2_CLU_NBLOCKS 3 ++# elif defined (CONFIG_EXT2_DEFAULT_CLUSTER_BITS_4) ++# define EXT2_DEFAULT_LOG2_CLU_NBLOCKS 4 ++# elif defined (CONFIG_EXT2_DEFAULT_CLUSTER_BITS_5) ++# define EXT2_DEFAULT_LOG2_CLU_NBLOCKS 5 ++# else ++# error "No default cluster size." 
++# endif ++ ++# define EXT2_DEFAULT_CLU_NBLOCKS (1 << EXT2_DEFAULT_LOG2_CLU_NBLOCKS) ++ ++# if (EXT2_LZV1_ALG != 0) || (EXT2_BZIP2_ALG != 3) || (EXT2_LZO_ALG != 4) || (EXT2_N_ALGORITHMS != 5) ++# error "this code needs changing; but then, you shouldn't be messing with algorithm ids anyway unless you are very careful to protect disk format compatibility" ++# endif ++# ifdef CONFIG_EXT2_HAVE_LZV1 ++# define _ext2_lzv1_builtin (1 << EXT2_LZV1_ALG) ++# else ++# define _ext2_lzv1_builtin 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_LZRW3A ++# define _ext2_lzrw3a_builtin (1 << EXT2_LZRW3A_ALG) ++# else ++# define _ext2_lzrw3a_builtin 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_GZIP ++# define _ext2_gzip_builtin (1 << EXT2_GZIP_ALG) ++# else ++# define _ext2_gzip_builtin 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_BZIP2 ++# define _ext2_bzip2_builtin (1 << EXT2_BZIP2_ALG) ++# else ++# define _ext2_bzip2_builtin 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_LZO ++# define _ext2_lzo_builtin (1 << EXT2_LZO_ALG) ++# else ++# define _ext2_lzo_builtin 0 ++# endif ++ ++# ifdef CONFIG_EXT2_HAVE_LZV1_MODULE ++# define _ext2_lzv1_module (1 << EXT2_LZV1_ALG) ++# else ++# define _ext2_lzv1_module 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_LZRW3A_MODULE ++# define _ext2_lzrw3a_module (1 << EXT2_LZRW3A_ALG) ++# else ++# define _ext2_lzrw3a_module 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_GZIP_MODULE ++# define _ext2_gzip_module (1 << EXT2_GZIP_ALG) ++# else ++# define _ext2_gzip_module 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_BZIP2_MODULE ++# define _ext2_bzip2_module (1 << EXT2_BZIP2_ALG) ++# else ++# define _ext2_bzip2_module 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_LZO_MODULE ++# define _ext2_lzo_module (1 << EXT2_LZO_ALG) ++# else ++# define _ext2_lzo_module 0 ++# endif ++ ++# define EXT2_ALGORITHMS_MODULE (_ext2_lzv1_module | _ext2_lzrw3a_module | _ext2_gzip_module | _ext2_bzip2_module | _ext2_lzo_module) ++# define EXT2_ALGORITHMS_BUILTIN (_ext2_lzv1_builtin | _ext2_lzrw3a_builtin | _ext2_gzip_builtin | _ext2_bzip2_builtin | _ext2_lzo_builtin) ++ ++# if EXT2_ALGORITHMS_MODULE & EXT2_ALGORITHMS_BUILTIN ++# error "Arithmetic error? Some algorithm appears to be both built-in and a module." ++# endif ++ ++/* EXT2_ALGORITHMS_SUPP is what we test when mounting a filesystem. ++ See fs/ext2/super.c. */ ++# define EXT2_ALGORITHMS_SUPP (EXT2_ALGORITHMS_MODULE | EXT2_ALGORITHMS_BUILTIN) ++# if EXT2_ALGORITHMS_SUPP == 0 ++# error "You must select at least one compression algorithm." ++# endif ++ ++/* Cluster head on disk. Little-endian. */ ++struct ext2_cluster_head { ++ __u16 magic; /* == EXT2_COMPRESS_MAGIC_04X. */ ++ __u8 method; /* compression method id. */ ++ __u8 holemap_nbytes; /* length of holemap[] array */ ++ __u32 checksum; /* adler32 checksum. Checksum covers all fields ++ below this one, and the compressed data. */ ++ __u32 ulen; /* size of uncompressed data */ ++ __u32 clen; /* size of compressed data (excluding cluster head) */ ++ __u8 holemap[0]; /* bitmap describing where to put holes. */ ++}; ++ ++ ++struct ext2_wa_S { ++ __u8 u[EXT2_MAX_CLUSTER_BYTES]; /* Uncompressed data. */ ++ __u8 c[EXT2_MAX_CLUSTER_BYTES]; /* Compressed data. */ ++ __u8 heap[1]; /* Heap: working space for de/compression routines. */ ++}; ++ ++# define EXT2_CLEANUP_FL 0x40 /* See Readme.e2compr */ ++# define EXT2_OSYNC_INODE 0x20 /* sync of inode running */ ++# define ROUNDUP_DIV(_n, _d) ((_n) ? 1 + (((_n) - 1) / (_d)) : 0) ++# define ROUNDUP_RSHIFT(_n, _b) ((_n) ? 
1 + (((_n) - 1) >> (_b)) : 0) ++ ++# if defined(EXT2_NDIR_BLOCKS) && (EXT2_NDIR_BLOCKS != 12) ++# error "e2compr currently assumes that EXT2_NDIR_BLOCKS is 12." ++/* If EXT2_NDIR_BLOCKS changes then change the definitions of ++ ext2_first_cluster_nblocks() and friends, and search the patch for ++ anywhere where 12 is hard-coded. (At the time of writing, it's ++ only hard-coded in ext2_first_cluster_nblocks().) What we want to ++ achieve is for clusters not to straddle address blocks. Apart from ++ performance, some code in compress.c (search for `straddle') ++ assumes this. */ ++# endif ++ ++# include ++ ++# define EXT2_ALG_INIT_COMPRESS 1 ++# define EXT2_ALG_INIT_DECOMPRESS 2 ++ ++extern int ext2_get_cluster_pages (struct inode*, u32, struct page**, struct page *, int); ++extern int ext2_get_cluster_extra_pages (struct inode*, u32, struct page**, struct page**); ++extern int ext2_kmap_cluster_pages (struct page *, struct page**, struct page**); ++extern int ext2_kunmap_cluster_pages (struct page *, struct page**, struct page**); ++extern int ext2_get_cluster_blocks (struct inode*, u32, struct buffer_head**, struct page**, struct page**, int); ++extern int ext2_decompress_cluster (struct inode*, u32); ++extern int ext2_decompress_pages(struct inode*, u32, struct page**); ++extern int ext2_compress_cluster (struct inode*, u32); ++extern int ext2_decompress_inode (struct inode*); ++extern int ext2_cleanup_compressed_inode (struct inode*); ++extern void ext2_update_comprblk (struct inode *); ++extern int ext2_get_dcount(struct inode *inode); ++ ++extern size_t ext2_decompress_blocks (struct inode*, struct buffer_head**, int, size_t, u32 cluster); ++extern int ext2_count_blocks (struct inode*); ++extern int ext2_recognize_compressed (struct inode *, unsigned cluster); ++extern unsigned long ext2_adler32 (unsigned long, unsigned char*, int); ++ ++extern size_t ext2_iLZV1 (int); ++extern size_t ext2_iLZV2 (int); ++extern size_t ext2_iNONE (int); ++extern size_t ext2_iGZIP (int); ++extern size_t ext2_iBZIP2 (int); ++extern size_t ext2_iLZO (int); ++extern size_t ext2_iLZRW3A (int); ++extern size_t ext2_iZLIB (int); ++ ++extern size_t ext2_wLZV1 (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wLZV2 (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wNONE (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wGZIP (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wBZIP2 (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wLZO (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wLZRW3A (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wZLIB (__u8*, __u8*, void*, size_t, size_t, int); ++ ++extern size_t ext2_rLZV1 (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rLZV2 (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rNONE (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rGZIP (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rBZIP2 (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rLZO (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rLZRW3A (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rZLIB (__u8*, __u8*, void*, size_t, size_t, int); ++ ++struct ext2_algorithm { ++ char *name; ++ int avail; ++ size_t (*init) (int); ++ size_t (*compress) (__u8*, __u8*, void*, size_t, size_t, int); ++ size_t (*decompress) (__u8*, __u8*, void*, size_t, size_t, int); ++}; ++ ++struct ext2_method { ++ unsigned 
alg; ++ int xarg; ++}; ++ ++ ++# define ext2_first_cluster_nblocks(_i) ((EXT2_I(_i))->i_clu_nblocks > 4 && (_i)->i_sb->s_blocksize < 4096 ? 12 : 4) ++# define ext2_block_to_cluster(_i,_b) ((_b) < ext2_first_cluster_nblocks(_i) ? 0 : (((_b) - ext2_first_cluster_nblocks(_i)) >> (EXT2_I(_i))->i_log2_clu_nblocks) + 1) ++# define ext2_offset_to_cluster(_i,_o) ext2_block_to_cluster((_i), ((_o) >> (_i)->i_sb->s_blocksize_bits)) ++# define ext2_n_clusters(_i) ((_i)->i_size ? ext2_offset_to_cluster((_i), (_i)->i_size - 1) + 1 : 0) ++# define ext2_cluster_block0(_i,_c) ((_c) ? ext2_first_cluster_nblocks(_i) + (((_c) - 1) << (EXT2_I(_i))->i_log2_clu_nblocks) : 0) ++# define ext2_cluster_nblocks(_i,_c) ((_c) ? (EXT2_I(_i))->i_clu_nblocks : ext2_first_cluster_nblocks(_i)) ++# define ext2_cluster_offset(_i,_c) ((_c) ? ext2_cluster_block0((_i), (_c)) << (_i)->i_sb->s_blocksize_bits : 0) ++ ++# define ext2_first_cluster_npages(_i) ((EXT2_I(_i))->i_clu_nblocks > 4 && (_i)->i_sb->s_blocksize < 4096 ? 12 >> (PAGE_CACHE_SHIFT - (_i)->i_sb->s_blocksize_bits) : 4 >> (PAGE_CACHE_SHIFT - (_i)->i_sb->s_blocksize_bits)) ++# define ext2_page_to_cluster(_i,_p) ((_p) < ext2_first_cluster_npages(_i) ? 0 : (((_p) - ext2_first_cluster_npages(_i)) >> (((EXT2_I(_i))->i_log2_clu_nblocks)+(_i)->i_sb->s_blocksize_bits-PAGE_CACHE_SHIFT)) + 1) ++# define ext2_cluster_page0(_i,_c) ((_c) ? ext2_cluster_block0(_i, _c) >> (PAGE_CACHE_SHIFT - (_i)->i_sb->s_blocksize_bits) : 0) ++# define ext2_cluster_npages(_i,_c) ((_c) ? (EXT2_I(_i))->i_clu_nblocks >> (PAGE_CACHE_SHIFT - (_i)->i_sb->s_blocksize_bits) : ext2_first_cluster_npages(_i)) ++ ++static inline int ++ext2_offset_is_clu_boundary(struct inode *inode, u32 off) ++{ ++ if (off & (inode->i_sb->s_blocksize - 1)) ++ return 0; ++ if (off == 0) ++ return 1; ++ off >>= inode->i_sb->s_blocksize_bits; ++ if (off < ext2_first_cluster_nblocks(inode)) ++ return 0; ++ off -= ext2_first_cluster_nblocks(inode); ++ return !(off & (EXT2_I(inode)->i_clu_nblocks - 1)); ++} ++ ++struct ext2_wa_contents_S { ++ ino_t ino; ++ dev_t dev; ++ unsigned cluster; ++}; ++ ++DECLARE_PER_CPU(struct ext2_wa_S *, ext2_rd_wa); ++DECLARE_PER_CPU(struct ext2_wa_S *, ext2_wr_wa); ++ ++extern void ext2_alloc_rd_wa(void); ++extern void ext2_alloc_wr_wa(void); ++ ++extern struct ext2_algorithm ext2_algorithm_table[]; ++extern struct ext2_method ext2_method_table[]; /*mw: is static so far, no writes*/ ++ ++/* Both of these return -errno if error, 0 if not compressed, positive ++ if compressed. (You should use the macro unless you've already ++ tested COMPRBLK.) */ ++extern int ext2_cluster_is_compressed_fn (struct inode *inode, __u32 cluster); ++static inline int ext2_cluster_is_compressed (struct inode *inode, __u32 cluster) ++{ ++ if ((EXT2_I(inode)->i_flags & EXT2_COMPRBLK_FL) == 0) ++ return 0; ++ return ext2_cluster_is_compressed_fn (inode, cluster); ++} ++extern unsigned ext2_calc_free_ix (unsigned , u8 const *, unsigned ); ++extern int ext2_unpack_blkaddrs(struct inode *, struct buffer_head **, int, unsigned , u8 const *, unsigned , unsigned , unsigned , unsigned ); ++ ++# define HOLE_BLKADDR(_b) \ ++ (((_b) == 0) \ ++ || ((_b) == EXT2_COMPRESSED_BLKADDR)) ++# else /* !CONFIG_EXT2_COMPRESS */ ++# define HOLE_BLKADDR(_b) ((_b) == 0) ++# endif ++ ++/* For some reason or other, I see code like `if (le32_to_cpu(tmp) != ++ 0)' around in the kernel. So far I haven't checked whether or not ++ the compiler knows that the swab can be dropped. 
*/ ++# if defined(EXT2_COMPRESSED_BLKADDR) && EXT2_COMPRESSED_BLKADDR != 0xffffffff ++/* This may be a false positive; the "correct" test would be `if ++ defined(CONFIG_EXT2_COMPRESS)', but if this test does succeed, then ++ there is at least cause to have a look around. */ ++# error "Next bit of code is wrong." ++# endif ++ ++# define HOLE_BLKADDR_SWAB32(_b) HOLE_BLKADDR(_b) ++ ++#ifdef EXT2_COMPR_REPORT ++#define trace_e2c(format, args...) printk(KERN_DEBUG format, ## args) ++#else ++#define trace_e2c(format, args...) do {} while(0) ++#endif ++ ++#endif /* __KERNEL__ */ ++ ++ ++#endif /* EXT2_FS_C_H */ +--- linux-3.2-rc5/fs/ext2/Makefile 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/ext2/Makefile 2011-12-13 14:22:47.830975498 +0100 +@@ -2,10 +2,17 @@ + # Makefile for the linux ext2-filesystem routines. + # + ++ifeq ($(CONFIG_EXT2_COMPRESS),y) ++ ++COMPRESS_STUFF := adler32.o compress.o e2zlib.o\ ++ $($(obj-y):%/=%/ext2-compr-%.o) ++endif ++ + obj-$(CONFIG_EXT2_FS) += ext2.o + + ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o ++ ioctl.o namei.o super.o symlink.o $(COMPRESS_STUFF) ++ + + ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o +--- linux-3.2-rc5/fs/ext2/compress.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/ext2/compress.c 2011-12-13 14:22:47.839975781 +0100 +@@ -0,0 +1,3420 @@ ++/* ++ * linux/fs/ext2/compress.c ++ * ++ * Copyright (C) 1995 Antoine Dumesnil de Maricourt (dumesnil@etca.fr) ++ * (transparent compression code) ++ */ ++ ++/* ++ * Copyright (C) 2001 Alcatel Business Systems - R&D Illkirch FRANCE ++ * ++ * Transparent compression code for 2.4 kernel. ++ * ++ * Denis Richard (denis.richard@sxb.bsf.alcatel.fr) ++ * Pierre Peiffer (pierre.peiffer@sxb.bsf.alcatel.fr) ++ * ++ * Adapted from patch e2compr-0.4.39-patch-2.2.18 . ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define MIN(a,b) ((a) < (b) ? 
(a) : (b)) ++ ++#ifdef CONFIG_HIGHMEM ++#define restore_b_data_himem(bh) assert(page_address(bh->b_page)); bh->b_data = page_address(bh->b_page) + bh_offset(bh) ++ ++ ++ ++int ext2_kmap_cluster_pages(struct page *page, struct page *pg[], ++ struct page *epg[]) ++{ ++ int i = 0; ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (!pg[i]) ++ break; ++ if (epg && epg[i]) ++ kmap(epg[i]); ++ else ++ kmap(pg[i]); ++ } ++ ++ if (page) ++ kmap(page); ++ return 0; ++} ++ ++ ++int ext2_kunmap_cluster_pages(struct page *page, struct page *pg[], ++ struct page *epg[]) ++{ ++ int i = 0; ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (!pg[i]) ++ break; ++ if (epg && epg[i]) ++ kunmap(epg[i]); ++ else ++ kunmap(pg[i]); ++ } ++ ++ if (page) ++ kunmap(page); ++ return 0; ++} ++#else //no high-mem: ++#define restore_b_data_himem(bh) ; ++#endif ++ ++ ++/*none compression dummy functions*/ ++size_t ext2_iNONE (int action) { return 0; } ++size_t ext2_wNONE (__u8 *ibuf, __u8 *obuf, void *wa, size_t ilen, size_t olen, int xarg) { return 0; } ++size_t ext2_rNONE (__u8 *ibuf, __u8 *obuf, void *wa, size_t ilen, size_t olen, int xarg) { return 0; } ++ ++/* ++ * Algorithm and method tables ++ */ ++struct ext2_algorithm ext2_algorithm_table[] = { ++ /* Note: all algorithms must have the `name' field filled in. ++ This is used to autoload algorithm modules (ext2-compr-%s), and ++ in kernel printk. */ ++ /* N.B. Do not renumber these algorithms! (To do so is to change ++ the binary format.) It's OK for `none' and `undef' to be ++ renumbered, though. */ ++ ++ /* Fields: ++ name; available; routines for: ++ init, compress, decompress. */ ++ {"lzv1", 0, ext2_iNONE, ext2_wNONE, ext2_rNONE}, ++ {"lzrw3a", 0, ext2_iNONE, ext2_wNONE, ext2_rNONE}, ++ {"gzip", 1, ext2_iZLIB, ext2_wZLIB, ext2_rZLIB}, //Andreas: workaround ++ {"bzip2", 0, ext2_iNONE, ext2_wNONE, ext2_rNONE}, ++ {"lzo", 0, ext2_iNONE, ext2_wNONE, ext2_rNONE}, ++ {"none", 1, ext2_iNONE, ext2_wNONE, ext2_rNONE}, ++ ++ /* This "algorithm" is for unused entries in the method table. ++ It differs from EXT2_NONE_ALG in that it is considered ++ unavailable, whereas `none' is always available. */ ++ {"undef", 0, ext2_iNONE, ext2_wNONE, ext2_rNONE}, ++ ++}; ++ ++/* Note: EXT2_N_ALGORITHMS can't be increased beyond 16 without ++ changing the width of the s_algorithms_used field in the in-memory ++ superblock. The on-disk s_algorithms_used field is 32 bits long. ++ (This is in a state of flux. Currently (1998-02-05) there is no ++ distinction: we always use the s_es copy. */ ++ ++/* The size of this table must be 32 to prevent Oopsen from ++ invalid data. We index this from 5 bits of i_flags, so ++ the size is (1 << 5) == 32. */ ++struct ext2_method ext2_method_table[32] = { ++ /* Fields: algorithm id, algorithm argument. 
*/ ++ {EXT2_LZV1_ALG, 0}, ++ {EXT2_NONE_ALG, 0}, /* 1: auto */ ++ {EXT2_NONE_ALG, 0}, /* 2: defer */ ++ {EXT2_NONE_ALG, 0}, /* 3: never */ ++ {EXT2_BZIP2_ALG, 0}, /* 4: bzip2 */ ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_LZRW3A_ALG, 0}, /* 8: lzrw3a */ ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_LZO_ALG, 0}, /* 10: lzo1x_1 */ ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_GZIP_ALG, 1}, /* 16 */ ++ {EXT2_GZIP_ALG, 2}, ++ {EXT2_GZIP_ALG, 3}, ++ {EXT2_GZIP_ALG, 4}, ++ {EXT2_GZIP_ALG, 5}, ++ {EXT2_GZIP_ALG, 6}, ++ {EXT2_GZIP_ALG, 7}, ++ {EXT2_GZIP_ALG, 8}, ++ {EXT2_GZIP_ALG, 9}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0} ++}; ++ ++ ++static void ext2_mark_algorithm_use(struct inode *inode, unsigned alg) ++{ ++ struct ext2_sb_info *sbi = EXT2_SB(inode->i_sb); ++ ++ /* Hopefully, lock_super() isn't needed here, as we don't ++ block in the critical region. True? */ ++ assert(alg < EXT2_N_ALGORITHMS); ++ if (sbi->s_es->s_feature_incompat ++ & cpu_to_le32(EXT2_FEATURE_INCOMPAT_COMPRESSION)) { ++ sbi->s_es->s_algorithm_usage_bitmap |= cpu_to_le32(1 << alg); ++ } else { ++ struct ext2_super_block *es = sbi->s_es; ++ ++ es->s_algorithm_usage_bitmap = cpu_to_le32(1 << alg); ++ es->s_feature_incompat ++ |= cpu_to_le32(EXT2_FEATURE_INCOMPAT_COMPRESSION); ++ if (es->s_rev_level < EXT2_DYNAMIC_REV) { ++ /* Raise the filesystem revision level to ++ EXT2_DYNAMIC_REV so that s_feature_incompat ++ is honoured (except in ancient kernels / ++ e2fsprogs). We must also initialize two ++ other dynamic-rev fields. The remaining ++ fields are assumed to be already correct ++ (e.g. still zeroed). */ ++ es->s_rev_level = cpu_to_le32(EXT2_DYNAMIC_REV); ++ es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO); ++ es->s_inode_size = cpu_to_le16(EXT2_GOOD_OLD_INODE_SIZE); ++ } ++ } ++ mark_buffer_dirty(sbi->s_sbh); ++} ++ ++ ++/* Displays an error message if algorithm ,alg` is not marked in use, ++ and then marks it in use. 
*/ ++static void ext2_ensure_algorithm_use(struct inode *inode, unsigned alg) ++{ ++ assert(alg < EXT2_N_ALGORITHMS); ++ ++ if (!(EXT2_SB(inode->i_sb)->s_es->s_algorithm_usage_bitmap ++ & cpu_to_le32(1 << alg))) { ++ ext2_msg(inode->i_sb, "algorithm usage bitmap algorithm %s not marked used in inode %lu", ++ ext2_algorithm_table[alg].name, inode->i_ino); ++ ext2_mark_algorithm_use(inode, alg); ++ } ++} ++ ++ ++/*mw: out of cache bug fix 5-16-07 */ ++static void create_empty_buffers_e2c(struct page *page, ++ unsigned long blocksize, ++ unsigned long b_state, ++ struct inode *inode) ++{ ++ struct buffer_head *bh, *head, *tail; ++ ++ head = alloc_page_buffers(page, blocksize, 1); ++ bh = head; ++ do { ++ bh->b_state |= b_state; ++ tail = bh; ++ bh->b_bdev = NULL; //mw: make it like 2.4 ++ bh->b_blocknr = 0; //mw: make it like 2.4 ++ bh->b_end_io = NULL; //mw: make it like 2.4 ++ bh = bh->b_this_page; ++ } while (bh); ++ tail->b_this_page = head; ++ spin_lock(&inode->i_mapping->private_lock); ++ if (PageUptodate(page) || PageDirty(page)) { ++ bh = head; ++ do { ++ if (PageDirty(page)) ++ set_buffer_dirty(bh); ++ if (PageUptodate(page)) ++ set_buffer_uptodate(bh); ++ bh = bh->b_this_page; ++ } while (bh != head); ++ } ++ attach_page_buffers(page, head); ++ spin_unlock(&inode->i_mapping->private_lock); ++} ++ ++int ext2_get_cluster_pages(struct inode *inode, u32 cluster, ++ struct page *pg[], struct page *page, int compr) ++{ ++ int nbpg, npg, i; ++ u32 page0; /* = position within file (not position within fs). */ ++ u32 idx = 0; ++ struct page *cached_page; ++ struct pagevec lru_pvec; ++ ++ /*mw */ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) ++ pg[i] = NULL; ++ ++ cached_page = NULL; ++ pagevec_init(&lru_pvec, 0); ++ ++ page0 = ext2_cluster_page0(inode, cluster); ++ nbpg = ext2_cluster_npages(inode, cluster); ++ ++ if (compr && (((page0 + nbpg) << PAGE_CACHE_SHIFT) > inode->i_size)) ++ nbpg = ((inode->i_size - 1) >> PAGE_CACHE_SHIFT) - page0 + 1; ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c("ext2_get_cluster_pages: page0=%d, nbpg=%d page=%ld\n", ++ page0, nbpg, ((page != NULL) ? 
page->index : 0)); ++#endif ++ for (npg = 0; npg < nbpg; npg++) { ++ if ((page == NULL) || ((page0 + npg) != page->index)) { ++ //pg[npg] = __grab_cache_page(inode->i_mapping, page0+npg); /* &cached_page, &lru_pvec);*/ ++ pg[npg] = grab_cache_page_write_begin(inode->i_mapping, page0+npg, 0); ++ if (!pg[npg]) ++ goto error; ++ } else { ++ pg[npg] = page; ++ } ++ if (!page_has_buffers(pg[npg])) { ++ ClearPageUptodate(pg[npg]); ++ ClearPageDirty(pg[npg]); ++ create_empty_buffers_e2c(pg[npg], inode->i_sb->s_blocksize, 0, inode); ++ if (unlikely(!page_has_buffers(pg[npg]))) ++ trace_e2c("ext2_get_cluster_pages: NOMEM!\n"); ++ assert(!PageUptodate(pg[npg])); ++ assert(!PageDirty(pg[npg])); ++ } ++ } ++ //set remaining pages to NULL ++ for (idx = npg; idx < EXT2_MAX_CLUSTER_PAGES; idx++) ++ pg[idx] = NULL; ++ ++ if (cached_page) ++ page_cache_release(cached_page); ++ pagevec_lru_add_file(&lru_pvec); ++ pagevec_free(&lru_pvec); ++ return (npg); ++ error: ++ if (cached_page) ++ page_cache_release(cached_page); ++ pagevec_lru_add_file(&lru_pvec); ++ pagevec_free(&lru_pvec); ++ while (--npg >= 0) { ++ if ((page == NULL) || ((page0 + npg) != page->index)) { ++ unlock_page(pg[npg]); ++ page_cache_release(pg[npg]); ++ } ++ pg[npg] = NULL; ++ } ++ trace_e2c("ext2_get_cluster_pages: error no page\n"); ++ return (-ENOMEM); ++} ++ ++ ++int ext2_get_cluster_extra_pages(struct inode *inode, u32 cluster, ++ struct page *pg[], struct page *epg[]) ++{ ++ struct page *page; ++ int nbpg, npg, i; ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) ++ epg[i] = NULL; ++ ++ nbpg = ext2_cluster_npages(inode, cluster); ++ for (npg = 0; npg < nbpg; npg++) { ++ if (pg[npg] == NULL) ++ break; ++ if (PageUptodate(pg[npg])) { ++ //page = page_cache_alloc(inode->i_mapping); ++ //mw: has gfp-mask of adress-space: gfp_t mapping_gfp_mask(struct address_space * mapping) ++ // don't trigger. shrink_dcache_memory which might call ext2_cleanup_compressed_inode with the SAME mutex. ++ page = __page_cache_alloc(GFP_NOFS); ++ ++ if (!page) { ++ goto error; ++ } ++ ClearPageError(page); ++ ClearPageReferenced(page); ++ ClearPageUptodate(page); ++ ClearPageDirty(page); ++ lock_page(page); ++ page->index = pg[npg]->index; ++ ++ if (!page_has_buffers(page)) { ++ create_empty_buffers_e2c(page, inode->i_sb->s_blocksize, 0, ++ inode); ++ /*mw : only the "extra_pages" for decompression need create_empty_buffers_unlocked, because ++ * they have no mapping-context and they must not have one. Otherwise they get need a page->index ++ * which belongs always to an address_space object (e.g.: inode). But I think this is not intented here. ++ * we just need thei buffers for a short time of decompression */ ++ if (unlikely(!page_has_buffers(page))) ++ return printk("Error: NOMEM!\n"); ++ } ++ ++ epg[npg] = page; ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c ++ ("ext2_get_cluster_extra_pages: allocated page idx=%ld\n", ++ pg[npg]->index); ++#endif ++ } else { ++ epg[npg] = NULL; ++ } ++ } ++ return (npg); ++ error: ++ while (--npg >= 0) ++ if (epg[npg]) { ++ ClearPageDirty(epg[npg]); ++ ClearPageUptodate(epg[npg]); ++ try_to_free_buffers(epg[npg]); ++ unlock_page(epg[npg]); ++ assert(page_count(epg[npg]) == 1); ++ page_cache_release(epg[npg]); ++ } ++ trace_e2c("ext2_get_cluster_extra_pages: error no page\n"); ++ return (-ENOMEM); ++ ++} ++ ++/* Read every block in the cluster. The blocks are stored in the bh ++ array, which must be big enough. ++ ++ Return the number of block contained in the cluster, or -errno if an ++ error occured. 
The buffers should be released by the caller ++ (unless an error occurred). ++ ++ The inode must be locked, otherwise it is possible that we return ++ some out of date blocks. ++ ++ Called by : ++ ++ ext2_decompress_cluster() [i_sem] ++ ext2_compress_cluster() [i_sem] ++ ext2_readpage() [i_sem] */ ++ ++ ++int ext2_get_cluster_blocks(struct inode *inode, u32 cluster, ++ struct buffer_head *bh[], struct page *pg[], ++ struct page *epg[], int compr) ++{ ++ struct buffer_head *br[EXT2_MAX_CLUSTER_BLOCKS]; ++ int nreq, nbh = 0, npg, i; ++ u32 clu_nblocks; ++ int err; ++ const int blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits; ++ ++ /*mw */ ++ for (i = 0; i < EXT2_MAX_CLUSTER_BLOCKS; i++) ++ bh[i] = NULL; ++ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */ ++ ++ /* ++ * Request full cluster. ++ */ ++ { ++ u32 endblk; ++ u32 block; /* = position within file (not position within fs). */ ++ u32 nbpg; ++ u32 page0; /* = position within file (not position within fs). */ ++ u32 idx; ++ ++ block = ext2_cluster_block0(inode, cluster); ++ clu_nblocks = ext2_cluster_nblocks(inode, cluster); ++ /* impl: Don't shorten endblk for i_size. The ++ remaining blocks should be NULL anyway, except in ++ the case when called from ext2_decompress_cluster ++ from ext2_truncate, in which case i_size is short ++ and we _want_ to get all of the blocks. */ ++ endblk = block + clu_nblocks; ++ ++ page0 = ext2_cluster_page0(inode, cluster); ++ nbpg = ext2_cluster_npages(inode, cluster); ++ ++ if (compr ++ && (((page0 + nbpg) << PAGE_CACHE_SHIFT) > inode->i_size)) { ++ nbpg = ((inode->i_size - 1) >> PAGE_CACHE_SHIFT) - page0 + 1; ++ endblk = ++ block + ++ (nbpg << ++ (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)); ++ } ++ ++ idx = page0 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c("ext2_get_cluster_blocks: page0=%d, nbpg=%d\n", page0, ++ nbpg); ++#endif ++ for (npg = 0; npg < nbpg; npg++) { ++ struct buffer_head *buffer; ++ ++ if ((epg != NULL) && (epg[npg] != NULL)) ++ buffer = page_buffers(epg[npg]); ++ else ++ buffer = page_buffers(pg[npg]); ++ for (i = 0; i < blocks && (block + nbh) < endblk; ++ buffer = buffer->b_this_page, i++) { ++ if (idx == (block + nbh)) { ++ bh[nbh] = buffer; ++ nbh++; ++ } ++ idx++; ++ } ++ } ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c ++ ("ext2_get_cluster_blocks: get every pages and %d buffers\n", ++ nbh); ++#endif ++ ++ for (nbh = 0, nreq = 0; block < endblk; nbh++) { ++ assert(bh[nbh] != NULL); ++ bh[nbh]->b_blocknr = 0; ++ clear_bit(BH_Mapped, &bh[nbh]->b_state); ++ ++ //mw: does not work with 2.6 and holes!!! ++ //err=ext2_get_block(inode, block++, bh[nbh], (PageDirty(bh[nbh]->b_page) ? 1 : 0)); ++ err = ext2_get_block(inode, block++, bh[nbh], 0); ++ /* mw: 0: we dont' create non existing blocks here ++ * let's do it just before the writeback, when we know, which blocks we really need...*/ ++ //err=ext2_get_block(inode, block++, bh[nbh], (buffer_dirty(bh[nbh]) ? 1 : 0)); ++ ++ /* mw: bdev-bug-fix: for files which got compressed and now consume less buffers ++ * ext2_get_block returns 0, for a empty-block. As these buffer were used before ++ * the bh[nbh]->b_bdev might be != NULL or just invalid. So we set them explicitly ++ * to NULL. 
*/ ++ //printk("Get Block cluster %i: (%#x):%i Blk-NR:%lu(%lu)[%lu-%lu] Bdev:%#x(%#x), PGDirty:%i, mapped:%i, PID: %lu\n", cluster, bh[nbh], nbh, block, ++ ++ //if we are not mapped, then the blocknr will be wrong ++ //we set a bdev here the we will write to some "random" block ++ if (!buffer_mapped(bh[nbh])) { ++ bh[nbh]->b_bdev = NULL; /* don't write wrongly mapped blocks !!! */ ++ /* mw: you encounter null pointer oops you MUST ++ * map your buffer using ext2_get_block()*/ ++ } ++ ++ if (bh[nbh]->b_blocknr != 0) { ++ if (!buffer_uptodate(bh[nbh]) ++ /* TODO: Do we need this ++ `!buffer_locked' test? */ ++ && !buffer_locked(bh[nbh]) ++ && !PageDirty(bh[nbh]->b_page)) ++ br[nreq++] = bh[nbh]; ++ } else if ((err != 0) ++ && (err != -EFBIG)) ++ /* impl: for some unknown reason, ++ ext2_getblk() returns -EFBIG if ++ !create and there's a hole. ==> not right any more in 2.4 */ ++ goto error; ++ } ++ for (i = nbh; i < EXT2_MAX_CLUSTER_BLOCKS; i++) { ++ bh[i] = NULL; ++ } ++ } ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("ext2_get_cluster_blocks: nreq=%d for cluster=%d\n", nreq, ++ cluster); ++#endif ++ ++ //read all blocks, which are not null-blocks ++ if (nreq > 0) ++ ll_rw_block(READ, nreq, br); ++ ++ /* ++ * Adjust nbh if we have some null blocks at end of cluster. ++ */ ++ while ((nbh != 0) && (bh[nbh - 1]->b_blocknr == 0)) ++ nbh--; ++ ++ /* ++ * Wait for blocks. ++ */ ++ err = -EIO; ++ CHECK_NOT_ATOMIC ++ for (i = 0; i < nbh; i++) ++ if ((!PageDirty(bh[i]->b_page)) && (bh[i]->b_blocknr != 0)) { ++ wait_on_buffer(bh[i]); ++ if (!buffer_uptodate(bh[i])) { /* Read error ??? */ ++ trace_e2c ++ ("ext2_get_cluster_blocks: wait_on_buffer error (blocknr=%ld)\n", ++ bh[i]->b_blocknr); ++ goto error; ++ } ++ } ++ assert(nbh <= EXT2_MAX_CLU_NBLOCKS); ++ ++ return nbh; ++ ++ error: ++ printk("ERROR: ext2_get_cluster_blocks()\n"); ++ return err; ++} ++ ++ ++/* Iterations over block in the inode are done with a generic ++ iteration key mechanism. We need one method to convert a block ++ number into a new key, one method to iterate (i.e., increment the ++ key) and one method to free the key. The code could be shared with ++ truncate.c, as this mechanism is very general. ++ ++ This code assumes tht nobody else can read or write the file ++ between ext2_get_key() and ext2_free_key(), so callers need to have ++ i_sem (which they all do anyway). */ ++ ++/* TODO: Get all of the bkey routines to return -errno instead of ++ true/false. */ ++/* TODO: The bkey routines currently assume tht address blocks are ++ allocated even if all contained addresses are NULL, but this is not ++ true. Make sure tht we differentiate between NULL block and error, ++ and then fix up ext2_set_key_blkaddr() and anything else (including ++ the pack/unpack routines). */ ++struct ext2_bkey { ++ int level; ++ u32 block; ++ struct inode *inode; ++ int off[4]; ++ u32 *ptr[4]; ++ struct buffer_head *ibh[4]; ++}; ++ ++ ++/* ++ * Method to convert a block number into a key. ++ * ++ * Returns 1 on success, 0 on failure. You may safely, but need ++ * not, free the key even if ext2_get_key() fails. ++ */ ++static int ext2_get_key(struct ext2_bkey *key, struct inode *inode, ++ u32 block) ++{ ++ int x, level; ++ int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); ++ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); ++ ++ /* ++ * The first step can be viewed as translating the ++ * original block number in a special base (powers ++ * of addr_per_block). 
++ */ ++ ++ key->block = block; ++ ++ key->off[0] = key->off[1] = key->off[2] = key->off[3] = 0; ++ key->ibh[0] = key->ibh[1] = key->ibh[2] = key->ibh[3] = NULL; ++ key->ptr[0] = key->ptr[1] = key->ptr[2] = key->ptr[3] = NULL; ++ ++ if (block >= EXT2_NDIR_BLOCKS) { ++ block -= EXT2_NDIR_BLOCKS; ++ ++ if (block >= addr_per_block) { ++ block -= addr_per_block; ++ ++ if (block >= addr_per_block * addr_per_block) { ++ block -= addr_per_block * addr_per_block; ++ ++ key->off[0] = EXT2_TIND_BLOCK; ++ key->off[1] = (block / (addr_per_block * addr_per_block)); ++ key->off[2] = ++ (block % (addr_per_block * addr_per_block)) / ++ addr_per_block; ++ key->off[3] = (block % addr_per_block); ++ level = 3; ++ } else { ++ key->off[0] = EXT2_DIND_BLOCK; ++ key->off[1] = block / addr_per_block; ++ key->off[2] = block % addr_per_block; ++ level = 2; ++ } ++ } else { ++ key->off[0] = EXT2_IND_BLOCK; ++ key->off[1] = block; ++ level = 1; ++ } ++ } else { ++ key->off[0] = block; ++ level = 0; ++ } ++ ++ /* ++ * In the second step, we load the needed buffers. ++ */ ++ ++ key->level = level; ++ key->inode = inode; ++ ++ key->ptr[0] = (u32 *) (&(EXT2_I(inode)->i_data)); ++ ++ for (x = 1; x <= level; x++) { ++ u32 *ptr; ++ ++ ptr = key->ptr[x - 1]; ++ if (ptr == NULL) ++ break; ++/* Paul Whittaker tweak 19 Feb 2005 */ ++ block = le32_to_cpu(ptr[key->off[x - 1]]); ++ if (block == 0) ++ continue; // TLL 05/01/07 ++ if (x - 1 != 0) ++ block = le32_to_cpu(block); ++ if ((key->ibh[x] = __bread(inode->i_sb->s_bdev, ++ block, inode->i_sb->s_blocksize)) ++ == NULL) ++ goto error; ++ key->ptr[x] = (u32 *) (key->ibh[x]->b_data); ++ } ++ ++ return 1; ++ error: ++ for (; x != 0; x--) ++ if (key->ibh[x] != NULL) ++ brelse(key->ibh[x]); ++ return 0; ++} ++ ++ ++/* ++ * Find the block for a given key. Return 0 if there ++ * is no block for this key. ++ */ ++static inline u32 ext2_get_key_blkaddr(struct ext2_bkey *key) ++{ ++ assert(key->inode); ++ assert(atomic_read(&(key->inode)->i_mutex.count) <= 0); ++ ++/* Paul Whittaker tweak 19 Feb 2005 */ ++ if (key->ptr[key->level] == NULL) ++ return 0; ++ return le32_to_cpu(key->ptr[key->level][key->off[key->level]]); ++} ++ ++ ++/* ++ * Change the block for a given key. Return 0 on success, ++ * -errno on failure. ++ */ ++static inline int ext2_set_key_blkaddr(struct ext2_bkey *key, u32 blkaddr) ++{ ++ char bdn[BDEVNAME_SIZE]; ++ assert(key->inode); ++ assert(atomic_read(&(key->inode)->i_mutex.count) <= 0); ++ ++ if (key->ptr[key->level] == NULL) { ++ /* The reason that this "can't happen" is that this ++ routine is only used to shuffle block numbers or by ++ free_cluster_blocks. Cluster sizes are such that ++ clusters can't straddle address blocks. So the ++ indirect block address can't be zero. AFAIK, ptr ++ can only be NULL on error or on null indirect block ++ address. Hmm, come to think of it, I think there ++ are still some callers that don't check for errors ++ from ext2_get_key(), so this still can happen until ++ those are fixed up. */ ++ printk(KERN_ERR ++ "ext2_set_key_blkaddr: can't happen: NULL parent. " ++ "dev=%s, ino=%lu, level=%u.\n", ++ bdevname(key->inode->i_sb->s_bdev, bdn), ++ key->inode->i_ino, key->level); ++ return -ENOSYS; ++ } ++ /* Paul Whittaker tweak 19 Feb 2005 */ ++ key->ptr[key->level][key->off[key->level]] = le32_to_cpu(blkaddr); ++ if (key->level > 0) ++ mark_buffer_dirty(key->ibh[key->level]); ++ return 0; ++} ++ ++ ++/* ++ * Increment the key. Returns 0 if we go beyond the limits, ++ * 1 otherwise. 
++ * ++ * Precondition: -key->off[level] <= incr < addr_per_block. ++ */ ++static int ext2_next_key(struct ext2_bkey *key, int incr) ++{ ++ int addr_per_block = EXT2_ADDR_PER_BLOCK(key->inode->i_sb); ++ int x, level = key->level; ++ u32 tmp; ++ ++ assert(key->inode); ++ assert(atomic_read(&(key->inode)->i_mutex.count) <= 0); ++ ++ ++ /* ++ * Increment the key. This is done in two step: first ++ * adjust the off array, then reload buffers that should ++ * be reloaded (we assume level > 0). ++ */ ++ ++ assert(key->off[level] >= -incr); ++ assert(incr < addr_per_block); ++ key->block += incr; ++ key->off[level] += incr; ++ ++ /* ++ * First step: should be thought as the propagation ++ * of a carry. ++ */ ++ ++ if (level == 0) { ++ if (key->off[0] >= EXT2_NDIR_BLOCKS) { ++ key->off[1] = key->off[0] - EXT2_NDIR_BLOCKS; ++ key->off[0] = EXT2_IND_BLOCK; ++ level = 1; ++ } ++ x = 0; ++ } else { ++ for (x = level; x > 0; x--) { ++ if (key->off[x] >= addr_per_block) { ++ key->off[x] -= addr_per_block; ++ key->off[x - 1]++; ++ ++ if (x == 1) { ++ if (++level < 4) { ++ key->off[level] = key->off[level - 1]; ++ key->off[level - 1] = 0; ++ } else ++ return 0; ++ } ++ } else ++ break; ++ } ++ } ++ ++ /* ++ * Second step: reload the buffers that have changed. ++ */ ++ ++ key->level = level; ++ ++ CHECK_NOT_ATOMIC ++ while (x++ < level) { ++ if (key->ibh[x] != NULL) { ++ if (IS_SYNC(key->inode) && buffer_dirty(key->ibh[x])) { ++ //mw: ++ assert(buffer_mapped(key->ibh[x]) ++ && (key->ibh[x]->b_bdev != NULL)); ++ ll_rw_block(WRITE, 1, &(key->ibh[x])); ++ wait_on_buffer(key->ibh[x]); ++ } ++ brelse(key->ibh[x]); ++ } ++/* Paul Whittaker tweak 19 Feb 2005 */ ++ if ((key->ptr[x - 1] != NULL) ++ && ((tmp = le32_to_cpu(key->ptr[x - 1][key->off[x - 1]])) != ++ 0)) { ++ if ((key->ibh[x] = ++ __bread(key->inode->i_sb->s_bdev, tmp, ++ key->inode->i_sb->s_blocksize)) ++ != NULL) ++ key->ptr[x] = (u32 *) (key->ibh[x]->b_data); ++ else ++ key->ptr[x] = NULL; ++ } else { ++ key->ibh[x] = NULL; ++ key->ptr[x] = NULL; ++ } ++ } ++ ++ return 1; ++} ++ ++ ++/* Method to free the key: just release buffers. ++ ++ Returns 0 on success, -errno on error. ++*/ ++ ++static int ext2_free_key(struct ext2_bkey *key) ++{ ++ int x, n; ++ struct buffer_head *bh[4]; ++ ++ assert(key->inode); ++ assert(atomic_read(&(key->inode)->i_mutex.count) <= 0); ++ ++ ++ for (x = 0, n = 0; x <= key->level; x++) { ++ if (key->ibh[x] != NULL) { ++ if (IS_SYNC(key->inode) && buffer_dirty(key->ibh[x])) ++ bh[n++] = key->ibh[x]; ++ else ++ brelse(key->ibh[x]); ++ } ++ } ++ ++ if (n > 0) { ++ int ncopy = n; ++ while (ncopy-- > 0) { ++ assert(buffer_mapped(bh[ncopy]) ++ && (bh[ncopy]->b_bdev != NULL)); ++ } ++ ++ ll_rw_block(WRITE, n, bh); ++ ++ CHECK_NOT_ATOMIC ++ ++ while (n-- > 0) { ++ wait_on_buffer(bh[n]); ++ /* TODO: Check for error. */ ++ brelse(bh[n]); ++ } ++ } ++ return 0; ++} ++ ++ ++/* Returns positive if specified cluster is compressed, ++ zero if not, ++ -errno if an error occurred. ++ ++ If you need the result to be accurate, then down i_sem before ++ calling this, and don't raise i_sem until after you've used the ++ result. */ ++int ext2_cluster_is_compressed_fn(struct inode *inode, unsigned cluster) ++{ ++ unsigned block = (ext2_cluster_block0(inode, cluster) ++ + ext2_cluster_nblocks(inode, cluster) ++ - 1); ++ struct ext2_bkey key; ++ int result; ++ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); ++ ++ /* impl: Not all callers of ext2_cluster_is_compressed_fn() have ++ i_sem down. 
Of course it is impossible to guarantee ++ up-to-date information for such callers (someone may ++ compress or decompress between when we check and when they ++ use the information), so hopefully it won't matter if the ++ information we return is slightly inaccurate (e.g. because ++ someone is de/compressing the cluster while we check). */ ++ if (!ext2_get_key(&key, inode, block)) ++ return -EIO; ++ ++ result = (ext2_get_key_blkaddr(&key) == EXT2_COMPRESSED_BLKADDR); ++ ext2_free_key(&key); ++ return result; ++} ++ ++ ++/* Support for the GETCOMPRRATIO ioctl() call. We calculate how many ++ blocks the file would hold if it weren't compressed. This requires ++ reading the cluster head for every compressed cluster. ++ ++ Returns either -EAGAIN or the number of blocks that the file would ++ take up if uncompressed. */ ++int ext2_count_blocks(struct inode *inode) ++{ ++ struct buffer_head *head_bh; ++ int count; ++ int cluster; ++ struct ext2_bkey key; ++ u32 end_blknr; ++ ++ if (!(EXT2_I(inode)->i_flags & EXT2_COMPRBLK_FL)) ++ return inode->i_blocks; ++ ++ mutex_lock(&inode->i_mutex); ++ end_blknr = ROUNDUP_RSHIFT(inode->i_size, ++ inode->i_sb->s_blocksize_bits); ++ ++ /* inode->i_blocks is stored in units of 512-byte blocks. It's ++ more convenient for us to work in units of s_blocksize. */ ++ { ++ u32 shift = inode->i_sb->s_blocksize_bits - 9; ++ ++ count = inode->i_blocks; ++ if (count & ((1 << shift) - 1)) ++ ext2_msg(inode->i_sb, ++ "ext2_count_blocks", ++ "i_blocks not multiple of blocksize"); ++ count >>= shift; ++ } ++ ++ cluster = 0; ++ if (!ext2_get_key(&key, inode, 0)) { ++ count = -EIO; ++ goto out; ++ } ++ while (key.block < end_blknr) { ++ u32 head_blkaddr = ext2_get_key_blkaddr(&key); ++ ++ /* bug fix: init head_bh for each iteration TLL 2/21/07 */ ++ head_bh = NULL; ++ if (head_blkaddr == EXT2_COMPRESSED_BLKADDR) { ++ count = -EXT2_ECOMPR; ++ break; ++ } ++ if (!ext2_next_key(&key, ext2_cluster_nblocks(inode, cluster) - 1)) ++ break; ++ if (ext2_get_key_blkaddr(&key) == EXT2_COMPRESSED_BLKADDR) { ++ struct ext2_cluster_head *head; ++ ++ if (head_blkaddr == 0) { ++ count = -EXT2_ECOMPR; ++ break; ++ } ++ head_bh = __getblk(inode->i_sb->s_bdev, ++ head_blkaddr, inode->i_sb->s_blocksize); ++ if (head_bh == NULL) { ++ /* Hmm, EAGAIN or EIO? 
*/ ++ count = -EAGAIN; ++ break; ++ } ++ if (!buffer_uptodate(head_bh)) ++ ll_rw_block(READ, 1, &head_bh); ++ ++ CHECK_NOT_ATOMIC ++ ++ wait_on_buffer(head_bh); ++ ++#ifdef CONFIG_HIGHMEM ++ if (!page_address(head_bh->b_page)) { ++ BUG(); ++ } ++#endif ++ ++ head = (struct ext2_cluster_head *) head_bh->b_data; ++ /* remove clen > ulen test TLL 2/21/07 */ ++ if ((head->magic != cpu_to_le16(EXT2_COMPRESS_MAGIC_04X)) ++ || (le32_to_cpu(head->ulen) > EXT2_MAX_CLUSTER_BYTES) ++ || (head->holemap_nbytes > 4)) { ++ count = -EXT2_ECOMPR; ++ break; ++ } ++ assert(sizeof(struct ext2_cluster_head) == 16); ++ count += (ROUNDUP_RSHIFT(le32_to_cpu(head->ulen), ++ inode->i_sb->s_blocksize_bits) ++ - ROUNDUP_RSHIFT((le32_to_cpu(head->clen) ++ + sizeof(struct ext2_cluster_head) ++ + head->holemap_nbytes), ++ inode->i_sb->s_blocksize_bits)); ++ brelse(head_bh); ++ head_bh = NULL; ++ } ++ ++ if (!ext2_next_key(&key, 1)) ++ break; ++ cluster++; ++ } ++ ext2_free_key(&key); ++ if (head_bh != NULL) ++ brelse(head_bh); ++ out: ++ mutex_unlock(&inode->i_mutex); ++ if (count == -EXT2_ECOMPR) { ++ ext2_msg(inode->i_sb, ++ "ext2_count_blocks", ++ "invalid compressed cluster %u of inode %lu", ++ cluster, inode->i_ino); ++ EXT2_I(inode)->i_flags |= EXT2_ECOMPR_FL; ++ } ++ ++ /* The count should be in units of 512 (i.e. 1 << 9) bytes. */ ++ if (count >= 0) ++ count <<= inode->i_sb->s_blocksize_bits - 9; ++ return count; ++} ++ ++ ++/* Decompress some blocks previously obtained from a cluster. ++ Decompressed data is stored in ext2_rd_wa.u. Buffer heads in the bh ++ array are packed together at the begining of the array. The ulen ++ argument is an indication of how many bytes the caller wants to ++ obtain, excluding holes. (This can be less than head->ulen, as in the ++ case of readpage.) No hole processing is done; we don't even look at ++ head->holemap. ++ ++ Note the semantic difference between this and ++ (): the latter decompresses a cluster _and ++ stores it as such_, whereas ext2_decompress_blocks() just ++ decompresses the contents of the blocks into ext2_rd_wa.u. ++ ++ The working area is supposed to be available and locked. ++ ++ Returns a negative value on failure, the number of bytes ++ decompressed otherwise. ++ ++ Called by : ++ ++ ext2_decompress_cluster () [sem down] ++ ext2_readpage () [sem down, but only ifndef EXT2_LOCK_BUFFERS] */ ++ ++/* TODO: ext2_decompress_blocks() scribbles in ext2_rd_wa.c. ++ Check callers to make sure this isn't a problem. */ ++ ++/* mw: caller must already have done: "get_cpu_var(ext2_rd_wa)" */ ++size_t ++ext2_decompress_blocks(struct inode * inode, ++ struct buffer_head ** bh, ++ int nblk, size_t ulen, u32 cluster) ++{ ++ struct ext2_cluster_head *head; ++ int count, src_ix, x; ++ unsigned char *dst; ++ unsigned meth, alg; ++ char bdn[BDEVNAME_SIZE]; ++ ++#ifdef EXT2_COMPR_DEBUG ++ assert(in_atomic()); ++ assert(atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */ ++#endif ++ ++ /* ++ We pack the buffer together before (and must take care ++ not to duplicate the buffer heads in the array). ++ ++ pjm 1998-01-09: Starting from e2compr-0.4.0, they should ++ already be packed together in the blkaddr array. TODO: ++ Insert appropriate assert() statements checking tht this is ++ the case. TODO: Check that callers have bh[] packed. 
*/ ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c("ext2_decompress_blocks: nblk=%d\n", nblk); ++#endif ++ for (src_ix = 0, x = 0; src_ix < nblk; src_ix++) { ++ if (bh[src_ix] == NULL) ++ printk("no_bheader()\n"); ++ if ((bh[src_ix] != NULL) && (bh[src_ix]->b_blocknr != 0)) { ++ ++ if (x < src_ix) { ++ ext2_msg(inode->i_sb, "bad buffer table", ++ "inode = %lu", inode->i_ino); ++ goto error; ++ } ++ x++; ++ } ++ } ++ ++ nblk = x; ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("ext2_decompress_blocks (2): nblk=%d\n", nblk); ++#endif ++ if (nblk == 0) { ++ ext2_msg(inode->i_sb, "no block in cluster", "inode = %lu", ++ inode->i_ino); ++ goto error; ++ } ++ ++ restore_b_data_himem(bh[0]); ++ head = (struct ext2_cluster_head *) (bh[0]->b_data); ++ ++ /* ++ * Do some consistency checks. ++ */ ++ ++ if (head->magic != cpu_to_le16(EXT2_COMPRESS_MAGIC_04X)) { ++ ext2_msg(inode->i_sb, ++ "bad magic number", ++ "inode = %lu, magic = %#04x", ++ inode->i_ino, le16_to_cpu(head->magic)); ++ goto error; ++ } ++#if EXT2_GRAIN_SIZE & (EXT2_GRAIN_SIZE - 1) ++# error "This code assumes EXT2_GRAIN_SIZE to be a power of two." ++#endif ++ /* The macro also assumes that _a > 0, _b > 0. */ ++#define ROUNDUP_GE(_a, _b, _d) ( ( ((_a) - 1) \ ++ | ((_d) - 1)) \ ++ >= ( ((_b) - 1) \ ++ | ((_d) - 1))) ++ ++ //mw: following 3 just for debugging!!! ++ assert(!((le32_to_cpu(head->ulen) > EXT2_MAX_CLUSTER_BYTES))); ++ assert(!((head->clen == 0))); ++ assert(!(ROUNDUP_GE(le32_to_cpu(head->clen) ++ + head->holemap_nbytes + sizeof(struct ext2_cluster_head), ++ le32_to_cpu(head->ulen), EXT2_GRAIN_SIZE))); ++ ++ if ((le32_to_cpu(head->ulen) > EXT2_MAX_CLUSTER_BYTES) ++ || (head->clen == 0) ++ || ROUNDUP_GE(le32_to_cpu(head->clen) ++ + head->holemap_nbytes ++ + sizeof(struct ext2_cluster_head), ++ le32_to_cpu(head->ulen), EXT2_GRAIN_SIZE)) { ++ ext2_msg(inode->i_sb, ++ "invalid cluster len", ++ "inode = %lu, len = %u:%u", ++ inode->i_ino, ++ le32_to_cpu(head->clen), le32_to_cpu(head->ulen)); ++ goto error; ++ } ++#undef ROUNDUP_GE ++ ++ /* TODO: Test for `nblk != 1 + ...' instead of the current ++ one-sided test. However, first look at callers, and make ++ sure that they handle the situation properly (e.g. freeing ++ unneeded blocks) and tht they always pass a correct ++ value for nblk. */ ++ if (nblk <= ((le32_to_cpu(head->clen) ++ + head->holemap_nbytes + sizeof(struct ext2_cluster_head) ++ - 1) ++ / bh[0]->b_size)) { ++ int i; ++ ext2_msg(inode->i_sb, ++ "missing blocks", ++ "inode = %lu, blocks = %d/%u", ++ inode->i_ino, nblk, ((le32_to_cpu(head->clen) ++ + head->holemap_nbytes ++ + sizeof(struct ext2_cluster_head) ++ - 1) ++ / bh[0]->b_size) + 1); ++ printk("i_size=%d\n", (int) inode->i_size); ++ for (i = 0; i < 12; i++) ++ printk("i_data[%d]=%d\n", i, EXT2_I(inode)->i_data[i]); ++ printk("cluster_head (sizeof head=%u):\n\tmagic=0x%4x\n\tmethod=%d\n\t \ ++ holemap_nbytes=%d\n\tulen=%d\n\tclen=%d\n\tbh->b_size=%zu\n", ++ sizeof(struct ext2_cluster_head), head->magic, ++ (int) head->method, (int) head->holemap_nbytes, head->ulen, ++ head->clen, bh[0]->b_size); ++ goto error; ++ } ++ ++ /* I moved it here in case we need to load a module that ++ * needs more heap that is currently allocated. ++ * In such case "init_module" for that algorithm forces ++ * re-allocation of ext2_wa. It should be safe here b/c the ++ * first reference to ext2_wa comes just after and we have ++ * locked ext2_wa before. ++ * ++ * FIXME: Totally separate working areas for reading and writing. ++ * Jan R. 
++ */ ++ meth = head->method; /* only a byte, so no swabbing needed. */ ++ if (meth >= EXT2_N_METHODS) { ++ ext2_msg(inode->i_sb, ++ "Ass: illegal method id", ++ "inode = %lu, id = %u", inode->i_ino, meth); ++ dump_stack(); ++ goto error; ++ } ++ alg = ext2_method_table[meth].alg; ++ ++ /* ++ * Adjust the length if too many bytes are requested. ++ * ++ * TODO: Traiter les bitmaps ici, et non plus au niveau de ++ * l'appelant. Faire un petit cache en memorisant le ++ * numero du dernier noeud decompresse et du dernier ++ * cluster. Le pb, c'est qu'on ne peut pas savoir si ++ * les blocs ont ete liberes et realloue entre temps ++ * -> il faut etre prevenu pour invalider le buffer. ++ * ++ * pjm fixme tr: Take care of the bitmaps here, ++ * instead of by the caller as we currently do. Keep ++ * a small cache that holds the number of the ++ * previous to have been ++ * decompressed. The problem is that we have no way ++ * of knowing whether the blocks have been freed and ++ * reallocated in the meantime / since last time -> ++ * we must be informed so that we can invalidate the ++ * buffer. */ ++ if (ulen > le32_to_cpu(head->ulen)) { ++ memset(__get_cpu_var(ext2_rd_wa)->u + le32_to_cpu(head->ulen), 0, ulen - le32_to_cpu(head->ulen)); ++ ulen = le32_to_cpu(head->ulen); ++ ++ assert((bh[0]->b_size & (bh[nblk - 1]->b_size - 1)) == 0); ++ if (((le32_to_cpu(head->clen) ++ + head->holemap_nbytes + sizeof(struct ext2_cluster_head) ++ - 1) ++ | (bh[0]->b_size - 1)) ++ >= ((ulen - 1) | (bh[0]->b_size - 1))) { ++ printk(KERN_WARNING ++ "ext2_decompress_blocks: " ++ "ulen (=%zu) or clen (=%u) wrong " ++ "in dev %s, inode %lu.\n", ++ ulen, le32_to_cpu(head->clen), ++ bdevname(inode->i_sb->s_bdev, bdn), inode->i_ino); ++ goto error; ++ } ++ } ++ ++ /* ++ * Now, decompress data. ++ */ ++ /* TODO: Is this (ulen == 0) possible? */ ++ if (ulen == 0) ++ return 0; ++ ++ for (x = 0, dst = __get_cpu_var(ext2_rd_wa)->c; x < nblk; dst += bh[x++]->b_size) { ++ restore_b_data_himem(bh[x]); ++ memcpy(dst, bh[x]->b_data, bh[x]->b_size); ++ } ++ ++ ++ if (!ext2_algorithm_table[alg].avail) { ++ ext2_msg(inode->i_sb, ++ "ext2_decompress_blocks", ++ "algorithm `%s' not available for inode %lu", ++ ext2_algorithm_table[alg].name, inode->i_ino); ++ ext2_mark_algorithm_use(inode, alg); ++ goto error; ++ } ++ ++ ++#ifdef EXT2_COMPR_DEBUG ++ { ++ struct ext2_cluster_head *wa1head = (struct ext2_cluster_head *) __get_cpu_var(ext2_rd_wa)->c; ++ unsigned clen = le32_to_cpu(wa1head->clen); ++ if (wa1head->checksum != ++ cpu_to_le32(ext2_adler32 ++ (le32_to_cpu(*(u32 *) __get_cpu_var(ext2_rd_wa)->c), ++ __get_cpu_var(ext2_rd_wa)->c + 8, ++ (sizeof(struct ext2_cluster_head) - 8 + ++ head->holemap_nbytes + clen)))) ++ { ++ head->checksum = cpu_to_le32(0); ++ ext2_msg(inode->i_sb, "ext2_decompress_blocks: corrupted compressed data ", ++ "in inode %lu", inode->i_ino); ++ //goto error; ++ //mw: we try to go on. if data is corrupt we will get an compression error anyway. ++ } ++ } ++#endif ++ ++ count = ext2_algorithm_table[alg].decompress(__get_cpu_var(ext2_rd_wa)->c + ++ sizeof(struct ++ ext2_cluster_head) + ++ head->holemap_nbytes, ++ __get_cpu_var(ext2_rd_wa)->u, ++ __get_cpu_var(ext2_rd_wa)->heap, ++ le32_to_cpu(head->clen), ulen, ++ ext2_method_table[meth].xarg); ++ ++ /* If we got fewer than ulen bytes, there is a problem, since ++ we corrected the ulen value before decompressing. Note ++ that it's OK for count to exceed ulen, because ulen can be ++ less than head->ulen. 
*/ ++ if ((count < ulen) || (count != le32_to_cpu(head->ulen))) { ++ ext2_msg(inode->i_sb, ++ "ext2_decompress_blocks: corrupted compressed data ", "inode = %lu, count = %u of %zu (%u/%u)", ++ inode->i_ino, count, ulen, le32_to_cpu(head->clen), le32_to_cpu(head->ulen)); ++ goto error; ++ } ++ ext2_ensure_algorithm_use(inode, alg); ++ return count; ++ ++ error: ++ ++ /* Raise the ECOMPR flag for this file. What this means is ++ that the file cannot be written to, and can only be read if ++ the user raises the NOCOMPR flag. ++ ++ pjm 1997-01-16: I've changed it so that files with ECOMPR ++ still have read permission, so user can still read the rest ++ of the file but get an I/O error (errno = EXT2_ECOMPR) when ++ they try to access anything from this cluster. */ ++ ++ EXT2_I(inode)->i_flags |= EXT2_ECOMPR_FL; ++ ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty_sync(inode); ++ /* pjm 1998-02-21: We used to do `memset(ext2_rd_wa.u, 0, ulen)' ++ here because once upon a time the user could sometimes see ++ buf contents. I believe that this can never happen any ++ more. */ ++ return -EXT2_ECOMPR; ++} ++ ++ ++/* ext2_calc_free_ix: Calculates the position of the C_NBLK'th non-hole ++ block; equals C_NBLK plus the number of holes in the first CALC_FREE_IX() ++ block positions of the cluster. ++ ++ pre: 1 =< c_nblk < EXT2_MAX_CLUSTER_BLOCKS, ++ Number of 1 bits in ,ubitmap` > ,c_nblk`. ++ post: c_nblk =< calc_free_ix() < EXT2_MAX_CLUSTER_BLOCKS ++ ++ Called by: ++ ext2_decompress_cluster() ++ ext2_file_write() ++ ++ TODO: Have ext2_compress_cluster() call this. ++ */ ++unsigned ext2_calc_free_ix(unsigned holemap_nbytes, u8 const *holemap, ++ unsigned c_nblk) ++{ ++ unsigned i; ++ ++ assert(1 <= c_nblk); ++ assert(c_nblk < EXT2_MAX_CLUSTER_BLOCKS); ++ for (i = 0; (i < holemap_nbytes * 8) && (c_nblk > 0);) { ++ assert(i < EXT2_MAX_CLUSTER_BLOCKS - 1); ++ if ((holemap[i >> 3] & (1 << (i & 7))) == 0) ++ c_nblk--; ++ i++; ++ } ++ i += c_nblk; ++ assert(i < EXT2_MAX_CLUSTER_BLOCKS); ++ return i; ++} ++ ++ ++/* (): Prepare the blkaddr[] array for ++ decompression by moving non-hole blocks to their proper positions ++ (according to ubitmap) and zeroing any other blocks. ++ ++ Returns 0 on success, -errno on error. ++ ++ Note: We assume tht blkaddr[i] won't change under us forall ++ clu_block0 =< i < clu_block0 + clu_nblocks. Holding i_sem should ++ guarantee this. ++ ++ Called by: ++ ext2_decompress_cluster() ++ ext2_file_write() */ ++int ++ext2_unpack_blkaddrs(struct inode *inode, ++ struct buffer_head *bh[], ++ int mmcp, ++ unsigned holemap_nbytes, ++ u8 const *holemap, ++ unsigned c_nblk, ++ unsigned free_ix, ++ unsigned clu_block0, unsigned clu_nblocks) ++{ ++ struct ext2_bkey key; ++ u32 *blkaddr; ++ unsigned si, di; ++ ++ assert(clu_nblocks <= EXT2_MAX_CLUSTER_BLOCKS); ++ assert(1 <= c_nblk); ++ assert(c_nblk <= free_ix); ++ assert(free_ix < EXT2_MAX_CLUSTER_BLOCKS); ++ if (!ext2_get_key(&key, inode, clu_block0)) ++ return -EIO; ++ ++ if (key.ptr[key.level] == NULL) { ++ /* TODO: Call ext2_error(). */ ++ ext2_free_key(&key); ++ return -EIO; ++ } ++ ++ /* impl: Note tht we're relying on clusters not straddling ++ address block boundaries. 
*/ ++ blkaddr = &key.ptr[key.level][key.off[key.level]]; ++ memset(blkaddr + free_ix, ++ 0, sizeof(*blkaddr) * (clu_nblocks - free_ix)); ++ si = c_nblk; ++ for (di = free_ix; di > si;) { ++ --di; ++ if (((di >> 3) < holemap_nbytes) ++ && (holemap[di >> 3] & (1 << (di & 7)))) { ++ blkaddr[di] = 0; ++ bh[di]->b_blocknr = 0; ++ clear_bit(BH_Mapped, &bh[di]->b_state); ++ } else { ++ if (si == 0) { ++ break; ++ } ++ blkaddr[di] = blkaddr[--si]; ++ assert(bh[di]->b_blocknr == 0); ++ assert(bh[si]->b_blocknr != 0); ++ assert(buffer_mapped(bh[si])); ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("unpack: di=%d sts=0x%x si=%d blk=%ld sts=0x%x\n", ++ di, (int) bh[di]->b_state, si, bh[si]->b_blocknr, ++ (int) bh[si]->b_state); ++#endif ++ bh[di]->b_blocknr = bh[si]->b_blocknr; ++ set_bit(BH_Mapped, &bh[di]->b_state); ++ bh[si]->b_blocknr = 0; ++ clear_bit(BH_Mapped, &bh[si]->b_state); ++ set_bit(BH_Uptodate, &bh[di]->b_state); ++ if (mmcp) { ++ restore_b_data_himem(bh[si]); ++ restore_b_data_himem(bh[di]); ++ memcpy(bh[di]->b_data, bh[si]->b_data, ++ inode->i_sb->s_blocksize); ++ } ++ } ++ } ++ if (key.level > 0) ++ mark_buffer_dirty(key.ibh[key.level]); ++ return ext2_free_key(&key); ++} ++ ++ ++/* ++ * Decompress one cluster. If already compressed, the cluster ++ * is decompressed in place, and the compress bitmap is updated. ++ * ++ * Returns the size of decompressed data on success, a negative ++ * value in case of failure, or 0 if the cluster was not compressed. ++ * ++ * The inode is supposed to be writable. ++ * ++ * Called by : ++ * ++ * ext2_decompress_inode() [sem down] ++ * ext2_file_write() [sem down] ++ * trunc_bitmap() [sem down] ++ */ ++int ext2_decompress_cluster(struct inode *inode, u32 cluster) ++{ ++ struct buffer_head *bh[EXT2_MAX_CLUSTER_BLOCKS]; ++ struct buffer_head *bhc[EXT2_MAX_CLUSTER_BLOCKS]; ++ struct page *pg[EXT2_MAX_CLUSTER_PAGES], *epg[EXT2_MAX_CLUSTER_PAGES]; ++ int result, nbh; ++ unsigned npg, c_nblk; ++ struct ext2_cluster_head *head; ++ int i = 0; ++ unsigned free_ix, clu_block0, clu_nblocks; ++ int d_npg = -1; /* number of decompressed page */ ++ unsigned long allpagesuptodate = 1; ++ struct buffer_head *bh_writeout[EXT2_MAX_CLUSTER_BLOCKS]; ++ int bhn_writeout; ++#ifdef CONFIG_HIGHMEM ++ int kmapped = 0; ++#endif ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_BLOCKS; i++) { ++ bh_writeout[i] = NULL; ++ bhn_writeout = 0; ++ } ++ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */ ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) ++ epg[i] = NULL; ++ ++ /* ++ Get blocks from cluster. ++ Assign to variables head, ubitmap, clu_block0, clu_nblocks. ++ Shuffle blkaddr[] array and write zero to holes. ++ Allocate new blocks. ++ Get the working area. ++ Decompress. ++ Copy to bh[]->b_data (marking buffers uptodate and dirty). ++ Release working area. ++ Release bh[]. 
++ */ ++ ++ nbh = 0; ++ npg = ext2_cluster_npages(inode, cluster); ++ result = ext2_get_cluster_pages(inode, cluster, pg, NULL, 0); ++ if (result <= 0) { ++ for (i = 0; i < npg; i++) ++ epg[i] = NULL; ++ goto out_err; ++ } ++ ++ for (i = 0; i < npg; i++) { ++ if ((pg[i]->index <= ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) && ++ !PageUptodate(pg[i])) { ++ allpagesuptodate = 0; ++ } ++ } ++ if (allpagesuptodate) { ++ //printk("DecompressPages: Ino:%lu\n", inode->i_ino); ++ result = ext2_decompress_pages(inode, cluster, pg); ++ if (result != 0) { ++ for (i = 0; i < npg; i++) ++ epg[i] = NULL; ++ if (result > 0) ++ goto cleanup; ++ else ++ goto out_err; ++ } ++ /*mw: if we continue here then in ext2_decompress_pages ++ * not all pages were up-to-date ++ */ ++ } ++ //printk("DecompressCluster: Ino:%lu\n", inode->i_ino); ++ result = ext2_get_cluster_extra_pages(inode, cluster, pg, epg); ++ if (result <= 0) { ++ goto out_err; ++ } ++#ifdef CONFIG_HIGHMEM ++ ext2_kmap_cluster_pages(NULL, pg, epg); ++ kmapped = 1; ++#endif ++ ++ result = ext2_get_cluster_blocks(inode, cluster, bh, pg, epg, 0); ++ if (result <= 0) { ++ goto out_err; ++ } ++ nbh = c_nblk = result; ++ ++ ++#ifdef EXT2_COMPR_REPORT ++ { ++ int j; ++ printk ++ (" > > > ext2_decompress_cluster %d: inode=%ld, size=%d nbh=%d\n", ++ cluster, inode->i_ino, (int) inode->i_size, nbh); ++#ifdef EXT2_COMPR_REPORT_VERBOSE ++ for (j = 0; j < nbh; j++) { ++ if (bh[j]) { ++ printk("0buffer_head[%d]: blocknr=%lu, addr=%p \n", j, ++ (unsigned long) bh[j]->b_blocknr, bh[j]); ++ if (bh[j]->b_page) ++ printk("0:[page->index=%ld]\n", bh[j]->b_page->index); ++ else ++ printk("[No page]\n"); ++ } else ++ printk("buffer_head[%d] is NULL\n", j); ++ } ++ while ((j < EXT2_MAX_CLUSTER_BLOCKS) && (bh[j] != NULL) && bh[j]->b_blocknr) { /*Add by Yabo Ding */ ++ printk ++ ("buffer_head[%d] is free but not NULL: blocknr=%lu, addr=%p\n", ++ j, (unsigned long) bh[j]->b_blocknr, bh[j]); ++ j++; ++ } ++#endif ++ } ++#endif ++ for (i = 0; i < nbh; i++) ++ assert(bh[i]->b_blocknr != 0); ++ ++ restore_b_data_himem(bh[0]); ++ ++ head = (struct ext2_cluster_head *) bh[0]->b_data; ++ if (head->magic != cpu_to_le16(EXT2_COMPRESS_MAGIC_04X)) { ++ ext2_msg(inode->i_sb, ++ "ext2_decompress_cluster: bad magic number", ++ "cluster %d: inode = %lu, magic = %#04x", ++ cluster, inode->i_ino, le16_to_cpu(head->magic)); ++ EXT2_I(inode)->i_flags |= EXT2_ECOMPR_FL; ++ result = -EXT2_ECOMPR; ++ goto out_err; ++ } ++ if (le32_to_cpu(head->ulen) - ++ (c_nblk << inode->i_sb->s_blocksize_bits) <= 0) { ++ ext2_error(inode->i_sb, "ext2_decompress_cluster", ++ "ulen too small for c_nblk. ulen=%u, c_nblk=%u, bs=%lu", ++ le32_to_cpu(head->ulen), c_nblk, ++ inode->i_sb->s_blocksize); ++ EXT2_I(inode)->i_flags |= EXT2_ECOMPR_FL; ++ result = -EXT2_ECOMPR; ++ goto out_err; ++ } ++ free_ix = ++ ext2_calc_free_ix(head->holemap_nbytes, (u8 const *) (&head[1]), ++ c_nblk); ++ clu_block0 = ext2_cluster_block0(inode, cluster); ++ clu_nblocks = ext2_cluster_nblocks(inode, cluster); ++ ext2_unpack_blkaddrs(inode, bh, 1, ++ head->holemap_nbytes, (u8 const *) (&head[1]), ++ c_nblk, free_ix, clu_block0, clu_nblocks); ++ ++ /* Allocate the extra blocks needed. 
*/ ++ { ++ int data_left = le32_to_cpu(head->ulen); ++ ++ data_left -= c_nblk << inode->i_sb->s_blocksize_bits; ++ assert(data_left > 0); ++ for (i = free_ix; i < clu_nblocks; i++) ++ if (((i >> 3) >= head->holemap_nbytes) ++ || !(head->holemap[i >> 3] & (1 << (i & 7)))) { ++ result = ext2_get_block(inode, ++ clu_block0 + i, ++ bh[i], 1 /* create */ ); ++ if (bh[i]->b_blocknr == 0) ++ goto out_err; ++ d_npg = ++ (i >> ++ (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) + ++ 1; ++ nbh++; ++ data_left -= inode->i_sb->s_blocksize; ++ if (data_left <= 0) ++ break; ++ } ++ } ++ ++ /* jmr 1998-10-28 Hope this is the last time I'm moving this code. ++ * Module loading must be done _before_ we lock wa, just think what ++ * can happen if we reallocate wa when somebody else uses it... ++ */ ++ { ++ unsigned meth; ++#ifdef CONFIG_KMOD ++ unsigned alg; ++#endif ++ ++ meth = head->method; /* only a byte, so no swabbing needed. */ ++ if (meth >= EXT2_N_METHODS) { ++ ext2_msg(inode->i_sb, ++ "Ass.: illegal method id", ++ "inode = %lu, id = %u", inode->i_ino, meth); ++ result = -EXT2_ECOMPR; ++ goto out_err; ++ } ++#ifdef CONFIG_KMOD ++ alg = ext2_method_table[meth].alg; ++ if (!ext2_algorithm_table[alg].avail) { ++ char str[32]; ++ ++ sprintf(str, "ext2-compr-%s", ext2_algorithm_table[alg].name); ++ request_module(str); ++ } ++#endif ++ } ++ ++ result = -EINTR; ++ ++ /* ++ * Then, decompress and copy back data. ++ */ ++ { ++ int ic; ++ ++ for (ic = 0, i = 0; i < clu_nblocks; i++) { ++ if (bh[i]->b_blocknr != 0) { ++ bhc[ic] = bh[i]; ++ ic++; ++ if (ic == c_nblk) { ++ break; ++ } ++ } ++ } ++ } ++ ++ ++#ifdef EXT2_COMPR_REPORT_WA ++ printk(KERN_DEBUG "pid %d locks wa\n", current->pid); ++#endif ++ if (get_cpu_var(ext2_rd_wa) == NULL) ++ { ++ ext2_alloc_rd_wa(); ++ } ++ assert(__get_cpu_var(ext2_rd_wa) != NULL); ++ ++ result = ext2_decompress_blocks(inode, bhc, c_nblk, ++ le32_to_cpu(head->ulen), cluster); ++ if (result != (int) le32_to_cpu(head->ulen)) { ++ if (result >= 0) { ++ /* I think this is impossible, as ++ ext2_decompress_blocks() checks against ++ head->ulen. */ ++ printk(KERN_WARNING "Unexpected return value %d " ++ "from ext2_decompress_blocks()\n", result); ++ result = -EXT2_ECOMPR; ++ } ++ ++#ifdef EXT2_COMPR_REPORT_WA ++ printk(KERN_DEBUG "pid %d unlocks wa\n", current->pid); ++#endif ++ put_cpu_var(ext2_rd_wa); ++ goto out_err; ++ } ++ ++#ifdef EXT2_COMPR_REPORT ++ printk(KERN_DEBUG "ext2: %04x:%lu: cluster %d+%d [%d] " ++ "decompressed into %d bytes\n", ++ inode->i_rdev, ++ inode->i_ino, clu_block0, clu_nblocks, c_nblk, result); ++#endif ++ ++ /* Copy back decompressed data. 
*/ ++ { ++ int count = result; ++ unsigned char const *src; ++ int c, p; ++ int cbh; ++ int n; /* block index in page */ ++ struct buffer_head *bp; ++ unsigned addr0, b_start, b_end; ++ ++ assert(count > 0); ++ if (d_npg == -1) { ++ d_npg = ((count - 1) >> PAGE_CACHE_SHIFT) + 1; ++ } ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c ++ ("ext2_decompress_cluster: cnt=%d free_ix=%d d_npg=%d nbh=%d\n", ++ count, free_ix, d_npg, nbh); ++#endif ++ result = -EXT2_ECOMPR; ++ src = __get_cpu_var(ext2_rd_wa)->u; ++ cbh = 0; ++ for (c = 0; c < clu_nblocks; c++) { ++ ++ if (bh[c]->b_blocknr == 0) { ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("\t clear buf %d sts=0x%x\n", c, ++ (int) bh[c]->b_state); ++#endif ++ restore_b_data_himem(bh[c]); ++ memset(bh[c]->b_data, 0, inode->i_sb->s_blocksize); ++ continue; ++ } ++ if (cbh >= (nbh - 1)) { ++ break; ++ } ++ if (count < inode->i_sb->s_blocksize) { ++ put_cpu_var(ext2_rd_wa); ++ goto out_err; ++ } ++ cbh++; ++ count -= inode->i_sb->s_blocksize; ++ p = c >> (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); ++ if (!PageUptodate(pg[p])) { ++ addr0 = (clu_block0 << inode->i_sb->s_blocksize_bits); ++ b_start = addr0 + (c << inode->i_sb->s_blocksize_bits); ++ b_end = b_start + inode->i_sb->s_blocksize; ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("\t[%d] sts=0x%x e=%d s=%d sz=%d pg:%lu(%#x)\n", ++ c, (int) bh[c]->b_state, b_end, b_start, ++ (int) inode->i_size, pg[p]->index, ++ (unsigned int) pg[p]); ++#endif ++ if (b_end <= inode->i_size) { ++ /* Block is before end of file, copy data */ ++ restore_b_data_himem(bh[c]); ++ memcpy(bh[c]->b_data, src, inode->i_sb->s_blocksize); ++ ++ } else if (b_start < inode->i_size) { ++ /* Block contains end of file, copy to end */ ++ restore_b_data_himem(bh[c]); ++ memcpy(bh[c]->b_data, src, inode->i_size - b_start); ++ ++ } ++ set_buffer_uptodate(bh[c]); ++ set_buffer_dirty(bh[c]); ++ bh_writeout[bhn_writeout] = bh[c]; //mw ++ bhn_writeout++; //mw ++ } else { ++ //mw: DEBUG. buffer is uptodate now. compress will not reread! an get the compressed data!!! ++ // clear flag in extra page!!! 
++ // clear_bit(BH_Uptodate, &bh[c]->b_state); ++ ++ n = c & ((PAGE_CACHE_SIZE - 1) >> inode->i_sb-> ++ s_blocksize_bits); ++ bp = page_buffers(pg[p]); ++ for (i = 0; i < n; i++) { ++ bp = bp->b_this_page; ++ } ++ result = ext2_get_block(inode, clu_block0 + c, bp, 0); ++ ++ //mw: needed to do a writeback of the non-epg-buffers ++ //no idea how it was done before ++ set_buffer_uptodate(bp); ++ set_buffer_dirty(bp); ++ bh_writeout[bhn_writeout] = bp; //mw ++ bhn_writeout++; //mw ++ ++ if (bp->b_blocknr == 0) { ++ put_cpu_var(ext2_rd_wa); ++ goto out_err; ++ } ++ assert(bp->b_blocknr == bh[c]->b_blocknr); ++ } ++ src += inode->i_sb->s_blocksize; ++ } ++ if (count > inode->i_sb->s_blocksize) { ++ put_cpu_var(ext2_rd_wa); ++ goto out_err; ++ } ++ p = c >> (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); ++ if (!PageUptodate(pg[p])) { ++ addr0 = (clu_block0 << inode->i_sb->s_blocksize_bits); ++ b_start = addr0 + (c << inode->i_sb->s_blocksize_bits); ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("\t[%d] sts=0x%x c=%d s=%d sz=%d pg:%lu(%#x)\n", c, ++ (int) bh[c]->b_state, count, b_start, ++ (int) inode->i_size, pg[p]->index, ++ (unsigned int) pg[p]); ++#endif ++ if (b_start >= inode->i_size) { ++ restore_b_data_himem(bh[c]); ++ memset(bh[c]->b_data, 0, inode->i_sb->s_blocksize); ++ ++ } else { ++ if ((inode->i_size - b_start) < count) { ++ restore_b_data_himem(bh[c]); ++ memcpy(bh[c]->b_data, src, inode->i_size - b_start); ++ memset(bh[c]->b_data + (inode->i_size - b_start), 0, ++ count - (inode->i_size - b_start)); ++ } else { ++ restore_b_data_himem(bh[c]); ++ memcpy(bh[c]->b_data, src, count); ++ } ++ } ++ set_buffer_uptodate(bh[c]); ++ set_buffer_dirty(bh[c]); ++ bh_writeout[bhn_writeout] = bh[c]; //mw ++ bhn_writeout++; //mw ++ } else { ++ assert(epg[p] != NULL); //mw ++ n = c & ((PAGE_CACHE_SIZE - 1) >> inode->i_sb-> ++ s_blocksize_bits); ++ bp = page_buffers(pg[p]); ++ for (i = 0; i < n; i++) { ++ bp = bp->b_this_page; ++ } ++ result = ext2_get_block(inode, clu_block0 + c, bp, 0); ++ ++ //mw: needed to do a writeback of the non-epg-buffers ++ //no idea how it was done before ++ set_buffer_uptodate(bp); ++ set_buffer_dirty(bp); ++ bh_writeout[bhn_writeout] = bp; //mw ++ bhn_writeout++; //mw ++ if (bp->b_blocknr == 0) { ++ put_cpu_var(ext2_rd_wa); ++ goto out_err; ++ } ++ assert(bp->b_blocknr == bh[c]->b_blocknr); ++ } ++ result = (nbh - 1) * inode->i_sb->s_blocksize + count; ++ } ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (pg[i] == NULL) ++ break; ++ if (i < d_npg) ++ SetPageUptodate(pg[i]); ++ } ++ ++#ifdef EXT2_COMPR_REPORT_WA ++ printk(KERN_DEBUG "pid %d unlocks wa\n", current->pid); ++#endif ++ put_cpu_var(ext2_rd_wa); ++ ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty_sync(inode); ++ /* If needed, EXT2_DIRTY_FL is raised by the caller. */ ++ ++#if 0 ++ /* TODO: SYNC */ ++ if (IS_SYNC(inode)) { ++ generic_osync_inode(inode, inode->i_mapping, ++ OSYNC_METADATA | OSYNC_DATA); ++ } ++#endif ++ assert(result >= 0); ++ ++ //Sync out changes: ++ assert(bhn_writeout <= EXT2_MAX_CLUSTER_BLOCKS); ++ assert(bhn_writeout >= 0); ++ ++ //mw: debug ++ for (i = 0; i < bhn_writeout; i++) { ++ if ((!buffer_mapped(bh_writeout[i])) ++ || (bh_writeout[i]->b_bdev == NULL)) { ++ u32 block = ext2_cluster_block0(inode, cluster); ++ ext2_get_block(inode, block + i, bh_writeout[i], 1); ++ //printk("ext2_get_block Block:%lu, Mapped:%i, Page:%lu, bdev: %#x\n", bh_writeout[i]->b_blocknr, (bh_writeout[i]->b_state & BH_Mapped), (bh_writeout[i]->b_page ? 
bh_writeout[i]->b_page->index : 0), bh_writeout[i]->b_bdev ); ++ } ++ assert(buffer_mapped(bh_writeout[i])); ++ assert(bh_writeout[i]->b_bdev != NULL); ++ assert(bh_writeout[i]->b_bdev == inode->i_sb->s_bdev); ++ /*if (bh_writeout[i]->b_bdev == NULL) ++ bh_writeout[i]->b_bdev = inode->i_sb->s_bdev; //fix bdev-bug */ ++ } ++ ++ ll_rw_block(WRITE, bhn_writeout, bh_writeout); ++ //mw: seems we have to wait here, otherwise: crash! ++ ++ CHECK_NOT_ATOMIC ++ for (i = 0; i < bhn_writeout; i++) { ++ if (bh_writeout[i]) ++ wait_on_buffer(bh_writeout[i]); ++ } ++ goto cleanup; ++ ++ out_err: ++ printk("Error in Decompressing cluster: Err=%i\n", result); ++ ++ cleanup: ++ ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(NULL, pg, epg); ++#endif ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (pg[i] == NULL) ++ break; ++ unlock_page(pg[i]); ++ page_cache_release(pg[i]); ++ } ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (epg[i] != NULL) { ++ ClearPageDirty(epg[i]); ++ ClearPageUptodate(epg[i]); ++ try_to_free_buffers(epg[i]); ++ unlock_page(epg[i]); ++ assert(page_count(epg[i]) == 1); ++ page_cache_release(epg[i]); ++ } ++ } ++ ++ /* ++ * Release buffers, don't forget to unlock the locked ones. ++ * pjm 1998-01-14: TO_DO: Locked ones? ++ */ ++ assert(nbh >= 0); ++ assert(nbh <= EXT2_MAX_CLUSTER_BLOCKS); ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c(" < < < ext2_decompress_cluster %d: inode=%ld, res=%i\n", ++ cluster, inode->i_ino, result); ++#endif ++ return result; ++} ++ ++ ++/* ++ * Function to decompress the pages of a cluster. ++ * ++ * Allocate buffers to pages what are not mapped on the device. ++ * ++ * Returns the size of decompressed data on success, a negative ++ * value in case of failure, or 0 if some pages are not uptodate. ++ * ++ * The inode is supposed to be writable. ++ * All the pages must be UPTODATE, ++ */ ++int ext2_decompress_pages(struct inode *inode, u32 cluster, ++ struct page *pg[]) ++{ ++ struct ext2_cluster_head *head; ++ struct buffer_head *bh0; ++ struct buffer_head *bh[EXT2_MAX_CLUSTER_BLOCKS]; ++ unsigned nbh, c_nblk; ++ unsigned free_ix, clu_block0, clu_nblocks; ++ int i, pagesPerCluster, data_left, size = 0; ++ long status = 0; ++ char *dp; ++ struct buffer_head *bh_writeout[EXT2_MAX_CLUSTER_BLOCKS]; ++ int bhn_writeout; ++#ifdef CONFIG_HIGHMEM ++ int kmapped = 0; ++ ++ ext2_kmap_cluster_pages(NULL, pg, NULL); ++ kmapped = 1; ++#endif ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_BLOCKS; i++) { ++ bh_writeout[i] = NULL; ++ bhn_writeout = 0; ++ } ++ ++ /* First, get cluster_head (For this, we need to re-read the first block of ++ the cluster, without overwriting the data of the page the buffer point to... */ ++ /* This suppose that cluster are aligned with PAGE_SIZE... To be improved */ ++ ++ /* Changed by Yabo Ding, ++ The old code cannot reread data from disk to a changed buffers data pointer in 2.6.x. ++ So, I copy memory data(decompressed) to a temporary buffer; ++ Then reread data(compressed) from disk, and copy to head; ++ Then copy back the memory data from temporary buffer. ++ It seems clumsy, but it works well. 
++ */ ++ ++ bh0 = page_buffers(pg[0]); ++ restore_b_data_himem(bh0); ++ ++ head = (struct ext2_cluster_head *) kmalloc(bh0->b_size, GFP_KERNEL); ++ if (head == NULL) { ++ ext2_msg(inode->i_sb, "no more memory", "inode = %lu", ++ inode->i_ino); ++ status = -EIO; ++ goto out_x; ++ } ++ dp = kmalloc(bh0->b_size, GFP_KERNEL); ++ if (dp == NULL) { ++ ext2_msg(inode->i_sb, "no more memory", "inode = %lu", ++ inode->i_ino); ++ kfree(head); ++ status = -EIO; ++ goto out_x; ++ } ++ memcpy(dp, bh0->b_data, bh0->b_size); ++ clear_bit(BH_Uptodate, &bh0->b_state); ++ if (!buffer_mapped(bh0)) { ++ status = ++ ext2_get_block(inode, ext2_cluster_block0(inode, cluster), bh0, ++ 0); ++ if (bh0->b_blocknr == 0) { ++ trace_e2c ++ ("ext2_decompress_pages: ext2_get_block error %ld (cluster = %u)\n", ++ status, cluster); ++ kfree(head); ++ memcpy(bh0->b_data, dp, bh0->b_size); ++ kfree(dp); ++ status = -EIO; ++ goto out; ++ } ++ } ++ ll_rw_block(READ, 1, &bh0); ++ ++ CHECK_NOT_ATOMIC ++ wait_on_buffer(bh0); ++ //printk("RE-Read: Buffer: blocknr:%lu(%#x) \n", bh0->b_blocknr, bh0); ++ if (!buffer_uptodate(bh0)) { /* Read error ??? */ ++ trace_e2c("ext2_decompress_pages: IO error (cluster = %u)\n", ++ cluster); ++ kfree(head); ++ memcpy(bh0->b_data, dp, bh0->b_size); ++ kfree(dp); ++ status = -EIO; ++ goto out; ++ } ++ /* This suppose that cluster are aligned with PAGE_SIZE... To be improved ++ bh0->b_data = page_address(pg[0]); */ ++ memcpy((char *) head, bh0->b_data, bh0->b_size); ++ memcpy(bh0->b_data, dp, bh0->b_size); ++ kfree(dp); ++ ++ if (head->magic != cpu_to_le16(EXT2_COMPRESS_MAGIC_04X)) { ++ ext2_msg(inode->i_sb, ++ "ext2_decompress_pages: bad magic number", ++ "inode = %lu, magic = %#04x", inode->i_ino, ++ le16_to_cpu(head->magic)); ++ kfree(head); ++ status = -EIO; ++ goto out; ++ } ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c("ext2_decompress_pages: clt=%d i=%ld head=0x%x\n", cluster, ++ inode->i_ino, (unsigned) head); ++#endif ++ ++ /* Now, try to do the same as in ext2_decompress_cluster for moving/allocating blocks */ ++ nbh = 0; ++ pagesPerCluster = ext2_cluster_npages(inode, cluster); ++ for (i = 0; i < pagesPerCluster && pg[i]; i++) { ++ assert(PageLocked(pg[i])); ++ //if (!(PageUptodate(pg[i]))) { ++ //mw: do it like ext2_decompress_cluster to handle end of a file correctly ++ if (!(PageUptodate(pg[i])) ++ && (pg[i]->index <= ((inode->i_size - 1) >> PAGE_CACHE_SHIFT))) { ++ kfree(head); ++ printk("should never happen: not all pages uptodate!\n"); //mw ++ status = 0; ++ goto out_x; ++ } ++ } ++ ++ for (i = 0; i < pagesPerCluster && pg[i]; i++) { ++ struct buffer_head *bhead, *bhx; ++ int idx = 0; ++ ++ /* assert(PageUptodate(pg[i])); with ftruncate() can be false */ ++ if (!page_has_buffers(pg[i])) { ++ ClearPageUptodate(pg[i]); /*mw */ ++ ClearPageDirty(pg[i]); /*mw */ ++ assert(0); ++ create_empty_buffers_e2c(pg[i], inode->i_sb->s_blocksize, 0, ++ inode); ++ if (unlikely(!page_has_buffers(pg[i]))) ++ printk("Error: NOMEM!\n"); ++ } ++ bhead = page_buffers(pg[i]); ++ for (bhx = bhead; bhx != bhead || !idx; bhx = bhx->b_this_page) { ++ idx++; ++ bh[nbh] = bhx; ++ nbh++; ++ } ++ } ++ ++ while ((nbh != 0) && (bh[nbh - 1]->b_blocknr == 0)) ++ --nbh; ++ ++ c_nblk = nbh; ++ ++ free_ix = ++ ext2_calc_free_ix(head->holemap_nbytes, (u8 const *) (&head[1]), ++ c_nblk); ++ clu_block0 = ext2_cluster_block0(inode, cluster); ++ clu_nblocks = ext2_cluster_nblocks(inode, cluster); ++ ext2_unpack_blkaddrs(inode, bh, 0, head->holemap_nbytes, ++ (u8 const *) (&head[1]), c_nblk, free_ix, ++ clu_block0, clu_nblocks); 
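/* Editor's note -- illustrative sketch, not part of the patch above.
 * ext2_calc_free_ix(), called just before ext2_unpack_blkaddrs(), walks the
 * per-cluster hole bitmap until c_nblk non-hole positions have been consumed;
 * the value it returns is the slot where the first surplus (freed/zeroed)
 * block of the cluster begins.  The standalone program below replays that
 * walk for a made-up holemap so the arithmetic can be checked in isolation.
 * EXT2_MAX_CLUSTER_BLOCKS is taken as 32 (the holemap handling in the patch
 * only zeroes 32 bits); the sample holemap and block count are invented for
 * the example and do not come from the patch.
 */
#include <assert.h>
#include <stdio.h>

#define EXT2_MAX_CLUSTER_BLOCKS 32

static unsigned calc_free_ix(unsigned holemap_nbytes,
			     const unsigned char *holemap, unsigned c_nblk)
{
	unsigned i;

	assert(1 <= c_nblk && c_nblk < EXT2_MAX_CLUSTER_BLOCKS);
	for (i = 0; i < holemap_nbytes * 8 && c_nblk > 0; i++)
		if (!(holemap[i >> 3] & (1u << (i & 7))))
			c_nblk--;	/* a non-hole position consumes one data block */
	return i + c_nblk;		/* any blocks left over follow the mapped bitmap */
}

int main(void)
{
	unsigned char holemap[1] = { 0x0a };	/* holes at block positions 1 and 3 */

	/* 4 compressed data blocks land at positions 0, 2, 4, 5 -> free_ix is 6 */
	printf("free_ix = %u\n", calc_free_ix(1, holemap, 4));
	return 0;
}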
++ ++ /* Allocate the extra blocks needed. */ ++ data_left = size = le32_to_cpu(head->ulen); ++ ++ data_left -= c_nblk << inode->i_sb->s_blocksize_bits; ++ assert(data_left > 0); ++ for (i = 0; i < free_ix; i++) { ++ if (bh[i]->b_blocknr != 0) { ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("\t [%d] blk=%ld sts=0x%x\n", i, bh[i]->b_blocknr, ++ (int) bh[i]->b_state); ++#endif ++ set_buffer_dirty(bh[i]); ++ bh_writeout[bhn_writeout] = bh[i]; //mw ++ bhn_writeout++; //mw ++ } ++ } ++ ++ for (i = free_ix; i < clu_nblocks; i++) { ++ if (((i >> 3) >= head->holemap_nbytes) ++ || !(head->holemap[i >> 3] & (1 << (i & 7)))) { ++ status = ++ ext2_get_block(inode, clu_block0 + i, bh[i], ++ 1 /* create */ ); ++ if (status || bh[i]->b_blocknr == 0) { ++ status = -EIO; ++ goto out; ++ } ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("\t [%d] blk=%ld sts=0x%x\n", i, bh[i]->b_blocknr, ++ (int) bh[i]->b_state); ++#endif ++ set_bit(BH_Uptodate, &bh[i]->b_state); ++ set_buffer_dirty(bh[i]); ++ bh_writeout[bhn_writeout] = bh[i]; //mw ++ bhn_writeout++; //mw ++ nbh++; ++ data_left -= inode->i_sb->s_blocksize; ++ if (data_left <= 0) ++ break; ++ } ++ } ++ ++ out: ++ kfree(head); ++ ++ out_x: ++ ++ for (i = 0; i < bhn_writeout; i++) { ++ ++ if ((!buffer_mapped(bh_writeout[i])) ++ || (bh_writeout[i]->b_bdev == NULL)) { ++ u32 block = ext2_cluster_block0(inode, cluster); ++ ext2_get_block(inode, block + i, bh_writeout[i], 1); ++ //printk("ext2_get_block Block:%lu, Mapped:%i, Page:%lu, bdev: %#x\n", bh_writeout[i]->b_blocknr, (bh_writeout[i]->b_state & BH_Mapped), (bh_writeout[i]->b_page ? bh_writeout[i]->b_page->index : 0), bh_writeout[i]->b_bdev ); ++ } ++ assert(buffer_mapped(bh_writeout[i])); ++ assert(bh_writeout[i]->b_bdev != NULL); ++ assert(bh_writeout[i]->b_bdev == inode->i_sb->s_bdev); ++ /*if (bh_writeout[i]->b_bdev == NULL) ++ bh_writeout[i]->b_bdev = inode->i_sb->s_bdev; //fix bdev-bug */ ++ } ++ //Sync out changes: ++ ll_rw_block(WRITE, bhn_writeout, bh_writeout); ++ //mw: seems we have to wait here, otherwise: crash! ++ ++ CHECK_NOT_ATOMIC ++ for (i = 0; i < bhn_writeout; i++) { ++ if (bh_writeout[i]) ++ wait_on_buffer(bh_writeout[i]); ++ } ++ ++ ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(NULL, pg, NULL); ++#endif ++ ++ return (status ? status : size); ++} ++ ++ ++/* Decompress every cluster that is still compressed. ++ We stop and return -ENOSPC if we run out of space on device. ++ ++ The caller needs to check for EXT2_COMPRBLK_FL before calling. ++ ++ Returns 0 on success, -errno on failure. ++ ++ Called by ext2_ioctl(). */ ++int ext2_decompress_inode(struct inode *inode) ++{ ++ u32 cluster; ++ u32 n_clusters; ++ int err = 0; ++ struct ext2_inode_info *ei = EXT2_I(inode); ++ ++ assert(ei->i_flags & EXT2_COMPRBLK_FL); ++ ++ /* Quotas aren't otherwise kept if file is opened O_RDONLY. */ ++ dquot_initialize(inode); ++ ++ //mutex_lock(&inode->i_mutex); /* MW 5-16-07 */ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */ ++ err = 0; ++ /* This test can succeed because down() (and I think DQUOT_INIT) can block. */ ++ if (!(ei->i_flags & EXT2_COMPRBLK_FL)) ++ goto out; ++ ++ n_clusters = ext2_n_clusters(inode); ++ for (cluster = 0; cluster < n_clusters; cluster++) { ++ err = ext2_cluster_is_compressed_fn(inode, cluster); ++ if (err > 0) { ++ err = ext2_decompress_cluster(inode, cluster); ++ /* If we later get an error, we'll need to recompress. 
*/ ++ ei->i_flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ } ++ if (err < 0) ++ goto error; ++ } ++ assert(err >= 0); ++ err = 0; ++ ei->i_flags &= ~(EXT2_COMPRBLK_FL | EXT2_DIRTY_FL); ++ ei->i_compr_flags &= ~EXT2_CLEANUP_FL; ++ error: ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty_sync(inode); ++ out: ++// mutex_unlock(&inode->i_mutex); /* MW 5-16-07 */ ++ return err; ++} ++ ++ ++/* ++ TODO: SECRM_FL ++ ++ TODO: Avant de liberer les blocs, regarder si le compteur ++ est a 1, et marquer le noeud si ce n'est pas le cas ++ (pour preparer la recompression immediate). ++ ++ pjm fixme translation. ++ "Before freeing the blocks, check if the counter is 1, ++ and mark the inode if not (in order to prepare for ++ immediate recompression)." */ ++ ++/* This is called by ext2_compress_cluster to free the blocks now ++ available due to compression. We free ,nb` blocks beginning with ++ block ,block`. We set the address of each freed block to ++ EXT2_COMPRESSED_BLKADDR, thus marking the cluster as compressed. ++ N.B. It is up to the caller to adjust i_blocks. */ ++ ++/* TODO: ext2_truncate() is much more careful than this routine. ++ (E.g. it checks for bh->b_count > 1, and checks for things changing ++ underneath it. It also calls bforget instead of brelse if it's ++ going to free it.) Why? Maybe we should copy it. */ ++ ++/* effic: Reduce the number of calls to ext2_free_block() the way ++ ext2_trunc_direct() does. */ ++ ++/* fixme: I think tht we do indeed need to check if buffers are held by ++ somebody else before freeing them. */ ++static int ext2_free_cluster_blocks(struct inode *inode, u32 block, ++ unsigned nb) ++{ ++ u32 tmp; ++ struct ext2_bkey key; ++ int err; ++ ++/* ++ * whitpa 04 Oct 2004: although it may be true that using e2compr in ++ * conjunction with quotas is a Bad Idea, having quotas enabled for other ++ * filesystems doesn't necessarily mean that the quota feature will actually be ++ * used in this one, so many people find the following assertion very annoying. ++ * I have therefore disabled it. ++ */ ++/* assert (!inode->i_sb->dq_op || (inode->i_flags & S_QUOTA)); */ ++ if (!nb) ++ return 0; ++ if (nb > EXT2_MAX_CLU_NBLOCKS) { ++ assert((int) nb >= 0); ++ assert(nb <= EXT2_MAX_CLU_NBLOCKS); ++ return -EDOM; ++ } ++ assert(((block + nb) & 3) == 0); ++ if (!ext2_get_key(&key, inode, block)) ++ return -EIO; ++ ++ while (nb-- > 0) { ++ tmp = ext2_get_key_blkaddr(&key); ++ err = ext2_set_key_blkaddr(&key, EXT2_COMPRESSED_BLKADDR); ++ if (err) ++ goto out; ++ if (tmp != 0) { ++ assert(tmp != EXT2_COMPRESSED_BLKADDR); ++#ifdef EXT2_COMPR_REPORT_ALLOC ++ printk(KERN_DEBUG "ext2: free %d = (%d) %d:%d:%d:%d : %d\n", ++ key.block, ++ key.level, ++ key.off[0], key.off[1], key.off[2], key.off[3], tmp); ++#endif ++ ext2_free_blocks(inode, tmp, 1); ++ } ++ if (!ext2_next_key(&key, 1)) ++ break; ++ } ++ err = 0; ++ out: ++ ext2_free_key(&key); ++ return err; ++} ++ ++#ifdef EXT2_COMPR_DEBUG ++static unsigned count_bits(unsigned char *p, unsigned nb) ++{ ++ u32 x = le32_to_cpu(*(u32 *) p); ++ unsigned n = 0; ++ ++ assert(nb <= 4); ++ if (nb != 4) ++ x &= (1 << (nb * 8)) - 1; ++ while (x) { ++ x &= (x - 1); ++ n++; ++ } ++ return n; ++} ++#endif ++ ++/* ++ * __remove_compr_assoc_queue is used in invalidate_inode_buffers ++ * replacement code for ext2_compress_cluster(). TLL 02/21/07 ++ * Yeah, it is duplicate code, but using it does not require ++ * patching fs/buffer.c/__remove_assoc_queue to export it. 
++ * The buffer's backing address_space's private_lock must be held. ++ */ ++/*static inline void __remove_compr_assoc_queue(struct buffer_head *bh) ++{ ++ list_del_init(&bh->b_assoc_buffers); ++}*/ ++ ++/* Compress one cluster. If the cluster uses fewer blocks once ++ compressed, it is stored in place of the original data. Unused ++ blocks are freed, and the cluster is marked as compressed. ++ ++ Returns a negative value on error, ++ 0 if the cluster does not compress well, ++ positive if it is compressed (whether it was already compressed ++ or whether we compressed it). ++ ++ Assume inode is writable. ++ ++ Called by : ++ ++ ext2_cleanup_compressed_inode () [i_sem] ++ ++ If ever we acquire new callers, make sure that quotas are ++ initialised, and COMPRBLK is handled correctly (i.e. such ++ that ioctl() can't change the cluster size on us), and that caller ++ tests for ext2_wa==NULL. ++*/ ++ ++int ext2_compress_cluster(struct inode *inode, u32 cluster) ++{ ++ struct buffer_head *bh[EXT2_MAX_CLUSTER_BLOCKS + 1]; ++ struct page *pg[EXT2_MAX_CLUSTER_PAGES]; ++ int s_nblk; /* Equals clu_nblocks less any trailing hole blocks. */ ++ unsigned u_nblk = (~(unsigned) 0), c_nblk; /* Number of blocks occupied by ++ un/compressed data. */ ++ int result, n, x; ++ int ulen, maxlen = 0, clen = 0; ++ unsigned char *dst; ++ u8 *src; ++ unsigned meth, alg; ++ int nbh = 0, npg, i; ++ unsigned char holemap_nbytes = 0; ++ unsigned last_hole_pos; ++ struct ext2_cluster_head *head; ++ unsigned r_nblk; ++ struct ext2_inode_info *ei = EXT2_I(inode); ++ unsigned long saved_isize; ++ //int dotrunc = 1; //mw ++ ++#ifdef CONFIG_HIGHMEM ++ int kmapped = 0; ++#endif ++ ++ /* impl: Otherwise, ioctl() could change the cluster size ++ beneath us. */ ++ /* TLL say not compressed and return -1 6-15-07 */ ++ if (!(ei->i_flags & EXT2_COMPRBLK_FL)) ++ return -1; ++ ++ //mw ++ saved_isize = inode->i_size; ++ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */ ++ assert(!mapping_mapped(inode->i_mapping)); ++ ++ npg = ext2_cluster_npages(inode, cluster); ++ ++ result = ext2_get_cluster_pages(inode, cluster, pg, NULL, 1); ++ if (result <= 0) ++ goto done; ++ ++#ifdef CONFIG_HIGHMEM ++ ext2_kmap_cluster_pages(NULL, pg, NULL); ++ kmapped = 1; ++#endif ++ ++ /* effic: We ought to use the page cache. Using the page ++ cache always costs extra CPU time, but saves I/O if the ++ page is present. We still need to detect holes, which ++ unfortunately may still cause I/O. Testing for all-zero ++ could save us that I/O. */ ++ ++ nbh = ext2_get_cluster_blocks(inode, cluster, bh, pg, NULL, 1); ++ ++ s_nblk = nbh; ++ ++#ifdef EXT2_COMPR_REPORT ++ { ++ int i; ++ trace_e2c(" > > > ext2_compress_cluster %d: inode=%ld, size=%d\n", ++ cluster, inode->i_ino, (int) inode->i_size); ++#ifdef EXT2_COMPR_REPORT_CPR ++ for (i = 0; i < s_nblk; i++) { ++ if (bh[i]) { ++ printk(KERN_DEBUG ++ "bbuffer_head[%d]: blocknr=%lu, addr=0x%p ", i, ++ (unsigned long) bh[i]->b_blocknr, bh[i]); ++ if (bh[i]->b_page) ++ printk(KERN_DEBUG "bgn:[page->index=%ld]\n", ++ bh[i]->b_page->index); ++ else ++ printk(KERN_DEBUG "[No page]\n"); ++ } else ++ printk("bbuffer_head[%d] is NULL\n", i); ++ } ++#endif ++ } ++#endif ++ /* ++ * Did somebody else compress the cluster while we were waiting ? ++ * This should never arise ... 
++ */ ++ result = ext2_cluster_is_compressed_fn(inode, cluster); ++ if (result != 0) { ++ if (result > 0) { ++ ext2_msg(inode->i_sb, ++ "ext2_compress_cluster", ++ "compressing compressed cluster"); ++ } ++ goto done; ++ } ++ ++ /* I moved it here in case we need to load a module that ++ * needs more heap that is currently allocated. ++ * In such case "init_module" for that algorithm forces ++ * re-allocation of ext2_wa. It should be safe here b/c the ++ * first reference to ext2_wa comes just after and we have ++ * locked ext2_wa before. ++ * ++ * I know that we may not need the compression at all ++ * (compressing 0 or 1 block) but it's better to sacrifice ++ * a bit than do make a total mess of this code. ++ * ++ * FIXME: Totally separate working areas for reading and writing. ++ * Jan R. ++ */ ++ ++ meth = ei->i_compr_method; ++ assert(meth < EXT2_N_METHODS); ++ alg = ext2_method_table[meth].alg; ++#ifdef CONFIG_KMOD ++ if (!ext2_algorithm_table[alg].avail) { ++ char str[32]; ++ ++ sprintf(str, "ext2-compr-%s", ext2_algorithm_table[alg].name); ++ request_module(str); ++ } ++#endif ++ ++ result = -EINTR; ++ ++ /* ++ * Try to get the working area. ++ */ ++#ifdef EXT2_COMPR_REPORT_WA ++ printk(KERN_DEBUG "pid %d enters critical region\n", current->pid); ++#endif ++ if (get_cpu_var(ext2_wr_wa) == NULL) ++ { ++ ext2_alloc_wr_wa(); ++ } ++ assert(__get_cpu_var(ext2_wr_wa) != NULL); ++ ++ ++ /* ++ * Now, we try to compress the cluster. If the cluster does ++ * not compress well, we just give up. Otherwise, we reuse ++ * the old blocks to store the compressed data (except that ++ * compressed data is contiguous in the file even if the ++ * uncompressed data had holes). ++ */ ++ ++ /* ++ * Compute the block bitmap, how many bytes of data we have ++ * in the cluster, and the maximum interesting length after ++ * compression. The bitmap will be used to reallocate blocks ++ * when decompressing the cluster, so that we don't create blocks ++ * that were previously missing. We also pack the buffers ++ * together. ++ */ ++ ++ head = (struct ext2_cluster_head *) __get_cpu_var(ext2_wr_wa)->c; ++#if EXT2_MAX_CLUSTER_BLOCKS > 32 ++# error "We need to zero more bits than this." ++#endif ++ *(u32 *) (&head[1]) = 0; ++ last_hole_pos = (unsigned) (-1); ++ assert(head->holemap[0] == 0); ++ assert(head->holemap[1] == 0); ++ assert(head->holemap[2] == 0); ++ assert(head->holemap[3] == 0); ++ assert(*(u32 *) head->holemap == 0); ++ assert(count_bits(head->holemap, 4) == 0); ++ ++ /* TODO: Check that i_size can't change beneath us. ++ do_truncate() is safe because it uses i_sem around changing ++ i_size. For the moment, I do a runtime check. */ ++ ++ saved_isize = inode->i_size; ++ ++#ifdef EXT2_COMPR_REPORT_VERBOSE ++ printk ++ ("00 ext2_compress_cluster[%u]: i_size=%u, s_blocksize_bits=%u, s_nblk=%u\n", ++ __LINE__, (unsigned) inode->i_size, inode->i_sb->s_blocksize_bits, ++ s_nblk); ++#endif ++// assert (ROUNDUP_RSHIFT(inode->i_size, inode->i_sb->s_blocksize_bits) ++// >= s_nblk); ++ /* This initial guess at ulen doesn't take holes into account ++ unless they're at end of cluster. We ,compensate for other ++ holes` during the loop below. 
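++ (For instance, if blocks 2 and 5 of the cluster turn out to be
++ holes, bits 2 and 5 of the holemap are set and ulen shrinks by
++ two block sizes in that loop.)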
*/ ++ ulen = MIN(s_nblk << inode->i_sb->s_blocksize_bits, ++ inode->i_size - ext2_cluster_offset(inode, cluster)); ++ r_nblk = (((ulen - 1) >> inode->i_sb->s_blocksize_bits) + 1); ++ if (r_nblk <= 1) { ++ /* MW: required to remove Z flag, otherwise compress ++ * is tried on each access */ ++ result = 0; ++ goto no_compress; ++ } ++ /* Verify if more than 1 block to compress in the cluster */ ++ nbh = 0; ++ for (x = 0; x < s_nblk; x++) { ++ if ((bh[x] != NULL) && (bh[x]->b_blocknr != 0)) { ++ nbh++; ++ } else { ++ last_hole_pos = x; ++ head->holemap[x >> 3] |= 1 << (x & 7); ++ ulen -= inode->i_sb->s_blocksize; ++ /* impl: We know that it's a whole block because ++ ext2_get_cluster_blocks trims s_nblk for trailing ++ NULL blocks, and partial blocks only come at ++ the end, so there can't be partial NULL blocks. */ ++ } ++ } ++ /* We don't try to compress cluster that only have one block ++ or no block at all. (When fragments are implemented, this code ++ should be changed.) */ ++ if (nbh <= 1) { ++ /* MW: required to remove Z flag, otherwise compress ++ * is tried on each access */ ++ goto no_compress; ++ } ++ ++ u_nblk = nbh; ++ /* Copy the data in the compression area */ ++ dst = __get_cpu_var(ext2_wr_wa)->u; ++ for (x = 0; x < s_nblk; x++) { ++ if ((bh[x] != NULL) && (bh[x]->b_blocknr != 0)) { ++ restore_b_data_himem(bh[x]); ++ memcpy(dst, bh[x]->b_data, bh[x]->b_size); ++ dst += bh[x]->b_size; ++ } ++ } ++ ++ assert(count_bits(head->holemap, 4) == s_nblk - u_nblk); ++ ++#if EXT2_GRAIN_SIZE != EXT2_MIN_BLOCK_SIZE ++# error "this code ought to be changed" ++#endif ++ ++ /* ,maxlen` is the maximum length that the compressed data can ++ be while still taking up fewer blocks on disk. */ ++ holemap_nbytes = (last_hole_pos >> 3) + 1; ++ /* impl: Remember that ,last_hole_pos` starts off as being -1, ++ so the high 3 bits of ,last_hole_pos >> 3` can be wrong. ++ This doesn't matter if holemap_nbytes discards the high ++ bits. */ ++ ++ assert(sizeof(holemap_nbytes) < sizeof(unsigned)); ++ assert((last_hole_pos == (unsigned) -1) ++ == (holemap_nbytes == 0)); ++ maxlen = ++ ((((r_nblk < ++ u_nblk) ? r_nblk : u_nblk) - 1) * inode->i_sb->s_blocksize - ++ sizeof(struct ext2_cluster_head) ++ - holemap_nbytes); ++ clen = 0; ++ /* Handling of EXT2_AUTO_METH at the moment is just that we ++ use the kernel default algorithm. I hope that in future ++ this can be extended to the kernel deciding when to ++ compress and what algorithm to use, based on available disk ++ space, CPU time, algorithms currently used by the fs, ++ etc. */ ++ if ((meth == EXT2_AUTO_METH) ++ || !ext2_algorithm_table[alg].avail) { ++ meth = EXT2_DEFAULT_COMPR_METHOD; ++ alg = ext2_method_table[meth].alg; ++ assert(ext2_algorithm_table[alg].avail); ++ } ++ if (alg == EXT2_NONE_ALG) ++ goto no_compress; ++ ++ clen = ext2_algorithm_table[alg].compress(__get_cpu_var(ext2_wr_wa)->u, ++ __get_cpu_var(ext2_wr_wa)->c + sizeof(struct ext2_cluster_head) + holemap_nbytes, ++ __get_cpu_var(ext2_wr_wa)->heap, ulen, maxlen, ext2_method_table[meth].xarg); ++ ++#ifdef EXT2_COMPR_REPORT_ALGORITHMS ++ printk(KERN_DEBUG "03 ext2: %lu: cluster %d+%d [%d] compressed " ++ "into %d bytes (ulen = %d, maxlen = %d)\n", ++ inode->i_ino, ++ ext2_cluster_offset(inode, cluster), ++ ext2_cluster_nblocks(inode, cluster), ++ u_nblk, clen, ulen, maxlen); ++#endif ++ ++ if ((clen == 0) || (clen > maxlen)) { ++ no_compress: ++ ++ /* this chunk didn't compress. 
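++ Reached when the compressor returned 0 (it gave up), when
++ clen > maxlen (the result would not take up fewer blocks on
++ disk), or via the goto paths above for clusters with at most
++ one real block or the EXT2_NONE_ALG algorithm.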
*/ ++ assert(inode->i_size == saved_isize); ++#ifdef EXT2_COMPR_REPORT_WA ++ printk(KERN_DEBUG ++ "pid %d leaves critical region, nbh=%d, u_nblk=%d, " ++ "inode->i_size=%lu, saved_isize=%lu, clen=%d, ulen=%d, maxlen=%d\n", ++ current->pid, nbh, u_nblk, ++ (long unsigned) inode->i_size, saved_isize, clen, ulen, ++ maxlen); ++#endif ++ ++ result = 0; ++ put_cpu_var(ext2_wr_wa); ++ goto done; ++ } ++ ++ ++#if EXT2_MAX_CLUSTER_BLOCKS > 32 ++# error "We need to zero more bits than this." ++#endif ++ assert(-1 <= (int) last_hole_pos); ++ assert((int) last_hole_pos < 32); ++ assert((le32_to_cpu(*(u32 *) head->holemap) ++ & (~0u << (1 + last_hole_pos)) ++ & (~(~0u << (8 * holemap_nbytes)))) ++ == 0); ++ /* Don't change "~0u << (1 + last_hole_pos)" to "~1u << last_hole_pos" ++ as I almost did, as last_hole_pos can be -1 and cannot be 32. */ ++ assert(count_bits(head->holemap, holemap_nbytes) == s_nblk - u_nblk); ++ ++ /* Compress the blocks at the beginning of the cluster */ ++ for (x = 0, nbh = 0; x < s_nblk; x++) { ++ if ((bh[x] != NULL) && (bh[x]->b_blocknr != 0)) { ++ if (nbh != x) { ++ restore_b_data_himem(bh[x]); ++ bh[nbh]->b_blocknr = bh[x]->b_blocknr; ++ set_bit(BH_Mapped, &bh[nbh]->b_state); ++ bh[x]->b_blocknr = 0; ++ assert(buffer_mapped(bh[x])); ++ clear_bit(BH_Mapped, &bh[x]->b_state); ++ } ++ nbh++; ++ } ++ } ++ assert(nbh == u_nblk); ++ assert(count_bits(head->holemap, holemap_nbytes) == s_nblk - u_nblk); ++ ++ /* ++ * Compression was successful, so add the header and copy to blocks. ++ */ ++ ++ /* Header. */ ++ { ++ head->magic = cpu_to_le16(EXT2_COMPRESS_MAGIC_04X); ++ head->method = meth; ++ head->holemap_nbytes = holemap_nbytes; ++ head->ulen = cpu_to_le32(ulen); ++ head->clen = cpu_to_le32(clen); ++ ++ barrier(); //mw: "barrier" tells compiler not to re-order resulting asm statments, somehow. ++ head->checksum = ++ cpu_to_le32(ext2_adler32 ++ (le32_to_cpu(*(u32 *) __get_cpu_var(ext2_wr_wa)->c), ++ __get_cpu_var(ext2_wr_wa)->c + 8, ++ (sizeof(struct ext2_cluster_head) - 8 + ++ head->holemap_nbytes + clen))); ++ } ++ ++ assert((le32_to_cpu(*(u32 *) head->holemap) ++ & (~0 << (1 + last_hole_pos)) ++ & ((1 << (8 * holemap_nbytes)) - 1)) == 0); ++ result = clen += sizeof(struct ext2_cluster_head) + holemap_nbytes; ++ c_nblk = ROUNDUP_RSHIFT(clen, inode->i_sb->s_blocksize_bits); ++ ++ /* Release unneeded buffer heads. (Freeing is done later, ++ after unlocking ext2_wr_wa.) */ ++ assert(nbh == u_nblk); ++ nbh = c_nblk; ++ ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c("ext2_compress_cluster: head->clen=%d, clen=%d\n", head->clen, clen); ++#endif ++ src = __get_cpu_var(ext2_wr_wa)->c; ++ ++ for (n = 0; (int) clen > 0; n++) { ++ restore_b_data_himem(bh[n]); ++ if (clen >= inode->i_sb->s_blocksize) { ++ memcpy(bh[n]->b_data, src, inode->i_sb->s_blocksize); ++ } else { ++ memcpy(bh[n]->b_data, src, clen); ++ } ++ ++ /* TO_DO: OSYNC. means: write opertions are blocking until the ++ * the pages are written from page cache to disk */ ++ ++ set_buffer_uptodate(bh[n]); ++ set_buffer_dirty(bh[n]); ++ src += inode->i_sb->s_blocksize; ++ clen -= inode->i_sb->s_blocksize; ++ } ++ ++ i = 0; ++ assert(n == c_nblk); ++ assert((le32_to_cpu(*(u32 *) head->holemap) ++ & (~0 << (1 + last_hole_pos)) ++ & ((1 << (8 * holemap_nbytes)) - 1)) == 0); ++ ++ /* Runtime check that no-one can change i_size while i_sem is down. ++ (See where saved_isize is set, above.) 
*/ ++ assert(inode->i_size == saved_isize); ++ assert(!mapping_mapped(inode->i_mapping)); ++ ++ /* Free the remaining blocks, and shuffle used blocks to start ++ of cluster in blkaddr array. */ ++ { ++ u32 free_ix, curr; ++ int err; ++ ++ /* Calculate free_ix. There should be ,c_nblk` ++ non-hole blocks among the first ,free_ix` ++ blocks. */ ++ { ++ assert((le32_to_cpu(*(u32 *) head->holemap) ++ & (~0 << (1 + last_hole_pos)) ++ & ((1 << (8 * holemap_nbytes)) - 1)) == 0); ++ assert(n == c_nblk); ++ for (free_ix = 0; ++ ((int) free_ix <= (int) last_hole_pos) && (n > 0); ++ free_ix++) ++ if (!(head->holemap[free_ix >> 3] ++ & (1 << (free_ix & 7)))) ++ n--; ++ free_ix += n; ++ ++ if ((free_ix < c_nblk) ++ || (free_ix + u_nblk > s_nblk + c_nblk) ++ || (free_ix >= ext2_cluster_nblocks(inode, cluster)) ++ || ((holemap_nbytes == 0) && (c_nblk != free_ix))) { ++ assert(free_ix >= c_nblk); ++ /*assert (free_ix - c_nblk <= s_nblk - u_nblk); */ ++ assert(free_ix + u_nblk <= s_nblk + c_nblk); ++ assert(free_ix < ext2_cluster_nblocks(inode, cluster)); ++ assert((holemap_nbytes != 0) || (c_nblk == free_ix)); ++ assert(1 <= c_nblk); ++ assert(c_nblk < u_nblk); ++ assert(u_nblk <= s_nblk); ++ assert(s_nblk <= ext2_cluster_nblocks(inode, cluster)); ++ assert(ext2_cluster_nblocks(inode, cluster) <= ++ EXT2_MAX_CLU_NBLOCKS); ++ ext2_error(inode->i_sb, "ext2_compress_cluster", ++ "re assertions: c=%d, u=%d, f=%d, s=%d, n=%d, " ++ "lhp=%d, hm=%x, hnb=%d, " "ino=%lu, clu=%u", ++ (int) c_nblk, (int) u_nblk, (int) free_ix, ++ (int) s_nblk, (int) ext2_cluster_nblocks(inode, ++ cluster), ++ (int) last_hole_pos, ++ (unsigned) le32_to_cpu(*(u32 *) head->holemap), ++ (int) holemap_nbytes, inode->i_ino, cluster); ++ } ++ } ++ ++ /*mw: put here: set all __get_cpu related pointers to NULL ++ as they become invalid with put_cpu */ ++ head = NULL; /* prevent any more stupid bugs */ ++ src = NULL; ++ dst = NULL; ++ put_cpu_var(ext2_wr_wa); ++ ++#ifdef EXT2_COMPR_DEBUG ++ /* TODO: remove this TEST */ ++ /* mw: ext2_free_cluster_blocks can sleep: check we are not atomic */ ++ schedule(); ++#endif ++ ++ /* Free unneeded blocks, and mark cluster as ++ compressed. */ ++ err = ext2_free_cluster_blocks ++ (inode, ++ ext2_cluster_block0(inode, cluster) + free_ix, ++ ext2_cluster_nblocks(inode, cluster) - free_ix); ++ /* pjm 1998-06-15: This should help reduce fragmentation. ++ Actually, we could set block to clu_block0 + clu_nbytes, ++ and goal to the last allocated blkaddr in the compressed ++ cluster. ++ It would be nice if we would transfer the freed blocks ++ to preallocation, while we're at it. */ ++// write_lock(&ei->i_meta_lock); ++ /* mw: i_next_alloc_goal and i_next_alloc_block were removed in 2.6.24.x ++ * so we dont need to set them to 0 (they are anyway, somehow). ++ */ ++ //ei->i_next_alloc_goal = ei->i_next_alloc_block = 0; ++// write_unlock(&ei->i_meta_lock); ++ if (err < 0) { ++ goto done; ++ } ++ /* Note that ext2_free_cluster_blocks() marks the ++ cluster as compressed. */ ++ ++ /* Shuffle used blocks to beginning of block-number array. */ ++ { ++ struct ext2_bkey key; ++ unsigned i; ++ ++ if (!ext2_get_key(&key, ++ inode, ++ ext2_cluster_block0(inode, cluster))) { ++ ei->i_flags |= EXT2_ECOMPR_FL; ++ result = -EIO; ++ free_ix = 0; ++ } ++ for (i = 0; i < free_ix; i++) { ++ curr = ext2_get_key_blkaddr(&key); ++ ++ if ((c_nblk == free_ix) ++ && (curr != bh[i]->b_blocknr)) { ++ /* "Can't happen", yet has ++ happened a couple of times. 
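++ (The check only applies when c_nblk == free_ix, i.e. there are
++ no holes among the first c_nblk blocks; the key walk should then
++ return exactly the block numbers kept in bh[], and any mismatch
++ is reported via ext2_error() below.)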
*/ ++ ext2_error(inode->i_sb, "ext2_compress_cluster", ++ "c_nblk=free_ix=%d, " ++ "curr=%u, b_blocknr=%lu, " ++ "lhp=%d , hm=, " ++ "ino=%lu, blk=%u", ++ c_nblk, curr, ++ (unsigned long) bh[i]->b_blocknr, ++ (int) last_hole_pos, ++ /*mw: became invalid due put_cpu: ++ (unsigned) le32_to_cpu(*(u32 *) head-> ++ holemap),*/ ++ inode->i_ino, ++ (unsigned) ++ ext2_cluster_block0(inode, cluster) + i); ++ } ++ err = ext2_set_key_blkaddr(&key, ++ (i < c_nblk ++ ? bh[i]->b_blocknr ++ : EXT2_COMPRESSED_BLKADDR)); ++ if (err) ++ break; ++ if (!ext2_next_key(&key, 1)) { ++ ei->i_flags |= EXT2_ECOMPR_FL; /* sorry... */ ++ result = -EIO; ++ break; ++ } ++ } ++ ext2_free_key(&key); ++ } ++ } ++ ++ /* ++ * Unlock the working area. ++ */ ++ ++#ifdef EXT2_COMPR_REPORT_WA ++ printk(KERN_DEBUG "pid %d leaves critical region\n", current->pid); ++#endif ++ ++ assert(c_nblk < u_nblk); ++ ext2_mark_algorithm_use(inode, alg); ++ ++ /* TLL update b_assoc_map per 2.6.20 6-07-07 */ ++ for (i = 0; i < c_nblk; i++) ++ if (bh[i] != NULL) { ++ bh[i]->b_assoc_map = inode->i_mapping; ++ bh[i]->b_page->mapping = inode->i_mapping; //Andreas 5-24-07 : necessary? WRONG? ++ } ++ //mw: we must force the writeback, otherwise ext2_readpage will get confused ++ // yaboo ding had similiar code above. but I think it makes more sense after ++ // the block shuffeling. ++ // Note: generic_oysnc_inode() made trouble with USB-Sticks and caused a lot ++ // of IO, stalled system ... therefore ll_rw_block() replace it. Anyway we already operate ++ // with this low-level function. ++ ++ /*mw: new "hole" fix. hole == bdev bug! */ ++ for (i = 0; i < c_nblk; i++) { ++ ++ /* this was a hole (uncompressed) ++ * at the beginning of the cluster. ++ * so NO block was yet associated with it. ++ * But now we need it, because a compressed ++ * cluster always starts at the cluster.*/ ++ if (!buffer_mapped(bh[i]) || bh[i]->b_bdev == NULL) { ++ u32 block = ext2_cluster_block0(inode, cluster); ++ ext2_get_block(inode, block + i, bh[i], 1); ++ //printk("ext2_get_block Block:%lu, Mapped:%i, Page:%lu, bdev: %#x\n", bh[i]->b_blocknr, (bh[i]->b_state & BH_Mapped), (bh[i]->b_page ? bh[i]->b_page->index : 0), bh[i]->b_bdev ); ++ } ++ assert(buffer_mapped(bh[i])); ++ assert(bh[i]->b_bdev != NULL); ++ assert(bh[i]->b_bdev == inode->i_sb->s_bdev); ++ } ++ ++ ll_rw_block(WRITE, c_nblk, bh); ++ ++ CHECK_NOT_ATOMIC ++ //mw: seems we have to wait here, otherwise: crash! ++ for (i = 0; i < c_nblk; i++) { ++ if (bh[i]) ++ wait_on_buffer(bh[i]); ++ //printk("written compressed block: Block:%lu, Mapped:%i, Page:%lu, bdev: %#x\n", bh[i]->b_blocknr, (bh[i]->b_state & BH_Mapped), (bh[i]->b_page ? bh[i]->b_page->index : 0), bh[i]->b_bdev ); ++ } ++ ++ ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(NULL, pg, NULL); ++#endif ++ ++ inode->i_ctime = CURRENT_TIME; //mw: these two come always together. So I also put it here. ++ mark_inode_dirty_sync(inode); ++ ++ //ext2_update_inode(inode, inode_needs_sync(inode)); //mw: might be able to fix pipe_write vs. readpage. mutex-rec-locking ++ ++ /* COMPRBLK is already high, so no need to raise it. 
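++ The block below resets the now-unused buffer heads (from c_nblk
++ onwards) and then unlocks and releases the cluster pages taken
++ at the start of this function.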
*/ ++ { ++ for (i = c_nblk; (i < EXT2_MAX_CLUSTER_BLOCKS) && (bh[i] != NULL); ++ i++) { ++ clear_buffer_dirty(bh[i]); ++ bh[i]->b_blocknr = 0; ++ clear_bit(BH_Mapped, &bh[i]->b_state); ++ clear_bit(BH_Uptodate, &bh[i]->b_state); ++ } ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (pg[i] == NULL) { ++ break; ++ } ++ assert(PageLocked(pg[i])); ++ ClearPageUptodate(pg[i]); ++ unlock_page(pg[i]); ++ page_cache_release(pg[i]); ++ } ++ ++ /* invalidate_inode_buffers replacement code: TLL 02/21/07 ++ * e2compr on post 2.6.10 kernels do not have an uptodate ++ * mapping->assoc_mapping (other Vm(?) changes require it be ++ * made explicit, 2.4 kernels have it implicit). Therefore, when ++ * umount is called, a GPF ensues from a NULL ops pointer. ++ * e2c on a USB thumbdrive mounted as the root fs does not ++ * support repeated compress/uncompress cycles on a given file. ++ * Inlined the flush list code to explicityly force update to ++ * disk with a known valid bh list. ++ */ ++ ++ /* mw: I consider this code as ... not so good! */ ++ /* ++ if (inode_has_buffers(inode)) { ++ //struct address_space *mapping = &inode->i_data; ++ // struct address_space *buffer_mapping = mapping->assoc_mapping; ++ // requires: inode->i_data->mapping->assoc_mapping; to be set ++ invalidate_inode_buffers(inode); // TLL do it proper 5-25-07 ++ //if (dotrunc) ++ //ext2_truncate(inode); // TLL file size hack 6-19-07 ++ } ++ */ ++ ++ } ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c(" < < < ext2_compress_cluster %i: [done cpr] inode=%ld\n", cluster, inode->i_ino); ++#endif ++ return result; ++ ++ ++ done: ++ ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(NULL, pg, NULL); ++#endif ++ ++ { ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (pg[i] == NULL) { ++ break; ++ } ++ unlock_page(pg[i]); ++ page_cache_release(pg[i]); ++ } ++ /* TLL cp to compr dir bug fix 03-25-07 ++ Truncate uncompressed files to their uncompressed ++ length, i.e. force kernel to update inode and sb */ ++ //if(dotrunc) ++ //26.08.2011: ext2_truncate(inode) does not exist anymore ++ ext2_truncate_blocks(inode, inode->i_size); ++ ++ } ++#ifdef EXT2_COMPR_REPORT_VERBOSE ++ { ++ int i; ++ ++ printk(KERN_DEBUG "ext2_compress_cluster[end]: buffers kept for cluster=%d\n", cluster); ++ for (i = 0; i < nbh; i++) { ++ if (bh[i]) { ++ printk(KERN_DEBUG "2buffer_head[%d]: blocknr=%lu, addr=0x%p ", i, (unsigned long) bh[i]->b_blocknr, bh[i]); ++ if (bh[i]->b_page) ++ printk(KERN_DEBUG "2:[page->index=%ld]\n", bh[i]->b_page->index); ++ else ++ printk(KERN_DEBUG "[No page]\n"); ++ } else ++ printk(KERN_DEBUG "buffer_head[%d] is NULL\n", i); ++ } ++ } ++#endif ++ ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c(" < < < ext2_compress_cluster %i: [done NO cpr] inode=%ld\n", cluster, inode->i_ino); ++#endif ++ return result; ++} ++ ++ ++/* Go through all the clusters and compress them if not already ++ compressed. ++ ++ This is called by ext2_put_inode() and ext2_release_file(). Later, ++ we may have ext2_ioctl() call it (when EXT2_COMPR_FL rises). None ++ of the callers does any locking, so we do it here. ++ ++ Neither of the current callers uses the return code, but we get ready ++ for if we start using it. ++ ++ Returns 0 on "success" (whether or not we cleared EXT2_CLEANUP_FL ++ or EXT2_DIRTY_FL bits), -errno on error. 
*/
++int ext2_cleanup_compressed_inode(struct inode *inode)
++{
++ u32 cluster;
++ u32 n_clusters;
++ int dirty = 0;
++ int err = 0;
++ u32 comprblk_mask;
++ atomic_t start_i_count = inode->i_count;
++ int retry = 0;
++ int have_downed;
++ struct ext2_inode_info *ei = EXT2_I(inode);
++#ifdef EXT2_COMPR_REPORT
++ char bdn[BDEVNAME_SIZE];
++#endif
++
++ /* impl: Actually, this assertion could fail if the kernel
++ isn't locked. I haven't looked, but I suppose that the
++ kernel always is locked when this is called. */
++ assert(ei->i_compr_flags & EXT2_CLEANUP_FL);
++
++#ifdef EXT2_COMPR_REPORT_PUT
++ printk(KERN_DEBUG "ext2_cleanup_compressed_inode() called for pid %d; "
++ "dev=%s, ino=%lu, i_state=0x%lx, i_count=%u\n",
++ current->pid, bdevname(inode->i_sb->s_bdev, bdn), inode->i_ino,
++ inode->i_state, atomic_read(&inode->i_count));
++#endif
++
++ /* Do these tests twice: once before down() and once after. */
++ for (have_downed = 0;; have_downed++) {
++ if ((ei->i_flags & (EXT2_COMPR_FL | EXT2_DIRTY_FL))
++ != (EXT2_COMPR_FL | EXT2_DIRTY_FL)) {
++ if (have_downed)
++ goto out;
++ /* TLL 5-25-07 changed from a warning to trace */
++ /*trace_e2c("ext2_cleanup_compressed_inode: trying to un/compress an "
++ "uncompressable file.\n"
++ "i_flags=%#x. (dev=%s, ino=%lu, down=%d)\n",
++ ei->i_flags, bdevname(inode->i_sb->s_bdev, bdn),
++ inode->i_ino, have_downed); */
++ return 0;
++ }
++
++ /* test if file is mapped by mmap */
++ if (mapping_mapped(inode->i_mapping))
++ {
++ //trace_e2c("ext2_cleanup_compressed_inode: (dev. %s): ino=%ld: file mapped, does not compress cluster\n", bdevname(inode->i_sb->s_bdev, bdn), inode->i_ino);
++ if (have_downed)
++ goto out;
++ else
++ return 0;
++ }
++
++ if (IS_RDONLY(inode)
++ || (ei->i_flags & EXT2_ECOMPR_FL)) {
++ ei->i_compr_flags &= ~EXT2_CLEANUP_FL;
++ if (have_downed)
++ goto out;
++ else
++ return 0;
++ }
++
++ //mw
++ if (ext2_get_dcount(inode) > 1) {
++ err = 0;
++ //printk("Compress: file busy (dcount: %i>1)\n", ext2_get_dcount(inode));
++ if (have_downed)
++ goto out;
++ else
++ return 0;
++ }
++
++ if (have_downed)
++ break;
++
++ /* Quotas aren't otherwise kept if file is opened O_RDONLY. */
++ dquot_initialize(inode);
++
++ /* Check whether OSYNC of the inode is actually running */
++ //if (ei->i_compr_flags & EXT2_OSYNC_INODE)
++ //printk(KERN_DEBUG "OSYNC!\n");
++
++ /* I think:
++ * checking these flags should prevent one process from acquiring the mutex again,
++ * e.g. in a recursive call.
++ * BUT: what actually happens: two processes are working on this inode: pdflush and the user program.
++ * SO: the check might only be correct if ei->i_compr_flags & EXT2_OSYNC_INODE holds AND the same process already owns this lock!!!
++ */
++ //if (!(ei->i_compr_flags & EXT2_OSYNC_INODE))
++ //{
++ mutex_lock(&inode->i_mutex);
++#ifdef EXT2_COMPR_REPORT_MUTEX
++ printk(KERN_DEBUG "CLEANUP_LOCK of PID %u @ inode:%lu\n", current->pid, inode->i_ino);
++#endif
++ //}
++ }
++ n_clusters = ext2_n_clusters(inode);
++
++#ifdef EXT2_COMPR_REPORT_PUT
++ printk(KERN_DEBUG "ext2: inode:%lu: put compressed, clusters = %d, flags = %x, pid = %u\n",
++ inode->i_ino, n_clusters, ei->i_flags, current->pid);
++#endif
++
++ assert(atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */
++
++ /* Try to compress the clusters. We clear EXT2_DIRTY_FL only
++ if we looked at every cluster and if there was no error. */
++
++ /* impl: We raise EXT2_COMPRBLK_FL now so that ext2_ioctl()
++ doesn't try to change the cluster size beneath us.
If need ++ be, we restore the bit to its original setting before ++ returning. Note that no-one else can _change_ ++ EXT2_COMPRBLK_FL while we work because i_sem is down. */ ++ /* impl: Note what's happening here with comprblk_mask. The ++ current state of COMPRBLK_FL (before we start) is that ++ (comprblk == 1) || (no compressed clusters). At the end of ++ the procedure, comprblk == one if (at least one compressed ++ cluster, or an error occurred preventing us from finding ++ out). */ ++ comprblk_mask = ~EXT2_COMPRBLK_FL | ei->i_flags; ++ ei->i_flags |= EXT2_COMPRBLK_FL; ++ ++ for (cluster = 0; cluster < n_clusters; cluster++) { ++ if (atomic_read(&inode->i_count) > atomic_read(&start_i_count)) { ++ /* This is a poor way of doing this (and doubly ++ poor now that the only users of i_count are ++ the dentries), but the idea is not to ++ compress things tht are likely to be ++ decompressed soon. I guess a better way of ++ doing this would be just to make sure tht ++ the stuff is in the page cache. */ ++ retry = 1; ++ break; ++ } ++ err = ext2_cluster_is_compressed_fn(inode, cluster); ++ if (err == 0) { ++ //mw: ext2_compress_cluster might clean EXT2_COMPRBLK_FL, therefore raise it for every new cluster ++ ei->i_flags |= EXT2_COMPRBLK_FL; ++ ++ err = ext2_compress_cluster(inode, cluster); ++ if (err < 0) ++ dirty = 1; ++ else if (err > 0) ++ comprblk_mask = ~0ul; ++ } else if (err < 0) ++ break; ++ else { ++ err = 0; ++ assert(comprblk_mask == ~0ul); /* i.e. that EXT2_COMPRBLK_FL was high. */ ++ } ++ } ++ ++ if ((cluster >= n_clusters) && !dirty) ++ ei->i_flags &= ~EXT2_DIRTY_FL; ++ if (!retry) { ++ ei->i_compr_flags &= ~EXT2_CLEANUP_FL; ++ ei->i_flags &= comprblk_mask; ++ } ++ ++ /* We clear EXT2_CLEANUP_FL because, otherwise, we'll get ++ called again almost immediately. */ ++ ++ /* ++ * The CLEANUP flag *MUST* be cleared, otherwise the iput routine ++ * calls ext2_put_inode() again (because i_dirt is set) and there ++ * is a loop. The control scheme (CLEANUP + DIRTY flags) could ++ * probably be improved. On the other hand, i_dirt MUST be set ++ * because we may have sleeped, and we must force the iput routine ++ * to look again at the i_count ... ++ */ ++ /* TODO: Have a look at this cleanup scheme. The above ++ comment sounds wrong. */ ++ ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty_sync(inode); ++ out: ++ ++#ifdef EXT2_COMPR_REPORT_MUTEX ++ printk(KERN_DEBUG "CLEANUP_UNLOCK of PID %u @ inode:%lu\n", current->pid, inode->i_ino); ++#endif ++ ++// if (!(ei->i_compr_flags & EXT2_OSYNC_INODE)) { /* MW 5-16-07 */ ++ mutex_unlock(&inode->i_mutex); ++// } /* MW 5-16-07 */ ++ return err; /* TODO: Check that ,err` is appropriate. */ ++} ++ ++ ++int ext2_recognize_compressed(struct inode *inode, unsigned cluster) ++{ ++ /* ext2_recognize_compressed(): Check tht the cluster is valid ++ in every way, and then do the EXT2_COMPRESSED_BLKADDR ++ thing. */ ++ /* nyi, fixme. All of the userspace stuff (EXT2_NOCOMPR_FL ++ etc.) needs work, so I might as well leave this. See ++ ioctl.c for a description of what it's supposed to do. */ ++ return -ENOSYS; ++} ++ ++ ++/* Look for compressed clusters. If none, then clear EXT2_COMPRBLK_FL. ++ ++ Called by: ++ ext2_truncate(). 
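++ The scan below walks the block keys one cluster at a time: the
++ first EXT2_COMPRESSED_BLKADDR found leaves the flag alone;
++ otherwise the flag is cleared once the last block has been passed.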
++ */ ++void ext2_update_comprblk(struct inode *inode) ++{ ++ unsigned block, last_block; ++ struct ext2_bkey key; ++ struct ext2_inode_info *ei = EXT2_I(inode); ++ ++ assert(ei->i_flags & EXT2_COMPRBLK_FL); ++ if (inode->i_size == 0) { ++ ei->i_flags &= ~EXT2_COMPRBLK_FL; ++ trace_e2c("ext2_update_comprblk 1: inode: %lu removed EXT2_COMPRBLK_FL!\n", inode->i_ino); ++ return; ++ } ++ last_block = ROUNDUP_RSHIFT(inode->i_size, ++ inode->i_sb->s_blocksize_bits) - 1; ++ block = ext2_first_cluster_nblocks(inode) - 1; ++ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); ++ ++ if (!ext2_get_key(&key, inode, block)) ++ return; ++ for (;;) { ++ if (ext2_get_key_blkaddr(&key) == EXT2_COMPRESSED_BLKADDR) ++ goto out; ++ if (block >= last_block) ++ goto clear; ++ if (!ext2_next_key(&key, ei->i_clu_nblocks)) ++ goto out; ++ block += ei->i_clu_nblocks; ++ } ++ clear: ++ trace_e2c("ext2_update_comprblk 2: inode: %lu removed EXT2_COMPRBLK_FL!\n", inode->i_ino); ++ ei->i_flags &= ~EXT2_COMPRBLK_FL; ++ out: ++ ext2_free_key(&key); ++ assert(atomic_read(&inode->i_mutex.count) <= 0); ++ ++} ++ ++ ++/* ++ * allocate working areas ++ */ ++ ++DEFINE_PER_CPU(struct ext2_wa_S *, ext2_rd_wa) = NULL; ++DEFINE_PER_CPU(struct ext2_wa_S *, ext2_wr_wa) = NULL; ++ ++/* SMP, setup wa's. caller must hold wa already via get_cpu_var */ ++void ext2_alloc_rd_wa(){ ++ if ((__get_cpu_var(ext2_rd_wa) == NULL) ) { ++ size_t rsize = 2 * EXT2_MAX_CLUSTER_BYTES; //mw: just guessing ++ ++ __get_cpu_var(ext2_rd_wa) = vmalloc (rsize); ++ if (__get_cpu_var(ext2_rd_wa) == NULL) ++ printk ("EXT2-fs: can't allocate working area; compression turned off.\n"); ++ else { ++ printk ("ext2-compression: allocated read buffer for CPU%i at %p-%p (%zu bytes)\n", ++ get_cpu(), __get_cpu_var(ext2_rd_wa), (char *)__get_cpu_var(ext2_rd_wa) + rsize, rsize); ++# ifdef EXT2_COMPR_REPORT_WA ++ printk (KERN_INFO "EXT2-fs: rd_wa=%p--%p (%d)\n", ++ ext2_rd_wa, (char *)ext2_rd_wa + rsize, rsize); ++# endif ++ put_cpu(); ++ } ++ } ++} ++ ++void ext2_alloc_wr_wa(){ ++ ++ if ((__get_cpu_var(ext2_wr_wa) == NULL) ) { ++ size_t wsize = 2 * EXT2_MAX_CLUSTER_BYTES; //mw: just guessing ++ __get_cpu_var(ext2_wr_wa) = vmalloc (wsize); ++ ++ if (__get_cpu_var(ext2_wr_wa) == NULL) ++ printk ("EXT2-fs: can't allocate working area; " ++ "compression turned off.\n"); ++ else { ++ printk ("ext2-compression: allocated write buffer for CPU%i at %p-%p (%zu bytes)\n", ++ get_cpu(), __get_cpu_var(ext2_wr_wa), (char *)__get_cpu_var(ext2_wr_wa) + wsize, wsize); ++#ifdef EXT2_COMPR_REPORT_WA ++ printk (KERN_INFO "EXT2-fs: wr_wa=%p--%p (%d)\n", ++ ext2_wr_wa, (char *)ext2_wr_wa + wsize, wsize); ++#endif ++ put_cpu(); ++ } ++ } ++} ++ ++ +--- linux-3.2-rc5/fs/ext2/e2zlib.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/ext2/e2zlib.c 2011-12-13 14:22:47.841975843 +0100 +@@ -0,0 +1,74 @@ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static DEFINE_PER_CPU(struct crypto_comp *, tfm) = NULL; ++ ++size_t ext2_iZLIB(int action) ++{ ++ /*mw: we init tfm when we need it...*/ ++ return 0; ++} ++ ++ ++size_t ext2_wZLIB(__u8 * ibuf, __u8 * obuf, void *heap, ++ size_t ilen, size_t olen, int level) ++{ ++ int ret, dlen; ++ ++ if (!try_module_get(THIS_MODULE)) ++ return 0; ++ ++ /*check if we already have a tfm*/ ++ get_cpu_var(tfm); ++ if (__get_cpu_var(tfm) == NULL){ ++ __get_cpu_var(tfm) = crypto_alloc_comp("deflate", 0, CRYPTO_ALG_ASYNC); ++ } ++ assert(__get_cpu_var(tfm) != NULL); ++ ++ dlen = olen; ++ ret = 
crypto_comp_compress(__get_cpu_var(tfm) , ibuf, ilen, obuf, &dlen); ++ ++ put_cpu_var(tfm); ++ ++ if (ret) { ++ //printk(KERN_DEBUG "ext2_wZLIB: crypto_comp_compress failed: %d, ilen: %d, olen: %d\n", ret, ilen, olen); ++ return 0; ++ } ++ return dlen; ++} ++ ++ ++size_t ext2_rZLIB(__u8 * ibuf, __u8 * obuf, void *heap, ++ size_t ilen, size_t olen, int ignored) ++{ ++ int ret, dlen; ++ ++ if (!try_module_get(THIS_MODULE)) ++ return 0; ++ ++ /*check if we already have a tfm*/ ++ get_cpu_var(tfm); ++ if (__get_cpu_var(tfm) == NULL){ ++ __get_cpu_var(tfm) = crypto_alloc_comp("deflate", 0, CRYPTO_ALG_ASYNC); ++ } ++ assert(__get_cpu_var(tfm) != NULL); ++ ++ dlen = olen; ++ ret = crypto_comp_decompress(__get_cpu_var(tfm), ibuf, ilen, obuf, &dlen); ++ ++ put_cpu_var(tfm); ++ ++ if (ret) { ++ //printk(KERN_DEBUG "ext2_wZLIB: crypto_comp_decompress failed: %d, ilen: %d, olen: %d\n", ret, ilen, olen); ++ return 0; ++ } ++ ++ return dlen; ++} +--- linux-3.2-rc5/fs/ext2/adler32.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/ext2/adler32.c 2011-12-13 14:22:47.841975844 +0100 +@@ -0,0 +1,43 @@ ++/* adler32.c -- compute the Adler-32 checksum of a data stream ++ * Copyright (C) 1995-1998 Mark Adler ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++/* @(#) $Id: e2compr2.6.25.patch,v 1.1.2.1 2008/04/17 09:49:32 winkler Exp $ */ ++ ++#define BASE 65521L /* largest prime smaller than 65536 */ ++#define NMAX 5552 ++/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ ++ ++#define DO1(buf,i) {s1 += buf[i]; s2 += s1;} ++#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1); ++#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2); ++#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); ++#define DO16(buf) DO8(buf,0); DO8(buf,8); ++ ++/* ========================================================================= */ ++unsigned long ext2_adler32(unsigned long adler, const unsigned char *buf, unsigned int len) ++{ ++ unsigned long s1 = adler & 0xffff; ++ unsigned long s2 = (adler >> 16) & 0xffff; ++ int k; ++ ++ if (buf == 0) return 1L; ++ ++ while (len > 0) { ++ k = len < NMAX ? 
len : NMAX; ++ len -= k; ++ while (k >= 16) { ++ DO16(buf); ++ buf += 16; ++ k -= 16; ++ } ++ if (k != 0) do { ++ s1 += *buf++; ++ s2 += s1; ++ } while (--k); ++ s1 %= BASE; ++ s2 %= BASE; ++ } ++ return (s2 << 16) | s1; ++} +--- linux-3.2-rc5/fs/ext2/super.c 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/ext2/super.c 2011-12-13 14:22:47.843975906 +0100 +@@ -32,7 +32,12 @@ + #include + #include + #include ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#include ++#else + #include "ext2.h" ++#endif + #include "xattr.h" + #include "acl.h" + #include "xip.h" +@@ -393,7 +398,11 @@ enum { + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, + Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, + Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, +- Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, ++ Opt_acl, Opt_noacl, ++#ifdef CONFIG_EXT2_COMPRESS ++ Opt_force_compat, ++#endif ++ Opt_xip, Opt_ignore, Opt_err, Opt_quota, + Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation + }; + +@@ -426,6 +435,9 @@ static const match_table_t tokens = { + {Opt_ignore, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, ++#ifdef CONFIG_EXT2_COMPRESS ++ {Opt_force_compat, "force-compat"}, ++#endif + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, + {Opt_err, NULL} +@@ -569,6 +581,11 @@ static int parse_options(char *options, + clear_opt(sbi->s_mount_opt, RESERVATION); + ext2_msg(sb, KERN_INFO, "reservations OFF"); + break; ++#ifdef CONFIG_EXT2_COMPRESS ++ case Opt_force_compat: ++ set_opt(sbi->s_mount_opt, FORCE_COMPAT); ++ break; ++#endif + case Opt_ignore: + break; + default: +@@ -585,6 +602,10 @@ static int ext2_setup_super (struct supe + int res = 0; + struct ext2_sb_info *sbi = EXT2_SB(sb); + ++#ifdef CONFIG_EXT2_COMPRESS ++ printk (KERN_INFO E2COMPR_VERSION "\n"); ++#endif ++ + if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) { + ext2_msg(sb, KERN_ERR, + "error: revision level too high, " +@@ -876,6 +897,65 @@ static int ext2_fill_super(struct super_ + le32_to_cpu(features)); + goto failed_mount; + } ++#ifdef CONFIG_EXT2_COMPRESS ++ /* Check that required algorithms are available. */ ++ /* todo: Provide a mount option to override this. */ ++ /* ++ * Philosophical bug: we assume that an algorithm's ++ * module is available if and only if this kernel was ++ * compiled with that algorithm as a module. This is ++ * untrue, but it is unclear what the right thing to ++ * do is. ++ */ ++ j = 0; /* error flag */ ++ if ((es->s_feature_incompat ++ & cpu_to_le32(EXT2_FEATURE_INCOMPAT_COMPRESSION)) ++ && (es->s_algorithm_usage_bitmap ++ & ~cpu_to_le32(EXT2_ALGORITHMS_SUPP))) { ++ /* ++ * The filesystem employs an algorithm not ++ * supported by this filesystem. Issue warning or ++ * error. ++ */ ++ for (i = 0; i < 32; i++) { ++ if (!(es->s_algorithm_usage_bitmap ++ & cpu_to_le32(1 << i)) ++ || ((EXT2_ALGORITHMS_SUPP ++ & (1 << i)))) ++ continue; ++ /* ++ * TODO: Can't this message be moved outside ++ * of the for loop? 
++ */ ++ if (!j) { ++ if (test_opt(sb, FORCE_COMPAT)) ++ printk(KERN_WARNING ++ "EXT2-fs: %s: " ++ "uses unsupported " ++ "compression algorithms", ++ sb->s_id); ++ else ++ printk("EXT2-fs: %s: couldn't mount " ++ "because of unsupported " ++ "compression algorithms", ++ sb->s_id); ++ j = 1; ++ } ++ if (i < EXT2_N_ALGORITHMS) ++ printk(" %s", ext2_algorithm_table[i].name); ++ else ++ printk(" %u", i); ++ } ++ } ++ if (j) { ++ if (test_opt(sb, FORCE_COMPAT)) ++ printk(" but ignoring as you request.\n"); ++ else { ++ printk(".\n"); ++ goto failed_mount; ++ } ++ } ++#endif /* CONFIG_EXT2_COMPRESS */ + if (!(sb->s_flags & MS_RDONLY) && + (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){ + ext2_msg(sb, KERN_ERR, "error: couldn't mount RDWR because of " +--- linux-3.2-rc5/fs/ext2/ialloc.c 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/ext2/ialloc.c 2011-12-13 14:22:47.845975968 +0100 +@@ -470,6 +470,9 @@ struct inode *ext2_new_inode(struct inod + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, group); + if (!bitmap_bh) { ++#ifdef CONFIG_EXT2_COMPRESS ++ EXT2_I(inode)->i_flags &= ~EXT2_COMPR_FL; ++#endif + err = -EIO; + goto fail; + } +@@ -558,6 +561,17 @@ got: + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_flags = + ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); ++#ifdef CONFIG_EXT2_COMPRESS ++ /* ++ * The EXT2_COMPR flag is inherited from the parent ++ * directory as well as the cluster size and the compression ++ * algorithm. ++ */ ++ ei->i_log2_clu_nblocks = EXT2_I(dir)->i_log2_clu_nblocks; ++ ei->i_clu_nblocks = EXT2_I(dir)->i_clu_nblocks; ++ ei->i_compr_method = EXT2_I(dir)->i_compr_method; ++ ei->i_compr_flags = 0; ++#endif + ei->i_faddr = 0; + ei->i_frag_no = 0; + ei->i_frag_size = 0; +--- linux-3.2-rc5/fs/ext2/balloc.c 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/ext2/balloc.c 2011-12-13 14:22:47.847976031 +0100 +@@ -11,8 +11,13 @@ + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#include ++#else + #include "ext2.h" + #include ++#endif + #include + #include + #include +@@ -499,6 +504,13 @@ void ext2_free_blocks (struct inode * in + struct ext2_super_block * es = sbi->s_es; + unsigned freed = 0, group_freed; + ++ ++#ifdef CONFIG_EXT2_COMPRESS ++ assert((block != EXT2_COMPRESSED_BLKADDR) ++ || !S_ISREG(inode->i_mode) ++ || !(EXT2_SB(sb)->s_es->s_feature_incompat ++ & cpu_to_le32(EXT2_FEATURE_INCOMPAT_COMPRESSION))); ++#endif + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > le32_to_cpu(es->s_blocks_count)) { +--- linux-3.2-rc5/fs/ext2/inode.c 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/ext2/inode.c 2011-12-13 14:22:47.852976189 +0100 +@@ -32,7 +32,14 @@ + #include + #include + #include ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#include ++#include ++#include ++#else + #include "ext2.h" ++#endif + #include "acl.h" + #include "xip.h" + +@@ -40,6 +47,34 @@ MODULE_AUTHOR("Remy Card and others"); + MODULE_DESCRIPTION("Second Extended Filesystem"); + MODULE_LICENSE("GPL"); + ++#ifdef CONFIG_EXT2_COMPRESS ++/* mw: this function counts all references ++ * to this inode. this is necessary to ++ * refuse un/compression if the file has ++ * more than one refernce, I guess. 
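++ * It walks the inode's i_dentry alias list and sums d_count for
++ * each dentry under d_lock, mirroring the accounting in fs/dcache.c.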
*/ ++int ext2_get_dcount(struct inode *inode) ++{ ++ struct dentry *dentry; ++ struct list_head *head, *next, *tmp; ++ int count; ++ ++ head = &inode->i_dentry; ++ next = inode->i_dentry.next; ++ count = 0; ++ while (next != head) { ++ dentry = list_entry(next, struct dentry, d_alias); ++ tmp = next; ++ next = tmp->next; ++ spin_lock(&dentry->d_lock); ++ count += dentry->d_count; ++ spin_unlock(&dentry->d_lock); ++ //mw: similar to fs/dcache.c ++ } ++ ++ return count; ++} ++#endif ++ + static int __ext2_write_inode(struct inode *inode, int do_sync); + + /* +@@ -54,7 +89,9 @@ static inline int ext2_inode_is_fast_sym + inode->i_blocks - ea_blocks == 0); + } + ++#ifndef CONFIG_EXT2_COMPRESS + static void ext2_truncate_blocks(struct inode *inode, loff_t offset); ++#endif + + static void ext2_write_failed(struct address_space *mapping, loff_t to) + { +@@ -240,7 +277,11 @@ static Indirect *ext2_get_branch(struct + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, EXT2_I(inode)->i_data + *offsets); ++#ifdef CONFIG_EXT2_COMPRESS ++ if (HOLE_BLKADDR(p->key)) ++#else + if (!p->key) ++#endif + goto no_block; + while (--depth) { + bh = sb_bread(sb, le32_to_cpu(p->key)); +@@ -251,7 +292,11 @@ static Indirect *ext2_get_branch(struct + goto changed; + add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + read_unlock(&EXT2_I(inode)->i_meta_lock); ++#ifdef CONFIG_EXT2_COMPRESS ++ if (HOLE_BLKADDR(p->key)) ++#else + if (!p->key) ++#endif + goto no_block; + } + return NULL; +@@ -297,7 +342,11 @@ static ext2_fsblk_t ext2_find_near(struc + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) ++#ifdef CONFIG_EXT2_COMPRESS ++ if (!HOLE_BLKADDR(*p)) ++#else + if (*p) ++#endif + return le32_to_cpu(*p); + + /* No such thing, so let's try location of indirect block */ +@@ -498,7 +547,13 @@ static int ext2_alloc_branch(struct inod + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + branch[n].bh = bh; ++#ifndef CONFIG_EXT2_COMPRESS + lock_buffer(bh); ++#else ++ CHECK_NOT_ATOMIC ++ if (!buffer_uptodate(bh)) ++ wait_on_buffer(bh); ++#endif + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); +@@ -514,7 +569,9 @@ static int ext2_alloc_branch(struct inod + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + set_buffer_uptodate(bh); ++#ifndef CONFIG_EXT2_COMPRESS + unlock_buffer(bh); ++#endif + mark_buffer_dirty_inode(bh, inode); + /* We used to sync bh here if IS_SYNC(inode). + * But we now rely upon generic_write_sync() +@@ -675,6 +732,7 @@ static int ext2_get_blocks(struct inode + if (err == -EAGAIN || !verify_chain(chain, partial)) { + while (partial > chain) { + brelse(partial->bh); ++// bforget(partial->bh); /*mw: e2c-pre-2.6.30.4 used bforget here*/ + partial--; + } + partial = ext2_get_branch(inode, depth, offsets, chain, &err); +@@ -766,21 +824,608 @@ int ext2_fiemap(struct inode *inode, str + ext2_get_block); + } + ++#ifdef CONFIG_EXT2_COMPRESS ++/* ++ * Readpage method that will take care of decompression. ++ */ ++/* effic: I (pjm) think tht at present, reading a 32KB cluster 4KB at ++ a time does `decompress 4KB' for the first 4KB, then `decompress ++ 8KB' for the second, and so on. See if we can provide the page ++ cache with all the pages in a cluster. The problem is, we don't ++ want to erase anything tht hasn't been written to disk, so we can't ++ just call update_vm_cache(). 
The plan at present is to remember ++ what the contents of ext2_rd_wa.u come from, and don't bother ++ decompressing anything if the working area already contains the ++ right data. However, this is only a win where adjacent calls to ++ ext2_decompress_blocks() request the same cluster. We could force ++ that by copying some code from generic_file_read() (but check for ++ deadlocks before doing anything like that), but instead I'm taking ++ the more passive approach of hoping for the best. */ ++static int ext2_readpage(struct file *file, struct page *page) ++{ ++ struct inode *inode = page->mapping->host; ++ struct page *pg[EXT2_MAX_CLUSTER_PAGES], *epg[EXT2_MAX_CLUSTER_PAGES]; ++ u32 cluster0, max_cluster; ++ int i, blockOfCluster, blocksToDo, npg; ++ const int inc = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; ++ struct ext2_inode_info *ei = EXT2_I(page->mapping->host); ++#ifdef CONFIG_HIGHMEM ++ int kmapped = 0; //mw ++#endif ++ ++ int iClusterCnt; ++ ++ /* For directories, fall out through default routine */ ++ if (S_ISDIR(inode->i_mode)) ++ { ++ int rc; ++ ++ rc = block_read_full_page(page,ext2_get_block); ++ assert(!rc); ++ return rc; ++ } ++ ++ /* The semaphore prevents us trying to compress and decompress ++ the cluster at the same time, or compress a cluster in the ++ middle of reading it (thinking it to be uncompressed). ++ ++ You may not like the fact that we hold the semaphore across ++ readpage (given that it isn't held without e2compr compiled ++ in), but it does guarantee that we won't compress the ++ cluster during readpage. (OTOH, it's unlikely, if not ++ impossible, for someone to ,compress a cluster and rewrite ++ the blocks` before the readpage completes.) */ ++ /* This procedure used to have `#ifndef EXT2_LOCK_BUFFERS' ++ around all the semaphore stuff, and unlocked each buffer ++ before brelsing them ifdef EXT2_LOCK_BUFFERS. I (pjm, ++ 1998-01-20) have removed that because (a) EXT2_LOCK_BUFFERS ++ isn't #defined anywhere, and doesn't appear outside of this ++ function, and (b) I haven't looked at what effect locking ++ the buffers has. You may like to reintroduce the idea of ++ buffer locking to this function if you're more familiar ++ with buffer locking than I, and believe that the full i_sem ++ isn't necessary to protect from races (people seeing raw ++ compressed data) between readpage and ext2_file_write(), ++ ext2_compress_cluster() and ext2_truncate(). */ ++ unlock_page(page); ++ mutex_lock(&inode->i_mutex); ++ ++ assert (atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */ ++ ++ //mw: added EXT2_COMPR_FL, because EXT2_COMPRBLK_FL mit change without mutex !!! ++ if ( !(ei->i_flags & (EXT2_COMPRBLK_FL|EXT2_COMPR_FL)) ++ || (ei->i_flags & EXT2_NOCOMPR_FL) ) ++ { ++ goto readpage_uncompressed; ++ } ++ ++ { ++ register u32 blockOfFile ++ = (page->index << PAGE_CACHE_SHIFT) >> inode->i_sb->s_blocksize_bits; ++ ++ blocksToDo = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; ++ cluster0 = ext2_block_to_cluster(inode, blockOfFile); ++ max_cluster = ext2_block_to_cluster ++ (inode, blockOfFile + blocksToDo - 1); ++ blockOfCluster ++ = blockOfFile - ext2_cluster_block0(inode, cluster0); ++ } ++ ++ /* return -???, any idea which code. do_generic_file_read() cares, ext2_readpages() doesn't. ++ maybe I should look at the "generic" readpage() and see what it returns in this case */ ++ ++ /* Check if any part of the requested area contains part of a ++ compressed cluster. If not, we can use default ext2_readpage(). 
++ ++ (Note that we don't have to worry about a cluster becoming ++ compressed in the meantime, because we have the semaphore.) ++ ++ A page can cover up to 9 clusters. (The maximum can only ++ occur with 32KB pages, 4KB clusters, and a non-page-aligned ++ offset. Thanks go to Kurt Fitzner for reporting that ++ page offsets needn't be aligned; see generic_file_mmap().) */ ++ { ++ int isCmp[(PAGE_SIZE >> 12) + 1]; ++ u8 *dst; ++ unsigned clu_ix; ++ ++ assert (max_cluster - cluster0 < sizeof(isCmp)/sizeof(*isCmp)); ++ for (clu_ix = 0; cluster0 + clu_ix <= max_cluster; clu_ix++) { ++ isCmp[clu_ix] = ext2_cluster_is_compressed_fn (inode, cluster0 + clu_ix); ++ if (isCmp[clu_ix] < 0){ ++ printk("IO-ERROR: isCmp\n"); ++ goto io_error; ++ } ++ } ++ ++ for (clu_ix = 0; cluster0 + clu_ix <= max_cluster; clu_ix++) ++ if (isCmp[clu_ix] > 0) ++ goto readpage_compressed; ++ /* fall through */ ++ readpage_uncompressed: ++ { ++ int rc=0; ++ lock_page(page); ++ ++ /* Did somebody else fill it already? */ ++ if (PageUptodate(page) ){ //mw: necessary for DEBUG! anyway checked in do_generic_mapping_read ++ unlock_page(page); ++ } ++ else { ++ //try_to_free_buffers(page); ++ rc = block_read_full_page(page,ext2_get_block); ++ } ++ mutex_unlock(&inode->i_mutex); ++ assert(!rc); ++ return rc; ++ } ++ ++ readpage_compressed: ++ ++ /* Copied from block_read_full_page */ ++ /* if (!PageLocked(page)) */ ++ /* PAGE_BUG(page); */ ++ lock_page(page); ++ if (PageUptodate(page)) { ++ unlock_page(page); ++ mutex_unlock(&inode->i_mutex); ++ return(0); ++ } ++ get_page(page); ++ ++ ClearPageUptodate(page); ++ ClearPageError(page); ++ ++ dst = (u8 *) page_address(page); ++ for (clu_ix = 0; cluster0 + clu_ix <= max_cluster; clu_ix++) { ++ struct buffer_head *bh[EXT2_MAX_CLUSTER_BLOCKS]; ++ int nbh, blocksThisClu; ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ pg[i] = NULL; ++ epg[i] = NULL; ++ } ++ ++ /* clear_bit(PG_locked, &page->flags); */ ++ npg = ext2_cluster_npages(inode, cluster0 + clu_ix); ++ nbh = ext2_get_cluster_pages(inode, cluster0 + clu_ix, pg, page, 0); ++ ++ if (nbh <= 0) { ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) ++ printk("no pages\n"); ++ goto out; ++ } ++ iClusterCnt = ext2_cluster_npages(inode, cluster0); ++ ++ nbh = ext2_get_cluster_extra_pages(inode, cluster0 + clu_ix, pg, epg); ++ if (nbh <= 0) ++ { ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) ++ epg[i] = NULL; ++ printk("no extra pages\n"); ++ goto out; ++ } ++ assert (iClusterCnt = ext2_cluster_npages(inode, cluster0)); ++ ++#ifdef CONFIG_HIGHMEM ++ ext2_kmap_cluster_pages(page, pg, epg); ++ kmapped = 1; ++#endif ++ ++ nbh = ext2_get_cluster_blocks(inode, cluster0 + clu_ix, bh, pg, epg, 0); ++ if (nbh <= 0) ++ { ++ printk("no blocks\n"); ++ goto out; ++ } ++ ++ /* How many blocks (including holes) we need from this cluster. */ ++ { ++ blocksThisClu = (ext2_cluster_nblocks(inode, cluster0 + ++ clu_ix) - blockOfCluster); ++ if (blocksThisClu > blocksToDo) ++ blocksThisClu = blocksToDo; ++ } ++ ++ if (isCmp[clu_ix]) { ++ u8 const *src; ++ int n, nbytes_wanted; ++ struct ext2_cluster_head *head; ++ unsigned meth; ++# ifdef CONFIG_KMOD ++ unsigned alg; ++# endif ++ ++ bh[0]->b_data = page_address(bh[0]->b_page); ++ head = (struct ext2_cluster_head *) bh[0]->b_data; ++ ++ /* jmr 1998-10-28 Hope this is the last time I'm moving this code. ++ * Module loading must be done _before_ we lock wa, just think what ++ * can happen if we reallocate wa when somebody else uses it... 
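++ * (request_module() may sleep, and loading an algorithm module may
++ * force the working area to be reallocated, so it has to happen
++ * before we take the per-CPU area with get_cpu_var().)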
++ */ ++ meth = head->method; /* only a byte, so no swabbing needed. */ ++ if (meth >= EXT2_N_METHODS) { ++ printk("illegal method id\n"); ++ ext2_msg(inode->i_sb, ++ "illegal method id", ++ "inode = %lu, id = %u", ++ inode->i_ino, meth); ++ goto out; ++ } ++# ifdef CONFIG_KMOD ++ alg = ext2_method_table[meth].alg; ++ if (!ext2_algorithm_table[alg].avail) { ++ char str[32]; ++ ++ sprintf(str, "ext2-compr-%s", ext2_algorithm_table[alg].name); ++ request_module(str); ++ } ++# endif /* CONFIG_KMOD */ ++ ++ /* Calculate nbytes_wanted. */ ++ { ++ unsigned nblk_wanted, i; ++ ++ /* We want to decompress the whole cluster */ ++ //nblk_wanted = ext2_cluster_nblocks(inode, cluster0 + clu_ix); ++ nblk_wanted = npg << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); /*mw: FIXED */ ++ ++ for (i = nblk_wanted; i != 0;) ++ if (((--i >> 3) < head->holemap_nbytes) ++ && (head->holemap[i >> 3] & (1 << (i & 7)))) ++ --nblk_wanted; ++ nbytes_wanted = (nblk_wanted ++ << inode->i_sb->s_blocksize_bits); ++ } ++ ++ /* Decompress. */ ++ get_cpu_var(ext2_rd_wa); ++ if (__get_cpu_var(ext2_rd_wa) == NULL) ++ { ++ ext2_alloc_rd_wa(); ++ } ++ assert(__get_cpu_var(ext2_rd_wa) != NULL); ++ ++ n = ext2_decompress_blocks(inode, bh, nbh, nbytes_wanted, cluster0 + clu_ix); ++ if (n < 0) { ++ assert(nbh >= 0); ++ printk("ext2_readpage: noblocks decompressed\n"); ++ put_cpu_var(ext2_rd_wa); ++ goto out; ++ } ++ ++# ifdef EXT2_COMPR_REPORT_VERBOSE_INODE ++ if (ei->i_flags & EXT2_COMPR_FL) ++ printk(KERN_DEBUG "ext2: mmap %04x:%lu: blocksToDo=%d, blockOfCluster=%d, blocksThisClu=%d, clu_nblocks=%d\n", ++ inode->i_rdev, ++ inode->i_ino, ++ blocksToDo, ++ blockOfCluster, ++ blocksThisClu, ++ ext2_cluster_nblocks(inode, cluster0 + clu_ix)); ++# endif ++ ++ /* */ ++ { ++ unsigned i; ++ int ipg; ++ ++ i = ext2_cluster_nblocks(inode, cluster0 + clu_ix) - 1; ++ //i = (npg << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) - 1; /*mw: FIXED!!! 
(here: shift = 2Bit) */ ++ //if(i+1 != ext2_cluster_nblocks(inode, cluster0 + clu_ix)) ++ //printk("npg=%i, nbh=%i, npgf=%i, nbhf =%i, cluster:%i, dec_blk:%i, b_wanted:%i, size:%i\n ", ext2_cluster_npages(inode, cluster0 + clu_ix), ext2_cluster_nblocks(inode, cluster0 + clu_ix), npgtest, i+1, cluster0 + clu_ix, n, nbytes_wanted, inode->i_size); ++ blockOfCluster = 0; ++ assert(n > 0); ++ src = __get_cpu_var(ext2_rd_wa)->u + nbytes_wanted - inode->i_sb->s_blocksize; ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c("ext2_readpage: copy data inc=%d blocksThisClu=%d, n=%d\n", inc, blocksThisClu, n); ++#endif ++ for (ipg = npg - 1; ipg >= 0; ipg--) { ++ if (pg[ipg] == NULL) { ++ i -= inc; ++ src -= PAGE_SIZE; ++ continue; ++ } ++ if (((inode->i_size-1) >> PAGE_SHIFT) == pg[ipg]->index) { ++ n = ((inode->i_size-1) & (PAGE_SIZE -1)) >> inode->i_sb->s_blocksize_bits; ++ i -= ((blocksThisClu-1) - n); ++ src -= ((blocksThisClu-1) - n) << inode->i_sb->s_blocksize_bits; ++ } else { ++ n = blocksThisClu - 1; ++ } ++ if (PageUptodate(pg[ipg]) ) { ++ for (;n >= 0;n--, i--) { ++ if (((i >> 3) >= head->holemap_nbytes) ++ || !(head->holemap[i >> 3] & (1 << (i & 7)))) { ++ src -= inode->i_sb->s_blocksize; ++ } ++ } ++ } else { ++ ++ dst = (u8 *) page_address(pg[ipg]) + (n << inode->i_sb->s_blocksize_bits); ++ ++ for (; ++ n >= 0; ++ n--, i--, dst -= inode->i_sb->s_blocksize) { ++ assert(!buffer_dirty(bh[i])); ++ clear_buffer_dirty(bh[i]); //mw: had a refile_buffer in 2.4 ++ if (((i >> 3) >= head->holemap_nbytes) ++ || !(head->holemap[i >> 3] & (1 << (i & 7)))) { ++ assert(i >= 0); ++ memcpy(dst, src, inode->i_sb->s_blocksize); ++ src -= inode->i_sb->s_blocksize; ++ } else { ++ assert(i >= 0); ++ memset (dst, 0, inode->i_sb->s_blocksize); ++ } ++ //clear_bit(BH_Uptodate, &bh[i]->b_state); ++ } ++ SetPageUptodate(pg[ipg]); ++ } ++ } ++ } ++ put_cpu_var(ext2_rd_wa); ++ } else { ++ /* Uncompressed cluster. Just copy the data. 
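++ Block indices past nbh, or with no buffer head, are treated as
++ holes and zero-filled by the memset() below.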
*/ ++ int n; ++ ++# ifdef EXT2_COMPR_REPORT_VERBOSE_INODE ++ if (ei->i_flags & EXT2_COMPR_FL) ++ printk(KERN_DEBUG ++ "ext2: mmap %lu: blocksToDo = %d, " ++ "blockOfCluster = %d, clu_nblocks = %d\n", ++ inode->i_ino, blocksToDo, blockOfCluster, ++ ext2_cluster_nblocks(inode, cluster0 + ++ clu_ix)); ++# endif ++ ++ for (n = 0; ++ n < blocksThisClu; ++ n++, dst += inode->i_sb->s_blocksize) { ++ if ((blockOfCluster + n < nbh) ++ && (bh[blockOfCluster + n] != NULL)) ++ { ++ memcpy(dst, ++ bh[blockOfCluster + n]->b_data, ++ inode->i_sb->s_blocksize); ++ } ++ else ++ { ++ memset(dst, 0, inode->i_sb->s_blocksize); ++ } ++ } ++ blockOfCluster = 0; ++ } // end uncompressed Cluster ++ ++ blocksToDo -= blocksThisClu; ++ ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(page, pg, epg); ++#endif ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (epg[i] != NULL) { ++ ++ ClearPageDirty(epg[i]); ++ ClearPageUptodate(epg[i]); ++ try_to_free_buffers(epg[i]); ++ unlock_page(epg[i]); ++ assert(page_count(epg[i]) <= 1); ++ page_cache_release(epg[i]); ++ } ++ } ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (pg[i] == NULL) ++ break; ++ if (pg[i] == page) ++ continue; ++ unlock_page(pg[i]); ++ page_cache_release(pg[i]); ++ } ++ //mw ++ assert (isCmp[clu_ix] == ext2_cluster_is_compressed_fn (inode, cluster0 + clu_ix)); ++ } // end for-loop: Cluster ++ } ++ ++ SetPageUptodate(page); ++ unlock_page(page); ++ atomic_dec(&page->_count); ++ mutex_unlock(&inode->i_mutex); ++ return 0; ++ ++ out: ++ ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(page, pg, epg); ++#endif ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (epg[i] != NULL) { ++ ++ ClearPageDirty(epg[i]); ++ ClearPageUptodate(epg[i]); ++ try_to_free_buffers(epg[i]); ++ unlock_page(epg[i]); ++ assert(page_count(epg[i]) <= 1); ++ page_cache_release(epg[i]); ++ } ++ } ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (pg[i] == NULL) ++ break; ++ if (pg[i] == page) ++ continue; ++ unlock_page(pg[i]); ++ page_cache_release(pg[i]); ++ } ++ mutex_unlock(&inode->i_mutex); ++ return 0; ++ ++ io_error: ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(page, pg, epg); ++#endif ++ SetPageError(page); ++ unlock_page(page); ++ atomic_dec(&page->_count); ++ mutex_unlock(&inode->i_mutex); ++ printk("Readpage: IOERROR\n"); ++ return -EIO; /* it is tested in do_generic_file_read(), ... */ ++} ++#endif /* CONFIG_EXT2_COMPRESS */ ++ + static int ext2_writepage(struct page *page, struct writeback_control *wbc) + { ++/* mw (24/06/2008): ++ * WRITEPAGE: this code was also in e2compr 2.4 and once removed by yaboo ding. ++ * ext2_writepage() is also called for dirty pages. Usually we write using file_write() which ++ * wraps correctly to compressed files. BUT: a writeable memory map might ++ * produce dirty pages, which will be written back normally. this should/might fail. ++ * The following code should fix this bug, but this was not tested yet. ++ */ ++#ifdef CONFIG_EXT2_COMPRESS ++#undef USE_WRITEPAGE ++//#define USE_WRITEPAGE ++#ifdef USE_WRITEPAGE ++ ++ struct ext2_inode_info *ei = EXT2_I(page->mapping->host); ++ int retval; ++ ++ struct inode *inode = page->mapping->host; ++ u32 cluster0, max_cluster; ++ int blocksToDo; ++ ++ unlock_page(page); ++ //mw: do we need this ??? 
++ //if (!(ei->i_compr_flags & EXT2_OSYNC_INODE)) { ++ /* trace_e2c("ext2_writepage: inode"); */ ++ mutex_lock(&inode->i_mutex); ++ /* trace_e2c(" down\n"); */ ++ //} ++ if (!(ei->i_flags & EXT2_COMPRBLK_FL) ++ || (ei->i_flags & EXT2_NOCOMPR_FL) ) ++ { ++ //mw: do we need this ??? ++ //if (!(ei->i_compr_flags & EXT2_OSYNC_INODE)) { ++ /* trace_e2c("ext2_writepage: inode up 1\n"); */ ++ mutex_unlock(&inode->i_mutex); ++ //} ++ lock_page(page); ++ return block_write_full_page(page, ext2_get_block, wbc); ++ } ++ /* */ ++ { ++ register u32 blockOfFile ++ = (page->index << PAGE_CACHE_SHIFT) >> inode->i_sb->s_blocksize_bits; ++ ++ blocksToDo = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; ++ cluster0 = ext2_block_to_cluster(inode, blockOfFile); ++ max_cluster = ext2_block_to_cluster(inode, blockOfFile + blocksToDo - 1); ++ } ++ ++ /* Check if any part of the requested area contains part of a ++ compressed cluster. If not, we can use default ext2_writepage(). ++ ++ (Note that we don't have to worry about a cluster becoming ++ compressed in the meantime, because we have the semaphore.) ++ ++ A page can cover up to 9 clusters. (The maximum can only ++ occur with 32KB pages, 4KB clusters, and a non-page-aligned ++ offset. Thanks go to Kurt Fitzner for reporting that ++ page offsets needn't be aligned; see generic_file_mmap().) */ ++ ++ { ++ int isCmp[(PAGE_SIZE >> 12) + 1]; ++ unsigned clu_ix; ++ ++ assert (max_cluster - cluster0 < sizeof(isCmp)/sizeof(*isCmp)); ++ for (clu_ix = 0; cluster0 + clu_ix <= max_cluster; clu_ix++) { ++ isCmp[clu_ix] = ext2_cluster_is_compressed_fn (inode, cluster0 + clu_ix); ++ if (isCmp[clu_ix] < 0) { ++ //mw: do we need this ???if (!(ei->i_compr_flags & EXT2_OSYNC_INODE)) { ++ /* trace_e2c("ext2_writepage: inode up 2\n"); */ ++ lock_page(page); ++ mutex_unlock(&inode->i_mutex); ++ //} ++ return -EIO; ++ } ++ } ++ ++ for (clu_ix = 0; cluster0 + clu_ix <= max_cluster; clu_ix++) ++ if (isCmp[clu_ix] > 0) ++ ext2_decompress_cluster(inode, cluster0 + clu_ix); ++ ++ //mw: do we need this ??? ++ //if (!(ei->i_compr_flags & EXT2_OSYNC_INODE)) { ++ /* trace_e2c("ext2_writepage: inode up 3\n"); */ ++ mutex_unlock(&inode->i_mutex); ++ //} ++ lock_page(page); ++ ++ /* fall through */ ++ } ++#endif /* CONFIG_EXT2_COMPRESS */ ++#endif + return block_write_full_page(page, ext2_get_block, wbc); + } + ++#ifndef CONFIG_EXT2_COMPRESS + static int ext2_readpage(struct file *file, struct page *page) + { + return mpage_readpage(page, ext2_get_block); + } ++#endif + + static int + ext2_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) + { ++#ifdef CONFIG_EXT2_COMPRESS ++/* ++ * For now, just read each page into cache and don't worry about emitting BIOs. ++ * (whitpa 02 Aug 2004). 
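To illustrate the cluster0/max_cluster arithmetic above, here is a hypothetical, simplified translation of a page index to the cluster range it touches, assuming a fixed cluster size; the real ext2_block_to_cluster() also copes with a shorter first cluster, which is ignored in this sketch.

    #include <stdio.h>

    int main(void)
    {
            unsigned page_shift = 15;          /* 32 KB pages */
            unsigned blocksize_bits = 10;      /* 1 KB blocks */
            unsigned log2_clu_nblocks = 2;     /* 4 blocks per cluster */
            unsigned long page_index = 3;

            unsigned long block0 = (page_index << page_shift) >> blocksize_bits;
            unsigned long nblocks = (1UL << page_shift) >> blocksize_bits;
            unsigned long cluster0 = block0 >> log2_clu_nblocks;
            unsigned long max_cluster = (block0 + nblocks - 1) >> log2_clu_nblocks;

            /* 8 clusters here; a shorter first cluster can add a ninth,
             * matching the "up to 9 clusters" remark above. */
            printf("page %lu spans clusters %lu..%lu\n",
                   page_index, cluster0, max_cluster);
            return 0;
    }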
++ */ ++ ++ unsigned page_idx; ++ struct pagevec lru_pvec; ++ int iError; ++ ++ pagevec_init(&lru_pvec, 0); ++ ++ for (page_idx = 0; page_idx < nr_pages; page_idx++) { ++ struct page *page = list_entry(pages->prev, struct page, lru); ++ ++ prefetchw(&page->flags); ++ list_del(&page->lru); ++ ++ iError = add_to_page_cache(page, mapping, page->index, GFP_KERNEL); ++ if (!iError) { ++ if (!PageUptodate(page)) ++ { ++ (void) ext2_readpage(file, page); ++ } ++ else ++ { ++ unlock_page(page); ++ } ++ if (!pagevec_add(&lru_pvec, page)) ++ __pagevec_lru_add_file(&lru_pvec); ++ } else { ++ page_cache_release(page); ++ } ++ ++ } ++ pagevec_lru_add_file(&lru_pvec); ++ BUG_ON(!list_empty(pages)); ++ return 0; ++#else + return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); ++#endif + } + + static int +@@ -829,11 +1474,58 @@ static int ext2_nobh_writepage(struct pa + return nobh_writepage(page, ext2_get_block, wbc); + } + ++#ifdef CONFIG_EXT2_COMPRESS ++static sector_t ext2_do_bmap(struct address_space *mapping, sector_t block) ++#else + static sector_t ext2_bmap(struct address_space *mapping, sector_t block) ++#endif + { + return generic_block_bmap(mapping,block,ext2_get_block); + } + ++#ifdef CONFIG_EXT2_COMPRESS ++/* Return 0 instead of EXT2_COMPRESSED_BLKADDR if EXT2_NOCOMPR_FL ++ * high. This is necessary for us to be able to use ++ * generic_readpage() when EXT2_NOCOMPR_FL is high. ++ */ ++static sector_t ext2_bmap(struct address_space *mapping, sector_t block) ++{ ++ sector_t result; ++ struct inode *inode = mapping->host; ++ ++ if ((EXT2_I(inode)->i_flags & (EXT2_COMPRBLK_FL | EXT2_NOCOMPR_FL)) ++ == (EXT2_COMPRBLK_FL | 0)) { ++ int err; ++ ++ err = ext2_cluster_is_compressed_fn ++ (inode, ext2_block_to_cluster(inode, block)); ++ if (err > 0) ++ ext2_msg (inode->i_sb, "ext2_bmap", ++ "compressed cluster, inode %lu", ++ inode->i_ino); ++ if (err != 0) ++ return 0; ++ } ++ ++ result = ext2_do_bmap(mapping, block); ++ if (result != EXT2_COMPRESSED_BLKADDR) ++ return result; ++ ++ if (!(EXT2_SB(inode->i_sb)->s_es->s_feature_incompat ++ & cpu_to_le32(EXT2_FEATURE_INCOMPAT_COMPRESSION))) ++ ext2_error(inode->i_sb, "ext2_bmap", ++ "compressed_blkaddr (ino %lu, blk %lu) " ++ "on non-compressed fs", ++ inode->i_ino, (unsigned long) block); ++ if (!S_ISREG(inode->i_mode)) ++ ext2_error(inode->i_sb, "ext2_bmap", ++ "compressed_blkaddr for non-regular file " ++ "(ino %lu, blk %lu)", ++ inode->i_ino, (unsigned long) block); ++ return 0; ++} ++#endif /* CONFIG_EXT2_COMPRESS */ ++ + static ssize_t + ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +@@ -853,6 +1545,18 @@ ext2_direct_IO(int rw, struct kiocb *ioc + static int + ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) + { ++#ifdef CONFIG_EXT2_COMPRESS ++#ifdef USE_WRITEPAGE ++ struct ext2_inode_info *ei = EXT2_I(mapping->host); ++ if ( (ei->i_flags & EXT2_COMPRBLK_FL) ++ && !(ei->i_flags & EXT2_NOCOMPR_FL)) ++ { ++ //NULL will invoke ext2_writepage for writeback, hopefully. 
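The bmap policy above boils down to one rule: callers of bmap() must never see the compressed-block marker, so such blocks map to 0 ("no block"). A minimal sketch of that filter follows; the marker value used here is a placeholder for illustration, not the constant defined by the patch.

    #include <stdint.h>

    #define COMPRESSED_BLKADDR_SKETCH 0xffffffffu   /* placeholder marker */

    static uint32_t bmap_filter(uint32_t raw_result, int cluster_is_compressed)
    {
            if (cluster_is_compressed)
                    return 0;                       /* whole cluster is compressed */
            if (raw_result == COMPRESSED_BLKADDR_SKETCH)
                    return 0;                       /* never leak the marker */
            return raw_result;
    }

    int main(void)
    {
            return bmap_filter(COMPRESSED_BLKADDR_SKETCH, 0) == 0 ? 0 : 1;
    }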
++ return mpage_writepages(mapping, wbc, NULL); ++ } ++ else ++#endif ++#endif + return mpage_writepages(mapping, wbc, ext2_get_block); + } + +@@ -1001,6 +1705,12 @@ static inline void ext2_free_data(struct + + for ( ; p < q ; p++) { + nr = le32_to_cpu(*p); ++#ifdef CONFIG_EXT2_COMPRESS ++ if (nr == EXT2_COMPRESSED_BLKADDR) { ++ *p = 0; ++ continue; ++ } ++#endif + if (nr) { + *p = 0; + /* accumulate blocks to free if they're contiguous */ +@@ -1045,6 +1755,12 @@ static void ext2_free_branches(struct in + nr = le32_to_cpu(*p); + if (!nr) + continue; ++#ifdef CONFIG_EXT2_COMPRESS ++ if (nr == EXT2_COMPRESSED_BLKADDR) { ++ *p = 0; ++ continue; ++ } ++#endif + *p = 0; + bh = sb_bread(inode->i_sb, nr); + /* +@@ -1069,6 +1785,96 @@ static void ext2_free_branches(struct in + ext2_free_data(inode, p, q); + } + ++/* pjm 1998-01-14: As far as I can tell, "I don't do any locking" is ++ no longer correct, as i_sem is downed for all write() and ++ truncate() stuff except where it doesn't matter (e.g. new inode). */ ++ ++#ifdef CONFIG_EXT2_COMPRESS ++/* If the EXT2_ECOMPR_FL bit is high, then things can go rather badly. ++ This can only happen if access permission was obtained before the ++ flag was raised. Also, it shouldn't be too much of a problem ++ unless the end point of truncation is a compressed cluster with a ++ compression error. */ ++ ++ /* From what I (Antoine) understand, the complexity of the truncate ++ code is due to the fact that we don't want to free blocks that ++ are still referenced. It does not ensure that concurrent read ++ operation will terminate properly, i.e., the semantic of reading ++ while somebody truncates is undefined (you can either get the old ++ data if you got the blocks before, or get plenty of zeros ++ otherwise). */ ++ ++/* todo: Provide error trapping in readiness for when i_op->truncate ++ allows a return code. */ ++static void fix_compression (struct inode * inode) ++{ ++ struct ext2_inode_info *ei = EXT2_I(inode); ++ /*if (atomic_read(&inode->i_mutex.count) > 0) ++ { ++ printk("Assert Mutex failed for file: %s \n", inode_name(inode, 0)); ++ dump_stack(); ++ }*/ ++ ++ assert (ei->i_flags & EXT2_COMPRBLK_FL); /* one or more compressed clusters */ ++ assert ((atomic_read(&inode->i_mutex.count) < 1) ++ || ((inode->i_nlink == 0) ++ && (atomic_read(&inode->i_count) == 0))); ++ /* pjm 1998-01-14: I think the below comment can safely be removed, as ++ it's impossible for someone to be compressing during truncate(), because ++ i_sem is down. */ ++ /* Dans le cas ou les clusters peuvent etre compresses, cela pose ++ un probleme : il faudrait stopper aussi si le cluster est ++ comprime et ne contient pas plus de donnees que i_size ne ++ permet. Sinon, on peut passer son temps a decompresser un ++ cluster que quelqu'un d'autre compresse en meme ++ temps... (TODO). Cela ne peut arriver que si on reverifie apres ++ coup si le cluster est non compresse (ce qu'on fait a l'heure ++ actuelle) => faire autrement. ++ ++ pjm fixme tr ++ ++ If the clusters can be compressed, we'd have a problem: we'd ++ also need to stop if the cluster is compressed and doesn't ++ contain more data than i_size permits. Otherwise we can spend ++ time decompressing a cluster that someone else is compressing ++ at the same time. (TODO.) This can only happen if we reverify ++ "apres coup" ("after the event"? "after each time"?) "si" ("if" ++ or "that") the cluster is not compressed (as we are currently ++ doing) => do differently. */ ++ ++ /* todo: Handle errors from ext2_cluster_is_compressed(). 
++ (Except ext2_truncate() currently silently ignores errors ++ anyway.) */ ++ ++ if (!ext2_offset_is_clu_boundary(inode, inode->i_size) ++ && (! ( ei->i_flags & EXT2_NOCOMPR_FL)) ++ && (ext2_cluster_is_compressed_fn ++ (inode, ext2_offset_to_cluster (inode, inode->i_size)) ++ > 0)) { ++ trace_e2c("fix_compression: inode:%lu decompress_cluster!\n", inode->i_ino); ++ ext2_decompress_cluster(inode, ext2_offset_to_cluster(inode, inode->i_size)); ++ /* todo: Check the return code of ++ ext2_decompress_cluster(). (Then again, I don't ++ know how to report an error anyway. ++ ext2_truncate() silently ignores errors.) */ ++ ++ /* Organise for the cluster to be recompressed later. */ ++ assert (ei->i_flags & EXT2_COMPR_FL); ++ ++ ei->i_flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ mark_inode_dirty(inode); ++ } else ++ /* If there are no more compressed clusters, then ++ remove the EXT2_COMPRBLK_FL. Not essential from a ++ safety point of view, but friendlier. We only do ++ this in the `else' because the cleanup function ++ will handle it in the `if' case. */ ++ ext2_update_comprblk(inode); ++} ++#endif ++ ++ + static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) + { + __le32 *i_data = EXT2_I(inode)->i_data; +@@ -1081,6 +1887,27 @@ static void __ext2_truncate_blocks(struc + int n; + long iblock; + unsigned blocksize; ++ ++#ifdef CONFIG_EXT2_COMPRESS ++ /* If the new size is in the middle of a compressed cluster, ++ then we decompress it, and set things up to be recompressed ++ later. ++ ++ todo: It isn't very nice to get ENOSPC on truncate. We ++ can't completely remove the possibility (unless the ++ compression algorithms obey the rule `shorter input never ++ gives longer output') but we could greatly reduce the ++ possibility, e.g. by moving the fix_compression() function ++ to compress.c, and have it decompress and immediately ++ recompress the cluster, without allocating blocks for the ++ full decompressed data. */ ++ if (EXT2_I(inode)->i_flags & EXT2_COMPRBLK_FL) { ++ trace_e2c("ext2_truncate: ino=%ld sz=%d\n", inode->i_ino, (int)inode->i_size); ++ fix_compression(inode); ++ truncate_inode_pages(inode->i_mapping, inode->i_size); ++ } ++#endif ++ + blocksize = inode->i_sb->s_blocksize; + iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); + +@@ -1151,8 +1978,11 @@ do_indirects: + + mutex_unlock(&ei->truncate_mutex); + } +- ++#ifdef CONFIG_EXT2_COMPRESS ++void ext2_truncate_blocks(struct inode *inode, loff_t offset) ++#else + static void ext2_truncate_blocks(struct inode *inode, loff_t offset) ++#endif + { + /* + * XXX: it seems like a bug here that we don't allow +@@ -1340,7 +2170,73 @@ struct inode *ext2_iget (struct super_bl + goto bad_inode; + } + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); ++#ifdef CONFIG_EXT2_COMPRESS ++ ei->i_flags = 0x807fffff & le32_to_cpu(raw_inode->i_flags); ++ ei->i_compr_flags = 0; ++ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) { ++ ++ if (S_ISDIR(inode->i_mode)) ++ { ++ //mw: ++ //mutex_lock(&inode->i_mutex); ++ if (S_ISDIR(inode->i_mode)) ++ { ++ ei->i_flags &= ~(EXT2_COMPRBLK_FL | EXT2_DIRTY_FL); //modify!!! ++ } ++ //mutex_unlock(&inode->i_mutex); ++ } ++ ++ /* The above shouldn't be necessary unless someone's ++ * been playing with EXT2_IOC_SETFLAGS on a non-e2compr ++ * kernel, or the inode has been scribbled on. 
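Reduced to its decision, fix_compression() above asks two questions: does i_size land mid-cluster, and is that cluster compressed? A toy restatement (names invented, error handling and the EXT2_NOCOMPR_FL case omitted):

    struct trunc_state {
            int size_on_cluster_boundary;
            int end_cluster_compressed;
    };

    enum trunc_action { DECOMPRESS_AND_MARK_DIRTY, JUST_UPDATE_COMPRBLK };

    /* Decompress the tail cluster now; it is recompressed at cleanup time. */
    static enum trunc_action fix_compression_action(const struct trunc_state *s)
    {
            if (!s->size_on_cluster_boundary && s->end_cluster_compressed)
                    return DECOMPRESS_AND_MARK_DIRTY;
            return JUST_UPDATE_COMPRBLK;
    }

    int main(void)
    {
            struct trunc_state s = { 0, 1 };   /* i_size mid-cluster, cluster compressed */
            return fix_compression_action(&s) == DECOMPRESS_AND_MARK_DIRTY ? 0 : 1;
    }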
++ */ ++ if (ei->i_flags & (EXT2_COMPR_FL | EXT2_COMPRBLK_FL)) { ++ ei->i_compr_method ++ = (le32_to_cpu(raw_inode->i_flags) >> 26) & 0x1f; ++ ei->i_log2_clu_nblocks ++ = (le32_to_cpu(raw_inode->i_flags) >> 23) & 0x7; ++ if ((ei->i_log2_clu_nblocks < 2) ++ || (ei->i_log2_clu_nblocks > 5)) { ++ if ((ei->i_log2_clu_nblocks == 0) ++ && !(ei->i_flags & EXT2_COMPRBLK_FL)) { ++ /* The EXT2_COMPR_FL flag was ++ * raised under a kernel ++ * without e2compr support. ++ */ ++ if (S_ISREG(inode->i_mode)) ++ ei->i_flags |= EXT2_DIRTY_FL; ++ /* Todo: once we're sure the kernel can ++ * handle [log2_]clu_nblocks==0, get rid ++ * of the next statement. ++ */ ++ ei->i_log2_clu_nblocks ++ = EXT2_DEFAULT_LOG2_CLU_NBLOCKS; ++ } else { ++ ei->i_flags |= EXT2_ECOMPR_FL; ++ ext2_error(inode->i_sb, ++ "ext2_read_inode", ++ "inode %lu is corrupted: " ++ "log2_clu_nblocks=%u", ++ inode->i_ino, ++ ei->i_log2_clu_nblocks); ++ } ++ } ++ } else { ++ ei->i_compr_method = EXT2_DEFAULT_COMPR_METHOD; ++ ei->i_log2_clu_nblocks ++ = EXT2_DEFAULT_LOG2_CLU_NBLOCKS; ++ } ++ if (ei->i_log2_clu_nblocks > ++ (EXT2_LOG2_MAX_CLUSTER_BYTES - inode->i_sb->s_blocksize_bits)) ++ ei->i_log2_clu_nblocks = (EXT2_LOG2_MAX_CLUSTER_BYTES ++ - inode->i_sb->s_blocksize_bits); ++ ei->i_clu_nblocks = 1 << ei->i_log2_clu_nblocks; ++ if (ei->i_flags & EXT2_DIRTY_FL) ++ ei->i_compr_flags = EXT2_CLEANUP_FL; ++ } ++#else /* !CONFIG_EXT2_COMPRESS */ + ei->i_flags = le32_to_cpu(raw_inode->i_flags); ++#endif + ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); + ei->i_frag_no = raw_inode->i_frag; + ei->i_frag_size = raw_inode->i_fsize; +@@ -1463,7 +2359,35 @@ static int __ext2_write_inode(struct ino + + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); ++#ifdef CONFIG_EXT2_COMPRESS ++ if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) ++ && (ei->i_flags & (EXT2_COMPR_FL | EXT2_COMPRBLK_FL))) { ++ if ((ei->i_log2_clu_nblocks < 2) ++ || (ei->i_log2_clu_nblocks > 5)) { ++ ei->i_flags |= EXT2_ECOMPR_FL; ++ ext2_error (inode->i_sb, "ext2_write_inode", ++ "inode %lu is corrupted: log2_clu_nblocks=%u", ++ inode->i_ino, ei->i_log2_clu_nblocks); ++ } ++ assert (ei->i_clu_nblocks == (1 << ei->i_log2_clu_nblocks)); ++ assert (ei->i_compr_method < 0x20); ++ raw_inode->i_flags = cpu_to_le32 ++ ((ei->i_flags & 0x807fffff) ++ | (ei->i_compr_method << 26) ++ | (ei->i_log2_clu_nblocks << 23)); ++ } else ++ { ++ //mw: i_mutex was introduced and disabled again: deadlock with lilo ++ // mutex_lock(&inode->i_mutex); //mw ++ raw_inode->i_flags = cpu_to_le32 //modify !!! ++ (ei->i_flags ++ & 0x807fffff /* no compr meth/size */ ++ & ~(EXT2_COMPR_FL | EXT2_COMPRBLK_FL | EXT2_IMMUTABLE_FL | EXT2_ECOMPR_FL | EXT2_NOCOMPR_FL)); ++ // mutex_unlock(&inode->i_mutex); //mw ++ } ++#else + raw_inode->i_flags = cpu_to_le32(ei->i_flags); ++#endif + raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); + raw_inode->i_frag = ei->i_frag_no; + raw_inode->i_fsize = ei->i_frag_size; +--- linux-3.2-rc5/fs/ext2/file.c 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/ext2/file.c 2011-12-13 14:22:47.853976220 +0100 +@@ -18,10 +18,25 @@ + * (jj@sunsite.ms.mff.cuni.cz) + */ + ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#else + #include + #include + #include + #include "ext2.h" ++#endif ++ ++ + #include "xattr.h" + #include "acl.h" + +@@ -30,8 +45,39 @@ + * for a single struct file are closed. 
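The on-disk flag encoding used in the read and write hunks above (compression method in bits 26..30, log2 of the cluster block count in bits 23..25, ordinary flags masked with 0x807fffff) round-trips as in this standalone sketch; the function names are invented for illustration.

    #include <stdint.h>
    #include <assert.h>

    #define E2C_FLAGS_MASK 0x807fffffu      /* ordinary flag bits kept on disk */

    static uint32_t pack_flags(uint32_t flags, unsigned method, unsigned log2_clu)
    {
            return (flags & E2C_FLAGS_MASK) | (method << 26) | (log2_clu << 23);
    }

    static void unpack_flags(uint32_t raw, unsigned *method, unsigned *log2_clu)
    {
            *method   = (raw >> 26) & 0x1f;
            *log2_clu = (raw >> 23) & 0x7;
    }

    int main(void)
    {
            unsigned m, l;
            uint32_t raw = pack_flags(0x00000400u, 7, 4);

            unpack_flags(raw, &m, &l);
            assert(m == 7 && l == 4);
            return 0;
    }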
Note that different open() calls + * for the same file yield different struct file structures. + */ ++ ++/* ++ * pjm 1998-01-09: I would note that this is different from `when no ++ * process has the inode open'. ++ */ + static int ext2_release_file (struct inode * inode, struct file * filp) + { ++#ifdef CONFIG_EXT2_COMPRESS ++ /* ++ * Now's as good a time as any to clean up wrt compression. ++ * Previously (before 2.1.4x) we waited until ++ * ext2_put_inode(), but now the dcache sometimes delays that ++ * call until umount time. ++ */ ++ //printk(KERN_DEBUG "ext2_release_file: pid=%d, i_ino=%lu, i_count=%d\n", current->pid, inode->i_ino, atomic_read(&inode->i_count)); ++ ++ if (S_ISREG (inode->i_mode) ++ && inode->i_nlink ++ && (EXT2_I(inode)->i_compr_flags & EXT2_CLEANUP_FL)) { ++#ifdef EXT2_COMPR_REPORT_PUT ++ printk(KERN_DEBUG "ext2_release_file: pid=%d, i_ino=%lu, i_count=%d\n", current->pid, inode->i_ino, atomic_read(&inode->i_count)); ++#endif ++ /* ++ * todo: See how the return code of ++ * ext2_release_file() is used, and decide whether it ++ * might be appropriate to pass any errors to ++ * caller. ++ */ ++ //dump_stack(); ++ (void) ext2_cleanup_compressed_inode (inode); ++ } ++ ++#endif + if (filp->f_mode & FMODE_WRITE) { + mutex_lock(&EXT2_I(inode)->truncate_mutex); + ext2_discard_reservation(inode); +@@ -56,6 +102,456 @@ int ext2_fsync(struct file *file, loff_t + return ret; + } + ++#ifdef CONFIG_EXT2_COMPRESS ++struct page_cluster { ++ struct page * page; ++ loff_t pos; ++ unsigned bytes; ++ unsigned long offset; ++ unsigned char in_range; ++ const char * buf; ++}; ++ ++#define PAGE_IN_RANGE 1 ++#define PAGE_KMAPPED 2 ++ ++ ++/** ++ * generic_osync_inode - flush all dirty data for a given inode to disk ++ * @inode: inode to write ++ * @mapping: the address_space that should be flushed ++ * @what: what to write and wait upon ++ * ++ * This can be called by file_write functions for files which have the ++ * O_SYNC flag set, to flush dirty writes to disk. ++ * ++ * @what is a bitmask, specifying which part of the inode's data should be ++ * written and waited upon. ++ * ++ * OSYNC_DATA: i_mapping's dirty data ++ * OSYNC_METADATA: the buffers at i_mapping->private_list ++ * OSYNC_INODE: the inode itself ++ */ ++ ++/* mw: see generic_osync_inode() in kernel<2.6.30 for orginal method. ++ basically we want all of it: OSYNC_DATA and OSYNC_METADATA and OSYNC_INODE */ ++int ex_generic_osync_inode(struct inode *inode, struct address_space *mapping) //, int what) ++{ ++ int err = 0; ++ int need_write_inode_now = 0; ++ int err2; ++ ++ err = filemap_fdatawrite(mapping); ++ ++ err2 = sync_mapping_buffers(mapping); ++ if (!err) ++ err = err2; ++ ++ err2 = filemap_fdatawait(mapping); ++ if (!err) ++ err = err2; ++ ++ /* check if data is dirty */ ++ spin_lock(&inode->i_lock); ++ if (inode->i_state & I_DIRTY) ++ need_write_inode_now = 1; ++ spin_unlock(&inode->i_lock); ++ ++ if (need_write_inode_now) { ++ err2 = write_inode_now(inode, 1); ++ if (!err) ++ err = err2; ++ } ++ else ++ inode_sync_wait(inode); ++ ++ return err; ++} ++ ++ ++/* ++ * Write to a file through the page cache. ++ * ++ * We currently put everything into the page cache prior to writing it. ++ * This is not a problem when writing full pages. With partial pages, ++ * however, we first have to read the data into the cache, then ++ * dirty the page, and finally schedule it for writing. 
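ex_generic_osync_inode() above follows a common pattern: run every flush step regardless, but report the first error encountered. A trivial userspace illustration of just that pattern, with stand-in step functions:

    #include <stdio.h>

    static int step_a(void) { return 0; }
    static int step_b(void) { return -5; }
    static int step_c(void) { return -1; }

    int main(void)
    {
            int err = step_a();
            int err2;

            err2 = step_b();
            if (!err)
                    err = err2;
            err2 = step_c();
            if (!err)
                    err = err2;

            printf("first error: %d\n", err);   /* -5, not -1 */
            return 0;
    }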
Alternatively, we ++ * could write-through just the portion of data that would go into that ++ * page, but that would kill performance for applications that write data ++ * line by line, and it's prone to race conditions. ++ * ++ * Note that this routine doesn't try to keep track of dirty pages. Each ++ * file system has to do this all by itself, unfortunately. ++ * okir@monad.swb.de ++ */ ++ssize_t ++ext2_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) ++{ ++ struct address_space *mapping = file->f_dentry->d_inode->i_mapping; ++ struct inode *inode = mapping->host; ++ unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur, written, last_index; /* last page index */ ++ loff_t pos; ++ long status; ++ int err; ++ unsigned bytes; ++ u32 comprblk_mask=0; ++ struct ext2_inode_info *ei = EXT2_I(inode); ++ ++ if (!(ei->i_flags & (EXT2_COMPR_FL|EXT2_COMPRBLK_FL)) ++#undef DUD //mw: I think this is a buggy bug-fix ++#ifdef DUD ++ || (count < inode->i_sb->s_blocksize) ++#endif ++ ) ++ { ++ return do_sync_write(file, buf, count, ppos); ++ } ++ ++ if ((ssize_t) count < 0) ++ return -EINVAL; ++ ++ if (!access_ok(VERIFY_READ, buf, count)) ++ return -EFAULT; ++ ++#ifdef EXT2_COMPR_REPORT_MUTEX ++ printk(KERN_DEBUG "EXT2_FILE_WRITE_LOCK of PID %u @ inode:%lu\n", current->pid, inode->i_ino ); ++#endif ++ mutex_lock(&inode->i_mutex); ++ /* mw: down_read(&inode->i_alloc_sem); // as used by ocsf2 TLL 02/21/07 ++ was removed with kernel 3.1 */ ++ atomic_inc(&inode->i_dio_count); ++ ++ pos = *ppos; ++ err = -EINVAL; ++ if (pos < 0) ++ goto out; ++ ++ written = 0; ++ ++ /* FIXME: this is for backwards compatibility with 2.4 */ ++ if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) ++ { ++ pos = inode->i_size; ++ } ++ ++ /* ++ * Check whether we've reached the file size limit. ++ */ ++ err = -EFBIG; ++ ++ if (limit != RLIM_INFINITY) { ++ if (pos >= limit) { ++ send_sig(SIGXFSZ, current, 0); ++ goto out; ++ } ++ if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) { ++ /* send_sig(SIGXFSZ, current, 0); */ ++ count = limit - (u32)pos; ++ } ++ } ++ ++ /* ++ * LFS rule ++ */ ++ if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { ++ if (pos >= MAX_NON_LFS) { ++ send_sig(SIGXFSZ, current, 0); ++ goto out; ++ } ++ if (count > MAX_NON_LFS - (u32)pos) { ++ /* send_sig(SIGXFSZ, current, 0); */ ++ count = MAX_NON_LFS - (u32)pos; ++ } ++ } ++ ++ /* ++ * Are we about to exceed the fs block limit ? ++ * ++ * If we have written data it becomes a short write ++ * If we have exceeded without writing data we send ++ * a signal and give them an EFBIG. ++ * ++ * Linus frestrict idea will clean these up nicely.. 
++ */ ++ if (!S_ISBLK(inode->i_mode)) { ++ if (pos >= inode->i_sb->s_maxbytes) { ++ if (count || pos > inode->i_sb->s_maxbytes) { ++ send_sig(SIGXFSZ, current, 0); ++ err = -EFBIG; ++ goto out; ++ } ++ /* zero-length writes at ->s_maxbytes are OK */ ++ } ++ ++ if (pos + count > inode->i_sb->s_maxbytes) ++ count = inode->i_sb->s_maxbytes - pos; ++ } else { ++ if (bdev_read_only(inode->i_sb->s_bdev)) { ++ err = -EPERM; ++ goto out; ++ } ++ if (pos >= inode->i_size) { ++ if (count || pos > inode->i_size) { ++ err = -ENOSPC; ++ goto out; ++ } ++ } ++ ++ if (pos + count > inode->i_size) ++ { ++ count = inode->i_size - pos; ++ } ++ } ++ ++ err = 0; ++ if (count == 0) ++ goto out; ++ ++ status = 0; ++ ++ if (file->f_flags & O_DIRECT) ++ { ++ err = -EINVAL; ++ goto out; ++ } ++ /* ++ * We must still check for EXT2_ECOMPR_FL, as it may have been ++ * set after we got the write permission to this file. ++ */ ++ if ((ei->i_flags & (EXT2_ECOMPR_FL | EXT2_NOCOMPR_FL)) == (EXT2_ECOMPR_FL | 0)) ++ { ++ err = -EXT2_ECOMPR; ++ goto out; ++ } ++ ++ should_remove_suid(file->f_dentry); ++ inode->i_ctime = inode->i_mtime = CURRENT_TIME; ++ mark_inode_dirty_sync(inode); ++ ++ if ((pos+count) > inode->i_size) ++ last_index = (pos+count-1) >> PAGE_CACHE_SHIFT; ++ else ++ last_index = (inode->i_size-1) >> PAGE_CACHE_SHIFT; ++ ++ comprblk_mask = ei->i_flags | ~EXT2_COMPRBLK_FL; ++ ++ //mw: now do it cluster-wise ++ do { ++ //unsigned long index, offset, clusters_page_index0, ++ unsigned long index, nextClusterFirstByte, cluster_compressed=0; ++ u32 cluster=0; ++ status = -ENOMEM; /* we'll assign it later anyway */ ++ ++#ifdef EXT2_COMPRESS_WHEN_CLU ++ ei->i_flags |= EXT2_COMPRBLK_FL; ++ assert( (file->f_flags & O_DIRECT) == 0); ++ assert(mapping_mapped(inode->i_mapping) == 0); ++#endif ++ ++ index = pos >> PAGE_CACHE_SHIFT; /*mw: pageindex (start)*/ ++ cluster = ext2_page_to_cluster(inode, index); ++ ++ /* ++ * We decompress the cluster if needed, and write ++ * the data as normal. The cluster will be ++ * compressed again when the inode is cleaned up. ++ */ ++ if ((comprblk_mask == ~(u32)0) ++ && !(ei->i_flags & EXT2_NOCOMPR_FL)) { ++ /* AUFFÄLLIG 2*/ ++ /* assert (block == pos >> inode->i_sb->s_blocksize_bits); */ ++ ++ cluster_compressed = ext2_cluster_is_compressed_fn(inode, cluster); ++ if (cluster_compressed < 0) { ++ if (! written) ++ written = cluster_compressed; ++ break; ++ } ++ } ++ ++ if (cluster_compressed > 0) { ++ /* Here, decompression take place */ ++ cluster_compressed = ext2_decompress_cluster(inode, cluster); ++ if (cluster_compressed < 0) { ++ if (! written) { ++ written = cluster_compressed; ++ } ++ break; ++ } ++ } ++ ++ nextClusterFirstByte = (ext2_cluster_page0(inode, cluster+1) * PAGE_CACHE_SIZE); ++ bytes = nextClusterFirstByte - pos; /*mw: bytes todo in this cluster*/ ++ if (bytes > count) { ++ bytes = count; /*mw: if end of data*/ ++ } ++ ++#ifdef EXT2_COMPR_DEBUG ++ //assert we stay inside the cluster! 
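The chunking performed by the write loop above (never letting a single do_sync_write() span a cluster boundary) can be sketched in plain arithmetic, assuming a fixed cluster size; the offsets and sizes below are example values only.

    #include <stdio.h>

    int main(void)
    {
            unsigned long cluster_bytes = 4096;
            unsigned long pos = 3000, count = 10000;

            while (count) {
                    unsigned long next_cluster = (pos / cluster_bytes + 1) * cluster_bytes;
                    unsigned long bytes = next_cluster - pos;   /* room left in this cluster */

                    if (bytes > count)
                            bytes = count;                      /* end of the data */
                    printf("write %lu bytes at offset %lu\n", bytes, pos);
                    pos += bytes;
                    count -= bytes;
            }
            return 0;
    }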
++ { ++ int endpos; ++ int endindex; ++ int endcluster; ++ unsigned long thisClusterFirstByte; ++ int relstart, relend, startblock, endblock; ++ ++ thisClusterFirstByte = (ext2_cluster_page0(inode, cluster) * PAGE_CACHE_SIZE); ++ ++ relstart = pos - thisClusterFirstByte; ++ relend = bytes + relstart; ++ ++ startblock = relstart >> 10; ++ endblock = relend >> 10; ++ ++ ++ endpos = pos + bytes; ++ //printk("do_sync_write cluster %d: inode:%lu, \t start:%i(%i), end:%i(%i), \t ccount:%d \t tcount:%d\n", cluster , inode->i_ino, relstart, startblock, relend , endblock, (int)bytes, count); ++ endindex = (endpos-1) >> PAGE_CACHE_SHIFT; /*mw: pageindex (start)*/ ++ endcluster = ext2_page_to_cluster(inode, endindex); ++ assert(cluster == endcluster); ++ } ++#endif ++ ++ //mw: must unlock here, do_sync_write() will aquire the mutex again ++ mutex_unlock(&inode->i_mutex); ++ ++ //mw: this is pretty clever: we use the generic method now :-) ++ //printk("do_sync_write cluster %d, mapped:%i\n", cluster, mapping_mapped(inode->i_mapping)); ++ //status = do_sync_write_nolock(file, buf, bytes, &pos); //without locking mutex ++ status = do_sync_write(file, buf, bytes, &pos); //with locking mutex ++ assert(status>=0); ++ ++ mutex_lock(&inode->i_mutex); ++ ++ written += status; ++ count -= status; ++ buf += status; ++ ++#ifdef EXT2_COMPRESS_WHEN_CLU ++ assert (ei->i_flags & EXT2_COMPRBLK_FL); ++ if ((ei->i_flags & EXT2_COMPR_FL) ++ && (ext2_offset_is_clu_boundary(inode, pos)) ) { ++ ++ if (mapping_mapped(inode->i_mapping) == 0 ) ++ /* ++ * Pierre Peiffer: For file mapped (via mmap, I mean), ++ * compression will occure when releasing the file. ++ * We must, in this case, avoid the pages (possibly ++ * mapped by a process) to be compressed under them. ++ */ ++ { ++ int error; ++ assert(mapping_mapped(inode->i_mapping) == 0); ++ error = ext2_compress_cluster(inode, cluster); ++ /*if (ext2_cluster_is_compressed_fn(inode, cluster)) ++ ext2_decompress_cluster(inode, cluster);*/ ++ assert(mapping_mapped(inode->i_mapping) == 0); ++ /* ++ * Actually, raising write_error may be a ++ * mistake. For example, ++ * ext2_cleanup_compressed_cluster() doesn't ++ * usually return any errors to user. todo: ++ * Have a look at ext2_compress_cluster, and ++ * check whether its errors are such that they ++ * should be returned to user. Some of the ++ * will be, of course, but it might be ++ * possible for it to return without ++ * change. ++ */ ++ if (error > 0) ++ comprblk_mask = ~(u32)0; ++ } else { ++#ifdef EXT2_COMPR_REPORT ++ char bdn[BDEVNAME_SIZE]; ++ bdevname(inode->i_sb->s_bdev, bdn); ++#endif ++ ++ trace_e2c("ext2_file_write: (dev. %s): " ++ "ino=%ld, cluster=%d: file mapped, does " ++ "not compress cluster\n", ++ bdn, inode->i_ino, cluster); ++ ei->i_flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ } ++ } ++#endif ++ ++ } while (count); ++ *ppos = pos; ++ ++ /* ++ * For now, when the user asks for O_SYNC, we'll actually ++ * provide O_DSYNC. ++ */ ++ if (status >= 0) { ++ if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { ++ /*if (ei->i_compr_flags & EXT2_OSYNC_INODE) { ++ osync_already = 1; ++ } else { ++ osync_already = 0; ++ ei->i_compr_flags |= EXT2_OSYNC_INODE; ++ }*/ ++ /* Should 2nd arg be inode->i_mapping? */ ++ status = ex_generic_osync_inode(inode, file->f_mapping ++ /*, OSYNC_METADATA|OSYNC_DATA*/); ++ /*if (osync_already == 0) { ++ ei->i_compr_flags &= ~EXT2_OSYNC_INODE; ++ }*/ ++ } ++ } ++ ++ err = written ? 
written : status; ++ ++# ifdef EXT2_COMPRESS_WHEN_CLU ++ //mw: ext2_compress_cluster() might remove EXT2_COMPRBLK_FL ++ //if the file does not compress at all. this is NO error: remove next line? ++ //assert (ei->i_flags & EXT2_COMPRBLK_FL); ++ ++ ei->i_flags &= comprblk_mask; ++ if ( (ei->i_flags & EXT2_COMPR_FL) ++ && (!ext2_offset_is_clu_boundary(inode, pos)) ) ++ { ++ ei->i_flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ } ++ ++# else ++ if (ei->i_flags & EXT2_COMPR_FL) { ++ ei->i_flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ } ++# endif ++out: ++ ++#ifdef EXT2_COMPR_REPORT_MUTEX ++ printk(KERN_DEBUG "EXT2_FILE_WRITE_UNLOCK of PID %u @ inode:%lu\n", current->pid, inode->i_ino); ++#endif ++ /* mw: up_read(&inode->i_alloc_sem); // as used by ocsf2 TLL 02/21/07 ++ was removed with kernel 3.1 */ ++ inode_dio_done(inode); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++ ++/* ++ * Called when an inode is about to be open. ++ * We use this to disallow opening RW large files on 32bit systems if ++ * the caller didn't specify O_LARGEFILE. On 64bit systems we force ++ * on this flag in sys_open. ++ * Prevent opening compressed file with O_DIRECT. ++ */ ++static int ext2_file_open(struct inode * inode, struct file * filp) ++{ ++ if ((filp->f_flags & O_DIRECT) && (EXT2_I(inode)->i_flags & ++ (EXT2_COMPR_FL|EXT2_COMPRBLK_FL))) ++ return -EINVAL; ++ if (!(filp->f_flags & O_LARGEFILE) && inode->i_size > MAX_NON_LFS) ++ return -EFBIG; ++ ++ return 0; ++ } ++#endif /* CONFIG_EXT2_COMPRESS*/ ++ + /* + * We have mostly NULL's here: the current defaults are ok for + * the ext2 filesystem. +@@ -63,7 +559,12 @@ int ext2_fsync(struct file *file, loff_t + const struct file_operations ext2_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, ++#ifdef CONFIG_EXT2_COMPRESS ++ .write = ext2_file_write, ++#else + .write = do_sync_write, ++#endif ++ + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = ext2_ioctl, +@@ -71,7 +572,11 @@ const struct file_operations ext2_file_o + .compat_ioctl = ext2_compat_ioctl, + #endif + .mmap = generic_file_mmap, ++#ifdef CONFIG_EXT2_COMPRESS ++ .open = ext2_file_open, ++#else + .open = dquot_file_open, ++#endif + .release = ext2_release_file, + .fsync = ext2_fsync, + .splice_read = generic_file_splice_read, +--- linux-3.2-rc5/fs/ext2/ioctl.c 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/ext2/ioctl.c 2011-12-13 14:22:47.855976282 +0100 +@@ -7,7 +7,14 @@ + * Universite Pierre et Marie Curie (Paris VI) + */ + ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#include ++#include ++#include ++#else + #include "ext2.h" ++#endif + #include + #include + #include +@@ -17,6 +24,65 @@ + #include + + ++#ifdef CONFIG_EXT2_COMPRESS ++ ++#ifndef MIN ++# define MIN(a,b) ((a) < (b) ? 
(a) : (b)) ++#endif ++ ++#ifdef CONFIG_GZ_HACK ++static int check_name(struct inode *ino) ++{ ++ struct dentry *dentry = list_entry(ino->i_dentry.next, struct dentry, d_alias); ++ if (dentry) ++ if ( ++ ++ (dentry->d_name.len >= 4) && ++ (((dentry->d_name.name[dentry->d_name.len - 2] == 'g') ++ && (dentry->d_name.name[dentry->d_name.len - 1] == 'z') ++ && ((dentry->d_name.name[dentry->d_name.len - 3] == '.') ++ || (dentry->d_name.name[dentry->d_name.len - 4] == '.'))) ++ ++ || ((dentry->d_name.name[dentry->d_name.len - 3] == 't') ++ && (dentry->d_name.name[dentry->d_name.len - 2] == 'g') ++ && (dentry->d_name.name[dentry->d_name.len - 1] == 'z') ++ && (dentry->d_name.name[dentry->d_name.len - 4] == '.') ++ && (dentry->d_name.len >= 5)) ++ ++ || ((dentry->d_name.name[dentry->d_name.len - 3] == 'p') ++ && (dentry->d_name.name[dentry->d_name.len - 2] == 'n') ++ && (dentry->d_name.name[dentry->d_name.len - 1] == 'g') ++ && (dentry->d_name.name[dentry->d_name.len - 4] == '.') ++ && (dentry->d_name.len >= 5)) ++ ++ || ((dentry->d_name.name[dentry->d_name.len - 3] == 'j') ++ && (dentry->d_name.name[dentry->d_name.len - 2] == 'p') ++ && (dentry->d_name.name[dentry->d_name.len - 1] == 'g') ++ && (dentry->d_name.name[dentry->d_name.len - 4] == '.') ++ && (dentry->d_name.len >= 5)) ++ ++ || ((dentry->d_name.name[dentry->d_name.len - 3] == 'b') ++ && (dentry->d_name.name[dentry->d_name.len - 2] == 'z') ++ && (dentry->d_name.name[dentry->d_name.len - 1] == '2') ++ && (dentry->d_name.name[dentry->d_name.len - 4] == '.') ++ && (dentry->d_name.len >= 5)) ++ ++ || ((dentry->d_name.name[dentry->d_name.len - 3] == 'm') ++ && (dentry->d_name.name[dentry->d_name.len - 2] == 'n') ++ && (dentry->d_name.name[dentry->d_name.len - 1] == 'g') ++ && (dentry->d_name.name[dentry->d_name.len - 4] == '.') ++ && (dentry->d_name.len >= 5)) ++ ) ++ ) { ++ return 1; ++ } ++ return 0; ++} ++#endif ++#endif ++ ++ ++ + long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_dentry->d_inode; +@@ -24,6 +90,10 @@ long ext2_ioctl(struct file *filp, unsig + unsigned int flags; + unsigned short rsv_window_size; + int ret; ++#ifdef CONFIG_EXT2_COMPRESS ++ unsigned long datum; ++ int err; ++#endif + + ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg); + +@@ -75,7 +145,127 @@ long ext2_ioctl(struct file *filp, unsig + } + + flags = flags & EXT2_FL_USER_MODIFIABLE; ++#ifdef CONFIG_EXT2_COMPRESS ++ if (S_ISREG (inode->i_mode) || S_ISDIR (inode->i_mode)) { ++ ++ /* pjm 1998-01-14: In previous versions of ++ e2compr, the kernel forbade raising ++ EXT2_ECOMPR_FL from userspace. I can't ++ think of any purpose for forbidding this, ++ and I find it useful to raise ++ EXT2_ECOMPR_FL for testing purposes, so ++ I've removed the forbidding code. */ ++ if (S_ISREG (inode->i_mode) ++ && (EXT2_NOCOMPR_FL ++ & (flags ^ ei->i_flags))) { // mw hint: ^ is a (excluisive OR) ++ /* NOCOMPR_FL can only be changed if ++ nobody else has the file opened. */ ++ /* pjm 1998-02-16: inode->i_count is ++ useless to us because only dentries ++ use inodes now. Unfortunately, ++ there isn't an easy way of finding ++ the equivalent. We'd have to go ++ through all dentries using the ++ inode, and sum their d_count ++ values. Rather than do that, I'd ++ rather get rid of the exclusion ++ constraint. todo. 
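check_name() above hard-codes character comparisons for .gz, .tgz, .png, .jpg, .bz2 and .mng names. Roughly the same effect, expressed as a suffix table for illustration (an approximation: the original also tolerates the dot one position earlier in the gz case):

    #include <string.h>
    #include <stdio.h>

    /* Returns 1 for names that should stay uncompressed. */
    static int is_precompressed_name(const char *name)
    {
            static const char *suffixes[] = { ".gz", ".tgz", ".png", ".jpg", ".bz2", ".mng" };
            size_t len = strlen(name);
            size_t i;

            for (i = 0; i < sizeof(suffixes) / sizeof(suffixes[0]); i++) {
                    size_t slen = strlen(suffixes[i]);

                    if (len > slen && strcmp(name + len - slen, suffixes[i]) == 0)
                            return 1;
            }
            return 0;
    }

    int main(void)
    {
            printf("%d %d\n", is_precompressed_name("a.tar.gz"), is_precompressed_name("a.txt"));
            return 0;
    }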
*/ ++ //printk("i_count: %i\n", atomic_read(&inode->i_count)); ++ //if (atomic_read(&inode->i_count) > 1) ++ //if (0) ++ if (ext2_get_dcount(inode) > 1) ++ { ++ mutex_unlock(&inode->i_mutex); /*mw*/ ++ return -ETXTBSY; ++ } ++ else { ++ /* pjm 970429: Discarding ++ cached pages is not very ++ clean, but should work. */ ++ /* pjm 980114: Not quite. We ++ should also sync any ++ mappings to buffers first. ++ This isn't very important, ++ as none of the current ++ e2compr programs can ++ trigger this, but todo. */ ++ invalidate_remote_inode (inode); ++ } ++ } ++ ++ if (EXT2_COMPR_FL ++ & (flags ^ ei->i_flags)) { ++ if (flags & EXT2_COMPR_FL) { ++ if (ei->i_flags & EXT2_COMPRBLK_FL) { ++ /* There shouldn't actually be any ++ compressed blocks, AFAIK. However, ++ this is still possible because sometimes ++ COMPRBLK gets raised just to stop ++ us changing cluster size at the wrong ++ time. ++ ++ todo: Call a function that just ++ checks that there are not compressed ++ clusters, and print a warning if any are ++ found. */ ++ } else { ++ int bits = MIN(EXT2_DEFAULT_LOG2_CLU_NBLOCKS, ++ (EXT2_LOG2_MAX_CLUSTER_BYTES ++ - inode->i_sb->s_blocksize_bits)); ++ ++ ei->i_log2_clu_nblocks = bits; ++ ei->i_clu_nblocks = 1 << bits; ++ } ++ ei->i_compr_method = EXT2_DEFAULT_COMPR_METHOD; ++ if (S_ISREG (inode->i_mode)) { ++ //compress ++#ifdef CONFIG_GZ_HACK ++ /* mw: check for .gz-files and similar ++ * I think this is the most clever place for ++ * rejecting files. They remain regular, uncompressed ++ * files and though can be read bypassing all ++ * compression stuff (= fast) :-). And it seems to save ++ * space... somehow */ ++ if (check_name (inode)) ++ { ++ //printk("non-compressable file extension\n"); ++ mutex_unlock(&inode->i_mutex); ++ return 0; ++ } ++#endif ++ //set flags to trigger compression later on ++ flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ } ++ } else if (S_ISREG (inode->i_mode)) { ++ if (ei->i_flags & EXT2_COMPRBLK_FL) { ++ int err; ++ ++ if (ext2_get_dcount(inode) > 1){ ++ mutex_unlock(&inode->i_mutex); //mw ++ return -ETXTBSY; ++ } ++ err = ext2_decompress_inode(inode); ++ if (err) ++ { ++ mutex_unlock(&inode->i_mutex); //mw ++ return err; ++ } ++ } ++ ei->i_flags &= ~EXT2_DIRTY_FL; ++ ei->i_compr_flags &= ~EXT2_CLEANUP_FL; ++ } ++ } ++ } ++#endif + flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE; ++#ifdef CONFIG_EXT2_COMPRESS ++ /* bug fix: scrub 'B' flag from uncompressed files TLL 02/28/07 */ ++ if (!(flags & EXT2_COMPR_FL) && (flags & EXT2_COMPRBLK_FL) ) ++ { ++ flags &= ~EXT2_COMPRBLK_FL; ++ } ++#endif + ei->i_flags = flags; + mutex_unlock(&inode->i_mutex); + +@@ -148,6 +338,184 @@ setflags_out: + mnt_drop_write(filp->f_path.mnt); + return 0; + } ++#ifdef CONFIG_EXT2_COMPRESS ++ case EXT2_IOC_GETCOMPRMETHOD: /* Result means nothing if COMPR_FL is not set */ ++ return put_user (ei->i_compr_method, (long *) arg); ++ case EXT2_IOC_SETCOMPRMETHOD: ++ if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) ++ return -EPERM; ++ if (IS_RDONLY (inode)) ++ return -EROFS; ++ if (get_user (datum, (long*) arg)) ++ return -EFAULT; ++ if (!S_ISREG (inode->i_mode) && !S_ISDIR (inode->i_mode)) ++ return -ENOSYS; ++ /* todo: Allow the below, but set initial value of ++ i_compr_meth at read_inode() time (using default if ++ !/) instead of +c time. Same for cluster ++ size. 
*/ ++ if ((unsigned) datum >= EXT2_N_METHODS) ++ return -EINVAL; ++ if (ei->i_compr_method != datum) { ++ if ((ei->i_compr_method == EXT2_NEVER_METH) ++ && (ei->i_flags & EXT2_COMPR_FL)) ++ return -EPERM; ++ /* If the previous method was `defer' then ++ take a look at all uncompressed clusters ++ and try to compress them. (pjm 1997-04-16) */ ++ if ((ei->i_compr_method == EXT2_DEFER_METH) ++ && S_ISREG (inode->i_mode)) { ++ ei->i_flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ } ++ if ((datum == EXT2_NEVER_METH) ++ && S_ISREG (inode->i_mode)) { ++ //printk("SETCOMPR\n"); ++ if ((ei->i_flags & EXT2_COMPRBLK_FL)) ++ { ++ /*mw*/ ++ mutex_lock(&inode->i_mutex); ++ if (ext2_get_dcount(inode) > 1){ ++ mutex_unlock(&inode->i_mutex); /*mw*/ ++ return -ETXTBSY; ++ } ++ err = ext2_decompress_inode(inode); ++ mutex_unlock(&inode->i_mutex); ++ if ( err < 0) ++ return err; ++ } ++ ei->i_flags &= ~EXT2_DIRTY_FL; ++ ei->i_compr_flags &= ~EXT2_CLEANUP_FL; ++ } ++ ei->i_compr_method = datum; ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty(inode); ++ } ++#ifdef CONFIG_KMOD ++ if (!ext2_algorithm_table[ext2_method_table[datum].alg].avail) { ++ char str[32]; ++ ++ sprintf(str, "ext2-compr-%s", ext2_algorithm_table[ext2_method_table[datum].alg].name); ++ request_module(str); ++ } ++#endif ++ datum = ((datum < EXT2_N_METHODS) ++ && (ext2_algorithm_table[ext2_method_table[datum].alg].avail)); ++ return put_user(datum, (long *)arg); ++ ++ case EXT2_IOC_GETCLUSTERBIT: ++ if (get_user (datum, (long*) arg)) ++ return -EFAULT; ++ if (!S_ISREG (inode->i_mode)) ++ return -ENOSYS; ++ /* We don't do `down(&inode->i_sem)' here because ++ there's no way for userspace to do the ++ corresponding up(). Userspace must rely on ++ EXT2_NOCOMPR_FL if it needs to lock. */ ++ err = ext2_cluster_is_compressed (inode, datum); ++ if (err < 0) ++ return err; ++ return put_user ((err ? 1 : 0), ++ (long *) arg); ++ ++ case EXT2_IOC_RECOGNIZE_COMPRESSED: ++ if (get_user (datum, (long*) arg)) ++ return -EFAULT; ++ if (!S_ISREG (inode->i_mode)) ++ return -ENOSYS; ++ if (IS_RDONLY (inode)) ++ return -EROFS; ++ return ext2_recognize_compressed (inode, datum); ++ ++ case EXT2_IOC_GETCLUSTERSIZE: ++ /* Result means nothing if COMPR_FL is not set (until ++ SETCLUSTERSIZE w/o COMPR_FL is implemented; ++ todo). */ ++ if (!S_ISREG (inode->i_mode) ++ && !S_ISDIR (inode->i_mode)) ++ return -ENOSYS; ++ return put_user (ei->i_clu_nblocks, (long *) arg); ++ ++ case EXT2_IOC_GETFIRSTCLUSTERSIZE: ++ /* Result means nothing if COMPR_FL is not set (until ++ SETCLUSTERSIZE w/o COMPR_FL is implemented; ++ todo). */ ++ if (!S_ISREG (inode->i_mode) ++ && !S_ISDIR (inode->i_mode)) ++ return -ENOSYS; ++ return put_user (ext2_first_cluster_nblocks(inode), (long *) arg); ++ ++ case EXT2_IOC_SETCLUSTERSIZE: ++ if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) ++ return -EPERM; ++ if (IS_RDONLY (inode)) ++ return -EROFS; ++ if (get_user (datum, (long *) arg)) ++ return -EFAULT; ++ if (!S_ISREG (inode->i_mode) ++ && !S_ISDIR (inode->i_mode)) ++ return -ENOSYS; ++ ++ /* These are the only possible cluster sizes. The ++ cluster size must be a power of two so that ++ clusters don't straddle address (aka indirect) ++ blocks. At the moment, the upper limit is constrained ++ by how much memory is allocated for de/compression. ++ Also, the gzip algorithms have some optimisations ++ that assume tht the input is no more than 32KB, ++ and in compress.c we would need to zero more bits ++ of head->holemap. 
(In previous releases, the file ++ format was limited to 32 blocks and under 64KB.) */ ++// #if EXT2_MAX_CLUSTER_BLOCKS > 32 || EXT2_MAX_CLUSTER_NBYTES > 32768 ++// # error "This code not updated for cluster size yet." ++// #endif ++ switch (datum) { ++ case (1 << 2): datum = 2; break; ++ case (1 << 3): datum = 3; break; ++ case (1 << 4): datum = 4; break; ++ case (1 << 5): datum = 5; break; ++ default: return -EINVAL; ++ } ++ ++ assert (ei->i_clu_nblocks == (1 << ei->i_log2_clu_nblocks)); ++ if (datum == ei->i_log2_clu_nblocks) ++ return 0; ++ ++ if (ei->i_flags & EXT2_ECOMPR_FL) ++ return -EPERM; ++ if (!(ei->i_flags & EXT2_COMPR_FL)) ++ return -ENOSYS; ++ ++ /* We currently lack a mechanism to change the cluster ++ size if there are already some compressed clusters. ++ The compression must be done in userspace ++ (e.g. with the e2compress program) instead. */ ++ if (ei->i_flags & EXT2_COMPRBLK_FL) ++ return -ENOSYS; ++ ++ if (datum + inode->i_sb->s_blocksize_bits ++ > EXT2_LOG2_MAX_CLUSTER_BYTES) ++ return -EINVAL; ++ ++ ei->i_log2_clu_nblocks = datum; ++ ei->i_clu_nblocks = 1 << datum; ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty(inode); ++ return 0; ++ ++ case EXT2_IOC_GETCOMPRRATIO: ++ if (!S_ISREG (inode->i_mode)) ++ return -ENOSYS; ++ if (ei->i_flags & EXT2_ECOMPR_FL) ++ return -EPERM; ++ if ((long) (datum = ext2_count_blocks (inode)) < 0) ++ return datum; ++ if ((err = put_user ((long) datum, (long*) arg))) ++ return err; ++ return put_user ((long) inode->i_blocks, (long*) arg + 1); ++ ++ ++#endif + default: + return -ENOTTY; + } +--- linux-3.2-rc5/fs/ext2/ext2.h 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/ext2/ext2.h 2011-12-13 14:22:47.855976282 +0100 +@@ -37,6 +37,12 @@ struct ext2_inode_info { + struct ext2_block_alloc_info *i_block_alloc_info; + + __u32 i_dir_start_lookup; ++#ifdef CONFIG_EXT2_COMPRESS ++ __u8 i_log2_clu_nblocks; ++ __u8 i_clu_nblocks; ++ __u8 i_compr_method; ++ __u8 i_compr_flags; ++#endif + #ifdef CONFIG_EXT2_FS_XATTR + /* + * Extended attributes can be read independently of the main file +@@ -126,6 +132,7 @@ extern void ext2_set_inode_flags(struct + extern void ext2_get_inode_flags(struct ext2_inode_info *); + extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); ++extern void ext2_truncate_blocks(struct inode *inode, loff_t offset); + + /* ioctl.c */ + extern long ext2_ioctl(struct file *, unsigned int, unsigned long); +--- linux-3.2-rc5/include/linux/ext2_fs.h 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/include/linux/ext2_fs.h 2011-12-13 14:22:47.856976313 +0100 +@@ -87,6 +87,10 @@ static inline struct ext2_sb_info *EXT2_ + /* + * Macro-instructions used to manage several block sizes + */ ++#define EXT2_GRAIN_SIZE 1024 ++/* Minimum allocation unit. This is used in fs/ext2/compress.c to ++ check compr_len validity wrt (uncompressed) len. This definition ++ will probably need to be changed when fragments are implemented. */ + #define EXT2_MIN_BLOCK_SIZE 1024 + #define EXT2_MAX_BLOCK_SIZE 4096 + #define EXT2_MIN_BLOCK_LOG_SIZE 10 +@@ -178,9 +182,10 @@ struct ext2_group_desc + #define EXT2_NODUMP_FL FS_NODUMP_FL /* do not dump file */ + #define EXT2_NOATIME_FL FS_NOATIME_FL /* do not update atime */ + /* Reserved for compression usage... 
*/ +-#define EXT2_DIRTY_FL FS_DIRTY_FL ++#define EXT2_DIRTY_FL FS_DIRTY_FL /* Needs compressing; see Readme.e2compr */ + #define EXT2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */ + #define EXT2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */ ++#define EXT2_NOCOMPR_FL FS_NOCOMP_FL /* Access raw data */ + #define EXT2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */ + /* End compression flags --- maybe not all used */ + #define EXT2_BTREE_FL FS_BTREE_FL /* btree format dir */ +@@ -342,6 +347,7 @@ struct ext2_inode { + #define EXT2_MOUNT_MINIX_DF 0x000080 /* Mimics the Minix statfs */ + #define EXT2_MOUNT_NOBH 0x000100 /* No buffer_heads */ + #define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ ++#define EXT2_MOUNT_FORCE_COMPAT 0x000400 /* Mount despite incompatibilities */ + #define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ + #define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ + #define EXT2_MOUNT_XIP 0x010000 /* Execute in place */ +@@ -507,8 +513,14 @@ struct ext2_super_block { + #define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff + + #define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR ++#ifdef CONFIG_EXT2_COMPRESS ++#define EXT2_FEATURE_INCOMPAT_SUPP (EXT2_FEATURE_INCOMPAT_COMPRESSION| \ ++ EXT2_FEATURE_INCOMPAT_FILETYPE| \ ++ EXT2_FEATURE_INCOMPAT_META_BG) ++#else + #define EXT2_FEATURE_INCOMPAT_SUPP (EXT2_FEATURE_INCOMPAT_FILETYPE| \ + EXT2_FEATURE_INCOMPAT_META_BG) ++#endif + #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT2_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -588,4 +600,16 @@ enum { + ~EXT2_DIR_ROUND) + #define EXT2_MAX_REC_LEN ((1<<16)-1) + ++#ifndef __KERNEL__ ++/* This simplifies things for user programs (notably e2fsprogs) that ++ must compile whether or not is present, but ++ would prefer to include it. Presumably the file is present if the ++ user has this version of ext2_fs.h. */ ++ ++# /* Do not remove this comment. */ include ++ ++/* The comment between `#' and `include' prevents mkdep from generating ++ a dependency on ext2_fs_c.h. */ ++#endif ++ + #endif /* _LINUX_EXT2_FS_H */ +--- linux-3.2-rc5/fs/fcntl.c 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/fs/fcntl.c 2011-12-13 14:22:47.857976344 +0100 +@@ -25,6 +25,12 @@ + #include + #include + ++#ifdef CONFIG_EXT2_COMPRESS ++//mw: deny O_DIRECT on file with compression ++#include ++#include "ext2/ext2.h" ++#endif ++ + void set_close_on_exec(unsigned int fd, int flag) + { + struct files_struct *files = current->files; +@@ -171,6 +177,16 @@ static int setfl(int fd, struct file * f + if (!filp->f_mapping || !filp->f_mapping->a_ops || + !filp->f_mapping->a_ops->direct_IO) + return -EINVAL; ++ ++#ifdef CONFIG_EXT2_COMPRESS ++ //mw: if we have a compressed ext2 file: deny! ++ // TODO: maybe check fs-type first! ++ //assert(!(EXT2_I(inode)->i_flags & (EXT2_COMPR_FL|EXT2_COMPRBLK_FL))); ++ if (EXT2_I(inode)->i_flags & (EXT2_COMPR_FL|EXT2_COMPRBLK_FL)) ++ { ++ return -EINVAL; ++ } ++#endif + } + + if (filp->f_op && filp->f_op->check_flags) +--- linux-3.2-rc5/mm/truncate.c 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/mm/truncate.c 2011-12-13 14:22:47.858976376 +0100 +@@ -22,6 +22,9 @@ + #include + #include "internal.h" + ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#endif + + /** + * do_invalidatepage - invalidate part or all of a page +@@ -551,6 +554,11 @@ void truncate_pagecache(struct inode *in + * unmap_mapping_range call must be made for correctness. 
+ */ + unmap_mapping_range(mapping, holebegin, 0, 1); ++#ifdef CONFIG_EXT2_COMPRESS ++ if ((inode->i_op && inode->i_op->truncate) && ++ ((strcmp(inode->i_sb->s_type->name, "ext2") != 0) || ++ (!(EXT2_I(inode)->i_flags & EXT2_COMPRBLK_FL)))) ++#endif + truncate_inode_pages(mapping, newsize); + unmap_mapping_range(mapping, holebegin, 0, 1); + } +--- linux-3.2-rc5/mm/swapfile.c 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/mm/swapfile.c 2011-12-13 14:22:47.859976408 +0100 +@@ -31,6 +31,10 @@ + #include + #include + #include ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#endif ++ + + #include + #include +@@ -2056,6 +2060,24 @@ SYSCALL_DEFINE2(swapon, const char __use + } + + inode = mapping->host; ++ ++#ifdef CONFIG_EXT2_COMPRESS ++ /* ++ * Swapping not supported for e2compressed files. ++ * (Actually, this code is pretty useless because we ++ * should get an error later anyway because of the ++ * holes.) Yes, this is pretty horrible code... I'll ++ * improve it later. ++ */ ++ if ((strcmp(inode->i_sb->s_type->name, "ext2") == 0) ++ && (EXT2_I(inode)->i_flags & EXT2_COMPRBLK_FL)) ++ { ++ printk("Assertion: Error NO swap SWAP implemented!\n"); ++ error = -EINVAL; ++ goto bad_swap; ++ } ++#endif ++ + /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ + error = claim_swapfile(p, inode); + if (unlikely(error)) +--- linux-3.2-rc5/mm/filemap.c 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/mm/filemap.c 2011-12-13 14:22:47.860976440 +0100 +@@ -43,6 +43,10 @@ + + #include + ++#ifdef CONFIG_EXT2_COMPRESS ++# include ++#endif ++ + /* + * Shared mappings implemented 30.11.1994. It's not fully working yet, + * though. +@@ -278,7 +282,19 @@ int filemap_fdatawait_range(struct addre + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + unsigned i; ++#ifdef CONFIG_EXT2_COMPRESS ++/* ++ * I'm not sure that this is right. It has been reworked considerably since ++ * 2.6.5. - whitpa ++ */ ++ struct inode *inode = mapping->host; ++ //printk("wait_on_page_writeback_range\n"); + ++ if ((strcmp(inode->i_sb->s_type->name, "ext2") != 0) ++ || (atomic_read(&inode->i_mutex.count) > 0) ++ || (EXT2_I(inode)->i_compr_flags & ++ EXT2_OSYNC_INODE)) ++#endif + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + +@@ -1184,6 +1200,15 @@ page_ok: + } + nr = nr - offset; + ++#ifdef CONFIG_EXT2_COMPRESS ++ lock_page(page); ++ //check again: after locking still uptodate? ++ if(!PageUptodate(page)){ ++ unlock_page(page); ++ goto page_not_up_to_date; ++ } ++#endif ++ + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. +@@ -1215,6 +1240,10 @@ page_ok: + offset &= ~PAGE_CACHE_MASK; + prev_offset = offset; + ++#ifdef CONFIG_EXT2_COMPRESS ++ unlock_page(page); ++#endif ++ + page_cache_release(page); + if (ret == nr && desc->count) + continue; +@@ -1224,7 +1253,12 @@ page_not_up_to_date: + /* Get exclusive access to the page ... */ + error = lock_page_killable(page); + if (unlikely(error)) ++ { ++ printk("Readpage Error: mw: page locking failed with code: %i\n", error); ++ printk("Readpage Error: mw: might happen as page was locked 'killable'\n"); ++ printk("Readpage Error: mw: was reading app killed?\n"); + goto readpage_error; ++ } + + page_not_up_to_date_locked: + /* Did it get truncated before we got the lock? 
*/ +@@ -1255,13 +1289,17 @@ readpage: + page_cache_release(page); + goto find_page; + } ++ printk("Readpage Error: fs-specific readpage failed with code: %i\n", error); + goto readpage_error; + } + + if (!PageUptodate(page)) { + error = lock_page_killable(page); + if (unlikely(error)) ++ { ++ printk("Readpage Error: page was not uptodate after read. page locking failed with code: %i\n", error); + goto readpage_error; ++ } + if (!PageUptodate(page)) { + if (page->mapping == NULL) { + /* +@@ -1274,6 +1312,7 @@ readpage: + unlock_page(page); + shrink_readahead_size_eio(filp, ra); + error = -EIO; ++ printk("Readpage Error: page was not uptodate after read AND page locked. failed with code: %i\n", error); + goto readpage_error; + } + unlock_page(page); +@@ -1285,6 +1324,7 @@ readpage_error: + /* UHHUH! A synchronous read error occurred. Report it */ + desc->error = error; + page_cache_release(page); ++ printk("Readpage Error\n"); + goto out; + + no_cached_page: +--- linux-3.2-rc5/mm/page_alloc.c 2011-12-10 00:09:32.000000000 +0100 ++++ linux-3.2-rc5-e2c/mm/page_alloc.c 2011-12-13 14:22:47.863976534 +0100 +@@ -1733,6 +1733,8 @@ this_zone_full: + } + return page; + } ++/*mw: needed to build ext2 /w e2compr as module */ ++EXPORT_SYMBOL(__pagevec_free); + + /* + * Large machines with many possible nodes should not always dump per-node diff --git a/3.2.34/linux-3.2.33-zfs.patch b/3.2.34/linux-3.2.33-zfs.patch new file mode 100644 index 0000000..022b674 --- /dev/null +++ b/3.2.34/linux-3.2.33-zfs.patch @@ -0,0 +1,201830 @@ +diff -uNr linux-3.2.33-go.orig/fs/Kconfig linux-3.2.33-go/fs/Kconfig +--- linux-3.2.33-go.orig/fs/Kconfig 2012-11-16 23:15:05.844494007 +0100 ++++ linux-3.2.33-go/fs/Kconfig 2012-11-16 23:25:34.395038807 +0100 +@@ -5,6 +5,7 @@ + menu "File systems" + + if BLOCK ++source "fs/zfs/Kconfig" + + source "fs/ext2/Kconfig" + source "fs/ext3/Kconfig" +diff -uNr linux-3.2.33-go.orig/fs/Makefile linux-3.2.33-go/fs/Makefile +--- linux-3.2.33-go.orig/fs/Makefile 2012-11-16 23:15:05.654496192 +0100 ++++ linux-3.2.33-go/fs/Makefile 2012-11-16 23:25:34.403038714 +0100 +@@ -18,6 +18,7 @@ + else + obj-y += no-block.o + endif ++obj-$(CONFIG_ZFS) += zfs/ + + obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o + obj-y += notify/ +diff -uNr linux-3.2.33-go.orig/fs/zfs/avl/avl.c linux-3.2.33-go/fs/zfs/avl/avl.c +--- linux-3.2.33-go.orig/fs/zfs/avl/avl.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/avl/avl.c 2012-11-16 23:25:34.355039267 +0100 +@@ -0,0 +1,1057 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */ ++ ++/* ++ * AVL - generic AVL tree implementation for kernel use ++ * ++ * A complete description of AVL trees can be found in many CS textbooks. ++ * ++ * Here is a very brief overview. An AVL tree is a binary search tree that is ++ * almost perfectly balanced. By "almost" perfectly balanced, we mean that at ++ * any given node, the left and right subtrees are allowed to differ in height ++ * by at most 1 level. ++ * ++ * This relaxation from a perfectly balanced binary tree allows doing ++ * insertion and deletion relatively efficiently. Searching the tree is ++ * still a fast operation, roughly O(log(N)). ++ * ++ * The key to insertion and deletion is a set of tree maniuplations called ++ * rotations, which bring unbalanced subtrees back into the semi-balanced state. ++ * ++ * This implementation of AVL trees has the following peculiarities: ++ * ++ * - The AVL specific data structures are physically embedded as fields ++ * in the "using" data structures. To maintain generality the code ++ * must constantly translate between "avl_node_t *" and containing ++ * data structure "void *"s by adding/subracting the avl_offset. ++ * ++ * - Since the AVL data is always embedded in other structures, there is ++ * no locking or memory allocation in the AVL routines. This must be ++ * provided for by the enclosing data structure's semantics. Typically, ++ * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of ++ * exclusive write lock. Other operations require a read lock. ++ * ++ * - The implementation uses iteration instead of explicit recursion, ++ * since it is intended to run on limited size kernel stacks. Since ++ * there is no recursion stack present to move "up" in the tree, ++ * there is an explicit "parent" link in the avl_node_t. ++ * ++ * - The left/right children pointers of a node are in an array. ++ * In the code, variables (instead of constants) are used to represent ++ * left and right indices. The implementation is written as if it only ++ * dealt with left handed manipulations. By changing the value assigned ++ * to "left", the code also works for right handed trees. The ++ * following variables/terms are frequently used: ++ * ++ * int left; // 0 when dealing with left children, ++ * // 1 for dealing with right children ++ * ++ * int left_heavy; // -1 when left subtree is taller at some node, ++ * // +1 when right subtree is taller ++ * ++ * int right; // will be the opposite of left (0 or 1) ++ * int right_heavy;// will be the opposite of left_heavy (-1 or 1) ++ * ++ * int direction; // 0 for "<" (ie. left child); 1 for ">" (right) ++ * ++ * Though it is a little more confusing to read the code, the approach ++ * allows using half as much code (and hence cache footprint) for tree ++ * manipulations and eliminates many conditional branches. ++ * ++ * - The avl_index_t is an opaque "cookie" used to find nodes at or ++ * adjacent to where a new value would be inserted in the tree. The value ++ * is a modified "avl_node_t *". The bottom bit (normally 0 for a ++ * pointer) is set to indicate if that the new node has a value greater ++ * than the value of the indicated "avl_node_t *". ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Small arrays to translate between balance (or diff) values and child indeces. ++ * ++ * Code that deals with binary tree data structures will randomly use ++ * left and right children when examining a tree. C "if()" statements ++ * which evaluate randomly suffer from very poor hardware branch prediction. 
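The "children in an array" scheme described above lets a comparison result select the child directly instead of branching on left versus right. A toy, non-AVL lookup showing only that indexing trick (structure names invented):

    #include <stddef.h>

    struct toy_node {
            struct toy_node *child[2];      /* [0] = left, [1] = right */
            int value;
    };

    static struct toy_node *toy_find(struct toy_node *root, int value)
    {
            struct toy_node *node = root;

            /* the comparison yields 0 or 1 and picks the child, no if/else */
            while (node != NULL && node->value != value)
                    node = node->child[value > node->value];
            return node;
    }

    int main(void)
    {
            struct toy_node a = { {0, 0}, 1 }, c = { {0, 0}, 3 };
            struct toy_node b = { {&a, &c}, 2 };

            return toy_find(&b, 3) == &c ? 0 : 1;
    }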
++ * In this code we avoid some of the branch mispredictions by using the ++ * following translation arrays. They replace random branches with an ++ * additional memory reference. Since the translation arrays are both very ++ * small the data should remain efficiently in cache. ++ */ ++static const int avl_child2balance[2] = {-1, 1}; ++static const int avl_balance2child[] = {0, 0, 1}; ++ ++ ++/* ++ * Walk from one node to the previous valued node (ie. an infix walk ++ * towards the left). At any given node we do one of 2 things: ++ * ++ * - If there is a left child, go to it, then to it's rightmost descendant. ++ * ++ * - otherwise we return thru parent nodes until we've come from a right child. ++ * ++ * Return Value: ++ * NULL - if at the end of the nodes ++ * otherwise next node ++ */ ++void * ++avl_walk(avl_tree_t *tree, void *oldnode, int left) ++{ ++ size_t off = tree->avl_offset; ++ avl_node_t *node = AVL_DATA2NODE(oldnode, off); ++ int right = 1 - left; ++ int was_child; ++ ++ ++ /* ++ * nowhere to walk to if tree is empty ++ */ ++ if (node == NULL) ++ return (NULL); ++ ++ /* ++ * Visit the previous valued node. There are two possibilities: ++ * ++ * If this node has a left child, go down one left, then all ++ * the way right. ++ */ ++ if (node->avl_child[left] != NULL) { ++ for (node = node->avl_child[left]; ++ node->avl_child[right] != NULL; ++ node = node->avl_child[right]) ++ ; ++ /* ++ * Otherwise, return thru left children as far as we can. ++ */ ++ } else { ++ for (;;) { ++ was_child = AVL_XCHILD(node); ++ node = AVL_XPARENT(node); ++ if (node == NULL) ++ return (NULL); ++ if (was_child == right) ++ break; ++ } ++ } ++ ++ return (AVL_NODE2DATA(node, off)); ++} ++ ++/* ++ * Return the lowest valued node in a tree or NULL. ++ * (leftmost child from root of tree) ++ */ ++void * ++avl_first(avl_tree_t *tree) ++{ ++ avl_node_t *node; ++ avl_node_t *prev = NULL; ++ size_t off = tree->avl_offset; ++ ++ for (node = tree->avl_root; node != NULL; node = node->avl_child[0]) ++ prev = node; ++ ++ if (prev != NULL) ++ return (AVL_NODE2DATA(prev, off)); ++ return (NULL); ++} ++ ++/* ++ * Return the highest valued node in a tree or NULL. ++ * (rightmost child from root of tree) ++ */ ++void * ++avl_last(avl_tree_t *tree) ++{ ++ avl_node_t *node; ++ avl_node_t *prev = NULL; ++ size_t off = tree->avl_offset; ++ ++ for (node = tree->avl_root; node != NULL; node = node->avl_child[1]) ++ prev = node; ++ ++ if (prev != NULL) ++ return (AVL_NODE2DATA(prev, off)); ++ return (NULL); ++} ++ ++/* ++ * Access the node immediately before or after an insertion point. ++ * ++ * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child ++ * ++ * Return value: ++ * NULL: no node in the given direction ++ * "void *" of the found tree node ++ */ ++void * ++avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) ++{ ++ int child = AVL_INDEX2CHILD(where); ++ avl_node_t *node = AVL_INDEX2NODE(where); ++ void *data; ++ size_t off = tree->avl_offset; ++ ++ if (node == NULL) { ++ ASSERT(tree->avl_root == NULL); ++ return (NULL); ++ } ++ data = AVL_NODE2DATA(node, off); ++ if (child != direction) ++ return (data); ++ ++ return (avl_walk(tree, data, direction)); ++} ++ ++ ++/* ++ * Search for the node which contains "value". The algorithm is a ++ * simple binary tree search. 
++ * ++ * return value: ++ * NULL: the value is not in the AVL tree ++ * *where (if not NULL) is set to indicate the insertion point ++ * "void *" of the found tree node ++ */ ++void * ++avl_find(avl_tree_t *tree, const void *value, avl_index_t *where) ++{ ++ avl_node_t *node; ++ avl_node_t *prev = NULL; ++ int child = 0; ++ int diff; ++ size_t off = tree->avl_offset; ++ ++ for (node = tree->avl_root; node != NULL; ++ node = node->avl_child[child]) { ++ ++ prev = node; ++ ++ diff = tree->avl_compar(value, AVL_NODE2DATA(node, off)); ++ ASSERT(-1 <= diff && diff <= 1); ++ if (diff == 0) { ++#ifdef DEBUG ++ if (where != NULL) ++ *where = 0; ++#endif ++ return (AVL_NODE2DATA(node, off)); ++ } ++ child = avl_balance2child[1 + diff]; ++ ++ } ++ ++ if (where != NULL) ++ *where = AVL_MKINDEX(prev, child); ++ ++ return (NULL); ++} ++ ++ ++/* ++ * Perform a rotation to restore balance at the subtree given by depth. ++ * ++ * This routine is used by both insertion and deletion. The return value ++ * indicates: ++ * 0 : subtree did not change height ++ * !0 : subtree was reduced in height ++ * ++ * The code is written as if handling left rotations, right rotations are ++ * symmetric and handled by swapping values of variables right/left[_heavy] ++ * ++ * On input balance is the "new" balance at "node". This value is either ++ * -2 or +2. ++ */ ++static int ++avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance) ++{ ++ int left = !(balance < 0); /* when balance = -2, left will be 0 */ ++ int right = 1 - left; ++ int left_heavy = balance >> 1; ++ int right_heavy = -left_heavy; ++ avl_node_t *parent = AVL_XPARENT(node); ++ avl_node_t *child = node->avl_child[left]; ++ avl_node_t *cright; ++ avl_node_t *gchild; ++ avl_node_t *gright; ++ avl_node_t *gleft; ++ int which_child = AVL_XCHILD(node); ++ int child_bal = AVL_XBALANCE(child); ++ ++ /* BEGIN CSTYLED */ ++ /* ++ * case 1 : node is overly left heavy, the left child is balanced or ++ * also left heavy. This requires the following rotation. ++ * ++ * (node bal:-2) ++ * / \ ++ * / \ ++ * (child bal:0 or -1) ++ * / \ ++ * / \ ++ * cright ++ * ++ * becomes: ++ * ++ * (child bal:1 or 0) ++ * / \ ++ * / \ ++ * (node bal:-1 or 0) ++ * / \ ++ * / \ ++ * cright ++ * ++ * we detect this situation by noting that child's balance is not ++ * right_heavy. ++ */ ++ /* END CSTYLED */ ++ if (child_bal != right_heavy) { ++ ++ /* ++ * compute new balance of nodes ++ * ++ * If child used to be left heavy (now balanced) we reduced ++ * the height of this sub-tree -- used in "return...;" below ++ */ ++ child_bal += right_heavy; /* adjust towards right */ ++ ++ /* ++ * move "cright" to be node's left child ++ */ ++ cright = child->avl_child[right]; ++ node->avl_child[left] = cright; ++ if (cright != NULL) { ++ AVL_SETPARENT(cright, node); ++ AVL_SETCHILD(cright, left); ++ } ++ ++ /* ++ * move node to be child's right child ++ */ ++ child->avl_child[right] = node; ++ AVL_SETBALANCE(node, -child_bal); ++ AVL_SETCHILD(node, right); ++ AVL_SETPARENT(node, child); ++ ++ /* ++ * update the pointer into this subtree ++ */ ++ AVL_SETBALANCE(child, child_bal); ++ AVL_SETCHILD(child, which_child); ++ AVL_SETPARENT(child, parent); ++ if (parent != NULL) ++ parent->avl_child[which_child] = child; ++ else ++ tree->avl_root = child; ++ ++ return (child_bal == 0); ++ } ++ ++ /* BEGIN CSTYLED */ ++ /* ++ * case 2 : When node is left heavy, but child is right heavy we use ++ * a different rotation. 
++ * ++ * (node b:-2) ++ * / \ ++ * / \ ++ * / \ ++ * (child b:+1) ++ * / \ ++ * / \ ++ * (gchild b: != 0) ++ * / \ ++ * / \ ++ * gleft gright ++ * ++ * becomes: ++ * ++ * (gchild b:0) ++ * / \ ++ * / \ ++ * / \ ++ * (child b:?) (node b:?) ++ * / \ / \ ++ * / \ / \ ++ * gleft gright ++ * ++ * computing the new balances is more complicated. As an example: ++ * if gchild was right_heavy, then child is now left heavy ++ * else it is balanced ++ */ ++ /* END CSTYLED */ ++ gchild = child->avl_child[right]; ++ gleft = gchild->avl_child[left]; ++ gright = gchild->avl_child[right]; ++ ++ /* ++ * move gright to left child of node and ++ * ++ * move gleft to right child of node ++ */ ++ node->avl_child[left] = gright; ++ if (gright != NULL) { ++ AVL_SETPARENT(gright, node); ++ AVL_SETCHILD(gright, left); ++ } ++ ++ child->avl_child[right] = gleft; ++ if (gleft != NULL) { ++ AVL_SETPARENT(gleft, child); ++ AVL_SETCHILD(gleft, right); ++ } ++ ++ /* ++ * move child to left child of gchild and ++ * ++ * move node to right child of gchild and ++ * ++ * fixup parent of all this to point to gchild ++ */ ++ balance = AVL_XBALANCE(gchild); ++ gchild->avl_child[left] = child; ++ AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0)); ++ AVL_SETPARENT(child, gchild); ++ AVL_SETCHILD(child, left); ++ ++ gchild->avl_child[right] = node; ++ AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0)); ++ AVL_SETPARENT(node, gchild); ++ AVL_SETCHILD(node, right); ++ ++ AVL_SETBALANCE(gchild, 0); ++ AVL_SETPARENT(gchild, parent); ++ AVL_SETCHILD(gchild, which_child); ++ if (parent != NULL) ++ parent->avl_child[which_child] = gchild; ++ else ++ tree->avl_root = gchild; ++ ++ return (1); /* the new tree is always shorter */ ++} ++ ++ ++/* ++ * Insert a new node into an AVL tree at the specified (from avl_find()) place. ++ * ++ * Newly inserted nodes are always leaf nodes in the tree, since avl_find() ++ * searches out to the leaf positions. The avl_index_t indicates the node ++ * which will be the parent of the new node. ++ * ++ * After the node is inserted, a single rotation further up the tree may ++ * be necessary to maintain an acceptable AVL balance. ++ */ ++void ++avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where) ++{ ++ avl_node_t *node; ++ avl_node_t *parent = AVL_INDEX2NODE(where); ++ int old_balance; ++ int new_balance; ++ int which_child = AVL_INDEX2CHILD(where); ++ size_t off = tree->avl_offset; ++ ++ ASSERT(tree); ++#ifdef _LP64 ++ ASSERT(((uintptr_t)new_data & 0x7) == 0); ++#endif ++ ++ node = AVL_DATA2NODE(new_data, off); ++ ++ /* ++ * First, add the node to the tree at the indicated position. ++ */ ++ ++tree->avl_numnodes; ++ ++ node->avl_child[0] = NULL; ++ node->avl_child[1] = NULL; ++ ++ AVL_SETCHILD(node, which_child); ++ AVL_SETBALANCE(node, 0); ++ AVL_SETPARENT(node, parent); ++ if (parent != NULL) { ++ ASSERT(parent->avl_child[which_child] == NULL); ++ parent->avl_child[which_child] = node; ++ } else { ++ ASSERT(tree->avl_root == NULL); ++ tree->avl_root = node; ++ } ++ /* ++ * Now, back up the tree modifying the balance of all nodes above the ++ * insertion point. If we get to a highly unbalanced ancestor, we ++ * need to do a rotation. If we back out of the tree we are done. ++ * If we brought any subtree into perfect balance (0), we are also done. 
++ */ ++ for (;;) { ++ node = parent; ++ if (node == NULL) ++ return; ++ ++ /* ++ * Compute the new balance ++ */ ++ old_balance = AVL_XBALANCE(node); ++ new_balance = old_balance + avl_child2balance[which_child]; ++ ++ /* ++ * If we introduced equal balance, then we are done immediately ++ */ ++ if (new_balance == 0) { ++ AVL_SETBALANCE(node, 0); ++ return; ++ } ++ ++ /* ++ * If both old and new are not zero we went ++ * from -1 to -2 balance, do a rotation. ++ */ ++ if (old_balance != 0) ++ break; ++ ++ AVL_SETBALANCE(node, new_balance); ++ parent = AVL_XPARENT(node); ++ which_child = AVL_XCHILD(node); ++ } ++ ++ /* ++ * perform a rotation to fix the tree and return ++ */ ++ (void) avl_rotation(tree, node, new_balance); ++} ++ ++/* ++ * Insert "new_data" in "tree" in the given "direction" either after or ++ * before (AVL_AFTER, AVL_BEFORE) the data "here". ++ * ++ * Insertions can only be done at empty leaf points in the tree, therefore ++ * if the given child of the node is already present we move to either ++ * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since ++ * every other node in the tree is a leaf, this always works. ++ * ++ * To help developers using this interface, we assert that the new node ++ * is correctly ordered at every step of the way in DEBUG kernels. ++ */ ++void ++avl_insert_here( ++ avl_tree_t *tree, ++ void *new_data, ++ void *here, ++ int direction) ++{ ++ avl_node_t *node; ++ int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */ ++#ifdef DEBUG ++ int diff; ++#endif ++ ++ ASSERT(tree != NULL); ++ ASSERT(new_data != NULL); ++ ASSERT(here != NULL); ++ ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER); ++ ++ /* ++ * If corresponding child of node is not NULL, go to the neighboring ++ * node and reverse the insertion direction. ++ */ ++ node = AVL_DATA2NODE(here, tree->avl_offset); ++ ++#ifdef DEBUG ++ diff = tree->avl_compar(new_data, here); ++ ASSERT(-1 <= diff && diff <= 1); ++ ASSERT(diff != 0); ++ ASSERT(diff > 0 ? child == 1 : child == 0); ++#endif ++ ++ if (node->avl_child[child] != NULL) { ++ node = node->avl_child[child]; ++ child = 1 - child; ++ while (node->avl_child[child] != NULL) { ++#ifdef DEBUG ++ diff = tree->avl_compar(new_data, ++ AVL_NODE2DATA(node, tree->avl_offset)); ++ ASSERT(-1 <= diff && diff <= 1); ++ ASSERT(diff != 0); ++ ASSERT(diff > 0 ? child == 1 : child == 0); ++#endif ++ node = node->avl_child[child]; ++ } ++#ifdef DEBUG ++ diff = tree->avl_compar(new_data, ++ AVL_NODE2DATA(node, tree->avl_offset)); ++ ASSERT(-1 <= diff && diff <= 1); ++ ASSERT(diff != 0); ++ ASSERT(diff > 0 ? child == 1 : child == 0); ++#endif ++ } ++ ASSERT(node->avl_child[child] == NULL); ++ ++ avl_insert(tree, new_data, AVL_MKINDEX(node, child)); ++} ++ ++/* ++ * Add a new node to an AVL tree. ++ */ ++void ++avl_add(avl_tree_t *tree, void *new_node) ++{ ++ avl_index_t where; ++ ++ /* ++ * This is unfortunate. We want to call panic() here, even for ++ * non-DEBUG kernels. In userland, however, we can't depend on anything ++ * in libc or else the rtld build process gets confused. So, all we can ++ * do in userland is resort to a normal ASSERT(). ++ */ ++ if (avl_find(tree, new_node, &where) != NULL) ++#ifdef _KERNEL ++ panic("avl_find() succeeded inside avl_add()"); ++#else ++ ASSERT(0); ++#endif ++ avl_insert(tree, new_node, where); ++} ++ ++/* ++ * Delete a node from the AVL tree. Deletion is similar to insertion, but ++ * with 2 complications. ++ * ++ * First, we may be deleting an interior node. 
Consider the following subtree: ++ * ++ * d c c ++ * / \ / \ / \ ++ * b e b e b e ++ * / \ / \ / ++ * a c a a ++ * ++ * When we are deleting node (d), we find and bring up an adjacent valued leaf ++ * node, say (c), to take the interior node's place. In the code this is ++ * handled by temporarily swapping (d) and (c) in the tree and then using ++ * common code to delete (d) from the leaf position. ++ * ++ * Secondly, an interior deletion from a deep tree may require more than one ++ * rotation to fix the balance. This is handled by moving up the tree through ++ * parents and applying rotations as needed. The return value from ++ * avl_rotation() is used to detect when a subtree did not change overall ++ * height due to a rotation. ++ */ ++void ++avl_remove(avl_tree_t *tree, void *data) ++{ ++ avl_node_t *delete; ++ avl_node_t *parent; ++ avl_node_t *node; ++ avl_node_t tmp; ++ int old_balance; ++ int new_balance; ++ int left; ++ int right; ++ int which_child; ++ size_t off = tree->avl_offset; ++ ++ ASSERT(tree); ++ ++ delete = AVL_DATA2NODE(data, off); ++ ++ /* ++ * Deletion is easiest with a node that has at most 1 child. ++ * We swap a node with 2 children with a sequentially valued ++ * neighbor node. That node will have at most 1 child. Note this ++ * has no effect on the ordering of the remaining nodes. ++ * ++ * As an optimization, we choose the greater neighbor if the tree ++ * is right heavy, otherwise the left neighbor. This reduces the ++ * number of rotations needed. ++ */ ++ if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) { ++ ++ /* ++ * choose node to swap from whichever side is taller ++ */ ++ old_balance = AVL_XBALANCE(delete); ++ left = avl_balance2child[old_balance + 1]; ++ right = 1 - left; ++ ++ /* ++ * get to the previous value'd node ++ * (down 1 left, as far as possible right) ++ */ ++ for (node = delete->avl_child[left]; ++ node->avl_child[right] != NULL; ++ node = node->avl_child[right]) ++ ; ++ ++ /* ++ * create a temp placeholder for 'node' ++ * move 'node' to delete's spot in the tree ++ */ ++ tmp = *node; ++ ++ *node = *delete; ++ if (node->avl_child[left] == node) ++ node->avl_child[left] = &tmp; ++ ++ parent = AVL_XPARENT(node); ++ if (parent != NULL) ++ parent->avl_child[AVL_XCHILD(node)] = node; ++ else ++ tree->avl_root = node; ++ AVL_SETPARENT(node->avl_child[left], node); ++ AVL_SETPARENT(node->avl_child[right], node); ++ ++ /* ++ * Put tmp where node used to be (just temporary). ++ * It always has a parent and at most 1 child. ++ */ ++ delete = &tmp; ++ parent = AVL_XPARENT(delete); ++ parent->avl_child[AVL_XCHILD(delete)] = delete; ++ which_child = (delete->avl_child[1] != 0); ++ if (delete->avl_child[which_child] != NULL) ++ AVL_SETPARENT(delete->avl_child[which_child], delete); ++ } ++ ++ ++ /* ++ * Here we know "delete" is at least partially a leaf node. It can ++ * be easily removed from the tree. ++ */ ++ ASSERT(tree->avl_numnodes > 0); ++ --tree->avl_numnodes; ++ parent = AVL_XPARENT(delete); ++ which_child = AVL_XCHILD(delete); ++ if (delete->avl_child[0] != NULL) ++ node = delete->avl_child[0]; ++ else ++ node = delete->avl_child[1]; ++ ++ /* ++ * Connect parent directly to node (leaving out delete). 
++ */ ++ if (node != NULL) { ++ AVL_SETPARENT(node, parent); ++ AVL_SETCHILD(node, which_child); ++ } ++ if (parent == NULL) { ++ tree->avl_root = node; ++ return; ++ } ++ parent->avl_child[which_child] = node; ++ ++ ++ /* ++ * Since the subtree is now shorter, begin adjusting parent balances ++ * and performing any needed rotations. ++ */ ++ do { ++ ++ /* ++ * Move up the tree and adjust the balance ++ * ++ * Capture the parent and which_child values for the next ++ * iteration before any rotations occur. ++ */ ++ node = parent; ++ old_balance = AVL_XBALANCE(node); ++ new_balance = old_balance - avl_child2balance[which_child]; ++ parent = AVL_XPARENT(node); ++ which_child = AVL_XCHILD(node); ++ ++ /* ++ * If a node was in perfect balance but isn't anymore then ++ * we can stop, since the height didn't change above this point ++ * due to a deletion. ++ */ ++ if (old_balance == 0) { ++ AVL_SETBALANCE(node, new_balance); ++ break; ++ } ++ ++ /* ++ * If the new balance is zero, we don't need to rotate ++ * else ++ * need a rotation to fix the balance. ++ * If the rotation doesn't change the height ++ * of the sub-tree we have finished adjusting. ++ */ ++ if (new_balance == 0) ++ AVL_SETBALANCE(node, new_balance); ++ else if (!avl_rotation(tree, node, new_balance)) ++ break; ++ } while (parent != NULL); ++} ++ ++#define AVL_REINSERT(tree, obj) \ ++ avl_remove((tree), (obj)); \ ++ avl_add((tree), (obj)) ++ ++boolean_t ++avl_update_lt(avl_tree_t *t, void *obj) ++{ ++ void *neighbor; ++ ++ ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) || ++ (t->avl_compar(obj, neighbor) <= 0)); ++ ++ neighbor = AVL_PREV(t, obj); ++ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { ++ AVL_REINSERT(t, obj); ++ return (B_TRUE); ++ } ++ ++ return (B_FALSE); ++} ++ ++boolean_t ++avl_update_gt(avl_tree_t *t, void *obj) ++{ ++ void *neighbor; ++ ++ ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) || ++ (t->avl_compar(obj, neighbor) >= 0)); ++ ++ neighbor = AVL_NEXT(t, obj); ++ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { ++ AVL_REINSERT(t, obj); ++ return (B_TRUE); ++ } ++ ++ return (B_FALSE); ++} ++ ++boolean_t ++avl_update(avl_tree_t *t, void *obj) ++{ ++ void *neighbor; ++ ++ neighbor = AVL_PREV(t, obj); ++ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { ++ AVL_REINSERT(t, obj); ++ return (B_TRUE); ++ } ++ ++ neighbor = AVL_NEXT(t, obj); ++ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { ++ AVL_REINSERT(t, obj); ++ return (B_TRUE); ++ } ++ ++ return (B_FALSE); ++} ++ ++/* ++ * initialize a new AVL tree ++ */ ++void ++avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *), ++ size_t size, size_t offset) ++{ ++ ASSERT(tree); ++ ASSERT(compar); ++ ASSERT(size > 0); ++ ASSERT(size >= offset + sizeof (avl_node_t)); ++#ifdef _LP64 ++ ASSERT((offset & 0x7) == 0); ++#endif ++ ++ tree->avl_compar = compar; ++ tree->avl_root = NULL; ++ tree->avl_numnodes = 0; ++ tree->avl_size = size; ++ tree->avl_offset = offset; ++} ++ ++/* ++ * Delete a tree. ++ */ ++/* ARGSUSED */ ++void ++avl_destroy(avl_tree_t *tree) ++{ ++ ASSERT(tree); ++ ASSERT(tree->avl_numnodes == 0); ++ ASSERT(tree->avl_root == NULL); ++} ++ ++ ++/* ++ * Return the number of nodes in an AVL tree. 
++ */ ++ulong_t ++avl_numnodes(avl_tree_t *tree) ++{ ++ ASSERT(tree); ++ return (tree->avl_numnodes); ++} ++ ++boolean_t ++avl_is_empty(avl_tree_t *tree) ++{ ++ ASSERT(tree); ++ return (tree->avl_numnodes == 0); ++} ++ ++#define CHILDBIT (1L) ++ ++/* ++ * Post-order tree walk used to visit all tree nodes and destroy the tree ++ * in post order. This is used for destroying a tree w/o paying any cost ++ * for rebalancing it. ++ * ++ * example: ++ * ++ * void *cookie = NULL; ++ * my_data_t *node; ++ * ++ * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) ++ * free(node); ++ * avl_destroy(tree); ++ * ++ * The cookie is really an avl_node_t to the current node's parent and ++ * an indication of which child you looked at last. ++ * ++ * On input, a cookie value of CHILDBIT indicates the tree is done. ++ */ ++void * ++avl_destroy_nodes(avl_tree_t *tree, void **cookie) ++{ ++ avl_node_t *node; ++ avl_node_t *parent; ++ int child; ++ void *first; ++ size_t off = tree->avl_offset; ++ ++ /* ++ * Initial calls go to the first node or it's right descendant. ++ */ ++ if (*cookie == NULL) { ++ first = avl_first(tree); ++ ++ /* ++ * deal with an empty tree ++ */ ++ if (first == NULL) { ++ *cookie = (void *)CHILDBIT; ++ return (NULL); ++ } ++ ++ node = AVL_DATA2NODE(first, off); ++ parent = AVL_XPARENT(node); ++ goto check_right_side; ++ } ++ ++ /* ++ * If there is no parent to return to we are done. ++ */ ++ parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT); ++ if (parent == NULL) { ++ if (tree->avl_root != NULL) { ++ ASSERT(tree->avl_numnodes == 1); ++ tree->avl_root = NULL; ++ tree->avl_numnodes = 0; ++ } ++ return (NULL); ++ } ++ ++ /* ++ * Remove the child pointer we just visited from the parent and tree. ++ */ ++ child = (uintptr_t)(*cookie) & CHILDBIT; ++ parent->avl_child[child] = NULL; ++ ASSERT(tree->avl_numnodes > 1); ++ --tree->avl_numnodes; ++ ++ /* ++ * If we just did a right child or there isn't one, go up to parent. ++ */ ++ if (child == 1 || parent->avl_child[1] == NULL) { ++ node = parent; ++ parent = AVL_XPARENT(parent); ++ goto done; ++ } ++ ++ /* ++ * Do parent's right child, then leftmost descendent. ++ */ ++ node = parent->avl_child[1]; ++ while (node->avl_child[0] != NULL) { ++ parent = node; ++ node = node->avl_child[0]; ++ } ++ ++ /* ++ * If here, we moved to a left child. It may have one ++ * child on the right (when balance == +1). 
++ */ ++check_right_side: ++ if (node->avl_child[1] != NULL) { ++ ASSERT(AVL_XBALANCE(node) == 1); ++ parent = node; ++ node = node->avl_child[1]; ++ ASSERT(node->avl_child[0] == NULL && ++ node->avl_child[1] == NULL); ++ } else { ++ ASSERT(AVL_XBALANCE(node) <= 0); ++ } ++ ++done: ++ if (parent == NULL) { ++ *cookie = (void *)CHILDBIT; ++ ASSERT(node == tree->avl_root); ++ } else { ++ *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node)); ++ } ++ ++ return (AVL_NODE2DATA(node, off)); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++ ++static int avl_init(void) { return 0; } ++static int avl_fini(void) { return 0; } ++ ++spl_module_init(avl_init); ++spl_module_exit(avl_fini); ++ ++MODULE_DESCRIPTION("Generic AVL tree implementation"); ++MODULE_AUTHOR(ZFS_META_AUTHOR); ++MODULE_LICENSE(ZFS_META_LICENSE); ++ ++EXPORT_SYMBOL(avl_create); ++EXPORT_SYMBOL(avl_find); ++EXPORT_SYMBOL(avl_insert); ++EXPORT_SYMBOL(avl_insert_here); ++EXPORT_SYMBOL(avl_walk); ++EXPORT_SYMBOL(avl_first); ++EXPORT_SYMBOL(avl_last); ++EXPORT_SYMBOL(avl_nearest); ++EXPORT_SYMBOL(avl_add); ++EXPORT_SYMBOL(avl_remove); ++EXPORT_SYMBOL(avl_numnodes); ++EXPORT_SYMBOL(avl_destroy_nodes); ++EXPORT_SYMBOL(avl_destroy); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/avl/Makefile linux-3.2.33-go/fs/zfs/avl/Makefile +--- linux-3.2.33-go.orig/fs/zfs/avl/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/avl/Makefile 2012-11-16 23:25:34.357039243 +0100 +@@ -0,0 +1,7 @@ ++MODULE := zavl ++ ++EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) -Wno-unused-but-set-variable -DHAVE_SPL -D_KERNEL -DTEXT_DOMAIN=\"zfs-linux-kernel\" -DNDEBUG ++ ++obj-$(CONFIG_ZFS) := $(MODULE).o ++ ++$(MODULE)-objs += avl.o +diff -uNr linux-3.2.33-go.orig/fs/zfs/avl/Makefile.in linux-3.2.33-go/fs/zfs/avl/Makefile.in +--- linux-3.2.33-go.orig/fs/zfs/avl/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/avl/Makefile.in 2012-11-16 23:25:34.355039267 +0100 +@@ -0,0 +1,7 @@ ++MODULE := zavl ++ ++EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ ++ ++obj-$(CONFIG_ZFS) := $(MODULE).o ++ ++$(MODULE)-objs += @top_srcdir@/module/avl/avl.o +diff -uNr linux-3.2.33-go.orig/fs/zfs/Kbuild linux-3.2.33-go/fs/zfs/Kbuild +--- linux-3.2.33-go.orig/fs/zfs/Kbuild 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/Kbuild 2012-11-16 23:25:34.379038989 +0100 +@@ -0,0 +1,11 @@ ++ZFS_MODULE_CFLAGS = -I$(srctree)/include/zfs -I$(srctree)/include/spl ++ZFS_MODULE_CFLAGS += -include $(srctree)/spl_config.h -include $(srctree)/zfs_config.h ++export ZFS_MODULE_CFLAGS ++ ++obj-$(CONFIG_ZFS) := ++obj-$(CONFIG_ZFS) += avl/ ++obj-$(CONFIG_ZFS) += nvpair/ ++obj-$(CONFIG_ZFS) += unicode/ ++obj-$(CONFIG_ZFS) += zcommon/ ++obj-$(CONFIG_ZFS) += zfs/ ++obj-$(CONFIG_ZFS) += zpios/ +diff -uNr linux-3.2.33-go.orig/fs/zfs/Kconfig linux-3.2.33-go/fs/zfs/Kconfig +--- linux-3.2.33-go.orig/fs/zfs/Kconfig 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/Kconfig 2012-11-16 23:25:34.378039001 +0100 +@@ -0,0 +1,14 @@ ++config ZFS ++ tristate "ZFS filesystem support" ++ depends on SPL ++ depends on EFI_PARTITION ++ select ZLIB_INFLATE ++ select ZLIB_DEFLATE ++ help ++ This is the ZFS filesystem from the ZFS On Linux project. ++ ++ See http://zfsonlinux.org/ ++ ++ To compile this file system support as a module, choose M here. ++ ++ If unsure, say N. 
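The avl.c added above is the generic embedded-node AVL library that the rest of this ZFS patch builds on: callers place an avl_node_t inside their own structure and pass avl_create() the structure size and the node's offset, a comparator returning exactly -1/0/+1 drives avl_find()/avl_add(), and teardown uses the post-order avl_destroy_nodes() walk shown in that file's comment. The sketch below (not part of the patch) only illustrates that calling convention; it assumes a userland-style build against the same avl.h interface, and my_node_t / my_compare are made-up names.

/*
 * Minimal illustrative consumer of the embedded-node AVL interface.
 * Assumes the avl.h shipped with this patch (or the userland libavl
 * equivalent); my_node_t and my_compare are hypothetical.
 */
#include <stddef.h>
#include <stdlib.h>
#include <sys/avl.h>

typedef struct my_node {
	int		key;
	avl_node_t	link;		/* embedded AVL linkage */
} my_node_t;

/* comparator must return -1, 0 or +1, as asserted by avl_find()/avl_add() */
static int
my_compare(const void *a, const void *b)
{
	const my_node_t *l = a;
	const my_node_t *r = b;

	if (l->key < r->key)
		return (-1);
	if (l->key > r->key)
		return (1);
	return (0);
}

int
main(void)
{
	avl_tree_t tree;
	my_node_t *n;
	void *cookie = NULL;
	int i;

	/* tell the tree where the avl_node_t lives inside my_node_t */
	avl_create(&tree, my_compare, sizeof (my_node_t),
	    offsetof(my_node_t, link));

	for (i = 0; i < 10; i++) {
		n = calloc(1, sizeof (*n));
		n->key = i;
		avl_add(&tree, n);	/* asserts/panics on duplicate keys */
	}

	/* in-order walk from the smallest key */
	for (n = avl_first(&tree); n != NULL; n = AVL_NEXT(&tree, n))
		;			/* use n->key here */

	/* post-order teardown without rebalancing, per avl_destroy_nodes() */
	while ((n = avl_destroy_nodes(&tree, &cookie)) != NULL)
		free(n);
	avl_destroy(&tree);

	return (0);
}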
+diff -uNr linux-3.2.33-go.orig/fs/zfs/Makefile linux-3.2.33-go/fs/zfs/Makefile +--- linux-3.2.33-go.orig/fs/zfs/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/Makefile 2012-11-16 23:25:34.355039267 +0100 +@@ -0,0 +1,65 @@ ++subdir-m += avl ++subdir-m += nvpair ++subdir-m += unicode ++subdir-m += zcommon ++subdir-m += zfs ++subdir-m += zpios ++ ++ZFS_MODULE_CFLAGS += -include /usr/src/linux-3.2.33-go/spl_config.h ++ZFS_MODULE_CFLAGS += -include /root/zfs-0.6.0-rc12/zfs_config.h ++ZFS_MODULE_CFLAGS += -I/root/zfs-0.6.0-rc12/include -I/usr/src/linux-3.2.33-go/include -I/usr/src/linux-3.2.33-go ++export ZFS_MODULE_CFLAGS ++ ++modules: ++ @# Make the exported SPL symbols available to these modules. ++ @# They may be in the root of SPL_OBJ when building against ++ @# installed devel headers, or they may be in the module ++ @# subdirectory when building against the spl source tree. ++ @if [ -f /usr/src/linux-3.2.33-go/NONE ]; then \ ++ /bin/cp /usr/src/linux-3.2.33-go/NONE .; \ ++ elif [ -f /usr/src/linux-3.2.33-go/module/NONE ]; then \ ++ /bin/cp /usr/src/linux-3.2.33-go/module/NONE .; \ ++ else \ ++ echo -e "\n" \ ++ "*** Missing spl symbols ensure you have built the spl:\n" \ ++ "*** - /usr/src/linux-3.2.33-go/NONE, or\n" \ ++ "*** - /usr/src/linux-3.2.33-go/module/NONE\n"; \ ++ exit 1; \ ++ fi ++ $(MAKE) -C /usr/src/linux-3.6.0-sabayon SUBDIRS=`pwd` O=/usr/src/linux-3.6.0-sabayon CONFIG_ZFS=m $@ ++ ++clean: ++ @# Only cleanup the kernel build directories when CONFIG_KERNEL ++ @# is defined. This indicates that kernel modules should be built. ++# $(MAKE) -C /usr/src/linux-3.6.0-sabayon SUBDIRS=`pwd` O=/usr/src/linux-3.6.0-sabayon $@ ++ ++ if [ -f NONE ]; then $(RM) NONE; fi ++ if [ -f NONE ]; then $(RM) NONE; fi ++ if [ -f Module.markers ]; then $(RM) Module.markers; fi ++ ++modules_install: ++ @# Install the kernel modules ++ $(MAKE) -C /usr/src/linux-3.6.0-sabayon SUBDIRS=`pwd` \ ++ INSTALL_MOD_PATH=$(DESTDIR) \ ++ INSTALL_MOD_DIR=addon/zfs $@ ++ @# Remove extraneous build products when packaging ++ if [ -n "$(DESTDIR)" ]; then \ ++ find $(DESTDIR)/lib/modules/3.6.0-sabayon \ ++ -name 'modules.*' | xargs $(RM); \ ++ fi ++ sysmap=$(DESTDIR)/boot/System.map-3.6.0-sabayon; \ ++ if [ -f $$sysmap ]; then \ ++ depmod -ae -F $$sysmap 3.6.0-sabayon; \ ++ fi ++ ++modules_uninstall: ++ @# Uninstall the kernel modules ++ $(RM) -R $(DESTDIR)/lib/modules/3.6.0-sabayon/addon/zfs ++ ++distdir: ++ ++distclean maintainer-clean: clean ++install: modules_install ++uninstall: modules_uninstall ++all: modules ++check: +diff -uNr linux-3.2.33-go.orig/fs/zfs/Makefile.in linux-3.2.33-go/fs/zfs/Makefile.in +--- linux-3.2.33-go.orig/fs/zfs/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/Makefile.in 2012-11-16 23:25:34.355039267 +0100 +@@ -0,0 +1,65 @@ ++subdir-m += avl ++subdir-m += nvpair ++subdir-m += unicode ++subdir-m += zcommon ++subdir-m += zfs ++subdir-m += zpios ++ ++ZFS_MODULE_CFLAGS += -include @SPL_OBJ@/spl_config.h ++ZFS_MODULE_CFLAGS += -include @abs_top_builddir@/zfs_config.h ++ZFS_MODULE_CFLAGS += -I@abs_top_srcdir@/include -I@SPL@/include -I@SPL@ ++export ZFS_MODULE_CFLAGS ++ ++modules: ++ @# Make the exported SPL symbols available to these modules. ++ @# They may be in the root of SPL_OBJ when building against ++ @# installed devel headers, or they may be in the module ++ @# subdirectory when building against the spl source tree. 
++ @if [ -f @SPL_OBJ@/@SPL_SYMBOLS@ ]; then \ ++ /bin/cp @SPL_OBJ@/@SPL_SYMBOLS@ .; \ ++ elif [ -f @SPL_OBJ@/module/@SPL_SYMBOLS@ ]; then \ ++ /bin/cp @SPL_OBJ@/module/@SPL_SYMBOLS@ .; \ ++ else \ ++ echo -e "\n" \ ++ "*** Missing spl symbols ensure you have built the spl:\n" \ ++ "*** - @SPL_OBJ@/@SPL_SYMBOLS@, or\n" \ ++ "*** - @SPL_OBJ@/module/@SPL_SYMBOLS@\n"; \ ++ exit 1; \ ++ fi ++ $(MAKE) -C @LINUX_OBJ@ SUBDIRS=`pwd` @KERNELMAKE_PARAMS@ CONFIG_ZFS=m $@ ++ ++clean: ++ @# Only cleanup the kernel build directories when CONFIG_KERNEL ++ @# is defined. This indicates that kernel modules should be built. ++@CONFIG_KERNEL_TRUE@ $(MAKE) -C @LINUX_OBJ@ SUBDIRS=`pwd` @KERNELMAKE_PARAMS@ $@ ++ ++ if [ -f @SPL_SYMBOLS@ ]; then $(RM) @SPL_SYMBOLS@; fi ++ if [ -f @LINUX_SYMBOLS@ ]; then $(RM) @LINUX_SYMBOLS@; fi ++ if [ -f Module.markers ]; then $(RM) Module.markers; fi ++ ++modules_install: ++ @# Install the kernel modules ++ $(MAKE) -C @LINUX_OBJ@ SUBDIRS=`pwd` \ ++ INSTALL_MOD_PATH=$(DESTDIR) \ ++ INSTALL_MOD_DIR=addon/zfs $@ ++ @# Remove extraneous build products when packaging ++ if [ -n "$(DESTDIR)" ]; then \ ++ find $(DESTDIR)/lib/modules/@LINUX_VERSION@ \ ++ -name 'modules.*' | xargs $(RM); \ ++ fi ++ sysmap=$(DESTDIR)/boot/System.map-@LINUX_VERSION@; \ ++ if [ -f $$sysmap ]; then \ ++ depmod -ae -F $$sysmap @LINUX_VERSION@; \ ++ fi ++ ++modules_uninstall: ++ @# Uninstall the kernel modules ++ $(RM) -R $(DESTDIR)/lib/modules/@LINUX_VERSION@/addon/zfs ++ ++distdir: ++ ++distclean maintainer-clean: clean ++install: modules_install ++uninstall: modules_uninstall ++all: modules ++check: +diff -uNr linux-3.2.33-go.orig/fs/zfs/nvpair/Makefile linux-3.2.33-go/fs/zfs/nvpair/Makefile +--- linux-3.2.33-go.orig/fs/zfs/nvpair/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/nvpair/Makefile 2012-11-16 23:25:34.359039219 +0100 +@@ -0,0 +1,9 @@ ++MODULE := znvpair ++ ++EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) -Wno-unused-but-set-variable -DHAVE_SPL -D_KERNEL -DTEXT_DOMAIN=\"zfs-linux-kernel\" -DNDEBUG ++ ++obj-$(CONFIG_ZFS) := $(MODULE).o ++ ++$(MODULE)-objs += nvpair.o ++$(MODULE)-objs += nvpair_alloc_spl.o ++$(MODULE)-objs += nvpair_alloc_fixed.o +diff -uNr linux-3.2.33-go.orig/fs/zfs/nvpair/Makefile.in linux-3.2.33-go/fs/zfs/nvpair/Makefile.in +--- linux-3.2.33-go.orig/fs/zfs/nvpair/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/nvpair/Makefile.in 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,9 @@ ++MODULE := znvpair ++ ++EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ ++ ++obj-$(CONFIG_ZFS) := $(MODULE).o ++ ++$(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair.o ++$(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair_alloc_spl.o ++$(MODULE)-objs += @top_srcdir@/module/nvpair/nvpair_alloc_fixed.o +diff -uNr linux-3.2.33-go.orig/fs/zfs/nvpair/nvpair_alloc_fixed.c linux-3.2.33-go/fs/zfs/nvpair/nvpair_alloc_fixed.c +--- linux-3.2.33-go.orig/fs/zfs/nvpair/nvpair_alloc_fixed.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/nvpair/nvpair_alloc_fixed.c 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,124 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. 
++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++ ++ ++#include ++#include ++#include ++#include ++#if defined(_KERNEL) && !defined(_BOOT) ++#include ++#else ++#include ++#include ++#endif ++ ++/* ++ * This allocator is very simple. ++ * - it uses a pre-allocated buffer for memory allocations. ++ * - it does _not_ free memory in the pre-allocated buffer. ++ * ++ * The reason for the selected implemention is simplicity. ++ * This allocator is designed for the usage in interrupt context when ++ * the caller may not wait for free memory. ++ */ ++ ++/* pre-allocated buffer for memory allocations */ ++typedef struct nvbuf { ++ uintptr_t nvb_buf; /* address of pre-allocated buffer */ ++ uintptr_t nvb_lim; /* limit address in the buffer */ ++ uintptr_t nvb_cur; /* current address in the buffer */ ++} nvbuf_t; ++ ++/* ++ * Initialize the pre-allocated buffer allocator. The caller needs to supply ++ * ++ * buf address of pre-allocated buffer ++ * bufsz size of pre-allocated buffer ++ * ++ * nv_fixed_init() calculates the remaining members of nvbuf_t. ++ */ ++static int ++nv_fixed_init(nv_alloc_t *nva, va_list valist) ++{ ++ uintptr_t base = va_arg(valist, uintptr_t); ++ uintptr_t lim = base + va_arg(valist, size_t); ++ nvbuf_t *nvb = (nvbuf_t *)P2ROUNDUP(base, sizeof (uintptr_t)); ++ ++ if (base == 0 || (uintptr_t)&nvb[1] > lim) ++ return (EINVAL); ++ ++ nvb->nvb_buf = (uintptr_t)&nvb[0]; ++ nvb->nvb_cur = (uintptr_t)&nvb[1]; ++ nvb->nvb_lim = lim; ++ nva->nva_arg = nvb; ++ ++ return (0); ++} ++ ++static void * ++nv_fixed_alloc(nv_alloc_t *nva, size_t size) ++{ ++ nvbuf_t *nvb = nva->nva_arg; ++ uintptr_t new = nvb->nvb_cur; ++ ++ if (size == 0 || new + size > nvb->nvb_lim) ++ return (NULL); ++ ++ nvb->nvb_cur = P2ROUNDUP(new + size, sizeof (uintptr_t)); ++ ++ return ((void *)new); ++} ++ ++/*ARGSUSED*/ ++static void ++nv_fixed_free(nv_alloc_t *nva, void *buf, size_t size) ++{ ++ /* don't free memory in the pre-allocated buffer */ ++} ++ ++static void ++nv_fixed_reset(nv_alloc_t *nva) ++{ ++ nvbuf_t *nvb = nva->nva_arg; ++ ++ nvb->nvb_cur = (uintptr_t)&nvb[1]; ++} ++ ++const nv_alloc_ops_t nv_fixed_ops_def = { ++ nv_fixed_init, /* nv_ao_init() */ ++ NULL, /* nv_ao_fini() */ ++ nv_fixed_alloc, /* nv_ao_alloc() */ ++ nv_fixed_free, /* nv_ao_free() */ ++ nv_fixed_reset /* nv_ao_reset() */ ++}; ++ ++const nv_alloc_ops_t *nv_fixed_ops = &nv_fixed_ops_def; ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(nv_fixed_ops); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/nvpair/nvpair_alloc_spl.c linux-3.2.33-go/fs/zfs/nvpair/nvpair_alloc_spl.c +--- linux-3.2.33-go.orig/fs/zfs/nvpair/nvpair_alloc_spl.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/nvpair/nvpair_alloc_spl.c 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,75 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License, Version 1.0 only ++ * (the "License"). 
You may not use this file except in compliance ++ * with the License. ++ * ++ * You can obtain a copy of the license at * usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2004 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#include ++#include ++ ++static void * ++nv_alloc_sleep_spl(nv_alloc_t *nva, size_t size) ++{ ++ return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); ++} ++ ++static void * ++nv_alloc_nosleep_spl(nv_alloc_t *nva, size_t size) ++{ ++ return (kmem_alloc(size, KM_NOSLEEP)); ++} ++ ++static void ++nv_free_spl(nv_alloc_t *nva, void *buf, size_t size) ++{ ++ kmem_free(buf, size); ++} ++ ++const nv_alloc_ops_t spl_sleep_ops_def = { ++ NULL, /* nv_ao_init() */ ++ NULL, /* nv_ao_fini() */ ++ nv_alloc_sleep_spl, /* nv_ao_alloc() */ ++ nv_free_spl, /* nv_ao_free() */ ++ NULL /* nv_ao_reset() */ ++}; ++ ++const nv_alloc_ops_t spl_nosleep_ops_def = { ++ NULL, /* nv_ao_init() */ ++ NULL, /* nv_ao_fini() */ ++ nv_alloc_nosleep_spl, /* nv_ao_alloc() */ ++ nv_free_spl, /* nv_ao_free() */ ++ NULL /* nv_ao_reset() */ ++}; ++ ++nv_alloc_t nv_alloc_sleep_def = { ++ &spl_sleep_ops_def, ++ NULL ++}; ++ ++nv_alloc_t nv_alloc_nosleep_def = { ++ &spl_nosleep_ops_def, ++ NULL ++}; ++ ++nv_alloc_t *nv_alloc_sleep = &nv_alloc_sleep_def; ++nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def; +diff -uNr linux-3.2.33-go.orig/fs/zfs/nvpair/nvpair.c linux-3.2.33-go/fs/zfs/nvpair/nvpair.c +--- linux-3.2.33-go.orig/fs/zfs/nvpair/nvpair.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/nvpair/nvpair.c 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,3425 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#if defined(_KERNEL) && !defined(_BOOT) ++#include ++#include ++#include ++#else ++#include ++#include ++#include ++#include ++#endif ++ ++#ifndef offsetof ++#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) ++#endif ++#define skip_whitespace(p) while ((*(p) == ' ') || (*(p) == '\t')) p++ ++ ++/* ++ * nvpair.c - Provides kernel & userland interfaces for manipulating ++ * name-value pairs. ++ * ++ * Overview Diagram ++ * ++ * +--------------+ ++ * | nvlist_t | ++ * |--------------| ++ * | nvl_version | ++ * | nvl_nvflag | ++ * | nvl_priv -+-+ ++ * | nvl_flag | | ++ * | nvl_pad | | ++ * +--------------+ | ++ * V ++ * +--------------+ last i_nvp in list ++ * | nvpriv_t | +---------------------> ++ * |--------------| | ++ * +--+- nvp_list | | +------------+ ++ * | | nvp_last -+--+ + nv_alloc_t | ++ * | | nvp_curr | |------------| ++ * | | nvp_nva -+----> | nva_ops | ++ * | | nvp_stat | | nva_arg | ++ * | +--------------+ +------------+ ++ * | ++ * +-------+ ++ * V ++ * +---------------------+ +-------------------+ ++ * | i_nvp_t | +-->| i_nvp_t | +--> ++ * |---------------------| | |-------------------| | ++ * | nvi_next -+--+ | nvi_next -+--+ ++ * | nvi_prev (NULL) | <----+ nvi_prev | ++ * | . . . . . . . . . . | | . . . . . . . . . | ++ * | nvp (nvpair_t) | | nvp (nvpair_t) | ++ * | - nvp_size | | - nvp_size | ++ * | - nvp_name_sz | | - nvp_name_sz | ++ * | - nvp_value_elem | | - nvp_value_elem | ++ * | - nvp_type | | - nvp_type | ++ * | - data ... | | - data ... | ++ * +---------------------+ +-------------------+ ++ * ++ * ++ * ++ * +---------------------+ +---------------------+ ++ * | i_nvp_t | +--> +-->| i_nvp_t (last) | ++ * |---------------------| | | |---------------------| ++ * | nvi_next -+--+ ... --+ | nvi_next (NULL) | ++ * <-+- nvi_prev |<-- ... <----+ nvi_prev | ++ * | . . . . . . . . . | | . . . . . . . . . | ++ * | nvp (nvpair_t) | | nvp (nvpair_t) | ++ * | - nvp_size | | - nvp_size | ++ * | - nvp_name_sz | | - nvp_name_sz | ++ * | - nvp_value_elem | | - nvp_value_elem | ++ * | - DATA_TYPE_NVLIST | | - nvp_type | ++ * | - data (embedded) | | - data ... | ++ * | nvlist name | +---------------------+ ++ * | +--------------+ | ++ * | | nvlist_t | | ++ * | |--------------| | ++ * | | nvl_version | | ++ * | | nvl_nvflag | | ++ * | | nvl_priv --+---+----> ++ * | | nvl_flag | | ++ * | | nvl_pad | | ++ * | +--------------+ | ++ * +---------------------+ ++ * ++ * ++ * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will ++ * allow value to be aligned on 8 byte boundary ++ * ++ * name_len is the length of the name string including the null terminator ++ * so it must be >= 1 ++ */ ++#define NVP_SIZE_CALC(name_len, data_len) \ ++ (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len)) ++ ++static int i_get_value_size(data_type_t type, const void *data, uint_t nelem); ++static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type, ++ uint_t nelem, const void *data); ++ ++#define NV_STAT_EMBEDDED 0x1 ++#define EMBEDDED_NVL(nvp) ((nvlist_t *)(void *)NVP_VALUE(nvp)) ++#define EMBEDDED_NVL_ARRAY(nvp) ((nvlist_t **)(void *)NVP_VALUE(nvp)) ++ ++#define NVP_VALOFF(nvp) (NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz)) ++#define NVPAIR2I_NVP(nvp) \ ++ ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp))) ++ ++ ++int ++nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...) 
++{ ++ va_list valist; ++ int err = 0; ++ ++ nva->nva_ops = nvo; ++ nva->nva_arg = NULL; ++ ++ va_start(valist, nvo); ++ if (nva->nva_ops->nv_ao_init != NULL) ++ err = nva->nva_ops->nv_ao_init(nva, valist); ++ va_end(valist); ++ ++ return (err); ++} ++ ++void ++nv_alloc_reset(nv_alloc_t *nva) ++{ ++ if (nva->nva_ops->nv_ao_reset != NULL) ++ nva->nva_ops->nv_ao_reset(nva); ++} ++ ++void ++nv_alloc_fini(nv_alloc_t *nva) ++{ ++ if (nva->nva_ops->nv_ao_fini != NULL) ++ nva->nva_ops->nv_ao_fini(nva); ++} ++ ++nv_alloc_t * ++nvlist_lookup_nv_alloc(nvlist_t *nvl) ++{ ++ nvpriv_t *priv; ++ ++ if (nvl == NULL || ++ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) ++ return (NULL); ++ ++ return (priv->nvp_nva); ++} ++ ++static void * ++nv_mem_zalloc(nvpriv_t *nvp, size_t size) ++{ ++ nv_alloc_t *nva = nvp->nvp_nva; ++ void *buf; ++ ++ if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL) ++ bzero(buf, size); ++ ++ return (buf); ++} ++ ++static void ++nv_mem_free(nvpriv_t *nvp, void *buf, size_t size) ++{ ++ nv_alloc_t *nva = nvp->nvp_nva; ++ ++ nva->nva_ops->nv_ao_free(nva, buf, size); ++} ++ ++static void ++nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat) ++{ ++ bzero(priv, sizeof (nvpriv_t)); ++ ++ priv->nvp_nva = nva; ++ priv->nvp_stat = stat; ++} ++ ++static nvpriv_t * ++nv_priv_alloc(nv_alloc_t *nva) ++{ ++ nvpriv_t *priv; ++ ++ /* ++ * nv_mem_alloc() cannot called here because it needs the priv ++ * argument. ++ */ ++ if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL) ++ return (NULL); ++ ++ nv_priv_init(priv, nva, 0); ++ ++ return (priv); ++} ++ ++/* ++ * Embedded lists need their own nvpriv_t's. We create a new ++ * nvpriv_t using the parameters and allocator from the parent ++ * list's nvpriv_t. ++ */ ++static nvpriv_t * ++nv_priv_alloc_embedded(nvpriv_t *priv) ++{ ++ nvpriv_t *emb_priv; ++ ++ if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL) ++ return (NULL); ++ ++ nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED); ++ ++ return (emb_priv); ++} ++ ++static void ++nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv) ++{ ++ nvl->nvl_version = NV_VERSION; ++ nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE); ++ nvl->nvl_priv = (uint64_t)(uintptr_t)priv; ++ nvl->nvl_flag = 0; ++ nvl->nvl_pad = 0; ++} ++ ++uint_t ++nvlist_nvflag(nvlist_t *nvl) ++{ ++ return (nvl->nvl_nvflag); ++} ++ ++/* ++ * nvlist_alloc - Allocate nvlist. ++ */ ++/*ARGSUSED1*/ ++int ++nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag) ++{ ++#if defined(_KERNEL) && !defined(_BOOT) ++ return (nvlist_xalloc(nvlp, nvflag, ++ (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); ++#else ++ return (nvlist_xalloc(nvlp, nvflag, nv_alloc_nosleep)); ++#endif ++} ++ ++int ++nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva) ++{ ++ nvpriv_t *priv; ++ ++ if (nvlp == NULL || nva == NULL) ++ return (EINVAL); ++ ++ if ((priv = nv_priv_alloc(nva)) == NULL) ++ return (ENOMEM); ++ ++ if ((*nvlp = nv_mem_zalloc(priv, ++ NV_ALIGN(sizeof (nvlist_t)))) == NULL) { ++ nv_mem_free(priv, priv, sizeof (nvpriv_t)); ++ return (ENOMEM); ++ } ++ ++ nvlist_init(*nvlp, nvflag, priv); ++ ++ return (0); ++} ++ ++/* ++ * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair. 
++ */ ++static nvpair_t * ++nvp_buf_alloc(nvlist_t *nvl, size_t len) ++{ ++ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; ++ i_nvp_t *buf; ++ nvpair_t *nvp; ++ size_t nvsize; ++ ++ /* ++ * Allocate the buffer ++ */ ++ nvsize = len + offsetof(i_nvp_t, nvi_nvp); ++ ++ if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL) ++ return (NULL); ++ ++ nvp = &buf->nvi_nvp; ++ nvp->nvp_size = len; ++ ++ return (nvp); ++} ++ ++/* ++ * nvp_buf_free - de-Allocate an i_nvp_t. ++ */ ++static void ++nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp) ++{ ++ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; ++ size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp); ++ ++ nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize); ++} ++ ++/* ++ * nvp_buf_link - link a new nv pair into the nvlist. ++ */ ++static void ++nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp) ++{ ++ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; ++ i_nvp_t *curr = NVPAIR2I_NVP(nvp); ++ ++ /* Put element at end of nvlist */ ++ if (priv->nvp_list == NULL) { ++ priv->nvp_list = priv->nvp_last = curr; ++ } else { ++ curr->nvi_prev = priv->nvp_last; ++ priv->nvp_last->nvi_next = curr; ++ priv->nvp_last = curr; ++ } ++} ++ ++/* ++ * nvp_buf_unlink - unlink an removed nvpair out of the nvlist. ++ */ ++static void ++nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp) ++{ ++ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; ++ i_nvp_t *curr = NVPAIR2I_NVP(nvp); ++ ++ /* ++ * protect nvlist_next_nvpair() against walking on freed memory. ++ */ ++ if (priv->nvp_curr == curr) ++ priv->nvp_curr = curr->nvi_next; ++ ++ if (curr == priv->nvp_list) ++ priv->nvp_list = curr->nvi_next; ++ else ++ curr->nvi_prev->nvi_next = curr->nvi_next; ++ ++ if (curr == priv->nvp_last) ++ priv->nvp_last = curr->nvi_prev; ++ else ++ curr->nvi_next->nvi_prev = curr->nvi_prev; ++} ++ ++/* ++ * take a nvpair type and number of elements and make sure the are valid ++ */ ++static int ++i_validate_type_nelem(data_type_t type, uint_t nelem) ++{ ++ switch (type) { ++ case DATA_TYPE_BOOLEAN: ++ if (nelem != 0) ++ return (EINVAL); ++ break; ++ case DATA_TYPE_BOOLEAN_VALUE: ++ case DATA_TYPE_BYTE: ++ case DATA_TYPE_INT8: ++ case DATA_TYPE_UINT8: ++ case DATA_TYPE_INT16: ++ case DATA_TYPE_UINT16: ++ case DATA_TYPE_INT32: ++ case DATA_TYPE_UINT32: ++ case DATA_TYPE_INT64: ++ case DATA_TYPE_UINT64: ++ case DATA_TYPE_STRING: ++ case DATA_TYPE_HRTIME: ++ case DATA_TYPE_NVLIST: ++#if !defined(_KERNEL) ++ case DATA_TYPE_DOUBLE: ++#endif ++ if (nelem != 1) ++ return (EINVAL); ++ break; ++ case DATA_TYPE_BOOLEAN_ARRAY: ++ case DATA_TYPE_BYTE_ARRAY: ++ case DATA_TYPE_INT8_ARRAY: ++ case DATA_TYPE_UINT8_ARRAY: ++ case DATA_TYPE_INT16_ARRAY: ++ case DATA_TYPE_UINT16_ARRAY: ++ case DATA_TYPE_INT32_ARRAY: ++ case DATA_TYPE_UINT32_ARRAY: ++ case DATA_TYPE_INT64_ARRAY: ++ case DATA_TYPE_UINT64_ARRAY: ++ case DATA_TYPE_STRING_ARRAY: ++ case DATA_TYPE_NVLIST_ARRAY: ++ /* we allow arrays with 0 elements */ ++ break; ++ default: ++ return (EINVAL); ++ } ++ return (0); ++} ++ ++/* ++ * Verify nvp_name_sz and check the name string length. ++ */ ++static int ++i_validate_nvpair_name(nvpair_t *nvp) ++{ ++ if ((nvp->nvp_name_sz <= 0) || ++ (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0))) ++ return (EFAULT); ++ ++ /* verify the name string, make sure its terminated */ ++ if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0') ++ return (EFAULT); ++ ++ return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 
0 : EFAULT); ++} ++ ++static int ++i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data) ++{ ++ switch (type) { ++ case DATA_TYPE_BOOLEAN_VALUE: ++ if (*(boolean_t *)data != B_TRUE && ++ *(boolean_t *)data != B_FALSE) ++ return (EINVAL); ++ break; ++ case DATA_TYPE_BOOLEAN_ARRAY: { ++ int i; ++ ++ for (i = 0; i < nelem; i++) ++ if (((boolean_t *)data)[i] != B_TRUE && ++ ((boolean_t *)data)[i] != B_FALSE) ++ return (EINVAL); ++ break; ++ } ++ default: ++ break; ++ } ++ ++ return (0); ++} ++ ++/* ++ * This function takes a pointer to what should be a nvpair and it's size ++ * and then verifies that all the nvpair fields make sense and can be ++ * trusted. This function is used when decoding packed nvpairs. ++ */ ++static int ++i_validate_nvpair(nvpair_t *nvp) ++{ ++ data_type_t type = NVP_TYPE(nvp); ++ int size1, size2; ++ ++ /* verify nvp_name_sz, check the name string length */ ++ if (i_validate_nvpair_name(nvp) != 0) ++ return (EFAULT); ++ ++ if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0) ++ return (EFAULT); ++ ++ /* ++ * verify nvp_type, nvp_value_elem, and also possibly ++ * verify string values and get the value size. ++ */ ++ size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp)); ++ size1 = nvp->nvp_size - NVP_VALOFF(nvp); ++ if (size2 < 0 || size1 != NV_ALIGN(size2)) ++ return (EFAULT); ++ ++ return (0); ++} ++ ++static int ++nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl) ++{ ++ nvpriv_t *priv; ++ i_nvp_t *curr; ++ ++ if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL) ++ return (EINVAL); ++ ++ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { ++ nvpair_t *nvp = &curr->nvi_nvp; ++ int err; ++ ++ if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp), NVP_TYPE(nvp), ++ NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0) ++ return (err); ++ } ++ ++ return (0); ++} ++ ++/* ++ * Frees all memory allocated for an nvpair (like embedded lists) with ++ * the exception of the nvpair buffer itself. 
++ */ ++static void ++nvpair_free(nvpair_t *nvp) ++{ ++ switch (NVP_TYPE(nvp)) { ++ case DATA_TYPE_NVLIST: ++ nvlist_free(EMBEDDED_NVL(nvp)); ++ break; ++ case DATA_TYPE_NVLIST_ARRAY: { ++ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); ++ int i; ++ ++ for (i = 0; i < NVP_NELEM(nvp); i++) ++ if (nvlp[i] != NULL) ++ nvlist_free(nvlp[i]); ++ break; ++ } ++ default: ++ break; ++ } ++} ++ ++/* ++ * nvlist_free - free an unpacked nvlist ++ */ ++void ++nvlist_free(nvlist_t *nvl) ++{ ++ nvpriv_t *priv; ++ i_nvp_t *curr; ++ ++ if (nvl == NULL || ++ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) ++ return; ++ ++ /* ++ * Unpacked nvlist are linked through i_nvp_t ++ */ ++ curr = priv->nvp_list; ++ while (curr != NULL) { ++ nvpair_t *nvp = &curr->nvi_nvp; ++ curr = curr->nvi_next; ++ ++ nvpair_free(nvp); ++ nvp_buf_free(nvl, nvp); ++ } ++ ++ if (!(priv->nvp_stat & NV_STAT_EMBEDDED)) ++ nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t))); ++ else ++ nvl->nvl_priv = 0; ++ ++ nv_mem_free(priv, priv, sizeof (nvpriv_t)); ++} ++ ++static int ++nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp) ++{ ++ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; ++ i_nvp_t *curr; ++ ++ if (nvp == NULL) ++ return (0); ++ ++ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) ++ if (&curr->nvi_nvp == nvp) ++ return (1); ++ ++ return (0); ++} ++ ++/* ++ * Make a copy of nvlist ++ */ ++/*ARGSUSED1*/ ++int ++nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag) ++{ ++#if defined(_KERNEL) && !defined(_BOOT) ++ return (nvlist_xdup(nvl, nvlp, ++ (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); ++#else ++ return (nvlist_xdup(nvl, nvlp, nv_alloc_nosleep)); ++#endif ++} ++ ++int ++nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva) ++{ ++ int err; ++ nvlist_t *ret; ++ ++ if (nvl == NULL || nvlp == NULL) ++ return (EINVAL); ++ ++ if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0) ++ return (err); ++ ++ if ((err = nvlist_copy_pairs(nvl, ret)) != 0) ++ nvlist_free(ret); ++ else ++ *nvlp = ret; ++ ++ return (err); ++} ++ ++/* ++ * Remove all with matching name ++ */ ++int ++nvlist_remove_all(nvlist_t *nvl, const char *name) ++{ ++ nvpriv_t *priv; ++ i_nvp_t *curr; ++ int error = ENOENT; ++ ++ if (nvl == NULL || name == NULL || ++ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) ++ return (EINVAL); ++ ++ curr = priv->nvp_list; ++ while (curr != NULL) { ++ nvpair_t *nvp = &curr->nvi_nvp; ++ ++ curr = curr->nvi_next; ++ if (strcmp(name, NVP_NAME(nvp)) != 0) ++ continue; ++ ++ nvp_buf_unlink(nvl, nvp); ++ nvpair_free(nvp); ++ nvp_buf_free(nvl, nvp); ++ ++ error = 0; ++ } ++ ++ return (error); ++} ++ ++/* ++ * Remove first one with matching name and type ++ */ ++int ++nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type) ++{ ++ nvpriv_t *priv; ++ i_nvp_t *curr; ++ ++ if (nvl == NULL || name == NULL || ++ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) ++ return (EINVAL); ++ ++ curr = priv->nvp_list; ++ while (curr != NULL) { ++ nvpair_t *nvp = &curr->nvi_nvp; ++ ++ if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) { ++ nvp_buf_unlink(nvl, nvp); ++ nvpair_free(nvp); ++ nvp_buf_free(nvl, nvp); ++ ++ return (0); ++ } ++ curr = curr->nvi_next; ++ } ++ ++ return (ENOENT); ++} ++ ++int ++nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) ++{ ++ if (nvl == NULL || nvp == NULL) ++ return (EINVAL); ++ ++ nvp_buf_unlink(nvl, nvp); ++ nvpair_free(nvp); ++ nvp_buf_free(nvl, nvp); ++ return (0); ++} ++ ++/* ++ * This function calculates the size of an nvpair value. 
++ * ++ * The data argument controls the behavior in case of the data types ++ * DATA_TYPE_STRING and ++ * DATA_TYPE_STRING_ARRAY ++ * Is data == NULL then the size of the string(s) is excluded. ++ */ ++static int ++i_get_value_size(data_type_t type, const void *data, uint_t nelem) ++{ ++ uint64_t value_sz; ++ ++ if (i_validate_type_nelem(type, nelem) != 0) ++ return (-1); ++ ++ /* Calculate required size for holding value */ ++ switch (type) { ++ case DATA_TYPE_BOOLEAN: ++ value_sz = 0; ++ break; ++ case DATA_TYPE_BOOLEAN_VALUE: ++ value_sz = sizeof (boolean_t); ++ break; ++ case DATA_TYPE_BYTE: ++ value_sz = sizeof (uchar_t); ++ break; ++ case DATA_TYPE_INT8: ++ value_sz = sizeof (int8_t); ++ break; ++ case DATA_TYPE_UINT8: ++ value_sz = sizeof (uint8_t); ++ break; ++ case DATA_TYPE_INT16: ++ value_sz = sizeof (int16_t); ++ break; ++ case DATA_TYPE_UINT16: ++ value_sz = sizeof (uint16_t); ++ break; ++ case DATA_TYPE_INT32: ++ value_sz = sizeof (int32_t); ++ break; ++ case DATA_TYPE_UINT32: ++ value_sz = sizeof (uint32_t); ++ break; ++ case DATA_TYPE_INT64: ++ value_sz = sizeof (int64_t); ++ break; ++ case DATA_TYPE_UINT64: ++ value_sz = sizeof (uint64_t); ++ break; ++#if !defined(_KERNEL) ++ case DATA_TYPE_DOUBLE: ++ value_sz = sizeof (double); ++ break; ++#endif ++ case DATA_TYPE_STRING: ++ if (data == NULL) ++ value_sz = 0; ++ else ++ value_sz = strlen(data) + 1; ++ break; ++ case DATA_TYPE_BOOLEAN_ARRAY: ++ value_sz = (uint64_t)nelem * sizeof (boolean_t); ++ break; ++ case DATA_TYPE_BYTE_ARRAY: ++ value_sz = (uint64_t)nelem * sizeof (uchar_t); ++ break; ++ case DATA_TYPE_INT8_ARRAY: ++ value_sz = (uint64_t)nelem * sizeof (int8_t); ++ break; ++ case DATA_TYPE_UINT8_ARRAY: ++ value_sz = (uint64_t)nelem * sizeof (uint8_t); ++ break; ++ case DATA_TYPE_INT16_ARRAY: ++ value_sz = (uint64_t)nelem * sizeof (int16_t); ++ break; ++ case DATA_TYPE_UINT16_ARRAY: ++ value_sz = (uint64_t)nelem * sizeof (uint16_t); ++ break; ++ case DATA_TYPE_INT32_ARRAY: ++ value_sz = (uint64_t)nelem * sizeof (int32_t); ++ break; ++ case DATA_TYPE_UINT32_ARRAY: ++ value_sz = (uint64_t)nelem * sizeof (uint32_t); ++ break; ++ case DATA_TYPE_INT64_ARRAY: ++ value_sz = (uint64_t)nelem * sizeof (int64_t); ++ break; ++ case DATA_TYPE_UINT64_ARRAY: ++ value_sz = (uint64_t)nelem * sizeof (uint64_t); ++ break; ++ case DATA_TYPE_STRING_ARRAY: ++ value_sz = (uint64_t)nelem * sizeof (uint64_t); ++ ++ if (data != NULL) { ++ char *const *strs = data; ++ uint_t i; ++ ++ /* no alignment requirement for strings */ ++ for (i = 0; i < nelem; i++) { ++ if (strs[i] == NULL) ++ return (-1); ++ value_sz += strlen(strs[i]) + 1; ++ } ++ } ++ break; ++ case DATA_TYPE_HRTIME: ++ value_sz = sizeof (hrtime_t); ++ break; ++ case DATA_TYPE_NVLIST: ++ value_sz = NV_ALIGN(sizeof (nvlist_t)); ++ break; ++ case DATA_TYPE_NVLIST_ARRAY: ++ value_sz = (uint64_t)nelem * sizeof (uint64_t) + ++ (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t)); ++ break; ++ default: ++ return (-1); ++ } ++ ++ return (value_sz > INT32_MAX ? 
-1 : (int)value_sz); ++} ++ ++static int ++nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl) ++{ ++ nvpriv_t *priv; ++ int err; ++ ++ if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t) ++ nvl->nvl_priv)) == NULL) ++ return (ENOMEM); ++ ++ nvlist_init(emb_nvl, onvl->nvl_nvflag, priv); ++ ++ if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) { ++ nvlist_free(emb_nvl); ++ emb_nvl->nvl_priv = 0; ++ } ++ ++ return (err); ++} ++ ++/* ++ * nvlist_add_common - Add new pair to nvlist ++ */ ++static int ++nvlist_add_common(nvlist_t *nvl, const char *name, ++ data_type_t type, uint_t nelem, const void *data) ++{ ++ nvpair_t *nvp; ++ uint_t i; ++ ++ int nvp_sz, name_sz, value_sz; ++ int err = 0; ++ ++ if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) ++ return (EINVAL); ++ ++ if (nelem != 0 && data == NULL) ++ return (EINVAL); ++ ++ /* ++ * Verify type and nelem and get the value size. ++ * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY ++ * is the size of the string(s) included. ++ */ ++ if ((value_sz = i_get_value_size(type, data, nelem)) < 0) ++ return (EINVAL); ++ ++ if (i_validate_nvpair_value(type, nelem, data) != 0) ++ return (EINVAL); ++ ++ /* ++ * If we're adding an nvlist or nvlist array, ensure that we are not ++ * adding the input nvlist to itself, which would cause recursion, ++ * and ensure that no NULL nvlist pointers are present. ++ */ ++ switch (type) { ++ case DATA_TYPE_NVLIST: ++ if (data == nvl || data == NULL) ++ return (EINVAL); ++ break; ++ case DATA_TYPE_NVLIST_ARRAY: { ++ nvlist_t **onvlp = (nvlist_t **)data; ++ for (i = 0; i < nelem; i++) { ++ if (onvlp[i] == nvl || onvlp[i] == NULL) ++ return (EINVAL); ++ } ++ break; ++ } ++ default: ++ break; ++ } ++ ++ /* calculate sizes of the nvpair elements and the nvpair itself */ ++ name_sz = strlen(name) + 1; ++ ++ nvp_sz = NVP_SIZE_CALC(name_sz, value_sz); ++ ++ if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL) ++ return (ENOMEM); ++ ++ ASSERT(nvp->nvp_size == nvp_sz); ++ nvp->nvp_name_sz = name_sz; ++ nvp->nvp_value_elem = nelem; ++ nvp->nvp_type = type; ++ bcopy(name, NVP_NAME(nvp), name_sz); ++ ++ switch (type) { ++ case DATA_TYPE_BOOLEAN: ++ break; ++ case DATA_TYPE_STRING_ARRAY: { ++ char *const *strs = data; ++ char *buf = NVP_VALUE(nvp); ++ char **cstrs = (void *)buf; ++ ++ /* skip pre-allocated space for pointer array */ ++ buf += nelem * sizeof (uint64_t); ++ for (i = 0; i < nelem; i++) { ++ int slen = strlen(strs[i]) + 1; ++ bcopy(strs[i], buf, slen); ++ cstrs[i] = buf; ++ buf += slen; ++ } ++ break; ++ } ++ case DATA_TYPE_NVLIST: { ++ nvlist_t *nnvl = EMBEDDED_NVL(nvp); ++ nvlist_t *onvl = (nvlist_t *)data; ++ ++ if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) { ++ nvp_buf_free(nvl, nvp); ++ return (err); ++ } ++ break; ++ } ++ case DATA_TYPE_NVLIST_ARRAY: { ++ nvlist_t **onvlp = (nvlist_t **)data; ++ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); ++ nvlist_t *embedded = (nvlist_t *) ++ ((uintptr_t)nvlp + nelem * sizeof (uint64_t)); ++ ++ for (i = 0; i < nelem; i++) { ++ if ((err = nvlist_copy_embedded(nvl, ++ onvlp[i], embedded)) != 0) { ++ /* ++ * Free any successfully created lists ++ */ ++ nvpair_free(nvp); ++ nvp_buf_free(nvl, nvp); ++ return (err); ++ } ++ ++ nvlp[i] = embedded++; ++ } ++ break; ++ } ++ default: ++ bcopy(data, NVP_VALUE(nvp), value_sz); ++ } ++ ++ /* if unique name, remove before add */ ++ if (nvl->nvl_nvflag & NV_UNIQUE_NAME) ++ (void) nvlist_remove_all(nvl, name); ++ else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE) ++ (void) nvlist_remove(nvl, 
name, type); ++ ++ nvp_buf_link(nvl, nvp); ++ ++ return (0); ++} ++ ++int ++nvlist_add_boolean(nvlist_t *nvl, const char *name) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL)); ++} ++ ++int ++nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val)); ++} ++ ++int ++nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val)); ++} ++ ++int ++nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val)); ++} ++ ++int ++nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val)); ++} ++ ++int ++nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val)); ++} ++ ++int ++nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val)); ++} ++ ++int ++nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val)); ++} ++ ++int ++nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val)); ++} ++ ++int ++nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val)); ++} ++ ++int ++nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val)); ++} ++ ++#if !defined(_KERNEL) ++int ++nvlist_add_double(nvlist_t *nvl, const char *name, double val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val)); ++} ++#endif ++ ++int ++nvlist_add_string(nvlist_t *nvl, const char *name, const char *val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val)); ++} ++ ++int ++nvlist_add_boolean_array(nvlist_t *nvl, const char *name, ++ boolean_t *a, uint_t n) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); ++} ++ ++int ++nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); ++} ++ ++int ++nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); ++} ++ ++int ++nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); ++} ++ ++int ++nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); ++} ++ ++int ++nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); ++} ++ ++int ++nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); ++} ++ ++int ++nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); ++} ++ ++int ++nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint_t n) ++{ ++ return 
(nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); ++} ++ ++int ++nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, uint_t n) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); ++} ++ ++int ++nvlist_add_string_array(nvlist_t *nvl, const char *name, ++ char *const *a, uint_t n) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); ++} ++ ++int ++nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val)); ++} ++ ++int ++nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val)); ++} ++ ++int ++nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n) ++{ ++ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); ++} ++ ++/* reading name-value pairs */ ++nvpair_t * ++nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp) ++{ ++ nvpriv_t *priv; ++ i_nvp_t *curr; ++ ++ if (nvl == NULL || ++ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) ++ return (NULL); ++ ++ curr = NVPAIR2I_NVP(nvp); ++ ++ /* ++ * Ensure that nvp is a valid nvpair on this nvlist. ++ * NB: nvp_curr is used only as a hint so that we don't always ++ * have to walk the list to determine if nvp is still on the list. ++ */ ++ if (nvp == NULL) ++ curr = priv->nvp_list; ++ else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) ++ curr = curr->nvi_next; ++ else ++ curr = NULL; ++ ++ priv->nvp_curr = curr; ++ ++ return (curr != NULL ? &curr->nvi_nvp : NULL); ++} ++ ++nvpair_t * ++nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp) ++{ ++ nvpriv_t *priv; ++ i_nvp_t *curr; ++ ++ if (nvl == NULL || ++ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) ++ return (NULL); ++ ++ curr = NVPAIR2I_NVP(nvp); ++ ++ if (nvp == NULL) ++ curr = priv->nvp_last; ++ else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) ++ curr = curr->nvi_prev; ++ else ++ curr = NULL; ++ ++ priv->nvp_curr = curr; ++ ++ return (curr != NULL ? &curr->nvi_nvp : NULL); ++} ++ ++boolean_t ++nvlist_empty(nvlist_t *nvl) ++{ ++ nvpriv_t *priv; ++ ++ if (nvl == NULL || ++ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) ++ return (B_TRUE); ++ ++ return (priv->nvp_list == NULL); ++} ++ ++char * ++nvpair_name(nvpair_t *nvp) ++{ ++ return (NVP_NAME(nvp)); ++} ++ ++data_type_t ++nvpair_type(nvpair_t *nvp) ++{ ++ return (NVP_TYPE(nvp)); ++} ++ ++int ++nvpair_type_is_array(nvpair_t *nvp) ++{ ++ data_type_t type = NVP_TYPE(nvp); ++ ++ if ((type == DATA_TYPE_BYTE_ARRAY) || ++ (type == DATA_TYPE_UINT8_ARRAY) || ++ (type == DATA_TYPE_INT16_ARRAY) || ++ (type == DATA_TYPE_UINT16_ARRAY) || ++ (type == DATA_TYPE_INT32_ARRAY) || ++ (type == DATA_TYPE_UINT32_ARRAY) || ++ (type == DATA_TYPE_INT64_ARRAY) || ++ (type == DATA_TYPE_UINT64_ARRAY) || ++ (type == DATA_TYPE_BOOLEAN_ARRAY) || ++ (type == DATA_TYPE_STRING_ARRAY) || ++ (type == DATA_TYPE_NVLIST_ARRAY)) ++ return (1); ++ return (0); ++ ++} ++ ++static int ++nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data) ++{ ++ if (nvp == NULL || nvpair_type(nvp) != type) ++ return (EINVAL); ++ ++ /* ++ * For non-array types, we copy the data. ++ * For array types (including string), we set a pointer. 
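The typed add wrappers above and the lookup wrappers that follow all funnel into nvlist_add_common() and nvlist_lookup_common(). A minimal consumer-side sketch of that API (illustrative names and values; kernel-style KM_SLEEP flag assumed, userland passes 0):

	nvlist_t *nvl = NULL;
	uint64_t guid = 0;

	/* NV_UNIQUE_NAME: adding "guid" again would replace the old pair */
	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0) {
		(void) nvlist_add_uint64(nvl, "guid", 0xdecafULL);
		(void) nvlist_add_string(nvl, "pool", "tank");
		if (nvlist_lookup_uint64(nvl, "guid", &guid) == 0)
			ASSERT(guid == 0xdecafULL);
		nvlist_free(nvl);
	}

Note that string and array lookups hand back pointers into the pair's own storage, so those pointers stay valid only as long as the nvlist itself.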
++ */ ++ switch (type) { ++ case DATA_TYPE_BOOLEAN: ++ if (nelem != NULL) ++ *nelem = 0; ++ break; ++ ++ case DATA_TYPE_BOOLEAN_VALUE: ++ case DATA_TYPE_BYTE: ++ case DATA_TYPE_INT8: ++ case DATA_TYPE_UINT8: ++ case DATA_TYPE_INT16: ++ case DATA_TYPE_UINT16: ++ case DATA_TYPE_INT32: ++ case DATA_TYPE_UINT32: ++ case DATA_TYPE_INT64: ++ case DATA_TYPE_UINT64: ++ case DATA_TYPE_HRTIME: ++#if !defined(_KERNEL) ++ case DATA_TYPE_DOUBLE: ++#endif ++ if (data == NULL) ++ return (EINVAL); ++ bcopy(NVP_VALUE(nvp), data, ++ (size_t)i_get_value_size(type, NULL, 1)); ++ if (nelem != NULL) ++ *nelem = 1; ++ break; ++ ++ case DATA_TYPE_NVLIST: ++ case DATA_TYPE_STRING: ++ if (data == NULL) ++ return (EINVAL); ++ *(void **)data = (void *)NVP_VALUE(nvp); ++ if (nelem != NULL) ++ *nelem = 1; ++ break; ++ ++ case DATA_TYPE_BOOLEAN_ARRAY: ++ case DATA_TYPE_BYTE_ARRAY: ++ case DATA_TYPE_INT8_ARRAY: ++ case DATA_TYPE_UINT8_ARRAY: ++ case DATA_TYPE_INT16_ARRAY: ++ case DATA_TYPE_UINT16_ARRAY: ++ case DATA_TYPE_INT32_ARRAY: ++ case DATA_TYPE_UINT32_ARRAY: ++ case DATA_TYPE_INT64_ARRAY: ++ case DATA_TYPE_UINT64_ARRAY: ++ case DATA_TYPE_STRING_ARRAY: ++ case DATA_TYPE_NVLIST_ARRAY: ++ if (nelem == NULL || data == NULL) ++ return (EINVAL); ++ if ((*nelem = NVP_NELEM(nvp)) != 0) ++ *(void **)data = (void *)NVP_VALUE(nvp); ++ else ++ *(void **)data = NULL; ++ break; ++ ++ default: ++ return (ENOTSUP); ++ } ++ ++ return (0); ++} ++ ++static int ++nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type, ++ uint_t *nelem, void *data) ++{ ++ nvpriv_t *priv; ++ nvpair_t *nvp; ++ i_nvp_t *curr; ++ ++ if (name == NULL || nvl == NULL || ++ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) ++ return (EINVAL); ++ ++ if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE))) ++ return (ENOTSUP); ++ ++ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { ++ nvp = &curr->nvi_nvp; ++ ++ if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) ++ return (nvpair_value_common(nvp, type, nelem, data)); ++ } ++ ++ return (ENOENT); ++} ++ ++int ++nvlist_lookup_boolean(nvlist_t *nvl, const char *name) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL)); ++} ++ ++int ++nvlist_lookup_boolean_value(nvlist_t *nvl, const char *name, boolean_t *val) ++{ ++ return (nvlist_lookup_common(nvl, name, ++ DATA_TYPE_BOOLEAN_VALUE, NULL, val)); ++} ++ ++int ++nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val)); ++} ++ ++int ++nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val)); ++} ++ ++int ++nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val)); ++} ++ ++int ++nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val)); ++} ++ ++int ++nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val)); ++} ++ ++int ++nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val)); ++} ++ ++int ++nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val)); ++} ++ ++int ++nvlist_lookup_int64(nvlist_t 
*nvl, const char *name, int64_t *val) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val)); ++} ++ ++int ++nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val)); ++} ++ ++#if !defined(_KERNEL) ++int ++nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val)); ++} ++#endif ++ ++int ++nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val)); ++} ++ ++int ++nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val)); ++} ++ ++int ++nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name, ++ boolean_t **a, uint_t *n) ++{ ++ return (nvlist_lookup_common(nvl, name, ++ DATA_TYPE_BOOLEAN_ARRAY, n, a)); ++} ++ ++int ++nvlist_lookup_byte_array(nvlist_t *nvl, const char *name, ++ uchar_t **a, uint_t *n) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); ++} ++ ++int ++nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); ++} ++ ++int ++nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name, ++ uint8_t **a, uint_t *n) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); ++} ++ ++int ++nvlist_lookup_int16_array(nvlist_t *nvl, const char *name, ++ int16_t **a, uint_t *n) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); ++} ++ ++int ++nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name, ++ uint16_t **a, uint_t *n) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); ++} ++ ++int ++nvlist_lookup_int32_array(nvlist_t *nvl, const char *name, ++ int32_t **a, uint_t *n) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); ++} ++ ++int ++nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name, ++ uint32_t **a, uint_t *n) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); ++} ++ ++int ++nvlist_lookup_int64_array(nvlist_t *nvl, const char *name, ++ int64_t **a, uint_t *n) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); ++} ++ ++int ++nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name, ++ uint64_t **a, uint_t *n) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); ++} ++ ++int ++nvlist_lookup_string_array(nvlist_t *nvl, const char *name, ++ char ***a, uint_t *n) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); ++} ++ ++int ++nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name, ++ nvlist_t ***a, uint_t *n) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); ++} ++ ++int ++nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val) ++{ ++ return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val)); ++} ++ ++int ++nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...) ++{ ++ va_list ap; ++ char *name; ++ int noentok = (flag & NV_FLAG_NOENTOK ? 
1 : 0); ++ int ret = 0; ++ ++ va_start(ap, flag); ++ while (ret == 0 && (name = va_arg(ap, char *)) != NULL) { ++ data_type_t type; ++ void *val; ++ uint_t *nelem; ++ ++ switch (type = va_arg(ap, data_type_t)) { ++ case DATA_TYPE_BOOLEAN: ++ ret = nvlist_lookup_common(nvl, name, type, NULL, NULL); ++ break; ++ ++ case DATA_TYPE_BOOLEAN_VALUE: ++ case DATA_TYPE_BYTE: ++ case DATA_TYPE_INT8: ++ case DATA_TYPE_UINT8: ++ case DATA_TYPE_INT16: ++ case DATA_TYPE_UINT16: ++ case DATA_TYPE_INT32: ++ case DATA_TYPE_UINT32: ++ case DATA_TYPE_INT64: ++ case DATA_TYPE_UINT64: ++ case DATA_TYPE_HRTIME: ++ case DATA_TYPE_STRING: ++ case DATA_TYPE_NVLIST: ++#if !defined(_KERNEL) ++ case DATA_TYPE_DOUBLE: ++#endif ++ val = va_arg(ap, void *); ++ ret = nvlist_lookup_common(nvl, name, type, NULL, val); ++ break; ++ ++ case DATA_TYPE_BYTE_ARRAY: ++ case DATA_TYPE_BOOLEAN_ARRAY: ++ case DATA_TYPE_INT8_ARRAY: ++ case DATA_TYPE_UINT8_ARRAY: ++ case DATA_TYPE_INT16_ARRAY: ++ case DATA_TYPE_UINT16_ARRAY: ++ case DATA_TYPE_INT32_ARRAY: ++ case DATA_TYPE_UINT32_ARRAY: ++ case DATA_TYPE_INT64_ARRAY: ++ case DATA_TYPE_UINT64_ARRAY: ++ case DATA_TYPE_STRING_ARRAY: ++ case DATA_TYPE_NVLIST_ARRAY: ++ val = va_arg(ap, void *); ++ nelem = va_arg(ap, uint_t *); ++ ret = nvlist_lookup_common(nvl, name, type, nelem, val); ++ break; ++ ++ default: ++ ret = EINVAL; ++ } ++ ++ if (ret == ENOENT && noentok) ++ ret = 0; ++ } ++ va_end(ap); ++ ++ return (ret); ++} ++ ++/* ++ * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' found, the function ++ * returns zero and a pointer to the matching nvpair is returned in '*ret' ++ * (given 'ret' is non-NULL). If 'sep' is specified then 'name' will penitrate ++ * multiple levels of embedded nvlists, with 'sep' as the separator. As an ++ * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or ++ * "a.d[3].e[1]". This matches the C syntax for array embed (for convience, ++ * code also supports "a.d[3]e[1]" syntax). ++ * ++ * If 'ip' is non-NULL and the last name component is an array, return the ++ * value of the "...[index]" array index in *ip. For an array reference that ++ * is not indexed, *ip will be returned as -1. If there is a syntax error in ++ * 'name', and 'ep' is non-NULL then *ep will be set to point to the location ++ * inside the 'name' string where the syntax error was detected. ++ */ ++static int ++nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep, ++ nvpair_t **ret, int *ip, char **ep) ++{ ++ nvpair_t *nvp; ++ const char *np; ++ char *sepp=NULL; ++ char *idxp, *idxep; ++ nvlist_t **nva; ++ long idx = 0; ++ int n; ++ ++ if (ip) ++ *ip = -1; /* not indexed */ ++ if (ep) ++ *ep = NULL; ++ ++ if ((nvl == NULL) || (name == NULL)) ++ return (EINVAL); ++ ++ /* step through components of name */ ++ for (np = name; np && *np; np = sepp) { ++ /* ensure unique names */ ++ if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME)) ++ return (ENOTSUP); ++ ++ /* skip white space */ ++ skip_whitespace(np); ++ if (*np == 0) ++ break; ++ ++ /* set 'sepp' to end of current component 'np' */ ++ if (sep) ++ sepp = strchr(np, sep); ++ else ++ sepp = NULL; ++ ++ /* find start of next "[ index ]..." */ ++ idxp = strchr(np, '['); ++ ++ /* if sepp comes first, set idxp to NULL */ ++ if (sepp && idxp && (sepp < idxp)) ++ idxp = NULL; ++ ++ /* ++ * At this point 'idxp' is set if there is an index ++ * expected for the current component. 
++ */ ++ if (idxp) { ++ /* set 'n' to length of current 'np' name component */ ++ n = idxp++ - np; ++ ++ /* keep sepp up to date for *ep use as we advance */ ++ skip_whitespace(idxp); ++ sepp = idxp; ++ ++ /* determine the index value */ ++#if defined(_KERNEL) && !defined(_BOOT) ++ if (ddi_strtol(idxp, &idxep, 0, &idx)) ++ goto fail; ++#else ++ idx = strtol(idxp, &idxep, 0); ++#endif ++ if (idxep == idxp) ++ goto fail; ++ ++ /* keep sepp up to date for *ep use as we advance */ ++ sepp = idxep; ++ ++ /* skip white space index value and check for ']' */ ++ skip_whitespace(sepp); ++ if (*sepp++ != ']') ++ goto fail; ++ ++ /* for embedded arrays, support C syntax: "a[1].b" */ ++ skip_whitespace(sepp); ++ if (sep && (*sepp == sep)) ++ sepp++; ++ } else if (sepp) { ++ n = sepp++ - np; ++ } else { ++ n = strlen(np); ++ } ++ ++ /* trim trailing whitespace by reducing length of 'np' */ ++ if (n == 0) ++ goto fail; ++ for (n--; (np[n] == ' ') || (np[n] == '\t'); n--) ++ ; ++ n++; ++ ++ /* skip whitespace, and set sepp to NULL if complete */ ++ if (sepp) { ++ skip_whitespace(sepp); ++ if (*sepp == 0) ++ sepp = NULL; ++ } ++ ++ /* ++ * At this point: ++ * o 'n' is the length of current 'np' component. ++ * o 'idxp' is set if there was an index, and value 'idx'. ++ * o 'sepp' is set to the beginning of the next component, ++ * and set to NULL if we have no more components. ++ * ++ * Search for nvpair with matching component name. ++ */ ++ for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; ++ nvp = nvlist_next_nvpair(nvl, nvp)) { ++ ++ /* continue if no match on name */ ++ if (strncmp(np, nvpair_name(nvp), n) || ++ (strlen(nvpair_name(nvp)) != n)) ++ continue; ++ ++ /* if indexed, verify type is array oriented */ ++ if (idxp && !nvpair_type_is_array(nvp)) ++ goto fail; ++ ++ /* ++ * Full match found, return nvp and idx if this ++ * was the last component. ++ */ ++ if (sepp == NULL) { ++ if (ret) ++ *ret = nvp; ++ if (ip && idxp) ++ *ip = (int)idx; /* return index */ ++ return (0); /* found */ ++ } ++ ++ /* ++ * More components: current match must be ++ * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY ++ * to support going deeper. ++ */ ++ if (nvpair_type(nvp) == DATA_TYPE_NVLIST) { ++ nvl = EMBEDDED_NVL(nvp); ++ break; ++ } else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) { ++ (void) nvpair_value_nvlist_array(nvp, ++ &nva, (uint_t *)&n); ++ if ((n < 0) || (idx >= n)) ++ goto fail; ++ nvl = nva[idx]; ++ break; ++ } ++ ++ /* type does not support more levels */ ++ goto fail; ++ } ++ if (nvp == NULL) ++ goto fail; /* 'name' not found */ ++ ++ /* search for match of next component in embedded 'nvl' list */ ++ } ++ ++fail: if (ep && sepp) ++ *ep = sepp; ++ return (EINVAL); ++} ++ ++/* ++ * Return pointer to nvpair with specified 'name'. ++ */ ++int ++nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret) ++{ ++ return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL)); ++} ++ ++/* ++ * Determine if named nvpair exists in nvlist (use embedded separator of '.' ++ * and return array index). See nvlist_lookup_nvpair_ei_sep for more detailed ++ * description. 
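For example, if "a" names an embedded nvlist whose pair "d" is an nvlist array and d[3] in turn holds an array-typed pair "e", the wrapper defined just below resolves the whole dotted name in one call (sketch; the layout of 'nvl' is assumed):

	nvpair_t *nvp = NULL;
	int idx = -1;
	char *ep = NULL;

	/* descends a -> d[3]; returns the "e" pair and its index, 1 */
	if (nvlist_lookup_nvpair_embedded_index(nvl, "a.d[3].e[1]",
	    &nvp, &idx, &ep) == 0)
		ASSERT(strcmp(nvpair_name(nvp), "e") == 0 && idx == 1);

On a syntax error the call returns EINVAL and, when possible, 'ep' points at the offending location inside the name string.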
++ */ ++int nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl, ++ const char *name, nvpair_t **ret, int *ip, char **ep) ++{ ++ return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep)); ++} ++ ++boolean_t ++nvlist_exists(nvlist_t *nvl, const char *name) ++{ ++ nvpriv_t *priv; ++ nvpair_t *nvp; ++ i_nvp_t *curr; ++ ++ if (name == NULL || nvl == NULL || ++ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) ++ return (B_FALSE); ++ ++ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { ++ nvp = &curr->nvi_nvp; ++ ++ if (strcmp(name, NVP_NAME(nvp)) == 0) ++ return (B_TRUE); ++ } ++ ++ return (B_FALSE); ++} ++ ++int ++nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val)); ++} ++ ++int ++nvpair_value_byte(nvpair_t *nvp, uchar_t *val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val)); ++} ++ ++int ++nvpair_value_int8(nvpair_t *nvp, int8_t *val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val)); ++} ++ ++int ++nvpair_value_uint8(nvpair_t *nvp, uint8_t *val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val)); ++} ++ ++int ++nvpair_value_int16(nvpair_t *nvp, int16_t *val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val)); ++} ++ ++int ++nvpair_value_uint16(nvpair_t *nvp, uint16_t *val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val)); ++} ++ ++int ++nvpair_value_int32(nvpair_t *nvp, int32_t *val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val)); ++} ++ ++int ++nvpair_value_uint32(nvpair_t *nvp, uint32_t *val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val)); ++} ++ ++int ++nvpair_value_int64(nvpair_t *nvp, int64_t *val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val)); ++} ++ ++int ++nvpair_value_uint64(nvpair_t *nvp, uint64_t *val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val)); ++} ++ ++#if !defined(_KERNEL) ++int ++nvpair_value_double(nvpair_t *nvp, double *val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val)); ++} ++#endif ++ ++int ++nvpair_value_string(nvpair_t *nvp, char **val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val)); ++} ++ ++int ++nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val)); ++} ++ ++int ++nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val)); ++} ++ ++int ++nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val)); ++} ++ ++int ++nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val)); ++} ++ ++int ++nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val)); ++} ++ ++int ++nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val)); ++} ++ ++int ++nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val)); ++} ++ ++int ++nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, 
val)); ++} ++ ++int ++nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val)); ++} ++ ++int ++nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val)); ++} ++ ++int ++nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val)); ++} ++ ++int ++nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val)); ++} ++ ++int ++nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val)); ++} ++ ++int ++nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val) ++{ ++ return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val)); ++} ++ ++/* ++ * Add specified pair to the list. ++ */ ++int ++nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) ++{ ++ if (nvl == NULL || nvp == NULL) ++ return (EINVAL); ++ ++ return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp), ++ NVP_NELEM(nvp), NVP_VALUE(nvp))); ++} ++ ++/* ++ * Merge the supplied nvlists and put the result in dst. ++ * The merged list will contain all names specified in both lists, ++ * the values are taken from nvl in the case of duplicates. ++ * Return 0 on success. ++ */ ++/*ARGSUSED*/ ++int ++nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag) ++{ ++ if (nvl == NULL || dst == NULL) ++ return (EINVAL); ++ ++ if (dst != nvl) ++ return (nvlist_copy_pairs(nvl, dst)); ++ ++ return (0); ++} ++ ++/* ++ * Encoding related routines ++ */ ++#define NVS_OP_ENCODE 0 ++#define NVS_OP_DECODE 1 ++#define NVS_OP_GETSIZE 2 ++ ++typedef struct nvs_ops nvs_ops_t; ++ ++typedef struct { ++ int nvs_op; ++ const nvs_ops_t *nvs_ops; ++ void *nvs_private; ++ nvpriv_t *nvs_priv; ++} nvstream_t; ++ ++/* ++ * nvs operations are: ++ * - nvs_nvlist ++ * encoding / decoding of a nvlist header (nvlist_t) ++ * calculates the size used for header and end detection ++ * ++ * - nvs_nvpair ++ * responsible for the first part of encoding / decoding of an nvpair ++ * calculates the decoded size of an nvpair ++ * ++ * - nvs_nvp_op ++ * second part of encoding / decoding of an nvpair ++ * ++ * - nvs_nvp_size ++ * calculates the encoding size of an nvpair ++ * ++ * - nvs_nvl_fini ++ * encodes the end detection mark (zeros). 
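These callbacks drive nvlist_size(), nvlist_pack() and nvlist_unpack(), defined further below. A sketch of the usual round trip through a caller-supplied buffer (kernel-style kmem allocation assumed; userland code would malloc/free instead):

	size_t len = 0;
	char *buf;
	nvlist_t *copy = NULL;

	/* NVS_OP_GETSIZE, then NVS_OP_ENCODE into our own buffer */
	if (nvlist_size(nvl, &len, NV_ENCODE_NATIVE) == 0) {
		buf = kmem_alloc(len, KM_SLEEP);
		if (nvlist_pack(nvl, &buf, &len, NV_ENCODE_NATIVE, KM_SLEEP) == 0 &&
		    nvlist_unpack(buf, len, &copy, KM_SLEEP) == 0)	/* NVS_OP_DECODE */
			nvlist_free(copy);
		kmem_free(buf, len);
	}

Passing a preallocated *bufp skips the allocator bookkeeping in nvlist_xpack(); passing a NULL *bufp instead lets nvlist_pack() size and allocate the buffer itself.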
++ */ ++struct nvs_ops { ++ int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *); ++ int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *); ++ int (*nvs_nvp_op)(nvstream_t *, nvpair_t *); ++ int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *); ++ int (*nvs_nvl_fini)(nvstream_t *); ++}; ++ ++typedef struct { ++ char nvh_encoding; /* nvs encoding method */ ++ char nvh_endian; /* nvs endian */ ++ char nvh_reserved1; /* reserved for future use */ ++ char nvh_reserved2; /* reserved for future use */ ++} nvs_header_t; ++ ++static int ++nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl) ++{ ++ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; ++ i_nvp_t *curr; ++ ++ /* ++ * Walk nvpair in list and encode each nvpair ++ */ ++ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) ++ if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0) ++ return (EFAULT); ++ ++ return (nvs->nvs_ops->nvs_nvl_fini(nvs)); ++} ++ ++static int ++nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl) ++{ ++ nvpair_t *nvp; ++ size_t nvsize; ++ int err; ++ ++ /* ++ * Get decoded size of next pair in stream, alloc ++ * memory for nvpair_t, then decode the nvpair ++ */ ++ while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) { ++ if (nvsize == 0) /* end of list */ ++ break; ++ ++ /* make sure len makes sense */ ++ if (nvsize < NVP_SIZE_CALC(1, 0)) ++ return (EFAULT); ++ ++ if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL) ++ return (ENOMEM); ++ ++ if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) { ++ nvp_buf_free(nvl, nvp); ++ return (err); ++ } ++ ++ if (i_validate_nvpair(nvp) != 0) { ++ nvpair_free(nvp); ++ nvp_buf_free(nvl, nvp); ++ return (EFAULT); ++ } ++ ++ nvp_buf_link(nvl, nvp); ++ } ++ return (err); ++} ++ ++static int ++nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) ++{ ++ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; ++ i_nvp_t *curr; ++ uint64_t nvsize = *buflen; ++ size_t size; ++ ++ /* ++ * Get encoded size of nvpairs in nvlist ++ */ ++ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { ++ if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0) ++ return (EINVAL); ++ ++ if ((nvsize += size) > INT32_MAX) ++ return (EINVAL); ++ } ++ ++ *buflen = nvsize; ++ return (0); ++} ++ ++static int ++nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) ++{ ++ int err; ++ ++ if (nvl->nvl_priv == 0) ++ return (EFAULT); ++ ++ /* ++ * Perform the operation, starting with header, then each nvpair ++ */ ++ if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0) ++ return (err); ++ ++ switch (nvs->nvs_op) { ++ case NVS_OP_ENCODE: ++ err = nvs_encode_pairs(nvs, nvl); ++ break; ++ ++ case NVS_OP_DECODE: ++ err = nvs_decode_pairs(nvs, nvl); ++ break; ++ ++ case NVS_OP_GETSIZE: ++ err = nvs_getsize_pairs(nvs, nvl, buflen); ++ break; ++ ++ default: ++ err = EINVAL; ++ } ++ ++ return (err); ++} ++ ++static int ++nvs_embedded(nvstream_t *nvs, nvlist_t *embedded) ++{ ++ switch (nvs->nvs_op) { ++ case NVS_OP_ENCODE: ++ return (nvs_operation(nvs, embedded, NULL)); ++ ++ case NVS_OP_DECODE: { ++ nvpriv_t *priv; ++ int err; ++ ++ if (embedded->nvl_version != NV_VERSION) ++ return (ENOTSUP); ++ ++ if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL) ++ return (ENOMEM); ++ ++ nvlist_init(embedded, embedded->nvl_nvflag, priv); ++ ++ if ((err = nvs_operation(nvs, embedded, NULL)) != 0) ++ nvlist_free(embedded); ++ return (err); ++ } ++ default: ++ break; ++ } ++ ++ return (EINVAL); ++} ++ ++static int ++nvs_embedded_nvl_array(nvstream_t 
*nvs, nvpair_t *nvp, size_t *size) ++{ ++ size_t nelem = NVP_NELEM(nvp); ++ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); ++ int i; ++ ++ switch (nvs->nvs_op) { ++ case NVS_OP_ENCODE: ++ for (i = 0; i < nelem; i++) ++ if (nvs_embedded(nvs, nvlp[i]) != 0) ++ return (EFAULT); ++ break; ++ ++ case NVS_OP_DECODE: { ++ size_t len = nelem * sizeof (uint64_t); ++ nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len); ++ ++ bzero(nvlp, len); /* don't trust packed data */ ++ for (i = 0; i < nelem; i++) { ++ if (nvs_embedded(nvs, embedded) != 0) { ++ nvpair_free(nvp); ++ return (EFAULT); ++ } ++ ++ nvlp[i] = embedded++; ++ } ++ break; ++ } ++ case NVS_OP_GETSIZE: { ++ uint64_t nvsize = 0; ++ ++ for (i = 0; i < nelem; i++) { ++ size_t nvp_sz = 0; ++ ++ if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0) ++ return (EINVAL); ++ ++ if ((nvsize += nvp_sz) > INT32_MAX) ++ return (EINVAL); ++ } ++ ++ *size = nvsize; ++ break; ++ } ++ default: ++ return (EINVAL); ++ } ++ ++ return (0); ++} ++ ++static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *); ++static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *); ++ ++/* ++ * Common routine for nvlist operations: ++ * encode, decode, getsize (encoded size). ++ */ ++static int ++nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, ++ int nvs_op) ++{ ++ int err = 0; ++ nvstream_t nvs; ++ int nvl_endian; ++#ifdef _LITTLE_ENDIAN ++ int host_endian = 1; ++#else ++ int host_endian = 0; ++#endif /* _LITTLE_ENDIAN */ ++ nvs_header_t *nvh = (void *)buf; ++ ++ if (buflen == NULL || nvl == NULL || ++ (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) ++ return (EINVAL); ++ ++ nvs.nvs_op = nvs_op; ++ ++ /* ++ * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and ++ * a buffer is allocated. The first 4 bytes in the buffer are ++ * used for encoding method and host endian. ++ */ ++ switch (nvs_op) { ++ case NVS_OP_ENCODE: ++ if (buf == NULL || *buflen < sizeof (nvs_header_t)) ++ return (EINVAL); ++ ++ nvh->nvh_encoding = encoding; ++ nvh->nvh_endian = nvl_endian = host_endian; ++ nvh->nvh_reserved1 = 0; ++ nvh->nvh_reserved2 = 0; ++ break; ++ ++ case NVS_OP_DECODE: ++ if (buf == NULL || *buflen < sizeof (nvs_header_t)) ++ return (EINVAL); ++ ++ /* get method of encoding from first byte */ ++ encoding = nvh->nvh_encoding; ++ nvl_endian = nvh->nvh_endian; ++ break; ++ ++ case NVS_OP_GETSIZE: ++ nvl_endian = host_endian; ++ ++ /* ++ * add the size for encoding ++ */ ++ *buflen = sizeof (nvs_header_t); ++ break; ++ ++ default: ++ return (ENOTSUP); ++ } ++ ++ /* ++ * Create an nvstream with proper encoding method ++ */ ++ switch (encoding) { ++ case NV_ENCODE_NATIVE: ++ /* ++ * check endianness, in case we are unpacking ++ * from a file ++ */ ++ if (nvl_endian != host_endian) ++ return (ENOTSUP); ++ err = nvs_native(&nvs, nvl, buf, buflen); ++ break; ++ case NV_ENCODE_XDR: ++ err = nvs_xdr(&nvs, nvl, buf, buflen); ++ break; ++ default: ++ err = ENOTSUP; ++ break; ++ } ++ ++ return (err); ++} ++ ++int ++nvlist_size(nvlist_t *nvl, size_t *size, int encoding) ++{ ++ return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE)); ++} ++ ++/* ++ * Pack nvlist into contiguous memory ++ */ ++/*ARGSUSED1*/ ++int ++nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, ++ int kmflag) ++{ ++#if defined(_KERNEL) && !defined(_BOOT) ++ return (nvlist_xpack(nvl, bufp, buflen, encoding, ++ (kmflag == KM_SLEEP ? 
nv_alloc_sleep : nv_alloc_nosleep))); ++#else ++ return (nvlist_xpack(nvl, bufp, buflen, encoding, nv_alloc_nosleep)); ++#endif ++} ++ ++int ++nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, ++ nv_alloc_t *nva) ++{ ++ nvpriv_t nvpriv; ++ size_t alloc_size; ++ char *buf; ++ int err; ++ ++ if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL) ++ return (EINVAL); ++ ++ if (*bufp != NULL) ++ return (nvlist_common(nvl, *bufp, buflen, encoding, ++ NVS_OP_ENCODE)); ++ ++ /* ++ * Here is a difficult situation: ++ * 1. The nvlist has fixed allocator properties. ++ * All other nvlist routines (like nvlist_add_*, ...) use ++ * these properties. ++ * 2. When using nvlist_pack() the user can specify his own ++ * allocator properties (e.g. by using KM_NOSLEEP). ++ * ++ * We use the user specified properties (2). A clearer solution ++ * will be to remove the kmflag from nvlist_pack(), but we will ++ * not change the interface. ++ */ ++ nv_priv_init(&nvpriv, nva, 0); ++ ++ if ((err = nvlist_size(nvl, &alloc_size, encoding))) ++ return (err); ++ ++ if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL) ++ return (ENOMEM); ++ ++ if ((err = nvlist_common(nvl, buf, &alloc_size, encoding, ++ NVS_OP_ENCODE)) != 0) { ++ nv_mem_free(&nvpriv, buf, alloc_size); ++ } else { ++ *buflen = alloc_size; ++ *bufp = buf; ++ } ++ ++ return (err); ++} ++ ++/* ++ * Unpack buf into an nvlist_t ++ */ ++/*ARGSUSED1*/ ++int ++nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag) ++{ ++#if defined(_KERNEL) && !defined(_BOOT) ++ return (nvlist_xunpack(buf, buflen, nvlp, ++ (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); ++#else ++ return (nvlist_xunpack(buf, buflen, nvlp, nv_alloc_nosleep)); ++#endif ++} ++ ++int ++nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva) ++{ ++ nvlist_t *nvl; ++ int err; ++ ++ if (nvlp == NULL) ++ return (EINVAL); ++ ++ if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0) ++ return (err); ++ ++ if ((err = nvlist_common(nvl, buf, &buflen, 0, NVS_OP_DECODE)) != 0) ++ nvlist_free(nvl); ++ else ++ *nvlp = nvl; ++ ++ return (err); ++} ++ ++/* ++ * Native encoding functions ++ */ ++typedef struct { ++ /* ++ * This structure is used when decoding a packed nvpair in ++ * the native format. n_base points to a buffer containing the ++ * packed nvpair. n_end is a pointer to the end of the buffer. ++ * (n_end actually points to the first byte past the end of the ++ * buffer.) n_curr is a pointer that lies between n_base and n_end. ++ * It points to the current data that we are decoding. ++ * The amount of data left in the buffer is equal to n_end - n_curr. ++ * n_flag is used to recognize a packed embedded list. 
++ */ ++ caddr_t n_base; ++ caddr_t n_end; ++ caddr_t n_curr; ++ uint_t n_flag; ++} nvs_native_t; ++ ++static int ++nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf, ++ size_t buflen) ++{ ++ switch (nvs->nvs_op) { ++ case NVS_OP_ENCODE: ++ case NVS_OP_DECODE: ++ nvs->nvs_private = native; ++ native->n_curr = native->n_base = buf; ++ native->n_end = buf + buflen; ++ native->n_flag = 0; ++ return (0); ++ ++ case NVS_OP_GETSIZE: ++ nvs->nvs_private = native; ++ native->n_curr = native->n_base = native->n_end = NULL; ++ native->n_flag = 0; ++ return (0); ++ default: ++ return (EINVAL); ++ } ++} ++ ++/*ARGSUSED*/ ++static void ++nvs_native_destroy(nvstream_t *nvs) ++{ ++} ++ ++static int ++native_cp(nvstream_t *nvs, void *buf, size_t size) ++{ ++ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; ++ ++ if (native->n_curr + size > native->n_end) ++ return (EFAULT); ++ ++ /* ++ * The bcopy() below eliminates alignment requirement ++ * on the buffer (stream) and is preferred over direct access. ++ */ ++ switch (nvs->nvs_op) { ++ case NVS_OP_ENCODE: ++ bcopy(buf, native->n_curr, size); ++ break; ++ case NVS_OP_DECODE: ++ bcopy(native->n_curr, buf, size); ++ break; ++ default: ++ return (EINVAL); ++ } ++ ++ native->n_curr += size; ++ return (0); ++} ++ ++/* ++ * operate on nvlist_t header ++ */ ++static int ++nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) ++{ ++ nvs_native_t *native = nvs->nvs_private; ++ ++ switch (nvs->nvs_op) { ++ case NVS_OP_ENCODE: ++ case NVS_OP_DECODE: ++ if (native->n_flag) ++ return (0); /* packed embedded list */ ++ ++ native->n_flag = 1; ++ ++ /* copy version and nvflag of the nvlist_t */ ++ if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 || ++ native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0) ++ return (EFAULT); ++ ++ return (0); ++ ++ case NVS_OP_GETSIZE: ++ /* ++ * if calculate for packed embedded list ++ * 4 for end of the embedded list ++ * else ++ * 2 * sizeof (int32_t) for nvl_version and nvl_nvflag ++ * and 4 for end of the entire list ++ */ ++ if (native->n_flag) { ++ *size += 4; ++ } else { ++ native->n_flag = 1; ++ *size += 2 * sizeof (int32_t) + 4; ++ } ++ ++ return (0); ++ ++ default: ++ return (EINVAL); ++ } ++} ++ ++static int ++nvs_native_nvl_fini(nvstream_t *nvs) ++{ ++ if (nvs->nvs_op == NVS_OP_ENCODE) { ++ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; ++ /* ++ * Add 4 zero bytes at end of nvlist. They are used ++ * for end detection by the decode routine. ++ */ ++ if (native->n_curr + sizeof (int) > native->n_end) ++ return (EFAULT); ++ ++ bzero(native->n_curr, sizeof (int)); ++ native->n_curr += sizeof (int); ++ } ++ ++ return (0); ++} ++ ++static int ++nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp) ++{ ++ if (nvs->nvs_op == NVS_OP_ENCODE) { ++ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; ++ nvlist_t *packed = (void *) ++ (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); ++ /* ++ * Null out the pointer that is meaningless in the packed ++ * structure. The address may not be aligned, so we have ++ * to use bzero. 
++ */ ++ bzero(&packed->nvl_priv, sizeof (packed->nvl_priv)); ++ } ++ ++ return (nvs_embedded(nvs, EMBEDDED_NVL(nvp))); ++} ++ ++static int ++nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp) ++{ ++ if (nvs->nvs_op == NVS_OP_ENCODE) { ++ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; ++ char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp); ++ size_t len = NVP_NELEM(nvp) * sizeof (uint64_t); ++ nvlist_t *packed = (nvlist_t *)((uintptr_t)value + len); ++ int i; ++ /* ++ * Null out pointers that are meaningless in the packed ++ * structure. The addresses may not be aligned, so we have ++ * to use bzero. ++ */ ++ bzero(value, len); ++ ++ for (i = 0; i < NVP_NELEM(nvp); i++, packed++) ++ /* ++ * Null out the pointer that is meaningless in the ++ * packed structure. The address may not be aligned, ++ * so we have to use bzero. ++ */ ++ bzero(&packed->nvl_priv, sizeof (packed->nvl_priv)); ++ } ++ ++ return (nvs_embedded_nvl_array(nvs, nvp, NULL)); ++} ++ ++static void ++nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp) ++{ ++ switch (nvs->nvs_op) { ++ case NVS_OP_ENCODE: { ++ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; ++ uint64_t *strp = (void *) ++ (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); ++ /* ++ * Null out pointers that are meaningless in the packed ++ * structure. The addresses may not be aligned, so we have ++ * to use bzero. ++ */ ++ bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t)); ++ break; ++ } ++ case NVS_OP_DECODE: { ++ char **strp = (void *)NVP_VALUE(nvp); ++ char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t)); ++ int i; ++ ++ for (i = 0; i < NVP_NELEM(nvp); i++) { ++ strp[i] = buf; ++ buf += strlen(buf) + 1; ++ } ++ break; ++ } ++ } ++} ++ ++static int ++nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp) ++{ ++ data_type_t type; ++ int value_sz; ++ int ret = 0; ++ ++ /* ++ * We do the initial bcopy of the data before we look at ++ * the nvpair type, because when we're decoding, we won't ++ * have the correct values for the pair until we do the bcopy. ++ */ ++ switch (nvs->nvs_op) { ++ case NVS_OP_ENCODE: ++ case NVS_OP_DECODE: ++ if (native_cp(nvs, nvp, nvp->nvp_size) != 0) ++ return (EFAULT); ++ break; ++ default: ++ return (EINVAL); ++ } ++ ++ /* verify nvp_name_sz, check the name string length */ ++ if (i_validate_nvpair_name(nvp) != 0) ++ return (EFAULT); ++ ++ type = NVP_TYPE(nvp); ++ ++ /* ++ * Verify type and nelem and get the value size. ++ * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY ++ * is the size of the string(s) excluded. 
++ */ ++ if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0) ++ return (EFAULT); ++ ++ if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size) ++ return (EFAULT); ++ ++ switch (type) { ++ case DATA_TYPE_NVLIST: ++ ret = nvpair_native_embedded(nvs, nvp); ++ break; ++ case DATA_TYPE_NVLIST_ARRAY: ++ ret = nvpair_native_embedded_array(nvs, nvp); ++ break; ++ case DATA_TYPE_STRING_ARRAY: ++ nvpair_native_string_array(nvs, nvp); ++ break; ++ default: ++ break; ++ } ++ ++ return (ret); ++} ++ ++static int ++nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) ++{ ++ uint64_t nvp_sz = nvp->nvp_size; ++ ++ switch (NVP_TYPE(nvp)) { ++ case DATA_TYPE_NVLIST: { ++ size_t nvsize = 0; ++ ++ if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0) ++ return (EINVAL); ++ ++ nvp_sz += nvsize; ++ break; ++ } ++ case DATA_TYPE_NVLIST_ARRAY: { ++ size_t nvsize; ++ ++ if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0) ++ return (EINVAL); ++ ++ nvp_sz += nvsize; ++ break; ++ } ++ default: ++ break; ++ } ++ ++ if (nvp_sz > INT32_MAX) ++ return (EINVAL); ++ ++ *size = nvp_sz; ++ ++ return (0); ++} ++ ++static int ++nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) ++{ ++ switch (nvs->nvs_op) { ++ case NVS_OP_ENCODE: ++ return (nvs_native_nvp_op(nvs, nvp)); ++ ++ case NVS_OP_DECODE: { ++ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; ++ int32_t decode_len; ++ ++ /* try to read the size value from the stream */ ++ if (native->n_curr + sizeof (int32_t) > native->n_end) ++ return (EFAULT); ++ bcopy(native->n_curr, &decode_len, sizeof (int32_t)); ++ ++ /* sanity check the size value */ ++ if (decode_len < 0 || ++ decode_len > native->n_end - native->n_curr) ++ return (EFAULT); ++ ++ *size = decode_len; ++ ++ /* ++ * If at the end of the stream then move the cursor ++ * forward, otherwise nvpair_native_op() will read ++ * the entire nvpair at the same cursor position. 
++ */ ++ if (*size == 0) ++ native->n_curr += sizeof (int32_t); ++ break; ++ } ++ ++ default: ++ return (EINVAL); ++ } ++ ++ return (0); ++} ++ ++static const nvs_ops_t nvs_native_ops = { ++ nvs_native_nvlist, ++ nvs_native_nvpair, ++ nvs_native_nvp_op, ++ nvs_native_nvp_size, ++ nvs_native_nvl_fini ++}; ++ ++static int ++nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) ++{ ++ nvs_native_t native; ++ int err; ++ ++ nvs->nvs_ops = &nvs_native_ops; ++ ++ if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t), ++ *buflen - sizeof (nvs_header_t))) != 0) ++ return (err); ++ ++ err = nvs_operation(nvs, nvl, buflen); ++ ++ nvs_native_destroy(nvs); ++ ++ return (err); ++} ++ ++/* ++ * XDR encoding functions ++ * ++ * An xdr packed nvlist is encoded as: ++ * ++ * - encoding methode and host endian (4 bytes) ++ * - nvl_version (4 bytes) ++ * - nvl_nvflag (4 bytes) ++ * ++ * - encoded nvpairs, the format of one xdr encoded nvpair is: ++ * - encoded size of the nvpair (4 bytes) ++ * - decoded size of the nvpair (4 bytes) ++ * - name string, (4 + sizeof(NV_ALIGN4(string)) ++ * a string is coded as size (4 bytes) and data ++ * - data type (4 bytes) ++ * - number of elements in the nvpair (4 bytes) ++ * - data ++ * ++ * - 2 zero's for end of the entire list (8 bytes) ++ */ ++static int ++nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen) ++{ ++ /* xdr data must be 4 byte aligned */ ++ if ((ulong_t)buf % 4 != 0) ++ return (EFAULT); ++ ++ switch (nvs->nvs_op) { ++ case NVS_OP_ENCODE: ++ xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE); ++ nvs->nvs_private = xdr; ++ return (0); ++ case NVS_OP_DECODE: ++ xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE); ++ nvs->nvs_private = xdr; ++ return (0); ++ case NVS_OP_GETSIZE: ++ nvs->nvs_private = NULL; ++ return (0); ++ default: ++ return (EINVAL); ++ } ++} ++ ++static void ++nvs_xdr_destroy(nvstream_t *nvs) ++{ ++ switch (nvs->nvs_op) { ++ case NVS_OP_ENCODE: ++ case NVS_OP_DECODE: ++ xdr_destroy((XDR *)nvs->nvs_private); ++ break; ++ default: ++ break; ++ } ++} ++ ++static int ++nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) ++{ ++ switch (nvs->nvs_op) { ++ case NVS_OP_ENCODE: ++ case NVS_OP_DECODE: { ++ XDR *xdr = nvs->nvs_private; ++ ++ if (!xdr_int(xdr, &nvl->nvl_version) || ++ !xdr_u_int(xdr, &nvl->nvl_nvflag)) ++ return (EFAULT); ++ break; ++ } ++ case NVS_OP_GETSIZE: { ++ /* ++ * 2 * 4 for nvl_version + nvl_nvflag ++ * and 8 for end of the entire list ++ */ ++ *size += 2 * 4 + 8; ++ break; ++ } ++ default: ++ return (EINVAL); ++ } ++ return (0); ++} ++ ++static int ++nvs_xdr_nvl_fini(nvstream_t *nvs) ++{ ++ if (nvs->nvs_op == NVS_OP_ENCODE) { ++ XDR *xdr = nvs->nvs_private; ++ int zero = 0; ++ ++ if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero)) ++ return (EFAULT); ++ } ++ ++ return (0); ++} ++ ++/* ++ * The format of xdr encoded nvpair is: ++ * encode_size, decode_size, name string, data type, nelem, data ++ */ ++static int ++nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) ++{ ++ data_type_t type; ++ char *buf; ++ char *buf_end = (char *)nvp + nvp->nvp_size; ++ int value_sz; ++ uint_t nelem, buflen; ++ bool_t ret = FALSE; ++ XDR *xdr = nvs->nvs_private; ++ ++ ASSERT(xdr != NULL && nvp != NULL); ++ ++ /* name string */ ++ if ((buf = NVP_NAME(nvp)) >= buf_end) ++ return (EFAULT); ++ buflen = buf_end - buf; ++ ++ if (!xdr_string(xdr, &buf, buflen - 1)) ++ return (EFAULT); ++ nvp->nvp_name_sz = strlen(buf) + 1; ++ ++ /* type and nelem */ ++ if (!xdr_int(xdr, (int *)&nvp->nvp_type) || ++ 
!xdr_int(xdr, &nvp->nvp_value_elem)) ++ return (EFAULT); ++ ++ type = NVP_TYPE(nvp); ++ nelem = nvp->nvp_value_elem; ++ ++ /* ++ * Verify type and nelem and get the value size. ++ * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY ++ * is the size of the string(s) excluded. ++ */ ++ if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0) ++ return (EFAULT); ++ ++ /* if there is no data to extract then return */ ++ if (nelem == 0) ++ return (0); ++ ++ /* value */ ++ if ((buf = NVP_VALUE(nvp)) >= buf_end) ++ return (EFAULT); ++ buflen = buf_end - buf; ++ ++ if (buflen < value_sz) ++ return (EFAULT); ++ ++ switch (type) { ++ case DATA_TYPE_NVLIST: ++ if (nvs_embedded(nvs, (void *)buf) == 0) ++ return (0); ++ break; ++ ++ case DATA_TYPE_NVLIST_ARRAY: ++ if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0) ++ return (0); ++ break; ++ ++ case DATA_TYPE_BOOLEAN: ++ ret = TRUE; ++ break; ++ ++ case DATA_TYPE_BYTE: ++ case DATA_TYPE_INT8: ++ case DATA_TYPE_UINT8: ++ ret = xdr_char(xdr, buf); ++ break; ++ ++ case DATA_TYPE_INT16: ++ ret = xdr_short(xdr, (void *)buf); ++ break; ++ ++ case DATA_TYPE_UINT16: ++ ret = xdr_u_short(xdr, (void *)buf); ++ break; ++ ++ case DATA_TYPE_BOOLEAN_VALUE: ++ case DATA_TYPE_INT32: ++ ret = xdr_int(xdr, (void *)buf); ++ break; ++ ++ case DATA_TYPE_UINT32: ++ ret = xdr_u_int(xdr, (void *)buf); ++ break; ++ ++ case DATA_TYPE_INT64: ++ ret = xdr_longlong_t(xdr, (void *)buf); ++ break; ++ ++ case DATA_TYPE_UINT64: ++ ret = xdr_u_longlong_t(xdr, (void *)buf); ++ break; ++ ++ case DATA_TYPE_HRTIME: ++ /* ++ * NOTE: must expose the definition of hrtime_t here ++ */ ++ ret = xdr_longlong_t(xdr, (void *)buf); ++ break; ++#if !defined(_KERNEL) ++ case DATA_TYPE_DOUBLE: ++ ret = xdr_double(xdr, (void *)buf); ++ break; ++#endif ++ case DATA_TYPE_STRING: ++ ret = xdr_string(xdr, &buf, buflen - 1); ++ break; ++ ++ case DATA_TYPE_BYTE_ARRAY: ++ ret = xdr_opaque(xdr, buf, nelem); ++ break; ++ ++ case DATA_TYPE_INT8_ARRAY: ++ case DATA_TYPE_UINT8_ARRAY: ++ ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t), ++ (xdrproc_t)xdr_char); ++ break; ++ ++ case DATA_TYPE_INT16_ARRAY: ++ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t), ++ sizeof (int16_t), (xdrproc_t)xdr_short); ++ break; ++ ++ case DATA_TYPE_UINT16_ARRAY: ++ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t), ++ sizeof (uint16_t), (xdrproc_t)xdr_u_short); ++ break; ++ ++ case DATA_TYPE_BOOLEAN_ARRAY: ++ case DATA_TYPE_INT32_ARRAY: ++ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t), ++ sizeof (int32_t), (xdrproc_t)xdr_int); ++ break; ++ ++ case DATA_TYPE_UINT32_ARRAY: ++ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t), ++ sizeof (uint32_t), (xdrproc_t)xdr_u_int); ++ break; ++ ++ case DATA_TYPE_INT64_ARRAY: ++ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t), ++ sizeof (int64_t), (xdrproc_t)xdr_longlong_t); ++ break; ++ ++ case DATA_TYPE_UINT64_ARRAY: ++ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t), ++ sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t); ++ break; ++ ++ case DATA_TYPE_STRING_ARRAY: { ++ size_t len = nelem * sizeof (uint64_t); ++ char **strp = (void *)buf; ++ int i; ++ ++ if (nvs->nvs_op == NVS_OP_DECODE) ++ bzero(buf, len); /* don't trust packed data */ ++ ++ for (i = 0; i < nelem; i++) { ++ if (buflen <= len) ++ return (EFAULT); ++ ++ buf += len; ++ buflen -= len; ++ ++ if (xdr_string(xdr, &buf, buflen - 1) != TRUE) ++ return (EFAULT); ++ ++ if (nvs->nvs_op == NVS_OP_DECODE) ++ strp[i] = buf; ++ len = strlen(buf) 
+ 1; ++ } ++ ret = TRUE; ++ break; ++ } ++ default: ++ break; ++ } ++ ++ return (ret == TRUE ? 0 : EFAULT); ++} ++ ++static int ++nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) ++{ ++ data_type_t type = NVP_TYPE(nvp); ++ /* ++ * encode_size + decode_size + name string size + data type + nelem ++ * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) ++ */ ++ uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4; ++ ++ switch (type) { ++ case DATA_TYPE_BOOLEAN: ++ break; ++ ++ case DATA_TYPE_BOOLEAN_VALUE: ++ case DATA_TYPE_BYTE: ++ case DATA_TYPE_INT8: ++ case DATA_TYPE_UINT8: ++ case DATA_TYPE_INT16: ++ case DATA_TYPE_UINT16: ++ case DATA_TYPE_INT32: ++ case DATA_TYPE_UINT32: ++ nvp_sz += 4; /* 4 is the minimum xdr unit */ ++ break; ++ ++ case DATA_TYPE_INT64: ++ case DATA_TYPE_UINT64: ++ case DATA_TYPE_HRTIME: ++#if !defined(_KERNEL) ++ case DATA_TYPE_DOUBLE: ++#endif ++ nvp_sz += 8; ++ break; ++ ++ case DATA_TYPE_STRING: ++ nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp))); ++ break; ++ ++ case DATA_TYPE_BYTE_ARRAY: ++ nvp_sz += NV_ALIGN4(NVP_NELEM(nvp)); ++ break; ++ ++ case DATA_TYPE_BOOLEAN_ARRAY: ++ case DATA_TYPE_INT8_ARRAY: ++ case DATA_TYPE_UINT8_ARRAY: ++ case DATA_TYPE_INT16_ARRAY: ++ case DATA_TYPE_UINT16_ARRAY: ++ case DATA_TYPE_INT32_ARRAY: ++ case DATA_TYPE_UINT32_ARRAY: ++ nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp); ++ break; ++ ++ case DATA_TYPE_INT64_ARRAY: ++ case DATA_TYPE_UINT64_ARRAY: ++ nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp); ++ break; ++ ++ case DATA_TYPE_STRING_ARRAY: { ++ int i; ++ char **strs = (void *)NVP_VALUE(nvp); ++ ++ for (i = 0; i < NVP_NELEM(nvp); i++) ++ nvp_sz += 4 + NV_ALIGN4(strlen(strs[i])); ++ ++ break; ++ } ++ ++ case DATA_TYPE_NVLIST: ++ case DATA_TYPE_NVLIST_ARRAY: { ++ size_t nvsize = 0; ++ int old_nvs_op = nvs->nvs_op; ++ int err; ++ ++ nvs->nvs_op = NVS_OP_GETSIZE; ++ if (type == DATA_TYPE_NVLIST) ++ err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize); ++ else ++ err = nvs_embedded_nvl_array(nvs, nvp, &nvsize); ++ nvs->nvs_op = old_nvs_op; ++ ++ if (err != 0) ++ return (EINVAL); ++ ++ nvp_sz += nvsize; ++ break; ++ } ++ ++ default: ++ return (EINVAL); ++ } ++ ++ if (nvp_sz > INT32_MAX) ++ return (EINVAL); ++ ++ *size = nvp_sz; ++ ++ return (0); ++} ++ ++ ++/* ++ * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates ++ * the largest nvpair that could be encoded in the buffer. ++ * ++ * See comments above nvpair_xdr_op() for the format of xdr encoding. ++ * The size of a xdr packed nvpair without any data is 5 words. ++ * ++ * Using the size of the data directly as an estimate would be ok ++ * in all cases except one. If the data type is of DATA_TYPE_STRING_ARRAY ++ * then the actual nvpair has space for an array of pointers to index ++ * the strings. These pointers are not encoded into the packed xdr buffer. ++ * ++ * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are ++ * of length 0, then each string is endcoded in xdr format as a single word. ++ * Therefore when expanded to an nvpair there will be 2.25 word used for ++ * each string. (a int64_t allocated for pointer usage, and a single char ++ * for the null termination.) ++ * ++ * This is the calculation performed by the NVS_XDR_MAX_LEN macro. ++ */ ++#define NVS_XDR_HDR_LEN ((size_t)(5 * 4)) ++#define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ? 
\ ++ 0 : ((size_t)(y) - NVS_XDR_HDR_LEN)) ++#define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \ ++ (NVS_XDR_DATA_LEN(x) * 2) + \ ++ NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4))) ++ ++static int ++nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) ++{ ++ XDR *xdr = nvs->nvs_private; ++ int32_t encode_len, decode_len; ++ ++ switch (nvs->nvs_op) { ++ case NVS_OP_ENCODE: { ++ size_t nvsize; ++ ++ if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0) ++ return (EFAULT); ++ ++ decode_len = nvp->nvp_size; ++ encode_len = nvsize; ++ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) ++ return (EFAULT); ++ ++ return (nvs_xdr_nvp_op(nvs, nvp)); ++ } ++ case NVS_OP_DECODE: { ++ struct xdr_bytesrec bytesrec; ++ ++ /* get the encode and decode size */ ++ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) ++ return (EFAULT); ++ *size = decode_len; ++ ++ /* are we at the end of the stream? */ ++ if (*size == 0) ++ return (0); ++ ++ /* sanity check the size parameter */ ++ if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec)) ++ return (EFAULT); ++ ++ if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail)) ++ return (EFAULT); ++ break; ++ } ++ ++ default: ++ return (EINVAL); ++ } ++ return (0); ++} ++ ++static const struct nvs_ops nvs_xdr_ops = { ++ nvs_xdr_nvlist, ++ nvs_xdr_nvpair, ++ nvs_xdr_nvp_op, ++ nvs_xdr_nvp_size, ++ nvs_xdr_nvl_fini ++}; ++ ++static int ++nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) ++{ ++ XDR xdr; ++ int err; ++ ++ nvs->nvs_ops = &nvs_xdr_ops; ++ ++ if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t), ++ *buflen - sizeof (nvs_header_t))) != 0) ++ return (err); ++ ++ err = nvs_operation(nvs, nvl, buflen); ++ ++ nvs_xdr_destroy(nvs); ++ ++ return (err); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++ ++static int nvpair_init(void) { return 0; } ++static int nvpair_fini(void) { return 0; } ++ ++spl_module_init(nvpair_init); ++spl_module_exit(nvpair_fini); ++ ++MODULE_DESCRIPTION("Generic name/value pair implementation"); ++MODULE_AUTHOR(ZFS_META_AUTHOR); ++MODULE_LICENSE(ZFS_META_LICENSE); ++ ++EXPORT_SYMBOL(nv_alloc_init); ++EXPORT_SYMBOL(nv_alloc_reset); ++EXPORT_SYMBOL(nv_alloc_fini); ++ ++/* list management */ ++EXPORT_SYMBOL(nvlist_alloc); ++EXPORT_SYMBOL(nvlist_free); ++EXPORT_SYMBOL(nvlist_size); ++EXPORT_SYMBOL(nvlist_pack); ++EXPORT_SYMBOL(nvlist_unpack); ++EXPORT_SYMBOL(nvlist_dup); ++EXPORT_SYMBOL(nvlist_merge); ++ ++EXPORT_SYMBOL(nvlist_xalloc); ++EXPORT_SYMBOL(nvlist_xpack); ++EXPORT_SYMBOL(nvlist_xunpack); ++EXPORT_SYMBOL(nvlist_xdup); ++EXPORT_SYMBOL(nvlist_lookup_nv_alloc); ++ ++EXPORT_SYMBOL(nvlist_add_nvpair); ++EXPORT_SYMBOL(nvlist_add_boolean); ++EXPORT_SYMBOL(nvlist_add_boolean_value); ++EXPORT_SYMBOL(nvlist_add_byte); ++EXPORT_SYMBOL(nvlist_add_int8); ++EXPORT_SYMBOL(nvlist_add_uint8); ++EXPORT_SYMBOL(nvlist_add_int16); ++EXPORT_SYMBOL(nvlist_add_uint16); ++EXPORT_SYMBOL(nvlist_add_int32); ++EXPORT_SYMBOL(nvlist_add_uint32); ++EXPORT_SYMBOL(nvlist_add_int64); ++EXPORT_SYMBOL(nvlist_add_uint64); ++EXPORT_SYMBOL(nvlist_add_string); ++EXPORT_SYMBOL(nvlist_add_nvlist); ++EXPORT_SYMBOL(nvlist_add_boolean_array); ++EXPORT_SYMBOL(nvlist_add_byte_array); ++EXPORT_SYMBOL(nvlist_add_int8_array); ++EXPORT_SYMBOL(nvlist_add_uint8_array); ++EXPORT_SYMBOL(nvlist_add_int16_array); ++EXPORT_SYMBOL(nvlist_add_uint16_array); ++EXPORT_SYMBOL(nvlist_add_int32_array); ++EXPORT_SYMBOL(nvlist_add_uint32_array); ++EXPORT_SYMBOL(nvlist_add_int64_array); ++EXPORT_SYMBOL(nvlist_add_uint64_array); 
++EXPORT_SYMBOL(nvlist_add_string_array); ++EXPORT_SYMBOL(nvlist_add_nvlist_array); ++EXPORT_SYMBOL(nvlist_next_nvpair); ++EXPORT_SYMBOL(nvlist_prev_nvpair); ++EXPORT_SYMBOL(nvlist_empty); ++EXPORT_SYMBOL(nvlist_add_hrtime); ++ ++EXPORT_SYMBOL(nvlist_remove); ++EXPORT_SYMBOL(nvlist_remove_nvpair); ++EXPORT_SYMBOL(nvlist_remove_all); ++ ++EXPORT_SYMBOL(nvlist_lookup_boolean); ++EXPORT_SYMBOL(nvlist_lookup_boolean_value); ++EXPORT_SYMBOL(nvlist_lookup_byte); ++EXPORT_SYMBOL(nvlist_lookup_int8); ++EXPORT_SYMBOL(nvlist_lookup_uint8); ++EXPORT_SYMBOL(nvlist_lookup_int16); ++EXPORT_SYMBOL(nvlist_lookup_uint16); ++EXPORT_SYMBOL(nvlist_lookup_int32); ++EXPORT_SYMBOL(nvlist_lookup_uint32); ++EXPORT_SYMBOL(nvlist_lookup_int64); ++EXPORT_SYMBOL(nvlist_lookup_uint64); ++EXPORT_SYMBOL(nvlist_lookup_string); ++EXPORT_SYMBOL(nvlist_lookup_nvlist); ++EXPORT_SYMBOL(nvlist_lookup_boolean_array); ++EXPORT_SYMBOL(nvlist_lookup_byte_array); ++EXPORT_SYMBOL(nvlist_lookup_int8_array); ++EXPORT_SYMBOL(nvlist_lookup_uint8_array); ++EXPORT_SYMBOL(nvlist_lookup_int16_array); ++EXPORT_SYMBOL(nvlist_lookup_uint16_array); ++EXPORT_SYMBOL(nvlist_lookup_int32_array); ++EXPORT_SYMBOL(nvlist_lookup_uint32_array); ++EXPORT_SYMBOL(nvlist_lookup_int64_array); ++EXPORT_SYMBOL(nvlist_lookup_uint64_array); ++EXPORT_SYMBOL(nvlist_lookup_string_array); ++EXPORT_SYMBOL(nvlist_lookup_nvlist_array); ++EXPORT_SYMBOL(nvlist_lookup_hrtime); ++EXPORT_SYMBOL(nvlist_lookup_pairs); ++ ++EXPORT_SYMBOL(nvlist_lookup_nvpair); ++EXPORT_SYMBOL(nvlist_exists); ++ ++/* processing nvpair */ ++EXPORT_SYMBOL(nvpair_name); ++EXPORT_SYMBOL(nvpair_type); ++EXPORT_SYMBOL(nvpair_value_boolean_value); ++EXPORT_SYMBOL(nvpair_value_byte); ++EXPORT_SYMBOL(nvpair_value_int8); ++EXPORT_SYMBOL(nvpair_value_uint8); ++EXPORT_SYMBOL(nvpair_value_int16); ++EXPORT_SYMBOL(nvpair_value_uint16); ++EXPORT_SYMBOL(nvpair_value_int32); ++EXPORT_SYMBOL(nvpair_value_uint32); ++EXPORT_SYMBOL(nvpair_value_int64); ++EXPORT_SYMBOL(nvpair_value_uint64); ++EXPORT_SYMBOL(nvpair_value_string); ++EXPORT_SYMBOL(nvpair_value_nvlist); ++EXPORT_SYMBOL(nvpair_value_boolean_array); ++EXPORT_SYMBOL(nvpair_value_byte_array); ++EXPORT_SYMBOL(nvpair_value_int8_array); ++EXPORT_SYMBOL(nvpair_value_uint8_array); ++EXPORT_SYMBOL(nvpair_value_int16_array); ++EXPORT_SYMBOL(nvpair_value_uint16_array); ++EXPORT_SYMBOL(nvpair_value_int32_array); ++EXPORT_SYMBOL(nvpair_value_uint32_array); ++EXPORT_SYMBOL(nvpair_value_int64_array); ++EXPORT_SYMBOL(nvpair_value_uint64_array); ++EXPORT_SYMBOL(nvpair_value_string_array); ++EXPORT_SYMBOL(nvpair_value_nvlist_array); ++EXPORT_SYMBOL(nvpair_value_hrtime); ++ ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/unicode/Makefile linux-3.2.33-go/fs/zfs/unicode/Makefile +--- linux-3.2.33-go.orig/fs/zfs/unicode/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/unicode/Makefile 2012-11-16 23:25:34.362039185 +0100 +@@ -0,0 +1,8 @@ ++MODULE := zunicode ++ ++EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) -Wno-unused-but-set-variable -DHAVE_SPL -D_KERNEL -DTEXT_DOMAIN=\"zfs-linux-kernel\" -DNDEBUG ++ ++obj-$(CONFIG_ZFS) := $(MODULE).o ++ ++$(MODULE)-objs += u8_textprep.o ++$(MODULE)-objs += uconv.o +diff -uNr linux-3.2.33-go.orig/fs/zfs/unicode/Makefile.in linux-3.2.33-go/fs/zfs/unicode/Makefile.in +--- linux-3.2.33-go.orig/fs/zfs/unicode/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/unicode/Makefile.in 2012-11-16 23:25:34.355039267 +0100 +@@ -0,0 +1,8 @@ ++MODULE := zunicode ++ ++EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ 
++ ++obj-$(CONFIG_ZFS) := $(MODULE).o ++ ++$(MODULE)-objs += @top_srcdir@/module/unicode/u8_textprep.o ++$(MODULE)-objs += @top_srcdir@/module/unicode/uconv.o +diff -uNr linux-3.2.33-go.orig/fs/zfs/unicode/u8_textprep.c linux-3.2.33-go/fs/zfs/unicode/u8_textprep.c +--- linux-3.2.33-go.orig/fs/zfs/unicode/u8_textprep.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/unicode/u8_textprep.c 2012-11-16 23:25:34.355039267 +0100 +@@ -0,0 +1,2150 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++ ++ ++ ++/* ++ * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458). ++ * ++ * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F), ++ * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also ++ * the section 3C man pages. ++ * Interface stability: Committed. ++ */ ++ ++#include ++#ifdef _KERNEL ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#else ++#include ++#include ++#endif /* _KERNEL */ ++#include ++#include ++#include ++ ++ ++/* The maximum possible number of bytes in a UTF-8 character. */ ++#define U8_MB_CUR_MAX (4) ++ ++/* ++ * The maximum number of bytes needed for a UTF-8 character to cover ++ * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2. ++ */ ++#define U8_MAX_BYTES_UCS2 (3) ++ ++/* The maximum possible number of bytes in a Stream-Safe Text. */ ++#define U8_STREAM_SAFE_TEXT_MAX (128) ++ ++/* ++ * The maximum number of characters in a combining/conjoining sequence and ++ * the actual upperbound limit of a combining/conjoining sequence. ++ */ ++#define U8_MAX_CHARS_A_SEQ (32) ++#define U8_UPPER_LIMIT_IN_A_SEQ (31) ++ ++/* The combining class value for Starter. */ ++#define U8_COMBINING_CLASS_STARTER (0) ++ ++/* ++ * Some Hangul related macros at below. ++ * ++ * The first and the last of Hangul syllables, Hangul Jamo Leading consonants, ++ * Vowels, and optional Trailing consonants in Unicode scalar values. ++ * ++ * Please be noted that the U8_HANGUL_JAMO_T_FIRST is 0x11A7 at below not ++ * the actual U+11A8. This is due to that the trailing consonant is optional ++ * and thus we are doing a pre-calculation of subtracting one. ++ * ++ * Each of 19 modern leading consonants has total 588 possible syllables since ++ * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for ++ * no trailing consonant case, i.e., 21 x 28 = 588. ++ * ++ * We also have bunch of Hangul related macros at below. 
Please bear in mind ++ * that the U8_HANGUL_JAMO_1ST_BYTE can be used to check whether it is ++ * a Hangul Jamo or not but the value does not guarantee that it is a Hangul ++ * Jamo; it just guarantee that it will be most likely. ++ */ ++#define U8_HANGUL_SYL_FIRST (0xAC00U) ++#define U8_HANGUL_SYL_LAST (0xD7A3U) ++ ++#define U8_HANGUL_JAMO_L_FIRST (0x1100U) ++#define U8_HANGUL_JAMO_L_LAST (0x1112U) ++#define U8_HANGUL_JAMO_V_FIRST (0x1161U) ++#define U8_HANGUL_JAMO_V_LAST (0x1175U) ++#define U8_HANGUL_JAMO_T_FIRST (0x11A7U) ++#define U8_HANGUL_JAMO_T_LAST (0x11C2U) ++ ++#define U8_HANGUL_V_COUNT (21) ++#define U8_HANGUL_VT_COUNT (588) ++#define U8_HANGUL_T_COUNT (28) ++ ++#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U) ++ ++#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \ ++ (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \ ++ (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \ ++ (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); ++ ++#define U8_HANGUL_JAMO_L(u) \ ++ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST) ++ ++#define U8_HANGUL_JAMO_V(u) \ ++ ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST) ++ ++#define U8_HANGUL_JAMO_T(u) \ ++ ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) ++ ++#define U8_HANGUL_JAMO(u) \ ++ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) ++ ++#define U8_HANGUL_SYLLABLE(u) \ ++ ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST) ++ ++#define U8_HANGUL_COMPOSABLE_L_V(s, u) \ ++ ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u))) ++ ++#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \ ++ ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u))) ++ ++/* The types of decomposition mappings. */ ++#define U8_DECOMP_BOTH (0xF5U) ++#define U8_DECOMP_CANONICAL (0xF6U) ++ ++/* The indicator for 16-bit table. */ ++#define U8_16BIT_TABLE_INDICATOR (0x8000U) ++ ++/* The following are some convenience macros. */ ++#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ ++ (u) = ((((uint32_t)(b1) & 0x0F) << 12) | \ ++ (((uint32_t)(b2) & 0x3F) << 6) | \ ++ ((uint32_t)(b3) & 0x3F)); ++ ++#define U8_SIMPLE_SWAP(a, b, t) \ ++ (t) = (a); \ ++ (a) = (b); \ ++ (b) = (t); ++ ++#define U8_ASCII_TOUPPER(c) \ ++ (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c)) ++ ++#define U8_ASCII_TOLOWER(c) \ ++ (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c)) ++ ++#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U) ++/* ++ * The following macro assumes that the two characters that are to be ++ * swapped are adjacent to each other and 'a' comes before 'b'. ++ * ++ * If the assumptions are not met, then, the macro will fail. ++ */ ++#define U8_SWAP_COMB_MARKS(a, b) \ ++ for (k = 0; k < disp[(a)]; k++) \ ++ u8t[k] = u8s[start[(a)] + k]; \ ++ for (k = 0; k < disp[(b)]; k++) \ ++ u8s[start[(a)] + k] = u8s[start[(b)] + k]; \ ++ start[(b)] = start[(a)] + disp[(b)]; \ ++ for (k = 0; k < disp[(a)]; k++) \ ++ u8s[start[(b)] + k] = u8t[k]; \ ++ U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \ ++ U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); ++ ++/* The possible states during normalization. */ ++typedef enum { ++ U8_STATE_START = 0, ++ U8_STATE_HANGUL_L = 1, ++ U8_STATE_HANGUL_LV = 2, ++ U8_STATE_HANGUL_LVT = 3, ++ U8_STATE_HANGUL_V = 4, ++ U8_STATE_HANGUL_T = 5, ++ U8_STATE_COMBINING_MARK = 6 ++} u8_normalization_states_t; ++ ++/* ++ * The three vectors at below are used to check bytes of a given UTF-8 ++ * character are valid and not containing any malformed byte values. 
++ * ++ * We used to have a quite relaxed UTF-8 binary representation but then there ++ * was some security related issues and so the Unicode Consortium defined ++ * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it ++ * one more time at the Unicode 3.2. The following three tables are based on ++ * that. ++ */ ++ ++#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF) ++ ++#define I_ U8_ILLEGAL_CHAR ++#define O_ U8_OUT_OF_RANGE_CHAR ++ ++const int8_t u8_number_of_bytes[0x100] = { ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ ++/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */ ++ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, ++ ++/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ ++ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, ++ ++/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ ++ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, ++ ++/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */ ++ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, ++ ++/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ ++ I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ++ ++/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ ++ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ++ ++/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ ++ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ++ ++/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ ++ 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, ++}; ++ ++#undef I_ ++#undef O_ ++ ++const uint8_t u8_valid_min_2nd_byte[0x100] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++/* C0 C1 C2 C3 C4 C5 C6 C7 */ ++ 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++/* C8 C9 CA CB CC CD CE CF */ ++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++/* D0 D1 D2 D3 D4 D5 D6 D7 */ ++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++/* D8 D9 DA DB DC DD DE DF */ ++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++/* E0 E1 E2 E3 E4 E5 E6 E7 */ ++ 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++/* E8 E9 EA EB EC ED EE EF */ ++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++/* F0 F1 F2 F3 F4 F5 F6 F7 */ ++ 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++}; ++ ++const uint8_t u8_valid_max_2nd_byte[0x100] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 
0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++/* C0 C1 C2 C3 C4 C5 C6 C7 */ ++ 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, ++/* C8 C9 CA CB CC CD CE CF */ ++ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, ++/* D0 D1 D2 D3 D4 D5 D6 D7 */ ++ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, ++/* D8 D9 DA DB DC DD DE DF */ ++ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, ++/* E0 E1 E2 E3 E4 E5 E6 E7 */ ++ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, ++/* E8 E9 EA EB EC ED EE EF */ ++ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, ++/* F0 F1 F2 F3 F4 F5 F6 F7 */ ++ 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++}; ++ ++ ++/* ++ * The u8_validate() validates on the given UTF-8 character string and ++ * calculate the byte length. It is quite similar to mblen(3C) except that ++ * this will validate against the list of characters if required and ++ * specific to UTF-8 and Unicode. ++ */ ++int ++u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum) ++{ ++ uchar_t *ib; ++ uchar_t *ibtail; ++ uchar_t **p; ++ uchar_t *s1; ++ uchar_t *s2; ++ uchar_t f; ++ int sz; ++ size_t i; ++ int ret_val; ++ boolean_t second; ++ boolean_t no_need_to_validate_entire; ++ boolean_t check_additional; ++ boolean_t validate_ucs2_range_only; ++ ++ if (! u8str) ++ return (0); ++ ++ ib = (uchar_t *)u8str; ++ ibtail = ib + n; ++ ++ ret_val = 0; ++ ++ no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); ++ check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; ++ validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; ++ ++ while (ib < ibtail) { ++ /* ++ * The first byte of a UTF-8 character tells how many ++ * bytes will follow for the character. If the first byte ++ * is an illegal byte value or out of range value, we just ++ * return -1 with an appropriate error number. ++ */ ++ sz = u8_number_of_bytes[*ib]; ++ if (sz == U8_ILLEGAL_CHAR) { ++ *errnum = EILSEQ; ++ return (-1); ++ } ++ ++ if (sz == U8_OUT_OF_RANGE_CHAR || ++ (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { ++ *errnum = ERANGE; ++ return (-1); ++ } ++ ++ /* ++ * If we don't have enough bytes to check on, that's also ++ * an error. As you can see, we give illegal byte sequence ++ * checking higher priority then EINVAL cases. ++ */ ++ if ((ibtail - ib) < sz) { ++ *errnum = EINVAL; ++ return (-1); ++ } ++ ++ if (sz == 1) { ++ ib++; ++ ret_val++; ++ } else { ++ /* ++ * Check on the multi-byte UTF-8 character. For more ++ * details on this, see comment added for the used ++ * data structures at the beginning of the file. 
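++ *
++ * For example (using values from the validation tables above), the
++ * overlong sequence 0xE0 0x80 0x80 fails the check below because
++ * u8_valid_min_2nd_byte[0xE0] is 0xA0, the UTF-16 surrogate encoding
++ * 0xED 0xA0 0x80 fails because u8_valid_max_2nd_byte[0xED] is 0x9F,
++ * and 0xF4 0x90 0x80 0x80 (beyond U+10FFFF) fails because
++ * u8_valid_max_2nd_byte[0xF4] is 0x8F; each case returns -1 with
++ * the error number set to EILSEQ.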
++ */ ++ f = *ib++; ++ ret_val++; ++ second = B_TRUE; ++ for (i = 1; i < sz; i++) { ++ if (second) { ++ if (*ib < u8_valid_min_2nd_byte[f] || ++ *ib > u8_valid_max_2nd_byte[f]) { ++ *errnum = EILSEQ; ++ return (-1); ++ } ++ second = B_FALSE; ++ } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { ++ *errnum = EILSEQ; ++ return (-1); ++ } ++ ib++; ++ ret_val++; ++ } ++ } ++ ++ if (check_additional) { ++ for (p = (uchar_t **)list, i = 0; p[i]; i++) { ++ s1 = ib - sz; ++ s2 = p[i]; ++ while (s1 < ib) { ++ if (*s1 != *s2 || *s2 == '\0') ++ break; ++ s1++; ++ s2++; ++ } ++ ++ if (s1 >= ib && *s2 == '\0') { ++ *errnum = EBADF; ++ return (-1); ++ } ++ } ++ } ++ ++ if (no_need_to_validate_entire) ++ break; ++ } ++ ++ return (ret_val); ++} ++ ++/* ++ * The do_case_conv() looks at the mapping tables and returns found ++ * bytes if any. If not found, the input bytes are returned. The function ++ * always terminate the return bytes with a null character assuming that ++ * there are plenty of room to do so. ++ * ++ * The case conversions are simple case conversions mapping a character to ++ * another character as specified in the Unicode data. The byte size of ++ * the mapped character could be different from that of the input character. ++ * ++ * The return value is the byte length of the returned character excluding ++ * the terminating null byte. ++ */ ++static size_t ++do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) ++{ ++ size_t i; ++ uint16_t b1 = 0; ++ uint16_t b2 = 0; ++ uint16_t b3 = 0; ++ uint16_t b3_tbl; ++ uint16_t b3_base; ++ uint16_t b4 = 0; ++ size_t start_id; ++ size_t end_id; ++ ++ /* ++ * At this point, the only possible values for sz are 2, 3, and 4. ++ * The u8s should point to a vector that is well beyond the size of ++ * 5 bytes. ++ */ ++ if (sz == 2) { ++ b3 = u8s[0] = s[0]; ++ b4 = u8s[1] = s[1]; ++ } else if (sz == 3) { ++ b2 = u8s[0] = s[0]; ++ b3 = u8s[1] = s[1]; ++ b4 = u8s[2] = s[2]; ++ } else if (sz == 4) { ++ b1 = u8s[0] = s[0]; ++ b2 = u8s[1] = s[1]; ++ b3 = u8s[2] = s[2]; ++ b4 = u8s[3] = s[3]; ++ } else { ++ /* This is not possible but just in case as a fallback. */ ++ if (is_it_toupper) ++ *u8s = U8_ASCII_TOUPPER(*s); ++ else ++ *u8s = U8_ASCII_TOLOWER(*s); ++ u8s[1] = '\0'; ++ ++ return (1); ++ } ++ u8s[sz] = '\0'; ++ ++ /* ++ * Let's find out if we have a corresponding character. ++ */ ++ b1 = u8_common_b1_tbl[uv][b1]; ++ if (b1 == U8_TBL_ELEMENT_NOT_DEF) ++ return ((size_t)sz); ++ ++ b2 = u8_case_common_b2_tbl[uv][b1][b2]; ++ if (b2 == U8_TBL_ELEMENT_NOT_DEF) ++ return ((size_t)sz); ++ ++ if (is_it_toupper) { ++ b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; ++ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) ++ return ((size_t)sz); ++ ++ start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; ++ end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; ++ ++ /* Either there is no match or an error at the table. 
*/ ++ if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) ++ return ((size_t)sz); ++ ++ b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; ++ ++ for (i = 0; start_id < end_id; start_id++) ++ u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; ++ } else { ++ b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; ++ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) ++ return ((size_t)sz); ++ ++ start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; ++ end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; ++ ++ if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) ++ return ((size_t)sz); ++ ++ b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; ++ ++ for (i = 0; start_id < end_id; start_id++) ++ u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; ++ } ++ ++ /* ++ * If i is still zero, that means there is no corresponding character. ++ */ ++ if (i == 0) ++ return ((size_t)sz); ++ ++ u8s[i] = '\0'; ++ ++ return (i); ++} ++ ++/* ++ * The do_case_compare() function compares the two input strings, s1 and s2, ++ * one character at a time doing case conversions if applicable and return ++ * the comparison result as like strcmp(). ++ * ++ * Since, in empirical sense, most of text data are 7-bit ASCII characters, ++ * we treat the 7-bit ASCII characters as a special case trying to yield ++ * faster processing time. ++ */ ++static int ++do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, ++ size_t n2, boolean_t is_it_toupper, int *errnum) ++{ ++ int f; ++ int sz1; ++ int sz2; ++ size_t j; ++ size_t i1; ++ size_t i2; ++ uchar_t u8s1[U8_MB_CUR_MAX + 1]; ++ uchar_t u8s2[U8_MB_CUR_MAX + 1]; ++ ++ i1 = i2 = 0; ++ while (i1 < n1 && i2 < n2) { ++ /* ++ * Find out what would be the byte length for this UTF-8 ++ * character at string s1 and also find out if this is ++ * an illegal start byte or not and if so, issue a proper ++ * error number and yet treat this byte as a character. ++ */ ++ sz1 = u8_number_of_bytes[*s1]; ++ if (sz1 < 0) { ++ *errnum = EILSEQ; ++ sz1 = 1; ++ } ++ ++ /* ++ * For 7-bit ASCII characters mainly, we do a quick case ++ * conversion right at here. ++ * ++ * If we don't have enough bytes for this character, issue ++ * an EINVAL error and use what are available. ++ * ++ * If we have enough bytes, find out if there is ++ * a corresponding uppercase character and if so, copy over ++ * the bytes for a comparison later. If there is no ++ * corresponding uppercase character, then, use what we have ++ * for the comparison. ++ */ ++ if (sz1 == 1) { ++ if (is_it_toupper) ++ u8s1[0] = U8_ASCII_TOUPPER(*s1); ++ else ++ u8s1[0] = U8_ASCII_TOLOWER(*s1); ++ s1++; ++ u8s1[1] = '\0'; ++ } else if ((i1 + sz1) > n1) { ++ *errnum = EINVAL; ++ for (j = 0; (i1 + j) < n1; ) ++ u8s1[j++] = *s1++; ++ u8s1[j] = '\0'; ++ } else { ++ (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); ++ s1 += sz1; ++ } ++ ++ /* Do the same for the string s2. */ ++ sz2 = u8_number_of_bytes[*s2]; ++ if (sz2 < 0) { ++ *errnum = EILSEQ; ++ sz2 = 1; ++ } ++ ++ if (sz2 == 1) { ++ if (is_it_toupper) ++ u8s2[0] = U8_ASCII_TOUPPER(*s2); ++ else ++ u8s2[0] = U8_ASCII_TOLOWER(*s2); ++ s2++; ++ u8s2[1] = '\0'; ++ } else if ((i2 + sz2) > n2) { ++ *errnum = EINVAL; ++ for (j = 0; (i2 + j) < n2; ) ++ u8s2[j++] = *s2++; ++ u8s2[j] = '\0'; ++ } else { ++ (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); ++ s2 += sz2; ++ } ++ ++ /* Now compare the two characters. 
*/ ++ if (sz1 == 1 && sz2 == 1) { ++ if (*u8s1 > *u8s2) ++ return (1); ++ if (*u8s1 < *u8s2) ++ return (-1); ++ } else { ++ f = strcmp((const char *)u8s1, (const char *)u8s2); ++ if (f != 0) ++ return (f); ++ } ++ ++ /* ++ * They were the same. Let's move on to the next ++ * characters then. ++ */ ++ i1 += sz1; ++ i2 += sz2; ++ } ++ ++ /* ++ * We compared until the end of either or both strings. ++ * ++ * If we reached to or went over the ends for the both, that means ++ * they are the same. ++ * ++ * If we reached only one of the two ends, that means the other string ++ * has something which then the fact can be used to determine ++ * the return value. ++ */ ++ if (i1 >= n1) { ++ if (i2 >= n2) ++ return (0); ++ return (-1); ++ } ++ return (1); ++} ++ ++/* ++ * The combining_class() function checks on the given bytes and find out ++ * the corresponding Unicode combining class value. The return value 0 means ++ * it is a Starter. Any illegal UTF-8 character will also be treated as ++ * a Starter. ++ */ ++static uchar_t ++combining_class(size_t uv, uchar_t *s, size_t sz) ++{ ++ uint16_t b1 = 0; ++ uint16_t b2 = 0; ++ uint16_t b3 = 0; ++ uint16_t b4 = 0; ++ ++ if (sz == 1 || sz > 4) ++ return (0); ++ ++ if (sz == 2) { ++ b3 = s[0]; ++ b4 = s[1]; ++ } else if (sz == 3) { ++ b2 = s[0]; ++ b3 = s[1]; ++ b4 = s[2]; ++ } else if (sz == 4) { ++ b1 = s[0]; ++ b2 = s[1]; ++ b3 = s[2]; ++ b4 = s[3]; ++ } ++ ++ b1 = u8_common_b1_tbl[uv][b1]; ++ if (b1 == U8_TBL_ELEMENT_NOT_DEF) ++ return (0); ++ ++ b2 = u8_combining_class_b2_tbl[uv][b1][b2]; ++ if (b2 == U8_TBL_ELEMENT_NOT_DEF) ++ return (0); ++ ++ b3 = u8_combining_class_b3_tbl[uv][b2][b3]; ++ if (b3 == U8_TBL_ELEMENT_NOT_DEF) ++ return (0); ++ ++ return (u8_combining_class_b4_tbl[uv][b3][b4]); ++} ++ ++/* ++ * The do_decomp() function finds out a matching decomposition if any ++ * and return. If there is no match, the input bytes are copied and returned. ++ * The function also checks if there is a Hangul, decomposes it if necessary ++ * and returns. ++ * ++ * To save time, a single byte 7-bit ASCII character should be handled by ++ * the caller. ++ * ++ * The function returns the number of bytes returned sans always terminating ++ * the null byte. It will also return a state that will tell if there was ++ * a Hangul character decomposed which then will be used by the caller. ++ */ ++static size_t ++do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, ++ boolean_t canonical_decomposition, u8_normalization_states_t *state) ++{ ++ uint16_t b1 = 0; ++ uint16_t b2 = 0; ++ uint16_t b3 = 0; ++ uint16_t b3_tbl; ++ uint16_t b3_base; ++ uint16_t b4 = 0; ++ size_t start_id; ++ size_t end_id; ++ size_t i; ++ uint32_t u1; ++ ++ if (sz == 2) { ++ b3 = u8s[0] = s[0]; ++ b4 = u8s[1] = s[1]; ++ u8s[2] = '\0'; ++ } else if (sz == 3) { ++ /* Convert it to a Unicode scalar value. */ ++ U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); ++ ++ /* ++ * If this is a Hangul syllable, we decompose it into ++ * a leading consonant, a vowel, and an optional trailing ++ * consonant and then return. 
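++ *
++ * For example, U+D55C (0xED 0x95 0x9C) is 0xAC00 + 18 * 588 + 4, so it
++ * splits into leading consonant U+1112, vowel U+1161, and trailing
++ * consonant U+11AB; below this is written out as the nine UTF-8 bytes
++ * 0xE1 0x84 0x92, 0xE1 0x85 0xA1, 0xE1 0x86 0xAB, the state becomes
++ * U8_STATE_HANGUL_LVT, and 9 is returned.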
++ */ ++ if (U8_HANGUL_SYLLABLE(u1)) { ++ u1 -= U8_HANGUL_SYL_FIRST; ++ ++ b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; ++ b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) ++ / U8_HANGUL_T_COUNT; ++ b3 = u1 % U8_HANGUL_T_COUNT; ++ ++ U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); ++ U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); ++ if (b3) { ++ b3 += U8_HANGUL_JAMO_T_FIRST; ++ U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); ++ ++ u8s[9] = '\0'; ++ *state = U8_STATE_HANGUL_LVT; ++ return (9); ++ } ++ ++ u8s[6] = '\0'; ++ *state = U8_STATE_HANGUL_LV; ++ return (6); ++ } ++ ++ b2 = u8s[0] = s[0]; ++ b3 = u8s[1] = s[1]; ++ b4 = u8s[2] = s[2]; ++ u8s[3] = '\0'; ++ ++ /* ++ * If this is a Hangul Jamo, we know there is nothing ++ * further that we can decompose. ++ */ ++ if (U8_HANGUL_JAMO_L(u1)) { ++ *state = U8_STATE_HANGUL_L; ++ return (3); ++ } ++ ++ if (U8_HANGUL_JAMO_V(u1)) { ++ if (*state == U8_STATE_HANGUL_L) ++ *state = U8_STATE_HANGUL_LV; ++ else ++ *state = U8_STATE_HANGUL_V; ++ return (3); ++ } ++ ++ if (U8_HANGUL_JAMO_T(u1)) { ++ if (*state == U8_STATE_HANGUL_LV) ++ *state = U8_STATE_HANGUL_LVT; ++ else ++ *state = U8_STATE_HANGUL_T; ++ return (3); ++ } ++ } else if (sz == 4) { ++ b1 = u8s[0] = s[0]; ++ b2 = u8s[1] = s[1]; ++ b3 = u8s[2] = s[2]; ++ b4 = u8s[3] = s[3]; ++ u8s[4] = '\0'; ++ } else { ++ /* ++ * This is a fallback and should not happen if the function ++ * was called properly. ++ */ ++ u8s[0] = s[0]; ++ u8s[1] = '\0'; ++ *state = U8_STATE_START; ++ return (1); ++ } ++ ++ /* ++ * At this point, this rountine does not know what it would get. ++ * The caller should sort it out if the state isn't a Hangul one. ++ */ ++ *state = U8_STATE_START; ++ ++ /* Try to find matching decomposition mapping byte sequence. */ ++ b1 = u8_common_b1_tbl[uv][b1]; ++ if (b1 == U8_TBL_ELEMENT_NOT_DEF) ++ return ((size_t)sz); ++ ++ b2 = u8_decomp_b2_tbl[uv][b1][b2]; ++ if (b2 == U8_TBL_ELEMENT_NOT_DEF) ++ return ((size_t)sz); ++ ++ b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; ++ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) ++ return ((size_t)sz); ++ ++ /* ++ * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR ++ * which is 0x8000, this means we couldn't fit the mappings into ++ * the cardinality of a unsigned byte. ++ */ ++ if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { ++ b3_tbl -= U8_16BIT_TABLE_INDICATOR; ++ start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; ++ end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; ++ } else { ++ start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; ++ end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; ++ } ++ ++ /* This also means there wasn't any matching decomposition. */ ++ if (start_id >= end_id) ++ return ((size_t)sz); ++ ++ /* ++ * The final table for decomposition mappings has three types of ++ * byte sequences depending on whether a mapping is for compatibility ++ * decomposition, canonical decomposition, or both like the following: ++ * ++ * (1) Compatibility decomposition mappings: ++ * ++ * +---+---+-...-+---+ ++ * | B0| B1| ... | Bm| ++ * +---+---+-...-+---+ ++ * ++ * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH). ++ * ++ * (2) Canonical decomposition mappings: ++ * ++ * +---+---+---+-...-+---+ ++ * | T | b0| b1| ... | bn| ++ * +---+---+---+-...-+---+ ++ * ++ * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). ++ * ++ * (3) Both mappings: ++ * ++ * +---+---+---+---+-...-+---+---+---+-...-+---+ ++ * | T | D | b0| b1| ... | bn| B0| B1| ... 
| Bm| ++ * +---+---+---+---+-...-+---+---+---+-...-+---+ ++ * ++ * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement ++ * byte, b0 to bn are canonical mapping bytes and B0 to Bm are ++ * compatibility mapping bytes. ++ * ++ * Note that compatibility decomposition means doing recursive ++ * decompositions using both compatibility decomposition mappings and ++ * canonical decomposition mappings. On the other hand, canonical ++ * decomposition means doing recursive decompositions using only ++ * canonical decomposition mappings. Since the table we have has gone ++ * through the recursions already, we do not need to do so during ++ * runtime, i.e., the table has been completely flattened out ++ * already. ++ */ ++ ++ b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; ++ ++ /* Get the type, T, of the byte sequence. */ ++ b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; ++ ++ /* ++ * If necessary, adjust start_id, end_id, or both. Note that if ++ * this is compatibility decomposition mapping, there is no ++ * adjustment. ++ */ ++ if (canonical_decomposition) { ++ /* Is the mapping only for compatibility decomposition? */ ++ if (b1 < U8_DECOMP_BOTH) ++ return ((size_t)sz); ++ ++ start_id++; ++ ++ if (b1 == U8_DECOMP_BOTH) { ++ end_id = start_id + ++ u8_decomp_final_tbl[uv][b3_base + start_id]; ++ start_id++; ++ } ++ } else { ++ /* ++ * Unless this is a compatibility decomposition mapping, ++ * we adjust the start_id. ++ */ ++ if (b1 == U8_DECOMP_BOTH) { ++ start_id++; ++ start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; ++ } else if (b1 == U8_DECOMP_CANONICAL) { ++ start_id++; ++ } ++ } ++ ++ for (i = 0; start_id < end_id; start_id++) ++ u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; ++ u8s[i] = '\0'; ++ ++ return (i); ++} ++ ++/* ++ * The find_composition_start() function uses the character bytes given and ++ * find out the matching composition mappings if any and return the address ++ * to the composition mappings as explained in the do_composition(). ++ */ ++static uchar_t * ++find_composition_start(size_t uv, uchar_t *s, size_t sz) ++{ ++ uint16_t b1 = 0; ++ uint16_t b2 = 0; ++ uint16_t b3 = 0; ++ uint16_t b3_tbl; ++ uint16_t b3_base; ++ uint16_t b4 = 0; ++ size_t start_id; ++ size_t end_id; ++ ++ if (sz == 1) { ++ b4 = s[0]; ++ } else if (sz == 2) { ++ b3 = s[0]; ++ b4 = s[1]; ++ } else if (sz == 3) { ++ b2 = s[0]; ++ b3 = s[1]; ++ b4 = s[2]; ++ } else if (sz == 4) { ++ b1 = s[0]; ++ b2 = s[1]; ++ b3 = s[2]; ++ b4 = s[3]; ++ } else { ++ /* ++ * This is a fallback and should not happen if the function ++ * was called properly. 
++ */ ++ return (NULL); ++ } ++ ++ b1 = u8_composition_b1_tbl[uv][b1]; ++ if (b1 == U8_TBL_ELEMENT_NOT_DEF) ++ return (NULL); ++ ++ b2 = u8_composition_b2_tbl[uv][b1][b2]; ++ if (b2 == U8_TBL_ELEMENT_NOT_DEF) ++ return (NULL); ++ ++ b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; ++ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) ++ return (NULL); ++ ++ if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { ++ b3_tbl -= U8_16BIT_TABLE_INDICATOR; ++ start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; ++ end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; ++ } else { ++ start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; ++ end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; ++ } ++ ++ if (start_id >= end_id) ++ return (NULL); ++ ++ b3_base = u8_composition_b3_tbl[uv][b2][b3].base; ++ ++ return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); ++} ++ ++/* ++ * The blocked() function checks on the combining class values of previous ++ * characters in this sequence and return whether it is blocked or not. ++ */ ++static boolean_t ++blocked(uchar_t *comb_class, size_t last) ++{ ++ uchar_t my_comb_class; ++ size_t i; ++ ++ my_comb_class = comb_class[last]; ++ for (i = 1; i < last; i++) ++ if (comb_class[i] >= my_comb_class || ++ comb_class[i] == U8_COMBINING_CLASS_STARTER) ++ return (B_TRUE); ++ ++ return (B_FALSE); ++} ++ ++/* ++ * The do_composition() reads the character string pointed by 's' and ++ * do necessary canonical composition and then copy over the result back to ++ * the 's'. ++ * ++ * The input argument 's' cannot contain more than 32 characters. ++ */ ++static size_t ++do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, ++ uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) ++{ ++ uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; ++ uchar_t tc[U8_MB_CUR_MAX]; ++ uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; ++ size_t saved_marks_count; ++ uchar_t *p; ++ uchar_t *saved_p; ++ uchar_t *q; ++ size_t i; ++ size_t saved_i; ++ size_t j; ++ size_t k; ++ size_t l; ++ size_t C; ++ size_t saved_l; ++ size_t size; ++ uint32_t u1; ++ uint32_t u2; ++ boolean_t match_not_found = B_TRUE; ++ ++ /* ++ * This should never happen unless the callers are doing some strange ++ * and unexpected things. ++ * ++ * The "last" is the index pointing to the last character not last + 1. ++ */ ++ if (last >= U8_MAX_CHARS_A_SEQ) ++ last = U8_UPPER_LIMIT_IN_A_SEQ; ++ ++ for (i = l = 0; i <= last; i++) { ++ /* ++ * The last or any non-Starters at the beginning, we don't ++ * have any chance to do composition and so we just copy them ++ * to the temporary buffer. ++ */ ++ if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { ++SAVE_THE_CHAR: ++ p = s + start[i]; ++ size = disp[i]; ++ for (k = 0; k < size; k++) ++ t[l++] = *p++; ++ continue; ++ } ++ ++ /* ++ * If this could be a start of Hangul Jamos, then, we try to ++ * conjoin them. 
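++ *
++ * For example, the Jamo run U+1112 U+1161 U+11AB (see the U+D55C
++ * example in do_decomp()) conjoins here back into the single syllable
++ * 0xAC00 + (18 * 21 + 0) * 28 + 4 = U+D55C.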
++ */ ++ if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { ++ U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], ++ s[start[i] + 1], s[start[i] + 2]); ++ U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], ++ s[start[i] + 4], s[start[i] + 5]); ++ ++ if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { ++ u1 -= U8_HANGUL_JAMO_L_FIRST; ++ u2 -= U8_HANGUL_JAMO_V_FIRST; ++ u1 = U8_HANGUL_SYL_FIRST + ++ (u1 * U8_HANGUL_V_COUNT + u2) * ++ U8_HANGUL_T_COUNT; ++ ++ i += 2; ++ if (i <= last) { ++ U8_PUT_3BYTES_INTO_UTF32(u2, ++ s[start[i]], s[start[i] + 1], ++ s[start[i] + 2]); ++ ++ if (U8_HANGUL_JAMO_T(u2)) { ++ u1 += u2 - ++ U8_HANGUL_JAMO_T_FIRST; ++ i++; ++ } ++ } ++ ++ U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); ++ i--; ++ l += 3; ++ continue; ++ } ++ } ++ ++ /* ++ * Let's then find out if this Starter has composition ++ * mapping. ++ */ ++ p = find_composition_start(uv, s + start[i], disp[i]); ++ if (p == NULL) ++ goto SAVE_THE_CHAR; ++ ++ /* ++ * We have a Starter with composition mapping and the next ++ * character is a non-Starter. Let's try to find out if ++ * we can do composition. ++ */ ++ ++ saved_p = p; ++ saved_i = i; ++ saved_l = l; ++ saved_marks_count = 0; ++ ++TRY_THE_NEXT_MARK: ++ q = s + start[++i]; ++ size = disp[i]; ++ ++ /* ++ * The next for() loop compares the non-Starter pointed by ++ * 'q' with the possible (joinable) characters pointed by 'p'. ++ * ++ * The composition final table entry pointed by the 'p' ++ * looks like the following: ++ * ++ * +---+---+---+-...-+---+---+---+---+-...-+---+---+ ++ * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F | ++ * +---+---+---+-...-+---+---+---+---+-...-+---+---+ ++ * ++ * where C is the count byte indicating the number of ++ * mapping pairs where each pair would be look like ++ * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second ++ * character of a canonical decomposition and the B0-Bm are ++ * the bytes of a matching composite character. The F is ++ * a filler byte after each character as the separator. ++ */ ++ ++ match_not_found = B_TRUE; ++ ++ for (C = *p++; C > 0; C--) { ++ for (k = 0; k < size; p++, k++) ++ if (*p != q[k]) ++ break; ++ ++ /* Have we found it? */ ++ if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { ++ match_not_found = B_FALSE; ++ ++ l = saved_l; ++ ++ while (*++p != U8_TBL_ELEMENT_FILLER) ++ t[l++] = *p; ++ ++ break; ++ } ++ ++ /* We didn't find; skip to the next pair. */ ++ if (*p != U8_TBL_ELEMENT_FILLER) ++ while (*++p != U8_TBL_ELEMENT_FILLER) ++ ; ++ while (*++p != U8_TBL_ELEMENT_FILLER) ++ ; ++ p++; ++ } ++ ++ /* ++ * If there was no match, we will need to save the combining ++ * mark for later appending. After that, if the next one ++ * is a non-Starter and not blocked, then, we try once ++ * again to do composition with the next non-Starter. ++ * ++ * If there was no match and this was a Starter, then, ++ * this is a new start. ++ * ++ * If there was a match and a composition done and we have ++ * more to check on, then, we retrieve a new composition final ++ * table entry for the composite and then try to do the ++ * composition again. 
++ */ ++ ++ if (match_not_found) { ++ if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { ++ i--; ++ goto SAVE_THE_CHAR; ++ } ++ ++ saved_marks[saved_marks_count++] = i; ++ } ++ ++ if (saved_l == l) { ++ while (i < last) { ++ if (blocked(comb_class, i + 1)) ++ saved_marks[saved_marks_count++] = ++i; ++ else ++ break; ++ } ++ if (i < last) { ++ p = saved_p; ++ goto TRY_THE_NEXT_MARK; ++ } ++ } else if (i < last) { ++ p = find_composition_start(uv, t + saved_l, ++ l - saved_l); ++ if (p != NULL) { ++ saved_p = p; ++ goto TRY_THE_NEXT_MARK; ++ } ++ } ++ ++ /* ++ * There is no more composition possible. ++ * ++ * If there was no composition what so ever then we copy ++ * over the original Starter and then append any non-Starters ++ * remaining at the target string sequentially after that. ++ */ ++ ++ if (saved_l == l) { ++ p = s + start[saved_i]; ++ size = disp[saved_i]; ++ for (j = 0; j < size; j++) ++ t[l++] = *p++; ++ } ++ ++ for (k = 0; k < saved_marks_count; k++) { ++ p = s + start[saved_marks[k]]; ++ size = disp[saved_marks[k]]; ++ for (j = 0; j < size; j++) ++ t[l++] = *p++; ++ } ++ } ++ ++ /* ++ * If the last character is a Starter and if we have a character ++ * (possibly another Starter) that can be turned into a composite, ++ * we do so and we do so until there is no more of composition ++ * possible. ++ */ ++ if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { ++ p = *os; ++ saved_l = l - disp[last]; ++ ++ while (p < oslast) { ++ size = u8_number_of_bytes[*p]; ++ if (size <= 1 || (p + size) > oslast) ++ break; ++ ++ saved_p = p; ++ ++ for (i = 0; i < size; i++) ++ tc[i] = *p++; ++ ++ q = find_composition_start(uv, t + saved_l, ++ l - saved_l); ++ if (q == NULL) { ++ p = saved_p; ++ break; ++ } ++ ++ match_not_found = B_TRUE; ++ ++ for (C = *q++; C > 0; C--) { ++ for (k = 0; k < size; q++, k++) ++ if (*q != tc[k]) ++ break; ++ ++ if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { ++ match_not_found = B_FALSE; ++ ++ l = saved_l; ++ ++ while (*++q != U8_TBL_ELEMENT_FILLER) { ++ /* ++ * This is practically ++ * impossible but we don't ++ * want to take any chances. ++ */ ++ if (l >= ++ U8_STREAM_SAFE_TEXT_MAX) { ++ p = saved_p; ++ goto SAFE_RETURN; ++ } ++ t[l++] = *q; ++ } ++ ++ break; ++ } ++ ++ if (*q != U8_TBL_ELEMENT_FILLER) ++ while (*++q != U8_TBL_ELEMENT_FILLER) ++ ; ++ while (*++q != U8_TBL_ELEMENT_FILLER) ++ ; ++ q++; ++ } ++ ++ if (match_not_found) { ++ p = saved_p; ++ break; ++ } ++ } ++SAFE_RETURN: ++ *os = p; ++ } ++ ++ /* ++ * Now we copy over the temporary string to the target string. ++ * Since composition always reduces the number of characters or ++ * the number of characters stay, we don't need to worry about ++ * the buffer overflow here. ++ */ ++ for (i = 0; i < l; i++) ++ s[i] = t[i]; ++ s[l] = '\0'; ++ ++ return (l); ++} ++ ++/* ++ * The collect_a_seq() function checks on the given string s, collect ++ * a sequence of characters at u8s, and return the sequence. While it collects ++ * a sequence, it also applies case conversion, canonical or compatibility ++ * decomposition, canonical decomposition, or some or all of them and ++ * in that order. ++ * ++ * The collected sequence cannot be bigger than 32 characters since if ++ * it is having more than 31 characters, the sequence will be terminated ++ * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into ++ * a Stream-Safe Text. The collected sequence is always terminated with ++ * a null byte and the return value is the byte length of the sequence ++ * including 0. 
The return value does not include the terminating ++ * null byte. ++ */ ++static size_t ++collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast, ++ boolean_t is_it_toupper, ++ boolean_t is_it_tolower, ++ boolean_t canonical_decomposition, ++ boolean_t compatibility_decomposition, ++ boolean_t canonical_composition, ++ int *errnum, u8_normalization_states_t *state) ++{ ++ uchar_t *s; ++ int sz; ++ int saved_sz; ++ size_t i; ++ size_t j; ++ size_t k; ++ size_t l; ++ uchar_t comb_class[U8_MAX_CHARS_A_SEQ]; ++ uchar_t disp[U8_MAX_CHARS_A_SEQ]; ++ uchar_t start[U8_MAX_CHARS_A_SEQ]; ++ uchar_t u8t[U8_MB_CUR_MAX]; ++ uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1]; ++ uchar_t tc; ++ size_t last; ++ size_t saved_last; ++ uint32_t u1; ++ ++ /* ++ * Save the source string pointer which we will return a changed ++ * pointer if we do processing. ++ */ ++ s = *source; ++ ++ /* ++ * The following is a fallback for just in case callers are not ++ * checking the string boundaries before the calling. ++ */ ++ if (s >= slast) { ++ u8s[0] = '\0'; ++ ++ return (0); ++ } ++ ++ /* ++ * As the first thing, let's collect a character and do case ++ * conversion if necessary. ++ */ ++ ++ sz = u8_number_of_bytes[*s]; ++ ++ if (sz < 0) { ++ *errnum = EILSEQ; ++ ++ u8s[0] = *s++; ++ u8s[1] = '\0'; ++ ++ *source = s; ++ ++ return (1); ++ } ++ ++ if (sz == 1) { ++ if (is_it_toupper) ++ u8s[0] = U8_ASCII_TOUPPER(*s); ++ else if (is_it_tolower) ++ u8s[0] = U8_ASCII_TOLOWER(*s); ++ else ++ u8s[0] = *s; ++ s++; ++ u8s[1] = '\0'; ++ } else if ((s + sz) > slast) { ++ *errnum = EINVAL; ++ ++ for (i = 0; s < slast; ) ++ u8s[i++] = *s++; ++ u8s[i] = '\0'; ++ ++ *source = s; ++ ++ return (i); ++ } else { ++ if (is_it_toupper || is_it_tolower) { ++ i = do_case_conv(uv, u8s, s, sz, is_it_toupper); ++ s += sz; ++ sz = i; ++ } else { ++ for (i = 0; i < sz; ) ++ u8s[i++] = *s++; ++ u8s[i] = '\0'; ++ } ++ } ++ ++ /* ++ * And then canonical/compatibility decomposition followed by ++ * an optional canonical composition. Please be noted that ++ * canonical composition is done only when a decomposition is ++ * done. ++ */ ++ if (canonical_decomposition || compatibility_decomposition) { ++ if (sz == 1) { ++ *state = U8_STATE_START; ++ ++ saved_sz = 1; ++ ++ comb_class[0] = 0; ++ start[0] = 0; ++ disp[0] = 1; ++ ++ last = 1; ++ } else { ++ saved_sz = do_decomp(uv, u8s, u8s, sz, ++ canonical_decomposition, state); ++ ++ last = 0; ++ ++ for (i = 0; i < saved_sz; ) { ++ sz = u8_number_of_bytes[u8s[i]]; ++ ++ comb_class[last] = combining_class(uv, ++ u8s + i, sz); ++ start[last] = i; ++ disp[last] = sz; ++ ++ last++; ++ i += sz; ++ } ++ ++ /* ++ * Decomposition yields various Hangul related ++ * states but not on combining marks. We need to ++ * find out at here by checking on the last ++ * character. ++ */ ++ if (*state == U8_STATE_START) { ++ if (comb_class[last - 1]) ++ *state = U8_STATE_COMBINING_MARK; ++ } ++ } ++ ++ saved_last = last; ++ ++ while (s < slast) { ++ sz = u8_number_of_bytes[*s]; ++ ++ /* ++ * If this is an illegal character, an incomplete ++ * character, or an 7-bit ASCII Starter character, ++ * then we have collected a sequence; break and let ++ * the next call deal with the two cases. ++ * ++ * Note that this is okay only if you are using this ++ * function with a fixed length string, not on ++ * a buffer with multiple calls of one chunk at a time. 
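++ *
++ * For example, if a caller were to pass 0x65 0xCC 0x81 ('e' followed
++ * by U+0301 COMBINING ACUTE ACCENT) split across two separate calls,
++ * the first call would return the bare 'e' and the second would
++ * return the lone combining mark, so the two would never be
++ * normalized together; passed in one call they are collected as
++ * a single sequence.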
++ */ ++ if (sz <= 1) { ++ break; ++ } else if ((s + sz) > slast) { ++ break; ++ } else { ++ /* ++ * If the previous character was a Hangul Jamo ++ * and this character is a Hangul Jamo that ++ * can be conjoined, we collect the Jamo. ++ */ ++ if (*s == U8_HANGUL_JAMO_1ST_BYTE) { ++ U8_PUT_3BYTES_INTO_UTF32(u1, ++ *s, *(s + 1), *(s + 2)); ++ ++ if (U8_HANGUL_COMPOSABLE_L_V(*state, ++ u1)) { ++ i = 0; ++ *state = U8_STATE_HANGUL_LV; ++ goto COLLECT_A_HANGUL; ++ } ++ ++ if (U8_HANGUL_COMPOSABLE_LV_T(*state, ++ u1)) { ++ i = 0; ++ *state = U8_STATE_HANGUL_LVT; ++ goto COLLECT_A_HANGUL; ++ } ++ } ++ ++ /* ++ * Regardless of whatever it was, if this is ++ * a Starter, we don't collect the character ++ * since that's a new start and we will deal ++ * with it at the next time. ++ */ ++ i = combining_class(uv, s, sz); ++ if (i == U8_COMBINING_CLASS_STARTER) ++ break; ++ ++ /* ++ * We know the current character is a combining ++ * mark. If the previous character wasn't ++ * a Starter (not Hangul) or a combining mark, ++ * then, we don't collect this combining mark. ++ */ ++ if (*state != U8_STATE_START && ++ *state != U8_STATE_COMBINING_MARK) ++ break; ++ ++ *state = U8_STATE_COMBINING_MARK; ++COLLECT_A_HANGUL: ++ /* ++ * If we collected a Starter and combining ++ * marks up to 30, i.e., total 31 characters, ++ * then, we terminate this degenerately long ++ * combining sequence with a U+034F COMBINING ++ * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in ++ * UTF-8 and turn this into a Stream-Safe ++ * Text. This will be extremely rare but ++ * possible. ++ * ++ * The following will also guarantee that ++ * we are not writing more than 32 characters ++ * plus a NULL at u8s[]. ++ */ ++ if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { ++TURN_STREAM_SAFE: ++ *state = U8_STATE_START; ++ comb_class[last] = 0; ++ start[last] = saved_sz; ++ disp[last] = 2; ++ last++; ++ ++ u8s[saved_sz++] = 0xCD; ++ u8s[saved_sz++] = 0x8F; ++ ++ break; ++ } ++ ++ /* ++ * Some combining marks also do decompose into ++ * another combining mark or marks. ++ */ ++ if (*state == U8_STATE_COMBINING_MARK) { ++ k = last; ++ l = sz; ++ i = do_decomp(uv, uts, s, sz, ++ canonical_decomposition, state); ++ for (j = 0; j < i; ) { ++ sz = u8_number_of_bytes[uts[j]]; ++ ++ comb_class[last] = ++ combining_class(uv, ++ uts + j, sz); ++ start[last] = saved_sz + j; ++ disp[last] = sz; ++ ++ last++; ++ if (last >= ++ U8_UPPER_LIMIT_IN_A_SEQ) { ++ last = k; ++ goto TURN_STREAM_SAFE; ++ } ++ j += sz; ++ } ++ ++ *state = U8_STATE_COMBINING_MARK; ++ sz = i; ++ s += l; ++ ++ for (i = 0; i < sz; i++) ++ u8s[saved_sz++] = uts[i]; ++ } else { ++ comb_class[last] = i; ++ start[last] = saved_sz; ++ disp[last] = sz; ++ last++; ++ ++ for (i = 0; i < sz; i++) ++ u8s[saved_sz++] = *s++; ++ } ++ ++ /* ++ * If this is U+0345 COMBINING GREEK ++ * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a., ++ * iota subscript, and need to be converted to ++ * uppercase letter, convert it to U+0399 GREEK ++ * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8), ++ * i.e., convert to capital adscript form as ++ * specified in the Unicode standard. ++ * ++ * This is the only special case of (ambiguous) ++ * case conversion at combining marks and ++ * probably the standard will never have ++ * anything similar like this in future. ++ */ ++ if (is_it_toupper && sz >= 2 && ++ u8s[saved_sz - 2] == 0xCD && ++ u8s[saved_sz - 1] == 0x85) { ++ u8s[saved_sz - 2] = 0xCE; ++ u8s[saved_sz - 1] = 0x99; ++ } ++ } ++ } ++ ++ /* ++ * Let's try to ensure a canonical ordering for the collected ++ * combining marks. 
We do this only if we have collected ++ * at least one more non-Starter. (The decomposition mapping ++ * data tables have fully (and recursively) expanded and ++ * canonically ordered decompositions.) ++ * ++ * The U8_SWAP_COMB_MARKS() convenience macro has some ++ * assumptions and we are meeting the assumptions. ++ */ ++ last--; ++ if (last >= saved_last) { ++ for (i = 0; i < last; i++) ++ for (j = last; j > i; j--) ++ if (comb_class[j] && ++ comb_class[j - 1] > comb_class[j]) { ++ U8_SWAP_COMB_MARKS(j - 1, j); ++ } ++ } ++ ++ *source = s; ++ ++ if (! canonical_composition) { ++ u8s[saved_sz] = '\0'; ++ return (saved_sz); ++ } ++ ++ /* ++ * Now do the canonical composition. Note that we do this ++ * only after a canonical or compatibility decomposition to ++ * finish up NFC or NFKC. ++ */ ++ sz = do_composition(uv, u8s, comb_class, start, disp, last, ++ &s, slast); ++ } ++ ++ *source = s; ++ ++ return ((size_t)sz); ++} ++ ++/* ++ * The do_norm_compare() function does string comparion based on Unicode ++ * simple case mappings and Unicode Normalization definitions. ++ * ++ * It does so by collecting a sequence of character at a time and comparing ++ * the collected sequences from the strings. ++ * ++ * The meanings on the return values are the same as the usual strcmp(). ++ */ ++static int ++do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, ++ int flag, int *errnum) ++{ ++ int result; ++ size_t sz1; ++ size_t sz2; ++ uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1]; ++ uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1]; ++ uchar_t *s1last; ++ uchar_t *s2last; ++ boolean_t is_it_toupper; ++ boolean_t is_it_tolower; ++ boolean_t canonical_decomposition; ++ boolean_t compatibility_decomposition; ++ boolean_t canonical_composition; ++ u8_normalization_states_t state; ++ ++ s1last = s1 + n1; ++ s2last = s2 + n2; ++ ++ is_it_toupper = flag & U8_TEXTPREP_TOUPPER; ++ is_it_tolower = flag & U8_TEXTPREP_TOLOWER; ++ canonical_decomposition = flag & U8_CANON_DECOMP; ++ compatibility_decomposition = flag & U8_COMPAT_DECOMP; ++ canonical_composition = flag & U8_CANON_COMP; ++ ++ while (s1 < s1last && s2 < s2last) { ++ /* ++ * If the current character is a 7-bit ASCII and the last ++ * character, or, if the current character and the next ++ * character are both some 7-bit ASCII characters then ++ * we treat the current character as a sequence. ++ * ++ * In any other cases, we need to call collect_a_seq(). ++ */ ++ ++ if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last || ++ ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) { ++ if (is_it_toupper) ++ u8s1[0] = U8_ASCII_TOUPPER(*s1); ++ else if (is_it_tolower) ++ u8s1[0] = U8_ASCII_TOLOWER(*s1); ++ else ++ u8s1[0] = *s1; ++ u8s1[1] = '\0'; ++ sz1 = 1; ++ s1++; ++ } else { ++ state = U8_STATE_START; ++ sz1 = collect_a_seq(uv, u8s1, &s1, s1last, ++ is_it_toupper, is_it_tolower, ++ canonical_decomposition, ++ compatibility_decomposition, ++ canonical_composition, errnum, &state); ++ } ++ ++ if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last || ++ ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) { ++ if (is_it_toupper) ++ u8s2[0] = U8_ASCII_TOUPPER(*s2); ++ else if (is_it_tolower) ++ u8s2[0] = U8_ASCII_TOLOWER(*s2); ++ else ++ u8s2[0] = *s2; ++ u8s2[1] = '\0'; ++ sz2 = 1; ++ s2++; ++ } else { ++ state = U8_STATE_START; ++ sz2 = collect_a_seq(uv, u8s2, &s2, s2last, ++ is_it_toupper, is_it_tolower, ++ canonical_decomposition, ++ compatibility_decomposition, ++ canonical_composition, errnum, &state); ++ } ++ ++ /* ++ * Now compare the two characters. 
If they are the same, ++ * we move on to the next character sequences. ++ */ ++ if (sz1 == 1 && sz2 == 1) { ++ if (*u8s1 > *u8s2) ++ return (1); ++ if (*u8s1 < *u8s2) ++ return (-1); ++ } else { ++ result = strcmp((const char *)u8s1, (const char *)u8s2); ++ if (result != 0) ++ return (result); ++ } ++ } ++ ++ /* ++ * We compared until the end of either or both strings. ++ * ++ * If we reached to or went over the ends for the both, that means ++ * they are the same. ++ * ++ * If we reached only one end, that means the other string has ++ * something which then can be used to determine the return value. ++ */ ++ if (s1 >= s1last) { ++ if (s2 >= s2last) ++ return (0); ++ return (-1); ++ } ++ return (1); ++} ++ ++/* ++ * The u8_strcmp() function compares two UTF-8 strings quite similar to ++ * the strcmp(). For the comparison, however, Unicode Normalization specific ++ * equivalency and Unicode simple case conversion mappings based equivalency ++ * can be requested and checked against. ++ */ ++int ++u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv, ++ int *errnum) ++{ ++ int f; ++ size_t n1; ++ size_t n2; ++ ++ *errnum = 0; ++ ++ /* ++ * Check on the requested Unicode version, case conversion, and ++ * normalization flag values. ++ */ ++ ++ if (uv > U8_UNICODE_LATEST) { ++ *errnum = ERANGE; ++ uv = U8_UNICODE_LATEST; ++ } ++ ++ if (flag == 0) { ++ flag = U8_STRCMP_CS; ++ } else { ++ f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER | ++ U8_STRCMP_CI_LOWER); ++ if (f == 0) { ++ flag |= U8_STRCMP_CS; ++ } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER && ++ f != U8_STRCMP_CI_LOWER) { ++ *errnum = EBADF; ++ flag = U8_STRCMP_CS; ++ } ++ ++ f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); ++ if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC && ++ f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) { ++ *errnum = EBADF; ++ flag = U8_STRCMP_CS; ++ } ++ } ++ ++ if (flag == U8_STRCMP_CS) { ++ return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n)); ++ } ++ ++ n1 = strlen(s1); ++ n2 = strlen(s2); ++ if (n != 0) { ++ if (n < n1) ++ n1 = n; ++ if (n < n2) ++ n2 = n; ++ } ++ ++ /* ++ * Simple case conversion can be done much faster and so we do ++ * them separately here. 
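++ *
++ * For example, u8_strcmp("\xC3\x81", "\xC3\xA1", 0, U8_STRCMP_CI_UPPER,
++ * U8_UNICODE_LATEST, &errnum) compares U+00C1 with U+00E1 by mapping
++ * both to their uppercase forms and returns 0 without touching the
++ * normalization code; comparing the decomposed "e\xCC\x81" with the
++ * precomposed "\xC3\xA9" requires U8_STRCMP_NFC (or U8_STRCMP_NFD)
++ * instead and goes through do_norm_compare() below.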
++ */ ++ if (flag == U8_STRCMP_CI_UPPER) { ++ return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, ++ n1, n2, B_TRUE, errnum)); ++ } else if (flag == U8_STRCMP_CI_LOWER) { ++ return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, ++ n1, n2, B_FALSE, errnum)); ++ } ++ ++ return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, ++ flag, errnum)); ++} ++ ++size_t ++u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, ++ int flag, size_t unicode_version, int *errnum) ++{ ++ int f; ++ int sz; ++ uchar_t *ib; ++ uchar_t *ibtail; ++ uchar_t *ob; ++ uchar_t *obtail; ++ boolean_t do_not_ignore_null; ++ boolean_t do_not_ignore_invalid; ++ boolean_t is_it_toupper; ++ boolean_t is_it_tolower; ++ boolean_t canonical_decomposition; ++ boolean_t compatibility_decomposition; ++ boolean_t canonical_composition; ++ size_t ret_val; ++ size_t i; ++ size_t j; ++ uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1]; ++ u8_normalization_states_t state; ++ ++ if (unicode_version > U8_UNICODE_LATEST) { ++ *errnum = ERANGE; ++ return ((size_t)-1); ++ } ++ ++ f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER); ++ if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) { ++ *errnum = EBADF; ++ return ((size_t)-1); ++ } ++ ++ f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); ++ if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC && ++ f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) { ++ *errnum = EBADF; ++ return ((size_t)-1); ++ } ++ ++ if (inarray == NULL || *inlen == 0) ++ return (0); ++ ++ if (outarray == NULL) { ++ *errnum = E2BIG; ++ return ((size_t)-1); ++ } ++ ++ ib = (uchar_t *)inarray; ++ ob = (uchar_t *)outarray; ++ ibtail = ib + *inlen; ++ obtail = ob + *outlen; ++ ++ do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL); ++ do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID); ++ is_it_toupper = flag & U8_TEXTPREP_TOUPPER; ++ is_it_tolower = flag & U8_TEXTPREP_TOLOWER; ++ ++ ret_val = 0; ++ ++ /* ++ * If we don't have a normalization flag set, we do the simple case ++ * conversion based text preparation separately below. Text ++ * preparation involving Normalization will be done in the false task ++ * block, again, separately since it will take much more time and ++ * resource than doing simple case conversions. ++ */ ++ if (f == 0) { ++ while (ib < ibtail) { ++ if (*ib == '\0' && do_not_ignore_null) ++ break; ++ ++ sz = u8_number_of_bytes[*ib]; ++ ++ if (sz < 0) { ++ if (do_not_ignore_invalid) { ++ *errnum = EILSEQ; ++ ret_val = (size_t)-1; ++ break; ++ } ++ ++ sz = 1; ++ ret_val++; ++ } ++ ++ if (sz == 1) { ++ if (ob >= obtail) { ++ *errnum = E2BIG; ++ ret_val = (size_t)-1; ++ break; ++ } ++ ++ if (is_it_toupper) ++ *ob = U8_ASCII_TOUPPER(*ib); ++ else if (is_it_tolower) ++ *ob = U8_ASCII_TOLOWER(*ib); ++ else ++ *ob = *ib; ++ ib++; ++ ob++; ++ } else if ((ib + sz) > ibtail) { ++ if (do_not_ignore_invalid) { ++ *errnum = EINVAL; ++ ret_val = (size_t)-1; ++ break; ++ } ++ ++ if ((obtail - ob) < (ibtail - ib)) { ++ *errnum = E2BIG; ++ ret_val = (size_t)-1; ++ break; ++ } ++ ++ /* ++ * We treat the remaining incomplete character ++ * bytes as a character. 
++ */ ++ ret_val++; ++ ++ while (ib < ibtail) ++ *ob++ = *ib++; ++ } else { ++ if (is_it_toupper || is_it_tolower) { ++ i = do_case_conv(unicode_version, u8s, ++ ib, sz, is_it_toupper); ++ ++ if ((obtail - ob) < i) { ++ *errnum = E2BIG; ++ ret_val = (size_t)-1; ++ break; ++ } ++ ++ ib += sz; ++ ++ for (sz = 0; sz < i; sz++) ++ *ob++ = u8s[sz]; ++ } else { ++ if ((obtail - ob) < sz) { ++ *errnum = E2BIG; ++ ret_val = (size_t)-1; ++ break; ++ } ++ ++ for (i = 0; i < sz; i++) ++ *ob++ = *ib++; ++ } ++ } ++ } ++ } else { ++ canonical_decomposition = flag & U8_CANON_DECOMP; ++ compatibility_decomposition = flag & U8_COMPAT_DECOMP; ++ canonical_composition = flag & U8_CANON_COMP; ++ ++ while (ib < ibtail) { ++ if (*ib == '\0' && do_not_ignore_null) ++ break; ++ ++ /* ++ * If the current character is a 7-bit ASCII ++ * character and it is the last character, or, ++ * if the current character is a 7-bit ASCII ++ * character and the next character is also a 7-bit ++ * ASCII character, then, we copy over this ++ * character without going through collect_a_seq(). ++ * ++ * In any other cases, we need to look further with ++ * the collect_a_seq() function. ++ */ ++ if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail || ++ ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) { ++ if (ob >= obtail) { ++ *errnum = E2BIG; ++ ret_val = (size_t)-1; ++ break; ++ } ++ ++ if (is_it_toupper) ++ *ob = U8_ASCII_TOUPPER(*ib); ++ else if (is_it_tolower) ++ *ob = U8_ASCII_TOLOWER(*ib); ++ else ++ *ob = *ib; ++ ib++; ++ ob++; ++ } else { ++ *errnum = 0; ++ state = U8_STATE_START; ++ ++ j = collect_a_seq(unicode_version, u8s, ++ &ib, ibtail, ++ is_it_toupper, ++ is_it_tolower, ++ canonical_decomposition, ++ compatibility_decomposition, ++ canonical_composition, ++ errnum, &state); ++ ++ if (*errnum && do_not_ignore_invalid) { ++ ret_val = (size_t)-1; ++ break; ++ } ++ ++ if ((obtail - ob) < j) { ++ *errnum = E2BIG; ++ ret_val = (size_t)-1; ++ break; ++ } ++ ++ for (i = 0; i < j; i++) ++ *ob++ = u8s[i]; ++ } ++ } ++ } ++ ++ *inlen = ibtail - ib; ++ *outlen = obtail - ob; ++ ++ return (ret_val); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++ ++static int unicode_init(void) { return 0; } ++static int unicode_fini(void) { return 0; } ++ ++spl_module_init(unicode_init); ++spl_module_exit(unicode_fini); ++ ++MODULE_DESCRIPTION("Unicode implementation"); ++MODULE_AUTHOR(ZFS_META_AUTHOR); ++MODULE_LICENSE(ZFS_META_LICENSE); ++ ++EXPORT_SYMBOL(u8_validate); ++EXPORT_SYMBOL(u8_strcmp); ++EXPORT_SYMBOL(u8_textprep_str); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/unicode/uconv.c linux-3.2.33-go/fs/zfs/unicode/uconv.c +--- linux-3.2.33-go.orig/fs/zfs/unicode/uconv.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/unicode/uconv.c 2012-11-16 23:25:34.355039267 +0100 +@@ -0,0 +1,864 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++ ++ ++/* ++ * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32. ++ * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517) ++ * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F), ++ * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also ++ * the section 3C man pages. ++ * Interface stability: Committed ++ */ ++ ++#include ++#ifdef _KERNEL ++#include ++#include ++#include ++#include ++#include ++#include ++#else ++#include ++#endif /* _KERNEL */ ++#include ++#include ++ ++ ++/* ++ * The max and min values of high and low surrogate pairs of UTF-16, ++ * UTF-16 bit shift value, bit mask, and starting value outside of BMP. ++ */ ++#define UCONV_U16_HI_MIN (0xd800U) ++#define UCONV_U16_HI_MAX (0xdbffU) ++#define UCONV_U16_LO_MIN (0xdc00U) ++#define UCONV_U16_LO_MAX (0xdfffU) ++#define UCONV_U16_BIT_SHIFT (0x0400U) ++#define UCONV_U16_BIT_MASK (0x0fffffU) ++#define UCONV_U16_START (0x010000U) ++ ++/* The maximum value of Unicode coding space and ASCII coding space. */ ++#define UCONV_UNICODE_MAX (0x10ffffU) ++#define UCONV_ASCII_MAX (0x7fU) ++ ++/* The mask values for input and output endians. */ ++#define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN) ++#define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN) ++ ++/* Native and reversed endian macros. */ ++#ifdef _BIG_ENDIAN ++#define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN ++#define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN ++#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN ++#define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN ++#else ++#define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN ++#define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN ++#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN ++#define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN ++#endif /* _BIG_ENDIAN */ ++ ++/* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */ ++#define UCONV_BOM_NORMAL (0xfeffU) ++#define UCONV_BOM_SWAPPED (0xfffeU) ++#define UCONV_BOM_SWAPPED_32 (0xfffe0000U) ++ ++/* UTF-32 boundaries based on UTF-8 character byte lengths. */ ++#define UCONV_U8_ONE_BYTE (0x7fU) ++#define UCONV_U8_TWO_BYTES (0x7ffU) ++#define UCONV_U8_THREE_BYTES (0xffffU) ++#define UCONV_U8_FOUR_BYTES (0x10ffffU) ++ ++/* The common minimum and maximum values at the UTF-8 character bytes. */ ++#define UCONV_U8_BYTE_MIN (0x80U) ++#define UCONV_U8_BYTE_MAX (0xbfU) ++ ++/* ++ * The following "6" and "0x3f" came from "10xx xxxx" bit representation of ++ * UTF-8 character bytes. ++ */ ++#define UCONV_U8_BIT_SHIFT 6 ++#define UCONV_U8_BIT_MASK 0x3f ++ ++/* ++ * The following vector shows remaining bytes in a UTF-8 character. ++ * Index will be the first byte of the character. 
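++ *
++ * For example (reading the table below): a lead byte of 0xc3 maps to
++ * 1 remaining byte, 0xe2 maps to 2, 0xf0 maps to 3, and any 7-bit
++ * ASCII lead byte maps to 0.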
++ */ ++static const uchar_t remaining_bytes_tbl[0x100] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ ++/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ ++ 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ ++/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ ++/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ ++ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ++ ++/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ ++ 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ++}; ++ ++/* ++ * The following is a vector of bit-masks to get used bits in ++ * the first byte of a UTF-8 character. Index is remaining bytes at above of ++ * the character. ++ */ ++#ifdef _KERNEL ++const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; ++#else ++static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; ++#endif /* _KERNEL */ ++ ++/* ++ * The following two vectors are to provide valid minimum and ++ * maximum values for the 2'nd byte of a multibyte UTF-8 character for ++ * better illegal sequence checking. The index value must be the value of ++ * the first byte of the UTF-8 character. ++ */ ++static const uchar_t valid_min_2nd_byte[0x100] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ ++/* C0 C1 C2 C3 C4 C5 C6 C7 */ ++ 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++ ++/* C8 C9 CA CB CC CD CE CF */ ++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++ ++/* D0 D1 D2 D3 D4 D5 D6 D7 */ ++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++ ++/* D8 D9 DA DB DC DD DE DF */ ++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++ ++/* E0 E1 E2 E3 E4 E5 E6 E7 */ ++ 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++ ++/* E8 E9 EA EB EC ED EE EF */ ++ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ++ ++/* F0 F1 F2 F3 F4 F5 F6 F7 */ ++ 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, ++ ++ 0, 0, 0, 0, 0, 0, 0, 0 ++}; ++ ++static const uchar_t valid_max_2nd_byte[0x100] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 
0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ ++/* C0 C1 C2 C3 C4 C5 C6 C7 */ ++ 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, ++ ++/* C8 C9 CA CB CC CD CE CF */ ++ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, ++ ++/* D0 D1 D2 D3 D4 D5 D6 D7 */ ++ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, ++ ++/* D8 D9 DA DB DC DD DE DF */ ++ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, ++ ++/* E0 E1 E2 E3 E4 E5 E6 E7 */ ++ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, ++ ++/* E8 E9 EA EB EC ED EE EF */ ++ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, ++ ++/* F0 F1 F2 F3 F4 F5 F6 F7 */ ++ 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, ++ ++ 0, 0, 0, 0, 0, 0, 0, 0 ++}; ++ ++ ++static int ++check_endian(int flag, int *in, int *out) ++{ ++ *in = flag & UCONV_IN_ENDIAN_MASKS; ++ ++ /* You cannot have both. */ ++ if (*in == UCONV_IN_ENDIAN_MASKS) ++ return (EBADF); ++ ++ if (*in == 0) ++ *in = UCONV_IN_NAT_ENDIAN; ++ ++ *out = flag & UCONV_OUT_ENDIAN_MASKS; ++ ++ /* You cannot have both. */ ++ if (*out == UCONV_OUT_ENDIAN_MASKS) ++ return (EBADF); ++ ++ if (*out == 0) ++ *out = UCONV_OUT_NAT_ENDIAN; ++ ++ return (0); ++} ++ ++static boolean_t ++check_bom16(const uint16_t *u16s, size_t u16l, int *in) ++{ ++ if (u16l > 0) { ++ if (*u16s == UCONV_BOM_NORMAL) { ++ *in = UCONV_IN_NAT_ENDIAN; ++ return (B_TRUE); ++ } ++ if (*u16s == UCONV_BOM_SWAPPED) { ++ *in = UCONV_IN_REV_ENDIAN; ++ return (B_TRUE); ++ } ++ } ++ ++ return (B_FALSE); ++} ++ ++static boolean_t ++check_bom32(const uint32_t *u32s, size_t u32l, int *in) ++{ ++ if (u32l > 0) { ++ if (*u32s == UCONV_BOM_NORMAL) { ++ *in = UCONV_IN_NAT_ENDIAN; ++ return (B_TRUE); ++ } ++ if (*u32s == UCONV_BOM_SWAPPED_32) { ++ *in = UCONV_IN_REV_ENDIAN; ++ return (B_TRUE); ++ } ++ } ++ ++ return (B_FALSE); ++} ++ ++int ++uconv_u16tou32(const uint16_t *u16s, size_t *utf16len, ++ uint32_t *u32s, size_t *utf32len, int flag) ++{ ++ int inendian; ++ int outendian; ++ size_t u16l; ++ size_t u32l; ++ uint32_t hi; ++ uint32_t lo; ++ boolean_t do_not_ignore_null; ++ ++ /* ++ * Do preliminary validity checks on parameters and collect info on ++ * endians. ++ */ ++ if (u16s == NULL || utf16len == NULL) ++ return (EILSEQ); ++ ++ if (u32s == NULL || utf32len == NULL) ++ return (E2BIG); ++ ++ if (check_endian(flag, &inendian, &outendian) != 0) ++ return (EBADF); ++ ++ /* ++ * Initialize input and output parameter buffer indices and ++ * temporary variables. ++ */ ++ u16l = u32l = 0; ++ hi = 0; ++ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); ++ ++ /* ++ * Check on the BOM at the beginning of the input buffer if required ++ * and if there is indeed one, process it. ++ */ ++ if ((flag & UCONV_IN_ACCEPT_BOM) && ++ check_bom16(u16s, *utf16len, &inendian)) ++ u16l++; ++ ++ /* ++ * Reset inendian and outendian so that after this point, those can be ++ * used as condition values. ++ */ ++ inendian &= UCONV_IN_NAT_ENDIAN; ++ outendian &= UCONV_OUT_NAT_ENDIAN; ++ ++ /* ++ * If there is something in the input buffer and if necessary and ++ * requested, save the BOM at the output buffer. ++ */ ++ if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) ++ u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : ++ UCONV_BOM_SWAPPED_32; ++ ++ /* ++ * Do conversion; if encounter a surrogate pair, assemble high and ++ * low pair values to form a UTF-32 character. 
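++ * For example, the surrogate pair 0xd83d 0xde00 assembles to
++ * ((0xd83d - 0xd800) * 0x400 + (0xde00 - 0xdc00)) + 0x10000 = 0x1f600.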
If a half of a pair ++ * exists alone, then, either it is an illegal (EILSEQ) or ++ * invalid (EINVAL) value. ++ */ ++ for (; u16l < *utf16len; u16l++) { ++ if (u16s[u16l] == 0 && do_not_ignore_null) ++ break; ++ ++ lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); ++ ++ if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { ++ if (hi) ++ return (EILSEQ); ++ hi = lo; ++ continue; ++ } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { ++ if (! hi) ++ return (EILSEQ); ++ lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + ++ lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) ++ + UCONV_U16_START; ++ hi = 0; ++ } else if (hi) { ++ return (EILSEQ); ++ } ++ ++ if (u32l >= *utf32len) ++ return (E2BIG); ++ ++ u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo); ++ } ++ ++ /* ++ * If high half didn't see low half, then, it's most likely the input ++ * parameter is incomplete. ++ */ ++ if (hi) ++ return (EINVAL); ++ ++ /* ++ * Save the number of consumed and saved characters. They do not ++ * include terminating NULL character (U+0000) at the end of ++ * the input buffer (even when UCONV_IGNORE_NULL isn't specified and ++ * the input buffer length is big enough to include the terminating ++ * NULL character). ++ */ ++ *utf16len = u16l; ++ *utf32len = u32l; ++ ++ return (0); ++} ++ ++int ++uconv_u16tou8(const uint16_t *u16s, size_t *utf16len, ++ uchar_t *u8s, size_t *utf8len, int flag) ++{ ++ int inendian; ++ int outendian; ++ size_t u16l; ++ size_t u8l; ++ uint32_t hi; ++ uint32_t lo; ++ boolean_t do_not_ignore_null; ++ ++ if (u16s == NULL || utf16len == NULL) ++ return (EILSEQ); ++ ++ if (u8s == NULL || utf8len == NULL) ++ return (E2BIG); ++ ++ if (check_endian(flag, &inendian, &outendian) != 0) ++ return (EBADF); ++ ++ u16l = u8l = 0; ++ hi = 0; ++ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); ++ ++ if ((flag & UCONV_IN_ACCEPT_BOM) && ++ check_bom16(u16s, *utf16len, &inendian)) ++ u16l++; ++ ++ inendian &= UCONV_IN_NAT_ENDIAN; ++ ++ for (; u16l < *utf16len; u16l++) { ++ if (u16s[u16l] == 0 && do_not_ignore_null) ++ break; ++ ++ lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); ++ ++ if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { ++ if (hi) ++ return (EILSEQ); ++ hi = lo; ++ continue; ++ } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { ++ if (! hi) ++ return (EILSEQ); ++ lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + ++ lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) ++ + UCONV_U16_START; ++ hi = 0; ++ } else if (hi) { ++ return (EILSEQ); ++ } ++ ++ /* ++ * Now we convert a UTF-32 character into a UTF-8 character. ++ * Unicode coding space is between U+0000 and U+10FFFF; ++ * anything bigger is an illegal character. 
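++ * For example, U+00E9 falls in the two-byte range below and encodes
++ * as 0xc0 | ((0xe9 & 0x07c0) >> 6) = 0xc3 and 0x80 | (0xe9 & 0x003f)
++ * = 0xa9, i.e. the byte sequence 0xc3 0xa9.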
++ */ ++ if (lo <= UCONV_U8_ONE_BYTE) { ++ if (u8l >= *utf8len) ++ return (E2BIG); ++ u8s[u8l++] = (uchar_t)lo; ++ } else if (lo <= UCONV_U8_TWO_BYTES) { ++ if ((u8l + 1) >= *utf8len) ++ return (E2BIG); ++ u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); ++ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); ++ } else if (lo <= UCONV_U8_THREE_BYTES) { ++ if ((u8l + 2) >= *utf8len) ++ return (E2BIG); ++ u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); ++ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); ++ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); ++ } else if (lo <= UCONV_U8_FOUR_BYTES) { ++ if ((u8l + 3) >= *utf8len) ++ return (E2BIG); ++ u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); ++ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); ++ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); ++ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); ++ } else { ++ return (EILSEQ); ++ } ++ } ++ ++ if (hi) ++ return (EINVAL); ++ ++ *utf16len = u16l; ++ *utf8len = u8l; ++ ++ return (0); ++} ++ ++int ++uconv_u32tou16(const uint32_t *u32s, size_t *utf32len, ++ uint16_t *u16s, size_t *utf16len, int flag) ++{ ++ int inendian; ++ int outendian; ++ size_t u16l; ++ size_t u32l; ++ uint32_t hi; ++ uint32_t lo; ++ boolean_t do_not_ignore_null; ++ ++ if (u32s == NULL || utf32len == NULL) ++ return (EILSEQ); ++ ++ if (u16s == NULL || utf16len == NULL) ++ return (E2BIG); ++ ++ if (check_endian(flag, &inendian, &outendian) != 0) ++ return (EBADF); ++ ++ u16l = u32l = 0; ++ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); ++ ++ if ((flag & UCONV_IN_ACCEPT_BOM) && ++ check_bom32(u32s, *utf32len, &inendian)) ++ u32l++; ++ ++ inendian &= UCONV_IN_NAT_ENDIAN; ++ outendian &= UCONV_OUT_NAT_ENDIAN; ++ ++ if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) ++ u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : ++ UCONV_BOM_SWAPPED; ++ ++ for (; u32l < *utf32len; u32l++) { ++ if (u32s[u32l] == 0 && do_not_ignore_null) ++ break; ++ ++ hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); ++ ++ /* ++ * Anything bigger than the Unicode coding space, i.e., ++ * Unicode scalar value bigger than U+10FFFF, is an illegal ++ * character. ++ */ ++ if (hi > UCONV_UNICODE_MAX) ++ return (EILSEQ); ++ ++ /* ++ * Anything bigger than U+FFFF must be converted into ++ * a surrogate pair in UTF-16. ++ */ ++ if (hi >= UCONV_U16_START) { ++ lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + ++ UCONV_U16_LO_MIN; ++ hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + ++ UCONV_U16_HI_MIN; ++ ++ if ((u16l + 1) >= *utf16len) ++ return (E2BIG); ++ ++ if (outendian) { ++ u16s[u16l++] = (uint16_t)hi; ++ u16s[u16l++] = (uint16_t)lo; ++ } else { ++ u16s[u16l++] = BSWAP_16(((uint16_t)hi)); ++ u16s[u16l++] = BSWAP_16(((uint16_t)lo)); ++ } ++ } else { ++ if (u16l >= *utf16len) ++ return (E2BIG); ++ u16s[u16l++] = (outendian) ? 
(uint16_t)hi : ++ BSWAP_16(((uint16_t)hi)); ++ } ++ } ++ ++ *utf16len = u16l; ++ *utf32len = u32l; ++ ++ return (0); ++} ++ ++int ++uconv_u32tou8(const uint32_t *u32s, size_t *utf32len, ++ uchar_t *u8s, size_t *utf8len, int flag) ++{ ++ int inendian; ++ int outendian; ++ size_t u32l; ++ size_t u8l; ++ uint32_t lo; ++ boolean_t do_not_ignore_null; ++ ++ if (u32s == NULL || utf32len == NULL) ++ return (EILSEQ); ++ ++ if (u8s == NULL || utf8len == NULL) ++ return (E2BIG); ++ ++ if (check_endian(flag, &inendian, &outendian) != 0) ++ return (EBADF); ++ ++ u32l = u8l = 0; ++ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); ++ ++ if ((flag & UCONV_IN_ACCEPT_BOM) && ++ check_bom32(u32s, *utf32len, &inendian)) ++ u32l++; ++ ++ inendian &= UCONV_IN_NAT_ENDIAN; ++ ++ for (; u32l < *utf32len; u32l++) { ++ if (u32s[u32l] == 0 && do_not_ignore_null) ++ break; ++ ++ lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); ++ ++ if (lo <= UCONV_U8_ONE_BYTE) { ++ if (u8l >= *utf8len) ++ return (E2BIG); ++ u8s[u8l++] = (uchar_t)lo; ++ } else if (lo <= UCONV_U8_TWO_BYTES) { ++ if ((u8l + 1) >= *utf8len) ++ return (E2BIG); ++ u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); ++ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); ++ } else if (lo <= UCONV_U8_THREE_BYTES) { ++ if ((u8l + 2) >= *utf8len) ++ return (E2BIG); ++ u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); ++ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); ++ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); ++ } else if (lo <= UCONV_U8_FOUR_BYTES) { ++ if ((u8l + 3) >= *utf8len) ++ return (E2BIG); ++ u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); ++ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); ++ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); ++ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); ++ } else { ++ return (EILSEQ); ++ } ++ } ++ ++ *utf32len = u32l; ++ *utf8len = u8l; ++ ++ return (0); ++} ++ ++int ++uconv_u8tou16(const uchar_t *u8s, size_t *utf8len, ++ uint16_t *u16s, size_t *utf16len, int flag) ++{ ++ int inendian; ++ int outendian; ++ size_t u16l; ++ size_t u8l; ++ uint32_t hi; ++ uint32_t lo; ++ int remaining_bytes; ++ int first_b; ++ boolean_t do_not_ignore_null; ++ ++ if (u8s == NULL || utf8len == NULL) ++ return (EILSEQ); ++ ++ if (u16s == NULL || utf16len == NULL) ++ return (E2BIG); ++ ++ if (check_endian(flag, &inendian, &outendian) != 0) ++ return (EBADF); ++ ++ u16l = u8l = 0; ++ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); ++ ++ outendian &= UCONV_OUT_NAT_ENDIAN; ++ ++ if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) ++ u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : ++ UCONV_BOM_SWAPPED; ++ ++ for (; u8l < *utf8len; ) { ++ if (u8s[u8l] == 0 && do_not_ignore_null) ++ break; ++ ++ /* ++ * Collect a UTF-8 character and convert it to a UTF-32 ++ * character. In doing so, we screen out illegally formed ++ * UTF-8 characters and treat such as illegal characters. ++ * The algorithm at below also screens out anything bigger ++ * than the U+10FFFF. ++ * ++ * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for ++ * more details on the illegal values of UTF-8 character ++ * bytes. ++ */ ++ hi = (uint32_t)u8s[u8l++]; ++ ++ if (hi > UCONV_ASCII_MAX) { ++ if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) ++ return (EILSEQ); ++ ++ first_b = hi; ++ hi = hi & u8_masks_tbl[remaining_bytes]; ++ ++ for (; remaining_bytes > 0; remaining_bytes--) { ++ /* ++ * If we have no more bytes, the current ++ * UTF-8 character is incomplete. 
++ */ ++ if (u8l >= *utf8len) ++ return (EINVAL); ++ ++ lo = (uint32_t)u8s[u8l++]; ++ ++ if (first_b) { ++ if (lo < valid_min_2nd_byte[first_b] || ++ lo > valid_max_2nd_byte[first_b]) ++ return (EILSEQ); ++ first_b = 0; ++ } else if (lo < UCONV_U8_BYTE_MIN || ++ lo > UCONV_U8_BYTE_MAX) { ++ return (EILSEQ); ++ } ++ hi = (hi << UCONV_U8_BIT_SHIFT) | ++ (lo & UCONV_U8_BIT_MASK); ++ } ++ } ++ ++ if (hi >= UCONV_U16_START) { ++ lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + ++ UCONV_U16_LO_MIN; ++ hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + ++ UCONV_U16_HI_MIN; ++ ++ if ((u16l + 1) >= *utf16len) ++ return (E2BIG); ++ ++ if (outendian) { ++ u16s[u16l++] = (uint16_t)hi; ++ u16s[u16l++] = (uint16_t)lo; ++ } else { ++ u16s[u16l++] = BSWAP_16(((uint16_t)hi)); ++ u16s[u16l++] = BSWAP_16(((uint16_t)lo)); ++ } ++ } else { ++ if (u16l >= *utf16len) ++ return (E2BIG); ++ ++ u16s[u16l++] = (outendian) ? (uint16_t)hi : ++ BSWAP_16(((uint16_t)hi)); ++ } ++ } ++ ++ *utf16len = u16l; ++ *utf8len = u8l; ++ ++ return (0); ++} ++ ++int ++uconv_u8tou32(const uchar_t *u8s, size_t *utf8len, ++ uint32_t *u32s, size_t *utf32len, int flag) ++{ ++ int inendian; ++ int outendian; ++ size_t u32l; ++ size_t u8l; ++ uint32_t hi; ++ uint32_t c; ++ int remaining_bytes; ++ int first_b; ++ boolean_t do_not_ignore_null; ++ ++ if (u8s == NULL || utf8len == NULL) ++ return (EILSEQ); ++ ++ if (u32s == NULL || utf32len == NULL) ++ return (E2BIG); ++ ++ if (check_endian(flag, &inendian, &outendian) != 0) ++ return (EBADF); ++ ++ u32l = u8l = 0; ++ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); ++ ++ outendian &= UCONV_OUT_NAT_ENDIAN; ++ ++ if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) ++ u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : ++ UCONV_BOM_SWAPPED_32; ++ ++ for (; u8l < *utf8len; ) { ++ if (u8s[u8l] == 0 && do_not_ignore_null) ++ break; ++ ++ hi = (uint32_t)u8s[u8l++]; ++ ++ if (hi > UCONV_ASCII_MAX) { ++ if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) ++ return (EILSEQ); ++ ++ first_b = hi; ++ hi = hi & u8_masks_tbl[remaining_bytes]; ++ ++ for (; remaining_bytes > 0; remaining_bytes--) { ++ if (u8l >= *utf8len) ++ return (EINVAL); ++ ++ c = (uint32_t)u8s[u8l++]; ++ ++ if (first_b) { ++ if (c < valid_min_2nd_byte[first_b] || ++ c > valid_max_2nd_byte[first_b]) ++ return (EILSEQ); ++ first_b = 0; ++ } else if (c < UCONV_U8_BYTE_MIN || ++ c > UCONV_U8_BYTE_MAX) { ++ return (EILSEQ); ++ } ++ hi = (hi << UCONV_U8_BIT_SHIFT) | ++ (c & UCONV_U8_BIT_MASK); ++ } ++ } ++ ++ if (u32l >= *utf32len) ++ return (E2BIG); ++ ++ u32s[u32l++] = (outendian) ? 
hi : BSWAP_32(hi); ++ } ++ ++ *utf32len = u32l; ++ *utf8len = u8l; ++ ++ return (0); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(uconv_u16tou32); ++EXPORT_SYMBOL(uconv_u16tou8); ++EXPORT_SYMBOL(uconv_u32tou16); ++EXPORT_SYMBOL(uconv_u32tou8); ++EXPORT_SYMBOL(uconv_u8tou16); ++EXPORT_SYMBOL(uconv_u8tou32); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zcommon/Makefile linux-3.2.33-go/fs/zfs/zcommon/Makefile +--- linux-3.2.33-go.orig/fs/zfs/zcommon/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zcommon/Makefile 2012-11-16 23:25:34.364039163 +0100 +@@ -0,0 +1,14 @@ ++MODULE := zcommon ++ ++EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) -Wno-unused-but-set-variable -DHAVE_SPL -D_KERNEL -DTEXT_DOMAIN=\"zfs-linux-kernel\" -DNDEBUG ++ ++obj-$(CONFIG_ZFS) := $(MODULE).o ++ ++$(MODULE)-objs += zfs_deleg.o ++$(MODULE)-objs += zfs_prop.o ++$(MODULE)-objs += zprop_common.o ++$(MODULE)-objs += zfs_namecheck.o ++$(MODULE)-objs += zfs_comutil.o ++$(MODULE)-objs += zfs_fletcher.o ++$(MODULE)-objs += zfs_uio.o ++$(MODULE)-objs += zpool_prop.o +diff -uNr linux-3.2.33-go.orig/fs/zfs/zcommon/Makefile.in linux-3.2.33-go/fs/zfs/zcommon/Makefile.in +--- linux-3.2.33-go.orig/fs/zfs/zcommon/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zcommon/Makefile.in 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,14 @@ ++MODULE := zcommon ++ ++EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ ++ ++obj-$(CONFIG_ZFS) := $(MODULE).o ++ ++$(MODULE)-objs += @top_srcdir@/module/zcommon/zfs_deleg.o ++$(MODULE)-objs += @top_srcdir@/module/zcommon/zfs_prop.o ++$(MODULE)-objs += @top_srcdir@/module/zcommon/zprop_common.o ++$(MODULE)-objs += @top_srcdir@/module/zcommon/zfs_namecheck.o ++$(MODULE)-objs += @top_srcdir@/module/zcommon/zfs_comutil.o ++$(MODULE)-objs += @top_srcdir@/module/zcommon/zfs_fletcher.o ++$(MODULE)-objs += @top_srcdir@/module/zcommon/zfs_uio.o ++$(MODULE)-objs += @top_srcdir@/module/zcommon/zpool_prop.o +diff -uNr linux-3.2.33-go.orig/fs/zfs/zcommon/zfs_comutil.c linux-3.2.33-go/fs/zfs/zcommon/zfs_comutil.c +--- linux-3.2.33-go.orig/fs/zfs/zcommon/zfs_comutil.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zcommon/zfs_comutil.c 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,210 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* ++ * This file is intended for functions that ought to be common between user ++ * land (libzfs) and the kernel. When many common routines need to be shared ++ * then a separate file should to be created. 
++ */ ++ ++#if defined(_KERNEL) ++#include ++#else ++#include ++#endif ++ ++#include ++#include ++#include ++#include ++#include "zfs_comutil.h" ++ ++/* ++ * Are there allocatable vdevs? ++ */ ++boolean_t ++zfs_allocatable_devs(nvlist_t *nv) ++{ ++ uint64_t is_log; ++ uint_t c; ++ nvlist_t **child; ++ uint_t children; ++ ++ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, ++ &child, &children) != 0) { ++ return (B_FALSE); ++ } ++ for (c = 0; c < children; c++) { ++ is_log = 0; ++ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, ++ &is_log); ++ if (!is_log) ++ return (B_TRUE); ++ } ++ return (B_FALSE); ++} ++ ++void ++zpool_get_rewind_policy(nvlist_t *nvl, zpool_rewind_policy_t *zrpp) ++{ ++ nvlist_t *policy; ++ nvpair_t *elem; ++ char *nm; ++ ++ /* Defaults */ ++ zrpp->zrp_request = ZPOOL_NO_REWIND; ++ zrpp->zrp_maxmeta = 0; ++ zrpp->zrp_maxdata = UINT64_MAX; ++ zrpp->zrp_txg = UINT64_MAX; ++ ++ if (nvl == NULL) ++ return; ++ ++ elem = NULL; ++ while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { ++ nm = nvpair_name(elem); ++ if (strcmp(nm, ZPOOL_REWIND_POLICY) == 0) { ++ if (nvpair_value_nvlist(elem, &policy) == 0) ++ zpool_get_rewind_policy(policy, zrpp); ++ return; ++ } else if (strcmp(nm, ZPOOL_REWIND_REQUEST) == 0) { ++ if (nvpair_value_uint32(elem, &zrpp->zrp_request) == 0) ++ if (zrpp->zrp_request & ~ZPOOL_REWIND_POLICIES) ++ zrpp->zrp_request = ZPOOL_NO_REWIND; ++ } else if (strcmp(nm, ZPOOL_REWIND_REQUEST_TXG) == 0) { ++ (void) nvpair_value_uint64(elem, &zrpp->zrp_txg); ++ } else if (strcmp(nm, ZPOOL_REWIND_META_THRESH) == 0) { ++ (void) nvpair_value_uint64(elem, &zrpp->zrp_maxmeta); ++ } else if (strcmp(nm, ZPOOL_REWIND_DATA_THRESH) == 0) { ++ (void) nvpair_value_uint64(elem, &zrpp->zrp_maxdata); ++ } ++ } ++ if (zrpp->zrp_request == 0) ++ zrpp->zrp_request = ZPOOL_NO_REWIND; ++} ++ ++typedef struct zfs_version_spa_map { ++ int version_zpl; ++ int version_spa; ++} zfs_version_spa_map_t; ++ ++/* ++ * Keep this table in monotonically increasing version number order. ++ */ ++static zfs_version_spa_map_t zfs_version_table[] = { ++ {ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL}, ++ {ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL}, ++ {ZPL_VERSION_FUID, SPA_VERSION_FUID}, ++ {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE}, ++ {ZPL_VERSION_SA, SPA_VERSION_SA}, ++ {0, 0} ++}; ++ ++/* ++ * Return the max zpl version for a corresponding spa version ++ * -1 is returned if no mapping exists. ++ */ ++int ++zfs_zpl_version_map(int spa_version) ++{ ++ int i; ++ int version = -1; ++ ++ for (i = 0; zfs_version_table[i].version_spa; i++) { ++ if (spa_version >= zfs_version_table[i].version_spa) ++ version = zfs_version_table[i].version_zpl; ++ } ++ ++ return (version); ++} ++ ++/* ++ * Return the min spa version for a corresponding spa version ++ * -1 is returned if no mapping exists. 
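++ * (That is, given a ZPL version, the lowest SPA version that supports
++ * it according to zfs_version_table above; for example ZPL_VERSION_SA
++ * maps to SPA_VERSION_SA.)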
++ */ ++int ++zfs_spa_version_map(int zpl_version) ++{ ++ int i; ++ int version = -1; ++ ++ for (i = 0; zfs_version_table[i].version_zpl; i++) { ++ if (zfs_version_table[i].version_zpl >= zpl_version) ++ return (zfs_version_table[i].version_spa); ++ } ++ ++ return (version); ++} ++ ++const char *zfs_history_event_names[LOG_END] = { ++ "invalid event", ++ "pool create", ++ "vdev add", ++ "pool remove", ++ "pool destroy", ++ "pool export", ++ "pool import", ++ "vdev attach", ++ "vdev replace", ++ "vdev detach", ++ "vdev online", ++ "vdev offline", ++ "vdev upgrade", ++ "pool clear", ++ "pool scrub", ++ "pool property set", ++ "create", ++ "clone", ++ "destroy", ++ "destroy_begin_sync", ++ "inherit", ++ "property set", ++ "quota set", ++ "permission update", ++ "permission remove", ++ "permission who remove", ++ "promote", ++ "receive", ++ "rename", ++ "reservation set", ++ "replay_inc_sync", ++ "replay_full_sync", ++ "rollback", ++ "snapshot", ++ "filesystem version upgrade", ++ "refquota set", ++ "refreservation set", ++ "pool scrub done", ++ "user hold", ++ "user release", ++ "pool split", ++}; ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(zfs_allocatable_devs); ++EXPORT_SYMBOL(zpool_get_rewind_policy); ++EXPORT_SYMBOL(zfs_zpl_version_map); ++EXPORT_SYMBOL(zfs_spa_version_map); ++EXPORT_SYMBOL(zfs_history_event_names); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zcommon/zfs_deleg.c linux-3.2.33-go/fs/zfs/zcommon/zfs_deleg.c +--- linux-3.2.33-go.orig/fs/zfs/zcommon/zfs_deleg.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zcommon/zfs_deleg.c 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,244 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright 2010 Nexenta Systems, Inc. All rights reserved. ++ */ ++ ++#if defined(_KERNEL) ++#include ++#include ++#include ++#else ++#include ++#include ++#include ++#include ++#include ++#endif ++/* XXX includes zfs_context.h, so why bother with the above? 
*/ ++#include ++#include "zfs_prop.h" ++#include "zfs_deleg.h" ++#include "zfs_namecheck.h" ++ ++/* ++ * permission table ++ * ++ * Keep this table in sorted order ++ * ++ * This table is used for displaying all permissions for ++ * zfs allow ++ */ ++ ++zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { ++ {ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW}, ++ {ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, ++ {ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, ++ {ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, ++ {ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, ++ {ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, ++ {ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, ++ {ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, ++ {ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, ++ {ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, ++ {ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, ++ {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, ++ {ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, ++ {ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, ++ {ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, ++ {ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, ++ {ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, ++ {ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, ++ {ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, ++ {ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF}, ++ {NULL, ZFS_DELEG_NOTE_NONE } ++}; ++ ++static int ++zfs_valid_permission_name(const char *perm) ++{ ++ if (zfs_deleg_canonicalize_perm(perm)) ++ return (0); ++ ++ return (permset_namecheck(perm, NULL, NULL)); ++} ++ ++const char * ++zfs_deleg_canonicalize_perm(const char *perm) ++{ ++ int i; ++ zfs_prop_t prop; ++ ++ for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) { ++ if (strcmp(perm, zfs_deleg_perm_tab[i].z_perm) == 0) ++ return (perm); ++ } ++ ++ prop = zfs_name_to_prop(perm); ++ if (prop != ZPROP_INVAL && zfs_prop_delegatable(prop)) ++ return (zfs_prop_to_name(prop)); ++ return (NULL); ++ ++} ++ ++static int ++zfs_validate_who(char *who) ++{ ++ char *p; ++ ++ if (who[2] != ZFS_DELEG_FIELD_SEP_CHR) ++ return (-1); ++ ++ switch (who[0]) { ++ case ZFS_DELEG_USER: ++ case ZFS_DELEG_GROUP: ++ case ZFS_DELEG_USER_SETS: ++ case ZFS_DELEG_GROUP_SETS: ++ if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT) ++ return (-1); ++ for (p = &who[3]; *p; p++) ++ if (!isdigit(*p)) ++ return (-1); ++ break; ++ ++ case ZFS_DELEG_NAMED_SET: ++ case ZFS_DELEG_NAMED_SET_SETS: ++ if (who[1] != ZFS_DELEG_NA) ++ return (-1); ++ return (permset_namecheck(&who[3], NULL, NULL)); ++ ++ case ZFS_DELEG_CREATE: ++ case ZFS_DELEG_CREATE_SETS: ++ if (who[1] != ZFS_DELEG_NA) ++ return (-1); ++ if (who[3] != '\0') ++ return (-1); ++ break; ++ ++ case ZFS_DELEG_EVERYONE: ++ case ZFS_DELEG_EVERYONE_SETS: ++ if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT) ++ return (-1); ++ if (who[3] != '\0') ++ return (-1); ++ break; ++ ++ default: ++ return (-1); ++ } ++ ++ return (0); ++} ++ ++int ++zfs_deleg_verify_nvlist(nvlist_t *nvp) ++{ ++ nvpair_t *who, *perm_name; ++ nvlist_t *perms; ++ int error; ++ ++ if (nvp == NULL) ++ return (-1); ++ ++ who = nvlist_next_nvpair(nvp, NULL); ++ if (who == NULL) ++ return (-1); ++ ++ do { ++ if (zfs_validate_who(nvpair_name(who))) ++ return (-1); ++ ++ error = nvlist_lookup_nvlist(nvp, nvpair_name(who), &perms); ++ ++ if (error && error != ENOENT) ++ return (-1); ++ if (error == ENOENT) ++ continue; ++ ++ perm_name = nvlist_next_nvpair(perms, NULL); ++ if (perm_name == NULL) { ++ return (-1); ++ } ++ do { ++ error = 
zfs_valid_permission_name( ++ nvpair_name(perm_name)); ++ if (error) ++ return (-1); ++ } while ((perm_name = nvlist_next_nvpair(perms, perm_name))); ++ } while ((who = nvlist_next_nvpair(nvp, who))); ++ return (0); ++} ++ ++/* ++ * Construct the base attribute name. The base attribute names ++ * are the "key" to locate the jump objects which contain the actual ++ * permissions. The base attribute names are encoded based on ++ * type of entry and whether it is a local or descendent permission. ++ * ++ * Arguments: ++ * attr - attribute name return string, attribute is assumed to be ++ * ZFS_MAX_DELEG_NAME long. ++ * type - type of entry to construct ++ * inheritchr - inheritance type (local,descendent, or NA for create and ++ * permission set definitions ++ * data - is either a permission set name or a 64 bit uid/gid. ++ */ ++void ++zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type, ++ char inheritchr, void *data) ++{ ++ int len = ZFS_MAX_DELEG_NAME; ++ uint64_t *id = data; ++ ++ switch (type) { ++ case ZFS_DELEG_USER: ++ case ZFS_DELEG_GROUP: ++ case ZFS_DELEG_USER_SETS: ++ case ZFS_DELEG_GROUP_SETS: ++ (void) snprintf(attr, len, "%c%c%c%lld", type, inheritchr, ++ ZFS_DELEG_FIELD_SEP_CHR, (longlong_t)*id); ++ break; ++ case ZFS_DELEG_NAMED_SET_SETS: ++ case ZFS_DELEG_NAMED_SET: ++ (void) snprintf(attr, len, "%c-%c%s", type, ++ ZFS_DELEG_FIELD_SEP_CHR, (char *)data); ++ break; ++ case ZFS_DELEG_CREATE: ++ case ZFS_DELEG_CREATE_SETS: ++ (void) snprintf(attr, len, "%c-%c", type, ++ ZFS_DELEG_FIELD_SEP_CHR); ++ break; ++ case ZFS_DELEG_EVERYONE: ++ case ZFS_DELEG_EVERYONE_SETS: ++ (void) snprintf(attr, len, "%c%c%c", type, inheritchr, ++ ZFS_DELEG_FIELD_SEP_CHR); ++ break; ++ default: ++ ASSERT(!"bad zfs_deleg_who_type_t"); ++ } ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(zfs_deleg_verify_nvlist); ++EXPORT_SYMBOL(zfs_deleg_whokey); ++EXPORT_SYMBOL(zfs_deleg_canonicalize_perm); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zcommon/zfs_fletcher.c linux-3.2.33-go/fs/zfs/zcommon/zfs_fletcher.c +--- linux-3.2.33-go.orig/fs/zfs/zcommon/zfs_fletcher.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zcommon/zfs_fletcher.c 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,255 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */ ++ ++/* ++ * Fletcher Checksums ++ * ------------------ ++ * ++ * ZFS's 2nd and 4th order Fletcher checksums are defined by the following ++ * recurrence relations: ++ * ++ * a = a + f ++ * i i-1 i-1 ++ * ++ * b = b + a ++ * i i-1 i ++ * ++ * c = c + b (fletcher-4 only) ++ * i i-1 i ++ * ++ * d = d + c (fletcher-4 only) ++ * i i-1 i ++ * ++ * Where ++ * a_0 = b_0 = c_0 = d_0 = 0 ++ * and ++ * f_0 .. f_(n-1) are the input data. ++ * ++ * Using standard techniques, these translate into the following series: ++ * ++ * __n_ __n_ ++ * \ | \ | ++ * a = > f b = > i * f ++ * n /___| n - i n /___| n - i ++ * i = 1 i = 1 ++ * ++ * ++ * __n_ __n_ ++ * \ | i*(i+1) \ | i*(i+1)*(i+2) ++ * c = > ------- f d = > ------------- f ++ * n /___| 2 n - i n /___| 6 n - i ++ * i = 1 i = 1 ++ * ++ * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators. ++ * Since the additions are done mod (2^64), errors in the high bits may not ++ * be noticed. For this reason, fletcher-2 is deprecated. ++ * ++ * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators. ++ * A conservative estimate of how big the buffer can get before we overflow ++ * can be estimated using f_i = 0xffffffff for all i: ++ * ++ * % bc ++ * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4 ++ * 2264 ++ * quit ++ * % ++ * ++ * So blocks of up to 2k will not overflow. Our largest block size is ++ * 128k, which has 32k 4-byte words, so we can compute the largest possible ++ * accumulators, then divide by 2^64 to figure the max amount of overflow: ++ * ++ * % bc ++ * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c } ++ * a/2^64;b/2^64;c/2^64;d/2^64 ++ * 0 ++ * 0 ++ * 1365 ++ * 11186858 ++ * quit ++ * % ++ * ++ * So a and b cannot overflow. To make sure each bit of input has some ++ * effect on the contents of c and d, we can look at what the factors of ++ * the coefficients in the equations for c_n and d_n are. The number of 2s ++ * in the factors determines the lowest set bit in the multiplier. Running ++ * through the cases for n*(n+1)/2 reveals that the highest power of 2 is ++ * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow ++ * the 64-bit accumulators, every bit of every f_i effects every accumulator, ++ * even for 128k blocks. ++ * ++ * If we wanted to make a stronger version of fletcher4 (fletcher4c?), ++ * we could do our calculations mod (2^32 - 1) by adding in the carries ++ * periodically, and store the number of carries in the top 32-bits. ++ * ++ * -------------------- ++ * Checksum Performance ++ * -------------------- ++ * ++ * There are two interesting components to checksum performance: cached and ++ * uncached performance. With cached data, fletcher-2 is about four times ++ * faster than fletcher-4. With uncached data, the performance difference is ++ * negligible, since the cost of a cache fill dominates the processing time. ++ * Even though fletcher-4 is slower than fletcher-2, it is still a pretty ++ * efficient pass over the data. ++ * ++ * In normal operation, the data which is being checksummed is in a buffer ++ * which has been filled either by: ++ * ++ * 1. a compression step, which will be mostly cached, or ++ * 2. a bcopy() or copyin(), which will be uncached (because the ++ * copy is cache-bypassing). ++ * ++ * For both cached and uncached data, both fletcher checksums are much faster ++ * than sha-256, and slower than 'off', which doesn't touch the data at all. 
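++ *
++ * As a concrete check of the series above: after three input words
++ * f_0, f_1, f_2 the recurrences give
++ *
++ *     a_3 = f_0 + f_1 + f_2
++ *     b_3 = 3*f_0 + 2*f_1 + f_2
++ *
++ * so b (and, one order further, c and d) weight earlier words more
++ * heavily, which is what makes the higher-order sums sensitive to
++ * word position and not just word values.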
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++void ++fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) ++{ ++ const uint64_t *ip = buf; ++ const uint64_t *ipend = ip + (size / sizeof (uint64_t)); ++ uint64_t a0, b0, a1, b1; ++ ++ for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { ++ a0 += ip[0]; ++ a1 += ip[1]; ++ b0 += a0; ++ b1 += a1; ++ } ++ ++ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); ++} ++ ++void ++fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) ++{ ++ const uint64_t *ip = buf; ++ const uint64_t *ipend = ip + (size / sizeof (uint64_t)); ++ uint64_t a0, b0, a1, b1; ++ ++ for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { ++ a0 += BSWAP_64(ip[0]); ++ a1 += BSWAP_64(ip[1]); ++ b0 += a0; ++ b1 += a1; ++ } ++ ++ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); ++} ++ ++void ++fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) ++{ ++ const uint32_t *ip = buf; ++ const uint32_t *ipend = ip + (size / sizeof (uint32_t)); ++ uint64_t a, b, c, d; ++ ++ for (a = b = c = d = 0; ip < ipend; ip++) { ++ a += ip[0]; ++ b += a; ++ c += b; ++ d += c; ++ } ++ ++ ZIO_SET_CHECKSUM(zcp, a, b, c, d); ++} ++ ++void ++fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) ++{ ++ const uint32_t *ip = buf; ++ const uint32_t *ipend = ip + (size / sizeof (uint32_t)); ++ uint64_t a, b, c, d; ++ ++ for (a = b = c = d = 0; ip < ipend; ip++) { ++ a += BSWAP_32(ip[0]); ++ b += a; ++ c += b; ++ d += c; ++ } ++ ++ ZIO_SET_CHECKSUM(zcp, a, b, c, d); ++} ++ ++void ++fletcher_4_incremental_native(const void *buf, uint64_t size, ++ zio_cksum_t *zcp) ++{ ++ const uint32_t *ip = buf; ++ const uint32_t *ipend = ip + (size / sizeof (uint32_t)); ++ uint64_t a, b, c, d; ++ ++ a = zcp->zc_word[0]; ++ b = zcp->zc_word[1]; ++ c = zcp->zc_word[2]; ++ d = zcp->zc_word[3]; ++ ++ for (; ip < ipend; ip++) { ++ a += ip[0]; ++ b += a; ++ c += b; ++ d += c; ++ } ++ ++ ZIO_SET_CHECKSUM(zcp, a, b, c, d); ++} ++ ++void ++fletcher_4_incremental_byteswap(const void *buf, uint64_t size, ++ zio_cksum_t *zcp) ++{ ++ const uint32_t *ip = buf; ++ const uint32_t *ipend = ip + (size / sizeof (uint32_t)); ++ uint64_t a, b, c, d; ++ ++ a = zcp->zc_word[0]; ++ b = zcp->zc_word[1]; ++ c = zcp->zc_word[2]; ++ d = zcp->zc_word[3]; ++ ++ for (; ip < ipend; ip++) { ++ a += BSWAP_32(ip[0]); ++ b += a; ++ c += b; ++ d += c; ++ } ++ ++ ZIO_SET_CHECKSUM(zcp, a, b, c, d); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(fletcher_2_native); ++EXPORT_SYMBOL(fletcher_2_byteswap); ++EXPORT_SYMBOL(fletcher_4_native); ++EXPORT_SYMBOL(fletcher_4_byteswap); ++EXPORT_SYMBOL(fletcher_4_incremental_native); ++EXPORT_SYMBOL(fletcher_4_incremental_byteswap); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zcommon/zfs_namecheck.c linux-3.2.33-go/fs/zfs/zcommon/zfs_namecheck.c +--- linux-3.2.33-go.orig/fs/zfs/zcommon/zfs_namecheck.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zcommon/zfs_namecheck.c 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,378 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. 
++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++/* ++ * Common name validation routines for ZFS. These routines are shared by the ++ * userland code as well as the ioctl() layer to ensure that we don't ++ * inadvertently expose a hole through direct ioctl()s that never gets tested. ++ * In userland, however, we want significantly more information about _why_ the ++ * name is invalid. In the kernel, we only care whether it's valid or not. ++ * Each routine therefore takes a 'namecheck_err_t' which describes exactly why ++ * the name failed to validate. ++ * ++ * Each function returns 0 on success, -1 on error. ++ */ ++ ++#if defined(_KERNEL) ++#include ++#else ++#include ++#endif ++ ++#include ++#include ++#include "zfs_namecheck.h" ++#include "zfs_deleg.h" ++ ++static int ++valid_char(char c) ++{ ++ return ((c >= 'a' && c <= 'z') || ++ (c >= 'A' && c <= 'Z') || ++ (c >= '0' && c <= '9') || ++ c == '-' || c == '_' || c == '.' || c == ':' || c == ' '); ++} ++ ++/* ++ * Snapshot names must be made up of alphanumeric characters plus the following ++ * characters: ++ * ++ * [-_.: ] ++ */ ++int ++snapshot_namecheck(const char *path, namecheck_err_t *why, char *what) ++{ ++ const char *loc; ++ ++ if (strlen(path) >= MAXNAMELEN) { ++ if (why) ++ *why = NAME_ERR_TOOLONG; ++ return (-1); ++ } ++ ++ if (path[0] == '\0') { ++ if (why) ++ *why = NAME_ERR_EMPTY_COMPONENT; ++ return (-1); ++ } ++ ++ for (loc = path; *loc; loc++) { ++ if (!valid_char(*loc)) { ++ if (why) { ++ *why = NAME_ERR_INVALCHAR; ++ *what = *loc; ++ } ++ return (-1); ++ } ++ } ++ return (0); ++} ++ ++ ++/* ++ * Permissions set name must start with the letter '@' followed by the ++ * same character restrictions as snapshot names, except that the name ++ * cannot exceed 64 characters. ++ */ ++int ++permset_namecheck(const char *path, namecheck_err_t *why, char *what) ++{ ++ if (strlen(path) >= ZFS_PERMSET_MAXLEN) { ++ if (why) ++ *why = NAME_ERR_TOOLONG; ++ return (-1); ++ } ++ ++ if (path[0] != '@') { ++ if (why) { ++ *why = NAME_ERR_NO_AT; ++ *what = path[0]; ++ } ++ return (-1); ++ } ++ ++ return (snapshot_namecheck(&path[1], why, what)); ++} ++ ++/* ++ * Dataset names must be of the following form: ++ * ++ * [component][/]*[component][@component] ++ * ++ * Where each component is made up of alphanumeric characters plus the following ++ * characters: ++ * ++ * [-_.:%] ++ * ++ * We allow '%' here as we use that character internally to create unique ++ * names for temporary clones (for online recv). ++ */ ++int ++dataset_namecheck(const char *path, namecheck_err_t *why, char *what) ++{ ++ const char *loc, *end; ++ int found_snapshot; ++ ++ /* ++ * Make sure the name is not too long. ++ * ++ * ZFS_MAXNAMELEN is the maximum dataset length used in the userland ++ * which is the same as MAXNAMELEN used in the kernel. ++ * If ZFS_MAXNAMELEN value is changed, make sure to cleanup all ++ * places using MAXNAMELEN. ++ * ++ * When HAVE_KOBJ_NAME_LEN is defined the maximum safe kobject name ++ * length is 20 bytes. 
This 20 bytes is broken down as follows to ++ * provide a maximum safe /[@snapshot] length of only ++ * 18 bytes. To ensure bytes are left for [@snapshot] the ++ * portition is futher limited to 9 bytes. For 2.6.27 and ++ * newer kernels this limit is set to MAXNAMELEN. ++ * ++ * / + + ++ * (18) + (1) + (1) ++ */ ++#ifdef HAVE_KOBJ_NAME_LEN ++ if (strlen(path) > 18) { ++#else ++ if (strlen(path) >= MAXNAMELEN) { ++#endif /* HAVE_KOBJ_NAME_LEN */ ++ if (why) ++ *why = NAME_ERR_TOOLONG; ++ return (-1); ++ } ++ ++ /* Explicitly check for a leading slash. */ ++ if (path[0] == '/') { ++ if (why) ++ *why = NAME_ERR_LEADING_SLASH; ++ return (-1); ++ } ++ ++ if (path[0] == '\0') { ++ if (why) ++ *why = NAME_ERR_EMPTY_COMPONENT; ++ return (-1); ++ } ++ ++ loc = path; ++ found_snapshot = 0; ++ for (;;) { ++ /* Find the end of this component */ ++ end = loc; ++ while (*end != '/' && *end != '@' && *end != '\0') ++ end++; ++ ++ if (*end == '\0' && end[-1] == '/') { ++ /* trailing slashes are not allowed */ ++ if (why) ++ *why = NAME_ERR_TRAILING_SLASH; ++ return (-1); ++ } ++ ++ /* Zero-length components are not allowed */ ++ if (loc == end) { ++ if (why) { ++ /* ++ * Make sure this is really a zero-length ++ * component and not a '@@'. ++ */ ++ if (*end == '@' && found_snapshot) { ++ *why = NAME_ERR_MULTIPLE_AT; ++ } else { ++ *why = NAME_ERR_EMPTY_COMPONENT; ++ } ++ } ++ ++ return (-1); ++ } ++ ++ /* Validate the contents of this component */ ++ while (loc != end) { ++ if (!valid_char(*loc) && *loc != '%') { ++ if (why) { ++ *why = NAME_ERR_INVALCHAR; ++ *what = *loc; ++ } ++ return (-1); ++ } ++ loc++; ++ } ++ ++ /* If we've reached the end of the string, we're OK */ ++ if (*end == '\0') ++ return (0); ++ ++ if (*end == '@') { ++ /* ++ * If we've found an @ symbol, indicate that we're in ++ * the snapshot component, and report a second '@' ++ * character as an error. ++ */ ++ if (found_snapshot) { ++ if (why) ++ *why = NAME_ERR_MULTIPLE_AT; ++ return (-1); ++ } ++ ++ found_snapshot = 1; ++ } ++ ++ /* ++ * If there is a '/' in a snapshot name ++ * then report an error ++ */ ++ if (*end == '/' && found_snapshot) { ++ if (why) ++ *why = NAME_ERR_TRAILING_SLASH; ++ return (-1); ++ } ++ ++ /* Update to the next component */ ++ loc = end + 1; ++ } ++} ++ ++ ++/* ++ * mountpoint names must be of the following form: ++ * ++ * /[component][/]*[component][/] ++ */ ++int ++mountpoint_namecheck(const char *path, namecheck_err_t *why) ++{ ++ const char *start, *end; ++ ++ /* ++ * Make sure none of the mountpoint component names are too long. ++ * If a component name is too long then the mkdir of the mountpoint ++ * will fail but then the mountpoint property will be set to a value ++ * that can never be mounted. Better to fail before setting the prop. ++ * Extra slashes are OK, they will be tossed by the mountpoint mkdir. ++ */ ++ ++ if (path == NULL || *path != '/') { ++ if (why) ++ *why = NAME_ERR_LEADING_SLASH; ++ return (-1); ++ } ++ ++ /* Skip leading slash */ ++ start = &path[1]; ++ do { ++ end = start; ++ while (*end != '/' && *end != '\0') ++ end++; ++ ++ if (end - start >= MAXNAMELEN) { ++ if (why) ++ *why = NAME_ERR_TOOLONG; ++ return (-1); ++ } ++ start = end + 1; ++ ++ } while (*end != '\0'); ++ ++ return (0); ++} ++ ++/* ++ * For pool names, we have the same set of valid characters as described in ++ * dataset names, with the additional restriction that the pool name must begin ++ * with a letter. The pool names 'raidz' and 'mirror' are also reserved names ++ * that cannot be used. 
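++ *
++ * For example (per the checks below): "tank" and "data_01" pass,
++ * while "1pool" fails for not beginning with a letter, "mirror" is
++ * rejected as reserved, and "c1t0d0" is rejected for looking like a
++ * disk device name.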
++ */ ++int ++pool_namecheck(const char *pool, namecheck_err_t *why, char *what) ++{ ++ const char *c; ++ ++ /* ++ * Make sure the name is not too long. ++ * ++ * ZPOOL_MAXNAMELEN is the maximum pool length used in the userland ++ * which is the same as MAXNAMELEN used in the kernel. ++ * If ZPOOL_MAXNAMELEN value is changed, make sure to cleanup all ++ * places using MAXNAMELEN. ++ * ++ * When HAVE_KOBJ_NAME_LEN is defined the maximum safe kobject name ++ * length is 20 bytes. This 20 bytes is broken down as follows to ++ * provide a maximum safe /[@snapshot] length of only ++ * 18 bytes. To ensure bytes are left for [@snapshot] the ++ * portition is futher limited to 8 bytes. For 2.6.27 and ++ * newer kernels this limit is set to MAXNAMELEN. ++ * ++ * / + + ++ * (18) + (1) + (1) ++ */ ++#ifdef HAVE_KOBJ_NAME_LEN ++ if (strlen(pool) > 8) { ++#else ++ if (strlen(pool) >= MAXNAMELEN) { ++#endif /* HAVE_KOBJ_NAME_LEN */ ++ if (why) ++ *why = NAME_ERR_TOOLONG; ++ return (-1); ++ } ++ ++ c = pool; ++ while (*c != '\0') { ++ if (!valid_char(*c)) { ++ if (why) { ++ *why = NAME_ERR_INVALCHAR; ++ *what = *c; ++ } ++ return (-1); ++ } ++ c++; ++ } ++ ++ if (!(*pool >= 'a' && *pool <= 'z') && ++ !(*pool >= 'A' && *pool <= 'Z')) { ++ if (why) ++ *why = NAME_ERR_NOLETTER; ++ return (-1); ++ } ++ ++ if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { ++ if (why) ++ *why = NAME_ERR_RESERVED; ++ return (-1); ++ } ++ ++ if (pool[0] == 'c' && (pool[1] >= '0' && pool[1] <= '9')) { ++ if (why) ++ *why = NAME_ERR_DISKLIKE; ++ return (-1); ++ } ++ ++ return (0); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(snapshot_namecheck); ++EXPORT_SYMBOL(pool_namecheck); ++EXPORT_SYMBOL(dataset_namecheck); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zcommon/zfs_prop.c linux-3.2.33-go/fs/zfs/zcommon/zfs_prop.c +--- linux-3.2.33-go.orig/fs/zfs/zcommon/zfs_prop.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zcommon/zfs_prop.c 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,657 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. 
++ */ ++ ++/* Portions Copyright 2010 Robert Milkowski */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "zfs_prop.h" ++#include "zfs_deleg.h" ++ ++#if defined(_KERNEL) ++#include ++#else ++#include ++#include ++#include ++#endif ++ ++static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS]; ++ ++/* Note this is indexed by zfs_userquota_prop_t, keep the order the same */ ++const char *zfs_userquota_prop_prefixes[] = { ++ "userused@", ++ "userquota@", ++ "groupused@", ++ "groupquota@" ++}; ++ ++zprop_desc_t * ++zfs_prop_get_table(void) ++{ ++ return (zfs_prop_table); ++} ++ ++void ++zfs_prop_init(void) ++{ ++ static zprop_index_t checksum_table[] = { ++ { "on", ZIO_CHECKSUM_ON }, ++ { "off", ZIO_CHECKSUM_OFF }, ++ { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 }, ++ { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 }, ++ { "sha256", ZIO_CHECKSUM_SHA256 }, ++ { NULL } ++ }; ++ ++ static zprop_index_t dedup_table[] = { ++ { "on", ZIO_CHECKSUM_ON }, ++ { "off", ZIO_CHECKSUM_OFF }, ++ { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY }, ++ { "sha256", ZIO_CHECKSUM_SHA256 }, ++ { "sha256,verify", ++ ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY }, ++ { NULL } ++ }; ++ ++ static zprop_index_t compress_table[] = { ++ { "on", ZIO_COMPRESS_ON }, ++ { "off", ZIO_COMPRESS_OFF }, ++ { "lzjb", ZIO_COMPRESS_LZJB }, ++ { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */ ++ { "gzip-1", ZIO_COMPRESS_GZIP_1 }, ++ { "gzip-2", ZIO_COMPRESS_GZIP_2 }, ++ { "gzip-3", ZIO_COMPRESS_GZIP_3 }, ++ { "gzip-4", ZIO_COMPRESS_GZIP_4 }, ++ { "gzip-5", ZIO_COMPRESS_GZIP_5 }, ++ { "gzip-6", ZIO_COMPRESS_GZIP_6 }, ++ { "gzip-7", ZIO_COMPRESS_GZIP_7 }, ++ { "gzip-8", ZIO_COMPRESS_GZIP_8 }, ++ { "gzip-9", ZIO_COMPRESS_GZIP_9 }, ++ { "zle", ZIO_COMPRESS_ZLE }, ++ { NULL } ++ }; ++ ++ static zprop_index_t snapdir_table[] = { ++ { "hidden", ZFS_SNAPDIR_HIDDEN }, ++ { "visible", ZFS_SNAPDIR_VISIBLE }, ++ { NULL } ++ }; ++ ++ static zprop_index_t acl_inherit_table[] = { ++ { "discard", ZFS_ACL_DISCARD }, ++ { "noallow", ZFS_ACL_NOALLOW }, ++ { "restricted", ZFS_ACL_RESTRICTED }, ++ { "passthrough", ZFS_ACL_PASSTHROUGH }, ++ { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatability */ ++ { "passthrough-x", ZFS_ACL_PASSTHROUGH_X }, ++ { NULL } ++ }; ++ ++ static zprop_index_t case_table[] = { ++ { "sensitive", ZFS_CASE_SENSITIVE }, ++ { "insensitive", ZFS_CASE_INSENSITIVE }, ++ { "mixed", ZFS_CASE_MIXED }, ++ { NULL } ++ }; ++ ++ static zprop_index_t copies_table[] = { ++ { "1", 1 }, ++ { "2", 2 }, ++ { "3", 3 }, ++ { NULL } ++ }; ++ ++ /* ++ * Use the unique flags we have to send to u8_strcmp() and/or ++ * u8_textprep() to represent the various normalization property ++ * values. 
++ */ ++ static zprop_index_t normalize_table[] = { ++ { "none", 0 }, ++ { "formD", U8_TEXTPREP_NFD }, ++ { "formKC", U8_TEXTPREP_NFKC }, ++ { "formC", U8_TEXTPREP_NFC }, ++ { "formKD", U8_TEXTPREP_NFKD }, ++ { NULL } ++ }; ++ ++ static zprop_index_t version_table[] = { ++ { "1", 1 }, ++ { "2", 2 }, ++ { "3", 3 }, ++ { "4", 4 }, ++ { "5", 5 }, ++ { "current", ZPL_VERSION }, ++ { NULL } ++ }; ++ ++ static zprop_index_t boolean_table[] = { ++ { "off", 0 }, ++ { "on", 1 }, ++ { NULL } ++ }; ++ ++ static zprop_index_t logbias_table[] = { ++ { "latency", ZFS_LOGBIAS_LATENCY }, ++ { "throughput", ZFS_LOGBIAS_THROUGHPUT }, ++ { NULL } ++ }; ++ ++ static zprop_index_t canmount_table[] = { ++ { "off", ZFS_CANMOUNT_OFF }, ++ { "on", ZFS_CANMOUNT_ON }, ++ { "noauto", ZFS_CANMOUNT_NOAUTO }, ++ { NULL } ++ }; ++ ++ static zprop_index_t cache_table[] = { ++ { "none", ZFS_CACHE_NONE }, ++ { "metadata", ZFS_CACHE_METADATA }, ++ { "all", ZFS_CACHE_ALL }, ++ { NULL } ++ }; ++ ++ static zprop_index_t sync_table[] = { ++ { "standard", ZFS_SYNC_STANDARD }, ++ { "always", ZFS_SYNC_ALWAYS }, ++ { "disabled", ZFS_SYNC_DISABLED }, ++ { NULL } ++ }; ++ ++ static zprop_index_t xattr_table[] = { ++ { "off", ZFS_XATTR_OFF }, ++ { "on", ZFS_XATTR_DIR }, ++ { "sa", ZFS_XATTR_SA }, ++ { "dir", ZFS_XATTR_DIR }, ++ { NULL } ++ }; ++ ++ /* inherit index properties */ ++ zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD, ++ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, ++ "standard | always | disabled", "SYNC", ++ sync_table); ++ zprop_register_index(ZFS_PROP_CHECKSUM, "checksum", ++ ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ++ ZFS_TYPE_VOLUME, ++ "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM", ++ checksum_table); ++ zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, ++ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, ++ "on | off | verify | sha256[,verify]", "DEDUP", ++ dedup_table); ++ zprop_register_index(ZFS_PROP_COMPRESSION, "compression", ++ ZIO_COMPRESS_DEFAULT, PROP_INHERIT, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, ++ "on | off | lzjb | gzip | gzip-[1-9] | zle", "COMPRESS", ++ compress_table); ++ zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, ++ PROP_INHERIT, ZFS_TYPE_FILESYSTEM, ++ "hidden | visible", "SNAPDIR", snapdir_table); ++ zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ++ ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, ++ "discard | noallow | restricted | passthrough | passthrough-x", ++ "ACLINHERIT", acl_inherit_table); ++ zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, ++ "1 | 2 | 3", "COPIES", copies_table); ++ zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache", ++ ZFS_CACHE_ALL, PROP_INHERIT, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, ++ "all | none | metadata", "PRIMARYCACHE", cache_table); ++ zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache", ++ ZFS_CACHE_ALL, PROP_INHERIT, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, ++ "all | none | metadata", "SECONDARYCACHE", cache_table); ++ zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, ++ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, ++ "latency | throughput", "LOGBIAS", logbias_table); ++ zprop_register_index(ZFS_PROP_XATTR, "xattr", ZFS_XATTR_DIR, ++ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, ++ "on | off | dir | sa", "XATTR", xattr_table); ++ ++ /* inherit index (boolean) properties */ ++ 
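The zprop_index_t tables above exist so that index-typed properties can be translated between their user-visible strings and the numeric values stored on disk. Below is a small stand-alone sketch of that lookup; the struct shape mirrors the initializers above, but the table contents, numeric values, and helper name are illustrative only.

        #include <stdio.h>
        #include <stdint.h>
        #include <string.h>

        /* Shape of one table entry, mirroring the zprop_index_t initializers above. */
        typedef struct {
                const char      *pi_name;
                uint64_t        pi_value;
        } sketch_index_t;

        static const sketch_index_t sketch_sync_table[] = {
                { "standard",   0 },    /* values are illustrative placeholders */
                { "always",     1 },
                { "disabled",   2 },
                { NULL,         0 }
        };

        /* Translate a user-visible string into its numeric value; 0 on success. */
        static int
        sketch_string_to_index(const sketch_index_t *tbl, const char *s,
            uint64_t *out)
        {
                for (int i = 0; tbl[i].pi_name != NULL; i++) {
                        if (strcmp(tbl[i].pi_name, s) == 0) {
                                *out = tbl[i].pi_value;
                                return (0);
                        }
                }
                return (-1);            /* unknown value; caller reports the error */
        }

        int
        main(void)
        {
                uint64_t v;

                if (sketch_string_to_index(sketch_sync_table, "disabled", &v) == 0)
                        printf("sync=disabled stored as %llu\n",
                            (unsigned long long)v);
                return (0);
        }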
zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, ++ ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); ++ zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES", ++ boolean_table); ++ zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC", ++ boolean_table); ++ zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID", ++ boolean_table); ++ zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY", ++ boolean_table); ++ zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT, ++ ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table); ++ zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, ++ ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table); ++ zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND", ++ boolean_table); ++ ++ /* default index properties */ ++ zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, ++ "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table); ++ zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, ++ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", ++ "CANMOUNT", canmount_table); ++ ++ /* readonly index (boolean) properties */ ++ zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, ++ ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table); ++ zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0, ++ PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY", ++ boolean_table); ++ ++ /* set once index properties */ ++ zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0, ++ PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, ++ "none | formC | formD | formKC | formKD", "NORMALIZATION", ++ normalize_table); ++ zprop_register_index(ZFS_PROP_CASE, "casesensitivity", ++ ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ++ ZFS_TYPE_SNAPSHOT, ++ "sensitive | insensitive | mixed", "CASE", case_table); ++ ++ /* set once index (boolean) properties */ ++ zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, ++ "on | off", "UTF8ONLY", boolean_table); ++ ++ /* string properties */ ++ zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "ORIGIN"); ++ zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY, ++ ZFS_TYPE_SNAPSHOT, "[,...]", "CLONES"); ++ zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", ++ PROP_INHERIT, ZFS_TYPE_FILESYSTEM, " | legacy | none", ++ "MOUNTPOINT"); ++ zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", ++ PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", ++ "SHARENFS"); ++ zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, ++ ZFS_TYPE_DATASET, "filesystem | volume | snapshot", "TYPE"); ++ zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", ++ PROP_INHERIT, ZFS_TYPE_FILESYSTEM, ++ "on | off | sharemgr(1M) options", "SHARESMB"); ++ zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel", ++ ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET, ++ "", "MLSLABEL"); ++ ++ /* readonly number properties */ ++ 
zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, ++ ZFS_TYPE_DATASET, "", "USED"); ++ zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "AVAIL"); ++ zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0, ++ PROP_READONLY, ZFS_TYPE_DATASET, "", "REFER"); ++ zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, ++ PROP_READONLY, ZFS_TYPE_DATASET, ++ "<1.00x or higher if compressed>", "RATIO"); ++ zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0, ++ PROP_READONLY, ZFS_TYPE_DATASET, ++ "<1.00x or higher if compressed>", "REFRATIO"); ++ zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", ++ ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, ++ ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); ++ zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, ++ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", ++ "USEDSNAP"); ++ zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, ++ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", ++ "USEDDS"); ++ zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, ++ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", ++ "USEDCHILD"); ++ zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, ++ PROP_READONLY, ++ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDREFRESERV"); ++ zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, ++ ZFS_TYPE_SNAPSHOT, "", "USERREFS"); ++ zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY, ++ ZFS_TYPE_DATASET, "", "WRITTEN"); ++ ++ /* default number properties */ ++ zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, ++ ZFS_TYPE_FILESYSTEM, " | none", "QUOTA"); ++ zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0, ++ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, ++ " | none", "RESERV"); ++ zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT, ++ ZFS_TYPE_VOLUME, "", "VOLSIZE"); ++ zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT, ++ ZFS_TYPE_FILESYSTEM, " | none", "REFQUOTA"); ++ zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, ++ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, ++ " | none", "REFRESERV"); ++ ++ /* inherit number properties */ ++ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", ++ SPA_MAXBLOCKSIZE, PROP_INHERIT, ++ ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE"); ++ ++ /* hidden properties */ ++ zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER, ++ PROP_READONLY, ZFS_TYPE_DATASET, "CREATETXG"); ++ zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, ++ PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES"); ++ zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, ++ PROP_READONLY, ZFS_TYPE_DATASET, "NAME"); ++ zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", ++ PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); ++ zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", ++ PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, ++ "STMF_SBD_LU"); ++ zprop_register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, ++ PROP_READONLY, ZFS_TYPE_DATASET, "GUID"); ++ zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", ++ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, ++ "USERACCOUNTING"); ++ zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER, ++ PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE"); ++ 
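All of the zprop_register_*() calls in this function follow one pattern: each call fills a slot, indexed by property id, in the static zfs_prop_table descriptor array, and later accessors simply index that array. Here is a reduced sketch of that pattern; the property ids, the three-field descriptor, and the sketch_ names are made up for illustration.

        #include <stdio.h>
        #include <stdint.h>

        /* Reduced descriptor; the real zprop_desc_t carries many more fields. */
        typedef struct {
                const char      *pd_name;
                uint64_t        pd_numdefault;
                const char      *pd_colname;
        } sketch_desc_t;

        enum { SK_PROP_QUOTA, SK_PROP_RECORDSIZE, SK_NUM_PROPS };

        static sketch_desc_t sketch_table[SK_NUM_PROPS];

        /* Analogue of the register calls above: fill one slot in the table. */
        static void
        sketch_register_number(int prop, const char *name, uint64_t def,
            const char *colname)
        {
                sketch_table[prop].pd_name = name;
                sketch_table[prop].pd_numdefault = def;
                sketch_table[prop].pd_colname = colname;
        }

        int
        main(void)
        {
                sketch_register_number(SK_PROP_QUOTA, "quota", 0, "QUOTA");
                sketch_register_number(SK_PROP_RECORDSIZE, "recordsize", 131072,
                    "RECSIZE");

                /* Later accessors just index the table by property id. */
                printf("%s default = %llu\n",
                    sketch_table[SK_PROP_RECORDSIZE].pd_name,
                    (unsigned long long)sketch_table[SK_PROP_RECORDSIZE].pd_numdefault);
                return (0);
        }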
zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER, ++ PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID"); ++ ++ /* ++ * Property to be removed once libbe is integrated ++ */ ++ zprop_register_hidden(ZFS_PROP_PRIVATE, "priv_prop", ++ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_FILESYSTEM, ++ "PRIV_PROP"); ++ ++ /* oddball properties */ ++ zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, ++ NULL, PROP_READONLY, ZFS_TYPE_DATASET, ++ "", "CREATION", B_FALSE, B_TRUE, NULL); ++} ++ ++boolean_t ++zfs_prop_delegatable(zfs_prop_t prop) ++{ ++ zprop_desc_t *pd = &zfs_prop_table[prop]; ++ ++ /* The mlslabel property is never delegatable. */ ++ if (prop == ZFS_PROP_MLSLABEL) ++ return (B_FALSE); ++ ++ return (pd->pd_attr != PROP_READONLY); ++} ++ ++/* ++ * Given a zfs dataset property name, returns the corresponding property ID. ++ */ ++zfs_prop_t ++zfs_name_to_prop(const char *propname) ++{ ++ return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET)); ++} ++ ++/* ++ * For user property names, we allow all lowercase alphanumeric characters, plus ++ * a few useful punctuation characters. ++ */ ++static int ++valid_char(char c) ++{ ++ return ((c >= 'a' && c <= 'z') || ++ (c >= '0' && c <= '9') || ++ c == '-' || c == '_' || c == '.' || c == ':'); ++} ++ ++/* ++ * Returns true if this is a valid user-defined property (one with a ':'). ++ */ ++boolean_t ++zfs_prop_user(const char *name) ++{ ++ int i; ++ char c; ++ boolean_t foundsep = B_FALSE; ++ ++ for (i = 0; i < strlen(name); i++) { ++ c = name[i]; ++ if (!valid_char(c)) ++ return (B_FALSE); ++ if (c == ':') ++ foundsep = B_TRUE; ++ } ++ ++ if (!foundsep) ++ return (B_FALSE); ++ ++ return (B_TRUE); ++} ++ ++/* ++ * Returns true if this is a valid userspace-type property (one with a '@'). ++ * Note that after the @, any character is valid (eg, another @, for SID ++ * user@domain). ++ */ ++boolean_t ++zfs_prop_userquota(const char *name) ++{ ++ zfs_userquota_prop_t prop; ++ ++ for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) { ++ if (strncmp(name, zfs_userquota_prop_prefixes[prop], ++ strlen(zfs_userquota_prop_prefixes[prop])) == 0) { ++ return (B_TRUE); ++ } ++ } ++ ++ return (B_FALSE); ++} ++ ++/* ++ * Returns true if this is a valid written@ property. ++ * Note that after the @, any character is valid (eg, another @, for ++ * written@pool/fs@origin). ++ */ ++boolean_t ++zfs_prop_written(const char *name) ++{ ++ static const char *prefix = "written@"; ++ return (strncmp(name, prefix, strlen(prefix)) == 0); ++} ++ ++/* ++ * Tables of index types, plus functions to convert between the user view ++ * (strings) and internal representation (uint64_t). ++ */ ++int ++zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index) ++{ ++ return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET)); ++} ++ ++int ++zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) ++{ ++ return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET)); ++} ++ ++uint64_t ++zfs_prop_random_value(zfs_prop_t prop, uint64_t seed) ++{ ++ return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET)); ++} ++ ++/* ++ * Returns TRUE if the property applies to any of the given dataset types. ++ */ ++boolean_t ++zfs_prop_valid_for_type(int prop, zfs_type_t types) ++{ ++ return (zprop_valid_for_type(prop, types)); ++} ++ ++zprop_type_t ++zfs_prop_get_type(zfs_prop_t prop) ++{ ++ return (zfs_prop_table[prop].pd_proptype); ++} ++ ++/* ++ * Returns TRUE if the property is readonly. 
++ */ ++boolean_t ++zfs_prop_readonly(zfs_prop_t prop) ++{ ++ return (zfs_prop_table[prop].pd_attr == PROP_READONLY || ++ zfs_prop_table[prop].pd_attr == PROP_ONETIME); ++} ++ ++/* ++ * Returns TRUE if the property is only allowed to be set once. ++ */ ++boolean_t ++zfs_prop_setonce(zfs_prop_t prop) ++{ ++ return (zfs_prop_table[prop].pd_attr == PROP_ONETIME); ++} ++ ++const char * ++zfs_prop_default_string(zfs_prop_t prop) ++{ ++ return (zfs_prop_table[prop].pd_strdefault); ++} ++ ++uint64_t ++zfs_prop_default_numeric(zfs_prop_t prop) ++{ ++ return (zfs_prop_table[prop].pd_numdefault); ++} ++ ++/* ++ * Given a dataset property ID, returns the corresponding name. ++ * Assuming the zfs dataset property ID is valid. ++ */ ++const char * ++zfs_prop_to_name(zfs_prop_t prop) ++{ ++ return (zfs_prop_table[prop].pd_name); ++} ++ ++/* ++ * Returns TRUE if the property is inheritable. ++ */ ++boolean_t ++zfs_prop_inheritable(zfs_prop_t prop) ++{ ++ return (zfs_prop_table[prop].pd_attr == PROP_INHERIT || ++ zfs_prop_table[prop].pd_attr == PROP_ONETIME); ++} ++ ++#ifndef _KERNEL ++ ++/* ++ * Returns a string describing the set of acceptable values for the given ++ * zfs property, or NULL if it cannot be set. ++ */ ++const char * ++zfs_prop_values(zfs_prop_t prop) ++{ ++ return (zfs_prop_table[prop].pd_values); ++} ++ ++/* ++ * Returns TRUE if this property is a string type. Note that index types ++ * (compression, checksum) are treated as strings in userland, even though they ++ * are stored numerically on disk. ++ */ ++int ++zfs_prop_is_string(zfs_prop_t prop) ++{ ++ return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING || ++ zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX); ++} ++ ++/* ++ * Returns the column header for the given property. Used only in ++ * 'zfs list -o', but centralized here with the other property information. ++ */ ++const char * ++zfs_prop_column_name(zfs_prop_t prop) ++{ ++ return (zfs_prop_table[prop].pd_colname); ++} ++ ++/* ++ * Returns whether the given property should be displayed right-justified for ++ * 'zfs list'. ++ */ ++boolean_t ++zfs_prop_align_right(zfs_prop_t prop) ++{ ++ return (zfs_prop_table[prop].pd_rightalign); ++} ++ ++#endif ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++ ++static int zcommon_init(void) { return 0; } ++static int zcommon_fini(void) { return 0; } ++ ++spl_module_init(zcommon_init); ++spl_module_exit(zcommon_fini); ++ ++MODULE_DESCRIPTION("Generic ZFS support"); ++MODULE_AUTHOR(ZFS_META_AUTHOR); ++MODULE_LICENSE(ZFS_META_LICENSE); ++ ++/* zfs dataset property functions */ ++EXPORT_SYMBOL(zfs_userquota_prop_prefixes); ++EXPORT_SYMBOL(zfs_prop_init); ++EXPORT_SYMBOL(zfs_prop_get_type); ++EXPORT_SYMBOL(zfs_prop_get_table); ++EXPORT_SYMBOL(zfs_prop_delegatable); ++ ++/* Dataset property functions shared between libzfs and kernel. 
*/ ++EXPORT_SYMBOL(zfs_prop_default_string); ++EXPORT_SYMBOL(zfs_prop_default_numeric); ++EXPORT_SYMBOL(zfs_prop_readonly); ++EXPORT_SYMBOL(zfs_prop_inheritable); ++EXPORT_SYMBOL(zfs_prop_setonce); ++EXPORT_SYMBOL(zfs_prop_to_name); ++EXPORT_SYMBOL(zfs_name_to_prop); ++EXPORT_SYMBOL(zfs_prop_user); ++EXPORT_SYMBOL(zfs_prop_userquota); ++EXPORT_SYMBOL(zfs_prop_index_to_string); ++EXPORT_SYMBOL(zfs_prop_string_to_index); ++EXPORT_SYMBOL(zfs_prop_valid_for_type); ++ ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zcommon/zfs_uio.c linux-3.2.33-go/fs/zfs/zcommon/zfs_uio.c +--- linux-3.2.33-go.orig/fs/zfs/zcommon/zfs_uio.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zcommon/zfs_uio.c 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,255 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ ++/* All Rights Reserved */ ++ ++/* ++ * University Copyright- Copyright (c) 1982, 1986, 1988 ++ * The Regents of the University of California ++ * All Rights Reserved ++ * ++ * University Acknowledgment- Portions of this document are derived from ++ * software developed by the University of California, Berkeley, and its ++ * contributors. ++ */ ++ ++/* ++ * The uio support from OpenSolaris has been added as a short term ++ * work around. The hope is to adopt native Linux type and drop the ++ * use of uio's entirely. Under Linux they only add overhead and ++ * when possible we want to use native APIs for the ZPL layer. ++ */ ++#ifdef _KERNEL ++ ++#include ++#include ++ ++/* ++ * Move "n" bytes at byte address "p"; "rw" indicates the direction ++ * of the move, and the I/O parameters are provided in "uio", which is ++ * update to reflect the data which was moved. Returns 0 on success or ++ * a non-zero errno on failure. 
++ */ ++int ++uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio) ++{ ++ struct iovec *iov; ++ ulong_t cnt; ++ ++ while (n && uio->uio_resid) { ++ iov = uio->uio_iov; ++ cnt = MIN(iov->iov_len, n); ++ if (cnt == 0l) { ++ uio->uio_iov++; ++ uio->uio_iovcnt--; ++ continue; ++ } ++ switch (uio->uio_segflg) { ++ case UIO_USERSPACE: ++ case UIO_USERISPACE: ++ /* p = kernel data pointer ++ * iov->iov_base = user data pointer */ ++ ++ if (rw == UIO_READ) { ++ if (copy_to_user(iov->iov_base, p, cnt)) ++ return EFAULT; ++ /* error = xcopyout_nta(p, iov->iov_base, cnt, ++ * (uio->uio_extflg & UIO_COPY_CACHED)); */ ++ } else { ++ /* error = xcopyin_nta(iov->iov_base, p, cnt, ++ * (uio->uio_extflg & UIO_COPY_CACHED)); */ ++ if (copy_from_user(p, iov->iov_base, cnt)) ++ return EFAULT; ++ } ++ break; ++ case UIO_SYSSPACE: ++ if (rw == UIO_READ) ++ bcopy(p, iov->iov_base, cnt); ++ else ++ bcopy(iov->iov_base, p, cnt); ++ break; ++ } ++ iov->iov_base += cnt; ++ iov->iov_len -= cnt; ++ uio->uio_resid -= cnt; ++ uio->uio_loffset += cnt; ++ p = (caddr_t)p + cnt; ++ n -= cnt; ++ } ++ return (0); ++} ++EXPORT_SYMBOL(uiomove); ++ ++#define fuword8(uptr, vptr) get_user((*vptr), (uptr)) ++ ++/* ++ * Fault in the pages of the first n bytes specified by the uio structure. ++ * 1 byte in each page is touched and the uio struct is unmodified. Any ++ * error will terminate the process as this is only a best attempt to get ++ * the pages resident. ++ */ ++void ++uio_prefaultpages(ssize_t n, struct uio *uio) ++{ ++ struct iovec *iov; ++ ulong_t cnt, incr; ++ caddr_t p; ++ uint8_t tmp; ++ int iovcnt; ++ ++ iov = uio->uio_iov; ++ iovcnt = uio->uio_iovcnt; ++ ++ while ((n > 0) && (iovcnt > 0)) { ++ cnt = MIN(iov->iov_len, n); ++ if (cnt == 0) { ++ /* empty iov entry */ ++ iov++; ++ iovcnt--; ++ continue; ++ } ++ n -= cnt; ++ /* ++ * touch each page in this segment. ++ */ ++ p = iov->iov_base; ++ while (cnt) { ++ switch (uio->uio_segflg) { ++ case UIO_USERSPACE: ++ case UIO_USERISPACE: ++ if (fuword8((uint8_t *) p, &tmp)) ++ return; ++ break; ++ case UIO_SYSSPACE: ++ bcopy(p, &tmp, 1); ++ break; ++ } ++ incr = MIN(cnt, PAGESIZE); ++ p += incr; ++ cnt -= incr; ++ } ++ /* ++ * touch the last byte in case it straddles a page. ++ */ ++ p--; ++ switch (uio->uio_segflg) { ++ case UIO_USERSPACE: ++ case UIO_USERISPACE: ++ if (fuword8((uint8_t *) p, &tmp)) ++ return; ++ break; ++ case UIO_SYSSPACE: ++ bcopy(p, &tmp, 1); ++ break; ++ } ++ iov++; ++ iovcnt--; ++ } ++} ++EXPORT_SYMBOL(uio_prefaultpages); ++ ++/* ++ * same as uiomove() but doesn't modify uio structure. ++ * return in cbytes how many bytes were copied. 
++ */ ++int ++uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes) ++{ ++ struct iovec *iov; ++ ulong_t cnt; ++ int iovcnt; ++ ++ iovcnt = uio->uio_iovcnt; ++ *cbytes = 0; ++ ++ for (iov = uio->uio_iov; n && iovcnt; iov++, iovcnt--) { ++ cnt = MIN(iov->iov_len, n); ++ if (cnt == 0) ++ continue; ++ ++ switch (uio->uio_segflg) { ++ ++ case UIO_USERSPACE: ++ case UIO_USERISPACE: ++ /* p = kernel data pointer ++ * iov->iov_base = user data pointer */ ++ ++ if (rw == UIO_READ) { ++ /* * UIO_READ = copy data from kernel to user * */ ++ if (copy_to_user(iov->iov_base, p, cnt)) ++ return EFAULT; ++ /* error = xcopyout_nta(p, iov->iov_base, cnt, ++ * (uio->uio_extflg & UIO_COPY_CACHED)); */ ++ } else { ++ /* * UIO_WRITE = copy data from user to kernel * */ ++ /* error = xcopyin_nta(iov->iov_base, p, cnt, ++ * (uio->uio_extflg & UIO_COPY_CACHED)); */ ++ if (copy_from_user(p, iov->iov_base, cnt)) ++ return EFAULT; ++ } ++ break; ++ ++ case UIO_SYSSPACE: ++ if (rw == UIO_READ) ++ bcopy(p, iov->iov_base, cnt); ++ else ++ bcopy(iov->iov_base, p, cnt); ++ break; ++ } ++ p = (caddr_t)p + cnt; ++ n -= cnt; ++ *cbytes += cnt; ++ } ++ return (0); ++} ++EXPORT_SYMBOL(uiocopy); ++ ++/* ++ * Drop the next n chars out of *uiop. ++ */ ++void ++uioskip(uio_t *uiop, size_t n) ++{ ++ if (n > uiop->uio_resid) ++ return; ++ while (n != 0) { ++ iovec_t *iovp = uiop->uio_iov; ++ size_t niovb = MIN(iovp->iov_len, n); ++ ++ if (niovb == 0) { ++ uiop->uio_iov++; ++ uiop->uio_iovcnt--; ++ continue; ++ } ++ iovp->iov_base += niovb; ++ uiop->uio_loffset += niovb; ++ iovp->iov_len -= niovb; ++ uiop->uio_resid -= niovb; ++ n -= niovb; ++ } ++} ++EXPORT_SYMBOL(uioskip); ++#endif /* _KERNEL */ +diff -uNr linux-3.2.33-go.orig/fs/zfs/zcommon/zpool_prop.c linux-3.2.33-go/fs/zfs/zcommon/zpool_prop.c +--- linux-3.2.33-go.orig/fs/zfs/zcommon/zpool_prop.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zcommon/zpool_prop.c 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,228 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ * Copyright (c) 2012 by Delphix. All rights reserved. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "zfs_prop.h" ++ ++#if defined(_KERNEL) ++#include ++#else ++#include ++#include ++#include ++#endif ++ ++static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS]; ++ ++zprop_desc_t * ++zpool_prop_get_table(void) ++{ ++ return (zpool_prop_table); ++} ++ ++void ++zpool_prop_init(void) ++{ ++ static zprop_index_t boolean_table[] = { ++ { "off", 0}, ++ { "on", 1}, ++ { NULL } ++ }; ++ ++ static zprop_index_t failuremode_table[] = { ++ { "wait", ZIO_FAILURE_MODE_WAIT }, ++ { "continue", ZIO_FAILURE_MODE_CONTINUE }, ++ { "panic", ZIO_FAILURE_MODE_PANIC }, ++ { NULL } ++ }; ++ ++ /* string properties */ ++ zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT, ++ ZFS_TYPE_POOL, "", "ALTROOT"); ++ zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT, ++ ZFS_TYPE_POOL, "", "BOOTFS"); ++ zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, ++ PROP_DEFAULT, ZFS_TYPE_POOL, " | none", "CACHEFILE"); ++ zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL, ++ PROP_DEFAULT, ZFS_TYPE_POOL, "", "COMMENT"); ++ ++ /* readonly number properties */ ++ zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, ++ ZFS_TYPE_POOL, "", "SIZE"); ++ zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, ++ ZFS_TYPE_POOL, "", "FREE"); ++ zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, ++ PROP_READONLY, ZFS_TYPE_POOL, "", "ALLOC"); ++ zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, ++ PROP_READONLY, ZFS_TYPE_POOL, "", "EXPANDSZ"); ++ zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, ++ ZFS_TYPE_POOL, "", "CAP"); ++ zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, ++ ZFS_TYPE_POOL, "", "GUID"); ++ zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY, ++ ZFS_TYPE_POOL, "", "HEALTH"); ++ zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, ++ PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>", ++ "DEDUP"); ++ ++ /* readonly onetime number properties */ ++ zprop_register_number(ZPOOL_PROP_ASHIFT, "ashift", 0, PROP_ONETIME, ++ ZFS_TYPE_POOL, "", "ASHIFT"); ++ ++ /* default number properties */ ++ zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, ++ PROP_DEFAULT, ZFS_TYPE_POOL, "", "VERSION"); ++ zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0, ++ PROP_DEFAULT, ZFS_TYPE_POOL, "", "DEDUPDITTO"); ++ ++ /* default index (boolean) properties */ ++ zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, ++ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION", ++ boolean_table); ++ zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, ++ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table); ++ zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, ++ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS", ++ boolean_table); ++ zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, ++ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table); ++ zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0, ++ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table); ++ ++ /* default index properties */ ++ zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode", ++ ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL, ++ "wait | continue | panic", "FAILMODE", failuremode_table); ++ ++ /* hidden properties */ ++ zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING, ++ PROP_READONLY, ZFS_TYPE_POOL, "NAME"); ++} 
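The uiomove(), uiocopy() and uioskip() helpers added earlier in this hunk all share the same loop: walk the iovec array, move MIN(iov_len, n) bytes per segment, and advance both the data pointer and the uio bookkeeping. The following is a userland sketch of that walk, with simplified structs and memcpy() standing in for copy_to_user()/bcopy(); none of it is lifted from the patch.

        #include <stdio.h>
        #include <string.h>
        #include <stddef.h>

        /* Simplified stand-ins for iovec/uio; the kernel versions carry more state. */
        typedef struct { char *iov_base; size_t iov_len; } sk_iovec_t;
        typedef struct {
                sk_iovec_t      *uio_iov;
                int             uio_iovcnt;
                size_t          uio_resid;
        } sk_uio_t;

        /*
         * Copy "n" bytes from "p" into the uio's segments, advancing the uio
         * as it goes: the per-segment MIN(iov_len, n) walk described above.
         */
        static int
        sk_uiomove(const void *p, size_t n, sk_uio_t *uio)
        {
                while (n != 0 && uio->uio_resid != 0 && uio->uio_iovcnt != 0) {
                        sk_iovec_t *iov = uio->uio_iov;
                        size_t cnt = (iov->iov_len < n) ? iov->iov_len : n;

                        if (cnt == 0) {         /* exhausted segment: skip it */
                                uio->uio_iov++;
                                uio->uio_iovcnt--;
                                continue;
                        }
                        memcpy(iov->iov_base, p, cnt);
                        iov->iov_base += cnt;
                        iov->iov_len -= cnt;
                        uio->uio_resid -= cnt;
                        p = (const char *)p + cnt;
                        n -= cnt;
                }
                return (0);
        }

        int
        main(void)
        {
                char a[4] = "", b[8] = "";
                sk_iovec_t iov[2] = { { a, sizeof (a) }, { b, sizeof (b) } };
                sk_uio_t uio = { iov, 2, sizeof (a) + sizeof (b) };

                sk_uiomove("0123456789", 10, &uio);     /* fills a[0..3], b[0..5] */
                printf("%.4s %.6s\n", a, b);
                return (0);
        }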
++ ++/* ++ * Given a property name and its type, returns the corresponding property ID. ++ */ ++zpool_prop_t ++zpool_name_to_prop(const char *propname) ++{ ++ return (zprop_name_to_prop(propname, ZFS_TYPE_POOL)); ++} ++ ++/* ++ * Given a pool property ID, returns the corresponding name. ++ * Assuming the pool propety ID is valid. ++ */ ++const char * ++zpool_prop_to_name(zpool_prop_t prop) ++{ ++ return (zpool_prop_table[prop].pd_name); ++} ++ ++zprop_type_t ++zpool_prop_get_type(zpool_prop_t prop) ++{ ++ return (zpool_prop_table[prop].pd_proptype); ++} ++ ++boolean_t ++zpool_prop_readonly(zpool_prop_t prop) ++{ ++ return (zpool_prop_table[prop].pd_attr == PROP_READONLY); ++} ++ ++const char * ++zpool_prop_default_string(zpool_prop_t prop) ++{ ++ return (zpool_prop_table[prop].pd_strdefault); ++} ++ ++uint64_t ++zpool_prop_default_numeric(zpool_prop_t prop) ++{ ++ return (zpool_prop_table[prop].pd_numdefault); ++} ++ ++int ++zpool_prop_string_to_index(zpool_prop_t prop, const char *string, ++ uint64_t *index) ++{ ++ return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL)); ++} ++ ++int ++zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index, ++ const char **string) ++{ ++ return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL)); ++} ++ ++uint64_t ++zpool_prop_random_value(zpool_prop_t prop, uint64_t seed) ++{ ++ return (zprop_random_value(prop, seed, ZFS_TYPE_POOL)); ++} ++ ++#ifndef _KERNEL ++ ++const char * ++zpool_prop_values(zpool_prop_t prop) ++{ ++ return (zpool_prop_table[prop].pd_values); ++} ++ ++const char * ++zpool_prop_column_name(zpool_prop_t prop) ++{ ++ return (zpool_prop_table[prop].pd_colname); ++} ++ ++boolean_t ++zpool_prop_align_right(zpool_prop_t prop) ++{ ++ return (zpool_prop_table[prop].pd_rightalign); ++} ++#endif ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++/* zpool property functions */ ++EXPORT_SYMBOL(zpool_prop_init); ++EXPORT_SYMBOL(zpool_prop_get_type); ++EXPORT_SYMBOL(zpool_prop_get_table); ++ ++/* Pool property functions shared between libzfs and kernel. */ ++EXPORT_SYMBOL(zpool_name_to_prop); ++EXPORT_SYMBOL(zpool_prop_to_name); ++EXPORT_SYMBOL(zpool_prop_default_string); ++EXPORT_SYMBOL(zpool_prop_default_numeric); ++EXPORT_SYMBOL(zpool_prop_readonly); ++EXPORT_SYMBOL(zpool_prop_index_to_string); ++EXPORT_SYMBOL(zpool_prop_string_to_index); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zcommon/zprop_common.c linux-3.2.33-go/fs/zfs/zcommon/zprop_common.c +--- linux-3.2.33-go.orig/fs/zfs/zcommon/zprop_common.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zcommon/zprop_common.c 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,444 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++/* ++ * Common routines used by zfs and zpool property management. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "zfs_prop.h" ++#include "zfs_deleg.h" ++ ++#if defined(_KERNEL) ++#include ++#include ++#else ++#include ++#include ++#include ++#endif ++ ++static zprop_desc_t * ++zprop_get_proptable(zfs_type_t type) ++{ ++ if (type == ZFS_TYPE_POOL) ++ return (zpool_prop_get_table()); ++ else ++ return (zfs_prop_get_table()); ++} ++ ++static int ++zprop_get_numprops(zfs_type_t type) ++{ ++ if (type == ZFS_TYPE_POOL) ++ return (ZPOOL_NUM_PROPS); ++ else ++ return (ZFS_NUM_PROPS); ++} ++ ++void ++zprop_register_impl(int prop, const char *name, zprop_type_t type, ++ uint64_t numdefault, const char *strdefault, zprop_attr_t attr, ++ int objset_types, const char *values, const char *colname, ++ boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl) ++{ ++ zprop_desc_t *prop_tbl = zprop_get_proptable(objset_types); ++ zprop_desc_t *pd; ++ ++ pd = &prop_tbl[prop]; ++ ++ ASSERT(pd->pd_name == NULL || pd->pd_name == name); ++ ASSERT(name != NULL); ++ ASSERT(colname != NULL); ++ ++ pd->pd_name = name; ++ pd->pd_propnum = prop; ++ pd->pd_proptype = type; ++ pd->pd_numdefault = numdefault; ++ pd->pd_strdefault = strdefault; ++ pd->pd_attr = attr; ++ pd->pd_types = objset_types; ++ pd->pd_values = values; ++ pd->pd_colname = colname; ++ pd->pd_rightalign = rightalign; ++ pd->pd_visible = visible; ++ pd->pd_table = idx_tbl; ++ pd->pd_table_size = 0; ++ while (idx_tbl && (idx_tbl++)->pi_name != NULL) ++ pd->pd_table_size++; ++} ++ ++void ++zprop_register_string(int prop, const char *name, const char *def, ++ zprop_attr_t attr, int objset_types, const char *values, ++ const char *colname) ++{ ++ zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr, ++ objset_types, values, colname, B_FALSE, B_TRUE, NULL); ++ ++} ++ ++void ++zprop_register_number(int prop, const char *name, uint64_t def, ++ zprop_attr_t attr, int objset_types, const char *values, ++ const char *colname) ++{ ++ zprop_register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr, ++ objset_types, values, colname, B_TRUE, B_TRUE, NULL); ++} ++ ++void ++zprop_register_index(int prop, const char *name, uint64_t def, ++ zprop_attr_t attr, int objset_types, const char *values, ++ const char *colname, const zprop_index_t *idx_tbl) ++{ ++ zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr, ++ objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl); ++} ++ ++void ++zprop_register_hidden(int prop, const char *name, zprop_type_t type, ++ zprop_attr_t attr, int objset_types, const char *colname) ++{ ++ zprop_register_impl(prop, name, type, 0, NULL, attr, ++ objset_types, NULL, colname, B_FALSE, B_FALSE, NULL); ++} ++ ++ ++/* ++ * A comparison function we can use to order indexes into property tables. 
++ */ ++static int ++zprop_compare(const void *arg1, const void *arg2) ++{ ++ const zprop_desc_t *p1 = *((zprop_desc_t **)arg1); ++ const zprop_desc_t *p2 = *((zprop_desc_t **)arg2); ++ boolean_t p1ro, p2ro; ++ ++ p1ro = (p1->pd_attr == PROP_READONLY); ++ p2ro = (p2->pd_attr == PROP_READONLY); ++ ++ if (p1ro == p2ro) ++ return (strcmp(p1->pd_name, p2->pd_name)); ++ ++ return (p1ro ? -1 : 1); ++} ++ ++/* ++ * Iterate over all properties in the given property table, calling back ++ * into the specified function for each property. We will continue to ++ * iterate until we either reach the end or the callback function returns ++ * something other than ZPROP_CONT. ++ */ ++int ++zprop_iter_common(zprop_func func, void *cb, boolean_t show_all, ++ boolean_t ordered, zfs_type_t type) ++{ ++ int i, j, num_props, size, prop; ++ zprop_desc_t *prop_tbl; ++ zprop_desc_t **order; ++ ++ prop_tbl = zprop_get_proptable(type); ++ num_props = zprop_get_numprops(type); ++ size = num_props * sizeof (zprop_desc_t *); ++ ++#if defined(_KERNEL) ++ order = kmem_alloc(size, KM_PUSHPAGE); ++#else ++ if ((order = malloc(size)) == NULL) ++ return (ZPROP_CONT); ++#endif ++ ++ for (j = 0; j < num_props; j++) ++ order[j] = &prop_tbl[j]; ++ ++ if (ordered) { ++ qsort((void *)order, num_props, sizeof (zprop_desc_t *), ++ zprop_compare); ++ } ++ ++ prop = ZPROP_CONT; ++ for (i = 0; i < num_props; i++) { ++ if ((order[i]->pd_visible || show_all) && ++ (func(order[i]->pd_propnum, cb) != ZPROP_CONT)) { ++ prop = order[i]->pd_propnum; ++ break; ++ } ++ } ++ ++#if defined(_KERNEL) ++ kmem_free(order, size); ++#else ++ free(order); ++#endif ++ return (prop); ++} ++ ++static boolean_t ++propname_match(const char *p, size_t len, zprop_desc_t *prop_entry) ++{ ++ const char *propname = prop_entry->pd_name; ++#ifndef _KERNEL ++ const char *colname = prop_entry->pd_colname; ++ int c; ++#endif ++ ++ if (len == strlen(propname) && ++ strncmp(p, propname, len) == 0) ++ return (B_TRUE); ++ ++#ifndef _KERNEL ++ if (colname == NULL || len != strlen(colname)) ++ return (B_FALSE); ++ ++ for (c = 0; c < len; c++) ++ if (p[c] != tolower(colname[c])) ++ break; ++ ++ return (colname[c] == '\0'); ++#else ++ return (B_FALSE); ++#endif ++} ++ ++typedef struct name_to_prop_cb { ++ const char *propname; ++ zprop_desc_t *prop_tbl; ++} name_to_prop_cb_t; ++ ++static int ++zprop_name_to_prop_cb(int prop, void *cb_data) ++{ ++ name_to_prop_cb_t *data = cb_data; ++ ++ if (propname_match(data->propname, strlen(data->propname), ++ &data->prop_tbl[prop])) ++ return (prop); ++ ++ return (ZPROP_CONT); ++} ++ ++int ++zprop_name_to_prop(const char *propname, zfs_type_t type) ++{ ++ int prop; ++ name_to_prop_cb_t cb_data; ++ ++ cb_data.propname = propname; ++ cb_data.prop_tbl = zprop_get_proptable(type); ++ ++ prop = zprop_iter_common(zprop_name_to_prop_cb, &cb_data, ++ B_TRUE, B_FALSE, type); ++ ++ return (prop == ZPROP_CONT ? 
ZPROP_INVAL : prop); ++} ++ ++int ++zprop_string_to_index(int prop, const char *string, uint64_t *index, ++ zfs_type_t type) ++{ ++ zprop_desc_t *prop_tbl; ++ const zprop_index_t *idx_tbl; ++ int i; ++ ++ if (prop == ZPROP_INVAL || prop == ZPROP_CONT) ++ return (-1); ++ ++ ASSERT(prop < zprop_get_numprops(type)); ++ prop_tbl = zprop_get_proptable(type); ++ if ((idx_tbl = prop_tbl[prop].pd_table) == NULL) ++ return (-1); ++ ++ for (i = 0; idx_tbl[i].pi_name != NULL; i++) { ++ if (strcmp(string, idx_tbl[i].pi_name) == 0) { ++ *index = idx_tbl[i].pi_value; ++ return (0); ++ } ++ } ++ ++ return (-1); ++} ++ ++int ++zprop_index_to_string(int prop, uint64_t index, const char **string, ++ zfs_type_t type) ++{ ++ zprop_desc_t *prop_tbl; ++ const zprop_index_t *idx_tbl; ++ int i; ++ ++ if (prop == ZPROP_INVAL || prop == ZPROP_CONT) ++ return (-1); ++ ++ ASSERT(prop < zprop_get_numprops(type)); ++ prop_tbl = zprop_get_proptable(type); ++ if ((idx_tbl = prop_tbl[prop].pd_table) == NULL) ++ return (-1); ++ ++ for (i = 0; idx_tbl[i].pi_name != NULL; i++) { ++ if (idx_tbl[i].pi_value == index) { ++ *string = idx_tbl[i].pi_name; ++ return (0); ++ } ++ } ++ ++ return (-1); ++} ++ ++/* ++ * Return a random valid property value. Used by ztest. ++ */ ++uint64_t ++zprop_random_value(int prop, uint64_t seed, zfs_type_t type) ++{ ++ zprop_desc_t *prop_tbl; ++ const zprop_index_t *idx_tbl; ++ ++ ASSERT((uint_t)prop < zprop_get_numprops(type)); ++ prop_tbl = zprop_get_proptable(type); ++ idx_tbl = prop_tbl[prop].pd_table; ++ ++ if (idx_tbl == NULL) ++ return (seed); ++ ++ return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value); ++} ++ ++const char * ++zprop_values(int prop, zfs_type_t type) ++{ ++ zprop_desc_t *prop_tbl; ++ ++ ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT); ++ ASSERT(prop < zprop_get_numprops(type)); ++ ++ prop_tbl = zprop_get_proptable(type); ++ ++ return (prop_tbl[prop].pd_values); ++} ++ ++/* ++ * Returns TRUE if the property applies to any of the given dataset types. ++ */ ++boolean_t ++zprop_valid_for_type(int prop, zfs_type_t type) ++{ ++ zprop_desc_t *prop_tbl; ++ ++ if (prop == ZPROP_INVAL || prop == ZPROP_CONT) ++ return (B_FALSE); ++ ++ ASSERT(prop < zprop_get_numprops(type)); ++ prop_tbl = zprop_get_proptable(type); ++ return ((prop_tbl[prop].pd_types & type) != 0); ++} ++ ++#ifndef _KERNEL ++ ++/* ++ * Determines the minimum width for the column, and indicates whether it's fixed ++ * or not. Only string columns are non-fixed. ++ */ ++size_t ++zprop_width(int prop, boolean_t *fixed, zfs_type_t type) ++{ ++ zprop_desc_t *prop_tbl, *pd; ++ const zprop_index_t *idx; ++ size_t ret; ++ int i; ++ ++ ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT); ++ ASSERT(prop < zprop_get_numprops(type)); ++ ++ prop_tbl = zprop_get_proptable(type); ++ pd = &prop_tbl[prop]; ++ ++ *fixed = B_TRUE; ++ ++ /* ++ * Start with the width of the column name. ++ */ ++ ret = strlen(pd->pd_colname); ++ ++ /* ++ * For fixed-width values, make sure the width is large enough to hold ++ * any possible value. ++ */ ++ switch (pd->pd_proptype) { ++ case PROP_TYPE_NUMBER: ++ /* ++ * The maximum length of a human-readable number is 5 characters ++ * ("20.4M", for example). ++ */ ++ if (ret < 5) ++ ret = 5; ++ /* ++ * 'creation' is handled specially because it's a number ++ * internally, but displayed as a date string. 
++ */ ++ if (prop == ZFS_PROP_CREATION) ++ *fixed = B_FALSE; ++ break; ++ case PROP_TYPE_INDEX: ++ idx = prop_tbl[prop].pd_table; ++ for (i = 0; idx[i].pi_name != NULL; i++) { ++ if (strlen(idx[i].pi_name) > ret) ++ ret = strlen(idx[i].pi_name); ++ } ++ break; ++ ++ case PROP_TYPE_STRING: ++ *fixed = B_FALSE; ++ break; ++ } ++ ++ return (ret); ++} ++ ++#endif ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++/* Common routines to initialize property tables */ ++EXPORT_SYMBOL(zprop_register_impl); ++EXPORT_SYMBOL(zprop_register_string); ++EXPORT_SYMBOL(zprop_register_number); ++EXPORT_SYMBOL(zprop_register_index); ++EXPORT_SYMBOL(zprop_register_hidden); ++ ++/* Common routines for zfs and zpool property management */ ++EXPORT_SYMBOL(zprop_iter_common); ++EXPORT_SYMBOL(zprop_name_to_prop); ++EXPORT_SYMBOL(zprop_string_to_index); ++EXPORT_SYMBOL(zprop_index_to_string); ++EXPORT_SYMBOL(zprop_random_value); ++EXPORT_SYMBOL(zprop_values); ++EXPORT_SYMBOL(zprop_valid_for_type); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/arc.c linux-3.2.33-go/fs/zfs/zfs/arc.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/arc.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/arc.c 2012-11-16 23:25:34.349039334 +0100 +@@ -0,0 +1,4985 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ */ ++ ++/* ++ * DVA-based Adjustable Replacement Cache ++ * ++ * While much of the theory of operation used here is ++ * based on the self-tuning, low overhead replacement cache ++ * presented by Megiddo and Modha at FAST 2003, there are some ++ * significant differences: ++ * ++ * 1. The Megiddo and Modha model assumes any page is evictable. ++ * Pages in its cache cannot be "locked" into memory. This makes ++ * the eviction algorithm simple: evict the last page in the list. ++ * This also make the performance characteristics easy to reason ++ * about. Our cache is not so simple. At any given moment, some ++ * subset of the blocks in the cache are un-evictable because we ++ * have handed out a reference to them. Blocks are only evictable ++ * when there are no external references active. This makes ++ * eviction far more problematic: we choose to evict the evictable ++ * blocks that are the "lowest" in the list. ++ * ++ * There are times when it is not possible to evict the requested ++ * space. In these circumstances we are unable to adjust the cache ++ * size. 
To prevent the cache growing unbounded at these times we ++ * implement a "cache throttle" that slows the flow of new data ++ * into the cache until we can make space available. ++ * ++ * 2. The Megiddo and Modha model assumes a fixed cache size. ++ * Pages are evicted when the cache is full and there is a cache ++ * miss. Our model has a variable sized cache. It grows with ++ * high use, but also tries to react to memory pressure from the ++ * operating system: decreasing its size when system memory is ++ * tight. ++ * ++ * 3. The Megiddo and Modha model assumes a fixed page size. All ++ * elements of the cache are therefor exactly the same size. So ++ * when adjusting the cache size following a cache miss, its simply ++ * a matter of choosing a single page to evict. In our model, we ++ * have variable sized cache blocks (rangeing from 512 bytes to ++ * 128K bytes). We therefor choose a set of blocks to evict to make ++ * space for a cache miss that approximates as closely as possible ++ * the space used by the new block. ++ * ++ * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" ++ * by N. Megiddo & D. Modha, FAST 2003 ++ */ ++ ++/* ++ * The locking model: ++ * ++ * A new reference to a cache buffer can be obtained in two ++ * ways: 1) via a hash table lookup using the DVA as a key, ++ * or 2) via one of the ARC lists. The arc_read() interface ++ * uses method 1, while the internal arc algorithms for ++ * adjusting the cache use method 2. We therefor provide two ++ * types of locks: 1) the hash table lock array, and 2) the ++ * arc list locks. ++ * ++ * Buffers do not have their own mutexs, rather they rely on the ++ * hash table mutexs for the bulk of their protection (i.e. most ++ * fields in the arc_buf_hdr_t are protected by these mutexs). ++ * ++ * buf_hash_find() returns the appropriate mutex (held) when it ++ * locates the requested buffer in the hash table. It returns ++ * NULL for the mutex if the buffer was not in the table. ++ * ++ * buf_hash_remove() expects the appropriate hash mutex to be ++ * already held before it is invoked. ++ * ++ * Each arc state also has a mutex which is used to protect the ++ * buffer list associated with the state. When attempting to ++ * obtain a hash table lock while holding an arc list lock you ++ * must use: mutex_tryenter() to avoid deadlock. Also note that ++ * the active state mutex must be held before the ghost state mutex. ++ * ++ * Arc buffers may have an associated eviction callback function. ++ * This function will be invoked prior to removing the buffer (e.g. ++ * in arc_do_user_evicts()). Note however that the data associated ++ * with the buffer may be evicted prior to the callback. The callback ++ * must be made with *no locks held* (to prevent deadlock). Additionally, ++ * the users of callbacks must ensure that their private data is ++ * protected from simultaneous callbacks from arc_buf_evict() ++ * and arc_do_user_evicts(). ++ * ++ * It as also possible to register a callback which is run when the ++ * arc_meta_limit is reached and no buffers can be safely evicted. In ++ * this case the arc user should drop a reference on some arc buffers so ++ * they can be reclaimed and the arc_meta_limit honored. For example, ++ * when using the ZPL each dentry holds a references on a znode. These ++ * dentries must be pruned before the arc buffer holding the znode can ++ * be safely evicted. ++ * ++ * Note that the majority of the performance stats are manipulated ++ * with atomic operations. 
++ * ++ * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: ++ * ++ * - L2ARC buflist creation ++ * - L2ARC buflist eviction ++ * - L2ARC write completion, which walks L2ARC buflists ++ * - ARC header destruction, as it removes from L2ARC buflists ++ * - ARC header release, as it removes from L2ARC buflists ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef _KERNEL ++#include ++#include ++#include ++#include ++#endif ++#include ++#include ++#include ++#include ++ ++static kmutex_t arc_reclaim_thr_lock; ++static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ ++static uint8_t arc_thread_exit; ++ ++/* number of bytes to prune from caches when at arc_meta_limit is reached */ ++uint_t arc_meta_prune = 1048576; ++ ++typedef enum arc_reclaim_strategy { ++ ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ ++ ARC_RECLAIM_CONS /* Conservative reclaim strategy */ ++} arc_reclaim_strategy_t; ++ ++/* number of seconds before growing cache again */ ++static int arc_grow_retry = 5; ++ ++/* expiration time for arc_no_grow */ ++static clock_t arc_grow_time = 0; ++ ++/* shift of arc_c for calculating both min and max arc_p */ ++static int arc_p_min_shift = 4; ++ ++/* log2(fraction of arc to reclaim) */ ++static int arc_shrink_shift = 5; ++ ++/* ++ * minimum lifespan of a prefetch block in clock ticks ++ * (initialized in arc_init()) ++ */ ++static int arc_min_prefetch_lifespan; ++ ++static int arc_dead; ++ ++/* ++ * The arc has filled available memory and has now warmed up. ++ */ ++static boolean_t arc_warm; ++ ++/* ++ * These tunables are for performance analysis. ++ */ ++unsigned long zfs_arc_max = 0; ++unsigned long zfs_arc_min = 0; ++unsigned long zfs_arc_meta_limit = 0; ++int zfs_arc_grow_retry = 0; ++int zfs_arc_shrink_shift = 0; ++int zfs_arc_p_min_shift = 0; ++int zfs_arc_meta_prune = 0; ++ ++/* ++ * Note that buffers can be in one of 6 states: ++ * ARC_anon - anonymous (discussed below) ++ * ARC_mru - recently used, currently cached ++ * ARC_mru_ghost - recentely used, no longer in cache ++ * ARC_mfu - frequently used, currently cached ++ * ARC_mfu_ghost - frequently used, no longer in cache ++ * ARC_l2c_only - exists in L2ARC but not other states ++ * When there are no active references to the buffer, they are ++ * are linked onto a list in one of these arc states. These are ++ * the only buffers that can be evicted or deleted. Within each ++ * state there are multiple lists, one for meta-data and one for ++ * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, ++ * etc.) is tracked separately so that it can be managed more ++ * explicitly: favored over data, limited explicitly. ++ * ++ * Anonymous buffers are buffers that are not associated with ++ * a DVA. These are buffers that hold dirty block copies ++ * before they are written to stable storage. By definition, ++ * they are "ref'd" and are considered part of arc_mru ++ * that cannot be freed. Generally, they will aquire a DVA ++ * as they are written and migrate onto the arc_mru list. ++ * ++ * The ARC_l2c_only state is for buffers that are in the second ++ * level ARC but no longer in any of the ARC_m* lists. The second ++ * level ARC itself may also contain buffers that are in any of ++ * the ARC_m* states - meaning that a buffer can exist in two ++ * places. The reason for the ARC_l2c_only state is to keep the ++ * buffer header in the hash table, so that reads that hit the ++ * second level ARC benefit from these fast lookups. 
++ */ ++ ++typedef struct arc_state { ++ list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ ++ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ ++ uint64_t arcs_size; /* total amount of data in this state */ ++ kmutex_t arcs_mtx; ++} arc_state_t; ++ ++/* The 6 states: */ ++static arc_state_t ARC_anon; ++static arc_state_t ARC_mru; ++static arc_state_t ARC_mru_ghost; ++static arc_state_t ARC_mfu; ++static arc_state_t ARC_mfu_ghost; ++static arc_state_t ARC_l2c_only; ++ ++typedef struct arc_stats { ++ kstat_named_t arcstat_hits; ++ kstat_named_t arcstat_misses; ++ kstat_named_t arcstat_demand_data_hits; ++ kstat_named_t arcstat_demand_data_misses; ++ kstat_named_t arcstat_demand_metadata_hits; ++ kstat_named_t arcstat_demand_metadata_misses; ++ kstat_named_t arcstat_prefetch_data_hits; ++ kstat_named_t arcstat_prefetch_data_misses; ++ kstat_named_t arcstat_prefetch_metadata_hits; ++ kstat_named_t arcstat_prefetch_metadata_misses; ++ kstat_named_t arcstat_mru_hits; ++ kstat_named_t arcstat_mru_ghost_hits; ++ kstat_named_t arcstat_mfu_hits; ++ kstat_named_t arcstat_mfu_ghost_hits; ++ kstat_named_t arcstat_deleted; ++ kstat_named_t arcstat_recycle_miss; ++ kstat_named_t arcstat_mutex_miss; ++ kstat_named_t arcstat_evict_skip; ++ kstat_named_t arcstat_evict_l2_cached; ++ kstat_named_t arcstat_evict_l2_eligible; ++ kstat_named_t arcstat_evict_l2_ineligible; ++ kstat_named_t arcstat_hash_elements; ++ kstat_named_t arcstat_hash_elements_max; ++ kstat_named_t arcstat_hash_collisions; ++ kstat_named_t arcstat_hash_chains; ++ kstat_named_t arcstat_hash_chain_max; ++ kstat_named_t arcstat_p; ++ kstat_named_t arcstat_c; ++ kstat_named_t arcstat_c_min; ++ kstat_named_t arcstat_c_max; ++ kstat_named_t arcstat_size; ++ kstat_named_t arcstat_hdr_size; ++ kstat_named_t arcstat_data_size; ++ kstat_named_t arcstat_other_size; ++ kstat_named_t arcstat_anon_size; ++ kstat_named_t arcstat_anon_evict_data; ++ kstat_named_t arcstat_anon_evict_metadata; ++ kstat_named_t arcstat_mru_size; ++ kstat_named_t arcstat_mru_evict_data; ++ kstat_named_t arcstat_mru_evict_metadata; ++ kstat_named_t arcstat_mru_ghost_size; ++ kstat_named_t arcstat_mru_ghost_evict_data; ++ kstat_named_t arcstat_mru_ghost_evict_metadata; ++ kstat_named_t arcstat_mfu_size; ++ kstat_named_t arcstat_mfu_evict_data; ++ kstat_named_t arcstat_mfu_evict_metadata; ++ kstat_named_t arcstat_mfu_ghost_size; ++ kstat_named_t arcstat_mfu_ghost_evict_data; ++ kstat_named_t arcstat_mfu_ghost_evict_metadata; ++ kstat_named_t arcstat_l2_hits; ++ kstat_named_t arcstat_l2_misses; ++ kstat_named_t arcstat_l2_feeds; ++ kstat_named_t arcstat_l2_rw_clash; ++ kstat_named_t arcstat_l2_read_bytes; ++ kstat_named_t arcstat_l2_write_bytes; ++ kstat_named_t arcstat_l2_writes_sent; ++ kstat_named_t arcstat_l2_writes_done; ++ kstat_named_t arcstat_l2_writes_error; ++ kstat_named_t arcstat_l2_writes_hdr_miss; ++ kstat_named_t arcstat_l2_evict_lock_retry; ++ kstat_named_t arcstat_l2_evict_reading; ++ kstat_named_t arcstat_l2_free_on_write; ++ kstat_named_t arcstat_l2_abort_lowmem; ++ kstat_named_t arcstat_l2_cksum_bad; ++ kstat_named_t arcstat_l2_io_error; ++ kstat_named_t arcstat_l2_size; ++ kstat_named_t arcstat_l2_hdr_size; ++ kstat_named_t arcstat_memory_throttle_count; ++ kstat_named_t arcstat_memory_direct_count; ++ kstat_named_t arcstat_memory_indirect_count; ++ kstat_named_t arcstat_no_grow; ++ kstat_named_t arcstat_tempreserve; ++ kstat_named_t arcstat_loaned_bytes; ++ kstat_named_t arcstat_prune; ++ kstat_named_t 
arcstat_meta_used; ++ kstat_named_t arcstat_meta_limit; ++ kstat_named_t arcstat_meta_max; ++} arc_stats_t; ++ ++static arc_stats_t arc_stats = { ++ { "hits", KSTAT_DATA_UINT64 }, ++ { "misses", KSTAT_DATA_UINT64 }, ++ { "demand_data_hits", KSTAT_DATA_UINT64 }, ++ { "demand_data_misses", KSTAT_DATA_UINT64 }, ++ { "demand_metadata_hits", KSTAT_DATA_UINT64 }, ++ { "demand_metadata_misses", KSTAT_DATA_UINT64 }, ++ { "prefetch_data_hits", KSTAT_DATA_UINT64 }, ++ { "prefetch_data_misses", KSTAT_DATA_UINT64 }, ++ { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, ++ { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, ++ { "mru_hits", KSTAT_DATA_UINT64 }, ++ { "mru_ghost_hits", KSTAT_DATA_UINT64 }, ++ { "mfu_hits", KSTAT_DATA_UINT64 }, ++ { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, ++ { "deleted", KSTAT_DATA_UINT64 }, ++ { "recycle_miss", KSTAT_DATA_UINT64 }, ++ { "mutex_miss", KSTAT_DATA_UINT64 }, ++ { "evict_skip", KSTAT_DATA_UINT64 }, ++ { "evict_l2_cached", KSTAT_DATA_UINT64 }, ++ { "evict_l2_eligible", KSTAT_DATA_UINT64 }, ++ { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, ++ { "hash_elements", KSTAT_DATA_UINT64 }, ++ { "hash_elements_max", KSTAT_DATA_UINT64 }, ++ { "hash_collisions", KSTAT_DATA_UINT64 }, ++ { "hash_chains", KSTAT_DATA_UINT64 }, ++ { "hash_chain_max", KSTAT_DATA_UINT64 }, ++ { "p", KSTAT_DATA_UINT64 }, ++ { "c", KSTAT_DATA_UINT64 }, ++ { "c_min", KSTAT_DATA_UINT64 }, ++ { "c_max", KSTAT_DATA_UINT64 }, ++ { "size", KSTAT_DATA_UINT64 }, ++ { "hdr_size", KSTAT_DATA_UINT64 }, ++ { "data_size", KSTAT_DATA_UINT64 }, ++ { "other_size", KSTAT_DATA_UINT64 }, ++ { "anon_size", KSTAT_DATA_UINT64 }, ++ { "anon_evict_data", KSTAT_DATA_UINT64 }, ++ { "anon_evict_metadata", KSTAT_DATA_UINT64 }, ++ { "mru_size", KSTAT_DATA_UINT64 }, ++ { "mru_evict_data", KSTAT_DATA_UINT64 }, ++ { "mru_evict_metadata", KSTAT_DATA_UINT64 }, ++ { "mru_ghost_size", KSTAT_DATA_UINT64 }, ++ { "mru_ghost_evict_data", KSTAT_DATA_UINT64 }, ++ { "mru_ghost_evict_metadata", KSTAT_DATA_UINT64 }, ++ { "mfu_size", KSTAT_DATA_UINT64 }, ++ { "mfu_evict_data", KSTAT_DATA_UINT64 }, ++ { "mfu_evict_metadata", KSTAT_DATA_UINT64 }, ++ { "mfu_ghost_size", KSTAT_DATA_UINT64 }, ++ { "mfu_ghost_evict_data", KSTAT_DATA_UINT64 }, ++ { "mfu_ghost_evict_metadata", KSTAT_DATA_UINT64 }, ++ { "l2_hits", KSTAT_DATA_UINT64 }, ++ { "l2_misses", KSTAT_DATA_UINT64 }, ++ { "l2_feeds", KSTAT_DATA_UINT64 }, ++ { "l2_rw_clash", KSTAT_DATA_UINT64 }, ++ { "l2_read_bytes", KSTAT_DATA_UINT64 }, ++ { "l2_write_bytes", KSTAT_DATA_UINT64 }, ++ { "l2_writes_sent", KSTAT_DATA_UINT64 }, ++ { "l2_writes_done", KSTAT_DATA_UINT64 }, ++ { "l2_writes_error", KSTAT_DATA_UINT64 }, ++ { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, ++ { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, ++ { "l2_evict_reading", KSTAT_DATA_UINT64 }, ++ { "l2_free_on_write", KSTAT_DATA_UINT64 }, ++ { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, ++ { "l2_cksum_bad", KSTAT_DATA_UINT64 }, ++ { "l2_io_error", KSTAT_DATA_UINT64 }, ++ { "l2_size", KSTAT_DATA_UINT64 }, ++ { "l2_hdr_size", KSTAT_DATA_UINT64 }, ++ { "memory_throttle_count", KSTAT_DATA_UINT64 }, ++ { "memory_direct_count", KSTAT_DATA_UINT64 }, ++ { "memory_indirect_count", KSTAT_DATA_UINT64 }, ++ { "arc_no_grow", KSTAT_DATA_UINT64 }, ++ { "arc_tempreserve", KSTAT_DATA_UINT64 }, ++ { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, ++ { "arc_prune", KSTAT_DATA_UINT64 }, ++ { "arc_meta_used", KSTAT_DATA_UINT64 }, ++ { "arc_meta_limit", KSTAT_DATA_UINT64 }, ++ { "arc_meta_max", KSTAT_DATA_UINT64 }, ++}; ++ ++#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 
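The arc_stats table above pairs every counter with a printable name, and the ARCSTAT*() macros that follow update those counters with nothing more than atomic adds plus a compare-and-swap retry loop for running maxima. The standalone userspace sketch below reproduces that pattern with C11 atomics rather than the SPL's kstat_named_t and atomic_cas_64() interfaces; the mystat_t type, the STAT*() macros and the stat_max() helper are illustrative names only and are not part of this patch.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Each statistic carries a human-readable name (as kstat_named_t does)
 * plus a 64-bit counter that is only ever touched with atomics. */
typedef struct mystat {
	const char *name;
	_Atomic uint64_t value;
} mystat_t;

static mystat_t stats[] = {
	{ "hits",		0 },
	{ "misses",		0 },
	{ "hash_chain_max",	0 },
};

/* Accessor and bump macros in the spirit of ARCSTAT()/ARCSTAT_BUMP();
 * the real code indexes by struct field name rather than array slot. */
#define	STAT(idx)		(stats[idx].value)
#define	STAT_INCR(idx, val)	atomic_fetch_add(&STAT(idx), (val))
#define	STAT_BUMP(idx)		STAT_INCR(idx, 1)

/* Lock-free running maximum: the same compare-and-swap retry loop that
 * ARCSTAT_MAX() uses. Only store the new value while it is still larger
 * than the most recently observed maximum. */
static void
stat_max(mystat_t *s, uint64_t val)
{
	uint64_t m = atomic_load(&s->value);

	while (val > m &&
	    !atomic_compare_exchange_weak(&s->value, &m, val))
		;	/* a failed CAS refreshed m; re-check and retry */
}

int
main(void)
{
	STAT_BUMP(0);			/* one cache hit */
	STAT_INCR(1, 3);		/* three misses */
	stat_max(&stats[2], 7);		/* longest hash chain seen so far */

	for (size_t i = 0; i < sizeof (stats) / sizeof (stats[0]); i++)
		printf("%s = %llu\n", stats[i].name,
		    (unsigned long long)atomic_load(&stats[i].value));
	return (0);
}

Keeping the counters in one flat array of named 64-bit values is what lets the module hand the whole block to the kstat framework unchanged, which on Linux builds of this code is how the statistics become readable under /proc/spl/kstat/zfs/arcstats.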
++ ++#define ARCSTAT_INCR(stat, val) \ ++ atomic_add_64(&arc_stats.stat.value.ui64, (val)); ++ ++#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) ++#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) ++ ++#define ARCSTAT_MAX(stat, val) { \ ++ uint64_t m; \ ++ while ((val) > (m = arc_stats.stat.value.ui64) && \ ++ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ ++ continue; \ ++} ++ ++#define ARCSTAT_MAXSTAT(stat) \ ++ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) ++ ++/* ++ * We define a macro to allow ARC hits/misses to be easily broken down by ++ * two separate conditions, giving a total of four different subtypes for ++ * each of hits and misses (so eight statistics total). ++ */ ++#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ ++ if (cond1) { \ ++ if (cond2) { \ ++ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ ++ } else { \ ++ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ ++ } \ ++ } else { \ ++ if (cond2) { \ ++ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ ++ } else { \ ++ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ ++ } \ ++ } ++ ++kstat_t *arc_ksp; ++static arc_state_t *arc_anon; ++static arc_state_t *arc_mru; ++static arc_state_t *arc_mru_ghost; ++static arc_state_t *arc_mfu; ++static arc_state_t *arc_mfu_ghost; ++static arc_state_t *arc_l2c_only; ++ ++/* ++ * There are several ARC variables that are critical to export as kstats -- ++ * but we don't want to have to grovel around in the kstat whenever we wish to ++ * manipulate them. For these variables, we therefore define them to be in ++ * terms of the statistic variable. This assures that we are not introducing ++ * the possibility of inconsistency by having shadow copies of the variables, ++ * while still allowing the code to be readable. 
++ */ ++#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ ++#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ ++#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ ++#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ ++#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ ++#define arc_no_grow ARCSTAT(arcstat_no_grow) ++#define arc_tempreserve ARCSTAT(arcstat_tempreserve) ++#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) ++#define arc_meta_used ARCSTAT(arcstat_meta_used) ++#define arc_meta_limit ARCSTAT(arcstat_meta_limit) ++#define arc_meta_max ARCSTAT(arcstat_meta_max) ++ ++typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; ++ ++typedef struct arc_callback arc_callback_t; ++ ++struct arc_callback { ++ void *acb_private; ++ arc_done_func_t *acb_done; ++ arc_buf_t *acb_buf; ++ zio_t *acb_zio_dummy; ++ arc_callback_t *acb_next; ++}; ++ ++typedef struct arc_write_callback arc_write_callback_t; ++ ++struct arc_write_callback { ++ void *awcb_private; ++ arc_done_func_t *awcb_ready; ++ arc_done_func_t *awcb_done; ++ arc_buf_t *awcb_buf; ++}; ++ ++struct arc_buf_hdr { ++ /* protected by hash lock */ ++ dva_t b_dva; ++ uint64_t b_birth; ++ uint64_t b_cksum0; ++ ++ kmutex_t b_freeze_lock; ++ zio_cksum_t *b_freeze_cksum; ++ void *b_thawed; ++ ++ arc_buf_hdr_t *b_hash_next; ++ arc_buf_t *b_buf; ++ uint32_t b_flags; ++ uint32_t b_datacnt; ++ ++ arc_callback_t *b_acb; ++ kcondvar_t b_cv; ++ ++ /* immutable */ ++ arc_buf_contents_t b_type; ++ uint64_t b_size; ++ uint64_t b_spa; ++ ++ /* protected by arc state mutex */ ++ arc_state_t *b_state; ++ list_node_t b_arc_node; ++ ++ /* updated atomically */ ++ clock_t b_arc_access; ++ ++ /* self protecting */ ++ refcount_t b_refcnt; ++ ++ l2arc_buf_hdr_t *b_l2hdr; ++ list_node_t b_l2node; ++}; ++ ++static list_t arc_prune_list; ++static kmutex_t arc_prune_mtx; ++static arc_buf_t *arc_eviction_list; ++static kmutex_t arc_eviction_mtx; ++static arc_buf_hdr_t arc_eviction_hdr; ++static void arc_get_data_buf(arc_buf_t *buf); ++static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); ++static int arc_evict_needed(arc_buf_contents_t type); ++static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); ++ ++static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); ++ ++#define GHOST_STATE(state) \ ++ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ ++ (state) == arc_l2c_only) ++ ++/* ++ * Private ARC flags. These flags are private ARC only flags that will show up ++ * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can ++ * be passed in as arc_flags in things like arc_read. However, these flags ++ * should never be passed and should only be set by ARC code. When adding new ++ * public flags, make sure not to smash the private ones. 
++ */ ++ ++#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ ++#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ ++#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ ++#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ ++#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ ++#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ ++#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ ++#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ ++#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ ++#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ ++ ++#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) ++#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) ++#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) ++#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) ++#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) ++#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) ++#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) ++#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) ++#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ ++ (hdr)->b_l2hdr != NULL) ++#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) ++#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) ++#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) ++ ++/* ++ * Other sizes ++ */ ++ ++#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) ++#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) ++ ++/* ++ * Hash table routines ++ */ ++ ++#define HT_LOCK_ALIGN 64 ++#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN))) ++ ++struct ht_lock { ++ kmutex_t ht_lock; ++#ifdef _KERNEL ++ unsigned char pad[HT_LOCK_PAD]; ++#endif ++}; ++ ++#define BUF_LOCKS 256 ++typedef struct buf_hash_table { ++ uint64_t ht_mask; ++ arc_buf_hdr_t **ht_table; ++ struct ht_lock ht_locks[BUF_LOCKS]; ++} buf_hash_table_t; ++ ++static buf_hash_table_t buf_hash_table; ++ ++#define BUF_HASH_INDEX(spa, dva, birth) \ ++ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) ++#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) ++#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) ++#define HDR_LOCK(hdr) \ ++ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) ++ ++uint64_t zfs_crc64_table[256]; ++ ++/* ++ * Level 2 ARC ++ */ ++ ++#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ ++#define L2ARC_HEADROOM 2 /* num of writes */ ++#define L2ARC_FEED_SECS 1 /* caching interval secs */ ++#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ ++ ++#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) ++#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) ++ ++/* ++ * L2ARC Performance Tunables ++ */ ++unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ ++unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ ++unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ ++unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ ++unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ ++int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ ++int l2arc_feed_again = B_TRUE; /* turbo warmup */ ++int l2arc_norw = B_TRUE; /* no reads during writes */ ++ ++/* ++ * L2ARC Internals ++ */ ++typedef struct l2arc_dev { ++ vdev_t 
*l2ad_vdev; /* vdev */ ++ spa_t *l2ad_spa; /* spa */ ++ uint64_t l2ad_hand; /* next write location */ ++ uint64_t l2ad_write; /* desired write size, bytes */ ++ uint64_t l2ad_boost; /* warmup write boost, bytes */ ++ uint64_t l2ad_start; /* first addr on device */ ++ uint64_t l2ad_end; /* last addr on device */ ++ uint64_t l2ad_evict; /* last addr eviction reached */ ++ boolean_t l2ad_first; /* first sweep through */ ++ boolean_t l2ad_writing; /* currently writing */ ++ list_t *l2ad_buflist; /* buffer list */ ++ list_node_t l2ad_node; /* device list node */ ++} l2arc_dev_t; ++ ++static list_t L2ARC_dev_list; /* device list */ ++static list_t *l2arc_dev_list; /* device list pointer */ ++static kmutex_t l2arc_dev_mtx; /* device list mutex */ ++static l2arc_dev_t *l2arc_dev_last; /* last device used */ ++static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ ++static list_t L2ARC_free_on_write; /* free after write buf list */ ++static list_t *l2arc_free_on_write; /* free after write list ptr */ ++static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ ++static uint64_t l2arc_ndev; /* number of devices */ ++ ++typedef struct l2arc_read_callback { ++ arc_buf_t *l2rcb_buf; /* read buffer */ ++ spa_t *l2rcb_spa; /* spa */ ++ blkptr_t l2rcb_bp; /* original blkptr */ ++ zbookmark_t l2rcb_zb; /* original bookmark */ ++ int l2rcb_flags; /* original flags */ ++} l2arc_read_callback_t; ++ ++typedef struct l2arc_write_callback { ++ l2arc_dev_t *l2wcb_dev; /* device info */ ++ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ ++} l2arc_write_callback_t; ++ ++struct l2arc_buf_hdr { ++ /* protected by arc_buf_hdr mutex */ ++ l2arc_dev_t *b_dev; /* L2ARC device */ ++ uint64_t b_daddr; /* disk address, offset byte */ ++}; ++ ++typedef struct l2arc_data_free { ++ /* protected by l2arc_free_on_write_mtx */ ++ void *l2df_data; ++ size_t l2df_size; ++ void (*l2df_func)(void *, size_t); ++ list_node_t l2df_list_node; ++} l2arc_data_free_t; ++ ++static kmutex_t l2arc_feed_thr_lock; ++static kcondvar_t l2arc_feed_thr_cv; ++static uint8_t l2arc_thread_exit; ++ ++static void l2arc_read_done(zio_t *zio); ++static void l2arc_hdr_stat_add(void); ++static void l2arc_hdr_stat_remove(void); ++ ++static uint64_t ++buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) ++{ ++ uint8_t *vdva = (uint8_t *)dva; ++ uint64_t crc = -1ULL; ++ int i; ++ ++ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); ++ ++ for (i = 0; i < sizeof (dva_t); i++) ++ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; ++ ++ crc ^= (spa>>8) ^ birth; ++ ++ return (crc); ++} ++ ++#define BUF_EMPTY(buf) \ ++ ((buf)->b_dva.dva_word[0] == 0 && \ ++ (buf)->b_dva.dva_word[1] == 0 && \ ++ (buf)->b_birth == 0) ++ ++#define BUF_EQUAL(spa, dva, birth, buf) \ ++ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ++ ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ++ ((buf)->b_birth == birth) && ((buf)->b_spa == spa) ++ ++static void ++buf_discard_identity(arc_buf_hdr_t *hdr) ++{ ++ hdr->b_dva.dva_word[0] = 0; ++ hdr->b_dva.dva_word[1] = 0; ++ hdr->b_birth = 0; ++ hdr->b_cksum0 = 0; ++} ++ ++static arc_buf_hdr_t * ++buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) ++{ ++ uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); ++ kmutex_t *hash_lock = BUF_HASH_LOCK(idx); ++ arc_buf_hdr_t *buf; ++ ++ mutex_enter(hash_lock); ++ for (buf = buf_hash_table.ht_table[idx]; buf != NULL; ++ buf = buf->b_hash_next) { ++ if (BUF_EQUAL(spa, dva, birth, buf)) { ++ *lockp = hash_lock; ++ return (buf); ++ } ++ } ++ 
mutex_exit(hash_lock); ++ *lockp = NULL; ++ return (NULL); ++} ++ ++/* ++ * Insert an entry into the hash table. If there is already an element ++ * equal to elem in the hash table, then the already existing element ++ * will be returned and the new element will not be inserted. ++ * Otherwise returns NULL. ++ */ ++static arc_buf_hdr_t * ++buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) ++{ ++ uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); ++ kmutex_t *hash_lock = BUF_HASH_LOCK(idx); ++ arc_buf_hdr_t *fbuf; ++ uint32_t i; ++ ++ ASSERT(!HDR_IN_HASH_TABLE(buf)); ++ *lockp = hash_lock; ++ mutex_enter(hash_lock); ++ for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; ++ fbuf = fbuf->b_hash_next, i++) { ++ if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) ++ return (fbuf); ++ } ++ ++ buf->b_hash_next = buf_hash_table.ht_table[idx]; ++ buf_hash_table.ht_table[idx] = buf; ++ buf->b_flags |= ARC_IN_HASH_TABLE; ++ ++ /* collect some hash table performance data */ ++ if (i > 0) { ++ ARCSTAT_BUMP(arcstat_hash_collisions); ++ if (i == 1) ++ ARCSTAT_BUMP(arcstat_hash_chains); ++ ++ ARCSTAT_MAX(arcstat_hash_chain_max, i); ++ } ++ ++ ARCSTAT_BUMP(arcstat_hash_elements); ++ ARCSTAT_MAXSTAT(arcstat_hash_elements); ++ ++ return (NULL); ++} ++ ++static void ++buf_hash_remove(arc_buf_hdr_t *buf) ++{ ++ arc_buf_hdr_t *fbuf, **bufp; ++ uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); ++ ++ ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ++ ASSERT(HDR_IN_HASH_TABLE(buf)); ++ ++ bufp = &buf_hash_table.ht_table[idx]; ++ while ((fbuf = *bufp) != buf) { ++ ASSERT(fbuf != NULL); ++ bufp = &fbuf->b_hash_next; ++ } ++ *bufp = buf->b_hash_next; ++ buf->b_hash_next = NULL; ++ buf->b_flags &= ~ARC_IN_HASH_TABLE; ++ ++ /* collect some hash table performance data */ ++ ARCSTAT_BUMPDOWN(arcstat_hash_elements); ++ ++ if (buf_hash_table.ht_table[idx] && ++ buf_hash_table.ht_table[idx]->b_hash_next == NULL) ++ ARCSTAT_BUMPDOWN(arcstat_hash_chains); ++} ++ ++/* ++ * Global data structures and functions for the buf kmem cache. ++ */ ++static kmem_cache_t *hdr_cache; ++static kmem_cache_t *buf_cache; ++ ++static void ++buf_fini(void) ++{ ++ int i; ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++ /* Large allocations which do not require contiguous pages ++ * should be using vmem_free() in the linux kernel */ ++ vmem_free(buf_hash_table.ht_table, ++ (buf_hash_table.ht_mask + 1) * sizeof (void *)); ++#else ++ kmem_free(buf_hash_table.ht_table, ++ (buf_hash_table.ht_mask + 1) * sizeof (void *)); ++#endif ++ for (i = 0; i < BUF_LOCKS; i++) ++ mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); ++ kmem_cache_destroy(hdr_cache); ++ kmem_cache_destroy(buf_cache); ++} ++ ++/* ++ * Constructor callback - called when the cache is empty ++ * and a new buf is requested. 
++ */ ++/* ARGSUSED */ ++static int ++hdr_cons(void *vbuf, void *unused, int kmflag) ++{ ++ arc_buf_hdr_t *buf = vbuf; ++ ++ bzero(buf, sizeof (arc_buf_hdr_t)); ++ refcount_create(&buf->b_refcnt); ++ cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); ++ mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); ++ list_link_init(&buf->b_arc_node); ++ list_link_init(&buf->b_l2node); ++ arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); ++ ++ return (0); ++} ++ ++/* ARGSUSED */ ++static int ++buf_cons(void *vbuf, void *unused, int kmflag) ++{ ++ arc_buf_t *buf = vbuf; ++ ++ bzero(buf, sizeof (arc_buf_t)); ++ mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); ++ rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL); ++ arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); ++ ++ return (0); ++} ++ ++/* ++ * Destructor callback - called when a cached buf is ++ * no longer required. ++ */ ++/* ARGSUSED */ ++static void ++hdr_dest(void *vbuf, void *unused) ++{ ++ arc_buf_hdr_t *buf = vbuf; ++ ++ ASSERT(BUF_EMPTY(buf)); ++ refcount_destroy(&buf->b_refcnt); ++ cv_destroy(&buf->b_cv); ++ mutex_destroy(&buf->b_freeze_lock); ++ arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); ++} ++ ++/* ARGSUSED */ ++static void ++buf_dest(void *vbuf, void *unused) ++{ ++ arc_buf_t *buf = vbuf; ++ ++ mutex_destroy(&buf->b_evict_lock); ++ rw_destroy(&buf->b_data_lock); ++ arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); ++} ++ ++static void ++buf_init(void) ++{ ++ uint64_t *ct; ++ uint64_t hsize = 1ULL << 12; ++ int i, j; ++ ++ /* ++ * The hash table is big enough to fill all of physical memory ++ * with an average 64K block size. The table will take up ++ * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). ++ */ ++ while (hsize * 65536 < physmem * PAGESIZE) ++ hsize <<= 1; ++retry: ++ buf_hash_table.ht_mask = hsize - 1; ++#if defined(_KERNEL) && defined(HAVE_SPL) ++ /* Large allocations which do not require contiguous pages ++ * should be using vmem_alloc() in the linux kernel */ ++ buf_hash_table.ht_table = ++ vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); ++#else ++ buf_hash_table.ht_table = ++ kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); ++#endif ++ if (buf_hash_table.ht_table == NULL) { ++ ASSERT(hsize > (1ULL << 8)); ++ hsize >>= 1; ++ goto retry; ++ } ++ ++ hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), ++ 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0); ++ buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), ++ 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); ++ ++ for (i = 0; i < 256; i++) ++ for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) ++ *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); ++ ++ for (i = 0; i < BUF_LOCKS; i++) { ++ mutex_init(&buf_hash_table.ht_locks[i].ht_lock, ++ NULL, MUTEX_DEFAULT, NULL); ++ } ++} ++ ++#define ARC_MINTIME (hz>>4) /* 62 ms */ ++ ++static void ++arc_cksum_verify(arc_buf_t *buf) ++{ ++ zio_cksum_t zc; ++ ++ if (!(zfs_flags & ZFS_DEBUG_MODIFY)) ++ return; ++ ++ mutex_enter(&buf->b_hdr->b_freeze_lock); ++ if (buf->b_hdr->b_freeze_cksum == NULL || ++ (buf->b_hdr->b_flags & ARC_IO_ERROR)) { ++ mutex_exit(&buf->b_hdr->b_freeze_lock); ++ return; ++ } ++ fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); ++ if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) ++ panic("buffer modified while frozen!"); ++ mutex_exit(&buf->b_hdr->b_freeze_lock); ++} ++ ++static int ++arc_cksum_equal(arc_buf_t *buf) ++{ ++ zio_cksum_t zc; ++ int equal; ++ ++ mutex_enter(&buf->b_hdr->b_freeze_lock); ++ 
fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); ++ equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); ++ mutex_exit(&buf->b_hdr->b_freeze_lock); ++ ++ return (equal); ++} ++ ++static void ++arc_cksum_compute(arc_buf_t *buf, boolean_t force) ++{ ++ if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) ++ return; ++ ++ mutex_enter(&buf->b_hdr->b_freeze_lock); ++ if (buf->b_hdr->b_freeze_cksum != NULL) { ++ mutex_exit(&buf->b_hdr->b_freeze_lock); ++ return; ++ } ++ buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), ++ KM_PUSHPAGE); ++ fletcher_2_native(buf->b_data, buf->b_hdr->b_size, ++ buf->b_hdr->b_freeze_cksum); ++ mutex_exit(&buf->b_hdr->b_freeze_lock); ++} ++ ++void ++arc_buf_thaw(arc_buf_t *buf) ++{ ++ if (zfs_flags & ZFS_DEBUG_MODIFY) { ++ if (buf->b_hdr->b_state != arc_anon) ++ panic("modifying non-anon buffer!"); ++ if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) ++ panic("modifying buffer while i/o in progress!"); ++ arc_cksum_verify(buf); ++ } ++ ++ mutex_enter(&buf->b_hdr->b_freeze_lock); ++ if (buf->b_hdr->b_freeze_cksum != NULL) { ++ kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); ++ buf->b_hdr->b_freeze_cksum = NULL; ++ } ++ ++ if (zfs_flags & ZFS_DEBUG_MODIFY) { ++ if (buf->b_hdr->b_thawed) ++ kmem_free(buf->b_hdr->b_thawed, 1); ++ buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); ++ } ++ ++ mutex_exit(&buf->b_hdr->b_freeze_lock); ++} ++ ++void ++arc_buf_freeze(arc_buf_t *buf) ++{ ++ kmutex_t *hash_lock; ++ ++ if (!(zfs_flags & ZFS_DEBUG_MODIFY)) ++ return; ++ ++ hash_lock = HDR_LOCK(buf->b_hdr); ++ mutex_enter(hash_lock); ++ ++ ASSERT(buf->b_hdr->b_freeze_cksum != NULL || ++ buf->b_hdr->b_state == arc_anon); ++ arc_cksum_compute(buf, B_FALSE); ++ mutex_exit(hash_lock); ++} ++ ++static void ++add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) ++{ ++ ASSERT(MUTEX_HELD(hash_lock)); ++ ++ if ((refcount_add(&ab->b_refcnt, tag) == 1) && ++ (ab->b_state != arc_anon)) { ++ uint64_t delta = ab->b_size * ab->b_datacnt; ++ list_t *list = &ab->b_state->arcs_list[ab->b_type]; ++ uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; ++ ++ ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); ++ mutex_enter(&ab->b_state->arcs_mtx); ++ ASSERT(list_link_active(&ab->b_arc_node)); ++ list_remove(list, ab); ++ if (GHOST_STATE(ab->b_state)) { ++ ASSERT3U(ab->b_datacnt, ==, 0); ++ ASSERT3P(ab->b_buf, ==, NULL); ++ delta = ab->b_size; ++ } ++ ASSERT(delta > 0); ++ ASSERT3U(*size, >=, delta); ++ atomic_add_64(size, -delta); ++ mutex_exit(&ab->b_state->arcs_mtx); ++ /* remove the prefetch flag if we get a reference */ ++ if (ab->b_flags & ARC_PREFETCH) ++ ab->b_flags &= ~ARC_PREFETCH; ++ } ++} ++ ++static int ++remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) ++{ ++ int cnt; ++ arc_state_t *state = ab->b_state; ++ ++ ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ++ ASSERT(!GHOST_STATE(state)); ++ ++ if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && ++ (state != arc_anon)) { ++ uint64_t *size = &state->arcs_lsize[ab->b_type]; ++ ++ ASSERT(!MUTEX_HELD(&state->arcs_mtx)); ++ mutex_enter(&state->arcs_mtx); ++ ASSERT(!list_link_active(&ab->b_arc_node)); ++ list_insert_head(&state->arcs_list[ab->b_type], ab); ++ ASSERT(ab->b_datacnt > 0); ++ atomic_add_64(size, ab->b_size * ab->b_datacnt); ++ mutex_exit(&state->arcs_mtx); ++ } ++ return (cnt); ++} ++ ++/* ++ * Move the supplied buffer to the indicated state. The mutex ++ * for the buffer must be held by the caller. 
++ */ ++static void ++arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ++{ ++ arc_state_t *old_state = ab->b_state; ++ int64_t refcnt = refcount_count(&ab->b_refcnt); ++ uint64_t from_delta, to_delta; ++ ++ ASSERT(MUTEX_HELD(hash_lock)); ++ ASSERT(new_state != old_state); ++ ASSERT(refcnt == 0 || ab->b_datacnt > 0); ++ ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); ++ ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); ++ ++ from_delta = to_delta = ab->b_datacnt * ab->b_size; ++ ++ /* ++ * If this buffer is evictable, transfer it from the ++ * old state list to the new state list. ++ */ ++ if (refcnt == 0) { ++ if (old_state != arc_anon) { ++ int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); ++ uint64_t *size = &old_state->arcs_lsize[ab->b_type]; ++ ++ if (use_mutex) ++ mutex_enter(&old_state->arcs_mtx); ++ ++ ASSERT(list_link_active(&ab->b_arc_node)); ++ list_remove(&old_state->arcs_list[ab->b_type], ab); ++ ++ /* ++ * If prefetching out of the ghost cache, ++ * we will have a non-zero datacnt. ++ */ ++ if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { ++ /* ghost elements have a ghost size */ ++ ASSERT(ab->b_buf == NULL); ++ from_delta = ab->b_size; ++ } ++ ASSERT3U(*size, >=, from_delta); ++ atomic_add_64(size, -from_delta); ++ ++ if (use_mutex) ++ mutex_exit(&old_state->arcs_mtx); ++ } ++ if (new_state != arc_anon) { ++ int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); ++ uint64_t *size = &new_state->arcs_lsize[ab->b_type]; ++ ++ if (use_mutex) ++ mutex_enter(&new_state->arcs_mtx); ++ ++ list_insert_head(&new_state->arcs_list[ab->b_type], ab); ++ ++ /* ghost elements have a ghost size */ ++ if (GHOST_STATE(new_state)) { ++ ASSERT(ab->b_datacnt == 0); ++ ASSERT(ab->b_buf == NULL); ++ to_delta = ab->b_size; ++ } ++ atomic_add_64(size, to_delta); ++ ++ if (use_mutex) ++ mutex_exit(&new_state->arcs_mtx); ++ } ++ } ++ ++ ASSERT(!BUF_EMPTY(ab)); ++ if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) ++ buf_hash_remove(ab); ++ ++ /* adjust state sizes */ ++ if (to_delta) ++ atomic_add_64(&new_state->arcs_size, to_delta); ++ if (from_delta) { ++ ASSERT3U(old_state->arcs_size, >=, from_delta); ++ atomic_add_64(&old_state->arcs_size, -from_delta); ++ } ++ ab->b_state = new_state; ++ ++ /* adjust l2arc hdr stats */ ++ if (new_state == arc_l2c_only) ++ l2arc_hdr_stat_add(); ++ else if (old_state == arc_l2c_only) ++ l2arc_hdr_stat_remove(); ++} ++ ++void ++arc_space_consume(uint64_t space, arc_space_type_t type) ++{ ++ ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); ++ ++ switch (type) { ++ default: ++ break; ++ case ARC_SPACE_DATA: ++ ARCSTAT_INCR(arcstat_data_size, space); ++ break; ++ case ARC_SPACE_OTHER: ++ ARCSTAT_INCR(arcstat_other_size, space); ++ break; ++ case ARC_SPACE_HDRS: ++ ARCSTAT_INCR(arcstat_hdr_size, space); ++ break; ++ case ARC_SPACE_L2HDRS: ++ ARCSTAT_INCR(arcstat_l2_hdr_size, space); ++ break; ++ } ++ ++ atomic_add_64(&arc_meta_used, space); ++ atomic_add_64(&arc_size, space); ++} ++ ++void ++arc_space_return(uint64_t space, arc_space_type_t type) ++{ ++ ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); ++ ++ switch (type) { ++ default: ++ break; ++ case ARC_SPACE_DATA: ++ ARCSTAT_INCR(arcstat_data_size, -space); ++ break; ++ case ARC_SPACE_OTHER: ++ ARCSTAT_INCR(arcstat_other_size, -space); ++ break; ++ case ARC_SPACE_HDRS: ++ ARCSTAT_INCR(arcstat_hdr_size, -space); ++ break; ++ case ARC_SPACE_L2HDRS: ++ ARCSTAT_INCR(arcstat_l2_hdr_size, -space); ++ break; ++ } ++ ++ ASSERT(arc_meta_used >= space); ++ if (arc_meta_max < 
arc_meta_used) ++ arc_meta_max = arc_meta_used; ++ atomic_add_64(&arc_meta_used, -space); ++ ASSERT(arc_size >= space); ++ atomic_add_64(&arc_size, -space); ++} ++ ++void * ++arc_data_buf_alloc(uint64_t size) ++{ ++ if (arc_evict_needed(ARC_BUFC_DATA)) ++ cv_signal(&arc_reclaim_thr_cv); ++ atomic_add_64(&arc_size, size); ++ return (zio_data_buf_alloc(size)); ++} ++ ++void ++arc_data_buf_free(void *buf, uint64_t size) ++{ ++ zio_data_buf_free(buf, size); ++ ASSERT(arc_size >= size); ++ atomic_add_64(&arc_size, -size); ++} ++ ++arc_buf_t * ++arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) ++{ ++ arc_buf_hdr_t *hdr; ++ arc_buf_t *buf; ++ ++ ASSERT3U(size, >, 0); ++ hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); ++ ASSERT(BUF_EMPTY(hdr)); ++ hdr->b_size = size; ++ hdr->b_type = type; ++ hdr->b_spa = spa_load_guid(spa); ++ hdr->b_state = arc_anon; ++ hdr->b_arc_access = 0; ++ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); ++ buf->b_hdr = hdr; ++ buf->b_data = NULL; ++ buf->b_efunc = NULL; ++ buf->b_private = NULL; ++ buf->b_next = NULL; ++ hdr->b_buf = buf; ++ arc_get_data_buf(buf); ++ hdr->b_datacnt = 1; ++ hdr->b_flags = 0; ++ ASSERT(refcount_is_zero(&hdr->b_refcnt)); ++ (void) refcount_add(&hdr->b_refcnt, tag); ++ ++ return (buf); ++} ++ ++static char *arc_onloan_tag = "onloan"; ++ ++/* ++ * Loan out an anonymous arc buffer. Loaned buffers are not counted as in ++ * flight data by arc_tempreserve_space() until they are "returned". Loaned ++ * buffers must be returned to the arc before they can be used by the DMU or ++ * freed. ++ */ ++arc_buf_t * ++arc_loan_buf(spa_t *spa, int size) ++{ ++ arc_buf_t *buf; ++ ++ buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); ++ ++ atomic_add_64(&arc_loaned_bytes, size); ++ return (buf); ++} ++ ++/* ++ * Return a loaned arc buffer to the arc. ++ */ ++void ++arc_return_buf(arc_buf_t *buf, void *tag) ++{ ++ arc_buf_hdr_t *hdr = buf->b_hdr; ++ ++ ASSERT(buf->b_data != NULL); ++ (void) refcount_add(&hdr->b_refcnt, tag); ++ (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); ++ ++ atomic_add_64(&arc_loaned_bytes, -hdr->b_size); ++} ++ ++/* Detach an arc_buf from a dbuf (tag) */ ++void ++arc_loan_inuse_buf(arc_buf_t *buf, void *tag) ++{ ++ arc_buf_hdr_t *hdr; ++ ++ ASSERT(buf->b_data != NULL); ++ hdr = buf->b_hdr; ++ (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); ++ (void) refcount_remove(&hdr->b_refcnt, tag); ++ buf->b_efunc = NULL; ++ buf->b_private = NULL; ++ ++ atomic_add_64(&arc_loaned_bytes, hdr->b_size); ++} ++ ++static arc_buf_t * ++arc_buf_clone(arc_buf_t *from) ++{ ++ arc_buf_t *buf; ++ arc_buf_hdr_t *hdr = from->b_hdr; ++ uint64_t size = hdr->b_size; ++ ++ ASSERT(hdr->b_state != arc_anon); ++ ++ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); ++ buf->b_hdr = hdr; ++ buf->b_data = NULL; ++ buf->b_efunc = NULL; ++ buf->b_private = NULL; ++ buf->b_next = hdr->b_buf; ++ hdr->b_buf = buf; ++ arc_get_data_buf(buf); ++ bcopy(from->b_data, buf->b_data, size); ++ hdr->b_datacnt += 1; ++ return (buf); ++} ++ ++void ++arc_buf_add_ref(arc_buf_t *buf, void* tag) ++{ ++ arc_buf_hdr_t *hdr; ++ kmutex_t *hash_lock; ++ ++ /* ++ * Check to see if this buffer is evicted. Callers ++ * must verify b_data != NULL to know if the add_ref ++ * was successful. 
++ */ ++ mutex_enter(&buf->b_evict_lock); ++ if (buf->b_data == NULL) { ++ mutex_exit(&buf->b_evict_lock); ++ return; ++ } ++ hash_lock = HDR_LOCK(buf->b_hdr); ++ mutex_enter(hash_lock); ++ hdr = buf->b_hdr; ++ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ++ mutex_exit(&buf->b_evict_lock); ++ ++ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); ++ add_reference(hdr, hash_lock, tag); ++ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); ++ arc_access(hdr, hash_lock); ++ mutex_exit(hash_lock); ++ ARCSTAT_BUMP(arcstat_hits); ++ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), ++ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, ++ data, metadata, hits); ++} ++ ++/* ++ * Free the arc data buffer. If it is an l2arc write in progress, ++ * the buffer is placed on l2arc_free_on_write to be freed later. ++ */ ++static void ++arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), ++ void *data, size_t size) ++{ ++ if (HDR_L2_WRITING(hdr)) { ++ l2arc_data_free_t *df; ++ df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE); ++ df->l2df_data = data; ++ df->l2df_size = size; ++ df->l2df_func = free_func; ++ mutex_enter(&l2arc_free_on_write_mtx); ++ list_insert_head(l2arc_free_on_write, df); ++ mutex_exit(&l2arc_free_on_write_mtx); ++ ARCSTAT_BUMP(arcstat_l2_free_on_write); ++ } else { ++ free_func(data, size); ++ } ++} ++ ++static void ++arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) ++{ ++ arc_buf_t **bufp; ++ ++ /* free up data associated with the buf */ ++ if (buf->b_data) { ++ arc_state_t *state = buf->b_hdr->b_state; ++ uint64_t size = buf->b_hdr->b_size; ++ arc_buf_contents_t type = buf->b_hdr->b_type; ++ ++ arc_cksum_verify(buf); ++ ++ if (!recycle) { ++ if (type == ARC_BUFC_METADATA) { ++ arc_buf_data_free(buf->b_hdr, zio_buf_free, ++ buf->b_data, size); ++ arc_space_return(size, ARC_SPACE_DATA); ++ } else { ++ ASSERT(type == ARC_BUFC_DATA); ++ arc_buf_data_free(buf->b_hdr, ++ zio_data_buf_free, buf->b_data, size); ++ ARCSTAT_INCR(arcstat_data_size, -size); ++ atomic_add_64(&arc_size, -size); ++ } ++ } ++ if (list_link_active(&buf->b_hdr->b_arc_node)) { ++ uint64_t *cnt = &state->arcs_lsize[type]; ++ ++ ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); ++ ASSERT(state != arc_anon); ++ ++ ASSERT3U(*cnt, >=, size); ++ atomic_add_64(cnt, -size); ++ } ++ ASSERT3U(state->arcs_size, >=, size); ++ atomic_add_64(&state->arcs_size, -size); ++ buf->b_data = NULL; ++ ASSERT(buf->b_hdr->b_datacnt > 0); ++ buf->b_hdr->b_datacnt -= 1; ++ } ++ ++ /* only remove the buf if requested */ ++ if (!all) ++ return; ++ ++ /* remove the buf from the hdr list */ ++ for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) ++ continue; ++ *bufp = buf->b_next; ++ buf->b_next = NULL; ++ ++ ASSERT(buf->b_efunc == NULL); ++ ++ /* clean up the buf */ ++ buf->b_hdr = NULL; ++ kmem_cache_free(buf_cache, buf); ++} ++ ++static void ++arc_hdr_destroy(arc_buf_hdr_t *hdr) ++{ ++ l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; ++ ++ ASSERT(refcount_is_zero(&hdr->b_refcnt)); ++ ASSERT3P(hdr->b_state, ==, arc_anon); ++ ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ++ ++ if (l2hdr != NULL) { ++ boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); ++ /* ++ * To prevent arc_free() and l2arc_evict() from ++ * attempting to free the same buffer at the same time, ++ * a FREE_IN_PROGRESS flag is given to arc_free() to ++ * give it priority. l2arc_evict() can't destroy this ++ * header while we are waiting on l2arc_buflist_mtx. 
++ * ++ * The hdr may be removed from l2ad_buflist before we ++ * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. ++ */ ++ if (!buflist_held) { ++ mutex_enter(&l2arc_buflist_mtx); ++ l2hdr = hdr->b_l2hdr; ++ } ++ ++ if (l2hdr != NULL) { ++ list_remove(l2hdr->b_dev->l2ad_buflist, hdr); ++ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); ++ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); ++ if (hdr->b_state == arc_l2c_only) ++ l2arc_hdr_stat_remove(); ++ hdr->b_l2hdr = NULL; ++ } ++ ++ if (!buflist_held) ++ mutex_exit(&l2arc_buflist_mtx); ++ } ++ ++ if (!BUF_EMPTY(hdr)) { ++ ASSERT(!HDR_IN_HASH_TABLE(hdr)); ++ buf_discard_identity(hdr); ++ } ++ while (hdr->b_buf) { ++ arc_buf_t *buf = hdr->b_buf; ++ ++ if (buf->b_efunc) { ++ mutex_enter(&arc_eviction_mtx); ++ mutex_enter(&buf->b_evict_lock); ++ ASSERT(buf->b_hdr != NULL); ++ arc_buf_destroy(hdr->b_buf, FALSE, FALSE); ++ hdr->b_buf = buf->b_next; ++ buf->b_hdr = &arc_eviction_hdr; ++ buf->b_next = arc_eviction_list; ++ arc_eviction_list = buf; ++ mutex_exit(&buf->b_evict_lock); ++ mutex_exit(&arc_eviction_mtx); ++ } else { ++ arc_buf_destroy(hdr->b_buf, FALSE, TRUE); ++ } ++ } ++ if (hdr->b_freeze_cksum != NULL) { ++ kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); ++ hdr->b_freeze_cksum = NULL; ++ } ++ if (hdr->b_thawed) { ++ kmem_free(hdr->b_thawed, 1); ++ hdr->b_thawed = NULL; ++ } ++ ++ ASSERT(!list_link_active(&hdr->b_arc_node)); ++ ASSERT3P(hdr->b_hash_next, ==, NULL); ++ ASSERT3P(hdr->b_acb, ==, NULL); ++ kmem_cache_free(hdr_cache, hdr); ++} ++ ++void ++arc_buf_free(arc_buf_t *buf, void *tag) ++{ ++ arc_buf_hdr_t *hdr = buf->b_hdr; ++ int hashed = hdr->b_state != arc_anon; ++ ++ ASSERT(buf->b_efunc == NULL); ++ ASSERT(buf->b_data != NULL); ++ ++ if (hashed) { ++ kmutex_t *hash_lock = HDR_LOCK(hdr); ++ ++ mutex_enter(hash_lock); ++ hdr = buf->b_hdr; ++ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ++ ++ (void) remove_reference(hdr, hash_lock, tag); ++ if (hdr->b_datacnt > 1) { ++ arc_buf_destroy(buf, FALSE, TRUE); ++ } else { ++ ASSERT(buf == hdr->b_buf); ++ ASSERT(buf->b_efunc == NULL); ++ hdr->b_flags |= ARC_BUF_AVAILABLE; ++ } ++ mutex_exit(hash_lock); ++ } else if (HDR_IO_IN_PROGRESS(hdr)) { ++ int destroy_hdr; ++ /* ++ * We are in the middle of an async write. Don't destroy ++ * this buffer unless the write completes before we finish ++ * decrementing the reference count. 
++ */ ++ mutex_enter(&arc_eviction_mtx); ++ (void) remove_reference(hdr, NULL, tag); ++ ASSERT(refcount_is_zero(&hdr->b_refcnt)); ++ destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); ++ mutex_exit(&arc_eviction_mtx); ++ if (destroy_hdr) ++ arc_hdr_destroy(hdr); ++ } else { ++ if (remove_reference(hdr, NULL, tag) > 0) ++ arc_buf_destroy(buf, FALSE, TRUE); ++ else ++ arc_hdr_destroy(hdr); ++ } ++} ++ ++int ++arc_buf_remove_ref(arc_buf_t *buf, void* tag) ++{ ++ arc_buf_hdr_t *hdr = buf->b_hdr; ++ kmutex_t *hash_lock = HDR_LOCK(hdr); ++ int no_callback = (buf->b_efunc == NULL); ++ ++ if (hdr->b_state == arc_anon) { ++ ASSERT(hdr->b_datacnt == 1); ++ arc_buf_free(buf, tag); ++ return (no_callback); ++ } ++ ++ mutex_enter(hash_lock); ++ hdr = buf->b_hdr; ++ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ++ ASSERT(hdr->b_state != arc_anon); ++ ASSERT(buf->b_data != NULL); ++ ++ (void) remove_reference(hdr, hash_lock, tag); ++ if (hdr->b_datacnt > 1) { ++ if (no_callback) ++ arc_buf_destroy(buf, FALSE, TRUE); ++ } else if (no_callback) { ++ ASSERT(hdr->b_buf == buf && buf->b_next == NULL); ++ ASSERT(buf->b_efunc == NULL); ++ hdr->b_flags |= ARC_BUF_AVAILABLE; ++ } ++ ASSERT(no_callback || hdr->b_datacnt > 1 || ++ refcount_is_zero(&hdr->b_refcnt)); ++ mutex_exit(hash_lock); ++ return (no_callback); ++} ++ ++int ++arc_buf_size(arc_buf_t *buf) ++{ ++ return (buf->b_hdr->b_size); ++} ++ ++/* ++ * Evict buffers from list until we've removed the specified number of ++ * bytes. Move the removed buffers to the appropriate evict state. ++ * If the recycle flag is set, then attempt to "recycle" a buffer: ++ * - look for a buffer to evict that is `bytes' long. ++ * - return the data block from this buffer rather than freeing it. ++ * This flag is used by callers that are trying to make space for a ++ * new buffer in a full arc cache. ++ * ++ * This function makes a "best effort". It skips over any buffers ++ * it can't get a hash_lock on, and so may not catch all candidates. ++ * It may also return without evicting as much space as requested. ++ */ ++static void * ++arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, ++ arc_buf_contents_t type) ++{ ++ arc_state_t *evicted_state; ++ uint64_t bytes_evicted = 0, skipped = 0, missed = 0; ++ arc_buf_hdr_t *ab, *ab_prev = NULL; ++ list_t *list = &state->arcs_list[type]; ++ kmutex_t *hash_lock; ++ boolean_t have_lock; ++ void *stolen = NULL; ++ ++ ASSERT(state == arc_mru || state == arc_mfu); ++ ++ evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; ++ ++ mutex_enter(&state->arcs_mtx); ++ mutex_enter(&evicted_state->arcs_mtx); ++ ++ for (ab = list_tail(list); ab; ab = ab_prev) { ++ ab_prev = list_prev(list, ab); ++ /* prefetch buffers have a minimum lifespan */ ++ if (HDR_IO_IN_PROGRESS(ab) || ++ (spa && ab->b_spa != spa) || ++ (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && ++ ddi_get_lbolt() - ab->b_arc_access < ++ arc_min_prefetch_lifespan)) { ++ skipped++; ++ continue; ++ } ++ /* "lookahead" for better eviction candidate */ ++ if (recycle && ab->b_size != bytes && ++ ab_prev && ab_prev->b_size == bytes) ++ continue; ++ hash_lock = HDR_LOCK(ab); ++ have_lock = MUTEX_HELD(hash_lock); ++ if (have_lock || mutex_tryenter(hash_lock)) { ++ ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); ++ ASSERT(ab->b_datacnt > 0); ++ while (ab->b_buf) { ++ arc_buf_t *buf = ab->b_buf; ++ if (!mutex_tryenter(&buf->b_evict_lock)) { ++ missed += 1; ++ break; ++ } ++ if (buf->b_data) { ++ bytes_evicted += ab->b_size; ++ if (recycle && ab->b_type == type && ++ ab->b_size == bytes && ++ !HDR_L2_WRITING(ab)) { ++ stolen = buf->b_data; ++ recycle = FALSE; ++ } ++ } ++ if (buf->b_efunc) { ++ mutex_enter(&arc_eviction_mtx); ++ arc_buf_destroy(buf, ++ buf->b_data == stolen, FALSE); ++ ab->b_buf = buf->b_next; ++ buf->b_hdr = &arc_eviction_hdr; ++ buf->b_next = arc_eviction_list; ++ arc_eviction_list = buf; ++ mutex_exit(&arc_eviction_mtx); ++ mutex_exit(&buf->b_evict_lock); ++ } else { ++ mutex_exit(&buf->b_evict_lock); ++ arc_buf_destroy(buf, ++ buf->b_data == stolen, TRUE); ++ } ++ } ++ ++ if (ab->b_l2hdr) { ++ ARCSTAT_INCR(arcstat_evict_l2_cached, ++ ab->b_size); ++ } else { ++ if (l2arc_write_eligible(ab->b_spa, ab)) { ++ ARCSTAT_INCR(arcstat_evict_l2_eligible, ++ ab->b_size); ++ } else { ++ ARCSTAT_INCR( ++ arcstat_evict_l2_ineligible, ++ ab->b_size); ++ } ++ } ++ ++ if (ab->b_datacnt == 0) { ++ arc_change_state(evicted_state, ab, hash_lock); ++ ASSERT(HDR_IN_HASH_TABLE(ab)); ++ ab->b_flags |= ARC_IN_HASH_TABLE; ++ ab->b_flags &= ~ARC_BUF_AVAILABLE; ++ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); ++ } ++ if (!have_lock) ++ mutex_exit(hash_lock); ++ if (bytes >= 0 && bytes_evicted >= bytes) ++ break; ++ } else { ++ missed += 1; ++ } ++ } ++ ++ mutex_exit(&evicted_state->arcs_mtx); ++ mutex_exit(&state->arcs_mtx); ++ ++ if (bytes_evicted < bytes) ++ dprintf("only evicted %lld bytes from %x\n", ++ (longlong_t)bytes_evicted, state); ++ ++ if (skipped) ++ ARCSTAT_INCR(arcstat_evict_skip, skipped); ++ ++ if (missed) ++ ARCSTAT_INCR(arcstat_mutex_miss, missed); ++ ++ /* ++ * We have just evicted some date into the ghost state, make ++ * sure we also adjust the ghost state size if necessary. ++ */ ++ if (arc_no_grow && ++ arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { ++ int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + ++ arc_mru_ghost->arcs_size - arc_c; ++ ++ if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { ++ int64_t todelete = ++ MIN(arc_mru_ghost->arcs_lsize[type], mru_over); ++ arc_evict_ghost(arc_mru_ghost, 0, todelete); ++ } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { ++ int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], ++ arc_mru_ghost->arcs_size + ++ arc_mfu_ghost->arcs_size - arc_c); ++ arc_evict_ghost(arc_mfu_ghost, 0, todelete); ++ } ++ } ++ ++ return (stolen); ++} ++ ++/* ++ * Remove buffers from list until we've removed the specified number of ++ * bytes. Destroy the buffers that are removed. 
++ */ ++static void ++arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) ++{ ++ arc_buf_hdr_t *ab, *ab_prev; ++ arc_buf_hdr_t marker; ++ list_t *list = &state->arcs_list[ARC_BUFC_DATA]; ++ kmutex_t *hash_lock; ++ uint64_t bytes_deleted = 0; ++ uint64_t bufs_skipped = 0; ++ ++ ASSERT(GHOST_STATE(state)); ++ bzero(&marker, sizeof(marker)); ++top: ++ mutex_enter(&state->arcs_mtx); ++ for (ab = list_tail(list); ab; ab = ab_prev) { ++ ab_prev = list_prev(list, ab); ++ if (spa && ab->b_spa != spa) ++ continue; ++ ++ /* ignore markers */ ++ if (ab->b_spa == 0) ++ continue; ++ ++ hash_lock = HDR_LOCK(ab); ++ /* caller may be trying to modify this buffer, skip it */ ++ if (MUTEX_HELD(hash_lock)) ++ continue; ++ if (mutex_tryenter(hash_lock)) { ++ ASSERT(!HDR_IO_IN_PROGRESS(ab)); ++ ASSERT(ab->b_buf == NULL); ++ ARCSTAT_BUMP(arcstat_deleted); ++ bytes_deleted += ab->b_size; ++ ++ if (ab->b_l2hdr != NULL) { ++ /* ++ * This buffer is cached on the 2nd Level ARC; ++ * don't destroy the header. ++ */ ++ arc_change_state(arc_l2c_only, ab, hash_lock); ++ mutex_exit(hash_lock); ++ } else { ++ arc_change_state(arc_anon, ab, hash_lock); ++ mutex_exit(hash_lock); ++ arc_hdr_destroy(ab); ++ } ++ ++ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); ++ if (bytes >= 0 && bytes_deleted >= bytes) ++ break; ++ } else if (bytes < 0) { ++ /* ++ * Insert a list marker and then wait for the ++ * hash lock to become available. Once its ++ * available, restart from where we left off. ++ */ ++ list_insert_after(list, ab, &marker); ++ mutex_exit(&state->arcs_mtx); ++ mutex_enter(hash_lock); ++ mutex_exit(hash_lock); ++ mutex_enter(&state->arcs_mtx); ++ ab_prev = list_prev(list, &marker); ++ list_remove(list, &marker); ++ } else ++ bufs_skipped += 1; ++ } ++ mutex_exit(&state->arcs_mtx); ++ ++ if (list == &state->arcs_list[ARC_BUFC_DATA] && ++ (bytes < 0 || bytes_deleted < bytes)) { ++ list = &state->arcs_list[ARC_BUFC_METADATA]; ++ goto top; ++ } ++ ++ if (bufs_skipped) { ++ ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); ++ ASSERT(bytes >= 0); ++ } ++ ++ if (bytes_deleted < bytes) ++ dprintf("only deleted %lld bytes from %p\n", ++ (longlong_t)bytes_deleted, state); ++} ++ ++static void ++arc_adjust(void) ++{ ++ int64_t adjustment, delta; ++ ++ /* ++ * Adjust MRU size ++ */ ++ ++ adjustment = MIN((int64_t)(arc_size - arc_c), ++ (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - ++ arc_p)); ++ ++ if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { ++ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); ++ (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); ++ adjustment -= delta; ++ } ++ ++ if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { ++ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); ++ (void) arc_evict(arc_mru, 0, delta, FALSE, ++ ARC_BUFC_METADATA); ++ } ++ ++ /* ++ * Adjust MFU size ++ */ ++ ++ adjustment = arc_size - arc_c; ++ ++ if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { ++ delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); ++ (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); ++ adjustment -= delta; ++ } ++ ++ if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { ++ int64_t delta = MIN(adjustment, ++ arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); ++ (void) arc_evict(arc_mfu, 0, delta, FALSE, ++ ARC_BUFC_METADATA); ++ } ++ ++ /* ++ * Adjust ghost lists ++ */ ++ ++ adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; ++ ++ if (adjustment > 0 && 
arc_mru_ghost->arcs_size > 0) { ++ delta = MIN(arc_mru_ghost->arcs_size, adjustment); ++ arc_evict_ghost(arc_mru_ghost, 0, delta); ++ } ++ ++ adjustment = ++ arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; ++ ++ if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { ++ delta = MIN(arc_mfu_ghost->arcs_size, adjustment); ++ arc_evict_ghost(arc_mfu_ghost, 0, delta); ++ } ++} ++ ++/* ++ * Request that arc user drop references so that N bytes can be released ++ * from the cache. This provides a mechanism to ensure the arc can honor ++ * the arc_meta_limit and reclaim buffers which are pinned in the cache ++ * by higher layers. (i.e. the zpl) ++ */ ++static void ++arc_do_user_prune(int64_t adjustment) ++{ ++ arc_prune_func_t *func; ++ void *private; ++ arc_prune_t *cp, *np; ++ ++ mutex_enter(&arc_prune_mtx); ++ ++ cp = list_head(&arc_prune_list); ++ while (cp != NULL) { ++ func = cp->p_pfunc; ++ private = cp->p_private; ++ np = list_next(&arc_prune_list, cp); ++ refcount_add(&cp->p_refcnt, func); ++ mutex_exit(&arc_prune_mtx); ++ ++ if (func != NULL) ++ func(adjustment, private); ++ ++ mutex_enter(&arc_prune_mtx); ++ ++ /* User removed prune callback concurrently with execution */ ++ if (refcount_remove(&cp->p_refcnt, func) == 0) { ++ ASSERT(!list_link_active(&cp->p_node)); ++ refcount_destroy(&cp->p_refcnt); ++ kmem_free(cp, sizeof (*cp)); ++ } ++ ++ cp = np; ++ } ++ ++ ARCSTAT_BUMP(arcstat_prune); ++ mutex_exit(&arc_prune_mtx); ++} ++ ++static void ++arc_do_user_evicts(void) ++{ ++ mutex_enter(&arc_eviction_mtx); ++ while (arc_eviction_list != NULL) { ++ arc_buf_t *buf = arc_eviction_list; ++ arc_eviction_list = buf->b_next; ++ mutex_enter(&buf->b_evict_lock); ++ buf->b_hdr = NULL; ++ mutex_exit(&buf->b_evict_lock); ++ mutex_exit(&arc_eviction_mtx); ++ ++ if (buf->b_efunc != NULL) ++ VERIFY(buf->b_efunc(buf) == 0); ++ ++ buf->b_efunc = NULL; ++ buf->b_private = NULL; ++ kmem_cache_free(buf_cache, buf); ++ mutex_enter(&arc_eviction_mtx); ++ } ++ mutex_exit(&arc_eviction_mtx); ++} ++ ++/* ++ * Evict only meta data objects from the cache leaving the data objects. ++ * This is only used to enforce the tunable arc_meta_limit, if we are ++ * unable to evict enough buffers notify the user via the prune callback. ++ */ ++void ++arc_adjust_meta(int64_t adjustment, boolean_t may_prune) ++{ ++ int64_t delta; ++ ++ if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { ++ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); ++ arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); ++ adjustment -= delta; ++ } ++ ++ if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { ++ delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment); ++ arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA); ++ adjustment -= delta; ++ } ++ ++ if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit)) ++ arc_do_user_prune(arc_meta_prune); ++} ++ ++/* ++ * Flush all *evictable* data from the cache for the given spa. ++ * NOTE: this will not touch "active" (i.e. referenced) data. 
++ */ ++void ++arc_flush(spa_t *spa) ++{ ++ uint64_t guid = 0; ++ ++ if (spa) ++ guid = spa_load_guid(spa); ++ ++ while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { ++ (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); ++ if (spa) ++ break; ++ } ++ while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { ++ (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); ++ if (spa) ++ break; ++ } ++ while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { ++ (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); ++ if (spa) ++ break; ++ } ++ while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { ++ (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); ++ if (spa) ++ break; ++ } ++ ++ arc_evict_ghost(arc_mru_ghost, guid, -1); ++ arc_evict_ghost(arc_mfu_ghost, guid, -1); ++ ++ mutex_enter(&arc_reclaim_thr_lock); ++ arc_do_user_evicts(); ++ mutex_exit(&arc_reclaim_thr_lock); ++ ASSERT(spa || arc_eviction_list == NULL); ++} ++ ++void ++arc_shrink(uint64_t bytes) ++{ ++ if (arc_c > arc_c_min) { ++ uint64_t to_free; ++ ++ to_free = bytes ? bytes : arc_c >> arc_shrink_shift; ++ ++ if (arc_c > arc_c_min + to_free) ++ atomic_add_64(&arc_c, -to_free); ++ else ++ arc_c = arc_c_min; ++ ++ atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); ++ if (arc_c > arc_size) ++ arc_c = MAX(arc_size, arc_c_min); ++ if (arc_p > arc_c) ++ arc_p = (arc_c >> 1); ++ ASSERT(arc_c >= arc_c_min); ++ ASSERT((int64_t)arc_p >= 0); ++ } ++ ++ if (arc_size > arc_c) ++ arc_adjust(); ++} ++ ++static void ++arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) ++{ ++ size_t i; ++ kmem_cache_t *prev_cache = NULL; ++ kmem_cache_t *prev_data_cache = NULL; ++ extern kmem_cache_t *zio_buf_cache[]; ++ extern kmem_cache_t *zio_data_buf_cache[]; ++ ++ /* ++ * An aggressive reclamation will shrink the cache size as well as ++ * reap free buffers from the arc kmem caches. ++ */ ++ if (strat == ARC_RECLAIM_AGGR) ++ arc_shrink(bytes); ++ ++ for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { ++ if (zio_buf_cache[i] != prev_cache) { ++ prev_cache = zio_buf_cache[i]; ++ kmem_cache_reap_now(zio_buf_cache[i]); ++ } ++ if (zio_data_buf_cache[i] != prev_data_cache) { ++ prev_data_cache = zio_data_buf_cache[i]; ++ kmem_cache_reap_now(zio_data_buf_cache[i]); ++ } ++ } ++ ++ kmem_cache_reap_now(buf_cache); ++ kmem_cache_reap_now(hdr_cache); ++} ++ ++/* ++ * Unlike other ZFS implementations this thread is only responsible for ++ * adapting the target ARC size on Linux. The responsibility for memory ++ * reclamation has been entirely delegated to the arc_shrinker_func() ++ * which is registered with the VM. To reflect this change in behavior ++ * the arc_reclaim thread has been renamed to arc_adapt. 
++ */ ++static void ++arc_adapt_thread(void) ++{ ++ callb_cpr_t cpr; ++ int64_t prune; ++ ++ CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); ++ ++ mutex_enter(&arc_reclaim_thr_lock); ++ while (arc_thread_exit == 0) { ++#ifndef _KERNEL ++ arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; ++ ++ if (spa_get_random(100) == 0) { ++ ++ if (arc_no_grow) { ++ if (last_reclaim == ARC_RECLAIM_CONS) { ++ last_reclaim = ARC_RECLAIM_AGGR; ++ } else { ++ last_reclaim = ARC_RECLAIM_CONS; ++ } ++ } else { ++ arc_no_grow = TRUE; ++ last_reclaim = ARC_RECLAIM_AGGR; ++ membar_producer(); ++ } ++ ++ /* reset the growth delay for every reclaim */ ++ arc_grow_time = ddi_get_lbolt()+(arc_grow_retry * hz); ++ ++ arc_kmem_reap_now(last_reclaim, 0); ++ arc_warm = B_TRUE; ++ } ++#endif /* !_KERNEL */ ++ ++ /* No recent memory pressure allow the ARC to grow. */ ++ if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time) ++ arc_no_grow = FALSE; ++ ++ /* ++ * Keep meta data usage within limits, arc_shrink() is not ++ * used to avoid collapsing the arc_c value when only the ++ * arc_meta_limit is being exceeded. ++ */ ++ prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit; ++ if (prune > 0) ++ arc_adjust_meta(prune, B_TRUE); ++ ++ arc_adjust(); ++ ++ if (arc_eviction_list != NULL) ++ arc_do_user_evicts(); ++ ++ /* block until needed, or one second, whichever is shorter */ ++ CALLB_CPR_SAFE_BEGIN(&cpr); ++ (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv, ++ &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); ++ CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); ++ } ++ ++ arc_thread_exit = 0; ++ cv_broadcast(&arc_reclaim_thr_cv); ++ CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ ++ thread_exit(); ++} ++ ++#ifdef _KERNEL ++/* ++ * Determine the amount of memory eligible for eviction contained in the ++ * ARC. All clean data reported by the ghost lists can always be safely ++ * evicted. Due to arc_c_min, the same does not hold for all clean data ++ * contained by the regular mru and mfu lists. ++ * ++ * In the case of the regular mru and mfu lists, we need to report as ++ * much clean data as possible, such that evicting that same reported ++ * data will not bring arc_size below arc_c_min. Thus, in certain ++ * circumstances, the total amount of clean data in the mru and mfu ++ * lists might not actually be evictable. ++ * ++ * The following two distinct cases are accounted for: ++ * ++ * 1. The sum of the amount of dirty data contained by both the mru and ++ * mfu lists, plus the ARC's other accounting (e.g. the anon list), ++ * is greater than or equal to arc_c_min. ++ * (i.e. amount of dirty data >= arc_c_min) ++ * ++ * This is the easy case; all clean data contained by the mru and mfu ++ * lists is evictable. Evicting all clean data can only drop arc_size ++ * to the amount of dirty data, which is greater than arc_c_min. ++ * ++ * 2. The sum of the amount of dirty data contained by both the mru and ++ * mfu lists, plus the ARC's other accounting (e.g. the anon list), ++ * is less than arc_c_min. ++ * (i.e. arc_c_min > amount of dirty data) ++ * ++ * 2.1. arc_size is greater than or equal arc_c_min. ++ * (i.e. arc_size >= arc_c_min > amount of dirty data) ++ * ++ * In this case, not all clean data from the regular mru and mfu ++ * lists is actually evictable; we must leave enough clean data ++ * to keep arc_size above arc_c_min. Thus, the maximum amount of ++ * evictable data from the two lists combined, is exactly the ++ * difference between arc_size and arc_c_min. ++ * ++ * 2.2. 
arc_size is less than arc_c_min ++ * (i.e. arc_c_min > arc_size > amount of dirty data) ++ * ++ * In this case, none of the data contained in the mru and mfu ++ * lists is evictable, even if it's clean. Since arc_size is ++ * already below arc_c_min, evicting any more would only ++ * increase this negative difference. ++ */ ++static uint64_t ++arc_evictable_memory(void) { ++ uint64_t arc_clean = ++ arc_mru->arcs_lsize[ARC_BUFC_DATA] + ++ arc_mru->arcs_lsize[ARC_BUFC_METADATA] + ++ arc_mfu->arcs_lsize[ARC_BUFC_DATA] + ++ arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; ++ uint64_t ghost_clean = ++ arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] + ++ arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] + ++ arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] + ++ arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]; ++ uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0); ++ ++ if (arc_dirty >= arc_c_min) ++ return (ghost_clean + arc_clean); ++ ++ return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0)); ++} ++ ++static int ++__arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) ++{ ++ uint64_t pages; ++ ++ /* The arc is considered warm once reclaim has occurred */ ++ if (unlikely(arc_warm == B_FALSE)) ++ arc_warm = B_TRUE; ++ ++ /* Return the potential number of reclaimable pages */ ++ pages = btop(arc_evictable_memory()); ++ if (sc->nr_to_scan == 0) ++ return (pages); ++ ++ /* Not allowed to perform filesystem reclaim */ ++ if (!(sc->gfp_mask & __GFP_FS)) ++ return (-1); ++ ++ /* Reclaim in progress */ ++ if (mutex_tryenter(&arc_reclaim_thr_lock) == 0) ++ return (-1); ++ ++ /* ++ * Evict the requested number of pages by shrinking arc_c the ++ * requested amount. If there is nothing left to evict just ++ * reap whatever we can from the various arc slabs. ++ */ ++ if (pages > 0) { ++ arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan)); ++ pages = btop(arc_evictable_memory()); ++ } else { ++ arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan)); ++ pages = -1; ++ } ++ ++ /* ++ * When direct reclaim is observed it usually indicates a rapid ++ * increase in memory pressure. This occurs because the kswapd ++ * threads were unable to asynchronously keep enough free memory ++ * available. In this case set arc_no_grow to briefly pause arc ++ * growth to avoid compounding the memory pressure. ++ */ ++ if (current_is_kswapd()) { ++ ARCSTAT_BUMP(arcstat_memory_indirect_count); ++ } else { ++ arc_no_grow = B_TRUE; ++ arc_grow_time = ddi_get_lbolt() + (arc_grow_retry * hz); ++ ARCSTAT_BUMP(arcstat_memory_direct_count); ++ } ++ ++ mutex_exit(&arc_reclaim_thr_lock); ++ ++ return (pages); ++} ++SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func); ++ ++SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS); ++#endif /* _KERNEL */ ++ ++/* ++ * Adapt arc info given the number of bytes we are trying to add and ++ * the state that we are comming from. This function is only called ++ * when we are adding new content to the cache. ++ */ ++static void ++arc_adapt(int bytes, arc_state_t *state) ++{ ++ int mult; ++ uint64_t arc_p_min = (arc_c >> arc_p_min_shift); ++ ++ if (state == arc_l2c_only) ++ return; ++ ++ ASSERT(bytes > 0); ++ /* ++ * Adapt the target size of the MRU list: ++ * - if we just hit in the MRU ghost list, then increase ++ * the target size of the MRU list. ++ * - if we just hit in the MFU ghost list, then increase ++ * the target size of the MFU list by decreasing the ++ * target size of the MRU list. 
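For illustration only (not part of the patch; all byte counts are invented), a stand-alone sketch that walks the two-case evictable-memory rule documented above, using the same arithmetic as arc_evictable_memory():

#include <stdio.h>
#include <stdint.h>

static uint64_t max_i64(int64_t a, int64_t b)
{
	return ((uint64_t)(a > b ? a : b));
}

int main(void)
{
	uint64_t arc_size    = 3ULL << 30;    /* total ARC size: 3 GiB (sample)  */
	uint64_t arc_c_min   = 1ULL << 30;    /* minimum target: 1 GiB (sample)  */
	uint64_t arc_clean   = 2ULL << 30;    /* clean mru+mfu data (sample)     */
	uint64_t ghost_clean = 512ULL << 20;  /* ghost-list bookkeeping (sample) */

	uint64_t arc_dirty = max_i64((int64_t)arc_size - (int64_t)arc_clean, 0);
	uint64_t evictable;

	if (arc_dirty >= arc_c_min)
		/* Case 1: evicting every clean byte cannot drop below arc_c_min. */
		evictable = ghost_clean + arc_clean;
	else
		/* Case 2: only report what keeps arc_size at or above arc_c_min. */
		evictable = ghost_clean +
		    max_i64((int64_t)arc_size - (int64_t)arc_c_min, 0);

	printf("evictable=%llu bytes\n", (unsigned long long)evictable);
	return 0;
}

With these numbers the dirty remainder exactly reaches arc_c_min, so all clean data plus the ghost bookkeeping is reported; raising arc_clean (and thereby lowering the dirty remainder) flips the computation into case 2.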
++ */ ++ if (state == arc_mru_ghost) { ++ mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? ++ 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); ++ mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ ++ ++ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); ++ } else if (state == arc_mfu_ghost) { ++ uint64_t delta; ++ ++ mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? ++ 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); ++ mult = MIN(mult, 10); ++ ++ delta = MIN(bytes * mult, arc_p); ++ arc_p = MAX(arc_p_min, arc_p - delta); ++ } ++ ASSERT((int64_t)arc_p >= 0); ++ ++ if (arc_no_grow) ++ return; ++ ++ if (arc_c >= arc_c_max) ++ return; ++ ++ /* ++ * If we're within (2 * maxblocksize) bytes of the target ++ * cache size, increment the target cache size ++ */ ++ if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { ++ atomic_add_64(&arc_c, (int64_t)bytes); ++ if (arc_c > arc_c_max) ++ arc_c = arc_c_max; ++ else if (state == arc_anon) ++ atomic_add_64(&arc_p, (int64_t)bytes); ++ if (arc_p > arc_c) ++ arc_p = arc_c; ++ } ++ ASSERT((int64_t)arc_p >= 0); ++} ++ ++/* ++ * Check if the cache has reached its limits and eviction is required ++ * prior to insert. ++ */ ++static int ++arc_evict_needed(arc_buf_contents_t type) ++{ ++ if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) ++ return (1); ++ ++ if (arc_no_grow) ++ return (1); ++ ++ return (arc_size > arc_c); ++} ++ ++/* ++ * The buffer, supplied as the first argument, needs a data block. ++ * So, if we are at cache max, determine which cache should be victimized. ++ * We have the following cases: ++ * ++ * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> ++ * In this situation if we're out of space, but the resident size of the MFU is ++ * under the limit, victimize the MFU cache to satisfy this insertion request. ++ * ++ * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> ++ * Here, we've used up all of the available space for the MRU, so we need to ++ * evict from our own cache instead. Evict from the set of resident MRU ++ * entries. ++ * ++ * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> ++ * c minus p represents the MFU space in the cache, since p is the size of the ++ * cache that is dedicated to the MRU. In this situation there's still space on ++ * the MFU side, so the MRU side needs to be victimized. ++ * ++ * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> ++ * MFU's resident set is consuming more space than it has been allotted. In ++ * this situation, we must victimize our own cache, the MFU, for this insertion. ++ */ ++static void ++arc_get_data_buf(arc_buf_t *buf) ++{ ++ arc_state_t *state = buf->b_hdr->b_state; ++ uint64_t size = buf->b_hdr->b_size; ++ arc_buf_contents_t type = buf->b_hdr->b_type; ++ ++ arc_adapt(size, state); ++ ++ /* ++ * We have not yet reached cache maximum size, ++ * just allocate a new buffer. ++ */ ++ if (!arc_evict_needed(type)) { ++ if (type == ARC_BUFC_METADATA) { ++ buf->b_data = zio_buf_alloc(size); ++ arc_space_consume(size, ARC_SPACE_DATA); ++ } else { ++ ASSERT(type == ARC_BUFC_DATA); ++ buf->b_data = zio_data_buf_alloc(size); ++ ARCSTAT_INCR(arcstat_data_size, size); ++ atomic_add_64(&arc_size, size); ++ } ++ goto out; ++ } ++ ++ /* ++ * If we are prefetching from the mfu ghost list, this buffer ++ * will end up on the mru list; so steal space from there. ++ */ ++ if (state == arc_mfu_ghost) ++ state = buf->b_hdr->b_flags & ARC_PREFETCH ? 
arc_mru : arc_mfu; ++ else if (state == arc_mru_ghost) ++ state = arc_mru; ++ ++ if (state == arc_mru || state == arc_anon) { ++ uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; ++ state = (arc_mfu->arcs_lsize[type] >= size && ++ arc_p > mru_used) ? arc_mfu : arc_mru; ++ } else { ++ /* MFU cases */ ++ uint64_t mfu_space = arc_c - arc_p; ++ state = (arc_mru->arcs_lsize[type] >= size && ++ mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; ++ } ++ ++ if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { ++ if (type == ARC_BUFC_METADATA) { ++ buf->b_data = zio_buf_alloc(size); ++ arc_space_consume(size, ARC_SPACE_DATA); ++ ++ /* ++ * If we are unable to recycle an existing meta buffer ++ * signal the reclaim thread. It will notify users ++ * via the prune callback to drop references. The ++ * prune callback in run in the context of the reclaim ++ * thread to avoid deadlocking on the hash_lock. ++ */ ++ cv_signal(&arc_reclaim_thr_cv); ++ } else { ++ ASSERT(type == ARC_BUFC_DATA); ++ buf->b_data = zio_data_buf_alloc(size); ++ ARCSTAT_INCR(arcstat_data_size, size); ++ atomic_add_64(&arc_size, size); ++ } ++ ++ ARCSTAT_BUMP(arcstat_recycle_miss); ++ } ++ ASSERT(buf->b_data != NULL); ++out: ++ /* ++ * Update the state size. Note that ghost states have a ++ * "ghost size" and so don't need to be updated. ++ */ ++ if (!GHOST_STATE(buf->b_hdr->b_state)) { ++ arc_buf_hdr_t *hdr = buf->b_hdr; ++ ++ atomic_add_64(&hdr->b_state->arcs_size, size); ++ if (list_link_active(&hdr->b_arc_node)) { ++ ASSERT(refcount_is_zero(&hdr->b_refcnt)); ++ atomic_add_64(&hdr->b_state->arcs_lsize[type], size); ++ } ++ /* ++ * If we are growing the cache, and we are adding anonymous ++ * data, and we have outgrown arc_p, update arc_p ++ */ ++ if (arc_size < arc_c && hdr->b_state == arc_anon && ++ arc_anon->arcs_size + arc_mru->arcs_size > arc_p) ++ arc_p = MIN(arc_c, arc_p + size); ++ } ++} ++ ++/* ++ * This routine is called whenever a buffer is accessed. ++ * NOTE: the hash lock is dropped in this function. ++ */ ++static void ++arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) ++{ ++ clock_t now; ++ ++ ASSERT(MUTEX_HELD(hash_lock)); ++ ++ if (buf->b_state == arc_anon) { ++ /* ++ * This buffer is not in the cache, and does not ++ * appear in our "ghost" list. Add the new buffer ++ * to the MRU state. ++ */ ++ ++ ASSERT(buf->b_arc_access == 0); ++ buf->b_arc_access = ddi_get_lbolt(); ++ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); ++ arc_change_state(arc_mru, buf, hash_lock); ++ ++ } else if (buf->b_state == arc_mru) { ++ now = ddi_get_lbolt(); ++ ++ /* ++ * If this buffer is here because of a prefetch, then either: ++ * - clear the flag if this is a "referencing" read ++ * (any subsequent access will bump this into the MFU state). ++ * or ++ * - move the buffer to the head of the list if this is ++ * another prefetch (to make it less likely to be evicted). ++ */ ++ if ((buf->b_flags & ARC_PREFETCH) != 0) { ++ if (refcount_count(&buf->b_refcnt) == 0) { ++ ASSERT(list_link_active(&buf->b_arc_node)); ++ } else { ++ buf->b_flags &= ~ARC_PREFETCH; ++ ARCSTAT_BUMP(arcstat_mru_hits); ++ } ++ buf->b_arc_access = now; ++ return; ++ } ++ ++ /* ++ * This buffer has been "accessed" only once so far, ++ * but it is still in the cache. Move it to the MFU ++ * state. ++ */ ++ if (now > buf->b_arc_access + ARC_MINTIME) { ++ /* ++ * More than 125ms have passed since we ++ * instantiated this buffer. Move it to the ++ * most frequently used state. 
++ */ ++ buf->b_arc_access = now; ++ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); ++ arc_change_state(arc_mfu, buf, hash_lock); ++ } ++ ARCSTAT_BUMP(arcstat_mru_hits); ++ } else if (buf->b_state == arc_mru_ghost) { ++ arc_state_t *new_state; ++ /* ++ * This buffer has been "accessed" recently, but ++ * was evicted from the cache. Move it to the ++ * MFU state. ++ */ ++ ++ if (buf->b_flags & ARC_PREFETCH) { ++ new_state = arc_mru; ++ if (refcount_count(&buf->b_refcnt) > 0) ++ buf->b_flags &= ~ARC_PREFETCH; ++ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); ++ } else { ++ new_state = arc_mfu; ++ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); ++ } ++ ++ buf->b_arc_access = ddi_get_lbolt(); ++ arc_change_state(new_state, buf, hash_lock); ++ ++ ARCSTAT_BUMP(arcstat_mru_ghost_hits); ++ } else if (buf->b_state == arc_mfu) { ++ /* ++ * This buffer has been accessed more than once and is ++ * still in the cache. Keep it in the MFU state. ++ * ++ * NOTE: an add_reference() that occurred when we did ++ * the arc_read() will have kicked this off the list. ++ * If it was a prefetch, we will explicitly move it to ++ * the head of the list now. ++ */ ++ if ((buf->b_flags & ARC_PREFETCH) != 0) { ++ ASSERT(refcount_count(&buf->b_refcnt) == 0); ++ ASSERT(list_link_active(&buf->b_arc_node)); ++ } ++ ARCSTAT_BUMP(arcstat_mfu_hits); ++ buf->b_arc_access = ddi_get_lbolt(); ++ } else if (buf->b_state == arc_mfu_ghost) { ++ arc_state_t *new_state = arc_mfu; ++ /* ++ * This buffer has been accessed more than once but has ++ * been evicted from the cache. Move it back to the ++ * MFU state. ++ */ ++ ++ if (buf->b_flags & ARC_PREFETCH) { ++ /* ++ * This is a prefetch access... ++ * move this block back to the MRU state. ++ */ ++ ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); ++ new_state = arc_mru; ++ } ++ ++ buf->b_arc_access = ddi_get_lbolt(); ++ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); ++ arc_change_state(new_state, buf, hash_lock); ++ ++ ARCSTAT_BUMP(arcstat_mfu_ghost_hits); ++ } else if (buf->b_state == arc_l2c_only) { ++ /* ++ * This buffer is on the 2nd Level ARC. ++ */ ++ ++ buf->b_arc_access = ddi_get_lbolt(); ++ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); ++ arc_change_state(arc_mfu, buf, hash_lock); ++ } else { ++ ASSERT(!"invalid arc state"); ++ } ++} ++ ++/* a generic arc_done_func_t which you can use */ ++/* ARGSUSED */ ++void ++arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) ++{ ++ if (zio == NULL || zio->io_error == 0) ++ bcopy(buf->b_data, arg, buf->b_hdr->b_size); ++ VERIFY(arc_buf_remove_ref(buf, arg) == 1); ++} ++ ++/* a generic arc_done_func_t */ ++void ++arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) ++{ ++ arc_buf_t **bufp = arg; ++ if (zio && zio->io_error) { ++ VERIFY(arc_buf_remove_ref(buf, arg) == 1); ++ *bufp = NULL; ++ } else { ++ *bufp = buf; ++ ASSERT(buf->b_data); ++ } ++} ++ ++static void ++arc_read_done(zio_t *zio) ++{ ++ arc_buf_hdr_t *hdr, *found; ++ arc_buf_t *buf; ++ arc_buf_t *abuf; /* buffer we're assigning to callback */ ++ kmutex_t *hash_lock; ++ arc_callback_t *callback_list, *acb; ++ int freeable = FALSE; ++ ++ buf = zio->io_private; ++ hdr = buf->b_hdr; ++ ++ /* ++ * The hdr was inserted into hash-table and removed from lists ++ * prior to starting I/O. We should find this header, since ++ * it's in the hash table, and it should be legit since it's ++ * not possible to evict it during the I/O. The only possible ++ * reason for it not to be found is if we were freed during the ++ * read. 
++ */ ++ found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, ++ &hash_lock); ++ ++ ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || ++ (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || ++ (found == hdr && HDR_L2_READING(hdr))); ++ ++ hdr->b_flags &= ~ARC_L2_EVICTED; ++ if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) ++ hdr->b_flags &= ~ARC_L2CACHE; ++ ++ /* byteswap if necessary */ ++ callback_list = hdr->b_acb; ++ ASSERT(callback_list != NULL); ++ if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { ++ arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? ++ byteswap_uint64_array : ++ dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; ++ func(buf->b_data, hdr->b_size); ++ } ++ ++ arc_cksum_compute(buf, B_FALSE); ++ ++ if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { ++ /* ++ * Only call arc_access on anonymous buffers. This is because ++ * if we've issued an I/O for an evicted buffer, we've already ++ * called arc_access (to prevent any simultaneous readers from ++ * getting confused). ++ */ ++ arc_access(hdr, hash_lock); ++ } ++ ++ /* create copies of the data buffer for the callers */ ++ abuf = buf; ++ for (acb = callback_list; acb; acb = acb->acb_next) { ++ if (acb->acb_done) { ++ if (abuf == NULL) ++ abuf = arc_buf_clone(buf); ++ acb->acb_buf = abuf; ++ abuf = NULL; ++ } ++ } ++ hdr->b_acb = NULL; ++ hdr->b_flags &= ~ARC_IO_IN_PROGRESS; ++ ASSERT(!HDR_BUF_AVAILABLE(hdr)); ++ if (abuf == buf) { ++ ASSERT(buf->b_efunc == NULL); ++ ASSERT(hdr->b_datacnt == 1); ++ hdr->b_flags |= ARC_BUF_AVAILABLE; ++ } ++ ++ ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); ++ ++ if (zio->io_error != 0) { ++ hdr->b_flags |= ARC_IO_ERROR; ++ if (hdr->b_state != arc_anon) ++ arc_change_state(arc_anon, hdr, hash_lock); ++ if (HDR_IN_HASH_TABLE(hdr)) ++ buf_hash_remove(hdr); ++ freeable = refcount_is_zero(&hdr->b_refcnt); ++ } ++ ++ /* ++ * Broadcast before we drop the hash_lock to avoid the possibility ++ * that the hdr (and hence the cv) might be freed before we get to ++ * the cv_broadcast(). ++ */ ++ cv_broadcast(&hdr->b_cv); ++ ++ if (hash_lock) { ++ mutex_exit(hash_lock); ++ } else { ++ /* ++ * This block was freed while we waited for the read to ++ * complete. It has been removed from the hash table and ++ * moved to the anonymous state (so that it won't show up ++ * in the cache). ++ */ ++ ASSERT3P(hdr->b_state, ==, arc_anon); ++ freeable = refcount_is_zero(&hdr->b_refcnt); ++ } ++ ++ /* execute each callback and free its structure */ ++ while ((acb = callback_list) != NULL) { ++ if (acb->acb_done) ++ acb->acb_done(zio, acb->acb_buf, acb->acb_private); ++ ++ if (acb->acb_zio_dummy != NULL) { ++ acb->acb_zio_dummy->io_error = zio->io_error; ++ zio_nowait(acb->acb_zio_dummy); ++ } ++ ++ callback_list = acb->acb_next; ++ kmem_free(acb, sizeof (arc_callback_t)); ++ } ++ ++ if (freeable) ++ arc_hdr_destroy(hdr); ++} ++ ++/* ++ * "Read" the block block at the specified DVA (in bp) via the ++ * cache. If the block is found in the cache, invoke the provided ++ * callback immediately and return. Note that the `zio' parameter ++ * in the callback will be NULL in this case, since no IO was ++ * required. If the block is not in the cache pass the read request ++ * on to the spa with a substitute callback function, so that the ++ * requested block will be added to the cache. 
++ * ++ * If a read request arrives for a block that has a read in-progress, ++ * either wait for the in-progress read to complete (and return the ++ * results); or, if this is a read with a "done" func, add a record ++ * to the read to invoke the "done" func when the read completes, ++ * and return; or just return. ++ * ++ * arc_read_done() will invoke all the requested "done" functions ++ * for readers of this block. ++ * ++ * Normal callers should use arc_read and pass the arc buffer and offset ++ * for the bp. But if you know you don't need locking, you can use ++ * arc_read_bp. ++ */ ++int ++arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, ++ arc_done_func_t *done, void *private, int priority, int zio_flags, ++ uint32_t *arc_flags, const zbookmark_t *zb) ++{ ++ int err; ++ ++ if (pbuf == NULL) { ++ /* ++ * XXX This happens from traverse callback funcs, for ++ * the objset_phys_t block. ++ */ ++ return (arc_read_nolock(pio, spa, bp, done, private, priority, ++ zio_flags, arc_flags, zb)); ++ } ++ ++ ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); ++ ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); ++ rw_enter(&pbuf->b_data_lock, RW_READER); ++ ++ err = arc_read_nolock(pio, spa, bp, done, private, priority, ++ zio_flags, arc_flags, zb); ++ rw_exit(&pbuf->b_data_lock); ++ ++ return (err); ++} ++ ++int ++arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, ++ arc_done_func_t *done, void *private, int priority, int zio_flags, ++ uint32_t *arc_flags, const zbookmark_t *zb) ++{ ++ arc_buf_hdr_t *hdr; ++ arc_buf_t *buf = NULL; ++ kmutex_t *hash_lock; ++ zio_t *rzio; ++ uint64_t guid = spa_load_guid(spa); ++ ++top: ++ hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), ++ &hash_lock); ++ if (hdr && hdr->b_datacnt > 0) { ++ ++ *arc_flags |= ARC_CACHED; ++ ++ if (HDR_IO_IN_PROGRESS(hdr)) { ++ ++ if (*arc_flags & ARC_WAIT) { ++ cv_wait(&hdr->b_cv, hash_lock); ++ mutex_exit(hash_lock); ++ goto top; ++ } ++ ASSERT(*arc_flags & ARC_NOWAIT); ++ ++ if (done) { ++ arc_callback_t *acb = NULL; ++ ++ acb = kmem_zalloc(sizeof (arc_callback_t), ++ KM_PUSHPAGE); ++ acb->acb_done = done; ++ acb->acb_private = private; ++ if (pio != NULL) ++ acb->acb_zio_dummy = zio_null(pio, ++ spa, NULL, NULL, NULL, zio_flags); ++ ++ ASSERT(acb->acb_done != NULL); ++ acb->acb_next = hdr->b_acb; ++ hdr->b_acb = acb; ++ add_reference(hdr, hash_lock, private); ++ mutex_exit(hash_lock); ++ return (0); ++ } ++ mutex_exit(hash_lock); ++ return (0); ++ } ++ ++ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); ++ ++ if (done) { ++ add_reference(hdr, hash_lock, private); ++ /* ++ * If this block is already in use, create a new ++ * copy of the data so that we will be guaranteed ++ * that arc_release() will always succeed. 
++ */ ++ buf = hdr->b_buf; ++ ASSERT(buf); ++ ASSERT(buf->b_data); ++ if (HDR_BUF_AVAILABLE(hdr)) { ++ ASSERT(buf->b_efunc == NULL); ++ hdr->b_flags &= ~ARC_BUF_AVAILABLE; ++ } else { ++ buf = arc_buf_clone(buf); ++ } ++ ++ } else if (*arc_flags & ARC_PREFETCH && ++ refcount_count(&hdr->b_refcnt) == 0) { ++ hdr->b_flags |= ARC_PREFETCH; ++ } ++ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); ++ arc_access(hdr, hash_lock); ++ if (*arc_flags & ARC_L2CACHE) ++ hdr->b_flags |= ARC_L2CACHE; ++ mutex_exit(hash_lock); ++ ARCSTAT_BUMP(arcstat_hits); ++ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), ++ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, ++ data, metadata, hits); ++ ++ if (done) ++ done(NULL, buf, private); ++ } else { ++ uint64_t size = BP_GET_LSIZE(bp); ++ arc_callback_t *acb; ++ vdev_t *vd = NULL; ++ uint64_t addr = -1; ++ boolean_t devw = B_FALSE; ++ ++ if (hdr == NULL) { ++ /* this block is not in the cache */ ++ arc_buf_hdr_t *exists; ++ arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); ++ buf = arc_buf_alloc(spa, size, private, type); ++ hdr = buf->b_hdr; ++ hdr->b_dva = *BP_IDENTITY(bp); ++ hdr->b_birth = BP_PHYSICAL_BIRTH(bp); ++ hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; ++ exists = buf_hash_insert(hdr, &hash_lock); ++ if (exists) { ++ /* somebody beat us to the hash insert */ ++ mutex_exit(hash_lock); ++ buf_discard_identity(hdr); ++ (void) arc_buf_remove_ref(buf, private); ++ goto top; /* restart the IO request */ ++ } ++ /* if this is a prefetch, we don't have a reference */ ++ if (*arc_flags & ARC_PREFETCH) { ++ (void) remove_reference(hdr, hash_lock, ++ private); ++ hdr->b_flags |= ARC_PREFETCH; ++ } ++ if (*arc_flags & ARC_L2CACHE) ++ hdr->b_flags |= ARC_L2CACHE; ++ if (BP_GET_LEVEL(bp) > 0) ++ hdr->b_flags |= ARC_INDIRECT; ++ } else { ++ /* this block is in the ghost cache */ ++ ASSERT(GHOST_STATE(hdr->b_state)); ++ ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ++ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); ++ ASSERT(hdr->b_buf == NULL); ++ ++ /* if this is a prefetch, we don't have a reference */ ++ if (*arc_flags & ARC_PREFETCH) ++ hdr->b_flags |= ARC_PREFETCH; ++ else ++ add_reference(hdr, hash_lock, private); ++ if (*arc_flags & ARC_L2CACHE) ++ hdr->b_flags |= ARC_L2CACHE; ++ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); ++ buf->b_hdr = hdr; ++ buf->b_data = NULL; ++ buf->b_efunc = NULL; ++ buf->b_private = NULL; ++ buf->b_next = NULL; ++ hdr->b_buf = buf; ++ ASSERT(hdr->b_datacnt == 0); ++ hdr->b_datacnt = 1; ++ arc_get_data_buf(buf); ++ arc_access(hdr, hash_lock); ++ } ++ ++ ASSERT(!GHOST_STATE(hdr->b_state)); ++ ++ acb = kmem_zalloc(sizeof (arc_callback_t), KM_PUSHPAGE); ++ acb->acb_done = done; ++ acb->acb_private = private; ++ ++ ASSERT(hdr->b_acb == NULL); ++ hdr->b_acb = acb; ++ hdr->b_flags |= ARC_IO_IN_PROGRESS; ++ ++ if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && ++ (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { ++ devw = hdr->b_l2hdr->b_dev->l2ad_writing; ++ addr = hdr->b_l2hdr->b_daddr; ++ /* ++ * Lock out device removal. 
++ */ ++ if (vdev_is_dead(vd) || ++ !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) ++ vd = NULL; ++ } ++ ++ mutex_exit(hash_lock); ++ ++ ASSERT3U(hdr->b_size, ==, size); ++ DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, ++ uint64_t, size, zbookmark_t *, zb); ++ ARCSTAT_BUMP(arcstat_misses); ++ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), ++ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, ++ data, metadata, misses); ++ ++ if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { ++ /* ++ * Read from the L2ARC if the following are true: ++ * 1. The L2ARC vdev was previously cached. ++ * 2. This buffer still has L2ARC metadata. ++ * 3. This buffer isn't currently writing to the L2ARC. ++ * 4. The L2ARC entry wasn't evicted, which may ++ * also have invalidated the vdev. ++ * 5. This isn't prefetch and l2arc_noprefetch is set. ++ */ ++ if (hdr->b_l2hdr != NULL && ++ !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && ++ !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { ++ l2arc_read_callback_t *cb; ++ ++ DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ++ ARCSTAT_BUMP(arcstat_l2_hits); ++ ++ cb = kmem_zalloc(sizeof (l2arc_read_callback_t), ++ KM_PUSHPAGE); ++ cb->l2rcb_buf = buf; ++ cb->l2rcb_spa = spa; ++ cb->l2rcb_bp = *bp; ++ cb->l2rcb_zb = *zb; ++ cb->l2rcb_flags = zio_flags; ++ ++ /* ++ * l2arc read. The SCL_L2ARC lock will be ++ * released by l2arc_read_done(). ++ */ ++ rzio = zio_read_phys(pio, vd, addr, size, ++ buf->b_data, ZIO_CHECKSUM_OFF, ++ l2arc_read_done, cb, priority, zio_flags | ++ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ++ ZIO_FLAG_DONT_PROPAGATE | ++ ZIO_FLAG_DONT_RETRY, B_FALSE); ++ DTRACE_PROBE2(l2arc__read, vdev_t *, vd, ++ zio_t *, rzio); ++ ARCSTAT_INCR(arcstat_l2_read_bytes, size); ++ ++ if (*arc_flags & ARC_NOWAIT) { ++ zio_nowait(rzio); ++ return (0); ++ } ++ ++ ASSERT(*arc_flags & ARC_WAIT); ++ if (zio_wait(rzio) == 0) ++ return (0); ++ ++ /* l2arc read error; goto zio_read() */ ++ } else { ++ DTRACE_PROBE1(l2arc__miss, ++ arc_buf_hdr_t *, hdr); ++ ARCSTAT_BUMP(arcstat_l2_misses); ++ if (HDR_L2_WRITING(hdr)) ++ ARCSTAT_BUMP(arcstat_l2_rw_clash); ++ spa_config_exit(spa, SCL_L2ARC, vd); ++ } ++ } else { ++ if (vd != NULL) ++ spa_config_exit(spa, SCL_L2ARC, vd); ++ if (l2arc_ndev != 0) { ++ DTRACE_PROBE1(l2arc__miss, ++ arc_buf_hdr_t *, hdr); ++ ARCSTAT_BUMP(arcstat_l2_misses); ++ } ++ } ++ ++ rzio = zio_read(pio, spa, bp, buf->b_data, size, ++ arc_read_done, buf, priority, zio_flags, zb); ++ ++ if (*arc_flags & ARC_WAIT) ++ return (zio_wait(rzio)); ++ ++ ASSERT(*arc_flags & ARC_NOWAIT); ++ zio_nowait(rzio); ++ } ++ return (0); ++} ++ ++arc_prune_t * ++arc_add_prune_callback(arc_prune_func_t *func, void *private) ++{ ++ arc_prune_t *p; ++ ++ p = kmem_alloc(sizeof(*p), KM_SLEEP); ++ p->p_pfunc = func; ++ p->p_private = private; ++ list_link_init(&p->p_node); ++ refcount_create(&p->p_refcnt); ++ ++ mutex_enter(&arc_prune_mtx); ++ refcount_add(&p->p_refcnt, &arc_prune_list); ++ list_insert_head(&arc_prune_list, p); ++ mutex_exit(&arc_prune_mtx); ++ ++ return (p); ++} ++ ++void ++arc_remove_prune_callback(arc_prune_t *p) ++{ ++ mutex_enter(&arc_prune_mtx); ++ list_remove(&arc_prune_list, p); ++ if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) { ++ refcount_destroy(&p->p_refcnt); ++ kmem_free(p, sizeof (*p)); ++ } ++ mutex_exit(&arc_prune_mtx); ++} ++ ++void ++arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) ++{ ++ ASSERT(buf->b_hdr != NULL); ++ ASSERT(buf->b_hdr->b_state != arc_anon); ++ 
ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); ++ ASSERT(buf->b_efunc == NULL); ++ ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); ++ ++ buf->b_efunc = func; ++ buf->b_private = private; ++} ++ ++/* ++ * This is used by the DMU to let the ARC know that a buffer is ++ * being evicted, so the ARC should clean up. If this arc buf ++ * is not yet in the evicted state, it will be put there. ++ */ ++int ++arc_buf_evict(arc_buf_t *buf) ++{ ++ arc_buf_hdr_t *hdr; ++ kmutex_t *hash_lock; ++ arc_buf_t **bufp; ++ ++ mutex_enter(&buf->b_evict_lock); ++ hdr = buf->b_hdr; ++ if (hdr == NULL) { ++ /* ++ * We are in arc_do_user_evicts(). ++ */ ++ ASSERT(buf->b_data == NULL); ++ mutex_exit(&buf->b_evict_lock); ++ return (0); ++ } else if (buf->b_data == NULL) { ++ arc_buf_t copy = *buf; /* structure assignment */ ++ /* ++ * We are on the eviction list; process this buffer now ++ * but let arc_do_user_evicts() do the reaping. ++ */ ++ buf->b_efunc = NULL; ++ mutex_exit(&buf->b_evict_lock); ++ VERIFY(copy.b_efunc(©) == 0); ++ return (1); ++ } ++ hash_lock = HDR_LOCK(hdr); ++ mutex_enter(hash_lock); ++ hdr = buf->b_hdr; ++ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ++ ++ ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); ++ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); ++ ++ /* ++ * Pull this buffer off of the hdr ++ */ ++ bufp = &hdr->b_buf; ++ while (*bufp != buf) ++ bufp = &(*bufp)->b_next; ++ *bufp = buf->b_next; ++ ++ ASSERT(buf->b_data != NULL); ++ arc_buf_destroy(buf, FALSE, FALSE); ++ ++ if (hdr->b_datacnt == 0) { ++ arc_state_t *old_state = hdr->b_state; ++ arc_state_t *evicted_state; ++ ++ ASSERT(hdr->b_buf == NULL); ++ ASSERT(refcount_is_zero(&hdr->b_refcnt)); ++ ++ evicted_state = ++ (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; ++ ++ mutex_enter(&old_state->arcs_mtx); ++ mutex_enter(&evicted_state->arcs_mtx); ++ ++ arc_change_state(evicted_state, hdr, hash_lock); ++ ASSERT(HDR_IN_HASH_TABLE(hdr)); ++ hdr->b_flags |= ARC_IN_HASH_TABLE; ++ hdr->b_flags &= ~ARC_BUF_AVAILABLE; ++ ++ mutex_exit(&evicted_state->arcs_mtx); ++ mutex_exit(&old_state->arcs_mtx); ++ } ++ mutex_exit(hash_lock); ++ mutex_exit(&buf->b_evict_lock); ++ ++ VERIFY(buf->b_efunc(buf) == 0); ++ buf->b_efunc = NULL; ++ buf->b_private = NULL; ++ buf->b_hdr = NULL; ++ buf->b_next = NULL; ++ kmem_cache_free(buf_cache, buf); ++ return (1); ++} ++ ++/* ++ * Release this buffer from the cache. This must be done ++ * after a read and prior to modifying the buffer contents. ++ * If the buffer has more than one reference, we must make ++ * a new hdr for the buffer. ++ */ ++void ++arc_release(arc_buf_t *buf, void *tag) ++{ ++ arc_buf_hdr_t *hdr; ++ kmutex_t *hash_lock = NULL; ++ l2arc_buf_hdr_t *l2hdr; ++ uint64_t buf_size = 0; ++ ++ /* ++ * It would be nice to assert that if it's DMU metadata (level > ++ * 0 || it's the dnode file), then it must be syncing context. ++ * But we don't know that information at this level. ++ */ ++ ++ mutex_enter(&buf->b_evict_lock); ++ hdr = buf->b_hdr; ++ ++ /* this buffer is not on any list */ ++ ASSERT(refcount_count(&hdr->b_refcnt) > 0); ++ ++ if (hdr->b_state == arc_anon) { ++ /* this buffer is already released */ ++ ASSERT(buf->b_efunc == NULL); ++ } else { ++ hash_lock = HDR_LOCK(hdr); ++ mutex_enter(hash_lock); ++ hdr = buf->b_hdr; ++ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ++ } ++ ++ l2hdr = hdr->b_l2hdr; ++ if (l2hdr) { ++ mutex_enter(&l2arc_buflist_mtx); ++ hdr->b_l2hdr = NULL; ++ buf_size = hdr->b_size; ++ } ++ ++ /* ++ * Do we have more than one buf? 
++ */ ++ if (hdr->b_datacnt > 1) { ++ arc_buf_hdr_t *nhdr; ++ arc_buf_t **bufp; ++ uint64_t blksz = hdr->b_size; ++ uint64_t spa = hdr->b_spa; ++ arc_buf_contents_t type = hdr->b_type; ++ uint32_t flags = hdr->b_flags; ++ ++ ASSERT(hdr->b_buf != buf || buf->b_next != NULL); ++ /* ++ * Pull the data off of this hdr and attach it to ++ * a new anonymous hdr. ++ */ ++ (void) remove_reference(hdr, hash_lock, tag); ++ bufp = &hdr->b_buf; ++ while (*bufp != buf) ++ bufp = &(*bufp)->b_next; ++ *bufp = buf->b_next; ++ buf->b_next = NULL; ++ ++ ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); ++ atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); ++ if (refcount_is_zero(&hdr->b_refcnt)) { ++ uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; ++ ASSERT3U(*size, >=, hdr->b_size); ++ atomic_add_64(size, -hdr->b_size); ++ } ++ hdr->b_datacnt -= 1; ++ arc_cksum_verify(buf); ++ ++ mutex_exit(hash_lock); ++ ++ nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); ++ nhdr->b_size = blksz; ++ nhdr->b_spa = spa; ++ nhdr->b_type = type; ++ nhdr->b_buf = buf; ++ nhdr->b_state = arc_anon; ++ nhdr->b_arc_access = 0; ++ nhdr->b_flags = flags & ARC_L2_WRITING; ++ nhdr->b_l2hdr = NULL; ++ nhdr->b_datacnt = 1; ++ nhdr->b_freeze_cksum = NULL; ++ (void) refcount_add(&nhdr->b_refcnt, tag); ++ buf->b_hdr = nhdr; ++ mutex_exit(&buf->b_evict_lock); ++ atomic_add_64(&arc_anon->arcs_size, blksz); ++ } else { ++ mutex_exit(&buf->b_evict_lock); ++ ASSERT(refcount_count(&hdr->b_refcnt) == 1); ++ ASSERT(!list_link_active(&hdr->b_arc_node)); ++ ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ++ if (hdr->b_state != arc_anon) ++ arc_change_state(arc_anon, hdr, hash_lock); ++ hdr->b_arc_access = 0; ++ if (hash_lock) ++ mutex_exit(hash_lock); ++ ++ buf_discard_identity(hdr); ++ arc_buf_thaw(buf); ++ } ++ buf->b_efunc = NULL; ++ buf->b_private = NULL; ++ ++ if (l2hdr) { ++ list_remove(l2hdr->b_dev->l2ad_buflist, hdr); ++ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); ++ ARCSTAT_INCR(arcstat_l2_size, -buf_size); ++ mutex_exit(&l2arc_buflist_mtx); ++ } ++} ++ ++/* ++ * Release this buffer. If it does not match the provided BP, fill it ++ * with that block's contents. ++ */ ++/* ARGSUSED */ ++int ++arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, ++ zbookmark_t *zb) ++{ ++ arc_release(buf, tag); ++ return (0); ++} ++ ++int ++arc_released(arc_buf_t *buf) ++{ ++ int released; ++ ++ mutex_enter(&buf->b_evict_lock); ++ released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); ++ mutex_exit(&buf->b_evict_lock); ++ return (released); ++} ++ ++int ++arc_has_callback(arc_buf_t *buf) ++{ ++ int callback; ++ ++ mutex_enter(&buf->b_evict_lock); ++ callback = (buf->b_efunc != NULL); ++ mutex_exit(&buf->b_evict_lock); ++ return (callback); ++} ++ ++#ifdef ZFS_DEBUG ++int ++arc_referenced(arc_buf_t *buf) ++{ ++ int referenced; ++ ++ mutex_enter(&buf->b_evict_lock); ++ referenced = (refcount_count(&buf->b_hdr->b_refcnt)); ++ mutex_exit(&buf->b_evict_lock); ++ return (referenced); ++} ++#endif ++ ++static void ++arc_write_ready(zio_t *zio) ++{ ++ arc_write_callback_t *callback = zio->io_private; ++ arc_buf_t *buf = callback->awcb_buf; ++ arc_buf_hdr_t *hdr = buf->b_hdr; ++ ++ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); ++ callback->awcb_ready(zio, buf, callback->awcb_private); ++ ++ /* ++ * If the IO is already in progress, then this is a re-write ++ * attempt, so we need to thaw and re-compute the cksum. ++ * It is the responsibility of the callback to handle the ++ * accounting for any re-write attempt. 
++ */ ++ if (HDR_IO_IN_PROGRESS(hdr)) { ++ mutex_enter(&hdr->b_freeze_lock); ++ if (hdr->b_freeze_cksum != NULL) { ++ kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); ++ hdr->b_freeze_cksum = NULL; ++ } ++ mutex_exit(&hdr->b_freeze_lock); ++ } ++ arc_cksum_compute(buf, B_FALSE); ++ hdr->b_flags |= ARC_IO_IN_PROGRESS; ++} ++ ++static void ++arc_write_done(zio_t *zio) ++{ ++ arc_write_callback_t *callback = zio->io_private; ++ arc_buf_t *buf = callback->awcb_buf; ++ arc_buf_hdr_t *hdr = buf->b_hdr; ++ ++ ASSERT(hdr->b_acb == NULL); ++ ++ if (zio->io_error == 0) { ++ hdr->b_dva = *BP_IDENTITY(zio->io_bp); ++ hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); ++ hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; ++ } else { ++ ASSERT(BUF_EMPTY(hdr)); ++ } ++ ++ /* ++ * If the block to be written was all-zero, we may have ++ * compressed it away. In this case no write was performed ++ * so there will be no dva/birth/checksum. The buffer must ++ * therefore remain anonymous (and uncached). ++ */ ++ if (!BUF_EMPTY(hdr)) { ++ arc_buf_hdr_t *exists; ++ kmutex_t *hash_lock; ++ ++ ASSERT(zio->io_error == 0); ++ ++ arc_cksum_verify(buf); ++ ++ exists = buf_hash_insert(hdr, &hash_lock); ++ if (exists) { ++ /* ++ * This can only happen if we overwrite for ++ * sync-to-convergence, because we remove ++ * buffers from the hash table when we arc_free(). ++ */ ++ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ++ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) ++ panic("bad overwrite, hdr=%p exists=%p", ++ (void *)hdr, (void *)exists); ++ ASSERT(refcount_is_zero(&exists->b_refcnt)); ++ arc_change_state(arc_anon, exists, hash_lock); ++ mutex_exit(hash_lock); ++ arc_hdr_destroy(exists); ++ exists = buf_hash_insert(hdr, &hash_lock); ++ ASSERT3P(exists, ==, NULL); ++ } else { ++ /* Dedup */ ++ ASSERT(hdr->b_datacnt == 1); ++ ASSERT(hdr->b_state == arc_anon); ++ ASSERT(BP_GET_DEDUP(zio->io_bp)); ++ ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); ++ } ++ } ++ hdr->b_flags &= ~ARC_IO_IN_PROGRESS; ++ /* if it's not anon, we are doing a scrub */ ++ if (!exists && hdr->b_state == arc_anon) ++ arc_access(hdr, hash_lock); ++ mutex_exit(hash_lock); ++ } else { ++ hdr->b_flags &= ~ARC_IO_IN_PROGRESS; ++ } ++ ++ ASSERT(!refcount_is_zero(&hdr->b_refcnt)); ++ callback->awcb_done(zio, buf, callback->awcb_private); ++ ++ kmem_free(callback, sizeof (arc_write_callback_t)); ++} ++ ++zio_t * ++arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ++ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, ++ arc_done_func_t *ready, arc_done_func_t *done, void *private, ++ int priority, int zio_flags, const zbookmark_t *zb) ++{ ++ arc_buf_hdr_t *hdr = buf->b_hdr; ++ arc_write_callback_t *callback; ++ zio_t *zio; ++ ++ ASSERT(ready != NULL); ++ ASSERT(done != NULL); ++ ASSERT(!HDR_IO_ERROR(hdr)); ++ ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); ++ ASSERT(hdr->b_acb == NULL); ++ if (l2arc) ++ hdr->b_flags |= ARC_L2CACHE; ++ callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_PUSHPAGE); ++ callback->awcb_ready = ready; ++ callback->awcb_done = done; ++ callback->awcb_private = private; ++ callback->awcb_buf = buf; ++ ++ zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, ++ arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); ++ ++ return (zio); ++} ++ ++static int ++arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) ++{ ++#ifdef _KERNEL ++ uint64_t available_memory; ++ ++ /* Easily reclaimable memory (free + inactive + arc-evictable) */ ++ available_memory = 
ptob(spl_kmem_availrmem()) + arc_evictable_memory(); ++ ++ if (available_memory <= zfs_write_limit_max) { ++ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); ++ DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); ++ return (EAGAIN); ++ } ++ ++ if (inflight_data > available_memory / 4) { ++ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); ++ DMU_TX_STAT_BUMP(dmu_tx_memory_inflight); ++ return (ERESTART); ++ } ++#endif ++ return (0); ++} ++ ++void ++arc_tempreserve_clear(uint64_t reserve) ++{ ++ atomic_add_64(&arc_tempreserve, -reserve); ++ ASSERT((int64_t)arc_tempreserve >= 0); ++} ++ ++int ++arc_tempreserve_space(uint64_t reserve, uint64_t txg) ++{ ++ int error; ++ uint64_t anon_size; ++ ++#ifdef ZFS_DEBUG ++ /* ++ * Once in a while, fail for no reason. Everything should cope. ++ */ ++ if (spa_get_random(10000) == 0) { ++ dprintf("forcing random failure\n"); ++ return (ERESTART); ++ } ++#endif ++ if (reserve > arc_c/4 && !arc_no_grow) ++ arc_c = MIN(arc_c_max, reserve * 4); ++ if (reserve > arc_c) { ++ DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); ++ return (ENOMEM); ++ } ++ ++ /* ++ * Don't count loaned bufs as in flight dirty data to prevent long ++ * network delays from blocking transactions that are ready to be ++ * assigned to a txg. ++ */ ++ anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); ++ ++ /* ++ * Writes will, almost always, require additional memory allocations ++ * in order to compress/encrypt/etc the data. We therefor need to ++ * make sure that there is sufficient available memory for this. ++ */ ++ if ((error = arc_memory_throttle(reserve, anon_size, txg))) ++ return (error); ++ ++ /* ++ * Throttle writes when the amount of dirty data in the cache ++ * gets too large. We try to keep the cache less than half full ++ * of dirty blocks so that our sync times don't grow too large. ++ * Note: if two requests come in concurrently, we might let them ++ * both succeed, when one of them should fail. Not a huge deal. 
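For illustration only (not part of the patch; values are invented), a stand-alone sketch of the dirty-data throttle described above, applying the same half-of-arc_c / quarter-of-arc_c test used by arc_tempreserve_space():

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t arc_c           = 4ULL << 30;    /* ARC target size (sample)         */
	uint64_t arc_tempreserve = 256ULL << 20;  /* already reserved bytes (sample)  */
	uint64_t anon_size       = 1536ULL << 20; /* anonymous/dirty bytes (sample)   */
	uint64_t reserve         = 512ULL << 20;  /* new reservation request (sample) */

	/* Refuse when the request would push dirty data past half of arc_c
	 * while anonymous data alone already exceeds a quarter of arc_c. */
	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
	    anon_size > arc_c / 4)
		printf("throttled: caller should back off and retry\n");
	else
		printf("reservation accepted\n");
	return 0;
}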
++ */ ++ ++ if (reserve + arc_tempreserve + anon_size > arc_c / 2 && ++ anon_size > arc_c / 4) { ++ dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " ++ "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", ++ arc_tempreserve>>10, ++ arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, ++ arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, ++ reserve>>10, arc_c>>10); ++ DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); ++ return (ERESTART); ++ } ++ atomic_add_64(&arc_tempreserve, reserve); ++ return (0); ++} ++ ++static void ++arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, ++ kstat_named_t *evict_data, kstat_named_t *evict_metadata) ++{ ++ size->value.ui64 = state->arcs_size; ++ evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; ++ evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; ++} ++ ++static int ++arc_kstat_update(kstat_t *ksp, int rw) ++{ ++ arc_stats_t *as = ksp->ks_data; ++ ++ if (rw == KSTAT_WRITE) { ++ return (EACCES); ++ } else { ++ arc_kstat_update_state(arc_anon, ++ &as->arcstat_anon_size, ++ &as->arcstat_anon_evict_data, ++ &as->arcstat_anon_evict_metadata); ++ arc_kstat_update_state(arc_mru, ++ &as->arcstat_mru_size, ++ &as->arcstat_mru_evict_data, ++ &as->arcstat_mru_evict_metadata); ++ arc_kstat_update_state(arc_mru_ghost, ++ &as->arcstat_mru_ghost_size, ++ &as->arcstat_mru_ghost_evict_data, ++ &as->arcstat_mru_ghost_evict_metadata); ++ arc_kstat_update_state(arc_mfu, ++ &as->arcstat_mfu_size, ++ &as->arcstat_mfu_evict_data, ++ &as->arcstat_mfu_evict_metadata); ++ arc_kstat_update_state(arc_mfu_ghost, ++ &as->arcstat_mfu_ghost_size, ++ &as->arcstat_mfu_ghost_evict_data, ++ &as->arcstat_mfu_ghost_evict_metadata); ++ } ++ ++ return (0); ++} ++ ++void ++arc_init(void) ++{ ++ mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); ++ cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); ++ ++ /* Convert seconds to clock ticks */ ++ arc_min_prefetch_lifespan = 1 * hz; ++ ++ /* Start out with 1/8 of all memory */ ++ arc_c = physmem * PAGESIZE / 8; ++ ++#ifdef _KERNEL ++ /* ++ * On architectures where the physical memory can be larger ++ * than the addressable space (intel in 32-bit mode), we may ++ * need to limit the cache to 1/8 of VM size. ++ */ ++ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); ++ /* ++ * Register a shrinker to support synchronous (direct) memory ++ * reclaim from the arc. This is done to prevent kswapd from ++ * swapping out pages when it is preferable to shrink the arc. ++ */ ++ spl_register_shrinker(&arc_shrinker); ++#endif ++ ++ /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ ++ arc_c_min = MAX(arc_c / 4, 64<<20); ++ /* set max to 1/2 of all memory */ ++ arc_c_max = MAX(arc_c * 4, arc_c_max); ++ ++ /* ++ * Allow the tunables to override our calculations if they are ++ * reasonable (ie. 
over 64MB) ++ */ ++ if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) ++ arc_c_max = zfs_arc_max; ++ if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) ++ arc_c_min = zfs_arc_min; ++ ++ arc_c = arc_c_max; ++ arc_p = (arc_c >> 1); ++ ++ /* limit meta-data to 1/4 of the arc capacity */ ++ arc_meta_limit = arc_c_max / 4; ++ arc_meta_max = 0; ++ ++ /* Allow the tunable to override if it is reasonable */ ++ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) ++ arc_meta_limit = zfs_arc_meta_limit; ++ ++ if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) ++ arc_c_min = arc_meta_limit / 2; ++ ++ if (zfs_arc_grow_retry > 0) ++ arc_grow_retry = zfs_arc_grow_retry; ++ ++ if (zfs_arc_shrink_shift > 0) ++ arc_shrink_shift = zfs_arc_shrink_shift; ++ ++ if (zfs_arc_p_min_shift > 0) ++ arc_p_min_shift = zfs_arc_p_min_shift; ++ ++ if (zfs_arc_meta_prune > 0) ++ arc_meta_prune = zfs_arc_meta_prune; ++ ++ /* if kmem_flags are set, lets try to use less memory */ ++ if (kmem_debugging()) ++ arc_c = arc_c / 2; ++ if (arc_c < arc_c_min) ++ arc_c = arc_c_min; ++ ++ arc_anon = &ARC_anon; ++ arc_mru = &ARC_mru; ++ arc_mru_ghost = &ARC_mru_ghost; ++ arc_mfu = &ARC_mfu; ++ arc_mfu_ghost = &ARC_mfu_ghost; ++ arc_l2c_only = &ARC_l2c_only; ++ arc_size = 0; ++ ++ mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); ++ ++ list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], ++ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); ++ list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], ++ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); ++ list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], ++ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); ++ list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], ++ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); ++ list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], ++ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); ++ list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], ++ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); ++ list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], ++ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); ++ list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], ++ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); ++ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], ++ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); ++ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], ++ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); ++ ++ buf_init(); ++ ++ arc_thread_exit = 0; ++ list_create(&arc_prune_list, sizeof (arc_prune_t), ++ offsetof(arc_prune_t, p_node)); ++ arc_eviction_list = NULL; ++ mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); ++ bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); ++ ++ arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, ++ sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); ++ ++ if (arc_ksp != NULL) { ++ arc_ksp->ks_data = &arc_stats; ++ arc_ksp->ks_update = arc_kstat_update; ++ kstat_install(arc_ksp); ++ } ++ ++ (void) thread_create(NULL, 0, 
arc_adapt_thread, NULL, 0, &p0, ++ TS_RUN, minclsyspri); ++ ++ arc_dead = FALSE; ++ arc_warm = B_FALSE; ++ ++ if (zfs_write_limit_max == 0) ++ zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; ++ else ++ zfs_write_limit_shift = 0; ++ mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); ++} ++ ++void ++arc_fini(void) ++{ ++ arc_prune_t *p; ++ ++ mutex_enter(&arc_reclaim_thr_lock); ++#ifdef _KERNEL ++ spl_unregister_shrinker(&arc_shrinker); ++#endif /* _KERNEL */ ++ ++ arc_thread_exit = 1; ++ while (arc_thread_exit != 0) ++ cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); ++ mutex_exit(&arc_reclaim_thr_lock); ++ ++ arc_flush(NULL); ++ ++ arc_dead = TRUE; ++ ++ if (arc_ksp != NULL) { ++ kstat_delete(arc_ksp); ++ arc_ksp = NULL; ++ } ++ ++ mutex_enter(&arc_prune_mtx); ++ while ((p = list_head(&arc_prune_list)) != NULL) { ++ list_remove(&arc_prune_list, p); ++ refcount_remove(&p->p_refcnt, &arc_prune_list); ++ refcount_destroy(&p->p_refcnt); ++ kmem_free(p, sizeof (*p)); ++ } ++ mutex_exit(&arc_prune_mtx); ++ ++ list_destroy(&arc_prune_list); ++ mutex_destroy(&arc_prune_mtx); ++ mutex_destroy(&arc_eviction_mtx); ++ mutex_destroy(&arc_reclaim_thr_lock); ++ cv_destroy(&arc_reclaim_thr_cv); ++ ++ list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); ++ list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); ++ list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); ++ list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); ++ list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); ++ list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); ++ list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); ++ list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); ++ ++ mutex_destroy(&arc_anon->arcs_mtx); ++ mutex_destroy(&arc_mru->arcs_mtx); ++ mutex_destroy(&arc_mru_ghost->arcs_mtx); ++ mutex_destroy(&arc_mfu->arcs_mtx); ++ mutex_destroy(&arc_mfu_ghost->arcs_mtx); ++ mutex_destroy(&arc_l2c_only->arcs_mtx); ++ ++ mutex_destroy(&zfs_write_limit_lock); ++ ++ buf_fini(); ++ ++ ASSERT(arc_loaned_bytes == 0); ++} ++ ++/* ++ * Level 2 ARC ++ * ++ * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. ++ * It uses dedicated storage devices to hold cached data, which are populated ++ * using large infrequent writes. The main role of this cache is to boost ++ * the performance of random read workloads. The intended L2ARC devices ++ * include short-stroked disks, solid state disks, and other media with ++ * substantially faster read latency than disk. ++ * ++ * +-----------------------+ ++ * | ARC | ++ * +-----------------------+ ++ * | ^ ^ ++ * | | | ++ * l2arc_feed_thread() arc_read() ++ * | | | ++ * | l2arc read | ++ * V | | ++ * +---------------+ | ++ * | L2ARC | | ++ * +---------------+ | ++ * | ^ | ++ * l2arc_write() | | ++ * | | | ++ * V | | ++ * +-------+ +-------+ ++ * | vdev | | vdev | ++ * | cache | | cache | ++ * +-------+ +-------+ ++ * +=========+ .-----. ++ * : L2ARC : |-_____-| ++ * : devices : | Disks | ++ * +=========+ `-_____-' ++ * ++ * Read requests are satisfied from the following sources, in order: ++ * ++ * 1) ARC ++ * 2) vdev cache of L2ARC devices ++ * 3) L2ARC devices ++ * 4) vdev cache of disks ++ * 5) disks ++ * ++ * Some L2ARC device types exhibit extremely slow write performance. ++ * To accommodate for this there are some significant differences between ++ * the L2ARC and traditional cache design: ++ * ++ * 1. There is no eviction path from the ARC to the L2ARC. 
Evictions from ++ * the ARC behave as usual, freeing buffers and placing headers on ghost ++ * lists. The ARC does not send buffers to the L2ARC during eviction as ++ * this would add inflated write latencies for all ARC memory pressure. ++ * ++ * 2. The L2ARC attempts to cache data from the ARC before it is evicted. ++ * It does this by periodically scanning buffers from the eviction-end of ++ * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are ++ * not already there. It scans until a headroom of buffers is satisfied, ++ * which itself is a buffer for ARC eviction. The thread that does this is ++ * l2arc_feed_thread(), illustrated below; example sizes are included to ++ * provide a better sense of ratio than this diagram: ++ * ++ * head --> tail ++ * +---------------------+----------+ ++ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC ++ * +---------------------+----------+ | o L2ARC eligible ++ * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer ++ * +---------------------+----------+ | ++ * 15.9 Gbytes ^ 32 Mbytes | ++ * headroom | ++ * l2arc_feed_thread() ++ * | ++ * l2arc write hand <--[oooo]--' ++ * | 8 Mbyte ++ * | write max ++ * V ++ * +==============================+ ++ * L2ARC dev |####|#|###|###| |####| ... | ++ * +==============================+ ++ * 32 Gbytes ++ * ++ * 3. If an ARC buffer is copied to the L2ARC but then hit instead of ++ * evicted, then the L2ARC has cached a buffer much sooner than it probably ++ * needed to, potentially wasting L2ARC device bandwidth and storage. It is ++ * safe to say that this is an uncommon case, since buffers at the end of ++ * the ARC lists have moved there due to inactivity. ++ * ++ * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, ++ * then the L2ARC simply misses copying some buffers. This serves as a ++ * pressure valve to prevent heavy read workloads from both stalling the ARC ++ * with waits and clogging the L2ARC with writes. This also helps prevent ++ * the potential for the L2ARC to churn if it attempts to cache content too ++ * quickly, such as during backups of the entire pool. ++ * ++ * 5. After system boot and before the ARC has filled main memory, there are ++ * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru ++ * lists can remain mostly static. Instead of searching from tail of these ++ * lists as pictured, the l2arc_feed_thread() will search from the list heads ++ * for eligible buffers, greatly increasing its chance of finding them. ++ * ++ * The L2ARC device write speed is also boosted during this time so that ++ * the L2ARC warms up faster. Since there have been no ARC evictions yet, ++ * there are no L2ARC reads, and no fear of degrading read performance ++ * through increased writes. ++ * ++ * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that ++ * the vdev queue can aggregate them into larger and fewer writes. Each ++ * device is written to in a rotor fashion, sweeping writes through ++ * available space then repeating. ++ * ++ * 7. The L2ARC does not store dirty content. It never needs to flush ++ * write buffers back to disk based storage. ++ * ++ * 8. If an ARC buffer is written (and dirtied) which also exists in the ++ * L2ARC, the now stale L2ARC buffer is immediately dropped. 
++ * ++ * The performance of the L2ARC can be tweaked by a number of tunables, which ++ * may be necessary for different workloads: ++ * ++ * l2arc_write_max max write bytes per interval ++ * l2arc_write_boost extra write bytes during device warmup ++ * l2arc_noprefetch skip caching prefetched buffers ++ * l2arc_headroom number of max device writes to precache ++ * l2arc_feed_secs seconds between L2ARC writing ++ * ++ * Tunables may be removed or added as future performance improvements are ++ * integrated, and also may become zpool properties. ++ * ++ * There are three key functions that control how the L2ARC warms up: ++ * ++ * l2arc_write_eligible() check if a buffer is eligible to cache ++ * l2arc_write_size() calculate how much to write ++ * l2arc_write_interval() calculate sleep delay between writes ++ * ++ * These three functions determine what to write, how much, and how quickly ++ * to send writes. ++ */ ++ ++static boolean_t ++l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) ++{ ++ /* ++ * A buffer is *not* eligible for the L2ARC if it: ++ * 1. belongs to a different spa. ++ * 2. is already cached on the L2ARC. ++ * 3. has an I/O in progress (it may be an incomplete read). ++ * 4. is flagged not eligible (zfs property). ++ */ ++ if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || ++ HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) ++ return (B_FALSE); ++ ++ return (B_TRUE); ++} ++ ++static uint64_t ++l2arc_write_size(l2arc_dev_t *dev) ++{ ++ uint64_t size; ++ ++ size = dev->l2ad_write; ++ ++ if (arc_warm == B_FALSE) ++ size += dev->l2ad_boost; ++ ++ return (size); ++ ++} ++ ++static clock_t ++l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) ++{ ++ clock_t interval, next, now; ++ ++ /* ++ * If the ARC lists are busy, increase our write rate; if the ++ * lists are stale, idle back. This is achieved by checking ++ * how much we previously wrote - if it was more than half of ++ * what we wanted, schedule the next write much sooner. ++ */ ++ if (l2arc_feed_again && wrote > (wanted / 2)) ++ interval = (hz * l2arc_feed_min_ms) / 1000; ++ else ++ interval = hz * l2arc_feed_secs; ++ ++ now = ddi_get_lbolt(); ++ next = MAX(now, MIN(now + interval, began + interval)); ++ ++ return (next); ++} ++ ++static void ++l2arc_hdr_stat_add(void) ++{ ++ ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); ++ ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); ++} ++ ++static void ++l2arc_hdr_stat_remove(void) ++{ ++ ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); ++ ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); ++} ++ ++/* ++ * Cycle through L2ARC devices. This is how L2ARC load balances. ++ * If a device is returned, this also returns holding the spa config lock. ++ */ ++static l2arc_dev_t * ++l2arc_dev_get_next(void) ++{ ++ l2arc_dev_t *first, *next = NULL; ++ ++ /* ++ * Lock out the removal of spas (spa_namespace_lock), then removal ++ * of cache devices (l2arc_dev_mtx). Once a device has been selected, ++ * both locks will be dropped and a spa config lock held instead. 
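For illustration only (not part of the patch; tunable values are invented), a stand-alone sketch of the decision made by l2arc_write_interval() above: if more than half of the previously wanted bytes were written, the ARC lists are busy and the feed thread sleeps for the short interval, otherwise for the long one.

#include <stdio.h>

int main(void)
{
	int hz = 100;                      /* clock ticks per second (sample)         */
	int l2arc_feed_secs = 1;           /* idle interval in seconds (sample)       */
	int l2arc_feed_min_ms = 200;       /* busy interval in milliseconds (sample)  */
	int l2arc_feed_again = 1;          /* tunable enabling the fast path (sample) */
	unsigned long wanted = 8UL << 20;  /* bytes the last pass tried to write      */
	unsigned long wrote  = 6UL << 20;  /* bytes the last pass actually wrote      */
	int interval;

	/* Busy ARC lists (more than half of 'wanted' written) -> feed sooner. */
	if (l2arc_feed_again && wrote > (wanted / 2))
		interval = (hz * l2arc_feed_min_ms) / 1000;
	else
		interval = hz * l2arc_feed_secs;

	printf("sleep for %d ticks before the next L2ARC feed\n", interval);
	return 0;
}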
++ */ ++ mutex_enter(&spa_namespace_lock); ++ mutex_enter(&l2arc_dev_mtx); ++ ++ /* if there are no vdevs, there is nothing to do */ ++ if (l2arc_ndev == 0) ++ goto out; ++ ++ first = NULL; ++ next = l2arc_dev_last; ++ do { ++ /* loop around the list looking for a non-faulted vdev */ ++ if (next == NULL) { ++ next = list_head(l2arc_dev_list); ++ } else { ++ next = list_next(l2arc_dev_list, next); ++ if (next == NULL) ++ next = list_head(l2arc_dev_list); ++ } ++ ++ /* if we have come back to the start, bail out */ ++ if (first == NULL) ++ first = next; ++ else if (next == first) ++ break; ++ ++ } while (vdev_is_dead(next->l2ad_vdev)); ++ ++ /* if we were unable to find any usable vdevs, return NULL */ ++ if (vdev_is_dead(next->l2ad_vdev)) ++ next = NULL; ++ ++ l2arc_dev_last = next; ++ ++out: ++ mutex_exit(&l2arc_dev_mtx); ++ ++ /* ++ * Grab the config lock to prevent the 'next' device from being ++ * removed while we are writing to it. ++ */ ++ if (next != NULL) ++ spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); ++ mutex_exit(&spa_namespace_lock); ++ ++ return (next); ++} ++ ++/* ++ * Free buffers that were tagged for destruction. ++ */ ++static void ++l2arc_do_free_on_write(void) ++{ ++ list_t *buflist; ++ l2arc_data_free_t *df, *df_prev; ++ ++ mutex_enter(&l2arc_free_on_write_mtx); ++ buflist = l2arc_free_on_write; ++ ++ for (df = list_tail(buflist); df; df = df_prev) { ++ df_prev = list_prev(buflist, df); ++ ASSERT(df->l2df_data != NULL); ++ ASSERT(df->l2df_func != NULL); ++ df->l2df_func(df->l2df_data, df->l2df_size); ++ list_remove(buflist, df); ++ kmem_free(df, sizeof (l2arc_data_free_t)); ++ } ++ ++ mutex_exit(&l2arc_free_on_write_mtx); ++} ++ ++/* ++ * A write to a cache device has completed. Update all headers to allow ++ * reads from these buffers to begin. ++ */ ++static void ++l2arc_write_done(zio_t *zio) ++{ ++ l2arc_write_callback_t *cb; ++ l2arc_dev_t *dev; ++ list_t *buflist; ++ arc_buf_hdr_t *head, *ab, *ab_prev; ++ l2arc_buf_hdr_t *abl2; ++ kmutex_t *hash_lock; ++ ++ cb = zio->io_private; ++ ASSERT(cb != NULL); ++ dev = cb->l2wcb_dev; ++ ASSERT(dev != NULL); ++ head = cb->l2wcb_head; ++ ASSERT(head != NULL); ++ buflist = dev->l2ad_buflist; ++ ASSERT(buflist != NULL); ++ DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, ++ l2arc_write_callback_t *, cb); ++ ++ if (zio->io_error != 0) ++ ARCSTAT_BUMP(arcstat_l2_writes_error); ++ ++ mutex_enter(&l2arc_buflist_mtx); ++ ++ /* ++ * All writes completed, or an error was hit. ++ */ ++ for (ab = list_prev(buflist, head); ab; ab = ab_prev) { ++ ab_prev = list_prev(buflist, ab); ++ ++ hash_lock = HDR_LOCK(ab); ++ if (!mutex_tryenter(hash_lock)) { ++ /* ++ * This buffer misses out. It may be in a stage ++ * of eviction. Its ARC_L2_WRITING flag will be ++ * left set, denying reads to this buffer. ++ */ ++ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); ++ continue; ++ } ++ ++ if (zio->io_error != 0) { ++ /* ++ * Error - drop L2ARC entry. ++ */ ++ list_remove(buflist, ab); ++ abl2 = ab->b_l2hdr; ++ ab->b_l2hdr = NULL; ++ kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); ++ ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); ++ } ++ ++ /* ++ * Allow ARC to begin reads to this L2ARC entry. ++ */ ++ ab->b_flags &= ~ARC_L2_WRITING; ++ ++ mutex_exit(hash_lock); ++ } ++ ++ atomic_inc_64(&l2arc_writes_done); ++ list_remove(buflist, head); ++ kmem_cache_free(hdr_cache, head); ++ mutex_exit(&l2arc_buflist_mtx); ++ ++ l2arc_do_free_on_write(); ++ ++ kmem_free(cb, sizeof (l2arc_write_callback_t)); ++} ++ ++/* ++ * A read to a cache device completed. 
Validate buffer contents before ++ * handing over to the regular ARC routines. ++ */ ++static void ++l2arc_read_done(zio_t *zio) ++{ ++ l2arc_read_callback_t *cb; ++ arc_buf_hdr_t *hdr; ++ arc_buf_t *buf; ++ kmutex_t *hash_lock; ++ int equal; ++ ++ ASSERT(zio->io_vd != NULL); ++ ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); ++ ++ spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); ++ ++ cb = zio->io_private; ++ ASSERT(cb != NULL); ++ buf = cb->l2rcb_buf; ++ ASSERT(buf != NULL); ++ ++ hash_lock = HDR_LOCK(buf->b_hdr); ++ mutex_enter(hash_lock); ++ hdr = buf->b_hdr; ++ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ++ ++ /* ++ * Check this survived the L2ARC journey. ++ */ ++ equal = arc_cksum_equal(buf); ++ if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { ++ mutex_exit(hash_lock); ++ zio->io_private = buf; ++ zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ ++ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ ++ arc_read_done(zio); ++ } else { ++ mutex_exit(hash_lock); ++ /* ++ * Buffer didn't survive caching. Increment stats and ++ * reissue to the original storage device. ++ */ ++ if (zio->io_error != 0) { ++ ARCSTAT_BUMP(arcstat_l2_io_error); ++ } else { ++ zio->io_error = EIO; ++ } ++ if (!equal) ++ ARCSTAT_BUMP(arcstat_l2_cksum_bad); ++ ++ /* ++ * If there's no waiter, issue an async i/o to the primary ++ * storage now. If there *is* a waiter, the caller must ++ * issue the i/o in a context where it's OK to block. ++ */ ++ if (zio->io_waiter == NULL) { ++ zio_t *pio = zio_unique_parent(zio); ++ ++ ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); ++ ++ zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, ++ buf->b_data, zio->io_size, arc_read_done, buf, ++ zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); ++ } ++ } ++ ++ kmem_free(cb, sizeof (l2arc_read_callback_t)); ++} ++ ++/* ++ * This is the list priority from which the L2ARC will search for pages to ++ * cache. This is used within loops (0..3) to cycle through lists in the ++ * desired order. This order can have a significant effect on cache ++ * performance. ++ * ++ * Currently the metadata lists are hit first, MFU then MRU, followed by ++ * the data lists. This function returns a locked list, and also returns ++ * the lock pointer. ++ */ ++static list_t * ++l2arc_list_locked(int list_num, kmutex_t **lock) ++{ ++ list_t *list = NULL; ++ ++ ASSERT(list_num >= 0 && list_num <= 3); ++ ++ switch (list_num) { ++ case 0: ++ list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; ++ *lock = &arc_mfu->arcs_mtx; ++ break; ++ case 1: ++ list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; ++ *lock = &arc_mru->arcs_mtx; ++ break; ++ case 2: ++ list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; ++ *lock = &arc_mfu->arcs_mtx; ++ break; ++ case 3: ++ list = &arc_mru->arcs_list[ARC_BUFC_DATA]; ++ *lock = &arc_mru->arcs_mtx; ++ break; ++ } ++ ++ ASSERT(!(MUTEX_HELD(*lock))); ++ mutex_enter(*lock); ++ return (list); ++} ++ ++/* ++ * Evict buffers from the device write hand to the distance specified in ++ * bytes. This distance may span populated buffers, it may span nothing. ++ * This is clearing a region on the L2ARC device ready for writing. ++ * If the 'all' boolean is set, every buffer is evicted. 
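++ * As a rough illustration: with the write hand at offset H and a
++ * distance D, headers whose b_daddr falls in roughly [H, H + D] are
++ * dropped from the L2ARC; the data on the cache device is simply left
++ * to be overwritten, nothing is read back.  When the hand is within
++ * 2 * D of l2ad_end the window is extended to the end of the device
++ * instead, since the hand is about to wrap around to l2ad_start.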
++ */ ++static void ++l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) ++{ ++ list_t *buflist; ++ l2arc_buf_hdr_t *abl2; ++ arc_buf_hdr_t *ab, *ab_prev; ++ kmutex_t *hash_lock; ++ uint64_t taddr; ++ ++ buflist = dev->l2ad_buflist; ++ ++ if (buflist == NULL) ++ return; ++ ++ if (!all && dev->l2ad_first) { ++ /* ++ * This is the first sweep through the device. There is ++ * nothing to evict. ++ */ ++ return; ++ } ++ ++ if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { ++ /* ++ * When nearing the end of the device, evict to the end ++ * before the device write hand jumps to the start. ++ */ ++ taddr = dev->l2ad_end; ++ } else { ++ taddr = dev->l2ad_hand + distance; ++ } ++ DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, ++ uint64_t, taddr, boolean_t, all); ++ ++top: ++ mutex_enter(&l2arc_buflist_mtx); ++ for (ab = list_tail(buflist); ab; ab = ab_prev) { ++ ab_prev = list_prev(buflist, ab); ++ ++ hash_lock = HDR_LOCK(ab); ++ if (!mutex_tryenter(hash_lock)) { ++ /* ++ * Missed the hash lock. Retry. ++ */ ++ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); ++ mutex_exit(&l2arc_buflist_mtx); ++ mutex_enter(hash_lock); ++ mutex_exit(hash_lock); ++ goto top; ++ } ++ ++ if (HDR_L2_WRITE_HEAD(ab)) { ++ /* ++ * We hit a write head node. Leave it for ++ * l2arc_write_done(). ++ */ ++ list_remove(buflist, ab); ++ mutex_exit(hash_lock); ++ continue; ++ } ++ ++ if (!all && ab->b_l2hdr != NULL && ++ (ab->b_l2hdr->b_daddr > taddr || ++ ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { ++ /* ++ * We've evicted to the target address, ++ * or the end of the device. ++ */ ++ mutex_exit(hash_lock); ++ break; ++ } ++ ++ if (HDR_FREE_IN_PROGRESS(ab)) { ++ /* ++ * Already on the path to destruction. ++ */ ++ mutex_exit(hash_lock); ++ continue; ++ } ++ ++ if (ab->b_state == arc_l2c_only) { ++ ASSERT(!HDR_L2_READING(ab)); ++ /* ++ * This doesn't exist in the ARC. Destroy. ++ * arc_hdr_destroy() will call list_remove() ++ * and decrement arcstat_l2_size. ++ */ ++ arc_change_state(arc_anon, ab, hash_lock); ++ arc_hdr_destroy(ab); ++ } else { ++ /* ++ * Invalidate issued or about to be issued ++ * reads, since we may be about to write ++ * over this location. ++ */ ++ if (HDR_L2_READING(ab)) { ++ ARCSTAT_BUMP(arcstat_l2_evict_reading); ++ ab->b_flags |= ARC_L2_EVICTED; ++ } ++ ++ /* ++ * Tell ARC this no longer exists in L2ARC. ++ */ ++ if (ab->b_l2hdr != NULL) { ++ abl2 = ab->b_l2hdr; ++ ab->b_l2hdr = NULL; ++ kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); ++ ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); ++ } ++ list_remove(buflist, ab); ++ ++ /* ++ * This may have been leftover after a ++ * failed write. ++ */ ++ ab->b_flags &= ~ARC_L2_WRITING; ++ } ++ mutex_exit(hash_lock); ++ } ++ mutex_exit(&l2arc_buflist_mtx); ++ ++ vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); ++ dev->l2ad_evict = taddr; ++} ++ ++/* ++ * Find and write ARC buffers to the L2ARC device. ++ * ++ * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid ++ * for reading until they have completed writing. 
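++ * In outline (see the code below): each of the four ARC lists returned
++ * by l2arc_list_locked() is walked, buffers failing
++ * l2arc_write_eligible() are skipped, an l2arc_buf_hdr_t recording the
++ * current device hand address is attached to each chosen header, and a
++ * physical write is issued at that address with zio_write_phys().  The
++ * hand then advances by the device-aligned size of the buffer, and
++ * l2arc_write_done() clears ARC_L2_WRITING once the zio completes.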
++ */ ++static uint64_t ++l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ++{ ++ arc_buf_hdr_t *ab, *ab_prev, *head; ++ l2arc_buf_hdr_t *hdrl2; ++ list_t *list; ++ uint64_t passed_sz, write_sz, buf_sz, headroom; ++ void *buf_data; ++ kmutex_t *hash_lock, *list_lock = NULL; ++ boolean_t have_lock, full; ++ l2arc_write_callback_t *cb; ++ zio_t *pio, *wzio; ++ uint64_t guid = spa_load_guid(spa); ++ int try; ++ ++ ASSERT(dev->l2ad_vdev != NULL); ++ ++ pio = NULL; ++ write_sz = 0; ++ full = B_FALSE; ++ head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); ++ head->b_flags |= ARC_L2_WRITE_HEAD; ++ ++ /* ++ * Copy buffers for L2ARC writing. ++ */ ++ mutex_enter(&l2arc_buflist_mtx); ++ for (try = 0; try <= 3; try++) { ++ list = l2arc_list_locked(try, &list_lock); ++ passed_sz = 0; ++ ++ /* ++ * L2ARC fast warmup. ++ * ++ * Until the ARC is warm and starts to evict, read from the ++ * head of the ARC lists rather than the tail. ++ */ ++ headroom = target_sz * l2arc_headroom; ++ if (arc_warm == B_FALSE) ++ ab = list_head(list); ++ else ++ ab = list_tail(list); ++ ++ for (; ab; ab = ab_prev) { ++ if (arc_warm == B_FALSE) ++ ab_prev = list_next(list, ab); ++ else ++ ab_prev = list_prev(list, ab); ++ ++ hash_lock = HDR_LOCK(ab); ++ have_lock = MUTEX_HELD(hash_lock); ++ if (!have_lock && !mutex_tryenter(hash_lock)) { ++ /* ++ * Skip this buffer rather than waiting. ++ */ ++ continue; ++ } ++ ++ passed_sz += ab->b_size; ++ if (passed_sz > headroom) { ++ /* ++ * Searched too far. ++ */ ++ mutex_exit(hash_lock); ++ break; ++ } ++ ++ if (!l2arc_write_eligible(guid, ab)) { ++ mutex_exit(hash_lock); ++ continue; ++ } ++ ++ if ((write_sz + ab->b_size) > target_sz) { ++ full = B_TRUE; ++ mutex_exit(hash_lock); ++ break; ++ } ++ ++ if (pio == NULL) { ++ /* ++ * Insert a dummy header on the buflist so ++ * l2arc_write_done() can find where the ++ * write buffers begin without searching. ++ */ ++ list_insert_head(dev->l2ad_buflist, head); ++ ++ cb = kmem_alloc(sizeof (l2arc_write_callback_t), ++ KM_PUSHPAGE); ++ cb->l2wcb_dev = dev; ++ cb->l2wcb_head = head; ++ pio = zio_root(spa, l2arc_write_done, cb, ++ ZIO_FLAG_CANFAIL); ++ } ++ ++ /* ++ * Create and add a new L2ARC header. ++ */ ++ hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), ++ KM_PUSHPAGE); ++ hdrl2->b_dev = dev; ++ hdrl2->b_daddr = dev->l2ad_hand; ++ ++ ab->b_flags |= ARC_L2_WRITING; ++ ab->b_l2hdr = hdrl2; ++ list_insert_head(dev->l2ad_buflist, ab); ++ buf_data = ab->b_buf->b_data; ++ buf_sz = ab->b_size; ++ ++ /* ++ * Compute and store the buffer cksum before ++ * writing. On debug the cksum is verified first. ++ */ ++ arc_cksum_verify(ab->b_buf); ++ arc_cksum_compute(ab->b_buf, B_TRUE); ++ ++ mutex_exit(hash_lock); ++ ++ wzio = zio_write_phys(pio, dev->l2ad_vdev, ++ dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, ++ NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ++ ZIO_FLAG_CANFAIL, B_FALSE); ++ ++ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, ++ zio_t *, wzio); ++ (void) zio_nowait(wzio); ++ ++ /* ++ * Keep the clock hand suitably device-aligned. 
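++			 * For example, on a cache device with 4 KB sectors a
++			 * 512-byte buffer still occupies a full 4 KB on the
++			 * device, so the hand must advance by the asize from
++			 * vdev_psize_to_asize() rather than by the logical
++			 * buffer size (illustrative figures; the rounding
++			 * depends on the vdev's ashift).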
++ */ ++ buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); ++ ++ write_sz += buf_sz; ++ dev->l2ad_hand += buf_sz; ++ } ++ ++ mutex_exit(list_lock); ++ ++ if (full == B_TRUE) ++ break; ++ } ++ mutex_exit(&l2arc_buflist_mtx); ++ ++ if (pio == NULL) { ++ ASSERT3U(write_sz, ==, 0); ++ kmem_cache_free(hdr_cache, head); ++ return (0); ++ } ++ ++ ASSERT3U(write_sz, <=, target_sz); ++ ARCSTAT_BUMP(arcstat_l2_writes_sent); ++ ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); ++ ARCSTAT_INCR(arcstat_l2_size, write_sz); ++ vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); ++ ++ /* ++ * Bump device hand to the device start if it is approaching the end. ++ * l2arc_evict() will already have evicted ahead for this case. ++ */ ++ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { ++ vdev_space_update(dev->l2ad_vdev, ++ dev->l2ad_end - dev->l2ad_hand, 0, 0); ++ dev->l2ad_hand = dev->l2ad_start; ++ dev->l2ad_evict = dev->l2ad_start; ++ dev->l2ad_first = B_FALSE; ++ } ++ ++ dev->l2ad_writing = B_TRUE; ++ (void) zio_wait(pio); ++ dev->l2ad_writing = B_FALSE; ++ ++ return (write_sz); ++} ++ ++/* ++ * This thread feeds the L2ARC at regular intervals. This is the beating ++ * heart of the L2ARC. ++ */ ++static void ++l2arc_feed_thread(void) ++{ ++ callb_cpr_t cpr; ++ l2arc_dev_t *dev; ++ spa_t *spa; ++ uint64_t size, wrote; ++ clock_t begin, next = ddi_get_lbolt(); ++ ++ CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); ++ ++ mutex_enter(&l2arc_feed_thr_lock); ++ ++ while (l2arc_thread_exit == 0) { ++ CALLB_CPR_SAFE_BEGIN(&cpr); ++ (void) cv_timedwait_interruptible(&l2arc_feed_thr_cv, ++ &l2arc_feed_thr_lock, next); ++ CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); ++ next = ddi_get_lbolt() + hz; ++ ++ /* ++ * Quick check for L2ARC devices. ++ */ ++ mutex_enter(&l2arc_dev_mtx); ++ if (l2arc_ndev == 0) { ++ mutex_exit(&l2arc_dev_mtx); ++ continue; ++ } ++ mutex_exit(&l2arc_dev_mtx); ++ begin = ddi_get_lbolt(); ++ ++ /* ++ * This selects the next l2arc device to write to, and in ++ * doing so the next spa to feed from: dev->l2ad_spa. This ++ * will return NULL if there are now no l2arc devices or if ++ * they are all faulted. ++ * ++ * If a device is returned, its spa's config lock is also ++ * held to prevent device removal. l2arc_dev_get_next() ++ * will grab and release l2arc_dev_mtx. ++ */ ++ if ((dev = l2arc_dev_get_next()) == NULL) ++ continue; ++ ++ spa = dev->l2ad_spa; ++ ASSERT(spa != NULL); ++ ++ /* ++ * If the pool is read-only then force the feed thread to ++ * sleep a little longer. ++ */ ++ if (!spa_writeable(spa)) { ++ next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; ++ spa_config_exit(spa, SCL_L2ARC, dev); ++ continue; ++ } ++ ++ /* ++ * Avoid contributing to memory pressure. ++ */ ++ if (arc_no_grow) { ++ ARCSTAT_BUMP(arcstat_l2_abort_lowmem); ++ spa_config_exit(spa, SCL_L2ARC, dev); ++ continue; ++ } ++ ++ ARCSTAT_BUMP(arcstat_l2_feeds); ++ ++ size = l2arc_write_size(dev); ++ ++ /* ++ * Evict L2ARC buffers that will be overwritten. ++ */ ++ l2arc_evict(dev, size, B_FALSE); ++ ++ /* ++ * Write ARC buffers. ++ */ ++ wrote = l2arc_write_buffers(spa, dev, size); ++ ++ /* ++ * Calculate interval between writes. 
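++		 * As a rough example, assuming the usual defaults of
++		 * l2arc_feed_secs = 1 and l2arc_feed_min_ms = 200 (the values
++		 * in a given build may differ): if this pass wrote more than
++		 * half of what it wanted and l2arc_feed_again is set,
++		 * l2arc_write_interval() schedules the next wakeup about
++		 * 200 ms after 'begin'; otherwise it is about one second
++		 * later.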
++ */ ++ next = l2arc_write_interval(begin, size, wrote); ++ spa_config_exit(spa, SCL_L2ARC, dev); ++ } ++ ++ l2arc_thread_exit = 0; ++ cv_broadcast(&l2arc_feed_thr_cv); ++ CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ ++ thread_exit(); ++} ++ ++boolean_t ++l2arc_vdev_present(vdev_t *vd) ++{ ++ l2arc_dev_t *dev; ++ ++ mutex_enter(&l2arc_dev_mtx); ++ for (dev = list_head(l2arc_dev_list); dev != NULL; ++ dev = list_next(l2arc_dev_list, dev)) { ++ if (dev->l2ad_vdev == vd) ++ break; ++ } ++ mutex_exit(&l2arc_dev_mtx); ++ ++ return (dev != NULL); ++} ++ ++/* ++ * Add a vdev for use by the L2ARC. By this point the spa has already ++ * validated the vdev and opened it. ++ */ ++void ++l2arc_add_vdev(spa_t *spa, vdev_t *vd) ++{ ++ l2arc_dev_t *adddev; ++ ++ ASSERT(!l2arc_vdev_present(vd)); ++ ++ /* ++ * Create a new l2arc device entry. ++ */ ++ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); ++ adddev->l2ad_spa = spa; ++ adddev->l2ad_vdev = vd; ++ adddev->l2ad_write = l2arc_write_max; ++ adddev->l2ad_boost = l2arc_write_boost; ++ adddev->l2ad_start = VDEV_LABEL_START_SIZE; ++ adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); ++ adddev->l2ad_hand = adddev->l2ad_start; ++ adddev->l2ad_evict = adddev->l2ad_start; ++ adddev->l2ad_first = B_TRUE; ++ adddev->l2ad_writing = B_FALSE; ++ list_link_init(&adddev->l2ad_node); ++ ASSERT3U(adddev->l2ad_write, >, 0); ++ ++ /* ++ * This is a list of all ARC buffers that are still valid on the ++ * device. ++ */ ++ adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); ++ list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), ++ offsetof(arc_buf_hdr_t, b_l2node)); ++ ++ vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); ++ ++ /* ++ * Add device to global list ++ */ ++ mutex_enter(&l2arc_dev_mtx); ++ list_insert_head(l2arc_dev_list, adddev); ++ atomic_inc_64(&l2arc_ndev); ++ mutex_exit(&l2arc_dev_mtx); ++} ++ ++/* ++ * Remove a vdev from the L2ARC. ++ */ ++void ++l2arc_remove_vdev(vdev_t *vd) ++{ ++ l2arc_dev_t *dev, *nextdev, *remdev = NULL; ++ ++ /* ++ * Find the device by vdev ++ */ ++ mutex_enter(&l2arc_dev_mtx); ++ for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { ++ nextdev = list_next(l2arc_dev_list, dev); ++ if (vd == dev->l2ad_vdev) { ++ remdev = dev; ++ break; ++ } ++ } ++ ASSERT(remdev != NULL); ++ ++ /* ++ * Remove device from global list ++ */ ++ list_remove(l2arc_dev_list, remdev); ++ l2arc_dev_last = NULL; /* may have been invalidated */ ++ atomic_dec_64(&l2arc_ndev); ++ mutex_exit(&l2arc_dev_mtx); ++ ++ /* ++ * Clear all buflists and ARC references. L2ARC device flush. 
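++	 * The l2arc_evict(remdev, 0, B_TRUE) call below passes 'all', so the
++	 * address window is ignored and every header on the device's buflist
++	 * is dropped, which is what we want when the cache vdev is leaving
++	 * the pool.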
++ */ ++ l2arc_evict(remdev, 0, B_TRUE); ++ list_destroy(remdev->l2ad_buflist); ++ kmem_free(remdev->l2ad_buflist, sizeof (list_t)); ++ kmem_free(remdev, sizeof (l2arc_dev_t)); ++} ++ ++void ++l2arc_init(void) ++{ ++ l2arc_thread_exit = 0; ++ l2arc_ndev = 0; ++ l2arc_writes_sent = 0; ++ l2arc_writes_done = 0; ++ ++ mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); ++ cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); ++ mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); ++ ++ l2arc_dev_list = &L2ARC_dev_list; ++ l2arc_free_on_write = &L2ARC_free_on_write; ++ list_create(l2arc_dev_list, sizeof (l2arc_dev_t), ++ offsetof(l2arc_dev_t, l2ad_node)); ++ list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), ++ offsetof(l2arc_data_free_t, l2df_list_node)); ++} ++ ++void ++l2arc_fini(void) ++{ ++ /* ++ * This is called from dmu_fini(), which is called from spa_fini(); ++ * Because of this, we can assume that all l2arc devices have ++ * already been removed when the pools themselves were removed. ++ */ ++ ++ l2arc_do_free_on_write(); ++ ++ mutex_destroy(&l2arc_feed_thr_lock); ++ cv_destroy(&l2arc_feed_thr_cv); ++ mutex_destroy(&l2arc_dev_mtx); ++ mutex_destroy(&l2arc_buflist_mtx); ++ mutex_destroy(&l2arc_free_on_write_mtx); ++ ++ list_destroy(l2arc_dev_list); ++ list_destroy(l2arc_free_on_write); ++} ++ ++void ++l2arc_start(void) ++{ ++ if (!(spa_mode_global & FWRITE)) ++ return; ++ ++ (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, ++ TS_RUN, minclsyspri); ++} ++ ++void ++l2arc_stop(void) ++{ ++ if (!(spa_mode_global & FWRITE)) ++ return; ++ ++ mutex_enter(&l2arc_feed_thr_lock); ++ cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ ++ l2arc_thread_exit = 1; ++ while (l2arc_thread_exit != 0) ++ cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); ++ mutex_exit(&l2arc_feed_thr_lock); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(arc_read); ++EXPORT_SYMBOL(arc_buf_remove_ref); ++EXPORT_SYMBOL(arc_getbuf_func); ++EXPORT_SYMBOL(arc_add_prune_callback); ++EXPORT_SYMBOL(arc_remove_prune_callback); ++ ++module_param(zfs_arc_min, ulong, 0444); ++MODULE_PARM_DESC(zfs_arc_min, "Min arc size"); ++ ++module_param(zfs_arc_max, ulong, 0444); ++MODULE_PARM_DESC(zfs_arc_max, "Max arc size"); ++ ++module_param(zfs_arc_meta_limit, ulong, 0444); ++MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size"); ++ ++module_param(zfs_arc_meta_prune, int, 0444); ++MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune"); ++ ++module_param(zfs_arc_grow_retry, int, 0444); ++MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); ++ ++module_param(zfs_arc_shrink_shift, int, 0444); ++MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)"); ++ ++module_param(zfs_arc_p_min_shift, int, 0444); ++MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p"); ++ ++module_param(l2arc_write_max, ulong, 0444); ++MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval"); ++ ++module_param(l2arc_write_boost, ulong, 0444); ++MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup"); ++ ++module_param(l2arc_headroom, ulong, 0444); ++MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache"); ++ ++module_param(l2arc_feed_secs, ulong, 0444); ++MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing"); ++ 
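++/*
++ * Usage sketch (hypothetical values, for illustration only): when ZFS is
++ * built as the 'zfs' module these read-only parameters are set at load
++ * time, e.g.
++ *
++ *	modprobe zfs l2arc_write_max=16777216 l2arc_feed_secs=1
++ *
++ * or, when built into the kernel, on the kernel command line as
++ * zfs.l2arc_write_max=16777216.  The current values can be read back
++ * from /sys/module/zfs/parameters/.
++ */
++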
++module_param(l2arc_feed_min_ms, ulong, 0444); ++MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds"); ++ ++module_param(l2arc_noprefetch, int, 0444); ++MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers"); ++ ++module_param(l2arc_feed_again, int, 0444); ++MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup"); ++ ++module_param(l2arc_norw, int, 0444); ++MODULE_PARM_DESC(l2arc_norw, "No reads during writes"); ++ ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/bplist.c linux-3.2.33-go/fs/zfs/zfs/bplist.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/bplist.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/bplist.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,69 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++ ++ ++void ++bplist_create(bplist_t *bpl) ++{ ++ mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); ++ list_create(&bpl->bpl_list, sizeof (bplist_entry_t), ++ offsetof(bplist_entry_t, bpe_node)); ++} ++ ++void ++bplist_destroy(bplist_t *bpl) ++{ ++ list_destroy(&bpl->bpl_list); ++ mutex_destroy(&bpl->bpl_lock); ++} ++ ++void ++bplist_append(bplist_t *bpl, const blkptr_t *bp) ++{ ++ bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_PUSHPAGE); ++ ++ mutex_enter(&bpl->bpl_lock); ++ bpe->bpe_blk = *bp; ++ list_insert_tail(&bpl->bpl_list, bpe); ++ mutex_exit(&bpl->bpl_lock); ++} ++ ++void ++bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) ++{ ++ bplist_entry_t *bpe; ++ ++ mutex_enter(&bpl->bpl_lock); ++ while ((bpe = list_head(&bpl->bpl_list))) { ++ list_remove(&bpl->bpl_list, bpe); ++ mutex_exit(&bpl->bpl_lock); ++ func(arg, &bpe->bpe_blk, tx); ++ kmem_free(bpe, sizeof (*bpe)); ++ mutex_enter(&bpl->bpl_lock); ++ } ++ mutex_exit(&bpl->bpl_lock); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/bpobj.c linux-3.2.33-go/fs/zfs/zfs/bpobj.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/bpobj.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/bpobj.c 2012-11-16 23:25:34.350039322 +0100 +@@ -0,0 +1,500 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. 
++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++uint64_t ++bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) ++{ ++ int size; ++ ++ if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) ++ size = BPOBJ_SIZE_V0; ++ else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) ++ size = BPOBJ_SIZE_V1; ++ else ++ size = sizeof (bpobj_phys_t); ++ ++ return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, ++ DMU_OT_BPOBJ_HDR, size, tx)); ++} ++ ++void ++bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) ++{ ++ int64_t i; ++ bpobj_t bpo; ++ dmu_object_info_t doi; ++ int epb; ++ dmu_buf_t *dbuf = NULL; ++ ++ VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); ++ ++ mutex_enter(&bpo.bpo_lock); ++ ++ if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) ++ goto out; ++ ++ VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); ++ epb = doi.doi_data_block_size / sizeof (uint64_t); ++ ++ for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { ++ uint64_t *objarray; ++ uint64_t offset, blkoff; ++ ++ offset = i * sizeof (uint64_t); ++ blkoff = P2PHASE(i, epb); ++ ++ if (dbuf == NULL || dbuf->db_offset > offset) { ++ if (dbuf) ++ dmu_buf_rele(dbuf, FTAG); ++ VERIFY3U(0, ==, dmu_buf_hold(os, ++ bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); ++ } ++ ++ ASSERT3U(offset, >=, dbuf->db_offset); ++ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); ++ ++ objarray = dbuf->db_data; ++ bpobj_free(os, objarray[blkoff], tx); ++ } ++ if (dbuf) { ++ dmu_buf_rele(dbuf, FTAG); ++ dbuf = NULL; ++ } ++ VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); ++ ++out: ++ mutex_exit(&bpo.bpo_lock); ++ bpobj_close(&bpo); ++ ++ VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); ++} ++ ++int ++bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) ++{ ++ dmu_object_info_t doi; ++ int err; ++ ++ err = dmu_object_info(os, object, &doi); ++ if (err) ++ return (err); ++ ++ bzero(bpo, sizeof (*bpo)); ++ mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ ASSERT(bpo->bpo_dbuf == NULL); ++ ASSERT(bpo->bpo_phys == NULL); ++ ASSERT(object != 0); ++ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); ++ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); ++ ++ err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); ++ if (err) ++ return (err); ++ ++ bpo->bpo_os = os; ++ bpo->bpo_object = object; ++ bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; ++ bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); ++ bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); ++ bpo->bpo_phys = bpo->bpo_dbuf->db_data; ++ return (0); ++} ++ ++void ++bpobj_close(bpobj_t *bpo) ++{ ++ /* Lame workaround for closing a bpobj that was never opened. 
*/ ++ if (bpo->bpo_object == 0) ++ return; ++ ++ dmu_buf_rele(bpo->bpo_dbuf, bpo); ++ if (bpo->bpo_cached_dbuf != NULL) ++ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); ++ bpo->bpo_dbuf = NULL; ++ bpo->bpo_phys = NULL; ++ bpo->bpo_cached_dbuf = NULL; ++ bpo->bpo_object = 0; ++ ++ mutex_destroy(&bpo->bpo_lock); ++} ++ ++static int ++bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, ++ boolean_t free) ++{ ++ dmu_object_info_t doi; ++ int epb; ++ int64_t i; ++ int err = 0; ++ dmu_buf_t *dbuf = NULL; ++ ++ mutex_enter(&bpo->bpo_lock); ++ ++ if (free) ++ dmu_buf_will_dirty(bpo->bpo_dbuf, tx); ++ ++ for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { ++ blkptr_t *bparray; ++ blkptr_t *bp; ++ uint64_t offset, blkoff; ++ ++ offset = i * sizeof (blkptr_t); ++ blkoff = P2PHASE(i, bpo->bpo_epb); ++ ++ if (dbuf == NULL || dbuf->db_offset > offset) { ++ if (dbuf) ++ dmu_buf_rele(dbuf, FTAG); ++ err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, ++ FTAG, &dbuf, 0); ++ if (err) ++ break; ++ } ++ ++ ASSERT3U(offset, >=, dbuf->db_offset); ++ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); ++ ++ bparray = dbuf->db_data; ++ bp = &bparray[blkoff]; ++ err = func(arg, bp, tx); ++ if (err) ++ break; ++ if (free) { ++ bpo->bpo_phys->bpo_bytes -= ++ bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); ++ ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); ++ if (bpo->bpo_havecomp) { ++ bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); ++ bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); ++ } ++ bpo->bpo_phys->bpo_num_blkptrs--; ++ ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); ++ } ++ } ++ if (dbuf) { ++ dmu_buf_rele(dbuf, FTAG); ++ dbuf = NULL; ++ } ++ if (free) { ++ i++; ++ VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, ++ i * sizeof (blkptr_t), -1ULL, tx)); ++ } ++ if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) ++ goto out; ++ ++ ASSERT(bpo->bpo_havecomp); ++ err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); ++ if (err) { ++ mutex_exit(&bpo->bpo_lock); ++ return (err); ++ } ++ epb = doi.doi_data_block_size / sizeof (uint64_t); ++ ++ for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { ++ uint64_t *objarray; ++ uint64_t offset, blkoff; ++ bpobj_t sublist; ++ uint64_t used_before, comp_before, uncomp_before; ++ uint64_t used_after, comp_after, uncomp_after; ++ ++ offset = i * sizeof (uint64_t); ++ blkoff = P2PHASE(i, epb); ++ ++ if (dbuf == NULL || dbuf->db_offset > offset) { ++ if (dbuf) ++ dmu_buf_rele(dbuf, FTAG); ++ err = dmu_buf_hold(bpo->bpo_os, ++ bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); ++ if (err) ++ break; ++ } ++ ++ ASSERT3U(offset, >=, dbuf->db_offset); ++ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); ++ ++ objarray = dbuf->db_data; ++ err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); ++ if (err) ++ break; ++ if (free) { ++ err = bpobj_space(&sublist, ++ &used_before, &comp_before, &uncomp_before); ++ if (err) ++ break; ++ } ++ err = bpobj_iterate_impl(&sublist, func, arg, tx, free); ++ if (free) { ++ VERIFY3U(0, ==, bpobj_space(&sublist, ++ &used_after, &comp_after, &uncomp_after)); ++ bpo->bpo_phys->bpo_bytes -= used_before - used_after; ++ ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); ++ bpo->bpo_phys->bpo_comp -= comp_before - comp_after; ++ bpo->bpo_phys->bpo_uncomp -= ++ uncomp_before - uncomp_after; ++ } ++ ++ bpobj_close(&sublist); ++ if (err) ++ break; ++ if (free) { ++ err = dmu_object_free(bpo->bpo_os, ++ objarray[blkoff], tx); ++ if (err) ++ break; ++ 
bpo->bpo_phys->bpo_num_subobjs--; ++ ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); ++ } ++ } ++ if (dbuf) { ++ dmu_buf_rele(dbuf, FTAG); ++ dbuf = NULL; ++ } ++ if (free) { ++ VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, ++ bpo->bpo_phys->bpo_subobjs, ++ (i + 1) * sizeof (uint64_t), -1ULL, tx)); ++ } ++ ++out: ++ /* If there are no entries, there should be no bytes. */ ++ ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 || ++ (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) || ++ bpo->bpo_phys->bpo_bytes == 0); ++ ++ mutex_exit(&bpo->bpo_lock); ++ return (err); ++} ++ ++/* ++ * Iterate and remove the entries. If func returns nonzero, iteration ++ * will stop and that entry will not be removed. ++ */ ++int ++bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) ++{ ++ return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); ++} ++ ++/* ++ * Iterate the entries. If func returns nonzero, iteration will stop. ++ */ ++int ++bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) ++{ ++ return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); ++} ++ ++void ++bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) ++{ ++ bpobj_t subbpo; ++ uint64_t used, comp, uncomp, subsubobjs; ++ ++ ASSERT(bpo->bpo_havesubobj); ++ ASSERT(bpo->bpo_havecomp); ++ ++ VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); ++ VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); ++ ++ if (used == 0) { ++ /* No point in having an empty subobj. */ ++ bpobj_close(&subbpo); ++ bpobj_free(bpo->bpo_os, subobj, tx); ++ return; ++ } ++ ++ dmu_buf_will_dirty(bpo->bpo_dbuf, tx); ++ if (bpo->bpo_phys->bpo_subobjs == 0) { ++ bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, ++ DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); ++ } ++ ++ mutex_enter(&bpo->bpo_lock); ++ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, ++ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), ++ sizeof (subobj), &subobj, tx); ++ bpo->bpo_phys->bpo_num_subobjs++; ++ ++ /* ++ * If subobj has only one block of subobjs, then move subobj's ++ * subobjs to bpo's subobj list directly. This reduces ++ * recursion in bpobj_iterate due to nested subobjs. ++ */ ++ subsubobjs = subbpo.bpo_phys->bpo_subobjs; ++ if (subsubobjs != 0) { ++ dmu_object_info_t doi; ++ ++ VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); ++ if (doi.doi_max_offset == doi.doi_data_block_size) { ++ dmu_buf_t *subdb; ++ uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; ++ ++ VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, ++ 0, FTAG, &subdb, 0)); ++ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, ++ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), ++ numsubsub * sizeof (subobj), subdb->db_data, tx); ++ dmu_buf_rele(subdb, FTAG); ++ bpo->bpo_phys->bpo_num_subobjs += numsubsub; ++ ++ dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); ++ subbpo.bpo_phys->bpo_subobjs = 0; ++ VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, ++ subsubobjs, tx)); ++ } ++ } ++ bpo->bpo_phys->bpo_bytes += used; ++ bpo->bpo_phys->bpo_comp += comp; ++ bpo->bpo_phys->bpo_uncomp += uncomp; ++ mutex_exit(&bpo->bpo_lock); ++ ++ bpobj_close(&subbpo); ++} ++ ++void ++bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) ++{ ++ blkptr_t stored_bp = *bp; ++ uint64_t offset; ++ int blkoff; ++ blkptr_t *bparray; ++ ++ ASSERT(!BP_IS_HOLE(bp)); ++ ++ /* We never need the fill count. 
*/ ++ stored_bp.blk_fill = 0; ++ ++ /* The bpobj will compress better if we can leave off the checksum */ ++ if (!BP_GET_DEDUP(bp)) ++ bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); ++ ++ mutex_enter(&bpo->bpo_lock); ++ ++ offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); ++ blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); ++ ++ if (bpo->bpo_cached_dbuf == NULL || ++ offset < bpo->bpo_cached_dbuf->db_offset || ++ offset >= bpo->bpo_cached_dbuf->db_offset + ++ bpo->bpo_cached_dbuf->db_size) { ++ if (bpo->bpo_cached_dbuf) ++ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); ++ VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, ++ offset, bpo, &bpo->bpo_cached_dbuf, 0)); ++ } ++ ++ dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); ++ bparray = bpo->bpo_cached_dbuf->db_data; ++ bparray[blkoff] = stored_bp; ++ ++ dmu_buf_will_dirty(bpo->bpo_dbuf, tx); ++ bpo->bpo_phys->bpo_num_blkptrs++; ++ bpo->bpo_phys->bpo_bytes += ++ bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); ++ if (bpo->bpo_havecomp) { ++ bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); ++ bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); ++ } ++ mutex_exit(&bpo->bpo_lock); ++} ++ ++struct space_range_arg { ++ spa_t *spa; ++ uint64_t mintxg; ++ uint64_t maxtxg; ++ uint64_t used; ++ uint64_t comp; ++ uint64_t uncomp; ++}; ++ ++/* ARGSUSED */ ++static int ++space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) ++{ ++ struct space_range_arg *sra = arg; ++ ++ if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { ++ if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) ++ sra->used += bp_get_dsize_sync(sra->spa, bp); ++ else ++ sra->used += bp_get_dsize(sra->spa, bp); ++ sra->comp += BP_GET_PSIZE(bp); ++ sra->uncomp += BP_GET_UCSIZE(bp); ++ } ++ return (0); ++} ++ ++int ++bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) ++{ ++ mutex_enter(&bpo->bpo_lock); ++ ++ *usedp = bpo->bpo_phys->bpo_bytes; ++ if (bpo->bpo_havecomp) { ++ *compp = bpo->bpo_phys->bpo_comp; ++ *uncompp = bpo->bpo_phys->bpo_uncomp; ++ mutex_exit(&bpo->bpo_lock); ++ return (0); ++ } else { ++ mutex_exit(&bpo->bpo_lock); ++ return (bpobj_space_range(bpo, 0, UINT64_MAX, ++ usedp, compp, uncompp)); ++ } ++} ++ ++/* ++ * Return the amount of space in the bpobj which is: ++ * mintxg < blk_birth <= maxtxg ++ */ ++int ++bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, ++ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) ++{ ++ struct space_range_arg sra = { 0 }; ++ int err; ++ ++ /* ++ * As an optimization, if they want the whole txg range, just ++ * get bpo_bytes rather than iterating over the bps. ++ */ ++ if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) ++ return (bpobj_space(bpo, usedp, compp, uncompp)); ++ ++ sra.spa = dmu_objset_spa(bpo->bpo_os); ++ sra.mintxg = mintxg; ++ sra.maxtxg = maxtxg; ++ ++ err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); ++ *usedp = sra.used; ++ *compp = sra.comp; ++ *uncompp = sra.uncomp; ++ return (err); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dbuf.c linux-3.2.33-go/fs/zfs/zfs/dbuf.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dbuf.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dbuf.c 2012-11-16 23:25:34.353039289 +0100 +@@ -0,0 +1,2869 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. 
++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct dbuf_hold_impl_data { ++ /* Function arguments */ ++ dnode_t *dh_dn; ++ uint8_t dh_level; ++ uint64_t dh_blkid; ++ int dh_fail_sparse; ++ void *dh_tag; ++ dmu_buf_impl_t **dh_dbp; ++ /* Local variables */ ++ dmu_buf_impl_t *dh_db; ++ dmu_buf_impl_t *dh_parent; ++ blkptr_t *dh_bp; ++ int dh_err; ++ dbuf_dirty_record_t *dh_dr; ++ arc_buf_contents_t dh_type; ++ int dh_depth; ++}; ++ ++static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, ++ dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, ++ void *tag, dmu_buf_impl_t **dbp, int depth); ++static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); ++ ++static void dbuf_destroy(dmu_buf_impl_t *db); ++static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); ++static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); ++ ++/* ++ * Global data structures and functions for the dbuf cache. 
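++ * dbuf_cache below is a kmem cache of dmu_buf_impl_t structures,
++ * constructed by dbuf_cons() and torn down by dbuf_dest(); lookups go
++ * through the dbuf_hash_table defined further down.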
++ */ ++static kmem_cache_t *dbuf_cache; ++ ++/* ARGSUSED */ ++static int ++dbuf_cons(void *vdb, void *unused, int kmflag) ++{ ++ dmu_buf_impl_t *db = vdb; ++ bzero(db, sizeof (dmu_buf_impl_t)); ++ ++ mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); ++ cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); ++ refcount_create(&db->db_holds); ++ list_link_init(&db->db_link); ++ return (0); ++} ++ ++/* ARGSUSED */ ++static void ++dbuf_dest(void *vdb, void *unused) ++{ ++ dmu_buf_impl_t *db = vdb; ++ mutex_destroy(&db->db_mtx); ++ cv_destroy(&db->db_changed); ++ refcount_destroy(&db->db_holds); ++} ++ ++/* ++ * dbuf hash table routines ++ */ ++static dbuf_hash_table_t dbuf_hash_table; ++ ++static uint64_t dbuf_hash_count; ++ ++static uint64_t ++dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) ++{ ++ uintptr_t osv = (uintptr_t)os; ++ uint64_t crc = -1ULL; ++ ++ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); ++ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; ++ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; ++ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; ++ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; ++ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; ++ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; ++ ++ crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); ++ ++ return (crc); ++} ++ ++#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); ++ ++#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ++ ((dbuf)->db.db_object == (obj) && \ ++ (dbuf)->db_objset == (os) && \ ++ (dbuf)->db_level == (level) && \ ++ (dbuf)->db_blkid == (blkid)) ++ ++dmu_buf_impl_t * ++dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) ++{ ++ dbuf_hash_table_t *h = &dbuf_hash_table; ++ objset_t *os = dn->dn_objset; ++ uint64_t obj; ++ uint64_t hv; ++ uint64_t idx; ++ dmu_buf_impl_t *db; ++ ++ obj = dn->dn_object; ++ hv = DBUF_HASH(os, obj, level, blkid); ++ idx = hv & h->hash_table_mask; ++ ++ mutex_enter(DBUF_HASH_MUTEX(h, idx)); ++ for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { ++ if (DBUF_EQUAL(db, os, obj, level, blkid)) { ++ mutex_enter(&db->db_mtx); ++ if (db->db_state != DB_EVICTING) { ++ mutex_exit(DBUF_HASH_MUTEX(h, idx)); ++ return (db); ++ } ++ mutex_exit(&db->db_mtx); ++ } ++ } ++ mutex_exit(DBUF_HASH_MUTEX(h, idx)); ++ return (NULL); ++} ++ ++/* ++ * Insert an entry into the hash table. If there is already an element ++ * equal to elem in the hash table, then the already existing element ++ * will be returned and the new element will not be inserted. ++ * Otherwise returns NULL. 
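++ * A caller might therefore do something like (illustrative sketch only):
++ *
++ *	if ((odb = dbuf_hash_insert(db)) != NULL) {
++ *		... somebody else inserted it first: drop db
++ *		    and use odb instead ...
++ *	}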
++ */ ++static dmu_buf_impl_t * ++dbuf_hash_insert(dmu_buf_impl_t *db) ++{ ++ dbuf_hash_table_t *h = &dbuf_hash_table; ++ objset_t *os = db->db_objset; ++ uint64_t obj = db->db.db_object; ++ int level = db->db_level; ++ uint64_t blkid, hv, idx; ++ dmu_buf_impl_t *dbf; ++ ++ blkid = db->db_blkid; ++ hv = DBUF_HASH(os, obj, level, blkid); ++ idx = hv & h->hash_table_mask; ++ ++ mutex_enter(DBUF_HASH_MUTEX(h, idx)); ++ for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { ++ if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { ++ mutex_enter(&dbf->db_mtx); ++ if (dbf->db_state != DB_EVICTING) { ++ mutex_exit(DBUF_HASH_MUTEX(h, idx)); ++ return (dbf); ++ } ++ mutex_exit(&dbf->db_mtx); ++ } ++ } ++ ++ mutex_enter(&db->db_mtx); ++ db->db_hash_next = h->hash_table[idx]; ++ h->hash_table[idx] = db; ++ mutex_exit(DBUF_HASH_MUTEX(h, idx)); ++ atomic_add_64(&dbuf_hash_count, 1); ++ ++ return (NULL); ++} ++ ++/* ++ * Remove an entry from the hash table. This operation will ++ * fail if there are any existing holds on the db. ++ */ ++static void ++dbuf_hash_remove(dmu_buf_impl_t *db) ++{ ++ dbuf_hash_table_t *h = &dbuf_hash_table; ++ uint64_t hv, idx; ++ dmu_buf_impl_t *dbf, **dbp; ++ ++ hv = DBUF_HASH(db->db_objset, db->db.db_object, ++ db->db_level, db->db_blkid); ++ idx = hv & h->hash_table_mask; ++ ++ /* ++ * We musn't hold db_mtx to maintin lock ordering: ++ * DBUF_HASH_MUTEX > db_mtx. ++ */ ++ ASSERT(refcount_is_zero(&db->db_holds)); ++ ASSERT(db->db_state == DB_EVICTING); ++ ASSERT(!MUTEX_HELD(&db->db_mtx)); ++ ++ mutex_enter(DBUF_HASH_MUTEX(h, idx)); ++ dbp = &h->hash_table[idx]; ++ while ((dbf = *dbp) != db) { ++ dbp = &dbf->db_hash_next; ++ ASSERT(dbf != NULL); ++ } ++ *dbp = db->db_hash_next; ++ db->db_hash_next = NULL; ++ mutex_exit(DBUF_HASH_MUTEX(h, idx)); ++ atomic_add_64(&dbuf_hash_count, -1); ++} ++ ++static arc_evict_func_t dbuf_do_evict; ++ ++static void ++dbuf_evict_user(dmu_buf_impl_t *db) ++{ ++ ASSERT(MUTEX_HELD(&db->db_mtx)); ++ ++ if (db->db_level != 0 || db->db_evict_func == NULL) ++ return; ++ ++ if (db->db_user_data_ptr_ptr) ++ *db->db_user_data_ptr_ptr = db->db.db_data; ++ db->db_evict_func(&db->db, db->db_user_ptr); ++ db->db_user_ptr = NULL; ++ db->db_user_data_ptr_ptr = NULL; ++ db->db_evict_func = NULL; ++} ++ ++boolean_t ++dbuf_is_metadata(dmu_buf_impl_t *db) ++{ ++ if (db->db_level > 0) { ++ return (B_TRUE); ++ } else { ++ boolean_t is_metadata; ++ ++ DB_DNODE_ENTER(db); ++ is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata; ++ DB_DNODE_EXIT(db); ++ ++ return (is_metadata); ++ } ++} ++ ++void ++dbuf_evict(dmu_buf_impl_t *db) ++{ ++ ASSERT(MUTEX_HELD(&db->db_mtx)); ++ ASSERT(db->db_buf == NULL); ++ ASSERT(db->db_data_pending == NULL); ++ ++ dbuf_clear(db); ++ dbuf_destroy(db); ++} ++ ++void ++dbuf_init(void) ++{ ++ uint64_t hsize = 1ULL << 16; ++ dbuf_hash_table_t *h = &dbuf_hash_table; ++ int i; ++ ++ /* ++ * The hash table is big enough to fill all of physical memory ++ * with an average 4K block size. The table will take up ++ * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 
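++	 * For example, on a machine with 8 GB of physical memory the loop
++	 * below grows hsize from 2^16 until hsize * 4096 >= 8 GB, i.e. to
++	 * 2^21 buckets, and the table then occupies 2^21 * 8 bytes = 16 MB --
++	 * consistent with the 2 MB per GB figure above (illustrative numbers
++	 * only).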
++ */ ++ while (hsize * 4096 < physmem * PAGESIZE) ++ hsize <<= 1; ++ ++retry: ++ h->hash_table_mask = hsize - 1; ++#if defined(_KERNEL) && defined(HAVE_SPL) ++ /* Large allocations which do not require contiguous pages ++ * should be using vmem_alloc() in the linux kernel */ ++ h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_PUSHPAGE); ++#else ++ h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); ++#endif ++ if (h->hash_table == NULL) { ++ /* XXX - we should really return an error instead of assert */ ++ ASSERT(hsize > (1ULL << 10)); ++ hsize >>= 1; ++ goto retry; ++ } ++ ++ dbuf_cache = kmem_cache_create("dmu_buf_impl_t", ++ sizeof (dmu_buf_impl_t), ++ 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); ++ ++ for (i = 0; i < DBUF_MUTEXES; i++) ++ mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); ++} ++ ++void ++dbuf_fini(void) ++{ ++ dbuf_hash_table_t *h = &dbuf_hash_table; ++ int i; ++ ++ for (i = 0; i < DBUF_MUTEXES; i++) ++ mutex_destroy(&h->hash_mutexes[i]); ++#if defined(_KERNEL) && defined(HAVE_SPL) ++ /* Large allocations which do not require contiguous pages ++ * should be using vmem_free() in the linux kernel */ ++ vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); ++#else ++ kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); ++#endif ++ kmem_cache_destroy(dbuf_cache); ++} ++ ++/* ++ * Other stuff. ++ */ ++ ++#ifdef ZFS_DEBUG ++static void ++dbuf_verify(dmu_buf_impl_t *db) ++{ ++ dnode_t *dn; ++ dbuf_dirty_record_t *dr; ++ ++ ASSERT(MUTEX_HELD(&db->db_mtx)); ++ ++ if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) ++ return; ++ ++ ASSERT(db->db_objset != NULL); ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ if (dn == NULL) { ++ ASSERT(db->db_parent == NULL); ++ ASSERT(db->db_blkptr == NULL); ++ } else { ++ ASSERT3U(db->db.db_object, ==, dn->dn_object); ++ ASSERT3P(db->db_objset, ==, dn->dn_objset); ++ ASSERT3U(db->db_level, <, dn->dn_nlevels); ++ ASSERT(db->db_blkid == DMU_BONUS_BLKID || ++ db->db_blkid == DMU_SPILL_BLKID || ++ !list_is_empty(&dn->dn_dbufs)); ++ } ++ if (db->db_blkid == DMU_BONUS_BLKID) { ++ ASSERT(dn != NULL); ++ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ++ ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); ++ } else if (db->db_blkid == DMU_SPILL_BLKID) { ++ ASSERT(dn != NULL); ++ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ++ ASSERT3U(db->db.db_offset, ==, 0); ++ } else { ++ ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); ++ } ++ ++ for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) ++ ASSERT(dr->dr_dbuf == db); ++ ++ for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) ++ ASSERT(dr->dr_dbuf == db); ++ ++ /* ++ * We can't assert that db_size matches dn_datablksz because it ++ * can be momentarily different when another thread is doing ++ * dnode_set_blksz(). ++ */ ++ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { ++ dr = db->db_data_pending; ++ /* ++ * It should only be modified in syncing context, so ++ * make sure we only have one copy of the data. 
++ */ ++ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); ++ } ++ ++ /* verify db->db_blkptr */ ++ if (db->db_blkptr) { ++ if (db->db_parent == dn->dn_dbuf) { ++ /* db is pointed to by the dnode */ ++ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ ++ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ++ ASSERT(db->db_parent == NULL); ++ else ++ ASSERT(db->db_parent != NULL); ++ if (db->db_blkid != DMU_SPILL_BLKID) ++ ASSERT3P(db->db_blkptr, ==, ++ &dn->dn_phys->dn_blkptr[db->db_blkid]); ++ } else { ++ /* db is pointed to by an indirect block */ ++ ASSERTV(int epb = db->db_parent->db.db_size >> ++ SPA_BLKPTRSHIFT); ++ ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ++ ASSERT3U(db->db_parent->db.db_object, ==, ++ db->db.db_object); ++ /* ++ * dnode_grow_indblksz() can make this fail if we don't ++ * have the struct_rwlock. XXX indblksz no longer ++ * grows. safe to do this now? ++ */ ++ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ++ ASSERT3P(db->db_blkptr, ==, ++ ((blkptr_t *)db->db_parent->db.db_data + ++ db->db_blkid % epb)); ++ } ++ } ++ } ++ if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && ++ (db->db_buf == NULL || db->db_buf->b_data) && ++ db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && ++ db->db_state != DB_FILL && !dn->dn_free_txg) { ++ /* ++ * If the blkptr isn't set but they have nonzero data, ++ * it had better be dirty, otherwise we'll lose that ++ * data when we evict this buffer. ++ */ ++ if (db->db_dirtycnt == 0) { ++ ASSERTV(uint64_t *buf = db->db.db_data); ++ int i; ++ ++ for (i = 0; i < db->db.db_size >> 3; i++) { ++ ASSERT(buf[i] == 0); ++ } ++ } ++ } ++ DB_DNODE_EXIT(db); ++} ++#endif ++ ++static void ++dbuf_update_data(dmu_buf_impl_t *db) ++{ ++ ASSERT(MUTEX_HELD(&db->db_mtx)); ++ if (db->db_level == 0 && db->db_user_data_ptr_ptr) { ++ ASSERT(!refcount_is_zero(&db->db_holds)); ++ *db->db_user_data_ptr_ptr = db->db.db_data; ++ } ++} ++ ++static void ++dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) ++{ ++ ASSERT(MUTEX_HELD(&db->db_mtx)); ++ ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); ++ db->db_buf = buf; ++ if (buf != NULL) { ++ ASSERT(buf->b_data != NULL); ++ db->db.db_data = buf->b_data; ++ if (!arc_released(buf)) ++ arc_set_callback(buf, dbuf_do_evict, db); ++ dbuf_update_data(db); ++ } else { ++ dbuf_evict_user(db); ++ db->db.db_data = NULL; ++ if (db->db_state != DB_NOFILL) ++ db->db_state = DB_UNCACHED; ++ } ++} ++ ++/* ++ * Loan out an arc_buf for read. Return the loaned arc_buf. 
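++ * If the dbuf's buffer is shared (already released to the ARC, or held
++ * more than once) the caller receives a freshly allocated copy of the
++ * data; otherwise the dbuf's own arc_buf is handed out and the dbuf is
++ * left without a data pointer until it is filled again.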
++ */ ++arc_buf_t * ++dbuf_loan_arcbuf(dmu_buf_impl_t *db) ++{ ++ arc_buf_t *abuf; ++ ++ mutex_enter(&db->db_mtx); ++ if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { ++ int blksz = db->db.db_size; ++ spa_t *spa; ++ ++ mutex_exit(&db->db_mtx); ++ DB_GET_SPA(&spa, db); ++ abuf = arc_loan_buf(spa, blksz); ++ bcopy(db->db.db_data, abuf->b_data, blksz); ++ } else { ++ abuf = db->db_buf; ++ arc_loan_inuse_buf(abuf, db); ++ dbuf_set_data(db, NULL); ++ mutex_exit(&db->db_mtx); ++ } ++ return (abuf); ++} ++ ++uint64_t ++dbuf_whichblock(dnode_t *dn, uint64_t offset) ++{ ++ if (dn->dn_datablkshift) { ++ return (offset >> dn->dn_datablkshift); ++ } else { ++ ASSERT3U(offset, <, dn->dn_datablksz); ++ return (0); ++ } ++} ++ ++static void ++dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) ++{ ++ dmu_buf_impl_t *db = vdb; ++ ++ mutex_enter(&db->db_mtx); ++ ASSERT3U(db->db_state, ==, DB_READ); ++ /* ++ * All reads are synchronous, so we must have a hold on the dbuf ++ */ ++ ASSERT(refcount_count(&db->db_holds) > 0); ++ ASSERT(db->db_buf == NULL); ++ ASSERT(db->db.db_data == NULL); ++ if (db->db_level == 0 && db->db_freed_in_flight) { ++ /* we were freed in flight; disregard any error */ ++ arc_release(buf, db); ++ bzero(buf->b_data, db->db.db_size); ++ arc_buf_freeze(buf); ++ db->db_freed_in_flight = FALSE; ++ dbuf_set_data(db, buf); ++ db->db_state = DB_CACHED; ++ } else if (zio == NULL || zio->io_error == 0) { ++ dbuf_set_data(db, buf); ++ db->db_state = DB_CACHED; ++ } else { ++ ASSERT(db->db_blkid != DMU_BONUS_BLKID); ++ ASSERT3P(db->db_buf, ==, NULL); ++ VERIFY(arc_buf_remove_ref(buf, db) == 1); ++ db->db_state = DB_UNCACHED; ++ } ++ cv_broadcast(&db->db_changed); ++ dbuf_rele_and_unlock(db, NULL); ++} ++ ++static void ++dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) ++{ ++ dnode_t *dn; ++ spa_t *spa; ++ zbookmark_t zb; ++ uint32_t aflags = ARC_NOWAIT; ++ arc_buf_t *pbuf; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ ASSERT(!refcount_is_zero(&db->db_holds)); ++ /* We need the struct_rwlock to prevent db_blkptr from changing. */ ++ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ++ ASSERT(MUTEX_HELD(&db->db_mtx)); ++ ASSERT(db->db_state == DB_UNCACHED); ++ ASSERT(db->db_buf == NULL); ++ ++ if (db->db_blkid == DMU_BONUS_BLKID) { ++ int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ++ ++ ASSERT3U(bonuslen, <=, db->db.db_size); ++ db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); ++ arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); ++ if (bonuslen < DN_MAX_BONUSLEN) ++ bzero(db->db.db_data, DN_MAX_BONUSLEN); ++ if (bonuslen) ++ bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); ++ DB_DNODE_EXIT(db); ++ dbuf_update_data(db); ++ db->db_state = DB_CACHED; ++ mutex_exit(&db->db_mtx); ++ return; ++ } ++ ++ /* ++ * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() ++ * processes the delete record and clears the bp while we are waiting ++ * for the dn_mtx (resulting in a "no" from block_freed). 
++ */ ++ if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || ++ (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || ++ BP_IS_HOLE(db->db_blkptr)))) { ++ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); ++ ++ dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, ++ db->db.db_size, db, type)); ++ DB_DNODE_EXIT(db); ++ bzero(db->db.db_data, db->db.db_size); ++ db->db_state = DB_CACHED; ++ *flags |= DB_RF_CACHED; ++ mutex_exit(&db->db_mtx); ++ return; ++ } ++ ++ spa = dn->dn_objset->os_spa; ++ DB_DNODE_EXIT(db); ++ ++ db->db_state = DB_READ; ++ mutex_exit(&db->db_mtx); ++ ++ if (DBUF_IS_L2CACHEABLE(db)) ++ aflags |= ARC_L2CACHE; ++ ++ SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? ++ db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, ++ db->db.db_object, db->db_level, db->db_blkid); ++ ++ dbuf_add_ref(db, NULL); ++ /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ ++ ++ if (db->db_parent) ++ pbuf = db->db_parent->db_buf; ++ else ++ pbuf = db->db_objset->os_phys_buf; ++ ++ (void) dsl_read(zio, spa, db->db_blkptr, pbuf, ++ dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, ++ (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, ++ &aflags, &zb); ++ if (aflags & ARC_CACHED) ++ *flags |= DB_RF_CACHED; ++} ++ ++int ++dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) ++{ ++ int err = 0; ++ int havepzio = (zio != NULL); ++ int prefetch; ++ dnode_t *dn; ++ ++ /* ++ * We don't have to hold the mutex to check db_state because it ++ * can't be freed while we have a hold on the buffer. ++ */ ++ ASSERT(!refcount_is_zero(&db->db_holds)); ++ ++ if (db->db_state == DB_NOFILL) ++ return (EIO); ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ if ((flags & DB_RF_HAVESTRUCT) == 0) ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ ++ prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && ++ (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && ++ DBUF_IS_CACHEABLE(db); ++ ++ mutex_enter(&db->db_mtx); ++ if (db->db_state == DB_CACHED) { ++ mutex_exit(&db->db_mtx); ++ if (prefetch) ++ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, ++ db->db.db_size, TRUE); ++ if ((flags & DB_RF_HAVESTRUCT) == 0) ++ rw_exit(&dn->dn_struct_rwlock); ++ DB_DNODE_EXIT(db); ++ } else if (db->db_state == DB_UNCACHED) { ++ spa_t *spa = dn->dn_objset->os_spa; ++ ++ if (zio == NULL) ++ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); ++ dbuf_read_impl(db, zio, &flags); ++ ++ /* dbuf_read_impl has dropped db_mtx for us */ ++ ++ if (prefetch) ++ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, ++ db->db.db_size, flags & DB_RF_CACHED); ++ ++ if ((flags & DB_RF_HAVESTRUCT) == 0) ++ rw_exit(&dn->dn_struct_rwlock); ++ DB_DNODE_EXIT(db); ++ ++ if (!havepzio) ++ err = zio_wait(zio); ++ } else { ++ mutex_exit(&db->db_mtx); ++ if (prefetch) ++ dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, ++ db->db.db_size, TRUE); ++ if ((flags & DB_RF_HAVESTRUCT) == 0) ++ rw_exit(&dn->dn_struct_rwlock); ++ DB_DNODE_EXIT(db); ++ ++ mutex_enter(&db->db_mtx); ++ if ((flags & DB_RF_NEVERWAIT) == 0) { ++ while (db->db_state == DB_READ || ++ db->db_state == DB_FILL) { ++ ASSERT(db->db_state == DB_READ || ++ (flags & DB_RF_HAVESTRUCT) == 0); ++ cv_wait(&db->db_changed, &db->db_mtx); ++ } ++ if (db->db_state == DB_UNCACHED) ++ err = EIO; ++ } ++ mutex_exit(&db->db_mtx); ++ } ++ ++ ASSERT(err || havepzio || db->db_state == DB_CACHED); ++ return (err); ++} ++ ++static void ++dbuf_noread(dmu_buf_impl_t *db) ++{ ++ ASSERT(!refcount_is_zero(&db->db_holds)); ++ ASSERT(db->db_blkid != DMU_BONUS_BLKID); ++ 
mutex_enter(&db->db_mtx); ++ while (db->db_state == DB_READ || db->db_state == DB_FILL) ++ cv_wait(&db->db_changed, &db->db_mtx); ++ if (db->db_state == DB_UNCACHED) { ++ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); ++ spa_t *spa; ++ ++ ASSERT(db->db_buf == NULL); ++ ASSERT(db->db.db_data == NULL); ++ DB_GET_SPA(&spa, db); ++ dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); ++ db->db_state = DB_FILL; ++ } else if (db->db_state == DB_NOFILL) { ++ dbuf_set_data(db, NULL); ++ } else { ++ ASSERT3U(db->db_state, ==, DB_CACHED); ++ } ++ mutex_exit(&db->db_mtx); ++} ++ ++/* ++ * This is our just-in-time copy function. It makes a copy of ++ * buffers, that have been modified in a previous transaction ++ * group, before we modify them in the current active group. ++ * ++ * This function is used in two places: when we are dirtying a ++ * buffer for the first time in a txg, and when we are freeing ++ * a range in a dnode that includes this buffer. ++ * ++ * Note that when we are called from dbuf_free_range() we do ++ * not put a hold on the buffer, we just traverse the active ++ * dbuf list for the dnode. ++ */ ++static void ++dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) ++{ ++ dbuf_dirty_record_t *dr = db->db_last_dirty; ++ ++ ASSERT(MUTEX_HELD(&db->db_mtx)); ++ ASSERT(db->db.db_data != NULL); ++ ASSERT(db->db_level == 0); ++ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); ++ ++ if (dr == NULL || ++ (dr->dt.dl.dr_data != ++ ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) ++ return; ++ ++ /* ++ * If the last dirty record for this dbuf has not yet synced ++ * and its referencing the dbuf data, either: ++ * reset the reference to point to a new copy, ++ * or (if there a no active holders) ++ * just null out the current db_data pointer. ++ */ ++ ASSERT(dr->dr_txg >= txg - 2); ++ if (db->db_blkid == DMU_BONUS_BLKID) { ++ /* Note that the data bufs here are zio_bufs */ ++ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); ++ arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); ++ bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); ++ } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { ++ int size = db->db.db_size; ++ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); ++ spa_t *spa; ++ ++ DB_GET_SPA(&spa, db); ++ dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); ++ bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); ++ } else { ++ dbuf_set_data(db, NULL); ++ } ++} ++ ++void ++dbuf_unoverride(dbuf_dirty_record_t *dr) ++{ ++ dmu_buf_impl_t *db = dr->dr_dbuf; ++ blkptr_t *bp = &dr->dt.dl.dr_overridden_by; ++ uint64_t txg = dr->dr_txg; ++ ++ ASSERT(MUTEX_HELD(&db->db_mtx)); ++ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ++ ASSERT(db->db_level == 0); ++ ++ if (db->db_blkid == DMU_BONUS_BLKID || ++ dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) ++ return; ++ ++ ASSERT(db->db_data_pending != dr); ++ ++ /* free this block */ ++ if (!BP_IS_HOLE(bp)) { ++ spa_t *spa; ++ ++ DB_GET_SPA(&spa, db); ++ zio_free(spa, txg, bp); ++ } ++ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; ++ /* ++ * Release the already-written buffer, so we leave it in ++ * a consistent dirty state. Note that all callers are ++ * modifying the buffer, so they will immediately do ++ * another (redundant) arc_release(). Therefore, leave ++ * the buf thawed to save the effort of freezing & ++ * immediately re-thawing it. 
++ */ ++ arc_release(dr->dt.dl.dr_data, db); ++} ++ ++/* ++ * Evict (if its unreferenced) or clear (if its referenced) any level-0 ++ * data blocks in the free range, so that any future readers will find ++ * empty blocks. Also, if we happen accross any level-1 dbufs in the ++ * range that have not already been marked dirty, mark them dirty so ++ * they stay in memory. ++ */ ++void ++dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) ++{ ++ dmu_buf_impl_t *db, *db_next; ++ uint64_t txg = tx->tx_txg; ++ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ++ uint64_t first_l1 = start >> epbs; ++ uint64_t last_l1 = end >> epbs; ++ ++ if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) { ++ end = dn->dn_maxblkid; ++ last_l1 = end >> epbs; ++ } ++ dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); ++ mutex_enter(&dn->dn_dbufs_mtx); ++ for (db = list_head(&dn->dn_dbufs); db; db = db_next) { ++ db_next = list_next(&dn->dn_dbufs, db); ++ ASSERT(db->db_blkid != DMU_BONUS_BLKID); ++ ++ if (db->db_level == 1 && ++ db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { ++ mutex_enter(&db->db_mtx); ++ if (db->db_last_dirty && ++ db->db_last_dirty->dr_txg < txg) { ++ dbuf_add_ref(db, FTAG); ++ mutex_exit(&db->db_mtx); ++ dbuf_will_dirty(db, tx); ++ dbuf_rele(db, FTAG); ++ } else { ++ mutex_exit(&db->db_mtx); ++ } ++ } ++ ++ if (db->db_level != 0) ++ continue; ++ dprintf_dbuf(db, "found buf %s\n", ""); ++ if (db->db_blkid < start || db->db_blkid > end) ++ continue; ++ ++ /* found a level 0 buffer in the range */ ++ if (dbuf_undirty(db, tx)) ++ continue; ++ ++ mutex_enter(&db->db_mtx); ++ if (db->db_state == DB_UNCACHED || ++ db->db_state == DB_NOFILL || ++ db->db_state == DB_EVICTING) { ++ ASSERT(db->db.db_data == NULL); ++ mutex_exit(&db->db_mtx); ++ continue; ++ } ++ if (db->db_state == DB_READ || db->db_state == DB_FILL) { ++ /* will be handled in dbuf_read_done or dbuf_rele */ ++ db->db_freed_in_flight = TRUE; ++ mutex_exit(&db->db_mtx); ++ continue; ++ } ++ if (refcount_count(&db->db_holds) == 0) { ++ ASSERT(db->db_buf); ++ dbuf_clear(db); ++ continue; ++ } ++ /* The dbuf is referenced */ ++ ++ if (db->db_last_dirty != NULL) { ++ dbuf_dirty_record_t *dr = db->db_last_dirty; ++ ++ if (dr->dr_txg == txg) { ++ /* ++ * This buffer is "in-use", re-adjust the file ++ * size to reflect that this buffer may ++ * contain new data when we sync. ++ */ ++ if (db->db_blkid != DMU_SPILL_BLKID && ++ db->db_blkid > dn->dn_maxblkid) ++ dn->dn_maxblkid = db->db_blkid; ++ dbuf_unoverride(dr); ++ } else { ++ /* ++ * This dbuf is not dirty in the open context. ++ * Either uncache it (if its not referenced in ++ * the open context) or reset its contents to ++ * empty. ++ */ ++ dbuf_fix_old_data(db, txg); ++ } ++ } ++ /* clear the contents if its cached */ ++ if (db->db_state == DB_CACHED) { ++ ASSERT(db->db.db_data != NULL); ++ arc_release(db->db_buf, db); ++ bzero(db->db.db_data, db->db.db_size); ++ arc_buf_freeze(db->db_buf); ++ } ++ ++ mutex_exit(&db->db_mtx); ++ } ++ mutex_exit(&dn->dn_dbufs_mtx); ++} ++ ++static int ++dbuf_block_freeable(dmu_buf_impl_t *db) ++{ ++ dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; ++ uint64_t birth_txg = 0; ++ ++ /* ++ * We don't need any locking to protect db_blkptr: ++ * If it's syncing, then db_last_dirty will be set ++ * so we'll ignore db_blkptr. 
++ */ ++ ASSERT(MUTEX_HELD(&db->db_mtx)); ++ if (db->db_last_dirty) ++ birth_txg = db->db_last_dirty->dr_txg; ++ else if (db->db_blkptr) ++ birth_txg = db->db_blkptr->blk_birth; ++ ++ /* ++ * If we don't exist or are in a snapshot, we can't be freed. ++ * Don't pass the bp to dsl_dataset_block_freeable() since we ++ * are holding the db_mtx lock and might deadlock if we are ++ * prefetching a dedup-ed block. ++ */ ++ if (birth_txg) ++ return (ds == NULL || ++ dsl_dataset_block_freeable(ds, NULL, birth_txg)); ++ else ++ return (FALSE); ++} ++ ++void ++dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) ++{ ++ arc_buf_t *buf, *obuf; ++ int osize = db->db.db_size; ++ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); ++ dnode_t *dn; ++ ++ ASSERT(db->db_blkid != DMU_BONUS_BLKID); ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ ++ /* XXX does *this* func really need the lock? */ ++ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ++ ++ /* ++ * This call to dbuf_will_dirty() with the dn_struct_rwlock held ++ * is OK, because there can be no other references to the db ++ * when we are changing its size, so no concurrent DB_FILL can ++ * be happening. ++ */ ++ /* ++ * XXX we should be doing a dbuf_read, checking the return ++ * value and returning that up to our callers ++ */ ++ dbuf_will_dirty(db, tx); ++ ++ /* create the data buffer for the new block */ ++ buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); ++ ++ /* copy old block data to the new block */ ++ obuf = db->db_buf; ++ bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); ++ /* zero the remainder */ ++ if (size > osize) ++ bzero((uint8_t *)buf->b_data + osize, size - osize); ++ ++ mutex_enter(&db->db_mtx); ++ dbuf_set_data(db, buf); ++ VERIFY(arc_buf_remove_ref(obuf, db) == 1); ++ db->db.db_size = size; ++ ++ if (db->db_level == 0) { ++ ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); ++ db->db_last_dirty->dt.dl.dr_data = buf; ++ } ++ mutex_exit(&db->db_mtx); ++ ++ dnode_willuse_space(dn, size-osize, tx); ++ DB_DNODE_EXIT(db); ++} ++ ++void ++dbuf_release_bp(dmu_buf_impl_t *db) ++{ ++ objset_t *os; ++ zbookmark_t zb; ++ ++ DB_GET_OBJSET(&os, db); ++ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ++ ASSERT(arc_released(os->os_phys_buf) || ++ list_link_active(&os->os_dsl_dataset->ds_synced_link)); ++ ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); ++ ++ zb.zb_objset = os->os_dsl_dataset ? ++ os->os_dsl_dataset->ds_object : 0; ++ zb.zb_object = db->db.db_object; ++ zb.zb_level = db->db_level; ++ zb.zb_blkid = db->db_blkid; ++ (void) arc_release_bp(db->db_buf, db, ++ db->db_blkptr, os->os_spa, &zb); ++} ++ ++dbuf_dirty_record_t * ++dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ++{ ++ dnode_t *dn; ++ objset_t *os; ++ dbuf_dirty_record_t **drp, *dr; ++ int drop_struct_lock = FALSE; ++ boolean_t do_free_accounting = B_FALSE; ++ int txgoff = tx->tx_txg & TXG_MASK; ++ ++ ASSERT(tx->tx_txg != 0); ++ ASSERT(!refcount_is_zero(&db->db_holds)); ++ DMU_TX_DIRTY_BUF(tx, db); ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ /* ++ * Shouldn't dirty a regular buffer in syncing context. Private ++ * objects may be dirtied in syncing context, but only if they ++ * were already pre-dirtied in open context. ++ */ ++ ASSERT(!dmu_tx_is_syncing(tx) || ++ BP_IS_HOLE(dn->dn_objset->os_rootbp) || ++ DMU_OBJECT_IS_SPECIAL(dn->dn_object) || ++ dn->dn_objset->os_dsl_dataset == NULL); ++ /* ++ * We make this assert for private objects as well, but after we ++ * check if we're already dirty. 
They are allowed to re-dirty ++ * in syncing context. ++ */ ++ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || ++ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == ++ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ++ ++ mutex_enter(&db->db_mtx); ++ /* ++ * XXX make this true for indirects too? The problem is that ++ * transactions created with dmu_tx_create_assigned() from ++ * syncing context don't bother holding ahead. ++ */ ++ ASSERT(db->db_level != 0 || ++ db->db_state == DB_CACHED || db->db_state == DB_FILL || ++ db->db_state == DB_NOFILL); ++ ++ mutex_enter(&dn->dn_mtx); ++ /* ++ * Don't set dirtyctx to SYNC if we're just modifying this as we ++ * initialize the objset. ++ */ ++ if (dn->dn_dirtyctx == DN_UNDIRTIED && ++ !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { ++ dn->dn_dirtyctx = ++ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); ++ ASSERT(dn->dn_dirtyctx_firstset == NULL); ++ dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_PUSHPAGE); ++ } ++ mutex_exit(&dn->dn_mtx); ++ ++ if (db->db_blkid == DMU_SPILL_BLKID) ++ dn->dn_have_spill = B_TRUE; ++ ++ /* ++ * If this buffer is already dirty, we're done. ++ */ ++ drp = &db->db_last_dirty; ++ ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || ++ db->db.db_object == DMU_META_DNODE_OBJECT); ++ while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) ++ drp = &dr->dr_next; ++ if (dr && dr->dr_txg == tx->tx_txg) { ++ DB_DNODE_EXIT(db); ++ ++ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { ++ /* ++ * If this buffer has already been written out, ++ * we now need to reset its state. ++ */ ++ dbuf_unoverride(dr); ++ if (db->db.db_object != DMU_META_DNODE_OBJECT && ++ db->db_state != DB_NOFILL) ++ arc_buf_thaw(db->db_buf); ++ } ++ mutex_exit(&db->db_mtx); ++ return (dr); ++ } ++ ++ /* ++ * Only valid if not already dirty. ++ */ ++ ASSERT(dn->dn_object == 0 || ++ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == ++ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ++ ++ ASSERT3U(dn->dn_nlevels, >, db->db_level); ++ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || ++ dn->dn_phys->dn_nlevels > db->db_level || ++ dn->dn_next_nlevels[txgoff] > db->db_level || ++ dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || ++ dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); ++ ++ /* ++ * We should only be dirtying in syncing context if it's the ++ * mos or we're initializing the os or it's a special object. ++ * However, we are allowed to dirty in syncing context provided ++ * we already dirtied it in open context. Hence we must make ++ * this assertion only if we're not already dirty. ++ */ ++ os = dn->dn_objset; ++ ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || ++ os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ++ ASSERT(db->db.db_size != 0); ++ ++ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ++ ++ if (db->db_blkid != DMU_BONUS_BLKID) { ++ /* ++ * Update the accounting. ++ * Note: we delay "free accounting" until after we drop ++ * the db_mtx. This keeps us from grabbing other locks ++ * (and possibly deadlocking) in bp_get_dsize() while ++ * also holding the db_mtx. ++ */ ++ dnode_willuse_space(dn, db->db.db_size, tx); ++ do_free_accounting = dbuf_block_freeable(db); ++ } ++ ++ /* ++ * If this buffer is dirty in an old transaction group we need ++ * to make a copy of it so that the changes we make in this ++ * transaction group won't leak out when we sync the older txg. 
++ */ ++ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_PUSHPAGE); ++ list_link_init(&dr->dr_dirty_node); ++ if (db->db_level == 0) { ++ void *data_old = db->db_buf; ++ ++ if (db->db_state != DB_NOFILL) { ++ if (db->db_blkid == DMU_BONUS_BLKID) { ++ dbuf_fix_old_data(db, tx->tx_txg); ++ data_old = db->db.db_data; ++ } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { ++ /* ++ * Release the data buffer from the cache so ++ * that we can modify it without impacting ++ * possible other users of this cached data ++ * block. Note that indirect blocks and ++ * private objects are not released until the ++ * syncing state (since they are only modified ++ * then). ++ */ ++ arc_release(db->db_buf, db); ++ dbuf_fix_old_data(db, tx->tx_txg); ++ data_old = db->db_buf; ++ } ++ ASSERT(data_old != NULL); ++ } ++ dr->dt.dl.dr_data = data_old; ++ } else { ++ mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); ++ list_create(&dr->dt.di.dr_children, ++ sizeof (dbuf_dirty_record_t), ++ offsetof(dbuf_dirty_record_t, dr_dirty_node)); ++ } ++ dr->dr_dbuf = db; ++ dr->dr_txg = tx->tx_txg; ++ dr->dr_next = *drp; ++ *drp = dr; ++ ++ /* ++ * We could have been freed_in_flight between the dbuf_noread ++ * and dbuf_dirty. We win, as though the dbuf_noread() had ++ * happened after the free. ++ */ ++ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && ++ db->db_blkid != DMU_SPILL_BLKID) { ++ mutex_enter(&dn->dn_mtx); ++ dnode_clear_range(dn, db->db_blkid, 1, tx); ++ mutex_exit(&dn->dn_mtx); ++ db->db_freed_in_flight = FALSE; ++ } ++ ++ /* ++ * This buffer is now part of this txg ++ */ ++ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); ++ db->db_dirtycnt += 1; ++ ASSERT3U(db->db_dirtycnt, <=, 3); ++ ++ mutex_exit(&db->db_mtx); ++ ++ if (db->db_blkid == DMU_BONUS_BLKID || ++ db->db_blkid == DMU_SPILL_BLKID) { ++ mutex_enter(&dn->dn_mtx); ++ ASSERT(!list_link_active(&dr->dr_dirty_node)); ++ list_insert_tail(&dn->dn_dirty_records[txgoff], dr); ++ mutex_exit(&dn->dn_mtx); ++ dnode_setdirty(dn, tx); ++ DB_DNODE_EXIT(db); ++ return (dr); ++ } else if (do_free_accounting) { ++ blkptr_t *bp = db->db_blkptr; ++ int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? ++ bp_get_dsize(os->os_spa, bp) : db->db.db_size; ++ /* ++ * This is only a guess -- if the dbuf is dirty ++ * in a previous txg, we don't know how much ++ * space it will use on disk yet. We should ++ * really have the struct_rwlock to access ++ * db_blkptr, but since this is just a guess, ++ * it's OK if we get an odd answer. 
++ */ ++ ddt_prefetch(os->os_spa, bp); ++ dnode_willuse_space(dn, -willfree, tx); ++ } ++ ++ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ drop_struct_lock = TRUE; ++ } ++ ++ if (db->db_level == 0) { ++ dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); ++ ASSERT(dn->dn_maxblkid >= db->db_blkid); ++ } ++ ++ if (db->db_level+1 < dn->dn_nlevels) { ++ dmu_buf_impl_t *parent = db->db_parent; ++ dbuf_dirty_record_t *di; ++ int parent_held = FALSE; ++ ++ if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { ++ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ++ ++ parent = dbuf_hold_level(dn, db->db_level+1, ++ db->db_blkid >> epbs, FTAG); ++ ASSERT(parent != NULL); ++ parent_held = TRUE; ++ } ++ if (drop_struct_lock) ++ rw_exit(&dn->dn_struct_rwlock); ++ ASSERT3U(db->db_level+1, ==, parent->db_level); ++ di = dbuf_dirty(parent, tx); ++ if (parent_held) ++ dbuf_rele(parent, FTAG); ++ ++ mutex_enter(&db->db_mtx); ++ /* possible race with dbuf_undirty() */ ++ if (db->db_last_dirty == dr || ++ dn->dn_object == DMU_META_DNODE_OBJECT) { ++ mutex_enter(&di->dt.di.dr_mtx); ++ ASSERT3U(di->dr_txg, ==, tx->tx_txg); ++ ASSERT(!list_link_active(&dr->dr_dirty_node)); ++ list_insert_tail(&di->dt.di.dr_children, dr); ++ mutex_exit(&di->dt.di.dr_mtx); ++ dr->dr_parent = di; ++ } ++ mutex_exit(&db->db_mtx); ++ } else { ++ ASSERT(db->db_level+1 == dn->dn_nlevels); ++ ASSERT(db->db_blkid < dn->dn_nblkptr); ++ ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); ++ mutex_enter(&dn->dn_mtx); ++ ASSERT(!list_link_active(&dr->dr_dirty_node)); ++ list_insert_tail(&dn->dn_dirty_records[txgoff], dr); ++ mutex_exit(&dn->dn_mtx); ++ if (drop_struct_lock) ++ rw_exit(&dn->dn_struct_rwlock); ++ } ++ ++ dnode_setdirty(dn, tx); ++ DB_DNODE_EXIT(db); ++ return (dr); ++} ++ ++static int ++dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ++{ ++ dnode_t *dn; ++ uint64_t txg = tx->tx_txg; ++ dbuf_dirty_record_t *dr, **drp; ++ ++ ASSERT(txg != 0); ++ ASSERT(db->db_blkid != DMU_BONUS_BLKID); ++ ++ mutex_enter(&db->db_mtx); ++ /* ++ * If this buffer is not dirty, we're done. ++ */ ++ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) ++ if (dr->dr_txg <= txg) ++ break; ++ if (dr == NULL || dr->dr_txg < txg) { ++ mutex_exit(&db->db_mtx); ++ return (0); ++ } ++ ASSERT(dr->dr_txg == txg); ++ ASSERT(dr->dr_dbuf == db); ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ ++ /* ++ * If this buffer is currently held, we cannot undirty ++ * it, since one of the current holders may be in the ++ * middle of an update. Note that users of dbuf_undirty() ++ * should not place a hold on the dbuf before the call. ++ * Also note: we can get here with a spill block, so ++ * test for that similar to how dbuf_dirty does. ++ */ ++ if (refcount_count(&db->db_holds) > db->db_dirtycnt) { ++ mutex_exit(&db->db_mtx); ++ /* Make sure we don't toss this buffer at sync phase */ ++ if (db->db_blkid != DMU_SPILL_BLKID) { ++ mutex_enter(&dn->dn_mtx); ++ dnode_clear_range(dn, db->db_blkid, 1, tx); ++ mutex_exit(&dn->dn_mtx); ++ } ++ DB_DNODE_EXIT(db); ++ return (0); ++ } ++ ++ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ++ ++ ASSERT(db->db.db_size != 0); ++ ++ /* XXX would be nice to fix up dn_towrite_space[] */ ++ ++ *drp = dr->dr_next; ++ ++ /* ++ * Note that there are three places in dbuf_dirty() ++ * where this dirty record may be put on a list. ++ * Make sure to do a list_remove corresponding to ++ * every one of those list_insert calls. 
++ */ ++ if (dr->dr_parent) { ++ mutex_enter(&dr->dr_parent->dt.di.dr_mtx); ++ list_remove(&dr->dr_parent->dt.di.dr_children, dr); ++ mutex_exit(&dr->dr_parent->dt.di.dr_mtx); ++ } else if (db->db_blkid == DMU_SPILL_BLKID || ++ db->db_level+1 == dn->dn_nlevels) { ++ ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); ++ mutex_enter(&dn->dn_mtx); ++ list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); ++ mutex_exit(&dn->dn_mtx); ++ } ++ DB_DNODE_EXIT(db); ++ ++ if (db->db_level == 0) { ++ if (db->db_state != DB_NOFILL) { ++ dbuf_unoverride(dr); ++ ++ ASSERT(db->db_buf != NULL); ++ ASSERT(dr->dt.dl.dr_data != NULL); ++ if (dr->dt.dl.dr_data != db->db_buf) ++ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, ++ db) == 1); ++ } ++ } else { ++ ASSERT(db->db_buf != NULL); ++ ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ++ mutex_destroy(&dr->dt.di.dr_mtx); ++ list_destroy(&dr->dt.di.dr_children); ++ } ++ kmem_free(dr, sizeof (dbuf_dirty_record_t)); ++ ++ ASSERT(db->db_dirtycnt > 0); ++ db->db_dirtycnt -= 1; ++ ++ if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { ++ arc_buf_t *buf = db->db_buf; ++ ++ ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); ++ dbuf_set_data(db, NULL); ++ VERIFY(arc_buf_remove_ref(buf, db) == 1); ++ dbuf_evict(db); ++ return (1); ++ } ++ ++ mutex_exit(&db->db_mtx); ++ return (0); ++} ++ ++#pragma weak dmu_buf_will_dirty = dbuf_will_dirty ++void ++dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ++{ ++ int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ++ ++ ASSERT(tx->tx_txg != 0); ++ ASSERT(!refcount_is_zero(&db->db_holds)); ++ ++ DB_DNODE_ENTER(db); ++ if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) ++ rf |= DB_RF_HAVESTRUCT; ++ DB_DNODE_EXIT(db); ++ (void) dbuf_read(db, NULL, rf); ++ (void) dbuf_dirty(db, tx); ++} ++ ++void ++dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ++ ++ db->db_state = DB_NOFILL; ++ ++ dmu_buf_will_fill(db_fake, tx); ++} ++ ++void ++dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ++ ++ ASSERT(db->db_blkid != DMU_BONUS_BLKID); ++ ASSERT(tx->tx_txg != 0); ++ ASSERT(db->db_level == 0); ++ ASSERT(!refcount_is_zero(&db->db_holds)); ++ ++ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || ++ dmu_tx_private_ok(tx)); ++ ++ dbuf_noread(db); ++ (void) dbuf_dirty(db, tx); ++} ++ ++#pragma weak dmu_buf_fill_done = dbuf_fill_done ++/* ARGSUSED */ ++void ++dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) ++{ ++ mutex_enter(&db->db_mtx); ++ DBUF_VERIFY(db); ++ ++ if (db->db_state == DB_FILL) { ++ if (db->db_level == 0 && db->db_freed_in_flight) { ++ ASSERT(db->db_blkid != DMU_BONUS_BLKID); ++ /* we were freed while filling */ ++ /* XXX dbuf_undirty? */ ++ bzero(db->db.db_data, db->db.db_size); ++ db->db_freed_in_flight = FALSE; ++ } ++ db->db_state = DB_CACHED; ++ cv_broadcast(&db->db_changed); ++ } ++ mutex_exit(&db->db_mtx); ++} ++ ++/* ++ * Directly assign a provided arc buf to a given dbuf if it's not referenced ++ * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
++ */ ++void ++dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) ++{ ++ ASSERT(!refcount_is_zero(&db->db_holds)); ++ ASSERT(db->db_blkid != DMU_BONUS_BLKID); ++ ASSERT(db->db_level == 0); ++ ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); ++ ASSERT(buf != NULL); ++ ASSERT(arc_buf_size(buf) == db->db.db_size); ++ ASSERT(tx->tx_txg != 0); ++ ++ arc_return_buf(buf, db); ++ ASSERT(arc_released(buf)); ++ ++ mutex_enter(&db->db_mtx); ++ ++ while (db->db_state == DB_READ || db->db_state == DB_FILL) ++ cv_wait(&db->db_changed, &db->db_mtx); ++ ++ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); ++ ++ if (db->db_state == DB_CACHED && ++ refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { ++ mutex_exit(&db->db_mtx); ++ (void) dbuf_dirty(db, tx); ++ bcopy(buf->b_data, db->db.db_data, db->db.db_size); ++ VERIFY(arc_buf_remove_ref(buf, db) == 1); ++ xuio_stat_wbuf_copied(); ++ return; ++ } ++ ++ xuio_stat_wbuf_nocopy(); ++ if (db->db_state == DB_CACHED) { ++ dbuf_dirty_record_t *dr = db->db_last_dirty; ++ ++ ASSERT(db->db_buf != NULL); ++ if (dr != NULL && dr->dr_txg == tx->tx_txg) { ++ ASSERT(dr->dt.dl.dr_data == db->db_buf); ++ if (!arc_released(db->db_buf)) { ++ ASSERT(dr->dt.dl.dr_override_state == ++ DR_OVERRIDDEN); ++ arc_release(db->db_buf, db); ++ } ++ dr->dt.dl.dr_data = buf; ++ VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); ++ } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { ++ arc_release(db->db_buf, db); ++ VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); ++ } ++ db->db_buf = NULL; ++ } ++ ASSERT(db->db_buf == NULL); ++ dbuf_set_data(db, buf); ++ db->db_state = DB_FILL; ++ mutex_exit(&db->db_mtx); ++ (void) dbuf_dirty(db, tx); ++ dbuf_fill_done(db, tx); ++} ++ ++/* ++ * "Clear" the contents of this dbuf. This will mark the dbuf ++ * EVICTING and clear *most* of its references. Unfortunetely, ++ * when we are not holding the dn_dbufs_mtx, we can't clear the ++ * entry in the dn_dbufs list. We have to wait until dbuf_destroy() ++ * in this case. 
For callers from the DMU we will usually see: ++ * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() ++ * For the arc callback, we will usually see: ++ * dbuf_do_evict()->dbuf_clear();dbuf_destroy() ++ * Sometimes, though, we will get a mix of these two: ++ * DMU: dbuf_clear()->arc_buf_evict() ++ * ARC: dbuf_do_evict()->dbuf_destroy() ++ */ ++void ++dbuf_clear(dmu_buf_impl_t *db) ++{ ++ dnode_t *dn; ++ dmu_buf_impl_t *parent = db->db_parent; ++ dmu_buf_impl_t *dndb; ++ int dbuf_gone = FALSE; ++ ++ ASSERT(MUTEX_HELD(&db->db_mtx)); ++ ASSERT(refcount_is_zero(&db->db_holds)); ++ ++ dbuf_evict_user(db); ++ ++ if (db->db_state == DB_CACHED) { ++ ASSERT(db->db.db_data != NULL); ++ if (db->db_blkid == DMU_BONUS_BLKID) { ++ zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); ++ arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); ++ } ++ db->db.db_data = NULL; ++ db->db_state = DB_UNCACHED; ++ } ++ ++ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ++ ASSERT(db->db_data_pending == NULL); ++ ++ db->db_state = DB_EVICTING; ++ db->db_blkptr = NULL; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ dndb = dn->dn_dbuf; ++ if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { ++ list_remove(&dn->dn_dbufs, db); ++ (void) atomic_dec_32_nv(&dn->dn_dbufs_count); ++ membar_producer(); ++ DB_DNODE_EXIT(db); ++ /* ++ * Decrementing the dbuf count means that the hold corresponding ++ * to the removed dbuf is no longer discounted in dnode_move(), ++ * so the dnode cannot be moved until after we release the hold. ++ * The membar_producer() ensures visibility of the decremented ++ * value in dnode_move(), since DB_DNODE_EXIT doesn't actually ++ * release any lock. ++ */ ++ dnode_rele(dn, db); ++ db->db_dnode_handle = NULL; ++ } else { ++ DB_DNODE_EXIT(db); ++ } ++ ++ if (db->db_buf) ++ dbuf_gone = arc_buf_evict(db->db_buf); ++ ++ if (!dbuf_gone) ++ mutex_exit(&db->db_mtx); ++ ++ /* ++ * If this dbuf is referenced from an indirect dbuf, ++ * decrement the ref count on the indirect dbuf. 
++ */ ++ if (parent && parent != dndb) ++ dbuf_rele(parent, db); ++} ++ ++__attribute__((always_inline)) ++static inline int ++dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, ++ dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh) ++{ ++ int nlevels, epbs; ++ ++ *parentp = NULL; ++ *bpp = NULL; ++ ++ ASSERT(blkid != DMU_BONUS_BLKID); ++ ++ if (blkid == DMU_SPILL_BLKID) { ++ mutex_enter(&dn->dn_mtx); ++ if (dn->dn_have_spill && ++ (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) ++ *bpp = &dn->dn_phys->dn_spill; ++ else ++ *bpp = NULL; ++ dbuf_add_ref(dn->dn_dbuf, NULL); ++ *parentp = dn->dn_dbuf; ++ mutex_exit(&dn->dn_mtx); ++ return (0); ++ } ++ ++ if (dn->dn_phys->dn_nlevels == 0) ++ nlevels = 1; ++ else ++ nlevels = dn->dn_phys->dn_nlevels; ++ ++ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ++ ++ ASSERT3U(level * epbs, <, 64); ++ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ++ if (level >= nlevels || ++ (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { ++ /* the buffer has no parent yet */ ++ return (ENOENT); ++ } else if (level < nlevels-1) { ++ /* this block is referenced from an indirect block */ ++ int err; ++ if (dh == NULL) { ++ err = dbuf_hold_impl(dn, level+1, blkid >> epbs, ++ fail_sparse, NULL, parentp); ++ } ++ else { ++ __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, ++ blkid >> epbs, fail_sparse, NULL, ++ parentp, dh->dh_depth + 1); ++ err = __dbuf_hold_impl(dh + 1); ++ } ++ if (err) ++ return (err); ++ err = dbuf_read(*parentp, NULL, ++ (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); ++ if (err) { ++ dbuf_rele(*parentp, NULL); ++ *parentp = NULL; ++ return (err); ++ } ++ *bpp = ((blkptr_t *)(*parentp)->db.db_data) + ++ (blkid & ((1ULL << epbs) - 1)); ++ return (0); ++ } else { ++ /* the block is referenced from the dnode */ ++ ASSERT3U(level, ==, nlevels-1); ++ ASSERT(dn->dn_phys->dn_nblkptr == 0 || ++ blkid < dn->dn_phys->dn_nblkptr); ++ if (dn->dn_dbuf) { ++ dbuf_add_ref(dn->dn_dbuf, NULL); ++ *parentp = dn->dn_dbuf; ++ } ++ *bpp = &dn->dn_phys->dn_blkptr[blkid]; ++ return (0); ++ } ++} ++ ++static dmu_buf_impl_t * ++dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ++ dmu_buf_impl_t *parent, blkptr_t *blkptr) ++{ ++ objset_t *os = dn->dn_objset; ++ dmu_buf_impl_t *db, *odb; ++ ++ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ++ ASSERT(dn->dn_type != DMU_OT_NONE); ++ ++ db = kmem_cache_alloc(dbuf_cache, KM_PUSHPAGE); ++ ++ db->db_objset = os; ++ db->db.db_object = dn->dn_object; ++ db->db_level = level; ++ db->db_blkid = blkid; ++ db->db_last_dirty = NULL; ++ db->db_dirtycnt = 0; ++ db->db_dnode_handle = dn->dn_handle; ++ db->db_parent = parent; ++ db->db_blkptr = blkptr; ++ ++ db->db_user_ptr = NULL; ++ db->db_user_data_ptr_ptr = NULL; ++ db->db_evict_func = NULL; ++ db->db_immediate_evict = 0; ++ db->db_freed_in_flight = 0; ++ ++ if (blkid == DMU_BONUS_BLKID) { ++ ASSERT3P(parent, ==, dn->dn_dbuf); ++ db->db.db_size = DN_MAX_BONUSLEN - ++ (dn->dn_nblkptr-1) * sizeof (blkptr_t); ++ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ++ db->db.db_offset = DMU_BONUS_BLKID; ++ db->db_state = DB_UNCACHED; ++ /* the bonus dbuf is not placed in the hash table */ ++ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); ++ return (db); ++ } else if (blkid == DMU_SPILL_BLKID) { ++ db->db.db_size = (blkptr != NULL) ? ++ BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; ++ db->db.db_offset = 0; ++ } else { ++ int blocksize = ++ db->db_level ? 
1<dn_indblkshift : dn->dn_datablksz; ++ db->db.db_size = blocksize; ++ db->db.db_offset = db->db_blkid * blocksize; ++ } ++ ++ /* ++ * Hold the dn_dbufs_mtx while we get the new dbuf ++ * in the hash table *and* added to the dbufs list. ++ * This prevents a possible deadlock with someone ++ * trying to look up this dbuf before its added to the ++ * dn_dbufs list. ++ */ ++ mutex_enter(&dn->dn_dbufs_mtx); ++ db->db_state = DB_EVICTING; ++ if ((odb = dbuf_hash_insert(db)) != NULL) { ++ /* someone else inserted it first */ ++ kmem_cache_free(dbuf_cache, db); ++ mutex_exit(&dn->dn_dbufs_mtx); ++ return (odb); ++ } ++ list_insert_head(&dn->dn_dbufs, db); ++ db->db_state = DB_UNCACHED; ++ mutex_exit(&dn->dn_dbufs_mtx); ++ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); ++ ++ if (parent && parent != dn->dn_dbuf) ++ dbuf_add_ref(parent, db); ++ ++ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || ++ refcount_count(&dn->dn_holds) > 0); ++ (void) refcount_add(&dn->dn_holds, db); ++ (void) atomic_inc_32_nv(&dn->dn_dbufs_count); ++ ++ dprintf_dbuf(db, "db=%p\n", db); ++ ++ return (db); ++} ++ ++static int ++dbuf_do_evict(void *private) ++{ ++ arc_buf_t *buf = private; ++ dmu_buf_impl_t *db = buf->b_private; ++ ++ if (!MUTEX_HELD(&db->db_mtx)) ++ mutex_enter(&db->db_mtx); ++ ++ ASSERT(refcount_is_zero(&db->db_holds)); ++ ++ if (db->db_state != DB_EVICTING) { ++ ASSERT(db->db_state == DB_CACHED); ++ DBUF_VERIFY(db); ++ db->db_buf = NULL; ++ dbuf_evict(db); ++ } else { ++ mutex_exit(&db->db_mtx); ++ dbuf_destroy(db); ++ } ++ return (0); ++} ++ ++static void ++dbuf_destroy(dmu_buf_impl_t *db) ++{ ++ ASSERT(refcount_is_zero(&db->db_holds)); ++ ++ if (db->db_blkid != DMU_BONUS_BLKID) { ++ /* ++ * If this dbuf is still on the dn_dbufs list, ++ * remove it from that list. ++ */ ++ if (db->db_dnode_handle != NULL) { ++ dnode_t *dn; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ mutex_enter(&dn->dn_dbufs_mtx); ++ list_remove(&dn->dn_dbufs, db); ++ (void) atomic_dec_32_nv(&dn->dn_dbufs_count); ++ mutex_exit(&dn->dn_dbufs_mtx); ++ DB_DNODE_EXIT(db); ++ /* ++ * Decrementing the dbuf count means that the hold ++ * corresponding to the removed dbuf is no longer ++ * discounted in dnode_move(), so the dnode cannot be ++ * moved until after we release the hold. ++ */ ++ dnode_rele(dn, db); ++ db->db_dnode_handle = NULL; ++ } ++ dbuf_hash_remove(db); ++ } ++ db->db_parent = NULL; ++ db->db_buf = NULL; ++ ++ ASSERT(!list_link_active(&db->db_link)); ++ ASSERT(db->db.db_data == NULL); ++ ASSERT(db->db_hash_next == NULL); ++ ASSERT(db->db_blkptr == NULL); ++ ASSERT(db->db_data_pending == NULL); ++ ++ kmem_cache_free(dbuf_cache, db); ++ arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); ++} ++ ++void ++dbuf_prefetch(dnode_t *dn, uint64_t blkid) ++{ ++ dmu_buf_impl_t *db = NULL; ++ blkptr_t *bp = NULL; ++ ++ ASSERT(blkid != DMU_BONUS_BLKID); ++ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ++ ++ if (dnode_block_freed(dn, blkid)) ++ return; ++ ++ /* dbuf_find() returns with db_mtx held */ ++ if ((db = dbuf_find(dn, 0, blkid))) { ++ /* ++ * This dbuf is already in the cache. We assume that ++ * it is already CACHED, or else about to be either ++ * read or filled. ++ */ ++ mutex_exit(&db->db_mtx); ++ return; ++ } ++ ++ if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) { ++ if (bp && !BP_IS_HOLE(bp)) { ++ int priority = dn->dn_type == DMU_OT_DDT_ZAP ? 
++ ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ; ++ arc_buf_t *pbuf; ++ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; ++ uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; ++ zbookmark_t zb; ++ ++ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ++ dn->dn_object, 0, blkid); ++ ++ if (db) ++ pbuf = db->db_buf; ++ else ++ pbuf = dn->dn_objset->os_phys_buf; ++ ++ (void) dsl_read(NULL, dn->dn_objset->os_spa, ++ bp, pbuf, NULL, NULL, priority, ++ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, ++ &aflags, &zb); ++ } ++ if (db) ++ dbuf_rele(db, NULL); ++ } ++} ++ ++#define DBUF_HOLD_IMPL_MAX_DEPTH 20 ++ ++/* ++ * Returns with db_holds incremented, and db_mtx not held. ++ * Note: dn_struct_rwlock must be held. ++ */ ++static int ++__dbuf_hold_impl(struct dbuf_hold_impl_data *dh) ++{ ++ ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH); ++ dh->dh_parent = NULL; ++ ++ ASSERT(dh->dh_blkid != DMU_BONUS_BLKID); ++ ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock)); ++ ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level); ++ ++ *(dh->dh_dbp) = NULL; ++top: ++ /* dbuf_find() returns with db_mtx held */ ++ dh->dh_db = dbuf_find(dh->dh_dn, dh->dh_level, dh->dh_blkid); ++ ++ if (dh->dh_db == NULL) { ++ dh->dh_bp = NULL; ++ ++ ASSERT3P(dh->dh_parent, ==, NULL); ++ dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, ++ dh->dh_fail_sparse, &dh->dh_parent, ++ &dh->dh_bp, dh); ++ if (dh->dh_fail_sparse) { ++ if (dh->dh_err == 0 && dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) ++ dh->dh_err = ENOENT; ++ if (dh->dh_err) { ++ if (dh->dh_parent) ++ dbuf_rele(dh->dh_parent, NULL); ++ return (dh->dh_err); ++ } ++ } ++ if (dh->dh_err && dh->dh_err != ENOENT) ++ return (dh->dh_err); ++ dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid, ++ dh->dh_parent, dh->dh_bp); ++ } ++ ++ if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) { ++ arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db); ++ if (dh->dh_db->db_buf->b_data == NULL) { ++ dbuf_clear(dh->dh_db); ++ if (dh->dh_parent) { ++ dbuf_rele(dh->dh_parent, NULL); ++ dh->dh_parent = NULL; ++ } ++ goto top; ++ } ++ ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data); ++ } ++ ++ ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf)); ++ ++ /* ++ * If this buffer is currently syncing out, and we are are ++ * still referencing it from db_data, we need to make a copy ++ * of it in case we decide we want to dirty it again in this txg. 
++ */ ++ if (dh->dh_db->db_level == 0 && ++ dh->dh_db->db_blkid != DMU_BONUS_BLKID && ++ dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT && ++ dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) { ++ dh->dh_dr = dh->dh_db->db_data_pending; ++ ++ if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) { ++ dh->dh_type = DBUF_GET_BUFC_TYPE(dh->dh_db); ++ ++ dbuf_set_data(dh->dh_db, ++ arc_buf_alloc(dh->dh_dn->dn_objset->os_spa, ++ dh->dh_db->db.db_size, dh->dh_db, dh->dh_type)); ++ bcopy(dh->dh_dr->dt.dl.dr_data->b_data, ++ dh->dh_db->db.db_data, dh->dh_db->db.db_size); ++ } ++ } ++ ++ (void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag); ++ dbuf_update_data(dh->dh_db); ++ DBUF_VERIFY(dh->dh_db); ++ mutex_exit(&dh->dh_db->db_mtx); ++ ++ /* NOTE: we can't rele the parent until after we drop the db_mtx */ ++ if (dh->dh_parent) ++ dbuf_rele(dh->dh_parent, NULL); ++ ++ ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn); ++ ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid); ++ ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level); ++ *(dh->dh_dbp) = dh->dh_db; ++ ++ return (0); ++} ++ ++/* ++ * The following code preserves the recursive function dbuf_hold_impl() ++ * but moves the local variables AND function arguments to the heap to ++ * minimize the stack frame size. Enough space is initially allocated ++ * on the stack for 20 levels of recursion. ++ */ ++int ++dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, ++ void *tag, dmu_buf_impl_t **dbp) ++{ ++ struct dbuf_hold_impl_data *dh; ++ int error; ++ ++ dh = kmem_zalloc(sizeof(struct dbuf_hold_impl_data) * ++ DBUF_HOLD_IMPL_MAX_DEPTH, KM_PUSHPAGE); ++ __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0); ++ ++ error = __dbuf_hold_impl(dh); ++ ++ kmem_free(dh, sizeof(struct dbuf_hold_impl_data) * ++ DBUF_HOLD_IMPL_MAX_DEPTH); ++ ++ return (error); ++} ++ ++static void ++__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, ++ dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, ++ void *tag, dmu_buf_impl_t **dbp, int depth) ++{ ++ dh->dh_dn = dn; ++ dh->dh_level = level; ++ dh->dh_blkid = blkid; ++ dh->dh_fail_sparse = fail_sparse; ++ dh->dh_tag = tag; ++ dh->dh_dbp = dbp; ++ dh->dh_depth = depth; ++} ++ ++dmu_buf_impl_t * ++dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) ++{ ++ dmu_buf_impl_t *db; ++ int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); ++ return (err ? NULL : db); ++} ++ ++dmu_buf_impl_t * ++dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) ++{ ++ dmu_buf_impl_t *db; ++ int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); ++ return (err ? 
NULL : db); ++} ++ ++void ++dbuf_create_bonus(dnode_t *dn) ++{ ++ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ++ ++ ASSERT(dn->dn_bonus == NULL); ++ dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); ++} ++ ++int ++dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ++ dnode_t *dn; ++ ++ if (db->db_blkid != DMU_SPILL_BLKID) ++ return (ENOTSUP); ++ if (blksz == 0) ++ blksz = SPA_MINBLOCKSIZE; ++ if (blksz > SPA_MAXBLOCKSIZE) ++ blksz = SPA_MAXBLOCKSIZE; ++ else ++ blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ++ dbuf_new_size(db, blksz, tx); ++ rw_exit(&dn->dn_struct_rwlock); ++ DB_DNODE_EXIT(db); ++ ++ return (0); ++} ++ ++void ++dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) ++{ ++ dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); ++} ++ ++#pragma weak dmu_buf_add_ref = dbuf_add_ref ++void ++dbuf_add_ref(dmu_buf_impl_t *db, void *tag) ++{ ++ VERIFY(refcount_add(&db->db_holds, tag) > 1); ++} ++ ++/* ++ * If you call dbuf_rele() you had better not be referencing the dnode handle ++ * unless you have some other direct or indirect hold on the dnode. (An indirect ++ * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) ++ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the ++ * dnode's parent dbuf evicting its dnode handles. ++ */ ++#pragma weak dmu_buf_rele = dbuf_rele ++void ++dbuf_rele(dmu_buf_impl_t *db, void *tag) ++{ ++ mutex_enter(&db->db_mtx); ++ dbuf_rele_and_unlock(db, tag); ++} ++ ++/* ++ * dbuf_rele() for an already-locked dbuf. This is necessary to allow ++ * db_dirtycnt and db_holds to be updated atomically. ++ */ ++void ++dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) ++{ ++ int64_t holds; ++ ++ ASSERT(MUTEX_HELD(&db->db_mtx)); ++ DBUF_VERIFY(db); ++ ++ /* ++ * Remove the reference to the dbuf before removing its hold on the ++ * dnode so we can guarantee in dnode_move() that a referenced bonus ++ * buffer has a corresponding dnode hold. ++ */ ++ holds = refcount_remove(&db->db_holds, tag); ++ ASSERT(holds >= 0); ++ ++ /* ++ * We can't freeze indirects if there is a possibility that they ++ * may be modified in the current syncing context. ++ */ ++ if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) ++ arc_buf_freeze(db->db_buf); ++ ++ if (holds == db->db_dirtycnt && ++ db->db_level == 0 && db->db_immediate_evict) ++ dbuf_evict_user(db); ++ ++ if (holds == 0) { ++ if (db->db_blkid == DMU_BONUS_BLKID) { ++ mutex_exit(&db->db_mtx); ++ ++ /* ++ * If the dnode moves here, we cannot cross this barrier ++ * until the move completes. ++ */ ++ DB_DNODE_ENTER(db); ++ (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count); ++ DB_DNODE_EXIT(db); ++ /* ++ * The bonus buffer's dnode hold is no longer discounted ++ * in dnode_move(). The dnode cannot move until after ++ * the dnode_rele(). ++ */ ++ dnode_rele(DB_DNODE(db), db); ++ } else if (db->db_buf == NULL) { ++ /* ++ * This is a special case: we never associated this ++ * dbuf with any data allocated from the ARC. ++ */ ++ ASSERT(db->db_state == DB_UNCACHED || ++ db->db_state == DB_NOFILL); ++ dbuf_evict(db); ++ } else if (arc_released(db->db_buf)) { ++ arc_buf_t *buf = db->db_buf; ++ /* ++ * This dbuf has anonymous data associated with it. 
++ */ ++ dbuf_set_data(db, NULL); ++ VERIFY(arc_buf_remove_ref(buf, db) == 1); ++ dbuf_evict(db); ++ } else { ++ VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); ++ if (!DBUF_IS_CACHEABLE(db)) ++ dbuf_clear(db); ++ else ++ mutex_exit(&db->db_mtx); ++ } ++ } else { ++ mutex_exit(&db->db_mtx); ++ } ++} ++ ++#pragma weak dmu_buf_refcount = dbuf_refcount ++uint64_t ++dbuf_refcount(dmu_buf_impl_t *db) ++{ ++ return (refcount_count(&db->db_holds)); ++} ++ ++void * ++dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, ++ dmu_buf_evict_func_t *evict_func) ++{ ++ return (dmu_buf_update_user(db_fake, NULL, user_ptr, ++ user_data_ptr_ptr, evict_func)); ++} ++ ++void * ++dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, ++ dmu_buf_evict_func_t *evict_func) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ++ ++ db->db_immediate_evict = TRUE; ++ return (dmu_buf_update_user(db_fake, NULL, user_ptr, ++ user_data_ptr_ptr, evict_func)); ++} ++ ++void * ++dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, ++ void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ++ ASSERT(db->db_level == 0); ++ ++ ASSERT((user_ptr == NULL) == (evict_func == NULL)); ++ ++ mutex_enter(&db->db_mtx); ++ ++ if (db->db_user_ptr == old_user_ptr) { ++ db->db_user_ptr = user_ptr; ++ db->db_user_data_ptr_ptr = user_data_ptr_ptr; ++ db->db_evict_func = evict_func; ++ ++ dbuf_update_data(db); ++ } else { ++ old_user_ptr = db->db_user_ptr; ++ } ++ ++ mutex_exit(&db->db_mtx); ++ return (old_user_ptr); ++} ++ ++void * ++dmu_buf_get_user(dmu_buf_t *db_fake) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ++ ASSERT(!refcount_is_zero(&db->db_holds)); ++ ++ return (db->db_user_ptr); ++} ++ ++boolean_t ++dmu_buf_freeable(dmu_buf_t *dbuf) ++{ ++ boolean_t res = B_FALSE; ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; ++ ++ if (db->db_blkptr) ++ res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, ++ db->db_blkptr, db->db_blkptr->blk_birth); ++ ++ return (res); ++} ++ ++static void ++dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) ++{ ++ /* ASSERT(dmu_tx_is_syncing(tx) */ ++ ASSERT(MUTEX_HELD(&db->db_mtx)); ++ ++ if (db->db_blkptr != NULL) ++ return; ++ ++ if (db->db_blkid == DMU_SPILL_BLKID) { ++ db->db_blkptr = &dn->dn_phys->dn_spill; ++ BP_ZERO(db->db_blkptr); ++ return; ++ } ++ if (db->db_level == dn->dn_phys->dn_nlevels-1) { ++ /* ++ * This buffer was allocated at a time when there was ++ * no available blkptrs from the dnode, or it was ++ * inappropriate to hook it in (i.e., nlevels mis-match). 
++ */ ++ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); ++ ASSERT(db->db_parent == NULL); ++ db->db_parent = dn->dn_dbuf; ++ db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; ++ DBUF_VERIFY(db); ++ } else { ++ dmu_buf_impl_t *parent = db->db_parent; ++ int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ++ ++ ASSERT(dn->dn_phys->dn_nlevels > 1); ++ if (parent == NULL) { ++ mutex_exit(&db->db_mtx); ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ (void) dbuf_hold_impl(dn, db->db_level+1, ++ db->db_blkid >> epbs, FALSE, db, &parent); ++ rw_exit(&dn->dn_struct_rwlock); ++ mutex_enter(&db->db_mtx); ++ db->db_parent = parent; ++ } ++ db->db_blkptr = (blkptr_t *)parent->db.db_data + ++ (db->db_blkid & ((1ULL << epbs) - 1)); ++ DBUF_VERIFY(db); ++ } ++} ++ ++/* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it ++ * is critical the we not allow the compiler to inline this function in to ++ * dbuf_sync_list() thereby drastically bloating the stack usage. ++ */ ++noinline static void ++dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ++{ ++ dmu_buf_impl_t *db = dr->dr_dbuf; ++ dnode_t *dn; ++ zio_t *zio; ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ++ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); ++ ++ mutex_enter(&db->db_mtx); ++ ++ ASSERT(db->db_level > 0); ++ DBUF_VERIFY(db); ++ ++ if (db->db_buf == NULL) { ++ mutex_exit(&db->db_mtx); ++ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); ++ mutex_enter(&db->db_mtx); ++ } ++ ASSERT3U(db->db_state, ==, DB_CACHED); ++ ASSERT(db->db_buf != NULL); ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); ++ dbuf_check_blkptr(dn, db); ++ DB_DNODE_EXIT(db); ++ ++ db->db_data_pending = dr; ++ ++ mutex_exit(&db->db_mtx); ++ dbuf_write(dr, db->db_buf, tx); ++ ++ zio = dr->dr_zio; ++ mutex_enter(&dr->dt.di.dr_mtx); ++ dbuf_sync_list(&dr->dt.di.dr_children, tx); ++ ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ++ mutex_exit(&dr->dt.di.dr_mtx); ++ zio_nowait(zio); ++} ++ ++/* dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is ++ * critical the we not allow the compiler to inline this function in to ++ * dbuf_sync_list() thereby drastically bloating the stack usage. ++ */ ++noinline static void ++dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ++{ ++ arc_buf_t **datap = &dr->dt.dl.dr_data; ++ dmu_buf_impl_t *db = dr->dr_dbuf; ++ dnode_t *dn; ++ objset_t *os; ++ uint64_t txg = tx->tx_txg; ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ++ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); ++ ++ mutex_enter(&db->db_mtx); ++ /* ++ * To be synced, we must be dirtied. But we ++ * might have been freed after the dirty. ++ */ ++ if (db->db_state == DB_UNCACHED) { ++ /* This buffer has been freed since it was dirtied */ ++ ASSERT(db->db.db_data == NULL); ++ } else if (db->db_state == DB_FILL) { ++ /* This buffer was freed and is now being re-filled */ ++ ASSERT(db->db.db_data != dr->dt.dl.dr_data); ++ } else { ++ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); ++ } ++ DBUF_VERIFY(db); ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ ++ if (db->db_blkid == DMU_SPILL_BLKID) { ++ mutex_enter(&dn->dn_mtx); ++ dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; ++ mutex_exit(&dn->dn_mtx); ++ } ++ ++ /* ++ * If this is a bonus buffer, simply copy the bonus data into the ++ * dnode. It will be written out when the dnode is synced (and it ++ * will be synced, since it must have been dirty for dbuf_sync to ++ * be called). 
++ */ ++ if (db->db_blkid == DMU_BONUS_BLKID) { ++ dbuf_dirty_record_t **drp; ++ ++ ASSERT(*datap != NULL); ++ ASSERT3U(db->db_level, ==, 0); ++ ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); ++ bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); ++ DB_DNODE_EXIT(db); ++ ++ if (*datap != db->db.db_data) { ++ zio_buf_free(*datap, DN_MAX_BONUSLEN); ++ arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); ++ } ++ db->db_data_pending = NULL; ++ drp = &db->db_last_dirty; ++ while (*drp != dr) ++ drp = &(*drp)->dr_next; ++ ASSERT(dr->dr_next == NULL); ++ ASSERT(dr->dr_dbuf == db); ++ *drp = dr->dr_next; ++ if (dr->dr_dbuf->db_level != 0) { ++ mutex_destroy(&dr->dt.di.dr_mtx); ++ list_destroy(&dr->dt.di.dr_children); ++ } ++ kmem_free(dr, sizeof (dbuf_dirty_record_t)); ++ ASSERT(db->db_dirtycnt > 0); ++ db->db_dirtycnt -= 1; ++ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); ++ return; ++ } ++ ++ os = dn->dn_objset; ++ ++ /* ++ * This function may have dropped the db_mtx lock allowing a dmu_sync ++ * operation to sneak in. As a result, we need to ensure that we ++ * don't check the dr_override_state until we have returned from ++ * dbuf_check_blkptr. ++ */ ++ dbuf_check_blkptr(dn, db); ++ ++ /* ++ * If this buffer is in the middle of an immediate write, ++ * wait for the synchronous IO to complete. ++ */ ++ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ++ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ++ cv_wait(&db->db_changed, &db->db_mtx); ++ ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); ++ } ++ ++ if (db->db_state != DB_NOFILL && ++ dn->dn_object != DMU_META_DNODE_OBJECT && ++ refcount_count(&db->db_holds) > 1 && ++ dr->dt.dl.dr_override_state != DR_OVERRIDDEN && ++ *datap == db->db_buf) { ++ /* ++ * If this buffer is currently "in use" (i.e., there ++ * are active holds and db_data still references it), ++ * then make a copy before we start the write so that ++ * any modifications from the open txg will not leak ++ * into this write. ++ * ++ * NOTE: this copy does not need to be made for ++ * objects only modified in the syncing context (e.g. ++ * DNONE_DNODE blocks). ++ */ ++ int blksz = arc_buf_size(*datap); ++ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); ++ *datap = arc_buf_alloc(os->os_spa, blksz, db, type); ++ bcopy(db->db.db_data, (*datap)->b_data, blksz); ++ } ++ db->db_data_pending = dr; ++ ++ mutex_exit(&db->db_mtx); ++ ++ dbuf_write(dr, *datap, tx); ++ ++ ASSERT(!list_link_active(&dr->dr_dirty_node)); ++ if (dn->dn_object == DMU_META_DNODE_OBJECT) { ++ list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); ++ DB_DNODE_EXIT(db); ++ } else { ++ /* ++ * Although zio_nowait() does not "wait for an IO", it does ++ * initiate the IO. If this is an empty write it seems plausible ++ * that the IO could actually be completed before the nowait ++ * returns. We need to DB_DNODE_EXIT() first in case ++ * zio_nowait() invalidates the dbuf. ++ */ ++ DB_DNODE_EXIT(db); ++ zio_nowait(dr->dr_zio); ++ } ++} ++ ++void ++dbuf_sync_list(list_t *list, dmu_tx_t *tx) ++{ ++ dbuf_dirty_record_t *dr; ++ ++ while ((dr = list_head(list))) { ++ if (dr->dr_zio != NULL) { ++ /* ++ * If we find an already initialized zio then we ++ * are processing the meta-dnode, and we have finished. ++ * The dbufs for all dnodes are put back on the list ++ * during processing, so that we can zio_wait() ++ * these IOs after initiating all child IOs. 
++ */ ++ ASSERT3U(dr->dr_dbuf->db.db_object, ==, ++ DMU_META_DNODE_OBJECT); ++ break; ++ } ++ list_remove(list, dr); ++ if (dr->dr_dbuf->db_level > 0) ++ dbuf_sync_indirect(dr, tx); ++ else ++ dbuf_sync_leaf(dr, tx); ++ } ++} ++ ++/* ARGSUSED */ ++static void ++dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) ++{ ++ dmu_buf_impl_t *db = vdb; ++ dnode_t *dn; ++ blkptr_t *bp = zio->io_bp; ++ blkptr_t *bp_orig = &zio->io_bp_orig; ++ spa_t *spa = zio->io_spa; ++ int64_t delta; ++ uint64_t fill = 0; ++ int i; ++ ++ ASSERT(db->db_blkptr == bp); ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); ++ dnode_diduse_space(dn, delta - zio->io_prev_space_delta); ++ zio->io_prev_space_delta = delta; ++ ++ if (BP_IS_HOLE(bp)) { ++ ASSERT(bp->blk_fill == 0); ++ DB_DNODE_EXIT(db); ++ return; ++ } ++ ++ ASSERT((db->db_blkid != DMU_SPILL_BLKID && ++ BP_GET_TYPE(bp) == dn->dn_type) || ++ (db->db_blkid == DMU_SPILL_BLKID && ++ BP_GET_TYPE(bp) == dn->dn_bonustype)); ++ ASSERT(BP_GET_LEVEL(bp) == db->db_level); ++ ++ mutex_enter(&db->db_mtx); ++ ++#ifdef ZFS_DEBUG ++ if (db->db_blkid == DMU_SPILL_BLKID) { ++ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ++ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && ++ db->db_blkptr == &dn->dn_phys->dn_spill); ++ } ++#endif ++ ++ if (db->db_level == 0) { ++ mutex_enter(&dn->dn_mtx); ++ if (db->db_blkid > dn->dn_phys->dn_maxblkid && ++ db->db_blkid != DMU_SPILL_BLKID) ++ dn->dn_phys->dn_maxblkid = db->db_blkid; ++ mutex_exit(&dn->dn_mtx); ++ ++ if (dn->dn_type == DMU_OT_DNODE) { ++ dnode_phys_t *dnp = db->db.db_data; ++ for (i = db->db.db_size >> DNODE_SHIFT; i > 0; ++ i--, dnp++) { ++ if (dnp->dn_type != DMU_OT_NONE) ++ fill++; ++ } ++ } else { ++ fill = 1; ++ } ++ } else { ++ blkptr_t *ibp = db->db.db_data; ++ ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); ++ for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { ++ if (BP_IS_HOLE(ibp)) ++ continue; ++ fill += ibp->blk_fill; ++ } ++ } ++ DB_DNODE_EXIT(db); ++ ++ bp->blk_fill = fill; ++ ++ mutex_exit(&db->db_mtx); ++} ++ ++/* ARGSUSED */ ++static void ++dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ++{ ++ dmu_buf_impl_t *db = vdb; ++ blkptr_t *bp = zio->io_bp; ++ blkptr_t *bp_orig = &zio->io_bp_orig; ++ uint64_t txg = zio->io_txg; ++ dbuf_dirty_record_t **drp, *dr; ++ ++ ASSERT3U(zio->io_error, ==, 0); ++ ASSERT(db->db_blkptr == bp); ++ ++ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ++ ASSERT(BP_EQUAL(bp, bp_orig)); ++ } else { ++ objset_t *os; ++ dsl_dataset_t *ds; ++ dmu_tx_t *tx; ++ ++ DB_GET_OBJSET(&os, db); ++ ds = os->os_dsl_dataset; ++ tx = os->os_synctx; ++ ++ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); ++ dsl_dataset_block_born(ds, bp, tx); ++ } ++ ++ mutex_enter(&db->db_mtx); ++ ++ DBUF_VERIFY(db); ++ ++ drp = &db->db_last_dirty; ++ while ((dr = *drp) != db->db_data_pending) ++ drp = &dr->dr_next; ++ ASSERT(!list_link_active(&dr->dr_dirty_node)); ++ ASSERT(dr->dr_txg == txg); ++ ASSERT(dr->dr_dbuf == db); ++ ASSERT(dr->dr_next == NULL); ++ *drp = dr->dr_next; ++ ++#ifdef ZFS_DEBUG ++ if (db->db_blkid == DMU_SPILL_BLKID) { ++ dnode_t *dn; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ++ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && ++ db->db_blkptr == &dn->dn_phys->dn_spill); ++ DB_DNODE_EXIT(db); ++ } ++#endif ++ ++ if (db->db_level == 0) { ++ ASSERT(db->db_blkid != DMU_BONUS_BLKID); ++ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 
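/*
 * Illustrative sketch, not part of this patch: dbuf_write_ready() above
 * recomputes the written block's fill count -- for an indirect block it
 * sums the blk_fill of every non-hole child block pointer, and for a
 * level-0 dnode block it counts the allocated dnode entries.  The toy
 * function below shows only the indirect-block case; sk_blkptr_t and
 * SK_IS_HOLE are hypothetical stand-ins for blkptr_t and BP_IS_HOLE.
 */
#include <stdint.h>
#include <stddef.h>

typedef struct sk_blkptr {
	uint64_t	skb_birth;	/* 0 means this pointer is a hole */
	uint64_t	skb_fill;	/* leaf blocks referenced below it */
} sk_blkptr_t;

#define	SK_IS_HOLE(bp)	((bp)->skb_birth == 0)

/* Sum the fill counts of all non-hole children of one indirect block. */
static uint64_t
sk_indirect_fill(const sk_blkptr_t *ibp, size_t nptrs)
{
	uint64_t fill = 0;
	size_t i;

	for (i = 0; i < nptrs; i++) {
		if (SK_IS_HOLE(&ibp[i]))
			continue;
		fill += ibp[i].skb_fill;
	}
	return (fill);
}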
++ if (db->db_state != DB_NOFILL) { ++ if (dr->dt.dl.dr_data != db->db_buf) ++ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, ++ db) == 1); ++ else if (!arc_released(db->db_buf)) ++ arc_set_callback(db->db_buf, dbuf_do_evict, db); ++ } ++ } else { ++ dnode_t *dn; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ++ ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); ++ if (!BP_IS_HOLE(db->db_blkptr)) { ++ ASSERTV(int epbs = dn->dn_phys->dn_indblkshift - ++ SPA_BLKPTRSHIFT); ++ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, ++ db->db.db_size); ++ ASSERT3U(dn->dn_phys->dn_maxblkid ++ >> (db->db_level * epbs), >=, db->db_blkid); ++ arc_set_callback(db->db_buf, dbuf_do_evict, db); ++ } ++ DB_DNODE_EXIT(db); ++ mutex_destroy(&dr->dt.di.dr_mtx); ++ list_destroy(&dr->dt.di.dr_children); ++ } ++ kmem_free(dr, sizeof (dbuf_dirty_record_t)); ++ ++ cv_broadcast(&db->db_changed); ++ ASSERT(db->db_dirtycnt > 0); ++ db->db_dirtycnt -= 1; ++ db->db_data_pending = NULL; ++ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); ++} ++ ++static void ++dbuf_write_nofill_ready(zio_t *zio) ++{ ++ dbuf_write_ready(zio, NULL, zio->io_private); ++} ++ ++static void ++dbuf_write_nofill_done(zio_t *zio) ++{ ++ dbuf_write_done(zio, NULL, zio->io_private); ++} ++ ++static void ++dbuf_write_override_ready(zio_t *zio) ++{ ++ dbuf_dirty_record_t *dr = zio->io_private; ++ dmu_buf_impl_t *db = dr->dr_dbuf; ++ ++ dbuf_write_ready(zio, NULL, db); ++} ++ ++static void ++dbuf_write_override_done(zio_t *zio) ++{ ++ dbuf_dirty_record_t *dr = zio->io_private; ++ dmu_buf_impl_t *db = dr->dr_dbuf; ++ blkptr_t *obp = &dr->dt.dl.dr_overridden_by; ++ ++ mutex_enter(&db->db_mtx); ++ if (!BP_EQUAL(zio->io_bp, obp)) { ++ if (!BP_IS_HOLE(obp)) ++ dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); ++ arc_release(dr->dt.dl.dr_data, db); ++ } ++ mutex_exit(&db->db_mtx); ++ ++ dbuf_write_done(zio, NULL, db); ++} ++ ++static void ++dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) ++{ ++ dmu_buf_impl_t *db = dr->dr_dbuf; ++ dnode_t *dn; ++ objset_t *os; ++ dmu_buf_impl_t *parent = db->db_parent; ++ uint64_t txg = tx->tx_txg; ++ zbookmark_t zb; ++ zio_prop_t zp; ++ zio_t *zio; ++ int wp_flag = 0; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ os = dn->dn_objset; ++ ++ if (db->db_state != DB_NOFILL) { ++ if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { ++ /* ++ * Private object buffers are released here rather ++ * than in dbuf_dirty() since they are only modified ++ * in the syncing context and we don't want the ++ * overhead of making multiple copies of the data. ++ */ ++ if (BP_IS_HOLE(db->db_blkptr)) { ++ arc_buf_thaw(data); ++ } else { ++ dbuf_release_bp(db); ++ } ++ } ++ } ++ ++ if (parent != dn->dn_dbuf) { ++ ASSERT(parent && parent->db_data_pending); ++ ASSERT(db->db_level == parent->db_level-1); ++ ASSERT(arc_released(parent->db_buf)); ++ zio = parent->db_data_pending->dr_zio; ++ } else { ++ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && ++ db->db_blkid != DMU_SPILL_BLKID) || ++ (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); ++ if (db->db_blkid != DMU_SPILL_BLKID) ++ ASSERT3P(db->db_blkptr, ==, ++ &dn->dn_phys->dn_blkptr[db->db_blkid]); ++ zio = dn->dn_zio; ++ } ++ ++ ASSERT(db->db_level == 0 || data == db->db_buf); ++ ASSERT3U(db->db_blkptr->blk_birth, <=, txg); ++ ASSERT(zio); ++ ++ SET_BOOKMARK(&zb, os->os_dsl_dataset ? 
++ os->os_dsl_dataset->ds_object : DMU_META_OBJSET, ++ db->db.db_object, db->db_level, db->db_blkid); ++ ++ if (db->db_blkid == DMU_SPILL_BLKID) ++ wp_flag = WP_SPILL; ++ wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; ++ ++ dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); ++ DB_DNODE_EXIT(db); ++ ++ if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { ++ ASSERT(db->db_state != DB_NOFILL); ++ dr->dr_zio = zio_write(zio, os->os_spa, txg, ++ db->db_blkptr, data->b_data, arc_buf_size(data), &zp, ++ dbuf_write_override_ready, dbuf_write_override_done, dr, ++ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); ++ mutex_enter(&db->db_mtx); ++ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; ++ zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, ++ dr->dt.dl.dr_copies); ++ mutex_exit(&db->db_mtx); ++ } else if (db->db_state == DB_NOFILL) { ++ ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); ++ dr->dr_zio = zio_write(zio, os->os_spa, txg, ++ db->db_blkptr, NULL, db->db.db_size, &zp, ++ dbuf_write_nofill_ready, dbuf_write_nofill_done, db, ++ ZIO_PRIORITY_ASYNC_WRITE, ++ ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); ++ } else { ++ ASSERT(arc_released(data)); ++ dr->dr_zio = arc_write(zio, os->os_spa, txg, ++ db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp, ++ dbuf_write_ready, dbuf_write_done, db, ++ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); ++ } ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(dbuf_find); ++EXPORT_SYMBOL(dbuf_is_metadata); ++EXPORT_SYMBOL(dbuf_evict); ++EXPORT_SYMBOL(dbuf_loan_arcbuf); ++EXPORT_SYMBOL(dbuf_whichblock); ++EXPORT_SYMBOL(dbuf_read); ++EXPORT_SYMBOL(dbuf_unoverride); ++EXPORT_SYMBOL(dbuf_free_range); ++EXPORT_SYMBOL(dbuf_new_size); ++EXPORT_SYMBOL(dbuf_release_bp); ++EXPORT_SYMBOL(dbuf_dirty); ++EXPORT_SYMBOL(dmu_buf_will_dirty); ++EXPORT_SYMBOL(dmu_buf_will_not_fill); ++EXPORT_SYMBOL(dmu_buf_will_fill); ++EXPORT_SYMBOL(dmu_buf_fill_done); ++EXPORT_SYMBOL(dmu_buf_rele); ++EXPORT_SYMBOL(dbuf_assign_arcbuf); ++EXPORT_SYMBOL(dbuf_clear); ++EXPORT_SYMBOL(dbuf_prefetch); ++EXPORT_SYMBOL(dbuf_hold_impl); ++EXPORT_SYMBOL(dbuf_hold); ++EXPORT_SYMBOL(dbuf_hold_level); ++EXPORT_SYMBOL(dbuf_create_bonus); ++EXPORT_SYMBOL(dbuf_spill_set_blksz); ++EXPORT_SYMBOL(dbuf_rm_spill); ++EXPORT_SYMBOL(dbuf_add_ref); ++EXPORT_SYMBOL(dbuf_rele); ++EXPORT_SYMBOL(dbuf_rele_and_unlock); ++EXPORT_SYMBOL(dbuf_refcount); ++EXPORT_SYMBOL(dbuf_sync_list); ++EXPORT_SYMBOL(dmu_buf_set_user); ++EXPORT_SYMBOL(dmu_buf_set_user_ie); ++EXPORT_SYMBOL(dmu_buf_update_user); ++EXPORT_SYMBOL(dmu_buf_get_user); ++EXPORT_SYMBOL(dmu_buf_freeable); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/ddt.c linux-3.2.33-go/fs/zfs/zfs/ddt.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/ddt.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/ddt.c 2012-11-16 23:25:34.349039334 +0100 +@@ -0,0 +1,1213 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Enable/disable prefetching of dedup-ed blocks which are going to be freed. ++ */ ++int zfs_dedup_prefetch = 1; ++ ++static const ddt_ops_t *ddt_ops[DDT_TYPES] = { ++ &ddt_zap_ops, ++}; ++ ++static const char *ddt_class_name[DDT_CLASSES] = { ++ "ditto", ++ "duplicate", ++ "unique", ++}; ++ ++static void ++ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ++ dmu_tx_t *tx) ++{ ++ spa_t *spa = ddt->ddt_spa; ++ objset_t *os = ddt->ddt_os; ++ uint64_t *objectp = &ddt->ddt_object[type][class]; ++ boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; ++ char name[DDT_NAMELEN]; ++ ++ ddt_object_name(ddt, type, class, name); ++ ++ ASSERT(*objectp == 0); ++ VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); ++ ASSERT(*objectp != 0); ++ ++ VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, ++ sizeof (uint64_t), 1, objectp, tx) == 0); ++ ++ VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, ++ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), ++ &ddt->ddt_histogram[type][class], tx) == 0); ++} ++ ++static void ++ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ++ dmu_tx_t *tx) ++{ ++ spa_t *spa = ddt->ddt_spa; ++ objset_t *os = ddt->ddt_os; ++ uint64_t *objectp = &ddt->ddt_object[type][class]; ++ uint64_t count; ++ char name[DDT_NAMELEN]; ++ ++ ddt_object_name(ddt, type, class, name); ++ ++ ASSERT(*objectp != 0); ++ ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); ++ VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0); ++ VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); ++ VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); ++ VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); ++ bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); ++ ++ *objectp = 0; ++} ++ ++static int ++ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) ++{ ++ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; ++ dmu_object_info_t doi; ++ uint64_t count; ++ char name[DDT_NAMELEN]; ++ int error; ++ ++ ddt_object_name(ddt, type, class, name); ++ ++ error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, ++ sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); ++ ++ if (error) ++ return (error); ++ ++ error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, ++ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), ++ &ddt->ddt_histogram[type][class]); ++ ++ /* ++ * Seed the cached statistics. 
++ */ ++ error = ddt_object_info(ddt, type, class, &doi); ++ if (error) ++ return (error); ++ ++ error = ddt_object_count(ddt, type, class, &count); ++ if (error) ++ return (error); ++ ++ ddo->ddo_count = count; ++ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; ++ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; ++ ++ ASSERT(error == 0); ++ return (error); ++} ++ ++static void ++ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ++ dmu_tx_t *tx) ++{ ++ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; ++ dmu_object_info_t doi; ++ uint64_t count; ++ char name[DDT_NAMELEN]; ++ ++ ddt_object_name(ddt, type, class, name); ++ ++ VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, ++ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), ++ &ddt->ddt_histogram[type][class], tx) == 0); ++ ++ /* ++ * Cache DDT statistics; this is the only time they'll change. ++ */ ++ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); ++ VERIFY(ddt_object_count(ddt, type, class, &count) == 0); ++ ++ ddo->ddo_count = count; ++ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; ++ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; ++} ++ ++static int ++ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ++ ddt_entry_t *dde) ++{ ++ if (!ddt_object_exists(ddt, type, class)) ++ return (ENOENT); ++ ++ return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, ++ ddt->ddt_object[type][class], dde)); ++} ++ ++static void ++ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ++ ddt_entry_t *dde) ++{ ++ if (!ddt_object_exists(ddt, type, class)) ++ return; ++ ++ ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, ++ ddt->ddt_object[type][class], dde); ++} ++ ++int ++ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ++ ddt_entry_t *dde, dmu_tx_t *tx) ++{ ++ ASSERT(ddt_object_exists(ddt, type, class)); ++ ++ return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, ++ ddt->ddt_object[type][class], dde, tx)); ++} ++ ++static int ++ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ++ ddt_entry_t *dde, dmu_tx_t *tx) ++{ ++ ASSERT(ddt_object_exists(ddt, type, class)); ++ ++ return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, ++ ddt->ddt_object[type][class], dde, tx)); ++} ++ ++int ++ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ++ uint64_t *walk, ddt_entry_t *dde) ++{ ++ ASSERT(ddt_object_exists(ddt, type, class)); ++ ++ return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, ++ ddt->ddt_object[type][class], dde, walk)); ++} ++ ++int ++ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ++ uint64_t *count) ++{ ++ ASSERT(ddt_object_exists(ddt, type, class)); ++ ++ return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, ++ ddt->ddt_object[type][class], count)); ++} ++ ++int ++ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ++ dmu_object_info_t *doi) ++{ ++ if (!ddt_object_exists(ddt, type, class)) ++ return (ENOENT); ++ ++ return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], ++ doi)); ++} ++ ++boolean_t ++ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) ++{ ++ return (!!ddt->ddt_object[type][class]); ++} ++ ++void ++ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ++ char *name) ++{ ++ (void) sprintf(name, DMU_POOL_DDT, ++ zio_checksum_table[ddt->ddt_checksum].ci_name, ++ ddt_ops[type]->ddt_op_name, ddt_class_name[class]); ++} ++ ++void ++ddt_bp_fill(const ddt_phys_t *ddp, 
blkptr_t *bp, uint64_t txg) ++{ ++ int d; ++ ASSERT(txg != 0); ++ ++ for (d = 0; d < SPA_DVAS_PER_BP; d++) ++ bp->blk_dva[d] = ddp->ddp_dva[d]; ++ BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); ++} ++ ++void ++ddt_bp_create(enum zio_checksum checksum, ++ const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) ++{ ++ BP_ZERO(bp); ++ ++ if (ddp != NULL) ++ ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); ++ ++ bp->blk_cksum = ddk->ddk_cksum; ++ bp->blk_fill = 1; ++ ++ BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); ++ BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); ++ BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); ++ BP_SET_CHECKSUM(bp, checksum); ++ BP_SET_TYPE(bp, DMU_OT_DEDUP); ++ BP_SET_LEVEL(bp, 0); ++ BP_SET_DEDUP(bp, 0); ++ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); ++} ++ ++void ++ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) ++{ ++ ddk->ddk_cksum = bp->blk_cksum; ++ ddk->ddk_prop = 0; ++ ++ DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); ++ DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); ++ DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); ++} ++ ++void ++ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) ++{ ++ int d; ++ ASSERT(ddp->ddp_phys_birth == 0); ++ ++ for (d = 0; d < SPA_DVAS_PER_BP; d++) ++ ddp->ddp_dva[d] = bp->blk_dva[d]; ++ ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); ++} ++ ++void ++ddt_phys_clear(ddt_phys_t *ddp) ++{ ++ bzero(ddp, sizeof (*ddp)); ++} ++ ++void ++ddt_phys_addref(ddt_phys_t *ddp) ++{ ++ ddp->ddp_refcnt++; ++} ++ ++void ++ddt_phys_decref(ddt_phys_t *ddp) ++{ ++ ASSERT((int64_t)ddp->ddp_refcnt > 0); ++ ddp->ddp_refcnt--; ++} ++ ++void ++ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) ++{ ++ blkptr_t blk; ++ ++ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); ++ ddt_phys_clear(ddp); ++ zio_free(ddt->ddt_spa, txg, &blk); ++} ++ ++ddt_phys_t * ++ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) ++{ ++ ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; ++ int p; ++ ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ++ if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && ++ BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) ++ return (ddp); ++ } ++ return (NULL); ++} ++ ++uint64_t ++ddt_phys_total_refcnt(const ddt_entry_t *dde) ++{ ++ uint64_t refcnt = 0; ++ int p; ++ ++ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) ++ refcnt += dde->dde_phys[p].ddp_refcnt; ++ ++ return (refcnt); ++} ++ ++static void ++ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) ++{ ++ spa_t *spa = ddt->ddt_spa; ++ ddt_phys_t *ddp = dde->dde_phys; ++ ddt_key_t *ddk = &dde->dde_key; ++ uint64_t lsize = DDK_GET_LSIZE(ddk); ++ uint64_t psize = DDK_GET_PSIZE(ddk); ++ int p, d; ++ ++ bzero(dds, sizeof (*dds)); ++ ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ++ uint64_t dsize = 0; ++ uint64_t refcnt = ddp->ddp_refcnt; ++ ++ if (ddp->ddp_phys_birth == 0) ++ continue; ++ ++ for (d = 0; d < SPA_DVAS_PER_BP; d++) ++ dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); ++ ++ dds->dds_blocks += 1; ++ dds->dds_lsize += lsize; ++ dds->dds_psize += psize; ++ dds->dds_dsize += dsize; ++ ++ dds->dds_ref_blocks += refcnt; ++ dds->dds_ref_lsize += lsize * refcnt; ++ dds->dds_ref_psize += psize * refcnt; ++ dds->dds_ref_dsize += dsize * refcnt; ++ } ++} ++ ++void ++ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) ++{ ++ const uint64_t *s = (const uint64_t *)src; ++ uint64_t *d = (uint64_t *)dst; ++ uint64_t *d_end = (uint64_t *)(dst + 1); ++ ++ ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ ++ ++ while (d < d_end) ++ *d++ += (*s++ ^ neg) - neg; ++} ++ ++static void 
++ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) ++{ ++ ddt_stat_t dds; ++ ddt_histogram_t *ddh; ++ int bucket; ++ ++ ddt_stat_generate(ddt, dde, &dds); ++ ++ bucket = highbit(dds.dds_ref_blocks) - 1; ++ ASSERT(bucket >= 0); ++ ++ ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; ++ ++ ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); ++} ++ ++void ++ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) ++{ ++ int h; ++ ++ for (h = 0; h < 64; h++) ++ ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); ++} ++ ++void ++ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) ++{ ++ int h; ++ ++ bzero(dds, sizeof (*dds)); ++ ++ for (h = 0; h < 64; h++) ++ ddt_stat_add(dds, &ddh->ddh_stat[h], 0); ++} ++ ++boolean_t ++ddt_histogram_empty(const ddt_histogram_t *ddh) ++{ ++ const uint64_t *s = (const uint64_t *)ddh; ++ const uint64_t *s_end = (const uint64_t *)(ddh + 1); ++ ++ while (s < s_end) ++ if (*s++ != 0) ++ return (B_FALSE); ++ ++ return (B_TRUE); ++} ++ ++void ++ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) ++{ ++ enum zio_checksum c; ++ enum ddt_type type; ++ enum ddt_class class; ++ ++ /* Sum the statistics we cached in ddt_object_sync(). */ ++ for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ++ ddt_t *ddt = spa->spa_ddt[c]; ++ for (type = 0; type < DDT_TYPES; type++) { ++ for (class = 0; class < DDT_CLASSES; ++ class++) { ++ ddt_object_t *ddo = ++ &ddt->ddt_object_stats[type][class]; ++ ddo_total->ddo_count += ddo->ddo_count; ++ ddo_total->ddo_dspace += ddo->ddo_dspace; ++ ddo_total->ddo_mspace += ddo->ddo_mspace; ++ } ++ } ++ } ++ ++ /* ... and compute the averages. */ ++ if (ddo_total->ddo_count != 0) { ++ ddo_total->ddo_dspace /= ddo_total->ddo_count; ++ ddo_total->ddo_mspace /= ddo_total->ddo_count; ++ } ++} ++ ++void ++ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) ++{ ++ enum zio_checksum c; ++ enum ddt_type type; ++ enum ddt_class class; ++ ++ for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ++ ddt_t *ddt = spa->spa_ddt[c]; ++ for (type = 0; type < DDT_TYPES; type++) { ++ for (class = 0; class < DDT_CLASSES; ++ class++) { ++ ddt_histogram_add(ddh, ++ &ddt->ddt_histogram_cache[type][class]); ++ } ++ } ++ } ++} ++ ++void ++ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) ++{ ++ ddt_histogram_t *ddh_total; ++ ++ /* XXX: Move to a slab */ ++ ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_PUSHPAGE); ++ ddt_get_dedup_histogram(spa, ddh_total); ++ ddt_histogram_stat(dds_total, ddh_total); ++ kmem_free(ddh_total, sizeof (ddt_histogram_t)); ++} ++ ++uint64_t ++ddt_get_dedup_dspace(spa_t *spa) ++{ ++ ddt_stat_t dds_total = { 0 }; ++ ++ ddt_get_dedup_stats(spa, &dds_total); ++ return (dds_total.dds_ref_dsize - dds_total.dds_dsize); ++} ++ ++uint64_t ++ddt_get_pool_dedup_ratio(spa_t *spa) ++{ ++ ddt_stat_t dds_total = { 0 }; ++ ++ ddt_get_dedup_stats(spa, &dds_total); ++ if (dds_total.dds_dsize == 0) ++ return (100); ++ ++ return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); ++} ++ ++int ++ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) ++{ ++ spa_t *spa = ddt->ddt_spa; ++ uint64_t total_refcnt = 0; ++ uint64_t ditto = spa->spa_dedup_ditto; ++ int total_copies = 0; ++ int desired_copies = 0; ++ int p; ++ ++ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ++ ddt_phys_t *ddp = &dde->dde_phys[p]; ++ zio_t *zio = dde->dde_lead_zio[p]; ++ uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ ++ if (zio != NULL) ++ refcnt += zio->io_parent_count; /* pending refs */ ++ 
if (ddp == ddp_willref) ++ refcnt++; /* caller's ref */ ++ if (refcnt != 0) { ++ total_refcnt += refcnt; ++ total_copies += p; ++ } ++ } ++ ++ if (ditto == 0 || ditto > UINT32_MAX) ++ ditto = UINT32_MAX; ++ ++ if (total_refcnt >= 1) ++ desired_copies++; ++ if (total_refcnt >= ditto) ++ desired_copies++; ++ if (total_refcnt >= ditto * ditto) ++ desired_copies++; ++ ++ return (MAX(desired_copies, total_copies) - total_copies); ++} ++ ++int ++ddt_ditto_copies_present(ddt_entry_t *dde) ++{ ++ ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; ++ dva_t *dva = ddp->ddp_dva; ++ int copies = 0 - DVA_GET_GANG(dva); ++ int d; ++ ++ for (d = 0; d < SPA_DVAS_PER_BP; d++, dva++) ++ if (DVA_IS_VALID(dva)) ++ copies++; ++ ++ ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); ++ ++ return (copies); ++} ++ ++size_t ++ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) ++{ ++ uchar_t *version = dst++; ++ int cpfunc = ZIO_COMPRESS_ZLE; ++ zio_compress_info_t *ci = &zio_compress_table[cpfunc]; ++ size_t c_len; ++ ++ ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ ++ ++ c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); ++ ++ if (c_len == s_len) { ++ cpfunc = ZIO_COMPRESS_OFF; ++ bcopy(src, dst, s_len); ++ } ++ ++ *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; ++ ++ return (c_len + 1); ++} ++ ++void ++ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) ++{ ++ uchar_t version = *src++; ++ int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; ++ zio_compress_info_t *ci = &zio_compress_table[cpfunc]; ++ ++ if (ci->ci_decompress != NULL) ++ (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); ++ else ++ bcopy(src, dst, d_len); ++ ++ if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) ++ byteswap_uint64_array(dst, d_len); ++} ++ ++ddt_t * ++ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) ++{ ++ return (spa->spa_ddt[c]); ++} ++ ++ddt_t * ++ddt_select(spa_t *spa, const blkptr_t *bp) ++{ ++ return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); ++} ++ ++void ++ddt_enter(ddt_t *ddt) ++{ ++ mutex_enter(&ddt->ddt_lock); ++} ++ ++void ++ddt_exit(ddt_t *ddt) ++{ ++ mutex_exit(&ddt->ddt_lock); ++} ++ ++static ddt_entry_t * ++ddt_alloc(const ddt_key_t *ddk) ++{ ++ ddt_entry_t *dde; ++ ++ /* XXX: Move to a slab */ ++ dde = kmem_zalloc(sizeof (ddt_entry_t), KM_PUSHPAGE); ++ cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); ++ ++ dde->dde_key = *ddk; ++ ++ return (dde); ++} ++ ++static void ++ddt_free(ddt_entry_t *dde) ++{ ++ int p; ++ ++ ASSERT(!dde->dde_loading); ++ ++ for (p = 0; p < DDT_PHYS_TYPES; p++) ++ ASSERT(dde->dde_lead_zio[p] == NULL); ++ ++ if (dde->dde_repair_data != NULL) ++ zio_buf_free(dde->dde_repair_data, ++ DDK_GET_PSIZE(&dde->dde_key)); ++ ++ cv_destroy(&dde->dde_cv); ++ kmem_free(dde, sizeof (*dde)); ++} ++ ++void ++ddt_remove(ddt_t *ddt, ddt_entry_t *dde) ++{ ++ ASSERT(MUTEX_HELD(&ddt->ddt_lock)); ++ ++ avl_remove(&ddt->ddt_tree, dde); ++ ddt_free(dde); ++} ++ ++ddt_entry_t * ++ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) ++{ ++ ddt_entry_t *dde, dde_search; ++ enum ddt_type type; ++ enum ddt_class class; ++ avl_index_t where; ++ int error; ++ ++ ASSERT(MUTEX_HELD(&ddt->ddt_lock)); ++ ++ ddt_key_fill(&dde_search.dde_key, bp); ++ ++ dde = avl_find(&ddt->ddt_tree, &dde_search, &where); ++ if (dde == NULL) { ++ if (!add) ++ return (NULL); ++ dde = ddt_alloc(&dde_search.dde_key); ++ avl_insert(&ddt->ddt_tree, dde, where); ++ } ++ ++ while (dde->dde_loading) ++ cv_wait(&dde->dde_cv, 
&ddt->ddt_lock); ++ ++ if (dde->dde_loaded) ++ return (dde); ++ ++ dde->dde_loading = B_TRUE; ++ ++ ddt_exit(ddt); ++ ++ error = ENOENT; ++ ++ for (type = 0; type < DDT_TYPES; type++) { ++ for (class = 0; class < DDT_CLASSES; class++) { ++ error = ddt_object_lookup(ddt, type, class, dde); ++ if (error != ENOENT) ++ break; ++ } ++ if (error != ENOENT) ++ break; ++ } ++ ++ ASSERT(error == 0 || error == ENOENT); ++ ++ ddt_enter(ddt); ++ ++ ASSERT(dde->dde_loaded == B_FALSE); ++ ASSERT(dde->dde_loading == B_TRUE); ++ ++ dde->dde_type = type; /* will be DDT_TYPES if no entry found */ ++ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ ++ dde->dde_loaded = B_TRUE; ++ dde->dde_loading = B_FALSE; ++ ++ if (error == 0) ++ ddt_stat_update(ddt, dde, -1ULL); ++ ++ cv_broadcast(&dde->dde_cv); ++ ++ return (dde); ++} ++ ++void ++ddt_prefetch(spa_t *spa, const blkptr_t *bp) ++{ ++ ddt_t *ddt; ++ ddt_entry_t dde; ++ enum ddt_type type; ++ enum ddt_class class; ++ ++ if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) ++ return; ++ ++ /* ++ * We only remove the DDT once all tables are empty and only ++ * prefetch dedup blocks when there are entries in the DDT. ++ * Thus no locking is required as the DDT can't disappear on us. ++ */ ++ ddt = ddt_select(spa, bp); ++ ddt_key_fill(&dde.dde_key, bp); ++ ++ for (type = 0; type < DDT_TYPES; type++) { ++ for (class = 0; class < DDT_CLASSES; class++) { ++ ddt_object_prefetch(ddt, type, class, &dde); ++ } ++ } ++} ++ ++int ++ddt_entry_compare(const void *x1, const void *x2) ++{ ++ const ddt_entry_t *dde1 = x1; ++ const ddt_entry_t *dde2 = x2; ++ const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; ++ const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; ++ int i; ++ ++ for (i = 0; i < DDT_KEY_WORDS; i++) { ++ if (u1[i] < u2[i]) ++ return (-1); ++ if (u1[i] > u2[i]) ++ return (1); ++ } ++ ++ return (0); ++} ++ ++static ddt_t * ++ddt_table_alloc(spa_t *spa, enum zio_checksum c) ++{ ++ ddt_t *ddt; ++ ++ /* XXX: Move to a slab */ ++ ddt = kmem_zalloc(sizeof (*ddt), KM_PUSHPAGE | KM_NODEBUG); ++ ++ mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); ++ avl_create(&ddt->ddt_tree, ddt_entry_compare, ++ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); ++ avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, ++ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); ++ ddt->ddt_checksum = c; ++ ddt->ddt_spa = spa; ++ ddt->ddt_os = spa->spa_meta_objset; ++ ++ return (ddt); ++} ++ ++static void ++ddt_table_free(ddt_t *ddt) ++{ ++ ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); ++ ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); ++ avl_destroy(&ddt->ddt_tree); ++ avl_destroy(&ddt->ddt_repair_tree); ++ mutex_destroy(&ddt->ddt_lock); ++ kmem_free(ddt, sizeof (*ddt)); ++} ++ ++void ++ddt_create(spa_t *spa) ++{ ++ enum zio_checksum c; ++ ++ spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; ++ ++ for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) ++ spa->spa_ddt[c] = ddt_table_alloc(spa, c); ++} ++ ++int ++ddt_load(spa_t *spa) ++{ ++ enum zio_checksum c; ++ enum ddt_type type; ++ enum ddt_class class; ++ int error; ++ ++ ddt_create(spa); ++ ++ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, ++ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, ++ &spa->spa_ddt_stat_object); ++ ++ if (error) ++ return (error == ENOENT ? 
0 : error); ++ ++ for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ++ ddt_t *ddt = spa->spa_ddt[c]; ++ for (type = 0; type < DDT_TYPES; type++) { ++ for (class = 0; class < DDT_CLASSES; ++ class++) { ++ error = ddt_object_load(ddt, type, class); ++ if (error != 0 && error != ENOENT) ++ return (error); ++ } ++ } ++ ++ /* ++ * Seed the cached histograms. ++ */ ++ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, ++ sizeof (ddt->ddt_histogram)); ++ } ++ ++ return (0); ++} ++ ++void ++ddt_unload(spa_t *spa) ++{ ++ enum zio_checksum c; ++ ++ for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ++ if (spa->spa_ddt[c]) { ++ ddt_table_free(spa->spa_ddt[c]); ++ spa->spa_ddt[c] = NULL; ++ } ++ } ++} ++ ++boolean_t ++ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) ++{ ++ ddt_t *ddt; ++ ddt_entry_t *dde; ++ enum ddt_type type; ++ enum ddt_class class; ++ ++ if (!BP_GET_DEDUP(bp)) ++ return (B_FALSE); ++ ++ if (max_class == DDT_CLASS_UNIQUE) ++ return (B_TRUE); ++ ++ ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; ++ dde = kmem_alloc(sizeof(ddt_entry_t), KM_PUSHPAGE); ++ ++ ddt_key_fill(&(dde->dde_key), bp); ++ ++ for (type = 0; type < DDT_TYPES; type++) { ++ for (class = 0; class <= max_class; class++) { ++ if (ddt_object_lookup(ddt, type, class, dde) == 0) { ++ kmem_free(dde, sizeof(ddt_entry_t)); ++ return (B_TRUE); ++ } ++ } ++ } ++ ++ kmem_free(dde, sizeof(ddt_entry_t)); ++ return (B_FALSE); ++} ++ ++ddt_entry_t * ++ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) ++{ ++ ddt_key_t ddk; ++ ddt_entry_t *dde; ++ enum ddt_type type; ++ enum ddt_class class; ++ ++ ddt_key_fill(&ddk, bp); ++ ++ dde = ddt_alloc(&ddk); ++ ++ for (type = 0; type < DDT_TYPES; type++) { ++ for (class = 0; class < DDT_CLASSES; class++) { ++ /* ++ * We can only do repair if there are multiple copies ++ * of the block. For anything in the UNIQUE class, ++ * there's definitely only one copy, so don't even try. 
++ */ ++ if (class != DDT_CLASS_UNIQUE && ++ ddt_object_lookup(ddt, type, class, dde) == 0) ++ return (dde); ++ } ++ } ++ ++ bzero(dde->dde_phys, sizeof (dde->dde_phys)); ++ ++ return (dde); ++} ++ ++void ++ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) ++{ ++ avl_index_t where; ++ ++ ddt_enter(ddt); ++ ++ if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && ++ avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) ++ avl_insert(&ddt->ddt_repair_tree, dde, where); ++ else ++ ddt_free(dde); ++ ++ ddt_exit(ddt); ++} ++ ++static void ++ddt_repair_entry_done(zio_t *zio) ++{ ++ ddt_entry_t *rdde = zio->io_private; ++ ++ ddt_free(rdde); ++} ++ ++static void ++ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) ++{ ++ ddt_phys_t *ddp = dde->dde_phys; ++ ddt_phys_t *rddp = rdde->dde_phys; ++ ddt_key_t *ddk = &dde->dde_key; ++ ddt_key_t *rddk = &rdde->dde_key; ++ zio_t *zio; ++ blkptr_t blk; ++ int p; ++ ++ zio = zio_null(rio, rio->io_spa, NULL, ++ ddt_repair_entry_done, rdde, rio->io_flags); ++ ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { ++ if (ddp->ddp_phys_birth == 0 || ++ ddp->ddp_phys_birth != rddp->ddp_phys_birth || ++ bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) ++ continue; ++ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); ++ zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, ++ rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, ++ ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); ++ } ++ ++ zio_nowait(zio); ++} ++ ++static void ++ddt_repair_table(ddt_t *ddt, zio_t *rio) ++{ ++ spa_t *spa = ddt->ddt_spa; ++ ddt_entry_t *dde, *rdde_next, *rdde; ++ avl_tree_t *t = &ddt->ddt_repair_tree; ++ blkptr_t blk; ++ ++ if (spa_sync_pass(spa) > 1) ++ return; ++ ++ ddt_enter(ddt); ++ for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { ++ rdde_next = AVL_NEXT(t, rdde); ++ avl_remove(&ddt->ddt_repair_tree, rdde); ++ ddt_exit(ddt); ++ ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); ++ dde = ddt_repair_start(ddt, &blk); ++ ddt_repair_entry(ddt, dde, rdde, rio); ++ ddt_repair_done(ddt, dde); ++ ddt_enter(ddt); ++ } ++ ddt_exit(ddt); ++} ++ ++static void ++ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ++{ ++ dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; ++ ddt_phys_t *ddp = dde->dde_phys; ++ ddt_key_t *ddk = &dde->dde_key; ++ enum ddt_type otype = dde->dde_type; ++ enum ddt_type ntype = DDT_TYPE_CURRENT; ++ enum ddt_class oclass = dde->dde_class; ++ enum ddt_class nclass; ++ uint64_t total_refcnt = 0; ++ int p; ++ ++ ASSERT(dde->dde_loaded); ++ ASSERT(!dde->dde_loading); ++ ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ++ ASSERT(dde->dde_lead_zio[p] == NULL); ++ ASSERT((int64_t)ddp->ddp_refcnt >= 0); ++ if (ddp->ddp_phys_birth == 0) { ++ ASSERT(ddp->ddp_refcnt == 0); ++ continue; ++ } ++ if (p == DDT_PHYS_DITTO) { ++ if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) ++ ddt_phys_free(ddt, ddk, ddp, txg); ++ continue; ++ } ++ if (ddp->ddp_refcnt == 0) ++ ddt_phys_free(ddt, ddk, ddp, txg); ++ total_refcnt += ddp->ddp_refcnt; ++ } ++ ++ if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) ++ nclass = DDT_CLASS_DITTO; ++ else if (total_refcnt > 1) ++ nclass = DDT_CLASS_DUPLICATE; ++ else ++ nclass = DDT_CLASS_UNIQUE; ++ ++ if (otype != DDT_TYPES && ++ (otype != ntype || oclass != nclass || total_refcnt == 0)) { ++ VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); ++ ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); ++ } ++ ++ if (total_refcnt != 0) { ++ 
dde->dde_type = ntype; ++ dde->dde_class = nclass; ++ ddt_stat_update(ddt, dde, 0); ++ if (!ddt_object_exists(ddt, ntype, nclass)) ++ ddt_object_create(ddt, ntype, nclass, tx); ++ VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); ++ ++ /* ++ * If the class changes, the order that we scan this bp ++ * changes. If it decreases, we could miss it, so ++ * scan it right now. (This covers both class changing ++ * while we are doing ddt_walk(), and when we are ++ * traversing.) ++ */ ++ if (nclass < oclass) { ++ dsl_scan_ddt_entry(dp->dp_scan, ++ ddt->ddt_checksum, dde, tx); ++ } ++ } ++} ++ ++static void ++ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) ++{ ++ spa_t *spa = ddt->ddt_spa; ++ ddt_entry_t *dde; ++ void *cookie = NULL; ++ enum ddt_type type; ++ enum ddt_class class; ++ ++ if (avl_numnodes(&ddt->ddt_tree) == 0) ++ return; ++ ++ ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); ++ ++ if (spa->spa_ddt_stat_object == 0) { ++ spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, ++ DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); ++ VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, ++ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, ++ &spa->spa_ddt_stat_object, tx) == 0); ++ } ++ ++ while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { ++ ddt_sync_entry(ddt, dde, tx, txg); ++ ddt_free(dde); ++ } ++ ++ for (type = 0; type < DDT_TYPES; type++) { ++ uint64_t add, count = 0; ++ for (class = 0; class < DDT_CLASSES; class++) { ++ if (ddt_object_exists(ddt, type, class)) { ++ ddt_object_sync(ddt, type, class, tx); ++ VERIFY(ddt_object_count(ddt, type, class, ++ &add) == 0); ++ count += add; ++ } ++ } ++ for (class = 0; class < DDT_CLASSES; class++) { ++ if (count == 0 && ddt_object_exists(ddt, type, class)) ++ ddt_object_destroy(ddt, type, class, tx); ++ } ++ } ++ ++ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, ++ sizeof (ddt->ddt_histogram)); ++} ++ ++void ++ddt_sync(spa_t *spa, uint64_t txg) ++{ ++ dmu_tx_t *tx; ++ zio_t *rio = zio_root(spa, NULL, NULL, ++ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); ++ enum zio_checksum c; ++ ++ ASSERT(spa_syncing_txg(spa) == txg); ++ ++ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); ++ ++ for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ++ ddt_t *ddt = spa->spa_ddt[c]; ++ if (ddt == NULL) ++ continue; ++ ddt_sync_table(ddt, tx, txg); ++ ddt_repair_table(ddt, rio); ++ } ++ ++ (void) zio_wait(rio); ++ ++ dmu_tx_commit(tx); ++} ++ ++int ++ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) ++{ ++ do { ++ do { ++ do { ++ ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; ++ int error = ENOENT; ++ if (ddt_object_exists(ddt, ddb->ddb_type, ++ ddb->ddb_class)) { ++ error = ddt_object_walk(ddt, ++ ddb->ddb_type, ddb->ddb_class, ++ &ddb->ddb_cursor, dde); ++ } ++ dde->dde_type = ddb->ddb_type; ++ dde->dde_class = ddb->ddb_class; ++ if (error == 0) ++ return (0); ++ if (error != ENOENT) ++ return (error); ++ ddb->ddb_cursor = 0; ++ } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); ++ ddb->ddb_checksum = 0; ++ } while (++ddb->ddb_type < DDT_TYPES); ++ ddb->ddb_type = 0; ++ } while (++ddb->ddb_class < DDT_CLASSES); ++ ++ return (ENOENT); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++module_param(zfs_dedup_prefetch, int, 0644); ++MODULE_PARM_DESC(zfs_dedup_prefetch,"Enable prefetching dedup-ed blks"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/ddt_zap.c linux-3.2.33-go/fs/zfs/zfs/ddt_zap.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/ddt_zap.c 1970-01-01 01:00:00.000000000 +0100 ++++ 
linux-3.2.33-go/fs/zfs/zfs/ddt_zap.c 2012-11-16 23:25:34.348039346 +0100 +@@ -0,0 +1,157 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int ddt_zap_leaf_blockshift = 12; ++int ddt_zap_indirect_blockshift = 12; ++ ++static int ++ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) ++{ ++ zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY; ++ ++ if (prehash) ++ flags |= ZAP_FLAG_PRE_HASHED_KEY; ++ ++ *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, ++ ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, ++ DMU_OT_NONE, 0, tx); ++ ++ return (*objectp == 0 ? ENOTSUP : 0); ++} ++ ++static int ++ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) ++{ ++ return (zap_destroy(os, object, tx)); ++} ++ ++static int ++ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) ++{ ++ uchar_t *cbuf; ++ uint64_t one, csize; ++ int error; ++ ++ cbuf = kmem_alloc(sizeof (dde->dde_phys) + 1, KM_PUSHPAGE); ++ ++ error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, ++ DDT_KEY_WORDS, &one, &csize); ++ if (error) ++ goto out; ++ ++ ASSERT(one == 1); ++ ASSERT(csize <= (sizeof (dde->dde_phys) + 1)); ++ ++ error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, ++ DDT_KEY_WORDS, 1, csize, cbuf); ++ if (error) ++ goto out; ++ ++ ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); ++out: ++ kmem_free(cbuf, sizeof (dde->dde_phys) + 1); ++ ++ return (error); ++} ++ ++static void ++ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde) ++{ ++ (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key, ++ DDT_KEY_WORDS); ++} ++ ++static int ++ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) ++{ ++ uchar_t cbuf[sizeof (dde->dde_phys) + 1]; ++ uint64_t csize; ++ ++ csize = ddt_compress(dde->dde_phys, cbuf, ++ sizeof (dde->dde_phys), sizeof (cbuf)); ++ ++ return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, ++ DDT_KEY_WORDS, 1, csize, cbuf, tx)); ++} ++ ++static int ++ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) ++{ ++ return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key, ++ DDT_KEY_WORDS, tx)); ++} ++ ++static int ++ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ int error; ++ ++ zap_cursor_init_serialized(&zc, os, object, *walk); ++ if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { ++ uchar_t cbuf[sizeof (dde->dde_phys) + 1]; ++ uint64_t csize = za.za_num_integers; ++ 
ASSERT(za.za_integer_length == 1); ++ error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, ++ DDT_KEY_WORDS, 1, csize, cbuf); ++ ASSERT(error == 0); ++ if (error == 0) { ++ ddt_decompress(cbuf, dde->dde_phys, csize, ++ sizeof (dde->dde_phys)); ++ dde->dde_key = *(ddt_key_t *)za.za_name; ++ } ++ zap_cursor_advance(&zc); ++ *walk = zap_cursor_serialize(&zc); ++ } ++ zap_cursor_fini(&zc); ++ return (error); ++} ++ ++static int ++ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count) ++{ ++ return zap_count(os, object, count); ++} ++ ++const ddt_ops_t ddt_zap_ops = { ++ "zap", ++ ddt_zap_create, ++ ddt_zap_destroy, ++ ddt_zap_lookup, ++ ddt_zap_prefetch, ++ ddt_zap_update, ++ ddt_zap_remove, ++ ddt_zap_walk, ++ ddt_zap_count, ++}; +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dmu.c linux-3.2.33-go/fs/zfs/zfs/dmu.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dmu.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dmu.c 2012-11-16 23:25:34.353039289 +0100 +@@ -0,0 +1,1983 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef _KERNEL ++#include ++#include ++#endif ++ ++const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { ++ { byteswap_uint8_array, TRUE, "unallocated" }, ++ { zap_byteswap, TRUE, "object directory" }, ++ { byteswap_uint64_array, TRUE, "object array" }, ++ { byteswap_uint8_array, TRUE, "packed nvlist" }, ++ { byteswap_uint64_array, TRUE, "packed nvlist size" }, ++ { byteswap_uint64_array, TRUE, "bpobj" }, ++ { byteswap_uint64_array, TRUE, "bpobj header" }, ++ { byteswap_uint64_array, TRUE, "SPA space map header" }, ++ { byteswap_uint64_array, TRUE, "SPA space map" }, ++ { byteswap_uint64_array, TRUE, "ZIL intent log" }, ++ { dnode_buf_byteswap, TRUE, "DMU dnode" }, ++ { dmu_objset_byteswap, TRUE, "DMU objset" }, ++ { byteswap_uint64_array, TRUE, "DSL directory" }, ++ { zap_byteswap, TRUE, "DSL directory child map"}, ++ { zap_byteswap, TRUE, "DSL dataset snap map" }, ++ { zap_byteswap, TRUE, "DSL props" }, ++ { byteswap_uint64_array, TRUE, "DSL dataset" }, ++ { zfs_znode_byteswap, TRUE, "ZFS znode" }, ++ { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, ++ { byteswap_uint8_array, FALSE, "ZFS plain file" }, ++ { zap_byteswap, TRUE, "ZFS directory" }, ++ { zap_byteswap, TRUE, "ZFS master node" }, ++ { zap_byteswap, TRUE, "ZFS delete queue" }, ++ { byteswap_uint8_array, FALSE, "zvol object" }, ++ { zap_byteswap, TRUE, "zvol prop" }, ++ { byteswap_uint8_array, FALSE, "other uint8[]" }, ++ { byteswap_uint64_array, FALSE, "other uint64[]" }, ++ { zap_byteswap, TRUE, "other ZAP" }, ++ { zap_byteswap, TRUE, "persistent error log" }, ++ { byteswap_uint8_array, TRUE, "SPA history" }, ++ { byteswap_uint64_array, TRUE, "SPA history offsets" }, ++ { zap_byteswap, TRUE, "Pool properties" }, ++ { zap_byteswap, TRUE, "DSL permissions" }, ++ { zfs_acl_byteswap, TRUE, "ZFS ACL" }, ++ { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, ++ { byteswap_uint8_array, TRUE, "FUID table" }, ++ { byteswap_uint64_array, TRUE, "FUID table size" }, ++ { zap_byteswap, TRUE, "DSL dataset next clones"}, ++ { zap_byteswap, TRUE, "scan work queue" }, ++ { zap_byteswap, TRUE, "ZFS user/group used" }, ++ { zap_byteswap, TRUE, "ZFS user/group quota" }, ++ { zap_byteswap, TRUE, "snapshot refcount tags"}, ++ { zap_byteswap, TRUE, "DDT ZAP algorithm" }, ++ { zap_byteswap, TRUE, "DDT statistics" }, ++ { byteswap_uint8_array, TRUE, "System attributes" }, ++ { zap_byteswap, TRUE, "SA master node" }, ++ { zap_byteswap, TRUE, "SA attr registration" }, ++ { zap_byteswap, TRUE, "SA attr layouts" }, ++ { zap_byteswap, TRUE, "scan translations" }, ++ { byteswap_uint8_array, FALSE, "deduplicated block" }, ++ { zap_byteswap, TRUE, "DSL deadlist map" }, ++ { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" }, ++ { zap_byteswap, TRUE, "DSL dir clones" }, ++ { byteswap_uint64_array, TRUE, "bpobj subobj" }, ++}; ++ ++int ++dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, ++ void *tag, dmu_buf_t **dbp, int flags) ++{ ++ dnode_t *dn; ++ uint64_t blkid; ++ dmu_buf_impl_t *db; ++ int err; ++ int db_flags = DB_RF_CANFAIL; ++ ++ if (flags & DMU_READ_NO_PREFETCH) ++ db_flags |= DB_RF_NOPREFETCH; ++ ++ err = dnode_hold(os, object, FTAG, &dn); ++ if (err) ++ return (err); ++ blkid = dbuf_whichblock(dn, offset); ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ db = dbuf_hold(dn, blkid, tag); ++ rw_exit(&dn->dn_struct_rwlock); ++ if (db == 
NULL) { ++ err = EIO; ++ } else { ++ err = dbuf_read(db, NULL, db_flags); ++ if (err) { ++ dbuf_rele(db, tag); ++ db = NULL; ++ } ++ } ++ ++ dnode_rele(dn, FTAG); ++ *dbp = &db->db; /* NULL db plus first field offset is NULL */ ++ return (err); ++} ++ ++int ++dmu_bonus_max(void) ++{ ++ return (DN_MAX_BONUSLEN); ++} ++ ++int ++dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ++ dnode_t *dn; ++ int error; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ ++ if (dn->dn_bonus != db) { ++ error = EINVAL; ++ } else if (newsize < 0 || newsize > db_fake->db_size) { ++ error = EINVAL; ++ } else { ++ dnode_setbonuslen(dn, newsize, tx); ++ error = 0; ++ } ++ ++ DB_DNODE_EXIT(db); ++ return (error); ++} ++ ++int ++dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ++ dnode_t *dn; ++ int error; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ ++ if (type > DMU_OT_NUMTYPES) { ++ error = EINVAL; ++ } else if (dn->dn_bonus != db) { ++ error = EINVAL; ++ } else { ++ dnode_setbonus_type(dn, type, tx); ++ error = 0; ++ } ++ ++ DB_DNODE_EXIT(db); ++ return (error); ++} ++ ++dmu_object_type_t ++dmu_get_bonustype(dmu_buf_t *db_fake) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ++ dnode_t *dn; ++ dmu_object_type_t type; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ type = dn->dn_bonustype; ++ DB_DNODE_EXIT(db); ++ ++ return (type); ++} ++ ++int ++dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) ++{ ++ dnode_t *dn; ++ int error; ++ ++ error = dnode_hold(os, object, FTAG, &dn); ++ dbuf_rm_spill(dn, tx); ++ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ++ dnode_rm_spill(dn, tx); ++ rw_exit(&dn->dn_struct_rwlock); ++ dnode_rele(dn, FTAG); ++ return (error); ++} ++ ++/* ++ * returns ENOENT, EIO, or 0. ++ */ ++int ++dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) ++{ ++ dnode_t *dn; ++ dmu_buf_impl_t *db; ++ int error; ++ ++ error = dnode_hold(os, object, FTAG, &dn); ++ if (error) ++ return (error); ++ ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ if (dn->dn_bonus == NULL) { ++ rw_exit(&dn->dn_struct_rwlock); ++ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ++ if (dn->dn_bonus == NULL) ++ dbuf_create_bonus(dn); ++ } ++ db = dn->dn_bonus; ++ ++ /* as long as the bonus buf is held, the dnode will be held */ ++ if (refcount_add(&db->db_holds, tag) == 1) { ++ VERIFY(dnode_add_ref(dn, db)); ++ (void) atomic_inc_32_nv(&dn->dn_dbufs_count); ++ } ++ ++ /* ++ * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's ++ * hold and incrementing the dbuf count to ensure that dnode_move() sees ++ * a dnode hold for every dbuf. ++ */ ++ rw_exit(&dn->dn_struct_rwlock); ++ ++ dnode_rele(dn, FTAG); ++ ++ VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); ++ ++ *dbp = &db->db; ++ return (0); ++} ++ ++/* ++ * returns ENOENT, EIO, or 0. ++ * ++ * This interface will allocate a blank spill dbuf when a spill blk ++ * doesn't already exist on the dnode. ++ * ++ * if you only want to find an already existing spill db, then ++ * dmu_spill_hold_existing() should be used. 
++ */ ++int ++dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) ++{ ++ dmu_buf_impl_t *db = NULL; ++ int err; ++ ++ if ((flags & DB_RF_HAVESTRUCT) == 0) ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ ++ db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); ++ ++ if ((flags & DB_RF_HAVESTRUCT) == 0) ++ rw_exit(&dn->dn_struct_rwlock); ++ ++ ASSERT(db != NULL); ++ err = dbuf_read(db, NULL, flags); ++ if (err == 0) ++ *dbp = &db->db; ++ else ++ dbuf_rele(db, tag); ++ return (err); ++} ++ ++int ++dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; ++ dnode_t *dn; ++ int err; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ ++ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { ++ err = EINVAL; ++ } else { ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ ++ if (!dn->dn_have_spill) { ++ err = ENOENT; ++ } else { ++ err = dmu_spill_hold_by_dnode(dn, ++ DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); ++ } ++ ++ rw_exit(&dn->dn_struct_rwlock); ++ } ++ ++ DB_DNODE_EXIT(db); ++ return (err); ++} ++ ++int ++dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; ++ dnode_t *dn; ++ int err; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); ++ DB_DNODE_EXIT(db); ++ ++ return (err); ++} ++ ++/* ++ * Note: longer-term, we should modify all of the dmu_buf_*() interfaces ++ * to take a held dnode rather than -- the lookup is wasteful, ++ * and can induce severe lock contention when writing to several files ++ * whose dnodes are in the same block. ++ */ ++static int ++dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, ++ int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) ++{ ++ dsl_pool_t *dp = NULL; ++ dmu_buf_t **dbp; ++ uint64_t blkid, nblks, i; ++ uint32_t dbuf_flags; ++ int err; ++ zio_t *zio; ++ hrtime_t start = 0; ++ ++ ASSERT(length <= DMU_MAX_ACCESS); ++ ++ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; ++ if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) ++ dbuf_flags |= DB_RF_NOPREFETCH; ++ ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ if (dn->dn_datablkshift) { ++ int blkshift = dn->dn_datablkshift; ++ nblks = (P2ROUNDUP(offset+length, 1ULL<> blkshift; ++ } else { ++ if (offset + length > dn->dn_datablksz) { ++ zfs_panic_recover("zfs: accessing past end of object " ++ "%llx/%llx (size=%u access=%llu+%llu)", ++ (longlong_t)dn->dn_objset-> ++ os_dsl_dataset->ds_object, ++ (longlong_t)dn->dn_object, dn->dn_datablksz, ++ (longlong_t)offset, (longlong_t)length); ++ rw_exit(&dn->dn_struct_rwlock); ++ return (EIO); ++ } ++ nblks = 1; ++ } ++ dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_PUSHPAGE | KM_NODEBUG); ++ ++ if (dn->dn_objset->os_dsl_dataset) ++ dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; ++ if (dp && dsl_pool_sync_context(dp)) ++ start = gethrtime(); ++ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); ++ blkid = dbuf_whichblock(dn, offset); ++ for (i = 0; i < nblks; i++) { ++ dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); ++ if (db == NULL) { ++ rw_exit(&dn->dn_struct_rwlock); ++ dmu_buf_rele_array(dbp, nblks, tag); ++ zio_nowait(zio); ++ return (EIO); ++ } ++ /* initiate async i/o */ ++ if (read) { ++ (void) dbuf_read(db, zio, dbuf_flags); ++ } ++ dbp[i] = &db->db; ++ } ++ rw_exit(&dn->dn_struct_rwlock); ++ ++ /* wait for async i/o */ ++ err = 
zio_wait(zio); ++ /* track read overhead when we are in sync context */ ++ if (dp && dsl_pool_sync_context(dp)) ++ dp->dp_read_overhead += gethrtime() - start; ++ if (err) { ++ dmu_buf_rele_array(dbp, nblks, tag); ++ return (err); ++ } ++ ++ /* wait for other io to complete */ ++ if (read) { ++ for (i = 0; i < nblks; i++) { ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; ++ mutex_enter(&db->db_mtx); ++ while (db->db_state == DB_READ || ++ db->db_state == DB_FILL) ++ cv_wait(&db->db_changed, &db->db_mtx); ++ if (db->db_state == DB_UNCACHED) ++ err = EIO; ++ mutex_exit(&db->db_mtx); ++ if (err) { ++ dmu_buf_rele_array(dbp, nblks, tag); ++ return (err); ++ } ++ } ++ } ++ ++ *numbufsp = nblks; ++ *dbpp = dbp; ++ return (0); ++} ++ ++static int ++dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, ++ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) ++{ ++ dnode_t *dn; ++ int err; ++ ++ err = dnode_hold(os, object, FTAG, &dn); ++ if (err) ++ return (err); ++ ++ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, ++ numbufsp, dbpp, DMU_READ_PREFETCH); ++ ++ dnode_rele(dn, FTAG); ++ ++ return (err); ++} ++ ++int ++dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, ++ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ++ dnode_t *dn; ++ int err; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, ++ numbufsp, dbpp, DMU_READ_PREFETCH); ++ DB_DNODE_EXIT(db); ++ ++ return (err); ++} ++ ++void ++dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) ++{ ++ int i; ++ dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; ++ ++ if (numbufs == 0) ++ return; ++ ++ for (i = 0; i < numbufs; i++) { ++ if (dbp[i]) ++ dbuf_rele(dbp[i], tag); ++ } ++ ++ kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); ++} ++ ++void ++dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) ++{ ++ dnode_t *dn; ++ uint64_t blkid; ++ int nblks, i, err; ++ ++ if (zfs_prefetch_disable) ++ return; ++ ++ if (len == 0) { /* they're interested in the bonus buffer */ ++ dn = DMU_META_DNODE(os); ++ ++ if (object == 0 || object >= DN_MAX_OBJECT) ++ return; ++ ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); ++ dbuf_prefetch(dn, blkid); ++ rw_exit(&dn->dn_struct_rwlock); ++ return; ++ } ++ ++ /* ++ * XXX - Note, if the dnode for the requested object is not ++ * already cached, we will do a *synchronous* read in the ++ * dnode_hold() call. The same is true for any indirects. ++ */ ++ err = dnode_hold(os, object, FTAG, &dn); ++ if (err != 0) ++ return; ++ ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ if (dn->dn_datablkshift) { ++ int blkshift = dn->dn_datablkshift; ++ nblks = (P2ROUNDUP(offset+len, 1<> blkshift; ++ } else { ++ nblks = (offset < dn->dn_datablksz); ++ } ++ ++ if (nblks != 0) { ++ blkid = dbuf_whichblock(dn, offset); ++ for (i = 0; i < nblks; i++) ++ dbuf_prefetch(dn, blkid+i); ++ } ++ ++ rw_exit(&dn->dn_struct_rwlock); ++ ++ dnode_rele(dn, FTAG); ++} ++ ++/* ++ * Get the next "chunk" of file data to free. We traverse the file from ++ * the end so that the file gets shorter over time (if we crashes in the ++ * middle, this will leave us in a better state). We find allocated file ++ * data by simply searching the allocated level 1 indirects. 
++ */ ++static int ++get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) ++{ ++ uint64_t len = *start - limit; ++ uint64_t blkcnt = 0; ++ uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1)); ++ uint64_t iblkrange = ++ dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ++ ++ ASSERT(limit <= *start); ++ ++ if (len <= iblkrange * maxblks) { ++ *start = limit; ++ return (0); ++ } ++ ASSERT(ISP2(iblkrange)); ++ ++ while (*start > limit && blkcnt < maxblks) { ++ int err; ++ ++ /* find next allocated L1 indirect */ ++ err = dnode_next_offset(dn, ++ DNODE_FIND_BACKWARDS, start, 2, 1, 0); ++ ++ /* if there are no more, then we are done */ ++ if (err == ESRCH) { ++ *start = limit; ++ return (0); ++ } else if (err) { ++ return (err); ++ } ++ blkcnt += 1; ++ ++ /* reset offset to end of "next" block back */ ++ *start = P2ALIGN(*start, iblkrange); ++ if (*start <= limit) ++ *start = limit; ++ else ++ *start -= 1; ++ } ++ return (0); ++} ++ ++static int ++dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, ++ uint64_t length, boolean_t free_dnode) ++{ ++ dmu_tx_t *tx; ++ uint64_t object_size, start, end, len; ++ boolean_t trunc = (length == DMU_OBJECT_END); ++ int align, err; ++ ++ align = 1 << dn->dn_datablkshift; ++ ASSERT(align > 0); ++ object_size = align == 1 ? dn->dn_datablksz : ++ (dn->dn_maxblkid + 1) << dn->dn_datablkshift; ++ ++ end = offset + length; ++ if (trunc || end > object_size) ++ end = object_size; ++ if (end <= offset) ++ return (0); ++ length = end - offset; ++ ++ while (length) { ++ start = end; ++ /* assert(offset <= start) */ ++ err = get_next_chunk(dn, &start, offset); ++ if (err) ++ return (err); ++ len = trunc ? DMU_OBJECT_END : end - start; ++ ++ tx = dmu_tx_create(os); ++ dmu_tx_hold_free(tx, dn->dn_object, start, len); ++ err = dmu_tx_assign(tx, TXG_WAIT); ++ if (err) { ++ dmu_tx_abort(tx); ++ return (err); ++ } ++ ++ dnode_free_range(dn, start, trunc ? 
-1 : len, tx); ++ ++ if (start == 0 && free_dnode) { ++ ASSERT(trunc); ++ dnode_free(dn, tx); ++ } ++ ++ length -= end - start; ++ ++ dmu_tx_commit(tx); ++ end = start; ++ } ++ return (0); ++} ++ ++int ++dmu_free_long_range(objset_t *os, uint64_t object, ++ uint64_t offset, uint64_t length) ++{ ++ dnode_t *dn; ++ int err; ++ ++ err = dnode_hold(os, object, FTAG, &dn); ++ if (err != 0) ++ return (err); ++ err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); ++ dnode_rele(dn, FTAG); ++ return (err); ++} ++ ++int ++dmu_free_object(objset_t *os, uint64_t object) ++{ ++ dnode_t *dn; ++ dmu_tx_t *tx; ++ int err; ++ ++ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, ++ FTAG, &dn); ++ if (err != 0) ++ return (err); ++ if (dn->dn_nlevels == 1) { ++ tx = dmu_tx_create(os); ++ dmu_tx_hold_bonus(tx, object); ++ dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); ++ err = dmu_tx_assign(tx, TXG_WAIT); ++ if (err == 0) { ++ dnode_free_range(dn, 0, DMU_OBJECT_END, tx); ++ dnode_free(dn, tx); ++ dmu_tx_commit(tx); ++ } else { ++ dmu_tx_abort(tx); ++ } ++ } else { ++ err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); ++ } ++ dnode_rele(dn, FTAG); ++ return (err); ++} ++ ++int ++dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, ++ uint64_t size, dmu_tx_t *tx) ++{ ++ dnode_t *dn; ++ int err = dnode_hold(os, object, FTAG, &dn); ++ if (err) ++ return (err); ++ ASSERT(offset < UINT64_MAX); ++ ASSERT(size == -1ULL || size <= UINT64_MAX - offset); ++ dnode_free_range(dn, offset, size, tx); ++ dnode_rele(dn, FTAG); ++ return (0); ++} ++ ++int ++dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, ++ void *buf, uint32_t flags) ++{ ++ dnode_t *dn; ++ dmu_buf_t **dbp; ++ int numbufs, err; ++ ++ err = dnode_hold(os, object, FTAG, &dn); ++ if (err) ++ return (err); ++ ++ /* ++ * Deal with odd block sizes, where there can't be data past the first ++ * block. If we ever do the tail block optimization, we will need to ++ * handle that here as well. ++ */ ++ if (dn->dn_maxblkid == 0) { ++ int newsz = offset > dn->dn_datablksz ? 0 : ++ MIN(size, dn->dn_datablksz - offset); ++ bzero((char *)buf + newsz, size - newsz); ++ size = newsz; ++ } ++ ++ while (size > 0) { ++ uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); ++ int i; ++ ++ /* ++ * NB: we could do this block-at-a-time, but it's nice ++ * to be reading in parallel. 
++ */ ++ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, ++ TRUE, FTAG, &numbufs, &dbp, flags); ++ if (err) ++ break; ++ ++ for (i = 0; i < numbufs; i++) { ++ int tocpy; ++ int bufoff; ++ dmu_buf_t *db = dbp[i]; ++ ++ ASSERT(size > 0); ++ ++ bufoff = offset - db->db_offset; ++ tocpy = (int)MIN(db->db_size - bufoff, size); ++ ++ bcopy((char *)db->db_data + bufoff, buf, tocpy); ++ ++ offset += tocpy; ++ size -= tocpy; ++ buf = (char *)buf + tocpy; ++ } ++ dmu_buf_rele_array(dbp, numbufs, FTAG); ++ } ++ dnode_rele(dn, FTAG); ++ return (err); ++} ++ ++void ++dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, ++ const void *buf, dmu_tx_t *tx) ++{ ++ dmu_buf_t **dbp; ++ int numbufs, i; ++ ++ if (size == 0) ++ return; ++ ++ VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, ++ FALSE, FTAG, &numbufs, &dbp)); ++ ++ for (i = 0; i < numbufs; i++) { ++ int tocpy; ++ int bufoff; ++ dmu_buf_t *db = dbp[i]; ++ ++ ASSERT(size > 0); ++ ++ bufoff = offset - db->db_offset; ++ tocpy = (int)MIN(db->db_size - bufoff, size); ++ ++ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); ++ ++ if (tocpy == db->db_size) ++ dmu_buf_will_fill(db, tx); ++ else ++ dmu_buf_will_dirty(db, tx); ++ ++ (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); ++ ++ if (tocpy == db->db_size) ++ dmu_buf_fill_done(db, tx); ++ ++ offset += tocpy; ++ size -= tocpy; ++ buf = (char *)buf + tocpy; ++ } ++ dmu_buf_rele_array(dbp, numbufs, FTAG); ++} ++ ++void ++dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, ++ dmu_tx_t *tx) ++{ ++ dmu_buf_t **dbp; ++ int numbufs, i; ++ ++ if (size == 0) ++ return; ++ ++ VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, ++ FALSE, FTAG, &numbufs, &dbp)); ++ ++ for (i = 0; i < numbufs; i++) { ++ dmu_buf_t *db = dbp[i]; ++ ++ dmu_buf_will_not_fill(db, tx); ++ } ++ dmu_buf_rele_array(dbp, numbufs, FTAG); ++} ++ ++/* ++ * DMU support for xuio ++ */ ++kstat_t *xuio_ksp = NULL; ++ ++typedef struct xuio_stats { ++ /* loaned yet not returned arc_buf */ ++ kstat_named_t xuiostat_onloan_rbuf; ++ kstat_named_t xuiostat_onloan_wbuf; ++ /* whether a copy is made when loaning out a read buffer */ ++ kstat_named_t xuiostat_rbuf_copied; ++ kstat_named_t xuiostat_rbuf_nocopy; ++ /* whether a copy is made when assigning a write buffer */ ++ kstat_named_t xuiostat_wbuf_copied; ++ kstat_named_t xuiostat_wbuf_nocopy; ++} xuio_stats_t; ++ ++static xuio_stats_t xuio_stats = { ++ { "onloan_read_buf", KSTAT_DATA_UINT64 }, ++ { "onloan_write_buf", KSTAT_DATA_UINT64 }, ++ { "read_buf_copied", KSTAT_DATA_UINT64 }, ++ { "read_buf_nocopy", KSTAT_DATA_UINT64 }, ++ { "write_buf_copied", KSTAT_DATA_UINT64 }, ++ { "write_buf_nocopy", KSTAT_DATA_UINT64 } ++}; ++ ++#define XUIOSTAT_INCR(stat, val) \ ++ atomic_add_64(&xuio_stats.stat.value.ui64, (val)) ++#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) ++ ++int ++dmu_xuio_init(xuio_t *xuio, int nblk) ++{ ++ dmu_xuio_t *priv; ++ uio_t *uio = &xuio->xu_uio; ++ ++ uio->uio_iovcnt = nblk; ++ uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_PUSHPAGE); ++ ++ priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_PUSHPAGE); ++ priv->cnt = nblk; ++ priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_PUSHPAGE); ++ priv->iovp = uio->uio_iov; ++ XUIO_XUZC_PRIV(xuio) = priv; ++ ++ if (XUIO_XUZC_RW(xuio) == UIO_READ) ++ XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); ++ else ++ XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); ++ ++ return (0); ++} ++ ++void ++dmu_xuio_fini(xuio_t *xuio) ++{ ++ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ++ int 
nblk = priv->cnt; ++ ++ kmem_free(priv->iovp, nblk * sizeof (iovec_t)); ++ kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); ++ kmem_free(priv, sizeof (dmu_xuio_t)); ++ ++ if (XUIO_XUZC_RW(xuio) == UIO_READ) ++ XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); ++ else ++ XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); ++} ++ ++/* ++ * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } ++ * and increase priv->next by 1. ++ */ ++int ++dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) ++{ ++ struct iovec *iov; ++ uio_t *uio = &xuio->xu_uio; ++ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ++ int i = priv->next++; ++ ++ ASSERT(i < priv->cnt); ++ ASSERT(off + n <= arc_buf_size(abuf)); ++ iov = uio->uio_iov + i; ++ iov->iov_base = (char *)abuf->b_data + off; ++ iov->iov_len = n; ++ priv->bufs[i] = abuf; ++ return (0); ++} ++ ++int ++dmu_xuio_cnt(xuio_t *xuio) ++{ ++ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ++ return (priv->cnt); ++} ++ ++arc_buf_t * ++dmu_xuio_arcbuf(xuio_t *xuio, int i) ++{ ++ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ++ ++ ASSERT(i < priv->cnt); ++ return (priv->bufs[i]); ++} ++ ++void ++dmu_xuio_clear(xuio_t *xuio, int i) ++{ ++ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ++ ++ ASSERT(i < priv->cnt); ++ priv->bufs[i] = NULL; ++} ++ ++static void ++xuio_stat_init(void) ++{ ++ xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", ++ KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), ++ KSTAT_FLAG_VIRTUAL); ++ if (xuio_ksp != NULL) { ++ xuio_ksp->ks_data = &xuio_stats; ++ kstat_install(xuio_ksp); ++ } ++} ++ ++static void ++xuio_stat_fini(void) ++{ ++ if (xuio_ksp != NULL) { ++ kstat_delete(xuio_ksp); ++ xuio_ksp = NULL; ++ } ++} ++ ++void ++xuio_stat_wbuf_copied() ++{ ++ XUIOSTAT_BUMP(xuiostat_wbuf_copied); ++} ++ ++void ++xuio_stat_wbuf_nocopy() ++{ ++ XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); ++} ++ ++#ifdef _KERNEL ++ ++/* ++ * Copy up to size bytes between arg_buf and req based on the data direction ++ * described by the req. If an entire req's data cannot be transfered the ++ * req's is updated such that it's current index and bv offsets correctly ++ * reference any residual data which could not be copied. The return value ++ * is the number of bytes successfully copied to arg_buf. 
++ */ ++static int ++dmu_req_copy(void *arg_buf, int size, int *offset, struct request *req) ++{ ++ struct bio_vec *bv; ++ struct req_iterator iter; ++ char *bv_buf; ++ int tocpy; ++ ++ *offset = 0; ++ rq_for_each_segment(bv, req, iter) { ++ ++ /* Fully consumed the passed arg_buf */ ++ ASSERT3S(*offset, <=, size); ++ if (size == *offset) ++ break; ++ ++ /* Skip fully consumed bv's */ ++ if (bv->bv_len == 0) ++ continue; ++ ++ tocpy = MIN(bv->bv_len, size - *offset); ++ ASSERT3S(tocpy, >=, 0); ++ ++ bv_buf = page_address(bv->bv_page) + bv->bv_offset; ++ ASSERT3P(bv_buf, !=, NULL); ++ ++ if (rq_data_dir(req) == WRITE) ++ memcpy(arg_buf + *offset, bv_buf, tocpy); ++ else ++ memcpy(bv_buf, arg_buf + *offset, tocpy); ++ ++ *offset += tocpy; ++ bv->bv_offset += tocpy; ++ bv->bv_len -= tocpy; ++ } ++ ++ return 0; ++} ++ ++static void ++dmu_bio_put(struct bio *bio) ++{ ++ struct bio *bio_next; ++ ++ while (bio) { ++ bio_next = bio->bi_next; ++ bio_put(bio); ++ bio = bio_next; ++ } ++} ++ ++static int ++dmu_bio_clone(struct bio *bio, struct bio **bio_copy) ++{ ++ struct bio *bio_root = NULL; ++ struct bio *bio_last = NULL; ++ struct bio *bio_new; ++ ++ if (bio == NULL) ++ return EINVAL; ++ ++ while (bio) { ++ bio_new = bio_clone(bio, GFP_NOIO); ++ if (bio_new == NULL) { ++ dmu_bio_put(bio_root); ++ return ENOMEM; ++ } ++ ++ if (bio_last) { ++ bio_last->bi_next = bio_new; ++ bio_last = bio_new; ++ } else { ++ bio_root = bio_new; ++ bio_last = bio_new; ++ } ++ ++ bio = bio->bi_next; ++ } ++ ++ *bio_copy = bio_root; ++ ++ return 0; ++} ++ ++int ++dmu_read_req(objset_t *os, uint64_t object, struct request *req) ++{ ++ uint64_t size = blk_rq_bytes(req); ++ uint64_t offset = blk_rq_pos(req) << 9; ++ struct bio *bio_saved = req->bio; ++ dmu_buf_t **dbp; ++ int numbufs, i, err; ++ ++ /* ++ * NB: we could do this block-at-a-time, but it's nice ++ * to be reading in parallel. ++ */ ++ err = dmu_buf_hold_array(os, object, offset, size, TRUE, FTAG, ++ &numbufs, &dbp); ++ if (err) ++ return (err); ++ ++ /* ++ * Clone the bio list so the bv->bv_offset and bv->bv_len members ++ * can be safely modified. The original bio list is relinked in to ++ * the request when the function exits. This is required because ++ * some file systems blindly assume that these values will remain ++ * constant between bio_submit() and the IO completion callback. ++ */ ++ err = dmu_bio_clone(bio_saved, &req->bio); ++ if (err) ++ goto error; ++ ++ for (i = 0; i < numbufs; i++) { ++ int tocpy, didcpy, bufoff; ++ dmu_buf_t *db = dbp[i]; ++ ++ bufoff = offset - db->db_offset; ++ ASSERT3S(bufoff, >=, 0); ++ ++ tocpy = (int)MIN(db->db_size - bufoff, size); ++ if (tocpy == 0) ++ break; ++ ++ err = dmu_req_copy(db->db_data + bufoff, tocpy, &didcpy, req); ++ ++ if (didcpy < tocpy) ++ err = EIO; ++ ++ if (err) ++ break; ++ ++ size -= tocpy; ++ offset += didcpy; ++ err = 0; ++ } ++ ++ dmu_bio_put(req->bio); ++ req->bio = bio_saved; ++error: ++ dmu_buf_rele_array(dbp, numbufs, FTAG); ++ ++ return (err); ++} ++ ++int ++dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) ++{ ++ uint64_t size = blk_rq_bytes(req); ++ uint64_t offset = blk_rq_pos(req) << 9; ++ struct bio *bio_saved = req->bio; ++ dmu_buf_t **dbp; ++ int numbufs; ++ int err = 0; ++ int i; ++ ++ if (size == 0) ++ return (0); ++ ++ err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, ++ &numbufs, &dbp); ++ if (err) ++ return (err); ++ ++ /* ++ * Clone the bio list so the bv->bv_offset and bv->bv_len members ++ * can be safely modified. 
The original bio list is relinked in to ++ * the request when the function exits. This is required because ++ * some file systems blindly assume that these values will remain ++ * constant between bio_submit() and the IO completion callback. ++ */ ++ err = dmu_bio_clone(bio_saved, &req->bio); ++ if (err) ++ goto error; ++ ++ for (i = 0; i < numbufs; i++) { ++ int tocpy, didcpy, bufoff; ++ dmu_buf_t *db = dbp[i]; ++ ++ bufoff = offset - db->db_offset; ++ ASSERT3S(bufoff, >=, 0); ++ ++ tocpy = (int)MIN(db->db_size - bufoff, size); ++ if (tocpy == 0) ++ break; ++ ++ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); ++ ++ if (tocpy == db->db_size) ++ dmu_buf_will_fill(db, tx); ++ else ++ dmu_buf_will_dirty(db, tx); ++ ++ err = dmu_req_copy(db->db_data + bufoff, tocpy, &didcpy, req); ++ ++ if (tocpy == db->db_size) ++ dmu_buf_fill_done(db, tx); ++ ++ if (didcpy < tocpy) ++ err = EIO; ++ ++ if (err) ++ break; ++ ++ size -= tocpy; ++ offset += didcpy; ++ err = 0; ++ } ++ ++ dmu_bio_put(req->bio); ++ req->bio = bio_saved; ++error: ++ dmu_buf_rele_array(dbp, numbufs, FTAG); ++ ++ return (err); ++} ++ ++int ++dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) ++{ ++ dmu_buf_t **dbp; ++ int numbufs, i, err; ++ xuio_t *xuio = NULL; ++ ++ /* ++ * NB: we could do this block-at-a-time, but it's nice ++ * to be reading in parallel. ++ */ ++ err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, ++ &numbufs, &dbp); ++ if (err) ++ return (err); ++ ++ for (i = 0; i < numbufs; i++) { ++ int tocpy; ++ int bufoff; ++ dmu_buf_t *db = dbp[i]; ++ ++ ASSERT(size > 0); ++ ++ bufoff = uio->uio_loffset - db->db_offset; ++ tocpy = (int)MIN(db->db_size - bufoff, size); ++ ++ if (xuio) { ++ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; ++ arc_buf_t *dbuf_abuf = dbi->db_buf; ++ arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); ++ err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); ++ if (!err) { ++ uio->uio_resid -= tocpy; ++ uio->uio_loffset += tocpy; ++ } ++ ++ if (abuf == dbuf_abuf) ++ XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); ++ else ++ XUIOSTAT_BUMP(xuiostat_rbuf_copied); ++ } else { ++ err = uiomove((char *)db->db_data + bufoff, tocpy, ++ UIO_READ, uio); ++ } ++ if (err) ++ break; ++ ++ size -= tocpy; ++ } ++ dmu_buf_rele_array(dbp, numbufs, FTAG); ++ ++ return (err); ++} ++ ++static int ++dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) ++{ ++ dmu_buf_t **dbp; ++ int numbufs; ++ int err = 0; ++ int i; ++ ++ err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, ++ FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); ++ if (err) ++ return (err); ++ ++ for (i = 0; i < numbufs; i++) { ++ int tocpy; ++ int bufoff; ++ dmu_buf_t *db = dbp[i]; ++ ++ ASSERT(size > 0); ++ ++ bufoff = uio->uio_loffset - db->db_offset; ++ tocpy = (int)MIN(db->db_size - bufoff, size); ++ ++ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); ++ ++ if (tocpy == db->db_size) ++ dmu_buf_will_fill(db, tx); ++ else ++ dmu_buf_will_dirty(db, tx); ++ ++ /* ++ * XXX uiomove could block forever (eg.nfs-backed ++ * pages). There needs to be a uiolockdown() function ++ * to lock the pages in memory, so that uiomove won't ++ * block. 
++ */ ++ err = uiomove((char *)db->db_data + bufoff, tocpy, ++ UIO_WRITE, uio); ++ ++ if (tocpy == db->db_size) ++ dmu_buf_fill_done(db, tx); ++ ++ if (err) ++ break; ++ ++ size -= tocpy; ++ } ++ ++ dmu_buf_rele_array(dbp, numbufs, FTAG); ++ return (err); ++} ++ ++int ++dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, ++ dmu_tx_t *tx) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; ++ dnode_t *dn; ++ int err; ++ ++ if (size == 0) ++ return (0); ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ err = dmu_write_uio_dnode(dn, uio, size, tx); ++ DB_DNODE_EXIT(db); ++ ++ return (err); ++} ++ ++int ++dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, ++ dmu_tx_t *tx) ++{ ++ dnode_t *dn; ++ int err; ++ ++ if (size == 0) ++ return (0); ++ ++ err = dnode_hold(os, object, FTAG, &dn); ++ if (err) ++ return (err); ++ ++ err = dmu_write_uio_dnode(dn, uio, size, tx); ++ ++ dnode_rele(dn, FTAG); ++ ++ return (err); ++} ++#endif /* _KERNEL */ ++ ++/* ++ * Allocate a loaned anonymous arc buffer. ++ */ ++arc_buf_t * ++dmu_request_arcbuf(dmu_buf_t *handle, int size) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; ++ spa_t *spa; ++ ++ DB_GET_SPA(&spa, db); ++ return (arc_loan_buf(spa, size)); ++} ++ ++/* ++ * Free a loaned arc buffer. ++ */ ++void ++dmu_return_arcbuf(arc_buf_t *buf) ++{ ++ arc_return_buf(buf, FTAG); ++ VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); ++} ++ ++/* ++ * When possible directly assign passed loaned arc buffer to a dbuf. ++ * If this is not possible copy the contents of passed arc buf via ++ * dmu_write(). ++ */ ++void ++dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, ++ dmu_tx_t *tx) ++{ ++ dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; ++ dnode_t *dn; ++ dmu_buf_impl_t *db; ++ uint32_t blksz = (uint32_t)arc_buf_size(buf); ++ uint64_t blkid; ++ ++ DB_DNODE_ENTER(dbuf); ++ dn = DB_DNODE(dbuf); ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ blkid = dbuf_whichblock(dn, offset); ++ VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); ++ rw_exit(&dn->dn_struct_rwlock); ++ DB_DNODE_EXIT(dbuf); ++ ++ if (offset == db->db.db_offset && blksz == db->db.db_size) { ++ dbuf_assign_arcbuf(db, buf, tx); ++ dbuf_rele(db, FTAG); ++ } else { ++ objset_t *os; ++ uint64_t object; ++ ++ DB_DNODE_ENTER(dbuf); ++ dn = DB_DNODE(dbuf); ++ os = dn->dn_objset; ++ object = dn->dn_object; ++ DB_DNODE_EXIT(dbuf); ++ ++ dbuf_rele(db, FTAG); ++ dmu_write(os, object, offset, blksz, buf->b_data, tx); ++ dmu_return_arcbuf(buf); ++ XUIOSTAT_BUMP(xuiostat_wbuf_copied); ++ } ++} ++ ++typedef struct { ++ dbuf_dirty_record_t *dsa_dr; ++ dmu_sync_cb_t *dsa_done; ++ zgd_t *dsa_zgd; ++ dmu_tx_t *dsa_tx; ++} dmu_sync_arg_t; ++ ++/* ARGSUSED */ ++static void ++dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) ++{ ++ dmu_sync_arg_t *dsa = varg; ++ dmu_buf_t *db = dsa->dsa_zgd->zgd_db; ++ blkptr_t *bp = zio->io_bp; ++ ++ if (zio->io_error == 0) { ++ if (BP_IS_HOLE(bp)) { ++ /* ++ * A block of zeros may compress to a hole, but the ++ * block size still needs to be known for replay. 
++ */ ++ BP_SET_LSIZE(bp, db->db_size); ++ } else { ++ ASSERT(BP_GET_LEVEL(bp) == 0); ++ bp->blk_fill = 1; ++ } ++ } ++} ++ ++static void ++dmu_sync_late_arrival_ready(zio_t *zio) ++{ ++ dmu_sync_ready(zio, NULL, zio->io_private); ++} ++ ++/* ARGSUSED */ ++static void ++dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) ++{ ++ dmu_sync_arg_t *dsa = varg; ++ dbuf_dirty_record_t *dr = dsa->dsa_dr; ++ dmu_buf_impl_t *db = dr->dr_dbuf; ++ ++ mutex_enter(&db->db_mtx); ++ ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); ++ if (zio->io_error == 0) { ++ dr->dt.dl.dr_overridden_by = *zio->io_bp; ++ dr->dt.dl.dr_override_state = DR_OVERRIDDEN; ++ dr->dt.dl.dr_copies = zio->io_prop.zp_copies; ++ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) ++ BP_ZERO(&dr->dt.dl.dr_overridden_by); ++ } else { ++ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; ++ } ++ cv_broadcast(&db->db_changed); ++ mutex_exit(&db->db_mtx); ++ ++ dsa->dsa_done(dsa->dsa_zgd, zio->io_error); ++ ++ kmem_free(dsa, sizeof (*dsa)); ++} ++ ++static void ++dmu_sync_late_arrival_done(zio_t *zio) ++{ ++ blkptr_t *bp = zio->io_bp; ++ dmu_sync_arg_t *dsa = zio->io_private; ++ ++ if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { ++ ASSERT(zio->io_bp->blk_birth == zio->io_txg); ++ ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); ++ zio_free(zio->io_spa, zio->io_txg, zio->io_bp); ++ } ++ ++ dmu_tx_commit(dsa->dsa_tx); ++ ++ dsa->dsa_done(dsa->dsa_zgd, zio->io_error); ++ ++ kmem_free(dsa, sizeof (*dsa)); ++} ++ ++static int ++dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, ++ zio_prop_t *zp, zbookmark_t *zb) ++{ ++ dmu_sync_arg_t *dsa; ++ dmu_tx_t *tx; ++ ++ tx = dmu_tx_create(os); ++ dmu_tx_hold_space(tx, zgd->zgd_db->db_size); ++ if (dmu_tx_assign(tx, TXG_WAIT) != 0) { ++ dmu_tx_abort(tx); ++ return (EIO); /* Make zl_get_data do txg_waited_synced() */ ++ } ++ ++ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_PUSHPAGE); ++ dsa->dsa_dr = NULL; ++ dsa->dsa_done = done; ++ dsa->dsa_zgd = zgd; ++ dsa->dsa_tx = tx; ++ ++ zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, ++ zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, ++ dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa, ++ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, zb)); ++ ++ return (0); ++} ++ ++/* ++ * Intent log support: sync the block associated with db to disk. ++ * N.B. and XXX: the caller is responsible for making sure that the ++ * data isn't changing while dmu_sync() is writing it. ++ * ++ * Return values: ++ * ++ * EEXIST: this txg has already been synced, so there's nothing to to. ++ * The caller should not log the write. ++ * ++ * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. ++ * The caller should not log the write. ++ * ++ * EALREADY: this block is already in the process of being synced. ++ * The caller should track its progress (somehow). ++ * ++ * EIO: could not do the I/O. ++ * The caller should do a txg_wait_synced(). ++ * ++ * 0: the I/O has been initiated. ++ * The caller should log this blkptr in the done callback. ++ * It is possible that the I/O will fail, in which case ++ * the error will be reported to the done callback and ++ * propagated to pio from zio_done(). 
++ */ ++int ++dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) ++{ ++ blkptr_t *bp = zgd->zgd_bp; ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; ++ objset_t *os = db->db_objset; ++ dsl_dataset_t *ds = os->os_dsl_dataset; ++ dbuf_dirty_record_t *dr; ++ dmu_sync_arg_t *dsa; ++ zbookmark_t zb; ++ zio_prop_t zp; ++ dnode_t *dn; ++ ++ ASSERT(pio != NULL); ++ ASSERT(BP_IS_HOLE(bp)); ++ ASSERT(txg != 0); ++ ++ SET_BOOKMARK(&zb, ds->ds_object, ++ db->db.db_object, db->db_level, db->db_blkid); ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); ++ DB_DNODE_EXIT(db); ++ ++ /* ++ * If we're frozen (running ziltest), we always need to generate a bp. ++ */ ++ if (txg > spa_freeze_txg(os->os_spa)) ++ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); ++ ++ /* ++ * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() ++ * and us. If we determine that this txg is not yet syncing, ++ * but it begins to sync a moment later, that's OK because the ++ * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. ++ */ ++ mutex_enter(&db->db_mtx); ++ ++ if (txg <= spa_last_synced_txg(os->os_spa)) { ++ /* ++ * This txg has already synced. There's nothing to do. ++ */ ++ mutex_exit(&db->db_mtx); ++ return (EEXIST); ++ } ++ ++ if (txg <= spa_syncing_txg(os->os_spa)) { ++ /* ++ * This txg is currently syncing, so we can't mess with ++ * the dirty record anymore; just write a new log block. ++ */ ++ mutex_exit(&db->db_mtx); ++ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); ++ } ++ ++ dr = db->db_last_dirty; ++ while (dr && dr->dr_txg != txg) ++ dr = dr->dr_next; ++ ++ if (dr == NULL) { ++ /* ++ * There's no dr for this dbuf, so it must have been freed. ++ * There's no need to log writes to freed blocks, so we're done. ++ */ ++ mutex_exit(&db->db_mtx); ++ return (ENOENT); ++ } ++ ++ ASSERT(dr->dr_txg == txg); ++ if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || ++ dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { ++ /* ++ * We have already issued a sync write for this buffer, ++ * or this buffer has already been synced. It could not ++ * have been dirtied since, or we would have cleared the state. 
++ */ ++ mutex_exit(&db->db_mtx); ++ return (EALREADY); ++ } ++ ++ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); ++ dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; ++ mutex_exit(&db->db_mtx); ++ ++ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_PUSHPAGE); ++ dsa->dsa_dr = dr; ++ dsa->dsa_done = done; ++ dsa->dsa_zgd = zgd; ++ dsa->dsa_tx = NULL; ++ ++ zio_nowait(arc_write(pio, os->os_spa, txg, ++ bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, ++ dmu_sync_ready, dmu_sync_done, dsa, ++ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb)); ++ ++ return (0); ++} ++ ++int ++dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, ++ dmu_tx_t *tx) ++{ ++ dnode_t *dn; ++ int err; ++ ++ err = dnode_hold(os, object, FTAG, &dn); ++ if (err) ++ return (err); ++ err = dnode_set_blksz(dn, size, ibs, tx); ++ dnode_rele(dn, FTAG); ++ return (err); ++} ++ ++void ++dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, ++ dmu_tx_t *tx) ++{ ++ dnode_t *dn; ++ ++ /* XXX assumes dnode_hold will not get an i/o error */ ++ (void) dnode_hold(os, object, FTAG, &dn); ++ ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); ++ dn->dn_checksum = checksum; ++ dnode_setdirty(dn, tx); ++ dnode_rele(dn, FTAG); ++} ++ ++void ++dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, ++ dmu_tx_t *tx) ++{ ++ dnode_t *dn; ++ ++ /* XXX assumes dnode_hold will not get an i/o error */ ++ (void) dnode_hold(os, object, FTAG, &dn); ++ ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); ++ dn->dn_compress = compress; ++ dnode_setdirty(dn, tx); ++ dnode_rele(dn, FTAG); ++} ++ ++int zfs_mdcomp_disable = 0; ++ ++void ++dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) ++{ ++ dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; ++ boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata || ++ (wp & WP_SPILL)); ++ enum zio_checksum checksum = os->os_checksum; ++ enum zio_compress compress = os->os_compress; ++ enum zio_checksum dedup_checksum = os->os_dedup_checksum; ++ boolean_t dedup; ++ boolean_t dedup_verify = os->os_dedup_verify; ++ int copies = os->os_copies; ++ ++ /* ++ * Determine checksum setting. ++ */ ++ if (ismd) { ++ /* ++ * Metadata always gets checksummed. If the data ++ * checksum is multi-bit correctable, and it's not a ++ * ZBT-style checksum, then it's suitable for metadata ++ * as well. Otherwise, the metadata checksum defaults ++ * to fletcher4. ++ */ ++ if (zio_checksum_table[checksum].ci_correctable < 1 || ++ zio_checksum_table[checksum].ci_eck) ++ checksum = ZIO_CHECKSUM_FLETCHER_4; ++ } else { ++ checksum = zio_checksum_select(dn->dn_checksum, checksum); ++ } ++ ++ /* ++ * Determine compression setting. ++ */ ++ if (ismd) { ++ /* ++ * XXX -- we should design a compression algorithm ++ * that specializes in arrays of bps. ++ */ ++ compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ++ ZIO_COMPRESS_LZJB; ++ } else { ++ compress = zio_compress_select(dn->dn_compress, compress); ++ } ++ ++ /* ++ * Determine dedup setting. If we are in dmu_sync(), we won't ++ * actually dedup now because that's all done in syncing context; ++ * but we do want to use the dedup checkum. If the checksum is not ++ * strong enough to ensure unique signatures, force dedup_verify. 
++ */ ++ dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF); ++ if (dedup) { ++ checksum = dedup_checksum; ++ if (!zio_checksum_table[checksum].ci_dedup) ++ dedup_verify = 1; ++ } ++ ++ if (wp & WP_DMU_SYNC) ++ dedup = 0; ++ ++ if (wp & WP_NOFILL) { ++ ASSERT(!ismd && level == 0); ++ checksum = ZIO_CHECKSUM_OFF; ++ compress = ZIO_COMPRESS_OFF; ++ dedup = B_FALSE; ++ } ++ ++ zp->zp_checksum = checksum; ++ zp->zp_compress = compress; ++ zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; ++ zp->zp_level = level; ++ zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); ++ zp->zp_dedup = dedup; ++ zp->zp_dedup_verify = dedup && dedup_verify; ++} ++ ++int ++dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) ++{ ++ dnode_t *dn; ++ int i, err; ++ ++ err = dnode_hold(os, object, FTAG, &dn); ++ if (err) ++ return (err); ++ /* ++ * Sync any current changes before ++ * we go trundling through the block pointers. ++ */ ++ for (i = 0; i < TXG_SIZE; i++) { ++ if (list_link_active(&dn->dn_dirty_link[i])) ++ break; ++ } ++ if (i != TXG_SIZE) { ++ dnode_rele(dn, FTAG); ++ txg_wait_synced(dmu_objset_pool(os), 0); ++ err = dnode_hold(os, object, FTAG, &dn); ++ if (err) ++ return (err); ++ } ++ ++ err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); ++ dnode_rele(dn, FTAG); ++ ++ return (err); ++} ++ ++void ++dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) ++{ ++ dnode_phys_t *dnp; ++ int i; ++ ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ mutex_enter(&dn->dn_mtx); ++ ++ dnp = dn->dn_phys; ++ ++ doi->doi_data_block_size = dn->dn_datablksz; ++ doi->doi_metadata_block_size = dn->dn_indblkshift ? ++ 1ULL << dn->dn_indblkshift : 0; ++ doi->doi_type = dn->dn_type; ++ doi->doi_bonus_type = dn->dn_bonustype; ++ doi->doi_bonus_size = dn->dn_bonuslen; ++ doi->doi_indirection = dn->dn_nlevels; ++ doi->doi_checksum = dn->dn_checksum; ++ doi->doi_compress = dn->dn_compress; ++ doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; ++ doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz; ++ doi->doi_fill_count = 0; ++ for (i = 0; i < dnp->dn_nblkptr; i++) ++ doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; ++ ++ mutex_exit(&dn->dn_mtx); ++ rw_exit(&dn->dn_struct_rwlock); ++} ++ ++/* ++ * Get information on a DMU object. ++ * If doi is NULL, just indicates whether the object exists. ++ */ ++int ++dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) ++{ ++ dnode_t *dn; ++ int err = dnode_hold(os, object, FTAG, &dn); ++ ++ if (err) ++ return (err); ++ ++ if (doi != NULL) ++ dmu_object_info_from_dnode(dn, doi); ++ ++ dnode_rele(dn, FTAG); ++ return (0); ++} ++ ++/* ++ * As above, but faster; can be used when you have a held dbuf in hand. ++ */ ++void ++dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ++ ++ DB_DNODE_ENTER(db); ++ dmu_object_info_from_dnode(DB_DNODE(db), doi); ++ DB_DNODE_EXIT(db); ++} ++ ++/* ++ * Faster still when you only care about the size. ++ * This is specifically optimized for zfs_getattr(). 
++ */ ++void ++dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, ++ u_longlong_t *nblk512) ++{ ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ++ dnode_t *dn; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ ++ *blksize = dn->dn_datablksz; ++ /* add 1 for dnode space */ ++ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> ++ SPA_MINBLOCKSHIFT) + 1; ++ DB_DNODE_EXIT(db); ++} ++ ++void ++byteswap_uint64_array(void *vbuf, size_t size) ++{ ++ uint64_t *buf = vbuf; ++ size_t count = size >> 3; ++ int i; ++ ++ ASSERT((size & 7) == 0); ++ ++ for (i = 0; i < count; i++) ++ buf[i] = BSWAP_64(buf[i]); ++} ++ ++void ++byteswap_uint32_array(void *vbuf, size_t size) ++{ ++ uint32_t *buf = vbuf; ++ size_t count = size >> 2; ++ int i; ++ ++ ASSERT((size & 3) == 0); ++ ++ for (i = 0; i < count; i++) ++ buf[i] = BSWAP_32(buf[i]); ++} ++ ++void ++byteswap_uint16_array(void *vbuf, size_t size) ++{ ++ uint16_t *buf = vbuf; ++ size_t count = size >> 1; ++ int i; ++ ++ ASSERT((size & 1) == 0); ++ ++ for (i = 0; i < count; i++) ++ buf[i] = BSWAP_16(buf[i]); ++} ++ ++/* ARGSUSED */ ++void ++byteswap_uint8_array(void *vbuf, size_t size) ++{ ++} ++ ++void ++dmu_init(void) ++{ ++ zfs_dbgmsg_init(); ++ sa_cache_init(); ++ xuio_stat_init(); ++ dmu_objset_init(); ++ dnode_init(); ++ dbuf_init(); ++ zfetch_init(); ++ dmu_tx_init(); ++ arc_init(); ++ l2arc_init(); ++} ++ ++void ++dmu_fini(void) ++{ ++ l2arc_fini(); ++ arc_fini(); ++ dmu_tx_fini(); ++ zfetch_fini(); ++ dbuf_fini(); ++ dnode_fini(); ++ dmu_objset_fini(); ++ xuio_stat_fini(); ++ sa_cache_fini(); ++ zfs_dbgmsg_fini(); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(dmu_bonus_hold); ++EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); ++EXPORT_SYMBOL(dmu_buf_rele_array); ++EXPORT_SYMBOL(dmu_free_range); ++EXPORT_SYMBOL(dmu_read); ++EXPORT_SYMBOL(dmu_write); ++EXPORT_SYMBOL(dmu_object_info); ++EXPORT_SYMBOL(dmu_object_info_from_dnode); ++EXPORT_SYMBOL(dmu_object_info_from_db); ++EXPORT_SYMBOL(dmu_object_size_from_db); ++EXPORT_SYMBOL(dmu_object_set_blocksize); ++EXPORT_SYMBOL(dmu_object_set_checksum); ++EXPORT_SYMBOL(dmu_object_set_compress); ++EXPORT_SYMBOL(dmu_request_arcbuf); ++EXPORT_SYMBOL(dmu_return_arcbuf); ++EXPORT_SYMBOL(dmu_assign_arcbuf); ++EXPORT_SYMBOL(dmu_buf_hold); ++EXPORT_SYMBOL(dmu_ot); ++ ++module_param(zfs_mdcomp_disable, int, 0644); ++MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dmu_diff.c linux-3.2.33-go/fs/zfs/zfs/dmu_diff.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dmu_diff.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dmu_diff.c 2012-11-16 23:25:34.348039346 +0100 +@@ -0,0 +1,221 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct diffarg { ++ struct vnode *da_vp; /* file to which we are reporting */ ++ offset_t *da_offp; ++ int da_err; /* error that stopped diff search */ ++ dmu_diff_record_t da_ddr; ++}; ++ ++static int ++write_record(struct diffarg *da) ++{ ++ ssize_t resid; /* have to get resid to get detailed errno */ ++ ++ if (da->da_ddr.ddr_type == DDR_NONE) { ++ da->da_err = 0; ++ return (0); ++ } ++ ++ da->da_err = vn_rdwr(UIO_WRITE, da->da_vp, (caddr_t)&da->da_ddr, ++ sizeof (da->da_ddr), 0, UIO_SYSSPACE, FAPPEND, ++ RLIM64_INFINITY, CRED(), &resid); ++ *da->da_offp += sizeof (da->da_ddr); ++ return (da->da_err); ++} ++ ++static int ++report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last) ++{ ++ ASSERT(first <= last); ++ if (da->da_ddr.ddr_type != DDR_FREE || ++ first != da->da_ddr.ddr_last + 1) { ++ if (write_record(da) != 0) ++ return (da->da_err); ++ da->da_ddr.ddr_type = DDR_FREE; ++ da->da_ddr.ddr_first = first; ++ da->da_ddr.ddr_last = last; ++ return (0); ++ } ++ da->da_ddr.ddr_last = last; ++ return (0); ++} ++ ++static int ++report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp) ++{ ++ ASSERT(dnp != NULL); ++ if (dnp->dn_type == DMU_OT_NONE) ++ return (report_free_dnode_range(da, object, object)); ++ ++ if (da->da_ddr.ddr_type != DDR_INUSE || ++ object != da->da_ddr.ddr_last + 1) { ++ if (write_record(da) != 0) ++ return (da->da_err); ++ da->da_ddr.ddr_type = DDR_INUSE; ++ da->da_ddr.ddr_first = da->da_ddr.ddr_last = object; ++ return (0); ++ } ++ da->da_ddr.ddr_last = object; ++ return (0); ++} ++ ++#define DBP_SPAN(dnp, level) \ ++ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ ++ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) ++ ++/* ARGSUSED */ ++static int ++diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, ++ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) ++{ ++ struct diffarg *da = arg; ++ int err = 0; ++ ++ if (issig(JUSTLOOKING) && issig(FORREAL)) ++ return (EINTR); ++ ++ if (zb->zb_object != DMU_META_DNODE_OBJECT) ++ return (0); ++ ++ if (bp == NULL) { ++ uint64_t span = DBP_SPAN(dnp, zb->zb_level); ++ uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; ++ ++ err = report_free_dnode_range(da, dnobj, ++ dnobj + (span >> DNODE_SHIFT) - 1); ++ if (err) ++ return (err); ++ } else if (zb->zb_level == 0) { ++ dnode_phys_t *blk; ++ arc_buf_t *abuf; ++ uint32_t aflags = ARC_WAIT; ++ int blksz = BP_GET_LSIZE(bp); ++ int i; ++ ++ if (dsl_read(NULL, spa, bp, pbuf, ++ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ++ ZIO_FLAG_CANFAIL, &aflags, zb) != 0) ++ return (EIO); ++ ++ blk = abuf->b_data; ++ for (i = 0; i < blksz >> DNODE_SHIFT; i++) { ++ uint64_t dnobj = (zb->zb_blkid << ++ (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; ++ err = report_dnode(da, dnobj, blk+i); ++ if (err) ++ break; ++ } ++ (void) arc_buf_remove_ref(abuf, &abuf); ++ if (err) ++ return (err); ++ /* Don't care about the data blocks */ ++ return (TRAVERSE_VISIT_NO_CHILDREN); ++ } ++ return (0); ++} ++ ++int ++dmu_diff(objset_t 
*tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp) ++{ ++ struct diffarg da; ++ dsl_dataset_t *ds = tosnap->os_dsl_dataset; ++ dsl_dataset_t *fromds = fromsnap->os_dsl_dataset; ++ dsl_dataset_t *findds; ++ dsl_dataset_t *relds; ++ int err = 0; ++ ++ /* make certain we are looking at snapshots */ ++ if (!dsl_dataset_is_snapshot(ds) || !dsl_dataset_is_snapshot(fromds)) ++ return (EINVAL); ++ ++ /* fromsnap must be earlier and from the same lineage as tosnap */ ++ if (fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg) ++ return (EXDEV); ++ ++ relds = NULL; ++ findds = ds; ++ ++ while (fromds->ds_dir != findds->ds_dir) { ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ ++ if (!dsl_dir_is_clone(findds->ds_dir)) { ++ if (relds) ++ dsl_dataset_rele(relds, FTAG); ++ return (EXDEV); ++ } ++ ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ err = dsl_dataset_hold_obj(dp, ++ findds->ds_dir->dd_phys->dd_origin_obj, FTAG, &findds); ++ rw_exit(&dp->dp_config_rwlock); ++ ++ if (relds) ++ dsl_dataset_rele(relds, FTAG); ++ ++ if (err) ++ return (EXDEV); ++ ++ relds = findds; ++ } ++ ++ if (relds) ++ dsl_dataset_rele(relds, FTAG); ++ ++ da.da_vp = vp; ++ da.da_offp = offp; ++ da.da_ddr.ddr_type = DDR_NONE; ++ da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; ++ da.da_err = 0; ++ ++ err = traverse_dataset(ds, fromds->ds_phys->ds_creation_txg, ++ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); ++ ++ if (err) { ++ da.da_err = err; ++ } else { ++ /* we set the da.da_err we return as side-effect */ ++ (void) write_record(&da); ++ } ++ ++ return (da.da_err); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dmu_object.c linux-3.2.33-go/fs/zfs/zfs/dmu_object.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dmu_object.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dmu_object.c 2012-11-16 23:25:34.353039289 +0100 +@@ -0,0 +1,204 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++uint64_t ++dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) ++{ ++ uint64_t object; ++ uint64_t L2_dnode_count = DNODES_PER_BLOCK << ++ (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); ++ dnode_t *dn = NULL; ++ int restarted = B_FALSE; ++ ++ mutex_enter(&os->os_obj_lock); ++ for (;;) { ++ object = os->os_obj_next; ++ /* ++ * Each time we polish off an L2 bp worth of dnodes ++ * (2^13 objects), move to another L2 bp that's still ++ * reasonably sparse (at most 1/4 full). Look from the ++ * beginning once, but after that keep looking from here. 
++ * If we can't find one, just keep going from here. ++ */ ++ if (P2PHASE(object, L2_dnode_count) == 0) { ++ uint64_t offset = restarted ? object << DNODE_SHIFT : 0; ++ int error = dnode_next_offset(DMU_META_DNODE(os), ++ DNODE_FIND_HOLE, ++ &offset, 2, DNODES_PER_BLOCK >> 2, 0); ++ restarted = B_TRUE; ++ if (error == 0) ++ object = offset >> DNODE_SHIFT; ++ } ++ os->os_obj_next = ++object; ++ ++ /* ++ * XXX We should check for an i/o error here and return ++ * up to our caller. Actually we should pre-read it in ++ * dmu_tx_assign(), but there is currently no mechanism ++ * to do so. ++ */ ++ (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, ++ FTAG, &dn); ++ if (dn) ++ break; ++ ++ if (dmu_object_next(os, &object, B_TRUE, 0) == 0) ++ os->os_obj_next = object - 1; ++ } ++ ++ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); ++ dnode_rele(dn, FTAG); ++ ++ mutex_exit(&os->os_obj_lock); ++ ++ dmu_tx_add_new_object(tx, os, object); ++ return (object); ++} ++ ++int ++dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, ++ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) ++{ ++ dnode_t *dn; ++ int err; ++ ++ if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) ++ return (EBADF); ++ ++ err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn); ++ if (err) ++ return (err); ++ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); ++ dnode_rele(dn, FTAG); ++ ++ dmu_tx_add_new_object(tx, os, object); ++ return (0); ++} ++ ++int ++dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, ++ int blocksize, dmu_object_type_t bonustype, int bonuslen) ++{ ++ dnode_t *dn; ++ dmu_tx_t *tx; ++ int nblkptr; ++ int err; ++ ++ if (object == DMU_META_DNODE_OBJECT) ++ return (EBADF); ++ ++ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, ++ FTAG, &dn); ++ if (err) ++ return (err); ++ ++ if (dn->dn_type == ot && dn->dn_datablksz == blocksize && ++ dn->dn_bonustype == bonustype && dn->dn_bonuslen == bonuslen) { ++ /* nothing is changing, this is a noop */ ++ dnode_rele(dn, FTAG); ++ return (0); ++ } ++ ++ if (bonustype == DMU_OT_SA) { ++ nblkptr = 1; ++ } else { ++ nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); ++ } ++ ++ /* ++ * If we are losing blkptrs or changing the block size this must ++ * be a new file instance. We must clear out the previous file ++ * contents before we can change this type of metadata in the dnode. 
++ */ ++ if (dn->dn_nblkptr > nblkptr || dn->dn_datablksz != blocksize) { ++ err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); ++ if (err) ++ goto out; ++ } ++ ++ tx = dmu_tx_create(os); ++ dmu_tx_hold_bonus(tx, object); ++ err = dmu_tx_assign(tx, TXG_WAIT); ++ if (err) { ++ dmu_tx_abort(tx); ++ goto out; ++ } ++ ++ dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); ++ ++ dmu_tx_commit(tx); ++out: ++ dnode_rele(dn, FTAG); ++ ++ return (err); ++} ++ ++int ++dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) ++{ ++ dnode_t *dn; ++ int err; ++ ++ ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); ++ ++ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, ++ FTAG, &dn); ++ if (err) ++ return (err); ++ ++ ASSERT(dn->dn_type != DMU_OT_NONE); ++ dnode_free_range(dn, 0, DMU_OBJECT_END, tx); ++ dnode_free(dn, tx); ++ dnode_rele(dn, FTAG); ++ ++ return (0); ++} ++ ++int ++dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) ++{ ++ uint64_t offset = (*objectp + 1) << DNODE_SHIFT; ++ int error; ++ ++ error = dnode_next_offset(DMU_META_DNODE(os), ++ (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); ++ ++ *objectp = offset >> DNODE_SHIFT; ++ ++ return (error); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(dmu_object_alloc); ++EXPORT_SYMBOL(dmu_object_claim); ++EXPORT_SYMBOL(dmu_object_reclaim); ++EXPORT_SYMBOL(dmu_object_free); ++EXPORT_SYMBOL(dmu_object_next); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dmu_objset.c linux-3.2.33-go/fs/zfs/zfs/dmu_objset.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dmu_objset.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dmu_objset.c 2012-11-16 23:25:34.350039322 +0100 +@@ -0,0 +1,1862 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* Portions Copyright 2010 Robert Milkowski */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Needed to close a window in dnode_move() that allows the objset to be freed ++ * before it can be safely accessed. 
++ */ ++krwlock_t os_lock; ++ ++void ++dmu_objset_init(void) ++{ ++ rw_init(&os_lock, NULL, RW_DEFAULT, NULL); ++} ++ ++void ++dmu_objset_fini(void) ++{ ++ rw_destroy(&os_lock); ++} ++ ++spa_t * ++dmu_objset_spa(objset_t *os) ++{ ++ return (os->os_spa); ++} ++ ++zilog_t * ++dmu_objset_zil(objset_t *os) ++{ ++ return (os->os_zil); ++} ++ ++dsl_pool_t * ++dmu_objset_pool(objset_t *os) ++{ ++ dsl_dataset_t *ds; ++ ++ if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) ++ return (ds->ds_dir->dd_pool); ++ else ++ return (spa_get_dsl(os->os_spa)); ++} ++ ++dsl_dataset_t * ++dmu_objset_ds(objset_t *os) ++{ ++ return (os->os_dsl_dataset); ++} ++ ++dmu_objset_type_t ++dmu_objset_type(objset_t *os) ++{ ++ return (os->os_phys->os_type); ++} ++ ++void ++dmu_objset_name(objset_t *os, char *buf) ++{ ++ dsl_dataset_name(os->os_dsl_dataset, buf); ++} ++ ++uint64_t ++dmu_objset_id(objset_t *os) ++{ ++ dsl_dataset_t *ds = os->os_dsl_dataset; ++ ++ return (ds ? ds->ds_object : 0); ++} ++ ++uint64_t ++dmu_objset_syncprop(objset_t *os) ++{ ++ return (os->os_sync); ++} ++ ++uint64_t ++dmu_objset_logbias(objset_t *os) ++{ ++ return (os->os_logbias); ++} ++ ++static void ++checksum_changed_cb(void *arg, uint64_t newval) ++{ ++ objset_t *os = arg; ++ ++ /* ++ * Inheritance should have been done by now. ++ */ ++ ASSERT(newval != ZIO_CHECKSUM_INHERIT); ++ ++ os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); ++} ++ ++static void ++compression_changed_cb(void *arg, uint64_t newval) ++{ ++ objset_t *os = arg; ++ ++ /* ++ * Inheritance and range checking should have been done by now. ++ */ ++ ASSERT(newval != ZIO_COMPRESS_INHERIT); ++ ++ os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); ++} ++ ++static void ++copies_changed_cb(void *arg, uint64_t newval) ++{ ++ objset_t *os = arg; ++ ++ /* ++ * Inheritance and range checking should have been done by now. ++ */ ++ ASSERT(newval > 0); ++ ASSERT(newval <= spa_max_replication(os->os_spa)); ++ ++ os->os_copies = newval; ++} ++ ++static void ++dedup_changed_cb(void *arg, uint64_t newval) ++{ ++ objset_t *os = arg; ++ spa_t *spa = os->os_spa; ++ enum zio_checksum checksum; ++ ++ /* ++ * Inheritance should have been done by now. ++ */ ++ ASSERT(newval != ZIO_CHECKSUM_INHERIT); ++ ++ checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); ++ ++ os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; ++ os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); ++} ++ ++static void ++primary_cache_changed_cb(void *arg, uint64_t newval) ++{ ++ objset_t *os = arg; ++ ++ /* ++ * Inheritance and range checking should have been done by now. ++ */ ++ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || ++ newval == ZFS_CACHE_METADATA); ++ ++ os->os_primary_cache = newval; ++} ++ ++static void ++secondary_cache_changed_cb(void *arg, uint64_t newval) ++{ ++ objset_t *os = arg; ++ ++ /* ++ * Inheritance and range checking should have been done by now. ++ */ ++ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || ++ newval == ZFS_CACHE_METADATA); ++ ++ os->os_secondary_cache = newval; ++} ++ ++static void ++sync_changed_cb(void *arg, uint64_t newval) ++{ ++ objset_t *os = arg; ++ ++ /* ++ * Inheritance and range checking should have been done by now. 
++ */ ++ ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || ++ newval == ZFS_SYNC_DISABLED); ++ ++ os->os_sync = newval; ++ if (os->os_zil) ++ zil_set_sync(os->os_zil, newval); ++} ++ ++static void ++logbias_changed_cb(void *arg, uint64_t newval) ++{ ++ objset_t *os = arg; ++ ++ ASSERT(newval == ZFS_LOGBIAS_LATENCY || ++ newval == ZFS_LOGBIAS_THROUGHPUT); ++ os->os_logbias = newval; ++ if (os->os_zil) ++ zil_set_logbias(os->os_zil, newval); ++} ++ ++void ++dmu_objset_byteswap(void *buf, size_t size) ++{ ++ objset_phys_t *osp = buf; ++ ++ ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); ++ dnode_byteswap(&osp->os_meta_dnode); ++ byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); ++ osp->os_type = BSWAP_64(osp->os_type); ++ osp->os_flags = BSWAP_64(osp->os_flags); ++ if (size == sizeof (objset_phys_t)) { ++ dnode_byteswap(&osp->os_userused_dnode); ++ dnode_byteswap(&osp->os_groupused_dnode); ++ } ++} ++ ++int ++dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ++ objset_t **osp) ++{ ++ objset_t *os; ++ int i, err; ++ ++ ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); ++ ++ os = kmem_zalloc(sizeof (objset_t), KM_PUSHPAGE); ++ os->os_dsl_dataset = ds; ++ os->os_spa = spa; ++ os->os_rootbp = bp; ++ if (!BP_IS_HOLE(os->os_rootbp)) { ++ uint32_t aflags = ARC_WAIT; ++ zbookmark_t zb; ++ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ++ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); ++ ++ if (DMU_OS_IS_L2CACHEABLE(os)) ++ aflags |= ARC_L2CACHE; ++ ++ dprintf_bp(os->os_rootbp, "reading %s", ""); ++ /* ++ * XXX when bprewrite scrub can change the bp, ++ * and this is called from dmu_objset_open_ds_os, the bp ++ * could change, and we'll need a lock. ++ */ ++ err = dsl_read_nolock(NULL, spa, os->os_rootbp, ++ arc_getbuf_func, &os->os_phys_buf, ++ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); ++ if (err) { ++ kmem_free(os, sizeof (objset_t)); ++ /* convert checksum errors into IO errors */ ++ if (err == ECKSUM) ++ err = EIO; ++ return (err); ++ } ++ ++ /* Increase the blocksize if we are permitted. */ ++ if (spa_version(spa) >= SPA_VERSION_USERSPACE && ++ arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { ++ arc_buf_t *buf = arc_buf_alloc(spa, ++ sizeof (objset_phys_t), &os->os_phys_buf, ++ ARC_BUFC_METADATA); ++ bzero(buf->b_data, sizeof (objset_phys_t)); ++ bcopy(os->os_phys_buf->b_data, buf->b_data, ++ arc_buf_size(os->os_phys_buf)); ++ (void) arc_buf_remove_ref(os->os_phys_buf, ++ &os->os_phys_buf); ++ os->os_phys_buf = buf; ++ } ++ ++ os->os_phys = os->os_phys_buf->b_data; ++ os->os_flags = os->os_phys->os_flags; ++ } else { ++ int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? ++ sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; ++ os->os_phys_buf = arc_buf_alloc(spa, size, ++ &os->os_phys_buf, ARC_BUFC_METADATA); ++ os->os_phys = os->os_phys_buf->b_data; ++ bzero(os->os_phys, size); ++ } ++ ++ /* ++ * Note: the changed_cb will be called once before the register ++ * func returns, thus changing the checksum/compression from the ++ * default (fletcher2/off). Snapshots don't need to know about ++ * checksum/compression/copies. 
++ */ ++ if (ds) { ++ err = dsl_prop_register(ds, "primarycache", ++ primary_cache_changed_cb, os); ++ if (err == 0) ++ err = dsl_prop_register(ds, "secondarycache", ++ secondary_cache_changed_cb, os); ++ if (!dsl_dataset_is_snapshot(ds)) { ++ if (err == 0) ++ err = dsl_prop_register(ds, "checksum", ++ checksum_changed_cb, os); ++ if (err == 0) ++ err = dsl_prop_register(ds, "compression", ++ compression_changed_cb, os); ++ if (err == 0) ++ err = dsl_prop_register(ds, "copies", ++ copies_changed_cb, os); ++ if (err == 0) ++ err = dsl_prop_register(ds, "dedup", ++ dedup_changed_cb, os); ++ if (err == 0) ++ err = dsl_prop_register(ds, "logbias", ++ logbias_changed_cb, os); ++ if (err == 0) ++ err = dsl_prop_register(ds, "sync", ++ sync_changed_cb, os); ++ } ++ if (err) { ++ VERIFY(arc_buf_remove_ref(os->os_phys_buf, ++ &os->os_phys_buf) == 1); ++ kmem_free(os, sizeof (objset_t)); ++ return (err); ++ } ++ } else if (ds == NULL) { ++ /* It's the meta-objset. */ ++ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; ++ os->os_compress = ZIO_COMPRESS_LZJB; ++ os->os_copies = spa_max_replication(spa); ++ os->os_dedup_checksum = ZIO_CHECKSUM_OFF; ++ os->os_dedup_verify = 0; ++ os->os_logbias = 0; ++ os->os_sync = 0; ++ os->os_primary_cache = ZFS_CACHE_ALL; ++ os->os_secondary_cache = ZFS_CACHE_ALL; ++ } ++ ++ if (ds == NULL || !dsl_dataset_is_snapshot(ds)) ++ os->os_zil_header = os->os_phys->os_zil_header; ++ os->os_zil = zil_alloc(os, &os->os_zil_header); ++ ++ for (i = 0; i < TXG_SIZE; i++) { ++ list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), ++ offsetof(dnode_t, dn_dirty_link[i])); ++ list_create(&os->os_free_dnodes[i], sizeof (dnode_t), ++ offsetof(dnode_t, dn_dirty_link[i])); ++ } ++ list_create(&os->os_dnodes, sizeof (dnode_t), ++ offsetof(dnode_t, dn_link)); ++ list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), ++ offsetof(dmu_buf_impl_t, db_link)); ++ ++ mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ DMU_META_DNODE(os) = dnode_special_open(os, ++ &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, ++ &os->os_meta_dnode); ++ if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { ++ DMU_USERUSED_DNODE(os) = dnode_special_open(os, ++ &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, ++ &os->os_userused_dnode); ++ DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, ++ &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, ++ &os->os_groupused_dnode); ++ } ++ ++ /* ++ * We should be the only thread trying to do this because we ++ * have ds_opening_lock ++ */ ++ if (ds) { ++ mutex_enter(&ds->ds_lock); ++ ASSERT(ds->ds_objset == NULL); ++ ds->ds_objset = os; ++ mutex_exit(&ds->ds_lock); ++ } ++ ++ *osp = os; ++ return (0); ++} ++ ++int ++dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) ++{ ++ int err = 0; ++ ++ mutex_enter(&ds->ds_opening_lock); ++ *osp = ds->ds_objset; ++ if (*osp == NULL) { ++ err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ++ ds, dsl_dataset_get_blkptr(ds), osp); ++ } ++ mutex_exit(&ds->ds_opening_lock); ++ return (err); ++} ++ ++/* called from zpl */ ++int ++dmu_objset_hold(const char *name, void *tag, objset_t **osp) ++{ ++ dsl_dataset_t *ds; ++ int err; ++ ++ err = dsl_dataset_hold(name, tag, &ds); ++ if (err) ++ return (err); ++ ++ err = dmu_objset_from_ds(ds, osp); ++ if (err) ++ dsl_dataset_rele(ds, tag); ++ ++ return (err); ++} ++ ++/* called from zpl */ ++int ++dmu_objset_own(const char *name, 
dmu_objset_type_t type, ++ boolean_t readonly, void *tag, objset_t **osp) ++{ ++ dsl_dataset_t *ds; ++ int err; ++ ++ err = dsl_dataset_own(name, B_FALSE, tag, &ds); ++ if (err) ++ return (err); ++ ++ err = dmu_objset_from_ds(ds, osp); ++ if (err) { ++ dsl_dataset_disown(ds, tag); ++ } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { ++ dmu_objset_disown(*osp, tag); ++ return (EINVAL); ++ } else if (!readonly && dsl_dataset_is_snapshot(ds)) { ++ dmu_objset_disown(*osp, tag); ++ return (EROFS); ++ } ++ return (err); ++} ++ ++void ++dmu_objset_rele(objset_t *os, void *tag) ++{ ++ dsl_dataset_rele(os->os_dsl_dataset, tag); ++} ++ ++void ++dmu_objset_disown(objset_t *os, void *tag) ++{ ++ dsl_dataset_disown(os->os_dsl_dataset, tag); ++} ++ ++int ++dmu_objset_evict_dbufs(objset_t *os) ++{ ++ dnode_t *dn; ++ ++ mutex_enter(&os->os_lock); ++ ++ /* process the mdn last, since the other dnodes have holds on it */ ++ list_remove(&os->os_dnodes, DMU_META_DNODE(os)); ++ list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); ++ ++ /* ++ * Find the first dnode with holds. We have to do this dance ++ * because dnode_add_ref() only works if you already have a ++ * hold. If there are no holds then it has no dbufs so OK to ++ * skip. ++ */ ++ for (dn = list_head(&os->os_dnodes); ++ dn && !dnode_add_ref(dn, FTAG); ++ dn = list_next(&os->os_dnodes, dn)) ++ continue; ++ ++ while (dn) { ++ dnode_t *next_dn = dn; ++ ++ do { ++ next_dn = list_next(&os->os_dnodes, next_dn); ++ } while (next_dn && !dnode_add_ref(next_dn, FTAG)); ++ ++ mutex_exit(&os->os_lock); ++ dnode_evict_dbufs(dn); ++ dnode_rele(dn, FTAG); ++ mutex_enter(&os->os_lock); ++ dn = next_dn; ++ } ++ dn = list_head(&os->os_dnodes); ++ mutex_exit(&os->os_lock); ++ return (dn != DMU_META_DNODE(os)); ++} ++ ++void ++dmu_objset_evict(objset_t *os) ++{ ++ dsl_dataset_t *ds = os->os_dsl_dataset; ++ int t; ++ ++ for (t = 0; t < TXG_SIZE; t++) ++ ASSERT(!dmu_objset_is_dirty(os, t)); ++ ++ if (ds) { ++ if (!dsl_dataset_is_snapshot(ds)) { ++ VERIFY(0 == dsl_prop_unregister(ds, "checksum", ++ checksum_changed_cb, os)); ++ VERIFY(0 == dsl_prop_unregister(ds, "compression", ++ compression_changed_cb, os)); ++ VERIFY(0 == dsl_prop_unregister(ds, "copies", ++ copies_changed_cb, os)); ++ VERIFY(0 == dsl_prop_unregister(ds, "dedup", ++ dedup_changed_cb, os)); ++ VERIFY(0 == dsl_prop_unregister(ds, "logbias", ++ logbias_changed_cb, os)); ++ VERIFY(0 == dsl_prop_unregister(ds, "sync", ++ sync_changed_cb, os)); ++ } ++ VERIFY(0 == dsl_prop_unregister(ds, "primarycache", ++ primary_cache_changed_cb, os)); ++ VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", ++ secondary_cache_changed_cb, os)); ++ } ++ ++ if (os->os_sa) ++ sa_tear_down(os); ++ ++ /* ++ * We should need only a single pass over the dnode list, since ++ * nothing can be added to the list at this point. ++ */ ++ (void) dmu_objset_evict_dbufs(os); ++ ++ dnode_special_close(&os->os_meta_dnode); ++ if (DMU_USERUSED_DNODE(os)) { ++ dnode_special_close(&os->os_userused_dnode); ++ dnode_special_close(&os->os_groupused_dnode); ++ } ++ zil_free(os->os_zil); ++ ++ ASSERT3P(list_head(&os->os_dnodes), ==, NULL); ++ ++ VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); ++ ++ /* ++ * This is a barrier to prevent the objset from going away in ++ * dnode_move() until we can safely ensure that the objset is still in ++ * use. We consider the objset valid before the barrier and invalid ++ * after the barrier. 
++ */ ++ rw_enter(&os_lock, RW_READER); ++ rw_exit(&os_lock); ++ ++ mutex_destroy(&os->os_lock); ++ mutex_destroy(&os->os_obj_lock); ++ mutex_destroy(&os->os_user_ptr_lock); ++ kmem_free(os, sizeof (objset_t)); ++} ++ ++timestruc_t ++dmu_objset_snap_cmtime(objset_t *os) ++{ ++ return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); ++} ++ ++/* called from dsl for meta-objset */ ++objset_t * ++dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ++ dmu_objset_type_t type, dmu_tx_t *tx) ++{ ++ objset_t *os; ++ dnode_t *mdn; ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ if (ds != NULL) ++ VERIFY(0 == dmu_objset_from_ds(ds, &os)); ++ else ++ VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os)); ++ ++ mdn = DMU_META_DNODE(os); ++ ++ dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, ++ DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); ++ ++ /* ++ * We don't want to have to increase the meta-dnode's nlevels ++ * later, because then we could do it in quescing context while ++ * we are also accessing it in open context. ++ * ++ * This precaution is not necessary for the MOS (ds == NULL), ++ * because the MOS is only updated in syncing context. ++ * This is most fortunate: the MOS is the only objset that ++ * needs to be synced multiple times as spa_sync() iterates ++ * to convergence, so minimizing its dn_nlevels matters. ++ */ ++ if (ds != NULL) { ++ int levels = 1; ++ ++ /* ++ * Determine the number of levels necessary for the meta-dnode ++ * to contain DN_MAX_OBJECT dnodes. ++ */ ++ while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + ++ (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < ++ DN_MAX_OBJECT * sizeof (dnode_phys_t)) ++ levels++; ++ ++ mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = ++ mdn->dn_nlevels = levels; ++ } ++ ++ ASSERT(type != DMU_OST_NONE); ++ ASSERT(type != DMU_OST_ANY); ++ ASSERT(type < DMU_OST_NUMTYPES); ++ os->os_phys->os_type = type; ++ if (dmu_objset_userused_enabled(os)) { ++ os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; ++ os->os_flags = os->os_phys->os_flags; ++ } ++ ++ dsl_dataset_dirty(ds, tx); ++ ++ return (os); ++} ++ ++struct oscarg { ++ void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); ++ void *userarg; ++ dsl_dataset_t *clone_origin; ++ const char *lastname; ++ dmu_objset_type_t type; ++ uint64_t flags; ++ cred_t *cr; ++}; ++ ++/*ARGSUSED*/ ++static int ++dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dir_t *dd = arg1; ++ struct oscarg *oa = arg2; ++ objset_t *mos = dd->dd_pool->dp_meta_objset; ++ int err; ++ uint64_t ddobj; ++ ++ err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, ++ oa->lastname, sizeof (uint64_t), 1, &ddobj); ++ if (err != ENOENT) ++ return (err ? err : EEXIST); ++ ++ if (oa->clone_origin != NULL) { ++ /* You can't clone across pools. */ ++ if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool) ++ return (EXDEV); ++ ++ /* You can only clone snapshots, not the head datasets. 
*/ ++ if (!dsl_dataset_is_snapshot(oa->clone_origin)) ++ return (EINVAL); ++ } ++ ++ return (0); ++} ++ ++static void ++dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dir_t *dd = arg1; ++ spa_t *spa = dd->dd_pool->dp_spa; ++ struct oscarg *oa = arg2; ++ uint64_t obj; ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ++ obj = dsl_dataset_create_sync(dd, oa->lastname, ++ oa->clone_origin, oa->flags, oa->cr, tx); ++ ++ if (oa->clone_origin == NULL) { ++ dsl_pool_t *dp = dd->dd_pool; ++ dsl_dataset_t *ds; ++ blkptr_t *bp; ++ objset_t *os; ++ ++ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); ++ bp = dsl_dataset_get_blkptr(ds); ++ ASSERT(BP_IS_HOLE(bp)); ++ ++ os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx); ++ ++ if (oa->userfunc) ++ oa->userfunc(os, oa->userarg, oa->cr, tx); ++ dsl_dataset_rele(ds, FTAG); ++ } ++ ++ spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj); ++} ++ ++int ++dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, ++ void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) ++{ ++ dsl_dir_t *pdd; ++ const char *tail; ++ int err = 0; ++ struct oscarg oa = { 0 }; ++ ++ ASSERT(strchr(name, '@') == NULL); ++ err = dsl_dir_open(name, FTAG, &pdd, &tail); ++ if (err) ++ return (err); ++ if (tail == NULL) { ++ dsl_dir_close(pdd, FTAG); ++ return (EEXIST); ++ } ++ ++ oa.userfunc = func; ++ oa.userarg = arg; ++ oa.lastname = tail; ++ oa.type = type; ++ oa.flags = flags; ++ oa.cr = CRED(); ++ ++ err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, ++ dmu_objset_create_sync, pdd, &oa, 5); ++ dsl_dir_close(pdd, FTAG); ++ return (err); ++} ++ ++int ++dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) ++{ ++ dsl_dir_t *pdd; ++ const char *tail; ++ int err = 0; ++ struct oscarg oa = { 0 }; ++ ++ ASSERT(strchr(name, '@') == NULL); ++ err = dsl_dir_open(name, FTAG, &pdd, &tail); ++ if (err) ++ return (err); ++ if (tail == NULL) { ++ dsl_dir_close(pdd, FTAG); ++ return (EEXIST); ++ } ++ ++ oa.lastname = tail; ++ oa.clone_origin = clone_origin; ++ oa.flags = flags; ++ oa.cr = CRED(); ++ ++ err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, ++ dmu_objset_create_sync, pdd, &oa, 5); ++ dsl_dir_close(pdd, FTAG); ++ return (err); ++} ++ ++int ++dmu_objset_destroy(const char *name, boolean_t defer) ++{ ++ dsl_dataset_t *ds; ++ int error; ++ ++ error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); ++ if (error == 0) { ++ error = dsl_dataset_destroy(ds, FTAG, defer); ++ /* dsl_dataset_destroy() closes the ds. */ ++ } ++ ++ return (error); ++} ++ ++struct snaparg { ++ dsl_sync_task_group_t *dstg; ++ char *snapname; ++ char *htag; ++ char failed[MAXPATHLEN]; ++ boolean_t recursive; ++ boolean_t needsuspend; ++ boolean_t temporary; ++ nvlist_t *props; ++ struct dsl_ds_holdarg *ha; /* only needed in the temporary case */ ++ dsl_dataset_t *newds; ++}; ++ ++static int ++snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ objset_t *os = arg1; ++ struct snaparg *sn = arg2; ++ int error; ++ ++ /* The props have already been checked by zfs_check_userprops(). */ ++ ++ error = dsl_dataset_snapshot_check(os->os_dsl_dataset, ++ sn->snapname, tx); ++ if (error) ++ return (error); ++ ++ if (sn->temporary) { ++ /* ++ * Ideally we would just call ++ * dsl_dataset_user_hold_check() and ++ * dsl_dataset_destroy_check() here. 
However the ++ * dataset we want to hold and destroy is the snapshot ++ * that we just confirmed we can create, but it won't ++ * exist until after these checks are run. Do any ++ * checks we can here and if more checks are added to ++ * those routines in the future, similar checks may be ++ * necessary here. ++ */ ++ if (spa_version(os->os_spa) < SPA_VERSION_USERREFS) ++ return (ENOTSUP); ++ /* ++ * Not checking number of tags because the tag will be ++ * unique, as it will be the only tag. ++ */ ++ if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) ++ return (E2BIG); ++ ++ sn->ha = kmem_alloc(sizeof(struct dsl_ds_holdarg), KM_PUSHPAGE); ++ sn->ha->temphold = B_TRUE; ++ sn->ha->htag = sn->htag; ++ } ++ return (error); ++} ++ ++static void ++snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ objset_t *os = arg1; ++ dsl_dataset_t *ds = os->os_dsl_dataset; ++ struct snaparg *sn = arg2; ++ ++ dsl_dataset_snapshot_sync(ds, sn->snapname, tx); ++ ++ if (sn->props) { ++ dsl_props_arg_t pa; ++ pa.pa_props = sn->props; ++ pa.pa_source = ZPROP_SRC_LOCAL; ++ dsl_props_set_sync(ds->ds_prev, &pa, tx); ++ } ++ ++ if (sn->temporary) { ++ struct dsl_ds_destroyarg da; ++ ++ dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx); ++ kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg)); ++ sn->ha = NULL; ++ sn->newds = ds->ds_prev; ++ ++ da.ds = ds->ds_prev; ++ da.defer = B_TRUE; ++ dsl_dataset_destroy_sync(&da, FTAG, tx); ++ } ++} ++ ++static int ++dmu_objset_snapshot_one(const char *name, void *arg) ++{ ++ struct snaparg *sn = arg; ++ objset_t *os; ++ int err; ++ char *cp; ++ ++ /* ++ * If the objset starts with a '%', then ignore it unless it was ++ * explicitly named (ie, not recursive). These hidden datasets ++ * are always inconsistent, and by not opening them here, we can ++ * avoid a race with dsl_dir_destroy_check(). ++ */ ++ cp = strrchr(name, '/'); ++ if (cp && cp[1] == '%' && sn->recursive) ++ return (0); ++ ++ (void) strcpy(sn->failed, name); ++ ++ /* ++ * Check permissions if we are doing a recursive snapshot. The ++ * permission checks for the starting dataset have already been ++ * performed in zfs_secpolicy_snapshot() ++ */ ++ if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED()))) ++ return (err); ++ ++ err = dmu_objset_hold(name, sn, &os); ++ if (err != 0) ++ return (err); ++ ++ /* ++ * If the objset is in an inconsistent state (eg, in the process ++ * of being destroyed), don't snapshot it. As with %hidden ++ * datasets, we return EBUSY if this name was explicitly ++ * requested (ie, not recursive), and otherwise ignore it. ++ */ ++ if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { ++ dmu_objset_rele(os, sn); ++ return (sn->recursive ? 
0 : EBUSY); ++ } ++ ++ if (sn->needsuspend) { ++ err = zil_suspend(dmu_objset_zil(os)); ++ if (err) { ++ dmu_objset_rele(os, sn); ++ return (err); ++ } ++ } ++ dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync, ++ os, sn, 3); ++ ++ return (0); ++} ++ ++int ++dmu_objset_snapshot(char *fsname, char *snapname, char *tag, ++ nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd) ++{ ++ dsl_sync_task_t *dst; ++ struct snaparg *sn; ++ spa_t *spa; ++ minor_t minor; ++ int err; ++ ++ sn = kmem_alloc(sizeof (struct snaparg), KM_SLEEP); ++ (void) strcpy(sn->failed, fsname); ++ ++ err = spa_open(fsname, &spa, FTAG); ++ if (err) { ++ kmem_free(sn, sizeof (struct snaparg)); ++ return (err); ++ } ++ ++ if (temporary) { ++ if (cleanup_fd < 0) { ++ spa_close(spa, FTAG); ++ return (EINVAL); ++ } ++ if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) { ++ spa_close(spa, FTAG); ++ return (err); ++ } ++ } ++ ++ sn->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ++ sn->snapname = snapname; ++ sn->htag = tag; ++ sn->props = props; ++ sn->recursive = recursive; ++ sn->needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); ++ sn->temporary = temporary; ++ sn->ha = NULL; ++ sn->newds = NULL; ++ ++ if (recursive) { ++ err = dmu_objset_find(fsname, ++ dmu_objset_snapshot_one, sn, DS_FIND_CHILDREN); ++ } else { ++ err = dmu_objset_snapshot_one(fsname, sn); ++ } ++ ++ if (err == 0) ++ err = dsl_sync_task_group_wait(sn->dstg); ++ ++ for (dst = list_head(&sn->dstg->dstg_tasks); dst; ++ dst = list_next(&sn->dstg->dstg_tasks, dst)) { ++ objset_t *os = dst->dst_arg1; ++ dsl_dataset_t *ds = os->os_dsl_dataset; ++ if (dst->dst_err) { ++ dsl_dataset_name(ds, sn->failed); ++ } else if (temporary) { ++ dsl_register_onexit_hold_cleanup(sn->newds, tag, minor); ++ } ++ if (sn->needsuspend) ++ zil_resume(dmu_objset_zil(os)); ++ dmu_objset_rele(os, sn); ++ } ++ ++ if (err) ++ (void) strcpy(fsname, sn->failed); ++ if (temporary) ++ zfs_onexit_fd_rele(cleanup_fd); ++ dsl_sync_task_group_destroy(sn->dstg); ++ spa_close(spa, FTAG); ++ kmem_free(sn, sizeof (struct snaparg)); ++ return (err); ++} ++ ++static void ++dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) ++{ ++ dnode_t *dn; ++ ++ while ((dn = list_head(list))) { ++ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ++ ASSERT(dn->dn_dbuf->db_data_pending); ++ /* ++ * Initialize dn_zio outside dnode_sync() because the ++ * meta-dnode needs to set it ouside dnode_sync(). ++ */ ++ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ++ ASSERT(dn->dn_zio); ++ ++ ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); ++ list_remove(list, dn); ++ ++ if (newlist) { ++ (void) dnode_add_ref(dn, newlist); ++ list_insert_tail(newlist, dn); ++ } ++ ++ dnode_sync(dn, tx); ++ } ++} ++ ++/* ARGSUSED */ ++static void ++dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) ++{ ++ int i; ++ ++ blkptr_t *bp = zio->io_bp; ++ objset_t *os = arg; ++ dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; ++ ++ ASSERT(bp == os->os_rootbp); ++ ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET); ++ ASSERT(BP_GET_LEVEL(bp) == 0); ++ ++ /* ++ * Update rootbp fill count: it should be the number of objects ++ * allocated in the object set (not counting the "special" ++ * objects that are stored in the objset_phys_t -- the meta ++ * dnode and user/group accounting objects). 
++ */ ++ bp->blk_fill = 0; ++ for (i = 0; i < dnp->dn_nblkptr; i++) ++ bp->blk_fill += dnp->dn_blkptr[i].blk_fill; ++} ++ ++/* ARGSUSED */ ++static void ++dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) ++{ ++ blkptr_t *bp = zio->io_bp; ++ blkptr_t *bp_orig = &zio->io_bp_orig; ++ objset_t *os = arg; ++ ++ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ++ ASSERT(BP_EQUAL(bp, bp_orig)); ++ } else { ++ dsl_dataset_t *ds = os->os_dsl_dataset; ++ dmu_tx_t *tx = os->os_synctx; ++ ++ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); ++ dsl_dataset_block_born(ds, bp, tx); ++ } ++} ++ ++/* called from dsl */ ++void ++dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) ++{ ++ int txgoff; ++ zbookmark_t zb; ++ zio_prop_t zp; ++ zio_t *zio; ++ list_t *list; ++ list_t *newlist = NULL; ++ dbuf_dirty_record_t *dr; ++ ++ dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ /* XXX the write_done callback should really give us the tx... */ ++ os->os_synctx = tx; ++ ++ if (os->os_dsl_dataset == NULL) { ++ /* ++ * This is the MOS. If we have upgraded, ++ * spa_max_replication() could change, so reset ++ * os_copies here. ++ */ ++ os->os_copies = spa_max_replication(os->os_spa); ++ } ++ ++ /* ++ * Create the root block IO ++ */ ++ SET_BOOKMARK(&zb, os->os_dsl_dataset ? ++ os->os_dsl_dataset->ds_object : DMU_META_OBJSET, ++ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); ++ VERIFY3U(0, ==, arc_release_bp(os->os_phys_buf, &os->os_phys_buf, ++ os->os_rootbp, os->os_spa, &zb)); ++ ++ dmu_write_policy(os, NULL, 0, 0, &zp); ++ ++ zio = arc_write(pio, os->os_spa, tx->tx_txg, ++ os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp, ++ dmu_objset_write_ready, dmu_objset_write_done, os, ++ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); ++ ++ /* ++ * Sync special dnodes - the parent IO for the sync is the root block ++ */ ++ DMU_META_DNODE(os)->dn_zio = zio; ++ dnode_sync(DMU_META_DNODE(os), tx); ++ ++ os->os_phys->os_flags = os->os_flags; ++ ++ if (DMU_USERUSED_DNODE(os) && ++ DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { ++ DMU_USERUSED_DNODE(os)->dn_zio = zio; ++ dnode_sync(DMU_USERUSED_DNODE(os), tx); ++ DMU_GROUPUSED_DNODE(os)->dn_zio = zio; ++ dnode_sync(DMU_GROUPUSED_DNODE(os), tx); ++ } ++ ++ txgoff = tx->tx_txg & TXG_MASK; ++ ++ if (dmu_objset_userused_enabled(os)) { ++ newlist = &os->os_synced_dnodes; ++ /* ++ * We must create the list here because it uses the ++ * dn_dirty_link[] of this txg. ++ */ ++ list_create(newlist, sizeof (dnode_t), ++ offsetof(dnode_t, dn_dirty_link[txgoff])); ++ } ++ ++ dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); ++ dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); ++ ++ list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; ++ while ((dr = list_head(list)) != NULL) { ++ ASSERT(dr->dr_dbuf->db_level == 0); ++ list_remove(list, dr); ++ if (dr->dr_zio) ++ zio_nowait(dr->dr_zio); ++ } ++ /* ++ * Free intent log blocks up to this tx. 
++ */ ++ zil_sync(os->os_zil, tx); ++ os->os_phys->os_zil_header = os->os_zil_header; ++ zio_nowait(zio); ++} ++ ++boolean_t ++dmu_objset_is_dirty(objset_t *os, uint64_t txg) ++{ ++ return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || ++ !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); ++} ++ ++static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; ++ ++void ++dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) ++{ ++ used_cbs[ost] = cb; ++} ++ ++boolean_t ++dmu_objset_userused_enabled(objset_t *os) ++{ ++ return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && ++ used_cbs[os->os_phys->os_type] != NULL && ++ DMU_USERUSED_DNODE(os) != NULL); ++} ++ ++static void ++do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, ++ uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) ++{ ++ if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { ++ int64_t delta = DNODE_SIZE + used; ++ if (subtract) ++ delta = -delta; ++ VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, ++ user, delta, tx)); ++ VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, ++ group, delta, tx)); ++ } ++} ++ ++void ++dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) ++{ ++ dnode_t *dn; ++ list_t *list = &os->os_synced_dnodes; ++ ++ ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); ++ ++ while ((dn = list_head(list)) != NULL) { ++ int flags; ++ ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); ++ ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || ++ dn->dn_phys->dn_flags & ++ DNODE_FLAG_USERUSED_ACCOUNTED); ++ ++ /* Allocate the user/groupused objects if necessary. */ ++ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { ++ VERIFY(0 == zap_create_claim(os, ++ DMU_USERUSED_OBJECT, ++ DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); ++ VERIFY(0 == zap_create_claim(os, ++ DMU_GROUPUSED_OBJECT, ++ DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); ++ } ++ ++ /* ++ * We intentionally modify the zap object even if the ++ * net delta is zero. Otherwise ++ * the block of the zap obj could be shared between ++ * datasets but need to be different between them after ++ * a bprewrite. ++ */ ++ ++ flags = dn->dn_id_flags; ++ ASSERT(flags); ++ if (flags & DN_ID_OLD_EXIST) { ++ do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, ++ dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); ++ } ++ if (flags & DN_ID_NEW_EXIST) { ++ do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), ++ dn->dn_phys->dn_flags, dn->dn_newuid, ++ dn->dn_newgid, B_FALSE, tx); ++ } ++ ++ mutex_enter(&dn->dn_mtx); ++ dn->dn_oldused = 0; ++ dn->dn_oldflags = 0; ++ if (dn->dn_id_flags & DN_ID_NEW_EXIST) { ++ dn->dn_olduid = dn->dn_newuid; ++ dn->dn_oldgid = dn->dn_newgid; ++ dn->dn_id_flags |= DN_ID_OLD_EXIST; ++ if (dn->dn_bonuslen == 0) ++ dn->dn_id_flags |= DN_ID_CHKED_SPILL; ++ else ++ dn->dn_id_flags |= DN_ID_CHKED_BONUS; ++ } ++ dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); ++ mutex_exit(&dn->dn_mtx); ++ ++ list_remove(list, dn); ++ dnode_rele(dn, list); ++ } ++} ++ ++/* ++ * Returns a pointer to data to find uid/gid from ++ * ++ * If a dirty record for transaction group that is syncing can't ++ * be found then NULL is returned. In the NULL case it is assumed ++ * the uid/gid aren't changing. 
++ */ ++static void * ++dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) ++{ ++ dbuf_dirty_record_t *dr, **drp; ++ void *data; ++ ++ if (db->db_dirtycnt == 0) ++ return (db->db.db_data); /* Nothing is changing */ ++ ++ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) ++ if (dr->dr_txg == tx->tx_txg) ++ break; ++ ++ if (dr == NULL) { ++ data = NULL; ++ } else { ++ dnode_t *dn; ++ ++ DB_DNODE_ENTER(dr->dr_dbuf); ++ dn = DB_DNODE(dr->dr_dbuf); ++ ++ if (dn->dn_bonuslen == 0 && ++ dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) ++ data = dr->dt.dl.dr_data->b_data; ++ else ++ data = dr->dt.dl.dr_data; ++ ++ DB_DNODE_EXIT(dr->dr_dbuf); ++ } ++ ++ return (data); ++} ++ ++void ++dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) ++{ ++ objset_t *os = dn->dn_objset; ++ void *data = NULL; ++ dmu_buf_impl_t *db = NULL; ++ uint64_t *user = NULL, *group = NULL; ++ int flags = dn->dn_id_flags; ++ int error; ++ boolean_t have_spill = B_FALSE; ++ ++ if (!dmu_objset_userused_enabled(dn->dn_objset)) ++ return; ++ ++ if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| ++ DN_ID_CHKED_SPILL))) ++ return; ++ ++ if (before && dn->dn_bonuslen != 0) ++ data = DN_BONUS(dn->dn_phys); ++ else if (!before && dn->dn_bonuslen != 0) { ++ if (dn->dn_bonus) { ++ db = dn->dn_bonus; ++ mutex_enter(&db->db_mtx); ++ data = dmu_objset_userquota_find_data(db, tx); ++ } else { ++ data = DN_BONUS(dn->dn_phys); ++ } ++ } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { ++ int rf = 0; ++ ++ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) ++ rf |= DB_RF_HAVESTRUCT; ++ error = dmu_spill_hold_by_dnode(dn, ++ rf | DB_RF_MUST_SUCCEED, ++ FTAG, (dmu_buf_t **)&db); ++ ASSERT(error == 0); ++ mutex_enter(&db->db_mtx); ++ data = (before) ? db->db.db_data : ++ dmu_objset_userquota_find_data(db, tx); ++ have_spill = B_TRUE; ++ } else { ++ mutex_enter(&dn->dn_mtx); ++ dn->dn_id_flags |= DN_ID_CHKED_BONUS; ++ mutex_exit(&dn->dn_mtx); ++ return; ++ } ++ ++ if (before) { ++ ASSERT(data); ++ user = &dn->dn_olduid; ++ group = &dn->dn_oldgid; ++ } else if (data) { ++ user = &dn->dn_newuid; ++ group = &dn->dn_newgid; ++ } ++ ++ /* ++ * Must always call the callback in case the object ++ * type has changed and that type isn't an object type to track ++ */ ++ error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, ++ user, group); ++ ++ /* ++ * Preserve existing uid/gid when the callback can't determine ++ * what the new uid/gid are and the callback returned EEXIST. ++ * The EEXIST error tells us to just use the existing uid/gid. ++ * If we don't know what the old values are then just assign ++ * them to 0, since that is a new file being created. 
++ */ ++ if (!before && data == NULL && error == EEXIST) { ++ if (flags & DN_ID_OLD_EXIST) { ++ dn->dn_newuid = dn->dn_olduid; ++ dn->dn_newgid = dn->dn_oldgid; ++ } else { ++ dn->dn_newuid = 0; ++ dn->dn_newgid = 0; ++ } ++ error = 0; ++ } ++ ++ if (db) ++ mutex_exit(&db->db_mtx); ++ ++ mutex_enter(&dn->dn_mtx); ++ if (error == 0 && before) ++ dn->dn_id_flags |= DN_ID_OLD_EXIST; ++ if (error == 0 && !before) ++ dn->dn_id_flags |= DN_ID_NEW_EXIST; ++ ++ if (have_spill) { ++ dn->dn_id_flags |= DN_ID_CHKED_SPILL; ++ } else { ++ dn->dn_id_flags |= DN_ID_CHKED_BONUS; ++ } ++ mutex_exit(&dn->dn_mtx); ++ if (have_spill) ++ dmu_buf_rele((dmu_buf_t *)db, FTAG); ++} ++ ++boolean_t ++dmu_objset_userspace_present(objset_t *os) ++{ ++ return (os->os_phys->os_flags & ++ OBJSET_FLAG_USERACCOUNTING_COMPLETE); ++} ++ ++int ++dmu_objset_userspace_upgrade(objset_t *os) ++{ ++ uint64_t obj; ++ int err = 0; ++ ++ if (dmu_objset_userspace_present(os)) ++ return (0); ++ if (!dmu_objset_userused_enabled(os)) ++ return (ENOTSUP); ++ if (dmu_objset_is_snapshot(os)) ++ return (EINVAL); ++ ++ /* ++ * We simply need to mark every object dirty, so that it will be ++ * synced out and now accounted. If this is called ++ * concurrently, or if we already did some work before crashing, ++ * that's fine, since we track each object's accounted state ++ * independently. ++ */ ++ ++ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { ++ dmu_tx_t *tx; ++ dmu_buf_t *db; ++ int objerr; ++ ++ if (issig(JUSTLOOKING) && issig(FORREAL)) ++ return (EINTR); ++ ++ objerr = dmu_bonus_hold(os, obj, FTAG, &db); ++ if (objerr) ++ continue; ++ tx = dmu_tx_create(os); ++ dmu_tx_hold_bonus(tx, obj); ++ objerr = dmu_tx_assign(tx, TXG_WAIT); ++ if (objerr) { ++ dmu_tx_abort(tx); ++ continue; ++ } ++ dmu_buf_will_dirty(db, tx); ++ dmu_buf_rele(db, FTAG); ++ dmu_tx_commit(tx); ++ } ++ ++ os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; ++ txg_wait_synced(dmu_objset_pool(os), 0); ++ return (0); ++} ++ ++void ++dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, ++ uint64_t *usedobjsp, uint64_t *availobjsp) ++{ ++ dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, ++ usedobjsp, availobjsp); ++} ++ ++uint64_t ++dmu_objset_fsid_guid(objset_t *os) ++{ ++ return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); ++} ++ ++void ++dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) ++{ ++ stat->dds_type = os->os_phys->os_type; ++ if (os->os_dsl_dataset) ++ dsl_dataset_fast_stat(os->os_dsl_dataset, stat); ++} ++ ++void ++dmu_objset_stats(objset_t *os, nvlist_t *nv) ++{ ++ ASSERT(os->os_dsl_dataset || ++ os->os_phys->os_type == DMU_OST_META); ++ ++ if (os->os_dsl_dataset != NULL) ++ dsl_dataset_stats(os->os_dsl_dataset, nv); ++ ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, ++ os->os_phys->os_type); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, ++ dmu_objset_userspace_present(os)); ++} ++ ++int ++dmu_objset_is_snapshot(objset_t *os) ++{ ++ if (os->os_dsl_dataset != NULL) ++ return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); ++ else ++ return (B_FALSE); ++} ++ ++int ++dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, ++ boolean_t *conflict) ++{ ++ dsl_dataset_t *ds = os->os_dsl_dataset; ++ uint64_t ignored; ++ ++ if (ds->ds_phys->ds_snapnames_zapobj == 0) ++ return (ENOENT); ++ ++ return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, ++ ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, ++ real, maxlen, conflict)); ++} ++ ++int 
++dmu_snapshot_list_next(objset_t *os, int namelen, char *name, ++ uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) ++{ ++ dsl_dataset_t *ds = os->os_dsl_dataset; ++ zap_cursor_t cursor; ++ zap_attribute_t attr; ++ ++ if (ds->ds_phys->ds_snapnames_zapobj == 0) ++ return (ENOENT); ++ ++ zap_cursor_init_serialized(&cursor, ++ ds->ds_dir->dd_pool->dp_meta_objset, ++ ds->ds_phys->ds_snapnames_zapobj, *offp); ++ ++ if (zap_cursor_retrieve(&cursor, &attr) != 0) { ++ zap_cursor_fini(&cursor); ++ return (ENOENT); ++ } ++ ++ if (strlen(attr.za_name) + 1 > namelen) { ++ zap_cursor_fini(&cursor); ++ return (ENAMETOOLONG); ++ } ++ ++ (void) strcpy(name, attr.za_name); ++ if (idp) ++ *idp = attr.za_first_integer; ++ if (case_conflict) ++ *case_conflict = attr.za_normalization_conflict; ++ zap_cursor_advance(&cursor); ++ *offp = zap_cursor_serialize(&cursor); ++ zap_cursor_fini(&cursor); ++ ++ return (0); ++} ++ ++/* ++ * Determine the objset id for a given snapshot name. ++ */ ++int ++dmu_snapshot_id(objset_t *os, const char *snapname, uint64_t *idp) ++{ ++ dsl_dataset_t *ds = os->os_dsl_dataset; ++ zap_cursor_t cursor; ++ zap_attribute_t attr; ++ int error; ++ ++ if (ds->ds_phys->ds_snapnames_zapobj == 0) ++ return (ENOENT); ++ ++ zap_cursor_init(&cursor, ds->ds_dir->dd_pool->dp_meta_objset, ++ ds->ds_phys->ds_snapnames_zapobj); ++ ++ error = zap_cursor_move_to_key(&cursor, snapname, MT_EXACT); ++ if (error) { ++ zap_cursor_fini(&cursor); ++ return (error); ++ } ++ ++ error = zap_cursor_retrieve(&cursor, &attr); ++ if (error) { ++ zap_cursor_fini(&cursor); ++ return (error); ++ } ++ ++ *idp = attr.za_first_integer; ++ zap_cursor_fini(&cursor); ++ ++ return (0); ++} ++ ++int ++dmu_dir_list_next(objset_t *os, int namelen, char *name, ++ uint64_t *idp, uint64_t *offp) ++{ ++ dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; ++ zap_cursor_t cursor; ++ zap_attribute_t attr; ++ ++ /* there is no next dir on a snapshot! */ ++ if (os->os_dsl_dataset->ds_object != ++ dd->dd_phys->dd_head_dataset_obj) ++ return (ENOENT); ++ ++ zap_cursor_init_serialized(&cursor, ++ dd->dd_pool->dp_meta_objset, ++ dd->dd_phys->dd_child_dir_zapobj, *offp); ++ ++ if (zap_cursor_retrieve(&cursor, &attr) != 0) { ++ zap_cursor_fini(&cursor); ++ return (ENOENT); ++ } ++ ++ if (strlen(attr.za_name) + 1 > namelen) { ++ zap_cursor_fini(&cursor); ++ return (ENAMETOOLONG); ++ } ++ ++ (void) strcpy(name, attr.za_name); ++ if (idp) ++ *idp = attr.za_first_integer; ++ zap_cursor_advance(&cursor); ++ *offp = zap_cursor_serialize(&cursor); ++ zap_cursor_fini(&cursor); ++ ++ return (0); ++} ++ ++struct findarg { ++ int (*func)(const char *, void *); ++ void *arg; ++}; ++ ++/* ARGSUSED */ ++static int ++findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) ++{ ++ struct findarg *fa = arg; ++ return (fa->func(dsname, fa->arg)); ++} ++ ++/* ++ * Find all objsets under name, and for each, call 'func(child_name, arg)'. ++ * Perhaps change all callers to use dmu_objset_find_spa()? 
++ */ ++int ++dmu_objset_find(char *name, int func(const char *, void *), void *arg, ++ int flags) ++{ ++ struct findarg fa; ++ fa.func = func; ++ fa.arg = arg; ++ return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags)); ++} ++ ++/* ++ * Find all objsets under name, call func on each ++ */ ++int ++dmu_objset_find_spa(spa_t *spa, const char *name, ++ int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags) ++{ ++ dsl_dir_t *dd; ++ dsl_pool_t *dp; ++ dsl_dataset_t *ds; ++ zap_cursor_t zc; ++ zap_attribute_t *attr; ++ char *child; ++ uint64_t thisobj; ++ int err; ++ ++ if (name == NULL) ++ name = spa_name(spa); ++ err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL); ++ if (err) ++ return (err); ++ ++ /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ ++ if (dd->dd_myname[0] == '$') { ++ dsl_dir_close(dd, FTAG); ++ return (0); ++ } ++ ++ thisobj = dd->dd_phys->dd_head_dataset_obj; ++ attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); ++ dp = dd->dd_pool; ++ ++ /* ++ * Iterate over all children. ++ */ ++ if (flags & DS_FIND_CHILDREN) { ++ for (zap_cursor_init(&zc, dp->dp_meta_objset, ++ dd->dd_phys->dd_child_dir_zapobj); ++ zap_cursor_retrieve(&zc, attr) == 0; ++ (void) zap_cursor_advance(&zc)) { ++ ASSERT(attr->za_integer_length == sizeof (uint64_t)); ++ ASSERT(attr->za_num_integers == 1); ++ ++ child = kmem_asprintf("%s/%s", name, attr->za_name); ++ err = dmu_objset_find_spa(spa, child, func, arg, flags); ++ strfree(child); ++ if (err) ++ break; ++ } ++ zap_cursor_fini(&zc); ++ ++ if (err) { ++ dsl_dir_close(dd, FTAG); ++ kmem_free(attr, sizeof (zap_attribute_t)); ++ return (err); ++ } ++ } ++ ++ /* ++ * Iterate over all snapshots. ++ */ ++ if (flags & DS_FIND_SNAPSHOTS) { ++ if (!dsl_pool_sync_context(dp)) ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); ++ if (!dsl_pool_sync_context(dp)) ++ rw_exit(&dp->dp_config_rwlock); ++ ++ if (err == 0) { ++ uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; ++ dsl_dataset_rele(ds, FTAG); ++ ++ for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); ++ zap_cursor_retrieve(&zc, attr) == 0; ++ (void) zap_cursor_advance(&zc)) { ++ ASSERT(attr->za_integer_length == ++ sizeof (uint64_t)); ++ ASSERT(attr->za_num_integers == 1); ++ ++ child = kmem_asprintf("%s@%s", ++ name, attr->za_name); ++ err = func(spa, attr->za_first_integer, ++ child, arg); ++ strfree(child); ++ if (err) ++ break; ++ } ++ zap_cursor_fini(&zc); ++ } ++ } ++ ++ dsl_dir_close(dd, FTAG); ++ kmem_free(attr, sizeof (zap_attribute_t)); ++ ++ if (err) ++ return (err); ++ ++ /* ++ * Apply to self if appropriate. 
++ */ ++ err = func(spa, thisobj, name, arg); ++ return (err); ++} ++ ++/* ARGSUSED */ ++int ++dmu_objset_prefetch(const char *name, void *arg) ++{ ++ dsl_dataset_t *ds; ++ ++ if (dsl_dataset_hold(name, FTAG, &ds)) ++ return (0); ++ ++ if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { ++ mutex_enter(&ds->ds_opening_lock); ++ if (ds->ds_objset == NULL) { ++ uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; ++ zbookmark_t zb; ++ ++ SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, ++ ZB_ROOT_LEVEL, ZB_ROOT_BLKID); ++ ++ (void) dsl_read_nolock(NULL, dsl_dataset_get_spa(ds), ++ &ds->ds_phys->ds_bp, NULL, NULL, ++ ZIO_PRIORITY_ASYNC_READ, ++ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, ++ &aflags, &zb); ++ } ++ mutex_exit(&ds->ds_opening_lock); ++ } ++ ++ dsl_dataset_rele(ds, FTAG); ++ return (0); ++} ++ ++void ++dmu_objset_set_user(objset_t *os, void *user_ptr) ++{ ++ ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); ++ os->os_user_ptr = user_ptr; ++} ++ ++void * ++dmu_objset_get_user(objset_t *os) ++{ ++ ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); ++ return (os->os_user_ptr); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(dmu_objset_zil); ++EXPORT_SYMBOL(dmu_objset_pool); ++EXPORT_SYMBOL(dmu_objset_ds); ++EXPORT_SYMBOL(dmu_objset_type); ++EXPORT_SYMBOL(dmu_objset_name); ++EXPORT_SYMBOL(dmu_objset_hold); ++EXPORT_SYMBOL(dmu_objset_own); ++EXPORT_SYMBOL(dmu_objset_rele); ++EXPORT_SYMBOL(dmu_objset_disown); ++EXPORT_SYMBOL(dmu_objset_from_ds); ++EXPORT_SYMBOL(dmu_objset_create); ++EXPORT_SYMBOL(dmu_objset_clone); ++EXPORT_SYMBOL(dmu_objset_destroy); ++EXPORT_SYMBOL(dmu_objset_snapshot); ++EXPORT_SYMBOL(dmu_objset_stats); ++EXPORT_SYMBOL(dmu_objset_fast_stat); ++EXPORT_SYMBOL(dmu_objset_spa); ++EXPORT_SYMBOL(dmu_objset_space); ++EXPORT_SYMBOL(dmu_objset_fsid_guid); ++EXPORT_SYMBOL(dmu_objset_find); ++EXPORT_SYMBOL(dmu_objset_find_spa); ++EXPORT_SYMBOL(dmu_objset_prefetch); ++EXPORT_SYMBOL(dmu_objset_byteswap); ++EXPORT_SYMBOL(dmu_objset_evict_dbufs); ++EXPORT_SYMBOL(dmu_objset_snap_cmtime); ++ ++EXPORT_SYMBOL(dmu_objset_sync); ++EXPORT_SYMBOL(dmu_objset_is_dirty); ++EXPORT_SYMBOL(dmu_objset_create_impl); ++EXPORT_SYMBOL(dmu_objset_open_impl); ++EXPORT_SYMBOL(dmu_objset_evict); ++EXPORT_SYMBOL(dmu_objset_register_type); ++EXPORT_SYMBOL(dmu_objset_do_userquota_updates); ++EXPORT_SYMBOL(dmu_objset_userquota_get_ids); ++EXPORT_SYMBOL(dmu_objset_userused_enabled); ++EXPORT_SYMBOL(dmu_objset_userspace_upgrade); ++EXPORT_SYMBOL(dmu_objset_userspace_present); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dmu_send.c linux-3.2.33-go/fs/zfs/zfs/dmu_send.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dmu_send.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dmu_send.c 2012-11-16 23:25:34.350039322 +0100 +@@ -0,0 +1,1687 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ * Copyright (c) 2012, Joyent, Inc. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ ++int zfs_send_corrupt_data = B_FALSE; ++ ++static char *dmu_recv_tag = "dmu_recv_tag"; ++ ++static int ++dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) ++{ ++ dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; ++ ssize_t resid; /* have to get resid to get detailed errno */ ++ ASSERT3U(len % 8, ==, 0); ++ ++ fletcher_4_incremental_native(buf, len, &dsp->dsa_zc); ++ dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, ++ (caddr_t)buf, len, ++ 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); ++ ++ mutex_enter(&ds->ds_sendstream_lock); ++ *dsp->dsa_off += len; ++ mutex_exit(&ds->ds_sendstream_lock); ++ ++ return (dsp->dsa_err); ++} ++ ++static int ++dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, ++ uint64_t length) ++{ ++ struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); ++ ++ if (length != -1ULL && offset + length < offset) ++ length = -1ULL; ++ ++ /* ++ * If there is a pending op, but it's not PENDING_FREE, push it out, ++ * since free block aggregation can only be done for blocks of the ++ * same type (i.e., DRR_FREE records can only be aggregated with ++ * other DRR_FREE records. DRR_FREEOBJECTS records can only be ++ * aggregated with other DRR_FREEOBJECTS records. ++ */ ++ if (dsp->dsa_pending_op != PENDING_NONE && ++ dsp->dsa_pending_op != PENDING_FREE) { ++ if (dump_bytes(dsp, dsp->dsa_drr, ++ sizeof (dmu_replay_record_t)) != 0) ++ return (EINTR); ++ dsp->dsa_pending_op = PENDING_NONE; ++ } ++ ++ if (dsp->dsa_pending_op == PENDING_FREE) { ++ /* ++ * There should never be a PENDING_FREE if length is -1 ++ * (because dump_dnode is the only place where this ++ * function is called with a -1, and only after flushing ++ * any pending record). ++ */ ++ ASSERT(length != -1ULL); ++ /* ++ * Check to see whether this free block can be aggregated ++ * with pending one. ++ */ ++ if (drrf->drr_object == object && drrf->drr_offset + ++ drrf->drr_length == offset) { ++ drrf->drr_length += length; ++ return (0); ++ } else { ++ /* not a continuation. 
Push out pending record */ ++ if (dump_bytes(dsp, dsp->dsa_drr, ++ sizeof (dmu_replay_record_t)) != 0) ++ return (EINTR); ++ dsp->dsa_pending_op = PENDING_NONE; ++ } ++ } ++ /* create a FREE record and make it pending */ ++ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); ++ dsp->dsa_drr->drr_type = DRR_FREE; ++ drrf->drr_object = object; ++ drrf->drr_offset = offset; ++ drrf->drr_length = length; ++ drrf->drr_toguid = dsp->dsa_toguid; ++ if (length == -1ULL) { ++ if (dump_bytes(dsp, dsp->dsa_drr, ++ sizeof (dmu_replay_record_t)) != 0) ++ return (EINTR); ++ } else { ++ dsp->dsa_pending_op = PENDING_FREE; ++ } ++ ++ return (0); ++} ++ ++static int ++dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, ++ uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) ++{ ++ struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); ++ ++ ++ /* ++ * If there is any kind of pending aggregation (currently either ++ * a grouping of free objects or free blocks), push it out to ++ * the stream, since aggregation can't be done across operations ++ * of different types. ++ */ ++ if (dsp->dsa_pending_op != PENDING_NONE) { ++ if (dump_bytes(dsp, dsp->dsa_drr, ++ sizeof (dmu_replay_record_t)) != 0) ++ return (EINTR); ++ dsp->dsa_pending_op = PENDING_NONE; ++ } ++ /* write a DATA record */ ++ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); ++ dsp->dsa_drr->drr_type = DRR_WRITE; ++ drrw->drr_object = object; ++ drrw->drr_type = type; ++ drrw->drr_offset = offset; ++ drrw->drr_length = blksz; ++ drrw->drr_toguid = dsp->dsa_toguid; ++ drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); ++ if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) ++ drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; ++ DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); ++ DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); ++ DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); ++ drrw->drr_key.ddk_cksum = bp->blk_cksum; ++ ++ if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) ++ return (EINTR); ++ if (dump_bytes(dsp, data, blksz) != 0) ++ return (EINTR); ++ return (0); ++} ++ ++static int ++dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) ++{ ++ struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); ++ ++ if (dsp->dsa_pending_op != PENDING_NONE) { ++ if (dump_bytes(dsp, dsp->dsa_drr, ++ sizeof (dmu_replay_record_t)) != 0) ++ return (EINTR); ++ dsp->dsa_pending_op = PENDING_NONE; ++ } ++ ++ /* write a SPILL record */ ++ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); ++ dsp->dsa_drr->drr_type = DRR_SPILL; ++ drrs->drr_object = object; ++ drrs->drr_length = blksz; ++ drrs->drr_toguid = dsp->dsa_toguid; ++ ++ if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t))) ++ return (EINTR); ++ if (dump_bytes(dsp, data, blksz)) ++ return (EINTR); ++ return (0); ++} ++ ++static int ++dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) ++{ ++ struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); ++ ++ /* ++ * If there is a pending op, but it's not PENDING_FREEOBJECTS, ++ * push it out, since free block aggregation can only be done for ++ * blocks of the same type (i.e., DRR_FREE records can only be ++ * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records ++ * can only be aggregated with other DRR_FREEOBJECTS records. 
++ */ ++ if (dsp->dsa_pending_op != PENDING_NONE && ++ dsp->dsa_pending_op != PENDING_FREEOBJECTS) { ++ if (dump_bytes(dsp, dsp->dsa_drr, ++ sizeof (dmu_replay_record_t)) != 0) ++ return (EINTR); ++ dsp->dsa_pending_op = PENDING_NONE; ++ } ++ if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { ++ /* ++ * See whether this free object array can be aggregated ++ * with pending one ++ */ ++ if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { ++ drrfo->drr_numobjs += numobjs; ++ return (0); ++ } else { ++ /* can't be aggregated. Push out pending record */ ++ if (dump_bytes(dsp, dsp->dsa_drr, ++ sizeof (dmu_replay_record_t)) != 0) ++ return (EINTR); ++ dsp->dsa_pending_op = PENDING_NONE; ++ } ++ } ++ ++ /* write a FREEOBJECTS record */ ++ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); ++ dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; ++ drrfo->drr_firstobj = firstobj; ++ drrfo->drr_numobjs = numobjs; ++ drrfo->drr_toguid = dsp->dsa_toguid; ++ ++ dsp->dsa_pending_op = PENDING_FREEOBJECTS; ++ ++ return (0); ++} ++ ++static int ++dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) ++{ ++ struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); ++ ++ if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) ++ return (dump_freeobjects(dsp, object, 1)); ++ ++ if (dsp->dsa_pending_op != PENDING_NONE) { ++ if (dump_bytes(dsp, dsp->dsa_drr, ++ sizeof (dmu_replay_record_t)) != 0) ++ return (EINTR); ++ dsp->dsa_pending_op = PENDING_NONE; ++ } ++ ++ /* write an OBJECT record */ ++ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); ++ dsp->dsa_drr->drr_type = DRR_OBJECT; ++ drro->drr_object = object; ++ drro->drr_type = dnp->dn_type; ++ drro->drr_bonustype = dnp->dn_bonustype; ++ drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; ++ drro->drr_bonuslen = dnp->dn_bonuslen; ++ drro->drr_checksumtype = dnp->dn_checksum; ++ drro->drr_compress = dnp->dn_compress; ++ drro->drr_toguid = dsp->dsa_toguid; ++ ++ if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) ++ return (EINTR); ++ ++ if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) ++ return (EINTR); ++ ++ /* free anything past the end of the file */ ++ if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * ++ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) ++ return (EINTR); ++ if (dsp->dsa_err) ++ return (EINTR); ++ return (0); ++} ++ ++#define BP_SPAN(dnp, level) \ ++ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ ++ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) ++ ++/* ARGSUSED */ ++static int ++backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, ++ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) ++{ ++ dmu_sendarg_t *dsp = arg; ++ dmu_object_type_t type = bp ? 
BP_GET_TYPE(bp) : DMU_OT_NONE; ++ int err = 0; ++ ++ if (issig(JUSTLOOKING) && issig(FORREAL)) ++ return (EINTR); ++ ++ if (zb->zb_object != DMU_META_DNODE_OBJECT && ++ DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { ++ return (0); ++ } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { ++ uint64_t span = BP_SPAN(dnp, zb->zb_level); ++ uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; ++ err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); ++ } else if (bp == NULL) { ++ uint64_t span = BP_SPAN(dnp, zb->zb_level); ++ err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); ++ } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { ++ return (0); ++ } else if (type == DMU_OT_DNODE) { ++ dnode_phys_t *blk; ++ int i; ++ int blksz = BP_GET_LSIZE(bp); ++ uint32_t aflags = ARC_WAIT; ++ arc_buf_t *abuf; ++ ++ if (dsl_read(NULL, spa, bp, pbuf, ++ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ++ ZIO_FLAG_CANFAIL, &aflags, zb) != 0) ++ return (EIO); ++ ++ blk = abuf->b_data; ++ for (i = 0; i < blksz >> DNODE_SHIFT; i++) { ++ uint64_t dnobj = (zb->zb_blkid << ++ (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; ++ err = dump_dnode(dsp, dnobj, blk+i); ++ if (err) ++ break; ++ } ++ (void) arc_buf_remove_ref(abuf, &abuf); ++ } else if (type == DMU_OT_SA) { ++ uint32_t aflags = ARC_WAIT; ++ arc_buf_t *abuf; ++ int blksz = BP_GET_LSIZE(bp); ++ ++ if (arc_read_nolock(NULL, spa, bp, ++ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ++ ZIO_FLAG_CANFAIL, &aflags, zb) != 0) ++ return (EIO); ++ ++ err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); ++ (void) arc_buf_remove_ref(abuf, &abuf); ++ } else { /* it's a level-0 block of a regular object */ ++ uint32_t aflags = ARC_WAIT; ++ arc_buf_t *abuf; ++ int blksz = BP_GET_LSIZE(bp); ++ ++ if (dsl_read(NULL, spa, bp, pbuf, ++ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ++ ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { ++ if (zfs_send_corrupt_data) { ++ uint64_t *ptr; ++ /* Send a block filled with 0x"zfs badd bloc" */ ++ abuf = arc_buf_alloc(spa, blksz, &abuf, ++ ARC_BUFC_DATA); ++ for (ptr = abuf->b_data; ++ (char *)ptr < (char *)abuf->b_data + blksz; ++ ptr++) ++ *ptr = 0x2f5baddb10c; ++ } else { ++ return (EIO); ++ } ++ } ++ ++ err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz, ++ blksz, bp, abuf->b_data); ++ (void) arc_buf_remove_ref(abuf, &abuf); ++ } ++ ++ ASSERT(err == 0 || err == EINTR); ++ return (err); ++} ++ ++int ++dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, ++ int outfd, vnode_t *vp, offset_t *off) ++{ ++ dsl_dataset_t *ds = tosnap->os_dsl_dataset; ++ dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os_dsl_dataset : NULL; ++ dmu_replay_record_t *drr; ++ dmu_sendarg_t *dsp; ++ int err; ++ uint64_t fromtxg = 0; ++ ++ /* tosnap must be a snapshot */ ++ if (ds->ds_phys->ds_next_snap_obj == 0) ++ return (EINVAL); ++ ++ /* fromsnap must be an earlier snapshot from the same fs as tosnap */ ++ if (fromds && (ds->ds_dir != fromds->ds_dir || ++ fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) ++ return (EXDEV); ++ ++ if (fromorigin) { ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ ++ if (fromsnap) ++ return (EINVAL); ++ ++ if (dsl_dir_is_clone(ds->ds_dir)) { ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ err = dsl_dataset_hold_obj(dp, ++ ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); ++ rw_exit(&dp->dp_config_rwlock); ++ if (err) ++ return (err); ++ } else { ++ fromorigin = B_FALSE; ++ } ++ } ++ ++ ++ drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); ++ drr->drr_type = DRR_BEGIN; ++ drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; ++ DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, ++ DMU_SUBSTREAM); ++ ++#ifdef _KERNEL ++ if (dmu_objset_type(tosnap) == DMU_OST_ZFS) { ++ uint64_t version; ++ if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) { ++ kmem_free(drr, sizeof (dmu_replay_record_t)); ++ return (EINVAL); ++ } ++ if (version == ZPL_VERSION_SA) { ++ DMU_SET_FEATUREFLAGS( ++ drr->drr_u.drr_begin.drr_versioninfo, ++ DMU_BACKUP_FEATURE_SA_SPILL); ++ } ++ } ++#endif ++ ++ drr->drr_u.drr_begin.drr_creation_time = ++ ds->ds_phys->ds_creation_time; ++ drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; ++ if (fromorigin) ++ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; ++ drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; ++ if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) ++ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; ++ ++ if (fromds) ++ drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; ++ dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); ++ ++ if (fromds) ++ fromtxg = fromds->ds_phys->ds_creation_txg; ++ if (fromorigin) ++ dsl_dataset_rele(fromds, FTAG); ++ ++ dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); ++ ++ dsp->dsa_drr = drr; ++ dsp->dsa_vp = vp; ++ dsp->dsa_outfd = outfd; ++ dsp->dsa_proc = curproc; ++ dsp->dsa_os = tosnap; ++ dsp->dsa_off = off; ++ dsp->dsa_toguid = ds->ds_phys->ds_guid; ++ ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); ++ dsp->dsa_pending_op = PENDING_NONE; ++ ++ mutex_enter(&ds->ds_sendstream_lock); ++ list_insert_head(&ds->ds_sendstreams, dsp); ++ mutex_exit(&ds->ds_sendstream_lock); ++ ++ if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { ++ err = dsp->dsa_err; ++ goto out; ++ } ++ ++ err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, ++ backup_cb, dsp); ++ ++ if (dsp->dsa_pending_op != PENDING_NONE) ++ if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) ++ err = EINTR; ++ ++ if (err) { ++ if (err == EINTR && dsp->dsa_err) ++ err = dsp->dsa_err; ++ goto out; ++ } ++ ++ bzero(drr, sizeof (dmu_replay_record_t)); ++ drr->drr_type = DRR_END; ++ drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; ++ drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; ++ ++ if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { ++ err = dsp->dsa_err; ++ goto out; ++ } ++ ++out: ++ mutex_enter(&ds->ds_sendstream_lock); ++ list_remove(&ds->ds_sendstreams, dsp); ++ mutex_exit(&ds->ds_sendstream_lock); ++ ++ kmem_free(drr, sizeof (dmu_replay_record_t)); ++ kmem_free(dsp, sizeof (dmu_sendarg_t)); ++ ++ return (err); ++} ++ ++int 
++dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, ++ uint64_t *sizep) ++{ ++ dsl_dataset_t *ds = tosnap->os_dsl_dataset; ++ dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ int err; ++ uint64_t size, recordsize; ++ ++ /* tosnap must be a snapshot */ ++ if (ds->ds_phys->ds_next_snap_obj == 0) ++ return (EINVAL); ++ ++ /* fromsnap must be an earlier snapshot from the same fs as tosnap */ ++ if (fromds && (ds->ds_dir != fromds->ds_dir || ++ fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) ++ return (EXDEV); ++ ++ if (fromorigin) { ++ if (fromsnap) ++ return (EINVAL); ++ ++ if (dsl_dir_is_clone(ds->ds_dir)) { ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ err = dsl_dataset_hold_obj(dp, ++ ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); ++ rw_exit(&dp->dp_config_rwlock); ++ if (err) ++ return (err); ++ } else { ++ fromorigin = B_FALSE; ++ } ++ } ++ ++ /* Get uncompressed size estimate of changed data. */ ++ if (fromds == NULL) { ++ size = ds->ds_phys->ds_uncompressed_bytes; ++ } else { ++ uint64_t used, comp; ++ err = dsl_dataset_space_written(fromds, ds, ++ &used, &comp, &size); ++ if (fromorigin) ++ dsl_dataset_rele(fromds, FTAG); ++ if (err) ++ return (err); ++ } ++ ++ /* ++ * Assume that space (both on-disk and in-stream) is dominated by ++ * data. We will adjust for indirect blocks and the copies property, ++ * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). ++ */ ++ ++ /* ++ * Subtract out approximate space used by indirect blocks. ++ * Assume most space is used by data blocks (non-indirect, non-dnode). ++ * Assume all blocks are recordsize. Assume ditto blocks and ++ * internal fragmentation counter out compression. ++ * ++ * Therefore, space used by indirect blocks is sizeof(blkptr_t) per ++ * block, which we observe in practice. ++ */ ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ err = dsl_prop_get_ds(ds, "recordsize", ++ sizeof (recordsize), 1, &recordsize, NULL); ++ rw_exit(&dp->dp_config_rwlock); ++ if (err) ++ return (err); ++ size -= size / recordsize * sizeof (blkptr_t); ++ ++ /* Add in the space for the record associated with each block. */ ++ size += size / recordsize * sizeof (dmu_replay_record_t); ++ ++ *sizep = size; ++ ++ return (0); ++} ++ ++struct recvbeginsyncarg { ++ const char *tofs; ++ const char *tosnap; ++ dsl_dataset_t *origin; ++ uint64_t fromguid; ++ dmu_objset_type_t type; ++ void *tag; ++ boolean_t force; ++ uint64_t dsflags; ++ char clonelastname[MAXNAMELEN]; ++ dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ ++ cred_t *cr; ++}; ++ ++/* ARGSUSED */ ++static int ++recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dir_t *dd = arg1; ++ struct recvbeginsyncarg *rbsa = arg2; ++ objset_t *mos = dd->dd_pool->dp_meta_objset; ++ uint64_t val; ++ int err; ++ ++ err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, ++ strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); ++ ++ if (err != ENOENT) ++ return (err ? 
err : EEXIST); ++ ++ if (rbsa->origin) { ++ /* make sure it's a snap in the same pool */ ++ if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) ++ return (EXDEV); ++ if (!dsl_dataset_is_snapshot(rbsa->origin)) ++ return (EINVAL); ++ if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) ++ return (ENODEV); ++ } ++ ++ return (0); ++} ++ ++static void ++recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dir_t *dd = arg1; ++ struct recvbeginsyncarg *rbsa = arg2; ++ uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; ++ uint64_t dsobj; ++ ++ /* Create and open new dataset. */ ++ dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, ++ rbsa->origin, flags, rbsa->cr, tx); ++ VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, ++ B_TRUE, dmu_recv_tag, &rbsa->ds)); ++ ++ if (rbsa->origin == NULL) { ++ (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, ++ rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); ++ } ++ ++ spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC, ++ dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj); ++} ++ ++/* ARGSUSED */ ++static int ++recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ struct recvbeginsyncarg *rbsa = arg2; ++ int err; ++ uint64_t val; ++ ++ /* must not have any changes since most recent snapshot */ ++ if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) ++ return (ETXTBSY); ++ ++ /* new snapshot name must not exist */ ++ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ++ ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); ++ if (err == 0) ++ return (EEXIST); ++ if (err != ENOENT) ++ return (err); ++ ++ if (rbsa->fromguid) { ++ /* if incremental, most recent snapshot must match fromguid */ ++ if (ds->ds_prev == NULL) ++ return (ENODEV); ++ ++ /* ++ * most recent snapshot must match fromguid, or there are no ++ * changes since the fromguid one ++ */ ++ if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { ++ uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; ++ uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; ++ while (obj != 0) { ++ dsl_dataset_t *snap; ++ err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, ++ obj, FTAG, &snap); ++ if (err) ++ return (ENODEV); ++ if (snap->ds_phys->ds_creation_txg < birth) { ++ dsl_dataset_rele(snap, FTAG); ++ return (ENODEV); ++ } ++ if (snap->ds_phys->ds_guid == rbsa->fromguid) { ++ dsl_dataset_rele(snap, FTAG); ++ break; /* it's ok */ ++ } ++ obj = snap->ds_phys->ds_prev_snap_obj; ++ dsl_dataset_rele(snap, FTAG); ++ } ++ if (obj == 0) ++ return (ENODEV); ++ } ++ } else { ++ /* if full, most recent snapshot must be $ORIGIN */ ++ if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) ++ return (ENODEV); ++ } ++ ++ /* temporary clone name must not exist */ ++ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ++ ds->ds_dir->dd_phys->dd_child_dir_zapobj, ++ rbsa->clonelastname, 8, 1, &val); ++ if (err == 0) ++ return (EEXIST); ++ if (err != ENOENT) ++ return (err); ++ ++ return (0); ++} ++ ++/* ARGSUSED */ ++static void ++recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ohds = arg1; ++ struct recvbeginsyncarg *rbsa = arg2; ++ dsl_pool_t *dp = ohds->ds_dir->dd_pool; ++ dsl_dataset_t *cds; ++ uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; ++ uint64_t dsobj; ++ ++ /* create and open the temporary clone */ ++ dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, ++ ohds->ds_prev, flags, rbsa->cr, tx); ++ VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); ++ ++ /* 
++ * If we actually created a non-clone, we need to create the ++ * objset in our new dataset. ++ */ ++ if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { ++ (void) dmu_objset_create_impl(dp->dp_spa, ++ cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); ++ } ++ ++ rbsa->ds = cds; ++ ++ spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC, ++ dp->dp_spa, tx, "dataset = %lld", dsobj); ++} ++ ++static boolean_t ++dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) ++{ ++ int featureflags; ++ ++ featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); ++ ++ /* Verify pool version supports SA if SA_SPILL feature set */ ++ return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && ++ (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA)); ++} ++ ++/* ++ * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() ++ * succeeds; otherwise we will leak the holds on the datasets. ++ */ ++int ++dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, ++ boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) ++{ ++ int err = 0; ++ boolean_t byteswap; ++ struct recvbeginsyncarg rbsa = { 0 }; ++ uint64_t versioninfo; ++ int flags; ++ dsl_dataset_t *ds; ++ ++ if (drrb->drr_magic == DMU_BACKUP_MAGIC) ++ byteswap = FALSE; ++ else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) ++ byteswap = TRUE; ++ else ++ return (EINVAL); ++ ++ rbsa.tofs = tofs; ++ rbsa.tosnap = tosnap; ++ rbsa.origin = origin ? origin->os_dsl_dataset : NULL; ++ rbsa.fromguid = drrb->drr_fromguid; ++ rbsa.type = drrb->drr_type; ++ rbsa.tag = FTAG; ++ rbsa.dsflags = 0; ++ rbsa.cr = CRED(); ++ versioninfo = drrb->drr_versioninfo; ++ flags = drrb->drr_flags; ++ ++ if (byteswap) { ++ rbsa.type = BSWAP_32(rbsa.type); ++ rbsa.fromguid = BSWAP_64(rbsa.fromguid); ++ versioninfo = BSWAP_64(versioninfo); ++ flags = BSWAP_32(flags); ++ } ++ ++ if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || ++ rbsa.type >= DMU_OST_NUMTYPES || ++ ((flags & DRR_FLAG_CLONE) && origin == NULL)) ++ return (EINVAL); ++ ++ if (flags & DRR_FLAG_CI_DATA) ++ rbsa.dsflags = DS_FLAG_CI_DATASET; ++ ++ bzero(drc, sizeof (dmu_recv_cookie_t)); ++ drc->drc_drrb = drrb; ++ drc->drc_tosnap = tosnap; ++ drc->drc_top_ds = top_ds; ++ drc->drc_force = force; ++ ++ /* ++ * Process the begin in syncing context. 
++ */ ++ ++ /* open the dataset we are logically receiving into */ ++ err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); ++ if (err == 0) { ++ if (dmu_recv_verify_features(ds, drrb)) { ++ dsl_dataset_rele(ds, dmu_recv_tag); ++ return (ENOTSUP); ++ } ++ /* target fs already exists; recv into temp clone */ ++ ++ /* Can't recv a clone into an existing fs */ ++ if (flags & DRR_FLAG_CLONE) { ++ dsl_dataset_rele(ds, dmu_recv_tag); ++ return (EINVAL); ++ } ++ ++ /* must not have an incremental recv already in progress */ ++ if (!mutex_tryenter(&ds->ds_recvlock)) { ++ dsl_dataset_rele(ds, dmu_recv_tag); ++ return (EBUSY); ++ } ++ ++ /* tmp clone name is: tofs/%tosnap" */ ++ (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), ++ "%%%s", tosnap); ++ rbsa.force = force; ++ err = dsl_sync_task_do(ds->ds_dir->dd_pool, ++ recv_existing_check, recv_existing_sync, ds, &rbsa, 5); ++ if (err) { ++ mutex_exit(&ds->ds_recvlock); ++ dsl_dataset_rele(ds, dmu_recv_tag); ++ return (err); ++ } ++ drc->drc_logical_ds = ds; ++ drc->drc_real_ds = rbsa.ds; ++ } else if (err == ENOENT) { ++ /* target fs does not exist; must be a full backup or clone */ ++ char *cp; ++ ++ /* ++ * If it's a non-clone incremental, we are missing the ++ * target fs, so fail the recv. ++ */ ++ if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) ++ return (ENOENT); ++ ++ /* Open the parent of tofs */ ++ cp = strrchr(tofs, '/'); ++ *cp = '\0'; ++ err = dsl_dataset_hold(tofs, FTAG, &ds); ++ *cp = '/'; ++ if (err) ++ return (err); ++ ++ if (dmu_recv_verify_features(ds, drrb)) { ++ dsl_dataset_rele(ds, FTAG); ++ return (ENOTSUP); ++ } ++ ++ err = dsl_sync_task_do(ds->ds_dir->dd_pool, ++ recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); ++ dsl_dataset_rele(ds, FTAG); ++ if (err) ++ return (err); ++ drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; ++ drc->drc_newfs = B_TRUE; ++ } ++ ++ return (err); ++} ++ ++struct restorearg { ++ int err; ++ int byteswap; ++ vnode_t *vp; ++ char *buf; ++ uint64_t voff; ++ int bufsize; /* amount of memory allocated for buf */ ++ zio_cksum_t cksum; ++ avl_tree_t *guid_to_ds_map; ++}; ++ ++typedef struct guid_map_entry { ++ uint64_t guid; ++ dsl_dataset_t *gme_ds; ++ avl_node_t avlnode; ++} guid_map_entry_t; ++ ++static int ++guid_compare(const void *arg1, const void *arg2) ++{ ++ const guid_map_entry_t *gmep1 = arg1; ++ const guid_map_entry_t *gmep2 = arg2; ++ ++ if (gmep1->guid < gmep2->guid) ++ return (-1); ++ else if (gmep1->guid > gmep2->guid) ++ return (1); ++ return (0); ++} ++ ++static void ++free_guid_map_onexit(void *arg) ++{ ++ avl_tree_t *ca = arg; ++ void *cookie = NULL; ++ guid_map_entry_t *gmep; ++ ++ while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { ++ dsl_dataset_rele(gmep->gme_ds, ca); ++ kmem_free(gmep, sizeof (guid_map_entry_t)); ++ } ++ avl_destroy(ca); ++ kmem_free(ca, sizeof (avl_tree_t)); ++} ++ ++static void * ++restore_read(struct restorearg *ra, int len) ++{ ++ void *rv; ++ int done = 0; ++ ++ /* some things will require 8-byte alignment, so everything must */ ++ ASSERT3U(len % 8, ==, 0); ++ ++ while (done < len) { ++ ssize_t resid; ++ ++ ra->err = vn_rdwr(UIO_READ, ra->vp, ++ (caddr_t)ra->buf + done, len - done, ++ ra->voff, UIO_SYSSPACE, FAPPEND, ++ RLIM64_INFINITY, CRED(), &resid); ++ ++ if (resid == len - done) ++ ra->err = EINVAL; ++ ra->voff += len - done - resid; ++ done = len - resid; ++ if (ra->err) ++ return (NULL); ++ } ++ ++ ASSERT3U(done, ==, len); ++ rv = ra->buf; ++ if (ra->byteswap) ++ fletcher_4_incremental_byteswap(rv, len, &ra->cksum); ++ else ++ 
fletcher_4_incremental_native(rv, len, &ra->cksum); ++ return (rv); ++} ++ ++noinline static void ++backup_byteswap(dmu_replay_record_t *drr) ++{ ++#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) ++#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) ++ drr->drr_type = BSWAP_32(drr->drr_type); ++ drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); ++ switch (drr->drr_type) { ++ case DRR_BEGIN: ++ DO64(drr_begin.drr_magic); ++ DO64(drr_begin.drr_versioninfo); ++ DO64(drr_begin.drr_creation_time); ++ DO32(drr_begin.drr_type); ++ DO32(drr_begin.drr_flags); ++ DO64(drr_begin.drr_toguid); ++ DO64(drr_begin.drr_fromguid); ++ break; ++ case DRR_OBJECT: ++ DO64(drr_object.drr_object); ++ /* DO64(drr_object.drr_allocation_txg); */ ++ DO32(drr_object.drr_type); ++ DO32(drr_object.drr_bonustype); ++ DO32(drr_object.drr_blksz); ++ DO32(drr_object.drr_bonuslen); ++ DO64(drr_object.drr_toguid); ++ break; ++ case DRR_FREEOBJECTS: ++ DO64(drr_freeobjects.drr_firstobj); ++ DO64(drr_freeobjects.drr_numobjs); ++ DO64(drr_freeobjects.drr_toguid); ++ break; ++ case DRR_WRITE: ++ DO64(drr_write.drr_object); ++ DO32(drr_write.drr_type); ++ DO64(drr_write.drr_offset); ++ DO64(drr_write.drr_length); ++ DO64(drr_write.drr_toguid); ++ DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); ++ DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); ++ DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); ++ DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); ++ DO64(drr_write.drr_key.ddk_prop); ++ break; ++ case DRR_WRITE_BYREF: ++ DO64(drr_write_byref.drr_object); ++ DO64(drr_write_byref.drr_offset); ++ DO64(drr_write_byref.drr_length); ++ DO64(drr_write_byref.drr_toguid); ++ DO64(drr_write_byref.drr_refguid); ++ DO64(drr_write_byref.drr_refobject); ++ DO64(drr_write_byref.drr_refoffset); ++ DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); ++ DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); ++ DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); ++ DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); ++ DO64(drr_write_byref.drr_key.ddk_prop); ++ break; ++ case DRR_FREE: ++ DO64(drr_free.drr_object); ++ DO64(drr_free.drr_offset); ++ DO64(drr_free.drr_length); ++ DO64(drr_free.drr_toguid); ++ break; ++ case DRR_SPILL: ++ DO64(drr_spill.drr_object); ++ DO64(drr_spill.drr_length); ++ DO64(drr_spill.drr_toguid); ++ break; ++ case DRR_END: ++ DO64(drr_end.drr_checksum.zc_word[0]); ++ DO64(drr_end.drr_checksum.zc_word[1]); ++ DO64(drr_end.drr_checksum.zc_word[2]); ++ DO64(drr_end.drr_checksum.zc_word[3]); ++ DO64(drr_end.drr_toguid); ++ break; ++ default: ++ break; ++ } ++#undef DO64 ++#undef DO32 ++} ++ ++noinline static int ++restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) ++{ ++ int err; ++ dmu_tx_t *tx; ++ void *data = NULL; ++ ++ if (drro->drr_type == DMU_OT_NONE || ++ drro->drr_type >= DMU_OT_NUMTYPES || ++ drro->drr_bonustype >= DMU_OT_NUMTYPES || ++ drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || ++ drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || ++ P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || ++ drro->drr_blksz < SPA_MINBLOCKSIZE || ++ drro->drr_blksz > SPA_MAXBLOCKSIZE || ++ drro->drr_bonuslen > DN_MAX_BONUSLEN) { ++ return (EINVAL); ++ } ++ ++ err = dmu_object_info(os, drro->drr_object, NULL); ++ ++ if (err != 0 && err != ENOENT) ++ return (EINVAL); ++ ++ if (drro->drr_bonuslen) { ++ data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); ++ if (ra->err) ++ return (ra->err); ++ } ++ ++ if (err == ENOENT) { ++ /* currently free, want to be allocated */ ++ tx = dmu_tx_create(os); ++ 
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); ++ err = dmu_tx_assign(tx, TXG_WAIT); ++ if (err) { ++ dmu_tx_abort(tx); ++ return (err); ++ } ++ err = dmu_object_claim(os, drro->drr_object, ++ drro->drr_type, drro->drr_blksz, ++ drro->drr_bonustype, drro->drr_bonuslen, tx); ++ dmu_tx_commit(tx); ++ } else { ++ /* currently allocated, want to be allocated */ ++ err = dmu_object_reclaim(os, drro->drr_object, ++ drro->drr_type, drro->drr_blksz, ++ drro->drr_bonustype, drro->drr_bonuslen); ++ } ++ if (err) { ++ return (EINVAL); ++ } ++ ++ tx = dmu_tx_create(os); ++ dmu_tx_hold_bonus(tx, drro->drr_object); ++ err = dmu_tx_assign(tx, TXG_WAIT); ++ if (err) { ++ dmu_tx_abort(tx); ++ return (err); ++ } ++ ++ dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, ++ tx); ++ dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); ++ ++ if (data != NULL) { ++ dmu_buf_t *db; ++ ++ VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); ++ dmu_buf_will_dirty(db, tx); ++ ++ ASSERT3U(db->db_size, >=, drro->drr_bonuslen); ++ bcopy(data, db->db_data, drro->drr_bonuslen); ++ if (ra->byteswap) { ++ dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, ++ drro->drr_bonuslen); ++ } ++ dmu_buf_rele(db, FTAG); ++ } ++ dmu_tx_commit(tx); ++ return (0); ++} ++ ++/* ARGSUSED */ ++noinline static int ++restore_freeobjects(struct restorearg *ra, objset_t *os, ++ struct drr_freeobjects *drrfo) ++{ ++ uint64_t obj; ++ ++ if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) ++ return (EINVAL); ++ ++ for (obj = drrfo->drr_firstobj; ++ obj < drrfo->drr_firstobj + drrfo->drr_numobjs; ++ (void) dmu_object_next(os, &obj, FALSE, 0)) { ++ int err; ++ ++ if (dmu_object_info(os, obj, NULL) != 0) ++ continue; ++ ++ err = dmu_free_object(os, obj); ++ if (err) ++ return (err); ++ } ++ return (0); ++} ++ ++noinline static int ++restore_write(struct restorearg *ra, objset_t *os, ++ struct drr_write *drrw) ++{ ++ dmu_tx_t *tx; ++ void *data; ++ int err; ++ ++ if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || ++ drrw->drr_type >= DMU_OT_NUMTYPES) ++ return (EINVAL); ++ ++ data = restore_read(ra, drrw->drr_length); ++ if (data == NULL) ++ return (ra->err); ++ ++ if (dmu_object_info(os, drrw->drr_object, NULL) != 0) ++ return (EINVAL); ++ ++ tx = dmu_tx_create(os); ++ ++ dmu_tx_hold_write(tx, drrw->drr_object, ++ drrw->drr_offset, drrw->drr_length); ++ err = dmu_tx_assign(tx, TXG_WAIT); ++ if (err) { ++ dmu_tx_abort(tx); ++ return (err); ++ } ++ if (ra->byteswap) ++ dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); ++ dmu_write(os, drrw->drr_object, ++ drrw->drr_offset, drrw->drr_length, data, tx); ++ dmu_tx_commit(tx); ++ return (0); ++} ++ ++/* ++ * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed ++ * streams to refer to a copy of the data that is already on the ++ * system because it came in earlier in the stream. This function ++ * finds the earlier copy of the data, and uses that copy instead of ++ * data from the stream to fulfill this write. ++ */ ++static int ++restore_write_byref(struct restorearg *ra, objset_t *os, ++ struct drr_write_byref *drrwbr) ++{ ++ dmu_tx_t *tx; ++ int err; ++ guid_map_entry_t gmesrch; ++ guid_map_entry_t *gmep; ++ avl_index_t where; ++ objset_t *ref_os = NULL; ++ dmu_buf_t *dbp; ++ ++ if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) ++ return (EINVAL); ++ ++ /* ++ * If the GUID of the referenced dataset is different from the ++ * GUID of the target dataset, find the referenced dataset. 
++ */ ++ if (drrwbr->drr_toguid != drrwbr->drr_refguid) { ++ gmesrch.guid = drrwbr->drr_refguid; ++ if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, ++ &where)) == NULL) { ++ return (EINVAL); ++ } ++ if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) ++ return (EINVAL); ++ } else { ++ ref_os = os; ++ } ++ ++ err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, ++ drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); ++ if (err) ++ return (err); ++ ++ tx = dmu_tx_create(os); ++ ++ dmu_tx_hold_write(tx, drrwbr->drr_object, ++ drrwbr->drr_offset, drrwbr->drr_length); ++ err = dmu_tx_assign(tx, TXG_WAIT); ++ if (err) { ++ dmu_tx_abort(tx); ++ return (err); ++ } ++ dmu_write(os, drrwbr->drr_object, ++ drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); ++ dmu_buf_rele(dbp, FTAG); ++ dmu_tx_commit(tx); ++ return (0); ++} ++ ++static int ++restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) ++{ ++ dmu_tx_t *tx; ++ void *data; ++ dmu_buf_t *db, *db_spill; ++ int err; ++ ++ if (drrs->drr_length < SPA_MINBLOCKSIZE || ++ drrs->drr_length > SPA_MAXBLOCKSIZE) ++ return (EINVAL); ++ ++ data = restore_read(ra, drrs->drr_length); ++ if (data == NULL) ++ return (ra->err); ++ ++ if (dmu_object_info(os, drrs->drr_object, NULL) != 0) ++ return (EINVAL); ++ ++ VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); ++ if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { ++ dmu_buf_rele(db, FTAG); ++ return (err); ++ } ++ ++ tx = dmu_tx_create(os); ++ ++ dmu_tx_hold_spill(tx, db->db_object); ++ ++ err = dmu_tx_assign(tx, TXG_WAIT); ++ if (err) { ++ dmu_buf_rele(db, FTAG); ++ dmu_buf_rele(db_spill, FTAG); ++ dmu_tx_abort(tx); ++ return (err); ++ } ++ dmu_buf_will_dirty(db_spill, tx); ++ ++ if (db_spill->db_size < drrs->drr_length) ++ VERIFY(0 == dbuf_spill_set_blksz(db_spill, ++ drrs->drr_length, tx)); ++ bcopy(data, db_spill->db_data, drrs->drr_length); ++ ++ dmu_buf_rele(db, FTAG); ++ dmu_buf_rele(db_spill, FTAG); ++ ++ dmu_tx_commit(tx); ++ return (0); ++} ++ ++/* ARGSUSED */ ++noinline static int ++restore_free(struct restorearg *ra, objset_t *os, ++ struct drr_free *drrf) ++{ ++ int err; ++ ++ if (drrf->drr_length != -1ULL && ++ drrf->drr_offset + drrf->drr_length < drrf->drr_offset) ++ return (EINVAL); ++ ++ if (dmu_object_info(os, drrf->drr_object, NULL) != 0) ++ return (EINVAL); ++ ++ err = dmu_free_long_range(os, drrf->drr_object, ++ drrf->drr_offset, drrf->drr_length); ++ return (err); ++} ++ ++/* ++ * NB: callers *must* call dmu_recv_end() if this succeeds. 
++ */ ++int ++dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, ++ int cleanup_fd, uint64_t *action_handlep) ++{ ++ struct restorearg ra = { 0 }; ++ dmu_replay_record_t *drr; ++ objset_t *os; ++ zio_cksum_t pcksum; ++ int featureflags; ++ ++ if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) ++ ra.byteswap = TRUE; ++ ++ { ++ /* compute checksum of drr_begin record */ ++ dmu_replay_record_t *drr; ++ drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); ++ ++ drr->drr_type = DRR_BEGIN; ++ drr->drr_u.drr_begin = *drc->drc_drrb; ++ if (ra.byteswap) { ++ fletcher_4_incremental_byteswap(drr, ++ sizeof (dmu_replay_record_t), &ra.cksum); ++ } else { ++ fletcher_4_incremental_native(drr, ++ sizeof (dmu_replay_record_t), &ra.cksum); ++ } ++ kmem_free(drr, sizeof (dmu_replay_record_t)); ++ } ++ ++ if (ra.byteswap) { ++ struct drr_begin *drrb = drc->drc_drrb; ++ drrb->drr_magic = BSWAP_64(drrb->drr_magic); ++ drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); ++ drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); ++ drrb->drr_type = BSWAP_32(drrb->drr_type); ++ drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); ++ drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); ++ } ++ ++ ra.vp = vp; ++ ra.voff = *voffp; ++ ra.bufsize = 1<<20; ++ ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP); ++ ++ /* these were verified in dmu_recv_begin */ ++ ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == ++ DMU_SUBSTREAM); ++ ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); ++ ++ /* ++ * Open the objset we are modifying. ++ */ ++ VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); ++ ++ ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); ++ ++ featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); ++ ++ /* if this stream is dedup'ed, set up the avl tree for guid mapping */ ++ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { ++ minor_t minor; ++ ++ if (cleanup_fd == -1) { ++ ra.err = EBADF; ++ goto out; ++ } ++ ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); ++ if (ra.err) { ++ cleanup_fd = -1; ++ goto out; ++ } ++ ++ if (*action_handlep == 0) { ++ ra.guid_to_ds_map = ++ kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); ++ avl_create(ra.guid_to_ds_map, guid_compare, ++ sizeof (guid_map_entry_t), ++ offsetof(guid_map_entry_t, avlnode)); ++ ra.err = zfs_onexit_add_cb(minor, ++ free_guid_map_onexit, ra.guid_to_ds_map, ++ action_handlep); ++ if (ra.err) ++ goto out; ++ } else { ++ ra.err = zfs_onexit_cb_data(minor, *action_handlep, ++ (void **)&ra.guid_to_ds_map); ++ if (ra.err) ++ goto out; ++ } ++ ++ drc->drc_guid_to_ds_map = ra.guid_to_ds_map; ++ } ++ ++ /* ++ * Read records and process them. ++ */ ++ pcksum = ra.cksum; ++ while (ra.err == 0 && ++ NULL != (drr = restore_read(&ra, sizeof (*drr)))) { ++ if (issig(JUSTLOOKING) && issig(FORREAL)) { ++ ra.err = EINTR; ++ goto out; ++ } ++ ++ if (ra.byteswap) ++ backup_byteswap(drr); ++ ++ switch (drr->drr_type) { ++ case DRR_OBJECT: ++ { ++ /* ++ * We need to make a copy of the record header, ++ * because restore_{object,write} may need to ++ * restore_read(), which will invalidate drr. 
++ */ ++ struct drr_object drro = drr->drr_u.drr_object; ++ ra.err = restore_object(&ra, os, &drro); ++ break; ++ } ++ case DRR_FREEOBJECTS: ++ { ++ struct drr_freeobjects drrfo = ++ drr->drr_u.drr_freeobjects; ++ ra.err = restore_freeobjects(&ra, os, &drrfo); ++ break; ++ } ++ case DRR_WRITE: ++ { ++ struct drr_write drrw = drr->drr_u.drr_write; ++ ra.err = restore_write(&ra, os, &drrw); ++ break; ++ } ++ case DRR_WRITE_BYREF: ++ { ++ struct drr_write_byref drrwbr = ++ drr->drr_u.drr_write_byref; ++ ra.err = restore_write_byref(&ra, os, &drrwbr); ++ break; ++ } ++ case DRR_FREE: ++ { ++ struct drr_free drrf = drr->drr_u.drr_free; ++ ra.err = restore_free(&ra, os, &drrf); ++ break; ++ } ++ case DRR_END: ++ { ++ struct drr_end drre = drr->drr_u.drr_end; ++ /* ++ * We compare against the *previous* checksum ++ * value, because the stored checksum is of ++ * everything before the DRR_END record. ++ */ ++ if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) ++ ra.err = ECKSUM; ++ goto out; ++ } ++ case DRR_SPILL: ++ { ++ struct drr_spill drrs = drr->drr_u.drr_spill; ++ ra.err = restore_spill(&ra, os, &drrs); ++ break; ++ } ++ default: ++ ra.err = EINVAL; ++ goto out; ++ } ++ pcksum = ra.cksum; ++ } ++ ASSERT(ra.err != 0); ++ ++out: ++ if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) ++ zfs_onexit_fd_rele(cleanup_fd); ++ ++ if (ra.err != 0) { ++ /* ++ * destroy what we created, so we don't leave it in the ++ * inconsistent restoring state. ++ */ ++ txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); ++ ++ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, ++ B_FALSE); ++ if (drc->drc_real_ds != drc->drc_logical_ds) { ++ mutex_exit(&drc->drc_logical_ds->ds_recvlock); ++ dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); ++ } ++ } ++ ++ vmem_free(ra.buf, ra.bufsize); ++ *voffp = ra.voff; ++ return (ra.err); ++} ++ ++struct recvendsyncarg { ++ char *tosnap; ++ uint64_t creation_time; ++ uint64_t toguid; ++}; ++ ++static int ++recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ struct recvendsyncarg *resa = arg2; ++ ++ return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); ++} ++ ++static void ++recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ struct recvendsyncarg *resa = arg2; ++ ++ dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); ++ ++ /* set snapshot's creation time and guid */ ++ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); ++ ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; ++ ds->ds_prev->ds_phys->ds_guid = resa->toguid; ++ ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; ++ ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; ++} ++ ++static int ++add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) ++{ ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj; ++ dsl_dataset_t *snapds; ++ guid_map_entry_t *gmep; ++ int err; ++ ++ ASSERT(guid_map != NULL); ++ ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds); ++ if (err == 0) { ++ gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); ++ gmep->guid = snapds->ds_phys->ds_guid; ++ gmep->gme_ds = snapds; ++ avl_add(guid_map, gmep); ++ } ++ ++ rw_exit(&dp->dp_config_rwlock); ++ return (err); ++} ++ ++static int ++dmu_recv_existing_end(dmu_recv_cookie_t *drc) ++{ ++ struct recvendsyncarg resa; ++ dsl_dataset_t *ds = drc->drc_logical_ds; ++ int err, myerr; ++ ++ /* ++ * XXX 
hack; seems the ds is still dirty and dsl_pool_zil_clean() ++ * expects it to have a ds_user_ptr (and zil), but clone_swap() ++ * can close it. ++ */ ++ txg_wait_synced(ds->ds_dir->dd_pool, 0); ++ ++ if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { ++ err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, ++ drc->drc_force); ++ if (err) ++ goto out; ++ } else { ++ mutex_exit(&ds->ds_recvlock); ++ dsl_dataset_rele(ds, dmu_recv_tag); ++ (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, ++ B_FALSE); ++ return (EBUSY); ++ } ++ ++ resa.creation_time = drc->drc_drrb->drr_creation_time; ++ resa.toguid = drc->drc_drrb->drr_toguid; ++ resa.tosnap = drc->drc_tosnap; ++ ++ err = dsl_sync_task_do(ds->ds_dir->dd_pool, ++ recv_end_check, recv_end_sync, ds, &resa, 3); ++ if (err) { ++ /* swap back */ ++ (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); ++ } ++ ++out: ++ mutex_exit(&ds->ds_recvlock); ++ if (err == 0 && drc->drc_guid_to_ds_map != NULL) ++ (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); ++ dsl_dataset_disown(ds, dmu_recv_tag); ++ myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); ++ ASSERT3U(myerr, ==, 0); ++ return (err); ++} ++ ++static int ++dmu_recv_new_end(dmu_recv_cookie_t *drc) ++{ ++ struct recvendsyncarg resa; ++ dsl_dataset_t *ds = drc->drc_logical_ds; ++ int err; ++ ++ /* ++ * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() ++ * expects it to have a ds_user_ptr (and zil), but clone_swap() ++ * can close it. ++ */ ++ txg_wait_synced(ds->ds_dir->dd_pool, 0); ++ ++ resa.creation_time = drc->drc_drrb->drr_creation_time; ++ resa.toguid = drc->drc_drrb->drr_toguid; ++ resa.tosnap = drc->drc_tosnap; ++ ++ err = dsl_sync_task_do(ds->ds_dir->dd_pool, ++ recv_end_check, recv_end_sync, ds, &resa, 3); ++ if (err) { ++ /* clean up the fs we just recv'd into */ ++ (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); ++ } else { ++ if (drc->drc_guid_to_ds_map != NULL) ++ (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); ++ /* release the hold from dmu_recv_begin */ ++ dsl_dataset_disown(ds, dmu_recv_tag); ++ } ++ return (err); ++} ++ ++int ++dmu_recv_end(dmu_recv_cookie_t *drc) ++{ ++ if (drc->drc_logical_ds != drc->drc_real_ds) ++ return (dmu_recv_existing_end(drc)); ++ else ++ return (dmu_recv_new_end(drc)); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dmu_traverse.c linux-3.2.33-go/fs/zfs/zfs/dmu_traverse.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dmu_traverse.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dmu_traverse.c 2012-11-16 23:25:34.349039334 +0100 +@@ -0,0 +1,498 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int zfs_pd_blks_max = 100; ++ ++typedef struct prefetch_data { ++ kmutex_t pd_mtx; ++ kcondvar_t pd_cv; ++ int pd_blks_max; ++ int pd_blks_fetched; ++ int pd_flags; ++ boolean_t pd_cancel; ++ boolean_t pd_exited; ++} prefetch_data_t; ++ ++typedef struct traverse_data { ++ spa_t *td_spa; ++ uint64_t td_objset; ++ blkptr_t *td_rootbp; ++ uint64_t td_min_txg; ++ int td_flags; ++ prefetch_data_t *td_pfd; ++ blkptr_cb_t *td_func; ++ void *td_arg; ++} traverse_data_t; ++ ++static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, ++ arc_buf_t *buf, uint64_t objset, uint64_t object); ++ ++static int ++traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) ++{ ++ traverse_data_t *td = arg; ++ zbookmark_t zb; ++ ++ if (bp->blk_birth == 0) ++ return (0); ++ ++ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) ++ return (0); ++ ++ SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, ++ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); ++ ++ (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg); ++ ++ return (0); ++} ++ ++static int ++traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) ++{ ++ traverse_data_t *td = arg; ++ ++ if (lrc->lrc_txtype == TX_WRITE) { ++ lr_write_t *lr = (lr_write_t *)lrc; ++ blkptr_t *bp = &lr->lr_blkptr; ++ zbookmark_t zb; ++ ++ if (bp->blk_birth == 0) ++ return (0); ++ ++ if (claim_txg == 0 || bp->blk_birth < claim_txg) ++ return (0); ++ ++ SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ++ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); ++ ++ (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, ++ td->td_arg); ++ } ++ return (0); ++} ++ ++static void ++traverse_zil(traverse_data_t *td, zil_header_t *zh) ++{ ++ uint64_t claim_txg = zh->zh_claim_txg; ++ zilog_t *zilog; ++ ++ /* ++ * We only want to visit blocks that have been claimed but not yet ++ * replayed; plus, in read-only mode, blocks that are already stable. 
++ */ ++ if (claim_txg == 0 && spa_writeable(td->td_spa)) ++ return; ++ ++ zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); ++ ++ (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, ++ claim_txg); ++ ++ zil_free(zilog); ++} ++ ++static int ++traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, ++ arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) ++{ ++ zbookmark_t czb; ++ int err = 0, lasterr = 0; ++ arc_buf_t *buf = NULL; ++ prefetch_data_t *pd = td->td_pfd; ++ boolean_t hard = td->td_flags & TRAVERSE_HARD; ++ ++ if (bp->blk_birth == 0) { ++ err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp, ++ td->td_arg); ++ return (err); ++ } ++ ++ if (bp->blk_birth <= td->td_min_txg) ++ return (0); ++ ++ if (pd && !pd->pd_exited && ++ ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) || ++ BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) { ++ mutex_enter(&pd->pd_mtx); ++ ASSERT(pd->pd_blks_fetched >= 0); ++ while (pd->pd_blks_fetched == 0 && !pd->pd_exited) ++ cv_wait(&pd->pd_cv, &pd->pd_mtx); ++ pd->pd_blks_fetched--; ++ cv_broadcast(&pd->pd_cv); ++ mutex_exit(&pd->pd_mtx); ++ } ++ ++ if (td->td_flags & TRAVERSE_PRE) { ++ err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, ++ td->td_arg); ++ if (err == TRAVERSE_VISIT_NO_CHILDREN) ++ return (0); ++ if (err) ++ return (err); ++ } ++ ++ if (BP_GET_LEVEL(bp) > 0) { ++ uint32_t flags = ARC_WAIT; ++ int i; ++ blkptr_t *cbp; ++ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; ++ ++ err = dsl_read(NULL, td->td_spa, bp, pbuf, ++ arc_getbuf_func, &buf, ++ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); ++ if (err) ++ return (err); ++ ++ /* recursively visitbp() blocks below this */ ++ cbp = buf->b_data; ++ for (i = 0; i < epb; i++, cbp++) { ++ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, ++ zb->zb_level - 1, ++ zb->zb_blkid * epb + i); ++ err = traverse_visitbp(td, dnp, buf, cbp, &czb); ++ if (err) { ++ if (!hard) ++ break; ++ lasterr = err; ++ } ++ } ++ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { ++ uint32_t flags = ARC_WAIT; ++ int i; ++ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; ++ ++ err = dsl_read(NULL, td->td_spa, bp, pbuf, ++ arc_getbuf_func, &buf, ++ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); ++ if (err) ++ return (err); ++ ++ /* recursively visitbp() blocks below this */ ++ dnp = buf->b_data; ++ for (i = 0; i < epb; i++, dnp++) { ++ err = traverse_dnode(td, dnp, buf, zb->zb_objset, ++ zb->zb_blkid * epb + i); ++ if (err) { ++ if (!hard) ++ break; ++ lasterr = err; ++ } ++ } ++ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { ++ uint32_t flags = ARC_WAIT; ++ objset_phys_t *osp; ++ dnode_phys_t *dnp; ++ ++ err = dsl_read_nolock(NULL, td->td_spa, bp, ++ arc_getbuf_func, &buf, ++ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); ++ if (err) ++ return (err); ++ ++ osp = buf->b_data; ++ dnp = &osp->os_meta_dnode; ++ err = traverse_dnode(td, dnp, buf, zb->zb_objset, ++ DMU_META_DNODE_OBJECT); ++ if (err && hard) { ++ lasterr = err; ++ err = 0; ++ } ++ if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { ++ dnp = &osp->os_userused_dnode; ++ err = traverse_dnode(td, dnp, buf, zb->zb_objset, ++ DMU_USERUSED_OBJECT); ++ } ++ if (err && hard) { ++ lasterr = err; ++ err = 0; ++ } ++ if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { ++ dnp = &osp->os_groupused_dnode; ++ err = traverse_dnode(td, dnp, buf, zb->zb_objset, ++ DMU_GROUPUSED_OBJECT); ++ } ++ } ++ ++ if (buf) ++ (void) arc_buf_remove_ref(buf, &buf); ++ ++ if (err == 0 && lasterr == 0 && (td->td_flags 
& TRAVERSE_POST)) { ++ err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, ++ td->td_arg); ++ } ++ ++ return (err != 0 ? err : lasterr); ++} ++ ++static int ++traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, ++ arc_buf_t *buf, uint64_t objset, uint64_t object) ++{ ++ int j, err = 0, lasterr = 0; ++ zbookmark_t czb; ++ boolean_t hard = (td->td_flags & TRAVERSE_HARD); ++ ++ for (j = 0; j < dnp->dn_nblkptr; j++) { ++ SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); ++ err = traverse_visitbp(td, dnp, buf, ++ (blkptr_t *)&dnp->dn_blkptr[j], &czb); ++ if (err) { ++ if (!hard) ++ break; ++ lasterr = err; ++ } ++ } ++ ++ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { ++ SET_BOOKMARK(&czb, objset, ++ object, 0, DMU_SPILL_BLKID); ++ err = traverse_visitbp(td, dnp, buf, ++ (blkptr_t *)&dnp->dn_spill, &czb); ++ if (err) { ++ if (!hard) ++ return (err); ++ lasterr = err; ++ } ++ } ++ return (err != 0 ? err : lasterr); ++} ++ ++/* ARGSUSED */ ++static int ++traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, ++ arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, ++ void *arg) ++{ ++ prefetch_data_t *pfd = arg; ++ uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; ++ ++ ASSERT(pfd->pd_blks_fetched >= 0); ++ if (pfd->pd_cancel) ++ return (EINTR); ++ ++ if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || ++ BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || ++ BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) ++ return (0); ++ ++ mutex_enter(&pfd->pd_mtx); ++ while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max) ++ cv_wait(&pfd->pd_cv, &pfd->pd_mtx); ++ pfd->pd_blks_fetched++; ++ cv_broadcast(&pfd->pd_cv); ++ mutex_exit(&pfd->pd_mtx); ++ ++ (void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL, ++ ZIO_PRIORITY_ASYNC_READ, ++ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, ++ &aflags, zb); ++ ++ return (0); ++} ++ ++static void ++traverse_prefetch_thread(void *arg) ++{ ++ traverse_data_t *td_main = arg; ++ traverse_data_t td = *td_main; ++ zbookmark_t czb; ++ ++ td.td_func = traverse_prefetcher; ++ td.td_arg = td_main->td_pfd; ++ td.td_pfd = NULL; ++ ++ SET_BOOKMARK(&czb, td.td_objset, ++ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); ++ (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb); ++ ++ mutex_enter(&td_main->td_pfd->pd_mtx); ++ td_main->td_pfd->pd_exited = B_TRUE; ++ cv_broadcast(&td_main->td_pfd->pd_cv); ++ mutex_exit(&td_main->td_pfd->pd_mtx); ++} ++ ++/* ++ * NB: dataset must not be changing on-disk (eg, is a snapshot or we are ++ * in syncing context). ++ */ ++static int ++traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp, ++ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) ++{ ++ traverse_data_t *td; ++ prefetch_data_t *pd; ++ zbookmark_t *czb; ++ int err; ++ ++ td = kmem_alloc(sizeof(traverse_data_t), KM_PUSHPAGE); ++ pd = kmem_zalloc(sizeof(prefetch_data_t), KM_PUSHPAGE); ++ czb = kmem_alloc(sizeof(zbookmark_t), KM_PUSHPAGE); ++ ++ td->td_spa = spa; ++ td->td_objset = ds ? ds->ds_object : 0; ++ td->td_rootbp = rootbp; ++ td->td_min_txg = txg_start; ++ td->td_func = func; ++ td->td_arg = arg; ++ td->td_pfd = pd; ++ td->td_flags = flags; ++ ++ pd->pd_blks_max = zfs_pd_blks_max; ++ pd->pd_flags = flags; ++ mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL); ++ cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL); ++ ++ /* See comment on ZIL traversal in dsl_scan_visitds. 
*/ ++ if (ds != NULL && !dsl_dataset_is_snapshot(ds)) { ++ objset_t *os; ++ ++ err = dmu_objset_from_ds(ds, &os); ++ if (err) ++ return (err); ++ ++ traverse_zil(td, &os->os_zil_header); ++ } ++ ++ if (!(flags & TRAVERSE_PREFETCH) || ++ 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, ++ td, TQ_NOQUEUE)) ++ pd->pd_exited = B_TRUE; ++ ++ SET_BOOKMARK(czb, td->td_objset, ++ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); ++ err = traverse_visitbp(td, NULL, NULL, rootbp, czb); ++ ++ mutex_enter(&pd->pd_mtx); ++ pd->pd_cancel = B_TRUE; ++ cv_broadcast(&pd->pd_cv); ++ while (!pd->pd_exited) ++ cv_wait(&pd->pd_cv, &pd->pd_mtx); ++ mutex_exit(&pd->pd_mtx); ++ ++ mutex_destroy(&pd->pd_mtx); ++ cv_destroy(&pd->pd_cv); ++ ++ kmem_free(czb, sizeof(zbookmark_t)); ++ kmem_free(pd, sizeof(struct prefetch_data)); ++ kmem_free(td, sizeof(struct traverse_data)); ++ ++ return (err); ++} ++ ++/* ++ * NB: dataset must not be changing on-disk (eg, is a snapshot or we are ++ * in syncing context). ++ */ ++int ++traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, ++ blkptr_cb_t func, void *arg) ++{ ++ return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ++ &ds->ds_phys->ds_bp, txg_start, flags, func, arg)); ++} ++ ++/* ++ * NB: pool must not be changing on-disk (eg, from zdb or sync context). ++ */ ++int ++traverse_pool(spa_t *spa, uint64_t txg_start, int flags, ++ blkptr_cb_t func, void *arg) ++{ ++ int err, lasterr = 0; ++ uint64_t obj; ++ dsl_pool_t *dp = spa_get_dsl(spa); ++ objset_t *mos = dp->dp_meta_objset; ++ boolean_t hard = (flags & TRAVERSE_HARD); ++ ++ /* visit the MOS */ ++ err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa), ++ txg_start, flags, func, arg); ++ if (err) ++ return (err); ++ ++ /* visit each dataset */ ++ for (obj = 1; err == 0 || (err != ESRCH && hard); ++ err = dmu_object_next(mos, &obj, FALSE, txg_start)) { ++ dmu_object_info_t doi; ++ ++ err = dmu_object_info(mos, obj, &doi); ++ if (err) { ++ if (!hard) ++ return (err); ++ lasterr = err; ++ continue; ++ } ++ ++ if (doi.doi_type == DMU_OT_DSL_DATASET) { ++ dsl_dataset_t *ds; ++ uint64_t txg = txg_start; ++ ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); ++ rw_exit(&dp->dp_config_rwlock); ++ if (err) { ++ if (!hard) ++ return (err); ++ lasterr = err; ++ continue; ++ } ++ if (ds->ds_phys->ds_prev_snap_txg > txg) ++ txg = ds->ds_phys->ds_prev_snap_txg; ++ err = traverse_dataset(ds, txg, flags, func, arg); ++ dsl_dataset_rele(ds, FTAG); ++ if (err) { ++ if (!hard) ++ return (err); ++ lasterr = err; ++ } ++ } ++ } ++ if (err == ESRCH) ++ err = 0; ++ return (err != 0 ? err : lasterr); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(traverse_dataset); ++EXPORT_SYMBOL(traverse_pool); ++ ++module_param(zfs_pd_blks_max, int, 0644); ++MODULE_PARM_DESC(zfs_pd_blks_max, "Max number of blocks to prefetch"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dmu_tx.c linux-3.2.33-go/fs/zfs/zfs/dmu_tx.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dmu_tx.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dmu_tx.c 2012-11-16 23:25:34.348039346 +0100 +@@ -0,0 +1,1453 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. 
++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++/* ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include /* for dsl_dataset_block_freeable() */ ++#include /* for dsl_dir_tempreserve_*() */ ++#include ++#include /* for fzap_default_block_shift */ ++#include ++#include ++#include ++#include ++#include ++ ++typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, ++ uint64_t arg1, uint64_t arg2); ++ ++dmu_tx_stats_t dmu_tx_stats = { ++ { "dmu_tx_assigned", KSTAT_DATA_UINT64 }, ++ { "dmu_tx_delay", KSTAT_DATA_UINT64 }, ++ { "dmu_tx_error", KSTAT_DATA_UINT64 }, ++ { "dmu_tx_suspended", KSTAT_DATA_UINT64 }, ++ { "dmu_tx_group", KSTAT_DATA_UINT64 }, ++ { "dmu_tx_how", KSTAT_DATA_UINT64 }, ++ { "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 }, ++ { "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 }, ++ { "dmu_tx_memory_inflight", KSTAT_DATA_UINT64 }, ++ { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, ++ { "dmu_tx_write_limit", KSTAT_DATA_UINT64 }, ++ { "dmu_tx_quota", KSTAT_DATA_UINT64 }, ++}; ++ ++static kstat_t *dmu_tx_ksp; ++ ++dmu_tx_t * ++dmu_tx_create_dd(dsl_dir_t *dd) ++{ ++ dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_PUSHPAGE); ++ tx->tx_dir = dd; ++ if (dd) ++ tx->tx_pool = dd->dd_pool; ++ list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), ++ offsetof(dmu_tx_hold_t, txh_node)); ++ list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), ++ offsetof(dmu_tx_callback_t, dcb_node)); ++#ifdef DEBUG_DMU_TX ++ refcount_create(&tx->tx_space_written); ++ refcount_create(&tx->tx_space_freed); ++#endif ++ return (tx); ++} ++ ++dmu_tx_t * ++dmu_tx_create(objset_t *os) ++{ ++ dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); ++ tx->tx_objset = os; ++ tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); ++ return (tx); ++} ++ ++dmu_tx_t * ++dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) ++{ ++ dmu_tx_t *tx = dmu_tx_create_dd(NULL); ++ ++ ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); ++ tx->tx_pool = dp; ++ tx->tx_txg = txg; ++ tx->tx_anyobj = TRUE; ++ ++ return (tx); ++} ++ ++int ++dmu_tx_is_syncing(dmu_tx_t *tx) ++{ ++ return (tx->tx_anyobj); ++} ++ ++int ++dmu_tx_private_ok(dmu_tx_t *tx) ++{ ++ return (tx->tx_anyobj); ++} ++ ++static dmu_tx_hold_t * ++dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, ++ enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) ++{ ++ dmu_tx_hold_t *txh; ++ dnode_t *dn = NULL; ++ int err; ++ ++ if (object != DMU_NEW_OBJECT) { ++ err = dnode_hold(os, object, tx, &dn); ++ if (err) { ++ tx->tx_err = err; ++ return (NULL); ++ } ++ ++ if (err == 0 && tx->tx_txg != 0) { ++ mutex_enter(&dn->dn_mtx); ++ /* ++ * dn->dn_assigned_txg == tx->tx_txg doesn't pose a ++ * problem, but there's no way for it to happen (for ++ * now, at least). 
++ */ ++ ASSERT(dn->dn_assigned_txg == 0); ++ dn->dn_assigned_txg = tx->tx_txg; ++ (void) refcount_add(&dn->dn_tx_holds, tx); ++ mutex_exit(&dn->dn_mtx); ++ } ++ } ++ ++ txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_PUSHPAGE); ++ txh->txh_tx = tx; ++ txh->txh_dnode = dn; ++#ifdef DEBUG_DMU_TX ++ txh->txh_type = type; ++ txh->txh_arg1 = arg1; ++ txh->txh_arg2 = arg2; ++#endif ++ list_insert_tail(&tx->tx_holds, txh); ++ ++ return (txh); ++} ++ ++void ++dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) ++{ ++ /* ++ * If we're syncing, they can manipulate any object anyhow, and ++ * the hold on the dnode_t can cause problems. ++ */ ++ if (!dmu_tx_is_syncing(tx)) { ++ (void) dmu_tx_hold_object_impl(tx, os, ++ object, THT_NEWOBJECT, 0, 0); ++ } ++} ++ ++static int ++dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) ++{ ++ int err; ++ dmu_buf_impl_t *db; ++ ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ db = dbuf_hold_level(dn, level, blkid, FTAG); ++ rw_exit(&dn->dn_struct_rwlock); ++ if (db == NULL) ++ return (EIO); ++ err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); ++ dbuf_rele(db, FTAG); ++ return (err); ++} ++ ++static void ++dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, ++ int level, uint64_t blkid, boolean_t freeable, uint64_t *history) ++{ ++ objset_t *os = dn->dn_objset; ++ dsl_dataset_t *ds = os->os_dsl_dataset; ++ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ++ dmu_buf_impl_t *parent = NULL; ++ blkptr_t *bp = NULL; ++ uint64_t space; ++ ++ if (level >= dn->dn_nlevels || history[level] == blkid) ++ return; ++ ++ history[level] = blkid; ++ ++ space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift); ++ ++ if (db == NULL || db == dn->dn_dbuf) { ++ ASSERT(level != 0); ++ db = NULL; ++ } else { ++ ASSERT(DB_DNODE(db) == dn); ++ ASSERT(db->db_level == level); ++ ASSERT(db->db.db_size == space); ++ ASSERT(db->db_blkid == blkid); ++ bp = db->db_blkptr; ++ parent = db->db_parent; ++ } ++ ++ freeable = (bp && (freeable || ++ dsl_dataset_block_freeable(ds, bp, bp->blk_birth))); ++ ++ if (freeable) ++ txh->txh_space_tooverwrite += space; ++ else ++ txh->txh_space_towrite += space; ++ if (bp) ++ txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp); ++ ++ dmu_tx_count_twig(txh, dn, parent, level + 1, ++ blkid >> epbs, freeable, history); ++} ++ ++/* ARGSUSED */ ++static void ++dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) ++{ ++ dnode_t *dn = txh->txh_dnode; ++ uint64_t start, end, i; ++ int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; ++ int err = 0; ++ int l; ++ ++ if (len == 0) ++ return; ++ ++ min_bs = SPA_MINBLOCKSHIFT; ++ max_bs = SPA_MAXBLOCKSHIFT; ++ min_ibs = DN_MIN_INDBLKSHIFT; ++ max_ibs = DN_MAX_INDBLKSHIFT; ++ ++ if (dn) { ++ uint64_t history[DN_MAX_LEVELS]; ++ int nlvls = dn->dn_nlevels; ++ int delta; ++ ++ /* ++ * For i/o error checking, read the first and last level-0 ++ * blocks (if they are not aligned), and all the level-1 blocks. ++ */ ++ if (dn->dn_maxblkid == 0) { ++ delta = dn->dn_datablksz; ++ start = (off < dn->dn_datablksz) ? 0 : 1; ++ end = (off+len <= dn->dn_datablksz) ? 
0 : 1; ++ if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { ++ err = dmu_tx_check_ioerr(NULL, dn, 0, 0); ++ if (err) ++ goto out; ++ delta -= off; ++ } ++ } else { ++ zio_t *zio = zio_root(dn->dn_objset->os_spa, ++ NULL, NULL, ZIO_FLAG_CANFAIL); ++ ++ /* first level-0 block */ ++ start = off >> dn->dn_datablkshift; ++ if (P2PHASE(off, dn->dn_datablksz) || ++ len < dn->dn_datablksz) { ++ err = dmu_tx_check_ioerr(zio, dn, 0, start); ++ if (err) ++ goto out; ++ } ++ ++ /* last level-0 block */ ++ end = (off+len-1) >> dn->dn_datablkshift; ++ if (end != start && end <= dn->dn_maxblkid && ++ P2PHASE(off+len, dn->dn_datablksz)) { ++ err = dmu_tx_check_ioerr(zio, dn, 0, end); ++ if (err) ++ goto out; ++ } ++ ++ /* level-1 blocks */ ++ if (nlvls > 1) { ++ int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ++ for (i = (start>>shft)+1; i < end>>shft; i++) { ++ err = dmu_tx_check_ioerr(zio, dn, 1, i); ++ if (err) ++ goto out; ++ } ++ } ++ ++ err = zio_wait(zio); ++ if (err) ++ goto out; ++ delta = P2NPHASE(off, dn->dn_datablksz); ++ } ++ ++ if (dn->dn_maxblkid > 0) { ++ /* ++ * The blocksize can't change, ++ * so we can make a more precise estimate. ++ */ ++ ASSERT(dn->dn_datablkshift != 0); ++ min_bs = max_bs = dn->dn_datablkshift; ++ min_ibs = max_ibs = dn->dn_indblkshift; ++ } else if (dn->dn_indblkshift > max_ibs) { ++ /* ++ * This ensures that if we reduce DN_MAX_INDBLKSHIFT, ++ * the code will still work correctly on older pools. ++ */ ++ min_ibs = max_ibs = dn->dn_indblkshift; ++ } ++ ++ /* ++ * If this write is not off the end of the file ++ * we need to account for overwrites/unref. ++ */ ++ if (start <= dn->dn_maxblkid) { ++ for (l = 0; l < DN_MAX_LEVELS; l++) ++ history[l] = -1ULL; ++ } ++ while (start <= dn->dn_maxblkid) { ++ dmu_buf_impl_t *db; ++ ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); ++ rw_exit(&dn->dn_struct_rwlock); ++ ++ if (err) { ++ txh->txh_tx->tx_err = err; ++ return; ++ } ++ ++ dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, ++ history); ++ dbuf_rele(db, FTAG); ++ if (++start > end) { ++ /* ++ * Account for new indirects appearing ++ * before this IO gets assigned into a txg. ++ */ ++ bits = 64 - min_bs; ++ epbs = min_ibs - SPA_BLKPTRSHIFT; ++ for (bits -= epbs * (nlvls - 1); ++ bits >= 0; bits -= epbs) ++ txh->txh_fudge += 1ULL << max_ibs; ++ goto out; ++ } ++ off += delta; ++ if (len >= delta) ++ len -= delta; ++ delta = dn->dn_datablksz; ++ } ++ } ++ ++ /* ++ * 'end' is the last thing we will access, not one past. ++ * This way we won't overflow when accessing the last byte. ++ */ ++ start = P2ALIGN(off, 1ULL << max_bs); ++ end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; ++ txh->txh_space_towrite += end - start + 1; ++ ++ start >>= min_bs; ++ end >>= min_bs; ++ ++ epbs = min_ibs - SPA_BLKPTRSHIFT; ++ ++ /* ++ * The object contains at most 2^(64 - min_bs) blocks, ++ * and each indirect level maps 2^epbs. ++ */ ++ for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { ++ start >>= epbs; ++ end >>= epbs; ++ ASSERT3U(end, >=, start); ++ txh->txh_space_towrite += (end - start + 1) << max_ibs; ++ if (start != 0) { ++ /* ++ * We also need a new blkid=0 indirect block ++ * to reference any existing file data. 
++ */ ++ txh->txh_space_towrite += 1ULL << max_ibs; ++ } ++ } ++ ++out: ++ if (txh->txh_space_towrite + txh->txh_space_tooverwrite > ++ 2 * DMU_MAX_ACCESS) ++ err = EFBIG; ++ ++ if (err) ++ txh->txh_tx->tx_err = err; ++} ++ ++static void ++dmu_tx_count_dnode(dmu_tx_hold_t *txh) ++{ ++ dnode_t *dn = txh->txh_dnode; ++ dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset); ++ uint64_t space = mdn->dn_datablksz + ++ ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); ++ ++ if (dn && dn->dn_dbuf->db_blkptr && ++ dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, ++ dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) { ++ txh->txh_space_tooverwrite += space; ++ txh->txh_space_tounref += space; ++ } else { ++ txh->txh_space_towrite += space; ++ if (dn && dn->dn_dbuf->db_blkptr) ++ txh->txh_space_tounref += space; ++ } ++} ++ ++void ++dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) ++{ ++ dmu_tx_hold_t *txh; ++ ++ ASSERT(tx->tx_txg == 0); ++ ASSERT(len < DMU_MAX_ACCESS); ++ ASSERT(len == 0 || UINT64_MAX - off >= len - 1); ++ ++ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, ++ object, THT_WRITE, off, len); ++ if (txh == NULL) ++ return; ++ ++ dmu_tx_count_write(txh, off, len); ++ dmu_tx_count_dnode(txh); ++} ++ ++static void ++dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) ++{ ++ uint64_t blkid, nblks, lastblk; ++ uint64_t space = 0, unref = 0, skipped = 0; ++ dnode_t *dn = txh->txh_dnode; ++ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; ++ spa_t *spa = txh->txh_tx->tx_pool->dp_spa; ++ int epbs; ++ ++ if (dn->dn_nlevels == 0) ++ return; ++ ++ /* ++ * The struct_rwlock protects us against dn_nlevels ++ * changing, in case (against all odds) we manage to dirty & ++ * sync out the changes after we check for being dirty. ++ * Also, dbuf_hold_impl() wants us to have the struct_rwlock. ++ */ ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ++ if (dn->dn_maxblkid == 0) { ++ if (off == 0 && len >= dn->dn_datablksz) { ++ blkid = 0; ++ nblks = 1; ++ } else { ++ rw_exit(&dn->dn_struct_rwlock); ++ return; ++ } ++ } else { ++ blkid = off >> dn->dn_datablkshift; ++ nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; ++ ++ if (blkid >= dn->dn_maxblkid) { ++ rw_exit(&dn->dn_struct_rwlock); ++ return; ++ } ++ if (blkid + nblks > dn->dn_maxblkid) ++ nblks = dn->dn_maxblkid - blkid; ++ ++ } ++ if (dn->dn_nlevels == 1) { ++ int i; ++ for (i = 0; i < nblks; i++) { ++ blkptr_t *bp = dn->dn_phys->dn_blkptr; ++ ASSERT3U(blkid + i, <, dn->dn_nblkptr); ++ bp += blkid + i; ++ if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) { ++ dprintf_bp(bp, "can free old%s", ""); ++ space += bp_get_dsize(spa, bp); ++ } ++ unref += BP_GET_ASIZE(bp); ++ } ++ nblks = 0; ++ } ++ ++ /* ++ * Add in memory requirements of higher-level indirects. ++ * This assumes a worst-possible scenario for dn_nlevels. ++ */ ++ { ++ uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs); ++ int level = (dn->dn_nlevels > 1) ? 
2 : 1; ++ ++ while (level++ < DN_MAX_LEVELS) { ++ txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift; ++ blkcnt = 1 + (blkcnt >> epbs); ++ } ++ ASSERT(blkcnt <= dn->dn_nblkptr); ++ } ++ ++ lastblk = blkid + nblks - 1; ++ while (nblks) { ++ dmu_buf_impl_t *dbuf; ++ uint64_t ibyte, new_blkid; ++ int epb = 1 << epbs; ++ int err, i, blkoff, tochk; ++ blkptr_t *bp; ++ ++ ibyte = blkid << dn->dn_datablkshift; ++ err = dnode_next_offset(dn, ++ DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); ++ new_blkid = ibyte >> dn->dn_datablkshift; ++ if (err == ESRCH) { ++ skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; ++ break; ++ } ++ if (err) { ++ txh->txh_tx->tx_err = err; ++ break; ++ } ++ if (new_blkid > lastblk) { ++ skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; ++ break; ++ } ++ ++ if (new_blkid > blkid) { ++ ASSERT((new_blkid >> epbs) > (blkid >> epbs)); ++ skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1; ++ nblks -= new_blkid - blkid; ++ blkid = new_blkid; ++ } ++ blkoff = P2PHASE(blkid, epb); ++ tochk = MIN(epb - blkoff, nblks); ++ ++ err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); ++ if (err) { ++ txh->txh_tx->tx_err = err; ++ break; ++ } ++ ++ txh->txh_memory_tohold += dbuf->db.db_size; ++ ++ /* ++ * We don't check memory_tohold against DMU_MAX_ACCESS because ++ * memory_tohold is an over-estimation (especially the >L1 ++ * indirect blocks), so it could fail. Callers should have ++ * already verified that they will not be holding too much ++ * memory. ++ */ ++ ++ err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); ++ if (err != 0) { ++ txh->txh_tx->tx_err = err; ++ dbuf_rele(dbuf, FTAG); ++ break; ++ } ++ ++ bp = dbuf->db.db_data; ++ bp += blkoff; ++ ++ for (i = 0; i < tochk; i++) { ++ if (dsl_dataset_block_freeable(ds, &bp[i], ++ bp[i].blk_birth)) { ++ dprintf_bp(&bp[i], "can free old%s", ""); ++ space += bp_get_dsize(spa, &bp[i]); ++ } ++ unref += BP_GET_ASIZE(bp); ++ } ++ dbuf_rele(dbuf, FTAG); ++ ++ blkid += tochk; ++ nblks -= tochk; ++ } ++ rw_exit(&dn->dn_struct_rwlock); ++ ++ /* account for new level 1 indirect blocks that might show up */ ++ if (skipped > 0) { ++ txh->txh_fudge += skipped << dn->dn_indblkshift; ++ skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); ++ txh->txh_memory_tohold += skipped << dn->dn_indblkshift; ++ } ++ txh->txh_space_tofree += space; ++ txh->txh_space_tounref += unref; ++} ++ ++void ++dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) ++{ ++ dmu_tx_hold_t *txh; ++ dnode_t *dn; ++ uint64_t start, end, i; ++ int err, shift; ++ zio_t *zio; ++ ++ ASSERT(tx->tx_txg == 0); ++ ++ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, ++ object, THT_FREE, off, len); ++ if (txh == NULL) ++ return; ++ dn = txh->txh_dnode; ++ ++ /* first block */ ++ if (off != 0) ++ dmu_tx_count_write(txh, off, 1); ++ /* last block */ ++ if (len != DMU_OBJECT_END) ++ dmu_tx_count_write(txh, off+len, 1); ++ ++ dmu_tx_count_dnode(txh); ++ ++ if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) ++ return; ++ if (len == DMU_OBJECT_END) ++ len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; ++ ++ /* ++ * For i/o error checking, read the first and last level-0 ++ * blocks, and all the level-1 blocks. The above count_write's ++ * have already taken care of the level-0 blocks. ++ */ ++ if (dn->dn_nlevels > 1) { ++ shift = dn->dn_datablkshift + dn->dn_indblkshift - ++ SPA_BLKPTRSHIFT; ++ start = off >> shift; ++ end = dn->dn_datablkshift ? 
((off+len) >> shift) : 0; ++ ++ zio = zio_root(tx->tx_pool->dp_spa, ++ NULL, NULL, ZIO_FLAG_CANFAIL); ++ for (i = start; i <= end; i++) { ++ uint64_t ibyte = i << shift; ++ err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); ++ i = ibyte >> shift; ++ if (err == ESRCH) ++ break; ++ if (err) { ++ tx->tx_err = err; ++ return; ++ } ++ ++ err = dmu_tx_check_ioerr(zio, dn, 1, i); ++ if (err) { ++ tx->tx_err = err; ++ return; ++ } ++ } ++ err = zio_wait(zio); ++ if (err) { ++ tx->tx_err = err; ++ return; ++ } ++ } ++ ++ dmu_tx_count_free(txh, off, len); ++} ++ ++void ++dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) ++{ ++ dmu_tx_hold_t *txh; ++ dnode_t *dn; ++ uint64_t nblocks; ++ int epbs, err; ++ ++ ASSERT(tx->tx_txg == 0); ++ ++ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, ++ object, THT_ZAP, add, (uintptr_t)name); ++ if (txh == NULL) ++ return; ++ dn = txh->txh_dnode; ++ ++ dmu_tx_count_dnode(txh); ++ ++ if (dn == NULL) { ++ /* ++ * We will be able to fit a new object's entries into one leaf ++ * block. So there will be at most 2 blocks total, ++ * including the header block. ++ */ ++ dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); ++ return; ++ } ++ ++ ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); ++ ++ if (dn->dn_maxblkid == 0 && !add) { ++ blkptr_t *bp; ++ ++ /* ++ * If there is only one block (i.e. this is a micro-zap) ++ * and we are not adding anything, the accounting is simple. ++ */ ++ err = dmu_tx_check_ioerr(NULL, dn, 0, 0); ++ if (err) { ++ tx->tx_err = err; ++ return; ++ } ++ ++ /* ++ * Use max block size here, since we don't know how much ++ * the size will change between now and the dbuf dirty call. ++ */ ++ bp = &dn->dn_phys->dn_blkptr[0]; ++ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, ++ bp, bp->blk_birth)) ++ txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; ++ else ++ txh->txh_space_towrite += SPA_MAXBLOCKSIZE; ++ if (!BP_IS_HOLE(bp)) ++ txh->txh_space_tounref += SPA_MAXBLOCKSIZE; ++ return; ++ } ++ ++ if (dn->dn_maxblkid > 0 && name) { ++ /* ++ * access the name in this fat-zap so that we'll check ++ * for i/o errors to the leaf blocks, etc. ++ */ ++ err = zap_lookup(dn->dn_objset, dn->dn_object, name, ++ 8, 0, NULL); ++ if (err == EIO) { ++ tx->tx_err = err; ++ return; ++ } ++ } ++ ++ err = zap_count_write(dn->dn_objset, dn->dn_object, name, add, ++ &txh->txh_space_towrite, &txh->txh_space_tooverwrite); ++ ++ /* ++ * If the modified blocks are scattered to the four winds, ++ * we'll have to modify an indirect twig for each. 
++ */ ++ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ++ for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) ++ if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj) ++ txh->txh_space_towrite += 3 << dn->dn_indblkshift; ++ else ++ txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift; ++} ++ ++void ++dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) ++{ ++ dmu_tx_hold_t *txh; ++ ++ ASSERT(tx->tx_txg == 0); ++ ++ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, ++ object, THT_BONUS, 0, 0); ++ if (txh) ++ dmu_tx_count_dnode(txh); ++} ++ ++void ++dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) ++{ ++ dmu_tx_hold_t *txh; ++ ASSERT(tx->tx_txg == 0); ++ ++ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, ++ DMU_NEW_OBJECT, THT_SPACE, space, 0); ++ ++ txh->txh_space_towrite += space; ++} ++ ++int ++dmu_tx_holds(dmu_tx_t *tx, uint64_t object) ++{ ++ dmu_tx_hold_t *txh; ++ int holds = 0; ++ ++ /* ++ * By asserting that the tx is assigned, we're counting the ++ * number of dn_tx_holds, which is the same as the number of ++ * dn_holds. Otherwise, we'd be counting dn_holds, but ++ * dn_tx_holds could be 0. ++ */ ++ ASSERT(tx->tx_txg != 0); ++ ++ /* if (tx->tx_anyobj == TRUE) */ ++ /* return (0); */ ++ ++ for (txh = list_head(&tx->tx_holds); txh; ++ txh = list_next(&tx->tx_holds, txh)) { ++ if (txh->txh_dnode && txh->txh_dnode->dn_object == object) ++ holds++; ++ } ++ ++ return (holds); ++} ++ ++#ifdef DEBUG_DMU_TX ++void ++dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) ++{ ++ dmu_tx_hold_t *txh; ++ int match_object = FALSE, match_offset = FALSE; ++ dnode_t *dn; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ ASSERT(dn != NULL); ++ ASSERT(tx->tx_txg != 0); ++ ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ++ ASSERT3U(dn->dn_object, ==, db->db.db_object); ++ ++ if (tx->tx_anyobj) { ++ DB_DNODE_EXIT(db); ++ return; ++ } ++ ++ /* XXX No checking on the meta dnode for now */ ++ if (db->db.db_object == DMU_META_DNODE_OBJECT) { ++ DB_DNODE_EXIT(db); ++ return; ++ } ++ ++ for (txh = list_head(&tx->tx_holds); txh; ++ txh = list_next(&tx->tx_holds, txh)) { ++ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); ++ if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) ++ match_object = TRUE; ++ if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { ++ int datablkshift = dn->dn_datablkshift ? ++ dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; ++ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ++ int shift = datablkshift + epbs * db->db_level; ++ uint64_t beginblk = shift >= 64 ? 0 : ++ (txh->txh_arg1 >> shift); ++ uint64_t endblk = shift >= 64 ? 0 : ++ ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); ++ uint64_t blkid = db->db_blkid; ++ ++ /* XXX txh_arg2 better not be zero... */ ++ ++ dprintf("found txh type %x beginblk=%llx endblk=%llx\n", ++ txh->txh_type, beginblk, endblk); ++ ++ switch (txh->txh_type) { ++ case THT_WRITE: ++ if (blkid >= beginblk && blkid <= endblk) ++ match_offset = TRUE; ++ /* ++ * We will let this hold work for the bonus ++ * or spill buffer so that we don't need to ++ * hold it when creating a new object. ++ */ ++ if (blkid == DMU_BONUS_BLKID || ++ blkid == DMU_SPILL_BLKID) ++ match_offset = TRUE; ++ /* ++ * They might have to increase nlevels, ++ * thus dirtying the new TLIBs. Or the ++ * might have to change the block size, ++ * thus dirying the new lvl=0 blk=0. 
++ */ ++ if (blkid == 0) ++ match_offset = TRUE; ++ break; ++ case THT_FREE: ++ /* ++ * We will dirty all the level 1 blocks in ++ * the free range and perhaps the first and ++ * last level 0 block. ++ */ ++ if (blkid >= beginblk && (blkid <= endblk || ++ txh->txh_arg2 == DMU_OBJECT_END)) ++ match_offset = TRUE; ++ break; ++ case THT_SPILL: ++ if (blkid == DMU_SPILL_BLKID) ++ match_offset = TRUE; ++ break; ++ case THT_BONUS: ++ if (blkid == DMU_BONUS_BLKID) ++ match_offset = TRUE; ++ break; ++ case THT_ZAP: ++ match_offset = TRUE; ++ break; ++ case THT_NEWOBJECT: ++ match_object = TRUE; ++ break; ++ default: ++ ASSERT(!"bad txh_type"); ++ } ++ } ++ if (match_object && match_offset) { ++ DB_DNODE_EXIT(db); ++ return; ++ } ++ } ++ DB_DNODE_EXIT(db); ++ panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", ++ (u_longlong_t)db->db.db_object, db->db_level, ++ (u_longlong_t)db->db_blkid); ++} ++#endif ++ ++static int ++dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) ++{ ++ dmu_tx_hold_t *txh; ++ spa_t *spa = tx->tx_pool->dp_spa; ++ uint64_t memory, asize, fsize, usize; ++ uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; ++ ++ ASSERT3U(tx->tx_txg, ==, 0); ++ ++ if (tx->tx_err) { ++ DMU_TX_STAT_BUMP(dmu_tx_error); ++ return (tx->tx_err); ++ } ++ ++ if (spa_suspended(spa)) { ++ DMU_TX_STAT_BUMP(dmu_tx_suspended); ++ ++ /* ++ * If the user has indicated a blocking failure mode ++ * then return ERESTART which will block in dmu_tx_wait(). ++ * Otherwise, return EIO so that an error can get ++ * propagated back to the VOP calls. ++ * ++ * Note that we always honor the txg_how flag regardless ++ * of the failuremode setting. ++ */ ++ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && ++ txg_how != TXG_WAIT) ++ return (EIO); ++ ++ return (ERESTART); ++ } ++ ++ tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); ++ tx->tx_needassign_txh = NULL; ++ ++ /* ++ * NB: No error returns are allowed after txg_hold_open, but ++ * before processing the dnode holds, due to the ++ * dmu_tx_unassign() logic. ++ */ ++ ++ towrite = tofree = tooverwrite = tounref = tohold = fudge = 0; ++ for (txh = list_head(&tx->tx_holds); txh; ++ txh = list_next(&tx->tx_holds, txh)) { ++ dnode_t *dn = txh->txh_dnode; ++ if (dn != NULL) { ++ mutex_enter(&dn->dn_mtx); ++ if (dn->dn_assigned_txg == tx->tx_txg - 1) { ++ mutex_exit(&dn->dn_mtx); ++ tx->tx_needassign_txh = txh; ++ DMU_TX_STAT_BUMP(dmu_tx_group); ++ return (ERESTART); ++ } ++ if (dn->dn_assigned_txg == 0) ++ dn->dn_assigned_txg = tx->tx_txg; ++ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); ++ (void) refcount_add(&dn->dn_tx_holds, tx); ++ mutex_exit(&dn->dn_mtx); ++ } ++ towrite += txh->txh_space_towrite; ++ tofree += txh->txh_space_tofree; ++ tooverwrite += txh->txh_space_tooverwrite; ++ tounref += txh->txh_space_tounref; ++ tohold += txh->txh_memory_tohold; ++ fudge += txh->txh_fudge; ++ } ++ ++ /* ++ * NB: This check must be after we've held the dnodes, so that ++ * the dmu_tx_unassign() logic will work properly ++ */ ++ if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) { ++ DMU_TX_STAT_BUMP(dmu_tx_how); ++ return (ERESTART); ++ } ++ ++ /* ++ * If a snapshot has been taken since we made our estimates, ++ * assume that we won't be able to free or overwrite anything. 
++ */ ++ if (tx->tx_objset && ++ dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) > ++ tx->tx_lastsnap_txg) { ++ towrite += tooverwrite; ++ tooverwrite = tofree = 0; ++ } ++ ++ /* needed allocation: worst-case estimate of write space */ ++ asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); ++ /* freed space estimate: worst-case overwrite + free estimate */ ++ fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; ++ /* convert unrefd space to worst-case estimate */ ++ usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); ++ /* calculate memory footprint estimate */ ++ memory = towrite + tooverwrite + tohold; ++ ++#ifdef DEBUG_DMU_TX ++ /* ++ * Add in 'tohold' to account for our dirty holds on this memory ++ * XXX - the "fudge" factor is to account for skipped blocks that ++ * we missed because dnode_next_offset() misses in-core-only blocks. ++ */ ++ tx->tx_space_towrite = asize + ++ spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge); ++ tx->tx_space_tofree = tofree; ++ tx->tx_space_tooverwrite = tooverwrite; ++ tx->tx_space_tounref = tounref; ++#endif ++ ++ if (tx->tx_dir && asize != 0) { ++ int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, ++ asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); ++ if (err) ++ return (err); ++ } ++ ++ DMU_TX_STAT_BUMP(dmu_tx_assigned); ++ ++ return (0); ++} ++ ++static void ++dmu_tx_unassign(dmu_tx_t *tx) ++{ ++ dmu_tx_hold_t *txh; ++ ++ if (tx->tx_txg == 0) ++ return; ++ ++ txg_rele_to_quiesce(&tx->tx_txgh); ++ ++ for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; ++ txh = list_next(&tx->tx_holds, txh)) { ++ dnode_t *dn = txh->txh_dnode; ++ ++ if (dn == NULL) ++ continue; ++ mutex_enter(&dn->dn_mtx); ++ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); ++ ++ if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { ++ dn->dn_assigned_txg = 0; ++ cv_broadcast(&dn->dn_notxholds); ++ } ++ mutex_exit(&dn->dn_mtx); ++ } ++ ++ txg_rele_to_sync(&tx->tx_txgh); ++ ++ tx->tx_lasttried_txg = tx->tx_txg; ++ tx->tx_txg = 0; ++} ++ ++/* ++ * Assign tx to a transaction group. txg_how can be one of: ++ * ++ * (1) TXG_WAIT. If the current open txg is full, waits until there's ++ * a new one. This should be used when you're not holding locks. ++ * If will only fail if we're truly out of space (or over quota). ++ * ++ * (2) TXG_NOWAIT. If we can't assign into the current open txg without ++ * blocking, returns immediately with ERESTART. This should be used ++ * whenever you're holding locks. On an ERESTART error, the caller ++ * should drop locks, do a dmu_tx_wait(tx), and try again. ++ * ++ * (3) A specific txg. Use this if you need to ensure that multiple ++ * transactions all sync in the same txg. Like TXG_NOWAIT, it ++ * returns ERESTART if it can't assign you into the requested txg. ++ */ ++int ++dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) ++{ ++ int err; ++ ++ ASSERT(tx->tx_txg == 0); ++ ASSERT(txg_how != 0); ++ ASSERT(!dsl_pool_sync_context(tx->tx_pool)); ++ ++ while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { ++ dmu_tx_unassign(tx); ++ ++ if (err != ERESTART || txg_how != TXG_WAIT) ++ return (err); ++ ++ dmu_tx_wait(tx); ++ } ++ ++ txg_rele_to_quiesce(&tx->tx_txgh); ++ ++ return (0); ++} ++ ++void ++dmu_tx_wait(dmu_tx_t *tx) ++{ ++ spa_t *spa = tx->tx_pool->dp_spa; ++ ++ ASSERT(tx->tx_txg == 0); ++ ++ /* ++ * It's possible that the pool has become active after this thread ++ * has tried to obtain a tx. If that's the case then his ++ * tx_lasttried_txg would not have been assigned. 
++ */ ++ if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { ++ txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1); ++ } else if (tx->tx_needassign_txh) { ++ dnode_t *dn = tx->tx_needassign_txh->txh_dnode; ++ ++ mutex_enter(&dn->dn_mtx); ++ while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) ++ cv_wait(&dn->dn_notxholds, &dn->dn_mtx); ++ mutex_exit(&dn->dn_mtx); ++ tx->tx_needassign_txh = NULL; ++ } else { ++ txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); ++ } ++} ++ ++void ++dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) ++{ ++#ifdef DEBUG_DMU_TX ++ if (tx->tx_dir == NULL || delta == 0) ++ return; ++ ++ if (delta > 0) { ++ ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=, ++ tx->tx_space_towrite); ++ (void) refcount_add_many(&tx->tx_space_written, delta, NULL); ++ } else { ++ (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL); ++ } ++#endif ++} ++ ++void ++dmu_tx_commit(dmu_tx_t *tx) ++{ ++ dmu_tx_hold_t *txh; ++ ++ ASSERT(tx->tx_txg != 0); ++ ++ while ((txh = list_head(&tx->tx_holds))) { ++ dnode_t *dn = txh->txh_dnode; ++ ++ list_remove(&tx->tx_holds, txh); ++ kmem_free(txh, sizeof (dmu_tx_hold_t)); ++ if (dn == NULL) ++ continue; ++ mutex_enter(&dn->dn_mtx); ++ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); ++ ++ if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { ++ dn->dn_assigned_txg = 0; ++ cv_broadcast(&dn->dn_notxholds); ++ } ++ mutex_exit(&dn->dn_mtx); ++ dnode_rele(dn, tx); ++ } ++ ++ if (tx->tx_tempreserve_cookie) ++ dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); ++ ++ if (!list_is_empty(&tx->tx_callbacks)) ++ txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); ++ ++ if (tx->tx_anyobj == FALSE) ++ txg_rele_to_sync(&tx->tx_txgh); ++ ++ list_destroy(&tx->tx_callbacks); ++ list_destroy(&tx->tx_holds); ++#ifdef DEBUG_DMU_TX ++ dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", ++ tx->tx_space_towrite, refcount_count(&tx->tx_space_written), ++ tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); ++ refcount_destroy_many(&tx->tx_space_written, ++ refcount_count(&tx->tx_space_written)); ++ refcount_destroy_many(&tx->tx_space_freed, ++ refcount_count(&tx->tx_space_freed)); ++#endif ++ kmem_free(tx, sizeof (dmu_tx_t)); ++} ++ ++void ++dmu_tx_abort(dmu_tx_t *tx) ++{ ++ dmu_tx_hold_t *txh; ++ ++ ASSERT(tx->tx_txg == 0); ++ ++ while ((txh = list_head(&tx->tx_holds))) { ++ dnode_t *dn = txh->txh_dnode; ++ ++ list_remove(&tx->tx_holds, txh); ++ kmem_free(txh, sizeof (dmu_tx_hold_t)); ++ if (dn != NULL) ++ dnode_rele(dn, tx); ++ } ++ ++ /* ++ * Call any registered callbacks with an error code. ++ */ ++ if (!list_is_empty(&tx->tx_callbacks)) ++ dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); ++ ++ list_destroy(&tx->tx_callbacks); ++ list_destroy(&tx->tx_holds); ++#ifdef DEBUG_DMU_TX ++ refcount_destroy_many(&tx->tx_space_written, ++ refcount_count(&tx->tx_space_written)); ++ refcount_destroy_many(&tx->tx_space_freed, ++ refcount_count(&tx->tx_space_freed)); ++#endif ++ kmem_free(tx, sizeof (dmu_tx_t)); ++} ++ ++uint64_t ++dmu_tx_get_txg(dmu_tx_t *tx) ++{ ++ ASSERT(tx->tx_txg != 0); ++ return (tx->tx_txg); ++} ++ ++void ++dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) ++{ ++ dmu_tx_callback_t *dcb; ++ ++ dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_PUSHPAGE); ++ ++ dcb->dcb_func = func; ++ dcb->dcb_data = data; ++ ++ list_insert_tail(&tx->tx_callbacks, dcb); ++} ++ ++/* ++ * Call all the commit callbacks on a list, with a given error code. 
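dmu_tx_callback_register() above, paired with the callback handling in dmu_tx_commit() and dmu_tx_abort(), lets a caller learn asynchronously whether its transaction reached stable storage. A small sketch of such a callback, with hypothetical names; the error values follow the code above (an aborted tx fires its callbacks immediately with ECANCELED, a committed tx hands them to the txg, which fires them with 0 once that txg has synced):

static void
example_commit_cb(void *arg, int error)
{
	boolean_t *on_disk = arg;	/* hypothetical caller state */

	/* error == 0 once the txg has synced, ECANCELED if the tx aborted */
	if (error == 0)
		*on_disk = B_TRUE;
}

It would be registered between a successful dmu_tx_assign() and dmu_tx_commit() with dmu_tx_callback_register(tx, example_commit_cb, &on_disk).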
++ */ ++void ++dmu_tx_do_callbacks(list_t *cb_list, int error) ++{ ++ dmu_tx_callback_t *dcb; ++ ++ while ((dcb = list_head(cb_list))) { ++ list_remove(cb_list, dcb); ++ dcb->dcb_func(dcb->dcb_data, error); ++ kmem_free(dcb, sizeof (dmu_tx_callback_t)); ++ } ++} ++ ++/* ++ * Interface to hold a bunch of attributes. ++ * used for creating new files. ++ * attrsize is the total size of all attributes ++ * to be added during object creation ++ * ++ * For updating/adding a single attribute dmu_tx_hold_sa() should be used. ++ */ ++ ++/* ++ * hold necessary attribute name for attribute registration. ++ * should be a very rare case where this is needed. If it does ++ * happen it would only happen on the first write to the file system. ++ */ ++static void ++dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) ++{ ++ int i; ++ ++ if (!sa->sa_need_attr_registration) ++ return; ++ ++ for (i = 0; i != sa->sa_num_attrs; i++) { ++ if (!sa->sa_attr_table[i].sa_registered) { ++ if (sa->sa_reg_attr_obj) ++ dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, ++ B_TRUE, sa->sa_attr_table[i].sa_name); ++ else ++ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, ++ B_TRUE, sa->sa_attr_table[i].sa_name); ++ } ++ } ++} ++ ++ ++void ++dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) ++{ ++ dnode_t *dn; ++ dmu_tx_hold_t *txh; ++ ++ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, ++ THT_SPILL, 0, 0); ++ ++ dn = txh->txh_dnode; ++ ++ if (dn == NULL) ++ return; ++ ++ /* If blkptr doesn't exist then add space to towrite */ ++ if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { ++ txh->txh_space_towrite += SPA_MAXBLOCKSIZE; ++ } else { ++ blkptr_t *bp; ++ ++ bp = &dn->dn_phys->dn_spill; ++ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, ++ bp, bp->blk_birth)) ++ txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; ++ else ++ txh->txh_space_towrite += SPA_MAXBLOCKSIZE; ++ if (!BP_IS_HOLE(bp)) ++ txh->txh_space_tounref += SPA_MAXBLOCKSIZE; ++ } ++} ++ ++void ++dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) ++{ ++ sa_os_t *sa = tx->tx_objset->os_sa; ++ ++ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); ++ ++ if (tx->tx_objset->os_sa->sa_master_obj == 0) ++ return; ++ ++ if (tx->tx_objset->os_sa->sa_layout_attr_obj) ++ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); ++ else { ++ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); ++ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); ++ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); ++ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); ++ } ++ ++ dmu_tx_sa_registration_hold(sa, tx); ++ ++ if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) ++ return; ++ ++ (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, ++ THT_SPILL, 0, 0); ++} ++ ++/* ++ * Hold SA attribute ++ * ++ * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) ++ * ++ * variable_size is the total size of all variable sized attributes ++ * passed to this function. It is not the total size of all ++ * variable size attributes that *may* exist on this object. 
++ */ ++void ++dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) ++{ ++ uint64_t object; ++ sa_os_t *sa = tx->tx_objset->os_sa; ++ ++ ASSERT(hdl != NULL); ++ ++ object = sa_handle_object(hdl); ++ ++ dmu_tx_hold_bonus(tx, object); ++ ++ if (tx->tx_objset->os_sa->sa_master_obj == 0) ++ return; ++ ++ if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || ++ tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { ++ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); ++ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); ++ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); ++ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); ++ } ++ ++ dmu_tx_sa_registration_hold(sa, tx); ++ ++ if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) ++ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); ++ ++ if (sa->sa_force_spill || may_grow || hdl->sa_spill) { ++ ASSERT(tx->tx_txg == 0); ++ dmu_tx_hold_spill(tx, object); ++ } else { ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; ++ dnode_t *dn; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ if (dn->dn_have_spill) { ++ ASSERT(tx->tx_txg == 0); ++ dmu_tx_hold_spill(tx, object); ++ } ++ DB_DNODE_EXIT(db); ++ } ++} ++ ++void ++dmu_tx_init(void) ++{ ++ dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc", ++ KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t), ++ KSTAT_FLAG_VIRTUAL); ++ ++ if (dmu_tx_ksp != NULL) { ++ dmu_tx_ksp->ks_data = &dmu_tx_stats; ++ kstat_install(dmu_tx_ksp); ++ } ++} ++ ++void ++dmu_tx_fini(void) ++{ ++ if (dmu_tx_ksp != NULL) { ++ kstat_delete(dmu_tx_ksp); ++ dmu_tx_ksp = NULL; ++ } ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(dmu_tx_create); ++EXPORT_SYMBOL(dmu_tx_hold_write); ++EXPORT_SYMBOL(dmu_tx_hold_free); ++EXPORT_SYMBOL(dmu_tx_hold_zap); ++EXPORT_SYMBOL(dmu_tx_hold_bonus); ++EXPORT_SYMBOL(dmu_tx_abort); ++EXPORT_SYMBOL(dmu_tx_assign); ++EXPORT_SYMBOL(dmu_tx_wait); ++EXPORT_SYMBOL(dmu_tx_commit); ++EXPORT_SYMBOL(dmu_tx_get_txg); ++EXPORT_SYMBOL(dmu_tx_callback_register); ++EXPORT_SYMBOL(dmu_tx_do_callbacks); ++EXPORT_SYMBOL(dmu_tx_hold_spill); ++EXPORT_SYMBOL(dmu_tx_hold_sa_create); ++EXPORT_SYMBOL(dmu_tx_hold_sa); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dmu_zfetch.c linux-3.2.33-go/fs/zfs/zfs/dmu_zfetch.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dmu_zfetch.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dmu_zfetch.c 2012-11-16 23:25:34.350039322 +0100 +@@ -0,0 +1,742 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * I'm against tune-ables, but these should probably exist as tweakable globals ++ * until we can get this working the way we want it to. ++ */ ++ ++int zfs_prefetch_disable = 0; ++ ++/* max # of streams per zfetch */ ++unsigned int zfetch_max_streams = 8; ++/* min time before stream reclaim */ ++unsigned int zfetch_min_sec_reap = 2; ++/* max number of blocks to fetch at a time */ ++unsigned int zfetch_block_cap = 256; ++/* number of bytes in a array_read at which we stop prefetching (1Mb) */ ++unsigned long zfetch_array_rd_sz = 1024 * 1024; ++ ++/* forward decls for static routines */ ++static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); ++static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); ++static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); ++static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); ++static int dmu_zfetch_find(zfetch_t *, zstream_t *, int); ++static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); ++static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); ++static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); ++static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *); ++ ++typedef struct zfetch_stats { ++ kstat_named_t zfetchstat_hits; ++ kstat_named_t zfetchstat_misses; ++ kstat_named_t zfetchstat_colinear_hits; ++ kstat_named_t zfetchstat_colinear_misses; ++ kstat_named_t zfetchstat_stride_hits; ++ kstat_named_t zfetchstat_stride_misses; ++ kstat_named_t zfetchstat_reclaim_successes; ++ kstat_named_t zfetchstat_reclaim_failures; ++ kstat_named_t zfetchstat_stream_resets; ++ kstat_named_t zfetchstat_stream_noresets; ++ kstat_named_t zfetchstat_bogus_streams; ++} zfetch_stats_t; ++ ++static zfetch_stats_t zfetch_stats = { ++ { "hits", KSTAT_DATA_UINT64 }, ++ { "misses", KSTAT_DATA_UINT64 }, ++ { "colinear_hits", KSTAT_DATA_UINT64 }, ++ { "colinear_misses", KSTAT_DATA_UINT64 }, ++ { "stride_hits", KSTAT_DATA_UINT64 }, ++ { "stride_misses", KSTAT_DATA_UINT64 }, ++ { "reclaim_successes", KSTAT_DATA_UINT64 }, ++ { "reclaim_failures", KSTAT_DATA_UINT64 }, ++ { "streams_resets", KSTAT_DATA_UINT64 }, ++ { "streams_noresets", KSTAT_DATA_UINT64 }, ++ { "bogus_streams", KSTAT_DATA_UINT64 }, ++}; ++ ++#define ZFETCHSTAT_INCR(stat, val) \ ++ atomic_add_64(&zfetch_stats.stat.value.ui64, (val)); ++ ++#define ZFETCHSTAT_BUMP(stat) ZFETCHSTAT_INCR(stat, 1); ++ ++kstat_t *zfetch_ksp; ++ ++/* ++ * Given a zfetch structure and a zstream structure, determine whether the ++ * blocks to be read are part of a co-linear pair of existing prefetch ++ * streams. If a set is found, coalesce the streams, removing one, and ++ * configure the prefetch so it looks for a strided access pattern. ++ * ++ * In other words: if we find two sequential access streams that are ++ * the same length and distance N appart, and this read is N from the ++ * last stream, then we are probably in a strided access pattern. So ++ * combine the two sequential streams into a single strided stream. ++ * ++ * If no co-linear streams are found, return NULL. ++ */ ++static int ++dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) ++{ ++ zstream_t *z_walk; ++ zstream_t *z_comp; ++ ++ if (! 
rw_tryenter(&zf->zf_rwlock, RW_WRITER)) ++ return (0); ++ ++ if (zh == NULL) { ++ rw_exit(&zf->zf_rwlock); ++ return (0); ++ } ++ ++ for (z_walk = list_head(&zf->zf_stream); z_walk; ++ z_walk = list_next(&zf->zf_stream, z_walk)) { ++ for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp; ++ z_comp = list_next(&zf->zf_stream, z_comp)) { ++ int64_t diff; ++ ++ if (z_walk->zst_len != z_walk->zst_stride || ++ z_comp->zst_len != z_comp->zst_stride) { ++ continue; ++ } ++ ++ diff = z_comp->zst_offset - z_walk->zst_offset; ++ if (z_comp->zst_offset + diff == zh->zst_offset) { ++ z_walk->zst_offset = zh->zst_offset; ++ z_walk->zst_direction = diff < 0 ? -1 : 1; ++ z_walk->zst_stride = ++ diff * z_walk->zst_direction; ++ z_walk->zst_ph_offset = ++ zh->zst_offset + z_walk->zst_stride; ++ dmu_zfetch_stream_remove(zf, z_comp); ++ mutex_destroy(&z_comp->zst_lock); ++ kmem_free(z_comp, sizeof (zstream_t)); ++ ++ dmu_zfetch_dofetch(zf, z_walk); ++ ++ rw_exit(&zf->zf_rwlock); ++ return (1); ++ } ++ ++ diff = z_walk->zst_offset - z_comp->zst_offset; ++ if (z_walk->zst_offset + diff == zh->zst_offset) { ++ z_walk->zst_offset = zh->zst_offset; ++ z_walk->zst_direction = diff < 0 ? -1 : 1; ++ z_walk->zst_stride = ++ diff * z_walk->zst_direction; ++ z_walk->zst_ph_offset = ++ zh->zst_offset + z_walk->zst_stride; ++ dmu_zfetch_stream_remove(zf, z_comp); ++ mutex_destroy(&z_comp->zst_lock); ++ kmem_free(z_comp, sizeof (zstream_t)); ++ ++ dmu_zfetch_dofetch(zf, z_walk); ++ ++ rw_exit(&zf->zf_rwlock); ++ return (1); ++ } ++ } ++ } ++ ++ rw_exit(&zf->zf_rwlock); ++ return (0); ++} ++ ++/* ++ * Given a zstream_t, determine the bounds of the prefetch. Then call the ++ * routine that actually prefetches the individual blocks. ++ */ ++static void ++dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs) ++{ ++ uint64_t prefetch_tail; ++ uint64_t prefetch_limit; ++ uint64_t prefetch_ofst; ++ uint64_t prefetch_len; ++ uint64_t blocks_fetched; ++ ++ zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len); ++ zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap); ++ ++ prefetch_tail = MAX((int64_t)zs->zst_ph_offset, ++ (int64_t)(zs->zst_offset + zs->zst_stride)); ++ /* ++ * XXX: use a faster division method? ++ */ ++ prefetch_limit = zs->zst_offset + zs->zst_len + ++ (zs->zst_cap * zs->zst_stride) / zs->zst_len; ++ ++ while (prefetch_tail < prefetch_limit) { ++ prefetch_ofst = zs->zst_offset + zs->zst_direction * ++ (prefetch_tail - zs->zst_offset); ++ ++ prefetch_len = zs->zst_len; ++ ++ /* ++ * Don't prefetch beyond the end of the file, if working ++ * backwards. 
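The two symmetric branches of dmu_zfetch_colinear() above reduce to a single arithmetic test: the new read has to sit exactly one inter-stream distance beyond one of the two candidate streams. A standalone model of that test, offsets only; model_colinear() is my name, and the real code additionally requires both streams to be purely sequential (zst_len == zst_stride) before merging them:

#include <stdint.h>

/*
 * Two sequential streams at offsets a and b, and a new read at `next':
 * if the three are evenly spaced, treat them as one strided stream.
 * (Model only; the actual merge is done by dmu_zfetch_colinear() above.)
 */
static int
model_colinear(int64_t a, int64_t b, int64_t next)
{
	int64_t diff = b - a;

	return (diff != 0 && (b + diff == next || a - diff == next));
}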
++ */ ++ if ((zs->zst_direction == ZFETCH_BACKWARD) && ++ (prefetch_ofst > prefetch_tail)) { ++ prefetch_len += prefetch_ofst; ++ prefetch_ofst = 0; ++ } ++ ++ /* don't prefetch more than we're supposed to */ ++ if (prefetch_len > zs->zst_len) ++ break; ++ ++ blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode, ++ prefetch_ofst, zs->zst_len); ++ ++ prefetch_tail += zs->zst_stride; ++ /* stop if we've run out of stuff to prefetch */ ++ if (blocks_fetched < zs->zst_len) ++ break; ++ } ++ zs->zst_ph_offset = prefetch_tail; ++ zs->zst_last = ddi_get_lbolt(); ++} ++ ++void ++zfetch_init(void) ++{ ++ ++ zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", ++ KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), ++ KSTAT_FLAG_VIRTUAL); ++ ++ if (zfetch_ksp != NULL) { ++ zfetch_ksp->ks_data = &zfetch_stats; ++ kstat_install(zfetch_ksp); ++ } ++} ++ ++void ++zfetch_fini(void) ++{ ++ if (zfetch_ksp != NULL) { ++ kstat_delete(zfetch_ksp); ++ zfetch_ksp = NULL; ++ } ++} ++ ++/* ++ * This takes a pointer to a zfetch structure and a dnode. It performs the ++ * necessary setup for the zfetch structure, grokking data from the ++ * associated dnode. ++ */ ++void ++dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) ++{ ++ if (zf == NULL) { ++ return; ++ } ++ ++ zf->zf_dnode = dno; ++ zf->zf_stream_cnt = 0; ++ zf->zf_alloc_fail = 0; ++ ++ list_create(&zf->zf_stream, sizeof (zstream_t), ++ offsetof(zstream_t, zst_node)); ++ ++ rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); ++} ++ ++/* ++ * This function computes the actual size, in blocks, that can be prefetched, ++ * and fetches it. ++ */ ++static uint64_t ++dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) ++{ ++ uint64_t fetchsz; ++ uint64_t i; ++ ++ fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); ++ ++ for (i = 0; i < fetchsz; i++) { ++ dbuf_prefetch(dn, blkid + i); ++ } ++ ++ return (fetchsz); ++} ++ ++/* ++ * this function returns the number of blocks that would be prefetched, based ++ * upon the supplied dnode, blockid, and nblks. This is used so that we can ++ * update streams in place, and then prefetch with their old value after the ++ * fact. This way, we can delay the prefetch, but subsequent accesses to the ++ * stream won't result in the same data being prefetched multiple times. ++ */ ++static uint64_t ++dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) ++{ ++ uint64_t fetchsz; ++ ++ if (blkid > dn->dn_maxblkid) { ++ return (0); ++ } ++ ++ /* compute fetch size */ ++ if (blkid + nblks + 1 > dn->dn_maxblkid) { ++ fetchsz = (dn->dn_maxblkid - blkid) + 1; ++ ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid); ++ } else { ++ fetchsz = nblks; ++ } ++ ++ ++ return (fetchsz); ++} ++ ++/* ++ * given a zfetch and a zstream structure, see if there is an associated zstream ++ * for this block read. If so, it starts a prefetch for the stream it ++ * located and returns true, otherwise it returns false ++ */ ++static int ++dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) ++{ ++ zstream_t *zs; ++ int64_t diff; ++ int reset = !prefetched; ++ int rc = 0; ++ ++ if (zh == NULL) ++ return (0); ++ ++ /* ++ * XXX: This locking strategy is a bit coarse; however, it's impact has ++ * yet to be tested. If this turns out to be an issue, it can be ++ * modified in a number of different ways. ++ */ ++ ++ rw_enter(&zf->zf_rwlock, RW_READER); ++top: ++ ++ for (zs = list_head(&zf->zf_stream); zs; ++ zs = list_next(&zf->zf_stream, zs)) { ++ ++ /* ++ * XXX - should this be an assert? 
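dmu_zfetch_fetchsz() above clamps a prefetch request so it never runs past the last allocated block of the dnode. A self-contained model of that clamp in plain C, with no dnode_t and a name of my own choosing:

#include <stdint.h>

/* Never prefetch past maxblkid; mirrors dmu_zfetch_fetchsz() above. */
static uint64_t
model_fetchsz(uint64_t maxblkid, uint64_t blkid, uint64_t nblks)
{
	if (blkid > maxblkid)
		return (0);
	if (blkid + nblks + 1 > maxblkid)
		return ((maxblkid - blkid) + 1);
	return (nblks);
}

For example, with maxblkid = 10 a request for 8 blocks starting at block 5 is trimmed to 6 blocks (5 through 10).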
++ */ ++ if (zs->zst_len == 0) { ++ /* bogus stream */ ++ ZFETCHSTAT_BUMP(zfetchstat_bogus_streams); ++ continue; ++ } ++ ++ /* ++ * We hit this case when we are in a strided prefetch stream: ++ * we will read "len" blocks before "striding". ++ */ ++ if (zh->zst_offset >= zs->zst_offset && ++ zh->zst_offset < zs->zst_offset + zs->zst_len) { ++ if (prefetched) { ++ /* already fetched */ ++ ZFETCHSTAT_BUMP(zfetchstat_stride_hits); ++ rc = 1; ++ goto out; ++ } else { ++ ZFETCHSTAT_BUMP(zfetchstat_stride_misses); ++ } ++ } ++ ++ /* ++ * This is the forward sequential read case: we increment ++ * len by one each time we hit here, so we will enter this ++ * case on every read. ++ */ ++ if (zh->zst_offset == zs->zst_offset + zs->zst_len) { ++ ++ reset = !prefetched && zs->zst_len > 1; ++ ++ mutex_enter(&zs->zst_lock); ++ ++ if (zh->zst_offset != zs->zst_offset + zs->zst_len) { ++ mutex_exit(&zs->zst_lock); ++ goto top; ++ } ++ zs->zst_len += zh->zst_len; ++ diff = zs->zst_len - zfetch_block_cap; ++ if (diff > 0) { ++ zs->zst_offset += diff; ++ zs->zst_len = zs->zst_len > diff ? ++ zs->zst_len - diff : 0; ++ } ++ zs->zst_direction = ZFETCH_FORWARD; ++ ++ break; ++ ++ /* ++ * Same as above, but reading backwards through the file. ++ */ ++ } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) { ++ /* backwards sequential access */ ++ ++ reset = !prefetched && zs->zst_len > 1; ++ ++ mutex_enter(&zs->zst_lock); ++ ++ if (zh->zst_offset != zs->zst_offset - zh->zst_len) { ++ mutex_exit(&zs->zst_lock); ++ goto top; ++ } ++ ++ zs->zst_offset = zs->zst_offset > zh->zst_len ? ++ zs->zst_offset - zh->zst_len : 0; ++ zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ? ++ zs->zst_ph_offset - zh->zst_len : 0; ++ zs->zst_len += zh->zst_len; ++ ++ diff = zs->zst_len - zfetch_block_cap; ++ if (diff > 0) { ++ zs->zst_ph_offset = zs->zst_ph_offset > diff ? ++ zs->zst_ph_offset - diff : 0; ++ zs->zst_len = zs->zst_len > diff ? ++ zs->zst_len - diff : zs->zst_len; ++ } ++ zs->zst_direction = ZFETCH_BACKWARD; ++ ++ break; ++ ++ } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride < ++ zs->zst_len) && (zs->zst_len != zs->zst_stride)) { ++ /* strided forward access */ ++ ++ mutex_enter(&zs->zst_lock); ++ ++ if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >= ++ zs->zst_len) || (zs->zst_len == zs->zst_stride)) { ++ mutex_exit(&zs->zst_lock); ++ goto top; ++ } ++ ++ zs->zst_offset += zs->zst_stride; ++ zs->zst_direction = ZFETCH_FORWARD; ++ ++ break; ++ ++ } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride < ++ zs->zst_len) && (zs->zst_len != zs->zst_stride)) { ++ /* strided reverse access */ ++ ++ mutex_enter(&zs->zst_lock); ++ ++ if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >= ++ zs->zst_len) || (zs->zst_len == zs->zst_stride)) { ++ mutex_exit(&zs->zst_lock); ++ goto top; ++ } ++ ++ zs->zst_offset = zs->zst_offset > zs->zst_stride ? ++ zs->zst_offset - zs->zst_stride : 0; ++ zs->zst_ph_offset = (zs->zst_ph_offset > ++ (2 * zs->zst_stride)) ? ++ (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0; ++ zs->zst_direction = ZFETCH_BACKWARD; ++ ++ break; ++ } ++ } ++ ++ if (zs) { ++ if (reset) { ++ zstream_t *remove = zs; ++ ++ ZFETCHSTAT_BUMP(zfetchstat_stream_resets); ++ rc = 0; ++ mutex_exit(&zs->zst_lock); ++ rw_exit(&zf->zf_rwlock); ++ rw_enter(&zf->zf_rwlock, RW_WRITER); ++ /* ++ * Relocate the stream, in case someone removes ++ * it while we were acquiring the WRITER lock. 
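The chain of cases above is easier to see as a pure classification of the incoming read (zh) against an existing stream (zs). A standalone model keeping the same unsigned comparisons (the strided tests rely on uint64 wrap-around to reject far-away offsets, exactly as in the original); the enum and function name are mine, and all locking, stream updates and cap/trim arithmetic are omitted:

#include <stdint.h>

enum zf_case { ZF_NONE, ZF_WITHIN, ZF_SEQ_FWD, ZF_SEQ_BWD,
    ZF_STRIDE_FWD, ZF_STRIDE_BWD };

static enum zf_case
model_classify(uint64_t h_off, uint64_t h_len,
    uint64_t s_off, uint64_t s_len, uint64_t s_stride)
{
	if (h_off >= s_off && h_off < s_off + s_len)
		return (ZF_WITHIN);	/* inside the current run */
	if (h_off == s_off + s_len)
		return (ZF_SEQ_FWD);	/* next block, reading forward */
	if (h_off == s_off - h_len)
		return (ZF_SEQ_BWD);	/* previous block, reading backward */
	if (s_len != s_stride && h_off - s_off - s_stride < s_len)
		return (ZF_STRIDE_FWD);	/* one stride ahead of the stream */
	if (s_len != s_stride && h_off - s_off + s_stride < s_len)
		return (ZF_STRIDE_BWD);	/* one stride behind the stream */
	return (ZF_NONE);
}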
++ */ ++ for (zs = list_head(&zf->zf_stream); zs; ++ zs = list_next(&zf->zf_stream, zs)) { ++ if (zs == remove) { ++ dmu_zfetch_stream_remove(zf, zs); ++ mutex_destroy(&zs->zst_lock); ++ kmem_free(zs, sizeof (zstream_t)); ++ break; ++ } ++ } ++ } else { ++ ZFETCHSTAT_BUMP(zfetchstat_stream_noresets); ++ rc = 1; ++ dmu_zfetch_dofetch(zf, zs); ++ mutex_exit(&zs->zst_lock); ++ } ++ } ++out: ++ rw_exit(&zf->zf_rwlock); ++ return (rc); ++} ++ ++/* ++ * Clean-up state associated with a zfetch structure. This frees allocated ++ * structure members, empties the zf_stream tree, and generally makes things ++ * nice. This doesn't free the zfetch_t itself, that's left to the caller. ++ */ ++void ++dmu_zfetch_rele(zfetch_t *zf) ++{ ++ zstream_t *zs; ++ zstream_t *zs_next; ++ ++ ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); ++ ++ for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) { ++ zs_next = list_next(&zf->zf_stream, zs); ++ ++ list_remove(&zf->zf_stream, zs); ++ mutex_destroy(&zs->zst_lock); ++ kmem_free(zs, sizeof (zstream_t)); ++ } ++ list_destroy(&zf->zf_stream); ++ rw_destroy(&zf->zf_rwlock); ++ ++ zf->zf_dnode = NULL; ++} ++ ++/* ++ * Given a zfetch and zstream structure, insert the zstream structure into the ++ * AVL tree contained within the zfetch structure. Peform the appropriate ++ * book-keeping. It is possible that another thread has inserted a stream which ++ * matches one that we are about to insert, so we must be sure to check for this ++ * case. If one is found, return failure, and let the caller cleanup the ++ * duplicates. ++ */ ++static int ++dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs) ++{ ++ zstream_t *zs_walk; ++ zstream_t *zs_next; ++ ++ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); ++ ++ for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) { ++ zs_next = list_next(&zf->zf_stream, zs_walk); ++ ++ if (dmu_zfetch_streams_equal(zs_walk, zs)) { ++ return (0); ++ } ++ } ++ ++ list_insert_head(&zf->zf_stream, zs); ++ zf->zf_stream_cnt++; ++ return (1); ++} ++ ++ ++/* ++ * Walk the list of zstreams in the given zfetch, find an old one (by time), and ++ * reclaim it for use by the caller. ++ */ ++static zstream_t * ++dmu_zfetch_stream_reclaim(zfetch_t *zf) ++{ ++ zstream_t *zs; ++ ++ if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) ++ return (0); ++ ++ for (zs = list_head(&zf->zf_stream); zs; ++ zs = list_next(&zf->zf_stream, zs)) { ++ ++ if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap) ++ break; ++ } ++ ++ if (zs) { ++ dmu_zfetch_stream_remove(zf, zs); ++ mutex_destroy(&zs->zst_lock); ++ bzero(zs, sizeof (zstream_t)); ++ } else { ++ zf->zf_alloc_fail++; ++ } ++ rw_exit(&zf->zf_rwlock); ++ ++ return (zs); ++} ++ ++/* ++ * Given a zfetch and zstream structure, remove the zstream structure from its ++ * container in the zfetch structure. Perform the appropriate book-keeping. 
++ */ ++static void ++dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) ++{ ++ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); ++ ++ list_remove(&zf->zf_stream, zs); ++ zf->zf_stream_cnt--; ++} ++ ++static int ++dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2) ++{ ++ if (zs1->zst_offset != zs2->zst_offset) ++ return (0); ++ ++ if (zs1->zst_len != zs2->zst_len) ++ return (0); ++ ++ if (zs1->zst_stride != zs2->zst_stride) ++ return (0); ++ ++ if (zs1->zst_ph_offset != zs2->zst_ph_offset) ++ return (0); ++ ++ if (zs1->zst_cap != zs2->zst_cap) ++ return (0); ++ ++ if (zs1->zst_direction != zs2->zst_direction) ++ return (0); ++ ++ return (1); ++} ++ ++/* ++ * This is the prefetch entry point. It calls all of the other dmu_zfetch ++ * routines to create, delete, find, or operate upon prefetch streams. ++ */ ++void ++dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) ++{ ++ zstream_t zst; ++ zstream_t *newstream; ++ int fetched; ++ int inserted; ++ unsigned int blkshft; ++ uint64_t blksz; ++ ++ if (zfs_prefetch_disable) ++ return; ++ ++ /* files that aren't ln2 blocksz are only one block -- nothing to do */ ++ if (!zf->zf_dnode->dn_datablkshift) ++ return; ++ ++ /* convert offset and size, into blockid and nblocks */ ++ blkshft = zf->zf_dnode->dn_datablkshift; ++ blksz = (1 << blkshft); ++ ++ bzero(&zst, sizeof (zstream_t)); ++ zst.zst_offset = offset >> blkshft; ++ zst.zst_len = (P2ROUNDUP(offset + size, blksz) - ++ P2ALIGN(offset, blksz)) >> blkshft; ++ ++ fetched = dmu_zfetch_find(zf, &zst, prefetched); ++ if (fetched) { ++ ZFETCHSTAT_BUMP(zfetchstat_hits); ++ } else { ++ ZFETCHSTAT_BUMP(zfetchstat_misses); ++ if ((fetched = dmu_zfetch_colinear(zf, &zst))) { ++ ZFETCHSTAT_BUMP(zfetchstat_colinear_hits); ++ } else { ++ ZFETCHSTAT_BUMP(zfetchstat_colinear_misses); ++ } ++ } ++ ++ if (!fetched) { ++ newstream = dmu_zfetch_stream_reclaim(zf); ++ ++ /* ++ * we still couldn't find a stream, drop the lock, and allocate ++ * one if possible. Otherwise, give up and go home. 
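Before any stream matching happens, dmu_zfetch() above converts the byte-level (offset, size) of the access into a block id and block count. A self-contained model with P2ALIGN/P2ROUNDUP expanded by hand (power-of-two block size assumed, which the early return on a zero dn_datablkshift already guarantees); the names are mine:

#include <stdint.h>

static void
model_to_blocks(uint64_t offset, uint64_t size, unsigned blkshift,
    uint64_t *blkid, uint64_t *nblks)
{
	uint64_t blksz = 1ULL << blkshift;
	uint64_t start = offset & ~(blksz - 1);			/* P2ALIGN */
	uint64_t end = (offset + size + blksz - 1) & ~(blksz - 1);	/* P2ROUNDUP */

	*blkid = offset >> blkshift;		/* first block touched */
	*nblks = (end - start) >> blkshift;	/* number of blocks touched */
}

For example, a 4 KiB read at offset 5 KiB in a file with 4 KiB blocks yields blkid = 1 and nblks = 2.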
++ */ ++ if (newstream) { ++ ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes); ++ } else { ++ uint64_t maxblocks; ++ uint32_t max_streams; ++ uint32_t cur_streams; ++ ++ ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures); ++ cur_streams = zf->zf_stream_cnt; ++ maxblocks = zf->zf_dnode->dn_maxblkid; ++ ++ max_streams = MIN(zfetch_max_streams, ++ (maxblocks / zfetch_block_cap)); ++ if (max_streams == 0) { ++ max_streams++; ++ } ++ ++ if (cur_streams >= max_streams) { ++ return; ++ } ++ newstream = kmem_zalloc(sizeof (zstream_t), KM_PUSHPAGE); ++ } ++ ++ newstream->zst_offset = zst.zst_offset; ++ newstream->zst_len = zst.zst_len; ++ newstream->zst_stride = zst.zst_len; ++ newstream->zst_ph_offset = zst.zst_len + zst.zst_offset; ++ newstream->zst_cap = zst.zst_len; ++ newstream->zst_direction = ZFETCH_FORWARD; ++ newstream->zst_last = ddi_get_lbolt(); ++ ++ mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ rw_enter(&zf->zf_rwlock, RW_WRITER); ++ inserted = dmu_zfetch_stream_insert(zf, newstream); ++ rw_exit(&zf->zf_rwlock); ++ ++ if (!inserted) { ++ mutex_destroy(&newstream->zst_lock); ++ kmem_free(newstream, sizeof (zstream_t)); ++ } ++ } ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++module_param(zfs_prefetch_disable, int, 0644); ++MODULE_PARM_DESC(zfs_prefetch_disable, "Disable all ZFS prefetching"); ++ ++module_param(zfetch_max_streams, uint, 0644); ++MODULE_PARM_DESC(zfetch_max_streams, "Max number of streams per zfetch"); ++ ++module_param(zfetch_min_sec_reap, uint, 0644); ++MODULE_PARM_DESC(zfetch_min_sec_reap, "Min time before stream reclaim"); ++ ++module_param(zfetch_block_cap, uint, 0644); ++MODULE_PARM_DESC(zfetch_block_cap, "Max number of blocks to fetch at a time"); ++ ++module_param(zfetch_array_rd_sz, ulong, 0644); ++MODULE_PARM_DESC(zfetch_array_rd_sz, "Number of bytes in a array_read"); ++#endif ++ +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dnode.c linux-3.2.33-go/fs/zfs/zfs/dnode.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dnode.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dnode.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,1994 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static int free_range_compar(const void *node1, const void *node2); ++ ++static kmem_cache_t *dnode_cache; ++/* ++ * Define DNODE_STATS to turn on statistic gathering. By default, it is only ++ * turned on when DEBUG is also defined. 
++ */ ++#ifdef DEBUG ++#define DNODE_STATS ++#endif /* DEBUG */ ++ ++#ifdef DNODE_STATS ++#define DNODE_STAT_ADD(stat) ((stat)++) ++#else ++#define DNODE_STAT_ADD(stat) /* nothing */ ++#endif /* DNODE_STATS */ ++ ++ASSERTV(static dnode_phys_t dnode_phys_zero); ++ ++int zfs_default_bs = SPA_MINBLOCKSHIFT; ++int zfs_default_ibs = DN_MAX_INDBLKSHIFT; ++ ++#ifdef _KERNEL ++static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); ++#endif /* _KERNEL */ ++ ++/* ARGSUSED */ ++static int ++dnode_cons(void *arg, void *unused, int kmflag) ++{ ++ dnode_t *dn = arg; ++ int i; ++ ++ rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); ++ mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); ++ cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); ++ ++ refcount_create(&dn->dn_holds); ++ refcount_create(&dn->dn_tx_holds); ++ list_link_init(&dn->dn_link); ++ ++ bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); ++ bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); ++ bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); ++ bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); ++ bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); ++ bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); ++ bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); ++ ++ for (i = 0; i < TXG_SIZE; i++) { ++ list_link_init(&dn->dn_dirty_link[i]); ++ avl_create(&dn->dn_ranges[i], free_range_compar, ++ sizeof (free_range_t), ++ offsetof(struct free_range, fr_node)); ++ list_create(&dn->dn_dirty_records[i], ++ sizeof (dbuf_dirty_record_t), ++ offsetof(dbuf_dirty_record_t, dr_dirty_node)); ++ } ++ ++ dn->dn_allocated_txg = 0; ++ dn->dn_free_txg = 0; ++ dn->dn_assigned_txg = 0; ++ dn->dn_dirtyctx = 0; ++ dn->dn_dirtyctx_firstset = NULL; ++ dn->dn_bonus = NULL; ++ dn->dn_have_spill = B_FALSE; ++ dn->dn_zio = NULL; ++ dn->dn_oldused = 0; ++ dn->dn_oldflags = 0; ++ dn->dn_olduid = 0; ++ dn->dn_oldgid = 0; ++ dn->dn_newuid = 0; ++ dn->dn_newgid = 0; ++ dn->dn_id_flags = 0; ++ ++ dn->dn_dbufs_count = 0; ++ list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), ++ offsetof(dmu_buf_impl_t, db_link)); ++ ++ dn->dn_moved = 0; ++ return (0); ++} ++ ++/* ARGSUSED */ ++static void ++dnode_dest(void *arg, void *unused) ++{ ++ int i; ++ dnode_t *dn = arg; ++ ++ rw_destroy(&dn->dn_struct_rwlock); ++ mutex_destroy(&dn->dn_mtx); ++ mutex_destroy(&dn->dn_dbufs_mtx); ++ cv_destroy(&dn->dn_notxholds); ++ refcount_destroy(&dn->dn_holds); ++ refcount_destroy(&dn->dn_tx_holds); ++ ASSERT(!list_link_active(&dn->dn_link)); ++ ++ for (i = 0; i < TXG_SIZE; i++) { ++ ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ++ avl_destroy(&dn->dn_ranges[i]); ++ list_destroy(&dn->dn_dirty_records[i]); ++ ASSERT3U(dn->dn_next_nblkptr[i], ==, 0); ++ ASSERT3U(dn->dn_next_nlevels[i], ==, 0); ++ ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); ++ ASSERT3U(dn->dn_next_bonustype[i], ==, 0); ++ ASSERT3U(dn->dn_rm_spillblk[i], ==, 0); ++ ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); ++ ASSERT3U(dn->dn_next_blksz[i], ==, 0); ++ } ++ ++ ASSERT3U(dn->dn_allocated_txg, ==, 0); ++ ASSERT3U(dn->dn_free_txg, ==, 0); ++ ASSERT3U(dn->dn_assigned_txg, ==, 0); ++ ASSERT3U(dn->dn_dirtyctx, ==, 0); ++ ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); ++ ASSERT3P(dn->dn_bonus, ==, NULL); ++ ASSERT(!dn->dn_have_spill); ++ ASSERT3P(dn->dn_zio, ==, NULL); ++ ASSERT3U(dn->dn_oldused, ==, 0); ++ ASSERT3U(dn->dn_oldflags, ==, 0); ++ ASSERT3U(dn->dn_olduid, ==, 0); ++ 
ASSERT3U(dn->dn_oldgid, ==, 0); ++ ASSERT3U(dn->dn_newuid, ==, 0); ++ ASSERT3U(dn->dn_newgid, ==, 0); ++ ASSERT3U(dn->dn_id_flags, ==, 0); ++ ++ ASSERT3U(dn->dn_dbufs_count, ==, 0); ++ list_destroy(&dn->dn_dbufs); ++} ++ ++void ++dnode_init(void) ++{ ++ ASSERT(dnode_cache == NULL); ++ dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), ++ 0, dnode_cons, dnode_dest, NULL, NULL, NULL, KMC_KMEM); ++ kmem_cache_set_move(dnode_cache, dnode_move); ++} ++ ++void ++dnode_fini(void) ++{ ++ kmem_cache_destroy(dnode_cache); ++ dnode_cache = NULL; ++} ++ ++ ++#ifdef ZFS_DEBUG ++void ++dnode_verify(dnode_t *dn) ++{ ++ int drop_struct_lock = FALSE; ++ ++ ASSERT(dn->dn_phys); ++ ASSERT(dn->dn_objset); ++ ASSERT(dn->dn_handle->dnh_dnode == dn); ++ ++ ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); ++ ++ if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) ++ return; ++ ++ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ drop_struct_lock = TRUE; ++ } ++ if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { ++ int i; ++ ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); ++ if (dn->dn_datablkshift) { ++ ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); ++ ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); ++ ASSERT3U(1<dn_datablkshift, ==, dn->dn_datablksz); ++ } ++ ASSERT3U(dn->dn_nlevels, <=, 30); ++ ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES); ++ ASSERT3U(dn->dn_nblkptr, >=, 1); ++ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ++ ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); ++ ASSERT3U(dn->dn_datablksz, ==, ++ dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); ++ ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); ++ ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + ++ dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); ++ for (i = 0; i < TXG_SIZE; i++) { ++ ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); ++ } ++ } ++ if (dn->dn_phys->dn_type != DMU_OT_NONE) ++ ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); ++ ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); ++ if (dn->dn_dbuf != NULL) { ++ ASSERT3P(dn->dn_phys, ==, ++ (dnode_phys_t *)dn->dn_dbuf->db.db_data + ++ (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); ++ } ++ if (drop_struct_lock) ++ rw_exit(&dn->dn_struct_rwlock); ++} ++#endif ++ ++void ++dnode_byteswap(dnode_phys_t *dnp) ++{ ++ uint64_t *buf64 = (void*)&dnp->dn_blkptr; ++ int i; ++ ++ if (dnp->dn_type == DMU_OT_NONE) { ++ bzero(dnp, sizeof (dnode_phys_t)); ++ return; ++ } ++ ++ dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); ++ dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); ++ dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); ++ dnp->dn_used = BSWAP_64(dnp->dn_used); ++ ++ /* ++ * dn_nblkptr is only one byte, so it's OK to read it in either ++ * byte order. We can't read dn_bouslen. ++ */ ++ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); ++ ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); ++ for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) ++ buf64[i] = BSWAP_64(buf64[i]); ++ ++ /* ++ * OK to check dn_bonuslen for zero, because it won't matter if ++ * we have the wrong byte order. This is necessary because the ++ * dnode dnode is smaller than a regular dnode. ++ */ ++ if (dnp->dn_bonuslen != 0) { ++ /* ++ * Note that the bonus length calculated here may be ++ * longer than the actual bonus buffer. This is because ++ * we always put the bonus buffer after the last block ++ * pointer (instead of packing it against the end of the ++ * dnode buffer). 
++		 */
++		int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
++		size_t len = DN_MAX_BONUSLEN - off;
++		ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
++		dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
++	}
++
++	/* Swap SPILL block if we have one */
++	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
++		byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
++
++}
++
++void
++dnode_buf_byteswap(void *vbuf, size_t size)
++{
++	dnode_phys_t *buf = vbuf;
++	int i;
++
++	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
++
++	size >>= DNODE_SHIFT;
++	for (i = 0; i < size; i++) {
++		dnode_byteswap(buf);
++		buf++;
++	}
++}
++
++static int
++free_range_compar(const void *node1, const void *node2)
++{
++	const free_range_t *rp1 = node1;
++	const free_range_t *rp2 = node2;
++
++	if (rp1->fr_blkid < rp2->fr_blkid)
++		return (-1);
++	else if (rp1->fr_blkid > rp2->fr_blkid)
++		return (1);
++	else return (0);
++}
++
++void
++dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
++{
++	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
++
++	dnode_setdirty(dn, tx);
++	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
++	ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
++	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
++	dn->dn_bonuslen = newsize;
++	if (newsize == 0)
++		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
++	else
++		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
++	rw_exit(&dn->dn_struct_rwlock);
++}
++
++void
++dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
++{
++	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
++	dnode_setdirty(dn, tx);
++	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
++	dn->dn_bonustype = newtype;
++	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
++	rw_exit(&dn->dn_struct_rwlock);
++}
++
++void
++dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
++{
++	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
++	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
++	dnode_setdirty(dn, tx);
++	dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
++	dn->dn_have_spill = B_FALSE;
++}
++
++static void
++dnode_setdblksz(dnode_t *dn, int size)
++{
++	ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
++	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
++	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
++	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
++	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
++	dn->dn_datablksz = size;
++	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
++	dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
++}
++
++static dnode_t *
++dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
++    uint64_t object, dnode_handle_t *dnh)
++{
++	dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_PUSHPAGE);
++
++	ASSERT(!POINTER_IS_VALID(dn->dn_objset));
++	dn->dn_moved = 0;
++
++	/*
++	 * Defer setting dn_objset until the dnode is ready to be a candidate
++	 * for the dnode_move() callback.
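dnode_setdblksz() above only records a data block shift when the block size is a power of two; otherwise dn_datablkshift stays 0, which is what makes dmu_zfetch() skip such dnodes earlier in this patch. A standalone model of that computation with ISP2()/highbit() expanded by hand (a GCC/Clang builtin stands in for log2; the name is mine):

#include <stdint.h>

static int
model_datablkshift(uint64_t size)
{
	if (size == 0 || (size & (size - 1)) != 0)
		return (0);			/* not a power of two */
	return (__builtin_ctzll(size));		/* log2(size) */
}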
++ */ ++ dn->dn_object = object; ++ dn->dn_dbuf = db; ++ dn->dn_handle = dnh; ++ dn->dn_phys = dnp; ++ ++ if (dnp->dn_datablkszsec) { ++ dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ++ } else { ++ dn->dn_datablksz = 0; ++ dn->dn_datablkszsec = 0; ++ dn->dn_datablkshift = 0; ++ } ++ dn->dn_indblkshift = dnp->dn_indblkshift; ++ dn->dn_nlevels = dnp->dn_nlevels; ++ dn->dn_type = dnp->dn_type; ++ dn->dn_nblkptr = dnp->dn_nblkptr; ++ dn->dn_checksum = dnp->dn_checksum; ++ dn->dn_compress = dnp->dn_compress; ++ dn->dn_bonustype = dnp->dn_bonustype; ++ dn->dn_bonuslen = dnp->dn_bonuslen; ++ dn->dn_maxblkid = dnp->dn_maxblkid; ++ dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); ++ dn->dn_id_flags = 0; ++ ++ dmu_zfetch_init(&dn->dn_zfetch, dn); ++ ++ ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); ++ ++ mutex_enter(&os->os_lock); ++ list_insert_head(&os->os_dnodes, dn); ++ membar_producer(); ++ /* ++ * Everything else must be valid before assigning dn_objset makes the ++ * dnode eligible for dnode_move(). ++ */ ++ dn->dn_objset = os; ++ mutex_exit(&os->os_lock); ++ ++ arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); ++ return (dn); ++} ++ ++/* ++ * Caller must be holding the dnode handle, which is released upon return. ++ */ ++static void ++dnode_destroy(dnode_t *dn) ++{ ++ objset_t *os = dn->dn_objset; ++ ++ ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); ++ ++ mutex_enter(&os->os_lock); ++ POINTER_INVALIDATE(&dn->dn_objset); ++ list_remove(&os->os_dnodes, dn); ++ mutex_exit(&os->os_lock); ++ ++ /* the dnode can no longer move, so we can release the handle */ ++ zrl_remove(&dn->dn_handle->dnh_zrlock); ++ ++ dn->dn_allocated_txg = 0; ++ dn->dn_free_txg = 0; ++ dn->dn_assigned_txg = 0; ++ ++ dn->dn_dirtyctx = 0; ++ if (dn->dn_dirtyctx_firstset != NULL) { ++ kmem_free(dn->dn_dirtyctx_firstset, 1); ++ dn->dn_dirtyctx_firstset = NULL; ++ } ++ if (dn->dn_bonus != NULL) { ++ mutex_enter(&dn->dn_bonus->db_mtx); ++ dbuf_evict(dn->dn_bonus); ++ dn->dn_bonus = NULL; ++ } ++ dn->dn_zio = NULL; ++ ++ dn->dn_have_spill = B_FALSE; ++ dn->dn_oldused = 0; ++ dn->dn_oldflags = 0; ++ dn->dn_olduid = 0; ++ dn->dn_oldgid = 0; ++ dn->dn_newuid = 0; ++ dn->dn_newgid = 0; ++ dn->dn_id_flags = 0; ++ ++ dmu_zfetch_rele(&dn->dn_zfetch); ++ kmem_cache_free(dnode_cache, dn); ++ arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); ++} ++ ++void ++dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) ++{ ++ int i; ++ ++ if (blocksize == 0) ++ blocksize = 1 << zfs_default_bs; ++ else if (blocksize > SPA_MAXBLOCKSIZE) ++ blocksize = SPA_MAXBLOCKSIZE; ++ else ++ blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); ++ ++ if (ibs == 0) ++ ibs = zfs_default_ibs; ++ ++ ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); ++ ++ dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset, ++ dn->dn_object, tx->tx_txg, blocksize, ibs); ++ ++ ASSERT(dn->dn_type == DMU_OT_NONE); ++ ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); ++ ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ++ ASSERT(ot != DMU_OT_NONE); ++ ASSERT3U(ot, <, DMU_OT_NUMTYPES); ++ ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || ++ (bonustype == DMU_OT_SA && bonuslen == 0) || ++ (bonustype != DMU_OT_NONE && bonuslen != 0)); ++ ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); ++ ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); ++ ASSERT(dn->dn_type == DMU_OT_NONE); ++ ASSERT3U(dn->dn_maxblkid, ==, 0); ++ 
ASSERT3U(dn->dn_allocated_txg, ==, 0); ++ ASSERT3U(dn->dn_assigned_txg, ==, 0); ++ ASSERT(refcount_is_zero(&dn->dn_tx_holds)); ++ ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); ++ ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); ++ ++ for (i = 0; i < TXG_SIZE; i++) { ++ ASSERT3U(dn->dn_next_nblkptr[i], ==, 0); ++ ASSERT3U(dn->dn_next_nlevels[i], ==, 0); ++ ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); ++ ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); ++ ASSERT3U(dn->dn_next_bonustype[i], ==, 0); ++ ASSERT3U(dn->dn_rm_spillblk[i], ==, 0); ++ ASSERT3U(dn->dn_next_blksz[i], ==, 0); ++ ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ++ ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); ++ ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0); ++ } ++ ++ dn->dn_type = ot; ++ dnode_setdblksz(dn, blocksize); ++ dn->dn_indblkshift = ibs; ++ dn->dn_nlevels = 1; ++ if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ ++ dn->dn_nblkptr = 1; ++ else ++ dn->dn_nblkptr = 1 + ++ ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); ++ dn->dn_bonustype = bonustype; ++ dn->dn_bonuslen = bonuslen; ++ dn->dn_checksum = ZIO_CHECKSUM_INHERIT; ++ dn->dn_compress = ZIO_COMPRESS_INHERIT; ++ dn->dn_dirtyctx = 0; ++ ++ dn->dn_free_txg = 0; ++ if (dn->dn_dirtyctx_firstset) { ++ kmem_free(dn->dn_dirtyctx_firstset, 1); ++ dn->dn_dirtyctx_firstset = NULL; ++ } ++ ++ dn->dn_allocated_txg = tx->tx_txg; ++ dn->dn_id_flags = 0; ++ ++ dnode_setdirty(dn, tx); ++ dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; ++ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; ++ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; ++ dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; ++} ++ ++void ++dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) ++{ ++ int nblkptr; ++ ++ ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); ++ ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE); ++ ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0); ++ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); ++ ASSERT(tx->tx_txg != 0); ++ ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || ++ (bonustype != DMU_OT_NONE && bonuslen != 0) || ++ (bonustype == DMU_OT_SA && bonuslen == 0)); ++ ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); ++ ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); ++ ++ /* clean up any unreferenced dbufs */ ++ dnode_evict_dbufs(dn); ++ ++ dn->dn_id_flags = 0; ++ ++ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ++ dnode_setdirty(dn, tx); ++ if (dn->dn_datablksz != blocksize) { ++ /* change blocksize */ ++ ASSERT(dn->dn_maxblkid == 0 && ++ (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || ++ dnode_block_freed(dn, 0))); ++ dnode_setdblksz(dn, blocksize); ++ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; ++ } ++ if (dn->dn_bonuslen != bonuslen) ++ dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; ++ ++ if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ ++ nblkptr = 1; ++ else ++ nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); ++ if (dn->dn_bonustype != bonustype) ++ dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; ++ if (dn->dn_nblkptr != nblkptr) ++ dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; ++ if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { ++ dbuf_rm_spill(dn, tx); ++ dnode_rm_spill(dn, tx); ++ } ++ rw_exit(&dn->dn_struct_rwlock); ++ ++ /* change type */ ++ dn->dn_type = ot; ++ ++ /* change bonus size and type */ ++ mutex_enter(&dn->dn_mtx); ++ dn->dn_bonustype = bonustype; ++ 
dn->dn_bonuslen = bonuslen; ++ dn->dn_nblkptr = nblkptr; ++ dn->dn_checksum = ZIO_CHECKSUM_INHERIT; ++ dn->dn_compress = ZIO_COMPRESS_INHERIT; ++ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ++ ++ /* fix up the bonus db_size */ ++ if (dn->dn_bonus) { ++ dn->dn_bonus->db.db_size = ++ DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ++ ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); ++ } ++ ++ dn->dn_allocated_txg = tx->tx_txg; ++ mutex_exit(&dn->dn_mtx); ++} ++ ++#ifdef _KERNEL ++#ifdef DNODE_STATS ++static struct { ++ uint64_t dms_dnode_invalid; ++ uint64_t dms_dnode_recheck1; ++ uint64_t dms_dnode_recheck2; ++ uint64_t dms_dnode_special; ++ uint64_t dms_dnode_handle; ++ uint64_t dms_dnode_rwlock; ++ uint64_t dms_dnode_active; ++} dnode_move_stats; ++#endif /* DNODE_STATS */ ++ ++static void ++dnode_move_impl(dnode_t *odn, dnode_t *ndn) ++{ ++ int i; ++ ++ ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); ++ ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); ++ ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); ++ ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); ++ ++ /* Copy fields. */ ++ ndn->dn_objset = odn->dn_objset; ++ ndn->dn_object = odn->dn_object; ++ ndn->dn_dbuf = odn->dn_dbuf; ++ ndn->dn_handle = odn->dn_handle; ++ ndn->dn_phys = odn->dn_phys; ++ ndn->dn_type = odn->dn_type; ++ ndn->dn_bonuslen = odn->dn_bonuslen; ++ ndn->dn_bonustype = odn->dn_bonustype; ++ ndn->dn_nblkptr = odn->dn_nblkptr; ++ ndn->dn_checksum = odn->dn_checksum; ++ ndn->dn_compress = odn->dn_compress; ++ ndn->dn_nlevels = odn->dn_nlevels; ++ ndn->dn_indblkshift = odn->dn_indblkshift; ++ ndn->dn_datablkshift = odn->dn_datablkshift; ++ ndn->dn_datablkszsec = odn->dn_datablkszsec; ++ ndn->dn_datablksz = odn->dn_datablksz; ++ ndn->dn_maxblkid = odn->dn_maxblkid; ++ bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], ++ sizeof (odn->dn_next_nblkptr)); ++ bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], ++ sizeof (odn->dn_next_nlevels)); ++ bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], ++ sizeof (odn->dn_next_indblkshift)); ++ bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], ++ sizeof (odn->dn_next_bonustype)); ++ bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], ++ sizeof (odn->dn_rm_spillblk)); ++ bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], ++ sizeof (odn->dn_next_bonuslen)); ++ bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], ++ sizeof (odn->dn_next_blksz)); ++ for (i = 0; i < TXG_SIZE; i++) { ++ list_move_tail(&ndn->dn_dirty_records[i], ++ &odn->dn_dirty_records[i]); ++ } ++ bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges)); ++ ndn->dn_allocated_txg = odn->dn_allocated_txg; ++ ndn->dn_free_txg = odn->dn_free_txg; ++ ndn->dn_assigned_txg = odn->dn_assigned_txg; ++ ndn->dn_dirtyctx = odn->dn_dirtyctx; ++ ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; ++ ASSERT(refcount_count(&odn->dn_tx_holds) == 0); ++ refcount_transfer(&ndn->dn_holds, &odn->dn_holds); ++ ASSERT(list_is_empty(&ndn->dn_dbufs)); ++ list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs); ++ ndn->dn_dbufs_count = odn->dn_dbufs_count; ++ ndn->dn_bonus = odn->dn_bonus; ++ ndn->dn_have_spill = odn->dn_have_spill; ++ ndn->dn_zio = odn->dn_zio; ++ ndn->dn_oldused = odn->dn_oldused; ++ ndn->dn_oldflags = odn->dn_oldflags; ++ ndn->dn_olduid = odn->dn_olduid; ++ ndn->dn_oldgid = odn->dn_oldgid; ++ ndn->dn_newuid = odn->dn_newuid; ++ ndn->dn_newgid = odn->dn_newgid; ++ ndn->dn_id_flags = odn->dn_id_flags; ++ dmu_zfetch_init(&ndn->dn_zfetch, NULL); ++ 
list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); ++ ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; ++ ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt; ++ ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail; ++ ++ /* ++ * Update back pointers. Updating the handle fixes the back pointer of ++ * every descendant dbuf as well as the bonus dbuf. ++ */ ++ ASSERT(ndn->dn_handle->dnh_dnode == odn); ++ ndn->dn_handle->dnh_dnode = ndn; ++ if (ndn->dn_zfetch.zf_dnode == odn) { ++ ndn->dn_zfetch.zf_dnode = ndn; ++ } ++ ++ /* ++ * Invalidate the original dnode by clearing all of its back pointers. ++ */ ++ odn->dn_dbuf = NULL; ++ odn->dn_handle = NULL; ++ list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t), ++ offsetof(dmu_buf_impl_t, db_link)); ++ odn->dn_dbufs_count = 0; ++ odn->dn_bonus = NULL; ++ odn->dn_zfetch.zf_dnode = NULL; ++ ++ /* ++ * Set the low bit of the objset pointer to ensure that dnode_move() ++ * recognizes the dnode as invalid in any subsequent callback. ++ */ ++ POINTER_INVALIDATE(&odn->dn_objset); ++ ++ /* ++ * Satisfy the destructor. ++ */ ++ for (i = 0; i < TXG_SIZE; i++) { ++ list_create(&odn->dn_dirty_records[i], ++ sizeof (dbuf_dirty_record_t), ++ offsetof(dbuf_dirty_record_t, dr_dirty_node)); ++ odn->dn_ranges[i].avl_root = NULL; ++ odn->dn_ranges[i].avl_numnodes = 0; ++ odn->dn_next_nlevels[i] = 0; ++ odn->dn_next_indblkshift[i] = 0; ++ odn->dn_next_bonustype[i] = 0; ++ odn->dn_rm_spillblk[i] = 0; ++ odn->dn_next_bonuslen[i] = 0; ++ odn->dn_next_blksz[i] = 0; ++ } ++ odn->dn_allocated_txg = 0; ++ odn->dn_free_txg = 0; ++ odn->dn_assigned_txg = 0; ++ odn->dn_dirtyctx = 0; ++ odn->dn_dirtyctx_firstset = NULL; ++ odn->dn_have_spill = B_FALSE; ++ odn->dn_zio = NULL; ++ odn->dn_oldused = 0; ++ odn->dn_oldflags = 0; ++ odn->dn_olduid = 0; ++ odn->dn_oldgid = 0; ++ odn->dn_newuid = 0; ++ odn->dn_newgid = 0; ++ odn->dn_id_flags = 0; ++ ++ /* ++ * Mark the dnode. ++ */ ++ ndn->dn_moved = 1; ++ odn->dn_moved = (uint8_t)-1; ++} ++ ++/*ARGSUSED*/ ++static kmem_cbrc_t ++dnode_move(void *buf, void *newbuf, size_t size, void *arg) ++{ ++ dnode_t *odn = buf, *ndn = newbuf; ++ objset_t *os; ++ int64_t refcount; ++ uint32_t dbufs; ++ ++ /* ++ * The dnode is on the objset's list of known dnodes if the objset ++ * pointer is valid. We set the low bit of the objset pointer when ++ * freeing the dnode to invalidate it, and the memory patterns written ++ * by kmem (baddcafe and deadbeef) set at least one of the two low bits. ++ * A newly created dnode sets the objset pointer last of all to indicate ++ * that the dnode is known and in a valid state to be moved by this ++ * function. ++ */ ++ os = odn->dn_objset; ++ if (!POINTER_IS_VALID(os)) { ++ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); ++ return (KMEM_CBRC_DONT_KNOW); ++ } ++ ++ /* ++ * Ensure that the objset does not go away during the move. ++ */ ++ rw_enter(&os_lock, RW_WRITER); ++ if (os != odn->dn_objset) { ++ rw_exit(&os_lock); ++ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); ++ return (KMEM_CBRC_DONT_KNOW); ++ } ++ ++ /* ++ * If the dnode is still valid, then so is the objset. We know that no ++ * valid objset can be freed while we hold os_lock, so we can safely ++ * ensure that the objset remains in use. ++ */ ++ mutex_enter(&os->os_lock); ++ ++ /* ++ * Recheck the objset pointer in case the dnode was removed just before ++ * acquiring the lock. 
++ */ ++ if (os != odn->dn_objset) { ++ mutex_exit(&os->os_lock); ++ rw_exit(&os_lock); ++ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); ++ return (KMEM_CBRC_DONT_KNOW); ++ } ++ ++ /* ++ * At this point we know that as long as we hold os->os_lock, the dnode ++ * cannot be freed and fields within the dnode can be safely accessed. ++ * The objset listing this dnode cannot go away as long as this dnode is ++ * on its list. ++ */ ++ rw_exit(&os_lock); ++ if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { ++ mutex_exit(&os->os_lock); ++ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); ++ return (KMEM_CBRC_NO); ++ } ++ ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ ++ ++ /* ++ * Lock the dnode handle to prevent the dnode from obtaining any new ++ * holds. This also prevents the descendant dbufs and the bonus dbuf ++ * from accessing the dnode, so that we can discount their holds. The ++ * handle is safe to access because we know that while the dnode cannot ++ * go away, neither can its handle. Once we hold dnh_zrlock, we can ++ * safely move any dnode referenced only by dbufs. ++ */ ++ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { ++ mutex_exit(&os->os_lock); ++ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); ++ return (KMEM_CBRC_LATER); ++ } ++ ++ /* ++ * Ensure a consistent view of the dnode's holds and the dnode's dbufs. ++ * We need to guarantee that there is a hold for every dbuf in order to ++ * determine whether the dnode is actively referenced. Falsely matching ++ * a dbuf to an active hold would lead to an unsafe move. It's possible ++ * that a thread already having an active dnode hold is about to add a ++ * dbuf, and we can't compare hold and dbuf counts while the add is in ++ * progress. ++ */ ++ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { ++ zrl_exit(&odn->dn_handle->dnh_zrlock); ++ mutex_exit(&os->os_lock); ++ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); ++ return (KMEM_CBRC_LATER); ++ } ++ ++ /* ++ * A dbuf may be removed (evicted) without an active dnode hold. In that ++ * case, the dbuf count is decremented under the handle lock before the ++ * dbuf's hold is released. This order ensures that if we count the hold ++ * after the dbuf is removed but before its hold is released, we will ++ * treat the unmatched hold as active and exit safely. If we count the ++ * hold before the dbuf is removed, the hold is discounted, and the ++ * removal is blocked until the move completes. ++ */ ++ refcount = refcount_count(&odn->dn_holds); ++ ASSERT(refcount >= 0); ++ dbufs = odn->dn_dbufs_count; ++ ++ /* We can't have more dbufs than dnode holds. */ ++ ASSERT3U(dbufs, <=, refcount); ++ DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, ++ uint32_t, dbufs); ++ ++ if (refcount > dbufs) { ++ rw_exit(&odn->dn_struct_rwlock); ++ zrl_exit(&odn->dn_handle->dnh_zrlock); ++ mutex_exit(&os->os_lock); ++ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); ++ return (KMEM_CBRC_LATER); ++ } ++ ++ rw_exit(&odn->dn_struct_rwlock); ++ ++ /* ++ * At this point we know that anyone with a hold on the dnode is not ++ * actively referencing it. The dnode is known and in a valid state to ++ * move. We're holding the locks needed to execute the critical section. ++ */ ++ dnode_move_impl(odn, ndn); ++ ++ list_link_replace(&odn->dn_link, &ndn->dn_link); ++ /* If the dnode was safe to move, the refcount cannot have changed. 
*/ ++ ASSERT(refcount == refcount_count(&ndn->dn_holds)); ++ ASSERT(dbufs == ndn->dn_dbufs_count); ++ zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ ++ mutex_exit(&os->os_lock); ++ ++ return (KMEM_CBRC_YES); ++} ++#endif /* _KERNEL */ ++ ++void ++dnode_special_close(dnode_handle_t *dnh) ++{ ++ dnode_t *dn = dnh->dnh_dnode; ++ ++ /* ++ * Wait for final references to the dnode to clear. This can ++ * only happen if the arc is asyncronously evicting state that ++ * has a hold on this dnode while we are trying to evict this ++ * dnode. ++ */ ++ while (refcount_count(&dn->dn_holds) > 0) ++ delay(1); ++ zrl_add(&dnh->dnh_zrlock); ++ dnode_destroy(dn); /* implicit zrl_remove() */ ++ zrl_destroy(&dnh->dnh_zrlock); ++ dnh->dnh_dnode = NULL; ++} ++ ++dnode_t * ++dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, ++ dnode_handle_t *dnh) ++{ ++ dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh); ++ dnh->dnh_dnode = dn; ++ zrl_init(&dnh->dnh_zrlock); ++ DNODE_VERIFY(dn); ++ return (dn); ++} ++ ++static void ++dnode_buf_pageout(dmu_buf_t *db, void *arg) ++{ ++ dnode_children_t *children_dnodes = arg; ++ int i; ++ int epb = db->db_size >> DNODE_SHIFT; ++ ++ ASSERT(epb == children_dnodes->dnc_count); ++ ++ for (i = 0; i < epb; i++) { ++ dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; ++ dnode_t *dn; ++ ++ /* ++ * The dnode handle lock guards against the dnode moving to ++ * another valid address, so there is no need here to guard ++ * against changes to or from NULL. ++ */ ++ if (dnh->dnh_dnode == NULL) { ++ zrl_destroy(&dnh->dnh_zrlock); ++ continue; ++ } ++ ++ zrl_add(&dnh->dnh_zrlock); ++ dn = dnh->dnh_dnode; ++ /* ++ * If there are holds on this dnode, then there should ++ * be holds on the dnode's containing dbuf as well; thus ++ * it wouldn't be eligible for eviction and this function ++ * would not have been called. ++ */ ++ ASSERT(refcount_is_zero(&dn->dn_holds)); ++ ASSERT(refcount_is_zero(&dn->dn_tx_holds)); ++ ++ dnode_destroy(dn); /* implicit zrl_remove() */ ++ zrl_destroy(&dnh->dnh_zrlock); ++ dnh->dnh_dnode = NULL; ++ } ++ kmem_free(children_dnodes, sizeof (dnode_children_t) + ++ (epb - 1) * sizeof (dnode_handle_t)); ++} ++ ++/* ++ * errors: ++ * EINVAL - invalid object number. ++ * EIO - i/o error. ++ * succeeds even for free dnodes. ++ */ ++int ++dnode_hold_impl(objset_t *os, uint64_t object, int flag, ++ void *tag, dnode_t **dnp) ++{ ++ int epb, idx, err; ++ int drop_struct_lock = FALSE; ++ int type; ++ uint64_t blk; ++ dnode_t *mdn, *dn; ++ dmu_buf_impl_t *db; ++ dnode_children_t *children_dnodes; ++ dnode_handle_t *dnh; ++ ++ /* ++ * If you are holding the spa config lock as writer, you shouldn't ++ * be asking the DMU to do *anything* unless it's the root pool ++ * which may require us to read from the root filesystem while ++ * holding some (not all) of the locks as writer. ++ */ ++ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || ++ (spa_is_root(os->os_spa) && ++ spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); ++ ++ if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { ++ dn = (object == DMU_USERUSED_OBJECT) ? 
++ DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os); ++ if (dn == NULL) ++ return (ENOENT); ++ type = dn->dn_type; ++ if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ++ return (ENOENT); ++ if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) ++ return (EEXIST); ++ DNODE_VERIFY(dn); ++ (void) refcount_add(&dn->dn_holds, tag); ++ *dnp = dn; ++ return (0); ++ } ++ ++ if (object == 0 || object >= DN_MAX_OBJECT) ++ return (EINVAL); ++ ++ mdn = DMU_META_DNODE(os); ++ ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); ++ ++ DNODE_VERIFY(mdn); ++ ++ if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { ++ rw_enter(&mdn->dn_struct_rwlock, RW_READER); ++ drop_struct_lock = TRUE; ++ } ++ ++ blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); ++ ++ db = dbuf_hold(mdn, blk, FTAG); ++ if (drop_struct_lock) ++ rw_exit(&mdn->dn_struct_rwlock); ++ if (db == NULL) ++ return (EIO); ++ err = dbuf_read(db, NULL, DB_RF_CANFAIL); ++ if (err) { ++ dbuf_rele(db, FTAG); ++ return (err); ++ } ++ ++ ASSERT3U(db->db.db_size, >=, 1<db.db_size >> DNODE_SHIFT; ++ ++ idx = object & (epb-1); ++ ++ ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); ++ children_dnodes = dmu_buf_get_user(&db->db); ++ if (children_dnodes == NULL) { ++ int i; ++ dnode_children_t *winner; ++ children_dnodes = kmem_alloc(sizeof (dnode_children_t) + ++ (epb - 1) * sizeof (dnode_handle_t), ++ KM_PUSHPAGE | KM_NODEBUG); ++ children_dnodes->dnc_count = epb; ++ dnh = &children_dnodes->dnc_children[0]; ++ for (i = 0; i < epb; i++) { ++ zrl_init(&dnh[i].dnh_zrlock); ++ dnh[i].dnh_dnode = NULL; ++ } ++ if ((winner = dmu_buf_set_user(&db->db, children_dnodes, NULL, ++ dnode_buf_pageout))) { ++ kmem_free(children_dnodes, sizeof (dnode_children_t) + ++ (epb - 1) * sizeof (dnode_handle_t)); ++ children_dnodes = winner; ++ } ++ } ++ ASSERT(children_dnodes->dnc_count == epb); ++ ++ dnh = &children_dnodes->dnc_children[idx]; ++ zrl_add(&dnh->dnh_zrlock); ++ if ((dn = dnh->dnh_dnode) == NULL) { ++ dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; ++ dnode_t *winner; ++ ++ dn = dnode_create(os, phys, db, object, dnh); ++ winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn); ++ if (winner != NULL) { ++ zrl_add(&dnh->dnh_zrlock); ++ dnode_destroy(dn); /* implicit zrl_remove() */ ++ dn = winner; ++ } ++ } ++ ++ mutex_enter(&dn->dn_mtx); ++ type = dn->dn_type; ++ if (dn->dn_free_txg || ++ ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || ++ ((flag & DNODE_MUST_BE_FREE) && ++ (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) { ++ mutex_exit(&dn->dn_mtx); ++ zrl_remove(&dnh->dnh_zrlock); ++ dbuf_rele(db, FTAG); ++ return (type == DMU_OT_NONE ? ENOENT : EEXIST); ++ } ++ mutex_exit(&dn->dn_mtx); ++ ++ if (refcount_add(&dn->dn_holds, tag) == 1) ++ dbuf_add_ref(db, dnh); ++ /* Now we can rely on the hold to prevent the dnode from moving. */ ++ zrl_remove(&dnh->dnh_zrlock); ++ ++ DNODE_VERIFY(dn); ++ ASSERT3P(dn->dn_dbuf, ==, db); ++ ASSERT3U(dn->dn_object, ==, object); ++ dbuf_rele(db, FTAG); ++ ++ *dnp = dn; ++ return (0); ++} ++ ++/* ++ * Return held dnode if the object is allocated, NULL if not. ++ */ ++int ++dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) ++{ ++ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); ++} ++ ++/* ++ * Can only add a reference if there is already at least one ++ * reference on the dnode. Returns FALSE if unable to add a ++ * new reference. 
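dnode_add_ref() below only takes a hold if at least one hold already exists, so a dnode whose count has reached zero (and may be mid-eviction) is never resurrected. The same "get-unless-zero" idiom, sketched with C11 atomics purely for illustration (the real code performs the check under dn_mtx rather than with a CAS loop):

#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference only if at least one reference already exists. */
static bool
ref_get_unless_zero(atomic_long *refs)
{
        long old = atomic_load(refs);

        while (old != 0) {
                /* bump only if the count is still what we saw */
                if (atomic_compare_exchange_weak(refs, &old, old + 1))
                        return (true);
        }
        return (false);
}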
++ */ ++boolean_t ++dnode_add_ref(dnode_t *dn, void *tag) ++{ ++ mutex_enter(&dn->dn_mtx); ++ if (refcount_is_zero(&dn->dn_holds)) { ++ mutex_exit(&dn->dn_mtx); ++ return (FALSE); ++ } ++ VERIFY(1 < refcount_add(&dn->dn_holds, tag)); ++ mutex_exit(&dn->dn_mtx); ++ return (TRUE); ++} ++ ++void ++dnode_rele(dnode_t *dn, void *tag) ++{ ++ uint64_t refs; ++ /* Get while the hold prevents the dnode from moving. */ ++ dmu_buf_impl_t *db = dn->dn_dbuf; ++ dnode_handle_t *dnh = dn->dn_handle; ++ ++ mutex_enter(&dn->dn_mtx); ++ refs = refcount_remove(&dn->dn_holds, tag); ++ mutex_exit(&dn->dn_mtx); ++ ++ /* ++ * It's unsafe to release the last hold on a dnode by dnode_rele() or ++ * indirectly by dbuf_rele() while relying on the dnode handle to ++ * prevent the dnode from moving, since releasing the last hold could ++ * result in the dnode's parent dbuf evicting its dnode handles. For ++ * that reason anyone calling dnode_rele() or dbuf_rele() without some ++ * other direct or indirect hold on the dnode must first drop the dnode ++ * handle. ++ */ ++ ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); ++ ++ /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ ++ if (refs == 0 && db != NULL) { ++ /* ++ * Another thread could add a hold to the dnode handle in ++ * dnode_hold_impl() while holding the parent dbuf. Since the ++ * hold on the parent dbuf prevents the handle from being ++ * destroyed, the hold on the handle is OK. We can't yet assert ++ * that the handle has zero references, but that will be ++ * asserted anyway when the handle gets destroyed. ++ */ ++ dbuf_rele(db, dnh); ++ } ++} ++ ++void ++dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) ++{ ++ objset_t *os = dn->dn_objset; ++ uint64_t txg = tx->tx_txg; ++ ++ if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { ++ dsl_dataset_dirty(os->os_dsl_dataset, tx); ++ return; ++ } ++ ++ DNODE_VERIFY(dn); ++ ++#ifdef ZFS_DEBUG ++ mutex_enter(&dn->dn_mtx); ++ ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); ++ ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); ++ mutex_exit(&dn->dn_mtx); ++#endif ++ ++ /* ++ * Determine old uid/gid when necessary ++ */ ++ dmu_objset_userquota_get_ids(dn, B_TRUE, tx); ++ ++ mutex_enter(&os->os_lock); ++ ++ /* ++ * If we are already marked dirty, we're done. ++ */ ++ if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { ++ mutex_exit(&os->os_lock); ++ return; ++ } ++ ++ ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); ++ ASSERT(dn->dn_datablksz != 0); ++ ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0); ++ ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0); ++ ASSERT3U(dn->dn_next_bonustype[txg&TXG_MASK], ==, 0); ++ ++ dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", ++ dn->dn_object, txg); ++ ++ if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) { ++ list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn); ++ } else { ++ list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn); ++ } ++ ++ mutex_exit(&os->os_lock); ++ ++ /* ++ * The dnode maintains a hold on its containing dbuf as ++ * long as there are holds on it. Each instantiated child ++ * dbuf maintains a hold on the dnode. When the last child ++ * drops its hold, the dnode will drop its hold on the ++ * containing dbuf. We add a "dirty hold" here so that the ++ * dnode will hang around after we finish processing its ++ * children. 
++ */ ++ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); ++ ++ (void) dbuf_dirty(dn->dn_dbuf, tx); ++ ++ dsl_dataset_dirty(os->os_dsl_dataset, tx); ++} ++ ++void ++dnode_free(dnode_t *dn, dmu_tx_t *tx) ++{ ++ int txgoff = tx->tx_txg & TXG_MASK; ++ ++ dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg); ++ ++ /* we should be the only holder... hopefully */ ++ /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */ ++ ++ mutex_enter(&dn->dn_mtx); ++ if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { ++ mutex_exit(&dn->dn_mtx); ++ return; ++ } ++ dn->dn_free_txg = tx->tx_txg; ++ mutex_exit(&dn->dn_mtx); ++ ++ /* ++ * If the dnode is already dirty, it needs to be moved from ++ * the dirty list to the free list. ++ */ ++ mutex_enter(&dn->dn_objset->os_lock); ++ if (list_link_active(&dn->dn_dirty_link[txgoff])) { ++ list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn); ++ list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn); ++ mutex_exit(&dn->dn_objset->os_lock); ++ } else { ++ mutex_exit(&dn->dn_objset->os_lock); ++ dnode_setdirty(dn, tx); ++ } ++} ++ ++/* ++ * Try to change the block size for the indicated dnode. This can only ++ * succeed if there are no blocks allocated or dirty beyond first block ++ */ ++int ++dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) ++{ ++ dmu_buf_impl_t *db, *db_next; ++ int err; ++ ++ if (size == 0) ++ size = SPA_MINBLOCKSIZE; ++ if (size > SPA_MAXBLOCKSIZE) ++ size = SPA_MAXBLOCKSIZE; ++ else ++ size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); ++ ++ if (ibs == dn->dn_indblkshift) ++ ibs = 0; ++ ++ if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) ++ return (0); ++ ++ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ++ ++ /* Check for any allocated blocks beyond the first */ ++ if (dn->dn_phys->dn_maxblkid != 0) ++ goto fail; ++ ++ mutex_enter(&dn->dn_dbufs_mtx); ++ for (db = list_head(&dn->dn_dbufs); db; db = db_next) { ++ db_next = list_next(&dn->dn_dbufs, db); ++ ++ if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && ++ db->db_blkid != DMU_SPILL_BLKID) { ++ mutex_exit(&dn->dn_dbufs_mtx); ++ goto fail; ++ } ++ } ++ mutex_exit(&dn->dn_dbufs_mtx); ++ ++ if (ibs && dn->dn_nlevels != 1) ++ goto fail; ++ ++ /* resize the old block */ ++ err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); ++ if (err == 0) ++ dbuf_new_size(db, size, tx); ++ else if (err != ENOENT) ++ goto fail; ++ ++ dnode_setdblksz(dn, size); ++ dnode_setdirty(dn, tx); ++ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; ++ if (ibs) { ++ dn->dn_indblkshift = ibs; ++ dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; ++ } ++ /* rele after we have fixed the blocksize in the dnode */ ++ if (db) ++ dbuf_rele(db, FTAG); ++ ++ rw_exit(&dn->dn_struct_rwlock); ++ return (0); ++ ++fail: ++ rw_exit(&dn->dn_struct_rwlock); ++ return (ENOTSUP); ++} ++ ++/* read-holding callers must not rely on the lock being continuously held */ ++void ++dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) ++{ ++ uint64_t txgoff = tx->tx_txg & TXG_MASK; ++ int epbs, new_nlevels; ++ uint64_t sz; ++ ++ ASSERT(blkid != DMU_BONUS_BLKID); ++ ++ ASSERT(have_read ? ++ RW_READ_HELD(&dn->dn_struct_rwlock) : ++ RW_WRITE_HELD(&dn->dn_struct_rwlock)); ++ ++ /* ++ * if we have a read-lock, check to see if we need to do any work ++ * before upgrading to a write-lock. 
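When dnode_new_blkid() (continued below) raises dn_maxblkid, it recomputes how many indirection levels the tree needs: level 1 covers dn_nblkptr blocks, and each additional level multiplies the reach by 2^epbs, where epbs = dn_indblkshift - SPA_BLKPTRSHIFT is the log2 of block pointers per indirect block. A standalone version of that sizing loop; the example values are illustrative only, and the overflow guard of the original is omitted:

#include <stdint.h>
#include <stdio.h>

/*
 * Levels needed so that block id 'blkid' is addressable, given
 * 'nblkptr' top-level block pointers and 2^epbs pointers per
 * indirect block.  Mirrors the loop in dnode_new_blkid().
 */
static int
levels_for_blkid(uint64_t blkid, uint64_t nblkptr, int epbs)
{
        int levels = 1;
        uint64_t span = nblkptr;      /* blocks addressable at this depth */

        while (span <= blkid) {
                span <<= epbs;
                levels++;
        }
        return (levels);
}

int
main(void)
{
        /* e.g. 3 block pointers, 128 pointers per indirect (epbs = 7) */
        printf("%d\n", levels_for_blkid(2, 3, 7));      /* 1 */
        printf("%d\n", levels_for_blkid(300, 3, 7));    /* 2 */
        printf("%d\n", levels_for_blkid(40000, 3, 7));  /* 3 */
        return (0);
}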
++ */ ++ if (have_read) { ++ if (blkid <= dn->dn_maxblkid) ++ return; ++ ++ if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { ++ rw_exit(&dn->dn_struct_rwlock); ++ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ++ } ++ } ++ ++ if (blkid <= dn->dn_maxblkid) ++ goto out; ++ ++ dn->dn_maxblkid = blkid; ++ ++ /* ++ * Compute the number of levels necessary to support the new maxblkid. ++ */ ++ new_nlevels = 1; ++ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ++ for (sz = dn->dn_nblkptr; ++ sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) ++ new_nlevels++; ++ ++ if (new_nlevels > dn->dn_nlevels) { ++ int old_nlevels = dn->dn_nlevels; ++ dmu_buf_impl_t *db; ++ list_t *list; ++ dbuf_dirty_record_t *new, *dr, *dr_next; ++ ++ dn->dn_nlevels = new_nlevels; ++ ++ ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); ++ dn->dn_next_nlevels[txgoff] = new_nlevels; ++ ++ /* dirty the left indirects */ ++ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); ++ ASSERT(db != NULL); ++ new = dbuf_dirty(db, tx); ++ dbuf_rele(db, FTAG); ++ ++ /* transfer the dirty records to the new indirect */ ++ mutex_enter(&dn->dn_mtx); ++ mutex_enter(&new->dt.di.dr_mtx); ++ list = &dn->dn_dirty_records[txgoff]; ++ for (dr = list_head(list); dr; dr = dr_next) { ++ dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); ++ if (dr->dr_dbuf->db_level != new_nlevels-1 && ++ dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && ++ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { ++ ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); ++ list_remove(&dn->dn_dirty_records[txgoff], dr); ++ list_insert_tail(&new->dt.di.dr_children, dr); ++ dr->dr_parent = new; ++ } ++ } ++ mutex_exit(&new->dt.di.dr_mtx); ++ mutex_exit(&dn->dn_mtx); ++ } ++ ++out: ++ if (have_read) ++ rw_downgrade(&dn->dn_struct_rwlock); ++} ++ ++void ++dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) ++{ ++ avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK]; ++ avl_index_t where; ++ free_range_t *rp; ++ free_range_t rp_tofind; ++ uint64_t endblk = blkid + nblks; ++ ++ ASSERT(MUTEX_HELD(&dn->dn_mtx)); ++ ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */ ++ ++ dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", ++ blkid, nblks, tx->tx_txg); ++ rp_tofind.fr_blkid = blkid; ++ rp = avl_find(tree, &rp_tofind, &where); ++ if (rp == NULL) ++ rp = avl_nearest(tree, where, AVL_BEFORE); ++ if (rp == NULL) ++ rp = avl_nearest(tree, where, AVL_AFTER); ++ ++ while (rp && (rp->fr_blkid <= blkid + nblks)) { ++ uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks; ++ free_range_t *nrp = AVL_NEXT(tree, rp); ++ ++ if (blkid <= rp->fr_blkid && endblk >= fr_endblk) { ++ /* clear this entire range */ ++ avl_remove(tree, rp); ++ kmem_free(rp, sizeof (free_range_t)); ++ } else if (blkid <= rp->fr_blkid && ++ endblk > rp->fr_blkid && endblk < fr_endblk) { ++ /* clear the beginning of this range */ ++ rp->fr_blkid = endblk; ++ rp->fr_nblks = fr_endblk - endblk; ++ } else if (blkid > rp->fr_blkid && blkid < fr_endblk && ++ endblk >= fr_endblk) { ++ /* clear the end of this range */ ++ rp->fr_nblks = blkid - rp->fr_blkid; ++ } else if (blkid > rp->fr_blkid && endblk < fr_endblk) { ++ /* clear a chunk out of this range */ ++ free_range_t *new_rp = ++ kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE); ++ ++ new_rp->fr_blkid = endblk; ++ new_rp->fr_nblks = fr_endblk - endblk; ++ avl_insert_here(tree, new_rp, rp, AVL_AFTER); ++ rp->fr_nblks = blkid - rp->fr_blkid; ++ } ++ /* there may be no overlap */ ++ rp = nrp; ++ } ++} ++ ++void ++dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, 
dmu_tx_t *tx) ++{ ++ dmu_buf_impl_t *db; ++ uint64_t blkoff, blkid, nblks; ++ int blksz, blkshift, head, tail; ++ int trunc = FALSE; ++ int epbs; ++ ++ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ++ blksz = dn->dn_datablksz; ++ blkshift = dn->dn_datablkshift; ++ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ++ ++ if (len == -1ULL) { ++ len = UINT64_MAX - off; ++ trunc = TRUE; ++ } ++ ++ /* ++ * First, block align the region to free: ++ */ ++ if (ISP2(blksz)) { ++ head = P2NPHASE(off, blksz); ++ blkoff = P2PHASE(off, blksz); ++ if ((off >> blkshift) > dn->dn_maxblkid) ++ goto out; ++ } else { ++ ASSERT(dn->dn_maxblkid == 0); ++ if (off == 0 && len >= blksz) { ++ /* Freeing the whole block; fast-track this request */ ++ blkid = 0; ++ nblks = 1; ++ goto done; ++ } else if (off >= blksz) { ++ /* Freeing past end-of-data */ ++ goto out; ++ } else { ++ /* Freeing part of the block. */ ++ head = blksz - off; ++ ASSERT3U(head, >, 0); ++ } ++ blkoff = off; ++ } ++ /* zero out any partial block data at the start of the range */ ++ if (head) { ++ ASSERT3U(blkoff + head, ==, blksz); ++ if (len < head) ++ head = len; ++ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, ++ FTAG, &db) == 0) { ++ caddr_t data; ++ ++ /* don't dirty if it isn't on disk and isn't dirty */ ++ if (db->db_last_dirty || ++ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { ++ rw_exit(&dn->dn_struct_rwlock); ++ dbuf_will_dirty(db, tx); ++ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ++ data = db->db.db_data; ++ bzero(data + blkoff, head); ++ } ++ dbuf_rele(db, FTAG); ++ } ++ off += head; ++ len -= head; ++ } ++ ++ /* If the range was less than one block, we're done */ ++ if (len == 0) ++ goto out; ++ ++ /* If the remaining range is past end of file, we're done */ ++ if ((off >> blkshift) > dn->dn_maxblkid) ++ goto out; ++ ++ ASSERT(ISP2(blksz)); ++ if (trunc) ++ tail = 0; ++ else ++ tail = P2PHASE(len, blksz); ++ ++ ASSERT3U(P2PHASE(off, blksz), ==, 0); ++ /* zero out any partial block data at the end of the range */ ++ if (tail) { ++ if (len < tail) ++ tail = len; ++ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), ++ TRUE, FTAG, &db) == 0) { ++ /* don't dirty if not on disk and not dirty */ ++ if (db->db_last_dirty || ++ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { ++ rw_exit(&dn->dn_struct_rwlock); ++ dbuf_will_dirty(db, tx); ++ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ++ bzero(db->db.db_data, tail); ++ } ++ dbuf_rele(db, FTAG); ++ } ++ len -= tail; ++ } ++ ++ /* If the range did not include a full block, we are done */ ++ if (len == 0) ++ goto out; ++ ++ ASSERT(IS_P2ALIGNED(off, blksz)); ++ ASSERT(trunc || IS_P2ALIGNED(len, blksz)); ++ blkid = off >> blkshift; ++ nblks = len >> blkshift; ++ if (trunc) ++ nblks += 1; ++ ++ /* ++ * Read in and mark all the level-1 indirects dirty, ++ * so that they will stay in memory until syncing phase. ++ * Always dirty the first and last indirect to make sure ++ * we dirty all the partial indirects. 
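For power-of-two block sizes, dnode_free_range() above splits the request into an unaligned head, a run of whole blocks, and an unaligned tail: only the whole blocks become a free-range record, while head and tail bytes are merely zeroed in their dbufs (the original also clamps head when the whole range fits in the first block). The split arithmetic, as a self-contained sketch with the P2 macros expanded; the numbers in main() are arbitrary examples:

#include <stdint.h>
#include <stdio.h>

/* offset of x within its (power-of-two) aligned block */
#define P2PHASE(x, align)    ((x) & ((align) - 1))
/* bytes from x up to the next aligned boundary */
#define P2NPHASE(x, align)   (-(x) & ((align) - 1))

int
main(void)
{
        uint64_t off = 1000, len = 300000;
        uint64_t blksz = 131072;               /* 128K blocks */
        int blkshift = 17;

        uint64_t head  = P2NPHASE(off, blksz);            /* partial first block */
        uint64_t tail  = P2PHASE(len - head, blksz);      /* partial last block */
        uint64_t blkid = (off + head) >> blkshift;        /* first whole block */
        uint64_t nblks = (len - head - tail) >> blkshift; /* whole blocks freed */

        /* prints head=130072 tail=38856 blkid=1 nblks=1 */
        printf("head=%llu tail=%llu blkid=%llu nblks=%llu\n",
            (unsigned long long)head, (unsigned long long)tail,
            (unsigned long long)blkid, (unsigned long long)nblks);
        return (0);
}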
++ */ ++ if (dn->dn_nlevels > 1) { ++ uint64_t i, first, last; ++ int shift = epbs + dn->dn_datablkshift; ++ ++ first = blkid >> epbs; ++ if ((db = dbuf_hold_level(dn, 1, first, FTAG))) { ++ dbuf_will_dirty(db, tx); ++ dbuf_rele(db, FTAG); ++ } ++ if (trunc) ++ last = dn->dn_maxblkid >> epbs; ++ else ++ last = (blkid + nblks - 1) >> epbs; ++ if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) { ++ dbuf_will_dirty(db, tx); ++ dbuf_rele(db, FTAG); ++ } ++ for (i = first + 1; i < last; i++) { ++ uint64_t ibyte = i << shift; ++ int err; ++ ++ err = dnode_next_offset(dn, ++ DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0); ++ i = ibyte >> shift; ++ if (err == ESRCH || i >= last) ++ break; ++ ASSERT(err == 0); ++ db = dbuf_hold_level(dn, 1, i, FTAG); ++ if (db) { ++ dbuf_will_dirty(db, tx); ++ dbuf_rele(db, FTAG); ++ } ++ } ++ } ++done: ++ /* ++ * Add this range to the dnode range list. ++ * We will finish up this free operation in the syncing phase. ++ */ ++ mutex_enter(&dn->dn_mtx); ++ dnode_clear_range(dn, blkid, nblks, tx); ++ { ++ free_range_t *rp, *found; ++ avl_index_t where; ++ avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK]; ++ ++ /* Add new range to dn_ranges */ ++ rp = kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE); ++ rp->fr_blkid = blkid; ++ rp->fr_nblks = nblks; ++ found = avl_find(tree, rp, &where); ++ ASSERT(found == NULL); ++ avl_insert(tree, rp, where); ++ dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", ++ blkid, nblks, tx->tx_txg); ++ } ++ mutex_exit(&dn->dn_mtx); ++ ++ dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); ++ dnode_setdirty(dn, tx); ++out: ++ if (trunc && dn->dn_maxblkid >= (off >> blkshift)) ++ dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0); ++ ++ rw_exit(&dn->dn_struct_rwlock); ++} ++ ++static boolean_t ++dnode_spill_freed(dnode_t *dn) ++{ ++ int i; ++ ++ mutex_enter(&dn->dn_mtx); ++ for (i = 0; i < TXG_SIZE; i++) { ++ if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) ++ break; ++ } ++ mutex_exit(&dn->dn_mtx); ++ return (i < TXG_SIZE); ++} ++ ++/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ ++uint64_t ++dnode_block_freed(dnode_t *dn, uint64_t blkid) ++{ ++ free_range_t range_tofind; ++ void *dp = spa_get_dsl(dn->dn_objset->os_spa); ++ int i; ++ ++ if (blkid == DMU_BONUS_BLKID) ++ return (FALSE); ++ ++ /* ++ * If we're in the process of opening the pool, dp will not be ++ * set yet, but there shouldn't be anything dirty. 
++ */ ++ if (dp == NULL) ++ return (FALSE); ++ ++ if (dn->dn_free_txg) ++ return (TRUE); ++ ++ if (blkid == DMU_SPILL_BLKID) ++ return (dnode_spill_freed(dn)); ++ ++ range_tofind.fr_blkid = blkid; ++ mutex_enter(&dn->dn_mtx); ++ for (i = 0; i < TXG_SIZE; i++) { ++ free_range_t *range_found; ++ avl_index_t idx; ++ ++ range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx); ++ if (range_found) { ++ ASSERT(range_found->fr_nblks > 0); ++ break; ++ } ++ range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE); ++ if (range_found && ++ range_found->fr_blkid + range_found->fr_nblks > blkid) ++ break; ++ } ++ mutex_exit(&dn->dn_mtx); ++ return (i < TXG_SIZE); ++} ++ ++/* call from syncing context when we actually write/free space for this dnode */ ++void ++dnode_diduse_space(dnode_t *dn, int64_t delta) ++{ ++ uint64_t space; ++ dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", ++ dn, dn->dn_phys, ++ (u_longlong_t)dn->dn_phys->dn_used, ++ (longlong_t)delta); ++ ++ mutex_enter(&dn->dn_mtx); ++ space = DN_USED_BYTES(dn->dn_phys); ++ if (delta > 0) { ++ ASSERT3U(space + delta, >=, space); /* no overflow */ ++ } else { ++ ASSERT3U(space, >=, -delta); /* no underflow */ ++ } ++ space += delta; ++ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { ++ ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); ++ ASSERT3U(P2PHASE(space, 1<dn_phys->dn_used = space >> DEV_BSHIFT; ++ } else { ++ dn->dn_phys->dn_used = space; ++ dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; ++ } ++ mutex_exit(&dn->dn_mtx); ++} ++ ++/* ++ * Call when we think we're going to write/free space in open context. ++ * Be conservative (ie. OK to write less than this or free more than ++ * this, but don't write more or free less). ++ */ ++void ++dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) ++{ ++ objset_t *os = dn->dn_objset; ++ dsl_dataset_t *ds = os->os_dsl_dataset; ++ ++ if (space > 0) ++ space = spa_get_asize(os->os_spa, space); ++ ++ if (ds) ++ dsl_dir_willuse_space(ds->ds_dir, space, tx); ++ ++ dmu_tx_willuse_space(tx, space); ++} ++ ++/* ++ * This function scans a block at the indicated "level" looking for ++ * a hole or data (depending on 'flags'). If level > 0, then we are ++ * scanning an indirect block looking at its pointers. If level == 0, ++ * then we are looking at a block of dnodes. If we don't find what we ++ * are looking for in the block, we return ESRCH. Otherwise, return ++ * with *offset pointing to the beginning (if searching forwards) or ++ * end (if searching backwards) of the range covered by the block ++ * pointer we matched on (or dnode). ++ * ++ * The basic search algorithm used below by dnode_next_offset() is to ++ * use this function to search up the block tree (widen the search) until ++ * we find something (i.e., we don't return ESRCH) and then search back ++ * down the tree (narrow the search) until we reach our original search ++ * level. ++ */ ++static int ++dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, ++ int lvl, uint64_t blkfill, uint64_t txg) ++{ ++ dmu_buf_impl_t *db = NULL; ++ void *data = NULL; ++ uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ++ uint64_t epb = 1ULL << epbs; ++ uint64_t minfill, maxfill; ++ boolean_t hole; ++ int i, inc, error, span; ++ ++ dprintf("probing object %llu offset %llx level %d of %u\n", ++ dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); ++ ++ hole = ((flags & DNODE_FIND_HOLE) != 0); ++ inc = (flags & DNODE_FIND_BACKWARDS) ? 
-1 : 1; ++ ASSERT(txg == 0 || !hole); ++ ++ if (lvl == dn->dn_phys->dn_nlevels) { ++ error = 0; ++ epb = dn->dn_phys->dn_nblkptr; ++ data = dn->dn_phys->dn_blkptr; ++ } else { ++ uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); ++ error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); ++ if (error) { ++ if (error != ENOENT) ++ return (error); ++ if (hole) ++ return (0); ++ /* ++ * This can only happen when we are searching up ++ * the block tree for data. We don't really need to ++ * adjust the offset, as we will just end up looking ++ * at the pointer to this block in its parent, and its ++ * going to be unallocated, so we will skip over it. ++ */ ++ return (ESRCH); ++ } ++ error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); ++ if (error) { ++ dbuf_rele(db, FTAG); ++ return (error); ++ } ++ data = db->db.db_data; ++ } ++ ++ if (db && txg && ++ (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) { ++ /* ++ * This can only happen when we are searching up the tree ++ * and these conditions mean that we need to keep climbing. ++ */ ++ error = ESRCH; ++ } else if (lvl == 0) { ++ dnode_phys_t *dnp = data; ++ span = DNODE_SHIFT; ++ ASSERT(dn->dn_type == DMU_OT_DNODE); ++ ++ for (i = (*offset >> span) & (blkfill - 1); ++ i >= 0 && i < blkfill; i += inc) { ++ if ((dnp[i].dn_type == DMU_OT_NONE) == hole) ++ break; ++ *offset += (1ULL << span) * inc; ++ } ++ if (i < 0 || i == blkfill) ++ error = ESRCH; ++ } else { ++ blkptr_t *bp = data; ++ uint64_t start = *offset; ++ span = (lvl - 1) * epbs + dn->dn_datablkshift; ++ minfill = 0; ++ maxfill = blkfill << ((lvl - 1) * epbs); ++ ++ if (hole) ++ maxfill--; ++ else ++ minfill++; ++ ++ *offset = *offset >> span; ++ for (i = BF64_GET(*offset, 0, epbs); ++ i >= 0 && i < epb; i += inc) { ++ if (bp[i].blk_fill >= minfill && ++ bp[i].blk_fill <= maxfill && ++ (hole || bp[i].blk_birth > txg)) ++ break; ++ if (inc > 0 || *offset > 0) ++ *offset += inc; ++ } ++ *offset = *offset << span; ++ if (inc < 0) { ++ /* traversing backwards; position offset at the end */ ++ ASSERT3U(*offset, <=, start); ++ *offset = MIN(*offset + (1ULL << span) - 1, start); ++ } else if (*offset < start) { ++ *offset = start; ++ } ++ if (i < 0 || i >= epb) ++ error = ESRCH; ++ } ++ ++ if (db) ++ dbuf_rele(db, FTAG); ++ ++ return (error); ++} ++ ++/* ++ * Find the next hole, data, or sparse region at or after *offset. ++ * The value 'blkfill' tells us how many items we expect to find ++ * in an L0 data block; this value is 1 for normal objects, ++ * DNODES_PER_BLOCK for the meta dnode, and some fraction of ++ * DNODES_PER_BLOCK when searching for sparse regions thereof. ++ * ++ * Examples: ++ * ++ * dnode_next_offset(dn, flags, offset, 1, 1, 0); ++ * Finds the next/previous hole/data in a file. ++ * Used in dmu_offset_next(). ++ * ++ * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); ++ * Finds the next free/allocated dnode an objset's meta-dnode. ++ * Only finds objects that have new contents since txg (ie. ++ * bonus buffer changes and content removal are ignored). ++ * Used in dmu_object_next(). ++ * ++ * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); ++ * Finds the next L2 meta-dnode bp that's at most 1/4 full. ++ * Used in dmu_object_alloc(). 
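The search strategy described above for dnode_next_offset() is: probe the requested level; on ESRCH widen by climbing to higher levels until something matches, then narrow by walking back down, refining the offset at each level. The control flow reduces to the following self-contained model, where probe() is only a stand-in for dnode_next_offset_level():

#include <errno.h>
#include <stdint.h>

/* One probe per level: advance *off to the next candidate, or ESRCH. */
typedef int (*probe_fn)(int lvl, uint64_t *off);

static int
next_offset(probe_fn probe, int minlvl, int maxlvl, uint64_t *off)
{
        int lvl, error = ESRCH;

        /* widen: climb until some level matches (or we run out) */
        for (lvl = minlvl; lvl <= maxlvl; lvl++) {
                error = probe(lvl, off);
                if (error != ESRCH)
                        break;
        }

        /* narrow: walk back down, refining the offset at each level */
        while (error == 0 && --lvl >= minlvl)
                error = probe(lvl, off);

        return (error);
}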
++ */ ++int ++dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, ++ int minlvl, uint64_t blkfill, uint64_t txg) ++{ ++ uint64_t initial_offset = *offset; ++ int lvl, maxlvl; ++ int error = 0; ++ ++ if (!(flags & DNODE_FIND_HAVELOCK)) ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ ++ if (dn->dn_phys->dn_nlevels == 0) { ++ error = ESRCH; ++ goto out; ++ } ++ ++ if (dn->dn_datablkshift == 0) { ++ if (*offset < dn->dn_datablksz) { ++ if (flags & DNODE_FIND_HOLE) ++ *offset = dn->dn_datablksz; ++ } else { ++ error = ESRCH; ++ } ++ goto out; ++ } ++ ++ maxlvl = dn->dn_phys->dn_nlevels; ++ ++ for (lvl = minlvl; lvl <= maxlvl; lvl++) { ++ error = dnode_next_offset_level(dn, ++ flags, offset, lvl, blkfill, txg); ++ if (error != ESRCH) ++ break; ++ } ++ ++ while (error == 0 && --lvl >= minlvl) { ++ error = dnode_next_offset_level(dn, ++ flags, offset, lvl, blkfill, txg); ++ } ++ ++ if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? ++ initial_offset < *offset : initial_offset > *offset)) ++ error = ESRCH; ++out: ++ if (!(flags & DNODE_FIND_HAVELOCK)) ++ rw_exit(&dn->dn_struct_rwlock); ++ ++ return (error); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dnode_sync.c linux-3.2.33-go/fs/zfs/zfs/dnode_sync.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dnode_sync.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dnode_sync.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,697 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void ++dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) ++{ ++ dmu_buf_impl_t *db; ++ int txgoff = tx->tx_txg & TXG_MASK; ++ int nblkptr = dn->dn_phys->dn_nblkptr; ++ int old_toplvl = dn->dn_phys->dn_nlevels - 1; ++ int new_level = dn->dn_next_nlevels[txgoff]; ++ int i; ++ ++ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ++ ++ /* this dnode can't be paged out because it's dirty */ ++ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ++ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ++ ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); ++ ++ db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); ++ ASSERT(db != NULL); ++ ++ dn->dn_phys->dn_nlevels = new_level; ++ dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, ++ dn->dn_object, dn->dn_phys->dn_nlevels); ++ ++ /* check for existing blkptrs in the dnode */ ++ for (i = 0; i < nblkptr; i++) ++ if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i])) ++ break; ++ if (i != nblkptr) { ++ /* transfer dnode's block pointers to new indirect block */ ++ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); ++ ASSERT(db->db.db_data); ++ ASSERT(arc_released(db->db_buf)); ++ ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); ++ bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, ++ sizeof (blkptr_t) * nblkptr); ++ arc_buf_freeze(db->db_buf); ++ } ++ ++ /* set dbuf's parent pointers to new indirect buf */ ++ for (i = 0; i < nblkptr; i++) { ++ dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i); ++ ++ if (child == NULL) ++ continue; ++#ifdef DEBUG ++ DB_DNODE_ENTER(child); ++ ASSERT3P(DB_DNODE(child), ==, dn); ++ DB_DNODE_EXIT(child); ++#endif /* DEBUG */ ++ if (child->db_parent && child->db_parent != dn->dn_dbuf) { ++ ASSERT(child->db_parent->db_level == db->db_level); ++ ASSERT(child->db_blkptr != ++ &dn->dn_phys->dn_blkptr[child->db_blkid]); ++ mutex_exit(&child->db_mtx); ++ continue; ++ } ++ ASSERT(child->db_parent == NULL || ++ child->db_parent == dn->dn_dbuf); ++ ++ child->db_parent = db; ++ dbuf_add_ref(db, child); ++ if (db->db.db_data) ++ child->db_blkptr = (blkptr_t *)db->db.db_data + i; ++ else ++ child->db_blkptr = NULL; ++ dprintf_dbuf_bp(child, child->db_blkptr, ++ "changed db_blkptr to new indirect %s", ""); ++ ++ mutex_exit(&child->db_mtx); ++ } ++ ++ bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); ++ ++ dbuf_rele(db, FTAG); ++ ++ rw_exit(&dn->dn_struct_rwlock); ++} ++ ++static int ++free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; ++ uint64_t bytesfreed = 0; ++ int i, blocks_freed = 0; ++ ++ dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); ++ ++ for (i = 0; i < num; i++, bp++) { ++ if (BP_IS_HOLE(bp)) ++ continue; ++ ++ bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); ++ ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); ++ bzero(bp, sizeof (blkptr_t)); ++ blocks_freed += 1; ++ } ++ dnode_diduse_space(dn, -bytesfreed); ++ return (blocks_freed); ++} ++ ++#ifdef ZFS_DEBUG ++static void ++free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) ++{ ++ int off, num; ++ int i, err, epbs; ++ uint64_t txg = tx->tx_txg; ++ dnode_t *dn; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ++ off = start - (db->db_blkid * 1<=, 0); ++ ASSERT3U(num, >=, 0); ++ ASSERT3U(db->db_level, >, 0); ++ ASSERT3U(db->db.db_size, ==, 1 << 
dn->dn_phys->dn_indblkshift); ++ ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); ++ ASSERT(db->db_blkptr != NULL); ++ ++ for (i = off; i < off+num; i++) { ++ uint64_t *buf; ++ dmu_buf_impl_t *child; ++ dbuf_dirty_record_t *dr; ++ int j; ++ ++ ASSERT(db->db_level == 1); ++ ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ err = dbuf_hold_impl(dn, db->db_level-1, ++ (db->db_blkid << epbs) + i, TRUE, FTAG, &child); ++ rw_exit(&dn->dn_struct_rwlock); ++ if (err == ENOENT) ++ continue; ++ ASSERT(err == 0); ++ ASSERT(child->db_level == 0); ++ dr = child->db_last_dirty; ++ while (dr && dr->dr_txg > txg) ++ dr = dr->dr_next; ++ ASSERT(dr == NULL || dr->dr_txg == txg); ++ ++ /* data_old better be zeroed */ ++ if (dr) { ++ buf = dr->dt.dl.dr_data->b_data; ++ for (j = 0; j < child->db.db_size >> 3; j++) { ++ if (buf[j] != 0) { ++ panic("freed data not zero: " ++ "child=%p i=%d off=%d num=%d\n", ++ (void *)child, i, off, num); ++ } ++ } ++ } ++ ++ /* ++ * db_data better be zeroed unless it's dirty in a ++ * future txg. ++ */ ++ mutex_enter(&child->db_mtx); ++ buf = child->db.db_data; ++ if (buf != NULL && child->db_state != DB_FILL && ++ child->db_last_dirty == NULL) { ++ for (j = 0; j < child->db.db_size >> 3; j++) { ++ if (buf[j] != 0) { ++ panic("freed data not zero: " ++ "child=%p i=%d off=%d num=%d\n", ++ (void *)child, i, off, num); ++ } ++ } ++ } ++ mutex_exit(&child->db_mtx); ++ ++ dbuf_rele(child, FTAG); ++ } ++ DB_DNODE_EXIT(db); ++} ++#endif ++ ++#define ALL -1 ++ ++static int ++free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, ++ dmu_tx_t *tx) ++{ ++ dnode_t *dn; ++ blkptr_t *bp; ++ dmu_buf_impl_t *subdb; ++ uint64_t start, end, dbstart, dbend, i; ++ int epbs, shift, err; ++ int all = TRUE; ++ int blocks_freed = 0; ++ ++ /* ++ * There is a small possibility that this block will not be cached: ++ * 1 - if level > 1 and there are no children with level <= 1 ++ * 2 - if we didn't get a dirty hold (because this block had just ++ * finished being written -- and so had no holds), and then this ++ * block got evicted before we got here. ++ */ ++ if (db->db_state != DB_CACHED) ++ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); ++ ++ dbuf_release_bp(db); ++ bp = (blkptr_t *)db->db.db_data; ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ++ shift = (db->db_level - 1) * epbs; ++ dbstart = db->db_blkid << epbs; ++ start = blkid >> shift; ++ if (dbstart < start) { ++ bp += start - dbstart; ++ all = FALSE; ++ } else { ++ start = dbstart; ++ } ++ dbend = ((db->db_blkid + 1) << epbs) - 1; ++ end = (blkid + nblks - 1) >> shift; ++ if (dbend <= end) ++ end = dbend; ++ else if (all) ++ all = trunc; ++ ASSERT3U(start, <=, end); ++ ++ if (db->db_level == 1) { ++ FREE_VERIFY(db, start, end, tx); ++ blocks_freed = free_blocks(dn, bp, end-start+1, tx); ++ arc_buf_freeze(db->db_buf); ++ ASSERT(all || blocks_freed == 0 || db->db_last_dirty); ++ DB_DNODE_EXIT(db); ++ return (all ? 
ALL : blocks_freed); ++ } ++ ++ for (i = start; i <= end; i++, bp++) { ++ if (BP_IS_HOLE(bp)) ++ continue; ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb); ++ ASSERT3U(err, ==, 0); ++ rw_exit(&dn->dn_struct_rwlock); ++ ++ if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) { ++ ASSERT3P(subdb->db_blkptr, ==, bp); ++ blocks_freed += free_blocks(dn, bp, 1, tx); ++ } else { ++ all = FALSE; ++ } ++ dbuf_rele(subdb, FTAG); ++ } ++ DB_DNODE_EXIT(db); ++ arc_buf_freeze(db->db_buf); ++#ifdef ZFS_DEBUG ++ bp -= (end-start)+1; ++ for (i = start; i <= end; i++, bp++) { ++ if (i == start && blkid != 0) ++ continue; ++ else if (i == end && !trunc) ++ continue; ++ ASSERT3U(bp->blk_birth, ==, 0); ++ } ++#endif ++ ASSERT(all || blocks_freed == 0 || db->db_last_dirty); ++ return (all ? ALL : blocks_freed); ++} ++ ++/* ++ * free_range: Traverse the indicated range of the provided file ++ * and "free" all the blocks contained there. ++ */ ++static void ++dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) ++{ ++ blkptr_t *bp = dn->dn_phys->dn_blkptr; ++ dmu_buf_impl_t *db; ++ int trunc, start, end, shift, i, err; ++ int dnlevel = dn->dn_phys->dn_nlevels; ++ ++ if (blkid > dn->dn_phys->dn_maxblkid) ++ return; ++ ++ ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); ++ trunc = blkid + nblks > dn->dn_phys->dn_maxblkid; ++ if (trunc) ++ nblks = dn->dn_phys->dn_maxblkid - blkid + 1; ++ ++ /* There are no indirect blocks in the object */ ++ if (dnlevel == 1) { ++ if (blkid >= dn->dn_phys->dn_nblkptr) { ++ /* this range was never made persistent */ ++ return; ++ } ++ ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); ++ (void) free_blocks(dn, bp + blkid, nblks, tx); ++ if (trunc) { ++ ASSERTV(uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * ++ (dn->dn_phys->dn_datablkszsec<dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); ++ ASSERT(off < dn->dn_phys->dn_maxblkid || ++ dn->dn_phys->dn_maxblkid == 0 || ++ dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); ++ } ++ return; ++ } ++ ++ shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); ++ start = blkid >> shift; ++ ASSERT(start < dn->dn_phys->dn_nblkptr); ++ end = (blkid + nblks - 1) >> shift; ++ bp += start; ++ for (i = start; i <= end; i++, bp++) { ++ if (BP_IS_HOLE(bp)) ++ continue; ++ rw_enter(&dn->dn_struct_rwlock, RW_READER); ++ err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db); ++ ASSERT3U(err, ==, 0); ++ rw_exit(&dn->dn_struct_rwlock); ++ ++ if (free_children(db, blkid, nblks, trunc, tx) == ALL) { ++ ASSERT3P(db->db_blkptr, ==, bp); ++ (void) free_blocks(dn, bp, 1, tx); ++ } ++ dbuf_rele(db, FTAG); ++ } ++ if (trunc) { ++ ASSERTV(uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * ++ (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT)); ++ dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); ++ ASSERT(off < dn->dn_phys->dn_maxblkid || ++ dn->dn_phys->dn_maxblkid == 0 || ++ dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); ++ } ++} ++ ++/* ++ * Try to kick all the dnodes dbufs out of the cache... 
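dnode_evict_dbufs() below walks dn_dbufs while dbuf_clear() may be dropping entries out of the list, so it inserts a stack-allocated marker at the tail and rotates each visited element behind the marker; the pass is complete when the marker reaches the head. The rotation idiom on a plain circular doubly-linked list (illustrative list type, not the Solaris list_t API):

struct node { struct node *next, *prev; };

static void
list_remove(struct node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
}

static void
list_insert_tail(struct node *head, struct node *n)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

/* Visit each node once, even if visit() removes other nodes. */
static void
walk_with_marker(struct node *head, void (*visit)(struct node *))
{
        struct node marker;

        list_insert_tail(head, &marker);
        while (head->next != &marker) {
                struct node *n = head->next;

                /* rotate n behind the marker so it is not revisited */
                list_remove(n);
                list_insert_tail(head, n);
                visit(n);
        }
        list_remove(&marker);
}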
++ */ ++void ++dnode_evict_dbufs(dnode_t *dn) ++{ ++ int progress; ++ int pass = 0; ++ ++ do { ++ dmu_buf_impl_t *db, marker; ++ int evicting = FALSE; ++ ++ progress = FALSE; ++ mutex_enter(&dn->dn_dbufs_mtx); ++ list_insert_tail(&dn->dn_dbufs, &marker); ++ db = list_head(&dn->dn_dbufs); ++ for (; db != ▮ db = list_head(&dn->dn_dbufs)) { ++ list_remove(&dn->dn_dbufs, db); ++ list_insert_tail(&dn->dn_dbufs, db); ++#ifdef DEBUG ++ DB_DNODE_ENTER(db); ++ ASSERT3P(DB_DNODE(db), ==, dn); ++ DB_DNODE_EXIT(db); ++#endif /* DEBUG */ ++ ++ mutex_enter(&db->db_mtx); ++ if (db->db_state == DB_EVICTING) { ++ progress = TRUE; ++ evicting = TRUE; ++ mutex_exit(&db->db_mtx); ++ } else if (refcount_is_zero(&db->db_holds)) { ++ progress = TRUE; ++ dbuf_clear(db); /* exits db_mtx for us */ ++ } else { ++ mutex_exit(&db->db_mtx); ++ } ++ ++ } ++ list_remove(&dn->dn_dbufs, &marker); ++ /* ++ * NB: we need to drop dn_dbufs_mtx between passes so ++ * that any DB_EVICTING dbufs can make progress. ++ * Ideally, we would have some cv we could wait on, but ++ * since we don't, just wait a bit to give the other ++ * thread a chance to run. ++ */ ++ mutex_exit(&dn->dn_dbufs_mtx); ++ if (evicting) ++ delay(1); ++ pass++; ++ if ((pass % 100) == 0) ++ dprintf("Exceeded %d passes evicting dbufs\n", pass); ++ } while (progress); ++ ++ if (pass >= 100) ++ dprintf("Required %d passes to evict dbufs\n", pass); ++ ++ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ++ if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { ++ mutex_enter(&dn->dn_bonus->db_mtx); ++ dbuf_evict(dn->dn_bonus); ++ dn->dn_bonus = NULL; ++ } ++ rw_exit(&dn->dn_struct_rwlock); ++} ++ ++static void ++dnode_undirty_dbufs(list_t *list) ++{ ++ dbuf_dirty_record_t *dr; ++ ++ while ((dr = list_head(list))) { ++ dmu_buf_impl_t *db = dr->dr_dbuf; ++ uint64_t txg = dr->dr_txg; ++ ++ if (db->db_level != 0) ++ dnode_undirty_dbufs(&dr->dt.di.dr_children); ++ ++ mutex_enter(&db->db_mtx); ++ /* XXX - use dbuf_undirty()? */ ++ list_remove(list, dr); ++ ASSERT(db->db_last_dirty == dr); ++ db->db_last_dirty = NULL; ++ db->db_dirtycnt -= 1; ++ if (db->db_level == 0) { ++ ASSERT(db->db_blkid == DMU_BONUS_BLKID || ++ dr->dt.dl.dr_data == db->db_buf); ++ dbuf_unoverride(dr); ++ } ++ kmem_free(dr, sizeof (dbuf_dirty_record_t)); ++ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); ++ } ++} ++ ++static void ++dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) ++{ ++ int txgoff = tx->tx_txg & TXG_MASK; ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ++ /* ++ * Our contents should have been freed in dnode_sync() by the ++ * free range record inserted by the caller of dnode_free(). ++ */ ++ ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); ++ ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); ++ ++ dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); ++ dnode_evict_dbufs(dn); ++ ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); ++ ++ /* ++ * XXX - It would be nice to assert this, but we may still ++ * have residual holds from async evictions from the arc... ++ * ++ * zfs_obj_to_path() also depends on this being ++ * commented out. 
++ * ++ * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); ++ */ ++ ++ /* Undirty next bits */ ++ dn->dn_next_nlevels[txgoff] = 0; ++ dn->dn_next_indblkshift[txgoff] = 0; ++ dn->dn_next_blksz[txgoff] = 0; ++ ++ /* ASSERT(blkptrs are zero); */ ++ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ++ ASSERT(dn->dn_type != DMU_OT_NONE); ++ ++ ASSERT(dn->dn_free_txg > 0); ++ if (dn->dn_allocated_txg != dn->dn_free_txg) ++ dbuf_will_dirty(dn->dn_dbuf, tx); ++ bzero(dn->dn_phys, sizeof (dnode_phys_t)); ++ ++ mutex_enter(&dn->dn_mtx); ++ dn->dn_type = DMU_OT_NONE; ++ dn->dn_maxblkid = 0; ++ dn->dn_allocated_txg = 0; ++ dn->dn_free_txg = 0; ++ dn->dn_have_spill = B_FALSE; ++ mutex_exit(&dn->dn_mtx); ++ ++ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ++ ++ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); ++ /* ++ * Now that we've released our hold, the dnode may ++ * be evicted, so we musn't access it. ++ */ ++} ++ ++/* ++ * Write out the dnode's dirty buffers. ++ */ ++void ++dnode_sync(dnode_t *dn, dmu_tx_t *tx) ++{ ++ free_range_t *rp; ++ dnode_phys_t *dnp = dn->dn_phys; ++ int txgoff = tx->tx_txg & TXG_MASK; ++ list_t *list = &dn->dn_dirty_records[txgoff]; ++ boolean_t kill_spill = B_FALSE; ++ ASSERTV(static const dnode_phys_t zerodn = { 0 }); ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); ++ ASSERT(dnp->dn_type != DMU_OT_NONE || ++ bcmp(dnp, &zerodn, DNODE_SIZE) == 0); ++ DNODE_VERIFY(dn); ++ ++ ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); ++ ++ if (dmu_objset_userused_enabled(dn->dn_objset) && ++ !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { ++ mutex_enter(&dn->dn_mtx); ++ dn->dn_oldused = DN_USED_BYTES(dn->dn_phys); ++ dn->dn_oldflags = dn->dn_phys->dn_flags; ++ dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; ++ mutex_exit(&dn->dn_mtx); ++ dmu_objset_userquota_get_ids(dn, B_FALSE, tx); ++ } else { ++ /* Once we account for it, we should always account for it. 
*/ ++ ASSERT(!(dn->dn_phys->dn_flags & ++ DNODE_FLAG_USERUSED_ACCOUNTED)); ++ } ++ ++ mutex_enter(&dn->dn_mtx); ++ if (dn->dn_allocated_txg == tx->tx_txg) { ++ /* The dnode is newly allocated or reallocated */ ++ if (dnp->dn_type == DMU_OT_NONE) { ++ /* this is a first alloc, not a realloc */ ++ dnp->dn_nlevels = 1; ++ dnp->dn_nblkptr = dn->dn_nblkptr; ++ } ++ ++ dnp->dn_type = dn->dn_type; ++ dnp->dn_bonustype = dn->dn_bonustype; ++ dnp->dn_bonuslen = dn->dn_bonuslen; ++ } ++ ++ ASSERT(dnp->dn_nlevels > 1 || ++ BP_IS_HOLE(&dnp->dn_blkptr[0]) || ++ BP_GET_LSIZE(&dnp->dn_blkptr[0]) == ++ dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ++ ++ if (dn->dn_next_blksz[txgoff]) { ++ ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], ++ SPA_MINBLOCKSIZE) == 0); ++ ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || ++ dn->dn_maxblkid == 0 || list_head(list) != NULL || ++ avl_last(&dn->dn_ranges[txgoff]) || ++ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == ++ dnp->dn_datablkszsec); ++ dnp->dn_datablkszsec = ++ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; ++ dn->dn_next_blksz[txgoff] = 0; ++ } ++ ++ if (dn->dn_next_bonuslen[txgoff]) { ++ if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) ++ dnp->dn_bonuslen = 0; ++ else ++ dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; ++ ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN); ++ dn->dn_next_bonuslen[txgoff] = 0; ++ } ++ ++ if (dn->dn_next_bonustype[txgoff]) { ++ ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES); ++ dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; ++ dn->dn_next_bonustype[txgoff] = 0; ++ } ++ ++ /* ++ * We will either remove a spill block when a file is being removed ++ * or we have been asked to remove it. ++ */ ++ if (dn->dn_rm_spillblk[txgoff] || ++ ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && ++ dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) { ++ if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) ++ kill_spill = B_TRUE; ++ dn->dn_rm_spillblk[txgoff] = 0; ++ } ++ ++ if (dn->dn_next_indblkshift[txgoff]) { ++ ASSERT(dnp->dn_nlevels == 1); ++ dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; ++ dn->dn_next_indblkshift[txgoff] = 0; ++ } ++ ++ /* ++ * Just take the live (open-context) values for checksum and compress. ++ * Strictly speaking it's a future leak, but nothing bad happens if we ++ * start using the new checksum or compress algorithm a little early. 
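Throughout dnode_sync() above, changes made in open context are not applied directly; they are staged in small per-txg arrays indexed by txg & TXG_MASK (dn_next_blksz, dn_next_bonuslen, dn_next_bonustype, ...), and syncing context applies and clears the slot for the txg it is writing. A reduced model of that staging pattern, assuming a TXG_SIZE of 4 and an illustrative property struct:

#include <stdint.h>

#define TXG_SIZE 4
#define TXG_MASK (TXG_SIZE - 1)

struct staged_prop {
        uint64_t live;                  /* value as it will appear on disk */
        uint64_t next[TXG_SIZE];        /* pending value per open txg, 0 = unset */
};

/* open context: record the new value against the transaction's txg */
static void
prop_set(struct staged_prop *p, uint64_t txg, uint64_t value)
{
        p->next[txg & TXG_MASK] = value;
}

/* syncing context: apply and clear whatever was staged for this txg */
static void
prop_sync(struct staged_prop *p, uint64_t txg)
{
        int slot = txg & TXG_MASK;

        if (p->next[slot] != 0) {
                p->live = p->next[slot];
                p->next[slot] = 0;
        }
}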
++ */ ++ dnp->dn_checksum = dn->dn_checksum; ++ dnp->dn_compress = dn->dn_compress; ++ ++ mutex_exit(&dn->dn_mtx); ++ ++ if (kill_spill) { ++ (void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx); ++ mutex_enter(&dn->dn_mtx); ++ dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; ++ mutex_exit(&dn->dn_mtx); ++ } ++ ++ /* process all the "freed" ranges in the file */ ++ while ((rp = avl_last(&dn->dn_ranges[txgoff]))) { ++ dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx); ++ /* grab the mutex so we don't race with dnode_block_freed() */ ++ mutex_enter(&dn->dn_mtx); ++ avl_remove(&dn->dn_ranges[txgoff], rp); ++ mutex_exit(&dn->dn_mtx); ++ kmem_free(rp, sizeof (free_range_t)); ++ } ++ ++ if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) { ++ dnode_sync_free(dn, tx); ++ return; ++ } ++ ++ if (dn->dn_next_nblkptr[txgoff]) { ++ /* this should only happen on a realloc */ ++ ASSERT(dn->dn_allocated_txg == tx->tx_txg); ++ if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { ++ /* zero the new blkptrs we are gaining */ ++ bzero(dnp->dn_blkptr + dnp->dn_nblkptr, ++ sizeof (blkptr_t) * ++ (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); ++#ifdef ZFS_DEBUG ++ } else { ++ int i; ++ ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr); ++ /* the blkptrs we are losing better be unallocated */ ++ for (i = dn->dn_next_nblkptr[txgoff]; ++ i < dnp->dn_nblkptr; i++) ++ ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); ++#endif ++ } ++ mutex_enter(&dn->dn_mtx); ++ dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff]; ++ dn->dn_next_nblkptr[txgoff] = 0; ++ mutex_exit(&dn->dn_mtx); ++ } ++ ++ if (dn->dn_next_nlevels[txgoff]) { ++ dnode_increase_indirection(dn, tx); ++ dn->dn_next_nlevels[txgoff] = 0; ++ } ++ ++ dbuf_sync_list(list, tx); ++ ++ if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { ++ ASSERT3P(list_head(list), ==, NULL); ++ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); ++ } ++ ++ /* ++ * Although we have dropped our reference to the dnode, it ++ * can't be evicted until its written, and we haven't yet ++ * initiated the IO for the dnode's dbuf. ++ */ ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dsl_dataset.c linux-3.2.33-go/fs/zfs/zfs/dsl_dataset.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dsl_dataset.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dsl_dataset.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,4314 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static char *dsl_reaper = "the grim reaper"; ++ ++static dsl_checkfunc_t dsl_dataset_destroy_begin_check; ++static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; ++static dsl_syncfunc_t dsl_dataset_set_reservation_sync; ++ ++#define SWITCH64(x, y) \ ++ { \ ++ uint64_t __tmp = (x); \ ++ (x) = (y); \ ++ (y) = __tmp; \ ++ } ++ ++#define DS_REF_MAX (1ULL << 62) ++ ++#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE ++ ++#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) ++ ++ ++/* ++ * Figure out how much of this delta should be propogated to the dsl_dir ++ * layer. If there's a refreservation, that space has already been ++ * partially accounted for in our ancestors. ++ */ ++static int64_t ++parent_delta(dsl_dataset_t *ds, int64_t delta) ++{ ++ uint64_t old_bytes, new_bytes; ++ ++ if (ds->ds_reserved == 0) ++ return (delta); ++ ++ old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); ++ new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved); ++ ++ ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); ++ return (new_bytes - old_bytes); ++} ++ ++void ++dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) ++{ ++ int used, compressed, uncompressed; ++ int64_t delta; ++ ++ used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); ++ compressed = BP_GET_PSIZE(bp); ++ uncompressed = BP_GET_UCSIZE(bp); ++ ++ dprintf_bp(bp, "ds=%p", ds); ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ /* It could have been compressed away to nothing */ ++ if (BP_IS_HOLE(bp)) ++ return; ++ ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); ++ ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES); ++ if (ds == NULL) { ++ /* ++ * Account for the meta-objset space in its placeholder ++ * dsl_dir. ++ */ ++ ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ ++ dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, ++ used, compressed, uncompressed, tx); ++ dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); ++ return; ++ } ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ ++ mutex_enter(&ds->ds_dir->dd_lock); ++ mutex_enter(&ds->ds_lock); ++ delta = parent_delta(ds, used); ++ ds->ds_phys->ds_used_bytes += used; ++ ds->ds_phys->ds_compressed_bytes += compressed; ++ ds->ds_phys->ds_uncompressed_bytes += uncompressed; ++ ds->ds_phys->ds_unique_bytes += used; ++ mutex_exit(&ds->ds_lock); ++ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, ++ compressed, uncompressed, tx); ++ dsl_dir_transfer_space(ds->ds_dir, used - delta, ++ DD_USED_REFRSRV, DD_USED_HEAD, tx); ++ mutex_exit(&ds->ds_dir->dd_lock); ++} ++ ++int ++dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ++ boolean_t async) ++{ ++ int used, compressed, uncompressed; ++ ++ if (BP_IS_HOLE(bp)) ++ return (0); ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ASSERT(bp->blk_birth <= tx->tx_txg); ++ ++ used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); ++ compressed = BP_GET_PSIZE(bp); ++ uncompressed = BP_GET_UCSIZE(bp); ++ ++ ASSERT(used > 0); ++ if (ds == NULL) { ++ /* ++ * Account for the meta-objset space in its placeholder ++ * dataset. 
++ */ ++ dsl_free(tx->tx_pool, tx->tx_txg, bp); ++ ++ dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, ++ -used, -compressed, -uncompressed, tx); ++ dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); ++ return (used); ++ } ++ ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); ++ ++ ASSERT(!dsl_dataset_is_snapshot(ds)); ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ ++ if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { ++ int64_t delta; ++ ++ dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); ++ dsl_free(tx->tx_pool, tx->tx_txg, bp); ++ ++ mutex_enter(&ds->ds_dir->dd_lock); ++ mutex_enter(&ds->ds_lock); ++ ASSERT(ds->ds_phys->ds_unique_bytes >= used || ++ !DS_UNIQUE_IS_ACCURATE(ds)); ++ delta = parent_delta(ds, -used); ++ ds->ds_phys->ds_unique_bytes -= used; ++ mutex_exit(&ds->ds_lock); ++ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, ++ delta, -compressed, -uncompressed, tx); ++ dsl_dir_transfer_space(ds->ds_dir, -used - delta, ++ DD_USED_REFRSRV, DD_USED_HEAD, tx); ++ mutex_exit(&ds->ds_dir->dd_lock); ++ } else { ++ dprintf_bp(bp, "putting on dead list: %s", ""); ++ if (async) { ++ /* ++ * We are here as part of zio's write done callback, ++ * which means we're a zio interrupt thread. We can't ++ * call dsl_deadlist_insert() now because it may block ++ * waiting for I/O. Instead, put bp on the deferred ++ * queue and let dsl_pool_sync() finish the job. ++ */ ++ bplist_append(&ds->ds_pending_deadlist, bp); ++ } else { ++ dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); ++ } ++ ASSERT3U(ds->ds_prev->ds_object, ==, ++ ds->ds_phys->ds_prev_snap_obj); ++ ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); ++ /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ ++ if (ds->ds_prev->ds_phys->ds_next_snap_obj == ++ ds->ds_object && bp->blk_birth > ++ ds->ds_prev->ds_phys->ds_prev_snap_txg) { ++ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); ++ mutex_enter(&ds->ds_prev->ds_lock); ++ ds->ds_prev->ds_phys->ds_unique_bytes += used; ++ mutex_exit(&ds->ds_prev->ds_lock); ++ } ++ if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { ++ dsl_dir_transfer_space(ds->ds_dir, used, ++ DD_USED_HEAD, DD_USED_SNAP, tx); ++ } ++ } ++ mutex_enter(&ds->ds_lock); ++ ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); ++ ds->ds_phys->ds_used_bytes -= used; ++ ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); ++ ds->ds_phys->ds_compressed_bytes -= compressed; ++ ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); ++ ds->ds_phys->ds_uncompressed_bytes -= uncompressed; ++ mutex_exit(&ds->ds_lock); ++ ++ return (used); ++} ++ ++uint64_t ++dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) ++{ ++ uint64_t trysnap = 0; ++ ++ if (ds == NULL) ++ return (0); ++ /* ++ * The snapshot creation could fail, but that would cause an ++ * incorrect FALSE return, which would only result in an ++ * overestimation of the amount of space that an operation would ++ * consume, which is OK. ++ * ++ * There's also a small window where we could miss a pending ++ * snapshot, because we could set the sync task in the quiescing ++ * phase. So this should only be used as a guess. 
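parent_delta() earlier in this file decides how much of a space delta actually reaches the dsl_dir: with a refreservation the dataset is already charged max(unique_bytes, reserved), so only the change in that maximum propagates, and writes that stay inside the reservation cost the parent nothing. The same arithmetic as a standalone sketch (the wrapper and example numbers are illustrative):

#include <stdint.h>
#include <stdio.h>

static uint64_t
max_u64(uint64_t a, uint64_t b)
{
        return (a > b ? a : b);
}

/* change in max(unique, reserved) when 'delta' bytes are added/removed */
static int64_t
parent_delta(uint64_t unique, uint64_t reserved, int64_t delta)
{
        if (reserved == 0)
                return (delta);
        return ((int64_t)(max_u64(unique + delta, reserved) -
            max_u64(unique, reserved)));
}

int
main(void)
{
        /*
         * 10M reservation, 4M unique: a 2M write stays inside the
         * reservation, so nothing extra is charged to the parent.
         */
        printf("%lld\n", (long long)parent_delta(4 << 20, 10 << 20, 2 << 20));
        /* 9M unique: the same 2M write overshoots the reservation by 1M */
        printf("%lld\n", (long long)parent_delta(9 << 20, 10 << 20, 2 << 20));
        return (0);
}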
++ */ ++ if (ds->ds_trysnap_txg > ++ spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) ++ trysnap = ds->ds_trysnap_txg; ++ return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); ++} ++ ++boolean_t ++dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, ++ uint64_t blk_birth) ++{ ++ if (blk_birth <= dsl_dataset_prev_snap_txg(ds)) ++ return (B_FALSE); ++ ++ ddt_prefetch(dsl_dataset_get_spa(ds), bp); ++ ++ return (B_TRUE); ++} ++ ++/* ARGSUSED */ ++static void ++dsl_dataset_evict(dmu_buf_t *db, void *dsv) ++{ ++ dsl_dataset_t *ds = dsv; ++ ++ ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); ++ ++ unique_remove(ds->ds_fsid_guid); ++ ++ if (ds->ds_objset != NULL) ++ dmu_objset_evict(ds->ds_objset); ++ ++ if (ds->ds_prev) { ++ dsl_dataset_drop_ref(ds->ds_prev, ds); ++ ds->ds_prev = NULL; ++ } ++ ++ bplist_destroy(&ds->ds_pending_deadlist); ++ if (db != NULL) { ++ dsl_deadlist_close(&ds->ds_deadlist); ++ } else { ++ ASSERT(ds->ds_deadlist.dl_dbuf == NULL); ++ ASSERT(!ds->ds_deadlist.dl_oldfmt); ++ } ++ if (ds->ds_dir) ++ dsl_dir_close(ds->ds_dir, ds); ++ ++ ASSERT(!list_link_active(&ds->ds_synced_link)); ++ ++ mutex_destroy(&ds->ds_lock); ++ mutex_destroy(&ds->ds_recvlock); ++ mutex_destroy(&ds->ds_opening_lock); ++ rw_destroy(&ds->ds_rwlock); ++ cv_destroy(&ds->ds_exclusive_cv); ++ ++ kmem_free(ds, sizeof (dsl_dataset_t)); ++} ++ ++static int ++dsl_dataset_get_snapname(dsl_dataset_t *ds) ++{ ++ dsl_dataset_phys_t *headphys; ++ int err; ++ dmu_buf_t *headdbuf; ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ objset_t *mos = dp->dp_meta_objset; ++ ++ if (ds->ds_snapname[0]) ++ return (0); ++ if (ds->ds_phys->ds_next_snap_obj == 0) ++ return (0); ++ ++ err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, ++ FTAG, &headdbuf); ++ if (err) ++ return (err); ++ headphys = headdbuf->db_data; ++ err = zap_value_search(dp->dp_meta_objset, ++ headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); ++ dmu_buf_rele(headdbuf, FTAG); ++ return (err); ++} ++ ++static int ++dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) ++{ ++ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; ++ uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; ++ matchtype_t mt; ++ int err; ++ ++ if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) ++ mt = MT_FIRST; ++ else ++ mt = MT_EXACT; ++ ++ err = zap_lookup_norm(mos, snapobj, name, 8, 1, ++ value, mt, NULL, 0, NULL); ++ if (err == ENOTSUP && mt == MT_FIRST) ++ err = zap_lookup(mos, snapobj, name, 8, 1, value); ++ return (err); ++} ++ ++static int ++dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) ++{ ++ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; ++ uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; ++ matchtype_t mt; ++ int err; ++ ++ dsl_dir_snap_cmtime_update(ds->ds_dir); ++ ++ if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) ++ mt = MT_FIRST; ++ else ++ mt = MT_EXACT; ++ ++ err = zap_remove_norm(mos, snapobj, name, mt, tx); ++ if (err == ENOTSUP && mt == MT_FIRST) ++ err = zap_remove(mos, snapobj, name, tx); ++ return (err); ++} ++ ++static int ++dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ++ dsl_dataset_t **dsp) ++{ ++ objset_t *mos = dp->dp_meta_objset; ++ dmu_buf_t *dbuf; ++ dsl_dataset_t *ds; ++ int err; ++ dmu_object_info_t doi; ++ ++ ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || ++ dsl_pool_sync_context(dp)); ++ ++ err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); ++ if (err) ++ return (err); ++ ++ /* Make sure dsobj has the correct object type. 
*/ ++ dmu_object_info_from_db(dbuf, &doi); ++ if (doi.doi_type != DMU_OT_DSL_DATASET) ++ return (EINVAL); ++ ++ ds = dmu_buf_get_user(dbuf); ++ if (ds == NULL) { ++ dsl_dataset_t *winner = NULL; ++ ++ ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_PUSHPAGE); ++ ds->ds_dbuf = dbuf; ++ ds->ds_object = dsobj; ++ ds->ds_phys = dbuf->db_data; ++ list_link_init(&ds->ds_synced_link); ++ ++ mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ rw_init(&ds->ds_rwlock, NULL, RW_DEFAULT, NULL); ++ cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); ++ ++ bplist_create(&ds->ds_pending_deadlist); ++ dsl_deadlist_open(&ds->ds_deadlist, ++ mos, ds->ds_phys->ds_deadlist_obj); ++ ++ list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), ++ offsetof(dmu_sendarg_t, dsa_link)); ++ ++ if (err == 0) { ++ err = dsl_dir_open_obj(dp, ++ ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); ++ } ++ if (err) { ++ mutex_destroy(&ds->ds_lock); ++ mutex_destroy(&ds->ds_recvlock); ++ mutex_destroy(&ds->ds_opening_lock); ++ rw_destroy(&ds->ds_rwlock); ++ cv_destroy(&ds->ds_exclusive_cv); ++ bplist_destroy(&ds->ds_pending_deadlist); ++ dsl_deadlist_close(&ds->ds_deadlist); ++ kmem_free(ds, sizeof (dsl_dataset_t)); ++ dmu_buf_rele(dbuf, tag); ++ return (err); ++ } ++ ++ if (!dsl_dataset_is_snapshot(ds)) { ++ ds->ds_snapname[0] = '\0'; ++ if (ds->ds_phys->ds_prev_snap_obj) { ++ err = dsl_dataset_get_ref(dp, ++ ds->ds_phys->ds_prev_snap_obj, ++ ds, &ds->ds_prev); ++ } ++ } else { ++ if (zfs_flags & ZFS_DEBUG_SNAPNAMES) ++ err = dsl_dataset_get_snapname(ds); ++ if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) { ++ err = zap_count( ++ ds->ds_dir->dd_pool->dp_meta_objset, ++ ds->ds_phys->ds_userrefs_obj, ++ &ds->ds_userrefs); ++ } ++ } ++ ++ if (err == 0 && !dsl_dataset_is_snapshot(ds)) { ++ /* ++ * In sync context, we're called with either no lock ++ * or with the write lock. If we're not syncing, ++ * we're always called with the read lock held. 
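++	 * Accordingly, the code below takes the read lock itself only when
++	 * we are syncing and the write lock is not already held.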
++ */ ++ boolean_t need_lock = ++ !RW_WRITE_HELD(&dp->dp_config_rwlock) && ++ dsl_pool_sync_context(dp); ++ ++ if (need_lock) ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ ++ err = dsl_prop_get_ds(ds, ++ "refreservation", sizeof (uint64_t), 1, ++ &ds->ds_reserved, NULL); ++ if (err == 0) { ++ err = dsl_prop_get_ds(ds, ++ "refquota", sizeof (uint64_t), 1, ++ &ds->ds_quota, NULL); ++ } ++ ++ if (need_lock) ++ rw_exit(&dp->dp_config_rwlock); ++ } else { ++ ds->ds_reserved = ds->ds_quota = 0; ++ } ++ ++ if (err == 0) { ++ winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, ++ dsl_dataset_evict); ++ } ++ if (err || winner) { ++ bplist_destroy(&ds->ds_pending_deadlist); ++ dsl_deadlist_close(&ds->ds_deadlist); ++ if (ds->ds_prev) ++ dsl_dataset_drop_ref(ds->ds_prev, ds); ++ dsl_dir_close(ds->ds_dir, ds); ++ mutex_destroy(&ds->ds_lock); ++ mutex_destroy(&ds->ds_recvlock); ++ mutex_destroy(&ds->ds_opening_lock); ++ rw_destroy(&ds->ds_rwlock); ++ cv_destroy(&ds->ds_exclusive_cv); ++ kmem_free(ds, sizeof (dsl_dataset_t)); ++ if (err) { ++ dmu_buf_rele(dbuf, tag); ++ return (err); ++ } ++ ds = winner; ++ } else { ++ ds->ds_fsid_guid = ++ unique_insert(ds->ds_phys->ds_fsid_guid); ++ } ++ } ++ ASSERT3P(ds->ds_dbuf, ==, dbuf); ++ ASSERT3P(ds->ds_phys, ==, dbuf->db_data); ++ ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || ++ spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || ++ dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); ++ mutex_enter(&ds->ds_lock); ++ if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { ++ mutex_exit(&ds->ds_lock); ++ dmu_buf_rele(ds->ds_dbuf, tag); ++ return (ENOENT); ++ } ++ mutex_exit(&ds->ds_lock); ++ *dsp = ds; ++ return (0); ++} ++ ++static int ++dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) ++{ ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ ++ /* ++ * In syncing context we don't want the rwlock lock: there ++ * may be an existing writer waiting for sync phase to ++ * finish. We don't need to worry about such writers, since ++ * sync phase is single-threaded, so the writer can't be ++ * doing anything while we are active. ++ */ ++ if (dsl_pool_sync_context(dp)) { ++ ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); ++ return (0); ++ } ++ ++ /* ++ * Normal users will hold the ds_rwlock as a READER until they ++ * are finished (i.e., call dsl_dataset_rele()). "Owners" will ++ * drop their READER lock after they set the ds_owner field. ++ * ++ * If the dataset is being destroyed, the destroy thread will ++ * obtain a WRITER lock for exclusive access after it's done its ++ * open-context work and then change the ds_owner to ++ * dsl_reaper once destruction is assured. So threads ++ * may block here temporarily, until the "destructability" of ++ * the dataset is determined. ++ */ ++ ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); ++ mutex_enter(&ds->ds_lock); ++ while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { ++ rw_exit(&dp->dp_config_rwlock); ++ cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); ++ if (DSL_DATASET_IS_DESTROYED(ds)) { ++ mutex_exit(&ds->ds_lock); ++ dsl_dataset_drop_ref(ds, tag); ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ return (ENOENT); ++ } ++ /* ++ * The dp_config_rwlock lives above the ds_lock. And ++ * we need to check DSL_DATASET_IS_DESTROYED() while ++ * holding the ds_lock, so we have to drop and reacquire ++ * the ds_lock here. 
++ */ ++ mutex_exit(&ds->ds_lock); ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ mutex_enter(&ds->ds_lock); ++ } ++ mutex_exit(&ds->ds_lock); ++ return (0); ++} ++ ++int ++dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ++ dsl_dataset_t **dsp) ++{ ++ int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); ++ ++ if (err) ++ return (err); ++ return (dsl_dataset_hold_ref(*dsp, tag)); ++} ++ ++int ++dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok, ++ void *tag, dsl_dataset_t **dsp) ++{ ++ int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); ++ if (err) ++ return (err); ++ if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { ++ dsl_dataset_rele(*dsp, tag); ++ *dsp = NULL; ++ return (EBUSY); ++ } ++ return (0); ++} ++ ++int ++dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) ++{ ++ dsl_dir_t *dd; ++ dsl_pool_t *dp; ++ const char *snapname; ++ uint64_t obj; ++ int err = 0; ++ ++ err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); ++ if (err) ++ return (err); ++ ++ dp = dd->dd_pool; ++ obj = dd->dd_phys->dd_head_dataset_obj; ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ if (obj) ++ err = dsl_dataset_get_ref(dp, obj, tag, dsp); ++ else ++ err = ENOENT; ++ if (err) ++ goto out; ++ ++ err = dsl_dataset_hold_ref(*dsp, tag); ++ ++ /* we may be looking for a snapshot */ ++ if (err == 0 && snapname != NULL) { ++ dsl_dataset_t *ds = NULL; ++ ++ if (*snapname++ != '@') { ++ dsl_dataset_rele(*dsp, tag); ++ err = ENOENT; ++ goto out; ++ } ++ ++ dprintf("looking for snapshot '%s'\n", snapname); ++ err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); ++ if (err == 0) ++ err = dsl_dataset_get_ref(dp, obj, tag, &ds); ++ dsl_dataset_rele(*dsp, tag); ++ ++ ASSERT3U((err == 0), ==, (ds != NULL)); ++ ++ if (ds) { ++ mutex_enter(&ds->ds_lock); ++ if (ds->ds_snapname[0] == 0) ++ (void) strlcpy(ds->ds_snapname, snapname, ++ sizeof (ds->ds_snapname)); ++ mutex_exit(&ds->ds_lock); ++ err = dsl_dataset_hold_ref(ds, tag); ++ *dsp = err ? NULL : ds; ++ } ++ } ++out: ++ rw_exit(&dp->dp_config_rwlock); ++ dsl_dir_close(dd, FTAG); ++ return (err); ++} ++ ++int ++dsl_dataset_own(const char *name, boolean_t inconsistentok, ++ void *tag, dsl_dataset_t **dsp) ++{ ++ int err = dsl_dataset_hold(name, tag, dsp); ++ if (err) ++ return (err); ++ if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { ++ dsl_dataset_rele(*dsp, tag); ++ return (EBUSY); ++ } ++ return (0); ++} ++ ++void ++dsl_dataset_name(dsl_dataset_t *ds, char *name) ++{ ++ if (ds == NULL) { ++ (void) strcpy(name, "mos"); ++ } else { ++ dsl_dir_name(ds->ds_dir, name); ++ VERIFY(0 == dsl_dataset_get_snapname(ds)); ++ if (ds->ds_snapname[0]) { ++ (void) strcat(name, "@"); ++ /* ++ * We use a "recursive" mutex so that we ++ * can call dprintf_ds() with ds_lock held. 
++ */ ++ if (!MUTEX_HELD(&ds->ds_lock)) { ++ mutex_enter(&ds->ds_lock); ++ (void) strcat(name, ds->ds_snapname); ++ mutex_exit(&ds->ds_lock); ++ } else { ++ (void) strcat(name, ds->ds_snapname); ++ } ++ } ++ } ++} ++ ++static int ++dsl_dataset_namelen(dsl_dataset_t *ds) ++{ ++ int result; ++ ++ if (ds == NULL) { ++ result = 3; /* "mos" */ ++ } else { ++ result = dsl_dir_namelen(ds->ds_dir); ++ VERIFY(0 == dsl_dataset_get_snapname(ds)); ++ if (ds->ds_snapname[0]) { ++ ++result; /* adding one for the @-sign */ ++ if (!MUTEX_HELD(&ds->ds_lock)) { ++ mutex_enter(&ds->ds_lock); ++ result += strlen(ds->ds_snapname); ++ mutex_exit(&ds->ds_lock); ++ } else { ++ result += strlen(ds->ds_snapname); ++ } ++ } ++ } ++ ++ return (result); ++} ++ ++void ++dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) ++{ ++ dmu_buf_rele(ds->ds_dbuf, tag); ++} ++ ++void ++dsl_dataset_rele(dsl_dataset_t *ds, void *tag) ++{ ++ if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { ++ rw_exit(&ds->ds_rwlock); ++ } ++ dsl_dataset_drop_ref(ds, tag); ++} ++ ++void ++dsl_dataset_disown(dsl_dataset_t *ds, void *tag) ++{ ++ ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || ++ (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); ++ ++ mutex_enter(&ds->ds_lock); ++ ds->ds_owner = NULL; ++ if (RW_WRITE_HELD(&ds->ds_rwlock)) { ++ rw_exit(&ds->ds_rwlock); ++ cv_broadcast(&ds->ds_exclusive_cv); ++ } ++ mutex_exit(&ds->ds_lock); ++ if (ds->ds_dbuf) ++ dsl_dataset_drop_ref(ds, tag); ++ else ++ dsl_dataset_evict(NULL, ds); ++} ++ ++boolean_t ++dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) ++{ ++ boolean_t gotit = FALSE; ++ ++ mutex_enter(&ds->ds_lock); ++ if (ds->ds_owner == NULL && ++ (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { ++ ds->ds_owner = tag; ++ if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) ++ rw_exit(&ds->ds_rwlock); ++ gotit = TRUE; ++ } ++ mutex_exit(&ds->ds_lock); ++ return (gotit); ++} ++ ++void ++dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) ++{ ++ ASSERT3P(owner, ==, ds->ds_owner); ++ if (!RW_WRITE_HELD(&ds->ds_rwlock)) ++ rw_enter(&ds->ds_rwlock, RW_WRITER); ++} ++ ++uint64_t ++dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, ++ uint64_t flags, dmu_tx_t *tx) ++{ ++ dsl_pool_t *dp = dd->dd_pool; ++ dmu_buf_t *dbuf; ++ dsl_dataset_phys_t *dsphys; ++ uint64_t dsobj; ++ objset_t *mos = dp->dp_meta_objset; ++ ++ if (origin == NULL) ++ origin = dp->dp_origin_snap; ++ ++ ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); ++ ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0); ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); ++ ++ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, ++ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); ++ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); ++ dmu_buf_will_dirty(dbuf, tx); ++ dsphys = dbuf->db_data; ++ bzero(dsphys, sizeof (dsl_dataset_phys_t)); ++ dsphys->ds_dir_obj = dd->dd_object; ++ dsphys->ds_flags = flags; ++ dsphys->ds_fsid_guid = unique_create(); ++ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, ++ sizeof (dsphys->ds_guid)); ++ dsphys->ds_snapnames_zapobj = ++ zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, ++ DMU_OT_NONE, 0, tx); ++ dsphys->ds_creation_time = gethrestime_sec(); ++ dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 
1 : tx->tx_txg; ++ ++ if (origin == NULL) { ++ dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); ++ } else { ++ dsl_dataset_t *ohds; ++ ++ dsphys->ds_prev_snap_obj = origin->ds_object; ++ dsphys->ds_prev_snap_txg = ++ origin->ds_phys->ds_creation_txg; ++ dsphys->ds_used_bytes = ++ origin->ds_phys->ds_used_bytes; ++ dsphys->ds_compressed_bytes = ++ origin->ds_phys->ds_compressed_bytes; ++ dsphys->ds_uncompressed_bytes = ++ origin->ds_phys->ds_uncompressed_bytes; ++ dsphys->ds_bp = origin->ds_phys->ds_bp; ++ dsphys->ds_flags |= origin->ds_phys->ds_flags; ++ ++ dmu_buf_will_dirty(origin->ds_dbuf, tx); ++ origin->ds_phys->ds_num_children++; ++ ++ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, ++ origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds)); ++ dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, ++ dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); ++ dsl_dataset_rele(ohds, FTAG); ++ ++ if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { ++ if (origin->ds_phys->ds_next_clones_obj == 0) { ++ origin->ds_phys->ds_next_clones_obj = ++ zap_create(mos, ++ DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); ++ } ++ VERIFY(0 == zap_add_int(mos, ++ origin->ds_phys->ds_next_clones_obj, ++ dsobj, tx)); ++ } ++ ++ dmu_buf_will_dirty(dd->dd_dbuf, tx); ++ dd->dd_phys->dd_origin_obj = origin->ds_object; ++ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { ++ if (origin->ds_dir->dd_phys->dd_clones == 0) { ++ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); ++ origin->ds_dir->dd_phys->dd_clones = ++ zap_create(mos, ++ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); ++ } ++ VERIFY3U(0, ==, zap_add_int(mos, ++ origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); ++ } ++ } ++ ++ if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) ++ dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; ++ ++ dmu_buf_rele(dbuf, FTAG); ++ ++ dmu_buf_will_dirty(dd->dd_dbuf, tx); ++ dd->dd_phys->dd_head_dataset_obj = dsobj; ++ ++ return (dsobj); ++} ++ ++uint64_t ++dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, ++ dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) ++{ ++ dsl_pool_t *dp = pdd->dd_pool; ++ uint64_t dsobj, ddobj; ++ dsl_dir_t *dd; ++ ++ ASSERT(lastname[0] != '@'); ++ ++ ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); ++ VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); ++ ++ dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); ++ ++ dsl_deleg_set_create_perms(dd, tx, cr); ++ ++ dsl_dir_close(dd, FTAG); ++ ++ /* ++ * If we are creating a clone, make sure we zero out any stale ++ * data from the origin snapshots zil header. ++ */ ++ if (origin != NULL) { ++ dsl_dataset_t *ds; ++ objset_t *os; ++ ++ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); ++ VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); ++ bzero(&os->os_zil_header, sizeof (os->os_zil_header)); ++ dsl_dataset_dirty(ds, tx); ++ dsl_dataset_rele(ds, FTAG); ++ } ++ ++ return (dsobj); ++} ++ ++/* ++ * The snapshots must all be in the same pool. 
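++ * On failure, the name of the snapshot that triggered the error is
++ * copied into 'failed'.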
++ */ ++int ++dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed) ++{ ++ int err; ++ dsl_sync_task_t *dst; ++ spa_t *spa; ++ nvpair_t *pair; ++ dsl_sync_task_group_t *dstg; ++ ++ pair = nvlist_next_nvpair(snaps, NULL); ++ if (pair == NULL) ++ return (0); ++ ++ err = spa_open(nvpair_name(pair), &spa, FTAG); ++ if (err) ++ return (err); ++ dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ++ ++ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; ++ pair = nvlist_next_nvpair(snaps, pair)) { ++ dsl_dataset_t *ds; ++ int err; ++ ++ err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); ++ if (err == 0) { ++ struct dsl_ds_destroyarg *dsda; ++ ++ dsl_dataset_make_exclusive(ds, dstg); ++ dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), ++ KM_SLEEP); ++ dsda->ds = ds; ++ dsda->defer = defer; ++ dsl_sync_task_create(dstg, dsl_dataset_destroy_check, ++ dsl_dataset_destroy_sync, dsda, dstg, 0); ++ } else if (err == ENOENT) { ++ err = 0; ++ } else { ++ (void) strcpy(failed, nvpair_name(pair)); ++ break; ++ } ++ } ++ ++ if (err == 0) ++ err = dsl_sync_task_group_wait(dstg); ++ ++ for (dst = list_head(&dstg->dstg_tasks); dst; ++ dst = list_next(&dstg->dstg_tasks, dst)) { ++ struct dsl_ds_destroyarg *dsda = dst->dst_arg1; ++ dsl_dataset_t *ds = dsda->ds; ++ ++ /* ++ * Return the file system name that triggered the error ++ */ ++ if (dst->dst_err) { ++ dsl_dataset_name(ds, failed); ++ } ++ ASSERT3P(dsda->rm_origin, ==, NULL); ++ dsl_dataset_disown(ds, dstg); ++ kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); ++ } ++ ++ dsl_sync_task_group_destroy(dstg); ++ spa_close(spa, FTAG); ++ return (err); ++ ++} ++ ++static boolean_t ++dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) ++{ ++ boolean_t might_destroy = B_FALSE; ++ ++ mutex_enter(&ds->ds_lock); ++ if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && ++ DS_IS_DEFER_DESTROY(ds)) ++ might_destroy = B_TRUE; ++ mutex_exit(&ds->ds_lock); ++ ++ return (might_destroy); ++} ++ ++/* ++ * If we're removing a clone, and these three conditions are true: ++ * 1) the clone's origin has no other children ++ * 2) the clone's origin has no user references ++ * 3) the clone's origin has been marked for deferred destruction ++ * Then, prepare to remove the origin as part of this sync task group. ++ */ ++static int ++dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) ++{ ++ dsl_dataset_t *ds = dsda->ds; ++ dsl_dataset_t *origin = ds->ds_prev; ++ ++ if (dsl_dataset_might_destroy_origin(origin)) { ++ char *name; ++ int namelen; ++ int error; ++ ++ namelen = dsl_dataset_namelen(origin) + 1; ++ name = kmem_alloc(namelen, KM_SLEEP); ++ dsl_dataset_name(origin, name); ++#ifdef _KERNEL ++ error = zfs_unmount_snap(name, NULL); ++ if (error) { ++ kmem_free(name, namelen); ++ return (error); ++ } ++#endif ++ error = dsl_dataset_own(name, B_TRUE, tag, &origin); ++ kmem_free(name, namelen); ++ if (error) ++ return (error); ++ dsda->rm_origin = origin; ++ dsl_dataset_make_exclusive(origin, tag); ++ } ++ ++ return (0); ++} ++ ++/* ++ * ds must be opened as OWNER. On return (whether successful or not), ++ * ds will be closed and caller can no longer dereference it. 
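++ * Snapshots are destroyed in a single sync task. For a head dataset,
++ * most objects are freed in open context first so that the final sync
++ * task has less to do.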
++ */ ++int ++dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) ++{ ++ int err; ++ dsl_sync_task_group_t *dstg; ++ objset_t *os; ++ dsl_dir_t *dd; ++ uint64_t obj; ++ struct dsl_ds_destroyarg dsda = { 0 }; ++ dsl_dataset_t *dummy_ds; ++ ++ dsda.ds = ds; ++ ++ if (dsl_dataset_is_snapshot(ds)) { ++ /* Destroying a snapshot is simpler */ ++ dsl_dataset_make_exclusive(ds, tag); ++ ++ dsda.defer = defer; ++ err = dsl_sync_task_do(ds->ds_dir->dd_pool, ++ dsl_dataset_destroy_check, dsl_dataset_destroy_sync, ++ &dsda, tag, 0); ++ ASSERT3P(dsda.rm_origin, ==, NULL); ++ goto out; ++ } else if (defer) { ++ err = EINVAL; ++ goto out; ++ } ++ ++ dd = ds->ds_dir; ++ dummy_ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ++ dummy_ds->ds_dir = dd; ++ dummy_ds->ds_object = ds->ds_object; ++ ++ /* ++ * Check for errors and mark this ds as inconsistent, in ++ * case we crash while freeing the objects. ++ */ ++ err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, ++ dsl_dataset_destroy_begin_sync, ds, NULL, 0); ++ if (err) ++ goto out_free; ++ ++ err = dmu_objset_from_ds(ds, &os); ++ if (err) ++ goto out_free; ++ ++ /* ++ * remove the objects in open context, so that we won't ++ * have too much to do in syncing context. ++ */ ++ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, ++ ds->ds_phys->ds_prev_snap_txg)) { ++ /* ++ * Ignore errors, if there is not enough disk space ++ * we will deal with it in dsl_dataset_destroy_sync(). ++ */ ++ (void) dmu_free_object(os, obj); ++ } ++ if (err != ESRCH) ++ goto out_free; ++ ++ /* ++ * Only the ZIL knows how to free log blocks. ++ */ ++ zil_destroy(dmu_objset_zil(os), B_FALSE); ++ ++ /* ++ * Sync out all in-flight IO. ++ */ ++ txg_wait_synced(dd->dd_pool, 0); ++ ++ /* ++ * If we managed to free all the objects in open ++ * context, the user space accounting should be zero. ++ */ ++ if (ds->ds_phys->ds_bp.blk_fill == 0 && ++ dmu_objset_userused_enabled(os)) { ++ ASSERTV(uint64_t count); ++ ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || ++ count == 0); ++ ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || ++ count == 0); ++ } ++ ++ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); ++ err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); ++ rw_exit(&dd->dd_pool->dp_config_rwlock); ++ ++ if (err) ++ goto out_free; ++ ++ /* ++ * Blow away the dsl_dir + head dataset. ++ */ ++ dsl_dataset_make_exclusive(ds, tag); ++ /* ++ * If we're removing a clone, we might also need to remove its ++ * origin. ++ */ ++ do { ++ dsda.need_prep = B_FALSE; ++ if (dsl_dir_is_clone(dd)) { ++ err = dsl_dataset_origin_rm_prep(&dsda, tag); ++ if (err) { ++ dsl_dir_close(dd, FTAG); ++ goto out_free; ++ } ++ } ++ ++ dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); ++ dsl_sync_task_create(dstg, dsl_dataset_destroy_check, ++ dsl_dataset_destroy_sync, &dsda, tag, 0); ++ dsl_sync_task_create(dstg, dsl_dir_destroy_check, ++ dsl_dir_destroy_sync, dummy_ds, FTAG, 0); ++ err = dsl_sync_task_group_wait(dstg); ++ dsl_sync_task_group_destroy(dstg); ++ ++ /* ++ * We could be racing against 'zfs release' or 'zfs destroy -d' ++ * on the origin snap, in which case we can get EBUSY if we ++ * needed to destroy the origin snap but were not ready to ++ * do so. 
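++	 * When that happens, need_prep is set and the loop below retries
++	 * after preparing the origin for removal.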
++ */ ++ if (dsda.need_prep) { ++ ASSERT(err == EBUSY); ++ ASSERT(dsl_dir_is_clone(dd)); ++ ASSERT(dsda.rm_origin == NULL); ++ } ++ } while (dsda.need_prep); ++ ++ if (dsda.rm_origin != NULL) ++ dsl_dataset_disown(dsda.rm_origin, tag); ++ ++ /* if it is successful, dsl_dir_destroy_sync will close the dd */ ++ if (err) ++ dsl_dir_close(dd, FTAG); ++ ++out_free: ++ kmem_free(dummy_ds, sizeof (dsl_dataset_t)); ++out: ++ dsl_dataset_disown(ds, tag); ++ return (err); ++} ++ ++blkptr_t * ++dsl_dataset_get_blkptr(dsl_dataset_t *ds) ++{ ++ return (&ds->ds_phys->ds_bp); ++} ++ ++void ++dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) ++{ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ /* If it's the meta-objset, set dp_meta_rootbp */ ++ if (ds == NULL) { ++ tx->tx_pool->dp_meta_rootbp = *bp; ++ } else { ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ ds->ds_phys->ds_bp = *bp; ++ } ++} ++ ++spa_t * ++dsl_dataset_get_spa(dsl_dataset_t *ds) ++{ ++ return (ds->ds_dir->dd_pool->dp_spa); ++} ++ ++void ++dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) ++{ ++ dsl_pool_t *dp; ++ ++ if (ds == NULL) /* this is the meta-objset */ ++ return; ++ ++ ASSERT(ds->ds_objset != NULL); ++ ++ if (ds->ds_phys->ds_next_snap_obj != 0) ++ panic("dirtying snapshot!"); ++ ++ dp = ds->ds_dir->dd_pool; ++ ++ if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { ++ /* up the hold count until we can be written out */ ++ dmu_buf_add_ref(ds->ds_dbuf, ds); ++ } ++} ++ ++boolean_t ++dsl_dataset_is_dirty(dsl_dataset_t *ds) ++{ ++ int t; ++ ++ for (t = 0; t < TXG_SIZE; t++) { ++ if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, ++ ds, t)) ++ return (B_TRUE); ++ } ++ return (B_FALSE); ++} ++ ++/* ++ * The unique space in the head dataset can be calculated by subtracting ++ * the space used in the most recent snapshot, that is still being used ++ * in this file system, from the space currently in use. To figure out ++ * the space in the most recent snapshot still in use, we need to take ++ * the total space used in the snapshot and subtract out the space that ++ * has been freed up since the snapshot was taken. ++ */ ++static void ++dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) ++{ ++ uint64_t mrs_used; ++ uint64_t dlused, dlcomp, dluncomp; ++ ++ ASSERT(!dsl_dataset_is_snapshot(ds)); ++ ++ if (ds->ds_phys->ds_prev_snap_obj != 0) ++ mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; ++ else ++ mrs_used = 0; ++ ++ dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); ++ ++ ASSERT3U(dlused, <=, mrs_used); ++ ds->ds_phys->ds_unique_bytes = ++ ds->ds_phys->ds_used_bytes - (mrs_used - dlused); ++ ++ if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= ++ SPA_VERSION_UNIQUE_ACCURATE) ++ ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; ++} ++ ++struct killarg { ++ dsl_dataset_t *ds; ++ dmu_tx_t *tx; ++}; ++ ++/* ARGSUSED */ ++static int ++kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, ++ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) ++{ ++ struct killarg *ka = arg; ++ dmu_tx_t *tx = ka->tx; ++ ++ if (bp == NULL) ++ return (0); ++ ++ if (zb->zb_level == ZB_ZIL_LEVEL) { ++ ASSERT(zilog != NULL); ++ /* ++ * It's a block in the intent log. It has no ++ * accounting, so just free it. 
++ */ ++ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); ++ } else { ++ ASSERT(zilog == NULL); ++ ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); ++ (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); ++ } ++ ++ return (0); ++} ++ ++/* ARGSUSED */ ++static int ++dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; ++ uint64_t count; ++ int err; ++ ++ /* ++ * Can't delete a head dataset if there are snapshots of it. ++ * (Except if the only snapshots are from the branch we cloned ++ * from.) ++ */ ++ if (ds->ds_prev != NULL && ++ ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) ++ return (EBUSY); ++ ++ /* ++ * This is really a dsl_dir thing, but check it here so that ++ * we'll be less likely to leave this dataset inconsistent & ++ * nearly destroyed. ++ */ ++ err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); ++ if (err) ++ return (err); ++ if (count != 0) ++ return (EEXIST); ++ ++ return (0); ++} ++ ++/* ARGSUSED */ ++static void ++dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ ++ /* Mark it as inconsistent on-disk, in case we crash */ ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; ++ ++ spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, ++ "dataset = %llu", ds->ds_object); ++} ++ ++static int ++dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, ++ dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = dsda->ds; ++ dsl_dataset_t *ds_prev = ds->ds_prev; ++ ++ if (dsl_dataset_might_destroy_origin(ds_prev)) { ++ struct dsl_ds_destroyarg ndsda = {0}; ++ ++ /* ++ * If we're not prepared to remove the origin, don't remove ++ * the clone either. ++ */ ++ if (dsda->rm_origin == NULL) { ++ dsda->need_prep = B_TRUE; ++ return (EBUSY); ++ } ++ ++ ndsda.ds = ds_prev; ++ ndsda.is_origin_rm = B_TRUE; ++ return (dsl_dataset_destroy_check(&ndsda, tag, tx)); ++ } ++ ++ /* ++ * If we're not going to remove the origin after all, ++ * undo the open context setup. ++ */ ++ if (dsda->rm_origin != NULL) { ++ dsl_dataset_disown(dsda->rm_origin, tag); ++ dsda->rm_origin = NULL; ++ } ++ ++ return (0); ++} ++ ++/* ++ * If you add new checks here, you may need to add ++ * additional checks to the "temporary" case in ++ * snapshot_check() in dmu_objset.c. ++ */ ++/* ARGSUSED */ ++int ++dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ struct dsl_ds_destroyarg *dsda = arg1; ++ dsl_dataset_t *ds = dsda->ds; ++ ++ /* we have an owner hold, so noone else can destroy us */ ++ ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); ++ ++ /* ++ * Only allow deferred destroy on pools that support it. ++ * NOTE: deferred destroy is only supported on snapshots. ++ */ ++ if (dsda->defer) { ++ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < ++ SPA_VERSION_USERREFS) ++ return (ENOTSUP); ++ ASSERT(dsl_dataset_is_snapshot(ds)); ++ return (0); ++ } ++ ++ /* ++ * Can't delete a head dataset if there are snapshots of it. ++ * (Except if the only snapshots are from the branch we cloned ++ * from.) ++ */ ++ if (ds->ds_prev != NULL && ++ ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) ++ return (EBUSY); ++ ++ /* ++ * If we made changes this txg, traverse_dsl_dataset won't find ++ * them. Try again. 
++ */ ++ if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) ++ return (EAGAIN); ++ ++ if (dsl_dataset_is_snapshot(ds)) { ++ /* ++ * If this snapshot has an elevated user reference count, ++ * we can't destroy it yet. ++ */ ++ if (ds->ds_userrefs > 0 && !dsda->releasing) ++ return (EBUSY); ++ ++ mutex_enter(&ds->ds_lock); ++ /* ++ * Can't delete a branch point. However, if we're destroying ++ * a clone and removing its origin due to it having a user ++ * hold count of 0 and having been marked for deferred destroy, ++ * it's OK for the origin to have a single clone. ++ */ ++ if (ds->ds_phys->ds_num_children > ++ (dsda->is_origin_rm ? 2 : 1)) { ++ mutex_exit(&ds->ds_lock); ++ return (EEXIST); ++ } ++ mutex_exit(&ds->ds_lock); ++ } else if (dsl_dir_is_clone(ds->ds_dir)) { ++ return (dsl_dataset_origin_check(dsda, arg2, tx)); ++ } ++ ++ /* XXX we should do some i/o error checking... */ ++ return (0); ++} ++ ++struct refsarg { ++ kmutex_t lock; ++ boolean_t gone; ++ kcondvar_t cv; ++}; ++ ++/* ARGSUSED */ ++static void ++dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) ++{ ++ struct refsarg *arg = argv; ++ ++ mutex_enter(&arg->lock); ++ arg->gone = TRUE; ++ cv_signal(&arg->cv); ++ mutex_exit(&arg->lock); ++} ++ ++static void ++dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) ++{ ++ struct refsarg arg; ++ ++ mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); ++ cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); ++ arg.gone = FALSE; ++ (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, ++ dsl_dataset_refs_gone); ++ dmu_buf_rele(ds->ds_dbuf, tag); ++ mutex_enter(&arg.lock); ++ while (!arg.gone) ++ cv_wait(&arg.cv, &arg.lock); ++ ASSERT(arg.gone); ++ mutex_exit(&arg.lock); ++ ds->ds_dbuf = NULL; ++ ds->ds_phys = NULL; ++ mutex_destroy(&arg.lock); ++ cv_destroy(&arg.cv); ++} ++ ++static void ++remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) ++{ ++ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; ++ int err; ++ ASSERTV(uint64_t count); ++ ++ ASSERT(ds->ds_phys->ds_num_children >= 2); ++ err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); ++ /* ++ * The err should not be ENOENT, but a bug in a previous version ++ * of the code could cause upgrade_clones_cb() to not set ++ * ds_next_snap_obj when it should, leading to a missing entry. ++ * If we knew that the pool was created after ++ * SPA_VERSION_NEXT_CLONES, we could assert that it isn't ++ * ENOENT. However, at least we can check that we don't have ++ * too many entries in the next_clones_obj even after failing to ++ * remove this one. ++ */ ++ if (err != ENOENT) { ++ VERIFY3U(err, ==, 0); ++ } ++ ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, ++ &count)); ++ ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); ++} ++ ++static void ++dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) ++{ ++ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ ++ /* ++ * If it is the old version, dd_clones doesn't exist so we can't ++ * find the clones, but deadlist_remove_key() is a no-op so it ++ * doesn't matter. 
++ */ ++ if (ds->ds_dir->dd_phys->dd_clones == 0) ++ return; ++ ++ for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ zap_cursor_advance(&zc)) { ++ dsl_dataset_t *clone; ++ ++ VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool, ++ za.za_first_integer, FTAG, &clone)); ++ if (clone->ds_dir->dd_origin_txg > mintxg) { ++ dsl_deadlist_remove_key(&clone->ds_deadlist, ++ mintxg, tx); ++ dsl_dataset_remove_clones_key(clone, mintxg, tx); ++ } ++ dsl_dataset_rele(clone, FTAG); ++ } ++ zap_cursor_fini(&zc); ++} ++ ++struct process_old_arg { ++ dsl_dataset_t *ds; ++ dsl_dataset_t *ds_prev; ++ boolean_t after_branch_point; ++ zio_t *pio; ++ uint64_t used, comp, uncomp; ++}; ++ ++static int ++process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) ++{ ++ struct process_old_arg *poa = arg; ++ dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; ++ ++ if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { ++ dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); ++ if (poa->ds_prev && !poa->after_branch_point && ++ bp->blk_birth > ++ poa->ds_prev->ds_phys->ds_prev_snap_txg) { ++ poa->ds_prev->ds_phys->ds_unique_bytes += ++ bp_get_dsize_sync(dp->dp_spa, bp); ++ } ++ } else { ++ poa->used += bp_get_dsize_sync(dp->dp_spa, bp); ++ poa->comp += BP_GET_PSIZE(bp); ++ poa->uncomp += BP_GET_UCSIZE(bp); ++ dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); ++ } ++ return (0); ++} ++ ++static void ++process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, ++ dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) ++{ ++ struct process_old_arg poa = { 0 }; ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ objset_t *mos = dp->dp_meta_objset; ++ ++ ASSERT(ds->ds_deadlist.dl_oldfmt); ++ ASSERT(ds_next->ds_deadlist.dl_oldfmt); ++ ++ poa.ds = ds; ++ poa.ds_prev = ds_prev; ++ poa.after_branch_point = after_branch_point; ++ poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); ++ VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, ++ process_old_cb, &poa, tx)); ++ VERIFY3U(zio_wait(poa.pio), ==, 0); ++ ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); ++ ++ /* change snapused */ ++ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, ++ -poa.used, -poa.comp, -poa.uncomp, tx); ++ ++ /* swap next's deadlist to our deadlist */ ++ dsl_deadlist_close(&ds->ds_deadlist); ++ dsl_deadlist_close(&ds_next->ds_deadlist); ++ SWITCH64(ds_next->ds_phys->ds_deadlist_obj, ++ ds->ds_phys->ds_deadlist_obj); ++ dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); ++ dsl_deadlist_open(&ds_next->ds_deadlist, mos, ++ ds_next->ds_phys->ds_deadlist_obj); ++} ++ ++void ++dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) ++{ ++ struct dsl_ds_destroyarg *dsda = arg1; ++ dsl_dataset_t *ds = dsda->ds; ++ int err; ++ int after_branch_point = FALSE; ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ objset_t *mos = dp->dp_meta_objset; ++ dsl_dataset_t *ds_prev = NULL; ++ boolean_t wont_destroy; ++ uint64_t obj; ++ ++ wont_destroy = (dsda->defer && ++ (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)); ++ ++ ASSERT(ds->ds_owner || wont_destroy); ++ ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); ++ ASSERT(ds->ds_prev == NULL || ++ ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ++ ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); ++ ++ if (wont_destroy) { ++ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; ++ return; ++ 
} ++ ++ /* signal any waiters that this dataset is going away */ ++ mutex_enter(&ds->ds_lock); ++ ds->ds_owner = dsl_reaper; ++ cv_broadcast(&ds->ds_exclusive_cv); ++ mutex_exit(&ds->ds_lock); ++ ++ /* Remove our reservation */ ++ if (ds->ds_reserved != 0) { ++ dsl_prop_setarg_t psa; ++ uint64_t value = 0; ++ ++ dsl_prop_setarg_init_uint64(&psa, "refreservation", ++ (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), ++ &value); ++ psa.psa_effective_value = 0; /* predict default value */ ++ ++ dsl_dataset_set_reservation_sync(ds, &psa, tx); ++ ASSERT3U(ds->ds_reserved, ==, 0); ++ } ++ ++ ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); ++ ++ dsl_scan_ds_destroyed(ds, tx); ++ ++ obj = ds->ds_object; ++ ++ if (ds->ds_phys->ds_prev_snap_obj != 0) { ++ if (ds->ds_prev) { ++ ds_prev = ds->ds_prev; ++ } else { ++ VERIFY(0 == dsl_dataset_hold_obj(dp, ++ ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); ++ } ++ after_branch_point = ++ (ds_prev->ds_phys->ds_next_snap_obj != obj); ++ ++ dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); ++ if (after_branch_point && ++ ds_prev->ds_phys->ds_next_clones_obj != 0) { ++ remove_from_next_clones(ds_prev, obj, tx); ++ if (ds->ds_phys->ds_next_snap_obj != 0) { ++ VERIFY(0 == zap_add_int(mos, ++ ds_prev->ds_phys->ds_next_clones_obj, ++ ds->ds_phys->ds_next_snap_obj, tx)); ++ } ++ } ++ if (after_branch_point && ++ ds->ds_phys->ds_next_snap_obj == 0) { ++ /* This clone is toast. */ ++ ASSERT(ds_prev->ds_phys->ds_num_children > 1); ++ ds_prev->ds_phys->ds_num_children--; ++ ++ /* ++ * If the clone's origin has no other clones, no ++ * user holds, and has been marked for deferred ++ * deletion, then we should have done the necessary ++ * destroy setup for it. ++ */ ++ if (ds_prev->ds_phys->ds_num_children == 1 && ++ ds_prev->ds_userrefs == 0 && ++ DS_IS_DEFER_DESTROY(ds_prev)) { ++ ASSERT3P(dsda->rm_origin, !=, NULL); ++ } else { ++ ASSERT3P(dsda->rm_origin, ==, NULL); ++ } ++ } else if (!after_branch_point) { ++ ds_prev->ds_phys->ds_next_snap_obj = ++ ds->ds_phys->ds_next_snap_obj; ++ } ++ } ++ ++ if (dsl_dataset_is_snapshot(ds)) { ++ dsl_dataset_t *ds_next; ++ uint64_t old_unique; ++ uint64_t used = 0, comp = 0, uncomp = 0; ++ ++ VERIFY(0 == dsl_dataset_hold_obj(dp, ++ ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); ++ ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); ++ ++ old_unique = ds_next->ds_phys->ds_unique_bytes; ++ ++ dmu_buf_will_dirty(ds_next->ds_dbuf, tx); ++ ds_next->ds_phys->ds_prev_snap_obj = ++ ds->ds_phys->ds_prev_snap_obj; ++ ds_next->ds_phys->ds_prev_snap_txg = ++ ds->ds_phys->ds_prev_snap_txg; ++ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, ++ ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); ++ ++ ++ if (ds_next->ds_deadlist.dl_oldfmt) { ++ process_old_deadlist(ds, ds_prev, ds_next, ++ after_branch_point, tx); ++ } else { ++ /* Adjust prev's unique space. */ ++ if (ds_prev && !after_branch_point) { ++ dsl_deadlist_space_range(&ds_next->ds_deadlist, ++ ds_prev->ds_phys->ds_prev_snap_txg, ++ ds->ds_phys->ds_prev_snap_txg, ++ &used, &comp, &uncomp); ++ ds_prev->ds_phys->ds_unique_bytes += used; ++ } ++ ++ /* Adjust snapused. */ ++ dsl_deadlist_space_range(&ds_next->ds_deadlist, ++ ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, ++ &used, &comp, &uncomp); ++ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, ++ -used, -comp, -uncomp, tx); ++ ++ /* Move blocks to be freed to pool's free list. 
*/ ++ dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, ++ &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, ++ tx); ++ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, ++ DD_USED_HEAD, used, comp, uncomp, tx); ++ dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx); ++ ++ /* Merge our deadlist into next's and free it. */ ++ dsl_deadlist_merge(&ds_next->ds_deadlist, ++ ds->ds_phys->ds_deadlist_obj, tx); ++ } ++ dsl_deadlist_close(&ds->ds_deadlist); ++ dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); ++ ++ /* Collapse range in clone heads */ ++ dsl_dataset_remove_clones_key(ds, ++ ds->ds_phys->ds_creation_txg, tx); ++ ++ if (dsl_dataset_is_snapshot(ds_next)) { ++ dsl_dataset_t *ds_nextnext; ++ dsl_dataset_t *hds; ++ ++ /* ++ * Update next's unique to include blocks which ++ * were previously shared by only this snapshot ++ * and it. Those blocks will be born after the ++ * prev snap and before this snap, and will have ++ * died after the next snap and before the one ++ * after that (ie. be on the snap after next's ++ * deadlist). ++ */ ++ VERIFY(0 == dsl_dataset_hold_obj(dp, ++ ds_next->ds_phys->ds_next_snap_obj, ++ FTAG, &ds_nextnext)); ++ dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, ++ ds->ds_phys->ds_prev_snap_txg, ++ ds->ds_phys->ds_creation_txg, ++ &used, &comp, &uncomp); ++ ds_next->ds_phys->ds_unique_bytes += used; ++ dsl_dataset_rele(ds_nextnext, FTAG); ++ ASSERT3P(ds_next->ds_prev, ==, NULL); ++ ++ /* Collapse range in this head. */ ++ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, ++ ds->ds_dir->dd_phys->dd_head_dataset_obj, ++ FTAG, &hds)); ++ dsl_deadlist_remove_key(&hds->ds_deadlist, ++ ds->ds_phys->ds_creation_txg, tx); ++ dsl_dataset_rele(hds, FTAG); ++ ++ } else { ++ ASSERT3P(ds_next->ds_prev, ==, ds); ++ dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); ++ ds_next->ds_prev = NULL; ++ if (ds_prev) { ++ VERIFY(0 == dsl_dataset_get_ref(dp, ++ ds->ds_phys->ds_prev_snap_obj, ++ ds_next, &ds_next->ds_prev)); ++ } ++ ++ dsl_dataset_recalc_head_uniq(ds_next); ++ ++ /* ++ * Reduce the amount of our unconsmed refreservation ++ * being charged to our parent by the amount of ++ * new unique data we have gained. ++ */ ++ if (old_unique < ds_next->ds_reserved) { ++ int64_t mrsdelta; ++ uint64_t new_unique = ++ ds_next->ds_phys->ds_unique_bytes; ++ ++ ASSERT(old_unique <= new_unique); ++ mrsdelta = MIN(new_unique - old_unique, ++ ds_next->ds_reserved - old_unique); ++ dsl_dir_diduse_space(ds->ds_dir, ++ DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); ++ } ++ } ++ dsl_dataset_rele(ds_next, FTAG); ++ } else { ++ /* ++ * There's no next snapshot, so this is a head dataset. ++ * Destroy the deadlist. Unless it's a clone, the ++ * deadlist should be empty. (If it's a clone, it's ++ * safe to ignore the deadlist contents.) ++ */ ++ struct killarg ka; ++ ++ dsl_deadlist_close(&ds->ds_deadlist); ++ dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); ++ ds->ds_phys->ds_deadlist_obj = 0; ++ ++ /* ++ * Free everything that we point to (that's born after ++ * the previous snapshot, if we are a clone) ++ * ++ * NB: this should be very quick, because we already ++ * freed all the objects in open context. 
++ */ ++ ka.ds = ds; ++ ka.tx = tx; ++ err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, ++ TRAVERSE_POST, kill_blkptr, &ka); ++ ASSERT3U(err, ==, 0); ++ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ++ ds->ds_phys->ds_unique_bytes == 0); ++ ++ if (ds->ds_prev != NULL) { ++ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { ++ VERIFY3U(0, ==, zap_remove_int(mos, ++ ds->ds_prev->ds_dir->dd_phys->dd_clones, ++ ds->ds_object, tx)); ++ } ++ dsl_dataset_rele(ds->ds_prev, ds); ++ ds->ds_prev = ds_prev = NULL; ++ } ++ } ++ ++ /* ++ * This must be done after the dsl_traverse(), because it will ++ * re-open the objset. ++ */ ++ if (ds->ds_objset) { ++ dmu_objset_evict(ds->ds_objset); ++ ds->ds_objset = NULL; ++ } ++ ++ if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { ++ /* Erase the link in the dir */ ++ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); ++ ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; ++ ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); ++ err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); ++ ASSERT(err == 0); ++ } else { ++ /* remove from snapshot namespace */ ++ dsl_dataset_t *ds_head; ++ ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); ++ VERIFY(0 == dsl_dataset_hold_obj(dp, ++ ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); ++ VERIFY(0 == dsl_dataset_get_snapname(ds)); ++#ifdef ZFS_DEBUG ++ { ++ uint64_t val; ++ ++ err = dsl_dataset_snap_lookup(ds_head, ++ ds->ds_snapname, &val); ++ ASSERT3U(err, ==, 0); ++ ASSERT3U(val, ==, obj); ++ } ++#endif ++ err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); ++ ASSERT(err == 0); ++ dsl_dataset_rele(ds_head, FTAG); ++ } ++ ++ if (ds_prev && ds->ds_prev != ds_prev) ++ dsl_dataset_rele(ds_prev, FTAG); ++ ++ spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); ++ spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx, ++ "dataset = %llu", ds->ds_object); ++ ++ if (ds->ds_phys->ds_next_clones_obj != 0) { ++ ASSERTV(uint64_t count); ++ ASSERT(0 == zap_count(mos, ++ ds->ds_phys->ds_next_clones_obj, &count) && count == 0); ++ VERIFY(0 == dmu_object_free(mos, ++ ds->ds_phys->ds_next_clones_obj, tx)); ++ } ++ if (ds->ds_phys->ds_props_obj != 0) ++ VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); ++ if (ds->ds_phys->ds_userrefs_obj != 0) ++ VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); ++ dsl_dir_close(ds->ds_dir, ds); ++ ds->ds_dir = NULL; ++ dsl_dataset_drain_refs(ds, tag); ++ VERIFY(0 == dmu_object_free(mos, obj, tx)); ++ ++ if (dsda->rm_origin) { ++ /* ++ * Remove the origin of the clone we just destroyed. ++ */ ++ struct dsl_ds_destroyarg ndsda = {0}; ++ ++ ndsda.ds = dsda->rm_origin; ++ dsl_dataset_destroy_sync(&ndsda, tag, tx); ++ } ++} ++ ++static int ++dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) ++{ ++ uint64_t asize; ++ ++ if (!dmu_tx_is_syncing(tx)) ++ return (0); ++ ++ /* ++ * If there's an fs-only reservation, any blocks that might become ++ * owned by the snapshot dataset must be accommodated by space ++ * outside of the reservation. ++ */ ++ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); ++ asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); ++ if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) ++ return (ENOSPC); ++ ++ /* ++ * Propogate any reserved space for this snapshot to other ++ * snapshot checks in this sync group. 
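++	 * dsl_dir_willuse_space() records the charge so that later space
++	 * checks in this txg account for it.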
++ */ ++ if (asize > 0) ++ dsl_dir_willuse_space(ds->ds_dir, asize, tx); ++ ++ return (0); ++} ++ ++int ++dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ const char *snapname = arg2; ++ int err; ++ uint64_t value; ++ ++ /* ++ * We don't allow multiple snapshots of the same txg. If there ++ * is already one, try again. ++ */ ++ if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) ++ return (EAGAIN); ++ ++ /* ++ * Check for conflicting name snapshot name. ++ */ ++ err = dsl_dataset_snap_lookup(ds, snapname, &value); ++ if (err == 0) ++ return (EEXIST); ++ if (err != ENOENT) ++ return (err); ++ ++ /* ++ * Check that the dataset's name is not too long. Name consists ++ * of the dataset's length + 1 for the @-sign + snapshot name's length ++ */ ++ if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) ++ return (ENAMETOOLONG); ++ ++ err = dsl_dataset_snapshot_reserve_space(ds, tx); ++ if (err) ++ return (err); ++ ++ ds->ds_trysnap_txg = tx->tx_txg; ++ return (0); ++} ++ ++void ++dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ const char *snapname = arg2; ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ dmu_buf_t *dbuf; ++ dsl_dataset_phys_t *dsphys; ++ uint64_t dsobj, crtxg; ++ objset_t *mos = dp->dp_meta_objset; ++ int err; ++ ++ ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); ++ ++ /* ++ * The origin's ds_creation_txg has to be < TXG_INITIAL ++ */ ++ if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) ++ crtxg = 1; ++ else ++ crtxg = tx->tx_txg; ++ ++ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, ++ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); ++ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); ++ dmu_buf_will_dirty(dbuf, tx); ++ dsphys = dbuf->db_data; ++ bzero(dsphys, sizeof (dsl_dataset_phys_t)); ++ dsphys->ds_dir_obj = ds->ds_dir->dd_object; ++ dsphys->ds_fsid_guid = unique_create(); ++ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, ++ sizeof (dsphys->ds_guid)); ++ dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; ++ dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; ++ dsphys->ds_next_snap_obj = ds->ds_object; ++ dsphys->ds_num_children = 1; ++ dsphys->ds_creation_time = gethrestime_sec(); ++ dsphys->ds_creation_txg = crtxg; ++ dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; ++ dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; ++ dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; ++ dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; ++ dsphys->ds_flags = ds->ds_phys->ds_flags; ++ dsphys->ds_bp = ds->ds_phys->ds_bp; ++ dmu_buf_rele(dbuf, FTAG); ++ ++ ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); ++ if (ds->ds_prev) { ++ uint64_t next_clones_obj = ++ ds->ds_prev->ds_phys->ds_next_clones_obj; ++ ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == ++ ds->ds_object || ++ ds->ds_prev->ds_phys->ds_num_children > 1); ++ if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { ++ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); ++ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, ++ ds->ds_prev->ds_phys->ds_creation_txg); ++ ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; ++ } else if (next_clones_obj != 0) { ++ remove_from_next_clones(ds->ds_prev, ++ dsphys->ds_next_snap_obj, tx); ++ VERIFY3U(0, ==, zap_add_int(mos, ++ next_clones_obj, dsobj, tx)); ++ } ++ } ++ ++ /* ++ * If we have a reference-reservation on this dataset, we will ++ * need to increase the amount of refreservation being charged ++ * 
since our unique space is going to zero. ++ */ ++ if (ds->ds_reserved) { ++ int64_t delta; ++ ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); ++ delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); ++ dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, ++ delta, 0, 0, tx); ++ } ++ ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", ++ ds->ds_dir->dd_myname, snapname, dsobj, ++ ds->ds_phys->ds_prev_snap_txg); ++ ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, ++ UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx); ++ dsl_deadlist_close(&ds->ds_deadlist); ++ dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); ++ dsl_deadlist_add_key(&ds->ds_deadlist, ++ ds->ds_phys->ds_prev_snap_txg, tx); ++ ++ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); ++ ds->ds_phys->ds_prev_snap_obj = dsobj; ++ ds->ds_phys->ds_prev_snap_txg = crtxg; ++ ds->ds_phys->ds_unique_bytes = 0; ++ if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) ++ ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; ++ ++ err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, ++ snapname, 8, 1, &dsobj, tx); ++ ASSERT(err == 0); ++ ++ if (ds->ds_prev) ++ dsl_dataset_drop_ref(ds->ds_prev, ds); ++ VERIFY(0 == dsl_dataset_get_ref(dp, ++ ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); ++ ++ dsl_scan_ds_snapshotted(ds, tx); ++ ++ dsl_dir_snap_cmtime_update(ds->ds_dir); ++ ++ spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx, ++ "dataset = %llu", dsobj); ++} ++ ++void ++dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) ++{ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ASSERT(ds->ds_objset != NULL); ++ ASSERT(ds->ds_phys->ds_next_snap_obj == 0); ++ ++ /* ++ * in case we had to change ds_fsid_guid when we opened it, ++ * sync it out now. ++ */ ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; ++ ++ dsl_dir_dirty(ds->ds_dir, tx); ++ dmu_objset_sync(ds->ds_objset, zio, tx); ++} ++ ++static void ++get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) ++{ ++ uint64_t count = 0; ++ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ nvlist_t *propval; ++ nvlist_t *val; ++ ++ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); ++ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ ++ /* ++ * There may me missing entries in ds_next_clones_obj ++ * due to a bug in a previous version of the code. ++ * Only trust it if it has the right number of entries. 
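++	 * If the count does not match, skip the property entirely rather
++	 * than report a partial clone list.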
++ */ ++ if (ds->ds_phys->ds_next_clones_obj != 0) { ++ ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, ++ &count)); ++ } ++ if (count != ds->ds_phys->ds_num_children - 1) { ++ goto fail; ++ } ++ for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ zap_cursor_advance(&zc)) { ++ dsl_dataset_t *clone; ++ char buf[ZFS_MAXNAMELEN]; ++ if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, ++ za.za_first_integer, FTAG, &clone) != 0) { ++ goto fail; ++ } ++ dsl_dir_name(clone->ds_dir, buf); ++ VERIFY(nvlist_add_boolean(val, buf) == 0); ++ dsl_dataset_rele(clone, FTAG); ++ } ++ zap_cursor_fini(&zc); ++ VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); ++ VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), ++ propval) == 0); ++fail: ++ nvlist_free(val); ++ nvlist_free(propval); ++ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); ++} ++ ++void ++dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) ++{ ++ uint64_t refd, avail, uobjs, aobjs, ratio; ++ ++ dsl_dir_stats(ds->ds_dir, nv); ++ ++ dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); ++ ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, ++ ds->ds_phys->ds_creation_time); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, ++ ds->ds_phys->ds_creation_txg); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, ++ ds->ds_quota); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, ++ ds->ds_reserved); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, ++ ds->ds_phys->ds_guid); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, ++ ds->ds_phys->ds_unique_bytes); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, ++ ds->ds_object); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, ++ ds->ds_userrefs); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, ++ DS_IS_DEFER_DESTROY(ds) ? 1 : 0); ++ ++ if (ds->ds_phys->ds_prev_snap_obj != 0) { ++ uint64_t written, comp, uncomp; ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ dsl_dataset_t *prev; ++ int err; ++ ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ err = dsl_dataset_hold_obj(dp, ++ ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); ++ rw_exit(&dp->dp_config_rwlock); ++ if (err == 0) { ++ err = dsl_dataset_space_written(prev, ds, &written, ++ &comp, &uncomp); ++ dsl_dataset_rele(prev, FTAG); ++ if (err == 0) { ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, ++ written); ++ } ++ } ++ } ++ ++ ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : ++ (ds->ds_phys->ds_uncompressed_bytes * 100 / ++ ds->ds_phys->ds_compressed_bytes); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); ++ ++ if (ds->ds_phys->ds_next_snap_obj) { ++ /* ++ * This is a snapshot; override the dd's space used with ++ * our unique space and compression ratio. 
++ */ ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, ++ ds->ds_phys->ds_unique_bytes); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); ++ ++ get_clones_stat(ds, nv); ++ } ++} ++ ++void ++dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) ++{ ++ stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; ++ stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; ++ stat->dds_guid = ds->ds_phys->ds_guid; ++ if (ds->ds_phys->ds_next_snap_obj) { ++ stat->dds_is_snapshot = B_TRUE; ++ stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; ++ } else { ++ stat->dds_is_snapshot = B_FALSE; ++ stat->dds_num_clones = 0; ++ } ++ ++ /* clone origin is really a dsl_dir thing... */ ++ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); ++ if (dsl_dir_is_clone(ds->ds_dir)) { ++ dsl_dataset_t *ods; ++ ++ VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, ++ ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); ++ dsl_dataset_name(ods, stat->dds_origin); ++ dsl_dataset_drop_ref(ods, FTAG); ++ } else { ++ stat->dds_origin[0] = '\0'; ++ } ++ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); ++} ++ ++uint64_t ++dsl_dataset_fsid_guid(dsl_dataset_t *ds) ++{ ++ return (ds->ds_fsid_guid); ++} ++ ++void ++dsl_dataset_space(dsl_dataset_t *ds, ++ uint64_t *refdbytesp, uint64_t *availbytesp, ++ uint64_t *usedobjsp, uint64_t *availobjsp) ++{ ++ *refdbytesp = ds->ds_phys->ds_used_bytes; ++ *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); ++ if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) ++ *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; ++ if (ds->ds_quota != 0) { ++ /* ++ * Adjust available bytes according to refquota ++ */ ++ if (*refdbytesp < ds->ds_quota) ++ *availbytesp = MIN(*availbytesp, ++ ds->ds_quota - *refdbytesp); ++ else ++ *availbytesp = 0; ++ } ++ *usedobjsp = ds->ds_phys->ds_bp.blk_fill; ++ *availobjsp = DN_MAX_OBJECT - *usedobjsp; ++} ++ ++boolean_t ++dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) ++{ ++ ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool); ++ ++ ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || ++ dsl_pool_sync_context(dp)); ++ if (ds->ds_prev == NULL) ++ return (B_FALSE); ++ if (ds->ds_phys->ds_bp.blk_birth > ++ ds->ds_prev->ds_phys->ds_creation_txg) { ++ objset_t *os, *os_prev; ++ /* ++ * It may be that only the ZIL differs, because it was ++ * reset in the head. Don't count that as being ++ * modified. 
++ */ ++ if (dmu_objset_from_ds(ds, &os) != 0) ++ return (B_TRUE); ++ if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) ++ return (B_TRUE); ++ return (bcmp(&os->os_phys->os_meta_dnode, ++ &os_prev->os_phys->os_meta_dnode, ++ sizeof (os->os_phys->os_meta_dnode)) != 0); ++ } ++ return (B_FALSE); ++} ++ ++/* ARGSUSED */ ++static int ++dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ char *newsnapname = arg2; ++ dsl_dir_t *dd = ds->ds_dir; ++ dsl_dataset_t *hds; ++ uint64_t val; ++ int err; ++ ++ err = dsl_dataset_hold_obj(dd->dd_pool, ++ dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); ++ if (err) ++ return (err); ++ ++ /* new name better not be in use */ ++ err = dsl_dataset_snap_lookup(hds, newsnapname, &val); ++ dsl_dataset_rele(hds, FTAG); ++ ++ if (err == 0) ++ err = EEXIST; ++ else if (err == ENOENT) ++ err = 0; ++ ++ /* dataset name + 1 for the "@" + the new snapshot name must fit */ ++ if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) ++ err = ENAMETOOLONG; ++ ++ return (err); ++} ++ ++static void ++dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ const char *newsnapname = arg2; ++ dsl_dir_t *dd = ds->ds_dir; ++ objset_t *mos = dd->dd_pool->dp_meta_objset; ++ dsl_dataset_t *hds; ++ int err; ++ ++ ASSERT(ds->ds_phys->ds_next_snap_obj != 0); ++ ++ VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, ++ dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); ++ ++ VERIFY(0 == dsl_dataset_get_snapname(ds)); ++ err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); ++ ASSERT3U(err, ==, 0); ++ mutex_enter(&ds->ds_lock); ++ (void) strcpy(ds->ds_snapname, newsnapname); ++ mutex_exit(&ds->ds_lock); ++ err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, ++ ds->ds_snapname, 8, 1, &ds->ds_object, tx); ++ ASSERT3U(err, ==, 0); ++ ++ spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, ++ "dataset = %llu", ds->ds_object); ++ dsl_dataset_rele(hds, FTAG); ++} ++ ++struct renamesnaparg { ++ dsl_sync_task_group_t *dstg; ++ char failed[MAXPATHLEN]; ++ char *oldsnap; ++ char *newsnap; ++}; ++ ++static int ++dsl_snapshot_rename_one(const char *name, void *arg) ++{ ++ struct renamesnaparg *ra = arg; ++ dsl_dataset_t *ds = NULL; ++ char *snapname; ++ int err; ++ ++ snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); ++ (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); ++ ++ /* ++ * For recursive snapshot renames the parent won't be changing ++ * so we just pass name for both the to/from argument. ++ */ ++ err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); ++ if (err != 0) { ++ strfree(snapname); ++ return (err == ENOENT ? 0 : err); ++ } ++ ++#ifdef _KERNEL ++ /* ++ * For all filesystems undergoing rename, we'll need to unmount it. ++ */ ++ (void) zfs_unmount_snap(snapname, NULL); ++#endif ++ err = dsl_dataset_hold(snapname, ra->dstg, &ds); ++ strfree(snapname); ++ if (err != 0) ++ return (err == ENOENT ? 
0 : err); ++ ++ dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, ++ dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); ++ ++ return (0); ++} ++ ++static int ++dsl_recursive_rename(char *oldname, const char *newname) ++{ ++ int err; ++ struct renamesnaparg *ra; ++ dsl_sync_task_t *dst; ++ spa_t *spa; ++ char *cp, *fsname = spa_strdup(oldname); ++ int len = strlen(oldname) + 1; ++ ++ /* truncate the snapshot name to get the fsname */ ++ cp = strchr(fsname, '@'); ++ *cp = '\0'; ++ ++ err = spa_open(fsname, &spa, FTAG); ++ if (err) { ++ kmem_free(fsname, len); ++ return (err); ++ } ++ ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); ++ ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ++ ++ ra->oldsnap = strchr(oldname, '@') + 1; ++ ra->newsnap = strchr(newname, '@') + 1; ++ *ra->failed = '\0'; ++ ++ err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, ++ DS_FIND_CHILDREN); ++ kmem_free(fsname, len); ++ ++ if (err == 0) { ++ err = dsl_sync_task_group_wait(ra->dstg); ++ } ++ ++ for (dst = list_head(&ra->dstg->dstg_tasks); dst; ++ dst = list_next(&ra->dstg->dstg_tasks, dst)) { ++ dsl_dataset_t *ds = dst->dst_arg1; ++ if (dst->dst_err) { ++ dsl_dir_name(ds->ds_dir, ra->failed); ++ (void) strlcat(ra->failed, "@", sizeof (ra->failed)); ++ (void) strlcat(ra->failed, ra->newsnap, ++ sizeof (ra->failed)); ++ } ++ dsl_dataset_rele(ds, ra->dstg); ++ } ++ ++ if (err) ++ (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); ++ ++ dsl_sync_task_group_destroy(ra->dstg); ++ kmem_free(ra, sizeof (struct renamesnaparg)); ++ spa_close(spa, FTAG); ++ return (err); ++} ++ ++static int ++dsl_valid_rename(const char *oldname, void *arg) ++{ ++ int delta = *(int *)arg; ++ ++ if (strlen(oldname) + delta >= MAXNAMELEN) ++ return (ENAMETOOLONG); ++ ++ return (0); ++} ++ ++#pragma weak dmu_objset_rename = dsl_dataset_rename ++int ++dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) ++{ ++ dsl_dir_t *dd; ++ dsl_dataset_t *ds; ++ const char *tail; ++ int err; ++ ++ err = dsl_dir_open(oldname, FTAG, &dd, &tail); ++ if (err) ++ return (err); ++ ++ if (tail == NULL) { ++ int delta = strlen(newname) - strlen(oldname); ++ ++ /* if we're growing, validate child name lengths */ ++ if (delta > 0) ++ err = dmu_objset_find(oldname, dsl_valid_rename, ++ &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); ++ ++ if (err == 0) ++ err = dsl_dir_rename(dd, newname); ++ dsl_dir_close(dd, FTAG); ++ return (err); ++ } ++ ++ if (tail[0] != '@') { ++ /* the name ended in a nonexistent component */ ++ dsl_dir_close(dd, FTAG); ++ return (ENOENT); ++ } ++ ++ dsl_dir_close(dd, FTAG); ++ ++ /* new name must be snapshot in same filesystem */ ++ tail = strchr(newname, '@'); ++ if (tail == NULL) ++ return (EINVAL); ++ tail++; ++ if (strncmp(oldname, newname, tail - newname) != 0) ++ return (EXDEV); ++ ++ if (recursive) { ++ err = dsl_recursive_rename(oldname, newname); ++ } else { ++ err = dsl_dataset_hold(oldname, FTAG, &ds); ++ if (err) ++ return (err); ++ ++ err = dsl_sync_task_do(ds->ds_dir->dd_pool, ++ dsl_dataset_snapshot_rename_check, ++ dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); ++ ++ dsl_dataset_rele(ds, FTAG); ++ } ++ ++ return (err); ++} ++ ++struct promotenode { ++ list_node_t link; ++ dsl_dataset_t *ds; ++}; ++ ++struct promotearg { ++ list_t shared_snaps, origin_snaps, clone_snaps; ++ dsl_dataset_t *origin_origin; ++ uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; ++ char *err_ds; ++}; ++ ++static int snaplist_space(list_t *l, uint64_t 
mintxg, uint64_t *spacep); ++ ++static int ++dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *hds = arg1; ++ struct promotearg *pa = arg2; ++ struct promotenode *snap = list_head(&pa->shared_snaps); ++ dsl_dataset_t *origin_ds = snap->ds; ++ int err; ++ uint64_t unused; ++ ++ /* Check that it is a real clone */ ++ if (!dsl_dir_is_clone(hds->ds_dir)) ++ return (EINVAL); ++ ++ /* Since this is so expensive, don't do the preliminary check */ ++ if (!dmu_tx_is_syncing(tx)) ++ return (0); ++ ++ if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) ++ return (EXDEV); ++ ++ /* compute origin's new unique space */ ++ snap = list_tail(&pa->clone_snaps); ++ ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); ++ dsl_deadlist_space_range(&snap->ds->ds_deadlist, ++ origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, ++ &pa->unique, &unused, &unused); ++ ++ /* ++ * Walk the snapshots that we are moving ++ * ++ * Compute space to transfer. Consider the incremental changes ++ * to used for each snapshot: ++ * (my used) = (prev's used) + (blocks born) - (blocks killed) ++ * So each snapshot gave birth to: ++ * (blocks born) = (my used) - (prev's used) + (blocks killed) ++ * So a sequence would look like: ++ * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) ++ * Which simplifies to: ++ * uN + kN + kN-1 + ... + k1 + k0 ++ * Note however, if we stop before we reach the ORIGIN we get: ++ * uN + kN + kN-1 + ... + kM - uM-1 ++ */ ++ pa->used = origin_ds->ds_phys->ds_used_bytes; ++ pa->comp = origin_ds->ds_phys->ds_compressed_bytes; ++ pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; ++ for (snap = list_head(&pa->shared_snaps); snap; ++ snap = list_next(&pa->shared_snaps, snap)) { ++ uint64_t val, dlused, dlcomp, dluncomp; ++ dsl_dataset_t *ds = snap->ds; ++ ++ /* Check that the snapshot name does not conflict */ ++ VERIFY(0 == dsl_dataset_get_snapname(ds)); ++ err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); ++ if (err == 0) { ++ err = EEXIST; ++ goto out; ++ } ++ if (err != ENOENT) ++ goto out; ++ ++ /* The very first snapshot does not have a deadlist */ ++ if (ds->ds_phys->ds_prev_snap_obj == 0) ++ continue; ++ ++ dsl_deadlist_space(&ds->ds_deadlist, ++ &dlused, &dlcomp, &dluncomp); ++ pa->used += dlused; ++ pa->comp += dlcomp; ++ pa->uncomp += dluncomp; ++ } ++ ++ /* ++ * If we are a clone of a clone then we never reached ORIGIN, ++ * so we need to subtract out the clone origin's used space. ++ */ ++ if (pa->origin_origin) { ++ pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; ++ pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; ++ pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; ++ } ++ ++ /* Check that there is enough space here */ ++ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, ++ pa->used); ++ if (err) ++ return (err); ++ ++ /* ++ * Compute the amounts of space that will be used by snapshots ++ * after the promotion (for both origin and clone). For each, ++ * it is the amount of space that will be on all of their ++ * deadlists (that was not born before their new origin). ++ */ ++ if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { ++ uint64_t space; ++ ++ /* ++ * Note, typically this will not be a clone of a clone, ++ * so dd_origin_txg will be < TXG_INITIAL, so ++ * these snaplist_space() -> dsl_deadlist_space_range() ++ * calls will be fast because they do not have to ++ * iterate over all bps. 
++ */ ++ snap = list_head(&pa->origin_snaps); ++ err = snaplist_space(&pa->shared_snaps, ++ snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); ++ if (err) ++ return (err); ++ ++ err = snaplist_space(&pa->clone_snaps, ++ snap->ds->ds_dir->dd_origin_txg, &space); ++ if (err) ++ return (err); ++ pa->cloneusedsnap += space; ++ } ++ if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { ++ err = snaplist_space(&pa->origin_snaps, ++ origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); ++ if (err) ++ return (err); ++ } ++ ++ return (0); ++out: ++ pa->err_ds = snap->ds->ds_snapname; ++ return (err); ++} ++ ++static void ++dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *hds = arg1; ++ struct promotearg *pa = arg2; ++ struct promotenode *snap = list_head(&pa->shared_snaps); ++ dsl_dataset_t *origin_ds = snap->ds; ++ dsl_dataset_t *origin_head; ++ dsl_dir_t *dd = hds->ds_dir; ++ dsl_pool_t *dp = hds->ds_dir->dd_pool; ++ dsl_dir_t *odd = NULL; ++ uint64_t oldnext_obj; ++ int64_t delta; ++ ++ ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); ++ ++ snap = list_head(&pa->origin_snaps); ++ origin_head = snap->ds; ++ ++ /* ++ * We need to explicitly open odd, since origin_ds's dd will be ++ * changing. ++ */ ++ VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, ++ NULL, FTAG, &odd)); ++ ++ /* change origin's next snap */ ++ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); ++ oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; ++ snap = list_tail(&pa->clone_snaps); ++ ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); ++ origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; ++ ++ /* change the origin's next clone */ ++ if (origin_ds->ds_phys->ds_next_clones_obj) { ++ remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); ++ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, ++ origin_ds->ds_phys->ds_next_clones_obj, ++ oldnext_obj, tx)); ++ } ++ ++ /* change origin */ ++ dmu_buf_will_dirty(dd->dd_dbuf, tx); ++ ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); ++ dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; ++ dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; ++ dmu_buf_will_dirty(odd->dd_dbuf, tx); ++ odd->dd_phys->dd_origin_obj = origin_ds->ds_object; ++ origin_head->ds_dir->dd_origin_txg = ++ origin_ds->ds_phys->ds_creation_txg; ++ ++ /* change dd_clone entries */ ++ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { ++ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, ++ odd->dd_phys->dd_clones, hds->ds_object, tx)); ++ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, ++ pa->origin_origin->ds_dir->dd_phys->dd_clones, ++ hds->ds_object, tx)); ++ ++ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, ++ pa->origin_origin->ds_dir->dd_phys->dd_clones, ++ origin_head->ds_object, tx)); ++ if (dd->dd_phys->dd_clones == 0) { ++ dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, ++ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); ++ } ++ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, ++ dd->dd_phys->dd_clones, origin_head->ds_object, tx)); ++ ++ } ++ ++ /* move snapshots to this dir */ ++ for (snap = list_head(&pa->shared_snaps); snap; ++ snap = list_next(&pa->shared_snaps, snap)) { ++ dsl_dataset_t *ds = snap->ds; ++ ++ /* unregister props as dsl_dir is changing */ ++ if (ds->ds_objset) { ++ dmu_objset_evict(ds->ds_objset); ++ ds->ds_objset = NULL; ++ } ++ /* move snap name entry */ ++ VERIFY(0 == dsl_dataset_get_snapname(ds)); ++ VERIFY(0 == 
dsl_dataset_snap_remove(origin_head, ++ ds->ds_snapname, tx)); ++ VERIFY(0 == zap_add(dp->dp_meta_objset, ++ hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, ++ 8, 1, &ds->ds_object, tx)); ++ ++ /* change containing dsl_dir */ ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); ++ ds->ds_phys->ds_dir_obj = dd->dd_object; ++ ASSERT3P(ds->ds_dir, ==, odd); ++ dsl_dir_close(ds->ds_dir, ds); ++ VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, ++ NULL, ds, &ds->ds_dir)); ++ ++ /* move any clone references */ ++ if (ds->ds_phys->ds_next_clones_obj && ++ spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ ++ for (zap_cursor_init(&zc, dp->dp_meta_objset, ++ ds->ds_phys->ds_next_clones_obj); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ zap_cursor_advance(&zc)) { ++ dsl_dataset_t *cnds; ++ uint64_t o; ++ ++ if (za.za_first_integer == oldnext_obj) { ++ /* ++ * We've already moved the ++ * origin's reference. ++ */ ++ continue; ++ } ++ ++ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, ++ za.za_first_integer, FTAG, &cnds)); ++ o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; ++ ++ VERIFY3U(zap_remove_int(dp->dp_meta_objset, ++ odd->dd_phys->dd_clones, o, tx), ==, 0); ++ VERIFY3U(zap_add_int(dp->dp_meta_objset, ++ dd->dd_phys->dd_clones, o, tx), ==, 0); ++ dsl_dataset_rele(cnds, FTAG); ++ } ++ zap_cursor_fini(&zc); ++ } ++ ++ ASSERT3U(dsl_prop_numcb(ds), ==, 0); ++ } ++ ++ /* ++ * Change space accounting. ++ * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either ++ * both be valid, or both be 0 (resulting in delta == 0). This ++ * is true for each of {clone,origin} independently. ++ */ ++ ++ delta = pa->cloneusedsnap - ++ dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; ++ ASSERT3S(delta, >=, 0); ++ ASSERT3U(pa->used, >=, delta); ++ dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); ++ dsl_dir_diduse_space(dd, DD_USED_HEAD, ++ pa->used - delta, pa->comp, pa->uncomp, tx); ++ ++ delta = pa->originusedsnap - ++ odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; ++ ASSERT3S(delta, <=, 0); ++ ASSERT3U(pa->used, >=, -delta); ++ dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); ++ dsl_dir_diduse_space(odd, DD_USED_HEAD, ++ -pa->used - delta, -pa->comp, -pa->uncomp, tx); ++ ++ origin_ds->ds_phys->ds_unique_bytes = pa->unique; ++ ++ /* log history record */ ++ spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, ++ "dataset = %llu", hds->ds_object); ++ ++ dsl_dir_close(odd, FTAG); ++} ++ ++static char *snaplist_tag = "snaplist"; ++/* ++ * Make a list of dsl_dataset_t's for the snapshots between first_obj ++ * (exclusive) and last_obj (inclusive). The list will be in reverse ++ * order (last_obj will be the list_head()). If first_obj == 0, do all ++ * snapshots back to this dataset's origin. 
++ */ ++static int ++snaplist_make(dsl_pool_t *dp, boolean_t own, ++ uint64_t first_obj, uint64_t last_obj, list_t *l) ++{ ++ uint64_t obj = last_obj; ++ ++ ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); ++ ++ list_create(l, sizeof (struct promotenode), ++ offsetof(struct promotenode, link)); ++ ++ while (obj != first_obj) { ++ dsl_dataset_t *ds; ++ struct promotenode *snap; ++ int err; ++ ++ if (own) { ++ err = dsl_dataset_own_obj(dp, obj, ++ 0, snaplist_tag, &ds); ++ if (err == 0) ++ dsl_dataset_make_exclusive(ds, snaplist_tag); ++ } else { ++ err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); ++ } ++ if (err == ENOENT) { ++ /* lost race with snapshot destroy */ ++ struct promotenode *last = list_tail(l); ++ ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); ++ obj = last->ds->ds_phys->ds_prev_snap_obj; ++ continue; ++ } else if (err) { ++ return (err); ++ } ++ ++ if (first_obj == 0) ++ first_obj = ds->ds_dir->dd_phys->dd_origin_obj; ++ ++ snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); ++ snap->ds = ds; ++ list_insert_tail(l, snap); ++ obj = ds->ds_phys->ds_prev_snap_obj; ++ } ++ ++ return (0); ++} ++ ++static int ++snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) ++{ ++ struct promotenode *snap; ++ ++ *spacep = 0; ++ for (snap = list_head(l); snap; snap = list_next(l, snap)) { ++ uint64_t used, comp, uncomp; ++ dsl_deadlist_space_range(&snap->ds->ds_deadlist, ++ mintxg, UINT64_MAX, &used, &comp, &uncomp); ++ *spacep += used; ++ } ++ return (0); ++} ++ ++static void ++snaplist_destroy(list_t *l, boolean_t own) ++{ ++ struct promotenode *snap; ++ ++ if (!l || !list_link_active(&l->list_head)) ++ return; ++ ++ while ((snap = list_tail(l)) != NULL) { ++ list_remove(l, snap); ++ if (own) ++ dsl_dataset_disown(snap->ds, snaplist_tag); ++ else ++ dsl_dataset_rele(snap->ds, snaplist_tag); ++ kmem_free(snap, sizeof (struct promotenode)); ++ } ++ list_destroy(l); ++} ++ ++/* ++ * Promote a clone. Nomenclature note: ++ * "clone" or "cds": the original clone which is being promoted ++ * "origin" or "ods": the snapshot which is originally clone's origin ++ * "origin head" or "ohds": the dataset which is the head ++ * (filesystem/volume) for the origin ++ * "origin origin": the origin of the origin's filesystem (typically ++ * NULL, indicating that the clone is not a clone of a clone). ++ */ ++int ++dsl_dataset_promote(const char *name, char *conflsnap) ++{ ++ dsl_dataset_t *ds; ++ dsl_dir_t *dd; ++ dsl_pool_t *dp; ++ dmu_object_info_t doi; ++ struct promotearg pa; ++ struct promotenode *snap; ++ int err; ++ ++ bzero(&pa, sizeof(struct promotearg)); ++ err = dsl_dataset_hold(name, FTAG, &ds); ++ if (err) ++ return (err); ++ dd = ds->ds_dir; ++ dp = dd->dd_pool; ++ ++ err = dmu_object_info(dp->dp_meta_objset, ++ ds->ds_phys->ds_snapnames_zapobj, &doi); ++ if (err) { ++ dsl_dataset_rele(ds, FTAG); ++ return (err); ++ } ++ ++ if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { ++ dsl_dataset_rele(ds, FTAG); ++ return (EINVAL); ++ } ++ ++ /* ++ * We are going to inherit all the snapshots taken before our ++ * origin (i.e., our new origin will be our parent's origin). ++ * Take ownership of them so that we can rename them into our ++ * namespace. 
++ */ ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ ++ err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, ++ &pa.shared_snaps); ++ if (err != 0) ++ goto out; ++ ++ err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); ++ if (err != 0) ++ goto out; ++ ++ snap = list_head(&pa.shared_snaps); ++ ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); ++ err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, ++ snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); ++ if (err != 0) ++ goto out; ++ ++ if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { ++ err = dsl_dataset_hold_obj(dp, ++ snap->ds->ds_dir->dd_phys->dd_origin_obj, ++ FTAG, &pa.origin_origin); ++ if (err != 0) ++ goto out; ++ } ++ ++out: ++ rw_exit(&dp->dp_config_rwlock); ++ ++ /* ++ * Add in 128x the snapnames zapobj size, since we will be moving ++ * a bunch of snapnames to the promoted ds, and dirtying their ++ * bonus buffers. ++ */ ++ if (err == 0) { ++ err = dsl_sync_task_do(dp, dsl_dataset_promote_check, ++ dsl_dataset_promote_sync, ds, &pa, ++ 2 + 2 * doi.doi_physical_blocks_512); ++ if (err && pa.err_ds && conflsnap) ++ (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); ++ } ++ ++ snaplist_destroy(&pa.shared_snaps, B_TRUE); ++ snaplist_destroy(&pa.clone_snaps, B_FALSE); ++ snaplist_destroy(&pa.origin_snaps, B_FALSE); ++ if (pa.origin_origin) ++ dsl_dataset_rele(pa.origin_origin, FTAG); ++ dsl_dataset_rele(ds, FTAG); ++ return (err); ++} ++ ++struct cloneswaparg { ++ dsl_dataset_t *cds; /* clone dataset */ ++ dsl_dataset_t *ohds; /* origin's head dataset */ ++ boolean_t force; ++ int64_t unused_refres_delta; /* change in unconsumed refreservation */ ++}; ++ ++/* ARGSUSED */ ++static int ++dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ struct cloneswaparg *csa = arg1; ++ ++ /* they should both be heads */ ++ if (dsl_dataset_is_snapshot(csa->cds) || ++ dsl_dataset_is_snapshot(csa->ohds)) ++ return (EINVAL); ++ ++ /* the branch point should be just before them */ ++ if (csa->cds->ds_prev != csa->ohds->ds_prev) ++ return (EINVAL); ++ ++ /* cds should be the clone (unless they are unrelated) */ ++ if (csa->cds->ds_prev != NULL && ++ csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && ++ csa->ohds->ds_object != ++ csa->cds->ds_prev->ds_phys->ds_next_snap_obj) ++ return (EINVAL); ++ ++ /* the clone should be a child of the origin */ ++ if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) ++ return (EINVAL); ++ ++ /* ohds shouldn't be modified unless 'force' */ ++ if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) ++ return (ETXTBSY); ++ ++ /* adjust amount of any unconsumed refreservation */ ++ csa->unused_refres_delta = ++ (int64_t)MIN(csa->ohds->ds_reserved, ++ csa->ohds->ds_phys->ds_unique_bytes) - ++ (int64_t)MIN(csa->ohds->ds_reserved, ++ csa->cds->ds_phys->ds_unique_bytes); ++ ++ if (csa->unused_refres_delta > 0 && ++ csa->unused_refres_delta > ++ dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) ++ return (ENOSPC); ++ ++ if (csa->ohds->ds_quota != 0 && ++ csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) ++ return (EDQUOT); ++ ++ return (0); ++} ++ ++/* ARGSUSED */ ++static void ++dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ struct cloneswaparg *csa = arg1; ++ dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; ++ ++ ASSERT(csa->cds->ds_reserved == 0); ++ ASSERT(csa->ohds->ds_quota == 0 || ++ csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); ++ ++ 
dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); ++ dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); ++ ++ if (csa->cds->ds_objset != NULL) { ++ dmu_objset_evict(csa->cds->ds_objset); ++ csa->cds->ds_objset = NULL; ++ } ++ ++ if (csa->ohds->ds_objset != NULL) { ++ dmu_objset_evict(csa->ohds->ds_objset); ++ csa->ohds->ds_objset = NULL; ++ } ++ ++ /* ++ * Reset origin's unique bytes, if it exists. ++ */ ++ if (csa->cds->ds_prev) { ++ dsl_dataset_t *origin = csa->cds->ds_prev; ++ uint64_t comp, uncomp; ++ ++ dmu_buf_will_dirty(origin->ds_dbuf, tx); ++ dsl_deadlist_space_range(&csa->cds->ds_deadlist, ++ origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, ++ &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); ++ } ++ ++ /* swap blkptrs */ ++ { ++ blkptr_t tmp; ++ tmp = csa->ohds->ds_phys->ds_bp; ++ csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; ++ csa->cds->ds_phys->ds_bp = tmp; ++ } ++ ++ /* set dd_*_bytes */ ++ { ++ int64_t dused, dcomp, duncomp; ++ uint64_t cdl_used, cdl_comp, cdl_uncomp; ++ uint64_t odl_used, odl_comp, odl_uncomp; ++ ++ ASSERT3U(csa->cds->ds_dir->dd_phys-> ++ dd_used_breakdown[DD_USED_SNAP], ==, 0); ++ ++ dsl_deadlist_space(&csa->cds->ds_deadlist, ++ &cdl_used, &cdl_comp, &cdl_uncomp); ++ dsl_deadlist_space(&csa->ohds->ds_deadlist, ++ &odl_used, &odl_comp, &odl_uncomp); ++ ++ dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - ++ (csa->ohds->ds_phys->ds_used_bytes + odl_used); ++ dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - ++ (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); ++ duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + ++ cdl_uncomp - ++ (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); ++ ++ dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, ++ dused, dcomp, duncomp, tx); ++ dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, ++ -dused, -dcomp, -duncomp, tx); ++ ++ /* ++ * The difference in the space used by snapshots is the ++ * difference in snapshot space due to the head's ++ * deadlist (since that's the only thing that's ++ * changing that affects the snapused). ++ */ ++ dsl_deadlist_space_range(&csa->cds->ds_deadlist, ++ csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, ++ &cdl_used, &cdl_comp, &cdl_uncomp); ++ dsl_deadlist_space_range(&csa->ohds->ds_deadlist, ++ csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, ++ &odl_used, &odl_comp, &odl_uncomp); ++ dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, ++ DD_USED_HEAD, DD_USED_SNAP, tx); ++ } ++ ++ /* swap ds_*_bytes */ ++ SWITCH64(csa->ohds->ds_phys->ds_used_bytes, ++ csa->cds->ds_phys->ds_used_bytes); ++ SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, ++ csa->cds->ds_phys->ds_compressed_bytes); ++ SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, ++ csa->cds->ds_phys->ds_uncompressed_bytes); ++ SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, ++ csa->cds->ds_phys->ds_unique_bytes); ++ ++ /* apply any parent delta for change in unconsumed refreservation */ ++ dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, ++ csa->unused_refres_delta, 0, 0, tx); ++ ++ /* ++ * Swap deadlists. 
++ */ ++ dsl_deadlist_close(&csa->cds->ds_deadlist); ++ dsl_deadlist_close(&csa->ohds->ds_deadlist); ++ SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, ++ csa->cds->ds_phys->ds_deadlist_obj); ++ dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, ++ csa->cds->ds_phys->ds_deadlist_obj); ++ dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, ++ csa->ohds->ds_phys->ds_deadlist_obj); ++ ++ dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); ++} ++ ++/* ++ * Swap 'clone' with its origin head datasets. Used at the end of "zfs ++ * recv" into an existing fs to swizzle the file system to the new ++ * version, and by "zfs rollback". Can also be used to swap two ++ * independent head datasets if neither has any snapshots. ++ */ ++int ++dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, ++ boolean_t force) ++{ ++ struct cloneswaparg csa; ++ int error; ++ ++ ASSERT(clone->ds_owner); ++ ASSERT(origin_head->ds_owner); ++retry: ++ /* ++ * Need exclusive access for the swap. If we're swapping these ++ * datasets back after an error, we already hold the locks. ++ */ ++ if (!RW_WRITE_HELD(&clone->ds_rwlock)) ++ rw_enter(&clone->ds_rwlock, RW_WRITER); ++ if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && ++ !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { ++ rw_exit(&clone->ds_rwlock); ++ rw_enter(&origin_head->ds_rwlock, RW_WRITER); ++ if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { ++ rw_exit(&origin_head->ds_rwlock); ++ goto retry; ++ } ++ } ++ csa.cds = clone; ++ csa.ohds = origin_head; ++ csa.force = force; ++ error = dsl_sync_task_do(clone->ds_dir->dd_pool, ++ dsl_dataset_clone_swap_check, ++ dsl_dataset_clone_swap_sync, &csa, NULL, 9); ++ return (error); ++} ++ ++/* ++ * Given a pool name and a dataset object number in that pool, ++ * return the name of that dataset. ++ */ ++int ++dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) ++{ ++ spa_t *spa; ++ dsl_pool_t *dp; ++ dsl_dataset_t *ds; ++ int error; ++ ++ if ((error = spa_open(pname, &spa, FTAG)) != 0) ++ return (error); ++ dp = spa_get_dsl(spa); ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { ++ dsl_dataset_name(ds, buf); ++ dsl_dataset_rele(ds, FTAG); ++ } ++ rw_exit(&dp->dp_config_rwlock); ++ spa_close(spa, FTAG); ++ ++ return (error); ++} ++ ++int ++dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, ++ uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) ++{ ++ int error = 0; ++ ++ ASSERT3S(asize, >, 0); ++ ++ /* ++ * *ref_rsrv is the portion of asize that will come from any ++ * unconsumed refreservation space. ++ */ ++ *ref_rsrv = 0; ++ ++ mutex_enter(&ds->ds_lock); ++ /* ++ * Make a space adjustment for reserved bytes. ++ */ ++ if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { ++ ASSERT3U(*used, >=, ++ ds->ds_reserved - ds->ds_phys->ds_unique_bytes); ++ *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); ++ *ref_rsrv = ++ asize - MIN(asize, parent_delta(ds, asize + inflight)); ++ } ++ ++ if (!check_quota || ds->ds_quota == 0) { ++ mutex_exit(&ds->ds_lock); ++ return (0); ++ } ++ /* ++ * If they are requesting more space, and our current estimate ++ * is over quota, they get to try again unless the actual ++ * on-disk is over quota and there are no pending changes (which ++ * may free up space for us). 
++ */ ++ if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { ++ if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) ++ error = ERESTART; ++ else ++ error = EDQUOT; ++ ++ DMU_TX_STAT_BUMP(dmu_tx_quota); ++ } ++ mutex_exit(&ds->ds_lock); ++ ++ return (error); ++} ++ ++/* ARGSUSED */ ++static int ++dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ dsl_prop_setarg_t *psa = arg2; ++ int err; ++ ++ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) ++ return (ENOTSUP); ++ ++ if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) ++ return (err); ++ ++ if (psa->psa_effective_value == 0) ++ return (0); ++ ++ if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || ++ psa->psa_effective_value < ds->ds_reserved) ++ return (ENOSPC); ++ ++ return (0); ++} ++ ++extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); ++ ++void ++dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ dsl_prop_setarg_t *psa = arg2; ++ uint64_t effective_value = psa->psa_effective_value; ++ ++ dsl_prop_set_sync(ds, psa, tx); ++ DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); ++ ++ if (ds->ds_quota != effective_value) { ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ ds->ds_quota = effective_value; ++ } ++} ++ ++int ++dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) ++{ ++ dsl_dataset_t *ds; ++ dsl_prop_setarg_t psa; ++ int err; ++ ++ dsl_prop_setarg_init_uint64(&psa, "refquota", source, "a); ++ ++ err = dsl_dataset_hold(dsname, FTAG, &ds); ++ if (err) ++ return (err); ++ ++ /* ++ * If someone removes a file, then tries to set the quota, we ++ * want to make sure the file freeing takes effect. ++ */ ++ txg_wait_open(ds->ds_dir->dd_pool, 0); ++ ++ err = dsl_sync_task_do(ds->ds_dir->dd_pool, ++ dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, ++ ds, &psa, 0); ++ ++ dsl_dataset_rele(ds, FTAG); ++ return (err); ++} ++ ++static int ++dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ dsl_prop_setarg_t *psa = arg2; ++ uint64_t effective_value; ++ uint64_t unique; ++ int err; ++ ++ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < ++ SPA_VERSION_REFRESERVATION) ++ return (ENOTSUP); ++ ++ if (dsl_dataset_is_snapshot(ds)) ++ return (EINVAL); ++ ++ if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) ++ return (err); ++ ++ effective_value = psa->psa_effective_value; ++ ++ /* ++ * If we are doing the preliminary check in open context, the ++ * space estimates may be inaccurate. 
++ */ ++ if (!dmu_tx_is_syncing(tx)) ++ return (0); ++ ++ mutex_enter(&ds->ds_lock); ++ if (!DS_UNIQUE_IS_ACCURATE(ds)) ++ dsl_dataset_recalc_head_uniq(ds); ++ unique = ds->ds_phys->ds_unique_bytes; ++ mutex_exit(&ds->ds_lock); ++ ++ if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { ++ uint64_t delta = MAX(unique, effective_value) - ++ MAX(unique, ds->ds_reserved); ++ ++ if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) ++ return (ENOSPC); ++ if (ds->ds_quota > 0 && ++ effective_value > ds->ds_quota) ++ return (ENOSPC); ++ } ++ ++ return (0); ++} ++ ++static void ++dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ dsl_prop_setarg_t *psa = arg2; ++ uint64_t effective_value = psa->psa_effective_value; ++ uint64_t unique; ++ int64_t delta; ++ ++ dsl_prop_set_sync(ds, psa, tx); ++ DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); ++ ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ ++ mutex_enter(&ds->ds_dir->dd_lock); ++ mutex_enter(&ds->ds_lock); ++ ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); ++ unique = ds->ds_phys->ds_unique_bytes; ++ delta = MAX(0, (int64_t)(effective_value - unique)) - ++ MAX(0, (int64_t)(ds->ds_reserved - unique)); ++ ds->ds_reserved = effective_value; ++ mutex_exit(&ds->ds_lock); ++ ++ dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); ++ mutex_exit(&ds->ds_dir->dd_lock); ++} ++ ++int ++dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, ++ uint64_t reservation) ++{ ++ dsl_dataset_t *ds; ++ dsl_prop_setarg_t psa; ++ int err; ++ ++ dsl_prop_setarg_init_uint64(&psa, "refreservation", source, ++ &reservation); ++ ++ err = dsl_dataset_hold(dsname, FTAG, &ds); ++ if (err) ++ return (err); ++ ++ err = dsl_sync_task_do(ds->ds_dir->dd_pool, ++ dsl_dataset_set_reservation_check, ++ dsl_dataset_set_reservation_sync, ds, &psa, 0); ++ ++ dsl_dataset_rele(ds, FTAG); ++ return (err); ++} ++ ++typedef struct zfs_hold_cleanup_arg { ++ dsl_pool_t *dp; ++ uint64_t dsobj; ++ char htag[MAXNAMELEN]; ++} zfs_hold_cleanup_arg_t; ++ ++static void ++dsl_dataset_user_release_onexit(void *arg) ++{ ++ zfs_hold_cleanup_arg_t *ca = arg; ++ ++ (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, ++ B_TRUE); ++ kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); ++} ++ ++void ++dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, ++ minor_t minor) ++{ ++ zfs_hold_cleanup_arg_t *ca; ++ ++ ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); ++ ca->dp = ds->ds_dir->dd_pool; ++ ca->dsobj = ds->ds_object; ++ (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); ++ VERIFY3U(0, ==, zfs_onexit_add_cb(minor, ++ dsl_dataset_user_release_onexit, ca, NULL)); ++} ++ ++/* ++ * If you add new checks here, you may need to add ++ * additional checks to the "temporary" case in ++ * snapshot_check() in dmu_objset.c. 
++ */ ++static int ++dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ struct dsl_ds_holdarg *ha = arg2; ++ char *htag = ha->htag; ++ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; ++ int error = 0; ++ ++ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) ++ return (ENOTSUP); ++ ++ if (!dsl_dataset_is_snapshot(ds)) ++ return (EINVAL); ++ ++ /* tags must be unique */ ++ mutex_enter(&ds->ds_lock); ++ if (ds->ds_phys->ds_userrefs_obj) { ++ error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, ++ 8, 1, tx); ++ if (error == 0) ++ error = EEXIST; ++ else if (error == ENOENT) ++ error = 0; ++ } ++ mutex_exit(&ds->ds_lock); ++ ++ if (error == 0 && ha->temphold && ++ strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) ++ error = E2BIG; ++ ++ return (error); ++} ++ ++void ++dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ struct dsl_ds_holdarg *ha = arg2; ++ char *htag = ha->htag; ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ objset_t *mos = dp->dp_meta_objset; ++ uint64_t now = gethrestime_sec(); ++ uint64_t zapobj; ++ ++ mutex_enter(&ds->ds_lock); ++ if (ds->ds_phys->ds_userrefs_obj == 0) { ++ /* ++ * This is the first user hold for this dataset. Create ++ * the userrefs zap object. ++ */ ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ zapobj = ds->ds_phys->ds_userrefs_obj = ++ zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); ++ } else { ++ zapobj = ds->ds_phys->ds_userrefs_obj; ++ } ++ ds->ds_userrefs++; ++ mutex_exit(&ds->ds_lock); ++ ++ VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); ++ ++ if (ha->temphold) { ++ VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, ++ htag, &now, tx)); ++ } ++ ++ spa_history_log_internal(LOG_DS_USER_HOLD, ++ dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag, ++ (int)ha->temphold, ds->ds_object); ++} ++ ++static int ++dsl_dataset_user_hold_one(const char *dsname, void *arg) ++{ ++ struct dsl_ds_holdarg *ha = arg; ++ dsl_dataset_t *ds; ++ int error; ++ char *name; ++ ++ /* alloc a buffer to hold dsname@snapname plus terminating NULL */ ++ name = kmem_asprintf("%s@%s", dsname, ha->snapname); ++ error = dsl_dataset_hold(name, ha->dstg, &ds); ++ strfree(name); ++ if (error == 0) { ++ ha->gotone = B_TRUE; ++ dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, ++ dsl_dataset_user_hold_sync, ds, ha, 0); ++ } else if (error == ENOENT && ha->recursive) { ++ error = 0; ++ } else { ++ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); ++ } ++ return (error); ++} ++ ++int ++dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, ++ boolean_t temphold) ++{ ++ struct dsl_ds_holdarg *ha; ++ int error; ++ ++ ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); ++ ha->htag = htag; ++ ha->temphold = temphold; ++ error = dsl_sync_task_do(ds->ds_dir->dd_pool, ++ dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, ++ ds, ha, 0); ++ kmem_free(ha, sizeof (struct dsl_ds_holdarg)); ++ ++ return (error); ++} ++ ++int ++dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, ++ boolean_t recursive, boolean_t temphold, int cleanup_fd) ++{ ++ struct dsl_ds_holdarg *ha; ++ dsl_sync_task_t *dst; ++ spa_t *spa; ++ int error; ++ minor_t minor = 0; ++ ++ if (cleanup_fd != -1) { ++ /* Currently we only support cleanup-on-exit of tempholds. 
*/ ++ if (!temphold) ++ return (EINVAL); ++ error = zfs_onexit_fd_hold(cleanup_fd, &minor); ++ if (error) ++ return (error); ++ } ++ ++ ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); ++ ++ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); ++ ++ error = spa_open(dsname, &spa, FTAG); ++ if (error) { ++ kmem_free(ha, sizeof (struct dsl_ds_holdarg)); ++ if (cleanup_fd != -1) ++ zfs_onexit_fd_rele(cleanup_fd); ++ return (error); ++ } ++ ++ ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ++ ha->htag = htag; ++ ha->snapname = snapname; ++ ha->recursive = recursive; ++ ha->temphold = temphold; ++ ++ if (recursive) { ++ error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, ++ ha, DS_FIND_CHILDREN); ++ } else { ++ error = dsl_dataset_user_hold_one(dsname, ha); ++ } ++ if (error == 0) ++ error = dsl_sync_task_group_wait(ha->dstg); ++ ++ for (dst = list_head(&ha->dstg->dstg_tasks); dst; ++ dst = list_next(&ha->dstg->dstg_tasks, dst)) { ++ dsl_dataset_t *ds = dst->dst_arg1; ++ ++ if (dst->dst_err) { ++ dsl_dataset_name(ds, ha->failed); ++ *strchr(ha->failed, '@') = '\0'; ++ } else if (error == 0 && minor != 0 && temphold) { ++ /* ++ * If this hold is to be released upon process exit, ++ * register that action now. ++ */ ++ dsl_register_onexit_hold_cleanup(ds, htag, minor); ++ } ++ dsl_dataset_rele(ds, ha->dstg); ++ } ++ ++ if (error == 0 && recursive && !ha->gotone) ++ error = ENOENT; ++ ++ if (error) ++ (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); ++ ++ dsl_sync_task_group_destroy(ha->dstg); ++ ++ kmem_free(ha, sizeof (struct dsl_ds_holdarg)); ++ spa_close(spa, FTAG); ++ if (cleanup_fd != -1) ++ zfs_onexit_fd_rele(cleanup_fd); ++ return (error); ++} ++ ++struct dsl_ds_releasearg { ++ dsl_dataset_t *ds; ++ const char *htag; ++ boolean_t own; /* do we own or just hold ds? */ ++}; ++ ++static int ++dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, ++ boolean_t *might_destroy) ++{ ++ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; ++ uint64_t zapobj; ++ uint64_t tmp; ++ int error; ++ ++ *might_destroy = B_FALSE; ++ ++ mutex_enter(&ds->ds_lock); ++ zapobj = ds->ds_phys->ds_userrefs_obj; ++ if (zapobj == 0) { ++ /* The tag can't possibly exist */ ++ mutex_exit(&ds->ds_lock); ++ return (ESRCH); ++ } ++ ++ /* Make sure the tag exists */ ++ error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); ++ if (error) { ++ mutex_exit(&ds->ds_lock); ++ if (error == ENOENT) ++ error = ESRCH; ++ return (error); ++ } ++ ++ if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && ++ DS_IS_DEFER_DESTROY(ds)) ++ *might_destroy = B_TRUE; ++ ++ mutex_exit(&ds->ds_lock); ++ return (0); ++} ++ ++static int ++dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) ++{ ++ struct dsl_ds_releasearg *ra = arg1; ++ dsl_dataset_t *ds = ra->ds; ++ boolean_t might_destroy; ++ int error; ++ ++ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) ++ return (ENOTSUP); ++ ++ error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); ++ if (error) ++ return (error); ++ ++ if (might_destroy) { ++ struct dsl_ds_destroyarg dsda = {0}; ++ ++ if (dmu_tx_is_syncing(tx)) { ++ /* ++ * If we're not prepared to remove the snapshot, ++ * we can't allow the release to happen right now. 
++ */ ++ if (!ra->own) ++ return (EBUSY); ++ } ++ dsda.ds = ds; ++ dsda.releasing = B_TRUE; ++ return (dsl_dataset_destroy_check(&dsda, tag, tx)); ++ } ++ ++ return (0); ++} ++ ++static void ++dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) ++{ ++ struct dsl_ds_releasearg *ra = arg1; ++ dsl_dataset_t *ds = ra->ds; ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ objset_t *mos = dp->dp_meta_objset; ++ uint64_t zapobj; ++ uint64_t dsobj = ds->ds_object; ++ uint64_t refs; ++ int error; ++ ++ mutex_enter(&ds->ds_lock); ++ ds->ds_userrefs--; ++ refs = ds->ds_userrefs; ++ mutex_exit(&ds->ds_lock); ++ error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); ++ VERIFY(error == 0 || error == ENOENT); ++ zapobj = ds->ds_phys->ds_userrefs_obj; ++ VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); ++ ++ spa_history_log_internal(LOG_DS_USER_RELEASE, ++ dp->dp_spa, tx, "<%s> %lld dataset = %llu", ++ ra->htag, (longlong_t)refs, dsobj); ++ ++ if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && ++ DS_IS_DEFER_DESTROY(ds)) { ++ struct dsl_ds_destroyarg dsda = {0}; ++ ++ ASSERT(ra->own); ++ dsda.ds = ds; ++ dsda.releasing = B_TRUE; ++ /* We already did the destroy_check */ ++ dsl_dataset_destroy_sync(&dsda, tag, tx); ++ } ++} ++ ++static int ++dsl_dataset_user_release_one(const char *dsname, void *arg) ++{ ++ struct dsl_ds_holdarg *ha = arg; ++ struct dsl_ds_releasearg *ra; ++ dsl_dataset_t *ds; ++ int error; ++ void *dtag = ha->dstg; ++ char *name; ++ boolean_t own = B_FALSE; ++ boolean_t might_destroy; ++ ++ /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ ++ name = kmem_asprintf("%s@%s", dsname, ha->snapname); ++ error = dsl_dataset_hold(name, dtag, &ds); ++ strfree(name); ++ if (error == ENOENT && ha->recursive) ++ return (0); ++ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); ++ if (error) ++ return (error); ++ ++ ha->gotone = B_TRUE; ++ ++ ASSERT(dsl_dataset_is_snapshot(ds)); ++ ++ error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); ++ if (error) { ++ dsl_dataset_rele(ds, dtag); ++ return (error); ++ } ++ ++ if (might_destroy) { ++#ifdef _KERNEL ++ name = kmem_asprintf("%s@%s", dsname, ha->snapname); ++ error = zfs_unmount_snap(name, NULL); ++ strfree(name); ++ if (error) { ++ dsl_dataset_rele(ds, dtag); ++ return (error); ++ } ++#endif ++ if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { ++ dsl_dataset_rele(ds, dtag); ++ return (EBUSY); ++ } else { ++ own = B_TRUE; ++ dsl_dataset_make_exclusive(ds, dtag); ++ } ++ } ++ ++ ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); ++ ra->ds = ds; ++ ra->htag = ha->htag; ++ ra->own = own; ++ dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, ++ dsl_dataset_user_release_sync, ra, dtag, 0); ++ ++ return (0); ++} ++ ++int ++dsl_dataset_user_release(char *dsname, char *snapname, char *htag, ++ boolean_t recursive) ++{ ++ struct dsl_ds_holdarg *ha; ++ dsl_sync_task_t *dst; ++ spa_t *spa; ++ int error; ++ ++top: ++ ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); ++ ++ (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); ++ ++ error = spa_open(dsname, &spa, FTAG); ++ if (error) { ++ kmem_free(ha, sizeof (struct dsl_ds_holdarg)); ++ return (error); ++ } ++ ++ ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ++ ha->htag = htag; ++ ha->snapname = snapname; ++ ha->recursive = recursive; ++ if (recursive) { ++ error = dmu_objset_find(dsname, dsl_dataset_user_release_one, ++ ha, DS_FIND_CHILDREN); ++ } else { ++ error = 
dsl_dataset_user_release_one(dsname, ha); ++ } ++ if (error == 0) ++ error = dsl_sync_task_group_wait(ha->dstg); ++ ++ for (dst = list_head(&ha->dstg->dstg_tasks); dst; ++ dst = list_next(&ha->dstg->dstg_tasks, dst)) { ++ struct dsl_ds_releasearg *ra = dst->dst_arg1; ++ dsl_dataset_t *ds = ra->ds; ++ ++ if (dst->dst_err) ++ dsl_dataset_name(ds, ha->failed); ++ ++ if (ra->own) ++ dsl_dataset_disown(ds, ha->dstg); ++ else ++ dsl_dataset_rele(ds, ha->dstg); ++ ++ kmem_free(ra, sizeof (struct dsl_ds_releasearg)); ++ } ++ ++ if (error == 0 && recursive && !ha->gotone) ++ error = ENOENT; ++ ++ if (error && error != EBUSY) ++ (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); ++ ++ dsl_sync_task_group_destroy(ha->dstg); ++ kmem_free(ha, sizeof (struct dsl_ds_holdarg)); ++ spa_close(spa, FTAG); ++ ++ /* ++ * We can get EBUSY if we were racing with deferred destroy and ++ * dsl_dataset_user_release_check() hadn't done the necessary ++ * open context setup. We can also get EBUSY if we're racing ++ * with destroy and that thread is the ds_owner. Either way ++ * the busy condition should be transient, and we should retry ++ * the release operation. ++ */ ++ if (error == EBUSY) ++ goto top; ++ ++ return (error); ++} ++ ++/* ++ * Called at spa_load time (with retry == B_FALSE) to release a stale ++ * temporary user hold. Also called by the onexit code (with retry == B_TRUE). ++ */ ++int ++dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag, ++ boolean_t retry) ++{ ++ dsl_dataset_t *ds; ++ char *snap; ++ char *name; ++ int namelen; ++ int error; ++ ++ do { ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); ++ rw_exit(&dp->dp_config_rwlock); ++ if (error) ++ return (error); ++ namelen = dsl_dataset_namelen(ds)+1; ++ name = kmem_alloc(namelen, KM_SLEEP); ++ dsl_dataset_name(ds, name); ++ dsl_dataset_rele(ds, FTAG); ++ ++ snap = strchr(name, '@'); ++ *snap = '\0'; ++ ++snap; ++ error = dsl_dataset_user_release(name, snap, htag, B_FALSE); ++ kmem_free(name, namelen); ++ ++ /* ++ * The object can't have been destroyed because we have a hold, ++ * but it might have been renamed, resulting in ENOENT. Retry ++ * if we've been requested to do so. ++ * ++ * It would be nice if we could use the dsobj all the way ++ * through and avoid ENOENT entirely. But we might need to ++ * unmount the snapshot, and there's currently no way to lookup ++ * a vfsp using a ZFS object id. ++ */ ++ } while ((error == ENOENT) && retry); ++ ++ return (error); ++} ++ ++int ++dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) ++{ ++ dsl_dataset_t *ds; ++ int err; ++ ++ err = dsl_dataset_hold(dsname, FTAG, &ds); ++ if (err) ++ return (err); ++ ++ VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); ++ if (ds->ds_phys->ds_userrefs_obj != 0) { ++ zap_attribute_t *za; ++ zap_cursor_t zc; ++ ++ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); ++ for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, ++ ds->ds_phys->ds_userrefs_obj); ++ zap_cursor_retrieve(&zc, za) == 0; ++ zap_cursor_advance(&zc)) { ++ VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, ++ za->za_first_integer)); ++ } ++ zap_cursor_fini(&zc); ++ kmem_free(za, sizeof (zap_attribute_t)); ++ } ++ dsl_dataset_rele(ds, FTAG); ++ return (0); ++} ++ ++/* ++ * Note, this function is used as the callback for dmu_objset_find(). We ++ * always return 0 so that we will continue to find and process ++ * inconsistent datasets, even if we encounter an error trying to ++ * process one of them. 
++ */ ++/* ARGSUSED */ ++int ++dsl_destroy_inconsistent(const char *dsname, void *arg) ++{ ++ dsl_dataset_t *ds; ++ ++ if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { ++ if (DS_IS_INCONSISTENT(ds)) ++ (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); ++ else ++ dsl_dataset_disown(ds, FTAG); ++ } ++ return (0); ++} ++ ++ ++/* ++ * Return (in *usedp) the amount of space written in new that is not ++ * present in oldsnap. New may be a snapshot or the head. Old must be ++ * a snapshot before new, in new's filesystem (or its origin). If not then ++ * fail and return EINVAL. ++ * ++ * The written space is calculated by considering two components: First, we ++ * ignore any freed space, and calculate the written as new's used space ++ * minus old's used space. Next, we add in the amount of space that was freed ++ * between the two snapshots, thus reducing new's used space relative to old's. ++ * Specifically, this is the space that was born before old->ds_creation_txg, ++ * and freed before new (ie. on new's deadlist or a previous deadlist). ++ * ++ * space freed [---------------------] ++ * snapshots ---O-------O--------O-------O------ ++ * oldsnap new ++ */ ++int ++dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, ++ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) ++{ ++ int err = 0; ++ uint64_t snapobj; ++ dsl_pool_t *dp = new->ds_dir->dd_pool; ++ ++ *usedp = 0; ++ *usedp += new->ds_phys->ds_used_bytes; ++ *usedp -= oldsnap->ds_phys->ds_used_bytes; ++ ++ *compp = 0; ++ *compp += new->ds_phys->ds_compressed_bytes; ++ *compp -= oldsnap->ds_phys->ds_compressed_bytes; ++ ++ *uncompp = 0; ++ *uncompp += new->ds_phys->ds_uncompressed_bytes; ++ *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes; ++ ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ snapobj = new->ds_object; ++ while (snapobj != oldsnap->ds_object) { ++ dsl_dataset_t *snap; ++ uint64_t used, comp, uncomp; ++ ++ err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); ++ if (err != 0) ++ break; ++ ++ if (snap->ds_phys->ds_prev_snap_txg == ++ oldsnap->ds_phys->ds_creation_txg) { ++ /* ++ * The blocks in the deadlist can not be born after ++ * ds_prev_snap_txg, so get the whole deadlist space, ++ * which is more efficient (especially for old-format ++ * deadlists). Unfortunately the deadlist code ++ * doesn't have enough information to make this ++ * optimization itself. ++ */ ++ dsl_deadlist_space(&snap->ds_deadlist, ++ &used, &comp, &uncomp); ++ } else { ++ dsl_deadlist_space_range(&snap->ds_deadlist, ++ 0, oldsnap->ds_phys->ds_creation_txg, ++ &used, &comp, &uncomp); ++ } ++ *usedp += used; ++ *compp += comp; ++ *uncompp += uncomp; ++ ++ /* ++ * If we get to the beginning of the chain of snapshots ++ * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap ++ * was not a snapshot of/before new. ++ */ ++ snapobj = snap->ds_phys->ds_prev_snap_obj; ++ dsl_dataset_rele(snap, FTAG); ++ if (snapobj == 0) { ++ err = EINVAL; ++ break; ++ } ++ ++ } ++ rw_exit(&dp->dp_config_rwlock); ++ return (err); ++} ++ ++/* ++ * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, ++ * lastsnap, and all snapshots in between are deleted. 
++ * ++ * blocks that would be freed [---------------------------] ++ * snapshots ---O-------O--------O-------O--------O ++ * firstsnap lastsnap ++ * ++ * This is the set of blocks that were born after the snap before firstsnap, ++ * (birth > firstsnap->prev_snap_txg) and died before the snap after the ++ * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist). ++ * We calculate this by iterating over the relevant deadlists (from the snap ++ * after lastsnap, backward to the snap after firstsnap), summing up the ++ * space on the deadlist that was born after the snap before firstsnap. ++ */ ++int ++dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, ++ dsl_dataset_t *lastsnap, ++ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) ++{ ++ int err = 0; ++ uint64_t snapobj; ++ dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; ++ ++ ASSERT(dsl_dataset_is_snapshot(firstsnap)); ++ ASSERT(dsl_dataset_is_snapshot(lastsnap)); ++ ++ /* ++ * Check that the snapshots are in the same dsl_dir, and firstsnap ++ * is before lastsnap. ++ */ ++ if (firstsnap->ds_dir != lastsnap->ds_dir || ++ firstsnap->ds_phys->ds_creation_txg > ++ lastsnap->ds_phys->ds_creation_txg) ++ return (EINVAL); ++ ++ *usedp = *compp = *uncompp = 0; ++ ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ snapobj = lastsnap->ds_phys->ds_next_snap_obj; ++ while (snapobj != firstsnap->ds_object) { ++ dsl_dataset_t *ds; ++ uint64_t used, comp, uncomp; ++ ++ err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds); ++ if (err != 0) ++ break; ++ ++ dsl_deadlist_space_range(&ds->ds_deadlist, ++ firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX, ++ &used, &comp, &uncomp); ++ *usedp += used; ++ *compp += comp; ++ *uncompp += uncomp; ++ ++ snapobj = ds->ds_phys->ds_prev_snap_obj; ++ ASSERT3U(snapobj, !=, 0); ++ dsl_dataset_rele(ds, FTAG); ++ } ++ rw_exit(&dp->dp_config_rwlock); ++ return (err); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(dmu_snapshots_destroy_nvl); ++EXPORT_SYMBOL(dsl_dataset_hold); ++EXPORT_SYMBOL(dsl_dataset_hold_obj); ++EXPORT_SYMBOL(dsl_dataset_own); ++EXPORT_SYMBOL(dsl_dataset_own_obj); ++EXPORT_SYMBOL(dsl_dataset_name); ++EXPORT_SYMBOL(dsl_dataset_rele); ++EXPORT_SYMBOL(dsl_dataset_disown); ++EXPORT_SYMBOL(dsl_dataset_drop_ref); ++EXPORT_SYMBOL(dsl_dataset_tryown); ++EXPORT_SYMBOL(dsl_dataset_make_exclusive); ++EXPORT_SYMBOL(dsl_dataset_create_sync); ++EXPORT_SYMBOL(dsl_dataset_create_sync_dd); ++EXPORT_SYMBOL(dsl_dataset_destroy); ++EXPORT_SYMBOL(dsl_dataset_destroy_check); ++EXPORT_SYMBOL(dsl_dataset_destroy_sync); ++EXPORT_SYMBOL(dsl_dataset_snapshot_check); ++EXPORT_SYMBOL(dsl_dataset_snapshot_sync); ++EXPORT_SYMBOL(dsl_dataset_rename); ++EXPORT_SYMBOL(dsl_dataset_promote); ++EXPORT_SYMBOL(dsl_dataset_clone_swap); ++EXPORT_SYMBOL(dsl_dataset_user_hold); ++EXPORT_SYMBOL(dsl_dataset_user_release); ++EXPORT_SYMBOL(dsl_dataset_user_release_tmp); ++EXPORT_SYMBOL(dsl_dataset_get_holds); ++EXPORT_SYMBOL(dsl_dataset_get_blkptr); ++EXPORT_SYMBOL(dsl_dataset_set_blkptr); ++EXPORT_SYMBOL(dsl_dataset_get_spa); ++EXPORT_SYMBOL(dsl_dataset_modified_since_lastsnap); ++EXPORT_SYMBOL(dsl_dataset_space_written); ++EXPORT_SYMBOL(dsl_dataset_space_wouldfree); ++EXPORT_SYMBOL(dsl_dataset_sync); ++EXPORT_SYMBOL(dsl_dataset_block_born); ++EXPORT_SYMBOL(dsl_dataset_block_kill); ++EXPORT_SYMBOL(dsl_dataset_block_freeable); ++EXPORT_SYMBOL(dsl_dataset_prev_snap_txg); ++EXPORT_SYMBOL(dsl_dataset_dirty); ++EXPORT_SYMBOL(dsl_dataset_stats); ++EXPORT_SYMBOL(dsl_dataset_fast_stat); 
++EXPORT_SYMBOL(dsl_dataset_space); ++EXPORT_SYMBOL(dsl_dataset_fsid_guid); ++EXPORT_SYMBOL(dsl_dsobj_to_dsname); ++EXPORT_SYMBOL(dsl_dataset_check_quota); ++EXPORT_SYMBOL(dsl_dataset_set_quota); ++EXPORT_SYMBOL(dsl_dataset_set_quota_sync); ++EXPORT_SYMBOL(dsl_dataset_set_reservation); ++EXPORT_SYMBOL(dsl_destroy_inconsistent); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dsl_deadlist.c linux-3.2.33-go/fs/zfs/zfs/dsl_deadlist.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dsl_deadlist.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dsl_deadlist.c 2012-11-16 23:25:34.347039358 +0100 +@@ -0,0 +1,500 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Deadlist concurrency: ++ * ++ * Deadlists can only be modified from the syncing thread. ++ * ++ * Except for dsl_deadlist_insert(), it can only be modified with the ++ * dp_config_rwlock held with RW_WRITER. ++ * ++ * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can ++ * be called concurrently, from open context, with the dl_config_rwlock held ++ * with RW_READER. ++ * ++ * Therefore, we only need to provide locking between dsl_deadlist_insert() and ++ * the accessors, protecting: ++ * dl_phys->dl_used,comp,uncomp ++ * and protecting the dl_tree from being loaded. ++ * The locking is provided by dl_lock. Note that locking on the bpobj_t ++ * provides its own locking, and dl_oldfmt is immutable. 
++ */ ++ ++static int ++dsl_deadlist_compare(const void *arg1, const void *arg2) ++{ ++ const dsl_deadlist_entry_t *dle1 = arg1; ++ const dsl_deadlist_entry_t *dle2 = arg2; ++ ++ if (dle1->dle_mintxg < dle2->dle_mintxg) ++ return (-1); ++ else if (dle1->dle_mintxg > dle2->dle_mintxg) ++ return (+1); ++ else ++ return (0); ++} ++ ++static void ++dsl_deadlist_load_tree(dsl_deadlist_t *dl) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ ++ ASSERT(!dl->dl_oldfmt); ++ if (dl->dl_havetree) ++ return; ++ ++ avl_create(&dl->dl_tree, dsl_deadlist_compare, ++ sizeof (dsl_deadlist_entry_t), ++ offsetof(dsl_deadlist_entry_t, dle_node)); ++ for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ zap_cursor_advance(&zc)) { ++ dsl_deadlist_entry_t *dle; ++ ++ dle = kmem_alloc(sizeof (*dle), KM_PUSHPAGE); ++ dle->dle_mintxg = strtonum(za.za_name, NULL); ++ VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, ++ za.za_first_integer)); ++ avl_add(&dl->dl_tree, dle); ++ } ++ zap_cursor_fini(&zc); ++ dl->dl_havetree = B_TRUE; ++} ++ ++void ++dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) ++{ ++ dmu_object_info_t doi; ++ ++ mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); ++ dl->dl_os = os; ++ dl->dl_object = object; ++ VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); ++ dmu_object_info_from_db(dl->dl_dbuf, &doi); ++ if (doi.doi_type == DMU_OT_BPOBJ) { ++ dmu_buf_rele(dl->dl_dbuf, dl); ++ dl->dl_dbuf = NULL; ++ dl->dl_oldfmt = B_TRUE; ++ VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); ++ return; ++ } ++ ++ dl->dl_oldfmt = B_FALSE; ++ dl->dl_phys = dl->dl_dbuf->db_data; ++ dl->dl_havetree = B_FALSE; ++} ++ ++void ++dsl_deadlist_close(dsl_deadlist_t *dl) ++{ ++ void *cookie = NULL; ++ dsl_deadlist_entry_t *dle; ++ ++ if (dl->dl_oldfmt) { ++ dl->dl_oldfmt = B_FALSE; ++ bpobj_close(&dl->dl_bpobj); ++ return; ++ } ++ ++ if (dl->dl_havetree) { ++ while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) ++ != NULL) { ++ bpobj_close(&dle->dle_bpobj); ++ kmem_free(dle, sizeof (*dle)); ++ } ++ avl_destroy(&dl->dl_tree); ++ } ++ dmu_buf_rele(dl->dl_dbuf, dl); ++ mutex_destroy(&dl->dl_lock); ++ dl->dl_dbuf = NULL; ++ dl->dl_phys = NULL; ++} ++ ++uint64_t ++dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) ++{ ++ if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) ++ return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx)); ++ return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, ++ sizeof (dsl_deadlist_phys_t), tx)); ++} ++ ++void ++dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) ++{ ++ dmu_object_info_t doi; ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ ++ VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); ++ if (doi.doi_type == DMU_OT_BPOBJ) { ++ bpobj_free(os, dlobj, tx); ++ return; ++ } ++ ++ for (zap_cursor_init(&zc, os, dlobj); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ zap_cursor_advance(&zc)) ++ bpobj_free(os, za.za_first_integer, tx); ++ zap_cursor_fini(&zc); ++ VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); ++} ++ ++void ++dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) ++{ ++ dsl_deadlist_entry_t dle_tofind; ++ dsl_deadlist_entry_t *dle; ++ avl_index_t where; ++ ++ if (dl->dl_oldfmt) { ++ bpobj_enqueue(&dl->dl_bpobj, bp, tx); ++ return; ++ } ++ ++ dsl_deadlist_load_tree(dl); ++ ++ dmu_buf_will_dirty(dl->dl_dbuf, tx); ++ mutex_enter(&dl->dl_lock); ++ dl->dl_phys->dl_used += ++ bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); ++ dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); ++ 
dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); ++ mutex_exit(&dl->dl_lock); ++ ++ dle_tofind.dle_mintxg = bp->blk_birth; ++ dle = avl_find(&dl->dl_tree, &dle_tofind, &where); ++ if (dle == NULL) ++ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); ++ else ++ dle = AVL_PREV(&dl->dl_tree, dle); ++ bpobj_enqueue(&dle->dle_bpobj, bp, tx); ++} ++ ++/* ++ * Insert new key in deadlist, which must be > all current entries. ++ * mintxg is not inclusive. ++ */ ++void ++dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) ++{ ++ uint64_t obj; ++ dsl_deadlist_entry_t *dle; ++ ++ if (dl->dl_oldfmt) ++ return; ++ ++ dsl_deadlist_load_tree(dl); ++ ++ dle = kmem_alloc(sizeof (*dle), KM_PUSHPAGE); ++ dle->dle_mintxg = mintxg; ++ obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); ++ VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); ++ avl_add(&dl->dl_tree, dle); ++ ++ VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, ++ mintxg, obj, tx)); ++} ++ ++/* ++ * Remove this key, merging its entries into the previous key. ++ */ ++void ++dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) ++{ ++ dsl_deadlist_entry_t dle_tofind; ++ dsl_deadlist_entry_t *dle, *dle_prev; ++ ++ if (dl->dl_oldfmt) ++ return; ++ ++ dsl_deadlist_load_tree(dl); ++ ++ dle_tofind.dle_mintxg = mintxg; ++ dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); ++ dle_prev = AVL_PREV(&dl->dl_tree, dle); ++ ++ bpobj_enqueue_subobj(&dle_prev->dle_bpobj, ++ dle->dle_bpobj.bpo_object, tx); ++ ++ avl_remove(&dl->dl_tree, dle); ++ bpobj_close(&dle->dle_bpobj); ++ kmem_free(dle, sizeof (*dle)); ++ ++ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); ++} ++ ++/* ++ * Walk ds's snapshots to regenerate generate ZAP & AVL. ++ */ ++static void ++dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, ++ uint64_t mrs_obj, dmu_tx_t *tx) ++{ ++ dsl_deadlist_t dl; ++ dsl_pool_t *dp = dmu_objset_pool(os); ++ ++ dsl_deadlist_open(&dl, os, dlobj); ++ if (dl.dl_oldfmt) { ++ dsl_deadlist_close(&dl); ++ return; ++ } ++ ++ while (mrs_obj != 0) { ++ dsl_dataset_t *ds; ++ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); ++ dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx); ++ mrs_obj = ds->ds_phys->ds_prev_snap_obj; ++ dsl_dataset_rele(ds, FTAG); ++ } ++ dsl_deadlist_close(&dl); ++} ++ ++uint64_t ++dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, ++ uint64_t mrs_obj, dmu_tx_t *tx) ++{ ++ dsl_deadlist_entry_t *dle; ++ uint64_t newobj; ++ ++ newobj = dsl_deadlist_alloc(dl->dl_os, tx); ++ ++ if (dl->dl_oldfmt) { ++ dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); ++ return (newobj); ++ } ++ ++ dsl_deadlist_load_tree(dl); ++ ++ for (dle = avl_first(&dl->dl_tree); dle; ++ dle = AVL_NEXT(&dl->dl_tree, dle)) { ++ uint64_t obj; ++ ++ if (dle->dle_mintxg >= maxtxg) ++ break; ++ ++ obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); ++ VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, ++ dle->dle_mintxg, obj, tx)); ++ } ++ return (newobj); ++} ++ ++void ++dsl_deadlist_space(dsl_deadlist_t *dl, ++ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) ++{ ++ if (dl->dl_oldfmt) { ++ VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, ++ usedp, compp, uncompp)); ++ return; ++ } ++ ++ mutex_enter(&dl->dl_lock); ++ *usedp = dl->dl_phys->dl_used; ++ *compp = dl->dl_phys->dl_comp; ++ *uncompp = dl->dl_phys->dl_uncomp; ++ mutex_exit(&dl->dl_lock); ++} ++ ++/* ++ * return space used in the range (mintxg, maxtxg]. ++ * Includes maxtxg, does not include mintxg. 
++ * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is ++ * larger than any bp in the deadlist (eg. UINT64_MAX)). ++ */ ++void ++dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, ++ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) ++{ ++ dsl_deadlist_entry_t *dle; ++ dsl_deadlist_entry_t dle_tofind; ++ avl_index_t where; ++ ++ if (dl->dl_oldfmt) { ++ VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, ++ mintxg, maxtxg, usedp, compp, uncompp)); ++ return; ++ } ++ ++ *usedp = *compp = *uncompp = 0; ++ ++ mutex_enter(&dl->dl_lock); ++ dsl_deadlist_load_tree(dl); ++ dle_tofind.dle_mintxg = mintxg; ++ dle = avl_find(&dl->dl_tree, &dle_tofind, &where); ++ /* ++ * If we don't find this mintxg, there shouldn't be anything ++ * after it either. ++ */ ++ ASSERT(dle != NULL || ++ avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); ++ ++ for (; dle && dle->dle_mintxg < maxtxg; ++ dle = AVL_NEXT(&dl->dl_tree, dle)) { ++ uint64_t used, comp, uncomp; ++ ++ VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, ++ &used, &comp, &uncomp)); ++ ++ *usedp += used; ++ *compp += comp; ++ *uncompp += uncomp; ++ } ++ mutex_exit(&dl->dl_lock); ++} ++ ++static void ++dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, ++ dmu_tx_t *tx) ++{ ++ dsl_deadlist_entry_t dle_tofind; ++ dsl_deadlist_entry_t *dle; ++ avl_index_t where; ++ uint64_t used, comp, uncomp; ++ bpobj_t bpo; ++ ++ VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); ++ VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); ++ bpobj_close(&bpo); ++ ++ dsl_deadlist_load_tree(dl); ++ ++ dmu_buf_will_dirty(dl->dl_dbuf, tx); ++ mutex_enter(&dl->dl_lock); ++ dl->dl_phys->dl_used += used; ++ dl->dl_phys->dl_comp += comp; ++ dl->dl_phys->dl_uncomp += uncomp; ++ mutex_exit(&dl->dl_lock); ++ ++ dle_tofind.dle_mintxg = birth; ++ dle = avl_find(&dl->dl_tree, &dle_tofind, &where); ++ if (dle == NULL) ++ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); ++ bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); ++} ++ ++static int ++dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) ++{ ++ dsl_deadlist_t *dl = arg; ++ dsl_deadlist_insert(dl, bp, tx); ++ return (0); ++} ++ ++/* ++ * Merge the deadlist pointed to by 'obj' into dl. obj will be left as ++ * an empty deadlist. ++ */ ++void ++dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ dmu_buf_t *bonus; ++ dsl_deadlist_phys_t *dlp; ++ dmu_object_info_t doi; ++ ++ VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); ++ if (doi.doi_type == DMU_OT_BPOBJ) { ++ bpobj_t bpo; ++ VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); ++ VERIFY3U(0, ==, bpobj_iterate(&bpo, ++ dsl_deadlist_insert_cb, dl, tx)); ++ bpobj_close(&bpo); ++ return; ++ } ++ ++ for (zap_cursor_init(&zc, dl->dl_os, obj); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ zap_cursor_advance(&zc)) { ++ uint64_t mintxg = strtonum(za.za_name, NULL); ++ dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); ++ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); ++ } ++ zap_cursor_fini(&zc); ++ ++ VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); ++ dlp = bonus->db_data; ++ dmu_buf_will_dirty(bonus, tx); ++ bzero(dlp, sizeof (*dlp)); ++ dmu_buf_rele(bonus, FTAG); ++} ++ ++/* ++ * Remove entries on dl that are >= mintxg, and put them on the bpobj. 
++ */ ++void ++dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, ++ dmu_tx_t *tx) ++{ ++ dsl_deadlist_entry_t dle_tofind; ++ dsl_deadlist_entry_t *dle; ++ avl_index_t where; ++ ++ ASSERT(!dl->dl_oldfmt); ++ dmu_buf_will_dirty(dl->dl_dbuf, tx); ++ dsl_deadlist_load_tree(dl); ++ ++ dle_tofind.dle_mintxg = mintxg; ++ dle = avl_find(&dl->dl_tree, &dle_tofind, &where); ++ if (dle == NULL) ++ dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); ++ while (dle) { ++ uint64_t used, comp, uncomp; ++ dsl_deadlist_entry_t *dle_next; ++ ++ bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); ++ ++ VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, ++ &used, &comp, &uncomp)); ++ mutex_enter(&dl->dl_lock); ++ ASSERT3U(dl->dl_phys->dl_used, >=, used); ++ ASSERT3U(dl->dl_phys->dl_comp, >=, comp); ++ ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); ++ dl->dl_phys->dl_used -= used; ++ dl->dl_phys->dl_comp -= comp; ++ dl->dl_phys->dl_uncomp -= uncomp; ++ mutex_exit(&dl->dl_lock); ++ ++ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, ++ dle->dle_mintxg, tx)); ++ ++ dle_next = AVL_NEXT(&dl->dl_tree, dle); ++ avl_remove(&dl->dl_tree, dle); ++ bpobj_close(&dle->dle_bpobj); ++ kmem_free(dle, sizeof (*dle)); ++ dle = dle_next; ++ } ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dsl_deleg.c linux-3.2.33-go/fs/zfs/zfs/dsl_deleg.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dsl_deleg.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dsl_deleg.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,763 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ */ ++ ++/* ++ * DSL permissions are stored in a two level zap attribute ++ * mechanism. The first level identifies the "class" of ++ * entry. The class is identified by the first 2 letters of ++ * the attribute. The second letter "l" or "d" identifies whether ++ * it is a local or descendent permission. The first letter ++ * identifies the type of entry. ++ * ++ * ul$ identifies permissions granted locally for this userid. ++ * ud$ identifies permissions granted on descendent datasets for ++ * this userid. ++ * Ul$ identifies permission sets granted locally for this userid. ++ * Ud$ identifies permission sets granted on descendent datasets for ++ * this userid. ++ * gl$ identifies permissions granted locally for this groupid. ++ * gd$ identifies permissions granted on descendent datasets for ++ * this groupid. ++ * Gl$ identifies permission sets granted locally for this groupid. ++ * Gd$ identifies permission sets granted on descendent datasets for ++ * this groupid. 
++ * el$ identifies permissions granted locally for everyone. ++ * ed$ identifies permissions granted on descendent datasets ++ * for everyone. ++ * El$ identifies permission sets granted locally for everyone. ++ * Ed$ identifies permission sets granted to descendent datasets for ++ * everyone. ++ * c-$ identifies permission to create at dataset creation time. ++ * C-$ identifies permission sets to grant locally at dataset creation ++ * time. ++ * s-$@ permissions defined in specified set @ ++ * S-$@ Sets defined in named set @ ++ * ++ * Each of the above entities points to another zap attribute that contains one ++ * attribute for each allowed permission, such as create, destroy,... ++ * All of the "upper" case class types will specify permission set names ++ * rather than permissions. ++ * ++ * Basically it looks something like this: ++ * ul$12 -> ZAP OBJ -> permissions... ++ * ++ * The ZAP OBJ is referred to as the jump object. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "zfs_deleg.h" ++ ++/* ++ * Validate that user is allowed to delegate specified permissions. ++ * ++ * In order to delegate "create" you must have "create" ++ * and "allow". ++ */ ++int ++dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr) ++{ ++ nvpair_t *whopair = NULL; ++ int error; ++ ++ if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) ++ return (error); ++ ++ while ((whopair = nvlist_next_nvpair(nvp, whopair))) { ++ nvlist_t *perms; ++ nvpair_t *permpair = NULL; ++ ++ VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); ++ ++ while ((permpair = nvlist_next_nvpair(perms, permpair))) { ++ const char *perm = nvpair_name(permpair); ++ ++ if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0) ++ return (EPERM); ++ ++ if ((error = dsl_deleg_access(ddname, perm, cr)) != 0) ++ return (error); ++ } ++ } ++ return (0); ++} ++ ++/* ++ * Validate that user is allowed to unallow specified permissions. They ++ * must have the 'allow' permission, and even then can only unallow ++ * perms for their uid. 
++ */ ++int ++dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) ++{ ++ nvpair_t *whopair = NULL; ++ int error; ++ char idstr[32]; ++ ++ if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) ++ return (error); ++ ++ (void) snprintf(idstr, sizeof (idstr), "%lld", ++ (longlong_t)crgetuid(cr)); ++ ++ while ((whopair = nvlist_next_nvpair(nvp, whopair))) { ++ zfs_deleg_who_type_t type = nvpair_name(whopair)[0]; ++ ++ if (type != ZFS_DELEG_USER && ++ type != ZFS_DELEG_USER_SETS) ++ return (EPERM); ++ ++ if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0) ++ return (EPERM); ++ } ++ return (0); ++} ++ ++static void ++dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dir_t *dd = arg1; ++ nvlist_t *nvp = arg2; ++ objset_t *mos = dd->dd_pool->dp_meta_objset; ++ nvpair_t *whopair = NULL; ++ uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; ++ ++ if (zapobj == 0) { ++ dmu_buf_will_dirty(dd->dd_dbuf, tx); ++ zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, ++ DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); ++ } ++ ++ while ((whopair = nvlist_next_nvpair(nvp, whopair))) { ++ const char *whokey = nvpair_name(whopair); ++ nvlist_t *perms; ++ nvpair_t *permpair = NULL; ++ uint64_t jumpobj; ++ ++ VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); ++ ++ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { ++ jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, ++ DMU_OT_NONE, 0, tx); ++ VERIFY(zap_update(mos, zapobj, ++ whokey, 8, 1, &jumpobj, tx) == 0); ++ } ++ ++ while ((permpair = nvlist_next_nvpair(perms, permpair))) { ++ const char *perm = nvpair_name(permpair); ++ uint64_t n = 0; ++ ++ VERIFY(zap_update(mos, jumpobj, ++ perm, 8, 1, &n, tx) == 0); ++ spa_history_log_internal(LOG_DS_PERM_UPDATE, ++ dd->dd_pool->dp_spa, tx, ++ "%s %s dataset = %llu", whokey, perm, ++ dd->dd_phys->dd_head_dataset_obj); ++ } ++ } ++} ++ ++static void ++dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dir_t *dd = arg1; ++ nvlist_t *nvp = arg2; ++ objset_t *mos = dd->dd_pool->dp_meta_objset; ++ nvpair_t *whopair = NULL; ++ uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; ++ ++ if (zapobj == 0) ++ return; ++ ++ while ((whopair = nvlist_next_nvpair(nvp, whopair))) { ++ const char *whokey = nvpair_name(whopair); ++ nvlist_t *perms; ++ nvpair_t *permpair = NULL; ++ uint64_t jumpobj; ++ ++ if (nvpair_value_nvlist(whopair, &perms) != 0) { ++ if (zap_lookup(mos, zapobj, whokey, 8, ++ 1, &jumpobj) == 0) { ++ (void) zap_remove(mos, zapobj, whokey, tx); ++ VERIFY(0 == zap_destroy(mos, jumpobj, tx)); ++ } ++ spa_history_log_internal(LOG_DS_PERM_WHO_REMOVE, ++ dd->dd_pool->dp_spa, tx, ++ "%s dataset = %llu", whokey, ++ dd->dd_phys->dd_head_dataset_obj); ++ continue; ++ } ++ ++ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) ++ continue; ++ ++ while ((permpair = nvlist_next_nvpair(perms, permpair))) { ++ const char *perm = nvpair_name(permpair); ++ uint64_t n = 0; ++ ++ (void) zap_remove(mos, jumpobj, perm, tx); ++ if (zap_count(mos, jumpobj, &n) == 0 && n == 0) { ++ (void) zap_remove(mos, zapobj, ++ whokey, tx); ++ VERIFY(0 == zap_destroy(mos, ++ jumpobj, tx)); ++ } ++ spa_history_log_internal(LOG_DS_PERM_REMOVE, ++ dd->dd_pool->dp_spa, tx, ++ "%s %s dataset = %llu", whokey, perm, ++ dd->dd_phys->dd_head_dataset_obj); ++ } ++ } ++} ++ ++int ++dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) ++{ ++ dsl_dir_t *dd; ++ int error; ++ nvpair_t *whopair = NULL; ++ int blocks_modified = 0; ++ ++ error = dsl_dir_open(ddname, FTAG, &dd, NULL); ++ if (error) ++ 
return (error); ++ ++ if (spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) < ++ SPA_VERSION_DELEGATED_PERMS) { ++ dsl_dir_close(dd, FTAG); ++ return (ENOTSUP); ++ } ++ ++ while ((whopair = nvlist_next_nvpair(nvp, whopair))) ++ blocks_modified++; ++ ++ error = dsl_sync_task_do(dd->dd_pool, NULL, ++ unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, ++ dd, nvp, blocks_modified); ++ dsl_dir_close(dd, FTAG); ++ ++ return (error); ++} ++ ++/* ++ * Find all 'allow' permissions from a given point and then continue ++ * traversing up to the root. ++ * ++ * This function constructs an nvlist of nvlists. ++ * each setpoint is an nvlist composed of an nvlist of an nvlist ++ * of the individual * users/groups/everyone/create ++ * permissions. ++ * ++ * The nvlist will look like this. ++ * ++ * { source fsname -> { whokeys { permissions,...}, ...}} ++ * ++ * The fsname nvpairs will be arranged in a bottom up order. For example, ++ * if we have the following structure a/b/c then the nvpairs for the fsnames ++ * will be ordered a/b/c, a/b, a. ++ */ ++int ++dsl_deleg_get(const char *ddname, nvlist_t **nvp) ++{ ++ dsl_dir_t *dd, *startdd; ++ dsl_pool_t *dp; ++ int error; ++ objset_t *mos; ++ zap_cursor_t *basezc, *zc; ++ zap_attribute_t *baseza, *za; ++ char *source; ++ ++ error = dsl_dir_open(ddname, FTAG, &startdd, NULL); ++ if (error) ++ return (error); ++ ++ dp = startdd->dd_pool; ++ mos = dp->dp_meta_objset; ++ ++ zc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP); ++ za = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP); ++ basezc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP); ++ baseza = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP); ++ source = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, KM_SLEEP); ++ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ for (dd = startdd; dd != NULL; dd = dd->dd_parent) { ++ nvlist_t *sp_nvp; ++ uint64_t n; ++ ++ if (dd->dd_phys->dd_deleg_zapobj && ++ (zap_count(mos, dd->dd_phys->dd_deleg_zapobj, ++ &n) == 0) && n) { ++ VERIFY(nvlist_alloc(&sp_nvp, ++ NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ } else { ++ continue; ++ } ++ ++ for (zap_cursor_init(basezc, mos, ++ dd->dd_phys->dd_deleg_zapobj); ++ zap_cursor_retrieve(basezc, baseza) == 0; ++ zap_cursor_advance(basezc)) { ++ nvlist_t *perms_nvp; ++ ++ ASSERT(baseza->za_integer_length == 8); ++ ASSERT(baseza->za_num_integers == 1); ++ ++ VERIFY(nvlist_alloc(&perms_nvp, ++ NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ for (zap_cursor_init(zc, mos, baseza->za_first_integer); ++ zap_cursor_retrieve(zc, za) == 0; ++ zap_cursor_advance(zc)) { ++ VERIFY(nvlist_add_boolean(perms_nvp, ++ za->za_name) == 0); ++ } ++ zap_cursor_fini(zc); ++ VERIFY(nvlist_add_nvlist(sp_nvp, baseza->za_name, ++ perms_nvp) == 0); ++ nvlist_free(perms_nvp); ++ } ++ ++ zap_cursor_fini(basezc); ++ ++ dsl_dir_name(dd, source); ++ VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0); ++ nvlist_free(sp_nvp); ++ } ++ rw_exit(&dp->dp_config_rwlock); ++ ++ kmem_free(source, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); ++ kmem_free(baseza, sizeof(zap_attribute_t)); ++ kmem_free(basezc, sizeof(zap_cursor_t)); ++ kmem_free(za, sizeof(zap_attribute_t)); ++ kmem_free(zc, sizeof(zap_cursor_t)); ++ ++ dsl_dir_close(startdd, FTAG); ++ return (0); ++} ++ ++/* ++ * Routines for dsl_deleg_access() -- access checking. 
++ */ ++typedef struct perm_set { ++ avl_node_t p_node; ++ boolean_t p_matched; ++ char p_setname[ZFS_MAX_DELEG_NAME]; ++} perm_set_t; ++ ++static int ++perm_set_compare(const void *arg1, const void *arg2) ++{ ++ const perm_set_t *node1 = arg1; ++ const perm_set_t *node2 = arg2; ++ int val; ++ ++ val = strcmp(node1->p_setname, node2->p_setname); ++ if (val == 0) ++ return (0); ++ return (val > 0 ? 1 : -1); ++} ++ ++/* ++ * Determine whether a specified permission exists. ++ * ++ * First the base attribute has to be retrieved. i.e. ul$12 ++ * Once the base object has been retrieved the actual permission ++ * is lookup up in the zap object the base object points to. ++ * ++ * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if ++ * there is no perm in that jumpobj. ++ */ ++static int ++dsl_check_access(objset_t *mos, uint64_t zapobj, ++ char type, char checkflag, void *valp, const char *perm) ++{ ++ int error; ++ uint64_t jumpobj, zero; ++ char whokey[ZFS_MAX_DELEG_NAME]; ++ ++ zfs_deleg_whokey(whokey, type, checkflag, valp); ++ error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); ++ if (error == 0) { ++ error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero); ++ if (error == ENOENT) ++ error = EPERM; ++ } ++ return (error); ++} ++ ++/* ++ * check a specified user/group for a requested permission ++ */ ++static int ++dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm, ++ int checkflag, cred_t *cr) ++{ ++ const gid_t *gids; ++ int ngids; ++ int i; ++ uint64_t id; ++ ++ /* check for user */ ++ id = crgetuid(cr); ++ if (dsl_check_access(mos, zapobj, ++ ZFS_DELEG_USER, checkflag, &id, perm) == 0) ++ return (0); ++ ++ /* check for users primary group */ ++ id = crgetgid(cr); ++ if (dsl_check_access(mos, zapobj, ++ ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) ++ return (0); ++ ++ /* check for everyone entry */ ++ id = -1; ++ if (dsl_check_access(mos, zapobj, ++ ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0) ++ return (0); ++ ++ /* check each supplemental group user is a member of */ ++ ngids = crgetngroups(cr); ++ gids = crgetgroups(cr); ++ for (i = 0; i != ngids; i++) { ++ id = gids[i]; ++ if (dsl_check_access(mos, zapobj, ++ ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) ++ return (0); ++ } ++ ++ return (EPERM); ++} ++ ++/* ++ * Iterate over the sets specified in the specified zapobj ++ * and load them into the permsets avl tree. ++ */ ++static int ++dsl_load_sets(objset_t *mos, uint64_t zapobj, ++ char type, char checkflag, void *valp, avl_tree_t *avl) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ perm_set_t *permnode; ++ avl_index_t idx; ++ uint64_t jumpobj; ++ int error; ++ char whokey[ZFS_MAX_DELEG_NAME]; ++ ++ zfs_deleg_whokey(whokey, type, checkflag, valp); ++ ++ error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); ++ if (error != 0) ++ return (error); ++ ++ for (zap_cursor_init(&zc, mos, jumpobj); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ zap_cursor_advance(&zc)) { ++ permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP); ++ (void) strlcpy(permnode->p_setname, za.za_name, ++ sizeof (permnode->p_setname)); ++ permnode->p_matched = B_FALSE; ++ ++ if (avl_find(avl, permnode, &idx) == NULL) { ++ avl_insert(avl, permnode, idx); ++ } else { ++ kmem_free(permnode, sizeof (perm_set_t)); ++ } ++ } ++ zap_cursor_fini(&zc); ++ return (0); ++} ++ ++/* ++ * Load all permissions user based on cred belongs to. 
++ */ ++static void ++dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, ++ char checkflag, cred_t *cr) ++{ ++ const gid_t *gids; ++ int ngids, i; ++ uint64_t id; ++ ++ id = crgetuid(cr); ++ (void) dsl_load_sets(mos, zapobj, ++ ZFS_DELEG_USER_SETS, checkflag, &id, avl); ++ ++ id = crgetgid(cr); ++ (void) dsl_load_sets(mos, zapobj, ++ ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); ++ ++ (void) dsl_load_sets(mos, zapobj, ++ ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl); ++ ++ ngids = crgetngroups(cr); ++ gids = crgetgroups(cr); ++ for (i = 0; i != ngids; i++) { ++ id = gids[i]; ++ (void) dsl_load_sets(mos, zapobj, ++ ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); ++ } ++} ++ ++/* ++ * Check if user has requested permission. If descendent is set, must have ++ * descendent perms. ++ */ ++int ++dsl_deleg_access_impl(dsl_dataset_t *ds, boolean_t descendent, const char *perm, ++ cred_t *cr) ++{ ++ dsl_dir_t *dd; ++ dsl_pool_t *dp; ++ void *cookie; ++ int error; ++ char checkflag; ++ objset_t *mos; ++ avl_tree_t permsets; ++ perm_set_t *setnode; ++ ++ dp = ds->ds_dir->dd_pool; ++ mos = dp->dp_meta_objset; ++ ++ if (dsl_delegation_on(mos) == B_FALSE) ++ return (ECANCELED); ++ ++ if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < ++ SPA_VERSION_DELEGATED_PERMS) ++ return (EPERM); ++ ++ if (dsl_dataset_is_snapshot(ds) || descendent) { ++ /* ++ * Snapshots are treated as descendents only, ++ * local permissions do not apply. ++ */ ++ checkflag = ZFS_DELEG_DESCENDENT; ++ } else { ++ checkflag = ZFS_DELEG_LOCAL; ++ } ++ ++ avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), ++ offsetof(perm_set_t, p_node)); ++ ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, ++ checkflag = ZFS_DELEG_DESCENDENT) { ++ uint64_t zapobj; ++ boolean_t expanded; ++ ++ /* ++ * If not in global zone then make sure ++ * the zoned property is set ++ */ ++ if (!INGLOBALZONE(curproc)) { ++ uint64_t zoned; ++ ++ if (dsl_prop_get_dd(dd, ++ zfs_prop_to_name(ZFS_PROP_ZONED), ++ 8, 1, &zoned, NULL, B_FALSE) != 0) ++ break; ++ if (!zoned) ++ break; ++ } ++ zapobj = dd->dd_phys->dd_deleg_zapobj; ++ ++ if (zapobj == 0) ++ continue; ++ ++ dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr); ++again: ++ expanded = B_FALSE; ++ for (setnode = avl_first(&permsets); setnode; ++ setnode = AVL_NEXT(&permsets, setnode)) { ++ if (setnode->p_matched == B_TRUE) ++ continue; ++ ++ /* See if this set directly grants this permission */ ++ error = dsl_check_access(mos, zapobj, ++ ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm); ++ if (error == 0) ++ goto success; ++ if (error == EPERM) ++ setnode->p_matched = B_TRUE; ++ ++ /* See if this set includes other sets */ ++ error = dsl_load_sets(mos, zapobj, ++ ZFS_DELEG_NAMED_SET_SETS, 0, ++ setnode->p_setname, &permsets); ++ if (error == 0) ++ setnode->p_matched = expanded = B_TRUE; ++ } ++ /* ++ * If we expanded any sets, that will define more sets, ++ * which we need to check. 
++ */ ++ if (expanded) ++ goto again; ++ ++ error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr); ++ if (error == 0) ++ goto success; ++ } ++ error = EPERM; ++success: ++ rw_exit(&dp->dp_config_rwlock); ++ ++ cookie = NULL; ++ while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) ++ kmem_free(setnode, sizeof (perm_set_t)); ++ ++ return (error); ++} ++ ++int ++dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) ++{ ++ dsl_dataset_t *ds; ++ int error; ++ ++ error = dsl_dataset_hold(dsname, FTAG, &ds); ++ if (error) ++ return (error); ++ ++ error = dsl_deleg_access_impl(ds, B_FALSE, perm, cr); ++ dsl_dataset_rele(ds, FTAG); ++ ++ return (error); ++} ++ ++/* ++ * Other routines. ++ */ ++ ++static void ++copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, ++ boolean_t dosets, uint64_t uid, dmu_tx_t *tx) ++{ ++ objset_t *mos = dd->dd_pool->dp_meta_objset; ++ uint64_t jumpobj, pjumpobj; ++ uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ char whokey[ZFS_MAX_DELEG_NAME]; ++ ++ zfs_deleg_whokey(whokey, ++ dosets ? ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE, ++ ZFS_DELEG_LOCAL, NULL); ++ if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0) ++ return; ++ ++ if (zapobj == 0) { ++ dmu_buf_will_dirty(dd->dd_dbuf, tx); ++ zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, ++ DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); ++ } ++ ++ zfs_deleg_whokey(whokey, ++ dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER, ++ ZFS_DELEG_LOCAL, &uid); ++ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) { ++ jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); ++ VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0); ++ } ++ ++ for (zap_cursor_init(&zc, mos, pjumpobj); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ zap_cursor_advance(&zc)) { ++ uint64_t zero = 0; ++ ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); ++ ++ VERIFY(zap_update(mos, jumpobj, za.za_name, ++ 8, 1, &zero, tx) == 0); ++ } ++ zap_cursor_fini(&zc); ++} ++ ++/* ++ * set all create time permission on new dataset. 
++ */ ++void ++dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr) ++{ ++ dsl_dir_t *dd; ++ uint64_t uid = crgetuid(cr); ++ ++ if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) < ++ SPA_VERSION_DELEGATED_PERMS) ++ return; ++ ++ for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) { ++ uint64_t pzapobj = dd->dd_phys->dd_deleg_zapobj; ++ ++ if (pzapobj == 0) ++ continue; ++ ++ copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx); ++ copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx); ++ } ++} ++ ++int ++dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ ++ if (zapobj == 0) ++ return (0); ++ ++ for (zap_cursor_init(&zc, mos, zapobj); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ zap_cursor_advance(&zc)) { ++ ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); ++ VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx)); ++ } ++ zap_cursor_fini(&zc); ++ VERIFY(0 == zap_destroy(mos, zapobj, tx)); ++ return (0); ++} ++ ++boolean_t ++dsl_delegation_on(objset_t *os) ++{ ++ return (!!spa_delegation(os->os_spa)); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(dsl_deleg_get); ++EXPORT_SYMBOL(dsl_deleg_set); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dsl_dir.c linux-3.2.33-go/fs/zfs/zfs/dsl_dir.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dsl_dir.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dsl_dir.c 2012-11-16 23:25:34.347039358 +0100 +@@ -0,0 +1,1422 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "zfs_namecheck.h" ++ ++static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); ++static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx); ++ ++ ++/* ARGSUSED */ ++static void ++dsl_dir_evict(dmu_buf_t *db, void *arg) ++{ ++ dsl_dir_t *dd = arg; ++ ASSERTV(dsl_pool_t *dp = dd->dd_pool;) ++ int t; ++ ++ for (t = 0; t < TXG_SIZE; t++) { ++ ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); ++ ASSERT(dd->dd_tempreserved[t] == 0); ++ ASSERT(dd->dd_space_towrite[t] == 0); ++ } ++ ++ if (dd->dd_parent) ++ dsl_dir_close(dd->dd_parent, dd); ++ ++ spa_close(dd->dd_pool->dp_spa, dd); ++ ++ /* ++ * The props callback list should have been cleaned up by ++ * objset_evict(). 
++ */ ++ list_destroy(&dd->dd_prop_cbs); ++ mutex_destroy(&dd->dd_lock); ++ kmem_free(dd, sizeof (dsl_dir_t)); ++} ++ ++int ++dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, ++ const char *tail, void *tag, dsl_dir_t **ddp) ++{ ++ dmu_buf_t *dbuf; ++ dsl_dir_t *dd; ++ int err; ++ ++ ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || ++ dsl_pool_sync_context(dp)); ++ ++ err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); ++ if (err) ++ return (err); ++ dd = dmu_buf_get_user(dbuf); ++#ifdef ZFS_DEBUG ++ { ++ dmu_object_info_t doi; ++ dmu_object_info_from_db(dbuf, &doi); ++ ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR); ++ ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); ++ } ++#endif ++ if (dd == NULL) { ++ dsl_dir_t *winner; ++ ++ dd = kmem_zalloc(sizeof (dsl_dir_t), KM_PUSHPAGE); ++ dd->dd_object = ddobj; ++ dd->dd_dbuf = dbuf; ++ dd->dd_pool = dp; ++ dd->dd_phys = dbuf->db_data; ++ mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), ++ offsetof(dsl_prop_cb_record_t, cbr_node)); ++ ++ dsl_dir_snap_cmtime_update(dd); ++ ++ if (dd->dd_phys->dd_parent_obj) { ++ err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, ++ NULL, dd, &dd->dd_parent); ++ if (err) ++ goto errout; ++ if (tail) { ++#ifdef ZFS_DEBUG ++ uint64_t foundobj; ++ ++ err = zap_lookup(dp->dp_meta_objset, ++ dd->dd_parent->dd_phys->dd_child_dir_zapobj, ++ tail, sizeof (foundobj), 1, &foundobj); ++ ASSERT(err || foundobj == ddobj); ++#endif ++ (void) strcpy(dd->dd_myname, tail); ++ } else { ++ err = zap_value_search(dp->dp_meta_objset, ++ dd->dd_parent->dd_phys->dd_child_dir_zapobj, ++ ddobj, 0, dd->dd_myname); ++ } ++ if (err) ++ goto errout; ++ } else { ++ (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); ++ } ++ ++ if (dsl_dir_is_clone(dd)) { ++ dmu_buf_t *origin_bonus; ++ dsl_dataset_phys_t *origin_phys; ++ ++ /* ++ * We can't open the origin dataset, because ++ * that would require opening this dsl_dir. ++ * Just look at its phys directly instead. ++ */ ++ err = dmu_bonus_hold(dp->dp_meta_objset, ++ dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); ++ if (err) ++ goto errout; ++ origin_phys = origin_bonus->db_data; ++ dd->dd_origin_txg = ++ origin_phys->ds_creation_txg; ++ dmu_buf_rele(origin_bonus, FTAG); ++ } ++ ++ winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, ++ dsl_dir_evict); ++ if (winner) { ++ if (dd->dd_parent) ++ dsl_dir_close(dd->dd_parent, dd); ++ mutex_destroy(&dd->dd_lock); ++ kmem_free(dd, sizeof (dsl_dir_t)); ++ dd = winner; ++ } else { ++ spa_open_ref(dp->dp_spa, dd); ++ } ++ } ++ ++ /* ++ * The dsl_dir_t has both open-to-close and instantiate-to-evict ++ * holds on the spa. We need the open-to-close holds because ++ * otherwise the spa_refcnt wouldn't change when we open a ++ * dir which the spa also has open, so we could incorrectly ++ * think it was OK to unload/export/destroy the pool. We need ++ * the instantiate-to-evict hold because the dsl_dir_t has a ++ * pointer to the dd_pool, which has a pointer to the spa_t. 
++ */ ++ spa_open_ref(dp->dp_spa, tag); ++ ASSERT3P(dd->dd_pool, ==, dp); ++ ASSERT3U(dd->dd_object, ==, ddobj); ++ ASSERT3P(dd->dd_dbuf, ==, dbuf); ++ *ddp = dd; ++ return (0); ++ ++errout: ++ if (dd->dd_parent) ++ dsl_dir_close(dd->dd_parent, dd); ++ mutex_destroy(&dd->dd_lock); ++ kmem_free(dd, sizeof (dsl_dir_t)); ++ dmu_buf_rele(dbuf, tag); ++ return (err); ++ ++} ++ ++void ++dsl_dir_close(dsl_dir_t *dd, void *tag) ++{ ++ dprintf_dd(dd, "%s\n", ""); ++ spa_close(dd->dd_pool->dp_spa, tag); ++ dmu_buf_rele(dd->dd_dbuf, tag); ++} ++ ++/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ ++void ++dsl_dir_name(dsl_dir_t *dd, char *buf) ++{ ++ if (dd->dd_parent) { ++ dsl_dir_name(dd->dd_parent, buf); ++ (void) strcat(buf, "/"); ++ } else { ++ buf[0] = '\0'; ++ } ++ if (!MUTEX_HELD(&dd->dd_lock)) { ++ /* ++ * recursive mutex so that we can use ++ * dprintf_dd() with dd_lock held ++ */ ++ mutex_enter(&dd->dd_lock); ++ (void) strcat(buf, dd->dd_myname); ++ mutex_exit(&dd->dd_lock); ++ } else { ++ (void) strcat(buf, dd->dd_myname); ++ } ++} ++ ++/* Calculate name legnth, avoiding all the strcat calls of dsl_dir_name */ ++int ++dsl_dir_namelen(dsl_dir_t *dd) ++{ ++ int result = 0; ++ ++ if (dd->dd_parent) { ++ /* parent's name + 1 for the "/" */ ++ result = dsl_dir_namelen(dd->dd_parent) + 1; ++ } ++ ++ if (!MUTEX_HELD(&dd->dd_lock)) { ++ /* see dsl_dir_name */ ++ mutex_enter(&dd->dd_lock); ++ result += strlen(dd->dd_myname); ++ mutex_exit(&dd->dd_lock); ++ } else { ++ result += strlen(dd->dd_myname); ++ } ++ ++ return (result); ++} ++ ++static int ++getcomponent(const char *path, char *component, const char **nextp) ++{ ++ char *p; ++ if ((path == NULL) || (path[0] == '\0')) ++ return (ENOENT); ++ /* This would be a good place to reserve some namespace... */ ++ p = strpbrk(path, "/@"); ++ if (p && (p[1] == '/' || p[1] == '@')) { ++ /* two separators in a row */ ++ return (EINVAL); ++ } ++ if (p == NULL || p == path) { ++ /* ++ * if the first thing is an @ or /, it had better be an ++ * @ and it had better not have any more ats or slashes, ++ * and it had better have something after the @. ++ */ ++ if (p != NULL && ++ (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) ++ return (EINVAL); ++ if (strlen(path) >= MAXNAMELEN) ++ return (ENAMETOOLONG); ++ (void) strcpy(component, path); ++ p = NULL; ++ } else if (p[0] == '/') { ++ if (p-path >= MAXNAMELEN) ++ return (ENAMETOOLONG); ++ (void) strncpy(component, path, p - path); ++ component[p-path] = '\0'; ++ p++; ++ } else if (p[0] == '@') { ++ /* ++ * if the next separator is an @, there better not be ++ * any more slashes. 
++ */ ++ if (strchr(path, '/')) ++ return (EINVAL); ++ if (p-path >= MAXNAMELEN) ++ return (ENAMETOOLONG); ++ (void) strncpy(component, path, p - path); ++ component[p-path] = '\0'; ++ } else { ++ ASSERT(!"invalid p"); ++ } ++ *nextp = p; ++ return (0); ++} ++ ++/* ++ * same as dsl_open_dir, ignore the first component of name and use the ++ * spa instead ++ */ ++int ++dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, ++ dsl_dir_t **ddp, const char **tailp) ++{ ++ char *buf; ++ const char *next, *nextnext = NULL; ++ int err; ++ dsl_dir_t *dd; ++ dsl_pool_t *dp; ++ uint64_t ddobj; ++ int openedspa = FALSE; ++ ++ dprintf("%s\n", name); ++ ++ buf = kmem_alloc(MAXNAMELEN, KM_SLEEP); ++ err = getcomponent(name, buf, &next); ++ if (err) ++ goto error; ++ if (spa == NULL) { ++ err = spa_open(buf, &spa, FTAG); ++ if (err) { ++ dprintf("spa_open(%s) failed\n", buf); ++ goto error; ++ } ++ openedspa = TRUE; ++ ++ /* XXX this assertion belongs in spa_open */ ++ ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); ++ } ++ ++ dp = spa_get_dsl(spa); ++ ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); ++ if (err) { ++ rw_exit(&dp->dp_config_rwlock); ++ if (openedspa) ++ spa_close(spa, FTAG); ++ goto error; ++ } ++ ++ while (next != NULL) { ++ dsl_dir_t *child_ds; ++ err = getcomponent(next, buf, &nextnext); ++ if (err) ++ break; ++ ASSERT(next[0] != '\0'); ++ if (next[0] == '@') ++ break; ++ dprintf("looking up %s in obj%lld\n", ++ buf, dd->dd_phys->dd_child_dir_zapobj); ++ ++ err = zap_lookup(dp->dp_meta_objset, ++ dd->dd_phys->dd_child_dir_zapobj, ++ buf, sizeof (ddobj), 1, &ddobj); ++ if (err) { ++ if (err == ENOENT) ++ err = 0; ++ break; ++ } ++ ++ err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds); ++ if (err) ++ break; ++ dsl_dir_close(dd, tag); ++ dd = child_ds; ++ next = nextnext; ++ } ++ rw_exit(&dp->dp_config_rwlock); ++ ++ if (err) { ++ dsl_dir_close(dd, tag); ++ if (openedspa) ++ spa_close(spa, FTAG); ++ goto error; ++ } ++ ++ /* ++ * It's an error if there's more than one component left, or ++ * tailp==NULL and there's any component left. ++ */ ++ if (next != NULL && ++ (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { ++ /* bad path name */ ++ dsl_dir_close(dd, tag); ++ dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); ++ err = ENOENT; ++ } ++ if (tailp) ++ *tailp = next; ++ if (openedspa) ++ spa_close(spa, FTAG); ++ *ddp = dd; ++error: ++ kmem_free(buf, MAXNAMELEN); ++ return (err); ++} ++ ++/* ++ * Return the dsl_dir_t, and possibly the last component which couldn't ++ * be found in *tail. Return NULL if the path is bogus, or if ++ * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' ++ * means that the last component is a snapshot. 
++ */ ++int ++dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) ++{ ++ return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp)); ++} ++ ++uint64_t ++dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, ++ dmu_tx_t *tx) ++{ ++ objset_t *mos = dp->dp_meta_objset; ++ uint64_t ddobj; ++ dsl_dir_phys_t *ddphys; ++ dmu_buf_t *dbuf; ++ ++ ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, ++ DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); ++ if (pds) { ++ VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, ++ name, sizeof (uint64_t), 1, &ddobj, tx)); ++ } else { ++ /* it's the root dir */ ++ VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, ++ DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); ++ } ++ VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); ++ dmu_buf_will_dirty(dbuf, tx); ++ ddphys = dbuf->db_data; ++ ++ ddphys->dd_creation_time = gethrestime_sec(); ++ if (pds) ++ ddphys->dd_parent_obj = pds->dd_object; ++ ddphys->dd_props_zapobj = zap_create(mos, ++ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); ++ ddphys->dd_child_dir_zapobj = zap_create(mos, ++ DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); ++ if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) ++ ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; ++ dmu_buf_rele(dbuf, FTAG); ++ ++ return (ddobj); ++} ++ ++/* ARGSUSED */ ++int ++dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ dsl_dir_t *dd = ds->ds_dir; ++ dsl_pool_t *dp = dd->dd_pool; ++ objset_t *mos = dp->dp_meta_objset; ++ int err; ++ uint64_t count; ++ ++ /* ++ * There should be exactly two holds, both from ++ * dsl_dataset_destroy: one on the dd directory, and one on its ++ * head ds. If there are more holds, then a concurrent thread is ++ * performing a lookup inside this dir while we're trying to destroy ++ * it. To minimize this possibility, we perform this check only ++ * in syncing context and fail the operation if we encounter ++ * additional holds. The dp_config_rwlock ensures that nobody else ++ * opens it after we check. ++ */ ++ if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 2) ++ return (EBUSY); ++ ++ err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count); ++ if (err) ++ return (err); ++ if (count != 0) ++ return (EEXIST); ++ ++ return (0); ++} ++ ++void ++dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ dsl_dir_t *dd = ds->ds_dir; ++ objset_t *mos = dd->dd_pool->dp_meta_objset; ++ dsl_prop_setarg_t psa; ++ uint64_t value = 0; ++ uint64_t obj; ++ dd_used_t t; ++ ++ ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); ++ ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); ++ ++ /* Remove our reservation. 
*/ ++ dsl_prop_setarg_init_uint64(&psa, "reservation", ++ (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), ++ &value); ++ psa.psa_effective_value = 0; /* predict default value */ ++ ++ dsl_dir_set_reservation_sync(ds, &psa, tx); ++ ++ ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0); ++ ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); ++ for (t = 0; t < DD_USED_NUM; t++) ++ ASSERT3U(dd->dd_phys->dd_used_breakdown[t], ==, 0); ++ ++ VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); ++ VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); ++ VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); ++ VERIFY(0 == zap_remove(mos, ++ dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); ++ ++ obj = dd->dd_object; ++ dsl_dir_close(dd, tag); ++ VERIFY(0 == dmu_object_free(mos, obj, tx)); ++} ++ ++boolean_t ++dsl_dir_is_clone(dsl_dir_t *dd) ++{ ++ return (dd->dd_phys->dd_origin_obj && ++ (dd->dd_pool->dp_origin_snap == NULL || ++ dd->dd_phys->dd_origin_obj != ++ dd->dd_pool->dp_origin_snap->ds_object)); ++} ++ ++void ++dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) ++{ ++ mutex_enter(&dd->dd_lock); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, ++ dd->dd_phys->dd_used_bytes); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, ++ dd->dd_phys->dd_reserved); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ++ dd->dd_phys->dd_compressed_bytes == 0 ? 100 : ++ (dd->dd_phys->dd_uncompressed_bytes * 100 / ++ dd->dd_phys->dd_compressed_bytes)); ++ if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, ++ dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, ++ dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, ++ dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]); ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, ++ dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] + ++ dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]); ++ } ++ mutex_exit(&dd->dd_lock); ++ ++ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); ++ if (dsl_dir_is_clone(dd)) { ++ dsl_dataset_t *ds; ++ char buf[MAXNAMELEN]; ++ ++ VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, ++ dd->dd_phys->dd_origin_obj, FTAG, &ds)); ++ dsl_dataset_name(ds, buf); ++ dsl_dataset_rele(ds, FTAG); ++ dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); ++ } ++ rw_exit(&dd->dd_pool->dp_config_rwlock); ++} ++ ++void ++dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) ++{ ++ dsl_pool_t *dp = dd->dd_pool; ++ ++ ASSERT(dd->dd_phys); ++ ++ if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { ++ /* up the hold count until we can be written out */ ++ dmu_buf_add_ref(dd->dd_dbuf, dd); ++ } ++} ++ ++static int64_t ++parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) ++{ ++ uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved); ++ uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved); ++ return (new_accounted - old_accounted); ++} ++ ++void ++dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) ++{ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ++ dmu_buf_will_dirty(dd->dd_dbuf, tx); ++ ++ mutex_enter(&dd->dd_lock); ++ ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0); ++ dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, ++ dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); ++ dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; ++ mutex_exit(&dd->dd_lock); ++ ++ /* release 
the hold from dsl_dir_dirty */ ++ dmu_buf_rele(dd->dd_dbuf, dd); ++} ++ ++static uint64_t ++dsl_dir_space_towrite(dsl_dir_t *dd) ++{ ++ uint64_t space = 0; ++ int i; ++ ++ ASSERT(MUTEX_HELD(&dd->dd_lock)); ++ ++ for (i = 0; i < TXG_SIZE; i++) { ++ space += dd->dd_space_towrite[i&TXG_MASK]; ++ ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); ++ } ++ return (space); ++} ++ ++/* ++ * How much space would dd have available if ancestor had delta applied ++ * to it? If ondiskonly is set, we're only interested in what's ++ * on-disk, not estimated pending changes. ++ */ ++uint64_t ++dsl_dir_space_available(dsl_dir_t *dd, ++ dsl_dir_t *ancestor, int64_t delta, int ondiskonly) ++{ ++ uint64_t parentspace, myspace, quota, used; ++ ++ /* ++ * If there are no restrictions otherwise, assume we have ++ * unlimited space available. ++ */ ++ quota = UINT64_MAX; ++ parentspace = UINT64_MAX; ++ ++ if (dd->dd_parent != NULL) { ++ parentspace = dsl_dir_space_available(dd->dd_parent, ++ ancestor, delta, ondiskonly); ++ } ++ ++ mutex_enter(&dd->dd_lock); ++ if (dd->dd_phys->dd_quota != 0) ++ quota = dd->dd_phys->dd_quota; ++ used = dd->dd_phys->dd_used_bytes; ++ if (!ondiskonly) ++ used += dsl_dir_space_towrite(dd); ++ ++ if (dd->dd_parent == NULL) { ++ uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); ++ quota = MIN(quota, poolsize); ++ } ++ ++ if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) { ++ /* ++ * We have some space reserved, in addition to what our ++ * parent gave us. ++ */ ++ parentspace += dd->dd_phys->dd_reserved - used; ++ } ++ ++ if (dd == ancestor) { ++ ASSERT(delta <= 0); ++ ASSERT(used >= -delta); ++ used += delta; ++ if (parentspace != UINT64_MAX) ++ parentspace -= delta; ++ } ++ ++ if (used > quota) { ++ /* over quota */ ++ myspace = 0; ++ } else { ++ /* ++ * the lesser of the space provided by our parent and ++ * the space left in our quota ++ */ ++ myspace = MIN(parentspace, quota - used); ++ } ++ ++ mutex_exit(&dd->dd_lock); ++ ++ return (myspace); ++} ++ ++struct tempreserve { ++ list_node_t tr_node; ++ dsl_pool_t *tr_dp; ++ dsl_dir_t *tr_ds; ++ uint64_t tr_size; ++}; ++ ++static int ++dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, ++ boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, ++ dmu_tx_t *tx, boolean_t first) ++{ ++ uint64_t txg = tx->tx_txg; ++ uint64_t est_inflight, used_on_disk, quota, parent_rsrv; ++ uint64_t deferred = 0; ++ struct tempreserve *tr; ++ int retval = EDQUOT; ++ int txgidx = txg & TXG_MASK; ++ int i; ++ uint64_t ref_rsrv = 0; ++ ++ ASSERT3U(txg, !=, 0); ++ ASSERT3S(asize, >, 0); ++ ++ mutex_enter(&dd->dd_lock); ++ ++ /* ++ * Check against the dsl_dir's quota. We don't add in the delta ++ * when checking for over-quota because they get one free hit. ++ */ ++ est_inflight = dsl_dir_space_towrite(dd); ++ for (i = 0; i < TXG_SIZE; i++) ++ est_inflight += dd->dd_tempreserved[i]; ++ used_on_disk = dd->dd_phys->dd_used_bytes; ++ ++ /* ++ * On the first iteration, fetch the dataset's used-on-disk and ++ * refreservation values. Also, if checkrefquota is set, test if ++ * allocating this space would exceed the dataset's refquota. 
++ */ ++ if (first && tx->tx_objset) { ++ int error; ++ dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; ++ ++ error = dsl_dataset_check_quota(ds, checkrefquota, ++ asize, est_inflight, &used_on_disk, &ref_rsrv); ++ if (error) { ++ mutex_exit(&dd->dd_lock); ++ return (error); ++ } ++ } ++ ++ /* ++ * If this transaction will result in a net free of space, ++ * we want to let it through. ++ */ ++ if (ignorequota || netfree || dd->dd_phys->dd_quota == 0) ++ quota = UINT64_MAX; ++ else ++ quota = dd->dd_phys->dd_quota; ++ ++ /* ++ * Adjust the quota against the actual pool size at the root ++ * minus any outstanding deferred frees. ++ * To ensure that it's possible to remove files from a full ++ * pool without inducing transient overcommits, we throttle ++ * netfree transactions against a quota that is slightly larger, ++ * but still within the pool's allocation slop. In cases where ++ * we're very close to full, this will allow a steady trickle of ++ * removes to get through. ++ */ ++ if (dd->dd_parent == NULL) { ++ spa_t *spa = dd->dd_pool->dp_spa; ++ uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); ++ deferred = metaslab_class_get_deferred(spa_normal_class(spa)); ++ if (poolsize - deferred < quota) { ++ quota = poolsize - deferred; ++ retval = ENOSPC; ++ } ++ } ++ ++ /* ++ * If they are requesting more space, and our current estimate ++ * is over quota, they get to try again unless the actual ++ * on-disk is over quota and there are no pending changes (which ++ * may free up space for us). ++ */ ++ if (used_on_disk + est_inflight >= quota) { ++ if (est_inflight > 0 || used_on_disk < quota || ++ (retval == ENOSPC && used_on_disk < quota + deferred)) ++ retval = ERESTART; ++ dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " ++ "quota=%lluK tr=%lluK err=%d\n", ++ used_on_disk>>10, est_inflight>>10, ++ quota>>10, asize>>10, retval); ++ mutex_exit(&dd->dd_lock); ++ return (retval); ++ } ++ ++ /* We need to up our estimated delta before dropping dd_lock */ ++ dd->dd_tempreserved[txgidx] += asize; ++ ++ parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, ++ asize - ref_rsrv); ++ mutex_exit(&dd->dd_lock); ++ ++ tr = kmem_zalloc(sizeof (struct tempreserve), KM_PUSHPAGE); ++ tr->tr_ds = dd; ++ tr->tr_size = asize; ++ list_insert_tail(tr_list, tr); ++ ++ /* see if it's OK with our parent */ ++ if (dd->dd_parent && parent_rsrv) { ++ boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0); ++ ++ return (dsl_dir_tempreserve_impl(dd->dd_parent, ++ parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); ++ } else { ++ return (0); ++ } ++} ++ ++/* ++ * Reserve space in this dsl_dir, to be used in this tx's txg. ++ * After the space has been dirtied (and dsl_dir_willuse_space() ++ * has been called), the reservation should be canceled, using ++ * dsl_dir_tempreserve_clear(). 
++ */ ++int ++dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, ++ uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) ++{ ++ int err; ++ list_t *tr_list; ++ ++ if (asize == 0) { ++ *tr_cookiep = NULL; ++ return (0); ++ } ++ ++ tr_list = kmem_alloc(sizeof (list_t), KM_PUSHPAGE); ++ list_create(tr_list, sizeof (struct tempreserve), ++ offsetof(struct tempreserve, tr_node)); ++ ASSERT3S(asize, >, 0); ++ ASSERT3S(fsize, >=, 0); ++ ++ err = arc_tempreserve_space(lsize, tx->tx_txg); ++ if (err == 0) { ++ struct tempreserve *tr; ++ ++ tr = kmem_zalloc(sizeof (struct tempreserve), KM_PUSHPAGE); ++ tr->tr_size = lsize; ++ list_insert_tail(tr_list, tr); ++ ++ err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); ++ } else { ++ if (err == EAGAIN) { ++ txg_delay(dd->dd_pool, tx->tx_txg, 1); ++ err = ERESTART; ++ } ++ dsl_pool_memory_pressure(dd->dd_pool); ++ } ++ ++ if (err == 0) { ++ struct tempreserve *tr; ++ ++ tr = kmem_zalloc(sizeof (struct tempreserve), KM_PUSHPAGE); ++ tr->tr_dp = dd->dd_pool; ++ tr->tr_size = asize; ++ list_insert_tail(tr_list, tr); ++ ++ err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, ++ FALSE, asize > usize, tr_list, tx, TRUE); ++ } ++ ++ if (err) ++ dsl_dir_tempreserve_clear(tr_list, tx); ++ else ++ *tr_cookiep = tr_list; ++ ++ return (err); ++} ++ ++/* ++ * Clear a temporary reservation that we previously made with ++ * dsl_dir_tempreserve_space(). ++ */ ++void ++dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) ++{ ++ int txgidx = tx->tx_txg & TXG_MASK; ++ list_t *tr_list = tr_cookie; ++ struct tempreserve *tr; ++ ++ ASSERT3U(tx->tx_txg, !=, 0); ++ ++ if (tr_cookie == NULL) ++ return; ++ ++ while ((tr = list_head(tr_list))) { ++ if (tr->tr_dp) { ++ dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx); ++ } else if (tr->tr_ds) { ++ mutex_enter(&tr->tr_ds->dd_lock); ++ ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, ++ tr->tr_size); ++ tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; ++ mutex_exit(&tr->tr_ds->dd_lock); ++ } else { ++ arc_tempreserve_clear(tr->tr_size); ++ } ++ list_remove(tr_list, tr); ++ kmem_free(tr, sizeof (struct tempreserve)); ++ } ++ ++ kmem_free(tr_list, sizeof (list_t)); ++} ++ ++static void ++dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) ++{ ++ int64_t parent_space; ++ uint64_t est_used; ++ ++ mutex_enter(&dd->dd_lock); ++ if (space > 0) ++ dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; ++ ++ est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes; ++ parent_space = parent_delta(dd, est_used, space); ++ mutex_exit(&dd->dd_lock); ++ ++ /* Make sure that we clean up dd_space_to* */ ++ dsl_dir_dirty(dd, tx); ++ ++ /* XXX this is potentially expensive and unnecessary... */ ++ if (parent_space && dd->dd_parent) ++ dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx); ++} ++ ++/* ++ * Call in open context when we think we're going to write/free space, ++ * eg. when dirtying data. Be conservative (ie. OK to write less than ++ * this or free more than this, but don't write more or free less). 
++ */ ++void ++dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) ++{ ++ dsl_pool_willuse_space(dd->dd_pool, space, tx); ++ dsl_dir_willuse_space_impl(dd, space, tx); ++} ++ ++/* call from syncing context when we actually write/free space for this dd */ ++void ++dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, ++ int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) ++{ ++ int64_t accounted_delta; ++ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ASSERT(type < DD_USED_NUM); ++ ++ dsl_dir_dirty(dd, tx); ++ ++ if (needlock) ++ mutex_enter(&dd->dd_lock); ++ accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used); ++ ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used); ++ ASSERT(compressed >= 0 || ++ dd->dd_phys->dd_compressed_bytes >= -compressed); ++ ASSERT(uncompressed >= 0 || ++ dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); ++ dd->dd_phys->dd_used_bytes += used; ++ dd->dd_phys->dd_uncompressed_bytes += uncompressed; ++ dd->dd_phys->dd_compressed_bytes += compressed; ++ ++ if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { ++ ASSERT(used > 0 || ++ dd->dd_phys->dd_used_breakdown[type] >= -used); ++ dd->dd_phys->dd_used_breakdown[type] += used; ++#ifdef DEBUG ++ { ++ dd_used_t t; ++ uint64_t u = 0; ++ for (t = 0; t < DD_USED_NUM; t++) ++ u += dd->dd_phys->dd_used_breakdown[t]; ++ ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes); ++ } ++#endif ++ } ++ if (needlock) ++ mutex_exit(&dd->dd_lock); ++ ++ if (dd->dd_parent != NULL) { ++ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, ++ accounted_delta, compressed, uncompressed, tx); ++ dsl_dir_transfer_space(dd->dd_parent, ++ used - accounted_delta, ++ DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); ++ } ++} ++ ++void ++dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, ++ dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) ++{ ++ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ASSERT(oldtype < DD_USED_NUM); ++ ASSERT(newtype < DD_USED_NUM); ++ ++ if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN)) ++ return; ++ ++ dsl_dir_dirty(dd, tx); ++ if (needlock) ++ mutex_enter(&dd->dd_lock); ++ ASSERT(delta > 0 ? ++ dd->dd_phys->dd_used_breakdown[oldtype] >= delta : ++ dd->dd_phys->dd_used_breakdown[newtype] >= -delta); ++ ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta)); ++ dd->dd_phys->dd_used_breakdown[oldtype] -= delta; ++ dd->dd_phys->dd_used_breakdown[newtype] += delta; ++ if (needlock) ++ mutex_exit(&dd->dd_lock); ++} ++ ++static int ++dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ dsl_dir_t *dd = ds->ds_dir; ++ dsl_prop_setarg_t *psa = arg2; ++ int err; ++ uint64_t towrite; ++ ++ if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) ++ return (err); ++ ++ if (psa->psa_effective_value == 0) ++ return (0); ++ ++ mutex_enter(&dd->dd_lock); ++ /* ++ * If we are doing the preliminary check in open context, and ++ * there are pending changes, then don't fail it, since the ++ * pending changes could under-estimate the amount of space to be ++ * freed up. 
++ */ ++ towrite = dsl_dir_space_towrite(dd); ++ if ((dmu_tx_is_syncing(tx) || towrite == 0) && ++ (psa->psa_effective_value < dd->dd_phys->dd_reserved || ++ psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) { ++ err = ENOSPC; ++ } ++ mutex_exit(&dd->dd_lock); ++ return (err); ++} ++ ++extern dsl_syncfunc_t dsl_prop_set_sync; ++ ++static void ++dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ dsl_dir_t *dd = ds->ds_dir; ++ dsl_prop_setarg_t *psa = arg2; ++ uint64_t effective_value = psa->psa_effective_value; ++ ++ dsl_prop_set_sync(ds, psa, tx); ++ DSL_PROP_CHECK_PREDICTION(dd, psa); ++ ++ dmu_buf_will_dirty(dd->dd_dbuf, tx); ++ ++ mutex_enter(&dd->dd_lock); ++ dd->dd_phys->dd_quota = effective_value; ++ mutex_exit(&dd->dd_lock); ++} ++ ++int ++dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) ++{ ++ dsl_dir_t *dd; ++ dsl_dataset_t *ds; ++ dsl_prop_setarg_t psa; ++ int err; ++ ++ dsl_prop_setarg_init_uint64(&psa, "quota", source, &quota); ++ ++ err = dsl_dataset_hold(ddname, FTAG, &ds); ++ if (err) ++ return (err); ++ ++ err = dsl_dir_open(ddname, FTAG, &dd, NULL); ++ if (err) { ++ dsl_dataset_rele(ds, FTAG); ++ return (err); ++ } ++ ++ ASSERT(ds->ds_dir == dd); ++ ++ /* ++ * If someone removes a file, then tries to set the quota, we want to ++ * make sure the file freeing takes effect. ++ */ ++ txg_wait_open(dd->dd_pool, 0); ++ ++ err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, ++ dsl_dir_set_quota_sync, ds, &psa, 0); ++ ++ dsl_dir_close(dd, FTAG); ++ dsl_dataset_rele(ds, FTAG); ++ return (err); ++} ++ ++int ++dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ dsl_dir_t *dd = ds->ds_dir; ++ dsl_prop_setarg_t *psa = arg2; ++ uint64_t effective_value; ++ uint64_t used, avail; ++ int err; ++ ++ if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) ++ return (err); ++ ++ effective_value = psa->psa_effective_value; ++ ++ /* ++ * If we are doing the preliminary check in open context, the ++ * space estimates may be inaccurate.
++ */ ++ if (!dmu_tx_is_syncing(tx)) ++ return (0); ++ ++ mutex_enter(&dd->dd_lock); ++ used = dd->dd_phys->dd_used_bytes; ++ mutex_exit(&dd->dd_lock); ++ ++ if (dd->dd_parent) { ++ avail = dsl_dir_space_available(dd->dd_parent, ++ NULL, 0, FALSE); ++ } else { ++ avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; ++ } ++ ++ if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) { ++ uint64_t delta = MAX(used, effective_value) - ++ MAX(used, dd->dd_phys->dd_reserved); ++ ++ if (delta > avail) ++ return (ENOSPC); ++ if (dd->dd_phys->dd_quota > 0 && ++ effective_value > dd->dd_phys->dd_quota) ++ return (ENOSPC); ++ } ++ ++ return (0); ++} ++ ++static void ++dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ dsl_dir_t *dd = ds->ds_dir; ++ dsl_prop_setarg_t *psa = arg2; ++ uint64_t effective_value = psa->psa_effective_value; ++ uint64_t used; ++ int64_t delta; ++ ++ dsl_prop_set_sync(ds, psa, tx); ++ DSL_PROP_CHECK_PREDICTION(dd, psa); ++ ++ dmu_buf_will_dirty(dd->dd_dbuf, tx); ++ ++ mutex_enter(&dd->dd_lock); ++ used = dd->dd_phys->dd_used_bytes; ++ delta = MAX(used, effective_value) - ++ MAX(used, dd->dd_phys->dd_reserved); ++ dd->dd_phys->dd_reserved = effective_value; ++ ++ if (dd->dd_parent != NULL) { ++ /* Roll up this additional usage into our ancestors */ ++ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, ++ delta, 0, 0, tx); ++ } ++ mutex_exit(&dd->dd_lock); ++} ++ ++int ++dsl_dir_set_reservation(const char *ddname, zprop_source_t source, ++ uint64_t reservation) ++{ ++ dsl_dir_t *dd; ++ dsl_dataset_t *ds; ++ dsl_prop_setarg_t psa; ++ int err; ++ ++ dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation); ++ ++ err = dsl_dataset_hold(ddname, FTAG, &ds); ++ if (err) ++ return (err); ++ ++ err = dsl_dir_open(ddname, FTAG, &dd, NULL); ++ if (err) { ++ dsl_dataset_rele(ds, FTAG); ++ return (err); ++ } ++ ++ ASSERT(ds->ds_dir == dd); ++ ++ err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check, ++ dsl_dir_set_reservation_sync, ds, &psa, 0); ++ ++ dsl_dir_close(dd, FTAG); ++ dsl_dataset_rele(ds, FTAG); ++ return (err); ++} ++ ++static dsl_dir_t * ++closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) ++{ ++ for (; ds1; ds1 = ds1->dd_parent) { ++ dsl_dir_t *dd; ++ for (dd = ds2; dd; dd = dd->dd_parent) { ++ if (ds1 == dd) ++ return (dd); ++ } ++ } ++ return (NULL); ++} ++ ++/* ++ * If delta is applied to dd, how much of that delta would be applied to ++ * ancestor? Syncing context only. ++ */ ++static int64_t ++would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) ++{ ++ if (dd == ancestor) ++ return (delta); ++ ++ mutex_enter(&dd->dd_lock); ++ delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta); ++ mutex_exit(&dd->dd_lock); ++ return (would_change(dd->dd_parent, delta, ancestor)); ++} ++ ++struct renamearg { ++ dsl_dir_t *newparent; ++ const char *mynewname; ++}; ++ ++static int ++dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dir_t *dd = arg1; ++ struct renamearg *ra = arg2; ++ dsl_pool_t *dp = dd->dd_pool; ++ objset_t *mos = dp->dp_meta_objset; ++ int err; ++ uint64_t val; ++ ++ /* ++ * There should only be one reference, from dmu_objset_rename(). ++ * Fleeting holds are also possible (eg, from "zfs list" getting ++ * stats), but any that are present in open context will likely ++ * be gone by syncing context, so only fail from syncing ++ * context. 
++ */ ++ if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1) ++ return (EBUSY); ++ ++ /* check for existing name */ ++ err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, ++ ra->mynewname, 8, 1, &val); ++ if (err == 0) ++ return (EEXIST); ++ if (err != ENOENT) ++ return (err); ++ ++ if (ra->newparent != dd->dd_parent) { ++ /* is there enough space? */ ++ uint64_t myspace = ++ MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved); ++ ++ /* no rename into our descendant */ ++ if (closest_common_ancestor(dd, ra->newparent) == dd) ++ return (EINVAL); ++ ++ if ((err = dsl_dir_transfer_possible(dd->dd_parent, ++ ra->newparent, myspace))) ++ return (err); ++ } ++ ++ return (0); ++} ++ ++static void ++dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dir_t *dd = arg1; ++ struct renamearg *ra = arg2; ++ dsl_pool_t *dp = dd->dd_pool; ++ objset_t *mos = dp->dp_meta_objset; ++ int err; ++ ++ ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2); ++ ++ if (ra->newparent != dd->dd_parent) { ++ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, ++ -dd->dd_phys->dd_used_bytes, ++ -dd->dd_phys->dd_compressed_bytes, ++ -dd->dd_phys->dd_uncompressed_bytes, tx); ++ dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD, ++ dd->dd_phys->dd_used_bytes, ++ dd->dd_phys->dd_compressed_bytes, ++ dd->dd_phys->dd_uncompressed_bytes, tx); ++ ++ if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) { ++ uint64_t unused_rsrv = dd->dd_phys->dd_reserved - ++ dd->dd_phys->dd_used_bytes; ++ ++ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, ++ -unused_rsrv, 0, 0, tx); ++ dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV, ++ unused_rsrv, 0, 0, tx); ++ } ++ } ++ ++ dmu_buf_will_dirty(dd->dd_dbuf, tx); ++ ++ /* remove from old parent zapobj */ ++ err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, ++ dd->dd_myname, tx); ++ ASSERT3U(err, ==, 0); ++ ++ (void) strcpy(dd->dd_myname, ra->mynewname); ++ dsl_dir_close(dd->dd_parent, dd); ++ dd->dd_phys->dd_parent_obj = ra->newparent->dd_object; ++ VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, ++ ra->newparent->dd_object, NULL, dd, &dd->dd_parent)); ++ ++ /* add to new parent zapobj */ ++ err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, ++ dd->dd_myname, 8, 1, &dd->dd_object, tx); ++ ASSERT3U(err, ==, 0); ++ ++ spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, ++ tx, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj); ++} ++ ++int ++dsl_dir_rename(dsl_dir_t *dd, const char *newname) ++{ ++ struct renamearg ra; ++ int err; ++ ++ /* new parent should exist */ ++ err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname); ++ if (err) ++ return (err); ++ ++ /* can't rename to different pool */ ++ if (dd->dd_pool != ra.newparent->dd_pool) { ++ err = ENXIO; ++ goto out; ++ } ++ ++ /* new name should not already exist */ ++ if (ra.mynewname == NULL) { ++ err = EEXIST; ++ goto out; ++ } ++ ++ err = dsl_sync_task_do(dd->dd_pool, ++ dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3); ++ ++out: ++ dsl_dir_close(ra.newparent, FTAG); ++ return (err); ++} ++ ++int ++dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) ++{ ++ dsl_dir_t *ancestor; ++ int64_t adelta; ++ uint64_t avail; ++ ++ ancestor = closest_common_ancestor(sdd, tdd); ++ adelta = would_change(sdd, -space, ancestor); ++ avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); ++ if (avail < space) ++ return (ENOSPC); ++ ++ return (0); ++} ++ ++timestruc_t ++dsl_dir_snap_cmtime(dsl_dir_t *dd) ++{ ++ timestruc_t 
t; ++ ++ mutex_enter(&dd->dd_lock); ++ t = dd->dd_snap_cmtime; ++ mutex_exit(&dd->dd_lock); ++ ++ return (t); ++} ++ ++void ++dsl_dir_snap_cmtime_update(dsl_dir_t *dd) ++{ ++ timestruc_t t; ++ ++ gethrestime(&t); ++ mutex_enter(&dd->dd_lock); ++ dd->dd_snap_cmtime = t; ++ mutex_exit(&dd->dd_lock); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(dsl_dir_set_quota); ++EXPORT_SYMBOL(dsl_dir_set_reservation); ++EXPORT_SYMBOL(dsl_dir_open); ++EXPORT_SYMBOL(dsl_dir_close); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dsl_pool.c linux-3.2.33-go/fs/zfs/zfs/dsl_pool.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dsl_pool.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dsl_pool.c 2012-11-16 23:25:34.348039346 +0100 +@@ -0,0 +1,1022 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int zfs_no_write_throttle = 0; ++int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ ++int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */ ++int zfs_txg_history = 60; /* statistics for the last N txgs */ ++ ++unsigned long zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ ++unsigned long zfs_write_limit_max = 0; /* max data payload per txg */ ++unsigned long zfs_write_limit_inflated = 0; ++unsigned long zfs_write_limit_override = 0; ++ ++kmutex_t zfs_write_limit_lock; ++ ++static pgcnt_t old_physmem = 0; ++ ++static int ++dsl_pool_txg_history_update(kstat_t *ksp, int rw) ++{ ++ dsl_pool_t *dp = ksp->ks_private; ++ txg_history_t *th; ++ int i = 0; ++ ++ if (rw == KSTAT_WRITE) ++ return (EACCES); ++ ++ if (ksp->ks_data) ++ kmem_free(ksp->ks_data, ksp->ks_data_size); ++ ++ mutex_enter(&dp->dp_lock); ++ ++ ksp->ks_ndata = dp->dp_txg_history_size; ++ ksp->ks_data_size = dp->dp_txg_history_size * sizeof(kstat_txg_t); ++ if (ksp->ks_data_size > 0) ++ ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_PUSHPAGE); ++ ++ /* Traversed oldest to youngest for the most readable kstat output */ ++ for (th = list_tail(&dp->dp_txg_history); th != NULL; ++ th = list_prev(&dp->dp_txg_history, th)) { ++ mutex_enter(&th->th_lock); ++ ASSERT3S(i + sizeof(kstat_txg_t), <=, ksp->ks_data_size); ++ memcpy(ksp->ks_data + i, &th->th_kstat, sizeof(kstat_txg_t)); ++ i += sizeof(kstat_txg_t); ++ mutex_exit(&th->th_lock); ++ } ++ ++ mutex_exit(&dp->dp_lock); ++ ++ return (0); ++} ++ ++static void ++dsl_pool_txg_history_init(dsl_pool_t *dp, uint64_t txg) ++{ ++ char name[KSTAT_STRLEN]; ++ ++ list_create(&dp->dp_txg_history, sizeof (txg_history_t), ++ offsetof(txg_history_t, th_link)); ++ dsl_pool_txg_history_add(dp, txg); ++ ++ (void) snprintf(name, KSTAT_STRLEN, "txgs-%s", spa_name(dp->dp_spa)); ++ dp->dp_txg_kstat = kstat_create("zfs", 0, name, "misc", ++ KSTAT_TYPE_TXG, 0, KSTAT_FLAG_VIRTUAL); ++ if (dp->dp_txg_kstat) { ++ dp->dp_txg_kstat->ks_data = NULL; ++ dp->dp_txg_kstat->ks_private = dp; ++ dp->dp_txg_kstat->ks_update = dsl_pool_txg_history_update; ++ kstat_install(dp->dp_txg_kstat); ++ } ++} ++ ++static void ++dsl_pool_txg_history_destroy(dsl_pool_t *dp) ++{ ++ txg_history_t *th; ++ ++ if (dp->dp_txg_kstat) { ++ if (dp->dp_txg_kstat->ks_data) ++ kmem_free(dp->dp_txg_kstat->ks_data, ++ dp->dp_txg_kstat->ks_data_size); ++ ++ kstat_delete(dp->dp_txg_kstat); ++ } ++ ++ mutex_enter(&dp->dp_lock); ++ while ((th = list_remove_head(&dp->dp_txg_history))) { ++ dp->dp_txg_history_size--; ++ mutex_destroy(&th->th_lock); ++ kmem_free(th, sizeof(txg_history_t)); ++ } ++ ++ ASSERT3U(dp->dp_txg_history_size, ==, 0); ++ list_destroy(&dp->dp_txg_history); ++ mutex_exit(&dp->dp_lock); ++} ++ ++txg_history_t * ++dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg) ++{ ++ txg_history_t *th, *rm; ++ ++ th = kmem_zalloc(sizeof(txg_history_t), KM_SLEEP); ++ mutex_init(&th->th_lock, NULL, MUTEX_DEFAULT, NULL); ++ th->th_kstat.txg = txg; ++ th->th_kstat.state = TXG_STATE_OPEN; ++ th->th_kstat.birth = gethrtime(); ++ ++ mutex_enter(&dp->dp_lock); ++ ++ list_insert_head(&dp->dp_txg_history, th); ++ dp->dp_txg_history_size++; ++ ++ while (dp->dp_txg_history_size > zfs_txg_history) { ++ dp->dp_txg_history_size--; ++ rm = list_remove_tail(&dp->dp_txg_history); ++ mutex_destroy(&rm->th_lock); ++ kmem_free(rm, 
sizeof(txg_history_t)); ++ } ++ ++ mutex_exit(&dp->dp_lock); ++ ++ return (th); ++} ++ ++/* ++ * Traversed youngest to oldest because lookups are only done for open ++ * or syncing txgs which are guaranteed to be at the head of the list. ++ * The txg_history_t structure will be returned locked. ++ */ ++txg_history_t * ++dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg) ++{ ++ txg_history_t *th; ++ ++ mutex_enter(&dp->dp_lock); ++ for (th = list_head(&dp->dp_txg_history); th != NULL; ++ th = list_next(&dp->dp_txg_history, th)) { ++ if (th->th_kstat.txg == txg) { ++ mutex_enter(&th->th_lock); ++ break; ++ } ++ } ++ mutex_exit(&dp->dp_lock); ++ ++ return (th); ++} ++ ++void ++dsl_pool_txg_history_put(txg_history_t *th) ++{ ++ mutex_exit(&th->th_lock); ++} ++ ++int ++dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) ++{ ++ uint64_t obj; ++ int err; ++ ++ err = zap_lookup(dp->dp_meta_objset, ++ dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, ++ name, sizeof (obj), 1, &obj); ++ if (err) ++ return (err); ++ ++ return (dsl_dir_open_obj(dp, obj, name, dp, ddp)); ++} ++ ++static dsl_pool_t * ++dsl_pool_open_impl(spa_t *spa, uint64_t txg) ++{ ++ dsl_pool_t *dp; ++ blkptr_t *bp = spa_get_rootblkptr(spa); ++ ++ dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); ++ dp->dp_spa = spa; ++ dp->dp_meta_rootbp = *bp; ++ rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); ++ dp->dp_write_limit = zfs_write_limit_min; ++ txg_init(dp, txg); ++ ++ txg_list_create(&dp->dp_dirty_datasets, ++ offsetof(dsl_dataset_t, ds_dirty_link)); ++ txg_list_create(&dp->dp_dirty_dirs, ++ offsetof(dsl_dir_t, dd_dirty_link)); ++ txg_list_create(&dp->dp_sync_tasks, ++ offsetof(dsl_sync_task_group_t, dstg_node)); ++ list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t), ++ offsetof(dsl_dataset_t, ds_synced_link)); ++ ++ mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri, ++ 1, 4, 0); ++ ++ dsl_pool_txg_history_init(dp, txg); ++ ++ return (dp); ++} ++ ++int ++dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) ++{ ++ int err; ++ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); ++ dsl_dir_t *dd; ++ dsl_dataset_t *ds; ++ uint64_t obj; ++ ++ rw_enter(&dp->dp_config_rwlock, RW_WRITER); ++ err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, ++ &dp->dp_meta_objset); ++ if (err) ++ goto out; ++ ++ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, ++ DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, ++ &dp->dp_root_dir_obj); ++ if (err) ++ goto out; ++ ++ err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, ++ NULL, dp, &dp->dp_root_dir); ++ if (err) ++ goto out; ++ ++ err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); ++ if (err) ++ goto out; ++ ++ if (spa_version(spa) >= SPA_VERSION_ORIGIN) { ++ err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); ++ if (err) ++ goto out; ++ err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, ++ FTAG, &ds); ++ if (err == 0) { ++ err = dsl_dataset_hold_obj(dp, ++ ds->ds_phys->ds_prev_snap_obj, dp, ++ &dp->dp_origin_snap); ++ dsl_dataset_rele(ds, FTAG); ++ } ++ dsl_dir_close(dd, dp); ++ if (err) ++ goto out; ++ } ++ ++ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { ++ err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, ++ &dp->dp_free_dir); ++ if (err) ++ goto out; ++ ++ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, ++ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); ++ if (err) ++ goto out; ++ VERIFY3U(0, ==, 
bpobj_open(&dp->dp_free_bpobj, ++ dp->dp_meta_objset, obj)); ++ } ++ ++ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, ++ DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, ++ &dp->dp_tmp_userrefs_obj); ++ if (err == ENOENT) ++ err = 0; ++ if (err) ++ goto out; ++ ++ err = dsl_scan_init(dp, txg); ++ ++out: ++ rw_exit(&dp->dp_config_rwlock); ++ if (err) ++ dsl_pool_close(dp); ++ else ++ *dpp = dp; ++ ++ return (err); ++} ++ ++void ++dsl_pool_close(dsl_pool_t *dp) ++{ ++ /* drop our references from dsl_pool_open() */ ++ ++ /* ++ * Since we held the origin_snap from "syncing" context (which ++ * includes pool-opening context), it actually only got a "ref" ++ * and not a hold, so just drop that here. ++ */ ++ if (dp->dp_origin_snap) ++ dsl_dataset_drop_ref(dp->dp_origin_snap, dp); ++ if (dp->dp_mos_dir) ++ dsl_dir_close(dp->dp_mos_dir, dp); ++ if (dp->dp_free_dir) ++ dsl_dir_close(dp->dp_free_dir, dp); ++ if (dp->dp_root_dir) ++ dsl_dir_close(dp->dp_root_dir, dp); ++ ++ bpobj_close(&dp->dp_free_bpobj); ++ ++ /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ ++ if (dp->dp_meta_objset) ++ dmu_objset_evict(dp->dp_meta_objset); ++ ++ txg_list_destroy(&dp->dp_dirty_datasets); ++ txg_list_destroy(&dp->dp_sync_tasks); ++ txg_list_destroy(&dp->dp_dirty_dirs); ++ list_destroy(&dp->dp_synced_datasets); ++ ++ arc_flush(dp->dp_spa); ++ txg_fini(dp); ++ dsl_scan_fini(dp); ++ dsl_pool_txg_history_destroy(dp); ++ rw_destroy(&dp->dp_config_rwlock); ++ mutex_destroy(&dp->dp_lock); ++ taskq_destroy(dp->dp_iput_taskq); ++ if (dp->dp_blkstats) ++ kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); ++ kmem_free(dp, sizeof (dsl_pool_t)); ++} ++ ++dsl_pool_t * ++dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) ++{ ++ int err; ++ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); ++ dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); ++ objset_t *os; ++ dsl_dataset_t *ds; ++ uint64_t obj; ++ ++ /* create and open the MOS (meta-objset) */ ++ dp->dp_meta_objset = dmu_objset_create_impl(spa, ++ NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); ++ ++ /* create the pool directory */ ++ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, ++ DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); ++ ASSERT3U(err, ==, 0); ++ ++ /* Initialize scan structures */ ++ VERIFY3U(0, ==, dsl_scan_init(dp, txg)); ++ ++ /* create and open the root dir */ ++ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); ++ VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, ++ NULL, dp, &dp->dp_root_dir)); ++ ++ /* create and open the meta-objset dir */ ++ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); ++ VERIFY(0 == dsl_pool_open_special_dir(dp, ++ MOS_DIR_NAME, &dp->dp_mos_dir)); ++ ++ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { ++ /* create and open the free dir */ ++ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, ++ FREE_DIR_NAME, tx); ++ VERIFY(0 == dsl_pool_open_special_dir(dp, ++ FREE_DIR_NAME, &dp->dp_free_dir)); ++ ++ /* create and open the free_bplist */ ++ obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx); ++ VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, ++ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); ++ VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, ++ dp->dp_meta_objset, obj)); ++ } ++ ++ if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ++ dsl_pool_create_origin(dp, tx); ++ ++ /* create the root dataset */ ++ obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); ++ ++ /* create the root objset */ ++ VERIFY(0 == 
dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); ++ VERIFY(NULL != (os = dmu_objset_create_impl(dp->dp_spa, ds, ++ dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx))); ++#ifdef _KERNEL ++ zfs_create_fs(os, kcred, zplprops, tx); ++#endif ++ dsl_dataset_rele(ds, FTAG); ++ ++ dmu_tx_commit(tx); ++ ++ return (dp); ++} ++ ++static int ++deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) ++{ ++ dsl_deadlist_t *dl = arg; ++ dsl_pool_t *dp = dmu_objset_pool(dl->dl_os); ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ dsl_deadlist_insert(dl, bp, tx); ++ rw_exit(&dp->dp_config_rwlock); ++ return (0); ++} ++ ++void ++dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) ++{ ++ zio_t *zio; ++ dmu_tx_t *tx; ++ dsl_dir_t *dd; ++ dsl_dataset_t *ds; ++ dsl_sync_task_group_t *dstg; ++ objset_t *mos = dp->dp_meta_objset; ++ hrtime_t start, write_time; ++ uint64_t data_written; ++ int err; ++ ++ /* ++ * We need to copy dp_space_towrite() before doing ++ * dsl_sync_task_group_sync(), because ++ * dsl_dataset_snapshot_reserve_space() will increase ++ * dp_space_towrite but not actually write anything. ++ */ ++ data_written = dp->dp_space_towrite[txg & TXG_MASK]; ++ ++ tx = dmu_tx_create_assigned(dp, txg); ++ ++ dp->dp_read_overhead = 0; ++ start = gethrtime(); ++ ++ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); ++ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) { ++ /* ++ * We must not sync any non-MOS datasets twice, because ++ * we may have taken a snapshot of them. However, we ++ * may sync newly-created datasets on pass 2. ++ */ ++ ASSERT(!list_link_active(&ds->ds_synced_link)); ++ list_insert_tail(&dp->dp_synced_datasets, ds); ++ dsl_dataset_sync(ds, zio, tx); ++ } ++ DTRACE_PROBE(pool_sync__1setup); ++ err = zio_wait(zio); ++ ++ write_time = gethrtime() - start; ++ ASSERT(err == 0); ++ DTRACE_PROBE(pool_sync__2rootzio); ++ ++ for (ds = list_head(&dp->dp_synced_datasets); ds; ++ ds = list_next(&dp->dp_synced_datasets, ds)) ++ dmu_objset_do_userquota_updates(ds->ds_objset, tx); ++ ++ /* ++ * Sync the datasets again to push out the changes due to ++ * userspace updates. This must be done before we process the ++ * sync tasks, because that could cause a snapshot of a dataset ++ * whose ds_bp will be rewritten when we do this 2nd sync. ++ */ ++ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); ++ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) { ++ ASSERT(list_link_active(&ds->ds_synced_link)); ++ dmu_buf_rele(ds->ds_dbuf, ds); ++ dsl_dataset_sync(ds, zio, tx); ++ } ++ err = zio_wait(zio); ++ ++ /* ++ * Move dead blocks from the pending deadlist to the on-disk ++ * deadlist. ++ */ ++ for (ds = list_head(&dp->dp_synced_datasets); ds; ++ ds = list_next(&dp->dp_synced_datasets, ds)) { ++ bplist_iterate(&ds->ds_pending_deadlist, ++ deadlist_enqueue_cb, &ds->ds_deadlist, tx); ++ } ++ ++ while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg))) { ++ /* ++ * No more sync tasks should have been added while we ++ * were syncing. 
++ */ ++ ASSERT(spa_sync_pass(dp->dp_spa) == 1); ++ dsl_sync_task_group_sync(dstg, tx); ++ } ++ DTRACE_PROBE(pool_sync__3task); ++ ++ start = gethrtime(); ++ while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg))) ++ dsl_dir_sync(dd, tx); ++ write_time += gethrtime() - start; ++ ++ start = gethrtime(); ++ if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || ++ list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { ++ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); ++ dmu_objset_sync(mos, zio, tx); ++ err = zio_wait(zio); ++ ASSERT(err == 0); ++ dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); ++ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); ++ } ++ write_time += gethrtime() - start; ++ DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time, ++ hrtime_t, dp->dp_read_overhead); ++ write_time -= dp->dp_read_overhead; ++ ++ dmu_tx_commit(tx); ++ ++ dp->dp_space_towrite[txg & TXG_MASK] = 0; ++ ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); ++ ++ /* ++ * If the write limit max has not been explicitly set, set it ++ * to a fraction of available physical memory (default 1/8th). ++ * Note that we must inflate the limit because the spa ++ * inflates write sizes to account for data replication. ++ * Check this each sync phase to catch changing memory size. ++ */ ++ if (physmem != old_physmem && zfs_write_limit_shift) { ++ mutex_enter(&zfs_write_limit_lock); ++ old_physmem = physmem; ++ zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; ++ zfs_write_limit_inflated = MAX(zfs_write_limit_min, ++ spa_get_asize(dp->dp_spa, zfs_write_limit_max)); ++ mutex_exit(&zfs_write_limit_lock); ++ } ++ ++ /* ++ * Attempt to keep the sync time consistent by adjusting the ++ * amount of write traffic allowed into each transaction group. ++ * Weight the throughput calculation towards the current value: ++ * thru = 3/4 old_thru + 1/4 new_thru ++ * ++ * Note: write_time is in nanosecs, so write_time/MICROSEC ++ * yields millisecs ++ */ ++ ASSERT(zfs_write_limit_min > 0); ++ if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) { ++ uint64_t throughput = data_written / (write_time / MICROSEC); ++ ++ if (dp->dp_throughput) ++ dp->dp_throughput = throughput / 4 + ++ 3 * dp->dp_throughput / 4; ++ else ++ dp->dp_throughput = throughput; ++ dp->dp_write_limit = MIN(zfs_write_limit_inflated, ++ MAX(zfs_write_limit_min, ++ dp->dp_throughput * zfs_txg_synctime_ms)); ++ } ++} ++ ++void ++dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) ++{ ++ dsl_dataset_t *ds; ++ objset_t *os; ++ ++ while ((ds = list_head(&dp->dp_synced_datasets))) { ++ list_remove(&dp->dp_synced_datasets, ds); ++ os = ds->ds_objset; ++ zil_clean(os->os_zil, txg); ++ ASSERT(!dmu_objset_is_dirty(os, txg)); ++ dmu_buf_rele(ds->ds_dbuf, ds); ++ } ++ ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); ++} ++ ++/* ++ * TRUE if the current thread is the tx_sync_thread or if we ++ * are being called from SPA context during pool initialization. ++ */ ++int ++dsl_pool_sync_context(dsl_pool_t *dp) ++{ ++ return (curthread == dp->dp_tx.tx_sync_thread || ++ spa_get_dsl(dp->dp_spa) == NULL); ++} ++ ++uint64_t ++dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) ++{ ++ uint64_t space, resv; ++ ++ /* ++ * Reserve about 1.6% (1/64), or at least 32MB, for allocation ++ * efficiency. ++ * XXX The intent log is not accounted for, so it must fit ++ * within this slop. 
++ * ++ * If we're trying to assess whether it's OK to do a free, ++ * cut the reservation in half to allow forward progress ++ * (e.g. make it possible to rm(1) files from a full pool). ++ */ ++ space = spa_get_dspace(dp->dp_spa); ++ resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1); ++ if (netfree) ++ resv >>= 1; ++ ++ return (space - resv); ++} ++ ++int ++dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) ++{ ++ uint64_t reserved = 0; ++ uint64_t write_limit = (zfs_write_limit_override ? ++ zfs_write_limit_override : dp->dp_write_limit); ++ ++ if (zfs_no_write_throttle) { ++ atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], ++ space); ++ return (0); ++ } ++ ++ /* ++ * Check to see if we have exceeded the maximum allowed IO for ++ * this transaction group. We can do this without locks since ++ * a little slop here is ok. Note that we do the reserved check ++ * with only half the requested reserve: this is because the ++ * reserve requests are worst-case, and we really don't want to ++ * throttle based off of worst-case estimates. ++ */ ++ if (write_limit > 0) { ++ reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] ++ + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2; ++ ++ if (reserved && reserved > write_limit) { ++ DMU_TX_STAT_BUMP(dmu_tx_write_limit); ++ return (ERESTART); ++ } ++ } ++ ++ atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space); ++ ++ /* ++ * If this transaction group is over 7/8ths capacity, delay ++ * the caller 1 clock tick. This will slow down the "fill" ++ * rate until the sync process can catch up with us. ++ */ ++ if (reserved && reserved > (write_limit - (write_limit >> 3))) ++ txg_delay(dp, tx->tx_txg, 1); ++ ++ return (0); ++} ++ ++void ++dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) ++{ ++ ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space); ++ atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space); ++} ++ ++void ++dsl_pool_memory_pressure(dsl_pool_t *dp) ++{ ++ uint64_t space_inuse = 0; ++ int i; ++ ++ if (dp->dp_write_limit == zfs_write_limit_min) ++ return; ++ ++ for (i = 0; i < TXG_SIZE; i++) { ++ space_inuse += dp->dp_space_towrite[i]; ++ space_inuse += dp->dp_tempreserved[i]; ++ } ++ dp->dp_write_limit = MAX(zfs_write_limit_min, ++ MIN(dp->dp_write_limit, space_inuse / 4)); ++} ++ ++void ++dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) ++{ ++ if (space > 0) { ++ mutex_enter(&dp->dp_lock); ++ dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space; ++ mutex_exit(&dp->dp_lock); ++ } ++} ++ ++/* ARGSUSED */ ++static int ++upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) ++{ ++ dmu_tx_t *tx = arg; ++ dsl_dataset_t *ds, *prev = NULL; ++ int err; ++ dsl_pool_t *dp = spa_get_dsl(spa); ++ ++ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); ++ if (err) ++ return (err); ++ ++ while (ds->ds_phys->ds_prev_snap_obj != 0) { ++ err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, ++ FTAG, &prev); ++ if (err) { ++ dsl_dataset_rele(ds, FTAG); ++ return (err); ++ } ++ ++ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) ++ break; ++ dsl_dataset_rele(ds, FTAG); ++ ds = prev; ++ prev = NULL; ++ } ++ ++ if (prev == NULL) { ++ prev = dp->dp_origin_snap; ++ ++ /* ++ * The $ORIGIN can't have any data, or the accounting ++ * will be wrong. 
++ */ ++ ASSERT(prev->ds_phys->ds_bp.blk_birth == 0); ++ ++ /* The origin doesn't get attached to itself */ ++ if (ds->ds_object == prev->ds_object) { ++ dsl_dataset_rele(ds, FTAG); ++ return (0); ++ } ++ ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ ds->ds_phys->ds_prev_snap_obj = prev->ds_object; ++ ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg; ++ ++ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); ++ ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object; ++ ++ dmu_buf_will_dirty(prev->ds_dbuf, tx); ++ prev->ds_phys->ds_num_children++; ++ ++ if (ds->ds_phys->ds_next_snap_obj == 0) { ++ ASSERT(ds->ds_prev == NULL); ++ VERIFY(0 == dsl_dataset_hold_obj(dp, ++ ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); ++ } ++ } ++ ++ ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object); ++ ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); ++ ++ if (prev->ds_phys->ds_next_clones_obj == 0) { ++ dmu_buf_will_dirty(prev->ds_dbuf, tx); ++ prev->ds_phys->ds_next_clones_obj = ++ zap_create(dp->dp_meta_objset, ++ DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); ++ } ++ VERIFY(0 == zap_add_int(dp->dp_meta_objset, ++ prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); ++ ++ dsl_dataset_rele(ds, FTAG); ++ if (prev != dp->dp_origin_snap) ++ dsl_dataset_rele(prev, FTAG); ++ return (0); ++} ++ ++void ++dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) ++{ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ASSERT(dp->dp_origin_snap != NULL); ++ ++ VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, ++ tx, DS_FIND_CHILDREN)); ++} ++ ++/* ARGSUSED */ ++static int ++upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) ++{ ++ dmu_tx_t *tx = arg; ++ dsl_dataset_t *ds; ++ dsl_pool_t *dp = spa_get_dsl(spa); ++ objset_t *mos = dp->dp_meta_objset; ++ ++ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); ++ ++ if (ds->ds_dir->dd_phys->dd_origin_obj) { ++ dsl_dataset_t *origin; ++ ++ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, ++ ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin)); ++ ++ if (origin->ds_dir->dd_phys->dd_clones == 0) { ++ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); ++ origin->ds_dir->dd_phys->dd_clones = zap_create(mos, ++ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); ++ } ++ ++ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, ++ origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); ++ ++ dsl_dataset_rele(origin, FTAG); ++ } ++ ++ dsl_dataset_rele(ds, FTAG); ++ return (0); ++} ++ ++void ++dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) ++{ ++ uint64_t obj; ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ++ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); ++ VERIFY(0 == dsl_pool_open_special_dir(dp, ++ FREE_DIR_NAME, &dp->dp_free_dir)); ++ ++ /* ++ * We can't use bpobj_alloc(), because spa_version() still ++ * returns the old version, and we need a new-version bpobj with ++ * subobj support. So call dmu_object_alloc() directly. 
++ */ ++ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, ++ SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); ++ VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, ++ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); ++ VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, ++ dp->dp_meta_objset, obj)); ++ ++ VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, ++ upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN)); ++} ++ ++void ++dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) ++{ ++ uint64_t dsobj; ++ dsl_dataset_t *ds; ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ASSERT(dp->dp_origin_snap == NULL); ++ ++ /* create the origin dir, ds, & snap-ds */ ++ rw_enter(&dp->dp_config_rwlock, RW_WRITER); ++ dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, ++ NULL, 0, kcred, tx); ++ VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); ++ dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx); ++ VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, ++ dp, &dp->dp_origin_snap)); ++ dsl_dataset_rele(ds, FTAG); ++ rw_exit(&dp->dp_config_rwlock); ++} ++ ++taskq_t * ++dsl_pool_iput_taskq(dsl_pool_t *dp) ++{ ++ return (dp->dp_iput_taskq); ++} ++ ++/* ++ * Walk through the pool-wide zap object of temporary snapshot user holds ++ * and release them. ++ */ ++void ++dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) ++{ ++ zap_attribute_t za; ++ zap_cursor_t zc; ++ objset_t *mos = dp->dp_meta_objset; ++ uint64_t zapobj = dp->dp_tmp_userrefs_obj; ++ ++ if (zapobj == 0) ++ return; ++ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); ++ ++ for (zap_cursor_init(&zc, mos, zapobj); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ zap_cursor_advance(&zc)) { ++ char *htag; ++ uint64_t dsobj; ++ ++ htag = strchr(za.za_name, '-'); ++ *htag = '\0'; ++ ++htag; ++ dsobj = strtonum(za.za_name, NULL); ++ (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE); ++ } ++ zap_cursor_fini(&zc); ++} ++ ++/* ++ * Create the pool-wide zap object for storing temporary snapshot holds. ++ */ ++void ++dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) ++{ ++ objset_t *mos = dp->dp_meta_objset; ++ ++ ASSERT(dp->dp_tmp_userrefs_obj == 0); ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ++ dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, ++ DMU_OT_NONE, 0, tx); ++ ++ VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, ++ sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0); ++} ++ ++static int ++dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, ++ const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding) ++{ ++ objset_t *mos = dp->dp_meta_objset; ++ uint64_t zapobj = dp->dp_tmp_userrefs_obj; ++ char *name; ++ int error; ++ ++ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ++ /* ++ * If the pool was created prior to SPA_VERSION_USERREFS, the ++ * zap object for temporary holds might not exist yet. ++ */ ++ if (zapobj == 0) { ++ if (holding) { ++ dsl_pool_user_hold_create_obj(dp, tx); ++ zapobj = dp->dp_tmp_userrefs_obj; ++ } else { ++ return (ENOENT); ++ } ++ } ++ ++ name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); ++ if (holding) ++ error = zap_add(mos, zapobj, name, 8, 1, now, tx); ++ else ++ error = zap_remove(mos, zapobj, name, tx); ++ strfree(name); ++ ++ return (error); ++} ++ ++/* ++ * Add a temporary hold for the given dataset object and tag. 
++ */ ++int ++dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, ++ uint64_t *now, dmu_tx_t *tx) ++{ ++ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); ++} ++ ++/* ++ * Release a temporary hold for the given dataset object and tag. ++ */ ++int ++dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, ++ dmu_tx_t *tx) ++{ ++ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, ++ tx, B_FALSE)); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++module_param(zfs_no_write_throttle, int, 0644); ++MODULE_PARM_DESC(zfs_no_write_throttle, "Disable write throttling"); ++ ++module_param(zfs_write_limit_shift, int, 0444); ++MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg"); ++ ++module_param(zfs_txg_synctime_ms, int, 0644); ++MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync"); ++ ++module_param(zfs_txg_history, int, 0644); ++MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs"); ++ ++module_param(zfs_write_limit_min, ulong, 0444); ++MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit"); ++ ++module_param(zfs_write_limit_max, ulong, 0444); ++MODULE_PARM_DESC(zfs_write_limit_max, "Max txg write limit"); ++ ++module_param(zfs_write_limit_inflated, ulong, 0444); ++MODULE_PARM_DESC(zfs_write_limit_inflated, "Inflated txg write limit"); ++ ++module_param(zfs_write_limit_override, ulong, 0444); ++MODULE_PARM_DESC(zfs_write_limit_override, "Override txg write limit"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dsl_prop.c linux-3.2.33-go/fs/zfs/zfs/dsl_prop.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dsl_prop.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dsl_prop.c 2012-11-16 23:25:34.353039289 +0100 +@@ -0,0 +1,1170 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "zfs_prop.h" ++ ++#define ZPROP_INHERIT_SUFFIX "$inherit" ++#define ZPROP_RECVD_SUFFIX "$recvd" ++ ++static int ++dodefault(const char *propname, int intsz, int numints, void *buf) ++{ ++ zfs_prop_t prop; ++ ++ /* ++ * The setonce properties are read-only, BUT they still ++ * have a default value that can be used as the initial ++ * value. 
++ */ ++ if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL || ++ (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))) ++ return (ENOENT); ++ ++ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { ++ if (intsz != 1) ++ return (EOVERFLOW); ++ (void) strncpy(buf, zfs_prop_default_string(prop), ++ numints); ++ } else { ++ if (intsz != 8 || numints < 1) ++ return (EOVERFLOW); ++ ++ *(uint64_t *)buf = zfs_prop_default_numeric(prop); ++ } ++ ++ return (0); ++} ++ ++int ++dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, ++ int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) ++{ ++ int err = ENOENT; ++ dsl_dir_t *target = dd; ++ objset_t *mos = dd->dd_pool->dp_meta_objset; ++ zfs_prop_t prop; ++ boolean_t inheritable; ++ boolean_t inheriting = B_FALSE; ++ char *inheritstr; ++ char *recvdstr; ++ ++ ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); ++ ++ if (setpoint) ++ setpoint[0] = '\0'; ++ ++ prop = zfs_name_to_prop(propname); ++ inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); ++ inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); ++ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); ++ ++ /* ++ * Note: dd may become NULL, therefore we shouldn't dereference it ++ * after this loop. ++ */ ++ for (; dd != NULL; dd = dd->dd_parent) { ++ ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); ++ ++ if (dd != target || snapshot) { ++ if (!inheritable) ++ break; ++ inheriting = B_TRUE; ++ } ++ ++ /* Check for a local value. */ ++ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, ++ intsz, numints, buf); ++ if (err != ENOENT) { ++ if (setpoint != NULL && err == 0) ++ dsl_dir_name(dd, setpoint); ++ break; ++ } ++ ++ /* ++ * Skip the check for a received value if there is an explicit ++ * inheritance entry. ++ */ ++ err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, ++ inheritstr); ++ if (err != 0 && err != ENOENT) ++ break; ++ ++ if (err == ENOENT) { ++ /* Check for a received value. */ ++ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, ++ recvdstr, intsz, numints, buf); ++ if (err != ENOENT) { ++ if (setpoint != NULL && err == 0) { ++ if (inheriting) { ++ dsl_dir_name(dd, setpoint); ++ } else { ++ (void) strcpy(setpoint, ++ ZPROP_SOURCE_VAL_RECVD); ++ } ++ } ++ break; ++ } ++ } ++ ++ /* ++ * If we found an explicit inheritance entry, err is zero even ++ * though we haven't yet found the value, so reinitializing err ++ * at the end of the loop (instead of at the beginning) ensures ++ * that err has a valid post-loop value. ++ */ ++ err = ENOENT; ++ } ++ ++ if (err == ENOENT) ++ err = dodefault(propname, intsz, numints, buf); ++ ++ strfree(inheritstr); ++ strfree(recvdstr); ++ ++ return (err); ++} ++ ++int ++dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, ++ int intsz, int numints, void *buf, char *setpoint) ++{ ++ zfs_prop_t prop = zfs_name_to_prop(propname); ++ boolean_t inheritable; ++ boolean_t snapshot; ++ uint64_t zapobj; ++ ++ ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock)); ++ inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); ++ snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)); ++ zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj); ++ ++ if (zapobj != 0) { ++ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; ++ int err; ++ ++ ASSERT(snapshot); ++ ++ /* Check for a local value. 
*/ ++ err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); ++ if (err != ENOENT) { ++ if (setpoint != NULL && err == 0) ++ dsl_dataset_name(ds, setpoint); ++ return (err); ++ } ++ ++ /* ++ * Skip the check for a received value if there is an explicit ++ * inheritance entry. ++ */ ++ if (inheritable) { ++ char *inheritstr = kmem_asprintf("%s%s", propname, ++ ZPROP_INHERIT_SUFFIX); ++ err = zap_contains(mos, zapobj, inheritstr); ++ strfree(inheritstr); ++ if (err != 0 && err != ENOENT) ++ return (err); ++ } ++ ++ if (err == ENOENT) { ++ /* Check for a received value. */ ++ char *recvdstr = kmem_asprintf("%s%s", propname, ++ ZPROP_RECVD_SUFFIX); ++ err = zap_lookup(mos, zapobj, recvdstr, ++ intsz, numints, buf); ++ strfree(recvdstr); ++ if (err != ENOENT) { ++ if (setpoint != NULL && err == 0) ++ (void) strcpy(setpoint, ++ ZPROP_SOURCE_VAL_RECVD); ++ return (err); ++ } ++ } ++ } ++ ++ return (dsl_prop_get_dd(ds->ds_dir, propname, ++ intsz, numints, buf, setpoint, snapshot)); ++} ++ ++/* ++ * Register interest in the named property. We'll call the callback ++ * once to notify it of the current property value, and again each time ++ * the property changes, until this callback is unregistered. ++ * ++ * Return 0 on success, errno if the prop is not an integer value. ++ */ ++int ++dsl_prop_register(dsl_dataset_t *ds, const char *propname, ++ dsl_prop_changed_cb_t *callback, void *cbarg) ++{ ++ dsl_dir_t *dd = ds->ds_dir; ++ dsl_pool_t *dp = dd->dd_pool; ++ uint64_t value; ++ dsl_prop_cb_record_t *cbr; ++ int err; ++ int need_rwlock; ++ ++ need_rwlock = !RW_WRITE_HELD(&dp->dp_config_rwlock); ++ if (need_rwlock) ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ ++ err = dsl_prop_get_ds(ds, propname, 8, 1, &value, NULL); ++ if (err != 0) { ++ if (need_rwlock) ++ rw_exit(&dp->dp_config_rwlock); ++ return (err); ++ } ++ ++ cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_PUSHPAGE); ++ cbr->cbr_ds = ds; ++ cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_PUSHPAGE); ++ (void) strcpy((char *)cbr->cbr_propname, propname); ++ cbr->cbr_func = callback; ++ cbr->cbr_arg = cbarg; ++ mutex_enter(&dd->dd_lock); ++ list_insert_head(&dd->dd_prop_cbs, cbr); ++ mutex_exit(&dd->dd_lock); ++ ++ cbr->cbr_func(cbr->cbr_arg, value); ++ ++ if (need_rwlock) ++ rw_exit(&dp->dp_config_rwlock); ++ return (0); ++} ++ ++int ++dsl_prop_get(const char *dsname, const char *propname, ++ int intsz, int numints, void *buf, char *setpoint) ++{ ++ dsl_dataset_t *ds; ++ int err; ++ ++ err = dsl_dataset_hold(dsname, FTAG, &ds); ++ if (err) ++ return (err); ++ ++ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); ++ err = dsl_prop_get_ds(ds, propname, intsz, numints, buf, setpoint); ++ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); ++ ++ dsl_dataset_rele(ds, FTAG); ++ return (err); ++} ++ ++/* ++ * Get the current property value. It may have changed by the time this ++ * function returns, so it is NOT safe to follow up with ++ * dsl_prop_register() and assume that the value has not changed in ++ * between. ++ * ++ * Return 0 on success, ENOENT if ddname is invalid. 
++ */ ++int ++dsl_prop_get_integer(const char *ddname, const char *propname, ++ uint64_t *valuep, char *setpoint) ++{ ++ return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); ++} ++ ++void ++dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, ++ zprop_source_t source, uint64_t *value) ++{ ++ psa->psa_name = propname; ++ psa->psa_source = source; ++ psa->psa_intsz = 8; ++ psa->psa_numints = 1; ++ psa->psa_value = value; ++ ++ psa->psa_effective_value = -1ULL; ++} ++ ++/* ++ * Predict the effective value of the given special property if it were set with ++ * the given value and source. This is not a general purpose function. It exists ++ * only to handle the special requirements of the quota and reservation ++ * properties. The fact that these properties are non-inheritable greatly ++ * simplifies the prediction logic. ++ * ++ * Returns 0 on success, a positive error code on failure, or -1 if called with ++ * a property not handled by this function. ++ */ ++int ++dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) ++{ ++ const char *propname = psa->psa_name; ++ zfs_prop_t prop = zfs_name_to_prop(propname); ++ zprop_source_t source = psa->psa_source; ++ objset_t *mos; ++ uint64_t zapobj; ++ uint64_t version; ++ char *recvdstr; ++ int err = 0; ++ ++ switch (prop) { ++ case ZFS_PROP_QUOTA: ++ case ZFS_PROP_RESERVATION: ++ case ZFS_PROP_REFQUOTA: ++ case ZFS_PROP_REFRESERVATION: ++ break; ++ default: ++ return (-1); ++ } ++ ++ mos = dd->dd_pool->dp_meta_objset; ++ zapobj = dd->dd_phys->dd_props_zapobj; ++ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); ++ ++ version = spa_version(dd->dd_pool->dp_spa); ++ if (version < SPA_VERSION_RECVD_PROPS) { ++ if (source & ZPROP_SRC_NONE) ++ source = ZPROP_SRC_NONE; ++ else if (source & ZPROP_SRC_RECEIVED) ++ source = ZPROP_SRC_LOCAL; ++ } ++ ++ switch ((int)source) { ++ case ZPROP_SRC_NONE: ++ /* Revert to the received value, if any. */ ++ err = zap_lookup(mos, zapobj, recvdstr, 8, 1, ++ &psa->psa_effective_value); ++ if (err == ENOENT) ++ psa->psa_effective_value = 0; ++ break; ++ case ZPROP_SRC_LOCAL: ++ psa->psa_effective_value = *(uint64_t *)psa->psa_value; ++ break; ++ case ZPROP_SRC_RECEIVED: ++ /* ++ * If there's no local setting, then the new received value will ++ * be the effective value. ++ */ ++ err = zap_lookup(mos, zapobj, propname, 8, 1, ++ &psa->psa_effective_value); ++ if (err == ENOENT) ++ psa->psa_effective_value = *(uint64_t *)psa->psa_value; ++ break; ++ case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): ++ /* ++ * We're clearing the received value, so the local setting (if ++ * it exists) remains the effective value. 
++ */ ++ err = zap_lookup(mos, zapobj, propname, 8, 1, ++ &psa->psa_effective_value); ++ if (err == ENOENT) ++ psa->psa_effective_value = 0; ++ break; ++ default: ++ cmn_err(CE_PANIC, "unexpected property source: %d", source); ++ } ++ ++ strfree(recvdstr); ++ ++ if (err == ENOENT) ++ return (0); ++ ++ return (err); ++} ++ ++#ifdef ZFS_DEBUG ++void ++dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa) ++{ ++ zfs_prop_t prop = zfs_name_to_prop(psa->psa_name); ++ uint64_t intval; ++ char setpoint[MAXNAMELEN]; ++ uint64_t version = spa_version(dd->dd_pool->dp_spa); ++ int err; ++ ++ if (version < SPA_VERSION_RECVD_PROPS) { ++ switch (prop) { ++ case ZFS_PROP_QUOTA: ++ case ZFS_PROP_RESERVATION: ++ return; ++ default: ++ break; ++ } ++ } ++ ++ err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval, ++ setpoint, B_FALSE); ++ if (err == 0 && intval != psa->psa_effective_value) { ++ cmn_err(CE_PANIC, "%s property, source: %x, " ++ "predicted effective value: %llu, " ++ "actual effective value: %llu (setpoint: %s)", ++ psa->psa_name, psa->psa_source, ++ (unsigned long long)psa->psa_effective_value, ++ (unsigned long long)intval, setpoint); ++ } ++} ++#endif ++ ++/* ++ * Unregister this callback. Return 0 on success, ENOENT if ddname is ++ * invalid, ENOMSG if no matching callback registered. ++ */ ++int ++dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, ++ dsl_prop_changed_cb_t *callback, void *cbarg) ++{ ++ dsl_dir_t *dd = ds->ds_dir; ++ dsl_prop_cb_record_t *cbr; ++ ++ mutex_enter(&dd->dd_lock); ++ for (cbr = list_head(&dd->dd_prop_cbs); ++ cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { ++ if (cbr->cbr_ds == ds && ++ cbr->cbr_func == callback && ++ cbr->cbr_arg == cbarg && ++ strcmp(cbr->cbr_propname, propname) == 0) ++ break; ++ } ++ ++ if (cbr == NULL) { ++ mutex_exit(&dd->dd_lock); ++ return (ENOMSG); ++ } ++ ++ list_remove(&dd->dd_prop_cbs, cbr); ++ mutex_exit(&dd->dd_lock); ++ kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1); ++ kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); ++ ++ return (0); ++} ++ ++/* ++ * Return the number of callbacks that are registered for this dataset. ++ */ ++int ++dsl_prop_numcb(dsl_dataset_t *ds) ++{ ++ dsl_dir_t *dd = ds->ds_dir; ++ dsl_prop_cb_record_t *cbr; ++ int num = 0; ++ ++ mutex_enter(&dd->dd_lock); ++ for (cbr = list_head(&dd->dd_prop_cbs); ++ cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { ++ if (cbr->cbr_ds == ds) ++ num++; ++ } ++ mutex_exit(&dd->dd_lock); ++ ++ return (num); ++} ++ ++static void ++dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, ++ const char *propname, uint64_t value, int first) ++{ ++ dsl_dir_t *dd; ++ dsl_prop_cb_record_t *cbr; ++ objset_t *mos = dp->dp_meta_objset; ++ zap_cursor_t zc; ++ zap_attribute_t *za; ++ int err; ++ ++ ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); ++ err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); ++ if (err) ++ return; ++ ++ if (!first) { ++ /* ++ * If the prop is set here, then this change is not ++ * being inherited here or below; stop the recursion. 
++ */ ++ err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname); ++ if (err == 0) { ++ dsl_dir_close(dd, FTAG); ++ return; ++ } ++ ASSERT3U(err, ==, ENOENT); ++ } ++ ++ mutex_enter(&dd->dd_lock); ++ for (cbr = list_head(&dd->dd_prop_cbs); cbr; ++ cbr = list_next(&dd->dd_prop_cbs, cbr)) { ++ uint64_t propobj = cbr->cbr_ds->ds_phys->ds_props_obj; ++ ++ if (strcmp(cbr->cbr_propname, propname) != 0) ++ continue; ++ ++ /* ++ * If the property is set on this ds, then it is not ++ * inherited here; don't call the callback. ++ */ ++ if (propobj && 0 == zap_contains(mos, propobj, propname)) ++ continue; ++ ++ cbr->cbr_func(cbr->cbr_arg, value); ++ } ++ mutex_exit(&dd->dd_lock); ++ ++ za = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); ++ for (zap_cursor_init(&zc, mos, ++ dd->dd_phys->dd_child_dir_zapobj); ++ zap_cursor_retrieve(&zc, za) == 0; ++ zap_cursor_advance(&zc)) { ++ dsl_prop_changed_notify(dp, za->za_first_integer, ++ propname, value, FALSE); ++ } ++ kmem_free(za, sizeof (zap_attribute_t)); ++ zap_cursor_fini(&zc); ++ dsl_dir_close(dd, FTAG); ++} ++ ++void ++dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ dsl_prop_setarg_t *psa = arg2; ++ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; ++ uint64_t zapobj, intval, dummy; ++ int isint; ++ char valbuf[32]; ++ char *valstr = NULL; ++ char *inheritstr; ++ char *recvdstr; ++ char *tbuf = NULL; ++ int err; ++ uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); ++ const char *propname = psa->psa_name; ++ zprop_source_t source = psa->psa_source; ++ ++ isint = (dodefault(propname, 8, 1, &intval) == 0); ++ ++ if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { ++ ASSERT(version >= SPA_VERSION_SNAP_PROPS); ++ if (ds->ds_phys->ds_props_obj == 0) { ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ ds->ds_phys->ds_props_obj = ++ zap_create(mos, ++ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); ++ } ++ zapobj = ds->ds_phys->ds_props_obj; ++ } else { ++ zapobj = ds->ds_dir->dd_phys->dd_props_zapobj; ++ } ++ ++ if (version < SPA_VERSION_RECVD_PROPS) { ++ zfs_prop_t prop = zfs_name_to_prop(propname); ++ if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) ++ return; ++ ++ if (source & ZPROP_SRC_NONE) ++ source = ZPROP_SRC_NONE; ++ else if (source & ZPROP_SRC_RECEIVED) ++ source = ZPROP_SRC_LOCAL; ++ } ++ ++ inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); ++ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); ++ ++ switch ((int)source) { ++ case ZPROP_SRC_NONE: ++ /* ++ * revert to received value, if any (inherit -S) ++ * - remove propname ++ * - remove propname$inherit ++ */ ++ err = zap_remove(mos, zapobj, propname, tx); ++ ASSERT(err == 0 || err == ENOENT); ++ err = zap_remove(mos, zapobj, inheritstr, tx); ++ ASSERT(err == 0 || err == ENOENT); ++ break; ++ case ZPROP_SRC_LOCAL: ++ /* ++ * remove propname$inherit ++ * set propname -> value ++ */ ++ err = zap_remove(mos, zapobj, inheritstr, tx); ++ ASSERT(err == 0 || err == ENOENT); ++ VERIFY(0 == zap_update(mos, zapobj, propname, ++ psa->psa_intsz, psa->psa_numints, psa->psa_value, tx)); ++ break; ++ case ZPROP_SRC_INHERITED: ++ /* ++ * explicitly inherit ++ * - remove propname ++ * - set propname$inherit ++ */ ++ err = zap_remove(mos, zapobj, propname, tx); ++ ASSERT(err == 0 || err == ENOENT); ++ if (version >= SPA_VERSION_RECVD_PROPS && ++ dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, ++ NULL) == 0) { ++ dummy = 0; ++ err = zap_update(mos, zapobj, inheritstr, ++ 8, 1, &dummy, tx); ++ ASSERT(err == 
0); ++ } ++ break; ++ case ZPROP_SRC_RECEIVED: ++ /* ++ * set propname$recvd -> value ++ */ ++ err = zap_update(mos, zapobj, recvdstr, ++ psa->psa_intsz, psa->psa_numints, psa->psa_value, tx); ++ ASSERT(err == 0); ++ break; ++ case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): ++ /* ++ * clear local and received settings ++ * - remove propname ++ * - remove propname$inherit ++ * - remove propname$recvd ++ */ ++ err = zap_remove(mos, zapobj, propname, tx); ++ ASSERT(err == 0 || err == ENOENT); ++ err = zap_remove(mos, zapobj, inheritstr, tx); ++ ASSERT(err == 0 || err == ENOENT); ++ /* FALLTHRU */ ++ case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): ++ /* ++ * remove propname$recvd ++ */ ++ err = zap_remove(mos, zapobj, recvdstr, tx); ++ ASSERT(err == 0 || err == ENOENT); ++ break; ++ default: ++ cmn_err(CE_PANIC, "unexpected property source: %d", source); ++ } ++ ++ strfree(inheritstr); ++ strfree(recvdstr); ++ ++ if (isint) { ++ VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL)); ++ ++ if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { ++ dsl_prop_cb_record_t *cbr; ++ /* ++ * It's a snapshot; nothing can inherit this ++ * property, so just look for callbacks on this ++ * ds here. ++ */ ++ mutex_enter(&ds->ds_dir->dd_lock); ++ for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr; ++ cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) { ++ if (cbr->cbr_ds == ds && ++ strcmp(cbr->cbr_propname, propname) == 0) ++ cbr->cbr_func(cbr->cbr_arg, intval); ++ } ++ mutex_exit(&ds->ds_dir->dd_lock); ++ } else { ++ dsl_prop_changed_notify(ds->ds_dir->dd_pool, ++ ds->ds_dir->dd_object, propname, intval, TRUE); ++ } ++ ++ (void) snprintf(valbuf, sizeof (valbuf), ++ "%lld", (longlong_t)intval); ++ valstr = valbuf; ++ } else { ++ if (source == ZPROP_SRC_LOCAL) { ++ valstr = (char *)psa->psa_value; ++ } else { ++ tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_PUSHPAGE); ++ if (dsl_prop_get_ds(ds, propname, 1, ++ ZAP_MAXVALUELEN, tbuf, NULL) == 0) ++ valstr = tbuf; ++ } ++ } ++ ++ spa_history_log_internal((source == ZPROP_SRC_NONE || ++ source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT : ++ LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, ++ "%s=%s dataset = %llu", propname, ++ (valstr == NULL ? "" : valstr), ds->ds_object); ++ ++ if (tbuf != NULL) ++ kmem_free(tbuf, ZAP_MAXVALUELEN); ++} ++ ++void ++dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_dataset_t *ds = arg1; ++ dsl_props_arg_t *pa = arg2; ++ nvlist_t *props = pa->pa_props; ++ dsl_prop_setarg_t psa; ++ nvpair_t *elem = NULL; ++ ++ psa.psa_source = pa->pa_source; ++ ++ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { ++ nvpair_t *pair = elem; ++ ++ psa.psa_name = nvpair_name(pair); ++ ++ if (nvpair_type(pair) == DATA_TYPE_NVLIST) { ++ /* ++ * dsl_prop_get_all_impl() returns properties in this ++ * format. 
++ */ ++ nvlist_t *attrs; ++ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); ++ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, ++ &pair) == 0); ++ } ++ ++ if (nvpair_type(pair) == DATA_TYPE_STRING) { ++ VERIFY(nvpair_value_string(pair, ++ (char **)&psa.psa_value) == 0); ++ psa.psa_intsz = 1; ++ psa.psa_numints = strlen(psa.psa_value) + 1; ++ } else { ++ uint64_t intval; ++ VERIFY(nvpair_value_uint64(pair, &intval) == 0); ++ psa.psa_intsz = sizeof (intval); ++ psa.psa_numints = 1; ++ psa.psa_value = &intval; ++ } ++ dsl_prop_set_sync(ds, &psa, tx); ++ } ++} ++ ++void ++dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, ++ dmu_tx_t *tx) ++{ ++ objset_t *mos = dd->dd_pool->dp_meta_objset; ++ uint64_t zapobj = dd->dd_phys->dd_props_zapobj; ++ ++ ASSERT(dmu_tx_is_syncing(tx)); ++ ++ VERIFY(0 == zap_update(mos, zapobj, name, sizeof (val), 1, &val, tx)); ++ ++ dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE); ++ ++ spa_history_log_internal(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, ++ "%s=%llu dataset = %llu", name, (u_longlong_t)val, ++ dd->dd_phys->dd_head_dataset_obj); ++} ++ ++int ++dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source, ++ int intsz, int numints, const void *buf) ++{ ++ dsl_dataset_t *ds; ++ uint64_t version; ++ int err; ++ dsl_prop_setarg_t psa; ++ ++ /* ++ * We must do these checks before we get to the syncfunc, since ++ * it can't fail. ++ */ ++ if (strlen(propname) >= ZAP_MAXNAMELEN) ++ return (ENAMETOOLONG); ++ ++ err = dsl_dataset_hold(dsname, FTAG, &ds); ++ if (err) ++ return (err); ++ ++ version = spa_version(ds->ds_dir->dd_pool->dp_spa); ++ if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ? ++ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { ++ dsl_dataset_rele(ds, FTAG); ++ return (E2BIG); ++ } ++ if (dsl_dataset_is_snapshot(ds) && ++ version < SPA_VERSION_SNAP_PROPS) { ++ dsl_dataset_rele(ds, FTAG); ++ return (ENOTSUP); ++ } ++ ++ psa.psa_name = propname; ++ psa.psa_source = source; ++ psa.psa_intsz = intsz; ++ psa.psa_numints = numints; ++ psa.psa_value = buf; ++ psa.psa_effective_value = -1ULL; ++ ++ err = dsl_sync_task_do(ds->ds_dir->dd_pool, ++ NULL, dsl_prop_set_sync, ds, &psa, 2); ++ ++ dsl_dataset_rele(ds, FTAG); ++ return (err); ++} ++ ++int ++dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) ++{ ++ dsl_dataset_t *ds; ++ uint64_t version; ++ nvpair_t *elem = NULL; ++ dsl_props_arg_t pa; ++ int err; ++ ++ if ((err = dsl_dataset_hold(dsname, FTAG, &ds))) ++ return (err); ++ /* ++ * Do these checks before the syncfunc, since it can't fail. ++ */ ++ version = spa_version(ds->ds_dir->dd_pool->dp_spa); ++ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { ++ if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { ++ dsl_dataset_rele(ds, FTAG); ++ return (ENAMETOOLONG); ++ } ++ if (nvpair_type(elem) == DATA_TYPE_STRING) { ++ char *valstr; ++ VERIFY(nvpair_value_string(elem, &valstr) == 0); ++ if (strlen(valstr) >= (version < ++ SPA_VERSION_STMF_PROP ? 
++ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { ++ dsl_dataset_rele(ds, FTAG); ++ return (E2BIG); ++ } ++ } ++ } ++ ++ if (dsl_dataset_is_snapshot(ds) && ++ version < SPA_VERSION_SNAP_PROPS) { ++ dsl_dataset_rele(ds, FTAG); ++ return (ENOTSUP); ++ } ++ ++ pa.pa_props = props; ++ pa.pa_source = source; ++ ++ err = dsl_sync_task_do(ds->ds_dir->dd_pool, ++ NULL, dsl_props_set_sync, ds, &pa, 2); ++ ++ dsl_dataset_rele(ds, FTAG); ++ return (err); ++} ++ ++typedef enum dsl_prop_getflags { ++ DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */ ++ DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */ ++ DSL_PROP_GET_LOCAL = 0x4, /* local properties */ ++ DSL_PROP_GET_RECEIVED = 0x8 /* received properties */ ++} dsl_prop_getflags_t; ++ ++static int ++dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, ++ const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ int err = 0; ++ ++ for (zap_cursor_init(&zc, mos, propobj); ++ (err = zap_cursor_retrieve(&zc, &za)) == 0; ++ zap_cursor_advance(&zc)) { ++ nvlist_t *propval; ++ zfs_prop_t prop; ++ char buf[ZAP_MAXNAMELEN]; ++ char *valstr; ++ const char *suffix; ++ const char *propname; ++ const char *source; ++ ++ suffix = strchr(za.za_name, '$'); ++ ++ if (suffix == NULL) { ++ /* ++ * Skip local properties if we only want received ++ * properties. ++ */ ++ if (flags & DSL_PROP_GET_RECEIVED) ++ continue; ++ ++ propname = za.za_name; ++ source = setpoint; ++ } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) { ++ /* Skip explicitly inherited entries. */ ++ continue; ++ } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) { ++ if (flags & DSL_PROP_GET_LOCAL) ++ continue; ++ ++ (void) strncpy(buf, za.za_name, (suffix - za.za_name)); ++ buf[suffix - za.za_name] = '\0'; ++ propname = buf; ++ ++ if (!(flags & DSL_PROP_GET_RECEIVED)) { ++ /* Skip if locally overridden. */ ++ err = zap_contains(mos, propobj, propname); ++ if (err == 0) ++ continue; ++ if (err != ENOENT) ++ break; ++ ++ /* Skip if explicitly inherited. */ ++ valstr = kmem_asprintf("%s%s", propname, ++ ZPROP_INHERIT_SUFFIX); ++ err = zap_contains(mos, propobj, valstr); ++ strfree(valstr); ++ if (err == 0) ++ continue; ++ if (err != ENOENT) ++ break; ++ } ++ ++ source = ((flags & DSL_PROP_GET_INHERITING) ? ++ setpoint : ZPROP_SOURCE_VAL_RECVD); ++ } else { ++ /* ++ * For backward compatibility, skip suffixes we don't ++ * recognize. ++ */ ++ continue; ++ } ++ ++ prop = zfs_name_to_prop(propname); ++ ++ /* Skip non-inheritable properties. */ ++ if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL && ++ !zfs_prop_inheritable(prop)) ++ continue; ++ ++ /* Skip properties not valid for this type. */ ++ if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL && ++ !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) ++ continue; ++ ++ /* Skip properties already defined. 
*/ ++ if (nvlist_exists(nv, propname)) ++ continue; ++ ++ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ if (za.za_integer_length == 1) { ++ /* ++ * String property ++ */ ++ char *tmp = kmem_alloc(za.za_num_integers, ++ KM_SLEEP); ++ err = zap_lookup(mos, propobj, ++ za.za_name, 1, za.za_num_integers, tmp); ++ if (err != 0) { ++ kmem_free(tmp, za.za_num_integers); ++ break; ++ } ++ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, ++ tmp) == 0); ++ kmem_free(tmp, za.za_num_integers); ++ } else { ++ /* ++ * Integer property ++ */ ++ ASSERT(za.za_integer_length == 8); ++ (void) nvlist_add_uint64(propval, ZPROP_VALUE, ++ za.za_first_integer); ++ } ++ ++ VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0); ++ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); ++ nvlist_free(propval); ++ } ++ zap_cursor_fini(&zc); ++ if (err == ENOENT) ++ err = 0; ++ return (err); ++} ++ ++/* ++ * Iterate over all properties for this dataset and return them in an nvlist. ++ */ ++static int ++dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, ++ dsl_prop_getflags_t flags) ++{ ++ dsl_dir_t *dd = ds->ds_dir; ++ dsl_pool_t *dp = dd->dd_pool; ++ objset_t *mos = dp->dp_meta_objset; ++ int err = 0; ++ char setpoint[MAXNAMELEN]; ++ ++ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ ++ if (dsl_dataset_is_snapshot(ds)) ++ flags |= DSL_PROP_GET_SNAPSHOT; ++ ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ ++ if (ds->ds_phys->ds_props_obj != 0) { ++ ASSERT(flags & DSL_PROP_GET_SNAPSHOT); ++ dsl_dataset_name(ds, setpoint); ++ err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj, ++ setpoint, flags, *nvp); ++ if (err) ++ goto out; ++ } ++ ++ for (; dd != NULL; dd = dd->dd_parent) { ++ if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) { ++ if (flags & (DSL_PROP_GET_LOCAL | ++ DSL_PROP_GET_RECEIVED)) ++ break; ++ flags |= DSL_PROP_GET_INHERITING; ++ } ++ dsl_dir_name(dd, setpoint); ++ err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj, ++ setpoint, flags, *nvp); ++ if (err) ++ break; ++ } ++out: ++ rw_exit(&dp->dp_config_rwlock); ++ return (err); ++} ++ ++boolean_t ++dsl_prop_get_hasrecvd(objset_t *os) ++{ ++ dsl_dataset_t *ds = os->os_dsl_dataset; ++ int rc; ++ uint64_t dummy; ++ ++ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); ++ rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL); ++ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); ++ ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); ++ return (rc == 0); ++} ++ ++static void ++dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source) ++{ ++ dsl_dataset_t *ds = os->os_dsl_dataset; ++ uint64_t dummy = 0; ++ dsl_prop_setarg_t psa; ++ ++ if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS) ++ return; ++ ++ dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy); ++ ++ (void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL, ++ dsl_prop_set_sync, ds, &psa, 2); ++} ++ ++/* ++ * Call after successfully receiving properties to ensure that only the first ++ * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties. 
++ */ ++void ++dsl_prop_set_hasrecvd(objset_t *os) ++{ ++ if (dsl_prop_get_hasrecvd(os)) { ++ ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); ++ return; ++ } ++ dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL); ++} ++ ++void ++dsl_prop_unset_hasrecvd(objset_t *os) ++{ ++ dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE); ++} ++ ++int ++dsl_prop_get_all(objset_t *os, nvlist_t **nvp) ++{ ++ return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0)); ++} ++ ++int ++dsl_prop_get_received(objset_t *os, nvlist_t **nvp) ++{ ++ /* ++ * Received properties are not distinguishable from local properties ++ * until the dataset has received properties on or after ++ * SPA_VERSION_RECVD_PROPS. ++ */ ++ dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ? ++ DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); ++ return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags)); ++} ++ ++void ++dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) ++{ ++ nvlist_t *propval; ++ const char *propname = zfs_prop_to_name(prop); ++ uint64_t default_value; ++ ++ if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { ++ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); ++ return; ++ } ++ ++ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); ++ /* Indicate the default source if we can. */ ++ if (dodefault(propname, 8, 1, &default_value) == 0 && ++ value == default_value) { ++ VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0); ++ } ++ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); ++ nvlist_free(propval); ++} ++ ++void ++dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) ++{ ++ nvlist_t *propval; ++ const char *propname = zfs_prop_to_name(prop); ++ ++ if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { ++ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); ++ return; ++ } ++ ++ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); ++ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); ++ nvlist_free(propval); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(dsl_prop_register); ++EXPORT_SYMBOL(dsl_prop_unregister); ++EXPORT_SYMBOL(dsl_prop_numcb); ++EXPORT_SYMBOL(dsl_prop_set); ++EXPORT_SYMBOL(dsl_prop_get); ++EXPORT_SYMBOL(dsl_prop_get_integer); ++EXPORT_SYMBOL(dsl_prop_get_all); ++EXPORT_SYMBOL(dsl_prop_get_received); ++EXPORT_SYMBOL(dsl_prop_get_ds); ++EXPORT_SYMBOL(dsl_prop_get_dd); ++EXPORT_SYMBOL(dsl_prop_nvlist_add_uint64); ++EXPORT_SYMBOL(dsl_prop_nvlist_add_string); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dsl_scan.c linux-3.2.33-go/fs/zfs/zfs/dsl_scan.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dsl_scan.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dsl_scan.c 2012-11-16 23:25:34.353039289 +0100 +@@ -0,0 +1,1814 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef _KERNEL ++#include ++#endif ++ ++typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); ++ ++static scan_cb_t dsl_scan_scrub_cb; ++static dsl_syncfunc_t dsl_scan_cancel_sync; ++static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); ++ ++int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ ++int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ ++int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ ++int zfs_scan_idle = 50; /* idle window in clock ticks */ ++ ++int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ ++int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ ++int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ ++int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ ++int zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ ++enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; ++int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */ ++ ++#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ++ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ ++ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) ++ ++/* the order has to match pool_scan_type */ ++static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { ++ NULL, ++ dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ ++ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ ++}; ++ ++int ++dsl_scan_init(dsl_pool_t *dp, uint64_t txg) ++{ ++ int err; ++ dsl_scan_t *scn; ++ spa_t *spa = dp->dp_spa; ++ uint64_t f; ++ ++ scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); ++ scn->scn_dp = dp; ++ ++ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, ++ "scrub_func", sizeof (uint64_t), 1, &f); ++ if (err == 0) { ++ /* ++ * There was an old-style scrub in progress. Restart a ++ * new-style scrub from the beginning. ++ */ ++ scn->scn_restart_txg = txg; ++ zfs_dbgmsg("old-style scrub was in progress; " ++ "restarting new-style scrub in txg %llu", ++ scn->scn_restart_txg); ++ ++ /* ++ * Load the queue obj from the old location so that it ++ * can be freed by dsl_scan_done(). ++ */ ++ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, ++ "scrub_queue", sizeof (uint64_t), 1, ++ &scn->scn_phys.scn_queue_obj); ++ } else { ++ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, ++ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, ++ &scn->scn_phys); ++ if (err == ENOENT) ++ return (0); ++ else if (err) ++ return (err); ++ ++ if (scn->scn_phys.scn_state == DSS_SCANNING && ++ spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { ++ /* ++ * A new-type scrub was in progress on an old ++ * pool, and the pool was accessed by old ++ * software. Restart from the beginning, since ++ * the old software may have changed the pool in ++ * the meantime. 
++ */ ++ scn->scn_restart_txg = txg; ++ zfs_dbgmsg("new-style scrub was modified " ++ "by old software; restarting in txg %llu", ++ scn->scn_restart_txg); ++ } ++ } ++ ++ spa_scan_stat_init(spa); ++ return (0); ++} ++ ++void ++dsl_scan_fini(dsl_pool_t *dp) ++{ ++ if (dp->dp_scan) { ++ kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); ++ dp->dp_scan = NULL; ++ } ++} ++ ++/* ARGSUSED */ ++static int ++dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_scan_t *scn = arg1; ++ ++ if (scn->scn_phys.scn_state == DSS_SCANNING) ++ return (EBUSY); ++ ++ return (0); ++} ++ ++/* ARGSUSED */ ++static void ++dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_scan_t *scn = arg1; ++ pool_scan_func_t *funcp = arg2; ++ dmu_object_type_t ot = 0; ++ dsl_pool_t *dp = scn->scn_dp; ++ spa_t *spa = dp->dp_spa; ++ ++ ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); ++ ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); ++ bzero(&scn->scn_phys, sizeof (scn->scn_phys)); ++ scn->scn_phys.scn_func = *funcp; ++ scn->scn_phys.scn_state = DSS_SCANNING; ++ scn->scn_phys.scn_min_txg = 0; ++ scn->scn_phys.scn_max_txg = tx->tx_txg; ++ scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ ++ scn->scn_phys.scn_start_time = gethrestime_sec(); ++ scn->scn_phys.scn_errors = 0; ++ scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; ++ scn->scn_restart_txg = 0; ++ spa_scan_stat_init(spa); ++ ++ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { ++ scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; ++ ++ /* rewrite all disk labels */ ++ vdev_config_dirty(spa->spa_root_vdev); ++ ++ if (vdev_resilver_needed(spa->spa_root_vdev, ++ &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { ++ spa_event_notify(spa, NULL, FM_EREPORT_ZFS_RESILVER_START); ++ } else { ++ spa_event_notify(spa, NULL, FM_EREPORT_ZFS_SCRUB_START); ++ } ++ ++ spa->spa_scrub_started = B_TRUE; ++ /* ++ * If this is an incremental scrub, limit the DDT scrub phase ++ * to just the auto-ditto class (for correctness); the rest ++ * of the scrub should go faster using top-down pruning. ++ */ ++ if (scn->scn_phys.scn_min_txg > TXG_INITIAL) ++ scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; ++ ++ } ++ ++ /* back to the generic stuff */ ++ ++ if (dp->dp_blkstats == NULL) { ++ dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t), ++ KM_PUSHPAGE | KM_NODEBUG); ++ } ++ bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); ++ ++ if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ++ ot = DMU_OT_ZAP_OTHER; ++ ++ scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ++ ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); ++ ++ dsl_scan_sync_state(scn, tx); ++ ++ spa_history_log_internal(LOG_POOL_SCAN, spa, tx, ++ "func=%u mintxg=%llu maxtxg=%llu", ++ *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); ++} ++ ++/* ARGSUSED */ ++static void ++dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) ++{ ++ static const char *old_names[] = { ++ "scrub_bookmark", ++ "scrub_ddt_bookmark", ++ "scrub_ddt_class_max", ++ "scrub_queue", ++ "scrub_min_txg", ++ "scrub_max_txg", ++ "scrub_func", ++ "scrub_errors", ++ NULL ++ }; ++ ++ dsl_pool_t *dp = scn->scn_dp; ++ spa_t *spa = dp->dp_spa; ++ int i; ++ ++ /* Remove any remnants of an old-style scrub. 
*/ ++ for (i = 0; old_names[i]; i++) { ++ (void) zap_remove(dp->dp_meta_objset, ++ DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); ++ } ++ ++ if (scn->scn_phys.scn_queue_obj != 0) { ++ VERIFY(0 == dmu_object_free(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, tx)); ++ scn->scn_phys.scn_queue_obj = 0; ++ } ++ ++ /* ++ * If we were "restarted" from a stopped state, don't bother ++ * with anything else. ++ */ ++ if (scn->scn_phys.scn_state != DSS_SCANNING) ++ return; ++ ++ if (complete) ++ scn->scn_phys.scn_state = DSS_FINISHED; ++ else ++ scn->scn_phys.scn_state = DSS_CANCELED; ++ ++ spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx, ++ "complete=%u", complete); ++ ++ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { ++ mutex_enter(&spa->spa_scrub_lock); ++ while (spa->spa_scrub_inflight > 0) { ++ cv_wait(&spa->spa_scrub_io_cv, ++ &spa->spa_scrub_lock); ++ } ++ mutex_exit(&spa->spa_scrub_lock); ++ spa->spa_scrub_started = B_FALSE; ++ spa->spa_scrub_active = B_FALSE; ++ ++ /* ++ * If the scrub/resilver completed, update all DTLs to ++ * reflect this. Whether it succeeded or not, vacate ++ * all temporary scrub DTLs. ++ */ ++ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, ++ complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); ++ if (complete) { ++ spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ? ++ FM_EREPORT_ZFS_RESILVER_FINISH : ++ FM_EREPORT_ZFS_SCRUB_FINISH); ++ } ++ spa_errlog_rotate(spa); ++ ++ /* ++ * We may have finished replacing a device. ++ * Let the async thread assess this and handle the detach. ++ */ ++ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); ++ } ++ ++ scn->scn_phys.scn_end_time = gethrestime_sec(); ++} ++ ++/* ARGSUSED */ ++static int ++dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_scan_t *scn = arg1; ++ ++ if (scn->scn_phys.scn_state != DSS_SCANNING) ++ return (ENOENT); ++ return (0); ++} ++ ++/* ARGSUSED */ ++static void ++dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ dsl_scan_t *scn = arg1; ++ ++ dsl_scan_done(scn, B_FALSE, tx); ++ dsl_scan_sync_state(scn, tx); ++} ++ ++int ++dsl_scan_cancel(dsl_pool_t *dp) ++{ ++ boolean_t complete = B_FALSE; ++ int err; ++ ++ err = dsl_sync_task_do(dp, dsl_scan_cancel_check, ++ dsl_scan_cancel_sync, dp->dp_scan, &complete, 3); ++ return (err); ++} ++ ++static void dsl_scan_visitbp(blkptr_t *bp, ++ const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf, ++ dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, ++ dmu_tx_t *tx); ++inline __attribute__((always_inline)) static void dsl_scan_visitdnode( ++ dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, ++ dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx); ++ ++void ++dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) ++{ ++ zio_free(dp->dp_spa, txg, bp); ++} ++ ++void ++dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) ++{ ++ ASSERT(dsl_pool_sync_context(dp)); ++ zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); ++} ++ ++int ++dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf, ++ arc_done_func_t *done, void *private, int priority, int zio_flags, ++ uint32_t *arc_flags, const zbookmark_t *zb) ++{ ++ return (arc_read(pio, spa, bpp, pbuf, done, private, ++ priority, zio_flags, arc_flags, zb)); ++} ++ ++int ++dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, ++ arc_done_func_t *done, void *private, int priority, int zio_flags, ++ uint32_t *arc_flags, const zbookmark_t *zb) ++{ ++ return (arc_read_nolock(pio, spa, bpp, 
done, private, ++ priority, zio_flags, arc_flags, zb)); ++} ++ ++static boolean_t ++bookmark_is_zero(const zbookmark_t *zb) ++{ ++ return (zb->zb_objset == 0 && zb->zb_object == 0 && ++ zb->zb_level == 0 && zb->zb_blkid == 0); ++} ++ ++/* dnp is the dnode for zb1->zb_object */ ++static boolean_t ++bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, ++ const zbookmark_t *zb2) ++{ ++ uint64_t zb1nextL0, zb2thisobj; ++ ++ ASSERT(zb1->zb_objset == zb2->zb_objset); ++ ASSERT(zb2->zb_level == 0); ++ ++ /* ++ * A bookmark in the deadlist is considered to be after ++ * everything else. ++ */ ++ if (zb2->zb_object == DMU_DEADLIST_OBJECT) ++ return (B_TRUE); ++ ++ /* The objset_phys_t isn't before anything. */ ++ if (dnp == NULL) ++ return (B_FALSE); ++ ++ zb1nextL0 = (zb1->zb_blkid + 1) << ++ ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); ++ ++ zb2thisobj = zb2->zb_object ? zb2->zb_object : ++ zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); ++ ++ if (zb1->zb_object == DMU_META_DNODE_OBJECT) { ++ uint64_t nextobj = zb1nextL0 * ++ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; ++ return (nextobj <= zb2thisobj); ++ } ++ ++ if (zb1->zb_object < zb2thisobj) ++ return (B_TRUE); ++ if (zb1->zb_object > zb2thisobj) ++ return (B_FALSE); ++ if (zb2->zb_object == DMU_META_DNODE_OBJECT) ++ return (B_FALSE); ++ return (zb1nextL0 <= zb2->zb_blkid); ++} ++ ++static uint64_t ++dsl_scan_ds_maxtxg(dsl_dataset_t *ds) ++{ ++ uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; ++ if (dsl_dataset_is_snapshot(ds)) ++ return (MIN(smt, ds->ds_phys->ds_creation_txg)); ++ return (smt); ++} ++ ++static void ++dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) ++{ ++ VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset, ++ DMU_POOL_DIRECTORY_OBJECT, ++ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, ++ &scn->scn_phys, tx)); ++} ++ ++static boolean_t ++dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) ++{ ++ uint64_t elapsed_nanosecs; ++ int mintime; ++ ++ /* we never skip user/group accounting objects */ ++ if (zb && (int64_t)zb->zb_object < 0) ++ return (B_FALSE); ++ ++ if (scn->scn_pausing) ++ return (B_TRUE); /* we're already pausing */ ++ ++ if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark)) ++ return (B_FALSE); /* we're resuming */ ++ ++ /* We only know how to resume from level-0 blocks. */ ++ if (zb && zb->zb_level != 0) ++ return (B_FALSE); ++ ++ mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 
++ zfs_resilver_min_time_ms : zfs_scan_min_time_ms; ++ elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; ++ if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || ++ (elapsed_nanosecs / MICROSEC > mintime && ++ txg_sync_waiting(scn->scn_dp)) || ++ spa_shutting_down(scn->scn_dp->dp_spa)) { ++ if (zb) { ++ dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", ++ (longlong_t)zb->zb_objset, ++ (longlong_t)zb->zb_object, ++ (longlong_t)zb->zb_level, ++ (longlong_t)zb->zb_blkid); ++ scn->scn_phys.scn_bookmark = *zb; ++ } ++ dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", ++ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, ++ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, ++ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, ++ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); ++ scn->scn_pausing = B_TRUE; ++ return (B_TRUE); ++ } ++ return (B_FALSE); ++} ++ ++typedef struct zil_scan_arg { ++ dsl_pool_t *zsa_dp; ++ zil_header_t *zsa_zh; ++} zil_scan_arg_t; ++ ++/* ARGSUSED */ ++static int ++dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) ++{ ++ zil_scan_arg_t *zsa = arg; ++ dsl_pool_t *dp = zsa->zsa_dp; ++ dsl_scan_t *scn = dp->dp_scan; ++ zil_header_t *zh = zsa->zsa_zh; ++ zbookmark_t zb; ++ ++ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) ++ return (0); ++ ++ /* ++ * One block ("stubby") can be allocated a long time ago; we ++ * want to visit that one because it has been allocated ++ * (on-disk) even if it hasn't been claimed (even though for ++ * scrub there's nothing to do to it). ++ */ ++ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) ++ return (0); ++ ++ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], ++ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); ++ ++ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); ++ return (0); ++} ++ ++/* ARGSUSED */ ++static int ++dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) ++{ ++ if (lrc->lrc_txtype == TX_WRITE) { ++ zil_scan_arg_t *zsa = arg; ++ dsl_pool_t *dp = zsa->zsa_dp; ++ dsl_scan_t *scn = dp->dp_scan; ++ zil_header_t *zh = zsa->zsa_zh; ++ lr_write_t *lr = (lr_write_t *)lrc; ++ blkptr_t *bp = &lr->lr_blkptr; ++ zbookmark_t zb; ++ ++ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) ++ return (0); ++ ++ /* ++ * birth can be < claim_txg if this record's txg is ++ * already txg sync'ed (but this log block contains ++ * other records that are not synced) ++ */ ++ if (claim_txg == 0 || bp->blk_birth < claim_txg) ++ return (0); ++ ++ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], ++ lr->lr_foid, ZB_ZIL_LEVEL, ++ lr->lr_offset / BP_GET_LSIZE(bp)); ++ ++ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); ++ } ++ return (0); ++} ++ ++static void ++dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) ++{ ++ uint64_t claim_txg = zh->zh_claim_txg; ++ zil_scan_arg_t zsa = { dp, zh }; ++ zilog_t *zilog; ++ ++ /* ++ * We only want to visit blocks that have been claimed but not yet ++ * replayed (or, in read-only mode, blocks that *would* be claimed). 
++ */ ++ if (claim_txg == 0 && spa_writeable(dp->dp_spa)) ++ return; ++ ++ zilog = zil_alloc(dp->dp_meta_objset, zh); ++ ++ (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, ++ claim_txg); ++ ++ zil_free(zilog); ++} ++ ++/* ARGSUSED */ ++static void ++dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, ++ uint64_t objset, uint64_t object, uint64_t blkid) ++{ ++ zbookmark_t czb; ++ uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; ++ ++ if (zfs_no_scrub_prefetch) ++ return; ++ ++ if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || ++ (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) ++ return; ++ ++ SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); ++ ++ /* ++ * XXX need to make sure all of these arc_read() prefetches are ++ * done before setting xlateall (similar to dsl_read()) ++ */ ++ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, ++ buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ++ ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb); ++} ++ ++static boolean_t ++dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, ++ const zbookmark_t *zb) ++{ ++ /* ++ * We never skip over user/group accounting objects (obj<0) ++ */ ++ if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) && ++ (int64_t)zb->zb_object >= 0) { ++ /* ++ * If we already visited this bp & everything below (in ++ * a prior txg sync), don't bother doing it again. ++ */ ++ if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) ++ return (B_TRUE); ++ ++ /* ++ * If we found the block we're trying to resume from, or ++ * we went past it to a different object, zero it out to ++ * indicate that it's OK to start checking for pausing ++ * again. ++ */ ++ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || ++ zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { ++ dprintf("resuming at %llx/%llx/%llx/%llx\n", ++ (longlong_t)zb->zb_objset, ++ (longlong_t)zb->zb_object, ++ (longlong_t)zb->zb_level, ++ (longlong_t)zb->zb_blkid); ++ bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); ++ } ++ } ++ return (B_FALSE); ++} ++ ++/* ++ * Return nonzero on i/o error. ++ * Return new buf to write out in *bufp. 
++ */ ++inline __attribute__((always_inline)) static int ++dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, ++ dnode_phys_t *dnp, const blkptr_t *bp, ++ const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp) ++{ ++ dsl_pool_t *dp = scn->scn_dp; ++ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; ++ int err; ++ ++ if (BP_GET_LEVEL(bp) > 0) { ++ uint32_t flags = ARC_WAIT; ++ int i; ++ blkptr_t *cbp; ++ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; ++ ++ err = arc_read_nolock(NULL, dp->dp_spa, bp, ++ arc_getbuf_func, bufp, ++ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); ++ if (err) { ++ scn->scn_phys.scn_errors++; ++ return (err); ++ } ++ for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) { ++ dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset, ++ zb->zb_object, zb->zb_blkid * epb + i); ++ } ++ for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) { ++ zbookmark_t czb; ++ ++ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, ++ zb->zb_level - 1, ++ zb->zb_blkid * epb + i); ++ dsl_scan_visitbp(cbp, &czb, dnp, ++ *bufp, ds, scn, ostype, tx); ++ } ++ } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) { ++ uint32_t flags = ARC_WAIT; ++ ++ err = arc_read_nolock(NULL, dp->dp_spa, bp, ++ arc_getbuf_func, bufp, ++ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); ++ if (err) { ++ scn->scn_phys.scn_errors++; ++ return (err); ++ } ++ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { ++ uint32_t flags = ARC_WAIT; ++ dnode_phys_t *cdnp; ++ int i, j; ++ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; ++ ++ err = arc_read_nolock(NULL, dp->dp_spa, bp, ++ arc_getbuf_func, bufp, ++ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); ++ if (err) { ++ scn->scn_phys.scn_errors++; ++ return (err); ++ } ++ for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) { ++ for (j = 0; j < cdnp->dn_nblkptr; j++) { ++ blkptr_t *cbp = &cdnp->dn_blkptr[j]; ++ dsl_scan_prefetch(scn, *bufp, cbp, ++ zb->zb_objset, zb->zb_blkid * epb + i, j); ++ } ++ } ++ for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) { ++ dsl_scan_visitdnode(scn, ds, ostype, ++ cdnp, *bufp, zb->zb_blkid * epb + i, tx); ++ } ++ ++ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { ++ uint32_t flags = ARC_WAIT; ++ objset_phys_t *osp; ++ ++ err = arc_read_nolock(NULL, dp->dp_spa, bp, ++ arc_getbuf_func, bufp, ++ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); ++ if (err) { ++ scn->scn_phys.scn_errors++; ++ return (err); ++ } ++ ++ osp = (*bufp)->b_data; ++ ++ dsl_scan_visitdnode(scn, ds, osp->os_type, ++ &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx); ++ ++ if (OBJSET_BUF_HAS_USERUSED(*bufp)) { ++ /* ++ * We also always visit user/group accounting ++ * objects, and never skip them, even if we are ++ * pausing. This is necessary so that the space ++ * deltas from this txg get integrated. ++ */ ++ dsl_scan_visitdnode(scn, ds, osp->os_type, ++ &osp->os_groupused_dnode, *bufp, ++ DMU_GROUPUSED_OBJECT, tx); ++ dsl_scan_visitdnode(scn, ds, osp->os_type, ++ &osp->os_userused_dnode, *bufp, ++ DMU_USERUSED_OBJECT, tx); ++ } ++ } ++ ++ return (0); ++} ++ ++inline __attribute__((always_inline)) static void ++dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, ++ dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf, ++ uint64_t object, dmu_tx_t *tx) ++{ ++ int j; ++ ++ for (j = 0; j < dnp->dn_nblkptr; j++) { ++ zbookmark_t czb; ++ ++ SET_BOOKMARK(&czb, ds ? 
ds->ds_object : 0, object, ++ dnp->dn_nlevels - 1, j); ++ dsl_scan_visitbp(&dnp->dn_blkptr[j], ++ &czb, dnp, buf, ds, scn, ostype, tx); ++ } ++ ++ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { ++ zbookmark_t czb; ++ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, ++ 0, DMU_SPILL_BLKID); ++ dsl_scan_visitbp(&dnp->dn_spill, ++ &czb, dnp, buf, ds, scn, ostype, tx); ++ } ++} ++ ++/* ++ * The arguments are in this order because mdb can only print the ++ * first 5; we want them to be useful. ++ */ ++static void ++dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, ++ dnode_phys_t *dnp, arc_buf_t *pbuf, ++ dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, ++ dmu_tx_t *tx) ++{ ++ dsl_pool_t *dp = scn->scn_dp; ++ arc_buf_t *buf = NULL; ++ blkptr_t *bp_toread; ++ ++ bp_toread = kmem_alloc(sizeof (blkptr_t), KM_PUSHPAGE); ++ *bp_toread = *bp; ++ ++ /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ ++ ++ if (dsl_scan_check_pause(scn, zb)) ++ goto out; ++ ++ if (dsl_scan_check_resume(scn, dnp, zb)) ++ goto out; ++ ++ if (bp->blk_birth == 0) ++ goto out; ++ ++ scn->scn_visited_this_txg++; ++ ++ /* ++ * This debugging is commented out to conserve stack space. This ++ * function is called recursively and the debugging addes several ++ * bytes to the stack for each call. It can be commented back in ++ * if required to debug an issue in dsl_scan_visitbp(). ++ * ++ * dprintf_bp(bp, ++ * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p", ++ * ds, ds ? ds->ds_object : 0, ++ * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, ++ * pbuf, bp); ++ */ ++ ++ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) ++ goto out; ++ ++ if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) { ++ /* ++ * For non-user-accounting blocks, we need to read the ++ * new bp (from a deleted snapshot, found in ++ * check_existing_xlation). If we used the old bp, ++ * pointers inside this block from before we resumed ++ * would be untranslated. ++ * ++ * For user-accounting blocks, we need to read the old ++ * bp, because we will apply the entire space delta to ++ * it (original untranslated -> translations from ++ * deleted snap -> now). ++ */ ++ *bp_toread = *bp; ++ } ++ ++ if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx, ++ &buf) != 0) ++ goto out; ++ ++ /* ++ * If dsl_scan_ddt() has aready visited this block, it will have ++ * already done any translations or scrubbing, so don't call the ++ * callback again. ++ */ ++ if (ddt_class_contains(dp->dp_spa, ++ scn->scn_phys.scn_ddt_class_max, bp)) { ++ ASSERT(buf == NULL); ++ goto out; ++ } ++ ++ /* ++ * If this block is from the future (after cur_max_txg), then we ++ * are doing this on behalf of a deleted snapshot, and we will ++ * revisit the future block on the next pass of this dataset. ++ * Don't scan it now unless we need to because something ++ * under it was modified. ++ */ ++ if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) { ++ scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); ++ } ++ if (buf) ++ (void) arc_buf_remove_ref(buf, &buf); ++out: ++ kmem_free(bp_toread, sizeof(blkptr_t)); ++} ++ ++static void ++dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, ++ dmu_tx_t *tx) ++{ ++ zbookmark_t zb; ++ ++ SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, ++ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); ++ dsl_scan_visitbp(bp, &zb, NULL, NULL, ++ ds, scn, DMU_OST_NONE, tx); ++ ++ dprintf_ds(ds, "finished scan%s", ""); ++} ++ ++void ++dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) ++{ ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ dsl_scan_t *scn = dp->dp_scan; ++ uint64_t mintxg; ++ ++ if (scn->scn_phys.scn_state != DSS_SCANNING) ++ return; ++ ++ if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { ++ if (dsl_dataset_is_snapshot(ds)) { ++ /* Note, scn_cur_{min,max}_txg stays the same. */ ++ scn->scn_phys.scn_bookmark.zb_objset = ++ ds->ds_phys->ds_next_snap_obj; ++ zfs_dbgmsg("destroying ds %llu; currently traversing; " ++ "reset zb_objset to %llu", ++ (u_longlong_t)ds->ds_object, ++ (u_longlong_t)ds->ds_phys->ds_next_snap_obj); ++ scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; ++ } else { ++ SET_BOOKMARK(&scn->scn_phys.scn_bookmark, ++ ZB_DESTROYED_OBJSET, 0, 0, 0); ++ zfs_dbgmsg("destroying ds %llu; currently traversing; " ++ "reset bookmark to -1,0,0,0", ++ (u_longlong_t)ds->ds_object); ++ } ++ } else if (zap_lookup_int_key(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { ++ ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); ++ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); ++ if (dsl_dataset_is_snapshot(ds)) { ++ /* ++ * We keep the same mintxg; it could be > ++ * ds_creation_txg if the previous snapshot was ++ * deleted too. ++ */ ++ VERIFY(zap_add_int_key(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ++ ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0); ++ zfs_dbgmsg("destroying ds %llu; in queue; " ++ "replacing with %llu", ++ (u_longlong_t)ds->ds_object, ++ (u_longlong_t)ds->ds_phys->ds_next_snap_obj); ++ } else { ++ zfs_dbgmsg("destroying ds %llu; in queue; removing", ++ (u_longlong_t)ds->ds_object); ++ } ++ } else { ++ zfs_dbgmsg("destroying ds %llu; ignoring", ++ (u_longlong_t)ds->ds_object); ++ } ++ ++ /* ++ * dsl_scan_sync() should be called after this, and should sync ++ * out our changed state, but just to be safe, do it here. 
++ */ ++ dsl_scan_sync_state(scn, tx); ++} ++ ++void ++dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) ++{ ++ dsl_pool_t *dp = ds->ds_dir->dd_pool; ++ dsl_scan_t *scn = dp->dp_scan; ++ uint64_t mintxg; ++ ++ if (scn->scn_phys.scn_state != DSS_SCANNING) ++ return; ++ ++ ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); ++ ++ if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { ++ scn->scn_phys.scn_bookmark.zb_objset = ++ ds->ds_phys->ds_prev_snap_obj; ++ zfs_dbgmsg("snapshotting ds %llu; currently traversing; " ++ "reset zb_objset to %llu", ++ (u_longlong_t)ds->ds_object, ++ (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); ++ } else if (zap_lookup_int_key(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { ++ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); ++ VERIFY(zap_add_int_key(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ++ ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0); ++ zfs_dbgmsg("snapshotting ds %llu; in queue; " ++ "replacing with %llu", ++ (u_longlong_t)ds->ds_object, ++ (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); ++ } ++ dsl_scan_sync_state(scn, tx); ++} ++ ++void ++dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) ++{ ++ dsl_pool_t *dp = ds1->ds_dir->dd_pool; ++ dsl_scan_t *scn = dp->dp_scan; ++ uint64_t mintxg; ++ ++ if (scn->scn_phys.scn_state != DSS_SCANNING) ++ return; ++ ++ if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) { ++ scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; ++ zfs_dbgmsg("clone_swap ds %llu; currently traversing; " ++ "reset zb_objset to %llu", ++ (u_longlong_t)ds1->ds_object, ++ (u_longlong_t)ds2->ds_object); ++ } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { ++ scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; ++ zfs_dbgmsg("clone_swap ds %llu; currently traversing; " ++ "reset zb_objset to %llu", ++ (u_longlong_t)ds2->ds_object, ++ (u_longlong_t)ds1->ds_object); ++ } ++ ++ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ++ ds1->ds_object, &mintxg) == 0) { ++ int err; ++ ++ ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); ++ ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); ++ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); ++ err = zap_add_int_key(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); ++ VERIFY(err == 0 || err == EEXIST); ++ if (err == EEXIST) { ++ /* Both were there to begin with */ ++ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ++ ds1->ds_object, mintxg, tx)); ++ } ++ zfs_dbgmsg("clone_swap ds %llu; in queue; " ++ "replacing with %llu", ++ (u_longlong_t)ds1->ds_object, ++ (u_longlong_t)ds2->ds_object); ++ } else if (zap_lookup_int_key(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { ++ ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); ++ ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); ++ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); ++ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); ++ zfs_dbgmsg("clone_swap ds %llu; in queue; " ++ "replacing with %llu", ++ (u_longlong_t)ds2->ds_object, ++ (u_longlong_t)ds1->ds_object); ++ } ++ ++ dsl_scan_sync_state(scn, tx); ++} ++ ++struct enqueue_clones_arg { ++ dmu_tx_t *tx; ++ uint64_t originobj; ++}; 
++ ++/* ARGSUSED */ ++static int ++enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) ++{ ++ struct enqueue_clones_arg *eca = arg; ++ dsl_dataset_t *ds; ++ int err; ++ dsl_pool_t *dp = spa->spa_dsl_pool; ++ dsl_scan_t *scn = dp->dp_scan; ++ ++ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); ++ if (err) ++ return (err); ++ ++ if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { ++ while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { ++ dsl_dataset_t *prev; ++ err = dsl_dataset_hold_obj(dp, ++ ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); ++ ++ dsl_dataset_rele(ds, FTAG); ++ if (err) ++ return (err); ++ ds = prev; ++ } ++ VERIFY(zap_add_int_key(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ds->ds_object, ++ ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); ++ } ++ dsl_dataset_rele(ds, FTAG); ++ return (0); ++} ++ ++static void ++dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) ++{ ++ dsl_pool_t *dp = scn->scn_dp; ++ dsl_dataset_t *ds; ++ objset_t *os; ++ char *dsname; ++ ++ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); ++ ++ if (dmu_objset_from_ds(ds, &os)) ++ goto out; ++ ++ /* ++ * Only the ZIL in the head (non-snapshot) is valid. Even though ++ * snapshots can have ZIL block pointers (which may be the same ++ * BP as in the head), they must be ignored. So we traverse the ++ * ZIL here, rather than in scan_recurse(), because the regular ++ * snapshot block-sharing rules don't apply to it. ++ */ ++ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds)) ++ dsl_scan_zil(dp, &os->os_zil_header); ++ ++ /* ++ * Iterate over the bps in this ds. ++ */ ++ dmu_buf_will_dirty(ds->ds_dbuf, tx); ++ dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx); ++ ++ dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_PUSHPAGE); ++ dsl_dataset_name(ds, dsname); ++ zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " ++ "pausing=%u", ++ (longlong_t)dsobj, dsname, ++ (longlong_t)scn->scn_phys.scn_cur_min_txg, ++ (longlong_t)scn->scn_phys.scn_cur_max_txg, ++ (int)scn->scn_pausing); ++ kmem_free(dsname, ZFS_MAXNAMELEN); ++ ++ if (scn->scn_pausing) ++ goto out; ++ ++ /* ++ * We've finished this pass over this dataset. ++ */ ++ ++ /* ++ * If we did not completely visit this dataset, do another pass. ++ */ ++ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { ++ zfs_dbgmsg("incomplete pass; visiting again"); ++ scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; ++ VERIFY(zap_add_int_key(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ds->ds_object, ++ scn->scn_phys.scn_cur_max_txg, tx) == 0); ++ goto out; ++ } ++ ++ /* ++ * Add descendent datasets to work queue. ++ */ ++ if (ds->ds_phys->ds_next_snap_obj != 0) { ++ VERIFY(zap_add_int_key(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj, ++ ds->ds_phys->ds_creation_txg, tx) == 0); ++ } ++ if (ds->ds_phys->ds_num_children > 1) { ++ boolean_t usenext = B_FALSE; ++ if (ds->ds_phys->ds_next_clones_obj != 0) { ++ uint64_t count; ++ /* ++ * A bug in a previous version of the code could ++ * cause upgrade_clones_cb() to not set ++ * ds_next_snap_obj when it should, leading to a ++ * missing entry. Therefore we can only use the ++ * next_clones_obj when its count is correct. 
++ */ ++ int err = zap_count(dp->dp_meta_objset, ++ ds->ds_phys->ds_next_clones_obj, &count); ++ if (err == 0 && ++ count == ds->ds_phys->ds_num_children - 1) ++ usenext = B_TRUE; ++ } ++ ++ if (usenext) { ++ VERIFY(zap_join_key(dp->dp_meta_objset, ++ ds->ds_phys->ds_next_clones_obj, ++ scn->scn_phys.scn_queue_obj, ++ ds->ds_phys->ds_creation_txg, tx) == 0); ++ } else { ++ struct enqueue_clones_arg eca; ++ eca.tx = tx; ++ eca.originobj = ds->ds_object; ++ ++ (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, ++ NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); ++ } ++ } ++ ++out: ++ dsl_dataset_rele(ds, FTAG); ++} ++ ++/* ARGSUSED */ ++static int ++enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) ++{ ++ dmu_tx_t *tx = arg; ++ dsl_dataset_t *ds; ++ int err; ++ dsl_pool_t *dp = spa->spa_dsl_pool; ++ dsl_scan_t *scn = dp->dp_scan; ++ ++ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); ++ if (err) ++ return (err); ++ ++ while (ds->ds_phys->ds_prev_snap_obj != 0) { ++ dsl_dataset_t *prev; ++ err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, ++ FTAG, &prev); ++ if (err) { ++ dsl_dataset_rele(ds, FTAG); ++ return (err); ++ } ++ ++ /* ++ * If this is a clone, we don't need to worry about it for now. ++ */ ++ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { ++ dsl_dataset_rele(ds, FTAG); ++ dsl_dataset_rele(prev, FTAG); ++ return (0); ++ } ++ dsl_dataset_rele(ds, FTAG); ++ ds = prev; ++ } ++ ++ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ++ ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0); ++ dsl_dataset_rele(ds, FTAG); ++ return (0); ++} ++ ++/* ++ * Scrub/dedup interaction. ++ * ++ * If there are N references to a deduped block, we don't want to scrub it ++ * N times -- ideally, we should scrub it exactly once. ++ * ++ * We leverage the fact that the dde's replication class (enum ddt_class) ++ * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest ++ * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. ++ * ++ * To prevent excess scrubbing, the scrub begins by walking the DDT ++ * to find all blocks with refcnt > 1, and scrubs each of these once. ++ * Since there are two replication classes which contain blocks with ++ * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. ++ * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. ++ * ++ * There would be nothing more to say if a block's refcnt couldn't change ++ * during a scrub, but of course it can so we must account for changes ++ * in a block's replication class. ++ * ++ * Here's an example of what can occur: ++ * ++ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 ++ * when visited during the top-down scrub phase, it will be scrubbed twice. ++ * This negates our scrub optimization, but is otherwise harmless. ++ * ++ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 ++ * on each visit during the top-down scrub phase, it will never be scrubbed. ++ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's ++ * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to ++ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 ++ * while a scrub is in progress, it scrubs the block right then. 
++ */ ++static void ++dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) ++{ ++ ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; ++ ddt_entry_t dde; ++ int error; ++ uint64_t n = 0; ++ ++ bzero(&dde, sizeof (ddt_entry_t)); ++ ++ while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { ++ ddt_t *ddt; ++ ++ if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) ++ break; ++ dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", ++ (longlong_t)ddb->ddb_class, ++ (longlong_t)ddb->ddb_type, ++ (longlong_t)ddb->ddb_checksum, ++ (longlong_t)ddb->ddb_cursor); ++ ++ /* There should be no pending changes to the dedup table */ ++ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ++ ASSERT(avl_first(&ddt->ddt_tree) == NULL); ++ ++ dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); ++ n++; ++ ++ if (dsl_scan_check_pause(scn, NULL)) ++ break; ++ } ++ ++ zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u", ++ (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max, ++ (int)scn->scn_pausing); ++ ++ ASSERT(error == 0 || error == ENOENT); ++ ASSERT(error != ENOENT || ++ ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); ++} ++ ++/* ARGSUSED */ ++void ++dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ++ ddt_entry_t *dde, dmu_tx_t *tx) ++{ ++ const ddt_key_t *ddk = &dde->dde_key; ++ ddt_phys_t *ddp = dde->dde_phys; ++ blkptr_t bp; ++ zbookmark_t zb = { 0 }; ++ int p; ++ ++ if (scn->scn_phys.scn_state != DSS_SCANNING) ++ return; ++ ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ++ if (ddp->ddp_phys_birth == 0 || ++ ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg) ++ continue; ++ ddt_bp_create(checksum, ddk, ddp, &bp); ++ ++ scn->scn_visited_this_txg++; ++ scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); ++ } ++} ++ ++static void ++dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) ++{ ++ dsl_pool_t *dp = scn->scn_dp; ++ zap_cursor_t *zc; ++ zap_attribute_t *za; ++ ++ if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= ++ scn->scn_phys.scn_ddt_class_max) { ++ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; ++ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; ++ dsl_scan_ddt(scn, tx); ++ if (scn->scn_pausing) ++ return; ++ } ++ ++ if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { ++ /* First do the MOS & ORIGIN */ ++ ++ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; ++ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; ++ dsl_scan_visit_rootbp(scn, NULL, ++ &dp->dp_meta_rootbp, tx); ++ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); ++ if (scn->scn_pausing) ++ return; ++ ++ if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { ++ VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, ++ NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); ++ } else { ++ dsl_scan_visitds(scn, ++ dp->dp_origin_snap->ds_object, tx); ++ } ++ ASSERT(!scn->scn_pausing); ++ } else if (scn->scn_phys.scn_bookmark.zb_objset != ++ ZB_DESTROYED_OBJSET) { ++ /* ++ * If we were paused, continue from here. Note if the ++ * ds we were paused on was deleted, the zb_objset may ++ * be -1, so we will skip this and find a new objset ++ * below. ++ */ ++ dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); ++ if (scn->scn_pausing) ++ return; ++ } ++ ++ /* ++ * In case we were paused right at the end of the ds, zero the ++ * bookmark so we don't think that we're still trying to resume. 
++ */ ++ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t)); ++ zc = kmem_alloc(sizeof(zap_cursor_t), KM_PUSHPAGE); ++ za = kmem_alloc(sizeof(zap_attribute_t), KM_PUSHPAGE); ++ ++ /* keep pulling things out of the zap-object-as-queue */ ++ while (zap_cursor_init(zc, dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj), ++ zap_cursor_retrieve(zc, za) == 0) { ++ dsl_dataset_t *ds; ++ uint64_t dsobj; ++ ++ dsobj = strtonum(za->za_name, NULL); ++ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, ++ scn->scn_phys.scn_queue_obj, dsobj, tx)); ++ ++ /* Set up min/max txg */ ++ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); ++ if (za->za_first_integer != 0) { ++ scn->scn_phys.scn_cur_min_txg = ++ MAX(scn->scn_phys.scn_min_txg, ++ za->za_first_integer); ++ } else { ++ scn->scn_phys.scn_cur_min_txg = ++ MAX(scn->scn_phys.scn_min_txg, ++ ds->ds_phys->ds_prev_snap_txg); ++ } ++ scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); ++ dsl_dataset_rele(ds, FTAG); ++ ++ dsl_scan_visitds(scn, dsobj, tx); ++ zap_cursor_fini(zc); ++ if (scn->scn_pausing) ++ goto out; ++ } ++ zap_cursor_fini(zc); ++out: ++ kmem_free(za, sizeof(zap_attribute_t)); ++ kmem_free(zc, sizeof(zap_cursor_t)); ++} ++ ++static int ++dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) ++{ ++ dsl_scan_t *scn = arg; ++ uint64_t elapsed_nanosecs; ++ ++ elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; ++ ++ if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || ++ (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && ++ txg_sync_waiting(scn->scn_dp)) || ++ spa_shutting_down(scn->scn_dp->dp_spa)) ++ return (ERESTART); ++ ++ zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, ++ dmu_tx_get_txg(tx), bp, 0)); ++ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, ++ -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), ++ -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); ++ scn->scn_visited_this_txg++; ++ return (0); ++} ++ ++boolean_t ++dsl_scan_active(dsl_scan_t *scn) ++{ ++ spa_t *spa = scn->scn_dp->dp_spa; ++ uint64_t used = 0, comp, uncomp; ++ ++ if (spa->spa_load_state != SPA_LOAD_NONE) ++ return (B_FALSE); ++ if (spa_shutting_down(spa)) ++ return (B_FALSE); ++ ++ if (scn->scn_phys.scn_state == DSS_SCANNING) ++ return (B_TRUE); ++ ++ if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { ++ (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, ++ &used, &comp, &uncomp); ++ } ++ return (used != 0); ++} ++ ++void ++dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) ++{ ++ dsl_scan_t *scn = dp->dp_scan; ++ spa_t *spa = dp->dp_spa; ++ int err; ++ ++ /* ++ * Check for scn_restart_txg before checking spa_load_state, so ++ * that we can restart an old-style scan while the pool is being ++ * imported (see dsl_scan_init). ++ */ ++ if (scn->scn_restart_txg != 0 && ++ scn->scn_restart_txg <= tx->tx_txg) { ++ pool_scan_func_t func = POOL_SCAN_SCRUB; ++ dsl_scan_done(scn, B_FALSE, tx); ++ if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) ++ func = POOL_SCAN_RESILVER; ++ zfs_dbgmsg("restarting scan func=%u txg=%llu", ++ func, tx->tx_txg); ++ dsl_scan_setup_sync(scn, &func, tx); ++ } ++ ++ if (!dsl_scan_active(scn) || ++ spa_sync_pass(dp->dp_spa) > 1) ++ return; ++ ++ scn->scn_visited_this_txg = 0; ++ scn->scn_pausing = B_FALSE; ++ scn->scn_sync_start_time = gethrtime(); ++ spa->spa_scrub_active = B_TRUE; ++ ++ /* ++ * First process the free list. If we pause the free, don't do ++ * any scanning. 
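dsl_scan_free_cb() above stops early by returning ERESTART once the pass has used up its time budget, and the unfinished work simply resumes on the next sync pass. A minimal standalone sketch of that budget-and-resume pattern follows, assuming POSIX clock_gettime() in place of gethrtime() and a plain counter in place of the bpobj cursor; all names and constants are hypothetical.

/* Illustrative sketch only: per-pass time budget with a resume point,
 * modelled on the ERESTART convention but with no ZFS machinery. */
#include <stdio.h>
#include <time.h>

#define BUDGET_NS       (5LL * 1000 * 1000)     /* 5 ms of work per pass */
#define NITEMS          100000

static volatile unsigned long sink;

static long long
now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ((long long)ts.tv_sec * 1000000000LL + ts.tv_nsec);
}

static void
free_one_block(void)
{
        unsigned long j;

        /* Stand-in for issuing one free: burn a little CPU per item. */
        for (j = 0; j < 1000; j++)
                sink += j;
}

int
main(void)
{
        int next = 0;   /* resume point carried between passes (the "bookmark") */
        int pass;

        for (pass = 0; next < NITEMS; pass++) {
                long long start = now_ns();
                int done = 0;

                /* Do work until this pass's time budget is exhausted. */
                while (next < NITEMS && now_ns() - start <= BUDGET_NS) {
                        free_one_block();
                        next++;
                        done++;
                }
                printf("pass %d: processed %d items, %d remaining\n",
                    pass, done, NITEMS - next);
        }
        return (0);
}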
This ensures that there is no free list when ++ * we are scanning, so the scan code doesn't have to worry about ++ * traversing it. ++ */ ++ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { ++ scn->scn_zio_root = zio_root(dp->dp_spa, NULL, ++ NULL, ZIO_FLAG_MUSTSUCCEED); ++ err = bpobj_iterate(&dp->dp_free_bpobj, ++ dsl_scan_free_cb, scn, tx); ++ VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); ++ if (scn->scn_visited_this_txg) { ++ zfs_dbgmsg("freed %llu blocks in %llums from " ++ "free_bpobj txg %llu", ++ (longlong_t)scn->scn_visited_this_txg, ++ (longlong_t) ++ (gethrtime() - scn->scn_sync_start_time) / MICROSEC, ++ (longlong_t)tx->tx_txg); ++ scn->scn_visited_this_txg = 0; ++ /* ++ * Re-sync the ddt so that we can further modify ++ * it when doing bprewrite. ++ */ ++ ddt_sync(spa, tx->tx_txg); ++ } ++ if (err == ERESTART) ++ return; ++ } ++ ++ if (scn->scn_phys.scn_state != DSS_SCANNING) ++ return; ++ ++ if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= ++ scn->scn_phys.scn_ddt_class_max) { ++ zfs_dbgmsg("doing scan sync txg %llu; " ++ "ddt bm=%llu/%llu/%llu/%llx", ++ (longlong_t)tx->tx_txg, ++ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, ++ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, ++ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, ++ (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); ++ ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); ++ ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); ++ ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); ++ ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0); ++ } else { ++ zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", ++ (longlong_t)tx->tx_txg, ++ (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, ++ (longlong_t)scn->scn_phys.scn_bookmark.zb_object, ++ (longlong_t)scn->scn_phys.scn_bookmark.zb_level, ++ (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); ++ } ++ ++ scn->scn_zio_root = zio_root(dp->dp_spa, NULL, ++ NULL, ZIO_FLAG_CANFAIL); ++ dsl_scan_visit(scn, tx); ++ (void) zio_wait(scn->scn_zio_root); ++ scn->scn_zio_root = NULL; ++ ++ zfs_dbgmsg("visited %llu blocks in %llums", ++ (longlong_t)scn->scn_visited_this_txg, ++ (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC); ++ ++ if (!scn->scn_pausing) { ++ /* finished with scan. */ ++ zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg); ++ dsl_scan_done(scn, B_TRUE, tx); ++ } ++ ++ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { ++ mutex_enter(&spa->spa_scrub_lock); ++ while (spa->spa_scrub_inflight > 0) { ++ cv_wait(&spa->spa_scrub_io_cv, ++ &spa->spa_scrub_lock); ++ } ++ mutex_exit(&spa->spa_scrub_lock); ++ } ++ ++ dsl_scan_sync_state(scn, tx); ++} ++ ++/* ++ * This will start a new scan, or restart an existing one. ++ */ ++void ++dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) ++{ ++ if (txg == 0) { ++ dmu_tx_t *tx; ++ tx = dmu_tx_create_dd(dp->dp_mos_dir); ++ VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); ++ ++ txg = dmu_tx_get_txg(tx); ++ dp->dp_scan->scn_restart_txg = txg; ++ dmu_tx_commit(tx); ++ } else { ++ dp->dp_scan->scn_restart_txg = txg; ++ } ++ zfs_dbgmsg("restarting resilver txg=%llu", txg); ++} ++ ++boolean_t ++dsl_scan_resilvering(dsl_pool_t *dp) ++{ ++ return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && ++ dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); ++} ++ ++/* ++ * scrub consumers ++ */ ++ ++static void ++count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) ++{ ++ int i; ++ ++ /* ++ * If we resume after a reboot, zab will be NULL; don't record ++ * incomplete stats in that case. 
++ */ ++ if (zab == NULL) ++ return; ++ ++ for (i = 0; i < 4; i++) { ++ int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; ++ int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; ++ zfs_blkstat_t *zb = &zab->zab_type[l][t]; ++ int equal; ++ ++ zb->zb_count++; ++ zb->zb_asize += BP_GET_ASIZE(bp); ++ zb->zb_lsize += BP_GET_LSIZE(bp); ++ zb->zb_psize += BP_GET_PSIZE(bp); ++ zb->zb_gangs += BP_COUNT_GANG(bp); ++ ++ switch (BP_GET_NDVAS(bp)) { ++ case 2: ++ if (DVA_GET_VDEV(&bp->blk_dva[0]) == ++ DVA_GET_VDEV(&bp->blk_dva[1])) ++ zb->zb_ditto_2_of_2_samevdev++; ++ break; ++ case 3: ++ equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == ++ DVA_GET_VDEV(&bp->blk_dva[1])) + ++ (DVA_GET_VDEV(&bp->blk_dva[0]) == ++ DVA_GET_VDEV(&bp->blk_dva[2])) + ++ (DVA_GET_VDEV(&bp->blk_dva[1]) == ++ DVA_GET_VDEV(&bp->blk_dva[2])); ++ if (equal == 1) ++ zb->zb_ditto_2_of_3_samevdev++; ++ else if (equal == 3) ++ zb->zb_ditto_3_of_3_samevdev++; ++ break; ++ } ++ } ++} ++ ++static void ++dsl_scan_scrub_done(zio_t *zio) ++{ ++ spa_t *spa = zio->io_spa; ++ ++ zio_data_buf_free(zio->io_data, zio->io_size); ++ ++ mutex_enter(&spa->spa_scrub_lock); ++ spa->spa_scrub_inflight--; ++ cv_broadcast(&spa->spa_scrub_io_cv); ++ ++ if (zio->io_error && (zio->io_error != ECKSUM || ++ !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { ++ spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; ++ } ++ mutex_exit(&spa->spa_scrub_lock); ++} ++ ++static int ++dsl_scan_scrub_cb(dsl_pool_t *dp, ++ const blkptr_t *bp, const zbookmark_t *zb) ++{ ++ dsl_scan_t *scn = dp->dp_scan; ++ size_t size = BP_GET_PSIZE(bp); ++ spa_t *spa = dp->dp_spa; ++ uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); ++ boolean_t needs_io = B_FALSE; ++ int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; ++ int zio_priority = 0; ++ int scan_delay = 0; ++ int d; ++ ++ if (phys_birth <= scn->scn_phys.scn_min_txg || ++ phys_birth >= scn->scn_phys.scn_max_txg) ++ return (0); ++ ++ count_block(dp->dp_blkstats, bp); ++ ++ ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); ++ if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { ++ zio_flags |= ZIO_FLAG_SCRUB; ++ zio_priority = ZIO_PRIORITY_SCRUB; ++ needs_io = B_TRUE; ++ scan_delay = zfs_scrub_delay; ++ } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { ++ zio_flags |= ZIO_FLAG_RESILVER; ++ zio_priority = ZIO_PRIORITY_RESILVER; ++ needs_io = B_FALSE; ++ scan_delay = zfs_resilver_delay; ++ } ++ ++ /* If it's an intent log block, failure is expected. */ ++ if (zb->zb_level == ZB_ZIL_LEVEL) ++ zio_flags |= ZIO_FLAG_SPECULATIVE; ++ ++ for (d = 0; d < BP_GET_NDVAS(bp); d++) { ++ vdev_t *vd = vdev_lookup_top(spa, ++ DVA_GET_VDEV(&bp->blk_dva[d])); ++ ++ /* ++ * Keep track of how much data we've examined so that ++ * zpool(1M) status can make useful progress reports. ++ */ ++ scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); ++ spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); ++ ++ /* if it's a resilver, this may not be in the target range */ ++ if (!needs_io) { ++ if (DVA_GET_GANG(&bp->blk_dva[d])) { ++ /* ++ * Gang members may be spread across multiple ++ * vdevs, so the best estimate we have is the ++ * scrub range, which has already been checked. ++ * XXX -- it would be better to change our ++ * allocation policy to ensure that all ++ * gang members reside on the same vdev. 
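count_block() above updates four histogram cells per block -- (level, type), (level, all-types), (all-levels, type) and (all-levels, all-types) -- by letting the loop index select either the real coordinate or the totals row/column. Here is that accumulation idiom in isolation, with made-up dimensions standing in for DN_MAX_LEVELS and DMU_OT_TOTAL.

/* Illustrative sketch only: the four-way marginal-totals update used by
 * count_block(), with hypothetical sizes instead of the ZFS constants. */
#include <stdio.h>

#define NLEVELS         4       /* per-level rows */
#define NTYPES          8       /* per-type columns */
#define LEVEL_TOTAL     NLEVELS /* extra row for "all levels" */
#define TYPE_TOTAL      NTYPES  /* extra column for "all types" */

static unsigned long long counts[NLEVELS + 1][NTYPES + 1];

static void
count_one(int level, int type)
{
        int i;

        for (i = 0; i < 4; i++) {
                int l = (i < 2) ? level : LEVEL_TOTAL;
                int t = (i & 1) ? type : TYPE_TOTAL;

                counts[l][t]++;
        }
}

int
main(void)
{
        count_one(0, 3);
        count_one(1, 3);
        count_one(0, 5);

        printf("level 0, all types: %llu\n", counts[0][TYPE_TOTAL]);
        printf("all levels, type 3: %llu\n", counts[LEVEL_TOTAL][3]);
        printf("grand total:        %llu\n", counts[LEVEL_TOTAL][TYPE_TOTAL]);
        return (0);
}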
++ */ ++ needs_io = B_TRUE; ++ } else { ++ needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, ++ phys_birth, 1); ++ } ++ } ++ } ++ ++ if (needs_io && !zfs_no_scrub_io) { ++ vdev_t *rvd = spa->spa_root_vdev; ++ uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; ++ void *data = zio_data_buf_alloc(size); ++ ++ mutex_enter(&spa->spa_scrub_lock); ++ while (spa->spa_scrub_inflight >= maxinflight) ++ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); ++ spa->spa_scrub_inflight++; ++ mutex_exit(&spa->spa_scrub_lock); ++ ++ /* ++ * If we're seeing recent (zfs_scan_idle) "important" I/Os ++ * then throttle our workload to limit the impact of a scan. ++ */ ++ if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) ++ delay(scan_delay); ++ ++ zio_nowait(zio_read(NULL, spa, bp, data, size, ++ dsl_scan_scrub_done, NULL, zio_priority, ++ zio_flags, zb)); ++ } ++ ++ /* do not relocate this block */ ++ return (0); ++} ++ ++int ++dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) ++{ ++ spa_t *spa = dp->dp_spa; ++ ++ /* ++ * Purge all vdev caches and probe all devices. We do this here ++ * rather than in sync context because this requires a writer lock ++ * on the spa_config lock, which we can't do from sync context. The ++ * spa_scrub_reopen flag indicates that vdev_open() should not ++ * attempt to start another scrub. ++ */ ++ spa_vdev_state_enter(spa, SCL_NONE); ++ spa->spa_scrub_reopen = B_TRUE; ++ vdev_reopen(spa->spa_root_vdev); ++ spa->spa_scrub_reopen = B_FALSE; ++ (void) spa_vdev_state_exit(spa, NULL, 0); ++ ++ return (dsl_sync_task_do(dp, dsl_scan_setup_check, ++ dsl_scan_setup_sync, dp->dp_scan, &func, 0)); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++module_param(zfs_top_maxinflight, int, 0644); ++MODULE_PARM_DESC(zfs_top_maxinflight, "Max I/Os per top-level"); ++ ++module_param(zfs_resilver_delay, int, 0644); ++MODULE_PARM_DESC(zfs_resilver_delay, "Number of ticks to delay resilver"); ++ ++module_param(zfs_scrub_delay, int, 0644); ++MODULE_PARM_DESC(zfs_scrub_delay, "Number of ticks to delay scrub"); ++ ++module_param(zfs_scan_idle, int, 0644); ++MODULE_PARM_DESC(zfs_scan_idle, "Idle window in clock ticks"); ++ ++module_param(zfs_scan_min_time_ms, int, 0644); ++MODULE_PARM_DESC(zfs_scan_min_time_ms, "Min millisecs to scrub per txg"); ++ ++module_param(zfs_free_min_time_ms, int, 0644); ++MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg"); ++ ++module_param(zfs_resilver_min_time_ms, int, 0644); ++MODULE_PARM_DESC(zfs_resilver_min_time_ms, "Min millisecs to resilver per txg"); ++ ++module_param(zfs_no_scrub_io, int, 0644); ++MODULE_PARM_DESC(zfs_no_scrub_io, "Set to disable scrub I/O"); ++ ++module_param(zfs_no_scrub_prefetch, int, 0644); ++MODULE_PARM_DESC(zfs_no_scrub_prefetch, "Set to disable scrub prefetching"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/dsl_synctask.c linux-3.2.33-go/fs/zfs/zfs/dsl_synctask.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/dsl_synctask.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/dsl_synctask.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,245 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. 
++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define DST_AVG_BLKSHIFT 14 ++ ++/* ARGSUSED */ ++static int ++dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ return (0); ++} ++ ++dsl_sync_task_group_t * ++dsl_sync_task_group_create(dsl_pool_t *dp) ++{ ++ dsl_sync_task_group_t *dstg; ++ ++ dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP); ++ list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), ++ offsetof(dsl_sync_task_t, dst_node)); ++ dstg->dstg_pool = dp; ++ ++ return (dstg); ++} ++ ++void ++dsl_sync_task_create(dsl_sync_task_group_t *dstg, ++ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, ++ void *arg1, void *arg2, int blocks_modified) ++{ ++ dsl_sync_task_t *dst; ++ ++ if (checkfunc == NULL) ++ checkfunc = dsl_null_checkfunc; ++ dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP); ++ dst->dst_checkfunc = checkfunc; ++ dst->dst_syncfunc = syncfunc; ++ dst->dst_arg1 = arg1; ++ dst->dst_arg2 = arg2; ++ list_insert_tail(&dstg->dstg_tasks, dst); ++ ++ dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT; ++} ++ ++int ++dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg) ++{ ++ dmu_tx_t *tx; ++ uint64_t txg; ++ dsl_sync_task_t *dst; ++ ++top: ++ tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir); ++ VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); ++ ++ txg = dmu_tx_get_txg(tx); ++ ++ /* Do a preliminary error check. */ ++ dstg->dstg_err = 0; ++ rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER); ++ for (dst = list_head(&dstg->dstg_tasks); dst; ++ dst = list_next(&dstg->dstg_tasks, dst)) { ++#ifdef ZFS_DEBUG ++ /* ++ * Only check half the time, otherwise, the sync-context ++ * check will almost never fail. ++ */ ++ if (spa_get_random(2) == 0) ++ continue; ++#endif ++ dst->dst_err = ++ dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); ++ if (dst->dst_err) ++ dstg->dstg_err = dst->dst_err; ++ } ++ rw_exit(&dstg->dstg_pool->dp_config_rwlock); ++ ++ if (dstg->dstg_err) { ++ dmu_tx_commit(tx); ++ return (dstg->dstg_err); ++ } ++ ++ /* ++ * We don't generally have many sync tasks, so pay the price of ++ * add_tail to get the tasks executed in the right order. ++ */ ++ VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks, ++ dstg, txg)); ++ ++ dmu_tx_commit(tx); ++ ++ txg_wait_synced(dstg->dstg_pool, txg); ++ ++ if (dstg->dstg_err == EAGAIN) { ++ txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE); ++ goto top; ++ } ++ ++ return (dstg->dstg_err); ++} ++ ++void ++dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) ++{ ++ uint64_t txg; ++ ++ dstg->dstg_nowaiter = B_TRUE; ++ txg = dmu_tx_get_txg(tx); ++ /* ++ * We don't generally have many sync tasks, so pay the price of ++ * add_tail to get the tasks executed in the right order. 
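dsl_sync_task_group_wait() above (together with dsl_sync_task_group_sync() below) follows a two-phase shape: the check function runs once optimistically when the task is queued, runs again authoritatively just before the sync function is applied, and an EAGAIN result asks the caller to wait for another txg and retry from the top. The single-threaded sketch below traces that control flow with hypothetical callback names and no real transaction machinery.

/* Illustrative sketch only: check-then-apply with an EAGAIN retry loop,
 * loosely modelled on dsl_sync_task_do() but with no txg engine. */
#include <errno.h>
#include <stdio.h>

typedef int (*checkfunc_t)(void *arg);
typedef void (*syncfunc_t)(void *arg);

static int
task_do(checkfunc_t check, syncfunc_t sync, void *arg)
{
        int err;

top:
        /* Preliminary check ("open context"): fail fast if clearly hopeless. */
        if ((err = check(arg)) != 0)
                return (err);

        /* Authoritative check immediately before applying ("sync context"). */
        if ((err = check(arg)) == 0)
                sync(arg);

        /* EAGAIN means "wait for another pass and try the whole thing again". */
        if (err == EAGAIN)
                goto top;

        return (err);
}

static int
check_space(void *arg)
{
        int *free_blocks = arg;

        return (*free_blocks > 0 ? 0 : ENOSPC);
}

static void
consume_space(void *arg)
{
        int *free_blocks = arg;

        (*free_blocks)--;
        printf("applied; %d blocks left\n", *free_blocks);
}

int
main(void)
{
        int free_blocks = 2;

        while (task_do(check_space, consume_space, &free_blocks) == 0)
                ;
        return (0);
}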
++ */ ++ VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks, ++ dstg, txg)); ++} ++ ++void ++dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg) ++{ ++ dsl_sync_task_t *dst; ++ ++ while ((dst = list_head(&dstg->dstg_tasks))) { ++ list_remove(&dstg->dstg_tasks, dst); ++ kmem_free(dst, sizeof (dsl_sync_task_t)); ++ } ++ kmem_free(dstg, sizeof (dsl_sync_task_group_t)); ++} ++ ++void ++dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) ++{ ++ dsl_sync_task_t *dst; ++ dsl_pool_t *dp = dstg->dstg_pool; ++ uint64_t quota, used; ++ ++ ASSERT3U(dstg->dstg_err, ==, 0); ++ ++ /* ++ * Check for sufficient space. We just check against what's ++ * on-disk; we don't want any in-flight accounting to get in our ++ * way, because open context may have already used up various ++ * in-core limits (arc_tempreserve, dsl_pool_tempreserve). ++ */ ++ quota = dsl_pool_adjustedsize(dp, B_FALSE) - ++ metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); ++ used = dp->dp_root_dir->dd_phys->dd_used_bytes; ++ /* MOS space is triple-dittoed, so we multiply by 3. */ ++ if (dstg->dstg_space > 0 && used + dstg->dstg_space * 3 > quota) { ++ dstg->dstg_err = ENOSPC; ++ return; ++ } ++ ++ /* ++ * Check for errors by calling checkfuncs. ++ */ ++ rw_enter(&dp->dp_config_rwlock, RW_WRITER); ++ for (dst = list_head(&dstg->dstg_tasks); dst; ++ dst = list_next(&dstg->dstg_tasks, dst)) { ++ dst->dst_err = ++ dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); ++ if (dst->dst_err) ++ dstg->dstg_err = dst->dst_err; ++ } ++ ++ if (dstg->dstg_err == 0) { ++ /* ++ * Execute sync tasks. ++ */ ++ for (dst = list_head(&dstg->dstg_tasks); dst; ++ dst = list_next(&dstg->dstg_tasks, dst)) { ++ dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx); ++ } ++ } ++ rw_exit(&dp->dp_config_rwlock); ++ ++ if (dstg->dstg_nowaiter) ++ dsl_sync_task_group_destroy(dstg); ++} ++ ++int ++dsl_sync_task_do(dsl_pool_t *dp, ++ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, ++ void *arg1, void *arg2, int blocks_modified) ++{ ++ dsl_sync_task_group_t *dstg; ++ int err; ++ ++ ASSERT(spa_writeable(dp->dp_spa)); ++ ++ dstg = dsl_sync_task_group_create(dp); ++ dsl_sync_task_create(dstg, checkfunc, syncfunc, ++ arg1, arg2, blocks_modified); ++ err = dsl_sync_task_group_wait(dstg); ++ dsl_sync_task_group_destroy(dstg); ++ return (err); ++} ++ ++void ++dsl_sync_task_do_nowait(dsl_pool_t *dp, ++ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, ++ void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx) ++{ ++ dsl_sync_task_group_t *dstg; ++ ++ if (!spa_writeable(dp->dp_spa)) ++ return; ++ ++ dstg = dsl_sync_task_group_create(dp); ++ dsl_sync_task_create(dstg, checkfunc, syncfunc, ++ arg1, arg2, blocks_modified); ++ dsl_sync_task_group_nowait(dstg, tx); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(dsl_sync_task_do); ++EXPORT_SYMBOL(dsl_sync_task_do_nowait); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/fm.c linux-3.2.33-go/fs/zfs/zfs/fm.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/fm.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/fm.c 2012-11-16 23:25:34.353039289 +0100 +@@ -0,0 +1,1556 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. 
++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* ++ * Fault Management Architecture (FMA) Resource and Protocol Support ++ * ++ * The routines contained herein provide services to support kernel subsystems ++ * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089). ++ * ++ * Name-Value Pair Lists ++ * ++ * The embodiment of an FMA protocol element (event, fmri or authority) is a ++ * name-value pair list (nvlist_t). FMA-specific nvlist construtor and ++ * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used ++ * to create an nvpair list using custom allocators. Callers may choose to ++ * allocate either from the kernel memory allocator, or from a preallocated ++ * buffer, useful in constrained contexts like high-level interrupt routines. ++ * ++ * Protocol Event and FMRI Construction ++ * ++ * Convenience routines are provided to construct nvlist events according to ++ * the FMA Event Protocol and Naming Schema specification for ereports and ++ * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes. ++ * ++ * ENA Manipulation ++ * ++ * Routines to generate ENA formats 0, 1 and 2 are available as well as ++ * routines to increment formats 1 and 2. Individual fields within the ++ * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(), ++ * fm_ena_format_get() and fm_ena_gen_get(). ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef _KERNEL ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int zfs_zevent_len_max = 0; ++int zfs_zevent_cols = 80; ++int zfs_zevent_console = 0; ++ ++static int zevent_len_cur = 0; ++static int zevent_waiters = 0; ++static int zevent_flags = 0; ++ ++static kmutex_t zevent_lock; ++static list_t zevent_list; ++static kcondvar_t zevent_cv; ++#endif /* _KERNEL */ ++ ++extern void fastreboot_disable_highpil(void); ++ ++/* ++ * Common fault management kstats to record event generation failures ++ */ ++ ++struct erpt_kstat { ++ kstat_named_t erpt_dropped; /* num erpts dropped on post */ ++ kstat_named_t erpt_set_failed; /* num erpt set failures */ ++ kstat_named_t fmri_set_failed; /* num fmri set failures */ ++ kstat_named_t payload_set_failed; /* num payload set failures */ ++}; ++ ++static struct erpt_kstat erpt_kstat_data = { ++ { "erpt-dropped", KSTAT_DATA_UINT64 }, ++ { "erpt-set-failed", KSTAT_DATA_UINT64 }, ++ { "fmri-set-failed", KSTAT_DATA_UINT64 }, ++ { "payload-set-failed", KSTAT_DATA_UINT64 } ++}; ++ ++kstat_t *fm_ksp; ++ ++#ifdef _KERNEL ++ ++/* ++ * Formatting utility function for fm_nvprintr. We attempt to wrap chunks of ++ * output so they aren't split across console lines, and return the end column. ++ */ ++/*PRINTFLIKE4*/ ++static int ++fm_printf(int depth, int c, int cols, const char *format, ...) 
++{ ++ va_list ap; ++ int width; ++ char c1; ++ ++ va_start(ap, format); ++ width = vsnprintf(&c1, sizeof (c1), format, ap); ++ va_end(ap); ++ ++ if (c + width >= cols) { ++ console_printf("\n"); ++ c = 0; ++ if (format[0] != ' ' && depth > 0) { ++ console_printf(" "); ++ c++; ++ } ++ } ++ ++ va_start(ap, format); ++ console_vprintf(format, ap); ++ va_end(ap); ++ ++ return ((c + width) % cols); ++} ++ ++/* ++ * Recursively print a nvlist in the specified column width and return the ++ * column we end up in. This function is called recursively by fm_nvprint(), ++ * below. We generically format the entire nvpair using hexadecimal ++ * integers and strings, and elide any integer arrays. Arrays are basically ++ * used for cache dumps right now, so we suppress them so as not to overwhelm ++ * the amount of console output we produce at panic time. This can be further ++ * enhanced as FMA technology grows based upon the needs of consumers. All ++ * FMA telemetry is logged using the dump device transport, so the console ++ * output serves only as a fallback in case this procedure is unsuccessful. ++ */ ++static int ++fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) ++{ ++ nvpair_t *nvp; ++ ++ for (nvp = nvlist_next_nvpair(nvl, NULL); ++ nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { ++ ++ data_type_t type = nvpair_type(nvp); ++ const char *name = nvpair_name(nvp); ++ ++ boolean_t b; ++ uint8_t i8; ++ uint16_t i16; ++ uint32_t i32; ++ uint64_t i64; ++ char *str; ++ nvlist_t *cnv; ++ ++ if (strcmp(name, FM_CLASS) == 0) ++ continue; /* already printed by caller */ ++ ++ c = fm_printf(d, c, cols, " %s=", name); ++ ++ switch (type) { ++ case DATA_TYPE_BOOLEAN: ++ c = fm_printf(d + 1, c, cols, " 1"); ++ break; ++ ++ case DATA_TYPE_BOOLEAN_VALUE: ++ (void) nvpair_value_boolean_value(nvp, &b); ++ c = fm_printf(d + 1, c, cols, b ? "1" : "0"); ++ break; ++ ++ case DATA_TYPE_BYTE: ++ (void) nvpair_value_byte(nvp, &i8); ++ c = fm_printf(d + 1, c, cols, "0x%x", i8); ++ break; ++ ++ case DATA_TYPE_INT8: ++ (void) nvpair_value_int8(nvp, (void *)&i8); ++ c = fm_printf(d + 1, c, cols, "0x%x", i8); ++ break; ++ ++ case DATA_TYPE_UINT8: ++ (void) nvpair_value_uint8(nvp, &i8); ++ c = fm_printf(d + 1, c, cols, "0x%x", i8); ++ break; ++ ++ case DATA_TYPE_INT16: ++ (void) nvpair_value_int16(nvp, (void *)&i16); ++ c = fm_printf(d + 1, c, cols, "0x%x", i16); ++ break; ++ ++ case DATA_TYPE_UINT16: ++ (void) nvpair_value_uint16(nvp, &i16); ++ c = fm_printf(d + 1, c, cols, "0x%x", i16); ++ break; ++ ++ case DATA_TYPE_INT32: ++ (void) nvpair_value_int32(nvp, (void *)&i32); ++ c = fm_printf(d + 1, c, cols, "0x%x", i32); ++ break; ++ ++ case DATA_TYPE_UINT32: ++ (void) nvpair_value_uint32(nvp, &i32); ++ c = fm_printf(d + 1, c, cols, "0x%x", i32); ++ break; ++ ++ case DATA_TYPE_INT64: ++ (void) nvpair_value_int64(nvp, (void *)&i64); ++ c = fm_printf(d + 1, c, cols, "0x%llx", ++ (u_longlong_t)i64); ++ break; ++ ++ case DATA_TYPE_UINT64: ++ (void) nvpair_value_uint64(nvp, &i64); ++ c = fm_printf(d + 1, c, cols, "0x%llx", ++ (u_longlong_t)i64); ++ break; ++ ++ case DATA_TYPE_HRTIME: ++ (void) nvpair_value_hrtime(nvp, (void *)&i64); ++ c = fm_printf(d + 1, c, cols, "0x%llx", ++ (u_longlong_t)i64); ++ break; ++ ++ case DATA_TYPE_STRING: ++ (void) nvpair_value_string(nvp, &str); ++ c = fm_printf(d + 1, c, cols, "\"%s\"", ++ str ? 
str : ""); ++ break; ++ ++ case DATA_TYPE_NVLIST: ++ c = fm_printf(d + 1, c, cols, "["); ++ (void) nvpair_value_nvlist(nvp, &cnv); ++ c = fm_nvprintr(cnv, d + 1, c, cols); ++ c = fm_printf(d + 1, c, cols, " ]"); ++ break; ++ ++ case DATA_TYPE_NVLIST_ARRAY: { ++ nvlist_t **val; ++ uint_t i, nelem; ++ ++ c = fm_printf(d + 1, c, cols, "["); ++ (void) nvpair_value_nvlist_array(nvp, &val, &nelem); ++ for (i = 0; i < nelem; i++) { ++ c = fm_nvprintr(val[i], d + 1, c, cols); ++ } ++ c = fm_printf(d + 1, c, cols, " ]"); ++ } ++ break; ++ ++ case DATA_TYPE_INT8_ARRAY: { ++ int8_t *val; ++ uint_t i, nelem; ++ ++ c = fm_printf(d + 1, c, cols, "[ "); ++ (void) nvpair_value_int8_array(nvp, &val, &nelem); ++ for (i = 0; i < nelem; i++) ++ c = fm_printf(d + 1, c, cols, "0x%llx ", ++ (u_longlong_t)val[i]); ++ ++ c = fm_printf(d + 1, c, cols, "]"); ++ break; ++ } ++ ++ case DATA_TYPE_UINT8_ARRAY: { ++ uint8_t *val; ++ uint_t i, nelem; ++ ++ c = fm_printf(d + 1, c, cols, "[ "); ++ (void) nvpair_value_uint8_array(nvp, &val, &nelem); ++ for (i = 0; i < nelem; i++) ++ c = fm_printf(d + 1, c, cols, "0x%llx ", ++ (u_longlong_t)val[i]); ++ ++ c = fm_printf(d + 1, c, cols, "]"); ++ break; ++ } ++ ++ case DATA_TYPE_INT16_ARRAY: { ++ int16_t *val; ++ uint_t i, nelem; ++ ++ c = fm_printf(d + 1, c, cols, "[ "); ++ (void) nvpair_value_int16_array(nvp, &val, &nelem); ++ for (i = 0; i < nelem; i++) ++ c = fm_printf(d + 1, c, cols, "0x%llx ", ++ (u_longlong_t)val[i]); ++ ++ c = fm_printf(d + 1, c, cols, "]"); ++ break; ++ } ++ ++ case DATA_TYPE_UINT16_ARRAY: { ++ uint16_t *val; ++ uint_t i, nelem; ++ ++ c = fm_printf(d + 1, c, cols, "[ "); ++ (void) nvpair_value_uint16_array(nvp, &val, &nelem); ++ for (i = 0; i < nelem; i++) ++ c = fm_printf(d + 1, c, cols, "0x%llx ", ++ (u_longlong_t)val[i]); ++ ++ c = fm_printf(d + 1, c, cols, "]"); ++ break; ++ } ++ ++ case DATA_TYPE_INT32_ARRAY: { ++ int32_t *val; ++ uint_t i, nelem; ++ ++ c = fm_printf(d + 1, c, cols, "[ "); ++ (void) nvpair_value_int32_array(nvp, &val, &nelem); ++ for (i = 0; i < nelem; i++) ++ c = fm_printf(d + 1, c, cols, "0x%llx ", ++ (u_longlong_t)val[i]); ++ ++ c = fm_printf(d + 1, c, cols, "]"); ++ break; ++ } ++ ++ case DATA_TYPE_UINT32_ARRAY: { ++ uint32_t *val; ++ uint_t i, nelem; ++ ++ c = fm_printf(d + 1, c, cols, "[ "); ++ (void) nvpair_value_uint32_array(nvp, &val, &nelem); ++ for (i = 0; i < nelem; i++) ++ c = fm_printf(d + 1, c, cols, "0x%llx ", ++ (u_longlong_t)val[i]); ++ ++ c = fm_printf(d + 1, c, cols, "]"); ++ break; ++ } ++ ++ case DATA_TYPE_INT64_ARRAY: { ++ int64_t *val; ++ uint_t i, nelem; ++ ++ c = fm_printf(d + 1, c, cols, "[ "); ++ (void) nvpair_value_int64_array(nvp, &val, &nelem); ++ for (i = 0; i < nelem; i++) ++ c = fm_printf(d + 1, c, cols, "0x%llx ", ++ (u_longlong_t)val[i]); ++ ++ c = fm_printf(d + 1, c, cols, "]"); ++ break; ++ } ++ ++ case DATA_TYPE_UINT64_ARRAY: { ++ uint64_t *val; ++ uint_t i, nelem; ++ ++ c = fm_printf(d + 1, c, cols, "[ "); ++ (void) nvpair_value_uint64_array(nvp, &val, &nelem); ++ for (i = 0; i < nelem; i++) ++ c = fm_printf(d + 1, c, cols, "0x%llx ", ++ (u_longlong_t)val[i]); ++ ++ c = fm_printf(d + 1, c, cols, "]"); ++ break; ++ } ++ ++ case DATA_TYPE_STRING_ARRAY: ++ case DATA_TYPE_BOOLEAN_ARRAY: ++ case DATA_TYPE_BYTE_ARRAY: ++ c = fm_printf(d + 1, c, cols, "[...]"); ++ break; ++ ++ case DATA_TYPE_UNKNOWN: ++ c = fm_printf(d + 1, c, cols, ""); ++ break; ++ } ++ } ++ ++ return (c); ++} ++ ++void ++fm_nvprint(nvlist_t *nvl) ++{ ++ char *class; ++ int c = 0; ++ ++ console_printf("\n"); ++ ++ if 
(nvlist_lookup_string(nvl, FM_CLASS, &class) == 0) ++ c = fm_printf(0, c, zfs_zevent_cols, "%s", class); ++ ++ if (fm_nvprintr(nvl, 0, c, zfs_zevent_cols) != 0) ++ console_printf("\n"); ++ ++ console_printf("\n"); ++} ++ ++static zevent_t * ++zfs_zevent_alloc(void) ++{ ++ zevent_t *ev; ++ ++ ev = kmem_zalloc(sizeof(zevent_t), KM_PUSHPAGE); ++ if (ev == NULL) ++ return NULL; ++ ++ list_create(&ev->ev_ze_list, sizeof(zfs_zevent_t), ++ offsetof(zfs_zevent_t, ze_node)); ++ list_link_init(&ev->ev_node); ++ ++ return ev; ++} ++ ++static void ++zfs_zevent_free(zevent_t *ev) ++{ ++ /* Run provided cleanup callback */ ++ ev->ev_cb(ev->ev_nvl, ev->ev_detector); ++ ++ list_destroy(&ev->ev_ze_list); ++ kmem_free(ev, sizeof(zevent_t)); ++} ++ ++static void ++zfs_zevent_drain(zevent_t *ev) ++{ ++ zfs_zevent_t *ze; ++ ++ ASSERT(MUTEX_HELD(&zevent_lock)); ++ list_remove(&zevent_list, ev); ++ ++ /* Remove references to this event in all private file data */ ++ while ((ze = list_head(&ev->ev_ze_list)) != NULL) { ++ list_remove(&ev->ev_ze_list, ze); ++ ze->ze_zevent = NULL; ++ ze->ze_dropped++; ++ } ++ ++ zfs_zevent_free(ev); ++} ++ ++void ++zfs_zevent_drain_all(int *count) ++{ ++ zevent_t *ev; ++ ++ mutex_enter(&zevent_lock); ++ while ((ev = list_head(&zevent_list)) != NULL) ++ zfs_zevent_drain(ev); ++ ++ *count = zevent_len_cur; ++ zevent_len_cur = 0; ++ mutex_exit(&zevent_lock); ++} ++ ++/* ++ * New zevents are inserted at the head. If the maximum queue ++ * length is exceeded a zevent will be drained from the tail. ++ * As part of this any user space processes which currently have ++ * a reference to this zevent_t in their private data will have ++ * this reference set to NULL. ++ */ ++static void ++zfs_zevent_insert(zevent_t *ev) ++{ ++ ASSERT(MUTEX_HELD(&zevent_lock)); ++ list_insert_head(&zevent_list, ev); ++ ++ if (zevent_len_cur >= zfs_zevent_len_max) ++ zfs_zevent_drain(list_tail(&zevent_list)); ++ else ++ zevent_len_cur++; ++} ++ ++/* ++ * Post a zevent ++ */ ++void ++zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb) ++{ ++ int64_t tv_array[2]; ++ timestruc_t tv; ++ size_t nvl_size = 0; ++ zevent_t *ev; ++ ++ gethrestime(&tv); ++ tv_array[0] = tv.tv_sec; ++ tv_array[1] = tv.tv_nsec; ++ if (nvlist_add_int64_array(nvl, FM_EREPORT_TIME, tv_array, 2)) { ++ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ (void) nvlist_size(nvl, &nvl_size, NV_ENCODE_NATIVE); ++ if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) { ++ atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1); ++ return; ++ } ++ ++ if (zfs_zevent_console) ++ fm_nvprint(nvl); ++ ++ ev = zfs_zevent_alloc(); ++ if (ev == NULL) { ++ atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1); ++ return; ++ } ++ ++ ev->ev_nvl = nvl; ++ ev->ev_detector = detector; ++ ev->ev_cb = cb; ++ ++ mutex_enter(&zevent_lock); ++ zfs_zevent_insert(ev); ++ cv_broadcast(&zevent_cv); ++ mutex_exit(&zevent_lock); ++} ++ ++static int ++zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze) ++{ ++ *ze = zfsdev_get_state(minor, ZST_ZEVENT); ++ if (*ze == NULL) ++ return (EBADF); ++ ++ return (0); ++} ++ ++int ++zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze) ++{ ++ file_t *fp; ++ int error; ++ ++ fp = getf(fd); ++ if (fp == NULL) ++ return (EBADF); ++ ++ *minorp = zfsdev_getminor(fp->f_file); ++ error = zfs_zevent_minor_to_state(*minorp, ze); ++ ++ if (error) ++ zfs_zevent_fd_rele(fd); ++ ++ return (error); ++} ++ ++void ++zfs_zevent_fd_rele(int fd) ++{ ++ releasef(fd); ++} ++ ++/* ++ * Get the 
next zevent in the stream and place a copy in 'event'. This ++ * may fail with ENOMEM if the encoded nvlist size exceeds the passed ++ * 'event_size'. In this case the stream pointer is not advanced and ++ * and 'event_size' is set to the minimum required buffer size. ++ */ ++int ++zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size, ++ uint64_t *dropped) ++{ ++ zevent_t *ev; ++ size_t size; ++ int error = 0; ++ ++ mutex_enter(&zevent_lock); ++ if (ze->ze_zevent == NULL) { ++ /* New stream start at the beginning/tail */ ++ ev = list_tail(&zevent_list); ++ if (ev == NULL) { ++ error = ENOENT; ++ goto out; ++ } ++ } else { ++ /* Existing stream continue with the next element and remove ++ * ourselves from the wait queue for the previous element */ ++ ev = list_prev(&zevent_list, ze->ze_zevent); ++ if (ev == NULL) { ++ error = ENOENT; ++ goto out; ++ } ++ } ++ ++ VERIFY(nvlist_size(ev->ev_nvl, &size, NV_ENCODE_NATIVE) == 0); ++ if (size > *event_size) { ++ *event_size = size; ++ error = ENOMEM; ++ goto out; ++ } ++ ++ if (ze->ze_zevent) ++ list_remove(&ze->ze_zevent->ev_ze_list, ze); ++ ++ ze->ze_zevent = ev; ++ list_insert_head(&ev->ev_ze_list, ze); ++ nvlist_dup(ev->ev_nvl, event, KM_SLEEP); ++ *dropped = ze->ze_dropped; ++ ze->ze_dropped = 0; ++out: ++ mutex_exit(&zevent_lock); ++ ++ return error; ++} ++ ++int ++zfs_zevent_wait(zfs_zevent_t *ze) ++{ ++ int error = 0; ++ ++ mutex_enter(&zevent_lock); ++ ++ if (zevent_flags & ZEVENT_SHUTDOWN) { ++ error = ESHUTDOWN; ++ goto out; ++ } ++ ++ zevent_waiters++; ++ cv_wait_interruptible(&zevent_cv, &zevent_lock); ++ if (issig(JUSTLOOKING)) ++ error = EINTR; ++ ++ zevent_waiters--; ++out: ++ mutex_exit(&zevent_lock); ++ ++ return error; ++} ++ ++void ++zfs_zevent_init(zfs_zevent_t **zep) ++{ ++ zfs_zevent_t *ze; ++ ++ ze = *zep = kmem_zalloc(sizeof (zfs_zevent_t), KM_SLEEP); ++ list_link_init(&ze->ze_node); ++} ++ ++void ++zfs_zevent_destroy(zfs_zevent_t *ze) ++{ ++ mutex_enter(&zevent_lock); ++ if (ze->ze_zevent) ++ list_remove(&ze->ze_zevent->ev_ze_list, ze); ++ mutex_exit(&zevent_lock); ++ ++ kmem_free(ze, sizeof (zfs_zevent_t)); ++} ++#endif /* _KERNEL */ ++ ++/* ++ * Wrapppers for FM nvlist allocators ++ */ ++/* ARGSUSED */ ++static void * ++i_fm_alloc(nv_alloc_t *nva, size_t size) ++{ ++ return (kmem_zalloc(size, KM_PUSHPAGE)); ++} ++ ++/* ARGSUSED */ ++static void ++i_fm_free(nv_alloc_t *nva, void *buf, size_t size) ++{ ++ kmem_free(buf, size); ++} ++ ++const nv_alloc_ops_t fm_mem_alloc_ops = { ++ NULL, ++ NULL, ++ i_fm_alloc, ++ i_fm_free, ++ NULL ++}; ++ ++/* ++ * Create and initialize a new nv_alloc_t for a fixed buffer, buf. A pointer ++ * to the newly allocated nv_alloc_t structure is returned upon success or NULL ++ * is returned to indicate that the nv_alloc structure could not be created. ++ */ ++nv_alloc_t * ++fm_nva_xcreate(char *buf, size_t bufsz) ++{ ++ nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); ++ ++ if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) { ++ kmem_free(nvhdl, sizeof (nv_alloc_t)); ++ return (NULL); ++ } ++ ++ return (nvhdl); ++} ++ ++/* ++ * Destroy a previously allocated nv_alloc structure. The fixed buffer ++ * associated with nva must be freed by the caller. ++ */ ++void ++fm_nva_xdestroy(nv_alloc_t *nva) ++{ ++ nv_alloc_fini(nva); ++ kmem_free(nva, sizeof (nv_alloc_t)); ++} ++ ++/* ++ * Create a new nv list. A pointer to a new nv list structure is returned ++ * upon success or NULL is returned to indicate that the structure could ++ * not be created. 
The newly created nv list is created and managed by the ++ * operations installed in nva. If nva is NULL, the default FMA nva ++ * operations are installed and used. ++ * ++ * When called from the kernel and nva == NULL, this function must be called ++ * from passive kernel context with no locks held that can prevent a ++ * sleeping memory allocation from occurring. Otherwise, this function may ++ * be called from other kernel contexts as long a valid nva created via ++ * fm_nva_create() is supplied. ++ */ ++nvlist_t * ++fm_nvlist_create(nv_alloc_t *nva) ++{ ++ int hdl_alloced = 0; ++ nvlist_t *nvl; ++ nv_alloc_t *nvhdl; ++ ++ if (nva == NULL) { ++ nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_PUSHPAGE); ++ ++ if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) { ++ kmem_free(nvhdl, sizeof (nv_alloc_t)); ++ return (NULL); ++ } ++ hdl_alloced = 1; ++ } else { ++ nvhdl = nva; ++ } ++ ++ if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) { ++ if (hdl_alloced) { ++ nv_alloc_fini(nvhdl); ++ kmem_free(nvhdl, sizeof (nv_alloc_t)); ++ } ++ return (NULL); ++ } ++ ++ return (nvl); ++} ++ ++/* ++ * Destroy a previously allocated nvlist structure. flag indicates whether ++ * or not the associated nva structure should be freed (FM_NVA_FREE) or ++ * retained (FM_NVA_RETAIN). Retaining the nv alloc structure allows ++ * it to be re-used for future nvlist creation operations. ++ */ ++void ++fm_nvlist_destroy(nvlist_t *nvl, int flag) ++{ ++ nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl); ++ ++ nvlist_free(nvl); ++ ++ if (nva != NULL) { ++ if (flag == FM_NVA_FREE) ++ fm_nva_xdestroy(nva); ++ } ++} ++ ++int ++i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap) ++{ ++ int nelem, ret = 0; ++ data_type_t type; ++ ++ while (ret == 0 && name != NULL) { ++ type = va_arg(ap, data_type_t); ++ switch (type) { ++ case DATA_TYPE_BYTE: ++ ret = nvlist_add_byte(payload, name, ++ va_arg(ap, uint_t)); ++ break; ++ case DATA_TYPE_BYTE_ARRAY: ++ nelem = va_arg(ap, int); ++ ret = nvlist_add_byte_array(payload, name, ++ va_arg(ap, uchar_t *), nelem); ++ break; ++ case DATA_TYPE_BOOLEAN_VALUE: ++ ret = nvlist_add_boolean_value(payload, name, ++ va_arg(ap, boolean_t)); ++ break; ++ case DATA_TYPE_BOOLEAN_ARRAY: ++ nelem = va_arg(ap, int); ++ ret = nvlist_add_boolean_array(payload, name, ++ va_arg(ap, boolean_t *), nelem); ++ break; ++ case DATA_TYPE_INT8: ++ ret = nvlist_add_int8(payload, name, ++ va_arg(ap, int)); ++ break; ++ case DATA_TYPE_INT8_ARRAY: ++ nelem = va_arg(ap, int); ++ ret = nvlist_add_int8_array(payload, name, ++ va_arg(ap, int8_t *), nelem); ++ break; ++ case DATA_TYPE_UINT8: ++ ret = nvlist_add_uint8(payload, name, ++ va_arg(ap, uint_t)); ++ break; ++ case DATA_TYPE_UINT8_ARRAY: ++ nelem = va_arg(ap, int); ++ ret = nvlist_add_uint8_array(payload, name, ++ va_arg(ap, uint8_t *), nelem); ++ break; ++ case DATA_TYPE_INT16: ++ ret = nvlist_add_int16(payload, name, ++ va_arg(ap, int)); ++ break; ++ case DATA_TYPE_INT16_ARRAY: ++ nelem = va_arg(ap, int); ++ ret = nvlist_add_int16_array(payload, name, ++ va_arg(ap, int16_t *), nelem); ++ break; ++ case DATA_TYPE_UINT16: ++ ret = nvlist_add_uint16(payload, name, ++ va_arg(ap, uint_t)); ++ break; ++ case DATA_TYPE_UINT16_ARRAY: ++ nelem = va_arg(ap, int); ++ ret = nvlist_add_uint16_array(payload, name, ++ va_arg(ap, uint16_t *), nelem); ++ break; ++ case DATA_TYPE_INT32: ++ ret = nvlist_add_int32(payload, name, ++ va_arg(ap, int32_t)); ++ break; ++ case DATA_TYPE_INT32_ARRAY: ++ nelem = va_arg(ap, int); ++ ret = nvlist_add_int32_array(payload, name, 
++ va_arg(ap, int32_t *), nelem); ++ break; ++ case DATA_TYPE_UINT32: ++ ret = nvlist_add_uint32(payload, name, ++ va_arg(ap, uint32_t)); ++ break; ++ case DATA_TYPE_UINT32_ARRAY: ++ nelem = va_arg(ap, int); ++ ret = nvlist_add_uint32_array(payload, name, ++ va_arg(ap, uint32_t *), nelem); ++ break; ++ case DATA_TYPE_INT64: ++ ret = nvlist_add_int64(payload, name, ++ va_arg(ap, int64_t)); ++ break; ++ case DATA_TYPE_INT64_ARRAY: ++ nelem = va_arg(ap, int); ++ ret = nvlist_add_int64_array(payload, name, ++ va_arg(ap, int64_t *), nelem); ++ break; ++ case DATA_TYPE_UINT64: ++ ret = nvlist_add_uint64(payload, name, ++ va_arg(ap, uint64_t)); ++ break; ++ case DATA_TYPE_UINT64_ARRAY: ++ nelem = va_arg(ap, int); ++ ret = nvlist_add_uint64_array(payload, name, ++ va_arg(ap, uint64_t *), nelem); ++ break; ++ case DATA_TYPE_STRING: ++ ret = nvlist_add_string(payload, name, ++ va_arg(ap, char *)); ++ break; ++ case DATA_TYPE_STRING_ARRAY: ++ nelem = va_arg(ap, int); ++ ret = nvlist_add_string_array(payload, name, ++ va_arg(ap, char **), nelem); ++ break; ++ case DATA_TYPE_NVLIST: ++ ret = nvlist_add_nvlist(payload, name, ++ va_arg(ap, nvlist_t *)); ++ break; ++ case DATA_TYPE_NVLIST_ARRAY: ++ nelem = va_arg(ap, int); ++ ret = nvlist_add_nvlist_array(payload, name, ++ va_arg(ap, nvlist_t **), nelem); ++ break; ++ default: ++ ret = EINVAL; ++ } ++ ++ name = va_arg(ap, char *); ++ } ++ return (ret); ++} ++ ++void ++fm_payload_set(nvlist_t *payload, ...) ++{ ++ int ret; ++ const char *name; ++ va_list ap; ++ ++ va_start(ap, payload); ++ name = va_arg(ap, char *); ++ ret = i_fm_payload_set(payload, name, ap); ++ va_end(ap); ++ ++ if (ret) ++ atomic_add_64( ++ &erpt_kstat_data.payload_set_failed.value.ui64, 1); ++} ++ ++/* ++ * Set-up and validate the members of an ereport event according to: ++ * ++ * Member name Type Value ++ * ==================================================== ++ * class string ereport ++ * version uint8_t 0 ++ * ena uint64_t ++ * detector nvlist_t ++ * ereport-payload nvlist_t ++ * ++ * We don't actually add a 'version' member to the payload. Really, ++ * the version quoted to us by our caller is that of the category 1 ++ * "ereport" event class (and we require FM_EREPORT_VERS0) but ++ * the payload version of the actual leaf class event under construction ++ * may be something else. Callers should supply a version in the varargs, ++ * or (better) we could take two version arguments - one for the ++ * ereport category 1 classification (expect FM_EREPORT_VERS0) and one ++ * for the leaf class. ++ */ ++void ++fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class, ++ uint64_t ena, const nvlist_t *detector, ...) 
++{ ++ char ereport_class[FM_MAX_CLASS]; ++ const char *name; ++ va_list ap; ++ int ret; ++ ++ if (version != FM_EREPORT_VERS0) { ++ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s", ++ FM_EREPORT_CLASS, erpt_class); ++ if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) { ++ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) { ++ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1); ++ } ++ ++ if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR, ++ (nvlist_t *)detector) != 0) { ++ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1); ++ } ++ ++ va_start(ap, detector); ++ name = va_arg(ap, const char *); ++ ret = i_fm_payload_set(ereport, name, ap); ++ va_end(ap); ++ ++ if (ret) ++ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1); ++} ++ ++/* ++ * Set-up and validate the members of an hc fmri according to; ++ * ++ * Member name Type Value ++ * =================================================== ++ * version uint8_t 0 ++ * auth nvlist_t ++ * hc-name string ++ * hc-id string ++ * ++ * Note that auth and hc-id are optional members. ++ */ ++ ++#define HC_MAXPAIRS 20 ++#define HC_MAXNAMELEN 50 ++ ++static int ++fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth) ++{ ++ if (version != FM_HC_SCHEME_VERSION) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return (0); ++ } ++ ++ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 || ++ nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return (0); ++ } ++ ++ if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, ++ (nvlist_t *)auth) != 0) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return (0); ++ } ++ ++ return (1); ++} ++ ++void ++fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth, ++ nvlist_t *snvl, int npairs, ...) ++{ ++ nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); ++ nvlist_t *pairs[HC_MAXPAIRS]; ++ va_list ap; ++ int i; ++ ++ if (!fm_fmri_hc_set_common(fmri, version, auth)) ++ return; ++ ++ npairs = MIN(npairs, HC_MAXPAIRS); ++ ++ va_start(ap, npairs); ++ for (i = 0; i < npairs; i++) { ++ const char *name = va_arg(ap, const char *); ++ uint32_t id = va_arg(ap, uint32_t); ++ char idstr[11]; ++ ++ (void) snprintf(idstr, sizeof (idstr), "%u", id); ++ ++ pairs[i] = fm_nvlist_create(nva); ++ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || ++ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { ++ atomic_add_64( ++ &erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ } ++ } ++ va_end(ap); ++ ++ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0) ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ ++ for (i = 0; i < npairs; i++) ++ fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); ++ ++ if (snvl != NULL) { ++ if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { ++ atomic_add_64( ++ &erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ } ++ } ++} ++ ++void ++fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, ++ nvlist_t *snvl, nvlist_t *bboard, int npairs, ...) 
++{ ++ nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); ++ nvlist_t *pairs[HC_MAXPAIRS]; ++ nvlist_t **hcl; ++ uint_t n; ++ int i, j; ++ va_list ap; ++ char *hcname, *hcid; ++ ++ if (!fm_fmri_hc_set_common(fmri, version, auth)) ++ return; ++ ++ /* ++ * copy the bboard nvpairs to the pairs array ++ */ ++ if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n) ++ != 0) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ for (i = 0; i < n; i++) { ++ if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME, ++ &hcname) != 0) { ++ atomic_add_64( ++ &erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) { ++ atomic_add_64( ++ &erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ pairs[i] = fm_nvlist_create(nva); ++ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 || ++ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) { ++ for (j = 0; j <= i; j++) { ++ if (pairs[j] != NULL) ++ fm_nvlist_destroy(pairs[j], ++ FM_NVA_RETAIN); ++ } ++ atomic_add_64( ++ &erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ } ++ ++ /* ++ * create the pairs from passed in pairs ++ */ ++ npairs = MIN(npairs, HC_MAXPAIRS); ++ ++ va_start(ap, npairs); ++ for (i = n; i < npairs + n; i++) { ++ const char *name = va_arg(ap, const char *); ++ uint32_t id = va_arg(ap, uint32_t); ++ char idstr[11]; ++ (void) snprintf(idstr, sizeof (idstr), "%u", id); ++ pairs[i] = fm_nvlist_create(nva); ++ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || ++ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { ++ for (j = 0; j <= i; j++) { ++ if (pairs[j] != NULL) ++ fm_nvlist_destroy(pairs[j], ++ FM_NVA_RETAIN); ++ } ++ atomic_add_64( ++ &erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ } ++ va_end(ap); ++ ++ /* ++ * Create the fmri hc list ++ */ ++ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, ++ npairs + n) != 0) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ for (i = 0; i < npairs + n; i++) { ++ fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); ++ } ++ ++ if (snvl != NULL) { ++ if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { ++ atomic_add_64( ++ &erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ } ++} ++ ++/* ++ * Set-up and validate the members of an dev fmri according to: ++ * ++ * Member name Type Value ++ * ==================================================== ++ * version uint8_t 0 ++ * auth nvlist_t ++ * devpath string ++ * [devid] string ++ * [target-port-l0id] string ++ * ++ * Note that auth and devid are optional members. 
++ */ ++void ++fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth, ++ const char *devpath, const char *devid, const char *tpl0) ++{ ++ int err = 0; ++ ++ if (version != DEV_SCHEME_VERSION0) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version); ++ err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV); ++ ++ if (auth != NULL) { ++ err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY, ++ (nvlist_t *)auth); ++ } ++ ++ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath); ++ ++ if (devid != NULL) ++ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid); ++ ++ if (tpl0 != NULL) ++ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0); ++ ++ if (err) ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ ++} ++ ++/* ++ * Set-up and validate the members of an cpu fmri according to: ++ * ++ * Member name Type Value ++ * ==================================================== ++ * version uint8_t 0 ++ * auth nvlist_t ++ * cpuid uint32_t ++ * cpumask uint8_t ++ * serial uint64_t ++ * ++ * Note that auth, cpumask, serial are optional members. ++ * ++ */ ++void ++fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth, ++ uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp) ++{ ++ uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64; ++ ++ if (version < CPU_SCHEME_VERSION1) { ++ atomic_add_64(failedp, 1); ++ return; ++ } ++ ++ if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) { ++ atomic_add_64(failedp, 1); ++ return; ++ } ++ ++ if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME, ++ FM_FMRI_SCHEME_CPU) != 0) { ++ atomic_add_64(failedp, 1); ++ return; ++ } ++ ++ if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY, ++ (nvlist_t *)auth) != 0) ++ atomic_add_64(failedp, 1); ++ ++ if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0) ++ atomic_add_64(failedp, 1); ++ ++ if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK, ++ *cpu_maskp) != 0) ++ atomic_add_64(failedp, 1); ++ ++ if (serial_idp == NULL || nvlist_add_string(fmri_cpu, ++ FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0) ++ atomic_add_64(failedp, 1); ++} ++ ++/* ++ * Set-up and validate the members of a mem according to: ++ * ++ * Member name Type Value ++ * ==================================================== ++ * version uint8_t 0 ++ * auth nvlist_t [optional] ++ * unum string ++ * serial string [optional*] ++ * offset uint64_t [optional] ++ * ++ * * serial is required if offset is present ++ */ ++void ++fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth, ++ const char *unum, const char *serial, uint64_t offset) ++{ ++ if (version != MEM_SCHEME_VERSION0) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ if (!serial && (offset != (uint64_t)-1)) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ if (auth != NULL) { ++ if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, ++ (nvlist_t *)auth) != 0) { ++ atomic_add_64( ++ &erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ } ++ } ++ ++ if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, 
unum) != 0) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ } ++ ++ if (serial != NULL) { ++ if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID, ++ (char **)&serial, 1) != 0) { ++ atomic_add_64( ++ &erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ } ++ if (offset != (uint64_t)-1) { ++ if (nvlist_add_uint64(fmri, FM_FMRI_MEM_OFFSET, ++ offset) != 0) { ++ atomic_add_64(&erpt_kstat_data. ++ fmri_set_failed.value.ui64, 1); ++ } ++ } ++ } ++} ++ ++void ++fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid, ++ uint64_t vdev_guid) ++{ ++ if (version != ZFS_SCHEME_VERSION0) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ return; ++ } ++ ++ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) { ++ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ } ++ ++ if (vdev_guid != 0) { ++ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) { ++ atomic_add_64( ++ &erpt_kstat_data.fmri_set_failed.value.ui64, 1); ++ } ++ } ++} ++ ++uint64_t ++fm_ena_increment(uint64_t ena) ++{ ++ uint64_t new_ena; ++ ++ switch (ENA_FORMAT(ena)) { ++ case FM_ENA_FMT1: ++ new_ena = ena + (1 << ENA_FMT1_GEN_SHFT); ++ break; ++ case FM_ENA_FMT2: ++ new_ena = ena + (1 << ENA_FMT2_GEN_SHFT); ++ break; ++ default: ++ new_ena = 0; ++ } ++ ++ return (new_ena); ++} ++ ++uint64_t ++fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format) ++{ ++ uint64_t ena = 0; ++ ++ switch (format) { ++ case FM_ENA_FMT1: ++ if (timestamp) { ++ ena = (uint64_t)((format & ENA_FORMAT_MASK) | ++ ((cpuid << ENA_FMT1_CPUID_SHFT) & ++ ENA_FMT1_CPUID_MASK) | ++ ((timestamp << ENA_FMT1_TIME_SHFT) & ++ ENA_FMT1_TIME_MASK)); ++ } else { ++ ena = (uint64_t)((format & ENA_FORMAT_MASK) | ++ ((cpuid << ENA_FMT1_CPUID_SHFT) & ++ ENA_FMT1_CPUID_MASK) | ++ ((gethrtime() << ENA_FMT1_TIME_SHFT) & ++ ENA_FMT1_TIME_MASK)); ++ } ++ break; ++ case FM_ENA_FMT2: ++ ena = (uint64_t)((format & ENA_FORMAT_MASK) | ++ ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK)); ++ break; ++ default: ++ break; ++ } ++ ++ return (ena); ++} ++ ++uint64_t ++fm_ena_generate(uint64_t timestamp, uchar_t format) ++{ ++ uint64_t ena; ++ ++ kpreempt_disable(); ++ ena = fm_ena_generate_cpu(timestamp, getcpuid(), format); ++ kpreempt_enable(); ++ ++ return (ena); ++} ++ ++uint64_t ++fm_ena_generation_get(uint64_t ena) ++{ ++ uint64_t gen; ++ ++ switch (ENA_FORMAT(ena)) { ++ case FM_ENA_FMT1: ++ gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT; ++ break; ++ case FM_ENA_FMT2: ++ gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT; ++ break; ++ default: ++ gen = 0; ++ break; ++ } ++ ++ return (gen); ++} ++ ++uchar_t ++fm_ena_format_get(uint64_t ena) ++{ ++ ++ return (ENA_FORMAT(ena)); ++} ++ ++uint64_t ++fm_ena_id_get(uint64_t ena) ++{ ++ uint64_t id; ++ ++ switch (ENA_FORMAT(ena)) { ++ case FM_ENA_FMT1: ++ id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT; ++ break; ++ case FM_ENA_FMT2: ++ id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT; ++ break; ++ default: ++ id = 0; ++ } ++ ++ return (id); ++} ++ ++uint64_t ++fm_ena_time_get(uint64_t ena) ++{ ++ uint64_t time; ++ ++ switch (ENA_FORMAT(ena)) { ++ case FM_ENA_FMT1: ++ time = (ena & ENA_FMT1_TIME_MASK) >> 
ENA_FMT1_TIME_SHFT; ++ break; ++ case FM_ENA_FMT2: ++ time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT; ++ break; ++ default: ++ time = 0; ++ } ++ ++ return (time); ++} ++ ++#ifdef _KERNEL ++void ++fm_init(void) ++{ ++ zevent_len_cur = 0; ++ zevent_flags = 0; ++ ++ if (zfs_zevent_len_max == 0) ++ zfs_zevent_len_max = ERPT_MAX_ERRS * MAX(max_ncpus, 4); ++ ++ /* Initialize zevent allocation and generation kstats */ ++ fm_ksp = kstat_create("zfs", 0, "fm", "misc", KSTAT_TYPE_NAMED, ++ sizeof (struct erpt_kstat) / sizeof (kstat_named_t), ++ KSTAT_FLAG_VIRTUAL); ++ ++ if (fm_ksp != NULL) { ++ fm_ksp->ks_data = &erpt_kstat_data; ++ kstat_install(fm_ksp); ++ } else { ++ cmn_err(CE_NOTE, "failed to create fm/misc kstat\n"); ++ } ++ ++ mutex_init(&zevent_lock, NULL, MUTEX_DEFAULT, NULL); ++ list_create(&zevent_list, sizeof(zevent_t), offsetof(zevent_t, ev_node)); ++ cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); ++} ++ ++void ++fm_fini(void) ++{ ++ int count; ++ ++ zfs_zevent_drain_all(&count); ++ ++ mutex_enter(&zevent_lock); ++ cv_broadcast(&zevent_cv); ++ ++ zevent_flags |= ZEVENT_SHUTDOWN; ++ while (zevent_waiters > 0) { ++ mutex_exit(&zevent_lock); ++ schedule(); ++ mutex_enter(&zevent_lock); ++ } ++ mutex_exit(&zevent_lock); ++ ++ cv_destroy(&zevent_cv); ++ list_destroy(&zevent_list); ++ mutex_destroy(&zevent_lock); ++ ++ if (fm_ksp != NULL) { ++ kstat_delete(fm_ksp); ++ fm_ksp = NULL; ++ } ++} ++ ++module_param(zfs_zevent_len_max, int, 0644); ++MODULE_PARM_DESC(zfs_zevent_len_max, "Max event queue length"); ++ ++module_param(zfs_zevent_cols, int, 0644); ++MODULE_PARM_DESC(zfs_zevent_cols, "Max event column width"); ++ ++module_param(zfs_zevent_console, int, 0644); ++MODULE_PARM_DESC(zfs_zevent_console, "Log events to the console"); ++ ++#endif /* _KERNEL */ +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/gzip.c linux-3.2.33-go/fs/zfs/zfs/gzip.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/gzip.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/gzip.c 2012-11-16 23:25:34.353039289 +0100 +@@ -0,0 +1,82 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */ ++ ++ ++ ++#include ++#include ++ ++#ifdef _KERNEL ++ ++#include ++#include ++ ++typedef size_t zlen_t; ++#define compress_func z_compress_level ++#define uncompress_func z_uncompress ++ ++#else /* _KERNEL */ ++ ++#include ++#include ++ ++typedef uLongf zlen_t; ++#define compress_func compress2 ++#define uncompress_func uncompress ++ ++#endif ++ ++size_t ++gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) ++{ ++ zlen_t dstlen = d_len; ++ ++ ASSERT(d_len <= s_len); ++ ++ if (compress_func(d_start, &dstlen, s_start, s_len, n) != Z_OK) { ++ if (d_len != s_len) ++ return (s_len); ++ ++ bcopy(s_start, d_start, s_len); ++ return (s_len); ++ } ++ ++ return ((size_t) dstlen); ++} ++ ++/*ARGSUSED*/ ++int ++gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) ++{ ++ zlen_t dstlen = d_len; ++ ++ ASSERT(d_len >= s_len); ++ ++ if (uncompress_func(d_start, &dstlen, s_start, s_len) != Z_OK) ++ return (-1); ++ ++ return (0); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/lzjb.c linux-3.2.33-go/fs/zfs/zfs/lzjb.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/lzjb.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/lzjb.c 2012-11-16 23:25:34.348039346 +0100 +@@ -0,0 +1,128 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* ++ * We keep our own copy of this algorithm for 3 main reasons: ++ * 1. If we didn't, anyone modifying common/os/compress.c would ++ * directly break our on disk format ++ * 2. Our version of lzjb does not have a number of checks that the ++ * common/os version needs and uses ++ * 3. We initialize the lempel to ensure deterministic results, ++ * so that identical blocks can always be deduplicated. ++ * In particular, we are adding the "feature" that compress() can ++ * take a destination buffer size and returns the compressed length, or the ++ * source length if compression would overflow the destination buffer. 
++ */ ++ ++#include ++ ++#define MATCH_BITS 6 ++#define MATCH_MIN 3 ++#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1)) ++#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) ++#define LEMPEL_SIZE 1024 ++ ++/*ARGSUSED*/ ++size_t ++lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) ++{ ++ uchar_t *src = s_start; ++ uchar_t *dst = d_start; ++ uchar_t *cpy, *copymap = NULL; ++ int copymask = 1 << (NBBY - 1); ++ int mlen, offset, hash; ++ uint16_t *hp; ++ uint16_t *lempel; ++ ++ lempel = kmem_zalloc(LEMPEL_SIZE * sizeof (uint16_t), KM_PUSHPAGE); ++ while (src < (uchar_t *)s_start + s_len) { ++ if ((copymask <<= 1) == (1 << NBBY)) { ++ if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) { ++ kmem_free(lempel, LEMPEL_SIZE*sizeof(uint16_t)); ++ return (s_len); ++ } ++ copymask = 1; ++ copymap = dst; ++ *dst++ = 0; ++ } ++ if (src > (uchar_t *)s_start + s_len - MATCH_MAX) { ++ *dst++ = *src++; ++ continue; ++ } ++ hash = (src[0] << 16) + (src[1] << 8) + src[2]; ++ hash += hash >> 9; ++ hash += hash >> 5; ++ hp = &lempel[hash & (LEMPEL_SIZE - 1)]; ++ offset = (intptr_t)(src - *hp) & OFFSET_MASK; ++ *hp = (uint16_t)(uintptr_t)src; ++ cpy = src - offset; ++ if (cpy >= (uchar_t *)s_start && cpy != src && ++ src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) { ++ *copymap |= copymask; ++ for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++) ++ if (src[mlen] != cpy[mlen]) ++ break; ++ *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | ++ (offset >> NBBY); ++ *dst++ = (uchar_t)offset; ++ src += mlen; ++ } else { ++ *dst++ = *src++; ++ } ++ } ++ ++ kmem_free(lempel, LEMPEL_SIZE * sizeof (uint16_t)); ++ return (dst - (uchar_t *)d_start); ++} ++ ++/*ARGSUSED*/ ++int ++lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) ++{ ++ uchar_t *src = s_start; ++ uchar_t *dst = d_start; ++ uchar_t *d_end = (uchar_t *)d_start + d_len; ++ uchar_t *cpy, copymap = 0; ++ int copymask = 1 << (NBBY - 1); ++ ++ while (dst < d_end) { ++ if ((copymask <<= 1) == (1 << NBBY)) { ++ copymask = 1; ++ copymap = *src++; ++ } ++ if (copymap & copymask) { ++ int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN; ++ int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK; ++ src += 2; ++ if ((cpy = dst - offset) < (uchar_t *)d_start) ++ return (-1); ++ while (--mlen >= 0 && dst < d_end) ++ *dst++ = *cpy++; ++ } else { ++ *dst++ = *src++; ++ } ++ } ++ return (0); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/Makefile linux-3.2.33-go/fs/zfs/zfs/Makefile +--- linux-3.2.33-go.orig/fs/zfs/zfs/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/Makefile 2012-11-16 23:25:34.374039048 +0100 +@@ -0,0 +1,91 @@ ++MODULE := zfs ++ ++EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) -Wno-unused-but-set-variable -DHAVE_SPL -D_KERNEL -DTEXT_DOMAIN=\"zfs-linux-kernel\" -DNDEBUG ++ ++obj-$(CONFIG_ZFS) := $(MODULE).o ++ ++$(MODULE)-objs += arc.o ++$(MODULE)-objs += bplist.o ++$(MODULE)-objs += bpobj.o ++$(MODULE)-objs += dbuf.o ++$(MODULE)-objs += ddt.o ++$(MODULE)-objs += ddt_zap.o ++$(MODULE)-objs += dmu.o ++$(MODULE)-objs += dmu_diff.o ++$(MODULE)-objs += dmu_object.o ++$(MODULE)-objs += dmu_objset.o ++$(MODULE)-objs += dmu_send.o ++$(MODULE)-objs += dmu_traverse.o ++$(MODULE)-objs += dmu_tx.o ++$(MODULE)-objs += dmu_zfetch.o ++$(MODULE)-objs += dnode.o ++$(MODULE)-objs += dnode_sync.o ++$(MODULE)-objs += dsl_dataset.o ++$(MODULE)-objs += dsl_deadlist.o ++$(MODULE)-objs += dsl_deleg.o ++$(MODULE)-objs += dsl_dir.o ++$(MODULE)-objs += dsl_pool.o ++$(MODULE)-objs += 
dsl_prop.o ++$(MODULE)-objs += dsl_scan.o ++$(MODULE)-objs += dsl_synctask.o ++$(MODULE)-objs += fm.o ++$(MODULE)-objs += gzip.o ++$(MODULE)-objs += lzjb.o ++$(MODULE)-objs += metaslab.o ++$(MODULE)-objs += refcount.o ++$(MODULE)-objs += rrwlock.o ++$(MODULE)-objs += sa.o ++$(MODULE)-objs += sha256.o ++$(MODULE)-objs += spa.o ++$(MODULE)-objs += spa_boot.o ++$(MODULE)-objs += spa_config.o ++$(MODULE)-objs += spa_errlog.o ++$(MODULE)-objs += spa_history.o ++$(MODULE)-objs += spa_misc.o ++$(MODULE)-objs += space_map.o ++$(MODULE)-objs += txg.o ++$(MODULE)-objs += uberblock.o ++$(MODULE)-objs += unique.o ++$(MODULE)-objs += vdev.o ++$(MODULE)-objs += vdev_cache.o ++$(MODULE)-objs += vdev_disk.o ++$(MODULE)-objs += vdev_file.o ++$(MODULE)-objs += vdev_label.o ++$(MODULE)-objs += vdev_mirror.o ++$(MODULE)-objs += vdev_missing.o ++$(MODULE)-objs += vdev_queue.o ++$(MODULE)-objs += vdev_raidz.o ++$(MODULE)-objs += vdev_root.o ++$(MODULE)-objs += zap.o ++$(MODULE)-objs += zap_leaf.o ++$(MODULE)-objs += zap_micro.o ++$(MODULE)-objs += zfs_acl.o ++$(MODULE)-objs += zfs_byteswap.o ++$(MODULE)-objs += zfs_ctldir.o ++$(MODULE)-objs += zfs_debug.o ++$(MODULE)-objs += zfs_dir.o ++$(MODULE)-objs += zfs_fm.o ++$(MODULE)-objs += zfs_fuid.o ++$(MODULE)-objs += zfs_ioctl.o ++$(MODULE)-objs += zfs_log.o ++$(MODULE)-objs += zfs_onexit.o ++$(MODULE)-objs += zfs_replay.o ++$(MODULE)-objs += zfs_rlock.o ++$(MODULE)-objs += zfs_sa.o ++$(MODULE)-objs += zfs_vfsops.o ++$(MODULE)-objs += zfs_vnops.o ++$(MODULE)-objs += zfs_znode.o ++$(MODULE)-objs += zil.o ++$(MODULE)-objs += zio.o ++$(MODULE)-objs += zio_checksum.o ++$(MODULE)-objs += zio_compress.o ++$(MODULE)-objs += zio_inject.o ++$(MODULE)-objs += zle.o ++$(MODULE)-objs += zpl_ctldir.o ++$(MODULE)-objs += zpl_export.o ++$(MODULE)-objs += zpl_file.o ++$(MODULE)-objs += zpl_inode.o ++$(MODULE)-objs += zpl_super.o ++$(MODULE)-objs += zpl_xattr.o ++$(MODULE)-objs += zrlock.o ++$(MODULE)-objs += zvol.o +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/Makefile.in linux-3.2.33-go/fs/zfs/zfs/Makefile.in +--- linux-3.2.33-go.orig/fs/zfs/zfs/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/Makefile.in 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,91 @@ ++MODULE := zfs ++ ++EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ ++ ++obj-$(CONFIG_ZFS) := $(MODULE).o ++ ++$(MODULE)-objs += @top_srcdir@/module/zfs/arc.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/ddt.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/ddt_zap.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dmu.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_diff.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_object.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_objset.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_send.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_traverse.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_tx.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dmu_zfetch.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dnode.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dnode_sync.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_dataset.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_deadlist.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_deleg.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_dir.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_pool.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_prop.o 
++$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_scan.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_synctask.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/fm.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/gzip.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/lzjb.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/metaslab.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/refcount.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/rrwlock.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/sa.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/sha256.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/spa.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/spa_boot.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/spa_config.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/txg.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/unique.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/vdev.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_cache.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_disk.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_file.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_label.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_mirror.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_missing.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_queue.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_raidz.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/vdev_root.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zap.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zap_leaf.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zap_micro.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_acl.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_byteswap.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_ctldir.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_debug.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_dir.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_fm.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_fuid.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_ioctl.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_log.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_onexit.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_replay.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_rlock.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_sa.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_vfsops.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_vnops.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zfs_znode.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zil.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zio.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zio_checksum.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zio_compress.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zio_inject.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zle.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zpl_ctldir.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zpl_export.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zpl_file.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zpl_inode.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zpl_super.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zpl_xattr.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zrlock.o ++$(MODULE)-objs += @top_srcdir@/module/zfs/zvol.o +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/metaslab.c linux-3.2.33-go/fs/zfs/zfs/metaslab.c 
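The lzjb.c hunk above encodes each back-reference as a two-byte copy tuple, with MATCH_BITS, MATCH_MIN, NBBY and OFFSET_MASK deciding how match length and offset are packed. The standalone sketch below (illustrative only, not part of the patch; the values mlen = 17 and offset = 300 are hypothetical) round-trips one tuple through the same encode/decode arithmetic used by lzjb_compress() and lzjb_decompress():

/*
 * Illustrative sketch of the LZJB copy-tuple layout, using the same
 * constants as the lzjb.c hunk above.  Not part of the patch.
 */
#include <stdint.h>
#include <stdio.h>

#define	MATCH_BITS	6
#define	MATCH_MIN	3
#define	NBBY		8
#define	OFFSET_MASK	((1 << (16 - MATCH_BITS)) - 1)

int
main(void)
{
	int mlen = 17, offset = 300;	/* hypothetical match */
	uint8_t b0, b1;

	/* encode, as in lzjb_compress(): length in the top 6 bits of b0 */
	b0 = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | (offset >> NBBY);
	b1 = (uint8_t)offset;

	/* decode, as in lzjb_decompress(): recovers mlen=17, offset=300 */
	printf("mlen=%d offset=%d\n",
	    (b0 >> (NBBY - MATCH_BITS)) + MATCH_MIN,
	    ((b0 << NBBY) | b1) & OFFSET_MASK);
	return (0);
}

With MATCH_BITS = 6 the match length is capped at 66 bytes (MATCH_MAX) and the remaining 10 bits cap the back-reference offset at 1023 bytes (OFFSET_MASK), which is why shorter or farther matches are emitted as literal bytes.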
+--- linux-3.2.33-go.orig/fs/zfs/zfs/metaslab.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/metaslab.c 2012-11-16 23:25:34.350039322 +0100 +@@ -0,0 +1,1748 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2012 by Delphix. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define WITH_DF_BLOCK_ALLOCATOR ++ ++/* ++ * Allow allocations to switch to gang blocks quickly. We do this to ++ * avoid having to load lots of space_maps in a given txg. There are, ++ * however, some cases where we want to avoid "fast" ganging and instead ++ * we want to do an exhaustive search of all metaslabs on this device. ++ * Currently we don't allow any gang, zil, or dump device related allocations ++ * to "fast" gang. ++ */ ++#define CAN_FASTGANG(flags) \ ++ (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ ++ METASLAB_GANG_AVOID))) ++ ++uint64_t metaslab_aliquot = 512ULL << 10; ++uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ ++ ++/* ++ * This value defines the number of allowed allocation failures per vdev. ++ * If a device reaches this threshold in a given txg then we consider skipping ++ * allocations on that device. ++ */ ++int zfs_mg_alloc_failures; ++ ++/* ++ * Metaslab debugging: when set, keeps all space maps in core to verify frees. ++ */ ++static int metaslab_debug = 0; ++ ++/* ++ * Minimum size which forces the dynamic allocator to change ++ * it's allocation strategy. Once the space map cannot satisfy ++ * an allocation of this size then it switches to using more ++ * aggressive strategy (i.e search by size rather than offset). ++ */ ++uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; ++ ++/* ++ * The minimum free space, in percent, which must be available ++ * in a space map to continue allocations in a first-fit fashion. ++ * Once the space_map's free space drops below this level we dynamically ++ * switch to using best-fit allocations. ++ */ ++int metaslab_df_free_pct = 4; ++ ++/* ++ * A metaslab is considered "free" if it contains a contiguous ++ * segment which is greater than metaslab_min_alloc_size. ++ */ ++uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; ++ ++/* ++ * Max number of space_maps to prefetch. ++ */ ++int metaslab_prefetch_limit = SPA_DVAS_PER_BP; ++ ++/* ++ * Percentage bonus multiplier for metaslabs that are in the bonus area. 
++ */ ++int metaslab_smo_bonus_pct = 150; ++ ++/* ++ * ========================================================================== ++ * Metaslab classes ++ * ========================================================================== ++ */ ++metaslab_class_t * ++metaslab_class_create(spa_t *spa, space_map_ops_t *ops) ++{ ++ metaslab_class_t *mc; ++ ++ mc = kmem_zalloc(sizeof (metaslab_class_t), KM_PUSHPAGE); ++ ++ mc->mc_spa = spa; ++ mc->mc_rotor = NULL; ++ mc->mc_ops = ops; ++ mutex_init(&mc->mc_fastwrite_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ return (mc); ++} ++ ++void ++metaslab_class_destroy(metaslab_class_t *mc) ++{ ++ ASSERT(mc->mc_rotor == NULL); ++ ASSERT(mc->mc_alloc == 0); ++ ASSERT(mc->mc_deferred == 0); ++ ASSERT(mc->mc_space == 0); ++ ASSERT(mc->mc_dspace == 0); ++ ++ mutex_destroy(&mc->mc_fastwrite_lock); ++ kmem_free(mc, sizeof (metaslab_class_t)); ++} ++ ++int ++metaslab_class_validate(metaslab_class_t *mc) ++{ ++ metaslab_group_t *mg; ++ vdev_t *vd; ++ ++ /* ++ * Must hold one of the spa_config locks. ++ */ ++ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || ++ spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); ++ ++ if ((mg = mc->mc_rotor) == NULL) ++ return (0); ++ ++ do { ++ vd = mg->mg_vd; ++ ASSERT(vd->vdev_mg != NULL); ++ ASSERT3P(vd->vdev_top, ==, vd); ++ ASSERT3P(mg->mg_class, ==, mc); ++ ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); ++ } while ((mg = mg->mg_next) != mc->mc_rotor); ++ ++ return (0); ++} ++ ++void ++metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, ++ int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) ++{ ++ atomic_add_64(&mc->mc_alloc, alloc_delta); ++ atomic_add_64(&mc->mc_deferred, defer_delta); ++ atomic_add_64(&mc->mc_space, space_delta); ++ atomic_add_64(&mc->mc_dspace, dspace_delta); ++} ++ ++uint64_t ++metaslab_class_get_alloc(metaslab_class_t *mc) ++{ ++ return (mc->mc_alloc); ++} ++ ++uint64_t ++metaslab_class_get_deferred(metaslab_class_t *mc) ++{ ++ return (mc->mc_deferred); ++} ++ ++uint64_t ++metaslab_class_get_space(metaslab_class_t *mc) ++{ ++ return (mc->mc_space); ++} ++ ++uint64_t ++metaslab_class_get_dspace(metaslab_class_t *mc) ++{ ++ return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); ++} ++ ++/* ++ * ========================================================================== ++ * Metaslab groups ++ * ========================================================================== ++ */ ++static int ++metaslab_compare(const void *x1, const void *x2) ++{ ++ const metaslab_t *m1 = x1; ++ const metaslab_t *m2 = x2; ++ ++ if (m1->ms_weight < m2->ms_weight) ++ return (1); ++ if (m1->ms_weight > m2->ms_weight) ++ return (-1); ++ ++ /* ++ * If the weights are identical, use the offset to force uniqueness. 
++ */ ++ if (m1->ms_map.sm_start < m2->ms_map.sm_start) ++ return (-1); ++ if (m1->ms_map.sm_start > m2->ms_map.sm_start) ++ return (1); ++ ++ ASSERT3P(m1, ==, m2); ++ ++ return (0); ++} ++ ++metaslab_group_t * ++metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) ++{ ++ metaslab_group_t *mg; ++ ++ mg = kmem_zalloc(sizeof (metaslab_group_t), KM_PUSHPAGE); ++ mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); ++ avl_create(&mg->mg_metaslab_tree, metaslab_compare, ++ sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); ++ mg->mg_vd = vd; ++ mg->mg_class = mc; ++ mg->mg_activation_count = 0; ++ ++ return (mg); ++} ++ ++void ++metaslab_group_destroy(metaslab_group_t *mg) ++{ ++ ASSERT(mg->mg_prev == NULL); ++ ASSERT(mg->mg_next == NULL); ++ /* ++ * We may have gone below zero with the activation count ++ * either because we never activated in the first place or ++ * because we're done, and possibly removing the vdev. ++ */ ++ ASSERT(mg->mg_activation_count <= 0); ++ ++ avl_destroy(&mg->mg_metaslab_tree); ++ mutex_destroy(&mg->mg_lock); ++ kmem_free(mg, sizeof (metaslab_group_t)); ++} ++ ++void ++metaslab_group_activate(metaslab_group_t *mg) ++{ ++ metaslab_class_t *mc = mg->mg_class; ++ metaslab_group_t *mgprev, *mgnext; ++ ++ ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); ++ ++ ASSERT(mc->mc_rotor != mg); ++ ASSERT(mg->mg_prev == NULL); ++ ASSERT(mg->mg_next == NULL); ++ ASSERT(mg->mg_activation_count <= 0); ++ ++ if (++mg->mg_activation_count <= 0) ++ return; ++ ++ mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); ++ ++ if ((mgprev = mc->mc_rotor) == NULL) { ++ mg->mg_prev = mg; ++ mg->mg_next = mg; ++ } else { ++ mgnext = mgprev->mg_next; ++ mg->mg_prev = mgprev; ++ mg->mg_next = mgnext; ++ mgprev->mg_next = mg; ++ mgnext->mg_prev = mg; ++ } ++ mc->mc_rotor = mg; ++} ++ ++void ++metaslab_group_passivate(metaslab_group_t *mg) ++{ ++ metaslab_class_t *mc = mg->mg_class; ++ metaslab_group_t *mgprev, *mgnext; ++ ++ ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); ++ ++ if (--mg->mg_activation_count != 0) { ++ ASSERT(mc->mc_rotor != mg); ++ ASSERT(mg->mg_prev == NULL); ++ ASSERT(mg->mg_next == NULL); ++ ASSERT(mg->mg_activation_count < 0); ++ return; ++ } ++ ++ mgprev = mg->mg_prev; ++ mgnext = mg->mg_next; ++ ++ if (mg == mgnext) { ++ mc->mc_rotor = NULL; ++ } else { ++ mc->mc_rotor = mgnext; ++ mgprev->mg_next = mgnext; ++ mgnext->mg_prev = mgprev; ++ } ++ ++ mg->mg_prev = NULL; ++ mg->mg_next = NULL; ++} ++ ++static void ++metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) ++{ ++ mutex_enter(&mg->mg_lock); ++ ASSERT(msp->ms_group == NULL); ++ msp->ms_group = mg; ++ msp->ms_weight = 0; ++ avl_add(&mg->mg_metaslab_tree, msp); ++ mutex_exit(&mg->mg_lock); ++} ++ ++static void ++metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) ++{ ++ mutex_enter(&mg->mg_lock); ++ ASSERT(msp->ms_group == mg); ++ avl_remove(&mg->mg_metaslab_tree, msp); ++ msp->ms_group = NULL; ++ mutex_exit(&mg->mg_lock); ++} ++ ++static void ++metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) ++{ ++ /* ++ * Although in principle the weight can be any value, in ++ * practice we do not use values in the range [1, 510]. 
++ */ ++ ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); ++ ASSERT(MUTEX_HELD(&msp->ms_lock)); ++ ++ mutex_enter(&mg->mg_lock); ++ ASSERT(msp->ms_group == mg); ++ avl_remove(&mg->mg_metaslab_tree, msp); ++ msp->ms_weight = weight; ++ avl_add(&mg->mg_metaslab_tree, msp); ++ mutex_exit(&mg->mg_lock); ++} ++ ++/* ++ * ========================================================================== ++ * Common allocator routines ++ * ========================================================================== ++ */ ++static int ++metaslab_segsize_compare(const void *x1, const void *x2) ++{ ++ const space_seg_t *s1 = x1; ++ const space_seg_t *s2 = x2; ++ uint64_t ss_size1 = s1->ss_end - s1->ss_start; ++ uint64_t ss_size2 = s2->ss_end - s2->ss_start; ++ ++ if (ss_size1 < ss_size2) ++ return (-1); ++ if (ss_size1 > ss_size2) ++ return (1); ++ ++ if (s1->ss_start < s2->ss_start) ++ return (-1); ++ if (s1->ss_start > s2->ss_start) ++ return (1); ++ ++ return (0); ++} ++ ++#if defined(WITH_FF_BLOCK_ALLOCATOR) || \ ++ defined(WITH_DF_BLOCK_ALLOCATOR) || \ ++ defined(WITH_CDF_BLOCK_ALLOCATOR) ++/* ++ * This is a helper function that can be used by the allocator to find ++ * a suitable block to allocate. This will search the specified AVL ++ * tree looking for a block that matches the specified criteria. ++ */ ++static uint64_t ++metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, ++ uint64_t align) ++{ ++ space_seg_t *ss, ssearch; ++ avl_index_t where; ++ ++ ssearch.ss_start = *cursor; ++ ssearch.ss_end = *cursor + size; ++ ++ ss = avl_find(t, &ssearch, &where); ++ if (ss == NULL) ++ ss = avl_nearest(t, where, AVL_AFTER); ++ ++ while (ss != NULL) { ++ uint64_t offset = P2ROUNDUP(ss->ss_start, align); ++ ++ if (offset + size <= ss->ss_end) { ++ *cursor = offset + size; ++ return (offset); ++ } ++ ss = AVL_NEXT(t, ss); ++ } ++ ++ /* ++ * If we know we've searched the whole map (*cursor == 0), give up. ++ * Otherwise, reset the cursor to the beginning and try again. ++ */ ++ if (*cursor == 0) ++ return (-1ULL); ++ ++ *cursor = 0; ++ return (metaslab_block_picker(t, cursor, size, align)); ++} ++#endif /* WITH_FF/DF/CDF_BLOCK_ALLOCATOR */ ++ ++static void ++metaslab_pp_load(space_map_t *sm) ++{ ++ space_seg_t *ss; ++ ++ ASSERT(sm->sm_ppd == NULL); ++ sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_PUSHPAGE); ++ ++ sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_PUSHPAGE); ++ avl_create(sm->sm_pp_root, metaslab_segsize_compare, ++ sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); ++ ++ for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) ++ avl_add(sm->sm_pp_root, ss); ++} ++ ++static void ++metaslab_pp_unload(space_map_t *sm) ++{ ++ void *cookie = NULL; ++ ++ kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); ++ sm->sm_ppd = NULL; ++ ++ while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { ++ /* tear down the tree */ ++ } ++ ++ avl_destroy(sm->sm_pp_root); ++ kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); ++ sm->sm_pp_root = NULL; ++} ++ ++/* ARGSUSED */ ++static void ++metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) ++{ ++ /* No need to update cursor */ ++} ++ ++/* ARGSUSED */ ++static void ++metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) ++{ ++ /* No need to update cursor */ ++} ++ ++/* ++ * Return the maximum contiguous segment within the metaslab. 
++ */ ++uint64_t ++metaslab_pp_maxsize(space_map_t *sm) ++{ ++ avl_tree_t *t = sm->sm_pp_root; ++ space_seg_t *ss; ++ ++ if (t == NULL || (ss = avl_last(t)) == NULL) ++ return (0ULL); ++ ++ return (ss->ss_end - ss->ss_start); ++} ++ ++#if defined(WITH_FF_BLOCK_ALLOCATOR) ++/* ++ * ========================================================================== ++ * The first-fit block allocator ++ * ========================================================================== ++ */ ++static uint64_t ++metaslab_ff_alloc(space_map_t *sm, uint64_t size) ++{ ++ avl_tree_t *t = &sm->sm_root; ++ uint64_t align = size & -size; ++ uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; ++ ++ return (metaslab_block_picker(t, cursor, size, align)); ++} ++ ++/* ARGSUSED */ ++boolean_t ++metaslab_ff_fragmented(space_map_t *sm) ++{ ++ return (B_TRUE); ++} ++ ++static space_map_ops_t metaslab_ff_ops = { ++ metaslab_pp_load, ++ metaslab_pp_unload, ++ metaslab_ff_alloc, ++ metaslab_pp_claim, ++ metaslab_pp_free, ++ metaslab_pp_maxsize, ++ metaslab_ff_fragmented ++}; ++ ++space_map_ops_t *zfs_metaslab_ops = &metaslab_ff_ops; ++#endif /* WITH_FF_BLOCK_ALLOCATOR */ ++ ++#if defined(WITH_DF_BLOCK_ALLOCATOR) ++/* ++ * ========================================================================== ++ * Dynamic block allocator - ++ * Uses the first fit allocation scheme until space get low and then ++ * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold ++ * and metaslab_df_free_pct to determine when to switch the allocation scheme. ++ * ========================================================================== ++ */ ++static uint64_t ++metaslab_df_alloc(space_map_t *sm, uint64_t size) ++{ ++ avl_tree_t *t = &sm->sm_root; ++ uint64_t align = size & -size; ++ uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; ++ uint64_t max_size = metaslab_pp_maxsize(sm); ++ int free_pct = sm->sm_space * 100 / sm->sm_size; ++ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); ++ ++ if (max_size < size) ++ return (-1ULL); ++ ++ /* ++ * If we're running low on space switch to using the size ++ * sorted AVL tree (best-fit). 
++ */ ++ if (max_size < metaslab_df_alloc_threshold || ++ free_pct < metaslab_df_free_pct) { ++ t = sm->sm_pp_root; ++ *cursor = 0; ++ } ++ ++ return (metaslab_block_picker(t, cursor, size, 1ULL)); ++} ++ ++static boolean_t ++metaslab_df_fragmented(space_map_t *sm) ++{ ++ uint64_t max_size = metaslab_pp_maxsize(sm); ++ int free_pct = sm->sm_space * 100 / sm->sm_size; ++ ++ if (max_size >= metaslab_df_alloc_threshold && ++ free_pct >= metaslab_df_free_pct) ++ return (B_FALSE); ++ ++ return (B_TRUE); ++} ++ ++static space_map_ops_t metaslab_df_ops = { ++ metaslab_pp_load, ++ metaslab_pp_unload, ++ metaslab_df_alloc, ++ metaslab_pp_claim, ++ metaslab_pp_free, ++ metaslab_pp_maxsize, ++ metaslab_df_fragmented ++}; ++ ++space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; ++#endif /* WITH_DF_BLOCK_ALLOCATOR */ ++ ++/* ++ * ========================================================================== ++ * Other experimental allocators ++ * ========================================================================== ++ */ ++#if defined(WITH_CDF_BLOCK_ALLOCATOR) ++static uint64_t ++metaslab_cdf_alloc(space_map_t *sm, uint64_t size) ++{ ++ avl_tree_t *t = &sm->sm_root; ++ uint64_t *cursor = (uint64_t *)sm->sm_ppd; ++ uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; ++ uint64_t max_size = metaslab_pp_maxsize(sm); ++ uint64_t rsize = size; ++ uint64_t offset = 0; ++ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); ++ ++ if (max_size < size) ++ return (-1ULL); ++ ++ ASSERT3U(*extent_end, >=, *cursor); ++ ++ /* ++ * If we're running low on space switch to using the size ++ * sorted AVL tree (best-fit). ++ */ ++ if ((*cursor + size) > *extent_end) { ++ ++ t = sm->sm_pp_root; ++ *cursor = *extent_end = 0; ++ ++ if (max_size > 2 * SPA_MAXBLOCKSIZE) ++ rsize = MIN(metaslab_min_alloc_size, max_size); ++ offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); ++ if (offset != -1) ++ *cursor = offset + size; ++ } else { ++ offset = metaslab_block_picker(t, cursor, rsize, 1ULL); ++ } ++ ASSERT3U(*cursor, <=, *extent_end); ++ return (offset); ++} ++ ++static boolean_t ++metaslab_cdf_fragmented(space_map_t *sm) ++{ ++ uint64_t max_size = metaslab_pp_maxsize(sm); ++ ++ if (max_size > (metaslab_min_alloc_size * 10)) ++ return (B_FALSE); ++ return (B_TRUE); ++} ++ ++static space_map_ops_t metaslab_cdf_ops = { ++ metaslab_pp_load, ++ metaslab_pp_unload, ++ metaslab_cdf_alloc, ++ metaslab_pp_claim, ++ metaslab_pp_free, ++ metaslab_pp_maxsize, ++ metaslab_cdf_fragmented ++}; ++ ++space_map_ops_t *zfs_metaslab_ops = &metaslab_cdf_ops; ++#endif /* WITH_CDF_BLOCK_ALLOCATOR */ ++ ++#if defined(WITH_NDF_BLOCK_ALLOCATOR) ++uint64_t metaslab_ndf_clump_shift = 4; ++ ++static uint64_t ++metaslab_ndf_alloc(space_map_t *sm, uint64_t size) ++{ ++ avl_tree_t *t = &sm->sm_root; ++ avl_index_t where; ++ space_seg_t *ss, ssearch; ++ uint64_t hbit = highbit(size); ++ uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1; ++ uint64_t max_size = metaslab_pp_maxsize(sm); ++ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); ++ ++ if (max_size < size) ++ return (-1ULL); ++ ++ ssearch.ss_start = *cursor; ++ ssearch.ss_end = *cursor + size; ++ ++ ss = avl_find(t, &ssearch, &where); ++ if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { ++ t = sm->sm_pp_root; ++ ++ ssearch.ss_start = 0; ++ ssearch.ss_end = MIN(max_size, ++ 1ULL << (hbit + metaslab_ndf_clump_shift)); ++ ss = avl_find(t, &ssearch, &where); ++ if (ss == 
NULL) ++ ss = avl_nearest(t, where, AVL_AFTER); ++ ASSERT(ss != NULL); ++ } ++ ++ if (ss != NULL) { ++ if (ss->ss_start + size <= ss->ss_end) { ++ *cursor = ss->ss_start + size; ++ return (ss->ss_start); ++ } ++ } ++ return (-1ULL); ++} ++ ++static boolean_t ++metaslab_ndf_fragmented(space_map_t *sm) ++{ ++ uint64_t max_size = metaslab_pp_maxsize(sm); ++ ++ if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift)) ++ return (B_FALSE); ++ return (B_TRUE); ++} ++ ++ ++static space_map_ops_t metaslab_ndf_ops = { ++ metaslab_pp_load, ++ metaslab_pp_unload, ++ metaslab_ndf_alloc, ++ metaslab_pp_claim, ++ metaslab_pp_free, ++ metaslab_pp_maxsize, ++ metaslab_ndf_fragmented ++}; ++ ++space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; ++#endif /* WITH_NDF_BLOCK_ALLOCATOR */ ++ ++/* ++ * ========================================================================== ++ * Metaslabs ++ * ========================================================================== ++ */ ++metaslab_t * ++metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, ++ uint64_t start, uint64_t size, uint64_t txg) ++{ ++ vdev_t *vd = mg->mg_vd; ++ metaslab_t *msp; ++ ++ msp = kmem_zalloc(sizeof (metaslab_t), KM_PUSHPAGE); ++ mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ msp->ms_smo_syncing = *smo; ++ ++ /* ++ * We create the main space map here, but we don't create the ++ * allocmaps and freemaps until metaslab_sync_done(). This serves ++ * two purposes: it allows metaslab_sync_done() to detect the ++ * addition of new space; and for debugging, it ensures that we'd ++ * data fault on any attempt to use this metaslab before it's ready. ++ */ ++ space_map_create(&msp->ms_map, start, size, ++ vd->vdev_ashift, &msp->ms_lock); ++ ++ metaslab_group_add(mg, msp); ++ ++ if (metaslab_debug && smo->smo_object != 0) { ++ mutex_enter(&msp->ms_lock); ++ VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, ++ SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); ++ mutex_exit(&msp->ms_lock); ++ } ++ ++ /* ++ * If we're opening an existing pool (txg == 0) or creating ++ * a new one (txg == TXG_INITIAL), all space is available now. ++ * If we're adding space to an existing pool, the new space ++ * does not become available until after this txg has synced. 
++ */ ++ if (txg <= TXG_INITIAL) ++ metaslab_sync_done(msp, 0); ++ ++ if (txg != 0) { ++ vdev_dirty(vd, 0, NULL, txg); ++ vdev_dirty(vd, VDD_METASLAB, msp, txg); ++ } ++ ++ return (msp); ++} ++ ++void ++metaslab_fini(metaslab_t *msp) ++{ ++ metaslab_group_t *mg = msp->ms_group; ++ int t; ++ ++ vdev_space_update(mg->mg_vd, ++ -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); ++ ++ metaslab_group_remove(mg, msp); ++ ++ mutex_enter(&msp->ms_lock); ++ ++ space_map_unload(&msp->ms_map); ++ space_map_destroy(&msp->ms_map); ++ ++ for (t = 0; t < TXG_SIZE; t++) { ++ space_map_destroy(&msp->ms_allocmap[t]); ++ space_map_destroy(&msp->ms_freemap[t]); ++ } ++ ++ for (t = 0; t < TXG_DEFER_SIZE; t++) ++ space_map_destroy(&msp->ms_defermap[t]); ++ ++ ASSERT3S(msp->ms_deferspace, ==, 0); ++ ++ mutex_exit(&msp->ms_lock); ++ mutex_destroy(&msp->ms_lock); ++ ++ kmem_free(msp, sizeof (metaslab_t)); ++} ++ ++#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) ++#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) ++#define METASLAB_ACTIVE_MASK \ ++ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) ++ ++static uint64_t ++metaslab_weight(metaslab_t *msp) ++{ ++ metaslab_group_t *mg = msp->ms_group; ++ space_map_t *sm = &msp->ms_map; ++ space_map_obj_t *smo = &msp->ms_smo; ++ vdev_t *vd = mg->mg_vd; ++ uint64_t weight, space; ++ ++ ASSERT(MUTEX_HELD(&msp->ms_lock)); ++ ++ /* ++ * The baseline weight is the metaslab's free space. ++ */ ++ space = sm->sm_size - smo->smo_alloc; ++ weight = space; ++ ++ /* ++ * Modern disks have uniform bit density and constant angular velocity. ++ * Therefore, the outer recording zones are faster (higher bandwidth) ++ * than the inner zones by the ratio of outer to inner track diameter, ++ * which is typically around 2:1. We account for this by assigning ++ * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). ++ * In effect, this means that we'll select the metaslab with the most ++ * free bandwidth rather than simply the one with the most free space. ++ */ ++ weight = 2 * weight - ++ ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; ++ ASSERT(weight >= space && weight <= 2 * space); ++ ++ /* ++ * For locality, assign higher weight to metaslabs which have ++ * a lower offset than what we've already activated. ++ */ ++ if (sm->sm_start <= mg->mg_bonus_area) ++ weight *= (metaslab_smo_bonus_pct / 100); ++ ASSERT(weight >= space && ++ weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); ++ ++ if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { ++ /* ++ * If this metaslab is one we're actively using, adjust its ++ * weight to make it preferable to any inactive metaslab so ++ * we'll polish it off. 
++ */ ++ weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); ++ } ++ return (weight); ++} ++ ++static void ++metaslab_prefetch(metaslab_group_t *mg) ++{ ++ spa_t *spa = mg->mg_vd->vdev_spa; ++ metaslab_t *msp; ++ avl_tree_t *t = &mg->mg_metaslab_tree; ++ int m; ++ ++ mutex_enter(&mg->mg_lock); ++ ++ /* ++ * Prefetch the next potential metaslabs ++ */ ++ for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { ++ space_map_t *sm = &msp->ms_map; ++ space_map_obj_t *smo = &msp->ms_smo; ++ ++ /* If we have reached our prefetch limit then we're done */ ++ if (m >= metaslab_prefetch_limit) ++ break; ++ ++ if (!sm->sm_loaded && smo->smo_object != 0) { ++ mutex_exit(&mg->mg_lock); ++ dmu_prefetch(spa_meta_objset(spa), smo->smo_object, ++ 0ULL, smo->smo_objsize); ++ mutex_enter(&mg->mg_lock); ++ } ++ } ++ mutex_exit(&mg->mg_lock); ++} ++ ++static int ++metaslab_activate(metaslab_t *msp, uint64_t activation_weight) ++{ ++ metaslab_group_t *mg = msp->ms_group; ++ space_map_t *sm = &msp->ms_map; ++ space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; ++ int t; ++ ++ ASSERT(MUTEX_HELD(&msp->ms_lock)); ++ ++ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { ++ space_map_load_wait(sm); ++ if (!sm->sm_loaded) { ++ int error = space_map_load(sm, sm_ops, SM_FREE, ++ &msp->ms_smo, ++ spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); ++ if (error) { ++ metaslab_group_sort(msp->ms_group, msp, 0); ++ return (error); ++ } ++ for (t = 0; t < TXG_DEFER_SIZE; t++) ++ space_map_walk(&msp->ms_defermap[t], ++ space_map_claim, sm); ++ ++ } ++ ++ /* ++ * Track the bonus area as we activate new metaslabs. ++ */ ++ if (sm->sm_start > mg->mg_bonus_area) { ++ mutex_enter(&mg->mg_lock); ++ mg->mg_bonus_area = sm->sm_start; ++ mutex_exit(&mg->mg_lock); ++ } ++ ++ metaslab_group_sort(msp->ms_group, msp, ++ msp->ms_weight | activation_weight); ++ } ++ ASSERT(sm->sm_loaded); ++ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); ++ ++ return (0); ++} ++ ++static void ++metaslab_passivate(metaslab_t *msp, uint64_t size) ++{ ++ /* ++ * If size < SPA_MINBLOCKSIZE, then we will not allocate from ++ * this metaslab again. In that case, it had better be empty, ++ * or we would be leaving space on the table. ++ */ ++ ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); ++ metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); ++ ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); ++} ++ ++/* ++ * Write a metaslab to disk in the context of the specified transaction group. ++ */ ++void ++metaslab_sync(metaslab_t *msp, uint64_t txg) ++{ ++ vdev_t *vd = msp->ms_group->mg_vd; ++ spa_t *spa = vd->vdev_spa; ++ objset_t *mos = spa_meta_objset(spa); ++ space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; ++ space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; ++ space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; ++ space_map_t *sm = &msp->ms_map; ++ space_map_obj_t *smo = &msp->ms_smo_syncing; ++ dmu_buf_t *db; ++ dmu_tx_t *tx; ++ int t; ++ ++ ASSERT(!vd->vdev_ishole); ++ ++ if (allocmap->sm_space == 0 && freemap->sm_space == 0) ++ return; ++ ++ /* ++ * The only state that can actually be changing concurrently with ++ * metaslab_sync() is the metaslab's ms_map. No other thread can ++ * be modifying this txg's allocmap, freemap, freed_map, or smo. ++ * Therefore, we only hold ms_lock to satify space_map ASSERTs. ++ * We drop it whenever we call into the DMU, because the DMU ++ * can call down to us (e.g. via zio_free()) at any time. 
++ */ ++ ++ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); ++ ++ if (smo->smo_object == 0) { ++ ASSERT(smo->smo_objsize == 0); ++ ASSERT(smo->smo_alloc == 0); ++ smo->smo_object = dmu_object_alloc(mos, ++ DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, ++ DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); ++ ASSERT(smo->smo_object != 0); ++ dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * ++ (sm->sm_start >> vd->vdev_ms_shift), ++ sizeof (uint64_t), &smo->smo_object, tx); ++ } ++ ++ mutex_enter(&msp->ms_lock); ++ ++ space_map_walk(freemap, space_map_add, freed_map); ++ ++ if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= ++ 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { ++ /* ++ * The in-core space map representation is twice as compact ++ * as the on-disk one, so it's time to condense the latter ++ * by generating a pure allocmap from first principles. ++ * ++ * This metaslab is 100% allocated, ++ * minus the content of the in-core map (sm), ++ * minus what's been freed this txg (freed_map), ++ * minus deferred frees (ms_defermap[]), ++ * minus allocations from txgs in the future ++ * (because they haven't been committed yet). ++ */ ++ space_map_vacate(allocmap, NULL, NULL); ++ space_map_vacate(freemap, NULL, NULL); ++ ++ space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); ++ ++ space_map_walk(sm, space_map_remove, allocmap); ++ space_map_walk(freed_map, space_map_remove, allocmap); ++ ++ for (t = 0; t < TXG_DEFER_SIZE; t++) ++ space_map_walk(&msp->ms_defermap[t], ++ space_map_remove, allocmap); ++ ++ for (t = 1; t < TXG_CONCURRENT_STATES; t++) ++ space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], ++ space_map_remove, allocmap); ++ ++ mutex_exit(&msp->ms_lock); ++ space_map_truncate(smo, mos, tx); ++ mutex_enter(&msp->ms_lock); ++ } ++ ++ space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); ++ space_map_sync(freemap, SM_FREE, smo, mos, tx); ++ ++ mutex_exit(&msp->ms_lock); ++ ++ VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); ++ dmu_buf_will_dirty(db, tx); ++ ASSERT3U(db->db_size, >=, sizeof (*smo)); ++ bcopy(smo, db->db_data, sizeof (*smo)); ++ dmu_buf_rele(db, FTAG); ++ ++ dmu_tx_commit(tx); ++} ++ ++/* ++ * Called after a transaction group has completely synced to mark ++ * all of the metaslab's free space as usable. ++ */ ++void ++metaslab_sync_done(metaslab_t *msp, uint64_t txg) ++{ ++ space_map_obj_t *smo = &msp->ms_smo; ++ space_map_obj_t *smosync = &msp->ms_smo_syncing; ++ space_map_t *sm = &msp->ms_map; ++ space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; ++ space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; ++ metaslab_group_t *mg = msp->ms_group; ++ vdev_t *vd = mg->mg_vd; ++ int64_t alloc_delta, defer_delta; ++ int t; ++ ++ ASSERT(!vd->vdev_ishole); ++ ++ mutex_enter(&msp->ms_lock); ++ ++ /* ++ * If this metaslab is just becoming available, initialize its ++ * allocmaps and freemaps and add its capacity to the vdev. 
++ */ ++ if (freed_map->sm_size == 0) { ++ for (t = 0; t < TXG_SIZE; t++) { ++ space_map_create(&msp->ms_allocmap[t], sm->sm_start, ++ sm->sm_size, sm->sm_shift, sm->sm_lock); ++ space_map_create(&msp->ms_freemap[t], sm->sm_start, ++ sm->sm_size, sm->sm_shift, sm->sm_lock); ++ } ++ ++ for (t = 0; t < TXG_DEFER_SIZE; t++) ++ space_map_create(&msp->ms_defermap[t], sm->sm_start, ++ sm->sm_size, sm->sm_shift, sm->sm_lock); ++ ++ vdev_space_update(vd, 0, 0, sm->sm_size); ++ } ++ ++ alloc_delta = smosync->smo_alloc - smo->smo_alloc; ++ defer_delta = freed_map->sm_space - defer_map->sm_space; ++ ++ vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); ++ ++ ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); ++ ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); ++ ++ /* ++ * If there's a space_map_load() in progress, wait for it to complete ++ * so that we have a consistent view of the in-core space map. ++ * Then, add defer_map (oldest deferred frees) to this map and ++ * transfer freed_map (this txg's frees) to defer_map. ++ */ ++ space_map_load_wait(sm); ++ space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); ++ space_map_vacate(freed_map, space_map_add, defer_map); ++ ++ *smo = *smosync; ++ ++ msp->ms_deferspace += defer_delta; ++ ASSERT3S(msp->ms_deferspace, >=, 0); ++ ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); ++ if (msp->ms_deferspace != 0) { ++ /* ++ * Keep syncing this metaslab until all deferred frees ++ * are back in circulation. ++ */ ++ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); ++ } ++ ++ /* ++ * If the map is loaded but no longer active, evict it as soon as all ++ * future allocations have synced. (If we unloaded it now and then ++ * loaded a moment later, the map wouldn't reflect those allocations.) ++ */ ++ if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { ++ int evictable = 1; ++ ++ for (t = 1; t < TXG_CONCURRENT_STATES; t++) ++ if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) ++ evictable = 0; ++ ++ if (evictable && !metaslab_debug) ++ space_map_unload(sm); ++ } ++ ++ metaslab_group_sort(mg, msp, metaslab_weight(msp)); ++ ++ mutex_exit(&msp->ms_lock); ++} ++ ++void ++metaslab_sync_reassess(metaslab_group_t *mg) ++{ ++ vdev_t *vd = mg->mg_vd; ++ int64_t failures = mg->mg_alloc_failures; ++ int m; ++ ++ /* ++ * Re-evaluate all metaslabs which have lower offsets than the ++ * bonus area. 
++ */ ++ for (m = 0; m < vd->vdev_ms_count; m++) { ++ metaslab_t *msp = vd->vdev_ms[m]; ++ ++ if (msp->ms_map.sm_start > mg->mg_bonus_area) ++ break; ++ ++ mutex_enter(&msp->ms_lock); ++ metaslab_group_sort(mg, msp, metaslab_weight(msp)); ++ mutex_exit(&msp->ms_lock); ++ } ++ ++ atomic_add_64(&mg->mg_alloc_failures, -failures); ++ ++ /* ++ * Prefetch the next potential metaslabs ++ */ ++ metaslab_prefetch(mg); ++} ++ ++static uint64_t ++metaslab_distance(metaslab_t *msp, dva_t *dva) ++{ ++ uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; ++ uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; ++ uint64_t start = msp->ms_map.sm_start >> ms_shift; ++ ++ if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) ++ return (1ULL << 63); ++ ++ if (offset < start) ++ return ((start - offset) << ms_shift); ++ if (offset > start) ++ return ((offset - start) << ms_shift); ++ return (0); ++} ++ ++static uint64_t ++metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, ++ uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) ++{ ++ spa_t *spa = mg->mg_vd->vdev_spa; ++ metaslab_t *msp = NULL; ++ uint64_t offset = -1ULL; ++ avl_tree_t *t = &mg->mg_metaslab_tree; ++ uint64_t activation_weight; ++ uint64_t target_distance; ++ int i; ++ ++ activation_weight = METASLAB_WEIGHT_PRIMARY; ++ for (i = 0; i < d; i++) { ++ if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { ++ activation_weight = METASLAB_WEIGHT_SECONDARY; ++ break; ++ } ++ } ++ ++ for (;;) { ++ boolean_t was_active; ++ ++ mutex_enter(&mg->mg_lock); ++ for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { ++ if (msp->ms_weight < asize) { ++ spa_dbgmsg(spa, "%s: failed to meet weight " ++ "requirement: vdev %llu, txg %llu, mg %p, " ++ "msp %p, psize %llu, asize %llu, " ++ "failures %llu, weight %llu", ++ spa_name(spa), mg->mg_vd->vdev_id, txg, ++ mg, msp, psize, asize, ++ mg->mg_alloc_failures, msp->ms_weight); ++ mutex_exit(&mg->mg_lock); ++ return (-1ULL); ++ } ++ was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; ++ if (activation_weight == METASLAB_WEIGHT_PRIMARY) ++ break; ++ ++ target_distance = min_distance + ++ (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); ++ ++ for (i = 0; i < d; i++) ++ if (metaslab_distance(msp, &dva[i]) < ++ target_distance) ++ break; ++ if (i == d) ++ break; ++ } ++ mutex_exit(&mg->mg_lock); ++ if (msp == NULL) ++ return (-1ULL); ++ ++ /* ++ * If we've already reached the allowable number of failed ++ * allocation attempts on this metaslab group then we ++ * consider skipping it. We skip it only if we're allowed ++ * to "fast" gang, the physical size is larger than ++ * a gang block, and we're attempting to allocate from ++ * the primary metaslab. ++ */ ++ if (mg->mg_alloc_failures > zfs_mg_alloc_failures && ++ CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && ++ activation_weight == METASLAB_WEIGHT_PRIMARY) { ++ spa_dbgmsg(spa, "%s: skipping metaslab group: " ++ "vdev %llu, txg %llu, mg %p, psize %llu, " ++ "asize %llu, failures %llu", spa_name(spa), ++ mg->mg_vd->vdev_id, txg, mg, psize, asize, ++ mg->mg_alloc_failures); ++ return (-1ULL); ++ } ++ ++ mutex_enter(&msp->ms_lock); ++ ++ /* ++ * Ensure that the metaslab we have selected is still ++ * capable of handling our request. It's possible that ++ * another thread may have changed the weight while we ++ * were blocked on the metaslab lock. 
++ */ ++ if (msp->ms_weight < asize || (was_active && ++ !(msp->ms_weight & METASLAB_ACTIVE_MASK) && ++ activation_weight == METASLAB_WEIGHT_PRIMARY)) { ++ mutex_exit(&msp->ms_lock); ++ continue; ++ } ++ ++ if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && ++ activation_weight == METASLAB_WEIGHT_PRIMARY) { ++ metaslab_passivate(msp, ++ msp->ms_weight & ~METASLAB_ACTIVE_MASK); ++ mutex_exit(&msp->ms_lock); ++ continue; ++ } ++ ++ if (metaslab_activate(msp, activation_weight) != 0) { ++ mutex_exit(&msp->ms_lock); ++ continue; ++ } ++ ++ if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) ++ break; ++ ++ atomic_inc_64(&mg->mg_alloc_failures); ++ ++ metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); ++ ++ mutex_exit(&msp->ms_lock); ++ } ++ ++ if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) ++ vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); ++ ++ space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); ++ ++ mutex_exit(&msp->ms_lock); ++ ++ return (offset); ++} ++ ++/* ++ * Allocate a block for the specified i/o. ++ */ ++static int ++metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, ++ dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) ++{ ++ metaslab_group_t *mg, *fast_mg, *rotor; ++ vdev_t *vd; ++ int dshift = 3; ++ int all_zero; ++ int zio_lock = B_FALSE; ++ boolean_t allocatable; ++ uint64_t offset = -1ULL; ++ uint64_t asize; ++ uint64_t distance; ++ ++ ASSERT(!DVA_IS_VALID(&dva[d])); ++ ++ /* ++ * For testing, make some blocks above a certain size be gang blocks. ++ */ ++ if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) ++ return (ENOSPC); ++ ++ if (flags & METASLAB_FASTWRITE) ++ mutex_enter(&mc->mc_fastwrite_lock); ++ ++ /* ++ * Start at the rotor and loop through all mgs until we find something. ++ * Note that there's no locking on mc_rotor or mc_aliquot because ++ * nothing actually breaks if we miss a few updates -- we just won't ++ * allocate quite as evenly. It all balances out over time. ++ * ++ * If we are doing ditto or log blocks, try to spread them across ++ * consecutive vdevs. If we're forced to reuse a vdev before we've ++ * allocated all of our ditto blocks, then try and spread them out on ++ * that vdev as much as possible. If it turns out to not be possible, ++ * gradually lower our standards until anything becomes acceptable. ++ * Also, allocating on consecutive vdevs (as opposed to random vdevs) ++ * gives us hope of containing our fault domains to something we're ++ * able to reason about. Otherwise, any two top-level vdev failures ++ * will guarantee the loss of data. With consecutive allocation, ++ * only two adjacent top-level vdev failures will result in data loss. ++ * ++ * If we are doing gang blocks (hintdva is non-NULL), try to keep ++ * ourselves on the same vdev as our gang block header. That ++ * way, we can hope for locality in vdev_cache, plus it makes our ++ * fault domains something tractable. ++ */ ++ if (hintdva) { ++ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); ++ ++ /* ++ * It's possible the vdev we're using as the hint no ++ * longer exists (i.e. removed). Consult the rotor when ++ * all else fails. 
++ */ ++ if (vd != NULL) { ++ mg = vd->vdev_mg; ++ ++ if (flags & METASLAB_HINTBP_AVOID && ++ mg->mg_next != NULL) ++ mg = mg->mg_next; ++ } else { ++ mg = mc->mc_rotor; ++ } ++ } else if (d != 0) { ++ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); ++ mg = vd->vdev_mg->mg_next; ++ } else if (flags & METASLAB_FASTWRITE) { ++ mg = fast_mg = mc->mc_rotor; ++ ++ do { ++ if (fast_mg->mg_vd->vdev_pending_fastwrite < ++ mg->mg_vd->vdev_pending_fastwrite) ++ mg = fast_mg; ++ } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); ++ ++ } else { ++ mg = mc->mc_rotor; ++ } ++ ++ /* ++ * If the hint put us into the wrong metaslab class, or into a ++ * metaslab group that has been passivated, just follow the rotor. ++ */ ++ if (mg->mg_class != mc || mg->mg_activation_count <= 0) ++ mg = mc->mc_rotor; ++ ++ rotor = mg; ++top: ++ all_zero = B_TRUE; ++ do { ++ ASSERT(mg->mg_activation_count == 1); ++ ++ vd = mg->mg_vd; ++ ++ /* ++ * Don't allocate from faulted devices. ++ */ ++ if (zio_lock) { ++ spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); ++ allocatable = vdev_allocatable(vd); ++ spa_config_exit(spa, SCL_ZIO, FTAG); ++ } else { ++ allocatable = vdev_allocatable(vd); ++ } ++ if (!allocatable) ++ goto next; ++ ++ /* ++ * Avoid writing single-copy data to a failing vdev ++ */ ++ if ((vd->vdev_stat.vs_write_errors > 0 || ++ vd->vdev_state < VDEV_STATE_HEALTHY) && ++ d == 0 && dshift == 3) { ++ all_zero = B_FALSE; ++ goto next; ++ } ++ ++ ASSERT(mg->mg_class == mc); ++ ++ distance = vd->vdev_asize >> dshift; ++ if (distance <= (1ULL << vd->vdev_ms_shift)) ++ distance = 0; ++ else ++ all_zero = B_FALSE; ++ ++ asize = vdev_psize_to_asize(vd, psize); ++ ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); ++ ++ offset = metaslab_group_alloc(mg, psize, asize, txg, distance, ++ dva, d, flags); ++ if (offset != -1ULL) { ++ /* ++ * If we've just selected this metaslab group, ++ * figure out whether the corresponding vdev is ++ * over- or under-used relative to the pool, ++ * and set an allocation bias to even it out. ++ */ ++ if (mc->mc_aliquot == 0) { ++ vdev_stat_t *vs = &vd->vdev_stat; ++ int64_t vu, cu; ++ ++ vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); ++ cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); ++ ++ /* ++ * Calculate how much more or less we should ++ * try to allocate from this device during ++ * this iteration around the rotor. ++ * For example, if a device is 80% full ++ * and the pool is 20% full then we should ++ * reduce allocations by 60% on this device. ++ * ++ * mg_bias = (20 - 80) * 512K / 100 = -307K ++ * ++ * This reduces allocations by 307K for this ++ * iteration. 
++ */ ++ mg->mg_bias = ((cu - vu) * ++ (int64_t)mg->mg_aliquot) / 100; ++ } ++ ++ if ((flags & METASLAB_FASTWRITE) || ++ atomic_add_64_nv(&mc->mc_aliquot, asize) >= ++ mg->mg_aliquot + mg->mg_bias) { ++ mc->mc_rotor = mg->mg_next; ++ mc->mc_aliquot = 0; ++ } ++ ++ DVA_SET_VDEV(&dva[d], vd->vdev_id); ++ DVA_SET_OFFSET(&dva[d], offset); ++ DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); ++ DVA_SET_ASIZE(&dva[d], asize); ++ ++ if (flags & METASLAB_FASTWRITE) { ++ atomic_add_64(&vd->vdev_pending_fastwrite, ++ psize); ++ mutex_exit(&mc->mc_fastwrite_lock); ++ } ++ ++ return (0); ++ } ++next: ++ mc->mc_rotor = mg->mg_next; ++ mc->mc_aliquot = 0; ++ } while ((mg = mg->mg_next) != rotor); ++ ++ if (!all_zero) { ++ dshift++; ++ ASSERT(dshift < 64); ++ goto top; ++ } ++ ++ if (!allocatable && !zio_lock) { ++ dshift = 3; ++ zio_lock = B_TRUE; ++ goto top; ++ } ++ ++ bzero(&dva[d], sizeof (dva_t)); ++ ++ if (flags & METASLAB_FASTWRITE) ++ mutex_exit(&mc->mc_fastwrite_lock); ++ return (ENOSPC); ++} ++ ++/* ++ * Free the block represented by DVA in the context of the specified ++ * transaction group. ++ */ ++static void ++metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) ++{ ++ uint64_t vdev = DVA_GET_VDEV(dva); ++ uint64_t offset = DVA_GET_OFFSET(dva); ++ uint64_t size = DVA_GET_ASIZE(dva); ++ vdev_t *vd; ++ metaslab_t *msp; ++ ++ ASSERT(DVA_IS_VALID(dva)); ++ ++ if (txg > spa_freeze_txg(spa)) ++ return; ++ ++ if ((vd = vdev_lookup_top(spa, vdev)) == NULL || ++ (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { ++ cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", ++ (u_longlong_t)vdev, (u_longlong_t)offset); ++ ASSERT(0); ++ return; ++ } ++ ++ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ++ ++ if (DVA_GET_GANG(dva)) ++ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); ++ ++ mutex_enter(&msp->ms_lock); ++ ++ if (now) { ++ space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], ++ offset, size); ++ space_map_free(&msp->ms_map, offset, size); ++ } else { ++ if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) ++ vdev_dirty(vd, VDD_METASLAB, msp, txg); ++ space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); ++ } ++ ++ mutex_exit(&msp->ms_lock); ++} ++ ++/* ++ * Intent log support: upon opening the pool after a crash, notify the SPA ++ * of blocks that the intent log has allocated for immediate write, but ++ * which are still considered free by the SPA because the last transaction ++ * group didn't commit yet. 
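++ *
++ * A minimal sketch of the expected call during pool open (the txg value
++ * shown is illustrative; passing zero instead requests a dry run):
++ *
++ *	error = metaslab_claim(spa, bp, spa_first_txg(spa));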
++ */ ++static int ++metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) ++{ ++ uint64_t vdev = DVA_GET_VDEV(dva); ++ uint64_t offset = DVA_GET_OFFSET(dva); ++ uint64_t size = DVA_GET_ASIZE(dva); ++ vdev_t *vd; ++ metaslab_t *msp; ++ int error = 0; ++ ++ ASSERT(DVA_IS_VALID(dva)); ++ ++ if ((vd = vdev_lookup_top(spa, vdev)) == NULL || ++ (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) ++ return (ENXIO); ++ ++ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ++ ++ if (DVA_GET_GANG(dva)) ++ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); ++ ++ mutex_enter(&msp->ms_lock); ++ ++ if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) ++ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); ++ ++ if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) ++ error = ENOENT; ++ ++ if (error || txg == 0) { /* txg == 0 indicates dry run */ ++ mutex_exit(&msp->ms_lock); ++ return (error); ++ } ++ ++ space_map_claim(&msp->ms_map, offset, size); ++ ++ if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ ++ if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) ++ vdev_dirty(vd, VDD_METASLAB, msp, txg); ++ space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); ++ } ++ ++ mutex_exit(&msp->ms_lock); ++ ++ return (0); ++} ++ ++int ++metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, ++ int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) ++{ ++ dva_t *dva = bp->blk_dva; ++ dva_t *hintdva = hintbp->blk_dva; ++ int d, error = 0; ++ ++ ASSERT(bp->blk_birth == 0); ++ ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); ++ ++ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); ++ ++ if (mc->mc_rotor == NULL) { /* no vdevs in this class */ ++ spa_config_exit(spa, SCL_ALLOC, FTAG); ++ return (ENOSPC); ++ } ++ ++ ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ++ ASSERT(BP_GET_NDVAS(bp) == 0); ++ ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); ++ ++ for (d = 0; d < ndvas; d++) { ++ error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, ++ txg, flags); ++ if (error) { ++ for (d--; d >= 0; d--) { ++ metaslab_free_dva(spa, &dva[d], txg, B_TRUE); ++ bzero(&dva[d], sizeof (dva_t)); ++ } ++ spa_config_exit(spa, SCL_ALLOC, FTAG); ++ return (error); ++ } ++ } ++ ASSERT(error == 0); ++ ASSERT(BP_GET_NDVAS(bp) == ndvas); ++ ++ spa_config_exit(spa, SCL_ALLOC, FTAG); ++ ++ BP_SET_BIRTH(bp, txg, txg); ++ ++ return (0); ++} ++ ++void ++metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) ++{ ++ const dva_t *dva = bp->blk_dva; ++ int d, ndvas = BP_GET_NDVAS(bp); ++ ++ ASSERT(!BP_IS_HOLE(bp)); ++ ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); ++ ++ spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); ++ ++ for (d = 0; d < ndvas; d++) ++ metaslab_free_dva(spa, &dva[d], txg, now); ++ ++ spa_config_exit(spa, SCL_FREE, FTAG); ++} ++ ++int ++metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) ++{ ++ const dva_t *dva = bp->blk_dva; ++ int ndvas = BP_GET_NDVAS(bp); ++ int d, error = 0; ++ ++ ASSERT(!BP_IS_HOLE(bp)); ++ ++ if (txg != 0) { ++ /* ++ * First do a dry run to make sure all DVAs are claimable, ++ * so we don't have to unwind from partial failures below. 
++ */ ++ if ((error = metaslab_claim(spa, bp, 0)) != 0) ++ return (error); ++ } ++ ++ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); ++ ++ for (d = 0; d < ndvas; d++) ++ if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) ++ break; ++ ++ spa_config_exit(spa, SCL_ALLOC, FTAG); ++ ++ ASSERT(error == 0 || txg == 0); ++ ++ return (error); ++} ++ ++void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) ++{ ++ const dva_t *dva = bp->blk_dva; ++ int ndvas = BP_GET_NDVAS(bp); ++ uint64_t psize = BP_GET_PSIZE(bp); ++ int d; ++ vdev_t *vd; ++ ++ ASSERT(!BP_IS_HOLE(bp)); ++ ASSERT(psize > 0); ++ ++ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ++ ++ for (d = 0; d < ndvas; d++) { ++ if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) ++ continue; ++ atomic_add_64(&vd->vdev_pending_fastwrite, psize); ++ } ++ ++ spa_config_exit(spa, SCL_VDEV, FTAG); ++} ++ ++void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) ++{ ++ const dva_t *dva = bp->blk_dva; ++ int ndvas = BP_GET_NDVAS(bp); ++ uint64_t psize = BP_GET_PSIZE(bp); ++ int d; ++ vdev_t *vd; ++ ++ ASSERT(!BP_IS_HOLE(bp)); ++ ASSERT(psize > 0); ++ ++ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ++ ++ for (d = 0; d < ndvas; d++) { ++ if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) ++ continue; ++ ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); ++ atomic_sub_64(&vd->vdev_pending_fastwrite, psize); ++ } ++ ++ spa_config_exit(spa, SCL_VDEV, FTAG); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/refcount.c linux-3.2.33-go/fs/zfs/zfs/refcount.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/refcount.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/refcount.c 2012-11-16 23:25:34.349039334 +0100 +@@ -0,0 +1,223 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#include ++#include ++ ++#ifdef ZFS_DEBUG ++ ++#ifdef _KERNEL ++int reference_tracking_enable = FALSE; /* runs out of memory too easily */ ++#else ++int reference_tracking_enable = TRUE; ++#endif ++int reference_history = 4; /* tunable */ ++ ++static kmem_cache_t *reference_cache; ++static kmem_cache_t *reference_history_cache; ++ ++void ++refcount_init(void) ++{ ++ reference_cache = kmem_cache_create("reference_cache", ++ sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); ++ ++ reference_history_cache = kmem_cache_create("reference_history_cache", ++ sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); ++} ++ ++void ++refcount_fini(void) ++{ ++ kmem_cache_destroy(reference_cache); ++ kmem_cache_destroy(reference_history_cache); ++} ++ ++void ++refcount_create(refcount_t *rc) ++{ ++ mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); ++ list_create(&rc->rc_list, sizeof (reference_t), ++ offsetof(reference_t, ref_link)); ++ list_create(&rc->rc_removed, sizeof (reference_t), ++ offsetof(reference_t, ref_link)); ++ rc->rc_count = 0; ++ rc->rc_removed_count = 0; ++} ++ ++void ++refcount_destroy_many(refcount_t *rc, uint64_t number) ++{ ++ reference_t *ref; ++ ++ ASSERT(rc->rc_count == number); ++ while ((ref = list_head(&rc->rc_list))) { ++ list_remove(&rc->rc_list, ref); ++ kmem_cache_free(reference_cache, ref); ++ } ++ list_destroy(&rc->rc_list); ++ ++ while ((ref = list_head(&rc->rc_removed))) { ++ list_remove(&rc->rc_removed, ref); ++ kmem_cache_free(reference_history_cache, ref->ref_removed); ++ kmem_cache_free(reference_cache, ref); ++ } ++ list_destroy(&rc->rc_removed); ++ mutex_destroy(&rc->rc_mtx); ++} ++ ++void ++refcount_destroy(refcount_t *rc) ++{ ++ refcount_destroy_many(rc, 0); ++} ++ ++int ++refcount_is_zero(refcount_t *rc) ++{ ++ ASSERT(rc->rc_count >= 0); ++ return (rc->rc_count == 0); ++} ++ ++int64_t ++refcount_count(refcount_t *rc) ++{ ++ ASSERT(rc->rc_count >= 0); ++ return (rc->rc_count); ++} ++ ++int64_t ++refcount_add_many(refcount_t *rc, uint64_t number, void *holder) ++{ ++ reference_t *ref = NULL; ++ int64_t count; ++ ++ if (reference_tracking_enable) { ++ ref = kmem_cache_alloc(reference_cache, KM_PUSHPAGE); ++ ref->ref_holder = holder; ++ ref->ref_number = number; ++ } ++ mutex_enter(&rc->rc_mtx); ++ ASSERT(rc->rc_count >= 0); ++ if (reference_tracking_enable) ++ list_insert_head(&rc->rc_list, ref); ++ rc->rc_count += number; ++ count = rc->rc_count; ++ mutex_exit(&rc->rc_mtx); ++ ++ return (count); ++} ++ ++int64_t ++refcount_add(refcount_t *rc, void *holder) ++{ ++ return (refcount_add_many(rc, 1, holder)); ++} ++ ++int64_t ++refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) ++{ ++ reference_t *ref; ++ int64_t count; ++ ++ mutex_enter(&rc->rc_mtx); ++ ASSERT(rc->rc_count >= number); ++ ++ if (!reference_tracking_enable) { ++ rc->rc_count -= number; ++ count = rc->rc_count; ++ mutex_exit(&rc->rc_mtx); ++ return (count); ++ } ++ ++ for (ref = list_head(&rc->rc_list); ref; ++ ref = list_next(&rc->rc_list, ref)) { ++ if (ref->ref_holder == holder && ref->ref_number == number) { ++ list_remove(&rc->rc_list, ref); ++ if (reference_history > 0) { ++ ref->ref_removed = ++ kmem_cache_alloc(reference_history_cache, ++ KM_PUSHPAGE); ++ list_insert_head(&rc->rc_removed, ref); ++ rc->rc_removed_count++; ++ if (rc->rc_removed_count >= reference_history) { ++ ref = list_tail(&rc->rc_removed); ++ list_remove(&rc->rc_removed, ref); ++ kmem_cache_free(reference_history_cache, ++ ref->ref_removed); ++ kmem_cache_free(reference_cache, ref); ++ 
rc->rc_removed_count--; ++ } ++ } else { ++ kmem_cache_free(reference_cache, ref); ++ } ++ rc->rc_count -= number; ++ count = rc->rc_count; ++ mutex_exit(&rc->rc_mtx); ++ return (count); ++ } ++ } ++ panic("No such hold %p on refcount %llx", holder, ++ (u_longlong_t)(uintptr_t)rc); ++ return (-1); ++} ++ ++int64_t ++refcount_remove(refcount_t *rc, void *holder) ++{ ++ return (refcount_remove_many(rc, 1, holder)); ++} ++ ++void ++refcount_transfer(refcount_t *dst, refcount_t *src) ++{ ++ int64_t count, removed_count; ++ list_t list, removed; ++ ++ list_create(&list, sizeof (reference_t), ++ offsetof(reference_t, ref_link)); ++ list_create(&removed, sizeof (reference_t), ++ offsetof(reference_t, ref_link)); ++ ++ mutex_enter(&src->rc_mtx); ++ count = src->rc_count; ++ removed_count = src->rc_removed_count; ++ src->rc_count = 0; ++ src->rc_removed_count = 0; ++ list_move_tail(&list, &src->rc_list); ++ list_move_tail(&removed, &src->rc_removed); ++ mutex_exit(&src->rc_mtx); ++ ++ mutex_enter(&dst->rc_mtx); ++ dst->rc_count += count; ++ dst->rc_removed_count += removed_count; ++ list_move_tail(&dst->rc_list, &list); ++ list_move_tail(&dst->rc_removed, &removed); ++ mutex_exit(&dst->rc_mtx); ++ ++ list_destroy(&list); ++ list_destroy(&removed); ++} ++ ++#endif /* ZFS_DEBUG */ +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/rrwlock.c linux-3.2.33-go/fs/zfs/zfs/rrwlock.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/rrwlock.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/rrwlock.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,264 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#include ++#include ++ ++/* ++ * This file contains the implementation of a re-entrant read ++ * reader/writer lock (aka "rrwlock"). ++ * ++ * This is a normal reader/writer lock with the additional feature ++ * of allowing threads who have already obtained a read lock to ++ * re-enter another read lock (re-entrant read) - even if there are ++ * waiting writers. ++ * ++ * Callers who have not obtained a read lock give waiting writers priority. ++ * ++ * The rrwlock_t lock does not allow re-entrant writers, nor does it ++ * allow a re-entrant mix of reads and writes (that is, it does not ++ * allow a caller who has already obtained a read lock to be able to ++ * then grab a write lock without first dropping all read locks, and ++ * vice versa). ++ * ++ * The rrwlock_t uses tsd (thread specific data) to keep a list of ++ * nodes (rrw_node_t), where each node keeps track of which specific ++ * lock (rrw_node_t::rn_rrl) the thread has grabbed. 
Since re-entering ++ * should be rare, a thread that grabs multiple reads on the same rrwlock_t ++ * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the ++ * tsd list can represent a different rrwlock_t. This allows a thread ++ * to enter multiple and unique rrwlock_ts for read locks at the same time. ++ * ++ * Since using tsd exposes some overhead, the rrwlock_t only needs to ++ * keep tsd data when writers are waiting. If no writers are waiting, then ++ * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd ++ * is needed. Once a writer attempts to grab the lock, readers then ++ * keep tsd data and bump the linked readers count (rr_linked_rcount). ++ * ++ * If there are waiting writers and there are anonymous readers, then a ++ * reader doesn't know if it is a re-entrant lock. But since it may be one, ++ * we allow the read to proceed (otherwise it could deadlock). Since once ++ * waiting writers are active, readers no longer bump the anonymous count, ++ * the anonymous readers will eventually flush themselves out. At this point, ++ * readers will be able to tell if they are a re-entrant lock (have a ++ * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then ++ * we must let the proceed. If they are not, then the reader blocks for the ++ * waiting writers. Hence, we do not starve writers. ++ */ ++ ++/* global key for TSD */ ++uint_t rrw_tsd_key; ++ ++typedef struct rrw_node { ++ struct rrw_node *rn_next; ++ rrwlock_t *rn_rrl; ++} rrw_node_t; ++ ++static rrw_node_t * ++rrn_find(rrwlock_t *rrl) ++{ ++ rrw_node_t *rn; ++ ++ if (refcount_count(&rrl->rr_linked_rcount) == 0) ++ return (NULL); ++ ++ for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { ++ if (rn->rn_rrl == rrl) ++ return (rn); ++ } ++ return (NULL); ++} ++ ++/* ++ * Add a node to the head of the singly linked list. ++ */ ++static void ++rrn_add(rrwlock_t *rrl) ++{ ++ rrw_node_t *rn; ++ ++ rn = kmem_alloc(sizeof (*rn), KM_SLEEP); ++ rn->rn_rrl = rrl; ++ rn->rn_next = tsd_get(rrw_tsd_key); ++ VERIFY(tsd_set(rrw_tsd_key, rn) == 0); ++} ++ ++/* ++ * If a node is found for 'rrl', then remove the node from this ++ * thread's list and return TRUE; otherwise return FALSE. 
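++ *
++ * For context, the lock these helpers support is typically used like the
++ * following sketch (FTAG as the holder tag; the second read enter from the
++ * same thread is the re-entrant case described in the comment above):
++ *
++ *	rrw_enter(&rrl, RW_READER, FTAG);
++ *	rrw_enter(&rrl, RW_READER, FTAG);	/* re-entrant read */
++ *	rrw_exit(&rrl, FTAG);
++ *	rrw_exit(&rrl, FTAG);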
++ */ ++static boolean_t ++rrn_find_and_remove(rrwlock_t *rrl) ++{ ++ rrw_node_t *rn; ++ rrw_node_t *prev = NULL; ++ ++ if (refcount_count(&rrl->rr_linked_rcount) == 0) ++ return (B_FALSE); ++ ++ for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { ++ if (rn->rn_rrl == rrl) { ++ if (prev) ++ prev->rn_next = rn->rn_next; ++ else ++ VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); ++ kmem_free(rn, sizeof (*rn)); ++ return (B_TRUE); ++ } ++ prev = rn; ++ } ++ return (B_FALSE); ++} ++ ++void ++rrw_init(rrwlock_t *rrl) ++{ ++ mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); ++ cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); ++ rrl->rr_writer = NULL; ++ refcount_create(&rrl->rr_anon_rcount); ++ refcount_create(&rrl->rr_linked_rcount); ++ rrl->rr_writer_wanted = B_FALSE; ++} ++ ++void ++rrw_destroy(rrwlock_t *rrl) ++{ ++ mutex_destroy(&rrl->rr_lock); ++ cv_destroy(&rrl->rr_cv); ++ ASSERT(rrl->rr_writer == NULL); ++ refcount_destroy(&rrl->rr_anon_rcount); ++ refcount_destroy(&rrl->rr_linked_rcount); ++} ++ ++static void ++rrw_enter_read(rrwlock_t *rrl, void *tag) ++{ ++ mutex_enter(&rrl->rr_lock); ++#if !defined(DEBUG) && defined(_KERNEL) ++ if (!rrl->rr_writer && !rrl->rr_writer_wanted) { ++ rrl->rr_anon_rcount.rc_count++; ++ mutex_exit(&rrl->rr_lock); ++ return; ++ } ++ DTRACE_PROBE(zfs__rrwfastpath__rdmiss); ++#endif ++ ASSERT(rrl->rr_writer != curthread); ++ ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); ++ ++ while (rrl->rr_writer || (rrl->rr_writer_wanted && ++ refcount_is_zero(&rrl->rr_anon_rcount) && ++ rrn_find(rrl) == NULL)) ++ cv_wait(&rrl->rr_cv, &rrl->rr_lock); ++ ++ if (rrl->rr_writer_wanted) { ++ /* may or may not be a re-entrant enter */ ++ rrn_add(rrl); ++ (void) refcount_add(&rrl->rr_linked_rcount, tag); ++ } else { ++ (void) refcount_add(&rrl->rr_anon_rcount, tag); ++ } ++ ASSERT(rrl->rr_writer == NULL); ++ mutex_exit(&rrl->rr_lock); ++} ++ ++static void ++rrw_enter_write(rrwlock_t *rrl) ++{ ++ mutex_enter(&rrl->rr_lock); ++ ASSERT(rrl->rr_writer != curthread); ++ ++ while (refcount_count(&rrl->rr_anon_rcount) > 0 || ++ refcount_count(&rrl->rr_linked_rcount) > 0 || ++ rrl->rr_writer != NULL) { ++ rrl->rr_writer_wanted = B_TRUE; ++ cv_wait(&rrl->rr_cv, &rrl->rr_lock); ++ } ++ rrl->rr_writer_wanted = B_FALSE; ++ rrl->rr_writer = curthread; ++ mutex_exit(&rrl->rr_lock); ++} ++ ++void ++rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) ++{ ++ if (rw == RW_READER) ++ rrw_enter_read(rrl, tag); ++ else ++ rrw_enter_write(rrl); ++} ++ ++void ++rrw_exit(rrwlock_t *rrl, void *tag) ++{ ++ mutex_enter(&rrl->rr_lock); ++#if !defined(DEBUG) && defined(_KERNEL) ++ if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) { ++ rrl->rr_anon_rcount.rc_count--; ++ if (rrl->rr_anon_rcount.rc_count == 0) ++ cv_broadcast(&rrl->rr_cv); ++ mutex_exit(&rrl->rr_lock); ++ return; ++ } ++ DTRACE_PROBE(zfs__rrwfastpath__exitmiss); ++#endif ++ ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || ++ !refcount_is_zero(&rrl->rr_linked_rcount) || ++ rrl->rr_writer != NULL); ++ ++ if (rrl->rr_writer == NULL) { ++ int64_t count; ++ if (rrn_find_and_remove(rrl)) ++ count = refcount_remove(&rrl->rr_linked_rcount, tag); ++ else ++ count = refcount_remove(&rrl->rr_anon_rcount, tag); ++ if (count == 0) ++ cv_broadcast(&rrl->rr_cv); ++ } else { ++ ASSERT(rrl->rr_writer == curthread); ++ ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && ++ refcount_is_zero(&rrl->rr_linked_rcount)); ++ rrl->rr_writer = NULL; ++ cv_broadcast(&rrl->rr_cv); ++ } ++ mutex_exit(&rrl->rr_lock); ++} ++ ++boolean_t 
++rrw_held(rrwlock_t *rrl, krw_t rw) ++{ ++ boolean_t held; ++ ++ mutex_enter(&rrl->rr_lock); ++ if (rw == RW_WRITER) { ++ held = (rrl->rr_writer == curthread); ++ } else { ++ held = (!refcount_is_zero(&rrl->rr_anon_rcount) || ++ !refcount_is_zero(&rrl->rr_linked_rcount)); ++ } ++ mutex_exit(&rrl->rr_lock); ++ ++ return (held); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/sa.c linux-3.2.33-go/fs/zfs/zfs/sa.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/sa.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/sa.c 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,2060 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * ZFS System attributes: ++ * ++ * A generic mechanism to allow for arbitrary attributes ++ * to be stored in a dnode. The data will be stored in the bonus buffer of ++ * the dnode and if necessary a special "spill" block will be used to handle ++ * overflow situations. The spill block will be sized to fit the data ++ * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the ++ * spill block is stored at the end of the current bonus buffer. Any ++ * attributes that would be in the way of the blkptr_t will be relocated ++ * into the spill block. ++ * ++ * Attribute registration: ++ * ++ * Stored persistently on a per dataset basis ++ * a mapping between attribute "string" names and their actual attribute ++ * numeric values, length, and byteswap function. The names are only used ++ * during registration. All attributes are known by their unique attribute ++ * id value. If an attribute can have a variable size then the value ++ * 0 will be used to indicate this. ++ * ++ * Attribute Layout: ++ * ++ * Attribute layouts are a way to compactly store multiple attributes, but ++ * without taking the overhead associated with managing each attribute ++ * individually. Since you will typically have the same set of attributes ++ * stored in the same order a single table will be used to represent that ++ * layout. The ZPL for example will usually have only about 10 different ++ * layouts (regular files, device files, symlinks, ++ * regular files + scanstamp, files/dir with extended attributes, and then ++ * you have the possibility of all of those minus ACL, because it would ++ * be kicked out into the spill block) ++ * ++ * Layouts are simply an array of the attributes and their ++ * ordering i.e. 
[0, 1, 4, 5, 2]
++ *
++ * Each distinct layout is given a unique layout number and that is what's
++ * stored in the header at the beginning of the SA data buffer.
++ *
++ * A layout only covers a single dbuf (bonus or spill). If a set of
++ * attributes is split up between the bonus buffer and a spill buffer then
++ * two different layouts will be used. This allows us to byteswap the
++ * spill without looking at the bonus buffer and keeps the on disk format of
++ * the bonus and spill buffer the same.
++ *
++ * Adding a single attribute will cause the entire set of attributes to
++ * be rewritten and could result in a new layout number being constructed
++ * as part of the rewrite if no such layout exists for the new set of
++ * attributes. The new attribute will be appended to the end of the already
++ * existing attributes.
++ *
++ * Both the attribute registration and attribute layout information are
++ * stored in normal ZAP attributes. There should be a small number of
++ * known layouts and the set of attributes is assumed to typically be quite
++ * small.
++ *
++ * The registered attributes and layout "table" information is maintained
++ * in core and a special "sa_os_t" is attached to the objset_t.
++ *
++ * A special interface is provided to allow for quickly applying
++ * a large set of attributes at once. sa_replace_all_by_template() is
++ * used to set an array of attributes. This is used by the ZPL when
++ * creating a brand new file. The template that is passed into the function
++ * specifies the attribute, size for variable length attributes, location of
++ * data and special "data locator" function if the data isn't in a contiguous
++ * location.
++ *
++ * Byteswap implications:
++ * Since the SA attributes are not entirely self describing we can't do
++ * the normal byteswap processing. The special ZAP layout attribute and
++ * attribute registration attributes define the byteswap function and the
++ * size of the attributes, unless it is variable sized.
++ * The normal ZFS byteswapping infrastructure assumes you don't need
++ * to read any objects in order to do the necessary byteswapping. Whereas
++ * SA attributes can only be properly byteswapped if the dataset is opened
++ * and the layout/attribute ZAP attributes are available. Because of this
++ * the SA attributes will be byteswapped when they are first accessed by
++ * the SA code that will read the SA data.
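++ *
++ * Once a handle is held, a consumer reads an attribute through this
++ * machinery with a single call, e.g. (sketch; the ZPL's SA_ZPL_SIZE
++ * attribute id macro and its zsb argument are assumed):
++ *
++ *	uint64_t size;
++ *	error = sa_lookup(hdl, SA_ZPL_SIZE(zsb), &size, sizeof (size));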
++ */ ++ ++typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, ++ uint16_t length, int length_idx, boolean_t, void *userp); ++ ++static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); ++static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); ++static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, ++ void *data); ++static void sa_idx_tab_rele(objset_t *os, void *arg); ++static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, ++ int buflen); ++static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, ++ sa_data_op_t action, sa_data_locator_t *locator, void *datastart, ++ uint16_t buflen, dmu_tx_t *tx); ++ ++arc_byteswap_func_t *sa_bswap_table[] = { ++ byteswap_uint64_array, ++ byteswap_uint32_array, ++ byteswap_uint16_array, ++ byteswap_uint8_array, ++ zfs_acl_byteswap, ++}; ++ ++#define SA_COPY_DATA(f, s, t, l) \ ++ { \ ++ if (f == NULL) { \ ++ if (l == 8) { \ ++ *(uint64_t *)t = *(uint64_t *)s; \ ++ } else if (l == 16) { \ ++ *(uint64_t *)t = *(uint64_t *)s; \ ++ *(uint64_t *)((uintptr_t)t + 8) = \ ++ *(uint64_t *)((uintptr_t)s + 8); \ ++ } else { \ ++ bcopy(s, t, l); \ ++ } \ ++ } else \ ++ sa_copy_data(f, s, t, l); \ ++ } ++ ++/* ++ * This table is fixed and cannot be changed. Its purpose is to ++ * allow the SA code to work with both old/new ZPL file systems. ++ * It contains the list of legacy attributes. These attributes aren't ++ * stored in the "attribute" registry zap objects, since older ZPL file systems ++ * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will ++ * use this static table. ++ */ ++sa_attr_reg_t sa_legacy_attrs[] = { ++ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, ++ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, ++ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, ++ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, ++ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, ++ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, ++ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, ++ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, ++ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, ++ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, ++ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, ++ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, ++ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, ++ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, ++ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, ++ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, ++}; ++ ++/* ++ * ZPL legacy layout ++ * This is only used for objects of type DMU_OT_ZNODE ++ */ ++sa_attr_type_t sa_legacy_zpl_layout[] = { ++ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ++}; ++ ++/* ++ * Special dummy layout used for buffers with no attributes. 
++ */ ++ ++sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; ++ ++static int sa_legacy_attr_count = 16; ++static kmem_cache_t *sa_cache = NULL; ++static kmem_cache_t *spill_cache = NULL; ++ ++/*ARGSUSED*/ ++static int ++sa_cache_constructor(void *buf, void *unused, int kmflag) ++{ ++ sa_handle_t *hdl = buf; ++ ++ hdl->sa_bonus_tab = NULL; ++ hdl->sa_spill_tab = NULL; ++ hdl->sa_os = NULL; ++ hdl->sa_userp = NULL; ++ hdl->sa_bonus = NULL; ++ hdl->sa_spill = NULL; ++ mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); ++ return (0); ++} ++ ++/*ARGSUSED*/ ++static void ++sa_cache_destructor(void *buf, void *unused) ++{ ++ sa_handle_t *hdl = buf; ++ mutex_destroy(&hdl->sa_lock); ++} ++ ++void ++sa_cache_init(void) ++{ ++ sa_cache = kmem_cache_create("sa_cache", ++ sizeof (sa_handle_t), 0, sa_cache_constructor, ++ sa_cache_destructor, NULL, NULL, NULL, 0); ++ spill_cache = kmem_cache_create("spill_cache", ++ SPA_MAXBLOCKSIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); ++} ++ ++void ++sa_cache_fini(void) ++{ ++ if (sa_cache) ++ kmem_cache_destroy(sa_cache); ++ ++ if (spill_cache) ++ kmem_cache_destroy(spill_cache); ++} ++ ++void * ++sa_spill_alloc(int flags) ++{ ++ return kmem_cache_alloc(spill_cache, flags); ++} ++ ++void ++sa_spill_free(void *obj) ++{ ++ kmem_cache_free(spill_cache, obj); ++} ++ ++static int ++layout_num_compare(const void *arg1, const void *arg2) ++{ ++ const sa_lot_t *node1 = arg1; ++ const sa_lot_t *node2 = arg2; ++ ++ if (node1->lot_num > node2->lot_num) ++ return (1); ++ else if (node1->lot_num < node2->lot_num) ++ return (-1); ++ return (0); ++} ++ ++static int ++layout_hash_compare(const void *arg1, const void *arg2) ++{ ++ const sa_lot_t *node1 = arg1; ++ const sa_lot_t *node2 = arg2; ++ ++ if (node1->lot_hash > node2->lot_hash) ++ return (1); ++ if (node1->lot_hash < node2->lot_hash) ++ return (-1); ++ if (node1->lot_instance > node2->lot_instance) ++ return (1); ++ if (node1->lot_instance < node2->lot_instance) ++ return (-1); ++ return (0); ++} ++ ++boolean_t ++sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) ++{ ++ int i; ++ ++ if (count != tbf->lot_attr_count) ++ return (1); ++ ++ for (i = 0; i != count; i++) { ++ if (attrs[i] != tbf->lot_attrs[i]) ++ return (1); ++ } ++ return (0); ++} ++ ++#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) ++ ++static uint64_t ++sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) ++{ ++ int i; ++ uint64_t crc = -1ULL; ++ ++ for (i = 0; i != attr_count; i++) ++ crc ^= SA_ATTR_HASH(attrs[i]); ++ ++ return (crc); ++} ++ ++static int ++sa_get_spill(sa_handle_t *hdl) ++{ ++ int rc; ++ if (hdl->sa_spill == NULL) { ++ if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, ++ &hdl->sa_spill)) == 0) ++ VERIFY(0 == sa_build_index(hdl, SA_SPILL)); ++ } else { ++ rc = 0; ++ } ++ ++ return (rc); ++} ++ ++/* ++ * Main attribute lookup/update function ++ * returns 0 for success or non zero for failures ++ * ++ * Operates on bulk array, first failure will abort further processing ++ */ ++int ++sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, ++ sa_data_op_t data_op, dmu_tx_t *tx) ++{ ++ sa_os_t *sa = hdl->sa_os->os_sa; ++ int i; ++ int error = 0; ++ sa_buf_type_t buftypes; ++ ++ buftypes = 0; ++ ++ ASSERT(count > 0); ++ for (i = 0; i != count; i++) { ++ ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); ++ ++ bulk[i].sa_addr = NULL; ++ /* First check the bonus buffer */ ++ ++ if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( ++ hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { ++ SA_ATTR_INFO(sa, 
hdl->sa_bonus_tab, ++ SA_GET_HDR(hdl, SA_BONUS), ++ bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); ++ if (tx && !(buftypes & SA_BONUS)) { ++ dmu_buf_will_dirty(hdl->sa_bonus, tx); ++ buftypes |= SA_BONUS; ++ } ++ } ++ if (bulk[i].sa_addr == NULL && ++ ((error = sa_get_spill(hdl)) == 0)) { ++ if (TOC_ATTR_PRESENT( ++ hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { ++ SA_ATTR_INFO(sa, hdl->sa_spill_tab, ++ SA_GET_HDR(hdl, SA_SPILL), ++ bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); ++ if (tx && !(buftypes & SA_SPILL) && ++ bulk[i].sa_size == bulk[i].sa_length) { ++ dmu_buf_will_dirty(hdl->sa_spill, tx); ++ buftypes |= SA_SPILL; ++ } ++ } ++ } ++ if (error && error != ENOENT) { ++ return ((error == ECKSUM) ? EIO : error); ++ } ++ ++ switch (data_op) { ++ case SA_LOOKUP: ++ if (bulk[i].sa_addr == NULL) ++ return (ENOENT); ++ if (bulk[i].sa_data) { ++ SA_COPY_DATA(bulk[i].sa_data_func, ++ bulk[i].sa_addr, bulk[i].sa_data, ++ bulk[i].sa_size); ++ } ++ continue; ++ ++ case SA_UPDATE: ++ /* existing rewrite of attr */ ++ if (bulk[i].sa_addr && ++ bulk[i].sa_size == bulk[i].sa_length) { ++ SA_COPY_DATA(bulk[i].sa_data_func, ++ bulk[i].sa_data, bulk[i].sa_addr, ++ bulk[i].sa_length); ++ continue; ++ } else if (bulk[i].sa_addr) { /* attr size change */ ++ error = sa_modify_attrs(hdl, bulk[i].sa_attr, ++ SA_REPLACE, bulk[i].sa_data_func, ++ bulk[i].sa_data, bulk[i].sa_length, tx); ++ } else { /* adding new attribute */ ++ error = sa_modify_attrs(hdl, bulk[i].sa_attr, ++ SA_ADD, bulk[i].sa_data_func, ++ bulk[i].sa_data, bulk[i].sa_length, tx); ++ } ++ if (error) ++ return (error); ++ break; ++ default: ++ break; ++ } ++ } ++ return (error); ++} ++ ++static sa_lot_t * ++sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, ++ uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) ++{ ++ sa_os_t *sa = os->os_sa; ++ sa_lot_t *tb, *findtb; ++ int i; ++ avl_index_t loc; ++ ++ ASSERT(MUTEX_HELD(&sa->sa_lock)); ++ tb = kmem_zalloc(sizeof (sa_lot_t), KM_PUSHPAGE); ++ tb->lot_attr_count = attr_count; ++ tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, ++ KM_PUSHPAGE); ++ bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); ++ tb->lot_num = lot_num; ++ tb->lot_hash = hash; ++ tb->lot_instance = 0; ++ ++ if (zapadd) { ++ char attr_name[8]; ++ ++ if (sa->sa_layout_attr_obj == 0) { ++ sa->sa_layout_attr_obj = zap_create(os, ++ DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx); ++ VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1, ++ &sa->sa_layout_attr_obj, tx) == 0); ++ } ++ ++ (void) snprintf(attr_name, sizeof (attr_name), ++ "%d", (int)lot_num); ++ VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, ++ attr_name, 2, attr_count, attrs, tx)); ++ } ++ ++ list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t), ++ offsetof(sa_idx_tab_t, sa_next)); ++ ++ for (i = 0; i != attr_count; i++) { ++ if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) ++ tb->lot_var_sizes++; ++ } ++ ++ avl_add(&sa->sa_layout_num_tree, tb); ++ ++ /* verify we don't have a hash collision */ ++ if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) { ++ for (; findtb && findtb->lot_hash == hash; ++ findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { ++ if (findtb->lot_instance != tb->lot_instance) ++ break; ++ tb->lot_instance++; ++ } ++ } ++ avl_add(&sa->sa_layout_hash_tree, tb); ++ return (tb); ++} ++ ++static void ++sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, ++ int count, dmu_tx_t *tx, sa_lot_t **lot) ++{ ++ sa_lot_t *tb, tbsearch; ++ 
avl_index_t loc; ++ sa_os_t *sa = os->os_sa; ++ boolean_t found = B_FALSE; ++ ++ mutex_enter(&sa->sa_lock); ++ tbsearch.lot_hash = hash; ++ tbsearch.lot_instance = 0; ++ tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); ++ if (tb) { ++ for (; tb && tb->lot_hash == hash; ++ tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { ++ if (sa_layout_equal(tb, attrs, count) == 0) { ++ found = B_TRUE; ++ break; ++ } ++ } ++ } ++ if (!found) { ++ tb = sa_add_layout_entry(os, attrs, count, ++ avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); ++ } ++ mutex_exit(&sa->sa_lock); ++ *lot = tb; ++} ++ ++static int ++sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) ++{ ++ int error; ++ uint32_t blocksize; ++ ++ if (size == 0) { ++ blocksize = SPA_MINBLOCKSIZE; ++ } else if (size > SPA_MAXBLOCKSIZE) { ++ ASSERT(0); ++ return (EFBIG); ++ } else { ++ blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t); ++ } ++ ++ error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); ++ ASSERT(error == 0); ++ return (error); ++} ++ ++static void ++sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) ++{ ++ if (func == NULL) { ++ bcopy(datastart, target, buflen); ++ } else { ++ boolean_t start; ++ int bytes; ++ void *dataptr; ++ void *saptr = target; ++ uint32_t length; ++ ++ start = B_TRUE; ++ bytes = 0; ++ while (bytes < buflen) { ++ func(&dataptr, &length, buflen, start, datastart); ++ bcopy(dataptr, saptr, length); ++ saptr = (void *)((caddr_t)saptr + length); ++ bytes += length; ++ start = B_FALSE; ++ } ++ } ++} ++ ++/* ++ * Determine several different sizes ++ * first the sa header size ++ * the number of bytes to be stored ++ * if spill would occur the index in the attribute array is returned ++ * ++ * the boolean will_spill will be set when spilling is necessary. It ++ * is only set when the buftype is SA_BONUS ++ */ ++static int ++sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, ++ dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total, ++ boolean_t *will_spill) ++{ ++ int var_size = 0; ++ int i; ++ int full_space; ++ int hdrsize; ++ boolean_t done = B_FALSE; ++ ++ if (buftype == SA_BONUS && sa->sa_force_spill) { ++ *total = 0; ++ *index = 0; ++ *will_spill = B_TRUE; ++ return (0); ++ } ++ ++ *index = -1; ++ *total = 0; ++ ++ if (buftype == SA_BONUS) ++ *will_spill = B_FALSE; ++ ++ hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : ++ sizeof (sa_hdr_phys_t); ++ ++ full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size; ++ ++ for (i = 0; i != attr_count; i++) { ++ boolean_t is_var_sz; ++ ++ *total += attr_desc[i].sa_length; ++ if (done) ++ goto next; ++ ++ is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); ++ if (is_var_sz) { ++ var_size++; ++ } ++ ++ if (is_var_sz && var_size > 1) { ++ if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + ++ *total < full_space) { ++ hdrsize += sizeof (uint16_t); ++ } else { ++ done = B_TRUE; ++ *index = i; ++ if (buftype == SA_BONUS) ++ *will_spill = B_TRUE; ++ continue; ++ } ++ } ++ ++ /* ++ * find index of where spill *could* occur. ++ * Then continue to count of remainder attribute ++ * space. The sum is used later for sizing bonus ++ * and spill buffer. 
++ */ ++ if (buftype == SA_BONUS && *index == -1 && ++ (*total + P2ROUNDUP(hdrsize, 8)) > ++ (full_space - sizeof (blkptr_t))) { ++ *index = i; ++ done = B_TRUE; ++ } ++ ++next: ++ if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && ++ buftype == SA_BONUS) ++ *will_spill = B_TRUE; ++ } ++ ++ hdrsize = P2ROUNDUP(hdrsize, 8); ++ return (hdrsize); ++} ++ ++#define BUF_SPACE_NEEDED(total, header) (total + header) ++ ++/* ++ * Find layout that corresponds to ordering of attributes ++ * If not found a new layout number is created and added to ++ * persistent layout tables. ++ */ ++static int ++sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, ++ dmu_tx_t *tx) ++{ ++ sa_os_t *sa = hdl->sa_os->os_sa; ++ uint64_t hash; ++ sa_buf_type_t buftype; ++ sa_hdr_phys_t *sahdr; ++ void *data_start; ++ int buf_space; ++ sa_attr_type_t *attrs, *attrs_start; ++ int i, lot_count; ++ int hdrsize, spillhdrsize = 0; ++ int used; ++ dmu_object_type_t bonustype; ++ sa_lot_t *lot; ++ int len_idx; ++ int spill_used; ++ boolean_t spilling; ++ ++ dmu_buf_will_dirty(hdl->sa_bonus, tx); ++ bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); ++ ++ /* first determine bonus header size and sum of all attributes */ ++ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, ++ SA_BONUS, &i, &used, &spilling); ++ ++ if (used > SPA_MAXBLOCKSIZE) ++ return (EFBIG); ++ ++ VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ? ++ MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) : ++ used + hdrsize, tx)); ++ ++ ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || ++ bonustype == DMU_OT_SA); ++ ++ /* setup and size spill buffer when needed */ ++ if (spilling) { ++ boolean_t dummy; ++ ++ if (hdl->sa_spill == NULL) { ++ VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL, ++ &hdl->sa_spill) == 0); ++ } ++ dmu_buf_will_dirty(hdl->sa_spill, tx); ++ ++ spillhdrsize = sa_find_sizes(sa, &attr_desc[i], ++ attr_count - i, hdl->sa_spill, SA_SPILL, &i, ++ &spill_used, &dummy); ++ ++ if (spill_used > SPA_MAXBLOCKSIZE) ++ return (EFBIG); ++ ++ buf_space = hdl->sa_spill->db_size - spillhdrsize; ++ if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > ++ hdl->sa_spill->db_size) ++ VERIFY(0 == sa_resize_spill(hdl, ++ BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); ++ } ++ ++ /* setup starting pointers to lay down data */ ++ data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); ++ sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; ++ buftype = SA_BONUS; ++ ++ if (spilling) ++ buf_space = (sa->sa_force_spill) ? 
++ 0 : SA_BLKPTR_SPACE - hdrsize; ++ else ++ buf_space = hdl->sa_bonus->db_size - hdrsize; ++ ++ attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, ++ KM_PUSHPAGE); ++ lot_count = 0; ++ ++ for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { ++ uint16_t length; ++ ++ attrs[i] = attr_desc[i].sa_attr; ++ length = SA_REGISTERED_LEN(sa, attrs[i]); ++ if (length == 0) ++ length = attr_desc[i].sa_length; ++ ++ if (buf_space < length) { /* switch to spill buffer */ ++ VERIFY(bonustype == DMU_OT_SA); ++ if (buftype == SA_BONUS && !sa->sa_force_spill) { ++ sa_find_layout(hdl->sa_os, hash, attrs_start, ++ lot_count, tx, &lot); ++ SA_SET_HDR(sahdr, lot->lot_num, hdrsize); ++ } ++ ++ buftype = SA_SPILL; ++ hash = -1ULL; ++ len_idx = 0; ++ ++ sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; ++ sahdr->sa_magic = SA_MAGIC; ++ data_start = (void *)((uintptr_t)sahdr + ++ spillhdrsize); ++ attrs_start = &attrs[i]; ++ buf_space = hdl->sa_spill->db_size - spillhdrsize; ++ lot_count = 0; ++ } ++ hash ^= SA_ATTR_HASH(attrs[i]); ++ attr_desc[i].sa_addr = data_start; ++ attr_desc[i].sa_size = length; ++ SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, ++ data_start, length); ++ if (sa->sa_attr_table[attrs[i]].sa_length == 0) { ++ sahdr->sa_lengths[len_idx++] = length; ++ } ++ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + ++ length), 8); ++ buf_space -= P2ROUNDUP(length, 8); ++ lot_count++; ++ } ++ ++ sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); ++ ++ /* ++ * Verify that old znodes always have layout number 0. ++ * Must be DMU_OT_SA for arbitrary layouts ++ */ ++ VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || ++ (bonustype == DMU_OT_SA && lot->lot_num > 1)); ++ ++ if (bonustype == DMU_OT_SA) { ++ SA_SET_HDR(sahdr, lot->lot_num, ++ buftype == SA_BONUS ? hdrsize : spillhdrsize); ++ } ++ ++ kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); ++ if (hdl->sa_bonus_tab) { ++ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); ++ hdl->sa_bonus_tab = NULL; ++ } ++ if (!sa->sa_force_spill) ++ VERIFY(0 == sa_build_index(hdl, SA_BONUS)); ++ if (hdl->sa_spill) { ++ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); ++ if (!spilling) { ++ /* ++ * remove spill block that is no longer needed. 
++ */ ++ dmu_buf_rele(hdl->sa_spill, NULL); ++ hdl->sa_spill = NULL; ++ hdl->sa_spill_tab = NULL; ++ VERIFY(0 == dmu_rm_spill(hdl->sa_os, ++ sa_handle_object(hdl), tx)); ++ } else { ++ VERIFY(0 == sa_build_index(hdl, SA_SPILL)); ++ } ++ } ++ ++ return (0); ++} ++ ++static void ++sa_free_attr_table(sa_os_t *sa) ++{ ++ int i; ++ ++ if (sa->sa_attr_table == NULL) ++ return; ++ ++ for (i = 0; i != sa->sa_num_attrs; i++) { ++ if (sa->sa_attr_table[i].sa_name) ++ kmem_free(sa->sa_attr_table[i].sa_name, ++ strlen(sa->sa_attr_table[i].sa_name) + 1); ++ } ++ ++ kmem_free(sa->sa_attr_table, ++ sizeof (sa_attr_table_t) * sa->sa_num_attrs); ++ ++ sa->sa_attr_table = NULL; ++} ++ ++static int ++sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) ++{ ++ sa_os_t *sa = os->os_sa; ++ uint64_t sa_attr_count = 0; ++ uint64_t sa_reg_count = 0; ++ int error = 0; ++ uint64_t attr_value; ++ sa_attr_table_t *tb; ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ int registered_count = 0; ++ int i; ++ dmu_objset_type_t ostype = dmu_objset_type(os); ++ ++ sa->sa_user_table = ++ kmem_zalloc(count * sizeof (sa_attr_type_t), KM_PUSHPAGE); ++ sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); ++ ++ if (sa->sa_reg_attr_obj != 0) { ++ error = zap_count(os, sa->sa_reg_attr_obj, ++ &sa_attr_count); ++ ++ /* ++ * Make sure we retrieved a count and that it isn't zero ++ */ ++ if (error || (error == 0 && sa_attr_count == 0)) { ++ if (error == 0) ++ error = EINVAL; ++ goto bail; ++ } ++ sa_reg_count = sa_attr_count; ++ } ++ ++ if (ostype == DMU_OST_ZFS && sa_attr_count == 0) ++ sa_attr_count += sa_legacy_attr_count; ++ ++ /* Allocate attribute numbers for attributes that aren't registered */ ++ for (i = 0; i != count; i++) { ++ boolean_t found = B_FALSE; ++ int j; ++ ++ if (ostype == DMU_OST_ZFS) { ++ for (j = 0; j != sa_legacy_attr_count; j++) { ++ if (strcmp(reg_attrs[i].sa_name, ++ sa_legacy_attrs[j].sa_name) == 0) { ++ sa->sa_user_table[i] = ++ sa_legacy_attrs[j].sa_attr; ++ found = B_TRUE; ++ } ++ } ++ } ++ if (found) ++ continue; ++ ++ if (sa->sa_reg_attr_obj) ++ error = zap_lookup(os, sa->sa_reg_attr_obj, ++ reg_attrs[i].sa_name, 8, 1, &attr_value); ++ else ++ error = ENOENT; ++ switch (error) { ++ case ENOENT: ++ sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; ++ sa_attr_count++; ++ break; ++ case 0: ++ sa->sa_user_table[i] = ATTR_NUM(attr_value); ++ break; ++ default: ++ goto bail; ++ } ++ } ++ ++ sa->sa_num_attrs = sa_attr_count; ++ tb = sa->sa_attr_table = ++ kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_PUSHPAGE); ++ ++ /* ++ * Attribute table is constructed from requested attribute list, ++ * previously foreign registered attributes, and also the legacy ++ * ZPL set of attributes. 
++ */ ++ ++ if (sa->sa_reg_attr_obj) { ++ for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); ++ (error = zap_cursor_retrieve(&zc, &za)) == 0; ++ zap_cursor_advance(&zc)) { ++ uint64_t value; ++ value = za.za_first_integer; ++ ++ registered_count++; ++ tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); ++ tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); ++ tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); ++ tb[ATTR_NUM(value)].sa_registered = B_TRUE; ++ ++ if (tb[ATTR_NUM(value)].sa_name) { ++ continue; ++ } ++ tb[ATTR_NUM(value)].sa_name = ++ kmem_zalloc(strlen(za.za_name) +1, KM_PUSHPAGE); ++ (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, ++ strlen(za.za_name) +1); ++ } ++ zap_cursor_fini(&zc); ++ /* ++ * Make sure we processed the correct number of registered ++ * attributes ++ */ ++ if (registered_count != sa_reg_count) { ++ ASSERT(error != 0); ++ goto bail; ++ } ++ ++ } ++ ++ if (ostype == DMU_OST_ZFS) { ++ for (i = 0; i != sa_legacy_attr_count; i++) { ++ if (tb[i].sa_name) ++ continue; ++ tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; ++ tb[i].sa_length = sa_legacy_attrs[i].sa_length; ++ tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; ++ tb[i].sa_registered = B_FALSE; ++ tb[i].sa_name = ++ kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, ++ KM_PUSHPAGE); ++ (void) strlcpy(tb[i].sa_name, ++ sa_legacy_attrs[i].sa_name, ++ strlen(sa_legacy_attrs[i].sa_name) + 1); ++ } ++ } ++ ++ for (i = 0; i != count; i++) { ++ sa_attr_type_t attr_id; ++ ++ attr_id = sa->sa_user_table[i]; ++ if (tb[attr_id].sa_name) ++ continue; ++ ++ tb[attr_id].sa_length = reg_attrs[i].sa_length; ++ tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; ++ tb[attr_id].sa_attr = attr_id; ++ tb[attr_id].sa_name = ++ kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_PUSHPAGE); ++ (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, ++ strlen(reg_attrs[i].sa_name) + 1); ++ } ++ ++ sa->sa_need_attr_registration = ++ (sa_attr_count != registered_count); ++ ++ return (0); ++bail: ++ kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); ++ sa->sa_user_table = NULL; ++ sa_free_attr_table(sa); ++ return ((error != 0) ? 
error : EINVAL); ++} ++ ++int ++sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, ++ sa_attr_type_t **user_table) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ sa_os_t *sa; ++ dmu_objset_type_t ostype = dmu_objset_type(os); ++ sa_attr_type_t *tb; ++ int error; ++ ++ mutex_enter(&os->os_lock); ++ if (os->os_sa) { ++ mutex_enter(&os->os_sa->sa_lock); ++ mutex_exit(&os->os_lock); ++ tb = os->os_sa->sa_user_table; ++ mutex_exit(&os->os_sa->sa_lock); ++ *user_table = tb; ++ return (0); ++ } ++ ++ sa = kmem_zalloc(sizeof (sa_os_t), KM_PUSHPAGE); ++ mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL); ++ sa->sa_master_obj = sa_obj; ++ ++ os->os_sa = sa; ++ mutex_enter(&sa->sa_lock); ++ mutex_exit(&os->os_lock); ++ avl_create(&sa->sa_layout_num_tree, layout_num_compare, ++ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); ++ avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, ++ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); ++ ++ if (sa_obj) { ++ error = zap_lookup(os, sa_obj, SA_LAYOUTS, ++ 8, 1, &sa->sa_layout_attr_obj); ++ if (error != 0 && error != ENOENT) ++ goto fail; ++ error = zap_lookup(os, sa_obj, SA_REGISTRY, ++ 8, 1, &sa->sa_reg_attr_obj); ++ if (error != 0 && error != ENOENT) ++ goto fail; ++ } ++ ++ if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) ++ goto fail; ++ ++ if (sa->sa_layout_attr_obj != 0) { ++ uint64_t layout_count; ++ ++ error = zap_count(os, sa->sa_layout_attr_obj, ++ &layout_count); ++ ++ /* ++ * Layout number count should be > 0 ++ */ ++ if (error || (error == 0 && layout_count == 0)) { ++ if (error == 0) ++ error = EINVAL; ++ goto fail; ++ } ++ ++ for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); ++ (error = zap_cursor_retrieve(&zc, &za)) == 0; ++ zap_cursor_advance(&zc)) { ++ sa_attr_type_t *lot_attrs; ++ uint64_t lot_num; ++ ++ lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * ++ za.za_num_integers, KM_PUSHPAGE); ++ ++ if ((error = (zap_lookup(os, sa->sa_layout_attr_obj, ++ za.za_name, 2, za.za_num_integers, ++ lot_attrs))) != 0) { ++ kmem_free(lot_attrs, sizeof (sa_attr_type_t) * ++ za.za_num_integers); ++ break; ++ } ++ VERIFY(ddi_strtoull(za.za_name, NULL, 10, ++ (unsigned long long *)&lot_num) == 0); ++ ++ (void) sa_add_layout_entry(os, lot_attrs, ++ za.za_num_integers, lot_num, ++ sa_layout_info_hash(lot_attrs, ++ za.za_num_integers), B_FALSE, NULL); ++ kmem_free(lot_attrs, sizeof (sa_attr_type_t) * ++ za.za_num_integers); ++ } ++ zap_cursor_fini(&zc); ++ ++ /* ++ * Make sure layout count matches number of entries added ++ * to AVL tree ++ */ ++ if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { ++ ASSERT(error != 0); ++ goto fail; ++ } ++ } ++ ++ /* Add special layout number for old ZNODES */ ++ if (ostype == DMU_OST_ZFS) { ++ (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, ++ sa_legacy_attr_count, 0, ++ sa_layout_info_hash(sa_legacy_zpl_layout, ++ sa_legacy_attr_count), B_FALSE, NULL); ++ ++ (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, ++ 0, B_FALSE, NULL); ++ } ++ *user_table = os->os_sa->sa_user_table; ++ mutex_exit(&sa->sa_lock); ++ return (0); ++fail: ++ os->os_sa = NULL; ++ sa_free_attr_table(sa); ++ if (sa->sa_user_table) ++ kmem_free(sa->sa_user_table, sa->sa_user_table_sz); ++ mutex_exit(&sa->sa_lock); ++ kmem_free(sa, sizeof (sa_os_t)); ++ return ((error == ECKSUM) ? 
EIO : error); ++} ++ ++void ++sa_tear_down(objset_t *os) ++{ ++ sa_os_t *sa = os->os_sa; ++ sa_lot_t *layout; ++ void *cookie; ++ ++ kmem_free(sa->sa_user_table, sa->sa_user_table_sz); ++ ++ /* Free up attr table */ ++ ++ sa_free_attr_table(sa); ++ ++ cookie = NULL; ++ while ((layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))){ ++ sa_idx_tab_t *tab; ++ while ((tab = list_head(&layout->lot_idx_tab))) { ++ ASSERT(refcount_count(&tab->sa_refcount)); ++ sa_idx_tab_rele(os, tab); ++ } ++ } ++ ++ cookie = NULL; ++ while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))){ ++ kmem_free(layout->lot_attrs, ++ sizeof (sa_attr_type_t) * layout->lot_attr_count); ++ kmem_free(layout, sizeof (sa_lot_t)); ++ } ++ ++ avl_destroy(&sa->sa_layout_hash_tree); ++ avl_destroy(&sa->sa_layout_num_tree); ++ ++ kmem_free(sa, sizeof (sa_os_t)); ++ os->os_sa = NULL; ++} ++ ++void ++sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, ++ uint16_t length, int length_idx, boolean_t var_length, void *userp) ++{ ++ sa_idx_tab_t *idx_tab = userp; ++ ++ if (var_length) { ++ ASSERT(idx_tab->sa_variable_lengths); ++ idx_tab->sa_variable_lengths[length_idx] = length; ++ } ++ TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, ++ (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); ++} ++ ++static void ++sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, ++ sa_iterfunc_t func, sa_lot_t *tab, void *userp) ++{ ++ void *data_start; ++ sa_lot_t *tb = tab; ++ sa_lot_t search; ++ avl_index_t loc; ++ sa_os_t *sa = os->os_sa; ++ int i; ++ uint16_t *length_start = NULL; ++ uint8_t length_idx = 0; ++ ++ if (tab == NULL) { ++ search.lot_num = SA_LAYOUT_NUM(hdr, type); ++ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); ++ ASSERT(tb); ++ } ++ ++ if (IS_SA_BONUSTYPE(type)) { ++ data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + ++ offsetof(sa_hdr_phys_t, sa_lengths) + ++ (sizeof (uint16_t) * tb->lot_var_sizes)), 8); ++ length_start = hdr->sa_lengths; ++ } else { ++ data_start = hdr; ++ } ++ ++ for (i = 0; i != tb->lot_attr_count; i++) { ++ int attr_length, reg_length; ++ uint8_t idx_len; ++ ++ reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; ++ if (reg_length) { ++ attr_length = reg_length; ++ idx_len = 0; ++ } else { ++ attr_length = length_start[length_idx]; ++ idx_len = length_idx++; ++ } ++ ++ func(hdr, data_start, tb->lot_attrs[i], attr_length, ++ idx_len, reg_length == 0 ? 
B_TRUE : B_FALSE, userp); ++ ++ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + ++ attr_length), 8); ++ } ++} ++ ++/*ARGSUSED*/ ++void ++sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, ++ uint16_t length, int length_idx, boolean_t variable_length, void *userp) ++{ ++ sa_handle_t *hdl = userp; ++ sa_os_t *sa = hdl->sa_os->os_sa; ++ ++ sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); ++} ++ ++void ++sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) ++{ ++ sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); ++ dmu_buf_impl_t *db; ++ int num_lengths = 1; ++ int i; ++ ASSERTV(sa_os_t *sa = hdl->sa_os->os_sa); ++ ++ ASSERT(MUTEX_HELD(&sa->sa_lock)); ++ if (sa_hdr_phys->sa_magic == SA_MAGIC) ++ return; ++ ++ db = SA_GET_DB(hdl, buftype); ++ ++ if (buftype == SA_SPILL) { ++ arc_release(db->db_buf, NULL); ++ arc_buf_thaw(db->db_buf); ++ } ++ ++ sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic); ++ sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); ++ ++ /* ++ * Determine number of variable lenghts in header ++ * The standard 8 byte header has one for free and a ++ * 16 byte header would have 4 + 1; ++ */ ++ if (SA_HDR_SIZE(sa_hdr_phys) > 8) ++ num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1; ++ for (i = 0; i != num_lengths; i++) ++ sa_hdr_phys->sa_lengths[i] = ++ BSWAP_16(sa_hdr_phys->sa_lengths[i]); ++ ++ sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA, ++ sa_byteswap_cb, NULL, hdl); ++ ++ if (buftype == SA_SPILL) ++ arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf); ++} ++ ++static int ++sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) ++{ ++ sa_hdr_phys_t *sa_hdr_phys; ++ dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype); ++ dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db); ++ sa_os_t *sa = hdl->sa_os->os_sa; ++ sa_idx_tab_t *idx_tab; ++ ++ sa_hdr_phys = SA_GET_HDR(hdl, buftype); ++ ++ mutex_enter(&sa->sa_lock); ++ ++ /* Do we need to byteswap? 
*/ ++ ++ /* only check if not old znode */ ++ if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && ++ sa_hdr_phys->sa_magic != 0) { ++ VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC); ++ sa_byteswap(hdl, buftype); ++ } ++ ++ idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); ++ ++ if (buftype == SA_BONUS) ++ hdl->sa_bonus_tab = idx_tab; ++ else ++ hdl->sa_spill_tab = idx_tab; ++ ++ mutex_exit(&sa->sa_lock); ++ return (0); ++} ++ ++/*ARGSUSED*/ ++void ++sa_evict(dmu_buf_t *db, void *sap) ++{ ++ panic("evicting sa dbuf %p\n", (void *)db); ++} ++ ++static void ++sa_idx_tab_rele(objset_t *os, void *arg) ++{ ++ sa_os_t *sa = os->os_sa; ++ sa_idx_tab_t *idx_tab = arg; ++ ++ if (idx_tab == NULL) ++ return; ++ ++ mutex_enter(&sa->sa_lock); ++ if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { ++ list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); ++ if (idx_tab->sa_variable_lengths) ++ kmem_free(idx_tab->sa_variable_lengths, ++ sizeof (uint16_t) * ++ idx_tab->sa_layout->lot_var_sizes); ++ refcount_destroy(&idx_tab->sa_refcount); ++ kmem_free(idx_tab->sa_idx_tab, ++ sizeof (uint32_t) * sa->sa_num_attrs); ++ kmem_free(idx_tab, sizeof (sa_idx_tab_t)); ++ } ++ mutex_exit(&sa->sa_lock); ++} ++ ++static void ++sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) ++{ ++ ASSERTV(sa_os_t *sa = os->os_sa); ++ ++ ASSERT(MUTEX_HELD(&sa->sa_lock)); ++ (void) refcount_add(&idx_tab->sa_refcount, NULL); ++} ++ ++void ++sa_spill_rele(sa_handle_t *hdl) ++{ ++ mutex_enter(&hdl->sa_lock); ++ if (hdl->sa_spill) { ++ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); ++ dmu_buf_rele(hdl->sa_spill, NULL); ++ hdl->sa_spill = NULL; ++ hdl->sa_spill_tab = NULL; ++ } ++ mutex_exit(&hdl->sa_lock); ++} ++ ++void ++sa_handle_destroy(sa_handle_t *hdl) ++{ ++ mutex_enter(&hdl->sa_lock); ++ (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl, ++ NULL, NULL, NULL); ++ ++ if (hdl->sa_bonus_tab) { ++ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); ++ hdl->sa_bonus_tab = NULL; ++ } ++ if (hdl->sa_spill_tab) { ++ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); ++ hdl->sa_spill_tab = NULL; ++ } ++ ++ dmu_buf_rele(hdl->sa_bonus, NULL); ++ ++ if (hdl->sa_spill) ++ dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); ++ mutex_exit(&hdl->sa_lock); ++ ++ kmem_cache_free(sa_cache, hdl); ++} ++ ++int ++sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, ++ sa_handle_type_t hdl_type, sa_handle_t **handlepp) ++{ ++ int error = 0; ++ sa_handle_t *handle; ++#ifdef ZFS_DEBUG ++ dmu_object_info_t doi; ++ ++ dmu_object_info_from_db(db, &doi); ++ ASSERT(doi.doi_bonus_type == DMU_OT_SA || ++ doi.doi_bonus_type == DMU_OT_ZNODE); ++#endif ++ /* find handle, if it exists */ ++ /* if one doesn't exist then create a new one, and initialize it */ ++ ++ handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL; ++ if (handle == NULL) { ++ sa_handle_t *newhandle; ++ handle = kmem_cache_alloc(sa_cache, KM_SLEEP); ++ handle->sa_userp = userp; ++ handle->sa_bonus = db; ++ handle->sa_os = os; ++ handle->sa_spill = NULL; ++ ++ error = sa_build_index(handle, SA_BONUS); ++ newhandle = (hdl_type == SA_HDL_SHARED) ? 
++ dmu_buf_set_user_ie(db, handle, ++ NULL, sa_evict) : NULL; ++ ++ if (newhandle != NULL) { ++ kmem_cache_free(sa_cache, handle); ++ handle = newhandle; ++ } ++ } ++ *handlepp = handle; ++ ++ return (error); ++} ++ ++int ++sa_handle_get(objset_t *objset, uint64_t objid, void *userp, ++ sa_handle_type_t hdl_type, sa_handle_t **handlepp) ++{ ++ dmu_buf_t *db; ++ int error; ++ ++ if ((error = dmu_bonus_hold(objset, objid, NULL, &db))) ++ return (error); ++ ++ return (sa_handle_get_from_db(objset, db, userp, hdl_type, ++ handlepp)); ++} ++ ++int ++sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) ++{ ++ return (dmu_bonus_hold(objset, obj_num, tag, db)); ++} ++ ++void ++sa_buf_rele(dmu_buf_t *db, void *tag) ++{ ++ dmu_buf_rele(db, tag); ++} ++ ++int ++sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) ++{ ++ ASSERT(hdl); ++ ASSERT(MUTEX_HELD(&hdl->sa_lock)); ++ return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL)); ++} ++ ++int ++sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) ++{ ++ int error; ++ sa_bulk_attr_t bulk; ++ ++ bulk.sa_attr = attr; ++ bulk.sa_data = buf; ++ bulk.sa_length = buflen; ++ bulk.sa_data_func = NULL; ++ ++ ASSERT(hdl); ++ mutex_enter(&hdl->sa_lock); ++ error = sa_lookup_impl(hdl, &bulk, 1); ++ mutex_exit(&hdl->sa_lock); ++ return (error); ++} ++ ++#ifdef _KERNEL ++int ++sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) ++{ ++ int error; ++ sa_bulk_attr_t bulk; ++ ++ bulk.sa_data = NULL; ++ bulk.sa_attr = attr; ++ bulk.sa_data_func = NULL; ++ ++ ASSERT(hdl); ++ ++ mutex_enter(&hdl->sa_lock); ++ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { ++ error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, ++ uio->uio_resid), UIO_READ, uio); ++ } ++ mutex_exit(&hdl->sa_lock); ++ return (error); ++} ++#endif ++ ++void * ++sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data) ++{ ++ sa_idx_tab_t *idx_tab; ++ sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data; ++ sa_os_t *sa = os->os_sa; ++ sa_lot_t *tb, search; ++ avl_index_t loc; ++ ++ /* ++ * Deterimine layout number. If SA node and header == 0 then ++ * force the index table to the dummy "1" empty layout. ++ * ++ * The layout number would only be zero for a newly created file ++ * that has not added any attributes yet, or with crypto enabled which ++ * doesn't write any attributes to the bonus buffer. ++ */ ++ ++ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype); ++ ++ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); ++ ++ /* Verify header size is consistent with layout information */ ++ ASSERT(tb); ++ ASSERT((IS_SA_BONUSTYPE(bonustype) && ++ SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb)) || !IS_SA_BONUSTYPE(bonustype) || ++ (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0)); ++ ++ /* ++ * See if any of the already existing TOC entries can be reused? 
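++ * A cached sa_idx_tab_t may only be shared when every variable-length
++ * attribute recorded in the header matches the lengths the table was
++ * built with; otherwise the encoded attribute offsets would be wrong.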
++ */ ++ ++ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab; ++ idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) { ++ boolean_t valid_idx = B_TRUE; ++ int i; ++ ++ if (tb->lot_var_sizes != 0 && ++ idx_tab->sa_variable_lengths != NULL) { ++ for (i = 0; i != tb->lot_var_sizes; i++) { ++ if (hdr->sa_lengths[i] != ++ idx_tab->sa_variable_lengths[i]) { ++ valid_idx = B_FALSE; ++ break; ++ } ++ } ++ } ++ if (valid_idx) { ++ sa_idx_tab_hold(os, idx_tab); ++ return (idx_tab); ++ } ++ } ++ ++ /* No such luck, create a new entry */ ++ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_PUSHPAGE); ++ idx_tab->sa_idx_tab = ++ kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_PUSHPAGE); ++ idx_tab->sa_layout = tb; ++ refcount_create(&idx_tab->sa_refcount); ++ if (tb->lot_var_sizes) ++ idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) * ++ tb->lot_var_sizes, KM_PUSHPAGE); ++ ++ sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab, ++ tb, idx_tab); ++ sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */ ++ sa_idx_tab_hold(os, idx_tab); /* one for layout */ ++ list_insert_tail(&tb->lot_idx_tab, idx_tab); ++ return (idx_tab); ++} ++ ++void ++sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len, ++ boolean_t start, void *userdata) ++{ ++ ASSERT(start); ++ ++ *dataptr = userdata; ++ *len = total_len; ++} ++ ++static void ++sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) ++{ ++ uint64_t attr_value = 0; ++ sa_os_t *sa = hdl->sa_os->os_sa; ++ sa_attr_table_t *tb = sa->sa_attr_table; ++ int i; ++ ++ mutex_enter(&sa->sa_lock); ++ ++ if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) { ++ mutex_exit(&sa->sa_lock); ++ return; ++ } ++ ++ if (sa->sa_reg_attr_obj == 0) { ++ sa->sa_reg_attr_obj = zap_create(hdl->sa_os, ++ DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx); ++ VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj, ++ SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0); ++ } ++ for (i = 0; i != sa->sa_num_attrs; i++) { ++ if (sa->sa_attr_table[i].sa_registered) ++ continue; ++ ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, ++ tb[i].sa_byteswap); ++ VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj, ++ tb[i].sa_name, 8, 1, &attr_value, tx)); ++ tb[i].sa_registered = B_TRUE; ++ } ++ sa->sa_need_attr_registration = B_FALSE; ++ mutex_exit(&sa->sa_lock); ++} ++ ++/* ++ * Replace all attributes with attributes specified in template. ++ * If dnode had a spill buffer then those attributes will be ++ * also be replaced, possibly with just an empty spill block ++ * ++ * This interface is intended to only be used for bulk adding of ++ * attributes for a new file. It will also be used by the ZPL ++ * when converting and old formatted znode to native SA support. ++ */ ++int ++sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, ++ int attr_count, dmu_tx_t *tx) ++{ ++ sa_os_t *sa = hdl->sa_os->os_sa; ++ ++ if (sa->sa_need_attr_registration) ++ sa_attr_register_sync(hdl, tx); ++ return (sa_build_layouts(hdl, attr_desc, attr_count, tx)); ++} ++ ++int ++sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, ++ int attr_count, dmu_tx_t *tx) ++{ ++ int error; ++ ++ mutex_enter(&hdl->sa_lock); ++ error = sa_replace_all_by_template_locked(hdl, attr_desc, ++ attr_count, tx); ++ mutex_exit(&hdl->sa_lock); ++ return (error); ++} ++ ++/* ++ * add/remove/replace a single attribute and then rewrite the entire set ++ * of attributes. 
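++ * The rewrite works by taking private copies of the current bonus (and,
++ * if present, spill) data, building a new bulk attribute descriptor with
++ * the requested add/remove/replace applied, and passing that descriptor
++ * to sa_build_layouts() to lay the attributes out again.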
++ */ ++static int ++sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, ++ sa_data_op_t action, sa_data_locator_t *locator, void *datastart, ++ uint16_t buflen, dmu_tx_t *tx) ++{ ++ sa_os_t *sa = hdl->sa_os->os_sa; ++ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; ++ dnode_t *dn; ++ sa_bulk_attr_t *attr_desc; ++ void *old_data[2]; ++ int bonus_attr_count = 0; ++ int bonus_data_size = 0; ++ int spill_attr_count = 0; ++ int error; ++ uint16_t length; ++ int i, j, k, length_idx; ++ sa_hdr_phys_t *hdr; ++ sa_idx_tab_t *idx_tab; ++ int attr_count; ++ int count; ++ ++ ASSERT(MUTEX_HELD(&hdl->sa_lock)); ++ ++ /* First make of copy of the old data */ ++ ++ DB_DNODE_ENTER(db); ++ dn = DB_DNODE(db); ++ if (dn->dn_bonuslen != 0) { ++ bonus_data_size = hdl->sa_bonus->db_size; ++ old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); ++ bcopy(hdl->sa_bonus->db_data, old_data[0], ++ hdl->sa_bonus->db_size); ++ bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; ++ } else { ++ old_data[0] = NULL; ++ } ++ DB_DNODE_EXIT(db); ++ ++ /* Bring spill buffer online if it isn't currently */ ++ ++ if ((error = sa_get_spill(hdl)) == 0) { ++ ASSERT3U(hdl->sa_spill->db_size, <=, SPA_MAXBLOCKSIZE); ++ old_data[1] = sa_spill_alloc(KM_SLEEP); ++ bcopy(hdl->sa_spill->db_data, old_data[1], ++ hdl->sa_spill->db_size); ++ spill_attr_count = ++ hdl->sa_spill_tab->sa_layout->lot_attr_count; ++ } else if (error && error != ENOENT) { ++ if (old_data[0]) ++ kmem_free(old_data[0], bonus_data_size); ++ return (error); ++ } else { ++ old_data[1] = NULL; ++ } ++ ++ /* build descriptor of all attributes */ ++ ++ attr_count = bonus_attr_count + spill_attr_count; ++ if (action == SA_ADD) ++ attr_count++; ++ else if (action == SA_REMOVE) ++ attr_count--; ++ ++ attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP); ++ ++ /* ++ * loop through bonus and spill buffer if it exists, and ++ * build up new attr_descriptor to reset the attributes ++ */ ++ k = j = 0; ++ count = bonus_attr_count; ++ hdr = SA_GET_HDR(hdl, SA_BONUS); ++ idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); ++ for (; k != 2; k++) { ++ /* iterate over each attribute in layout */ ++ for (i = 0, length_idx = 0; i != count; i++) { ++ sa_attr_type_t attr; ++ ++ attr = idx_tab->sa_layout->lot_attrs[i]; ++ if (attr == newattr) { ++ if (action == SA_REMOVE) { ++ j++; ++ continue; ++ } ++ ASSERT(SA_REGISTERED_LEN(sa, attr) == 0); ++ ASSERT(action == SA_REPLACE); ++ SA_ADD_BULK_ATTR(attr_desc, j, attr, ++ locator, datastart, buflen); ++ } else { ++ length = SA_REGISTERED_LEN(sa, attr); ++ if (length == 0) { ++ length = hdr->sa_lengths[length_idx++]; ++ } ++ ++ SA_ADD_BULK_ATTR(attr_desc, j, attr, ++ NULL, (void *) ++ (TOC_OFF(idx_tab->sa_idx_tab[attr]) + ++ (uintptr_t)old_data[k]), length); ++ } ++ } ++ if (k == 0 && hdl->sa_spill) { ++ hdr = SA_GET_HDR(hdl, SA_SPILL); ++ idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); ++ count = spill_attr_count; ++ } else { ++ break; ++ } ++ } ++ if (action == SA_ADD) { ++ length = SA_REGISTERED_LEN(sa, newattr); ++ if (length == 0) { ++ length = buflen; ++ } ++ SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, ++ datastart, buflen); ++ } ++ ++ error = sa_build_layouts(hdl, attr_desc, attr_count, tx); ++ ++ if (old_data[0]) ++ kmem_free(old_data[0], bonus_data_size); ++ if (old_data[1]) ++ sa_spill_free(old_data[1]); ++ kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); ++ ++ return (error); ++} ++ ++static int ++sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, ++ dmu_tx_t *tx) ++{ ++ int 
error; ++ sa_os_t *sa = hdl->sa_os->os_sa; ++ dmu_object_type_t bonustype; ++ dmu_buf_t *saved_spill; ++ ++ ASSERT(hdl); ++ ASSERT(MUTEX_HELD(&hdl->sa_lock)); ++ ++ bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); ++ saved_spill = hdl->sa_spill; ++ ++ /* sync out registration table if necessary */ ++ if (sa->sa_need_attr_registration) ++ sa_attr_register_sync(hdl, tx); ++ ++ error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx); ++ if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) ++ sa->sa_update_cb(hdl, tx); ++ ++ /* ++ * If saved_spill is NULL and current sa_spill is not NULL that ++ * means we increased the refcount of the spill buffer through ++ * sa_get_spill() or dmu_spill_hold_by_dnode(). Therefore we ++ * must release the hold before calling dmu_tx_commit() to avoid ++ * making a copy of this buffer in dbuf_sync_leaf() due to the ++ * reference count now being greater than 1. ++ */ ++ if (!saved_spill && hdl->sa_spill) { ++ if (hdl->sa_spill_tab) { ++ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); ++ hdl->sa_spill_tab = NULL; ++ } ++ ++ dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); ++ hdl->sa_spill = NULL; ++ } ++ ++ return (error); ++} ++ ++/* ++ * update or add new attribute ++ */ ++int ++sa_update(sa_handle_t *hdl, sa_attr_type_t type, ++ void *buf, uint32_t buflen, dmu_tx_t *tx) ++{ ++ int error; ++ sa_bulk_attr_t bulk; ++ ++ bulk.sa_attr = type; ++ bulk.sa_data_func = NULL; ++ bulk.sa_length = buflen; ++ bulk.sa_data = buf; ++ ++ mutex_enter(&hdl->sa_lock); ++ error = sa_bulk_update_impl(hdl, &bulk, 1, tx); ++ mutex_exit(&hdl->sa_lock); ++ return (error); ++} ++ ++int ++sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr, ++ uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx) ++{ ++ int error; ++ sa_bulk_attr_t bulk; ++ ++ bulk.sa_attr = attr; ++ bulk.sa_data = userdata; ++ bulk.sa_data_func = locator; ++ bulk.sa_length = buflen; ++ ++ mutex_enter(&hdl->sa_lock); ++ error = sa_bulk_update_impl(hdl, &bulk, 1, tx); ++ mutex_exit(&hdl->sa_lock); ++ return (error); ++} ++ ++/* ++ * Return size of an attribute ++ */ ++ ++int ++sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) ++{ ++ sa_bulk_attr_t bulk; ++ int error; ++ ++ bulk.sa_data = NULL; ++ bulk.sa_attr = attr; ++ bulk.sa_data_func = NULL; ++ ++ ASSERT(hdl); ++ mutex_enter(&hdl->sa_lock); ++ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { ++ mutex_exit(&hdl->sa_lock); ++ return (error); ++ } ++ *size = bulk.sa_size; ++ ++ mutex_exit(&hdl->sa_lock); ++ return (0); ++} ++ ++int ++sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) ++{ ++ ASSERT(hdl); ++ ASSERT(MUTEX_HELD(&hdl->sa_lock)); ++ return (sa_lookup_impl(hdl, attrs, count)); ++} ++ ++int ++sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) ++{ ++ int error; ++ ++ ASSERT(hdl); ++ mutex_enter(&hdl->sa_lock); ++ error = sa_bulk_lookup_locked(hdl, attrs, count); ++ mutex_exit(&hdl->sa_lock); ++ return (error); ++} ++ ++int ++sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) ++{ ++ int error; ++ ++ ASSERT(hdl); ++ mutex_enter(&hdl->sa_lock); ++ error = sa_bulk_update_impl(hdl, attrs, count, tx); ++ mutex_exit(&hdl->sa_lock); ++ return (error); ++} ++ ++int ++sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) ++{ ++ int error; ++ ++ mutex_enter(&hdl->sa_lock); ++ error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, ++ NULL, 0, tx); ++ mutex_exit(&hdl->sa_lock); ++ return (error); ++} ++ ++void 
++sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) ++{ ++ dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi); ++} ++ ++void ++sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) ++{ ++ dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus, ++ blksize, nblocks); ++} ++ ++void ++sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl) ++{ ++ (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus, ++ oldhdl, newhdl, NULL, sa_evict); ++ oldhdl->sa_bonus = NULL; ++} ++ ++void ++sa_set_userp(sa_handle_t *hdl, void *ptr) ++{ ++ hdl->sa_userp = ptr; ++} ++ ++dmu_buf_t * ++sa_get_db(sa_handle_t *hdl) ++{ ++ return ((dmu_buf_t *)hdl->sa_bonus); ++} ++ ++void * ++sa_get_userdata(sa_handle_t *hdl) ++{ ++ return (hdl->sa_userp); ++} ++ ++void ++sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) ++{ ++ ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); ++ os->os_sa->sa_update_cb = func; ++} ++ ++void ++sa_register_update_callback(objset_t *os, sa_update_cb_t *func) ++{ ++ ++ mutex_enter(&os->os_sa->sa_lock); ++ sa_register_update_callback_locked(os, func); ++ mutex_exit(&os->os_sa->sa_lock); ++} ++ ++uint64_t ++sa_handle_object(sa_handle_t *hdl) ++{ ++ return (hdl->sa_bonus->db_object); ++} ++ ++boolean_t ++sa_enabled(objset_t *os) ++{ ++ return (os->os_sa == NULL); ++} ++ ++int ++sa_set_sa_object(objset_t *os, uint64_t sa_object) ++{ ++ sa_os_t *sa = os->os_sa; ++ ++ if (sa->sa_master_obj) ++ return (1); ++ ++ sa->sa_master_obj = sa_object; ++ ++ return (0); ++} ++ ++int ++sa_hdrsize(void *arg) ++{ ++ sa_hdr_phys_t *hdr = arg; ++ ++ return (SA_HDR_SIZE(hdr)); ++} ++ ++void ++sa_handle_lock(sa_handle_t *hdl) ++{ ++ ASSERT(hdl); ++ mutex_enter(&hdl->sa_lock); ++} ++ ++void ++sa_handle_unlock(sa_handle_t *hdl) ++{ ++ ASSERT(hdl); ++ mutex_exit(&hdl->sa_lock); ++} ++ ++#ifdef _KERNEL ++EXPORT_SYMBOL(sa_handle_get); ++EXPORT_SYMBOL(sa_handle_get_from_db); ++EXPORT_SYMBOL(sa_handle_destroy); ++EXPORT_SYMBOL(sa_buf_hold); ++EXPORT_SYMBOL(sa_buf_rele); ++EXPORT_SYMBOL(sa_spill_rele); ++EXPORT_SYMBOL(sa_lookup); ++EXPORT_SYMBOL(sa_update); ++EXPORT_SYMBOL(sa_remove); ++EXPORT_SYMBOL(sa_bulk_lookup); ++EXPORT_SYMBOL(sa_bulk_lookup_locked); ++EXPORT_SYMBOL(sa_bulk_update); ++EXPORT_SYMBOL(sa_size); ++EXPORT_SYMBOL(sa_update_from_cb); ++EXPORT_SYMBOL(sa_object_info); ++EXPORT_SYMBOL(sa_object_size); ++EXPORT_SYMBOL(sa_update_user); ++EXPORT_SYMBOL(sa_get_userdata); ++EXPORT_SYMBOL(sa_set_userp); ++EXPORT_SYMBOL(sa_get_db); ++EXPORT_SYMBOL(sa_handle_object); ++EXPORT_SYMBOL(sa_register_update_callback); ++EXPORT_SYMBOL(sa_setup); ++EXPORT_SYMBOL(sa_replace_all_by_template); ++EXPORT_SYMBOL(sa_replace_all_by_template_locked); ++EXPORT_SYMBOL(sa_enabled); ++EXPORT_SYMBOL(sa_cache_init); ++EXPORT_SYMBOL(sa_cache_fini); ++EXPORT_SYMBOL(sa_spill_alloc); ++EXPORT_SYMBOL(sa_spill_free); ++EXPORT_SYMBOL(sa_set_sa_object); ++EXPORT_SYMBOL(sa_hdrsize); ++EXPORT_SYMBOL(sa_handle_lock); ++EXPORT_SYMBOL(sa_handle_unlock); ++EXPORT_SYMBOL(sa_lookup_uio); ++#endif /* _KERNEL */ +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/sha256.c linux-3.2.33-go/fs/zfs/zfs/sha256.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/sha256.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/sha256.c 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,127 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. 
++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#include ++#include ++#include ++ ++/* ++ * SHA-256 checksum, as specified in FIPS 180-3, available at: ++ * http://csrc.nist.gov/publications/PubsFIPS.html ++ * ++ * This is a very compact implementation of SHA-256. ++ * It is designed to be simple and portable, not to be fast. ++ */ ++ ++/* ++ * The literal definitions of Ch() and Maj() according to FIPS 180-3 are: ++ * ++ * Ch(x, y, z) (x & y) ^ (~x & z) ++ * Maj(x, y, z) (x & y) ^ (x & z) ^ (y & z) ++ * ++ * We use equivalent logical reductions here that require one less op. ++ */ ++#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) ++#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y)))) ++#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s))) ++#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22)) ++#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25)) ++#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3)) ++#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10)) ++ ++static const uint32_t SHA256_K[64] = { ++ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, ++ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, ++ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, ++ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, ++ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, ++ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, ++ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, ++ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, ++ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, ++ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, ++ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, ++ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, ++ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, ++ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, ++ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, ++ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 ++}; ++ ++static void ++SHA256Transform(uint32_t *H, const uint8_t *cp) ++{ ++ uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64]; ++ ++ for (t = 0; t < 16; t++, cp += 4) ++ W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3]; ++ ++ for (t = 16; t < 64; t++) ++ W[t] = sigma1(W[t - 2]) + W[t - 7] + ++ sigma0(W[t - 15]) + W[t - 16]; ++ ++ a = H[0]; b = H[1]; c = H[2]; d = H[3]; ++ e = H[4]; f = H[5]; g = H[6]; h = H[7]; ++ ++ for (t = 0; t < 64; t++) { ++ T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t]; ++ T2 = SIGMA0(a) + Maj(a, b, c); ++ h = g; g = f; f = e; e = d + T1; ++ d = c; c = b; b = a; a = T1 + T2; ++ } ++ ++ H[0] += a; H[1] += b; H[2] += c; H[3] += d; ++ H[4] += e; H[5] += f; H[6] += g; H[7] += h; ++} ++ ++void ++zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) ++{ ++ uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, ++ 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; ++ uint8_t pad[128]; ++ int i, padsize; ++ ++ for 
(i = 0; i < (size & ~63ULL); i += 64) ++ SHA256Transform(H, (uint8_t *)buf + i); ++ ++ for (padsize = 0; i < size; i++) ++ pad[padsize++] = *((uint8_t *)buf + i); ++ ++ for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++) ++ pad[padsize] = 0; ++ ++ for (i = 56; i >= 0; i -= 8) ++ pad[padsize++] = (size << 3) >> i; ++ ++ for (i = 0; i < padsize; i += 64) ++ SHA256Transform(H, pad + i); ++ ++ ZIO_SET_CHECKSUM(zcp, ++ (uint64_t)H[0] << 32 | H[1], ++ (uint64_t)H[2] << 32 | H[3], ++ (uint64_t)H[4] << 32 | H[5], ++ (uint64_t)H[6] << 32 | H[7]); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/spa_boot.c linux-3.2.33-go/fs/zfs/zfs/spa_boot.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/spa_boot.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/spa_boot.c 2012-11-16 23:25:34.353039289 +0100 +@@ -0,0 +1,50 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifdef _KERNEL ++ ++#include ++#include ++#include ++ ++char * ++spa_get_bootprop(char *propname) ++{ ++ char *value; ++ ++ if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), ++ DDI_PROP_DONTPASS, propname, &value) != DDI_SUCCESS) ++ return (NULL); ++ return (value); ++} ++ ++void ++spa_free_bootprop(char *value) ++{ ++ ddi_prop_free(value); ++} ++ ++#endif /* _KERNEL */ +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/spa.c linux-3.2.33-go/fs/zfs/zfs/spa.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/spa.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/spa.c 2012-11-16 23:25:34.350039322 +0100 +@@ -0,0 +1,6019 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ * Copyright (c) 2012 by Delphix. All rights reserved. 
++ */ ++ ++/* ++ * This file contains all the routines used when modifying on-disk SPA state. ++ * This includes opening, importing, destroying, exporting a pool, and syncing a ++ * pool. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef _KERNEL ++#include ++#include ++#include ++#include ++#include ++#include ++#endif /* _KERNEL */ ++ ++#include "zfs_prop.h" ++#include "zfs_comutil.h" ++ ++typedef enum zti_modes { ++ zti_mode_fixed, /* value is # of threads (min 1) */ ++ zti_mode_online_percent, /* value is % of online CPUs */ ++ zti_mode_batch, /* cpu-intensive; value is ignored */ ++ zti_mode_null, /* don't create a taskq */ ++ zti_nmodes ++} zti_modes_t; ++ ++#define ZTI_FIX(n) { zti_mode_fixed, (n) } ++#define ZTI_PCT(n) { zti_mode_online_percent, (n) } ++#define ZTI_BATCH { zti_mode_batch, 0 } ++#define ZTI_NULL { zti_mode_null, 0 } ++ ++#define ZTI_ONE ZTI_FIX(1) ++ ++typedef struct zio_taskq_info { ++ enum zti_modes zti_mode; ++ uint_t zti_value; ++} zio_taskq_info_t; ++ ++static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { ++ "iss", "iss_h", "int", "int_h" ++}; ++ ++/* ++ * Define the taskq threads for the following I/O types: ++ * NULL, READ, WRITE, FREE, CLAIM, and IOCTL ++ */ ++const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { ++ /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ ++ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, ++ { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, ++ { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(16), ZTI_FIX(5) }, ++ { ZTI_PCT(100), ZTI_NULL, ZTI_ONE, ZTI_NULL }, ++ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, ++ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, ++}; ++ ++static dsl_syncfunc_t spa_sync_props; ++static boolean_t spa_has_active_shared_spare(spa_t *spa); ++static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, ++ spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, ++ char **ereport); ++static void spa_vdev_resilver_done(spa_t *spa); ++ ++uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ ++id_t zio_taskq_psrset_bind = PS_NONE; ++boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ ++uint_t zio_taskq_basedc = 80; /* base duty cycle */ ++ ++boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ ++ ++/* ++ * This (illegal) pool name is used when temporarily importing a spa_t in order ++ * to get the vdev stats associated with the imported devices. ++ */ ++#define TRYIMPORT_NAME "$import" ++ ++/* ++ * ========================================================================== ++ * SPA properties routines ++ * ========================================================================== ++ */ ++ ++/* ++ * Add a (source=src, propname=propval) list to an nvlist. 
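++ * Each property is packed as a nested nvlist holding ZPROP_SOURCE and
++ * ZPROP_VALUE; the value is added as a string when strval is non-NULL
++ * and as a 64-bit integer otherwise.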
++ */ ++static void ++spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, ++ uint64_t intval, zprop_source_t src) ++{ ++ const char *propname = zpool_prop_to_name(prop); ++ nvlist_t *propval; ++ ++ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); ++ ++ if (strval != NULL) ++ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); ++ else ++ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); ++ ++ VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); ++ nvlist_free(propval); ++} ++ ++/* ++ * Get property values from the spa configuration. ++ */ ++static void ++spa_prop_get_config(spa_t *spa, nvlist_t **nvp) ++{ ++ vdev_t *rvd = spa->spa_root_vdev; ++ uint64_t size; ++ uint64_t alloc; ++ uint64_t space; ++ uint64_t cap, version; ++ zprop_source_t src = ZPROP_SRC_NONE; ++ spa_config_dirent_t *dp; ++ int c; ++ ++ ASSERT(MUTEX_HELD(&spa->spa_props_lock)); ++ ++ if (rvd != NULL) { ++ alloc = metaslab_class_get_alloc(spa_normal_class(spa)); ++ size = metaslab_class_get_space(spa_normal_class(spa)); ++ spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); ++ spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); ++ spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); ++ spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, ++ size - alloc, src); ++ ++ space = 0; ++ for (c = 0; c < rvd->vdev_children; c++) { ++ vdev_t *tvd = rvd->vdev_child[c]; ++ space += tvd->vdev_max_asize - tvd->vdev_asize; ++ } ++ spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space, ++ src); ++ ++ spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, ++ (spa_mode(spa) == FREAD), src); ++ ++ cap = (size == 0) ? 0 : (alloc * 100 / size); ++ spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); ++ ++ spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ++ ddt_get_pool_dedup_ratio(spa), src); ++ ++ spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, ++ rvd->vdev_state, src); ++ ++ version = spa_version(spa); ++ if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) ++ src = ZPROP_SRC_DEFAULT; ++ else ++ src = ZPROP_SRC_LOCAL; ++ spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); ++ } ++ ++ spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); ++ ++ if (spa->spa_comment != NULL) { ++ spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, ++ 0, ZPROP_SRC_LOCAL); ++ } ++ ++ if (spa->spa_root != NULL) ++ spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, ++ 0, ZPROP_SRC_LOCAL); ++ ++ if ((dp = list_head(&spa->spa_config_list)) != NULL) { ++ if (dp->scd_path == NULL) { ++ spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, ++ "none", 0, ZPROP_SRC_LOCAL); ++ } else if (strcmp(dp->scd_path, spa_config_path) != 0) { ++ spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, ++ dp->scd_path, 0, ZPROP_SRC_LOCAL); ++ } ++ } ++} ++ ++/* ++ * Get zpool property values. ++ */ ++int ++spa_prop_get(spa_t *spa, nvlist_t **nvp) ++{ ++ objset_t *mos = spa->spa_meta_objset; ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ int err; ++ ++ err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_PUSHPAGE); ++ if (err) ++ return err; ++ ++ mutex_enter(&spa->spa_props_lock); ++ ++ /* ++ * Get properties from the spa config. ++ */ ++ spa_prop_get_config(spa, nvp); ++ ++ /* If no pool property object, no more prop to get. */ ++ if (mos == NULL || spa->spa_pool_props_object == 0) { ++ mutex_exit(&spa->spa_props_lock); ++ goto out; ++ } ++ ++ /* ++ * Get properties from the MOS pool property object. 
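++ * ZAP entries with an 8-byte integer width are numeric properties (the
++ * bootfs object number is translated back into a dataset name); entries
++ * with a 1-byte width are string properties fetched via zap_lookup().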
++ */ ++ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); ++ (err = zap_cursor_retrieve(&zc, &za)) == 0; ++ zap_cursor_advance(&zc)) { ++ uint64_t intval = 0; ++ char *strval = NULL; ++ zprop_source_t src = ZPROP_SRC_DEFAULT; ++ zpool_prop_t prop; ++ ++ if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) ++ continue; ++ ++ switch (za.za_integer_length) { ++ case 8: ++ /* integer property */ ++ if (za.za_first_integer != ++ zpool_prop_default_numeric(prop)) ++ src = ZPROP_SRC_LOCAL; ++ ++ if (prop == ZPOOL_PROP_BOOTFS) { ++ dsl_pool_t *dp; ++ dsl_dataset_t *ds = NULL; ++ ++ dp = spa_get_dsl(spa); ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ if ((err = dsl_dataset_hold_obj(dp, ++ za.za_first_integer, FTAG, &ds))) { ++ rw_exit(&dp->dp_config_rwlock); ++ break; ++ } ++ ++ strval = kmem_alloc( ++ MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, ++ KM_PUSHPAGE); ++ dsl_dataset_name(ds, strval); ++ dsl_dataset_rele(ds, FTAG); ++ rw_exit(&dp->dp_config_rwlock); ++ } else { ++ strval = NULL; ++ intval = za.za_first_integer; ++ } ++ ++ spa_prop_add_list(*nvp, prop, strval, intval, src); ++ ++ if (strval != NULL) ++ kmem_free(strval, ++ MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); ++ ++ break; ++ ++ case 1: ++ /* string property */ ++ strval = kmem_alloc(za.za_num_integers, KM_PUSHPAGE); ++ err = zap_lookup(mos, spa->spa_pool_props_object, ++ za.za_name, 1, za.za_num_integers, strval); ++ if (err) { ++ kmem_free(strval, za.za_num_integers); ++ break; ++ } ++ spa_prop_add_list(*nvp, prop, strval, 0, src); ++ kmem_free(strval, za.za_num_integers); ++ break; ++ ++ default: ++ break; ++ } ++ } ++ zap_cursor_fini(&zc); ++ mutex_exit(&spa->spa_props_lock); ++out: ++ if (err && err != ENOENT) { ++ nvlist_free(*nvp); ++ *nvp = NULL; ++ return (err); ++ } ++ ++ return (0); ++} ++ ++/* ++ * Validate the given pool properties nvlist and modify the list ++ * for the property values to be set. ++ */ ++static int ++spa_prop_validate(spa_t *spa, nvlist_t *props) ++{ ++ nvpair_t *elem; ++ int error = 0, reset_bootfs = 0; ++ uint64_t objnum = 0; ++ ++ elem = NULL; ++ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { ++ zpool_prop_t prop; ++ char *propname, *strval; ++ uint64_t intval; ++ objset_t *os; ++ char *slash, *check; ++ ++ propname = nvpair_name(elem); ++ ++ if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) ++ return (EINVAL); ++ ++ switch (prop) { ++ case ZPOOL_PROP_VERSION: ++ error = nvpair_value_uint64(elem, &intval); ++ if (!error && ++ (intval < spa_version(spa) || intval > SPA_VERSION)) ++ error = EINVAL; ++ break; ++ ++ case ZPOOL_PROP_DELEGATION: ++ case ZPOOL_PROP_AUTOREPLACE: ++ case ZPOOL_PROP_LISTSNAPS: ++ case ZPOOL_PROP_AUTOEXPAND: ++ error = nvpair_value_uint64(elem, &intval); ++ if (!error && intval > 1) ++ error = EINVAL; ++ break; ++ ++ case ZPOOL_PROP_BOOTFS: ++ /* ++ * If the pool version is less than SPA_VERSION_BOOTFS, ++ * or the pool is still being created (version == 0), ++ * the bootfs property cannot be set. 
++ */ ++ if (spa_version(spa) < SPA_VERSION_BOOTFS) { ++ error = ENOTSUP; ++ break; ++ } ++ ++ /* ++ * Make sure the vdev config is bootable ++ */ ++ if (!vdev_is_bootable(spa->spa_root_vdev)) { ++ error = ENOTSUP; ++ break; ++ } ++ ++ reset_bootfs = 1; ++ ++ error = nvpair_value_string(elem, &strval); ++ ++ if (!error) { ++ uint64_t compress; ++ ++ if (strval == NULL || strval[0] == '\0') { ++ objnum = zpool_prop_default_numeric( ++ ZPOOL_PROP_BOOTFS); ++ break; ++ } ++ ++ if ((error = dmu_objset_hold(strval,FTAG,&os))) ++ break; ++ ++ /* Must be ZPL and not gzip compressed. */ ++ ++ if (dmu_objset_type(os) != DMU_OST_ZFS) { ++ error = ENOTSUP; ++ } else if ((error = dsl_prop_get_integer(strval, ++ zfs_prop_to_name(ZFS_PROP_COMPRESSION), ++ &compress, NULL)) == 0 && ++ !BOOTFS_COMPRESS_VALID(compress)) { ++ error = ENOTSUP; ++ } else { ++ objnum = dmu_objset_id(os); ++ } ++ dmu_objset_rele(os, FTAG); ++ } ++ break; ++ ++ case ZPOOL_PROP_FAILUREMODE: ++ error = nvpair_value_uint64(elem, &intval); ++ if (!error && (intval < ZIO_FAILURE_MODE_WAIT || ++ intval > ZIO_FAILURE_MODE_PANIC)) ++ error = EINVAL; ++ ++ /* ++ * This is a special case which only occurs when ++ * the pool has completely failed. This allows ++ * the user to change the in-core failmode property ++ * without syncing it out to disk (I/Os might ++ * currently be blocked). We do this by returning ++ * EIO to the caller (spa_prop_set) to trick it ++ * into thinking we encountered a property validation ++ * error. ++ */ ++ if (!error && spa_suspended(spa)) { ++ spa->spa_failmode = intval; ++ error = EIO; ++ } ++ break; ++ ++ case ZPOOL_PROP_CACHEFILE: ++ if ((error = nvpair_value_string(elem, &strval)) != 0) ++ break; ++ ++ if (strval[0] == '\0') ++ break; ++ ++ if (strcmp(strval, "none") == 0) ++ break; ++ ++ if (strval[0] != '/') { ++ error = EINVAL; ++ break; ++ } ++ ++ slash = strrchr(strval, '/'); ++ ASSERT(slash != NULL); ++ ++ if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || ++ strcmp(slash, "/..") == 0) ++ error = EINVAL; ++ break; ++ ++ case ZPOOL_PROP_COMMENT: ++ if ((error = nvpair_value_string(elem, &strval)) != 0) ++ break; ++ for (check = strval; *check != '\0'; check++) { ++ if (!isprint(*check)) { ++ error = EINVAL; ++ break; ++ } ++ check++; ++ } ++ if (strlen(strval) > ZPROP_MAX_COMMENT) ++ error = E2BIG; ++ break; ++ ++ case ZPOOL_PROP_DEDUPDITTO: ++ if (spa_version(spa) < SPA_VERSION_DEDUP) ++ error = ENOTSUP; ++ else ++ error = nvpair_value_uint64(elem, &intval); ++ if (error == 0 && ++ intval != 0 && intval < ZIO_DEDUPDITTO_MIN) ++ error = EINVAL; ++ break; ++ ++ default: ++ break; ++ } ++ ++ if (error) ++ break; ++ } ++ ++ if (!error && reset_bootfs) { ++ error = nvlist_remove(props, ++ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); ++ ++ if (!error) { ++ error = nvlist_add_uint64(props, ++ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); ++ } ++ } ++ ++ return (error); ++} ++ ++void ++spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) ++{ ++ char *cachefile; ++ spa_config_dirent_t *dp; ++ ++ if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), ++ &cachefile) != 0) ++ return; ++ ++ dp = kmem_alloc(sizeof (spa_config_dirent_t), ++ KM_PUSHPAGE); ++ ++ if (cachefile[0] == '\0') ++ dp->scd_path = spa_strdup(spa_config_path); ++ else if (strcmp(cachefile, "none") == 0) ++ dp->scd_path = NULL; ++ else ++ dp->scd_path = spa_strdup(cachefile); ++ ++ list_insert_head(&spa->spa_config_list, dp); ++ if (need_sync) ++ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); ++} 
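++
++/*
++ * A minimal usage sketch, assuming only the nvlist and zpool_prop
++ * interfaces already used in this file: a caller builds an nvlist of
++ * pool properties and hands it to spa_prop_set() below.  The cachefile,
++ * altroot and readonly properties never require a sync task; any other
++ * property sets need_sync and is written out via spa_sync_props().
++ *
++ *     nvlist_t *props;
++ *     int error;
++ *
++ *     VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
++ *     VERIFY(nvlist_add_uint64(props,
++ *         zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1) == 0);
++ *     error = spa_prop_set(spa, props);
++ *     nvlist_free(props);
++ */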
++ ++int ++spa_prop_set(spa_t *spa, nvlist_t *nvp) ++{ ++ int error; ++ nvpair_t *elem; ++ boolean_t need_sync = B_FALSE; ++ zpool_prop_t prop; ++ ++ if ((error = spa_prop_validate(spa, nvp)) != 0) ++ return (error); ++ ++ elem = NULL; ++ while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { ++ if ((prop = zpool_name_to_prop( ++ nvpair_name(elem))) == ZPROP_INVAL) ++ return (EINVAL); ++ ++ if (prop == ZPOOL_PROP_CACHEFILE || ++ prop == ZPOOL_PROP_ALTROOT || ++ prop == ZPOOL_PROP_READONLY) ++ continue; ++ ++ need_sync = B_TRUE; ++ break; ++ } ++ ++ if (need_sync) ++ return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, ++ spa, nvp, 3)); ++ else ++ return (0); ++} ++ ++/* ++ * If the bootfs property value is dsobj, clear it. ++ */ ++void ++spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) ++{ ++ if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { ++ VERIFY(zap_remove(spa->spa_meta_objset, ++ spa->spa_pool_props_object, ++ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); ++ spa->spa_bootfs = 0; ++ } ++} ++ ++/* ++ * Change the GUID for the pool. This is done so that we can later ++ * re-import a pool built from a clone of our own vdevs. We will modify ++ * the root vdev's guid, our own pool guid, and then mark all of our ++ * vdevs dirty. Note that we must make sure that all our vdevs are ++ * online when we do this, or else any vdevs that weren't present ++ * would be orphaned from our pool. We are also going to issue a ++ * sysevent to update any watchers. ++ */ ++int ++spa_change_guid(spa_t *spa) ++{ ++ uint64_t oldguid, newguid; ++ uint64_t txg; ++ ++ if (!(spa_mode_global & FWRITE)) ++ return (EROFS); ++ ++ txg = spa_vdev_enter(spa); ++ ++ if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY) ++ return (spa_vdev_exit(spa, NULL, txg, ENXIO)); ++ ++ oldguid = spa_guid(spa); ++ newguid = spa_generate_guid(NULL); ++ ASSERT3U(oldguid, !=, newguid); ++ ++ spa->spa_root_vdev->vdev_guid = newguid; ++ spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid); ++ ++ vdev_config_dirty(spa->spa_root_vdev); ++ ++ spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_REGUID); ++ ++ return (spa_vdev_exit(spa, NULL, txg, 0)); ++} ++ ++/* ++ * ========================================================================== ++ * SPA state manipulation (open/create/destroy/import/export) ++ * ========================================================================== ++ */ ++ ++static int ++spa_error_entry_compare(const void *a, const void *b) ++{ ++ spa_error_entry_t *sa = (spa_error_entry_t *)a; ++ spa_error_entry_t *sb = (spa_error_entry_t *)b; ++ int ret; ++ ++ ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, ++ sizeof (zbookmark_t)); ++ ++ if (ret < 0) ++ return (-1); ++ else if (ret > 0) ++ return (1); ++ else ++ return (0); ++} ++ ++/* ++ * Utility function which retrieves copies of the current logs and ++ * re-initializes them in the process. 
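++ * The caller must hold spa_errlist_lock; the current "last" and "scrub"
++ * AVL trees are copied into the caller-supplied structures and fresh,
++ * empty trees are created in their place.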
++ */ ++void ++spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) ++{ ++ ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); ++ ++ bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); ++ bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); ++ ++ avl_create(&spa->spa_errlist_scrub, ++ spa_error_entry_compare, sizeof (spa_error_entry_t), ++ offsetof(spa_error_entry_t, se_avl)); ++ avl_create(&spa->spa_errlist_last, ++ spa_error_entry_compare, sizeof (spa_error_entry_t), ++ offsetof(spa_error_entry_t, se_avl)); ++} ++ ++static taskq_t * ++spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, ++ uint_t value) ++{ ++ uint_t flags = TASKQ_PREPOPULATE; ++ boolean_t batch = B_FALSE; ++ ++ switch (mode) { ++ case zti_mode_null: ++ return (NULL); /* no taskq needed */ ++ ++ case zti_mode_fixed: ++ ASSERT3U(value, >=, 1); ++ value = MAX(value, 1); ++ break; ++ ++ case zti_mode_batch: ++ batch = B_TRUE; ++ flags |= TASKQ_THREADS_CPU_PCT; ++ value = zio_taskq_batch_pct; ++ break; ++ ++ case zti_mode_online_percent: ++ flags |= TASKQ_THREADS_CPU_PCT; ++ break; ++ ++ default: ++ panic("unrecognized mode for %s taskq (%u:%u) in " ++ "spa_activate()", ++ name, mode, value); ++ break; ++ } ++ ++ if (zio_taskq_sysdc && spa->spa_proc != &p0) { ++ if (batch) ++ flags |= TASKQ_DC_BATCH; ++ ++ return (taskq_create_sysdc(name, value, 50, INT_MAX, ++ spa->spa_proc, zio_taskq_basedc, flags)); ++ } ++ return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, ++ spa->spa_proc, flags)); ++} ++ ++static void ++spa_create_zio_taskqs(spa_t *spa) ++{ ++ int t, q; ++ ++ for (t = 0; t < ZIO_TYPES; t++) { ++ for (q = 0; q < ZIO_TASKQ_TYPES; q++) { ++ const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; ++ enum zti_modes mode = ztip->zti_mode; ++ uint_t value = ztip->zti_value; ++ char name[32]; ++ ++ (void) snprintf(name, sizeof (name), ++ "%s_%s", zio_type_name[t], zio_taskq_types[q]); ++ ++ spa->spa_zio_taskq[t][q] = ++ spa_taskq_create(spa, name, mode, value); ++ } ++ } ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPA_THREAD) ++static void ++spa_thread(void *arg) ++{ ++ callb_cpr_t cprinfo; ++ ++ spa_t *spa = arg; ++ user_t *pu = PTOU(curproc); ++ ++ CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, ++ spa->spa_name); ++ ++ ASSERT(curproc != &p0); ++ (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), ++ "zpool-%s", spa->spa_name); ++ (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); ++ ++ /* bind this thread to the requested psrset */ ++ if (zio_taskq_psrset_bind != PS_NONE) { ++ pool_lock(); ++ mutex_enter(&cpu_lock); ++ mutex_enter(&pidlock); ++ mutex_enter(&curproc->p_lock); ++ ++ if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, ++ 0, NULL, NULL) == 0) { ++ curthread->t_bind_pset = zio_taskq_psrset_bind; ++ } else { ++ cmn_err(CE_WARN, ++ "Couldn't bind process for zfs pool \"%s\" to " ++ "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); ++ } ++ ++ mutex_exit(&curproc->p_lock); ++ mutex_exit(&pidlock); ++ mutex_exit(&cpu_lock); ++ pool_unlock(); ++ } ++ ++ if (zio_taskq_sysdc) { ++ sysdc_thread_enter(curthread, 100, 0); ++ } ++ ++ spa->spa_proc = curproc; ++ spa->spa_did = curthread->t_did; ++ ++ spa_create_zio_taskqs(spa); ++ ++ mutex_enter(&spa->spa_proc_lock); ++ ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); ++ ++ spa->spa_proc_state = SPA_PROC_ACTIVE; ++ cv_broadcast(&spa->spa_proc_cv); ++ ++ CALLB_CPR_SAFE_BEGIN(&cprinfo); ++ while (spa->spa_proc_state == SPA_PROC_ACTIVE) ++ cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); ++ 
CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); ++ ++ ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); ++ spa->spa_proc_state = SPA_PROC_GONE; ++ spa->spa_proc = &p0; ++ cv_broadcast(&spa->spa_proc_cv); ++ CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ ++ ++ mutex_enter(&curproc->p_lock); ++ lwp_exit(); ++} ++#endif ++ ++/* ++ * Activate an uninitialized pool. ++ */ ++static void ++spa_activate(spa_t *spa, int mode) ++{ ++ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); ++ ++ spa->spa_state = POOL_STATE_ACTIVE; ++ spa->spa_mode = mode; ++ ++ spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); ++ spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); ++ ++ /* Try to create a covering process */ ++ mutex_enter(&spa->spa_proc_lock); ++ ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ++ ASSERT(spa->spa_proc == &p0); ++ spa->spa_did = 0; ++ ++#ifdef HAVE_SPA_THREAD ++ /* Only create a process if we're going to be around a while. */ ++ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { ++ if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, ++ NULL, 0) == 0) { ++ spa->spa_proc_state = SPA_PROC_CREATED; ++ while (spa->spa_proc_state == SPA_PROC_CREATED) { ++ cv_wait(&spa->spa_proc_cv, ++ &spa->spa_proc_lock); ++ } ++ ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ++ ASSERT(spa->spa_proc != &p0); ++ ASSERT(spa->spa_did != 0); ++ } else { ++#ifdef _KERNEL ++ cmn_err(CE_WARN, ++ "Couldn't create process for zfs pool \"%s\"\n", ++ spa->spa_name); ++#endif ++ } ++ } ++#endif /* HAVE_SPA_THREAD */ ++ mutex_exit(&spa->spa_proc_lock); ++ ++ /* If we didn't create a process, we need to create our taskqs. */ ++ if (spa->spa_proc == &p0) { ++ spa_create_zio_taskqs(spa); ++ } ++ ++ list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), ++ offsetof(vdev_t, vdev_config_dirty_node)); ++ list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), ++ offsetof(vdev_t, vdev_state_dirty_node)); ++ ++ txg_list_create(&spa->spa_vdev_txg_list, ++ offsetof(struct vdev, vdev_txg_node)); ++ ++ avl_create(&spa->spa_errlist_scrub, ++ spa_error_entry_compare, sizeof (spa_error_entry_t), ++ offsetof(spa_error_entry_t, se_avl)); ++ avl_create(&spa->spa_errlist_last, ++ spa_error_entry_compare, sizeof (spa_error_entry_t), ++ offsetof(spa_error_entry_t, se_avl)); ++} ++ ++/* ++ * Opposite of spa_activate(). ++ */ ++static void ++spa_deactivate(spa_t *spa) ++{ ++ int t, q; ++ ++ ASSERT(spa->spa_sync_on == B_FALSE); ++ ASSERT(spa->spa_dsl_pool == NULL); ++ ASSERT(spa->spa_root_vdev == NULL); ++ ASSERT(spa->spa_async_zio_root == NULL); ++ ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); ++ ++ txg_list_destroy(&spa->spa_vdev_txg_list); ++ ++ list_destroy(&spa->spa_config_dirty_list); ++ list_destroy(&spa->spa_state_dirty_list); ++ ++ for (t = 0; t < ZIO_TYPES; t++) { ++ for (q = 0; q < ZIO_TASKQ_TYPES; q++) { ++ if (spa->spa_zio_taskq[t][q] != NULL) ++ taskq_destroy(spa->spa_zio_taskq[t][q]); ++ spa->spa_zio_taskq[t][q] = NULL; ++ } ++ } ++ ++ metaslab_class_destroy(spa->spa_normal_class); ++ spa->spa_normal_class = NULL; ++ ++ metaslab_class_destroy(spa->spa_log_class); ++ spa->spa_log_class = NULL; ++ ++ /* ++ * If this was part of an import or the open otherwise failed, we may ++ * still have errors left in the queues. Empty them just in case. 
++ */ ++ spa_errlog_drain(spa); ++ ++ avl_destroy(&spa->spa_errlist_scrub); ++ avl_destroy(&spa->spa_errlist_last); ++ ++ spa->spa_state = POOL_STATE_UNINITIALIZED; ++ ++ mutex_enter(&spa->spa_proc_lock); ++ if (spa->spa_proc_state != SPA_PROC_NONE) { ++ ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ++ spa->spa_proc_state = SPA_PROC_DEACTIVATE; ++ cv_broadcast(&spa->spa_proc_cv); ++ while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { ++ ASSERT(spa->spa_proc != &p0); ++ cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); ++ } ++ ASSERT(spa->spa_proc_state == SPA_PROC_GONE); ++ spa->spa_proc_state = SPA_PROC_NONE; ++ } ++ ASSERT(spa->spa_proc == &p0); ++ mutex_exit(&spa->spa_proc_lock); ++ ++ /* ++ * We want to make sure spa_thread() has actually exited the ZFS ++ * module, so that the module can't be unloaded out from underneath ++ * it. ++ */ ++ if (spa->spa_did != 0) { ++ thread_join(spa->spa_did); ++ spa->spa_did = 0; ++ } ++} ++ ++/* ++ * Verify a pool configuration, and construct the vdev tree appropriately. This ++ * will create all the necessary vdevs in the appropriate layout, with each vdev ++ * in the CLOSED state. This will prep the pool before open/creation/import. ++ * All vdev validation is done by the vdev_alloc() routine. ++ */ ++static int ++spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, ++ uint_t id, int atype) ++{ ++ nvlist_t **child; ++ uint_t children; ++ int error; ++ int c; ++ ++ if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) ++ return (error); ++ ++ if ((*vdp)->vdev_ops->vdev_op_leaf) ++ return (0); ++ ++ error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, ++ &child, &children); ++ ++ if (error == ENOENT) ++ return (0); ++ ++ if (error) { ++ vdev_free(*vdp); ++ *vdp = NULL; ++ return (EINVAL); ++ } ++ ++ for (c = 0; c < children; c++) { ++ vdev_t *vd; ++ if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, ++ atype)) != 0) { ++ vdev_free(*vdp); ++ *vdp = NULL; ++ return (error); ++ } ++ } ++ ++ ASSERT(*vdp != NULL); ++ ++ return (0); ++} ++ ++/* ++ * Opposite of spa_load(). ++ */ ++static void ++spa_unload(spa_t *spa) ++{ ++ int i; ++ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ++ /* ++ * Stop async tasks. ++ */ ++ spa_async_suspend(spa); ++ ++ /* ++ * Stop syncing. ++ */ ++ if (spa->spa_sync_on) { ++ txg_sync_stop(spa->spa_dsl_pool); ++ spa->spa_sync_on = B_FALSE; ++ } ++ ++ /* ++ * Wait for any outstanding async I/O to complete. ++ */ ++ if (spa->spa_async_zio_root != NULL) { ++ (void) zio_wait(spa->spa_async_zio_root); ++ spa->spa_async_zio_root = NULL; ++ } ++ ++ bpobj_close(&spa->spa_deferred_bpobj); ++ ++ /* ++ * Close the dsl pool. ++ */ ++ if (spa->spa_dsl_pool) { ++ dsl_pool_close(spa->spa_dsl_pool); ++ spa->spa_dsl_pool = NULL; ++ spa->spa_meta_objset = NULL; ++ } ++ ++ ddt_unload(spa); ++ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ ++ /* ++ * Drop and purge level 2 cache ++ */ ++ spa_l2cache_drop(spa); ++ ++ /* ++ * Close all vdevs. 
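++ * Freeing the root vdev tears down the whole vdev tree; the spare and
++ * l2cache auxiliary vdev arrays are released separately below.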
++ */ ++ if (spa->spa_root_vdev) ++ vdev_free(spa->spa_root_vdev); ++ ASSERT(spa->spa_root_vdev == NULL); ++ ++ for (i = 0; i < spa->spa_spares.sav_count; i++) ++ vdev_free(spa->spa_spares.sav_vdevs[i]); ++ if (spa->spa_spares.sav_vdevs) { ++ kmem_free(spa->spa_spares.sav_vdevs, ++ spa->spa_spares.sav_count * sizeof (void *)); ++ spa->spa_spares.sav_vdevs = NULL; ++ } ++ if (spa->spa_spares.sav_config) { ++ nvlist_free(spa->spa_spares.sav_config); ++ spa->spa_spares.sav_config = NULL; ++ } ++ spa->spa_spares.sav_count = 0; ++ ++ for (i = 0; i < spa->spa_l2cache.sav_count; i++) { ++ vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); ++ vdev_free(spa->spa_l2cache.sav_vdevs[i]); ++ } ++ if (spa->spa_l2cache.sav_vdevs) { ++ kmem_free(spa->spa_l2cache.sav_vdevs, ++ spa->spa_l2cache.sav_count * sizeof (void *)); ++ spa->spa_l2cache.sav_vdevs = NULL; ++ } ++ if (spa->spa_l2cache.sav_config) { ++ nvlist_free(spa->spa_l2cache.sav_config); ++ spa->spa_l2cache.sav_config = NULL; ++ } ++ spa->spa_l2cache.sav_count = 0; ++ ++ spa->spa_async_suspended = 0; ++ ++ if (spa->spa_comment != NULL) { ++ spa_strfree(spa->spa_comment); ++ spa->spa_comment = NULL; ++ } ++ ++ spa_config_exit(spa, SCL_ALL, FTAG); ++} ++ ++/* ++ * Load (or re-load) the current list of vdevs describing the active spares for ++ * this pool. When this is called, we have some form of basic information in ++ * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and ++ * then re-generate a more complete list including status information. ++ */ ++static void ++spa_load_spares(spa_t *spa) ++{ ++ nvlist_t **spares; ++ uint_t nspares; ++ int i; ++ vdev_t *vd, *tvd; ++ ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ++ ++ /* ++ * First, close and free any existing spare vdevs. ++ */ ++ for (i = 0; i < spa->spa_spares.sav_count; i++) { ++ vd = spa->spa_spares.sav_vdevs[i]; ++ ++ /* Undo the call to spa_activate() below */ ++ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, ++ B_FALSE)) != NULL && tvd->vdev_isspare) ++ spa_spare_remove(tvd); ++ vdev_close(vd); ++ vdev_free(vd); ++ } ++ ++ if (spa->spa_spares.sav_vdevs) ++ kmem_free(spa->spa_spares.sav_vdevs, ++ spa->spa_spares.sav_count * sizeof (void *)); ++ ++ if (spa->spa_spares.sav_config == NULL) ++ nspares = 0; ++ else ++ VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ++ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); ++ ++ spa->spa_spares.sav_count = (int)nspares; ++ spa->spa_spares.sav_vdevs = NULL; ++ ++ if (nspares == 0) ++ return; ++ ++ /* ++ * Construct the array of vdevs, opening them to get status in the ++ * process. For each spare, there is potentially two different vdev_t ++ * structures associated with it: one in the list of spares (used only ++ * for basic validation purposes) and one in the active vdev ++ * configuration (if it's spared in). During this phase we open and ++ * validate each vdev on the spare list. If the vdev also exists in the ++ * active configuration, then we also mark this vdev as an active spare. ++ */ ++ spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), ++ KM_PUSHPAGE); ++ for (i = 0; i < spa->spa_spares.sav_count; i++) { ++ VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, ++ VDEV_ALLOC_SPARE) == 0); ++ ASSERT(vd != NULL); ++ ++ spa->spa_spares.sav_vdevs[i] = vd; ++ ++ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, ++ B_FALSE)) != NULL) { ++ if (!tvd->vdev_isspare) ++ spa_spare_add(tvd); ++ ++ /* ++ * We only mark the spare active if we were successfully ++ * able to load the vdev. 
Otherwise, importing a pool ++ * with a bad active spare would result in strange ++ * behavior, because multiple pool would think the spare ++ * is actively in use. ++ * ++ * There is a vulnerability here to an equally bizarre ++ * circumstance, where a dead active spare is later ++ * brought back to life (onlined or otherwise). Given ++ * the rarity of this scenario, and the extra complexity ++ * it adds, we ignore the possibility. ++ */ ++ if (!vdev_is_dead(tvd)) ++ spa_spare_activate(tvd); ++ } ++ ++ vd->vdev_top = vd; ++ vd->vdev_aux = &spa->spa_spares; ++ ++ if (vdev_open(vd) != 0) ++ continue; ++ ++ if (vdev_validate_aux(vd) == 0) ++ spa_spare_add(vd); ++ } ++ ++ /* ++ * Recompute the stashed list of spares, with status information ++ * this time. ++ */ ++ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, ++ DATA_TYPE_NVLIST_ARRAY) == 0); ++ ++ spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), ++ KM_PUSHPAGE); ++ for (i = 0; i < spa->spa_spares.sav_count; i++) ++ spares[i] = vdev_config_generate(spa, ++ spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); ++ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ++ ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); ++ for (i = 0; i < spa->spa_spares.sav_count; i++) ++ nvlist_free(spares[i]); ++ kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); ++} ++ ++/* ++ * Load (or re-load) the current list of vdevs describing the active l2cache for ++ * this pool. When this is called, we have some form of basic information in ++ * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and ++ * then re-generate a more complete list including status information. ++ * Devices which are already active have their details maintained, and are ++ * not re-opened. ++ */ ++static void ++spa_load_l2cache(spa_t *spa) ++{ ++ nvlist_t **l2cache; ++ uint_t nl2cache; ++ int i, j, oldnvdevs; ++ uint64_t guid; ++ vdev_t *vd, **oldvdevs, **newvdevs = NULL; ++ spa_aux_vdev_t *sav = &spa->spa_l2cache; ++ ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ++ ++ if (sav->sav_config != NULL) { ++ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ++ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); ++ newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_PUSHPAGE); ++ } else { ++ nl2cache = 0; ++ } ++ ++ oldvdevs = sav->sav_vdevs; ++ oldnvdevs = sav->sav_count; ++ sav->sav_vdevs = NULL; ++ sav->sav_count = 0; ++ ++ /* ++ * Process new nvlist of vdevs. ++ */ ++ for (i = 0; i < nl2cache; i++) { ++ VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, ++ &guid) == 0); ++ ++ newvdevs[i] = NULL; ++ for (j = 0; j < oldnvdevs; j++) { ++ vd = oldvdevs[j]; ++ if (vd != NULL && guid == vd->vdev_guid) { ++ /* ++ * Retain previous vdev for add/remove ops. ++ */ ++ newvdevs[i] = vd; ++ oldvdevs[j] = NULL; ++ break; ++ } ++ } ++ ++ if (newvdevs[i] == NULL) { ++ /* ++ * Create new vdev ++ */ ++ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, ++ VDEV_ALLOC_L2CACHE) == 0); ++ ASSERT(vd != NULL); ++ newvdevs[i] = vd; ++ ++ /* ++ * Commit this vdev as an l2cache device, ++ * even if it fails to open. 
++ */ ++ spa_l2cache_add(vd); ++ ++ vd->vdev_top = vd; ++ vd->vdev_aux = sav; ++ ++ spa_l2cache_activate(vd); ++ ++ if (vdev_open(vd) != 0) ++ continue; ++ ++ (void) vdev_validate_aux(vd); ++ ++ if (!vdev_is_dead(vd)) ++ l2arc_add_vdev(spa, vd); ++ } ++ } ++ ++ /* ++ * Purge vdevs that were dropped ++ */ ++ for (i = 0; i < oldnvdevs; i++) { ++ uint64_t pool; ++ ++ vd = oldvdevs[i]; ++ if (vd != NULL) { ++ ASSERT(vd->vdev_isl2cache); ++ ++ if (spa_l2cache_exists(vd->vdev_guid, &pool) && ++ pool != 0ULL && l2arc_vdev_present(vd)) ++ l2arc_remove_vdev(vd); ++ vdev_clear_stats(vd); ++ vdev_free(vd); ++ } ++ } ++ ++ if (oldvdevs) ++ kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); ++ ++ if (sav->sav_config == NULL) ++ goto out; ++ ++ sav->sav_vdevs = newvdevs; ++ sav->sav_count = (int)nl2cache; ++ ++ /* ++ * Recompute the stashed list of l2cache devices, with status ++ * information this time. ++ */ ++ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, ++ DATA_TYPE_NVLIST_ARRAY) == 0); ++ ++ l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_PUSHPAGE); ++ for (i = 0; i < sav->sav_count; i++) ++ l2cache[i] = vdev_config_generate(spa, ++ sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); ++ VERIFY(nvlist_add_nvlist_array(sav->sav_config, ++ ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); ++out: ++ for (i = 0; i < sav->sav_count; i++) ++ nvlist_free(l2cache[i]); ++ if (sav->sav_count) ++ kmem_free(l2cache, sav->sav_count * sizeof (void *)); ++} ++ ++static int ++load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) ++{ ++ dmu_buf_t *db; ++ char *packed = NULL; ++ size_t nvsize = 0; ++ int error; ++ *value = NULL; ++ ++ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); ++ nvsize = *(uint64_t *)db->db_data; ++ dmu_buf_rele(db, FTAG); ++ ++ packed = kmem_alloc(nvsize, KM_PUSHPAGE | KM_NODEBUG); ++ error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, ++ DMU_READ_PREFETCH); ++ if (error == 0) ++ error = nvlist_unpack(packed, nvsize, value, 0); ++ kmem_free(packed, nvsize); ++ ++ return (error); ++} ++ ++/* ++ * Checks to see if the given vdev could not be opened, in which case we post a ++ * sysevent to notify the autoreplace code that the device has been removed. ++ */ ++static void ++spa_check_removed(vdev_t *vd) ++{ ++ int c; ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ spa_check_removed(vd->vdev_child[c]); ++ ++ if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { ++ zfs_ereport_post(FM_EREPORT_RESOURCE_AUTOREPLACE, ++ vd->vdev_spa, vd, NULL, 0, 0); ++ spa_event_notify(vd->vdev_spa, vd, FM_EREPORT_ZFS_DEVICE_CHECK); ++ } ++} ++ ++/* ++ * Validate the current config against the MOS config ++ */ ++static boolean_t ++spa_config_valid(spa_t *spa, nvlist_t *config) ++{ ++ vdev_t *mrvd, *rvd = spa->spa_root_vdev; ++ nvlist_t *nv; ++ int c, i; ++ ++ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); ++ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); ++ ++ ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); ++ ++ /* ++ * If we're doing a normal import, then build up any additional ++ * diagnostic information about missing devices in this config. ++ * We'll pass this up to the user for further processing. 
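 * [Editorial note, added in review -- not part of the original diff:
 * the "additional diagnostic information" assembled below is an nvlist of
 * the missing top-level log vdevs (taken from the MOS copy of the config),
 * attached to spa->spa_load_info under ZPOOL_CONFIG_MISSING_DEVICES so that
 * userland import code can report exactly which devices could not be
 * found.]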
++ */ ++ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { ++ nvlist_t **child, *nv; ++ uint64_t idx = 0; ++ ++ child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), ++ KM_PUSHPAGE); ++ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ ++ for (c = 0; c < rvd->vdev_children; c++) { ++ vdev_t *tvd = rvd->vdev_child[c]; ++ vdev_t *mtvd = mrvd->vdev_child[c]; ++ ++ if (tvd->vdev_ops == &vdev_missing_ops && ++ mtvd->vdev_ops != &vdev_missing_ops && ++ mtvd->vdev_islog) ++ child[idx++] = vdev_config_generate(spa, mtvd, ++ B_FALSE, 0); ++ } ++ ++ if (idx) { ++ VERIFY(nvlist_add_nvlist_array(nv, ++ ZPOOL_CONFIG_CHILDREN, child, idx) == 0); ++ VERIFY(nvlist_add_nvlist(spa->spa_load_info, ++ ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); ++ ++ for (i = 0; i < idx; i++) ++ nvlist_free(child[i]); ++ } ++ nvlist_free(nv); ++ kmem_free(child, rvd->vdev_children * sizeof (char **)); ++ } ++ ++ /* ++ * Compare the root vdev tree with the information we have ++ * from the MOS config (mrvd). Check each top-level vdev ++ * with the corresponding MOS config top-level (mtvd). ++ */ ++ for (c = 0; c < rvd->vdev_children; c++) { ++ vdev_t *tvd = rvd->vdev_child[c]; ++ vdev_t *mtvd = mrvd->vdev_child[c]; ++ ++ /* ++ * Resolve any "missing" vdevs in the current configuration. ++ * If we find that the MOS config has more accurate information ++ * about the top-level vdev then use that vdev instead. ++ */ ++ if (tvd->vdev_ops == &vdev_missing_ops && ++ mtvd->vdev_ops != &vdev_missing_ops) { ++ ++ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) ++ continue; ++ ++ /* ++ * Device specific actions. ++ */ ++ if (mtvd->vdev_islog) { ++ spa_set_log_state(spa, SPA_LOG_CLEAR); ++ } else { ++ /* ++ * XXX - once we have 'readonly' pool ++ * support we should be able to handle ++ * missing data devices by transitioning ++ * the pool to readonly. ++ */ ++ continue; ++ } ++ ++ /* ++ * Swap the missing vdev with the data we were ++ * able to obtain from the MOS config. ++ */ ++ vdev_remove_child(rvd, tvd); ++ vdev_remove_child(mrvd, mtvd); ++ ++ vdev_add_child(rvd, mtvd); ++ vdev_add_child(mrvd, tvd); ++ ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ vdev_load(mtvd); ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ ++ vdev_reopen(rvd); ++ } else if (mtvd->vdev_islog) { ++ /* ++ * Load the slog device's state from the MOS config ++ * since it's possible that the label does not ++ * contain the most up-to-date information. ++ */ ++ vdev_load_log_state(tvd, mtvd); ++ vdev_reopen(tvd); ++ } ++ } ++ vdev_free(mrvd); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ ++ /* ++ * Ensure we were able to validate the config. 
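 * [Editorial note, added in review -- not part of the original diff:
 * "validated" here reduces to the guid-sum identity returned below: the
 * vdev_guid_sum maintained for the assembled tree must equal the value
 * recorded in the uberblock when it was written.  As I read the vdev code,
 * that sum accumulates the guid of a vdev and all of its descendants, so a
 * rough, illustrative equivalent (not how the kernel actually computes it;
 * the real bookkeeping is maintained incrementally in vdev.c) would be:
 *
 *	static uint64_t
 *	guid_sum(vdev_t *vd)
 *	{
 *		uint64_t sum = vd->vdev_guid;
 *		int c;
 *
 *		for (c = 0; c < vd->vdev_children; c++)
 *			sum += guid_sum(vd->vdev_child[c]);
 *		return (sum);
 *	}
 *
 * Any device that went missing or was swapped since the uberblock was
 * written makes the sums diverge, and the caller then reports
 * VDEV_AUX_BAD_GUID_SUM.]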
++ */ ++ return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); ++} ++ ++/* ++ * Check for missing log devices ++ */ ++static int ++spa_check_logs(spa_t *spa) ++{ ++ switch (spa->spa_log_state) { ++ default: ++ break; ++ case SPA_LOG_MISSING: ++ /* need to recheck in case slog has been restored */ ++ case SPA_LOG_UNKNOWN: ++ if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, ++ DS_FIND_CHILDREN)) { ++ spa_set_log_state(spa, SPA_LOG_MISSING); ++ return (1); ++ } ++ break; ++ } ++ return (0); ++} ++ ++static boolean_t ++spa_passivate_log(spa_t *spa) ++{ ++ vdev_t *rvd = spa->spa_root_vdev; ++ boolean_t slog_found = B_FALSE; ++ int c; ++ ++ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); ++ ++ if (!spa_has_slogs(spa)) ++ return (B_FALSE); ++ ++ for (c = 0; c < rvd->vdev_children; c++) { ++ vdev_t *tvd = rvd->vdev_child[c]; ++ metaslab_group_t *mg = tvd->vdev_mg; ++ ++ if (tvd->vdev_islog) { ++ metaslab_group_passivate(mg); ++ slog_found = B_TRUE; ++ } ++ } ++ ++ return (slog_found); ++} ++ ++static void ++spa_activate_log(spa_t *spa) ++{ ++ vdev_t *rvd = spa->spa_root_vdev; ++ int c; ++ ++ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); ++ ++ for (c = 0; c < rvd->vdev_children; c++) { ++ vdev_t *tvd = rvd->vdev_child[c]; ++ metaslab_group_t *mg = tvd->vdev_mg; ++ ++ if (tvd->vdev_islog) ++ metaslab_group_activate(mg); ++ } ++} ++ ++int ++spa_offline_log(spa_t *spa) ++{ ++ int error = 0; ++ ++ if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, ++ NULL, DS_FIND_CHILDREN)) == 0) { ++ ++ /* ++ * We successfully offlined the log device, sync out the ++ * current txg so that the "stubby" block can be removed ++ * by zil_sync(). ++ */ ++ txg_wait_synced(spa->spa_dsl_pool, 0); ++ } ++ return (error); ++} ++ ++static void ++spa_aux_check_removed(spa_aux_vdev_t *sav) ++{ ++ int i; ++ ++ for (i = 0; i < sav->sav_count; i++) ++ spa_check_removed(sav->sav_vdevs[i]); ++} ++ ++void ++spa_claim_notify(zio_t *zio) ++{ ++ spa_t *spa = zio->io_spa; ++ ++ if (zio->io_error) ++ return; ++ ++ mutex_enter(&spa->spa_props_lock); /* any mutex will do */ ++ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) ++ spa->spa_claim_max_txg = zio->io_bp->blk_birth; ++ mutex_exit(&spa->spa_props_lock); ++} ++ ++typedef struct spa_load_error { ++ uint64_t sle_meta_count; ++ uint64_t sle_data_count; ++} spa_load_error_t; ++ ++static void ++spa_load_verify_done(zio_t *zio) ++{ ++ blkptr_t *bp = zio->io_bp; ++ spa_load_error_t *sle = zio->io_private; ++ dmu_object_type_t type = BP_GET_TYPE(bp); ++ int error = zio->io_error; ++ ++ if (error) { ++ if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && ++ type != DMU_OT_INTENT_LOG) ++ atomic_add_64(&sle->sle_meta_count, 1); ++ else ++ atomic_add_64(&sle->sle_data_count, 1); ++ } ++ zio_data_buf_free(zio->io_data, zio->io_size); ++} ++ ++/*ARGSUSED*/ ++static int ++spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, ++ arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) ++{ ++ if (bp != NULL) { ++ zio_t *rio = arg; ++ size_t size = BP_GET_PSIZE(bp); ++ void *data = zio_data_buf_alloc(size); ++ ++ zio_nowait(zio_read(rio, spa, bp, data, size, ++ spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ++ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ++ ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); ++ } ++ return (0); ++} ++ ++static int ++spa_load_verify(spa_t *spa) ++{ ++ zio_t *rio; ++ spa_load_error_t sle = { 0 }; ++ zpool_rewind_policy_t policy; ++ boolean_t verify_ok = B_FALSE; ++ int error; ++ ++ 
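/*
 * [Editorial note, added in review -- not part of the original diff.]
 * spa_load_verify() carries no header comment in this hunk, so a summary of
 * what the code below does: it consults the pool's rewind policy (returning
 * immediately, i.e. skipping verification, when rewind is disallowed), then
 * walks the entire pool with traverse_pool(), issuing speculative
 * scrub-style reads through spa_load_verify_cb().  Each failed read is
 * tallied by spa_load_verify_done() as either a metadata or a data error.
 * The load is considered verified only if both counts stay within the
 * policy's zrp_maxmeta / zrp_maxdata limits, in which case the load
 * timestamp, rewind loss, and data-error count are recorded in
 * spa->spa_load_info; otherwise the function returns an error so the caller
 * can fail the load or rewind further back.
 */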
zpool_get_rewind_policy(spa->spa_config, &policy); ++ ++ if (policy.zrp_request & ZPOOL_NEVER_REWIND) ++ return (0); ++ ++ rio = zio_root(spa, NULL, &sle, ++ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); ++ ++ error = traverse_pool(spa, spa->spa_verify_min_txg, ++ TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); ++ ++ (void) zio_wait(rio); ++ ++ spa->spa_load_meta_errors = sle.sle_meta_count; ++ spa->spa_load_data_errors = sle.sle_data_count; ++ ++ if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && ++ sle.sle_data_count <= policy.zrp_maxdata) { ++ int64_t loss = 0; ++ ++ verify_ok = B_TRUE; ++ spa->spa_load_txg = spa->spa_uberblock.ub_txg; ++ spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; ++ ++ loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; ++ VERIFY(nvlist_add_uint64(spa->spa_load_info, ++ ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); ++ VERIFY(nvlist_add_int64(spa->spa_load_info, ++ ZPOOL_CONFIG_REWIND_TIME, loss) == 0); ++ VERIFY(nvlist_add_uint64(spa->spa_load_info, ++ ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); ++ } else { ++ spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; ++ } ++ ++ if (error) { ++ if (error != ENXIO && error != EIO) ++ error = EIO; ++ return (error); ++ } ++ ++ return (verify_ok ? 0 : EIO); ++} ++ ++/* ++ * Find a value in the pool props object. ++ */ ++static void ++spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) ++{ ++ (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, ++ zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); ++} ++ ++/* ++ * Find a value in the pool directory object. ++ */ ++static int ++spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) ++{ ++ return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, ++ name, sizeof (uint64_t), 1, val)); ++} ++ ++static int ++spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) ++{ ++ vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); ++ return (err); ++} ++ ++/* ++ * Fix up config after a partly-completed split. This is done with the ++ * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off ++ * pool have that entry in their config, but only the splitting one contains ++ * a list of all the guids of the vdevs that are being split off. ++ * ++ * This function determines what to do with that list: either rejoin ++ * all the disks to the pool, or complete the splitting process. To attempt ++ * the rejoin, each disk that is offlined is marked online again, and ++ * we do a reopen() call. If the vdev label for every disk that was ++ * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) ++ * then we call vdev_split() on each disk, and complete the split. ++ * ++ * Otherwise we leave the config alone, with all the vdevs in place in ++ * the original pool. 
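 * [Editorial note, added in review -- not part of the original diff:
 * concretely, the code below first onlines every listed vdev it can still
 * look up by guid and reopens the root vdev.  If some of the listed guids
 * can no longer be found in this pool, the reopen is skipped and the split
 * is simply completed; otherwise the split is completed (vdev_split() on
 * each disk) only when every vdev that was found reports
 * VDEV_AUX_SPLIT_POOL in its status, and any other outcome leaves the
 * configuration untouched so the disks rejoin the original pool.]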
++ */ ++static void ++spa_try_repair(spa_t *spa, nvlist_t *config) ++{ ++ uint_t extracted; ++ uint64_t *glist; ++ uint_t i, gcount; ++ nvlist_t *nvl; ++ vdev_t **vd; ++ boolean_t attempt_reopen; ++ ++ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) ++ return; ++ ++ /* check that the config is complete */ ++ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, ++ &glist, &gcount) != 0) ++ return; ++ ++ vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_PUSHPAGE); ++ ++ /* attempt to online all the vdevs & validate */ ++ attempt_reopen = B_TRUE; ++ for (i = 0; i < gcount; i++) { ++ if (glist[i] == 0) /* vdev is hole */ ++ continue; ++ ++ vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); ++ if (vd[i] == NULL) { ++ /* ++ * Don't bother attempting to reopen the disks; ++ * just do the split. ++ */ ++ attempt_reopen = B_FALSE; ++ } else { ++ /* attempt to re-online it */ ++ vd[i]->vdev_offline = B_FALSE; ++ } ++ } ++ ++ if (attempt_reopen) { ++ vdev_reopen(spa->spa_root_vdev); ++ ++ /* check each device to see what state it's in */ ++ for (extracted = 0, i = 0; i < gcount; i++) { ++ if (vd[i] != NULL && ++ vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) ++ break; ++ ++extracted; ++ } ++ } ++ ++ /* ++ * If every disk has been moved to the new pool, or if we never ++ * even attempted to look at them, then we split them off for ++ * good. ++ */ ++ if (!attempt_reopen || gcount == extracted) { ++ for (i = 0; i < gcount; i++) ++ if (vd[i] != NULL) ++ vdev_split(vd[i]); ++ vdev_reopen(spa->spa_root_vdev); ++ } ++ ++ kmem_free(vd, gcount * sizeof (vdev_t *)); ++} ++ ++static int ++spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, ++ boolean_t mosconfig) ++{ ++ nvlist_t *config = spa->spa_config; ++ char *ereport = FM_EREPORT_ZFS_POOL; ++ char *comment; ++ int error; ++ uint64_t pool_guid; ++ nvlist_t *nvl; ++ ++ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) ++ return (EINVAL); ++ ++ ASSERT(spa->spa_comment == NULL); ++ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) ++ spa->spa_comment = spa_strdup(comment); ++ ++ /* ++ * Versioning wasn't explicitly added to the label until later, so if ++ * it's not present treat it as the initial version. ++ */ ++ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, ++ &spa->spa_ubsync.ub_version) != 0) ++ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; ++ ++ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, ++ &spa->spa_config_txg); ++ ++ if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && ++ spa_guid_exists(pool_guid, 0)) { ++ error = EEXIST; ++ } else { ++ spa->spa_config_guid = pool_guid; ++ ++ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, ++ &nvl) == 0) { ++ VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, ++ KM_PUSHPAGE) == 0); ++ } ++ ++ gethrestime(&spa->spa_loaded_ts); ++ error = spa_load_impl(spa, pool_guid, config, state, type, ++ mosconfig, &ereport); ++ } ++ ++ spa->spa_minref = refcount_count(&spa->spa_refcount); ++ if (error) { ++ if (error != EEXIST) { ++ spa->spa_loaded_ts.tv_sec = 0; ++ spa->spa_loaded_ts.tv_nsec = 0; ++ } ++ if (error != EBADF) { ++ zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); ++ } ++ } ++ spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; ++ spa->spa_ena = 0; ++ ++ return (error); ++} ++ ++/* ++ * Load an existing storage pool, using the pool's builtin spa_config as a ++ * source of configuration information. 
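 * [Editorial note, added in review -- not part of the original diff:
 * a roadmap of the major phases below, for readers navigating this long
 * function: (1) parse the config into a vdev tree, open the vdevs, and
 * validate their labels; (2) pick the best uberblock and sanity-check the
 * pool version and vdev guid sum (attempting split repair if needed);
 * (3) open the DSL pool and the MOS; (4) if we were called with an
 * untrusted cached config, read the real config from the MOS and restart
 * the load with it; (5) load spares, l2cache devices, and pool properties;
 * (6) cross-check the config against the MOS copy, check the logs, and run
 * spa_load_verify(); (7) for writable pools, claim outstanding ZIL blocks,
 * start txg sync, and queue async config-update / resilver work as
 * needed.]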
++ */ ++__attribute__((always_inline)) ++static inline int ++spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, ++ spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, ++ char **ereport) ++{ ++ int error = 0; ++ nvlist_t *nvroot = NULL; ++ vdev_t *rvd; ++ uberblock_t *ub = &spa->spa_uberblock; ++ uint64_t children, config_cache_txg = spa->spa_config_txg; ++ int orig_mode = spa->spa_mode; ++ int parse; ++ uint64_t obj; ++ ++ /* ++ * If this is an untrusted config, access the pool in read-only mode. ++ * This prevents things like resilvering recently removed devices. ++ */ ++ if (!mosconfig) ++ spa->spa_mode = FREAD; ++ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ++ spa->spa_load_state = state; ++ ++ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) ++ return (EINVAL); ++ ++ parse = (type == SPA_IMPORT_EXISTING ? ++ VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); ++ ++ /* ++ * Create "The Godfather" zio to hold all async IOs ++ */ ++ spa->spa_async_zio_root = zio_root(spa, NULL, NULL, ++ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); ++ ++ /* ++ * Parse the configuration into a vdev tree. We explicitly set the ++ * value that will be returned by spa_version() since parsing the ++ * configuration requires knowing the version number. ++ */ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ ++ if (error != 0) ++ return (error); ++ ++ ASSERT(spa->spa_root_vdev == rvd); ++ ++ if (type != SPA_IMPORT_ASSEMBLE) { ++ ASSERT(spa_guid(spa) == pool_guid); ++ } ++ ++ /* ++ * Try to open all vdevs, loading each label in the process. ++ */ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ error = vdev_open(rvd); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ if (error != 0) ++ return (error); ++ ++ /* ++ * We need to validate the vdev labels against the configuration that ++ * we have in hand, which is dependent on the setting of mosconfig. If ++ * mosconfig is true then we're validating the vdev labels based on ++ * that config. Otherwise, we're validating against the cached config ++ * (zpool.cache) that was read when we loaded the zfs module, and then ++ * later we will recursively call spa_load() and validate against ++ * the vdev config. ++ * ++ * If we're assembling a new pool that's been split off from an ++ * existing pool, the labels haven't yet been updated so we skip ++ * validation for now. ++ */ ++ if (type != SPA_IMPORT_ASSEMBLE) { ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ error = vdev_validate(rvd, mosconfig); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ ++ if (error != 0) ++ return (error); ++ ++ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) ++ return (ENXIO); ++ } ++ ++ /* ++ * Find the best uberblock. ++ */ ++ vdev_uberblock_load(NULL, rvd, ub); ++ ++ /* ++ * If we weren't able to find a single valid uberblock, return failure. ++ */ ++ if (ub->ub_txg == 0) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); ++ ++ /* ++ * If the pool is newer than the code, we can't open it. ++ */ ++ if (ub->ub_version > SPA_VERSION) ++ return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); ++ ++ /* ++ * If the vdev guid sum doesn't match the uberblock, we have an ++ * incomplete configuration. We first check to see if the pool ++ * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). ++ * If it is, defer the vdev_guid_sum check till later so we ++ * can handle missing vdevs. 
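 * [Editorial note, added in review -- not part of the original diff:
 * in other words, the immediate VDEV_AUX_BAD_GUID_SUM failure below is only
 * taken when the config does not carry ZPOOL_CONFIG_VDEV_CHILDREN, we are
 * loading from the trusted MOS config, and this is not a split assembly;
 * otherwise the guid-sum check is picked up later by spa_config_valid()
 * for non-split loads.]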
++ */ ++ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, ++ &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && ++ rvd->vdev_guid_sum != ub->ub_guid_sum) ++ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); ++ ++ if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ spa_try_repair(spa, config); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ nvlist_free(spa->spa_config_splitting); ++ spa->spa_config_splitting = NULL; ++ } ++ ++ /* ++ * Initialize internal SPA structures. ++ */ ++ spa->spa_state = POOL_STATE_ACTIVE; ++ spa->spa_ubsync = spa->spa_uberblock; ++ spa->spa_verify_min_txg = spa->spa_extreme_rewind ? ++ TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; ++ spa->spa_first_txg = spa->spa_last_ubsync_txg ? ++ spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; ++ spa->spa_claim_max_txg = spa->spa_first_txg; ++ spa->spa_prev_software_version = ub->ub_software_version; ++ ++ error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); ++ if (error) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; ++ ++ if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ ++ if (!mosconfig) { ++ uint64_t hostid; ++ nvlist_t *policy = NULL, *nvconfig; ++ ++ if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ ++ if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, ++ ZPOOL_CONFIG_HOSTID, &hostid) == 0) { ++ char *hostname; ++ unsigned long myhostid = 0; ++ ++ VERIFY(nvlist_lookup_string(nvconfig, ++ ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); ++ ++#ifdef _KERNEL ++ myhostid = zone_get_hostid(NULL); ++#else /* _KERNEL */ ++ /* ++ * We're emulating the system's hostid in userland, so ++ * we can't use zone_get_hostid(). ++ */ ++ (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); ++#endif /* _KERNEL */ ++ if (hostid != 0 && myhostid != 0 && ++ hostid != myhostid) { ++ nvlist_free(nvconfig); ++ cmn_err(CE_WARN, "pool '%s' could not be " ++ "loaded as it was last accessed by " ++ "another system (host: %s hostid: 0x%lx). " ++ "See: http://zfsonlinux.org/msg/ZFS-8000-EY", ++ spa_name(spa), hostname, ++ (unsigned long)hostid); ++ return (EBADF); ++ } ++ } ++ if (nvlist_lookup_nvlist(spa->spa_config, ++ ZPOOL_REWIND_POLICY, &policy) == 0) ++ VERIFY(nvlist_add_nvlist(nvconfig, ++ ZPOOL_REWIND_POLICY, policy) == 0); ++ ++ spa_config_set(spa, nvconfig); ++ spa_unload(spa); ++ spa_deactivate(spa); ++ spa_activate(spa, orig_mode); ++ ++ return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); ++ } ++ ++ if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); ++ if (error != 0) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ ++ /* ++ * Load the bit that tells us to use the new accounting function ++ * (raid-z deflation). If we have an older pool, this will not ++ * be present. 
++ */ ++ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); ++ if (error != 0 && error != ENOENT) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ ++ error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, ++ &spa->spa_creation_version); ++ if (error != 0 && error != ENOENT) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ ++ /* ++ * Load the persistent error log. If we have an older pool, this will ++ * not be present. ++ */ ++ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); ++ if (error != 0 && error != ENOENT) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ ++ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, ++ &spa->spa_errlog_scrub); ++ if (error != 0 && error != ENOENT) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ ++ /* ++ * Load the history object. If we have an older pool, this ++ * will not be present. ++ */ ++ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); ++ if (error != 0 && error != ENOENT) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ ++ /* ++ * If we're assembling the pool from the split-off vdevs of ++ * an existing pool, we don't want to attach the spares & cache ++ * devices. ++ */ ++ ++ /* ++ * Load any hot spares for this pool. ++ */ ++ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); ++ if (error != 0 && error != ENOENT) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ++ ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); ++ if (load_nvlist(spa, spa->spa_spares.sav_object, ++ &spa->spa_spares.sav_config) != 0) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ spa_load_spares(spa); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ } else if (error == 0) { ++ spa->spa_spares.sav_sync = B_TRUE; ++ } ++ ++ /* ++ * Load any level 2 ARC devices for this pool. ++ */ ++ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, ++ &spa->spa_l2cache.sav_object); ++ if (error != 0 && error != ENOENT) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ++ ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); ++ if (load_nvlist(spa, spa->spa_l2cache.sav_object, ++ &spa->spa_l2cache.sav_config) != 0) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ spa_load_l2cache(spa); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ } else if (error == 0) { ++ spa->spa_l2cache.sav_sync = B_TRUE; ++ } ++ ++ spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); ++ ++ error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); ++ if (error && error != ENOENT) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ ++ if (error == 0) { ++ uint64_t autoreplace; ++ ++ spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); ++ spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); ++ spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); ++ spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); ++ spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); ++ spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, ++ &spa->spa_dedup_ditto); ++ ++ spa->spa_autoreplace = (autoreplace != 0); ++ } ++ ++ /* ++ * If the 'autoreplace' property is set, then post a resource notifying ++ * the ZFS DE that it should not issue any faults for unopenable ++ * devices. 
We also iterate over the vdevs, and post a sysevent for any ++ * unopenable vdevs so that the normal autoreplace handler can take ++ * over. ++ */ ++ if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { ++ spa_check_removed(spa->spa_root_vdev); ++ /* ++ * For the import case, this is done in spa_import(), because ++ * at this point we're using the spare definitions from ++ * the MOS config, not necessarily from the userland config. ++ */ ++ if (state != SPA_LOAD_IMPORT) { ++ spa_aux_check_removed(&spa->spa_spares); ++ spa_aux_check_removed(&spa->spa_l2cache); ++ } ++ } ++ ++ /* ++ * Load the vdev state for all toplevel vdevs. ++ */ ++ vdev_load(rvd); ++ ++ /* ++ * Propagate the leaf DTLs we just loaded all the way up the tree. ++ */ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ vdev_dtl_reassess(rvd, 0, 0, B_FALSE); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ ++ /* ++ * Load the DDTs (dedup tables). ++ */ ++ error = ddt_load(spa); ++ if (error != 0) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ ++ spa_update_dspace(spa); ++ ++ /* ++ * Validate the config, using the MOS config to fill in any ++ * information which might be missing. If we fail to validate ++ * the config then declare the pool unfit for use. If we're ++ * assembling a pool from a split, the log is not transferred ++ * over. ++ */ ++ if (type != SPA_IMPORT_ASSEMBLE) { ++ nvlist_t *nvconfig; ++ ++ if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); ++ ++ if (!spa_config_valid(spa, nvconfig)) { ++ nvlist_free(nvconfig); ++ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ++ ENXIO)); ++ } ++ nvlist_free(nvconfig); ++ ++ /* ++ * Now that we've validate the config, check the state of the ++ * root vdev. If it can't be opened, it indicates one or ++ * more toplevel vdevs are faulted. ++ */ ++ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) ++ return (ENXIO); ++ ++ if (spa_check_logs(spa)) { ++ *ereport = FM_EREPORT_ZFS_LOG_REPLAY; ++ return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); ++ } ++ } ++ ++ /* ++ * We've successfully opened the pool, verify that we're ready ++ * to start pushing transactions. ++ */ ++ if (state != SPA_LOAD_TRYIMPORT) { ++ if ((error = spa_load_verify(spa))) ++ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ++ error)); ++ } ++ ++ if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || ++ spa->spa_load_max_txg == UINT64_MAX)) { ++ dmu_tx_t *tx; ++ int need_update = B_FALSE; ++ int c; ++ ++ ASSERT(state != SPA_LOAD_TRYIMPORT); ++ ++ /* ++ * Claim log blocks that haven't been committed yet. ++ * This must all happen in a single txg. ++ * Note: spa_claim_max_txg is updated by spa_claim_notify(), ++ * invoked from zil_claim_log_block()'s i/o done callback. ++ * Price of rollback is that we abandon the log. ++ */ ++ spa->spa_claiming = B_TRUE; ++ ++ tx = dmu_tx_create_assigned(spa_get_dsl(spa), ++ spa_first_txg(spa)); ++ (void) dmu_objset_find(spa_name(spa), ++ zil_claim, tx, DS_FIND_CHILDREN); ++ dmu_tx_commit(tx); ++ ++ spa->spa_claiming = B_FALSE; ++ ++ spa_set_log_state(spa, SPA_LOG_GOOD); ++ spa->spa_sync_on = B_TRUE; ++ txg_sync_start(spa->spa_dsl_pool); ++ ++ /* ++ * Wait for all claims to sync. We sync up to the highest ++ * claimed log block birth time so that claimed log blocks ++ * don't appear to be from the future. spa_claim_max_txg ++ * will have been set for us by either zil_check_log_chain() ++ * (invoked from spa_check_logs()) or zil_claim() above. 
++ */ ++ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); ++ ++ /* ++ * If the config cache is stale, or we have uninitialized ++ * metaslabs (see spa_vdev_add()), then update the config. ++ * ++ * If this is a verbatim import, trust the current ++ * in-core spa_config and update the disk labels. ++ */ ++ if (config_cache_txg != spa->spa_config_txg || ++ state == SPA_LOAD_IMPORT || ++ state == SPA_LOAD_RECOVER || ++ (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) ++ need_update = B_TRUE; ++ ++ for (c = 0; c < rvd->vdev_children; c++) ++ if (rvd->vdev_child[c]->vdev_ms_array == 0) ++ need_update = B_TRUE; ++ ++ /* ++ * Update the config cache asychronously in case we're the ++ * root pool, in which case the config cache isn't writable yet. ++ */ ++ if (need_update) ++ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); ++ ++ /* ++ * Check all DTLs to see if anything needs resilvering. ++ */ ++ if (!dsl_scan_resilvering(spa->spa_dsl_pool) && ++ vdev_resilver_needed(rvd, NULL, NULL)) ++ spa_async_request(spa, SPA_ASYNC_RESILVER); ++ ++ /* ++ * Delete any inconsistent datasets. ++ */ ++ (void) dmu_objset_find(spa_name(spa), ++ dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); ++ ++ /* ++ * Clean up any stale temporary dataset userrefs. ++ */ ++ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); ++ } ++ ++ return (0); ++} ++ ++static int ++spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) ++{ ++ int mode = spa->spa_mode; ++ ++ spa_unload(spa); ++ spa_deactivate(spa); ++ ++ spa->spa_load_max_txg--; ++ ++ spa_activate(spa, mode); ++ spa_async_suspend(spa); ++ ++ return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); ++} ++ ++static int ++spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, ++ uint64_t max_request, int rewind_flags) ++{ ++ nvlist_t *config = NULL; ++ int load_error, rewind_error; ++ uint64_t safe_rewind_txg; ++ uint64_t min_txg; ++ ++ if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { ++ spa->spa_load_max_txg = spa->spa_load_txg; ++ spa_set_log_state(spa, SPA_LOG_CLEAR); ++ } else { ++ spa->spa_load_max_txg = max_request; ++ } ++ ++ load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, ++ mosconfig); ++ if (load_error == 0) ++ return (0); ++ ++ if (spa->spa_root_vdev != NULL) ++ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); ++ ++ spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; ++ spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; ++ ++ if (rewind_flags & ZPOOL_NEVER_REWIND) { ++ nvlist_free(config); ++ return (load_error); ++ } ++ ++ /* Price of rolling back is discarding txgs, including log */ ++ if (state == SPA_LOAD_RECOVER) ++ spa_set_log_state(spa, SPA_LOG_CLEAR); ++ ++ spa->spa_load_max_txg = spa->spa_last_ubsync_txg; ++ safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; ++ min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
++ TXG_INITIAL : safe_rewind_txg; ++ ++ /* ++ * Continue as long as we're finding errors, we're still within ++ * the acceptable rewind range, and we're still finding uberblocks ++ */ ++ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && ++ spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { ++ if (spa->spa_load_max_txg < safe_rewind_txg) ++ spa->spa_extreme_rewind = B_TRUE; ++ rewind_error = spa_load_retry(spa, state, mosconfig); ++ } ++ ++ spa->spa_extreme_rewind = B_FALSE; ++ spa->spa_load_max_txg = UINT64_MAX; ++ ++ if (config && (rewind_error || state != SPA_LOAD_RECOVER)) ++ spa_config_set(spa, config); ++ ++ return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); ++} ++ ++/* ++ * Pool Open/Import ++ * ++ * The import case is identical to an open except that the configuration is sent ++ * down from userland, instead of grabbed from the configuration cache. For the ++ * case of an open, the pool configuration will exist in the ++ * POOL_STATE_UNINITIALIZED state. ++ * ++ * The stats information (gen/count/ustats) is used to gather vdev statistics at ++ * the same time open the pool, without having to keep around the spa_t in some ++ * ambiguous state. ++ */ ++static int ++spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, ++ nvlist_t **config) ++{ ++ spa_t *spa; ++ spa_load_state_t state = SPA_LOAD_OPEN; ++ int error; ++ int locked = B_FALSE; ++ ++ *spapp = NULL; ++ ++ /* ++ * As disgusting as this is, we need to support recursive calls to this ++ * function because dsl_dir_open() is called during spa_load(), and ends ++ * up calling spa_open() again. The real fix is to figure out how to ++ * avoid dsl_dir_open() calling this in the first place. ++ */ ++ if (mutex_owner(&spa_namespace_lock) != curthread) { ++ mutex_enter(&spa_namespace_lock); ++ locked = B_TRUE; ++ } ++ ++ if ((spa = spa_lookup(pool)) == NULL) { ++ if (locked) ++ mutex_exit(&spa_namespace_lock); ++ return (ENOENT); ++ } ++ ++ if (spa->spa_state == POOL_STATE_UNINITIALIZED) { ++ zpool_rewind_policy_t policy; ++ ++ zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, ++ &policy); ++ if (policy.zrp_request & ZPOOL_DO_REWIND) ++ state = SPA_LOAD_RECOVER; ++ ++ spa_activate(spa, spa_mode_global); ++ ++ if (state != SPA_LOAD_RECOVER) ++ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; ++ ++ error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, ++ policy.zrp_request); ++ ++ if (error == EBADF) { ++ /* ++ * If vdev_validate() returns failure (indicated by ++ * EBADF), it indicates that one of the vdevs indicates ++ * that the pool has been exported or destroyed. If ++ * this is the case, the config cache is out of sync and ++ * we should remove the pool from the namespace. ++ */ ++ spa_unload(spa); ++ spa_deactivate(spa); ++ spa_config_sync(spa, B_TRUE, B_TRUE); ++ spa_remove(spa); ++ if (locked) ++ mutex_exit(&spa_namespace_lock); ++ return (ENOENT); ++ } ++ ++ if (error) { ++ /* ++ * We can't open the pool, but we still have useful ++ * information: the state of each vdev after the ++ * attempted vdev_open(). Return this to the user. 
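 * [Editorial note, added in review -- not part of the original diff:
 * as the code below shows, this is done by duplicating spa->spa_config into
 * *config and attaching spa->spa_load_info under ZPOOL_CONFIG_LOAD_INFO,
 * so userland tools can display per-vdev state and any rewind or
 * missing-device details even though the open failed.]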
++ */ ++ if (config != NULL && spa->spa_config) { ++ VERIFY(nvlist_dup(spa->spa_config, config, ++ KM_PUSHPAGE) == 0); ++ VERIFY(nvlist_add_nvlist(*config, ++ ZPOOL_CONFIG_LOAD_INFO, ++ spa->spa_load_info) == 0); ++ } ++ spa_unload(spa); ++ spa_deactivate(spa); ++ spa->spa_last_open_failed = error; ++ if (locked) ++ mutex_exit(&spa_namespace_lock); ++ *spapp = NULL; ++ return (error); ++ } ++ } ++ ++ spa_open_ref(spa, tag); ++ ++ if (config != NULL) ++ *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); ++ ++ /* ++ * If we've recovered the pool, pass back any information we ++ * gathered while doing the load. ++ */ ++ if (state == SPA_LOAD_RECOVER) { ++ VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, ++ spa->spa_load_info) == 0); ++ } ++ ++ if (locked) { ++ spa->spa_last_open_failed = 0; ++ spa->spa_last_ubsync_txg = 0; ++ spa->spa_load_txg = 0; ++ mutex_exit(&spa_namespace_lock); ++ } ++ ++ *spapp = spa; ++ ++ return (0); ++} ++ ++int ++spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, ++ nvlist_t **config) ++{ ++ return (spa_open_common(name, spapp, tag, policy, config)); ++} ++ ++int ++spa_open(const char *name, spa_t **spapp, void *tag) ++{ ++ return (spa_open_common(name, spapp, tag, NULL, NULL)); ++} ++ ++/* ++ * Lookup the given spa_t, incrementing the inject count in the process, ++ * preventing it from being exported or destroyed. ++ */ ++spa_t * ++spa_inject_addref(char *name) ++{ ++ spa_t *spa; ++ ++ mutex_enter(&spa_namespace_lock); ++ if ((spa = spa_lookup(name)) == NULL) { ++ mutex_exit(&spa_namespace_lock); ++ return (NULL); ++ } ++ spa->spa_inject_ref++; ++ mutex_exit(&spa_namespace_lock); ++ ++ return (spa); ++} ++ ++void ++spa_inject_delref(spa_t *spa) ++{ ++ mutex_enter(&spa_namespace_lock); ++ spa->spa_inject_ref--; ++ mutex_exit(&spa_namespace_lock); ++} ++ ++/* ++ * Add spares device information to the nvlist. ++ */ ++static void ++spa_add_spares(spa_t *spa, nvlist_t *config) ++{ ++ nvlist_t **spares; ++ uint_t i, nspares; ++ nvlist_t *nvroot; ++ uint64_t guid; ++ vdev_stat_t *vs; ++ uint_t vsc; ++ uint64_t pool; ++ ++ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); ++ ++ if (spa->spa_spares.sav_count == 0) ++ return; ++ ++ VERIFY(nvlist_lookup_nvlist(config, ++ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); ++ VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ++ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); ++ if (nspares != 0) { ++ VERIFY(nvlist_add_nvlist_array(nvroot, ++ ZPOOL_CONFIG_SPARES, spares, nspares) == 0); ++ VERIFY(nvlist_lookup_nvlist_array(nvroot, ++ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); ++ ++ /* ++ * Go through and find any spares which have since been ++ * repurposed as an active spare. If this is the case, update ++ * their status appropriately. ++ */ ++ for (i = 0; i < nspares; i++) { ++ VERIFY(nvlist_lookup_uint64(spares[i], ++ ZPOOL_CONFIG_GUID, &guid) == 0); ++ if (spa_spare_exists(guid, &pool, NULL) && ++ pool != 0ULL) { ++ VERIFY(nvlist_lookup_uint64_array( ++ spares[i], ZPOOL_CONFIG_VDEV_STATS, ++ (uint64_t **)&vs, &vsc) == 0); ++ vs->vs_state = VDEV_STATE_CANT_OPEN; ++ vs->vs_aux = VDEV_AUX_SPARED; ++ } ++ } ++ } ++} ++ ++/* ++ * Add l2cache device information to the nvlist, including vdev stats. 
++ */ ++static void ++spa_add_l2cache(spa_t *spa, nvlist_t *config) ++{ ++ nvlist_t **l2cache; ++ uint_t i, j, nl2cache; ++ nvlist_t *nvroot; ++ uint64_t guid; ++ vdev_t *vd; ++ vdev_stat_t *vs; ++ uint_t vsc; ++ ++ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); ++ ++ if (spa->spa_l2cache.sav_count == 0) ++ return; ++ ++ VERIFY(nvlist_lookup_nvlist(config, ++ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); ++ VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ++ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); ++ if (nl2cache != 0) { ++ VERIFY(nvlist_add_nvlist_array(nvroot, ++ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); ++ VERIFY(nvlist_lookup_nvlist_array(nvroot, ++ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); ++ ++ /* ++ * Update level 2 cache device stats. ++ */ ++ ++ for (i = 0; i < nl2cache; i++) { ++ VERIFY(nvlist_lookup_uint64(l2cache[i], ++ ZPOOL_CONFIG_GUID, &guid) == 0); ++ ++ vd = NULL; ++ for (j = 0; j < spa->spa_l2cache.sav_count; j++) { ++ if (guid == ++ spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { ++ vd = spa->spa_l2cache.sav_vdevs[j]; ++ break; ++ } ++ } ++ ASSERT(vd != NULL); ++ ++ VERIFY(nvlist_lookup_uint64_array(l2cache[i], ++ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) ++ == 0); ++ vdev_get_stats(vd, vs); ++ } ++ } ++} ++ ++int ++spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) ++{ ++ int error; ++ spa_t *spa; ++ ++ *config = NULL; ++ error = spa_open_common(name, &spa, FTAG, NULL, config); ++ ++ if (spa != NULL) { ++ /* ++ * This still leaves a window of inconsistency where the spares ++ * or l2cache devices could change and the config would be ++ * self-inconsistent. ++ */ ++ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); ++ ++ if (*config != NULL) { ++ uint64_t loadtimes[2]; ++ ++ loadtimes[0] = spa->spa_loaded_ts.tv_sec; ++ loadtimes[1] = spa->spa_loaded_ts.tv_nsec; ++ VERIFY(nvlist_add_uint64_array(*config, ++ ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); ++ ++ VERIFY(nvlist_add_uint64(*config, ++ ZPOOL_CONFIG_ERRCOUNT, ++ spa_get_errlog_size(spa)) == 0); ++ ++ if (spa_suspended(spa)) ++ VERIFY(nvlist_add_uint64(*config, ++ ZPOOL_CONFIG_SUSPENDED, ++ spa->spa_failmode) == 0); ++ ++ spa_add_spares(spa, *config); ++ spa_add_l2cache(spa, *config); ++ } ++ } ++ ++ /* ++ * We want to get the alternate root even for faulted pools, so we cheat ++ * and call spa_lookup() directly. ++ */ ++ if (altroot) { ++ if (spa == NULL) { ++ mutex_enter(&spa_namespace_lock); ++ spa = spa_lookup(name); ++ if (spa) ++ spa_altroot(spa, altroot, buflen); ++ else ++ altroot[0] = '\0'; ++ spa = NULL; ++ mutex_exit(&spa_namespace_lock); ++ } else { ++ spa_altroot(spa, altroot, buflen); ++ } ++ } ++ ++ if (spa != NULL) { ++ spa_config_exit(spa, SCL_CONFIG, FTAG); ++ spa_close(spa, FTAG); ++ } ++ ++ return (error); ++} ++ ++/* ++ * Validate that the auxiliary device array is well formed. We must have an ++ * array of nvlists, each which describes a valid leaf vdev. If this is an ++ * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be ++ * specified, as long as they are well-formed. ++ */ ++static int ++spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, ++ spa_aux_vdev_t *sav, const char *config, uint64_t version, ++ vdev_labeltype_t label) ++{ ++ nvlist_t **dev; ++ uint_t i, ndev; ++ vdev_t *vd; ++ int error; ++ ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ++ ++ /* ++ * It's acceptable to have no devs specified. 
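 * [Editorial note, added in review -- not part of the original diff:
 * "no devs" means the array is absent entirely (return 0 below); an array
 * that is present but empty is rejected with EINVAL.  For reference, a
 * minimal sketch of the shape the caller is expected to pass in -- the
 * path is a placeholder and error handling is omitted:
 *
 *	nvlist_t *spare, *nvroot;
 *
 *	VERIFY(nvlist_alloc(&spare, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_DISK) == 0);
 *	VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_PATH, "/dev/sdX") == 0);
 *	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 *	    &spare, 1) == 0);
 *
 * i.e. an nvlist array under ZPOOL_CONFIG_SPARES (or ZPOOL_CONFIG_L2CACHE),
 * each element describing one leaf vdev.]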
++ */ ++ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) ++ return (0); ++ ++ if (ndev == 0) ++ return (EINVAL); ++ ++ /* ++ * Make sure the pool is formatted with a version that supports this ++ * device type. ++ */ ++ if (spa_version(spa) < version) ++ return (ENOTSUP); ++ ++ /* ++ * Set the pending device list so we correctly handle device in-use ++ * checking. ++ */ ++ sav->sav_pending = dev; ++ sav->sav_npending = ndev; ++ ++ for (i = 0; i < ndev; i++) { ++ if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, ++ mode)) != 0) ++ goto out; ++ ++ if (!vd->vdev_ops->vdev_op_leaf) { ++ vdev_free(vd); ++ error = EINVAL; ++ goto out; ++ } ++ ++ /* ++ * The L2ARC currently only supports disk devices in ++ * kernel context. For user-level testing, we allow it. ++ */ ++#ifdef _KERNEL ++ if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && ++ strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { ++ error = ENOTBLK; ++ vdev_free(vd); ++ goto out; ++ } ++#endif ++ vd->vdev_top = vd; ++ ++ if ((error = vdev_open(vd)) == 0 && ++ (error = vdev_label_init(vd, crtxg, label)) == 0) { ++ VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, ++ vd->vdev_guid) == 0); ++ } ++ ++ vdev_free(vd); ++ ++ if (error && ++ (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) ++ goto out; ++ else ++ error = 0; ++ } ++ ++out: ++ sav->sav_pending = NULL; ++ sav->sav_npending = 0; ++ return (error); ++} ++ ++static int ++spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) ++{ ++ int error; ++ ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ++ ++ if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, ++ &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, ++ VDEV_LABEL_SPARE)) != 0) { ++ return (error); ++ } ++ ++ return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, ++ &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, ++ VDEV_LABEL_L2CACHE)); ++} ++ ++static void ++spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, ++ const char *config) ++{ ++ int i; ++ ++ if (sav->sav_config != NULL) { ++ nvlist_t **olddevs; ++ uint_t oldndevs; ++ nvlist_t **newdevs; ++ ++ /* ++ * Generate new dev list by concatentating with the ++ * current dev list. ++ */ ++ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, ++ &olddevs, &oldndevs) == 0); ++ ++ newdevs = kmem_alloc(sizeof (void *) * ++ (ndevs + oldndevs), KM_PUSHPAGE); ++ for (i = 0; i < oldndevs; i++) ++ VERIFY(nvlist_dup(olddevs[i], &newdevs[i], ++ KM_PUSHPAGE) == 0); ++ for (i = 0; i < ndevs; i++) ++ VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], ++ KM_PUSHPAGE) == 0); ++ ++ VERIFY(nvlist_remove(sav->sav_config, config, ++ DATA_TYPE_NVLIST_ARRAY) == 0); ++ ++ VERIFY(nvlist_add_nvlist_array(sav->sav_config, ++ config, newdevs, ndevs + oldndevs) == 0); ++ for (i = 0; i < oldndevs + ndevs; i++) ++ nvlist_free(newdevs[i]); ++ kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); ++ } else { ++ /* ++ * Generate a new dev list. 
++ */ ++ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, ++ KM_PUSHPAGE) == 0); ++ VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, ++ devs, ndevs) == 0); ++ } ++} ++ ++/* ++ * Stop and drop level 2 ARC devices ++ */ ++void ++spa_l2cache_drop(spa_t *spa) ++{ ++ vdev_t *vd; ++ int i; ++ spa_aux_vdev_t *sav = &spa->spa_l2cache; ++ ++ for (i = 0; i < sav->sav_count; i++) { ++ uint64_t pool; ++ ++ vd = sav->sav_vdevs[i]; ++ ASSERT(vd != NULL); ++ ++ if (spa_l2cache_exists(vd->vdev_guid, &pool) && ++ pool != 0ULL && l2arc_vdev_present(vd)) ++ l2arc_remove_vdev(vd); ++ } ++} ++ ++/* ++ * Pool Creation ++ */ ++int ++spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, ++ const char *history_str, nvlist_t *zplprops) ++{ ++ spa_t *spa; ++ char *altroot = NULL; ++ vdev_t *rvd; ++ dsl_pool_t *dp; ++ dmu_tx_t *tx; ++ int error = 0; ++ uint64_t txg = TXG_INITIAL; ++ nvlist_t **spares, **l2cache; ++ uint_t nspares, nl2cache; ++ uint64_t version, obj; ++ int c; ++ ++ /* ++ * If this pool already exists, return failure. ++ */ ++ mutex_enter(&spa_namespace_lock); ++ if (spa_lookup(pool) != NULL) { ++ mutex_exit(&spa_namespace_lock); ++ return (EEXIST); ++ } ++ ++ /* ++ * Allocate a new spa_t structure. ++ */ ++ (void) nvlist_lookup_string(props, ++ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); ++ spa = spa_add(pool, NULL, altroot); ++ spa_activate(spa, spa_mode_global); ++ ++ if (props && (error = spa_prop_validate(spa, props))) { ++ spa_deactivate(spa); ++ spa_remove(spa); ++ mutex_exit(&spa_namespace_lock); ++ return (error); ++ } ++ ++ if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), ++ &version) != 0) ++ version = SPA_VERSION; ++ ASSERT(version <= SPA_VERSION); ++ ++ spa->spa_first_txg = txg; ++ spa->spa_uberblock.ub_txg = txg - 1; ++ spa->spa_uberblock.ub_version = version; ++ spa->spa_ubsync = spa->spa_uberblock; ++ ++ /* ++ * Create "The Godfather" zio to hold all async IOs ++ */ ++ spa->spa_async_zio_root = zio_root(spa, NULL, NULL, ++ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); ++ ++ /* ++ * Create the root vdev. ++ */ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ ++ error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ++ ++ ASSERT(error != 0 || rvd != NULL); ++ ASSERT(error != 0 || spa->spa_root_vdev == rvd); ++ ++ if (error == 0 && !zfs_allocatable_devs(nvroot)) ++ error = EINVAL; ++ ++ if (error == 0 && ++ (error = vdev_create(rvd, txg, B_FALSE)) == 0 && ++ (error = spa_validate_aux(spa, nvroot, txg, ++ VDEV_ALLOC_ADD)) == 0) { ++ for (c = 0; c < rvd->vdev_children; c++) { ++ vdev_metaslab_set_size(rvd->vdev_child[c]); ++ vdev_expand(rvd->vdev_child[c], txg); ++ } ++ } ++ ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ ++ if (error != 0) { ++ spa_unload(spa); ++ spa_deactivate(spa); ++ spa_remove(spa); ++ mutex_exit(&spa_namespace_lock); ++ return (error); ++ } ++ ++ /* ++ * Get the list of spares, if specified. ++ */ ++ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, ++ &spares, &nspares) == 0) { ++ VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, ++ KM_PUSHPAGE) == 0); ++ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ++ ZPOOL_CONFIG_SPARES, spares, nspares) == 0); ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ spa_load_spares(spa); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ spa->spa_spares.sav_sync = B_TRUE; ++ } ++ ++ /* ++ * Get the list of level 2 cache devices, if specified. 
++ */ ++ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, ++ &l2cache, &nl2cache) == 0) { ++ VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, ++ NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ++ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ spa_load_l2cache(spa); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ spa->spa_l2cache.sav_sync = B_TRUE; ++ } ++ ++ spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); ++ spa->spa_meta_objset = dp->dp_meta_objset; ++ ++ /* ++ * Create DDTs (dedup tables). ++ */ ++ ddt_create(spa); ++ ++ spa_update_dspace(spa); ++ ++ tx = dmu_tx_create_assigned(dp, txg); ++ ++ /* ++ * Create the pool config object. ++ */ ++ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, ++ DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, ++ DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); ++ ++ if (zap_add(spa->spa_meta_objset, ++ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, ++ sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { ++ cmn_err(CE_PANIC, "failed to add pool config"); ++ } ++ ++ if (zap_add(spa->spa_meta_objset, ++ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, ++ sizeof (uint64_t), 1, &version, tx) != 0) { ++ cmn_err(CE_PANIC, "failed to add pool version"); ++ } ++ ++ /* Newly created pools with the right version are always deflated. */ ++ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { ++ spa->spa_deflate = TRUE; ++ if (zap_add(spa->spa_meta_objset, ++ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, ++ sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { ++ cmn_err(CE_PANIC, "failed to add deflate"); ++ } ++ } ++ ++ /* ++ * Create the deferred-free bpobj. Turn off compression ++ * because sync-to-convergence takes longer if the blocksize ++ * keeps changing. ++ */ ++ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); ++ dmu_object_set_compress(spa->spa_meta_objset, obj, ++ ZIO_COMPRESS_OFF, tx); ++ if (zap_add(spa->spa_meta_objset, ++ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, ++ sizeof (uint64_t), 1, &obj, tx) != 0) { ++ cmn_err(CE_PANIC, "failed to add bpobj"); ++ } ++ VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, ++ spa->spa_meta_objset, obj)); ++ ++ /* ++ * Create the pool's history object. ++ */ ++ if (version >= SPA_VERSION_ZPOOL_HISTORY) ++ spa_history_create_obj(spa, tx); ++ ++ /* ++ * Set pool properties. ++ */ ++ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); ++ spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); ++ spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); ++ spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); ++ ++ if (props != NULL) { ++ spa_configfile_set(spa, props, B_FALSE); ++ spa_sync_props(spa, props, tx); ++ } ++ ++ dmu_tx_commit(tx); ++ ++ spa->spa_sync_on = B_TRUE; ++ txg_sync_start(spa->spa_dsl_pool); ++ ++ /* ++ * We explicitly wait for the first transaction to complete so that our ++ * bean counters are appropriately updated. 
++ */ ++ txg_wait_synced(spa->spa_dsl_pool, txg); ++ ++ spa_config_sync(spa, B_FALSE, B_TRUE); ++ ++ if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) ++ (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); ++ spa_history_log_version(spa, LOG_POOL_CREATE); ++ ++ spa->spa_minref = refcount_count(&spa->spa_refcount); ++ ++ mutex_exit(&spa_namespace_lock); ++ ++ return (0); ++} ++ ++#ifdef _KERNEL ++/* ++ * Get the root pool information from the root disk, then import the root pool ++ * during the system boot up time. ++ */ ++extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); ++ ++static nvlist_t * ++spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) ++{ ++ nvlist_t *config; ++ nvlist_t *nvtop, *nvroot; ++ uint64_t pgid; ++ ++ if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) ++ return (NULL); ++ ++ /* ++ * Add this top-level vdev to the child array. ++ */ ++ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, ++ &nvtop) == 0); ++ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, ++ &pgid) == 0); ++ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); ++ ++ /* ++ * Put this pool's top-level vdevs into a root vdev. ++ */ ++ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, ++ VDEV_TYPE_ROOT) == 0); ++ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); ++ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); ++ VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, ++ &nvtop, 1) == 0); ++ ++ /* ++ * Replace the existing vdev_tree with the new root vdev in ++ * this pool's configuration (remove the old, add the new). ++ */ ++ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); ++ nvlist_free(nvroot); ++ return (config); ++} ++ ++/* ++ * Walk the vdev tree and see if we can find a device with "better" ++ * configuration. A configuration is "better" if the label on that ++ * device has a more recent txg. ++ */ ++static void ++spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) ++{ ++ int c; ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ spa_alt_rootvdev(vd->vdev_child[c], avd, txg); ++ ++ if (vd->vdev_ops->vdev_op_leaf) { ++ nvlist_t *label; ++ uint64_t label_txg; ++ ++ if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, ++ &label) != 0) ++ return; ++ ++ VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, ++ &label_txg) == 0); ++ ++ /* ++ * Do we have a better boot device? ++ */ ++ if (label_txg > *txg) { ++ *txg = label_txg; ++ *avd = vd; ++ } ++ nvlist_free(label); ++ } ++} ++ ++/* ++ * Import a root pool. ++ * ++ * For x86. devpath_list will consist of devid and/or physpath name of ++ * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). ++ * The GRUB "findroot" command will return the vdev we should boot. ++ * ++ * For Sparc, devpath_list consists the physpath name of the booting device ++ * no matter the rootpool is a single device pool or a mirrored pool. ++ * e.g. ++ * "/pci@1f,0/ide@d/disk@0,0:a" ++ */ ++int ++spa_import_rootpool(char *devpath, char *devid) ++{ ++ spa_t *spa; ++ vdev_t *rvd, *bvd, *avd = NULL; ++ nvlist_t *config, *nvtop; ++ uint64_t guid, txg; ++ char *pname; ++ int error; ++ ++ /* ++ * Read the label from the boot device and generate a configuration. 
++ */ ++ config = spa_generate_rootconf(devpath, devid, &guid); ++#if defined(_OBP) && defined(_KERNEL) ++ if (config == NULL) { ++ if (strstr(devpath, "/iscsi/ssd") != NULL) { ++ /* iscsi boot */ ++ get_iscsi_bootpath_phy(devpath); ++ config = spa_generate_rootconf(devpath, devid, &guid); ++ } ++ } ++#endif ++ if (config == NULL) { ++ cmn_err(CE_NOTE, "Can not read the pool label from '%s'", ++ devpath); ++ return (EIO); ++ } ++ ++ VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, ++ &pname) == 0); ++ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); ++ ++ mutex_enter(&spa_namespace_lock); ++ if ((spa = spa_lookup(pname)) != NULL) { ++ /* ++ * Remove the existing root pool from the namespace so that we ++ * can replace it with the correct config we just read in. ++ */ ++ spa_remove(spa); ++ } ++ ++ spa = spa_add(pname, config, NULL); ++ spa->spa_is_root = B_TRUE; ++ spa->spa_import_flags = ZFS_IMPORT_VERBATIM; ++ ++ /* ++ * Build up a vdev tree based on the boot device's label config. ++ */ ++ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, ++ &nvtop) == 0); ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, ++ VDEV_ALLOC_ROOTPOOL); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ if (error) { ++ mutex_exit(&spa_namespace_lock); ++ nvlist_free(config); ++ cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", ++ pname); ++ return (error); ++ } ++ ++ /* ++ * Get the boot vdev. ++ */ ++ if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { ++ cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", ++ (u_longlong_t)guid); ++ error = ENOENT; ++ goto out; ++ } ++ ++ /* ++ * Determine if there is a better boot device. ++ */ ++ avd = bvd; ++ spa_alt_rootvdev(rvd, &avd, &txg); ++ if (avd != bvd) { ++ cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " ++ "try booting from '%s'", avd->vdev_path); ++ error = EINVAL; ++ goto out; ++ } ++ ++ /* ++ * If the boot device is part of a spare vdev then ensure that ++ * we're booting off the active spare. ++ */ ++ if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && ++ !bvd->vdev_isspare) { ++ cmn_err(CE_NOTE, "The boot device is currently spared. Please " ++ "try booting from '%s'", ++ bvd->vdev_parent-> ++ vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); ++ error = EINVAL; ++ goto out; ++ } ++ ++ error = 0; ++ spa_history_log_version(spa, LOG_POOL_IMPORT); ++out: ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ vdev_free(rvd); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ mutex_exit(&spa_namespace_lock); ++ ++ nvlist_free(config); ++ return (error); ++} ++ ++#endif ++ ++/* ++ * Import a non-root pool into the system. ++ */ ++int ++spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) ++{ ++ spa_t *spa; ++ char *altroot = NULL; ++ spa_load_state_t state = SPA_LOAD_IMPORT; ++ zpool_rewind_policy_t policy; ++ uint64_t mode = spa_mode_global; ++ uint64_t readonly = B_FALSE; ++ int error; ++ nvlist_t *nvroot; ++ nvlist_t **spares, **l2cache; ++ uint_t nspares, nl2cache; ++ ++ /* ++ * If a pool with this name exists, return failure. ++ */ ++ mutex_enter(&spa_namespace_lock); ++ if (spa_lookup(pool) != NULL) { ++ mutex_exit(&spa_namespace_lock); ++ return (EEXIST); ++ } ++ ++ /* ++ * Create and initialize the spa structure. 
++ */ ++ (void) nvlist_lookup_string(props, ++ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); ++ (void) nvlist_lookup_uint64(props, ++ zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); ++ if (readonly) ++ mode = FREAD; ++ spa = spa_add(pool, config, altroot); ++ spa->spa_import_flags = flags; ++ ++ /* ++ * Verbatim import - Take a pool and insert it into the namespace ++ * as if it had been loaded at boot. ++ */ ++ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { ++ if (props != NULL) ++ spa_configfile_set(spa, props, B_FALSE); ++ ++ spa_config_sync(spa, B_FALSE, B_TRUE); ++ ++ mutex_exit(&spa_namespace_lock); ++ spa_history_log_version(spa, LOG_POOL_IMPORT); ++ ++ return (0); ++ } ++ ++ spa_activate(spa, mode); ++ ++ /* ++ * Don't start async tasks until we know everything is healthy. ++ */ ++ spa_async_suspend(spa); ++ ++ zpool_get_rewind_policy(config, &policy); ++ if (policy.zrp_request & ZPOOL_DO_REWIND) ++ state = SPA_LOAD_RECOVER; ++ ++ /* ++ * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig ++ * because the user-supplied config is actually the one to trust when ++ * doing an import. ++ */ ++ if (state != SPA_LOAD_RECOVER) ++ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; ++ ++ error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, ++ policy.zrp_request); ++ ++ /* ++ * Propagate anything learned while loading the pool and pass it ++ * back to caller (i.e. rewind info, missing devices, etc). ++ */ ++ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, ++ spa->spa_load_info) == 0); ++ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ /* ++ * Toss any existing sparelist, as it doesn't have any validity ++ * anymore, and conflicts with spa_has_spare(). ++ */ ++ if (spa->spa_spares.sav_config) { ++ nvlist_free(spa->spa_spares.sav_config); ++ spa->spa_spares.sav_config = NULL; ++ spa_load_spares(spa); ++ } ++ if (spa->spa_l2cache.sav_config) { ++ nvlist_free(spa->spa_l2cache.sav_config); ++ spa->spa_l2cache.sav_config = NULL; ++ spa_load_l2cache(spa); ++ } ++ ++ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, ++ &nvroot) == 0); ++ if (error == 0) ++ error = spa_validate_aux(spa, nvroot, -1ULL, ++ VDEV_ALLOC_SPARE); ++ if (error == 0) ++ error = spa_validate_aux(spa, nvroot, -1ULL, ++ VDEV_ALLOC_L2CACHE); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ ++ if (props != NULL) ++ spa_configfile_set(spa, props, B_FALSE); ++ ++ if (error != 0 || (props && spa_writeable(spa) && ++ (error = spa_prop_set(spa, props)))) { ++ spa_unload(spa); ++ spa_deactivate(spa); ++ spa_remove(spa); ++ mutex_exit(&spa_namespace_lock); ++ return (error); ++ } ++ ++ spa_async_resume(spa); ++ ++ /* ++ * Override any spares and level 2 cache devices as specified by ++ * the user, as these may have correct device names/devids, etc. 
++ */ ++ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, ++ &spares, &nspares) == 0) { ++ if (spa->spa_spares.sav_config) ++ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ++ ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); ++ else ++ VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, ++ NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ++ ZPOOL_CONFIG_SPARES, spares, nspares) == 0); ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ spa_load_spares(spa); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ spa->spa_spares.sav_sync = B_TRUE; ++ } ++ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, ++ &l2cache, &nl2cache) == 0) { ++ if (spa->spa_l2cache.sav_config) ++ VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, ++ ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); ++ else ++ VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, ++ NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ++ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ spa_load_l2cache(spa); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ spa->spa_l2cache.sav_sync = B_TRUE; ++ } ++ ++ /* ++ * Check for any removed devices. ++ */ ++ if (spa->spa_autoreplace) { ++ spa_aux_check_removed(&spa->spa_spares); ++ spa_aux_check_removed(&spa->spa_l2cache); ++ } ++ ++ if (spa_writeable(spa)) { ++ /* ++ * Update the config cache to include the newly-imported pool. ++ */ ++ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); ++ } ++ ++ /* ++ * It's possible that the pool was expanded while it was exported. ++ * We kick off an async task to handle this for us. ++ */ ++ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); ++ ++ mutex_exit(&spa_namespace_lock); ++ spa_history_log_version(spa, LOG_POOL_IMPORT); ++ ++ return (0); ++} ++ ++nvlist_t * ++spa_tryimport(nvlist_t *tryconfig) ++{ ++ nvlist_t *config = NULL; ++ char *poolname; ++ spa_t *spa; ++ uint64_t state; ++ int error; ++ ++ if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) ++ return (NULL); ++ ++ if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) ++ return (NULL); ++ ++ /* ++ * Create and initialize the spa structure. ++ */ ++ mutex_enter(&spa_namespace_lock); ++ spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); ++ spa_activate(spa, FREAD); ++ ++ /* ++ * Pass off the heavy lifting to spa_load(). ++ * Pass TRUE for mosconfig because the user-supplied config ++ * is actually the one to trust when doing an import. ++ */ ++ error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); ++ ++ /* ++ * If 'tryconfig' was at least parsable, return the current config. ++ */ ++ if (spa->spa_root_vdev != NULL) { ++ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); ++ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, ++ poolname) == 0); ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, ++ state) == 0); ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, ++ spa->spa_uberblock.ub_timestamp) == 0); ++ ++ /* ++ * If the bootfs property exists on this pool then we ++ * copy it out so that external consumers can tell which ++ * pools are bootable. ++ */ ++ if ((!error || error == EEXIST) && spa->spa_bootfs) { ++ char *tmpname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); ++ ++ /* ++ * We have to play games with the name since the ++ * pool was opened as TRYIMPORT_NAME. 
++ */ ++ if (dsl_dsobj_to_dsname(spa_name(spa), ++ spa->spa_bootfs, tmpname) == 0) { ++ char *cp; ++ char *dsname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); ++ ++ cp = strchr(tmpname, '/'); ++ if (cp == NULL) { ++ (void) strlcpy(dsname, tmpname, ++ MAXPATHLEN); ++ } else { ++ (void) snprintf(dsname, MAXPATHLEN, ++ "%s/%s", poolname, ++cp); ++ } ++ VERIFY(nvlist_add_string(config, ++ ZPOOL_CONFIG_BOOTFS, dsname) == 0); ++ kmem_free(dsname, MAXPATHLEN); ++ } ++ kmem_free(tmpname, MAXPATHLEN); ++ } ++ ++ /* ++ * Add the list of hot spares and level 2 cache devices. ++ */ ++ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); ++ spa_add_spares(spa, config); ++ spa_add_l2cache(spa, config); ++ spa_config_exit(spa, SCL_CONFIG, FTAG); ++ } ++ ++ spa_unload(spa); ++ spa_deactivate(spa); ++ spa_remove(spa); ++ mutex_exit(&spa_namespace_lock); ++ ++ return (config); ++} ++ ++/* ++ * Pool export/destroy ++ * ++ * The act of destroying or exporting a pool is very simple. We make sure there ++ * is no more pending I/O and any references to the pool are gone. Then, we ++ * update the pool state and sync all the labels to disk, removing the ++ * configuration from the cache afterwards. If the 'hardforce' flag is set, then ++ * we don't sync the labels or remove the configuration cache. ++ */ ++static int ++spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, ++ boolean_t force, boolean_t hardforce) ++{ ++ spa_t *spa; ++ ++ if (oldconfig) ++ *oldconfig = NULL; ++ ++ if (!(spa_mode_global & FWRITE)) ++ return (EROFS); ++ ++ mutex_enter(&spa_namespace_lock); ++ if ((spa = spa_lookup(pool)) == NULL) { ++ mutex_exit(&spa_namespace_lock); ++ return (ENOENT); ++ } ++ ++ /* ++ * Put a hold on the pool, drop the namespace lock, stop async tasks, ++ * reacquire the namespace lock, and see if we can export. ++ */ ++ spa_open_ref(spa, FTAG); ++ mutex_exit(&spa_namespace_lock); ++ spa_async_suspend(spa); ++ mutex_enter(&spa_namespace_lock); ++ spa_close(spa, FTAG); ++ ++ /* ++ * The pool will be in core if it's openable, ++ * in which case we can modify its state. ++ */ ++ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { ++ /* ++ * Objsets may be open only because they're dirty, so we ++ * have to force it to sync before checking spa_refcnt. ++ */ ++ txg_wait_synced(spa->spa_dsl_pool, 0); ++ ++ /* ++ * A pool cannot be exported or destroyed if there are active ++ * references. If we are resetting a pool, allow references by ++ * fault injection handlers. ++ */ ++ if (!spa_refcount_zero(spa) || ++ (spa->spa_inject_ref != 0 && ++ new_state != POOL_STATE_UNINITIALIZED)) { ++ spa_async_resume(spa); ++ mutex_exit(&spa_namespace_lock); ++ return (EBUSY); ++ } ++ ++ /* ++ * A pool cannot be exported if it has an active shared spare. ++ * This is to prevent other pools stealing the active spare ++ * from an exported pool. At user's own will, such pool can ++ * be forcedly exported. ++ */ ++ if (!force && new_state == POOL_STATE_EXPORTED && ++ spa_has_active_shared_spare(spa)) { ++ spa_async_resume(spa); ++ mutex_exit(&spa_namespace_lock); ++ return (EXDEV); ++ } ++ ++ /* ++ * We want this to be reflected on every label, ++ * so mark them all dirty. spa_unload() will do the ++ * final sync that pushes these changes out. 
++ */ ++ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ spa->spa_state = new_state; ++ spa->spa_final_txg = spa_last_synced_txg(spa) + ++ TXG_DEFER_SIZE + 1; ++ vdev_config_dirty(spa->spa_root_vdev); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ } ++ } ++ ++ spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_DESTROY); ++ ++ if (spa->spa_state != POOL_STATE_UNINITIALIZED) { ++ spa_unload(spa); ++ spa_deactivate(spa); ++ } ++ ++ if (oldconfig && spa->spa_config) ++ VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); ++ ++ if (new_state != POOL_STATE_UNINITIALIZED) { ++ if (!hardforce) ++ spa_config_sync(spa, B_TRUE, B_TRUE); ++ spa_remove(spa); ++ } ++ mutex_exit(&spa_namespace_lock); ++ ++ return (0); ++} ++ ++/* ++ * Destroy a storage pool. ++ */ ++int ++spa_destroy(char *pool) ++{ ++ return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, ++ B_FALSE, B_FALSE)); ++} ++ ++/* ++ * Export a storage pool. ++ */ ++int ++spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, ++ boolean_t hardforce) ++{ ++ return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, ++ force, hardforce)); ++} ++ ++/* ++ * Similar to spa_export(), this unloads the spa_t without actually removing it ++ * from the namespace in any way. ++ */ ++int ++spa_reset(char *pool) ++{ ++ return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, ++ B_FALSE, B_FALSE)); ++} ++ ++/* ++ * ========================================================================== ++ * Device manipulation ++ * ========================================================================== ++ */ ++ ++/* ++ * Add a device to a storage pool. ++ */ ++int ++spa_vdev_add(spa_t *spa, nvlist_t *nvroot) ++{ ++ uint64_t txg, id; ++ int error; ++ vdev_t *rvd = spa->spa_root_vdev; ++ vdev_t *vd, *tvd; ++ nvlist_t **spares, **l2cache; ++ uint_t nspares, nl2cache; ++ int c; ++ ++ ASSERT(spa_writeable(spa)); ++ ++ txg = spa_vdev_enter(spa); ++ ++ if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, ++ VDEV_ALLOC_ADD)) != 0) ++ return (spa_vdev_exit(spa, NULL, txg, error)); ++ ++ spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ ++ ++ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, ++ &nspares) != 0) ++ nspares = 0; ++ ++ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, ++ &nl2cache) != 0) ++ nl2cache = 0; ++ ++ if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) ++ return (spa_vdev_exit(spa, vd, txg, EINVAL)); ++ ++ if (vd->vdev_children != 0 && ++ (error = vdev_create(vd, txg, B_FALSE)) != 0) ++ return (spa_vdev_exit(spa, vd, txg, error)); ++ ++ /* ++ * We must validate the spares and l2cache devices after checking the ++ * children. Otherwise, vdev_inuse() will blindly overwrite the spare. ++ */ ++ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) ++ return (spa_vdev_exit(spa, vd, txg, error)); ++ ++ /* ++ * Transfer each new top-level vdev from vd to rvd. ++ */ ++ for (c = 0; c < vd->vdev_children; c++) { ++ ++ /* ++ * Set the vdev id to the first hole, if one exists. 
++ */ ++ for (id = 0; id < rvd->vdev_children; id++) { ++ if (rvd->vdev_child[id]->vdev_ishole) { ++ vdev_free(rvd->vdev_child[id]); ++ break; ++ } ++ } ++ tvd = vd->vdev_child[c]; ++ vdev_remove_child(vd, tvd); ++ tvd->vdev_id = id; ++ vdev_add_child(rvd, tvd); ++ vdev_config_dirty(tvd); ++ } ++ ++ if (nspares != 0) { ++ spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, ++ ZPOOL_CONFIG_SPARES); ++ spa_load_spares(spa); ++ spa->spa_spares.sav_sync = B_TRUE; ++ } ++ ++ if (nl2cache != 0) { ++ spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, ++ ZPOOL_CONFIG_L2CACHE); ++ spa_load_l2cache(spa); ++ spa->spa_l2cache.sav_sync = B_TRUE; ++ } ++ ++ /* ++ * We have to be careful when adding new vdevs to an existing pool. ++ * If other threads start allocating from these vdevs before we ++ * sync the config cache, and we lose power, then upon reboot we may ++ * fail to open the pool because there are DVAs that the config cache ++ * can't translate. Therefore, we first add the vdevs without ++ * initializing metaslabs; sync the config cache (via spa_vdev_exit()); ++ * and then let spa_config_update() initialize the new metaslabs. ++ * ++ * spa_load() checks for added-but-not-initialized vdevs, so that ++ * if we lose power at any point in this sequence, the remaining ++ * steps will be completed the next time we load the pool. ++ */ ++ (void) spa_vdev_exit(spa, vd, txg, 0); ++ ++ mutex_enter(&spa_namespace_lock); ++ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); ++ mutex_exit(&spa_namespace_lock); ++ ++ return (0); ++} ++ ++/* ++ * Attach a device to a mirror. The arguments are the path to any device ++ * in the mirror, and the nvroot for the new device. If the path specifies ++ * a device that is not mirrored, we automatically insert the mirror vdev. ++ * ++ * If 'replacing' is specified, the new device is intended to replace the ++ * existing device; in this case the two devices are made into their own ++ * mirror using the 'replacing' vdev, which is functionally identical to ++ * the mirror vdev (it actually reuses all the same ops) but has a few ++ * extra rules: you can't attach to it after it's been created, and upon ++ * completion of resilvering, the first disk (the one being replaced) ++ * is automatically detached. 
++ */ ++int ++spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) ++{ ++ uint64_t txg, dtl_max_txg; ++ ASSERTV(vdev_t *rvd = spa->spa_root_vdev;) ++ vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; ++ vdev_ops_t *pvops; ++ char *oldvdpath, *newvdpath; ++ int newvd_isspare; ++ int error; ++ ++ ASSERT(spa_writeable(spa)); ++ ++ txg = spa_vdev_enter(spa); ++ ++ oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); ++ ++ if (oldvd == NULL) ++ return (spa_vdev_exit(spa, NULL, txg, ENODEV)); ++ ++ if (!oldvd->vdev_ops->vdev_op_leaf) ++ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ++ ++ pvd = oldvd->vdev_parent; ++ ++ if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, ++ VDEV_ALLOC_ATTACH)) != 0) ++ return (spa_vdev_exit(spa, NULL, txg, EINVAL)); ++ ++ if (newrootvd->vdev_children != 1) ++ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); ++ ++ newvd = newrootvd->vdev_child[0]; ++ ++ if (!newvd->vdev_ops->vdev_op_leaf) ++ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); ++ ++ if ((error = vdev_create(newrootvd, txg, replacing)) != 0) ++ return (spa_vdev_exit(spa, newrootvd, txg, error)); ++ ++ /* ++ * Spares can't replace logs ++ */ ++ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) ++ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); ++ ++ if (!replacing) { ++ /* ++ * For attach, the only allowable parent is a mirror or the root ++ * vdev. ++ */ ++ if (pvd->vdev_ops != &vdev_mirror_ops && ++ pvd->vdev_ops != &vdev_root_ops) ++ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); ++ ++ pvops = &vdev_mirror_ops; ++ } else { ++ /* ++ * Active hot spares can only be replaced by inactive hot ++ * spares. ++ */ ++ if (pvd->vdev_ops == &vdev_spare_ops && ++ oldvd->vdev_isspare && ++ !spa_has_spare(spa, newvd->vdev_guid)) ++ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); ++ ++ /* ++ * If the source is a hot spare, and the parent isn't already a ++ * spare, then we want to create a new hot spare. Otherwise, we ++ * want to create a replacing vdev. The user is not allowed to ++ * attach to a spared vdev child unless the 'isspare' state is ++ * the same (spare replaces spare, non-spare replaces ++ * non-spare). ++ */ ++ if (pvd->vdev_ops == &vdev_replacing_ops && ++ spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { ++ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); ++ } else if (pvd->vdev_ops == &vdev_spare_ops && ++ newvd->vdev_isspare != oldvd->vdev_isspare) { ++ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); ++ } ++ ++ if (newvd->vdev_isspare) ++ pvops = &vdev_spare_ops; ++ else ++ pvops = &vdev_replacing_ops; ++ } ++ ++ /* ++ * Make sure the new device is big enough. ++ */ ++ if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) ++ return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); ++ ++ /* ++ * The new device cannot have a higher alignment requirement ++ * than the top-level vdev. ++ */ ++ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) ++ return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); ++ ++ /* ++ * If this is an in-place replacement, update oldvd's path and devid ++ * to make it distinguishable from newvd, and unopenable from now on. 
++ */ ++ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { ++ spa_strfree(oldvd->vdev_path); ++ oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, ++ KM_PUSHPAGE); ++ (void) sprintf(oldvd->vdev_path, "%s/%s", ++ newvd->vdev_path, "old"); ++ if (oldvd->vdev_devid != NULL) { ++ spa_strfree(oldvd->vdev_devid); ++ oldvd->vdev_devid = NULL; ++ } ++ } ++ ++ /* mark the device being resilvered */ ++ newvd->vdev_resilvering = B_TRUE; ++ ++ /* ++ * If the parent is not a mirror, or if we're replacing, insert the new ++ * mirror/replacing/spare vdev above oldvd. ++ */ ++ if (pvd->vdev_ops != pvops) ++ pvd = vdev_add_parent(oldvd, pvops); ++ ++ ASSERT(pvd->vdev_top->vdev_parent == rvd); ++ ASSERT(pvd->vdev_ops == pvops); ++ ASSERT(oldvd->vdev_parent == pvd); ++ ++ /* ++ * Extract the new device from its root and add it to pvd. ++ */ ++ vdev_remove_child(newrootvd, newvd); ++ newvd->vdev_id = pvd->vdev_children; ++ newvd->vdev_crtxg = oldvd->vdev_crtxg; ++ vdev_add_child(pvd, newvd); ++ ++ tvd = newvd->vdev_top; ++ ASSERT(pvd->vdev_top == tvd); ++ ASSERT(tvd->vdev_parent == rvd); ++ ++ vdev_config_dirty(tvd); ++ ++ /* ++ * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account ++ * for any dmu_sync-ed blocks. It will propagate upward when ++ * spa_vdev_exit() calls vdev_dtl_reassess(). ++ */ ++ dtl_max_txg = txg + TXG_CONCURRENT_STATES; ++ ++ vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, ++ dtl_max_txg - TXG_INITIAL); ++ ++ if (newvd->vdev_isspare) { ++ spa_spare_activate(newvd); ++ spa_event_notify(spa, newvd, FM_EREPORT_ZFS_DEVICE_SPARE); ++ } ++ ++ oldvdpath = spa_strdup(oldvd->vdev_path); ++ newvdpath = spa_strdup(newvd->vdev_path); ++ newvd_isspare = newvd->vdev_isspare; ++ ++ /* ++ * Mark newvd's DTL dirty in this txg. ++ */ ++ vdev_dirty(tvd, VDD_DTL, newvd, txg); ++ ++ /* ++ * Restart the resilver ++ */ ++ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); ++ ++ /* ++ * Commit the config ++ */ ++ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); ++ ++ spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, ++ "%s vdev=%s %s vdev=%s", ++ replacing && newvd_isspare ? "spare in" : ++ replacing ? "replace" : "attach", newvdpath, ++ replacing ? "for" : "to", oldvdpath); ++ ++ spa_strfree(oldvdpath); ++ spa_strfree(newvdpath); ++ ++ if (spa->spa_bootfs) ++ spa_event_notify(spa, newvd, FM_EREPORT_ZFS_BOOTFS_VDEV_ATTACH); ++ ++ return (0); ++} ++ ++/* ++ * Detach a device from a mirror or replacing vdev. ++ * If 'replace_done' is specified, only detach if the parent ++ * is a replacing vdev. ++ */ ++int ++spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) ++{ ++ uint64_t txg; ++ int error; ++ ASSERTV(vdev_t *rvd = spa->spa_root_vdev;) ++ vdev_t *vd, *pvd, *cvd, *tvd; ++ boolean_t unspare = B_FALSE; ++ uint64_t unspare_guid = 0; ++ char *vdpath; ++ int c, t; ++ ++ ASSERT(spa_writeable(spa)); ++ ++ txg = spa_vdev_enter(spa); ++ ++ vd = spa_lookup_by_guid(spa, guid, B_FALSE); ++ ++ if (vd == NULL) ++ return (spa_vdev_exit(spa, NULL, txg, ENODEV)); ++ ++ if (!vd->vdev_ops->vdev_op_leaf) ++ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ++ ++ pvd = vd->vdev_parent; ++ ++ /* ++ * If the parent/child relationship is not as expected, don't do it. ++ * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing ++ * vdev that's replacing B with C. The user's intent in replacing ++ * is to go from M(A,B) to M(A,C). If the user decides to cancel ++ * the replace by detaching C, the expected behavior is to end up ++ * M(A,B). 
But suppose that right after deciding to detach C, ++ * the replacement of B completes. We would have M(A,C), and then ++ * ask to detach C, which would leave us with just A -- not what ++ * the user wanted. To prevent this, we make sure that the ++ * parent/child relationship hasn't changed -- in this example, ++ * that C's parent is still the replacing vdev R. ++ */ ++ if (pvd->vdev_guid != pguid && pguid != 0) ++ return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ++ ++ /* ++ * Only 'replacing' or 'spare' vdevs can be replaced. ++ */ ++ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && ++ pvd->vdev_ops != &vdev_spare_ops) ++ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ++ ++ ASSERT(pvd->vdev_ops != &vdev_spare_ops || ++ spa_version(spa) >= SPA_VERSION_SPARES); ++ ++ /* ++ * Only mirror, replacing, and spare vdevs support detach. ++ */ ++ if (pvd->vdev_ops != &vdev_replacing_ops && ++ pvd->vdev_ops != &vdev_mirror_ops && ++ pvd->vdev_ops != &vdev_spare_ops) ++ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ++ ++ /* ++ * If this device has the only valid copy of some data, ++ * we cannot safely detach it. ++ */ ++ if (vdev_dtl_required(vd)) ++ return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ++ ++ ASSERT(pvd->vdev_children >= 2); ++ ++ /* ++ * If we are detaching the second disk from a replacing vdev, then ++ * check to see if we changed the original vdev's path to have "/old" ++ * at the end in spa_vdev_attach(). If so, undo that change now. ++ */ ++ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && ++ vd->vdev_path != NULL) { ++ size_t len = strlen(vd->vdev_path); ++ ++ for (c = 0; c < pvd->vdev_children; c++) { ++ cvd = pvd->vdev_child[c]; ++ ++ if (cvd == vd || cvd->vdev_path == NULL) ++ continue; ++ ++ if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && ++ strcmp(cvd->vdev_path + len, "/old") == 0) { ++ spa_strfree(cvd->vdev_path); ++ cvd->vdev_path = spa_strdup(vd->vdev_path); ++ break; ++ } ++ } ++ } ++ ++ /* ++ * If we are detaching the original disk from a spare, then it implies ++ * that the spare should become a real disk, and be removed from the ++ * active spare list for the pool. ++ */ ++ if (pvd->vdev_ops == &vdev_spare_ops && ++ vd->vdev_id == 0 && ++ pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) ++ unspare = B_TRUE; ++ ++ /* ++ * Erase the disk labels so the disk can be used for other things. ++ * This must be done after all other error cases are handled, ++ * but before we disembowel vd (so we can still do I/O to it). ++ * But if we can't do it, don't treat the error as fatal -- ++ * it may be that the unwritability of the disk is the reason ++ * it's being detached! ++ */ ++ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); ++ ++ /* ++ * Remove vd from its parent and compact the parent's children. ++ */ ++ vdev_remove_child(pvd, vd); ++ vdev_compact_children(pvd); ++ ++ /* ++ * Remember one of the remaining children so we can get tvd below. ++ */ ++ cvd = pvd->vdev_child[pvd->vdev_children - 1]; ++ ++ /* ++ * If we need to remove the remaining child from the list of hot spares, ++ * do it now, marking the vdev as no longer a spare in the process. ++ * We must do this before vdev_remove_parent(), because that can ++ * change the GUID if it creates a new toplevel GUID. For a similar ++ * reason, we must remove the spare now, in the same txg as the detach; ++ * otherwise someone could attach a new sibling, change the GUID, and ++ * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 
++ */ ++ if (unspare) { ++ ASSERT(cvd->vdev_isspare); ++ spa_spare_remove(cvd); ++ unspare_guid = cvd->vdev_guid; ++ (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); ++ cvd->vdev_unspare = B_TRUE; ++ } ++ ++ /* ++ * If the parent mirror/replacing vdev only has one child, ++ * the parent is no longer needed. Remove it from the tree. ++ */ ++ if (pvd->vdev_children == 1) { ++ if (pvd->vdev_ops == &vdev_spare_ops) ++ cvd->vdev_unspare = B_FALSE; ++ vdev_remove_parent(cvd); ++ cvd->vdev_resilvering = B_FALSE; ++ } ++ ++ ++ /* ++ * We don't set tvd until now because the parent we just removed ++ * may have been the previous top-level vdev. ++ */ ++ tvd = cvd->vdev_top; ++ ASSERT(tvd->vdev_parent == rvd); ++ ++ /* ++ * Reevaluate the parent vdev state. ++ */ ++ vdev_propagate_state(cvd); ++ ++ /* ++ * If the 'autoexpand' property is set on the pool then automatically ++ * try to expand the size of the pool. For example if the device we ++ * just detached was smaller than the others, it may be possible to ++ * add metaslabs (i.e. grow the pool). We need to reopen the vdev ++ * first so that we can obtain the updated sizes of the leaf vdevs. ++ */ ++ if (spa->spa_autoexpand) { ++ vdev_reopen(tvd); ++ vdev_expand(tvd, txg); ++ } ++ ++ vdev_config_dirty(tvd); ++ ++ /* ++ * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that ++ * vd->vdev_detached is set and free vd's DTL object in syncing context. ++ * But first make sure we're not on any *other* txg's DTL list, to ++ * prevent vd from being accessed after it's freed. ++ */ ++ vdpath = spa_strdup(vd->vdev_path); ++ for (t = 0; t < TXG_SIZE; t++) ++ (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); ++ vd->vdev_detached = B_TRUE; ++ vdev_dirty(tvd, VDD_DTL, vd, txg); ++ ++ spa_event_notify(spa, vd, FM_EREPORT_ZFS_DEVICE_REMOVE); ++ ++ /* hang on to the spa before we release the lock */ ++ spa_open_ref(spa, FTAG); ++ ++ error = spa_vdev_exit(spa, vd, txg, 0); ++ ++ spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, ++ "vdev=%s", vdpath); ++ spa_strfree(vdpath); ++ ++ /* ++ * If this was the removal of the original device in a hot spare vdev, ++ * then we want to go through and remove the device from the hot spare ++ * list of every other pool. ++ */ ++ if (unspare) { ++ spa_t *altspa = NULL; ++ ++ mutex_enter(&spa_namespace_lock); ++ while ((altspa = spa_next(altspa)) != NULL) { ++ if (altspa->spa_state != POOL_STATE_ACTIVE || ++ altspa == spa) ++ continue; ++ ++ spa_open_ref(altspa, FTAG); ++ mutex_exit(&spa_namespace_lock); ++ (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); ++ mutex_enter(&spa_namespace_lock); ++ spa_close(altspa, FTAG); ++ } ++ mutex_exit(&spa_namespace_lock); ++ ++ /* search the rest of the vdevs for spares to remove */ ++ spa_vdev_resilver_done(spa); ++ } ++ ++ /* all done with the spa; OK to release */ ++ mutex_enter(&spa_namespace_lock); ++ spa_close(spa, FTAG); ++ mutex_exit(&spa_namespace_lock); ++ ++ return (error); ++} ++ ++/* ++ * Split a set of devices from their mirrors, and create a new pool from them. 
++ */ ++int ++spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, ++ nvlist_t *props, boolean_t exp) ++{ ++ int error = 0; ++ uint64_t txg, *glist; ++ spa_t *newspa; ++ uint_t c, children, lastlog; ++ nvlist_t **child, *nvl, *tmp; ++ dmu_tx_t *tx; ++ char *altroot = NULL; ++ vdev_t *rvd, **vml = NULL; /* vdev modify list */ ++ boolean_t activate_slog; ++ ++ ASSERT(spa_writeable(spa)); ++ ++ txg = spa_vdev_enter(spa); ++ ++ /* clear the log and flush everything up to now */ ++ activate_slog = spa_passivate_log(spa); ++ (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); ++ error = spa_offline_log(spa); ++ txg = spa_vdev_config_enter(spa); ++ ++ if (activate_slog) ++ spa_activate_log(spa); ++ ++ if (error != 0) ++ return (spa_vdev_exit(spa, NULL, txg, error)); ++ ++ /* check new spa name before going any further */ ++ if (spa_lookup(newname) != NULL) ++ return (spa_vdev_exit(spa, NULL, txg, EEXIST)); ++ ++ /* ++ * scan through all the children to ensure they're all mirrors ++ */ ++ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || ++ nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, ++ &children) != 0) ++ return (spa_vdev_exit(spa, NULL, txg, EINVAL)); ++ ++ /* first, check to ensure we've got the right child count */ ++ rvd = spa->spa_root_vdev; ++ lastlog = 0; ++ for (c = 0; c < rvd->vdev_children; c++) { ++ vdev_t *vd = rvd->vdev_child[c]; ++ ++ /* don't count the holes & logs as children */ ++ if (vd->vdev_islog || vd->vdev_ishole) { ++ if (lastlog == 0) ++ lastlog = c; ++ continue; ++ } ++ ++ lastlog = 0; ++ } ++ if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) ++ return (spa_vdev_exit(spa, NULL, txg, EINVAL)); ++ ++ /* next, ensure no spare or cache devices are part of the split */ ++ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || ++ nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) ++ return (spa_vdev_exit(spa, NULL, txg, EINVAL)); ++ ++ vml = kmem_zalloc(children * sizeof (vdev_t *), KM_PUSHPAGE); ++ glist = kmem_zalloc(children * sizeof (uint64_t), KM_PUSHPAGE); ++ ++ /* then, loop over each vdev and validate it */ ++ for (c = 0; c < children; c++) { ++ uint64_t is_hole = 0; ++ ++ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, ++ &is_hole); ++ ++ if (is_hole != 0) { ++ if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || ++ spa->spa_root_vdev->vdev_child[c]->vdev_islog) { ++ continue; ++ } else { ++ error = EINVAL; ++ break; ++ } ++ } ++ ++ /* which disk is going to be split? 
*/ ++ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, ++ &glist[c]) != 0) { ++ error = EINVAL; ++ break; ++ } ++ ++ /* look it up in the spa */ ++ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); ++ if (vml[c] == NULL) { ++ error = ENODEV; ++ break; ++ } ++ ++ /* make sure there's nothing stopping the split */ ++ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || ++ vml[c]->vdev_islog || ++ vml[c]->vdev_ishole || ++ vml[c]->vdev_isspare || ++ vml[c]->vdev_isl2cache || ++ !vdev_writeable(vml[c]) || ++ vml[c]->vdev_children != 0 || ++ vml[c]->vdev_state != VDEV_STATE_HEALTHY || ++ c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { ++ error = EINVAL; ++ break; ++ } ++ ++ if (vdev_dtl_required(vml[c])) { ++ error = EBUSY; ++ break; ++ } ++ ++ /* we need certain info from the top level */ ++ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, ++ vml[c]->vdev_top->vdev_ms_array) == 0); ++ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, ++ vml[c]->vdev_top->vdev_ms_shift) == 0); ++ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, ++ vml[c]->vdev_top->vdev_asize) == 0); ++ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, ++ vml[c]->vdev_top->vdev_ashift) == 0); ++ } ++ ++ if (error != 0) { ++ kmem_free(vml, children * sizeof (vdev_t *)); ++ kmem_free(glist, children * sizeof (uint64_t)); ++ return (spa_vdev_exit(spa, NULL, txg, error)); ++ } ++ ++ /* stop writers from using the disks */ ++ for (c = 0; c < children; c++) { ++ if (vml[c] != NULL) ++ vml[c]->vdev_offline = B_TRUE; ++ } ++ vdev_reopen(spa->spa_root_vdev); ++ ++ /* ++ * Temporarily record the splitting vdevs in the spa config. This ++ * will disappear once the config is regenerated. ++ */ ++ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, ++ glist, children) == 0); ++ kmem_free(glist, children * sizeof (uint64_t)); ++ ++ mutex_enter(&spa->spa_props_lock); ++ VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, ++ nvl) == 0); ++ mutex_exit(&spa->spa_props_lock); ++ spa->spa_config_splitting = nvl; ++ vdev_config_dirty(spa->spa_root_vdev); ++ ++ /* configure and create the new pool */ ++ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, ++ exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, ++ spa_version(spa)) == 0); ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, ++ spa->spa_config_txg) == 0); ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, ++ spa_generate_guid(NULL)) == 0); ++ (void) nvlist_lookup_string(props, ++ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); ++ ++ /* add the new pool to the namespace */ ++ newspa = spa_add(newname, config, altroot); ++ newspa->spa_config_txg = spa->spa_config_txg; ++ spa_set_log_state(newspa, SPA_LOG_CLEAR); ++ ++ /* release the spa config lock, retaining the namespace lock */ ++ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); ++ ++ if (zio_injection_enabled) ++ zio_handle_panic_injection(spa, FTAG, 1); ++ ++ spa_activate(newspa, spa_mode_global); ++ spa_async_suspend(newspa); ++ ++ /* create the new pool from the disks of the original pool */ ++ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); ++ if (error) ++ goto out; ++ ++ /* if that worked, generate a real config for the new pool */ ++ if (newspa->spa_root_vdev != NULL) { ++ VERIFY(nvlist_alloc(&newspa->spa_config_splitting, ++ NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, ++ ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); ++ spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, ++ B_TRUE)); ++ } ++ ++ /* set the props */ ++ if (props != NULL) { ++ spa_configfile_set(newspa, props, B_FALSE); ++ error = spa_prop_set(newspa, props); ++ if (error) ++ goto out; ++ } ++ ++ /* flush everything */ ++ txg = spa_vdev_config_enter(newspa); ++ vdev_config_dirty(newspa->spa_root_vdev); ++ (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); ++ ++ if (zio_injection_enabled) ++ zio_handle_panic_injection(spa, FTAG, 2); ++ ++ spa_async_resume(newspa); ++ ++ /* finally, update the original pool's config */ ++ txg = spa_vdev_config_enter(spa); ++ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); ++ error = dmu_tx_assign(tx, TXG_WAIT); ++ if (error != 0) ++ dmu_tx_abort(tx); ++ for (c = 0; c < children; c++) { ++ if (vml[c] != NULL) { ++ vdev_split(vml[c]); ++ if (error == 0) ++ spa_history_log_internal(LOG_POOL_VDEV_DETACH, ++ spa, tx, "vdev=%s", ++ vml[c]->vdev_path); ++ vdev_free(vml[c]); ++ } ++ } ++ vdev_config_dirty(spa->spa_root_vdev); ++ spa->spa_config_splitting = NULL; ++ nvlist_free(nvl); ++ if (error == 0) ++ dmu_tx_commit(tx); ++ (void) spa_vdev_exit(spa, NULL, txg, 0); ++ ++ if (zio_injection_enabled) ++ zio_handle_panic_injection(spa, FTAG, 3); ++ ++ /* split is complete; log a history record */ ++ spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, ++ "split new pool %s from pool %s", newname, spa_name(spa)); ++ ++ kmem_free(vml, children * sizeof (vdev_t *)); ++ ++ /* if we're not going to mount the filesystems in userland, export */ ++ if (exp) ++ error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, ++ B_FALSE, B_FALSE); ++ ++ return (error); ++ ++out: ++ spa_unload(newspa); ++ spa_deactivate(newspa); ++ spa_remove(newspa); ++ ++ txg = spa_vdev_config_enter(spa); ++ ++ /* re-online all offlined disks */ ++ for (c = 0; c < children; c++) { ++ if (vml[c] != NULL) ++ vml[c]->vdev_offline = B_FALSE; ++ } ++ vdev_reopen(spa->spa_root_vdev); ++ ++ nvlist_free(spa->spa_config_splitting); ++ spa->spa_config_splitting = NULL; ++ (void) spa_vdev_exit(spa, NULL, txg, error); ++ ++ kmem_free(vml, children * sizeof (vdev_t *)); ++ return (error); ++} ++ ++static 
nvlist_t * ++spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) ++{ ++ int i; ++ ++ for (i = 0; i < count; i++) { ++ uint64_t guid; ++ ++ VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, ++ &guid) == 0); ++ ++ if (guid == target_guid) ++ return (nvpp[i]); ++ } ++ ++ return (NULL); ++} ++ ++static void ++spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, ++ nvlist_t *dev_to_remove) ++{ ++ nvlist_t **newdev = NULL; ++ int i, j; ++ ++ if (count > 1) ++ newdev = kmem_alloc((count - 1) * sizeof (void *), KM_PUSHPAGE); ++ ++ for (i = 0, j = 0; i < count; i++) { ++ if (dev[i] == dev_to_remove) ++ continue; ++ VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_PUSHPAGE) == 0); ++ } ++ ++ VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); ++ VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); ++ ++ for (i = 0; i < count - 1; i++) ++ nvlist_free(newdev[i]); ++ ++ if (count > 1) ++ kmem_free(newdev, (count - 1) * sizeof (void *)); ++} ++ ++/* ++ * Evacuate the device. ++ */ ++static int ++spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) ++{ ++ uint64_t txg; ++ int error = 0; ++ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); ++ ASSERT(vd == vd->vdev_top); ++ ++ /* ++ * Evacuate the device. We don't hold the config lock as writer ++ * since we need to do I/O but we do keep the ++ * spa_namespace_lock held. Once this completes the device ++ * should no longer have any blocks allocated on it. ++ */ ++ if (vd->vdev_islog) { ++ if (vd->vdev_stat.vs_alloc != 0) ++ error = spa_offline_log(spa); ++ } else { ++ error = ENOTSUP; ++ } ++ ++ if (error) ++ return (error); ++ ++ /* ++ * The evacuation succeeded. Remove any remaining MOS metadata ++ * associated with this vdev, and wait for these changes to sync. ++ */ ++ ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); ++ txg = spa_vdev_config_enter(spa); ++ vd->vdev_removing = B_TRUE; ++ vdev_dirty(vd, 0, NULL, txg); ++ vdev_config_dirty(vd); ++ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); ++ ++ return (0); ++} ++ ++/* ++ * Complete the removal by cleaning up the namespace. ++ */ ++static void ++spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) ++{ ++ vdev_t *rvd = spa->spa_root_vdev; ++ uint64_t id = vd->vdev_id; ++ boolean_t last_vdev = (id == (rvd->vdev_children - 1)); ++ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ++ ASSERT(vd == vd->vdev_top); ++ ++ /* ++ * Only remove any devices which are empty. ++ */ ++ if (vd->vdev_stat.vs_alloc != 0) ++ return; ++ ++ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); ++ ++ if (list_link_active(&vd->vdev_state_dirty_node)) ++ vdev_state_clean(vd); ++ if (list_link_active(&vd->vdev_config_dirty_node)) ++ vdev_config_clean(vd); ++ ++ vdev_free(vd); ++ ++ if (last_vdev) { ++ vdev_compact_children(rvd); ++ } else { ++ vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); ++ vdev_add_child(rvd, vd); ++ } ++ vdev_config_dirty(rvd); ++ ++ /* ++ * Reassess the health of our root vdev. ++ */ ++ vdev_reopen(rvd); ++} ++ ++/* ++ * Remove a device from the pool - ++ * ++ * Removing a device from the vdev namespace requires several steps ++ * and can take a significant amount of time. As a result we use ++ * the spa_vdev_config_[enter/exit] functions which allow us to ++ * grab and release the spa_config_lock while still holding the namespace ++ * lock. During each step the configuration is synced out. 
++ */ ++ ++/* ++ * Remove a device from the pool. Currently, this supports removing only hot ++ * spares, slogs, and level 2 ARC devices. ++ */ ++int ++spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) ++{ ++ vdev_t *vd; ++ metaslab_group_t *mg; ++ nvlist_t **spares, **l2cache, *nv; ++ uint64_t txg = 0; ++ uint_t nspares, nl2cache; ++ int error = 0; ++ boolean_t locked = MUTEX_HELD(&spa_namespace_lock); ++ ++ ASSERT(spa_writeable(spa)); ++ ++ if (!locked) ++ txg = spa_vdev_enter(spa); ++ ++ vd = spa_lookup_by_guid(spa, guid, B_FALSE); ++ ++ if (spa->spa_spares.sav_vdevs != NULL && ++ nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ++ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && ++ (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { ++ /* ++ * Only remove the hot spare if it's not currently in use ++ * in this pool. ++ */ ++ if (vd == NULL || unspare) { ++ spa_vdev_remove_aux(spa->spa_spares.sav_config, ++ ZPOOL_CONFIG_SPARES, spares, nspares, nv); ++ spa_load_spares(spa); ++ spa->spa_spares.sav_sync = B_TRUE; ++ } else { ++ error = EBUSY; ++ } ++ } else if (spa->spa_l2cache.sav_vdevs != NULL && ++ nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ++ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && ++ (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { ++ /* ++ * Cache devices can always be removed. ++ */ ++ spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ++ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); ++ spa_load_l2cache(spa); ++ spa->spa_l2cache.sav_sync = B_TRUE; ++ } else if (vd != NULL && vd->vdev_islog) { ++ ASSERT(!locked); ++ ASSERT(vd == vd->vdev_top); ++ ++ /* ++ * XXX - Once we have bp-rewrite this should ++ * become the common case. ++ */ ++ ++ mg = vd->vdev_mg; ++ ++ /* ++ * Stop allocating from this vdev. ++ */ ++ metaslab_group_passivate(mg); ++ ++ /* ++ * Wait for the youngest allocations and frees to sync, ++ * and then wait for the deferral of those frees to finish. ++ */ ++ spa_vdev_config_exit(spa, NULL, ++ txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); ++ ++ /* ++ * Attempt to evacuate the vdev. ++ */ ++ error = spa_vdev_remove_evacuate(spa, vd); ++ ++ txg = spa_vdev_config_enter(spa); ++ ++ /* ++ * If we couldn't evacuate the vdev, unwind. ++ */ ++ if (error) { ++ metaslab_group_activate(mg); ++ return (spa_vdev_exit(spa, NULL, txg, error)); ++ } ++ ++ /* ++ * Clean up the vdev namespace. ++ */ ++ spa_vdev_remove_from_namespace(spa, vd); ++ ++ } else if (vd != NULL) { ++ /* ++ * Normal vdevs cannot be removed (yet). ++ */ ++ error = ENOTSUP; ++ } else { ++ /* ++ * There is no vdev of any kind with the specified guid. ++ */ ++ error = ENOENT; ++ } ++ ++ if (!locked) ++ return (spa_vdev_exit(spa, NULL, txg, error)); ++ ++ return (error); ++} ++ ++/* ++ * Find any device that's done replacing, or a vdev marked 'unspare' that's ++ * current spared, so we can detach it. ++ */ ++static vdev_t * ++spa_vdev_resilver_done_hunt(vdev_t *vd) ++{ ++ vdev_t *newvd, *oldvd; ++ int c; ++ ++ for (c = 0; c < vd->vdev_children; c++) { ++ oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); ++ if (oldvd != NULL) ++ return (oldvd); ++ } ++ ++ /* ++ * Check for a completed replacement. We always consider the first ++ * vdev in the list to be the oldest vdev, and the last one to be ++ * the newest (see spa_vdev_attach() for how that works). In ++ * the case where the newest vdev is faulted, we will not automatically ++ * remove it after a resilver completes. 
This is OK as it will require ++ * user intervention to determine which disk the admin wishes to keep. ++ */ ++ if (vd->vdev_ops == &vdev_replacing_ops) { ++ ASSERT(vd->vdev_children > 1); ++ ++ newvd = vd->vdev_child[vd->vdev_children - 1]; ++ oldvd = vd->vdev_child[0]; ++ ++ if (vdev_dtl_empty(newvd, DTL_MISSING) && ++ vdev_dtl_empty(newvd, DTL_OUTAGE) && ++ !vdev_dtl_required(oldvd)) ++ return (oldvd); ++ } ++ ++ /* ++ * Check for a completed resilver with the 'unspare' flag set. ++ */ ++ if (vd->vdev_ops == &vdev_spare_ops) { ++ vdev_t *first = vd->vdev_child[0]; ++ vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; ++ ++ if (last->vdev_unspare) { ++ oldvd = first; ++ newvd = last; ++ } else if (first->vdev_unspare) { ++ oldvd = last; ++ newvd = first; ++ } else { ++ oldvd = NULL; ++ } ++ ++ if (oldvd != NULL && ++ vdev_dtl_empty(newvd, DTL_MISSING) && ++ vdev_dtl_empty(newvd, DTL_OUTAGE) && ++ !vdev_dtl_required(oldvd)) ++ return (oldvd); ++ ++ /* ++ * If there are more than two spares attached to a disk, ++ * and those spares are not required, then we want to ++ * attempt to free them up now so that they can be used ++ * by other pools. Once we're back down to a single ++ * disk+spare, we stop removing them. ++ */ ++ if (vd->vdev_children > 2) { ++ newvd = vd->vdev_child[1]; ++ ++ if (newvd->vdev_isspare && last->vdev_isspare && ++ vdev_dtl_empty(last, DTL_MISSING) && ++ vdev_dtl_empty(last, DTL_OUTAGE) && ++ !vdev_dtl_required(newvd)) ++ return (newvd); ++ } ++ } ++ ++ return (NULL); ++} ++ ++static void ++spa_vdev_resilver_done(spa_t *spa) ++{ ++ vdev_t *vd, *pvd, *ppvd; ++ uint64_t guid, sguid, pguid, ppguid; ++ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ ++ while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { ++ pvd = vd->vdev_parent; ++ ppvd = pvd->vdev_parent; ++ guid = vd->vdev_guid; ++ pguid = pvd->vdev_guid; ++ ppguid = ppvd->vdev_guid; ++ sguid = 0; ++ /* ++ * If we have just finished replacing a hot spared device, then ++ * we need to detach the parent's first child (the original hot ++ * spare) as well. ++ */ ++ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && ++ ppvd->vdev_children == 2) { ++ ASSERT(pvd->vdev_ops == &vdev_replacing_ops); ++ sguid = ppvd->vdev_child[1]->vdev_guid; ++ } ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) ++ return; ++ if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) ++ return; ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ } ++ ++ spa_config_exit(spa, SCL_ALL, FTAG); ++} ++ ++/* ++ * Update the stored path or FRU for this vdev. ++ */ ++int ++spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, ++ boolean_t ispath) ++{ ++ vdev_t *vd; ++ boolean_t sync = B_FALSE; ++ ++ ASSERT(spa_writeable(spa)); ++ ++ spa_vdev_state_enter(spa, SCL_ALL); ++ ++ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) ++ return (spa_vdev_state_exit(spa, NULL, ENOENT)); ++ ++ if (!vd->vdev_ops->vdev_op_leaf) ++ return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); ++ ++ if (ispath) { ++ if (strcmp(value, vd->vdev_path) != 0) { ++ spa_strfree(vd->vdev_path); ++ vd->vdev_path = spa_strdup(value); ++ sync = B_TRUE; ++ } ++ } else { ++ if (vd->vdev_fru == NULL) { ++ vd->vdev_fru = spa_strdup(value); ++ sync = B_TRUE; ++ } else if (strcmp(value, vd->vdev_fru) != 0) { ++ spa_strfree(vd->vdev_fru); ++ vd->vdev_fru = spa_strdup(value); ++ sync = B_TRUE; ++ } ++ } ++ ++ return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); ++} ++ ++int ++spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) ++{ ++ return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); ++} ++ ++int ++spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) ++{ ++ return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); ++} ++ ++/* ++ * ========================================================================== ++ * SPA Scanning ++ * ========================================================================== ++ */ ++ ++int ++spa_scan_stop(spa_t *spa) ++{ ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); ++ if (dsl_scan_resilvering(spa->spa_dsl_pool)) ++ return (EBUSY); ++ return (dsl_scan_cancel(spa->spa_dsl_pool)); ++} ++ ++int ++spa_scan(spa_t *spa, pool_scan_func_t func) ++{ ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); ++ ++ if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) ++ return (ENOTSUP); ++ ++ /* ++ * If a resilver was requested, but there is no DTL on a ++ * writeable leaf device, we have nothing to do. ++ */ ++ if (func == POOL_SCAN_RESILVER && ++ !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { ++ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); ++ return (0); ++ } ++ ++ return (dsl_scan(spa->spa_dsl_pool, func)); ++} ++ ++/* ++ * ========================================================================== ++ * SPA async task processing ++ * ========================================================================== ++ */ ++ ++static void ++spa_async_remove(spa_t *spa, vdev_t *vd) ++{ ++ int c; ++ ++ if (vd->vdev_remove_wanted) { ++ vd->vdev_remove_wanted = B_FALSE; ++ vd->vdev_delayed_close = B_FALSE; ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); ++ ++ /* ++ * We want to clear the stats, but we don't want to do a full ++ * vdev_clear() as that will cause us to throw away ++ * degraded/faulted state as well as attempt to reopen the ++ * device, all of which is a waste. ++ */ ++ vd->vdev_stat.vs_read_errors = 0; ++ vd->vdev_stat.vs_write_errors = 0; ++ vd->vdev_stat.vs_checksum_errors = 0; ++ ++ vdev_state_dirty(vd->vdev_top); ++ } ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ spa_async_remove(spa, vd->vdev_child[c]); ++} ++ ++static void ++spa_async_probe(spa_t *spa, vdev_t *vd) ++{ ++ int c; ++ ++ if (vd->vdev_probe_wanted) { ++ vd->vdev_probe_wanted = B_FALSE; ++ vdev_reopen(vd); /* vdev_open() does the actual probe */ ++ } ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ spa_async_probe(spa, vd->vdev_child[c]); ++} ++ ++static void ++spa_async_autoexpand(spa_t *spa, vdev_t *vd) ++{ ++ int c; ++ ++ if (!spa->spa_autoexpand) ++ return; ++ ++ for (c = 0; c < vd->vdev_children; c++) { ++ vdev_t *cvd = vd->vdev_child[c]; ++ spa_async_autoexpand(spa, cvd); ++ } ++ ++ if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) ++ return; ++ ++ spa_event_notify(vd->vdev_spa, vd, FM_EREPORT_ZFS_DEVICE_AUTOEXPAND); ++} ++ ++static void ++spa_async_thread(spa_t *spa) ++{ ++ int tasks, i; ++ ++ ASSERT(spa->spa_sync_on); ++ ++ mutex_enter(&spa->spa_async_lock); ++ tasks = spa->spa_async_tasks; ++ spa->spa_async_tasks = 0; ++ mutex_exit(&spa->spa_async_lock); ++ ++ /* ++ * See if the config needs to be updated. 
++ */ ++ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { ++ uint64_t old_space, new_space; ++ ++ mutex_enter(&spa_namespace_lock); ++ old_space = metaslab_class_get_space(spa_normal_class(spa)); ++ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); ++ new_space = metaslab_class_get_space(spa_normal_class(spa)); ++ mutex_exit(&spa_namespace_lock); ++ ++ /* ++ * If the pool grew as a result of the config update, ++ * then log an internal history event. ++ */ ++ if (new_space != old_space) { ++ spa_history_log_internal(LOG_POOL_VDEV_ONLINE, ++ spa, NULL, ++ "pool '%s' size: %llu(+%llu)", ++ spa_name(spa), new_space, new_space - old_space); ++ } ++ } ++ ++ /* ++ * See if any devices need to be marked REMOVED. ++ */ ++ if (tasks & SPA_ASYNC_REMOVE) { ++ spa_vdev_state_enter(spa, SCL_NONE); ++ spa_async_remove(spa, spa->spa_root_vdev); ++ for (i = 0; i < spa->spa_l2cache.sav_count; i++) ++ spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); ++ for (i = 0; i < spa->spa_spares.sav_count; i++) ++ spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); ++ (void) spa_vdev_state_exit(spa, NULL, 0); ++ } ++ ++ if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { ++ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); ++ spa_async_autoexpand(spa, spa->spa_root_vdev); ++ spa_config_exit(spa, SCL_CONFIG, FTAG); ++ } ++ ++ /* ++ * See if any devices need to be probed. ++ */ ++ if (tasks & SPA_ASYNC_PROBE) { ++ spa_vdev_state_enter(spa, SCL_NONE); ++ spa_async_probe(spa, spa->spa_root_vdev); ++ (void) spa_vdev_state_exit(spa, NULL, 0); ++ } ++ ++ /* ++ * If any devices are done replacing, detach them. ++ */ ++ if (tasks & SPA_ASYNC_RESILVER_DONE) ++ spa_vdev_resilver_done(spa); ++ ++ /* ++ * Kick off a resilver. ++ */ ++ if (tasks & SPA_ASYNC_RESILVER) ++ dsl_resilver_restart(spa->spa_dsl_pool, 0); ++ ++ /* ++ * Let the world know that we're done. 
++ */ ++ mutex_enter(&spa->spa_async_lock); ++ spa->spa_async_thread = NULL; ++ cv_broadcast(&spa->spa_async_cv); ++ mutex_exit(&spa->spa_async_lock); ++ thread_exit(); ++} ++ ++void ++spa_async_suspend(spa_t *spa) ++{ ++ mutex_enter(&spa->spa_async_lock); ++ spa->spa_async_suspended++; ++ while (spa->spa_async_thread != NULL) ++ cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); ++ mutex_exit(&spa->spa_async_lock); ++} ++ ++void ++spa_async_resume(spa_t *spa) ++{ ++ mutex_enter(&spa->spa_async_lock); ++ ASSERT(spa->spa_async_suspended != 0); ++ spa->spa_async_suspended--; ++ mutex_exit(&spa->spa_async_lock); ++} ++ ++static void ++spa_async_dispatch(spa_t *spa) ++{ ++ mutex_enter(&spa->spa_async_lock); ++ if (spa->spa_async_tasks && !spa->spa_async_suspended && ++ spa->spa_async_thread == NULL && ++ rootdir != NULL && !vn_is_readonly(rootdir)) ++ spa->spa_async_thread = thread_create(NULL, 0, ++ spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); ++ mutex_exit(&spa->spa_async_lock); ++} ++ ++void ++spa_async_request(spa_t *spa, int task) ++{ ++ zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); ++ mutex_enter(&spa->spa_async_lock); ++ spa->spa_async_tasks |= task; ++ mutex_exit(&spa->spa_async_lock); ++} ++ ++/* ++ * ========================================================================== ++ * SPA syncing routines ++ * ========================================================================== ++ */ ++ ++static int ++bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) ++{ ++ bpobj_t *bpo = arg; ++ bpobj_enqueue(bpo, bp, tx); ++ return (0); ++} ++ ++static int ++spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) ++{ ++ zio_t *zio = arg; ++ ++ zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, ++ zio->io_flags)); ++ return (0); ++} ++ ++static void ++spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) ++{ ++ char *packed = NULL; ++ size_t bufsize; ++ size_t nvsize = 0; ++ dmu_buf_t *db; ++ ++ VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); ++ ++ /* ++ * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration ++ * information. This avoids the dbuf_will_dirty() path and ++ * saves us a pre-read to get data we don't actually care about. ++ */ ++ bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); ++ packed = vmem_alloc(bufsize, KM_PUSHPAGE); ++ ++ VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, ++ KM_PUSHPAGE) == 0); ++ bzero(packed + nvsize, bufsize - nvsize); ++ ++ dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); ++ ++ vmem_free(packed, bufsize); ++ ++ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); ++ dmu_buf_will_dirty(db, tx); ++ *(uint64_t *)db->db_data = nvsize; ++ dmu_buf_rele(db, FTAG); ++} ++ ++static void ++spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, ++ const char *config, const char *entry) ++{ ++ nvlist_t *nvroot; ++ nvlist_t **list; ++ int i; ++ ++ if (!sav->sav_sync) ++ return; ++ ++ /* ++ * Update the MOS nvlist describing the list of available devices. ++ * spa_validate_aux() will have already made sure this nvlist is ++ * valid and the vdevs are labeled appropriately. 
++ */ ++ if (sav->sav_object == 0) { ++ sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, ++ DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, ++ sizeof (uint64_t), tx); ++ VERIFY(zap_update(spa->spa_meta_objset, ++ DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, ++ &sav->sav_object, tx) == 0); ++ } ++ ++ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ if (sav->sav_count == 0) { ++ VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); ++ } else { ++ list = kmem_alloc(sav->sav_count * sizeof (void *), KM_PUSHPAGE); ++ for (i = 0; i < sav->sav_count; i++) ++ list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], ++ B_FALSE, VDEV_CONFIG_L2CACHE); ++ VERIFY(nvlist_add_nvlist_array(nvroot, config, list, ++ sav->sav_count) == 0); ++ for (i = 0; i < sav->sav_count; i++) ++ nvlist_free(list[i]); ++ kmem_free(list, sav->sav_count * sizeof (void *)); ++ } ++ ++ spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); ++ nvlist_free(nvroot); ++ ++ sav->sav_sync = B_FALSE; ++} ++ ++static void ++spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) ++{ ++ nvlist_t *config; ++ ++ if (list_is_empty(&spa->spa_config_dirty_list)) ++ return; ++ ++ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ++ ++ config = spa_config_generate(spa, spa->spa_root_vdev, ++ dmu_tx_get_txg(tx), B_FALSE); ++ ++ spa_config_exit(spa, SCL_STATE, FTAG); ++ ++ if (spa->spa_config_syncing) ++ nvlist_free(spa->spa_config_syncing); ++ spa->spa_config_syncing = config; ++ ++ spa_sync_nvlist(spa, spa->spa_config_object, config, tx); ++} ++ ++/* ++ * Set zpool properties. ++ */ ++static void ++spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ spa_t *spa = arg1; ++ objset_t *mos = spa->spa_meta_objset; ++ nvlist_t *nvp = arg2; ++ nvpair_t *elem; ++ uint64_t intval; ++ char *strval; ++ zpool_prop_t prop; ++ const char *propname; ++ zprop_type_t proptype; ++ ++ mutex_enter(&spa->spa_props_lock); ++ ++ elem = NULL; ++ while ((elem = nvlist_next_nvpair(nvp, elem))) { ++ switch (prop = zpool_name_to_prop(nvpair_name(elem))) { ++ case ZPOOL_PROP_VERSION: ++ /* ++ * Only set version for non-zpool-creation cases ++ * (set/import). spa_create() needs special care ++ * for version setting. ++ */ ++ if (tx->tx_txg != TXG_INITIAL) { ++ VERIFY(nvpair_value_uint64(elem, ++ &intval) == 0); ++ ASSERT(intval <= SPA_VERSION); ++ ASSERT(intval >= spa_version(spa)); ++ spa->spa_uberblock.ub_version = intval; ++ vdev_config_dirty(spa->spa_root_vdev); ++ } ++ break; ++ ++ case ZPOOL_PROP_ALTROOT: ++ /* ++ * 'altroot' is a non-persistent property. It should ++ * have been set temporarily at creation or import time. ++ */ ++ ASSERT(spa->spa_root != NULL); ++ break; ++ ++ case ZPOOL_PROP_READONLY: ++ case ZPOOL_PROP_CACHEFILE: ++ /* ++ * 'readonly' and 'cachefile' are also non-persisitent ++ * properties. ++ */ ++ break; ++ case ZPOOL_PROP_COMMENT: ++ VERIFY(nvpair_value_string(elem, &strval) == 0); ++ if (spa->spa_comment != NULL) ++ spa_strfree(spa->spa_comment); ++ spa->spa_comment = spa_strdup(strval); ++ /* ++ * We need to dirty the configuration on all the vdevs ++ * so that their labels get updated. It's unnecessary ++ * to do this for pool creation since the vdev's ++ * configuratoin has already been dirtied. ++ */ ++ if (tx->tx_txg != TXG_INITIAL) ++ vdev_config_dirty(spa->spa_root_vdev); ++ break; ++ default: ++ /* ++ * Set pool property values in the poolprops mos object. 
++ */ ++ if (spa->spa_pool_props_object == 0) { ++ VERIFY((spa->spa_pool_props_object = ++ zap_create(mos, DMU_OT_POOL_PROPS, ++ DMU_OT_NONE, 0, tx)) > 0); ++ ++ VERIFY(zap_update(mos, ++ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, ++ 8, 1, &spa->spa_pool_props_object, tx) ++ == 0); ++ } ++ ++ /* normalize the property name */ ++ propname = zpool_prop_to_name(prop); ++ proptype = zpool_prop_get_type(prop); ++ ++ if (nvpair_type(elem) == DATA_TYPE_STRING) { ++ ASSERT(proptype == PROP_TYPE_STRING); ++ VERIFY(nvpair_value_string(elem, &strval) == 0); ++ VERIFY(zap_update(mos, ++ spa->spa_pool_props_object, propname, ++ 1, strlen(strval) + 1, strval, tx) == 0); ++ ++ } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { ++ VERIFY(nvpair_value_uint64(elem, &intval) == 0); ++ ++ if (proptype == PROP_TYPE_INDEX) { ++ const char *unused; ++ VERIFY(zpool_prop_index_to_string( ++ prop, intval, &unused) == 0); ++ } ++ VERIFY(zap_update(mos, ++ spa->spa_pool_props_object, propname, ++ 8, 1, &intval, tx) == 0); ++ } else { ++ ASSERT(0); /* not allowed */ ++ } ++ ++ switch (prop) { ++ case ZPOOL_PROP_DELEGATION: ++ spa->spa_delegation = intval; ++ break; ++ case ZPOOL_PROP_BOOTFS: ++ spa->spa_bootfs = intval; ++ break; ++ case ZPOOL_PROP_FAILUREMODE: ++ spa->spa_failmode = intval; ++ break; ++ case ZPOOL_PROP_AUTOEXPAND: ++ spa->spa_autoexpand = intval; ++ if (tx->tx_txg != TXG_INITIAL) ++ spa_async_request(spa, ++ SPA_ASYNC_AUTOEXPAND); ++ break; ++ case ZPOOL_PROP_DEDUPDITTO: ++ spa->spa_dedup_ditto = intval; ++ break; ++ default: ++ break; ++ } ++ } ++ ++ /* log internal history if this is not a zpool create */ ++ if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && ++ tx->tx_txg != TXG_INITIAL) { ++ spa_history_log_internal(LOG_POOL_PROPSET, ++ spa, tx, "%s %lld %s", ++ nvpair_name(elem), intval, spa_name(spa)); ++ } ++ } ++ ++ mutex_exit(&spa->spa_props_lock); ++} ++ ++/* ++ * Perform one-time upgrade on-disk changes. spa_version() does not ++ * reflect the new version this txg, so there must be no changes this ++ * txg to anything that the upgrade code depends on after it executes. ++ * Therefore this must be called after dsl_pool_sync() does the sync ++ * tasks. ++ */ ++static void ++spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) ++{ ++ dsl_pool_t *dp = spa->spa_dsl_pool; ++ ++ ASSERT(spa->spa_sync_pass == 1); ++ ++ if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && ++ spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { ++ dsl_pool_create_origin(dp, tx); ++ ++ /* Keeping the origin open increases spa_minref */ ++ spa->spa_minref += 3; ++ } ++ ++ if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && ++ spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { ++ dsl_pool_upgrade_clones(dp, tx); ++ } ++ ++ if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && ++ spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { ++ dsl_pool_upgrade_dir_clones(dp, tx); ++ ++ /* Keeping the freedir open increases spa_minref */ ++ spa->spa_minref += 3; ++ } ++} ++ ++/* ++ * Sync the specified transaction group. New blocks may be dirtied as ++ * part of the process, so we iterate until it converges. 
++ */ ++void ++spa_sync(spa_t *spa, uint64_t txg) ++{ ++ dsl_pool_t *dp = spa->spa_dsl_pool; ++ objset_t *mos = spa->spa_meta_objset; ++ bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; ++ bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; ++ vdev_t *rvd = spa->spa_root_vdev; ++ vdev_t *vd; ++ dmu_tx_t *tx; ++ int error; ++ int c; ++ ++ VERIFY(spa_writeable(spa)); ++ ++ /* ++ * Lock out configuration changes. ++ */ ++ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); ++ ++ spa->spa_syncing_txg = txg; ++ spa->spa_sync_pass = 0; ++ ++ /* ++ * If there are any pending vdev state changes, convert them ++ * into config changes that go out with this transaction group. ++ */ ++ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ++ while (list_head(&spa->spa_state_dirty_list) != NULL) { ++ /* ++ * We need the write lock here because, for aux vdevs, ++ * calling vdev_config_dirty() modifies sav_config. ++ * This is ugly and will become unnecessary when we ++ * eliminate the aux vdev wart by integrating all vdevs ++ * into the root vdev tree. ++ */ ++ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); ++ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); ++ while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { ++ vdev_state_clean(vd); ++ vdev_config_dirty(vd); ++ } ++ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); ++ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); ++ } ++ spa_config_exit(spa, SCL_STATE, FTAG); ++ ++ tx = dmu_tx_create_assigned(dp, txg); ++ ++ /* ++ * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, ++ * set spa_deflate if we have no raid-z vdevs. ++ */ ++ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && ++ spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { ++ int i; ++ ++ for (i = 0; i < rvd->vdev_children; i++) { ++ vd = rvd->vdev_child[i]; ++ if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) ++ break; ++ } ++ if (i == rvd->vdev_children) { ++ spa->spa_deflate = TRUE; ++ VERIFY(0 == zap_add(spa->spa_meta_objset, ++ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, ++ sizeof (uint64_t), 1, &spa->spa_deflate, tx)); ++ } ++ } ++ ++ /* ++ * If anything has changed in this txg, or if someone is waiting ++ * for this txg to sync (eg, spa_vdev_remove()), push the ++ * deferred frees from the previous txg. If not, leave them ++ * alone so that we don't generate work on an otherwise idle ++ * system. ++ */ ++ if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || ++ !txg_list_empty(&dp->dp_dirty_dirs, txg) || ++ !txg_list_empty(&dp->dp_sync_tasks, txg) || ++ ((dsl_scan_active(dp->dp_scan) || ++ txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { ++ zio_t *zio = zio_root(spa, NULL, NULL, 0); ++ VERIFY3U(bpobj_iterate(defer_bpo, ++ spa_free_sync_cb, zio, tx), ==, 0); ++ VERIFY3U(zio_wait(zio), ==, 0); ++ } ++ ++ /* ++ * Iterate to convergence. 
++ */ ++ do { ++ int pass = ++spa->spa_sync_pass; ++ ++ spa_sync_config_object(spa, tx); ++ spa_sync_aux_dev(spa, &spa->spa_spares, tx, ++ ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); ++ spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ++ ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); ++ spa_errlog_sync(spa, txg); ++ dsl_pool_sync(dp, txg); ++ ++ if (pass <= SYNC_PASS_DEFERRED_FREE) { ++ zio_t *zio = zio_root(spa, NULL, NULL, 0); ++ bplist_iterate(free_bpl, spa_free_sync_cb, ++ zio, tx); ++ VERIFY(zio_wait(zio) == 0); ++ } else { ++ bplist_iterate(free_bpl, bpobj_enqueue_cb, ++ defer_bpo, tx); ++ } ++ ++ ddt_sync(spa, txg); ++ dsl_scan_sync(dp, tx); ++ ++ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))) ++ vdev_sync(vd, txg); ++ ++ if (pass == 1) ++ spa_sync_upgrades(spa, tx); ++ ++ } while (dmu_objset_is_dirty(mos, txg)); ++ ++ /* ++ * Rewrite the vdev configuration (which includes the uberblock) ++ * to commit the transaction group. ++ * ++ * If there are no dirty vdevs, we sync the uberblock to a few ++ * random top-level vdevs that are known to be visible in the ++ * config cache (see spa_vdev_add() for a complete description). ++ * If there *are* dirty vdevs, sync the uberblock to all vdevs. ++ */ ++ for (;;) { ++ /* ++ * We hold SCL_STATE to prevent vdev open/close/etc. ++ * while we're attempting to write the vdev labels. ++ */ ++ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ++ ++ if (list_is_empty(&spa->spa_config_dirty_list)) { ++ vdev_t *svd[SPA_DVAS_PER_BP]; ++ int svdcount = 0; ++ int children = rvd->vdev_children; ++ int c0 = spa_get_random(children); ++ ++ for (c = 0; c < children; c++) { ++ vd = rvd->vdev_child[(c0 + c) % children]; ++ if (vd->vdev_ms_array == 0 || vd->vdev_islog) ++ continue; ++ svd[svdcount++] = vd; ++ if (svdcount == SPA_DVAS_PER_BP) ++ break; ++ } ++ error = vdev_config_sync(svd, svdcount, txg, B_FALSE); ++ if (error != 0) ++ error = vdev_config_sync(svd, svdcount, txg, ++ B_TRUE); ++ } else { ++ error = vdev_config_sync(rvd->vdev_child, ++ rvd->vdev_children, txg, B_FALSE); ++ if (error != 0) ++ error = vdev_config_sync(rvd->vdev_child, ++ rvd->vdev_children, txg, B_TRUE); ++ } ++ ++ spa_config_exit(spa, SCL_STATE, FTAG); ++ ++ if (error == 0) ++ break; ++ zio_suspend(spa, NULL); ++ zio_resume_wait(spa); ++ } ++ dmu_tx_commit(tx); ++ ++ /* ++ * Clear the dirty config list. ++ */ ++ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) ++ vdev_config_clean(vd); ++ ++ /* ++ * Now that the new config has synced transactionally, ++ * let it become visible to the config cache. ++ */ ++ if (spa->spa_config_syncing != NULL) { ++ spa_config_set(spa, spa->spa_config_syncing); ++ spa->spa_config_txg = txg; ++ spa->spa_config_syncing = NULL; ++ } ++ ++ spa->spa_ubsync = spa->spa_uberblock; ++ ++ dsl_pool_sync_done(dp, txg); ++ ++ /* ++ * Update usable space statistics. ++ */ ++ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))) ++ vdev_sync_done(vd, txg); ++ ++ spa_update_dspace(spa); ++ ++ /* ++ * It had better be the case that we didn't dirty anything ++ * since vdev_config_sync(). ++ */ ++ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ++ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ++ ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); ++ ++ spa->spa_sync_pass = 0; ++ ++ spa_config_exit(spa, SCL_CONFIG, FTAG); ++ ++ spa_handle_ignored_writes(spa); ++ ++ /* ++ * If any async tasks have been requested, kick them off. ++ */ ++ spa_async_dispatch(spa); ++} ++ ++/* ++ * Sync all pools. 
We don't want to hold the namespace lock across these ++ * operations, so we take a reference on the spa_t and drop the lock during the ++ * sync. ++ */ ++void ++spa_sync_allpools(void) ++{ ++ spa_t *spa = NULL; ++ mutex_enter(&spa_namespace_lock); ++ while ((spa = spa_next(spa)) != NULL) { ++ if (spa_state(spa) != POOL_STATE_ACTIVE || ++ !spa_writeable(spa) || spa_suspended(spa)) ++ continue; ++ spa_open_ref(spa, FTAG); ++ mutex_exit(&spa_namespace_lock); ++ txg_wait_synced(spa_get_dsl(spa), 0); ++ mutex_enter(&spa_namespace_lock); ++ spa_close(spa, FTAG); ++ } ++ mutex_exit(&spa_namespace_lock); ++} ++ ++/* ++ * ========================================================================== ++ * Miscellaneous routines ++ * ========================================================================== ++ */ ++ ++/* ++ * Remove all pools in the system. ++ */ ++void ++spa_evict_all(void) ++{ ++ spa_t *spa; ++ ++ /* ++ * Remove all cached state. All pools should be closed now, ++ * so every spa in the AVL tree should be unreferenced. ++ */ ++ mutex_enter(&spa_namespace_lock); ++ while ((spa = spa_next(NULL)) != NULL) { ++ /* ++ * Stop async tasks. The async thread may need to detach ++ * a device that's been replaced, which requires grabbing ++ * spa_namespace_lock, so we must drop it here. ++ */ ++ spa_open_ref(spa, FTAG); ++ mutex_exit(&spa_namespace_lock); ++ spa_async_suspend(spa); ++ mutex_enter(&spa_namespace_lock); ++ spa_close(spa, FTAG); ++ ++ if (spa->spa_state != POOL_STATE_UNINITIALIZED) { ++ spa_unload(spa); ++ spa_deactivate(spa); ++ } ++ spa_remove(spa); ++ } ++ mutex_exit(&spa_namespace_lock); ++} ++ ++vdev_t * ++spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) ++{ ++ vdev_t *vd; ++ int i; ++ ++ if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) ++ return (vd); ++ ++ if (aux) { ++ for (i = 0; i < spa->spa_l2cache.sav_count; i++) { ++ vd = spa->spa_l2cache.sav_vdevs[i]; ++ if (vd->vdev_guid == guid) ++ return (vd); ++ } ++ ++ for (i = 0; i < spa->spa_spares.sav_count; i++) { ++ vd = spa->spa_spares.sav_vdevs[i]; ++ if (vd->vdev_guid == guid) ++ return (vd); ++ } ++ } ++ ++ return (NULL); ++} ++ ++void ++spa_upgrade(spa_t *spa, uint64_t version) ++{ ++ ASSERT(spa_writeable(spa)); ++ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ ++ /* ++ * This should only be called for a non-faulted pool, and since a ++ * future version would result in an unopenable pool, this shouldn't be ++ * possible. ++ */ ++ ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); ++ ASSERT(version >= spa->spa_uberblock.ub_version); ++ ++ spa->spa_uberblock.ub_version = version; ++ vdev_config_dirty(spa->spa_root_vdev); ++ ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ ++ txg_wait_synced(spa_get_dsl(spa), 0); ++} ++ ++boolean_t ++spa_has_spare(spa_t *spa, uint64_t guid) ++{ ++ int i; ++ uint64_t spareguid; ++ spa_aux_vdev_t *sav = &spa->spa_spares; ++ ++ for (i = 0; i < sav->sav_count; i++) ++ if (sav->sav_vdevs[i]->vdev_guid == guid) ++ return (B_TRUE); ++ ++ for (i = 0; i < sav->sav_npending; i++) { ++ if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, ++ &spareguid) == 0 && spareguid == guid) ++ return (B_TRUE); ++ } ++ ++ return (B_FALSE); ++} ++ ++/* ++ * Check if a pool has an active shared spare device. 
++ * Note: reference count of an active spare is 2, as a spare and as a replacement
++ */
++static boolean_t
++spa_has_active_shared_spare(spa_t *spa)
++{
++ int i, refcnt;
++ uint64_t pool;
++ spa_aux_vdev_t *sav = &spa->spa_spares;
++
++ for (i = 0; i < sav->sav_count; i++) {
++ if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
++ &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
++ refcnt > 2)
++ return (B_TRUE);
++ }
++
++ return (B_FALSE);
++}
++
++/*
++ * Post a FM_EREPORT_ZFS_* event from sys/fm/fs/zfs.h. The payload will be
++ * filled in from the spa and (optionally) the vdev. This doesn't do anything
++ * in the userland libzpool, as we don't want consumers to misinterpret ztest
++ * or zdb as real changes.
++ */
++void
++spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
++{
++#ifdef _KERNEL
++ zfs_ereport_post(name, spa, vd, NULL, 0, 0);
++#endif
++}
++
++#if defined(_KERNEL) && defined(HAVE_SPL)
++/* state manipulation functions */
++EXPORT_SYMBOL(spa_open);
++EXPORT_SYMBOL(spa_open_rewind);
++EXPORT_SYMBOL(spa_get_stats);
++EXPORT_SYMBOL(spa_create);
++EXPORT_SYMBOL(spa_import_rootpool);
++EXPORT_SYMBOL(spa_import);
++EXPORT_SYMBOL(spa_tryimport);
++EXPORT_SYMBOL(spa_destroy);
++EXPORT_SYMBOL(spa_export);
++EXPORT_SYMBOL(spa_reset);
++EXPORT_SYMBOL(spa_async_request);
++EXPORT_SYMBOL(spa_async_suspend);
++EXPORT_SYMBOL(spa_async_resume);
++EXPORT_SYMBOL(spa_inject_addref);
++EXPORT_SYMBOL(spa_inject_delref);
++EXPORT_SYMBOL(spa_scan_stat_init);
++EXPORT_SYMBOL(spa_scan_get_stats);
++
++/* device manipulation */
++EXPORT_SYMBOL(spa_vdev_add);
++EXPORT_SYMBOL(spa_vdev_attach);
++EXPORT_SYMBOL(spa_vdev_detach);
++EXPORT_SYMBOL(spa_vdev_remove);
++EXPORT_SYMBOL(spa_vdev_setpath);
++EXPORT_SYMBOL(spa_vdev_setfru);
++EXPORT_SYMBOL(spa_vdev_split_mirror);
++
++/* spare state (which is global across all pools) */
++EXPORT_SYMBOL(spa_spare_add);
++EXPORT_SYMBOL(spa_spare_remove);
++EXPORT_SYMBOL(spa_spare_exists);
++EXPORT_SYMBOL(spa_spare_activate);
++
++/* L2ARC state (which is global across all pools) */
++EXPORT_SYMBOL(spa_l2cache_add);
++EXPORT_SYMBOL(spa_l2cache_remove);
++EXPORT_SYMBOL(spa_l2cache_exists);
++EXPORT_SYMBOL(spa_l2cache_activate);
++EXPORT_SYMBOL(spa_l2cache_drop);
++
++/* scanning */
++EXPORT_SYMBOL(spa_scan);
++EXPORT_SYMBOL(spa_scan_stop);
++
++/* spa syncing */
++EXPORT_SYMBOL(spa_sync); /* only for DMU use */
++EXPORT_SYMBOL(spa_sync_allpools);
++
++/* properties */
++EXPORT_SYMBOL(spa_prop_set);
++EXPORT_SYMBOL(spa_prop_get);
++EXPORT_SYMBOL(spa_prop_clear_bootfs);
++
++/* asynchronous event notification */
++EXPORT_SYMBOL(spa_event_notify);
++#endif
+diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/space_map.c linux-3.2.33-go/fs/zfs/zfs/space_map.c
+--- linux-3.2.33-go.orig/fs/zfs/zfs/space_map.c 1970-01-01 01:00:00.000000000 +0100
++++ linux-3.2.33-go/fs/zfs/zfs/space_map.c 2012-11-16 23:25:34.348039346 +0100
+@@ -0,0 +1,616 @@
++/*
++ * CDDL HEADER START
++ *
++ * The contents of this file are subject to the terms of the
++ * Common Development and Distribution License (the "License").
++ * You may not use this file except in compliance with the License.
++ *
++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
++ * or http://www.opensolaris.org/os/licensing.
++ * See the License for the specific language governing permissions
++ * and limitations under the License.
++ *
++ * When distributing Covered Code, include this CDDL HEADER in each
++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Space map routines. ++ * NOTE: caller is responsible for all locking. ++ */ ++static int ++space_map_seg_compare(const void *x1, const void *x2) ++{ ++ const space_seg_t *s1 = x1; ++ const space_seg_t *s2 = x2; ++ ++ if (s1->ss_start < s2->ss_start) { ++ if (s1->ss_end > s2->ss_start) ++ return (0); ++ return (-1); ++ } ++ if (s1->ss_start > s2->ss_start) { ++ if (s1->ss_start < s2->ss_end) ++ return (0); ++ return (1); ++ } ++ return (0); ++} ++ ++void ++space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift, ++ kmutex_t *lp) ++{ ++ bzero(sm, sizeof (*sm)); ++ ++ cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL); ++ ++ avl_create(&sm->sm_root, space_map_seg_compare, ++ sizeof (space_seg_t), offsetof(struct space_seg, ss_node)); ++ ++ sm->sm_start = start; ++ sm->sm_size = size; ++ sm->sm_shift = shift; ++ sm->sm_lock = lp; ++} ++ ++void ++space_map_destroy(space_map_t *sm) ++{ ++ ASSERT(!sm->sm_loaded && !sm->sm_loading); ++ VERIFY3U(sm->sm_space, ==, 0); ++ avl_destroy(&sm->sm_root); ++ cv_destroy(&sm->sm_load_cv); ++} ++ ++void ++space_map_add(space_map_t *sm, uint64_t start, uint64_t size) ++{ ++ avl_index_t where; ++ space_seg_t ssearch, *ss_before, *ss_after, *ss; ++ uint64_t end = start + size; ++ int merge_before, merge_after; ++ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ VERIFY(size != 0); ++ VERIFY3U(start, >=, sm->sm_start); ++ VERIFY3U(end, <=, sm->sm_start + sm->sm_size); ++ VERIFY(sm->sm_space + size <= sm->sm_size); ++ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); ++ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); ++ ++ ssearch.ss_start = start; ++ ssearch.ss_end = end; ++ ss = avl_find(&sm->sm_root, &ssearch, &where); ++ ++ if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) { ++ zfs_panic_recover("zfs: allocating allocated segment" ++ "(offset=%llu size=%llu)\n", ++ (longlong_t)start, (longlong_t)size); ++ return; ++ } ++ ++ /* Make sure we don't overlap with either of our neighbors */ ++ VERIFY(ss == NULL); ++ ++ ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE); ++ ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER); ++ ++ merge_before = (ss_before != NULL && ss_before->ss_end == start); ++ merge_after = (ss_after != NULL && ss_after->ss_start == end); ++ ++ if (merge_before && merge_after) { ++ avl_remove(&sm->sm_root, ss_before); ++ if (sm->sm_pp_root) { ++ avl_remove(sm->sm_pp_root, ss_before); ++ avl_remove(sm->sm_pp_root, ss_after); ++ } ++ ss_after->ss_start = ss_before->ss_start; ++ kmem_free(ss_before, sizeof (*ss_before)); ++ ss = ss_after; ++ } else if (merge_before) { ++ ss_before->ss_end = end; ++ if (sm->sm_pp_root) ++ avl_remove(sm->sm_pp_root, ss_before); ++ ss = ss_before; ++ } else if (merge_after) { ++ ss_after->ss_start = start; ++ if (sm->sm_pp_root) ++ avl_remove(sm->sm_pp_root, ss_after); ++ ss = ss_after; ++ } else { ++ ss = kmem_alloc(sizeof (*ss), KM_PUSHPAGE); ++ ss->ss_start = start; ++ ss->ss_end = end; ++ avl_insert(&sm->sm_root, ss, where); ++ } ++ ++ if (sm->sm_pp_root) ++ avl_add(sm->sm_pp_root, ss); ++ ++ sm->sm_space += size; ++} ++ ++void ++space_map_remove(space_map_t 
*sm, uint64_t start, uint64_t size) ++{ ++ avl_index_t where; ++ space_seg_t ssearch, *ss, *newseg; ++ uint64_t end = start + size; ++ int left_over, right_over; ++ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ VERIFY(size != 0); ++ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); ++ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); ++ ++ ssearch.ss_start = start; ++ ssearch.ss_end = end; ++ ss = avl_find(&sm->sm_root, &ssearch, &where); ++ ++ /* Make sure we completely overlap with someone */ ++ if (ss == NULL) { ++ zfs_panic_recover("zfs: freeing free segment " ++ "(offset=%llu size=%llu)", ++ (longlong_t)start, (longlong_t)size); ++ return; ++ } ++ VERIFY3U(ss->ss_start, <=, start); ++ VERIFY3U(ss->ss_end, >=, end); ++ VERIFY(sm->sm_space - size <= sm->sm_size); ++ ++ left_over = (ss->ss_start != start); ++ right_over = (ss->ss_end != end); ++ ++ if (sm->sm_pp_root) ++ avl_remove(sm->sm_pp_root, ss); ++ ++ if (left_over && right_over) { ++ newseg = kmem_alloc(sizeof (*newseg), KM_PUSHPAGE); ++ newseg->ss_start = end; ++ newseg->ss_end = ss->ss_end; ++ ss->ss_end = start; ++ avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER); ++ if (sm->sm_pp_root) ++ avl_add(sm->sm_pp_root, newseg); ++ } else if (left_over) { ++ ss->ss_end = start; ++ } else if (right_over) { ++ ss->ss_start = end; ++ } else { ++ avl_remove(&sm->sm_root, ss); ++ kmem_free(ss, sizeof (*ss)); ++ ss = NULL; ++ } ++ ++ if (sm->sm_pp_root && ss != NULL) ++ avl_add(sm->sm_pp_root, ss); ++ ++ sm->sm_space -= size; ++} ++ ++boolean_t ++space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) ++{ ++ avl_index_t where; ++ space_seg_t ssearch, *ss; ++ uint64_t end = start + size; ++ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ VERIFY(size != 0); ++ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); ++ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); ++ ++ ssearch.ss_start = start; ++ ssearch.ss_end = end; ++ ss = avl_find(&sm->sm_root, &ssearch, &where); ++ ++ return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end); ++} ++ ++void ++space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) ++{ ++ space_seg_t *ss; ++ void *cookie = NULL; ++ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ ++ while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { ++ if (func != NULL) ++ func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); ++ kmem_free(ss, sizeof (*ss)); ++ } ++ sm->sm_space = 0; ++} ++ ++void ++space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) ++{ ++ space_seg_t *ss; ++ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ ++ for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) ++ func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); ++} ++ ++/* ++ * Wait for any in-progress space_map_load() to complete. ++ */ ++void ++space_map_load_wait(space_map_t *sm) ++{ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ ++ while (sm->sm_loading) { ++ ASSERT(!sm->sm_loaded); ++ cv_wait(&sm->sm_load_cv, sm->sm_lock); ++ } ++} ++ ++/* ++ * Note: space_map_load() will drop sm_lock across dmu_read() calls. ++ * The caller must be OK with this. 
++ */ ++int ++space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, ++ space_map_obj_t *smo, objset_t *os) ++{ ++ uint64_t *entry, *entry_map, *entry_map_end; ++ uint64_t bufsize, size, offset, end, space; ++ uint64_t mapstart = sm->sm_start; ++ int error = 0; ++ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ ASSERT(!sm->sm_loaded); ++ ASSERT(!sm->sm_loading); ++ ++ sm->sm_loading = B_TRUE; ++ end = smo->smo_objsize; ++ space = smo->smo_alloc; ++ ++ ASSERT(sm->sm_ops == NULL); ++ VERIFY3U(sm->sm_space, ==, 0); ++ ++ if (maptype == SM_FREE) { ++ space_map_add(sm, sm->sm_start, sm->sm_size); ++ space = sm->sm_size - space; ++ } ++ ++ bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT; ++ entry_map = zio_buf_alloc(bufsize); ++ ++ mutex_exit(sm->sm_lock); ++ if (end > bufsize) ++ dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize); ++ mutex_enter(sm->sm_lock); ++ ++ for (offset = 0; offset < end; offset += bufsize) { ++ size = MIN(end - offset, bufsize); ++ VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); ++ VERIFY(size != 0); ++ ++ dprintf("object=%llu offset=%llx size=%llx\n", ++ smo->smo_object, offset, size); ++ ++ mutex_exit(sm->sm_lock); ++ error = dmu_read(os, smo->smo_object, offset, size, entry_map, ++ DMU_READ_PREFETCH); ++ mutex_enter(sm->sm_lock); ++ if (error != 0) ++ break; ++ ++ entry_map_end = entry_map + (size / sizeof (uint64_t)); ++ for (entry = entry_map; entry < entry_map_end; entry++) { ++ uint64_t e = *entry; ++ ++ if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ ++ continue; ++ ++ (SM_TYPE_DECODE(e) == maptype ? ++ space_map_add : space_map_remove)(sm, ++ (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart, ++ SM_RUN_DECODE(e) << sm->sm_shift); ++ } ++ } ++ ++ if (error == 0) { ++ VERIFY3U(sm->sm_space, ==, space); ++ ++ sm->sm_loaded = B_TRUE; ++ sm->sm_ops = ops; ++ if (ops != NULL) ++ ops->smop_load(sm); ++ } else { ++ space_map_vacate(sm, NULL, NULL); ++ } ++ ++ zio_buf_free(entry_map, bufsize); ++ ++ sm->sm_loading = B_FALSE; ++ ++ cv_broadcast(&sm->sm_load_cv); ++ ++ return (error); ++} ++ ++void ++space_map_unload(space_map_t *sm) ++{ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ ++ if (sm->sm_loaded && sm->sm_ops != NULL) ++ sm->sm_ops->smop_unload(sm); ++ ++ sm->sm_loaded = B_FALSE; ++ sm->sm_ops = NULL; ++ ++ space_map_vacate(sm, NULL, NULL); ++} ++ ++uint64_t ++space_map_maxsize(space_map_t *sm) ++{ ++ ASSERT(sm->sm_ops != NULL); ++ return (sm->sm_ops->smop_max(sm)); ++} ++ ++uint64_t ++space_map_alloc(space_map_t *sm, uint64_t size) ++{ ++ uint64_t start; ++ ++ start = sm->sm_ops->smop_alloc(sm, size); ++ if (start != -1ULL) ++ space_map_remove(sm, start, size); ++ return (start); ++} ++ ++void ++space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) ++{ ++ sm->sm_ops->smop_claim(sm, start, size); ++ space_map_remove(sm, start, size); ++} ++ ++void ++space_map_free(space_map_t *sm, uint64_t start, uint64_t size) ++{ ++ space_map_add(sm, start, size); ++ sm->sm_ops->smop_free(sm, start, size); ++} ++ ++/* ++ * Note: space_map_sync() will drop sm_lock across dmu_write() calls. 
++ */ ++void ++space_map_sync(space_map_t *sm, uint8_t maptype, ++ space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) ++{ ++ spa_t *spa = dmu_objset_spa(os); ++ void *cookie = NULL; ++ space_seg_t *ss; ++ uint64_t bufsize, start, size, run_len; ++ uint64_t *entry, *entry_map, *entry_map_end; ++ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ ++ if (sm->sm_space == 0) ++ return; ++ ++ dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n", ++ smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa), ++ maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root), ++ sm->sm_space); ++ ++ if (maptype == SM_ALLOC) ++ smo->smo_alloc += sm->sm_space; ++ else ++ smo->smo_alloc -= sm->sm_space; ++ ++ bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t); ++ bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT); ++ entry_map = zio_buf_alloc(bufsize); ++ entry_map_end = entry_map + (bufsize / sizeof (uint64_t)); ++ entry = entry_map; ++ ++ *entry++ = SM_DEBUG_ENCODE(1) | ++ SM_DEBUG_ACTION_ENCODE(maptype) | ++ SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) | ++ SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); ++ ++ while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { ++ size = ss->ss_end - ss->ss_start; ++ start = (ss->ss_start - sm->sm_start) >> sm->sm_shift; ++ ++ sm->sm_space -= size; ++ size >>= sm->sm_shift; ++ ++ while (size) { ++ run_len = MIN(size, SM_RUN_MAX); ++ ++ if (entry == entry_map_end) { ++ mutex_exit(sm->sm_lock); ++ dmu_write(os, smo->smo_object, smo->smo_objsize, ++ bufsize, entry_map, tx); ++ mutex_enter(sm->sm_lock); ++ smo->smo_objsize += bufsize; ++ entry = entry_map; ++ } ++ ++ *entry++ = SM_OFFSET_ENCODE(start) | ++ SM_TYPE_ENCODE(maptype) | ++ SM_RUN_ENCODE(run_len); ++ ++ start += run_len; ++ size -= run_len; ++ } ++ kmem_free(ss, sizeof (*ss)); ++ } ++ ++ if (entry != entry_map) { ++ size = (entry - entry_map) * sizeof (uint64_t); ++ mutex_exit(sm->sm_lock); ++ dmu_write(os, smo->smo_object, smo->smo_objsize, ++ size, entry_map, tx); ++ mutex_enter(sm->sm_lock); ++ smo->smo_objsize += size; ++ } ++ ++ zio_buf_free(entry_map, bufsize); ++ ++ VERIFY3U(sm->sm_space, ==, 0); ++} ++ ++void ++space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) ++{ ++ VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0); ++ ++ smo->smo_objsize = 0; ++ smo->smo_alloc = 0; ++} ++ ++/* ++ * Space map reference trees. ++ * ++ * A space map is a collection of integers. Every integer is either ++ * in the map, or it's not. A space map reference tree generalizes ++ * the idea: it allows its members to have arbitrary reference counts, ++ * as opposed to the implicit reference count of 0 or 1 in a space map. ++ * This representation comes in handy when computing the union or ++ * intersection of multiple space maps. For example, the union of ++ * N space maps is the subset of the reference tree with refcnt >= 1. ++ * The intersection of N space maps is the subset with refcnt >= N. ++ * ++ * [It's very much like a Fourier transform. Unions and intersections ++ * are hard to perform in the 'space map domain', so we convert the maps ++ * into the 'reference count domain', where it's trivial, then invert.] ++ * ++ * vdev_dtl_reassess() uses computations of this form to determine ++ * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev ++ * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev ++ * has an outage wherever refcnt >= vdev_children. 
++ */ ++static int ++space_map_ref_compare(const void *x1, const void *x2) ++{ ++ const space_ref_t *sr1 = x1; ++ const space_ref_t *sr2 = x2; ++ ++ if (sr1->sr_offset < sr2->sr_offset) ++ return (-1); ++ if (sr1->sr_offset > sr2->sr_offset) ++ return (1); ++ ++ if (sr1 < sr2) ++ return (-1); ++ if (sr1 > sr2) ++ return (1); ++ ++ return (0); ++} ++ ++void ++space_map_ref_create(avl_tree_t *t) ++{ ++ avl_create(t, space_map_ref_compare, ++ sizeof (space_ref_t), offsetof(space_ref_t, sr_node)); ++} ++ ++void ++space_map_ref_destroy(avl_tree_t *t) ++{ ++ space_ref_t *sr; ++ void *cookie = NULL; ++ ++ while ((sr = avl_destroy_nodes(t, &cookie)) != NULL) ++ kmem_free(sr, sizeof (*sr)); ++ ++ avl_destroy(t); ++} ++ ++static void ++space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) ++{ ++ space_ref_t *sr; ++ ++ sr = kmem_alloc(sizeof (*sr), KM_PUSHPAGE); ++ sr->sr_offset = offset; ++ sr->sr_refcnt = refcnt; ++ ++ avl_add(t, sr); ++} ++ ++void ++space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, ++ int64_t refcnt) ++{ ++ space_map_ref_add_node(t, start, refcnt); ++ space_map_ref_add_node(t, end, -refcnt); ++} ++ ++/* ++ * Convert (or add) a space map into a reference tree. ++ */ ++void ++space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt) ++{ ++ space_seg_t *ss; ++ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ ++ for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) ++ space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt); ++} ++ ++/* ++ * Convert a reference tree into a space map. The space map will contain ++ * all members of the reference tree for which refcnt >= minref. ++ */ ++void ++space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref) ++{ ++ uint64_t start = -1ULL; ++ int64_t refcnt = 0; ++ space_ref_t *sr; ++ ++ ASSERT(MUTEX_HELD(sm->sm_lock)); ++ ++ space_map_vacate(sm, NULL, NULL); ++ ++ for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { ++ refcnt += sr->sr_refcnt; ++ if (refcnt >= minref) { ++ if (start == -1ULL) { ++ start = sr->sr_offset; ++ } ++ } else { ++ if (start != -1ULL) { ++ uint64_t end = sr->sr_offset; ++ ASSERT(start <= end); ++ if (end > start) ++ space_map_add(sm, start, end - start); ++ start = -1ULL; ++ } ++ } ++ } ++ ASSERT(refcnt == 0); ++ ASSERT(start == -1ULL); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/spa_config.c linux-3.2.33-go/fs/zfs/zfs/spa_config.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/spa_config.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/spa_config.c 2012-11-16 23:25:34.353039289 +0100 +@@ -0,0 +1,504 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. 
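The reference-tree code above reduces space-map unions and intersections to a sweep over signed refcount deltas: +refcnt at each segment's start, -refcnt at its end, emitting a range while the running count stays at or above minref. Below is a minimal user-space sketch of that sweep, using a plain sorted array and made-up segments in place of the kernel AVL tree; all names and values here are illustrative, not part of the ZFS sources or of this patch.

    /* Standalone illustration of the reference-tree sweep. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    typedef struct { uint64_t offset; int64_t refcnt; } ref_t;

    static int ref_cmp(const void *a, const void *b)
    {
        const ref_t *ra = a, *rb = b;
        if (ra->offset < rb->offset) return (-1);
        if (ra->offset > rb->offset) return (1);
        return (0);
    }

    /* Record one segment [start, end) with the given reference count. */
    static void ref_add_seg(ref_t *t, int *n, uint64_t start, uint64_t end,
        int64_t refcnt)
    {
        t[(*n)++] = (ref_t){ start, refcnt };
        t[(*n)++] = (ref_t){ end, -refcnt };
    }

    int main(void)
    {
        ref_t t[16];
        int n = 0, i;
        int64_t refcnt = 0;
        int64_t minref = 2;          /* 1 = union, 2 = intersection */
        uint64_t start = UINT64_MAX;

        /* Two "space maps": map A = [0,100), map B = [50,150). */
        ref_add_seg(t, &n, 0, 100, 1);
        ref_add_seg(t, &n, 50, 150, 1);
        qsort(t, n, sizeof (ref_t), ref_cmp);

        /* Emit every range whose running refcount is >= minref. */
        for (i = 0; i < n; i++) {
            refcnt += t[i].refcnt;
            if (refcnt >= minref) {
                if (start == UINT64_MAX)
                    start = t[i].offset;
            } else if (start != UINT64_MAX) {
                printf("segment [%llu, %llu)\n",
                    (unsigned long long)start,
                    (unsigned long long)t[i].offset);
                start = UINT64_MAX;
            }
        }
        return (0);
    }

With minref = 2 this prints the intersection [50, 100); with minref = 1 it would print the union [0, 150), mirroring what space_map_ref_generate_map() does over its AVL tree.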
All rights reserved. ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef _KERNEL ++#include ++#include ++#endif ++ ++/* ++ * Pool configuration repository. ++ * ++ * Pool configuration is stored as a packed nvlist on the filesystem. By ++ * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot ++ * (when the ZFS module is loaded). Pools can also have the 'cachefile' ++ * property set that allows them to be stored in an alternate location until ++ * the control of external software. ++ * ++ * For each cache file, we have a single nvlist which holds all the ++ * configuration information. When the module loads, we read this information ++ * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is ++ * maintained independently in spa.c. Whenever the namespace is modified, or ++ * the configuration of a pool is changed, we call spa_config_sync(), which ++ * walks through all the active pools and writes the configuration to disk. ++ */ ++ ++static uint64_t spa_config_generation = 1; ++ ++/* ++ * This can be overridden in userland to preserve an alternate namespace for ++ * userland pools when doing testing. ++ */ ++char *spa_config_path = ZPOOL_CACHE; ++ ++/* ++ * Called when the module is first loaded, this routine loads the configuration ++ * file into the SPA namespace. It does not actually open or load the pools; it ++ * only populates the namespace. ++ */ ++void ++spa_config_load(void) ++{ ++ void *buf = NULL; ++ nvlist_t *nvlist, *child; ++ nvpair_t *nvpair; ++ char *pathname; ++ struct _buf *file; ++ uint64_t fsize; ++ ++ /* ++ * Open the configuration file. ++ */ ++ pathname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); ++ ++ (void) snprintf(pathname, MAXPATHLEN, "%s%s", ++ (rootdir != NULL) ? "./" : "", spa_config_path); ++ ++ file = kobj_open_file(pathname); ++ ++ kmem_free(pathname, MAXPATHLEN); ++ ++ if (file == (struct _buf *)-1) ++ return; ++ ++ if (kobj_get_filesize(file, &fsize) != 0) ++ goto out; ++ ++ buf = kmem_alloc(fsize, KM_PUSHPAGE | KM_NODEBUG); ++ ++ /* ++ * Read the nvlist from the file. ++ */ ++ if (kobj_read_file(file, buf, fsize, 0) < 0) ++ goto out; ++ ++ /* ++ * Unpack the nvlist. ++ */ ++ if (nvlist_unpack(buf, fsize, &nvlist, KM_PUSHPAGE) != 0) ++ goto out; ++ ++ /* ++ * Iterate over all elements in the nvlist, creating a new spa_t for ++ * each one with the specified configuration. ++ */ ++ mutex_enter(&spa_namespace_lock); ++ nvpair = NULL; ++ while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { ++ if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) ++ continue; ++ ++ VERIFY(nvpair_value_nvlist(nvpair, &child) == 0); ++ ++ if (spa_lookup(nvpair_name(nvpair)) != NULL) ++ continue; ++ (void) spa_add(nvpair_name(nvpair), child, NULL); ++ } ++ mutex_exit(&spa_namespace_lock); ++ ++ nvlist_free(nvlist); ++ ++out: ++ if (buf != NULL) ++ kmem_free(buf, fsize); ++ ++ kobj_close_file(file); ++} ++ ++static void ++spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) ++{ ++ size_t buflen; ++ char *buf; ++ vnode_t *vp; ++ int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; ++ char *temp; ++ ++ /* ++ * If the nvlist is empty (NULL), then remove the old cachefile. ++ */ ++ if (nvl == NULL) { ++ (void) vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); ++ return; ++ } ++ ++ /* ++ * Pack the configuration into a buffer. 
++ */ ++ VERIFY(nvlist_size(nvl, &buflen, NV_ENCODE_XDR) == 0); ++ ++ buf = kmem_alloc(buflen, KM_PUSHPAGE | KM_NODEBUG); ++ temp = kmem_zalloc(MAXPATHLEN, KM_PUSHPAGE); ++ ++ VERIFY(nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR, ++ KM_PUSHPAGE) == 0); ++ ++ /* ++ * Write the configuration to disk. We need to do the traditional ++ * 'write to temporary file, sync, move over original' to make sure we ++ * always have a consistent view of the data. ++ */ ++ (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path); ++ ++ if (vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) == 0) { ++ if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, ++ 0, RLIM64_INFINITY, kcred, NULL) == 0 && ++ VOP_FSYNC(vp, FSYNC, kcred, NULL) == 0) { ++ (void) vn_rename(temp, dp->scd_path, UIO_SYSSPACE); ++ } ++ (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); ++ } ++ ++ (void) vn_remove(temp, UIO_SYSSPACE, RMFILE); ++ ++ kmem_free(buf, buflen); ++ kmem_free(temp, MAXPATHLEN); ++} ++ ++/* ++ * Synchronize pool configuration to disk. This must be called with the ++ * namespace lock held. ++ */ ++void ++spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) ++{ ++ spa_config_dirent_t *dp, *tdp; ++ nvlist_t *nvl; ++ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ++ if (rootdir == NULL || !(spa_mode_global & FWRITE)) ++ return; ++ ++ /* ++ * Iterate over all cachefiles for the pool, past or present. When the ++ * cachefile is changed, the new one is pushed onto this list, allowing ++ * us to update previous cachefiles that no longer contain this pool. ++ */ ++ for (dp = list_head(&target->spa_config_list); dp != NULL; ++ dp = list_next(&target->spa_config_list, dp)) { ++ spa_t *spa = NULL; ++ if (dp->scd_path == NULL) ++ continue; ++ ++ /* ++ * Iterate over all pools, adding any matching pools to 'nvl'. ++ */ ++ nvl = NULL; ++ while ((spa = spa_next(spa)) != NULL) { ++ if (spa == target && removing) ++ continue; ++ ++ mutex_enter(&spa->spa_props_lock); ++ tdp = list_head(&spa->spa_config_list); ++ if (spa->spa_config == NULL || ++ tdp->scd_path == NULL || ++ strcmp(tdp->scd_path, dp->scd_path) != 0) { ++ mutex_exit(&spa->spa_props_lock); ++ continue; ++ } ++ ++ if (nvl == NULL) ++ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, ++ KM_PUSHPAGE) == 0); ++ ++ VERIFY(nvlist_add_nvlist(nvl, spa->spa_name, ++ spa->spa_config) == 0); ++ mutex_exit(&spa->spa_props_lock); ++ } ++ ++ spa_config_write(dp, nvl); ++ nvlist_free(nvl); ++ } ++ ++ /* ++ * Remove any config entries older than the current one. ++ */ ++ dp = list_head(&target->spa_config_list); ++ while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { ++ list_remove(&target->spa_config_list, tdp); ++ if (tdp->scd_path != NULL) ++ spa_strfree(tdp->scd_path); ++ kmem_free(tdp, sizeof (spa_config_dirent_t)); ++ } ++ ++ spa_config_generation++; ++ ++ if (postsysevent) ++ spa_event_notify(target, NULL, FM_EREPORT_ZFS_CONFIG_SYNC); ++} ++ ++/* ++ * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, ++ * and we don't want to allow the local zone to see all the pools anyway. ++ * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration ++ * information for all pool visible within the zone. 
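spa_config_write() above relies on the classic write-to-a-temporary-file, fsync, rename-over-the-original sequence, so a crash can never leave a half-written cachefile behind. The following self-contained user-space sketch shows the same pattern with plain POSIX calls; the file name and buffer are illustrative, not the real zpool.cache handling.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Atomically replace 'path' with 'len' bytes from 'buf'. */
    static int write_config_atomic(const char *path, const void *buf, size_t len)
    {
        char temp[4096];
        int fd;

        /* Same idea as spa_config_write(): "<path>.tmp", sync, rename. */
        snprintf(temp, sizeof (temp), "%s.tmp", path);

        fd = open(temp, O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (fd < 0)
            return (-1);

        if (write(fd, buf, len) != (ssize_t)len || fsync(fd) != 0) {
            close(fd);
            unlink(temp);
            return (-1);
        }
        close(fd);

        /* rename() is atomic: readers see the old or the new file, never a mix. */
        if (rename(temp, path) != 0) {
            unlink(temp);
            return (-1);
        }
        return (0);
    }

    int main(void)
    {
        const char *cfg = "example cachefile contents\n";
        return (write_config_atomic("zpool.cache.example", cfg, strlen(cfg)));
    }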
++ */ ++nvlist_t * ++spa_all_configs(uint64_t *generation) ++{ ++ nvlist_t *pools; ++ spa_t *spa = NULL; ++ ++ if (*generation == spa_config_generation) ++ return (NULL); ++ ++ VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ ++ mutex_enter(&spa_namespace_lock); ++ while ((spa = spa_next(spa)) != NULL) { ++ if (INGLOBALZONE(curproc) || ++ zone_dataset_visible(spa_name(spa), NULL)) { ++ mutex_enter(&spa->spa_props_lock); ++ VERIFY(nvlist_add_nvlist(pools, spa_name(spa), ++ spa->spa_config) == 0); ++ mutex_exit(&spa->spa_props_lock); ++ } ++ } ++ *generation = spa_config_generation; ++ mutex_exit(&spa_namespace_lock); ++ ++ return (pools); ++} ++ ++void ++spa_config_set(spa_t *spa, nvlist_t *config) ++{ ++ mutex_enter(&spa->spa_props_lock); ++ if (spa->spa_config != NULL) ++ nvlist_free(spa->spa_config); ++ spa->spa_config = config; ++ mutex_exit(&spa->spa_props_lock); ++} ++ ++/* ++ * Generate the pool's configuration based on the current in-core state. ++ * We infer whether to generate a complete config or just one top-level config ++ * based on whether vd is the root vdev. ++ */ ++nvlist_t * ++spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) ++{ ++ nvlist_t *config, *nvroot; ++ vdev_t *rvd = spa->spa_root_vdev; ++ unsigned long hostid = 0; ++ boolean_t locked = B_FALSE; ++ uint64_t split_guid; ++ ++ if (vd == NULL) { ++ vd = rvd; ++ locked = B_TRUE; ++ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); ++ } ++ ++ ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == ++ (SCL_CONFIG | SCL_STATE)); ++ ++ /* ++ * If txg is -1, report the current value of spa->spa_config_txg. ++ */ ++ if (txg == -1ULL) ++ txg = spa->spa_config_txg; ++ ++ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, ++ spa_version(spa)) == 0); ++ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, ++ spa_name(spa)) == 0); ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, ++ spa_state(spa)) == 0); ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, ++ txg) == 0); ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, ++ spa_guid(spa)) == 0); ++ VERIFY(spa->spa_comment == NULL || nvlist_add_string(config, ++ ZPOOL_CONFIG_COMMENT, spa->spa_comment) == 0); ++ ++ ++#ifdef _KERNEL ++ hostid = zone_get_hostid(NULL); ++#else /* _KERNEL */ ++ /* ++ * We're emulating the system's hostid in userland, so we can't use ++ * zone_get_hostid(). ++ */ ++ (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); ++#endif /* _KERNEL */ ++ if (hostid != 0) { ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, ++ hostid) == 0); ++ } ++ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, ++ utsname.nodename) == 0); ++ ++ if (vd != rvd) { ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, ++ vd->vdev_top->vdev_guid) == 0); ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID, ++ vd->vdev_guid) == 0); ++ if (vd->vdev_isspare) ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, ++ 1ULL) == 0); ++ if (vd->vdev_islog) ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, ++ 1ULL) == 0); ++ vd = vd->vdev_top; /* label contains top config */ ++ } else { ++ /* ++ * Only add the (potentially large) split information ++ * in the mos config, and not in the vdev labels ++ */ ++ if (spa->spa_config_splitting != NULL) ++ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, ++ spa->spa_config_splitting) == 0); ++ } ++ ++ /* ++ * Add the top-level config. 
We even add this on pools which ++ * don't support holes in the namespace. ++ */ ++ vdev_top_config_generate(spa, config); ++ ++ /* ++ * If we're splitting, record the original pool's guid. ++ */ ++ if (spa->spa_config_splitting != NULL && ++ nvlist_lookup_uint64(spa->spa_config_splitting, ++ ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, ++ split_guid) == 0); ++ } ++ ++ nvroot = vdev_config_generate(spa, vd, getstats, 0); ++ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); ++ nvlist_free(nvroot); ++ ++ if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { ++ ddt_histogram_t *ddh; ++ ddt_stat_t *dds; ++ ddt_object_t *ddo; ++ ++ ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_PUSHPAGE); ++ ddt_get_dedup_histogram(spa, ddh); ++ VERIFY(nvlist_add_uint64_array(config, ++ ZPOOL_CONFIG_DDT_HISTOGRAM, ++ (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)) == 0); ++ kmem_free(ddh, sizeof (ddt_histogram_t)); ++ ++ ddo = kmem_zalloc(sizeof (ddt_object_t), KM_PUSHPAGE); ++ ddt_get_dedup_object_stats(spa, ddo); ++ VERIFY(nvlist_add_uint64_array(config, ++ ZPOOL_CONFIG_DDT_OBJ_STATS, ++ (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)) == 0); ++ kmem_free(ddo, sizeof (ddt_object_t)); ++ ++ dds = kmem_zalloc(sizeof (ddt_stat_t), KM_PUSHPAGE); ++ ddt_get_dedup_stats(spa, dds); ++ VERIFY(nvlist_add_uint64_array(config, ++ ZPOOL_CONFIG_DDT_STATS, ++ (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)) == 0); ++ kmem_free(dds, sizeof (ddt_stat_t)); ++ } ++ ++ if (locked) ++ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); ++ ++ return (config); ++} ++ ++/* ++ * Update all disk labels, generate a fresh config based on the current ++ * in-core state, and sync the global config cache (do not sync the config ++ * cache if this is a booting rootpool). ++ */ ++void ++spa_config_update(spa_t *spa, int what) ++{ ++ vdev_t *rvd = spa->spa_root_vdev; ++ uint64_t txg; ++ int c; ++ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ txg = spa_last_synced_txg(spa) + 1; ++ if (what == SPA_CONFIG_UPDATE_POOL) { ++ vdev_config_dirty(rvd); ++ } else { ++ /* ++ * If we have top-level vdevs that were added but have ++ * not yet been prepared for allocation, do that now. ++ * (It's safe now because the config cache is up to date, ++ * so it will be able to translate the new DVAs.) ++ * See comments in spa_vdev_add() for full details. ++ */ ++ for (c = 0; c < rvd->vdev_children; c++) { ++ vdev_t *tvd = rvd->vdev_child[c]; ++ if (tvd->vdev_ms_array == 0) ++ vdev_metaslab_set_size(tvd); ++ vdev_expand(tvd, txg); ++ } ++ } ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ ++ /* ++ * Wait for the mosconfig to be regenerated and synced. ++ */ ++ txg_wait_synced(spa->spa_dsl_pool, txg); ++ ++ /* ++ * Update the global config cache to reflect the new mosconfig. 
++ */ ++ if (!spa->spa_is_root) ++ spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); ++ ++ if (what == SPA_CONFIG_UPDATE_POOL) ++ spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(spa_config_sync); ++EXPORT_SYMBOL(spa_config_load); ++EXPORT_SYMBOL(spa_all_configs); ++EXPORT_SYMBOL(spa_config_set); ++EXPORT_SYMBOL(spa_config_generate); ++EXPORT_SYMBOL(spa_config_update); ++ ++module_param(spa_config_path, charp, 0444); ++MODULE_PARM_DESC(spa_config_path, "SPA config file (/etc/zfs/zpool.cache)"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/spa_errlog.c linux-3.2.33-go/fs/zfs/zfs/spa_errlog.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/spa_errlog.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/spa_errlog.c 2012-11-16 23:25:34.348039346 +0100 +@@ -0,0 +1,414 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* ++ * Routines to manage the on-disk persistent error log. ++ * ++ * Each pool stores a log of all logical data errors seen during normal ++ * operation. This is actually the union of two distinct logs: the last log, ++ * and the current log. All errors seen are logged to the current log. When a ++ * scrub completes, the current log becomes the last log, the last log is thrown ++ * out, and the current log is reinitialized. This way, if an error is somehow ++ * corrected, a new scrub will show that that it no longer exists, and will be ++ * deleted from the log when the scrub completes. ++ * ++ * The log is stored using a ZAP object whose key is a string form of the ++ * zbookmark tuple (objset, object, level, blkid), and whose contents is an ++ * optional 'objset:object' human-readable string describing the data. When an ++ * error is first logged, this string will be empty, indicating that no name is ++ * known. This prevents us from having to issue a potentially large amount of ++ * I/O to discover the object name during an error path. Instead, we do the ++ * calculation when the data is requested, storing the result so future queries ++ * will be faster. ++ * ++ * This log is then shipped into an nvlist where the key is the dataset name and ++ * the value is the object name. Userland is then responsible for uniquifying ++ * this list and displaying it to the user. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * Convert a bookmark to a string. 
++ */ ++static void ++bookmark_to_name(zbookmark_t *zb, char *buf, size_t len) ++{ ++ (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", ++ (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, ++ (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid); ++} ++ ++/* ++ * Convert a string to a bookmark ++ */ ++#ifdef _KERNEL ++static void ++name_to_bookmark(char *buf, zbookmark_t *zb) ++{ ++ zb->zb_objset = strtonum(buf, &buf); ++ ASSERT(*buf == ':'); ++ zb->zb_object = strtonum(buf + 1, &buf); ++ ASSERT(*buf == ':'); ++ zb->zb_level = (int)strtonum(buf + 1, &buf); ++ ASSERT(*buf == ':'); ++ zb->zb_blkid = strtonum(buf + 1, &buf); ++ ASSERT(*buf == '\0'); ++} ++#endif ++ ++/* ++ * Log an uncorrectable error to the persistent error log. We add it to the ++ * spa's list of pending errors. The changes are actually synced out to disk ++ * during spa_errlog_sync(). ++ */ ++void ++spa_log_error(spa_t *spa, zio_t *zio) ++{ ++ zbookmark_t *zb = &zio->io_logical->io_bookmark; ++ spa_error_entry_t search; ++ spa_error_entry_t *new; ++ avl_tree_t *tree; ++ avl_index_t where; ++ ++ /* ++ * If we are trying to import a pool, ignore any errors, as we won't be ++ * writing to the pool any time soon. ++ */ ++ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) ++ return; ++ ++ mutex_enter(&spa->spa_errlist_lock); ++ ++ /* ++ * If we have had a request to rotate the log, log it to the next list ++ * instead of the current one. ++ */ ++ if (spa->spa_scrub_active || spa->spa_scrub_finished) ++ tree = &spa->spa_errlist_scrub; ++ else ++ tree = &spa->spa_errlist_last; ++ ++ search.se_bookmark = *zb; ++ if (avl_find(tree, &search, &where) != NULL) { ++ mutex_exit(&spa->spa_errlist_lock); ++ return; ++ } ++ ++ new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); ++ new->se_bookmark = *zb; ++ avl_insert(tree, new, where); ++ ++ mutex_exit(&spa->spa_errlist_lock); ++} ++ ++/* ++ * Return the number of errors currently in the error log. This is actually the ++ * sum of both the last log and the current log, since we don't know the union ++ * of these logs until we reach userland. 
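bookmark_to_name() and name_to_bookmark() above round-trip the error-log ZAP key through an "objset:object:level:blkid" hex string. A standalone sketch of the same round trip follows, with strtoull() standing in for the kernel's strtonum() and a simplified stand-in struct rather than the real zbookmark_t.

    #include <assert.h>
    #include <inttypes.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for zbookmark_t: (objset, object, level, blkid). */
    typedef struct {
        uint64_t zb_objset;
        uint64_t zb_object;
        int64_t  zb_level;
        uint64_t zb_blkid;
    } bookmark_t;

    static void bookmark_to_name(const bookmark_t *zb, char *buf, size_t len)
    {
        /* Same colon-separated hex layout the error log uses as its ZAP key. */
        snprintf(buf, len, "%" PRIx64 ":%" PRIx64 ":%" PRIx64 ":%" PRIx64,
            zb->zb_objset, zb->zb_object, (uint64_t)zb->zb_level, zb->zb_blkid);
    }

    static void name_to_bookmark(const char *buf, bookmark_t *zb)
    {
        char *end;

        zb->zb_objset = strtoull(buf, &end, 16);
        assert(*end == ':');
        zb->zb_object = strtoull(end + 1, &end, 16);
        assert(*end == ':');
        zb->zb_level = (int64_t)strtoull(end + 1, &end, 16);
        assert(*end == ':');
        zb->zb_blkid = strtoull(end + 1, &end, 16);
        assert(*end == '\0');
    }

    int main(void)
    {
        bookmark_t in = { 21, 7, 0, 4096 }, out;
        char name[64];

        bookmark_to_name(&in, name, sizeof (name));
        name_to_bookmark(name, &out);
        printf("%s -> objset=%" PRIu64 " object=%" PRIu64 "\n",
            name, out.zb_objset, out.zb_object);
        return (0);
    }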
++ */ ++uint64_t ++spa_get_errlog_size(spa_t *spa) ++{ ++ uint64_t total = 0, count; ++ ++ mutex_enter(&spa->spa_errlog_lock); ++ if (spa->spa_errlog_scrub != 0 && ++ zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, ++ &count) == 0) ++ total += count; ++ ++ if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && ++ zap_count(spa->spa_meta_objset, spa->spa_errlog_last, ++ &count) == 0) ++ total += count; ++ mutex_exit(&spa->spa_errlog_lock); ++ ++ mutex_enter(&spa->spa_errlist_lock); ++ total += avl_numnodes(&spa->spa_errlist_last); ++ total += avl_numnodes(&spa->spa_errlist_scrub); ++ mutex_exit(&spa->spa_errlist_lock); ++ ++ return (total); ++} ++ ++#ifdef _KERNEL ++static int ++process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ zbookmark_t zb; ++ ++ if (obj == 0) ++ return (0); ++ ++ for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ zap_cursor_advance(&zc)) { ++ ++ if (*count == 0) { ++ zap_cursor_fini(&zc); ++ return (ENOMEM); ++ } ++ ++ name_to_bookmark(za.za_name, &zb); ++ ++ if (copyout(&zb, (char *)addr + ++ (*count - 1) * sizeof (zbookmark_t), ++ sizeof (zbookmark_t)) != 0) ++ return (EFAULT); ++ ++ *count -= 1; ++ } ++ ++ zap_cursor_fini(&zc); ++ ++ return (0); ++} ++ ++static int ++process_error_list(avl_tree_t *list, void *addr, size_t *count) ++{ ++ spa_error_entry_t *se; ++ ++ for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { ++ ++ if (*count == 0) ++ return (ENOMEM); ++ ++ if (copyout(&se->se_bookmark, (char *)addr + ++ (*count - 1) * sizeof (zbookmark_t), ++ sizeof (zbookmark_t)) != 0) ++ return (EFAULT); ++ ++ *count -= 1; ++ } ++ ++ return (0); ++} ++#endif ++ ++/* ++ * Copy all known errors to userland as an array of bookmarks. This is ++ * actually a union of the on-disk last log and current log, as well as any ++ * pending error requests. ++ * ++ * Because the act of reading the on-disk log could cause errors to be ++ * generated, we have two separate locks: one for the error log and one for the ++ * in-core error lists. We only need the error list lock to log and error, so ++ * we grab the error log lock while we read the on-disk logs, and only pick up ++ * the error list lock when we are finished. ++ */ ++int ++spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) ++{ ++ int ret = 0; ++ ++#ifdef _KERNEL ++ mutex_enter(&spa->spa_errlog_lock); ++ ++ ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); ++ ++ if (!ret && !spa->spa_scrub_finished) ++ ret = process_error_log(spa, spa->spa_errlog_last, uaddr, ++ count); ++ ++ mutex_enter(&spa->spa_errlist_lock); ++ if (!ret) ++ ret = process_error_list(&spa->spa_errlist_scrub, uaddr, ++ count); ++ if (!ret) ++ ret = process_error_list(&spa->spa_errlist_last, uaddr, ++ count); ++ mutex_exit(&spa->spa_errlist_lock); ++ ++ mutex_exit(&spa->spa_errlog_lock); ++#endif ++ ++ return (ret); ++} ++ ++/* ++ * Called when a scrub completes. This simply set a bit which tells which AVL ++ * tree to add new errors. spa_errlog_sync() is responsible for actually ++ * syncing the changes to the underlying objects. ++ */ ++void ++spa_errlog_rotate(spa_t *spa) ++{ ++ mutex_enter(&spa->spa_errlist_lock); ++ spa->spa_scrub_finished = B_TRUE; ++ mutex_exit(&spa->spa_errlist_lock); ++} ++ ++/* ++ * Discard any pending errors from the spa_t. Called when unloading a faulted ++ * pool, as the errors encountered during the open cannot be synced to disk. 
++ */ ++void ++spa_errlog_drain(spa_t *spa) ++{ ++ spa_error_entry_t *se; ++ void *cookie; ++ ++ mutex_enter(&spa->spa_errlist_lock); ++ ++ cookie = NULL; ++ while ((se = avl_destroy_nodes(&spa->spa_errlist_last, ++ &cookie)) != NULL) ++ kmem_free(se, sizeof (spa_error_entry_t)); ++ cookie = NULL; ++ while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, ++ &cookie)) != NULL) ++ kmem_free(se, sizeof (spa_error_entry_t)); ++ ++ mutex_exit(&spa->spa_errlist_lock); ++} ++ ++/* ++ * Process a list of errors into the current on-disk log. ++ */ ++static void ++sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) ++{ ++ spa_error_entry_t *se; ++ char buf[64]; ++ void *cookie; ++ ++ if (avl_numnodes(t) != 0) { ++ /* create log if necessary */ ++ if (*obj == 0) ++ *obj = zap_create(spa->spa_meta_objset, ++ DMU_OT_ERROR_LOG, DMU_OT_NONE, ++ 0, tx); ++ ++ /* add errors to the current log */ ++ for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { ++ char *name = se->se_name ? se->se_name : ""; ++ ++ bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); ++ ++ (void) zap_update(spa->spa_meta_objset, ++ *obj, buf, 1, strlen(name) + 1, name, tx); ++ } ++ ++ /* purge the error list */ ++ cookie = NULL; ++ while ((se = avl_destroy_nodes(t, &cookie)) != NULL) ++ kmem_free(se, sizeof (spa_error_entry_t)); ++ } ++} ++ ++/* ++ * Sync the error log out to disk. This is a little tricky because the act of ++ * writing the error log requires the spa_errlist_lock. So, we need to lock the ++ * error lists, take a copy of the lists, and then reinitialize them. Then, we ++ * drop the error list lock and take the error log lock, at which point we ++ * do the errlog processing. Then, if we encounter an I/O error during this ++ * process, we can successfully add the error to the list. Note that this will ++ * result in the perpetual recycling of errors, but it is an unlikely situation ++ * and not a performance critical operation. ++ */ ++void ++spa_errlog_sync(spa_t *spa, uint64_t txg) ++{ ++ dmu_tx_t *tx; ++ avl_tree_t scrub, last; ++ int scrub_finished; ++ ++ mutex_enter(&spa->spa_errlist_lock); ++ ++ /* ++ * Bail out early under normal circumstances. ++ */ ++ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && ++ avl_numnodes(&spa->spa_errlist_last) == 0 && ++ !spa->spa_scrub_finished) { ++ mutex_exit(&spa->spa_errlist_lock); ++ return; ++ } ++ ++ spa_get_errlists(spa, &last, &scrub); ++ scrub_finished = spa->spa_scrub_finished; ++ spa->spa_scrub_finished = B_FALSE; ++ ++ mutex_exit(&spa->spa_errlist_lock); ++ mutex_enter(&spa->spa_errlog_lock); ++ ++ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); ++ ++ /* ++ * Sync out the current list of errors. ++ */ ++ sync_error_list(spa, &last, &spa->spa_errlog_last, tx); ++ ++ /* ++ * Rotate the log if necessary. ++ */ ++ if (scrub_finished) { ++ if (spa->spa_errlog_last != 0) ++ VERIFY(dmu_object_free(spa->spa_meta_objset, ++ spa->spa_errlog_last, tx) == 0); ++ spa->spa_errlog_last = spa->spa_errlog_scrub; ++ spa->spa_errlog_scrub = 0; ++ ++ sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); ++ } ++ ++ /* ++ * Sync out any pending scrub errors. ++ */ ++ sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); ++ ++ /* ++ * Update the MOS to reflect the new values. 
++ */ ++ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, ++ DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, ++ &spa->spa_errlog_last, tx); ++ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, ++ DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, ++ &spa->spa_errlog_scrub, tx); ++ ++ dmu_tx_commit(tx); ++ ++ mutex_exit(&spa->spa_errlog_lock); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++/* error handling */ ++EXPORT_SYMBOL(spa_log_error); ++EXPORT_SYMBOL(spa_get_errlog_size); ++EXPORT_SYMBOL(spa_get_errlog); ++EXPORT_SYMBOL(spa_errlog_rotate); ++EXPORT_SYMBOL(spa_errlog_drain); ++EXPORT_SYMBOL(spa_errlog_sync); ++EXPORT_SYMBOL(spa_get_errlists); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/spa_history.c linux-3.2.33-go/fs/zfs/zfs/spa_history.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/spa_history.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/spa_history.c 2012-11-16 23:25:34.348039346 +0100 +@@ -0,0 +1,514 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "zfs_comutil.h" ++#ifdef _KERNEL ++#include ++#endif ++ ++/* ++ * Routines to manage the on-disk history log. ++ * ++ * The history log is stored as a dmu object containing ++ * tuples. ++ * ++ * Where "record nvlist" is a nvlist containing uint64_ts and strings, and ++ * "packed record length" is the packed length of the "record nvlist" stored ++ * as a little endian uint64_t. ++ * ++ * The log is implemented as a ring buffer, though the original creation ++ * of the pool ('zpool create') is never overwritten. ++ * ++ * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer ++ * of 'spa_history' stores the offsets for logging/retrieving history as ++ * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of ++ * where the 'zpool create' record is stored. This allows us to never ++ * overwrite the original creation of the pool. 'sh_phys_max_off' is the ++ * physical ending offset in bytes of the log. This tells you the length of ++ * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record ++ * is added, 'sh_eof' is incremented by the the size of the record. ++ * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes). ++ * This is where the consumer should start reading from after reading in ++ * the 'zpool create' portion of the log. ++ * ++ * 'sh_records_lost' keeps track of how many records have been overwritten ++ * and permanently lost. 
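++ *
++ * As a purely illustrative example of the logical-to-physical mapping
++ * performed by spa_history_log_to_phys() below (the numbers are made up):
++ * with sh_pool_create_len = 1000 and sh_phys_max_off = 11000 the wrappable
++ * region is 10000 bytes, so logical offset 25500 maps to physical offset
++ * ((25500 - 1000) % 10000) + 1000 = 5500, i.e. the log has wrapped around
++ * the physical end of the object twice.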
++ */ ++ ++/* convert a logical offset to physical */ ++static uint64_t ++spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) ++{ ++ uint64_t phys_len; ++ ++ phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len; ++ return ((log_off - shpp->sh_pool_create_len) % phys_len ++ + shpp->sh_pool_create_len); ++} ++ ++void ++spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) ++{ ++ dmu_buf_t *dbp; ++ spa_history_phys_t *shpp; ++ objset_t *mos = spa->spa_meta_objset; ++ ++ ASSERT(spa->spa_history == 0); ++ spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, ++ SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, ++ sizeof (spa_history_phys_t), tx); ++ ++ VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, ++ DMU_POOL_HISTORY, sizeof (uint64_t), 1, ++ &spa->spa_history, tx) == 0); ++ ++ VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); ++ ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); ++ ++ shpp = dbp->db_data; ++ dmu_buf_will_dirty(dbp, tx); ++ ++ /* ++ * Figure out maximum size of history log. We set it at ++ * 0.1% of pool size, with a max of 1G and min of 128KB. ++ */ ++ shpp->sh_phys_max_off = ++ metaslab_class_get_dspace(spa_normal_class(spa)) / 1000; ++ shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30); ++ shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); ++ ++ dmu_buf_rele(dbp, FTAG); ++} ++ ++/* ++ * Change 'sh_bof' to the beginning of the next record. ++ */ ++static int ++spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) ++{ ++ objset_t *mos = spa->spa_meta_objset; ++ uint64_t firstread, reclen, phys_bof; ++ char buf[sizeof (reclen)]; ++ int err; ++ ++ phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp); ++ firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); ++ ++ if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, ++ buf, DMU_READ_PREFETCH)) != 0) ++ return (err); ++ if (firstread != sizeof (reclen)) { ++ if ((err = dmu_read(mos, spa->spa_history, ++ shpp->sh_pool_create_len, sizeof (reclen) - firstread, ++ buf + firstread, DMU_READ_PREFETCH)) != 0) ++ return (err); ++ } ++ ++ reclen = LE_64(*((uint64_t *)buf)); ++ shpp->sh_bof += reclen + sizeof (reclen); ++ shpp->sh_records_lost++; ++ return (0); ++} ++ ++static int ++spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, ++ dmu_tx_t *tx) ++{ ++ uint64_t firstwrite, phys_eof; ++ objset_t *mos = spa->spa_meta_objset; ++ int err; ++ ++ ASSERT(MUTEX_HELD(&spa->spa_history_lock)); ++ ++ /* see if we need to reset logical BOF */ ++ while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - ++ (shpp->sh_eof - shpp->sh_bof) <= len) { ++ if ((err = spa_history_advance_bof(spa, shpp)) != 0) { ++ return (err); ++ } ++ } ++ ++ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); ++ firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); ++ shpp->sh_eof += len; ++ dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); ++ ++ len -= firstwrite; ++ if (len > 0) { ++ /* write out the rest at the beginning of physical file */ ++ dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, ++ len, (char *)buf + firstwrite, tx); ++ } ++ ++ return (0); ++} ++ ++static char * ++spa_history_zone(void) ++{ ++#ifdef _KERNEL ++#ifdef HAVE_SPL ++ return ("linux"); ++#else ++ return (curproc->p_zone->zone_name); ++#endif ++#else ++ return ("global"); ++#endif ++} ++ ++/* ++ * Write out a history event. 
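++ * This routine runs in syncing context (it is dispatched as a DSL sync
++ * task): it packs the event into an XDR-encoded nvlist, then appends the
++ * little-endian packed record length followed by the packed record to the
++ * ring buffer via spa_history_write().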
++ */ ++/*ARGSUSED*/ ++static void ++spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) ++{ ++ spa_t *spa = arg1; ++ history_arg_t *hap = arg2; ++ const char *history_str = hap->ha_history_str; ++ objset_t *mos = spa->spa_meta_objset; ++ dmu_buf_t *dbp; ++ spa_history_phys_t *shpp; ++ size_t reclen; ++ uint64_t le_len; ++ nvlist_t *nvrecord; ++ char *record_packed = NULL; ++ int ret; ++ ++ /* ++ * If we have an older pool that doesn't have a command ++ * history object, create it now. ++ */ ++ mutex_enter(&spa->spa_history_lock); ++ if (!spa->spa_history) ++ spa_history_create_obj(spa, tx); ++ mutex_exit(&spa->spa_history_lock); ++ ++ /* ++ * Get the offset of where we need to write via the bonus buffer. ++ * Update the offset when the write completes. ++ */ ++ VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); ++ shpp = dbp->db_data; ++ ++ dmu_buf_will_dirty(dbp, tx); ++ ++#ifdef ZFS_DEBUG ++ { ++ dmu_object_info_t doi; ++ dmu_object_info_from_db(dbp, &doi); ++ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); ++ } ++#endif ++ ++ VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME, ++ gethrestime_sec()) == 0); ++ VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, hap->ha_uid) == 0); ++ if (hap->ha_zone != NULL) ++ VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE, ++ hap->ha_zone) == 0); ++#ifdef _KERNEL ++ VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST, ++ utsname.nodename) == 0); ++#endif ++ if (hap->ha_log_type == LOG_CMD_POOL_CREATE || ++ hap->ha_log_type == LOG_CMD_NORMAL) { ++ VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, ++ history_str) == 0); ++ ++ zfs_dbgmsg("command: %s", history_str); ++ } else { ++ VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT, ++ hap->ha_event) == 0); ++ VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG, ++ tx->tx_txg) == 0); ++ VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR, ++ history_str) == 0); ++ ++ zfs_dbgmsg("internal %s pool:%s txg:%llu %s", ++ zfs_history_event_names[hap->ha_event], spa_name(spa), ++ (longlong_t)tx->tx_txg, history_str); ++ ++ } ++ ++ VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0); ++ record_packed = kmem_alloc(reclen, KM_PUSHPAGE); ++ ++ VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen, ++ NV_ENCODE_XDR, KM_PUSHPAGE) == 0); ++ ++ mutex_enter(&spa->spa_history_lock); ++ if (hap->ha_log_type == LOG_CMD_POOL_CREATE) ++ VERIFY(shpp->sh_eof == shpp->sh_pool_create_len); ++ ++ /* write out the packed length as little endian */ ++ le_len = LE_64((uint64_t)reclen); ++ ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx); ++ if (!ret) ++ ret = spa_history_write(spa, record_packed, reclen, shpp, tx); ++ ++ if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) { ++ shpp->sh_pool_create_len += sizeof (le_len) + reclen; ++ shpp->sh_bof = shpp->sh_pool_create_len; ++ } ++ ++ mutex_exit(&spa->spa_history_lock); ++ nvlist_free(nvrecord); ++ kmem_free(record_packed, reclen); ++ dmu_buf_rele(dbp, FTAG); ++ ++ strfree(hap->ha_history_str); ++ if (hap->ha_zone != NULL) ++ strfree(hap->ha_zone); ++ kmem_free(hap, sizeof (history_arg_t)); ++} ++ ++/* ++ * Write out a history event. 
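++ * This is the asynchronous entry point for command history (the LOG_CMD_*
++ * event types): it copies the strings into a freshly allocated
++ * history_arg_t and defers the real work to spa_history_log_sync() via
++ * dsl_sync_task_do_nowait(); errors from the deferred sync task are
++ * ignored.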
++ */ ++int ++spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what) ++{ ++ history_arg_t *ha; ++ int err = 0; ++ dmu_tx_t *tx; ++ ++ ASSERT(what != LOG_INTERNAL); ++ ++ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); ++ err = dmu_tx_assign(tx, TXG_WAIT); ++ if (err) { ++ dmu_tx_abort(tx); ++ return (err); ++ } ++ ++ ha = kmem_alloc(sizeof (history_arg_t), KM_PUSHPAGE); ++ ha->ha_history_str = strdup(history_str); ++ ha->ha_zone = strdup(spa_history_zone()); ++ ha->ha_log_type = what; ++ ha->ha_uid = crgetuid(CRED()); ++ ++ /* Kick this off asynchronously; errors are ignored. */ ++ dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, ++ spa_history_log_sync, spa, ha, 0, tx); ++ dmu_tx_commit(tx); ++ ++ /* spa_history_log_sync will free ha and strings */ ++ return (err); ++} ++ ++/* ++ * Read out the command history. ++ */ ++int ++spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) ++{ ++ objset_t *mos = spa->spa_meta_objset; ++ dmu_buf_t *dbp; ++ uint64_t read_len, phys_read_off, phys_eof; ++ uint64_t leftover = 0; ++ spa_history_phys_t *shpp; ++ int err; ++ ++ /* ++ * If the command history doesn't exist (older pool), ++ * that's ok, just return ENOENT. ++ */ ++ if (!spa->spa_history) ++ return (ENOENT); ++ ++ /* ++ * The history is logged asynchronously, so when they request ++ * the first chunk of history, make sure everything has been ++ * synced to disk so that we get it. ++ */ ++ if (*offp == 0 && spa_writeable(spa)) ++ txg_wait_synced(spa_get_dsl(spa), 0); ++ ++ if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) ++ return (err); ++ shpp = dbp->db_data; ++ ++#ifdef ZFS_DEBUG ++ { ++ dmu_object_info_t doi; ++ dmu_object_info_from_db(dbp, &doi); ++ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); ++ } ++#endif ++ ++ mutex_enter(&spa->spa_history_lock); ++ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); ++ ++ if (*offp < shpp->sh_pool_create_len) { ++ /* read in just the zpool create history */ ++ phys_read_off = *offp; ++ read_len = MIN(*len, shpp->sh_pool_create_len - ++ phys_read_off); ++ } else { ++ /* ++ * Need to reset passed in offset to BOF if the passed in ++ * offset has since been overwritten. ++ */ ++ *offp = MAX(*offp, shpp->sh_bof); ++ phys_read_off = spa_history_log_to_phys(*offp, shpp); ++ ++ /* ++ * Read up to the minimum of what the user passed down or ++ * the EOF (physical or logical). If we hit physical EOF, ++ * use 'leftover' to read from the physical BOF. 
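++ *
++ * For example (with made-up numbers): if sh_pool_create_len is 1000,
++ * sh_phys_max_off is 11000, phys_eof is 2000, phys_read_off is 10500
++ * and the caller asked for 800 bytes, then read_len is 500 (the tail
++ * of the physical buffer) and leftover is 300, read starting at
++ * physical offset 1000.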
++ */ ++ if (phys_read_off <= phys_eof) { ++ read_len = MIN(*len, phys_eof - phys_read_off); ++ } else { ++ read_len = MIN(*len, ++ shpp->sh_phys_max_off - phys_read_off); ++ if (phys_read_off + *len > shpp->sh_phys_max_off) { ++ leftover = MIN(*len - read_len, ++ phys_eof - shpp->sh_pool_create_len); ++ } ++ } ++ } ++ ++ /* offset for consumer to use next */ ++ *offp += read_len + leftover; ++ ++ /* tell the consumer how much you actually read */ ++ *len = read_len + leftover; ++ ++ if (read_len == 0) { ++ mutex_exit(&spa->spa_history_lock); ++ dmu_buf_rele(dbp, FTAG); ++ return (0); ++ } ++ ++ err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, ++ DMU_READ_PREFETCH); ++ if (leftover && err == 0) { ++ err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, ++ leftover, buf + read_len, DMU_READ_PREFETCH); ++ } ++ mutex_exit(&spa->spa_history_lock); ++ ++ dmu_buf_rele(dbp, FTAG); ++ return (err); ++} ++ ++static void ++log_internal(history_internal_events_t event, spa_t *spa, ++ dmu_tx_t *tx, const char *fmt, va_list adx) ++{ ++ history_arg_t *ha; ++ va_list adx_copy; ++ ++ /* ++ * If this is part of creating a pool, not everything is ++ * initialized yet, so don't bother logging the internal events. ++ */ ++ if (tx->tx_txg == TXG_INITIAL) ++ return; ++ ++ ha = kmem_alloc(sizeof (history_arg_t), KM_PUSHPAGE); ++ va_copy(adx_copy, adx); ++ ha->ha_history_str = kmem_vasprintf(fmt, adx_copy); ++ va_end(adx_copy); ++ ha->ha_log_type = LOG_INTERNAL; ++ ha->ha_event = event; ++ ha->ha_zone = NULL; ++ ha->ha_uid = 0; ++ ++ if (dmu_tx_is_syncing(tx)) { ++ spa_history_log_sync(spa, ha, tx); ++ } else { ++ dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, ++ spa_history_log_sync, spa, ha, 0, tx); ++ } ++ /* spa_history_log_sync() will free ha and strings */ ++} ++ ++void ++spa_history_log_internal(history_internal_events_t event, spa_t *spa, ++ dmu_tx_t *tx, const char *fmt, ...) ++{ ++ dmu_tx_t *htx = tx; ++ va_list adx; ++ ++ /* create a tx if we didn't get one */ ++ if (tx == NULL) { ++ htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); ++ if (dmu_tx_assign(htx, TXG_WAIT) != 0) { ++ dmu_tx_abort(htx); ++ return; ++ } ++ } ++ ++ va_start(adx, fmt); ++ log_internal(event, spa, htx, fmt, adx); ++ va_end(adx); ++ ++ /* if we didn't get a tx from the caller, commit the one we made */ ++ if (tx == NULL) ++ dmu_tx_commit(htx); ++} ++ ++void ++spa_history_log_version(spa_t *spa, history_internal_events_t event) ++{ ++#ifdef _KERNEL ++ uint64_t current_vers = spa_version(spa); ++ ++ if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) { ++ spa_history_log_internal(event, spa, NULL, ++ "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s", ++ (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, ++ utsname.nodename, utsname.release, utsname.version, ++ utsname.machine); ++ } ++ cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", ++ event == LOG_POOL_IMPORT ? "imported" : ++ event == LOG_POOL_CREATE ? 
"created" : "accessed", ++ (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION); ++#endif ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(spa_history_create_obj); ++EXPORT_SYMBOL(spa_history_get); ++EXPORT_SYMBOL(spa_history_log); ++EXPORT_SYMBOL(spa_history_log_internal); ++EXPORT_SYMBOL(spa_history_log_version); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/spa_misc.c linux-3.2.33-go/fs/zfs/zfs/spa_misc.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/spa_misc.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/spa_misc.c 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,1755 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "zfs_prop.h" ++ ++/* ++ * SPA locking ++ * ++ * There are four basic locks for managing spa_t structures: ++ * ++ * spa_namespace_lock (global mutex) ++ * ++ * This lock must be acquired to do any of the following: ++ * ++ * - Lookup a spa_t by name ++ * - Add or remove a spa_t from the namespace ++ * - Increase spa_refcount from non-zero ++ * - Check if spa_refcount is zero ++ * - Rename a spa_t ++ * - add/remove/attach/detach devices ++ * - Held for the duration of create/destroy/import/export ++ * ++ * It does not need to handle recursion. A create or destroy may ++ * reference objects (files or zvols) in other pools, but by ++ * definition they must have an existing reference, and will never need ++ * to lookup a spa_t by name. ++ * ++ * spa_refcount (per-spa refcount_t protected by mutex) ++ * ++ * This reference count keep track of any active users of the spa_t. The ++ * spa_t cannot be destroyed or freed while this is non-zero. Internally, ++ * the refcount is never really 'zero' - opening a pool implicitly keeps ++ * some references in the DMU. Internally we check against spa_minref, but ++ * present the image of a zero/non-zero value to consumers. 
++ * ++ * spa_config_lock[] (per-spa array of rwlocks) ++ * ++ * This protects the spa_t from config changes, and must be held in ++ * the following circumstances: ++ * ++ * - RW_READER to perform I/O to the spa ++ * - RW_WRITER to change the vdev config ++ * ++ * The locking order is fairly straightforward: ++ * ++ * spa_namespace_lock -> spa_refcount ++ * ++ * The namespace lock must be acquired to increase the refcount from 0 ++ * or to check if it is zero. ++ * ++ * spa_refcount -> spa_config_lock[] ++ * ++ * There must be at least one valid reference on the spa_t to acquire ++ * the config lock. ++ * ++ * spa_namespace_lock -> spa_config_lock[] ++ * ++ * The namespace lock must always be taken before the config lock. ++ * ++ * ++ * The spa_namespace_lock can be acquired directly and is globally visible. ++ * ++ * The namespace is manipulated using the following functions, all of which ++ * require the spa_namespace_lock to be held. ++ * ++ * spa_lookup() Lookup a spa_t by name. ++ * ++ * spa_add() Create a new spa_t in the namespace. ++ * ++ * spa_remove() Remove a spa_t from the namespace. This also ++ * frees up any memory associated with the spa_t. ++ * ++ * spa_next() Returns the next spa_t in the system, or the ++ * first if NULL is passed. ++ * ++ * spa_evict_all() Shutdown and remove all spa_t structures in ++ * the system. ++ * ++ * spa_guid_exists() Determine whether a pool/device guid exists. ++ * ++ * The spa_refcount is manipulated using the following functions: ++ * ++ * spa_open_ref() Adds a reference to the given spa_t. Must be ++ * called with spa_namespace_lock held if the ++ * refcount is currently zero. ++ * ++ * spa_close() Remove a reference from the spa_t. This will ++ * not free the spa_t or remove it from the ++ * namespace. No locking is required. ++ * ++ * spa_refcount_zero() Returns true if the refcount is currently ++ * zero. Must be called with spa_namespace_lock ++ * held. ++ * ++ * The spa_config_lock[] is an array of rwlocks, ordered as follows: ++ * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. ++ * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). ++ * ++ * To read the configuration, it suffices to hold one of these locks as reader. ++ * To modify the configuration, you must hold all locks as writer. To modify ++ * vdev state without altering the vdev tree's topology (e.g. online/offline), ++ * you must hold SCL_STATE and SCL_ZIO as writer. ++ * ++ * We use these distinct config locks to avoid recursive lock entry. ++ * For example, spa_sync() (which holds SCL_CONFIG as reader) induces ++ * block allocations (SCL_ALLOC), which may require reading space maps ++ * from disk (dmu_read() -> zio_read() -> SCL_ZIO). ++ * ++ * The spa config locks cannot be normal rwlocks because we need the ++ * ability to hand off ownership. For example, SCL_ZIO is acquired ++ * by the issuing thread and later released by an interrupt thread. ++ * They do, however, obey the usual write-wanted semantics to prevent ++ * writer (i.e. system administrator) starvation. ++ * ++ * The lock acquisition rules are as follows: ++ * ++ * SCL_CONFIG ++ * Protects changes to the vdev tree topology, such as vdev ++ * add/remove/attach/detach. Protects the dirty config list ++ * (spa_config_dirty_list) and the set of spares and l2arc devices. ++ * ++ * SCL_STATE ++ * Protects changes to pool state and vdev state, such as vdev ++ * online/offline/fault/degrade/clear. 
Protects the dirty state list ++ * (spa_state_dirty_list) and global pool state (spa_state). ++ * ++ * SCL_ALLOC ++ * Protects changes to metaslab groups and classes. ++ * Held as reader by metaslab_alloc() and metaslab_claim(). ++ * ++ * SCL_ZIO ++ * Held by bp-level zios (those which have no io_vd upon entry) ++ * to prevent changes to the vdev tree. The bp-level zio implicitly ++ * protects all of its vdev child zios, which do not hold SCL_ZIO. ++ * ++ * SCL_FREE ++ * Protects changes to metaslab groups and classes. ++ * Held as reader by metaslab_free(). SCL_FREE is distinct from ++ * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free ++ * blocks in zio_done() while another i/o that holds either ++ * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. ++ * ++ * SCL_VDEV ++ * Held as reader to prevent changes to the vdev tree during trivial ++ * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the ++ * other locks, and lower than all of them, to ensure that it's safe ++ * to acquire regardless of caller context. ++ * ++ * In addition, the following rules apply: ++ * ++ * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. ++ * The lock ordering is SCL_CONFIG > spa_props_lock. ++ * ++ * (b) I/O operations on leaf vdevs. For any zio operation that takes ++ * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), ++ * or zio_write_phys() -- the caller must ensure that the config cannot ++ * cannot change in the interim, and that the vdev cannot be reopened. ++ * SCL_STATE as reader suffices for both. ++ * ++ * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). ++ * ++ * spa_vdev_enter() Acquire the namespace lock and the config lock ++ * for writing. ++ * ++ * spa_vdev_exit() Release the config lock, wait for all I/O ++ * to complete, sync the updated configs to the ++ * cache, and release the namespace lock. ++ * ++ * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). ++ * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual ++ * locking is, always, based on spa_namespace_lock and spa_config_lock[]. ++ * ++ * spa_rename() is also implemented within this file since is requires ++ * manipulation of the namespace. 
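++ *
++ * As a rough usage sketch (illustrative only, not a verbatim caller from
++ * this patch): a consumer that only needs read access to the vdev tree
++ * would typically do something like
++ *
++ *	mutex_enter(&spa_namespace_lock);
++ *	if ((spa = spa_lookup(name)) != NULL)
++ *		spa_open_ref(spa, FTAG);
++ *	mutex_exit(&spa_namespace_lock);
++ *
++ *	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
++ *	(trivial vdev-tree inquiries go here)
++ *	spa_config_exit(spa, SCL_VDEV, FTAG);
++ *	spa_close(spa, FTAG);
++ *
++ * which respects the namespace -> refcount -> config lock ordering
++ * described above.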
++ */ ++ ++static avl_tree_t spa_namespace_avl; ++kmutex_t spa_namespace_lock; ++static kcondvar_t spa_namespace_cv; ++static int spa_active_count; ++int spa_max_replication_override = SPA_DVAS_PER_BP; ++ ++static kmutex_t spa_spare_lock; ++static avl_tree_t spa_spare_avl; ++static kmutex_t spa_l2cache_lock; ++static avl_tree_t spa_l2cache_avl; ++ ++kmem_cache_t *spa_buffer_pool; ++int spa_mode_global; ++ ++/* ++ * ========================================================================== ++ * SPA config locking ++ * ========================================================================== ++ */ ++static void ++spa_config_lock_init(spa_t *spa) ++{ ++ int i; ++ ++ for (i = 0; i < SCL_LOCKS; i++) { ++ spa_config_lock_t *scl = &spa->spa_config_lock[i]; ++ mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); ++ cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); ++ refcount_create(&scl->scl_count); ++ scl->scl_writer = NULL; ++ scl->scl_write_wanted = 0; ++ } ++} ++ ++static void ++spa_config_lock_destroy(spa_t *spa) ++{ ++ int i; ++ ++ for (i = 0; i < SCL_LOCKS; i++) { ++ spa_config_lock_t *scl = &spa->spa_config_lock[i]; ++ mutex_destroy(&scl->scl_lock); ++ cv_destroy(&scl->scl_cv); ++ refcount_destroy(&scl->scl_count); ++ ASSERT(scl->scl_writer == NULL); ++ ASSERT(scl->scl_write_wanted == 0); ++ } ++} ++ ++int ++spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) ++{ ++ int i; ++ ++ for (i = 0; i < SCL_LOCKS; i++) { ++ spa_config_lock_t *scl = &spa->spa_config_lock[i]; ++ if (!(locks & (1 << i))) ++ continue; ++ mutex_enter(&scl->scl_lock); ++ if (rw == RW_READER) { ++ if (scl->scl_writer || scl->scl_write_wanted) { ++ mutex_exit(&scl->scl_lock); ++ spa_config_exit(spa, locks ^ (1 << i), tag); ++ return (0); ++ } ++ } else { ++ ASSERT(scl->scl_writer != curthread); ++ if (!refcount_is_zero(&scl->scl_count)) { ++ mutex_exit(&scl->scl_lock); ++ spa_config_exit(spa, locks ^ (1 << i), tag); ++ return (0); ++ } ++ scl->scl_writer = curthread; ++ } ++ (void) refcount_add(&scl->scl_count, tag); ++ mutex_exit(&scl->scl_lock); ++ } ++ return (1); ++} ++ ++void ++spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) ++{ ++ int wlocks_held = 0; ++ int i; ++ ++ for (i = 0; i < SCL_LOCKS; i++) { ++ spa_config_lock_t *scl = &spa->spa_config_lock[i]; ++ if (scl->scl_writer == curthread) ++ wlocks_held |= (1 << i); ++ if (!(locks & (1 << i))) ++ continue; ++ mutex_enter(&scl->scl_lock); ++ if (rw == RW_READER) { ++ while (scl->scl_writer || scl->scl_write_wanted) { ++ cv_wait(&scl->scl_cv, &scl->scl_lock); ++ } ++ } else { ++ ASSERT(scl->scl_writer != curthread); ++ while (!refcount_is_zero(&scl->scl_count)) { ++ scl->scl_write_wanted++; ++ cv_wait(&scl->scl_cv, &scl->scl_lock); ++ scl->scl_write_wanted--; ++ } ++ scl->scl_writer = curthread; ++ } ++ (void) refcount_add(&scl->scl_count, tag); ++ mutex_exit(&scl->scl_lock); ++ } ++ ASSERT(wlocks_held <= locks); ++} ++ ++void ++spa_config_exit(spa_t *spa, int locks, void *tag) ++{ ++ int i; ++ ++ for (i = SCL_LOCKS - 1; i >= 0; i--) { ++ spa_config_lock_t *scl = &spa->spa_config_lock[i]; ++ if (!(locks & (1 << i))) ++ continue; ++ mutex_enter(&scl->scl_lock); ++ ASSERT(!refcount_is_zero(&scl->scl_count)); ++ if (refcount_remove(&scl->scl_count, tag) == 0) { ++ ASSERT(scl->scl_writer == NULL || ++ scl->scl_writer == curthread); ++ scl->scl_writer = NULL; /* OK in either case */ ++ cv_broadcast(&scl->scl_cv); ++ } ++ mutex_exit(&scl->scl_lock); ++ } ++} ++ ++int ++spa_config_held(spa_t *spa, int locks, krw_t rw) ++{ ++ int i, locks_held = 0; 
++ ++ for (i = 0; i < SCL_LOCKS; i++) { ++ spa_config_lock_t *scl = &spa->spa_config_lock[i]; ++ if (!(locks & (1 << i))) ++ continue; ++ if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) || ++ (rw == RW_WRITER && scl->scl_writer == curthread)) ++ locks_held |= 1 << i; ++ } ++ ++ return (locks_held); ++} ++ ++/* ++ * ========================================================================== ++ * SPA namespace functions ++ * ========================================================================== ++ */ ++ ++/* ++ * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. ++ * Returns NULL if no matching spa_t is found. ++ */ ++spa_t * ++spa_lookup(const char *name) ++{ ++ static spa_t search; /* spa_t is large; don't allocate on stack */ ++ spa_t *spa; ++ avl_index_t where; ++ char c = 0; ++ char *cp; ++ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ++ /* ++ * If it's a full dataset name, figure out the pool name and ++ * just use that. ++ */ ++ cp = strpbrk(name, "/@"); ++ if (cp) { ++ c = *cp; ++ *cp = '\0'; ++ } ++ ++ (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); ++ spa = avl_find(&spa_namespace_avl, &search, &where); ++ ++ if (cp) ++ *cp = c; ++ ++ return (spa); ++} ++ ++/* ++ * Create an uninitialized spa_t with the given name. Requires ++ * spa_namespace_lock. The caller must ensure that the spa_t doesn't already ++ * exist by calling spa_lookup() first. ++ */ ++spa_t * ++spa_add(const char *name, nvlist_t *config, const char *altroot) ++{ ++ spa_t *spa; ++ spa_config_dirent_t *dp; ++ int t; ++ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ++ spa = kmem_zalloc(sizeof (spa_t), KM_PUSHPAGE | KM_NODEBUG); ++ ++ mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); ++ cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); ++ cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); ++ cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); ++ ++ for (t = 0; t < TXG_SIZE; t++) ++ bplist_create(&spa->spa_free_bplist[t]); ++ ++ (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); ++ spa->spa_state = POOL_STATE_UNINITIALIZED; ++ spa->spa_freeze_txg = UINT64_MAX; ++ spa->spa_final_txg = UINT64_MAX; ++ spa->spa_load_max_txg = UINT64_MAX; ++ spa->spa_proc = &p0; ++ spa->spa_proc_state = SPA_PROC_NONE; ++ ++ refcount_create(&spa->spa_refcount); ++ spa_config_lock_init(spa); ++ ++ avl_add(&spa_namespace_avl, spa); ++ ++ /* ++ * Set the alternate root, if there is one. ++ */ ++ if (altroot) { ++ spa->spa_root = spa_strdup(altroot); ++ spa_active_count++; ++ } ++ ++ /* ++ * Every pool starts with the default cachefile ++ */ ++ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), ++ offsetof(spa_config_dirent_t, scd_link)); ++ ++ dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_PUSHPAGE); ++ dp->scd_path = altroot ? 
NULL : spa_strdup(spa_config_path); ++ list_insert_head(&spa->spa_config_list, dp); ++ ++ VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, ++ KM_PUSHPAGE) == 0); ++ ++ if (config != NULL) ++ VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); ++ ++ return (spa); ++} ++ ++/* ++ * Removes a spa_t from the namespace, freeing up any memory used. Requires ++ * spa_namespace_lock. This is called only after the spa_t has been closed and ++ * deactivated. ++ */ ++void ++spa_remove(spa_t *spa) ++{ ++ spa_config_dirent_t *dp; ++ int t; ++ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); ++ ++ nvlist_free(spa->spa_config_splitting); ++ ++ avl_remove(&spa_namespace_avl, spa); ++ cv_broadcast(&spa_namespace_cv); ++ ++ if (spa->spa_root) { ++ spa_strfree(spa->spa_root); ++ spa_active_count--; ++ } ++ ++ while ((dp = list_head(&spa->spa_config_list)) != NULL) { ++ list_remove(&spa->spa_config_list, dp); ++ if (dp->scd_path != NULL) ++ spa_strfree(dp->scd_path); ++ kmem_free(dp, sizeof (spa_config_dirent_t)); ++ } ++ ++ list_destroy(&spa->spa_config_list); ++ ++ nvlist_free(spa->spa_load_info); ++ spa_config_set(spa, NULL); ++ ++ refcount_destroy(&spa->spa_refcount); ++ ++ spa_config_lock_destroy(spa); ++ ++ for (t = 0; t < TXG_SIZE; t++) ++ bplist_destroy(&spa->spa_free_bplist[t]); ++ ++ cv_destroy(&spa->spa_async_cv); ++ cv_destroy(&spa->spa_proc_cv); ++ cv_destroy(&spa->spa_scrub_io_cv); ++ cv_destroy(&spa->spa_suspend_cv); ++ ++ mutex_destroy(&spa->spa_async_lock); ++ mutex_destroy(&spa->spa_errlist_lock); ++ mutex_destroy(&spa->spa_errlog_lock); ++ mutex_destroy(&spa->spa_history_lock); ++ mutex_destroy(&spa->spa_proc_lock); ++ mutex_destroy(&spa->spa_props_lock); ++ mutex_destroy(&spa->spa_scrub_lock); ++ mutex_destroy(&spa->spa_suspend_lock); ++ mutex_destroy(&spa->spa_vdev_top_lock); ++ ++ kmem_free(spa, sizeof (spa_t)); ++} ++ ++/* ++ * Given a pool, return the next pool in the namespace, or NULL if there is ++ * none. If 'prev' is NULL, return the first pool. ++ */ ++spa_t * ++spa_next(spa_t *prev) ++{ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ++ if (prev) ++ return (AVL_NEXT(&spa_namespace_avl, prev)); ++ else ++ return (avl_first(&spa_namespace_avl)); ++} ++ ++/* ++ * ========================================================================== ++ * SPA refcount functions ++ * ========================================================================== ++ */ ++ ++/* ++ * Add a reference to the given spa_t. Must have at least one reference, or ++ * have the namespace lock held. ++ */ ++void ++spa_open_ref(spa_t *spa, void *tag) ++{ ++ ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref || ++ MUTEX_HELD(&spa_namespace_lock)); ++ (void) refcount_add(&spa->spa_refcount, tag); ++} ++ ++/* ++ * Remove a reference to the given spa_t. Must have at least one reference, or ++ * have the namespace lock held. ++ */ ++void ++spa_close(spa_t *spa, void *tag) ++{ ++ ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref || ++ MUTEX_HELD(&spa_namespace_lock)); ++ (void) refcount_remove(&spa->spa_refcount, tag); ++} ++ ++/* ++ * Check to see if the spa refcount is zero. Must be called with ++ * spa_namespace_lock held. 
We really compare against spa_minref, which is the ++ * number of references acquired when opening a pool ++ */ ++boolean_t ++spa_refcount_zero(spa_t *spa) ++{ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ++ return (refcount_count(&spa->spa_refcount) == spa->spa_minref); ++} ++ ++/* ++ * ========================================================================== ++ * SPA spare and l2cache tracking ++ * ========================================================================== ++ */ ++ ++/* ++ * Hot spares and cache devices are tracked using the same code below, ++ * for 'auxiliary' devices. ++ */ ++ ++typedef struct spa_aux { ++ uint64_t aux_guid; ++ uint64_t aux_pool; ++ avl_node_t aux_avl; ++ int aux_count; ++} spa_aux_t; ++ ++static int ++spa_aux_compare(const void *a, const void *b) ++{ ++ const spa_aux_t *sa = a; ++ const spa_aux_t *sb = b; ++ ++ if (sa->aux_guid < sb->aux_guid) ++ return (-1); ++ else if (sa->aux_guid > sb->aux_guid) ++ return (1); ++ else ++ return (0); ++} ++ ++void ++spa_aux_add(vdev_t *vd, avl_tree_t *avl) ++{ ++ avl_index_t where; ++ spa_aux_t search; ++ spa_aux_t *aux; ++ ++ search.aux_guid = vd->vdev_guid; ++ if ((aux = avl_find(avl, &search, &where)) != NULL) { ++ aux->aux_count++; ++ } else { ++ aux = kmem_zalloc(sizeof (spa_aux_t), KM_PUSHPAGE); ++ aux->aux_guid = vd->vdev_guid; ++ aux->aux_count = 1; ++ avl_insert(avl, aux, where); ++ } ++} ++ ++void ++spa_aux_remove(vdev_t *vd, avl_tree_t *avl) ++{ ++ spa_aux_t search; ++ spa_aux_t *aux; ++ avl_index_t where; ++ ++ search.aux_guid = vd->vdev_guid; ++ aux = avl_find(avl, &search, &where); ++ ++ ASSERT(aux != NULL); ++ ++ if (--aux->aux_count == 0) { ++ avl_remove(avl, aux); ++ kmem_free(aux, sizeof (spa_aux_t)); ++ } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { ++ aux->aux_pool = 0ULL; ++ } ++} ++ ++boolean_t ++spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) ++{ ++ spa_aux_t search, *found; ++ ++ search.aux_guid = guid; ++ found = avl_find(avl, &search, NULL); ++ ++ if (pool) { ++ if (found) ++ *pool = found->aux_pool; ++ else ++ *pool = 0ULL; ++ } ++ ++ if (refcnt) { ++ if (found) ++ *refcnt = found->aux_count; ++ else ++ *refcnt = 0; ++ } ++ ++ return (found != NULL); ++} ++ ++void ++spa_aux_activate(vdev_t *vd, avl_tree_t *avl) ++{ ++ spa_aux_t search, *found; ++ avl_index_t where; ++ ++ search.aux_guid = vd->vdev_guid; ++ found = avl_find(avl, &search, &where); ++ ASSERT(found != NULL); ++ ASSERT(found->aux_pool == 0ULL); ++ ++ found->aux_pool = spa_guid(vd->vdev_spa); ++} ++ ++/* ++ * Spares are tracked globally due to the following constraints: ++ * ++ * - A spare may be part of multiple pools. ++ * - A spare may be added to a pool even if it's actively in use within ++ * another pool. ++ * - A spare in use in any pool can only be the source of a replacement if ++ * the target is a spare in the same pool. ++ * ++ * We keep track of all spares on the system through the use of a reference ++ * counted AVL tree. When a vdev is added as a spare, or used as a replacement ++ * spare, then we bump the reference count in the AVL tree. In addition, we set ++ * the 'vdev_isspare' member to indicate that the device is a spare (active or ++ * inactive). When a spare is made active (used to replace a device in the ++ * pool), we also keep track of which pool its been made a part of. ++ * ++ * The 'spa_spare_lock' protects the AVL tree. These functions are normally ++ * called under the spa_namespace lock as part of vdev reconfiguration. 
The ++ * separate spare lock exists for the status query path, which does not need to ++ * be completely consistent with respect to other vdev configuration changes. ++ */ ++ ++static int ++spa_spare_compare(const void *a, const void *b) ++{ ++ return (spa_aux_compare(a, b)); ++} ++ ++void ++spa_spare_add(vdev_t *vd) ++{ ++ mutex_enter(&spa_spare_lock); ++ ASSERT(!vd->vdev_isspare); ++ spa_aux_add(vd, &spa_spare_avl); ++ vd->vdev_isspare = B_TRUE; ++ mutex_exit(&spa_spare_lock); ++} ++ ++void ++spa_spare_remove(vdev_t *vd) ++{ ++ mutex_enter(&spa_spare_lock); ++ ASSERT(vd->vdev_isspare); ++ spa_aux_remove(vd, &spa_spare_avl); ++ vd->vdev_isspare = B_FALSE; ++ mutex_exit(&spa_spare_lock); ++} ++ ++boolean_t ++spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) ++{ ++ boolean_t found; ++ ++ mutex_enter(&spa_spare_lock); ++ found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); ++ mutex_exit(&spa_spare_lock); ++ ++ return (found); ++} ++ ++void ++spa_spare_activate(vdev_t *vd) ++{ ++ mutex_enter(&spa_spare_lock); ++ ASSERT(vd->vdev_isspare); ++ spa_aux_activate(vd, &spa_spare_avl); ++ mutex_exit(&spa_spare_lock); ++} ++ ++/* ++ * Level 2 ARC devices are tracked globally for the same reasons as spares. ++ * Cache devices currently only support one pool per cache device, and so ++ * for these devices the aux reference count is currently unused beyond 1. ++ */ ++ ++static int ++spa_l2cache_compare(const void *a, const void *b) ++{ ++ return (spa_aux_compare(a, b)); ++} ++ ++void ++spa_l2cache_add(vdev_t *vd) ++{ ++ mutex_enter(&spa_l2cache_lock); ++ ASSERT(!vd->vdev_isl2cache); ++ spa_aux_add(vd, &spa_l2cache_avl); ++ vd->vdev_isl2cache = B_TRUE; ++ mutex_exit(&spa_l2cache_lock); ++} ++ ++void ++spa_l2cache_remove(vdev_t *vd) ++{ ++ mutex_enter(&spa_l2cache_lock); ++ ASSERT(vd->vdev_isl2cache); ++ spa_aux_remove(vd, &spa_l2cache_avl); ++ vd->vdev_isl2cache = B_FALSE; ++ mutex_exit(&spa_l2cache_lock); ++} ++ ++boolean_t ++spa_l2cache_exists(uint64_t guid, uint64_t *pool) ++{ ++ boolean_t found; ++ ++ mutex_enter(&spa_l2cache_lock); ++ found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); ++ mutex_exit(&spa_l2cache_lock); ++ ++ return (found); ++} ++ ++void ++spa_l2cache_activate(vdev_t *vd) ++{ ++ mutex_enter(&spa_l2cache_lock); ++ ASSERT(vd->vdev_isl2cache); ++ spa_aux_activate(vd, &spa_l2cache_avl); ++ mutex_exit(&spa_l2cache_lock); ++} ++ ++/* ++ * ========================================================================== ++ * SPA vdev locking ++ * ========================================================================== ++ */ ++ ++/* ++ * Lock the given spa_t for the purpose of adding or removing a vdev. ++ * Grabs the global spa_namespace_lock plus the spa config lock for writing. ++ * It returns the next transaction group for the spa_t. ++ */ ++uint64_t ++spa_vdev_enter(spa_t *spa) ++{ ++ mutex_enter(&spa->spa_vdev_top_lock); ++ mutex_enter(&spa_namespace_lock); ++ return (spa_vdev_config_enter(spa)); ++} ++ ++/* ++ * Internal implementation for spa_vdev_enter(). Used when a vdev ++ * operation requires multiple syncs (i.e. removing a device) while ++ * keeping the spa_namespace_lock held. ++ */ ++uint64_t ++spa_vdev_config_enter(spa_t *spa) ++{ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ++ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); ++ ++ return (spa_last_synced_txg(spa) + 1); ++} ++ ++/* ++ * Used in combination with spa_vdev_config_enter() to allow the syncing ++ * of multiple transactions without releasing the spa_namespace_lock. 
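++ *
++ * For reference, callers of the single-sync variant typically follow the
++ * pattern (sketch only):
++ *
++ *	txg = spa_vdev_enter(spa);
++ *	(modify the vdev tree)
++ *	return (spa_vdev_exit(spa, vd, txg, error));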
++ */ ++void ++spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) ++{ ++ int config_changed = B_FALSE; ++ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ASSERT(txg > spa_last_synced_txg(spa)); ++ ++ spa->spa_pending_vdev = NULL; ++ ++ /* ++ * Reassess the DTLs. ++ */ ++ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); ++ ++ if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { ++ config_changed = B_TRUE; ++ spa->spa_config_generation++; ++ } ++ ++ /* ++ * Verify the metaslab classes. ++ */ ++ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ++ ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); ++ ++ spa_config_exit(spa, SCL_ALL, spa); ++ ++ /* ++ * Panic the system if the specified tag requires it. This ++ * is useful for ensuring that configurations are updated ++ * transactionally. ++ */ ++ if (zio_injection_enabled) ++ zio_handle_panic_injection(spa, tag, 0); ++ ++ /* ++ * Note: this txg_wait_synced() is important because it ensures ++ * that there won't be more than one config change per txg. ++ * This allows us to use the txg as the generation number. ++ */ ++ if (error == 0) ++ txg_wait_synced(spa->spa_dsl_pool, txg); ++ ++ if (vd != NULL) { ++ ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0); ++ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); ++ vdev_free(vd); ++ spa_config_exit(spa, SCL_ALL, spa); ++ } ++ ++ /* ++ * If the config changed, update the config cache. ++ */ ++ if (config_changed) ++ spa_config_sync(spa, B_FALSE, B_TRUE); ++} ++ ++/* ++ * Unlock the spa_t after adding or removing a vdev. Besides undoing the ++ * locking of spa_vdev_enter(), we also want make sure the transactions have ++ * synced to disk, and then update the global configuration cache with the new ++ * information. ++ */ ++int ++spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) ++{ ++ spa_vdev_config_exit(spa, vd, txg, error, FTAG); ++ mutex_exit(&spa_namespace_lock); ++ mutex_exit(&spa->spa_vdev_top_lock); ++ ++ return (error); ++} ++ ++/* ++ * Lock the given spa_t for the purpose of changing vdev state. ++ */ ++void ++spa_vdev_state_enter(spa_t *spa, int oplocks) ++{ ++ int locks = SCL_STATE_ALL | oplocks; ++ ++ /* ++ * Root pools may need to read of the underlying devfs filesystem ++ * when opening up a vdev. Unfortunately if we're holding the ++ * SCL_ZIO lock it will result in a deadlock when we try to issue ++ * the read from the root filesystem. Instead we "prefetch" ++ * the associated vnodes that we need prior to opening the ++ * underlying devices and cache them so that we can prevent ++ * any I/O when we are doing the actual open. ++ */ ++ if (spa_is_root(spa)) { ++ int low = locks & ~(SCL_ZIO - 1); ++ int high = locks & ~low; ++ ++ spa_config_enter(spa, high, spa, RW_WRITER); ++ vdev_hold(spa->spa_root_vdev); ++ spa_config_enter(spa, low, spa, RW_WRITER); ++ } else { ++ spa_config_enter(spa, locks, spa, RW_WRITER); ++ } ++ spa->spa_vdev_locks = locks; ++} ++ ++int ++spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) ++{ ++ boolean_t config_changed = B_FALSE; ++ ++ if (vd != NULL || error == 0) ++ vdev_dtl_reassess(vd ? 
vd->vdev_top : spa->spa_root_vdev, ++ 0, 0, B_FALSE); ++ ++ if (vd != NULL) { ++ vdev_state_dirty(vd->vdev_top); ++ config_changed = B_TRUE; ++ spa->spa_config_generation++; ++ } ++ ++ if (spa_is_root(spa)) ++ vdev_rele(spa->spa_root_vdev); ++ ++ ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); ++ spa_config_exit(spa, spa->spa_vdev_locks, spa); ++ ++ /* ++ * If anything changed, wait for it to sync. This ensures that, ++ * from the system administrator's perspective, zpool(1M) commands ++ * are synchronous. This is important for things like zpool offline: ++ * when the command completes, you expect no further I/O from ZFS. ++ */ ++ if (vd != NULL) ++ txg_wait_synced(spa->spa_dsl_pool, 0); ++ ++ /* ++ * If the config changed, update the config cache. ++ */ ++ if (config_changed) { ++ mutex_enter(&spa_namespace_lock); ++ spa_config_sync(spa, B_FALSE, B_TRUE); ++ mutex_exit(&spa_namespace_lock); ++ } ++ ++ return (error); ++} ++ ++/* ++ * ========================================================================== ++ * Miscellaneous functions ++ * ========================================================================== ++ */ ++ ++/* ++ * Rename a spa_t. ++ */ ++int ++spa_rename(const char *name, const char *newname) ++{ ++ spa_t *spa; ++ int err; ++ ++ /* ++ * Lookup the spa_t and grab the config lock for writing. We need to ++ * actually open the pool so that we can sync out the necessary labels. ++ * It's OK to call spa_open() with the namespace lock held because we ++ * allow recursive calls for other reasons. ++ */ ++ mutex_enter(&spa_namespace_lock); ++ if ((err = spa_open(name, &spa, FTAG)) != 0) { ++ mutex_exit(&spa_namespace_lock); ++ return (err); ++ } ++ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ ++ avl_remove(&spa_namespace_avl, spa); ++ (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name)); ++ avl_add(&spa_namespace_avl, spa); ++ ++ /* ++ * Sync all labels to disk with the new names by marking the root vdev ++ * dirty and waiting for it to sync. It will pick up the new pool name ++ * during the sync. ++ */ ++ vdev_config_dirty(spa->spa_root_vdev); ++ ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ ++ txg_wait_synced(spa->spa_dsl_pool, 0); ++ ++ /* ++ * Sync the updated config cache. ++ */ ++ spa_config_sync(spa, B_FALSE, B_TRUE); ++ ++ spa_close(spa, FTAG); ++ ++ mutex_exit(&spa_namespace_lock); ++ ++ return (0); ++} ++ ++/* ++ * Return the spa_t associated with given pool_guid, if it exists. If ++ * device_guid is non-zero, determine whether the pool exists *and* contains ++ * a device with the specified device_guid. ++ */ ++spa_t * ++spa_by_guid(uint64_t pool_guid, uint64_t device_guid) ++{ ++ spa_t *spa; ++ avl_tree_t *t = &spa_namespace_avl; ++ ++ ASSERT(MUTEX_HELD(&spa_namespace_lock)); ++ ++ for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { ++ if (spa->spa_state == POOL_STATE_UNINITIALIZED) ++ continue; ++ if (spa->spa_root_vdev == NULL) ++ continue; ++ if (spa_guid(spa) == pool_guid) { ++ if (device_guid == 0) ++ break; ++ ++ if (vdev_lookup_by_guid(spa->spa_root_vdev, ++ device_guid) != NULL) ++ break; ++ ++ /* ++ * Check any devices we may be in the process of adding. ++ */ ++ if (spa->spa_pending_vdev) { ++ if (vdev_lookup_by_guid(spa->spa_pending_vdev, ++ device_guid) != NULL) ++ break; ++ } ++ } ++ } ++ ++ return (spa); ++} ++ ++/* ++ * Determine whether a pool with the given pool_guid exists. 
++ */ ++boolean_t ++spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) ++{ ++ return (spa_by_guid(pool_guid, device_guid) != NULL); ++} ++ ++char * ++spa_strdup(const char *s) ++{ ++ size_t len; ++ char *new; ++ ++ len = strlen(s); ++ new = kmem_alloc(len + 1, KM_PUSHPAGE); ++ bcopy(s, new, len); ++ new[len] = '\0'; ++ ++ return (new); ++} ++ ++void ++spa_strfree(char *s) ++{ ++ kmem_free(s, strlen(s) + 1); ++} ++ ++uint64_t ++spa_get_random(uint64_t range) ++{ ++ uint64_t r; ++ ++ ASSERT(range != 0); ++ ++ (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); ++ ++ return (r % range); ++} ++ ++uint64_t ++spa_generate_guid(spa_t *spa) ++{ ++ uint64_t guid = spa_get_random(-1ULL); ++ ++ if (spa != NULL) { ++ while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) ++ guid = spa_get_random(-1ULL); ++ } else { ++ while (guid == 0 || spa_guid_exists(guid, 0)) ++ guid = spa_get_random(-1ULL); ++ } ++ ++ return (guid); ++} ++ ++void ++sprintf_blkptr(char *buf, const blkptr_t *bp) ++{ ++ char *type = NULL; ++ char *checksum = NULL; ++ char *compress = NULL; ++ ++ if (bp != NULL) { ++ type = dmu_ot[BP_GET_TYPE(bp)].ot_name; ++ checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; ++ compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; ++ } ++ ++ SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress); ++} ++ ++void ++spa_freeze(spa_t *spa) ++{ ++ uint64_t freeze_txg = 0; ++ ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ if (spa->spa_freeze_txg == UINT64_MAX) { ++ freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; ++ spa->spa_freeze_txg = freeze_txg; ++ } ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ if (freeze_txg != 0) ++ txg_wait_synced(spa_get_dsl(spa), freeze_txg); ++} ++ ++/* ++ * This is a stripped-down version of strtoull, suitable only for converting ++ * lowercase hexidecimal numbers that don't overflow. ++ */ ++uint64_t ++strtonum(const char *str, char **nptr) ++{ ++ uint64_t val = 0; ++ char c; ++ int digit; ++ ++ while ((c = *str) != '\0') { ++ if (c >= '0' && c <= '9') ++ digit = c - '0'; ++ else if (c >= 'a' && c <= 'f') ++ digit = 10 + c - 'a'; ++ else ++ break; ++ ++ val *= 16; ++ val += digit; ++ ++ str++; ++ } ++ ++ if (nptr) ++ *nptr = (char *)str; ++ ++ return (val); ++} ++ ++/* ++ * ========================================================================== ++ * Accessor functions ++ * ========================================================================== ++ */ ++ ++boolean_t ++spa_shutting_down(spa_t *spa) ++{ ++ return (spa->spa_async_suspended); ++} ++ ++dsl_pool_t * ++spa_get_dsl(spa_t *spa) ++{ ++ return (spa->spa_dsl_pool); ++} ++ ++blkptr_t * ++spa_get_rootblkptr(spa_t *spa) ++{ ++ return (&spa->spa_ubsync.ub_rootbp); ++} ++ ++void ++spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) ++{ ++ spa->spa_uberblock.ub_rootbp = *bp; ++} ++ ++void ++spa_altroot(spa_t *spa, char *buf, size_t buflen) ++{ ++ if (spa->spa_root == NULL) ++ buf[0] = '\0'; ++ else ++ (void) strncpy(buf, spa->spa_root, buflen); ++} ++ ++int ++spa_sync_pass(spa_t *spa) ++{ ++ return (spa->spa_sync_pass); ++} ++ ++char * ++spa_name(spa_t *spa) ++{ ++ return (spa->spa_name); ++} ++ ++uint64_t ++spa_guid(spa_t *spa) ++{ ++ /* ++ * If we fail to parse the config during spa_load(), we can go through ++ * the error path (which posts an ereport) and end up here with no root ++ * vdev. We stash the original pool guid in 'spa_config_guid' to handle ++ * this case. 
++ */ ++ if (spa->spa_root_vdev != NULL) ++ return (spa->spa_root_vdev->vdev_guid); ++ else ++ return (spa->spa_config_guid); ++} ++ ++uint64_t ++spa_load_guid(spa_t *spa) ++{ ++ /* ++ * This is a GUID that exists solely as a reference for the ++ * purposes of the arc. It is generated at load time, and ++ * is never written to persistent storage. ++ */ ++ return (spa->spa_load_guid); ++} ++ ++uint64_t ++spa_last_synced_txg(spa_t *spa) ++{ ++ return (spa->spa_ubsync.ub_txg); ++} ++ ++uint64_t ++spa_first_txg(spa_t *spa) ++{ ++ return (spa->spa_first_txg); ++} ++ ++uint64_t ++spa_syncing_txg(spa_t *spa) ++{ ++ return (spa->spa_syncing_txg); ++} ++ ++pool_state_t ++spa_state(spa_t *spa) ++{ ++ return (spa->spa_state); ++} ++ ++spa_load_state_t ++spa_load_state(spa_t *spa) ++{ ++ return (spa->spa_load_state); ++} ++ ++uint64_t ++spa_freeze_txg(spa_t *spa) ++{ ++ return (spa->spa_freeze_txg); ++} ++ ++/* ARGSUSED */ ++uint64_t ++spa_get_asize(spa_t *spa, uint64_t lsize) ++{ ++ /* ++ * The worst case is single-sector max-parity RAID-Z blocks, in which ++ * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) ++ * times the size; so just assume that. Add to this the fact that ++ * we can have up to 3 DVAs per bp, and one more factor of 2 because ++ * the block may be dittoed with up to 3 DVAs by ddt_sync(). ++ */ ++ return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2); ++} ++ ++uint64_t ++spa_get_dspace(spa_t *spa) ++{ ++ return (spa->spa_dspace); ++} ++ ++void ++spa_update_dspace(spa_t *spa) ++{ ++ spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + ++ ddt_get_dedup_dspace(spa); ++} ++ ++/* ++ * Return the failure mode that has been set to this pool. The default ++ * behavior will be to block all I/Os when a complete failure occurs. ++ */ ++uint8_t ++spa_get_failmode(spa_t *spa) ++{ ++ return (spa->spa_failmode); ++} ++ ++boolean_t ++spa_suspended(spa_t *spa) ++{ ++ return (spa->spa_suspended); ++} ++ ++uint64_t ++spa_version(spa_t *spa) ++{ ++ return (spa->spa_ubsync.ub_version); ++} ++ ++boolean_t ++spa_deflate(spa_t *spa) ++{ ++ return (spa->spa_deflate); ++} ++ ++metaslab_class_t * ++spa_normal_class(spa_t *spa) ++{ ++ return (spa->spa_normal_class); ++} ++ ++metaslab_class_t * ++spa_log_class(spa_t *spa) ++{ ++ return (spa->spa_log_class); ++} ++ ++int ++spa_max_replication(spa_t *spa) ++{ ++ /* ++ * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to ++ * handle BPs with more than one DVA allocated. Set our max ++ * replication level accordingly. 
++ */ ++ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) ++ return (1); ++ return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); ++} ++ ++int ++spa_prev_software_version(spa_t *spa) ++{ ++ return (spa->spa_prev_software_version); ++} ++ ++uint64_t ++dva_get_dsize_sync(spa_t *spa, const dva_t *dva) ++{ ++ uint64_t asize = DVA_GET_ASIZE(dva); ++ uint64_t dsize = asize; ++ ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); ++ ++ if (asize != 0 && spa->spa_deflate) { ++ vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); ++ dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; ++ } ++ ++ return (dsize); ++} ++ ++uint64_t ++bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) ++{ ++ uint64_t dsize = 0; ++ int d; ++ ++ for (d = 0; d < SPA_DVAS_PER_BP; d++) ++ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); ++ ++ return (dsize); ++} ++ ++uint64_t ++bp_get_dsize(spa_t *spa, const blkptr_t *bp) ++{ ++ uint64_t dsize = 0; ++ int d; ++ ++ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ++ ++ for (d = 0; d < SPA_DVAS_PER_BP; d++) ++ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); ++ ++ spa_config_exit(spa, SCL_VDEV, FTAG); ++ ++ return (dsize); ++} ++ ++/* ++ * ========================================================================== ++ * Initialization and Termination ++ * ========================================================================== ++ */ ++ ++static int ++spa_name_compare(const void *a1, const void *a2) ++{ ++ const spa_t *s1 = a1; ++ const spa_t *s2 = a2; ++ int s; ++ ++ s = strcmp(s1->spa_name, s2->spa_name); ++ if (s > 0) ++ return (1); ++ if (s < 0) ++ return (-1); ++ return (0); ++} ++ ++void ++spa_boot_init(void) ++{ ++ spa_config_load(); ++} ++ ++void ++spa_init(int mode) ++{ ++ mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); ++ cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); ++ ++ avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), ++ offsetof(spa_t, spa_avl)); ++ ++ avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), ++ offsetof(spa_aux_t, aux_avl)); ++ ++ avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), ++ offsetof(spa_aux_t, aux_avl)); ++ ++ spa_mode_global = mode; ++ ++ fm_init(); ++ refcount_init(); ++ unique_init(); ++ zio_init(); ++ dmu_init(); ++ zil_init(); ++ vdev_cache_stat_init(); ++ zfs_prop_init(); ++ zpool_prop_init(); ++ spa_config_load(); ++ l2arc_start(); ++} ++ ++void ++spa_fini(void) ++{ ++ l2arc_stop(); ++ ++ spa_evict_all(); ++ ++ vdev_cache_stat_fini(); ++ zil_fini(); ++ dmu_fini(); ++ zio_fini(); ++ unique_fini(); ++ refcount_fini(); ++ fm_fini(); ++ ++ avl_destroy(&spa_namespace_avl); ++ avl_destroy(&spa_spare_avl); ++ avl_destroy(&spa_l2cache_avl); ++ ++ cv_destroy(&spa_namespace_cv); ++ mutex_destroy(&spa_namespace_lock); ++ mutex_destroy(&spa_spare_lock); ++ mutex_destroy(&spa_l2cache_lock); ++} ++ ++/* ++ * Return whether this pool has slogs. No locking needed. 
++ * It's not a problem if the wrong answer is returned as it's only for ++ * performance and not correctness ++ */ ++boolean_t ++spa_has_slogs(spa_t *spa) ++{ ++ return (spa->spa_log_class->mc_rotor != NULL); ++} ++ ++spa_log_state_t ++spa_get_log_state(spa_t *spa) ++{ ++ return (spa->spa_log_state); ++} ++ ++void ++spa_set_log_state(spa_t *spa, spa_log_state_t state) ++{ ++ spa->spa_log_state = state; ++} ++ ++boolean_t ++spa_is_root(spa_t *spa) ++{ ++ return (spa->spa_is_root); ++} ++ ++boolean_t ++spa_writeable(spa_t *spa) ++{ ++ return (!!(spa->spa_mode & FWRITE)); ++} ++ ++int ++spa_mode(spa_t *spa) ++{ ++ return (spa->spa_mode); ++} ++ ++uint64_t ++spa_bootfs(spa_t *spa) ++{ ++ return (spa->spa_bootfs); ++} ++ ++uint64_t ++spa_delegation(spa_t *spa) ++{ ++ return (spa->spa_delegation); ++} ++ ++objset_t * ++spa_meta_objset(spa_t *spa) ++{ ++ return (spa->spa_meta_objset); ++} ++ ++enum zio_checksum ++spa_dedup_checksum(spa_t *spa) ++{ ++ return (spa->spa_dedup_checksum); ++} ++ ++/* ++ * Reset pool scan stat per scan pass (or reboot). ++ */ ++void ++spa_scan_stat_init(spa_t *spa) ++{ ++ /* data not stored on disk */ ++ spa->spa_scan_pass_start = gethrestime_sec(); ++ spa->spa_scan_pass_exam = 0; ++ vdev_scan_stat_init(spa->spa_root_vdev); ++} ++ ++/* ++ * Get scan stats for zpool status reports ++ */ ++int ++spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) ++{ ++ dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; ++ ++ if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) ++ return (ENOENT); ++ bzero(ps, sizeof (pool_scan_stat_t)); ++ ++ /* data stored on disk */ ++ ps->pss_func = scn->scn_phys.scn_func; ++ ps->pss_start_time = scn->scn_phys.scn_start_time; ++ ps->pss_end_time = scn->scn_phys.scn_end_time; ++ ps->pss_to_examine = scn->scn_phys.scn_to_examine; ++ ps->pss_examined = scn->scn_phys.scn_examined; ++ ps->pss_to_process = scn->scn_phys.scn_to_process; ++ ps->pss_processed = scn->scn_phys.scn_processed; ++ ps->pss_errors = scn->scn_phys.scn_errors; ++ ps->pss_state = scn->scn_phys.scn_state; ++ ++ /* data not stored on disk */ ++ ps->pss_pass_start = spa->spa_scan_pass_start; ++ ps->pss_pass_exam = spa->spa_scan_pass_exam; ++ ++ return (0); ++} ++ ++boolean_t ++spa_debug_enabled(spa_t *spa) ++{ ++ return (spa->spa_debug); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++/* Namespace manipulation */ ++EXPORT_SYMBOL(spa_lookup); ++EXPORT_SYMBOL(spa_add); ++EXPORT_SYMBOL(spa_remove); ++EXPORT_SYMBOL(spa_next); ++ ++/* Refcount functions */ ++EXPORT_SYMBOL(spa_open_ref); ++EXPORT_SYMBOL(spa_close); ++EXPORT_SYMBOL(spa_refcount_zero); ++ ++/* Pool configuration lock */ ++EXPORT_SYMBOL(spa_config_tryenter); ++EXPORT_SYMBOL(spa_config_enter); ++EXPORT_SYMBOL(spa_config_exit); ++EXPORT_SYMBOL(spa_config_held); ++ ++/* Pool vdev add/remove lock */ ++EXPORT_SYMBOL(spa_vdev_enter); ++EXPORT_SYMBOL(spa_vdev_exit); ++ ++/* Pool vdev state change lock */ ++EXPORT_SYMBOL(spa_vdev_state_enter); ++EXPORT_SYMBOL(spa_vdev_state_exit); ++ ++/* Accessor functions */ ++EXPORT_SYMBOL(spa_shutting_down); ++EXPORT_SYMBOL(spa_get_dsl); ++EXPORT_SYMBOL(spa_get_rootblkptr); ++EXPORT_SYMBOL(spa_set_rootblkptr); ++EXPORT_SYMBOL(spa_altroot); ++EXPORT_SYMBOL(spa_sync_pass); ++EXPORT_SYMBOL(spa_name); ++EXPORT_SYMBOL(spa_guid); ++EXPORT_SYMBOL(spa_last_synced_txg); ++EXPORT_SYMBOL(spa_first_txg); ++EXPORT_SYMBOL(spa_syncing_txg); ++EXPORT_SYMBOL(spa_version); ++EXPORT_SYMBOL(spa_state); ++EXPORT_SYMBOL(spa_load_state); ++EXPORT_SYMBOL(spa_freeze_txg); 
++EXPORT_SYMBOL(spa_get_asize); ++EXPORT_SYMBOL(spa_get_dspace); ++EXPORT_SYMBOL(spa_update_dspace); ++EXPORT_SYMBOL(spa_deflate); ++EXPORT_SYMBOL(spa_normal_class); ++EXPORT_SYMBOL(spa_log_class); ++EXPORT_SYMBOL(spa_max_replication); ++EXPORT_SYMBOL(spa_prev_software_version); ++EXPORT_SYMBOL(spa_get_failmode); ++EXPORT_SYMBOL(spa_suspended); ++EXPORT_SYMBOL(spa_bootfs); ++EXPORT_SYMBOL(spa_delegation); ++EXPORT_SYMBOL(spa_meta_objset); ++ ++/* Miscellaneous support routines */ ++EXPORT_SYMBOL(spa_rename); ++EXPORT_SYMBOL(spa_guid_exists); ++EXPORT_SYMBOL(spa_strdup); ++EXPORT_SYMBOL(spa_strfree); ++EXPORT_SYMBOL(spa_get_random); ++EXPORT_SYMBOL(spa_generate_guid); ++EXPORT_SYMBOL(sprintf_blkptr); ++EXPORT_SYMBOL(spa_freeze); ++EXPORT_SYMBOL(spa_upgrade); ++EXPORT_SYMBOL(spa_evict_all); ++EXPORT_SYMBOL(spa_lookup_by_guid); ++EXPORT_SYMBOL(spa_has_spare); ++EXPORT_SYMBOL(dva_get_dsize_sync); ++EXPORT_SYMBOL(bp_get_dsize_sync); ++EXPORT_SYMBOL(bp_get_dsize); ++EXPORT_SYMBOL(spa_has_slogs); ++EXPORT_SYMBOL(spa_is_root); ++EXPORT_SYMBOL(spa_writeable); ++EXPORT_SYMBOL(spa_mode); ++ ++EXPORT_SYMBOL(spa_namespace_lock); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/txg.c linux-3.2.33-go/fs/zfs/zfs/txg.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/txg.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/txg.c 2012-11-16 23:25:34.348039346 +0100 +@@ -0,0 +1,827 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Pool-wide transaction groups. ++ */ ++ ++static void txg_sync_thread(dsl_pool_t *dp); ++static void txg_quiesce_thread(dsl_pool_t *dp); ++ ++int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ ++ ++/* ++ * Prepare the txg subsystem. 
++ */ ++void ++txg_init(dsl_pool_t *dp, uint64_t txg) ++{ ++ tx_state_t *tx = &dp->dp_tx; ++ int c; ++ bzero(tx, sizeof (tx_state_t)); ++ ++ tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); ++ ++ for (c = 0; c < max_ncpus; c++) { ++ int i; ++ ++ mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); ++ for (i = 0; i < TXG_SIZE; i++) { ++ cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, ++ NULL); ++ list_create(&tx->tx_cpu[c].tc_callbacks[i], ++ sizeof (dmu_tx_callback_t), ++ offsetof(dmu_tx_callback_t, dcb_node)); ++ } ++ } ++ ++ mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); ++ cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); ++ cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); ++ cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); ++ cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); ++ ++ tx->tx_open_txg = txg; ++} ++ ++/* ++ * Close down the txg subsystem. ++ */ ++void ++txg_fini(dsl_pool_t *dp) ++{ ++ tx_state_t *tx = &dp->dp_tx; ++ int c; ++ ++ ASSERT(tx->tx_threads == 0); ++ ++ mutex_destroy(&tx->tx_sync_lock); ++ ++ cv_destroy(&tx->tx_sync_more_cv); ++ cv_destroy(&tx->tx_sync_done_cv); ++ cv_destroy(&tx->tx_quiesce_more_cv); ++ cv_destroy(&tx->tx_quiesce_done_cv); ++ cv_destroy(&tx->tx_exit_cv); ++ ++ for (c = 0; c < max_ncpus; c++) { ++ int i; ++ ++ mutex_destroy(&tx->tx_cpu[c].tc_lock); ++ for (i = 0; i < TXG_SIZE; i++) { ++ cv_destroy(&tx->tx_cpu[c].tc_cv[i]); ++ list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); ++ } ++ } ++ ++ if (tx->tx_commit_cb_taskq != NULL) ++ taskq_destroy(tx->tx_commit_cb_taskq); ++ ++ vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); ++ ++ bzero(tx, sizeof (tx_state_t)); ++} ++ ++/* ++ * Start syncing transaction groups. ++ */ ++void ++txg_sync_start(dsl_pool_t *dp) ++{ ++ tx_state_t *tx = &dp->dp_tx; ++ ++ mutex_enter(&tx->tx_sync_lock); ++ ++ dprintf("pool %p\n", dp); ++ ++ ASSERT(tx->tx_threads == 0); ++ ++ tx->tx_threads = 2; ++ ++ tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, ++ dp, 0, &p0, TS_RUN, minclsyspri); ++ ++ /* ++ * The sync thread can need a larger-than-default stack size on ++ * 32-bit x86. This is due in part to nested pools and ++ * scrub_visitbp() recursion. ++ */ ++ tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, ++ dp, 0, &p0, TS_RUN, minclsyspri); ++ ++ mutex_exit(&tx->tx_sync_lock); ++} ++ ++static void ++txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) ++{ ++ CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); ++ mutex_enter(&tx->tx_sync_lock); ++} ++ ++static void ++txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) ++{ ++ ASSERT(*tpp != NULL); ++ *tpp = NULL; ++ tx->tx_threads--; ++ cv_broadcast(&tx->tx_exit_cv); ++ CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ ++ thread_exit(); ++} ++ ++static void ++txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) ++{ ++ CALLB_CPR_SAFE_BEGIN(cpr); ++ ++ if (time) ++ (void) cv_timedwait_interruptible(cv, &tx->tx_sync_lock, ++ ddi_get_lbolt() + time); ++ else ++ cv_wait_interruptible(cv, &tx->tx_sync_lock); ++ ++ CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); ++} ++ ++/* ++ * Stop syncing transaction groups. ++ */ ++void ++txg_sync_stop(dsl_pool_t *dp) ++{ ++ tx_state_t *tx = &dp->dp_tx; ++ ++ dprintf("pool %p\n", dp); ++ /* ++ * Finish off any work in progress. ++ */ ++ ASSERT(tx->tx_threads == 2); ++ ++ /* ++ * We need to ensure that we've vacated the deferred space_maps. 
++ */ ++ txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); ++ ++ /* ++ * Wake all sync threads and wait for them to die. ++ */ ++ mutex_enter(&tx->tx_sync_lock); ++ ++ ASSERT(tx->tx_threads == 2); ++ ++ tx->tx_exiting = 1; ++ ++ cv_broadcast(&tx->tx_quiesce_more_cv); ++ cv_broadcast(&tx->tx_quiesce_done_cv); ++ cv_broadcast(&tx->tx_sync_more_cv); ++ ++ while (tx->tx_threads != 0) ++ cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); ++ ++ tx->tx_exiting = 0; ++ ++ mutex_exit(&tx->tx_sync_lock); ++} ++ ++uint64_t ++txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) ++{ ++ tx_state_t *tx = &dp->dp_tx; ++ tx_cpu_t *tc; ++ uint64_t txg; ++ ++ /* ++ * It appears the processor id is simply used as a "random" ++ * number to index into the array, and there isn't any other ++ * significance to the chosen tx_cpu. Because.. Why not use ++ * the current cpu to index into the array? ++ */ ++ kpreempt_disable(); ++ tc = &tx->tx_cpu[CPU_SEQID]; ++ kpreempt_enable(); ++ ++ mutex_enter(&tc->tc_lock); ++ ++ txg = tx->tx_open_txg; ++ tc->tc_count[txg & TXG_MASK]++; ++ ++ th->th_cpu = tc; ++ th->th_txg = txg; ++ ++ return (txg); ++} ++ ++void ++txg_rele_to_quiesce(txg_handle_t *th) ++{ ++ tx_cpu_t *tc = th->th_cpu; ++ ++ mutex_exit(&tc->tc_lock); ++} ++ ++void ++txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) ++{ ++ tx_cpu_t *tc = th->th_cpu; ++ int g = th->th_txg & TXG_MASK; ++ ++ mutex_enter(&tc->tc_lock); ++ list_move_tail(&tc->tc_callbacks[g], tx_callbacks); ++ mutex_exit(&tc->tc_lock); ++} ++ ++void ++txg_rele_to_sync(txg_handle_t *th) ++{ ++ tx_cpu_t *tc = th->th_cpu; ++ int g = th->th_txg & TXG_MASK; ++ ++ mutex_enter(&tc->tc_lock); ++ ASSERT(tc->tc_count[g] != 0); ++ if (--tc->tc_count[g] == 0) ++ cv_broadcast(&tc->tc_cv[g]); ++ mutex_exit(&tc->tc_lock); ++ ++ th->th_cpu = NULL; /* defensive */ ++} ++ ++static void ++txg_quiesce(dsl_pool_t *dp, uint64_t txg) ++{ ++ hrtime_t start; ++ txg_history_t *th; ++ tx_state_t *tx = &dp->dp_tx; ++ int g = txg & TXG_MASK; ++ int c; ++ ++ /* ++ * Grab all tx_cpu locks so nobody else can get into this txg. ++ */ ++ for (c = 0; c < max_ncpus; c++) ++ mutex_enter(&tx->tx_cpu[c].tc_lock); ++ ++ ASSERT(txg == tx->tx_open_txg); ++ tx->tx_open_txg++; ++ ++ /* ++ * Measure how long the txg was open and replace the kstat. ++ */ ++ th = dsl_pool_txg_history_get(dp, txg); ++ th->th_kstat.open_time = gethrtime() - th->th_kstat.birth; ++ th->th_kstat.state = TXG_STATE_QUIESCING; ++ dsl_pool_txg_history_put(th); ++ dsl_pool_txg_history_add(dp, tx->tx_open_txg); ++ ++ /* ++ * Now that we've incremented tx_open_txg, we can let threads ++ * enter the next transaction group. ++ */ ++ for (c = 0; c < max_ncpus; c++) ++ mutex_exit(&tx->tx_cpu[c].tc_lock); ++ ++ /* ++ * Quiesce the transaction group by waiting for everyone to txg_exit(). ++ */ ++ start = gethrtime(); ++ ++ for (c = 0; c < max_ncpus; c++) { ++ tx_cpu_t *tc = &tx->tx_cpu[c]; ++ mutex_enter(&tc->tc_lock); ++ while (tc->tc_count[g] != 0) ++ cv_wait(&tc->tc_cv[g], &tc->tc_lock); ++ mutex_exit(&tc->tc_lock); ++ } ++ ++ /* ++ * Measure how long the txg took to quiesce. ++ */ ++ th = dsl_pool_txg_history_get(dp, txg); ++ th->th_kstat.quiesce_time = gethrtime() - start; ++ dsl_pool_txg_history_put(th); ++} ++ ++static void ++txg_do_callbacks(list_t *cb_list) ++{ ++ dmu_tx_do_callbacks(cb_list, 0); ++ ++ list_destroy(cb_list); ++ ++ kmem_free(cb_list, sizeof (list_t)); ++} ++ ++/* ++ * Dispatch the commit callbacks registered on this txg to worker threads. 
++ */ ++static void ++txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) ++{ ++ int c; ++ tx_state_t *tx = &dp->dp_tx; ++ list_t *cb_list; ++ ++ for (c = 0; c < max_ncpus; c++) { ++ tx_cpu_t *tc = &tx->tx_cpu[c]; ++ /* No need to lock tx_cpu_t at this point */ ++ ++ int g = txg & TXG_MASK; ++ ++ if (list_is_empty(&tc->tc_callbacks[g])) ++ continue; ++ ++ if (tx->tx_commit_cb_taskq == NULL) { ++ /* ++ * Commit callback taskq hasn't been created yet. ++ */ ++ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", ++ 100, minclsyspri, max_ncpus, INT_MAX, ++ TASKQ_THREADS_CPU_PCT | TASKQ_PREPOPULATE); ++ } ++ ++ cb_list = kmem_alloc(sizeof (list_t), KM_PUSHPAGE); ++ list_create(cb_list, sizeof (dmu_tx_callback_t), ++ offsetof(dmu_tx_callback_t, dcb_node)); ++ ++ list_move_tail(cb_list, &tc->tc_callbacks[g]); ++ ++ (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) ++ txg_do_callbacks, cb_list, TQ_SLEEP); ++ } ++} ++ ++/* ++ * Wait for pending commit callbacks of already-synced transactions to finish ++ * processing. ++ * Calling this function from within a commit callback will deadlock. ++ */ ++void ++txg_wait_callbacks(dsl_pool_t *dp) ++{ ++ tx_state_t *tx = &dp->dp_tx; ++ ++ if (tx->tx_commit_cb_taskq != NULL) ++ taskq_wait(tx->tx_commit_cb_taskq); ++} ++ ++static void ++txg_sync_thread(dsl_pool_t *dp) ++{ ++ spa_t *spa = dp->dp_spa; ++ tx_state_t *tx = &dp->dp_tx; ++ callb_cpr_t cpr; ++ uint64_t start, delta; ++ ++#ifdef _KERNEL ++ /* ++ * Annotate this process with a flag that indicates that it is ++ * unsafe to use KM_SLEEP during memory allocations due to the ++ * potential for a deadlock. KM_PUSHPAGE should be used instead. ++ */ ++ current->flags |= PF_NOFS; ++#endif /* _KERNEL */ ++ ++ txg_thread_enter(tx, &cpr); ++ ++ start = delta = 0; ++ for (;;) { ++ hrtime_t hrstart; ++ txg_history_t *th; ++ uint64_t timer, timeout; ++ uint64_t txg; ++ ++ timeout = zfs_txg_timeout * hz; ++ ++ /* ++ * We sync when we're scanning, there's someone waiting ++ * on us, or the quiesce thread has handed off a txg to ++ * us, or we have reached our timeout. ++ */ ++ timer = (delta >= timeout ? 0 : timeout - delta); ++ while (!dsl_scan_active(dp->dp_scan) && ++ !tx->tx_exiting && timer > 0 && ++ tx->tx_synced_txg >= tx->tx_sync_txg_waiting && ++ tx->tx_quiesced_txg == 0) { ++ dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", ++ tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); ++ txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); ++ delta = ddi_get_lbolt() - start; ++ timer = (delta > timeout ? 0 : timeout - delta); ++ } ++ ++ /* ++ * Wait until the quiesce thread hands off a txg to us, ++ * prompting it to do so if necessary. ++ */ ++ while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { ++ if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) ++ tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; ++ cv_broadcast(&tx->tx_quiesce_more_cv); ++ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); ++ } ++ ++ if (tx->tx_exiting) ++ txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); ++ ++ /* ++ * Consume the quiesced txg which has been handed off to ++ * us. This may cause the quiescing thread to now be ++ * able to quiesce another txg, so we must signal it. 
++ */ ++ txg = tx->tx_quiesced_txg; ++ tx->tx_quiesced_txg = 0; ++ tx->tx_syncing_txg = txg; ++ cv_broadcast(&tx->tx_quiesce_more_cv); ++ ++ th = dsl_pool_txg_history_get(dp, txg); ++ th->th_kstat.state = TXG_STATE_SYNCING; ++ vdev_get_stats(spa->spa_root_vdev, &th->th_vs1); ++ dsl_pool_txg_history_put(th); ++ ++ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", ++ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); ++ mutex_exit(&tx->tx_sync_lock); ++ ++ start = ddi_get_lbolt(); ++ hrstart = gethrtime(); ++ spa_sync(spa, txg); ++ delta = ddi_get_lbolt() - start; ++ ++ mutex_enter(&tx->tx_sync_lock); ++ tx->tx_synced_txg = txg; ++ tx->tx_syncing_txg = 0; ++ cv_broadcast(&tx->tx_sync_done_cv); ++ ++ /* ++ * Dispatch commit callbacks to worker threads. ++ */ ++ txg_dispatch_callbacks(dp, txg); ++ ++ /* ++ * Measure the txg sync time determine the amount of I/O done. ++ */ ++ th = dsl_pool_txg_history_get(dp, txg); ++ vdev_get_stats(spa->spa_root_vdev, &th->th_vs2); ++ th->th_kstat.sync_time = gethrtime() - hrstart; ++ th->th_kstat.nread = th->th_vs2.vs_bytes[ZIO_TYPE_READ] - ++ th->th_vs1.vs_bytes[ZIO_TYPE_READ]; ++ th->th_kstat.nwritten = th->th_vs2.vs_bytes[ZIO_TYPE_WRITE] - ++ th->th_vs1.vs_bytes[ZIO_TYPE_WRITE]; ++ th->th_kstat.reads = th->th_vs2.vs_ops[ZIO_TYPE_READ] - ++ th->th_vs1.vs_ops[ZIO_TYPE_READ]; ++ th->th_kstat.writes = th->th_vs2.vs_ops[ZIO_TYPE_WRITE] - ++ th->th_vs1.vs_ops[ZIO_TYPE_WRITE]; ++ th->th_kstat.state = TXG_STATE_COMMITTED; ++ dsl_pool_txg_history_put(th); ++ } ++} ++ ++static void ++txg_quiesce_thread(dsl_pool_t *dp) ++{ ++ tx_state_t *tx = &dp->dp_tx; ++ callb_cpr_t cpr; ++ ++ txg_thread_enter(tx, &cpr); ++ ++ for (;;) { ++ uint64_t txg; ++ ++ /* ++ * We quiesce when there's someone waiting on us. ++ * However, we can only have one txg in "quiescing" or ++ * "quiesced, waiting to sync" state. So we wait until ++ * the "quiesced, waiting to sync" txg has been consumed ++ * by the sync thread. ++ */ ++ while (!tx->tx_exiting && ++ (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || ++ tx->tx_quiesced_txg != 0)) ++ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); ++ ++ if (tx->tx_exiting) ++ txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); ++ ++ txg = tx->tx_open_txg; ++ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", ++ txg, tx->tx_quiesce_txg_waiting, ++ tx->tx_sync_txg_waiting); ++ mutex_exit(&tx->tx_sync_lock); ++ txg_quiesce(dp, txg); ++ mutex_enter(&tx->tx_sync_lock); ++ ++ /* ++ * Hand this txg off to the sync thread. ++ */ ++ dprintf("quiesce done, handing off txg %llu\n", txg); ++ tx->tx_quiesced_txg = txg; ++ cv_broadcast(&tx->tx_sync_more_cv); ++ cv_broadcast(&tx->tx_quiesce_done_cv); ++ } ++} ++ ++/* ++ * Delay this thread by 'ticks' if we are still in the open transaction ++ * group and there is already a waiting txg quiesing or quiesced. Abort ++ * the delay if this txg stalls or enters the quiesing state. 
++ */ ++void ++txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) ++{ ++ tx_state_t *tx = &dp->dp_tx; ++ clock_t timeout = ddi_get_lbolt() + ticks; ++ ++ /* don't delay if this txg could transition to quiesing immediately */ ++ if (tx->tx_open_txg > txg || ++ tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) ++ return; ++ ++ mutex_enter(&tx->tx_sync_lock); ++ if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { ++ mutex_exit(&tx->tx_sync_lock); ++ return; ++ } ++ ++ while (ddi_get_lbolt() < timeout && ++ tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) ++ (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, ++ timeout); ++ ++ DMU_TX_STAT_BUMP(dmu_tx_delay); ++ ++ mutex_exit(&tx->tx_sync_lock); ++} ++ ++void ++txg_wait_synced(dsl_pool_t *dp, uint64_t txg) ++{ ++ tx_state_t *tx = &dp->dp_tx; ++ ++ mutex_enter(&tx->tx_sync_lock); ++ ASSERT(tx->tx_threads == 2); ++ if (txg == 0) ++ txg = tx->tx_open_txg + TXG_DEFER_SIZE; ++ if (tx->tx_sync_txg_waiting < txg) ++ tx->tx_sync_txg_waiting = txg; ++ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", ++ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); ++ while (tx->tx_synced_txg < txg) { ++ dprintf("broadcasting sync more " ++ "tx_synced=%llu waiting=%llu dp=%p\n", ++ tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); ++ cv_broadcast(&tx->tx_sync_more_cv); ++ cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); ++ } ++ mutex_exit(&tx->tx_sync_lock); ++} ++ ++void ++txg_wait_open(dsl_pool_t *dp, uint64_t txg) ++{ ++ tx_state_t *tx = &dp->dp_tx; ++ ++ mutex_enter(&tx->tx_sync_lock); ++ ASSERT(tx->tx_threads == 2); ++ if (txg == 0) ++ txg = tx->tx_open_txg + 1; ++ if (tx->tx_quiesce_txg_waiting < txg) ++ tx->tx_quiesce_txg_waiting = txg; ++ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", ++ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); ++ while (tx->tx_open_txg < txg) { ++ cv_broadcast(&tx->tx_quiesce_more_cv); ++ cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); ++ } ++ mutex_exit(&tx->tx_sync_lock); ++} ++ ++boolean_t ++txg_stalled(dsl_pool_t *dp) ++{ ++ tx_state_t *tx = &dp->dp_tx; ++ return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); ++} ++ ++boolean_t ++txg_sync_waiting(dsl_pool_t *dp) ++{ ++ tx_state_t *tx = &dp->dp_tx; ++ ++ return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || ++ tx->tx_quiesced_txg != 0); ++} ++ ++/* ++ * Per-txg object lists. ++ */ ++void ++txg_list_create(txg_list_t *tl, size_t offset) ++{ ++ int t; ++ ++ mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ tl->tl_offset = offset; ++ ++ for (t = 0; t < TXG_SIZE; t++) ++ tl->tl_head[t] = NULL; ++} ++ ++void ++txg_list_destroy(txg_list_t *tl) ++{ ++ int t; ++ ++ for (t = 0; t < TXG_SIZE; t++) ++ ASSERT(txg_list_empty(tl, t)); ++ ++ mutex_destroy(&tl->tl_lock); ++} ++ ++int ++txg_list_empty(txg_list_t *tl, uint64_t txg) ++{ ++ return (tl->tl_head[txg & TXG_MASK] == NULL); ++} ++ ++/* ++ * Add an entry to the list. ++ * Returns 0 if it's a new entry, 1 if it's already there. ++ */ ++int ++txg_list_add(txg_list_t *tl, void *p, uint64_t txg) ++{ ++ int t = txg & TXG_MASK; ++ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); ++ int already_on_list; ++ ++ mutex_enter(&tl->tl_lock); ++ already_on_list = tn->tn_member[t]; ++ if (!already_on_list) { ++ tn->tn_member[t] = 1; ++ tn->tn_next[t] = tl->tl_head[t]; ++ tl->tl_head[t] = tn; ++ } ++ mutex_exit(&tl->tl_lock); ++ ++ return (already_on_list); ++} ++ ++/* ++ * Add an entry to the end of the list (walks list to find end). 
++ * Returns 0 if it's a new entry, 1 if it's already there. ++ */ ++int ++txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) ++{ ++ int t = txg & TXG_MASK; ++ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); ++ int already_on_list; ++ ++ mutex_enter(&tl->tl_lock); ++ already_on_list = tn->tn_member[t]; ++ if (!already_on_list) { ++ txg_node_t **tp; ++ ++ for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) ++ continue; ++ ++ tn->tn_member[t] = 1; ++ tn->tn_next[t] = NULL; ++ *tp = tn; ++ } ++ mutex_exit(&tl->tl_lock); ++ ++ return (already_on_list); ++} ++ ++/* ++ * Remove the head of the list and return it. ++ */ ++void * ++txg_list_remove(txg_list_t *tl, uint64_t txg) ++{ ++ int t = txg & TXG_MASK; ++ txg_node_t *tn; ++ void *p = NULL; ++ ++ mutex_enter(&tl->tl_lock); ++ if ((tn = tl->tl_head[t]) != NULL) { ++ p = (char *)tn - tl->tl_offset; ++ tl->tl_head[t] = tn->tn_next[t]; ++ tn->tn_next[t] = NULL; ++ tn->tn_member[t] = 0; ++ } ++ mutex_exit(&tl->tl_lock); ++ ++ return (p); ++} ++ ++/* ++ * Remove a specific item from the list and return it. ++ */ ++void * ++txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) ++{ ++ int t = txg & TXG_MASK; ++ txg_node_t *tn, **tp; ++ ++ mutex_enter(&tl->tl_lock); ++ ++ for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { ++ if ((char *)tn - tl->tl_offset == p) { ++ *tp = tn->tn_next[t]; ++ tn->tn_next[t] = NULL; ++ tn->tn_member[t] = 0; ++ mutex_exit(&tl->tl_lock); ++ return (p); ++ } ++ } ++ ++ mutex_exit(&tl->tl_lock); ++ ++ return (NULL); ++} ++ ++int ++txg_list_member(txg_list_t *tl, void *p, uint64_t txg) ++{ ++ int t = txg & TXG_MASK; ++ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); ++ ++ return (tn->tn_member[t]); ++} ++ ++/* ++ * Walk a txg list -- only safe if you know it's not changing. ++ */ ++void * ++txg_list_head(txg_list_t *tl, uint64_t txg) ++{ ++ int t = txg & TXG_MASK; ++ txg_node_t *tn = tl->tl_head[t]; ++ ++ return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); ++} ++ ++void * ++txg_list_next(txg_list_t *tl, void *p, uint64_t txg) ++{ ++ int t = txg & TXG_MASK; ++ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); ++ ++ tn = tn->tn_next[t]; ++ ++ return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(txg_init); ++EXPORT_SYMBOL(txg_fini); ++EXPORT_SYMBOL(txg_sync_start); ++EXPORT_SYMBOL(txg_sync_stop); ++EXPORT_SYMBOL(txg_hold_open); ++EXPORT_SYMBOL(txg_rele_to_quiesce); ++EXPORT_SYMBOL(txg_rele_to_sync); ++EXPORT_SYMBOL(txg_register_callbacks); ++EXPORT_SYMBOL(txg_delay); ++EXPORT_SYMBOL(txg_wait_synced); ++EXPORT_SYMBOL(txg_wait_open); ++EXPORT_SYMBOL(txg_wait_callbacks); ++EXPORT_SYMBOL(txg_stalled); ++EXPORT_SYMBOL(txg_sync_waiting); ++ ++module_param(zfs_txg_timeout, int, 0644); ++MODULE_PARM_DESC(zfs_txg_timeout, "Max seconds worth of delta per txg"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/uberblock.c linux-3.2.33-go/fs/zfs/zfs/uberblock.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/uberblock.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/uberblock.c 2012-11-16 23:25:34.349039334 +0100 +@@ -0,0 +1,61 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. 
++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++ ++int ++uberblock_verify(uberblock_t *ub) ++{ ++ if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) ++ byteswap_uint64_array(ub, sizeof (uberblock_t)); ++ ++ if (ub->ub_magic != UBERBLOCK_MAGIC) ++ return (EINVAL); ++ ++ return (0); ++} ++ ++/* ++ * Update the uberblock and return a boolean value indicating whether ++ * anything changed in this transaction group. ++ */ ++int ++uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg) ++{ ++ ASSERT(ub->ub_txg < txg); ++ ++ /* ++ * We explicitly do not set ub_version here, so that older versions ++ * continue to be written with the previous uberblock version. ++ */ ++ ub->ub_magic = UBERBLOCK_MAGIC; ++ ub->ub_txg = txg; ++ ub->ub_guid_sum = rvd->vdev_guid_sum; ++ ub->ub_timestamp = gethrestime_sec(); ++ ub->ub_software_version = SPA_VERSION; ++ ++ return (ub->ub_rootbp.blk_birth == txg); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/unique.c linux-3.2.33-go/fs/zfs/zfs/unique.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/unique.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/unique.c 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,116 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */ ++ ++ ++ ++#include ++#include ++#include ++ ++static avl_tree_t unique_avl; ++static kmutex_t unique_mtx; ++ ++typedef struct unique { ++ avl_node_t un_link; ++ uint64_t un_value; ++} unique_t; ++ ++#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1) ++ ++static int ++unique_compare(const void *a, const void *b) ++{ ++ const unique_t *una = a; ++ const unique_t *unb = b; ++ ++ if (una->un_value < unb->un_value) ++ return (-1); ++ if (una->un_value > unb->un_value) ++ return (+1); ++ return (0); ++} ++ ++void ++unique_init(void) ++{ ++ avl_create(&unique_avl, unique_compare, ++ sizeof (unique_t), offsetof(unique_t, un_link)); ++ mutex_init(&unique_mtx, NULL, MUTEX_DEFAULT, NULL); ++} ++ ++void ++unique_fini(void) ++{ ++ avl_destroy(&unique_avl); ++ mutex_destroy(&unique_mtx); ++} ++ ++uint64_t ++unique_create(void) ++{ ++ uint64_t value = unique_insert(0); ++ unique_remove(value); ++ return (value); ++} ++ ++uint64_t ++unique_insert(uint64_t value) ++{ ++ avl_index_t idx; ++ unique_t *un = kmem_alloc(sizeof (unique_t), KM_PUSHPAGE); ++ ++ un->un_value = value; ++ ++ mutex_enter(&unique_mtx); ++ while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK || ++ avl_find(&unique_avl, un, &idx)) { ++ mutex_exit(&unique_mtx); ++ (void) random_get_pseudo_bytes((void*)&un->un_value, ++ sizeof (un->un_value)); ++ un->un_value &= UNIQUE_MASK; ++ mutex_enter(&unique_mtx); ++ } ++ ++ avl_insert(&unique_avl, un, idx); ++ mutex_exit(&unique_mtx); ++ ++ return (un->un_value); ++} ++ ++void ++unique_remove(uint64_t value) ++{ ++ unique_t un_tofind; ++ unique_t *un; ++ ++ un_tofind.un_value = value; ++ mutex_enter(&unique_mtx); ++ un = avl_find(&unique_avl, &un_tofind, NULL); ++ if (un != NULL) { ++ avl_remove(&unique_avl, un); ++ kmem_free(un, sizeof (unique_t)); ++ } ++ mutex_exit(&unique_mtx); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/vdev.c linux-3.2.33-go/fs/zfs/zfs/vdev.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/vdev.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/vdev.c 2012-11-16 23:25:34.349039334 +0100 +@@ -0,0 +1,3207 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ * Copyright (c) 2012 by Delphix. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Virtual device management. 
++ */ ++ ++static vdev_ops_t *vdev_ops_table[] = { ++ &vdev_root_ops, ++ &vdev_raidz_ops, ++ &vdev_mirror_ops, ++ &vdev_replacing_ops, ++ &vdev_spare_ops, ++ &vdev_disk_ops, ++ &vdev_file_ops, ++ &vdev_missing_ops, ++ &vdev_hole_ops, ++ NULL ++}; ++ ++/* maximum scrub/resilver I/O queue per leaf vdev */ ++int zfs_scrub_limit = 10; ++ ++/* ++ * Given a vdev type, return the appropriate ops vector. ++ */ ++static vdev_ops_t * ++vdev_getops(const char *type) ++{ ++ vdev_ops_t *ops, **opspp; ++ ++ for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) ++ if (strcmp(ops->vdev_op_type, type) == 0) ++ break; ++ ++ return (ops); ++} ++ ++/* ++ * Default asize function: return the MAX of psize with the asize of ++ * all children. This is what's used by anything other than RAID-Z. ++ */ ++uint64_t ++vdev_default_asize(vdev_t *vd, uint64_t psize) ++{ ++ uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); ++ uint64_t csize; ++ int c; ++ ++ for (c = 0; c < vd->vdev_children; c++) { ++ csize = vdev_psize_to_asize(vd->vdev_child[c], psize); ++ asize = MAX(asize, csize); ++ } ++ ++ return (asize); ++} ++ ++/* ++ * Get the minimum allocatable size. We define the allocatable size as ++ * the vdev's asize rounded to the nearest metaslab. This allows us to ++ * replace or attach devices which don't have the same physical size but ++ * can still satisfy the same number of allocations. ++ */ ++uint64_t ++vdev_get_min_asize(vdev_t *vd) ++{ ++ vdev_t *pvd = vd->vdev_parent; ++ ++ /* ++ * If our parent is NULL (inactive spare or cache) or is the root, ++ * just return our own asize. ++ */ ++ if (pvd == NULL) ++ return (vd->vdev_asize); ++ ++ /* ++ * The top-level vdev just returns the allocatable size rounded ++ * to the nearest metaslab. ++ */ ++ if (vd == vd->vdev_top) ++ return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); ++ ++ /* ++ * The allocatable space for a raidz vdev is N * sizeof(smallest child), ++ * so each child must provide at least 1/Nth of its asize. 
++ */ ++ if (pvd->vdev_ops == &vdev_raidz_ops) ++ return (pvd->vdev_min_asize / pvd->vdev_children); ++ ++ return (pvd->vdev_min_asize); ++} ++ ++void ++vdev_set_min_asize(vdev_t *vd) ++{ ++ int c; ++ vd->vdev_min_asize = vdev_get_min_asize(vd); ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_set_min_asize(vd->vdev_child[c]); ++} ++ ++vdev_t * ++vdev_lookup_top(spa_t *spa, uint64_t vdev) ++{ ++ vdev_t *rvd = spa->spa_root_vdev; ++ ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); ++ ++ if (vdev < rvd->vdev_children) { ++ ASSERT(rvd->vdev_child[vdev] != NULL); ++ return (rvd->vdev_child[vdev]); ++ } ++ ++ return (NULL); ++} ++ ++vdev_t * ++vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) ++{ ++ vdev_t *mvd; ++ int c; ++ ++ if (vd->vdev_guid == guid) ++ return (vd); ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != ++ NULL) ++ return (mvd); ++ ++ return (NULL); ++} ++ ++void ++vdev_add_child(vdev_t *pvd, vdev_t *cvd) ++{ ++ size_t oldsize, newsize; ++ uint64_t id = cvd->vdev_id; ++ vdev_t **newchild; ++ ++ ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ++ ASSERT(cvd->vdev_parent == NULL); ++ ++ cvd->vdev_parent = pvd; ++ ++ if (pvd == NULL) ++ return; ++ ++ ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); ++ ++ oldsize = pvd->vdev_children * sizeof (vdev_t *); ++ pvd->vdev_children = MAX(pvd->vdev_children, id + 1); ++ newsize = pvd->vdev_children * sizeof (vdev_t *); ++ ++ newchild = kmem_zalloc(newsize, KM_PUSHPAGE); ++ if (pvd->vdev_child != NULL) { ++ bcopy(pvd->vdev_child, newchild, oldsize); ++ kmem_free(pvd->vdev_child, oldsize); ++ } ++ ++ pvd->vdev_child = newchild; ++ pvd->vdev_child[id] = cvd; ++ ++ cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ++ ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); ++ ++ /* ++ * Walk up all ancestors to update guid sum. ++ */ ++ for (; pvd != NULL; pvd = pvd->vdev_parent) ++ pvd->vdev_guid_sum += cvd->vdev_guid_sum; ++} ++ ++void ++vdev_remove_child(vdev_t *pvd, vdev_t *cvd) ++{ ++ int c; ++ uint_t id = cvd->vdev_id; ++ ++ ASSERT(cvd->vdev_parent == pvd); ++ ++ if (pvd == NULL) ++ return; ++ ++ ASSERT(id < pvd->vdev_children); ++ ASSERT(pvd->vdev_child[id] == cvd); ++ ++ pvd->vdev_child[id] = NULL; ++ cvd->vdev_parent = NULL; ++ ++ for (c = 0; c < pvd->vdev_children; c++) ++ if (pvd->vdev_child[c]) ++ break; ++ ++ if (c == pvd->vdev_children) { ++ kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); ++ pvd->vdev_child = NULL; ++ pvd->vdev_children = 0; ++ } ++ ++ /* ++ * Walk up all ancestors to update guid sum. ++ */ ++ for (; pvd != NULL; pvd = pvd->vdev_parent) ++ pvd->vdev_guid_sum -= cvd->vdev_guid_sum; ++} ++ ++/* ++ * Remove any holes in the child array. ++ */ ++void ++vdev_compact_children(vdev_t *pvd) ++{ ++ vdev_t **newchild, *cvd; ++ int oldc = pvd->vdev_children; ++ int newc; ++ int c; ++ ++ ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ++ ++ for (c = newc = 0; c < oldc; c++) ++ if (pvd->vdev_child[c]) ++ newc++; ++ ++ newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_PUSHPAGE); ++ ++ for (c = newc = 0; c < oldc; c++) { ++ if ((cvd = pvd->vdev_child[c]) != NULL) { ++ newchild[newc] = cvd; ++ cvd->vdev_id = newc++; ++ } ++ } ++ ++ kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); ++ pvd->vdev_child = newchild; ++ pvd->vdev_children = newc; ++} ++ ++/* ++ * Allocate and minimally initialize a vdev_t. 
++ */ ++vdev_t * ++vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) ++{ ++ vdev_t *vd; ++ int t; ++ ++ vd = kmem_zalloc(sizeof (vdev_t), KM_PUSHPAGE); ++ ++ if (spa->spa_root_vdev == NULL) { ++ ASSERT(ops == &vdev_root_ops); ++ spa->spa_root_vdev = vd; ++ spa->spa_load_guid = spa_generate_guid(NULL); ++ } ++ ++ if (guid == 0 && ops != &vdev_hole_ops) { ++ if (spa->spa_root_vdev == vd) { ++ /* ++ * The root vdev's guid will also be the pool guid, ++ * which must be unique among all pools. ++ */ ++ guid = spa_generate_guid(NULL); ++ } else { ++ /* ++ * Any other vdev's guid must be unique within the pool. ++ */ ++ guid = spa_generate_guid(spa); ++ } ++ ASSERT(!spa_guid_exists(spa_guid(spa), guid)); ++ } ++ ++ vd->vdev_spa = spa; ++ vd->vdev_id = id; ++ vd->vdev_guid = guid; ++ vd->vdev_guid_sum = guid; ++ vd->vdev_ops = ops; ++ vd->vdev_state = VDEV_STATE_CLOSED; ++ vd->vdev_ishole = (ops == &vdev_hole_ops); ++ ++ list_link_init(&vd->vdev_config_dirty_node); ++ list_link_init(&vd->vdev_state_dirty_node); ++ mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); ++ for (t = 0; t < DTL_TYPES; t++) { ++ space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0, ++ &vd->vdev_dtl_lock); ++ } ++ txg_list_create(&vd->vdev_ms_list, ++ offsetof(struct metaslab, ms_txg_node)); ++ txg_list_create(&vd->vdev_dtl_list, ++ offsetof(struct vdev, vdev_dtl_node)); ++ vd->vdev_stat.vs_timestamp = gethrtime(); ++ vdev_queue_init(vd); ++ vdev_cache_init(vd); ++ ++ return (vd); ++} ++ ++/* ++ * Allocate a new vdev. The 'alloctype' is used to control whether we are ++ * creating a new vdev or loading an existing one - the behavior is slightly ++ * different for each case. ++ */ ++int ++vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, ++ int alloctype) ++{ ++ vdev_ops_t *ops; ++ char *type; ++ uint64_t guid = 0, islog, nparity; ++ vdev_t *vd; ++ ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ++ ++ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) ++ return (EINVAL); ++ ++ if ((ops = vdev_getops(type)) == NULL) ++ return (EINVAL); ++ ++ /* ++ * If this is a load, get the vdev guid from the nvlist. ++ * Otherwise, vdev_alloc_common() will generate one for us. ++ */ ++ if (alloctype == VDEV_ALLOC_LOAD) { ++ uint64_t label_id; ++ ++ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || ++ label_id != id) ++ return (EINVAL); ++ ++ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) ++ return (EINVAL); ++ } else if (alloctype == VDEV_ALLOC_SPARE) { ++ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) ++ return (EINVAL); ++ } else if (alloctype == VDEV_ALLOC_L2CACHE) { ++ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) ++ return (EINVAL); ++ } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { ++ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) ++ return (EINVAL); ++ } ++ ++ /* ++ * The first allocated vdev must be of type 'root'. ++ */ ++ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) ++ return (EINVAL); ++ ++ /* ++ * Determine whether we're a log vdev. ++ */ ++ islog = 0; ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); ++ if (islog && spa_version(spa) < SPA_VERSION_SLOGS) ++ return (ENOTSUP); ++ ++ if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) ++ return (ENOTSUP); ++ ++ /* ++ * Set the nparity property for RAID-Z vdevs. 
++ */ ++ nparity = -1ULL; ++ if (ops == &vdev_raidz_ops) { ++ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, ++ &nparity) == 0) { ++ if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) ++ return (EINVAL); ++ /* ++ * Previous versions could only support 1 or 2 parity ++ * device. ++ */ ++ if (nparity > 1 && ++ spa_version(spa) < SPA_VERSION_RAIDZ2) ++ return (ENOTSUP); ++ if (nparity > 2 && ++ spa_version(spa) < SPA_VERSION_RAIDZ3) ++ return (ENOTSUP); ++ } else { ++ /* ++ * We require the parity to be specified for SPAs that ++ * support multiple parity levels. ++ */ ++ if (spa_version(spa) >= SPA_VERSION_RAIDZ2) ++ return (EINVAL); ++ /* ++ * Otherwise, we default to 1 parity device for RAID-Z. ++ */ ++ nparity = 1; ++ } ++ } else { ++ nparity = 0; ++ } ++ ASSERT(nparity != -1ULL); ++ ++ vd = vdev_alloc_common(spa, id, guid, ops); ++ ++ vd->vdev_islog = islog; ++ vd->vdev_nparity = nparity; ++ ++ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) ++ vd->vdev_path = spa_strdup(vd->vdev_path); ++ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) ++ vd->vdev_devid = spa_strdup(vd->vdev_devid); ++ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, ++ &vd->vdev_physpath) == 0) ++ vd->vdev_physpath = spa_strdup(vd->vdev_physpath); ++ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) ++ vd->vdev_fru = spa_strdup(vd->vdev_fru); ++ ++ /* ++ * Set the whole_disk property. If it's not specified, leave the value ++ * as -1. ++ */ ++ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, ++ &vd->vdev_wholedisk) != 0) ++ vd->vdev_wholedisk = -1ULL; ++ ++ /* ++ * Look for the 'not present' flag. This will only be set if the device ++ * was not present at the time of import. ++ */ ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, ++ &vd->vdev_not_present); ++ ++ /* ++ * Get the alignment requirement. ++ */ ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); ++ ++ /* ++ * Retrieve the vdev creation time. ++ */ ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, ++ &vd->vdev_crtxg); ++ ++ /* ++ * If we're a top-level vdev, try to load the allocation parameters. ++ */ ++ if (parent && !parent->vdev_parent && ++ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, ++ &vd->vdev_ms_array); ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, ++ &vd->vdev_ms_shift); ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, ++ &vd->vdev_asize); ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, ++ &vd->vdev_removing); ++ } ++ ++ if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { ++ ASSERT(alloctype == VDEV_ALLOC_LOAD || ++ alloctype == VDEV_ALLOC_ADD || ++ alloctype == VDEV_ALLOC_SPLIT || ++ alloctype == VDEV_ALLOC_ROOTPOOL); ++ vd->vdev_mg = metaslab_group_create(islog ? ++ spa_log_class(spa) : spa_normal_class(spa), vd); ++ } ++ ++ /* ++ * If we're a leaf vdev, try to load the DTL object and other state. 
++ */ ++ if (vd->vdev_ops->vdev_op_leaf && ++ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || ++ alloctype == VDEV_ALLOC_ROOTPOOL)) { ++ if (alloctype == VDEV_ALLOC_LOAD) { ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, ++ &vd->vdev_dtl_smo.smo_object); ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, ++ &vd->vdev_unspare); ++ } ++ ++ if (alloctype == VDEV_ALLOC_ROOTPOOL) { ++ uint64_t spare = 0; ++ ++ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, ++ &spare) == 0 && spare) ++ spa_spare_add(vd); ++ } ++ ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, ++ &vd->vdev_offline); ++ ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING, ++ &vd->vdev_resilvering); ++ ++ /* ++ * When importing a pool, we want to ignore the persistent fault ++ * state, as the diagnosis made on another system may not be ++ * valid in the current context. Local vdevs will ++ * remain in the faulted state. ++ */ ++ if (spa_load_state(spa) == SPA_LOAD_OPEN) { ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, ++ &vd->vdev_faulted); ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, ++ &vd->vdev_degraded); ++ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, ++ &vd->vdev_removed); ++ ++ if (vd->vdev_faulted || vd->vdev_degraded) { ++ char *aux; ++ ++ vd->vdev_label_aux = ++ VDEV_AUX_ERR_EXCEEDED; ++ if (nvlist_lookup_string(nv, ++ ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && ++ strcmp(aux, "external") == 0) ++ vd->vdev_label_aux = VDEV_AUX_EXTERNAL; ++ } ++ } ++ } ++ ++ /* ++ * Add ourselves to the parent's list of children. ++ */ ++ vdev_add_child(parent, vd); ++ ++ *vdp = vd; ++ ++ return (0); ++} ++ ++void ++vdev_free(vdev_t *vd) ++{ ++ int c, t; ++ spa_t *spa = vd->vdev_spa; ++ ++ /* ++ * vdev_free() implies closing the vdev first. This is simpler than ++ * trying to ensure complicated semantics for all callers. ++ */ ++ vdev_close(vd); ++ ++ ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ++ ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); ++ ++ /* ++ * Free all children. ++ */ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_free(vd->vdev_child[c]); ++ ++ ASSERT(vd->vdev_child == NULL); ++ ASSERT(vd->vdev_guid_sum == vd->vdev_guid); ++ ++ /* ++ * Discard allocation state. ++ */ ++ if (vd->vdev_mg != NULL) { ++ vdev_metaslab_fini(vd); ++ metaslab_group_destroy(vd->vdev_mg); ++ } ++ ++ ASSERT3U(vd->vdev_stat.vs_space, ==, 0); ++ ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); ++ ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); ++ ++ /* ++ * Remove this vdev from its parent's child list. ++ */ ++ vdev_remove_child(vd->vdev_parent, vd); ++ ++ ASSERT(vd->vdev_parent == NULL); ++ ++ /* ++ * Clean up vdev structure. 
++ */ ++ vdev_queue_fini(vd); ++ vdev_cache_fini(vd); ++ ++ if (vd->vdev_path) ++ spa_strfree(vd->vdev_path); ++ if (vd->vdev_devid) ++ spa_strfree(vd->vdev_devid); ++ if (vd->vdev_physpath) ++ spa_strfree(vd->vdev_physpath); ++ if (vd->vdev_fru) ++ spa_strfree(vd->vdev_fru); ++ ++ if (vd->vdev_isspare) ++ spa_spare_remove(vd); ++ if (vd->vdev_isl2cache) ++ spa_l2cache_remove(vd); ++ ++ txg_list_destroy(&vd->vdev_ms_list); ++ txg_list_destroy(&vd->vdev_dtl_list); ++ ++ mutex_enter(&vd->vdev_dtl_lock); ++ for (t = 0; t < DTL_TYPES; t++) { ++ space_map_unload(&vd->vdev_dtl[t]); ++ space_map_destroy(&vd->vdev_dtl[t]); ++ } ++ mutex_exit(&vd->vdev_dtl_lock); ++ ++ mutex_destroy(&vd->vdev_dtl_lock); ++ mutex_destroy(&vd->vdev_stat_lock); ++ mutex_destroy(&vd->vdev_probe_lock); ++ ++ if (vd == spa->spa_root_vdev) ++ spa->spa_root_vdev = NULL; ++ ++ kmem_free(vd, sizeof (vdev_t)); ++} ++ ++/* ++ * Transfer top-level vdev state from svd to tvd. ++ */ ++static void ++vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ++{ ++ spa_t *spa = svd->vdev_spa; ++ metaslab_t *msp; ++ vdev_t *vd; ++ int t; ++ ++ ASSERT(tvd == tvd->vdev_top); ++ ++ tvd->vdev_ms_array = svd->vdev_ms_array; ++ tvd->vdev_ms_shift = svd->vdev_ms_shift; ++ tvd->vdev_ms_count = svd->vdev_ms_count; ++ ++ svd->vdev_ms_array = 0; ++ svd->vdev_ms_shift = 0; ++ svd->vdev_ms_count = 0; ++ ++ if (tvd->vdev_mg) ++ ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); ++ tvd->vdev_mg = svd->vdev_mg; ++ tvd->vdev_ms = svd->vdev_ms; ++ ++ svd->vdev_mg = NULL; ++ svd->vdev_ms = NULL; ++ ++ if (tvd->vdev_mg != NULL) ++ tvd->vdev_mg->mg_vd = tvd; ++ ++ tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; ++ tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; ++ tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; ++ ++ svd->vdev_stat.vs_alloc = 0; ++ svd->vdev_stat.vs_space = 0; ++ svd->vdev_stat.vs_dspace = 0; ++ ++ for (t = 0; t < TXG_SIZE; t++) { ++ while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) ++ (void) txg_list_add(&tvd->vdev_ms_list, msp, t); ++ while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) ++ (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); ++ if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) ++ (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); ++ } ++ ++ if (list_link_active(&svd->vdev_config_dirty_node)) { ++ vdev_config_clean(svd); ++ vdev_config_dirty(tvd); ++ } ++ ++ if (list_link_active(&svd->vdev_state_dirty_node)) { ++ vdev_state_clean(svd); ++ vdev_state_dirty(tvd); ++ } ++ ++ tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; ++ svd->vdev_deflate_ratio = 0; ++ ++ tvd->vdev_islog = svd->vdev_islog; ++ svd->vdev_islog = 0; ++} ++ ++static void ++vdev_top_update(vdev_t *tvd, vdev_t *vd) ++{ ++ int c; ++ ++ if (vd == NULL) ++ return; ++ ++ vd->vdev_top = tvd; ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_top_update(tvd, vd->vdev_child[c]); ++} ++ ++/* ++ * Add a mirror/replacing vdev above an existing vdev. 
++ */ ++vdev_t * ++vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) ++{ ++ spa_t *spa = cvd->vdev_spa; ++ vdev_t *pvd = cvd->vdev_parent; ++ vdev_t *mvd; ++ ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ++ ++ mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); ++ ++ mvd->vdev_asize = cvd->vdev_asize; ++ mvd->vdev_min_asize = cvd->vdev_min_asize; ++ mvd->vdev_max_asize = cvd->vdev_max_asize; ++ mvd->vdev_ashift = cvd->vdev_ashift; ++ mvd->vdev_state = cvd->vdev_state; ++ mvd->vdev_crtxg = cvd->vdev_crtxg; ++ ++ vdev_remove_child(pvd, cvd); ++ vdev_add_child(pvd, mvd); ++ cvd->vdev_id = mvd->vdev_children; ++ vdev_add_child(mvd, cvd); ++ vdev_top_update(cvd->vdev_top, cvd->vdev_top); ++ ++ if (mvd == mvd->vdev_top) ++ vdev_top_transfer(cvd, mvd); ++ ++ return (mvd); ++} ++ ++/* ++ * Remove a 1-way mirror/replacing vdev from the tree. ++ */ ++void ++vdev_remove_parent(vdev_t *cvd) ++{ ++ vdev_t *mvd = cvd->vdev_parent; ++ vdev_t *pvd = mvd->vdev_parent; ++ ++ ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ++ ++ ASSERT(mvd->vdev_children == 1); ++ ASSERT(mvd->vdev_ops == &vdev_mirror_ops || ++ mvd->vdev_ops == &vdev_replacing_ops || ++ mvd->vdev_ops == &vdev_spare_ops); ++ cvd->vdev_ashift = mvd->vdev_ashift; ++ ++ vdev_remove_child(mvd, cvd); ++ vdev_remove_child(pvd, mvd); ++ ++ /* ++ * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. ++ * Otherwise, we could have detached an offline device, and when we ++ * go to import the pool we'll think we have two top-level vdevs, ++ * instead of a different version of the same top-level vdev. ++ */ ++ if (mvd->vdev_top == mvd) { ++ uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; ++ cvd->vdev_orig_guid = cvd->vdev_guid; ++ cvd->vdev_guid += guid_delta; ++ cvd->vdev_guid_sum += guid_delta; ++ } ++ cvd->vdev_id = mvd->vdev_id; ++ vdev_add_child(pvd, cvd); ++ vdev_top_update(cvd->vdev_top, cvd->vdev_top); ++ ++ if (cvd == cvd->vdev_top) ++ vdev_top_transfer(mvd, cvd); ++ ++ ASSERT(mvd->vdev_children == 0); ++ vdev_free(mvd); ++} ++ ++int ++vdev_metaslab_init(vdev_t *vd, uint64_t txg) ++{ ++ spa_t *spa = vd->vdev_spa; ++ objset_t *mos = spa->spa_meta_objset; ++ uint64_t m; ++ uint64_t oldc = vd->vdev_ms_count; ++ uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; ++ metaslab_t **mspp; ++ int error; ++ ++ ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); ++ ++ /* ++ * This vdev is not being allocated from yet or is a hole. ++ */ ++ if (vd->vdev_ms_shift == 0) ++ return (0); ++ ++ ASSERT(!vd->vdev_ishole); ++ ++ /* ++ * Compute the raidz-deflation ratio. Note, we hard-code ++ * in 128k (1 << 17) because it is the current "typical" blocksize. ++ * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change, ++ * or we will inconsistently account for existing bp's. 
++ */ ++ vd->vdev_deflate_ratio = (1 << 17) / ++ (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); ++ ++ ASSERT(oldc <= newc); ++ ++ mspp = kmem_zalloc(newc * sizeof (*mspp), KM_PUSHPAGE | KM_NODEBUG); ++ ++ if (oldc != 0) { ++ bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); ++ kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); ++ } ++ ++ vd->vdev_ms = mspp; ++ vd->vdev_ms_count = newc; ++ ++ for (m = oldc; m < newc; m++) { ++ space_map_obj_t smo = { 0, 0, 0 }; ++ if (txg == 0) { ++ uint64_t object = 0; ++ error = dmu_read(mos, vd->vdev_ms_array, ++ m * sizeof (uint64_t), sizeof (uint64_t), &object, ++ DMU_READ_PREFETCH); ++ if (error) ++ return (error); ++ if (object != 0) { ++ dmu_buf_t *db; ++ error = dmu_bonus_hold(mos, object, FTAG, &db); ++ if (error) ++ return (error); ++ ASSERT3U(db->db_size, >=, sizeof (smo)); ++ bcopy(db->db_data, &smo, sizeof (smo)); ++ ASSERT3U(smo.smo_object, ==, object); ++ dmu_buf_rele(db, FTAG); ++ } ++ } ++ vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, ++ m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); ++ } ++ ++ if (txg == 0) ++ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); ++ ++ /* ++ * If the vdev is being removed we don't activate ++ * the metaslabs since we want to ensure that no new ++ * allocations are performed on this device. ++ */ ++ if (oldc == 0 && !vd->vdev_removing) ++ metaslab_group_activate(vd->vdev_mg); ++ ++ if (txg == 0) ++ spa_config_exit(spa, SCL_ALLOC, FTAG); ++ ++ return (0); ++} ++ ++void ++vdev_metaslab_fini(vdev_t *vd) ++{ ++ uint64_t m; ++ uint64_t count = vd->vdev_ms_count; ++ ++ if (vd->vdev_ms != NULL) { ++ metaslab_group_passivate(vd->vdev_mg); ++ for (m = 0; m < count; m++) ++ if (vd->vdev_ms[m] != NULL) ++ metaslab_fini(vd->vdev_ms[m]); ++ kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); ++ vd->vdev_ms = NULL; ++ } ++ ++ ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); ++} ++ ++typedef struct vdev_probe_stats { ++ boolean_t vps_readable; ++ boolean_t vps_writeable; ++ int vps_flags; ++} vdev_probe_stats_t; ++ ++static void ++vdev_probe_done(zio_t *zio) ++{ ++ spa_t *spa = zio->io_spa; ++ vdev_t *vd = zio->io_vd; ++ vdev_probe_stats_t *vps = zio->io_private; ++ ++ ASSERT(vd->vdev_probe_zio != NULL); ++ ++ if (zio->io_type == ZIO_TYPE_READ) { ++ if (zio->io_error == 0) ++ vps->vps_readable = 1; ++ if (zio->io_error == 0 && spa_writeable(spa)) { ++ zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, ++ zio->io_offset, zio->io_size, zio->io_data, ++ ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ++ ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); ++ } else { ++ zio_buf_free(zio->io_data, zio->io_size); ++ } ++ } else if (zio->io_type == ZIO_TYPE_WRITE) { ++ if (zio->io_error == 0) ++ vps->vps_writeable = 1; ++ zio_buf_free(zio->io_data, zio->io_size); ++ } else if (zio->io_type == ZIO_TYPE_NULL) { ++ zio_t *pio; ++ ++ vd->vdev_cant_read |= !vps->vps_readable; ++ vd->vdev_cant_write |= !vps->vps_writeable; ++ ++ if (vdev_readable(vd) && ++ (vdev_writeable(vd) || !spa_writeable(spa))) { ++ zio->io_error = 0; ++ } else { ++ ASSERT(zio->io_error != 0); ++ zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, ++ spa, vd, NULL, 0, 0); ++ zio->io_error = ENXIO; ++ } ++ ++ mutex_enter(&vd->vdev_probe_lock); ++ ASSERT(vd->vdev_probe_zio == zio); ++ vd->vdev_probe_zio = NULL; ++ mutex_exit(&vd->vdev_probe_lock); ++ ++ while ((pio = zio_walk_parents(zio)) != NULL) ++ if (!vdev_accessible(vd, pio)) ++ pio->io_error = ENXIO; ++ ++ kmem_free(vps, sizeof (*vps)); ++ } ++} ++ ++/* ++ * Determine whether this device is accessible by 
reading and writing ++ * to several known locations: the pad regions of each vdev label ++ * but the first (which we leave alone in case it contains a VTOC). ++ */ ++zio_t * ++vdev_probe(vdev_t *vd, zio_t *zio) ++{ ++ spa_t *spa = vd->vdev_spa; ++ vdev_probe_stats_t *vps = NULL; ++ zio_t *pio; ++ int l; ++ ++ ASSERT(vd->vdev_ops->vdev_op_leaf); ++ ++ /* ++ * Don't probe the probe. ++ */ ++ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) ++ return (NULL); ++ ++ /* ++ * To prevent 'probe storms' when a device fails, we create ++ * just one probe i/o at a time. All zios that want to probe ++ * this vdev will become parents of the probe io. ++ */ ++ mutex_enter(&vd->vdev_probe_lock); ++ ++ if ((pio = vd->vdev_probe_zio) == NULL) { ++ vps = kmem_zalloc(sizeof (*vps), KM_PUSHPAGE); ++ ++ vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ++ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ++ ZIO_FLAG_TRYHARD; ++ ++ if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { ++ /* ++ * vdev_cant_read and vdev_cant_write can only ++ * transition from TRUE to FALSE when we have the ++ * SCL_ZIO lock as writer; otherwise they can only ++ * transition from FALSE to TRUE. This ensures that ++ * any zio looking at these values can assume that ++ * failures persist for the life of the I/O. That's ++ * important because when a device has intermittent ++ * connectivity problems, we want to ensure that ++ * they're ascribed to the device (ENXIO) and not ++ * the zio (EIO). ++ * ++ * Since we hold SCL_ZIO as writer here, clear both ++ * values so the probe can reevaluate from first ++ * principles. ++ */ ++ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; ++ vd->vdev_cant_read = B_FALSE; ++ vd->vdev_cant_write = B_FALSE; ++ } ++ ++ vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, ++ vdev_probe_done, vps, ++ vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); ++ ++ /* ++ * We can't change the vdev state in this context, so we ++ * kick off an async task to do it on our behalf. ++ */ ++ if (zio != NULL) { ++ vd->vdev_probe_wanted = B_TRUE; ++ spa_async_request(spa, SPA_ASYNC_PROBE); ++ } ++ } ++ ++ if (zio != NULL) ++ zio_add_child(zio, pio); ++ ++ mutex_exit(&vd->vdev_probe_lock); ++ ++ if (vps == NULL) { ++ ASSERT(zio != NULL); ++ return (NULL); ++ } ++ ++ for (l = 1; l < VDEV_LABELS; l++) { ++ zio_nowait(zio_read_phys(pio, vd, ++ vdev_label_offset(vd->vdev_psize, l, ++ offsetof(vdev_label_t, vl_pad2)), ++ VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), ++ ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ++ ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); ++ } ++ ++ if (zio == NULL) ++ return (pio); ++ ++ zio_nowait(pio); ++ return (NULL); ++} ++ ++static void ++vdev_open_child(void *arg) ++{ ++ vdev_t *vd = arg; ++ ++ vd->vdev_open_thread = curthread; ++ vd->vdev_open_error = vdev_open(vd); ++ vd->vdev_open_thread = NULL; ++} ++ ++boolean_t ++vdev_uses_zvols(vdev_t *vd) ++{ ++/* ++ * Stacking zpools on top of zvols is unsupported until we implement a method ++ * for determining if an arbitrary block device is a zvol without using the ++ * path. Solaris would check the 'zvol' path component but this does not ++ * exist in the Linux port, so we really should do something like stat the ++ * file and check the major number. This is complicated by the fact that ++ * we need to do this portably in user or kernel space. 
++ */ ++#if 0 ++ int c; ++ ++ if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, ++ strlen(ZVOL_DIR)) == 0) ++ return (B_TRUE); ++ for (c = 0; c < vd->vdev_children; c++) ++ if (vdev_uses_zvols(vd->vdev_child[c])) ++ return (B_TRUE); ++#endif ++ return (B_FALSE); ++} ++ ++void ++vdev_open_children(vdev_t *vd) ++{ ++ taskq_t *tq; ++ int children = vd->vdev_children; ++ int c; ++ ++ /* ++ * in order to handle pools on top of zvols, do the opens ++ * in a single thread so that the same thread holds the ++ * spa_namespace_lock ++ */ ++ if (vdev_uses_zvols(vd)) { ++ for (c = 0; c < children; c++) ++ vd->vdev_child[c]->vdev_open_error = ++ vdev_open(vd->vdev_child[c]); ++ return; ++ } ++ tq = taskq_create("vdev_open", children, minclsyspri, ++ children, children, TASKQ_PREPOPULATE); ++ ++ for (c = 0; c < children; c++) ++ VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], ++ TQ_SLEEP) != 0); ++ ++ taskq_destroy(tq); ++} ++ ++/* ++ * Prepare a virtual device for access. ++ */ ++int ++vdev_open(vdev_t *vd) ++{ ++ spa_t *spa = vd->vdev_spa; ++ int error; ++ uint64_t osize = 0; ++ uint64_t max_osize = 0; ++ uint64_t asize, max_asize, psize; ++ uint64_t ashift = 0; ++ int c; ++ ++ ASSERT(vd->vdev_open_thread == curthread || ++ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ++ ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || ++ vd->vdev_state == VDEV_STATE_CANT_OPEN || ++ vd->vdev_state == VDEV_STATE_OFFLINE); ++ ++ vd->vdev_stat.vs_aux = VDEV_AUX_NONE; ++ vd->vdev_cant_read = B_FALSE; ++ vd->vdev_cant_write = B_FALSE; ++ vd->vdev_min_asize = vdev_get_min_asize(vd); ++ ++ /* ++ * If this vdev is not removed, check its fault status. If it's ++ * faulted, bail out of the open. ++ */ ++ if (!vd->vdev_removed && vd->vdev_faulted) { ++ ASSERT(vd->vdev_children == 0); ++ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || ++ vd->vdev_label_aux == VDEV_AUX_EXTERNAL); ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, ++ vd->vdev_label_aux); ++ return (ENXIO); ++ } else if (vd->vdev_offline) { ++ ASSERT(vd->vdev_children == 0); ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); ++ return (ENXIO); ++ } ++ ++ error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift); ++ ++ /* ++ * Reset the vdev_reopening flag so that we actually close ++ * the vdev on error. ++ */ ++ vd->vdev_reopening = B_FALSE; ++ if (zio_injection_enabled && error == 0) ++ error = zio_handle_device_injection(vd, NULL, ENXIO); ++ ++ if (error) { ++ if (vd->vdev_removed && ++ vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) ++ vd->vdev_removed = B_FALSE; ++ ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, ++ vd->vdev_stat.vs_aux); ++ return (error); ++ } ++ ++ vd->vdev_removed = B_FALSE; ++ ++ /* ++ * Recheck the faulted flag now that we have confirmed that ++ * the vdev is accessible. If we're faulted, bail. ++ */ ++ if (vd->vdev_faulted) { ++ ASSERT(vd->vdev_children == 0); ++ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || ++ vd->vdev_label_aux == VDEV_AUX_EXTERNAL); ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, ++ vd->vdev_label_aux); ++ return (ENXIO); ++ } ++ ++ if (vd->vdev_degraded) { ++ ASSERT(vd->vdev_children == 0); ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, ++ VDEV_AUX_ERR_EXCEEDED); ++ } else { ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); ++ } ++ ++ /* ++ * For hole or missing vdevs we just return success. 
++ */ ++ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) ++ return (0); ++ ++ for (c = 0; c < vd->vdev_children; c++) { ++ if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, ++ VDEV_AUX_NONE); ++ break; ++ } ++ } ++ ++ osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); ++ max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); ++ ++ if (vd->vdev_children == 0) { ++ if (osize < SPA_MINDEVSIZE) { ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_TOO_SMALL); ++ return (EOVERFLOW); ++ } ++ psize = osize; ++ asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); ++ max_asize = max_osize - (VDEV_LABEL_START_SIZE + ++ VDEV_LABEL_END_SIZE); ++ } else { ++ if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - ++ (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_TOO_SMALL); ++ return (EOVERFLOW); ++ } ++ psize = 0; ++ asize = osize; ++ max_asize = max_osize; ++ } ++ ++ vd->vdev_psize = psize; ++ ++ /* ++ * Make sure the allocatable size hasn't shrunk. ++ */ ++ if (asize < vd->vdev_min_asize) { ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_BAD_LABEL); ++ return (EINVAL); ++ } ++ ++ if (vd->vdev_asize == 0) { ++ /* ++ * This is the first-ever open, so use the computed values. ++ * For testing purposes, a higher ashift can be requested. ++ */ ++ vd->vdev_asize = asize; ++ vd->vdev_max_asize = max_asize; ++ vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); ++ } else { ++ /* ++ * Make sure the alignment requirement hasn't increased. ++ */ ++ if (ashift > vd->vdev_top->vdev_ashift) { ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_BAD_LABEL); ++ return (EINVAL); ++ } ++ vd->vdev_max_asize = max_asize; ++ } ++ ++ /* ++ * If all children are healthy and the asize has increased, ++ * then we've experienced dynamic LUN growth. If automatic ++ * expansion is enabled then use the additional space. ++ */ ++ if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && ++ (vd->vdev_expanding || spa->spa_autoexpand)) ++ vd->vdev_asize = asize; ++ ++ vdev_set_min_asize(vd); ++ ++ /* ++ * Ensure we can issue some IO before declaring the ++ * vdev open for business. ++ */ ++ if (vd->vdev_ops->vdev_op_leaf && ++ (error = zio_wait(vdev_probe(vd, NULL))) != 0) { ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, ++ VDEV_AUX_ERR_EXCEEDED); ++ return (error); ++ } ++ ++ /* ++ * If a leaf vdev has a DTL, and seems healthy, then kick off a ++ * resilver. But don't do this if we are doing a reopen for a scrub, ++ * since this would just restart the scrub we are already doing. ++ */ ++ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && ++ vdev_resilver_needed(vd, NULL, NULL)) ++ spa_async_request(spa, SPA_ASYNC_RESILVER); ++ ++ return (0); ++} ++ ++/* ++ * Called once the vdevs are all opened, this routine validates the label ++ * contents. This needs to be done before vdev_load() so that we don't ++ * inadvertently do repair I/Os to the wrong device. ++ * ++ * If 'strict' is false ignore the spa guid check. This is necessary because ++ * if the machine crashed during a re-guid the new guid might have been written ++ * to all of the vdev labels, but not the cached config. The strict check ++ * will be performed when the pool is opened again using the mos config. ++ * ++ * This function will only return failure if one of the vdevs indicates that it ++ * has since been destroyed or exported. 
This is only possible if ++ * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state ++ * will be updated but the function will return 0. ++ */ ++int ++vdev_validate(vdev_t *vd, boolean_t strict) ++{ ++ spa_t *spa = vd->vdev_spa; ++ nvlist_t *label; ++ uint64_t guid = 0, top_guid; ++ uint64_t state; ++ int c; ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ if (vdev_validate(vd->vdev_child[c], strict) != 0) ++ return (EBADF); ++ ++ /* ++ * If the device has already failed, or was marked offline, don't do ++ * any further validation. Otherwise, label I/O will fail and we will ++ * overwrite the previous state. ++ */ ++ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { ++ uint64_t aux_guid = 0; ++ nvlist_t *nvl; ++ ++ if ((label = vdev_label_read_config(vd)) == NULL) { ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_BAD_LABEL); ++ return (0); ++ } ++ ++ /* ++ * Determine if this vdev has been split off into another ++ * pool. If so, then refuse to open it. ++ */ ++ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, ++ &aux_guid) == 0 && aux_guid == spa_guid(spa)) { ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_SPLIT_POOL); ++ nvlist_free(label); ++ return (0); ++ } ++ ++ if (strict && (nvlist_lookup_uint64(label, ++ ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || ++ guid != spa_guid(spa))) { ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_CORRUPT_DATA); ++ nvlist_free(label); ++ return (0); ++ } ++ ++ if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) ++ != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, ++ &aux_guid) != 0) ++ aux_guid = 0; ++ ++ /* ++ * If this vdev just became a top-level vdev because its ++ * sibling was detached, it will have adopted the parent's ++ * vdev guid -- but the label may or may not be on disk yet. ++ * Fortunately, either version of the label will have the ++ * same top guid, so if we're a top-level vdev, we can ++ * safely compare to that instead. ++ * ++ * If we split this vdev off instead, then we also check the ++ * original pool's guid. We don't want to consider the vdev ++ * corrupt if it is partway through a split operation. ++ */ ++ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, ++ &guid) != 0 || ++ nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, ++ &top_guid) != 0 || ++ ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && ++ (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_CORRUPT_DATA); ++ nvlist_free(label); ++ return (0); ++ } ++ ++ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, ++ &state) != 0) { ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_CORRUPT_DATA); ++ nvlist_free(label); ++ return (0); ++ } ++ ++ nvlist_free(label); ++ ++ /* ++ * If this is a verbatim import, no need to check the ++ * state of the pool. ++ */ ++ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && ++ spa_load_state(spa) == SPA_LOAD_OPEN && ++ state != POOL_STATE_ACTIVE) ++ return (EBADF); ++ ++ /* ++ * If we were able to open and validate a vdev that was ++ * previously marked permanently unavailable, clear that state ++ * now. ++ */ ++ if (vd->vdev_not_present) ++ vd->vdev_not_present = 0; ++ } ++ ++ return (0); ++} ++ ++/* ++ * Close a virtual device. 
++ */ ++void ++vdev_close(vdev_t *vd) ++{ ++ vdev_t *pvd = vd->vdev_parent; ++ ASSERTV(spa_t *spa = vd->vdev_spa); ++ ++ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ++ ++ /* ++ * If our parent is reopening, then we are as well, unless we are ++ * going offline. ++ */ ++ if (pvd != NULL && pvd->vdev_reopening) ++ vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); ++ ++ vd->vdev_ops->vdev_op_close(vd); ++ ++ vdev_cache_purge(vd); ++ ++ /* ++ * We record the previous state before we close it, so that if we are ++ * doing a reopen(), we don't generate FMA ereports if we notice that ++ * it's still faulted. ++ */ ++ vd->vdev_prevstate = vd->vdev_state; ++ ++ if (vd->vdev_offline) ++ vd->vdev_state = VDEV_STATE_OFFLINE; ++ else ++ vd->vdev_state = VDEV_STATE_CLOSED; ++ vd->vdev_stat.vs_aux = VDEV_AUX_NONE; ++} ++ ++void ++vdev_hold(vdev_t *vd) ++{ ++ spa_t *spa = vd->vdev_spa; ++ int c; ++ ++ ASSERT(spa_is_root(spa)); ++ if (spa->spa_state == POOL_STATE_UNINITIALIZED) ++ return; ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_hold(vd->vdev_child[c]); ++ ++ if (vd->vdev_ops->vdev_op_leaf) ++ vd->vdev_ops->vdev_op_hold(vd); ++} ++ ++void ++vdev_rele(vdev_t *vd) ++{ ++ int c; ++ ++ ASSERT(spa_is_root(vd->vdev_spa)); ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_rele(vd->vdev_child[c]); ++ ++ if (vd->vdev_ops->vdev_op_leaf) ++ vd->vdev_ops->vdev_op_rele(vd); ++} ++ ++/* ++ * Reopen all interior vdevs and any unopened leaves. We don't actually ++ * reopen leaf vdevs which had previously been opened as they might deadlock ++ * on the spa_config_lock. Instead we only obtain the leaf's physical size. ++ * If the leaf has never been opened then open it, as usual. ++ */ ++void ++vdev_reopen(vdev_t *vd) ++{ ++ spa_t *spa = vd->vdev_spa; ++ ++ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ++ ++ /* set the reopening flag unless we're taking the vdev offline */ ++ vd->vdev_reopening = !vd->vdev_offline; ++ vdev_close(vd); ++ (void) vdev_open(vd); ++ ++ /* ++ * Call vdev_validate() here to make sure we have the same device. ++ * Otherwise, a device with an invalid label could be successfully ++ * opened in response to vdev_reopen(). ++ */ ++ if (vd->vdev_aux) { ++ (void) vdev_validate_aux(vd); ++ if (vdev_readable(vd) && vdev_writeable(vd) && ++ vd->vdev_aux == &spa->spa_l2cache && ++ !l2arc_vdev_present(vd)) ++ l2arc_add_vdev(spa, vd); ++ } else { ++ (void) vdev_validate(vd, B_TRUE); ++ } ++ ++ /* ++ * Reassess parent vdev's health. ++ */ ++ vdev_propagate_state(vd); ++} ++ ++int ++vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) ++{ ++ int error; ++ ++ /* ++ * Normally, partial opens (e.g. of a mirror) are allowed. ++ * For a create, however, we want to fail the request if ++ * there are any components we can't open. ++ */ ++ error = vdev_open(vd); ++ ++ if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { ++ vdev_close(vd); ++ return (error ? error : ENXIO); ++ } ++ ++ /* ++ * Recursively initialize all labels. ++ */ ++ if ((error = vdev_label_init(vd, txg, isreplacing ? ++ VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { ++ vdev_close(vd); ++ return (error); ++ } ++ ++ return (0); ++} ++ ++void ++vdev_metaslab_set_size(vdev_t *vd) ++{ ++ /* ++ * Aim for roughly 200 metaslabs per vdev. 
++ */ ++ vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); ++ vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); ++} ++ ++void ++vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) ++{ ++ ASSERT(vd == vd->vdev_top); ++ ASSERT(!vd->vdev_ishole); ++ ASSERT(ISP2(flags)); ++ ASSERT(spa_writeable(vd->vdev_spa)); ++ ++ if (flags & VDD_METASLAB) ++ (void) txg_list_add(&vd->vdev_ms_list, arg, txg); ++ ++ if (flags & VDD_DTL) ++ (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); ++ ++ (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); ++} ++ ++/* ++ * DTLs. ++ * ++ * A vdev's DTL (dirty time log) is the set of transaction groups for which ++ * the vdev has less than perfect replication. There are four kinds of DTL: ++ * ++ * DTL_MISSING: txgs for which the vdev has no valid copies of the data ++ * ++ * DTL_PARTIAL: txgs for which data is available, but not fully replicated ++ * ++ * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon ++ * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of ++ * txgs that was scrubbed. ++ * ++ * DTL_OUTAGE: txgs which cannot currently be read, whether due to ++ * persistent errors or just some device being offline. ++ * Unlike the other three, the DTL_OUTAGE map is not generally ++ * maintained; it's only computed when needed, typically to ++ * determine whether a device can be detached. ++ * ++ * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device ++ * either has the data or it doesn't. ++ * ++ * For interior vdevs such as mirror and RAID-Z the picture is more complex. ++ * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because ++ * if any child is less than fully replicated, then so is its parent. ++ * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, ++ * comprising only those txgs which appear in 'maxfaults' or more children; ++ * those are the txgs we don't have enough replication to read. For example, ++ * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); ++ * thus, its DTL_MISSING consists of the set of txgs that appear in more than ++ * two child DTL_MISSING maps. ++ * ++ * It should be clear from the above that to compute the DTLs and outage maps ++ * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. ++ * Therefore, that is all we keep on disk. When loading the pool, or after ++ * a configuration change, we generate all other DTLs from first principles. 
++ */ ++void ++vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) ++{ ++ space_map_t *sm = &vd->vdev_dtl[t]; ++ ++ ASSERT(t < DTL_TYPES); ++ ASSERT(vd != vd->vdev_spa->spa_root_vdev); ++ ASSERT(spa_writeable(vd->vdev_spa)); ++ ++ mutex_enter(sm->sm_lock); ++ if (!space_map_contains(sm, txg, size)) ++ space_map_add(sm, txg, size); ++ mutex_exit(sm->sm_lock); ++} ++ ++boolean_t ++vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) ++{ ++ space_map_t *sm = &vd->vdev_dtl[t]; ++ boolean_t dirty = B_FALSE; ++ ++ ASSERT(t < DTL_TYPES); ++ ASSERT(vd != vd->vdev_spa->spa_root_vdev); ++ ++ mutex_enter(sm->sm_lock); ++ if (sm->sm_space != 0) ++ dirty = space_map_contains(sm, txg, size); ++ mutex_exit(sm->sm_lock); ++ ++ return (dirty); ++} ++ ++boolean_t ++vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) ++{ ++ space_map_t *sm = &vd->vdev_dtl[t]; ++ boolean_t empty; ++ ++ mutex_enter(sm->sm_lock); ++ empty = (sm->sm_space == 0); ++ mutex_exit(sm->sm_lock); ++ ++ return (empty); ++} ++ ++/* ++ * Reassess DTLs after a config change or scrub completion. ++ */ ++void ++vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) ++{ ++ spa_t *spa = vd->vdev_spa; ++ avl_tree_t reftree; ++ int c, t, minref; ++ ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_dtl_reassess(vd->vdev_child[c], txg, ++ scrub_txg, scrub_done); ++ ++ if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) ++ return; ++ ++ if (vd->vdev_ops->vdev_op_leaf) { ++ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; ++ ++ mutex_enter(&vd->vdev_dtl_lock); ++ if (scrub_txg != 0 && ++ (spa->spa_scrub_started || ++ (scn && scn->scn_phys.scn_errors == 0))) { ++ /* ++ * We completed a scrub up to scrub_txg. If we ++ * did it without rebooting, then the scrub dtl ++ * will be valid, so excise the old region and ++ * fold in the scrub dtl. Otherwise, leave the ++ * dtl as-is if there was an error. ++ * ++ * There's little trick here: to excise the beginning ++ * of the DTL_MISSING map, we put it into a reference ++ * tree and then add a segment with refcnt -1 that ++ * covers the range [0, scrub_txg). This means ++ * that each txg in that range has refcnt -1 or 0. ++ * We then add DTL_SCRUB with a refcnt of 2, so that ++ * entries in the range [0, scrub_txg) will have a ++ * positive refcnt -- either 1 or 2. We then convert ++ * the reference tree into the new DTL_MISSING map. 
++ */ ++ space_map_ref_create(&reftree); ++ space_map_ref_add_map(&reftree, ++ &vd->vdev_dtl[DTL_MISSING], 1); ++ space_map_ref_add_seg(&reftree, 0, scrub_txg, -1); ++ space_map_ref_add_map(&reftree, ++ &vd->vdev_dtl[DTL_SCRUB], 2); ++ space_map_ref_generate_map(&reftree, ++ &vd->vdev_dtl[DTL_MISSING], 1); ++ space_map_ref_destroy(&reftree); ++ } ++ space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); ++ space_map_walk(&vd->vdev_dtl[DTL_MISSING], ++ space_map_add, &vd->vdev_dtl[DTL_PARTIAL]); ++ if (scrub_done) ++ space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL); ++ space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); ++ if (!vdev_readable(vd)) ++ space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); ++ else ++ space_map_walk(&vd->vdev_dtl[DTL_MISSING], ++ space_map_add, &vd->vdev_dtl[DTL_OUTAGE]); ++ mutex_exit(&vd->vdev_dtl_lock); ++ ++ if (txg != 0) ++ vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); ++ return; ++ } ++ ++ mutex_enter(&vd->vdev_dtl_lock); ++ for (t = 0; t < DTL_TYPES; t++) { ++ /* account for child's outage in parent's missing map */ ++ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; ++ if (t == DTL_SCRUB) ++ continue; /* leaf vdevs only */ ++ if (t == DTL_PARTIAL) ++ minref = 1; /* i.e. non-zero */ ++ else if (vd->vdev_nparity != 0) ++ minref = vd->vdev_nparity + 1; /* RAID-Z */ ++ else ++ minref = vd->vdev_children; /* any kind of mirror */ ++ space_map_ref_create(&reftree); ++ for (c = 0; c < vd->vdev_children; c++) { ++ vdev_t *cvd = vd->vdev_child[c]; ++ mutex_enter(&cvd->vdev_dtl_lock); ++ space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1); ++ mutex_exit(&cvd->vdev_dtl_lock); ++ } ++ space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref); ++ space_map_ref_destroy(&reftree); ++ } ++ mutex_exit(&vd->vdev_dtl_lock); ++} ++ ++static int ++vdev_dtl_load(vdev_t *vd) ++{ ++ spa_t *spa = vd->vdev_spa; ++ space_map_obj_t *smo = &vd->vdev_dtl_smo; ++ objset_t *mos = spa->spa_meta_objset; ++ dmu_buf_t *db; ++ int error; ++ ++ ASSERT(vd->vdev_children == 0); ++ ++ if (smo->smo_object == 0) ++ return (0); ++ ++ ASSERT(!vd->vdev_ishole); ++ ++ if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) ++ return (error); ++ ++ ASSERT3U(db->db_size, >=, sizeof (*smo)); ++ bcopy(db->db_data, smo, sizeof (*smo)); ++ dmu_buf_rele(db, FTAG); ++ ++ mutex_enter(&vd->vdev_dtl_lock); ++ error = space_map_load(&vd->vdev_dtl[DTL_MISSING], ++ NULL, SM_ALLOC, smo, mos); ++ mutex_exit(&vd->vdev_dtl_lock); ++ ++ return (error); ++} ++ ++void ++vdev_dtl_sync(vdev_t *vd, uint64_t txg) ++{ ++ spa_t *spa = vd->vdev_spa; ++ space_map_obj_t *smo = &vd->vdev_dtl_smo; ++ space_map_t *sm = &vd->vdev_dtl[DTL_MISSING]; ++ objset_t *mos = spa->spa_meta_objset; ++ space_map_t smsync; ++ kmutex_t smlock; ++ dmu_buf_t *db; ++ dmu_tx_t *tx; ++ ++ ASSERT(!vd->vdev_ishole); ++ ++ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); ++ ++ if (vd->vdev_detached) { ++ if (smo->smo_object != 0) { ++ VERIFY(0 == dmu_object_free(mos, smo->smo_object, tx)); ++ smo->smo_object = 0; ++ } ++ dmu_tx_commit(tx); ++ return; ++ } ++ ++ if (smo->smo_object == 0) { ++ ASSERT(smo->smo_objsize == 0); ++ ASSERT(smo->smo_alloc == 0); ++ smo->smo_object = dmu_object_alloc(mos, ++ DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, ++ DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); ++ ASSERT(smo->smo_object != 0); ++ vdev_config_dirty(vd->vdev_top); ++ } ++ ++ mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); ++ ++ space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, ++ &smlock); ++ ++ mutex_enter(&smlock); 
++ ++ mutex_enter(&vd->vdev_dtl_lock); ++ space_map_walk(sm, space_map_add, &smsync); ++ mutex_exit(&vd->vdev_dtl_lock); ++ ++ space_map_truncate(smo, mos, tx); ++ space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); ++ ++ space_map_destroy(&smsync); ++ ++ mutex_exit(&smlock); ++ mutex_destroy(&smlock); ++ ++ VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); ++ dmu_buf_will_dirty(db, tx); ++ ASSERT3U(db->db_size, >=, sizeof (*smo)); ++ bcopy(smo, db->db_data, sizeof (*smo)); ++ dmu_buf_rele(db, FTAG); ++ ++ dmu_tx_commit(tx); ++} ++ ++/* ++ * Determine whether the specified vdev can be offlined/detached/removed ++ * without losing data. ++ */ ++boolean_t ++vdev_dtl_required(vdev_t *vd) ++{ ++ spa_t *spa = vd->vdev_spa; ++ vdev_t *tvd = vd->vdev_top; ++ uint8_t cant_read = vd->vdev_cant_read; ++ boolean_t required; ++ ++ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ++ ++ if (vd == spa->spa_root_vdev || vd == tvd) ++ return (B_TRUE); ++ ++ /* ++ * Temporarily mark the device as unreadable, and then determine ++ * whether this results in any DTL outages in the top-level vdev. ++ * If not, we can safely offline/detach/remove the device. ++ */ ++ vd->vdev_cant_read = B_TRUE; ++ vdev_dtl_reassess(tvd, 0, 0, B_FALSE); ++ required = !vdev_dtl_empty(tvd, DTL_OUTAGE); ++ vd->vdev_cant_read = cant_read; ++ vdev_dtl_reassess(tvd, 0, 0, B_FALSE); ++ ++ if (!required && zio_injection_enabled) ++ required = !!zio_handle_device_injection(vd, NULL, ECHILD); ++ ++ return (required); ++} ++ ++/* ++ * Determine if resilver is needed, and if so the txg range. ++ */ ++boolean_t ++vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) ++{ ++ boolean_t needed = B_FALSE; ++ uint64_t thismin = UINT64_MAX; ++ uint64_t thismax = 0; ++ int c; ++ ++ if (vd->vdev_children == 0) { ++ mutex_enter(&vd->vdev_dtl_lock); ++ if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 && ++ vdev_writeable(vd)) { ++ space_seg_t *ss; ++ ++ ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root); ++ thismin = ss->ss_start - 1; ++ ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root); ++ thismax = ss->ss_end; ++ needed = B_TRUE; ++ } ++ mutex_exit(&vd->vdev_dtl_lock); ++ } else { ++ for (c = 0; c < vd->vdev_children; c++) { ++ vdev_t *cvd = vd->vdev_child[c]; ++ uint64_t cmin, cmax; ++ ++ if (vdev_resilver_needed(cvd, &cmin, &cmax)) { ++ thismin = MIN(thismin, cmin); ++ thismax = MAX(thismax, cmax); ++ needed = B_TRUE; ++ } ++ } ++ } ++ ++ if (needed && minp) { ++ *minp = thismin; ++ *maxp = thismax; ++ } ++ return (needed); ++} ++ ++void ++vdev_load(vdev_t *vd) ++{ ++ int c; ++ ++ /* ++ * Recursively load all children. ++ */ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_load(vd->vdev_child[c]); ++ ++ /* ++ * If this is a top-level vdev, initialize its metaslabs. ++ */ ++ if (vd == vd->vdev_top && !vd->vdev_ishole && ++ (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || ++ vdev_metaslab_init(vd, 0) != 0)) ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_CORRUPT_DATA); ++ ++ /* ++ * If this is a leaf vdev, load its DTL. ++ */ ++ if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_CORRUPT_DATA); ++} ++ ++/* ++ * The special vdev case is used for hot spares and l2cache devices. Its ++ * sole purpose it to set the vdev state for the associated vdev. 
To do this, ++ * we make sure that we can open the underlying device, then try to read the ++ * label, and make sure that the label is sane and that it hasn't been ++ * repurposed to another pool. ++ */ ++int ++vdev_validate_aux(vdev_t *vd) ++{ ++ nvlist_t *label; ++ uint64_t guid, version; ++ uint64_t state; ++ ++ if (!vdev_readable(vd)) ++ return (0); ++ ++ if ((label = vdev_label_read_config(vd)) == NULL) { ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_CORRUPT_DATA); ++ return (-1); ++ } ++ ++ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || ++ version > SPA_VERSION || ++ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || ++ guid != vd->vdev_guid || ++ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { ++ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_CORRUPT_DATA); ++ nvlist_free(label); ++ return (-1); ++ } ++ ++ /* ++ * We don't actually check the pool state here. If it's in fact in ++ * use by another pool, we update this fact on the fly when requested. ++ */ ++ nvlist_free(label); ++ return (0); ++} ++ ++void ++vdev_remove(vdev_t *vd, uint64_t txg) ++{ ++ spa_t *spa = vd->vdev_spa; ++ objset_t *mos = spa->spa_meta_objset; ++ dmu_tx_t *tx; ++ int m; ++ ++ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); ++ ++ if (vd->vdev_dtl_smo.smo_object) { ++ ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0); ++ (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx); ++ vd->vdev_dtl_smo.smo_object = 0; ++ } ++ ++ if (vd->vdev_ms != NULL) { ++ for (m = 0; m < vd->vdev_ms_count; m++) { ++ metaslab_t *msp = vd->vdev_ms[m]; ++ ++ if (msp == NULL || msp->ms_smo.smo_object == 0) ++ continue; ++ ++ ASSERT3U(msp->ms_smo.smo_alloc, ==, 0); ++ (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx); ++ msp->ms_smo.smo_object = 0; ++ } ++ } ++ ++ if (vd->vdev_ms_array) { ++ (void) dmu_object_free(mos, vd->vdev_ms_array, tx); ++ vd->vdev_ms_array = 0; ++ vd->vdev_ms_shift = 0; ++ } ++ dmu_tx_commit(tx); ++} ++ ++void ++vdev_sync_done(vdev_t *vd, uint64_t txg) ++{ ++ metaslab_t *msp; ++ boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ++ ++ ASSERT(!vd->vdev_ishole); ++ ++ while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))) ++ metaslab_sync_done(msp, txg); ++ ++ if (reassess) ++ metaslab_sync_reassess(vd->vdev_mg); ++} ++ ++void ++vdev_sync(vdev_t *vd, uint64_t txg) ++{ ++ spa_t *spa = vd->vdev_spa; ++ vdev_t *lvd; ++ metaslab_t *msp; ++ dmu_tx_t *tx; ++ ++ ASSERT(!vd->vdev_ishole); ++ ++ if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { ++ ASSERT(vd == vd->vdev_top); ++ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); ++ vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, ++ DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ++ ASSERT(vd->vdev_ms_array != 0); ++ vdev_config_dirty(vd); ++ dmu_tx_commit(tx); ++ } ++ ++ /* ++ * Remove the metadata associated with this vdev once it's empty. ++ */ ++ if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) ++ vdev_remove(vd, txg); ++ ++ while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { ++ metaslab_sync(msp, txg); ++ (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); ++ } ++ ++ while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) ++ vdev_dtl_sync(lvd, txg); ++ ++ (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); ++} ++ ++uint64_t ++vdev_psize_to_asize(vdev_t *vd, uint64_t psize) ++{ ++ return (vd->vdev_ops->vdev_op_asize(vd, psize)); ++} ++ ++/* ++ * Mark the given vdev faulted. 
A faulted vdev behaves as if the device could ++ * not be opened, and no I/O is attempted. ++ */ ++int ++vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) ++{ ++ vdev_t *vd, *tvd; ++ ++ spa_vdev_state_enter(spa, SCL_NONE); ++ ++ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) ++ return (spa_vdev_state_exit(spa, NULL, ENODEV)); ++ ++ if (!vd->vdev_ops->vdev_op_leaf) ++ return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); ++ ++ tvd = vd->vdev_top; ++ ++ /* ++ * We don't directly use the aux state here, but if we do a ++ * vdev_reopen(), we need this value to be present to remember why we ++ * were faulted. ++ */ ++ vd->vdev_label_aux = aux; ++ ++ /* ++ * Faulted state takes precedence over degraded. ++ */ ++ vd->vdev_delayed_close = B_FALSE; ++ vd->vdev_faulted = 1ULL; ++ vd->vdev_degraded = 0ULL; ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); ++ ++ /* ++ * If this device has the only valid copy of the data, then ++ * back off and simply mark the vdev as degraded instead. ++ */ ++ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { ++ vd->vdev_degraded = 1ULL; ++ vd->vdev_faulted = 0ULL; ++ ++ /* ++ * If we reopen the device and it's not dead, only then do we ++ * mark it degraded. ++ */ ++ vdev_reopen(tvd); ++ ++ if (vdev_readable(vd)) ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); ++ } ++ ++ return (spa_vdev_state_exit(spa, vd, 0)); ++} ++ ++/* ++ * Mark the given vdev degraded. A degraded vdev is purely an indication to the ++ * user that something is wrong. The vdev continues to operate as normal as far ++ * as I/O is concerned. ++ */ ++int ++vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) ++{ ++ vdev_t *vd; ++ ++ spa_vdev_state_enter(spa, SCL_NONE); ++ ++ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) ++ return (spa_vdev_state_exit(spa, NULL, ENODEV)); ++ ++ if (!vd->vdev_ops->vdev_op_leaf) ++ return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); ++ ++ /* ++ * If the vdev is already faulted, then don't do anything. ++ */ ++ if (vd->vdev_faulted || vd->vdev_degraded) ++ return (spa_vdev_state_exit(spa, NULL, 0)); ++ ++ vd->vdev_degraded = 1ULL; ++ if (!vdev_is_dead(vd)) ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, ++ aux); ++ ++ return (spa_vdev_state_exit(spa, vd, 0)); ++} ++ ++/* ++ * Online the given vdev. If 'unspare' is set, it implies two things. First, ++ * any attached spare device should be detached when the device finishes ++ * resilvering. Second, the online should be treated like a 'test' online case, ++ * so no FMA events are generated if the device fails to open. 
++ */ ++int ++vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) ++{ ++ vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; ++ ++ spa_vdev_state_enter(spa, SCL_NONE); ++ ++ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) ++ return (spa_vdev_state_exit(spa, NULL, ENODEV)); ++ ++ if (!vd->vdev_ops->vdev_op_leaf) ++ return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); ++ ++ tvd = vd->vdev_top; ++ vd->vdev_offline = B_FALSE; ++ vd->vdev_tmpoffline = B_FALSE; ++ vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); ++ vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); ++ ++ /* XXX - L2ARC 1.0 does not support expansion */ ++ if (!vd->vdev_aux) { ++ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) ++ pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); ++ } ++ ++ vdev_reopen(tvd); ++ vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; ++ ++ if (!vd->vdev_aux) { ++ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) ++ pvd->vdev_expanding = B_FALSE; ++ } ++ ++ if (newstate) ++ *newstate = vd->vdev_state; ++ if ((flags & ZFS_ONLINE_UNSPARE) && ++ !vdev_is_dead(vd) && vd->vdev_parent && ++ vd->vdev_parent->vdev_ops == &vdev_spare_ops && ++ vd->vdev_parent->vdev_child[0] == vd) ++ vd->vdev_unspare = B_TRUE; ++ ++ if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { ++ ++ /* XXX - L2ARC 1.0 does not support expansion */ ++ if (vd->vdev_aux) ++ return (spa_vdev_state_exit(spa, vd, ENOTSUP)); ++ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); ++ } ++ return (spa_vdev_state_exit(spa, vd, 0)); ++} ++ ++static int ++vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) ++{ ++ vdev_t *vd, *tvd; ++ int error = 0; ++ uint64_t generation; ++ metaslab_group_t *mg; ++ ++top: ++ spa_vdev_state_enter(spa, SCL_ALLOC); ++ ++ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) ++ return (spa_vdev_state_exit(spa, NULL, ENODEV)); ++ ++ if (!vd->vdev_ops->vdev_op_leaf) ++ return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); ++ ++ tvd = vd->vdev_top; ++ mg = tvd->vdev_mg; ++ generation = spa->spa_config_generation + 1; ++ ++ /* ++ * If the device isn't already offline, try to offline it. ++ */ ++ if (!vd->vdev_offline) { ++ /* ++ * If this device has the only valid copy of some data, ++ * don't allow it to be offlined. Log devices are always ++ * expendable. ++ */ ++ if (!tvd->vdev_islog && vd->vdev_aux == NULL && ++ vdev_dtl_required(vd)) ++ return (spa_vdev_state_exit(spa, NULL, EBUSY)); ++ ++ /* ++ * If the top-level is a slog and it has had allocations ++ * then proceed. We check that the vdev's metaslab group ++ * is not NULL since it's possible that we may have just ++ * added this vdev but not yet initialized its metaslabs. ++ */ ++ if (tvd->vdev_islog && mg != NULL) { ++ /* ++ * Prevent any future allocations. ++ */ ++ metaslab_group_passivate(mg); ++ (void) spa_vdev_state_exit(spa, vd, 0); ++ ++ error = spa_offline_log(spa); ++ ++ spa_vdev_state_enter(spa, SCL_ALLOC); ++ ++ /* ++ * Check to see if the config has changed. ++ */ ++ if (error || generation != spa->spa_config_generation) { ++ metaslab_group_activate(mg); ++ if (error) ++ return (spa_vdev_state_exit(spa, ++ vd, error)); ++ (void) spa_vdev_state_exit(spa, vd, 0); ++ goto top; ++ } ++ ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0); ++ } ++ ++ /* ++ * Offline this device and reopen its top-level vdev. ++ * If the top-level vdev is a log device then just offline ++ * it. Otherwise, if this action results in the top-level ++ * vdev becoming unusable, undo it and fail the request. 
++ */ ++ vd->vdev_offline = B_TRUE; ++ vdev_reopen(tvd); ++ ++ if (!tvd->vdev_islog && vd->vdev_aux == NULL && ++ vdev_is_dead(tvd)) { ++ vd->vdev_offline = B_FALSE; ++ vdev_reopen(tvd); ++ return (spa_vdev_state_exit(spa, NULL, EBUSY)); ++ } ++ ++ /* ++ * Add the device back into the metaslab rotor so that ++ * once we online the device it's open for business. ++ */ ++ if (tvd->vdev_islog && mg != NULL) ++ metaslab_group_activate(mg); ++ } ++ ++ vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); ++ ++ return (spa_vdev_state_exit(spa, vd, 0)); ++} ++ ++int ++vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) ++{ ++ int error; ++ ++ mutex_enter(&spa->spa_vdev_top_lock); ++ error = vdev_offline_locked(spa, guid, flags); ++ mutex_exit(&spa->spa_vdev_top_lock); ++ ++ return (error); ++} ++ ++/* ++ * Clear the error counts associated with this vdev. Unlike vdev_online() and ++ * vdev_offline(), we assume the spa config is locked. We also clear all ++ * children. If 'vd' is NULL, then the user wants to clear all vdevs. ++ */ ++void ++vdev_clear(spa_t *spa, vdev_t *vd) ++{ ++ vdev_t *rvd = spa->spa_root_vdev; ++ int c; ++ ++ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ++ ++ if (vd == NULL) ++ vd = rvd; ++ ++ vd->vdev_stat.vs_read_errors = 0; ++ vd->vdev_stat.vs_write_errors = 0; ++ vd->vdev_stat.vs_checksum_errors = 0; ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_clear(spa, vd->vdev_child[c]); ++ ++ /* ++ * If we're in the FAULTED state or have experienced failed I/O, then ++ * clear the persistent state and attempt to reopen the device. We ++ * also mark the vdev config dirty, so that the new faulted state is ++ * written out to disk. ++ */ ++ if (vd->vdev_faulted || vd->vdev_degraded || ++ !vdev_readable(vd) || !vdev_writeable(vd)) { ++ ++ /* ++ * When reopening in reponse to a clear event, it may be due to ++ * a fmadm repair request. In this case, if the device is ++ * still broken, we want to still post the ereport again. ++ */ ++ vd->vdev_forcefault = B_TRUE; ++ ++ vd->vdev_faulted = vd->vdev_degraded = 0ULL; ++ vd->vdev_cant_read = B_FALSE; ++ vd->vdev_cant_write = B_FALSE; ++ ++ vdev_reopen(vd == rvd ? rvd : vd->vdev_top); ++ ++ vd->vdev_forcefault = B_FALSE; ++ ++ if (vd != rvd && vdev_writeable(vd->vdev_top)) ++ vdev_state_dirty(vd->vdev_top); ++ ++ if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) ++ spa_async_request(spa, SPA_ASYNC_RESILVER); ++ ++ spa_event_notify(spa, vd, FM_EREPORT_ZFS_DEVICE_CLEAR); ++ } ++ ++ /* ++ * When clearing a FMA-diagnosed fault, we always want to ++ * unspare the device, as we assume that the original spare was ++ * done in response to the FMA fault. ++ */ ++ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && ++ vd->vdev_parent->vdev_ops == &vdev_spare_ops && ++ vd->vdev_parent->vdev_child[0] == vd) ++ vd->vdev_unspare = B_TRUE; ++} ++ ++boolean_t ++vdev_is_dead(vdev_t *vd) ++{ ++ /* ++ * Holes and missing devices are always considered "dead". ++ * This simplifies the code since we don't have to check for ++ * these types of devices in the various code paths. ++ * Instead we rely on the fact that we skip over dead devices ++ * before issuing I/O to them. 
++ */ ++ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || ++ vd->vdev_ops == &vdev_missing_ops); ++} ++ ++boolean_t ++vdev_readable(vdev_t *vd) ++{ ++ return (!vdev_is_dead(vd) && !vd->vdev_cant_read); ++} ++ ++boolean_t ++vdev_writeable(vdev_t *vd) ++{ ++ return (!vdev_is_dead(vd) && !vd->vdev_cant_write); ++} ++ ++boolean_t ++vdev_allocatable(vdev_t *vd) ++{ ++ uint64_t state = vd->vdev_state; ++ ++ /* ++ * We currently allow allocations from vdevs which may be in the ++ * process of reopening (i.e. VDEV_STATE_CLOSED). If the device ++ * fails to reopen then we'll catch it later when we're holding ++ * the proper locks. Note that we have to get the vdev state ++ * in a local variable because although it changes atomically, ++ * we're asking two separate questions about it. ++ */ ++ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && ++ !vd->vdev_cant_write && !vd->vdev_ishole); ++} ++ ++boolean_t ++vdev_accessible(vdev_t *vd, zio_t *zio) ++{ ++ ASSERT(zio->io_vd == vd); ++ ++ if (vdev_is_dead(vd) || vd->vdev_remove_wanted) ++ return (B_FALSE); ++ ++ if (zio->io_type == ZIO_TYPE_READ) ++ return (!vd->vdev_cant_read); ++ ++ if (zio->io_type == ZIO_TYPE_WRITE) ++ return (!vd->vdev_cant_write); ++ ++ return (B_TRUE); ++} ++ ++/* ++ * Get statistics for the given vdev. ++ */ ++void ++vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) ++{ ++ vdev_t *rvd = vd->vdev_spa->spa_root_vdev; ++ int c, t; ++ ++ mutex_enter(&vd->vdev_stat_lock); ++ bcopy(&vd->vdev_stat, vs, sizeof (*vs)); ++ vs->vs_timestamp = gethrtime() - vs->vs_timestamp; ++ vs->vs_state = vd->vdev_state; ++ vs->vs_rsize = vdev_get_min_asize(vd); ++ if (vd->vdev_ops->vdev_op_leaf) ++ vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; ++ vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; ++ mutex_exit(&vd->vdev_stat_lock); ++ ++ /* ++ * If we're getting stats on the root vdev, aggregate the I/O counts ++ * over all top-level vdevs (i.e. the direct children of the root). ++ */ ++ if (vd == rvd) { ++ for (c = 0; c < rvd->vdev_children; c++) { ++ vdev_t *cvd = rvd->vdev_child[c]; ++ vdev_stat_t *cvs = &cvd->vdev_stat; ++ ++ mutex_enter(&vd->vdev_stat_lock); ++ for (t = 0; t < ZIO_TYPES; t++) { ++ vs->vs_ops[t] += cvs->vs_ops[t]; ++ vs->vs_bytes[t] += cvs->vs_bytes[t]; ++ } ++ cvs->vs_scan_removing = cvd->vdev_removing; ++ mutex_exit(&vd->vdev_stat_lock); ++ } ++ } ++} ++ ++void ++vdev_clear_stats(vdev_t *vd) ++{ ++ mutex_enter(&vd->vdev_stat_lock); ++ vd->vdev_stat.vs_space = 0; ++ vd->vdev_stat.vs_dspace = 0; ++ vd->vdev_stat.vs_alloc = 0; ++ mutex_exit(&vd->vdev_stat_lock); ++} ++ ++void ++vdev_scan_stat_init(vdev_t *vd) ++{ ++ vdev_stat_t *vs = &vd->vdev_stat; ++ int c; ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_scan_stat_init(vd->vdev_child[c]); ++ ++ mutex_enter(&vd->vdev_stat_lock); ++ vs->vs_scan_processed = 0; ++ mutex_exit(&vd->vdev_stat_lock); ++} ++ ++void ++vdev_stat_update(zio_t *zio, uint64_t psize) ++{ ++ spa_t *spa = zio->io_spa; ++ vdev_t *rvd = spa->spa_root_vdev; ++ vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; ++ vdev_t *pvd; ++ uint64_t txg = zio->io_txg; ++ vdev_stat_t *vs = &vd->vdev_stat; ++ zio_type_t type = zio->io_type; ++ int flags = zio->io_flags; ++ ++ /* ++ * If this i/o is a gang leader, it didn't do any actual work. ++ */ ++ if (zio->io_gang_tree) ++ return; ++ ++ if (zio->io_error == 0) { ++ /* ++ * If this is a root i/o, don't count it -- we've already ++ * counted the top-level vdevs, and vdev_get_stats() will ++ * aggregate them when asked. 
This reduces contention on ++ * the root vdev_stat_lock and implicitly handles blocks ++ * that compress away to holes, for which there is no i/o. ++ * (Holes never create vdev children, so all the counters ++ * remain zero, which is what we want.) ++ * ++ * Note: this only applies to successful i/o (io_error == 0) ++ * because unlike i/o counts, errors are not additive. ++ * When reading a ditto block, for example, failure of ++ * one top-level vdev does not imply a root-level error. ++ */ ++ if (vd == rvd) ++ return; ++ ++ ASSERT(vd == zio->io_vd); ++ ++ if (flags & ZIO_FLAG_IO_BYPASS) ++ return; ++ ++ mutex_enter(&vd->vdev_stat_lock); ++ ++ if (flags & ZIO_FLAG_IO_REPAIR) { ++ if (flags & ZIO_FLAG_SCAN_THREAD) { ++ dsl_scan_phys_t *scn_phys = ++ &spa->spa_dsl_pool->dp_scan->scn_phys; ++ uint64_t *processed = &scn_phys->scn_processed; ++ ++ /* XXX cleanup? */ ++ if (vd->vdev_ops->vdev_op_leaf) ++ atomic_add_64(processed, psize); ++ vs->vs_scan_processed += psize; ++ } ++ ++ if (flags & ZIO_FLAG_SELF_HEAL) ++ vs->vs_self_healed += psize; ++ } ++ ++ vs->vs_ops[type]++; ++ vs->vs_bytes[type] += psize; ++ ++ mutex_exit(&vd->vdev_stat_lock); ++ return; ++ } ++ ++ if (flags & ZIO_FLAG_SPECULATIVE) ++ return; ++ ++ /* ++ * If this is an I/O error that is going to be retried, then ignore the ++ * error. Otherwise, the user may interpret B_FAILFAST I/O errors as ++ * hard errors, when in reality they can happen for any number of ++ * innocuous reasons (bus resets, MPxIO link failure, etc). ++ */ ++ if (zio->io_error == EIO && ++ !(zio->io_flags & ZIO_FLAG_IO_RETRY)) ++ return; ++ ++ /* ++ * Intent logs writes won't propagate their error to the root ++ * I/O so don't mark these types of failures as pool-level ++ * errors. ++ */ ++ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) ++ return; ++ ++ mutex_enter(&vd->vdev_stat_lock); ++ if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { ++ if (zio->io_error == ECKSUM) ++ vs->vs_checksum_errors++; ++ else ++ vs->vs_read_errors++; ++ } ++ if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) ++ vs->vs_write_errors++; ++ mutex_exit(&vd->vdev_stat_lock); ++ ++ if (type == ZIO_TYPE_WRITE && txg != 0 && ++ (!(flags & ZIO_FLAG_IO_REPAIR) || ++ (flags & ZIO_FLAG_SCAN_THREAD) || ++ spa->spa_claiming)) { ++ /* ++ * This is either a normal write (not a repair), or it's ++ * a repair induced by the scrub thread, or it's a repair ++ * made by zil_claim() during spa_load() in the first txg. ++ * In the normal case, we commit the DTL change in the same ++ * txg as the block was born. In the scrub-induced repair ++ * case, we know that scrubs run in first-pass syncing context, ++ * so we commit the DTL change in spa_syncing_txg(spa). ++ * In the zil_claim() case, we commit in spa_first_txg(spa). ++ * ++ * We currently do not make DTL entries for failed spontaneous ++ * self-healing writes triggered by normal (non-scrubbing) ++ * reads, because we have no transactional context in which to ++ * do so -- and it's not clear that it'd be desirable anyway. 
++ */ ++ if (vd->vdev_ops->vdev_op_leaf) { ++ uint64_t commit_txg = txg; ++ if (flags & ZIO_FLAG_SCAN_THREAD) { ++ ASSERT(flags & ZIO_FLAG_IO_REPAIR); ++ ASSERT(spa_sync_pass(spa) == 1); ++ vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); ++ commit_txg = spa_syncing_txg(spa); ++ } else if (spa->spa_claiming) { ++ ASSERT(flags & ZIO_FLAG_IO_REPAIR); ++ commit_txg = spa_first_txg(spa); ++ } ++ ASSERT(commit_txg >= spa_syncing_txg(spa)); ++ if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) ++ return; ++ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) ++ vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); ++ vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); ++ } ++ if (vd != rvd) ++ vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); ++ } ++} ++ ++/* ++ * Update the in-core space usage stats for this vdev, its metaslab class, ++ * and the root vdev. ++ */ ++void ++vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, ++ int64_t space_delta) ++{ ++ int64_t dspace_delta = space_delta; ++ spa_t *spa = vd->vdev_spa; ++ vdev_t *rvd = spa->spa_root_vdev; ++ metaslab_group_t *mg = vd->vdev_mg; ++ metaslab_class_t *mc = mg ? mg->mg_class : NULL; ++ ++ ASSERT(vd == vd->vdev_top); ++ ++ /* ++ * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion ++ * factor. We must calculate this here and not at the root vdev ++ * because the root vdev's psize-to-asize is simply the max of its ++ * childrens', thus not accurate enough for us. ++ */ ++ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); ++ ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); ++ dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * ++ vd->vdev_deflate_ratio; ++ ++ mutex_enter(&vd->vdev_stat_lock); ++ vd->vdev_stat.vs_alloc += alloc_delta; ++ vd->vdev_stat.vs_space += space_delta; ++ vd->vdev_stat.vs_dspace += dspace_delta; ++ mutex_exit(&vd->vdev_stat_lock); ++ ++ if (mc == spa_normal_class(spa)) { ++ mutex_enter(&rvd->vdev_stat_lock); ++ rvd->vdev_stat.vs_alloc += alloc_delta; ++ rvd->vdev_stat.vs_space += space_delta; ++ rvd->vdev_stat.vs_dspace += dspace_delta; ++ mutex_exit(&rvd->vdev_stat_lock); ++ } ++ ++ if (mc != NULL) { ++ ASSERT(rvd == vd->vdev_parent); ++ ASSERT(vd->vdev_ms_count != 0); ++ ++ metaslab_class_space_update(mc, ++ alloc_delta, defer_delta, space_delta, dspace_delta); ++ } ++} ++ ++/* ++ * Mark a top-level vdev's config as dirty, placing it on the dirty list ++ * so that it will be written out next time the vdev configuration is synced. ++ * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. ++ */ ++void ++vdev_config_dirty(vdev_t *vd) ++{ ++ spa_t *spa = vd->vdev_spa; ++ vdev_t *rvd = spa->spa_root_vdev; ++ int c; ++ ++ ASSERT(spa_writeable(spa)); ++ ++ /* ++ * If this is an aux vdev (as with l2cache and spare devices), then we ++ * update the vdev config manually and set the sync flag. ++ */ ++ if (vd->vdev_aux != NULL) { ++ spa_aux_vdev_t *sav = vd->vdev_aux; ++ nvlist_t **aux; ++ uint_t naux; ++ ++ for (c = 0; c < sav->sav_count; c++) { ++ if (sav->sav_vdevs[c] == vd) ++ break; ++ } ++ ++ if (c == sav->sav_count) { ++ /* ++ * We're being removed. There's nothing more to do. 
++ */ ++ ASSERT(sav->sav_sync == B_TRUE); ++ return; ++ } ++ ++ sav->sav_sync = B_TRUE; ++ ++ if (nvlist_lookup_nvlist_array(sav->sav_config, ++ ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { ++ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ++ ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); ++ } ++ ++ ASSERT(c < naux); ++ ++ /* ++ * Setting the nvlist in the middle if the array is a little ++ * sketchy, but it will work. ++ */ ++ nvlist_free(aux[c]); ++ aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); ++ ++ return; ++ } ++ ++ /* ++ * The dirty list is protected by the SCL_CONFIG lock. The caller ++ * must either hold SCL_CONFIG as writer, or must be the sync thread ++ * (which holds SCL_CONFIG as reader). There's only one sync thread, ++ * so this is sufficient to ensure mutual exclusion. ++ */ ++ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || ++ (dsl_pool_sync_context(spa_get_dsl(spa)) && ++ spa_config_held(spa, SCL_CONFIG, RW_READER))); ++ ++ if (vd == rvd) { ++ for (c = 0; c < rvd->vdev_children; c++) ++ vdev_config_dirty(rvd->vdev_child[c]); ++ } else { ++ ASSERT(vd == vd->vdev_top); ++ ++ if (!list_link_active(&vd->vdev_config_dirty_node) && ++ !vd->vdev_ishole) ++ list_insert_head(&spa->spa_config_dirty_list, vd); ++ } ++} ++ ++void ++vdev_config_clean(vdev_t *vd) ++{ ++ spa_t *spa = vd->vdev_spa; ++ ++ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || ++ (dsl_pool_sync_context(spa_get_dsl(spa)) && ++ spa_config_held(spa, SCL_CONFIG, RW_READER))); ++ ++ ASSERT(list_link_active(&vd->vdev_config_dirty_node)); ++ list_remove(&spa->spa_config_dirty_list, vd); ++} ++ ++/* ++ * Mark a top-level vdev's state as dirty, so that the next pass of ++ * spa_sync() can convert this into vdev_config_dirty(). We distinguish ++ * the state changes from larger config changes because they require ++ * much less locking, and are often needed for administrative actions. ++ */ ++void ++vdev_state_dirty(vdev_t *vd) ++{ ++ spa_t *spa = vd->vdev_spa; ++ ++ ASSERT(spa_writeable(spa)); ++ ASSERT(vd == vd->vdev_top); ++ ++ /* ++ * The state list is protected by the SCL_STATE lock. The caller ++ * must either hold SCL_STATE as writer, or must be the sync thread ++ * (which holds SCL_STATE as reader). There's only one sync thread, ++ * so this is sufficient to ensure mutual exclusion. ++ */ ++ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || ++ (dsl_pool_sync_context(spa_get_dsl(spa)) && ++ spa_config_held(spa, SCL_STATE, RW_READER))); ++ ++ if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) ++ list_insert_head(&spa->spa_state_dirty_list, vd); ++} ++ ++void ++vdev_state_clean(vdev_t *vd) ++{ ++ spa_t *spa = vd->vdev_spa; ++ ++ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || ++ (dsl_pool_sync_context(spa_get_dsl(spa)) && ++ spa_config_held(spa, SCL_STATE, RW_READER))); ++ ++ ASSERT(list_link_active(&vd->vdev_state_dirty_node)); ++ list_remove(&spa->spa_state_dirty_list, vd); ++} ++ ++/* ++ * Propagate vdev state up from children to parent. ++ */ ++void ++vdev_propagate_state(vdev_t *vd) ++{ ++ spa_t *spa = vd->vdev_spa; ++ vdev_t *rvd = spa->spa_root_vdev; ++ int degraded = 0, faulted = 0; ++ int corrupted = 0; ++ vdev_t *child; ++ int c; ++ ++ if (vd->vdev_children > 0) { ++ for (c = 0; c < vd->vdev_children; c++) { ++ child = vd->vdev_child[c]; ++ ++ /* ++ * Don't factor holes into the decision. 
++ */ ++ if (child->vdev_ishole) ++ continue; ++ ++ if (!vdev_readable(child) || ++ (!vdev_writeable(child) && spa_writeable(spa))) { ++ /* ++ * Root special: if there is a top-level log ++ * device, treat the root vdev as if it were ++ * degraded. ++ */ ++ if (child->vdev_islog && vd == rvd) ++ degraded++; ++ else ++ faulted++; ++ } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { ++ degraded++; ++ } ++ ++ if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) ++ corrupted++; ++ } ++ ++ vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); ++ ++ /* ++ * Root special: if there is a top-level vdev that cannot be ++ * opened due to corrupted metadata, then propagate the root ++ * vdev's aux state as 'corrupt' rather than 'insufficient ++ * replicas'. ++ */ ++ if (corrupted && vd == rvd && ++ rvd->vdev_state == VDEV_STATE_CANT_OPEN) ++ vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_CORRUPT_DATA); ++ } ++ ++ if (vd->vdev_parent) ++ vdev_propagate_state(vd->vdev_parent); ++} ++ ++/* ++ * Set a vdev's state. If this is during an open, we don't update the parent ++ * state, because we're in the process of opening children depth-first. ++ * Otherwise, we propagate the change to the parent. ++ * ++ * If this routine places a device in a faulted state, an appropriate ereport is ++ * generated. ++ */ ++void ++vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) ++{ ++ uint64_t save_state; ++ spa_t *spa = vd->vdev_spa; ++ ++ if (state == vd->vdev_state) { ++ vd->vdev_stat.vs_aux = aux; ++ return; ++ } ++ ++ save_state = vd->vdev_state; ++ ++ vd->vdev_state = state; ++ vd->vdev_stat.vs_aux = aux; ++ ++ /* ++ * If we are setting the vdev state to anything but an open state, then ++ * always close the underlying device unless the device has requested ++ * a delayed close (i.e. we're about to remove or fault the device). ++ * Otherwise, we keep accessible but invalid devices open forever. ++ * We don't call vdev_close() itself, because that implies some extra ++ * checks (offline, etc) that we don't want here. This is limited to ++ * leaf devices, because otherwise closing the device will affect other ++ * children. ++ */ ++ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && ++ vd->vdev_ops->vdev_op_leaf) ++ vd->vdev_ops->vdev_op_close(vd); ++ ++ /* ++ * If we have brought this vdev back into service, we need ++ * to notify fmd so that it can gracefully repair any outstanding ++ * cases due to a missing device. We do this in all cases, even those ++ * that probably don't correlate to a repaired fault. This is sure to ++ * catch all cases, and we let the zfs-retire agent sort it out. If ++ * this is a transient state it's OK, as the retire agent will ++ * double-check the state of the vdev before repairing it. ++ */ ++ if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && ++ vd->vdev_prevstate != state) ++ zfs_post_state_change(spa, vd); ++ ++ if (vd->vdev_removed && ++ state == VDEV_STATE_CANT_OPEN && ++ (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { ++ /* ++ * If the previous state is set to VDEV_STATE_REMOVED, then this ++ * device was previously marked removed and someone attempted to ++ * reopen it. If this failed due to a nonexistent device, then ++ * keep the device in the REMOVED state. We also let this be if ++ * it is one of our special test online cases, which is only ++ * attempting to online the device and shouldn't generate an FMA ++ * fault. 
++ */ ++ vd->vdev_state = VDEV_STATE_REMOVED; ++ vd->vdev_stat.vs_aux = VDEV_AUX_NONE; ++ } else if (state == VDEV_STATE_REMOVED) { ++ vd->vdev_removed = B_TRUE; ++ } else if (state == VDEV_STATE_CANT_OPEN) { ++ /* ++ * If we fail to open a vdev during an import or recovery, we ++ * mark it as "not available", which signifies that it was ++ * never there to begin with. Failure to open such a device ++ * is not considered an error. ++ */ ++ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || ++ spa_load_state(spa) == SPA_LOAD_RECOVER) && ++ vd->vdev_ops->vdev_op_leaf) ++ vd->vdev_not_present = 1; ++ ++ /* ++ * Post the appropriate ereport. If the 'prevstate' field is ++ * set to something other than VDEV_STATE_UNKNOWN, it indicates ++ * that this is part of a vdev_reopen(). In this case, we don't ++ * want to post the ereport if the device was already in the ++ * CANT_OPEN state beforehand. ++ * ++ * If the 'checkremove' flag is set, then this is an attempt to ++ * online the device in response to an insertion event. If we ++ * hit this case, then we have detected an insertion event for a ++ * faulted or offline device that wasn't in the removed state. ++ * In this scenario, we don't post an ereport because we are ++ * about to replace the device, or attempt an online with ++ * vdev_forcefault, which will generate the fault for us. ++ */ ++ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && ++ !vd->vdev_not_present && !vd->vdev_checkremove && ++ vd != spa->spa_root_vdev) { ++ const char *class; ++ ++ switch (aux) { ++ case VDEV_AUX_OPEN_FAILED: ++ class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; ++ break; ++ case VDEV_AUX_CORRUPT_DATA: ++ class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; ++ break; ++ case VDEV_AUX_NO_REPLICAS: ++ class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; ++ break; ++ case VDEV_AUX_BAD_GUID_SUM: ++ class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; ++ break; ++ case VDEV_AUX_TOO_SMALL: ++ class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; ++ break; ++ case VDEV_AUX_BAD_LABEL: ++ class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; ++ break; ++ default: ++ class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; ++ } ++ ++ zfs_ereport_post(class, spa, vd, NULL, save_state, 0); ++ } ++ ++ /* Erase any notion of persistent removed state */ ++ vd->vdev_removed = B_FALSE; ++ } else { ++ vd->vdev_removed = B_FALSE; ++ } ++ ++ if (!isopen && vd->vdev_parent) ++ vdev_propagate_state(vd->vdev_parent); ++} ++ ++/* ++ * Check the vdev configuration to ensure that it's capable of supporting ++ * a root pool. ++ */ ++boolean_t ++vdev_is_bootable(vdev_t *vd) ++{ ++#if defined(__sun__) || defined(__sun) ++ /* ++ * Currently, we do not support RAID-Z or partial configuration. ++ * In addition, only a single top-level vdev is allowed and none of the ++ * leaves can be wholedisks. ++ */ ++ int c; ++ ++ if (!vd->vdev_ops->vdev_op_leaf) { ++ char *vdev_type = vd->vdev_ops->vdev_op_type; ++ ++ if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && ++ vd->vdev_children > 1) { ++ return (B_FALSE); ++ } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || ++ strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { ++ return (B_FALSE); ++ } ++ } else if (vd->vdev_wholedisk == 1) { ++ return (B_FALSE); ++ } ++ ++ for (c = 0; c < vd->vdev_children; c++) { ++ if (!vdev_is_bootable(vd->vdev_child[c])) ++ return (B_FALSE); ++ } ++#endif /* __sun__ || __sun */ ++ return (B_TRUE); ++} ++ ++/* ++ * Load the state from the original vdev tree (ovd) which ++ * we've retrieved from the MOS config object. 
If the original ++ * vdev was offline or faulted then we transfer that state to the ++ * device in the current vdev tree (nvd). ++ */ ++void ++vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) ++{ ++ int c; ++ ++ ASSERT(nvd->vdev_top->vdev_islog); ++ ASSERT(spa_config_held(nvd->vdev_spa, ++ SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ++ ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); ++ ++ for (c = 0; c < nvd->vdev_children; c++) ++ vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); ++ ++ if (nvd->vdev_ops->vdev_op_leaf) { ++ /* ++ * Restore the persistent vdev state ++ */ ++ nvd->vdev_offline = ovd->vdev_offline; ++ nvd->vdev_faulted = ovd->vdev_faulted; ++ nvd->vdev_degraded = ovd->vdev_degraded; ++ nvd->vdev_removed = ovd->vdev_removed; ++ } ++} ++ ++/* ++ * Determine if a log device has valid content. If the vdev was ++ * removed or faulted in the MOS config then we know that ++ * the content on the log device has already been written to the pool. ++ */ ++boolean_t ++vdev_log_state_valid(vdev_t *vd) ++{ ++ int c; ++ ++ if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && ++ !vd->vdev_removed) ++ return (B_TRUE); ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ if (vdev_log_state_valid(vd->vdev_child[c])) ++ return (B_TRUE); ++ ++ return (B_FALSE); ++} ++ ++/* ++ * Expand a vdev if possible. ++ */ ++void ++vdev_expand(vdev_t *vd, uint64_t txg) ++{ ++ ASSERT(vd->vdev_top == vd); ++ ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ++ ++ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { ++ VERIFY(vdev_metaslab_init(vd, txg) == 0); ++ vdev_config_dirty(vd); ++ } ++} ++ ++/* ++ * Split a vdev. ++ */ ++void ++vdev_split(vdev_t *vd) ++{ ++ vdev_t *cvd, *pvd = vd->vdev_parent; ++ ++ vdev_remove_child(pvd, vd); ++ vdev_compact_children(pvd); ++ ++ cvd = pvd->vdev_child[0]; ++ if (pvd->vdev_children == 1) { ++ vdev_remove_parent(cvd); ++ cvd->vdev_splitting = B_TRUE; ++ } ++ vdev_propagate_state(cvd); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(vdev_fault); ++EXPORT_SYMBOL(vdev_degrade); ++EXPORT_SYMBOL(vdev_online); ++EXPORT_SYMBOL(vdev_offline); ++EXPORT_SYMBOL(vdev_clear); ++ ++module_param(zfs_scrub_limit, int, 0644); ++MODULE_PARM_DESC(zfs_scrub_limit, "Max scrub/resilver I/O per leaf vdev"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/vdev_cache.c linux-3.2.33-go/fs/zfs/zfs/vdev_cache.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/vdev_cache.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/vdev_cache.c 2012-11-16 23:25:34.347039358 +0100 +@@ -0,0 +1,436 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Virtual device read-ahead caching. ++ * ++ * This file implements a simple LRU read-ahead cache. When the DMU reads ++ * a given block, it will often want other, nearby blocks soon thereafter. ++ * We take advantage of this by reading a larger disk region and caching ++ * the result. In the best case, this can turn 128 back-to-back 512-byte ++ * reads into a single 64k read followed by 127 cache hits; this reduces ++ * latency dramatically. In the worst case, it can turn an isolated 512-byte ++ * read into a 64k read, which doesn't affect latency all that much but is ++ * terribly wasteful of bandwidth. A more intelligent version of the cache ++ * could keep track of access patterns and not do read-ahead unless it sees ++ * at least two temporally close I/Os to the same region. Currently, only ++ * metadata I/O is inflated. A futher enhancement could take advantage of ++ * more semantic information about the I/O. And it could use something ++ * faster than an AVL tree; that was chosen solely for convenience. ++ * ++ * There are five cache operations: allocate, fill, read, write, evict. ++ * ++ * (1) Allocate. This reserves a cache entry for the specified region. ++ * We separate the allocate and fill operations so that multiple threads ++ * don't generate I/O for the same cache miss. ++ * ++ * (2) Fill. When the I/O for a cache miss completes, the fill routine ++ * places the data in the previously allocated cache entry. ++ * ++ * (3) Read. Read data from the cache. ++ * ++ * (4) Write. Update cache contents after write completion. ++ * ++ * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry ++ * if the total cache size exceeds zfs_vdev_cache_size. ++ */ ++ ++/* ++ * These tunables are for performance analysis. ++ */ ++/* ++ * All i/os smaller than zfs_vdev_cache_max will be turned into ++ * 1<ve_offset < ve2->ve_offset) ++ return (-1); ++ if (ve1->ve_offset > ve2->ve_offset) ++ return (1); ++ return (0); ++} ++ ++static int ++vdev_cache_lastused_compare(const void *a1, const void *a2) ++{ ++ const vdev_cache_entry_t *ve1 = a1; ++ const vdev_cache_entry_t *ve2 = a2; ++ ++ if (ve1->ve_lastused < ve2->ve_lastused) ++ return (-1); ++ if (ve1->ve_lastused > ve2->ve_lastused) ++ return (1); ++ ++ /* ++ * Among equally old entries, sort by offset to ensure uniqueness. ++ */ ++ return (vdev_cache_offset_compare(a1, a2)); ++} ++ ++/* ++ * Evict the specified entry from the cache. ++ */ ++static void ++vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) ++{ ++ ASSERT(MUTEX_HELD(&vc->vc_lock)); ++ ASSERT(ve->ve_fill_io == NULL); ++ ASSERT(ve->ve_data != NULL); ++ ++ avl_remove(&vc->vc_lastused_tree, ve); ++ avl_remove(&vc->vc_offset_tree, ve); ++ zio_buf_free(ve->ve_data, VCBS); ++ kmem_free(ve, sizeof (vdev_cache_entry_t)); ++} ++ ++/* ++ * Allocate an entry in the cache. At the point we don't have the data, ++ * we're just creating a placeholder so that multiple threads don't all ++ * go off and read the same blocks. ++ */ ++static vdev_cache_entry_t * ++vdev_cache_allocate(zio_t *zio) ++{ ++ vdev_cache_t *vc = &zio->io_vd->vdev_cache; ++ uint64_t offset = P2ALIGN(zio->io_offset, VCBS); ++ vdev_cache_entry_t *ve; ++ ++ ASSERT(MUTEX_HELD(&vc->vc_lock)); ++ ++ if (zfs_vdev_cache_size == 0) ++ return (NULL); ++ ++ /* ++ * If adding a new entry would exceed the cache size, ++ * evict the oldest entry (LRU). 
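++ * Illustrative example (not part of the upstream file; assumes the
++ * default 64 KiB cache line, i.e. a cache block shift of 16): a
++ * 512-byte read at offset 0x12345200 is served from the line whose
++ * ve_offset is P2ALIGN(0x12345200, 0x10000) = 0x12340000, and the data
++ * is copied out of ve_data at phase P2PHASE(0x12345200, 0x10000) =
++ * 0x5200, which is exactly how vdev_cache_allocate() and
++ * vdev_cache_hit() compute it below.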
++ */ ++ if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > ++ zfs_vdev_cache_size) { ++ ve = avl_first(&vc->vc_lastused_tree); ++ if (ve->ve_fill_io != NULL) ++ return (NULL); ++ ASSERT(ve->ve_hits != 0); ++ vdev_cache_evict(vc, ve); ++ } ++ ++ ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_PUSHPAGE); ++ ve->ve_offset = offset; ++ ve->ve_lastused = ddi_get_lbolt(); ++ ve->ve_data = zio_buf_alloc(VCBS); ++ ++ avl_add(&vc->vc_offset_tree, ve); ++ avl_add(&vc->vc_lastused_tree, ve); ++ ++ return (ve); ++} ++ ++static void ++vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) ++{ ++ uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); ++ ++ ASSERT(MUTEX_HELD(&vc->vc_lock)); ++ ASSERT(ve->ve_fill_io == NULL); ++ ++ if (ve->ve_lastused != ddi_get_lbolt()) { ++ avl_remove(&vc->vc_lastused_tree, ve); ++ ve->ve_lastused = ddi_get_lbolt(); ++ avl_add(&vc->vc_lastused_tree, ve); ++ } ++ ++ ve->ve_hits++; ++ bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); ++} ++ ++/* ++ * Fill a previously allocated cache entry with data. ++ */ ++static void ++vdev_cache_fill(zio_t *fio) ++{ ++ vdev_t *vd = fio->io_vd; ++ vdev_cache_t *vc = &vd->vdev_cache; ++ vdev_cache_entry_t *ve = fio->io_private; ++ zio_t *pio; ++ ++ ASSERT(fio->io_size == VCBS); ++ ++ /* ++ * Add data to the cache. ++ */ ++ mutex_enter(&vc->vc_lock); ++ ++ ASSERT(ve->ve_fill_io == fio); ++ ASSERT(ve->ve_offset == fio->io_offset); ++ ASSERT(ve->ve_data == fio->io_data); ++ ++ ve->ve_fill_io = NULL; ++ ++ /* ++ * Even if this cache line was invalidated by a missed write update, ++ * any reads that were queued up before the missed update are still ++ * valid, so we can satisfy them from this line before we evict it. ++ */ ++ while ((pio = zio_walk_parents(fio)) != NULL) ++ vdev_cache_hit(vc, ve, pio); ++ ++ if (fio->io_error || ve->ve_missed_update) ++ vdev_cache_evict(vc, ve); ++ ++ mutex_exit(&vc->vc_lock); ++} ++ ++/* ++ * Read data from the cache. Returns 0 on cache hit, errno on a miss. ++ */ ++int ++vdev_cache_read(zio_t *zio) ++{ ++ vdev_cache_t *vc = &zio->io_vd->vdev_cache; ++ vdev_cache_entry_t *ve, *ve_search; ++ uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); ++ ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);) ++ zio_t *fio; ++ ++ ASSERT(zio->io_type == ZIO_TYPE_READ); ++ ++ if (zio->io_flags & ZIO_FLAG_DONT_CACHE) ++ return (EINVAL); ++ ++ if (zio->io_size > zfs_vdev_cache_max) ++ return (EOVERFLOW); ++ ++ /* ++ * If the I/O straddles two or more cache blocks, don't cache it. 
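++ * For example (illustrative, assuming the default 64 KiB cache line):
++ * a 16 KiB read starting at offset 60 KiB covers 60K-76K and therefore
++ * crosses the 64 KiB line boundary, so P2BOUNDARY() below is non-zero
++ * and the read bypasses the cache with EXDEV.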
++ */ ++ if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) ++ return (EXDEV); ++ ++ ASSERT(cache_phase + zio->io_size <= VCBS); ++ ++ mutex_enter(&vc->vc_lock); ++ ++ ve_search = kmem_alloc(sizeof(vdev_cache_entry_t), KM_PUSHPAGE); ++ ve_search->ve_offset = cache_offset; ++ ve = avl_find(&vc->vc_offset_tree, ve_search, NULL); ++ kmem_free(ve_search, sizeof(vdev_cache_entry_t)); ++ ++ if (ve != NULL) { ++ if (ve->ve_missed_update) { ++ mutex_exit(&vc->vc_lock); ++ return (ESTALE); ++ } ++ ++ if ((fio = ve->ve_fill_io) != NULL) { ++ zio_vdev_io_bypass(zio); ++ zio_add_child(zio, fio); ++ mutex_exit(&vc->vc_lock); ++ VDCSTAT_BUMP(vdc_stat_delegations); ++ return (0); ++ } ++ ++ vdev_cache_hit(vc, ve, zio); ++ zio_vdev_io_bypass(zio); ++ ++ mutex_exit(&vc->vc_lock); ++ VDCSTAT_BUMP(vdc_stat_hits); ++ return (0); ++ } ++ ++ ve = vdev_cache_allocate(zio); ++ ++ if (ve == NULL) { ++ mutex_exit(&vc->vc_lock); ++ return (ENOMEM); ++ } ++ ++ fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, ++ ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, ++ ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ++ ++ ve->ve_fill_io = fio; ++ zio_vdev_io_bypass(zio); ++ zio_add_child(zio, fio); ++ ++ mutex_exit(&vc->vc_lock); ++ zio_nowait(fio); ++ VDCSTAT_BUMP(vdc_stat_misses); ++ ++ return (0); ++} ++ ++/* ++ * Update cache contents upon write completion. ++ */ ++void ++vdev_cache_write(zio_t *zio) ++{ ++ vdev_cache_t *vc = &zio->io_vd->vdev_cache; ++ vdev_cache_entry_t *ve, ve_search; ++ uint64_t io_start = zio->io_offset; ++ uint64_t io_end = io_start + zio->io_size; ++ uint64_t min_offset = P2ALIGN(io_start, VCBS); ++ uint64_t max_offset = P2ROUNDUP(io_end, VCBS); ++ avl_index_t where; ++ ++ ASSERT(zio->io_type == ZIO_TYPE_WRITE); ++ ++ mutex_enter(&vc->vc_lock); ++ ++ ve_search.ve_offset = min_offset; ++ ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); ++ ++ if (ve == NULL) ++ ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); ++ ++ while (ve != NULL && ve->ve_offset < max_offset) { ++ uint64_t start = MAX(ve->ve_offset, io_start); ++ uint64_t end = MIN(ve->ve_offset + VCBS, io_end); ++ ++ if (ve->ve_fill_io != NULL) { ++ ve->ve_missed_update = 1; ++ } else { ++ bcopy((char *)zio->io_data + start - io_start, ++ ve->ve_data + start - ve->ve_offset, end - start); ++ } ++ ve = AVL_NEXT(&vc->vc_offset_tree, ve); ++ } ++ mutex_exit(&vc->vc_lock); ++} ++ ++void ++vdev_cache_purge(vdev_t *vd) ++{ ++ vdev_cache_t *vc = &vd->vdev_cache; ++ vdev_cache_entry_t *ve; ++ ++ mutex_enter(&vc->vc_lock); ++ while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) ++ vdev_cache_evict(vc, ve); ++ mutex_exit(&vc->vc_lock); ++} ++ ++void ++vdev_cache_init(vdev_t *vd) ++{ ++ vdev_cache_t *vc = &vd->vdev_cache; ++ ++ mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, ++ sizeof (vdev_cache_entry_t), ++ offsetof(struct vdev_cache_entry, ve_offset_node)); ++ ++ avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, ++ sizeof (vdev_cache_entry_t), ++ offsetof(struct vdev_cache_entry, ve_lastused_node)); ++} ++ ++void ++vdev_cache_fini(vdev_t *vd) ++{ ++ vdev_cache_t *vc = &vd->vdev_cache; ++ ++ vdev_cache_purge(vd); ++ ++ avl_destroy(&vc->vc_offset_tree); ++ avl_destroy(&vc->vc_lastused_tree); ++ ++ mutex_destroy(&vc->vc_lock); ++} ++ ++void ++vdev_cache_stat_init(void) ++{ ++ vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", ++ KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), ++ KSTAT_FLAG_VIRTUAL); ++ if (vdc_ksp != 
NULL) { ++ vdc_ksp->ks_data = &vdc_stats; ++ kstat_install(vdc_ksp); ++ } ++} ++ ++void ++vdev_cache_stat_fini(void) ++{ ++ if (vdc_ksp != NULL) { ++ kstat_delete(vdc_ksp); ++ vdc_ksp = NULL; ++ } ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++module_param(zfs_vdev_cache_max, int, 0644); ++MODULE_PARM_DESC(zfs_vdev_cache_max, "Inflate reads small than max"); ++ ++module_param(zfs_vdev_cache_size, int, 0444); ++MODULE_PARM_DESC(zfs_vdev_cache_size, "Total size of the per-disk cache"); ++ ++module_param(zfs_vdev_cache_bshift, int, 0644); ++MODULE_PARM_DESC(zfs_vdev_cache_bshift, "Shift size to inflate reads too"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/vdev_disk.c linux-3.2.33-go/fs/zfs/zfs/vdev_disk.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/vdev_disk.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/vdev_disk.c 2012-11-16 23:25:34.353039289 +0100 +@@ -0,0 +1,841 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Rewritten for Linux by Brian Behlendorf . ++ * LLNL-CODE-403049. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++char *zfs_vdev_scheduler = VDEV_SCHEDULER; ++ ++/* ++ * Virtual device vector for disks. 
++ */ ++typedef struct dio_request { ++ struct completion dr_comp; /* Completion for sync IO */ ++ atomic_t dr_ref; /* References */ ++ zio_t *dr_zio; /* Parent ZIO */ ++ int dr_rw; /* Read/Write */ ++ int dr_error; /* Bio error */ ++ int dr_bio_count; /* Count of bio's */ ++ struct bio *dr_bio[0]; /* Attached bio's */ ++} dio_request_t; ++ ++ ++#ifdef HAVE_OPEN_BDEV_EXCLUSIVE ++static fmode_t ++vdev_bdev_mode(int smode) ++{ ++ fmode_t mode = 0; ++ ++ ASSERT3S(smode & (FREAD | FWRITE), !=, 0); ++ ++ if (smode & FREAD) ++ mode |= FMODE_READ; ++ ++ if (smode & FWRITE) ++ mode |= FMODE_WRITE; ++ ++ return mode; ++} ++#else ++static int ++vdev_bdev_mode(int smode) ++{ ++ int mode = 0; ++ ++ ASSERT3S(smode & (FREAD | FWRITE), !=, 0); ++ ++ if ((smode & FREAD) && !(smode & FWRITE)) ++ mode = MS_RDONLY; ++ ++ return mode; ++} ++#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */ ++ ++static uint64_t ++bdev_capacity(struct block_device *bdev) ++{ ++ struct hd_struct *part = bdev->bd_part; ++ ++ /* The partition capacity referenced by the block device */ ++ if (part) ++ return (part->nr_sects << 9); ++ ++ /* Otherwise assume the full device capacity */ ++ return (get_capacity(bdev->bd_disk) << 9); ++} ++ ++static void ++vdev_disk_error(zio_t *zio) ++{ ++#ifdef ZFS_DEBUG ++ printk("ZFS: zio error=%d type=%d offset=%llu size=%llu " ++ "flags=%x delay=%llu\n", zio->io_error, zio->io_type, ++ (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, ++ zio->io_flags, (u_longlong_t)zio->io_delay); ++#endif ++} ++ ++/* ++ * Use the Linux 'noop' elevator for zfs managed block devices. This ++ * strikes the ideal balance by allowing the zfs elevator to do all ++ * request ordering and prioritization. While allowing the Linux ++ * elevator to do the maximum front/back merging allowed by the ++ * physical device. This yields the largest possible requests for ++ * the device with the lowest total overhead. ++ */ ++static int ++vdev_elevator_switch(vdev_t *v, char *elevator) ++{ ++ vdev_disk_t *vd = v->vdev_tsd; ++ struct block_device *bdev = vd->vd_bdev; ++ struct request_queue *q = bdev_get_queue(bdev); ++ char *device = bdev->bd_disk->disk_name; ++ int error; ++ ++ /* Skip devices which are not whole disks (partitions) */ ++ if (!v->vdev_wholedisk) ++ return (0); ++ ++ /* Skip devices without schedulers (loop, ram, dm, etc) */ ++ if (!q->elevator || !blk_queue_stackable(q)) ++ return (0); ++ ++ /* Leave existing scheduler when set to "none" */ ++ if (!strncmp(elevator, "none", 4) && (strlen(elevator) == 4)) ++ return (0); ++ ++#ifdef HAVE_ELEVATOR_CHANGE ++ error = elevator_change(q, elevator); ++#else ++ /* For pre-2.6.36 kernels elevator_change() is not available. ++ * Therefore we fall back to using a usermodehelper to echo the ++ * elevator into sysfs; This requires /bin/echo and sysfs to be ++ * mounted which may not be true early in the boot process. ++ */ ++# define SET_SCHEDULER_CMD \ ++ "exec 0/sys/block/%s/queue/scheduler " \ ++ " 2>/dev/null; " \ ++ "echo %s" ++ ++ { ++ char *argv[] = { "/bin/sh", "-c", NULL, NULL }; ++ char *envp[] = { NULL }; ++ ++ argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator); ++ error = call_usermodehelper(argv[0], argv, envp, 1); ++ strfree(argv[2]); ++ } ++#endif /* HAVE_ELEVATOR_CHANGE */ ++ if (error) ++ printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n", ++ elevator, v->vdev_path, device, error); ++ ++ return (error); ++} ++ ++/* ++ * Expanding a whole disk vdev involves invoking BLKRRPART on the ++ * whole disk device. 
This poses a problem, because BLKRRPART will ++ * return EBUSY if one of the disk's partitions is open. That's why ++ * we have to do it here, just before opening the data partition. ++ * Unfortunately, BLKRRPART works by dropping all partitions and ++ * recreating them, which means that for a short time window, all ++ * /dev/sdxN device files disappear (until udev recreates them). ++ * This means two things: ++ * - When we open the data partition just after a BLKRRPART, we ++ * can't do it using the normal device file path because of the ++ * obvious race condition with udev. Instead, we use reliable ++ * kernel APIs to get a handle to the new partition device from ++ * the whole disk device. ++ * - Because vdev_disk_open() initially needs to find the device ++ * using its path, multiple vdev_disk_open() invocations in ++ * short succession on the same disk with BLKRRPARTs in the ++ * middle have a high probability of failure (because of the ++ * race condition with udev). A typical situation where this ++ * might happen is when the zpool userspace tool does a ++ * TRYIMPORT immediately followed by an IMPORT. For this ++ * reason, we only invoke BLKRRPART in the module when strictly ++ * necessary (zpool online -e case), and rely on userspace to ++ * do it when possible. ++ */ ++static struct block_device * ++vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd) ++{ ++#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) ++ struct block_device *bdev, *result = ERR_PTR(-ENXIO); ++ struct gendisk *disk; ++ int error, partno; ++ ++ bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), vd); ++ if (IS_ERR(bdev)) ++ return bdev; ++ ++ disk = get_gendisk(bdev->bd_dev, &partno); ++ vdev_bdev_close(bdev, vdev_bdev_mode(mode)); ++ ++ if (disk) { ++ bdev = bdget(disk_devt(disk)); ++ if (bdev) { ++ error = blkdev_get(bdev, vdev_bdev_mode(mode), vd); ++ if (error == 0) ++ error = ioctl_by_bdev(bdev, BLKRRPART, 0); ++ vdev_bdev_close(bdev, vdev_bdev_mode(mode)); ++ } ++ ++ bdev = bdget_disk(disk, partno); ++ if (bdev) { ++ error = blkdev_get(bdev, ++ vdev_bdev_mode(mode) | FMODE_EXCL, vd); ++ if (error == 0) ++ result = bdev; ++ } ++ put_disk(disk); ++ } ++ ++ return result; ++#else ++ return ERR_PTR(-EOPNOTSUPP); ++#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */ ++} ++ ++static int ++vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, ++ uint64_t *ashift) ++{ ++ struct block_device *bdev = ERR_PTR(-ENXIO); ++ vdev_disk_t *vd; ++ int mode, block_size; ++ ++ /* Must have a pathname and it must be absolute. */ ++ if (v->vdev_path == NULL || v->vdev_path[0] != '/') { ++ v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; ++ return EINVAL; ++ } ++ ++ vd = kmem_zalloc(sizeof(vdev_disk_t), KM_PUSHPAGE); ++ if (vd == NULL) ++ return ENOMEM; ++ ++ /* ++ * Devices are always opened by the path provided at configuration ++ * time. This means that if the provided path is a udev by-id path ++ * then drives may be recabled without an issue. If the provided ++ * path is a udev by-path path then the physical location information ++ * will be preserved. This can be critical for more complicated ++ * configurations where drives are located in specific physical ++ * locations to maximize the systems tolerence to component failure. ++ * Alternately you can provide your own udev rule to flexibly map ++ * the drives as you see fit. It is not advised that you use the ++ * /dev/[hd]d devices which may be reorder due to probing order. 
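++ * As an illustration (hypothetical paths, not taken from this patch):
++ * a by-id name such as /dev/disk/by-id/ata-SAMPLE_DISK_SERIAL-part1
++ * keeps following the same drive if it is recabled, whereas a by-path
++ * name such as /dev/disk/by-path/pci-0000:00:1f.2-ata-1 identifies the
++ * physical slot instead.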
++ * Devices in the wrong locations will be detected by the higher ++ * level vdev validation. ++ */ ++ mode = spa_mode(v->vdev_spa); ++ if (v->vdev_wholedisk && v->vdev_expanding) ++ bdev = vdev_disk_rrpart(v->vdev_path, mode, vd); ++ if (IS_ERR(bdev)) ++ bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), vd); ++ if (IS_ERR(bdev)) { ++ kmem_free(vd, sizeof(vdev_disk_t)); ++ return -PTR_ERR(bdev); ++ } ++ ++ v->vdev_tsd = vd; ++ vd->vd_bdev = bdev; ++ block_size = vdev_bdev_block_size(bdev); ++ ++ /* We think the wholedisk property should always be set when this ++ * function is called. ASSERT here so if any legitimate cases exist ++ * where it's not set, we'll find them during debugging. If we never ++ * hit the ASSERT, this and the following conditional statement can be ++ * removed. */ ++ ASSERT3S(v->vdev_wholedisk, !=, -1ULL); ++ ++ /* The wholedisk property was initialized to -1 in vdev_alloc() if it ++ * was unspecified. In that case, check if this is a whole device. ++ * When bdev->bd_contains == bdev we have a whole device and not simply ++ * a partition. */ ++ if (v->vdev_wholedisk == -1ULL) ++ v->vdev_wholedisk = (bdev->bd_contains == bdev); ++ ++ /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ ++ v->vdev_nowritecache = B_FALSE; ++ ++ /* Physical volume size in bytes */ ++ *psize = bdev_capacity(bdev); ++ ++ /* TODO: report possible expansion size */ ++ *max_psize = *psize; ++ ++ /* Based on the minimum sector size set the block size */ ++ *ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1; ++ ++ /* Try to set the io scheduler elevator algorithm */ ++ (void) vdev_elevator_switch(v, zfs_vdev_scheduler); ++ ++ return 0; ++} ++ ++static void ++vdev_disk_close(vdev_t *v) ++{ ++ vdev_disk_t *vd = v->vdev_tsd; ++ ++ if (vd == NULL) ++ return; ++ ++ if (vd->vd_bdev != NULL) ++ vdev_bdev_close(vd->vd_bdev, ++ vdev_bdev_mode(spa_mode(v->vdev_spa))); ++ ++ kmem_free(vd, sizeof(vdev_disk_t)); ++ v->vdev_tsd = NULL; ++} ++ ++static dio_request_t * ++vdev_disk_dio_alloc(int bio_count) ++{ ++ dio_request_t *dr; ++ int i; ++ ++ dr = kmem_zalloc(sizeof(dio_request_t) + ++ sizeof(struct bio *) * bio_count, KM_PUSHPAGE); ++ if (dr) { ++ init_completion(&dr->dr_comp); ++ atomic_set(&dr->dr_ref, 0); ++ dr->dr_bio_count = bio_count; ++ dr->dr_error = 0; ++ ++ for (i = 0; i < dr->dr_bio_count; i++) ++ dr->dr_bio[i] = NULL; ++ } ++ ++ return dr; ++} ++ ++static void ++vdev_disk_dio_free(dio_request_t *dr) ++{ ++ int i; ++ ++ for (i = 0; i < dr->dr_bio_count; i++) ++ if (dr->dr_bio[i]) ++ bio_put(dr->dr_bio[i]); ++ ++ kmem_free(dr, sizeof(dio_request_t) + ++ sizeof(struct bio *) * dr->dr_bio_count); ++} ++ ++static int ++vdev_disk_dio_is_sync(dio_request_t *dr) ++{ ++#ifdef HAVE_BIO_RW_SYNC ++ /* BIO_RW_SYNC preferred interface from 2.6.12-2.6.29 */ ++ return (dr->dr_rw & (1 << BIO_RW_SYNC)); ++#else ++# ifdef HAVE_BIO_RW_SYNCIO ++ /* BIO_RW_SYNCIO preferred interface from 2.6.30-2.6.35 */ ++ return (dr->dr_rw & (1 << BIO_RW_SYNCIO)); ++# else ++# ifdef HAVE_REQ_SYNC ++ /* REQ_SYNC preferred interface from 2.6.36-2.6.xx */ ++ return (dr->dr_rw & REQ_SYNC); ++# else ++# error "Unable to determine bio sync flag" ++# endif /* HAVE_REQ_SYNC */ ++# endif /* HAVE_BIO_RW_SYNC */ ++#endif /* HAVE_BIO_RW_SYNCIO */ ++} ++ ++static void ++vdev_disk_dio_get(dio_request_t *dr) ++{ ++ atomic_inc(&dr->dr_ref); ++} ++ ++static int ++vdev_disk_dio_put(dio_request_t *dr) ++{ ++ int rc = atomic_dec_return(&dr->dr_ref); ++ ++ /* ++ * Free the dio_request when the last reference is dropped 
and ++ * ensure zio_interpret is called only once with the correct zio ++ */ ++ if (rc == 0) { ++ zio_t *zio = dr->dr_zio; ++ int error = dr->dr_error; ++ ++ vdev_disk_dio_free(dr); ++ ++ if (zio) { ++ zio->io_delay = jiffies_to_msecs( ++ jiffies_64 - zio->io_delay); ++ zio->io_error = error; ++ ASSERT3S(zio->io_error, >=, 0); ++ if (zio->io_error) ++ vdev_disk_error(zio); ++ zio_interrupt(zio); ++ } ++ } ++ ++ return rc; ++} ++ ++BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error) ++{ ++ dio_request_t *dr = bio->bi_private; ++ int rc; ++ ++ /* Fatal error but print some useful debugging before asserting */ ++ if (dr == NULL) ++ PANIC("dr == NULL, bio->bi_private == NULL\n" ++ "bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d\n" ++ "bi_idx: %d, bi_size: %d, bi_end_io: %p, bi_cnt: %d\n", ++ bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt, ++ bio->bi_idx, bio->bi_size, bio->bi_end_io, ++ atomic_read(&bio->bi_cnt)); ++ ++#ifndef HAVE_2ARGS_BIO_END_IO_T ++ if (bio->bi_size) ++ return 1; ++#endif /* HAVE_2ARGS_BIO_END_IO_T */ ++ ++ if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags)) ++ error = -EIO; ++ ++ if (dr->dr_error == 0) ++ dr->dr_error = -error; ++ ++ /* Drop reference aquired by __vdev_disk_physio */ ++ rc = vdev_disk_dio_put(dr); ++ ++ /* Wake up synchronous waiter this is the last outstanding bio */ ++ if ((rc == 1) && vdev_disk_dio_is_sync(dr)) ++ complete(&dr->dr_comp); ++ ++ BIO_END_IO_RETURN(0); ++} ++ ++static inline unsigned long ++bio_nr_pages(void *bio_ptr, unsigned int bio_size) ++{ ++ return ((((unsigned long)bio_ptr + bio_size + PAGE_SIZE - 1) >> ++ PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT)); ++} ++ ++static unsigned int ++bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) ++{ ++ unsigned int offset, size, i; ++ struct page *page; ++ ++ offset = offset_in_page(bio_ptr); ++ for (i = 0; i < bio->bi_max_vecs; i++) { ++ size = PAGE_SIZE - offset; ++ ++ if (bio_size <= 0) ++ break; ++ ++ if (size > bio_size) ++ size = bio_size; ++ ++ if (kmem_virt(bio_ptr)) ++ page = vmalloc_to_page(bio_ptr); ++ else ++ page = virt_to_page(bio_ptr); ++ ++ if (bio_add_page(bio, page, size, offset) != size) ++ break; ++ ++ bio_ptr += size; ++ bio_size -= size; ++ offset = 0; ++ } ++ ++ return bio_size; ++} ++ ++static int ++__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, ++ size_t kbuf_size, uint64_t kbuf_offset, int flags) ++{ ++ dio_request_t *dr; ++ caddr_t bio_ptr; ++ uint64_t bio_offset; ++ int bio_size, bio_count = 16; ++ int i = 0, error = 0; ++ ++ ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size); ++ ++retry: ++ dr = vdev_disk_dio_alloc(bio_count); ++ if (dr == NULL) ++ return ENOMEM; ++ ++ if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) ++ bio_set_flags_failfast(bdev, &flags); ++ ++ dr->dr_zio = zio; ++ dr->dr_rw = flags; ++ ++ /* ++ * When the IO size exceeds the maximum bio size for the request ++ * queue we are forced to break the IO in multiple bio's and wait ++ * for them all to complete. Ideally, all pool users will set ++ * their volume block size to match the maximum request size and ++ * the common case will be one bio per vdev IO request. ++ */ ++ bio_ptr = kbuf_ptr; ++ bio_offset = kbuf_offset; ++ bio_size = kbuf_size; ++ for (i = 0; i <= dr->dr_bio_count; i++) { ++ ++ /* Finished constructing bio's for given buffer */ ++ if (bio_size <= 0) ++ break; ++ ++ /* ++ * By default only 'bio_count' bio's per dio are allowed. 
++ * However, if we find ourselves in a situation where more ++ * are needed we allocate a larger dio and warn the user. ++ */ ++ if (dr->dr_bio_count == i) { ++ vdev_disk_dio_free(dr); ++ bio_count *= 2; ++ goto retry; ++ } ++ ++ dr->dr_bio[i] = bio_alloc(GFP_NOIO, ++ bio_nr_pages(bio_ptr, bio_size)); ++ if (dr->dr_bio[i] == NULL) { ++ vdev_disk_dio_free(dr); ++ return ENOMEM; ++ } ++ ++ /* Matching put called by vdev_disk_physio_completion */ ++ vdev_disk_dio_get(dr); ++ ++ dr->dr_bio[i]->bi_bdev = bdev; ++ dr->dr_bio[i]->bi_sector = bio_offset >> 9; ++ dr->dr_bio[i]->bi_rw = dr->dr_rw; ++ dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; ++ dr->dr_bio[i]->bi_private = dr; ++ ++ /* Remaining size is returned to become the new size */ ++ bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size); ++ ++ /* Advance in buffer and construct another bio if needed */ ++ bio_ptr += dr->dr_bio[i]->bi_size; ++ bio_offset += dr->dr_bio[i]->bi_size; ++ } ++ ++ /* Extra reference to protect dio_request during submit_bio */ ++ vdev_disk_dio_get(dr); ++ if (zio) ++ zio->io_delay = jiffies_64; ++ ++ /* Submit all bio's associated with this dio */ ++ for (i = 0; i < dr->dr_bio_count; i++) ++ if (dr->dr_bio[i]) ++ submit_bio(dr->dr_rw, dr->dr_bio[i]); ++ ++ /* ++ * On synchronous blocking requests we wait for all bio the completion ++ * callbacks to run. We will be woken when the last callback runs ++ * for this dio. We are responsible for putting the last dio_request ++ * reference will in turn put back the last bio references. The ++ * only synchronous consumer is vdev_disk_read_rootlabel() all other ++ * IO originating from vdev_disk_io_start() is asynchronous. ++ */ ++ if (vdev_disk_dio_is_sync(dr)) { ++ wait_for_completion(&dr->dr_comp); ++ error = dr->dr_error; ++ ASSERT3S(atomic_read(&dr->dr_ref), ==, 1); ++ } ++ ++ (void)vdev_disk_dio_put(dr); ++ ++ return error; ++} ++ ++int ++vdev_disk_physio(struct block_device *bdev, caddr_t kbuf, ++ size_t size, uint64_t offset, int flags) ++{ ++ bio_set_flags_failfast(bdev, &flags); ++ return __vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags); ++} ++ ++/* 2.6.24 API change */ ++#ifdef HAVE_BIO_EMPTY_BARRIER ++BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc) ++{ ++ zio_t *zio = bio->bi_private; ++ ++ zio->io_delay = jiffies_to_msecs(jiffies_64 - zio->io_delay); ++ zio->io_error = -rc; ++ if (rc && (rc == -EOPNOTSUPP)) ++ zio->io_vd->vdev_nowritecache = B_TRUE; ++ ++ bio_put(bio); ++ ASSERT3S(zio->io_error, >=, 0); ++ if (zio->io_error) ++ vdev_disk_error(zio); ++ zio_interrupt(zio); ++ ++ BIO_END_IO_RETURN(0); ++} ++ ++static int ++vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) ++{ ++ struct request_queue *q; ++ struct bio *bio; ++ ++ q = bdev_get_queue(bdev); ++ if (!q) ++ return ENXIO; ++ ++ bio = bio_alloc(GFP_KERNEL, 0); ++ if (!bio) ++ return ENOMEM; ++ ++ bio->bi_end_io = vdev_disk_io_flush_completion; ++ bio->bi_private = zio; ++ bio->bi_bdev = bdev; ++ zio->io_delay = jiffies_64; ++ submit_bio(VDEV_WRITE_FLUSH_FUA, bio); ++ ++ return 0; ++} ++#else ++static int ++vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) ++{ ++ return ENOTSUP; ++} ++#endif /* HAVE_BIO_EMPTY_BARRIER */ ++ ++static int ++vdev_disk_io_start(zio_t *zio) ++{ ++ vdev_t *v = zio->io_vd; ++ vdev_disk_t *vd = v->vdev_tsd; ++ int flags, error; ++ ++ switch (zio->io_type) { ++ case ZIO_TYPE_IOCTL: ++ ++ if (!vdev_readable(v)) { ++ zio->io_error = ENXIO; ++ return ZIO_PIPELINE_CONTINUE; ++ } ++ ++ switch (zio->io_cmd) { ++ case DKIOCFLUSHWRITECACHE: 
++ ++ if (zfs_nocacheflush) ++ break; ++ ++ if (v->vdev_nowritecache) { ++ zio->io_error = ENOTSUP; ++ break; ++ } ++ ++ error = vdev_disk_io_flush(vd->vd_bdev, zio); ++ if (error == 0) ++ return ZIO_PIPELINE_STOP; ++ ++ zio->io_error = error; ++ if (error == ENOTSUP) ++ v->vdev_nowritecache = B_TRUE; ++ ++ break; ++ ++ default: ++ zio->io_error = ENOTSUP; ++ } ++ ++ return ZIO_PIPELINE_CONTINUE; ++ ++ case ZIO_TYPE_WRITE: ++ flags = WRITE; ++ break; ++ ++ case ZIO_TYPE_READ: ++ flags = READ; ++ break; ++ ++ default: ++ zio->io_error = ENOTSUP; ++ return ZIO_PIPELINE_CONTINUE; ++ } ++ ++ error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, ++ zio->io_size, zio->io_offset, flags); ++ if (error) { ++ zio->io_error = error; ++ return ZIO_PIPELINE_CONTINUE; ++ } ++ ++ return ZIO_PIPELINE_STOP; ++} ++ ++static void ++vdev_disk_io_done(zio_t *zio) ++{ ++ /* ++ * If the device returned EIO, we revalidate the media. If it is ++ * determined the media has changed this triggers the asynchronous ++ * removal of the device from the configuration. ++ */ ++ if (zio->io_error == EIO) { ++ vdev_t *v = zio->io_vd; ++ vdev_disk_t *vd = v->vdev_tsd; ++ ++ if (check_disk_change(vd->vd_bdev)) { ++ vdev_bdev_invalidate(vd->vd_bdev); ++ v->vdev_remove_wanted = B_TRUE; ++ spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); ++ } ++ } ++} ++ ++static void ++vdev_disk_hold(vdev_t *vd) ++{ ++ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); ++ ++ /* We must have a pathname, and it must be absolute. */ ++ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') ++ return; ++ ++ /* ++ * Only prefetch path and devid info if the device has ++ * never been opened. ++ */ ++ if (vd->vdev_tsd != NULL) ++ return; ++ ++ /* XXX: Implement me as a vnode lookup for the device */ ++ vd->vdev_name_vp = NULL; ++ vd->vdev_devid_vp = NULL; ++} ++ ++static void ++vdev_disk_rele(vdev_t *vd) ++{ ++ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); ++ ++ /* XXX: Implement me as a vnode rele for the device */ ++} ++ ++vdev_ops_t vdev_disk_ops = { ++ vdev_disk_open, ++ vdev_disk_close, ++ vdev_default_asize, ++ vdev_disk_io_start, ++ vdev_disk_io_done, ++ NULL, ++ vdev_disk_hold, ++ vdev_disk_rele, ++ VDEV_TYPE_DISK, /* name of this vdev type */ ++ B_TRUE /* leaf vdev */ ++}; ++ ++/* ++ * Given the root disk device devid or pathname, read the label from ++ * the device, and construct a configuration nvlist. 
++ */ ++int ++vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) ++{ ++ struct block_device *bdev; ++ vdev_label_t *label; ++ uint64_t s, size; ++ int i; ++ ++ bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), NULL); ++ if (IS_ERR(bdev)) ++ return -PTR_ERR(bdev); ++ ++ s = bdev_capacity(bdev); ++ if (s == 0) { ++ vdev_bdev_close(bdev, vdev_bdev_mode(FREAD)); ++ return EIO; ++ } ++ ++ size = P2ALIGN_TYPED(s, sizeof(vdev_label_t), uint64_t); ++ label = vmem_alloc(sizeof(vdev_label_t), KM_PUSHPAGE); ++ ++ for (i = 0; i < VDEV_LABELS; i++) { ++ uint64_t offset, state, txg = 0; ++ ++ /* read vdev label */ ++ offset = vdev_label_offset(size, i, 0); ++ if (vdev_disk_physio(bdev, (caddr_t)label, ++ VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, READ_SYNC) != 0) ++ continue; ++ ++ if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, ++ sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { ++ *config = NULL; ++ continue; ++ } ++ ++ if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, ++ &state) != 0 || state >= POOL_STATE_DESTROYED) { ++ nvlist_free(*config); ++ *config = NULL; ++ continue; ++ } ++ ++ if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, ++ &txg) != 0 || txg == 0) { ++ nvlist_free(*config); ++ *config = NULL; ++ continue; ++ } ++ ++ break; ++ } ++ ++ vmem_free(label, sizeof(vdev_label_t)); ++ vdev_bdev_close(bdev, vdev_bdev_mode(FREAD)); ++ ++ return 0; ++} ++ ++module_param(zfs_vdev_scheduler, charp, 0644); ++MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/vdev_file.c linux-3.2.33-go/fs/zfs/zfs/vdev_file.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/vdev_file.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/vdev_file.c 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,219 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2012 by Delphix. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Virtual device vector for files. ++ */ ++ ++static void ++vdev_file_hold(vdev_t *vd) ++{ ++ ASSERT(vd->vdev_path != NULL); ++} ++ ++static void ++vdev_file_rele(vdev_t *vd) ++{ ++ ASSERT(vd->vdev_path != NULL); ++} ++ ++static int ++vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, ++ uint64_t *ashift) ++{ ++ vdev_file_t *vf; ++ vnode_t *vp; ++ vattr_t vattr; ++ int error; ++ ++ /* ++ * We must have a pathname, and it must be absolute. 
++ */ ++ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { ++ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; ++ return (EINVAL); ++ } ++ ++ /* ++ * Reopen the device if it's not currently open. Otherwise, ++ * just update the physical size of the device. ++ */ ++ if (vd->vdev_tsd != NULL) { ++ ASSERT(vd->vdev_reopening); ++ vf = vd->vdev_tsd; ++ goto skip_open; ++ } ++ ++ vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_PUSHPAGE); ++ ++ /* ++ * We always open the files from the root of the global zone, even if ++ * we're in a local zone. If the user has gotten to this point, the ++ * administrator has already decided that the pool should be available ++ * to local zone users, so the underlying devices should be as well. ++ */ ++ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); ++ error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, ++ spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); ++ ++ if (error) { ++ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; ++ return (error); ++ } ++ ++ vf->vf_vnode = vp; ++ ++#ifdef _KERNEL ++ /* ++ * Make sure it's a regular file. ++ */ ++ if (vp->v_type != VREG) { ++ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; ++ return (ENODEV); ++ } ++#endif ++ ++skip_open: ++ /* ++ * Determine the physical size of the file. ++ */ ++ vattr.va_mask = AT_SIZE; ++ error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL); ++ if (error) { ++ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; ++ return (error); ++ } ++ ++ *max_psize = *psize = vattr.va_size; ++ *ashift = SPA_MINBLOCKSHIFT; ++ ++ return (0); ++} ++ ++static void ++vdev_file_close(vdev_t *vd) ++{ ++ vdev_file_t *vf = vd->vdev_tsd; ++ ++ if (vd->vdev_reopening || vf == NULL) ++ return; ++ ++ if (vf->vf_vnode != NULL) { ++ (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); ++ (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, ++ kcred, NULL); ++ } ++ ++ vd->vdev_delayed_close = B_FALSE; ++ kmem_free(vf, sizeof (vdev_file_t)); ++ vd->vdev_tsd = NULL; ++} ++ ++static int ++vdev_file_io_start(zio_t *zio) ++{ ++ vdev_t *vd = zio->io_vd; ++ vdev_file_t *vf; ++ ssize_t resid = 0; ++ ++ if (!vdev_readable(vd)) { ++ zio->io_error = ENXIO; ++ return (ZIO_PIPELINE_CONTINUE); ++ } ++ ++ vf = vd->vdev_tsd; ++ ++ if (zio->io_type == ZIO_TYPE_IOCTL) { ++ switch (zio->io_cmd) { ++ case DKIOCFLUSHWRITECACHE: ++ zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, ++ kcred, NULL); ++ break; ++ default: ++ zio->io_error = ENOTSUP; ++ } ++ ++ return (ZIO_PIPELINE_CONTINUE); ++ } ++ ++ zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? ++ UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, ++ zio->io_size, zio->io_offset, UIO_SYSSPACE, ++ 0, RLIM64_INFINITY, kcred, &resid); ++ ++ if (resid != 0 && zio->io_error == 0) ++ zio->io_error = ENOSPC; ++ ++ zio_interrupt(zio); ++ ++ return (ZIO_PIPELINE_STOP); ++} ++ ++/* ARGSUSED */ ++static void ++vdev_file_io_done(zio_t *zio) ++{ ++} ++ ++vdev_ops_t vdev_file_ops = { ++ vdev_file_open, ++ vdev_file_close, ++ vdev_default_asize, ++ vdev_file_io_start, ++ vdev_file_io_done, ++ NULL, ++ vdev_file_hold, ++ vdev_file_rele, ++ VDEV_TYPE_FILE, /* name of this vdev type */ ++ B_TRUE /* leaf vdev */ ++}; ++ ++/* ++ * From userland we access disks just like files. 
++ */ ++#ifndef _KERNEL ++ ++vdev_ops_t vdev_disk_ops = { ++ vdev_file_open, ++ vdev_file_close, ++ vdev_default_asize, ++ vdev_file_io_start, ++ vdev_file_io_done, ++ NULL, ++ vdev_file_hold, ++ vdev_file_rele, ++ VDEV_TYPE_DISK, /* name of this vdev type */ ++ B_TRUE /* leaf vdev */ ++}; ++ ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/vdev_label.c linux-3.2.33-go/fs/zfs/zfs/vdev_label.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/vdev_label.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/vdev_label.c 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,1225 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* ++ * Virtual Device Labels ++ * --------------------- ++ * ++ * The vdev label serves several distinct purposes: ++ * ++ * 1. Uniquely identify this device as part of a ZFS pool and confirm its ++ * identity within the pool. ++ * ++ * 2. Verify that all the devices given in a configuration are present ++ * within the pool. ++ * ++ * 3. Determine the uberblock for the pool. ++ * ++ * 4. In case of an import operation, determine the configuration of the ++ * toplevel vdev of which it is a part. ++ * ++ * 5. If an import operation cannot find all the devices in the pool, ++ * provide enough information to the administrator to determine which ++ * devices are missing. ++ * ++ * It is important to note that while the kernel is responsible for writing the ++ * label, it only consumes the information in the first three cases. The ++ * latter information is only consumed in userland when determining the ++ * configuration to import a pool. ++ * ++ * ++ * Label Organization ++ * ------------------ ++ * ++ * Before describing the contents of the label, it's important to understand how ++ * the labels are written and updated with respect to the uberblock. ++ * ++ * When the pool configuration is altered, either because it was newly created ++ * or a device was added, we want to update all the labels such that we can deal ++ * with fatal failure at any point. To this end, each disk has two labels which ++ * are updated before and after the uberblock is synced. Assuming we have ++ * labels and an uberblock with the following transaction groups: ++ * ++ * L1 UB L2 ++ * +------+ +------+ +------+ ++ * | | | | | | ++ * | t10 | | t10 | | t10 | ++ * | | | | | | ++ * +------+ +------+ +------+ ++ * ++ * In this stable state, the labels and the uberblock were all updated within ++ * the same transaction group (10). Each label is mirrored and checksummed, so ++ * that we can detect when we fail partway through writing the label. 
++ * ++ * In order to identify which labels are valid, the labels are written in the ++ * following manner: ++ * ++ * 1. For each vdev, update 'L1' to the new label ++ * 2. Update the uberblock ++ * 3. For each vdev, update 'L2' to the new label ++ * ++ * Given arbitrary failure, we can determine the correct label to use based on ++ * the transaction group. If we fail after updating L1 but before updating the ++ * UB, we will notice that L1's transaction group is greater than the uberblock, ++ * so L2 must be valid. If we fail after writing the uberblock but before ++ * writing L2, we will notice that L2's transaction group is less than L1, and ++ * therefore L1 is valid. ++ * ++ * Another added complexity is that not every label is updated when the config ++ * is synced. If we add a single device, we do not want to have to re-write ++ * every label for every device in the pool. This means that both L1 and L2 may ++ * be older than the pool uberblock, because the necessary information is stored ++ * on another vdev. ++ * ++ * ++ * On-disk Format ++ * -------------- ++ * ++ * The vdev label consists of two distinct parts, and is wrapped within the ++ * vdev_label_t structure. The label includes 8k of padding to permit legacy ++ * VTOC disk labels, but is otherwise ignored. ++ * ++ * The first half of the label is a packed nvlist which contains pool wide ++ * properties, per-vdev properties, and configuration information. It is ++ * described in more detail below. ++ * ++ * The latter half of the label consists of a redundant array of uberblocks. ++ * These uberblocks are updated whenever a transaction group is committed, ++ * or when the configuration is updated. When a pool is loaded, we scan each ++ * vdev for the 'best' uberblock. ++ * ++ * ++ * Configuration Information ++ * ------------------------- ++ * ++ * The nvlist describing the pool and vdev contains the following elements: ++ * ++ * version ZFS on-disk version ++ * name Pool name ++ * state Pool state ++ * txg Transaction group in which this label was written ++ * pool_guid Unique identifier for this pool ++ * vdev_tree An nvlist describing vdev tree. ++ * ++ * Each leaf device label also contains the following: ++ * ++ * top_guid Unique ID for top-level vdev in which this is contained ++ * guid Unique ID for the leaf vdev ++ * ++ * The 'vs' configuration follows the format described in 'spa_config.c'. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Basic routines to read and write from a vdev label. ++ * Used throughout the rest of this file. ++ */ ++uint64_t ++vdev_label_offset(uint64_t psize, int l, uint64_t offset) ++{ ++ ASSERT(offset < sizeof (vdev_label_t)); ++ ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0); ++ ++ return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? ++ 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); ++} ++ ++/* ++ * Returns back the vdev label associated with the passed in offset. ++ */ ++int ++vdev_label_number(uint64_t psize, uint64_t offset) ++{ ++ int l; ++ ++ if (offset >= psize - VDEV_LABEL_END_SIZE) { ++ offset -= psize - VDEV_LABEL_END_SIZE; ++ offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t); ++ } ++ l = offset / sizeof (vdev_label_t); ++ return (l < VDEV_LABELS ? 
l : -1); ++} ++ ++static void ++vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, ++ uint64_t size, zio_done_func_t *done, void *private, int flags) ++{ ++ ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == ++ SCL_STATE_ALL); ++ ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); ++ ++ zio_nowait(zio_read_phys(zio, vd, ++ vdev_label_offset(vd->vdev_psize, l, offset), ++ size, buf, ZIO_CHECKSUM_LABEL, done, private, ++ ZIO_PRIORITY_SYNC_READ, flags, B_TRUE)); ++} ++ ++static void ++vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, ++ uint64_t size, zio_done_func_t *done, void *private, int flags) ++{ ++ ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || ++ (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) == ++ (SCL_CONFIG | SCL_STATE) && ++ dsl_pool_sync_context(spa_get_dsl(zio->io_spa)))); ++ ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); ++ ++ zio_nowait(zio_write_phys(zio, vd, ++ vdev_label_offset(vd->vdev_psize, l, offset), ++ size, buf, ZIO_CHECKSUM_LABEL, done, private, ++ ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); ++} ++ ++/* ++ * Generate the nvlist representing this vdev's config. ++ */ ++nvlist_t * ++vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, ++ vdev_config_flag_t flags) ++{ ++ nvlist_t *nv = NULL; ++ ++ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ ++ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, ++ vd->vdev_ops->vdev_op_type) == 0); ++ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) ++ == 0); ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); ++ ++ if (vd->vdev_path != NULL) ++ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, ++ vd->vdev_path) == 0); ++ ++ if (vd->vdev_devid != NULL) ++ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, ++ vd->vdev_devid) == 0); ++ ++ if (vd->vdev_physpath != NULL) ++ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, ++ vd->vdev_physpath) == 0); ++ ++ if (vd->vdev_fru != NULL) ++ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU, ++ vd->vdev_fru) == 0); ++ ++ if (vd->vdev_nparity != 0) { ++ ASSERT(strcmp(vd->vdev_ops->vdev_op_type, ++ VDEV_TYPE_RAIDZ) == 0); ++ ++ /* ++ * Make sure someone hasn't managed to sneak a fancy new vdev ++ * into a crufty old storage pool. ++ */ ++ ASSERT(vd->vdev_nparity == 1 || ++ (vd->vdev_nparity <= 2 && ++ spa_version(spa) >= SPA_VERSION_RAIDZ2) || ++ (vd->vdev_nparity <= 3 && ++ spa_version(spa) >= SPA_VERSION_RAIDZ3)); ++ ++ /* ++ * Note that we'll add the nparity tag even on storage pools ++ * that only support a single parity device -- older software ++ * will just ignore it. 
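++ * (Illustrative: a raidz2 top-level vdev stores ZPOOL_CONFIG_NPARITY = 2
++ * here, which the ASSERT above only permits once the pool is at
++ * SPA_VERSION_RAIDZ2 or newer.)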
++ */ ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, ++ vd->vdev_nparity) == 0); ++ } ++ ++ if (vd->vdev_wholedisk != -1ULL) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, ++ vd->vdev_wholedisk) == 0); ++ ++ if (vd->vdev_not_present) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0); ++ ++ if (vd->vdev_isspare) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0); ++ ++ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && ++ vd == vd->vdev_top) { ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, ++ vd->vdev_ms_array) == 0); ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, ++ vd->vdev_ms_shift) == 0); ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, ++ vd->vdev_ashift) == 0); ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, ++ vd->vdev_asize) == 0); ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, ++ vd->vdev_islog) == 0); ++ if (vd->vdev_removing) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, ++ vd->vdev_removing) == 0); ++ } ++ ++ if (vd->vdev_dtl_smo.smo_object != 0) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, ++ vd->vdev_dtl_smo.smo_object) == 0); ++ ++ if (vd->vdev_crtxg) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, ++ vd->vdev_crtxg) == 0); ++ ++ if (getstats) { ++ vdev_stat_t vs; ++ pool_scan_stat_t ps; ++ ++ vdev_get_stats(vd, &vs); ++ VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, ++ (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0); ++ ++ /* provide either current or previous scan information */ ++ if (spa_scan_get_stats(spa, &ps) == 0) { ++ VERIFY(nvlist_add_uint64_array(nv, ++ ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, ++ sizeof (pool_scan_stat_t) / sizeof (uint64_t)) ++ == 0); ++ } ++ } ++ ++ if (!vd->vdev_ops->vdev_op_leaf) { ++ nvlist_t **child; ++ int c, idx; ++ ++ ASSERT(!vd->vdev_ishole); ++ ++ child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), ++ KM_PUSHPAGE); ++ ++ for (c = 0, idx = 0; c < vd->vdev_children; c++) { ++ vdev_t *cvd = vd->vdev_child[c]; ++ ++ /* ++ * If we're generating an nvlist of removing ++ * vdevs then skip over any device which is ++ * not being removed. 
++ */ ++ if ((flags & VDEV_CONFIG_REMOVING) && ++ !cvd->vdev_removing) ++ continue; ++ ++ child[idx++] = vdev_config_generate(spa, cvd, ++ getstats, flags); ++ } ++ ++ if (idx) { ++ VERIFY(nvlist_add_nvlist_array(nv, ++ ZPOOL_CONFIG_CHILDREN, child, idx) == 0); ++ } ++ ++ for (c = 0; c < idx; c++) ++ nvlist_free(child[c]); ++ ++ kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); ++ ++ } else { ++ const char *aux = NULL; ++ ++ if (vd->vdev_offline && !vd->vdev_tmpoffline) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, ++ B_TRUE) == 0); ++ if (vd->vdev_resilvering) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING, ++ B_TRUE) == 0); ++ if (vd->vdev_faulted) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, ++ B_TRUE) == 0); ++ if (vd->vdev_degraded) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, ++ B_TRUE) == 0); ++ if (vd->vdev_removed) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, ++ B_TRUE) == 0); ++ if (vd->vdev_unspare) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, ++ B_TRUE) == 0); ++ if (vd->vdev_ishole) ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, ++ B_TRUE) == 0); ++ ++ switch (vd->vdev_stat.vs_aux) { ++ case VDEV_AUX_ERR_EXCEEDED: ++ aux = "err_exceeded"; ++ break; ++ ++ case VDEV_AUX_EXTERNAL: ++ aux = "external"; ++ break; ++ } ++ ++ if (aux != NULL) ++ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, ++ aux) == 0); ++ ++ if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { ++ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, ++ vd->vdev_orig_guid) == 0); ++ } ++ } ++ ++ return (nv); ++} ++ ++/* ++ * Generate a view of the top-level vdevs. If we currently have holes ++ * in the namespace, then generate an array which contains a list of holey ++ * vdevs. Additionally, add the number of top-level children that currently ++ * exist. 
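++ * For example (illustrative): in a pool with four top-level slots where
++ * only index 2 is a hole, the generated nvlist carries
++ * ZPOOL_CONFIG_HOLE_ARRAY = { 2 } and ZPOOL_CONFIG_VDEV_CHILDREN = 4.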
++ */ ++void ++vdev_top_config_generate(spa_t *spa, nvlist_t *config) ++{ ++ vdev_t *rvd = spa->spa_root_vdev; ++ uint64_t *array; ++ uint_t c, idx; ++ ++ array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_PUSHPAGE); ++ ++ for (c = 0, idx = 0; c < rvd->vdev_children; c++) { ++ vdev_t *tvd = rvd->vdev_child[c]; ++ ++ if (tvd->vdev_ishole) ++ array[idx++] = c; ++ } ++ ++ if (idx) { ++ VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, ++ array, idx) == 0); ++ } ++ ++ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, ++ rvd->vdev_children) == 0); ++ ++ kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); ++} ++ ++nvlist_t * ++vdev_label_read_config(vdev_t *vd) ++{ ++ spa_t *spa = vd->vdev_spa; ++ nvlist_t *config = NULL; ++ vdev_phys_t *vp; ++ zio_t *zio; ++ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ++ ZIO_FLAG_SPECULATIVE; ++ int l; ++ ++ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ++ ++ if (!vdev_readable(vd)) ++ return (NULL); ++ ++ vp = zio_buf_alloc(sizeof (vdev_phys_t)); ++ ++retry: ++ for (l = 0; l < VDEV_LABELS; l++) { ++ ++ zio = zio_root(spa, NULL, NULL, flags); ++ ++ vdev_label_read(zio, vd, l, vp, ++ offsetof(vdev_label_t, vl_vdev_phys), ++ sizeof (vdev_phys_t), NULL, NULL, flags); ++ ++ if (zio_wait(zio) == 0 && ++ nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), ++ &config, 0) == 0) ++ break; ++ ++ if (config != NULL) { ++ nvlist_free(config); ++ config = NULL; ++ } ++ } ++ ++ if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { ++ flags |= ZIO_FLAG_TRYHARD; ++ goto retry; ++ } ++ ++ zio_buf_free(vp, sizeof (vdev_phys_t)); ++ ++ return (config); ++} ++ ++/* ++ * Determine if a device is in use. The 'spare_guid' parameter will be filled ++ * in with the device guid if this spare is active elsewhere on the system. ++ */ ++static boolean_t ++vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, ++ uint64_t *spare_guid, uint64_t *l2cache_guid) ++{ ++ spa_t *spa = vd->vdev_spa; ++ uint64_t state, pool_guid, device_guid, txg, spare_pool; ++ uint64_t vdtxg = 0; ++ nvlist_t *label; ++ ++ if (spare_guid) ++ *spare_guid = 0ULL; ++ if (l2cache_guid) ++ *l2cache_guid = 0ULL; ++ ++ /* ++ * Read the label, if any, and perform some basic sanity checks. ++ */ ++ if ((label = vdev_label_read_config(vd)) == NULL) ++ return (B_FALSE); ++ ++ (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, ++ &vdtxg); ++ ++ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, ++ &state) != 0 || ++ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, ++ &device_guid) != 0) { ++ nvlist_free(label); ++ return (B_FALSE); ++ } ++ ++ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && ++ (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, ++ &pool_guid) != 0 || ++ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, ++ &txg) != 0)) { ++ nvlist_free(label); ++ return (B_FALSE); ++ } ++ ++ nvlist_free(label); ++ ++ /* ++ * Check to see if this device indeed belongs to the pool it claims to ++ * be a part of. The only way this is allowed is if the device is a hot ++ * spare (which we check for later on). ++ */ ++ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && ++ !spa_guid_exists(pool_guid, device_guid) && ++ !spa_spare_exists(device_guid, NULL, NULL) && ++ !spa_l2cache_exists(device_guid, NULL)) ++ return (B_FALSE); ++ ++ /* ++ * If the transaction group is zero, then this an initialized (but ++ * unused) label. 
This is only an error if the create transaction ++ * on-disk is the same as the one we're using now, in which case the ++ * user has attempted to add the same vdev multiple times in the same ++ * transaction. ++ */ ++ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && ++ txg == 0 && vdtxg == crtxg) ++ return (B_TRUE); ++ ++ /* ++ * Check to see if this is a spare device. We do an explicit check for ++ * spa_has_spare() here because it may be on our pending list of spares ++ * to add. We also check if it is an l2cache device. ++ */ ++ if (spa_spare_exists(device_guid, &spare_pool, NULL) || ++ spa_has_spare(spa, device_guid)) { ++ if (spare_guid) ++ *spare_guid = device_guid; ++ ++ switch (reason) { ++ case VDEV_LABEL_CREATE: ++ case VDEV_LABEL_L2CACHE: ++ return (B_TRUE); ++ ++ case VDEV_LABEL_REPLACE: ++ return (!spa_has_spare(spa, device_guid) || ++ spare_pool != 0ULL); ++ ++ case VDEV_LABEL_SPARE: ++ return (spa_has_spare(spa, device_guid)); ++ default: ++ break; ++ } ++ } ++ ++ /* ++ * Check to see if this is an l2cache device. ++ */ ++ if (spa_l2cache_exists(device_guid, NULL)) ++ return (B_TRUE); ++ ++ /* ++ * We can't rely on a pool's state if it's been imported ++ * read-only. Instead we look to see if the pools is marked ++ * read-only in the namespace and set the state to active. ++ */ ++ if ((spa = spa_by_guid(pool_guid, device_guid)) != NULL && ++ spa_mode(spa) == FREAD) ++ state = POOL_STATE_ACTIVE; ++ ++ /* ++ * If the device is marked ACTIVE, then this device is in use by another ++ * pool on the system. ++ */ ++ return (state == POOL_STATE_ACTIVE); ++} ++ ++/* ++ * Initialize a vdev label. We check to make sure each leaf device is not in ++ * use, and writable. We put down an initial label which we will later ++ * overwrite with a complete label. Note that it's important to do this ++ * sequentially, not in parallel, so that we catch cases of multiple use of the ++ * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with ++ * itself. ++ */ ++int ++vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) ++{ ++ spa_t *spa = vd->vdev_spa; ++ nvlist_t *label; ++ vdev_phys_t *vp; ++ char *pad2; ++ uberblock_t *ub; ++ zio_t *zio; ++ char *buf; ++ size_t buflen; ++ int error; ++ uint64_t spare_guid = 0, l2cache_guid = 0; ++ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; ++ int c, l; ++ vdev_t *pvd; ++ ++ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ if ((error = vdev_label_init(vd->vdev_child[c], ++ crtxg, reason)) != 0) ++ return (error); ++ ++ /* Track the creation time for this vdev */ ++ vd->vdev_crtxg = crtxg; ++ ++ if (!vd->vdev_ops->vdev_op_leaf) ++ return (0); ++ ++ /* ++ * Dead vdevs cannot be initialized. ++ */ ++ if (vdev_is_dead(vd)) ++ return (EIO); ++ ++ /* ++ * Determine if the vdev is in use. ++ */ ++ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && ++ vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) ++ return (EBUSY); ++ ++ /* ++ * If this is a request to add or replace a spare or l2cache device ++ * that is in use elsewhere on the system, then we must update the ++ * guid (which was initialized to a random value) to reflect the ++ * actual GUID (which is shared between multiple pools). 
++ */ ++ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE && ++ spare_guid != 0ULL) { ++ uint64_t guid_delta = spare_guid - vd->vdev_guid; ++ ++ vd->vdev_guid += guid_delta; ++ ++ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) ++ pvd->vdev_guid_sum += guid_delta; ++ ++ /* ++ * If this is a replacement, then we want to fallthrough to the ++ * rest of the code. If we're adding a spare, then it's already ++ * labeled appropriately and we can just return. ++ */ ++ if (reason == VDEV_LABEL_SPARE) ++ return (0); ++ ASSERT(reason == VDEV_LABEL_REPLACE || ++ reason == VDEV_LABEL_SPLIT); ++ } ++ ++ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && ++ l2cache_guid != 0ULL) { ++ uint64_t guid_delta = l2cache_guid - vd->vdev_guid; ++ ++ vd->vdev_guid += guid_delta; ++ ++ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) ++ pvd->vdev_guid_sum += guid_delta; ++ ++ /* ++ * If this is a replacement, then we want to fallthrough to the ++ * rest of the code. If we're adding an l2cache, then it's ++ * already labeled appropriately and we can just return. ++ */ ++ if (reason == VDEV_LABEL_L2CACHE) ++ return (0); ++ ASSERT(reason == VDEV_LABEL_REPLACE); ++ } ++ ++ /* ++ * Initialize its label. ++ */ ++ vp = zio_buf_alloc(sizeof (vdev_phys_t)); ++ bzero(vp, sizeof (vdev_phys_t)); ++ ++ /* ++ * Generate a label describing the pool and our top-level vdev. ++ * We mark it as being from txg 0 to indicate that it's not ++ * really part of an active pool just yet. The labels will ++ * be written again with a meaningful txg by spa_sync(). ++ */ ++ if (reason == VDEV_LABEL_SPARE || ++ (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { ++ /* ++ * For inactive hot spares, we generate a special label that ++ * identifies as a mutually shared hot spare. We write the ++ * label if we are adding a hot spare, or if we are removing an ++ * active hot spare (in which case we want to revert the ++ * labels). ++ */ ++ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ ++ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, ++ spa_version(spa)) == 0); ++ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, ++ POOL_STATE_SPARE) == 0); ++ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, ++ vd->vdev_guid) == 0); ++ } else if (reason == VDEV_LABEL_L2CACHE || ++ (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { ++ /* ++ * For level 2 ARC devices, add a special label. ++ */ ++ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); ++ ++ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, ++ spa_version(spa)) == 0); ++ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, ++ POOL_STATE_L2CACHE) == 0); ++ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, ++ vd->vdev_guid) == 0); ++ } else { ++ uint64_t txg = 0ULL; ++ ++ if (reason == VDEV_LABEL_SPLIT) ++ txg = spa->spa_uberblock.ub_txg; ++ label = spa_config_generate(spa, vd, txg, B_FALSE); ++ ++ /* ++ * Add our creation time. This allows us to detect multiple ++ * vdev uses as described above, and automatically expires if we ++ * fail. ++ */ ++ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, ++ crtxg) == 0); ++ } ++ ++ buf = vp->vp_nvlist; ++ buflen = sizeof (vp->vp_nvlist); ++ ++ error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_PUSHPAGE); ++ if (error != 0) { ++ nvlist_free(label); ++ zio_buf_free(vp, sizeof (vdev_phys_t)); ++ /* EFAULT means nvlist_pack ran out of room */ ++ return (error == EFAULT ? ENAMETOOLONG : EINVAL); ++ } ++ ++ /* ++ * Initialize uberblock template. 
++ */
++ ub = zio_buf_alloc(VDEV_UBERBLOCK_RING);
++ bzero(ub, VDEV_UBERBLOCK_RING);
++ *ub = spa->spa_uberblock;
++ ub->ub_txg = 0;
++
++ /* Initialize the 2nd padding area. */
++ pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
++ bzero(pad2, VDEV_PAD_SIZE);
++
++ /*
++ * Write everything in parallel.
++ */
++retry:
++ zio = zio_root(spa, NULL, NULL, flags);
++
++ for (l = 0; l < VDEV_LABELS; l++) {
++
++ vdev_label_write(zio, vd, l, vp,
++ offsetof(vdev_label_t, vl_vdev_phys),
++ sizeof (vdev_phys_t), NULL, NULL, flags);
++
++ /*
++ * Skip the 1st padding area.
++ * Zero out the 2nd padding area where it might have
++ * left over data from previous filesystem format.
++ */
++ vdev_label_write(zio, vd, l, pad2,
++ offsetof(vdev_label_t, vl_pad2),
++ VDEV_PAD_SIZE, NULL, NULL, flags);
++
++ vdev_label_write(zio, vd, l, ub,
++ offsetof(vdev_label_t, vl_uberblock),
++ VDEV_UBERBLOCK_RING, NULL, NULL, flags);
++ }
++
++ error = zio_wait(zio);
++
++ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
++ flags |= ZIO_FLAG_TRYHARD;
++ goto retry;
++ }
++
++ nvlist_free(label);
++ zio_buf_free(pad2, VDEV_PAD_SIZE);
++ zio_buf_free(ub, VDEV_UBERBLOCK_RING);
++ zio_buf_free(vp, sizeof (vdev_phys_t));
++
++ /*
++ * If this vdev hasn't been previously identified as a spare, then we
++ * mark it as such only if a) we are labeling it as a spare, or b) it
++ * exists as a spare elsewhere in the system. Do the same for
++ * level 2 ARC devices.
++ */
++ if (error == 0 && !vd->vdev_isspare &&
++ (reason == VDEV_LABEL_SPARE ||
++ spa_spare_exists(vd->vdev_guid, NULL, NULL)))
++ spa_spare_add(vd);
++
++ if (error == 0 && !vd->vdev_isl2cache &&
++ (reason == VDEV_LABEL_L2CACHE ||
++ spa_l2cache_exists(vd->vdev_guid, NULL)))
++ spa_l2cache_add(vd);
++
++ return (error);
++}
++
++/*
++ * ==========================================================================
++ * uberblock load/sync
++ * ==========================================================================
++ */
++
++/*
++ * Consider the following situation: txg is safely synced to disk. We've
++ * written the first uberblock for txg + 1, and then we lose power. When we
++ * come back up, we fail to see the uberblock for txg + 1 because, say,
++ * it was on a mirrored device and the replica to which we wrote txg + 1
++ * is now offline. If we then make some changes and sync txg + 1, and then
++ * the missing replica comes back, then for a few seconds we'll have two
++ * conflicting uberblocks on disk with the same txg. The solution is simple:
++ * among uberblocks with equal txg, choose the one with the latest timestamp.
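To make the tie-break above concrete, here is a tiny stand-alone editor's sketch (not part of the patch): the uberblock_t is reduced to the two fields that matter here, the struct name ub and the sample txg/timestamp values are invented, and the comparator simply mirrors the rule implemented by the function that follows. Of two uberblocks with the same txg, the one with the later timestamp is preferred.

#include <stdio.h>
#include <stdint.h>

struct ub { uint64_t txg, timestamp; };    /* reduced stand-in for uberblock_t */

static int
ub_compare(const struct ub *a, const struct ub *b)
{
    if (a->txg != b->txg)
        return (a->txg < b->txg ? -1 : 1);
    if (a->timestamp != b->timestamp)
        return (a->timestamp < b->timestamp ? -1 : 1);
    return (0);
}

int
main(void)
{
    struct ub before = { 1234, 5000 };    /* written before the replica went offline */
    struct ub after  = { 1234, 5060 };    /* rewritten for the same txg a minute later */

    /* Same txg, later timestamp wins: prints "keep the later one". */
    printf("%s\n", ub_compare(&after, &before) > 0 ?
        "keep the later one" : "keep the earlier one");
    return (0);
}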
++ */ ++static int ++vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) ++{ ++ if (ub1->ub_txg < ub2->ub_txg) ++ return (-1); ++ if (ub1->ub_txg > ub2->ub_txg) ++ return (1); ++ ++ if (ub1->ub_timestamp < ub2->ub_timestamp) ++ return (-1); ++ if (ub1->ub_timestamp > ub2->ub_timestamp) ++ return (1); ++ ++ return (0); ++} ++ ++static void ++vdev_uberblock_load_done(zio_t *zio) ++{ ++ spa_t *spa = zio->io_spa; ++ zio_t *rio = zio->io_private; ++ uberblock_t *ub = zio->io_data; ++ uberblock_t *ubbest = rio->io_private; ++ ++ ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd)); ++ ++ if (zio->io_error == 0 && uberblock_verify(ub) == 0) { ++ mutex_enter(&rio->io_lock); ++ if (ub->ub_txg <= spa->spa_load_max_txg && ++ vdev_uberblock_compare(ub, ubbest) > 0) ++ *ubbest = *ub; ++ mutex_exit(&rio->io_lock); ++ } ++ ++ zio_buf_free(zio->io_data, zio->io_size); ++} ++ ++void ++vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) ++{ ++ spa_t *spa = vd->vdev_spa; ++ vdev_t *rvd = spa->spa_root_vdev; ++ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ++ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; ++ int c, l, n; ++ ++ if (vd == rvd) { ++ ASSERT(zio == NULL); ++ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); ++ zio = zio_root(spa, NULL, ubbest, flags); ++ bzero(ubbest, sizeof (uberblock_t)); ++ } ++ ++ ASSERT(zio != NULL); ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); ++ ++ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { ++ for (l = 0; l < VDEV_LABELS; l++) { ++ for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { ++ vdev_label_read(zio, vd, l, ++ zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), ++ VDEV_UBERBLOCK_OFFSET(vd, n), ++ VDEV_UBERBLOCK_SIZE(vd), ++ vdev_uberblock_load_done, zio, flags); ++ } ++ } ++ } ++ ++ if (vd == rvd) { ++ (void) zio_wait(zio); ++ spa_config_exit(spa, SCL_ALL, FTAG); ++ } ++} ++ ++/* ++ * On success, increment root zio's count of good writes. ++ * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). ++ */ ++static void ++vdev_uberblock_sync_done(zio_t *zio) ++{ ++ uint64_t *good_writes = zio->io_private; ++ ++ if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) ++ atomic_add_64(good_writes, 1); ++} ++ ++/* ++ * Write the uberblock to all labels of all leaves of the specified vdev. ++ */ ++static void ++vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) ++{ ++ uberblock_t *ubbuf; ++ int c, l, n; ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags); ++ ++ if (!vd->vdev_ops->vdev_op_leaf) ++ return; ++ ++ if (!vdev_writeable(vd)) ++ return; ++ ++ n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); ++ ++ ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); ++ bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); ++ *ubbuf = *ub; ++ ++ for (l = 0; l < VDEV_LABELS; l++) ++ vdev_label_write(zio, vd, l, ubbuf, ++ VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), ++ vdev_uberblock_sync_done, zio->io_private, ++ flags | ZIO_FLAG_DONT_PROPAGATE); ++ ++ zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); ++} ++ ++int ++vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) ++{ ++ spa_t *spa = svd[0]->vdev_spa; ++ zio_t *zio; ++ uint64_t good_writes = 0; ++ int v; ++ ++ zio = zio_root(spa, NULL, &good_writes, flags); ++ ++ for (v = 0; v < svdcount; v++) ++ vdev_uberblock_sync(zio, ub, svd[v], flags); ++ ++ (void) zio_wait(zio); ++ ++ /* ++ * Flush the uberblocks to disk. 
This ensures that the odd labels ++ * are no longer needed (because the new uberblocks and the even ++ * labels are safely on disk), so it is safe to overwrite them. ++ */ ++ zio = zio_root(spa, NULL, NULL, flags); ++ ++ for (v = 0; v < svdcount; v++) ++ zio_flush(zio, svd[v]); ++ ++ (void) zio_wait(zio); ++ ++ return (good_writes >= 1 ? 0 : EIO); ++} ++ ++/* ++ * On success, increment the count of good writes for our top-level vdev. ++ */ ++static void ++vdev_label_sync_done(zio_t *zio) ++{ ++ uint64_t *good_writes = zio->io_private; ++ ++ if (zio->io_error == 0) ++ atomic_add_64(good_writes, 1); ++} ++ ++/* ++ * If there weren't enough good writes, indicate failure to the parent. ++ */ ++static void ++vdev_label_sync_top_done(zio_t *zio) ++{ ++ uint64_t *good_writes = zio->io_private; ++ ++ if (*good_writes == 0) ++ zio->io_error = EIO; ++ ++ kmem_free(good_writes, sizeof (uint64_t)); ++} ++ ++/* ++ * We ignore errors for log and cache devices, simply free the private data. ++ */ ++static void ++vdev_label_sync_ignore_done(zio_t *zio) ++{ ++ kmem_free(zio->io_private, sizeof (uint64_t)); ++} ++ ++/* ++ * Write all even or odd labels to all leaves of the specified vdev. ++ */ ++static void ++vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) ++{ ++ nvlist_t *label; ++ vdev_phys_t *vp; ++ char *buf; ++ size_t buflen; ++ int c; ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags); ++ ++ if (!vd->vdev_ops->vdev_op_leaf) ++ return; ++ ++ if (!vdev_writeable(vd)) ++ return; ++ ++ /* ++ * Generate a label describing the top-level config to which we belong. ++ */ ++ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); ++ ++ vp = zio_buf_alloc(sizeof (vdev_phys_t)); ++ bzero(vp, sizeof (vdev_phys_t)); ++ ++ buf = vp->vp_nvlist; ++ buflen = sizeof (vp->vp_nvlist); ++ ++ if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_PUSHPAGE) == 0) { ++ for (; l < VDEV_LABELS; l += 2) { ++ vdev_label_write(zio, vd, l, vp, ++ offsetof(vdev_label_t, vl_vdev_phys), ++ sizeof (vdev_phys_t), ++ vdev_label_sync_done, zio->io_private, ++ flags | ZIO_FLAG_DONT_PROPAGATE); ++ } ++ } ++ ++ zio_buf_free(vp, sizeof (vdev_phys_t)); ++ nvlist_free(label); ++} ++ ++int ++vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) ++{ ++ list_t *dl = &spa->spa_config_dirty_list; ++ vdev_t *vd; ++ zio_t *zio; ++ int error; ++ ++ /* ++ * Write the new labels to disk. ++ */ ++ zio = zio_root(spa, NULL, NULL, flags); ++ ++ for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { ++ uint64_t *good_writes; ++ zio_t *vio; ++ ++ ASSERT(!vd->vdev_ishole); ++ ++ good_writes = kmem_zalloc(sizeof (uint64_t), KM_PUSHPAGE); ++ vio = zio_null(zio, spa, NULL, ++ (vd->vdev_islog || vd->vdev_aux != NULL) ? ++ vdev_label_sync_ignore_done : vdev_label_sync_top_done, ++ good_writes, flags); ++ vdev_label_sync(vio, vd, l, txg, flags); ++ zio_nowait(vio); ++ } ++ ++ error = zio_wait(zio); ++ ++ /* ++ * Flush the new labels to disk. ++ */ ++ zio = zio_root(spa, NULL, NULL, flags); ++ ++ for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) ++ zio_flush(zio, vd); ++ ++ (void) zio_wait(zio); ++ ++ return (error); ++} ++ ++/* ++ * Sync the uberblock and any changes to the vdev configuration. ++ * ++ * The order of operations is carefully crafted to ensure that ++ * if the system panics or loses power at any time, the state on disk ++ * is still transactionally consistent. The in-line comments below ++ * describe the failure semantics at each stage. 
++ * ++ * Moreover, vdev_config_sync() is designed to be idempotent: if it fails ++ * at any time, you can just call it again, and it will resume its work. ++ */ ++int ++vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) ++{ ++ spa_t *spa = svd[0]->vdev_spa; ++ uberblock_t *ub = &spa->spa_uberblock; ++ vdev_t *vd; ++ zio_t *zio; ++ int error; ++ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; ++ ++ /* ++ * Normally, we don't want to try too hard to write every label and ++ * uberblock. If there is a flaky disk, we don't want the rest of the ++ * sync process to block while we retry. But if we can't write a ++ * single label out, we should retry with ZIO_FLAG_TRYHARD before ++ * bailing out and declaring the pool faulted. ++ */ ++ if (tryhard) ++ flags |= ZIO_FLAG_TRYHARD; ++ ++ ASSERT(ub->ub_txg <= txg); ++ ++ /* ++ * If this isn't a resync due to I/O errors, ++ * and nothing changed in this transaction group, ++ * and the vdev configuration hasn't changed, ++ * then there's nothing to do. ++ */ ++ if (ub->ub_txg < txg && ++ uberblock_update(ub, spa->spa_root_vdev, txg) == B_FALSE && ++ list_is_empty(&spa->spa_config_dirty_list)) ++ return (0); ++ ++ if (txg > spa_freeze_txg(spa)) ++ return (0); ++ ++ ASSERT(txg <= spa->spa_final_txg); ++ ++ /* ++ * Flush the write cache of every disk that's been written to ++ * in this transaction group. This ensures that all blocks ++ * written in this txg will be committed to stable storage ++ * before any uberblock that references them. ++ */ ++ zio = zio_root(spa, NULL, NULL, flags); ++ ++ for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd; ++ vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) ++ zio_flush(zio, vd); ++ ++ (void) zio_wait(zio); ++ ++ /* ++ * Sync out the even labels (L0, L2) for every dirty vdev. If the ++ * system dies in the middle of this process, that's OK: all of the ++ * even labels that made it to disk will be newer than any uberblock, ++ * and will therefore be considered invalid. The odd labels (L1, L3), ++ * which have not yet been touched, will still be valid. We flush ++ * the new labels to disk to ensure that all even-label updates ++ * are committed to stable storage before the uberblock update. ++ */ ++ if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) ++ return (error); ++ ++ /* ++ * Sync the uberblocks to all vdevs in svd[]. ++ * If the system dies in the middle of this step, there are two cases ++ * to consider, and the on-disk state is consistent either way: ++ * ++ * (1) If none of the new uberblocks made it to disk, then the ++ * previous uberblock will be the newest, and the odd labels ++ * (which had not yet been touched) will be valid with respect ++ * to that uberblock. ++ * ++ * (2) If one or more new uberblocks made it to disk, then they ++ * will be the newest, and the even labels (which had all ++ * been successfully committed) will be valid with respect ++ * to the new uberblocks. ++ */ ++ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) ++ return (error); ++ ++ /* ++ * Sync out odd labels for every dirty vdev. If the system dies ++ * in the middle of this process, the even labels and the new ++ * uberblocks will suffice to open the pool. The next time ++ * the pool is opened, the first thing we'll do -- before any ++ * user data is modified -- is mark every vdev dirty so that ++ * all labels will be brought up to date. 
We flush the new labels ++ * to disk to ensure that all odd-label updates are committed to ++ * stable storage before the next transaction group begins. ++ */ ++ return (vdev_label_sync_list(spa, 1, txg, flags)); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/vdev_mirror.c linux-3.2.33-go/fs/zfs/zfs/vdev_mirror.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/vdev_mirror.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/vdev_mirror.c 2012-11-16 23:25:34.349039334 +0100 +@@ -0,0 +1,494 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++/* ++ * Copyright (c) 2012 by Delphix. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Virtual device vector for mirroring. ++ */ ++ ++typedef struct mirror_child { ++ vdev_t *mc_vd; ++ uint64_t mc_offset; ++ int mc_error; ++ uint8_t mc_tried; ++ uint8_t mc_skipped; ++ uint8_t mc_speculative; ++} mirror_child_t; ++ ++typedef struct mirror_map { ++ int mm_children; ++ int mm_replacing; ++ int mm_preferred; ++ int mm_root; ++ mirror_child_t mm_child[1]; ++} mirror_map_t; ++ ++int vdev_mirror_shift = 21; ++ ++static void ++vdev_mirror_map_free(zio_t *zio) ++{ ++ mirror_map_t *mm = zio->io_vsd; ++ ++ kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); ++} ++ ++static const zio_vsd_ops_t vdev_mirror_vsd_ops = { ++ vdev_mirror_map_free, ++ zio_vsd_default_cksum_report ++}; ++ ++static mirror_map_t * ++vdev_mirror_map_alloc(zio_t *zio) ++{ ++ mirror_map_t *mm = NULL; ++ mirror_child_t *mc; ++ vdev_t *vd = zio->io_vd; ++ int c, d; ++ ++ if (vd == NULL) { ++ dva_t *dva = zio->io_bp->blk_dva; ++ spa_t *spa = zio->io_spa; ++ ++ c = BP_GET_NDVAS(zio->io_bp); ++ ++ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE); ++ mm->mm_children = c; ++ mm->mm_replacing = B_FALSE; ++ mm->mm_preferred = spa_get_random(c); ++ mm->mm_root = B_TRUE; ++ ++ /* ++ * Check the other, lower-index DVAs to see if they're on ++ * the same vdev as the child we picked. If they are, use ++ * them since they are likely to have been allocated from ++ * the primary metaslab in use at the time, and hence are ++ * more likely to have locality with single-copy data. 
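A small worked instance of the lower-index DVA check described just above (editor's sketch, not from the patch): DVA_GET_VDEV is replaced by a plain array of invented vdev numbers, and the loop mirrors the one that follows in this function. The initial pick is DVA 2 on vdev 7; DVA 0 also lives on vdev 7, so the walk over the lower indexes settles on DVA 0.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    /* Which top-level vdev each DVA of a 3-copy block lives on. */
    const uint64_t dva_vdev[] = { 7, 3, 7 };
    int preferred = 2;    /* initial (e.g. random) pick: DVA 2, on vdev 7 */
    int c = preferred, d;

    /* Prefer the lowest-index DVA that sits on the same vdev as the pick. */
    for (d = c - 1; d >= 0; d--) {
        if (dva_vdev[d] == dva_vdev[c])
            preferred = d;
    }

    printf("preferred DVA: %d\n", preferred);    /* prints 0 */
    return (0);
}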
++ */ ++ for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { ++ if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) ++ mm->mm_preferred = d; ++ } ++ ++ for (c = 0; c < mm->mm_children; c++) { ++ mc = &mm->mm_child[c]; ++ ++ mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); ++ mc->mc_offset = DVA_GET_OFFSET(&dva[c]); ++ } ++ } else { ++ c = vd->vdev_children; ++ ++ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE); ++ mm->mm_children = c; ++ mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops || ++ vd->vdev_ops == &vdev_spare_ops); ++ mm->mm_preferred = mm->mm_replacing ? 0 : ++ (zio->io_offset >> vdev_mirror_shift) % c; ++ mm->mm_root = B_FALSE; ++ ++ for (c = 0; c < mm->mm_children; c++) { ++ mc = &mm->mm_child[c]; ++ mc->mc_vd = vd->vdev_child[c]; ++ mc->mc_offset = zio->io_offset; ++ } ++ } ++ ++ zio->io_vsd = mm; ++ zio->io_vsd_ops = &vdev_mirror_vsd_ops; ++ return (mm); ++} ++ ++static int ++vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, ++ uint64_t *ashift) ++{ ++ int numerrors = 0; ++ int lasterror = 0; ++ int c; ++ ++ if (vd->vdev_children == 0) { ++ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; ++ return (EINVAL); ++ } ++ ++ vdev_open_children(vd); ++ ++ for (c = 0; c < vd->vdev_children; c++) { ++ vdev_t *cvd = vd->vdev_child[c]; ++ ++ if (cvd->vdev_open_error) { ++ lasterror = cvd->vdev_open_error; ++ numerrors++; ++ continue; ++ } ++ ++ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; ++ *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; ++ *ashift = MAX(*ashift, cvd->vdev_ashift); ++ } ++ ++ if (numerrors == vd->vdev_children) { ++ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; ++ return (lasterror); ++ } ++ ++ return (0); ++} ++ ++static void ++vdev_mirror_close(vdev_t *vd) ++{ ++ int c; ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_close(vd->vdev_child[c]); ++} ++ ++static void ++vdev_mirror_child_done(zio_t *zio) ++{ ++ mirror_child_t *mc = zio->io_private; ++ ++ mc->mc_error = zio->io_error; ++ mc->mc_tried = 1; ++ mc->mc_skipped = 0; ++} ++ ++static void ++vdev_mirror_scrub_done(zio_t *zio) ++{ ++ mirror_child_t *mc = zio->io_private; ++ ++ if (zio->io_error == 0) { ++ zio_t *pio; ++ ++ mutex_enter(&zio->io_lock); ++ while ((pio = zio_walk_parents(zio)) != NULL) { ++ mutex_enter(&pio->io_lock); ++ ASSERT3U(zio->io_size, >=, pio->io_size); ++ bcopy(zio->io_data, pio->io_data, pio->io_size); ++ mutex_exit(&pio->io_lock); ++ } ++ mutex_exit(&zio->io_lock); ++ } ++ ++ zio_buf_free(zio->io_data, zio->io_size); ++ ++ mc->mc_error = zio->io_error; ++ mc->mc_tried = 1; ++ mc->mc_skipped = 0; ++} ++ ++/* ++ * Try to find a child whose DTL doesn't contain the block we want to read. ++ * If we can't, try the read on any vdev we haven't already tried. ++ */ ++static int ++vdev_mirror_child_select(zio_t *zio) ++{ ++ mirror_map_t *mm = zio->io_vsd; ++ mirror_child_t *mc; ++ uint64_t txg = zio->io_txg; ++ int i, c; ++ ++ ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); ++ ++ /* ++ * Try to find a child whose DTL doesn't contain the block to read. ++ * If a child is known to be completely inaccessible (indicated by ++ * vdev_readable() returning B_FALSE), don't even try. 
++ */ ++ for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { ++ if (c >= mm->mm_children) ++ c = 0; ++ mc = &mm->mm_child[c]; ++ if (mc->mc_tried || mc->mc_skipped) ++ continue; ++ if (!vdev_readable(mc->mc_vd)) { ++ mc->mc_error = ENXIO; ++ mc->mc_tried = 1; /* don't even try */ ++ mc->mc_skipped = 1; ++ continue; ++ } ++ if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) ++ return (c); ++ mc->mc_error = ESTALE; ++ mc->mc_skipped = 1; ++ mc->mc_speculative = 1; ++ } ++ ++ /* ++ * Every device is either missing or has this txg in its DTL. ++ * Look for any child we haven't already tried before giving up. ++ */ ++ for (c = 0; c < mm->mm_children; c++) ++ if (!mm->mm_child[c].mc_tried) ++ return (c); ++ ++ /* ++ * Every child failed. There's no place left to look. ++ */ ++ return (-1); ++} ++ ++static int ++vdev_mirror_io_start(zio_t *zio) ++{ ++ mirror_map_t *mm; ++ mirror_child_t *mc; ++ int c, children; ++ ++ mm = vdev_mirror_map_alloc(zio); ++ ++ if (zio->io_type == ZIO_TYPE_READ) { ++ if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { ++ /* ++ * For scrubbing reads we need to allocate a read ++ * buffer for each child and issue reads to all ++ * children. If any child succeeds, it will copy its ++ * data into zio->io_data in vdev_mirror_scrub_done. ++ */ ++ for (c = 0; c < mm->mm_children; c++) { ++ mc = &mm->mm_child[c]; ++ zio_nowait(zio_vdev_child_io(zio, zio->io_bp, ++ mc->mc_vd, mc->mc_offset, ++ zio_buf_alloc(zio->io_size), zio->io_size, ++ zio->io_type, zio->io_priority, 0, ++ vdev_mirror_scrub_done, mc)); ++ } ++ return (ZIO_PIPELINE_CONTINUE); ++ } ++ /* ++ * For normal reads just pick one child. ++ */ ++ c = vdev_mirror_child_select(zio); ++ children = (c >= 0); ++ } else { ++ ASSERT(zio->io_type == ZIO_TYPE_WRITE); ++ ++ /* ++ * Writes go to all children. ++ */ ++ c = 0; ++ children = mm->mm_children; ++ } ++ ++ while (children--) { ++ mc = &mm->mm_child[c]; ++ zio_nowait(zio_vdev_child_io(zio, zio->io_bp, ++ mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, ++ zio->io_type, zio->io_priority, 0, ++ vdev_mirror_child_done, mc)); ++ c++; ++ } ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++static int ++vdev_mirror_worst_error(mirror_map_t *mm) ++{ ++ int c, error[2] = { 0, 0 }; ++ ++ for (c = 0; c < mm->mm_children; c++) { ++ mirror_child_t *mc = &mm->mm_child[c]; ++ int s = mc->mc_speculative; ++ error[s] = zio_worst_error(error[s], mc->mc_error); ++ } ++ ++ return (error[0] ? error[0] : error[1]); ++} ++ ++static void ++vdev_mirror_io_done(zio_t *zio) ++{ ++ mirror_map_t *mm = zio->io_vsd; ++ mirror_child_t *mc; ++ int c; ++ int good_copies = 0; ++ int unexpected_errors = 0; ++ ++ for (c = 0; c < mm->mm_children; c++) { ++ mc = &mm->mm_child[c]; ++ ++ if (mc->mc_error) { ++ if (!mc->mc_skipped) ++ unexpected_errors++; ++ } else if (mc->mc_tried) { ++ good_copies++; ++ } ++ } ++ ++ if (zio->io_type == ZIO_TYPE_WRITE) { ++ /* ++ * XXX -- for now, treat partial writes as success. ++ * ++ * Now that we support write reallocation, it would be better ++ * to treat partial failure as real failure unless there are ++ * no non-degraded top-level vdevs left, and not update DTLs ++ * if we intend to reallocate. ++ */ ++ /* XXPOLICY */ ++ if (good_copies != mm->mm_children) { ++ /* ++ * Always require at least one good copy. ++ * ++ * For ditto blocks (io_vd == NULL), require ++ * all copies to be good. ++ * ++ * XXX -- for replacing vdevs, there's no great answer. 
++ * If the old device is really dead, we may not even ++ * be able to access it -- so we only want to ++ * require good writes to the new device. But if ++ * the new device turns out to be flaky, we want ++ * to be able to detach it -- which requires all ++ * writes to the old device to have succeeded. ++ */ ++ if (good_copies == 0 || zio->io_vd == NULL) ++ zio->io_error = vdev_mirror_worst_error(mm); ++ } ++ return; ++ } ++ ++ ASSERT(zio->io_type == ZIO_TYPE_READ); ++ ++ /* ++ * If we don't have a good copy yet, keep trying other children. ++ */ ++ /* XXPOLICY */ ++ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { ++ ASSERT(c >= 0 && c < mm->mm_children); ++ mc = &mm->mm_child[c]; ++ zio_vdev_io_redone(zio); ++ zio_nowait(zio_vdev_child_io(zio, zio->io_bp, ++ mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, ++ ZIO_TYPE_READ, zio->io_priority, 0, ++ vdev_mirror_child_done, mc)); ++ return; ++ } ++ ++ /* XXPOLICY */ ++ if (good_copies == 0) { ++ zio->io_error = vdev_mirror_worst_error(mm); ++ ASSERT(zio->io_error != 0); ++ } ++ ++ if (good_copies && spa_writeable(zio->io_spa) && ++ (unexpected_errors || ++ (zio->io_flags & ZIO_FLAG_RESILVER) || ++ ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { ++ /* ++ * Use the good data we have in hand to repair damaged children. ++ */ ++ for (c = 0; c < mm->mm_children; c++) { ++ /* ++ * Don't rewrite known good children. ++ * Not only is it unnecessary, it could ++ * actually be harmful: if the system lost ++ * power while rewriting the only good copy, ++ * there would be no good copies left! ++ */ ++ mc = &mm->mm_child[c]; ++ ++ if (mc->mc_error == 0) { ++ if (mc->mc_tried) ++ continue; ++ if (!(zio->io_flags & ZIO_FLAG_SCRUB) && ++ !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, ++ zio->io_txg, 1)) ++ continue; ++ mc->mc_error = ESTALE; ++ } ++ ++ zio_nowait(zio_vdev_child_io(zio, zio->io_bp, ++ mc->mc_vd, mc->mc_offset, ++ zio->io_data, zio->io_size, ++ ZIO_TYPE_WRITE, zio->io_priority, ++ ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
++ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); ++ } ++ } ++} ++ ++static void ++vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) ++{ ++ if (faulted == vd->vdev_children) ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_NO_REPLICAS); ++ else if (degraded + faulted != 0) ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); ++ else ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); ++} ++ ++vdev_ops_t vdev_mirror_ops = { ++ vdev_mirror_open, ++ vdev_mirror_close, ++ vdev_default_asize, ++ vdev_mirror_io_start, ++ vdev_mirror_io_done, ++ vdev_mirror_state_change, ++ NULL, ++ NULL, ++ VDEV_TYPE_MIRROR, /* name of this vdev type */ ++ B_FALSE /* not a leaf vdev */ ++}; ++ ++vdev_ops_t vdev_replacing_ops = { ++ vdev_mirror_open, ++ vdev_mirror_close, ++ vdev_default_asize, ++ vdev_mirror_io_start, ++ vdev_mirror_io_done, ++ vdev_mirror_state_change, ++ NULL, ++ NULL, ++ VDEV_TYPE_REPLACING, /* name of this vdev type */ ++ B_FALSE /* not a leaf vdev */ ++}; ++ ++vdev_ops_t vdev_spare_ops = { ++ vdev_mirror_open, ++ vdev_mirror_close, ++ vdev_default_asize, ++ vdev_mirror_io_start, ++ vdev_mirror_io_done, ++ vdev_mirror_state_change, ++ NULL, ++ NULL, ++ VDEV_TYPE_SPARE, /* name of this vdev type */ ++ B_FALSE /* not a leaf vdev */ ++}; +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/vdev_missing.c linux-3.2.33-go/fs/zfs/zfs/vdev_missing.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/vdev_missing.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/vdev_missing.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,106 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++/* ++ * Copyright (c) 2012 by Delphix. All rights reserved. ++ */ ++ ++/* ++ * The 'missing' vdev is a special vdev type used only during import. It ++ * signifies a placeholder in the root vdev for some vdev that we know is ++ * missing. We pass it down to the kernel to allow the rest of the ++ * configuration to parsed and an attempt made to open all available devices. ++ * Because its GUID is always 0, we know that the guid sum will mismatch and we ++ * won't be able to open the pool anyway. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* ARGSUSED */ ++static int ++vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, ++ uint64_t *ashift) ++{ ++ /* ++ * Really this should just fail. But then the root vdev will be in the ++ * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is ++ * VDEV_AUX_BAD_GUID_SUM. 
So we pretend to succeed, knowing that we ++ * will fail the GUID sum check before ever trying to open the pool. ++ */ ++ *psize = 0; ++ *max_psize = 0; ++ *ashift = 0; ++ return (0); ++} ++ ++/* ARGSUSED */ ++static void ++vdev_missing_close(vdev_t *vd) ++{ ++} ++ ++/* ARGSUSED */ ++static int ++vdev_missing_io_start(zio_t *zio) ++{ ++ zio->io_error = ENOTSUP; ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++/* ARGSUSED */ ++static void ++vdev_missing_io_done(zio_t *zio) ++{ ++} ++ ++vdev_ops_t vdev_missing_ops = { ++ vdev_missing_open, ++ vdev_missing_close, ++ vdev_default_asize, ++ vdev_missing_io_start, ++ vdev_missing_io_done, ++ NULL, ++ NULL, ++ NULL, ++ VDEV_TYPE_MISSING, /* name of this vdev type */ ++ B_TRUE /* leaf vdev */ ++}; ++ ++vdev_ops_t vdev_hole_ops = { ++ vdev_missing_open, ++ vdev_missing_close, ++ vdev_default_asize, ++ vdev_missing_io_start, ++ vdev_missing_io_done, ++ NULL, ++ NULL, ++ NULL, ++ VDEV_TYPE_HOLE, /* name of this vdev type */ ++ B_TRUE /* leaf vdev */ ++}; +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/vdev_queue.c linux-3.2.33-go/fs/zfs/zfs/vdev_queue.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/vdev_queue.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/vdev_queue.c 2012-11-16 23:25:34.353039289 +0100 +@@ -0,0 +1,462 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * These tunables are for performance analysis. ++ */ ++/* ++ * zfs_vdev_max_pending is the maximum number of i/os concurrently ++ * pending to each device. zfs_vdev_min_pending is the initial number ++ * of i/os pending to each device (before it starts ramping up to ++ * max_pending). ++ */ ++int zfs_vdev_max_pending = 10; ++int zfs_vdev_min_pending = 4; ++ ++/* deadline = pri + ddi_get_lbolt64() >> time_shift) */ ++int zfs_vdev_time_shift = 6; ++ ++/* exponential I/O issue ramp-up rate */ ++int zfs_vdev_ramp_rate = 2; ++ ++/* ++ * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. ++ * For read I/Os, we also aggregate across small adjacency gaps; for writes ++ * we include spans of optional I/Os to aid aggregation at the disk even when ++ * they aren't able to help us aggregate at this level. ++ */ ++int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; ++int zfs_vdev_read_gap_limit = 32 << 10; ++int zfs_vdev_write_gap_limit = 4 << 10; ++ ++/* ++ * Virtual device vector for disk I/O scheduling. 
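To see what the deadline formula noted in the tunables comment above (deadline = pri + (lbolt >> time_shift)) actually does, here is a stand-alone arithmetic sketch (editor's example, not from the patch): the tick value and the priority numbers are invented, and ddi_get_lbolt64() is replaced by a constant. With a time shift of 6, ticks fall into 64-tick buckets; within one bucket a smaller priority value sorts first, while an I/O that has already waited a few buckets eventually sorts ahead of newer I/O with a smaller priority value, which keeps low-priority work from starving.

#include <stdio.h>
#include <stdint.h>

#define TIME_SHIFT    6    /* mirrors zfs_vdev_time_shift */

static int64_t
deadline(int64_t lbolt, int priority)
{
    return ((lbolt >> TIME_SHIFT) + priority);
}

int
main(void)
{
    int64_t now = 100000;    /* pretend ddi_get_lbolt64() value, in ticks */

    /* Same 64-tick bucket: the smaller priority value sorts first. */
    printf("prio 0, now      : %lld\n", (long long)deadline(now, 0));
    printf("prio 4, now      : %lld\n", (long long)deadline(now, 4));

    /* A prio-4 I/O issued 5 buckets (320 ticks) ago beats a brand-new prio-0 I/O. */
    printf("prio 4, 5 buckets: %lld\n", (long long)deadline(now - 5 * 64, 4));
    return (0);
}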
++ */ ++int ++vdev_queue_deadline_compare(const void *x1, const void *x2) ++{ ++ const zio_t *z1 = x1; ++ const zio_t *z2 = x2; ++ ++ if (z1->io_deadline < z2->io_deadline) ++ return (-1); ++ if (z1->io_deadline > z2->io_deadline) ++ return (1); ++ ++ if (z1->io_offset < z2->io_offset) ++ return (-1); ++ if (z1->io_offset > z2->io_offset) ++ return (1); ++ ++ if (z1 < z2) ++ return (-1); ++ if (z1 > z2) ++ return (1); ++ ++ return (0); ++} ++ ++int ++vdev_queue_offset_compare(const void *x1, const void *x2) ++{ ++ const zio_t *z1 = x1; ++ const zio_t *z2 = x2; ++ ++ if (z1->io_offset < z2->io_offset) ++ return (-1); ++ if (z1->io_offset > z2->io_offset) ++ return (1); ++ ++ if (z1 < z2) ++ return (-1); ++ if (z1 > z2) ++ return (1); ++ ++ return (0); ++} ++ ++void ++vdev_queue_init(vdev_t *vd) ++{ ++ vdev_queue_t *vq = &vd->vdev_queue; ++ int i; ++ ++ mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, ++ sizeof (zio_t), offsetof(struct zio, io_deadline_node)); ++ ++ avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, ++ sizeof (zio_t), offsetof(struct zio, io_offset_node)); ++ ++ avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, ++ sizeof (zio_t), offsetof(struct zio, io_offset_node)); ++ ++ avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, ++ sizeof (zio_t), offsetof(struct zio, io_offset_node)); ++ ++ /* ++ * A list of buffers which can be used for aggregate I/O, this ++ * avoids the need to allocate them on demand when memory is low. ++ */ ++ list_create(&vq->vq_io_list, sizeof (vdev_io_t), ++ offsetof(vdev_io_t, vi_node)); ++ ++ for (i = 0; i < zfs_vdev_max_pending; i++) ++ list_insert_tail(&vq->vq_io_list, zio_vdev_alloc()); ++} ++ ++void ++vdev_queue_fini(vdev_t *vd) ++{ ++ vdev_queue_t *vq = &vd->vdev_queue; ++ vdev_io_t *vi; ++ ++ avl_destroy(&vq->vq_deadline_tree); ++ avl_destroy(&vq->vq_read_tree); ++ avl_destroy(&vq->vq_write_tree); ++ avl_destroy(&vq->vq_pending_tree); ++ ++ while ((vi = list_head(&vq->vq_io_list)) != NULL) { ++ list_remove(&vq->vq_io_list, vi); ++ zio_vdev_free(vi); ++ } ++ ++ list_destroy(&vq->vq_io_list); ++ ++ mutex_destroy(&vq->vq_lock); ++} ++ ++static void ++vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) ++{ ++ avl_add(&vq->vq_deadline_tree, zio); ++ avl_add(zio->io_vdev_tree, zio); ++} ++ ++static void ++vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) ++{ ++ avl_remove(&vq->vq_deadline_tree, zio); ++ avl_remove(zio->io_vdev_tree, zio); ++} ++ ++static void ++vdev_queue_agg_io_done(zio_t *aio) ++{ ++ vdev_queue_t *vq = &aio->io_vd->vdev_queue; ++ vdev_io_t *vi = aio->io_data; ++ zio_t *pio; ++ ++ while ((pio = zio_walk_parents(aio)) != NULL) ++ if (aio->io_type == ZIO_TYPE_READ) ++ bcopy((char *)aio->io_data + (pio->io_offset - ++ aio->io_offset), pio->io_data, pio->io_size); ++ ++ mutex_enter(&vq->vq_lock); ++ list_insert_tail(&vq->vq_io_list, vi); ++ mutex_exit(&vq->vq_lock); ++} ++ ++/* ++ * Compute the range spanned by two i/os, which is the endpoint of the last ++ * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). ++ * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); ++ * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. 
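A quick numeric check of the span/gap arithmetic described above (editor's sketch, not from the patch): the zio_t is reduced to the two fields the macros touch, the struct name io and the offsets are invented, and the macro bodies are copied from the definitions that follow. Two 4 KiB I/Os at offsets 0 and 4096 are adjacent (gap 0, span 8192), while moving the second one to offset 12288 opens an 8192-byte gap and a 16384-byte span.

#include <stdio.h>
#include <stdint.h>

struct io { int64_t io_offset, io_size; };    /* reduced stand-in for zio_t */

#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define IO_GAP(fio, lio)  (-IO_SPAN(lio, fio))

int
main(void)
{
    struct io fio = { 0, 4096 };
    struct io adj = { 4096, 4096 };     /* immediately follows fio */
    struct io far = { 12288, 4096 };    /* 8 KiB past the end of fio */

    printf("adjacent: span=%lld gap=%lld\n",
        (long long)IO_SPAN(&fio, &adj), (long long)IO_GAP(&fio, &adj));
    printf("with gap: span=%lld gap=%lld\n",
        (long long)IO_SPAN(&fio, &far), (long long)IO_GAP(&fio, &far));
    return (0);
}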
++ */ ++#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) ++#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) ++ ++static zio_t * ++vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) ++{ ++ zio_t *fio, *lio, *aio, *dio, *nio, *mio; ++ avl_tree_t *t; ++ vdev_io_t *vi; ++ int flags; ++ uint64_t maxspan = MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE); ++ uint64_t maxgap; ++ int stretch; ++ ++again: ++ ASSERT(MUTEX_HELD(&vq->vq_lock)); ++ ++ if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || ++ avl_numnodes(&vq->vq_deadline_tree) == 0) ++ return (NULL); ++ ++ fio = lio = avl_first(&vq->vq_deadline_tree); ++ ++ t = fio->io_vdev_tree; ++ flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; ++ maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0; ++ ++ vi = list_head(&vq->vq_io_list); ++ if (vi == NULL) { ++ vi = zio_vdev_alloc(); ++ list_insert_head(&vq->vq_io_list, vi); ++ } ++ ++ if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { ++ /* ++ * We can aggregate I/Os that are sufficiently adjacent and of ++ * the same flavor, as expressed by the AGG_INHERIT flags. ++ * The latter requirement is necessary so that certain ++ * attributes of the I/O, such as whether it's a normal I/O ++ * or a scrub/resilver, can be preserved in the aggregate. ++ * We can include optional I/Os, but don't allow them ++ * to begin a range as they add no benefit in that situation. ++ */ ++ ++ /* ++ * We keep track of the last non-optional I/O. ++ */ ++ mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio; ++ ++ /* ++ * Walk backwards through sufficiently contiguous I/Os ++ * recording the last non-option I/O. ++ */ ++ while ((dio = AVL_PREV(t, fio)) != NULL && ++ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && ++ IO_SPAN(dio, lio) <= maxspan && ++ IO_GAP(dio, fio) <= maxgap) { ++ fio = dio; ++ if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL)) ++ mio = fio; ++ } ++ ++ /* ++ * Skip any initial optional I/Os. ++ */ ++ while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) { ++ fio = AVL_NEXT(t, fio); ++ ASSERT(fio != NULL); ++ } ++ ++ /* ++ * Walk forward through sufficiently contiguous I/Os. ++ */ ++ while ((dio = AVL_NEXT(t, lio)) != NULL && ++ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && ++ IO_SPAN(fio, dio) <= maxspan && ++ IO_GAP(lio, dio) <= maxgap) { ++ lio = dio; ++ if (!(lio->io_flags & ZIO_FLAG_OPTIONAL)) ++ mio = lio; ++ } ++ ++ /* ++ * Now that we've established the range of the I/O aggregation ++ * we must decide what to do with trailing optional I/Os. ++ * For reads, there's nothing to do. While we are unable to ++ * aggregate further, it's possible that a trailing optional ++ * I/O would allow the underlying device to aggregate with ++ * subsequent I/Os. We must therefore determine if the next ++ * non-optional I/O is close enough to make aggregation ++ * worthwhile. ++ */ ++ stretch = B_FALSE; ++ if (t != &vq->vq_read_tree && mio != NULL) { ++ nio = lio; ++ while ((dio = AVL_NEXT(t, nio)) != NULL && ++ IO_GAP(nio, dio) == 0 && ++ IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) { ++ nio = dio; ++ if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { ++ stretch = B_TRUE; ++ break; ++ } ++ } ++ } ++ ++ if (stretch) { ++ /* This may be a no-op. 
*/ ++ VERIFY((dio = AVL_NEXT(t, lio)) != NULL); ++ dio->io_flags &= ~ZIO_FLAG_OPTIONAL; ++ } else { ++ while (lio != mio && lio != fio) { ++ ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL); ++ lio = AVL_PREV(t, lio); ++ ASSERT(lio != NULL); ++ } ++ } ++ } ++ ++ if (fio != lio) { ++ uint64_t size = IO_SPAN(fio, lio); ++ ASSERT(size <= maxspan); ++ ASSERT(vi != NULL); ++ ++ aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, ++ vi, size, fio->io_type, ZIO_PRIORITY_AGG, ++ flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, ++ vdev_queue_agg_io_done, NULL); ++ ++ nio = fio; ++ do { ++ dio = nio; ++ nio = AVL_NEXT(t, dio); ++ ASSERT(dio->io_type == aio->io_type); ++ ASSERT(dio->io_vdev_tree == t); ++ ++ if (dio->io_flags & ZIO_FLAG_NODATA) { ++ ASSERT(dio->io_type == ZIO_TYPE_WRITE); ++ bzero((char *)aio->io_data + (dio->io_offset - ++ aio->io_offset), dio->io_size); ++ } else if (dio->io_type == ZIO_TYPE_WRITE) { ++ bcopy(dio->io_data, (char *)aio->io_data + ++ (dio->io_offset - aio->io_offset), ++ dio->io_size); ++ } ++ ++ zio_add_child(dio, aio); ++ vdev_queue_io_remove(vq, dio); ++ zio_vdev_io_bypass(dio); ++ zio_execute(dio); ++ } while (dio != lio); ++ ++ avl_add(&vq->vq_pending_tree, aio); ++ list_remove(&vq->vq_io_list, vi); ++ ++ return (aio); ++ } ++ ++ ASSERT(fio->io_vdev_tree == t); ++ vdev_queue_io_remove(vq, fio); ++ ++ /* ++ * If the I/O is or was optional and therefore has no data, we need to ++ * simply discard it. We need to drop the vdev queue's lock to avoid a ++ * deadlock that we could encounter since this I/O will complete ++ * immediately. ++ */ ++ if (fio->io_flags & ZIO_FLAG_NODATA) { ++ mutex_exit(&vq->vq_lock); ++ zio_vdev_io_bypass(fio); ++ zio_execute(fio); ++ mutex_enter(&vq->vq_lock); ++ goto again; ++ } ++ ++ avl_add(&vq->vq_pending_tree, fio); ++ ++ return (fio); ++} ++ ++zio_t * ++vdev_queue_io(zio_t *zio) ++{ ++ vdev_queue_t *vq = &zio->io_vd->vdev_queue; ++ zio_t *nio; ++ ++ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); ++ ++ if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) ++ return (zio); ++ ++ zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; ++ ++ if (zio->io_type == ZIO_TYPE_READ) ++ zio->io_vdev_tree = &vq->vq_read_tree; ++ else ++ zio->io_vdev_tree = &vq->vq_write_tree; ++ ++ mutex_enter(&vq->vq_lock); ++ ++ zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) + ++ zio->io_priority; ++ ++ vdev_queue_io_add(vq, zio); ++ ++ nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); ++ ++ mutex_exit(&vq->vq_lock); ++ ++ if (nio == NULL) ++ return (NULL); ++ ++ if (nio->io_done == vdev_queue_agg_io_done) { ++ zio_nowait(nio); ++ return (NULL); ++ } ++ ++ return (nio); ++} ++ ++void ++vdev_queue_io_done(zio_t *zio) ++{ ++ vdev_queue_t *vq = &zio->io_vd->vdev_queue; ++ int i; ++ ++ mutex_enter(&vq->vq_lock); ++ ++ avl_remove(&vq->vq_pending_tree, zio); ++ ++ for (i = 0; i < zfs_vdev_ramp_rate; i++) { ++ zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); ++ if (nio == NULL) ++ break; ++ mutex_exit(&vq->vq_lock); ++ if (nio->io_done == vdev_queue_agg_io_done) { ++ zio_nowait(nio); ++ } else { ++ zio_vdev_io_reissue(nio); ++ zio_execute(nio); ++ } ++ mutex_enter(&vq->vq_lock); ++ } ++ ++ mutex_exit(&vq->vq_lock); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++module_param(zfs_vdev_max_pending, int, 0644); ++MODULE_PARM_DESC(zfs_vdev_max_pending, "Max pending per-vdev I/Os"); ++ ++module_param(zfs_vdev_min_pending, int, 0644); ++MODULE_PARM_DESC(zfs_vdev_min_pending, "Min pending per-vdev I/Os"); ++ 
++module_param(zfs_vdev_aggregation_limit, int, 0644);
++MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size");
++
++module_param(zfs_vdev_time_shift, int, 0644);
++MODULE_PARM_DESC(zfs_vdev_time_shift, "Deadline time shift for vdev I/O");
++
++module_param(zfs_vdev_ramp_rate, int, 0644);
++MODULE_PARM_DESC(zfs_vdev_ramp_rate, "Exponential I/O issue ramp-up rate");
++
++module_param(zfs_vdev_read_gap_limit, int, 0644);
++MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap");
++
++module_param(zfs_vdev_write_gap_limit, int, 0644);
++MODULE_PARM_DESC(zfs_vdev_write_gap_limit, "Aggregate write I/O over gap");
++#endif
+diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/vdev_raidz.c linux-3.2.33-go/fs/zfs/zfs/vdev_raidz.c
+--- linux-3.2.33-go.orig/fs/zfs/zfs/vdev_raidz.c 1970-01-01 01:00:00.000000000 +0100
++++ linux-3.2.33-go/fs/zfs/zfs/vdev_raidz.c 2012-11-16 23:25:34.348039346 +0100
+@@ -0,0 +1,2153 @@
++/*
++ * CDDL HEADER START
++ *
++ * The contents of this file are subject to the terms of the
++ * Common Development and Distribution License (the "License").
++ * You may not use this file except in compliance with the License.
++ *
++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
++ * or http://www.opensolaris.org/os/licensing.
++ * See the License for the specific language governing permissions
++ * and limitations under the License.
++ *
++ * When distributing Covered Code, include this CDDL HEADER in each
++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
++ * If applicable, add the following below this CDDL HEADER, with the
++ * fields enclosed by brackets "[]" replaced with your own identifying
++ * information: Portions Copyright [yyyy] [name of copyright owner]
++ *
++ * CDDL HEADER END
++ */
++
++/*
++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
++ * Copyright (c) 2012 by Delphix. All rights reserved.
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++/*
++ * Virtual device vector for RAID-Z.
++ *
++ * This vdev supports single, double, and triple parity. For single parity,
++ * we use a simple XOR of all the data columns. For double or triple parity,
++ * we use a special case of Reed-Solomon coding. This extends the
++ * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
++ * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
++ * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
++ * former is also based. The latter is designed to provide higher performance
++ * for writes.
++ *
++ * Note that the Plank paper claimed to support arbitrary N+M, but was then
++ * amended six years later identifying a critical flaw that invalidates its
++ * claims. Nevertheless, the technique can be adapted to work for up to
++ * triple parity. For additional parity, the amendment "Note: Correction to
++ * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
++ * is viable, but the additional complexity means that write performance will
++ * suffer.
++ *
++ * All of the methods above operate on a Galois field, defined over the
++ * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
++ * can be expressed with a single byte.
Briefly, the operations on the ++ * field are defined as follows: ++ * ++ * o addition (+) is represented by a bitwise XOR ++ * o subtraction (-) is therefore identical to addition: A + B = A - B ++ * o multiplication of A by 2 is defined by the following bitwise expression: ++ * (A * 2)_7 = A_6 ++ * (A * 2)_6 = A_5 ++ * (A * 2)_5 = A_4 ++ * (A * 2)_4 = A_3 + A_7 ++ * (A * 2)_3 = A_2 + A_7 ++ * (A * 2)_2 = A_1 + A_7 ++ * (A * 2)_1 = A_0 ++ * (A * 2)_0 = A_7 ++ * ++ * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). ++ * As an aside, this multiplication is derived from the error correcting ++ * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. ++ * ++ * Observe that any number in the field (except for 0) can be expressed as a ++ * power of 2 -- a generator for the field. We store a table of the powers of ++ * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can ++ * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather ++ * than field addition). The inverse of a field element A (A^-1) is therefore ++ * A ^ (255 - 1) = A^254. ++ * ++ * The up-to-three parity columns, P, Q, R over several data columns, ++ * D_0, ... D_n-1, can be expressed by field operations: ++ * ++ * P = D_0 + D_1 + ... + D_n-2 + D_n-1 ++ * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 ++ * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 ++ * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 ++ * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 ++ * ++ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival ++ * XOR operation, and 2 and 4 can be computed quickly and generate linearly- ++ * independent coefficients. (There are no additional coefficients that have ++ * this property which is why the uncorrected Plank method breaks down.) ++ * ++ * See the reconstruction code below for how P, Q and R can used individually ++ * or in concert to recover missing data columns. ++ */ ++ ++typedef struct raidz_col { ++ uint64_t rc_devidx; /* child device index for I/O */ ++ uint64_t rc_offset; /* device offset */ ++ uint64_t rc_size; /* I/O size */ ++ void *rc_data; /* I/O data */ ++ void *rc_gdata; /* used to store the "good" version */ ++ int rc_error; /* I/O error for this device */ ++ uint8_t rc_tried; /* Did we attempt this I/O column? */ ++ uint8_t rc_skipped; /* Did we skip this I/O column? */ ++} raidz_col_t; ++ ++typedef struct raidz_map { ++ uint64_t rm_cols; /* Regular column count */ ++ uint64_t rm_scols; /* Count including skipped columns */ ++ uint64_t rm_bigcols; /* Number of oversized columns */ ++ uint64_t rm_asize; /* Actual total I/O size */ ++ uint64_t rm_missingdata; /* Count of missing data devices */ ++ uint64_t rm_missingparity; /* Count of missing parity devices */ ++ uint64_t rm_firstdatacol; /* First data column/parity count */ ++ uint64_t rm_nskip; /* Skipped sectors for padding */ ++ uint64_t rm_skipstart; /* Column index of padding start */ ++ void *rm_datacopy; /* rm_asize-buffer of copied data */ ++ uintptr_t rm_reports; /* # of referencing checksum reports */ ++ uint8_t rm_freed; /* map no longer has referencing ZIO */ ++ uint8_t rm_ecksuminjected; /* checksum error was injected */ ++ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ ++} raidz_map_t; ++ ++#define VDEV_RAIDZ_P 0 ++#define VDEV_RAIDZ_Q 1 ++#define VDEV_RAIDZ_R 2 ++ ++#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 
0x1d : 0)) ++#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) ++ ++/* ++ * We provide a mechanism to perform the field multiplication operation on a ++ * 64-bit value all at once rather than a byte at a time. This works by ++ * creating a mask from the top bit in each byte and using that to ++ * conditionally apply the XOR of 0x1d. ++ */ ++#define VDEV_RAIDZ_64MUL_2(x, mask) \ ++{ \ ++ (mask) = (x) & 0x8080808080808080ULL; \ ++ (mask) = ((mask) << 1) - ((mask) >> 7); \ ++ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ ++ ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ ++} ++ ++#define VDEV_RAIDZ_64MUL_4(x, mask) \ ++{ \ ++ VDEV_RAIDZ_64MUL_2((x), mask); \ ++ VDEV_RAIDZ_64MUL_2((x), mask); \ ++} ++ ++/* ++ * Force reconstruction to use the general purpose method. ++ */ ++int vdev_raidz_default_to_general; ++ ++/* ++ * These two tables represent powers and logs of 2 in the Galois field defined ++ * above. These values were computed by repeatedly multiplying by 2 as above. ++ */ ++static const uint8_t vdev_raidz_pow2[256] = { ++ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, ++ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, ++ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, ++ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, ++ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, ++ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, ++ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, ++ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, ++ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, ++ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, ++ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, ++ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, ++ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, ++ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, ++ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, ++ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, ++ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, ++ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, ++ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, ++ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, ++ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, ++ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, ++ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, ++ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, ++ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, ++ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, ++ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, ++ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, ++ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, ++ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, ++ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, ++ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 ++}; ++static const uint8_t vdev_raidz_log2[256] = { ++ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, ++ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, ++ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, ++ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, ++ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, ++ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, ++ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, ++ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, ++ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, ++ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, ++ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, ++ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, ++ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, ++ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, ++ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, ++ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, ++ 0x07, 
0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, ++ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, ++ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, ++ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, ++ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, ++ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, ++ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, ++ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, ++ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, ++ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, ++ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, ++ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, ++ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, ++ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, ++ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, ++ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, ++}; ++ ++static void vdev_raidz_generate_parity(raidz_map_t *rm); ++ ++/* ++ * Multiply a given number by 2 raised to the given power. ++ */ ++static uint8_t ++vdev_raidz_exp2(uint_t a, int exp) ++{ ++ if (a == 0) ++ return (0); ++ ++ ASSERT(exp >= 0); ++ ASSERT(vdev_raidz_log2[a] > 0 || a == 1); ++ ++ exp += vdev_raidz_log2[a]; ++ if (exp > 255) ++ exp -= 255; ++ ++ return (vdev_raidz_pow2[exp]); ++} ++ ++static void ++vdev_raidz_map_free(raidz_map_t *rm) ++{ ++ int c; ++ size_t size; ++ ++ for (c = 0; c < rm->rm_firstdatacol; c++) { ++ zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); ++ ++ if (rm->rm_col[c].rc_gdata != NULL) ++ zio_buf_free(rm->rm_col[c].rc_gdata, ++ rm->rm_col[c].rc_size); ++ } ++ ++ size = 0; ++ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) ++ size += rm->rm_col[c].rc_size; ++ ++ if (rm->rm_datacopy != NULL) ++ zio_buf_free(rm->rm_datacopy, size); ++ ++ kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); ++} ++ ++static void ++vdev_raidz_map_free_vsd(zio_t *zio) ++{ ++ raidz_map_t *rm = zio->io_vsd; ++ ++ ASSERT3U(rm->rm_freed, ==, 0); ++ rm->rm_freed = 1; ++ ++ if (rm->rm_reports == 0) ++ vdev_raidz_map_free(rm); ++} ++ ++/*ARGSUSED*/ ++static void ++vdev_raidz_cksum_free(void *arg, size_t ignored) ++{ ++ raidz_map_t *rm = arg; ++ ++ ASSERT3U(rm->rm_reports, >, 0); ++ ++ if (--rm->rm_reports == 0 && rm->rm_freed != 0) ++ vdev_raidz_map_free(rm); ++} ++ ++static void ++vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) ++{ ++ raidz_map_t *rm = zcr->zcr_cbdata; ++ size_t c = zcr->zcr_cbinfo; ++ size_t x; ++ ++ const char *good = NULL; ++ const char *bad = rm->rm_col[c].rc_data; ++ ++ if (good_data == NULL) { ++ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); ++ return; ++ } ++ ++ if (c < rm->rm_firstdatacol) { ++ /* ++ * The first time through, calculate the parity blocks for ++ * the good data (this relies on the fact that the good ++ * data never changes for a given logical ZIO) ++ */ ++ if (rm->rm_col[0].rc_gdata == NULL) { ++ char *bad_parity[VDEV_RAIDZ_MAXPARITY]; ++ char *buf; ++ ++ /* ++ * Set up the rm_col[]s to generate the parity for ++ * good_data, first saving the parity bufs and ++ * replacing them with buffers to hold the result. ++ */ ++ for (x = 0; x < rm->rm_firstdatacol; x++) { ++ bad_parity[x] = rm->rm_col[x].rc_data; ++ rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = ++ zio_buf_alloc(rm->rm_col[x].rc_size); ++ } ++ ++ /* fill in the data columns from good_data */ ++ buf = (char *)good_data; ++ for (; x < rm->rm_cols; x++) { ++ rm->rm_col[x].rc_data = buf; ++ buf += rm->rm_col[x].rc_size; ++ } ++ ++ /* ++ * Construct the parity from the good data. 
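A stand-alone sketch (editorial illustration, not part of this patch; gf_mul2(), gf_init() and mul2_64() are hypothetical helper names) of how the vdev_raidz_pow2/vdev_raidz_log2 tables above can be regenerated by repeated multiplication by 2, and a check that the 64-bit VDEV_RAIDZ_64MUL_2() mask trick matches byte-wise multiplication:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t pow2[256], log2t[256];

/* multiply by 2 in GF(2^8) with the 0x1d reduction used above */
static uint8_t
gf_mul2(uint8_t x)
{
	return ((x << 1) ^ ((x & 0x80) ? 0x1d : 0));
}

static void
gf_init(void)
{
	uint8_t x = 1;
	int i;

	for (i = 0; i < 255; i++) {
		pow2[i] = x;		/* pow2[i] == 2^i */
		log2t[x] = i;		/* log2t[2^i] == i */
		x = gf_mul2(x);
	}
	pow2[255] = pow2[0];		/* 2^255 == 1, as in the table above */
}

/* apply the VDEV_RAIDZ_64MUL_2() trick to eight packed bytes at once */
static uint64_t
mul2_64(uint64_t x)
{
	uint64_t mask = x & 0x8080808080808080ULL;

	mask = (mask << 1) - (mask >> 7);
	return (((x << 1) & 0xfefefefefefefefeULL) ^
	    (mask & 0x1d1d1d1d1d1d1d1dULL));
}

int
main(void)
{
	uint64_t v = 0x0123456789abcdefULL, w = mul2_64(v);
	uint8_t b[8], i;

	gf_init();
	memcpy(b, &v, 8);
	for (i = 0; i < 8; i++)
		b[i] = gf_mul2(b[i]);	/* byte-wise reference result */
	printf("2^8 = 0x%02x, log2(0x1d) = %u, 64-bit trick matches: %d\n",
	    pow2[8], log2t[0x1d], memcmp(b, &w, 8) == 0);
	return (0);
}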
++ */ ++ vdev_raidz_generate_parity(rm); ++ ++ /* restore everything back to its original state */ ++ for (x = 0; x < rm->rm_firstdatacol; x++) ++ rm->rm_col[x].rc_data = bad_parity[x]; ++ ++ buf = rm->rm_datacopy; ++ for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { ++ rm->rm_col[x].rc_data = buf; ++ buf += rm->rm_col[x].rc_size; ++ } ++ } ++ ++ ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); ++ good = rm->rm_col[c].rc_gdata; ++ } else { ++ /* adjust good_data to point at the start of our column */ ++ good = good_data; ++ ++ for (x = rm->rm_firstdatacol; x < c; x++) ++ good += rm->rm_col[x].rc_size; ++ } ++ ++ /* we drop the ereport if it ends up that the data was good */ ++ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); ++} ++ ++/* ++ * Invoked indirectly by zfs_ereport_start_checksum(), called ++ * below when our read operation fails completely. The main point ++ * is to keep a copy of everything we read from disk, so that at ++ * vdev_raidz_cksum_finish() time we can compare it with the good data. ++ */ ++static void ++vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) ++{ ++ size_t c = (size_t)(uintptr_t)arg; ++ caddr_t buf; ++ ++ raidz_map_t *rm = zio->io_vsd; ++ size_t size; ++ ++ /* set up the report and bump the refcount */ ++ zcr->zcr_cbdata = rm; ++ zcr->zcr_cbinfo = c; ++ zcr->zcr_finish = vdev_raidz_cksum_finish; ++ zcr->zcr_free = vdev_raidz_cksum_free; ++ ++ rm->rm_reports++; ++ ASSERT3U(rm->rm_reports, >, 0); ++ ++ if (rm->rm_datacopy != NULL) ++ return; ++ ++ /* ++ * It's the first time we're called for this raidz_map_t, so we need ++ * to copy the data aside; there's no guarantee that our zio's buffer ++ * won't be re-used for something else. ++ * ++ * Our parity data is already in separate buffers, so there's no need ++ * to copy them. ++ */ ++ ++ size = 0; ++ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) ++ size += rm->rm_col[c].rc_size; ++ ++ buf = rm->rm_datacopy = zio_buf_alloc(size); ++ ++ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { ++ raidz_col_t *col = &rm->rm_col[c]; ++ ++ bcopy(col->rc_data, buf, col->rc_size); ++ col->rc_data = buf; ++ ++ buf += col->rc_size; ++ } ++ ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); ++} ++ ++static const zio_vsd_ops_t vdev_raidz_vsd_ops = { ++ vdev_raidz_map_free_vsd, ++ vdev_raidz_cksum_report ++}; ++ ++static raidz_map_t * ++vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, ++ uint64_t nparity) ++{ ++ raidz_map_t *rm; ++ uint64_t b = zio->io_offset >> unit_shift; ++ uint64_t s = zio->io_size >> unit_shift; ++ uint64_t f = b % dcols; ++ uint64_t o = (b / dcols) << unit_shift; ++ uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; ++ ++ q = s / (dcols - nparity); ++ r = s - q * (dcols - nparity); ++ bc = (r == 0 ? 0 : r + nparity); ++ tot = s + nparity * (q + (r == 0 ? 
0 : 1)); ++ ++ if (q == 0) { ++ acols = bc; ++ scols = MIN(dcols, roundup(bc, nparity + 1)); ++ } else { ++ acols = dcols; ++ scols = dcols; ++ } ++ ++ ASSERT3U(acols, <=, scols); ++ ++ rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_PUSHPAGE); ++ ++ rm->rm_cols = acols; ++ rm->rm_scols = scols; ++ rm->rm_bigcols = bc; ++ rm->rm_skipstart = bc; ++ rm->rm_missingdata = 0; ++ rm->rm_missingparity = 0; ++ rm->rm_firstdatacol = nparity; ++ rm->rm_datacopy = NULL; ++ rm->rm_reports = 0; ++ rm->rm_freed = 0; ++ rm->rm_ecksuminjected = 0; ++ ++ asize = 0; ++ ++ for (c = 0; c < scols; c++) { ++ col = f + c; ++ coff = o; ++ if (col >= dcols) { ++ col -= dcols; ++ coff += 1ULL << unit_shift; ++ } ++ rm->rm_col[c].rc_devidx = col; ++ rm->rm_col[c].rc_offset = coff; ++ rm->rm_col[c].rc_data = NULL; ++ rm->rm_col[c].rc_gdata = NULL; ++ rm->rm_col[c].rc_error = 0; ++ rm->rm_col[c].rc_tried = 0; ++ rm->rm_col[c].rc_skipped = 0; ++ ++ if (c >= acols) ++ rm->rm_col[c].rc_size = 0; ++ else if (c < bc) ++ rm->rm_col[c].rc_size = (q + 1) << unit_shift; ++ else ++ rm->rm_col[c].rc_size = q << unit_shift; ++ ++ asize += rm->rm_col[c].rc_size; ++ } ++ ++ ASSERT3U(asize, ==, tot << unit_shift); ++ rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); ++ rm->rm_nskip = roundup(tot, nparity + 1) - tot; ++ ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); ++ ASSERT3U(rm->rm_nskip, <=, nparity); ++ ++ for (c = 0; c < rm->rm_firstdatacol; c++) ++ rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); ++ ++ rm->rm_col[c].rc_data = zio->io_data; ++ ++ for (c = c + 1; c < acols; c++) ++ rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + ++ rm->rm_col[c - 1].rc_size; ++ ++ /* ++ * If all data stored spans all columns, there's a danger that parity ++ * will always be on the same device and, since parity isn't read ++ * during normal operation, that that device's I/O bandwidth won't be ++ * used effectively. We therefore switch the parity every 1MB. ++ * ++ * ... at least that was, ostensibly, the theory. As a practical ++ * matter unless we juggle the parity between all devices evenly, we ++ * won't see any benefit. Further, occasional writes that aren't a ++ * multiple of the LCM of the number of children and the minimum ++ * stripe width are sufficient to avoid pessimal behavior. ++ * Unfortunately, this decision created an implicit on-disk format ++ * requirement that we need to support for all eternity, but only ++ * for single-parity RAID-Z. ++ * ++ * If we intend to skip a sector in the zeroth column for padding ++ * we must make sure to note this swap. We will never intend to ++ * skip the first column since at least one data and one parity ++ * column must appear in each row. 
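A minimal sketch (stand-alone example values, not taken from the patch) of the column geometry computed above by vdev_raidz_map_alloc(): an I/O of s sectors over dcols children with nparity parity columns gives q full rows, a remainder r, and bc "big" columns that carry one extra sector:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t s = 10, dcols = 6, nparity = 2;	/* example values */
	uint64_t q = s / (dcols - nparity);
	uint64_t r = s - q * (dcols - nparity);
	uint64_t bc = (r == 0 ? 0 : r + nparity);
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
	uint64_t c, acols = (q == 0 ? bc : dcols);

	for (c = 0; c < acols; c++)
		printf("col %llu: %llu sectors\n", (unsigned long long)c,
		    (unsigned long long)(c < bc ? q + 1 : q));
	printf("total sectors incl. parity: %llu\n", (unsigned long long)tot);
	return (0);
}

With s = 10, dcols = 6 and nparity = 2 this prints four 3-sector columns and two 2-sector columns, 16 sectors in total, which is exactly the asize asserted against tot above.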
++ */ ++ ASSERT(rm->rm_cols >= 2); ++ ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); ++ ++ if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { ++ devidx = rm->rm_col[0].rc_devidx; ++ o = rm->rm_col[0].rc_offset; ++ rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; ++ rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; ++ rm->rm_col[1].rc_devidx = devidx; ++ rm->rm_col[1].rc_offset = o; ++ ++ if (rm->rm_skipstart == 0) ++ rm->rm_skipstart = 1; ++ } ++ ++ zio->io_vsd = rm; ++ zio->io_vsd_ops = &vdev_raidz_vsd_ops; ++ return (rm); ++} ++ ++static void ++vdev_raidz_generate_parity_p(raidz_map_t *rm) ++{ ++ uint64_t *p, *src, pcount, ccount, i; ++ int c; ++ ++ pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); ++ ++ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { ++ src = rm->rm_col[c].rc_data; ++ p = rm->rm_col[VDEV_RAIDZ_P].rc_data; ++ ccount = rm->rm_col[c].rc_size / sizeof (src[0]); ++ ++ if (c == rm->rm_firstdatacol) { ++ ASSERT(ccount == pcount); ++ for (i = 0; i < ccount; i++, src++, p++) { ++ *p = *src; ++ } ++ } else { ++ ASSERT(ccount <= pcount); ++ for (i = 0; i < ccount; i++, src++, p++) { ++ *p ^= *src; ++ } ++ } ++ } ++} ++ ++static void ++vdev_raidz_generate_parity_pq(raidz_map_t *rm) ++{ ++ uint64_t *p, *q, *src, pcnt, ccnt, mask, i; ++ int c; ++ ++ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); ++ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == ++ rm->rm_col[VDEV_RAIDZ_Q].rc_size); ++ ++ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { ++ src = rm->rm_col[c].rc_data; ++ p = rm->rm_col[VDEV_RAIDZ_P].rc_data; ++ q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; ++ ++ ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); ++ ++ if (c == rm->rm_firstdatacol) { ++ ASSERT(ccnt == pcnt || ccnt == 0); ++ for (i = 0; i < ccnt; i++, src++, p++, q++) { ++ *p = *src; ++ *q = *src; ++ } ++ for (; i < pcnt; i++, src++, p++, q++) { ++ *p = 0; ++ *q = 0; ++ } ++ } else { ++ ASSERT(ccnt <= pcnt); ++ ++ /* ++ * Apply the algorithm described above by multiplying ++ * the previous result and adding in the new value. ++ */ ++ for (i = 0; i < ccnt; i++, src++, p++, q++) { ++ *p ^= *src; ++ ++ VDEV_RAIDZ_64MUL_2(*q, mask); ++ *q ^= *src; ++ } ++ ++ /* ++ * Treat short columns as though they are full of 0s. ++ * Note that there's therefore nothing needed for P. ++ */ ++ for (; i < pcnt; i++, q++) { ++ VDEV_RAIDZ_64MUL_2(*q, mask); ++ } ++ } ++ } ++} ++ ++static void ++vdev_raidz_generate_parity_pqr(raidz_map_t *rm) ++{ ++ uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; ++ int c; ++ ++ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); ++ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == ++ rm->rm_col[VDEV_RAIDZ_Q].rc_size); ++ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == ++ rm->rm_col[VDEV_RAIDZ_R].rc_size); ++ ++ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { ++ src = rm->rm_col[c].rc_data; ++ p = rm->rm_col[VDEV_RAIDZ_P].rc_data; ++ q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; ++ r = rm->rm_col[VDEV_RAIDZ_R].rc_data; ++ ++ ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); ++ ++ if (c == rm->rm_firstdatacol) { ++ ASSERT(ccnt == pcnt || ccnt == 0); ++ for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { ++ *p = *src; ++ *q = *src; ++ *r = *src; ++ } ++ for (; i < pcnt; i++, src++, p++, q++, r++) { ++ *p = 0; ++ *q = 0; ++ *r = 0; ++ } ++ } else { ++ ASSERT(ccnt <= pcnt); ++ ++ /* ++ * Apply the algorithm described above by multiplying ++ * the previous result and adding in the new value. 
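A byte-at-a-time sketch of the P/Q/R update used in the loops above and below (gf_mul2() and pqr_parity() are illustrative names, not part of the patch): processing data columns in order, Q is accumulated Horner style, multiply the running value by 2 and XOR in the next column, which yields Q = 2^(n-1)*d_0 ^ ... ^ 2*d_(n-2) ^ d_(n-1); R does the same with 4, and P is the plain XOR:

#include <stdint.h>
#include <stdio.h>

static uint8_t
gf_mul2(uint8_t x)
{
	return ((x << 1) ^ ((x & 0x80) ? 0x1d : 0));
}

static void
pqr_parity(const uint8_t *d, int ncols, uint8_t *p, uint8_t *q, uint8_t *r)
{
	int c;

	*p = *q = *r = 0;
	for (c = 0; c < ncols; c++) {
		*p ^= d[c];				/* P: plain XOR */
		*q = gf_mul2(*q) ^ d[c];		/* Q: times 2, then add */
		*r = gf_mul2(gf_mul2(*r)) ^ d[c];	/* R: times 4, then add */
	}
}

int
main(void)
{
	uint8_t d[4] = { 0x11, 0x22, 0x33, 0x44 }, p, q, r;

	pqr_parity(d, 4, &p, &q, &r);
	printf("P=%02x Q=%02x R=%02x\n", p, q, r);
	return (0);
}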
++ */ ++ for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { ++ *p ^= *src; ++ ++ VDEV_RAIDZ_64MUL_2(*q, mask); ++ *q ^= *src; ++ ++ VDEV_RAIDZ_64MUL_4(*r, mask); ++ *r ^= *src; ++ } ++ ++ /* ++ * Treat short columns as though they are full of 0s. ++ * Note that there's therefore nothing needed for P. ++ */ ++ for (; i < pcnt; i++, q++, r++) { ++ VDEV_RAIDZ_64MUL_2(*q, mask); ++ VDEV_RAIDZ_64MUL_4(*r, mask); ++ } ++ } ++ } ++} ++ ++/* ++ * Generate RAID parity in the first virtual columns according to the number of ++ * parity columns available. ++ */ ++static void ++vdev_raidz_generate_parity(raidz_map_t *rm) ++{ ++ switch (rm->rm_firstdatacol) { ++ case 1: ++ vdev_raidz_generate_parity_p(rm); ++ break; ++ case 2: ++ vdev_raidz_generate_parity_pq(rm); ++ break; ++ case 3: ++ vdev_raidz_generate_parity_pqr(rm); ++ break; ++ default: ++ cmn_err(CE_PANIC, "invalid RAID-Z configuration"); ++ } ++} ++ ++static int ++vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) ++{ ++ uint64_t *dst, *src, xcount, ccount, count, i; ++ int x = tgts[0]; ++ int c; ++ ++ ASSERT(ntgts == 1); ++ ASSERT(x >= rm->rm_firstdatacol); ++ ASSERT(x < rm->rm_cols); ++ ++ xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ++ ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); ++ ASSERT(xcount > 0); ++ ++ src = rm->rm_col[VDEV_RAIDZ_P].rc_data; ++ dst = rm->rm_col[x].rc_data; ++ for (i = 0; i < xcount; i++, dst++, src++) { ++ *dst = *src; ++ } ++ ++ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { ++ src = rm->rm_col[c].rc_data; ++ dst = rm->rm_col[x].rc_data; ++ ++ if (c == x) ++ continue; ++ ++ ccount = rm->rm_col[c].rc_size / sizeof (src[0]); ++ count = MIN(ccount, xcount); ++ ++ for (i = 0; i < count; i++, dst++, src++) { ++ *dst ^= *src; ++ } ++ } ++ ++ return (1 << VDEV_RAIDZ_P); ++} ++ ++static int ++vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) ++{ ++ uint64_t *dst, *src, xcount, ccount, count, mask, i; ++ uint8_t *b; ++ int x = tgts[0]; ++ int c, j, exp; ++ ++ ASSERT(ntgts == 1); ++ ++ xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ++ ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); ++ ++ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { ++ src = rm->rm_col[c].rc_data; ++ dst = rm->rm_col[x].rc_data; ++ ++ if (c == x) ++ ccount = 0; ++ else ++ ccount = rm->rm_col[c].rc_size / sizeof (src[0]); ++ ++ count = MIN(ccount, xcount); ++ ++ if (c == rm->rm_firstdatacol) { ++ for (i = 0; i < count; i++, dst++, src++) { ++ *dst = *src; ++ } ++ for (; i < xcount; i++, dst++) { ++ *dst = 0; ++ } ++ ++ } else { ++ for (i = 0; i < count; i++, dst++, src++) { ++ VDEV_RAIDZ_64MUL_2(*dst, mask); ++ *dst ^= *src; ++ } ++ ++ for (; i < xcount; i++, dst++) { ++ VDEV_RAIDZ_64MUL_2(*dst, mask); ++ } ++ } ++ } ++ ++ src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; ++ dst = rm->rm_col[x].rc_data; ++ exp = 255 - (rm->rm_cols - 1 - x); ++ ++ for (i = 0; i < xcount; i++, dst++, src++) { ++ *dst ^= *src; ++ for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { ++ *b = vdev_raidz_exp2(*b, exp); ++ } ++ } ++ ++ return (1 << VDEV_RAIDZ_Q); ++} ++ ++static int ++vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) ++{ ++ uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; ++ void *pdata, *qdata; ++ uint64_t xsize, ysize, i; ++ int x = tgts[0]; ++ int y = tgts[1]; ++ ++ ASSERT(ntgts == 2); ++ ASSERT(x < y); ++ ASSERT(x >= rm->rm_firstdatacol); ++ ASSERT(y < rm->rm_cols); ++ ++ ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); ++ ++ /* ++ * Move the 
parity data aside -- we're going to compute parity as ++ * though columns x and y were full of zeros -- Pxy and Qxy. We want to ++ * reuse the parity generation mechanism without trashing the actual ++ * parity so we make those columns appear to be full of zeros by ++ * setting their lengths to zero. ++ */ ++ pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; ++ qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; ++ xsize = rm->rm_col[x].rc_size; ++ ysize = rm->rm_col[y].rc_size; ++ ++ rm->rm_col[VDEV_RAIDZ_P].rc_data = ++ zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); ++ rm->rm_col[VDEV_RAIDZ_Q].rc_data = ++ zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); ++ rm->rm_col[x].rc_size = 0; ++ rm->rm_col[y].rc_size = 0; ++ ++ vdev_raidz_generate_parity_pq(rm); ++ ++ rm->rm_col[x].rc_size = xsize; ++ rm->rm_col[y].rc_size = ysize; ++ ++ p = pdata; ++ q = qdata; ++ pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; ++ qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; ++ xd = rm->rm_col[x].rc_data; ++ yd = rm->rm_col[y].rc_data; ++ ++ /* ++ * We now have: ++ * Pxy = P + D_x + D_y ++ * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y ++ * ++ * We can then solve for D_x: ++ * D_x = A * (P + Pxy) + B * (Q + Qxy) ++ * where ++ * A = 2^(x - y) * (2^(x - y) + 1)^-1 ++ * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 ++ * ++ * With D_x in hand, we can easily solve for D_y: ++ * D_y = P + Pxy + D_x ++ */ ++ ++ a = vdev_raidz_pow2[255 + x - y]; ++ b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; ++ tmp = 255 - vdev_raidz_log2[a ^ 1]; ++ ++ aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; ++ bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; ++ ++ for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { ++ *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ ++ vdev_raidz_exp2(*q ^ *qxy, bexp); ++ ++ if (i < ysize) ++ *yd = *p ^ *pxy ^ *xd; ++ } ++ ++ zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, ++ rm->rm_col[VDEV_RAIDZ_P].rc_size); ++ zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, ++ rm->rm_col[VDEV_RAIDZ_Q].rc_size); ++ ++ /* ++ * Restore the saved parity data. ++ */ ++ rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; ++ rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; ++ ++ return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); ++} ++ ++/* BEGIN CSTYLED */ ++/* ++ * In the general case of reconstruction, we must solve the system of linear ++ * equations defined by the coeffecients used to generate parity as well as ++ * the contents of the data and parity disks. This can be expressed with ++ * vectors for the original data (D) and the actual data (d) and parity (p) ++ * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): ++ * ++ * __ __ __ __ ++ * | | __ __ | p_0 | ++ * | V | | D_0 | | p_m-1 | ++ * | | x | : | = | d_0 | ++ * | I | | D_n-1 | | : | ++ * | | ~~ ~~ | d_n-1 | ++ * ~~ ~~ ~~ ~~ ++ * ++ * I is simply a square identity matrix of size n, and V is a vandermonde ++ * matrix defined by the coeffecients we chose for the various parity columns ++ * (1, 2, 4). Note that these values were chosen both for simplicity, speedy ++ * computation as well as linear separability. ++ * ++ * __ __ __ __ ++ * | 1 .. 1 1 1 | | p_0 | ++ * | 2^n-1 .. 4 2 1 | __ __ | : | ++ * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | ++ * | 1 .. 0 0 0 | | D_1 | | d_0 | ++ * | 0 .. 0 0 0 | x | D_2 | = | d_1 | ++ * | : : : : | | : | | d_2 | ++ * | 0 .. 1 0 0 | | D_n-1 | | : | ++ * | 0 .. 0 1 0 | ~~ ~~ | : | ++ * | 0 .. 0 0 1 | | d_n-1 | ++ * ~~ ~~ ~~ ~~ ++ * ++ * Note that I, V, d, and p are known. 
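A stand-alone, byte-at-a-time illustration of the two-erasure solve performed by vdev_raidz_reconstruct_pq() above, written with the straightforward algebra rather than the precomputed A/B log coefficients the driver uses (the two are equivalent); gf_mul2(), gf_exp2(), gf_mul() and gf_inv() are local helper names for this sketch only:

#include <stdint.h>
#include <stdio.h>

static uint8_t
gf_mul2(uint8_t x)
{
	return ((x << 1) ^ ((x & 0x80) ? 0x1d : 0));
}

static uint8_t
gf_exp2(int e)			/* 2^e for e >= 0 */
{
	uint8_t v = 1;

	while (e-- > 0)
		v = gf_mul2(v);
	return (v);
}

static uint8_t
gf_mul(uint8_t a, uint8_t b)	/* generic multiply by shift-and-add */
{
	uint8_t v = 0;

	while (b != 0) {
		if (b & 1)
			v ^= a;
		a = gf_mul2(a);
		b >>= 1;
	}
	return (v);
}

static uint8_t
gf_inv(uint8_t a)		/* a^-1 == a^254, since a^255 == 1 */
{
	uint8_t v = 1;
	int i;

	for (i = 0; i < 254; i++)
		v = gf_mul(v, a);
	return (v);
}

int
main(void)
{
	uint8_t d[4] = { 0x11, 0x22, 0x33, 0x44 };	/* one byte per column */
	int n = 4, x = 1, y = 3, c;			/* columns x and y lost */
	uint8_t p = 0, q = 0, pxy = 0, qxy = 0, dx, dy;

	for (c = 0; c < n; c++) {			/* original parity */
		p ^= d[c];
		q = gf_mul2(q) ^ d[c];
	}
	for (c = 0; c < n; c++) {			/* parity with x, y zeroed */
		pxy ^= (c == x || c == y) ? 0 : d[c];
		qxy = gf_mul2(qxy) ^ ((c == x || c == y) ? 0 : d[c]);
	}
	/*
	 * P ^ Pxy = D_x ^ D_y
	 * Q ^ Qxy = 2^(n-1-x)*D_x ^ 2^(n-1-y)*D_y
	 * so D_x = (Q^Qxy ^ 2^(n-1-y)*(P^Pxy)) / (2^(n-1-x) ^ 2^(n-1-y))
	 */
	dx = gf_mul((q ^ qxy) ^ gf_mul(gf_exp2(n - 1 - y), p ^ pxy),
	    gf_inv(gf_exp2(n - 1 - x) ^ gf_exp2(n - 1 - y)));
	dy = (p ^ pxy) ^ dx;
	printf("recovered %02x %02x (expected %02x %02x)\n", dx, dy, d[x], d[y]);
	return (0);
}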
To compute D, we must invert the ++ * matrix and use the known data and parity values to reconstruct the unknown ++ * data values. We begin by removing the rows in V|I and d|p that correspond ++ * to failed or missing columns; we then make V|I square (n x n) and d|p ++ * sized n by removing rows corresponding to unused parity from the bottom up ++ * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' ++ * using Gauss-Jordan elimination. In the example below we use m=3 parity ++ * columns, n=8 data columns, with errors in d_1, d_2, and p_1: ++ * __ __ ++ * | 1 1 1 1 1 1 1 1 | ++ * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks ++ * | 19 205 116 29 64 16 4 1 | / / ++ * | 1 0 0 0 0 0 0 0 | / / ++ * | 0 1 0 0 0 0 0 0 | <--' / ++ * (V|I) = | 0 0 1 0 0 0 0 0 | <---' ++ * | 0 0 0 1 0 0 0 0 | ++ * | 0 0 0 0 1 0 0 0 | ++ * | 0 0 0 0 0 1 0 0 | ++ * | 0 0 0 0 0 0 1 0 | ++ * | 0 0 0 0 0 0 0 1 | ++ * ~~ ~~ ++ * __ __ ++ * | 1 1 1 1 1 1 1 1 | ++ * | 128 64 32 16 8 4 2 1 | ++ * | 19 205 116 29 64 16 4 1 | ++ * | 1 0 0 0 0 0 0 0 | ++ * | 0 1 0 0 0 0 0 0 | ++ * (V|I)' = | 0 0 1 0 0 0 0 0 | ++ * | 0 0 0 1 0 0 0 0 | ++ * | 0 0 0 0 1 0 0 0 | ++ * | 0 0 0 0 0 1 0 0 | ++ * | 0 0 0 0 0 0 1 0 | ++ * | 0 0 0 0 0 0 0 1 | ++ * ~~ ~~ ++ * ++ * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We ++ * have carefully chosen the seed values 1, 2, and 4 to ensure that this ++ * matrix is not singular. ++ * __ __ ++ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | ++ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | ++ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | ++ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | ++ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | ++ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | ++ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | ++ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | ++ * ~~ ~~ ++ * __ __ ++ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | ++ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | ++ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | ++ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | ++ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | ++ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | ++ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | ++ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | ++ * ~~ ~~ ++ * __ __ ++ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | ++ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | ++ * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | ++ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | ++ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | ++ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | ++ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | ++ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | ++ * ~~ ~~ ++ * __ __ ++ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | ++ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | ++ * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | ++ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | ++ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | ++ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | ++ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | ++ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | ++ * ~~ ~~ ++ * __ __ ++ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | ++ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | ++ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | ++ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | ++ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | ++ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | ++ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | ++ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | ++ * ~~ ~~ ++ * __ __ ++ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | ++ * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | ++ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | ++ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | ++ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | ++ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | ++ * | 0 0 0 0 0 0 1 0 0 0 0 
0 0 0 1 0 | ++ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | ++ * ~~ ~~ ++ * __ __ ++ * | 0 0 1 0 0 0 0 0 | ++ * | 167 100 5 41 159 169 217 208 | ++ * | 166 100 4 40 158 168 216 209 | ++ * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | ++ * | 0 0 0 0 1 0 0 0 | ++ * | 0 0 0 0 0 1 0 0 | ++ * | 0 0 0 0 0 0 1 0 | ++ * | 0 0 0 0 0 0 0 1 | ++ * ~~ ~~ ++ * ++ * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values ++ * of the missing data. ++ * ++ * As is apparent from the example above, the only non-trivial rows in the ++ * inverse matrix correspond to the data disks that we're trying to ++ * reconstruct. Indeed, those are the only rows we need as the others would ++ * only be useful for reconstructing data known or assumed to be valid. For ++ * that reason, we only build the coefficients in the rows that correspond to ++ * targeted columns. ++ */ ++/* END CSTYLED */ ++ ++static void ++vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, ++ uint8_t **rows) ++{ ++ int i, j; ++ int pow; ++ ++ ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); ++ ++ /* ++ * Fill in the missing rows of interest. ++ */ ++ for (i = 0; i < nmap; i++) { ++ ASSERT3S(0, <=, map[i]); ++ ASSERT3S(map[i], <=, 2); ++ ++ pow = map[i] * n; ++ if (pow > 255) ++ pow -= 255; ++ ASSERT(pow <= 255); ++ ++ for (j = 0; j < n; j++) { ++ pow -= map[i]; ++ if (pow < 0) ++ pow += 255; ++ rows[i][j] = vdev_raidz_pow2[pow]; ++ } ++ } ++} ++ ++static void ++vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, ++ uint8_t **rows, uint8_t **invrows, const uint8_t *used) ++{ ++ int i, j, ii, jj; ++ uint8_t log; ++ ++ /* ++ * Assert that the first nmissing entries from the array of used ++ * columns correspond to parity columns and that subsequent entries ++ * correspond to data columns. ++ */ ++ for (i = 0; i < nmissing; i++) { ++ ASSERT3S(used[i], <, rm->rm_firstdatacol); ++ } ++ for (; i < n; i++) { ++ ASSERT3S(used[i], >=, rm->rm_firstdatacol); ++ } ++ ++ /* ++ * First initialize the storage where we'll compute the inverse rows. ++ */ ++ for (i = 0; i < nmissing; i++) { ++ for (j = 0; j < n; j++) { ++ invrows[i][j] = (i == j) ? 1 : 0; ++ } ++ } ++ ++ /* ++ * Subtract all trivial rows from the rows of consequence. ++ */ ++ for (i = 0; i < nmissing; i++) { ++ for (j = nmissing; j < n; j++) { ++ ASSERT3U(used[j], >=, rm->rm_firstdatacol); ++ jj = used[j] - rm->rm_firstdatacol; ++ ASSERT3S(jj, <, n); ++ invrows[i][j] = rows[i][jj]; ++ rows[i][jj] = 0; ++ } ++ } ++ ++ /* ++ * For each of the rows of interest, we must normalize it and subtract ++ * a multiple of it from the other rows. ++ */ ++ for (i = 0; i < nmissing; i++) { ++ for (j = 0; j < missing[i]; j++) { ++ ASSERT3U(rows[i][j], ==, 0); ++ } ++ ASSERT3U(rows[i][missing[i]], !=, 0); ++ ++ /* ++ * Compute the inverse of the first element and multiply each ++ * element in the row by that value. ++ */ ++ log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; ++ ++ for (j = 0; j < n; j++) { ++ rows[i][j] = vdev_raidz_exp2(rows[i][j], log); ++ invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); ++ } ++ ++ for (ii = 0; ii < nmissing; ii++) { ++ if (i == ii) ++ continue; ++ ++ ASSERT3U(rows[ii][missing[i]], !=, 0); ++ ++ log = vdev_raidz_log2[rows[ii][missing[i]]]; ++ ++ for (j = 0; j < n; j++) { ++ rows[ii][j] ^= ++ vdev_raidz_exp2(rows[i][j], log); ++ invrows[ii][j] ^= ++ vdev_raidz_exp2(invrows[i][j], log); ++ } ++ } ++ } ++ ++ /* ++ * Verify that the data that is left in the rows are properly part of ++ * an identity matrix. 
++ */ ++ for (i = 0; i < nmissing; i++) { ++ for (j = 0; j < n; j++) { ++ if (j == missing[i]) { ++ ASSERT3U(rows[i][j], ==, 1); ++ } else { ++ ASSERT3U(rows[i][j], ==, 0); ++ } ++ } ++ } ++} ++ ++static void ++vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, ++ int *missing, uint8_t **invrows, const uint8_t *used) ++{ ++ int i, j, x, cc, c; ++ uint8_t *src; ++ uint64_t ccount; ++ uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; ++ uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; ++ uint8_t log = 0, val; ++ int ll; ++ uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; ++ uint8_t *p, *pp; ++ size_t psize; ++ ++ psize = sizeof (invlog[0][0]) * n * nmissing; ++ p = kmem_alloc(psize, KM_PUSHPAGE); ++ ++ for (pp = p, i = 0; i < nmissing; i++) { ++ invlog[i] = pp; ++ pp += n; ++ } ++ ++ for (i = 0; i < nmissing; i++) { ++ for (j = 0; j < n; j++) { ++ ASSERT3U(invrows[i][j], !=, 0); ++ invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; ++ } ++ } ++ ++ for (i = 0; i < n; i++) { ++ c = used[i]; ++ ASSERT3U(c, <, rm->rm_cols); ++ ++ src = rm->rm_col[c].rc_data; ++ ccount = rm->rm_col[c].rc_size; ++ for (j = 0; j < nmissing; j++) { ++ cc = missing[j] + rm->rm_firstdatacol; ++ ASSERT3U(cc, >=, rm->rm_firstdatacol); ++ ASSERT3U(cc, <, rm->rm_cols); ++ ASSERT3U(cc, !=, c); ++ ++ dst[j] = rm->rm_col[cc].rc_data; ++ dcount[j] = rm->rm_col[cc].rc_size; ++ } ++ ++ ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); ++ ++ for (x = 0; x < ccount; x++, src++) { ++ if (*src != 0) ++ log = vdev_raidz_log2[*src]; ++ ++ for (cc = 0; cc < nmissing; cc++) { ++ if (x >= dcount[cc]) ++ continue; ++ ++ if (*src == 0) { ++ val = 0; ++ } else { ++ if ((ll = log + invlog[cc][i]) >= 255) ++ ll -= 255; ++ val = vdev_raidz_pow2[ll]; ++ } ++ ++ if (i == 0) ++ dst[cc][x] = val; ++ else ++ dst[cc][x] ^= val; ++ } ++ } ++ } ++ ++ kmem_free(p, psize); ++} ++ ++static int ++vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) ++{ ++ int n, i, c, t, tt; ++ int nmissing_rows; ++ int missing_rows[VDEV_RAIDZ_MAXPARITY]; ++ int parity_map[VDEV_RAIDZ_MAXPARITY]; ++ ++ uint8_t *p, *pp; ++ size_t psize; ++ ++ uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; ++ uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; ++ uint8_t *used; ++ ++ int code = 0; ++ ++ ++ n = rm->rm_cols - rm->rm_firstdatacol; ++ ++ /* ++ * Figure out which data columns are missing. ++ */ ++ nmissing_rows = 0; ++ for (t = 0; t < ntgts; t++) { ++ if (tgts[t] >= rm->rm_firstdatacol) { ++ missing_rows[nmissing_rows++] = ++ tgts[t] - rm->rm_firstdatacol; ++ } ++ } ++ ++ /* ++ * Figure out which parity columns to use to help generate the missing ++ * data columns. ++ */ ++ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ++ ASSERT(tt < ntgts); ++ ASSERT(c < rm->rm_firstdatacol); ++ ++ /* ++ * Skip any targeted parity columns. 
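The rows that vdev_raidz_matrix_init() fills in are just the parity-generation coefficients restated: parity index m contributes the row (2^m)^(n-1), ..., (2^m)^1, 1. A small sketch (coeff_row() is a hypothetical helper, not from the patch) that builds those rows the same way, walking a power of 2 downward modulo 255, and reproduces the first three rows of the example matrix in the comment above:

#include <stdint.h>
#include <stdio.h>

static uint8_t pow2[256];

static void
gf_init(void)
{
	uint8_t x = 1;
	int i;

	for (i = 0; i < 256; i++) {
		pow2[i] = x;
		x = (x << 1) ^ ((x & 0x80) ? 0x1d : 0);
	}
}

/* row for parity index m (0 = P, 1 = Q, 2 = R) over n data columns */
static void
coeff_row(int m, int n, uint8_t *row)
{
	int j, pow = (m * n) % 255;

	for (j = 0; j < n; j++) {
		pow -= m;
		if (pow < 0)
			pow += 255;
		row[j] = pow2[pow];	/* == (2^m)^(n-1-j) */
	}
}

int
main(void)
{
	uint8_t row[8];
	int m, j, n = 8;

	gf_init();
	for (m = 0; m < 3; m++) {
		coeff_row(m, n, row);
		for (j = 0; j < n; j++)
			printf("%4u", row[j]);
		printf("\n");
	}
	return (0);
}

For n = 8 this prints the rows 1..1, 128 64 32 16 8 4 2 1, and 19 205 116 29 64 16 4 1, matching the (V|I) example above.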
++ */ ++ if (c == tgts[tt]) { ++ tt++; ++ continue; ++ } ++ ++ code |= 1 << c; ++ ++ parity_map[i] = c; ++ i++; ++ } ++ ++ ASSERT(code != 0); ++ ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); ++ ++ psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * ++ nmissing_rows * n + sizeof (used[0]) * n; ++ p = kmem_alloc(psize, KM_PUSHPAGE); ++ ++ for (pp = p, i = 0; i < nmissing_rows; i++) { ++ rows[i] = pp; ++ pp += n; ++ invrows[i] = pp; ++ pp += n; ++ } ++ used = pp; ++ ++ for (i = 0; i < nmissing_rows; i++) { ++ used[i] = parity_map[i]; ++ } ++ ++ for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { ++ if (tt < nmissing_rows && ++ c == missing_rows[tt] + rm->rm_firstdatacol) { ++ tt++; ++ continue; ++ } ++ ++ ASSERT3S(i, <, n); ++ used[i] = c; ++ i++; ++ } ++ ++ /* ++ * Initialize the interesting rows of the matrix. ++ */ ++ vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); ++ ++ /* ++ * Invert the matrix. ++ */ ++ vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, ++ invrows, used); ++ ++ /* ++ * Reconstruct the missing data using the generated matrix. ++ */ ++ vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, ++ invrows, used); ++ ++ kmem_free(p, psize); ++ ++ return (code); ++} ++ ++static int ++vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) ++{ ++ int tgts[VDEV_RAIDZ_MAXPARITY], *dt; ++ int ntgts; ++ int i, c; ++ int code; ++ int nbadparity, nbaddata; ++ int parity_valid[VDEV_RAIDZ_MAXPARITY]; ++ ++ /* ++ * The tgts list must already be sorted. ++ */ ++ for (i = 1; i < nt; i++) { ++ ASSERT(t[i] > t[i - 1]); ++ } ++ ++ nbadparity = rm->rm_firstdatacol; ++ nbaddata = rm->rm_cols - nbadparity; ++ ntgts = 0; ++ for (i = 0, c = 0; c < rm->rm_cols; c++) { ++ if (c < rm->rm_firstdatacol) ++ parity_valid[c] = B_FALSE; ++ ++ if (i < nt && c == t[i]) { ++ tgts[ntgts++] = c; ++ i++; ++ } else if (rm->rm_col[c].rc_error != 0) { ++ tgts[ntgts++] = c; ++ } else if (c >= rm->rm_firstdatacol) { ++ nbaddata--; ++ } else { ++ parity_valid[c] = B_TRUE; ++ nbadparity--; ++ } ++ } ++ ++ ASSERT(ntgts >= nt); ++ ASSERT(nbaddata >= 0); ++ ASSERT(nbaddata + nbadparity == ntgts); ++ ++ dt = &tgts[nbadparity]; ++ ++ /* ++ * See if we can use any of our optimized reconstruction routines. 
++ */ ++ if (!vdev_raidz_default_to_general) { ++ switch (nbaddata) { ++ case 1: ++ if (parity_valid[VDEV_RAIDZ_P]) ++ return (vdev_raidz_reconstruct_p(rm, dt, 1)); ++ ++ ASSERT(rm->rm_firstdatacol > 1); ++ ++ if (parity_valid[VDEV_RAIDZ_Q]) ++ return (vdev_raidz_reconstruct_q(rm, dt, 1)); ++ ++ ASSERT(rm->rm_firstdatacol > 2); ++ break; ++ ++ case 2: ++ ASSERT(rm->rm_firstdatacol > 1); ++ ++ if (parity_valid[VDEV_RAIDZ_P] && ++ parity_valid[VDEV_RAIDZ_Q]) ++ return (vdev_raidz_reconstruct_pq(rm, dt, 2)); ++ ++ ASSERT(rm->rm_firstdatacol > 2); ++ ++ break; ++ } ++ } ++ ++ code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); ++ ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ++ ASSERT(code > 0); ++ return (code); ++} ++ ++static int ++vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, ++ uint64_t *ashift) ++{ ++ vdev_t *cvd; ++ uint64_t nparity = vd->vdev_nparity; ++ int c; ++ int lasterror = 0; ++ int numerrors = 0; ++ ++ ASSERT(nparity > 0); ++ ++ if (nparity > VDEV_RAIDZ_MAXPARITY || ++ vd->vdev_children < nparity + 1) { ++ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; ++ return (EINVAL); ++ } ++ ++ vdev_open_children(vd); ++ ++ for (c = 0; c < vd->vdev_children; c++) { ++ cvd = vd->vdev_child[c]; ++ ++ if (cvd->vdev_open_error != 0) { ++ lasterror = cvd->vdev_open_error; ++ numerrors++; ++ continue; ++ } ++ ++ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; ++ *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; ++ *ashift = MAX(*ashift, cvd->vdev_ashift); ++ } ++ ++ *asize *= vd->vdev_children; ++ *max_asize *= vd->vdev_children; ++ ++ if (numerrors > nparity) { ++ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; ++ return (lasterror); ++ } ++ ++ return (0); ++} ++ ++static void ++vdev_raidz_close(vdev_t *vd) ++{ ++ int c; ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_close(vd->vdev_child[c]); ++} ++ ++static uint64_t ++vdev_raidz_asize(vdev_t *vd, uint64_t psize) ++{ ++ uint64_t asize; ++ uint64_t ashift = vd->vdev_top->vdev_ashift; ++ uint64_t cols = vd->vdev_children; ++ uint64_t nparity = vd->vdev_nparity; ++ ++ asize = ((psize - 1) >> ashift) + 1; ++ asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); ++ asize = roundup(asize, nparity + 1) << ashift; ++ ++ return (asize); ++} ++ ++static void ++vdev_raidz_child_done(zio_t *zio) ++{ ++ raidz_col_t *rc = zio->io_private; ++ ++ rc->rc_error = zio->io_error; ++ rc->rc_tried = 1; ++ rc->rc_skipped = 0; ++} ++ ++static int ++vdev_raidz_io_start(zio_t *zio) ++{ ++ vdev_t *vd = zio->io_vd; ++ vdev_t *tvd = vd->vdev_top; ++ vdev_t *cvd; ++ raidz_map_t *rm; ++ raidz_col_t *rc; ++ int c, i; ++ ++ rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, ++ vd->vdev_nparity); ++ ++ ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); ++ ++ if (zio->io_type == ZIO_TYPE_WRITE) { ++ vdev_raidz_generate_parity(rm); ++ ++ for (c = 0; c < rm->rm_cols; c++) { ++ rc = &rm->rm_col[c]; ++ cvd = vd->vdev_child[rc->rc_devidx]; ++ zio_nowait(zio_vdev_child_io(zio, NULL, cvd, ++ rc->rc_offset, rc->rc_data, rc->rc_size, ++ zio->io_type, zio->io_priority, 0, ++ vdev_raidz_child_done, rc)); ++ } ++ ++ /* ++ * Generate optional I/Os for any skipped sectors to improve ++ * aggregation contiguity. 
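A worked example (stand-alone sketch, example numbers only) of the vdev_raidz_asize() calculation above: round the payload up to whole sectors, add one parity sector per (cols - nparity) data sectors, then pad to a multiple of the (nparity + 1)-sector stripe quantum:

#include <stdint.h>
#include <stdio.h>

#define	ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

int
main(void)
{
	uint64_t psize = 32768, ashift = 9, cols = 6, nparity = 2;
	uint64_t asize;

	asize = ((psize - 1) >> ashift) + 1;		/* 64 data sectors */
	/* one parity sector per (cols - nparity) data sectors, rounded up */
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	/* pad to a multiple of the stripe quantum (nparity + 1) */
	asize = ROUNDUP(asize, nparity + 1) << ashift;
	printf("psize %llu -> asize %llu bytes\n",
	    (unsigned long long)psize, (unsigned long long)asize);
	return (0);
}

With a 32 KiB block on a 6-wide raidz2 with 512-byte sectors this gives 64 + 32 = 96 sectors, already a multiple of 3, so 49152 allocated bytes.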
++ */ ++ for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { ++ ASSERT(c <= rm->rm_scols); ++ if (c == rm->rm_scols) ++ c = 0; ++ rc = &rm->rm_col[c]; ++ cvd = vd->vdev_child[rc->rc_devidx]; ++ zio_nowait(zio_vdev_child_io(zio, NULL, cvd, ++ rc->rc_offset + rc->rc_size, NULL, ++ 1 << tvd->vdev_ashift, ++ zio->io_type, zio->io_priority, ++ ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); ++ } ++ ++ return (ZIO_PIPELINE_CONTINUE); ++ } ++ ++ ASSERT(zio->io_type == ZIO_TYPE_READ); ++ ++ /* ++ * Iterate over the columns in reverse order so that we hit the parity ++ * last -- any errors along the way will force us to read the parity. ++ */ ++ for (c = rm->rm_cols - 1; c >= 0; c--) { ++ rc = &rm->rm_col[c]; ++ cvd = vd->vdev_child[rc->rc_devidx]; ++ if (!vdev_readable(cvd)) { ++ if (c >= rm->rm_firstdatacol) ++ rm->rm_missingdata++; ++ else ++ rm->rm_missingparity++; ++ rc->rc_error = ENXIO; ++ rc->rc_tried = 1; /* don't even try */ ++ rc->rc_skipped = 1; ++ continue; ++ } ++ if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { ++ if (c >= rm->rm_firstdatacol) ++ rm->rm_missingdata++; ++ else ++ rm->rm_missingparity++; ++ rc->rc_error = ESTALE; ++ rc->rc_skipped = 1; ++ continue; ++ } ++ if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || ++ (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { ++ zio_nowait(zio_vdev_child_io(zio, NULL, cvd, ++ rc->rc_offset, rc->rc_data, rc->rc_size, ++ zio->io_type, zio->io_priority, 0, ++ vdev_raidz_child_done, rc)); ++ } ++ } ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++ ++/* ++ * Report a checksum error for a child of a RAID-Z device. ++ */ ++static void ++raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) ++{ ++ vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; ++ ++ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { ++ zio_bad_cksum_t zbc; ++ raidz_map_t *rm = zio->io_vsd; ++ ++ mutex_enter(&vd->vdev_stat_lock); ++ vd->vdev_stat.vs_checksum_errors++; ++ mutex_exit(&vd->vdev_stat_lock); ++ ++ zbc.zbc_has_cksum = 0; ++ zbc.zbc_injected = rm->rm_ecksuminjected; ++ ++ zfs_ereport_post_checksum(zio->io_spa, vd, zio, ++ rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, ++ &zbc); ++ } ++} ++ ++/* ++ * We keep track of whether or not there were any injected errors, so that ++ * any ereports we generate can note it. ++ */ ++static int ++raidz_checksum_verify(zio_t *zio) ++{ ++ zio_bad_cksum_t zbc; ++ raidz_map_t *rm = zio->io_vsd; ++ int ret; ++ ++ bzero(&zbc, sizeof (zio_bad_cksum_t)); ++ ++ ret = zio_checksum_error(zio, &zbc); ++ if (ret != 0 && zbc.zbc_injected != 0) ++ rm->rm_ecksuminjected = 1; ++ ++ return (ret); ++} ++ ++/* ++ * Generate the parity from the data columns. If we tried and were able to ++ * read the parity without error, verify that the generated parity matches the ++ * data we read. If it doesn't, we fire off a checksum error. Return the ++ * number such failures. 
++ */ ++static int ++raidz_parity_verify(zio_t *zio, raidz_map_t *rm) ++{ ++ void *orig[VDEV_RAIDZ_MAXPARITY]; ++ int c, ret = 0; ++ raidz_col_t *rc; ++ ++ for (c = 0; c < rm->rm_firstdatacol; c++) { ++ rc = &rm->rm_col[c]; ++ if (!rc->rc_tried || rc->rc_error != 0) ++ continue; ++ orig[c] = zio_buf_alloc(rc->rc_size); ++ bcopy(rc->rc_data, orig[c], rc->rc_size); ++ } ++ ++ vdev_raidz_generate_parity(rm); ++ ++ for (c = 0; c < rm->rm_firstdatacol; c++) { ++ rc = &rm->rm_col[c]; ++ if (!rc->rc_tried || rc->rc_error != 0) ++ continue; ++ if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { ++ raidz_checksum_error(zio, rc, orig[c]); ++ rc->rc_error = ECKSUM; ++ ret++; ++ } ++ zio_buf_free(orig[c], rc->rc_size); ++ } ++ ++ return (ret); ++} ++ ++/* ++ * Keep statistics on all the ways that we used parity to correct data. ++ */ ++static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; ++ ++static int ++vdev_raidz_worst_error(raidz_map_t *rm) ++{ ++ int c, error = 0; ++ ++ for (c = 0; c < rm->rm_cols; c++) ++ error = zio_worst_error(error, rm->rm_col[c].rc_error); ++ ++ return (error); ++} ++ ++/* ++ * Iterate over all combinations of bad data and attempt a reconstruction. ++ * Note that the algorithm below is non-optimal because it doesn't take into ++ * account how reconstruction is actually performed. For example, with ++ * triple-parity RAID-Z the reconstruction procedure is the same if column 4 ++ * is targeted as invalid as if columns 1 and 4 are targeted since in both ++ * cases we'd only use parity information in column 0. ++ */ ++static int ++vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) ++{ ++ raidz_map_t *rm = zio->io_vsd; ++ raidz_col_t *rc; ++ void *orig[VDEV_RAIDZ_MAXPARITY]; ++ int tstore[VDEV_RAIDZ_MAXPARITY + 2]; ++ int *tgts = &tstore[1]; ++ int curr, next, i, c, n; ++ int code, ret = 0; ++ ++ ASSERT(total_errors < rm->rm_firstdatacol); ++ ++ /* ++ * This simplifies one edge condition. ++ */ ++ tgts[-1] = -1; ++ ++ for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { ++ /* ++ * Initialize the targets array by finding the first n columns ++ * that contain no error. ++ * ++ * If there were no data errors, we need to ensure that we're ++ * always explicitly attempting to reconstruct at least one ++ * data column. To do this, we simply push the highest target ++ * up into the data columns. ++ */ ++ for (c = 0, i = 0; i < n; i++) { ++ if (i == n - 1 && data_errors == 0 && ++ c < rm->rm_firstdatacol) { ++ c = rm->rm_firstdatacol; ++ } ++ ++ while (rm->rm_col[c].rc_error != 0) { ++ c++; ++ ASSERT3S(c, <, rm->rm_cols); ++ } ++ ++ tgts[i] = c++; ++ } ++ ++ /* ++ * Setting tgts[n] simplifies the other edge condition. ++ */ ++ tgts[n] = rm->rm_cols; ++ ++ /* ++ * These buffers were allocated in previous iterations. ++ */ ++ for (i = 0; i < n - 1; i++) { ++ ASSERT(orig[i] != NULL); ++ } ++ ++ orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); ++ ++ curr = 0; ++ next = tgts[curr]; ++ ++ while (curr != n) { ++ tgts[curr] = next; ++ curr = 0; ++ ++ /* ++ * Save off the original data that we're going to ++ * attempt to reconstruct. ++ */ ++ for (i = 0; i < n; i++) { ++ ASSERT(orig[i] != NULL); ++ c = tgts[i]; ++ ASSERT3S(c, >=, 0); ++ ASSERT3S(c, <, rm->rm_cols); ++ rc = &rm->rm_col[c]; ++ bcopy(rc->rc_data, orig[i], rc->rc_size); ++ } ++ ++ /* ++ * Attempt a reconstruction and exit the outer loop on ++ * success. 
++ */ ++ code = vdev_raidz_reconstruct(rm, tgts, n); ++ if (raidz_checksum_verify(zio) == 0) { ++ atomic_inc_64(&raidz_corrected[code]); ++ ++ for (i = 0; i < n; i++) { ++ c = tgts[i]; ++ rc = &rm->rm_col[c]; ++ ASSERT(rc->rc_error == 0); ++ if (rc->rc_tried) ++ raidz_checksum_error(zio, rc, ++ orig[i]); ++ rc->rc_error = ECKSUM; ++ } ++ ++ ret = code; ++ goto done; ++ } ++ ++ /* ++ * Restore the original data. ++ */ ++ for (i = 0; i < n; i++) { ++ c = tgts[i]; ++ rc = &rm->rm_col[c]; ++ bcopy(orig[i], rc->rc_data, rc->rc_size); ++ } ++ ++ do { ++ /* ++ * Find the next valid column after the curr ++ * position.. ++ */ ++ for (next = tgts[curr] + 1; ++ next < rm->rm_cols && ++ rm->rm_col[next].rc_error != 0; next++) ++ continue; ++ ++ ASSERT(next <= tgts[curr + 1]); ++ ++ /* ++ * If that spot is available, we're done here. ++ */ ++ if (next != tgts[curr + 1]) ++ break; ++ ++ /* ++ * Otherwise, find the next valid column after ++ * the previous position. ++ */ ++ for (c = tgts[curr - 1] + 1; ++ rm->rm_col[c].rc_error != 0; c++) ++ continue; ++ ++ tgts[curr] = c; ++ curr++; ++ ++ } while (curr != n); ++ } ++ } ++ n--; ++done: ++ for (i = 0; i < n; i++) { ++ zio_buf_free(orig[i], rm->rm_col[0].rc_size); ++ } ++ ++ return (ret); ++} ++ ++static void ++vdev_raidz_io_done(zio_t *zio) ++{ ++ vdev_t *vd = zio->io_vd; ++ vdev_t *cvd; ++ raidz_map_t *rm = zio->io_vsd; ++ raidz_col_t *rc = NULL; ++ int unexpected_errors = 0; ++ int parity_errors = 0; ++ int parity_untried = 0; ++ int data_errors = 0; ++ int total_errors = 0; ++ int n, c; ++ int tgts[VDEV_RAIDZ_MAXPARITY]; ++ int code; ++ ++ ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ ++ ++ ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); ++ ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); ++ ++ for (c = 0; c < rm->rm_cols; c++) { ++ rc = &rm->rm_col[c]; ++ ++ if (rc->rc_error) { ++ ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ ++ ++ if (c < rm->rm_firstdatacol) ++ parity_errors++; ++ else ++ data_errors++; ++ ++ if (!rc->rc_skipped) ++ unexpected_errors++; ++ ++ total_errors++; ++ } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { ++ parity_untried++; ++ } ++ } ++ ++ if (zio->io_type == ZIO_TYPE_WRITE) { ++ /* ++ * XXX -- for now, treat partial writes as a success. ++ * (If we couldn't write enough columns to reconstruct ++ * the data, the I/O failed. Otherwise, good enough.) ++ * ++ * Now that we support write reallocation, it would be better ++ * to treat partial failure as real failure unless there are ++ * no non-degraded top-level vdevs left, and not update DTLs ++ * if we intend to reallocate. ++ */ ++ /* XXPOLICY */ ++ if (total_errors > rm->rm_firstdatacol) ++ zio->io_error = vdev_raidz_worst_error(rm); ++ ++ return; ++ } ++ ++ ASSERT(zio->io_type == ZIO_TYPE_READ); ++ /* ++ * There are three potential phases for a read: ++ * 1. produce valid data from the columns read ++ * 2. read all disks and try again ++ * 3. perform combinatorial reconstruction ++ * ++ * Each phase is progressively both more expensive and less likely to ++ * occur. If we encounter more errors than we can repair or all phases ++ * fail, we have no choice but to return an error. ++ */ ++ ++ /* ++ * If the number of errors we saw was correctable -- less than or equal ++ * to the number of parity disks read -- attempt to produce data that ++ * has a valid checksum. Naturally, this case applies in the absence of ++ * any errors. 
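A simplified analogue (not the driver code; enumerate() is an illustrative helper, and it ignores the already-failed-column skipping that vdev_raidz_combrec() also performs) of the odometer-style enumeration above: advance the lowest target slot that still has room, and reset the slots below it, so every n-element subset of candidate columns is tried exactly once:

#include <stdio.h>

static void
enumerate(int m, int n)		/* all n-element subsets of {0, ..., m-1}, n <= 8 */
{
	int tgts[8], i;

	for (i = 0; i < n; i++)		/* first combination: 0, 1, ..., n-1 */
		tgts[i] = i;
	for (;;) {
		for (i = 0; i < n; i++)
			printf("%d ", tgts[i]);
		printf("\n");
		for (i = 0; i < n; i++) {	/* find a slot that can advance */
			if (tgts[i] + 1 < (i + 1 == n ? m : tgts[i + 1]))
				break;
		}
		if (i == n)
			return;			/* no slot can move: done */
		tgts[i]++;
		while (i-- > 0)			/* reset the slots below it */
			tgts[i] = i;
	}
}

int
main(void)
{
	enumerate(5, 2);	/* e.g. all pairs of candidate bad columns */
	return (0);
}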
++ */ ++ if (total_errors <= rm->rm_firstdatacol - parity_untried) { ++ if (data_errors == 0) { ++ if (raidz_checksum_verify(zio) == 0) { ++ /* ++ * If we read parity information (unnecessarily ++ * as it happens since no reconstruction was ++ * needed) regenerate and verify the parity. ++ * We also regenerate parity when resilvering ++ * so we can write it out to the failed device ++ * later. ++ */ ++ if (parity_errors + parity_untried < ++ rm->rm_firstdatacol || ++ (zio->io_flags & ZIO_FLAG_RESILVER)) { ++ n = raidz_parity_verify(zio, rm); ++ unexpected_errors += n; ++ ASSERT(parity_errors + n <= ++ rm->rm_firstdatacol); ++ } ++ goto done; ++ } ++ } else { ++ /* ++ * We either attempt to read all the parity columns or ++ * none of them. If we didn't try to read parity, we ++ * wouldn't be here in the correctable case. There must ++ * also have been fewer parity errors than parity ++ * columns or, again, we wouldn't be in this code path. ++ */ ++ ASSERT(parity_untried == 0); ++ ASSERT(parity_errors < rm->rm_firstdatacol); ++ ++ /* ++ * Identify the data columns that reported an error. ++ */ ++ n = 0; ++ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { ++ rc = &rm->rm_col[c]; ++ if (rc->rc_error != 0) { ++ ASSERT(n < VDEV_RAIDZ_MAXPARITY); ++ tgts[n++] = c; ++ } ++ } ++ ++ ASSERT(rm->rm_firstdatacol >= n); ++ ++ code = vdev_raidz_reconstruct(rm, tgts, n); ++ ++ if (raidz_checksum_verify(zio) == 0) { ++ atomic_inc_64(&raidz_corrected[code]); ++ ++ /* ++ * If we read more parity disks than were used ++ * for reconstruction, confirm that the other ++ * parity disks produced correct data. This ++ * routine is suboptimal in that it regenerates ++ * the parity that we already used in addition ++ * to the parity that we're attempting to ++ * verify, but this should be a relatively ++ * uncommon case, and can be optimized if it ++ * becomes a problem. Note that we regenerate ++ * parity when resilvering so we can write it ++ * out to failed devices later. ++ */ ++ if (parity_errors < rm->rm_firstdatacol - n || ++ (zio->io_flags & ZIO_FLAG_RESILVER)) { ++ n = raidz_parity_verify(zio, rm); ++ unexpected_errors += n; ++ ASSERT(parity_errors + n <= ++ rm->rm_firstdatacol); ++ } ++ ++ goto done; ++ } ++ } ++ } ++ ++ /* ++ * This isn't a typical situation -- either we got a read error or ++ * a child silently returned bad data. Read every block so we can ++ * try again with as much data and parity as we can track down. If ++ * we've already been through once before, all children will be marked ++ * as tried so we'll proceed to combinatorial reconstruction. ++ */ ++ unexpected_errors = 1; ++ rm->rm_missingdata = 0; ++ rm->rm_missingparity = 0; ++ ++ for (c = 0; c < rm->rm_cols; c++) { ++ if (rm->rm_col[c].rc_tried) ++ continue; ++ ++ zio_vdev_io_redone(zio); ++ do { ++ rc = &rm->rm_col[c]; ++ if (rc->rc_tried) ++ continue; ++ zio_nowait(zio_vdev_child_io(zio, NULL, ++ vd->vdev_child[rc->rc_devidx], ++ rc->rc_offset, rc->rc_data, rc->rc_size, ++ zio->io_type, zio->io_priority, 0, ++ vdev_raidz_child_done, rc)); ++ } while (++c < rm->rm_cols); ++ ++ return; ++ } ++ ++ /* ++ * At this point we've attempted to reconstruct the data given the ++ * errors we detected, and we've attempted to read all columns. There ++ * must, therefore, be one or more additional problems -- silent errors ++ * resulting in invalid data rather than explicit I/O errors resulting ++ * in absent data. 
We check if there is enough additional data to ++ * possibly reconstruct the data and then perform combinatorial ++ * reconstruction over all possible combinations. If that fails, ++ * we're cooked. ++ */ ++ if (total_errors > rm->rm_firstdatacol) { ++ zio->io_error = vdev_raidz_worst_error(rm); ++ ++ } else if (total_errors < rm->rm_firstdatacol && ++ (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { ++ /* ++ * If we didn't use all the available parity for the ++ * combinatorial reconstruction, verify that the remaining ++ * parity is correct. ++ */ ++ if (code != (1 << rm->rm_firstdatacol) - 1) ++ (void) raidz_parity_verify(zio, rm); ++ } else { ++ /* ++ * We're here because either: ++ * ++ * total_errors == rm_first_datacol, or ++ * vdev_raidz_combrec() failed ++ * ++ * In either case, there is enough bad data to prevent ++ * reconstruction. ++ * ++ * Start checksum ereports for all children which haven't ++ * failed, and the IO wasn't speculative. ++ */ ++ zio->io_error = ECKSUM; ++ ++ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { ++ for (c = 0; c < rm->rm_cols; c++) { ++ rc = &rm->rm_col[c]; ++ if (rc->rc_error == 0) { ++ zio_bad_cksum_t zbc; ++ zbc.zbc_has_cksum = 0; ++ zbc.zbc_injected = ++ rm->rm_ecksuminjected; ++ ++ zfs_ereport_start_checksum( ++ zio->io_spa, ++ vd->vdev_child[rc->rc_devidx], ++ zio, rc->rc_offset, rc->rc_size, ++ (void *)(uintptr_t)c, &zbc); ++ } ++ } ++ } ++ } ++ ++done: ++ zio_checksum_verified(zio); ++ ++ if (zio->io_error == 0 && spa_writeable(zio->io_spa) && ++ (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { ++ /* ++ * Use the good data we have in hand to repair damaged children. ++ */ ++ for (c = 0; c < rm->rm_cols; c++) { ++ rc = &rm->rm_col[c]; ++ cvd = vd->vdev_child[rc->rc_devidx]; ++ ++ if (rc->rc_error == 0) ++ continue; ++ ++ zio_nowait(zio_vdev_child_io(zio, NULL, cvd, ++ rc->rc_offset, rc->rc_data, rc->rc_size, ++ ZIO_TYPE_WRITE, zio->io_priority, ++ ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ++ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); ++ } ++ } ++} ++ ++static void ++vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) ++{ ++ if (faulted > vd->vdev_nparity) ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_NO_REPLICAS); ++ else if (degraded + faulted != 0) ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); ++ else ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); ++} ++ ++vdev_ops_t vdev_raidz_ops = { ++ vdev_raidz_open, ++ vdev_raidz_close, ++ vdev_raidz_asize, ++ vdev_raidz_io_start, ++ vdev_raidz_io_done, ++ vdev_raidz_state_change, ++ NULL, ++ NULL, ++ VDEV_TYPE_RAIDZ, /* name of this vdev type */ ++ B_FALSE /* not a leaf vdev */ ++}; +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/vdev_root.c linux-3.2.33-go/fs/zfs/zfs/vdev_root.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/vdev_root.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/vdev_root.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,125 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. 
++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++/* ++ * Copyright (c) 2012 by Delphix. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Virtual device vector for the pool's root vdev. ++ */ ++ ++/* ++ * We should be able to tolerate one failure with absolutely no damage ++ * to our metadata. Two failures will take out space maps, a bunch of ++ * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy ++ * place to live. When we get smarter, we can liberalize this policy. ++ * e.g. If we haven't lost two consecutive top-level vdevs, then we are ++ * probably fine. Adding bean counters during alloc/free can make this ++ * future guesswork more accurate. ++ */ ++static int ++too_many_errors(vdev_t *vd, int numerrors) ++{ ++ ASSERT3U(numerrors, <=, vd->vdev_children); ++ return (numerrors > 0); ++} ++ ++static int ++vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, ++ uint64_t *ashift) ++{ ++ int lasterror = 0; ++ int numerrors = 0; ++ int c; ++ ++ if (vd->vdev_children == 0) { ++ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; ++ return (EINVAL); ++ } ++ ++ vdev_open_children(vd); ++ ++ for (c = 0; c < vd->vdev_children; c++) { ++ vdev_t *cvd = vd->vdev_child[c]; ++ ++ if (cvd->vdev_open_error && !cvd->vdev_islog) { ++ lasterror = cvd->vdev_open_error; ++ numerrors++; ++ } ++ } ++ ++ if (too_many_errors(vd, numerrors)) { ++ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; ++ return (lasterror); ++ } ++ ++ *asize = 0; ++ *max_asize = 0; ++ *ashift = 0; ++ ++ return (0); ++} ++ ++static void ++vdev_root_close(vdev_t *vd) ++{ ++ int c; ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ vdev_close(vd->vdev_child[c]); ++} ++ ++static void ++vdev_root_state_change(vdev_t *vd, int faulted, int degraded) ++{ ++ if (too_many_errors(vd, faulted)) { ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, ++ VDEV_AUX_NO_REPLICAS); ++ } else if (degraded) { ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); ++ } else { ++ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); ++ } ++} ++ ++vdev_ops_t vdev_root_ops = { ++ vdev_root_open, ++ vdev_root_close, ++ vdev_default_asize, ++ NULL, /* io_start - not applicable to the root */ ++ NULL, /* io_done - not applicable to the root */ ++ vdev_root_state_change, ++ NULL, ++ NULL, ++ VDEV_TYPE_ROOT, /* name of this vdev type */ ++ B_FALSE /* not a leaf vdev */ ++}; +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zap.c linux-3.2.33-go/fs/zfs/zfs/zap.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zap.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zap.c 2012-11-16 23:25:34.348039346 +0100 +@@ -0,0 +1,1354 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. 
++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* ++ * This file contains the top half of the zfs directory structure ++ * implementation. The bottom half is in zap_leaf.c. ++ * ++ * The zdir is an extendable hash data structure. There is a table of ++ * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are ++ * each a constant size and hold a variable number of directory entries. ++ * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. ++ * ++ * The pointer table holds a power of 2 number of pointers. ++ * (1<zd_data->zd_phys->zd_prefix_len). The bucket pointed to ++ * by the pointer at index i in the table holds entries whose hash value ++ * has a zd_prefix_len - bit prefix ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int fzap_default_block_shift = 14; /* 16k blocksize */ ++ ++static void zap_leaf_pageout(dmu_buf_t *db, void *vl); ++static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); ++ ++ ++void ++fzap_byteswap(void *vbuf, size_t size) ++{ ++ uint64_t block_type; ++ ++ block_type = *(uint64_t *)vbuf; ++ ++ if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) ++ zap_leaf_byteswap(vbuf, size); ++ else { ++ /* it's a ptrtbl block */ ++ byteswap_uint64_array(vbuf, size); ++ } ++} ++ ++void ++fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) ++{ ++ dmu_buf_t *db; ++ zap_leaf_t *l; ++ int i; ++ zap_phys_t *zp; ++ ++ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ++ zap->zap_ismicro = FALSE; ++ ++ (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, ++ &zap->zap_f.zap_phys, zap_evict); ++ ++ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); ++ zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1; ++ ++ zp = zap->zap_f.zap_phys; ++ /* ++ * explicitly zero it since it might be coming from an ++ * initialized microzap ++ */ ++ bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); ++ zp->zap_block_type = ZBT_HEADER; ++ zp->zap_magic = ZAP_MAGIC; ++ ++ zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); ++ ++ zp->zap_freeblk = 2; /* block 1 will be the first leaf */ ++ zp->zap_num_leafs = 1; ++ zp->zap_num_entries = 0; ++ zp->zap_salt = zap->zap_salt; ++ zp->zap_normflags = zap->zap_normflags; ++ zp->zap_flags = flags; ++ ++ /* block 1 will be the first leaf */ ++ for (i = 0; i < (1<zap_ptrtbl.zt_shift); i++) ++ ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; ++ ++ /* ++ * set up block 1 - the first leaf ++ */ ++ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, ++ 1<l_dbuf = db; ++ l->l_phys = db->db_data; ++ ++ zap_leaf_init(l, zp->zap_normflags != 0); ++ ++ kmem_free(l, sizeof (zap_leaf_t)); ++ dmu_buf_rele(db, FTAG); ++} ++ ++static int ++zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) ++{ ++ if (RW_WRITE_HELD(&zap->zap_rwlock)) ++ return (1); ++ if (rw_tryupgrade(&zap->zap_rwlock)) { ++ dmu_buf_will_dirty(zap->zap_dbuf, tx); ++ return (1); ++ } ++ return (0); ++} ++ ++/* ++ * Generic routines for dealing with the pointer & 
cookie tables. ++ */ ++ ++static int ++zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, ++ void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), ++ dmu_tx_t *tx) ++{ ++ uint64_t b, newblk; ++ dmu_buf_t *db_old, *db_new; ++ int err; ++ int bs = FZAP_BLOCK_SHIFT(zap); ++ int hepb = 1<<(bs-4); ++ /* hepb = half the number of entries in a block */ ++ ++ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ++ ASSERT(tbl->zt_blk != 0); ++ ASSERT(tbl->zt_numblks > 0); ++ ++ if (tbl->zt_nextblk != 0) { ++ newblk = tbl->zt_nextblk; ++ } else { ++ newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); ++ tbl->zt_nextblk = newblk; ++ ASSERT3U(tbl->zt_blks_copied, ==, 0); ++ dmu_prefetch(zap->zap_objset, zap->zap_object, ++ tbl->zt_blk << bs, tbl->zt_numblks << bs); ++ } ++ ++ /* ++ * Copy the ptrtbl from the old to new location. ++ */ ++ ++ b = tbl->zt_blks_copied; ++ err = dmu_buf_hold(zap->zap_objset, zap->zap_object, ++ (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); ++ if (err) ++ return (err); ++ ++ /* first half of entries in old[b] go to new[2*b+0] */ ++ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, ++ (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); ++ dmu_buf_will_dirty(db_new, tx); ++ transfer_func(db_old->db_data, db_new->db_data, hepb); ++ dmu_buf_rele(db_new, FTAG); ++ ++ /* second half of entries in old[b] go to new[2*b+1] */ ++ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, ++ (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); ++ dmu_buf_will_dirty(db_new, tx); ++ transfer_func((uint64_t *)db_old->db_data + hepb, ++ db_new->db_data, hepb); ++ dmu_buf_rele(db_new, FTAG); ++ ++ dmu_buf_rele(db_old, FTAG); ++ ++ tbl->zt_blks_copied++; ++ ++ dprintf("copied block %llu of %llu\n", ++ tbl->zt_blks_copied, tbl->zt_numblks); ++ ++ if (tbl->zt_blks_copied == tbl->zt_numblks) { ++ (void) dmu_free_range(zap->zap_objset, zap->zap_object, ++ tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); ++ ++ tbl->zt_blk = newblk; ++ tbl->zt_numblks *= 2; ++ tbl->zt_shift++; ++ tbl->zt_nextblk = 0; ++ tbl->zt_blks_copied = 0; ++ ++ dprintf("finished; numblocks now %llu (%lluk entries)\n", ++ tbl->zt_numblks, 1<<(tbl->zt_shift-10)); ++ } ++ ++ return (0); ++} ++ ++static int ++zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, ++ dmu_tx_t *tx) ++{ ++ int err; ++ uint64_t blk, off; ++ int bs = FZAP_BLOCK_SHIFT(zap); ++ dmu_buf_t *db; ++ ++ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ++ ASSERT(tbl->zt_blk != 0); ++ ++ dprintf("storing %llx at index %llx\n", val, idx); ++ ++ blk = idx >> (bs-3); ++ off = idx & ((1<<(bs-3))-1); ++ ++ err = dmu_buf_hold(zap->zap_objset, zap->zap_object, ++ (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); ++ if (err) ++ return (err); ++ dmu_buf_will_dirty(db, tx); ++ ++ if (tbl->zt_nextblk != 0) { ++ uint64_t idx2 = idx * 2; ++ uint64_t blk2 = idx2 >> (bs-3); ++ uint64_t off2 = idx2 & ((1<<(bs-3))-1); ++ dmu_buf_t *db2; ++ ++ err = dmu_buf_hold(zap->zap_objset, zap->zap_object, ++ (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, ++ DMU_READ_NO_PREFETCH); ++ if (err) { ++ dmu_buf_rele(db, FTAG); ++ return (err); ++ } ++ dmu_buf_will_dirty(db2, tx); ++ ((uint64_t *)db2->db_data)[off2] = val; ++ ((uint64_t *)db2->db_data)[off2+1] = val; ++ dmu_buf_rele(db2, FTAG); ++ } ++ ++ ((uint64_t *)db->db_data)[off] = val; ++ dmu_buf_rele(db, FTAG); ++ ++ return (0); ++} ++ ++static int ++zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) ++{ ++ uint64_t blk, off; ++ int err; 
++ dmu_buf_t *db; ++ int bs = FZAP_BLOCK_SHIFT(zap); ++ ++ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ++ ++ blk = idx >> (bs-3); ++ off = idx & ((1<<(bs-3))-1); ++ ++ err = dmu_buf_hold(zap->zap_objset, zap->zap_object, ++ (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); ++ if (err) ++ return (err); ++ *valp = ((uint64_t *)db->db_data)[off]; ++ dmu_buf_rele(db, FTAG); ++ ++ if (tbl->zt_nextblk != 0) { ++ /* ++ * read the nextblk for the sake of i/o error checking, ++ * so that zap_table_load() will catch errors for ++ * zap_table_store. ++ */ ++ blk = (idx*2) >> (bs-3); ++ ++ err = dmu_buf_hold(zap->zap_objset, zap->zap_object, ++ (tbl->zt_nextblk + blk) << bs, FTAG, &db, ++ DMU_READ_NO_PREFETCH); ++ dmu_buf_rele(db, FTAG); ++ } ++ return (err); ++} ++ ++/* ++ * Routines for growing the ptrtbl. ++ */ ++ ++static void ++zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) ++{ ++ int i; ++ for (i = 0; i < n; i++) { ++ uint64_t lb = src[i]; ++ dst[2*i+0] = lb; ++ dst[2*i+1] = lb; ++ } ++} ++ ++static int ++zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) ++{ ++ /* ++ * The pointer table should never use more hash bits than we ++ * have (otherwise we'd be using useless zero bits to index it). ++ * If we are within 2 bits of running out, stop growing, since ++ * this is already an aberrant condition. ++ */ ++ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) ++ return (ENOSPC); ++ ++ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { ++ /* ++ * We are outgrowing the "embedded" ptrtbl (the one ++ * stored in the header block). Give it its own entire ++ * block, which will double the size of the ptrtbl. ++ */ ++ uint64_t newblk; ++ dmu_buf_t *db_new; ++ int err; ++ ++ ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, ++ ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); ++ ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0); ++ ++ newblk = zap_allocate_blocks(zap, 1); ++ err = dmu_buf_hold(zap->zap_objset, zap->zap_object, ++ newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, ++ DMU_READ_NO_PREFETCH); ++ if (err) ++ return (err); ++ dmu_buf_will_dirty(db_new, tx); ++ zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), ++ db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); ++ dmu_buf_rele(db_new, FTAG); ++ ++ zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk; ++ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1; ++ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++; ++ ++ ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, ++ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << ++ (FZAP_BLOCK_SHIFT(zap)-3)); ++ ++ return (0); ++ } else { ++ return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl, ++ zap_ptrtbl_transfer, tx)); ++ } ++} ++ ++static void ++zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) ++{ ++ dmu_buf_will_dirty(zap->zap_dbuf, tx); ++ mutex_enter(&zap->zap_f.zap_num_entries_mtx); ++ ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta); ++ zap->zap_f.zap_phys->zap_num_entries += delta; ++ mutex_exit(&zap->zap_f.zap_num_entries_mtx); ++} ++ ++static uint64_t ++zap_allocate_blocks(zap_t *zap, int nblocks) ++{ ++ uint64_t newblk; ++ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ++ newblk = zap->zap_f.zap_phys->zap_freeblk; ++ zap->zap_f.zap_phys->zap_freeblk += nblocks; ++ return (newblk); ++} ++ ++static zap_leaf_t * ++zap_create_leaf(zap_t *zap, dmu_tx_t *tx) ++{ ++ void *winner; ++ zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_PUSHPAGE); ++ ++ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ++ ++ rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 
NULL); ++ rw_enter(&l->l_rwlock, RW_WRITER); ++ l->l_blkid = zap_allocate_blocks(zap, 1); ++ l->l_dbuf = NULL; ++ l->l_phys = NULL; ++ ++ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, ++ l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, ++ DMU_READ_NO_PREFETCH)); ++ winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout); ++ ASSERT(winner == NULL); ++ dmu_buf_will_dirty(l->l_dbuf, tx); ++ ++ zap_leaf_init(l, zap->zap_normflags != 0); ++ ++ zap->zap_f.zap_phys->zap_num_leafs++; ++ ++ return (l); ++} ++ ++int ++fzap_count(zap_t *zap, uint64_t *count) ++{ ++ ASSERT(!zap->zap_ismicro); ++ mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ ++ *count = zap->zap_f.zap_phys->zap_num_entries; ++ mutex_exit(&zap->zap_f.zap_num_entries_mtx); ++ return (0); ++} ++ ++/* ++ * Routines for obtaining zap_leaf_t's ++ */ ++ ++void ++zap_put_leaf(zap_leaf_t *l) ++{ ++ rw_exit(&l->l_rwlock); ++ dmu_buf_rele(l->l_dbuf, NULL); ++} ++ ++_NOTE(ARGSUSED(0)) ++static void ++zap_leaf_pageout(dmu_buf_t *db, void *vl) ++{ ++ zap_leaf_t *l = vl; ++ ++ rw_destroy(&l->l_rwlock); ++ kmem_free(l, sizeof (zap_leaf_t)); ++} ++ ++static zap_leaf_t * ++zap_open_leaf(uint64_t blkid, dmu_buf_t *db) ++{ ++ zap_leaf_t *l, *winner; ++ ++ ASSERT(blkid != 0); ++ ++ l = kmem_alloc(sizeof (zap_leaf_t), KM_PUSHPAGE); ++ rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL); ++ rw_enter(&l->l_rwlock, RW_WRITER); ++ l->l_blkid = blkid; ++ l->l_bs = highbit(db->db_size)-1; ++ l->l_dbuf = db; ++ l->l_phys = NULL; ++ ++ winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout); ++ ++ rw_exit(&l->l_rwlock); ++ if (winner != NULL) { ++ /* someone else set it first */ ++ zap_leaf_pageout(NULL, l); ++ l = winner; ++ } ++ ++ /* ++ * lhr_pad was previously used for the next leaf in the leaf ++ * chain. There should be no chained leafs (as we have removed ++ * support for them). ++ */ ++ ASSERT3U(l->l_phys->l_hdr.lh_pad1, ==, 0); ++ ++ /* ++ * There should be more hash entries than there can be ++ * chunks to put in the hash table ++ */ ++ ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); ++ ++ /* The chunks should begin at the end of the hash table */ ++ ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *) ++ &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); ++ ++ /* The chunks should end at the end of the block */ ++ ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - ++ (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size); ++ ++ return (l); ++} ++ ++static int ++zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, ++ zap_leaf_t **lp) ++{ ++ dmu_buf_t *db; ++ zap_leaf_t *l; ++ int bs = FZAP_BLOCK_SHIFT(zap); ++ int err; ++ ++ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ++ ++ err = dmu_buf_hold(zap->zap_objset, zap->zap_object, ++ blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); ++ if (err) ++ return (err); ++ ++ ASSERT3U(db->db_object, ==, zap->zap_object); ++ ASSERT3U(db->db_offset, ==, blkid << bs); ++ ASSERT3U(db->db_size, ==, 1 << bs); ++ ASSERT(blkid != 0); ++ ++ l = dmu_buf_get_user(db); ++ ++ if (l == NULL) ++ l = zap_open_leaf(blkid, db); ++ ++ rw_enter(&l->l_rwlock, lt); ++ /* ++ * Must lock before dirtying, otherwise l->l_phys could change, ++ * causing ASSERT below to fail. 
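zap_create_leaf() and zap_open_leaf() above cache the in-core zap_leaf_t on the dbuf and resolve the race where two threads build the cache at once by checking which pointer "won"; the loser frees its copy and adopts the winner's. A stand-alone sketch of the same idiom, reduced to a plain compare-and-swap (struct cached_leaf, dbuf_user and open_leaf_cached are simplified stand-ins, not the real DMU interfaces):

#include <stdlib.h>
#include <stdatomic.h>

struct cached_leaf {
        unsigned long long blkid;
};

/* stand-in for the per-dbuf user pointer that dmu_buf_set_user() manages */
static _Atomic(struct cached_leaf *) dbuf_user;

static struct cached_leaf *
open_leaf_cached(unsigned long long blkid)
{
        struct cached_leaf *l = calloc(1, sizeof (*l));
        struct cached_leaf *winner = NULL;

        if (l == NULL)
                return (NULL);
        l->blkid = blkid;
        /* try to install our copy; on failure 'winner' holds the existing one */
        if (!atomic_compare_exchange_strong(&dbuf_user, &winner, l)) {
                free(l);        /* someone else set it first; use theirs */
                return (winner);
        }
        return (l);
}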
++ */ ++ if (lt == RW_WRITER) ++ dmu_buf_will_dirty(db, tx); ++ ASSERT3U(l->l_blkid, ==, blkid); ++ ASSERT3P(l->l_dbuf, ==, db); ++ ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data); ++ ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF); ++ ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); ++ ++ *lp = l; ++ return (0); ++} ++ ++static int ++zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) ++{ ++ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ++ ++ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { ++ ASSERT3U(idx, <, ++ (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift)); ++ *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); ++ return (0); ++ } else { ++ return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl, ++ idx, valp)); ++ } ++} ++ ++static int ++zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) ++{ ++ ASSERT(tx != NULL); ++ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ++ ++ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) { ++ ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; ++ return (0); ++ } else { ++ return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl, ++ idx, blk, tx)); ++ } ++} ++ ++static int ++zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) ++{ ++ uint64_t idx, blk; ++ int err; ++ ++ ASSERT(zap->zap_dbuf == NULL || ++ zap->zap_f.zap_phys == zap->zap_dbuf->db_data); ++ ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC); ++ idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); ++ err = zap_idx_to_blk(zap, idx, &blk); ++ if (err != 0) ++ return (err); ++ err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); ++ ++ ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) == ++ (*lp)->l_phys->l_hdr.lh_prefix); ++ return (err); ++} ++ ++static int ++zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) ++{ ++ zap_t *zap = zn->zn_zap; ++ uint64_t hash = zn->zn_hash; ++ zap_leaf_t *nl; ++ int prefix_diff, i, err; ++ uint64_t sibling; ++ int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len; ++ ++ ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); ++ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ++ ++ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, ++ l->l_phys->l_hdr.lh_prefix); ++ ++ if (zap_tryupgradedir(zap, tx) == 0 || ++ old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { ++ /* We failed to upgrade, or need to grow the pointer table */ ++ objset_t *os = zap->zap_objset; ++ uint64_t object = zap->zap_object; ++ ++ zap_put_leaf(l); ++ zap_unlockdir(zap); ++ err = zap_lockdir(os, object, tx, RW_WRITER, ++ FALSE, FALSE, &zn->zn_zap); ++ zap = zn->zn_zap; ++ if (err) ++ return (err); ++ ASSERT(!zap->zap_ismicro); ++ ++ while (old_prefix_len == ++ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { ++ err = zap_grow_ptrtbl(zap, tx); ++ if (err) ++ return (err); ++ } ++ ++ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); ++ if (err) ++ return (err); ++ ++ if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) { ++ /* it split while our locks were down */ ++ *lp = l; ++ return (0); ++ } ++ } ++ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ++ ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); ++ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, ++ l->l_phys->l_hdr.lh_prefix); ++ ++ prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - ++ (old_prefix_len + 1); ++ sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; ++ ++ /* check for i/o errors before doing zap_leaf_split */ ++ for (i = 0; i < (1ULL<l_blkid); ++ } ++ ++ nl = 
zap_create_leaf(zap, tx); ++ zap_leaf_split(l, nl, zap->zap_normflags != 0); ++ ++ /* set sibling pointers */ ++ for (i = 0; i < (1ULL<l_blkid, tx); ++ ASSERT3U(err, ==, 0); /* we checked for i/o errors above */ ++ } ++ ++ if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) { ++ /* we want the sibling */ ++ zap_put_leaf(l); ++ *lp = nl; ++ } else { ++ zap_put_leaf(nl); ++ *lp = l; ++ } ++ ++ return (0); ++} ++ ++static void ++zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) ++{ ++ zap_t *zap = zn->zn_zap; ++ int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; ++ int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift && ++ l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); ++ ++ zap_put_leaf(l); ++ ++ if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) { ++ int err; ++ ++ /* ++ * We are in the middle of growing the pointer table, or ++ * this leaf will soon make us grow it. ++ */ ++ if (zap_tryupgradedir(zap, tx) == 0) { ++ objset_t *os = zap->zap_objset; ++ uint64_t zapobj = zap->zap_object; ++ ++ zap_unlockdir(zap); ++ err = zap_lockdir(os, zapobj, tx, ++ RW_WRITER, FALSE, FALSE, &zn->zn_zap); ++ zap = zn->zn_zap; ++ if (err) ++ return; ++ } ++ ++ /* could have finished growing while our locks were down */ ++ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift) ++ (void) zap_grow_ptrtbl(zap, tx); ++ } ++} ++ ++static int ++fzap_checkname(zap_name_t *zn) ++{ ++ if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) ++ return (ENAMETOOLONG); ++ return (0); ++} ++ ++static int ++fzap_checksize(uint64_t integer_size, uint64_t num_integers) ++{ ++ /* Only integer sizes supported by C */ ++ switch (integer_size) { ++ case 1: ++ case 2: ++ case 4: ++ case 8: ++ break; ++ default: ++ return (EINVAL); ++ } ++ ++ if (integer_size * num_integers > ZAP_MAXVALUELEN) ++ return (E2BIG); ++ ++ return (0); ++} ++ ++static int ++fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) ++{ ++ int err; ++ ++ if ((err = fzap_checkname(zn)) != 0) ++ return (err); ++ return (fzap_checksize(integer_size, num_integers)); ++} ++ ++/* ++ * Routines for manipulating attributes. 
++ */ ++int ++fzap_lookup(zap_name_t *zn, ++ uint64_t integer_size, uint64_t num_integers, void *buf, ++ char *realname, int rn_len, boolean_t *ncp) ++{ ++ zap_leaf_t *l; ++ int err; ++ zap_entry_handle_t zeh; ++ ++ if ((err = fzap_checkname(zn)) != 0) ++ return (err); ++ ++ err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); ++ if (err != 0) ++ return (err); ++ err = zap_leaf_lookup(l, zn, &zeh); ++ if (err == 0) { ++ if ((err = fzap_checksize(integer_size, num_integers)) != 0) { ++ zap_put_leaf(l); ++ return (err); ++ } ++ ++ err = zap_entry_read(&zeh, integer_size, num_integers, buf); ++ (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); ++ if (ncp) { ++ *ncp = zap_entry_normalization_conflict(&zeh, ++ zn, NULL, zn->zn_zap); ++ } ++ } ++ ++ zap_put_leaf(l); ++ return (err); ++} ++ ++int ++fzap_add_cd(zap_name_t *zn, ++ uint64_t integer_size, uint64_t num_integers, ++ const void *val, uint32_t cd, dmu_tx_t *tx) ++{ ++ zap_leaf_t *l; ++ int err; ++ zap_entry_handle_t zeh; ++ zap_t *zap = zn->zn_zap; ++ ++ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ++ ASSERT(!zap->zap_ismicro); ++ ASSERT(fzap_check(zn, integer_size, num_integers) == 0); ++ ++ err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); ++ if (err != 0) ++ return (err); ++retry: ++ err = zap_leaf_lookup(l, zn, &zeh); ++ if (err == 0) { ++ err = EEXIST; ++ goto out; ++ } ++ if (err != ENOENT) ++ goto out; ++ ++ err = zap_entry_create(l, zn, cd, ++ integer_size, num_integers, val, &zeh); ++ ++ if (err == 0) { ++ zap_increment_num_entries(zap, 1, tx); ++ } else if (err == EAGAIN) { ++ err = zap_expand_leaf(zn, l, tx, &l); ++ zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ ++ if (err == 0) ++ goto retry; ++ } ++ ++out: ++ if (zap != NULL) ++ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); ++ return (err); ++} ++ ++int ++fzap_add(zap_name_t *zn, ++ uint64_t integer_size, uint64_t num_integers, ++ const void *val, dmu_tx_t *tx) ++{ ++ int err = fzap_check(zn, integer_size, num_integers); ++ if (err != 0) ++ return (err); ++ ++ return (fzap_add_cd(zn, integer_size, num_integers, ++ val, ZAP_NEED_CD, tx)); ++} ++ ++int ++fzap_update(zap_name_t *zn, ++ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) ++{ ++ zap_leaf_t *l; ++ int err, create; ++ zap_entry_handle_t zeh; ++ zap_t *zap = zn->zn_zap; ++ ++ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ++ err = fzap_check(zn, integer_size, num_integers); ++ if (err != 0) ++ return (err); ++ ++ err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); ++ if (err != 0) ++ return (err); ++retry: ++ err = zap_leaf_lookup(l, zn, &zeh); ++ create = (err == ENOENT); ++ ASSERT(err == 0 || err == ENOENT); ++ ++ if (create) { ++ err = zap_entry_create(l, zn, ZAP_NEED_CD, ++ integer_size, num_integers, val, &zeh); ++ if (err == 0) ++ zap_increment_num_entries(zap, 1, tx); ++ } else { ++ err = zap_entry_update(&zeh, integer_size, num_integers, val); ++ } ++ ++ if (err == EAGAIN) { ++ err = zap_expand_leaf(zn, l, tx, &l); ++ zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ ++ if (err == 0) ++ goto retry; ++ } ++ ++ if (zap != NULL) ++ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); ++ return (err); ++} ++ ++int ++fzap_length(zap_name_t *zn, ++ uint64_t *integer_size, uint64_t *num_integers) ++{ ++ zap_leaf_t *l; ++ int err; ++ zap_entry_handle_t zeh; ++ ++ err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); ++ if (err != 0) ++ return (err); ++ err = zap_leaf_lookup(l, zn, &zeh); ++ if (err != 0) ++ goto out; ++ ++ if (integer_size) ++ 
*integer_size = zeh.zeh_integer_size; ++ if (num_integers) ++ *num_integers = zeh.zeh_num_integers; ++out: ++ zap_put_leaf(l); ++ return (err); ++} ++ ++int ++fzap_remove(zap_name_t *zn, dmu_tx_t *tx) ++{ ++ zap_leaf_t *l; ++ int err; ++ zap_entry_handle_t zeh; ++ ++ err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); ++ if (err != 0) ++ return (err); ++ err = zap_leaf_lookup(l, zn, &zeh); ++ if (err == 0) { ++ zap_entry_remove(&zeh); ++ zap_increment_num_entries(zn->zn_zap, -1, tx); ++ } ++ zap_put_leaf(l); ++ return (err); ++} ++ ++void ++fzap_prefetch(zap_name_t *zn) ++{ ++ uint64_t idx, blk; ++ zap_t *zap = zn->zn_zap; ++ int bs; ++ ++ idx = ZAP_HASH_IDX(zn->zn_hash, ++ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); ++ if (zap_idx_to_blk(zap, idx, &blk) != 0) ++ return; ++ bs = FZAP_BLOCK_SHIFT(zap); ++ dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs); ++} ++ ++/* ++ * Helper functions for consumers. ++ */ ++ ++int ++zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, ++ char *name) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t *za; ++ int err; ++ ++ if (mask == 0) ++ mask = -1ULL; ++ ++ za = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); ++ for (zap_cursor_init(&zc, os, zapobj); ++ (err = zap_cursor_retrieve(&zc, za)) == 0; ++ zap_cursor_advance(&zc)) { ++ if ((za->za_first_integer & mask) == (value & mask)) { ++ (void) strcpy(name, za->za_name); ++ break; ++ } ++ } ++ zap_cursor_fini(&zc); ++ kmem_free(za, sizeof (zap_attribute_t)); ++ return (err); ++} ++ ++int ++zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ int err; ++ ++ for (zap_cursor_init(&zc, os, fromobj); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ (void) zap_cursor_advance(&zc)) { ++ if (za.za_integer_length != 8 || za.za_num_integers != 1) ++ return (EINVAL); ++ err = zap_add(os, intoobj, za.za_name, ++ 8, 1, &za.za_first_integer, tx); ++ if (err) ++ return (err); ++ } ++ zap_cursor_fini(&zc); ++ return (0); ++} ++ ++int ++zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, ++ uint64_t value, dmu_tx_t *tx) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ int err; ++ ++ for (zap_cursor_init(&zc, os, fromobj); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ (void) zap_cursor_advance(&zc)) { ++ if (za.za_integer_length != 8 || za.za_num_integers != 1) ++ return (EINVAL); ++ err = zap_add(os, intoobj, za.za_name, ++ 8, 1, &value, tx); ++ if (err) ++ return (err); ++ } ++ zap_cursor_fini(&zc); ++ return (0); ++} ++ ++int ++zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, ++ dmu_tx_t *tx) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ int err; ++ ++ for (zap_cursor_init(&zc, os, fromobj); ++ zap_cursor_retrieve(&zc, &za) == 0; ++ (void) zap_cursor_advance(&zc)) { ++ uint64_t delta = 0; ++ ++ if (za.za_integer_length != 8 || za.za_num_integers != 1) ++ return (EINVAL); ++ ++ err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta); ++ if (err != 0 && err != ENOENT) ++ return (err); ++ delta += za.za_first_integer; ++ err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx); ++ if (err) ++ return (err); ++ } ++ zap_cursor_fini(&zc); ++ return (0); ++} ++ ++int ++zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) ++{ ++ char name[20]; ++ ++ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); ++ return (zap_add(os, obj, name, 8, 1, &value, tx)); ++} ++ ++int ++zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) ++{ ++ char 
name[20]; ++ ++ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); ++ return (zap_remove(os, obj, name, tx)); ++} ++ ++int ++zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) ++{ ++ char name[20]; ++ ++ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); ++ return (zap_lookup(os, obj, name, 8, 1, &value)); ++} ++ ++int ++zap_add_int_key(objset_t *os, uint64_t obj, ++ uint64_t key, uint64_t value, dmu_tx_t *tx) ++{ ++ char name[20]; ++ ++ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); ++ return (zap_add(os, obj, name, 8, 1, &value, tx)); ++} ++ ++int ++zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) ++{ ++ char name[20]; ++ ++ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); ++ return (zap_lookup(os, obj, name, 8, 1, valuep)); ++} ++ ++int ++zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, ++ dmu_tx_t *tx) ++{ ++ uint64_t value = 0; ++ int err; ++ ++ if (delta == 0) ++ return (0); ++ ++ err = zap_lookup(os, obj, name, 8, 1, &value); ++ if (err != 0 && err != ENOENT) ++ return (err); ++ value += delta; ++ if (value == 0) ++ err = zap_remove(os, obj, name, tx); ++ else ++ err = zap_update(os, obj, name, 8, 1, &value, tx); ++ return (err); ++} ++ ++int ++zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, ++ dmu_tx_t *tx) ++{ ++ char name[20]; ++ ++ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); ++ return (zap_increment(os, obj, name, delta, tx)); ++} ++ ++/* ++ * Routines for iterating over the attributes. ++ */ ++ ++int ++fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) ++{ ++ int err = ENOENT; ++ zap_entry_handle_t zeh; ++ zap_leaf_t *l; ++ ++ /* retrieve the next entry at or after zc_hash/zc_cd */ ++ /* if no entry, return ENOENT */ ++ ++ if (zc->zc_leaf && ++ (ZAP_HASH_IDX(zc->zc_hash, ++ zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) != ++ zc->zc_leaf->l_phys->l_hdr.lh_prefix)) { ++ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); ++ zap_put_leaf(zc->zc_leaf); ++ zc->zc_leaf = NULL; ++ } ++ ++again: ++ if (zc->zc_leaf == NULL) { ++ err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, ++ &zc->zc_leaf); ++ if (err != 0) ++ return (err); ++ } else { ++ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); ++ } ++ l = zc->zc_leaf; ++ ++ err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); ++ ++ if (err == ENOENT) { ++ uint64_t nocare = ++ (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1; ++ zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; ++ zc->zc_cd = 0; ++ if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) { ++ zc->zc_hash = -1ULL; ++ } else { ++ zap_put_leaf(zc->zc_leaf); ++ zc->zc_leaf = NULL; ++ goto again; ++ } ++ } ++ ++ if (err == 0) { ++ zc->zc_hash = zeh.zeh_hash; ++ zc->zc_cd = zeh.zeh_cd; ++ za->za_integer_length = zeh.zeh_integer_size; ++ za->za_num_integers = zeh.zeh_num_integers; ++ if (zeh.zeh_num_integers == 0) { ++ za->za_first_integer = 0; ++ } else { ++ err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); ++ ASSERT(err == 0 || err == EOVERFLOW); ++ } ++ err = zap_entry_read_name(zap, &zeh, ++ sizeof (za->za_name), za->za_name); ++ ASSERT(err == 0); ++ ++ za->za_normalization_conflict = ++ zap_entry_normalization_conflict(&zeh, ++ NULL, za->za_name, zap); ++ } ++ rw_exit(&zc->zc_leaf->l_rwlock); ++ return (err); ++} ++ ++static void ++zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) ++{ ++ int i, err; ++ uint64_t lastblk = 0; ++ ++ /* ++ * NB: if a leaf 
has more pointers than an entire ptrtbl block ++ * can hold, then it'll be accounted for more than once, since ++ * we won't have lastblk. ++ */ ++ for (i = 0; i < len; i++) { ++ zap_leaf_t *l; ++ ++ if (tbl[i] == lastblk) ++ continue; ++ lastblk = tbl[i]; ++ ++ err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); ++ if (err == 0) { ++ zap_leaf_stats(zap, l, zs); ++ zap_put_leaf(l); ++ } ++ } ++} ++ ++int ++fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn) ++{ ++ int err; ++ zap_leaf_t *l; ++ zap_entry_handle_t zeh; ++ ++ if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) ++ return (ENAMETOOLONG); ++ ++ err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l); ++ if (err != 0) ++ return (err); ++ ++ err = zap_leaf_lookup(l, zn, &zeh); ++ if (err != 0) ++ return (err); ++ ++ zc->zc_leaf = l; ++ zc->zc_hash = zeh.zeh_hash; ++ zc->zc_cd = zeh.zeh_cd; ++ ++ return (err); ++} ++ ++void ++fzap_get_stats(zap_t *zap, zap_stats_t *zs) ++{ ++ int bs = FZAP_BLOCK_SHIFT(zap); ++ zs->zs_blocksize = 1ULL << bs; ++ ++ /* ++ * Set zap_phys_t fields ++ */ ++ zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs; ++ zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries; ++ zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk; ++ zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type; ++ zs->zs_magic = zap->zap_f.zap_phys->zap_magic; ++ zs->zs_salt = zap->zap_f.zap_phys->zap_salt; ++ ++ /* ++ * Set zap_ptrtbl fields ++ */ ++ zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; ++ zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk; ++ zs->zs_ptrtbl_blks_copied = ++ zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied; ++ zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk; ++ zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; ++ zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; ++ ++ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { ++ /* the ptrtbl is entirely in the header block. */ ++ zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), ++ 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); ++ } else { ++ int b; ++ ++ dmu_prefetch(zap->zap_objset, zap->zap_object, ++ zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs, ++ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs); ++ ++ for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; ++ b++) { ++ dmu_buf_t *db; ++ int err; ++ ++ err = dmu_buf_hold(zap->zap_objset, zap->zap_object, ++ (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs, ++ FTAG, &db, DMU_READ_NO_PREFETCH); ++ if (err == 0) { ++ zap_stats_ptrtbl(zap, db->db_data, ++ 1<<(bs-3), zs); ++ dmu_buf_rele(db, FTAG); ++ } ++ } ++ } ++} ++ ++int ++fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, ++ uint64_t *tooverwrite) ++{ ++ zap_t *zap = zn->zn_zap; ++ zap_leaf_t *l; ++ int err; ++ ++ /* ++ * Account for the header block of the fatzap. ++ */ ++ if (!add && dmu_buf_freeable(zap->zap_dbuf)) { ++ *tooverwrite += zap->zap_dbuf->db_size; ++ } else { ++ *towrite += zap->zap_dbuf->db_size; ++ } ++ ++ /* ++ * Account for the pointer table blocks. ++ * If we are adding we need to account for the following cases : ++ * - If the pointer table is embedded, this operation could force an ++ * external pointer table. ++ * - If this already has an external pointer table this operation ++ * could extend the table. 
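fzap_count_write() below only produces a worst-case estimate of the space an operation may dirty: the header block, the pointer-table blocks (charged only for adds, and more generously once the table is external), and the leaf, doubled when an add could split it. A compact restatement of that arithmetic, illustrative only, using a single block size for both the header and the leaf dbuf:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

static uint64_t
fzap_write_estimate(uint64_t blocksize, bool add, bool ptrtbl_embedded)
{
        uint64_t towrite = 0;

        towrite += blocksize;                           /* fatzap header block */
        if (add)                                        /* pointer-table blocks */
                towrite += ptrtbl_embedded ? blocksize : 3 * blocksize;
        towrite += (add ? 2 : 1) * blocksize;           /* leaf, plus one for a split */
        return (towrite);
}

int
main(void)
{
        /* 16k blocks, adding while the pointer table is still embedded */
        printf("%llu\n", (unsigned long long)fzap_write_estimate(16384, true, true));
        return (0);
}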
++ */ ++ if (add) { ++ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) ++ *towrite += zap->zap_dbuf->db_size; ++ else ++ *towrite += (zap->zap_dbuf->db_size * 3); ++ } ++ ++ /* ++ * Now, check if the block containing leaf is freeable ++ * and account accordingly. ++ */ ++ err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l); ++ if (err != 0) { ++ return (err); ++ } ++ ++ if (!add && dmu_buf_freeable(l->l_dbuf)) { ++ *tooverwrite += l->l_dbuf->db_size; ++ } else { ++ /* ++ * If this an add operation, the leaf block could split. ++ * Hence, we need to account for an additional leaf block. ++ */ ++ *towrite += (add ? 2 : 1) * l->l_dbuf->db_size; ++ } ++ ++ zap_put_leaf(l); ++ return (0); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zap_leaf.c linux-3.2.33-go/fs/zfs/zfs/zap_leaf.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zap_leaf.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zap_leaf.c 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,872 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* ++ * The 512-byte leaf is broken into 32 16-byte chunks. ++ * chunk number n means l_chunk[n], even though the header precedes it. ++ * the names are stored null-terminated. 
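Within a leaf, names and values are stored as chains of fixed-size array chunks linked by 16-bit chunk numbers and terminated by CHAIN_END, which is what zap_leaf_array_create() and zap_leaf_array_read() below implement. A stand-alone sketch of that encoding; the 21-byte payload, the trivial allocator and the missing exhaustion check are simplifications for illustration, not the real ZAP_LEAF_ARRAY_BYTES layout:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NCHUNKS         32
#define PAYLOAD         21              /* illustrative chunk payload size */
#define CHAIN_END       0xffff

struct chunk {
        uint8_t  payload[PAYLOAD];
        uint16_t next;                  /* index of the next chunk, or CHAIN_END */
};

static struct chunk chunks[NCHUNKS];

static uint16_t
chunk_alloc(void)
{
        static uint16_t next_free;      /* trivial allocator, no exhaustion check */
        return (next_free++);
}

/* store 'len' bytes of 'buf' as a chunk chain; returns the head chunk number */
static uint16_t
array_create(const uint8_t *buf, size_t len)
{
        uint16_t head = CHAIN_END;
        uint16_t *linkp = &head;

        while (len > 0) {
                uint16_t c = chunk_alloc();
                size_t n = len < PAYLOAD ? len : PAYLOAD;

                memcpy(chunks[c].payload, buf, n);
                chunks[c].next = CHAIN_END;
                *linkp = c;             /* link into the chain */
                linkp = &chunks[c].next;
                buf += n;
                len -= n;
        }
        return (head);
}

int
main(void)
{
        const char *name = "a zap entry name, stored null-terminated";
        uint16_t head = array_create((const uint8_t *)name, strlen(name) + 1);

        printf("head chunk %u, next %u\n", head, chunks[head].next);
        return (0);
}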
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); ++ ++#define CHAIN_END 0xffff /* end of the chunk chain */ ++ ++/* half the (current) minimum block size */ ++#define MAX_ARRAY_BYTES (8<<10) ++ ++#define LEAF_HASH(l, h) \ ++ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ ++ ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len))) ++ ++#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)]) ++ ++ ++static void ++zap_memset(void *a, int c, size_t n) ++{ ++ char *cp = a; ++ char *cpend = cp + n; ++ ++ while (cp < cpend) ++ *cp++ = c; ++} ++ ++static void ++stv(int len, void *addr, uint64_t value) ++{ ++ switch (len) { ++ case 1: ++ *(uint8_t *)addr = value; ++ return; ++ case 2: ++ *(uint16_t *)addr = value; ++ return; ++ case 4: ++ *(uint32_t *)addr = value; ++ return; ++ case 8: ++ *(uint64_t *)addr = value; ++ return; ++ } ++ ASSERT(!"bad int len"); ++} ++ ++static uint64_t ++ldv(int len, const void *addr) ++{ ++ switch (len) { ++ case 1: ++ return (*(uint8_t *)addr); ++ case 2: ++ return (*(uint16_t *)addr); ++ case 4: ++ return (*(uint32_t *)addr); ++ case 8: ++ return (*(uint64_t *)addr); ++ } ++ ASSERT(!"bad int len"); ++ return (0xFEEDFACEDEADBEEFULL); ++} ++ ++void ++zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) ++{ ++ int i; ++ zap_leaf_t l; ++ l.l_bs = highbit(size)-1; ++ l.l_phys = buf; ++ ++ buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); ++ buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); ++ buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic); ++ buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree); ++ buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries); ++ buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); ++ buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); ++ ++ for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) ++ buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); ++ ++ for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { ++ zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); ++ struct zap_leaf_entry *le; ++ ++ switch (lc->l_free.lf_type) { ++ case ZAP_CHUNK_ENTRY: ++ le = &lc->l_entry; ++ ++ le->le_type = BSWAP_8(le->le_type); ++ le->le_value_intlen = BSWAP_8(le->le_value_intlen); ++ le->le_next = BSWAP_16(le->le_next); ++ le->le_name_chunk = BSWAP_16(le->le_name_chunk); ++ le->le_name_numints = BSWAP_16(le->le_name_numints); ++ le->le_value_chunk = BSWAP_16(le->le_value_chunk); ++ le->le_value_numints = BSWAP_16(le->le_value_numints); ++ le->le_cd = BSWAP_32(le->le_cd); ++ le->le_hash = BSWAP_64(le->le_hash); ++ break; ++ case ZAP_CHUNK_FREE: ++ lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type); ++ lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next); ++ break; ++ case ZAP_CHUNK_ARRAY: ++ lc->l_array.la_type = BSWAP_8(lc->l_array.la_type); ++ lc->l_array.la_next = BSWAP_16(lc->l_array.la_next); ++ /* la_array doesn't need swapping */ ++ break; ++ default: ++ ASSERT(!"bad leaf type"); ++ } ++ } ++} ++ ++void ++zap_leaf_init(zap_leaf_t *l, boolean_t sort) ++{ ++ int i; ++ ++ l->l_bs = highbit(l->l_dbuf->db_size)-1; ++ zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header)); ++ zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); ++ for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ++ ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ++ ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; ++ } ++ ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END; ++ 
l->l_phys->l_hdr.lh_block_type = ZBT_LEAF; ++ l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC; ++ l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); ++ if (sort) ++ l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; ++} ++ ++/* ++ * Routines which manipulate leaf chunks (l_chunk[]). ++ */ ++ ++static uint16_t ++zap_leaf_chunk_alloc(zap_leaf_t *l) ++{ ++ int chunk; ++ ++ ASSERT(l->l_phys->l_hdr.lh_nfree > 0); ++ ++ chunk = l->l_phys->l_hdr.lh_freelist; ++ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ++ ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); ++ ++ l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; ++ ++ l->l_phys->l_hdr.lh_nfree--; ++ ++ return (chunk); ++} ++ ++static void ++zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) ++{ ++ struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; ++ ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); ++ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ++ ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); ++ ++ zlf->lf_type = ZAP_CHUNK_FREE; ++ zlf->lf_next = l->l_phys->l_hdr.lh_freelist; ++ bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ ++ l->l_phys->l_hdr.lh_freelist = chunk; ++ ++ l->l_phys->l_hdr.lh_nfree++; ++} ++ ++/* ++ * Routines which manipulate leaf arrays (zap_leaf_array type chunks). ++ */ ++ ++static uint16_t ++zap_leaf_array_create(zap_leaf_t *l, const char *buf, ++ int integer_size, int num_integers) ++{ ++ uint16_t chunk_head; ++ uint16_t *chunkp = &chunk_head; ++ int byten = 0; ++ uint64_t value = 0; ++ int shift = (integer_size-1)*8; ++ int len = num_integers; ++ ++ ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES); ++ ++ while (len > 0) { ++ uint16_t chunk = zap_leaf_chunk_alloc(l); ++ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; ++ int i; ++ ++ la->la_type = ZAP_CHUNK_ARRAY; ++ for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { ++ if (byten == 0) ++ value = ldv(integer_size, buf); ++ la->la_array[i] = value >> shift; ++ value <<= 8; ++ if (++byten == integer_size) { ++ byten = 0; ++ buf += integer_size; ++ if (--len == 0) ++ break; ++ } ++ } ++ ++ *chunkp = chunk; ++ chunkp = &la->la_next; ++ } ++ *chunkp = CHAIN_END; ++ ++ return (chunk_head); ++} ++ ++static void ++zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) ++{ ++ uint16_t chunk = *chunkp; ++ ++ *chunkp = CHAIN_END; ++ ++ while (chunk != CHAIN_END) { ++ int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; ++ ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, ++ ZAP_CHUNK_ARRAY); ++ zap_leaf_chunk_free(l, chunk); ++ chunk = nextchunk; ++ } ++} ++ ++/* array_len and buf_len are in integers, not bytes */ ++static void ++zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, ++ int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, ++ void *buf) ++{ ++ int len = MIN(array_len, buf_len); ++ int byten = 0; ++ uint64_t value = 0; ++ char *p = buf; ++ ++ ASSERT3U(array_int_len, <=, buf_int_len); ++ ++ /* Fast path for one 8-byte integer */ ++ if (array_int_len == 8 && buf_int_len == 8 && len == 1) { ++ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; ++ uint8_t *ip = la->la_array; ++ uint64_t *buf64 = buf; ++ ++ *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | ++ (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | ++ (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 | ++ (uint64_t)ip[6] << 8 | (uint64_t)ip[7]; ++ return; ++ } ++ ++ /* Fast path for an array of 1-byte integers (eg. 
the entry name) */ ++ if (array_int_len == 1 && buf_int_len == 1 && ++ buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) { ++ while (chunk != CHAIN_END) { ++ struct zap_leaf_array *la = ++ &ZAP_LEAF_CHUNK(l, chunk).l_array; ++ bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES); ++ p += ZAP_LEAF_ARRAY_BYTES; ++ chunk = la->la_next; ++ } ++ return; ++ } ++ ++ while (len > 0) { ++ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; ++ int i; ++ ++ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ++ for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { ++ value = (value << 8) | la->la_array[i]; ++ byten++; ++ if (byten == array_int_len) { ++ stv(buf_int_len, p, value); ++ byten = 0; ++ len--; ++ if (len == 0) ++ return; ++ p += buf_int_len; ++ } ++ } ++ chunk = la->la_next; ++ } ++} ++ ++static boolean_t ++zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, ++ int chunk, int array_numints) ++{ ++ int bseen = 0; ++ ++ if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { ++ uint64_t *thiskey; ++ boolean_t match; ++ ++ ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); ++ thiskey = kmem_alloc(array_numints * sizeof (*thiskey), ++ KM_PUSHPAGE); ++ ++ zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, ++ sizeof (*thiskey), array_numints, thiskey); ++ match = bcmp(thiskey, zn->zn_key_orig, ++ array_numints * sizeof (*thiskey)) == 0; ++ kmem_free(thiskey, array_numints * sizeof (*thiskey)); ++ return (match); ++ } ++ ++ ASSERT(zn->zn_key_intlen == 1); ++ if (zn->zn_matchtype == MT_FIRST) { ++ char *thisname = kmem_alloc(array_numints, KM_PUSHPAGE); ++ boolean_t match; ++ ++ zap_leaf_array_read(l, chunk, sizeof (char), array_numints, ++ sizeof (char), array_numints, thisname); ++ match = zap_match(zn, thisname); ++ kmem_free(thisname, array_numints); ++ return (match); ++ } ++ ++ /* ++ * Fast path for exact matching. ++ * First check that the lengths match, so that we don't read ++ * past the end of the zn_key_orig array. ++ */ ++ if (array_numints != zn->zn_key_orig_numints) ++ return (B_FALSE); ++ while (bseen < array_numints) { ++ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; ++ int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); ++ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ++ if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) ++ break; ++ chunk = la->la_next; ++ bseen += toread; ++ } ++ return (bseen == array_numints); ++} ++ ++/* ++ * Routines which manipulate leaf entries. ++ */ ++ ++int ++zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) ++{ ++ uint16_t *chunkp; ++ struct zap_leaf_entry *le; ++ ++ ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); ++ ++again: ++ for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); ++ *chunkp != CHAIN_END; chunkp = &le->le_next) { ++ uint16_t chunk = *chunkp; ++ le = ZAP_LEAF_ENTRY(l, chunk); ++ ++ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ++ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); ++ ++ if (le->le_hash != zn->zn_hash) ++ continue; ++ ++ /* ++ * NB: the entry chain is always sorted by cd on ++ * normalized zap objects, so this will find the ++ * lowest-cd match for MT_FIRST. 
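As the comment above notes, entries that share the same 64-bit hash are distinguished by a collision differentiator (cd), and on normalized objects the per-bucket chain is kept sorted by cd. That ordering is what lets zap_entry_create() further down assign the lowest unused cd in a single pass. A stand-alone illustration of that pass, using a plain sorted array in place of the chunk chain:

#include <stdint.h>
#include <stdio.h>

/*
 * Given the cds already used by entries with a particular hash, in
 * ascending order, return the lowest cd not yet in use.
 */
static uint32_t
lowest_unused_cd(const uint32_t *sorted_cds, int n)
{
        uint32_t cd = 0;
        int i;

        for (i = 0; i < n; i++) {
                if (sorted_cds[i] > cd)
                        break;                  /* found a gap */
                if (sorted_cds[i] == cd)
                        cd++;
        }
        return (cd);
}

int
main(void)
{
        uint32_t cds[] = { 0, 1, 3 };

        printf("next cd = %u\n", lowest_unused_cd(cds, 3));     /* prints 2 */
        return (0);
}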
++ */ ++ ASSERT(zn->zn_matchtype == MT_EXACT || ++ (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); ++ if (zap_leaf_array_match(l, zn, le->le_name_chunk, ++ le->le_name_numints)) { ++ zeh->zeh_num_integers = le->le_value_numints; ++ zeh->zeh_integer_size = le->le_value_intlen; ++ zeh->zeh_cd = le->le_cd; ++ zeh->zeh_hash = le->le_hash; ++ zeh->zeh_chunkp = chunkp; ++ zeh->zeh_leaf = l; ++ return (0); ++ } ++ } ++ ++ /* ++ * NB: we could of course do this in one pass, but that would be ++ * a pain. We'll see if MT_BEST is even used much. ++ */ ++ if (zn->zn_matchtype == MT_BEST) { ++ zn->zn_matchtype = MT_FIRST; ++ goto again; ++ } ++ ++ return (ENOENT); ++} ++ ++/* Return (h1,cd1 >= h2,cd2) */ ++#define HCD_GTEQ(h1, cd1, h2, cd2) \ ++ ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE)) ++ ++int ++zap_leaf_lookup_closest(zap_leaf_t *l, ++ uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) ++{ ++ uint16_t chunk; ++ uint64_t besth = -1ULL; ++ uint32_t bestcd = -1U; ++ uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; ++ uint16_t lh; ++ struct zap_leaf_entry *le; ++ ++ ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); ++ ++ for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { ++ for (chunk = l->l_phys->l_hash[lh]; ++ chunk != CHAIN_END; chunk = le->le_next) { ++ le = ZAP_LEAF_ENTRY(l, chunk); ++ ++ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ++ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); ++ ++ if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) && ++ HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) { ++ ASSERT3U(bestlh, >=, lh); ++ bestlh = lh; ++ besth = le->le_hash; ++ bestcd = le->le_cd; ++ ++ zeh->zeh_num_integers = le->le_value_numints; ++ zeh->zeh_integer_size = le->le_value_intlen; ++ zeh->zeh_cd = le->le_cd; ++ zeh->zeh_hash = le->le_hash; ++ zeh->zeh_fakechunk = chunk; ++ zeh->zeh_chunkp = &zeh->zeh_fakechunk; ++ zeh->zeh_leaf = l; ++ } ++ } ++ } ++ ++ return (bestcd == -1U ? 
ENOENT : 0); ++} ++ ++int ++zap_entry_read(const zap_entry_handle_t *zeh, ++ uint8_t integer_size, uint64_t num_integers, void *buf) ++{ ++ struct zap_leaf_entry *le = ++ ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ++ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); ++ ++ if (le->le_value_intlen > integer_size) ++ return (EINVAL); ++ ++ zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, ++ le->le_value_intlen, le->le_value_numints, ++ integer_size, num_integers, buf); ++ ++ if (zeh->zeh_num_integers > num_integers) ++ return (EOVERFLOW); ++ return (0); ++ ++} ++ ++int ++zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, ++ char *buf) ++{ ++ struct zap_leaf_entry *le = ++ ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ++ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); ++ ++ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { ++ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8, ++ le->le_name_numints, 8, buflen / 8, buf); ++ } else { ++ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, ++ le->le_name_numints, 1, buflen, buf); ++ } ++ if (le->le_name_numints > buflen) ++ return (EOVERFLOW); ++ return (0); ++} ++ ++int ++zap_entry_update(zap_entry_handle_t *zeh, ++ uint8_t integer_size, uint64_t num_integers, const void *buf) ++{ ++ int delta_chunks; ++ zap_leaf_t *l = zeh->zeh_leaf; ++ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); ++ ++ delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - ++ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); ++ ++ if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) ++ return (EAGAIN); ++ ++ zap_leaf_array_free(l, &le->le_value_chunk); ++ le->le_value_chunk = ++ zap_leaf_array_create(l, buf, integer_size, num_integers); ++ le->le_value_numints = num_integers; ++ le->le_value_intlen = integer_size; ++ return (0); ++} ++ ++void ++zap_entry_remove(zap_entry_handle_t *zeh) ++{ ++ uint16_t entry_chunk; ++ struct zap_leaf_entry *le; ++ zap_leaf_t *l = zeh->zeh_leaf; ++ ++ ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); ++ ++ entry_chunk = *zeh->zeh_chunkp; ++ le = ZAP_LEAF_ENTRY(l, entry_chunk); ++ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); ++ ++ zap_leaf_array_free(l, &le->le_name_chunk); ++ zap_leaf_array_free(l, &le->le_value_chunk); ++ ++ *zeh->zeh_chunkp = le->le_next; ++ zap_leaf_chunk_free(l, entry_chunk); ++ ++ l->l_phys->l_hdr.lh_nentries--; ++} ++ ++int ++zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, ++ uint8_t integer_size, uint64_t num_integers, const void *buf, ++ zap_entry_handle_t *zeh) ++{ ++ uint16_t chunk; ++ uint16_t *chunkp; ++ struct zap_leaf_entry *le; ++ uint64_t valuelen; ++ int numchunks; ++ uint64_t h = zn->zn_hash; ++ ++ valuelen = integer_size * num_integers; ++ ++ numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * ++ zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); ++ if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) ++ return (E2BIG); ++ ++ if (cd == ZAP_NEED_CD) { ++ /* find the lowest unused cd */ ++ if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { ++ cd = 0; ++ ++ for (chunk = *LEAF_HASH_ENTPTR(l, h); ++ chunk != CHAIN_END; chunk = le->le_next) { ++ le = ZAP_LEAF_ENTRY(l, chunk); ++ if (le->le_cd > cd) ++ break; ++ if (le->le_hash == h) { ++ ASSERT3U(cd, ==, le->le_cd); ++ cd++; ++ } ++ } ++ } else { ++ /* old unsorted format; do it the O(n^2) way */ ++ for (cd = 0; ; cd++) { ++ for (chunk = *LEAF_HASH_ENTPTR(l, h); ++ chunk != CHAIN_END; chunk = le->le_next) { ++ le = ZAP_LEAF_ENTRY(l, chunk); ++ if 
(le->le_hash == h && ++ le->le_cd == cd) { ++ break; ++ } ++ } ++ /* If this cd is not in use, we are good. */ ++ if (chunk == CHAIN_END) ++ break; ++ } ++ } ++ /* ++ * We would run out of space in a block before we could ++ * store enough entries to run out of CD values. ++ */ ++ ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); ++ } ++ ++ if (l->l_phys->l_hdr.lh_nfree < numchunks) ++ return (EAGAIN); ++ ++ /* make the entry */ ++ chunk = zap_leaf_chunk_alloc(l); ++ le = ZAP_LEAF_ENTRY(l, chunk); ++ le->le_type = ZAP_CHUNK_ENTRY; ++ le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig, ++ zn->zn_key_intlen, zn->zn_key_orig_numints); ++ le->le_name_numints = zn->zn_key_orig_numints; ++ le->le_value_chunk = ++ zap_leaf_array_create(l, buf, integer_size, num_integers); ++ le->le_value_numints = num_integers; ++ le->le_value_intlen = integer_size; ++ le->le_hash = h; ++ le->le_cd = cd; ++ ++ /* link it into the hash chain */ ++ /* XXX if we did the search above, we could just use that */ ++ chunkp = zap_leaf_rehash_entry(l, chunk); ++ ++ l->l_phys->l_hdr.lh_nentries++; ++ ++ zeh->zeh_leaf = l; ++ zeh->zeh_num_integers = num_integers; ++ zeh->zeh_integer_size = le->le_value_intlen; ++ zeh->zeh_cd = le->le_cd; ++ zeh->zeh_hash = le->le_hash; ++ zeh->zeh_chunkp = chunkp; ++ ++ return (0); ++} ++ ++/* ++ * Determine if there is another entry with the same normalized form. ++ * For performance purposes, either zn or name must be provided (the ++ * other can be NULL). Note, there usually won't be any hash ++ * conflicts, in which case we don't need the concatenated/normalized ++ * form of the name. But all callers have one of these on hand anyway, ++ * so might as well take advantage. A cleaner but slower interface ++ * would accept neither argument, and compute the normalized name as ++ * needed (using zap_name_alloc(zap_entry_read_name(zeh))). ++ */ ++boolean_t ++zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, ++ const char *name, zap_t *zap) ++{ ++ uint64_t chunk; ++ struct zap_leaf_entry *le; ++ boolean_t allocdzn = B_FALSE; ++ ++ if (zap->zap_normflags == 0) ++ return (B_FALSE); ++ ++ for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); ++ chunk != CHAIN_END; chunk = le->le_next) { ++ le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); ++ if (le->le_hash != zeh->zeh_hash) ++ continue; ++ if (le->le_cd == zeh->zeh_cd) ++ continue; ++ ++ if (zn == NULL) { ++ zn = zap_name_alloc(zap, name, MT_FIRST); ++ allocdzn = B_TRUE; ++ } ++ if (zap_leaf_array_match(zeh->zeh_leaf, zn, ++ le->le_name_chunk, le->le_name_numints)) { ++ if (allocdzn) ++ zap_name_free(zn); ++ return (B_TRUE); ++ } ++ } ++ if (allocdzn) ++ zap_name_free(zn); ++ return (B_FALSE); ++} ++ ++/* ++ * Routines for transferring entries between leafs. ++ */ ++ ++static uint16_t * ++zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) ++{ ++ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); ++ struct zap_leaf_entry *le2; ++ uint16_t *chunkp; ++ ++ /* ++ * keep the entry chain sorted by cd ++ * NB: this will not cause problems for unsorted leafs, though ++ * it is unnecessary there. 
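zap_leaf_rehash_entry() below keeps each hash-bucket chain sorted by cd by walking the chain through a pointer to the previous link until it finds the first entry with a larger cd. The same walk as a stand-alone sketch; it uses C pointers where the real code chains 16-bit chunk numbers:

#include <stdint.h>
#include <stdio.h>

struct ent {
        uint32_t cd;
        struct ent *next;
};

/* insert 'e' into the chain at *headp, keeping the chain sorted by cd */
static void
sorted_insert(struct ent **headp, struct ent *e)
{
        struct ent **linkp;

        for (linkp = headp; *linkp != NULL; linkp = &(*linkp)->next) {
                if ((*linkp)->cd > e->cd)
                        break;
        }
        e->next = *linkp;
        *linkp = e;
}

int
main(void)
{
        struct ent a = { 0, NULL }, b = { 2, NULL }, c = { 1, NULL };
        struct ent *head = NULL, *e;

        sorted_insert(&head, &a);
        sorted_insert(&head, &b);
        sorted_insert(&head, &c);
        for (e = head; e != NULL; e = e->next)
                printf("%u ", e->cd);           /* prints 0 1 2 */
        printf("\n");
        return (0);
}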
++ */ ++ for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash); ++ *chunkp != CHAIN_END; chunkp = &le2->le_next) { ++ le2 = ZAP_LEAF_ENTRY(l, *chunkp); ++ if (le2->le_cd > le->le_cd) ++ break; ++ } ++ ++ le->le_next = *chunkp; ++ *chunkp = entry; ++ return (chunkp); ++} ++ ++static uint16_t ++zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) ++{ ++ uint16_t new_chunk; ++ uint16_t *nchunkp = &new_chunk; ++ ++ while (chunk != CHAIN_END) { ++ uint16_t nchunk = zap_leaf_chunk_alloc(nl); ++ struct zap_leaf_array *nla = ++ &ZAP_LEAF_CHUNK(nl, nchunk).l_array; ++ struct zap_leaf_array *la = ++ &ZAP_LEAF_CHUNK(l, chunk).l_array; ++ int nextchunk = la->la_next; ++ ++ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ++ ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); ++ ++ *nla = *la; /* structure assignment */ ++ ++ zap_leaf_chunk_free(l, chunk); ++ chunk = nextchunk; ++ *nchunkp = nchunk; ++ nchunkp = &nla->la_next; ++ } ++ *nchunkp = CHAIN_END; ++ return (new_chunk); ++} ++ ++static void ++zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) ++{ ++ struct zap_leaf_entry *le, *nle; ++ uint16_t chunk; ++ ++ le = ZAP_LEAF_ENTRY(l, entry); ++ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); ++ ++ chunk = zap_leaf_chunk_alloc(nl); ++ nle = ZAP_LEAF_ENTRY(nl, chunk); ++ *nle = *le; /* structure assignment */ ++ ++ (void) zap_leaf_rehash_entry(nl, chunk); ++ ++ nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); ++ nle->le_value_chunk = ++ zap_leaf_transfer_array(l, le->le_value_chunk, nl); ++ ++ zap_leaf_chunk_free(l, entry); ++ ++ l->l_phys->l_hdr.lh_nentries--; ++ nl->l_phys->l_hdr.lh_nentries++; ++} ++ ++/* ++ * Transfer the entries whose hash prefix ends in 1 to the new leaf. ++ */ ++void ++zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) ++{ ++ int i; ++ int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len; ++ ++ /* set new prefix and prefix_len */ ++ l->l_phys->l_hdr.lh_prefix <<= 1; ++ l->l_phys->l_hdr.lh_prefix_len++; ++ nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1; ++ nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len; ++ ++ /* break existing hash chains */ ++ zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); ++ ++ if (sort) ++ l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; ++ ++ /* ++ * Transfer entries whose hash bit 'bit' is set to nl; rehash ++ * the remaining entries ++ * ++ * NB: We could find entries via the hashtable instead. That ++ * would be O(hashents+numents) rather than O(numblks+numents), ++ * but this accesses memory more sequentially, and when we're ++ * called, the block is usually pretty full. 
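zap_leaf_split() below lengthens the leaf's hash prefix by one bit: entries whose next hash bit is 0 stay in the old leaf, and entries whose bit is 1 move to the new sibling. A minimal stand-alone illustration of that partition, using the same 64 - 1 - prefix_len bit position as the code:

#include <stdint.h>
#include <stdio.h>

/*
 * Nonzero if an entry with this hash moves to the new (bit == 1) sibling
 * when a leaf whose prefix is old_prefix_len bits long is split.
 */
static int
moves_to_sibling(uint64_t hash, int old_prefix_len)
{
        int bit = 64 - 1 - old_prefix_len;      /* the newly decided hash bit */

        return ((int)((hash >> bit) & 1));
}

int
main(void)
{
        /* with one prefix bit already fixed, bit 62 decides the split */
        printf("%d %d\n",
            moves_to_sibling(0xC000000000000000ULL, 1),         /* bit 62 set: 1 */
            moves_to_sibling(0x8000000000000000ULL, 1));        /* bit 62 clear: 0 */
        return (0);
}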
++ */ ++ for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ++ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); ++ if (le->le_type != ZAP_CHUNK_ENTRY) ++ continue; ++ ++ if (le->le_hash & (1ULL << bit)) ++ zap_leaf_transfer_entry(l, i, nl); ++ else ++ (void) zap_leaf_rehash_entry(l, i); ++ } ++} ++ ++void ++zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) ++{ ++ int i, n; ++ ++ n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - ++ l->l_phys->l_hdr.lh_prefix_len; ++ n = MIN(n, ZAP_HISTOGRAM_SIZE-1); ++ zs->zs_leafs_with_2n_pointers[n]++; ++ ++ ++ n = l->l_phys->l_hdr.lh_nentries/5; ++ n = MIN(n, ZAP_HISTOGRAM_SIZE-1); ++ zs->zs_blocks_with_n5_entries[n]++; ++ ++ n = ((1<l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / ++ (1<zs_blocks_n_tenths_full[n]++; ++ ++ for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { ++ int nentries = 0; ++ int chunk = l->l_phys->l_hash[i]; ++ ++ while (chunk != CHAIN_END) { ++ struct zap_leaf_entry *le = ++ ZAP_LEAF_ENTRY(l, chunk); ++ ++ n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) + ++ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * ++ le->le_value_intlen); ++ n = MIN(n, ZAP_HISTOGRAM_SIZE-1); ++ zs->zs_entries_using_n_chunks[n]++; ++ ++ chunk = le->le_next; ++ nentries++; ++ } ++ ++ n = nentries; ++ n = MIN(n, ZAP_HISTOGRAM_SIZE-1); ++ zs->zs_buckets_with_n_entries[n]++; ++ } ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zap_micro.c linux-3.2.33-go/fs/zfs/zfs/zap_micro.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zap_micro.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zap_micro.c 2012-11-16 23:25:34.353039289 +0100 +@@ -0,0 +1,1500 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef _KERNEL ++#include ++#endif ++ ++static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags); ++ ++uint64_t ++zap_getflags(zap_t *zap) ++{ ++ if (zap->zap_ismicro) ++ return (0); ++ return (zap->zap_u.zap_fat.zap_phys->zap_flags); ++} ++ ++int ++zap_hashbits(zap_t *zap) ++{ ++ if (zap_getflags(zap) & ZAP_FLAG_HASH64) ++ return (48); ++ else ++ return (28); ++} ++ ++uint32_t ++zap_maxcd(zap_t *zap) ++{ ++ if (zap_getflags(zap) & ZAP_FLAG_HASH64) ++ return ((1<<16)-1); ++ else ++ return (-1U); ++} ++ ++static uint64_t ++zap_hash(zap_name_t *zn) ++{ ++ zap_t *zap = zn->zn_zap; ++ uint64_t h = 0; ++ ++ if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { ++ ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); ++ h = *(uint64_t *)zn->zn_key_orig; ++ } else { ++ h = zap->zap_salt; ++ ASSERT(h != 0); ++ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); ++ ++ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { ++ int i; ++ const uint64_t *wp = zn->zn_key_norm; ++ ++ ASSERT(zn->zn_key_intlen == 8); ++ for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) { ++ int j; ++ uint64_t word = *wp; ++ ++ for (j = 0; j < zn->zn_key_intlen; j++) { ++ h = (h >> 8) ^ ++ zfs_crc64_table[(h ^ word) & 0xFF]; ++ word >>= NBBY; ++ } ++ } ++ } else { ++ int i, len; ++ const uint8_t *cp = zn->zn_key_norm; ++ ++ /* ++ * We previously stored the terminating null on ++ * disk, but didn't hash it, so we need to ++ * continue to not hash it. (The ++ * zn_key_*_numints includes the terminating ++ * null for non-binary keys.) ++ */ ++ len = zn->zn_key_norm_numints - 1; ++ ++ ASSERT(zn->zn_key_intlen == 1); ++ for (i = 0; i < len; cp++, i++) { ++ h = (h >> 8) ^ ++ zfs_crc64_table[(h ^ *cp) & 0xFF]; ++ } ++ } ++ } ++ /* ++ * Don't use all 64 bits, since we need some in the cookie for ++ * the collision differentiator. We MUST use the high bits, ++ * since those are the ones that we first pay attention to when ++ * chosing the bucket. 
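zap_hash() above keeps only the high-order bits of the hash (28 by default, 48 when the HASH64 flag is set, per zap_hashbits()), leaving the low bits free for the collision differentiator; the pointer-table index is then simply the top zt_shift bits of the hash. A small stand-alone illustration of that masking and indexing, where hash_idx is intended to mirror the ZAP_HASH_IDX computation used throughout these files:

#include <stdint.h>
#include <stdio.h>

/* keep only the top 'hashbits' bits, as the final mask in zap_hash() does */
static uint64_t
mask_high_bits(uint64_t h, int hashbits)
{
        return (h & ~((1ULL << (64 - hashbits)) - 1));
}

/* pointer-table index: the top 'shift' bits of the hash */
static uint64_t
hash_idx(uint64_t h, int shift)
{
        return (shift == 0 ? 0 : h >> (64 - shift));
}

int
main(void)
{
        uint64_t h = mask_high_bits(0x123456789abcdef0ULL, 28);

        printf("masked hash %016llx, ptrtbl index %llu\n",
            (unsigned long long)h, (unsigned long long)hash_idx(h, 10));
        return (0);
}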
++ */ ++ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); ++ ++ return (h); ++} ++ ++static int ++zap_normalize(zap_t *zap, const char *name, char *namenorm) ++{ ++ size_t inlen, outlen; ++ int err; ++ ++ ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); ++ ++ inlen = strlen(name) + 1; ++ outlen = ZAP_MAXNAMELEN; ++ ++ err = 0; ++ (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, ++ zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL | ++ U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); ++ ++ return (err); ++} ++ ++boolean_t ++zap_match(zap_name_t *zn, const char *matchname) ++{ ++ ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); ++ ++ if (zn->zn_matchtype == MT_FIRST) { ++ char norm[ZAP_MAXNAMELEN]; ++ ++ if (zap_normalize(zn->zn_zap, matchname, norm) != 0) ++ return (B_FALSE); ++ ++ return (strcmp(zn->zn_key_norm, norm) == 0); ++ } else { ++ /* MT_BEST or MT_EXACT */ ++ return (strcmp(zn->zn_key_orig, matchname) == 0); ++ } ++} ++ ++void ++zap_name_free(zap_name_t *zn) ++{ ++ kmem_free(zn, sizeof (zap_name_t)); ++} ++ ++zap_name_t * ++zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) ++{ ++ zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_PUSHPAGE); ++ ++ zn->zn_zap = zap; ++ zn->zn_key_intlen = sizeof (*key); ++ zn->zn_key_orig = key; ++ zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; ++ zn->zn_matchtype = mt; ++ if (zap->zap_normflags) { ++ if (zap_normalize(zap, key, zn->zn_normbuf) != 0) { ++ zap_name_free(zn); ++ return (NULL); ++ } ++ zn->zn_key_norm = zn->zn_normbuf; ++ zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; ++ } else { ++ if (mt != MT_EXACT) { ++ zap_name_free(zn); ++ return (NULL); ++ } ++ zn->zn_key_norm = zn->zn_key_orig; ++ zn->zn_key_norm_numints = zn->zn_key_orig_numints; ++ } ++ ++ zn->zn_hash = zap_hash(zn); ++ return (zn); ++} ++ ++zap_name_t * ++zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) ++{ ++ zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_PUSHPAGE); ++ ++ ASSERT(zap->zap_normflags == 0); ++ zn->zn_zap = zap; ++ zn->zn_key_intlen = sizeof (*key); ++ zn->zn_key_orig = zn->zn_key_norm = key; ++ zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; ++ zn->zn_matchtype = MT_EXACT; ++ ++ zn->zn_hash = zap_hash(zn); ++ return (zn); ++} ++ ++static void ++mzap_byteswap(mzap_phys_t *buf, size_t size) ++{ ++ int i, max; ++ buf->mz_block_type = BSWAP_64(buf->mz_block_type); ++ buf->mz_salt = BSWAP_64(buf->mz_salt); ++ buf->mz_normflags = BSWAP_64(buf->mz_normflags); ++ max = (size / MZAP_ENT_LEN) - 1; ++ for (i = 0; i < max; i++) { ++ buf->mz_chunk[i].mze_value = ++ BSWAP_64(buf->mz_chunk[i].mze_value); ++ buf->mz_chunk[i].mze_cd = ++ BSWAP_32(buf->mz_chunk[i].mze_cd); ++ } ++} ++ ++void ++zap_byteswap(void *buf, size_t size) ++{ ++ uint64_t block_type; ++ ++ block_type = *(uint64_t *)buf; ++ ++ if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { ++ /* ASSERT(magic == ZAP_LEAF_MAGIC); */ ++ mzap_byteswap(buf, size); ++ } else { ++ fzap_byteswap(buf, size); ++ } ++} ++ ++static int ++mze_compare(const void *arg1, const void *arg2) ++{ ++ const mzap_ent_t *mze1 = arg1; ++ const mzap_ent_t *mze2 = arg2; ++ ++ if (mze1->mze_hash > mze2->mze_hash) ++ return (+1); ++ if (mze1->mze_hash < mze2->mze_hash) ++ return (-1); ++ if (mze1->mze_cd > mze2->mze_cd) ++ return (+1); ++ if (mze1->mze_cd < mze2->mze_cd) ++ return (-1); ++ return (0); ++} ++ ++static void ++mze_insert(zap_t *zap, int chunkid, uint64_t hash) ++{ ++ mzap_ent_t *mze; ++ ++ ASSERT(zap->zap_ismicro); ++ 
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ++ ++ mze = kmem_alloc(sizeof (mzap_ent_t), KM_PUSHPAGE); ++ mze->mze_chunkid = chunkid; ++ mze->mze_hash = hash; ++ mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; ++ ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); ++ avl_add(&zap->zap_m.zap_avl, mze); ++} ++ ++static mzap_ent_t * ++mze_find(zap_name_t *zn) ++{ ++ mzap_ent_t mze_tofind; ++ mzap_ent_t *mze; ++ avl_index_t idx; ++ avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; ++ ++ ASSERT(zn->zn_zap->zap_ismicro); ++ ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); ++ ++ mze_tofind.mze_hash = zn->zn_hash; ++ mze_tofind.mze_cd = 0; ++ ++again: ++ mze = avl_find(avl, &mze_tofind, &idx); ++ if (mze == NULL) ++ mze = avl_nearest(avl, idx, AVL_AFTER); ++ for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { ++ ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); ++ if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) ++ return (mze); ++ } ++ if (zn->zn_matchtype == MT_BEST) { ++ zn->zn_matchtype = MT_FIRST; ++ goto again; ++ } ++ return (NULL); ++} ++ ++static uint32_t ++mze_find_unused_cd(zap_t *zap, uint64_t hash) ++{ ++ mzap_ent_t mze_tofind; ++ mzap_ent_t *mze; ++ avl_index_t idx; ++ avl_tree_t *avl = &zap->zap_m.zap_avl; ++ uint32_t cd; ++ ++ ASSERT(zap->zap_ismicro); ++ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ++ ++ mze_tofind.mze_hash = hash; ++ mze_tofind.mze_cd = 0; ++ ++ cd = 0; ++ for (mze = avl_find(avl, &mze_tofind, &idx); ++ mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { ++ if (mze->mze_cd != cd) ++ break; ++ cd++; ++ } ++ ++ return (cd); ++} ++ ++static void ++mze_remove(zap_t *zap, mzap_ent_t *mze) ++{ ++ ASSERT(zap->zap_ismicro); ++ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ++ ++ avl_remove(&zap->zap_m.zap_avl, mze); ++ kmem_free(mze, sizeof (mzap_ent_t)); ++} ++ ++static void ++mze_destroy(zap_t *zap) ++{ ++ mzap_ent_t *mze; ++ void *avlcookie = NULL; ++ ++ while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))) ++ kmem_free(mze, sizeof (mzap_ent_t)); ++ avl_destroy(&zap->zap_m.zap_avl); ++} ++ ++static zap_t * ++mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) ++{ ++ zap_t *winner; ++ zap_t *zap; ++ int i; ++ ++ ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); ++ ++ zap = kmem_zalloc(sizeof (zap_t), KM_PUSHPAGE); ++ rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); ++ rw_enter(&zap->zap_rwlock, RW_WRITER); ++ zap->zap_objset = os; ++ zap->zap_object = obj; ++ zap->zap_dbuf = db; ++ ++ if (*(uint64_t *)db->db_data != ZBT_MICRO) { ++ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); ++ zap->zap_f.zap_block_shift = highbit(db->db_size) - 1; ++ } else { ++ zap->zap_ismicro = TRUE; ++ } ++ ++ /* ++ * Make sure that zap_ismicro is set before we let others see ++ * it, because zap_lockdir() checks zap_ismicro without the lock ++ * held. 
++ */
++ winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);
++
++ if (winner != NULL) {
++ rw_exit(&zap->zap_rwlock);
++ rw_destroy(&zap->zap_rwlock);
++ if (!zap->zap_ismicro)
++ mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
++ kmem_free(zap, sizeof (zap_t));
++ return (winner);
++ }
++
++ if (zap->zap_ismicro) {
++ zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
++ zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags;
++ zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
++ avl_create(&zap->zap_m.zap_avl, mze_compare,
++ sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
++
++ for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
++ mzap_ent_phys_t *mze =
++ &zap->zap_m.zap_phys->mz_chunk[i];
++ if (mze->mze_name[0]) {
++ zap_name_t *zn;
++
++ zap->zap_m.zap_num_entries++;
++ zn = zap_name_alloc(zap, mze->mze_name,
++ MT_EXACT);
++ mze_insert(zap, i, zn->zn_hash);
++ zap_name_free(zn);
++ }
++ }
++ } else {
++ zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
++ zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags;
++
++ ASSERT3U(sizeof (struct zap_leaf_header), ==,
++ 2*ZAP_LEAF_CHUNKSIZE);
++
++ /*
++ * The embedded pointer table should not overlap the
++ * other members.
++ */
++ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
++ &zap->zap_f.zap_phys->zap_salt);
++
++ /*
++ * The embedded pointer table should end at the end of
++ * the block
++ */
++ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
++ 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
++ (uintptr_t)zap->zap_f.zap_phys, ==,
++ zap->zap_dbuf->db_size);
++ }
++ rw_exit(&zap->zap_rwlock);
++ return (zap);
++}
++
++int
++zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
++ krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
++{
++ zap_t *zap;
++ dmu_buf_t *db;
++ krw_t lt;
++ int err;
++
++ *zapp = NULL;
++
++ err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
++ if (err)
++ return (err);
++
++#ifdef ZFS_DEBUG
++ {
++ dmu_object_info_t doi;
++ dmu_object_info_from_db(db, &doi);
++ ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
++ }
++#endif
++
++ zap = dmu_buf_get_user(db);
++ if (zap == NULL)
++ zap = mzap_open(os, obj, db);
++
++ /*
++ * We're checking zap_ismicro without the lock held, in order to
++ * tell what type of lock we want. Once we have some sort of
++ * lock, see if it really is the right type. In practice this
++ * can only be different if it was upgraded from micro to fat,
++ * and micro wanted WRITER but fat only needs READER.
++ */
++ lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
++ rw_enter(&zap->zap_rwlock, lt);
++ if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
++ /* it was upgraded, now we only need reader */
++ ASSERT(lt == RW_WRITER);
++ ASSERT(RW_READER ==
++ (!zap->zap_ismicro && fatreader) ?
RW_READER : lti); ++ rw_downgrade(&zap->zap_rwlock); ++ lt = RW_READER; ++ } ++ ++ zap->zap_objset = os; ++ ++ if (lt == RW_WRITER) ++ dmu_buf_will_dirty(db, tx); ++ ++ ASSERT3P(zap->zap_dbuf, ==, db); ++ ++ ASSERT(!zap->zap_ismicro || ++ zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); ++ if (zap->zap_ismicro && tx && adding && ++ zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { ++ uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; ++ if (newsz > MZAP_MAX_BLKSZ) { ++ dprintf("upgrading obj %llu: num_entries=%u\n", ++ obj, zap->zap_m.zap_num_entries); ++ *zapp = zap; ++ return (mzap_upgrade(zapp, tx, 0)); ++ } ++ err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); ++ ASSERT3U(err, ==, 0); ++ zap->zap_m.zap_num_chunks = ++ db->db_size / MZAP_ENT_LEN - 1; ++ } ++ ++ *zapp = zap; ++ return (0); ++} ++ ++void ++zap_unlockdir(zap_t *zap) ++{ ++ rw_exit(&zap->zap_rwlock); ++ dmu_buf_rele(zap->zap_dbuf, NULL); ++} ++ ++static int ++mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) ++{ ++ mzap_phys_t *mzp; ++ int i, sz, nchunks; ++ int err = 0; ++ zap_t *zap = *zapp; ++ ++ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ++ ++ sz = zap->zap_dbuf->db_size; ++ mzp = kmem_alloc(sz, KM_PUSHPAGE | KM_NODEBUG); ++ bcopy(zap->zap_dbuf->db_data, mzp, sz); ++ nchunks = zap->zap_m.zap_num_chunks; ++ ++ if (!flags) { ++ err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, ++ 1ULL << fzap_default_block_shift, 0, tx); ++ if (err) { ++ kmem_free(mzp, sz); ++ return (err); ++ } ++ } ++ ++ dprintf("upgrading obj=%llu with %u chunks\n", ++ zap->zap_object, nchunks); ++ /* XXX destroy the avl later, so we can use the stored hash value */ ++ mze_destroy(zap); ++ ++ fzap_upgrade(zap, tx, flags); ++ ++ for (i = 0; i < nchunks; i++) { ++ mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; ++ zap_name_t *zn; ++ if (mze->mze_name[0] == 0) ++ continue; ++ dprintf("adding %s=%llu\n", ++ mze->mze_name, mze->mze_value); ++ zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); ++ err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx); ++ zap = zn->zn_zap; /* fzap_add_cd() may change zap */ ++ zap_name_free(zn); ++ if (err) ++ break; ++ } ++ kmem_free(mzp, sz); ++ *zapp = zap; ++ return (err); ++} ++ ++static void ++mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, ++ dmu_tx_t *tx) ++{ ++ dmu_buf_t *db; ++ mzap_phys_t *zp; ++ ++ VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); ++ ++#ifdef ZFS_DEBUG ++ { ++ dmu_object_info_t doi; ++ dmu_object_info_from_db(db, &doi); ++ ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); ++ } ++#endif ++ ++ dmu_buf_will_dirty(db, tx); ++ zp = db->db_data; ++ zp->mz_block_type = ZBT_MICRO; ++ zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; ++ zp->mz_normflags = normflags; ++ dmu_buf_rele(db, FTAG); ++ ++ if (flags != 0) { ++ zap_t *zap; ++ /* Only fat zap supports flags; upgrade immediately. 
*/ ++ VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER, ++ B_FALSE, B_FALSE, &zap)); ++ VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags)); ++ zap_unlockdir(zap); ++ } ++} ++ ++int ++zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) ++{ ++ return (zap_create_claim_norm(os, obj, ++ 0, ot, bonustype, bonuslen, tx)); ++} ++ ++int ++zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, ++ dmu_object_type_t ot, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) ++{ ++ int err; ++ ++ err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); ++ if (err != 0) ++ return (err); ++ mzap_create_impl(os, obj, normflags, 0, tx); ++ return (0); ++} ++ ++uint64_t ++zap_create(objset_t *os, dmu_object_type_t ot, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) ++{ ++ return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); ++} ++ ++uint64_t ++zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) ++{ ++ uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); ++ ++ mzap_create_impl(os, obj, normflags, 0, tx); ++ return (obj); ++} ++ ++uint64_t ++zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, ++ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) ++{ ++ uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); ++ ++ ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && ++ leaf_blockshift <= SPA_MAXBLOCKSHIFT && ++ indirect_blockshift >= SPA_MINBLOCKSHIFT && ++ indirect_blockshift <= SPA_MAXBLOCKSHIFT); ++ ++ VERIFY(dmu_object_set_blocksize(os, obj, ++ 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); ++ ++ mzap_create_impl(os, obj, normflags, flags, tx); ++ return (obj); ++} ++ ++int ++zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) ++{ ++ /* ++ * dmu_object_free will free the object number and free the ++ * data. Freeing the data will cause our pageout function to be ++ * called, which will destroy our data (zap_leaf_t's and zap_t). ++ */ ++ ++ return (dmu_object_free(os, zapobj, tx)); ++} ++ ++_NOTE(ARGSUSED(0)) ++void ++zap_evict(dmu_buf_t *db, void *vzap) ++{ ++ zap_t *zap = vzap; ++ ++ rw_destroy(&zap->zap_rwlock); ++ ++ if (zap->zap_ismicro) ++ mze_destroy(zap); ++ else ++ mutex_destroy(&zap->zap_f.zap_num_entries_mtx); ++ ++ kmem_free(zap, sizeof (zap_t)); ++} ++ ++int ++zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) ++{ ++ zap_t *zap; ++ int err; ++ ++ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); ++ if (err) ++ return (err); ++ if (!zap->zap_ismicro) { ++ err = fzap_count(zap, count); ++ } else { ++ *count = zap->zap_m.zap_num_entries; ++ } ++ zap_unlockdir(zap); ++ return (err); ++} ++ ++/* ++ * zn may be NULL; if not specified, it will be computed if needed. ++ * See also the comment above zap_entry_normalization_conflict(). 
++ */ ++static boolean_t ++mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) ++{ ++ mzap_ent_t *other; ++ int direction = AVL_BEFORE; ++ boolean_t allocdzn = B_FALSE; ++ ++ if (zap->zap_normflags == 0) ++ return (B_FALSE); ++ ++again: ++ for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction); ++ other && other->mze_hash == mze->mze_hash; ++ other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { ++ ++ if (zn == NULL) { ++ zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, ++ MT_FIRST); ++ allocdzn = B_TRUE; ++ } ++ if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { ++ if (allocdzn) ++ zap_name_free(zn); ++ return (B_TRUE); ++ } ++ } ++ ++ if (direction == AVL_BEFORE) { ++ direction = AVL_AFTER; ++ goto again; ++ } ++ ++ if (allocdzn) ++ zap_name_free(zn); ++ return (B_FALSE); ++} ++ ++/* ++ * Routines for manipulating attributes. ++ */ ++ ++int ++zap_lookup(objset_t *os, uint64_t zapobj, const char *name, ++ uint64_t integer_size, uint64_t num_integers, void *buf) ++{ ++ return (zap_lookup_norm(os, zapobj, name, integer_size, ++ num_integers, buf, MT_EXACT, NULL, 0, NULL)); ++} ++ ++int ++zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, ++ uint64_t integer_size, uint64_t num_integers, void *buf, ++ matchtype_t mt, char *realname, int rn_len, ++ boolean_t *ncp) ++{ ++ zap_t *zap; ++ int err; ++ mzap_ent_t *mze; ++ zap_name_t *zn; ++ ++ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); ++ if (err) ++ return (err); ++ zn = zap_name_alloc(zap, name, mt); ++ if (zn == NULL) { ++ zap_unlockdir(zap); ++ return (ENOTSUP); ++ } ++ ++ if (!zap->zap_ismicro) { ++ err = fzap_lookup(zn, integer_size, num_integers, buf, ++ realname, rn_len, ncp); ++ } else { ++ mze = mze_find(zn); ++ if (mze == NULL) { ++ err = ENOENT; ++ } else { ++ if (num_integers < 1) { ++ err = EOVERFLOW; ++ } else if (integer_size != 8) { ++ err = EINVAL; ++ } else { ++ *(uint64_t *)buf = ++ MZE_PHYS(zap, mze)->mze_value; ++ (void) strlcpy(realname, ++ MZE_PHYS(zap, mze)->mze_name, rn_len); ++ if (ncp) { ++ *ncp = mzap_normalization_conflict(zap, ++ zn, mze); ++ } ++ } ++ } ++ } ++ zap_name_free(zn); ++ zap_unlockdir(zap); ++ return (err); ++} ++ ++int ++zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, ++ int key_numints) ++{ ++ zap_t *zap; ++ int err; ++ zap_name_t *zn; ++ ++ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); ++ if (err) ++ return (err); ++ zn = zap_name_alloc_uint64(zap, key, key_numints); ++ if (zn == NULL) { ++ zap_unlockdir(zap); ++ return (ENOTSUP); ++ } ++ ++ fzap_prefetch(zn); ++ zap_name_free(zn); ++ zap_unlockdir(zap); ++ return (err); ++} ++ ++int ++zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, ++ int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) ++{ ++ zap_t *zap; ++ int err; ++ zap_name_t *zn; ++ ++ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); ++ if (err) ++ return (err); ++ zn = zap_name_alloc_uint64(zap, key, key_numints); ++ if (zn == NULL) { ++ zap_unlockdir(zap); ++ return (ENOTSUP); ++ } ++ ++ err = fzap_lookup(zn, integer_size, num_integers, buf, ++ NULL, 0, NULL); ++ zap_name_free(zn); ++ zap_unlockdir(zap); ++ return (err); ++} ++ ++int ++zap_contains(objset_t *os, uint64_t zapobj, const char *name) ++{ ++ int err = (zap_lookup_norm(os, zapobj, name, 0, ++ 0, NULL, MT_EXACT, NULL, 0, NULL)); ++ if (err == EOVERFLOW || err == EINVAL) ++ err = 0; /* found, but skipped reading the value */ ++ return (err); ++} ++ ++int 
++zap_length(objset_t *os, uint64_t zapobj, const char *name, ++ uint64_t *integer_size, uint64_t *num_integers) ++{ ++ zap_t *zap; ++ int err; ++ mzap_ent_t *mze; ++ zap_name_t *zn; ++ ++ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); ++ if (err) ++ return (err); ++ zn = zap_name_alloc(zap, name, MT_EXACT); ++ if (zn == NULL) { ++ zap_unlockdir(zap); ++ return (ENOTSUP); ++ } ++ if (!zap->zap_ismicro) { ++ err = fzap_length(zn, integer_size, num_integers); ++ } else { ++ mze = mze_find(zn); ++ if (mze == NULL) { ++ err = ENOENT; ++ } else { ++ if (integer_size) ++ *integer_size = 8; ++ if (num_integers) ++ *num_integers = 1; ++ } ++ } ++ zap_name_free(zn); ++ zap_unlockdir(zap); ++ return (err); ++} ++ ++int ++zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, ++ int key_numints, uint64_t *integer_size, uint64_t *num_integers) ++{ ++ zap_t *zap; ++ int err; ++ zap_name_t *zn; ++ ++ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); ++ if (err) ++ return (err); ++ zn = zap_name_alloc_uint64(zap, key, key_numints); ++ if (zn == NULL) { ++ zap_unlockdir(zap); ++ return (ENOTSUP); ++ } ++ err = fzap_length(zn, integer_size, num_integers); ++ zap_name_free(zn); ++ zap_unlockdir(zap); ++ return (err); ++} ++ ++static void ++mzap_addent(zap_name_t *zn, uint64_t value) ++{ ++ int i; ++ zap_t *zap = zn->zn_zap; ++ int start = zap->zap_m.zap_alloc_next; ++ uint32_t cd; ++ ++ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ++ ++#ifdef ZFS_DEBUG ++ for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { ++ ASSERTV(mzap_ent_phys_t *mze=&zap->zap_m.zap_phys->mz_chunk[i]); ++ ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); ++ } ++#endif ++ ++ cd = mze_find_unused_cd(zap, zn->zn_hash); ++ /* given the limited size of the microzap, this can't happen */ ++ ASSERT(cd < zap_maxcd(zap)); ++ ++again: ++ for (i = start; i < zap->zap_m.zap_num_chunks; i++) { ++ mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; ++ if (mze->mze_name[0] == 0) { ++ mze->mze_value = value; ++ mze->mze_cd = cd; ++ (void) strcpy(mze->mze_name, zn->zn_key_orig); ++ zap->zap_m.zap_num_entries++; ++ zap->zap_m.zap_alloc_next = i+1; ++ if (zap->zap_m.zap_alloc_next == ++ zap->zap_m.zap_num_chunks) ++ zap->zap_m.zap_alloc_next = 0; ++ mze_insert(zap, i, zn->zn_hash); ++ return; ++ } ++ } ++ if (start != 0) { ++ start = 0; ++ goto again; ++ } ++ ASSERT(!"out of entries!"); ++} ++ ++int ++zap_add(objset_t *os, uint64_t zapobj, const char *key, ++ int integer_size, uint64_t num_integers, ++ const void *val, dmu_tx_t *tx) ++{ ++ zap_t *zap; ++ int err; ++ mzap_ent_t *mze; ++ const uint64_t *intval = val; ++ zap_name_t *zn; ++ ++ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); ++ if (err) ++ return (err); ++ zn = zap_name_alloc(zap, key, MT_EXACT); ++ if (zn == NULL) { ++ zap_unlockdir(zap); ++ return (ENOTSUP); ++ } ++ if (!zap->zap_ismicro) { ++ err = fzap_add(zn, integer_size, num_integers, val, tx); ++ zap = zn->zn_zap; /* fzap_add() may change zap */ ++ } else if (integer_size != 8 || num_integers != 1 || ++ strlen(key) >= MZAP_NAME_LEN) { ++ err = mzap_upgrade(&zn->zn_zap, tx, 0); ++ if (err == 0) ++ err = fzap_add(zn, integer_size, num_integers, val, tx); ++ zap = zn->zn_zap; /* fzap_add() may change zap */ ++ } else { ++ mze = mze_find(zn); ++ if (mze != NULL) { ++ err = EEXIST; ++ } else { ++ mzap_addent(zn, *intval); ++ } ++ } ++ ASSERT(zap == zn->zn_zap); ++ zap_name_free(zn); ++ if (zap != NULL) /* may be NULL if fzap_add() failed */ ++ zap_unlockdir(zap); ++ 
return (err); ++} ++ ++int ++zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, ++ int key_numints, int integer_size, uint64_t num_integers, ++ const void *val, dmu_tx_t *tx) ++{ ++ zap_t *zap; ++ int err; ++ zap_name_t *zn; ++ ++ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); ++ if (err) ++ return (err); ++ zn = zap_name_alloc_uint64(zap, key, key_numints); ++ if (zn == NULL) { ++ zap_unlockdir(zap); ++ return (ENOTSUP); ++ } ++ err = fzap_add(zn, integer_size, num_integers, val, tx); ++ zap = zn->zn_zap; /* fzap_add() may change zap */ ++ zap_name_free(zn); ++ if (zap != NULL) /* may be NULL if fzap_add() failed */ ++ zap_unlockdir(zap); ++ return (err); ++} ++ ++int ++zap_update(objset_t *os, uint64_t zapobj, const char *name, ++ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) ++{ ++ zap_t *zap; ++ mzap_ent_t *mze; ++ const uint64_t *intval = val; ++ zap_name_t *zn; ++ int err; ++ ++#ifdef ZFS_DEBUG ++ uint64_t oldval; ++ ++ /* ++ * If there is an old value, it shouldn't change across the ++ * lockdir (eg, due to bprewrite's xlation). ++ */ ++ if (integer_size == 8 && num_integers == 1) ++ (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); ++#endif ++ ++ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); ++ if (err) ++ return (err); ++ zn = zap_name_alloc(zap, name, MT_EXACT); ++ if (zn == NULL) { ++ zap_unlockdir(zap); ++ return (ENOTSUP); ++ } ++ if (!zap->zap_ismicro) { ++ err = fzap_update(zn, integer_size, num_integers, val, tx); ++ zap = zn->zn_zap; /* fzap_update() may change zap */ ++ } else if (integer_size != 8 || num_integers != 1 || ++ strlen(name) >= MZAP_NAME_LEN) { ++ dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", ++ zapobj, integer_size, num_integers, name); ++ err = mzap_upgrade(&zn->zn_zap, tx, 0); ++ if (err == 0) ++ err = fzap_update(zn, integer_size, num_integers, ++ val, tx); ++ zap = zn->zn_zap; /* fzap_update() may change zap */ ++ } else { ++ mze = mze_find(zn); ++ if (mze != NULL) { ++ ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); ++ MZE_PHYS(zap, mze)->mze_value = *intval; ++ } else { ++ mzap_addent(zn, *intval); ++ } ++ } ++ ASSERT(zap == zn->zn_zap); ++ zap_name_free(zn); ++ if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ ++ zap_unlockdir(zap); ++ return (err); ++} ++ ++int ++zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, ++ int key_numints, ++ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) ++{ ++ zap_t *zap; ++ zap_name_t *zn; ++ int err; ++ ++ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); ++ if (err) ++ return (err); ++ zn = zap_name_alloc_uint64(zap, key, key_numints); ++ if (zn == NULL) { ++ zap_unlockdir(zap); ++ return (ENOTSUP); ++ } ++ err = fzap_update(zn, integer_size, num_integers, val, tx); ++ zap = zn->zn_zap; /* fzap_update() may change zap */ ++ zap_name_free(zn); ++ if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ ++ zap_unlockdir(zap); ++ return (err); ++} ++ ++int ++zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) ++{ ++ return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx)); ++} ++ ++int ++zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, ++ matchtype_t mt, dmu_tx_t *tx) ++{ ++ zap_t *zap; ++ int err; ++ mzap_ent_t *mze; ++ zap_name_t *zn; ++ ++ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); ++ if (err) ++ return (err); ++ zn = zap_name_alloc(zap, name, mt); ++ if (zn == NULL) { ++ zap_unlockdir(zap); 
++ return (ENOTSUP); ++ } ++ if (!zap->zap_ismicro) { ++ err = fzap_remove(zn, tx); ++ } else { ++ mze = mze_find(zn); ++ if (mze == NULL) { ++ err = ENOENT; ++ } else { ++ zap->zap_m.zap_num_entries--; ++ bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], ++ sizeof (mzap_ent_phys_t)); ++ mze_remove(zap, mze); ++ } ++ } ++ zap_name_free(zn); ++ zap_unlockdir(zap); ++ return (err); ++} ++ ++int ++zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, ++ int key_numints, dmu_tx_t *tx) ++{ ++ zap_t *zap; ++ int err; ++ zap_name_t *zn; ++ ++ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); ++ if (err) ++ return (err); ++ zn = zap_name_alloc_uint64(zap, key, key_numints); ++ if (zn == NULL) { ++ zap_unlockdir(zap); ++ return (ENOTSUP); ++ } ++ err = fzap_remove(zn, tx); ++ zap_name_free(zn); ++ zap_unlockdir(zap); ++ return (err); ++} ++ ++/* ++ * Routines for iterating over the attributes. ++ */ ++ ++void ++zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, ++ uint64_t serialized) ++{ ++ zc->zc_objset = os; ++ zc->zc_zap = NULL; ++ zc->zc_leaf = NULL; ++ zc->zc_zapobj = zapobj; ++ zc->zc_serialized = serialized; ++ zc->zc_hash = 0; ++ zc->zc_cd = 0; ++} ++ ++void ++zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) ++{ ++ zap_cursor_init_serialized(zc, os, zapobj, 0); ++} ++ ++void ++zap_cursor_fini(zap_cursor_t *zc) ++{ ++ if (zc->zc_zap) { ++ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); ++ zap_unlockdir(zc->zc_zap); ++ zc->zc_zap = NULL; ++ } ++ if (zc->zc_leaf) { ++ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); ++ zap_put_leaf(zc->zc_leaf); ++ zc->zc_leaf = NULL; ++ } ++ zc->zc_objset = NULL; ++} ++ ++uint64_t ++zap_cursor_serialize(zap_cursor_t *zc) ++{ ++ if (zc->zc_hash == -1ULL) ++ return (-1ULL); ++ if (zc->zc_zap == NULL) ++ return (zc->zc_serialized); ++ ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); ++ ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); ++ ++ /* ++ * We want to keep the high 32 bits of the cursor zero if we can, so ++ * that 32-bit programs can access this. So usually use a small ++ * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits ++ * of the cursor. ++ * ++ * [ collision differentiator | zap_hashbits()-bit hash value ] ++ */ ++ return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | ++ ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); ++} ++ ++int ++zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) ++{ ++ int err; ++ avl_index_t idx; ++ mzap_ent_t mze_tofind; ++ mzap_ent_t *mze; ++ ++ if (zc->zc_hash == -1ULL) ++ return (ENOENT); ++ ++ if (zc->zc_zap == NULL) { ++ int hb; ++ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, ++ RW_READER, TRUE, FALSE, &zc->zc_zap); ++ if (err) ++ return (err); ++ ++ /* ++ * To support zap_cursor_init_serialized, advance, retrieve, ++ * we must add to the existing zc_cd, which may already ++ * be 1 due to the zap_cursor_advance. 
++ */ ++ ASSERT(zc->zc_hash == 0); ++ hb = zap_hashbits(zc->zc_zap); ++ zc->zc_hash = zc->zc_serialized << (64 - hb); ++ zc->zc_cd += zc->zc_serialized >> hb; ++ if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ ++ zc->zc_cd = 0; ++ } else { ++ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); ++ } ++ if (!zc->zc_zap->zap_ismicro) { ++ err = fzap_cursor_retrieve(zc->zc_zap, zc, za); ++ } else { ++ err = ENOENT; ++ ++ mze_tofind.mze_hash = zc->zc_hash; ++ mze_tofind.mze_cd = zc->zc_cd; ++ ++ mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); ++ if (mze == NULL) { ++ mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, ++ idx, AVL_AFTER); ++ } ++ if (mze) { ++ mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); ++ ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); ++ za->za_normalization_conflict = ++ mzap_normalization_conflict(zc->zc_zap, NULL, mze); ++ za->za_integer_length = 8; ++ za->za_num_integers = 1; ++ za->za_first_integer = mzep->mze_value; ++ (void) strcpy(za->za_name, mzep->mze_name); ++ zc->zc_hash = mze->mze_hash; ++ zc->zc_cd = mze->mze_cd; ++ err = 0; ++ } else { ++ zc->zc_hash = -1ULL; ++ } ++ } ++ rw_exit(&zc->zc_zap->zap_rwlock); ++ return (err); ++} ++ ++void ++zap_cursor_advance(zap_cursor_t *zc) ++{ ++ if (zc->zc_hash == -1ULL) ++ return; ++ zc->zc_cd++; ++} ++ ++int ++zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) ++{ ++ int err = 0; ++ mzap_ent_t *mze; ++ zap_name_t *zn; ++ ++ if (zc->zc_zap == NULL) { ++ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, ++ RW_READER, TRUE, FALSE, &zc->zc_zap); ++ if (err) ++ return (err); ++ } else { ++ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); ++ } ++ ++ zn = zap_name_alloc(zc->zc_zap, name, mt); ++ if (zn == NULL) { ++ rw_exit(&zc->zc_zap->zap_rwlock); ++ return (ENOTSUP); ++ } ++ ++ if (!zc->zc_zap->zap_ismicro) { ++ err = fzap_cursor_move_to_key(zc, zn); ++ } else { ++ mze = mze_find(zn); ++ if (mze == NULL) { ++ err = ENOENT; ++ goto out; ++ } ++ zc->zc_hash = mze->mze_hash; ++ zc->zc_cd = mze->mze_cd; ++ } ++ ++out: ++ zap_name_free(zn); ++ rw_exit(&zc->zc_zap->zap_rwlock); ++ return (err); ++} ++ ++int ++zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) ++{ ++ int err; ++ zap_t *zap; ++ ++ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); ++ if (err) ++ return (err); ++ ++ bzero(zs, sizeof (zap_stats_t)); ++ ++ if (zap->zap_ismicro) { ++ zs->zs_blocksize = zap->zap_dbuf->db_size; ++ zs->zs_num_entries = zap->zap_m.zap_num_entries; ++ zs->zs_num_blocks = 1; ++ } else { ++ fzap_get_stats(zap, zs); ++ } ++ zap_unlockdir(zap); ++ return (0); ++} ++ ++int ++zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, ++ uint64_t *towrite, uint64_t *tooverwrite) ++{ ++ zap_t *zap; ++ int err = 0; ++ ++ ++ /* ++ * Since, we don't have a name, we cannot figure out which blocks will ++ * be affected in this operation. So, account for the worst case : ++ * - 3 blocks overwritten: target leaf, ptrtbl block, header block ++ * - 4 new blocks written if adding: ++ * - 2 blocks for possibly split leaves, ++ * - 2 grown ptrtbl blocks ++ * ++ * This also accomodates the case where an add operation to a fairly ++ * large microzap results in a promotion to fatzap. ++ */ ++ if (name == NULL) { ++ *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; ++ return (err); ++ } ++ ++ /* ++ * We lock the zap with adding == FALSE. Because, if we pass ++ * the actual value of add, it could trigger a mzap_upgrade(). 
++ * At present we are just evaluating the possibility of this operation ++ * and hence we donot want to trigger an upgrade. ++ */ ++ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); ++ if (err) ++ return (err); ++ ++ if (!zap->zap_ismicro) { ++ zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT); ++ if (zn) { ++ err = fzap_count_write(zn, add, towrite, ++ tooverwrite); ++ zap_name_free(zn); ++ } else { ++ /* ++ * We treat this case as similar to (name == NULL) ++ */ ++ *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; ++ } ++ } else { ++ /* ++ * We are here if (name != NULL) and this is a micro-zap. ++ * We account for the header block depending on whether it ++ * is freeable. ++ * ++ * Incase of an add-operation it is hard to find out ++ * if this add will promote this microzap to fatzap. ++ * Hence, we consider the worst case and account for the ++ * blocks assuming this microzap would be promoted to a ++ * fatzap. ++ * ++ * 1 block overwritten : header block ++ * 4 new blocks written : 2 new split leaf, 2 grown ++ * ptrtbl blocks ++ */ ++ if (dmu_buf_freeable(zap->zap_dbuf)) ++ *tooverwrite += SPA_MAXBLOCKSIZE; ++ else ++ *towrite += SPA_MAXBLOCKSIZE; ++ ++ if (add) { ++ *towrite += 4 * SPA_MAXBLOCKSIZE; ++ } ++ } ++ ++ zap_unlockdir(zap); ++ return (err); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(zap_create); ++EXPORT_SYMBOL(zap_create_norm); ++EXPORT_SYMBOL(zap_create_flags); ++EXPORT_SYMBOL(zap_create_claim); ++EXPORT_SYMBOL(zap_create_claim_norm); ++EXPORT_SYMBOL(zap_destroy); ++EXPORT_SYMBOL(zap_lookup); ++EXPORT_SYMBOL(zap_lookup_norm); ++EXPORT_SYMBOL(zap_lookup_uint64); ++EXPORT_SYMBOL(zap_contains); ++EXPORT_SYMBOL(zap_prefetch_uint64); ++EXPORT_SYMBOL(zap_count_write); ++EXPORT_SYMBOL(zap_add); ++EXPORT_SYMBOL(zap_add_uint64); ++EXPORT_SYMBOL(zap_update); ++EXPORT_SYMBOL(zap_update_uint64); ++EXPORT_SYMBOL(zap_length); ++EXPORT_SYMBOL(zap_length_uint64); ++EXPORT_SYMBOL(zap_remove); ++EXPORT_SYMBOL(zap_remove_norm); ++EXPORT_SYMBOL(zap_remove_uint64); ++EXPORT_SYMBOL(zap_count); ++EXPORT_SYMBOL(zap_value_search); ++EXPORT_SYMBOL(zap_join); ++EXPORT_SYMBOL(zap_join_increment); ++EXPORT_SYMBOL(zap_add_int); ++EXPORT_SYMBOL(zap_remove_int); ++EXPORT_SYMBOL(zap_lookup_int); ++EXPORT_SYMBOL(zap_increment_int); ++EXPORT_SYMBOL(zap_add_int_key); ++EXPORT_SYMBOL(zap_lookup_int_key); ++EXPORT_SYMBOL(zap_increment); ++EXPORT_SYMBOL(zap_cursor_init); ++EXPORT_SYMBOL(zap_cursor_fini); ++EXPORT_SYMBOL(zap_cursor_retrieve); ++EXPORT_SYMBOL(zap_cursor_advance); ++EXPORT_SYMBOL(zap_cursor_serialize); ++EXPORT_SYMBOL(zap_cursor_move_to_key); ++EXPORT_SYMBOL(zap_cursor_init_serialized); ++EXPORT_SYMBOL(zap_get_stats); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_acl.c linux-3.2.33-go/fs/zfs/zfs/zfs_acl.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_acl.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_acl.c 2012-11-16 23:25:34.350039322 +0100 +@@ -0,0 +1,2799 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. 
++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "fs/fs_subr.h" ++ ++#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE ++#define DENY ACE_ACCESS_DENIED_ACE_TYPE ++#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE ++#define MIN_ACE_TYPE ALLOW ++ ++#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) ++#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ ++ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) ++#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ ++ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) ++#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ ++ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) ++ ++#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ ++ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ ++ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ ++ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) ++ ++#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) ++#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ ++ ACE_DELETE|ACE_DELETE_CHILD) ++#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) ++ ++#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ++ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) ++ ++#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ++ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) ++ ++#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ ++ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) ++ ++#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) ++ ++#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ ++ ZFS_ACL_PROTECTED) ++ ++#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ ++ ZFS_ACL_OBJ_ACE) ++ ++#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) ++ ++static uint16_t ++zfs_ace_v0_get_type(void *acep) ++{ ++ return (((zfs_oldace_t *)acep)->z_type); ++} ++ ++static uint16_t ++zfs_ace_v0_get_flags(void *acep) ++{ ++ return (((zfs_oldace_t *)acep)->z_flags); ++} ++ ++static uint32_t ++zfs_ace_v0_get_mask(void *acep) ++{ ++ return (((zfs_oldace_t *)acep)->z_access_mask); ++} ++ ++static uint64_t ++zfs_ace_v0_get_who(void *acep) ++{ ++ return (((zfs_oldace_t *)acep)->z_fuid); ++} ++ ++static void ++zfs_ace_v0_set_type(void *acep, uint16_t type) ++{ ++ ((zfs_oldace_t *)acep)->z_type = type; ++} ++ ++static void ++zfs_ace_v0_set_flags(void *acep, uint16_t flags) ++{ ++ ((zfs_oldace_t *)acep)->z_flags = flags; ++} ++ ++static void ++zfs_ace_v0_set_mask(void *acep, uint32_t mask) ++{ ++ ((zfs_oldace_t *)acep)->z_access_mask = mask; ++} ++ ++static void ++zfs_ace_v0_set_who(void *acep, uint64_t who) ++{ ++ 
((zfs_oldace_t *)acep)->z_fuid = who; ++} ++ ++/*ARGSUSED*/ ++static size_t ++zfs_ace_v0_size(void *acep) ++{ ++ return (sizeof (zfs_oldace_t)); ++} ++ ++static size_t ++zfs_ace_v0_abstract_size(void) ++{ ++ return (sizeof (zfs_oldace_t)); ++} ++ ++static int ++zfs_ace_v0_mask_off(void) ++{ ++ return (offsetof(zfs_oldace_t, z_access_mask)); ++} ++ ++/*ARGSUSED*/ ++static int ++zfs_ace_v0_data(void *acep, void **datap) ++{ ++ *datap = NULL; ++ return (0); ++} ++ ++static acl_ops_t zfs_acl_v0_ops = { ++ zfs_ace_v0_get_mask, ++ zfs_ace_v0_set_mask, ++ zfs_ace_v0_get_flags, ++ zfs_ace_v0_set_flags, ++ zfs_ace_v0_get_type, ++ zfs_ace_v0_set_type, ++ zfs_ace_v0_get_who, ++ zfs_ace_v0_set_who, ++ zfs_ace_v0_size, ++ zfs_ace_v0_abstract_size, ++ zfs_ace_v0_mask_off, ++ zfs_ace_v0_data ++}; ++ ++static uint16_t ++zfs_ace_fuid_get_type(void *acep) ++{ ++ return (((zfs_ace_hdr_t *)acep)->z_type); ++} ++ ++static uint16_t ++zfs_ace_fuid_get_flags(void *acep) ++{ ++ return (((zfs_ace_hdr_t *)acep)->z_flags); ++} ++ ++static uint32_t ++zfs_ace_fuid_get_mask(void *acep) ++{ ++ return (((zfs_ace_hdr_t *)acep)->z_access_mask); ++} ++ ++static uint64_t ++zfs_ace_fuid_get_who(void *args) ++{ ++ uint16_t entry_type; ++ zfs_ace_t *acep = args; ++ ++ entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; ++ ++ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || ++ entry_type == ACE_EVERYONE) ++ return (-1); ++ return (((zfs_ace_t *)acep)->z_fuid); ++} ++ ++static void ++zfs_ace_fuid_set_type(void *acep, uint16_t type) ++{ ++ ((zfs_ace_hdr_t *)acep)->z_type = type; ++} ++ ++static void ++zfs_ace_fuid_set_flags(void *acep, uint16_t flags) ++{ ++ ((zfs_ace_hdr_t *)acep)->z_flags = flags; ++} ++ ++static void ++zfs_ace_fuid_set_mask(void *acep, uint32_t mask) ++{ ++ ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; ++} ++ ++static void ++zfs_ace_fuid_set_who(void *arg, uint64_t who) ++{ ++ zfs_ace_t *acep = arg; ++ ++ uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; ++ ++ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || ++ entry_type == ACE_EVERYONE) ++ return; ++ acep->z_fuid = who; ++} ++ ++static size_t ++zfs_ace_fuid_size(void *acep) ++{ ++ zfs_ace_hdr_t *zacep = acep; ++ uint16_t entry_type; ++ ++ switch (zacep->z_type) { ++ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: ++ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: ++ return (sizeof (zfs_object_ace_t)); ++ case ALLOW: ++ case DENY: ++ entry_type = ++ (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); ++ if (entry_type == ACE_OWNER || ++ entry_type == OWNING_GROUP || ++ entry_type == ACE_EVERYONE) ++ return (sizeof (zfs_ace_hdr_t)); ++ /*FALLTHROUGH*/ ++ default: ++ return (sizeof (zfs_ace_t)); ++ } ++} ++ ++static size_t ++zfs_ace_fuid_abstract_size(void) ++{ ++ return (sizeof (zfs_ace_hdr_t)); ++} ++ ++static int ++zfs_ace_fuid_mask_off(void) ++{ ++ return (offsetof(zfs_ace_hdr_t, z_access_mask)); ++} ++ ++static int ++zfs_ace_fuid_data(void *acep, void **datap) ++{ ++ zfs_ace_t *zacep = acep; ++ zfs_object_ace_t *zobjp; ++ ++ switch (zacep->z_hdr.z_type) { ++ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: ++ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: ++ zobjp = acep; ++ *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); ++ return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); ++ default: ++ *datap = NULL; ++ return (0); ++ } ++} ++ ++static acl_ops_t zfs_acl_fuid_ops = { ++ 
zfs_ace_fuid_get_mask, ++ zfs_ace_fuid_set_mask, ++ zfs_ace_fuid_get_flags, ++ zfs_ace_fuid_set_flags, ++ zfs_ace_fuid_get_type, ++ zfs_ace_fuid_set_type, ++ zfs_ace_fuid_get_who, ++ zfs_ace_fuid_set_who, ++ zfs_ace_fuid_size, ++ zfs_ace_fuid_abstract_size, ++ zfs_ace_fuid_mask_off, ++ zfs_ace_fuid_data ++}; ++ ++/* ++ * The following three functions are provided for compatibility with ++ * older ZPL version in order to determine if the file use to have ++ * an external ACL and what version of ACL previously existed on the ++ * file. Would really be nice to not need this, sigh. ++ */ ++uint64_t ++zfs_external_acl(znode_t *zp) ++{ ++ zfs_acl_phys_t acl_phys; ++ int error; ++ ++ if (zp->z_is_sa) ++ return (0); ++ ++ /* ++ * Need to deal with a potential ++ * race where zfs_sa_upgrade could cause ++ * z_isa_sa to change. ++ * ++ * If the lookup fails then the state of z_is_sa should have ++ * changed. ++ */ ++ ++ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(ZTOZSB(zp)), ++ &acl_phys, sizeof (acl_phys))) == 0) ++ return (acl_phys.z_acl_extern_obj); ++ else { ++ /* ++ * after upgrade the SA_ZPL_ZNODE_ACL should have been ++ * removed ++ */ ++ VERIFY(zp->z_is_sa && error == ENOENT); ++ return (0); ++ } ++} ++ ++/* ++ * Determine size of ACL in bytes ++ * ++ * This is more complicated than it should be since we have to deal ++ * with old external ACLs. ++ */ ++static int ++zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, ++ zfs_acl_phys_t *aclphys) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ uint64_t acl_count; ++ int size; ++ int error; ++ ++ ASSERT(MUTEX_HELD(&zp->z_acl_lock)); ++ if (zp->z_is_sa) { ++ if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zsb), ++ &size)) != 0) ++ return (error); ++ *aclsize = size; ++ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zsb), ++ &acl_count, sizeof (acl_count))) != 0) ++ return (error); ++ *aclcount = acl_count; ++ } else { ++ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zsb), ++ aclphys, sizeof (*aclphys))) != 0) ++ return (error); ++ ++ if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { ++ *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); ++ *aclcount = aclphys->z_acl_size; ++ } else { ++ *aclsize = aclphys->z_acl_size; ++ *aclcount = aclphys->z_acl_count; ++ } ++ } ++ return (0); ++} ++ ++int ++zfs_znode_acl_version(znode_t *zp) ++{ ++ zfs_acl_phys_t acl_phys; ++ ++ if (zp->z_is_sa) ++ return (ZFS_ACL_VERSION_FUID); ++ else { ++ int error; ++ ++ /* ++ * Need to deal with a potential ++ * race where zfs_sa_upgrade could cause ++ * z_isa_sa to change. ++ * ++ * If the lookup fails then the state of z_is_sa should have ++ * changed. ++ */ ++ if ((error = sa_lookup(zp->z_sa_hdl, ++ SA_ZPL_ZNODE_ACL(ZTOZSB(zp)), ++ &acl_phys, sizeof (acl_phys))) == 0) ++ return (acl_phys.z_acl_version); ++ else { ++ /* ++ * After upgrade SA_ZPL_ZNODE_ACL should have ++ * been removed. 
++ */ ++ VERIFY(zp->z_is_sa && error == ENOENT); ++ return (ZFS_ACL_VERSION_FUID); ++ } ++ } ++} ++ ++static int ++zfs_acl_version(int version) ++{ ++ if (version < ZPL_VERSION_FUID) ++ return (ZFS_ACL_VERSION_INITIAL); ++ else ++ return (ZFS_ACL_VERSION_FUID); ++} ++ ++static int ++zfs_acl_version_zp(znode_t *zp) ++{ ++ return (zfs_acl_version(ZTOZSB(zp)->z_version)); ++} ++ ++zfs_acl_t * ++zfs_acl_alloc(int vers) ++{ ++ zfs_acl_t *aclp; ++ ++ aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_PUSHPAGE); ++ list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), ++ offsetof(zfs_acl_node_t, z_next)); ++ aclp->z_version = vers; ++ if (vers == ZFS_ACL_VERSION_FUID) ++ aclp->z_ops = &zfs_acl_fuid_ops; ++ else ++ aclp->z_ops = &zfs_acl_v0_ops; ++ return (aclp); ++} ++ ++zfs_acl_node_t * ++zfs_acl_node_alloc(size_t bytes) ++{ ++ zfs_acl_node_t *aclnode; ++ ++ aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_PUSHPAGE); ++ if (bytes) { ++ aclnode->z_acldata = kmem_alloc(bytes, KM_PUSHPAGE); ++ aclnode->z_allocdata = aclnode->z_acldata; ++ aclnode->z_allocsize = bytes; ++ aclnode->z_size = bytes; ++ } ++ ++ return (aclnode); ++} ++ ++static void ++zfs_acl_node_free(zfs_acl_node_t *aclnode) ++{ ++ if (aclnode->z_allocsize) ++ kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); ++ kmem_free(aclnode, sizeof (zfs_acl_node_t)); ++} ++ ++static void ++zfs_acl_release_nodes(zfs_acl_t *aclp) ++{ ++ zfs_acl_node_t *aclnode; ++ ++ while ((aclnode = list_head(&aclp->z_acl))) { ++ list_remove(&aclp->z_acl, aclnode); ++ zfs_acl_node_free(aclnode); ++ } ++ aclp->z_acl_count = 0; ++ aclp->z_acl_bytes = 0; ++} ++ ++void ++zfs_acl_free(zfs_acl_t *aclp) ++{ ++ zfs_acl_release_nodes(aclp); ++ list_destroy(&aclp->z_acl); ++ kmem_free(aclp, sizeof (zfs_acl_t)); ++} ++ ++static boolean_t ++zfs_acl_valid_ace_type(uint_t type, uint_t flags) ++{ ++ uint16_t entry_type; ++ ++ switch (type) { ++ case ALLOW: ++ case DENY: ++ case ACE_SYSTEM_AUDIT_ACE_TYPE: ++ case ACE_SYSTEM_ALARM_ACE_TYPE: ++ entry_type = flags & ACE_TYPE_FLAGS; ++ return (entry_type == ACE_OWNER || ++ entry_type == OWNING_GROUP || ++ entry_type == ACE_EVERYONE || entry_type == 0 || ++ entry_type == ACE_IDENTIFIER_GROUP); ++ default: ++ if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) ++ return (B_TRUE); ++ } ++ return (B_FALSE); ++} ++ ++static boolean_t ++zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) ++{ ++ /* ++ * first check type of entry ++ */ ++ ++ if (!zfs_acl_valid_ace_type(type, iflags)) ++ return (B_FALSE); ++ ++ switch (type) { ++ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: ++ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: ++ if (aclp->z_version < ZFS_ACL_VERSION_FUID) ++ return (B_FALSE); ++ aclp->z_hints |= ZFS_ACL_OBJ_ACE; ++ } ++ ++ /* ++ * next check inheritance level flags ++ */ ++ ++ if (S_ISDIR(obj_mode) && ++ (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) ++ aclp->z_hints |= ZFS_INHERIT_ACE; ++ ++ if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { ++ if ((iflags & (ACE_FILE_INHERIT_ACE| ++ ACE_DIRECTORY_INHERIT_ACE)) == 0) { ++ return (B_FALSE); ++ } ++ } ++ ++ return (B_TRUE); ++} ++ ++static void * ++zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, ++ uint32_t *access_mask, uint16_t *iflags, uint16_t *type) ++{ ++ zfs_acl_node_t *aclnode; ++ ++ ASSERT(aclp); ++ ++ if (start == NULL) { ++ aclnode = list_head(&aclp->z_acl); ++ if (aclnode == NULL) ++ return (NULL); ++ ++ aclp->z_next_ace = 
aclnode->z_acldata; ++ aclp->z_curr_node = aclnode; ++ aclnode->z_ace_idx = 0; ++ } ++ ++ aclnode = aclp->z_curr_node; ++ ++ if (aclnode == NULL) ++ return (NULL); ++ ++ if (aclnode->z_ace_idx >= aclnode->z_ace_count) { ++ aclnode = list_next(&aclp->z_acl, aclnode); ++ if (aclnode == NULL) ++ return (NULL); ++ else { ++ aclp->z_curr_node = aclnode; ++ aclnode->z_ace_idx = 0; ++ aclp->z_next_ace = aclnode->z_acldata; ++ } ++ } ++ ++ if (aclnode->z_ace_idx < aclnode->z_ace_count) { ++ void *acep = aclp->z_next_ace; ++ size_t ace_size; ++ ++ /* ++ * Make sure we don't overstep our bounds ++ */ ++ ace_size = aclp->z_ops->ace_size(acep); ++ ++ if (((caddr_t)acep + ace_size) > ++ ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { ++ return (NULL); ++ } ++ ++ *iflags = aclp->z_ops->ace_flags_get(acep); ++ *type = aclp->z_ops->ace_type_get(acep); ++ *access_mask = aclp->z_ops->ace_mask_get(acep); ++ *who = aclp->z_ops->ace_who_get(acep); ++ aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; ++ aclnode->z_ace_idx++; ++ ++ return ((void *)acep); ++ } ++ return (NULL); ++} ++ ++/*ARGSUSED*/ ++static uint64_t ++zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, ++ uint16_t *flags, uint16_t *type, uint32_t *mask) ++{ ++ zfs_acl_t *aclp = datap; ++ zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; ++ uint64_t who; ++ ++ acep = zfs_acl_next_ace(aclp, acep, &who, mask, ++ flags, type); ++ return ((uint64_t)(uintptr_t)acep); ++} ++ ++/* ++ * Copy ACE to internal ZFS format. ++ * While processing the ACL each ACE will be validated for correctness. ++ * ACE FUIDs will be created later. ++ */ ++int ++zfs_copy_ace_2_fuid(zfs_sb_t *zsb, umode_t obj_mode, zfs_acl_t *aclp, ++ void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, ++ zfs_fuid_info_t **fuidp, cred_t *cr) ++{ ++ int i; ++ uint16_t entry_type; ++ zfs_ace_t *aceptr = z_acl; ++ ace_t *acep = datap; ++ zfs_object_ace_t *zobjacep; ++ ace_object_t *aceobjp; ++ ++ for (i = 0; i != aclcnt; i++) { ++ aceptr->z_hdr.z_access_mask = acep->a_access_mask; ++ aceptr->z_hdr.z_flags = acep->a_flags; ++ aceptr->z_hdr.z_type = acep->a_type; ++ entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; ++ if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && ++ entry_type != ACE_EVERYONE) { ++ aceptr->z_fuid = zfs_fuid_create(zsb, acep->a_who, ++ cr, (entry_type == 0) ? 
++ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); ++ } ++ ++ /* ++ * Make sure ACE is valid ++ */ ++ if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type, ++ aceptr->z_hdr.z_flags) != B_TRUE) ++ return (EINVAL); ++ ++ switch (acep->a_type) { ++ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: ++ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: ++ zobjacep = (zfs_object_ace_t *)aceptr; ++ aceobjp = (ace_object_t *)acep; ++ ++ bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, ++ sizeof (aceobjp->a_obj_type)); ++ bcopy(aceobjp->a_inherit_obj_type, ++ zobjacep->z_inherit_type, ++ sizeof (aceobjp->a_inherit_obj_type)); ++ acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); ++ break; ++ default: ++ acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); ++ } ++ ++ aceptr = (zfs_ace_t *)((caddr_t)aceptr + ++ aclp->z_ops->ace_size(aceptr)); ++ } ++ ++ *size = (caddr_t)aceptr - (caddr_t)z_acl; ++ ++ return (0); ++} ++ ++/* ++ * Copy ZFS ACEs to fixed size ace_t layout ++ */ ++static void ++zfs_copy_fuid_2_ace(zfs_sb_t *zsb, zfs_acl_t *aclp, cred_t *cr, ++ void *datap, int filter) ++{ ++ uint64_t who; ++ uint32_t access_mask; ++ uint16_t iflags, type; ++ zfs_ace_hdr_t *zacep = NULL; ++ ace_t *acep = datap; ++ ace_object_t *objacep; ++ zfs_object_ace_t *zobjacep; ++ size_t ace_size; ++ uint16_t entry_type; ++ ++ while ((zacep = zfs_acl_next_ace(aclp, zacep, ++ &who, &access_mask, &iflags, &type))) { ++ ++ switch (type) { ++ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: ++ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: ++ if (filter) { ++ continue; ++ } ++ zobjacep = (zfs_object_ace_t *)zacep; ++ objacep = (ace_object_t *)acep; ++ bcopy(zobjacep->z_object_type, ++ objacep->a_obj_type, ++ sizeof (zobjacep->z_object_type)); ++ bcopy(zobjacep->z_inherit_type, ++ objacep->a_inherit_obj_type, ++ sizeof (zobjacep->z_inherit_type)); ++ ace_size = sizeof (ace_object_t); ++ break; ++ default: ++ ace_size = sizeof (ace_t); ++ break; ++ } ++ ++ entry_type = (iflags & ACE_TYPE_FLAGS); ++ if ((entry_type != ACE_OWNER && ++ entry_type != OWNING_GROUP && ++ entry_type != ACE_EVERYONE)) { ++ acep->a_who = zfs_fuid_map_id(zsb, who, ++ cr, (entry_type & ACE_IDENTIFIER_GROUP) ? 
++ ZFS_ACE_GROUP : ZFS_ACE_USER); ++ } else { ++ acep->a_who = (uid_t)(int64_t)who; ++ } ++ acep->a_access_mask = access_mask; ++ acep->a_flags = iflags; ++ acep->a_type = type; ++ acep = (ace_t *)((caddr_t)acep + ace_size); ++ } ++} ++ ++static int ++zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep, ++ zfs_oldace_t *z_acl, int aclcnt, size_t *size) ++{ ++ int i; ++ zfs_oldace_t *aceptr = z_acl; ++ ++ for (i = 0; i != aclcnt; i++, aceptr++) { ++ aceptr->z_access_mask = acep[i].a_access_mask; ++ aceptr->z_type = acep[i].a_type; ++ aceptr->z_flags = acep[i].a_flags; ++ aceptr->z_fuid = acep[i].a_who; ++ /* ++ * Make sure ACE is valid ++ */ ++ if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type, ++ aceptr->z_flags) != B_TRUE) ++ return (EINVAL); ++ } ++ *size = (caddr_t)aceptr - (caddr_t)z_acl; ++ return (0); ++} ++ ++/* ++ * convert old ACL format to new ++ */ ++void ++zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) ++{ ++ zfs_oldace_t *oldaclp; ++ int i; ++ uint16_t type, iflags; ++ uint32_t access_mask; ++ uint64_t who; ++ void *cookie = NULL; ++ zfs_acl_node_t *newaclnode; ++ ++ ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); ++ /* ++ * First create the ACE in a contiguous piece of memory ++ * for zfs_copy_ace_2_fuid(). ++ * ++ * We only convert an ACL once, so this won't happen ++ * everytime. ++ */ ++ oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, ++ KM_SLEEP); ++ i = 0; ++ while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, ++ &access_mask, &iflags, &type))) { ++ oldaclp[i].z_flags = iflags; ++ oldaclp[i].z_type = type; ++ oldaclp[i].z_fuid = who; ++ oldaclp[i++].z_access_mask = access_mask; ++ } ++ ++ newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * ++ sizeof (zfs_object_ace_t)); ++ aclp->z_ops = &zfs_acl_fuid_ops; ++ VERIFY(zfs_copy_ace_2_fuid(ZTOZSB(zp), ZTOI(zp)->i_mode, ++ aclp, oldaclp, newaclnode->z_acldata, aclp->z_acl_count, ++ &newaclnode->z_size, NULL, cr) == 0); ++ newaclnode->z_ace_count = aclp->z_acl_count; ++ aclp->z_version = ZFS_ACL_VERSION; ++ kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); ++ ++ /* ++ * Release all previous ACL nodes ++ */ ++ ++ zfs_acl_release_nodes(aclp); ++ ++ list_insert_head(&aclp->z_acl, newaclnode); ++ ++ aclp->z_acl_bytes = newaclnode->z_size; ++ aclp->z_acl_count = newaclnode->z_ace_count; ++ ++} ++ ++/* ++ * Convert unix access mask to v4 access mask ++ */ ++static uint32_t ++zfs_unix_to_v4(uint32_t access_mask) ++{ ++ uint32_t new_mask = 0; ++ ++ if (access_mask & S_IXOTH) ++ new_mask |= ACE_EXECUTE; ++ if (access_mask & S_IWOTH) ++ new_mask |= ACE_WRITE_DATA; ++ if (access_mask & S_IROTH) ++ new_mask |= ACE_READ_DATA; ++ return (new_mask); ++} ++ ++static void ++zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, ++ uint16_t access_type, uint64_t fuid, uint16_t entry_type) ++{ ++ uint16_t type = entry_type & ACE_TYPE_FLAGS; ++ ++ aclp->z_ops->ace_mask_set(acep, access_mask); ++ aclp->z_ops->ace_type_set(acep, access_type); ++ aclp->z_ops->ace_flags_set(acep, entry_type); ++ if ((type != ACE_OWNER && type != OWNING_GROUP && ++ type != ACE_EVERYONE)) ++ aclp->z_ops->ace_who_set(acep, fuid); ++} ++ ++/* ++ * Determine mode of file based on ACL. 
++ * Also, create FUIDs for any User/Group ACEs ++ */ ++uint64_t ++zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, ++ uint64_t *pflags, uint64_t fuid, uint64_t fgid) ++{ ++ int entry_type; ++ mode_t mode; ++ mode_t seen = 0; ++ zfs_ace_hdr_t *acep = NULL; ++ uint64_t who; ++ uint16_t iflags, type; ++ uint32_t access_mask; ++ boolean_t an_exec_denied = B_FALSE; ++ ++ mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); ++ ++ while ((acep = zfs_acl_next_ace(aclp, acep, &who, ++ &access_mask, &iflags, &type))) { ++ ++ if (!zfs_acl_valid_ace_type(type, iflags)) ++ continue; ++ ++ entry_type = (iflags & ACE_TYPE_FLAGS); ++ ++ /* ++ * Skip over owner@, group@ or everyone@ inherit only ACEs ++ */ ++ if ((iflags & ACE_INHERIT_ONLY_ACE) && ++ (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || ++ entry_type == OWNING_GROUP)) ++ continue; ++ ++ if (entry_type == ACE_OWNER || (entry_type == 0 && ++ who == fuid)) { ++ if ((access_mask & ACE_READ_DATA) && ++ (!(seen & S_IRUSR))) { ++ seen |= S_IRUSR; ++ if (type == ALLOW) { ++ mode |= S_IRUSR; ++ } ++ } ++ if ((access_mask & ACE_WRITE_DATA) && ++ (!(seen & S_IWUSR))) { ++ seen |= S_IWUSR; ++ if (type == ALLOW) { ++ mode |= S_IWUSR; ++ } ++ } ++ if ((access_mask & ACE_EXECUTE) && ++ (!(seen & S_IXUSR))) { ++ seen |= S_IXUSR; ++ if (type == ALLOW) { ++ mode |= S_IXUSR; ++ } ++ } ++ } else if (entry_type == OWNING_GROUP || ++ (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { ++ if ((access_mask & ACE_READ_DATA) && ++ (!(seen & S_IRGRP))) { ++ seen |= S_IRGRP; ++ if (type == ALLOW) { ++ mode |= S_IRGRP; ++ } ++ } ++ if ((access_mask & ACE_WRITE_DATA) && ++ (!(seen & S_IWGRP))) { ++ seen |= S_IWGRP; ++ if (type == ALLOW) { ++ mode |= S_IWGRP; ++ } ++ } ++ if ((access_mask & ACE_EXECUTE) && ++ (!(seen & S_IXGRP))) { ++ seen |= S_IXGRP; ++ if (type == ALLOW) { ++ mode |= S_IXGRP; ++ } ++ } ++ } else if (entry_type == ACE_EVERYONE) { ++ if ((access_mask & ACE_READ_DATA)) { ++ if (!(seen & S_IRUSR)) { ++ seen |= S_IRUSR; ++ if (type == ALLOW) { ++ mode |= S_IRUSR; ++ } ++ } ++ if (!(seen & S_IRGRP)) { ++ seen |= S_IRGRP; ++ if (type == ALLOW) { ++ mode |= S_IRGRP; ++ } ++ } ++ if (!(seen & S_IROTH)) { ++ seen |= S_IROTH; ++ if (type == ALLOW) { ++ mode |= S_IROTH; ++ } ++ } ++ } ++ if ((access_mask & ACE_WRITE_DATA)) { ++ if (!(seen & S_IWUSR)) { ++ seen |= S_IWUSR; ++ if (type == ALLOW) { ++ mode |= S_IWUSR; ++ } ++ } ++ if (!(seen & S_IWGRP)) { ++ seen |= S_IWGRP; ++ if (type == ALLOW) { ++ mode |= S_IWGRP; ++ } ++ } ++ if (!(seen & S_IWOTH)) { ++ seen |= S_IWOTH; ++ if (type == ALLOW) { ++ mode |= S_IWOTH; ++ } ++ } ++ } ++ if ((access_mask & ACE_EXECUTE)) { ++ if (!(seen & S_IXUSR)) { ++ seen |= S_IXUSR; ++ if (type == ALLOW) { ++ mode |= S_IXUSR; ++ } ++ } ++ if (!(seen & S_IXGRP)) { ++ seen |= S_IXGRP; ++ if (type == ALLOW) { ++ mode |= S_IXGRP; ++ } ++ } ++ if (!(seen & S_IXOTH)) { ++ seen |= S_IXOTH; ++ if (type == ALLOW) { ++ mode |= S_IXOTH; ++ } ++ } ++ } ++ } else { ++ /* ++ * Only care if this IDENTIFIER_GROUP or ++ * USER ACE denies execute access to someone, ++ * mode is not affected ++ */ ++ if ((access_mask & ACE_EXECUTE) && type == DENY) ++ an_exec_denied = B_TRUE; ++ } ++ } ++ ++ /* ++ * Failure to allow is effectively a deny, so execute permission ++ * is denied if it was never mentioned or if we explicitly ++ * weren't allowed it. 
++ */ ++ if (!an_exec_denied && ++ ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || ++ (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) ++ an_exec_denied = B_TRUE; ++ ++ if (an_exec_denied) ++ *pflags &= ~ZFS_NO_EXECS_DENIED; ++ else ++ *pflags |= ZFS_NO_EXECS_DENIED; ++ ++ return (mode); ++} ++ ++/* ++ * Read an external acl object. If the intent is to modify, always ++ * create a new acl and leave any cached acl in place. ++ */ ++static int ++zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, ++ boolean_t will_modify) ++{ ++ zfs_acl_t *aclp; ++ int aclsize; ++ int acl_count; ++ zfs_acl_node_t *aclnode; ++ zfs_acl_phys_t znode_acl; ++ int version; ++ int error; ++ boolean_t drop_lock = B_FALSE; ++ ++ ASSERT(MUTEX_HELD(&zp->z_acl_lock)); ++ ++ if (zp->z_acl_cached && !will_modify) { ++ *aclpp = zp->z_acl_cached; ++ return (0); ++ } ++ ++ /* ++ * close race where znode could be upgrade while trying to ++ * read the znode attributes. ++ * ++ * But this could only happen if the file isn't already an SA ++ * znode ++ */ ++ if (!zp->z_is_sa && !have_lock) { ++ mutex_enter(&zp->z_lock); ++ drop_lock = B_TRUE; ++ } ++ version = zfs_znode_acl_version(zp); ++ ++ if ((error = zfs_acl_znode_info(zp, &aclsize, ++ &acl_count, &znode_acl)) != 0) { ++ goto done; ++ } ++ ++ aclp = zfs_acl_alloc(version); ++ ++ aclp->z_acl_count = acl_count; ++ aclp->z_acl_bytes = aclsize; ++ ++ aclnode = zfs_acl_node_alloc(aclsize); ++ aclnode->z_ace_count = aclp->z_acl_count; ++ aclnode->z_size = aclsize; ++ ++ if (!zp->z_is_sa) { ++ if (znode_acl.z_acl_extern_obj) { ++ error = dmu_read(ZTOZSB(zp)->z_os, ++ znode_acl.z_acl_extern_obj, 0, aclnode->z_size, ++ aclnode->z_acldata, DMU_READ_PREFETCH); ++ } else { ++ bcopy(znode_acl.z_ace_data, aclnode->z_acldata, ++ aclnode->z_size); ++ } ++ } else { ++ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(ZTOZSB(zp)), ++ aclnode->z_acldata, aclnode->z_size); ++ } ++ ++ if (error != 0) { ++ zfs_acl_free(aclp); ++ zfs_acl_node_free(aclnode); ++ /* convert checksum errors into IO errors */ ++ if (error == ECKSUM) ++ error = EIO; ++ goto done; ++ } ++ ++ list_insert_head(&aclp->z_acl, aclnode); ++ ++ *aclpp = aclp; ++ if (!will_modify) ++ zp->z_acl_cached = aclp; ++done: ++ if (drop_lock) ++ mutex_exit(&zp->z_lock); ++ return (error); ++} ++ ++/*ARGSUSED*/ ++void ++zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, ++ boolean_t start, void *userdata) ++{ ++ zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; ++ ++ if (start) { ++ cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); ++ } else { ++ cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, ++ cb->cb_acl_node); ++ } ++ *dataptr = cb->cb_acl_node->z_acldata; ++ *length = cb->cb_acl_node->z_size; ++} ++ ++int ++zfs_acl_chown_setattr(znode_t *zp) ++{ ++ int error; ++ zfs_acl_t *aclp; ++ ++ ASSERT(MUTEX_HELD(&zp->z_lock)); ++ ASSERT(MUTEX_HELD(&zp->z_acl_lock)); ++ ++ if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0) ++ zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, ++ &zp->z_pflags, zp->z_uid, zp->z_gid); ++ return (error); ++} ++ ++static void ++acl_trivial_access_masks(mode_t mode, uint32_t *allow0, uint32_t *deny1, ++ uint32_t *deny2, uint32_t *owner, uint32_t *group, uint32_t *everyone) ++{ ++ *deny1 = *deny2 = *allow0 = *group = 0; ++ ++ if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) ++ *deny1 |= ACE_READ_DATA; ++ if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) ++ *deny1 |= ACE_WRITE_DATA; ++ if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) ++ *deny1 |= 
ACE_EXECUTE; ++ ++ if (!(mode & S_IRGRP) && (mode & S_IROTH)) ++ *deny2 = ACE_READ_DATA; ++ if (!(mode & S_IWGRP) && (mode & S_IWOTH)) ++ *deny2 |= ACE_WRITE_DATA; ++ if (!(mode & S_IXGRP) && (mode & S_IXOTH)) ++ *deny2 |= ACE_EXECUTE; ++ ++ if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) ++ *allow0 |= ACE_READ_DATA; ++ if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) ++ *allow0 |= ACE_WRITE_DATA; ++ if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) ++ *allow0 |= ACE_EXECUTE; ++ ++ *owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| ++ ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| ++ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; ++ if (mode & S_IRUSR) ++ *owner |= ACE_READ_DATA; ++ if (mode & S_IWUSR) ++ *owner |= ACE_WRITE_DATA|ACE_APPEND_DATA; ++ if (mode & S_IXUSR) ++ *owner |= ACE_EXECUTE; ++ ++ *group = ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS| ++ ACE_SYNCHRONIZE; ++ if (mode & S_IRGRP) ++ *group |= ACE_READ_DATA; ++ if (mode & S_IWGRP) ++ *group |= ACE_WRITE_DATA|ACE_APPEND_DATA; ++ if (mode & S_IXGRP) ++ *group |= ACE_EXECUTE; ++ ++ *everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS| ++ ACE_SYNCHRONIZE; ++ if (mode & S_IROTH) ++ *everyone |= ACE_READ_DATA; ++ if (mode & S_IWOTH) ++ *everyone |= ACE_WRITE_DATA|ACE_APPEND_DATA; ++ if (mode & S_IXOTH) ++ *everyone |= ACE_EXECUTE; ++} ++ ++/* ++ * ace_trivial: ++ * determine whether an ace_t acl is trivial ++ * ++ * Trivialness implies that the acl is composed of only ++ * owner, group, everyone entries. ACL can't ++ * have read_acl denied, and write_owner/write_acl/write_attributes ++ * can only be owner@ entry. ++ */ ++static int ++ace_trivial_common(void *acep, int aclcnt, ++ uint64_t (*walk)(void *, uint64_t, int aclcnt, ++ uint16_t *, uint16_t *, uint32_t *)) ++{ ++ uint16_t flags; ++ uint32_t mask; ++ uint16_t type; ++ uint64_t cookie = 0; ++ ++ while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) { ++ switch (flags & ACE_TYPE_FLAGS) { ++ case ACE_OWNER: ++ case ACE_GROUP|ACE_IDENTIFIER_GROUP: ++ case ACE_EVERYONE: ++ break; ++ default: ++ return (1); ++ } ++ ++ if (flags & (ACE_FILE_INHERIT_ACE| ++ ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| ++ ACE_INHERIT_ONLY_ACE)) ++ return (1); ++ ++ /* ++ * Special check for some special bits ++ * ++ * Don't allow anybody to deny reading basic ++ * attributes or a files ACL. ++ */ ++ if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && ++ (type == ACE_ACCESS_DENIED_ACE_TYPE)) ++ return (1); ++ ++ /* ++ * Delete permissions are never set by default ++ */ ++ if (mask & (ACE_DELETE|ACE_DELETE_CHILD)) ++ return (1); ++ /* ++ * only allow owner@ to have ++ * write_acl/write_owner/write_attributes/write_xattr/ ++ */ ++ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && ++ (!(flags & ACE_OWNER) && (mask & ++ (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| ++ ACE_WRITE_NAMED_ATTRS)))) ++ return (1); ++ ++ } ++ ++ return (0); ++} ++ ++/* ++ * common code for setting ACLs. ++ * ++ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. ++ * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's ++ * already checked the acl and knows whether to inherit. 
++ */ ++int ++zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) ++{ ++ int error; ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ dmu_object_type_t otype; ++ zfs_acl_locator_cb_t locate = { 0 }; ++ uint64_t mode; ++ sa_bulk_attr_t bulk[5]; ++ uint64_t ctime[2]; ++ int count = 0; ++ ++ mode = zp->z_mode; ++ ++ mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, ++ zp->z_uid, zp->z_gid); ++ ++ zp->z_mode = mode; ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb), NULL, ++ &mode, sizeof (mode)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL, ++ &zp->z_pflags, sizeof (zp->z_pflags)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, ++ &ctime, sizeof (ctime)); ++ ++ if (zp->z_acl_cached) { ++ zfs_acl_free(zp->z_acl_cached); ++ zp->z_acl_cached = NULL; ++ } ++ ++ /* ++ * Upgrade needed? ++ */ ++ if (!zsb->z_use_fuids) { ++ otype = DMU_OT_OLDACL; ++ } else { ++ if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && ++ (zsb->z_version >= ZPL_VERSION_FUID)) ++ zfs_acl_xform(zp, aclp, cr); ++ ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); ++ otype = DMU_OT_ACL; ++ } ++ ++ /* ++ * Arrgh, we have to handle old on disk format ++ * as well as newer (preferred) SA format. ++ */ ++ ++ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ ++ locate.cb_aclp = aclp; ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zsb), ++ zfs_acl_data_locator, &locate, aclp->z_acl_bytes); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zsb), ++ NULL, &aclp->z_acl_count, sizeof (uint64_t)); ++ } else { /* Painful legacy way */ ++ zfs_acl_node_t *aclnode; ++ uint64_t off = 0; ++ zfs_acl_phys_t acl_phys; ++ uint64_t aoid; ++ ++ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zsb), ++ &acl_phys, sizeof (acl_phys))) != 0) ++ return (error); ++ ++ aoid = acl_phys.z_acl_extern_obj; ++ ++ if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { ++ /* ++ * If ACL was previously external and we are now ++ * converting to new ACL format then release old ++ * ACL object and create a new one. ++ */ ++ if (aoid && ++ aclp->z_version != acl_phys.z_acl_version) { ++ error = dmu_object_free(zsb->z_os, aoid, tx); ++ if (error) ++ return (error); ++ aoid = 0; ++ } ++ if (aoid == 0) { ++ aoid = dmu_object_alloc(zsb->z_os, ++ otype, aclp->z_acl_bytes, ++ otype == DMU_OT_ACL ? ++ DMU_OT_SYSACL : DMU_OT_NONE, ++ otype == DMU_OT_ACL ? ++ DN_MAX_BONUSLEN : 0, tx); ++ } else { ++ (void) dmu_object_set_blocksize(zsb->z_os, ++ aoid, aclp->z_acl_bytes, 0, tx); ++ } ++ acl_phys.z_acl_extern_obj = aoid; ++ for (aclnode = list_head(&aclp->z_acl); aclnode; ++ aclnode = list_next(&aclp->z_acl, aclnode)) { ++ if (aclnode->z_ace_count == 0) ++ continue; ++ dmu_write(zsb->z_os, aoid, off, ++ aclnode->z_size, aclnode->z_acldata, tx); ++ off += aclnode->z_size; ++ } ++ } else { ++ void *start = acl_phys.z_ace_data; ++ /* ++ * Migrating back embedded? ++ */ ++ if (acl_phys.z_acl_extern_obj) { ++ error = dmu_object_free(zsb->z_os, ++ acl_phys.z_acl_extern_obj, tx); ++ if (error) ++ return (error); ++ acl_phys.z_acl_extern_obj = 0; ++ } ++ ++ for (aclnode = list_head(&aclp->z_acl); aclnode; ++ aclnode = list_next(&aclp->z_acl, aclnode)) { ++ if (aclnode->z_ace_count == 0) ++ continue; ++ bcopy(aclnode->z_acldata, start, ++ aclnode->z_size); ++ start = (caddr_t)start + aclnode->z_size; ++ } ++ } ++ /* ++ * If Old version then swap count/bytes to match old ++ * layout of znode_acl_phys_t. 
++ */ ++ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { ++ acl_phys.z_acl_size = aclp->z_acl_count; ++ acl_phys.z_acl_count = aclp->z_acl_bytes; ++ } else { ++ acl_phys.z_acl_size = aclp->z_acl_bytes; ++ acl_phys.z_acl_count = aclp->z_acl_count; ++ } ++ acl_phys.z_acl_version = aclp->z_version; ++ ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zsb), NULL, ++ &acl_phys, sizeof (acl_phys)); ++ } ++ ++ /* ++ * Replace ACL wide bits, but first clear them. ++ */ ++ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; ++ ++ zp->z_pflags |= aclp->z_hints; ++ ++ if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) ++ zp->z_pflags |= ZFS_ACL_TRIVIAL; ++ ++ zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE); ++ return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); ++} ++ ++static void ++zfs_acl_chmod(zfs_sb_t *zsb, uint64_t mode, zfs_acl_t *aclp) ++{ ++ void *acep = NULL; ++ uint64_t who; ++ int new_count, new_bytes; ++ int ace_size; ++ int entry_type; ++ uint16_t iflags, type; ++ uint32_t access_mask; ++ zfs_acl_node_t *newnode; ++ size_t abstract_size = aclp->z_ops->ace_abstract_size(); ++ void *zacep; ++ uint32_t owner, group, everyone; ++ uint32_t deny1, deny2, allow0; ++ ++ new_count = new_bytes = 0; ++ ++ acl_trivial_access_masks((mode_t)mode, &allow0, &deny1, &deny2, ++ &owner, &group, &everyone); ++ ++ newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); ++ ++ zacep = newnode->z_acldata; ++ if (allow0) { ++ zfs_set_ace(aclp, zacep, allow0, ALLOW, -1, ACE_OWNER); ++ zacep = (void *)((uintptr_t)zacep + abstract_size); ++ new_count++; ++ new_bytes += abstract_size; ++ } if (deny1) { ++ zfs_set_ace(aclp, zacep, deny1, DENY, -1, ACE_OWNER); ++ zacep = (void *)((uintptr_t)zacep + abstract_size); ++ new_count++; ++ new_bytes += abstract_size; ++ } ++ if (deny2) { ++ zfs_set_ace(aclp, zacep, deny2, DENY, -1, OWNING_GROUP); ++ zacep = (void *)((uintptr_t)zacep + abstract_size); ++ new_count++; ++ new_bytes += abstract_size; ++ } ++ ++ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, ++ &iflags, &type))) { ++ uint16_t inherit_flags; ++ ++ entry_type = (iflags & ACE_TYPE_FLAGS); ++ inherit_flags = (iflags & ALL_INHERIT); ++ ++ if ((entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || ++ (entry_type == OWNING_GROUP)) && ++ ((inherit_flags & ACE_INHERIT_ONLY_ACE) == 0)) { ++ continue; ++ } ++ ++ if ((type != ALLOW && type != DENY) || ++ (inherit_flags & ACE_INHERIT_ONLY_ACE)) { ++ if (inherit_flags) ++ aclp->z_hints |= ZFS_INHERIT_ACE; ++ switch (type) { ++ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: ++ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: ++ aclp->z_hints |= ZFS_ACL_OBJ_ACE; ++ break; ++ } ++ } else { ++ ++ /* ++ * Limit permissions to be no greater than ++ * group permissions ++ */ ++ if (zsb->z_acl_inherit == ZFS_ACL_RESTRICTED) { ++ if (!(mode & S_IRGRP)) ++ access_mask &= ~ACE_READ_DATA; ++ if (!(mode & S_IWGRP)) ++ access_mask &= ++ ~(ACE_WRITE_DATA|ACE_APPEND_DATA); ++ if (!(mode & S_IXGRP)) ++ access_mask &= ~ACE_EXECUTE; ++ access_mask &= ++ ~(ACE_WRITE_OWNER|ACE_WRITE_ACL| ++ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS); ++ } ++ } ++ zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); ++ ace_size = aclp->z_ops->ace_size(acep); ++ zacep = (void *)((uintptr_t)zacep + ace_size); ++ new_count++; ++ new_bytes += ace_size; ++ } ++ zfs_set_ace(aclp, zacep, owner, 0, -1, ACE_OWNER); ++ zacep = (void *)((uintptr_t)zacep + abstract_size); ++ zfs_set_ace(aclp, zacep, group, 0, 
-1, OWNING_GROUP); ++ zacep = (void *)((uintptr_t)zacep + abstract_size); ++ zfs_set_ace(aclp, zacep, everyone, 0, -1, ACE_EVERYONE); ++ ++ new_count += 3; ++ new_bytes += abstract_size * 3; ++ zfs_acl_release_nodes(aclp); ++ aclp->z_acl_count = new_count; ++ aclp->z_acl_bytes = new_bytes; ++ newnode->z_ace_count = new_count; ++ newnode->z_size = new_bytes; ++ list_insert_tail(&aclp->z_acl, newnode); ++} ++ ++void ++zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) ++{ ++ mutex_enter(&zp->z_acl_lock); ++ mutex_enter(&zp->z_lock); ++ *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); ++ (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; ++ zfs_acl_chmod(ZTOZSB(zp), mode, *aclp); ++ mutex_exit(&zp->z_lock); ++ mutex_exit(&zp->z_acl_lock); ++ ASSERT(*aclp); ++} ++ ++/* ++ * strip off write_owner and write_acl ++ */ ++static void ++zfs_restricted_update(zfs_sb_t *zsb, zfs_acl_t *aclp, void *acep) ++{ ++ uint32_t mask = aclp->z_ops->ace_mask_get(acep); ++ ++ if ((zsb->z_acl_inherit == ZFS_ACL_RESTRICTED) && ++ (aclp->z_ops->ace_type_get(acep) == ALLOW)) { ++ mask &= ~RESTRICTED_CLEAR; ++ aclp->z_ops->ace_mask_set(acep, mask); ++ } ++} ++ ++/* ++ * Should ACE be inherited? ++ */ ++static int ++zfs_ace_can_use(umode_t obj_mode, uint16_t acep_flags) ++{ ++ int iflags = (acep_flags & 0xf); ++ ++ if (S_ISDIR(obj_mode) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) ++ return (1); ++ else if (iflags & ACE_FILE_INHERIT_ACE) ++ return (!(S_ISDIR(obj_mode) && ++ (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); ++ return (0); ++} ++ ++/* ++ * inherit inheritable ACEs from parent ++ */ ++static zfs_acl_t * ++zfs_acl_inherit(zfs_sb_t *zsb, umode_t obj_mode, zfs_acl_t *paclp, ++ uint64_t mode, boolean_t *need_chmod) ++{ ++ void *pacep; ++ void *acep; ++ zfs_acl_node_t *aclnode; ++ zfs_acl_t *aclp = NULL; ++ uint64_t who; ++ uint32_t access_mask; ++ uint16_t iflags, newflags, type; ++ size_t ace_size; ++ void *data1, *data2; ++ size_t data1sz, data2sz; ++ boolean_t vdir = S_ISDIR(obj_mode); ++ boolean_t vreg = S_ISREG(obj_mode); ++ boolean_t passthrough, passthrough_x, noallow; ++ ++ passthrough_x = ++ zsb->z_acl_inherit == ZFS_ACL_PASSTHROUGH_X; ++ passthrough = passthrough_x || ++ zsb->z_acl_inherit == ZFS_ACL_PASSTHROUGH; ++ noallow = ++ zsb->z_acl_inherit == ZFS_ACL_NOALLOW; ++ ++ *need_chmod = B_TRUE; ++ pacep = NULL; ++ aclp = zfs_acl_alloc(paclp->z_version); ++ if (zsb->z_acl_inherit == ZFS_ACL_DISCARD || S_ISLNK(obj_mode)) ++ return (aclp); ++ while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, ++ &access_mask, &iflags, &type))) { ++ ++ /* ++ * don't inherit bogus ACEs ++ */ ++ if (!zfs_acl_valid_ace_type(type, iflags)) ++ continue; ++ ++ if (noallow && type == ALLOW) ++ continue; ++ ++ ace_size = aclp->z_ops->ace_size(pacep); ++ ++ if (!zfs_ace_can_use(obj_mode, iflags)) ++ continue; ++ ++ /* ++ * If owner@, group@, or everyone@ inheritable ++ * then zfs_acl_chmod() isn't needed. 
++ */ ++ if (passthrough && ++ ((iflags & (ACE_OWNER|ACE_EVERYONE)) || ++ ((iflags & OWNING_GROUP) == ++ OWNING_GROUP)) && (vreg || (vdir && (iflags & ++ ACE_DIRECTORY_INHERIT_ACE)))) { ++ *need_chmod = B_FALSE; ++ } ++ ++ if (!vdir && passthrough_x && ++ ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) { ++ access_mask &= ~ACE_EXECUTE; ++ } ++ ++ aclnode = zfs_acl_node_alloc(ace_size); ++ list_insert_tail(&aclp->z_acl, aclnode); ++ acep = aclnode->z_acldata; ++ ++ zfs_set_ace(aclp, acep, access_mask, type, ++ who, iflags|ACE_INHERITED_ACE); ++ ++ /* ++ * Copy special opaque data if any ++ */ ++ if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { ++ VERIFY((data2sz = aclp->z_ops->ace_data(acep, ++ &data2)) == data1sz); ++ bcopy(data1, data2, data2sz); ++ } ++ ++ aclp->z_acl_count++; ++ aclnode->z_ace_count++; ++ aclp->z_acl_bytes += aclnode->z_size; ++ newflags = aclp->z_ops->ace_flags_get(acep); ++ ++ if (vdir) ++ aclp->z_hints |= ZFS_INHERIT_ACE; ++ ++ if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) || !vdir) { ++ newflags &= ~ALL_INHERIT; ++ aclp->z_ops->ace_flags_set(acep, ++ newflags|ACE_INHERITED_ACE); ++ zfs_restricted_update(zsb, aclp, acep); ++ continue; ++ } ++ ++ ASSERT(vdir); ++ ++ /* ++ * If only FILE_INHERIT is set then turn on ++ * inherit_only ++ */ ++ if ((iflags & (ACE_FILE_INHERIT_ACE | ++ ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { ++ newflags |= ACE_INHERIT_ONLY_ACE; ++ aclp->z_ops->ace_flags_set(acep, ++ newflags|ACE_INHERITED_ACE); ++ } else { ++ newflags &= ~ACE_INHERIT_ONLY_ACE; ++ aclp->z_ops->ace_flags_set(acep, ++ newflags|ACE_INHERITED_ACE); ++ } ++ } ++ return (aclp); ++} ++ ++/* ++ * Create file system object initial permissions ++ * including inheritable ACEs. ++ */ ++int ++zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, ++ vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) ++{ ++ int error; ++ zfs_sb_t *zsb = ZTOZSB(dzp); ++ zfs_acl_t *paclp; ++#ifdef HAVE_KSID ++ gid_t gid; ++#endif /* HAVE_KSID */ ++ boolean_t need_chmod = B_TRUE; ++ boolean_t inherited = B_FALSE; ++ ++ bzero(acl_ids, sizeof (zfs_acl_ids_t)); ++ acl_ids->z_mode = vap->va_mode; ++ ++ if (vsecp) ++ if ((error = zfs_vsec_2_aclp(zsb, vap->va_mode, vsecp, ++ cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) ++ return (error); ++ ++ acl_ids->z_fuid = vap->va_uid; ++ acl_ids->z_fgid = vap->va_gid; ++#ifdef HAVE_KSID ++ /* ++ * Determine uid and gid. 
++ */ ++ if ((flag & IS_ROOT_NODE) || zsb->z_replay || ++ ((flag & IS_XATTR) && (S_ISDIR(vap->va_mode)))) { ++ acl_ids->z_fuid = zfs_fuid_create(zsb, (uint64_t)vap->va_uid, ++ cr, ZFS_OWNER, &acl_ids->z_fuidp); ++ acl_ids->z_fgid = zfs_fuid_create(zsb, (uint64_t)vap->va_gid, ++ cr, ZFS_GROUP, &acl_ids->z_fuidp); ++ gid = vap->va_gid; ++ } else { ++ acl_ids->z_fuid = zfs_fuid_create_cred(zsb, ZFS_OWNER, ++ cr, &acl_ids->z_fuidp); ++ acl_ids->z_fgid = 0; ++ if (vap->va_mask & AT_GID) { ++ acl_ids->z_fgid = zfs_fuid_create(zsb, ++ (uint64_t)vap->va_gid, ++ cr, ZFS_GROUP, &acl_ids->z_fuidp); ++ gid = vap->va_gid; ++ if (acl_ids->z_fgid != dzp->z_gid && ++ !groupmember(vap->va_gid, cr) && ++ secpolicy_vnode_create_gid(cr) != 0) ++ acl_ids->z_fgid = 0; ++ } ++ if (acl_ids->z_fgid == 0) { ++ if (dzp->z_mode & S_ISGID) { ++ char *domain; ++ uint32_t rid; ++ ++ acl_ids->z_fgid = dzp->z_gid; ++ gid = zfs_fuid_map_id(zsb, acl_ids->z_fgid, ++ cr, ZFS_GROUP); ++ ++ if (zsb->z_use_fuids && ++ IS_EPHEMERAL(acl_ids->z_fgid)) { ++ domain = zfs_fuid_idx_domain( ++ &zsb->z_fuid_idx, ++ FUID_INDEX(acl_ids->z_fgid)); ++ rid = FUID_RID(acl_ids->z_fgid); ++ zfs_fuid_node_add(&acl_ids->z_fuidp, ++ domain, rid, ++ FUID_INDEX(acl_ids->z_fgid), ++ acl_ids->z_fgid, ZFS_GROUP); ++ } ++ } else { ++ acl_ids->z_fgid = zfs_fuid_create_cred(zsb, ++ ZFS_GROUP, cr, &acl_ids->z_fuidp); ++ gid = crgetgid(cr); ++ } ++ } ++ } ++#endif /* HAVE_KSID */ ++ ++ /* ++ * If we're creating a directory, and the parent directory has the ++ * set-GID bit set, set in on the new directory. ++ * Otherwise, if the user is neither privileged nor a member of the ++ * file's new group, clear the file's set-GID bit. ++ */ ++ ++ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && ++ (S_ISDIR(vap->va_mode))) { ++ acl_ids->z_mode |= S_ISGID; ++ } else { ++ if ((acl_ids->z_mode & S_ISGID) && ++ secpolicy_vnode_setids_setgids(cr, gid) != 0) ++ acl_ids->z_mode &= ~S_ISGID; ++ } ++ ++ if (acl_ids->z_aclp == NULL) { ++ mutex_enter(&dzp->z_acl_lock); ++ mutex_enter(&dzp->z_lock); ++ if (!(flag & IS_ROOT_NODE) && (S_ISDIR(ZTOI(dzp)->i_mode) && ++ (dzp->z_pflags & ZFS_INHERIT_ACE)) && ++ !(dzp->z_pflags & ZFS_XATTR)) { ++ VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, ++ &paclp, B_FALSE)); ++ acl_ids->z_aclp = zfs_acl_inherit(zsb, ++ vap->va_mode, paclp, acl_ids->z_mode, &need_chmod); ++ inherited = B_TRUE; ++ } else { ++ acl_ids->z_aclp = ++ zfs_acl_alloc(zfs_acl_version_zp(dzp)); ++ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; ++ } ++ mutex_exit(&dzp->z_lock); ++ mutex_exit(&dzp->z_acl_lock); ++ if (need_chmod) { ++ acl_ids->z_aclp->z_hints |= S_ISDIR(vap->va_mode) ? 
++ ZFS_ACL_AUTO_INHERIT : 0; ++ zfs_acl_chmod(zsb, acl_ids->z_mode, acl_ids->z_aclp); ++ } ++ } ++ ++ if (inherited || vsecp) { ++ acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, ++ acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, ++ acl_ids->z_fuid, acl_ids->z_fgid); ++ if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) ++ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; ++ } ++ ++ return (0); ++} ++ ++/* ++ * Free ACL and fuid_infop, but not the acl_ids structure ++ */ ++void ++zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) ++{ ++ if (acl_ids->z_aclp) ++ zfs_acl_free(acl_ids->z_aclp); ++ if (acl_ids->z_fuidp) ++ zfs_fuid_info_free(acl_ids->z_fuidp); ++ acl_ids->z_aclp = NULL; ++ acl_ids->z_fuidp = NULL; ++} ++ ++boolean_t ++zfs_acl_ids_overquota(zfs_sb_t *zsb, zfs_acl_ids_t *acl_ids) ++{ ++ return (zfs_fuid_overquota(zsb, B_FALSE, acl_ids->z_fuid) || ++ zfs_fuid_overquota(zsb, B_TRUE, acl_ids->z_fgid)); ++} ++ ++/* ++ * Retrieve a files ACL ++ */ ++int ++zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) ++{ ++ zfs_acl_t *aclp; ++ ulong_t mask; ++ int error; ++ int count = 0; ++ int largeace = 0; ++ ++ mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | ++ VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); ++ ++ if (mask == 0) ++ return (ENOSYS); ++ ++ if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) ++ return (error); ++ ++ mutex_enter(&zp->z_acl_lock); ++ ++ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); ++ if (error != 0) { ++ mutex_exit(&zp->z_acl_lock); ++ return (error); ++ } ++ ++ /* ++ * Scan ACL to determine number of ACEs ++ */ ++ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { ++ void *zacep = NULL; ++ uint64_t who; ++ uint32_t access_mask; ++ uint16_t type, iflags; ++ ++ while ((zacep = zfs_acl_next_ace(aclp, zacep, ++ &who, &access_mask, &iflags, &type))) { ++ switch (type) { ++ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: ++ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: ++ largeace++; ++ continue; ++ default: ++ count++; ++ } ++ } ++ vsecp->vsa_aclcnt = count; ++ } else ++ count = (int)aclp->z_acl_count; ++ ++ if (mask & VSA_ACECNT) { ++ vsecp->vsa_aclcnt = count; ++ } ++ ++ if (mask & VSA_ACE) { ++ size_t aclsz; ++ ++ aclsz = count * sizeof (ace_t) + ++ sizeof (ace_object_t) * largeace; ++ ++ vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); ++ vsecp->vsa_aclentsz = aclsz; ++ ++ if (aclp->z_version == ZFS_ACL_VERSION_FUID) ++ zfs_copy_fuid_2_ace(ZTOZSB(zp), aclp, cr, ++ vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); ++ else { ++ zfs_acl_node_t *aclnode; ++ void *start = vsecp->vsa_aclentp; ++ ++ for (aclnode = list_head(&aclp->z_acl); aclnode; ++ aclnode = list_next(&aclp->z_acl, aclnode)) { ++ bcopy(aclnode->z_acldata, start, ++ aclnode->z_size); ++ start = (caddr_t)start + aclnode->z_size; ++ } ++ ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == ++ aclp->z_acl_bytes); ++ } ++ } ++ if (mask & VSA_ACE_ACLFLAGS) { ++ vsecp->vsa_aclflags = 0; ++ if (zp->z_pflags & ZFS_ACL_DEFAULTED) ++ vsecp->vsa_aclflags |= ACL_DEFAULTED; ++ if (zp->z_pflags & ZFS_ACL_PROTECTED) ++ vsecp->vsa_aclflags |= ACL_PROTECTED; ++ if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) ++ vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; ++ } ++ ++ mutex_exit(&zp->z_acl_lock); ++ ++ return (0); ++} ++ ++int ++zfs_vsec_2_aclp(zfs_sb_t *zsb, umode_t obj_mode, ++ vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) ++{ ++ zfs_acl_t *aclp; ++ zfs_acl_node_t *aclnode; 
++ int aclcnt = vsecp->vsa_aclcnt; ++ int error; ++ ++ if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) ++ return (EINVAL); ++ ++ aclp = zfs_acl_alloc(zfs_acl_version(zsb->z_version)); ++ ++ aclp->z_hints = 0; ++ aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); ++ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { ++ if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp, ++ (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, ++ aclcnt, &aclnode->z_size)) != 0) { ++ zfs_acl_free(aclp); ++ zfs_acl_node_free(aclnode); ++ return (error); ++ } ++ } else { ++ if ((error = zfs_copy_ace_2_fuid(zsb, obj_mode, aclp, ++ vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, ++ &aclnode->z_size, fuidp, cr)) != 0) { ++ zfs_acl_free(aclp); ++ zfs_acl_node_free(aclnode); ++ return (error); ++ } ++ } ++ aclp->z_acl_bytes = aclnode->z_size; ++ aclnode->z_ace_count = aclcnt; ++ aclp->z_acl_count = aclcnt; ++ list_insert_head(&aclp->z_acl, aclnode); ++ ++ /* ++ * If flags are being set then add them to z_hints ++ */ ++ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { ++ if (vsecp->vsa_aclflags & ACL_PROTECTED) ++ aclp->z_hints |= ZFS_ACL_PROTECTED; ++ if (vsecp->vsa_aclflags & ACL_DEFAULTED) ++ aclp->z_hints |= ZFS_ACL_DEFAULTED; ++ if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) ++ aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; ++ } ++ ++ *zaclp = aclp; ++ ++ return (0); ++} ++ ++/* ++ * Set a files ACL ++ */ ++int ++zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ zilog_t *zilog = zsb->z_log; ++ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); ++ dmu_tx_t *tx; ++ int error; ++ zfs_acl_t *aclp; ++ zfs_fuid_info_t *fuidp = NULL; ++ boolean_t fuid_dirtied; ++ uint64_t acl_obj; ++ ++ if (mask == 0) ++ return (ENOSYS); ++ ++ if (zp->z_pflags & ZFS_IMMUTABLE) ++ return (EPERM); ++ ++ if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) ++ return (error); ++ ++ error = zfs_vsec_2_aclp(zsb, ZTOI(zp)->i_mode, vsecp, cr, &fuidp, ++ &aclp); ++ if (error) ++ return (error); ++ ++ /* ++ * If ACL wide flags aren't being set then preserve any ++ * existing flags. 
++ */ ++ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { ++ aclp->z_hints |= ++ (zp->z_pflags & V4_ACL_WIDE_FLAGS); ++ } ++top: ++ mutex_enter(&zp->z_acl_lock); ++ mutex_enter(&zp->z_lock); ++ ++ tx = dmu_tx_create(zsb->z_os); ++ ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); ++ ++ fuid_dirtied = zsb->z_fuid_dirty; ++ if (fuid_dirtied) ++ zfs_fuid_txhold(zsb, tx); ++ ++ /* ++ * If old version and ACL won't fit in bonus and we aren't ++ * upgrading then take out necessary DMU holds ++ */ ++ ++ if ((acl_obj = zfs_external_acl(zp)) != 0) { ++ if (zsb->z_version >= ZPL_VERSION_FUID && ++ zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { ++ dmu_tx_hold_free(tx, acl_obj, 0, ++ DMU_OBJECT_END); ++ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ++ aclp->z_acl_bytes); ++ } else { ++ dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); ++ } ++ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { ++ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); ++ } ++ ++ zfs_sa_upgrade_txholds(tx, zp); ++ error = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (error) { ++ mutex_exit(&zp->z_acl_lock); ++ mutex_exit(&zp->z_lock); ++ ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto top; ++ } ++ dmu_tx_abort(tx); ++ zfs_acl_free(aclp); ++ return (error); ++ } ++ ++ error = zfs_aclset_common(zp, aclp, cr, tx); ++ ASSERT(error == 0); ++ ASSERT(zp->z_acl_cached == NULL); ++ zp->z_acl_cached = aclp; ++ ++ if (fuid_dirtied) ++ zfs_fuid_sync(zsb, tx); ++ ++ zfs_log_acl(zilog, tx, zp, vsecp, fuidp); ++ ++ if (fuidp) ++ zfs_fuid_info_free(fuidp); ++ dmu_tx_commit(tx); ++ ++ mutex_exit(&zp->z_lock); ++ mutex_exit(&zp->z_acl_lock); ++ ++ return (error); ++} ++ ++/* ++ * Check accesses of interest (AoI) against attributes of the dataset ++ * such as read-only. Returns zero if no AoI conflict with dataset ++ * attributes, otherwise an appropriate errno is returned. ++ */ ++static int ++zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) ++{ ++ if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) && ++ (!S_ISDEV(ZTOI(zp)->i_mode) || ++ (S_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) { ++ return (EROFS); ++ } ++ ++ /* ++ * Only check for READONLY on non-directories. ++ */ ++ if ((v4_mode & WRITE_MASK_DATA) && ++ ((!S_ISDIR(ZTOI(zp)->i_mode) && ++ (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) || ++ (S_ISDIR(ZTOI(zp)->i_mode) && ++ (zp->z_pflags & ZFS_IMMUTABLE)))) { ++ return (EPERM); ++ } ++ ++ if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && ++ (zp->z_pflags & ZFS_NOUNLINK)) { ++ return (EPERM); ++ } ++ ++ if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && ++ (zp->z_pflags & ZFS_AV_QUARANTINED))) { ++ return (EACCES); ++ } ++ ++ return (0); ++} ++ ++/* ++ * The primary usage of this function is to loop through all of the ++ * ACEs in the znode, determining what accesses of interest (AoI) to ++ * the caller are allowed or denied. The AoI are expressed as bits in ++ * the working_mode parameter. As each ACE is processed, bits covered ++ * by that ACE are removed from the working_mode. This removal ++ * facilitates two things. The first is that when the working mode is ++ * empty (= 0), we know we've looked at all the AoI. The second is ++ * that the ACE interpretation rules don't allow a later ACE to undo ++ * something granted or denied by an earlier ACE. Removing the ++ * discovered access or denial enforces this rule. 
At the end of ++ * processing the ACEs, all AoI that were found to be denied are ++ * placed into the working_mode, giving the caller a mask of denied ++ * accesses. Returns: ++ * 0 if all AoI granted ++ * EACCESS if the denied mask is non-zero ++ * other error if abnormal failure (e.g., IO error) ++ * ++ * A secondary usage of the function is to determine if any of the ++ * AoI are granted. If an ACE grants any access in ++ * the working_mode, we immediately short circuit out of the function. ++ * This mode is chosen by setting anyaccess to B_TRUE. The ++ * working_mode is not a denied access mask upon exit if the function ++ * is used in this manner. ++ */ ++static int ++zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, ++ boolean_t anyaccess, cred_t *cr) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ zfs_acl_t *aclp; ++ int error; ++ uid_t uid = crgetuid(cr); ++ uint64_t who; ++ uint16_t type, iflags; ++ uint16_t entry_type; ++ uint32_t access_mask; ++ uint32_t deny_mask = 0; ++ zfs_ace_hdr_t *acep = NULL; ++ boolean_t checkit; ++ uid_t gowner; ++ uid_t fowner; ++ ++ zfs_fuid_map_ids(zp, cr, &fowner, &gowner); ++ ++ mutex_enter(&zp->z_acl_lock); ++ ++ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); ++ if (error != 0) { ++ mutex_exit(&zp->z_acl_lock); ++ return (error); ++ } ++ ++ ASSERT(zp->z_acl_cached); ++ ++ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, ++ &iflags, &type))) { ++ uint32_t mask_matched; ++ ++ if (!zfs_acl_valid_ace_type(type, iflags)) ++ continue; ++ ++ if (S_ISDIR(ZTOI(zp)->i_mode) && ++ (iflags & ACE_INHERIT_ONLY_ACE)) ++ continue; ++ ++ /* Skip ACE if it does not affect any AoI */ ++ mask_matched = (access_mask & *working_mode); ++ if (!mask_matched) ++ continue; ++ ++ entry_type = (iflags & ACE_TYPE_FLAGS); ++ ++ checkit = B_FALSE; ++ ++ switch (entry_type) { ++ case ACE_OWNER: ++ if (uid == fowner) ++ checkit = B_TRUE; ++ break; ++ case OWNING_GROUP: ++ who = gowner; ++ /*FALLTHROUGH*/ ++ case ACE_IDENTIFIER_GROUP: ++ checkit = zfs_groupmember(zsb, who, cr); ++ break; ++ case ACE_EVERYONE: ++ checkit = B_TRUE; ++ break; ++ ++ /* USER Entry */ ++ default: ++ if (entry_type == 0) { ++ uid_t newid; ++ ++ newid = zfs_fuid_map_id(zsb, who, cr, ++ ZFS_ACE_USER); ++ if (newid != IDMAP_WK_CREATOR_OWNER_UID && ++ uid == newid) ++ checkit = B_TRUE; ++ break; ++ } else { ++ mutex_exit(&zp->z_acl_lock); ++ return (EIO); ++ } ++ } ++ ++ if (checkit) { ++ if (type == DENY) { ++ DTRACE_PROBE3(zfs__ace__denies, ++ znode_t *, zp, ++ zfs_ace_hdr_t *, acep, ++ uint32_t, mask_matched); ++ deny_mask |= mask_matched; ++ } else { ++ DTRACE_PROBE3(zfs__ace__allows, ++ znode_t *, zp, ++ zfs_ace_hdr_t *, acep, ++ uint32_t, mask_matched); ++ if (anyaccess) { ++ mutex_exit(&zp->z_acl_lock); ++ return (0); ++ } ++ } ++ *working_mode &= ~mask_matched; ++ } ++ ++ /* Are we done? */ ++ if (*working_mode == 0) ++ break; ++ } ++ ++ mutex_exit(&zp->z_acl_lock); ++ ++ /* Put the found 'denies' back on the working mode */ ++ if (deny_mask) { ++ *working_mode |= deny_mask; ++ return (EACCES); ++ } else if (*working_mode) { ++ return (-1); ++ } ++ ++ return (0); ++} ++ ++/* ++ * Return true if any access whatsoever granted, we don't actually ++ * care what access is granted. 
++ */ ++boolean_t ++zfs_has_access(znode_t *zp, cred_t *cr) ++{ ++ uint32_t have = ACE_ALL_PERMS; ++ ++ if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { ++ uid_t owner; ++ ++ owner = zfs_fuid_map_id(ZTOZSB(zp), zp->z_uid, cr, ZFS_OWNER); ++ return (secpolicy_vnode_any_access(cr, ZTOI(zp), owner) == 0); ++ } ++ return (B_TRUE); ++} ++ ++static int ++zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, ++ boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ int err; ++ ++ *working_mode = v4_mode; ++ *check_privs = B_TRUE; ++ ++ /* ++ * Short circuit empty requests ++ */ ++ if (v4_mode == 0 || zsb->z_replay) { ++ *working_mode = 0; ++ return (0); ++ } ++ ++ if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { ++ *check_privs = B_FALSE; ++ return (err); ++ } ++ ++ /* ++ * The caller requested that the ACL check be skipped. This ++ * would only happen if the caller checked VOP_ACCESS() with a ++ * 32 bit ACE mask and already had the appropriate permissions. ++ */ ++ if (skipaclchk) { ++ *working_mode = 0; ++ return (0); ++ } ++ ++ return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); ++} ++ ++static int ++zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, ++ cred_t *cr) ++{ ++ if (*working_mode != ACE_WRITE_DATA) ++ return (EACCES); ++ ++ return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, ++ check_privs, B_FALSE, cr)); ++} ++ ++int ++zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) ++{ ++ boolean_t owner = B_FALSE; ++ boolean_t groupmbr = B_FALSE; ++ boolean_t is_attr; ++ uid_t uid = crgetuid(cr); ++ int error; ++ ++ if (zdp->z_pflags & ZFS_AV_QUARANTINED) ++ return (EACCES); ++ ++ is_attr = ((zdp->z_pflags & ZFS_XATTR) && ++ (S_ISDIR(ZTOI(zdp)->i_mode))); ++ if (is_attr) ++ goto slow; ++ ++ ++ mutex_enter(&zdp->z_acl_lock); ++ ++ if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { ++ mutex_exit(&zdp->z_acl_lock); ++ return (0); ++ } ++ ++ if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) { ++ mutex_exit(&zdp->z_acl_lock); ++ goto slow; ++ } ++ ++ if (uid == zdp->z_uid) { ++ owner = B_TRUE; ++ if (zdp->z_mode & S_IXUSR) { ++ mutex_exit(&zdp->z_acl_lock); ++ return (0); ++ } else { ++ mutex_exit(&zdp->z_acl_lock); ++ goto slow; ++ } ++ } ++ if (groupmember(zdp->z_gid, cr)) { ++ groupmbr = B_TRUE; ++ if (zdp->z_mode & S_IXGRP) { ++ mutex_exit(&zdp->z_acl_lock); ++ return (0); ++ } else { ++ mutex_exit(&zdp->z_acl_lock); ++ goto slow; ++ } ++ } ++ if (!owner && !groupmbr) { ++ if (zdp->z_mode & S_IXOTH) { ++ mutex_exit(&zdp->z_acl_lock); ++ return (0); ++ } ++ } ++ ++ mutex_exit(&zdp->z_acl_lock); ++ ++slow: ++ DTRACE_PROBE(zfs__fastpath__execute__access__miss); ++ ZFS_ENTER(ZTOZSB(zdp)); ++ error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); ++ ZFS_EXIT(ZTOZSB(zdp)); ++ return (error); ++} ++ ++/* ++ * Determine whether Access should be granted/denied. ++ * The least priv subsytem is always consulted as a basic privilege ++ * can define any form of access. 
++ */ ++int ++zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) ++{ ++ uint32_t working_mode; ++ int error; ++ int is_attr; ++ boolean_t check_privs; ++ znode_t *xzp; ++ znode_t *check_zp = zp; ++ mode_t needed_bits; ++ uid_t owner; ++ ++ is_attr = ((zp->z_pflags & ZFS_XATTR) && S_ISDIR(ZTOI(zp)->i_mode)); ++ ++ /* ++ * If attribute then validate against base file ++ */ ++ if (is_attr) { ++ uint64_t parent; ++ ++ if ((error = sa_lookup(zp->z_sa_hdl, ++ SA_ZPL_PARENT(ZTOZSB(zp)), &parent, ++ sizeof (parent))) != 0) ++ return (error); ++ ++ if ((error = zfs_zget(ZTOZSB(zp), ++ parent, &xzp)) != 0) { ++ return (error); ++ } ++ ++ check_zp = xzp; ++ ++ /* ++ * fixup mode to map to xattr perms ++ */ ++ ++ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { ++ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); ++ mode |= ACE_WRITE_NAMED_ATTRS; ++ } ++ ++ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { ++ mode &= ~(ACE_READ_DATA|ACE_EXECUTE); ++ mode |= ACE_READ_NAMED_ATTRS; ++ } ++ } ++ ++ owner = zfs_fuid_map_id(ZTOZSB(zp), zp->z_uid, cr, ZFS_OWNER); ++ /* ++ * Map the bits required to the standard inode flags ++ * S_IRUSR|S_IWUSR|S_IXUSR in the needed_bits. Map the bits ++ * mapped by working_mode (currently missing) in missing_bits. ++ * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), ++ * needed_bits. ++ */ ++ needed_bits = 0; ++ ++ working_mode = mode; ++ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && ++ owner == crgetuid(cr)) ++ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); ++ ++ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| ++ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) ++ needed_bits |= S_IRUSR; ++ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| ++ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) ++ needed_bits |= S_IWUSR; ++ if (working_mode & ACE_EXECUTE) ++ needed_bits |= S_IXUSR; ++ ++ if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, ++ &check_privs, skipaclchk, cr)) == 0) { ++ if (is_attr) ++ iput(ZTOI(xzp)); ++ return (secpolicy_vnode_access2(cr, ZTOI(zp), owner, ++ needed_bits, needed_bits)); ++ } ++ ++ if (error && !check_privs) { ++ if (is_attr) ++ iput(ZTOI(xzp)); ++ return (error); ++ } ++ ++ if (error && (flags & V_APPEND)) { ++ error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); ++ } ++ ++ if (error && check_privs) { ++ mode_t checkmode = 0; ++ ++ /* ++ * First check for implicit owner permission on ++ * read_acl/read_attributes ++ */ ++ ++ error = 0; ++ ASSERT(working_mode != 0); ++ ++ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && ++ owner == crgetuid(cr))) ++ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); ++ ++ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| ++ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) ++ checkmode |= S_IRUSR; ++ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| ++ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) ++ checkmode |= S_IWUSR; ++ if (working_mode & ACE_EXECUTE) ++ checkmode |= S_IXUSR; ++ ++ error = secpolicy_vnode_access2(cr, ZTOI(check_zp), owner, ++ needed_bits & ~checkmode, needed_bits); ++ ++ if (error == 0 && (working_mode & ACE_WRITE_OWNER)) ++ error = secpolicy_vnode_chown(cr, owner); ++ if (error == 0 && (working_mode & ACE_WRITE_ACL)) ++ error = secpolicy_vnode_setdac(cr, owner); ++ ++ if (error == 0 && (working_mode & ++ (ACE_DELETE|ACE_DELETE_CHILD))) ++ error = secpolicy_vnode_remove(cr); ++ ++ if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { ++ error = 
secpolicy_vnode_chown(cr, owner); ++ } ++ if (error == 0) { ++ /* ++ * See if any bits other than those already checked ++ * for are still present. If so then return EACCES ++ */ ++ if (working_mode & ~(ZFS_CHECKED_MASKS)) { ++ error = EACCES; ++ } ++ } ++ } else if (error == 0) { ++ error = secpolicy_vnode_access2(cr, ZTOI(zp), owner, ++ needed_bits, needed_bits); ++ } ++ ++ ++ if (is_attr) ++ iput(ZTOI(xzp)); ++ ++ return (error); ++} ++ ++/* ++ * Translate traditional unix S_IRUSR/S_IWUSR/S_IXUSR mode into ++ * native ACL format and call zfs_zaccess() ++ */ ++int ++zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) ++{ ++ return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); ++} ++ ++/* ++ * Access function for secpolicy_vnode_setattr ++ */ ++int ++zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) ++{ ++ int v4_mode = zfs_unix_to_v4(mode >> 6); ++ ++ return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); ++} ++ ++static int ++zfs_delete_final_check(znode_t *zp, znode_t *dzp, ++ mode_t available_perms, cred_t *cr) ++{ ++ int error; ++ uid_t downer; ++ ++ downer = zfs_fuid_map_id(ZTOZSB(dzp), dzp->z_uid, cr, ZFS_OWNER); ++ ++ error = secpolicy_vnode_access2(cr, ZTOI(dzp), ++ downer, available_perms, S_IWUSR|S_IXUSR); ++ ++ if (error == 0) ++ error = zfs_sticky_remove_access(dzp, zp, cr); ++ ++ return (error); ++} ++ ++/* ++ * Determine whether Access should be granted/deny, without ++ * consulting least priv subsystem. ++ * ++ * ++ * The following chart is the recommended NFSv4 enforcement for ++ * ability to delete an object. ++ * ++ * ------------------------------------------------------- ++ * | Parent Dir | Target Object Permissions | ++ * | permissions | | ++ * ------------------------------------------------------- ++ * | | ACL Allows | ACL Denies| Delete | ++ * | | Delete | Delete | unspecified| ++ * ------------------------------------------------------- ++ * | ACL Allows | Permit | Permit | Permit | ++ * | DELETE_CHILD | | ++ * ------------------------------------------------------- ++ * | ACL Denies | Permit | Deny | Deny | ++ * | DELETE_CHILD | | | | ++ * ------------------------------------------------------- ++ * | ACL specifies | | | | ++ * | only allow | Permit | Permit | Permit | ++ * | write and | | | | ++ * | execute | | | | ++ * ------------------------------------------------------- ++ * | ACL denies | | | | ++ * | write and | Permit | Deny | Deny | ++ * | execute | | | | ++ * ------------------------------------------------------- ++ * ^ ++ * | ++ * No search privilege, can't even look up file? ++ * ++ */ ++int ++zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) ++{ ++ uint32_t dzp_working_mode = 0; ++ uint32_t zp_working_mode = 0; ++ int dzp_error, zp_error; ++ mode_t available_perms; ++ boolean_t dzpcheck_privs = B_TRUE; ++ boolean_t zpcheck_privs = B_TRUE; ++ ++ /* ++ * We want specific DELETE permissions to ++ * take precedence over WRITE/EXECUTE. We don't ++ * want an ACL such as this to mess us up. ++ * user:joe:write_data:deny,user:joe:delete:allow ++ * ++ * However, deny permissions may ultimately be overridden ++ * by secpolicy_vnode_access(). ++ * ++ * We will ask for all of the necessary permissions and then ++ * look at the working modes from the directory and target object ++ * to determine what was found. ++ */ ++ ++ if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) ++ return (EPERM); ++ ++ /* ++ * First row ++ * If the directory permissions allow the delete, we are done. 
++ */ ++ if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, ++ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) ++ return (0); ++ ++ /* ++ * If target object has delete permission then we are done ++ */ ++ if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, ++ &zpcheck_privs, B_FALSE, cr)) == 0) ++ return (0); ++ ++ ASSERT(dzp_error && zp_error); ++ ++ if (!dzpcheck_privs) ++ return (dzp_error); ++ if (!zpcheck_privs) ++ return (zp_error); ++ ++ /* ++ * Second row ++ * ++ * If directory returns EACCES then delete_child was denied ++ * due to deny delete_child. In this case send the request through ++ * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() ++ * since that *could* allow the delete based on write/execute permission ++ * and we want delete permissions to override write/execute. ++ */ ++ ++ if (dzp_error == EACCES) ++ return (secpolicy_vnode_remove(cr)); ++ ++ /* ++ * Third Row ++ * only need to see if we have write/execute on directory. ++ */ ++ ++ dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, ++ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); ++ ++ if (dzp_error != 0 && !dzpcheck_privs) ++ return (dzp_error); ++ ++ /* ++ * Fourth row ++ */ ++ ++ available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : S_IWUSR; ++ available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : S_IXUSR; ++ ++ return (zfs_delete_final_check(zp, dzp, available_perms, cr)); ++ ++} ++ ++int ++zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, ++ znode_t *tzp, cred_t *cr) ++{ ++ int add_perm; ++ int error; ++ ++ if (szp->z_pflags & ZFS_AV_QUARANTINED) ++ return (EACCES); ++ ++ add_perm = S_ISDIR(ZTOI(szp)->i_mode) ? ++ ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; ++ ++ /* ++ * Rename permissions are combination of delete permission + ++ * add file/subdir permission. ++ */ ++ ++ /* ++ * first make sure we do the delete portion. ++ * ++ * If that succeeds then check for add_file/add_subdir permissions ++ */ ++ ++ if ((error = zfs_zaccess_delete(sdzp, szp, cr))) ++ return (error); ++ ++ /* ++ * If we have a tzp, see if we can delete it? ++ */ ++ if (tzp) { ++ if ((error = zfs_zaccess_delete(tdzp, tzp, cr))) ++ return (error); ++ } ++ ++ /* ++ * Now check for add permissions ++ */ ++ error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); ++ ++ return (error); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_byteswap.c linux-3.2.33-go/fs/zfs/zfs/zfs_byteswap.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_byteswap.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_byteswap.c 2012-11-16 23:25:34.349039334 +0100 +@@ -0,0 +1,205 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. 
All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++void ++zfs_oldace_byteswap(ace_t *ace, int ace_cnt) ++{ ++ int i; ++ ++ for (i = 0; i != ace_cnt; i++, ace++) { ++ ace->a_who = BSWAP_32(ace->a_who); ++ ace->a_access_mask = BSWAP_32(ace->a_access_mask); ++ ace->a_flags = BSWAP_16(ace->a_flags); ++ ace->a_type = BSWAP_16(ace->a_type); ++ } ++} ++ ++/* ++ * swap ace_t and ace_oject_t ++ */ ++void ++zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) ++{ ++ caddr_t end; ++ caddr_t ptr; ++ zfs_ace_t *zacep = NULL; ++ ace_t *acep; ++ uint16_t entry_type; ++ size_t entry_size; ++ int ace_type; ++ ++ end = (caddr_t)buf + size; ++ ptr = buf; ++ ++ while (ptr < end) { ++ if (zfs_layout) { ++ /* ++ * Avoid overrun. Embedded aces can have one ++ * of several sizes. We don't know exactly ++ * how many our present, only the size of the ++ * buffer containing them. That size may be ++ * larger than needed to hold the aces ++ * present. As long as we do not do any ++ * swapping beyond the end of our block we are ++ * okay. It it safe to swap any non-ace data ++ * within the block since it is just zeros. ++ */ ++ if (ptr + sizeof (zfs_ace_hdr_t) > end) { ++ break; ++ } ++ zacep = (zfs_ace_t *)ptr; ++ zacep->z_hdr.z_access_mask = ++ BSWAP_32(zacep->z_hdr.z_access_mask); ++ zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags); ++ ace_type = zacep->z_hdr.z_type = ++ BSWAP_16(zacep->z_hdr.z_type); ++ entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS; ++ } else { ++ /* Overrun avoidance */ ++ if (ptr + sizeof (ace_t) > end) { ++ break; ++ } ++ acep = (ace_t *)ptr; ++ acep->a_access_mask = BSWAP_32(acep->a_access_mask); ++ acep->a_flags = BSWAP_16(acep->a_flags); ++ ace_type = acep->a_type = BSWAP_16(acep->a_type); ++ acep->a_who = BSWAP_32(acep->a_who); ++ entry_type = acep->a_flags & ACE_TYPE_FLAGS; ++ } ++ switch (entry_type) { ++ case ACE_OWNER: ++ case ACE_EVERYONE: ++ case (ACE_IDENTIFIER_GROUP | ACE_GROUP): ++ entry_size = zfs_layout ? ++ sizeof (zfs_ace_hdr_t) : sizeof (ace_t); ++ break; ++ case ACE_IDENTIFIER_GROUP: ++ default: ++ /* Overrun avoidance */ ++ if (zfs_layout) { ++ if (ptr + sizeof (zfs_ace_t) <= end) { ++ zacep->z_fuid = BSWAP_64(zacep->z_fuid); ++ } else { ++ entry_size = sizeof (zfs_ace_t); ++ break; ++ } ++ } ++ switch (ace_type) { ++ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: ++ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: ++ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: ++ entry_size = zfs_layout ? ++ sizeof (zfs_object_ace_t) : ++ sizeof (ace_object_t); ++ break; ++ default: ++ entry_size = zfs_layout ? 
sizeof (zfs_ace_t) : ++ sizeof (ace_t); ++ break; ++ } ++ } ++ ptr = ptr + entry_size; ++ } ++} ++ ++/* ARGSUSED */ ++void ++zfs_oldacl_byteswap(void *buf, size_t size) ++{ ++ int cnt; ++ ++ /* ++ * Arggh, since we don't know how many ACEs are in ++ * the array, we have to swap the entire block ++ */ ++ ++ cnt = size / sizeof (ace_t); ++ ++ zfs_oldace_byteswap((ace_t *)buf, cnt); ++} ++ ++/* ARGSUSED */ ++void ++zfs_acl_byteswap(void *buf, size_t size) ++{ ++ zfs_ace_byteswap(buf, size, B_TRUE); ++} ++ ++void ++zfs_znode_byteswap(void *buf, size_t size) ++{ ++ znode_phys_t *zp = buf; ++ ++ ASSERT(size >= sizeof (znode_phys_t)); ++ ++ zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]); ++ zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]); ++ zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]); ++ zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]); ++ zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]); ++ zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]); ++ zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]); ++ zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]); ++ zp->zp_gen = BSWAP_64(zp->zp_gen); ++ zp->zp_mode = BSWAP_64(zp->zp_mode); ++ zp->zp_size = BSWAP_64(zp->zp_size); ++ zp->zp_parent = BSWAP_64(zp->zp_parent); ++ zp->zp_links = BSWAP_64(zp->zp_links); ++ zp->zp_xattr = BSWAP_64(zp->zp_xattr); ++ zp->zp_rdev = BSWAP_64(zp->zp_rdev); ++ zp->zp_flags = BSWAP_64(zp->zp_flags); ++ zp->zp_uid = BSWAP_64(zp->zp_uid); ++ zp->zp_gid = BSWAP_64(zp->zp_gid); ++ zp->zp_zap = BSWAP_64(zp->zp_zap); ++ zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]); ++ zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]); ++ zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]); ++ ++ zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj); ++ zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size); ++ zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version); ++ zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count); ++ if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) { ++ zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0], ++ ZFS_ACE_SPACE); ++ } else { ++ zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0], ++ ACE_SLOT_CNT); ++ } ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(zfs_oldacl_byteswap); ++EXPORT_SYMBOL(zfs_acl_byteswap); ++EXPORT_SYMBOL(zfs_znode_byteswap); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_ctldir.c linux-3.2.33-go/fs/zfs/zfs/zfs_ctldir.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_ctldir.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_ctldir.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,992 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
++ * Copyright (C) 2011 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * LLNL-CODE-403049. ++ * Rewritten for Linux by: ++ * Rohan Puri ++ * Brian Behlendorf ++ */ ++ ++/* ++ * ZFS control directory (a.k.a. ".zfs") ++ * ++ * This directory provides a common location for all ZFS meta-objects. ++ * Currently, this is only the 'snapshot' and 'shares' directory, but this may ++ * expand in the future. The elements are built dynamically, as the hierarchy ++ * does not actually exist on disk. ++ * ++ * For 'snapshot', we don't want to have all snapshots always mounted, because ++ * this would take up a huge amount of space in /etc/mnttab. We have three ++ * types of objects: ++ * ++ * ctldir ------> snapshotdir -------> snapshot ++ * | ++ * | ++ * V ++ * mounted fs ++ * ++ * The 'snapshot' node contains just enough information to lookup '..' and act ++ * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we ++ * perform an automount of the underlying filesystem and return the ++ * corresponding inode. ++ * ++ * All mounts are handled automatically by an user mode helper which invokes ++ * the mount mount procedure. Unmounts are handled by allowing the mount ++ * point to expire so the kernel may automatically unmount it. ++ * ++ * The '.zfs', '.zfs/snapshot', and all directories created under ++ * '.zfs/snapshot' (ie: '.zfs/snapshot/') all share the same ++ * share the same zfs_sb_t as the head filesystem (what '.zfs' lives under). ++ * ++ * File systems mounted on top of the '.zfs/snapshot/' paths ++ * (ie: snapshots) are complete ZFS filesystems and have their own unique ++ * zfs_sb_t. However, the fsid reported by these mounts will be the same ++ * as that used by the parent zfs_sb_t to make NFS happy. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "zfs_namecheck.h" ++ ++/* ++ * Control Directory Tunables (.zfs) ++ */ ++int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT; ++ ++static zfs_snapentry_t * ++zfsctl_sep_alloc(void) ++{ ++ return kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP); ++} ++ ++void ++zfsctl_sep_free(zfs_snapentry_t *sep) ++{ ++ kmem_free(sep->se_name, MAXNAMELEN); ++ kmem_free(sep->se_path, PATH_MAX); ++ kmem_free(sep, sizeof (zfs_snapentry_t)); ++} ++ ++/* ++ * Attempt to expire an automounted snapshot, unmounts are attempted every ++ * 'zfs_expire_snapshot' seconds until they succeed. The work request is ++ * responsible for rescheduling itself and freeing the zfs_expire_snapshot_t. 
++ */ ++static void ++zfsctl_expire_snapshot(void *data) ++{ ++ zfs_snapentry_t *sep; ++ zfs_sb_t *zsb; ++ int error; ++ ++ sep = spl_get_work_data(data, zfs_snapentry_t, se_work.work); ++ zsb = ITOZSB(sep->se_inode); ++ ++ error = zfsctl_unmount_snapshot(zsb, sep->se_name, MNT_EXPIRE); ++ if (error == EBUSY) ++ schedule_delayed_work(&sep->se_work, zfs_expire_snapshot * HZ); ++} ++ ++int ++snapentry_compare(const void *a, const void *b) ++{ ++ const zfs_snapentry_t *sa = a; ++ const zfs_snapentry_t *sb = b; ++ int ret = strcmp(sa->se_name, sb->se_name); ++ ++ if (ret < 0) ++ return (-1); ++ else if (ret > 0) ++ return (1); ++ else ++ return (0); ++} ++ ++boolean_t ++zfsctl_is_node(struct inode *ip) ++{ ++ return (ITOZ(ip)->z_is_ctldir); ++} ++ ++boolean_t ++zfsctl_is_snapdir(struct inode *ip) ++{ ++ return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS)); ++} ++ ++/* ++ * Allocate a new inode with the passed id and ops. ++ */ ++static struct inode * ++zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id, ++ const struct file_operations *fops, const struct inode_operations *ops) ++{ ++ struct timespec now = current_fs_time(zsb->z_sb); ++ struct inode *ip; ++ znode_t *zp; ++ ++ ip = new_inode(zsb->z_sb); ++ if (ip == NULL) ++ return (NULL); ++ ++ zp = ITOZ(ip); ++ ASSERT3P(zp->z_dirlocks, ==, NULL); ++ ASSERT3P(zp->z_acl_cached, ==, NULL); ++ ASSERT3P(zp->z_xattr_cached, ==, NULL); ++ zp->z_id = id; ++ zp->z_unlinked = 0; ++ zp->z_atime_dirty = 0; ++ zp->z_zn_prefetch = 0; ++ zp->z_moved = 0; ++ zp->z_sa_hdl = NULL; ++ zp->z_blksz = 0; ++ zp->z_seq = 0; ++ zp->z_mapcnt = 0; ++ zp->z_gen = 0; ++ zp->z_size = 0; ++ zp->z_atime[0] = 0; ++ zp->z_atime[1] = 0; ++ zp->z_links = 0; ++ zp->z_pflags = 0; ++ zp->z_uid = 0; ++ zp->z_gid = 0; ++ zp->z_mode = 0; ++ zp->z_sync_cnt = 0; ++ zp->z_is_zvol = B_FALSE; ++ zp->z_is_mapped = B_FALSE; ++ zp->z_is_ctldir = B_TRUE; ++ zp->z_is_sa = B_FALSE; ++ ip->i_ino = id; ++ ip->i_mode = (S_IFDIR | S_IRUGO | S_IXUGO); ++ ip->i_uid = 0; ++ ip->i_gid = 0; ++ ip->i_blkbits = SPA_MINBLOCKSHIFT; ++ ip->i_atime = now; ++ ip->i_mtime = now; ++ ip->i_ctime = now; ++ ip->i_fop = fops; ++ ip->i_op = ops; ++ ++ if (insert_inode_locked(ip)) { ++ unlock_new_inode(ip); ++ iput(ip); ++ return (NULL); ++ } ++ ++ mutex_enter(&zsb->z_znodes_lock); ++ list_insert_tail(&zsb->z_all_znodes, zp); ++ zsb->z_nr_znodes++; ++ membar_producer(); ++ mutex_exit(&zsb->z_znodes_lock); ++ ++ unlock_new_inode(ip); ++ ++ return (ip); ++} ++ ++/* ++ * Lookup the inode with given id, it will be allocated if needed. ++ */ ++static struct inode * ++zfsctl_inode_lookup(zfs_sb_t *zsb, uint64_t id, ++ const struct file_operations *fops, const struct inode_operations *ops) ++{ ++ struct inode *ip = NULL; ++ ++ while (ip == NULL) { ++ ip = ilookup(zsb->z_sb, (unsigned long)id); ++ if (ip) ++ break; ++ ++ /* May fail due to concurrent zfsctl_inode_alloc() */ ++ ip = zfsctl_inode_alloc(zsb, id, fops, ops); ++ } ++ ++ return (ip); ++} ++ ++/* ++ * Free zfsctl inode specific structures, currently there are none. ++ */ ++void ++zfsctl_inode_destroy(struct inode *ip) ++{ ++ return; ++} ++ ++/* ++ * An inode is being evicted from the cache. ++ */ ++void ++zfsctl_inode_inactive(struct inode *ip) ++{ ++ if (zfsctl_is_snapdir(ip)) ++ zfsctl_snapdir_inactive(ip); ++} ++ ++/* ++ * Create the '.zfs' directory. This directory is cached as part of the VFS ++ * structure. This results in a hold on the zfs_sb_t. The code in zfs_umount() ++ * therefore checks against a vfs_count of 2 instead of 1. 
This reference ++ * is removed when the ctldir is destroyed in the unmount. All other entities ++ * under the '.zfs' directory are created dynamically as needed. ++ * ++ * Because the dynamically created '.zfs' directory entries assume the use ++ * of 64-bit inode numbers this support must be disabled on 32-bit systems. ++ */ ++int ++zfsctl_create(zfs_sb_t *zsb) ++{ ++#if defined(CONFIG_64BIT) ++ ASSERT(zsb->z_ctldir == NULL); ++ ++ zsb->z_ctldir = zfsctl_inode_alloc(zsb, ZFSCTL_INO_ROOT, ++ &zpl_fops_root, &zpl_ops_root); ++ if (zsb->z_ctldir == NULL) ++ return (ENOENT); ++ ++ return (0); ++#else ++ return (EOPNOTSUPP); ++#endif /* CONFIG_64BIT */ ++} ++ ++/* ++ * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. ++ */ ++void ++zfsctl_destroy(zfs_sb_t *zsb) ++{ ++ iput(zsb->z_ctldir); ++ zsb->z_ctldir = NULL; ++} ++ ++/* ++ * Given a root znode, retrieve the associated .zfs directory. ++ * Add a hold to the vnode and return it. ++ */ ++struct inode * ++zfsctl_root(znode_t *zp) ++{ ++ ASSERT(zfs_has_ctldir(zp)); ++ igrab(ZTOZSB(zp)->z_ctldir); ++ return (ZTOZSB(zp)->z_ctldir); ++} ++ ++/*ARGSUSED*/ ++int ++zfsctl_fid(struct inode *ip, fid_t *fidp) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ uint64_t object = zp->z_id; ++ zfid_short_t *zfid; ++ int i; ++ ++ ZFS_ENTER(zsb); ++ ++ if (fidp->fid_len < SHORT_FID_LEN) { ++ fidp->fid_len = SHORT_FID_LEN; ++ ZFS_EXIT(zsb); ++ return (ENOSPC); ++ } ++ ++ zfid = (zfid_short_t *)fidp; ++ ++ zfid->zf_len = SHORT_FID_LEN; ++ ++ for (i = 0; i < sizeof (zfid->zf_object); i++) ++ zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); ++ ++ /* .zfs znodes always have a generation number of 0 */ ++ for (i = 0; i < sizeof (zfid->zf_gen); i++) ++ zfid->zf_gen[i] = 0; ++ ++ ZFS_EXIT(zsb); ++ return (0); ++} ++ ++static int ++zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname) ++{ ++ objset_t *os = ITOZSB(ip)->z_os; ++ ++ if (snapshot_namecheck(name, NULL, NULL) != 0) ++ return (EILSEQ); ++ ++ dmu_objset_name(os, zname); ++ if ((strlen(zname) + 1 + strlen(name)) >= len) ++ return (ENAMETOOLONG); ++ ++ (void) strcat(zname, "@"); ++ (void) strcat(zname, name); ++ ++ return (0); ++} ++ ++static int ++zfsctl_snapshot_zpath(struct path *path, int len, char *zpath) ++{ ++ char *path_buffer, *path_ptr; ++ int path_len, error = 0; ++ ++ path_buffer = kmem_alloc(len, KM_SLEEP); ++ ++ path_ptr = d_path(path, path_buffer, len); ++ if (IS_ERR(path_ptr)) { ++ error = -PTR_ERR(path_ptr); ++ goto out; ++ } ++ ++ path_len = path_buffer + len - 1 - path_ptr; ++ if (path_len > len) { ++ error = EFAULT; ++ goto out; ++ } ++ ++ memcpy(zpath, path_ptr, path_len); ++ zpath[path_len] = '\0'; ++out: ++ kmem_free(path_buffer, len); ++ ++ return (error); ++} ++ ++/* ++ * Special case the handling of "..". 
++ */ ++/* ARGSUSED */ ++int ++zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp, ++ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) ++{ ++ zfs_sb_t *zsb = ITOZSB(dip); ++ int error = 0; ++ ++ ZFS_ENTER(zsb); ++ ++ if (strcmp(name, "..") == 0) { ++ *ipp = dip->i_sb->s_root->d_inode; ++ } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) { ++ *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIR, ++ &zpl_fops_snapdir, &zpl_ops_snapdir); ++ } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) { ++ *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SHARES, ++ &zpl_fops_shares, &zpl_ops_shares); ++ } else { ++ *ipp = NULL; ++ } ++ ++ if (*ipp == NULL) ++ error = ENOENT; ++ ++ ZFS_EXIT(zsb); ++ ++ return (error); ++} ++ ++/* ++ * Lookup entry point for the 'snapshot' directory. Try to open the ++ * snapshot if it exist, creating the pseudo filesystem inode as necessary. ++ * Perform a mount of the associated dataset on top of the inode. ++ */ ++/* ARGSUSED */ ++int ++zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp, ++ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) ++{ ++ zfs_sb_t *zsb = ITOZSB(dip); ++ uint64_t id; ++ int error; ++ ++ ZFS_ENTER(zsb); ++ ++ error = dmu_snapshot_id(zsb->z_os, name, &id); ++ if (error) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIRS - id, ++ &simple_dir_operations, &simple_dir_inode_operations); ++ if (*ipp) { ++#ifdef HAVE_AUTOMOUNT ++ (*ipp)->i_flags |= S_AUTOMOUNT; ++#endif /* HAVE_AUTOMOUNT */ ++ } else { ++ error = ENOENT; ++ } ++ ++ ZFS_EXIT(zsb); ++ ++ return (error); ++} ++ ++static void ++zfsctl_rename_snap(zfs_sb_t *zsb, zfs_snapentry_t *sep, const char *name) ++{ ++ avl_index_t where; ++ ++ ASSERT(MUTEX_HELD(&zsb->z_ctldir_lock)); ++ ASSERT(sep != NULL); ++ ++ /* ++ * Change the name in the AVL tree. ++ */ ++ avl_remove(&zsb->z_ctldir_snaps, sep); ++ (void) strcpy(sep->se_name, name); ++ VERIFY(avl_find(&zsb->z_ctldir_snaps, sep, &where) == NULL); ++ avl_insert(&zsb->z_ctldir_snaps, sep, where); ++} ++ ++/* ++ * Renaming a directory under '.zfs/snapshot' will automatically trigger ++ * a rename of the snapshot to the new given name. The rename is confined ++ * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere. ++ */ ++/*ARGSUSED*/ ++int ++zfsctl_snapdir_rename(struct inode *sdip, char *sname, ++ struct inode *tdip, char *tname, cred_t *cr, int flags) ++{ ++ zfs_sb_t *zsb = ITOZSB(sdip); ++ zfs_snapentry_t search, *sep; ++ avl_index_t where; ++ char *to, *from, *real; ++ int error; ++ ++ ZFS_ENTER(zsb); ++ ++ to = kmem_alloc(MAXNAMELEN, KM_SLEEP); ++ from = kmem_alloc(MAXNAMELEN, KM_SLEEP); ++ real = kmem_alloc(MAXNAMELEN, KM_SLEEP); ++ ++ if (zsb->z_case == ZFS_CASE_INSENSITIVE) { ++ error = dmu_snapshot_realname(zsb->z_os, sname, real, ++ MAXNAMELEN, NULL); ++ if (error == 0) { ++ sname = real; ++ } else if (error != ENOTSUP) { ++ goto out; ++ } ++ } ++ ++ error = zfsctl_snapshot_zname(sdip, sname, MAXNAMELEN, from); ++ if (!error) ++ error = zfsctl_snapshot_zname(tdip, tname, MAXNAMELEN, to); ++ if (!error) ++ error = zfs_secpolicy_rename_perms(from, to, cr); ++ if (error) ++ goto out; ++ ++ /* ++ * Cannot move snapshots out of the snapdir. ++ */ ++ if (sdip != tdip) { ++ error = EINVAL; ++ goto out; ++ } ++ ++ /* ++ * No-op when names are identical. 
++ */ ++ if (strcmp(sname, tname) == 0) { ++ error = 0; ++ goto out; ++ } ++ ++ mutex_enter(&zsb->z_ctldir_lock); ++ ++ error = dmu_objset_rename(from, to, B_FALSE); ++ if (error) ++ goto out_unlock; ++ ++ search.se_name = (char *)sname; ++ sep = avl_find(&zsb->z_ctldir_snaps, &search, &where); ++ if (sep) ++ zfsctl_rename_snap(zsb, sep, tname); ++ ++out_unlock: ++ mutex_exit(&zsb->z_ctldir_lock); ++out: ++ kmem_free(from, MAXNAMELEN); ++ kmem_free(to, MAXNAMELEN); ++ kmem_free(real, MAXNAMELEN); ++ ++ ZFS_EXIT(zsb); ++ ++ return (error); ++} ++ ++/* ++ * Removing a directory under '.zfs/snapshot' will automatically trigger ++ * the removal of the snapshot with the given name. ++ */ ++/* ARGSUSED */ ++int ++zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) ++{ ++ zfs_sb_t *zsb = ITOZSB(dip); ++ char *snapname, *real; ++ int error; ++ ++ ZFS_ENTER(zsb); ++ ++ snapname = kmem_alloc(MAXNAMELEN, KM_SLEEP); ++ real = kmem_alloc(MAXNAMELEN, KM_SLEEP); ++ ++ if (zsb->z_case == ZFS_CASE_INSENSITIVE) { ++ error = dmu_snapshot_realname(zsb->z_os, name, real, ++ MAXNAMELEN, NULL); ++ if (error == 0) { ++ name = real; ++ } else if (error != ENOTSUP) { ++ goto out; ++ } ++ } ++ ++ error = zfsctl_snapshot_zname(dip, name, MAXNAMELEN, snapname); ++ if (!error) ++ error = zfs_secpolicy_destroy_perms(snapname, cr); ++ if (error) ++ goto out; ++ ++ error = zfsctl_unmount_snapshot(zsb, name, MNT_FORCE); ++ if ((error == 0) || (error == ENOENT)) ++ error = dmu_objset_destroy(snapname, B_FALSE); ++out: ++ kmem_free(snapname, MAXNAMELEN); ++ kmem_free(real, MAXNAMELEN); ++ ++ ZFS_EXIT(zsb); ++ ++ return (error); ++} ++ ++/* ++ * Creating a directory under '.zfs/snapshot' will automatically trigger ++ * the creation of a new snapshot with the given name. ++ */ ++/* ARGSUSED */ ++int ++zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, ++ struct inode **ipp, cred_t *cr, int flags) ++{ ++ zfs_sb_t *zsb = ITOZSB(dip); ++ char *dsname; ++ int error; ++ ++ dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); ++ ++ if (snapshot_namecheck(dirname, NULL, NULL) != 0) { ++ error = EILSEQ; ++ goto out; ++ } ++ ++ dmu_objset_name(zsb->z_os, dsname); ++ ++ error = zfs_secpolicy_snapshot_perms(dsname, cr); ++ if (error) ++ goto out; ++ ++ if (error == 0) { ++ error = dmu_objset_snapshot(dsname, dirname, ++ NULL, NULL, B_FALSE, B_FALSE, -1); ++ if (error) ++ goto out; ++ ++ error = zfsctl_snapdir_lookup(dip, dirname, ipp, ++ 0, cr, NULL, NULL); ++ } ++out: ++ kmem_free(dsname, MAXNAMELEN); ++ ++ return (error); ++} ++ ++/* ++ * When a .zfs/snapshot/ inode is evicted they must be removed ++ * from the snapshot list. This will normally happen as part of the auto ++ * unmount, however in the case of a manual snapshot unmount this will be ++ * the only notification we receive. ++ */ ++void ++zfsctl_snapdir_inactive(struct inode *ip) ++{ ++ zfs_sb_t *zsb = ITOZSB(ip); ++ zfs_snapentry_t *sep, *next; ++ ++ mutex_enter(&zsb->z_ctldir_lock); ++ ++ sep = avl_first(&zsb->z_ctldir_snaps); ++ while (sep != NULL) { ++ next = AVL_NEXT(&zsb->z_ctldir_snaps, sep); ++ ++ if (sep->se_inode == ip) { ++ avl_remove(&zsb->z_ctldir_snaps, sep); ++ cancel_delayed_work_sync(&sep->se_work); ++ zfsctl_sep_free(sep); ++ break; ++ } ++ sep = next; ++ } ++ ++ mutex_exit(&zsb->z_ctldir_lock); ++} ++ ++/* ++ * Attempt to unmount a snapshot by making a call to user space. ++ * There is no assurance that this can or will succeed, is just a ++ * best effort. 
In the case where it does fail, perhaps because ++ * it's in use, the unmount will fail harmlessly. ++ */ ++#define SET_UNMOUNT_CMD \ ++ "exec 0/dev/null " \ ++ " 2>/dev/null; " \ ++ "umount -t zfs -n '%s%s'" ++ ++static int ++__zfsctl_unmount_snapshot(zfs_snapentry_t *sep, int flags) ++{ ++ char *argv[] = { "/bin/sh", "-c", NULL, NULL }; ++ char *envp[] = { NULL }; ++ int error; ++ ++ argv[2] = kmem_asprintf(SET_UNMOUNT_CMD, ++ flags & MNT_FORCE ? "-f " : "", sep->se_path); ++ error = call_usermodehelper(argv[0], argv, envp, 1); ++ strfree(argv[2]); ++ ++ /* ++ * The umount system utility will return 256 on error. We must ++ * assume this error is because the file system is busy so it is ++ * converted to the more sensible EBUSY. ++ */ ++ if (error) ++ error = EBUSY; ++ ++ /* ++ * This was the result of a manual unmount, cancel the delayed work ++ * to prevent zfsctl_expire_snapshot() from attempting a unmount. ++ */ ++ if ((error == 0) && !(flags & MNT_EXPIRE)) ++ cancel_delayed_work(&sep->se_work); ++ ++ return (error); ++} ++ ++int ++zfsctl_unmount_snapshot(zfs_sb_t *zsb, char *name, int flags) ++{ ++ zfs_snapentry_t search; ++ zfs_snapentry_t *sep; ++ int error = 0; ++ ++ mutex_enter(&zsb->z_ctldir_lock); ++ ++ search.se_name = name; ++ sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL); ++ if (sep) { ++ avl_remove(&zsb->z_ctldir_snaps, sep); ++ error = __zfsctl_unmount_snapshot(sep, flags); ++ if (error == EBUSY) ++ avl_add(&zsb->z_ctldir_snaps, sep); ++ else ++ zfsctl_sep_free(sep); ++ } else { ++ error = ENOENT; ++ } ++ ++ mutex_exit(&zsb->z_ctldir_lock); ++ ASSERT3S(error, >=, 0); ++ ++ return (error); ++} ++ ++/* ++ * Traverse all mounted snapshots and attempt to unmount them. This ++ * is best effort, on failure EEXIST is returned and count will be set ++ * to the number of file snapshots which could not be unmounted. ++ */ ++int ++zfsctl_unmount_snapshots(zfs_sb_t *zsb, int flags, int *count) ++{ ++ zfs_snapentry_t *sep, *next; ++ int error = 0; ++ ++ *count = 0; ++ ++ ASSERT(zsb->z_ctldir != NULL); ++ mutex_enter(&zsb->z_ctldir_lock); ++ ++ sep = avl_first(&zsb->z_ctldir_snaps); ++ while (sep != NULL) { ++ next = AVL_NEXT(&zsb->z_ctldir_snaps, sep); ++ avl_remove(&zsb->z_ctldir_snaps, sep); ++ error = __zfsctl_unmount_snapshot(sep, flags); ++ if (error == EBUSY) { ++ avl_add(&zsb->z_ctldir_snaps, sep); ++ (*count)++; ++ } else { ++ zfsctl_sep_free(sep); ++ } ++ ++ sep = next; ++ } ++ ++ mutex_exit(&zsb->z_ctldir_lock); ++ ++ return ((*count > 0) ? EEXIST : 0); ++} ++ ++#define SET_MOUNT_CMD \ ++ "exec 0/dev/null " \ ++ " 2>/dev/null; " \ ++ "mount -t zfs -n '%s' '%s'" ++ ++int ++zfsctl_mount_snapshot(struct path *path, int flags) ++{ ++ struct dentry *dentry = path->dentry; ++ struct inode *ip = dentry->d_inode; ++ zfs_sb_t *zsb = ITOZSB(ip); ++ char *full_name, *full_path; ++ zfs_snapentry_t *sep; ++ zfs_snapentry_t search; ++ char *argv[] = { "/bin/sh", "-c", NULL, NULL }; ++ char *envp[] = { NULL }; ++ int error; ++ ++ ZFS_ENTER(zsb); ++ ++ full_name = kmem_zalloc(MAXNAMELEN, KM_SLEEP); ++ full_path = kmem_zalloc(PATH_MAX, KM_SLEEP); ++ ++ error = zfsctl_snapshot_zname(ip, dname(dentry), MAXNAMELEN, full_name); ++ if (error) ++ goto error; ++ ++ error = zfsctl_snapshot_zpath(path, PATH_MAX, full_path); ++ if (error) ++ goto error; ++ ++ /* ++ * Attempt to mount the snapshot from user space. Normally this ++ * would be done using the vfs_kern_mount() function, however that ++ * function is marked GPL-only and cannot be used. 
On error we ++ * careful to log the real error to the console and return EISDIR ++ * to safely abort the automount. This should be very rare. ++ */ ++ argv[2] = kmem_asprintf(SET_MOUNT_CMD, full_name, full_path); ++ error = call_usermodehelper(argv[0], argv, envp, 1); ++ strfree(argv[2]); ++ if (error) { ++ printk("ZFS: Unable to automount %s at %s: %d\n", ++ full_name, full_path, error); ++ error = EISDIR; ++ goto error; ++ } ++ ++ mutex_enter(&zsb->z_ctldir_lock); ++ ++ /* ++ * Ensure a previous entry does not exist, if it does safely remove ++ * it any cancel the outstanding expiration. This can occur when a ++ * snapshot is manually unmounted and then an automount is triggered. ++ */ ++ search.se_name = full_name; ++ sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL); ++ if (sep) { ++ avl_remove(&zsb->z_ctldir_snaps, sep); ++ cancel_delayed_work_sync(&sep->se_work); ++ zfsctl_sep_free(sep); ++ } ++ ++ sep = zfsctl_sep_alloc(); ++ sep->se_name = full_name; ++ sep->se_path = full_path; ++ sep->se_inode = ip; ++ avl_add(&zsb->z_ctldir_snaps, sep); ++ ++ spl_init_delayed_work(&sep->se_work, zfsctl_expire_snapshot, sep); ++ schedule_delayed_work(&sep->se_work, zfs_expire_snapshot * HZ); ++ ++ mutex_exit(&zsb->z_ctldir_lock); ++error: ++ if (error) { ++ kmem_free(full_name, MAXNAMELEN); ++ kmem_free(full_path, PATH_MAX); ++ } ++ ++ ZFS_EXIT(zsb); ++ ++ return (error); ++} ++ ++/* ++ * Check if this super block has a matching objset id. ++ */ ++static int ++zfsctl_test_super(struct super_block *sb, void *objsetidp) ++{ ++ zfs_sb_t *zsb = sb->s_fs_info; ++ uint64_t objsetid = *(uint64_t *)objsetidp; ++ ++ return (dmu_objset_id(zsb->z_os) == objsetid); ++} ++ ++/* ++ * Prevent a new super block from being allocated if an existing one ++ * could not be located. We only want to preform a lookup operation. ++ */ ++static int ++zfsctl_set_super(struct super_block *sb, void *objsetidp) ++{ ++ return (-EEXIST); ++} ++ ++int ++zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid, zfs_sb_t **zsbp) ++{ ++ zfs_sb_t *zsb = sb->s_fs_info; ++ struct super_block *sbp; ++ zfs_snapentry_t *sep; ++ uint64_t id; ++ int error; ++ ++ ASSERT(zsb->z_ctldir != NULL); ++ ++ mutex_enter(&zsb->z_ctldir_lock); ++ ++ /* ++ * Verify that the snapshot is mounted. ++ */ ++ sep = avl_first(&zsb->z_ctldir_snaps); ++ while (sep != NULL) { ++ error = dmu_snapshot_id(zsb->z_os, sep->se_name, &id); ++ if (error) ++ goto out; ++ ++ if (id == objsetid) ++ break; ++ ++ sep = AVL_NEXT(&zsb->z_ctldir_snaps, sep); ++ } ++ ++ if (sep != NULL) { ++ /* ++ * Lookup the mounted root rather than the covered mount ++ * point. This may fail if the snapshot has just been ++ * unmounted by an unrelated user space process. This ++ * race cannot occur to an expired mount point because ++ * we hold the zsb->z_ctldir_lock to prevent the race. 
++ */ ++ sbp = zpl_sget(&zpl_fs_type, zfsctl_test_super, ++ zfsctl_set_super, 0, &id); ++ if (IS_ERR(sbp)) { ++ error = -PTR_ERR(sbp); ++ } else { ++ *zsbp = sbp->s_fs_info; ++ deactivate_super(sbp); ++ } ++ } else { ++ error = EINVAL; ++ } ++out: ++ mutex_exit(&zsb->z_ctldir_lock); ++ ASSERT3S(error, >=, 0); ++ ++ return (error); ++} ++ ++/* ARGSUSED */ ++int ++zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, ++ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) ++{ ++ zfs_sb_t *zsb = ITOZSB(dip); ++ struct inode *ip; ++ znode_t *dzp; ++ int error; ++ ++ ZFS_ENTER(zsb); ++ ++ if (zsb->z_shares_dir == 0) { ++ ZFS_EXIT(zsb); ++ return (ENOTSUP); ++ } ++ ++ error = zfs_zget(zsb, zsb->z_shares_dir, &dzp); ++ if (error) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ error = zfs_lookup(ZTOI(dzp), name, &ip, 0, cr, NULL, NULL); ++ ++ iput(ZTOI(dzp)); ++ ZFS_EXIT(zsb); ++ ++ return (error); ++} ++ ++ ++/* ++ * Initialize the various pieces we'll need to create and manipulate .zfs ++ * directories. Currently this is unused but available. ++ */ ++void ++zfsctl_init(void) ++{ ++} ++ ++/* ++ * Cleanup the various pieces we needed for .zfs directories. In particular ++ * ensure the expiry timer is canceled safely. ++ */ ++void ++zfsctl_fini(void) ++{ ++} ++ ++module_param(zfs_expire_snapshot, int, 0644); ++MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot"); +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_debug.c linux-3.2.33-go/fs/zfs/zfs/zfs_debug.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_debug.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_debug.c 2012-11-16 23:25:34.348039346 +0100 +@@ -0,0 +1,83 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++ ++/* ++ * Enable various debugging features. ++ */ ++int zfs_flags = 0; ++ ++/* ++ * zfs_recover can be set to nonzero to attempt to recover from ++ * otherwise-fatal errors, typically caused by on-disk corruption. When ++ * set, calls to zfs_panic_recover() will turn into warning messages. ++ */ ++int zfs_recover = 0; ++ ++ ++void ++zfs_panic_recover(const char *fmt, ...) ++{ ++ va_list adx; ++ ++ va_start(adx, fmt); ++ vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); ++ va_end(adx); ++} ++ ++/* ++ * Debug logging is enabled by default for production kernel builds. ++ * The overhead for this is negligible and the logs can be valuable when ++ * debugging. For non-production user space builds all debugging except ++ * logging is enabled since performance is no longer a concern. 
++ */ ++void ++zfs_dbgmsg_init(void) ++{ ++ if (zfs_flags == 0) { ++#if defined(_KERNEL) ++ zfs_flags = ZFS_DEBUG_DPRINTF; ++ spl_debug_set_mask(spl_debug_get_mask() | SD_DPRINTF); ++ spl_debug_set_subsys(spl_debug_get_subsys() | SS_USER1); ++#else ++ zfs_flags = ~ZFS_DEBUG_DPRINTF; ++#endif /* _KERNEL */ ++ } ++} ++ ++void ++zfs_dbgmsg_fini(void) ++{ ++ return; ++} ++ ++ ++#if defined(_KERNEL) ++module_param(zfs_flags, int, 0644); ++MODULE_PARM_DESC(zfs_flags, "Set additional debugging flags"); ++ ++module_param(zfs_recover, int, 0644); ++MODULE_PARM_DESC(zfs_recover, "Set to attempt to recover from fatal errors"); ++#endif /* _KERNEL */ +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_dir.c linux-3.2.33-go/fs/zfs/zfs/zfs_dir.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_dir.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_dir.c 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,1124 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "fs/fs_subr.h" ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups ++ * of names after deciding which is the appropriate lookup interface. ++ */ ++static int ++zfs_match_find(zfs_sb_t *zsb, znode_t *dzp, char *name, boolean_t exact, ++ boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) ++{ ++ boolean_t conflict = B_FALSE; ++ int error; ++ ++ if (zsb->z_norm) { ++ matchtype_t mt = MT_FIRST; ++ size_t bufsz = 0; ++ char *buf = NULL; ++ ++ if (rpnp) { ++ buf = rpnp->pn_buf; ++ bufsz = rpnp->pn_bufsize; ++ } ++ if (exact) ++ mt = MT_EXACT; ++ /* ++ * In the non-mixed case we only expect there would ever ++ * be one match, but we need to use the normalizing lookup. ++ */ ++ error = zap_lookup_norm(zsb->z_os, dzp->z_id, name, 8, 1, ++ zoid, mt, buf, bufsz, &conflict); ++ } else { ++ error = zap_lookup(zsb->z_os, dzp->z_id, name, 8, 1, zoid); ++ } ++ ++ /* ++ * Allow multiple entries provided the first entry is ++ * the object id. Non-zpl consumers may safely make ++ * use of the additional space. ++ * ++ * XXX: This should be a feature flag for compatibility ++ */ ++ if (error == EOVERFLOW) ++ error = 0; ++ ++ if (zsb->z_norm && !error && deflags) ++ *deflags = conflict ? 
ED_CASE_CONFLICT : 0; ++ ++ *zoid = ZFS_DIRENT_OBJ(*zoid); ++ ++#ifdef HAVE_DNLC ++ if (error == ENOENT && update) ++ dnlc_update(ZTOI(dzp), name, DNLC_NO_VNODE); ++#endif /* HAVE_DNLC */ ++ ++ return (error); ++} ++ ++/* ++ * Lock a directory entry. A dirlock on protects that name ++ * in dzp's directory zap object. As long as you hold a dirlock, you can ++ * assume two things: (1) dzp cannot be reaped, and (2) no other thread ++ * can change the zap entry for (i.e. link or unlink) this name. ++ * ++ * Input arguments: ++ * dzp - znode for directory ++ * name - name of entry to lock ++ * flag - ZNEW: if the entry already exists, fail with EEXIST. ++ * ZEXISTS: if the entry does not exist, fail with ENOENT. ++ * ZSHARED: allow concurrent access with other ZSHARED callers. ++ * ZXATTR: we want dzp's xattr directory ++ * ZCILOOK: On a mixed sensitivity file system, ++ * this lookup should be case-insensitive. ++ * ZCIEXACT: On a purely case-insensitive file system, ++ * this lookup should be case-sensitive. ++ * ZRENAMING: we are locking for renaming, force narrow locks ++ * ZHAVELOCK: Don't grab the z_name_lock for this call. The ++ * current thread already holds it. ++ * ++ * Output arguments: ++ * zpp - pointer to the znode for the entry (NULL if there isn't one) ++ * dlpp - pointer to the dirlock for this entry (NULL on error) ++ * direntflags - (case-insensitive lookup only) ++ * flags if multiple case-sensitive matches exist in directory ++ * realpnp - (case-insensitive lookup only) ++ * actual name matched within the directory ++ * ++ * Return value: 0 on success or errno on failure. ++ * ++ * NOTE: Always checks for, and rejects, '.' and '..'. ++ * NOTE: For case-insensitive file systems we take wide locks (see below), ++ * but return znode pointers to a single match. ++ */ ++int ++zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, ++ int flag, int *direntflags, pathname_t *realpnp) ++{ ++ zfs_sb_t *zsb = ZTOZSB(dzp); ++ zfs_dirlock_t *dl; ++ boolean_t update; ++ boolean_t exact; ++ uint64_t zoid; ++#ifdef HAVE_DNLC ++ vnode_t *vp = NULL; ++#endif /* HAVE_DNLC */ ++ int error = 0; ++ int cmpflags; ++ ++ *zpp = NULL; ++ *dlpp = NULL; ++ ++ /* ++ * Verify that we are not trying to lock '.', '..', or '.zfs' ++ */ ++ if ((name[0] == '.' && ++ (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) || ++ (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)) ++ return (EEXIST); ++ ++ /* ++ * Case sensitivity and normalization preferences are set when ++ * the file system is created. These are stored in the ++ * zsb->z_case and zsb->z_norm fields. These choices ++ * affect what vnodes can be cached in the DNLC, how we ++ * perform zap lookups, and the "width" of our dirlocks. ++ * ++ * A normal dirlock locks a single name. Note that with ++ * normalization a name can be composed multiple ways, but ++ * when normalized, these names all compare equal. A wide ++ * dirlock locks multiple names. We need these when the file ++ * system is supporting mixed-mode access. It is sometimes ++ * necessary to lock all case permutations of file name at ++ * once so that simultaneous case-insensitive/case-sensitive ++ * behaves as rationally as possible. ++ */ ++ ++ /* ++ * Decide if exact matches should be requested when performing ++ * a zap lookup on file systems supporting case-insensitive ++ * access. 
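The zfs_dirent_lock() contract documented above is easiest to read alongside its typical calling pattern; the sketch below (illustrative only, not part of the patch hunks, helper name hypothetical) mirrors how zfs_dirlook() further down in this file resolves an existing entry:

/*
 * Illustrative sketch only -- not part of the patch.  Typical use of
 * zfs_dirent_lock()/zfs_dirent_unlock() to look up an existing entry,
 * modeled on zfs_dirlook() below; error handling is abbreviated.
 */
static int
example_dir_lookup(znode_t *dzp, char *name, struct inode **ipp)
{
	zfs_dirlock_t *dl;
	znode_t *zp;
	int error;

	/* ZEXISTS: fail with ENOENT if absent; ZSHARED: allow other readers. */
	error = zfs_dirent_lock(&dl, dzp, name, &zp,
	    ZEXISTS | ZSHARED, NULL, NULL);
	if (error)
		return (error);

	*ipp = ZTOI(zp);	/* zp is returned held */
	zfs_dirent_unlock(dl);	/* drop the dirlock, keep the inode hold */

	return (0);
}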
++ */ ++ exact = ++ ((zsb->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) || ++ ((zsb->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK)); ++ ++ /* ++ * Only look in or update the DNLC if we are looking for the ++ * name on a file system that does not require normalization ++ * or case folding. We can also look there if we happen to be ++ * on a non-normalizing, mixed sensitivity file system IF we ++ * are looking for the exact name. ++ * ++ * Maybe can add TO-UPPERed version of name to dnlc in ci-only ++ * case for performance improvement? ++ */ ++ update = !zsb->z_norm || ++ ((zsb->z_case == ZFS_CASE_MIXED) && ++ !(zsb->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); ++ ++ /* ++ * ZRENAMING indicates we are in a situation where we should ++ * take narrow locks regardless of the file system's ++ * preferences for normalizing and case folding. This will ++ * prevent us deadlocking trying to grab the same wide lock ++ * twice if the two names happen to be case-insensitive ++ * matches. ++ */ ++ if (flag & ZRENAMING) ++ cmpflags = 0; ++ else ++ cmpflags = zsb->z_norm; ++ ++ /* ++ * Wait until there are no locks on this name. ++ * ++ * Don't grab the the lock if it is already held. However, cannot ++ * have both ZSHARED and ZHAVELOCK together. ++ */ ++ ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); ++ if (!(flag & ZHAVELOCK)) ++ rw_enter(&dzp->z_name_lock, RW_READER); ++ ++ mutex_enter(&dzp->z_lock); ++ for (;;) { ++ if (dzp->z_unlinked) { ++ mutex_exit(&dzp->z_lock); ++ if (!(flag & ZHAVELOCK)) ++ rw_exit(&dzp->z_name_lock); ++ return (ENOENT); ++ } ++ for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { ++ if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, ++ U8_UNICODE_LATEST, &error) == 0) || error != 0) ++ break; ++ } ++ if (error != 0) { ++ mutex_exit(&dzp->z_lock); ++ if (!(flag & ZHAVELOCK)) ++ rw_exit(&dzp->z_name_lock); ++ return (ENOENT); ++ } ++ if (dl == NULL) { ++ /* ++ * Allocate a new dirlock and add it to the list. ++ */ ++ dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); ++ cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); ++ dl->dl_name = name; ++ dl->dl_sharecnt = 0; ++ dl->dl_namelock = 0; ++ dl->dl_namesize = 0; ++ dl->dl_dzp = dzp; ++ dl->dl_next = dzp->z_dirlocks; ++ dzp->z_dirlocks = dl; ++ break; ++ } ++ if ((flag & ZSHARED) && dl->dl_sharecnt != 0) ++ break; ++ cv_wait(&dl->dl_cv, &dzp->z_lock); ++ } ++ ++ /* ++ * If the z_name_lock was NOT held for this dirlock record it. ++ */ ++ if (flag & ZHAVELOCK) ++ dl->dl_namelock = 1; ++ ++ if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { ++ /* ++ * We're the second shared reference to dl. Make a copy of ++ * dl_name in case the first thread goes away before we do. ++ * Note that we initialize the new name before storing its ++ * pointer into dl_name, because the first thread may load ++ * dl->dl_name at any time. He'll either see the old value, ++ * which is his, or the new shared copy; either is OK. ++ */ ++ dl->dl_namesize = strlen(dl->dl_name) + 1; ++ name = kmem_alloc(dl->dl_namesize, KM_SLEEP); ++ bcopy(dl->dl_name, name, dl->dl_namesize); ++ dl->dl_name = name; ++ } ++ ++ mutex_exit(&dzp->z_lock); ++ ++ /* ++ * We have a dirlock on the name. (Note that it is the dirlock, ++ * not the dzp's z_lock, that protects the name in the zap object.) ++ * See if there's an object by this name; if so, put a hold on it. ++ */ ++ if (flag & ZXATTR) { ++ error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zsb), &zoid, ++ sizeof (zoid)); ++ if (error == 0) ++ error = (zoid == 0 ? 
ENOENT : 0); ++ } else { ++#ifdef HAVE_DNLC ++ if (update) ++ vp = dnlc_lookup(ZTOI(dzp), name); ++ if (vp == DNLC_NO_VNODE) { ++ iput(vp); ++ error = ENOENT; ++ } else if (vp) { ++ if (flag & ZNEW) { ++ zfs_dirent_unlock(dl); ++ iput(vp); ++ return (EEXIST); ++ } ++ *dlpp = dl; ++ *zpp = VTOZ(vp); ++ return (0); ++ } else { ++ error = zfs_match_find(zsb, dzp, name, exact, ++ update, direntflags, realpnp, &zoid); ++ } ++#else ++ error = zfs_match_find(zsb, dzp, name, exact, ++ update, direntflags, realpnp, &zoid); ++#endif /* HAVE_DNLC */ ++ } ++ if (error) { ++ if (error != ENOENT || (flag & ZEXISTS)) { ++ zfs_dirent_unlock(dl); ++ return (error); ++ } ++ } else { ++ if (flag & ZNEW) { ++ zfs_dirent_unlock(dl); ++ return (EEXIST); ++ } ++ error = zfs_zget(zsb, zoid, zpp); ++ if (error) { ++ zfs_dirent_unlock(dl); ++ return (error); ++ } ++#ifdef HAVE_DNLC ++ if (!(flag & ZXATTR) && update) ++ dnlc_update(ZTOI(dzp), name, ZTOI(*zpp)); ++#endif /* HAVE_DNLC */ ++ } ++ ++ *dlpp = dl; ++ ++ return (0); ++} ++ ++/* ++ * Unlock this directory entry and wake anyone who was waiting for it. ++ */ ++void ++zfs_dirent_unlock(zfs_dirlock_t *dl) ++{ ++ znode_t *dzp = dl->dl_dzp; ++ zfs_dirlock_t **prev_dl, *cur_dl; ++ ++ mutex_enter(&dzp->z_lock); ++ ++ if (!dl->dl_namelock) ++ rw_exit(&dzp->z_name_lock); ++ ++ if (dl->dl_sharecnt > 1) { ++ dl->dl_sharecnt--; ++ mutex_exit(&dzp->z_lock); ++ return; ++ } ++ prev_dl = &dzp->z_dirlocks; ++ while ((cur_dl = *prev_dl) != dl) ++ prev_dl = &cur_dl->dl_next; ++ *prev_dl = dl->dl_next; ++ cv_broadcast(&dl->dl_cv); ++ mutex_exit(&dzp->z_lock); ++ ++ if (dl->dl_namesize != 0) ++ kmem_free(dl->dl_name, dl->dl_namesize); ++ cv_destroy(&dl->dl_cv); ++ kmem_free(dl, sizeof (*dl)); ++} ++ ++/* ++ * Look up an entry in a directory. ++ * ++ * NOTE: '.' and '..' are handled as special cases because ++ * no directory entries are actually stored for them. If this is ++ * the root of a filesystem, then '.zfs' is also treated as a ++ * special pseudo-directory. ++ */ ++int ++zfs_dirlook(znode_t *dzp, char *name, struct inode **ipp, int flags, ++ int *deflg, pathname_t *rpnp) ++{ ++ zfs_dirlock_t *dl; ++ znode_t *zp; ++ int error = 0; ++ uint64_t parent; ++ ++ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { ++ *ipp = ZTOI(dzp); ++ igrab(*ipp); ++ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { ++ zfs_sb_t *zsb = ZTOZSB(dzp); ++ ++ /* ++ * If we are a snapshot mounted under .zfs, return ++ * the inode pointer for the snapshot directory. 
++ */ ++ if ((error = sa_lookup(dzp->z_sa_hdl, ++ SA_ZPL_PARENT(zsb), &parent, sizeof (parent))) != 0) ++ return (error); ++ ++ if (parent == dzp->z_id && zsb->z_parent != zsb) { ++ error = zfsctl_root_lookup(zsb->z_parent->z_ctldir, ++ "snapshot", ipp, 0, kcred, NULL, NULL); ++ return (error); ++ } ++ rw_enter(&dzp->z_parent_lock, RW_READER); ++ error = zfs_zget(zsb, parent, &zp); ++ if (error == 0) ++ *ipp = ZTOI(zp); ++ rw_exit(&dzp->z_parent_lock); ++ } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { ++ *ipp = zfsctl_root(dzp); ++ } else { ++ int zf; ++ ++ zf = ZEXISTS | ZSHARED; ++ if (flags & FIGNORECASE) ++ zf |= ZCILOOK; ++ ++ error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); ++ if (error == 0) { ++ *ipp = ZTOI(zp); ++ zfs_dirent_unlock(dl); ++ dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ ++ } ++ rpnp = NULL; ++ } ++ ++ if ((flags & FIGNORECASE) && rpnp && !error) ++ (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); ++ ++ return (error); ++} ++ ++/* ++ * unlinked Set (formerly known as the "delete queue") Error Handling ++ * ++ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we ++ * don't specify the name of the entry that we will be manipulating. We ++ * also fib and say that we won't be adding any new entries to the ++ * unlinked set, even though we might (this is to lower the minimum file ++ * size that can be deleted in a full filesystem). So on the small ++ * chance that the nlink list is using a fat zap (ie. has more than ++ * 2000 entries), we *may* not pre-read a block that's needed. ++ * Therefore it is remotely possible for some of the assertions ++ * regarding the unlinked set below to fail due to i/o error. On a ++ * nondebug system, this will result in the space being leaked. ++ */ ++void ++zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ ++ ASSERT(zp->z_unlinked); ++ ASSERT(zp->z_links == 0); ++ ++ VERIFY3U(0, ==, ++ zap_add_int(zsb->z_os, zsb->z_unlinkedobj, zp->z_id, tx)); ++} ++ ++/* ++ * Delete the entire contents of a directory. Return a count ++ * of the number of entries that could not be deleted. If we encounter ++ * an error, return a count of at least one so that the directory stays ++ * in the unlinked set. ++ * ++ * NOTE: this function assumes that the directory is inactive, ++ * so there is no need to lock its entries before deletion. ++ * Also, it assumes the directory contents is *only* regular ++ * files. ++ */ ++static int ++zfs_purgedir(znode_t *dzp) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t zap; ++ znode_t *xzp; ++ dmu_tx_t *tx; ++ zfs_sb_t *zsb = ZTOZSB(dzp); ++ zfs_dirlock_t dl; ++ int skipped = 0; ++ int error; ++ ++ for (zap_cursor_init(&zc, zsb->z_os, dzp->z_id); ++ (error = zap_cursor_retrieve(&zc, &zap)) == 0; ++ zap_cursor_advance(&zc)) { ++ error = zfs_zget(zsb, ++ ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); ++ if (error) { ++ skipped += 1; ++ continue; ++ } ++ ++ ASSERT(S_ISREG(ZTOI(xzp)->i_mode)||S_ISLNK(ZTOI(xzp)->i_mode)); ++ ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); ++ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); ++ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); ++ dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL); ++ /* Is this really needed ? 
*/ ++ zfs_sa_upgrade_txholds(tx, xzp); ++ error = dmu_tx_assign(tx, TXG_WAIT); ++ if (error) { ++ dmu_tx_abort(tx); ++ iput(ZTOI(xzp)); ++ skipped += 1; ++ continue; ++ } ++ bzero(&dl, sizeof (dl)); ++ dl.dl_dzp = dzp; ++ dl.dl_name = zap.za_name; ++ ++ error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); ++ if (error) ++ skipped += 1; ++ dmu_tx_commit(tx); ++ ++ iput(ZTOI(xzp)); ++ } ++ zap_cursor_fini(&zc); ++ if (error != ENOENT) ++ skipped += 1; ++ return (skipped); ++} ++ ++/* ++ * Clean up any znodes that had no links when we either crashed or ++ * (force) umounted the file system. ++ */ ++void ++zfs_unlinked_drain(zfs_sb_t *zsb) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t zap; ++ dmu_object_info_t doi; ++ znode_t *zp; ++ int error; ++ ++ /* ++ * Interate over the contents of the unlinked set. ++ */ ++ for (zap_cursor_init(&zc, zsb->z_os, zsb->z_unlinkedobj); ++ zap_cursor_retrieve(&zc, &zap) == 0; ++ zap_cursor_advance(&zc)) { ++ ++ /* ++ * See what kind of object we have in list ++ */ ++ ++ error = dmu_object_info(zsb->z_os, zap.za_first_integer, &doi); ++ if (error != 0) ++ continue; ++ ++ ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || ++ (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); ++ /* ++ * We need to re-mark these list entries for deletion, ++ * so we pull them back into core and set zp->z_unlinked. ++ */ ++ error = zfs_zget(zsb, zap.za_first_integer, &zp); ++ ++ /* ++ * We may pick up znodes that are already marked for deletion. ++ * This could happen during the purge of an extended attribute ++ * directory. All we need to do is skip over them, since they ++ * are already in the system marked z_unlinked. ++ */ ++ if (error != 0) ++ continue; ++ ++ zp->z_unlinked = B_TRUE; ++ ++ /* ++ * If this is an attribute directory, purge its contents. ++ */ ++ if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) { ++ /* ++ * We don't need to check the return value of ++ * zfs_purgedir here, because zfs_rmnode will just ++ * return this xattr directory to the unlinked set ++ * until all of its xattrs are gone. ++ */ ++ (void) zfs_purgedir(zp); ++ } ++ ++ iput(ZTOI(zp)); ++ } ++ zap_cursor_fini(&zc); ++} ++ ++void ++zfs_rmnode(znode_t *zp) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ objset_t *os = zsb->z_os; ++ znode_t *xzp = NULL; ++ dmu_tx_t *tx; ++ uint64_t acl_obj; ++ uint64_t xattr_obj; ++ uint64_t count; ++ int error; ++ ++ ASSERT(zp->z_links == 0); ++ ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0); ++ ++ /* ++ * If this is an attribute directory, purge its contents. ++ */ ++ if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) { ++ error = zap_count(os, zp->z_id, &count); ++ if (error) { ++ zfs_znode_dmu_fini(zp); ++ return; ++ } ++ ++ if (count > 0) { ++ taskq_t *taskq; ++ ++ /* ++ * There are still directory entries in this xattr ++ * directory. Let zfs_unlinked_drain() deal with ++ * them to avoid deadlocking this process in the ++ * zfs_purgedir()->zfs_zget()->ilookup() callpath ++ * on the xattr inode's I_FREEING bit. ++ */ ++ taskq = dsl_pool_iput_taskq(dmu_objset_pool(os)); ++ taskq_dispatch(taskq, (task_func_t *) ++ zfs_unlinked_drain, zsb, TQ_SLEEP); ++ ++ zfs_znode_dmu_fini(zp); ++ return; ++ } ++ } ++ ++ /* ++ * Free up all the data in the file. ++ */ ++ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); ++ if (error) { ++ /* ++ * Not enough space. Leave the file in the unlinked set. ++ */ ++ zfs_znode_dmu_fini(zp); ++ return; ++ } ++ ++ /* ++ * If the file has extended attributes, we're going to unlink ++ * the xattr dir. 
++ */ ++ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb), ++ &xattr_obj, sizeof (xattr_obj)); ++ if (error == 0 && xattr_obj) { ++ error = zfs_zget(zsb, xattr_obj, &xzp); ++ ASSERT(error == 0); ++ } ++ ++ acl_obj = zfs_external_acl(zp); ++ ++ /* ++ * Set up the final transaction. ++ */ ++ tx = dmu_tx_create(os); ++ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); ++ dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL); ++ if (xzp) { ++ dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, TRUE, NULL); ++ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); ++ } ++ if (acl_obj) ++ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); ++ ++ zfs_sa_upgrade_txholds(tx, zp); ++ error = dmu_tx_assign(tx, TXG_WAIT); ++ if (error) { ++ /* ++ * Not enough space to delete the file. Leave it in the ++ * unlinked set, leaking it until the fs is remounted (at ++ * which point we'll call zfs_unlinked_drain() to process it). ++ */ ++ dmu_tx_abort(tx); ++ zfs_znode_dmu_fini(zp); ++ goto out; ++ } ++ ++ if (xzp) { ++ ASSERT(error == 0); ++ mutex_enter(&xzp->z_lock); ++ xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ ++ xzp->z_links = 0; /* no more links to it */ ++ VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zsb), ++ &xzp->z_links, sizeof (xzp->z_links), tx)); ++ mutex_exit(&xzp->z_lock); ++ zfs_unlinked_add(xzp, tx); ++ } ++ ++ /* Remove this znode from the unlinked set */ ++ VERIFY3U(0, ==, ++ zap_remove_int(zsb->z_os, zsb->z_unlinkedobj, zp->z_id, tx)); ++ ++ zfs_znode_delete(zp, tx); ++ ++ dmu_tx_commit(tx); ++out: ++ if (xzp) ++ iput(ZTOI(xzp)); ++} ++ ++static uint64_t ++zfs_dirent(znode_t *zp, uint64_t mode) ++{ ++ uint64_t de = zp->z_id; ++ ++ if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE) ++ de |= IFTODT(mode) << 60; ++ return (de); ++} ++ ++/* ++ * Link zp into dl. Can only fail if zp has been unlinked. 
++ */ ++int ++zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) ++{ ++ znode_t *dzp = dl->dl_dzp; ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ uint64_t value; ++ int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); ++ sa_bulk_attr_t bulk[5]; ++ uint64_t mtime[2], ctime[2]; ++ int count = 0; ++ int error; ++ ++ mutex_enter(&zp->z_lock); ++ ++ if (!(flag & ZRENAMING)) { ++ if (zp->z_unlinked) { /* no new links to unlinked zp */ ++ ASSERT(!(flag & (ZNEW | ZEXISTS))); ++ mutex_exit(&zp->z_lock); ++ return (ENOENT); ++ } ++ zp->z_links++; ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zsb), NULL, ++ &zp->z_links, sizeof (zp->z_links)); ++ ++ } ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zsb), NULL, ++ &dzp->z_id, sizeof (dzp->z_id)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL, ++ &zp->z_pflags, sizeof (zp->z_pflags)); ++ ++ if (!(flag & ZNEW)) { ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, ++ ctime, sizeof (ctime)); ++ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ++ ctime, B_TRUE); ++ } ++ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ++ ASSERT(error == 0); ++ ++ mutex_exit(&zp->z_lock); ++ ++ mutex_enter(&dzp->z_lock); ++ dzp->z_size++; ++ dzp->z_links += zp_is_dir; ++ count = 0; ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zsb), NULL, ++ &dzp->z_size, sizeof (dzp->z_size)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zsb), NULL, ++ &dzp->z_links, sizeof (dzp->z_links)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, ++ mtime, sizeof (mtime)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, ++ ctime, sizeof (ctime)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL, ++ &dzp->z_pflags, sizeof (dzp->z_pflags)); ++ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); ++ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ++ ASSERT(error == 0); ++ mutex_exit(&dzp->z_lock); ++ ++ value = zfs_dirent(zp, zp->z_mode); ++ error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, ++ 8, 1, &value, tx); ++ ASSERT(error == 0); ++ ++ return (0); ++} ++ ++static int ++zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, ++ int flag) ++{ ++ int error; ++ ++ if (ZTOZSB(zp)->z_norm) { ++ if (((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE) && ++ (flag & ZCIEXACT)) || ++ ((ZTOZSB(zp)->z_case == ZFS_CASE_MIXED) && ++ !(flag & ZCILOOK))) ++ error = zap_remove_norm(ZTOZSB(zp)->z_os, ++ dzp->z_id, dl->dl_name, MT_EXACT, tx); ++ else ++ error = zap_remove_norm(ZTOZSB(zp)->z_os, ++ dzp->z_id, dl->dl_name, MT_FIRST, tx); ++ } else { ++ error = zap_remove(ZTOZSB(zp)->z_os, ++ dzp->z_id, dl->dl_name, tx); ++ } ++ ++ return (error); ++} ++ ++/* ++ * Unlink zp from dl, and mark zp for deletion if this was the last link. Can ++ * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY). ++ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. ++ * If it's non-NULL, we use it to indicate whether the znode needs deletion, ++ * and it's the caller's job to do it. 
++ */ ++int ++zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, ++ boolean_t *unlinkedp) ++{ ++ znode_t *dzp = dl->dl_dzp; ++ zfs_sb_t *zsb = ZTOZSB(dzp); ++ int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); ++ boolean_t unlinked = B_FALSE; ++ sa_bulk_attr_t bulk[5]; ++ uint64_t mtime[2], ctime[2]; ++ int count = 0; ++ int error; ++ ++#ifdef HAVE_DNLC ++ dnlc_remove(ZTOI(dzp), dl->dl_name); ++#endif /* HAVE_DNLC */ ++ ++ if (!(flag & ZRENAMING)) { ++ mutex_enter(&zp->z_lock); ++ ++ if (zp_is_dir && !zfs_dirempty(zp)) { ++ mutex_exit(&zp->z_lock); ++ return (ENOTEMPTY); ++ } ++ ++ /* ++ * If we get here, we are going to try to remove the object. ++ * First try removing the name from the directory; if that ++ * fails, return the error. ++ */ ++ error = zfs_dropname(dl, zp, dzp, tx, flag); ++ if (error != 0) { ++ mutex_exit(&zp->z_lock); ++ return (error); ++ } ++ ++ if (zp->z_links <= zp_is_dir) { ++ zfs_panic_recover("zfs: link count on %lu is %u, " ++ "should be at least %u", zp->z_id, ++ (int)zp->z_links, zp_is_dir + 1); ++ zp->z_links = zp_is_dir + 1; ++ } ++ if (--zp->z_links == zp_is_dir) { ++ zp->z_unlinked = B_TRUE; ++ zp->z_links = 0; ++ unlinked = B_TRUE; ++ } else { ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), ++ NULL, &ctime, sizeof (ctime)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), ++ NULL, &zp->z_pflags, sizeof (zp->z_pflags)); ++ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, ++ B_TRUE); ++ } ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zsb), ++ NULL, &zp->z_links, sizeof (zp->z_links)); ++ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ++ count = 0; ++ ASSERT(error == 0); ++ mutex_exit(&zp->z_lock); ++ } else { ++ error = zfs_dropname(dl, zp, dzp, tx, flag); ++ if (error != 0) ++ return (error); ++ } ++ ++ mutex_enter(&dzp->z_lock); ++ dzp->z_size--; /* one dirent removed */ ++ dzp->z_links -= zp_is_dir; /* ".." link from zp */ ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zsb), ++ NULL, &dzp->z_links, sizeof (dzp->z_links)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zsb), ++ NULL, &dzp->z_size, sizeof (dzp->z_size)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), ++ NULL, ctime, sizeof (ctime)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), ++ NULL, mtime, sizeof (mtime)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), ++ NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); ++ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); ++ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ++ ASSERT(error == 0); ++ mutex_exit(&dzp->z_lock); ++ ++ if (unlinkedp != NULL) ++ *unlinkedp = unlinked; ++ else if (unlinked) ++ zfs_unlinked_add(zp, tx); ++ ++ return (0); ++} ++ ++/* ++ * Indicate whether the directory is empty. Works with or without z_lock ++ * held, but can only be consider a hint in the latter case. Returns true ++ * if only "." and ".." remain and there's no work in progress. 
++ */ ++boolean_t ++zfs_dirempty(znode_t *dzp) ++{ ++ return (dzp->z_size == 2 && dzp->z_dirlocks == 0); ++} ++ ++int ++zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ znode_t *xzp; ++ dmu_tx_t *tx; ++ int error; ++ zfs_acl_ids_t acl_ids; ++ boolean_t fuid_dirtied; ++#ifdef DEBUG ++ uint64_t parent; ++#endif ++ ++ *xipp = NULL; ++ ++ if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))) ++ return (error); ++ ++ if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, ++ &acl_ids)) != 0) ++ return (error); ++ if (zfs_acl_ids_overquota(zsb, &acl_ids)) { ++ zfs_acl_ids_free(&acl_ids); ++ return (EDQUOT); ++ } ++ ++top: ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ++ ZFS_SA_BASE_ATTR_SIZE); ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); ++ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); ++ fuid_dirtied = zsb->z_fuid_dirty; ++ if (fuid_dirtied) ++ zfs_fuid_txhold(zsb, tx); ++ error = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (error) { ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto top; ++ } ++ zfs_acl_ids_free(&acl_ids); ++ dmu_tx_abort(tx); ++ return (error); ++ } ++ zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); ++ ++ if (fuid_dirtied) ++ zfs_fuid_sync(zsb, tx); ++ ++#ifdef DEBUG ++ error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zsb), ++ &parent, sizeof (parent)); ++ ASSERT(error == 0 && parent == zp->z_id); ++#endif ++ ++ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zsb), &xzp->z_id, ++ sizeof (xzp->z_id), tx)); ++ ++ (void) zfs_log_create(zsb->z_log, tx, TX_MKXATTR, zp, ++ xzp, "", NULL, acl_ids.z_fuidp, vap); ++ ++ zfs_acl_ids_free(&acl_ids); ++ dmu_tx_commit(tx); ++ ++ *xipp = ZTOI(xzp); ++ ++ return (0); ++} ++ ++/* ++ * Return a znode for the extended attribute directory for zp. ++ * ** If the directory does not already exist, it is created ** ++ * ++ * IN: zp - znode to obtain attribute directory from ++ * cr - credentials of caller ++ * flags - flags from the VOP_LOOKUP call ++ * ++ * OUT: xipp - pointer to extended attribute znode ++ * ++ * RETURN: 0 on success ++ * error number on failure ++ */ ++int ++zfs_get_xattrdir(znode_t *zp, struct inode **xipp, cred_t *cr, int flags) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ znode_t *xzp; ++ zfs_dirlock_t *dl; ++ vattr_t va; ++ int error; ++top: ++ error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); ++ if (error) ++ return (error); ++ ++ if (xzp != NULL) { ++ *xipp = ZTOI(xzp); ++ zfs_dirent_unlock(dl); ++ return (0); ++ } ++ ++ if (!(flags & CREATE_XATTR_DIR)) { ++ zfs_dirent_unlock(dl); ++ return (ENOENT); ++ } ++ ++ if (zfs_is_readonly(zsb)) { ++ zfs_dirent_unlock(dl); ++ return (EROFS); ++ } ++ ++ /* ++ * The ability to 'create' files in an attribute ++ * directory comes from the write_xattr permission on the base file. ++ * ++ * The ability to 'search' an attribute directory requires ++ * read_xattr permission on the base file. ++ * ++ * Once in a directory the ability to read/write attributes ++ * is controlled by the permissions on the attribute file. 
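To make the zfs_get_xattrdir() contract above concrete: a caller that wants the extended attribute directory created on demand passes CREATE_XATTR_DIR in flags; without it the call returns ENOENT when no xattr directory exists, and EROFS on a read-only filesystem. A minimal sketch (illustrative only, not part of the patch hunks, helper name hypothetical):

/*
 * Illustrative sketch only -- not part of the patch.  Obtain the xattr
 * directory for a znode, creating it if necessary, per the
 * zfs_get_xattrdir() contract documented above.
 */
static int
example_open_xattrdir(znode_t *zp, struct inode **xipp, cred_t *cr)
{
	/* CREATE_XATTR_DIR requests creation when the directory is missing. */
	return (zfs_get_xattrdir(zp, xipp, cr, CREATE_XATTR_DIR));
}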
++ */ ++ va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID; ++ va.va_mode = S_IFDIR | S_ISVTX | 0777; ++ zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); ++ ++ va.va_dentry = NULL; ++ error = zfs_make_xattrdir(zp, &va, xipp, cr); ++ zfs_dirent_unlock(dl); ++ ++ if (error == ERESTART) { ++ /* NB: we already did dmu_tx_wait() if necessary */ ++ goto top; ++ } ++ ++ return (error); ++} ++ ++/* ++ * Decide whether it is okay to remove within a sticky directory. ++ * ++ * In sticky directories, write access is not sufficient; ++ * you can remove entries from a directory only if: ++ * ++ * you own the directory, ++ * you own the entry, ++ * the entry is a plain file and you have write access, ++ * or you are privileged (checked in secpolicy...). ++ * ++ * The function returns 0 if remove access is granted. ++ */ ++int ++zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) ++{ ++ uid_t uid; ++ uid_t downer; ++ uid_t fowner; ++ zfs_sb_t *zsb = ZTOZSB(zdp); ++ ++ if (zsb->z_replay) ++ return (0); ++ ++ if ((zdp->z_mode & S_ISVTX) == 0) ++ return (0); ++ ++ downer = zfs_fuid_map_id(zsb, zdp->z_uid, cr, ZFS_OWNER); ++ fowner = zfs_fuid_map_id(zsb, zp->z_uid, cr, ZFS_OWNER); ++ ++ if ((uid = crgetuid(cr)) == downer || uid == fowner || ++ (S_ISDIR(ZTOI(zp)->i_mode) && ++ zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) ++ return (0); ++ else ++ return (secpolicy_vnode_remove(cr)); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_fm.c linux-3.2.33-go/fs/zfs/zfs/zfs_fm.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_fm.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_fm.c 2012-11-16 23:25:34.349039334 +0100 +@@ -0,0 +1,875 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++/* ++ * Copyright (c) 2012 by Delphix. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * This general routine is responsible for generating all the different ZFS ++ * ereports. The payload is dependent on the class, and which arguments are ++ * supplied to the function: ++ * ++ * EREPORT POOL VDEV IO ++ * block X X X ++ * data X X ++ * device X X ++ * pool X ++ * ++ * If we are in a loading state, all errors are chained together by the same ++ * SPA-wide ENA (Error Numeric Association). ++ * ++ * For isolated I/O requests, we get the ENA from the zio_t. The propagation ++ * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want ++ * to chain together all ereports associated with a logical piece of data. 
For ++ * read I/Os, there are basically three 'types' of I/O, which form a roughly ++ * layered diagram: ++ * ++ * +---------------+ ++ * | Aggregate I/O | No associated logical data or device ++ * +---------------+ ++ * | ++ * V ++ * +---------------+ Reads associated with a piece of logical data. ++ * | Read I/O | This includes reads on behalf of RAID-Z, ++ * +---------------+ mirrors, gang blocks, retries, etc. ++ * | ++ * V ++ * +---------------+ Reads associated with a particular device, but ++ * | Physical I/O | no logical data. Issued as part of vdev caching ++ * +---------------+ and I/O aggregation. ++ * ++ * Note that 'physical I/O' here is not the same terminology as used in the rest ++ * of ZIO. Typically, 'physical I/O' simply means that there is no attached ++ * blockpointer. But I/O with no associated block pointer can still be related ++ * to a logical piece of data (i.e. RAID-Z requests). ++ * ++ * Purely physical I/O always have unique ENAs. They are not related to a ++ * particular piece of logical data, and therefore cannot be chained together. ++ * We still generate an ereport, but the DE doesn't correlate it with any ++ * logical piece of data. When such an I/O fails, the delegated I/O requests ++ * will issue a retry, which will trigger the 'real' ereport with the correct ++ * ENA. ++ * ++ * We keep track of the ENA for a ZIO chain through the 'io_logical' member. ++ * When a new logical I/O is issued, we set this to point to itself. Child I/Os ++ * then inherit this pointer, so that when it is first set subsequent failures ++ * will use the same ENA. For vdev cache fill and queue aggregation I/O, ++ * this pointer is set to NULL, and no ereport will be generated (since it ++ * doesn't actually correspond to any particular device or piece of data, ++ * and the caller will always retry without caching or queueing anyway). ++ * ++ * For checksum errors, we want to include more information about the actual ++ * error which occurs. Accordingly, we build an ereport when the error is ++ * noticed, but instead of sending it in immediately, we hang it off of the ++ * io_cksum_report field of the logical IO. When the logical IO completes ++ * (successfully or not), zfs_ereport_finish_checksum() is called with the ++ * good and bad versions of the buffer (if available), and we annotate the ++ * ereport with information about the differences. ++ */ ++#ifdef _KERNEL ++static void ++zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) ++{ ++ if (nvl) ++ fm_nvlist_destroy(nvl, FM_NVA_FREE); ++ ++ if (detector) ++ fm_nvlist_destroy(detector, FM_NVA_FREE); ++} ++ ++static void ++zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, ++ const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, ++ uint64_t stateoroffset, uint64_t size) ++{ ++ nvlist_t *ereport, *detector; ++ ++ uint64_t ena; ++ char class[64]; ++ ++ /* ++ * If we are doing a spa_tryimport() or in recovery mode, ++ * ignore errors. ++ */ ++ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || ++ spa_load_state(spa) == SPA_LOAD_RECOVER) ++ return; ++ ++ /* ++ * If we are in the middle of opening a pool, and the previous attempt ++ * failed, don't bother logging any new ereports - we're just going to ++ * get the same diagnosis anyway. ++ */ ++ if (spa_load_state(spa) != SPA_LOAD_NONE && ++ spa->spa_last_open_failed) ++ return; ++ ++ if (zio != NULL) { ++ /* ++ * If this is not a read or write zio, ignore the error. This ++ * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. 
++ */ ++ if (zio->io_type != ZIO_TYPE_READ && ++ zio->io_type != ZIO_TYPE_WRITE) ++ return; ++ ++ if (vd != NULL) { ++ /* ++ * If the vdev has already been marked as failing due ++ * to a failed probe, then ignore any subsequent I/O ++ * errors, as the DE will automatically fault the vdev ++ * on the first such failure. This also catches cases ++ * where vdev_remove_wanted is set and the device has ++ * not yet been asynchronously placed into the REMOVED ++ * state. ++ */ ++ if (zio->io_vd == vd && !vdev_accessible(vd, zio)) ++ return; ++ ++ /* ++ * Ignore checksum errors for reads from DTL regions of ++ * leaf vdevs. ++ */ ++ if (zio->io_type == ZIO_TYPE_READ && ++ zio->io_error == ECKSUM && ++ vd->vdev_ops->vdev_op_leaf && ++ vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) ++ return; ++ } ++ } ++ ++ /* ++ * For probe failure, we want to avoid posting ereports if we've ++ * already removed the device in the meantime. ++ */ ++ if (vd != NULL && ++ strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && ++ (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) ++ return; ++ ++ if ((ereport = fm_nvlist_create(NULL)) == NULL) ++ return; ++ ++ if ((detector = fm_nvlist_create(NULL)) == NULL) { ++ fm_nvlist_destroy(ereport, FM_NVA_FREE); ++ return; ++ } ++ ++ /* ++ * Serialize ereport generation ++ */ ++ mutex_enter(&spa->spa_errlist_lock); ++ ++ /* ++ * Determine the ENA to use for this event. If we are in a loading ++ * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use ++ * a root zio-wide ENA. Otherwise, simply use a unique ENA. ++ */ ++ if (spa_load_state(spa) != SPA_LOAD_NONE) { ++ if (spa->spa_ena == 0) ++ spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); ++ ena = spa->spa_ena; ++ } else if (zio != NULL && zio->io_logical != NULL) { ++ if (zio->io_logical->io_ena == 0) ++ zio->io_logical->io_ena = ++ fm_ena_generate(0, FM_ENA_FMT1); ++ ena = zio->io_logical->io_ena; ++ } else { ++ ena = fm_ena_generate(0, FM_ENA_FMT1); ++ } ++ ++ /* ++ * Construct the full class, detector, and other standard FMA fields. ++ */ ++ (void) snprintf(class, sizeof (class), "%s.%s", ++ ZFS_ERROR_CLASS, subclass); ++ ++ fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), ++ vd != NULL ? vd->vdev_guid : 0); ++ ++ fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); ++ ++ /* ++ * Construct the per-ereport payload, depending on which parameters are ++ * passed in. ++ */ ++ ++ /* ++ * Generic payload members common to all ereports. ++ */ ++ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, ++ DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, ++ DATA_TYPE_UINT64, spa_guid(spa), ++ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, ++ spa_load_state(spa), NULL); ++ ++ if (spa != NULL) { ++ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, ++ DATA_TYPE_STRING, ++ spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? ++ FM_EREPORT_FAILMODE_WAIT : ++ spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 
++ FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, ++ NULL); ++ } ++ ++ if (vd != NULL) { ++ vdev_t *pvd = vd->vdev_parent; ++ ++ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, ++ DATA_TYPE_UINT64, vd->vdev_guid, ++ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, ++ DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); ++ if (vd->vdev_path != NULL) ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, ++ DATA_TYPE_STRING, vd->vdev_path, NULL); ++ if (vd->vdev_devid != NULL) ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, ++ DATA_TYPE_STRING, vd->vdev_devid, NULL); ++ if (vd->vdev_fru != NULL) ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, ++ DATA_TYPE_STRING, vd->vdev_fru, NULL); ++ ++ if (pvd != NULL) { ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, ++ DATA_TYPE_UINT64, pvd->vdev_guid, ++ FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, ++ DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, ++ NULL); ++ if (pvd->vdev_path) ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, ++ DATA_TYPE_STRING, pvd->vdev_path, NULL); ++ if (pvd->vdev_devid) ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, ++ DATA_TYPE_STRING, pvd->vdev_devid, NULL); ++ } ++ } ++ ++ if (zio != NULL) { ++ /* ++ * Payload common to all I/Os. ++ */ ++ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, ++ DATA_TYPE_INT32, zio->io_error, NULL); ++ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, ++ DATA_TYPE_INT32, zio->io_flags, NULL); ++ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, ++ DATA_TYPE_UINT32, zio->io_stage, NULL); ++ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, ++ DATA_TYPE_UINT32, zio->io_pipeline, NULL); ++ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, ++ DATA_TYPE_UINT64, zio->io_delay, NULL); ++ ++ /* ++ * If the 'size' parameter is non-zero, it indicates this is a ++ * RAID-Z or other I/O where the physical offset and length are ++ * provided for us, instead of within the zio_t. ++ */ ++ if (vd != NULL) { ++ if (size) ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, ++ DATA_TYPE_UINT64, stateoroffset, ++ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, ++ DATA_TYPE_UINT64, size, NULL); ++ else ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, ++ DATA_TYPE_UINT64, zio->io_offset, ++ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, ++ DATA_TYPE_UINT64, zio->io_size, NULL); ++ } ++ ++ /* ++ * Payload for I/Os with corresponding logical information. ++ */ ++ if (zio->io_logical != NULL) ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, ++ DATA_TYPE_UINT64, ++ zio->io_logical->io_bookmark.zb_objset, ++ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, ++ DATA_TYPE_UINT64, ++ zio->io_logical->io_bookmark.zb_object, ++ FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, ++ DATA_TYPE_INT64, ++ zio->io_logical->io_bookmark.zb_level, ++ FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, ++ DATA_TYPE_UINT64, ++ zio->io_logical->io_bookmark.zb_blkid, NULL); ++ } else if (vd != NULL) { ++ /* ++ * If we have a vdev but no zio, this is a device fault, and the ++ * 'stateoroffset' parameter indicates the previous state of the ++ * vdev. 
++ */ ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, ++ DATA_TYPE_UINT64, stateoroffset, NULL); ++ } ++ ++ mutex_exit(&spa->spa_errlist_lock); ++ ++ *ereport_out = ereport; ++ *detector_out = detector; ++} ++ ++/* if it's <= 128 bytes, save the corruption directly */ ++#define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) ++ ++#define MAX_RANGES 16 ++ ++typedef struct zfs_ecksum_info { ++ /* histograms of set and cleared bits by bit number in a 64-bit word */ ++ uint16_t zei_histogram_set[sizeof (uint64_t) * NBBY]; ++ uint16_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; ++ ++ /* inline arrays of bits set and cleared. */ ++ uint64_t zei_bits_set[ZFM_MAX_INLINE]; ++ uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; ++ ++ /* ++ * for each range, the number of bits set and cleared. The Hamming ++ * distance between the good and bad buffers is the sum of them all. ++ */ ++ uint32_t zei_range_sets[MAX_RANGES]; ++ uint32_t zei_range_clears[MAX_RANGES]; ++ ++ struct zei_ranges { ++ uint32_t zr_start; ++ uint32_t zr_end; ++ } zei_ranges[MAX_RANGES]; ++ ++ size_t zei_range_count; ++ uint32_t zei_mingap; ++ uint32_t zei_allowed_mingap; ++ ++} zfs_ecksum_info_t; ++ ++static void ++update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count) ++{ ++ size_t i; ++ size_t bits = 0; ++ uint64_t value = BE_64(value_arg); ++ ++ /* We store the bits in big-endian (largest-first) order */ ++ for (i = 0; i < 64; i++) { ++ if (value & (1ull << i)) { ++ hist[63 - i]++; ++ ++bits; ++ } ++ } ++ /* update the count of bits changed */ ++ *count += bits; ++} ++ ++/* ++ * We've now filled up the range array, and need to increase "mingap" and ++ * shrink the range list accordingly. zei_mingap is always the smallest ++ * distance between array entries, so we set the new_allowed_gap to be ++ * one greater than that. We then go through the list, joining together ++ * any ranges which are closer than the new_allowed_gap. ++ * ++ * By construction, there will be at least one. We also update zei_mingap ++ * to the new smallest gap, to prepare for our next invocation. 
++ */ ++static void ++zei_shrink_ranges(zfs_ecksum_info_t *eip) ++{ ++ uint32_t mingap = UINT32_MAX; ++ uint32_t new_allowed_gap = eip->zei_mingap + 1; ++ ++ size_t idx, output; ++ size_t max = eip->zei_range_count; ++ ++ struct zei_ranges *r = eip->zei_ranges; ++ ++ ASSERT3U(eip->zei_range_count, >, 0); ++ ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); ++ ++ output = idx = 0; ++ while (idx < max - 1) { ++ uint32_t start = r[idx].zr_start; ++ uint32_t end = r[idx].zr_end; ++ ++ while (idx < max - 1) { ++ uint32_t nstart, nend, gap; ++ ++ idx++; ++ nstart = r[idx].zr_start; ++ nend = r[idx].zr_end; ++ ++ gap = nstart - end; ++ if (gap < new_allowed_gap) { ++ end = nend; ++ continue; ++ } ++ if (gap < mingap) ++ mingap = gap; ++ break; ++ } ++ r[output].zr_start = start; ++ r[output].zr_end = end; ++ output++; ++ } ++ ASSERT3U(output, <, eip->zei_range_count); ++ eip->zei_range_count = output; ++ eip->zei_mingap = mingap; ++ eip->zei_allowed_mingap = new_allowed_gap; ++} ++ ++static void ++zei_add_range(zfs_ecksum_info_t *eip, int start, int end) ++{ ++ struct zei_ranges *r = eip->zei_ranges; ++ size_t count = eip->zei_range_count; ++ ++ if (count >= MAX_RANGES) { ++ zei_shrink_ranges(eip); ++ count = eip->zei_range_count; ++ } ++ if (count == 0) { ++ eip->zei_mingap = UINT32_MAX; ++ eip->zei_allowed_mingap = 1; ++ } else { ++ int gap = start - r[count - 1].zr_end; ++ ++ if (gap < eip->zei_allowed_mingap) { ++ r[count - 1].zr_end = end; ++ return; ++ } ++ if (gap < eip->zei_mingap) ++ eip->zei_mingap = gap; ++ } ++ r[count].zr_start = start; ++ r[count].zr_end = end; ++ eip->zei_range_count++; ++} ++ ++static size_t ++zei_range_total_size(zfs_ecksum_info_t *eip) ++{ ++ struct zei_ranges *r = eip->zei_ranges; ++ size_t count = eip->zei_range_count; ++ size_t result = 0; ++ size_t idx; ++ ++ for (idx = 0; idx < count; idx++) ++ result += (r[idx].zr_end - r[idx].zr_start); ++ ++ return (result); ++} ++ ++static zfs_ecksum_info_t * ++annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, ++ const uint8_t *goodbuf, const uint8_t *badbuf, size_t size, ++ boolean_t drop_if_identical) ++{ ++ const uint64_t *good = (const uint64_t *)goodbuf; ++ const uint64_t *bad = (const uint64_t *)badbuf; ++ ++ uint64_t allset = 0; ++ uint64_t allcleared = 0; ++ ++ size_t nui64s = size / sizeof (uint64_t); ++ ++ size_t inline_size; ++ int no_inline = 0; ++ size_t idx; ++ size_t range; ++ ++ size_t offset = 0; ++ ssize_t start = -1; ++ ++ zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_PUSHPAGE); ++ ++ /* don't do any annotation for injected checksum errors */ ++ if (info != NULL && info->zbc_injected) ++ return (eip); ++ ++ if (info != NULL && info->zbc_has_cksum) { ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, ++ DATA_TYPE_UINT64_ARRAY, ++ sizeof (info->zbc_expected) / sizeof (uint64_t), ++ (uint64_t *)&info->zbc_expected, ++ FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, ++ DATA_TYPE_UINT64_ARRAY, ++ sizeof (info->zbc_actual) / sizeof (uint64_t), ++ (uint64_t *)&info->zbc_actual, ++ FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, ++ DATA_TYPE_STRING, ++ info->zbc_checksum_name, ++ NULL); ++ ++ if (info->zbc_byteswapped) { ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, ++ DATA_TYPE_BOOLEAN, 1, ++ NULL); ++ } ++ } ++ ++ if (badbuf == NULL || goodbuf == NULL) ++ return (eip); ++ ++ ASSERT3U(nui64s, <=, UINT16_MAX); ++ ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); ++ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ++ ASSERT3U(size, <=, UINT32_MAX); ++ ++ /* build up the range list by 
comparing the two buffers. */ ++ for (idx = 0; idx < nui64s; idx++) { ++ if (good[idx] == bad[idx]) { ++ if (start == -1) ++ continue; ++ ++ zei_add_range(eip, start, idx); ++ start = -1; ++ } else { ++ if (start != -1) ++ continue; ++ ++ start = idx; ++ } ++ } ++ if (start != -1) ++ zei_add_range(eip, start, idx); ++ ++ /* See if it will fit in our inline buffers */ ++ inline_size = zei_range_total_size(eip); ++ if (inline_size > ZFM_MAX_INLINE) ++ no_inline = 1; ++ ++ /* ++ * If there is no change and we want to drop if the buffers are ++ * identical, do so. ++ */ ++ if (inline_size == 0 && drop_if_identical) { ++ kmem_free(eip, sizeof (*eip)); ++ return (NULL); ++ } ++ ++ /* ++ * Now walk through the ranges, filling in the details of the ++ * differences. Also convert our uint64_t-array offsets to byte ++ * offsets. ++ */ ++ for (range = 0; range < eip->zei_range_count; range++) { ++ size_t start = eip->zei_ranges[range].zr_start; ++ size_t end = eip->zei_ranges[range].zr_end; ++ ++ for (idx = start; idx < end; idx++) { ++ uint64_t set, cleared; ++ ++ // bits set in bad, but not in good ++ set = ((~good[idx]) & bad[idx]); ++ // bits set in good, but not in bad ++ cleared = (good[idx] & (~bad[idx])); ++ ++ allset |= set; ++ allcleared |= cleared; ++ ++ if (!no_inline) { ++ ASSERT3U(offset, <, inline_size); ++ eip->zei_bits_set[offset] = set; ++ eip->zei_bits_cleared[offset] = cleared; ++ offset++; ++ } ++ ++ update_histogram(set, eip->zei_histogram_set, ++ &eip->zei_range_sets[range]); ++ update_histogram(cleared, eip->zei_histogram_cleared, ++ &eip->zei_range_clears[range]); ++ } ++ ++ /* convert to byte offsets */ ++ eip->zei_ranges[range].zr_start *= sizeof (uint64_t); ++ eip->zei_ranges[range].zr_end *= sizeof (uint64_t); ++ } ++ eip->zei_allowed_mingap *= sizeof (uint64_t); ++ inline_size *= sizeof (uint64_t); ++ ++ /* fill in ereport */ ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, ++ DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, ++ (uint32_t *)eip->zei_ranges, ++ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, ++ DATA_TYPE_UINT32, eip->zei_allowed_mingap, ++ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, ++ DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, ++ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, ++ DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, ++ NULL); ++ ++ if (!no_inline) { ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, ++ DATA_TYPE_UINT8_ARRAY, ++ inline_size, (uint8_t *)eip->zei_bits_set, ++ FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, ++ DATA_TYPE_UINT8_ARRAY, ++ inline_size, (uint8_t *)eip->zei_bits_cleared, ++ NULL); ++ } else { ++ fm_payload_set(ereport, ++ FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, ++ DATA_TYPE_UINT16_ARRAY, ++ NBBY * sizeof (uint64_t), eip->zei_histogram_set, ++ FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, ++ DATA_TYPE_UINT16_ARRAY, ++ NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, ++ NULL); ++ } ++ return (eip); ++} ++#endif ++ ++void ++zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, ++ uint64_t stateoroffset, uint64_t size) ++{ ++#ifdef _KERNEL ++ nvlist_t *ereport = NULL; ++ nvlist_t *detector = NULL; ++ ++ zfs_ereport_start(&ereport, &detector, ++ subclass, spa, vd, zio, stateoroffset, size); ++ ++ if (ereport == NULL) ++ return; ++ ++ /* Cleanup is handled by the callback function */ ++ zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); ++#endif ++} ++ ++void ++zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, ++ struct zio *zio, 
uint64_t offset, uint64_t length, void *arg, ++ zio_bad_cksum_t *info) ++{ ++ zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_PUSHPAGE); ++ ++ if (zio->io_vsd != NULL) ++ zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); ++ else ++ zio_vsd_default_cksum_report(zio, report, arg); ++ ++ /* copy the checksum failure information if it was provided */ ++ if (info != NULL) { ++ report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_PUSHPAGE); ++ bcopy(info, report->zcr_ckinfo, sizeof (*info)); ++ } ++ ++ report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; ++ report->zcr_length = length; ++ ++#ifdef _KERNEL ++ zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, ++ FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); ++ ++ if (report->zcr_ereport == NULL) { ++ report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo); ++ if (report->zcr_ckinfo != NULL) { ++ kmem_free(report->zcr_ckinfo, ++ sizeof (*report->zcr_ckinfo)); ++ } ++ kmem_free(report, sizeof (*report)); ++ return; ++ } ++#endif ++ ++ mutex_enter(&spa->spa_errlist_lock); ++ report->zcr_next = zio->io_logical->io_cksum_report; ++ zio->io_logical->io_cksum_report = report; ++ mutex_exit(&spa->spa_errlist_lock); ++} ++ ++void ++zfs_ereport_finish_checksum(zio_cksum_report_t *report, ++ const void *good_data, const void *bad_data, boolean_t drop_if_identical) ++{ ++#ifdef _KERNEL ++ zfs_ecksum_info_t *info = NULL; ++ info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, ++ good_data, bad_data, report->zcr_length, drop_if_identical); ++ ++ if (info != NULL) ++ zfs_zevent_post(report->zcr_ereport, ++ report->zcr_detector, zfs_zevent_post_cb); ++ ++ report->zcr_ereport = report->zcr_detector = NULL; ++ if (info != NULL) ++ kmem_free(info, sizeof (*info)); ++#endif ++} ++ ++void ++zfs_ereport_free_checksum(zio_cksum_report_t *rpt) ++{ ++#ifdef _KERNEL ++ if (rpt->zcr_ereport != NULL) { ++ fm_nvlist_destroy(rpt->zcr_ereport, ++ FM_NVA_FREE); ++ fm_nvlist_destroy(rpt->zcr_detector, ++ FM_NVA_FREE); ++ } ++#endif ++ rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); ++ ++ if (rpt->zcr_ckinfo != NULL) ++ kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); ++ ++ kmem_free(rpt, sizeof (*rpt)); ++} ++ ++void ++zfs_ereport_send_interim_checksum(zio_cksum_report_t *report) ++{ ++#ifdef _KERNEL ++ zfs_zevent_post(report->zcr_ereport, report->zcr_detector, NULL); ++#endif ++} ++ ++void ++zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, ++ struct zio *zio, uint64_t offset, uint64_t length, ++ const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc) ++{ ++#ifdef _KERNEL ++ nvlist_t *ereport = NULL; ++ nvlist_t *detector = NULL; ++ zfs_ecksum_info_t *info; ++ ++ zfs_ereport_start(&ereport, &detector, ++ FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); ++ ++ if (ereport == NULL) ++ return; ++ ++ info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, ++ B_FALSE); ++ ++ if (info != NULL) { ++ zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); ++ kmem_free(info, sizeof (*info)); ++ } ++#endif ++} ++ ++static void ++zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) ++{ ++#ifdef _KERNEL ++ nvlist_t *resource; ++ char class[64]; ++ ++ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) ++ return; ++ ++ if ((resource = fm_nvlist_create(NULL)) == NULL) ++ return; ++ ++ (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, ++ ZFS_ERROR_CLASS, name); ++ VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); ++ VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 
0); ++ VERIFY(nvlist_add_uint64(resource, ++ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); ++ if (vd) { ++ VERIFY(nvlist_add_uint64(resource, ++ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); ++ VERIFY(nvlist_add_uint64(resource, ++ FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state) == 0); ++ } ++ ++ zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); ++#endif ++} ++ ++/* ++ * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev ++ * has been removed from the system. This will cause the DE to ignore any ++ * recent I/O errors, inferring that they are due to the asynchronous device ++ * removal. ++ */ ++void ++zfs_post_remove(spa_t *spa, vdev_t *vd) ++{ ++ zfs_post_common(spa, vd, FM_EREPORT_RESOURCE_REMOVED); ++} ++ ++/* ++ * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool ++ * has the 'autoreplace' property set, and therefore any broken vdevs will be ++ * handled by higher level logic, and no vdev fault should be generated. ++ */ ++void ++zfs_post_autoreplace(spa_t *spa, vdev_t *vd) ++{ ++ zfs_post_common(spa, vd, FM_EREPORT_RESOURCE_AUTOREPLACE); ++} ++ ++/* ++ * The 'resource.fs.zfs.statechange' event is an internal signal that the ++ * given vdev has transitioned its state to DEGRADED or HEALTHY. This will ++ * cause the retire agent to repair any outstanding fault management cases ++ * open because the device was not found (fault.fs.zfs.device). ++ */ ++void ++zfs_post_state_change(spa_t *spa, vdev_t *vd) ++{ ++ zfs_post_common(spa, vd, FM_EREPORT_RESOURCE_STATECHANGE); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(zfs_ereport_post); ++EXPORT_SYMBOL(zfs_ereport_post_checksum); ++EXPORT_SYMBOL(zfs_post_remove); ++EXPORT_SYMBOL(zfs_post_autoreplace); ++EXPORT_SYMBOL(zfs_post_state_change); ++#endif /* _KERNEL */ +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_fuid.c linux-3.2.33-go/fs/zfs/zfs/zfs_fuid.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_fuid.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_fuid.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,775 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef _KERNEL ++#include ++#include ++#include ++#include ++#endif ++#include ++ ++/* ++ * FUID Domain table(s). ++ * ++ * The FUID table is stored as a packed nvlist of an array ++ * of nvlists which contain an index, domain string and offset ++ * ++ * During file system initialization the nvlist(s) are read and ++ * two AVL trees are created. 
One tree is keyed by the index number ++ * and the other by the domain string. Nodes are never removed from ++ * trees, but new entries may be added. If a new entry is added then ++ * the zsb->z_fuid_dirty flag is set to true and the caller will then ++ * be responsible for calling zfs_fuid_sync() to sync the changes to disk. ++ * ++ */ ++ ++#define FUID_IDX "fuid_idx" ++#define FUID_DOMAIN "fuid_domain" ++#define FUID_OFFSET "fuid_offset" ++#define FUID_NVP_ARRAY "fuid_nvlist" ++ ++typedef struct fuid_domain { ++ avl_node_t f_domnode; ++ avl_node_t f_idxnode; ++ ksiddomain_t *f_ksid; ++ uint64_t f_idx; ++} fuid_domain_t; ++ ++static char *nulldomain = ""; ++ ++/* ++ * Compare two indexes. ++ */ ++static int ++idx_compare(const void *arg1, const void *arg2) ++{ ++ const fuid_domain_t *node1 = arg1; ++ const fuid_domain_t *node2 = arg2; ++ ++ if (node1->f_idx < node2->f_idx) ++ return (-1); ++ else if (node1->f_idx > node2->f_idx) ++ return (1); ++ return (0); ++} ++ ++/* ++ * Compare two domain strings. ++ */ ++static int ++domain_compare(const void *arg1, const void *arg2) ++{ ++ const fuid_domain_t *node1 = arg1; ++ const fuid_domain_t *node2 = arg2; ++ int val; ++ ++ val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); ++ if (val == 0) ++ return (0); ++ return (val > 0 ? 1 : -1); ++} ++ ++void ++zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree) ++{ ++ avl_create(idx_tree, idx_compare, ++ sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); ++ avl_create(domain_tree, domain_compare, ++ sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); ++} ++ ++/* ++ * load initial fuid domain and idx trees. This function is used by ++ * both the kernel and zdb. ++ */ ++uint64_t ++zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, ++ avl_tree_t *domain_tree) ++{ ++ dmu_buf_t *db; ++ uint64_t fuid_size; ++ ++ ASSERT(fuid_obj != 0); ++ VERIFY(0 == dmu_bonus_hold(os, fuid_obj, ++ FTAG, &db)); ++ fuid_size = *(uint64_t *)db->db_data; ++ dmu_buf_rele(db, FTAG); ++ ++ if (fuid_size) { ++ nvlist_t **fuidnvp; ++ nvlist_t *nvp = NULL; ++ uint_t count; ++ char *packed; ++ int i; ++ ++ packed = kmem_alloc(fuid_size, KM_SLEEP); ++ VERIFY(dmu_read(os, fuid_obj, 0, ++ fuid_size, packed, DMU_READ_PREFETCH) == 0); ++ VERIFY(nvlist_unpack(packed, fuid_size, ++ &nvp, 0) == 0); ++ VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, ++ &fuidnvp, &count) == 0); ++ ++ for (i = 0; i != count; i++) { ++ fuid_domain_t *domnode; ++ char *domain; ++ uint64_t idx; ++ ++ VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, ++ &domain) == 0); ++ VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX, ++ &idx) == 0); ++ ++ domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); ++ ++ domnode->f_idx = idx; ++ domnode->f_ksid = ksid_lookupdomain(domain); ++ avl_add(idx_tree, domnode); ++ avl_add(domain_tree, domnode); ++ } ++ nvlist_free(nvp); ++ kmem_free(packed, fuid_size); ++ } ++ return (fuid_size); ++} ++ ++void ++zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree) ++{ ++ fuid_domain_t *domnode; ++ void *cookie; ++ ++ cookie = NULL; ++ while ((domnode = avl_destroy_nodes(domain_tree, &cookie))) ++ ksiddomain_rele(domnode->f_ksid); ++ ++ avl_destroy(domain_tree); ++ cookie = NULL; ++ while ((domnode = avl_destroy_nodes(idx_tree, &cookie))) ++ kmem_free(domnode, sizeof (fuid_domain_t)); ++ avl_destroy(idx_tree); ++} ++ ++char * ++zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) ++{ ++ fuid_domain_t searchnode, *findnode; ++ avl_index_t loc; 
++ ++ searchnode.f_idx = idx; ++ ++ findnode = avl_find(idx_tree, &searchnode, &loc); ++ ++ return (findnode ? findnode->f_ksid->kd_name : nulldomain); ++} ++ ++#ifdef _KERNEL ++/* ++ * Load the fuid table(s) into memory. ++ */ ++static void ++zfs_fuid_init(zfs_sb_t *zsb) ++{ ++ rw_enter(&zsb->z_fuid_lock, RW_WRITER); ++ ++ if (zsb->z_fuid_loaded) { ++ rw_exit(&zsb->z_fuid_lock); ++ return; ++ } ++ ++ zfs_fuid_avl_tree_create(&zsb->z_fuid_idx, &zsb->z_fuid_domain); ++ ++ (void) zap_lookup(zsb->z_os, MASTER_NODE_OBJ, ++ ZFS_FUID_TABLES, 8, 1, &zsb->z_fuid_obj); ++ if (zsb->z_fuid_obj != 0) { ++ zsb->z_fuid_size = zfs_fuid_table_load(zsb->z_os, ++ zsb->z_fuid_obj, &zsb->z_fuid_idx, ++ &zsb->z_fuid_domain); ++ } ++ ++ zsb->z_fuid_loaded = B_TRUE; ++ rw_exit(&zsb->z_fuid_lock); ++} ++ ++/* ++ * sync out AVL trees to persistent storage. ++ */ ++void ++zfs_fuid_sync(zfs_sb_t *zsb, dmu_tx_t *tx) ++{ ++ nvlist_t *nvp; ++ nvlist_t **fuids; ++ size_t nvsize = 0; ++ char *packed; ++ dmu_buf_t *db; ++ fuid_domain_t *domnode; ++ int numnodes; ++ int i; ++ ++ if (!zsb->z_fuid_dirty) { ++ return; ++ } ++ ++ rw_enter(&zsb->z_fuid_lock, RW_WRITER); ++ ++ /* ++ * First see if table needs to be created? ++ */ ++ if (zsb->z_fuid_obj == 0) { ++ zsb->z_fuid_obj = dmu_object_alloc(zsb->z_os, ++ DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, ++ sizeof (uint64_t), tx); ++ VERIFY(zap_add(zsb->z_os, MASTER_NODE_OBJ, ++ ZFS_FUID_TABLES, sizeof (uint64_t), 1, ++ &zsb->z_fuid_obj, tx) == 0); ++ } ++ ++ VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ ++ numnodes = avl_numnodes(&zsb->z_fuid_idx); ++ fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP); ++ for (i = 0, domnode = avl_first(&zsb->z_fuid_domain); domnode; i++, ++ domnode = AVL_NEXT(&zsb->z_fuid_domain, domnode)) { ++ VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, ++ domnode->f_idx) == 0); ++ VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0); ++ VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, ++ domnode->f_ksid->kd_name) == 0); ++ } ++ VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, ++ fuids, numnodes) == 0); ++ for (i = 0; i != numnodes; i++) ++ nvlist_free(fuids[i]); ++ kmem_free(fuids, numnodes * sizeof (void *)); ++ VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); ++ packed = kmem_alloc(nvsize, KM_SLEEP); ++ VERIFY(nvlist_pack(nvp, &packed, &nvsize, ++ NV_ENCODE_XDR, KM_SLEEP) == 0); ++ nvlist_free(nvp); ++ zsb->z_fuid_size = nvsize; ++ dmu_write(zsb->z_os, zsb->z_fuid_obj, 0, zsb->z_fuid_size, packed, tx); ++ kmem_free(packed, zsb->z_fuid_size); ++ VERIFY(0 == dmu_bonus_hold(zsb->z_os, zsb->z_fuid_obj, ++ FTAG, &db)); ++ dmu_buf_will_dirty(db, tx); ++ *(uint64_t *)db->db_data = zsb->z_fuid_size; ++ dmu_buf_rele(db, FTAG); ++ ++ zsb->z_fuid_dirty = B_FALSE; ++ rw_exit(&zsb->z_fuid_lock); ++} ++ ++/* ++ * Query domain table for a given domain. ++ * ++ * If domain isn't found and addok is set, it is added to AVL trees and ++ * the zsb->z_fuid_dirty flag will be set to TRUE. It will then be ++ * necessary for the caller or another thread to detect the dirty table ++ * and sync out the changes. ++ */ ++int ++zfs_fuid_find_by_domain(zfs_sb_t *zsb, const char *domain, ++ char **retdomain, boolean_t addok) ++{ ++ fuid_domain_t searchnode, *findnode; ++ avl_index_t loc; ++ krw_t rw = RW_READER; ++ ++ /* ++ * If the dummy "nobody" domain then return an index of 0 ++ * to cause the created FUID to be a standard POSIX id ++ * for the user nobody. 
++ */ ++ if (domain[0] == '\0') { ++ if (retdomain) ++ *retdomain = nulldomain; ++ return (0); ++ } ++ ++ searchnode.f_ksid = ksid_lookupdomain(domain); ++ if (retdomain) ++ *retdomain = searchnode.f_ksid->kd_name; ++ if (!zsb->z_fuid_loaded) ++ zfs_fuid_init(zsb); ++ ++retry: ++ rw_enter(&zsb->z_fuid_lock, rw); ++ findnode = avl_find(&zsb->z_fuid_domain, &searchnode, &loc); ++ ++ if (findnode) { ++ rw_exit(&zsb->z_fuid_lock); ++ ksiddomain_rele(searchnode.f_ksid); ++ return (findnode->f_idx); ++ } else if (addok) { ++ fuid_domain_t *domnode; ++ uint64_t retidx; ++ ++ if (rw == RW_READER && !rw_tryupgrade(&zsb->z_fuid_lock)) { ++ rw_exit(&zsb->z_fuid_lock); ++ rw = RW_WRITER; ++ goto retry; ++ } ++ ++ domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); ++ domnode->f_ksid = searchnode.f_ksid; ++ ++ retidx = domnode->f_idx = avl_numnodes(&zsb->z_fuid_idx) + 1; ++ ++ avl_add(&zsb->z_fuid_domain, domnode); ++ avl_add(&zsb->z_fuid_idx, domnode); ++ zsb->z_fuid_dirty = B_TRUE; ++ rw_exit(&zsb->z_fuid_lock); ++ return (retidx); ++ } else { ++ rw_exit(&zsb->z_fuid_lock); ++ return (-1); ++ } ++} ++ ++/* ++ * Query domain table by index, returning domain string ++ * ++ * Returns a pointer from an avl node of the domain string. ++ * ++ */ ++const char * ++zfs_fuid_find_by_idx(zfs_sb_t *zsb, uint32_t idx) ++{ ++ char *domain; ++ ++ if (idx == 0 || !zsb->z_use_fuids) ++ return (NULL); ++ ++ if (!zsb->z_fuid_loaded) ++ zfs_fuid_init(zsb); ++ ++ rw_enter(&zsb->z_fuid_lock, RW_READER); ++ ++ if (zsb->z_fuid_obj || zsb->z_fuid_dirty) ++ domain = zfs_fuid_idx_domain(&zsb->z_fuid_idx, idx); ++ else ++ domain = nulldomain; ++ rw_exit(&zsb->z_fuid_lock); ++ ++ ASSERT(domain); ++ return (domain); ++} ++ ++void ++zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) ++{ ++ *uidp = zfs_fuid_map_id(ZTOZSB(zp), zp->z_uid, cr, ZFS_OWNER); ++ *gidp = zfs_fuid_map_id(ZTOZSB(zp), zp->z_gid, cr, ZFS_GROUP); ++} ++ ++uid_t ++zfs_fuid_map_id(zfs_sb_t *zsb, uint64_t fuid, ++ cred_t *cr, zfs_fuid_type_t type) ++{ ++#ifdef HAVE_KSID ++ uint32_t index = FUID_INDEX(fuid); ++ const char *domain; ++ uid_t id; ++ ++ if (index == 0) ++ return (fuid); ++ ++ domain = zfs_fuid_find_by_idx(zsb, index); ++ ASSERT(domain != NULL); ++ ++ if (type == ZFS_OWNER || type == ZFS_ACE_USER) { ++ (void) kidmap_getuidbysid(crgetzone(cr), domain, ++ FUID_RID(fuid), &id); ++ } else { ++ (void) kidmap_getgidbysid(crgetzone(cr), domain, ++ FUID_RID(fuid), &id); ++ } ++ return (id); ++#else ++ /* ++ * The Linux port only supports POSIX IDs, use the passed id. ++ */ ++ return (fuid); ++#endif /* HAVE_KSID */ ++} ++ ++/* ++ * Add a FUID node to the list of fuid's being created for this ++ * ACL ++ * ++ * If ACL has multiple domains, then keep only one copy of each unique ++ * domain. ++ */ ++void ++zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, ++ uint64_t idx, uint64_t id, zfs_fuid_type_t type) ++{ ++ zfs_fuid_t *fuid; ++ zfs_fuid_domain_t *fuid_domain; ++ zfs_fuid_info_t *fuidp; ++ uint64_t fuididx; ++ boolean_t found = B_FALSE; ++ ++ if (*fuidpp == NULL) ++ *fuidpp = zfs_fuid_info_alloc(); ++ ++ fuidp = *fuidpp; ++ /* ++ * First find fuid domain index in linked list ++ * ++ * If one isn't found then create an entry. 
++ */ ++ ++ for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains); ++ fuid_domain; fuid_domain = list_next(&fuidp->z_domains, ++ fuid_domain), fuididx++) { ++ if (idx == fuid_domain->z_domidx) { ++ found = B_TRUE; ++ break; ++ } ++ } ++ ++ if (!found) { ++ fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP); ++ fuid_domain->z_domain = domain; ++ fuid_domain->z_domidx = idx; ++ list_insert_tail(&fuidp->z_domains, fuid_domain); ++ fuidp->z_domain_str_sz += strlen(domain) + 1; ++ fuidp->z_domain_cnt++; ++ } ++ ++ if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { ++ ++ /* ++ * Now allocate fuid entry and add it on the end of the list ++ */ ++ ++ fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); ++ fuid->z_id = id; ++ fuid->z_domidx = idx; ++ fuid->z_logfuid = FUID_ENCODE(fuididx, rid); ++ ++ list_insert_tail(&fuidp->z_fuids, fuid); ++ fuidp->z_fuid_cnt++; ++ } else { ++ if (type == ZFS_OWNER) ++ fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid); ++ else ++ fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid); ++ } ++} ++ ++#ifdef HAVE_KSID ++/* ++ * Create a file system FUID, based on information in the users cred ++ * ++ * If cred contains KSID_OWNER then it should be used to determine ++ * the uid otherwise cred's uid will be used. By default cred's gid ++ * is used unless it's an ephemeral ID in which case KSID_GROUP will ++ * be used if it exists. ++ */ ++uint64_t ++zfs_fuid_create_cred(zfs_sb_t *zsb, zfs_fuid_type_t type, ++ cred_t *cr, zfs_fuid_info_t **fuidp) ++{ ++ uint64_t idx; ++ ksid_t *ksid; ++ uint32_t rid; ++ char *kdomain; ++ const char *domain; ++ uid_t id; ++ ++ VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); ++ ++ ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP); ++ ++ if (!zsb->z_use_fuids || (ksid == NULL)) { ++ id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr); ++ ++ if (IS_EPHEMERAL(id)) ++ return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY); ++ ++ return ((uint64_t)id); ++ } ++ ++ /* ++ * ksid is present and FUID is supported ++ */ ++ id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr); ++ ++ if (!IS_EPHEMERAL(id)) ++ return ((uint64_t)id); ++ ++ if (type == ZFS_GROUP) ++ id = ksid_getid(ksid); ++ ++ rid = ksid_getrid(ksid); ++ domain = ksid_getdomain(ksid); ++ ++ idx = zfs_fuid_find_by_domain(zsb, domain, &kdomain, B_TRUE); ++ ++ zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); ++ ++ return (FUID_ENCODE(idx, rid)); ++} ++#endif /* HAVE_KSID */ ++ ++/* ++ * Create a file system FUID for an ACL ace ++ * or a chown/chgrp of the file. ++ * This is similar to zfs_fuid_create_cred, except that ++ * we can't find the domain + rid information in the ++ * cred. Instead we have to query Winchester for the ++ * domain and rid. ++ * ++ * During replay operations the domain+rid information is ++ * found in the zfs_fuid_info_t that the replay code has ++ * attached to the zsb of the file system. ++ */ ++uint64_t ++zfs_fuid_create(zfs_sb_t *zsb, uint64_t id, cred_t *cr, ++ zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp) ++{ ++#ifdef HAVE_KSID ++ const char *domain; ++ char *kdomain; ++ uint32_t fuid_idx = FUID_INDEX(id); ++ uint32_t rid; ++ idmap_stat status; ++ uint64_t idx; ++ zfs_fuid_t *zfuid = NULL; ++ zfs_fuid_info_t *fuidp; ++ ++ /* ++ * If POSIX ID, or entry is already a FUID then ++ * just return the id ++ * ++ * We may also be handed an already FUID'ized id via ++ * chmod. 
++ */ ++ ++ if (!zsb->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) ++ return (id); ++ ++ if (zsb->z_replay) { ++ fuidp = zsb->z_fuid_replay; ++ ++ /* ++ * If we are passed an ephemeral id, but no ++ * fuid_info was logged then return NOBODY. ++ * This is most likely a result of idmap service ++ * not being available. ++ */ ++ if (fuidp == NULL) ++ return (UID_NOBODY); ++ ++ switch (type) { ++ case ZFS_ACE_USER: ++ case ZFS_ACE_GROUP: ++ zfuid = list_head(&fuidp->z_fuids); ++ rid = FUID_RID(zfuid->z_logfuid); ++ idx = FUID_INDEX(zfuid->z_logfuid); ++ break; ++ case ZFS_OWNER: ++ rid = FUID_RID(fuidp->z_fuid_owner); ++ idx = FUID_INDEX(fuidp->z_fuid_owner); ++ break; ++ case ZFS_GROUP: ++ rid = FUID_RID(fuidp->z_fuid_group); ++ idx = FUID_INDEX(fuidp->z_fuid_group); ++ break; ++ }; ++ domain = fuidp->z_domain_table[idx -1]; ++ } else { ++ if (type == ZFS_OWNER || type == ZFS_ACE_USER) ++ status = kidmap_getsidbyuid(crgetzone(cr), id, ++ &domain, &rid); ++ else ++ status = kidmap_getsidbygid(crgetzone(cr), id, ++ &domain, &rid); ++ ++ if (status != 0) { ++ /* ++ * When returning nobody we will need to ++ * make a dummy fuid table entry for logging ++ * purposes. ++ */ ++ rid = UID_NOBODY; ++ domain = nulldomain; ++ } ++ } ++ ++ idx = zfs_fuid_find_by_domain(zsb, domain, &kdomain, B_TRUE); ++ ++ if (!zsb->z_replay) ++ zfs_fuid_node_add(fuidpp, kdomain, ++ rid, idx, id, type); ++ else if (zfuid != NULL) { ++ list_remove(&fuidp->z_fuids, zfuid); ++ kmem_free(zfuid, sizeof (zfs_fuid_t)); ++ } ++ return (FUID_ENCODE(idx, rid)); ++#else ++ /* ++ * The Linux port only supports POSIX IDs, use the passed id. ++ */ ++ return (id); ++#endif ++} ++ ++void ++zfs_fuid_destroy(zfs_sb_t *zsb) ++{ ++ rw_enter(&zsb->z_fuid_lock, RW_WRITER); ++ if (!zsb->z_fuid_loaded) { ++ rw_exit(&zsb->z_fuid_lock); ++ return; ++ } ++ zfs_fuid_table_destroy(&zsb->z_fuid_idx, &zsb->z_fuid_domain); ++ rw_exit(&zsb->z_fuid_lock); ++} ++ ++/* ++ * Allocate zfs_fuid_info for tracking FUIDs created during ++ * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR() ++ */ ++zfs_fuid_info_t * ++zfs_fuid_info_alloc(void) ++{ ++ zfs_fuid_info_t *fuidp; ++ ++ fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP); ++ list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t), ++ offsetof(zfs_fuid_domain_t, z_next)); ++ list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t), ++ offsetof(zfs_fuid_t, z_next)); ++ return (fuidp); ++} ++ ++/* ++ * Release all memory associated with zfs_fuid_info_t ++ */ ++void ++zfs_fuid_info_free(zfs_fuid_info_t *fuidp) ++{ ++ zfs_fuid_t *zfuid; ++ zfs_fuid_domain_t *zdomain; ++ ++ while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { ++ list_remove(&fuidp->z_fuids, zfuid); ++ kmem_free(zfuid, sizeof (zfs_fuid_t)); ++ } ++ ++ if (fuidp->z_domain_table != NULL) ++ kmem_free(fuidp->z_domain_table, ++ (sizeof (char **)) * fuidp->z_domain_cnt); ++ ++ while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { ++ list_remove(&fuidp->z_domains, zdomain); ++ kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); ++ } ++ ++ kmem_free(fuidp, sizeof (zfs_fuid_info_t)); ++} ++ ++/* ++ * Check to see if id is a groupmember. If cred ++ * has ksid info then sidlist is checked first ++ * and if still not found then POSIX groups are checked ++ * ++ * Will use a straight FUID compare when possible. 
++ */ ++boolean_t ++zfs_groupmember(zfs_sb_t *zsb, uint64_t id, cred_t *cr) ++{ ++#ifdef HAVE_KSID ++ ksid_t *ksid = crgetsid(cr, KSID_GROUP); ++ ksidlist_t *ksidlist = crgetsidlist(cr); ++ uid_t gid; ++ ++ if (ksid && ksidlist) { ++ int i; ++ ksid_t *ksid_groups; ++ uint32_t idx = FUID_INDEX(id); ++ uint32_t rid = FUID_RID(id); ++ ++ ksid_groups = ksidlist->ksl_sids; ++ ++ for (i = 0; i != ksidlist->ksl_nsid; i++) { ++ if (idx == 0) { ++ if (id != IDMAP_WK_CREATOR_GROUP_GID && ++ id == ksid_groups[i].ks_id) { ++ return (B_TRUE); ++ } ++ } else { ++ const char *domain; ++ ++ domain = zfs_fuid_find_by_idx(zsb, idx); ++ ASSERT(domain != NULL); ++ ++ if (strcmp(domain, ++ IDMAP_WK_CREATOR_SID_AUTHORITY) == 0) ++ return (B_FALSE); ++ ++ if ((strcmp(domain, ++ ksid_groups[i].ks_domain->kd_name) == 0) && ++ rid == ksid_groups[i].ks_rid) ++ return (B_TRUE); ++ } ++ } ++ } ++ ++ /* ++ * Not found in ksidlist, check posix groups ++ */ ++ gid = zfs_fuid_map_id(zsb, id, cr, ZFS_GROUP); ++ return (groupmember(gid, cr)); ++#else ++ return (B_TRUE); ++#endif ++} ++ ++void ++zfs_fuid_txhold(zfs_sb_t *zsb, dmu_tx_t *tx) ++{ ++ if (zsb->z_fuid_obj == 0) { ++ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); ++ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ++ FUID_SIZE_ESTIMATE(zsb)); ++ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); ++ } else { ++ dmu_tx_hold_bonus(tx, zsb->z_fuid_obj); ++ dmu_tx_hold_write(tx, zsb->z_fuid_obj, 0, ++ FUID_SIZE_ESTIMATE(zsb)); ++ } ++} ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_ioctl.c linux-3.2.33-go/fs/zfs/zfs/zfs_ioctl.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_ioctl.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_ioctl.c 2012-11-16 23:25:34.347039358 +0100 +@@ -0,0 +1,5259 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Portions Copyright 2011 Martin Matuska ++ * Portions Copyright 2012 Pawel Jakub Dawidek ++ * Copyright (c) 2012, Joyent, Inc. All rights reserved. ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ * Copyright (c) 2012 by Delphix. All rights reserved. ++ * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "zfs_namecheck.h" ++#include "zfs_prop.h" ++#include "zfs_deleg.h" ++#include "zfs_comutil.h" ++ ++kmutex_t zfsdev_state_lock; ++list_t zfsdev_state_list; ++ ++extern void zfs_init(void); ++extern void zfs_fini(void); ++ ++typedef int zfs_ioc_func_t(zfs_cmd_t *); ++typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); ++ ++typedef enum { ++ NO_NAME, ++ POOL_NAME, ++ DATASET_NAME ++} zfs_ioc_namecheck_t; ++ ++typedef enum { ++ POOL_CHECK_NONE = 1 << 0, ++ POOL_CHECK_SUSPENDED = 1 << 1, ++ POOL_CHECK_READONLY = 1 << 2 ++} zfs_ioc_poolcheck_t; ++ ++typedef struct zfs_ioc_vec { ++ zfs_ioc_func_t *zvec_func; ++ zfs_secpolicy_func_t *zvec_secpolicy; ++ zfs_ioc_namecheck_t zvec_namecheck; ++ boolean_t zvec_his_log; ++ zfs_ioc_poolcheck_t zvec_pool_check; ++} zfs_ioc_vec_t; ++ ++/* This array is indexed by zfs_userquota_prop_t */ ++static const char *userquota_perms[] = { ++ ZFS_DELEG_PERM_USERUSED, ++ ZFS_DELEG_PERM_USERQUOTA, ++ ZFS_DELEG_PERM_GROUPUSED, ++ ZFS_DELEG_PERM_GROUPQUOTA, ++}; ++ ++static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); ++static int zfs_check_settable(const char *name, nvpair_t *property, ++ cred_t *cr); ++static int zfs_check_clearable(char *dataset, nvlist_t *props, ++ nvlist_t **errors); ++static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, ++ boolean_t *); ++int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t **); ++ ++static void ++history_str_free(char *buf) ++{ ++ kmem_free(buf, HIS_MAX_RECORD_LEN); ++} ++ ++static char * ++history_str_get(zfs_cmd_t *zc) ++{ ++ char *buf; ++ ++ if (zc->zc_history == 0) ++ return (NULL); ++ ++ buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP | KM_NODEBUG); ++ if (copyinstr((void *)(uintptr_t)zc->zc_history, ++ buf, HIS_MAX_RECORD_LEN, NULL) != 0) { ++ history_str_free(buf); ++ return (NULL); ++ } ++ ++ buf[HIS_MAX_RECORD_LEN -1] = '\0'; ++ ++ return (buf); ++} ++ ++/* ++ * Check to see if the named dataset is currently defined as bootable ++ */ ++static boolean_t ++zfs_is_bootfs(const char *name) ++{ ++ objset_t *os; ++ ++ if (dmu_objset_hold(name, FTAG, &os) == 0) { ++ boolean_t ret; ++ ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os))); ++ dmu_objset_rele(os, FTAG); ++ return (ret); ++ } ++ return (B_FALSE); ++} ++ ++/* ++ * zfs_earlier_version ++ * ++ * Return non-zero if the spa version is less than requested version. ++ */ ++static int ++zfs_earlier_version(const char *name, int version) ++{ ++ spa_t *spa; ++ ++ if (spa_open(name, &spa, FTAG) == 0) { ++ if (spa_version(spa) < version) { ++ spa_close(spa, FTAG); ++ return (1); ++ } ++ spa_close(spa, FTAG); ++ } ++ return (0); ++} ++ ++/* ++ * zpl_earlier_version ++ * ++ * Return TRUE if the ZPL version is less than requested version. 
++ */ ++static boolean_t ++zpl_earlier_version(const char *name, int version) ++{ ++ objset_t *os; ++ boolean_t rc = B_TRUE; ++ ++ if (dmu_objset_hold(name, FTAG, &os) == 0) { ++ uint64_t zplversion; ++ ++ if (dmu_objset_type(os) != DMU_OST_ZFS) { ++ dmu_objset_rele(os, FTAG); ++ return (B_TRUE); ++ } ++ /* XXX reading from non-owned objset */ ++ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) ++ rc = zplversion < version; ++ dmu_objset_rele(os, FTAG); ++ } ++ return (rc); ++} ++ ++static void ++zfs_log_history(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ char *buf; ++ ++ if ((buf = history_str_get(zc)) == NULL) ++ return; ++ ++ if (spa_open(zc->zc_name, &spa, FTAG) == 0) { ++ if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) ++ (void) spa_history_log(spa, buf, LOG_CMD_NORMAL); ++ spa_close(spa, FTAG); ++ } ++ history_str_free(buf); ++} ++ ++/* ++ * Policy for top-level read operations (list pools). Requires no privileges, ++ * and can be used in the local zone, as there is no associated dataset. ++ */ ++/* ARGSUSED */ ++static int ++zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr) ++{ ++ return (0); ++} ++ ++/* ++ * Policy for dataset read operations (list children, get statistics). Requires ++ * no privileges, but must be visible in the local zone. ++ */ ++/* ARGSUSED */ ++static int ++zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) ++{ ++ if (INGLOBALZONE(curproc) || ++ zone_dataset_visible(zc->zc_name, NULL)) ++ return (0); ++ ++ return (ENOENT); ++} ++ ++static int ++zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) ++{ ++ int writable = 1; ++ ++ /* ++ * The dataset must be visible by this zone -- check this first ++ * so they don't see EPERM on something they shouldn't know about. ++ */ ++ if (!INGLOBALZONE(curproc) && ++ !zone_dataset_visible(dataset, &writable)) ++ return (ENOENT); ++ ++ if (INGLOBALZONE(curproc)) { ++ /* ++ * If the fs is zoned, only root can access it from the ++ * global zone. ++ */ ++ if (secpolicy_zfs(cr) && zoned) ++ return (EPERM); ++ } else { ++ /* ++ * If we are in a local zone, the 'zoned' property must be set. ++ */ ++ if (!zoned) ++ return (EPERM); ++ ++ /* must be writable by this zone */ ++ if (!writable) ++ return (EPERM); ++ } ++ return (0); ++} ++ ++static int ++zfs_dozonecheck(const char *dataset, cred_t *cr) ++{ ++ uint64_t zoned; ++ ++ if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL)) ++ return (ENOENT); ++ ++ return (zfs_dozonecheck_impl(dataset, zoned, cr)); ++} ++ ++static int ++zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) ++{ ++ uint64_t zoned; ++ ++ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); ++ if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL)) { ++ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); ++ return (ENOENT); ++ } ++ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); ++ ++ return (zfs_dozonecheck_impl(dataset, zoned, cr)); ++} ++ ++/* ++ * If name ends in a '@', then require recursive permissions. 
++ */ ++int ++zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) ++{ ++ int error; ++ boolean_t descendent = B_FALSE; ++ dsl_dataset_t *ds; ++ char *at; ++ ++ at = strchr(name, '@'); ++ if (at != NULL && at[1] == '\0') { ++ *at = '\0'; ++ descendent = B_TRUE; ++ } ++ ++ error = dsl_dataset_hold(name, FTAG, &ds); ++ if (at != NULL) ++ *at = '@'; ++ if (error != 0) ++ return (error); ++ ++ error = zfs_dozonecheck_ds(name, ds, cr); ++ if (error == 0) { ++ error = secpolicy_zfs(cr); ++ if (error) ++ error = dsl_deleg_access_impl(ds, descendent, perm, cr); ++ } ++ ++ dsl_dataset_rele(ds, FTAG); ++ return (error); ++} ++ ++int ++zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, ++ const char *perm, cred_t *cr) ++{ ++ int error; ++ ++ error = zfs_dozonecheck_ds(name, ds, cr); ++ if (error == 0) { ++ error = secpolicy_zfs(cr); ++ if (error) ++ error = dsl_deleg_access_impl(ds, B_FALSE, perm, cr); ++ } ++ return (error); ++} ++ ++/* ++ * Policy for setting the security label property. ++ * ++ * Returns 0 for success, non-zero for access and other errors. ++ */ ++static int ++zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) ++{ ++#ifdef HAVE_MLSLABEL ++ char ds_hexsl[MAXNAMELEN]; ++ bslabel_t ds_sl, new_sl; ++ boolean_t new_default = FALSE; ++ uint64_t zoned; ++ int needed_priv = -1; ++ int error; ++ ++ /* First get the existing dataset label. */ ++ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), ++ 1, sizeof (ds_hexsl), &ds_hexsl, NULL); ++ if (error) ++ return (EPERM); ++ ++ if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) ++ new_default = TRUE; ++ ++ /* The label must be translatable */ ++ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) ++ return (EINVAL); ++ ++ /* ++ * In a non-global zone, disallow attempts to set a label that ++ * doesn't match that of the zone; otherwise no other checks ++ * are needed. ++ */ ++ if (!INGLOBALZONE(curproc)) { ++ if (new_default || !blequal(&new_sl, CR_SL(CRED()))) ++ return (EPERM); ++ return (0); ++ } ++ ++ /* ++ * For global-zone datasets (i.e., those whose zoned property is ++ * "off", verify that the specified new label is valid for the ++ * global zone. ++ */ ++ if (dsl_prop_get_integer(name, ++ zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) ++ return (EPERM); ++ if (!zoned) { ++ if (zfs_check_global_label(name, strval) != 0) ++ return (EPERM); ++ } ++ ++ /* ++ * If the existing dataset label is nondefault, check if the ++ * dataset is mounted (label cannot be changed while mounted). ++ * Get the zfs_sb_t; if there isn't one, then the dataset isn't ++ * mounted (or isn't a dataset, doesn't exist, ...). ++ */ ++ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { ++ objset_t *os; ++ static char *setsl_tag = "setsl_tag"; ++ ++ /* ++ * Try to own the dataset; abort if there is any error, ++ * (e.g., already mounted, in use, or other error). 
++ */ ++ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, ++ setsl_tag, &os); ++ if (error) ++ return (EPERM); ++ ++ dmu_objset_disown(os, setsl_tag); ++ ++ if (new_default) { ++ needed_priv = PRIV_FILE_DOWNGRADE_SL; ++ goto out_check; ++ } ++ ++ if (hexstr_to_label(strval, &new_sl) != 0) ++ return (EPERM); ++ ++ if (blstrictdom(&ds_sl, &new_sl)) ++ needed_priv = PRIV_FILE_DOWNGRADE_SL; ++ else if (blstrictdom(&new_sl, &ds_sl)) ++ needed_priv = PRIV_FILE_UPGRADE_SL; ++ } else { ++ /* dataset currently has a default label */ ++ if (!new_default) ++ needed_priv = PRIV_FILE_UPGRADE_SL; ++ } ++ ++out_check: ++ if (needed_priv != -1) ++ return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); ++ return (0); ++#else ++ return ENOTSUP; ++#endif /* HAVE_MLSLABEL */ ++} ++ ++static int ++zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, ++ cred_t *cr) ++{ ++ char *strval; ++ ++ /* ++ * Check permissions for special properties. ++ */ ++ switch (prop) { ++ default: ++ break; ++ case ZFS_PROP_ZONED: ++ /* ++ * Disallow setting of 'zoned' from within a local zone. ++ */ ++ if (!INGLOBALZONE(curproc)) ++ return (EPERM); ++ break; ++ ++ case ZFS_PROP_QUOTA: ++ if (!INGLOBALZONE(curproc)) { ++ uint64_t zoned; ++ char setpoint[MAXNAMELEN]; ++ /* ++ * Unprivileged users are allowed to modify the ++ * quota on things *under* (ie. contained by) ++ * the thing they own. ++ */ ++ if (dsl_prop_get_integer(dsname, "zoned", &zoned, ++ setpoint)) ++ return (EPERM); ++ if (!zoned || strlen(dsname) <= strlen(setpoint)) ++ return (EPERM); ++ } ++ break; ++ ++ case ZFS_PROP_MLSLABEL: ++ if (!is_system_labeled()) ++ return (EPERM); ++ ++ if (nvpair_value_string(propval, &strval) == 0) { ++ int err; ++ ++ err = zfs_set_slabel_policy(dsname, strval, CRED()); ++ if (err != 0) ++ return (err); ++ } ++ break; ++ } ++ ++ return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); ++} ++ ++int ++zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) ++{ ++ int error; ++ ++ error = zfs_dozonecheck(zc->zc_name, cr); ++ if (error) ++ return (error); ++ ++ /* ++ * permission to set permissions will be evaluated later in ++ * dsl_deleg_can_allow() ++ */ ++ return (0); ++} ++ ++int ++zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) ++{ ++ return (zfs_secpolicy_write_perms(zc->zc_name, ++ ZFS_DELEG_PERM_ROLLBACK, cr)); ++} ++ ++int ++zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) ++{ ++ spa_t *spa; ++ dsl_pool_t *dp; ++ dsl_dataset_t *ds; ++ char *cp; ++ int error; ++ ++ /* ++ * Generate the current snapshot name from the given objsetid, then ++ * use that name for the secpolicy/zone checks. 
++ */ ++ cp = strchr(zc->zc_name, '@'); ++ if (cp == NULL) ++ return (EINVAL); ++ error = spa_open(zc->zc_name, &spa, FTAG); ++ if (error) ++ return (error); ++ ++ dp = spa_get_dsl(spa); ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); ++ rw_exit(&dp->dp_config_rwlock); ++ spa_close(spa, FTAG); ++ if (error) ++ return (error); ++ ++ dsl_dataset_name(ds, zc->zc_name); ++ ++ error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ++ ZFS_DELEG_PERM_SEND, cr); ++ dsl_dataset_rele(ds, FTAG); ++ ++ return (error); ++} ++ ++#ifdef HAVE_SMB_SHARE ++static int ++zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) ++{ ++ vnode_t *vp; ++ int error; ++ ++ if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, ++ NO_FOLLOW, NULL, &vp)) != 0) ++ return (error); ++ ++ /* Now make sure mntpnt and dataset are ZFS */ ++ ++ if (vp->v_vfsp->vfs_fstype != zfsfstype || ++ (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), ++ zc->zc_name) != 0)) { ++ VN_RELE(vp); ++ return (EPERM); ++ } ++ ++ VN_RELE(vp); ++ return (dsl_deleg_access(zc->zc_name, ++ ZFS_DELEG_PERM_SHARE, cr)); ++} ++#endif /* HAVE_SMB_SHARE */ ++ ++int ++zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) ++{ ++#ifdef HAVE_SMB_SHARE ++ if (!INGLOBALZONE(curproc)) ++ return (EPERM); ++ ++ if (secpolicy_nfs(cr) == 0) { ++ return (0); ++ } else { ++ return (zfs_secpolicy_deleg_share(zc, cr)); ++ } ++#else ++ return (ENOTSUP); ++#endif /* HAVE_SMB_SHARE */ ++} ++ ++int ++zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) ++{ ++#ifdef HAVE_SMB_SHARE ++ if (!INGLOBALZONE(curproc)) ++ return (EPERM); ++ ++ if (secpolicy_smb(cr) == 0) { ++ return (0); ++ } else { ++ return (zfs_secpolicy_deleg_share(zc, cr)); ++ } ++#else ++ return (ENOTSUP); ++#endif /* HAVE_SMB_SHARE */ ++} ++ ++static int ++zfs_get_parent(const char *datasetname, char *parent, int parentsize) ++{ ++ char *cp; ++ ++ /* ++ * Remove the @bla or /bla from the end of the name to get the parent. ++ */ ++ (void) strncpy(parent, datasetname, parentsize); ++ cp = strrchr(parent, '@'); ++ if (cp != NULL) { ++ cp[0] = '\0'; ++ } else { ++ cp = strrchr(parent, '/'); ++ if (cp == NULL) ++ return (ENOENT); ++ cp[0] = '\0'; ++ } ++ ++ return (0); ++} ++ ++int ++zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) ++{ ++ int error; ++ ++ if ((error = zfs_secpolicy_write_perms(name, ++ ZFS_DELEG_PERM_MOUNT, cr)) != 0) ++ return (error); ++ ++ return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); ++} ++ ++static int ++zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) ++{ ++ return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); ++} ++ ++/* ++ * Destroying snapshots with delegated permissions requires ++ * descendent mount and destroy permissions. 
++ */ ++static int ++zfs_secpolicy_destroy_recursive(zfs_cmd_t *zc, cred_t *cr) ++{ ++ int error; ++ char *dsname; ++ ++ dsname = kmem_asprintf("%s@", zc->zc_name); ++ ++ error = zfs_secpolicy_destroy_perms(dsname, cr); ++ if (error == ENOENT) ++ error = zfs_secpolicy_destroy_perms(zc->zc_name, cr); ++ ++ strfree(dsname); ++ return (error); ++} ++ ++int ++zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) ++{ ++ char parentname[MAXNAMELEN]; ++ int error; ++ ++ if ((error = zfs_secpolicy_write_perms(from, ++ ZFS_DELEG_PERM_RENAME, cr)) != 0) ++ return (error); ++ ++ if ((error = zfs_secpolicy_write_perms(from, ++ ZFS_DELEG_PERM_MOUNT, cr)) != 0) ++ return (error); ++ ++ if ((error = zfs_get_parent(to, parentname, ++ sizeof (parentname))) != 0) ++ return (error); ++ ++ if ((error = zfs_secpolicy_write_perms(parentname, ++ ZFS_DELEG_PERM_CREATE, cr)) != 0) ++ return (error); ++ ++ if ((error = zfs_secpolicy_write_perms(parentname, ++ ZFS_DELEG_PERM_MOUNT, cr)) != 0) ++ return (error); ++ ++ return (error); ++} ++ ++static int ++zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) ++{ ++ return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); ++} ++ ++static int ++zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) ++{ ++ char parentname[MAXNAMELEN]; ++ objset_t *clone; ++ int error; ++ ++ error = zfs_secpolicy_write_perms(zc->zc_name, ++ ZFS_DELEG_PERM_PROMOTE, cr); ++ if (error) ++ return (error); ++ ++ error = dmu_objset_hold(zc->zc_name, FTAG, &clone); ++ ++ if (error == 0) { ++ dsl_dataset_t *pclone = NULL; ++ dsl_dir_t *dd; ++ dd = clone->os_dsl_dataset->ds_dir; ++ ++ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); ++ error = dsl_dataset_hold_obj(dd->dd_pool, ++ dd->dd_phys->dd_origin_obj, FTAG, &pclone); ++ rw_exit(&dd->dd_pool->dp_config_rwlock); ++ if (error) { ++ dmu_objset_rele(clone, FTAG); ++ return (error); ++ } ++ ++ error = zfs_secpolicy_write_perms(zc->zc_name, ++ ZFS_DELEG_PERM_MOUNT, cr); ++ ++ dsl_dataset_name(pclone, parentname); ++ dmu_objset_rele(clone, FTAG); ++ dsl_dataset_rele(pclone, FTAG); ++ if (error == 0) ++ error = zfs_secpolicy_write_perms(parentname, ++ ZFS_DELEG_PERM_PROMOTE, cr); ++ } ++ return (error); ++} ++ ++static int ++zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr) ++{ ++ int error; ++ ++ if ((error = zfs_secpolicy_write_perms(zc->zc_name, ++ ZFS_DELEG_PERM_RECEIVE, cr)) != 0) ++ return (error); ++ ++ if ((error = zfs_secpolicy_write_perms(zc->zc_name, ++ ZFS_DELEG_PERM_MOUNT, cr)) != 0) ++ return (error); ++ ++ return (zfs_secpolicy_write_perms(zc->zc_name, ++ ZFS_DELEG_PERM_CREATE, cr)); ++} ++ ++int ++zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) ++{ ++ return (zfs_secpolicy_write_perms(name, ++ ZFS_DELEG_PERM_SNAPSHOT, cr)); ++} ++ ++static int ++zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) ++{ ++ ++ return (zfs_secpolicy_snapshot_perms(zc->zc_name, cr)); ++} ++ ++static int ++zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) ++{ ++ char parentname[MAXNAMELEN]; ++ int error; ++ ++ if ((error = zfs_get_parent(zc->zc_name, parentname, ++ sizeof (parentname))) != 0) ++ return (error); ++ ++ if (zc->zc_value[0] != '\0') { ++ if ((error = zfs_secpolicy_write_perms(zc->zc_value, ++ ZFS_DELEG_PERM_CLONE, cr)) != 0) ++ return (error); ++ } ++ ++ if ((error = zfs_secpolicy_write_perms(parentname, ++ ZFS_DELEG_PERM_CREATE, cr)) != 0) ++ return (error); ++ ++ error = zfs_secpolicy_write_perms(parentname, ++ ZFS_DELEG_PERM_MOUNT, cr); ++ ++ return (error); ++} ++ ++/* ++ * Policy for pool operations - 
create/destroy pools, add vdevs, etc. Requires ++ * SYS_CONFIG privilege, which is not available in a local zone. ++ */ ++/* ARGSUSED */ ++static int ++zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) ++{ ++ if (secpolicy_sys_config(cr, B_FALSE) != 0) ++ return (EPERM); ++ ++ return (0); ++} ++ ++/* ++ * Policy for object to name lookups. ++ */ ++/* ARGSUSED */ ++static int ++zfs_secpolicy_diff(zfs_cmd_t *zc, cred_t *cr) ++{ ++ int error; ++ ++ if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) ++ return (0); ++ ++ error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); ++ return (error); ++} ++ ++/* ++ * Policy for fault injection. Requires all privileges. ++ */ ++/* ARGSUSED */ ++static int ++zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr) ++{ ++ return (secpolicy_zinject(cr)); ++} ++ ++static int ++zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) ++{ ++ zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); ++ ++ if (prop == ZPROP_INVAL) { ++ if (!zfs_prop_user(zc->zc_value)) ++ return (EINVAL); ++ return (zfs_secpolicy_write_perms(zc->zc_name, ++ ZFS_DELEG_PERM_USERPROP, cr)); ++ } else { ++ return (zfs_secpolicy_setprop(zc->zc_name, prop, ++ NULL, cr)); ++ } ++} ++ ++static int ++zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) ++{ ++ int err = zfs_secpolicy_read(zc, cr); ++ if (err) ++ return (err); ++ ++ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) ++ return (EINVAL); ++ ++ if (zc->zc_value[0] == 0) { ++ /* ++ * They are asking about a posix uid/gid. If it's ++ * themself, allow it. ++ */ ++ if (zc->zc_objset_type == ZFS_PROP_USERUSED || ++ zc->zc_objset_type == ZFS_PROP_USERQUOTA) { ++ if (zc->zc_guid == crgetuid(cr)) ++ return (0); ++ } else { ++ if (groupmember(zc->zc_guid, cr)) ++ return (0); ++ } ++ } ++ ++ return (zfs_secpolicy_write_perms(zc->zc_name, ++ userquota_perms[zc->zc_objset_type], cr)); ++} ++ ++static int ++zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) ++{ ++ int err = zfs_secpolicy_read(zc, cr); ++ if (err) ++ return (err); ++ ++ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) ++ return (EINVAL); ++ ++ return (zfs_secpolicy_write_perms(zc->zc_name, ++ userquota_perms[zc->zc_objset_type], cr)); ++} ++ ++static int ++zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) ++{ ++ return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, ++ NULL, cr)); ++} ++ ++static int ++zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr) ++{ ++ return (zfs_secpolicy_write_perms(zc->zc_name, ++ ZFS_DELEG_PERM_HOLD, cr)); ++} ++ ++static int ++zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr) ++{ ++ return (zfs_secpolicy_write_perms(zc->zc_name, ++ ZFS_DELEG_PERM_RELEASE, cr)); ++} ++ ++/* ++ * Policy for allowing temporary snapshots to be taken or released ++ */ ++static int ++zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, cred_t *cr) ++{ ++ /* ++ * A temporary snapshot is the same as a snapshot, ++ * hold, destroy and release all rolled into one. ++ * Delegated diff alone is sufficient that we allow this. ++ */ ++ int error; ++ ++ if ((error = zfs_secpolicy_write_perms(zc->zc_name, ++ ZFS_DELEG_PERM_DIFF, cr)) == 0) ++ return (0); ++ ++ error = zfs_secpolicy_snapshot(zc, cr); ++ if (!error) ++ error = zfs_secpolicy_hold(zc, cr); ++ if (!error) ++ error = zfs_secpolicy_release(zc, cr); ++ if (!error) ++ error = zfs_secpolicy_destroy(zc, cr); ++ return (error); ++} ++ ++/* ++ * Returns the nvlist as specified by the user in the zfs_cmd_t. 
++ */ ++static int ++get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) ++{ ++ char *packed; ++ int error; ++ nvlist_t *list = NULL; ++ ++ /* ++ * Read in and unpack the user-supplied nvlist. ++ */ ++ if (size == 0) ++ return (EINVAL); ++ ++ packed = kmem_alloc(size, KM_SLEEP | KM_NODEBUG); ++ ++ if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, ++ iflag)) != 0) { ++ kmem_free(packed, size); ++ return (error); ++ } ++ ++ if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { ++ kmem_free(packed, size); ++ return (error); ++ } ++ ++ kmem_free(packed, size); ++ ++ *nvp = list; ++ return (0); ++} ++ ++static int ++fit_error_list(zfs_cmd_t *zc, nvlist_t **errors) ++{ ++ size_t size; ++ ++ VERIFY(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); ++ ++ if (size > zc->zc_nvlist_dst_size) { ++ nvpair_t *more_errors; ++ int n = 0; ++ ++ if (zc->zc_nvlist_dst_size < 1024) ++ return (ENOMEM); ++ ++ VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, 0) == 0); ++ more_errors = nvlist_prev_nvpair(*errors, NULL); ++ ++ do { ++ nvpair_t *pair = nvlist_prev_nvpair(*errors, ++ more_errors); ++ VERIFY(nvlist_remove_nvpair(*errors, pair) == 0); ++ n++; ++ VERIFY(nvlist_size(*errors, &size, ++ NV_ENCODE_NATIVE) == 0); ++ } while (size > zc->zc_nvlist_dst_size); ++ ++ VERIFY(nvlist_remove_nvpair(*errors, more_errors) == 0); ++ VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, n) == 0); ++ ASSERT(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); ++ ASSERT(size <= zc->zc_nvlist_dst_size); ++ } ++ ++ return (0); ++} ++ ++static int ++put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) ++{ ++ char *packed = NULL; ++ int error = 0; ++ size_t size; ++ ++ VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); ++ ++ if (size > zc->zc_nvlist_dst_size) { ++ error = ENOMEM; ++ } else { ++ packed = kmem_alloc(size, KM_SLEEP | KM_NODEBUG); ++ VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, ++ KM_SLEEP) == 0); ++ if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, ++ size, zc->zc_iflags) != 0) ++ error = EFAULT; ++ kmem_free(packed, size); ++ } ++ ++ zc->zc_nvlist_dst_size = size; ++ return (error); ++} ++ ++static int ++get_zfs_sb(const char *dsname, zfs_sb_t **zsbp) ++{ ++ objset_t *os; ++ int error; ++ ++ error = dmu_objset_hold(dsname, FTAG, &os); ++ if (error) ++ return (error); ++ if (dmu_objset_type(os) != DMU_OST_ZFS) { ++ dmu_objset_rele(os, FTAG); ++ return (EINVAL); ++ } ++ ++ mutex_enter(&os->os_user_ptr_lock); ++ *zsbp = dmu_objset_get_user(os); ++ if (*zsbp && (*zsbp)->z_sb) { ++ atomic_inc(&((*zsbp)->z_sb->s_active)); ++ } else { ++ error = ESRCH; ++ } ++ mutex_exit(&os->os_user_ptr_lock); ++ dmu_objset_rele(os, FTAG); ++ return (error); ++} ++ ++/* ++ * Find a zfs_sb_t for a mounted filesystem, or create our own, in which ++ * case its z_sb will be NULL, and it will be opened as the owner. ++ */ ++static int ++zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer) ++{ ++ int error = 0; ++ ++ if (get_zfs_sb(name, zsbp) != 0) ++ error = zfs_sb_create(name, zsbp); ++ if (error == 0) { ++ rrw_enter(&(*zsbp)->z_teardown_lock, (writer) ? RW_WRITER : ++ RW_READER, tag); ++ if ((*zsbp)->z_unmounted) { ++ /* ++ * XXX we could probably try again, since the unmounting ++ * thread should be just about to disassociate the ++ * objset from the zfsvfs. 
++ */ ++ rrw_exit(&(*zsbp)->z_teardown_lock, tag); ++ return (EBUSY); ++ } ++ } ++ return (error); ++} ++ ++static void ++zfs_sb_rele(zfs_sb_t *zsb, void *tag) ++{ ++ rrw_exit(&zsb->z_teardown_lock, tag); ++ ++ if (zsb->z_sb) { ++ deactivate_super(zsb->z_sb); ++ } else { ++ dmu_objset_disown(zsb->z_os, zsb); ++ zfs_sb_free(zsb); ++ } ++} ++ ++static int ++zfs_ioc_pool_create(zfs_cmd_t *zc) ++{ ++ int error; ++ nvlist_t *config, *props = NULL; ++ nvlist_t *rootprops = NULL; ++ nvlist_t *zplprops = NULL; ++ char *buf; ++ ++ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, ++ zc->zc_iflags, &config))) ++ return (error); ++ ++ if (zc->zc_nvlist_src_size != 0 && (error = ++ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, ++ zc->zc_iflags, &props))) { ++ nvlist_free(config); ++ return (error); ++ } ++ ++ if (props) { ++ nvlist_t *nvl = NULL; ++ uint64_t version = SPA_VERSION; ++ ++ (void) nvlist_lookup_uint64(props, ++ zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); ++ if (version < SPA_VERSION_INITIAL || version > SPA_VERSION) { ++ error = EINVAL; ++ goto pool_props_bad; ++ } ++ (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); ++ if (nvl) { ++ error = nvlist_dup(nvl, &rootprops, KM_SLEEP); ++ if (error != 0) { ++ nvlist_free(config); ++ nvlist_free(props); ++ return (error); ++ } ++ (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); ++ } ++ VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ error = zfs_fill_zplprops_root(version, rootprops, ++ zplprops, NULL); ++ if (error) ++ goto pool_props_bad; ++ } ++ ++ buf = history_str_get(zc); ++ ++ error = spa_create(zc->zc_name, config, props, buf, zplprops); ++ ++ /* ++ * Set the remaining root properties ++ */ ++ if (!error && (error = zfs_set_prop_nvlist(zc->zc_name, ++ ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) ++ (void) spa_destroy(zc->zc_name); ++ ++ if (buf != NULL) ++ history_str_free(buf); ++ ++pool_props_bad: ++ nvlist_free(rootprops); ++ nvlist_free(zplprops); ++ nvlist_free(config); ++ nvlist_free(props); ++ ++ return (error); ++} ++ ++static int ++zfs_ioc_pool_destroy(zfs_cmd_t *zc) ++{ ++ int error; ++ zfs_log_history(zc); ++ error = spa_destroy(zc->zc_name); ++ if (error == 0) ++ zvol_remove_minors(zc->zc_name); ++ return (error); ++} ++ ++static int ++zfs_ioc_pool_import(zfs_cmd_t *zc) ++{ ++ nvlist_t *config, *props = NULL; ++ uint64_t guid; ++ int error; ++ ++ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, ++ zc->zc_iflags, &config)) != 0) ++ return (error); ++ ++ if (zc->zc_nvlist_src_size != 0 && (error = ++ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, ++ zc->zc_iflags, &props))) { ++ nvlist_free(config); ++ return (error); ++ } ++ ++ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || ++ guid != zc->zc_guid) ++ error = EINVAL; ++ else ++ error = spa_import(zc->zc_name, config, props, zc->zc_cookie); ++ ++ if (zc->zc_nvlist_dst != 0) { ++ int err; ++ ++ if ((err = put_nvlist(zc, config)) != 0) ++ error = err; ++ } ++ ++ if (error == 0) ++ zvol_create_minors(zc->zc_name); ++ ++ nvlist_free(config); ++ ++ if (props) ++ nvlist_free(props); ++ ++ return (error); ++} ++ ++static int ++zfs_ioc_pool_export(zfs_cmd_t *zc) ++{ ++ int error; ++ boolean_t force = (boolean_t)zc->zc_cookie; ++ boolean_t hardforce = (boolean_t)zc->zc_guid; ++ ++ zfs_log_history(zc); ++ error = spa_export(zc->zc_name, NULL, force, hardforce); ++ if (error == 0) ++ zvol_remove_minors(zc->zc_name); ++ return (error); ++} ++ ++static int 
++zfs_ioc_pool_configs(zfs_cmd_t *zc) ++{ ++ nvlist_t *configs; ++ int error; ++ ++ if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) ++ return (EEXIST); ++ ++ error = put_nvlist(zc, configs); ++ ++ nvlist_free(configs); ++ ++ return (error); ++} ++ ++static int ++zfs_ioc_pool_stats(zfs_cmd_t *zc) ++{ ++ nvlist_t *config; ++ int error; ++ int ret = 0; ++ ++ error = spa_get_stats(zc->zc_name, &config, zc->zc_value, ++ sizeof (zc->zc_value)); ++ ++ if (config != NULL) { ++ ret = put_nvlist(zc, config); ++ nvlist_free(config); ++ ++ /* ++ * The config may be present even if 'error' is non-zero. ++ * In this case we return success, and preserve the real errno ++ * in 'zc_cookie'. ++ */ ++ zc->zc_cookie = error; ++ } else { ++ ret = error; ++ } ++ ++ return (ret); ++} ++ ++/* ++ * Try to import the given pool, returning pool stats as appropriate so that ++ * user land knows which devices are available and overall pool health. ++ */ ++static int ++zfs_ioc_pool_tryimport(zfs_cmd_t *zc) ++{ ++ nvlist_t *tryconfig, *config; ++ int error; ++ ++ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, ++ zc->zc_iflags, &tryconfig)) != 0) ++ return (error); ++ ++ config = spa_tryimport(tryconfig); ++ ++ nvlist_free(tryconfig); ++ ++ if (config == NULL) ++ return (EINVAL); ++ ++ error = put_nvlist(zc, config); ++ nvlist_free(config); ++ ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of the pool ++ * zc_cookie scan func (pool_scan_func_t) ++ */ ++static int ++zfs_ioc_pool_scan(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ int error; ++ ++ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) ++ return (error); ++ ++ if (zc->zc_cookie == POOL_SCAN_NONE) ++ error = spa_scan_stop(spa); ++ else ++ error = spa_scan(spa, zc->zc_cookie); ++ ++ spa_close(spa, FTAG); ++ ++ return (error); ++} ++ ++static int ++zfs_ioc_pool_freeze(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ int error; ++ ++ error = spa_open(zc->zc_name, &spa, FTAG); ++ if (error == 0) { ++ spa_freeze(spa); ++ spa_close(spa, FTAG); ++ } ++ return (error); ++} ++ ++static int ++zfs_ioc_pool_upgrade(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ int error; ++ ++ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) ++ return (error); ++ ++ if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) { ++ spa_close(spa, FTAG); ++ return (EINVAL); ++ } ++ ++ spa_upgrade(spa, zc->zc_cookie); ++ spa_close(spa, FTAG); ++ ++ return (error); ++} ++ ++static int ++zfs_ioc_pool_get_history(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ char *hist_buf; ++ uint64_t size; ++ int error; ++ ++ if ((size = zc->zc_history_len) == 0) ++ return (EINVAL); ++ ++ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) ++ return (error); ++ ++ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { ++ spa_close(spa, FTAG); ++ return (ENOTSUP); ++ } ++ ++ hist_buf = vmem_alloc(size, KM_SLEEP); ++ if ((error = spa_history_get(spa, &zc->zc_history_offset, ++ &zc->zc_history_len, hist_buf)) == 0) { ++ error = ddi_copyout(hist_buf, ++ (void *)(uintptr_t)zc->zc_history, ++ zc->zc_history_len, zc->zc_iflags); ++ } ++ ++ spa_close(spa, FTAG); ++ vmem_free(hist_buf, size); ++ return (error); ++} ++ ++static int ++zfs_ioc_pool_reguid(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ int error; ++ ++ error = spa_open(zc->zc_name, &spa, FTAG); ++ if (error == 0) { ++ error = spa_change_guid(spa); ++ spa_close(spa, FTAG); ++ } ++ return (error); ++} ++ ++static int ++zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) ++{ ++ int error; ++ ++ if ((error = 
dsl_dsobj_to_dsname(zc->zc_name,zc->zc_obj,zc->zc_value))) ++ return (error); ++ ++ return (0); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_obj object to find ++ * ++ * outputs: ++ * zc_value name of object ++ */ ++static int ++zfs_ioc_obj_to_path(zfs_cmd_t *zc) ++{ ++ objset_t *os; ++ int error; ++ ++ /* XXX reading from objset not owned */ ++ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) ++ return (error); ++ if (dmu_objset_type(os) != DMU_OST_ZFS) { ++ dmu_objset_rele(os, FTAG); ++ return (EINVAL); ++ } ++ error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, ++ sizeof (zc->zc_value)); ++ dmu_objset_rele(os, FTAG); ++ ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_obj object to find ++ * ++ * outputs: ++ * zc_stat stats on object ++ * zc_value path to object ++ */ ++static int ++zfs_ioc_obj_to_stats(zfs_cmd_t *zc) ++{ ++ objset_t *os; ++ int error; ++ ++ /* XXX reading from objset not owned */ ++ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) ++ return (error); ++ if (dmu_objset_type(os) != DMU_OST_ZFS) { ++ dmu_objset_rele(os, FTAG); ++ return (EINVAL); ++ } ++ error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, ++ sizeof (zc->zc_value)); ++ dmu_objset_rele(os, FTAG); ++ ++ return (error); ++} ++ ++static int ++zfs_ioc_vdev_add(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ int error; ++ nvlist_t *config, **l2cache, **spares; ++ uint_t nl2cache = 0, nspares = 0; ++ ++ error = spa_open(zc->zc_name, &spa, FTAG); ++ if (error != 0) ++ return (error); ++ ++ error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, ++ zc->zc_iflags, &config); ++ (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, ++ &l2cache, &nl2cache); ++ ++ (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, ++ &spares, &nspares); ++ ++ /* ++ * A root pool with concatenated devices is not supported. ++ * Thus, a device cannot be added to a root pool. ++ * ++ * An intent log device cannot be added to a root pool because ++ * the ZIL is replayed during mountroot, and a separate log ++ * device cannot be accessed at that time. ++ * ++ * l2cache and spare devices are OK to add to a root pool. ++ */ ++ if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { ++ nvlist_free(config); ++ spa_close(spa, FTAG); ++ return (EDOM); ++ } ++ ++ if (error == 0) { ++ error = spa_vdev_add(spa, config); ++ nvlist_free(config); ++ } ++ spa_close(spa, FTAG); ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of the pool ++ * zc_nvlist_conf nvlist of devices to remove ++ * zc_cookie to stop the remove? 
++ */ ++static int ++zfs_ioc_vdev_remove(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ int error; ++ ++ error = spa_open(zc->zc_name, &spa, FTAG); ++ if (error != 0) ++ return (error); ++ error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); ++ spa_close(spa, FTAG); ++ return (error); ++} ++ ++static int ++zfs_ioc_vdev_set_state(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ int error; ++ vdev_state_t newstate = VDEV_STATE_UNKNOWN; ++ ++ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) ++ return (error); ++ switch (zc->zc_cookie) { ++ case VDEV_STATE_ONLINE: ++ error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); ++ break; ++ ++ case VDEV_STATE_OFFLINE: ++ error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); ++ break; ++ ++ case VDEV_STATE_FAULTED: ++ if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && ++ zc->zc_obj != VDEV_AUX_EXTERNAL) ++ zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; ++ ++ error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); ++ break; ++ ++ case VDEV_STATE_DEGRADED: ++ if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && ++ zc->zc_obj != VDEV_AUX_EXTERNAL) ++ zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; ++ ++ error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); ++ break; ++ ++ default: ++ error = EINVAL; ++ } ++ zc->zc_cookie = newstate; ++ spa_close(spa, FTAG); ++ return (error); ++} ++ ++static int ++zfs_ioc_vdev_attach(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ int replacing = zc->zc_cookie; ++ nvlist_t *config; ++ int error; ++ ++ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) ++ return (error); ++ ++ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, ++ zc->zc_iflags, &config)) == 0) { ++ error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); ++ nvlist_free(config); ++ } ++ ++ spa_close(spa, FTAG); ++ return (error); ++} ++ ++static int ++zfs_ioc_vdev_detach(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ int error; ++ ++ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) ++ return (error); ++ ++ error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); ++ ++ spa_close(spa, FTAG); ++ return (error); ++} ++ ++static int ++zfs_ioc_vdev_split(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ nvlist_t *config, *props = NULL; ++ int error; ++ boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); ++ ++ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) ++ return (error); ++ ++ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, ++ zc->zc_iflags, &config))) { ++ spa_close(spa, FTAG); ++ return (error); ++ } ++ ++ if (zc->zc_nvlist_src_size != 0 && (error = ++ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, ++ zc->zc_iflags, &props))) { ++ spa_close(spa, FTAG); ++ nvlist_free(config); ++ return (error); ++ } ++ ++ error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); ++ ++ spa_close(spa, FTAG); ++ ++ nvlist_free(config); ++ nvlist_free(props); ++ ++ return (error); ++} ++ ++static int ++zfs_ioc_vdev_setpath(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ char *path = zc->zc_value; ++ uint64_t guid = zc->zc_guid; ++ int error; ++ ++ error = spa_open(zc->zc_name, &spa, FTAG); ++ if (error != 0) ++ return (error); ++ ++ error = spa_vdev_setpath(spa, guid, path); ++ spa_close(spa, FTAG); ++ return (error); ++} ++ ++static int ++zfs_ioc_vdev_setfru(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ char *fru = zc->zc_value; ++ uint64_t guid = zc->zc_guid; ++ int error; ++ ++ error = spa_open(zc->zc_name, &spa, FTAG); ++ if (error != 0) ++ return (error); ++ ++ error = spa_vdev_setfru(spa, guid, fru); ++ spa_close(spa, FTAG); ++ return (error); ++} ++ ++static int ++zfs_ioc_objset_stats_impl(zfs_cmd_t 
*zc, objset_t *os) ++{ ++ int error = 0; ++ nvlist_t *nv; ++ ++ dmu_objset_fast_stat(os, &zc->zc_objset_stats); ++ ++ if (zc->zc_nvlist_dst != 0 && ++ (error = dsl_prop_get_all(os, &nv)) == 0) { ++ dmu_objset_stats(os, nv); ++ /* ++ * NB: zvol_get_stats() will read the objset contents, ++ * which we aren't supposed to do with a ++ * DS_MODE_USER hold, because it could be ++ * inconsistent. So this is a bit of a workaround... ++ * XXX reading with out owning ++ */ ++ if (!zc->zc_objset_stats.dds_inconsistent && ++ dmu_objset_type(os) == DMU_OST_ZVOL) { ++ error = zvol_get_stats(os, nv); ++ if (error == EIO) ++ return (error); ++ VERIFY3S(error, ==, 0); ++ } ++ if (error == 0) ++ error = put_nvlist(zc, nv); ++ nvlist_free(nv); ++ } ++ ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_nvlist_dst_size size of buffer for property nvlist ++ * ++ * outputs: ++ * zc_objset_stats stats ++ * zc_nvlist_dst property nvlist ++ * zc_nvlist_dst_size size of property nvlist ++ */ ++static int ++zfs_ioc_objset_stats(zfs_cmd_t *zc) ++{ ++ objset_t *os = NULL; ++ int error; ++ ++ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) ++ return (error); ++ ++ error = zfs_ioc_objset_stats_impl(zc, os); ++ ++ dmu_objset_rele(os, FTAG); ++ ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_nvlist_dst_size size of buffer for property nvlist ++ * ++ * outputs: ++ * zc_nvlist_dst received property nvlist ++ * zc_nvlist_dst_size size of received property nvlist ++ * ++ * Gets received properties (distinct from local properties on or after ++ * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from ++ * local property values. ++ */ ++static int ++zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) ++{ ++ objset_t *os = NULL; ++ int error; ++ nvlist_t *nv; ++ ++ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) ++ return (error); ++ ++ /* ++ * Without this check, we would return local property values if the ++ * caller has not already received properties on or after ++ * SPA_VERSION_RECVD_PROPS. ++ */ ++ if (!dsl_prop_get_hasrecvd(os)) { ++ dmu_objset_rele(os, FTAG); ++ return (ENOTSUP); ++ } ++ ++ if (zc->zc_nvlist_dst != 0 && ++ (error = dsl_prop_get_received(os, &nv)) == 0) { ++ error = put_nvlist(zc, nv); ++ nvlist_free(nv); ++ } ++ ++ dmu_objset_rele(os, FTAG); ++ return (error); ++} ++ ++static int ++nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) ++{ ++ uint64_t value; ++ int error; ++ ++ /* ++ * zfs_get_zplprop() will either find a value or give us ++ * the default value (if there is one). ++ */ ++ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) ++ return (error); ++ VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); ++ return (0); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_nvlist_dst_size size of buffer for zpl property nvlist ++ * ++ * outputs: ++ * zc_nvlist_dst zpl property nvlist ++ * zc_nvlist_dst_size size of zpl property nvlist ++ */ ++static int ++zfs_ioc_objset_zplprops(zfs_cmd_t *zc) ++{ ++ objset_t *os; ++ int err; ++ ++ /* XXX reading without owning */ ++ if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os))) ++ return (err); ++ ++ dmu_objset_fast_stat(os, &zc->zc_objset_stats); ++ ++ /* ++ * NB: nvl_add_zplprop() will read the objset contents, ++ * which we aren't supposed to do with a DS_MODE_USER ++ * hold, because it could be inconsistent. 
++ */ ++ if (zc->zc_nvlist_dst != 0 && ++ !zc->zc_objset_stats.dds_inconsistent && ++ dmu_objset_type(os) == DMU_OST_ZFS) { ++ nvlist_t *nv; ++ ++ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && ++ (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && ++ (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && ++ (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) ++ err = put_nvlist(zc, nv); ++ nvlist_free(nv); ++ } else { ++ err = ENOENT; ++ } ++ dmu_objset_rele(os, FTAG); ++ return (err); ++} ++ ++static boolean_t ++dataset_name_hidden(const char *name) ++{ ++ /* ++ * Skip over datasets that are not visible in this zone, ++ * internal datasets (which have a $ in their name), and ++ * temporary datasets (which have a % in their name). ++ */ ++ if (strchr(name, '$') != NULL) ++ return (B_TRUE); ++ if (strchr(name, '%') != NULL) ++ return (B_TRUE); ++ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL)) ++ return (B_TRUE); ++ return (B_FALSE); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_cookie zap cursor ++ * zc_nvlist_dst_size size of buffer for property nvlist ++ * ++ * outputs: ++ * zc_name name of next filesystem ++ * zc_cookie zap cursor ++ * zc_objset_stats stats ++ * zc_nvlist_dst property nvlist ++ * zc_nvlist_dst_size size of property nvlist ++ */ ++static int ++zfs_ioc_dataset_list_next(zfs_cmd_t *zc) ++{ ++ objset_t *os; ++ int error; ++ char *p; ++ size_t orig_len = strlen(zc->zc_name); ++ ++top: ++ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) { ++ if (error == ENOENT) ++ error = ESRCH; ++ return (error); ++ } ++ ++ p = strrchr(zc->zc_name, '/'); ++ if (p == NULL || p[1] != '\0') ++ (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); ++ p = zc->zc_name + strlen(zc->zc_name); ++ ++ /* ++ * Pre-fetch the datasets. dmu_objset_prefetch() always returns 0 ++ * but is not declared void because it's called by dmu_objset_find(). ++ */ ++ if (zc->zc_cookie == 0) { ++ uint64_t cookie = 0; ++ int len = sizeof (zc->zc_name) - (p - zc->zc_name); ++ ++ while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) { ++ if (!dataset_name_hidden(zc->zc_name)) ++ (void) dmu_objset_prefetch(zc->zc_name, NULL); ++ } ++ } ++ ++ do { ++ error = dmu_dir_list_next(os, ++ sizeof (zc->zc_name) - (p - zc->zc_name), p, ++ NULL, &zc->zc_cookie); ++ if (error == ENOENT) ++ error = ESRCH; ++ } while (error == 0 && dataset_name_hidden(zc->zc_name)); ++ dmu_objset_rele(os, FTAG); ++ ++ /* ++ * If it's an internal dataset (i.e. with a '$' in its name), ++ * don't try to get stats for it, otherwise we'll return ENOENT. ++ */ ++ if (error == 0 && strchr(zc->zc_name, '$') == NULL) { ++ error = zfs_ioc_objset_stats(zc); /* fill in the stats */ ++ if (error == ENOENT) { ++ /* We lost a race with destroy, get the next one. 
*/ ++ zc->zc_name[orig_len] = '\0'; ++ goto top; ++ } ++ } ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_cookie zap cursor ++ * zc_nvlist_dst_size size of buffer for property nvlist ++ * ++ * outputs: ++ * zc_name name of next snapshot ++ * zc_objset_stats stats ++ * zc_nvlist_dst property nvlist ++ * zc_nvlist_dst_size size of property nvlist ++ */ ++static int ++zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) ++{ ++ objset_t *os; ++ int error; ++ ++top: ++ if (zc->zc_cookie == 0 && !zc->zc_simple) ++ (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, ++ NULL, DS_FIND_SNAPSHOTS); ++ ++ error = dmu_objset_hold(zc->zc_name, FTAG, &os); ++ if (error) ++ return (error == ENOENT ? ESRCH : error); ++ ++ /* ++ * A dataset name of maximum length cannot have any snapshots, ++ * so exit immediately. ++ */ ++ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) { ++ dmu_objset_rele(os, FTAG); ++ return (ESRCH); ++ } ++ ++ error = dmu_snapshot_list_next(os, ++ sizeof (zc->zc_name) - strlen(zc->zc_name), ++ zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, ++ NULL); ++ ++ if (error == 0 && !zc->zc_simple) { ++ dsl_dataset_t *ds; ++ dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; ++ ++ /* ++ * Since we probably don't have a hold on this snapshot, ++ * it's possible that the objsetid could have been destroyed ++ * and reused for a new objset. It's OK if this happens during ++ * a zfs send operation, since the new createtxg will be ++ * beyond the range we're interested in. ++ */ ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds); ++ rw_exit(&dp->dp_config_rwlock); ++ if (error) { ++ if (error == ENOENT) { ++ /* Racing with destroy, get the next one. */ ++ *strchr(zc->zc_name, '@') = '\0'; ++ dmu_objset_rele(os, FTAG); ++ goto top; ++ } ++ } else { ++ objset_t *ossnap; ++ ++ error = dmu_objset_from_ds(ds, &ossnap); ++ if (error == 0) ++ error = zfs_ioc_objset_stats_impl(zc, ossnap); ++ dsl_dataset_rele(ds, FTAG); ++ } ++ } else if (error == ENOENT) { ++ error = ESRCH; ++ } ++ ++ dmu_objset_rele(os, FTAG); ++ /* if we failed, undo the @ that we tacked on to zc_name */ ++ if (error) ++ *strchr(zc->zc_name, '@') = '\0'; ++ return (error); ++} ++ ++static int ++zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) ++{ ++ const char *propname = nvpair_name(pair); ++ uint64_t *valary; ++ unsigned int vallen; ++ const char *domain; ++ char *dash; ++ zfs_userquota_prop_t type; ++ uint64_t rid; ++ uint64_t quota; ++ zfs_sb_t *zsb; ++ int err; ++ ++ if (nvpair_type(pair) == DATA_TYPE_NVLIST) { ++ nvlist_t *attrs; ++ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); ++ if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, ++ &pair) != 0) ++ return (EINVAL); ++ } ++ ++ /* ++ * A correctly constructed propname is encoded as ++ * userquota@<rid>-<domain>. 
++ */ ++ if ((dash = strchr(propname, '-')) == NULL || ++ nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || ++ vallen != 3) ++ return (EINVAL); ++ ++ domain = dash + 1; ++ type = valary[0]; ++ rid = valary[1]; ++ quota = valary[2]; ++ ++ err = zfs_sb_hold(dsname, FTAG, &zsb, B_FALSE); ++ if (err == 0) { ++ err = zfs_set_userquota(zsb, type, domain, rid, quota); ++ zfs_sb_rele(zsb, FTAG); ++ } ++ ++ return (err); ++} ++ ++/* ++ * If the named property is one that has a special function to set its value, ++ * return 0 on success and a positive error code on failure; otherwise if it is ++ * not one of the special properties handled by this function, return -1. ++ * ++ * XXX: It would be better for callers of the property interface if we handled ++ * these special cases in dsl_prop.c (in the dsl layer). ++ */ ++static int ++zfs_prop_set_special(const char *dsname, zprop_source_t source, ++ nvpair_t *pair) ++{ ++ const char *propname = nvpair_name(pair); ++ zfs_prop_t prop = zfs_name_to_prop(propname); ++ uint64_t intval; ++ int err; ++ ++ if (prop == ZPROP_INVAL) { ++ if (zfs_prop_userquota(propname)) ++ return (zfs_prop_set_userquota(dsname, pair)); ++ return (-1); ++ } ++ ++ if (nvpair_type(pair) == DATA_TYPE_NVLIST) { ++ nvlist_t *attrs; ++ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); ++ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, ++ &pair) == 0); ++ } ++ ++ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) ++ return (-1); ++ ++ VERIFY(0 == nvpair_value_uint64(pair, &intval)); ++ ++ switch (prop) { ++ case ZFS_PROP_QUOTA: ++ err = dsl_dir_set_quota(dsname, source, intval); ++ break; ++ case ZFS_PROP_REFQUOTA: ++ err = dsl_dataset_set_quota(dsname, source, intval); ++ break; ++ case ZFS_PROP_RESERVATION: ++ err = dsl_dir_set_reservation(dsname, source, intval); ++ break; ++ case ZFS_PROP_REFRESERVATION: ++ err = dsl_dataset_set_reservation(dsname, source, intval); ++ break; ++ case ZFS_PROP_VOLSIZE: ++ err = zvol_set_volsize(dsname, intval); ++ break; ++ case ZFS_PROP_VERSION: ++ { ++ zfs_sb_t *zsb; ++ ++ if ((err = zfs_sb_hold(dsname, FTAG, &zsb, B_TRUE)) != 0) ++ break; ++ ++ err = zfs_set_version(zsb, intval); ++ zfs_sb_rele(zsb, FTAG); ++ ++ if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { ++ zfs_cmd_t *zc; ++ ++ zc = kmem_zalloc(sizeof (zfs_cmd_t), ++ KM_SLEEP | KM_NODEBUG); ++ (void) strcpy(zc->zc_name, dsname); ++ (void) zfs_ioc_userspace_upgrade(zc); ++ kmem_free(zc, sizeof (zfs_cmd_t)); ++ } ++ break; ++ } ++ ++ default: ++ err = -1; ++ } ++ ++ return (err); ++} ++ ++/* ++ * This function is best effort. If it fails to set any of the given properties, ++ * it continues to set as many as it can and returns the first error ++ * encountered. If the caller provides a non-NULL errlist, it also gives the ++ * complete list of names of all the properties it failed to set along with the ++ * corresponding error numbers. The caller is responsible for freeing the ++ * returned errlist. ++ * ++ * If every property is set successfully, zero is returned and the list pointed ++ * at by errlist is NULL. 
++ */ ++int ++zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, ++ nvlist_t **errlist) ++{ ++ nvpair_t *pair; ++ nvpair_t *propval; ++ int rv = 0; ++ uint64_t intval; ++ char *strval; ++ nvlist_t *genericnvl; ++ nvlist_t *errors; ++ nvlist_t *retrynvl; ++ ++ VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ VERIFY(nvlist_alloc(&retrynvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ ++retry: ++ pair = NULL; ++ while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { ++ const char *propname = nvpair_name(pair); ++ zfs_prop_t prop = zfs_name_to_prop(propname); ++ int err = 0; ++ ++ /* decode the property value */ ++ propval = pair; ++ if (nvpair_type(pair) == DATA_TYPE_NVLIST) { ++ nvlist_t *attrs; ++ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); ++ if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, ++ &propval) != 0) ++ err = EINVAL; ++ } ++ ++ /* Validate value type */ ++ if (err == 0 && prop == ZPROP_INVAL) { ++ if (zfs_prop_user(propname)) { ++ if (nvpair_type(propval) != DATA_TYPE_STRING) ++ err = EINVAL; ++ } else if (zfs_prop_userquota(propname)) { ++ if (nvpair_type(propval) != ++ DATA_TYPE_UINT64_ARRAY) ++ err = EINVAL; ++ } else { ++ err = EINVAL; ++ } ++ } else if (err == 0) { ++ if (nvpair_type(propval) == DATA_TYPE_STRING) { ++ if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) ++ err = EINVAL; ++ } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { ++ const char *unused; ++ ++ VERIFY(nvpair_value_uint64(propval, ++ &intval) == 0); ++ ++ switch (zfs_prop_get_type(prop)) { ++ case PROP_TYPE_NUMBER: ++ break; ++ case PROP_TYPE_STRING: ++ err = EINVAL; ++ break; ++ case PROP_TYPE_INDEX: ++ if (zfs_prop_index_to_string(prop, ++ intval, &unused) != 0) ++ err = EINVAL; ++ break; ++ default: ++ cmn_err(CE_PANIC, ++ "unknown property type"); ++ } ++ } else { ++ err = EINVAL; ++ } ++ } ++ ++ /* Validate permissions */ ++ if (err == 0) ++ err = zfs_check_settable(dsname, pair, CRED()); ++ ++ if (err == 0) { ++ err = zfs_prop_set_special(dsname, source, pair); ++ if (err == -1) { ++ /* ++ * For better performance we build up a list of ++ * properties to set in a single transaction. ++ */ ++ err = nvlist_add_nvpair(genericnvl, pair); ++ } else if (err != 0 && nvl != retrynvl) { ++ /* ++ * This may be a spurious error caused by ++ * receiving quota and reservation out of order. ++ * Try again in a second pass. ++ */ ++ err = nvlist_add_nvpair(retrynvl, pair); ++ } ++ } ++ ++ if (err != 0) ++ VERIFY(nvlist_add_int32(errors, propname, err) == 0); ++ } ++ ++ if (nvl != retrynvl && !nvlist_empty(retrynvl)) { ++ nvl = retrynvl; ++ goto retry; ++ } ++ ++ if (!nvlist_empty(genericnvl) && ++ dsl_props_set(dsname, source, genericnvl) != 0) { ++ /* ++ * If this fails, we still want to set as many properties as we ++ * can, so try setting them individually. 
++ */ ++ pair = NULL; ++ while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { ++ const char *propname = nvpair_name(pair); ++ int err = 0; ++ ++ propval = pair; ++ if (nvpair_type(pair) == DATA_TYPE_NVLIST) { ++ nvlist_t *attrs; ++ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); ++ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, ++ &propval) == 0); ++ } ++ ++ if (nvpair_type(propval) == DATA_TYPE_STRING) { ++ VERIFY(nvpair_value_string(propval, ++ &strval) == 0); ++ err = dsl_prop_set(dsname, propname, source, 1, ++ strlen(strval) + 1, strval); ++ } else { ++ VERIFY(nvpair_value_uint64(propval, ++ &intval) == 0); ++ err = dsl_prop_set(dsname, propname, source, 8, ++ 1, &intval); ++ } ++ ++ if (err != 0) { ++ VERIFY(nvlist_add_int32(errors, propname, ++ err) == 0); ++ } ++ } ++ } ++ nvlist_free(genericnvl); ++ nvlist_free(retrynvl); ++ ++ if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { ++ nvlist_free(errors); ++ errors = NULL; ++ } else { ++ VERIFY(nvpair_value_int32(pair, &rv) == 0); ++ } ++ ++ if (errlist == NULL) ++ nvlist_free(errors); ++ else ++ *errlist = errors; ++ ++ return (rv); ++} ++ ++/* ++ * Check that all the properties are valid user properties. ++ */ ++static int ++zfs_check_userprops(char *fsname, nvlist_t *nvl) ++{ ++ nvpair_t *pair = NULL; ++ int error = 0; ++ ++ while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { ++ const char *propname = nvpair_name(pair); ++ char *valstr; ++ ++ if (!zfs_prop_user(propname) || ++ nvpair_type(pair) != DATA_TYPE_STRING) ++ return (EINVAL); ++ ++ if ((error = zfs_secpolicy_write_perms(fsname, ++ ZFS_DELEG_PERM_USERPROP, CRED()))) ++ return (error); ++ ++ if (strlen(propname) >= ZAP_MAXNAMELEN) ++ return (ENAMETOOLONG); ++ ++ VERIFY(nvpair_value_string(pair, &valstr) == 0); ++ if (strlen(valstr) >= ZAP_MAXVALUELEN) ++ return (E2BIG); ++ } ++ return (0); ++} ++ ++static void ++props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) ++{ ++ nvpair_t *pair; ++ ++ VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ ++ pair = NULL; ++ while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { ++ if (nvlist_exists(skipped, nvpair_name(pair))) ++ continue; ++ ++ VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); ++ } ++} ++ ++static int ++clear_received_props(objset_t *os, const char *fs, nvlist_t *props, ++ nvlist_t *skipped) ++{ ++ int err = 0; ++ nvlist_t *cleared_props = NULL; ++ props_skip(props, skipped, &cleared_props); ++ if (!nvlist_empty(cleared_props)) { ++ /* ++ * Acts on local properties until the dataset has received ++ * properties at least once on or after SPA_VERSION_RECVD_PROPS. ++ */ ++ zprop_source_t flags = (ZPROP_SRC_NONE | ++ (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0)); ++ err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL); ++ } ++ nvlist_free(cleared_props); ++ return (err); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_value name of property to set ++ * zc_nvlist_src{_size} nvlist of properties to apply ++ * zc_cookie received properties flag ++ * ++ * outputs: ++ * zc_nvlist_dst{_size} error for each unapplied received property ++ */ ++static int ++zfs_ioc_set_prop(zfs_cmd_t *zc) ++{ ++ nvlist_t *nvl; ++ boolean_t received = zc->zc_cookie; ++ zprop_source_t source = (received ? 
ZPROP_SRC_RECEIVED : ++ ZPROP_SRC_LOCAL); ++ nvlist_t *errors = NULL; ++ int error; ++ ++ if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, ++ zc->zc_iflags, &nvl)) != 0) ++ return (error); ++ ++ if (received) { ++ nvlist_t *origprops; ++ objset_t *os; ++ ++ if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) { ++ if (dsl_prop_get_received(os, &origprops) == 0) { ++ (void) clear_received_props(os, ++ zc->zc_name, origprops, nvl); ++ nvlist_free(origprops); ++ } ++ ++ dsl_prop_set_hasrecvd(os); ++ dmu_objset_rele(os, FTAG); ++ } ++ } ++ ++ error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, &errors); ++ ++ if (zc->zc_nvlist_dst != 0 && errors != NULL) { ++ (void) put_nvlist(zc, errors); ++ } ++ ++ nvlist_free(errors); ++ nvlist_free(nvl); ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_value name of property to inherit ++ * zc_cookie revert to received value if TRUE ++ * ++ * outputs: none ++ */ ++static int ++zfs_ioc_inherit_prop(zfs_cmd_t *zc) ++{ ++ const char *propname = zc->zc_value; ++ zfs_prop_t prop = zfs_name_to_prop(propname); ++ boolean_t received = zc->zc_cookie; ++ zprop_source_t source = (received ++ ? ZPROP_SRC_NONE /* revert to received value, if any */ ++ : ZPROP_SRC_INHERITED); /* explicitly inherit */ ++ ++ if (received) { ++ nvlist_t *dummy; ++ nvpair_t *pair; ++ zprop_type_t type; ++ int err; ++ ++ /* ++ * zfs_prop_set_special() expects properties in the form of an ++ * nvpair with type info. ++ */ ++ if (prop == ZPROP_INVAL) { ++ if (!zfs_prop_user(propname)) ++ return (EINVAL); ++ ++ type = PROP_TYPE_STRING; ++ } else if (prop == ZFS_PROP_VOLSIZE || ++ prop == ZFS_PROP_VERSION) { ++ return (EINVAL); ++ } else { ++ type = zfs_prop_get_type(prop); ++ } ++ ++ VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ ++ switch (type) { ++ case PROP_TYPE_STRING: ++ VERIFY(0 == nvlist_add_string(dummy, propname, "")); ++ break; ++ case PROP_TYPE_NUMBER: ++ case PROP_TYPE_INDEX: ++ VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); ++ break; ++ default: ++ nvlist_free(dummy); ++ return (EINVAL); ++ } ++ ++ pair = nvlist_next_nvpair(dummy, NULL); ++ err = zfs_prop_set_special(zc->zc_name, source, pair); ++ nvlist_free(dummy); ++ if (err != -1) ++ return (err); /* special property already handled */ ++ } else { ++ /* ++ * Only check this in the non-received case. We want to allow ++ * 'inherit -S' to revert non-inheritable properties like quota ++ * and reservation to the received or default values even though ++ * they are not considered inheritable. ++ */ ++ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) ++ return (EINVAL); ++ } ++ ++ /* the property name has been validated by zfs_secpolicy_inherit() */ ++ return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL)); ++} ++ ++static int ++zfs_ioc_pool_set_props(zfs_cmd_t *zc) ++{ ++ nvlist_t *props; ++ spa_t *spa; ++ int error; ++ nvpair_t *pair; ++ ++ if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, ++ zc->zc_iflags, &props))) ++ return (error); ++ ++ /* ++ * If the only property is the configfile, then just do a spa_lookup() ++ * to handle the faulted case. 
++ */ ++ pair = nvlist_next_nvpair(props, NULL); ++ if (pair != NULL && strcmp(nvpair_name(pair), ++ zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && ++ nvlist_next_nvpair(props, pair) == NULL) { ++ mutex_enter(&spa_namespace_lock); ++ if ((spa = spa_lookup(zc->zc_name)) != NULL) { ++ spa_configfile_set(spa, props, B_FALSE); ++ spa_config_sync(spa, B_FALSE, B_TRUE); ++ } ++ mutex_exit(&spa_namespace_lock); ++ if (spa != NULL) { ++ nvlist_free(props); ++ return (0); ++ } ++ } ++ ++ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { ++ nvlist_free(props); ++ return (error); ++ } ++ ++ error = spa_prop_set(spa, props); ++ ++ nvlist_free(props); ++ spa_close(spa, FTAG); ++ ++ return (error); ++} ++ ++static int ++zfs_ioc_pool_get_props(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ int error; ++ nvlist_t *nvp = NULL; ++ ++ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { ++ /* ++ * If the pool is faulted, there may be properties we can still ++ * get (such as altroot and cachefile), so attempt to get them ++ * anyway. ++ */ ++ mutex_enter(&spa_namespace_lock); ++ if ((spa = spa_lookup(zc->zc_name)) != NULL) ++ error = spa_prop_get(spa, &nvp); ++ mutex_exit(&spa_namespace_lock); ++ } else { ++ error = spa_prop_get(spa, &nvp); ++ spa_close(spa, FTAG); ++ } ++ ++ if (error == 0 && zc->zc_nvlist_dst != 0) ++ error = put_nvlist(zc, nvp); ++ else ++ error = EFAULT; ++ ++ nvlist_free(nvp); ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of volume ++ * ++ * outputs: none ++ */ ++static int ++zfs_ioc_create_minor(zfs_cmd_t *zc) ++{ ++ return (zvol_create_minor(zc->zc_name)); ++} ++ ++/* ++ * inputs: ++ * zc_name name of volume ++ * ++ * outputs: none ++ */ ++static int ++zfs_ioc_remove_minor(zfs_cmd_t *zc) ++{ ++ return (zvol_remove_minor(zc->zc_name)); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_nvlist_src{_size} nvlist of delegated permissions ++ * zc_perm_action allow/unallow flag ++ * ++ * outputs: none ++ */ ++static int ++zfs_ioc_set_fsacl(zfs_cmd_t *zc) ++{ ++ int error; ++ nvlist_t *fsaclnv = NULL; ++ ++ if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, ++ zc->zc_iflags, &fsaclnv)) != 0) ++ return (error); ++ ++ /* ++ * Verify nvlist is constructed correctly ++ */ ++ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { ++ nvlist_free(fsaclnv); ++ return (EINVAL); ++ } ++ ++ /* ++ * If we don't have PRIV_SYS_MOUNT, then validate ++ * that user is allowed to hand out each permission in ++ * the nvlist(s) ++ */ ++ ++ error = secpolicy_zfs(CRED()); ++ if (error) { ++ if (zc->zc_perm_action == B_FALSE) { ++ error = dsl_deleg_can_allow(zc->zc_name, ++ fsaclnv, CRED()); ++ } else { ++ error = dsl_deleg_can_unallow(zc->zc_name, ++ fsaclnv, CRED()); ++ } ++ } ++ ++ if (error == 0) ++ error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); ++ ++ nvlist_free(fsaclnv); ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * ++ * outputs: ++ * zc_nvlist_src{_size} nvlist of delegated permissions ++ */ ++static int ++zfs_ioc_get_fsacl(zfs_cmd_t *zc) ++{ ++ nvlist_t *nvp; ++ int error; ++ ++ if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { ++ error = put_nvlist(zc, nvp); ++ nvlist_free(nvp); ++ } ++ ++ return (error); ++} ++ ++/* ARGSUSED */ ++static void ++zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) ++{ ++ zfs_creat_t *zct = arg; ++ ++ zfs_create_fs(os, cr, zct->zct_zplprops, tx); ++} ++ ++#define ZFS_PROP_UNDEFINED ((uint64_t)-1) ++ ++/* ++ * inputs: ++ * createprops list of properties 
requested by creator ++ * default_zplver zpl version to use if unspecified in createprops ++ * fuids_ok fuids allowed in this version of the spa? ++ * os parent objset pointer (NULL if root fs) ++ * ++ * outputs: ++ * zplprops values for the zplprops we attach to the master node object ++ * is_ci true if requested file system will be purely case-insensitive ++ * ++ * Determine the settings for utf8only, normalization and ++ * casesensitivity. Specific values may have been requested by the ++ * creator and/or we can inherit values from the parent dataset. If ++ * the file system is of too early a vintage, a creator cannot ++ * request settings for these properties, even if the requested ++ * setting is the default value. We don't actually want to create dsl ++ * properties for these, so remove them from the source nvlist after ++ * processing. ++ */ ++static int ++zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, ++ boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, ++ nvlist_t *zplprops, boolean_t *is_ci) ++{ ++ uint64_t sense = ZFS_PROP_UNDEFINED; ++ uint64_t norm = ZFS_PROP_UNDEFINED; ++ uint64_t u8 = ZFS_PROP_UNDEFINED; ++ int error; ++ ++ ASSERT(zplprops != NULL); ++ ++ /* ++ * Pull out creator prop choices, if any. ++ */ ++ if (createprops) { ++ (void) nvlist_lookup_uint64(createprops, ++ zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); ++ (void) nvlist_lookup_uint64(createprops, ++ zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); ++ (void) nvlist_remove_all(createprops, ++ zfs_prop_to_name(ZFS_PROP_NORMALIZE)); ++ (void) nvlist_lookup_uint64(createprops, ++ zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); ++ (void) nvlist_remove_all(createprops, ++ zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); ++ (void) nvlist_lookup_uint64(createprops, ++ zfs_prop_to_name(ZFS_PROP_CASE), &sense); ++ (void) nvlist_remove_all(createprops, ++ zfs_prop_to_name(ZFS_PROP_CASE)); ++ } ++ ++ /* ++ * If the zpl version requested is whacky or the file system ++ * or pool version is too "young" to support normalization ++ * and the creator tried to set a value for one of the props, ++ * error out. ++ */ ++ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || ++ (zplver >= ZPL_VERSION_FUID && !fuids_ok) || ++ (zplver >= ZPL_VERSION_SA && !sa_ok) || ++ (zplver < ZPL_VERSION_NORMALIZATION && ++ (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || ++ sense != ZFS_PROP_UNDEFINED))) ++ return (ENOTSUP); ++ ++ /* ++ * Put the version in the zplprops ++ */ ++ VERIFY(nvlist_add_uint64(zplprops, ++ zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); ++ ++ if (norm == ZFS_PROP_UNDEFINED && ++ (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) ++ return (error); ++ VERIFY(nvlist_add_uint64(zplprops, ++ zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); ++ ++ /* ++ * If we're normalizing, names must always be valid UTF-8 strings. 
++ */ ++ if (norm) ++ u8 = 1; ++ if (u8 == ZFS_PROP_UNDEFINED && ++ (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) ++ return (error); ++ VERIFY(nvlist_add_uint64(zplprops, ++ zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); ++ ++ if (sense == ZFS_PROP_UNDEFINED && ++ (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) ++ return (error); ++ VERIFY(nvlist_add_uint64(zplprops, ++ zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); ++ ++ if (is_ci) ++ *is_ci = (sense == ZFS_CASE_INSENSITIVE); ++ ++ return (0); ++} ++ ++static int ++zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, ++ nvlist_t *zplprops, boolean_t *is_ci) ++{ ++ boolean_t fuids_ok, sa_ok; ++ uint64_t zplver = ZPL_VERSION; ++ objset_t *os = NULL; ++ char parentname[MAXNAMELEN]; ++ char *cp; ++ spa_t *spa; ++ uint64_t spa_vers; ++ int error; ++ ++ (void) strlcpy(parentname, dataset, sizeof (parentname)); ++ cp = strrchr(parentname, '/'); ++ ASSERT(cp != NULL); ++ cp[0] = '\0'; ++ ++ if ((error = spa_open(dataset, &spa, FTAG)) != 0) ++ return (error); ++ ++ spa_vers = spa_version(spa); ++ spa_close(spa, FTAG); ++ ++ zplver = zfs_zpl_version_map(spa_vers); ++ fuids_ok = (zplver >= ZPL_VERSION_FUID); ++ sa_ok = (zplver >= ZPL_VERSION_SA); ++ ++ /* ++ * Open parent object set so we can inherit zplprop values. ++ */ ++ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) ++ return (error); ++ ++ error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, ++ zplprops, is_ci); ++ dmu_objset_rele(os, FTAG); ++ return (error); ++} ++ ++static int ++zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, ++ nvlist_t *zplprops, boolean_t *is_ci) ++{ ++ boolean_t fuids_ok; ++ boolean_t sa_ok; ++ uint64_t zplver = ZPL_VERSION; ++ int error; ++ ++ zplver = zfs_zpl_version_map(spa_vers); ++ fuids_ok = (zplver >= ZPL_VERSION_FUID); ++ sa_ok = (zplver >= ZPL_VERSION_SA); ++ ++ error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, ++ createprops, zplprops, is_ci); ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_objset_type type of objset to create (fs vs zvol) ++ * zc_name name of new objset ++ * zc_value name of snapshot to clone from (may be empty) ++ * zc_nvlist_src{_size} nvlist of properties to apply ++ * ++ * outputs: none ++ */ ++static int ++zfs_ioc_create(zfs_cmd_t *zc) ++{ ++ objset_t *clone; ++ int error = 0; ++ zfs_creat_t zct; ++ nvlist_t *nvprops = NULL; ++ void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); ++ dmu_objset_type_t type = zc->zc_objset_type; ++ ++ switch (type) { ++ ++ case DMU_OST_ZFS: ++ cbfunc = zfs_create_cb; ++ break; ++ ++ case DMU_OST_ZVOL: ++ cbfunc = zvol_create_cb; ++ break; ++ ++ default: ++ cbfunc = NULL; ++ break; ++ } ++ if (strchr(zc->zc_name, '@') || ++ strchr(zc->zc_name, '%')) ++ return (EINVAL); ++ ++ if (zc->zc_nvlist_src != 0 && ++ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, ++ zc->zc_iflags, &nvprops)) != 0) ++ return (error); ++ ++ zct.zct_zplprops = NULL; ++ zct.zct_props = nvprops; ++ ++ if (zc->zc_value[0] != '\0') { ++ /* ++ * We're creating a clone of an existing snapshot. 
++ */ ++ zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; ++ if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) { ++ nvlist_free(nvprops); ++ return (EINVAL); ++ } ++ ++ error = dmu_objset_hold(zc->zc_value, FTAG, &clone); ++ if (error) { ++ nvlist_free(nvprops); ++ return (error); ++ } ++ ++ error = dmu_objset_clone(zc->zc_name, dmu_objset_ds(clone), 0); ++ dmu_objset_rele(clone, FTAG); ++ if (error) { ++ nvlist_free(nvprops); ++ return (error); ++ } ++ } else { ++ boolean_t is_insensitive = B_FALSE; ++ ++ if (cbfunc == NULL) { ++ nvlist_free(nvprops); ++ return (EINVAL); ++ } ++ ++ if (type == DMU_OST_ZVOL) { ++ uint64_t volsize, volblocksize; ++ ++ if (nvprops == NULL || ++ nvlist_lookup_uint64(nvprops, ++ zfs_prop_to_name(ZFS_PROP_VOLSIZE), ++ &volsize) != 0) { ++ nvlist_free(nvprops); ++ return (EINVAL); ++ } ++ ++ if ((error = nvlist_lookup_uint64(nvprops, ++ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), ++ &volblocksize)) != 0 && error != ENOENT) { ++ nvlist_free(nvprops); ++ return (EINVAL); ++ } ++ ++ if (error != 0) ++ volblocksize = zfs_prop_default_numeric( ++ ZFS_PROP_VOLBLOCKSIZE); ++ ++ if ((error = zvol_check_volblocksize( ++ volblocksize)) != 0 || ++ (error = zvol_check_volsize(volsize, ++ volblocksize)) != 0) { ++ nvlist_free(nvprops); ++ return (error); ++ } ++ } else if (type == DMU_OST_ZFS) { ++ int error; ++ ++ /* ++ * We have to have normalization and ++ * case-folding flags correct when we do the ++ * file system creation, so go figure them out ++ * now. ++ */ ++ VERIFY(nvlist_alloc(&zct.zct_zplprops, ++ NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ error = zfs_fill_zplprops(zc->zc_name, nvprops, ++ zct.zct_zplprops, &is_insensitive); ++ if (error != 0) { ++ nvlist_free(nvprops); ++ nvlist_free(zct.zct_zplprops); ++ return (error); ++ } ++ } ++ error = dmu_objset_create(zc->zc_name, type, ++ is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); ++ nvlist_free(zct.zct_zplprops); ++ } ++ ++ /* ++ * It would be nice to do this atomically. ++ */ ++ if (error == 0) { ++ error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, ++ nvprops, NULL); ++ if (error != 0) ++ (void) dmu_objset_destroy(zc->zc_name, B_FALSE); ++ } ++ nvlist_free(nvprops); ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_value short name of snapshot ++ * zc_cookie recursive flag ++ * zc_nvlist_src[_size] property list ++ * ++ * outputs: ++ * zc_value short snapname (i.e. part after the '@') ++ */ ++static int ++zfs_ioc_snapshot(zfs_cmd_t *zc) ++{ ++ nvlist_t *nvprops = NULL; ++ int error; ++ boolean_t recursive = zc->zc_cookie; ++ ++ if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) ++ return (EINVAL); ++ ++ if (zc->zc_nvlist_src != 0 && ++ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, ++ zc->zc_iflags, &nvprops)) != 0) ++ return (error); ++ ++ error = zfs_check_userprops(zc->zc_name, nvprops); ++ if (error) ++ goto out; ++ ++ if (!nvlist_empty(nvprops) && ++ zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) { ++ error = ENOTSUP; ++ goto out; ++ } ++ ++ error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, NULL, ++ nvprops, recursive, B_FALSE, -1); ++ ++out: ++ nvlist_free(nvprops); ++ return (error); ++} ++ ++/* ++ * inputs: ++ * name dataset name, or when 'arg == NULL' the full snapshot name ++ * arg short snapshot name (i.e. 
part after the '@') ++ */ ++int ++zfs_unmount_snap(const char *name, void *arg) ++{ ++ zfs_sb_t *zsb = NULL; ++ char *dsname; ++ char *snapname; ++ char *fullname; ++ char *ptr; ++ int error; ++ ++ if (arg) { ++ dsname = strdup(name); ++ snapname = strdup(arg); ++ } else { ++ ptr = strchr(name, '@'); ++ if (ptr) { ++ dsname = strdup(name); ++ dsname[ptr - name] = '\0'; ++ snapname = strdup(ptr + 1); ++ } else { ++ return (0); ++ } ++ } ++ ++ fullname = kmem_asprintf("%s@%s", dsname, snapname); ++ ++ error = zfs_sb_hold(dsname, FTAG, &zsb, B_FALSE); ++ if (error == 0) { ++ error = zfsctl_unmount_snapshot(zsb, fullname, MNT_FORCE); ++ zfs_sb_rele(zsb, FTAG); ++ ++ /* Allow ENOENT for consistency with upstream */ ++ if (error == ENOENT) ++ error = 0; ++ } ++ ++ strfree(dsname); ++ strfree(snapname); ++ strfree(fullname); ++ ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem, snaps must be under it ++ * zc_nvlist_src[_size] full names of snapshots to destroy ++ * zc_defer_destroy mark for deferred destroy ++ * ++ * outputs: ++ * zc_name on failure, name of failed snapshot ++ */ ++static int ++zfs_ioc_destroy_snaps_nvl(zfs_cmd_t *zc) ++{ ++ int err, len; ++ nvlist_t *nvl; ++ nvpair_t *pair; ++ ++ if ((err = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, ++ zc->zc_iflags, &nvl)) != 0) ++ return (err); ++ ++ len = strlen(zc->zc_name); ++ for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL; ++ pair = nvlist_next_nvpair(nvl, pair)) { ++ const char *name = nvpair_name(pair); ++ /* ++ * The snap name must be underneath the zc_name. This ensures ++ * that our permission checks were legitimate. ++ */ ++ if (strncmp(zc->zc_name, name, len) != 0 || ++ (name[len] != '@' && name[len] != '/')) { ++ nvlist_free(nvl); ++ return (EINVAL); ++ } ++ ++ (void) zfs_unmount_snap(name, NULL); ++ (void) zvol_remove_minor(name); ++ } ++ ++ err = dmu_snapshots_destroy_nvl(nvl, zc->zc_defer_destroy, ++ zc->zc_name); ++ nvlist_free(nvl); ++ return (err); ++} ++ ++/* ++ * inputs: ++ * zc_name name of dataset to destroy ++ * zc_objset_type type of objset ++ * zc_defer_destroy mark for deferred destroy ++ * ++ * outputs: none ++ */ ++static int ++zfs_ioc_destroy(zfs_cmd_t *zc) ++{ ++ int err; ++ if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { ++ err = zfs_unmount_snap(zc->zc_name, NULL); ++ if (err) ++ return (err); ++ } ++ ++ err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy); ++ if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) ++ (void) zvol_remove_minor(zc->zc_name); ++ return (err); ++} ++ ++/* ++ * inputs: ++ * zc_name name of dataset to rollback (to most recent snapshot) ++ * ++ * outputs: none ++ */ ++static int ++zfs_ioc_rollback(zfs_cmd_t *zc) ++{ ++ dsl_dataset_t *ds, *clone; ++ int error; ++ zfs_sb_t *zsb; ++ char *clone_name; ++ ++ error = dsl_dataset_hold(zc->zc_name, FTAG, &ds); ++ if (error) ++ return (error); ++ ++ /* must not be a snapshot */ ++ if (dsl_dataset_is_snapshot(ds)) { ++ dsl_dataset_rele(ds, FTAG); ++ return (EINVAL); ++ } ++ ++ /* must have a most recent snapshot */ ++ if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { ++ dsl_dataset_rele(ds, FTAG); ++ return (EINVAL); ++ } ++ ++ /* ++ * Create clone of most recent snapshot. ++ */ ++ clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name); ++ error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT); ++ if (error) ++ goto out; ++ ++ error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone); ++ if (error) ++ goto out; ++ ++ /* ++ * Do clone swap. 
++ */ ++ if (get_zfs_sb(zc->zc_name, &zsb) == 0) { ++ error = zfs_suspend_fs(zsb); ++ if (error == 0) { ++ int resume_err; ++ ++ if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { ++ error = dsl_dataset_clone_swap(clone, ds, ++ B_TRUE); ++ dsl_dataset_disown(ds, FTAG); ++ ds = NULL; ++ } else { ++ error = EBUSY; ++ } ++ resume_err = zfs_resume_fs(zsb, zc->zc_name); ++ error = error ? error : resume_err; ++ } ++ deactivate_super(zsb->z_sb); ++ } else { ++ if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { ++ error = dsl_dataset_clone_swap(clone, ds, B_TRUE); ++ dsl_dataset_disown(ds, FTAG); ++ ds = NULL; ++ } else { ++ error = EBUSY; ++ } ++ } ++ ++ /* ++ * Destroy clone (which also closes it). ++ */ ++ (void) dsl_dataset_destroy(clone, FTAG, B_FALSE); ++ ++out: ++ strfree(clone_name); ++ if (ds) ++ dsl_dataset_rele(ds, FTAG); ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name old name of dataset ++ * zc_value new name of dataset ++ * zc_cookie recursive flag (only valid for snapshots) ++ * ++ * outputs: none ++ */ ++static int ++zfs_ioc_rename(zfs_cmd_t *zc) ++{ ++ boolean_t recursive = zc->zc_cookie & 1; ++ int err; ++ ++ zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; ++ if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || ++ strchr(zc->zc_value, '%')) ++ return (EINVAL); ++ ++ /* ++ * Unmount snapshot unless we're doing a recursive rename, ++ * in which case the dataset code figures out which snapshots ++ * to unmount. ++ */ ++ if (!recursive && strchr(zc->zc_name, '@') != NULL && ++ zc->zc_objset_type == DMU_OST_ZFS) { ++ err = zfs_unmount_snap(zc->zc_name, NULL); ++ if (err) ++ return (err); ++ } ++ ++ err = dmu_objset_rename(zc->zc_name, zc->zc_value, recursive); ++ if ((err == 0) && (zc->zc_objset_type == DMU_OST_ZVOL)) { ++ (void) zvol_remove_minor(zc->zc_name); ++ (void) zvol_create_minor(zc->zc_value); ++ } ++ ++ return (err); ++} ++ ++static int ++zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) ++{ ++ const char *propname = nvpair_name(pair); ++ boolean_t issnap = (strchr(dsname, '@') != NULL); ++ zfs_prop_t prop = zfs_name_to_prop(propname); ++ uint64_t intval; ++ int err; ++ ++ if (prop == ZPROP_INVAL) { ++ if (zfs_prop_user(propname)) { ++ if ((err = zfs_secpolicy_write_perms(dsname, ++ ZFS_DELEG_PERM_USERPROP, cr))) ++ return (err); ++ return (0); ++ } ++ ++ if (!issnap && zfs_prop_userquota(propname)) { ++ const char *perm = NULL; ++ const char *uq_prefix = ++ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; ++ const char *gq_prefix = ++ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; ++ ++ if (strncmp(propname, uq_prefix, ++ strlen(uq_prefix)) == 0) { ++ perm = ZFS_DELEG_PERM_USERQUOTA; ++ } else if (strncmp(propname, gq_prefix, ++ strlen(gq_prefix)) == 0) { ++ perm = ZFS_DELEG_PERM_GROUPQUOTA; ++ } else { ++ /* USERUSED and GROUPUSED are read-only */ ++ return (EINVAL); ++ } ++ ++ if ((err = zfs_secpolicy_write_perms(dsname, perm, cr))) ++ return (err); ++ return (0); ++ } ++ ++ return (EINVAL); ++ } ++ ++ if (issnap) ++ return (EINVAL); ++ ++ if (nvpair_type(pair) == DATA_TYPE_NVLIST) { ++ /* ++ * dsl_prop_get_all_impl() returns properties in this ++ * format. ++ */ ++ nvlist_t *attrs; ++ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); ++ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, ++ &pair) == 0); ++ } ++ ++ /* ++ * Check that this value is valid for this pool version ++ */ ++ switch (prop) { ++ case ZFS_PROP_COMPRESSION: ++ /* ++ * If the user specified gzip compression, make sure ++ * the SPA supports it. 
We ignore any errors here since ++ * we'll catch them later. ++ */ ++ if (nvpair_type(pair) == DATA_TYPE_UINT64 && ++ nvpair_value_uint64(pair, &intval) == 0) { ++ if (intval >= ZIO_COMPRESS_GZIP_1 && ++ intval <= ZIO_COMPRESS_GZIP_9 && ++ zfs_earlier_version(dsname, ++ SPA_VERSION_GZIP_COMPRESSION)) { ++ return (ENOTSUP); ++ } ++ ++ if (intval == ZIO_COMPRESS_ZLE && ++ zfs_earlier_version(dsname, ++ SPA_VERSION_ZLE_COMPRESSION)) ++ return (ENOTSUP); ++ ++ /* ++ * If this is a bootable dataset then ++ * verify that the compression algorithm ++ * is supported for booting. We must return ++ * something other than ENOTSUP since it ++ * implies a downrev pool version. ++ */ ++ if (zfs_is_bootfs(dsname) && ++ !BOOTFS_COMPRESS_VALID(intval)) { ++ return (ERANGE); ++ } ++ } ++ break; ++ ++ case ZFS_PROP_COPIES: ++ if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) ++ return (ENOTSUP); ++ break; ++ ++ case ZFS_PROP_DEDUP: ++ if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) ++ return (ENOTSUP); ++ break; ++ ++ case ZFS_PROP_SHARESMB: ++ if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) ++ return (ENOTSUP); ++ break; ++ ++ case ZFS_PROP_ACLINHERIT: ++ if (nvpair_type(pair) == DATA_TYPE_UINT64 && ++ nvpair_value_uint64(pair, &intval) == 0) { ++ if (intval == ZFS_ACL_PASSTHROUGH_X && ++ zfs_earlier_version(dsname, ++ SPA_VERSION_PASSTHROUGH_X)) ++ return (ENOTSUP); ++ } ++ break; ++ default: ++ break; ++ } ++ ++ return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); ++} ++ ++/* ++ * Removes properties from the given props list that fail permission checks ++ * needed to clear them and to restore them in case of a receive error. For each ++ * property, make sure we have both set and inherit permissions. ++ * ++ * Returns the first error encountered if any permission checks fail. If the ++ * caller provides a non-NULL errlist, it also gives the complete list of names ++ * of all the properties that failed a permission check along with the ++ * corresponding error numbers. The caller is responsible for freeing the ++ * returned errlist. ++ * ++ * If every property checks out successfully, zero is returned and the list ++ * pointed at by errlist is NULL. 
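++ *
++ * As a rough illustration (property names and error values are
++ * hypothetical, not taken from a real pool), a returned errlist is a
++ * flat nvlist of int32 entries keyed by property name, e.g.:
++ *
++ *	errlist = {
++ *		"compression"	= EPERM,
++ *		"quota"		= EPERM
++ *	}
++ *
++ * and the int returned to the caller is the first of those errors.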
++ */ ++static int ++zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) ++{ ++ zfs_cmd_t *zc; ++ nvpair_t *pair, *next_pair; ++ nvlist_t *errors; ++ int err, rv = 0; ++ ++ if (props == NULL) ++ return (0); ++ ++ VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ ++ zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP | KM_NODEBUG); ++ (void) strcpy(zc->zc_name, dataset); ++ pair = nvlist_next_nvpair(props, NULL); ++ while (pair != NULL) { ++ next_pair = nvlist_next_nvpair(props, pair); ++ ++ (void) strcpy(zc->zc_value, nvpair_name(pair)); ++ if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || ++ (err = zfs_secpolicy_inherit(zc, CRED())) != 0) { ++ VERIFY(nvlist_remove_nvpair(props, pair) == 0); ++ VERIFY(nvlist_add_int32(errors, ++ zc->zc_value, err) == 0); ++ } ++ pair = next_pair; ++ } ++ kmem_free(zc, sizeof (zfs_cmd_t)); ++ ++ if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { ++ nvlist_free(errors); ++ errors = NULL; ++ } else { ++ VERIFY(nvpair_value_int32(pair, &rv) == 0); ++ } ++ ++ if (errlist == NULL) ++ nvlist_free(errors); ++ else ++ *errlist = errors; ++ ++ return (rv); ++} ++ ++static boolean_t ++propval_equals(nvpair_t *p1, nvpair_t *p2) ++{ ++ if (nvpair_type(p1) == DATA_TYPE_NVLIST) { ++ /* dsl_prop_get_all_impl() format */ ++ nvlist_t *attrs; ++ VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); ++ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, ++ &p1) == 0); ++ } ++ ++ if (nvpair_type(p2) == DATA_TYPE_NVLIST) { ++ nvlist_t *attrs; ++ VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); ++ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, ++ &p2) == 0); ++ } ++ ++ if (nvpair_type(p1) != nvpair_type(p2)) ++ return (B_FALSE); ++ ++ if (nvpair_type(p1) == DATA_TYPE_STRING) { ++ char *valstr1, *valstr2; ++ ++ VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); ++ VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); ++ return (strcmp(valstr1, valstr2) == 0); ++ } else { ++ uint64_t intval1, intval2; ++ ++ VERIFY(nvpair_value_uint64(p1, &intval1) == 0); ++ VERIFY(nvpair_value_uint64(p2, &intval2) == 0); ++ return (intval1 == intval2); ++ } ++} ++ ++/* ++ * Remove properties from props if they are not going to change (as determined ++ * by comparison with origprops). Remove them from origprops as well, since we ++ * do not need to clear or restore properties that won't change. 
++ */ ++static void ++props_reduce(nvlist_t *props, nvlist_t *origprops) ++{ ++ nvpair_t *pair, *next_pair; ++ ++ if (origprops == NULL) ++ return; /* all props need to be received */ ++ ++ pair = nvlist_next_nvpair(props, NULL); ++ while (pair != NULL) { ++ const char *propname = nvpair_name(pair); ++ nvpair_t *match; ++ ++ next_pair = nvlist_next_nvpair(props, pair); ++ ++ if ((nvlist_lookup_nvpair(origprops, propname, ++ &match) != 0) || !propval_equals(pair, match)) ++ goto next; /* need to set received value */ ++ ++ /* don't clear the existing received value */ ++ (void) nvlist_remove_nvpair(origprops, match); ++ /* don't bother receiving the property */ ++ (void) nvlist_remove_nvpair(props, pair); ++next: ++ pair = next_pair; ++ } ++} ++ ++#ifdef DEBUG ++static boolean_t zfs_ioc_recv_inject_err; ++#endif ++ ++/* ++ * inputs: ++ * zc_name name of containing filesystem ++ * zc_nvlist_src{_size} nvlist of properties to apply ++ * zc_value name of snapshot to create ++ * zc_string name of clone origin (if DRR_FLAG_CLONE) ++ * zc_cookie file descriptor to recv from ++ * zc_begin_record the BEGIN record of the stream (not byteswapped) ++ * zc_guid force flag ++ * zc_cleanup_fd cleanup-on-exit file descriptor ++ * zc_action_handle handle for this guid/ds mapping (or zero on first call) ++ * ++ * outputs: ++ * zc_cookie number of bytes read ++ * zc_nvlist_dst{_size} error for each unapplied received property ++ * zc_obj zprop_errflags_t ++ * zc_action_handle handle for this guid/ds mapping ++ */ ++static int ++zfs_ioc_recv(zfs_cmd_t *zc) ++{ ++ file_t *fp; ++ objset_t *os; ++ dmu_recv_cookie_t drc; ++ boolean_t force = (boolean_t)zc->zc_guid; ++ int fd; ++ int error = 0; ++ int props_error = 0; ++ nvlist_t *errors; ++ offset_t off; ++ nvlist_t *props = NULL; /* sent properties */ ++ nvlist_t *origprops = NULL; /* existing properties */ ++ objset_t *origin = NULL; ++ char *tosnap; ++ char tofs[ZFS_MAXNAMELEN]; ++ boolean_t first_recvd_props = B_FALSE; ++ ++ if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || ++ strchr(zc->zc_value, '@') == NULL || ++ strchr(zc->zc_value, '%')) ++ return (EINVAL); ++ ++ (void) strcpy(tofs, zc->zc_value); ++ tosnap = strchr(tofs, '@'); ++ *tosnap++ = '\0'; ++ ++ if (zc->zc_nvlist_src != 0 && ++ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, ++ zc->zc_iflags, &props)) != 0) ++ return (error); ++ ++ fd = zc->zc_cookie; ++ fp = getf(fd); ++ if (fp == NULL) { ++ nvlist_free(props); ++ return (EBADF); ++ } ++ ++ VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); ++ ++ if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) { ++ if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) && ++ !dsl_prop_get_hasrecvd(os)) { ++ first_recvd_props = B_TRUE; ++ } ++ ++ /* ++ * If new received properties are supplied, they are to ++ * completely replace the existing received properties, so stash ++ * away the existing ones. ++ */ ++ if (dsl_prop_get_received(os, &origprops) == 0) { ++ nvlist_t *errlist = NULL; ++ /* ++ * Don't bother writing a property if its value won't ++ * change (and avoid the unnecessary security checks). ++ * ++ * The first receive after SPA_VERSION_RECVD_PROPS is a ++ * special case where we blow away all local properties ++ * regardless. 
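++			 *
++			 * For example (hypothetical values): with
++			 *   props     = { compression=gzip, atime=off }
++			 *   origprops = { compression=gzip }
++			 * props_reduce() drops the matching compression pair
++			 * from both lists, so only atime is received and the
++			 * existing received compression value is left alone.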
++ */ ++ if (!first_recvd_props) ++ props_reduce(props, origprops); ++ if (zfs_check_clearable(tofs, origprops, ++ &errlist) != 0) ++ (void) nvlist_merge(errors, errlist, 0); ++ nvlist_free(errlist); ++ } ++ ++ dmu_objset_rele(os, FTAG); ++ } ++ ++ if (zc->zc_string[0]) { ++ error = dmu_objset_hold(zc->zc_string, FTAG, &origin); ++ if (error) ++ goto out; ++ } ++ ++ error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds, ++ &zc->zc_begin_record, force, origin, &drc); ++ if (origin) ++ dmu_objset_rele(origin, FTAG); ++ if (error) ++ goto out; ++ ++ /* ++ * Set properties before we receive the stream so that they are applied ++ * to the new data. Note that we must call dmu_recv_stream() if ++ * dmu_recv_begin() succeeds. ++ */ ++ if (props) { ++ nvlist_t *errlist; ++ ++ if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) { ++ if (drc.drc_newfs) { ++ if (spa_version(os->os_spa) >= ++ SPA_VERSION_RECVD_PROPS) ++ first_recvd_props = B_TRUE; ++ } else if (origprops != NULL) { ++ if (clear_received_props(os, tofs, origprops, ++ first_recvd_props ? NULL : props) != 0) ++ zc->zc_obj |= ZPROP_ERR_NOCLEAR; ++ } else { ++ zc->zc_obj |= ZPROP_ERR_NOCLEAR; ++ } ++ dsl_prop_set_hasrecvd(os); ++ } else if (!drc.drc_newfs) { ++ zc->zc_obj |= ZPROP_ERR_NOCLEAR; ++ } ++ ++ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, ++ props, &errlist); ++ (void) nvlist_merge(errors, errlist, 0); ++ nvlist_free(errlist); ++ } ++ ++ if (fit_error_list(zc, &errors) != 0 || put_nvlist(zc, errors) != 0) { ++ /* ++ * Caller made zc->zc_nvlist_dst less than the minimum expected ++ * size or supplied an invalid address. ++ */ ++ props_error = EINVAL; ++ } ++ ++ off = fp->f_offset; ++ error = dmu_recv_stream(&drc, fp->f_vnode, &off, zc->zc_cleanup_fd, ++ &zc->zc_action_handle); ++ ++ if (error == 0) { ++ zfs_sb_t *zsb = NULL; ++ ++ if (get_zfs_sb(tofs, &zsb) == 0) { ++ /* online recv */ ++ int end_err; ++ ++ error = zfs_suspend_fs(zsb); ++ /* ++ * If the suspend fails, then the recv_end will ++ * likely also fail, and clean up after itself. ++ */ ++ end_err = dmu_recv_end(&drc); ++ if (error == 0) ++ error = zfs_resume_fs(zsb, tofs); ++ error = error ? error : end_err; ++ deactivate_super(zsb->z_sb); ++ } else { ++ error = dmu_recv_end(&drc); ++ } ++ } ++ ++ zc->zc_cookie = off - fp->f_offset; ++ if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) ++ fp->f_offset = off; ++ ++#ifdef DEBUG ++ if (zfs_ioc_recv_inject_err) { ++ zfs_ioc_recv_inject_err = B_FALSE; ++ error = 1; ++ } ++#endif ++ /* ++ * On error, restore the original props. ++ */ ++ if (error && props) { ++ if (dmu_objset_hold(tofs, FTAG, &os) == 0) { ++ if (clear_received_props(os, tofs, props, NULL) != 0) { ++ /* ++ * We failed to clear the received properties. ++ * Since we may have left a $recvd value on the ++ * system, we can't clear the $hasrecvd flag. ++ */ ++ zc->zc_obj |= ZPROP_ERR_NORESTORE; ++ } else if (first_recvd_props) { ++ dsl_prop_unset_hasrecvd(os); ++ } ++ dmu_objset_rele(os, FTAG); ++ } else if (!drc.drc_newfs) { ++ /* We failed to clear the received properties. */ ++ zc->zc_obj |= ZPROP_ERR_NORESTORE; ++ } ++ ++ if (origprops == NULL && !drc.drc_newfs) { ++ /* We failed to stash the original properties. */ ++ zc->zc_obj |= ZPROP_ERR_NORESTORE; ++ } ++ ++ /* ++ * dsl_props_set() will not convert RECEIVED to LOCAL on or ++ * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL ++ * explictly if we're restoring local properties cleared in the ++ * first new-style receive. 
++ */ ++ if (origprops != NULL && ++ zfs_set_prop_nvlist(tofs, (first_recvd_props ? ++ ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), ++ origprops, NULL) != 0) { ++ /* ++ * We stashed the original properties but failed to ++ * restore them. ++ */ ++ zc->zc_obj |= ZPROP_ERR_NORESTORE; ++ } ++ } ++out: ++ nvlist_free(props); ++ nvlist_free(origprops); ++ nvlist_free(errors); ++ releasef(fd); ++ ++ if (error == 0) ++ error = props_error; ++ ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of snapshot to send ++ * zc_cookie file descriptor to send stream to ++ * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) ++ * zc_sendobj objsetid of snapshot to send ++ * zc_fromobj objsetid of incremental fromsnap (may be zero) ++ * zc_guid if set, estimate size of stream only. zc_cookie is ignored. ++ * output size in zc_objset_type. ++ * ++ * outputs: none ++ */ ++static int ++zfs_ioc_send(zfs_cmd_t *zc) ++{ ++ objset_t *fromsnap = NULL; ++ objset_t *tosnap; ++ int error; ++ offset_t off; ++ dsl_dataset_t *ds; ++ dsl_dataset_t *dsfrom = NULL; ++ spa_t *spa; ++ dsl_pool_t *dp; ++ boolean_t estimate = (zc->zc_guid != 0); ++ ++ error = spa_open(zc->zc_name, &spa, FTAG); ++ if (error) ++ return (error); ++ ++ dp = spa_get_dsl(spa); ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); ++ rw_exit(&dp->dp_config_rwlock); ++ if (error) { ++ spa_close(spa, FTAG); ++ return (error); ++ } ++ ++ error = dmu_objset_from_ds(ds, &tosnap); ++ if (error) { ++ dsl_dataset_rele(ds, FTAG); ++ spa_close(spa, FTAG); ++ return (error); ++ } ++ ++ if (zc->zc_fromobj != 0) { ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &dsfrom); ++ rw_exit(&dp->dp_config_rwlock); ++ spa_close(spa, FTAG); ++ if (error) { ++ dsl_dataset_rele(ds, FTAG); ++ return (error); ++ } ++ error = dmu_objset_from_ds(dsfrom, &fromsnap); ++ if (error) { ++ dsl_dataset_rele(dsfrom, FTAG); ++ dsl_dataset_rele(ds, FTAG); ++ return (error); ++ } ++ } else { ++ spa_close(spa, FTAG); ++ } ++ ++ if (estimate) { ++ error = dmu_send_estimate(tosnap, fromsnap, zc->zc_obj, ++ &zc->zc_objset_type); ++ } else { ++ file_t *fp = getf(zc->zc_cookie); ++ if (fp == NULL) { ++ dsl_dataset_rele(ds, FTAG); ++ if (dsfrom) ++ dsl_dataset_rele(dsfrom, FTAG); ++ return (EBADF); ++ } ++ ++ off = fp->f_offset; ++ error = dmu_send(tosnap, fromsnap, zc->zc_obj, ++ zc->zc_cookie, fp->f_vnode, &off); ++ ++ if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) ++ fp->f_offset = off; ++ releasef(zc->zc_cookie); ++ } ++ if (dsfrom) ++ dsl_dataset_rele(dsfrom, FTAG); ++ dsl_dataset_rele(ds, FTAG); ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of snapshot on which to report progress ++ * zc_cookie file descriptor of send stream ++ * ++ * outputs: ++ * zc_cookie number of bytes written in send stream thus far ++ */ ++static int ++zfs_ioc_send_progress(zfs_cmd_t *zc) ++{ ++ dsl_dataset_t *ds; ++ dmu_sendarg_t *dsp = NULL; ++ int error; ++ ++ if ((error = dsl_dataset_hold(zc->zc_name, FTAG, &ds)) != 0) ++ return (error); ++ ++ mutex_enter(&ds->ds_sendstream_lock); ++ ++ /* ++ * Iterate over all the send streams currently active on this dataset. ++ * If there's one which matches the specified file descriptor _and_ the ++ * stream was started by the current process, return the progress of ++ * that stream. 
++ */ ++ ++ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; ++ dsp = list_next(&ds->ds_sendstreams, dsp)) { ++ if (dsp->dsa_outfd == zc->zc_cookie && ++ dsp->dsa_proc->group_leader == curproc->group_leader) ++ break; ++ } ++ ++ if (dsp != NULL) ++ zc->zc_cookie = *(dsp->dsa_off); ++ else ++ error = ENOENT; ++ ++ mutex_exit(&ds->ds_sendstream_lock); ++ dsl_dataset_rele(ds, FTAG); ++ return (error); ++} ++ ++static int ++zfs_ioc_inject_fault(zfs_cmd_t *zc) ++{ ++ int id, error; ++ ++ error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, ++ &zc->zc_inject_record); ++ ++ if (error == 0) ++ zc->zc_guid = (uint64_t)id; ++ ++ return (error); ++} ++ ++static int ++zfs_ioc_clear_fault(zfs_cmd_t *zc) ++{ ++ return (zio_clear_fault((int)zc->zc_guid)); ++} ++ ++static int ++zfs_ioc_inject_list_next(zfs_cmd_t *zc) ++{ ++ int id = (int)zc->zc_guid; ++ int error; ++ ++ error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), ++ &zc->zc_inject_record); ++ ++ zc->zc_guid = id; ++ ++ return (error); ++} ++ ++static int ++zfs_ioc_error_log(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ int error; ++ size_t count = (size_t)zc->zc_nvlist_dst_size; ++ ++ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) ++ return (error); ++ ++ error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, ++ &count); ++ if (error == 0) ++ zc->zc_nvlist_dst_size = count; ++ else ++ zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); ++ ++ spa_close(spa, FTAG); ++ ++ return (error); ++} ++ ++static int ++zfs_ioc_clear(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ vdev_t *vd; ++ int error; ++ ++ /* ++ * On zpool clear we also fix up missing slogs ++ */ ++ mutex_enter(&spa_namespace_lock); ++ spa = spa_lookup(zc->zc_name); ++ if (spa == NULL) { ++ mutex_exit(&spa_namespace_lock); ++ return (EIO); ++ } ++ if (spa_get_log_state(spa) == SPA_LOG_MISSING) { ++ /* we need to let spa_open/spa_load clear the chains */ ++ spa_set_log_state(spa, SPA_LOG_CLEAR); ++ } ++ spa->spa_last_open_failed = 0; ++ mutex_exit(&spa_namespace_lock); ++ ++ if (zc->zc_cookie & ZPOOL_NO_REWIND) { ++ error = spa_open(zc->zc_name, &spa, FTAG); ++ } else { ++ nvlist_t *policy; ++ nvlist_t *config = NULL; ++ ++ if (zc->zc_nvlist_src == 0) ++ return (EINVAL); ++ ++ if ((error = get_nvlist(zc->zc_nvlist_src, ++ zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { ++ error = spa_open_rewind(zc->zc_name, &spa, FTAG, ++ policy, &config); ++ if (config != NULL) { ++ int err; ++ ++ if ((err = put_nvlist(zc, config)) != 0) ++ error = err; ++ nvlist_free(config); ++ } ++ nvlist_free(policy); ++ } ++ } ++ ++ if (error) ++ return (error); ++ ++ spa_vdev_state_enter(spa, SCL_NONE); ++ ++ if (zc->zc_guid == 0) { ++ vd = NULL; ++ } else { ++ vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); ++ if (vd == NULL) { ++ (void) spa_vdev_state_exit(spa, NULL, ENODEV); ++ spa_close(spa, FTAG); ++ return (ENODEV); ++ } ++ } ++ ++ vdev_clear(spa, vd); ++ ++ (void) spa_vdev_state_exit(spa, NULL, 0); ++ ++ /* ++ * Resume any suspended I/Os. ++ */ ++ if (zio_resume(spa) != 0) ++ error = EIO; ++ ++ spa_close(spa, FTAG); ++ ++ return (error); ++} ++ ++static int ++zfs_ioc_pool_reopen(zfs_cmd_t *zc) ++{ ++ spa_t *spa; ++ int error; ++ ++ error = spa_open(zc->zc_name, &spa, FTAG); ++ if (error) ++ return (error); ++ ++ spa_vdev_state_enter(spa, SCL_NONE); ++ ++ /* ++ * If a resilver is already in progress then set the ++ * spa_scrub_reopen flag to B_TRUE so that we don't restart ++ * the scan as a side effect of the reopen. 
Otherwise, let ++ * vdev_open() decided if a resilver is required. ++ */ ++ spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool); ++ vdev_reopen(spa->spa_root_vdev); ++ spa->spa_scrub_reopen = B_FALSE; ++ ++ (void) spa_vdev_state_exit(spa, NULL, 0); ++ spa_close(spa, FTAG); ++ return (0); ++} ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_value name of origin snapshot ++ * ++ * outputs: ++ * zc_string name of conflicting snapshot, if there is one ++ */ ++static int ++zfs_ioc_promote(zfs_cmd_t *zc) ++{ ++ char *cp; ++ ++ /* ++ * We don't need to unmount *all* the origin fs's snapshots, but ++ * it's easier. ++ */ ++ cp = strchr(zc->zc_value, '@'); ++ if (cp) ++ *cp = '\0'; ++ (void) dmu_objset_find(zc->zc_value, ++ zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); ++ return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); ++} ++ ++/* ++ * Retrieve a single {user|group}{used|quota}@... property. ++ * ++ * inputs: ++ * zc_name name of filesystem ++ * zc_objset_type zfs_userquota_prop_t ++ * zc_value domain name (eg. "S-1-234-567-89") ++ * zc_guid RID/UID/GID ++ * ++ * outputs: ++ * zc_cookie property value ++ */ ++static int ++zfs_ioc_userspace_one(zfs_cmd_t *zc) ++{ ++ zfs_sb_t *zsb; ++ int error; ++ ++ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) ++ return (EINVAL); ++ ++ error = zfs_sb_hold(zc->zc_name, FTAG, &zsb, B_FALSE); ++ if (error) ++ return (error); ++ ++ error = zfs_userspace_one(zsb, ++ zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); ++ zfs_sb_rele(zsb, FTAG); ++ ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_cookie zap cursor ++ * zc_objset_type zfs_userquota_prop_t ++ * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) ++ * ++ * outputs: ++ * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) ++ * zc_cookie zap cursor ++ */ ++static int ++zfs_ioc_userspace_many(zfs_cmd_t *zc) ++{ ++ zfs_sb_t *zsb; ++ int bufsize = zc->zc_nvlist_dst_size; ++ int error; ++ void *buf; ++ ++ if (bufsize <= 0) ++ return (ENOMEM); ++ ++ error = zfs_sb_hold(zc->zc_name, FTAG, &zsb, B_FALSE); ++ if (error) ++ return (error); ++ ++ buf = vmem_alloc(bufsize, KM_SLEEP); ++ ++ error = zfs_userspace_many(zsb, zc->zc_objset_type, &zc->zc_cookie, ++ buf, &zc->zc_nvlist_dst_size); ++ ++ if (error == 0) { ++ error = xcopyout(buf, ++ (void *)(uintptr_t)zc->zc_nvlist_dst, ++ zc->zc_nvlist_dst_size); ++ } ++ vmem_free(buf, bufsize); ++ zfs_sb_rele(zsb, FTAG); ++ ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * ++ * outputs: ++ * none ++ */ ++static int ++zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) ++{ ++ objset_t *os; ++ int error = 0; ++ zfs_sb_t *zsb; ++ ++ if (get_zfs_sb(zc->zc_name, &zsb) == 0) { ++ if (!dmu_objset_userused_enabled(zsb->z_os)) { ++ /* ++ * If userused is not enabled, it may be because the ++ * objset needs to be closed & reopened (to grow the ++ * objset_phys_t). Suspend/resume the fs will do that. 
++ */ ++ error = zfs_suspend_fs(zsb); ++ if (error == 0) ++ error = zfs_resume_fs(zsb, zc->zc_name); ++ } ++ if (error == 0) ++ error = dmu_objset_userspace_upgrade(zsb->z_os); ++ deactivate_super(zsb->z_sb); ++ } else { ++ /* XXX kind of reading contents without owning */ ++ error = dmu_objset_hold(zc->zc_name, FTAG, &os); ++ if (error) ++ return (error); ++ ++ error = dmu_objset_userspace_upgrade(os); ++ dmu_objset_rele(os, FTAG); ++ } ++ ++ return (error); ++} ++ ++static int ++zfs_ioc_share(zfs_cmd_t *zc) ++{ ++ return (ENOSYS); ++} ++ ++ace_t full_access[] = { ++ {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} ++}; ++ ++/* ++ * inputs: ++ * zc_name name of containing filesystem ++ * zc_obj object # beyond which we want next in-use object # ++ * ++ * outputs: ++ * zc_obj next in-use object # ++ */ ++static int ++zfs_ioc_next_obj(zfs_cmd_t *zc) ++{ ++ objset_t *os = NULL; ++ int error; ++ ++ error = dmu_objset_hold(zc->zc_name, FTAG, &os); ++ if (error) ++ return (error); ++ ++ error = dmu_object_next(os, &zc->zc_obj, B_FALSE, ++ os->os_dsl_dataset->ds_phys->ds_prev_snap_txg); ++ ++ dmu_objset_rele(os, FTAG); ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_value prefix name for snapshot ++ * zc_cleanup_fd cleanup-on-exit file descriptor for calling process ++ * ++ * outputs: ++ */ ++static int ++zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) ++{ ++ char *snap_name; ++ int error; ++ ++ snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, ++ (u_longlong_t)ddi_get_lbolt64()); ++ ++ if (strlen(snap_name) >= MAXNAMELEN) { ++ strfree(snap_name); ++ return (E2BIG); ++ } ++ ++ error = dmu_objset_snapshot(zc->zc_name, snap_name, snap_name, ++ NULL, B_FALSE, B_TRUE, zc->zc_cleanup_fd); ++ if (error != 0) { ++ strfree(snap_name); ++ return (error); ++ } ++ ++ (void) strcpy(zc->zc_value, snap_name); ++ strfree(snap_name); ++ return (0); ++} ++ ++/* ++ * inputs: ++ * zc_name name of "to" snapshot ++ * zc_value name of "from" snapshot ++ * zc_cookie file descriptor to write diff data on ++ * ++ * outputs: ++ * dmu_diff_record_t's to the file descriptor ++ */ ++static int ++zfs_ioc_diff(zfs_cmd_t *zc) ++{ ++ objset_t *fromsnap; ++ objset_t *tosnap; ++ file_t *fp; ++ offset_t off; ++ int error; ++ ++ error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap); ++ if (error) ++ return (error); ++ ++ error = dmu_objset_hold(zc->zc_value, FTAG, &fromsnap); ++ if (error) { ++ dmu_objset_rele(tosnap, FTAG); ++ return (error); ++ } ++ ++ fp = getf(zc->zc_cookie); ++ if (fp == NULL) { ++ dmu_objset_rele(fromsnap, FTAG); ++ dmu_objset_rele(tosnap, FTAG); ++ return (EBADF); ++ } ++ ++ off = fp->f_offset; ++ ++ error = dmu_diff(tosnap, fromsnap, fp->f_vnode, &off); ++ ++ if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) ++ fp->f_offset = off; ++ releasef(zc->zc_cookie); ++ ++ dmu_objset_rele(fromsnap, FTAG); ++ dmu_objset_rele(tosnap, FTAG); ++ return (error); ++} ++ ++/* ++ * Remove all ACL files in shares dir ++ */ ++#ifdef HAVE_SMB_SHARE ++static int ++zfs_smb_acl_purge(znode_t *dzp) ++{ ++ zap_cursor_t zc; ++ zap_attribute_t zap; ++ zfs_sb_t *zsb = ZTOZSB(dzp); ++ int error; ++ ++ for (zap_cursor_init(&zc, zsb->z_os, dzp->z_id); ++ (error = zap_cursor_retrieve(&zc, &zap)) == 0; ++ zap_cursor_advance(&zc)) { ++ if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred, ++ NULL, 0)) != 0) ++ break; ++ } ++ zap_cursor_fini(&zc); ++ return (error); ++} ++#endif /* HAVE_SMB_SHARE */ ++ ++static int ++zfs_ioc_smb_acl(zfs_cmd_t *zc) ++{ ++#ifdef HAVE_SMB_SHARE ++ vnode_t *vp; ++ znode_t *dzp; ++ 
vnode_t *resourcevp = NULL; ++ znode_t *sharedir; ++ zfs_sb_t *zsb; ++ nvlist_t *nvlist; ++ char *src, *target; ++ vattr_t vattr; ++ vsecattr_t vsec; ++ int error = 0; ++ ++ if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, ++ NO_FOLLOW, NULL, &vp)) != 0) ++ return (error); ++ ++ /* Now make sure mntpnt and dataset are ZFS */ ++ ++ if (vp->v_vfsp->vfs_fstype != zfsfstype || ++ (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), ++ zc->zc_name) != 0)) { ++ VN_RELE(vp); ++ return (EINVAL); ++ } ++ ++ dzp = VTOZ(vp); ++ zsb = ZTOZSB(dzp); ++ ZFS_ENTER(zsb); ++ ++ /* ++ * Create share dir if its missing. ++ */ ++ mutex_enter(&zsb->z_lock); ++ if (zsb->z_shares_dir == 0) { ++ dmu_tx_t *tx; ++ ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, ++ ZFS_SHARES_DIR); ++ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); ++ error = dmu_tx_assign(tx, TXG_WAIT); ++ if (error) { ++ dmu_tx_abort(tx); ++ } else { ++ error = zfs_create_share_dir(zsb, tx); ++ dmu_tx_commit(tx); ++ } ++ if (error) { ++ mutex_exit(&zsb->z_lock); ++ VN_RELE(vp); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ } ++ mutex_exit(&zsb->z_lock); ++ ++ ASSERT(zsb->z_shares_dir); ++ if ((error = zfs_zget(zsb, zsb->z_shares_dir, &sharedir)) != 0) { ++ VN_RELE(vp); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ switch (zc->zc_cookie) { ++ case ZFS_SMB_ACL_ADD: ++ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; ++ vattr.va_mode = S_IFREG|0777; ++ vattr.va_uid = 0; ++ vattr.va_gid = 0; ++ ++ vsec.vsa_mask = VSA_ACE; ++ vsec.vsa_aclentp = &full_access; ++ vsec.vsa_aclentsz = sizeof (full_access); ++ vsec.vsa_aclcnt = 1; ++ ++ error = VOP_CREATE(ZTOV(sharedir), zc->zc_string, ++ &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec); ++ if (resourcevp) ++ VN_RELE(resourcevp); ++ break; ++ ++ case ZFS_SMB_ACL_REMOVE: ++ error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred, ++ NULL, 0); ++ break; ++ ++ case ZFS_SMB_ACL_RENAME: ++ if ((error = get_nvlist(zc->zc_nvlist_src, ++ zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { ++ VN_RELE(vp); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) || ++ nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET, ++ &target)) { ++ VN_RELE(vp); ++ VN_RELE(ZTOV(sharedir)); ++ ZFS_EXIT(zsb); ++ nvlist_free(nvlist); ++ return (error); ++ } ++ error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, ++ kcred, NULL, 0); ++ nvlist_free(nvlist); ++ break; ++ ++ case ZFS_SMB_ACL_PURGE: ++ error = zfs_smb_acl_purge(sharedir); ++ break; ++ ++ default: ++ error = EINVAL; ++ break; ++ } ++ ++ VN_RELE(vp); ++ VN_RELE(ZTOV(sharedir)); ++ ++ ZFS_EXIT(zsb); ++ ++ return (error); ++#else ++ return (ENOTSUP); ++#endif /* HAVE_SMB_SHARE */ ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * zc_value short name of snap ++ * zc_string user-supplied tag for this hold ++ * zc_cookie recursive flag ++ * zc_temphold set if hold is temporary ++ * zc_cleanup_fd cleanup-on-exit file descriptor for calling process ++ * zc_sendobj if non-zero, the objid for zc_name@zc_value ++ * zc_createtxg if zc_sendobj is non-zero, snap must have zc_createtxg ++ * ++ * outputs: none ++ */ ++static int ++zfs_ioc_hold(zfs_cmd_t *zc) ++{ ++ boolean_t recursive = zc->zc_cookie; ++ spa_t *spa; ++ dsl_pool_t *dp; ++ dsl_dataset_t *ds; ++ int error; ++ minor_t minor = 0; ++ ++ if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) ++ return (EINVAL); ++ ++ if (zc->zc_sendobj == 0) { ++ return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value, ++ zc->zc_string, 
recursive, zc->zc_temphold, ++ zc->zc_cleanup_fd)); ++ } ++ ++ if (recursive) ++ return (EINVAL); ++ ++ error = spa_open(zc->zc_name, &spa, FTAG); ++ if (error) ++ return (error); ++ ++ dp = spa_get_dsl(spa); ++ rw_enter(&dp->dp_config_rwlock, RW_READER); ++ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); ++ rw_exit(&dp->dp_config_rwlock); ++ spa_close(spa, FTAG); ++ if (error) ++ return (error); ++ ++ /* ++ * Until we have a hold on this snapshot, it's possible that ++ * zc_sendobj could've been destroyed and reused as part ++ * of a later txg. Make sure we're looking at the right object. ++ */ ++ if (zc->zc_createtxg != ds->ds_phys->ds_creation_txg) { ++ dsl_dataset_rele(ds, FTAG); ++ return (ENOENT); ++ } ++ ++ if (zc->zc_cleanup_fd != -1 && zc->zc_temphold) { ++ error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); ++ if (error) { ++ dsl_dataset_rele(ds, FTAG); ++ return (error); ++ } ++ } ++ ++ error = dsl_dataset_user_hold_for_send(ds, zc->zc_string, ++ zc->zc_temphold); ++ if (minor != 0) { ++ if (error == 0) { ++ dsl_register_onexit_hold_cleanup(ds, zc->zc_string, ++ minor); ++ } ++ zfs_onexit_fd_rele(zc->zc_cleanup_fd); ++ } ++ dsl_dataset_rele(ds, FTAG); ++ ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name name of dataset from which we're releasing a user hold ++ * zc_value short name of snap ++ * zc_string user-supplied tag for this hold ++ * zc_cookie recursive flag ++ * ++ * outputs: none ++ */ ++static int ++zfs_ioc_release(zfs_cmd_t *zc) ++{ ++ boolean_t recursive = zc->zc_cookie; ++ ++ if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) ++ return (EINVAL); ++ ++ return (dsl_dataset_user_release(zc->zc_name, zc->zc_value, ++ zc->zc_string, recursive)); ++} ++ ++/* ++ * inputs: ++ * zc_name name of filesystem ++ * ++ * outputs: ++ * zc_nvlist_src{_size} nvlist of snapshot holds ++ */ ++static int ++zfs_ioc_get_holds(zfs_cmd_t *zc) ++{ ++ nvlist_t *nvp; ++ int error; ++ ++ if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) { ++ error = put_nvlist(zc, nvp); ++ nvlist_free(nvp); ++ } ++ ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_guid flags (ZEVENT_NONBLOCK) ++ * ++ * outputs: ++ * zc_nvlist_dst next nvlist event ++ * zc_cookie dropped events since last get ++ * zc_cleanup_fd cleanup-on-exit file descriptor ++ */ ++static int ++zfs_ioc_events_next(zfs_cmd_t *zc) ++{ ++ zfs_zevent_t *ze; ++ nvlist_t *event = NULL; ++ minor_t minor; ++ uint64_t dropped = 0; ++ int error; ++ ++ error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); ++ if (error != 0) ++ return (error); ++ ++ do { ++ error = zfs_zevent_next(ze, &event, ++ &zc->zc_nvlist_dst_size, &dropped); ++ if (event != NULL) { ++ zc->zc_cookie = dropped; ++ error = put_nvlist(zc, event); ++ nvlist_free(event); ++ } ++ ++ if (zc->zc_guid & ZEVENT_NONBLOCK) ++ break; ++ ++ if ((error == 0) || (error != ENOENT)) ++ break; ++ ++ error = zfs_zevent_wait(ze); ++ if (error) ++ break; ++ } while (1); ++ ++ zfs_zevent_fd_rele(zc->zc_cleanup_fd); ++ ++ return (error); ++} ++ ++/* ++ * outputs: ++ * zc_cookie cleared events count ++ */ ++static int ++zfs_ioc_events_clear(zfs_cmd_t *zc) ++{ ++ int count; ++ ++ zfs_zevent_drain_all(&count); ++ zc->zc_cookie = count; ++ ++ return 0; ++} ++ ++/* ++ * inputs: ++ * zc_name name of new filesystem or snapshot ++ * zc_value full name of old snapshot ++ * ++ * outputs: ++ * zc_cookie space in bytes ++ * zc_objset_type compressed space in bytes ++ * zc_perm_action uncompressed space in bytes ++ */ ++static int ++zfs_ioc_space_written(zfs_cmd_t *zc) ++{ 
++ int error; ++ dsl_dataset_t *new, *old; ++ ++ error = dsl_dataset_hold(zc->zc_name, FTAG, &new); ++ if (error != 0) ++ return (error); ++ error = dsl_dataset_hold(zc->zc_value, FTAG, &old); ++ if (error != 0) { ++ dsl_dataset_rele(new, FTAG); ++ return (error); ++ } ++ ++ error = dsl_dataset_space_written(old, new, &zc->zc_cookie, ++ &zc->zc_objset_type, &zc->zc_perm_action); ++ dsl_dataset_rele(old, FTAG); ++ dsl_dataset_rele(new, FTAG); ++ return (error); ++} ++ ++/* ++ * inputs: ++ * zc_name full name of last snapshot ++ * zc_value full name of first snapshot ++ * ++ * outputs: ++ * zc_cookie space in bytes ++ * zc_objset_type compressed space in bytes ++ * zc_perm_action uncompressed space in bytes ++ */ ++static int ++zfs_ioc_space_snaps(zfs_cmd_t *zc) ++{ ++ int error; ++ dsl_dataset_t *new, *old; ++ ++ error = dsl_dataset_hold(zc->zc_name, FTAG, &new); ++ if (error != 0) ++ return (error); ++ error = dsl_dataset_hold(zc->zc_value, FTAG, &old); ++ if (error != 0) { ++ dsl_dataset_rele(new, FTAG); ++ return (error); ++ } ++ ++ error = dsl_dataset_space_wouldfree(old, new, &zc->zc_cookie, ++ &zc->zc_objset_type, &zc->zc_perm_action); ++ dsl_dataset_rele(old, FTAG); ++ dsl_dataset_rele(new, FTAG); ++ return (error); ++} ++ ++/* ++ * pool create, destroy, and export don't log the history as part of ++ * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export ++ * do the logging of those commands. ++ */ ++static zfs_ioc_vec_t zfs_ioc_vec[] = { ++ { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, ++ POOL_CHECK_READONLY }, ++ { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE, ++ POOL_CHECK_SUSPENDED }, ++ { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { 
zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, ++ POOL_CHECK_SUSPENDED }, ++ { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, ++ POOL_CHECK_SUSPENDED }, ++ { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_create_minor, zfs_secpolicy_config, DATASET_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_remove_minor, zfs_secpolicy_config, DATASET_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_destroy_snaps_nvl, zfs_secpolicy_destroy_recursive, ++ DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, POOL_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_obj_to_path, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, ++ POOL_CHECK_SUSPENDED }, ++ { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, DATASET_NAME, ++ B_FALSE, POOL_CHECK_NONE }, ++ { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many, DATASET_NAME, ++ B_FALSE, POOL_CHECK_NONE }, ++ { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, ++ DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE, ++ POOL_CHECK_SUSPENDED }, ++ { 
zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_next_obj, zfs_secpolicy_read, DATASET_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_diff, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, DATASET_NAME, ++ B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_obj_to_stats, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, ++ POOL_CHECK_SUSPENDED }, ++ { zfs_ioc_events_next, zfs_secpolicy_config, NO_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE, ++ POOL_CHECK_NONE }, ++ { zfs_ioc_pool_reguid, zfs_secpolicy_config, POOL_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, ++ { zfs_ioc_space_written, zfs_secpolicy_read, DATASET_NAME, B_FALSE, ++ POOL_CHECK_SUSPENDED }, ++ { zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, B_FALSE, ++ POOL_CHECK_SUSPENDED }, ++ { zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, B_TRUE, ++ POOL_CHECK_SUSPENDED }, ++ { zfs_ioc_send_progress, zfs_secpolicy_read, DATASET_NAME, B_FALSE, ++ POOL_CHECK_NONE } ++}; ++ ++int ++pool_status_check(const char *name, zfs_ioc_namecheck_t type, ++ zfs_ioc_poolcheck_t check) ++{ ++ spa_t *spa; ++ int error; ++ ++ ASSERT(type == POOL_NAME || type == DATASET_NAME); ++ ++ if (check & POOL_CHECK_NONE) ++ return (0); ++ ++ error = spa_open(name, &spa, FTAG); ++ if (error == 0) { ++ if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) ++ error = EAGAIN; ++ else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) ++ error = EROFS; ++ spa_close(spa, FTAG); ++ } ++ return (error); ++} ++ ++static void * ++zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which) ++{ ++ zfsdev_state_t *zs; ++ ++ ASSERT(MUTEX_HELD(&zfsdev_state_lock)); ++ ++ for (zs = list_head(&zfsdev_state_list); zs != NULL; ++ zs = list_next(&zfsdev_state_list, zs)) { ++ if (zs->zs_minor == minor) { ++ switch (which) { ++ case ZST_ONEXIT: return (zs->zs_onexit); ++ case ZST_ZEVENT: return (zs->zs_zevent); ++ case ZST_ALL: return (zs); ++ } ++ } ++ } ++ ++ return NULL; ++} ++ ++void * ++zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) ++{ ++ void *ptr; ++ ++ mutex_enter(&zfsdev_state_lock); ++ ptr = zfsdev_get_state_impl(minor, which); ++ mutex_exit(&zfsdev_state_lock); ++ ++ return ptr; ++} ++ ++minor_t ++zfsdev_getminor(struct file *filp) ++{ ++ ASSERT(filp != NULL); ++ ASSERT(filp->private_data != NULL); ++ ++ return (((zfsdev_state_t *)filp->private_data)->zs_minor); ++} ++ ++/* ++ * Find a free minor number. The zfsdev_state_list is expected to ++ * be short since it is only a list of currently open file handles. 
++ */ ++minor_t ++zfsdev_minor_alloc(void) ++{ ++ static minor_t last_minor = 0; ++ minor_t m; ++ ++ ASSERT(MUTEX_HELD(&zfsdev_state_lock)); ++ ++ for (m = last_minor + 1; m != last_minor; m++) { ++ if (m > ZFSDEV_MAX_MINOR) ++ m = 1; ++ if (zfsdev_get_state_impl(m, ZST_ALL) == NULL) { ++ last_minor = m; ++ return (m); ++ } ++ } ++ ++ return (0); ++} ++ ++static int ++zfsdev_state_init(struct file *filp) ++{ ++ zfsdev_state_t *zs; ++ minor_t minor; ++ ++ ASSERT(MUTEX_HELD(&zfsdev_state_lock)); ++ ++ minor = zfsdev_minor_alloc(); ++ if (minor == 0) ++ return (ENXIO); ++ ++ zs = kmem_zalloc( sizeof(zfsdev_state_t), KM_SLEEP); ++ if (zs == NULL) ++ return (ENOMEM); ++ ++ zs->zs_file = filp; ++ zs->zs_minor = minor; ++ filp->private_data = zs; ++ ++ zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); ++ zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); ++ ++ list_insert_tail(&zfsdev_state_list, zs); ++ ++ return (0); ++} ++ ++static int ++zfsdev_state_destroy(struct file *filp) ++{ ++ zfsdev_state_t *zs; ++ ++ ASSERT(MUTEX_HELD(&zfsdev_state_lock)); ++ ASSERT(filp->private_data != NULL); ++ ++ zs = filp->private_data; ++ zfs_onexit_destroy(zs->zs_onexit); ++ zfs_zevent_destroy(zs->zs_zevent); ++ ++ list_remove(&zfsdev_state_list, zs); ++ kmem_free(zs, sizeof(zfsdev_state_t)); ++ ++ return 0; ++} ++ ++static int ++zfsdev_open(struct inode *ino, struct file *filp) ++{ ++ int error; ++ ++ mutex_enter(&zfsdev_state_lock); ++ error = zfsdev_state_init(filp); ++ mutex_exit(&zfsdev_state_lock); ++ ++ return (-error); ++} ++ ++static int ++zfsdev_release(struct inode *ino, struct file *filp) ++{ ++ int error; ++ ++ mutex_enter(&zfsdev_state_lock); ++ error = zfsdev_state_destroy(filp); ++ mutex_exit(&zfsdev_state_lock); ++ ++ return (-error); ++} ++ ++static long ++zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) ++{ ++ zfs_cmd_t *zc; ++ uint_t vec; ++ int error, rc, flag = 0; ++ ++ vec = cmd - ZFS_IOC; ++ if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) ++ return (-EINVAL); ++ ++ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP | KM_NODEBUG); ++ ++ error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); ++ if (error != 0) ++ error = EFAULT; ++ ++ if ((error == 0) && !(flag & FKIOCTL)) ++ error = zfs_ioc_vec[vec].zvec_secpolicy(zc, CRED()); ++ ++ /* ++ * Ensure that all pool/dataset names are valid before we pass down to ++ * the lower layers. 
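++	 *
++	 * As a sketch of the dispatch path (illustrative, not exhaustive):
++	 * a caller opens the control node (typically /dev/zfs), fills in a
++	 * zfs_cmd_t and issues an ioctl whose command number is ZFS_IOC + n;
++	 * the handler, security policy, name type and pool-state checks are
++	 * then all taken from zfs_ioc_vec[n].  For instance slot 0 pairs
++	 * zfs_ioc_pool_create with zfs_secpolicy_config, POOL_NAME and
++	 * POOL_CHECK_NONE.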
++ */ ++ if (error == 0) { ++ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; ++ zc->zc_iflags = flag & FKIOCTL; ++ switch (zfs_ioc_vec[vec].zvec_namecheck) { ++ case POOL_NAME: ++ if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) ++ error = EINVAL; ++ error = pool_status_check(zc->zc_name, ++ zfs_ioc_vec[vec].zvec_namecheck, ++ zfs_ioc_vec[vec].zvec_pool_check); ++ break; ++ ++ case DATASET_NAME: ++ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) ++ error = EINVAL; ++ error = pool_status_check(zc->zc_name, ++ zfs_ioc_vec[vec].zvec_namecheck, ++ zfs_ioc_vec[vec].zvec_pool_check); ++ break; ++ ++ case NO_NAME: ++ break; ++ } ++ } ++ ++ if (error == 0) ++ error = zfs_ioc_vec[vec].zvec_func(zc); ++ ++ rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); ++ if (error == 0) { ++ if (rc != 0) ++ error = EFAULT; ++ if (zfs_ioc_vec[vec].zvec_his_log) ++ zfs_log_history(zc); ++ } ++ ++ kmem_free(zc, sizeof (zfs_cmd_t)); ++ return (-error); ++} ++ ++#ifdef CONFIG_COMPAT ++static long ++zfsdev_compat_ioctl(struct file *filp, unsigned cmd, unsigned long arg) ++{ ++ return zfsdev_ioctl(filp, cmd, arg); ++} ++#else ++#define zfsdev_compat_ioctl NULL ++#endif ++ ++static const struct file_operations zfsdev_fops = { ++ .open = zfsdev_open, ++ .release = zfsdev_release, ++ .unlocked_ioctl = zfsdev_ioctl, ++ .compat_ioctl = zfsdev_compat_ioctl, ++ .owner = THIS_MODULE, ++}; ++ ++static struct miscdevice zfs_misc = { ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = ZFS_DRIVER, ++ .fops = &zfsdev_fops, ++}; ++ ++static int ++zfs_attach(void) ++{ ++ int error; ++ ++ mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); ++ list_create(&zfsdev_state_list, sizeof (zfsdev_state_t), ++ offsetof(zfsdev_state_t, zs_next)); ++ ++ error = misc_register(&zfs_misc); ++ if (error) { ++ printk(KERN_INFO "ZFS: misc_register() failed %d\n", error); ++ return (error); ++ } ++ ++ return (0); ++} ++ ++static void ++zfs_detach(void) ++{ ++ int error; ++ ++ error = misc_deregister(&zfs_misc); ++ if (error) ++ printk(KERN_INFO "ZFS: misc_deregister() failed %d\n", error); ++ ++ mutex_destroy(&zfsdev_state_lock); ++ list_destroy(&zfsdev_state_list); ++} ++ ++uint_t zfs_fsyncer_key; ++extern uint_t rrw_tsd_key; ++ ++#ifdef DEBUG ++#define ZFS_DEBUG_STR " (DEBUG mode)" ++#else ++#define ZFS_DEBUG_STR "" ++#endif ++ ++int ++_init(void) ++{ ++ int error; ++ ++ spa_init(FREAD | FWRITE); ++ zfs_init(); ++ ++ if ((error = zvol_init()) != 0) ++ goto out1; ++ ++ if ((error = zfs_attach()) != 0) ++ goto out2; ++ ++ tsd_create(&zfs_fsyncer_key, NULL); ++ tsd_create(&rrw_tsd_key, NULL); ++ ++ printk(KERN_NOTICE "ZFS: Loaded module v%s-%s%s, " ++ "ZFS pool version %s, ZFS filesystem version %s\n", ++ ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR, ++ SPA_VERSION_STRING, ZPL_VERSION_STRING); ++ ++ return (0); ++ ++out2: ++ (void) zvol_fini(); ++out1: ++ zfs_fini(); ++ spa_fini(); ++ printk(KERN_NOTICE "ZFS: Failed to Load ZFS Filesystem v%s-%s%s" ++ ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE, ++ ZFS_DEBUG_STR, error); ++ ++ return (error); ++} ++ ++int ++_fini(void) ++{ ++ zfs_detach(); ++ zvol_fini(); ++ zfs_fini(); ++ spa_fini(); ++ ++ tsd_destroy(&zfs_fsyncer_key); ++ tsd_destroy(&rrw_tsd_key); ++ ++ printk(KERN_NOTICE "ZFS: Unloaded module v%s-%s%s\n", ++ ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR); ++ ++ return (0); ++} ++ ++#ifdef HAVE_SPL ++spl_module_init(_init); ++spl_module_exit(_fini); ++ ++MODULE_DESCRIPTION("ZFS"); ++MODULE_AUTHOR(ZFS_META_AUTHOR); ++MODULE_LICENSE(ZFS_META_LICENSE); ++#endif /* HAVE_SPL 
*/ +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_log.c linux-3.2.33-go/fs/zfs/zfs/zfs_log.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_log.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_log.c 2012-11-16 23:25:34.348039346 +0100 +@@ -0,0 +1,682 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * These zfs_log_* functions must be called within a dmu tx, in one ++ * of 2 contexts depending on zilog->z_replay: ++ * ++ * Non replay mode ++ * --------------- ++ * We need to record the transaction so that if it is committed to ++ * the Intent Log then it can be replayed. An intent log transaction ++ * structure (itx_t) is allocated and all the information necessary to ++ * possibly replay the transaction is saved in it. The itx is then assigned ++ * a sequence number and inserted in the in-memory list anchored in the zilog. ++ * ++ * Replay mode ++ * ----------- ++ * We need to mark the intent log record as replayed in the log header. ++ * This is done in the same transaction as the replay so that they ++ * commit atomically. ++ */ ++ ++int ++zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap) ++{ ++ int isxvattr = (vap->va_mask & ATTR_XVATTR); ++ switch (type) { ++ case Z_FILE: ++ if (vsecp == NULL && !isxvattr) ++ return (TX_CREATE); ++ if (vsecp && isxvattr) ++ return (TX_CREATE_ACL_ATTR); ++ if (vsecp) ++ return (TX_CREATE_ACL); ++ else ++ return (TX_CREATE_ATTR); ++ /*NOTREACHED*/ ++ case Z_DIR: ++ if (vsecp == NULL && !isxvattr) ++ return (TX_MKDIR); ++ if (vsecp && isxvattr) ++ return (TX_MKDIR_ACL_ATTR); ++ if (vsecp) ++ return (TX_MKDIR_ACL); ++ else ++ return (TX_MKDIR_ATTR); ++ case Z_XATTRDIR: ++ return (TX_MKXATTR); ++ } ++ ASSERT(0); ++ return (TX_MAX_TYPE); ++} ++ ++/* ++ * build up the log data necessary for logging xvattr_t ++ * First lr_attr_t is initialized. following the lr_attr_t ++ * is the mapsize and attribute bitmap copied from the xvattr_t. ++ * Following the bitmap and bitmapsize two 64 bit words are reserved ++ * for the create time which may be set. Following the create time ++ * records a single 64 bit integer which has the bits to set on ++ * replay for the xvattr. 
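To make the record layout just described concrete, the toy program below builds and walks such a variable-length region in the same order zfs_log_xvattr() does: the mask size, that many 32-bit bitmap words, then a 64-bit word of packed attribute bits followed by two 64-bit create-time words (the real record also reserves room for an AV scan stamp after these, which this sketch omits). The buffer and values are invented; only the pointer arithmetic mirrors the code that follows.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

int
main(void)
{
    uint32_t masksize = 3;    /* stand-in for xva_mapsize */
    size_t sz = sizeof (uint32_t) * (1 + masksize) + 3 * sizeof (uint64_t);
    uint32_t *rec = calloc(1, sz);
    uint32_t *bitmap;
    uint64_t *attrs, *crtime;
    uint32_t i;

    rec[0] = masksize;
    bitmap = &rec[1];
    for (i = 0; i != masksize; i++, bitmap++)
        *bitmap = 0xf0 | i;          /* stand-in for xva_reqattrmap[] */

    attrs = (uint64_t *)bitmap;      /* packed attribute bits */
    crtime = attrs + 1;              /* two words of create time */
    *attrs = 0x5;
    crtime[0] = 1353104734;          /* seconds */
    crtime[1] = 0;                   /* nanoseconds */

    printf("record is %zu bytes, attrs=0x%llx\n",
        sz, (unsigned long long)*attrs);
    free(rec);
    return (0);
}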
++ */ ++static void ++zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) ++{ ++ uint32_t *bitmap; ++ uint64_t *attrs; ++ uint64_t *crtime; ++ xoptattr_t *xoap; ++ void *scanstamp; ++ int i; ++ ++ xoap = xva_getxoptattr(xvap); ++ ASSERT(xoap); ++ ++ lrattr->lr_attr_masksize = xvap->xva_mapsize; ++ bitmap = &lrattr->lr_attr_bitmap; ++ for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) { ++ *bitmap = xvap->xva_reqattrmap[i]; ++ } ++ ++ /* Now pack the attributes up in a single uint64_t */ ++ attrs = (uint64_t *)bitmap; ++ crtime = attrs + 1; ++ scanstamp = (caddr_t)(crtime + 2); ++ *attrs = 0; ++ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) ++ *attrs |= (xoap->xoa_readonly == 0) ? 0 : ++ XAT0_READONLY; ++ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) ++ *attrs |= (xoap->xoa_hidden == 0) ? 0 : ++ XAT0_HIDDEN; ++ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) ++ *attrs |= (xoap->xoa_system == 0) ? 0 : ++ XAT0_SYSTEM; ++ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) ++ *attrs |= (xoap->xoa_archive == 0) ? 0 : ++ XAT0_ARCHIVE; ++ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) ++ *attrs |= (xoap->xoa_immutable == 0) ? 0 : ++ XAT0_IMMUTABLE; ++ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) ++ *attrs |= (xoap->xoa_nounlink == 0) ? 0 : ++ XAT0_NOUNLINK; ++ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) ++ *attrs |= (xoap->xoa_appendonly == 0) ? 0 : ++ XAT0_APPENDONLY; ++ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) ++ *attrs |= (xoap->xoa_opaque == 0) ? 0 : ++ XAT0_APPENDONLY; ++ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) ++ *attrs |= (xoap->xoa_nodump == 0) ? 0 : ++ XAT0_NODUMP; ++ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) ++ *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : ++ XAT0_AV_QUARANTINED; ++ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) ++ *attrs |= (xoap->xoa_av_modified == 0) ? 0 : ++ XAT0_AV_MODIFIED; ++ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) ++ ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); ++ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ++ bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); ++ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) ++ *attrs |= (xoap->xoa_reparse == 0) ? 0 : ++ XAT0_REPARSE; ++ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) ++ *attrs |= (xoap->xoa_offline == 0) ? 0 : ++ XAT0_OFFLINE; ++ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) ++ *attrs |= (xoap->xoa_sparse == 0) ? 0 : ++ XAT0_SPARSE; ++} ++ ++static void * ++zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start) ++{ ++ zfs_fuid_t *zfuid; ++ uint64_t *fuidloc = start; ++ ++ /* First copy in the ACE FUIDs */ ++ for (zfuid = list_head(&fuidp->z_fuids); zfuid; ++ zfuid = list_next(&fuidp->z_fuids, zfuid)) { ++ *fuidloc++ = zfuid->z_logfuid; ++ } ++ return (fuidloc); ++} ++ ++ ++static void * ++zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) ++{ ++ zfs_fuid_domain_t *zdomain; ++ ++ /* now copy in the domain info, if any */ ++ if (fuidp->z_domain_str_sz != 0) { ++ for (zdomain = list_head(&fuidp->z_domains); zdomain; ++ zdomain = list_next(&fuidp->z_domains, zdomain)) { ++ bcopy((void *)zdomain->z_domain, start, ++ strlen(zdomain->z_domain) + 1); ++ start = (caddr_t)start + ++ strlen(zdomain->z_domain) + 1; ++ } ++ } ++ return (start); ++} ++ ++/* ++ * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, ++ * TX_MKDIR_ATTR and TX_MKXATTR ++ * transactions. ++ * ++ * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID ++ * domain information appended prior to the name. In this case the ++ * uid/gid in the log record will be a log centric FUID. 
++ * ++ * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that ++ * may contain attributes, ACL and optional fuid information. ++ * ++ * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify ++ * and ACL and normal users/groups in the ACEs. ++ * ++ * There may be an optional xvattr attribute information similar ++ * to zfs_log_setattr. ++ * ++ * Also, after the file name "domain" strings may be appended. ++ */ ++void ++zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, ++ znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp, ++ zfs_fuid_info_t *fuidp, vattr_t *vap) ++{ ++ itx_t *itx; ++ lr_create_t *lr; ++ lr_acl_create_t *lracl; ++ xvattr_t *xvap = (xvattr_t *)vap; ++ size_t aclsize = 0; ++ size_t xvatsize = 0; ++ size_t txsize; ++ void *end; ++ size_t lrsize; ++ size_t namesize = strlen(name) + 1; ++ size_t fuidsz = 0; ++ ++ if (zil_replaying(zilog, tx)) ++ return; ++ ++ /* ++ * If we have FUIDs present then add in space for ++ * domains and ACE fuid's if any. ++ */ ++ if (fuidp) { ++ fuidsz += fuidp->z_domain_str_sz; ++ fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t); ++ } ++ ++ if (vap->va_mask & ATTR_XVATTR) ++ xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize); ++ ++ if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR || ++ (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR || ++ (int)txtype == TX_MKXATTR) { ++ txsize = sizeof (*lr) + namesize + fuidsz + xvatsize; ++ lrsize = sizeof (*lr); ++ } else { ++ aclsize = (vsecp) ? vsecp->vsa_aclentsz : 0; ++ txsize = ++ sizeof (lr_acl_create_t) + namesize + fuidsz + ++ ZIL_ACE_LENGTH(aclsize) + xvatsize; ++ lrsize = sizeof (lr_acl_create_t); ++ } ++ ++ itx = zil_itx_create(txtype, txsize); ++ ++ lr = (lr_create_t *)&itx->itx_lr; ++ lr->lr_doid = dzp->z_id; ++ lr->lr_foid = zp->z_id; ++ lr->lr_mode = zp->z_mode; ++ if (!IS_EPHEMERAL(zp->z_uid)) { ++ lr->lr_uid = (uint64_t)zp->z_uid; ++ } else { ++ lr->lr_uid = fuidp->z_fuid_owner; ++ } ++ if (!IS_EPHEMERAL(zp->z_gid)) { ++ lr->lr_gid = (uint64_t)zp->z_gid; ++ } else { ++ lr->lr_gid = fuidp->z_fuid_group; ++ } ++ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen, ++ sizeof (uint64_t)); ++ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), ++ lr->lr_crtime, sizeof (uint64_t) * 2); ++ ++ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(zp)), &lr->lr_rdev, ++ sizeof (lr->lr_rdev)) != 0) ++ lr->lr_rdev = 0; ++ ++ /* ++ * Fill in xvattr info if any ++ */ ++ if (vap->va_mask & ATTR_XVATTR) { ++ zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap); ++ end = (caddr_t)lr + lrsize + xvatsize; ++ } else { ++ end = (caddr_t)lr + lrsize; ++ } ++ ++ /* Now fill in any ACL info */ ++ ++ if (vsecp) { ++ lracl = (lr_acl_create_t *)&itx->itx_lr; ++ lracl->lr_aclcnt = vsecp->vsa_aclcnt; ++ lracl->lr_acl_bytes = aclsize; ++ lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; ++ lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; ++ if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS) ++ lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; ++ else ++ lracl->lr_acl_flags = 0; ++ ++ bcopy(vsecp->vsa_aclentp, end, aclsize); ++ end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize); ++ } ++ ++ /* drop in FUID info */ ++ if (fuidp) { ++ end = zfs_log_fuid_ids(fuidp, end); ++ end = zfs_log_fuid_domains(fuidp, end); ++ } ++ /* ++ * Now place file name in log record ++ */ ++ bcopy(name, end, namesize); ++ ++ zil_itx_assign(zilog, itx, tx); ++} ++ ++/* ++ * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. 
++ */ ++void ++zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, ++ znode_t *dzp, char *name, uint64_t foid) ++{ ++ itx_t *itx; ++ lr_remove_t *lr; ++ size_t namesize = strlen(name) + 1; ++ ++ if (zil_replaying(zilog, tx)) ++ return; ++ ++ itx = zil_itx_create(txtype, sizeof (*lr) + namesize); ++ lr = (lr_remove_t *)&itx->itx_lr; ++ lr->lr_doid = dzp->z_id; ++ bcopy(name, (char *)(lr + 1), namesize); ++ ++ itx->itx_oid = foid; ++ ++ zil_itx_assign(zilog, itx, tx); ++} ++ ++/* ++ * zfs_log_link() handles TX_LINK transactions. ++ */ ++void ++zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, ++ znode_t *dzp, znode_t *zp, char *name) ++{ ++ itx_t *itx; ++ lr_link_t *lr; ++ size_t namesize = strlen(name) + 1; ++ ++ if (zil_replaying(zilog, tx)) ++ return; ++ ++ itx = zil_itx_create(txtype, sizeof (*lr) + namesize); ++ lr = (lr_link_t *)&itx->itx_lr; ++ lr->lr_doid = dzp->z_id; ++ lr->lr_link_obj = zp->z_id; ++ bcopy(name, (char *)(lr + 1), namesize); ++ ++ zil_itx_assign(zilog, itx, tx); ++} ++ ++/* ++ * zfs_log_symlink() handles TX_SYMLINK transactions. ++ */ ++void ++zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, ++ znode_t *dzp, znode_t *zp, char *name, char *link) ++{ ++ itx_t *itx; ++ lr_create_t *lr; ++ size_t namesize = strlen(name) + 1; ++ size_t linksize = strlen(link) + 1; ++ ++ if (zil_replaying(zilog, tx)) ++ return; ++ ++ itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); ++ lr = (lr_create_t *)&itx->itx_lr; ++ lr->lr_doid = dzp->z_id; ++ lr->lr_foid = zp->z_id; ++ lr->lr_uid = zp->z_uid; ++ lr->lr_gid = zp->z_gid; ++ lr->lr_mode = zp->z_mode; ++ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen, ++ sizeof (uint64_t)); ++ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), ++ lr->lr_crtime, sizeof (uint64_t) * 2); ++ bcopy(name, (char *)(lr + 1), namesize); ++ bcopy(link, (char *)(lr + 1) + namesize, linksize); ++ ++ zil_itx_assign(zilog, itx, tx); ++} ++ ++/* ++ * zfs_log_rename() handles TX_RENAME transactions. ++ */ ++void ++zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, ++ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) ++{ ++ itx_t *itx; ++ lr_rename_t *lr; ++ size_t snamesize = strlen(sname) + 1; ++ size_t dnamesize = strlen(dname) + 1; ++ ++ if (zil_replaying(zilog, tx)) ++ return; ++ ++ itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); ++ lr = (lr_rename_t *)&itx->itx_lr; ++ lr->lr_sdoid = sdzp->z_id; ++ lr->lr_tdoid = tdzp->z_id; ++ bcopy(sname, (char *)(lr + 1), snamesize); ++ bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); ++ itx->itx_oid = szp->z_id; ++ ++ zil_itx_assign(zilog, itx, tx); ++} ++ ++/* ++ * zfs_log_write() handles TX_WRITE transactions. ++ */ ++long zfs_immediate_write_sz = 32768; ++ ++void ++zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, ++ znode_t *zp, offset_t off, ssize_t resid, int ioflag) ++{ ++ itx_wr_state_t write_state; ++ boolean_t slogging; ++ uintptr_t fsync_cnt; ++ ssize_t immediate_write_sz; ++ ++ if (zil_replaying(zilog, tx) || zp->z_unlinked) ++ return; ++ ++ immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) ++ ? 
0 : (ssize_t)zfs_immediate_write_sz; ++ ++ slogging = spa_has_slogs(zilog->zl_spa) && ++ (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); ++ if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz) ++ write_state = WR_INDIRECT; ++ else if (ioflag & (FSYNC | FDSYNC)) ++ write_state = WR_COPIED; ++ else ++ write_state = WR_NEED_COPY; ++ ++ if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { ++ (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); ++ } ++ ++ while (resid) { ++ itx_t *itx; ++ lr_write_t *lr; ++ ssize_t len; ++ ++ /* ++ * If the write would overflow the largest block then split it. ++ */ ++ if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA) ++ len = SPA_MAXBLOCKSIZE >> 1; ++ else ++ len = resid; ++ ++ itx = zil_itx_create(txtype, sizeof (*lr) + ++ (write_state == WR_COPIED ? len : 0)); ++ lr = (lr_write_t *)&itx->itx_lr; ++ if (write_state == WR_COPIED && dmu_read(ZTOZSB(zp)->z_os, ++ zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { ++ zil_itx_destroy(itx); ++ itx = zil_itx_create(txtype, sizeof (*lr)); ++ lr = (lr_write_t *)&itx->itx_lr; ++ write_state = WR_NEED_COPY; ++ } ++ ++ itx->itx_wr_state = write_state; ++ if (write_state == WR_NEED_COPY) ++ itx->itx_sod += len; ++ lr->lr_foid = zp->z_id; ++ lr->lr_offset = off; ++ lr->lr_length = len; ++ lr->lr_blkoff = 0; ++ BP_ZERO(&lr->lr_blkptr); ++ ++ itx->itx_private = ZTOZSB(zp); ++ ++ if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) && ++ (fsync_cnt == 0)) ++ itx->itx_sync = B_FALSE; ++ ++ zil_itx_assign(zilog, itx, tx); ++ ++ off += len; ++ resid -= len; ++ } ++} ++ ++/* ++ * zfs_log_truncate() handles TX_TRUNCATE transactions. ++ */ ++void ++zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, ++ znode_t *zp, uint64_t off, uint64_t len) ++{ ++ itx_t *itx; ++ lr_truncate_t *lr; ++ ++ if (zil_replaying(zilog, tx) || zp->z_unlinked) ++ return; ++ ++ itx = zil_itx_create(txtype, sizeof (*lr)); ++ lr = (lr_truncate_t *)&itx->itx_lr; ++ lr->lr_foid = zp->z_id; ++ lr->lr_offset = off; ++ lr->lr_length = len; ++ ++ itx->itx_sync = (zp->z_sync_cnt != 0); ++ zil_itx_assign(zilog, itx, tx); ++} ++ ++/* ++ * zfs_log_setattr() handles TX_SETATTR transactions. 
++ */ ++void ++zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, ++ znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) ++{ ++ itx_t *itx; ++ lr_setattr_t *lr; ++ xvattr_t *xvap = (xvattr_t *)vap; ++ size_t recsize = sizeof (lr_setattr_t); ++ void *start; ++ ++ if (zil_replaying(zilog, tx) || zp->z_unlinked) ++ return; ++ ++ /* ++ * If XVATTR set, then log record size needs to allow ++ * for lr_attr_t + xvattr mask, mapsize and create time ++ * plus actual attribute values ++ */ ++ if (vap->va_mask & ATTR_XVATTR) ++ recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize); ++ ++ if (fuidp) ++ recsize += fuidp->z_domain_str_sz; ++ ++ itx = zil_itx_create(txtype, recsize); ++ lr = (lr_setattr_t *)&itx->itx_lr; ++ lr->lr_foid = zp->z_id; ++ lr->lr_mask = (uint64_t)mask_applied; ++ lr->lr_mode = (uint64_t)vap->va_mode; ++ if ((mask_applied & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ++ lr->lr_uid = fuidp->z_fuid_owner; ++ else ++ lr->lr_uid = (uint64_t)vap->va_uid; ++ ++ if ((mask_applied & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ++ lr->lr_gid = fuidp->z_fuid_group; ++ else ++ lr->lr_gid = (uint64_t)vap->va_gid; ++ ++ lr->lr_size = (uint64_t)vap->va_size; ++ ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); ++ ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); ++ start = (lr_setattr_t *)(lr + 1); ++ if (vap->va_mask & ATTR_XVATTR) { ++ zfs_log_xvattr((lr_attr_t *)start, xvap); ++ start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize); ++ } ++ ++ /* ++ * Now stick on domain information if any on end ++ */ ++ ++ if (fuidp) ++ (void) zfs_log_fuid_domains(fuidp, start); ++ ++ itx->itx_sync = (zp->z_sync_cnt != 0); ++ zil_itx_assign(zilog, itx, tx); ++} ++ ++/* ++ * zfs_log_acl() handles TX_ACL transactions. ++ */ ++void ++zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, ++ vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) ++{ ++ itx_t *itx; ++ lr_acl_v0_t *lrv0; ++ lr_acl_t *lr; ++ int txtype; ++ int lrsize; ++ size_t txsize; ++ size_t aclbytes = vsecp->vsa_aclentsz; ++ ++ if (zil_replaying(zilog, tx) || zp->z_unlinked) ++ return; ++ ++ txtype = (ZTOZSB(zp)->z_version < ZPL_VERSION_FUID) ? ++ TX_ACL_V0 : TX_ACL; ++ ++ if (txtype == TX_ACL) ++ lrsize = sizeof (*lr); ++ else ++ lrsize = sizeof (*lrv0); ++ ++ txsize = lrsize + ++ ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) + ++ (fuidp ? fuidp->z_domain_str_sz : 0) + ++ sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0); ++ ++ itx = zil_itx_create(txtype, txsize); ++ ++ lr = (lr_acl_t *)&itx->itx_lr; ++ lr->lr_foid = zp->z_id; ++ if (txtype == TX_ACL) { ++ lr->lr_acl_bytes = aclbytes; ++ lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; ++ lr->lr_fuidcnt = fuidp ? 
fuidp->z_fuid_cnt : 0; ++ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) ++ lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; ++ else ++ lr->lr_acl_flags = 0; ++ } ++ lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt; ++ ++ if (txtype == TX_ACL_V0) { ++ lrv0 = (lr_acl_v0_t *)lr; ++ bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes); ++ } else { ++ void *start = (ace_t *)(lr + 1); ++ ++ bcopy(vsecp->vsa_aclentp, start, aclbytes); ++ ++ start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes); ++ ++ if (fuidp) { ++ start = zfs_log_fuid_ids(fuidp, start); ++ (void) zfs_log_fuid_domains(fuidp, start); ++ } ++ } ++ ++ itx->itx_sync = (zp->z_sync_cnt != 0); ++ zil_itx_assign(zilog, itx, tx); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++module_param(zfs_immediate_write_sz, long, 0644); ++MODULE_PARM_DESC(zfs_immediate_write_sz, "Largest data block to write to zil"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_onexit.c linux-3.2.33-go/fs/zfs/zfs/zfs_onexit.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_onexit.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_onexit.c 2012-11-16 23:25:34.347039358 +0100 +@@ -0,0 +1,247 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * ZFS kernel routines may add/delete callback routines to be invoked ++ * upon process exit (triggered via the close operation from the /dev/zfs ++ * driver). ++ * ++ * These cleanup callbacks are intended to allow for the accumulation ++ * of kernel state across multiple ioctls. User processes participate ++ * simply by opening ZFS_DEV. This causes the ZFS driver to do create ++ * some private data for the file descriptor and generating a unique ++ * minor number. The process then passes along that file descriptor to ++ * each ioctl that might have a cleanup operation. ++ * ++ * Consumers of the onexit routines should call zfs_onexit_fd_hold() early ++ * on to validate the given fd and add a reference to its file table entry. ++ * This allows the consumer to do its work and then add a callback, knowing ++ * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers ++ * should call zfs_onexit_fd_rele(). ++ * ++ * A simple example is zfs_ioc_recv(), where we might create an AVL tree ++ * with dataset/GUID mappings and then reuse that tree on subsequent ++ * zfs_ioc_recv() calls. ++ * ++ * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc() ++ * the AVL tree and pass it along with a callback function to ++ * zfs_onexit_add_cb(). 
The zfs_onexit_add_cb() routine will register the ++ * callback and return an action handle. ++ * ++ * The action handle is then passed from user space to subsequent ++ * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree ++ * by calling zfs_onexit_cb_data() with the device minor number and ++ * action handle. ++ * ++ * If the user process exits abnormally, the callback is invoked implicitly ++ * as part of the driver close operation. Once the user space process is ++ * finished with the accumulated kernel state, it can also just call close(2) ++ * on the cleanup fd to trigger the cleanup callback. ++ */ ++ ++void ++zfs_onexit_init(zfs_onexit_t **zop) ++{ ++ zfs_onexit_t *zo; ++ ++ zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP); ++ mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL); ++ list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t), ++ offsetof(zfs_onexit_action_node_t, za_link)); ++} ++ ++void ++zfs_onexit_destroy(zfs_onexit_t *zo) ++{ ++ zfs_onexit_action_node_t *ap; ++ ++ mutex_enter(&zo->zo_lock); ++ while ((ap = list_head(&zo->zo_actions)) != NULL) { ++ list_remove(&zo->zo_actions, ap); ++ mutex_exit(&zo->zo_lock); ++ ap->za_func(ap->za_data); ++ kmem_free(ap, sizeof (zfs_onexit_action_node_t)); ++ mutex_enter(&zo->zo_lock); ++ } ++ mutex_exit(&zo->zo_lock); ++ ++ list_destroy(&zo->zo_actions); ++ mutex_destroy(&zo->zo_lock); ++ kmem_free(zo, sizeof (zfs_onexit_t)); ++} ++ ++static int ++zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) ++{ ++ *zo = zfsdev_get_state(minor, ZST_ONEXIT); ++ if (*zo == NULL) ++ return (EBADF); ++ ++ return (0); ++} ++ ++/* ++ * Consumers might need to operate by minor number instead of fd, since ++ * they might be running in another thread (e.g. txg_sync_thread). Callers ++ * of this function must call zfs_onexit_fd_rele() when they're finished ++ * using the minor number. ++ */ ++int ++zfs_onexit_fd_hold(int fd, minor_t *minorp) ++{ ++ file_t *fp; ++ zfs_onexit_t *zo; ++ ++ fp = getf(fd); ++ if (fp == NULL) ++ return (EBADF); ++ ++ *minorp = zfsdev_getminor(fp->f_file); ++ return (zfs_onexit_minor_to_state(*minorp, &zo)); ++} ++ ++void ++zfs_onexit_fd_rele(int fd) ++{ ++ releasef(fd); ++} ++ ++/* ++ * Add a callback to be invoked when the calling process exits. ++ */ ++int ++zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, ++ uint64_t *action_handle) ++{ ++ zfs_onexit_t *zo; ++ zfs_onexit_action_node_t *ap; ++ int error; ++ ++ error = zfs_onexit_minor_to_state(minor, &zo); ++ if (error) ++ return (error); ++ ++ ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP); ++ list_link_init(&ap->za_link); ++ ap->za_func = func; ++ ap->za_data = data; ++ ++ mutex_enter(&zo->zo_lock); ++ list_insert_tail(&zo->zo_actions, ap); ++ mutex_exit(&zo->zo_lock); ++ if (action_handle) ++ *action_handle = (uint64_t)(uintptr_t)ap; ++ ++ return (0); ++} ++ ++static zfs_onexit_action_node_t * ++zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle) ++{ ++ zfs_onexit_action_node_t *match; ++ zfs_onexit_action_node_t *ap; ++ list_t *l; ++ ++ ASSERT(MUTEX_HELD(&zo->zo_lock)); ++ ++ match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle; ++ l = &zo->zo_actions; ++ for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) { ++ if (match == ap) ++ break; ++ } ++ return (ap); ++} ++ ++/* ++ * Delete the callback, triggering it first if 'fire' is set. 
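The cleanup model described in the comment above — register a callback keyed by an action handle, and have every still-registered callback fire automatically on the final close — can be pictured with a small user-space analogue. Everything below (names, the fixed-size table, the handle encoding) is invented for illustration; it is not how the in-kernel zfs_onexit code stores its actions, which uses the mutex-protected list shown above.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* One registered cleanup action: a callback plus its private data. */
struct onexit_action {
    void (*func)(void *);
    void *data;
};

#define MAX_ACTIONS 8
static struct onexit_action actions[MAX_ACTIONS];

/* Register a callback; returns an opaque handle (here: slot + 1), 0 on failure. */
static uint64_t
onexit_add_cb(void (*func)(void *), void *data)
{
    int i;

    for (i = 0; i < MAX_ACTIONS; i++) {
        if (actions[i].func == NULL) {
            actions[i].func = func;
            actions[i].data = data;
            return ((uint64_t)i + 1);
        }
    }
    return (0);
}

/* "Close": fire and clear every remaining callback, like the driver release path. */
static void
onexit_close(void)
{
    int i;

    for (i = 0; i < MAX_ACTIONS; i++) {
        if (actions[i].func != NULL) {
            actions[i].func(actions[i].data);
            actions[i].func = NULL;
        }
    }
}

static void
free_state(void *data)
{
    printf("cleaning up %s\n", (char *)data);
    free(data);
}

int
main(void)
{
    char *state = strdup("receive state");
    uint64_t handle = onexit_add_cb(free_state, state);

    printf("registered action handle %llu\n", (unsigned long long)handle);
    onexit_close();  /* callbacks run even if the consumer never cleaned up */
    return (0);
}

The point of the handle is the same in both cases: a later call can find and remove exactly the action it registered, while anything left behind is still cleaned up on close.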
++ */ ++int ++zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) ++{ ++ zfs_onexit_t *zo; ++ zfs_onexit_action_node_t *ap; ++ int error; ++ ++ error = zfs_onexit_minor_to_state(minor, &zo); ++ if (error) ++ return (error); ++ ++ mutex_enter(&zo->zo_lock); ++ ap = zfs_onexit_find_cb(zo, action_handle); ++ if (ap != NULL) { ++ list_remove(&zo->zo_actions, ap); ++ mutex_exit(&zo->zo_lock); ++ if (fire) ++ ap->za_func(ap->za_data); ++ kmem_free(ap, sizeof (zfs_onexit_action_node_t)); ++ } else { ++ mutex_exit(&zo->zo_lock); ++ error = ENOENT; ++ } ++ ++ return (error); ++} ++ ++/* ++ * Return the data associated with this callback. This allows consumers ++ * of the cleanup-on-exit interfaces to stash kernel data across system ++ * calls, knowing that it will be cleaned up if the calling process exits. ++ */ ++int ++zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) ++{ ++ zfs_onexit_t *zo; ++ zfs_onexit_action_node_t *ap; ++ int error; ++ ++ *data = NULL; ++ ++ error = zfs_onexit_minor_to_state(minor, &zo); ++ if (error) ++ return (error); ++ ++ mutex_enter(&zo->zo_lock); ++ ap = zfs_onexit_find_cb(zo, action_handle); ++ if (ap != NULL) ++ *data = ap->za_data; ++ else ++ error = ENOENT; ++ mutex_exit(&zo->zo_lock); ++ ++ return (error); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_replay.c linux-3.2.33-go/fs/zfs/zfs/zfs_replay.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_replay.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_replay.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,935 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2012 Cyril Plisko. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Functions to replay ZFS intent log (ZIL) records ++ * The functions are called through a function vector (zfs_replay_vector) ++ * which is indexed by the transaction type. ++ */ ++ ++static void ++zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, ++ uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) ++{ ++ bzero(vap, sizeof (*vap)); ++ vap->va_mask = (uint_t)mask; ++ vap->va_type = IFTOVT(mode); ++ vap->va_mode = mode; ++ vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; ++ vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? 
-1 : gid; ++ vap->va_rdev = rdev; ++ vap->va_nodeid = nodeid; ++} ++ ++/* ARGSUSED */ ++static int ++zfs_replay_error(zfs_sb_t *zsb, lr_t *lr, boolean_t byteswap) ++{ ++ return (ENOTSUP); ++} ++ ++static void ++zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) ++{ ++ xoptattr_t *xoap = NULL; ++ uint64_t *attrs; ++ uint64_t *crtime; ++ uint32_t *bitmap; ++ void *scanstamp; ++ int i; ++ ++ xvap->xva_vattr.va_mask |= ATTR_XVATTR; ++ if ((xoap = xva_getxoptattr(xvap)) == NULL) { ++ xvap->xva_vattr.va_mask &= ~ATTR_XVATTR; /* shouldn't happen */ ++ return; ++ } ++ ++ ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); ++ ++ bitmap = &lrattr->lr_attr_bitmap; ++ for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) ++ xvap->xva_reqattrmap[i] = *bitmap; ++ ++ attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); ++ crtime = attrs + 1; ++ scanstamp = (caddr_t)(crtime + 2); ++ ++ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) ++ xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); ++ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) ++ xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); ++ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) ++ xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); ++ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) ++ xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); ++ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) ++ xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); ++ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) ++ xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); ++ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) ++ xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); ++ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) ++ xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); ++ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) ++ xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); ++ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) ++ xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); ++ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) ++ xoap->xoa_av_quarantined = ++ ((*attrs & XAT0_AV_QUARANTINED) != 0); ++ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) ++ ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); ++ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ++ bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); ++ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) ++ xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); ++ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) ++ xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0); ++ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) ++ xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0); ++} ++ ++static int ++zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) ++{ ++ uint64_t uid_idx; ++ uint64_t gid_idx; ++ int domcnt = 0; ++ ++ uid_idx = FUID_INDEX(uid); ++ gid_idx = FUID_INDEX(gid); ++ if (uid_idx) ++ domcnt++; ++ if (gid_idx > 0 && gid_idx != uid_idx) ++ domcnt++; ++ ++ return (domcnt); ++} ++ ++static void * ++zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, ++ int domcnt) ++{ ++ int i; ++ ++ for (i = 0; i != domcnt; i++) { ++ fuid_infop->z_domain_table[i] = start; ++ start = (caddr_t)start + strlen(start) + 1; ++ } ++ ++ return (start); ++} ++ ++/* ++ * Set the uid/gid in the fuid_info structure. 
++ */ ++static void ++zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid) ++{ ++ /* ++ * If owner or group are log specific FUIDs then slurp up ++ * domain information and build zfs_fuid_info_t ++ */ ++ if (IS_EPHEMERAL(uid)) ++ fuid_infop->z_fuid_owner = uid; ++ ++ if (IS_EPHEMERAL(gid)) ++ fuid_infop->z_fuid_group = gid; ++} ++ ++/* ++ * Load fuid domains into fuid_info_t ++ */ ++static zfs_fuid_info_t * ++zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid) ++{ ++ int domcnt; ++ ++ zfs_fuid_info_t *fuid_infop; ++ ++ fuid_infop = zfs_fuid_info_alloc(); ++ ++ domcnt = zfs_replay_domain_cnt(uid, gid); ++ ++ if (domcnt == 0) ++ return (fuid_infop); ++ ++ fuid_infop->z_domain_table = ++ kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); ++ ++ zfs_replay_fuid_ugid(fuid_infop, uid, gid); ++ ++ fuid_infop->z_domain_cnt = domcnt; ++ *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt); ++ return (fuid_infop); ++} ++ ++/* ++ * load zfs_fuid_t's and fuid_domains into fuid_info_t ++ */ ++static zfs_fuid_info_t * ++zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid, ++ uint64_t gid) ++{ ++ uint64_t *log_fuid = (uint64_t *)start; ++ zfs_fuid_info_t *fuid_infop; ++ int i; ++ ++ fuid_infop = zfs_fuid_info_alloc(); ++ fuid_infop->z_domain_cnt = domcnt; ++ ++ fuid_infop->z_domain_table = ++ kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP); ++ ++ for (i = 0; i != idcnt; i++) { ++ zfs_fuid_t *zfuid; ++ ++ zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); ++ zfuid->z_logfuid = *log_fuid; ++ zfuid->z_id = -1; ++ zfuid->z_domidx = 0; ++ list_insert_tail(&fuid_infop->z_fuids, zfuid); ++ log_fuid++; ++ } ++ ++ zfs_replay_fuid_ugid(fuid_infop, uid, gid); ++ ++ *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt); ++ return (fuid_infop); ++} ++ ++static void ++zfs_replay_swap_attrs(lr_attr_t *lrattr) ++{ ++ /* swap the lr_attr structure */ ++ byteswap_uint32_array(lrattr, sizeof (*lrattr)); ++ /* swap the bitmap */ ++ byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) * ++ sizeof (uint32_t)); ++ /* swap the attributes, create time + 64 bit word for attributes */ ++ byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) * ++ (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t)); ++} ++ ++/* ++ * Replay file create with optional ACL, xvattr information as well ++ * as option FUID information. 
++ */ ++static int ++zfs_replay_create_acl(zfs_sb_t *zsb, lr_acl_create_t *lracl, boolean_t byteswap) ++{ ++ char *name = NULL; /* location determined later */ ++ lr_create_t *lr = (lr_create_t *)lracl; ++ znode_t *dzp; ++ struct inode *ip = NULL; ++ xvattr_t xva; ++ int vflg = 0; ++ vsecattr_t vsec = { 0 }; ++ lr_attr_t *lrattr; ++ void *aclstart; ++ void *fuidstart; ++ size_t xvatlen = 0; ++ uint64_t txtype; ++ int error; ++ ++ txtype = (lr->lr_common.lrc_txtype & ~TX_CI); ++ if (byteswap) { ++ byteswap_uint64_array(lracl, sizeof (*lracl)); ++ if (txtype == TX_CREATE_ACL_ATTR || ++ txtype == TX_MKDIR_ACL_ATTR) { ++ lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); ++ zfs_replay_swap_attrs(lrattr); ++ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); ++ } ++ ++ aclstart = (caddr_t)(lracl + 1) + xvatlen; ++ zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); ++ /* swap fuids */ ++ if (lracl->lr_fuidcnt) { ++ byteswap_uint64_array((caddr_t)aclstart + ++ ZIL_ACE_LENGTH(lracl->lr_acl_bytes), ++ lracl->lr_fuidcnt * sizeof (uint64_t)); ++ } ++ } ++ ++ if ((error = zfs_zget(zsb, lr->lr_doid, &dzp)) != 0) ++ return (error); ++ ++ xva_init(&xva); ++ zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, ++ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); ++ ++ /* ++ * All forms of zfs create (create, mkdir, mkxattrdir, symlink) ++ * eventually end up in zfs_mknode(), which assigns the object's ++ * creation time and generation number. The generic zfs_create() ++ * doesn't have either concept, so we smuggle the values inside ++ * the vattr's otherwise unused va_ctime and va_nblocks fields. ++ */ ++ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); ++ xva.xva_vattr.va_nblocks = lr->lr_gen; ++ ++ error = dmu_object_info(zsb->z_os, lr->lr_foid, NULL); ++ if (error != ENOENT) ++ goto bail; ++ ++ if (lr->lr_common.lrc_txtype & TX_CI) ++ vflg |= FIGNORECASE; ++ switch (txtype) { ++ case TX_CREATE_ACL: ++ aclstart = (caddr_t)(lracl + 1); ++ fuidstart = (caddr_t)aclstart + ++ ZIL_ACE_LENGTH(lracl->lr_acl_bytes); ++ zsb->z_fuid_replay = zfs_replay_fuids(fuidstart, ++ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, ++ lr->lr_uid, lr->lr_gid); ++ /*FALLTHROUGH*/ ++ case TX_CREATE_ACL_ATTR: ++ if (name == NULL) { ++ lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); ++ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); ++ xva.xva_vattr.va_mask |= ATTR_XVATTR; ++ zfs_replay_xvattr(lrattr, &xva); ++ } ++ vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; ++ vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; ++ vsec.vsa_aclcnt = lracl->lr_aclcnt; ++ vsec.vsa_aclentsz = lracl->lr_acl_bytes; ++ vsec.vsa_aclflags = lracl->lr_acl_flags; ++ if (zsb->z_fuid_replay == NULL) { ++ fuidstart = (caddr_t)(lracl + 1) + xvatlen + ++ ZIL_ACE_LENGTH(lracl->lr_acl_bytes); ++ zsb->z_fuid_replay = ++ zfs_replay_fuids(fuidstart, ++ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, ++ lr->lr_uid, lr->lr_gid); ++ } ++ ++ error = zfs_create(ZTOI(dzp), name, &xva.xva_vattr, ++ 0, 0, &ip, kcred, vflg, &vsec); ++ break; ++ case TX_MKDIR_ACL: ++ aclstart = (caddr_t)(lracl + 1); ++ fuidstart = (caddr_t)aclstart + ++ ZIL_ACE_LENGTH(lracl->lr_acl_bytes); ++ zsb->z_fuid_replay = zfs_replay_fuids(fuidstart, ++ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, ++ lr->lr_uid, lr->lr_gid); ++ /*FALLTHROUGH*/ ++ case TX_MKDIR_ACL_ATTR: ++ if (name == NULL) { ++ lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); ++ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); ++ zfs_replay_xvattr(lrattr, &xva); ++ } ++ vsec.vsa_mask = 
VSA_ACE | VSA_ACE_ACLFLAGS; ++ vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; ++ vsec.vsa_aclcnt = lracl->lr_aclcnt; ++ vsec.vsa_aclentsz = lracl->lr_acl_bytes; ++ vsec.vsa_aclflags = lracl->lr_acl_flags; ++ if (zsb->z_fuid_replay == NULL) { ++ fuidstart = (caddr_t)(lracl + 1) + xvatlen + ++ ZIL_ACE_LENGTH(lracl->lr_acl_bytes); ++ zsb->z_fuid_replay = ++ zfs_replay_fuids(fuidstart, ++ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, ++ lr->lr_uid, lr->lr_gid); ++ } ++ error = zfs_mkdir(ZTOI(dzp), name, &xva.xva_vattr, ++ &ip, kcred, vflg, &vsec); ++ break; ++ default: ++ error = ENOTSUP; ++ } ++ ++bail: ++ if (error == 0 && ip != NULL) ++ iput(ip); ++ ++ iput(ZTOI(dzp)); ++ ++ if (zsb->z_fuid_replay) ++ zfs_fuid_info_free(zsb->z_fuid_replay); ++ zsb->z_fuid_replay = NULL; ++ ++ return (error); ++} ++ ++static int ++zfs_replay_create(zfs_sb_t *zsb, lr_create_t *lr, boolean_t byteswap) ++{ ++ char *name = NULL; /* location determined later */ ++ char *link; /* symlink content follows name */ ++ znode_t *dzp; ++ struct inode *ip = NULL; ++ xvattr_t xva; ++ int vflg = 0; ++ size_t lrsize = sizeof (lr_create_t); ++ lr_attr_t *lrattr; ++ void *start; ++ size_t xvatlen; ++ uint64_t txtype; ++ int error; ++ ++ txtype = (lr->lr_common.lrc_txtype & ~TX_CI); ++ if (byteswap) { ++ byteswap_uint64_array(lr, sizeof (*lr)); ++ if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) ++ zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); ++ } ++ ++ ++ if ((error = zfs_zget(zsb, lr->lr_doid, &dzp)) != 0) ++ return (error); ++ ++ xva_init(&xva); ++ zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, ++ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid); ++ ++ /* ++ * All forms of zfs create (create, mkdir, mkxattrdir, symlink) ++ * eventually end up in zfs_mknode(), which assigns the object's ++ * creation time and generation number. The generic zfs_create() ++ * doesn't have either concept, so we smuggle the values inside ++ * the vattr's otherwise unused va_ctime and va_nblocks fields. ++ */ ++ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); ++ xva.xva_vattr.va_nblocks = lr->lr_gen; ++ ++ error = dmu_object_info(zsb->z_os, lr->lr_foid, NULL); ++ if (error != ENOENT) ++ goto out; ++ ++ if (lr->lr_common.lrc_txtype & TX_CI) ++ vflg |= FIGNORECASE; ++ ++ /* ++ * Symlinks don't have fuid info, and CIFS never creates ++ * symlinks. ++ * ++ * The _ATTR versions will grab the fuid info in their subcases. 
++ */ ++ if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && ++ (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && ++ (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { ++ start = (lr + 1); ++ zsb->z_fuid_replay = ++ zfs_replay_fuid_domain(start, &start, ++ lr->lr_uid, lr->lr_gid); ++ } ++ ++ switch (txtype) { ++ case TX_CREATE_ATTR: ++ lrattr = (lr_attr_t *)(caddr_t)(lr + 1); ++ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); ++ zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); ++ start = (caddr_t)(lr + 1) + xvatlen; ++ zsb->z_fuid_replay = ++ zfs_replay_fuid_domain(start, &start, ++ lr->lr_uid, lr->lr_gid); ++ name = (char *)start; ++ ++ /*FALLTHROUGH*/ ++ case TX_CREATE: ++ if (name == NULL) ++ name = (char *)start; ++ ++ error = zfs_create(ZTOI(dzp), name, &xva.xva_vattr, ++ 0, 0, &ip, kcred, vflg, NULL); ++ break; ++ case TX_MKDIR_ATTR: ++ lrattr = (lr_attr_t *)(caddr_t)(lr + 1); ++ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); ++ zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); ++ start = (caddr_t)(lr + 1) + xvatlen; ++ zsb->z_fuid_replay = ++ zfs_replay_fuid_domain(start, &start, ++ lr->lr_uid, lr->lr_gid); ++ name = (char *)start; ++ ++ /*FALLTHROUGH*/ ++ case TX_MKDIR: ++ if (name == NULL) ++ name = (char *)(lr + 1); ++ ++ error = zfs_mkdir(ZTOI(dzp), name, &xva.xva_vattr, ++ &ip, kcred, vflg, NULL); ++ break; ++ case TX_MKXATTR: ++ error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &ip, kcred); ++ break; ++ case TX_SYMLINK: ++ name = (char *)(lr + 1); ++ link = name + strlen(name) + 1; ++ error = zfs_symlink(ZTOI(dzp), name, &xva.xva_vattr, ++ link, &ip, kcred, vflg); ++ break; ++ default: ++ error = ENOTSUP; ++ } ++ ++out: ++ if (error == 0 && ip != NULL) ++ iput(ip); ++ ++ iput(ZTOI(dzp)); ++ ++ if (zsb->z_fuid_replay) ++ zfs_fuid_info_free(zsb->z_fuid_replay); ++ zsb->z_fuid_replay = NULL; ++ return (error); ++} ++ ++static int ++zfs_replay_remove(zfs_sb_t *zsb, lr_remove_t *lr, boolean_t byteswap) ++{ ++ char *name = (char *)(lr + 1); /* name follows lr_remove_t */ ++ znode_t *dzp; ++ int error; ++ int vflg = 0; ++ ++ if (byteswap) ++ byteswap_uint64_array(lr, sizeof (*lr)); ++ ++ if ((error = zfs_zget(zsb, lr->lr_doid, &dzp)) != 0) ++ return (error); ++ ++ if (lr->lr_common.lrc_txtype & TX_CI) ++ vflg |= FIGNORECASE; ++ ++ switch ((int)lr->lr_common.lrc_txtype) { ++ case TX_REMOVE: ++ error = zfs_remove(ZTOI(dzp), name, kcred); ++ break; ++ case TX_RMDIR: ++ error = zfs_rmdir(ZTOI(dzp), name, NULL, kcred, vflg); ++ break; ++ default: ++ error = ENOTSUP; ++ } ++ ++ iput(ZTOI(dzp)); ++ ++ return (error); ++} ++ ++static int ++zfs_replay_link(zfs_sb_t *zsb, lr_link_t *lr, boolean_t byteswap) ++{ ++ char *name = (char *)(lr + 1); /* name follows lr_link_t */ ++ znode_t *dzp, *zp; ++ int error; ++ int vflg = 0; ++ ++ if (byteswap) ++ byteswap_uint64_array(lr, sizeof (*lr)); ++ ++ if ((error = zfs_zget(zsb, lr->lr_doid, &dzp)) != 0) ++ return (error); ++ ++ if ((error = zfs_zget(zsb, lr->lr_link_obj, &zp)) != 0) { ++ iput(ZTOI(dzp)); ++ return (error); ++ } ++ ++ if (lr->lr_common.lrc_txtype & TX_CI) ++ vflg |= FIGNORECASE; ++ ++ error = zfs_link(ZTOI(dzp), ZTOI(zp), name, kcred); ++ ++ iput(ZTOI(zp)); ++ iput(ZTOI(dzp)); ++ ++ return (error); ++} ++ ++static int ++zfs_replay_rename(zfs_sb_t *zsb, lr_rename_t *lr, boolean_t byteswap) ++{ ++ char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ ++ char *tname = sname + strlen(sname) + 1; ++ znode_t *sdzp, *tdzp; ++ int error; ++ int vflg = 0; ++ ++ if (byteswap) ++ 
byteswap_uint64_array(lr, sizeof (*lr)); ++ ++ if ((error = zfs_zget(zsb, lr->lr_sdoid, &sdzp)) != 0) ++ return (error); ++ ++ if ((error = zfs_zget(zsb, lr->lr_tdoid, &tdzp)) != 0) { ++ iput(ZTOI(sdzp)); ++ return (error); ++ } ++ ++ if (lr->lr_common.lrc_txtype & TX_CI) ++ vflg |= FIGNORECASE; ++ ++ error = zfs_rename(ZTOI(sdzp), sname, ZTOI(tdzp), tname, kcred, vflg); ++ ++ iput(ZTOI(tdzp)); ++ iput(ZTOI(sdzp)); ++ ++ return (error); ++} ++ ++static int ++zfs_replay_write(zfs_sb_t *zsb, lr_write_t *lr, boolean_t byteswap) ++{ ++ char *data = (char *)(lr + 1); /* data follows lr_write_t */ ++ znode_t *zp; ++ int error, written; ++ uint64_t eod, offset, length; ++ ++ if (byteswap) ++ byteswap_uint64_array(lr, sizeof (*lr)); ++ ++ if ((error = zfs_zget(zsb, lr->lr_foid, &zp)) != 0) { ++ /* ++ * As we can log writes out of order, it's possible the ++ * file has been removed. In this case just drop the write ++ * and return success. ++ */ ++ if (error == ENOENT) ++ error = 0; ++ return (error); ++ } ++ ++ offset = lr->lr_offset; ++ length = lr->lr_length; ++ eod = offset + length; /* end of data for this write */ ++ ++ /* ++ * This may be a write from a dmu_sync() for a whole block, ++ * and may extend beyond the current end of the file. ++ * We can't just replay what was written for this TX_WRITE as ++ * a future TX_WRITE2 may extend the eof and the data for that ++ * write needs to be there. So we write the whole block and ++ * reduce the eof. This needs to be done within the single dmu ++ * transaction created within vn_rdwr -> zfs_write. So a possible ++ * new end of file is passed through in zsb->z_replay_eof ++ */ ++ ++ zsb->z_replay_eof = 0; /* 0 means don't change end of file */ ++ ++ /* If it's a dmu_sync() block, write the whole block */ ++ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { ++ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); ++ if (length < blocksize) { ++ offset -= offset % blocksize; ++ length = blocksize; ++ } ++ if (zp->z_size < eod) ++ zsb->z_replay_eof = eod; ++ } ++ ++ written = zpl_write_common(ZTOI(zp), data, length, offset, ++ UIO_SYSSPACE, 0, kcred); ++ if (written < 0) ++ error = -written; ++ else if (written < length) ++ error = EIO; /* short write */ ++ ++ iput(ZTOI(zp)); ++ zsb->z_replay_eof = 0; /* safety */ ++ ++ return (error); ++} ++ ++/* ++ * TX_WRITE2 are only generated when dmu_sync() returns EALREADY ++ * meaning the pool block is already being synced. So now that we always write ++ * out full blocks, all we have to do is expand the eof if ++ * the file is grown. 
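The whole-block case in zfs_replay_write() above rounds the replayed range out to the block that contained it, since a dmu_sync()-generated record always covers a full block; the end of file is then clamped separately through z_replay_eof. A minimal stand-alone sketch of just that rounding arithmetic, with the block size and offsets invented:

#include <stdio.h>
#include <stdint.h>

/*
 * Round an (offset, length) pair out to a full block, mirroring the
 * whole-block case in zfs_replay_write(): the write is replayed for the
 * entire block and the end of file is handled separately.
 */
static void
round_to_block(uint64_t *offset, uint64_t *length, uint64_t blocksize)
{
    if (*length < blocksize) {
        *offset -= *offset % blocksize;
        *length = blocksize;
    }
}

int
main(void)
{
    uint64_t off = 135000, len = 3000, bs = 131072;

    round_to_block(&off, &len, bs);
    printf("replay covers [%llu, %llu)\n",
        (unsigned long long)off, (unsigned long long)(off + len));
    return (0);
}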
++ */ ++static int ++zfs_replay_write2(zfs_sb_t *zsb, lr_write_t *lr, boolean_t byteswap) ++{ ++ znode_t *zp; ++ int error; ++ uint64_t end; ++ ++ if (byteswap) ++ byteswap_uint64_array(lr, sizeof (*lr)); ++ ++ if ((error = zfs_zget(zsb, lr->lr_foid, &zp)) != 0) ++ return (error); ++ ++top: ++ end = lr->lr_offset + lr->lr_length; ++ if (end > zp->z_size) { ++ dmu_tx_t *tx = dmu_tx_create(zsb->z_os); ++ ++ zp->z_size = end; ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); ++ error = dmu_tx_assign(tx, TXG_WAIT); ++ if (error) { ++ iput(ZTOI(zp)); ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto top; ++ } ++ dmu_tx_abort(tx); ++ return (error); ++ } ++ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zsb), ++ (void *)&zp->z_size, sizeof (uint64_t), tx); ++ ++ /* Ensure the replayed seq is updated */ ++ (void) zil_replaying(zsb->z_log, tx); ++ ++ dmu_tx_commit(tx); ++ } ++ ++ iput(ZTOI(zp)); ++ ++ return (error); ++} ++ ++static int ++zfs_replay_truncate(zfs_sb_t *zsb, lr_truncate_t *lr, boolean_t byteswap) ++{ ++ znode_t *zp; ++ flock64_t fl; ++ int error; ++ ++ if (byteswap) ++ byteswap_uint64_array(lr, sizeof (*lr)); ++ ++ if ((error = zfs_zget(zsb, lr->lr_foid, &zp)) != 0) ++ return (error); ++ ++ bzero(&fl, sizeof (fl)); ++ fl.l_type = F_WRLCK; ++ fl.l_whence = 0; ++ fl.l_start = lr->lr_offset; ++ fl.l_len = lr->lr_length; ++ ++ error = zfs_space(ZTOI(zp), F_FREESP, &fl, FWRITE | FOFFMAX, ++ lr->lr_offset, kcred); ++ ++ iput(ZTOI(zp)); ++ ++ return (error); ++} ++ ++static int ++zfs_replay_setattr(zfs_sb_t *zsb, lr_setattr_t *lr, boolean_t byteswap) ++{ ++ znode_t *zp; ++ xvattr_t xva; ++ vattr_t *vap = &xva.xva_vattr; ++ int error; ++ void *start; ++ ++ xva_init(&xva); ++ if (byteswap) { ++ byteswap_uint64_array(lr, sizeof (*lr)); ++ ++ if ((lr->lr_mask & ATTR_XVATTR) && ++ zsb->z_version >= ZPL_VERSION_INITIAL) ++ zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); ++ } ++ ++ if ((error = zfs_zget(zsb, lr->lr_foid, &zp)) != 0) ++ return (error); ++ ++ zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, ++ lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); ++ ++ vap->va_size = lr->lr_size; ++ ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); ++ ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); ++ ++ /* ++ * Fill in xvattr_t portions if necessary. 
++ */ ++ ++ start = (lr_setattr_t *)(lr + 1); ++ if (vap->va_mask & ATTR_XVATTR) { ++ zfs_replay_xvattr((lr_attr_t *)start, &xva); ++ start = (caddr_t)start + ++ ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize); ++ } else ++ xva.xva_vattr.va_mask &= ~ATTR_XVATTR; ++ ++ zsb->z_fuid_replay = zfs_replay_fuid_domain(start, &start, ++ lr->lr_uid, lr->lr_gid); ++ ++ error = zfs_setattr(ZTOI(zp), vap, 0, kcred); ++ ++ zfs_fuid_info_free(zsb->z_fuid_replay); ++ zsb->z_fuid_replay = NULL; ++ iput(ZTOI(zp)); ++ ++ return (error); ++} ++ ++static int ++zfs_replay_acl_v0(zfs_sb_t *zsb, lr_acl_v0_t *lr, boolean_t byteswap) ++{ ++ ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ ++ vsecattr_t vsa; ++ znode_t *zp; ++ int error; ++ ++ if (byteswap) { ++ byteswap_uint64_array(lr, sizeof (*lr)); ++ zfs_oldace_byteswap(ace, lr->lr_aclcnt); ++ } ++ ++ if ((error = zfs_zget(zsb, lr->lr_foid, &zp)) != 0) ++ return (error); ++ ++ bzero(&vsa, sizeof (vsa)); ++ vsa.vsa_mask = VSA_ACE | VSA_ACECNT; ++ vsa.vsa_aclcnt = lr->lr_aclcnt; ++ vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; ++ vsa.vsa_aclflags = 0; ++ vsa.vsa_aclentp = ace; ++ ++ error = zfs_setsecattr(ZTOI(zp), &vsa, 0, kcred); ++ ++ iput(ZTOI(zp)); ++ ++ return (error); ++} ++ ++/* ++ * Replaying ACLs is complicated by FUID support. ++ * The log record may contain some optional data ++ * to be used for replaying FUID's. These pieces ++ * are the actual FUIDs that were created initially. ++ * The FUID table index may no longer be valid and ++ * during zfs_create() a new index may be assigned. ++ * Because of this the log will contain the original ++ * doman+rid in order to create a new FUID. ++ * ++ * The individual ACEs may contain an ephemeral uid/gid which is no ++ * longer valid and will need to be replaced with an actual FUID. 
++ * ++ */ ++static int ++zfs_replay_acl(zfs_sb_t *zsb, lr_acl_t *lr, boolean_t byteswap) ++{ ++ ace_t *ace = (ace_t *)(lr + 1); ++ vsecattr_t vsa; ++ znode_t *zp; ++ int error; ++ ++ if (byteswap) { ++ byteswap_uint64_array(lr, sizeof (*lr)); ++ zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); ++ if (lr->lr_fuidcnt) { ++ byteswap_uint64_array((caddr_t)ace + ++ ZIL_ACE_LENGTH(lr->lr_acl_bytes), ++ lr->lr_fuidcnt * sizeof (uint64_t)); ++ } ++ } ++ ++ if ((error = zfs_zget(zsb, lr->lr_foid, &zp)) != 0) ++ return (error); ++ ++ bzero(&vsa, sizeof (vsa)); ++ vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; ++ vsa.vsa_aclcnt = lr->lr_aclcnt; ++ vsa.vsa_aclentp = ace; ++ vsa.vsa_aclentsz = lr->lr_acl_bytes; ++ vsa.vsa_aclflags = lr->lr_acl_flags; ++ ++ if (lr->lr_fuidcnt) { ++ void *fuidstart = (caddr_t)ace + ++ ZIL_ACE_LENGTH(lr->lr_acl_bytes); ++ ++ zsb->z_fuid_replay = ++ zfs_replay_fuids(fuidstart, &fuidstart, ++ lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); ++ } ++ ++ error = zfs_setsecattr(ZTOI(zp), &vsa, 0, kcred); ++ ++ if (zsb->z_fuid_replay) ++ zfs_fuid_info_free(zsb->z_fuid_replay); ++ ++ zsb->z_fuid_replay = NULL; ++ iput(ZTOI(zp)); ++ ++ return (error); ++} ++ ++/* ++ * Callback vectors for replaying records ++ */ ++zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { ++ (zil_replay_func_t *)zfs_replay_error, /* no such type */ ++ (zil_replay_func_t *)zfs_replay_create, /* TX_CREATE */ ++ (zil_replay_func_t *)zfs_replay_create, /* TX_MKDIR */ ++ (zil_replay_func_t *)zfs_replay_create, /* TX_MKXATTR */ ++ (zil_replay_func_t *)zfs_replay_create, /* TX_SYMLINK */ ++ (zil_replay_func_t *)zfs_replay_remove, /* TX_REMOVE */ ++ (zil_replay_func_t *)zfs_replay_remove, /* TX_RMDIR */ ++ (zil_replay_func_t *)zfs_replay_link, /* TX_LINK */ ++ (zil_replay_func_t *)zfs_replay_rename, /* TX_RENAME */ ++ (zil_replay_func_t *)zfs_replay_write, /* TX_WRITE */ ++ (zil_replay_func_t *)zfs_replay_truncate, /* TX_TRUNCATE */ ++ (zil_replay_func_t *)zfs_replay_setattr, /* TX_SETATTR */ ++ (zil_replay_func_t *)zfs_replay_acl_v0, /* TX_ACL_V0 */ ++ (zil_replay_func_t *)zfs_replay_acl, /* TX_ACL */ ++ (zil_replay_func_t *)zfs_replay_create_acl, /* TX_CREATE_ACL */ ++ (zil_replay_func_t *)zfs_replay_create, /* TX_CREATE_ATTR */ ++ (zil_replay_func_t *)zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */ ++ (zil_replay_func_t *)zfs_replay_create_acl, /* TX_MKDIR_ACL */ ++ (zil_replay_func_t *)zfs_replay_create, /* TX_MKDIR_ATTR */ ++ (zil_replay_func_t *)zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ ++ (zil_replay_func_t *)zfs_replay_write2, /* TX_WRITE2 */ ++}; +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_rlock.c linux-3.2.33-go/fs/zfs/zfs/zfs_rlock.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_rlock.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_rlock.c 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,625 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++/* ++ * This file contains the code to implement file range locking in ++ * ZFS, although there isn't much specific to ZFS (all that comes to mind ++ * support for growing the blocksize). ++ * ++ * Interface ++ * --------- ++ * Defined in zfs_rlock.h but essentially: ++ * rl = zfs_range_lock(zp, off, len, lock_type); ++ * zfs_range_unlock(rl); ++ * zfs_range_reduce(rl, off, len); ++ * ++ * AVL tree ++ * -------- ++ * An AVL tree is used to maintain the state of the existing ranges ++ * that are locked for exclusive (writer) or shared (reader) use. ++ * The starting range offset is used for searching and sorting the tree. ++ * ++ * Common case ++ * ----------- ++ * The (hopefully) usual case is of no overlaps or contention for ++ * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree ++ * searched that finds no overlap, and *this* rl_t is placed in the tree. ++ * ++ * Overlaps/Reference counting/Proxy locks ++ * --------------------------------------- ++ * The avl code only allows one node at a particular offset. Also it's very ++ * inefficient to search through all previous entries looking for overlaps ++ * (because the very 1st in the ordered list might be at offset 0 but ++ * cover the whole file). ++ * So this implementation uses reference counts and proxy range locks. ++ * Firstly, only reader locks use reference counts and proxy locks, ++ * because writer locks are exclusive. ++ * When a reader lock overlaps with another then a proxy lock is created ++ * for that range and replaces the original lock. If the overlap ++ * is exact then the reference count of the proxy is simply incremented. ++ * Otherwise, the proxy lock is split into smaller lock ranges and ++ * new proxy locks created for non overlapping ranges. ++ * The reference counts are adjusted accordingly. ++ * Meanwhile, the orginal lock is kept around (this is the callers handle) ++ * and its offset and length are used when releasing the lock. ++ * ++ * Thread coordination ++ * ------------------- ++ * In order to make wakeups efficient and to ensure multiple continuous ++ * readers on a range don't starve a writer for the same range lock, ++ * two condition variables are allocated in each rl_t. ++ * If a writer (or reader) can't get a range it initialises the writer ++ * (or reader) cv; sets a flag saying there's a writer (or reader) waiting; ++ * and waits on that cv. When a thread unlocks that range it wakes up all ++ * writers then all readers before destroying the lock. ++ * ++ * Append mode writes ++ * ------------------ ++ * Append mode writes need to lock a range at the end of a file. ++ * The offset of the end of the file is determined under the ++ * range locking mutex, and the lock type converted from RL_APPEND to ++ * RL_WRITER and the range locked. ++ * ++ * Grow block handling ++ * ------------------- ++ * ZFS supports multiple block sizes currently upto 128K. The smallest ++ * block size is used for the file which is grown as needed. During this ++ * growth all other writers and readers must be excluded. 
++ * So if the block size needs to be grown then the whole file is ++ * exclusively locked, then later the caller will reduce the lock ++ * range to just the range to be written using zfs_reduce_range. ++ */ ++ ++#include ++ ++/* ++ * Check if a write lock can be grabbed, or wait and recheck until available. ++ */ ++static void ++zfs_range_lock_writer(znode_t *zp, rl_t *new) ++{ ++ avl_tree_t *tree = &zp->z_range_avl; ++ rl_t *rl; ++ avl_index_t where; ++ uint64_t end_size; ++ uint64_t off = new->r_off; ++ uint64_t len = new->r_len; ++ ++ for (;;) { ++ /* ++ * Range locking is also used by zvol and uses a ++ * dummied up znode. However, for zvol, we don't need to ++ * append or grow blocksize, and besides we don't have ++ * a "sa" data or zfs_sb_t - so skip that processing. ++ * ++ * Yes, this is ugly, and would be solved by not handling ++ * grow or append in range lock code. If that was done then ++ * we could make the range locking code generically available ++ * to other non-zfs consumers. ++ */ ++ if (!zp->z_is_zvol) { /* caller is ZPL */ ++ /* ++ * If in append mode pick up the current end of file. ++ * This is done under z_range_lock to avoid races. ++ */ ++ if (new->r_type == RL_APPEND) ++ new->r_off = zp->z_size; ++ ++ /* ++ * If we need to grow the block size then grab the whole ++ * file range. This is also done under z_range_lock to ++ * avoid races. ++ */ ++ end_size = MAX(zp->z_size, new->r_off + len); ++ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || ++ zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { ++ new->r_off = 0; ++ new->r_len = UINT64_MAX; ++ } ++ } ++ ++ /* ++ * First check for the usual case of no locks ++ */ ++ if (avl_numnodes(tree) == 0) { ++ new->r_type = RL_WRITER; /* convert to writer */ ++ avl_add(tree, new); ++ return; ++ } ++ ++ /* ++ * Look for any locks in the range. ++ */ ++ rl = avl_find(tree, new, &where); ++ if (rl) ++ goto wait; /* already locked at same offset */ ++ ++ rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); ++ if (rl && (rl->r_off < new->r_off + new->r_len)) ++ goto wait; ++ ++ rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); ++ if (rl && rl->r_off + rl->r_len > new->r_off) ++ goto wait; ++ ++ new->r_type = RL_WRITER; /* convert possible RL_APPEND */ ++ avl_insert(tree, new, where); ++ return; ++wait: ++ if (!rl->r_write_wanted) { ++ cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); ++ rl->r_write_wanted = B_TRUE; ++ } ++ cv_wait(&rl->r_wr_cv, &zp->z_range_lock); ++ ++ /* reset to original */ ++ new->r_off = off; ++ new->r_len = len; ++ } ++} ++ ++/* ++ * If this is an original (non-proxy) lock then replace it by ++ * a proxy and return the proxy. ++ */ ++static rl_t * ++zfs_range_proxify(avl_tree_t *tree, rl_t *rl) ++{ ++ rl_t *proxy; ++ ++ if (rl->r_proxy) ++ return (rl); /* already a proxy */ ++ ++ ASSERT3U(rl->r_cnt, ==, 1); ++ ASSERT(rl->r_write_wanted == B_FALSE); ++ ASSERT(rl->r_read_wanted == B_FALSE); ++ avl_remove(tree, rl); ++ rl->r_cnt = 0; ++ ++ /* create a proxy range lock */ ++ proxy = kmem_alloc(sizeof (rl_t), KM_PUSHPAGE); ++ proxy->r_off = rl->r_off; ++ proxy->r_len = rl->r_len; ++ proxy->r_cnt = 1; ++ proxy->r_type = RL_READER; ++ proxy->r_proxy = B_TRUE; ++ proxy->r_write_wanted = B_FALSE; ++ proxy->r_read_wanted = B_FALSE; ++ avl_add(tree, proxy); ++ ++ return (proxy); ++} ++ ++/* ++ * Split the range lock at the supplied offset ++ * returning the *front* proxy. 
++ */ ++static rl_t * ++zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) ++{ ++ rl_t *front, *rear; ++ ++ ASSERT3U(rl->r_len, >, 1); ++ ASSERT3U(off, >, rl->r_off); ++ ASSERT3U(off, <, rl->r_off + rl->r_len); ++ ASSERT(rl->r_write_wanted == B_FALSE); ++ ASSERT(rl->r_read_wanted == B_FALSE); ++ ++ /* create the rear proxy range lock */ ++ rear = kmem_alloc(sizeof (rl_t), KM_PUSHPAGE); ++ rear->r_off = off; ++ rear->r_len = rl->r_off + rl->r_len - off; ++ rear->r_cnt = rl->r_cnt; ++ rear->r_type = RL_READER; ++ rear->r_proxy = B_TRUE; ++ rear->r_write_wanted = B_FALSE; ++ rear->r_read_wanted = B_FALSE; ++ ++ front = zfs_range_proxify(tree, rl); ++ front->r_len = off - rl->r_off; ++ ++ avl_insert_here(tree, rear, front, AVL_AFTER); ++ return (front); ++} ++ ++/* ++ * Create and add a new proxy range lock for the supplied range. ++ */ ++static void ++zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) ++{ ++ rl_t *rl; ++ ++ ASSERT(len); ++ rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); ++ rl->r_off = off; ++ rl->r_len = len; ++ rl->r_cnt = 1; ++ rl->r_type = RL_READER; ++ rl->r_proxy = B_TRUE; ++ rl->r_write_wanted = B_FALSE; ++ rl->r_read_wanted = B_FALSE; ++ avl_add(tree, rl); ++} ++ ++static void ++zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) ++{ ++ rl_t *next; ++ uint64_t off = new->r_off; ++ uint64_t len = new->r_len; ++ ++ /* ++ * prev arrives either: ++ * - pointing to an entry at the same offset ++ * - pointing to the entry with the closest previous offset whose ++ * range may overlap with the new range ++ * - null, if there were no ranges starting before the new one ++ */ ++ if (prev) { ++ if (prev->r_off + prev->r_len <= off) { ++ prev = NULL; ++ } else if (prev->r_off != off) { ++ /* ++ * convert to proxy if needed then ++ * split this entry and bump ref count ++ */ ++ prev = zfs_range_split(tree, prev, off); ++ prev = AVL_NEXT(tree, prev); /* move to rear range */ ++ } ++ } ++ ASSERT((prev == NULL) || (prev->r_off == off)); ++ ++ if (prev) ++ next = prev; ++ else ++ next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); ++ ++ if (next == NULL || off + len <= next->r_off) { ++ /* no overlaps, use the original new rl_t in the tree */ ++ avl_insert(tree, new, where); ++ return; ++ } ++ ++ if (off < next->r_off) { ++ /* Add a proxy for initial range before the overlap */ ++ zfs_range_new_proxy(tree, off, next->r_off - off); ++ } ++ ++ new->r_cnt = 0; /* will use proxies in tree */ ++ /* ++ * We now search forward through the ranges, until we go past the end ++ * of the new range. For each entry we make it a proxy if it ++ * isn't already, then bump its reference count. If there's any ++ * gaps between the ranges then we create a new proxy range. 
++ */ ++ for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { ++ if (off + len <= next->r_off) ++ break; ++ if (prev && prev->r_off + prev->r_len < next->r_off) { ++ /* there's a gap */ ++ ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); ++ zfs_range_new_proxy(tree, prev->r_off + prev->r_len, ++ next->r_off - (prev->r_off + prev->r_len)); ++ } ++ if (off + len == next->r_off + next->r_len) { ++ /* exact overlap with end */ ++ next = zfs_range_proxify(tree, next); ++ next->r_cnt++; ++ return; ++ } ++ if (off + len < next->r_off + next->r_len) { ++ /* new range ends in the middle of this block */ ++ next = zfs_range_split(tree, next, off + len); ++ next->r_cnt++; ++ return; ++ } ++ ASSERT3U(off + len, >, next->r_off + next->r_len); ++ next = zfs_range_proxify(tree, next); ++ next->r_cnt++; ++ } ++ ++ /* Add the remaining end range. */ ++ zfs_range_new_proxy(tree, prev->r_off + prev->r_len, ++ (off + len) - (prev->r_off + prev->r_len)); ++} ++ ++/* ++ * Check if a reader lock can be grabbed, or wait and recheck until available. ++ */ ++static void ++zfs_range_lock_reader(znode_t *zp, rl_t *new) ++{ ++ avl_tree_t *tree = &zp->z_range_avl; ++ rl_t *prev, *next; ++ avl_index_t where; ++ uint64_t off = new->r_off; ++ uint64_t len = new->r_len; ++ ++ /* ++ * Look for any writer locks in the range. ++ */ ++retry: ++ prev = avl_find(tree, new, &where); ++ if (prev == NULL) ++ prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); ++ ++ /* ++ * Check the previous range for a writer lock overlap. ++ */ ++ if (prev && (off < prev->r_off + prev->r_len)) { ++ if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) { ++ if (!prev->r_read_wanted) { ++ cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); ++ prev->r_read_wanted = B_TRUE; ++ } ++ cv_wait(&prev->r_rd_cv, &zp->z_range_lock); ++ goto retry; ++ } ++ if (off + len < prev->r_off + prev->r_len) ++ goto got_lock; ++ } ++ ++ /* ++ * Search through the following ranges to see if there's ++ * write lock any overlap. ++ */ ++ if (prev) ++ next = AVL_NEXT(tree, prev); ++ else ++ next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); ++ for (; next; next = AVL_NEXT(tree, next)) { ++ if (off + len <= next->r_off) ++ goto got_lock; ++ if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) { ++ if (!next->r_read_wanted) { ++ cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); ++ next->r_read_wanted = B_TRUE; ++ } ++ cv_wait(&next->r_rd_cv, &zp->z_range_lock); ++ goto retry; ++ } ++ if (off + len <= next->r_off + next->r_len) ++ goto got_lock; ++ } ++ ++got_lock: ++ /* ++ * Add the read lock, which may involve splitting existing ++ * locks and bumping ref counts (r_cnt). ++ */ ++ zfs_range_add_reader(tree, new, prev, where); ++} ++ ++/* ++ * Lock a range (offset, length) as either shared (RL_READER) ++ * or exclusive (RL_WRITER). Returns the range lock structure ++ * for later unlocking or reduce range (if entire file ++ * previously locked as RL_WRITER). 
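++ *
++ * Typical usage, following the interface summary at the top of this file:
++ *	rl = zfs_range_lock(zp, off, len, RL_READER);
++ *	... access the locked byte range ...
++ *	zfs_range_unlock(rl);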
++ */ ++rl_t * ++zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) ++{ ++ rl_t *new; ++ ++ ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); ++ ++ new = kmem_alloc(sizeof (rl_t), KM_PUSHPAGE); ++ new->r_zp = zp; ++ new->r_off = off; ++ if (len + off < off) /* overflow */ ++ len = UINT64_MAX - off; ++ new->r_len = len; ++ new->r_cnt = 1; /* assume it's going to be in the tree */ ++ new->r_type = type; ++ new->r_proxy = B_FALSE; ++ new->r_write_wanted = B_FALSE; ++ new->r_read_wanted = B_FALSE; ++ ++ mutex_enter(&zp->z_range_lock); ++ if (type == RL_READER) { ++ /* ++ * First check for the usual case of no locks ++ */ ++ if (avl_numnodes(&zp->z_range_avl) == 0) ++ avl_add(&zp->z_range_avl, new); ++ else ++ zfs_range_lock_reader(zp, new); ++ } else ++ zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */ ++ mutex_exit(&zp->z_range_lock); ++ return (new); ++} ++ ++static void ++zfs_range_free(void *arg) ++{ ++ rl_t *rl = arg; ++ ++ if (rl->r_write_wanted) ++ cv_destroy(&rl->r_wr_cv); ++ ++ if (rl->r_read_wanted) ++ cv_destroy(&rl->r_rd_cv); ++ ++ kmem_free(rl, sizeof (rl_t)); ++} ++ ++/* ++ * Unlock a reader lock ++ */ ++static void ++zfs_range_unlock_reader(znode_t *zp, rl_t *remove, list_t *free_list) ++{ ++ avl_tree_t *tree = &zp->z_range_avl; ++ rl_t *rl, *next = NULL; ++ uint64_t len; ++ ++ /* ++ * The common case is when the remove entry is in the tree ++ * (cnt == 1) meaning there's been no other reader locks overlapping ++ * with this one. Otherwise the remove entry will have been ++ * removed from the tree and replaced by proxies (one or ++ * more ranges mapping to the entire range). ++ */ ++ if (remove->r_cnt == 1) { ++ avl_remove(tree, remove); ++ ++ if (remove->r_write_wanted) ++ cv_broadcast(&remove->r_wr_cv); ++ ++ if (remove->r_read_wanted) ++ cv_broadcast(&remove->r_rd_cv); ++ ++ list_insert_tail(free_list, remove); ++ } else { ++ ASSERT3U(remove->r_cnt, ==, 0); ++ ASSERT3U(remove->r_write_wanted, ==, 0); ++ ASSERT3U(remove->r_read_wanted, ==, 0); ++ /* ++ * Find start proxy representing this reader lock, ++ * then decrement ref count on all proxies ++ * that make up this range, freeing them as needed. ++ */ ++ rl = avl_find(tree, remove, NULL); ++ ASSERT(rl); ++ ASSERT(rl->r_cnt); ++ ASSERT(rl->r_type == RL_READER); ++ for (len = remove->r_len; len != 0; rl = next) { ++ len -= rl->r_len; ++ if (len) { ++ next = AVL_NEXT(tree, rl); ++ ASSERT(next); ++ ASSERT(rl->r_off + rl->r_len == next->r_off); ++ ASSERT(next->r_cnt); ++ ASSERT(next->r_type == RL_READER); ++ } ++ rl->r_cnt--; ++ if (rl->r_cnt == 0) { ++ avl_remove(tree, rl); ++ ++ if (rl->r_write_wanted) ++ cv_broadcast(&rl->r_wr_cv); ++ ++ if (rl->r_read_wanted) ++ cv_broadcast(&rl->r_rd_cv); ++ ++ list_insert_tail(free_list, rl); ++ } ++ } ++ ++ kmem_free(remove, sizeof (rl_t)); ++ } ++} ++ ++/* ++ * Unlock range and destroy range lock structure. 
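++ * Any readers or writers waiting on the range are woken before the
++ * lock (and any proxy locks it references) is freed.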
++ */ ++void ++zfs_range_unlock(rl_t *rl) ++{ ++ znode_t *zp = rl->r_zp; ++ list_t free_list; ++ rl_t *free_rl; ++ ++ ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER); ++ ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0); ++ ASSERT(!rl->r_proxy); ++ list_create(&free_list, sizeof(rl_t), offsetof(rl_t, rl_node)); ++ ++ mutex_enter(&zp->z_range_lock); ++ if (rl->r_type == RL_WRITER) { ++ /* writer locks can't be shared or split */ ++ avl_remove(&zp->z_range_avl, rl); ++ if (rl->r_write_wanted) ++ cv_broadcast(&rl->r_wr_cv); ++ ++ if (rl->r_read_wanted) ++ cv_broadcast(&rl->r_rd_cv); ++ ++ list_insert_tail(&free_list, rl); ++ } else { ++ /* ++ * lock may be shared, let zfs_range_unlock_reader() ++ * release the zp->z_range_lock lock and free the rl_t ++ */ ++ zfs_range_unlock_reader(zp, rl, &free_list); ++ } ++ mutex_exit(&zp->z_range_lock); ++ ++ while ((free_rl = list_head(&free_list)) != NULL) { ++ list_remove(&free_list, free_rl); ++ zfs_range_free(free_rl); ++ } ++ ++ list_destroy(&free_list); ++} ++ ++/* ++ * Reduce range locked as RL_WRITER from whole file to specified range. ++ * Asserts the whole file is exclusivly locked and so there's only one ++ * entry in the tree. ++ */ ++void ++zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) ++{ ++ znode_t *zp = rl->r_zp; ++ ++ /* Ensure there are no other locks */ ++ ASSERT(avl_numnodes(&zp->z_range_avl) == 1); ++ ASSERT(rl->r_off == 0); ++ ASSERT(rl->r_type == RL_WRITER); ++ ASSERT(!rl->r_proxy); ++ ASSERT3U(rl->r_len, ==, UINT64_MAX); ++ ASSERT3U(rl->r_cnt, ==, 1); ++ ++ mutex_enter(&zp->z_range_lock); ++ rl->r_off = off; ++ rl->r_len = len; ++ ++ if (rl->r_write_wanted) ++ cv_broadcast(&rl->r_wr_cv); ++ if (rl->r_read_wanted) ++ cv_broadcast(&rl->r_rd_cv); ++ ++ mutex_exit(&zp->z_range_lock); ++} ++ ++/* ++ * AVL comparison function used to order range locks ++ * Locks are ordered on the start offset of the range. ++ */ ++int ++zfs_range_compare(const void *arg1, const void *arg2) ++{ ++ const rl_t *rl1 = arg1; ++ const rl_t *rl2 = arg2; ++ ++ if (rl1->r_off > rl2->r_off) ++ return (1); ++ if (rl1->r_off < rl2->r_off) ++ return (-1); ++ return (0); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_sa.c linux-3.2.33-go/fs/zfs/zfs/zfs_sa.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_sa.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_sa.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,425 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * ZPL attribute registration table. 
++ * Order of attributes doesn't matter ++ * a unique value will be assigned for each ++ * attribute that is file system specific ++ * ++ * This is just the set of ZPL attributes that this ++ * version of ZFS deals with natively. The file system ++ * could have other attributes stored in files, but they will be ++ * ignored. The SA framework will preserve them, just that ++ * this version of ZFS won't change or delete them. ++ */ ++ ++sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { ++ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, ++ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, ++ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, ++ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, ++ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, ++ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, ++ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, ++ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, ++ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, ++ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, ++ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, ++ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, ++ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, ++ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, ++ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, ++ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, ++ {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, ++ {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0}, ++ {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0}, ++ {"ZPL_DACL_ACES", 0, SA_ACL, 0}, ++ {"ZPL_DXATTR", 0, SA_UINT8_ARRAY, 0}, ++ {NULL, 0, 0, 0} ++}; ++ ++#ifdef _KERNEL ++int ++zfs_sa_readlink(znode_t *zp, uio_t *uio) ++{ ++ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); ++ size_t bufsz; ++ int error; ++ ++ bufsz = zp->z_size; ++ if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { ++ error = uiomove((caddr_t)db->db_data + ++ ZFS_OLD_ZNODE_PHYS_SIZE, ++ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); ++ } else { ++ dmu_buf_t *dbp; ++ if ((error = dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, ++ 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { ++ error = uiomove(dbp->db_data, ++ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); ++ dmu_buf_rele(dbp, FTAG); ++ } ++ } ++ return (error); ++} ++ ++void ++zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) ++{ ++ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); ++ ++ if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) { ++ VERIFY(dmu_set_bonus(db, ++ len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0); ++ if (len) { ++ bcopy(link, (caddr_t)db->db_data + ++ ZFS_OLD_ZNODE_PHYS_SIZE, len); ++ } ++ } else { ++ dmu_buf_t *dbp; ++ ++ zfs_grow_blocksize(zp, len, tx); ++ VERIFY(0 == dmu_buf_hold(ZTOZSB(zp)->z_os, ++ zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)); ++ ++ dmu_buf_will_dirty(dbp, tx); ++ ++ ASSERT3U(len, <=, dbp->db_size); ++ bcopy(link, dbp->db_data, len); ++ dmu_buf_rele(dbp, FTAG); ++ } ++} ++ ++void ++zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ xoptattr_t *xoap; ++ ++ ASSERT(MUTEX_HELD(&zp->z_lock)); ++ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); ++ if (zp->z_is_sa) { ++ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zsb), ++ &xoap->xoa_av_scanstamp, ++ sizeof (xoap->xoa_av_scanstamp)) != 0) ++ return; ++ } else { ++ dmu_object_info_t doi; ++ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); ++ int len; ++ ++ if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP)) ++ return; ++ ++ sa_object_info(zp->z_sa_hdl, &doi); ++ len = sizeof 
(xoap->xoa_av_scanstamp) + ++ ZFS_OLD_ZNODE_PHYS_SIZE; ++ ++ if (len <= doi.doi_bonus_size) { ++ (void) memcpy(xoap->xoa_av_scanstamp, ++ (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, ++ sizeof (xoap->xoa_av_scanstamp)); ++ } ++ } ++ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); ++} ++ ++void ++zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ xoptattr_t *xoap; ++ ++ ASSERT(MUTEX_HELD(&zp->z_lock)); ++ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); ++ if (zp->z_is_sa) ++ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zsb), ++ &xoap->xoa_av_scanstamp, ++ sizeof (xoap->xoa_av_scanstamp), tx)); ++ else { ++ dmu_object_info_t doi; ++ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); ++ int len; ++ ++ sa_object_info(zp->z_sa_hdl, &doi); ++ len = sizeof (xoap->xoa_av_scanstamp) + ++ ZFS_OLD_ZNODE_PHYS_SIZE; ++ if (len > doi.doi_bonus_size) ++ VERIFY(dmu_set_bonus(db, len, tx) == 0); ++ (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, ++ xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp)); ++ ++ zp->z_pflags |= ZFS_BONUS_SCANSTAMP; ++ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zsb), ++ &zp->z_pflags, sizeof (uint64_t), tx)); ++ } ++} ++ ++int ++zfs_sa_get_xattr(znode_t *zp) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ char *obj; ++ int size; ++ int error; ++ ++ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); ++ ASSERT(!zp->z_xattr_cached); ++ ASSERT(zp->z_is_sa); ++ ++ error = sa_size(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), &size); ++ if (error) { ++ if (error == ENOENT) ++ return nvlist_alloc(&zp->z_xattr_cached, ++ NV_UNIQUE_NAME, KM_SLEEP); ++ else ++ return (error); ++ } ++ ++ obj = sa_spill_alloc(KM_SLEEP); ++ ++ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), obj, size); ++ if (error == 0) ++ error = nvlist_unpack(obj, size, &zp->z_xattr_cached, KM_SLEEP); ++ ++ sa_spill_free(obj); ++ ++ return (error); ++} ++ ++int ++zfs_sa_set_xattr(znode_t *zp) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ dmu_tx_t *tx; ++ char *obj; ++ size_t size; ++ int error; ++ ++ ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock)); ++ ASSERT(zp->z_xattr_cached); ++ ASSERT(zp->z_is_sa); ++ ++ error = nvlist_size(zp->z_xattr_cached, &size, NV_ENCODE_XDR); ++ if (error) ++ goto out; ++ ++ obj = sa_spill_alloc(KM_SLEEP); ++ ++ error = nvlist_pack(zp->z_xattr_cached, &obj, &size, ++ NV_ENCODE_XDR, KM_SLEEP); ++ if (error) ++ goto out_free; ++ ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_sa_create(tx, size); ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); ++ ++ error = dmu_tx_assign(tx, TXG_WAIT); ++ if (error) { ++ dmu_tx_abort(tx); ++ } else { ++ error = sa_update(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), ++ obj, size, tx); ++ if (error) ++ dmu_tx_abort(tx); ++ else ++ dmu_tx_commit(tx); ++ } ++out_free: ++ sa_spill_free(obj); ++out: ++ return (error); ++} ++ ++/* ++ * I'm not convinced we should do any of this upgrade. ++ * since the SA code can read both old/new znode formats ++ * with probably little to know performance difference. ++ * ++ * All new files will be created with the new format. 
++ */ ++ ++void ++zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) ++{ ++ dmu_buf_t *db = sa_get_db(hdl); ++ znode_t *zp = sa_get_userdata(hdl); ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ int count = 0; ++ sa_bulk_attr_t *bulk, *sa_attrs; ++ zfs_acl_locator_cb_t locate = { 0 }; ++ uint64_t uid, gid, mode, rdev, xattr, parent; ++ uint64_t crtime[2], mtime[2], ctime[2]; ++ zfs_acl_phys_t znode_acl; ++ char scanstamp[AV_SCANSTAMP_SZ]; ++ boolean_t drop_lock = B_FALSE; ++ ++ /* ++ * No upgrade if ACL isn't cached ++ * since we won't know which locks are held ++ * and ready the ACL would require special "locked" ++ * interfaces that would be messy ++ */ ++ if (zp->z_acl_cached == NULL || S_ISLNK(ZTOI(zp)->i_mode)) ++ return; ++ ++ /* ++ * If the z_lock is held and we aren't the owner ++ * the just return since we don't want to deadlock ++ * trying to update the status of z_is_sa. This ++ * file can then be upgraded at a later time. ++ * ++ * Otherwise, we know we are doing the ++ * sa_update() that caused us to enter this function. ++ */ ++ if (mutex_owner(&zp->z_lock) != curthread) { ++ if (mutex_tryenter(&zp->z_lock) == 0) ++ return; ++ else ++ drop_lock = B_TRUE; ++ } ++ ++ /* First do a bulk query of the attributes that aren't cached */ ++ bulk = kmem_alloc(sizeof(sa_bulk_attr_t) * 20, KM_SLEEP); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, &ctime, 16); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zsb), NULL, &crtime, 16); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb), NULL, &mode, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zsb), NULL, &parent, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zsb), NULL, &xattr, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zsb), NULL, &rdev, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zsb), NULL, &uid, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zsb), NULL, &gid, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zsb), NULL, ++ &znode_acl, 88); ++ ++ if (sa_bulk_lookup_locked(hdl, bulk, count) != 0) { ++ kmem_free(bulk, sizeof(sa_bulk_attr_t) * 20); ++ goto done; ++ } ++ ++ /* ++ * While the order here doesn't matter its best to try and organize ++ * it is such a way to pick up an already existing layout number ++ */ ++ count = 0; ++ sa_attrs = kmem_zalloc(sizeof(sa_bulk_attr_t) * 20, KM_SLEEP); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zsb), NULL, &mode, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zsb), NULL, ++ &zp->z_size, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zsb), ++ NULL, &zp->z_gen, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zsb), NULL, &uid, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zsb), NULL, &gid, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zsb), ++ NULL, &parent, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zsb), NULL, ++ &zp->z_pflags, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zsb), NULL, ++ zp->z_atime, 16); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zsb), NULL, ++ &mtime, 16); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zsb), NULL, ++ &ctime, 16); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zsb), NULL, ++ &crtime, 16); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zsb), NULL, ++ &zp->z_links, 8); ++ if (S_ISBLK(ZTOI(zp)->i_mode) || S_ISCHR(ZTOI(zp)->i_mode)) ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zsb), NULL, ++ &rdev, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zsb), NULL, ++ &zp->z_acl_cached->z_acl_count, 8); ++ 
++ if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID) ++ zfs_acl_xform(zp, zp->z_acl_cached, CRED()); ++ ++ locate.cb_aclp = zp->z_acl_cached; ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zsb), ++ zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes); ++ ++ if (xattr) ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zsb), ++ NULL, &xattr, 8); ++ ++ /* if scanstamp then add scanstamp */ ++ ++ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { ++ bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, ++ scanstamp, AV_SCANSTAMP_SZ); ++ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zsb), ++ NULL, scanstamp, AV_SCANSTAMP_SZ); ++ zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; ++ } ++ ++ VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); ++ VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs, ++ count, tx) == 0); ++ if (znode_acl.z_acl_extern_obj) ++ VERIFY(0 == dmu_object_free(zsb->z_os, ++ znode_acl.z_acl_extern_obj, tx)); ++ ++ zp->z_is_sa = B_TRUE; ++ kmem_free(sa_attrs, sizeof(sa_bulk_attr_t) * 20); ++ kmem_free(bulk, sizeof(sa_bulk_attr_t) * 20); ++done: ++ if (drop_lock) ++ mutex_exit(&zp->z_lock); ++} ++ ++void ++zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) ++{ ++ if (!ZTOZSB(zp)->z_use_sa || zp->z_is_sa) ++ return; ++ ++ ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); ++ ++ if (zfs_external_acl(zp)) { ++ dmu_tx_hold_free(tx, zfs_external_acl(zp), 0, ++ DMU_OBJECT_END); ++ } ++} ++ ++EXPORT_SYMBOL(zfs_attr_table); ++EXPORT_SYMBOL(zfs_sa_readlink); ++EXPORT_SYMBOL(zfs_sa_symlink); ++EXPORT_SYMBOL(zfs_sa_get_scanstamp); ++EXPORT_SYMBOL(zfs_sa_set_scanstamp); ++EXPORT_SYMBOL(zfs_sa_get_xattr); ++EXPORT_SYMBOL(zfs_sa_set_xattr); ++EXPORT_SYMBOL(zfs_sa_upgrade); ++EXPORT_SYMBOL(zfs_sa_upgrade_txholds); ++ ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_vfsops.c linux-3.2.33-go/fs/zfs/zfs/zfs_vfsops.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_vfsops.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_vfsops.c 2012-11-16 23:25:34.350039322 +0100 +@@ -0,0 +1,1593 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++/* Portions Copyright 2010 Robert Milkowski */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "fs/fs_subr.h" ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "zfs_comutil.h" ++ ++ ++/*ARGSUSED*/ ++int ++zfs_sync(struct super_block *sb, int wait, cred_t *cr) ++{ ++ zfs_sb_t *zsb = sb->s_fs_info; ++ ++ /* ++ * Data integrity is job one. We don't want a compromised kernel ++ * writing to the storage pool, so we never sync during panic. ++ */ ++ if (unlikely(oops_in_progress)) ++ return (0); ++ ++ /* ++ * Semantically, the only requirement is that the sync be initiated. ++ * The DMU syncs out txgs frequently, so there's nothing to do. ++ */ ++ if (!wait) ++ return (0); ++ ++ if (zsb != NULL) { ++ /* ++ * Sync a specific filesystem. ++ */ ++ dsl_pool_t *dp; ++ ++ ZFS_ENTER(zsb); ++ dp = dmu_objset_pool(zsb->z_os); ++ ++ /* ++ * If the system is shutting down, then skip any ++ * filesystems which may exist on a suspended pool. ++ */ ++ if (spa_suspended(dp->dp_spa)) { ++ ZFS_EXIT(zsb); ++ return (0); ++ } ++ ++ if (zsb->z_log != NULL) ++ zil_commit(zsb->z_log, 0); ++ ++ ZFS_EXIT(zsb); ++ } else { ++ /* ++ * Sync all ZFS filesystems. This is what happens when you ++ * run sync(1M). Unlike other filesystems, ZFS honors the ++ * request by waiting for all pools to commit all dirty data. ++ */ ++ spa_sync_allpools(); ++ } ++ ++ return (0); ++} ++EXPORT_SYMBOL(zfs_sync); ++ ++boolean_t ++zfs_is_readonly(zfs_sb_t *zsb) ++{ ++ return (!!(zsb->z_sb->s_flags & MS_RDONLY)); ++} ++EXPORT_SYMBOL(zfs_is_readonly); ++ ++static void ++atime_changed_cb(void *arg, uint64_t newval) ++{ ++ ((zfs_sb_t *)arg)->z_atime = newval; ++} ++ ++static void ++xattr_changed_cb(void *arg, uint64_t newval) ++{ ++ zfs_sb_t *zsb = arg; ++ ++ if (newval == ZFS_XATTR_OFF) { ++ zsb->z_flags &= ~ZSB_XATTR; ++ } else { ++ zsb->z_flags |= ZSB_XATTR; ++ ++ if (newval == ZFS_XATTR_SA) ++ zsb->z_xattr_sa = B_TRUE; ++ else ++ zsb->z_xattr_sa = B_FALSE; ++ } ++} ++ ++static void ++blksz_changed_cb(void *arg, uint64_t newval) ++{ ++ zfs_sb_t *zsb = arg; ++ ++ if (newval < SPA_MINBLOCKSIZE || ++ newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) ++ newval = SPA_MAXBLOCKSIZE; ++ ++ zsb->z_max_blksz = newval; ++} ++ ++static void ++readonly_changed_cb(void *arg, uint64_t newval) ++{ ++ zfs_sb_t *zsb = arg; ++ struct super_block *sb = zsb->z_sb; ++ ++ if (sb == NULL) ++ return; ++ ++ if (newval) ++ sb->s_flags |= MS_RDONLY; ++ else ++ sb->s_flags &= ~MS_RDONLY; ++} ++ ++static void ++devices_changed_cb(void *arg, uint64_t newval) ++{ ++} ++ ++static void ++setuid_changed_cb(void *arg, uint64_t newval) ++{ ++} ++ ++static void ++exec_changed_cb(void *arg, uint64_t newval) ++{ ++} ++ ++static void ++nbmand_changed_cb(void *arg, uint64_t newval) ++{ ++ zfs_sb_t *zsb = arg; ++ struct super_block *sb = zsb->z_sb; ++ ++ if (sb == NULL) ++ return; ++ ++ if (newval == TRUE) ++ sb->s_flags |= MS_MANDLOCK; ++ else ++ sb->s_flags &= ~MS_MANDLOCK; ++} ++ ++static void ++snapdir_changed_cb(void *arg, uint64_t newval) ++{ ++ ((zfs_sb_t *)arg)->z_show_ctldir = newval; ++} ++ ++static void ++vscan_changed_cb(void *arg, uint64_t newval) ++{ ++ ((zfs_sb_t *)arg)->z_vscan = newval; ++} ++ ++static 
void ++acl_inherit_changed_cb(void *arg, uint64_t newval) ++{ ++ ((zfs_sb_t *)arg)->z_acl_inherit = newval; ++} ++ ++int ++zfs_register_callbacks(zfs_sb_t *zsb) ++{ ++ struct dsl_dataset *ds = NULL; ++ objset_t *os = zsb->z_os; ++ int error = 0; ++ ++ if (zfs_is_readonly(zsb) || !spa_writeable(dmu_objset_spa(os))) ++ readonly_changed_cb(zsb, B_TRUE); ++ ++ /* ++ * Register property callbacks. ++ * ++ * It would probably be fine to just check for i/o error from ++ * the first prop_register(), but I guess I like to go ++ * overboard... ++ */ ++ ds = dmu_objset_ds(os); ++ error = dsl_prop_register(ds, ++ "atime", atime_changed_cb, zsb); ++ error = error ? error : dsl_prop_register(ds, ++ "xattr", xattr_changed_cb, zsb); ++ error = error ? error : dsl_prop_register(ds, ++ "recordsize", blksz_changed_cb, zsb); ++ error = error ? error : dsl_prop_register(ds, ++ "readonly", readonly_changed_cb, zsb); ++ error = error ? error : dsl_prop_register(ds, ++ "devices", devices_changed_cb, zsb); ++ error = error ? error : dsl_prop_register(ds, ++ "setuid", setuid_changed_cb, zsb); ++ error = error ? error : dsl_prop_register(ds, ++ "exec", exec_changed_cb, zsb); ++ error = error ? error : dsl_prop_register(ds, ++ "snapdir", snapdir_changed_cb, zsb); ++ error = error ? error : dsl_prop_register(ds, ++ "aclinherit", acl_inherit_changed_cb, zsb); ++ error = error ? error : dsl_prop_register(ds, ++ "vscan", vscan_changed_cb, zsb); ++ error = error ? error : dsl_prop_register(ds, ++ "nbmand", nbmand_changed_cb, zsb); ++ if (error) ++ goto unregister; ++ ++ return (0); ++ ++unregister: ++ /* ++ * We may attempt to unregister some callbacks that are not ++ * registered, but this is OK; it will simply return ENOMSG, ++ * which we will ignore. ++ */ ++ (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zsb); ++ (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zsb); ++ (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zsb); ++ (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zsb); ++ (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zsb); ++ (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zsb); ++ (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zsb); ++ (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zsb); ++ (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, ++ zsb); ++ (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zsb); ++ (void) dsl_prop_unregister(ds, "nbmand", nbmand_changed_cb, zsb); ++ ++ return (error); ++} ++EXPORT_SYMBOL(zfs_register_callbacks); ++ ++static int ++zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, ++ uint64_t *userp, uint64_t *groupp) ++{ ++ znode_phys_t *znp = data; ++ int error = 0; ++ ++ /* ++ * Is it a valid type of object to track? 
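++ * Only DMU_OT_ZNODE and DMU_OT_SA bonus buffers carry the uid/gid
++ * information needed for space accounting.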
++ */ ++ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) ++ return (ENOENT); ++ ++ /* ++ * If we have a NULL data pointer ++ * then assume the id's aren't changing and ++ * return EEXIST to the dmu to let it know to ++ * use the same ids ++ */ ++ if (data == NULL) ++ return (EEXIST); ++ ++ if (bonustype == DMU_OT_ZNODE) { ++ *userp = znp->zp_uid; ++ *groupp = znp->zp_gid; ++ } else { ++ int hdrsize; ++ ++ ASSERT(bonustype == DMU_OT_SA); ++ hdrsize = sa_hdrsize(data); ++ ++ if (hdrsize != 0) { ++ *userp = *((uint64_t *)((uintptr_t)data + hdrsize + ++ SA_UID_OFFSET)); ++ *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + ++ SA_GID_OFFSET)); ++ } else { ++ /* ++ * This should only happen for newly created ++ * files that haven't had the znode data filled ++ * in yet. ++ */ ++ *userp = 0; ++ *groupp = 0; ++ } ++ } ++ return (error); ++} ++ ++static void ++fuidstr_to_sid(zfs_sb_t *zsb, const char *fuidstr, ++ char *domainbuf, int buflen, uid_t *ridp) ++{ ++ uint64_t fuid; ++ const char *domain; ++ ++ fuid = strtonum(fuidstr, NULL); ++ ++ domain = zfs_fuid_find_by_idx(zsb, FUID_INDEX(fuid)); ++ if (domain) ++ (void) strlcpy(domainbuf, domain, buflen); ++ else ++ domainbuf[0] = '\0'; ++ *ridp = FUID_RID(fuid); ++} ++ ++static uint64_t ++zfs_userquota_prop_to_obj(zfs_sb_t *zsb, zfs_userquota_prop_t type) ++{ ++ switch (type) { ++ case ZFS_PROP_USERUSED: ++ return (DMU_USERUSED_OBJECT); ++ case ZFS_PROP_GROUPUSED: ++ return (DMU_GROUPUSED_OBJECT); ++ case ZFS_PROP_USERQUOTA: ++ return (zsb->z_userquota_obj); ++ case ZFS_PROP_GROUPQUOTA: ++ return (zsb->z_groupquota_obj); ++ default: ++ return (ENOTSUP); ++ } ++ return (0); ++} ++ ++int ++zfs_userspace_many(zfs_sb_t *zsb, zfs_userquota_prop_t type, ++ uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) ++{ ++ int error; ++ zap_cursor_t zc; ++ zap_attribute_t za; ++ zfs_useracct_t *buf = vbuf; ++ uint64_t obj; ++ ++ if (!dmu_objset_userspace_present(zsb->z_os)) ++ return (ENOTSUP); ++ ++ obj = zfs_userquota_prop_to_obj(zsb, type); ++ if (obj == 0) { ++ *bufsizep = 0; ++ return (0); ++ } ++ ++ for (zap_cursor_init_serialized(&zc, zsb->z_os, obj, *cookiep); ++ (error = zap_cursor_retrieve(&zc, &za)) == 0; ++ zap_cursor_advance(&zc)) { ++ if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > ++ *bufsizep) ++ break; ++ ++ fuidstr_to_sid(zsb, za.za_name, ++ buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); ++ ++ buf->zu_space = za.za_first_integer; ++ buf++; ++ } ++ if (error == ENOENT) ++ error = 0; ++ ++ ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); ++ *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; ++ *cookiep = zap_cursor_serialize(&zc); ++ zap_cursor_fini(&zc); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_userspace_many); ++ ++/* ++ * buf must be big enough (eg, 32 bytes) ++ */ ++static int ++id_to_fuidstr(zfs_sb_t *zsb, const char *domain, uid_t rid, ++ char *buf, boolean_t addok) ++{ ++ uint64_t fuid; ++ int domainid = 0; ++ ++ if (domain && domain[0]) { ++ domainid = zfs_fuid_find_by_domain(zsb, domain, NULL, addok); ++ if (domainid == -1) ++ return (ENOENT); ++ } ++ fuid = FUID_ENCODE(domainid, rid); ++ (void) sprintf(buf, "%llx", (longlong_t)fuid); ++ return (0); ++} ++ ++int ++zfs_userspace_one(zfs_sb_t *zsb, zfs_userquota_prop_t type, ++ const char *domain, uint64_t rid, uint64_t *valp) ++{ ++ char buf[32]; ++ int err; ++ uint64_t obj; ++ ++ *valp = 0; ++ ++ if (!dmu_objset_userspace_present(zsb->z_os)) ++ return (ENOTSUP); ++ ++ obj = zfs_userquota_prop_to_obj(zsb, type); ++ if (obj == 0) ++ return (0); ++ 
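++	/* Encode the domain/rid pair as the FUID string key used in the ZAP lookup below. */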
++ err = id_to_fuidstr(zsb, domain, rid, buf, B_FALSE); ++ if (err) ++ return (err); ++ ++ err = zap_lookup(zsb->z_os, obj, buf, 8, 1, valp); ++ if (err == ENOENT) ++ err = 0; ++ return (err); ++} ++EXPORT_SYMBOL(zfs_userspace_one); ++ ++int ++zfs_set_userquota(zfs_sb_t *zsb, zfs_userquota_prop_t type, ++ const char *domain, uint64_t rid, uint64_t quota) ++{ ++ char buf[32]; ++ int err; ++ dmu_tx_t *tx; ++ uint64_t *objp; ++ boolean_t fuid_dirtied; ++ ++ if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) ++ return (EINVAL); ++ ++ if (zsb->z_version < ZPL_VERSION_USERSPACE) ++ return (ENOTSUP); ++ ++ objp = (type == ZFS_PROP_USERQUOTA) ? &zsb->z_userquota_obj : ++ &zsb->z_groupquota_obj; ++ ++ err = id_to_fuidstr(zsb, domain, rid, buf, B_TRUE); ++ if (err) ++ return (err); ++ fuid_dirtied = zsb->z_fuid_dirty; ++ ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); ++ if (*objp == 0) { ++ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ++ zfs_userquota_prop_prefixes[type]); ++ } ++ if (fuid_dirtied) ++ zfs_fuid_txhold(zsb, tx); ++ err = dmu_tx_assign(tx, TXG_WAIT); ++ if (err) { ++ dmu_tx_abort(tx); ++ return (err); ++ } ++ ++ mutex_enter(&zsb->z_lock); ++ if (*objp == 0) { ++ *objp = zap_create(zsb->z_os, DMU_OT_USERGROUP_QUOTA, ++ DMU_OT_NONE, 0, tx); ++ VERIFY(0 == zap_add(zsb->z_os, MASTER_NODE_OBJ, ++ zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); ++ } ++ mutex_exit(&zsb->z_lock); ++ ++ if (quota == 0) { ++ err = zap_remove(zsb->z_os, *objp, buf, tx); ++ if (err == ENOENT) ++ err = 0; ++ } else { ++ err = zap_update(zsb->z_os, *objp, buf, 8, 1, "a, tx); ++ } ++ ASSERT(err == 0); ++ if (fuid_dirtied) ++ zfs_fuid_sync(zsb, tx); ++ dmu_tx_commit(tx); ++ return (err); ++} ++EXPORT_SYMBOL(zfs_set_userquota); ++ ++boolean_t ++zfs_fuid_overquota(zfs_sb_t *zsb, boolean_t isgroup, uint64_t fuid) ++{ ++ char buf[32]; ++ uint64_t used, quota, usedobj, quotaobj; ++ int err; ++ ++ usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; ++ quotaobj = isgroup ? zsb->z_groupquota_obj : zsb->z_userquota_obj; ++ ++ if (quotaobj == 0 || zsb->z_replay) ++ return (B_FALSE); ++ ++ (void) sprintf(buf, "%llx", (longlong_t)fuid); ++ err = zap_lookup(zsb->z_os, quotaobj, buf, 8, 1, "a); ++ if (err != 0) ++ return (B_FALSE); ++ ++ err = zap_lookup(zsb->z_os, usedobj, buf, 8, 1, &used); ++ if (err != 0) ++ return (B_FALSE); ++ return (used >= quota); ++} ++EXPORT_SYMBOL(zfs_fuid_overquota); ++ ++boolean_t ++zfs_owner_overquota(zfs_sb_t *zsb, znode_t *zp, boolean_t isgroup) ++{ ++ uint64_t fuid; ++ uint64_t quotaobj; ++ ++ quotaobj = isgroup ? zsb->z_groupquota_obj : zsb->z_userquota_obj; ++ ++ fuid = isgroup ? zp->z_gid : zp->z_uid; ++ ++ if (quotaobj == 0 || zsb->z_replay) ++ return (B_FALSE); ++ ++ return (zfs_fuid_overquota(zsb, isgroup, fuid)); ++} ++EXPORT_SYMBOL(zfs_owner_overquota); ++ ++int ++zfs_sb_create(const char *osname, zfs_sb_t **zsbp) ++{ ++ objset_t *os; ++ zfs_sb_t *zsb; ++ uint64_t zval; ++ int i, error; ++ uint64_t sa_obj; ++ ++ zsb = kmem_zalloc(sizeof (zfs_sb_t), KM_SLEEP | KM_NODEBUG); ++ ++ /* ++ * We claim to always be readonly so we can open snapshots; ++ * other ZPL code will prevent us from writing to snapshots. ++ */ ++ error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zsb, &os); ++ if (error) { ++ kmem_free(zsb, sizeof (zfs_sb_t)); ++ return (error); ++ } ++ ++ /* ++ * Initialize the zfs-specific filesystem structure. 
++ * Should probably make this a kmem cache, shuffle fields, ++ * and just bzero up to z_hold_mtx[]. ++ */ ++ zsb->z_sb = NULL; ++ zsb->z_parent = zsb; ++ zsb->z_max_blksz = SPA_MAXBLOCKSIZE; ++ zsb->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; ++ zsb->z_os = os; ++ ++ error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zsb->z_version); ++ if (error) { ++ goto out; ++ } else if (zsb->z_version > ++ zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { ++ (void) printk("Can't mount a version %lld file system " ++ "on a version %lld pool\n. Pool must be upgraded to mount " ++ "this file system.", (u_longlong_t)zsb->z_version, ++ (u_longlong_t)spa_version(dmu_objset_spa(os))); ++ error = ENOTSUP; ++ goto out; ++ } ++ if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) ++ goto out; ++ zsb->z_norm = (int)zval; ++ ++ if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) ++ goto out; ++ zsb->z_utf8 = (zval != 0); ++ ++ if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) ++ goto out; ++ zsb->z_case = (uint_t)zval; ++ ++ /* ++ * Fold case on file systems that are always or sometimes case ++ * insensitive. ++ */ ++ if (zsb->z_case == ZFS_CASE_INSENSITIVE || ++ zsb->z_case == ZFS_CASE_MIXED) ++ zsb->z_norm |= U8_TEXTPREP_TOUPPER; ++ ++ zsb->z_use_fuids = USE_FUIDS(zsb->z_version, zsb->z_os); ++ zsb->z_use_sa = USE_SA(zsb->z_version, zsb->z_os); ++ ++ if (zsb->z_use_sa) { ++ /* should either have both of these objects or none */ ++ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, ++ &sa_obj); ++ if (error) ++ goto out; ++ ++ error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &zval); ++ if ((error == 0) && (zval == ZFS_XATTR_SA)) ++ zsb->z_xattr_sa = B_TRUE; ++ } else { ++ /* ++ * Pre SA versions file systems should never touch ++ * either the attribute registration or layout objects. 
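++ * In that case sa_obj stays 0, so sa_setup() below runs without a
++ * persistent attribute registration object.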
++ */ ++ sa_obj = 0; ++ } ++ ++ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, ++ &zsb->z_attr_table); ++ if (error) ++ goto out; ++ ++ if (zsb->z_version >= ZPL_VERSION_SA) ++ sa_register_update_callback(os, zfs_sa_upgrade); ++ ++ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, ++ &zsb->z_root); ++ if (error) ++ goto out; ++ ASSERT(zsb->z_root != 0); ++ ++ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, ++ &zsb->z_unlinkedobj); ++ if (error) ++ goto out; ++ ++ error = zap_lookup(os, MASTER_NODE_OBJ, ++ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], ++ 8, 1, &zsb->z_userquota_obj); ++ if (error && error != ENOENT) ++ goto out; ++ ++ error = zap_lookup(os, MASTER_NODE_OBJ, ++ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], ++ 8, 1, &zsb->z_groupquota_obj); ++ if (error && error != ENOENT) ++ goto out; ++ ++ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, ++ &zsb->z_fuid_obj); ++ if (error && error != ENOENT) ++ goto out; ++ ++ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, ++ &zsb->z_shares_dir); ++ if (error && error != ENOENT) ++ goto out; ++ ++ mutex_init(&zsb->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&zsb->z_lock, NULL, MUTEX_DEFAULT, NULL); ++ list_create(&zsb->z_all_znodes, sizeof (znode_t), ++ offsetof(znode_t, z_link_node)); ++ rrw_init(&zsb->z_teardown_lock); ++ rw_init(&zsb->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); ++ rw_init(&zsb->z_fuid_lock, NULL, RW_DEFAULT, NULL); ++ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) ++ mutex_init(&zsb->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); ++ ++ avl_create(&zsb->z_ctldir_snaps, snapentry_compare, ++ sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node)); ++ mutex_init(&zsb->z_ctldir_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ *zsbp = zsb; ++ return (0); ++ ++out: ++ dmu_objset_disown(os, zsb); ++ *zsbp = NULL; ++ kmem_free(zsb, sizeof (zfs_sb_t)); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_sb_create); ++ ++int ++zfs_sb_setup(zfs_sb_t *zsb, boolean_t mounting) ++{ ++ int error; ++ ++ error = zfs_register_callbacks(zsb); ++ if (error) ++ return (error); ++ ++ /* ++ * Set the objset user_ptr to track its zsb. ++ */ ++ mutex_enter(&zsb->z_os->os_user_ptr_lock); ++ dmu_objset_set_user(zsb->z_os, zsb); ++ mutex_exit(&zsb->z_os->os_user_ptr_lock); ++ ++ zsb->z_log = zil_open(zsb->z_os, zfs_get_data); ++ ++ /* ++ * If we are not mounting (ie: online recv), then we don't ++ * have to worry about replaying the log as we blocked all ++ * operations out since we closed the ZIL. ++ */ ++ if (mounting) { ++ boolean_t readonly; ++ ++ /* ++ * During replay we remove the read only flag to ++ * allow replays to succeed. ++ */ ++ readonly = zfs_is_readonly(zsb); ++ if (readonly != 0) ++ readonly_changed_cb(zsb, B_FALSE); ++ else ++ zfs_unlinked_drain(zsb); ++ ++ /* ++ * Parse and replay the intent log. ++ * ++ * Because of ziltest, this must be done after ++ * zfs_unlinked_drain(). (Further note: ziltest ++ * doesn't use readonly mounts, where ++ * zfs_unlinked_drain() isn't called.) This is because ++ * ziltest causes spa_sync() to think it's committed, ++ * but actually it is not, so the intent log contains ++ * many txg's worth of changes. ++ * ++ * In particular, if object N is in the unlinked set in ++ * the last txg to actually sync, then it could be ++ * actually freed in a later txg and then reallocated ++ * in a yet later txg. This would write a "create ++ * object N" record to the intent log. 
Normally, this ++ * would be fine because the spa_sync() would have ++ * written out the fact that object N is free, before ++ * we could write the "create object N" intent log ++ * record. ++ * ++ * But when we are in ziltest mode, we advance the "open ++ * txg" without actually spa_sync()-ing the changes to ++ * disk. So we would see that object N is still ++ * allocated and in the unlinked set, and there is an ++ * intent log record saying to allocate it. ++ */ ++ if (spa_writeable(dmu_objset_spa(zsb->z_os))) { ++ if (zil_replay_disable) { ++ zil_destroy(zsb->z_log, B_FALSE); ++ } else { ++ zsb->z_replay = B_TRUE; ++ zil_replay(zsb->z_os, zsb, ++ zfs_replay_vector); ++ zsb->z_replay = B_FALSE; ++ } ++ } ++ ++ /* restore readonly bit */ ++ if (readonly != 0) ++ readonly_changed_cb(zsb, B_TRUE); ++ } ++ ++ return (0); ++} ++EXPORT_SYMBOL(zfs_sb_setup); ++ ++void ++zfs_sb_free(zfs_sb_t *zsb) ++{ ++ int i; ++ ++ zfs_fuid_destroy(zsb); ++ ++ mutex_destroy(&zsb->z_znodes_lock); ++ mutex_destroy(&zsb->z_lock); ++ list_destroy(&zsb->z_all_znodes); ++ rrw_destroy(&zsb->z_teardown_lock); ++ rw_destroy(&zsb->z_teardown_inactive_lock); ++ rw_destroy(&zsb->z_fuid_lock); ++ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) ++ mutex_destroy(&zsb->z_hold_mtx[i]); ++ mutex_destroy(&zsb->z_ctldir_lock); ++ avl_destroy(&zsb->z_ctldir_snaps); ++ kmem_free(zsb, sizeof (zfs_sb_t)); ++} ++EXPORT_SYMBOL(zfs_sb_free); ++ ++static void ++zfs_set_fuid_feature(zfs_sb_t *zsb) ++{ ++ zsb->z_use_fuids = USE_FUIDS(zsb->z_version, zsb->z_os); ++ zsb->z_use_sa = USE_SA(zsb->z_version, zsb->z_os); ++} ++ ++void ++zfs_unregister_callbacks(zfs_sb_t *zsb) ++{ ++ objset_t *os = zsb->z_os; ++ struct dsl_dataset *ds; ++ ++ /* ++ * Unregister properties. ++ */ ++ if (!dmu_objset_is_snapshot(os)) { ++ ds = dmu_objset_ds(os); ++ VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, ++ zsb) == 0); ++ ++ VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, ++ zsb) == 0); ++ ++ VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, ++ zsb) == 0); ++ ++ VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, ++ zsb) == 0); ++ ++ VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb, ++ zsb) == 0); ++ ++ VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, ++ zsb) == 0); ++ ++ VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, ++ zsb) == 0); ++ ++ VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, ++ zsb) == 0); ++ ++ VERIFY(dsl_prop_unregister(ds, "aclinherit", ++ acl_inherit_changed_cb, zsb) == 0); ++ ++ VERIFY(dsl_prop_unregister(ds, "vscan", ++ vscan_changed_cb, zsb) == 0); ++ ++ VERIFY(dsl_prop_unregister(ds, "nbmand", ++ nbmand_changed_cb, zsb) == 0); ++ } ++} ++EXPORT_SYMBOL(zfs_unregister_callbacks); ++ ++#ifdef HAVE_MLSLABEL ++/* ++ * zfs_check_global_label: ++ * Check that the hex label string is appropriate for the dataset ++ * being mounted into the global_zone proper. ++ * ++ * Return an error if the hex label string is not default or ++ * admin_low/admin_high. For admin_low labels, the corresponding ++ * dataset must be readonly. ++ */ ++int ++zfs_check_global_label(const char *dsname, const char *hexsl) ++{ ++ if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) ++ return (0); ++ if (strcasecmp(hexsl, ADMIN_HIGH) == 0) ++ return (0); ++ if (strcasecmp(hexsl, ADMIN_LOW) == 0) { ++ /* must be readonly */ ++ uint64_t rdonly; ++ ++ if (dsl_prop_get_integer(dsname, ++ zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) ++ return (EACCES); ++ return (rdonly ? 
0 : EACCES); ++ } ++ return (EACCES); ++} ++EXPORT_SYMBOL(zfs_check_global_label); ++#endif /* HAVE_MLSLABEL */ ++ ++int ++zfs_statvfs(struct dentry *dentry, struct kstatfs *statp) ++{ ++ zfs_sb_t *zsb = dentry->d_sb->s_fs_info; ++ uint64_t refdbytes, availbytes, usedobjs, availobjs; ++ uint64_t fsid; ++ uint32_t bshift; ++ ++ ZFS_ENTER(zsb); ++ ++ dmu_objset_space(zsb->z_os, ++ &refdbytes, &availbytes, &usedobjs, &availobjs); ++ ++ fsid = dmu_objset_fsid_guid(zsb->z_os); ++ /* ++ * The underlying storage pool actually uses multiple block ++ * size. Under Solaris frsize (fragment size) is reported as ++ * the smallest block size we support, and bsize (block size) ++ * as the filesystem's maximum block size. Unfortunately, ++ * under Linux the fragment size and block size are often used ++ * interchangeably. Thus we are forced to report both of them ++ * as the filesystem's maximum block size. ++ */ ++ statp->f_frsize = zsb->z_max_blksz; ++ statp->f_bsize = zsb->z_max_blksz; ++ bshift = fls(statp->f_bsize) - 1; ++ ++ /* ++ * The following report "total" blocks of various kinds in ++ * the file system, but reported in terms of f_bsize - the ++ * "preferred" size. ++ */ ++ ++ statp->f_blocks = (refdbytes + availbytes) >> bshift; ++ statp->f_bfree = availbytes >> bshift; ++ statp->f_bavail = statp->f_bfree; /* no root reservation */ ++ ++ /* ++ * statvfs() should really be called statufs(), because it assumes ++ * static metadata. ZFS doesn't preallocate files, so the best ++ * we can do is report the max that could possibly fit in f_files, ++ * and that minus the number actually used in f_ffree. ++ * For f_ffree, report the smaller of the number of object available ++ * and the number of blocks (each object will take at least a block). ++ */ ++ statp->f_ffree = MIN(availobjs, availbytes >> DNODE_SHIFT); ++ statp->f_files = statp->f_ffree + usedobjs; ++ statp->f_fsid.val[0] = (uint32_t)fsid; ++ statp->f_fsid.val[1] = (uint32_t)(fsid >> 32); ++ statp->f_type = ZFS_SUPER_MAGIC; ++ statp->f_namelen = ZFS_MAXNAMELEN; ++ ++ /* ++ * We have all of 40 characters to stuff a string here. ++ * Is there anything useful we could/should provide? ++ */ ++ bzero(statp->f_spare, sizeof (statp->f_spare)); ++ ++ ZFS_EXIT(zsb); ++ return (0); ++} ++EXPORT_SYMBOL(zfs_statvfs); ++ ++int ++zfs_root(zfs_sb_t *zsb, struct inode **ipp) ++{ ++ znode_t *rootzp; ++ int error; ++ ++ ZFS_ENTER(zsb); ++ ++ error = zfs_zget(zsb, zsb->z_root, &rootzp); ++ if (error == 0) ++ *ipp = ZTOI(rootzp); ++ ++ ZFS_EXIT(zsb); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_root); ++ ++#ifdef HAVE_SHRINK ++int ++zfs_sb_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) ++{ ++ zfs_sb_t *zsb = sb->s_fs_info; ++ struct shrinker *shrinker = &sb->s_shrink; ++ struct shrink_control sc = { ++ .nr_to_scan = nr_to_scan, ++ .gfp_mask = GFP_KERNEL, ++ }; ++ ++ ZFS_ENTER(zsb); ++ *objects = (*shrinker->shrink)(shrinker, &sc); ++ ZFS_EXIT(zsb); ++ ++ return (0); ++} ++EXPORT_SYMBOL(zfs_sb_prune); ++#endif /* HAVE_SHRINK */ ++ ++/* ++ * Teardown the zfs_sb_t::z_os. ++ * ++ * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' ++ * and 'z_teardown_inactive_lock' held. 
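++ * When 'unmounting' is B_TRUE both locks are dropped again before
++ * returning.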
++ */ ++int ++zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) ++{ ++ znode_t *zp; ++ ++ rrw_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG); ++ ++ if (!unmounting) { ++ /* ++ * We purge the parent filesystem's super block as the ++ * parent filesystem and all of its snapshots have their ++ * inode's super block set to the parent's filesystem's ++ * super block. Note, 'z_parent' is self referential ++ * for non-snapshots. ++ */ ++ shrink_dcache_sb(zsb->z_parent->z_sb); ++ (void) spl_invalidate_inodes(zsb->z_parent->z_sb, 0); ++ } ++ ++ /* ++ * Drain the iput_taskq to ensure all active references to the ++ * zfs_sb_t have been handled only then can it be safely destroyed. ++ */ ++ taskq_wait(dsl_pool_iput_taskq(dmu_objset_pool(zsb->z_os))); ++ ++ /* ++ * Close the zil. NB: Can't close the zil while zfs_inactive ++ * threads are blocked as zil_close can call zfs_inactive. ++ */ ++ if (zsb->z_log) { ++ zil_close(zsb->z_log); ++ zsb->z_log = NULL; ++ } ++ ++ rw_enter(&zsb->z_teardown_inactive_lock, RW_WRITER); ++ ++ /* ++ * If we are not unmounting (ie: online recv) and someone already ++ * unmounted this file system while we were doing the switcheroo, ++ * or a reopen of z_os failed then just bail out now. ++ */ ++ if (!unmounting && (zsb->z_unmounted || zsb->z_os == NULL)) { ++ rw_exit(&zsb->z_teardown_inactive_lock); ++ rrw_exit(&zsb->z_teardown_lock, FTAG); ++ return (EIO); ++ } ++ ++ /* ++ * At this point there are no vops active, and any new vops will ++ * fail with EIO since we have z_teardown_lock for writer (only ++ * relavent for forced unmount). ++ * ++ * Release all holds on dbufs. ++ */ ++ mutex_enter(&zsb->z_znodes_lock); ++ for (zp = list_head(&zsb->z_all_znodes); zp != NULL; ++ zp = list_next(&zsb->z_all_znodes, zp)) ++ if (zp->z_sa_hdl) { ++ ASSERT(atomic_read(&ZTOI(zp)->i_count) > 0); ++ zfs_znode_dmu_fini(zp); ++ } ++ mutex_exit(&zsb->z_znodes_lock); ++ ++ /* ++ * If we are unmounting, set the unmounted flag and let new vops ++ * unblock. zfs_inactive will have the unmounted behavior, and all ++ * other vops will fail with EIO. ++ */ ++ if (unmounting) { ++ zsb->z_unmounted = B_TRUE; ++ rrw_exit(&zsb->z_teardown_lock, FTAG); ++ rw_exit(&zsb->z_teardown_inactive_lock); ++ } ++ ++ /* ++ * z_os will be NULL if there was an error in attempting to reopen ++ * zsb, so just return as the properties had already been ++ * ++ * unregistered and cached data had been evicted before. ++ */ ++ if (zsb->z_os == NULL) ++ return (0); ++ ++ /* ++ * Unregister properties. 
++ */ ++ zfs_unregister_callbacks(zsb); ++ ++ /* ++ * Evict cached data ++ */ ++ if (dsl_dataset_is_dirty(dmu_objset_ds(zsb->z_os)) && ++ !zfs_is_readonly(zsb)) ++ txg_wait_synced(dmu_objset_pool(zsb->z_os), 0); ++ (void) dmu_objset_evict_dbufs(zsb->z_os); ++ ++ return (0); ++} ++EXPORT_SYMBOL(zfs_sb_teardown); ++ ++#if defined(HAVE_BDI) && !defined(HAVE_BDI_SETUP_AND_REGISTER) ++atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0); ++#endif /* HAVE_BDI && !HAVE_BDI_SETUP_AND_REGISTER */ ++ ++int ++zfs_domount(struct super_block *sb, void *data, int silent) ++{ ++ zpl_mount_data_t *zmd = data; ++ const char *osname = zmd->z_osname; ++ zfs_sb_t *zsb; ++ struct inode *root_inode; ++ uint64_t recordsize; ++ int error; ++ ++ error = zfs_sb_create(osname, &zsb); ++ if (error) ++ return (error); ++ ++ if ((error = dsl_prop_get_integer(osname, "recordsize", ++ &recordsize, NULL))) ++ goto out; ++ ++ zsb->z_sb = sb; ++ sb->s_fs_info = zsb; ++ sb->s_magic = ZFS_SUPER_MAGIC; ++ sb->s_maxbytes = MAX_LFS_FILESIZE; ++ sb->s_time_gran = 1; ++ sb->s_blocksize = recordsize; ++ sb->s_blocksize_bits = ilog2(recordsize); ++ ++#ifdef HAVE_BDI ++ /* ++ * 2.6.32 API change, ++ * Added backing_device_info (BDI) per super block interfaces. A BDI ++ * must be configured when using a non-device backed filesystem for ++ * proper writeback. This is not required for older pdflush kernels. ++ * ++ * NOTE: Linux read-ahead is disabled in favor of zfs read-ahead. ++ */ ++ zsb->z_bdi.ra_pages = 0; ++ sb->s_bdi = &zsb->z_bdi; ++ ++ error = -bdi_setup_and_register(&zsb->z_bdi, "zfs", BDI_CAP_MAP_COPY); ++ if (error) ++ goto out; ++#endif /* HAVE_BDI */ ++ ++ /* Set callback operations for the file system. */ ++ sb->s_op = &zpl_super_operations; ++ sb->s_xattr = zpl_xattr_handlers; ++ sb->s_export_op = &zpl_export_operations; ++ ++ /* Set features for file system. */ ++ zfs_set_fuid_feature(zsb); ++ ++ if (dmu_objset_is_snapshot(zsb->z_os)) { ++ uint64_t pval; ++ ++ atime_changed_cb(zsb, B_FALSE); ++ readonly_changed_cb(zsb, B_TRUE); ++ if ((error = dsl_prop_get_integer(osname,"xattr",&pval,NULL))) ++ goto out; ++ xattr_changed_cb(zsb, pval); ++ zsb->z_issnap = B_TRUE; ++ zsb->z_os->os_sync = ZFS_SYNC_DISABLED; ++ ++ mutex_enter(&zsb->z_os->os_user_ptr_lock); ++ dmu_objset_set_user(zsb->z_os, zsb); ++ mutex_exit(&zsb->z_os->os_user_ptr_lock); ++ } else { ++ error = zfs_sb_setup(zsb, B_TRUE); ++ } ++ ++ /* Allocate a root inode for the filesystem. */ ++ error = zfs_root(zsb, &root_inode); ++ if (error) { ++ (void) zfs_umount(sb); ++ goto out; ++ } ++ ++ /* Allocate a root dentry for the filesystem */ ++ sb->s_root = d_make_root(root_inode); ++ if (sb->s_root == NULL) { ++ (void) zfs_umount(sb); ++ error = ENOMEM; ++ goto out; ++ } ++ ++ if (!zsb->z_issnap) ++ zfsctl_create(zsb); ++out: ++ if (error) { ++ dmu_objset_disown(zsb->z_os, zsb); ++ zfs_sb_free(zsb); ++ } ++ ++ return (error); ++} ++EXPORT_SYMBOL(zfs_domount); ++ ++/* ++ * Called when an unmount is requested and certain sanity checks have ++ * already passed. At this point no dentries or inodes have been reclaimed ++ * from their respective caches. We drop the extra reference on the .zfs ++ * control directory to allow everything to be reclaimed. All snapshots ++ * must already have been unmounted to reach this point. 
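++ * The remaining teardown work happens later in zfs_umount().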
++ */ ++void ++zfs_preumount(struct super_block *sb) ++{ ++ zfs_sb_t *zsb = sb->s_fs_info; ++ ++ if (zsb != NULL && zsb->z_ctldir != NULL) ++ zfsctl_destroy(zsb); ++} ++EXPORT_SYMBOL(zfs_preumount); ++ ++/* ++ * Called once all other unmount released tear down has occurred. ++ * It is our responsibility to release any remaining infrastructure. ++ */ ++/*ARGSUSED*/ ++int ++zfs_umount(struct super_block *sb) ++{ ++ zfs_sb_t *zsb = sb->s_fs_info; ++ objset_t *os; ++ ++ VERIFY(zfs_sb_teardown(zsb, B_TRUE) == 0); ++ os = zsb->z_os; ++ ++#ifdef HAVE_BDI ++ bdi_destroy(sb->s_bdi); ++#endif /* HAVE_BDI */ ++ ++ /* ++ * z_os will be NULL if there was an error in ++ * attempting to reopen zsb. ++ */ ++ if (os != NULL) { ++ /* ++ * Unset the objset user_ptr. ++ */ ++ mutex_enter(&os->os_user_ptr_lock); ++ dmu_objset_set_user(os, NULL); ++ mutex_exit(&os->os_user_ptr_lock); ++ ++ /* ++ * Finally release the objset ++ */ ++ dmu_objset_disown(os, zsb); ++ } ++ ++ zfs_sb_free(zsb); ++ return (0); ++} ++EXPORT_SYMBOL(zfs_umount); ++ ++int ++zfs_remount(struct super_block *sb, int *flags, char *data) ++{ ++ /* ++ * All namespace flags (MNT_*) and super block flags (MS_*) will ++ * be handled by the Linux VFS. Only handle custom options here. ++ */ ++ return (0); ++} ++EXPORT_SYMBOL(zfs_remount); ++ ++int ++zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) ++{ ++ zfs_sb_t *zsb = sb->s_fs_info; ++ znode_t *zp; ++ uint64_t object = 0; ++ uint64_t fid_gen = 0; ++ uint64_t gen_mask; ++ uint64_t zp_gen; ++ int i, err; ++ ++ *ipp = NULL; ++ ++ ZFS_ENTER(zsb); ++ ++ if (fidp->fid_len == LONG_FID_LEN) { ++ zfid_long_t *zlfid = (zfid_long_t *)fidp; ++ uint64_t objsetid = 0; ++ uint64_t setgen = 0; ++ ++ for (i = 0; i < sizeof (zlfid->zf_setid); i++) ++ objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); ++ ++ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) ++ setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); ++ ++ ZFS_EXIT(zsb); ++ ++ err = zfsctl_lookup_objset(sb, objsetid, &zsb); ++ if (err) ++ return (EINVAL); ++ ++ ZFS_ENTER(zsb); ++ } ++ ++ if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { ++ zfid_short_t *zfid = (zfid_short_t *)fidp; ++ ++ for (i = 0; i < sizeof (zfid->zf_object); i++) ++ object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); ++ ++ for (i = 0; i < sizeof (zfid->zf_gen); i++) ++ fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); ++ } else { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ /* A zero fid_gen means we are in the .zfs control directories */ ++ if (fid_gen == 0 && ++ (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { ++ *ipp = zsb->z_ctldir; ++ ASSERT(*ipp != NULL); ++ if (object == ZFSCTL_INO_SNAPDIR) { ++ VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, ++ 0, kcred, NULL, NULL) == 0); ++ } else { ++ igrab(*ipp); ++ } ++ ZFS_EXIT(zsb); ++ return (0); ++ } ++ ++ gen_mask = -1ULL >> (64 - 8 * i); ++ ++ dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); ++ if ((err = zfs_zget(zsb, object, &zp))) { ++ ZFS_EXIT(zsb); ++ return (err); ++ } ++ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zsb), &zp_gen, ++ sizeof (uint64_t)); ++ zp_gen = zp_gen & gen_mask; ++ if (zp_gen == 0) ++ zp_gen = 1; ++ if (zp->z_unlinked || zp_gen != fid_gen) { ++ dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); ++ iput(ZTOI(zp)); ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ *ipp = ZTOI(zp); ++ if (*ipp) ++ zfs_inode_update(ITOZ(*ipp)); ++ ++ ZFS_EXIT(zsb); ++ return (0); ++} ++EXPORT_SYMBOL(zfs_vget); ++ ++/* ++ * Block out VOPs 
and close zfs_sb_t::z_os ++ * ++ * Note, if successful, then we return with the 'z_teardown_lock' and ++ * 'z_teardown_inactive_lock' write held. ++ */ ++int ++zfs_suspend_fs(zfs_sb_t *zsb) ++{ ++ int error; ++ ++ if ((error = zfs_sb_teardown(zsb, B_FALSE)) != 0) ++ return (error); ++ dmu_objset_disown(zsb->z_os, zsb); ++ ++ return (0); ++} ++EXPORT_SYMBOL(zfs_suspend_fs); ++ ++/* ++ * Reopen zfs_sb_t::z_os and release VOPs. ++ */ ++int ++zfs_resume_fs(zfs_sb_t *zsb, const char *osname) ++{ ++ int err, err2; ++ ++ ASSERT(RRW_WRITE_HELD(&zsb->z_teardown_lock)); ++ ASSERT(RW_WRITE_HELD(&zsb->z_teardown_inactive_lock)); ++ ++ err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zsb, &zsb->z_os); ++ if (err) { ++ zsb->z_os = NULL; ++ } else { ++ znode_t *zp; ++ uint64_t sa_obj = 0; ++ ++ err2 = zap_lookup(zsb->z_os, MASTER_NODE_OBJ, ++ ZFS_SA_ATTRS, 8, 1, &sa_obj); ++ ++ if ((err || err2) && zsb->z_version >= ZPL_VERSION_SA) ++ goto bail; ++ ++ ++ if ((err = sa_setup(zsb->z_os, sa_obj, ++ zfs_attr_table, ZPL_END, &zsb->z_attr_table)) != 0) ++ goto bail; ++ ++ VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0); ++ ++ /* ++ * Attempt to re-establish all the active znodes with ++ * their dbufs. If a zfs_rezget() fails, then we'll let ++ * any potential callers discover that via ZFS_ENTER_VERIFY_VP ++ * when they try to use their znode. ++ */ ++ mutex_enter(&zsb->z_znodes_lock); ++ for (zp = list_head(&zsb->z_all_znodes); zp; ++ zp = list_next(&zsb->z_all_znodes, zp)) { ++ (void) zfs_rezget(zp); ++ } ++ mutex_exit(&zsb->z_znodes_lock); ++ ++ } ++ ++bail: ++ /* release the VOPs */ ++ rw_exit(&zsb->z_teardown_inactive_lock); ++ rrw_exit(&zsb->z_teardown_lock, FTAG); ++ ++ if (err) { ++ /* ++ * Since we couldn't reopen zfs_sb_t::z_os, force ++ * unmount this file system. ++ */ ++ (void) zfs_umount(zsb->z_sb); ++ } ++ return (err); ++} ++EXPORT_SYMBOL(zfs_resume_fs); ++ ++int ++zfs_set_version(zfs_sb_t *zsb, uint64_t newvers) ++{ ++ int error; ++ objset_t *os = zsb->z_os; ++ dmu_tx_t *tx; ++ ++ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) ++ return (EINVAL); ++ ++ if (newvers < zsb->z_version) ++ return (EINVAL); ++ ++ if (zfs_spa_version_map(newvers) > ++ spa_version(dmu_objset_spa(zsb->z_os))) ++ return (ENOTSUP); ++ ++ tx = dmu_tx_create(os); ++ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); ++ if (newvers >= ZPL_VERSION_SA && !zsb->z_use_sa) { ++ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ++ ZFS_SA_ATTRS); ++ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); ++ } ++ error = dmu_tx_assign(tx, TXG_WAIT); ++ if (error) { ++ dmu_tx_abort(tx); ++ return (error); ++ } ++ ++ error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, ++ 8, 1, &newvers, tx); ++ ++ if (error) { ++ dmu_tx_commit(tx); ++ return (error); ++ } ++ ++ if (newvers >= ZPL_VERSION_SA && !zsb->z_use_sa) { ++ uint64_t sa_obj; ++ ++ ASSERT3U(spa_version(dmu_objset_spa(zsb->z_os)), >=, ++ SPA_VERSION_SA); ++ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, ++ DMU_OT_NONE, 0, tx); ++ ++ error = zap_add(os, MASTER_NODE_OBJ, ++ ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ++ ASSERT3U(error, ==, 0); ++ ++ VERIFY(0 == sa_set_sa_object(os, sa_obj)); ++ sa_register_update_callback(os, zfs_sa_upgrade); ++ } ++ ++ spa_history_log_internal(LOG_DS_UPGRADE, ++ dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu", ++ zsb->z_version, newvers, dmu_objset_id(os)); ++ ++ dmu_tx_commit(tx); ++ ++ zsb->z_version = newvers; ++ ++ if (zsb->z_version >= ZPL_VERSION_FUID) ++ zfs_set_fuid_feature(zsb); ++ ++ return (0); ++} 
++EXPORT_SYMBOL(zfs_set_version); ++ ++/* ++ * Read a property stored within the master node. ++ */ ++int ++zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) ++{ ++ const char *pname; ++ int error = ENOENT; ++ ++ /* ++ * Look up the file system's value for the property. For the ++ * version property, we look up a slightly different string. ++ */ ++ if (prop == ZFS_PROP_VERSION) ++ pname = ZPL_VERSION_STR; ++ else ++ pname = zfs_prop_to_name(prop); ++ ++ if (os != NULL) ++ error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); ++ ++ if (error == ENOENT) { ++ /* No value set, use the default value */ ++ switch (prop) { ++ case ZFS_PROP_VERSION: ++ *value = ZPL_VERSION; ++ break; ++ case ZFS_PROP_NORMALIZE: ++ case ZFS_PROP_UTF8ONLY: ++ *value = 0; ++ break; ++ case ZFS_PROP_CASE: ++ *value = ZFS_CASE_SENSITIVE; ++ break; ++ default: ++ return (error); ++ } ++ error = 0; ++ } ++ return (error); ++} ++EXPORT_SYMBOL(zfs_get_zplprop); ++ ++void ++zfs_init(void) ++{ ++ zfsctl_init(); ++ zfs_znode_init(); ++ dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); ++ register_filesystem(&zpl_fs_type); ++ (void) arc_add_prune_callback(zpl_prune_sbs, NULL); ++} ++ ++void ++zfs_fini(void) ++{ ++ unregister_filesystem(&zpl_fs_type); ++ zfs_znode_fini(); ++ zfsctl_fini(); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_vnops.c linux-3.2.33-go/fs/zfs/zfs/zfs_vnops.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_vnops.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_vnops.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,4466 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* Portions Copyright 2007 Jeremy Teo */ ++/* Portions Copyright 2010 Robert Milkowski */ ++ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "fs/fs_subr.h" ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Programming rules. ++ * ++ * Each vnode op performs some logical unit of work. To do this, the ZPL must ++ * properly lock its in-core state, create a DMU transaction, do the work, ++ * record this work in the intent log (ZIL), commit the DMU transaction, ++ * and wait for the intent log to commit if it is a synchronous operation. 
++ * Moreover, the vnode ops must work in both normal and log replay context. ++ * The ordering of events is important to avoid deadlocks and references ++ * to freed memory. The example below illustrates the following Big Rules: ++ * ++ * (1) A check must be made in each zfs thread for a mounted file system. ++ * This is done avoiding races using ZFS_ENTER(zsb). ++ * A ZFS_EXIT(zsb) is needed before all returns. Any znodes ++ * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros ++ * can return EIO from the calling function. ++ * ++ * (2) iput() should always be the last thing except for zil_commit() ++ * (if necessary) and ZFS_EXIT(). This is for 3 reasons: ++ * First, if it's the last reference, the vnode/znode ++ * can be freed, so the zp may point to freed memory. Second, the last ++ * reference will call zfs_zinactive(), which may induce a lot of work -- ++ * pushing cached pages (which acquires range locks) and syncing out ++ * cached atime changes. Third, zfs_zinactive() may require a new tx, ++ * which could deadlock the system if you were already holding one. ++ * If you must call iput() within a tx then use iput_ASYNC(). ++ * ++ * (3) All range locks must be grabbed before calling dmu_tx_assign(), ++ * as they can span dmu_tx_assign() calls. ++ * ++ * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign(). ++ * This is critical because we don't want to block while holding locks. ++ * Note, in particular, that if a lock is sometimes acquired before ++ * the tx assigns, and sometimes after (e.g. z_lock), then failing to ++ * use a non-blocking assign can deadlock the system. The scenario: ++ * ++ * Thread A has grabbed a lock before calling dmu_tx_assign(). ++ * Thread B is in an already-assigned tx, and blocks for this lock. ++ * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() ++ * forever, because the previous txg can't quiesce until B's tx commits. ++ * ++ * If dmu_tx_assign() returns ERESTART and zsb->z_assign is TXG_NOWAIT, ++ * then drop all locks, call dmu_tx_wait(), and try again. ++ * ++ * (5) If the operation succeeded, generate the intent log entry for it ++ * before dropping locks. This ensures that the ordering of events ++ * in the intent log matches the order in which they actually occurred. ++ * During ZIL replay the zfs_log_* functions will update the sequence ++ * number to indicate the zil transaction has replayed. ++ * ++ * (6) At the end of each vnode op, the DMU tx must always commit, ++ * regardless of whether there were any errors. ++ * ++ * (7) After dropping all locks, invoke zil_commit(zilog, foid) ++ * to ensure that synchronous semantics are provided when necessary. ++ * ++ * In general, this is how things should be ordered in each vnode op: ++ * ++ * ZFS_ENTER(zsb); // exit if unmounted ++ * top: ++ * zfs_dirent_lock(&dl, ...) 
// lock directory entry (may igrab()) ++ * rw_enter(...); // grab any other locks you need ++ * tx = dmu_tx_create(...); // get DMU tx ++ * dmu_tx_hold_*(); // hold each object you might modify ++ * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign ++ * if (error) { ++ * rw_exit(...); // drop locks ++ * zfs_dirent_unlock(dl); // unlock directory entry ++ * iput(...); // release held vnodes ++ * if (error == ERESTART) { ++ * dmu_tx_wait(tx); ++ * dmu_tx_abort(tx); ++ * goto top; ++ * } ++ * dmu_tx_abort(tx); // abort DMU tx ++ * ZFS_EXIT(zsb); // finished in zfs ++ * return (error); // really out of space ++ * } ++ * error = do_real_work(); // do whatever this VOP does ++ * if (error == 0) ++ * zfs_log_*(...); // on success, make ZIL entry ++ * dmu_tx_commit(tx); // commit DMU tx -- error or not ++ * rw_exit(...); // drop locks ++ * zfs_dirent_unlock(dl); // unlock directory entry ++ * iput(...); // release held vnodes ++ * zil_commit(zilog, foid); // synchronous when necessary ++ * ZFS_EXIT(zsb); // finished in zfs ++ * return (error); // done, report error ++ */ ++ ++/* ++ * Virus scanning is unsupported. It would be possible to add a hook ++ * here to performance the required virus scan. This could be done ++ * entirely in the kernel or potentially as an update to invoke a ++ * scanning utility. ++ */ ++static int ++zfs_vscan(struct inode *ip, cred_t *cr, int async) ++{ ++ return (0); ++} ++ ++/* ARGSUSED */ ++int ++zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ /* Honor ZFS_APPENDONLY file attribute */ ++ if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ++ ((flag & O_APPEND) == 0)) { ++ ZFS_EXIT(zsb); ++ return (EPERM); ++ } ++ ++ /* Virus scan eligible files on open */ ++ if (!zfs_has_ctldir(zp) && zsb->z_vscan && S_ISREG(ip->i_mode) && ++ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { ++ if (zfs_vscan(ip, cr, 0) != 0) { ++ ZFS_EXIT(zsb); ++ return (EACCES); ++ } ++ } ++ ++ /* Keep a count of the synchronous opens in the znode */ ++ if (flag & O_SYNC) ++ atomic_inc_32(&zp->z_sync_cnt); ++ ++ ZFS_EXIT(zsb); ++ return (0); ++} ++EXPORT_SYMBOL(zfs_open); ++ ++/* ARGSUSED */ ++int ++zfs_close(struct inode *ip, int flag, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ /* ++ * Zero the synchronous opens in the znode. Under Linux the ++ * zfs_close() hook is not symmetric with zfs_open(), it is ++ * only called once when the last reference is dropped. ++ */ ++ if (flag & O_SYNC) ++ zp->z_sync_cnt = 0; ++ ++ if (!zfs_has_ctldir(zp) && zsb->z_vscan && S_ISREG(ip->i_mode) && ++ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) ++ VERIFY(zfs_vscan(ip, cr, 1) == 0); ++ ++ ZFS_EXIT(zsb); ++ return (0); ++} ++EXPORT_SYMBOL(zfs_close); ++ ++#if defined(_KERNEL) ++/* ++ * When a file is memory mapped, we must keep the IO data synchronized ++ * between the DMU cache and the memory mapped pages. What this means: ++ * ++ * On Write: If we find a memory mapped page, we write to *both* ++ * the page and the dmu buffer. 
++ */ ++static void ++update_pages(struct inode *ip, int64_t start, int len, ++ objset_t *os, uint64_t oid) ++{ ++ struct address_space *mp = ip->i_mapping; ++ struct page *pp; ++ uint64_t nbytes; ++ int64_t off; ++ void *pb; ++ ++ off = start & (PAGE_CACHE_SIZE-1); ++ for (start &= PAGE_CACHE_MASK; len > 0; start += PAGE_CACHE_SIZE) { ++ nbytes = MIN(PAGE_CACHE_SIZE - off, len); ++ ++ pp = find_lock_page(mp, start >> PAGE_CACHE_SHIFT); ++ if (pp) { ++ if (mapping_writably_mapped(mp)) ++ flush_dcache_page(pp); ++ ++ pb = kmap(pp); ++ (void) dmu_read(os, oid, start+off, nbytes, pb+off, ++ DMU_READ_PREFETCH); ++ kunmap(pp); ++ ++ if (mapping_writably_mapped(mp)) ++ flush_dcache_page(pp); ++ ++ mark_page_accessed(pp); ++ SetPageUptodate(pp); ++ ClearPageError(pp); ++ unlock_page(pp); ++ page_cache_release(pp); ++ } ++ ++ len -= nbytes; ++ off = 0; ++ } ++} ++ ++/* ++ * When a file is memory mapped, we must keep the IO data synchronized ++ * between the DMU cache and the memory mapped pages. What this means: ++ * ++ * On Read: We "read" preferentially from memory mapped pages, ++ * else we default from the dmu buffer. ++ * ++ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when ++ * the file is memory mapped. ++ */ ++static int ++mappedread(struct inode *ip, int nbytes, uio_t *uio) ++{ ++ struct address_space *mp = ip->i_mapping; ++ struct page *pp; ++ znode_t *zp = ITOZ(ip); ++ objset_t *os = ITOZSB(ip)->z_os; ++ int64_t start, off; ++ uint64_t bytes; ++ int len = nbytes; ++ int error = 0; ++ void *pb; ++ ++ start = uio->uio_loffset; ++ off = start & (PAGE_CACHE_SIZE-1); ++ for (start &= PAGE_CACHE_MASK; len > 0; start += PAGE_CACHE_SIZE) { ++ bytes = MIN(PAGE_CACHE_SIZE - off, len); ++ ++ pp = find_lock_page(mp, start >> PAGE_CACHE_SHIFT); ++ if (pp) { ++ ASSERT(PageUptodate(pp)); ++ ++ pb = kmap(pp); ++ error = uiomove(pb + off, bytes, UIO_READ, uio); ++ kunmap(pp); ++ ++ if (mapping_writably_mapped(mp)) ++ flush_dcache_page(pp); ++ ++ mark_page_accessed(pp); ++ unlock_page(pp); ++ page_cache_release(pp); ++ } else { ++ error = dmu_read_uio(os, zp->z_id, uio, bytes); ++ } ++ ++ len -= bytes; ++ off = 0; ++ if (error) ++ break; ++ } ++ return (error); ++} ++#endif /* _KERNEL */ ++ ++unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */ ++ ++/* ++ * Read bytes from specified file into supplied buffer. ++ * ++ * IN: ip - inode of file to be read from. ++ * uio - structure supplying read location, range info, ++ * and return buffer. ++ * ioflag - FSYNC flags; used to provide FRSYNC semantics. ++ * O_DIRECT flag; used to bypass page cache. ++ * cr - credentials of caller. ++ * ++ * OUT: uio - updated offset and range, buffer filled. 
++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Side Effects: ++ * inode - atime updated if byte count > 0 ++ */ ++/* ARGSUSED */ ++int ++zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ objset_t *os; ++ ssize_t n, nbytes; ++ int error = 0; ++ rl_t *rl; ++#ifdef HAVE_UIO_ZEROCOPY ++ xuio_t *xuio = NULL; ++#endif /* HAVE_UIO_ZEROCOPY */ ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ os = zsb->z_os; ++ ++ if (zp->z_pflags & ZFS_AV_QUARANTINED) { ++ ZFS_EXIT(zsb); ++ return (EACCES); ++ } ++ ++ /* ++ * Validate file offset ++ */ ++ if (uio->uio_loffset < (offset_t)0) { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ /* ++ * Fasttrack empty reads ++ */ ++ if (uio->uio_resid == 0) { ++ ZFS_EXIT(zsb); ++ return (0); ++ } ++ ++ /* ++ * Check for mandatory locks ++ */ ++ if (mandatory_lock(ip) && ++ !lock_may_read(ip, uio->uio_loffset, uio->uio_resid)) { ++ ZFS_EXIT(zsb); ++ return (EAGAIN); ++ } ++ ++ /* ++ * If we're in FRSYNC mode, sync out this znode before reading it. ++ */ ++ if (ioflag & FRSYNC || zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ++ zil_commit(zsb->z_log, zp->z_id); ++ ++ /* ++ * Lock the range against changes. ++ */ ++ rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); ++ ++ /* ++ * If we are reading past end-of-file we can skip ++ * to the end; but we might still need to set atime. ++ */ ++ if (uio->uio_loffset >= zp->z_size) { ++ error = 0; ++ goto out; ++ } ++ ++ ASSERT(uio->uio_loffset < zp->z_size); ++ n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); ++ ++#ifdef HAVE_UIO_ZEROCOPY ++ if ((uio->uio_extflg == UIO_XUIO) && ++ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { ++ int nblk; ++ int blksz = zp->z_blksz; ++ uint64_t offset = uio->uio_loffset; ++ ++ xuio = (xuio_t *)uio; ++ if ((ISP2(blksz))) { ++ nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, ++ blksz)) / blksz; ++ } else { ++ ASSERT(offset + n <= blksz); ++ nblk = 1; ++ } ++ (void) dmu_xuio_init(xuio, nblk); ++ ++ if (vn_has_cached_data(ip)) { ++ /* ++ * For simplicity, we always allocate a full buffer ++ * even if we only expect to read a portion of a block. ++ */ ++ while (--nblk >= 0) { ++ (void) dmu_xuio_add(xuio, ++ dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), ++ blksz), 0, blksz); ++ } ++ } ++ } ++#endif /* HAVE_UIO_ZEROCOPY */ ++ ++ while (n > 0) { ++ nbytes = MIN(n, zfs_read_chunk_size - ++ P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); ++ ++ if (zp->z_is_mapped && !(ioflag & O_DIRECT)) ++ error = mappedread(ip, nbytes, uio); ++ else ++ error = dmu_read_uio(os, zp->z_id, uio, nbytes); ++ ++ if (error) { ++ /* convert checksum errors into IO errors */ ++ if (error == ECKSUM) ++ error = EIO; ++ break; ++ } ++ ++ n -= nbytes; ++ } ++out: ++ zfs_range_unlock(rl); ++ ++ ZFS_ACCESSTIME_STAMP(zsb, zp); ++ zfs_inode_update(zp); ++ ZFS_EXIT(zsb); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_read); ++ ++/* ++ * Write the bytes to a file. ++ * ++ * IN: ip - inode of file to be written to. ++ * uio - structure supplying write location, range info, ++ * and data buffer. ++ * ioflag - FAPPEND flag set if in append mode. ++ * O_DIRECT flag; used to bypass page cache. ++ * cr - credentials of caller. ++ * ++ * OUT: uio - updated offset and range. 
++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * ip - ctime|mtime updated if byte count > 0 ++ */ ++ ++/* ARGSUSED */ ++int ++zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ rlim64_t limit = uio->uio_limit; ++ ssize_t start_resid = uio->uio_resid; ++ ssize_t tx_bytes; ++ uint64_t end_size; ++ dmu_tx_t *tx; ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ zilog_t *zilog; ++ offset_t woff; ++ ssize_t n, nbytes; ++ rl_t *rl; ++ int max_blksz = zsb->z_max_blksz; ++ int error = 0; ++ arc_buf_t *abuf; ++ iovec_t *aiov = NULL; ++ xuio_t *xuio = NULL; ++ int i_iov = 0; ++ iovec_t *iovp = uio->uio_iov; ++ int write_eof; ++ int count = 0; ++ sa_bulk_attr_t bulk[4]; ++ uint64_t mtime[2], ctime[2]; ++ ASSERTV(int iovcnt = uio->uio_iovcnt); ++ ++ /* ++ * Fasttrack empty write ++ */ ++ n = start_resid; ++ if (n == 0) ++ return (0); ++ ++ if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) ++ limit = MAXOFFSET_T; ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, &ctime, 16); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zsb), NULL, &zp->z_size, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL, ++ &zp->z_pflags, 8); ++ ++ /* ++ * If immutable or not appending then return EPERM ++ */ ++ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || ++ ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && ++ (uio->uio_loffset < zp->z_size))) { ++ ZFS_EXIT(zsb); ++ return (EPERM); ++ } ++ ++ zilog = zsb->z_log; ++ ++ /* ++ * Validate file offset ++ */ ++ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; ++ if (woff < 0) { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ /* ++ * Check for mandatory locks before calling zfs_range_lock() ++ * in order to prevent a deadlock with locks set via fcntl(). ++ */ ++ if (mandatory_lock(ip) && !lock_may_write(ip, woff, n)) { ++ ZFS_EXIT(zsb); ++ return (EAGAIN); ++ } ++ ++#ifdef HAVE_UIO_ZEROCOPY ++ /* ++ * Pre-fault the pages to ensure slow (eg NFS) pages ++ * don't hold up txg. ++ * Skip this if uio contains loaned arc_buf. ++ */ ++ if ((uio->uio_extflg == UIO_XUIO) && ++ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) ++ xuio = (xuio_t *)uio; ++ else ++ uio_prefaultpages(MIN(n, max_blksz), uio); ++#endif /* HAVE_UIO_ZEROCOPY */ ++ ++ /* ++ * If in append mode, set the io offset pointer to eof. ++ */ ++ if (ioflag & FAPPEND) { ++ /* ++ * Obtain an appending range lock to guarantee file append ++ * semantics. We reset the write offset once we have the lock. ++ */ ++ rl = zfs_range_lock(zp, 0, n, RL_APPEND); ++ woff = rl->r_off; ++ if (rl->r_len == UINT64_MAX) { ++ /* ++ * We overlocked the file because this write will cause ++ * the file block size to increase. ++ * Note that zp_size cannot change with this lock held. ++ */ ++ woff = zp->z_size; ++ } ++ uio->uio_loffset = woff; ++ } else { ++ /* ++ * Note that if the file block size will change as a result of ++ * this write, then this range lock will lock the entire file ++ * so that we can re-write the block safely. ++ */ ++ rl = zfs_range_lock(zp, woff, n, RL_WRITER); ++ } ++ ++ if (woff >= limit) { ++ zfs_range_unlock(rl); ++ ZFS_EXIT(zsb); ++ return (EFBIG); ++ } ++ ++ if ((woff + n) > limit || woff > (limit - n)) ++ n = limit - woff; ++ ++ /* Will this write extend the file length? 
*/ ++ write_eof = (woff + n > zp->z_size); ++ ++ end_size = MAX(zp->z_size, woff + n); ++ ++ /* ++ * Write the file in reasonable size chunks. Each chunk is written ++ * in a separate transaction; this keeps the intent log records small ++ * and allows us to do more fine-grained space accounting. ++ */ ++ while (n > 0) { ++ abuf = NULL; ++ woff = uio->uio_loffset; ++again: ++ if (zfs_owner_overquota(zsb, zp, B_FALSE) || ++ zfs_owner_overquota(zsb, zp, B_TRUE)) { ++ if (abuf != NULL) ++ dmu_return_arcbuf(abuf); ++ error = EDQUOT; ++ break; ++ } ++ ++ if (xuio && abuf == NULL) { ++ ASSERT(i_iov < iovcnt); ++ aiov = &iovp[i_iov]; ++ abuf = dmu_xuio_arcbuf(xuio, i_iov); ++ dmu_xuio_clear(xuio, i_iov); ++ ASSERT((aiov->iov_base == abuf->b_data) || ++ ((char *)aiov->iov_base - (char *)abuf->b_data + ++ aiov->iov_len == arc_buf_size(abuf))); ++ i_iov++; ++ } else if (abuf == NULL && n >= max_blksz && ++ woff >= zp->z_size && ++ P2PHASE(woff, max_blksz) == 0 && ++ zp->z_blksz == max_blksz) { ++ /* ++ * This write covers a full block. "Borrow" a buffer ++ * from the dmu so that we can fill it before we enter ++ * a transaction. This avoids the possibility of ++ * holding up the transaction if the data copy hangs ++ * up on a pagefault (e.g., from an NFS server mapping). ++ */ ++ size_t cbytes; ++ ++ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), ++ max_blksz); ++ ASSERT(abuf != NULL); ++ ASSERT(arc_buf_size(abuf) == max_blksz); ++ if ((error = uiocopy(abuf->b_data, max_blksz, ++ UIO_WRITE, uio, &cbytes))) { ++ dmu_return_arcbuf(abuf); ++ break; ++ } ++ ASSERT(cbytes == max_blksz); ++ } ++ ++ /* ++ * Start a transaction. ++ */ ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); ++ dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); ++ zfs_sa_upgrade_txholds(tx, zp); ++ error = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (error) { ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto again; ++ } ++ dmu_tx_abort(tx); ++ if (abuf != NULL) ++ dmu_return_arcbuf(abuf); ++ break; ++ } ++ ++ /* ++ * If zfs_range_lock() over-locked we grow the blocksize ++ * and then reduce the lock range. This will only happen ++ * on the first iteration since zfs_range_reduce() will ++ * shrink down r_len to the appropriate size. ++ */ ++ if (rl->r_len == UINT64_MAX) { ++ uint64_t new_blksz; ++ ++ if (zp->z_blksz > max_blksz) { ++ ASSERT(!ISP2(zp->z_blksz)); ++ new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); ++ } else { ++ new_blksz = MIN(end_size, max_blksz); ++ } ++ zfs_grow_blocksize(zp, new_blksz, tx); ++ zfs_range_reduce(rl, woff, n); ++ } ++ ++ /* ++ * XXX - should we really limit each write to z_max_blksz? ++ * Perhaps we should use SPA_MAXBLOCKSIZE chunks? ++ */ ++ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); ++ ++ if (abuf == NULL) { ++ tx_bytes = uio->uio_resid; ++ error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), ++ uio, nbytes, tx); ++ tx_bytes -= uio->uio_resid; ++ } else { ++ tx_bytes = nbytes; ++ ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); ++ /* ++ * If this is not a full block write, but we are ++ * extending the file past EOF and this data starts ++ * block-aligned, use assign_arcbuf(). Otherwise, ++ * write via dmu_write(). 
++ */ ++ if (tx_bytes < max_blksz && (!write_eof || ++ aiov->iov_base != abuf->b_data)) { ++ ASSERT(xuio); ++ dmu_write(zsb->z_os, zp->z_id, woff, ++ aiov->iov_len, aiov->iov_base, tx); ++ dmu_return_arcbuf(abuf); ++ xuio_stat_wbuf_copied(); ++ } else { ++ ASSERT(xuio || tx_bytes == max_blksz); ++ dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), ++ woff, abuf, tx); ++ } ++ ASSERT(tx_bytes <= uio->uio_resid); ++ uioskip(uio, tx_bytes); ++ } ++ ++ if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) ++ update_pages(ip, woff, tx_bytes, zsb->z_os, zp->z_id); ++ ++ /* ++ * If we made no progress, we're done. If we made even ++ * partial progress, update the znode and ZIL accordingly. ++ */ ++ if (tx_bytes == 0) { ++ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zsb), ++ (void *)&zp->z_size, sizeof (uint64_t), tx); ++ dmu_tx_commit(tx); ++ ASSERT(error != 0); ++ break; ++ } ++ ++ /* ++ * Clear Set-UID/Set-GID bits on successful write if not ++ * privileged and at least one of the excute bits is set. ++ * ++ * It would be nice to to this after all writes have ++ * been done, but that would still expose the ISUID/ISGID ++ * to another app after the partial write is committed. ++ * ++ * Note: we don't call zfs_fuid_map_id() here because ++ * user 0 is not an ephemeral uid. ++ */ ++ mutex_enter(&zp->z_acl_lock); ++ if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | ++ (S_IXUSR >> 6))) != 0 && ++ (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && ++ secpolicy_vnode_setid_retain(cr, ++ (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { ++ uint64_t newmode; ++ zp->z_mode &= ~(S_ISUID | S_ISGID); ++ newmode = zp->z_mode; ++ (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zsb), ++ (void *)&newmode, sizeof (uint64_t), tx); ++ } ++ mutex_exit(&zp->z_acl_lock); ++ ++ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, ++ B_TRUE); ++ ++ /* ++ * Update the file size (zp_size) if it has changed; ++ * account for possible concurrent updates. ++ */ ++ while ((end_size = zp->z_size) < uio->uio_loffset) { ++ (void) atomic_cas_64(&zp->z_size, end_size, ++ uio->uio_loffset); ++ ASSERT(error == 0); ++ } ++ /* ++ * If we are replaying and eof is non zero then force ++ * the file size to the specified eof. Note, there's no ++ * concurrency during replay. ++ */ ++ if (zsb->z_replay && zsb->z_replay_eof != 0) ++ zp->z_size = zsb->z_replay_eof; ++ ++ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ++ ++ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); ++ dmu_tx_commit(tx); ++ ++ if (error != 0) ++ break; ++ ASSERT(tx_bytes == nbytes); ++ n -= nbytes; ++ ++ if (!xuio && n > 0) ++ uio_prefaultpages(MIN(n, max_blksz), uio); ++ } ++ ++ zfs_range_unlock(rl); ++ ++ /* ++ * If we're in replay mode, or we made no progress, return error. ++ * Otherwise, it's at least a partial write, so it's successful. 
++ */ ++ if (zsb->z_replay || uio->uio_resid == start_resid) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ if (ioflag & (FSYNC | FDSYNC) || ++ zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ++ zil_commit(zilog, zp->z_id); ++ ++ zfs_inode_update(zp); ++ ZFS_EXIT(zsb); ++ return (0); ++} ++EXPORT_SYMBOL(zfs_write); ++ ++static void ++iput_async(struct inode *ip, taskq_t *taskq) ++{ ++ ASSERT(atomic_read(&ip->i_count) > 0); ++ if (atomic_read(&ip->i_count) == 1) ++ taskq_dispatch(taskq, (task_func_t *)iput, ip, TQ_PUSHPAGE); ++ else ++ iput(ip); ++} ++ ++void ++zfs_get_done(zgd_t *zgd, int error) ++{ ++ znode_t *zp = zgd->zgd_private; ++ objset_t *os = ZTOZSB(zp)->z_os; ++ ++ if (zgd->zgd_db) ++ dmu_buf_rele(zgd->zgd_db, zgd); ++ ++ zfs_range_unlock(zgd->zgd_rl); ++ ++ /* ++ * Release the vnode asynchronously as we currently have the ++ * txg stopped from syncing. ++ */ ++ iput_async(ZTOI(zp), dsl_pool_iput_taskq(dmu_objset_pool(os))); ++ ++ if (error == 0 && zgd->zgd_bp) ++ zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); ++ ++ kmem_free(zgd, sizeof (zgd_t)); ++} ++ ++#ifdef DEBUG ++static int zil_fault_io = 0; ++#endif ++ ++/* ++ * Get data to generate a TX_WRITE intent log record. ++ */ ++int ++zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) ++{ ++ zfs_sb_t *zsb = arg; ++ objset_t *os = zsb->z_os; ++ znode_t *zp; ++ uint64_t object = lr->lr_foid; ++ uint64_t offset = lr->lr_offset; ++ uint64_t size = lr->lr_length; ++ blkptr_t *bp = &lr->lr_blkptr; ++ dmu_buf_t *db; ++ zgd_t *zgd; ++ int error = 0; ++ ++ ASSERT(zio != NULL); ++ ASSERT(size != 0); ++ ++ /* ++ * Nothing to do if the file has been removed ++ */ ++ if (zfs_zget(zsb, object, &zp) != 0) ++ return (ENOENT); ++ if (zp->z_unlinked) { ++ /* ++ * Release the vnode asynchronously as we currently have the ++ * txg stopped from syncing. ++ */ ++ iput_async(ZTOI(zp), dsl_pool_iput_taskq(dmu_objset_pool(os))); ++ return (ENOENT); ++ } ++ ++ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_PUSHPAGE); ++ zgd->zgd_zilog = zsb->z_log; ++ zgd->zgd_private = zp; ++ ++ /* ++ * Write records come in two flavors: immediate and indirect. ++ * For small writes it's cheaper to store the data with the ++ * log record (immediate); for large writes it's cheaper to ++ * sync the data and get a pointer to it (indirect) so that ++ * we don't have to write the data twice. ++ */ ++ if (buf != NULL) { /* immediate write */ ++ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); ++ /* test for truncation needs to be done while range locked */ ++ if (offset >= zp->z_size) { ++ error = ENOENT; ++ } else { ++ error = dmu_read(os, object, offset, size, buf, ++ DMU_READ_NO_PREFETCH); ++ } ++ ASSERT(error == 0 || error == ENOENT); ++ } else { /* indirect write */ ++ /* ++ * Have to lock the whole block to ensure when it's ++ * written out and it's checksum is being calculated ++ * that no one can change the data. We need to re-check ++ * blocksize after we get the lock in case it's changed! ++ */ ++ for (;;) { ++ uint64_t blkoff; ++ size = zp->z_blksz; ++ blkoff = ISP2(size) ? 
P2PHASE(offset, size) : offset; ++ offset -= blkoff; ++ zgd->zgd_rl = zfs_range_lock(zp, offset, size, ++ RL_READER); ++ if (zp->z_blksz == size) ++ break; ++ offset += blkoff; ++ zfs_range_unlock(zgd->zgd_rl); ++ } ++ /* test for truncation needs to be done while range locked */ ++ if (lr->lr_offset >= zp->z_size) ++ error = ENOENT; ++#ifdef DEBUG ++ if (zil_fault_io) { ++ error = EIO; ++ zil_fault_io = 0; ++ } ++#endif ++ if (error == 0) ++ error = dmu_buf_hold(os, object, offset, zgd, &db, ++ DMU_READ_NO_PREFETCH); ++ ++ if (error == 0) { ++ zgd->zgd_db = db; ++ zgd->zgd_bp = bp; ++ ++ ASSERT(db->db_offset == offset); ++ ASSERT(db->db_size == size); ++ ++ error = dmu_sync(zio, lr->lr_common.lrc_txg, ++ zfs_get_done, zgd); ++ ASSERT(error || lr->lr_length <= zp->z_blksz); ++ ++ /* ++ * On success, we need to wait for the write I/O ++ * initiated by dmu_sync() to complete before we can ++ * release this dbuf. We will finish everything up ++ * in the zfs_get_done() callback. ++ */ ++ if (error == 0) ++ return (0); ++ ++ if (error == EALREADY) { ++ lr->lr_common.lrc_txtype = TX_WRITE2; ++ error = 0; ++ } ++ } ++ } ++ ++ zfs_get_done(zgd, error); ++ ++ return (error); ++} ++ ++/*ARGSUSED*/ ++int ++zfs_access(struct inode *ip, int mode, int flag, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ int error; ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ if (flag & V_ACE_MASK) ++ error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); ++ else ++ error = zfs_zaccess_rwx(zp, mode, flag, cr); ++ ++ ZFS_EXIT(zsb); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_access); ++ ++/* ++ * Lookup an entry in a directory, or an extended attribute directory. ++ * If it exists, return a held inode reference for it. ++ * ++ * IN: dip - inode of directory to search. ++ * nm - name of entry to lookup. ++ * flags - LOOKUP_XATTR set if looking for an attribute. ++ * cr - credentials of caller. ++ * direntflags - directory lookup flags ++ * realpnp - returned pathname. ++ * ++ * OUT: ipp - inode of located entry, NULL if not found. ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * NA ++ */ ++/* ARGSUSED */ ++int ++zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, ++ cred_t *cr, int *direntflags, pathname_t *realpnp) ++{ ++ znode_t *zdp = ITOZ(dip); ++ zfs_sb_t *zsb = ITOZSB(dip); ++ int error = 0; ++ ++ /* fast path */ ++ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { ++ ++ if (!S_ISDIR(dip->i_mode)) { ++ return (ENOTDIR); ++ } else if (zdp->z_sa_hdl == NULL) { ++ return (EIO); ++ } ++ ++ if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { ++ error = zfs_fastaccesschk_execute(zdp, cr); ++ if (!error) { ++ *ipp = dip; ++ igrab(*ipp); ++ return (0); ++ } ++ return (error); ++#ifdef HAVE_DNLC ++ } else { ++ vnode_t *tvp = dnlc_lookup(dvp, nm); ++ ++ if (tvp) { ++ error = zfs_fastaccesschk_execute(zdp, cr); ++ if (error) { ++ iput(tvp); ++ return (error); ++ } ++ if (tvp == DNLC_NO_VNODE) { ++ iput(tvp); ++ return (ENOENT); ++ } else { ++ *vpp = tvp; ++ return (specvp_check(vpp, cr)); ++ } ++ } ++#endif /* HAVE_DNLC */ ++ } ++ } ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zdp); ++ ++ *ipp = NULL; ++ ++ if (flags & LOOKUP_XATTR) { ++ /* ++ * We don't allow recursive attributes.. ++ * Maybe someday we will. 
++ */ ++ if (zdp->z_pflags & ZFS_XATTR) { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ if ((error = zfs_get_xattrdir(zdp, ipp, cr, flags))) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ /* ++ * Do we have permission to get into attribute directory? ++ */ ++ ++ if ((error = zfs_zaccess(ITOZ(*ipp), ACE_EXECUTE, 0, ++ B_FALSE, cr))) { ++ iput(*ipp); ++ *ipp = NULL; ++ } ++ ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ if (!S_ISDIR(dip->i_mode)) { ++ ZFS_EXIT(zsb); ++ return (ENOTDIR); ++ } ++ ++ /* ++ * Check accessibility of directory. ++ */ ++ ++ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ if (zsb->z_utf8 && u8_validate(nm, strlen(nm), ++ NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ++ ZFS_EXIT(zsb); ++ return (EILSEQ); ++ } ++ ++ error = zfs_dirlook(zdp, nm, ipp, flags, direntflags, realpnp); ++ if ((error == 0) && (*ipp)) ++ zfs_inode_update(ITOZ(*ipp)); ++ ++ ZFS_EXIT(zsb); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_lookup); ++ ++/* ++ * Attempt to create a new entry in a directory. If the entry ++ * already exists, truncate the file if permissible, else return ++ * an error. Return the ip of the created or trunc'd file. ++ * ++ * IN: dip - inode of directory to put new file entry in. ++ * name - name of new file entry. ++ * vap - attributes of new file. ++ * excl - flag indicating exclusive or non-exclusive mode. ++ * mode - mode to open file with. ++ * cr - credentials of caller. ++ * flag - large file flag [UNUSED]. ++ * vsecp - ACL to be set ++ * ++ * OUT: ipp - inode of created or trunc'd entry. ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * dip - ctime|mtime updated if new entry created ++ * ip - ctime|mtime always, atime if new ++ */ ++ ++/* ARGSUSED */ ++int ++zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, ++ int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) ++{ ++ znode_t *zp, *dzp = ITOZ(dip); ++ zfs_sb_t *zsb = ITOZSB(dip); ++ zilog_t *zilog; ++ objset_t *os; ++ zfs_dirlock_t *dl; ++ dmu_tx_t *tx; ++ int error; ++ uid_t uid; ++ gid_t gid; ++ zfs_acl_ids_t acl_ids; ++ boolean_t fuid_dirtied; ++ boolean_t have_acl = B_FALSE; ++ ++ /* ++ * If we have an ephemeral id, ACL, or XVATTR then ++ * make sure file system is at proper version ++ */ ++ ++ gid = crgetgid(cr); ++ uid = crgetuid(cr); ++ ++ if (zsb->z_use_fuids == B_FALSE && ++ (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) ++ return (EINVAL); ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(dzp); ++ os = zsb->z_os; ++ zilog = zsb->z_log; ++ ++ if (zsb->z_utf8 && u8_validate(name, strlen(name), ++ NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ++ ZFS_EXIT(zsb); ++ return (EILSEQ); ++ } ++ ++ if (vap->va_mask & ATTR_XVATTR) { ++ if ((error = secpolicy_xvattr((xvattr_t *)vap, ++ crgetuid(cr), cr, vap->va_mode)) != 0) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ } ++ ++top: ++ *ipp = NULL; ++ if (*name == '\0') { ++ /* ++ * Null component name refers to the directory itself. 
++ */ ++ igrab(dip); ++ zp = dzp; ++ dl = NULL; ++ error = 0; ++ } else { ++ /* possible igrab(zp) */ ++ int zflg = 0; ++ ++ if (flag & FIGNORECASE) ++ zflg |= ZCILOOK; ++ ++ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, ++ NULL, NULL); ++ if (error) { ++ if (have_acl) ++ zfs_acl_ids_free(&acl_ids); ++ if (strcmp(name, "..") == 0) ++ error = EISDIR; ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ } ++ ++ if (zp == NULL) { ++ uint64_t txtype; ++ ++ /* ++ * Create a new file object and update the directory ++ * to reference it. ++ */ ++ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { ++ if (have_acl) ++ zfs_acl_ids_free(&acl_ids); ++ goto out; ++ } ++ ++ /* ++ * We only support the creation of regular files in ++ * extended attribute directories. ++ */ ++ ++ if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { ++ if (have_acl) ++ zfs_acl_ids_free(&acl_ids); ++ error = EINVAL; ++ goto out; ++ } ++ ++ if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, ++ cr, vsecp, &acl_ids)) != 0) ++ goto out; ++ have_acl = B_TRUE; ++ ++ if (zfs_acl_ids_overquota(zsb, &acl_ids)) { ++ zfs_acl_ids_free(&acl_ids); ++ error = EDQUOT; ++ goto out; ++ } ++ ++ tx = dmu_tx_create(os); ++ ++ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ++ ZFS_SA_BASE_ATTR_SIZE); ++ ++ fuid_dirtied = zsb->z_fuid_dirty; ++ if (fuid_dirtied) ++ zfs_fuid_txhold(zsb, tx); ++ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); ++ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); ++ if (!zsb->z_use_sa && ++ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { ++ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, ++ 0, acl_ids.z_aclp->z_acl_bytes); ++ } ++ error = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (error) { ++ zfs_dirent_unlock(dl); ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto top; ++ } ++ zfs_acl_ids_free(&acl_ids); ++ dmu_tx_abort(tx); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); ++ ++ if (fuid_dirtied) ++ zfs_fuid_sync(zsb, tx); ++ ++ (void) zfs_link_create(dl, zp, tx, ZNEW); ++ txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); ++ if (flag & FIGNORECASE) ++ txtype |= TX_CI; ++ zfs_log_create(zilog, tx, txtype, dzp, zp, name, ++ vsecp, acl_ids.z_fuidp, vap); ++ zfs_acl_ids_free(&acl_ids); ++ dmu_tx_commit(tx); ++ } else { ++ int aflags = (flag & FAPPEND) ? V_APPEND : 0; ++ ++ if (have_acl) ++ zfs_acl_ids_free(&acl_ids); ++ have_acl = B_FALSE; ++ ++ /* ++ * A directory entry already exists for this name. ++ */ ++ /* ++ * Can't truncate an existing file if in exclusive mode. ++ */ ++ if (excl) { ++ error = EEXIST; ++ goto out; ++ } ++ /* ++ * Can't open a directory for writing. ++ */ ++ if (S_ISDIR(ZTOI(zp)->i_mode)) { ++ error = EISDIR; ++ goto out; ++ } ++ /* ++ * Verify requested access to file. ++ */ ++ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { ++ goto out; ++ } ++ ++ mutex_enter(&dzp->z_lock); ++ dzp->z_seq++; ++ mutex_exit(&dzp->z_lock); ++ ++ /* ++ * Truncate regular files if requested. 
++ */ ++ if (S_ISREG(ZTOI(zp)->i_mode) && ++ (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { ++ /* we can't hold any locks when calling zfs_freesp() */ ++ zfs_dirent_unlock(dl); ++ dl = NULL; ++ error = zfs_freesp(zp, 0, 0, mode, TRUE); ++ } ++ } ++out: ++ ++ if (dl) ++ zfs_dirent_unlock(dl); ++ ++ if (error) { ++ if (zp) ++ iput(ZTOI(zp)); ++ } else { ++ zfs_inode_update(dzp); ++ zfs_inode_update(zp); ++ *ipp = ZTOI(zp); ++ } ++ ++ if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ++ zil_commit(zilog, 0); ++ ++ ZFS_EXIT(zsb); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_create); ++ ++/* ++ * Remove an entry from a directory. ++ * ++ * IN: dip - inode of directory to remove entry from. ++ * name - name of entry to remove. ++ * cr - credentials of caller. ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * dip - ctime|mtime ++ * ip - ctime (if nlink > 0) ++ */ ++ ++uint64_t null_xattr = 0; ++ ++/*ARGSUSED*/ ++int ++zfs_remove(struct inode *dip, char *name, cred_t *cr) ++{ ++ znode_t *zp, *dzp = ITOZ(dip); ++ znode_t *xzp; ++ struct inode *ip; ++ zfs_sb_t *zsb = ITOZSB(dip); ++ zilog_t *zilog; ++ uint64_t xattr_obj; ++ uint64_t xattr_obj_unlinked = 0; ++ uint64_t obj = 0; ++ zfs_dirlock_t *dl; ++ dmu_tx_t *tx; ++ boolean_t unlinked; ++ uint64_t txtype; ++ pathname_t *realnmp = NULL; ++#ifdef HAVE_PN_UTILS ++ pathname_t realnm; ++#endif /* HAVE_PN_UTILS */ ++ int error; ++ int zflg = ZEXISTS; ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(dzp); ++ zilog = zsb->z_log; ++ ++#ifdef HAVE_PN_UTILS ++ if (flags & FIGNORECASE) { ++ zflg |= ZCILOOK; ++ pn_alloc(&realnm); ++ realnmp = &realnm; ++ } ++#endif /* HAVE_PN_UTILS */ ++ ++top: ++ xattr_obj = 0; ++ xzp = NULL; ++ /* ++ * Attempt to lock directory; fail if entry doesn't exist. ++ */ ++ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, ++ NULL, realnmp))) { ++#ifdef HAVE_PN_UTILS ++ if (realnmp) ++ pn_free(realnmp); ++#endif /* HAVE_PN_UTILS */ ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ ip = ZTOI(zp); ++ ++ if ((error = zfs_zaccess_delete(dzp, zp, cr))) { ++ goto out; ++ } ++ ++ /* ++ * Need to use rmdir for removing directories. ++ */ ++ if (S_ISDIR(ip->i_mode)) { ++ error = EPERM; ++ goto out; ++ } ++ ++#ifdef HAVE_DNLC ++ if (realnmp) ++ dnlc_remove(dvp, realnmp->pn_buf); ++ else ++ dnlc_remove(dvp, name); ++#endif /* HAVE_DNLC */ ++ ++ /* ++ * We never delete the znode and always place it in the unlinked ++ * set. The dentry cache will always hold the last reference and ++ * is responsible for safely freeing the znode. ++ */ ++ obj = zp->z_id; ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); ++ zfs_sa_upgrade_txholds(tx, zp); ++ zfs_sa_upgrade_txholds(tx, dzp); ++ ++ /* are there any extended attributes? 
*/ ++ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb), ++ &xattr_obj, sizeof (xattr_obj)); ++ if (error == 0 && xattr_obj) { ++ error = zfs_zget(zsb, xattr_obj, &xzp); ++ ASSERT3U(error, ==, 0); ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); ++ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); ++ } ++ ++ /* charge as an update -- would be nice not to charge at all */ ++ dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL); ++ ++ error = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (error) { ++ zfs_dirent_unlock(dl); ++ iput(ip); ++ if (xzp) ++ iput(ZTOI(xzp)); ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto top; ++ } ++#ifdef HAVE_PN_UTILS ++ if (realnmp) ++ pn_free(realnmp); ++#endif /* HAVE_PN_UTILS */ ++ dmu_tx_abort(tx); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ /* ++ * Remove the directory entry. ++ */ ++ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); ++ ++ if (error) { ++ dmu_tx_commit(tx); ++ goto out; ++ } ++ ++ if (unlinked) { ++ /* ++ * Hold z_lock so that we can make sure that the ACL obj ++ * hasn't changed. Could have been deleted due to ++ * zfs_sa_upgrade(). ++ */ ++ mutex_enter(&zp->z_lock); ++ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb), ++ &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); ++ mutex_exit(&zp->z_lock); ++ zfs_unlinked_add(zp, tx); ++ } ++ ++ txtype = TX_REMOVE; ++#ifdef HAVE_PN_UTILS ++ if (flags & FIGNORECASE) ++ txtype |= TX_CI; ++#endif /* HAVE_PN_UTILS */ ++ zfs_log_remove(zilog, tx, txtype, dzp, name, obj); ++ ++ dmu_tx_commit(tx); ++out: ++#ifdef HAVE_PN_UTILS ++ if (realnmp) ++ pn_free(realnmp); ++#endif /* HAVE_PN_UTILS */ ++ ++ zfs_dirent_unlock(dl); ++ zfs_inode_update(dzp); ++ zfs_inode_update(zp); ++ if (xzp) ++ zfs_inode_update(xzp); ++ ++ iput(ip); ++ if (xzp) ++ iput(ZTOI(xzp)); ++ ++ if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ++ zil_commit(zilog, 0); ++ ++ ZFS_EXIT(zsb); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_remove); ++ ++/* ++ * Create a new directory and insert it into dip using the name ++ * provided. Return a pointer to the inserted directory. ++ * ++ * IN: dip - inode of directory to add subdir to. ++ * dirname - name of new directory. ++ * vap - attributes of new directory. ++ * cr - credentials of caller. ++ * vsecp - ACL to be set ++ * ++ * OUT: ipp - inode of created directory. 
++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * dip - ctime|mtime updated ++ * ipp - ctime|mtime|atime updated ++ */ ++/*ARGSUSED*/ ++int ++zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, ++ cred_t *cr, int flags, vsecattr_t *vsecp) ++{ ++ znode_t *zp, *dzp = ITOZ(dip); ++ zfs_sb_t *zsb = ITOZSB(dip); ++ zilog_t *zilog; ++ zfs_dirlock_t *dl; ++ uint64_t txtype; ++ dmu_tx_t *tx; ++ int error; ++ int zf = ZNEW; ++ uid_t uid; ++ gid_t gid = crgetgid(cr); ++ zfs_acl_ids_t acl_ids; ++ boolean_t fuid_dirtied; ++ ++ ASSERT(S_ISDIR(vap->va_mode)); ++ ++ /* ++ * If we have an ephemeral id, ACL, or XVATTR then ++ * make sure file system is at proper version ++ */ ++ ++ uid = crgetuid(cr); ++ if (zsb->z_use_fuids == B_FALSE && ++ (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) ++ return (EINVAL); ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(dzp); ++ zilog = zsb->z_log; ++ ++ if (dzp->z_pflags & ZFS_XATTR) { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ if (zsb->z_utf8 && u8_validate(dirname, ++ strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ++ ZFS_EXIT(zsb); ++ return (EILSEQ); ++ } ++ if (flags & FIGNORECASE) ++ zf |= ZCILOOK; ++ ++ if (vap->va_mask & ATTR_XVATTR) { ++ if ((error = secpolicy_xvattr((xvattr_t *)vap, ++ crgetuid(cr), cr, vap->va_mode)) != 0) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ } ++ ++ if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, ++ vsecp, &acl_ids)) != 0) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ /* ++ * First make sure the new directory doesn't exist. ++ * ++ * Existence is checked first to make sure we don't return ++ * EACCES instead of EEXIST which can cause some applications ++ * to fail. ++ */ ++top: ++ *ipp = NULL; ++ ++ if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, ++ NULL, NULL))) { ++ zfs_acl_ids_free(&acl_ids); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { ++ zfs_acl_ids_free(&acl_ids); ++ zfs_dirent_unlock(dl); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ if (zfs_acl_ids_overquota(zsb, &acl_ids)) { ++ zfs_acl_ids_free(&acl_ids); ++ zfs_dirent_unlock(dl); ++ ZFS_EXIT(zsb); ++ return (EDQUOT); ++ } ++ ++ /* ++ * Add a new entry to the directory. ++ */ ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); ++ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); ++ fuid_dirtied = zsb->z_fuid_dirty; ++ if (fuid_dirtied) ++ zfs_fuid_txhold(zsb, tx); ++ if (!zsb->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { ++ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ++ acl_ids.z_aclp->z_acl_bytes); ++ } ++ ++ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ++ ZFS_SA_BASE_ATTR_SIZE); ++ ++ error = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (error) { ++ zfs_dirent_unlock(dl); ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto top; ++ } ++ zfs_acl_ids_free(&acl_ids); ++ dmu_tx_abort(tx); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ /* ++ * Create new node. ++ */ ++ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); ++ ++ if (fuid_dirtied) ++ zfs_fuid_sync(zsb, tx); ++ ++ /* ++ * Now put new name in parent dir. 
++ */ ++ (void) zfs_link_create(dl, zp, tx, ZNEW); ++ ++ *ipp = ZTOI(zp); ++ ++ txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); ++ if (flags & FIGNORECASE) ++ txtype |= TX_CI; ++ zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, ++ acl_ids.z_fuidp, vap); ++ ++ zfs_acl_ids_free(&acl_ids); ++ ++ dmu_tx_commit(tx); ++ ++ zfs_dirent_unlock(dl); ++ ++ if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ++ zil_commit(zilog, 0); ++ ++ zfs_inode_update(dzp); ++ zfs_inode_update(zp); ++ ZFS_EXIT(zsb); ++ return (0); ++} ++EXPORT_SYMBOL(zfs_mkdir); ++ ++/* ++ * Remove a directory subdir entry. If the current working ++ * directory is the same as the subdir to be removed, the ++ * remove will fail. ++ * ++ * IN: dip - inode of directory to remove from. ++ * name - name of directory to be removed. ++ * cwd - inode of current working directory. ++ * cr - credentials of caller. ++ * flags - case flags ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * dip - ctime|mtime updated ++ */ ++/*ARGSUSED*/ ++int ++zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr, ++ int flags) ++{ ++ znode_t *dzp = ITOZ(dip); ++ znode_t *zp; ++ struct inode *ip; ++ zfs_sb_t *zsb = ITOZSB(dip); ++ zilog_t *zilog; ++ zfs_dirlock_t *dl; ++ dmu_tx_t *tx; ++ int error; ++ int zflg = ZEXISTS; ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(dzp); ++ zilog = zsb->z_log; ++ ++ if (flags & FIGNORECASE) ++ zflg |= ZCILOOK; ++top: ++ zp = NULL; ++ ++ /* ++ * Attempt to lock directory; fail if entry doesn't exist. ++ */ ++ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, ++ NULL, NULL))) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ ip = ZTOI(zp); ++ ++ if ((error = zfs_zaccess_delete(dzp, zp, cr))) { ++ goto out; ++ } ++ ++ if (!S_ISDIR(ip->i_mode)) { ++ error = ENOTDIR; ++ goto out; ++ } ++ ++ if (ip == cwd) { ++ error = EINVAL; ++ goto out; ++ } ++ ++ /* ++ * Grab a lock on the directory to make sure that noone is ++ * trying to add (or lookup) entries while we are removing it. ++ */ ++ rw_enter(&zp->z_name_lock, RW_WRITER); ++ ++ /* ++ * Grab a lock on the parent pointer to make sure we play well ++ * with the treewalk and directory rename code. ++ */ ++ rw_enter(&zp->z_parent_lock, RW_WRITER); ++ ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); ++ dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL); ++ zfs_sa_upgrade_txholds(tx, zp); ++ zfs_sa_upgrade_txholds(tx, dzp); ++ error = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (error) { ++ rw_exit(&zp->z_parent_lock); ++ rw_exit(&zp->z_name_lock); ++ zfs_dirent_unlock(dl); ++ iput(ip); ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto top; ++ } ++ dmu_tx_abort(tx); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ error = zfs_link_destroy(dl, zp, tx, zflg, NULL); ++ ++ if (error == 0) { ++ uint64_t txtype = TX_RMDIR; ++ if (flags & FIGNORECASE) ++ txtype |= TX_CI; ++ zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); ++ } ++ ++ dmu_tx_commit(tx); ++ ++ rw_exit(&zp->z_parent_lock); ++ rw_exit(&zp->z_name_lock); ++out: ++ zfs_dirent_unlock(dl); ++ ++ zfs_inode_update(dzp); ++ zfs_inode_update(zp); ++ iput(ip); ++ ++ if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ++ zil_commit(zilog, 0); ++ ++ ZFS_EXIT(zsb); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_rmdir); ++ ++/* ++ * Read as many directory entries as will fit into the provided ++ * dirent buffer from the given directory cursor position. 
++ * ++ * IN: ip - inode of directory to read. ++ * dirent - buffer for directory entries. ++ * ++ * OUT: dirent - filler buffer of directory entries. ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * ip - atime updated ++ * ++ * Note that the low 4 bits of the cookie returned by zap is always zero. ++ * This allows us to use the low range for "special" directory entries: ++ * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, ++ * we use the offset 2 for the '.zfs' directory. ++ */ ++/* ARGSUSED */ ++int ++zfs_readdir(struct inode *ip, void *dirent, filldir_t filldir, ++ loff_t *pos, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ objset_t *os; ++ zap_cursor_t zc; ++ zap_attribute_t zap; ++ int outcount; ++ int error; ++ uint8_t prefetch; ++ int done = 0; ++ uint64_t parent; ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zsb), ++ &parent, sizeof (parent))) != 0) ++ goto out; ++ ++ /* ++ * Quit if directory has been removed (posix) ++ */ ++ error = 0; ++ if (zp->z_unlinked) ++ goto out; ++ ++ os = zsb->z_os; ++ prefetch = zp->z_zn_prefetch; ++ ++ /* ++ * Initialize the iterator cursor. ++ */ ++ if (*pos <= 3) { ++ /* ++ * Start iteration from the beginning of the directory. ++ */ ++ zap_cursor_init(&zc, os, zp->z_id); ++ } else { ++ /* ++ * The offset is a serialized cursor. ++ */ ++ zap_cursor_init_serialized(&zc, os, zp->z_id, *pos); ++ } ++ ++ /* ++ * Transform to file-system independent format ++ */ ++ outcount = 0; ++ ++ while (!done) { ++ uint64_t objnum; ++ /* ++ * Special case `.', `..', and `.zfs'. ++ */ ++ if (*pos == 0) { ++ (void) strcpy(zap.za_name, "."); ++ zap.za_normalization_conflict = 0; ++ objnum = zp->z_id; ++ } else if (*pos == 1) { ++ (void) strcpy(zap.za_name, ".."); ++ zap.za_normalization_conflict = 0; ++ objnum = parent; ++ } else if (*pos == 2 && zfs_show_ctldir(zp)) { ++ (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); ++ zap.za_normalization_conflict = 0; ++ objnum = ZFSCTL_INO_ROOT; ++ } else { ++ /* ++ * Grab next entry. ++ */ ++ if ((error = zap_cursor_retrieve(&zc, &zap))) { ++ if (error == ENOENT) ++ break; ++ else ++ goto update; ++ } ++ ++ /* ++ * Allow multiple entries provided the first entry is ++ * the object id. Non-zpl consumers may safely make ++ * use of the additional space. 
++ * ++ * XXX: This should be a feature flag for compatibility ++ */ ++ if (zap.za_integer_length != 8 || ++ zap.za_num_integers == 0) { ++ cmn_err(CE_WARN, "zap_readdir: bad directory " ++ "entry, obj = %lld, offset = %lld, " ++ "length = %d, num = %lld\n", ++ (u_longlong_t)zp->z_id, ++ (u_longlong_t)*pos, ++ zap.za_integer_length, ++ (u_longlong_t)zap.za_num_integers); ++ error = ENXIO; ++ goto update; ++ } ++ ++ objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); ++ } ++ done = filldir(dirent, zap.za_name, strlen(zap.za_name), ++ zap_cursor_serialize(&zc), objnum, 0); ++ if (done) { ++ break; ++ } ++ ++ /* Prefetch znode */ ++ if (prefetch) { ++ dmu_prefetch(os, objnum, 0, 0); ++ } ++ ++ if (*pos > 2 || (*pos == 2 && !zfs_show_ctldir(zp))) { ++ zap_cursor_advance(&zc); ++ *pos = zap_cursor_serialize(&zc); ++ } else { ++ (*pos)++; ++ } ++ } ++ zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ ++ ++update: ++ zap_cursor_fini(&zc); ++ if (error == ENOENT) ++ error = 0; ++ ++ ZFS_ACCESSTIME_STAMP(zsb, zp); ++ zfs_inode_update(zp); ++ ++out: ++ ZFS_EXIT(zsb); ++ ++ return (error); ++} ++EXPORT_SYMBOL(zfs_readdir); ++ ++ulong_t zfs_fsync_sync_cnt = 4; ++ ++int ++zfs_fsync(struct inode *ip, int syncflag, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ ++ (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); ++ ++ if (zsb->z_os->os_sync != ZFS_SYNC_DISABLED) { ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ zil_commit(zsb->z_log, zp->z_id); ++ ZFS_EXIT(zsb); ++ } ++ return (0); ++} ++EXPORT_SYMBOL(zfs_fsync); ++ ++ ++/* ++ * Get the requested file attributes and place them in the provided ++ * vattr structure. ++ * ++ * IN: ip - inode of file. ++ * vap - va_mask identifies requested attributes. ++ * If ATTR_XVATTR set, then optional attrs are requested ++ * flags - ATTR_NOACLCHECK (CIFS server context) ++ * cr - credentials of caller. ++ * ++ * OUT: vap - attribute values. ++ * ++ * RETURN: 0 (always succeeds) ++ */ ++/* ARGSUSED */ ++int ++zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ int error = 0; ++ uint64_t links; ++ uint64_t mtime[2], ctime[2]; ++ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ ++ xoptattr_t *xoap = NULL; ++ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ++ sa_bulk_attr_t bulk[2]; ++ int count = 0; ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); ++ ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, &ctime, 16); ++ ++ if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ /* ++ * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. ++ * Also, if we are the owner don't bother, since owner should ++ * always be allowed to read basic attributes of file. ++ */ ++ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && ++ (vap->va_uid != crgetuid(cr))) { ++ if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, ++ skipaclchk, cr))) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ } ++ ++ /* ++ * Return all attributes. It's cheaper to provide the answer ++ * than to determine whether we were asked the question. 
++ */ ++ ++ mutex_enter(&zp->z_lock); ++ vap->va_type = vn_mode_to_vtype(zp->z_mode); ++ vap->va_mode = zp->z_mode; ++ vap->va_fsid = ZTOI(zp)->i_sb->s_dev; ++ vap->va_nodeid = zp->z_id; ++ if ((zp->z_id == zsb->z_root) && zfs_show_ctldir(zp)) ++ links = zp->z_links + 1; ++ else ++ links = zp->z_links; ++ vap->va_nlink = MIN(links, ZFS_LINK_MAX); ++ vap->va_size = i_size_read(ip); ++ vap->va_rdev = ip->i_rdev; ++ vap->va_seq = ip->i_generation; ++ ++ /* ++ * Add in any requested optional attributes and the create time. ++ * Also set the corresponding bits in the returned attribute bitmap. ++ */ ++ if ((xoap = xva_getxoptattr(xvap)) != NULL && zsb->z_use_fuids) { ++ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ++ xoap->xoa_archive = ++ ((zp->z_pflags & ZFS_ARCHIVE) != 0); ++ XVA_SET_RTN(xvap, XAT_ARCHIVE); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ++ xoap->xoa_readonly = ++ ((zp->z_pflags & ZFS_READONLY) != 0); ++ XVA_SET_RTN(xvap, XAT_READONLY); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ++ xoap->xoa_system = ++ ((zp->z_pflags & ZFS_SYSTEM) != 0); ++ XVA_SET_RTN(xvap, XAT_SYSTEM); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ++ xoap->xoa_hidden = ++ ((zp->z_pflags & ZFS_HIDDEN) != 0); ++ XVA_SET_RTN(xvap, XAT_HIDDEN); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ++ xoap->xoa_nounlink = ++ ((zp->z_pflags & ZFS_NOUNLINK) != 0); ++ XVA_SET_RTN(xvap, XAT_NOUNLINK); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ++ xoap->xoa_immutable = ++ ((zp->z_pflags & ZFS_IMMUTABLE) != 0); ++ XVA_SET_RTN(xvap, XAT_IMMUTABLE); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ++ xoap->xoa_appendonly = ++ ((zp->z_pflags & ZFS_APPENDONLY) != 0); ++ XVA_SET_RTN(xvap, XAT_APPENDONLY); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ++ xoap->xoa_nodump = ++ ((zp->z_pflags & ZFS_NODUMP) != 0); ++ XVA_SET_RTN(xvap, XAT_NODUMP); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ++ xoap->xoa_opaque = ++ ((zp->z_pflags & ZFS_OPAQUE) != 0); ++ XVA_SET_RTN(xvap, XAT_OPAQUE); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ++ xoap->xoa_av_quarantined = ++ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); ++ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ++ xoap->xoa_av_modified = ++ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); ++ XVA_SET_RTN(xvap, XAT_AV_MODIFIED); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && ++ S_ISREG(ip->i_mode)) { ++ zfs_sa_get_scanstamp(zp, xvap); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { ++ uint64_t times[2]; ++ ++ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zsb), ++ times, sizeof (times)); ++ ZFS_TIME_DECODE(&xoap->xoa_createtime, times); ++ XVA_SET_RTN(xvap, XAT_CREATETIME); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ++ xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); ++ XVA_SET_RTN(xvap, XAT_REPARSE); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_GEN)) { ++ xoap->xoa_generation = zp->z_gen; ++ XVA_SET_RTN(xvap, XAT_GEN); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ++ xoap->xoa_offline = ++ ((zp->z_pflags & ZFS_OFFLINE) != 0); ++ XVA_SET_RTN(xvap, XAT_OFFLINE); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ++ xoap->xoa_sparse = ++ ((zp->z_pflags & ZFS_SPARSE) != 0); ++ XVA_SET_RTN(xvap, XAT_SPARSE); ++ } ++ } ++ ++ ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ++ ZFS_TIME_DECODE(&vap->va_mtime, mtime); ++ ZFS_TIME_DECODE(&vap->va_ctime, ctime); ++ ++ mutex_exit(&zp->z_lock); ++ ++ sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks); ++ ++ if (zp->z_blksz == 0) 
{ ++ /* ++ * Block size hasn't been set; suggest maximal I/O transfers. ++ */ ++ vap->va_blksize = zsb->z_max_blksz; ++ } ++ ++ ZFS_EXIT(zsb); ++ return (0); ++} ++EXPORT_SYMBOL(zfs_getattr); ++ ++/* ++ * Get the basic file attributes and place them in the provided kstat ++ * structure. The inode is assumed to be the authoritative source ++ * for most of the attributes. However, the znode currently has the ++ * authoritative atime, blksize, and block count. ++ * ++ * IN: ip - inode of file. ++ * ++ * OUT: sp - kstat values. ++ * ++ * RETURN: 0 (always succeeds) ++ */ ++/* ARGSUSED */ ++int ++zfs_getattr_fast(struct inode *ip, struct kstat *sp) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ mutex_enter(&zp->z_lock); ++ ++ generic_fillattr(ip, sp); ++ ZFS_TIME_DECODE(&sp->atime, zp->z_atime); ++ ++ sa_object_size(zp->z_sa_hdl, (uint32_t *)&sp->blksize, &sp->blocks); ++ if (unlikely(zp->z_blksz == 0)) { ++ /* ++ * Block size hasn't been set; suggest maximal I/O transfers. ++ */ ++ sp->blksize = zsb->z_max_blksz; ++ } ++ ++ mutex_exit(&zp->z_lock); ++ ++ ZFS_EXIT(zsb); ++ ++ return (0); ++} ++EXPORT_SYMBOL(zfs_getattr_fast); ++ ++/* ++ * Set the file attributes to the values contained in the ++ * vattr structure. ++ * ++ * IN: ip - inode of file to be modified. ++ * vap - new attribute values. ++ * If ATTR_XVATTR set, then optional attrs are being set ++ * flags - ATTR_UTIME set if non-default time values provided. ++ * - ATTR_NOACLCHECK (CIFS context only). ++ * cr - credentials of caller. ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * ip - ctime updated, mtime updated if size changed. ++ */ ++/* ARGSUSED */ ++int ++zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ zilog_t *zilog; ++ dmu_tx_t *tx; ++ vattr_t oldva; ++ xvattr_t *tmpxvattr; ++ uint_t mask = vap->va_mask; ++ uint_t saved_mask; ++ int trim_mask = 0; ++ uint64_t new_mode; ++ uint64_t new_uid, new_gid; ++ uint64_t xattr_obj; ++ uint64_t mtime[2], ctime[2]; ++ znode_t *attrzp; ++ int need_policy = FALSE; ++ int err, err2; ++ zfs_fuid_info_t *fuidp = NULL; ++ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ ++ xoptattr_t *xoap; ++ zfs_acl_t *aclp; ++ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ++ boolean_t fuid_dirtied = B_FALSE; ++ sa_bulk_attr_t *bulk, *xattr_bulk; ++ int count = 0, xattr_count = 0; ++ ++ if (mask == 0) ++ return (0); ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ zilog = zsb->z_log; ++ ++ /* ++ * Make sure that if we have ephemeral uid/gid or xvattr specified ++ * that file system is at proper version level ++ */ ++ ++ if (zsb->z_use_fuids == B_FALSE && ++ (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || ++ ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || ++ (mask & ATTR_XVATTR))) { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { ++ ZFS_EXIT(zsb); ++ return (EISDIR); ++ } ++ ++ if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ /* ++ * If this is an xvattr_t, then get a pointer to the structure of ++ * optional attributes. If this is NULL, then we have a vattr_t. 
++ */ ++ xoap = xva_getxoptattr(xvap); ++ ++ tmpxvattr = kmem_alloc(sizeof(xvattr_t), KM_SLEEP); ++ xva_init(tmpxvattr); ++ ++ bulk = kmem_alloc(sizeof(sa_bulk_attr_t) * 7, KM_SLEEP); ++ xattr_bulk = kmem_alloc(sizeof(sa_bulk_attr_t) * 7, KM_SLEEP); ++ ++ /* ++ * Immutable files can only alter immutable bit and atime ++ */ ++ if ((zp->z_pflags & ZFS_IMMUTABLE) && ++ ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || ++ ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ++ err = EPERM; ++ goto out3; ++ } ++ ++ if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { ++ err = EPERM; ++ goto out3; ++ } ++ ++ /* ++ * Verify timestamps doesn't overflow 32 bits. ++ * ZFS can handle large timestamps, but 32bit syscalls can't ++ * handle times greater than 2039. This check should be removed ++ * once large timestamps are fully supported. ++ */ ++ if (mask & (ATTR_ATIME | ATTR_MTIME)) { ++ if (((mask & ATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ++ ((mask & ATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ++ err = EOVERFLOW; ++ goto out3; ++ } ++ } ++ ++top: ++ attrzp = NULL; ++ aclp = NULL; ++ ++ /* Can this be moved to before the top label? */ ++ if (zfs_is_readonly(zsb)) { ++ err = EROFS; ++ goto out3; ++ } ++ ++ /* ++ * First validate permissions ++ */ ++ ++ if (mask & ATTR_SIZE) { ++ err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); ++ if (err) ++ goto out3; ++ ++ truncate_setsize(ip, vap->va_size); ++ ++ /* ++ * XXX - Note, we are not providing any open ++ * mode flags here (like FNDELAY), so we may ++ * block if there are locks present... this ++ * should be addressed in openat(). ++ */ ++ /* XXX - would it be OK to generate a log record here? */ ++ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); ++ if (err) ++ goto out3; ++ } ++ ++ if (mask & (ATTR_ATIME|ATTR_MTIME) || ++ ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || ++ XVA_ISSET_REQ(xvap, XAT_READONLY) || ++ XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || ++ XVA_ISSET_REQ(xvap, XAT_OFFLINE) || ++ XVA_ISSET_REQ(xvap, XAT_SPARSE) || ++ XVA_ISSET_REQ(xvap, XAT_CREATETIME) || ++ XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { ++ need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, ++ skipaclchk, cr); ++ } ++ ++ if (mask & (ATTR_UID|ATTR_GID)) { ++ int idmask = (mask & (ATTR_UID|ATTR_GID)); ++ int take_owner; ++ int take_group; ++ ++ /* ++ * NOTE: even if a new mode is being set, ++ * we may clear S_ISUID/S_ISGID bits. ++ */ ++ ++ if (!(mask & ATTR_MODE)) ++ vap->va_mode = zp->z_mode; ++ ++ /* ++ * Take ownership or chgrp to group we are a member of ++ */ ++ ++ take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); ++ take_group = (mask & ATTR_GID) && ++ zfs_groupmember(zsb, vap->va_gid, cr); ++ ++ /* ++ * If both ATTR_UID and ATTR_GID are set then take_owner and ++ * take_group must both be set in order to allow taking ++ * ownership. 
++ * ++ * Otherwise, send the check through secpolicy_vnode_setattr() ++ * ++ */ ++ ++ if (((idmask == (ATTR_UID|ATTR_GID)) && ++ take_owner && take_group) || ++ ((idmask == ATTR_UID) && take_owner) || ++ ((idmask == ATTR_GID) && take_group)) { ++ if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, ++ skipaclchk, cr) == 0) { ++ /* ++ * Remove setuid/setgid for non-privileged users ++ */ ++ (void) secpolicy_setid_clear(vap, cr); ++ trim_mask = (mask & (ATTR_UID|ATTR_GID)); ++ } else { ++ need_policy = TRUE; ++ } ++ } else { ++ need_policy = TRUE; ++ } ++ } ++ ++ mutex_enter(&zp->z_lock); ++ oldva.va_mode = zp->z_mode; ++ zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); ++ if (mask & ATTR_XVATTR) { ++ /* ++ * Update xvattr mask to include only those attributes ++ * that are actually changing. ++ * ++ * the bits will be restored prior to actually setting ++ * the attributes so the caller thinks they were set. ++ */ ++ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ++ if (xoap->xoa_appendonly != ++ ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { ++ need_policy = TRUE; ++ } else { ++ XVA_CLR_REQ(xvap, XAT_APPENDONLY); ++ XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); ++ } ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ++ if (xoap->xoa_nounlink != ++ ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { ++ need_policy = TRUE; ++ } else { ++ XVA_CLR_REQ(xvap, XAT_NOUNLINK); ++ XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); ++ } ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ++ if (xoap->xoa_immutable != ++ ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { ++ need_policy = TRUE; ++ } else { ++ XVA_CLR_REQ(xvap, XAT_IMMUTABLE); ++ XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); ++ } ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ++ if (xoap->xoa_nodump != ++ ((zp->z_pflags & ZFS_NODUMP) != 0)) { ++ need_policy = TRUE; ++ } else { ++ XVA_CLR_REQ(xvap, XAT_NODUMP); ++ XVA_SET_REQ(tmpxvattr, XAT_NODUMP); ++ } ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ++ if (xoap->xoa_av_modified != ++ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { ++ need_policy = TRUE; ++ } else { ++ XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); ++ XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); ++ } ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ++ if ((!S_ISREG(ip->i_mode) && ++ xoap->xoa_av_quarantined) || ++ xoap->xoa_av_quarantined != ++ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { ++ need_policy = TRUE; ++ } else { ++ XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); ++ XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); ++ } ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ++ mutex_exit(&zp->z_lock); ++ err = EPERM; ++ goto out3; ++ } ++ ++ if (need_policy == FALSE && ++ (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || ++ XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { ++ need_policy = TRUE; ++ } ++ } ++ ++ mutex_exit(&zp->z_lock); ++ ++ if (mask & ATTR_MODE) { ++ if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { ++ err = secpolicy_setid_setsticky_clear(ip, vap, ++ &oldva, cr); ++ if (err) ++ goto out3; ++ ++ trim_mask |= ATTR_MODE; ++ } else { ++ need_policy = TRUE; ++ } ++ } ++ ++ if (need_policy) { ++ /* ++ * If trim_mask is set then take ownership ++ * has been granted or write_acl is present and user ++ * has the ability to modify mode. In that case remove ++ * UID|GID and or MODE from mask so that ++ * secpolicy_vnode_setattr() doesn't revoke it. 
++ */ ++ ++ if (trim_mask) { ++ saved_mask = vap->va_mask; ++ vap->va_mask &= ~trim_mask; ++ } ++ err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, ++ (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); ++ if (err) ++ goto out3; ++ ++ if (trim_mask) ++ vap->va_mask |= saved_mask; ++ } ++ ++ /* ++ * secpolicy_vnode_setattr, or take ownership may have ++ * changed va_mask ++ */ ++ mask = vap->va_mask; ++ ++ if ((mask & (ATTR_UID | ATTR_GID))) { ++ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zsb), ++ &xattr_obj, sizeof (xattr_obj)); ++ ++ if (err == 0 && xattr_obj) { ++ err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); ++ if (err) ++ goto out2; ++ } ++ if (mask & ATTR_UID) { ++ new_uid = zfs_fuid_create(zsb, ++ (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); ++ if (new_uid != zp->z_uid && ++ zfs_fuid_overquota(zsb, B_FALSE, new_uid)) { ++ if (attrzp) ++ iput(ZTOI(attrzp)); ++ err = EDQUOT; ++ goto out2; ++ } ++ } ++ ++ if (mask & ATTR_GID) { ++ new_gid = zfs_fuid_create(zsb, (uint64_t)vap->va_gid, ++ cr, ZFS_GROUP, &fuidp); ++ if (new_gid != zp->z_gid && ++ zfs_fuid_overquota(zsb, B_TRUE, new_gid)) { ++ if (attrzp) ++ iput(ZTOI(attrzp)); ++ err = EDQUOT; ++ goto out2; ++ } ++ } ++ } ++ tx = dmu_tx_create(zsb->z_os); ++ ++ if (mask & ATTR_MODE) { ++ uint64_t pmode = zp->z_mode; ++ uint64_t acl_obj; ++ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); ++ ++ zfs_acl_chmod_setattr(zp, &aclp, new_mode); ++ ++ mutex_enter(&zp->z_lock); ++ if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { ++ /* ++ * Are we upgrading ACL from old V0 format ++ * to V1 format? ++ */ ++ if (zsb->z_version >= ZPL_VERSION_FUID && ++ zfs_znode_acl_version(zp) == ++ ZFS_ACL_VERSION_INITIAL) { ++ dmu_tx_hold_free(tx, acl_obj, 0, ++ DMU_OBJECT_END); ++ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, ++ 0, aclp->z_acl_bytes); ++ } else { ++ dmu_tx_hold_write(tx, acl_obj, 0, ++ aclp->z_acl_bytes); ++ } ++ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { ++ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, ++ 0, aclp->z_acl_bytes); ++ } ++ mutex_exit(&zp->z_lock); ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); ++ } else { ++ if ((mask & ATTR_XVATTR) && ++ XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); ++ else ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); ++ } ++ ++ if (attrzp) { ++ dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); ++ } ++ ++ fuid_dirtied = zsb->z_fuid_dirty; ++ if (fuid_dirtied) ++ zfs_fuid_txhold(zsb, tx); ++ ++ zfs_sa_upgrade_txholds(tx, zp); ++ ++ err = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (err) { ++ if (err == ERESTART) ++ dmu_tx_wait(tx); ++ goto out; ++ } ++ ++ count = 0; ++ /* ++ * Set each attribute requested. ++ * We group settings according to the locks they need to acquire. ++ * ++ * Note: you cannot set ctime directly, although it will be ++ * updated as a side-effect of calling this function. 
++ */ ++ ++ ++ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) ++ mutex_enter(&zp->z_acl_lock); ++ mutex_enter(&zp->z_lock); ++ ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL, ++ &zp->z_pflags, sizeof (zp->z_pflags)); ++ ++ if (attrzp) { ++ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) ++ mutex_enter(&attrzp->z_acl_lock); ++ mutex_enter(&attrzp->z_lock); ++ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, ++ SA_ZPL_FLAGS(zsb), NULL, &attrzp->z_pflags, ++ sizeof (attrzp->z_pflags)); ++ } ++ ++ if (mask & (ATTR_UID|ATTR_GID)) { ++ ++ if (mask & ATTR_UID) { ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zsb), NULL, ++ &new_uid, sizeof (new_uid)); ++ zp->z_uid = new_uid; ++ if (attrzp) { ++ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, ++ SA_ZPL_UID(zsb), NULL, &new_uid, ++ sizeof (new_uid)); ++ attrzp->z_uid = new_uid; ++ } ++ } ++ ++ if (mask & ATTR_GID) { ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zsb), ++ NULL, &new_gid, sizeof (new_gid)); ++ zp->z_gid = new_gid; ++ if (attrzp) { ++ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, ++ SA_ZPL_GID(zsb), NULL, &new_gid, ++ sizeof (new_gid)); ++ attrzp->z_gid = new_gid; ++ } ++ } ++ if (!(mask & ATTR_MODE)) { ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb), ++ NULL, &new_mode, sizeof (new_mode)); ++ new_mode = zp->z_mode; ++ } ++ err = zfs_acl_chown_setattr(zp); ++ ASSERT(err == 0); ++ if (attrzp) { ++ err = zfs_acl_chown_setattr(attrzp); ++ ASSERT(err == 0); ++ } ++ } ++ ++ if (mask & ATTR_MODE) { ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb), NULL, ++ &new_mode, sizeof (new_mode)); ++ zp->z_mode = new_mode; ++ ASSERT3P(aclp, !=, NULL); ++ err = zfs_aclset_common(zp, aclp, cr, tx); ++ ASSERT3U(err, ==, 0); ++ if (zp->z_acl_cached) ++ zfs_acl_free(zp->z_acl_cached); ++ zp->z_acl_cached = aclp; ++ aclp = NULL; ++ } ++ ++ ++ if (mask & ATTR_ATIME) { ++ ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zsb), NULL, ++ &zp->z_atime, sizeof (zp->z_atime)); ++ } ++ ++ if (mask & ATTR_MTIME) { ++ ZFS_TIME_ENCODE(&vap->va_mtime, mtime); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, ++ mtime, sizeof (mtime)); ++ } ++ ++ /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ ++ if (mask & ATTR_SIZE && !(mask & ATTR_MTIME)) { ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), ++ NULL, mtime, sizeof (mtime)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, ++ &ctime, sizeof (ctime)); ++ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, ++ B_TRUE); ++ } else if (mask != 0) { ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, ++ &ctime, sizeof (ctime)); ++ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, ++ B_TRUE); ++ if (attrzp) { ++ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, ++ SA_ZPL_CTIME(zsb), NULL, ++ &ctime, sizeof (ctime)); ++ zfs_tstamp_update_setup(attrzp, STATE_CHANGED, ++ mtime, ctime, B_TRUE); ++ } ++ } ++ /* ++ * Do this after setting timestamps to prevent timestamp ++ * update from toggling bit ++ */ ++ ++ if (xoap && (mask & ATTR_XVATTR)) { ++ ++ /* ++ * restore trimmed off masks ++ * so that return masks can be set for caller. 
++ */ ++ ++ if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { ++ XVA_SET_REQ(xvap, XAT_APPENDONLY); ++ } ++ if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { ++ XVA_SET_REQ(xvap, XAT_NOUNLINK); ++ } ++ if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { ++ XVA_SET_REQ(xvap, XAT_IMMUTABLE); ++ } ++ if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { ++ XVA_SET_REQ(xvap, XAT_NODUMP); ++ } ++ if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { ++ XVA_SET_REQ(xvap, XAT_AV_MODIFIED); ++ } ++ if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { ++ XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); ++ } ++ ++ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ++ ASSERT(S_ISREG(ip->i_mode)); ++ ++ zfs_xvattr_set(zp, xvap, tx); ++ } ++ ++ if (fuid_dirtied) ++ zfs_fuid_sync(zsb, tx); ++ ++ if (mask != 0) ++ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); ++ ++ mutex_exit(&zp->z_lock); ++ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) ++ mutex_exit(&zp->z_acl_lock); ++ ++ if (attrzp) { ++ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) ++ mutex_exit(&attrzp->z_acl_lock); ++ mutex_exit(&attrzp->z_lock); ++ } ++out: ++ if (err == 0 && attrzp) { ++ err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, ++ xattr_count, tx); ++ ASSERT(err2 == 0); ++ } ++ ++ if (attrzp) ++ iput(ZTOI(attrzp)); ++ if (aclp) ++ zfs_acl_free(aclp); ++ ++ if (fuidp) { ++ zfs_fuid_info_free(fuidp); ++ fuidp = NULL; ++ } ++ ++ if (err) { ++ dmu_tx_abort(tx); ++ if (err == ERESTART) ++ goto top; ++ } else { ++ err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ++ dmu_tx_commit(tx); ++ zfs_inode_update(zp); ++ } ++ ++out2: ++ if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ++ zil_commit(zilog, 0); ++ ++out3: ++ kmem_free(xattr_bulk, sizeof(sa_bulk_attr_t) * 7); ++ kmem_free(bulk, sizeof(sa_bulk_attr_t) * 7); ++ kmem_free(tmpxvattr, sizeof(xvattr_t)); ++ ZFS_EXIT(zsb); ++ return (err); ++} ++EXPORT_SYMBOL(zfs_setattr); ++ ++typedef struct zfs_zlock { ++ krwlock_t *zl_rwlock; /* lock we acquired */ ++ znode_t *zl_znode; /* znode we held */ ++ struct zfs_zlock *zl_next; /* next in list */ ++} zfs_zlock_t; ++ ++/* ++ * Drop locks and release vnodes that were held by zfs_rename_lock(). ++ */ ++static void ++zfs_rename_unlock(zfs_zlock_t **zlpp) ++{ ++ zfs_zlock_t *zl; ++ ++ while ((zl = *zlpp) != NULL) { ++ if (zl->zl_znode != NULL) ++ iput(ZTOI(zl->zl_znode)); ++ rw_exit(zl->zl_rwlock); ++ *zlpp = zl->zl_next; ++ kmem_free(zl, sizeof (*zl)); ++ } ++} ++ ++/* ++ * Search back through the directory tree, using the ".." entries. ++ * Lock each directory in the chain to prevent concurrent renames. ++ * Fail any attempt to move a directory into one of its own descendants. ++ * XXX - z_parent_lock can overlap with map or grow locks ++ */ ++static int ++zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) ++{ ++ zfs_zlock_t *zl; ++ znode_t *zp = tdzp; ++ uint64_t rootid = ZTOZSB(zp)->z_root; ++ uint64_t oidp = zp->z_id; ++ krwlock_t *rwlp = &szp->z_parent_lock; ++ krw_t rw = RW_WRITER; ++ ++ /* ++ * First pass write-locks szp and compares to zp->z_id. ++ * Later passes read-lock zp and compare to zp->z_parent. ++ */ ++ do { ++ if (!rw_tryenter(rwlp, rw)) { ++ /* ++ * Another thread is renaming in this path. ++ * Note that if we are a WRITER, we don't have any ++ * parent_locks held yet. 
++ */ ++ if (rw == RW_READER && zp->z_id > szp->z_id) { ++ /* ++ * Drop our locks and restart ++ */ ++ zfs_rename_unlock(&zl); ++ *zlpp = NULL; ++ zp = tdzp; ++ oidp = zp->z_id; ++ rwlp = &szp->z_parent_lock; ++ rw = RW_WRITER; ++ continue; ++ } else { ++ /* ++ * Wait for other thread to drop its locks ++ */ ++ rw_enter(rwlp, rw); ++ } ++ } ++ ++ zl = kmem_alloc(sizeof (*zl), KM_SLEEP); ++ zl->zl_rwlock = rwlp; ++ zl->zl_znode = NULL; ++ zl->zl_next = *zlpp; ++ *zlpp = zl; ++ ++ if (oidp == szp->z_id) /* We're a descendant of szp */ ++ return (EINVAL); ++ ++ if (oidp == rootid) /* We've hit the top */ ++ return (0); ++ ++ if (rw == RW_READER) { /* i.e. not the first pass */ ++ int error = zfs_zget(ZTOZSB(zp), oidp, &zp); ++ if (error) ++ return (error); ++ zl->zl_znode = zp; ++ } ++ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), ++ &oidp, sizeof (oidp)); ++ rwlp = &zp->z_parent_lock; ++ rw = RW_READER; ++ ++ } while (zp->z_id != sdzp->z_id); ++ ++ return (0); ++} ++ ++/* ++ * Move an entry from the provided source directory to the target ++ * directory. Change the entry name as indicated. ++ * ++ * IN: sdip - Source directory containing the "old entry". ++ * snm - Old entry name. ++ * tdip - Target directory to contain the "new entry". ++ * tnm - New entry name. ++ * cr - credentials of caller. ++ * flags - case flags ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * sdip,tdip - ctime|mtime updated ++ */ ++/*ARGSUSED*/ ++int ++zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, ++ cred_t *cr, int flags) ++{ ++ znode_t *tdzp, *szp, *tzp; ++ znode_t *sdzp = ITOZ(sdip); ++ zfs_sb_t *zsb = ITOZSB(sdip); ++ zilog_t *zilog; ++ zfs_dirlock_t *sdl, *tdl; ++ dmu_tx_t *tx; ++ zfs_zlock_t *zl; ++ int cmp, serr, terr; ++ int error = 0; ++ int zflg = 0; ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(sdzp); ++ zilog = zsb->z_log; ++ ++ if (tdip->i_sb != sdip->i_sb) { ++ ZFS_EXIT(zsb); ++ return (EXDEV); ++ } ++ ++ tdzp = ITOZ(tdip); ++ ZFS_VERIFY_ZP(tdzp); ++ if (zsb->z_utf8 && u8_validate(tnm, ++ strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ++ ZFS_EXIT(zsb); ++ return (EILSEQ); ++ } ++ ++ if (flags & FIGNORECASE) ++ zflg |= ZCILOOK; ++ ++top: ++ szp = NULL; ++ tzp = NULL; ++ zl = NULL; ++ ++ /* ++ * This is to prevent the creation of links into attribute space ++ * by renaming a linked file into/outof an attribute directory. ++ * See the comment in zfs_link() for why this is considered bad. ++ */ ++ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ /* ++ * Lock source and target directory entries. To prevent deadlock, ++ * a lock ordering must be defined. We lock the directory with ++ * the smallest object id first, or if it's a tie, the one with ++ * the lexically first name. ++ */ ++ if (sdzp->z_id < tdzp->z_id) { ++ cmp = -1; ++ } else if (sdzp->z_id > tdzp->z_id) { ++ cmp = 1; ++ } else { ++ /* ++ * First compare the two name arguments without ++ * considering any case folding. ++ */ ++ int nofold = (zsb->z_norm & ~U8_TEXTPREP_TOUPPER); ++ ++ cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); ++ ASSERT(error == 0 || !zsb->z_utf8); ++ if (cmp == 0) { ++ /* ++ * POSIX: "If the old argument and the new argument ++ * both refer to links to the same existing file, ++ * the rename() function shall return successfully ++ * and perform no other action." 
++ */ ++ ZFS_EXIT(zsb); ++ return (0); ++ } ++ /* ++ * If the file system is case-folding, then we may ++ * have some more checking to do. A case-folding file ++ * system is either supporting mixed case sensitivity ++ * access or is completely case-insensitive. Note ++ * that the file system is always case preserving. ++ * ++ * In mixed sensitivity mode case sensitive behavior ++ * is the default. FIGNORECASE must be used to ++ * explicitly request case insensitive behavior. ++ * ++ * If the source and target names provided differ only ++ * by case (e.g., a request to rename 'tim' to 'Tim'), ++ * we will treat this as a special case in the ++ * case-insensitive mode: as long as the source name ++ * is an exact match, we will allow this to proceed as ++ * a name-change request. ++ */ ++ if ((zsb->z_case == ZFS_CASE_INSENSITIVE || ++ (zsb->z_case == ZFS_CASE_MIXED && ++ flags & FIGNORECASE)) && ++ u8_strcmp(snm, tnm, 0, zsb->z_norm, U8_UNICODE_LATEST, ++ &error) == 0) { ++ /* ++ * case preserving rename request, require exact ++ * name matches ++ */ ++ zflg |= ZCIEXACT; ++ zflg &= ~ZCILOOK; ++ } ++ } ++ ++ /* ++ * If the source and destination directories are the same, we should ++ * grab the z_name_lock of that directory only once. ++ */ ++ if (sdzp == tdzp) { ++ zflg |= ZHAVELOCK; ++ rw_enter(&sdzp->z_name_lock, RW_READER); ++ } ++ ++ if (cmp < 0) { ++ serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ++ ZEXISTS | zflg, NULL, NULL); ++ terr = zfs_dirent_lock(&tdl, ++ tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); ++ } else { ++ terr = zfs_dirent_lock(&tdl, ++ tdzp, tnm, &tzp, zflg, NULL, NULL); ++ serr = zfs_dirent_lock(&sdl, ++ sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, ++ NULL, NULL); ++ } ++ ++ if (serr) { ++ /* ++ * Source entry invalid or not there. ++ */ ++ if (!terr) { ++ zfs_dirent_unlock(tdl); ++ if (tzp) ++ iput(ZTOI(tzp)); ++ } ++ ++ if (sdzp == tdzp) ++ rw_exit(&sdzp->z_name_lock); ++ ++ if (strcmp(snm, "..") == 0) ++ serr = EINVAL; ++ ZFS_EXIT(zsb); ++ return (serr); ++ } ++ if (terr) { ++ zfs_dirent_unlock(sdl); ++ iput(ZTOI(szp)); ++ ++ if (sdzp == tdzp) ++ rw_exit(&sdzp->z_name_lock); ++ ++ if (strcmp(tnm, "..") == 0) ++ terr = EINVAL; ++ ZFS_EXIT(zsb); ++ return (terr); ++ } ++ ++ /* ++ * Must have write access at the source to remove the old entry ++ * and write access at the target to create the new entry. ++ * Note that if target and source are the same, this can be ++ * done in a single check. ++ */ ++ ++ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) ++ goto out; ++ ++ if (S_ISDIR(ZTOI(szp)->i_mode)) { ++ /* ++ * Check to make sure rename is valid. ++ * Can't do a move like this: /usr/a/b to /usr/a/b/c/d ++ */ ++ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) ++ goto out; ++ } ++ ++ /* ++ * Does target exist? ++ */ ++ if (tzp) { ++ /* ++ * Source and target must be the same type. ++ */ ++ if (S_ISDIR(ZTOI(szp)->i_mode)) { ++ if (!S_ISDIR(ZTOI(tzp)->i_mode)) { ++ error = ENOTDIR; ++ goto out; ++ } ++ } else { ++ if (S_ISDIR(ZTOI(tzp)->i_mode)) { ++ error = EISDIR; ++ goto out; ++ } ++ } ++ /* ++ * POSIX dictates that when the source and target ++ * entries refer to the same file object, rename ++ * must do nothing and exit without error. 
++ */ ++ if (szp->z_id == tzp->z_id) { ++ error = 0; ++ goto out; ++ } ++ } ++ ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); ++ dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); ++ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); ++ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); ++ if (sdzp != tdzp) { ++ dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); ++ zfs_sa_upgrade_txholds(tx, tdzp); ++ } ++ if (tzp) { ++ dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); ++ zfs_sa_upgrade_txholds(tx, tzp); ++ } ++ ++ zfs_sa_upgrade_txholds(tx, szp); ++ dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL); ++ error = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (error) { ++ if (zl != NULL) ++ zfs_rename_unlock(&zl); ++ zfs_dirent_unlock(sdl); ++ zfs_dirent_unlock(tdl); ++ ++ if (sdzp == tdzp) ++ rw_exit(&sdzp->z_name_lock); ++ ++ iput(ZTOI(szp)); ++ if (tzp) ++ iput(ZTOI(tzp)); ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto top; ++ } ++ dmu_tx_abort(tx); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ if (tzp) /* Attempt to remove the existing target */ ++ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); ++ ++ if (error == 0) { ++ error = zfs_link_create(tdl, szp, tx, ZRENAMING); ++ if (error == 0) { ++ szp->z_pflags |= ZFS_AV_MODIFIED; ++ ++ error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zsb), ++ (void *)&szp->z_pflags, sizeof (uint64_t), tx); ++ ASSERT3U(error, ==, 0); ++ ++ error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); ++ if (error == 0) { ++ zfs_log_rename(zilog, tx, TX_RENAME | ++ (flags & FIGNORECASE ? TX_CI : 0), sdzp, ++ sdl->dl_name, tdzp, tdl->dl_name, szp); ++ } else { ++ /* ++ * At this point, we have successfully created ++ * the target name, but have failed to remove ++ * the source name. Since the create was done ++ * with the ZRENAMING flag, there are ++ * complications; for one, the link count is ++ * wrong. The easiest way to deal with this ++ * is to remove the newly created target, and ++ * return the original error. This must ++ * succeed; fortunately, it is very unlikely to ++ * fail, since we just created it. ++ */ ++ VERIFY3U(zfs_link_destroy(tdl, szp, tx, ++ ZRENAMING, NULL), ==, 0); ++ } ++ } ++ } ++ ++ dmu_tx_commit(tx); ++out: ++ if (zl != NULL) ++ zfs_rename_unlock(&zl); ++ ++ zfs_dirent_unlock(sdl); ++ zfs_dirent_unlock(tdl); ++ ++ zfs_inode_update(sdzp); ++ if (sdzp == tdzp) ++ rw_exit(&sdzp->z_name_lock); ++ ++ if (sdzp != tdzp) ++ zfs_inode_update(tdzp); ++ ++ zfs_inode_update(szp); ++ iput(ZTOI(szp)); ++ if (tzp) { ++ zfs_inode_update(tzp); ++ iput(ZTOI(tzp)); ++ } ++ ++ if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ++ zil_commit(zilog, 0); ++ ++ ZFS_EXIT(zsb); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_rename); ++ ++/* ++ * Insert the indicated symbolic reference entry into the directory. ++ * ++ * IN: dip - Directory to contain new symbolic link. ++ * link - Name for new symlink entry. ++ * vap - Attributes of new entry. ++ * target - Target path of new symlink. ++ * ++ * cr - credentials of caller. 
++ * flags - case flags ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * dip - ctime|mtime updated ++ */ ++/*ARGSUSED*/ ++int ++zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link, ++ struct inode **ipp, cred_t *cr, int flags) ++{ ++ znode_t *zp, *dzp = ITOZ(dip); ++ zfs_dirlock_t *dl; ++ dmu_tx_t *tx; ++ zfs_sb_t *zsb = ITOZSB(dip); ++ zilog_t *zilog; ++ uint64_t len = strlen(link); ++ int error; ++ int zflg = ZNEW; ++ zfs_acl_ids_t acl_ids; ++ boolean_t fuid_dirtied; ++ uint64_t txtype = TX_SYMLINK; ++ ++ ASSERT(S_ISLNK(vap->va_mode)); ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(dzp); ++ zilog = zsb->z_log; ++ ++ if (zsb->z_utf8 && u8_validate(name, strlen(name), ++ NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ++ ZFS_EXIT(zsb); ++ return (EILSEQ); ++ } ++ if (flags & FIGNORECASE) ++ zflg |= ZCILOOK; ++ ++ if (len > MAXPATHLEN) { ++ ZFS_EXIT(zsb); ++ return (ENAMETOOLONG); ++ } ++ ++ if ((error = zfs_acl_ids_create(dzp, 0, ++ vap, cr, NULL, &acl_ids)) != 0) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++top: ++ *ipp = NULL; ++ ++ /* ++ * Attempt to lock directory; fail if entry already exists. ++ */ ++ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); ++ if (error) { ++ zfs_acl_ids_free(&acl_ids); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { ++ zfs_acl_ids_free(&acl_ids); ++ zfs_dirent_unlock(dl); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ if (zfs_acl_ids_overquota(zsb, &acl_ids)) { ++ zfs_acl_ids_free(&acl_ids); ++ zfs_dirent_unlock(dl); ++ ZFS_EXIT(zsb); ++ return (EDQUOT); ++ } ++ tx = dmu_tx_create(zsb->z_os); ++ fuid_dirtied = zsb->z_fuid_dirty; ++ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); ++ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); ++ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ++ ZFS_SA_BASE_ATTR_SIZE + len); ++ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); ++ if (!zsb->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { ++ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ++ acl_ids.z_aclp->z_acl_bytes); ++ } ++ if (fuid_dirtied) ++ zfs_fuid_txhold(zsb, tx); ++ error = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (error) { ++ zfs_dirent_unlock(dl); ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto top; ++ } ++ zfs_acl_ids_free(&acl_ids); ++ dmu_tx_abort(tx); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ /* ++ * Create a new object for the symlink. ++ * for version 4 ZPL datsets the symlink will be an SA attribute ++ */ ++ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); ++ ++ if (fuid_dirtied) ++ zfs_fuid_sync(zsb, tx); ++ ++ mutex_enter(&zp->z_lock); ++ if (zp->z_is_sa) ++ error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zsb), ++ link, len, tx); ++ else ++ zfs_sa_symlink(zp, link, len, tx); ++ mutex_exit(&zp->z_lock); ++ ++ zp->z_size = len; ++ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zsb), ++ &zp->z_size, sizeof (zp->z_size), tx); ++ /* ++ * Insert the new object into the directory. 
++ */ ++ (void) zfs_link_create(dl, zp, tx, ZNEW); ++ ++ if (flags & FIGNORECASE) ++ txtype |= TX_CI; ++ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); ++ ++ zfs_inode_update(dzp); ++ zfs_inode_update(zp); ++ ++ zfs_acl_ids_free(&acl_ids); ++ ++ dmu_tx_commit(tx); ++ ++ zfs_dirent_unlock(dl); ++ ++ *ipp = ZTOI(zp); ++ ++ if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ++ zil_commit(zilog, 0); ++ ++ ZFS_EXIT(zsb); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_symlink); ++ ++/* ++ * Return, in the buffer contained in the provided uio structure, ++ * the symbolic path referred to by ip. ++ * ++ * IN: ip - inode of symbolic link ++ * uio - structure to contain the link path. ++ * cr - credentials of caller. ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * ip - atime updated ++ */ ++/* ARGSUSED */ ++int ++zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ int error; ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ mutex_enter(&zp->z_lock); ++ if (zp->z_is_sa) ++ error = sa_lookup_uio(zp->z_sa_hdl, ++ SA_ZPL_SYMLINK(zsb), uio); ++ else ++ error = zfs_sa_readlink(zp, uio); ++ mutex_exit(&zp->z_lock); ++ ++ ZFS_ACCESSTIME_STAMP(zsb, zp); ++ zfs_inode_update(zp); ++ ZFS_EXIT(zsb); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_readlink); ++ ++/* ++ * Insert a new entry into directory tdip referencing sip. ++ * ++ * IN: tdip - Directory to contain new entry. ++ * sip - inode of new entry. ++ * name - name of new entry. ++ * cr - credentials of caller. ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * tdip - ctime|mtime updated ++ * sip - ctime updated ++ */ ++/* ARGSUSED */ ++int ++zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr) ++{ ++ znode_t *dzp = ITOZ(tdip); ++ znode_t *tzp, *szp; ++ zfs_sb_t *zsb = ITOZSB(tdip); ++ zilog_t *zilog; ++ zfs_dirlock_t *dl; ++ dmu_tx_t *tx; ++ int error; ++ int zf = ZNEW; ++ uint64_t parent; ++ uid_t owner; ++ ++ ASSERT(S_ISDIR(tdip->i_mode)); ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(dzp); ++ zilog = zsb->z_log; ++ ++ /* ++ * POSIX dictates that we return EPERM here. ++ * Better choices include ENOTSUP or EISDIR. ++ */ ++ if (S_ISDIR(sip->i_mode)) { ++ ZFS_EXIT(zsb); ++ return (EPERM); ++ } ++ ++ if (sip->i_sb != tdip->i_sb) { ++ ZFS_EXIT(zsb); ++ return (EXDEV); ++ } ++ ++ szp = ITOZ(sip); ++ ZFS_VERIFY_ZP(szp); ++ ++ /* Prevent links to .zfs/shares files */ ++ ++ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zsb), ++ &parent, sizeof (uint64_t))) != 0) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ if (parent == zsb->z_shares_dir) { ++ ZFS_EXIT(zsb); ++ return (EPERM); ++ } ++ ++ if (zsb->z_utf8 && u8_validate(name, ++ strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ++ ZFS_EXIT(zsb); ++ return (EILSEQ); ++ } ++#ifdef HAVE_PN_UTILS ++ if (flags & FIGNORECASE) ++ zf |= ZCILOOK; ++#endif /* HAVE_PN_UTILS */ ++ ++ /* ++ * We do not support links between attributes and non-attributes ++ * because of the potential security risk of creating links ++ * into "normal" file space in order to circumvent restrictions ++ * imposed in attribute space. 
++ */ ++ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ owner = zfs_fuid_map_id(zsb, szp->z_uid, cr, ZFS_OWNER); ++ if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { ++ ZFS_EXIT(zsb); ++ return (EPERM); ++ } ++ ++ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++top: ++ /* ++ * Attempt to lock directory; fail if entry already exists. ++ */ ++ error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); ++ if (error) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); ++ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); ++ zfs_sa_upgrade_txholds(tx, szp); ++ zfs_sa_upgrade_txholds(tx, dzp); ++ error = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (error) { ++ zfs_dirent_unlock(dl); ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto top; ++ } ++ dmu_tx_abort(tx); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ error = zfs_link_create(dl, szp, tx, 0); ++ ++ if (error == 0) { ++ uint64_t txtype = TX_LINK; ++#ifdef HAVE_PN_UTILS ++ if (flags & FIGNORECASE) ++ txtype |= TX_CI; ++#endif /* HAVE_PN_UTILS */ ++ zfs_log_link(zilog, tx, txtype, dzp, szp, name); ++ } ++ ++ dmu_tx_commit(tx); ++ ++ zfs_dirent_unlock(dl); ++ ++ if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ++ zil_commit(zilog, 0); ++ ++ zfs_inode_update(dzp); ++ zfs_inode_update(szp); ++ ZFS_EXIT(zsb); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_link); ++ ++static void ++zfs_putpage_commit_cb(void *arg, int error) ++{ ++ struct page *pp = arg; ++ ++ if (error) { ++ __set_page_dirty_nobuffers(pp); ++ ++ if (error != ECANCELED) ++ SetPageError(pp); ++ } else { ++ ClearPageError(pp); ++ } ++ ++ end_page_writeback(pp); ++} ++ ++/* ++ * Push a page out to disk, once the page is on stable storage the ++ * registered commit callback will be run as notification of completion. ++ * ++ * IN: ip - page mapped for inode. ++ * pp - page to push (page is locked) ++ * wbc - writeback control data ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * ip - ctime|mtime updated ++ */ ++/* ARGSUSED */ ++int ++zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ loff_t offset; ++ loff_t pgoff; ++ unsigned int pglen; ++ rl_t *rl; ++ dmu_tx_t *tx; ++ caddr_t va; ++ int err = 0; ++ uint64_t mtime[2], ctime[2]; ++ sa_bulk_attr_t bulk[3]; ++ int cnt = 0; ++ int sync; ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ ASSERT(PageLocked(pp)); ++ ++ pgoff = page_offset(pp); /* Page byte-offset in file */ ++ offset = i_size_read(ip); /* File length in bytes */ ++ pglen = MIN(PAGE_CACHE_SIZE, /* Page length in bytes */ ++ P2ROUNDUP(offset, PAGE_CACHE_SIZE)-pgoff); ++ ++ /* Page is beyond end of file */ ++ if (pgoff >= offset) { ++ unlock_page(pp); ++ ZFS_EXIT(zsb); ++ return (0); ++ } ++ ++ /* Truncate page length to end of file */ ++ if (pgoff + pglen > offset) ++ pglen = offset - pgoff; ++ ++#if 0 ++ /* ++ * FIXME: Allow mmap writes past its quota. The correct fix ++ * is to register a page_mkwrite() handler to count the page ++ * against its quota when it is about to be dirtied. 
++ */ ++ if (zfs_owner_overquota(zsb, zp, B_FALSE) || ++ zfs_owner_overquota(zsb, zp, B_TRUE)) { ++ err = EDQUOT; ++ } ++#endif ++ ++ set_page_writeback(pp); ++ unlock_page(pp); ++ ++ rl = zfs_range_lock(zp, pgoff, pglen, RL_WRITER); ++ tx = dmu_tx_create(zsb->z_os); ++ ++ sync = ((zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) || ++ (wbc->sync_mode == WB_SYNC_ALL)); ++ if (!sync) ++ dmu_tx_callback_register(tx, zfs_putpage_commit_cb, pp); ++ ++ dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); ++ ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); ++ zfs_sa_upgrade_txholds(tx, zp); ++ err = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (err != 0) { ++ if (err == ERESTART) ++ dmu_tx_wait(tx); ++ ++ /* Will call all registered commit callbacks */ ++ dmu_tx_abort(tx); ++ ++ /* ++ * For the synchronous case the commit callback must be ++ * explicitly called because there is no registered callback. ++ */ ++ if (sync) ++ zfs_putpage_commit_cb(pp, ECANCELED); ++ ++ zfs_range_unlock(rl); ++ ZFS_EXIT(zsb); ++ return (err); ++ } ++ ++ va = kmap(pp); ++ ASSERT3U(pglen, <=, PAGE_CACHE_SIZE); ++ dmu_write(zsb->z_os, zp->z_id, pgoff, pglen, va, tx); ++ kunmap(pp); ++ ++ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zsb), NULL, &mtime, 16); ++ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zsb), NULL, &ctime, 16); ++ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zsb), NULL, &zp->z_pflags, 8); ++ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); ++ zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0); ++ ++ dmu_tx_commit(tx); ++ zfs_range_unlock(rl); ++ ASSERT3S(err, ==, 0); ++ ++ if (sync) { ++ zil_commit(zsb->z_log, zp->z_id); ++ zfs_putpage_commit_cb(pp, err); ++ } ++ ++ ZFS_EXIT(zsb); ++ return (err); ++} ++ ++/*ARGSUSED*/ ++void ++zfs_inactive(struct inode *ip) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ int error; ++ ++ if (zfsctl_is_node(ip)) { ++ zfsctl_inode_inactive(ip); ++ return; ++ } ++ ++ rw_enter(&zsb->z_teardown_inactive_lock, RW_READER); ++ if (zp->z_sa_hdl == NULL) { ++ rw_exit(&zsb->z_teardown_inactive_lock); ++ return; ++ } ++ ++ if (zp->z_atime_dirty && zp->z_unlinked == 0) { ++ dmu_tx_t *tx = dmu_tx_create(zsb->z_os); ++ ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); ++ zfs_sa_upgrade_txholds(tx, zp); ++ error = dmu_tx_assign(tx, TXG_WAIT); ++ if (error) { ++ dmu_tx_abort(tx); ++ } else { ++ mutex_enter(&zp->z_lock); ++ (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zsb), ++ (void *)&zp->z_atime, sizeof (zp->z_atime), tx); ++ zp->z_atime_dirty = 0; ++ mutex_exit(&zp->z_lock); ++ dmu_tx_commit(tx); ++ } ++ } ++ ++ zfs_zinactive(zp); ++ rw_exit(&zsb->z_teardown_inactive_lock); ++} ++EXPORT_SYMBOL(zfs_inactive); ++ ++/* ++ * Bounds-check the seek operation. ++ * ++ * IN: ip - inode seeking within ++ * ooff - old file offset ++ * noffp - pointer to new file offset ++ * ct - caller context ++ * ++ * RETURN: 0 if success ++ * EINVAL if new offset invalid ++ */ ++/* ARGSUSED */ ++int ++zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp) ++{ ++ if (S_ISDIR(ip->i_mode)) ++ return (0); ++ return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); ++} ++EXPORT_SYMBOL(zfs_seek); ++ ++/* ++ * Fill pages with data from the disk. 
++ */ ++static int ++zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ objset_t *os; ++ struct page *cur_pp; ++ u_offset_t io_off, total; ++ size_t io_len; ++ loff_t i_size; ++ unsigned page_idx; ++ int err; ++ ++ os = zsb->z_os; ++ io_len = nr_pages << PAGE_CACHE_SHIFT; ++ i_size = i_size_read(ip); ++ io_off = page_offset(pl[0]); ++ ++ if (io_off + io_len > i_size) ++ io_len = i_size - io_off; ++ ++ /* ++ * Iterate over list of pages and read each page individually. ++ */ ++ page_idx = 0; ++ cur_pp = pl[0]; ++ for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { ++ caddr_t va; ++ ++ va = kmap(cur_pp); ++ err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, ++ DMU_READ_PREFETCH); ++ kunmap(cur_pp); ++ if (err) { ++ /* convert checksum errors into IO errors */ ++ if (err == ECKSUM) ++ err = EIO; ++ return (err); ++ } ++ cur_pp = pl[++page_idx]; ++ } ++ ++ return (0); ++} ++ ++/* ++ * Uses zfs_fillpage to read data from the file and fill the pages. ++ * ++ * IN: ip - inode of file to get data from. ++ * pl - list of pages to read ++ * nr_pages - number of pages to read ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * vp - atime updated ++ */ ++/* ARGSUSED */ ++int ++zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ int err; ++ ++ if (pl == NULL) ++ return (0); ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ err = zfs_fillpage(ip, pl, nr_pages); ++ ++ if (!err) ++ ZFS_ACCESSTIME_STAMP(zsb, zp); ++ ++ ZFS_EXIT(zsb); ++ return (err); ++} ++EXPORT_SYMBOL(zfs_getpage); ++ ++/* ++ * Check ZFS specific permissions to memory map a section of a file. ++ * ++ * IN: ip - inode of the file to mmap ++ * off - file offset ++ * addrp - start address in memory region ++ * len - length of memory region ++ * vm_flags- address flags ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ */ ++/*ARGSUSED*/ ++int ++zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, ++ unsigned long vm_flags) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ if ((vm_flags & VM_WRITE) && (zp->z_pflags & ++ (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { ++ ZFS_EXIT(zsb); ++ return (EPERM); ++ } ++ ++ if ((vm_flags & (VM_READ | VM_EXEC)) && ++ (zp->z_pflags & ZFS_AV_QUARANTINED)) { ++ ZFS_EXIT(zsb); ++ return (EACCES); ++ } ++ ++ if (off < 0 || len > MAXOFFSET_T - off) { ++ ZFS_EXIT(zsb); ++ return (ENXIO); ++ } ++ ++ ZFS_EXIT(zsb); ++ return (0); ++} ++EXPORT_SYMBOL(zfs_map); ++ ++/* ++ * convoff - converts the given data (start, whence) to the ++ * given whence. 
++ */ ++int ++convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset) ++{ ++ vattr_t vap; ++ int error; ++ ++ if ((lckdat->l_whence == 2) || (whence == 2)) { ++ if ((error = zfs_getattr(ip, &vap, 0, CRED()) != 0)) ++ return (error); ++ } ++ ++ switch (lckdat->l_whence) { ++ case 1: ++ lckdat->l_start += offset; ++ break; ++ case 2: ++ lckdat->l_start += vap.va_size; ++ /* FALLTHRU */ ++ case 0: ++ break; ++ default: ++ return (EINVAL); ++ } ++ ++ if (lckdat->l_start < 0) ++ return (EINVAL); ++ ++ switch (whence) { ++ case 1: ++ lckdat->l_start -= offset; ++ break; ++ case 2: ++ lckdat->l_start -= vap.va_size; ++ /* FALLTHRU */ ++ case 0: ++ break; ++ default: ++ return (EINVAL); ++ } ++ ++ lckdat->l_whence = (short)whence; ++ return (0); ++} ++ ++/* ++ * Free or allocate space in a file. Currently, this function only ++ * supports the `F_FREESP' command. However, this command is somewhat ++ * misnamed, as its functionality includes the ability to allocate as ++ * well as free space. ++ * ++ * IN: ip - inode of file to free data in. ++ * cmd - action to take (only F_FREESP supported). ++ * bfp - section of file to free/alloc. ++ * flag - current file open mode flags. ++ * offset - current file offset. ++ * cr - credentials of caller [UNUSED]. ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ * ++ * Timestamps: ++ * ip - ctime|mtime updated ++ */ ++/* ARGSUSED */ ++int ++zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag, ++ offset_t offset, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ uint64_t off, len; ++ int error; ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ if (cmd != F_FREESP) { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ if ((error = convoff(ip, bfp, 0, offset))) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ if (bfp->l_len < 0) { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ /* ++ * Permissions aren't checked on Solaris because on this OS ++ * zfs_space() can only be called with an opened file handle. ++ * On Linux we can get here through truncate_range() which ++ * operates directly on inodes, so we need to check access rights. ++ */ ++ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ off = bfp->l_start; ++ len = bfp->l_len; /* 0 means from off to end of file */ ++ ++ error = zfs_freesp(zp, off, len, flag, TRUE); ++ ++ ZFS_EXIT(zsb); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_space); ++ ++/*ARGSUSED*/ ++int ++zfs_fid(struct inode *ip, fid_t *fidp) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ uint32_t gen; ++ uint64_t gen64; ++ uint64_t object = zp->z_id; ++ zfid_short_t *zfid; ++ int size, i, error; ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zsb), ++ &gen64, sizeof (uint64_t))) != 0) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ gen = (uint32_t)gen64; ++ ++ size = (zsb->z_parent != zsb) ? 
LONG_FID_LEN : SHORT_FID_LEN; ++ if (fidp->fid_len < size) { ++ fidp->fid_len = size; ++ ZFS_EXIT(zsb); ++ return (ENOSPC); ++ } ++ ++ zfid = (zfid_short_t *)fidp; ++ ++ zfid->zf_len = size; ++ ++ for (i = 0; i < sizeof (zfid->zf_object); i++) ++ zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); ++ ++ /* Must have a non-zero generation number to distinguish from .zfs */ ++ if (gen == 0) ++ gen = 1; ++ for (i = 0; i < sizeof (zfid->zf_gen); i++) ++ zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); ++ ++ if (size == LONG_FID_LEN) { ++ uint64_t objsetid = dmu_objset_id(zsb->z_os); ++ zfid_long_t *zlfid; ++ ++ zlfid = (zfid_long_t *)fidp; ++ ++ for (i = 0; i < sizeof (zlfid->zf_setid); i++) ++ zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); ++ ++ /* XXX - this should be the generation number for the objset */ ++ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) ++ zlfid->zf_setgen[i] = 0; ++ } ++ ++ ZFS_EXIT(zsb); ++ return (0); ++} ++EXPORT_SYMBOL(zfs_fid); ++ ++/*ARGSUSED*/ ++int ++zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ int error; ++ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ error = zfs_getacl(zp, vsecp, skipaclchk, cr); ++ ZFS_EXIT(zsb); ++ ++ return (error); ++} ++EXPORT_SYMBOL(zfs_getsecattr); ++ ++/*ARGSUSED*/ ++int ++zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ int error; ++ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ++ zilog_t *zilog = zsb->z_log; ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ ++ error = zfs_setacl(zp, vsecp, skipaclchk, cr); ++ ++ if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ++ zil_commit(zilog, 0); ++ ++ ZFS_EXIT(zsb); ++ return (error); ++} ++EXPORT_SYMBOL(zfs_setsecattr); ++ ++#ifdef HAVE_UIO_ZEROCOPY ++/* ++ * Tunable, both must be a power of 2. ++ * ++ * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf ++ * zcr_blksz_max: if set to less than the file block size, allow loaning out of ++ * an arcbuf for a partial block read ++ */ ++int zcr_blksz_min = (1 << 10); /* 1K */ ++int zcr_blksz_max = (1 << 17); /* 128K */ ++ ++/*ARGSUSED*/ ++static int ++zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ITOZSB(ip); ++ int max_blksz = zsb->z_max_blksz; ++ uio_t *uio = &xuio->xu_uio; ++ ssize_t size = uio->uio_resid; ++ offset_t offset = uio->uio_loffset; ++ int blksz; ++ int fullblk, i; ++ arc_buf_t *abuf; ++ ssize_t maxsize; ++ int preamble, postamble; ++ ++ if (xuio->xu_type != UIOTYPE_ZEROCOPY) ++ return (EINVAL); ++ ++ ZFS_ENTER(zsb); ++ ZFS_VERIFY_ZP(zp); ++ switch (ioflag) { ++ case UIO_WRITE: ++ /* ++ * Loan out an arc_buf for write if write size is bigger than ++ * max_blksz, and the file's block size is also max_blksz. ++ */ ++ blksz = max_blksz; ++ if (size < blksz || zp->z_blksz != blksz) { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ /* ++ * Caller requests buffers for write before knowing where the ++ * write offset might be (e.g. NFS TCP write). 
++ */ ++ if (offset == -1) { ++ preamble = 0; ++ } else { ++ preamble = P2PHASE(offset, blksz); ++ if (preamble) { ++ preamble = blksz - preamble; ++ size -= preamble; ++ } ++ } ++ ++ postamble = P2PHASE(size, blksz); ++ size -= postamble; ++ ++ fullblk = size / blksz; ++ (void) dmu_xuio_init(xuio, ++ (preamble != 0) + fullblk + (postamble != 0)); ++ ++ /* ++ * Have to fix iov base/len for partial buffers. They ++ * currently represent full arc_buf's. ++ */ ++ if (preamble) { ++ /* data begins in the middle of the arc_buf */ ++ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), ++ blksz); ++ ASSERT(abuf); ++ (void) dmu_xuio_add(xuio, abuf, ++ blksz - preamble, preamble); ++ } ++ ++ for (i = 0; i < fullblk; i++) { ++ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), ++ blksz); ++ ASSERT(abuf); ++ (void) dmu_xuio_add(xuio, abuf, 0, blksz); ++ } ++ ++ if (postamble) { ++ /* data ends in the middle of the arc_buf */ ++ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), ++ blksz); ++ ASSERT(abuf); ++ (void) dmu_xuio_add(xuio, abuf, 0, postamble); ++ } ++ break; ++ case UIO_READ: ++ /* ++ * Loan out an arc_buf for read if the read size is larger than ++ * the current file block size. Block alignment is not ++ * considered. Partial arc_buf will be loaned out for read. ++ */ ++ blksz = zp->z_blksz; ++ if (blksz < zcr_blksz_min) ++ blksz = zcr_blksz_min; ++ if (blksz > zcr_blksz_max) ++ blksz = zcr_blksz_max; ++ /* avoid potential complexity of dealing with it */ ++ if (blksz > max_blksz) { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ maxsize = zp->z_size - uio->uio_loffset; ++ if (size > maxsize) ++ size = maxsize; ++ ++ if (size < blksz) { ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ break; ++ default: ++ ZFS_EXIT(zsb); ++ return (EINVAL); ++ } ++ ++ uio->uio_extflg = UIO_XUIO; ++ XUIO_XUZC_RW(xuio) = ioflag; ++ ZFS_EXIT(zsb); ++ return (0); ++} ++ ++/*ARGSUSED*/ ++static int ++zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr) ++{ ++ int i; ++ arc_buf_t *abuf; ++ int ioflag = XUIO_XUZC_RW(xuio); ++ ++ ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); ++ ++ i = dmu_xuio_cnt(xuio); ++ while (i-- > 0) { ++ abuf = dmu_xuio_arcbuf(xuio, i); ++ /* ++ * if abuf == NULL, it must be a write buffer ++ * that has been returned in zfs_write(). ++ */ ++ if (abuf) ++ dmu_return_arcbuf(abuf); ++ ASSERT(abuf || ioflag == UIO_WRITE); ++ } ++ ++ dmu_xuio_fini(xuio); ++ return (0); ++} ++#endif /* HAVE_UIO_ZEROCOPY */ ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++module_param(zfs_read_chunk_size, long, 0644); ++MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zfs_znode.c linux-3.2.33-go/fs/zfs/zfs/zfs_znode.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zfs_znode.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zfs_znode.c 2012-11-16 23:25:34.349039334 +0100 +@@ -0,0 +1,1800 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* Portions Copyright 2007 Jeremy Teo */ ++ ++#ifdef _KERNEL ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "fs/fs_subr.h" ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#endif /* _KERNEL */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "zfs_prop.h" ++#include "zfs_comutil.h" ++ ++/* ++ * Define ZNODE_STATS to turn on statistic gathering. By default, it is only ++ * turned on when DEBUG is also defined. ++ */ ++#ifdef DEBUG ++#define ZNODE_STATS ++#endif /* DEBUG */ ++ ++#ifdef ZNODE_STATS ++#define ZNODE_STAT_ADD(stat) ((stat)++) ++#else ++#define ZNODE_STAT_ADD(stat) /* nothing */ ++#endif /* ZNODE_STATS */ ++ ++/* ++ * Functions needed for userland (ie: libzpool) are not put under ++ * #ifdef_KERNEL; the rest of the functions have dependencies ++ * (such as VFS logic) that will not compile easily in userland. ++ */ ++#ifdef _KERNEL ++ ++static kmem_cache_t *znode_cache = NULL; ++ ++/*ARGSUSED*/ ++static int ++zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) ++{ ++ znode_t *zp = buf; ++ ++ inode_init_once(ZTOI(zp)); ++ list_link_init(&zp->z_link_node); ++ ++ mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); ++ rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); ++ rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); ++ mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); ++ rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); ++ ++ mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); ++ avl_create(&zp->z_range_avl, zfs_range_compare, ++ sizeof (rl_t), offsetof(rl_t, r_node)); ++ ++ zp->z_dirlocks = NULL; ++ zp->z_acl_cached = NULL; ++ zp->z_xattr_cached = NULL; ++ zp->z_moved = 0; ++ return (0); ++} ++ ++/*ARGSUSED*/ ++static void ++zfs_znode_cache_destructor(void *buf, void *arg) ++{ ++ znode_t *zp = buf; ++ ++ ASSERT(!list_link_active(&zp->z_link_node)); ++ mutex_destroy(&zp->z_lock); ++ rw_destroy(&zp->z_parent_lock); ++ rw_destroy(&zp->z_name_lock); ++ mutex_destroy(&zp->z_acl_lock); ++ rw_destroy(&zp->z_xattr_lock); ++ avl_destroy(&zp->z_range_avl); ++ mutex_destroy(&zp->z_range_lock); ++ ++ ASSERT(zp->z_dirlocks == NULL); ++ ASSERT(zp->z_acl_cached == NULL); ++ ASSERT(zp->z_xattr_cached == NULL); ++} ++ ++void ++zfs_znode_init(void) ++{ ++ /* ++ * Initialize zcache ++ */ ++ ASSERT(znode_cache == NULL); ++ znode_cache = kmem_cache_create("zfs_znode_cache", ++ sizeof (znode_t), 0, zfs_znode_cache_constructor, ++ zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_KMEM); ++} ++ ++void ++zfs_znode_fini(void) ++{ ++ /* ++ * Cleanup zcache ++ */ ++ if (znode_cache) ++ kmem_cache_destroy(znode_cache); ++ znode_cache = NULL; ++} ++ ++int ++zfs_create_share_dir(zfs_sb_t *zsb, dmu_tx_t *tx) ++{ ++#ifdef HAVE_SMB_SHARE ++ zfs_acl_ids_t acl_ids; ++ vattr_t vattr; ++ znode_t *sharezp; ++ vnode_t *vp; ++ znode_t *zp; ++ int error; ++ ++ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; ++ vattr.va_mode = S_IFDIR | 0555; ++ 
vattr.va_uid = crgetuid(kcred); ++ vattr.va_gid = crgetgid(kcred); ++ ++ sharezp = kmem_cache_alloc(znode_cache, KM_PUSHPAGE); ++ sharezp->z_moved = 0; ++ sharezp->z_unlinked = 0; ++ sharezp->z_atime_dirty = 0; ++ sharezp->z_zfsvfs = zfsvfs; ++ sharezp->z_is_sa = zfsvfs->z_use_sa; ++ ++ vp = ZTOV(sharezp); ++ vn_reinit(vp); ++ vp->v_type = VDIR; ++ ++ VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, ++ kcred, NULL, &acl_ids)); ++ zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); ++ ASSERT3P(zp, ==, sharezp); ++ ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */ ++ POINTER_INVALIDATE(&sharezp->z_zfsvfs); ++ error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, ++ ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); ++ zfsvfs->z_shares_dir = sharezp->z_id; ++ ++ zfs_acl_ids_free(&acl_ids); ++ // ZTOV(sharezp)->v_count = 0; ++ sa_handle_destroy(sharezp->z_sa_hdl); ++ kmem_cache_free(znode_cache, sharezp); ++ ++ return (error); ++#else ++ return (0); ++#endif /* HAVE_SMB_SHARE */ ++} ++ ++static void ++zfs_znode_sa_init(zfs_sb_t *zsb, znode_t *zp, ++ dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) ++{ ++ ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zsb, zp->z_id))); ++ ++ mutex_enter(&zp->z_lock); ++ ++ ASSERT(zp->z_sa_hdl == NULL); ++ ASSERT(zp->z_acl_cached == NULL); ++ if (sa_hdl == NULL) { ++ VERIFY(0 == sa_handle_get_from_db(zsb->z_os, db, zp, ++ SA_HDL_SHARED, &zp->z_sa_hdl)); ++ } else { ++ zp->z_sa_hdl = sa_hdl; ++ sa_set_userp(sa_hdl, zp); ++ } ++ ++ zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; ++ ++ mutex_exit(&zp->z_lock); ++} ++ ++void ++zfs_znode_dmu_fini(znode_t *zp) ++{ ++ ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(ZTOZSB(zp), zp->z_id)) || ++ zp->z_unlinked || ++ RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); ++ ++ sa_handle_destroy(zp->z_sa_hdl); ++ zp->z_sa_hdl = NULL; ++} ++ ++/* ++ * Called by new_inode() to allocate a new inode. ++ */ ++int ++zfs_inode_alloc(struct super_block *sb, struct inode **ip) ++{ ++ znode_t *zp; ++ ++ zp = kmem_cache_alloc(znode_cache, KM_PUSHPAGE); ++ *ip = ZTOI(zp); ++ ++ return (0); ++} ++ ++/* ++ * Called in multiple places when an inode should be destroyed. ++ */ ++void ++zfs_inode_destroy(struct inode *ip) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ ++ if (zfsctl_is_node(ip)) ++ zfsctl_inode_destroy(ip); ++ ++ mutex_enter(&zsb->z_znodes_lock); ++ list_remove(&zsb->z_all_znodes, zp); ++ zsb->z_nr_znodes--; ++ mutex_exit(&zsb->z_znodes_lock); ++ ++ if (zp->z_acl_cached) { ++ zfs_acl_free(zp->z_acl_cached); ++ zp->z_acl_cached = NULL; ++ } ++ ++ if (zp->z_xattr_cached) { ++ nvlist_free(zp->z_xattr_cached); ++ zp->z_xattr_cached = NULL; ++ } ++ ++ kmem_cache_free(znode_cache, zp); ++} ++ ++static void ++zfs_inode_set_ops(zfs_sb_t *zsb, struct inode *ip) ++{ ++ uint64_t rdev = 0; ++ ++ switch (ip->i_mode & S_IFMT) { ++ case S_IFREG: ++ ip->i_op = &zpl_inode_operations; ++ ip->i_fop = &zpl_file_operations; ++ ip->i_mapping->a_ops = &zpl_address_space_operations; ++ break; ++ ++ case S_IFDIR: ++ ip->i_op = &zpl_dir_inode_operations; ++ ip->i_fop = &zpl_dir_file_operations; ++ ITOZ(ip)->z_zn_prefetch = B_TRUE; ++ break; ++ ++ case S_IFLNK: ++ ip->i_op = &zpl_symlink_inode_operations; ++ break; ++ ++ /* ++ * rdev is only stored in a SA only for device files. 
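++ * FIFOs and sockets fall through to init_special_inode() as well, but ++ * with rdev left at its initial value of 0.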
++ */ ++ case S_IFCHR: ++ case S_IFBLK: ++ VERIFY(sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zsb), ++ &rdev, sizeof (rdev)) == 0); ++ /*FALLTHROUGH*/ ++ case S_IFIFO: ++ case S_IFSOCK: ++ init_special_inode(ip, ip->i_mode, rdev); ++ ip->i_op = &zpl_special_inode_operations; ++ break; ++ ++ default: ++ printk("ZFS: Invalid mode: 0x%x\n", ip->i_mode); ++ VERIFY(0); ++ } ++} ++ ++/* ++ * Construct a znode+inode and initialize. ++ * ++ * This does not do a call to dmu_set_user() that is ++ * up to the caller to do, in case you don't want to ++ * return the znode ++ */ ++static znode_t * ++zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz, ++ dmu_object_type_t obj_type, uint64_t obj, sa_handle_t *hdl, ++ struct dentry *dentry, struct inode *dip) ++{ ++ znode_t *zp; ++ struct inode *ip; ++ uint64_t parent; ++ sa_bulk_attr_t bulk[9]; ++ int count = 0; ++ ++ ASSERT(zsb != NULL); ++ ++ ip = new_inode(zsb->z_sb); ++ if (ip == NULL) ++ return (NULL); ++ ++ zp = ITOZ(ip); ++ ASSERT(zp->z_dirlocks == NULL); ++ ASSERT3P(zp->z_acl_cached, ==, NULL); ++ ASSERT3P(zp->z_xattr_cached, ==, NULL); ++ zp->z_moved = 0; ++ zp->z_sa_hdl = NULL; ++ zp->z_unlinked = 0; ++ zp->z_atime_dirty = 0; ++ zp->z_mapcnt = 0; ++ zp->z_id = db->db_object; ++ zp->z_blksz = blksz; ++ zp->z_seq = 0x7A4653; ++ zp->z_sync_cnt = 0; ++ zp->z_is_zvol = B_FALSE; ++ zp->z_is_mapped = B_FALSE; ++ zp->z_is_ctldir = B_FALSE; ++ ++ zfs_znode_sa_init(zsb, zp, db, obj_type, hdl); ++ ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb), NULL, &zp->z_mode, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zsb), NULL, &zp->z_gen, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zsb), NULL, &zp->z_size, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zsb), NULL, &zp->z_links, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL, ++ &zp->z_pflags, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zsb), NULL, ++ &parent, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zsb), NULL, ++ &zp->z_atime, 16); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zsb), NULL, &zp->z_uid, 8); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zsb), NULL, &zp->z_gid, 8); ++ ++ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) { ++ if (hdl == NULL) ++ sa_handle_destroy(zp->z_sa_hdl); ++ ++ goto error; ++ } ++ ++ ip->i_ino = obj; ++ zfs_inode_update(zp); ++ zfs_inode_set_ops(zsb, ip); ++ ++ if (insert_inode_locked(ip)) ++ goto error; ++ ++ if (dentry) { ++ if (zpl_xattr_security_init(ip, dip, &dentry->d_name)) ++ goto error; ++ ++ d_instantiate(dentry, ip); ++ } ++ ++ mutex_enter(&zsb->z_znodes_lock); ++ list_insert_tail(&zsb->z_all_znodes, zp); ++ zsb->z_nr_znodes++; ++ membar_producer(); ++ mutex_exit(&zsb->z_znodes_lock); ++ ++ unlock_new_inode(ip); ++ return (zp); ++ ++error: ++ unlock_new_inode(ip); ++ iput(ip); ++ return NULL; ++} ++ ++/* ++ * Update the embedded inode given the znode. We should work toward ++ * eliminating this function as soon as possible by removing values ++ * which are duplicated between the znode and inode. If the generic ++ * inode has the correct field it should be used, and the ZFS code ++ * updated to access the inode. This can be done incrementally. ++ */ ++void ++zfs_inode_update(znode_t *zp) ++{ ++ zfs_sb_t *zsb; ++ struct inode *ip; ++ uint32_t blksize; ++ uint64_t atime[2], mtime[2], ctime[2]; ++ ++ ASSERT(zp != NULL); ++ zsb = ZTOZSB(zp); ++ ip = ZTOI(zp); ++ ++ /* Skip .zfs control nodes which do not exist on disk. 
*/ ++ if (zfsctl_is_node(ip)) ++ return; ++ ++ sa_lookup(zp->z_sa_hdl, SA_ZPL_ATIME(zsb), &atime, 16); ++ sa_lookup(zp->z_sa_hdl, SA_ZPL_MTIME(zsb), &mtime, 16); ++ sa_lookup(zp->z_sa_hdl, SA_ZPL_CTIME(zsb), &ctime, 16); ++ ++ spin_lock(&ip->i_lock); ++ ip->i_generation = zp->z_gen; ++ ip->i_uid = zp->z_uid; ++ ip->i_gid = zp->z_gid; ++ set_nlink(ip, zp->z_links); ++ ip->i_mode = zp->z_mode; ++ ip->i_blkbits = SPA_MINBLOCKSHIFT; ++ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, ++ (u_longlong_t *)&ip->i_blocks); ++ ++ ZFS_TIME_DECODE(&ip->i_atime, atime); ++ ZFS_TIME_DECODE(&ip->i_mtime, mtime); ++ ZFS_TIME_DECODE(&ip->i_ctime, ctime); ++ ++ i_size_write(ip, zp->z_size); ++ spin_unlock(&ip->i_lock); ++} ++ ++static uint64_t empty_xattr; ++static uint64_t pad[4]; ++static zfs_acl_phys_t acl_phys; ++/* ++ * Create a new DMU object to hold a zfs znode. ++ * ++ * IN: dzp - parent directory for new znode ++ * vap - file attributes for new znode ++ * tx - dmu transaction id for zap operations ++ * cr - credentials of caller ++ * flag - flags: ++ * IS_ROOT_NODE - new object will be root ++ * IS_XATTR - new object is an attribute ++ * bonuslen - length of bonus buffer ++ * setaclp - File/Dir initial ACL ++ * fuidp - Tracks fuid allocation. ++ * ++ * OUT: zpp - allocated znode ++ * ++ */ ++void ++zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, ++ uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) ++{ ++ uint64_t crtime[2], atime[2], mtime[2], ctime[2]; ++ uint64_t mode, size, links, parent, pflags; ++ uint64_t dzp_pflags = 0; ++ uint64_t rdev = 0; ++ zfs_sb_t *zsb = ZTOZSB(dzp); ++ dmu_buf_t *db; ++ timestruc_t now; ++ uint64_t gen, obj; ++ int err; ++ int bonuslen; ++ sa_handle_t *sa_hdl; ++ dmu_object_type_t obj_type; ++ sa_bulk_attr_t *sa_attrs; ++ int cnt = 0; ++ zfs_acl_locator_cb_t locate = { 0 }; ++ ++ if (zsb->z_replay) { ++ obj = vap->va_nodeid; ++ now = vap->va_ctime; /* see zfs_replay_create() */ ++ gen = vap->va_nblocks; /* ditto */ ++ } else { ++ obj = 0; ++ gethrestime(&now); ++ gen = dmu_tx_get_txg(tx); ++ } ++ ++ obj_type = zsb->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; ++ bonuslen = (obj_type == DMU_OT_SA) ? ++ DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE; ++ ++ /* ++ * Create a new DMU object. ++ */ ++ /* ++ * There's currently no mechanism for pre-reading the blocks that will ++ * be needed to allocate a new object, so we accept the small chance ++ * that there will be an i/o error and we will fail one of the ++ * assertions below. ++ */ ++ if (S_ISDIR(vap->va_mode)) { ++ if (zsb->z_replay) { ++ err = zap_create_claim_norm(zsb->z_os, obj, ++ zsb->z_norm, DMU_OT_DIRECTORY_CONTENTS, ++ obj_type, bonuslen, tx); ++ ASSERT3U(err, ==, 0); ++ } else { ++ obj = zap_create_norm(zsb->z_os, ++ zsb->z_norm, DMU_OT_DIRECTORY_CONTENTS, ++ obj_type, bonuslen, tx); ++ } ++ } else { ++ if (zsb->z_replay) { ++ err = dmu_object_claim(zsb->z_os, obj, ++ DMU_OT_PLAIN_FILE_CONTENTS, 0, ++ obj_type, bonuslen, tx); ++ ASSERT3U(err, ==, 0); ++ } else { ++ obj = dmu_object_alloc(zsb->z_os, ++ DMU_OT_PLAIN_FILE_CONTENTS, 0, ++ obj_type, bonuslen, tx); ++ } ++ } ++ ++ ZFS_OBJ_HOLD_ENTER(zsb, obj); ++ VERIFY(0 == sa_buf_hold(zsb->z_os, obj, NULL, &db)); ++ ++ /* ++ * If this is the root, fix up the half-initialized parent pointer ++ * to reference the just-allocated physical data area. ++ */ ++ if (flag & IS_ROOT_NODE) { ++ dzp->z_id = obj; ++ } else { ++ dzp_pflags = dzp->z_pflags; ++ } ++ ++ /* ++ * If parent is an xattr, so am I. 
++ */ ++ if (dzp_pflags & ZFS_XATTR) { ++ flag |= IS_XATTR; ++ } ++ ++ if (zsb->z_use_fuids) ++ pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; ++ else ++ pflags = 0; ++ ++ if (S_ISDIR(vap->va_mode)) { ++ size = 2; /* contents ("." and "..") */ ++ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; ++ } else { ++ size = links = 0; ++ } ++ ++ if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) ++ rdev = vap->va_rdev; ++ ++ parent = dzp->z_id; ++ mode = acl_ids->z_mode; ++ if (flag & IS_XATTR) ++ pflags |= ZFS_XATTR; ++ ++ /* ++ * No execs denied will be deterimed when zfs_mode_compute() is called. ++ */ ++ pflags |= acl_ids->z_aclp->z_hints & ++ (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| ++ ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); ++ ++ ZFS_TIME_ENCODE(&now, crtime); ++ ZFS_TIME_ENCODE(&now, ctime); ++ ++ if (vap->va_mask & ATTR_ATIME) { ++ ZFS_TIME_ENCODE(&vap->va_atime, atime); ++ } else { ++ ZFS_TIME_ENCODE(&now, atime); ++ } ++ ++ if (vap->va_mask & ATTR_MTIME) { ++ ZFS_TIME_ENCODE(&vap->va_mtime, mtime); ++ } else { ++ ZFS_TIME_ENCODE(&now, mtime); ++ } ++ ++ /* Now add in all of the "SA" attributes */ ++ VERIFY(0 == sa_handle_get_from_db(zsb->z_os, db, NULL, SA_HDL_SHARED, ++ &sa_hdl)); ++ ++ /* ++ * Setup the array of attributes to be replaced/set on the new file ++ * ++ * order for DMU_OT_ZNODE is critical since it needs to be constructed ++ * in the old znode_phys_t format. Don't change this ordering ++ */ ++ sa_attrs = kmem_alloc(sizeof(sa_bulk_attr_t) * ZPL_END, KM_PUSHPAGE); ++ ++ if (obj_type == DMU_OT_ZNODE) { ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zsb), ++ NULL, &atime, 16); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zsb), ++ NULL, &mtime, 16); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zsb), ++ NULL, &ctime, 16); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zsb), ++ NULL, &crtime, 16); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zsb), ++ NULL, &gen, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zsb), ++ NULL, &mode, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zsb), ++ NULL, &size, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zsb), ++ NULL, &parent, 8); ++ } else { ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zsb), ++ NULL, &mode, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zsb), ++ NULL, &size, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zsb), ++ NULL, &gen, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zsb), ++ NULL, &acl_ids->z_fuid, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zsb), ++ NULL, &acl_ids->z_fgid, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zsb), ++ NULL, &parent, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zsb), ++ NULL, &pflags, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zsb), ++ NULL, &atime, 16); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zsb), ++ NULL, &mtime, 16); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zsb), ++ NULL, &ctime, 16); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zsb), ++ NULL, &crtime, 16); ++ } ++ ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zsb), NULL, &links, 8); ++ ++ if (obj_type == DMU_OT_ZNODE) { ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zsb), NULL, ++ &empty_xattr, 8); ++ } ++ if (obj_type == DMU_OT_ZNODE || ++ (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) { ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zsb), ++ NULL, &rdev, 8); ++ } ++ if (obj_type == DMU_OT_ZNODE) { ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zsb), ++ NULL, &pflags, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zsb), NULL, ++ 
&acl_ids->z_fuid, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zsb), NULL, ++ &acl_ids->z_fgid, 8); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zsb), NULL, pad, ++ sizeof (uint64_t) * 4); ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zsb), NULL, ++ &acl_phys, sizeof (zfs_acl_phys_t)); ++ } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zsb), NULL, ++ &acl_ids->z_aclp->z_acl_count, 8); ++ locate.cb_aclp = acl_ids->z_aclp; ++ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zsb), ++ zfs_acl_data_locator, &locate, ++ acl_ids->z_aclp->z_acl_bytes); ++ mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, ++ acl_ids->z_fuid, acl_ids->z_fgid); ++ } ++ ++ VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); ++ ++ if (!(flag & IS_ROOT_NODE)) { ++ *zpp = zfs_znode_alloc(zsb, db, 0, obj_type, obj, sa_hdl, ++ vap->va_dentry, ZTOI(dzp)); ++ ASSERT(*zpp != NULL); ++ ASSERT(dzp != NULL); ++ } else { ++ /* ++ * If we are creating the root node, the "parent" we ++ * passed in is the znode for the root. ++ */ ++ *zpp = dzp; ++ ++ (*zpp)->z_sa_hdl = sa_hdl; ++ } ++ ++ (*zpp)->z_pflags = pflags; ++ (*zpp)->z_mode = mode; ++ ++ if (obj_type == DMU_OT_ZNODE || ++ acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { ++ err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx); ++ ASSERT3S(err, ==, 0); ++ } ++ kmem_free(sa_attrs, sizeof(sa_bulk_attr_t) * ZPL_END); ++ ZFS_OBJ_HOLD_EXIT(zsb, obj); ++} ++ ++/* ++ * zfs_xvattr_set only updates the in-core attributes ++ * it is assumed the caller will be doing an sa_bulk_update ++ * to push the changes out ++ */ ++void ++zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) ++{ ++ xoptattr_t *xoap; ++ ++ xoap = xva_getxoptattr(xvap); ++ ASSERT(xoap); ++ ++ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { ++ uint64_t times[2]; ++ ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); ++ (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), ++ &times, sizeof (times), tx); ++ XVA_SET_RTN(xvap, XAT_CREATETIME); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ++ ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, ++ zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_READONLY); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ++ ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, ++ zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_HIDDEN); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ++ ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, ++ zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_SYSTEM); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ++ ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, ++ zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_ARCHIVE); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ++ ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, ++ zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_IMMUTABLE); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ++ ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, ++ zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_NOUNLINK); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ++ ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, ++ zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_APPENDONLY); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ++ ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, ++ zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_NODUMP); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ++ ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, ++ zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_OPAQUE); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ++ ZFS_ATTR_SET(zp,
ZFS_AV_QUARANTINED, ++ xoap->xoa_av_quarantined, zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ++ ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, ++ zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_AV_MODIFIED); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { ++ zfs_sa_set_scanstamp(zp, xvap, tx); ++ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ++ ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, ++ zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_REPARSE); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ++ ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, ++ zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_OFFLINE); ++ } ++ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ++ ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, ++ zp->z_pflags, tx); ++ XVA_SET_RTN(xvap, XAT_SPARSE); ++ } ++} ++ ++int ++zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp) ++{ ++ dmu_object_info_t doi; ++ dmu_buf_t *db; ++ znode_t *zp; ++ int err; ++ sa_handle_t *hdl; ++ struct inode *ip; ++ ++ *zpp = NULL; ++ ++again: ++ ip = ilookup(zsb->z_sb, obj_num); ++ ++ ZFS_OBJ_HOLD_ENTER(zsb, obj_num); ++ ++ err = sa_buf_hold(zsb->z_os, obj_num, NULL, &db); ++ if (err) { ++ ZFS_OBJ_HOLD_EXIT(zsb, obj_num); ++ iput(ip); ++ return (err); ++ } ++ ++ dmu_object_info_from_db(db, &doi); ++ if (doi.doi_bonus_type != DMU_OT_SA && ++ (doi.doi_bonus_type != DMU_OT_ZNODE || ++ (doi.doi_bonus_type == DMU_OT_ZNODE && ++ doi.doi_bonus_size < sizeof (znode_phys_t)))) { ++ sa_buf_rele(db, NULL); ++ ZFS_OBJ_HOLD_EXIT(zsb, obj_num); ++ iput(ip); ++ return (EINVAL); ++ } ++ ++ hdl = dmu_buf_get_user(db); ++ if (hdl != NULL) { ++ if (ip == NULL) { ++ /* ++ * ilookup returned NULL, which means ++ * the znode is dying - but the SA handle isn't ++ * quite dead yet, we need to drop any locks ++ * we're holding, re-schedule the task and try again. ++ */ ++ sa_buf_rele(db, NULL); ++ ZFS_OBJ_HOLD_EXIT(zsb, obj_num); ++ ++ schedule(); ++ goto again; ++ } ++ ++ zp = sa_get_userdata(hdl); ++ ++ /* ++ * Since "SA" does immediate eviction we ++ * should never find a sa handle that doesn't ++ * know about the znode. ++ */ ++ ++ ASSERT3P(zp, !=, NULL); ++ ++ mutex_enter(&zp->z_lock); ++ ASSERT3U(zp->z_id, ==, obj_num); ++ if (zp->z_unlinked) { ++ err = ENOENT; ++ } else { ++ igrab(ZTOI(zp)); ++ *zpp = zp; ++ err = 0; ++ } ++ sa_buf_rele(db, NULL); ++ mutex_exit(&zp->z_lock); ++ ZFS_OBJ_HOLD_EXIT(zsb, obj_num); ++ iput(ip); ++ return (err); ++ } ++ ++ ASSERT3P(ip, ==, NULL); ++ ++ /* ++ * Not found create new znode/vnode but only if file exists. ++ * ++ * There is a small window where zfs_vget() could ++ * find this object while a file create is still in ++ * progress. This is checked for in zfs_znode_alloc() ++ * ++ * if zfs_znode_alloc() fails it will drop the hold on the ++ * bonus buffer. 
++ */ ++ zp = zfs_znode_alloc(zsb, db, doi.doi_data_block_size, ++ doi.doi_bonus_type, obj_num, NULL, NULL, NULL); ++ if (zp == NULL) { ++ err = ENOENT; ++ } else { ++ *zpp = zp; ++ } ++ ZFS_OBJ_HOLD_EXIT(zsb, obj_num); ++ return (err); ++} ++ ++int ++zfs_rezget(znode_t *zp) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ dmu_object_info_t doi; ++ dmu_buf_t *db; ++ uint64_t obj_num = zp->z_id; ++ uint64_t mode; ++ sa_bulk_attr_t bulk[8]; ++ int err; ++ int count = 0; ++ uint64_t gen; ++ ++ ZFS_OBJ_HOLD_ENTER(zsb, obj_num); ++ ++ mutex_enter(&zp->z_acl_lock); ++ if (zp->z_acl_cached) { ++ zfs_acl_free(zp->z_acl_cached); ++ zp->z_acl_cached = NULL; ++ } ++ ++ mutex_exit(&zp->z_acl_lock); ++ ASSERT(zp->z_sa_hdl == NULL); ++ err = sa_buf_hold(zsb->z_os, obj_num, NULL, &db); ++ if (err) { ++ ZFS_OBJ_HOLD_EXIT(zsb, obj_num); ++ return (err); ++ } ++ ++ dmu_object_info_from_db(db, &doi); ++ if (doi.doi_bonus_type != DMU_OT_SA && ++ (doi.doi_bonus_type != DMU_OT_ZNODE || ++ (doi.doi_bonus_type == DMU_OT_ZNODE && ++ doi.doi_bonus_size < sizeof (znode_phys_t)))) { ++ sa_buf_rele(db, NULL); ++ ZFS_OBJ_HOLD_EXIT(zsb, obj_num); ++ return (EINVAL); ++ } ++ ++ zfs_znode_sa_init(zsb, zp, db, doi.doi_bonus_type, NULL); ++ ++ /* reload cached values */ ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zsb), NULL, ++ &gen, sizeof (gen)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zsb), NULL, ++ &zp->z_size, sizeof (zp->z_size)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zsb), NULL, ++ &zp->z_links, sizeof (zp->z_links)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL, ++ &zp->z_pflags, sizeof (zp->z_pflags)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zsb), NULL, ++ &zp->z_atime, sizeof (zp->z_atime)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zsb), NULL, ++ &zp->z_uid, sizeof (zp->z_uid)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zsb), NULL, ++ &zp->z_gid, sizeof (zp->z_gid)); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb), NULL, ++ &mode, sizeof (mode)); ++ ++ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { ++ zfs_znode_dmu_fini(zp); ++ ZFS_OBJ_HOLD_EXIT(zsb, obj_num); ++ return (EIO); ++ } ++ ++ zp->z_mode = mode; ++ ++ if (gen != zp->z_gen) { ++ zfs_znode_dmu_fini(zp); ++ ZFS_OBJ_HOLD_EXIT(zsb, obj_num); ++ return (EIO); ++ } ++ ++ zp->z_unlinked = (zp->z_links == 0); ++ zp->z_blksz = doi.doi_data_block_size; ++ ++ ZFS_OBJ_HOLD_EXIT(zsb, obj_num); ++ ++ return (0); ++} ++ ++void ++zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ objset_t *os = zsb->z_os; ++ uint64_t obj = zp->z_id; ++ uint64_t acl_obj = zfs_external_acl(zp); ++ ++ ZFS_OBJ_HOLD_ENTER(zsb, obj); ++ if (acl_obj) { ++ VERIFY(!zp->z_is_sa); ++ VERIFY(0 == dmu_object_free(os, acl_obj, tx)); ++ } ++ VERIFY(0 == dmu_object_free(os, obj, tx)); ++ zfs_znode_dmu_fini(zp); ++ ZFS_OBJ_HOLD_EXIT(zsb, obj); ++} ++ ++void ++zfs_zinactive(znode_t *zp) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ uint64_t z_id = zp->z_id; ++ boolean_t drop_mutex = 0; ++ ++ ASSERT(zp->z_sa_hdl); ++ ++ /* ++ * Don't allow a zfs_zget() while were trying to release this znode. ++ * ++ * Linux allows direct memory reclaim which means that any KM_SLEEP ++ * allocation may trigger inode eviction. This can lead to a deadlock ++ * through the ->shrink_icache_memory()->evict()->zfs_inactive()-> ++ * zfs_zinactive() call path. To avoid this deadlock the process ++ * must not reacquire the mutex when it is already holding it. 
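++ * ZFS_OBJ_HOLD_OWNED() is checked below so the hold is only taken, and ++ * later dropped, when this thread does not already own it.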
++ */ ++ if (!ZFS_OBJ_HOLD_OWNED(zsb, z_id)) { ++ ZFS_OBJ_HOLD_ENTER(zsb, z_id); ++ drop_mutex = 1; ++ } ++ ++ mutex_enter(&zp->z_lock); ++ ++ /* ++ * If this was the last reference to a file with no links, ++ * remove the file from the file system. ++ */ ++ if (zp->z_unlinked) { ++ mutex_exit(&zp->z_lock); ++ ++ if (drop_mutex) ++ ZFS_OBJ_HOLD_EXIT(zsb, z_id); ++ ++ zfs_rmnode(zp); ++ return; ++ } ++ ++ mutex_exit(&zp->z_lock); ++ zfs_znode_dmu_fini(zp); ++ ++ if (drop_mutex) ++ ZFS_OBJ_HOLD_EXIT(zsb, z_id); ++} ++ ++void ++zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], ++ uint64_t ctime[2], boolean_t have_tx) ++{ ++ timestruc_t now; ++ ++ gethrestime(&now); ++ ++ if (have_tx) { /* will sa_bulk_update happen really soon? */ ++ zp->z_atime_dirty = 0; ++ zp->z_seq++; ++ } else { ++ zp->z_atime_dirty = 1; ++ } ++ ++ if (flag & ATTR_ATIME) { ++ ZFS_TIME_ENCODE(&now, zp->z_atime); ++ } ++ ++ if (flag & ATTR_MTIME) { ++ ZFS_TIME_ENCODE(&now, mtime); ++ if (ZTOZSB(zp)->z_use_fuids) { ++ zp->z_pflags |= (ZFS_ARCHIVE | ++ ZFS_AV_MODIFIED); ++ } ++ } ++ ++ if (flag & ATTR_CTIME) { ++ ZFS_TIME_ENCODE(&now, ctime); ++ if (ZTOZSB(zp)->z_use_fuids) ++ zp->z_pflags |= ZFS_ARCHIVE; ++ } ++} ++ ++/* ++ * Grow the block size for a file. ++ * ++ * IN: zp - znode of file to free data in. ++ * size - requested block size ++ * tx - open transaction. ++ * ++ * NOTE: this function assumes that the znode is write locked. ++ */ ++void ++zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) ++{ ++ int error; ++ u_longlong_t dummy; ++ ++ if (size <= zp->z_blksz) ++ return; ++ /* ++ * If the file size is already greater than the current blocksize, ++ * we will not grow. If there is more than one block in a file, ++ * the blocksize cannot change. ++ */ ++ if (zp->z_blksz && zp->z_size > zp->z_blksz) ++ return; ++ ++ error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id, ++ size, 0, tx); ++ ++ if (error == ENOTSUP) ++ return; ++ ASSERT3U(error, ==, 0); ++ ++ /* What blocksize did we actually get? */ ++ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); ++} ++ ++/* ++ * Increase the file length ++ * ++ * IN: zp - znode of file to free data in. ++ * end - new end-of-file ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ */ ++static int ++zfs_extend(znode_t *zp, uint64_t end) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ dmu_tx_t *tx; ++ rl_t *rl; ++ uint64_t newblksz; ++ int error; ++ ++ /* ++ * We will change zp_size, lock the whole file. ++ */ ++ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); ++ ++ /* ++ * Nothing to do if file already at desired length. ++ */ ++ if (end <= zp->z_size) { ++ zfs_range_unlock(rl); ++ return (0); ++ } ++top: ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); ++ zfs_sa_upgrade_txholds(tx, zp); ++ if (end > zp->z_blksz && ++ (!ISP2(zp->z_blksz) || zp->z_blksz < zsb->z_max_blksz)) { ++ /* ++ * We are growing the file past the current block size. 
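++ * If the current block size already exceeds z_max_blksz it must be a ++ * non-power-of-two size (see the ASSERT below) and growth is capped at ++ * SPA_MAXBLOCKSIZE; otherwise it is capped at z_max_blksz.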
++ */ ++ if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { ++ ASSERT(!ISP2(zp->z_blksz)); ++ newblksz = MIN(end, SPA_MAXBLOCKSIZE); ++ } else { ++ newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); ++ } ++ dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); ++ } else { ++ newblksz = 0; ++ } ++ ++ error = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (error) { ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto top; ++ } ++ dmu_tx_abort(tx); ++ zfs_range_unlock(rl); ++ return (error); ++ } ++ ++ if (newblksz) ++ zfs_grow_blocksize(zp, newblksz, tx); ++ ++ zp->z_size = end; ++ ++ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), ++ &zp->z_size, sizeof (zp->z_size), tx)); ++ ++ zfs_range_unlock(rl); ++ ++ dmu_tx_commit(tx); ++ ++ return (0); ++} ++ ++/* ++ * Free space in a file. ++ * ++ * IN: zp - znode of file to free data in. ++ * off - start of section to free. ++ * len - length of section to free. ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ */ ++static int ++zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ rl_t *rl; ++ int error; ++ ++ /* ++ * Lock the range being freed. ++ */ ++ rl = zfs_range_lock(zp, off, len, RL_WRITER); ++ ++ /* ++ * Nothing to do if file already at desired length. ++ */ ++ if (off >= zp->z_size) { ++ zfs_range_unlock(rl); ++ return (0); ++ } ++ ++ if (off + len > zp->z_size) ++ len = zp->z_size - off; ++ ++ error = dmu_free_long_range(zsb->z_os, zp->z_id, off, len); ++ ++ zfs_range_unlock(rl); ++ ++ return (error); ++} ++ ++/* ++ * Truncate a file ++ * ++ * IN: zp - znode of file to free data in. ++ * end - new end-of-file. ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ */ ++static int ++zfs_trunc(znode_t *zp, uint64_t end) ++{ ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ dmu_tx_t *tx; ++ rl_t *rl; ++ int error; ++ sa_bulk_attr_t bulk[2]; ++ int count = 0; ++ ++ /* ++ * We will change zp_size, lock the whole file. ++ */ ++ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); ++ ++ /* ++ * Nothing to do if file already at desired length. ++ */ ++ if (end >= zp->z_size) { ++ zfs_range_unlock(rl); ++ return (0); ++ } ++ ++ error = dmu_free_long_range(zsb->z_os, zp->z_id, end, -1); ++ if (error) { ++ zfs_range_unlock(rl); ++ return (error); ++ } ++top: ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); ++ zfs_sa_upgrade_txholds(tx, zp); ++ error = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (error) { ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto top; ++ } ++ dmu_tx_abort(tx); ++ zfs_range_unlock(rl); ++ return (error); ++ } ++ ++ zp->z_size = end; ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zsb), ++ NULL, &zp->z_size, sizeof (zp->z_size)); ++ ++ if (end == 0) { ++ zp->z_pflags &= ~ZFS_SPARSE; ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), ++ NULL, &zp->z_pflags, 8); ++ } ++ VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); ++ ++ dmu_tx_commit(tx); ++ ++ zfs_range_unlock(rl); ++ ++ return (0); ++} ++ ++/* ++ * Free space in a file ++ * ++ * IN: zp - znode of file to free data in. ++ * off - start of range ++ * len - end of range (0 => EOF) ++ * flag - current file open mode flags. 
++ * log - TRUE if this action should be logged ++ * ++ * RETURN: 0 if success ++ * error code if failure ++ */ ++int ++zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) ++{ ++ struct inode *ip = ZTOI(zp); ++ dmu_tx_t *tx; ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ zilog_t *zilog = zsb->z_log; ++ uint64_t mode; ++ uint64_t mtime[2], ctime[2]; ++ sa_bulk_attr_t bulk[3]; ++ int count = 0; ++ int error; ++ ++ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zsb), &mode, ++ sizeof (mode))) != 0) ++ return (error); ++ ++ if (off > zp->z_size) { ++ error = zfs_extend(zp, off+len); ++ if (error == 0 && log) ++ goto log; ++ else ++ return (error); ++ } ++ ++ /* ++ * Check for any locks in the region to be freed. ++ */ ++ if (ip->i_flock && mandatory_lock(ip)) { ++ uint64_t length = (len ? len : zp->z_size - off); ++ if (!lock_may_write(ip, off, length)) ++ return (EAGAIN); ++ } ++ ++ if (len == 0) { ++ error = zfs_trunc(zp, off); ++ } else { ++ if ((error = zfs_free_range(zp, off, len)) == 0 && ++ off + len > zp->z_size) ++ error = zfs_extend(zp, off+len); ++ } ++ if (error || !log) ++ return (error); ++log: ++ tx = dmu_tx_create(zsb->z_os); ++ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); ++ zfs_sa_upgrade_txholds(tx, zp); ++ error = dmu_tx_assign(tx, TXG_NOWAIT); ++ if (error) { ++ if (error == ERESTART) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ goto log; ++ } ++ dmu_tx_abort(tx); ++ return (error); ++ } ++ ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, mtime, 16); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, ctime, 16); ++ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), ++ NULL, &zp->z_pflags, 8); ++ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); ++ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ++ ASSERT(error == 0); ++ ++ zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); ++ ++ dmu_tx_commit(tx); ++ zfs_inode_update(zp); ++ return (0); ++} ++ ++void ++zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) ++{ ++ struct super_block *sb; ++ zfs_sb_t *zsb; ++ uint64_t moid, obj, sa_obj, version; ++ uint64_t sense = ZFS_CASE_SENSITIVE; ++ uint64_t norm = 0; ++ nvpair_t *elem; ++ int error; ++ int i; ++ znode_t *rootzp = NULL; ++ vattr_t vattr; ++ znode_t *zp; ++ zfs_acl_ids_t acl_ids; ++ ++ /* ++ * First attempt to create master node. ++ */ ++ /* ++ * In an empty objset, there are no blocks to read and thus ++ * there can be no i/o errors (which we assert below). ++ */ ++ moid = MASTER_NODE_OBJ; ++ error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, ++ DMU_OT_NONE, 0, tx); ++ ASSERT(error == 0); ++ ++ /* ++ * Set starting attributes. 
++ */ ++ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); ++ elem = NULL; ++ while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { ++ /* For the moment we expect all zpl props to be uint64_ts */ ++ uint64_t val; ++ char *name; ++ ++ ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); ++ VERIFY(nvpair_value_uint64(elem, &val) == 0); ++ name = nvpair_name(elem); ++ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { ++ if (val < version) ++ version = val; ++ } else { ++ error = zap_update(os, moid, name, 8, 1, &val, tx); ++ } ++ ASSERT(error == 0); ++ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) ++ norm = val; ++ else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) ++ sense = val; ++ } ++ ASSERT(version != 0); ++ error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); ++ ++ /* ++ * Create zap object used for SA attribute registration ++ */ ++ ++ if (version >= ZPL_VERSION_SA) { ++ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, ++ DMU_OT_NONE, 0, tx); ++ error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ++ ASSERT(error == 0); ++ } else { ++ sa_obj = 0; ++ } ++ /* ++ * Create a delete queue. ++ */ ++ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); ++ ++ error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ++ ASSERT(error == 0); ++ ++ /* ++ * Create root znode. Create minimal znode/inode/zsb/sb ++ * to allow zfs_mknode to work. ++ */ ++ vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID; ++ vattr.va_mode = S_IFDIR|0755; ++ vattr.va_uid = crgetuid(cr); ++ vattr.va_gid = crgetgid(cr); ++ ++ rootzp = kmem_cache_alloc(znode_cache, KM_PUSHPAGE); ++ rootzp->z_moved = 0; ++ rootzp->z_unlinked = 0; ++ rootzp->z_atime_dirty = 0; ++ rootzp->z_is_sa = USE_SA(version, os); ++ ++ zsb = kmem_zalloc(sizeof (zfs_sb_t), KM_PUSHPAGE); ++ zsb->z_os = os; ++ zsb->z_parent = zsb; ++ zsb->z_version = version; ++ zsb->z_use_fuids = USE_FUIDS(version, os); ++ zsb->z_use_sa = USE_SA(version, os); ++ zsb->z_norm = norm; ++ ++ sb = kmem_zalloc(sizeof (struct super_block), KM_PUSHPAGE); ++ sb->s_fs_info = zsb; ++ ++ ZTOI(rootzp)->i_sb = sb; ++ ++ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, ++ &zsb->z_attr_table); ++ ++ ASSERT(error == 0); ++ ++ /* ++ * Fold case on file systems that are always or sometimes case ++ * insensitive. 
++ */ ++ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) ++ zsb->z_norm |= U8_TEXTPREP_TOUPPER; ++ ++ mutex_init(&zsb->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); ++ list_create(&zsb->z_all_znodes, sizeof (znode_t), ++ offsetof(znode_t, z_link_node)); ++ ++ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) ++ mutex_init(&zsb->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); ++ ++ VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, ++ cr, NULL, &acl_ids)); ++ zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ++ ASSERT3P(zp, ==, rootzp); ++ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ++ ASSERT(error == 0); ++ zfs_acl_ids_free(&acl_ids); ++ ++ atomic_set(&ZTOI(rootzp)->i_count, 0); ++ sa_handle_destroy(rootzp->z_sa_hdl); ++ kmem_cache_free(znode_cache, rootzp); ++ ++ /* ++ * Create shares directory ++ */ ++ error = zfs_create_share_dir(zsb, tx); ++ ASSERT(error == 0); ++ ++ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) ++ mutex_destroy(&zsb->z_hold_mtx[i]); ++ ++ kmem_free(sb, sizeof (struct super_block)); ++ kmem_free(zsb, sizeof (zfs_sb_t)); ++} ++#endif /* _KERNEL */ ++ ++static int ++zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) ++{ ++ uint64_t sa_obj = 0; ++ int error; ++ ++ error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); ++ if (error != 0 && error != ENOENT) ++ return (error); ++ ++ error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); ++ return (error); ++} ++ ++static int ++zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, ++ dmu_buf_t **db, void *tag) ++{ ++ dmu_object_info_t doi; ++ int error; ++ ++ if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) ++ return (error); ++ ++ dmu_object_info_from_db(*db, &doi); ++ if ((doi.doi_bonus_type != DMU_OT_SA && ++ doi.doi_bonus_type != DMU_OT_ZNODE) || ++ (doi.doi_bonus_type == DMU_OT_ZNODE && ++ doi.doi_bonus_size < sizeof (znode_phys_t))) { ++ sa_buf_rele(*db, tag); ++ return (ENOTSUP); ++ } ++ ++ error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); ++ if (error != 0) { ++ sa_buf_rele(*db, tag); ++ return (error); ++ } ++ ++ return (0); ++} ++ ++void ++zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) ++{ ++ sa_handle_destroy(hdl); ++ sa_buf_rele(db, tag); ++} ++ ++/* ++ * Given an object number, return its parent object number and whether ++ * or not the object is an extended attribute directory. 
++ */ ++static int ++zfs_obj_to_pobj(sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, ++ int *is_xattrdir) ++{ ++ uint64_t parent; ++ uint64_t pflags; ++ uint64_t mode; ++ sa_bulk_attr_t bulk[3]; ++ int count = 0; ++ int error; ++ ++ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, ++ &parent, sizeof (parent)); ++ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, ++ &pflags, sizeof (pflags)); ++ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, ++ &mode, sizeof (mode)); ++ ++ if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) ++ return (error); ++ ++ *pobjp = parent; ++ *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); ++ ++ return (0); ++} ++ ++/* ++ * Given an object number, return some zpl level statistics ++ */ ++static int ++zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, ++ zfs_stat_t *sb) ++{ ++ sa_bulk_attr_t bulk[4]; ++ int count = 0; ++ ++ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, ++ &sb->zs_mode, sizeof (sb->zs_mode)); ++ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, ++ &sb->zs_gen, sizeof (sb->zs_gen)); ++ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, ++ &sb->zs_links, sizeof (sb->zs_links)); ++ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, ++ &sb->zs_ctime, sizeof (sb->zs_ctime)); ++ ++ return (sa_bulk_lookup(hdl, bulk, count)); ++} ++ ++static int ++zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, ++ sa_attr_type_t *sa_table, char *buf, int len) ++{ ++ sa_handle_t *sa_hdl; ++ sa_handle_t *prevhdl = NULL; ++ dmu_buf_t *prevdb = NULL; ++ dmu_buf_t *sa_db = NULL; ++ char *path = buf + len - 1; ++ int error; ++ ++ *path = '\0'; ++ sa_hdl = hdl; ++ ++ for (;;) { ++ uint64_t pobj; ++ char component[MAXNAMELEN + 2]; ++ size_t complen; ++ int is_xattrdir; ++ ++ if (prevdb) ++ zfs_release_sa_handle(prevhdl, prevdb, FTAG); ++ ++ if ((error = zfs_obj_to_pobj(sa_hdl, sa_table, &pobj, ++ &is_xattrdir)) != 0) ++ break; ++ ++ if (pobj == obj) { ++ if (path[0] != '/') ++ *--path = '/'; ++ break; ++ } ++ ++ component[0] = '/'; ++ if (is_xattrdir) { ++ (void) sprintf(component + 1, ""); ++ } else { ++ error = zap_value_search(osp, pobj, obj, ++ ZFS_DIRENT_OBJ(-1ULL), component + 1); ++ if (error != 0) ++ break; ++ } ++ ++ complen = strlen(component); ++ path -= complen; ++ ASSERT(path >= buf); ++ bcopy(component, path, complen); ++ obj = pobj; ++ ++ if (sa_hdl != hdl) { ++ prevhdl = sa_hdl; ++ prevdb = sa_db; ++ } ++ error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); ++ if (error != 0) { ++ sa_hdl = prevhdl; ++ sa_db = prevdb; ++ break; ++ } ++ } ++ ++ if (sa_hdl != NULL && sa_hdl != hdl) { ++ ASSERT(sa_db != NULL); ++ zfs_release_sa_handle(sa_hdl, sa_db, FTAG); ++ } ++ ++ if (error == 0) ++ (void) memmove(buf, path, buf + len - path); ++ ++ return (error); ++} ++ ++int ++zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) ++{ ++ sa_attr_type_t *sa_table; ++ sa_handle_t *hdl; ++ dmu_buf_t *db; ++ int error; ++ ++ error = zfs_sa_setup(osp, &sa_table); ++ if (error != 0) ++ return (error); ++ ++ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); ++ if (error != 0) ++ return (error); ++ ++ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); ++ ++ zfs_release_sa_handle(hdl, db, FTAG); ++ return (error); ++} ++ ++int ++zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, ++ char *buf, int len) ++{ ++ char *path = buf + len - 1; ++ sa_attr_type_t *sa_table; ++ sa_handle_t *hdl; ++ dmu_buf_t *db; ++ int error; ++ ++ *path = 
'\0'; ++ ++ error = zfs_sa_setup(osp, &sa_table); ++ if (error != 0) ++ return (error); ++ ++ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); ++ if (error != 0) ++ return (error); ++ ++ error = zfs_obj_to_stats_impl(hdl, sa_table, sb); ++ if (error != 0) { ++ zfs_release_sa_handle(hdl, db, FTAG); ++ return (error); ++ } ++ ++ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); ++ ++ zfs_release_sa_handle(hdl, db, FTAG); ++ return (error); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++EXPORT_SYMBOL(zfs_create_fs); ++EXPORT_SYMBOL(zfs_obj_to_path); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zil.c linux-3.2.33-go/fs/zfs/zfs/zil.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zil.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zil.c 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,2111 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ */ ++ ++/* Portions Copyright 2010 Robert Milkowski */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * The zfs intent log (ZIL) saves transaction records of system calls ++ * that change the file system in memory with enough information ++ * to be able to replay them. These are stored in memory until ++ * either the DMU transaction group (txg) commits them to the stable pool ++ * and they can be discarded, or they are flushed to the stable log ++ * (also in the pool) due to a fsync, O_DSYNC or other synchronous ++ * requirement. In the event of a panic or power fail then those log ++ * records (transactions) are replayed. ++ * ++ * There is one ZIL per file system. Its on-disk (pool) format consists ++ * of 3 parts: ++ * ++ * - ZIL header ++ * - ZIL blocks ++ * - ZIL records ++ * ++ * A log record holds a system call transaction. Log blocks can ++ * hold many log records and the blocks are chained together. ++ * Each ZIL block contains a block pointer (blkptr_t) to the next ++ * ZIL block in the chain. The ZIL header points to the first ++ * block in the chain. Note there is not a fixed place in the pool ++ * to hold blocks. They are dynamically allocated and freed as ++ * needed from the blocks available. Figure X shows the ZIL structure: ++ */ ++ ++/* ++ * See zil.h for more information about these fields. 
++ */ ++zil_stats_t zil_stats = { ++ { "zil_commit_count", KSTAT_DATA_UINT64 }, ++ { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, ++ { "zil_itx_count", KSTAT_DATA_UINT64 }, ++ { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, ++ { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, ++ { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, ++ { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, ++ { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, ++ { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, ++ { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, ++ { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, ++ { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, ++ { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, ++}; ++ ++static kstat_t *zil_ksp; ++ ++/* ++ * This global ZIL switch affects all pools ++ */ ++int zil_replay_disable = 0; /* disable intent logging replay */ ++ ++/* ++ * Tunable parameter for debugging or performance analysis. Setting ++ * zfs_nocacheflush will cause corruption on power loss if a volatile ++ * out-of-order write cache is enabled. ++ */ ++int zfs_nocacheflush = 0; ++ ++static kmem_cache_t *zil_lwb_cache; ++ ++static void zil_async_to_sync(zilog_t *zilog, uint64_t foid); ++ ++#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ ++ sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) ++ ++ ++/* ++ * ziltest is by and large an ugly hack, but very useful in ++ * checking replay without tedious work. ++ * When running ziltest we want to keep all itx's and so maintain ++ * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG ++ * We subtract TXG_CONCURRENT_STATES to allow for common code. ++ */ ++#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES) ++ ++static int ++zil_bp_compare(const void *x1, const void *x2) ++{ ++ const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; ++ const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; ++ ++ if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) ++ return (-1); ++ if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) ++ return (1); ++ ++ if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) ++ return (-1); ++ if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) ++ return (1); ++ ++ return (0); ++} ++ ++static void ++zil_bp_tree_init(zilog_t *zilog) ++{ ++ avl_create(&zilog->zl_bp_tree, zil_bp_compare, ++ sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); ++} ++ ++static void ++zil_bp_tree_fini(zilog_t *zilog) ++{ ++ avl_tree_t *t = &zilog->zl_bp_tree; ++ zil_bp_node_t *zn; ++ void *cookie = NULL; ++ ++ while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) ++ kmem_free(zn, sizeof (zil_bp_node_t)); ++ ++ avl_destroy(t); ++} ++ ++int ++zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) ++{ ++ avl_tree_t *t = &zilog->zl_bp_tree; ++ const dva_t *dva = BP_IDENTITY(bp); ++ zil_bp_node_t *zn; ++ avl_index_t where; ++ ++ if (avl_find(t, dva, &where) != NULL) ++ return (EEXIST); ++ ++ zn = kmem_alloc(sizeof (zil_bp_node_t), KM_PUSHPAGE); ++ zn->zn_dva = *dva; ++ avl_insert(t, zn, where); ++ ++ return (0); ++} ++ ++static zil_header_t * ++zil_header_in_syncing_context(zilog_t *zilog) ++{ ++ return ((zil_header_t *)zilog->zl_header); ++} ++ ++static void ++zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) ++{ ++ zio_cksum_t *zc = &bp->blk_cksum; ++ ++ zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); ++ zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); ++ zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); ++ zc->zc_word[ZIL_ZC_SEQ] = 1ULL; ++} ++ ++/* ++ * Read a log block and make sure it's valid. 
++ */ ++static int ++zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, ++ char **end) ++{ ++ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; ++ uint32_t aflags = ARC_WAIT; ++ arc_buf_t *abuf = NULL; ++ zbookmark_t zb; ++ int error; ++ ++ if (zilog->zl_header->zh_claim_txg == 0) ++ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; ++ ++ if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) ++ zio_flags |= ZIO_FLAG_SPECULATIVE; ++ ++ SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ++ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); ++ ++ error = dsl_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ++ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); ++ ++ if (error == 0) { ++ zio_cksum_t cksum = bp->blk_cksum; ++ ++ /* ++ * Validate the checksummed log block. ++ * ++ * Sequence numbers should be... sequential. The checksum ++ * verifier for the next block should be bp's checksum plus 1. ++ * ++ * Also check the log chain linkage and size used. ++ */ ++ cksum.zc_word[ZIL_ZC_SEQ]++; ++ ++ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { ++ zil_chain_t *zilc = abuf->b_data; ++ char *lr = (char *)(zilc + 1); ++ uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); ++ ++ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, ++ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { ++ error = ECKSUM; ++ } else { ++ bcopy(lr, dst, len); ++ *end = (char *)dst + len; ++ *nbp = zilc->zc_next_blk; ++ } ++ } else { ++ char *lr = abuf->b_data; ++ uint64_t size = BP_GET_LSIZE(bp); ++ zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; ++ ++ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, ++ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || ++ (zilc->zc_nused > (size - sizeof (*zilc)))) { ++ error = ECKSUM; ++ } else { ++ bcopy(lr, dst, zilc->zc_nused); ++ *end = (char *)dst + zilc->zc_nused; ++ *nbp = zilc->zc_next_blk; ++ } ++ } ++ ++ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); ++ } ++ ++ return (error); ++} ++ ++/* ++ * Read a TX_WRITE log data block. ++ */ ++static int ++zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) ++{ ++ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; ++ const blkptr_t *bp = &lr->lr_blkptr; ++ uint32_t aflags = ARC_WAIT; ++ arc_buf_t *abuf = NULL; ++ zbookmark_t zb; ++ int error; ++ ++ if (BP_IS_HOLE(bp)) { ++ if (wbuf != NULL) ++ bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); ++ return (0); ++ } ++ ++ if (zilog->zl_header->zh_claim_txg == 0) ++ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; ++ ++ SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ++ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); ++ ++ error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ++ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); ++ ++ if (error == 0) { ++ if (wbuf != NULL) ++ bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); ++ (void) arc_buf_remove_ref(abuf, &abuf); ++ } ++ ++ return (error); ++} ++ ++/* ++ * Parse the intent log, and call parse_func for each valid record within. ++ */ ++int ++zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, ++ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) ++{ ++ const zil_header_t *zh = zilog->zl_header; ++ boolean_t claimed = !!zh->zh_claim_txg; ++ uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; ++ uint64_t claim_lr_seq = claimed ? 
zh->zh_claim_lr_seq : UINT64_MAX; ++ uint64_t max_blk_seq = 0; ++ uint64_t max_lr_seq = 0; ++ uint64_t blk_count = 0; ++ uint64_t lr_count = 0; ++ blkptr_t blk, next_blk; ++ char *lrbuf, *lrp; ++ int error = 0; ++ ++ bzero(&next_blk, sizeof(blkptr_t)); ++ ++ /* ++ * Old logs didn't record the maximum zh_claim_lr_seq. ++ */ ++ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) ++ claim_lr_seq = UINT64_MAX; ++ ++ /* ++ * Starting at the block pointed to by zh_log we read the log chain. ++ * For each block in the chain we strongly check that block to ++ * ensure its validity. We stop when an invalid block is found. ++ * For each block pointer in the chain we call parse_blk_func(). ++ * For each record in each valid block we call parse_lr_func(). ++ * If the log has been claimed, stop if we encounter a sequence ++ * number greater than the highest claimed sequence number. ++ */ ++ lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE); ++ zil_bp_tree_init(zilog); ++ ++ for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { ++ uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; ++ int reclen; ++ char *end = NULL; ++ ++ if (blk_seq > claim_blk_seq) ++ break; ++ if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) ++ break; ++ ASSERT3U(max_blk_seq, <, blk_seq); ++ max_blk_seq = blk_seq; ++ blk_count++; ++ ++ if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) ++ break; ++ ++ error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); ++ if (error) ++ break; ++ ++ for (lrp = lrbuf; lrp < end; lrp += reclen) { ++ lr_t *lr = (lr_t *)lrp; ++ reclen = lr->lrc_reclen; ++ ASSERT3U(reclen, >=, sizeof (lr_t)); ++ if (lr->lrc_seq > claim_lr_seq) ++ goto done; ++ if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) ++ goto done; ++ ASSERT3U(max_lr_seq, <, lr->lrc_seq); ++ max_lr_seq = lr->lrc_seq; ++ lr_count++; ++ } ++ } ++done: ++ zilog->zl_parse_error = error; ++ zilog->zl_parse_blk_seq = max_blk_seq; ++ zilog->zl_parse_lr_seq = max_lr_seq; ++ zilog->zl_parse_blk_count = blk_count; ++ zilog->zl_parse_lr_count = lr_count; ++ ++ ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || ++ (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); ++ ++ zil_bp_tree_fini(zilog); ++ zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE); ++ ++ return (error); ++} ++ ++static int ++zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) ++{ ++ /* ++ * Claim log block if not already committed and not already claimed. ++ * If tx == NULL, just verify that the block is claimable. ++ */ ++ if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) ++ return (0); ++ ++ return (zio_wait(zio_claim(NULL, zilog->zl_spa, ++ tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, ++ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); ++} ++ ++static int ++zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) ++{ ++ lr_write_t *lr = (lr_write_t *)lrc; ++ int error; ++ ++ if (lrc->lrc_txtype != TX_WRITE) ++ return (0); ++ ++ /* ++ * If the block is not readable, don't claim it. This can happen ++ * in normal operation when a log block is written to disk before ++ * some of the dmu_sync() blocks it points to. In this case, the ++ * transaction cannot have been committed to anyone (we would have ++ * waited for all writes to be stable first), so it is semantically ++ * correct to declare this the end of the log. 
++ */ ++ if (lr->lr_blkptr.blk_birth >= first_txg && ++ (error = zil_read_log_data(zilog, lr, NULL)) != 0) ++ return (error); ++ return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); ++} ++ ++/* ARGSUSED */ ++static int ++zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) ++{ ++ zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp); ++ ++ return (0); ++} ++ ++static int ++zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) ++{ ++ lr_write_t *lr = (lr_write_t *)lrc; ++ blkptr_t *bp = &lr->lr_blkptr; ++ ++ /* ++ * If we previously claimed it, we need to free it. ++ */ ++ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && ++ bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0) ++ zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); ++ ++ return (0); ++} ++ ++static lwb_t * ++zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite) ++{ ++ lwb_t *lwb; ++ ++ lwb = kmem_cache_alloc(zil_lwb_cache, KM_PUSHPAGE); ++ lwb->lwb_zilog = zilog; ++ lwb->lwb_blk = *bp; ++ lwb->lwb_fastwrite = fastwrite; ++ lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); ++ lwb->lwb_max_txg = txg; ++ lwb->lwb_zio = NULL; ++ lwb->lwb_tx = NULL; ++ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { ++ lwb->lwb_nused = sizeof (zil_chain_t); ++ lwb->lwb_sz = BP_GET_LSIZE(bp); ++ } else { ++ lwb->lwb_nused = 0; ++ lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); ++ } ++ ++ mutex_enter(&zilog->zl_lock); ++ list_insert_tail(&zilog->zl_lwb_list, lwb); ++ mutex_exit(&zilog->zl_lock); ++ ++ return (lwb); ++} ++ ++/* ++ * Create an on-disk intent log. ++ */ ++static lwb_t * ++zil_create(zilog_t *zilog) ++{ ++ const zil_header_t *zh = zilog->zl_header; ++ lwb_t *lwb = NULL; ++ uint64_t txg = 0; ++ dmu_tx_t *tx = NULL; ++ blkptr_t blk; ++ int error = 0; ++ boolean_t fastwrite = FALSE; ++ ++ /* ++ * Wait for any previous destroy to complete. ++ */ ++ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); ++ ++ ASSERT(zh->zh_claim_txg == 0); ++ ASSERT(zh->zh_replay_seq == 0); ++ ++ blk = zh->zh_log; ++ ++ /* ++ * Allocate an initial log block if: ++ * - there isn't one already ++ * - the existing block is the wrong endianess ++ */ ++ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { ++ tx = dmu_tx_create(zilog->zl_os); ++ VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); ++ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); ++ txg = dmu_tx_get_txg(tx); ++ ++ if (!BP_IS_HOLE(&blk)) { ++ zio_free_zil(zilog->zl_spa, txg, &blk); ++ BP_ZERO(&blk); ++ } ++ ++ error = zio_alloc_zil(zilog->zl_spa, txg, &blk, ++ ZIL_MIN_BLKSZ, B_TRUE); ++ fastwrite = TRUE; ++ ++ if (error == 0) ++ zil_init_log_chain(zilog, &blk); ++ } ++ ++ /* ++ * Allocate a log write buffer (lwb) for the first log block. ++ */ ++ if (error == 0) ++ lwb = zil_alloc_lwb(zilog, &blk, txg, fastwrite); ++ ++ /* ++ * If we just allocated the first log block, commit our transaction ++ * and wait for zil_sync() to stuff the block poiner into zh_log. ++ * (zh is part of the MOS, so we cannot modify it in open context.) ++ */ ++ if (tx != NULL) { ++ dmu_tx_commit(tx); ++ txg_wait_synced(zilog->zl_dmu_pool, txg); ++ } ++ ++ ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); ++ ++ return (lwb); ++} ++ ++/* ++ * In one tx, free all log blocks and clear the log header. ++ * If keep_first is set, then we're replaying a log with no content. ++ * We want to keep the first block, however, so that the first ++ * synchronous transaction doesn't require a txg_wait_synced() ++ * in zil_create(). 
We don't need to txg_wait_synced() here either ++ * when keep_first is set, because both zil_create() and zil_destroy() ++ * will wait for any in-progress destroys to complete. ++ */ ++void ++zil_destroy(zilog_t *zilog, boolean_t keep_first) ++{ ++ const zil_header_t *zh = zilog->zl_header; ++ lwb_t *lwb; ++ dmu_tx_t *tx; ++ uint64_t txg; ++ ++ /* ++ * Wait for any previous destroy to complete. ++ */ ++ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); ++ ++ zilog->zl_old_header = *zh; /* debugging aid */ ++ ++ if (BP_IS_HOLE(&zh->zh_log)) ++ return; ++ ++ tx = dmu_tx_create(zilog->zl_os); ++ VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); ++ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); ++ txg = dmu_tx_get_txg(tx); ++ ++ mutex_enter(&zilog->zl_lock); ++ ++ ASSERT3U(zilog->zl_destroy_txg, <, txg); ++ zilog->zl_destroy_txg = txg; ++ zilog->zl_keep_first = keep_first; ++ ++ if (!list_is_empty(&zilog->zl_lwb_list)) { ++ ASSERT(zh->zh_claim_txg == 0); ++ VERIFY(!keep_first); ++ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { ++ ASSERT(lwb->lwb_zio == NULL); ++ if (lwb->lwb_fastwrite) ++ metaslab_fastwrite_unmark(zilog->zl_spa, ++ &lwb->lwb_blk); ++ list_remove(&zilog->zl_lwb_list, lwb); ++ if (lwb->lwb_buf != NULL) ++ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); ++ zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk); ++ kmem_cache_free(zil_lwb_cache, lwb); ++ } ++ } else if (!keep_first) { ++ (void) zil_parse(zilog, zil_free_log_block, ++ zil_free_log_record, tx, zh->zh_claim_txg); ++ } ++ mutex_exit(&zilog->zl_lock); ++ ++ dmu_tx_commit(tx); ++} ++ ++int ++zil_claim(const char *osname, void *txarg) ++{ ++ dmu_tx_t *tx = txarg; ++ uint64_t first_txg = dmu_tx_get_txg(tx); ++ zilog_t *zilog; ++ zil_header_t *zh; ++ objset_t *os; ++ int error; ++ ++ error = dmu_objset_hold(osname, FTAG, &os); ++ if (error) { ++ cmn_err(CE_WARN, "can't open objset for %s", osname); ++ return (0); ++ } ++ ++ zilog = dmu_objset_zil(os); ++ zh = zil_header_in_syncing_context(zilog); ++ ++ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) { ++ if (!BP_IS_HOLE(&zh->zh_log)) ++ zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log); ++ BP_ZERO(&zh->zh_log); ++ dsl_dataset_dirty(dmu_objset_ds(os), tx); ++ dmu_objset_rele(os, FTAG); ++ return (0); ++ } ++ ++ /* ++ * Claim all log blocks if we haven't already done so, and remember ++ * the highest claimed sequence number. This ensures that if we can ++ * read only part of the log now (e.g. due to a missing device), ++ * but we can read the entire log later, we will not try to replay ++ * or destroy beyond the last block we successfully claimed. ++ */ ++ ASSERT3U(zh->zh_claim_txg, <=, first_txg); ++ if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { ++ (void) zil_parse(zilog, zil_claim_log_block, ++ zil_claim_log_record, tx, first_txg); ++ zh->zh_claim_txg = first_txg; ++ zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; ++ zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; ++ if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) ++ zh->zh_flags |= ZIL_REPLAY_NEEDED; ++ zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; ++ dsl_dataset_dirty(dmu_objset_ds(os), tx); ++ } ++ ++ ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); ++ dmu_objset_rele(os, FTAG); ++ return (0); ++} ++ ++/* ++ * Check the log by walking the log chain. ++ * Checksum errors are ok as they indicate the end of the chain. ++ * Any other error (no device or read failure) returns an error. 
++ */ ++int ++zil_check_log_chain(const char *osname, void *tx) ++{ ++ zilog_t *zilog; ++ objset_t *os; ++ blkptr_t *bp; ++ int error; ++ ++ ASSERT(tx == NULL); ++ ++ error = dmu_objset_hold(osname, FTAG, &os); ++ if (error) { ++ cmn_err(CE_WARN, "can't open objset for %s", osname); ++ return (0); ++ } ++ ++ zilog = dmu_objset_zil(os); ++ bp = (blkptr_t *)&zilog->zl_header->zh_log; ++ ++ /* ++ * Check the first block and determine if it's on a log device ++ * which may have been removed or faulted prior to loading this ++ * pool. If so, there's no point in checking the rest of the log ++ * as its content should have already been synced to the pool. ++ */ ++ if (!BP_IS_HOLE(bp)) { ++ vdev_t *vd; ++ boolean_t valid = B_TRUE; ++ ++ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); ++ vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); ++ if (vd->vdev_islog && vdev_is_dead(vd)) ++ valid = vdev_log_state_valid(vd); ++ spa_config_exit(os->os_spa, SCL_STATE, FTAG); ++ ++ if (!valid) { ++ dmu_objset_rele(os, FTAG); ++ return (0); ++ } ++ } ++ ++ /* ++ * Because tx == NULL, zil_claim_log_block() will not actually claim ++ * any blocks, but just determine whether it is possible to do so. ++ * In addition to checking the log chain, zil_claim_log_block() ++ * will invoke zio_claim() with a done func of spa_claim_notify(), ++ * which will update spa_max_claim_txg. See spa_load() for details. ++ */ ++ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, ++ zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa)); ++ ++ dmu_objset_rele(os, FTAG); ++ ++ return ((error == ECKSUM || error == ENOENT) ? 0 : error); ++} ++ ++static int ++zil_vdev_compare(const void *x1, const void *x2) ++{ ++ const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; ++ const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; ++ ++ if (v1 < v2) ++ return (-1); ++ if (v1 > v2) ++ return (1); ++ ++ return (0); ++} ++ ++void ++zil_add_block(zilog_t *zilog, const blkptr_t *bp) ++{ ++ avl_tree_t *t = &zilog->zl_vdev_tree; ++ avl_index_t where; ++ zil_vdev_node_t *zv, zvsearch; ++ int ndvas = BP_GET_NDVAS(bp); ++ int i; ++ ++ if (zfs_nocacheflush) ++ return; ++ ++ ASSERT(zilog->zl_writer); ++ ++ /* ++ * Even though we're zl_writer, we still need a lock because the ++ * zl_get_data() callbacks may have dmu_sync() done callbacks ++ * that will run concurrently. ++ */ ++ mutex_enter(&zilog->zl_vdev_lock); ++ for (i = 0; i < ndvas; i++) { ++ zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); ++ if (avl_find(t, &zvsearch, &where) == NULL) { ++ zv = kmem_alloc(sizeof (*zv), KM_PUSHPAGE); ++ zv->zv_vdev = zvsearch.zv_vdev; ++ avl_insert(t, zv, where); ++ } ++ } ++ mutex_exit(&zilog->zl_vdev_lock); ++} ++ ++static void ++zil_flush_vdevs(zilog_t *zilog) ++{ ++ spa_t *spa = zilog->zl_spa; ++ avl_tree_t *t = &zilog->zl_vdev_tree; ++ void *cookie = NULL; ++ zil_vdev_node_t *zv; ++ zio_t *zio; ++ ++ ASSERT(zilog->zl_writer); ++ ++ /* ++ * We don't need zl_vdev_lock here because we're the zl_writer, ++ * and all zl_get_data() callbacks are done. ++ */ ++ if (avl_numnodes(t) == 0) ++ return; ++ ++ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ++ ++ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); ++ ++ while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { ++ vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); ++ if (vd != NULL) ++ zio_flush(zio, vd); ++ kmem_free(zv, sizeof (*zv)); ++ } ++ ++ /* ++ * Wait for all the flushes to complete. 
Not all devices actually ++ * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails. ++ */ ++ (void) zio_wait(zio); ++ ++ spa_config_exit(spa, SCL_STATE, FTAG); ++} ++ ++/* ++ * Function called when a log block write completes ++ */ ++static void ++zil_lwb_write_done(zio_t *zio) ++{ ++ lwb_t *lwb = zio->io_private; ++ zilog_t *zilog = lwb->lwb_zilog; ++ dmu_tx_t *tx = lwb->lwb_tx; ++ ++ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); ++ ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); ++ ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); ++ ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); ++ ASSERT(!BP_IS_GANG(zio->io_bp)); ++ ASSERT(!BP_IS_HOLE(zio->io_bp)); ++ ASSERT(zio->io_bp->blk_fill == 0); ++ ++ /* ++ * Ensure the lwb buffer pointer is cleared before releasing ++ * the txg. If we have had an allocation failure and ++ * the txg is waiting to sync then we want want zil_sync() ++ * to remove the lwb so that it's not picked up as the next new ++ * one in zil_commit_writer(). zil_sync() will only remove ++ * the lwb if lwb_buf is null. ++ */ ++ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); ++ mutex_enter(&zilog->zl_lock); ++ lwb->lwb_zio = NULL; ++ lwb->lwb_fastwrite = FALSE; ++ lwb->lwb_buf = NULL; ++ lwb->lwb_tx = NULL; ++ mutex_exit(&zilog->zl_lock); ++ ++ /* ++ * Now that we've written this log block, we have a stable pointer ++ * to the next block in the chain, so it's OK to let the txg in ++ * which we allocated the next block sync. ++ */ ++ dmu_tx_commit(tx); ++} ++ ++/* ++ * Initialize the io for a log block. ++ */ ++static void ++zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) ++{ ++ zbookmark_t zb; ++ ++ SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ++ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, ++ lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); ++ ++ if (zilog->zl_root_zio == NULL) { ++ zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL, ++ ZIO_FLAG_CANFAIL); ++ } ++ ++ /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ ++ mutex_enter(&zilog->zl_lock); ++ if (lwb->lwb_zio == NULL) { ++ if (!lwb->lwb_fastwrite) { ++ metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); ++ lwb->lwb_fastwrite = 1; ++ } ++ lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, ++ 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), ++ zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, ++ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ++ ZIO_FLAG_FASTWRITE, &zb); ++ } ++ mutex_exit(&zilog->zl_lock); ++} ++ ++/* ++ * Define a limited set of intent log block sizes. ++ * These must be a multiple of 4KB. Note only the amount used (again ++ * aligned to 4KB) actually gets written. However, we can't always just ++ * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted. ++ */ ++uint64_t zil_block_buckets[] = { ++ 4096, /* non TX_WRITE */ ++ 8192+4096, /* data base */ ++ 32*1024 + 4096, /* NFS writes */ ++ UINT64_MAX ++}; ++ ++/* ++ * Use the slog as long as the current commit size is less than the ++ * limit or the total list size is less than 2X the limit. Limit ++ * checking is disabled by setting zil_slog_limit to UINT64_MAX. ++ */ ++unsigned long zil_slog_limit = 1024 * 1024; ++#define USE_SLOG(zilog) (((zilog)->zl_cur_used < zil_slog_limit) || \ ++ ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))) ++ ++/* ++ * Start a log block write and advance to the next log block. ++ * Calls are serialized. 
++ */ ++static lwb_t * ++zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) ++{ ++ lwb_t *nlwb = NULL; ++ zil_chain_t *zilc; ++ spa_t *spa = zilog->zl_spa; ++ blkptr_t *bp; ++ dmu_tx_t *tx; ++ uint64_t txg; ++ uint64_t zil_blksz, wsz; ++ int i, error; ++ boolean_t use_slog; ++ ++ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { ++ zilc = (zil_chain_t *)lwb->lwb_buf; ++ bp = &zilc->zc_next_blk; ++ } else { ++ zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); ++ bp = &zilc->zc_next_blk; ++ } ++ ++ ASSERT(lwb->lwb_nused <= lwb->lwb_sz); ++ ++ /* ++ * Allocate the next block and save its address in this block ++ * before writing it in order to establish the log chain. ++ * Note that if the allocation of nlwb synced before we wrote ++ * the block that points at it (lwb), we'd leak it if we crashed. ++ * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). ++ * We dirty the dataset to ensure that zil_sync() will be called ++ * to clean up in the event of allocation failure or I/O failure. ++ */ ++ tx = dmu_tx_create(zilog->zl_os); ++ VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); ++ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); ++ txg = dmu_tx_get_txg(tx); ++ ++ lwb->lwb_tx = tx; ++ ++ /* ++ * Log blocks are pre-allocated. Here we select the size of the next ++ * block, based on size used in the last block. ++ * - first find the smallest bucket that will fit the block from a ++ * limited set of block sizes. This is because it's faster to write ++ * blocks allocated from the same metaslab as they are adjacent or ++ * close. ++ * - next find the maximum from the new suggested size and an array of ++ * previous sizes. This lessens a picket fence effect of wrongly ++ * guesssing the size if we have a stream of say 2k, 64k, 2k, 64k ++ * requests. ++ * ++ * Note we only write what is used, but we can't just allocate ++ * the maximum block size because we can exhaust the available ++ * pool log space. ++ */ ++ zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); ++ for (i = 0; zil_blksz > zil_block_buckets[i]; i++) ++ continue; ++ zil_blksz = zil_block_buckets[i]; ++ if (zil_blksz == UINT64_MAX) ++ zil_blksz = SPA_MAXBLOCKSIZE; ++ zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; ++ for (i = 0; i < ZIL_PREV_BLKS; i++) ++ zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); ++ zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); ++ ++ BP_ZERO(bp); ++ use_slog = USE_SLOG(zilog); ++ error = zio_alloc_zil(spa, txg, bp, zil_blksz, USE_SLOG(zilog)); ++ if (use_slog) ++ { ++ ZIL_STAT_BUMP(zil_itx_metaslab_slog_count); ++ ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused); ++ } ++ else ++ { ++ ZIL_STAT_BUMP(zil_itx_metaslab_normal_count); ++ ZIL_STAT_INCR(zil_itx_metaslab_normal_bytes, lwb->lwb_nused); ++ } ++ if (!error) { ++ ASSERT3U(bp->blk_birth, ==, txg); ++ bp->blk_cksum = lwb->lwb_blk.blk_cksum; ++ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; ++ ++ /* ++ * Allocate a new log write buffer (lwb). ++ */ ++ nlwb = zil_alloc_lwb(zilog, bp, txg, TRUE); ++ ++ /* Record the block for later vdev flushing */ ++ zil_add_block(zilog, &lwb->lwb_blk); ++ } ++ ++ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { ++ /* For Slim ZIL only write what is used. 
*/ ++ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); ++ ASSERT3U(wsz, <=, lwb->lwb_sz); ++ zio_shrink(lwb->lwb_zio, wsz); ++ ++ } else { ++ wsz = lwb->lwb_sz; ++ } ++ ++ zilc->zc_pad = 0; ++ zilc->zc_nused = lwb->lwb_nused; ++ zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; ++ ++ /* ++ * clear unused data for security ++ */ ++ bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); ++ ++ zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */ ++ ++ /* ++ * If there was an allocation failure then nlwb will be null which ++ * forces a txg_wait_synced(). ++ */ ++ return (nlwb); ++} ++ ++static lwb_t * ++zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) ++{ ++ lr_t *lrc = &itx->itx_lr; /* common log record */ ++ lr_write_t *lrw = (lr_write_t *)lrc; ++ char *lr_buf; ++ uint64_t txg = lrc->lrc_txg; ++ uint64_t reclen = lrc->lrc_reclen; ++ uint64_t dlen = 0; ++ ++ if (lwb == NULL) ++ return (NULL); ++ ++ ASSERT(lwb->lwb_buf != NULL); ++ ++ if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) ++ dlen = P2ROUNDUP_TYPED( ++ lrw->lr_length, sizeof (uint64_t), uint64_t); ++ ++ zilog->zl_cur_used += (reclen + dlen); ++ ++ zil_lwb_write_init(zilog, lwb); ++ ++ /* ++ * If this record won't fit in the current log block, start a new one. ++ */ ++ if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { ++ lwb = zil_lwb_write_start(zilog, lwb); ++ if (lwb == NULL) ++ return (NULL); ++ zil_lwb_write_init(zilog, lwb); ++ ASSERT(LWB_EMPTY(lwb)); ++ if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { ++ txg_wait_synced(zilog->zl_dmu_pool, txg); ++ return (lwb); ++ } ++ } ++ ++ lr_buf = lwb->lwb_buf + lwb->lwb_nused; ++ bcopy(lrc, lr_buf, reclen); ++ lrc = (lr_t *)lr_buf; ++ lrw = (lr_write_t *)lrc; ++ ++ ZIL_STAT_BUMP(zil_itx_count); ++ ++ /* ++ * If it's a write, fetch the data or get its blkptr as appropriate. ++ */ ++ if (lrc->lrc_txtype == TX_WRITE) { ++ if (txg > spa_freeze_txg(zilog->zl_spa)) ++ txg_wait_synced(zilog->zl_dmu_pool, txg); ++ if (itx->itx_wr_state == WR_COPIED) { ++ ZIL_STAT_BUMP(zil_itx_copied_count); ++ ZIL_STAT_INCR(zil_itx_copied_bytes, lrw->lr_length); ++ } else { ++ char *dbuf; ++ int error; ++ ++ if (dlen) { ++ ASSERT(itx->itx_wr_state == WR_NEED_COPY); ++ dbuf = lr_buf + reclen; ++ lrw->lr_common.lrc_reclen += dlen; ++ ZIL_STAT_BUMP(zil_itx_needcopy_count); ++ ZIL_STAT_INCR(zil_itx_needcopy_bytes, lrw->lr_length); ++ } else { ++ ASSERT(itx->itx_wr_state == WR_INDIRECT); ++ dbuf = NULL; ++ ZIL_STAT_BUMP(zil_itx_indirect_count); ++ ZIL_STAT_INCR(zil_itx_indirect_bytes, lrw->lr_length); ++ } ++ error = zilog->zl_get_data( ++ itx->itx_private, lrw, dbuf, lwb->lwb_zio); ++ if (error == EIO) { ++ txg_wait_synced(zilog->zl_dmu_pool, txg); ++ return (lwb); ++ } ++ if (error) { ++ ASSERT(error == ENOENT || error == EEXIST || ++ error == EALREADY); ++ return (lwb); ++ } ++ } ++ } ++ ++ /* ++ * We're actually making an entry, so update lrc_seq to be the ++ * log record sequence number. Note that this is generally not ++ * equal to the itx sequence number because not all transactions ++ * are synchronous, and sometimes spa_sync() gets there first. 
++ */ ++ lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ ++ lwb->lwb_nused += reclen + dlen; ++ lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); ++ ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); ++ ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0); ++ ++ return (lwb); ++} ++ ++itx_t * ++zil_itx_create(uint64_t txtype, size_t lrsize) ++{ ++ itx_t *itx; ++ ++ lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); ++ ++ itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, ++ KM_PUSHPAGE | KM_NODEBUG); ++ itx->itx_lr.lrc_txtype = txtype; ++ itx->itx_lr.lrc_reclen = lrsize; ++ itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */ ++ itx->itx_lr.lrc_seq = 0; /* defensive */ ++ itx->itx_sync = B_TRUE; /* default is synchronous */ ++ ++ return (itx); ++} ++ ++void ++zil_itx_destroy(itx_t *itx) ++{ ++ kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); ++} ++ ++/* ++ * Free up the sync and async itxs. The itxs_t has already been detached ++ * so no locks are needed. ++ */ ++static void ++zil_itxg_clean(itxs_t *itxs) ++{ ++ itx_t *itx; ++ list_t *list; ++ avl_tree_t *t; ++ void *cookie; ++ itx_async_node_t *ian; ++ ++ list = &itxs->i_sync_list; ++ while ((itx = list_head(list)) != NULL) { ++ list_remove(list, itx); ++ kmem_free(itx, offsetof(itx_t, itx_lr) + ++ itx->itx_lr.lrc_reclen); ++ } ++ ++ cookie = NULL; ++ t = &itxs->i_async_tree; ++ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { ++ list = &ian->ia_list; ++ while ((itx = list_head(list)) != NULL) { ++ list_remove(list, itx); ++ kmem_free(itx, offsetof(itx_t, itx_lr) + ++ itx->itx_lr.lrc_reclen); ++ } ++ list_destroy(list); ++ kmem_free(ian, sizeof (itx_async_node_t)); ++ } ++ avl_destroy(t); ++ ++ kmem_free(itxs, sizeof (itxs_t)); ++} ++ ++static int ++zil_aitx_compare(const void *x1, const void *x2) ++{ ++ const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; ++ const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; ++ ++ if (o1 < o2) ++ return (-1); ++ if (o1 > o2) ++ return (1); ++ ++ return (0); ++} ++ ++/* ++ * Remove all async itx with the given oid. ++ */ ++static void ++zil_remove_async(zilog_t *zilog, uint64_t oid) ++{ ++ uint64_t otxg, txg; ++ itx_async_node_t *ian; ++ avl_tree_t *t; ++ avl_index_t where; ++ list_t clean_list; ++ itx_t *itx; ++ ++ ASSERT(oid != 0); ++ list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); ++ ++ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ ++ otxg = ZILTEST_TXG; ++ else ++ otxg = spa_last_synced_txg(zilog->zl_spa) + 1; ++ ++ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { ++ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; ++ ++ mutex_enter(&itxg->itxg_lock); ++ if (itxg->itxg_txg != txg) { ++ mutex_exit(&itxg->itxg_lock); ++ continue; ++ } ++ ++ /* ++ * Locate the object node and append its list. ++ */ ++ t = &itxg->itxg_itxs->i_async_tree; ++ ian = avl_find(t, &oid, &where); ++ if (ian != NULL) ++ list_move_tail(&clean_list, &ian->ia_list); ++ mutex_exit(&itxg->itxg_lock); ++ } ++ while ((itx = list_head(&clean_list)) != NULL) { ++ list_remove(&clean_list, itx); ++ kmem_free(itx, offsetof(itx_t, itx_lr) + ++ itx->itx_lr.lrc_reclen); ++ } ++ list_destroy(&clean_list); ++} ++ ++void ++zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) ++{ ++ uint64_t txg; ++ itxg_t *itxg; ++ itxs_t *itxs, *clean = NULL; ++ ++ /* ++ * Object ids can be re-instantiated in the next txg so ++ * remove any async transactions to avoid future leaks. 
++ * This can happen if a fsync occurs on the re-instantiated ++ * object for a WR_INDIRECT or WR_NEED_COPY write, which gets ++ * the new file data and flushes a write record for the old object. ++ */ ++ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) ++ zil_remove_async(zilog, itx->itx_oid); ++ ++ /* ++ * Ensure the data of a renamed file is committed before the rename. ++ */ ++ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) ++ zil_async_to_sync(zilog, itx->itx_oid); ++ ++ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ++ txg = ZILTEST_TXG; ++ else ++ txg = dmu_tx_get_txg(tx); ++ ++ itxg = &zilog->zl_itxg[txg & TXG_MASK]; ++ mutex_enter(&itxg->itxg_lock); ++ itxs = itxg->itxg_itxs; ++ if (itxg->itxg_txg != txg) { ++ if (itxs != NULL) { ++ /* ++ * The zil_clean callback hasn't got around to cleaning ++ * this itxg. Save the itxs for release below. ++ * This should be rare. ++ */ ++ atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod); ++ itxg->itxg_sod = 0; ++ clean = itxg->itxg_itxs; ++ } ++ ASSERT(itxg->itxg_sod == 0); ++ itxg->itxg_txg = txg; ++ itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_PUSHPAGE); ++ ++ list_create(&itxs->i_sync_list, sizeof (itx_t), ++ offsetof(itx_t, itx_node)); ++ avl_create(&itxs->i_async_tree, zil_aitx_compare, ++ sizeof (itx_async_node_t), ++ offsetof(itx_async_node_t, ia_node)); ++ } ++ if (itx->itx_sync) { ++ list_insert_tail(&itxs->i_sync_list, itx); ++ atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod); ++ itxg->itxg_sod += itx->itx_sod; ++ } else { ++ avl_tree_t *t = &itxs->i_async_tree; ++ uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid; ++ itx_async_node_t *ian; ++ avl_index_t where; ++ ++ ian = avl_find(t, &foid, &where); ++ if (ian == NULL) { ++ ian = kmem_alloc(sizeof (itx_async_node_t), KM_PUSHPAGE); ++ list_create(&ian->ia_list, sizeof (itx_t), ++ offsetof(itx_t, itx_node)); ++ ian->ia_foid = foid; ++ avl_insert(t, ian, where); ++ } ++ list_insert_tail(&ian->ia_list, itx); ++ } ++ ++ itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); ++ mutex_exit(&itxg->itxg_lock); ++ ++ /* Release the old itxs now we've dropped the lock */ ++ if (clean != NULL) ++ zil_itxg_clean(clean); ++} ++ ++/* ++ * If there are any in-memory intent log transactions which have now been ++ * synced then start up a taskq to free them. ++ */ ++void ++zil_clean(zilog_t *zilog, uint64_t synced_txg) ++{ ++ itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; ++ itxs_t *clean_me; ++ ++ mutex_enter(&itxg->itxg_lock); ++ if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { ++ mutex_exit(&itxg->itxg_lock); ++ return; ++ } ++ ASSERT3U(itxg->itxg_txg, <=, synced_txg); ++ ASSERT(itxg->itxg_txg != 0); ++ ASSERT(zilog->zl_clean_taskq != NULL); ++ atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod); ++ itxg->itxg_sod = 0; ++ clean_me = itxg->itxg_itxs; ++ itxg->itxg_itxs = NULL; ++ itxg->itxg_txg = 0; ++ mutex_exit(&itxg->itxg_lock); ++ /* ++ * Preferably start a task queue to free up the old itxs but ++ * if taskq_dispatch can't allocate resources to do that then ++ * free it in-line. This should be rare. Note, using TQ_SLEEP ++ * created a bad performance problem. ++ */ ++ if (taskq_dispatch(zilog->zl_clean_taskq, ++ (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0) ++ zil_itxg_clean(clean_me); ++} ++ ++/* ++ * Get the list of itxs to commit into zl_itx_commit_list. 
++ */ ++static void ++zil_get_commit_list(zilog_t *zilog) ++{ ++ uint64_t otxg, txg; ++ list_t *commit_list = &zilog->zl_itx_commit_list; ++ uint64_t push_sod = 0; ++ ++ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ ++ otxg = ZILTEST_TXG; ++ else ++ otxg = spa_last_synced_txg(zilog->zl_spa) + 1; ++ ++ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { ++ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; ++ ++ mutex_enter(&itxg->itxg_lock); ++ if (itxg->itxg_txg != txg) { ++ mutex_exit(&itxg->itxg_lock); ++ continue; ++ } ++ ++ list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); ++ push_sod += itxg->itxg_sod; ++ itxg->itxg_sod = 0; ++ ++ mutex_exit(&itxg->itxg_lock); ++ } ++ atomic_add_64(&zilog->zl_itx_list_sz, -push_sod); ++} ++ ++/* ++ * Move the async itxs for a specified object to commit into sync lists. ++ */ ++static void ++zil_async_to_sync(zilog_t *zilog, uint64_t foid) ++{ ++ uint64_t otxg, txg; ++ itx_async_node_t *ian; ++ avl_tree_t *t; ++ avl_index_t where; ++ ++ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ ++ otxg = ZILTEST_TXG; ++ else ++ otxg = spa_last_synced_txg(zilog->zl_spa) + 1; ++ ++ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { ++ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; ++ ++ mutex_enter(&itxg->itxg_lock); ++ if (itxg->itxg_txg != txg) { ++ mutex_exit(&itxg->itxg_lock); ++ continue; ++ } ++ ++ /* ++ * If a foid is specified then find that node and append its ++ * list. Otherwise walk the tree appending all the lists ++ * to the sync list. We add to the end rather than the ++ * beginning to ensure the create has happened. ++ */ ++ t = &itxg->itxg_itxs->i_async_tree; ++ if (foid != 0) { ++ ian = avl_find(t, &foid, &where); ++ if (ian != NULL) { ++ list_move_tail(&itxg->itxg_itxs->i_sync_list, ++ &ian->ia_list); ++ } ++ } else { ++ void *cookie = NULL; ++ ++ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { ++ list_move_tail(&itxg->itxg_itxs->i_sync_list, ++ &ian->ia_list); ++ list_destroy(&ian->ia_list); ++ kmem_free(ian, sizeof (itx_async_node_t)); ++ } ++ } ++ mutex_exit(&itxg->itxg_lock); ++ } ++} ++ ++static void ++zil_commit_writer(zilog_t *zilog) ++{ ++ uint64_t txg; ++ itx_t *itx; ++ lwb_t *lwb; ++ spa_t *spa = zilog->zl_spa; ++ int error = 0; ++ ++ ASSERT(zilog->zl_root_zio == NULL); ++ ++ mutex_exit(&zilog->zl_lock); ++ ++ zil_get_commit_list(zilog); ++ ++ /* ++ * Return if there's nothing to commit before we dirty the fs by ++ * calling zil_create(). ++ */ ++ if (list_head(&zilog->zl_itx_commit_list) == NULL) { ++ mutex_enter(&zilog->zl_lock); ++ return; ++ } ++ ++ if (zilog->zl_suspend) { ++ lwb = NULL; ++ } else { ++ lwb = list_tail(&zilog->zl_lwb_list); ++ if (lwb == NULL) ++ lwb = zil_create(zilog); ++ } ++ ++ DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); ++ while ((itx = list_head(&zilog->zl_itx_commit_list))) { ++ txg = itx->itx_lr.lrc_txg; ++ ASSERT(txg); ++ ++ if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa)) ++ lwb = zil_lwb_commit(zilog, itx, lwb); ++ list_remove(&zilog->zl_itx_commit_list, itx); ++ kmem_free(itx, offsetof(itx_t, itx_lr) ++ + itx->itx_lr.lrc_reclen); ++ } ++ DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); ++ ++ /* write the last block out */ ++ if (lwb != NULL && lwb->lwb_zio != NULL) ++ lwb = zil_lwb_write_start(zilog, lwb); ++ ++ zilog->zl_cur_used = 0; ++ ++ /* ++ * Wait if necessary for the log blocks to be on stable storage. 
++ */ ++ if (zilog->zl_root_zio) { ++ error = zio_wait(zilog->zl_root_zio); ++ zilog->zl_root_zio = NULL; ++ zil_flush_vdevs(zilog); ++ } ++ ++ if (error || lwb == NULL) ++ txg_wait_synced(zilog->zl_dmu_pool, 0); ++ ++ mutex_enter(&zilog->zl_lock); ++ ++ /* ++ * Remember the highest committed log sequence number for ztest. ++ * We only update this value when all the log writes succeeded, ++ * because ztest wants to ASSERT that it got the whole log chain. ++ */ ++ if (error == 0 && lwb != NULL) ++ zilog->zl_commit_lr_seq = zilog->zl_lr_seq; ++} ++ ++/* ++ * Commit zfs transactions to stable storage. ++ * If foid is 0 push out all transactions, otherwise push only those ++ * for that object or might reference that object. ++ * ++ * itxs are committed in batches. In a heavily stressed zil there will be ++ * a commit writer thread who is writing out a bunch of itxs to the log ++ * for a set of committing threads (cthreads) in the same batch as the writer. ++ * Those cthreads are all waiting on the same cv for that batch. ++ * ++ * There will also be a different and growing batch of threads that are ++ * waiting to commit (qthreads). When the committing batch completes ++ * a transition occurs such that the cthreads exit and the qthreads become ++ * cthreads. One of the new cthreads becomes the writer thread for the ++ * batch. Any new threads arriving become new qthreads. ++ * ++ * Only 2 condition variables are needed and there's no transition ++ * between the two cvs needed. They just flip-flop between qthreads ++ * and cthreads. ++ * ++ * Using this scheme we can efficiently wakeup up only those threads ++ * that have been committed. ++ */ ++void ++zil_commit(zilog_t *zilog, uint64_t foid) ++{ ++ uint64_t mybatch; ++ ++ if (zilog->zl_sync == ZFS_SYNC_DISABLED) ++ return; ++ ++ ZIL_STAT_BUMP(zil_commit_count); ++ ++ /* move the async itxs for the foid to the sync queues */ ++ zil_async_to_sync(zilog, foid); ++ ++ mutex_enter(&zilog->zl_lock); ++ mybatch = zilog->zl_next_batch; ++ while (zilog->zl_writer) { ++ cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock); ++ if (mybatch <= zilog->zl_com_batch) { ++ mutex_exit(&zilog->zl_lock); ++ return; ++ } ++ } ++ ++ zilog->zl_next_batch++; ++ zilog->zl_writer = B_TRUE; ++ ZIL_STAT_BUMP(zil_commit_writer_count); ++ zil_commit_writer(zilog); ++ zilog->zl_com_batch = mybatch; ++ zilog->zl_writer = B_FALSE; ++ ++ /* wake up one thread to become the next writer */ ++ cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]); ++ ++ /* wake up all threads waiting for this batch to be committed */ ++ cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]); ++ ++ mutex_exit(&zilog->zl_lock); ++} ++ ++/* ++ * Called in syncing context to free committed log blocks and update log header. ++ */ ++void ++zil_sync(zilog_t *zilog, dmu_tx_t *tx) ++{ ++ zil_header_t *zh = zil_header_in_syncing_context(zilog); ++ uint64_t txg = dmu_tx_get_txg(tx); ++ spa_t *spa = zilog->zl_spa; ++ uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; ++ lwb_t *lwb; ++ ++ /* ++ * We don't zero out zl_destroy_txg, so make sure we don't try ++ * to destroy it twice. 
++ */ ++ if (spa_sync_pass(spa) != 1) ++ return; ++ ++ mutex_enter(&zilog->zl_lock); ++ ++ ASSERT(zilog->zl_stop_sync == 0); ++ ++ if (*replayed_seq != 0) { ++ ASSERT(zh->zh_replay_seq < *replayed_seq); ++ zh->zh_replay_seq = *replayed_seq; ++ *replayed_seq = 0; ++ } ++ ++ if (zilog->zl_destroy_txg == txg) { ++ blkptr_t blk = zh->zh_log; ++ ++ ASSERT(list_head(&zilog->zl_lwb_list) == NULL); ++ ++ bzero(zh, sizeof (zil_header_t)); ++ bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); ++ ++ if (zilog->zl_keep_first) { ++ /* ++ * If this block was part of log chain that couldn't ++ * be claimed because a device was missing during ++ * zil_claim(), but that device later returns, ++ * then this block could erroneously appear valid. ++ * To guard against this, assign a new GUID to the new ++ * log chain so it doesn't matter what blk points to. ++ */ ++ zil_init_log_chain(zilog, &blk); ++ zh->zh_log = blk; ++ } ++ } ++ ++ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { ++ zh->zh_log = lwb->lwb_blk; ++ if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) ++ break; ++ ++ ASSERT(lwb->lwb_zio == NULL); ++ ++ list_remove(&zilog->zl_lwb_list, lwb); ++ zio_free_zil(spa, txg, &lwb->lwb_blk); ++ kmem_cache_free(zil_lwb_cache, lwb); ++ ++ /* ++ * If we don't have anything left in the lwb list then ++ * we've had an allocation failure and we need to zero ++ * out the zil_header blkptr so that we don't end ++ * up freeing the same block twice. ++ */ ++ if (list_head(&zilog->zl_lwb_list) == NULL) ++ BP_ZERO(&zh->zh_log); ++ } ++ ++ /* ++ * Remove fastwrite on any blocks that have been pre-allocated for ++ * the next commit. This prevents fastwrite counter pollution by ++ * unused, long-lived LWBs. ++ */ ++ for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) { ++ if (lwb->lwb_fastwrite && !lwb->lwb_zio) { ++ metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); ++ lwb->lwb_fastwrite = 0; ++ } ++ } ++ ++ mutex_exit(&zilog->zl_lock); ++} ++ ++void ++zil_init(void) ++{ ++ zil_lwb_cache = kmem_cache_create("zil_lwb_cache", ++ sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0); ++ ++ zil_ksp = kstat_create("zfs", 0, "zil", "misc", ++ KSTAT_TYPE_NAMED, sizeof(zil_stats) / sizeof(kstat_named_t), ++ KSTAT_FLAG_VIRTUAL); ++ ++ if (zil_ksp != NULL) { ++ zil_ksp->ks_data = &zil_stats; ++ kstat_install(zil_ksp); ++ } ++} ++ ++void ++zil_fini(void) ++{ ++ kmem_cache_destroy(zil_lwb_cache); ++ ++ if (zil_ksp != NULL) { ++ kstat_delete(zil_ksp); ++ zil_ksp = NULL; ++ } ++} ++ ++void ++zil_set_sync(zilog_t *zilog, uint64_t sync) ++{ ++ zilog->zl_sync = sync; ++} ++ ++void ++zil_set_logbias(zilog_t *zilog, uint64_t logbias) ++{ ++ zilog->zl_logbias = logbias; ++} ++ ++zilog_t * ++zil_alloc(objset_t *os, zil_header_t *zh_phys) ++{ ++ zilog_t *zilog; ++ int i; ++ ++ zilog = kmem_zalloc(sizeof (zilog_t), KM_PUSHPAGE); ++ ++ zilog->zl_header = zh_phys; ++ zilog->zl_os = os; ++ zilog->zl_spa = dmu_objset_spa(os); ++ zilog->zl_dmu_pool = dmu_objset_pool(os); ++ zilog->zl_destroy_txg = TXG_INITIAL - 1; ++ zilog->zl_logbias = dmu_objset_logbias(os); ++ zilog->zl_sync = dmu_objset_syncprop(os); ++ zilog->zl_next_batch = 1; ++ ++ mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ for (i = 0; i < TXG_SIZE; i++) { ++ mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, ++ MUTEX_DEFAULT, NULL); ++ } ++ ++ list_create(&zilog->zl_lwb_list, sizeof (lwb_t), ++ offsetof(lwb_t, lwb_node)); ++ ++ list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), ++ offsetof(itx_t, itx_node)); ++ ++ 
mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ avl_create(&zilog->zl_vdev_tree, zil_vdev_compare, ++ sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); ++ ++ cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); ++ cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); ++ cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL); ++ cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL); ++ ++ return (zilog); ++} ++ ++void ++zil_free(zilog_t *zilog) ++{ ++ int i; ++ ++ zilog->zl_stop_sync = 1; ++ ++ ASSERT(list_is_empty(&zilog->zl_lwb_list)); ++ list_destroy(&zilog->zl_lwb_list); ++ ++ avl_destroy(&zilog->zl_vdev_tree); ++ mutex_destroy(&zilog->zl_vdev_lock); ++ ++ ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); ++ list_destroy(&zilog->zl_itx_commit_list); ++ ++ for (i = 0; i < TXG_SIZE; i++) { ++ /* ++ * It's possible for an itx to be generated that doesn't dirty ++ * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() ++ * callback to remove the entry. We remove those here. ++ * ++ * Also free up the ziltest itxs. ++ */ ++ if (zilog->zl_itxg[i].itxg_itxs) ++ zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); ++ mutex_destroy(&zilog->zl_itxg[i].itxg_lock); ++ } ++ ++ mutex_destroy(&zilog->zl_lock); ++ ++ cv_destroy(&zilog->zl_cv_writer); ++ cv_destroy(&zilog->zl_cv_suspend); ++ cv_destroy(&zilog->zl_cv_batch[0]); ++ cv_destroy(&zilog->zl_cv_batch[1]); ++ ++ kmem_free(zilog, sizeof (zilog_t)); ++} ++ ++/* ++ * Open an intent log. ++ */ ++zilog_t * ++zil_open(objset_t *os, zil_get_data_t *get_data) ++{ ++ zilog_t *zilog = dmu_objset_zil(os); ++ ++ ASSERT(zilog->zl_clean_taskq == NULL); ++ ASSERT(zilog->zl_get_data == NULL); ++ ASSERT(list_is_empty(&zilog->zl_lwb_list)); ++ ++ zilog->zl_get_data = get_data; ++ zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, ++ 2, 2, TASKQ_PREPOPULATE); ++ ++ return (zilog); ++} ++ ++/* ++ * Close an intent log. ++ */ ++void ++zil_close(zilog_t *zilog) ++{ ++ lwb_t *lwb; ++ uint64_t txg = 0; ++ ++ zil_commit(zilog, 0); /* commit all itx */ ++ ++ /* ++ * The lwb_max_txg for the stubby lwb will reflect the last activity ++ * for the zil. After a txg_wait_synced() on the txg we know all the ++ * callbacks have occurred that may clean the zil. Only then can we ++ * destroy the zl_clean_taskq. ++ */ ++ mutex_enter(&zilog->zl_lock); ++ lwb = list_tail(&zilog->zl_lwb_list); ++ if (lwb != NULL) ++ txg = lwb->lwb_max_txg; ++ mutex_exit(&zilog->zl_lock); ++ if (txg) ++ txg_wait_synced(zilog->zl_dmu_pool, txg); ++ ++ taskq_destroy(zilog->zl_clean_taskq); ++ zilog->zl_clean_taskq = NULL; ++ zilog->zl_get_data = NULL; ++ ++ /* ++ * We should have only one LWB left on the list; remove it now. ++ */ ++ mutex_enter(&zilog->zl_lock); ++ lwb = list_head(&zilog->zl_lwb_list); ++ if (lwb != NULL) { ++ ASSERT(lwb == list_tail(&zilog->zl_lwb_list)); ++ ASSERT(lwb->lwb_zio == NULL); ++ if (lwb->lwb_fastwrite) ++ metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); ++ list_remove(&zilog->zl_lwb_list, lwb); ++ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); ++ kmem_cache_free(zil_lwb_cache, lwb); ++ } ++ mutex_exit(&zilog->zl_lock); ++} ++ ++/* ++ * Suspend an intent log. While in suspended mode, we still honor ++ * synchronous semantics, but we rely on txg_wait_synced() to do it. ++ * We suspend the log briefly when taking a snapshot so that the snapshot ++ * contains all the data it's supposed to, and has an empty intent log. 
++ */ ++int ++zil_suspend(zilog_t *zilog) ++{ ++ const zil_header_t *zh = zilog->zl_header; ++ ++ mutex_enter(&zilog->zl_lock); ++ if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ ++ mutex_exit(&zilog->zl_lock); ++ return (EBUSY); ++ } ++ if (zilog->zl_suspend++ != 0) { ++ /* ++ * Someone else already began a suspend. ++ * Just wait for them to finish. ++ */ ++ while (zilog->zl_suspending) ++ cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); ++ mutex_exit(&zilog->zl_lock); ++ return (0); ++ } ++ zilog->zl_suspending = B_TRUE; ++ mutex_exit(&zilog->zl_lock); ++ ++ zil_commit(zilog, 0); ++ ++ zil_destroy(zilog, B_FALSE); ++ ++ mutex_enter(&zilog->zl_lock); ++ zilog->zl_suspending = B_FALSE; ++ cv_broadcast(&zilog->zl_cv_suspend); ++ mutex_exit(&zilog->zl_lock); ++ ++ return (0); ++} ++ ++void ++zil_resume(zilog_t *zilog) ++{ ++ mutex_enter(&zilog->zl_lock); ++ ASSERT(zilog->zl_suspend != 0); ++ zilog->zl_suspend--; ++ mutex_exit(&zilog->zl_lock); ++} ++ ++typedef struct zil_replay_arg { ++ zil_replay_func_t **zr_replay; ++ void *zr_arg; ++ boolean_t zr_byteswap; ++ char *zr_lr; ++} zil_replay_arg_t; ++ ++static int ++zil_replay_error(zilog_t *zilog, lr_t *lr, int error) ++{ ++ char name[MAXNAMELEN]; ++ ++ zilog->zl_replaying_seq--; /* didn't actually replay this one */ ++ ++ dmu_objset_name(zilog->zl_os, name); ++ ++ cmn_err(CE_WARN, "ZFS replay transaction error %d, " ++ "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, ++ (u_longlong_t)lr->lrc_seq, ++ (u_longlong_t)(lr->lrc_txtype & ~TX_CI), ++ (lr->lrc_txtype & TX_CI) ? "CI" : ""); ++ ++ return (error); ++} ++ ++static int ++zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) ++{ ++ zil_replay_arg_t *zr = zra; ++ const zil_header_t *zh = zilog->zl_header; ++ uint64_t reclen = lr->lrc_reclen; ++ uint64_t txtype = lr->lrc_txtype; ++ int error = 0; ++ ++ zilog->zl_replaying_seq = lr->lrc_seq; ++ ++ if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ ++ return (0); ++ ++ if (lr->lrc_txg < claim_txg) /* already committed */ ++ return (0); ++ ++ /* Strip case-insensitive bit, still present in log record */ ++ txtype &= ~TX_CI; ++ ++ if (txtype == 0 || txtype >= TX_MAX_TYPE) ++ return (zil_replay_error(zilog, lr, EINVAL)); ++ ++ /* ++ * If this record type can be logged out of order, the object ++ * (lr_foid) may no longer exist. That's legitimate, not an error. ++ */ ++ if (TX_OOO(txtype)) { ++ error = dmu_object_info(zilog->zl_os, ++ ((lr_ooo_t *)lr)->lr_foid, NULL); ++ if (error == ENOENT || error == EEXIST) ++ return (0); ++ } ++ ++ /* ++ * Make a copy of the data so we can revise and extend it. ++ */ ++ bcopy(lr, zr->zr_lr, reclen); ++ ++ /* ++ * If this is a TX_WRITE with a blkptr, suck in the data. ++ */ ++ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { ++ error = zil_read_log_data(zilog, (lr_write_t *)lr, ++ zr->zr_lr + reclen); ++ if (error) ++ return (zil_replay_error(zilog, lr, error)); ++ } ++ ++ /* ++ * The log block containing this lr may have been byteswapped ++ * so that we can easily examine common fields like lrc_txtype. ++ * However, the log is a mix of different record types, and only the ++ * replay vectors know how to byteswap their records. Therefore, if ++ * the lr was byteswapped, undo it before invoking the replay vector. ++ */ ++ if (zr->zr_byteswap) ++ byteswap_uint64_array(zr->zr_lr, reclen); ++ ++ /* ++ * We must now do two things atomically: replay this log record, ++ * and update the log header sequence number to reflect the fact that ++ * we did so. 
At the end of each replay function the sequence number ++ * is updated if we are in replay mode. ++ */ ++ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); ++ if (error) { ++ /* ++ * The DMU's dnode layer doesn't see removes until the txg ++ * commits, so a subsequent claim can spuriously fail with ++ * EEXIST. So if we receive any error we try syncing out ++ * any removes then retry the transaction. Note that we ++ * specify B_FALSE for byteswap now, so we don't do it twice. ++ */ ++ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); ++ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); ++ if (error) ++ return (zil_replay_error(zilog, lr, error)); ++ } ++ return (0); ++} ++ ++/* ARGSUSED */ ++static int ++zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) ++{ ++ zilog->zl_replay_blks++; ++ ++ return (0); ++} ++ ++/* ++ * If this dataset has a non-empty intent log, replay it and destroy it. ++ */ ++void ++zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) ++{ ++ zilog_t *zilog = dmu_objset_zil(os); ++ const zil_header_t *zh = zilog->zl_header; ++ zil_replay_arg_t zr; ++ ++ if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { ++ zil_destroy(zilog, B_TRUE); ++ return; ++ } ++ ++ zr.zr_replay = replay_func; ++ zr.zr_arg = arg; ++ zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); ++ zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_PUSHPAGE); ++ ++ /* ++ * Wait for in-progress removes to sync before starting replay. ++ */ ++ txg_wait_synced(zilog->zl_dmu_pool, 0); ++ ++ zilog->zl_replay = B_TRUE; ++ zilog->zl_replay_time = ddi_get_lbolt(); ++ ASSERT(zilog->zl_replay_blks == 0); ++ (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, ++ zh->zh_claim_txg); ++ vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); ++ ++ zil_destroy(zilog, B_FALSE); ++ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); ++ zilog->zl_replay = B_FALSE; ++} ++ ++boolean_t ++zil_replaying(zilog_t *zilog, dmu_tx_t *tx) ++{ ++ if (zilog->zl_sync == ZFS_SYNC_DISABLED) ++ return (B_TRUE); ++ ++ if (zilog->zl_replay) { ++ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); ++ zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = ++ zilog->zl_replaying_seq; ++ return (B_TRUE); ++ } ++ ++ return (B_FALSE); ++} ++ ++/* ARGSUSED */ ++int ++zil_vdev_offline(const char *osname, void *arg) ++{ ++ objset_t *os; ++ zilog_t *zilog; ++ int error; ++ ++ error = dmu_objset_hold(osname, FTAG, &os); ++ if (error) ++ return (error); ++ ++ zilog = dmu_objset_zil(os); ++ if (zil_suspend(zilog) != 0) ++ error = EEXIST; ++ else ++ zil_resume(zilog); ++ dmu_objset_rele(os, FTAG); ++ return (error); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++module_param(zil_replay_disable, int, 0644); ++MODULE_PARM_DESC(zil_replay_disable, "Disable intent logging replay"); ++ ++module_param(zfs_nocacheflush, int, 0644); ++MODULE_PARM_DESC(zfs_nocacheflush, "Disable cache flushes"); ++ ++module_param(zil_slog_limit, ulong, 0644); ++MODULE_PARM_DESC(zil_slog_limit, "Max commit bytes to separate log device"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zio.c linux-3.2.33-go/fs/zfs/zfs/zio.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zio.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zio.c 2012-11-16 23:25:34.348039346 +0100 +@@ -0,0 +1,3166 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). 
++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2012 by Delphix. All rights reserved. ++ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * ========================================================================== ++ * I/O priority table ++ * ========================================================================== ++ */ ++uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { ++ 0, /* ZIO_PRIORITY_NOW */ ++ 0, /* ZIO_PRIORITY_SYNC_READ */ ++ 0, /* ZIO_PRIORITY_SYNC_WRITE */ ++ 0, /* ZIO_PRIORITY_LOG_WRITE */ ++ 1, /* ZIO_PRIORITY_CACHE_FILL */ ++ 1, /* ZIO_PRIORITY_AGG */ ++ 4, /* ZIO_PRIORITY_FREE */ ++ 4, /* ZIO_PRIORITY_ASYNC_WRITE */ ++ 6, /* ZIO_PRIORITY_ASYNC_READ */ ++ 10, /* ZIO_PRIORITY_RESILVER */ ++ 20, /* ZIO_PRIORITY_SCRUB */ ++ 2, /* ZIO_PRIORITY_DDT_PREFETCH */ ++}; ++ ++/* ++ * ========================================================================== ++ * I/O type descriptions ++ * ========================================================================== ++ */ ++char *zio_type_name[ZIO_TYPES] = { ++ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl" ++}; ++ ++/* ++ * ========================================================================== ++ * I/O kmem caches ++ * ========================================================================== ++ */ ++kmem_cache_t *zio_cache; ++kmem_cache_t *zio_link_cache; ++kmem_cache_t *zio_vdev_cache; ++kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; ++kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; ++int zio_bulk_flags = 0; ++int zio_delay_max = ZIO_DELAY_MAX; ++ ++#ifdef _KERNEL ++extern vmem_t *zio_alloc_arena; ++#endif ++extern int zfs_mg_alloc_failures; ++ ++/* ++ * An allocating zio is one that either currently has the DVA allocate ++ * stage set or will have it later in its lifetime. 
++ */ ++#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) ++ ++int zio_requeue_io_start_cut_in_line = 1; ++ ++#ifdef ZFS_DEBUG ++int zio_buf_debug_limit = 16384; ++#else ++int zio_buf_debug_limit = 0; ++#endif ++ ++static inline void __zio_execute(zio_t *zio); ++ ++static int ++zio_cons(void *arg, void *unused, int kmflag) ++{ ++ zio_t *zio = arg; ++ ++ bzero(zio, sizeof (zio_t)); ++ ++ mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); ++ cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); ++ ++ list_create(&zio->io_parent_list, sizeof (zio_link_t), ++ offsetof(zio_link_t, zl_parent_node)); ++ list_create(&zio->io_child_list, sizeof (zio_link_t), ++ offsetof(zio_link_t, zl_child_node)); ++ ++ return (0); ++} ++ ++static void ++zio_dest(void *arg, void *unused) ++{ ++ zio_t *zio = arg; ++ ++ mutex_destroy(&zio->io_lock); ++ cv_destroy(&zio->io_cv); ++ list_destroy(&zio->io_parent_list); ++ list_destroy(&zio->io_child_list); ++} ++ ++void ++zio_init(void) ++{ ++ size_t c; ++ vmem_t *data_alloc_arena = NULL; ++ ++#ifdef _KERNEL ++ data_alloc_arena = zio_alloc_arena; ++#endif ++ zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, ++ zio_cons, zio_dest, NULL, NULL, NULL, KMC_KMEM); ++ zio_link_cache = kmem_cache_create("zio_link_cache", ++ sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM); ++ zio_vdev_cache = kmem_cache_create("zio_vdev_cache", sizeof(vdev_io_t), ++ PAGESIZE, NULL, NULL, NULL, NULL, NULL, KMC_VMEM); ++ ++ /* ++ * For small buffers, we want a cache for each multiple of ++ * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache ++ * for each quarter-power of 2. For large buffers, we want ++ * a cache for each multiple of PAGESIZE. ++ */ ++ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { ++ size_t size = (c + 1) << SPA_MINBLOCKSHIFT; ++ size_t p2 = size; ++ size_t align = 0; ++ ++ while (p2 & (p2 - 1)) ++ p2 &= p2 - 1; ++ ++ if (size <= 4 * SPA_MINBLOCKSIZE) { ++ align = SPA_MINBLOCKSIZE; ++ } else if (P2PHASE(size, PAGESIZE) == 0) { ++ align = PAGESIZE; ++ } else if (P2PHASE(size, p2 >> 2) == 0) { ++ align = p2 >> 2; ++ } ++ ++ if (align != 0) { ++ char name[36]; ++ int flags = zio_bulk_flags; ++ ++ /* ++ * The smallest buffers (512b) are heavily used and ++ * experience a lot of churn. The slabs allocated ++ * for them are also relatively small (32K). Thus ++ * in over to avoid expensive calls to vmalloc() we ++ * make an exception to the usual slab allocation ++ * policy and force these buffers to be kmem backed. ++ */ ++ if (size == (1 << SPA_MINBLOCKSHIFT)) ++ flags |= KMC_KMEM; ++ ++ (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); ++ zio_buf_cache[c] = kmem_cache_create(name, size, ++ align, NULL, NULL, NULL, NULL, NULL, flags); ++ ++ (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); ++ zio_data_buf_cache[c] = kmem_cache_create(name, size, ++ align, NULL, NULL, NULL, NULL, ++ data_alloc_arena, flags); ++ } ++ } ++ ++ while (--c != 0) { ++ ASSERT(zio_buf_cache[c] != NULL); ++ if (zio_buf_cache[c - 1] == NULL) ++ zio_buf_cache[c - 1] = zio_buf_cache[c]; ++ ++ ASSERT(zio_data_buf_cache[c] != NULL); ++ if (zio_data_buf_cache[c - 1] == NULL) ++ zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; ++ } ++ ++ /* ++ * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs ++ * to fail 3 times per txg or 8 failures, whichever is greater. 
++ */ ++ zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); ++ ++ zio_inject_init(); ++} ++ ++void ++zio_fini(void) ++{ ++ size_t c; ++ kmem_cache_t *last_cache = NULL; ++ kmem_cache_t *last_data_cache = NULL; ++ ++ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { ++ if (zio_buf_cache[c] != last_cache) { ++ last_cache = zio_buf_cache[c]; ++ kmem_cache_destroy(zio_buf_cache[c]); ++ } ++ zio_buf_cache[c] = NULL; ++ ++ if (zio_data_buf_cache[c] != last_data_cache) { ++ last_data_cache = zio_data_buf_cache[c]; ++ kmem_cache_destroy(zio_data_buf_cache[c]); ++ } ++ zio_data_buf_cache[c] = NULL; ++ } ++ ++ kmem_cache_destroy(zio_vdev_cache); ++ kmem_cache_destroy(zio_link_cache); ++ kmem_cache_destroy(zio_cache); ++ ++ zio_inject_fini(); ++} ++ ++/* ++ * ========================================================================== ++ * Allocate and free I/O buffers ++ * ========================================================================== ++ */ ++ ++/* ++ * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a ++ * crashdump if the kernel panics, so use it judiciously. Obviously, it's ++ * useful to inspect ZFS metadata, but if possible, we should avoid keeping ++ * excess / transient data in-core during a crashdump. ++ */ ++void * ++zio_buf_alloc(size_t size) ++{ ++ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; ++ ++ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); ++ ++ return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE | KM_NODEBUG)); ++} ++ ++/* ++ * Use zio_data_buf_alloc to allocate data. The data will not appear in a ++ * crashdump if the kernel panics. This exists so that we will limit the amount ++ * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount ++ * of kernel heap dumped to disk when the kernel panics) ++ */ ++void * ++zio_data_buf_alloc(size_t size) ++{ ++ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; ++ ++ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); ++ ++ return (kmem_cache_alloc(zio_data_buf_cache[c], ++ KM_PUSHPAGE | KM_NODEBUG)); ++} ++ ++void ++zio_buf_free(void *buf, size_t size) ++{ ++ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; ++ ++ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); ++ ++ kmem_cache_free(zio_buf_cache[c], buf); ++} ++ ++void ++zio_data_buf_free(void *buf, size_t size) ++{ ++ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; ++ ++ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); ++ ++ kmem_cache_free(zio_data_buf_cache[c], buf); ++} ++ ++/* ++ * Dedicated I/O buffers to ensure that memory fragmentation never prevents ++ * or significantly delays the issuing of a zio. These buffers are used ++ * to aggregate I/O and could be used for raidz stripes. 
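++ * Each buffer comes from the zio_vdev_cache created in zio_init(): one
++ * vdev_io_t per buffer, aligned to PAGESIZE.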
++ */ ++void * ++zio_vdev_alloc(void) ++{ ++ return (kmem_cache_alloc(zio_vdev_cache, KM_PUSHPAGE)); ++} ++ ++void ++zio_vdev_free(void *buf) ++{ ++ kmem_cache_free(zio_vdev_cache, buf); ++ ++} ++ ++/* ++ * ========================================================================== ++ * Push and pop I/O transform buffers ++ * ========================================================================== ++ */ ++static void ++zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, ++ zio_transform_func_t *transform) ++{ ++ zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_PUSHPAGE); ++ ++ zt->zt_orig_data = zio->io_data; ++ zt->zt_orig_size = zio->io_size; ++ zt->zt_bufsize = bufsize; ++ zt->zt_transform = transform; ++ ++ zt->zt_next = zio->io_transform_stack; ++ zio->io_transform_stack = zt; ++ ++ zio->io_data = data; ++ zio->io_size = size; ++} ++ ++static void ++zio_pop_transforms(zio_t *zio) ++{ ++ zio_transform_t *zt; ++ ++ while ((zt = zio->io_transform_stack) != NULL) { ++ if (zt->zt_transform != NULL) ++ zt->zt_transform(zio, ++ zt->zt_orig_data, zt->zt_orig_size); ++ ++ if (zt->zt_bufsize != 0) ++ zio_buf_free(zio->io_data, zt->zt_bufsize); ++ ++ zio->io_data = zt->zt_orig_data; ++ zio->io_size = zt->zt_orig_size; ++ zio->io_transform_stack = zt->zt_next; ++ ++ kmem_free(zt, sizeof (zio_transform_t)); ++ } ++} ++ ++/* ++ * ========================================================================== ++ * I/O transform callbacks for subblocks and decompression ++ * ========================================================================== ++ */ ++static void ++zio_subblock(zio_t *zio, void *data, uint64_t size) ++{ ++ ASSERT(zio->io_size > size); ++ ++ if (zio->io_type == ZIO_TYPE_READ) ++ bcopy(zio->io_data, data, size); ++} ++ ++static void ++zio_decompress(zio_t *zio, void *data, uint64_t size) ++{ ++ if (zio->io_error == 0 && ++ zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), ++ zio->io_data, data, zio->io_size, size) != 0) ++ zio->io_error = EIO; ++} ++ ++/* ++ * ========================================================================== ++ * I/O parent/child relationships and pipeline interlocks ++ * ========================================================================== ++ */ ++/* ++ * NOTE - Callers to zio_walk_parents() and zio_walk_children must ++ * continue calling these functions until they return NULL. ++ * Otherwise, the next caller will pick up the list walk in ++ * some indeterminate state. (Otherwise every caller would ++ * have to pass in a cookie to keep the state represented by ++ * io_walk_link, which gets annoying.) ++ */ ++zio_t * ++zio_walk_parents(zio_t *cio) ++{ ++ zio_link_t *zl = cio->io_walk_link; ++ list_t *pl = &cio->io_parent_list; ++ ++ zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); ++ cio->io_walk_link = zl; ++ ++ if (zl == NULL) ++ return (NULL); ++ ++ ASSERT(zl->zl_child == cio); ++ return (zl->zl_parent); ++} ++ ++zio_t * ++zio_walk_children(zio_t *pio) ++{ ++ zio_link_t *zl = pio->io_walk_link; ++ list_t *cl = &pio->io_child_list; ++ ++ zl = (zl == NULL) ? 
list_head(cl) : list_next(cl, zl); ++ pio->io_walk_link = zl; ++ ++ if (zl == NULL) ++ return (NULL); ++ ++ ASSERT(zl->zl_parent == pio); ++ return (zl->zl_child); ++} ++ ++zio_t * ++zio_unique_parent(zio_t *cio) ++{ ++ zio_t *pio = zio_walk_parents(cio); ++ ++ VERIFY(zio_walk_parents(cio) == NULL); ++ return (pio); ++} ++ ++void ++zio_add_child(zio_t *pio, zio_t *cio) ++{ ++ zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_PUSHPAGE); ++ int w; ++ ++ /* ++ * Logical I/Os can have logical, gang, or vdev children. ++ * Gang I/Os can have gang or vdev children. ++ * Vdev I/Os can only have vdev children. ++ * The following ASSERT captures all of these constraints. ++ */ ++ ASSERT(cio->io_child_type <= pio->io_child_type); ++ ++ zl->zl_parent = pio; ++ zl->zl_child = cio; ++ ++ mutex_enter(&cio->io_lock); ++ mutex_enter(&pio->io_lock); ++ ++ ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); ++ ++ for (w = 0; w < ZIO_WAIT_TYPES; w++) ++ pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; ++ ++ list_insert_head(&pio->io_child_list, zl); ++ list_insert_head(&cio->io_parent_list, zl); ++ ++ pio->io_child_count++; ++ cio->io_parent_count++; ++ ++ mutex_exit(&pio->io_lock); ++ mutex_exit(&cio->io_lock); ++} ++ ++static void ++zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) ++{ ++ ASSERT(zl->zl_parent == pio); ++ ASSERT(zl->zl_child == cio); ++ ++ mutex_enter(&cio->io_lock); ++ mutex_enter(&pio->io_lock); ++ ++ list_remove(&pio->io_child_list, zl); ++ list_remove(&cio->io_parent_list, zl); ++ ++ pio->io_child_count--; ++ cio->io_parent_count--; ++ ++ mutex_exit(&pio->io_lock); ++ mutex_exit(&cio->io_lock); ++ ++ kmem_cache_free(zio_link_cache, zl); ++} ++ ++static boolean_t ++zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) ++{ ++ uint64_t *countp = &zio->io_children[child][wait]; ++ boolean_t waiting = B_FALSE; ++ ++ mutex_enter(&zio->io_lock); ++ ASSERT(zio->io_stall == NULL); ++ if (*countp != 0) { ++ zio->io_stage >>= 1; ++ zio->io_stall = countp; ++ waiting = B_TRUE; ++ } ++ mutex_exit(&zio->io_lock); ++ ++ return (waiting); ++} ++ ++__attribute__((always_inline)) ++static inline void ++zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) ++{ ++ uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; ++ int *errorp = &pio->io_child_error[zio->io_child_type]; ++ ++ mutex_enter(&pio->io_lock); ++ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) ++ *errorp = zio_worst_error(*errorp, zio->io_error); ++ pio->io_reexecute |= zio->io_reexecute; ++ ASSERT3U(*countp, >, 0); ++ if (--*countp == 0 && pio->io_stall == countp) { ++ pio->io_stall = NULL; ++ mutex_exit(&pio->io_lock); ++ __zio_execute(pio); ++ } else { ++ mutex_exit(&pio->io_lock); ++ } ++} ++ ++static void ++zio_inherit_child_errors(zio_t *zio, enum zio_child c) ++{ ++ if (zio->io_child_error[c] != 0 && zio->io_error == 0) ++ zio->io_error = zio->io_child_error[c]; ++} ++ ++/* ++ * ========================================================================== ++ * Create the various types of I/O (read, write, free, etc) ++ * ========================================================================== ++ */ ++static zio_t * ++zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ++ void *data, uint64_t size, zio_done_func_t *done, void *private, ++ zio_type_t type, int priority, enum zio_flag flags, ++ vdev_t *vd, uint64_t offset, const zbookmark_t *zb, ++ enum zio_stage stage, enum zio_stage pipeline) ++{ ++ zio_t *zio; ++ ++ ASSERT3U(size, <=, 
SPA_MAXBLOCKSIZE); ++ ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); ++ ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ++ ++ ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ++ ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ++ ASSERT(vd || stage == ZIO_STAGE_OPEN); ++ ++ zio = kmem_cache_alloc(zio_cache, KM_PUSHPAGE); ++ ++ if (vd != NULL) ++ zio->io_child_type = ZIO_CHILD_VDEV; ++ else if (flags & ZIO_FLAG_GANG_CHILD) ++ zio->io_child_type = ZIO_CHILD_GANG; ++ else if (flags & ZIO_FLAG_DDT_CHILD) ++ zio->io_child_type = ZIO_CHILD_DDT; ++ else ++ zio->io_child_type = ZIO_CHILD_LOGICAL; ++ ++ if (bp != NULL) { ++ zio->io_logical = NULL; ++ zio->io_bp = (blkptr_t *)bp; ++ zio->io_bp_copy = *bp; ++ zio->io_bp_orig = *bp; ++ if (type != ZIO_TYPE_WRITE || ++ zio->io_child_type == ZIO_CHILD_DDT) ++ zio->io_bp = &zio->io_bp_copy; /* so caller can free */ ++ if (zio->io_child_type == ZIO_CHILD_LOGICAL) ++ zio->io_logical = zio; ++ if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) ++ pipeline |= ZIO_GANG_STAGES; ++ } else { ++ zio->io_logical = NULL; ++ zio->io_bp = NULL; ++ bzero(&zio->io_bp_copy, sizeof (blkptr_t)); ++ bzero(&zio->io_bp_orig, sizeof (blkptr_t)); ++ } ++ ++ zio->io_spa = spa; ++ zio->io_txg = txg; ++ zio->io_ready = NULL; ++ zio->io_done = done; ++ zio->io_private = private; ++ zio->io_prev_space_delta = 0; ++ zio->io_type = type; ++ zio->io_priority = priority; ++ zio->io_vd = vd; ++ zio->io_vsd = NULL; ++ zio->io_vsd_ops = NULL; ++ zio->io_offset = offset; ++ zio->io_deadline = 0; ++ zio->io_orig_data = zio->io_data = data; ++ zio->io_orig_size = zio->io_size = size; ++ zio->io_orig_flags = zio->io_flags = flags; ++ zio->io_orig_stage = zio->io_stage = stage; ++ zio->io_orig_pipeline = zio->io_pipeline = pipeline; ++ bzero(&zio->io_prop, sizeof (zio_prop_t)); ++ zio->io_cmd = 0; ++ zio->io_reexecute = 0; ++ zio->io_bp_override = NULL; ++ zio->io_walk_link = NULL; ++ zio->io_transform_stack = NULL; ++ zio->io_delay = 0; ++ zio->io_error = 0; ++ zio->io_child_count = 0; ++ zio->io_parent_count = 0; ++ zio->io_stall = NULL; ++ zio->io_gang_leader = NULL; ++ zio->io_gang_tree = NULL; ++ zio->io_executor = NULL; ++ zio->io_waiter = NULL; ++ zio->io_cksum_report = NULL; ++ zio->io_ena = 0; ++ bzero(zio->io_child_error, sizeof (int) * ZIO_CHILD_TYPES); ++ bzero(zio->io_children, ++ sizeof (uint64_t) * ZIO_CHILD_TYPES * ZIO_WAIT_TYPES); ++ bzero(&zio->io_bookmark, sizeof (zbookmark_t)); ++ ++ zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); ++ zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); ++ ++ if (zb != NULL) ++ zio->io_bookmark = *zb; ++ ++ if (pio != NULL) { ++ if (zio->io_logical == NULL) ++ zio->io_logical = pio->io_logical; ++ if (zio->io_child_type == ZIO_CHILD_GANG) ++ zio->io_gang_leader = pio->io_gang_leader; ++ zio_add_child(pio, zio); ++ } ++ ++ taskq_init_ent(&zio->io_tqent); ++ ++ return (zio); ++} ++ ++static void ++zio_destroy(zio_t *zio) ++{ ++ kmem_cache_free(zio_cache, zio); ++} ++ ++zio_t * ++zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, ++ void *private, enum zio_flag flags) ++{ ++ zio_t *zio; ++ ++ zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, ++ ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ++ ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); ++ ++ return (zio); ++} ++ ++zio_t * ++zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) ++{ ++ return (zio_null(NULL, spa, NULL, done, private, flags)); ++} ++ ++zio_t * ++zio_read(zio_t *pio, spa_t *spa, const 
blkptr_t *bp, ++ void *data, uint64_t size, zio_done_func_t *done, void *private, ++ int priority, enum zio_flag flags, const zbookmark_t *zb) ++{ ++ zio_t *zio; ++ ++ zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, ++ data, size, done, private, ++ ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ++ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ++ ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); ++ ++ return (zio); ++} ++ ++zio_t * ++zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ++ void *data, uint64_t size, const zio_prop_t *zp, ++ zio_done_func_t *ready, zio_done_func_t *done, void *private, ++ int priority, enum zio_flag flags, const zbookmark_t *zb) ++{ ++ zio_t *zio; ++ ++ ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && ++ zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && ++ zp->zp_compress >= ZIO_COMPRESS_OFF && ++ zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && ++ zp->zp_type < DMU_OT_NUMTYPES && ++ zp->zp_level < 32 && ++ zp->zp_copies > 0 && ++ zp->zp_copies <= spa_max_replication(spa) && ++ zp->zp_dedup <= 1 && ++ zp->zp_dedup_verify <= 1); ++ ++ zio = zio_create(pio, spa, txg, bp, data, size, done, private, ++ ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ++ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ++ ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); ++ ++ zio->io_ready = ready; ++ zio->io_prop = *zp; ++ ++ return (zio); ++} ++ ++zio_t * ++zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, ++ uint64_t size, zio_done_func_t *done, void *private, int priority, ++ enum zio_flag flags, zbookmark_t *zb) ++{ ++ zio_t *zio; ++ ++ zio = zio_create(pio, spa, txg, bp, data, size, done, private, ++ ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ++ ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); ++ ++ return (zio); ++} ++ ++void ++zio_write_override(zio_t *zio, blkptr_t *bp, int copies) ++{ ++ ASSERT(zio->io_type == ZIO_TYPE_WRITE); ++ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ++ ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ++ ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); ++ ++ zio->io_prop.zp_copies = copies; ++ zio->io_bp_override = bp; ++} ++ ++void ++zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) ++{ ++ bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); ++} ++ ++zio_t * ++zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ++ enum zio_flag flags) ++{ ++ zio_t *zio; ++ ++ dprintf_bp(bp, "freeing in txg %llu, pass %u", ++ (longlong_t)txg, spa->spa_sync_pass); ++ ++ ASSERT(!BP_IS_HOLE(bp)); ++ ASSERT(spa_syncing_txg(spa) == txg); ++ ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); ++ ++ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), ++ NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, ++ NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); ++ ++ return (zio); ++} ++ ++zio_t * ++zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ++ zio_done_func_t *done, void *private, enum zio_flag flags) ++{ ++ zio_t *zio; ++ ++ /* ++ * A claim is an allocation of a specific block. Claims are needed ++ * to support immediate writes in the intent log. The issue is that ++ * immediate writes contain committed data, but in a txg that was ++ * *not* committed. Upon opening the pool after an unclean shutdown, ++ * the intent log claims all blocks that contain immediate write data ++ * so that the SPA knows they're in use. ++ * ++ * All claims *must* be resolved in the first txg -- before the SPA ++ * starts allocating blocks -- so that nothing is allocated twice. 
++ * If txg == 0 we just verify that the block is claimable. ++ */ ++ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); ++ ASSERT(txg == spa_first_txg(spa) || txg == 0); ++ ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ ++ ++ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), ++ done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, ++ NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ++ ++ return (zio); ++} ++ ++zio_t * ++zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, ++ zio_done_func_t *done, void *private, int priority, enum zio_flag flags) ++{ ++ zio_t *zio; ++ int c; ++ ++ if (vd->vdev_children == 0) { ++ zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, ++ ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, ++ ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); ++ ++ zio->io_cmd = cmd; ++ } else { ++ zio = zio_null(pio, spa, NULL, NULL, NULL, flags); ++ ++ for (c = 0; c < vd->vdev_children; c++) ++ zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, ++ done, private, priority, flags)); ++ } ++ ++ return (zio); ++} ++ ++zio_t * ++zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, ++ void *data, int checksum, zio_done_func_t *done, void *private, ++ int priority, enum zio_flag flags, boolean_t labels) ++{ ++ zio_t *zio; ++ ++ ASSERT(vd->vdev_children == 0); ++ ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || ++ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ++ ASSERT3U(offset + size, <=, vd->vdev_psize); ++ ++ zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, ++ ZIO_TYPE_READ, priority, flags, vd, offset, NULL, ++ ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); ++ ++ zio->io_prop.zp_checksum = checksum; ++ ++ return (zio); ++} ++ ++zio_t * ++zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, ++ void *data, int checksum, zio_done_func_t *done, void *private, ++ int priority, enum zio_flag flags, boolean_t labels) ++{ ++ zio_t *zio; ++ ++ ASSERT(vd->vdev_children == 0); ++ ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || ++ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ++ ASSERT3U(offset + size, <=, vd->vdev_psize); ++ ++ zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, ++ ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, ++ ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); ++ ++ zio->io_prop.zp_checksum = checksum; ++ ++ if (zio_checksum_table[checksum].ci_eck) { ++ /* ++ * zec checksums are necessarily destructive -- they modify ++ * the end of the write buffer to hold the verifier/checksum. ++ * Therefore, we must make a local copy in case the data is ++ * being written to multiple places in parallel. ++ */ ++ void *wbuf = zio_buf_alloc(size); ++ bcopy(data, wbuf, size); ++ zio_push_transform(zio, wbuf, size, size, NULL); ++ } ++ ++ return (zio); ++} ++ ++/* ++ * Create a child I/O to do some work for us. ++ */ ++zio_t * ++zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, ++ void *data, uint64_t size, int type, int priority, enum zio_flag flags, ++ zio_done_func_t *done, void *private) ++{ ++ enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; ++ zio_t *zio; ++ ++ ASSERT(vd->vdev_parent == ++ (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); ++ ++ if (type == ZIO_TYPE_READ && bp != NULL) { ++ /* ++ * If we have the bp, then the child should perform the ++ * checksum and the parent need not. 
This pushes error ++ * detection as close to the leaves as possible and ++ * eliminates redundant checksums in the interior nodes. ++ */ ++ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; ++ pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; ++ } ++ ++ if (vd->vdev_children == 0) ++ offset += VDEV_LABEL_START_SIZE; ++ ++ flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; ++ ++ /* ++ * If we've decided to do a repair, the write is not speculative -- ++ * even if the original read was. ++ */ ++ if (flags & ZIO_FLAG_IO_REPAIR) ++ flags &= ~ZIO_FLAG_SPECULATIVE; ++ ++ zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, ++ done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ++ ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ++ ++ return (zio); ++} ++ ++zio_t * ++zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, ++ int type, int priority, enum zio_flag flags, ++ zio_done_func_t *done, void *private) ++{ ++ zio_t *zio; ++ ++ ASSERT(vd->vdev_ops->vdev_op_leaf); ++ ++ zio = zio_create(NULL, vd->vdev_spa, 0, NULL, ++ data, size, done, private, type, priority, ++ flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, ++ vd, offset, NULL, ++ ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); ++ ++ return (zio); ++} ++ ++void ++zio_flush(zio_t *zio, vdev_t *vd) ++{ ++ zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, ++ NULL, NULL, ZIO_PRIORITY_NOW, ++ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); ++} ++ ++void ++zio_shrink(zio_t *zio, uint64_t size) ++{ ++ ASSERT(zio->io_executor == NULL); ++ ASSERT(zio->io_orig_size == zio->io_size); ++ ASSERT(size <= zio->io_size); ++ ++ /* ++ * We don't shrink for raidz because of problems with the ++ * reconstruction when reading back less than the block size. ++ * Note, BP_IS_RAIDZ() assumes no compression. ++ */ ++ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); ++ if (!BP_IS_RAIDZ(zio->io_bp)) ++ zio->io_orig_size = zio->io_size = size; ++} ++ ++/* ++ * ========================================================================== ++ * Prepare to read and write logical blocks ++ * ========================================================================== ++ */ ++ ++static int ++zio_read_bp_init(zio_t *zio) ++{ ++ blkptr_t *bp = zio->io_bp; ++ ++ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && ++ zio->io_child_type == ZIO_CHILD_LOGICAL && ++ !(zio->io_flags & ZIO_FLAG_RAW)) { ++ uint64_t psize = BP_GET_PSIZE(bp); ++ void *cbuf = zio_buf_alloc(psize); ++ ++ zio_push_transform(zio, cbuf, psize, psize, zio_decompress); ++ } ++ ++ if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) ++ zio->io_flags |= ZIO_FLAG_DONT_CACHE; ++ ++ if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) ++ zio->io_flags |= ZIO_FLAG_DONT_CACHE; ++ ++ if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) ++ zio->io_pipeline = ZIO_DDT_READ_PIPELINE; ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++static int ++zio_write_bp_init(zio_t *zio) ++{ ++ spa_t *spa = zio->io_spa; ++ zio_prop_t *zp = &zio->io_prop; ++ enum zio_compress compress = zp->zp_compress; ++ blkptr_t *bp = zio->io_bp; ++ uint64_t lsize = zio->io_size; ++ uint64_t psize = lsize; ++ int pass = 1; ++ ++ /* ++ * If our children haven't all reached the ready stage, ++ * wait for them and then repeat this pipeline stage. 
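++ * (zio_wait_for_children() stalls the zio and backs io_stage up by one
++ * bit, so this stage is re-entered when the last child notifies us via
++ * zio_notify_parent().)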
++ */ ++ if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || ++ zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) ++ return (ZIO_PIPELINE_STOP); ++ ++ if (!IO_IS_ALLOCATING(zio)) ++ return (ZIO_PIPELINE_CONTINUE); ++ ++ ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ++ ++ if (zio->io_bp_override) { ++ ASSERT(bp->blk_birth != zio->io_txg); ++ ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); ++ ++ *bp = *zio->io_bp_override; ++ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ++ ++ if (BP_IS_HOLE(bp) || !zp->zp_dedup) ++ return (ZIO_PIPELINE_CONTINUE); ++ ++ ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || ++ zp->zp_dedup_verify); ++ ++ if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { ++ BP_SET_DEDUP(bp, 1); ++ zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; ++ return (ZIO_PIPELINE_CONTINUE); ++ } ++ zio->io_bp_override = NULL; ++ BP_ZERO(bp); ++ } ++ ++ if (bp->blk_birth == zio->io_txg) { ++ /* ++ * We're rewriting an existing block, which means we're ++ * working on behalf of spa_sync(). For spa_sync() to ++ * converge, it must eventually be the case that we don't ++ * have to allocate new blocks. But compression changes ++ * the blocksize, which forces a reallocate, and makes ++ * convergence take longer. Therefore, after the first ++ * few passes, stop compressing to ensure convergence. ++ */ ++ pass = spa_sync_pass(spa); ++ ++ ASSERT(zio->io_txg == spa_syncing_txg(spa)); ++ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ++ ASSERT(!BP_GET_DEDUP(bp)); ++ ++ if (pass > SYNC_PASS_DONT_COMPRESS) ++ compress = ZIO_COMPRESS_OFF; ++ ++ /* Make sure someone doesn't change their mind on overwrites */ ++ ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), ++ spa_max_replication(spa)) == BP_GET_NDVAS(bp)); ++ } ++ ++ if (compress != ZIO_COMPRESS_OFF) { ++ void *cbuf = zio_buf_alloc(lsize); ++ psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); ++ if (psize == 0 || psize == lsize) { ++ compress = ZIO_COMPRESS_OFF; ++ zio_buf_free(cbuf, lsize); ++ } else { ++ ASSERT(psize < lsize); ++ zio_push_transform(zio, cbuf, psize, lsize, NULL); ++ } ++ } ++ ++ /* ++ * The final pass of spa_sync() must be all rewrites, but the first ++ * few passes offer a trade-off: allocating blocks defers convergence, ++ * but newly allocated blocks are sequential, so they can be written ++ * to disk faster. Therefore, we allow the first few passes of ++ * spa_sync() to allocate new blocks, but force rewrites after that. ++ * There should only be a handful of blocks after pass 1 in any case. 
++ */ ++ if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && ++ pass > SYNC_PASS_REWRITE) { ++ enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; ++ ASSERT(psize != 0); ++ zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; ++ zio->io_flags |= ZIO_FLAG_IO_REWRITE; ++ } else { ++ BP_ZERO(bp); ++ zio->io_pipeline = ZIO_WRITE_PIPELINE; ++ } ++ ++ if (psize == 0) { ++ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ++ } else { ++ ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); ++ BP_SET_LSIZE(bp, lsize); ++ BP_SET_PSIZE(bp, psize); ++ BP_SET_COMPRESS(bp, compress); ++ BP_SET_CHECKSUM(bp, zp->zp_checksum); ++ BP_SET_TYPE(bp, zp->zp_type); ++ BP_SET_LEVEL(bp, zp->zp_level); ++ BP_SET_DEDUP(bp, zp->zp_dedup); ++ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); ++ if (zp->zp_dedup) { ++ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ++ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ++ zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; ++ } ++ } ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++static int ++zio_free_bp_init(zio_t *zio) ++{ ++ blkptr_t *bp = zio->io_bp; ++ ++ if (zio->io_child_type == ZIO_CHILD_LOGICAL) { ++ if (BP_GET_DEDUP(bp)) ++ zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; ++ } ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++/* ++ * ========================================================================== ++ * Execute the I/O pipeline ++ * ========================================================================== ++ */ ++ ++static void ++zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) ++{ ++ spa_t *spa = zio->io_spa; ++ zio_type_t t = zio->io_type; ++ int flags = (cutinline ? TQ_FRONT : 0); ++ ++ /* ++ * If we're a config writer or a probe, the normal issue and ++ * interrupt threads may all be blocked waiting for the config lock. ++ * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. ++ */ ++ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) ++ t = ZIO_TYPE_NULL; ++ ++ /* ++ * A similar issue exists for the L2ARC write thread until L2ARC 2.0. ++ */ ++ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) ++ t = ZIO_TYPE_NULL; ++ ++ /* ++ * If this is a high priority I/O, then use the high priority taskq. ++ */ ++ if (zio->io_priority == ZIO_PRIORITY_NOW && ++ spa->spa_zio_taskq[t][q + 1] != NULL) ++ q++; ++ ++ ASSERT3U(q, <, ZIO_TASKQ_TYPES); ++ ++ /* ++ * NB: We are assuming that the zio can only be dispatched ++ * to a single taskq at a time. It would be a grievous error ++ * to dispatch the zio to another taskq at the same time. 
++ */ ++ ASSERT(taskq_empty_ent(&zio->io_tqent)); ++ taskq_dispatch_ent(spa->spa_zio_taskq[t][q], ++ (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); ++} ++ ++static boolean_t ++zio_taskq_member(zio_t *zio, enum zio_taskq_type q) ++{ ++ kthread_t *executor = zio->io_executor; ++ spa_t *spa = zio->io_spa; ++ zio_type_t t; ++ ++ for (t = 0; t < ZIO_TYPES; t++) ++ if (taskq_member(spa->spa_zio_taskq[t][q], executor)) ++ return (B_TRUE); ++ ++ return (B_FALSE); ++} ++ ++static int ++zio_issue_async(zio_t *zio) ++{ ++ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); ++ ++ return (ZIO_PIPELINE_STOP); ++} ++ ++void ++zio_interrupt(zio_t *zio) ++{ ++ zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); ++} ++ ++/* ++ * Execute the I/O pipeline until one of the following occurs: ++ * (1) the I/O completes; (2) the pipeline stalls waiting for ++ * dependent child I/Os; (3) the I/O issues, so we're waiting ++ * for an I/O completion interrupt; (4) the I/O is delegated by ++ * vdev-level caching or aggregation; (5) the I/O is deferred ++ * due to vdev-level queueing; (6) the I/O is handed off to ++ * another thread. In all cases, the pipeline stops whenever ++ * there's no CPU work; it never burns a thread in cv_wait(). ++ * ++ * There's no locking on io_stage because there's no legitimate way ++ * for multiple threads to be attempting to process the same I/O. ++ */ ++static zio_pipe_stage_t *zio_pipeline[]; ++ ++/* ++ * zio_execute() is a wrapper around the static function ++ * __zio_execute() so that we can force __zio_execute() to be ++ * inlined. This reduces stack overhead which is important ++ * because __zio_execute() is called recursively in several zio ++ * code paths. zio_execute() itself cannot be inlined because ++ * it is externally visible. ++ */ ++void ++zio_execute(zio_t *zio) ++{ ++ __zio_execute(zio); ++} ++ ++__attribute__((always_inline)) ++static inline void ++__zio_execute(zio_t *zio) ++{ ++ zio->io_executor = curthread; ++ ++ while (zio->io_stage < ZIO_STAGE_DONE) { ++ enum zio_stage pipeline = zio->io_pipeline; ++ enum zio_stage stage = zio->io_stage; ++ dsl_pool_t *dsl; ++ boolean_t cut; ++ int rv; ++ ++ ASSERT(!MUTEX_HELD(&zio->io_lock)); ++ ASSERT(ISP2(stage)); ++ ASSERT(zio->io_stall == NULL); ++ ++ do { ++ stage <<= 1; ++ } while ((stage & pipeline) == 0); ++ ++ ASSERT(stage <= ZIO_STAGE_DONE); ++ ++ dsl = spa_get_dsl(zio->io_spa); ++ cut = (stage == ZIO_STAGE_VDEV_IO_START) ? ++ zio_requeue_io_start_cut_in_line : B_FALSE; ++ ++ /* ++ * If we are in interrupt context and this pipeline stage ++ * will grab a config lock that is held across I/O, ++ * or may wait for an I/O that needs an interrupt thread ++ * to complete, issue async to avoid deadlock. ++ * ++ * If we are in the txg_sync_thread or being called ++ * during pool init issue async to minimize stack depth. ++ * Both of these call paths may be recursively called. ++ * ++ * For VDEV_IO_START, we cut in line so that the io will ++ * be sent to disk promptly. 
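++ * (Cutting in line means zio_taskq_dispatch() passes TQ_FRONT, queueing
++ * the zio at the head of the taskq rather than the tail.)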
++ */ ++ if (((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && ++ zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) || ++ (dsl != NULL && dsl_pool_sync_context(dsl))) { ++ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); ++ return; ++ } ++ ++ zio->io_stage = stage; ++ rv = zio_pipeline[highbit(stage) - 1](zio); ++ ++ if (rv == ZIO_PIPELINE_STOP) ++ return; ++ ++ ASSERT(rv == ZIO_PIPELINE_CONTINUE); ++ } ++} ++ ++ ++/* ++ * ========================================================================== ++ * Initiate I/O, either sync or async ++ * ========================================================================== ++ */ ++int ++zio_wait(zio_t *zio) ++{ ++ uint64_t timeout; ++ int error; ++ ++ ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ++ ASSERT(zio->io_executor == NULL); ++ ++ zio->io_waiter = curthread; ++ timeout = ddi_get_lbolt() + (zio_delay_max / MILLISEC * hz); ++ ++ __zio_execute(zio); ++ ++ mutex_enter(&zio->io_lock); ++ while (zio->io_executor != NULL) { ++ /* ++ * Wake up periodically to prevent the kernel from complaining ++ * about a blocked task. However, check zio_delay_max to see ++ * if the I/O has exceeded the timeout and post an ereport. ++ */ ++ cv_timedwait_interruptible(&zio->io_cv, &zio->io_lock, ++ ddi_get_lbolt() + hz); ++ ++ if (timeout && (ddi_get_lbolt() > timeout)) { ++ zio->io_delay = zio_delay_max; ++ zfs_ereport_post(FM_EREPORT_ZFS_DELAY, ++ zio->io_spa, zio->io_vd, zio, 0, 0); ++ timeout = 0; ++ } ++ } ++ mutex_exit(&zio->io_lock); ++ ++ error = zio->io_error; ++ zio_destroy(zio); ++ ++ return (error); ++} ++ ++void ++zio_nowait(zio_t *zio) ++{ ++ ASSERT(zio->io_executor == NULL); ++ ++ if (zio->io_child_type == ZIO_CHILD_LOGICAL && ++ zio_unique_parent(zio) == NULL) { ++ /* ++ * This is a logical async I/O with no parent to wait for it. ++ * We add it to the spa_async_root_zio "Godfather" I/O which ++ * will ensure they complete prior to unloading the pool. ++ */ ++ spa_t *spa = zio->io_spa; ++ ++ zio_add_child(spa->spa_async_zio_root, zio); ++ } ++ ++ __zio_execute(zio); ++} ++ ++/* ++ * ========================================================================== ++ * Reexecute or suspend/resume failed I/O ++ * ========================================================================== ++ */ ++ ++static void ++zio_reexecute(zio_t *pio) ++{ ++ zio_t *cio, *cio_next; ++ int c, w; ++ ++ ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ++ ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ++ ASSERT(pio->io_gang_leader == NULL); ++ ASSERT(pio->io_gang_tree == NULL); ++ ++ pio->io_flags = pio->io_orig_flags; ++ pio->io_stage = pio->io_orig_stage; ++ pio->io_pipeline = pio->io_orig_pipeline; ++ pio->io_reexecute = 0; ++ pio->io_error = 0; ++ for (w = 0; w < ZIO_WAIT_TYPES; w++) ++ pio->io_state[w] = 0; ++ for (c = 0; c < ZIO_CHILD_TYPES; c++) ++ pio->io_child_error[c] = 0; ++ ++ if (IO_IS_ALLOCATING(pio)) ++ BP_ZERO(pio->io_bp); ++ ++ /* ++ * As we reexecute pio's children, new children could be created. ++ * New children go to the head of pio's io_child_list, however, ++ * so we will (correctly) not reexecute them. The key is that ++ * the remainder of pio's io_child_list, from 'cio_next' onward, ++ * cannot be affected by any side effects of reexecuting 'cio'. 
++ */ ++ for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { ++ cio_next = zio_walk_children(pio); ++ mutex_enter(&pio->io_lock); ++ for (w = 0; w < ZIO_WAIT_TYPES; w++) ++ pio->io_children[cio->io_child_type][w]++; ++ mutex_exit(&pio->io_lock); ++ zio_reexecute(cio); ++ } ++ ++ /* ++ * Now that all children have been reexecuted, execute the parent. ++ * We don't reexecute "The Godfather" I/O here as it's the ++ * responsibility of the caller to wait on him. ++ */ ++ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) ++ __zio_execute(pio); ++} ++ ++void ++zio_suspend(spa_t *spa, zio_t *zio) ++{ ++ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) ++ fm_panic("Pool '%s' has encountered an uncorrectable I/O " ++ "failure and the failure mode property for this pool " ++ "is set to panic.", spa_name(spa)); ++ ++ zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); ++ ++ mutex_enter(&spa->spa_suspend_lock); ++ ++ if (spa->spa_suspend_zio_root == NULL) ++ spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ++ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ++ ZIO_FLAG_GODFATHER); ++ ++ spa->spa_suspended = B_TRUE; ++ ++ if (zio != NULL) { ++ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ++ ASSERT(zio != spa->spa_suspend_zio_root); ++ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ++ ASSERT(zio_unique_parent(zio) == NULL); ++ ASSERT(zio->io_stage == ZIO_STAGE_DONE); ++ zio_add_child(spa->spa_suspend_zio_root, zio); ++ } ++ ++ mutex_exit(&spa->spa_suspend_lock); ++} ++ ++int ++zio_resume(spa_t *spa) ++{ ++ zio_t *pio; ++ ++ /* ++ * Reexecute all previously suspended i/o. ++ */ ++ mutex_enter(&spa->spa_suspend_lock); ++ spa->spa_suspended = B_FALSE; ++ cv_broadcast(&spa->spa_suspend_cv); ++ pio = spa->spa_suspend_zio_root; ++ spa->spa_suspend_zio_root = NULL; ++ mutex_exit(&spa->spa_suspend_lock); ++ ++ if (pio == NULL) ++ return (0); ++ ++ zio_reexecute(pio); ++ return (zio_wait(pio)); ++} ++ ++void ++zio_resume_wait(spa_t *spa) ++{ ++ mutex_enter(&spa->spa_suspend_lock); ++ while (spa_suspended(spa)) ++ cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); ++ mutex_exit(&spa->spa_suspend_lock); ++} ++ ++/* ++ * ========================================================================== ++ * Gang blocks. ++ * ++ * A gang block is a collection of small blocks that looks to the DMU ++ * like one large block. When zio_dva_allocate() cannot find a block ++ * of the requested size, due to either severe fragmentation or the pool ++ * being nearly full, it calls zio_write_gang_block() to construct the ++ * block from smaller fragments. ++ * ++ * A gang block consists of a gang header (zio_gbh_phys_t) and up to ++ * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like ++ * an indirect block: it's an array of block pointers. It consumes ++ * only one sector and hence is allocatable regardless of fragmentation. ++ * The gang header's bps point to its gang members, which hold the data. ++ * ++ * Gang blocks are self-checksumming, using the bp's ++ * as the verifier to ensure uniqueness of the SHA256 checksum. ++ * Critically, the gang block bp's blk_cksum is the checksum of the data, ++ * not the gang header. This ensures that data block signatures (needed for ++ * deduplication) are independent of how the block is physically stored. ++ * ++ * Gang blocks can be nested: a gang member may itself be a gang block. ++ * Thus every gang block is a tree in which root and all interior nodes are ++ * gang headers, and the leaves are normal blocks that contain user data. 
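++ * For example, a two-level gang tree (each gang header holds up to
++ * SPA_GBH_NBLKPTRS == 3 bps) might look like this:
++ *
++ *                      gang header
++ *                     /     |     \
++ *                 data  gang hdr  data
++ *                       /   |   \
++ *                    data  data  data
++ *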
++ * The root of the gang tree is called the gang leader. ++ * ++ * To perform any operation (read, rewrite, free, claim) on a gang block, ++ * zio_gang_assemble() first assembles the gang tree (minus data leaves) ++ * in the io_gang_tree field of the original logical i/o by recursively ++ * reading the gang leader and all gang headers below it. This yields ++ * an in-core tree containing the contents of every gang header and the ++ * bps for every constituent of the gang block. ++ * ++ * With the gang tree now assembled, zio_gang_issue() just walks the gang tree ++ * and invokes a callback on each bp. To free a gang block, zio_gang_issue() ++ * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. ++ * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). ++ * zio_read_gang() is a wrapper around zio_read() that omits reading gang ++ * headers, since we already have those in io_gang_tree. zio_rewrite_gang() ++ * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() ++ * of the gang header plus zio_checksum_compute() of the data to update the ++ * gang header's blk_cksum as described above. ++ * ++ * The two-phase assemble/issue model solves the problem of partial failure -- ++ * what if you'd freed part of a gang block but then couldn't read the ++ * gang header for another part? Assembling the entire gang tree first ++ * ensures that all the necessary gang header I/O has succeeded before ++ * starting the actual work of free, claim, or write. Once the gang tree ++ * is assembled, free and claim are in-memory operations that cannot fail. ++ * ++ * In the event that a gang write fails, zio_dva_unallocate() walks the ++ * gang tree to immediately free (i.e. insert back into the space map) ++ * everything we've allocated. This ensures that we don't get ENOSPC ++ * errors during repeated suspend/resume cycles due to a flaky device. ++ * ++ * Gang rewrites only happen during sync-to-convergence. If we can't assemble ++ * the gang tree, we won't modify the block, so we can safely defer the free ++ * (knowing that the block is still intact). If we *can* assemble the gang ++ * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free ++ * each constituent bp and we can allocate a new block on the next sync pass. ++ * ++ * In all cases, the gang tree allows complete recovery from partial failure. ++ * ========================================================================== ++ */ ++ ++static zio_t * ++zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) ++{ ++ if (gn != NULL) ++ return (pio); ++ ++ return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), ++ NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), ++ &pio->io_bookmark)); ++} ++ ++zio_t * ++zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) ++{ ++ zio_t *zio; ++ ++ if (gn != NULL) { ++ zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, ++ gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, ++ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); ++ /* ++ * As we rewrite each gang header, the pipeline will compute ++ * a new gang block header checksum for it; but no one will ++ * compute a new data checksum, so we do that here. The one ++ * exception is the gang leader: the pipeline already computed ++ * its data checksum because that stage precedes gang assembly. ++ * (Presently, nothing actually uses interior data checksums; ++ * this is just good hygiene.) 
++ */ ++ if (gn != pio->io_gang_leader->io_gang_tree) { ++ zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), ++ data, BP_GET_PSIZE(bp)); ++ } ++ /* ++ * If we are here to damage data for testing purposes, ++ * leave the GBH alone so that we can detect the damage. ++ */ ++ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) ++ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; ++ } else { ++ zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, ++ data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, ++ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); ++ } ++ ++ return (zio); ++} ++ ++/* ARGSUSED */ ++zio_t * ++zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) ++{ ++ return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ++ ZIO_GANG_CHILD_FLAGS(pio))); ++} ++ ++/* ARGSUSED */ ++zio_t * ++zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) ++{ ++ return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, ++ NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); ++} ++ ++static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { ++ NULL, ++ zio_read_gang, ++ zio_rewrite_gang, ++ zio_free_gang, ++ zio_claim_gang, ++ NULL ++}; ++ ++static void zio_gang_tree_assemble_done(zio_t *zio); ++ ++static zio_gang_node_t * ++zio_gang_node_alloc(zio_gang_node_t **gnpp) ++{ ++ zio_gang_node_t *gn; ++ ++ ASSERT(*gnpp == NULL); ++ ++ gn = kmem_zalloc(sizeof (*gn), KM_PUSHPAGE); ++ gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); ++ *gnpp = gn; ++ ++ return (gn); ++} ++ ++static void ++zio_gang_node_free(zio_gang_node_t **gnpp) ++{ ++ zio_gang_node_t *gn = *gnpp; ++ int g; ++ ++ for (g = 0; g < SPA_GBH_NBLKPTRS; g++) ++ ASSERT(gn->gn_child[g] == NULL); ++ ++ zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); ++ kmem_free(gn, sizeof (*gn)); ++ *gnpp = NULL; ++} ++ ++static void ++zio_gang_tree_free(zio_gang_node_t **gnpp) ++{ ++ zio_gang_node_t *gn = *gnpp; ++ int g; ++ ++ if (gn == NULL) ++ return; ++ ++ for (g = 0; g < SPA_GBH_NBLKPTRS; g++) ++ zio_gang_tree_free(&gn->gn_child[g]); ++ ++ zio_gang_node_free(gnpp); ++} ++ ++static void ++zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) ++{ ++ zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); ++ ++ ASSERT(gio->io_gang_leader == gio); ++ ASSERT(BP_IS_GANG(bp)); ++ ++ zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, ++ SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, ++ gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); ++} ++ ++static void ++zio_gang_tree_assemble_done(zio_t *zio) ++{ ++ zio_t *gio = zio->io_gang_leader; ++ zio_gang_node_t *gn = zio->io_private; ++ blkptr_t *bp = zio->io_bp; ++ int g; ++ ++ ASSERT(gio == zio_unique_parent(zio)); ++ ASSERT(zio->io_child_count == 0); ++ ++ if (zio->io_error) ++ return; ++ ++ if (BP_SHOULD_BYTESWAP(bp)) ++ byteswap_uint64_array(zio->io_data, zio->io_size); ++ ++ ASSERT(zio->io_data == gn->gn_gbh); ++ ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ++ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); ++ ++ for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { ++ blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; ++ if (!BP_IS_GANG(gbp)) ++ continue; ++ zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); ++ } ++} ++ ++static void ++zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) ++{ ++ zio_t *gio = pio->io_gang_leader; ++ zio_t *zio; ++ int g; ++ ++ ASSERT(BP_IS_GANG(bp) == !!gn); ++ ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ++ ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); ++ ++ /* ++ * If you're a gang 
header, your data is in gn->gn_gbh. ++ * If you're a gang member, your data is in 'data' and gn == NULL. ++ */ ++ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); ++ ++ if (gn != NULL) { ++ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); ++ ++ for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { ++ blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; ++ if (BP_IS_HOLE(gbp)) ++ continue; ++ zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); ++ data = (char *)data + BP_GET_PSIZE(gbp); ++ } ++ } ++ ++ if (gn == gio->io_gang_tree) ++ ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); ++ ++ if (zio != pio) ++ zio_nowait(zio); ++} ++ ++static int ++zio_gang_assemble(zio_t *zio) ++{ ++ blkptr_t *bp = zio->io_bp; ++ ++ ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ++ ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ++ ++ zio->io_gang_leader = zio; ++ ++ zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++static int ++zio_gang_issue(zio_t *zio) ++{ ++ blkptr_t *bp = zio->io_bp; ++ ++ if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) ++ return (ZIO_PIPELINE_STOP); ++ ++ ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ++ ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ++ ++ if (zio->io_child_error[ZIO_CHILD_GANG] == 0) ++ zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); ++ else ++ zio_gang_tree_free(&zio->io_gang_tree); ++ ++ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++static void ++zio_write_gang_member_ready(zio_t *zio) ++{ ++ zio_t *pio = zio_unique_parent(zio); ++ ASSERTV(zio_t *gio = zio->io_gang_leader;) ++ dva_t *cdva = zio->io_bp->blk_dva; ++ dva_t *pdva = pio->io_bp->blk_dva; ++ uint64_t asize; ++ int d; ++ ++ if (BP_IS_HOLE(zio->io_bp)) ++ return; ++ ++ ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ++ ++ ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ++ ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ++ ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ++ ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); ++ ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); ++ ++ mutex_enter(&pio->io_lock); ++ for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ++ ASSERT(DVA_GET_GANG(&pdva[d])); ++ asize = DVA_GET_ASIZE(&pdva[d]); ++ asize += DVA_GET_ASIZE(&cdva[d]); ++ DVA_SET_ASIZE(&pdva[d], asize); ++ } ++ mutex_exit(&pio->io_lock); ++} ++ ++static int ++zio_write_gang_block(zio_t *pio) ++{ ++ spa_t *spa = pio->io_spa; ++ blkptr_t *bp = pio->io_bp; ++ zio_t *gio = pio->io_gang_leader; ++ zio_t *zio; ++ zio_gang_node_t *gn, **gnpp; ++ zio_gbh_phys_t *gbh; ++ uint64_t txg = pio->io_txg; ++ uint64_t resid = pio->io_size; ++ uint64_t lsize; ++ int copies = gio->io_prop.zp_copies; ++ int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); ++ zio_prop_t zp; ++ int g, error; ++ ++ error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, ++ bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, ++ METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); ++ if (error) { ++ pio->io_error = error; ++ return (ZIO_PIPELINE_CONTINUE); ++ } ++ ++ if (pio == gio) { ++ gnpp = &gio->io_gang_tree; ++ } else { ++ gnpp = pio->io_private; ++ ASSERT(pio->io_ready == zio_write_gang_member_ready); ++ } ++ ++ gn = zio_gang_node_alloc(gnpp); ++ gbh = gn->gn_gbh; ++ bzero(gbh, SPA_GANGBLOCKSIZE); ++ ++ /* ++ * Create the gang header. 
++ */ ++ zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, ++ pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); ++ ++ /* ++ * Create and nowait the gang children. ++ */ ++ for (g = 0; resid != 0; resid -= lsize, g++) { ++ lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), ++ SPA_MINBLOCKSIZE); ++ ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); ++ ++ zp.zp_checksum = gio->io_prop.zp_checksum; ++ zp.zp_compress = ZIO_COMPRESS_OFF; ++ zp.zp_type = DMU_OT_NONE; ++ zp.zp_level = 0; ++ zp.zp_copies = gio->io_prop.zp_copies; ++ zp.zp_dedup = 0; ++ zp.zp_dedup_verify = 0; ++ ++ zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], ++ (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, ++ zio_write_gang_member_ready, NULL, &gn->gn_child[g], ++ pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), ++ &pio->io_bookmark)); ++ } ++ ++ /* ++ * Set pio's pipeline to just wait for zio to finish. ++ */ ++ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ++ ++ /* ++ * We didn't allocate this bp, so make sure it doesn't get unmarked. ++ */ ++ pio->io_flags &= ~ZIO_FLAG_FASTWRITE; ++ ++ zio_nowait(zio); ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++/* ++ * ========================================================================== ++ * Dedup ++ * ========================================================================== ++ */ ++static void ++zio_ddt_child_read_done(zio_t *zio) ++{ ++ blkptr_t *bp = zio->io_bp; ++ ddt_entry_t *dde = zio->io_private; ++ ddt_phys_t *ddp; ++ zio_t *pio = zio_unique_parent(zio); ++ ++ mutex_enter(&pio->io_lock); ++ ddp = ddt_phys_select(dde, bp); ++ if (zio->io_error == 0) ++ ddt_phys_clear(ddp); /* this ddp doesn't need repair */ ++ if (zio->io_error == 0 && dde->dde_repair_data == NULL) ++ dde->dde_repair_data = zio->io_data; ++ else ++ zio_buf_free(zio->io_data, zio->io_size); ++ mutex_exit(&pio->io_lock); ++} ++ ++static int ++zio_ddt_read_start(zio_t *zio) ++{ ++ blkptr_t *bp = zio->io_bp; ++ int p; ++ ++ ASSERT(BP_GET_DEDUP(bp)); ++ ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ++ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ++ ++ if (zio->io_child_error[ZIO_CHILD_DDT]) { ++ ddt_t *ddt = ddt_select(zio->io_spa, bp); ++ ddt_entry_t *dde = ddt_repair_start(ddt, bp); ++ ddt_phys_t *ddp = dde->dde_phys; ++ ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); ++ blkptr_t blk; ++ ++ ASSERT(zio->io_vsd == NULL); ++ zio->io_vsd = dde; ++ ++ if (ddp_self == NULL) ++ return (ZIO_PIPELINE_CONTINUE); ++ ++ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ++ if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) ++ continue; ++ ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, ++ &blk); ++ zio_nowait(zio_read(zio, zio->io_spa, &blk, ++ zio_buf_alloc(zio->io_size), zio->io_size, ++ zio_ddt_child_read_done, dde, zio->io_priority, ++ ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, ++ &zio->io_bookmark)); ++ } ++ return (ZIO_PIPELINE_CONTINUE); ++ } ++ ++ zio_nowait(zio_read(zio, zio->io_spa, bp, ++ zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, ++ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++static int ++zio_ddt_read_done(zio_t *zio) ++{ ++ blkptr_t *bp = zio->io_bp; ++ ++ if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) ++ return (ZIO_PIPELINE_STOP); ++ ++ ASSERT(BP_GET_DEDUP(bp)); ++ ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ++ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ++ ++ if (zio->io_child_error[ZIO_CHILD_DDT]) { ++ ddt_t *ddt = ddt_select(zio->io_spa, bp); ++ ddt_entry_t *dde 
= zio->io_vsd; ++ if (ddt == NULL) { ++ ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); ++ return (ZIO_PIPELINE_CONTINUE); ++ } ++ if (dde == NULL) { ++ zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; ++ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); ++ return (ZIO_PIPELINE_STOP); ++ } ++ if (dde->dde_repair_data != NULL) { ++ bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); ++ zio->io_child_error[ZIO_CHILD_DDT] = 0; ++ } ++ ddt_repair_done(ddt, dde); ++ zio->io_vsd = NULL; ++ } ++ ++ ASSERT(zio->io_vsd == NULL); ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++static boolean_t ++zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) ++{ ++ spa_t *spa = zio->io_spa; ++ int p; ++ ++ /* ++ * Note: we compare the original data, not the transformed data, ++ * because when zio->io_bp is an override bp, we will not have ++ * pushed the I/O transforms. That's an important optimization ++ * because otherwise we'd compress/encrypt all dmu_sync() data twice. ++ */ ++ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ++ zio_t *lio = dde->dde_lead_zio[p]; ++ ++ if (lio != NULL) { ++ return (lio->io_orig_size != zio->io_orig_size || ++ bcmp(zio->io_orig_data, lio->io_orig_data, ++ zio->io_orig_size) != 0); ++ } ++ } ++ ++ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ++ ddt_phys_t *ddp = &dde->dde_phys[p]; ++ ++ if (ddp->ddp_phys_birth != 0) { ++ arc_buf_t *abuf = NULL; ++ uint32_t aflags = ARC_WAIT; ++ blkptr_t blk = *zio->io_bp; ++ int error; ++ ++ ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); ++ ++ ddt_exit(ddt); ++ ++ error = arc_read_nolock(NULL, spa, &blk, ++ arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ++ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, ++ &aflags, &zio->io_bookmark); ++ ++ if (error == 0) { ++ if (arc_buf_size(abuf) != zio->io_orig_size || ++ bcmp(abuf->b_data, zio->io_orig_data, ++ zio->io_orig_size) != 0) ++ error = EEXIST; ++ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); ++ } ++ ++ ddt_enter(ddt); ++ return (error != 0); ++ } ++ } ++ ++ return (B_FALSE); ++} ++ ++static void ++zio_ddt_child_write_ready(zio_t *zio) ++{ ++ int p = zio->io_prop.zp_copies; ++ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ++ ddt_entry_t *dde = zio->io_private; ++ ddt_phys_t *ddp = &dde->dde_phys[p]; ++ zio_t *pio; ++ ++ if (zio->io_error) ++ return; ++ ++ ddt_enter(ddt); ++ ++ ASSERT(dde->dde_lead_zio[p] == zio); ++ ++ ddt_phys_fill(ddp, zio->io_bp); ++ ++ while ((pio = zio_walk_parents(zio)) != NULL) ++ ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ++ ++ ddt_exit(ddt); ++} ++ ++static void ++zio_ddt_child_write_done(zio_t *zio) ++{ ++ int p = zio->io_prop.zp_copies; ++ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ++ ddt_entry_t *dde = zio->io_private; ++ ddt_phys_t *ddp = &dde->dde_phys[p]; ++ ++ ddt_enter(ddt); ++ ++ ASSERT(ddp->ddp_refcnt == 0); ++ ASSERT(dde->dde_lead_zio[p] == zio); ++ dde->dde_lead_zio[p] = NULL; ++ ++ if (zio->io_error == 0) { ++ while (zio_walk_parents(zio) != NULL) ++ ddt_phys_addref(ddp); ++ } else { ++ ddt_phys_clear(ddp); ++ } ++ ++ ddt_exit(ddt); ++} ++ ++static void ++zio_ddt_ditto_write_done(zio_t *zio) ++{ ++ int p = DDT_PHYS_DITTO; ++ blkptr_t *bp = zio->io_bp; ++ ddt_t *ddt = ddt_select(zio->io_spa, bp); ++ ddt_entry_t *dde = zio->io_private; ++ ddt_phys_t *ddp = &dde->dde_phys[p]; ++ ddt_key_t *ddk = &dde->dde_key; ++ ASSERTV(zio_prop_t *zp = &zio->io_prop); ++ ++ ddt_enter(ddt); ++ ++ ASSERT(ddp->ddp_refcnt == 0); ++ ASSERT(dde->dde_lead_zio[p] == zio); ++ dde->dde_lead_zio[p] = NULL; ++ ++ if (zio->io_error == 0) { ++ 
ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); ++ ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); ++ ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); ++ if (ddp->ddp_phys_birth != 0) ++ ddt_phys_free(ddt, ddk, ddp, zio->io_txg); ++ ddt_phys_fill(ddp, bp); ++ } ++ ++ ddt_exit(ddt); ++} ++ ++static int ++zio_ddt_write(zio_t *zio) ++{ ++ spa_t *spa = zio->io_spa; ++ blkptr_t *bp = zio->io_bp; ++ uint64_t txg = zio->io_txg; ++ zio_prop_t *zp = &zio->io_prop; ++ int p = zp->zp_copies; ++ int ditto_copies; ++ zio_t *cio = NULL; ++ zio_t *dio = NULL; ++ ddt_t *ddt = ddt_select(spa, bp); ++ ddt_entry_t *dde; ++ ddt_phys_t *ddp; ++ ++ ASSERT(BP_GET_DEDUP(bp)); ++ ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ++ ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ++ ++ ddt_enter(ddt); ++ dde = ddt_lookup(ddt, bp, B_TRUE); ++ ddp = &dde->dde_phys[p]; ++ ++ if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { ++ /* ++ * If we're using a weak checksum, upgrade to a strong checksum ++ * and try again. If we're already using a strong checksum, ++ * we can't resolve it, so just convert to an ordinary write. ++ * (And automatically e-mail a paper to Nature?) ++ */ ++ if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { ++ zp->zp_checksum = spa_dedup_checksum(spa); ++ zio_pop_transforms(zio); ++ zio->io_stage = ZIO_STAGE_OPEN; ++ BP_ZERO(bp); ++ } else { ++ zp->zp_dedup = 0; ++ } ++ zio->io_pipeline = ZIO_WRITE_PIPELINE; ++ ddt_exit(ddt); ++ return (ZIO_PIPELINE_CONTINUE); ++ } ++ ++ ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); ++ ASSERT(ditto_copies < SPA_DVAS_PER_BP); ++ ++ if (ditto_copies > ddt_ditto_copies_present(dde) && ++ dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { ++ zio_prop_t czp = *zp; ++ ++ czp.zp_copies = ditto_copies; ++ ++ /* ++ * If we arrived here with an override bp, we won't have run ++ * the transform stack, so we won't have the data we need to ++ * generate a child i/o. So, toss the override bp and restart. ++ * This is safe, because using the override bp is just an ++ * optimization; and it's rare, so the cost doesn't matter. 
++ */ ++ if (zio->io_bp_override) { ++ zio_pop_transforms(zio); ++ zio->io_stage = ZIO_STAGE_OPEN; ++ zio->io_pipeline = ZIO_WRITE_PIPELINE; ++ zio->io_bp_override = NULL; ++ BP_ZERO(bp); ++ ddt_exit(ddt); ++ return (ZIO_PIPELINE_CONTINUE); ++ } ++ ++ dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, ++ zio->io_orig_size, &czp, NULL, ++ zio_ddt_ditto_write_done, dde, zio->io_priority, ++ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); ++ ++ zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); ++ dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; ++ } ++ ++ if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { ++ if (ddp->ddp_phys_birth != 0) ++ ddt_bp_fill(ddp, bp, txg); ++ if (dde->dde_lead_zio[p] != NULL) ++ zio_add_child(zio, dde->dde_lead_zio[p]); ++ else ++ ddt_phys_addref(ddp); ++ } else if (zio->io_bp_override) { ++ ASSERT(bp->blk_birth == txg); ++ ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ++ ddt_phys_fill(ddp, bp); ++ ddt_phys_addref(ddp); ++ } else { ++ cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, ++ zio->io_orig_size, zp, zio_ddt_child_write_ready, ++ zio_ddt_child_write_done, dde, zio->io_priority, ++ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); ++ ++ zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); ++ dde->dde_lead_zio[p] = cio; ++ } ++ ++ ddt_exit(ddt); ++ ++ if (cio) ++ zio_nowait(cio); ++ if (dio) ++ zio_nowait(dio); ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++ddt_entry_t *freedde; /* for debugging */ ++ ++static int ++zio_ddt_free(zio_t *zio) ++{ ++ spa_t *spa = zio->io_spa; ++ blkptr_t *bp = zio->io_bp; ++ ddt_t *ddt = ddt_select(spa, bp); ++ ddt_entry_t *dde; ++ ddt_phys_t *ddp; ++ ++ ASSERT(BP_GET_DEDUP(bp)); ++ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ++ ++ ddt_enter(ddt); ++ freedde = dde = ddt_lookup(ddt, bp, B_TRUE); ++ ddp = ddt_phys_select(dde, bp); ++ ddt_phys_decref(ddp); ++ ddt_exit(ddt); ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++/* ++ * ========================================================================== ++ * Allocate and free blocks ++ * ========================================================================== ++ */ ++static int ++zio_dva_allocate(zio_t *zio) ++{ ++ spa_t *spa = zio->io_spa; ++ metaslab_class_t *mc = spa_normal_class(spa); ++ blkptr_t *bp = zio->io_bp; ++ int error; ++ int flags = 0; ++ ++ if (zio->io_gang_leader == NULL) { ++ ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ++ zio->io_gang_leader = zio; ++ } ++ ++ ASSERT(BP_IS_HOLE(bp)); ++ ASSERT3U(BP_GET_NDVAS(bp), ==, 0); ++ ASSERT3U(zio->io_prop.zp_copies, >, 0); ++ ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ++ ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); ++ ++ /* ++ * The dump device does not support gang blocks so allocation on ++ * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid ++ * the "fast" gang feature. ++ */ ++ flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; ++ flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? ++ METASLAB_GANG_CHILD : 0; ++ flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? 
METASLAB_FASTWRITE : 0; ++ error = metaslab_alloc(spa, mc, zio->io_size, bp, ++ zio->io_prop.zp_copies, zio->io_txg, NULL, flags); ++ ++ if (error) { ++ spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " ++ "size %llu, error %d", spa_name(spa), zio, zio->io_size, ++ error); ++ if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) ++ return (zio_write_gang_block(zio)); ++ zio->io_error = error; ++ } ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++static int ++zio_dva_free(zio_t *zio) ++{ ++ metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++static int ++zio_dva_claim(zio_t *zio) ++{ ++ int error; ++ ++ error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); ++ if (error) ++ zio->io_error = error; ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++/* ++ * Undo an allocation. This is used by zio_done() when an I/O fails ++ * and we want to give back the block we just allocated. ++ * This handles both normal blocks and gang blocks. ++ */ ++static void ++zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) ++{ ++ int g; ++ ++ ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ++ ASSERT(zio->io_bp_override == NULL); ++ ++ if (!BP_IS_HOLE(bp)) ++ metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); ++ ++ if (gn != NULL) { ++ for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { ++ zio_dva_unallocate(zio, gn->gn_child[g], ++ &gn->gn_gbh->zg_blkptr[g]); ++ } ++ } ++} ++ ++/* ++ * Try to allocate an intent log block. Return 0 on success, errno on failure. ++ */ ++int ++zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size, ++ boolean_t use_slog) ++{ ++ int error = 1; ++ ++ ASSERT(txg > spa_syncing_txg(spa)); ++ ++ /* ++ * ZIL blocks are always contiguous (i.e. not gang blocks) so we ++ * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" ++ * when allocating them. ++ */ ++ if (use_slog) { ++ error = metaslab_alloc(spa, spa_log_class(spa), size, ++ new_bp, 1, txg, NULL, ++ METASLAB_FASTWRITE | METASLAB_GANG_AVOID); ++ } ++ ++ if (error) { ++ error = metaslab_alloc(spa, spa_normal_class(spa), size, ++ new_bp, 1, txg, NULL, ++ METASLAB_FASTWRITE | METASLAB_GANG_AVOID); ++ } ++ ++ if (error == 0) { ++ BP_SET_LSIZE(new_bp, size); ++ BP_SET_PSIZE(new_bp, size); ++ BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); ++ BP_SET_CHECKSUM(new_bp, ++ spa_version(spa) >= SPA_VERSION_SLIM_ZIL ++ ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); ++ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); ++ BP_SET_LEVEL(new_bp, 0); ++ BP_SET_DEDUP(new_bp, 0); ++ BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); ++ } ++ ++ return (error); ++} ++ ++/* ++ * Free an intent log block. ++ */ ++void ++zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) ++{ ++ ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); ++ ASSERT(!BP_IS_GANG(bp)); ++ ++ zio_free(spa, txg, bp); ++} ++ ++/* ++ * ========================================================================== ++ * Read and write to physical devices ++ * ========================================================================== ++ */ ++static int ++zio_vdev_io_start(zio_t *zio) ++{ ++ vdev_t *vd = zio->io_vd; ++ uint64_t align; ++ spa_t *spa = zio->io_spa; ++ ++ ASSERT(zio->io_error == 0); ++ ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); ++ ++ if (vd == NULL) { ++ if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) ++ spa_config_enter(spa, SCL_ZIO, zio, RW_READER); ++ ++ /* ++ * The mirror_ops handle multiple DVAs in a single BP. 
++ */ ++ return (vdev_mirror_ops.vdev_op_io_start(zio)); ++ } ++ ++ /* ++ * We keep track of time-sensitive I/Os so that the scan thread ++ * can quickly react to certain workloads. In particular, we care ++ * about non-scrubbing, top-level reads and writes with the following ++ * characteristics: ++ * - synchronous writes of user data to non-slog devices ++ * - any reads of user data ++ * When these conditions are met, adjust the timestamp of spa_last_io ++ * which allows the scan thread to adjust its workload accordingly. ++ */ ++ if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && ++ vd == vd->vdev_top && !vd->vdev_islog && ++ zio->io_bookmark.zb_objset != DMU_META_OBJSET && ++ zio->io_txg != spa_syncing_txg(spa)) { ++ uint64_t old = spa->spa_last_io; ++ uint64_t new = ddi_get_lbolt64(); ++ if (old != new) ++ (void) atomic_cas_64(&spa->spa_last_io, old, new); ++ } ++ ++ align = 1ULL << vd->vdev_top->vdev_ashift; ++ ++ if (P2PHASE(zio->io_size, align) != 0) { ++ uint64_t asize = P2ROUNDUP(zio->io_size, align); ++ char *abuf = zio_buf_alloc(asize); ++ ASSERT(vd == vd->vdev_top); ++ if (zio->io_type == ZIO_TYPE_WRITE) { ++ bcopy(zio->io_data, abuf, zio->io_size); ++ bzero(abuf + zio->io_size, asize - zio->io_size); ++ } ++ zio_push_transform(zio, abuf, asize, asize, zio_subblock); ++ } ++ ++ ASSERT(P2PHASE(zio->io_offset, align) == 0); ++ ASSERT(P2PHASE(zio->io_size, align) == 0); ++ VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); ++ ++ /* ++ * If this is a repair I/O, and there's no self-healing involved -- ++ * that is, we're just resilvering what we expect to resilver -- ++ * then don't do the I/O unless zio's txg is actually in vd's DTL. ++ * This prevents spurious resilvering with nested replication. ++ * For example, given a mirror of mirrors, (A+B)+(C+D), if only ++ * A is out of date, we'll read from C+D, then use the data to ++ * resilver A+B -- but we don't actually want to resilver B, just A. ++ * The top-level mirror has no way to know this, so instead we just ++ * discard unnecessary repairs as we work our way down the vdev tree. ++ * The same logic applies to any form of nested replication: ++ * ditto + mirror, RAID-Z + replacing, etc. This covers them all. ++ */ ++ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && ++ !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && ++ zio->io_txg != 0 && /* not a delegated i/o */ ++ !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ++ ASSERT(zio->io_type == ZIO_TYPE_WRITE); ++ zio_vdev_io_bypass(zio); ++ return (ZIO_PIPELINE_CONTINUE); ++ } ++ ++ if (vd->vdev_ops->vdev_op_leaf && ++ (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { ++ ++ if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) ++ return (ZIO_PIPELINE_CONTINUE); ++ ++ if ((zio = vdev_queue_io(zio)) == NULL) ++ return (ZIO_PIPELINE_STOP); ++ ++ if (!vdev_accessible(vd, zio)) { ++ zio->io_error = ENXIO; ++ zio_interrupt(zio); ++ return (ZIO_PIPELINE_STOP); ++ } ++ } ++ ++ return (vd->vdev_ops->vdev_op_io_start(zio)); ++} ++ ++static int ++zio_vdev_io_done(zio_t *zio) ++{ ++ vdev_t *vd = zio->io_vd; ++ vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; ++ boolean_t unexpected_error = B_FALSE; ++ ++ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) ++ return (ZIO_PIPELINE_STOP); ++ ++ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); ++ ++ if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { ++ ++ vdev_queue_io_done(zio); ++ ++ if (zio->io_type == ZIO_TYPE_WRITE) ++ vdev_cache_write(zio); ++ ++ if (zio_injection_enabled && zio->io_error == 0) ++ zio->io_error = zio_handle_device_injection(vd, ++ zio, EIO); ++ ++ if (zio_injection_enabled && zio->io_error == 0) ++ zio->io_error = zio_handle_label_injection(zio, EIO); ++ ++ if (zio->io_error) { ++ if (!vdev_accessible(vd, zio)) { ++ zio->io_error = ENXIO; ++ } else { ++ unexpected_error = B_TRUE; ++ } ++ } ++ } ++ ++ ops->vdev_op_io_done(zio); ++ ++ if (unexpected_error) ++ VERIFY(vdev_probe(vd, zio) == NULL); ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++/* ++ * For non-raidz ZIOs, we can just copy aside the bad data read from the ++ * disk, and use that to finish the checksum ereport later. ++ */ ++static void ++zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, ++ const void *good_buf) ++{ ++ /* no processing needed */ ++ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); ++} ++ ++/*ARGSUSED*/ ++void ++zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) ++{ ++ void *buf = zio_buf_alloc(zio->io_size); ++ ++ bcopy(zio->io_data, buf, zio->io_size); ++ ++ zcr->zcr_cbinfo = zio->io_size; ++ zcr->zcr_cbdata = buf; ++ zcr->zcr_finish = zio_vsd_default_cksum_finish; ++ zcr->zcr_free = zio_buf_free; ++} ++ ++static int ++zio_vdev_io_assess(zio_t *zio) ++{ ++ vdev_t *vd = zio->io_vd; ++ ++ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) ++ return (ZIO_PIPELINE_STOP); ++ ++ if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) ++ spa_config_exit(zio->io_spa, SCL_ZIO, zio); ++ ++ if (zio->io_vsd != NULL) { ++ zio->io_vsd_ops->vsd_free(zio); ++ zio->io_vsd = NULL; ++ } ++ ++ if (zio_injection_enabled && zio->io_error == 0) ++ zio->io_error = zio_handle_fault_injection(zio, EIO); ++ ++ /* ++ * If the I/O failed, determine whether we should attempt to retry it. ++ * ++ * On retry, we cut in line in the issue queue, since we don't want ++ * compression/checksumming/etc. work to prevent our (cheap) IO reissue. ++ */ ++ if (zio->io_error && vd == NULL && ++ !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ++ ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ++ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ ++ zio->io_error = 0; ++ zio->io_flags |= ZIO_FLAG_IO_RETRY | ++ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; ++ zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; ++ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, ++ zio_requeue_io_start_cut_in_line); ++ return (ZIO_PIPELINE_STOP); ++ } ++ ++ /* ++ * If we got an error on a leaf device, convert it to ENXIO ++ * if the device is not accessible at all. ++ */ ++ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && ++ !vdev_accessible(vd, zio)) ++ zio->io_error = ENXIO; ++ ++ /* ++ * If we can't write to an interior vdev (mirror or RAID-Z), ++ * set vdev_cant_write so that we stop trying to allocate from it. 
++ */ ++ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && ++ vd != NULL && !vd->vdev_ops->vdev_op_leaf) ++ vd->vdev_cant_write = B_TRUE; ++ ++ if (zio->io_error) ++ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++void ++zio_vdev_io_reissue(zio_t *zio) ++{ ++ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ++ ASSERT(zio->io_error == 0); ++ ++ zio->io_stage >>= 1; ++} ++ ++void ++zio_vdev_io_redone(zio_t *zio) ++{ ++ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); ++ ++ zio->io_stage >>= 1; ++} ++ ++void ++zio_vdev_io_bypass(zio_t *zio) ++{ ++ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ++ ASSERT(zio->io_error == 0); ++ ++ zio->io_flags |= ZIO_FLAG_IO_BYPASS; ++ zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; ++} ++ ++/* ++ * ========================================================================== ++ * Generate and verify checksums ++ * ========================================================================== ++ */ ++static int ++zio_checksum_generate(zio_t *zio) ++{ ++ blkptr_t *bp = zio->io_bp; ++ enum zio_checksum checksum; ++ ++ if (bp == NULL) { ++ /* ++ * This is zio_write_phys(). ++ * We're either generating a label checksum, or none at all. ++ */ ++ checksum = zio->io_prop.zp_checksum; ++ ++ if (checksum == ZIO_CHECKSUM_OFF) ++ return (ZIO_PIPELINE_CONTINUE); ++ ++ ASSERT(checksum == ZIO_CHECKSUM_LABEL); ++ } else { ++ if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ++ ASSERT(!IO_IS_ALLOCATING(zio)); ++ checksum = ZIO_CHECKSUM_GANG_HEADER; ++ } else { ++ checksum = BP_GET_CHECKSUM(bp); ++ } ++ } ++ ++ zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++static int ++zio_checksum_verify(zio_t *zio) ++{ ++ zio_bad_cksum_t info; ++ blkptr_t *bp = zio->io_bp; ++ int error; ++ ++ ASSERT(zio->io_vd != NULL); ++ ++ if (bp == NULL) { ++ /* ++ * This is zio_read_phys(). ++ * We're either verifying a label checksum, or nothing at all. ++ */ ++ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) ++ return (ZIO_PIPELINE_CONTINUE); ++ ++ ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); ++ } ++ ++ if ((error = zio_checksum_error(zio, &info)) != 0) { ++ zio->io_error = error; ++ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { ++ zfs_ereport_start_checksum(zio->io_spa, ++ zio->io_vd, zio, zio->io_offset, ++ zio->io_size, NULL, &info); ++ } ++ } ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++/* ++ * Called by RAID-Z to ensure we don't compute the checksum twice. ++ */ ++void ++zio_checksum_verified(zio_t *zio) ++{ ++ zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; ++} ++ ++/* ++ * ========================================================================== ++ * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. ++ * An error of 0 indictes success. ENXIO indicates whole-device failure, ++ * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO ++ * indicate errors that are specific to one I/O, and most likely permanent. ++ * Any other error is presumed to be worse because we weren't expecting it. 
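++ * For a worked example of the ranking implemented by zio_worst_error()
++ * below: zio_worst_error(ENXIO, ECKSUM) returns ECKSUM, since a per-block
++ * checksum failure ranks worse than a whole-device ENXIO, while
++ * zio_worst_error(EIO, ENOTTY) returns ENOTTY (an arbitrary errno chosen
++ * here only for illustration), because anything absent from the rank table
++ * is treated as worse than the errors we anticipated.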
++ * ========================================================================== ++ */ ++int ++zio_worst_error(int e1, int e2) ++{ ++ static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; ++ int r1, r2; ++ ++ for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) ++ if (e1 == zio_error_rank[r1]) ++ break; ++ ++ for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) ++ if (e2 == zio_error_rank[r2]) ++ break; ++ ++ return (r1 > r2 ? e1 : e2); ++} ++ ++/* ++ * ========================================================================== ++ * I/O completion ++ * ========================================================================== ++ */ ++static int ++zio_ready(zio_t *zio) ++{ ++ blkptr_t *bp = zio->io_bp; ++ zio_t *pio, *pio_next; ++ ++ if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || ++ zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) ++ return (ZIO_PIPELINE_STOP); ++ ++ if (zio->io_ready) { ++ ASSERT(IO_IS_ALLOCATING(zio)); ++ ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ++ ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); ++ ++ zio->io_ready(zio); ++ } ++ ++ if (bp != NULL && bp != &zio->io_bp_copy) ++ zio->io_bp_copy = *bp; ++ ++ if (zio->io_error) ++ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ++ ++ mutex_enter(&zio->io_lock); ++ zio->io_state[ZIO_WAIT_READY] = 1; ++ pio = zio_walk_parents(zio); ++ mutex_exit(&zio->io_lock); ++ ++ /* ++ * As we notify zio's parents, new parents could be added. ++ * New parents go to the head of zio's io_parent_list, however, ++ * so we will (correctly) not notify them. The remainder of zio's ++ * io_parent_list, from 'pio_next' onward, cannot change because ++ * all parents must wait for us to be done before they can be done. ++ */ ++ for (; pio != NULL; pio = pio_next) { ++ pio_next = zio_walk_parents(zio); ++ zio_notify_parent(pio, zio, ZIO_WAIT_READY); ++ } ++ ++ if (zio->io_flags & ZIO_FLAG_NODATA) { ++ if (BP_IS_GANG(bp)) { ++ zio->io_flags &= ~ZIO_FLAG_NODATA; ++ } else { ++ ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); ++ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; ++ } ++ } ++ ++ if (zio_injection_enabled && ++ zio->io_spa->spa_syncing_txg == zio->io_txg) ++ zio_handle_ignored_writes(zio); ++ ++ return (ZIO_PIPELINE_CONTINUE); ++} ++ ++static int ++zio_done(zio_t *zio) ++{ ++ zio_t *pio, *pio_next; ++ int c, w; ++ ++ /* ++ * If our children haven't all completed, ++ * wait for them and then repeat this pipeline stage. 
++ */ ++ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || ++ zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || ++ zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || ++ zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) ++ return (ZIO_PIPELINE_STOP); ++ ++ for (c = 0; c < ZIO_CHILD_TYPES; c++) ++ for (w = 0; w < ZIO_WAIT_TYPES; w++) ++ ASSERT(zio->io_children[c][w] == 0); ++ ++ if (zio->io_bp != NULL) { ++ ASSERT(zio->io_bp->blk_pad[0] == 0); ++ ASSERT(zio->io_bp->blk_pad[1] == 0); ++ ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || ++ (zio->io_bp == zio_unique_parent(zio)->io_bp)); ++ if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && ++ zio->io_bp_override == NULL && ++ !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ++ ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp)); ++ ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ++ ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || ++ (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); ++ } ++ } ++ ++ /* ++ * If there were child vdev/gang/ddt errors, they apply to us now. ++ */ ++ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); ++ zio_inherit_child_errors(zio, ZIO_CHILD_GANG); ++ zio_inherit_child_errors(zio, ZIO_CHILD_DDT); ++ ++ /* ++ * If the I/O on the transformed data was successful, generate any ++ * checksum reports now while we still have the transformed data. ++ */ ++ if (zio->io_error == 0) { ++ while (zio->io_cksum_report != NULL) { ++ zio_cksum_report_t *zcr = zio->io_cksum_report; ++ uint64_t align = zcr->zcr_align; ++ uint64_t asize = P2ROUNDUP(zio->io_size, align); ++ char *abuf = zio->io_data; ++ ++ if (asize != zio->io_size) { ++ abuf = zio_buf_alloc(asize); ++ bcopy(zio->io_data, abuf, zio->io_size); ++ bzero(abuf + zio->io_size, asize - zio->io_size); ++ } ++ ++ zio->io_cksum_report = zcr->zcr_next; ++ zcr->zcr_next = NULL; ++ zcr->zcr_finish(zcr, abuf); ++ zfs_ereport_free_checksum(zcr); ++ ++ if (asize != zio->io_size) ++ zio_buf_free(abuf, asize); ++ } ++ } ++ ++ zio_pop_transforms(zio); /* note: may set zio->io_error */ ++ ++ vdev_stat_update(zio, zio->io_size); ++ ++ /* ++ * When an I/O completes but was slow post an ereport. ++ */ ++ if (zio->io_delay >= zio_delay_max) ++ zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, ++ zio->io_vd, zio, 0, 0); ++ ++ if (zio->io_error) { ++ /* ++ * If this I/O is attached to a particular vdev, ++ * generate an error message describing the I/O failure ++ * at the block level. We ignore these errors if the ++ * device is currently unavailable. ++ */ ++ if (zio->io_error != ECKSUM && zio->io_vd != NULL && ++ !vdev_is_dead(zio->io_vd)) ++ zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, ++ zio->io_vd, zio, 0, 0); ++ ++ if ((zio->io_error == EIO || !(zio->io_flags & ++ (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && ++ zio == zio->io_logical) { ++ /* ++ * For logical I/O requests, tell the SPA to log the ++ * error and generate a logical data ereport. ++ */ ++ spa_log_error(zio->io_spa, zio); ++ zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, zio, ++ 0, 0); ++ } ++ } ++ ++ if (zio->io_error && zio == zio->io_logical) { ++ /* ++ * Determine whether zio should be reexecuted. This will ++ * propagate all the way to the root via zio_notify_parent(). 
++ */ ++ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL); ++ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ++ ++ if (IO_IS_ALLOCATING(zio) && ++ !(zio->io_flags & ZIO_FLAG_CANFAIL)) { ++ if (zio->io_error != ENOSPC) ++ zio->io_reexecute |= ZIO_REEXECUTE_NOW; ++ else ++ zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; ++ } ++ ++ if ((zio->io_type == ZIO_TYPE_READ || ++ zio->io_type == ZIO_TYPE_FREE) && ++ !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && ++ zio->io_error == ENXIO && ++ spa_load_state(zio->io_spa) == SPA_LOAD_NONE && ++ spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) ++ zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; ++ ++ if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) ++ zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; ++ ++ /* ++ * Here is a possibly good place to attempt to do ++ * either combinatorial reconstruction or error correction ++ * based on checksums. It also might be a good place ++ * to send out preliminary ereports before we suspend ++ * processing. ++ */ ++ } ++ ++ /* ++ * If there were logical child errors, they apply to us now. ++ * We defer this until now to avoid conflating logical child ++ * errors with errors that happened to the zio itself when ++ * updating vdev stats and reporting FMA events above. ++ */ ++ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); ++ ++ if ((zio->io_error || zio->io_reexecute) && ++ IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && ++ !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) ++ zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); ++ ++ zio_gang_tree_free(&zio->io_gang_tree); ++ ++ /* ++ * Godfather I/Os should never suspend. ++ */ ++ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && ++ (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) ++ zio->io_reexecute = 0; ++ ++ if (zio->io_reexecute) { ++ /* ++ * This is a logical I/O that wants to reexecute. ++ * ++ * Reexecute is top-down. When an i/o fails, if it's not ++ * the root, it simply notifies its parent and sticks around. ++ * The parent, seeing that it still has children in zio_done(), ++ * does the same. This percolates all the way up to the root. ++ * The root i/o will reexecute or suspend the entire tree. ++ * ++ * This approach ensures that zio_reexecute() honors ++ * all the original i/o dependency relationships, e.g. ++ * parents not executing until children are ready. ++ */ ++ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ++ ++ zio->io_gang_leader = NULL; ++ ++ mutex_enter(&zio->io_lock); ++ zio->io_state[ZIO_WAIT_DONE] = 1; ++ mutex_exit(&zio->io_lock); ++ ++ /* ++ * "The Godfather" I/O monitors its children but is ++ * not a true parent to them. It will track them through ++ * the pipeline but severs its ties whenever they get into ++ * trouble (e.g. suspended). This allows "The Godfather" ++ * I/O to return status without blocking. ++ */ ++ for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { ++ zio_link_t *zl = zio->io_walk_link; ++ pio_next = zio_walk_parents(zio); ++ ++ if ((pio->io_flags & ZIO_FLAG_GODFATHER) && ++ (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { ++ zio_remove_child(pio, zio, zl); ++ zio_notify_parent(pio, zio, ZIO_WAIT_DONE); ++ } ++ } ++ ++ if ((pio = zio_unique_parent(zio)) != NULL) { ++ /* ++ * We're not a root i/o, so there's nothing to do ++ * but notify our parent. Don't propagate errors ++ * upward since we haven't permanently failed yet. 
++ */ ++ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ++ zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; ++ zio_notify_parent(pio, zio, ZIO_WAIT_DONE); ++ } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { ++ /* ++ * We'd fail again if we reexecuted now, so suspend ++ * until conditions improve (e.g. device comes online). ++ */ ++ zio_suspend(zio->io_spa, zio); ++ } else { ++ /* ++ * Reexecution is potentially a huge amount of work. ++ * Hand it off to the otherwise-unused claim taskq. ++ */ ++ ASSERT(taskq_empty_ent(&zio->io_tqent)); ++ (void) taskq_dispatch_ent( ++ zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], ++ (task_func_t *)zio_reexecute, zio, 0, ++ &zio->io_tqent); ++ } ++ return (ZIO_PIPELINE_STOP); ++ } ++ ++ ASSERT(zio->io_child_count == 0); ++ ASSERT(zio->io_reexecute == 0); ++ ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); ++ ++ /* ++ * Report any checksum errors, since the I/O is complete. ++ */ ++ while (zio->io_cksum_report != NULL) { ++ zio_cksum_report_t *zcr = zio->io_cksum_report; ++ zio->io_cksum_report = zcr->zcr_next; ++ zcr->zcr_next = NULL; ++ zcr->zcr_finish(zcr, NULL); ++ zfs_ereport_free_checksum(zcr); ++ } ++ ++ if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && ++ !BP_IS_HOLE(zio->io_bp)) { ++ metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); ++ } ++ ++ /* ++ * It is the responsibility of the done callback to ensure that this ++ * particular zio is no longer discoverable for adoption, and as ++ * such, cannot acquire any new parents. ++ */ ++ if (zio->io_done) ++ zio->io_done(zio); ++ ++ mutex_enter(&zio->io_lock); ++ zio->io_state[ZIO_WAIT_DONE] = 1; ++ mutex_exit(&zio->io_lock); ++ ++ for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { ++ zio_link_t *zl = zio->io_walk_link; ++ pio_next = zio_walk_parents(zio); ++ zio_remove_child(pio, zio, zl); ++ zio_notify_parent(pio, zio, ZIO_WAIT_DONE); ++ } ++ ++ if (zio->io_waiter != NULL) { ++ mutex_enter(&zio->io_lock); ++ zio->io_executor = NULL; ++ cv_broadcast(&zio->io_cv); ++ mutex_exit(&zio->io_lock); ++ } else { ++ zio_destroy(zio); ++ } ++ ++ return (ZIO_PIPELINE_STOP); ++} ++ ++/* ++ * ========================================================================== ++ * I/O pipeline definition ++ * ========================================================================== ++ */ ++static zio_pipe_stage_t *zio_pipeline[] = { ++ NULL, ++ zio_read_bp_init, ++ zio_free_bp_init, ++ zio_issue_async, ++ zio_write_bp_init, ++ zio_checksum_generate, ++ zio_ddt_read_start, ++ zio_ddt_read_done, ++ zio_ddt_write, ++ zio_ddt_free, ++ zio_gang_assemble, ++ zio_gang_issue, ++ zio_dva_allocate, ++ zio_dva_free, ++ zio_dva_claim, ++ zio_ready, ++ zio_vdev_io_start, ++ zio_vdev_io_done, ++ zio_vdev_io_assess, ++ zio_checksum_verify, ++ zio_done ++}; ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++/* Fault injection */ ++EXPORT_SYMBOL(zio_injection_enabled); ++EXPORT_SYMBOL(zio_inject_fault); ++EXPORT_SYMBOL(zio_inject_list_next); ++EXPORT_SYMBOL(zio_clear_fault); ++EXPORT_SYMBOL(zio_handle_fault_injection); ++EXPORT_SYMBOL(zio_handle_device_injection); ++EXPORT_SYMBOL(zio_handle_label_injection); ++EXPORT_SYMBOL(zio_priority_table); ++EXPORT_SYMBOL(zio_type_name); ++ ++module_param(zio_bulk_flags, int, 0644); ++MODULE_PARM_DESC(zio_bulk_flags, "Additional flags to pass to bulk buffers"); ++ ++module_param(zio_delay_max, int, 0644); ++MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event"); ++ ++module_param(zio_requeue_io_start_cut_in_line, int, 0644); 
++MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zio_checksum.c linux-3.2.33-go/fs/zfs/zfs/zio_checksum.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zio_checksum.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zio_checksum.c 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,274 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Checksum vectors. ++ * ++ * In the SPA, everything is checksummed. We support checksum vectors ++ * for three distinct reasons: ++ * ++ * 1. Different kinds of data need different levels of protection. ++ * For SPA metadata, we always want a very strong checksum. ++ * For user data, we let users make the trade-off between speed ++ * and checksum strength. ++ * ++ * 2. Cryptographic hash and MAC algorithms are an area of active research. ++ * It is likely that in future hash functions will be at least as strong ++ * as current best-of-breed, and may be substantially faster as well. ++ * We want the ability to take advantage of these new hashes as soon as ++ * they become available. ++ * ++ * 3. If someone develops hardware that can compute a strong hash quickly, ++ * we want the ability to take advantage of that hardware. ++ * ++ * Of course, we don't want a checksum upgrade to invalidate existing ++ * data, so we store the checksum *function* in eight bits of the bp. ++ * This gives us room for up to 256 different checksum functions. ++ * ++ * When writing a block, we always checksum it with the latest-and-greatest ++ * checksum function of the appropriate strength. When reading a block, ++ * we compare the expected checksum against the actual checksum, which we ++ * compute via the checksum function specified by BP_GET_CHECKSUM(bp). 
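++ * As a concrete illustration of the selection logic below:
++ * zio_checksum_select(child, parent) returns the parent's value when the
++ * child is ZIO_CHECKSUM_INHERIT, maps ZIO_CHECKSUM_ON to
++ * ZIO_CHECKSUM_ON_VALUE, and otherwise returns the child's explicit
++ * setting unchanged; zio_checksum_dedup_select() additionally maps "on"
++ * to spa_dedup_checksum(spa) so that dedup always uses a checksum strong
++ * enough to be trusted for block matching.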
++ */ ++ ++/*ARGSUSED*/ ++static void ++zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) ++{ ++ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); ++} ++ ++zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { ++ {{NULL, NULL}, 0, 0, 0, "inherit"}, ++ {{NULL, NULL}, 0, 0, 0, "on"}, ++ {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"}, ++ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"}, ++ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"}, ++ {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"}, ++ {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, ++ {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, ++ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"}, ++ {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, ++}; ++ ++enum zio_checksum ++zio_checksum_select(enum zio_checksum child, enum zio_checksum parent) ++{ ++ ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); ++ ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); ++ ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); ++ ++ if (child == ZIO_CHECKSUM_INHERIT) ++ return (parent); ++ ++ if (child == ZIO_CHECKSUM_ON) ++ return (ZIO_CHECKSUM_ON_VALUE); ++ ++ return (child); ++} ++ ++enum zio_checksum ++zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, ++ enum zio_checksum parent) ++{ ++ ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); ++ ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); ++ ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); ++ ++ if (child == ZIO_CHECKSUM_INHERIT) ++ return (parent); ++ ++ if (child == ZIO_CHECKSUM_ON) ++ return (spa_dedup_checksum(spa)); ++ ++ if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY)) ++ return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY); ++ ++ ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup || ++ (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF); ++ ++ return (child); ++} ++ ++/* ++ * Set the external verifier for a gang block based on , ++ * a tuple which is guaranteed to be unique for the life of the pool. ++ */ ++static void ++zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) ++{ ++ dva_t *dva = BP_IDENTITY(bp); ++ uint64_t txg = BP_PHYSICAL_BIRTH(bp); ++ ++ ASSERT(BP_IS_GANG(bp)); ++ ++ ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); ++} ++ ++/* ++ * Set the external verifier for a label block based on its offset. ++ * The vdev is implicit, and the txg is unknowable at pool open time -- ++ * hence the logic in vdev_uberblock_load() to find the most recent copy. ++ */ ++static void ++zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) ++{ ++ ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); ++} ++ ++/* ++ * Generate the checksum. 
++ */ ++void ++zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, ++ void *data, uint64_t size) ++{ ++ blkptr_t *bp = zio->io_bp; ++ uint64_t offset = zio->io_offset; ++ zio_checksum_info_t *ci = &zio_checksum_table[checksum]; ++ zio_cksum_t cksum; ++ ++ ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); ++ ASSERT(ci->ci_func[0] != NULL); ++ ++ if (ci->ci_eck) { ++ zio_eck_t *eck; ++ ++ if (checksum == ZIO_CHECKSUM_ZILOG2) { ++ zil_chain_t *zilc = data; ++ ++ size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ, ++ uint64_t); ++ eck = &zilc->zc_eck; ++ } else { ++ eck = (zio_eck_t *)((char *)data + size) - 1; ++ } ++ if (checksum == ZIO_CHECKSUM_GANG_HEADER) ++ zio_checksum_gang_verifier(&eck->zec_cksum, bp); ++ else if (checksum == ZIO_CHECKSUM_LABEL) ++ zio_checksum_label_verifier(&eck->zec_cksum, offset); ++ else ++ bp->blk_cksum = eck->zec_cksum; ++ eck->zec_magic = ZEC_MAGIC; ++ ci->ci_func[0](data, size, &cksum); ++ eck->zec_cksum = cksum; ++ } else { ++ ci->ci_func[0](data, size, &bp->blk_cksum); ++ } ++} ++ ++int ++zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) ++{ ++ blkptr_t *bp = zio->io_bp; ++ uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : ++ (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); ++ int byteswap; ++ int error; ++ uint64_t size = (bp == NULL ? zio->io_size : ++ (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); ++ uint64_t offset = zio->io_offset; ++ void *data = zio->io_data; ++ zio_checksum_info_t *ci = &zio_checksum_table[checksum]; ++ zio_cksum_t actual_cksum, expected_cksum, verifier; ++ ++ if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) ++ return (EINVAL); ++ ++ if (ci->ci_eck) { ++ zio_eck_t *eck; ++ ++ if (checksum == ZIO_CHECKSUM_ZILOG2) { ++ zil_chain_t *zilc = data; ++ uint64_t nused; ++ ++ eck = &zilc->zc_eck; ++ if (eck->zec_magic == ZEC_MAGIC) ++ nused = zilc->zc_nused; ++ else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) ++ nused = BSWAP_64(zilc->zc_nused); ++ else ++ return (ECKSUM); ++ ++ if (nused > size) ++ return (ECKSUM); ++ ++ size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); ++ } else { ++ eck = (zio_eck_t *)((char *)data + size) - 1; ++ } ++ ++ if (checksum == ZIO_CHECKSUM_GANG_HEADER) ++ zio_checksum_gang_verifier(&verifier, bp); ++ else if (checksum == ZIO_CHECKSUM_LABEL) ++ zio_checksum_label_verifier(&verifier, offset); ++ else ++ verifier = bp->blk_cksum; ++ ++ byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); ++ ++ if (byteswap) ++ byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); ++ ++ expected_cksum = eck->zec_cksum; ++ eck->zec_cksum = verifier; ++ ci->ci_func[byteswap](data, size, &actual_cksum); ++ eck->zec_cksum = expected_cksum; ++ ++ if (byteswap) ++ byteswap_uint64_array(&expected_cksum, ++ sizeof (zio_cksum_t)); ++ } else { ++ ASSERT(!BP_IS_GANG(bp)); ++ byteswap = BP_SHOULD_BYTESWAP(bp); ++ expected_cksum = bp->blk_cksum; ++ ci->ci_func[byteswap](data, size, &actual_cksum); ++ } ++ ++ info->zbc_expected = expected_cksum; ++ info->zbc_actual = actual_cksum; ++ info->zbc_checksum_name = ci->ci_name; ++ info->zbc_byteswapped = byteswap; ++ info->zbc_injected = 0; ++ info->zbc_has_cksum = 1; ++ ++ if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) ++ return (ECKSUM); ++ ++ if (zio_injection_enabled && !zio->io_error && ++ (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) { ++ ++ info->zbc_injected = 1; ++ return (error); ++ } ++ ++ return (0); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zio_compress.c 
linux-3.2.33-go/fs/zfs/zfs/zio_compress.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zio_compress.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zio_compress.c 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,132 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Compression vectors. ++ */ ++ ++zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { ++ {NULL, NULL, 0, "inherit"}, ++ {NULL, NULL, 0, "on"}, ++ {NULL, NULL, 0, "uncompressed"}, ++ {lzjb_compress, lzjb_decompress, 0, "lzjb"}, ++ {NULL, NULL, 0, "empty"}, ++ {gzip_compress, gzip_decompress, 1, "gzip-1"}, ++ {gzip_compress, gzip_decompress, 2, "gzip-2"}, ++ {gzip_compress, gzip_decompress, 3, "gzip-3"}, ++ {gzip_compress, gzip_decompress, 4, "gzip-4"}, ++ {gzip_compress, gzip_decompress, 5, "gzip-5"}, ++ {gzip_compress, gzip_decompress, 6, "gzip-6"}, ++ {gzip_compress, gzip_decompress, 7, "gzip-7"}, ++ {gzip_compress, gzip_decompress, 8, "gzip-8"}, ++ {gzip_compress, gzip_decompress, 9, "gzip-9"}, ++ {zle_compress, zle_decompress, 64, "zle"}, ++}; ++ ++enum zio_compress ++zio_compress_select(enum zio_compress child, enum zio_compress parent) ++{ ++ ASSERT(child < ZIO_COMPRESS_FUNCTIONS); ++ ASSERT(parent < ZIO_COMPRESS_FUNCTIONS); ++ ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON); ++ ++ if (child == ZIO_COMPRESS_INHERIT) ++ return (parent); ++ ++ if (child == ZIO_COMPRESS_ON) ++ return (ZIO_COMPRESS_ON_VALUE); ++ ++ return (child); ++} ++ ++size_t ++zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) ++{ ++ uint64_t *word, *word_end; ++ size_t c_len, d_len, r_len; ++ zio_compress_info_t *ci = &zio_compress_table[c]; ++ ++ ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); ++ ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); ++ ++ /* ++ * If the data is all zeroes, we don't even need to allocate ++ * a block for it. We indicate this by returning zero size. ++ */ ++ word_end = (uint64_t *)((char *)src + s_len); ++ for (word = src; word < word_end; word++) ++ if (*word != 0) ++ break; ++ ++ if (word == word_end) ++ return (0); ++ ++ if (c == ZIO_COMPRESS_EMPTY) ++ return (s_len); ++ ++ /* Compress at least 12.5% */ ++ d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE); ++ if (d_len == 0) ++ return (s_len); ++ ++ c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level); ++ ++ if (c_len > d_len) ++ return (s_len); ++ ++ /* ++ * Cool. We compressed at least as much as we were hoping to. ++ * For both security and repeatability, pad out the last sector. 
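++ * To make the sizes concrete (SPA_MINBLOCKSIZE is 512 bytes): for a 128K
++ * source block, d_len = P2ALIGN(131072 - 16384, 512) = 114688, so a
++ * compressed result is only kept when it saves at least 12.5%; a c_len of
++ * 100000 bytes is then padded below to r_len = P2ROUNDUP(100000, 512)
++ * = 100352.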
++ */ ++ r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE); ++ if (r_len > c_len) { ++ bzero((char *)dst + c_len, r_len - c_len); ++ c_len = r_len; ++ } ++ ++ ASSERT3U(c_len, <=, d_len); ++ ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0); ++ ++ return (c_len); ++} ++ ++int ++zio_decompress_data(enum zio_compress c, void *src, void *dst, ++ size_t s_len, size_t d_len) ++{ ++ zio_compress_info_t *ci = &zio_compress_table[c]; ++ ++ if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) ++ return (EINVAL); ++ ++ return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zio_inject.c linux-3.2.33-go/fs/zfs/zfs/zio_inject.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zio_inject.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zio_inject.c 2012-11-16 23:25:34.347039358 +0100 +@@ -0,0 +1,520 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* ++ * ZFS fault injection ++ * ++ * To handle fault injection, we keep track of a series of zinject_record_t ++ * structures which describe which logical block(s) should be injected with a ++ * fault. These are kept in a global list. Each record corresponds to a given ++ * spa_t and maintains a special hold on the spa_t so that it cannot be deleted ++ * or exported while the injection record exists. ++ * ++ * Device level injection is done using the 'zi_guid' field. If this is set, it ++ * means that the error is destined for a particular device, not a piece of ++ * data. ++ * ++ * This is a rather poor data structure and algorithm, but we don't expect more ++ * than a few faults at any one time, so it should be sufficient for our needs. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++uint32_t zio_injection_enabled = 0; ++ ++typedef struct inject_handler { ++ int zi_id; ++ spa_t *zi_spa; ++ zinject_record_t zi_record; ++ list_node_t zi_link; ++} inject_handler_t; ++ ++static list_t inject_handlers; ++static krwlock_t inject_lock; ++static int inject_next_id = 1; ++ ++/* ++ * Returns true if the given record matches the I/O in progress. 
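++ * Note that zi_freq is a probability expressed as a percentage: a matching
++ * record with zi_freq == 25 injects its error on roughly 25 of every 100
++ * candidate I/Os (spa_get_random(100) < 25), while zi_freq == 0 injects
++ * it on every match.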
++ */ ++static boolean_t ++zio_match_handler(zbookmark_t *zb, uint64_t type, ++ zinject_record_t *record, int error) ++{ ++ /* ++ * Check for a match against the MOS, which is based on type ++ */ ++ if (zb->zb_objset == DMU_META_OBJSET && ++ record->zi_objset == DMU_META_OBJSET && ++ record->zi_object == DMU_META_DNODE_OBJECT) { ++ if (record->zi_type == DMU_OT_NONE || ++ type == record->zi_type) ++ return (record->zi_freq == 0 || ++ spa_get_random(100) < record->zi_freq); ++ else ++ return (B_FALSE); ++ } ++ ++ /* ++ * Check for an exact match. ++ */ ++ if (zb->zb_objset == record->zi_objset && ++ zb->zb_object == record->zi_object && ++ zb->zb_level == record->zi_level && ++ zb->zb_blkid >= record->zi_start && ++ zb->zb_blkid <= record->zi_end && ++ error == record->zi_error) ++ return (record->zi_freq == 0 || ++ spa_get_random(100) < record->zi_freq); ++ ++ return (B_FALSE); ++} ++ ++/* ++ * Panic the system when a config change happens in the function ++ * specified by tag. ++ */ ++void ++zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) ++{ ++ inject_handler_t *handler; ++ ++ rw_enter(&inject_lock, RW_READER); ++ ++ for (handler = list_head(&inject_handlers); handler != NULL; ++ handler = list_next(&inject_handlers, handler)) { ++ ++ if (spa != handler->zi_spa) ++ continue; ++ ++ if (handler->zi_record.zi_type == type && ++ strcmp(tag, handler->zi_record.zi_func) == 0) ++ panic("Panic requested in function %s\n", tag); ++ } ++ ++ rw_exit(&inject_lock); ++} ++ ++/* ++ * Determine if the I/O in question should return failure. Returns the errno ++ * to be returned to the caller. ++ */ ++int ++zio_handle_fault_injection(zio_t *zio, int error) ++{ ++ int ret = 0; ++ inject_handler_t *handler; ++ ++ /* ++ * Ignore I/O not associated with any logical data. ++ */ ++ if (zio->io_logical == NULL) ++ return (0); ++ ++ /* ++ * Currently, we only support fault injection on reads. ++ */ ++ if (zio->io_type != ZIO_TYPE_READ) ++ return (0); ++ ++ rw_enter(&inject_lock, RW_READER); ++ ++ for (handler = list_head(&inject_handlers); handler != NULL; ++ handler = list_next(&inject_handlers, handler)) { ++ ++ /* Ignore errors not destined for this pool */ ++ if (zio->io_spa != handler->zi_spa) ++ continue; ++ ++ /* Ignore device errors and panic injection */ ++ if (handler->zi_record.zi_guid != 0 || ++ handler->zi_record.zi_func[0] != '\0' || ++ handler->zi_record.zi_duration != 0) ++ continue; ++ ++ /* If this handler matches, return EIO */ ++ if (zio_match_handler(&zio->io_logical->io_bookmark, ++ zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, ++ &handler->zi_record, error)) { ++ ret = error; ++ break; ++ } ++ } ++ ++ rw_exit(&inject_lock); ++ ++ return (ret); ++} ++ ++/* ++ * Determine if the zio is part of a label update and has an injection ++ * handler associated with that portion of the label. Currently, we ++ * allow error injection in either the nvlist or the uberblock region of ++ * of the vdev label. 
++ */ ++int ++zio_handle_label_injection(zio_t *zio, int error) ++{ ++ inject_handler_t *handler; ++ vdev_t *vd = zio->io_vd; ++ uint64_t offset = zio->io_offset; ++ int label; ++ int ret = 0; ++ ++ if (offset >= VDEV_LABEL_START_SIZE && ++ offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) ++ return (0); ++ ++ rw_enter(&inject_lock, RW_READER); ++ ++ for (handler = list_head(&inject_handlers); handler != NULL; ++ handler = list_next(&inject_handlers, handler)) { ++ uint64_t start = handler->zi_record.zi_start; ++ uint64_t end = handler->zi_record.zi_end; ++ ++ /* Ignore device only faults or panic injection */ ++ if (handler->zi_record.zi_start == 0 || ++ handler->zi_record.zi_func[0] != '\0' || ++ handler->zi_record.zi_duration != 0) ++ continue; ++ ++ /* ++ * The injection region is the relative offsets within a ++ * vdev label. We must determine the label which is being ++ * updated and adjust our region accordingly. ++ */ ++ label = vdev_label_number(vd->vdev_psize, offset); ++ start = vdev_label_offset(vd->vdev_psize, label, start); ++ end = vdev_label_offset(vd->vdev_psize, label, end); ++ ++ if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid && ++ (offset >= start && offset <= end)) { ++ ret = error; ++ break; ++ } ++ } ++ rw_exit(&inject_lock); ++ return (ret); ++} ++ ++ ++int ++zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) ++{ ++ inject_handler_t *handler; ++ int ret = 0; ++ ++ /* ++ * We skip over faults in the labels unless it's during ++ * device open (i.e. zio == NULL). ++ */ ++ if (zio != NULL) { ++ uint64_t offset = zio->io_offset; ++ ++ if (offset < VDEV_LABEL_START_SIZE || ++ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) ++ return (0); ++ } ++ ++ rw_enter(&inject_lock, RW_READER); ++ ++ for (handler = list_head(&inject_handlers); handler != NULL; ++ handler = list_next(&inject_handlers, handler)) { ++ ++ /* ++ * Ignore label specific faults, panic injection ++ * or fake writes ++ */ ++ if (handler->zi_record.zi_start != 0 || ++ handler->zi_record.zi_func[0] != '\0' || ++ handler->zi_record.zi_duration != 0) ++ continue; ++ ++ if (vd->vdev_guid == handler->zi_record.zi_guid) { ++ if (handler->zi_record.zi_failfast && ++ (zio == NULL || (zio->io_flags & ++ (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) { ++ continue; ++ } ++ ++ /* Handle type specific I/O failures */ ++ if (zio != NULL && ++ handler->zi_record.zi_iotype != ZIO_TYPES && ++ handler->zi_record.zi_iotype != zio->io_type) ++ continue; ++ ++ if (handler->zi_record.zi_error == error) { ++ /* ++ * For a failed open, pretend like the device ++ * has gone away. ++ */ ++ if (error == ENXIO) ++ vd->vdev_stat.vs_aux = ++ VDEV_AUX_OPEN_FAILED; ++ ++ /* ++ * Treat these errors as if they had been ++ * retried so that all the appropriate stats ++ * and FMA events are generated. ++ */ ++ if (!handler->zi_record.zi_failfast && ++ zio != NULL) ++ zio->io_flags |= ZIO_FLAG_IO_RETRY; ++ ++ ret = error; ++ break; ++ } ++ if (handler->zi_record.zi_error == ENXIO) { ++ ret = EIO; ++ break; ++ } ++ } ++ } ++ ++ rw_exit(&inject_lock); ++ ++ return (ret); ++} ++ ++/* ++ * Simulate hardware that ignores cache flushes. For requested number ++ * of seconds nix the actual writing to disk. 
++ */ ++void ++zio_handle_ignored_writes(zio_t *zio) ++{ ++ inject_handler_t *handler; ++ ++ rw_enter(&inject_lock, RW_READER); ++ ++ for (handler = list_head(&inject_handlers); handler != NULL; ++ handler = list_next(&inject_handlers, handler)) { ++ ++ /* Ignore errors not destined for this pool */ ++ if (zio->io_spa != handler->zi_spa) ++ continue; ++ ++ if (handler->zi_record.zi_duration == 0) ++ continue; ++ ++ /* ++ * Positive duration implies # of seconds, negative ++ * a number of txgs ++ */ ++ if (handler->zi_record.zi_timer == 0) { ++ if (handler->zi_record.zi_duration > 0) ++ handler->zi_record.zi_timer = ddi_get_lbolt64(); ++ else ++ handler->zi_record.zi_timer = zio->io_txg; ++ } ++ ++ /* Have a "problem" writing 60% of the time */ ++ if (spa_get_random(100) < 60) ++ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; ++ break; ++ } ++ ++ rw_exit(&inject_lock); ++} ++ ++void ++spa_handle_ignored_writes(spa_t *spa) ++{ ++ inject_handler_t *handler; ++ ++ if (zio_injection_enabled == 0) ++ return; ++ ++ rw_enter(&inject_lock, RW_READER); ++ ++ for (handler = list_head(&inject_handlers); handler != NULL; ++ handler = list_next(&inject_handlers, handler)) { ++ ++ /* Ignore errors not destined for this pool */ ++ if (spa != handler->zi_spa) ++ continue; ++ ++ if (handler->zi_record.zi_duration == 0) ++ continue; ++ ++ if (handler->zi_record.zi_duration > 0) { ++ VERIFY(handler->zi_record.zi_timer == 0 || ++ handler->zi_record.zi_timer + ++ handler->zi_record.zi_duration * hz > ++ ddi_get_lbolt64()); ++ } else { ++ /* duration is negative so the subtraction here adds */ ++ VERIFY(handler->zi_record.zi_timer == 0 || ++ handler->zi_record.zi_timer - ++ handler->zi_record.zi_duration >= ++ spa_syncing_txg(spa)); ++ } ++ } ++ ++ rw_exit(&inject_lock); ++} ++ ++/* ++ * Create a new handler for the given record. We add it to the list, adding ++ * a reference to the spa_t in the process. We increment zio_injection_enabled, ++ * which is the switch to trigger all fault injection. ++ */ ++int ++zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) ++{ ++ inject_handler_t *handler; ++ int error; ++ spa_t *spa; ++ ++ /* ++ * If this is pool-wide metadata, make sure we unload the corresponding ++ * spa_t, so that the next attempt to load it will trigger the fault. ++ * We call spa_reset() to unload the pool appropriately. ++ */ ++ if (flags & ZINJECT_UNLOAD_SPA) ++ if ((error = spa_reset(name)) != 0) ++ return (error); ++ ++ if (!(flags & ZINJECT_NULL)) { ++ /* ++ * spa_inject_ref() will add an injection reference, which will ++ * prevent the pool from being removed from the namespace while ++ * still allowing it to be unloaded. ++ */ ++ if ((spa = spa_inject_addref(name)) == NULL) ++ return (ENOENT); ++ ++ handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); ++ ++ rw_enter(&inject_lock, RW_WRITER); ++ ++ *id = handler->zi_id = inject_next_id++; ++ handler->zi_spa = spa; ++ handler->zi_record = *record; ++ list_insert_tail(&inject_handlers, handler); ++ atomic_add_32(&zio_injection_enabled, 1); ++ ++ rw_exit(&inject_lock); ++ } ++ ++ /* ++ * Flush the ARC, so that any attempts to read this data will end up ++ * going to the ZIO layer. Note that this is a little overkill, but ++ * we don't have the necessary ARC interfaces to do anything else, and ++ * fault injection isn't a performance critical path. ++ */ ++ if (flags & ZINJECT_FLUSH_ARC) ++ arc_flush(NULL); ++ ++ return (0); ++} ++ ++/* ++ * Returns the next record with an ID greater than that supplied to the ++ * function. 
Used to iterate over all handlers in the system. ++ */ ++int ++zio_inject_list_next(int *id, char *name, size_t buflen, ++ zinject_record_t *record) ++{ ++ inject_handler_t *handler; ++ int ret; ++ ++ mutex_enter(&spa_namespace_lock); ++ rw_enter(&inject_lock, RW_READER); ++ ++ for (handler = list_head(&inject_handlers); handler != NULL; ++ handler = list_next(&inject_handlers, handler)) ++ if (handler->zi_id > *id) ++ break; ++ ++ if (handler) { ++ *record = handler->zi_record; ++ *id = handler->zi_id; ++ (void) strncpy(name, spa_name(handler->zi_spa), buflen); ++ ret = 0; ++ } else { ++ ret = ENOENT; ++ } ++ ++ rw_exit(&inject_lock); ++ mutex_exit(&spa_namespace_lock); ++ ++ return (ret); ++} ++ ++/* ++ * Clear the fault handler with the given identifier, or return ENOENT if none ++ * exists. ++ */ ++int ++zio_clear_fault(int id) ++{ ++ inject_handler_t *handler; ++ ++ rw_enter(&inject_lock, RW_WRITER); ++ ++ for (handler = list_head(&inject_handlers); handler != NULL; ++ handler = list_next(&inject_handlers, handler)) ++ if (handler->zi_id == id) ++ break; ++ ++ if (handler == NULL) { ++ rw_exit(&inject_lock); ++ return (ENOENT); ++ } ++ ++ list_remove(&inject_handlers, handler); ++ rw_exit(&inject_lock); ++ ++ spa_inject_delref(handler->zi_spa); ++ kmem_free(handler, sizeof (inject_handler_t)); ++ atomic_add_32(&zio_injection_enabled, -1); ++ ++ return (0); ++} ++ ++void ++zio_inject_init(void) ++{ ++ rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); ++ list_create(&inject_handlers, sizeof (inject_handler_t), ++ offsetof(inject_handler_t, zi_link)); ++} ++ ++void ++zio_inject_fini(void) ++{ ++ list_destroy(&inject_handlers); ++ rw_destroy(&inject_lock); ++} ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++module_param(zio_injection_enabled, int, 0644); ++MODULE_PARM_DESC(zio_injection_enabled, "Enable fault injection"); ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zle.c linux-3.2.33-go/fs/zfs/zfs/zle.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zle.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zle.c 2012-11-16 23:25:34.352039300 +0100 +@@ -0,0 +1,86 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++/* ++ * Zero-length encoding. This is a fast and simple algorithm to eliminate ++ * runs of zeroes. Each chunk of compressed data begins with a length byte, b. ++ * If b < n (where n is the compression parameter) then the next b + 1 bytes ++ * are literal values. If b >= n then the next (256 - b + 1) bytes are zero. 
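++ * Concretely, a length byte b >= n encodes a run of b - n + 1 zeros (see
++ * zle_decompress() below: len = 1 + b, then len -= n), up to the 256 - n
++ * bytes a single chunk may cover. With the n = 64 used by the "zle" entry
++ * in zio_compress_table, a run of ten zero bytes is emitted as the single
++ * length byte 73 (10 - 1 + 64), while three literal bytes are emitted as
++ * the length byte 2 followed by the three literals.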
++ */ ++#include ++#include ++ ++size_t ++zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) ++{ ++ uchar_t *src = s_start; ++ uchar_t *dst = d_start; ++ uchar_t *s_end = src + s_len; ++ uchar_t *d_end = dst + d_len; ++ ++ while (src < s_end && dst < d_end - 1) { ++ uchar_t *first = src; ++ uchar_t *len = dst++; ++ if (src[0] == 0) { ++ uchar_t *last = src + (256 - n); ++ while (src < MIN(last, s_end) && src[0] == 0) ++ src++; ++ *len = src - first - 1 + n; ++ } else { ++ uchar_t *last = src + n; ++ if (d_end - dst < n) ++ break; ++ while (src < MIN(last, s_end) - 1 && (src[0] | src[1])) ++ *dst++ = *src++; ++ if (src[0]) ++ *dst++ = *src++; ++ *len = src - first - 1; ++ } ++ } ++ return (src == s_end ? dst - (uchar_t *)d_start : s_len); ++} ++ ++int ++zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) ++{ ++ uchar_t *src = s_start; ++ uchar_t *dst = d_start; ++ uchar_t *s_end = src + s_len; ++ uchar_t *d_end = dst + d_len; ++ ++ while (src < s_end && dst < d_end) { ++ int len = 1 + *src++; ++ if (len <= n) { ++ while (len-- != 0) ++ *dst++ = *src++; ++ } else { ++ len -= n; ++ while (len-- != 0) ++ *dst++ = 0; ++ } ++ } ++ return (dst == d_end ? 0 : -1); ++} +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zpl_ctldir.c linux-3.2.33-go/fs/zfs/zfs/zpl_ctldir.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zpl_ctldir.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zpl_ctldir.c 2012-11-16 23:25:34.349039334 +0100 +@@ -0,0 +1,534 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (C) 2011 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * LLNL-CODE-403049. ++ * Rewritten for Linux by: ++ * Rohan Puri ++ * Brian Behlendorf ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Common open routine. Disallow any write access. ++ */ ++/* ARGSUSED */ ++static int ++zpl_common_open(struct inode *ip, struct file *filp) ++{ ++ if (filp->f_mode & FMODE_WRITE) ++ return (-EACCES); ++ ++ return generic_file_open(ip, filp); ++} ++ ++static int ++zpl_common_readdir(struct file *filp, void *dirent, filldir_t filldir) ++{ ++ struct dentry *dentry = filp->f_path.dentry; ++ struct inode *ip = dentry->d_inode; ++ int error = 0; ++ ++ switch (filp->f_pos) { ++ case 0: ++ error = filldir(dirent, ".", 1, 0, ip->i_ino, DT_DIR); ++ if (error) ++ break; ++ ++ filp->f_pos++; ++ /* fall-thru */ ++ case 1: ++ error = filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR); ++ if (error) ++ break; ++ ++ filp->f_pos++; ++ /* fall-thru */ ++ default: ++ break; ++ } ++ ++ return (error); ++} ++ ++/* ++ * Get root directory contents. 
++ */ ++static int ++zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir) ++{ ++ struct dentry *dentry = filp->f_path.dentry; ++ struct inode *ip = dentry->d_inode; ++ zfs_sb_t *zsb = ITOZSB(ip); ++ int error = 0; ++ ++ ZFS_ENTER(zsb); ++ ++ switch (filp->f_pos) { ++ case 0: ++ error = filldir(dirent, ".", 1, 0, ip->i_ino, DT_DIR); ++ if (error) ++ goto out; ++ ++ filp->f_pos++; ++ /* fall-thru */ ++ case 1: ++ error = filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR); ++ if (error) ++ goto out; ++ ++ filp->f_pos++; ++ /* fall-thru */ ++ case 2: ++ error = filldir(dirent, ZFS_SNAPDIR_NAME, ++ strlen(ZFS_SNAPDIR_NAME), 2, ZFSCTL_INO_SNAPDIR, DT_DIR); ++ if (error) ++ goto out; ++ ++ filp->f_pos++; ++ /* fall-thru */ ++ case 3: ++ error = filldir(dirent, ZFS_SHAREDIR_NAME, ++ strlen(ZFS_SHAREDIR_NAME), 3, ZFSCTL_INO_SHARES, DT_DIR); ++ if (error) ++ goto out; ++ ++ filp->f_pos++; ++ /* fall-thru */ ++ } ++out: ++ ZFS_EXIT(zsb); ++ ++ return (error); ++} ++ ++/* ++ * Get root directory attributes. ++ */ ++/* ARGSUSED */ ++static int ++zpl_root_getattr(struct vfsmount *mnt, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ int error; ++ ++ error = simple_getattr(mnt, dentry, stat); ++ stat->atime = CURRENT_TIME; ++ ++ return (error); ++} ++ ++static struct dentry * ++#ifdef HAVE_LOOKUP_NAMEIDATA ++zpl_root_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd) ++#else ++zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) ++#endif ++{ ++ cred_t *cr = CRED(); ++ struct inode *ip; ++ int error; ++ ++ crhold(cr); ++ error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); ++ ASSERT3S(error, <=, 0); ++ crfree(cr); ++ ++ if (error) { ++ if (error == -ENOENT) ++ return d_splice_alias(NULL, dentry); ++ else ++ return ERR_PTR(error); ++ } ++ ++ return d_splice_alias(ip, dentry); ++} ++ ++/* ++ * The '.zfs' control directory file and inode operations. ++ */ ++const struct file_operations zpl_fops_root = { ++ .open = zpl_common_open, ++ .llseek = generic_file_llseek, ++ .read = generic_read_dir, ++ .readdir = zpl_root_readdir, ++}; ++ ++const struct inode_operations zpl_ops_root = { ++ .lookup = zpl_root_lookup, ++ .getattr = zpl_root_getattr, ++}; ++ ++static struct dentry * ++#ifdef HAVE_LOOKUP_NAMEIDATA ++zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, ++ struct nameidata *nd) ++#else ++zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, ++ unsigned int flags) ++#endif ++ ++{ ++ cred_t *cr = CRED(); ++ struct inode *ip; ++ int error; ++ ++ crhold(cr); ++ error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip, ++ 0, cr, NULL, NULL); ++ ASSERT3S(error, <=, 0); ++ crfree(cr); ++ ++ if (error) { ++ if (error == -ENOENT) ++ return d_splice_alias(NULL, dentry); ++ else ++ return ERR_PTR(error); ++ } ++ ++ /* ++ * Auto mounting of snapshots is only supported for 2.6.37 and ++ * newer kernels. Prior to this kernel the ops->follow_link() ++ * callback was used as a hack to trigger the mount. The ++ * resulting vfsmount was then explicitly grafted in to the ++ * name space. While it might be possible to add compatibility ++ * code to accomplish this it would require considerable care. 
++ */ ++#ifdef HAVE_AUTOMOUNT ++ dentry->d_op = &zpl_dops_snapdirs; ++#endif /* HAVE_AUTOMOUNT */ ++ ++ return d_splice_alias(ip, dentry); ++} ++ ++/* ARGSUSED */ ++static int ++zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir) ++{ ++ struct dentry *dentry = filp->f_path.dentry; ++ struct inode *dip = dentry->d_inode; ++ zfs_sb_t *zsb = ITOZSB(dip); ++ char snapname[MAXNAMELEN]; ++ uint64_t id, cookie; ++ boolean_t case_conflict; ++ int error = 0; ++ ++ ZFS_ENTER(zsb); ++ ++ cookie = filp->f_pos; ++ switch (filp->f_pos) { ++ case 0: ++ error = filldir(dirent, ".", 1, 0, dip->i_ino, DT_DIR); ++ if (error) ++ goto out; ++ ++ filp->f_pos++; ++ /* fall-thru */ ++ case 1: ++ error = filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR); ++ if (error) ++ goto out; ++ ++ filp->f_pos++; ++ /* fall-thru */ ++ default: ++ while (error == 0) { ++ error = -dmu_snapshot_list_next(zsb->z_os, MAXNAMELEN, ++ snapname, &id, &cookie, &case_conflict); ++ if (error) ++ goto out; ++ ++ error = filldir(dirent, snapname, strlen(snapname), ++ filp->f_pos, ZFSCTL_INO_SHARES - id, DT_DIR); ++ if (error) ++ goto out; ++ ++ filp->f_pos = cookie; ++ } ++ } ++out: ++ ZFS_EXIT(zsb); ++ ++ if (error == -ENOENT) ++ return (0); ++ ++ return (error); ++} ++ ++int ++zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry, ++ struct inode *tdip, struct dentry *tdentry) ++{ ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ error = -zfsctl_snapdir_rename(sdip, dname(sdentry), ++ tdip, dname(tdentry), cr, 0); ++ ASSERT3S(error, <=, 0); ++ crfree(cr); ++ ++ return (error); ++} ++ ++static int ++zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry) ++{ ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ error = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0); ++ ASSERT3S(error, <=, 0); ++ crfree(cr); ++ ++ return (error); ++} ++ ++static int ++zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode) ++{ ++ cred_t *cr = CRED(); ++ vattr_t *vap; ++ struct inode *ip; ++ int error; ++ ++ crhold(cr); ++ vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); ++ zpl_vap_init(vap, dip, dentry, mode | S_IFDIR, cr); ++ ++ error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); ++ if (error == 0) { ++#ifdef HAVE_AUTOMOUNT ++ dentry->d_op = &zpl_dops_snapdirs; ++#endif /* HAVE_AUTOMOUNT */ ++ d_instantiate(dentry, ip); ++ } ++ ++ kmem_free(vap, sizeof(vattr_t)); ++ ASSERT3S(error, <=, 0); ++ crfree(cr); ++ ++ return (error); ++} ++ ++#ifdef HAVE_AUTOMOUNT ++static struct vfsmount * ++zpl_snapdir_automount(struct path *path) ++{ ++ struct dentry *dentry = path->dentry; ++ int error; ++ ++ /* ++ * We must briefly disable automounts for this dentry because the ++ * user space mount utility will trigger another lookup on this ++ * directory. That will result in zpl_snapdir_automount() being ++ * called repeatedly. The DCACHE_NEED_AUTOMOUNT flag can be ++ * safely reset once the mount completes. ++ */ ++ dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT; ++ error = -zfsctl_mount_snapshot(path, 0); ++ dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; ++ if (error) ++ return ERR_PTR(error); ++ ++ /* ++ * Rather than returning the new vfsmount for the snapshot we must ++ * return NULL to indicate a mount collision. This is done because ++ * the user space mount calls do_add_mount() which adds the vfsmount ++ * to the name space. If we returned the new mount here it would be ++ * added again to the vfsmount list resulting in list corruption. 
++ */ ++ return (NULL); ++} ++#endif /* HAVE_AUTOMOUNT */ ++ ++/* ++ * Get snapshot directory attributes. ++ */ ++/* ARGSUSED */ ++static int ++zpl_snapdir_getattr(struct vfsmount *mnt, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ zfs_sb_t *zsb = ITOZSB(dentry->d_inode); ++ int error; ++ ++ ZFS_ENTER(zsb); ++ error = simple_getattr(mnt, dentry, stat); ++ stat->nlink = stat->size = avl_numnodes(&zsb->z_ctldir_snaps) + 2; ++ stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zsb->z_os); ++ stat->atime = CURRENT_TIME; ++ ZFS_EXIT(zsb); ++ ++ return (error); ++} ++ ++/* ++ * The '.zfs/snapshot' directory file operations. These mainly control ++ * generating the list of available snapshots when doing an 'ls' in the ++ * directory. See zpl_snapdir_readdir(). ++ */ ++const struct file_operations zpl_fops_snapdir = { ++ .open = zpl_common_open, ++ .llseek = generic_file_llseek, ++ .read = generic_read_dir, ++ .readdir = zpl_snapdir_readdir, ++}; ++ ++/* ++ * The '.zfs/snapshot' directory inode operations. These mainly control ++ * creating an inode for a snapshot directory and initializing the needed ++ * infrastructure to automount the snapshot. See zpl_snapdir_lookup(). ++ */ ++const struct inode_operations zpl_ops_snapdir = { ++ .lookup = zpl_snapdir_lookup, ++ .getattr = zpl_snapdir_getattr, ++ .rename = zpl_snapdir_rename, ++ .rmdir = zpl_snapdir_rmdir, ++ .mkdir = zpl_snapdir_mkdir, ++}; ++ ++#ifdef HAVE_AUTOMOUNT ++const struct dentry_operations zpl_dops_snapdirs = { ++ .d_automount = zpl_snapdir_automount, ++}; ++#endif /* HAVE_AUTOMOUNT */ ++ ++static struct dentry * ++#ifdef HAVE_LOOKUP_NAMEIDATA ++zpl_shares_lookup(struct inode *dip, struct dentry *dentry, ++ struct nameidata *nd) ++#else ++zpl_shares_lookup(struct inode *dip, struct dentry *dentry, ++ unsigned int flags) ++#endif ++{ ++ cred_t *cr = CRED(); ++ struct inode *ip = NULL; ++ int error; ++ ++ crhold(cr); ++ error = -zfsctl_shares_lookup(dip, dname(dentry), &ip, ++ 0, cr, NULL, NULL); ++ ASSERT3S(error, <=, 0); ++ crfree(cr); ++ ++ if (error) { ++ if (error == -ENOENT) ++ return d_splice_alias(NULL, dentry); ++ else ++ return ERR_PTR(error); ++ } ++ ++ return d_splice_alias(ip, dentry); ++} ++ ++/* ARGSUSED */ ++static int ++zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir) ++{ ++ cred_t *cr = CRED(); ++ struct dentry *dentry = filp->f_path.dentry; ++ struct inode *ip = dentry->d_inode; ++ zfs_sb_t *zsb = ITOZSB(ip); ++ znode_t *dzp; ++ int error; ++ ++ ZFS_ENTER(zsb); ++ ++ if (zsb->z_shares_dir == 0) { ++ error = zpl_common_readdir(filp, dirent, filldir); ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ error = -zfs_zget(zsb, zsb->z_shares_dir, &dzp); ++ if (error) { ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ crhold(cr); ++ error = -zfs_readdir(ZTOI(dzp), dirent, filldir, &filp->f_pos, cr); ++ crfree(cr); ++ ++ iput(ZTOI(dzp)); ++ ZFS_EXIT(zsb); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++/* ARGSUSED */ ++static int ++zpl_shares_getattr(struct vfsmount *mnt, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ struct inode *ip = dentry->d_inode; ++ zfs_sb_t *zsb = ITOZSB(ip); ++ znode_t *dzp; ++ int error; ++ ++ ZFS_ENTER(zsb); ++ ++ if (zsb->z_shares_dir == 0) { ++ error = simple_getattr(mnt, dentry, stat); ++ stat->nlink = stat->size = 2; ++ stat->atime = CURRENT_TIME; ++ ZFS_EXIT(zsb); ++ return (error); ++ } ++ ++ error = -zfs_zget(zsb, zsb->z_shares_dir, &dzp); ++ if (error == 0) ++ error = -zfs_getattr_fast(dentry->d_inode, stat); ++ ++ iput(ZTOI(dzp)); ++ ZFS_EXIT(zsb); 
++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++/* ++ * The '.zfs/shares' directory file operations. ++ */ ++const struct file_operations zpl_fops_shares = { ++ .open = zpl_common_open, ++ .llseek = generic_file_llseek, ++ .read = generic_read_dir, ++ .readdir = zpl_shares_readdir, ++}; ++ ++/* ++ * The '.zfs/shares' directory inode operations. ++ */ ++const struct inode_operations zpl_ops_shares = { ++ .lookup = zpl_shares_lookup, ++ .getattr = zpl_shares_getattr, ++}; +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zpl_export.c linux-3.2.33-go/fs/zfs/zfs/zpl_export.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zpl_export.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zpl_export.c 2012-11-16 23:25:34.348039346 +0100 +@@ -0,0 +1,148 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2011 Gunnar Beutner ++ * Copyright (c) 2012 Cyril Plisko. All rights reserved. ++ */ ++ ++ ++#include ++#include ++#include ++#include ++ ++ ++static int ++#ifdef HAVE_ENCODE_FH_WITH_INODE ++zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent) ++{ ++#else ++zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable) ++{ ++ struct inode *ip = dentry->d_inode; ++#endif /* HAVE_ENCODE_FH_WITH_INODE */ ++ fid_t *fid = (fid_t *)fh; ++ int len_bytes, rc; ++ ++ len_bytes = *max_len * sizeof (__u32); ++ ++ if (len_bytes < offsetof(fid_t, fid_data)) ++ return 255; ++ ++ fid->fid_len = len_bytes - offsetof(fid_t, fid_data); ++ ++ if (zfsctl_is_node(ip)) ++ rc = zfsctl_fid(ip, fid); ++ else ++ rc = zfs_fid(ip, fid); ++ ++ len_bytes = offsetof(fid_t, fid_data) + fid->fid_len; ++ *max_len = roundup(len_bytes, sizeof (__u32)) / sizeof (__u32); ++ ++ return (rc == 0 ? 
FILEID_INO32_GEN : 255); ++} ++ ++static struct dentry * ++zpl_dentry_obtain_alias(struct inode *ip) ++{ ++ struct dentry *result; ++ ++#ifdef HAVE_D_OBTAIN_ALIAS ++ result = d_obtain_alias(ip); ++#else ++ result = d_alloc_anon(ip); ++ ++ if (result == NULL) { ++ iput(ip); ++ result = ERR_PTR(-ENOMEM); ++ } ++#endif /* HAVE_D_OBTAIN_ALIAS */ ++ ++ return result; ++} ++ ++static struct dentry * ++zpl_fh_to_dentry(struct super_block *sb, struct fid *fh, ++ int fh_len, int fh_type) ++{ ++ fid_t *fid = (fid_t *)fh; ++ struct inode *ip; ++ int len_bytes, rc; ++ ++ len_bytes = fh_len * sizeof (__u32); ++ ++ if (fh_type != FILEID_INO32_GEN || ++ len_bytes < offsetof(fid_t, fid_data) || ++ len_bytes < offsetof(fid_t, fid_data) + fid->fid_len) ++ return ERR_PTR(-EINVAL); ++ ++ rc = zfs_vget(sb, &ip, fid); ++ ++ if (rc != 0) ++ return ERR_PTR(-rc); ++ ++ ASSERT((ip != NULL) && !IS_ERR(ip)); ++ ++ return zpl_dentry_obtain_alias(ip); ++} ++ ++static struct dentry * ++zpl_get_parent(struct dentry *child) ++{ ++ cred_t *cr = CRED(); ++ struct inode *ip; ++ int error; ++ ++ crhold(cr); ++ error = -zfs_lookup(child->d_inode, "..", &ip, 0, cr, NULL, NULL); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ if (error) ++ return ERR_PTR(error); ++ ++ return zpl_dentry_obtain_alias(ip); ++} ++ ++#ifdef HAVE_COMMIT_METADATA ++static int ++zpl_commit_metadata(struct inode *inode) ++{ ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ error = -zfs_fsync(inode, 0, cr); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return error; ++} ++#endif /* HAVE_COMMIT_METADATA */ ++ ++const struct export_operations zpl_export_operations = { ++ .encode_fh = zpl_encode_fh, ++ .fh_to_dentry = zpl_fh_to_dentry, ++ .get_parent = zpl_get_parent, ++#ifdef HAVE_COMMIT_METADATA ++ .commit_metadata= zpl_commit_metadata, ++#endif /* HAVE_COMMIT_METADATA */ ++}; +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zpl_file.c linux-3.2.33-go/fs/zfs/zfs/zpl_file.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zpl_file.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zpl_file.c 2012-11-16 23:25:34.350039322 +0100 +@@ -0,0 +1,462 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2011, Lawrence Livermore National Security, LLC. 
++ */ ++ ++ ++#include ++#include ++#include ++#include ++ ++ ++static int ++zpl_open(struct inode *ip, struct file *filp) ++{ ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ if (error) ++ return (error); ++ ++ return generic_file_open(ip, filp); ++} ++ ++static int ++zpl_release(struct inode *ip, struct file *filp) ++{ ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ error = -zfs_close(ip, filp->f_flags, cr); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static int ++zpl_readdir(struct file *filp, void *dirent, filldir_t filldir) ++{ ++ struct dentry *dentry = filp->f_path.dentry; ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ error = -zfs_readdir(dentry->d_inode, dirent, filldir, ++ &filp->f_pos, cr); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++#if defined(HAVE_FSYNC_WITH_DENTRY) ++/* ++ * Linux 2.6.x - 2.6.34 API, ++ * Through 2.6.34 the nfsd kernel server would pass a NULL 'file struct *' ++ * to the fops->fsync() hook. For this reason, we must be careful not to ++ * use filp unconditionally. ++ */ ++static int ++zpl_fsync(struct file *filp, struct dentry *dentry, int datasync) ++{ ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ error = -zfs_fsync(dentry->d_inode, datasync, cr); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++#elif defined(HAVE_FSYNC_WITHOUT_DENTRY) ++/* ++ * Linux 2.6.35 - 3.0 API, ++ * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed ++ * redundant. The dentry is still accessible via filp->f_path.dentry, ++ * and we are guaranteed that filp will never be NULL. ++ */ ++static int ++zpl_fsync(struct file *filp, int datasync) ++{ ++ struct inode *inode = filp->f_mapping->host; ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ error = -zfs_fsync(inode, datasync, cr); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++#elif defined(HAVE_FSYNC_RANGE) ++/* ++ * Linux 3.1 - 3.x API, ++ * As of 3.1 the responsibility to call filemap_write_and_wait_range() has ++ * been pushed down in to the .fsync() vfs hook. Additionally, the i_mutex ++ * lock is no longer held by the caller, for zfs we don't require the lock ++ * to be held so we don't acquire it. 
++ */ ++static int ++zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) ++{ ++ struct inode *inode = filp->f_mapping->host; ++ cred_t *cr = CRED(); ++ int error; ++ ++ error = filemap_write_and_wait_range(inode->i_mapping, start, end); ++ if (error) ++ return (error); ++ ++ crhold(cr); ++ error = -zfs_fsync(inode, datasync, cr); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++#else ++#error "Unsupported fops->fsync() implementation" ++#endif ++ ++ssize_t ++zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t pos, ++ uio_seg_t segment, int flags, cred_t *cr) ++{ ++ int error; ++ struct iovec iov; ++ uio_t uio; ++ ++ iov.iov_base = (void *)buf; ++ iov.iov_len = len; ++ ++ uio.uio_iov = &iov; ++ uio.uio_resid = len; ++ uio.uio_iovcnt = 1; ++ uio.uio_loffset = pos; ++ uio.uio_limit = MAXOFFSET_T; ++ uio.uio_segflg = segment; ++ ++ error = -zfs_read(ip, &uio, flags, cr); ++ if (error < 0) ++ return (error); ++ ++ return (len - uio.uio_resid); ++} ++ ++static ssize_t ++zpl_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) ++{ ++ cred_t *cr = CRED(); ++ ssize_t read; ++ ++ crhold(cr); ++ read = zpl_read_common(filp->f_mapping->host, buf, len, *ppos, ++ UIO_USERSPACE, filp->f_flags, cr); ++ crfree(cr); ++ ++ if (read < 0) ++ return (read); ++ ++ *ppos += read; ++ return (read); ++} ++ ++ssize_t ++zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t pos, ++ uio_seg_t segment, int flags, cred_t *cr) ++{ ++ int error; ++ struct iovec iov; ++ uio_t uio; ++ ++ iov.iov_base = (void *)buf; ++ iov.iov_len = len; ++ ++ uio.uio_iov = &iov; ++ uio.uio_resid = len, ++ uio.uio_iovcnt = 1; ++ uio.uio_loffset = pos; ++ uio.uio_limit = MAXOFFSET_T; ++ uio.uio_segflg = segment; ++ ++ error = -zfs_write(ip, &uio, flags, cr); ++ if (error < 0) ++ return (error); ++ ++ return (len - uio.uio_resid); ++} ++ ++static ssize_t ++zpl_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) ++{ ++ cred_t *cr = CRED(); ++ ssize_t wrote; ++ ++ crhold(cr); ++ wrote = zpl_write_common(filp->f_mapping->host, buf, len, *ppos, ++ UIO_USERSPACE, filp->f_flags, cr); ++ crfree(cr); ++ ++ if (wrote < 0) ++ return (wrote); ++ ++ *ppos += wrote; ++ return (wrote); ++} ++ ++/* ++ * It's worth taking a moment to describe how mmap is implemented ++ * for zfs because it differs considerably from other Linux filesystems. ++ * However, this issue is handled the same way under OpenSolaris. ++ * ++ * The issue is that by design zfs bypasses the Linux page cache and ++ * leaves all caching up to the ARC. This has been shown to work ++ * well for the common read(2)/write(2) case. However, mmap(2) ++ * is problem because it relies on being tightly integrated with the ++ * page cache. To handle this we cache mmap'ed files twice, once in ++ * the ARC and a second time in the page cache. The code is careful ++ * to keep both copies synchronized. ++ * ++ * When a file with an mmap'ed region is written to using write(2) ++ * both the data in the ARC and existing pages in the page cache ++ * are updated. For a read(2) data will be read first from the page ++ * cache then the ARC if needed. Neither a write(2) or read(2) will ++ * will ever result in new pages being added to the page cache. ++ * ++ * New pages are added to the page cache only via .readpage() which ++ * is called when the vfs needs to read a page off disk to back the ++ * virtual memory region. 
These pages may be modified without ++ * notifying the ARC and will be written out periodically via ++ * .writepage(). This will occur due to either a sync or the usual ++ * page aging behavior. Note because a read(2) of a mmap'ed file ++ * will always check the page cache first even when the ARC is out ++ * of date correct data will still be returned. ++ * ++ * While this implementation ensures correct behavior it does have ++ * have some drawbacks. The most obvious of which is that it ++ * increases the required memory footprint when access mmap'ed ++ * files. It also adds additional complexity to the code keeping ++ * both caches synchronized. ++ * ++ * Longer term it may be possible to cleanly resolve this wart by ++ * mapping page cache pages directly on to the ARC buffers. The ++ * Linux address space operations are flexible enough to allow ++ * selection of which pages back a particular index. The trick ++ * would be working out the details of which subsystem is in ++ * charge, the ARC, the page cache, or both. It may also prove ++ * helpful to move the ARC buffers to a scatter-gather lists ++ * rather than a vmalloc'ed region. ++ */ ++static int ++zpl_mmap(struct file *filp, struct vm_area_struct *vma) ++{ ++ struct inode *ip = filp->f_mapping->host; ++ znode_t *zp = ITOZ(ip); ++ int error; ++ ++ error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start, ++ (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags); ++ if (error) ++ return (error); ++ ++ error = generic_file_mmap(filp, vma); ++ if (error) ++ return (error); ++ ++ mutex_enter(&zp->z_lock); ++ zp->z_is_mapped = 1; ++ mutex_exit(&zp->z_lock); ++ ++ return (error); ++} ++ ++/* ++ * Populate a page with data for the Linux page cache. This function is ++ * only used to support mmap(2). There will be an identical copy of the ++ * data in the ARC which is kept up to date via .write() and .writepage(). ++ * ++ * Current this function relies on zpl_read_common() and the O_DIRECT ++ * flag to read in a page. This works but the more correct way is to ++ * update zfs_fillpage() to be Linux friendly and use that interface. ++ */ ++static int ++zpl_readpage(struct file *filp, struct page *pp) ++{ ++ struct inode *ip; ++ struct page *pl[1]; ++ int error = 0; ++ ++ ASSERT(PageLocked(pp)); ++ ip = pp->mapping->host; ++ pl[0] = pp; ++ ++ error = -zfs_getpage(ip, pl, 1); ++ ++ if (error) { ++ SetPageError(pp); ++ ClearPageUptodate(pp); ++ } else { ++ ClearPageError(pp); ++ SetPageUptodate(pp); ++ flush_dcache_page(pp); ++ } ++ ++ unlock_page(pp); ++ return error; ++} ++ ++/* ++ * Populate a set of pages with data for the Linux page cache. This ++ * function will only be called for read ahead and never for demand ++ * paging. For simplicity, the code relies on read_cache_pages() to ++ * correctly lock each page for IO and call zpl_readpage(). ++ */ ++static int ++zpl_readpages(struct file *filp, struct address_space *mapping, ++ struct list_head *pages, unsigned nr_pages) ++{ ++ return (read_cache_pages(mapping, pages, ++ (filler_t *)zpl_readpage, filp)); ++} ++ ++int ++zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) ++{ ++ struct address_space *mapping = data; ++ ++ ASSERT(PageLocked(pp)); ++ ASSERT(!PageWriteback(pp)); ++ ASSERT(!(current->flags & PF_NOFS)); ++ ++ /* ++ * Annotate this call path with a flag that indicates that it is ++ * unsafe to use KM_SLEEP during memory allocations due to the ++ * potential for a deadlock. KM_PUSHPAGE should be used instead. 
++ */ ++ current->flags |= PF_NOFS; ++ (void) zfs_putpage(mapping->host, pp, wbc); ++ current->flags &= ~PF_NOFS; ++ ++ return (0); ++} ++ ++static int ++zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) ++{ ++ return write_cache_pages(mapping, wbc, zpl_putpage, mapping); ++} ++ ++/* ++ * Write out dirty pages to the ARC, this function is only required to ++ * support mmap(2). Mapped pages may be dirtied by memory operations ++ * which never call .write(). These dirty pages are kept in sync with ++ * the ARC buffers via this hook. ++ */ ++static int ++zpl_writepage(struct page *pp, struct writeback_control *wbc) ++{ ++ return zpl_putpage(pp, wbc, pp->mapping); ++} ++ ++/* ++ * The only flag combination which matches the behavior of zfs_space() ++ * is FALLOC_FL_PUNCH_HOLE. This flag was introduced in the 2.6.38 kernel. ++ */ ++long ++zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len) ++{ ++ cred_t *cr = CRED(); ++ int error = -EOPNOTSUPP; ++ ++ if (mode & FALLOC_FL_KEEP_SIZE) ++ return (-EOPNOTSUPP); ++ ++ crhold(cr); ++ ++#ifdef FALLOC_FL_PUNCH_HOLE ++ if (mode & FALLOC_FL_PUNCH_HOLE) { ++ flock64_t bf; ++ ++ bf.l_type = F_WRLCK; ++ bf.l_whence = 0; ++ bf.l_start = offset; ++ bf.l_len = len; ++ bf.l_pid = 0; ++ ++ error = -zfs_space(ip, F_FREESP, &bf, FWRITE, offset, cr); ++ } ++#endif /* FALLOC_FL_PUNCH_HOLE */ ++ ++ crfree(cr); ++ ++ ASSERT3S(error, <=, 0); ++ return (error); ++} ++ ++#ifdef HAVE_FILE_FALLOCATE ++static long ++zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len) ++{ ++ return zpl_fallocate_common(filp->f_path.dentry->d_inode, ++ mode, offset, len); ++} ++#endif /* HAVE_FILE_FALLOCATE */ ++ ++const struct address_space_operations zpl_address_space_operations = { ++ .readpages = zpl_readpages, ++ .readpage = zpl_readpage, ++ .writepage = zpl_writepage, ++ .writepages = zpl_writepages, ++}; ++ ++const struct file_operations zpl_file_operations = { ++ .open = zpl_open, ++ .release = zpl_release, ++ .llseek = generic_file_llseek, ++ .read = zpl_read, ++ .write = zpl_write, ++ .readdir = zpl_readdir, ++ .mmap = zpl_mmap, ++ .fsync = zpl_fsync, ++#ifdef HAVE_FILE_FALLOCATE ++ .fallocate = zpl_fallocate, ++#endif /* HAVE_FILE_FALLOCATE */ ++}; ++ ++const struct file_operations zpl_dir_file_operations = { ++ .llseek = generic_file_llseek, ++ .read = generic_read_dir, ++ .readdir = zpl_readdir, ++ .fsync = zpl_fsync, ++}; +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zpl_inode.c linux-3.2.33-go/fs/zfs/zfs/zpl_inode.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zpl_inode.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zpl_inode.c 2012-11-16 23:25:34.350039322 +0100 +@@ -0,0 +1,438 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2011, Lawrence Livermore National Security, LLC. ++ */ ++ ++ ++#include ++#include ++#include ++#include ++#include ++ ++ ++static struct dentry * ++#ifdef HAVE_LOOKUP_NAMEIDATA ++zpl_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) ++#else ++zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) ++#endif ++{ ++ cred_t *cr = CRED(); ++ struct inode *ip; ++ int error; ++ ++ crhold(cr); ++ error = -zfs_lookup(dir, dname(dentry), &ip, 0, cr, NULL, NULL); ++ ASSERT3S(error, <=, 0); ++ crfree(cr); ++ ++ if (error) { ++ if (error == -ENOENT) ++ return d_splice_alias(NULL, dentry); ++ else ++ return ERR_PTR(error); ++ } ++ ++ return d_splice_alias(ip, dentry); ++} ++ ++void ++zpl_vap_init(vattr_t *vap, struct inode *dir, struct dentry *dentry, ++ zpl_umode_t mode, cred_t *cr) ++{ ++ vap->va_mask = ATTR_MODE; ++ vap->va_mode = mode; ++ vap->va_dentry = dentry; ++ vap->va_uid = crgetfsuid(cr); ++ ++ if (dir && dir->i_mode & S_ISGID) { ++ vap->va_gid = dir->i_gid; ++ if (S_ISDIR(mode)) ++ vap->va_mode |= S_ISGID; ++ } else { ++ vap->va_gid = crgetfsgid(cr); ++ } ++} ++ ++static int ++#ifdef HAVE_CREATE_NAMEIDATA ++zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, ++ struct nameidata *nd) ++#else ++zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, ++ bool flag) ++#endif ++{ ++ cred_t *cr = CRED(); ++ struct inode *ip; ++ vattr_t *vap; ++ int error; ++ ++ crhold(cr); ++ vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); ++ zpl_vap_init(vap, dir, dentry, mode, cr); ++ ++ error = -zfs_create(dir, (char *)dentry->d_name.name, ++ vap, 0, mode, &ip, cr, 0, NULL); ++ kmem_free(vap, sizeof(vattr_t)); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static int ++zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, ++ dev_t rdev) ++{ ++ cred_t *cr = CRED(); ++ struct inode *ip; ++ vattr_t *vap; ++ int error; ++ ++ /* ++ * We currently expect Linux to supply rdev=0 for all sockets ++ * and fifos, but we want to know if this behavior ever changes. 
++ */ ++ if (S_ISSOCK(mode) || S_ISFIFO(mode)) ++ ASSERT(rdev == 0); ++ ++ crhold(cr); ++ vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); ++ zpl_vap_init(vap, dir, dentry, mode, cr); ++ vap->va_rdev = rdev; ++ ++ error = -zfs_create(dir, (char *)dentry->d_name.name, ++ vap, 0, mode, &ip, cr, 0, NULL); ++ kmem_free(vap, sizeof(vattr_t)); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (-error); ++} ++ ++static int ++zpl_unlink(struct inode *dir, struct dentry *dentry) ++{ ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ error = -zfs_remove(dir, dname(dentry), cr); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static int ++zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) ++{ ++ cred_t *cr = CRED(); ++ vattr_t *vap; ++ struct inode *ip; ++ int error; ++ ++ crhold(cr); ++ vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); ++ zpl_vap_init(vap, dir, dentry, mode | S_IFDIR, cr); ++ ++ error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL); ++ kmem_free(vap, sizeof(vattr_t)); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static int ++zpl_rmdir(struct inode * dir, struct dentry *dentry) ++{ ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ error = -zfs_rmdir(dir, dname(dentry), NULL, cr, 0); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static int ++zpl_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ++{ ++ boolean_t issnap = ITOZSB(dentry->d_inode)->z_issnap; ++ int error; ++ ++ /* ++ * Ensure MNT_SHRINKABLE is set on snapshots to ensure they are ++ * unmounted automatically with the parent file system. This ++ * is done on the first getattr because it's not easy to get the ++ * vfsmount structure at mount time. This call path is explicitly ++ * marked unlikely to avoid any performance impact. FWIW, ext4 ++ * resorts to a similar trick for sysadmin convenience. 
++ */ ++ if (unlikely(issnap && !(mnt->mnt_flags & MNT_SHRINKABLE))) ++ mnt->mnt_flags |= MNT_SHRINKABLE; ++ ++ error = -zfs_getattr_fast(dentry->d_inode, stat); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static int ++zpl_setattr(struct dentry *dentry, struct iattr *ia) ++{ ++ cred_t *cr = CRED(); ++ vattr_t *vap; ++ int error; ++ ++ error = inode_change_ok(dentry->d_inode, ia); ++ if (error) ++ return (error); ++ ++ crhold(cr); ++ vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); ++ vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK; ++ vap->va_mode = ia->ia_mode; ++ vap->va_uid = ia->ia_uid; ++ vap->va_gid = ia->ia_gid; ++ vap->va_size = ia->ia_size; ++ vap->va_atime = ia->ia_atime; ++ vap->va_mtime = ia->ia_mtime; ++ vap->va_ctime = ia->ia_ctime; ++ ++ error = -zfs_setattr(dentry->d_inode, vap, 0, cr); ++ ++ kmem_free(vap, sizeof(vattr_t)); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static int ++zpl_rename(struct inode *sdip, struct dentry *sdentry, ++ struct inode *tdip, struct dentry *tdentry) ++{ ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ error = -zfs_rename(sdip, dname(sdentry), tdip, dname(tdentry), cr, 0); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static int ++zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) ++{ ++ cred_t *cr = CRED(); ++ vattr_t *vap; ++ struct inode *ip; ++ int error; ++ ++ crhold(cr); ++ vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); ++ zpl_vap_init(vap, dir, dentry, S_IFLNK | S_IRWXUGO, cr); ++ ++ error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0); ++ kmem_free(vap, sizeof(vattr_t)); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static void * ++zpl_follow_link(struct dentry *dentry, struct nameidata *nd) ++{ ++ cred_t *cr = CRED(); ++ struct inode *ip = dentry->d_inode; ++ struct iovec iov; ++ uio_t uio; ++ char *link; ++ int error; ++ ++ crhold(cr); ++ ++ iov.iov_len = MAXPATHLEN; ++ iov.iov_base = link = kmem_zalloc(MAXPATHLEN, KM_SLEEP); ++ ++ uio.uio_iov = &iov; ++ uio.uio_iovcnt = 1; ++ uio.uio_resid = (MAXPATHLEN - 1); ++ uio.uio_segflg = UIO_SYSSPACE; ++ ++ error = -zfs_readlink(ip, &uio, cr); ++ if (error) { ++ kmem_free(link, MAXPATHLEN); ++ nd_set_link(nd, ERR_PTR(error)); ++ } else { ++ nd_set_link(nd, link); ++ } ++ ++ crfree(cr); ++ return (NULL); ++} ++ ++static void ++zpl_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) ++{ ++ const char *link = nd_get_link(nd); ++ ++ if (!IS_ERR(link)) ++ kmem_free(link, MAXPATHLEN); ++} ++ ++static int ++zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) ++{ ++ cred_t *cr = CRED(); ++ struct inode *ip = old_dentry->d_inode; ++ int error; ++ ++ if (ip->i_nlink >= ZFS_LINK_MAX) ++ return -EMLINK; ++ ++ crhold(cr); ++ ip->i_ctime = CURRENT_TIME_SEC; ++ igrab(ip); /* Use ihold() if available */ ++ ++ error = -zfs_link(dir, ip, dname(dentry), cr); ++ if (error) { ++ iput(ip); ++ goto out; ++ } ++ ++ d_instantiate(dentry, ip); ++out: ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++#ifdef HAVE_INODE_TRUNCATE_RANGE ++static void ++zpl_truncate_range(struct inode* ip, loff_t start, loff_t end) ++{ ++ cred_t *cr = CRED(); ++ flock64_t bf; ++ ++ ASSERT3S(start, <=, end); ++ ++ /* ++ * zfs_freesp() will interpret (len == 0) as meaning "truncate until ++ * the end of the file". We don't want that. 
++ */ ++ if (start == end) ++ return; ++ ++ crhold(cr); ++ ++ bf.l_type = F_WRLCK; ++ bf.l_whence = 0; ++ bf.l_start = start; ++ bf.l_len = end - start; ++ bf.l_pid = 0; ++ zfs_space(ip, F_FREESP, &bf, FWRITE, start, cr); ++ ++ crfree(cr); ++} ++#endif /* HAVE_INODE_TRUNCATE_RANGE */ ++ ++#ifdef HAVE_INODE_FALLOCATE ++static long ++zpl_fallocate(struct inode *ip, int mode, loff_t offset, loff_t len) ++{ ++ return zpl_fallocate_common(ip, mode, offset, len); ++} ++#endif /* HAVE_INODE_FALLOCATE */ ++ ++ ++const struct inode_operations zpl_inode_operations = { ++ .create = zpl_create, ++ .link = zpl_link, ++ .unlink = zpl_unlink, ++ .symlink = zpl_symlink, ++ .mkdir = zpl_mkdir, ++ .rmdir = zpl_rmdir, ++ .mknod = zpl_mknod, ++ .rename = zpl_rename, ++ .setattr = zpl_setattr, ++ .getattr = zpl_getattr, ++ .setxattr = generic_setxattr, ++ .getxattr = generic_getxattr, ++ .removexattr = generic_removexattr, ++ .listxattr = zpl_xattr_list, ++#ifdef HAVE_INODE_TRUNCATE_RANGE ++ .truncate_range = zpl_truncate_range, ++#endif /* HAVE_INODE_TRUNCATE_RANGE */ ++#ifdef HAVE_INODE_FALLOCATE ++ .fallocate = zpl_fallocate, ++#endif /* HAVE_INODE_FALLOCATE */ ++}; ++ ++const struct inode_operations zpl_dir_inode_operations = { ++ .create = zpl_create, ++ .lookup = zpl_lookup, ++ .link = zpl_link, ++ .unlink = zpl_unlink, ++ .symlink = zpl_symlink, ++ .mkdir = zpl_mkdir, ++ .rmdir = zpl_rmdir, ++ .mknod = zpl_mknod, ++ .rename = zpl_rename, ++ .setattr = zpl_setattr, ++ .getattr = zpl_getattr, ++ .setxattr = generic_setxattr, ++ .getxattr = generic_getxattr, ++ .removexattr = generic_removexattr, ++ .listxattr = zpl_xattr_list, ++}; ++ ++const struct inode_operations zpl_symlink_inode_operations = { ++ .readlink = generic_readlink, ++ .follow_link = zpl_follow_link, ++ .put_link = zpl_put_link, ++ .setattr = zpl_setattr, ++ .getattr = zpl_getattr, ++ .setxattr = generic_setxattr, ++ .getxattr = generic_getxattr, ++ .removexattr = generic_removexattr, ++ .listxattr = zpl_xattr_list, ++}; ++ ++const struct inode_operations zpl_special_inode_operations = { ++ .setattr = zpl_setattr, ++ .getattr = zpl_getattr, ++ .setxattr = generic_setxattr, ++ .getxattr = generic_getxattr, ++ .removexattr = generic_removexattr, ++ .listxattr = zpl_xattr_list, ++}; +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zpl_super.c linux-3.2.33-go/fs/zfs/zfs/zpl_super.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zpl_super.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zpl_super.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,342 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2011, Lawrence Livermore National Security, LLC. 
++ */ ++ ++ ++#include ++#include ++#include ++#include ++#include ++ ++ ++static struct inode * ++zpl_inode_alloc(struct super_block *sb) ++{ ++ struct inode *ip; ++ ++ VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0); ++ ip->i_version = 1; ++ ++ return (ip); ++} ++ ++static void ++zpl_inode_destroy(struct inode *ip) ++{ ++ ASSERT(atomic_read(&ip->i_count) == 0); ++ zfs_inode_destroy(ip); ++} ++ ++/* ++ * When ->drop_inode() is called its return value indicates if the ++ * inode should be evicted from the inode cache. If the inode is ++ * unhashed and has no links the default policy is to evict it ++ * immediately. ++ * ++ * Prior to 2.6.36 this eviction was accomplished by the vfs calling ++ * ->delete_inode(). It was ->delete_inode()'s responsibility to ++ * truncate the inode pages and call clear_inode(). The call to ++ * clear_inode() synchronously invalidates all the buffers and ++ * calls ->clear_inode(). It was ->clear_inode()'s responsibility ++ * to cleanup and filesystem specific data before freeing the inode. ++ * ++ * This elaborate mechanism was replaced by ->evict_inode() which ++ * does the job of both ->delete_inode() and ->clear_inode(). It ++ * will be called exactly once, and when it returns the inode must ++ * be in a state where it can simply be freed.i ++ * ++ * The ->evict_inode() callback must minimally truncate the inode pages, ++ * and call clear_inode(). For 2.6.35 and later kernels this will ++ * simply update the inode state, with the sync occurring before the ++ * truncate in evict(). For earlier kernels clear_inode() maps to ++ * end_writeback() which is responsible for completing all outstanding ++ * write back. In either case, once this is done it is safe to cleanup ++ * any remaining inode specific data via zfs_inactive(). ++ * remaining filesystem specific data. ++ */ ++#ifdef HAVE_EVICT_INODE ++static void ++zpl_evict_inode(struct inode *ip) ++{ ++ truncate_setsize(ip, 0); ++ clear_inode(ip); ++ zfs_inactive(ip); ++} ++ ++#else ++ ++static void ++zpl_clear_inode(struct inode *ip) ++{ ++ zfs_inactive(ip); ++} ++ ++static void ++zpl_inode_delete(struct inode *ip) ++{ ++ truncate_setsize(ip, 0); ++ clear_inode(ip); ++} ++ ++#endif /* HAVE_EVICT_INODE */ ++ ++static void ++zpl_put_super(struct super_block *sb) ++{ ++ int error; ++ ++ error = -zfs_umount(sb); ++ ASSERT3S(error, <=, 0); ++} ++ ++static int ++zpl_sync_fs(struct super_block *sb, int wait) ++{ ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ error = -zfs_sync(sb, wait, cr); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static int ++zpl_statfs(struct dentry *dentry, struct kstatfs *statp) ++{ ++ int error; ++ ++ error = -zfs_statvfs(dentry, statp); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static int ++zpl_remount_fs(struct super_block *sb, int *flags, char *data) ++{ ++ int error; ++ error = -zfs_remount(sb, flags, data); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static void ++zpl_umount_begin(struct super_block *sb) ++{ ++ zfs_sb_t *zsb = sb->s_fs_info; ++ int count; ++ ++ /* ++ * Best effort to unmount snapshots in .zfs/snapshot/. Normally this ++ * isn't required because snapshots have the MNT_SHRINKABLE flag set. 
++ */ ++ if (zsb->z_ctldir) ++ (void) zfsctl_unmount_snapshots(zsb, MNT_FORCE, &count); ++} ++ ++/* ++ * The Linux VFS automatically handles the following flags: ++ * MNT_NOSUID, MNT_NODEV, MNT_NOEXEC, MNT_NOATIME, MNT_READONLY ++ */ ++#ifdef HAVE_SHOW_OPTIONS_WITH_DENTRY ++static int ++zpl_show_options(struct seq_file *seq, struct dentry *root) ++{ ++ zfs_sb_t *zsb = root->d_sb->s_fs_info; ++ ++ seq_printf(seq, ",%s", zsb->z_flags & ZSB_XATTR ? "xattr" : "noxattr"); ++ ++ return (0); ++} ++#else ++static int ++zpl_show_options(struct seq_file *seq, struct vfsmount *vfsp) ++{ ++ zfs_sb_t *zsb = vfsp->mnt_sb->s_fs_info; ++ ++ seq_printf(seq, ",%s", zsb->z_flags & ZSB_XATTR ? "xattr" : "noxattr"); ++ ++ return (0); ++} ++#endif /* HAVE_SHOW_OPTIONS_WITH_DENTRY */ ++ ++static int ++zpl_fill_super(struct super_block *sb, void *data, int silent) ++{ ++ int error; ++ ++ error = -zfs_domount(sb, data, silent); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++#ifdef HAVE_MOUNT_NODEV ++static struct dentry * ++zpl_mount(struct file_system_type *fs_type, int flags, ++ const char *osname, void *data) ++{ ++ zpl_mount_data_t zmd = { osname, data }; ++ ++ return mount_nodev(fs_type, flags, &zmd, zpl_fill_super); ++} ++#else ++static int ++zpl_get_sb(struct file_system_type *fs_type, int flags, ++ const char *osname, void *data, struct vfsmount *mnt) ++{ ++ zpl_mount_data_t zmd = { osname, data }; ++ ++ return get_sb_nodev(fs_type, flags, &zmd, zpl_fill_super, mnt); ++} ++#endif /* HAVE_MOUNT_NODEV */ ++ ++static void ++zpl_kill_sb(struct super_block *sb) ++{ ++ zfs_preumount(sb); ++ kill_anon_super(sb); ++} ++ ++#ifdef HAVE_SHRINK ++/* ++ * Linux 3.1 - 3.x API ++ * ++ * The Linux 3.1 API introduced per-sb cache shrinkers to replace the ++ * global ones. This allows us a mechanism to cleanly target a specific ++ * zfs file system when the dnode and inode caches grow too large. ++ * ++ * In addition, the 3.0 kernel added the iterate_supers_type() helper ++ * function which is used to safely walk all of the zfs file systems. ++ */ ++static void ++zpl_prune_sb(struct super_block *sb, void *arg) ++{ ++ int objects = 0; ++ int error; ++ ++ error = -zfs_sb_prune(sb, *(unsigned long *)arg, &objects); ++ ASSERT3S(error, <=, 0); ++ ++ return; ++} ++ ++void ++zpl_prune_sbs(int64_t bytes_to_scan, void *private) ++{ ++ unsigned long nr_to_scan = (bytes_to_scan / sizeof(znode_t)); ++ ++ iterate_supers_type(&zpl_fs_type, zpl_prune_sb, &nr_to_scan); ++ kmem_reap(); ++} ++#else ++/* ++ * Linux 2.6.x - 3.0 API ++ * ++ * These are best effort interfaces are provided by the SPL to induce ++ * the Linux VM subsystem to reclaim a fraction of the both dnode and ++ * inode caches. Ideally, we want to just target the zfs file systems ++ * however our only option is to reclaim from them all. ++ */ ++void ++zpl_prune_sbs(int64_t bytes_to_scan, void *private) ++{ ++ unsigned long nr_to_scan = (bytes_to_scan / sizeof(znode_t)); ++ ++ shrink_dcache_memory(nr_to_scan, GFP_KERNEL); ++ shrink_icache_memory(nr_to_scan, GFP_KERNEL); ++ kmem_reap(); ++} ++#endif /* HAVE_SHRINK */ ++ ++#ifdef HAVE_NR_CACHED_OBJECTS ++static int ++zpl_nr_cached_objects(struct super_block *sb) ++{ ++ zfs_sb_t *zsb = sb->s_fs_info; ++ int nr; ++ ++ mutex_enter(&zsb->z_znodes_lock); ++ nr = zsb->z_nr_znodes; ++ mutex_exit(&zsb->z_znodes_lock); ++ ++ return (nr); ++} ++#endif /* HAVE_NR_CACHED_OBJECTS */ ++ ++#ifdef HAVE_FREE_CACHED_OBJECTS ++/* ++ * Attempt to evict some meta data from the cache. 
The ARC operates in ++ * terms of bytes while the Linux VFS uses objects. Now because this is ++ * just a best effort eviction and the exact values aren't critical so we ++ * extrapolate from an object count to a byte size using the znode_t size. ++ */ ++static void ++zpl_free_cached_objects(struct super_block *sb, int nr_to_scan) ++{ ++ arc_adjust_meta(nr_to_scan * sizeof(znode_t), B_FALSE); ++} ++#endif /* HAVE_FREE_CACHED_OBJECTS */ ++ ++const struct super_operations zpl_super_operations = { ++ .alloc_inode = zpl_inode_alloc, ++ .destroy_inode = zpl_inode_destroy, ++ .dirty_inode = NULL, ++ .write_inode = NULL, ++ .drop_inode = NULL, ++#ifdef HAVE_EVICT_INODE ++ .evict_inode = zpl_evict_inode, ++#else ++ .clear_inode = zpl_clear_inode, ++ .delete_inode = zpl_inode_delete, ++#endif /* HAVE_EVICT_INODE */ ++ .put_super = zpl_put_super, ++ .sync_fs = zpl_sync_fs, ++ .statfs = zpl_statfs, ++ .remount_fs = zpl_remount_fs, ++ .umount_begin = zpl_umount_begin, ++ .show_options = zpl_show_options, ++ .show_stats = NULL, ++#ifdef HAVE_NR_CACHED_OBJECTS ++ .nr_cached_objects = zpl_nr_cached_objects, ++#endif /* HAVE_NR_CACHED_OBJECTS */ ++#ifdef HAVE_FREE_CACHED_OBJECTS ++ .free_cached_objects = zpl_free_cached_objects, ++#endif /* HAVE_FREE_CACHED_OBJECTS */ ++}; ++ ++struct file_system_type zpl_fs_type = { ++ .owner = THIS_MODULE, ++ .name = ZFS_DRIVER, ++#ifdef HAVE_MOUNT_NODEV ++ .mount = zpl_mount, ++#else ++ .get_sb = zpl_get_sb, ++#endif /* HAVE_MOUNT_NODEV */ ++ .kill_sb = zpl_kill_sb, ++}; +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zpl_xattr.c linux-3.2.33-go/fs/zfs/zfs/zpl_xattr.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zpl_xattr.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zpl_xattr.c 2012-11-16 23:25:34.351039311 +0100 +@@ -0,0 +1,678 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2011, Lawrence Livermore National Security, LLC. ++ * ++ * Extended attributes (xattr) on Solaris are implemented as files ++ * which exist in a hidden xattr directory. These extended attributes ++ * can be accessed using the attropen() system call which opens ++ * the extended attribute. It can then be manipulated just like ++ * a standard file descriptor. This has a couple advantages such ++ * as practically no size limit on the file, and the extended ++ * attributes permissions may differ from those of the parent file. ++ * This interface is really quite clever, but it's also completely ++ * different than what is supported on Linux. It also comes with a ++ * steep performance penalty when accessing small xattrs because they ++ * are not stored with the parent file. 
++ * ++ * Under Linux extended attributes are manipulated by the system ++ * calls getxattr(2), setxattr(2), and listxattr(2). They consider ++ * extended attributes to be name/value pairs where the name is a ++ * NULL terminated string. The name must also include one of the ++ * following namespace prefixes: ++ * ++ * user - No restrictions and is available to user applications. ++ * trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use. ++ * system - Used for access control lists (system.nfs4_acl, etc). ++ * security - Used by SELinux to store a files security context. ++ * ++ * The value under Linux to limited to 65536 bytes of binary data. ++ * In practice, individual xattrs tend to be much smaller than this ++ * and are typically less than 100 bytes. A good example of this ++ * are the security.selinux xattrs which are less than 100 bytes and ++ * exist for every file when xattr labeling is enabled. ++ * ++ * The Linux xattr implemenation has been written to take advantage of ++ * this typical usage. When the dataset property 'xattr=sa' is set, ++ * then xattrs will be preferentially stored as System Attributes (SA). ++ * This allows tiny xattrs (~100 bytes) to be stored with the dnode and ++ * up to 64k of xattrs to be stored in the spill block. If additional ++ * xattr space is required, which is unlikely under Linux, they will ++ * be stored using the traditional directory approach. ++ * ++ * This optimization results in roughly a 3x performance improvement ++ * when accessing xattrs because it avoids the need to perform a seek ++ * for every xattr value. When multiple xattrs are stored per-file ++ * the performance improvements are even greater because all of the ++ * xattrs stored in the spill block will be cached. ++ * ++ * However, by default SA based xattrs are disabled in the Linux port ++ * to maximize compatibility with other implementations. If you do ++ * enable SA based xattrs then they will not be visible on platforms ++ * which do not support this feature. ++ * ++ * NOTE: One additional consequence of the xattr directory implementation ++ * is that when an extended attribute is manipulated an inode is created. ++ * This inode will exist in the Linux inode cache but there will be no ++ * associated entry in the dentry cache which references it. This is ++ * safe but it may result in some confusion. Enabling SA based xattrs ++ * largely avoids the issue except in the overflow case. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++typedef struct xattr_filldir { ++ size_t size; ++ size_t offset; ++ char *buf; ++ struct inode *inode; ++} xattr_filldir_t; ++ ++static int ++zpl_xattr_filldir(void *arg, const char *name, int name_len, ++ loff_t offset, uint64_t objnum, unsigned int d_type) ++{ ++ xattr_filldir_t *xf = arg; ++ ++ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) ++ if (!(ITOZSB(xf->inode)->z_flags & ZSB_XATTR)) ++ return (0); ++ ++ if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) ++ if (!capable(CAP_SYS_ADMIN)) ++ return (0); ++ ++ /* When xf->buf is NULL only calculate the required size. 
*/ ++ if (xf->buf) { ++ if (xf->offset + name_len + 1 > xf->size) ++ return (-ERANGE); ++ ++ memcpy(xf->buf + xf->offset, name, name_len); ++ xf->buf[xf->offset + name_len] = '\0'; ++ } ++ ++ xf->offset += (name_len + 1); ++ ++ return (0); ++} ++ ++static ssize_t ++zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr) ++{ ++ struct inode *ip = xf->inode; ++ struct inode *dxip = NULL; ++ loff_t pos = 3; /* skip '.', '..', and '.zfs' entries. */ ++ int error; ++ ++ /* Lookup the xattr directory */ ++ error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL); ++ if (error) { ++ if (error == -ENOENT) ++ error = 0; ++ ++ return (error); ++ } ++ ++ /* Fill provided buffer via zpl_zattr_filldir helper */ ++ error = -zfs_readdir(dxip, (void *)xf, zpl_xattr_filldir, &pos, cr); ++ iput(dxip); ++ ++ return (error); ++} ++ ++static ssize_t ++zpl_xattr_list_sa(xattr_filldir_t *xf) ++{ ++ znode_t *zp = ITOZ(xf->inode); ++ nvpair_t *nvp = NULL; ++ int error = 0; ++ ++ mutex_enter(&zp->z_lock); ++ if (zp->z_xattr_cached == NULL) ++ error = -zfs_sa_get_xattr(zp); ++ mutex_exit(&zp->z_lock); ++ ++ if (error) ++ return (error); ++ ++ ASSERT(zp->z_xattr_cached); ++ ++ while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) { ++ ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY); ++ ++ error = zpl_xattr_filldir((void *)xf, nvpair_name(nvp), ++ strlen(nvpair_name(nvp)), 0, 0, 0); ++ if (error) ++ return (error); ++ } ++ ++ return (0); ++} ++ ++ssize_t ++zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) ++{ ++ znode_t *zp = ITOZ(dentry->d_inode); ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ xattr_filldir_t xf = { buffer_size, 0, buffer, dentry->d_inode }; ++ cred_t *cr = CRED(); ++ int error = 0; ++ ++ crhold(cr); ++ rw_enter(&zp->z_xattr_lock, RW_READER); ++ ++ if (zsb->z_use_sa && zp->z_is_sa) { ++ error = zpl_xattr_list_sa(&xf); ++ if (error) ++ goto out; ++ } ++ ++ error = zpl_xattr_list_dir(&xf, cr); ++ if (error) ++ goto out; ++ ++ error = xf.offset; ++out: ++ ++ rw_exit(&zp->z_xattr_lock); ++ crfree(cr); ++ ++ return (error); ++} ++ ++static int ++zpl_xattr_get_dir(struct inode *ip, const char *name, void *value, ++ size_t size, cred_t *cr) ++{ ++ struct inode *dxip = NULL; ++ struct inode *xip = NULL; ++ int error; ++ ++ /* Lookup the xattr directory */ ++ error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL); ++ if (error) ++ goto out; ++ ++ /* Lookup a specific xattr name in the directory */ ++ error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL); ++ if (error) ++ goto out; ++ ++ if (!size) { ++ error = i_size_read(xip); ++ goto out; ++ } ++ ++ error = zpl_read_common(xip, value, size, 0, UIO_SYSSPACE, 0, cr); ++out: ++ if (xip) ++ iput(xip); ++ ++ if (dxip) ++ iput(dxip); ++ ++ return (error); ++} ++ ++static int ++zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size) ++{ ++ znode_t *zp = ITOZ(ip); ++ uchar_t *nv_value; ++ uint_t nv_size; ++ int error = 0; ++ ++ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); ++ ++ mutex_enter(&zp->z_lock); ++ if (zp->z_xattr_cached == NULL) ++ error = -zfs_sa_get_xattr(zp); ++ mutex_exit(&zp->z_lock); ++ ++ if (error) ++ return (error); ++ ++ ASSERT(zp->z_xattr_cached); ++ error = -nvlist_lookup_byte_array(zp->z_xattr_cached, name, ++ &nv_value, &nv_size); ++ if (error) ++ return (error); ++ ++ if (!size) ++ return (nv_size); ++ ++ memcpy(value, nv_value, MIN(size, nv_size)); ++ ++ return (MIN(size, nv_size)); ++} ++ ++static int ++__zpl_xattr_get(struct inode *ip, const char *name, void 
*value, size_t size, ++ cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ int error; ++ ++ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); ++ ++ if (zsb->z_use_sa && zp->z_is_sa) { ++ error = zpl_xattr_get_sa(ip, name, value, size); ++ if (error >= 0) ++ goto out; ++ } ++ ++ error = zpl_xattr_get_dir(ip, name, value, size, cr); ++out: ++ if (error == -ENOENT) ++ error = -ENODATA; ++ ++ return (error); ++} ++ ++static int ++zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size) ++{ ++ znode_t *zp = ITOZ(ip); ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ rw_enter(&zp->z_xattr_lock, RW_READER); ++ error = __zpl_xattr_get(ip, name, value, size, cr); ++ rw_exit(&zp->z_xattr_lock); ++ crfree(cr); ++ ++ return (error); ++} ++ ++static int ++zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, ++ size_t size, int flags, cred_t *cr) ++{ ++ struct inode *dxip = NULL; ++ struct inode *xip = NULL; ++ vattr_t *vap = NULL; ++ ssize_t wrote; ++ int error; ++ const int xattr_mode = S_IFREG | 0644; ++ ++ /* Lookup the xattr directory and create it if required. */ ++ error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR | CREATE_XATTR_DIR, ++ cr, NULL, NULL); ++ if (error) ++ goto out; ++ ++ /* Lookup a specific xattr name in the directory */ ++ error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL); ++ if (error && (error != -ENOENT)) ++ goto out; ++ ++ error = 0; ++ ++ /* Remove a specific name xattr when value is set to NULL. */ ++ if (value == NULL) { ++ if (xip) ++ error = -zfs_remove(dxip, (char *)name, cr); ++ ++ goto out; ++ } ++ ++ /* Lookup failed create a new xattr. */ ++ if (xip == NULL) { ++ vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); ++ vap->va_mode = xattr_mode; ++ vap->va_mask = ATTR_MODE; ++ vap->va_uid = crgetfsuid(cr); ++ vap->va_gid = crgetfsgid(cr); ++ ++ error = -zfs_create(dxip, (char *)name, vap, 0, 0644, &xip, ++ cr, 0, NULL); ++ if (error) ++ goto out; ++ } ++ ++ ASSERT(xip != NULL); ++ ++ error = -zfs_freesp(ITOZ(xip), 0, 0, xattr_mode, TRUE); ++ if (error) ++ goto out; ++ ++ wrote = zpl_write_common(xip, value, size, 0, UIO_SYSSPACE, 0, cr); ++ if (wrote < 0) ++ error = wrote; ++ ++out: ++ if (vap) ++ kmem_free(vap, sizeof(vattr_t)); ++ ++ if (xip) ++ iput(xip); ++ ++ if (dxip) ++ iput(dxip); ++ ++ if (error == -ENOENT) ++ error = -ENODATA; ++ ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static int ++zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value, ++ size_t size, int flags, cred_t *cr) ++{ ++ znode_t *zp = ITOZ(ip); ++ nvlist_t *nvl; ++ size_t sa_size; ++ int error; ++ ++ ASSERT(zp->z_xattr_cached); ++ nvl = zp->z_xattr_cached; ++ ++ if (value == NULL) { ++ error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); ++ if (error == -ENOENT) ++ error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr); ++ } else { ++ /* Limited to 32k to keep nvpair memory allocations small */ ++ if (size > DXATTR_MAX_ENTRY_SIZE) ++ return (-EFBIG); ++ ++ /* Prevent the DXATTR SA from consuming the entire SA region */ ++ error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); ++ if (error) ++ return (error); ++ ++ if (sa_size > DXATTR_MAX_SA_SIZE) ++ return (-EFBIG); ++ ++ error = -nvlist_add_byte_array(nvl, name, ++ (uchar_t *)value, size); ++ if (error) ++ return (error); ++ } ++ ++ /* Update the SA for additions, modifications, and removals. 
*/ ++ if (!error) ++ error = -zfs_sa_set_xattr(zp); ++ ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static int ++zpl_xattr_set(struct inode *ip, const char *name, const void *value, ++ size_t size, int flags) ++{ ++ znode_t *zp = ITOZ(ip); ++ zfs_sb_t *zsb = ZTOZSB(zp); ++ cred_t *cr = CRED(); ++ int error; ++ ++ crhold(cr); ++ rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER); ++ ++ /* ++ * Before setting the xattr check to see if it already exists. ++ * This is done to ensure the following optional flags are honored. ++ * ++ * XATTR_CREATE: fail if xattr already exists ++ * XATTR_REPLACE: fail if xattr does not exist ++ */ ++ error = __zpl_xattr_get(ip, name, NULL, 0, cr); ++ if (error < 0) { ++ if (error != -ENODATA) ++ goto out; ++ ++ if ((error == -ENODATA) && (flags & XATTR_REPLACE)) ++ goto out; ++ } else { ++ error = -EEXIST; ++ if (flags & XATTR_CREATE) ++ goto out; ++ } ++ ++ /* Preferentially store the xattr as a SA for better performance */ ++ if (zsb->z_use_sa && zsb->z_xattr_sa && zp->z_is_sa) { ++ error = zpl_xattr_set_sa(ip, name, value, size, flags, cr); ++ if (error == 0) ++ goto out; ++ } ++ ++ error = zpl_xattr_set_dir(ip, name, value, size, flags, cr); ++out: ++ rw_exit(&ITOZ(ip)->z_xattr_lock); ++ crfree(cr); ++ ASSERT3S(error, <=, 0); ++ ++ return (error); ++} ++ ++static int ++__zpl_xattr_user_get(struct inode *ip, const char *name, ++ void *value, size_t size) ++{ ++ char *xattr_name; ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ ++ if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) ++ return -EOPNOTSUPP; ++ ++ xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); ++ error = zpl_xattr_get(ip, xattr_name, value, size); ++ strfree(xattr_name); ++ ++ return (error); ++} ++ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get); ++ ++static int ++__zpl_xattr_user_set(struct inode *ip, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ char *xattr_name; ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ ++ if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) ++ return -EOPNOTSUPP; ++ ++ xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); ++ error = zpl_xattr_set(ip, xattr_name, value, size, flags); ++ strfree(xattr_name); ++ ++ return (error); ++} ++ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set); ++ ++xattr_handler_t zpl_xattr_user_handler = { ++ .prefix = XATTR_USER_PREFIX, ++ .get = zpl_xattr_user_get, ++ .set = zpl_xattr_user_set, ++}; ++ ++static int ++__zpl_xattr_trusted_get(struct inode *ip, const char *name, ++ void *value, size_t size) ++{ ++ char *xattr_name; ++ int error; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EACCES; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ ++ xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); ++ error = zpl_xattr_get(ip, xattr_name, value, size); ++ strfree(xattr_name); ++ ++ return (error); ++} ++ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get); ++ ++static int ++__zpl_xattr_trusted_set(struct inode *ip, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ char *xattr_name; ++ int error; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EACCES; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ ++ xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); ++ error = zpl_xattr_set(ip, xattr_name, value, size, flags); ++ strfree(xattr_name); ++ ++ return (error); ++} ++ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set); ++ ++xattr_handler_t zpl_xattr_trusted_handler = { ++ .prefix = XATTR_TRUSTED_PREFIX, ++ .get = zpl_xattr_trusted_get, ++ .set = 
zpl_xattr_trusted_set, ++}; ++ ++static int ++__zpl_xattr_security_get(struct inode *ip, const char *name, ++ void *value, size_t size) ++{ ++ char *xattr_name; ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ ++ xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); ++ error = zpl_xattr_get(ip, xattr_name, value, size); ++ strfree(xattr_name); ++ ++ return (error); ++} ++ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get); ++ ++static int ++__zpl_xattr_security_set(struct inode *ip, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ char *xattr_name; ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ ++ xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); ++ error = zpl_xattr_set(ip, xattr_name, value, size, flags); ++ strfree(xattr_name); ++ ++ return (error); ++} ++ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set); ++ ++#ifdef HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY ++static int ++__zpl_xattr_security_init(struct inode *ip, const struct xattr *xattrs, ++ void *fs_info) ++{ ++ const struct xattr *xattr; ++ int error = 0; ++ ++ for (xattr = xattrs; xattr->name != NULL; xattr++) { ++ error = __zpl_xattr_security_set(ip, ++ xattr->name, xattr->value, xattr->value_len, 0); ++ ++ if (error < 0) ++ break; ++ } ++ ++ return (error); ++} ++ ++int ++zpl_xattr_security_init(struct inode *ip, struct inode *dip, ++ const struct qstr *qstr) ++{ ++ return security_inode_init_security(ip, dip, qstr, ++ &__zpl_xattr_security_init, NULL); ++} ++ ++#else ++int ++zpl_xattr_security_init(struct inode *ip, struct inode *dip, ++ const struct qstr *qstr) ++{ ++ int error; ++ size_t len; ++ void *value; ++ char *name; ++ ++ error = zpl_security_inode_init_security(ip, dip, qstr, ++ &name, &value, &len); ++ if (error) { ++ if (error == -EOPNOTSUPP) ++ return 0; ++ return (error); ++ } ++ ++ error = __zpl_xattr_security_set(ip, name, value, len, 0); ++ ++ kfree(name); ++ kfree(value); ++ ++ return (error); ++} ++#endif /* HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY */ ++ ++xattr_handler_t zpl_xattr_security_handler = { ++ .prefix = XATTR_SECURITY_PREFIX, ++ .get = zpl_xattr_security_get, ++ .set = zpl_xattr_security_set, ++}; ++ ++xattr_handler_t *zpl_xattr_handlers[] = { ++ &zpl_xattr_security_handler, ++ &zpl_xattr_trusted_handler, ++ &zpl_xattr_user_handler, ++#ifdef HAVE_POSIX_ACLS ++ &zpl_xattr_acl_access_handler, ++ &zpl_xattr_acl_default_handler, ++#endif /* HAVE_POSIX_ACLS */ ++ NULL ++}; +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zrlock.c linux-3.2.33-go/fs/zfs/zfs/zrlock.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zrlock.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zrlock.c 2012-11-16 23:25:34.350039322 +0100 +@@ -0,0 +1,207 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* ++ * A Zero Reference Lock (ZRL) is a reference count that can lock out new ++ * references only when the count is zero and only without waiting if the count ++ * is not already zero. It is similar to a read-write lock in that it allows ++ * multiple readers and only a single writer, but it does not allow a writer to ++ * block while waiting for readers to exit, and therefore the question of ++ * reader/writer priority is moot (no WRWANT bit). Since the equivalent of ++ * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it ++ * is perfectly safe for the same reader to acquire the same lock multiple ++ * times. The fact that a ZRL is reentrant for readers (through multiple calls ++ * to zrl_add()) makes it convenient for determining whether something is ++ * actively referenced without the fuss of flagging lock ownership across ++ * function calls. ++ */ ++#include ++ ++/* ++ * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is ++ * treated as zero references. ++ */ ++#define ZRL_LOCKED ((uint32_t)-1) ++#define ZRL_DESTROYED -2 ++ ++void ++zrl_init(zrlock_t *zrl) ++{ ++ mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL); ++ zrl->zr_refcount = 0; ++ cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL); ++#ifdef ZFS_DEBUG ++ zrl->zr_owner = NULL; ++ zrl->zr_caller = NULL; ++#endif ++} ++ ++void ++zrl_destroy(zrlock_t *zrl) ++{ ++ ASSERT(zrl->zr_refcount == 0); ++ ++ mutex_destroy(&zrl->zr_mtx); ++ zrl->zr_refcount = ZRL_DESTROYED; ++ cv_destroy(&zrl->zr_cv); ++} ++ ++void ++#ifdef ZFS_DEBUG ++zrl_add_debug(zrlock_t *zrl, const char *zc) ++#else ++zrl_add(zrlock_t *zrl) ++#endif ++{ ++ uint32_t n = (uint32_t)zrl->zr_refcount; ++ ++ while (n != ZRL_LOCKED) { ++ uint32_t cas = atomic_cas_32( ++ (uint32_t *)&zrl->zr_refcount, n, n + 1); ++ if (cas == n) { ++ ASSERT((int32_t)n >= 0); ++#ifdef ZFS_DEBUG ++ if (zrl->zr_owner == curthread) { ++ DTRACE_PROBE2(zrlock__reentry, ++ zrlock_t *, zrl, uint32_t, n); ++ } ++ zrl->zr_owner = curthread; ++ zrl->zr_caller = zc; ++#endif ++ return; ++ } ++ n = cas; ++ } ++ ++ mutex_enter(&zrl->zr_mtx); ++ while (zrl->zr_refcount == ZRL_LOCKED) { ++ cv_wait(&zrl->zr_cv, &zrl->zr_mtx); ++ } ++ ASSERT(zrl->zr_refcount >= 0); ++ zrl->zr_refcount++; ++#ifdef ZFS_DEBUG ++ zrl->zr_owner = curthread; ++ zrl->zr_caller = zc; ++#endif ++ mutex_exit(&zrl->zr_mtx); ++} ++ ++void ++zrl_remove(zrlock_t *zrl) ++{ ++ uint32_t n; ++ ++ n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount); ++ ASSERT((int32_t)n >= 0); ++#ifdef ZFS_DEBUG ++ if (zrl->zr_owner == curthread) { ++ zrl->zr_owner = NULL; ++ zrl->zr_caller = NULL; ++ } ++#endif ++} ++ ++int ++zrl_tryenter(zrlock_t *zrl) ++{ ++ uint32_t n = (uint32_t)zrl->zr_refcount; ++ ++ if (n == 0) { ++ uint32_t cas = atomic_cas_32( ++ (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED); ++ if (cas == 0) { ++#ifdef ZFS_DEBUG ++ ASSERT(zrl->zr_owner == NULL); ++ zrl->zr_owner = curthread; ++#endif ++ return (1); ++ } ++ } ++ ++ ASSERT((int32_t)n > ZRL_DESTROYED); ++ ++ return (0); ++} ++ ++void ++zrl_exit(zrlock_t *zrl) ++{ ++ ASSERT(zrl->zr_refcount == ZRL_LOCKED); ++ ++ mutex_enter(&zrl->zr_mtx); ++#ifdef ZFS_DEBUG ++ ASSERT(zrl->zr_owner == curthread); ++ 
zrl->zr_owner = NULL; ++ membar_producer(); /* make sure the owner store happens first */ ++#endif ++ zrl->zr_refcount = 0; ++ cv_broadcast(&zrl->zr_cv); ++ mutex_exit(&zrl->zr_mtx); ++} ++ ++int ++zrl_refcount(zrlock_t *zrl) ++{ ++ int n; ++ ++ ASSERT(zrl->zr_refcount > ZRL_DESTROYED); ++ ++ n = (int)zrl->zr_refcount; ++ return (n <= 0 ? 0 : n); ++} ++ ++int ++zrl_is_zero(zrlock_t *zrl) ++{ ++ ASSERT(zrl->zr_refcount > ZRL_DESTROYED); ++ ++ return (zrl->zr_refcount <= 0); ++} ++ ++int ++zrl_is_locked(zrlock_t *zrl) ++{ ++ ASSERT(zrl->zr_refcount > ZRL_DESTROYED); ++ ++ return (zrl->zr_refcount == ZRL_LOCKED); ++} ++ ++#ifdef ZFS_DEBUG ++kthread_t * ++zrl_owner(zrlock_t *zrl) ++{ ++ return (zrl->zr_owner); ++} ++#endif ++ ++#if defined(_KERNEL) && defined(HAVE_SPL) ++ ++#ifdef ZFS_DEBUG ++EXPORT_SYMBOL(zrl_add_debug); ++#else ++EXPORT_SYMBOL(zrl_add); ++#endif ++EXPORT_SYMBOL(zrl_remove); ++ ++#endif +diff -uNr linux-3.2.33-go.orig/fs/zfs/zfs/zvol.c linux-3.2.33-go/fs/zfs/zfs/zvol.c +--- linux-3.2.33-go.orig/fs/zfs/zfs/zvol.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zfs/zvol.c 2012-11-16 23:25:34.350039322 +0100 +@@ -0,0 +1,1503 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Rewritten for Linux by Brian Behlendorf . ++ * LLNL-CODE-403049. ++ * ++ * ZFS volume emulation driver. ++ * ++ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. ++ * Volumes are accessed through the symbolic links named: ++ * ++ * /dev// ++ * ++ * Volumes are persistent through reboot and module load. No user command ++ * needs to be run before opening and using a device. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++unsigned int zvol_inhibit_dev = 0; ++unsigned int zvol_major = ZVOL_MAJOR; ++unsigned int zvol_threads = 32; ++unsigned long zvol_max_discard_blocks = 16384; ++ ++static taskq_t *zvol_taskq; ++static kmutex_t zvol_state_lock; ++static list_t zvol_state_list; ++static char *zvol_tag = "zvol_tag"; ++ ++/* ++ * The in-core state of each volume. 
++ */ ++typedef struct zvol_state { ++ char zv_name[MAXNAMELEN]; /* name */ ++ uint64_t zv_volsize; /* advertised space */ ++ uint64_t zv_volblocksize;/* volume block size */ ++ objset_t *zv_objset; /* objset handle */ ++ uint32_t zv_flags; /* ZVOL_* flags */ ++ uint32_t zv_open_count; /* open counts */ ++ uint32_t zv_changed; /* disk changed */ ++ zilog_t *zv_zilog; /* ZIL handle */ ++ znode_t zv_znode; /* for range locking */ ++ dmu_buf_t *zv_dbuf; /* bonus handle */ ++ dev_t zv_dev; /* device id */ ++ struct gendisk *zv_disk; /* generic disk */ ++ struct request_queue *zv_queue; /* request queue */ ++ spinlock_t zv_lock; /* request queue lock */ ++ list_node_t zv_next; /* next zvol_state_t linkage */ ++} zvol_state_t; ++ ++#define ZVOL_RDONLY 0x1 ++ ++/* ++ * Find the next available range of ZVOL_MINORS minor numbers. The ++ * zvol_state_list is kept in ascending minor order so we simply need ++ * to scan the list for the first gap in the sequence. This allows us ++ * to recycle minor number as devices are created and removed. ++ */ ++static int ++zvol_find_minor(unsigned *minor) ++{ ++ zvol_state_t *zv; ++ ++ *minor = 0; ++ ASSERT(MUTEX_HELD(&zvol_state_lock)); ++ for (zv = list_head(&zvol_state_list); zv != NULL; ++ zv = list_next(&zvol_state_list, zv), *minor += ZVOL_MINORS) { ++ if (MINOR(zv->zv_dev) != MINOR(*minor)) ++ break; ++ } ++ ++ /* All minors are in use */ ++ if (*minor >= (1 << MINORBITS)) ++ return ENXIO; ++ ++ return 0; ++} ++ ++/* ++ * Find a zvol_state_t given the full major+minor dev_t. ++ */ ++static zvol_state_t * ++zvol_find_by_dev(dev_t dev) ++{ ++ zvol_state_t *zv; ++ ++ ASSERT(MUTEX_HELD(&zvol_state_lock)); ++ for (zv = list_head(&zvol_state_list); zv != NULL; ++ zv = list_next(&zvol_state_list, zv)) { ++ if (zv->zv_dev == dev) ++ return zv; ++ } ++ ++ return NULL; ++} ++ ++/* ++ * Find a zvol_state_t given the name provided at zvol_alloc() time. ++ */ ++static zvol_state_t * ++zvol_find_by_name(const char *name) ++{ ++ zvol_state_t *zv; ++ ++ ASSERT(MUTEX_HELD(&zvol_state_lock)); ++ for (zv = list_head(&zvol_state_list); zv != NULL; ++ zv = list_next(&zvol_state_list, zv)) { ++ if (!strncmp(zv->zv_name, name, MAXNAMELEN)) ++ return zv; ++ } ++ ++ return NULL; ++} ++ ++/* ++ * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. ++ */ ++void ++zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) ++{ ++ zfs_creat_t *zct = arg; ++ nvlist_t *nvprops = zct->zct_props; ++ int error; ++ uint64_t volblocksize, volsize; ++ ++ VERIFY(nvlist_lookup_uint64(nvprops, ++ zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); ++ if (nvlist_lookup_uint64(nvprops, ++ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) ++ volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); ++ ++ /* ++ * These properties must be removed from the list so the generic ++ * property setting step won't apply to them. ++ */ ++ VERIFY(nvlist_remove_all(nvprops, ++ zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); ++ (void) nvlist_remove_all(nvprops, ++ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); ++ ++ error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, ++ DMU_OT_NONE, 0, tx); ++ ASSERT(error == 0); ++ ++ error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, ++ DMU_OT_NONE, 0, tx); ++ ASSERT(error == 0); ++ ++ error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); ++ ASSERT(error == 0); ++} ++ ++/* ++ * ZFS_IOC_OBJSET_STATS entry point. 
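++ * Roughly speaking, this reports the two per-volume properties kept on
++ * disk: "volsize" comes from the "size" entry in ZVOL_ZAP_OBJ and
++ * "volblocksize" from the data block size of ZVOL_OBJ. For example, a
++ * hypothetical 1 GiB volume with an 8K block size would be reported as
++ * { volsize = 1073741824, volblocksize = 8192 }.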
++ */ ++int ++zvol_get_stats(objset_t *os, nvlist_t *nv) ++{ ++ int error; ++ dmu_object_info_t *doi; ++ uint64_t val; ++ ++ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); ++ if (error) ++ return (error); ++ ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); ++ doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); ++ error = dmu_object_info(os, ZVOL_OBJ, doi); ++ ++ if (error == 0) { ++ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, ++ doi->doi_data_block_size); ++ } ++ ++ kmem_free(doi, sizeof(dmu_object_info_t)); ++ ++ return (error); ++} ++ ++/* ++ * Sanity check volume size. ++ */ ++int ++zvol_check_volsize(uint64_t volsize, uint64_t blocksize) ++{ ++ if (volsize == 0) ++ return (EINVAL); ++ ++ if (volsize % blocksize != 0) ++ return (EINVAL); ++ ++#ifdef _ILP32 ++ if (volsize - 1 > MAXOFFSET_T) ++ return (EOVERFLOW); ++#endif ++ return (0); ++} ++ ++/* ++ * Ensure the zap is flushed then inform the VFS of the capacity change. ++ */ ++static int ++zvol_update_volsize(zvol_state_t *zv, uint64_t volsize, objset_t *os) ++{ ++ struct block_device *bdev; ++ dmu_tx_t *tx; ++ int error; ++ ++ ASSERT(MUTEX_HELD(&zvol_state_lock)); ++ ++ tx = dmu_tx_create(os); ++ dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); ++ error = dmu_tx_assign(tx, TXG_WAIT); ++ if (error) { ++ dmu_tx_abort(tx); ++ return (error); ++ } ++ ++ error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, ++ &volsize, tx); ++ dmu_tx_commit(tx); ++ ++ if (error) ++ return (error); ++ ++ error = dmu_free_long_range(os, ++ ZVOL_OBJ, volsize, DMU_OBJECT_END); ++ if (error) ++ return (error); ++ ++ bdev = bdget_disk(zv->zv_disk, 0); ++ if (!bdev) ++ return (EIO); ++/* ++ * 2.6.28 API change ++ * Added check_disk_size_change() helper function. ++ */ ++#ifdef HAVE_CHECK_DISK_SIZE_CHANGE ++ set_capacity(zv->zv_disk, volsize >> 9); ++ zv->zv_volsize = volsize; ++ check_disk_size_change(zv->zv_disk, bdev); ++#else ++ zv->zv_volsize = volsize; ++ zv->zv_changed = 1; ++ (void) check_disk_change(bdev); ++#endif /* HAVE_CHECK_DISK_SIZE_CHANGE */ ++ ++ bdput(bdev); ++ ++ return (0); ++} ++ ++/* ++ * Set ZFS_PROP_VOLSIZE set entry point. ++ */ ++int ++zvol_set_volsize(const char *name, uint64_t volsize) ++{ ++ zvol_state_t *zv; ++ dmu_object_info_t *doi; ++ objset_t *os = NULL; ++ uint64_t readonly; ++ int error; ++ ++ mutex_enter(&zvol_state_lock); ++ ++ zv = zvol_find_by_name(name); ++ if (zv == NULL) { ++ error = ENXIO; ++ goto out; ++ } ++ ++ doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); ++ ++ error = dmu_objset_hold(name, FTAG, &os); ++ if (error) ++ goto out_doi; ++ ++ if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) != 0 || ++ (error = zvol_check_volsize(volsize,doi->doi_data_block_size)) != 0) ++ goto out_doi; ++ ++ VERIFY(dsl_prop_get_integer(name, "readonly", &readonly, NULL) == 0); ++ if (readonly) { ++ error = EROFS; ++ goto out_doi; ++ } ++ ++ if (get_disk_ro(zv->zv_disk) || (zv->zv_flags & ZVOL_RDONLY)) { ++ error = EROFS; ++ goto out_doi; ++ } ++ ++ error = zvol_update_volsize(zv, volsize, os); ++out_doi: ++ kmem_free(doi, sizeof(dmu_object_info_t)); ++out: ++ if (os) ++ dmu_objset_rele(os, FTAG); ++ ++ mutex_exit(&zvol_state_lock); ++ ++ return (error); ++} ++ ++/* ++ * Sanity check volume block size. ++ */ ++int ++zvol_check_volblocksize(uint64_t volblocksize) ++{ ++ if (volblocksize < SPA_MINBLOCKSIZE || ++ volblocksize > SPA_MAXBLOCKSIZE || ++ !ISP2(volblocksize)) ++ return (EDOM); ++ ++ return (0); ++} ++ ++/* ++ * Set ZFS_PROP_VOLBLOCKSIZE set entry point. 
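++ * This is typically reached when the volblocksize property is set on an
++ * existing volume. Valid values are constrained by zvol_check_volblocksize()
++ * above to powers of two between SPA_MINBLOCKSIZE and SPA_MAXBLOCKSIZE;
++ * for example, a hypothetical zvol_set_volblocksize("tank/vol", 65536)
++ * call passes that check, while a value of 65000 would be rejected
++ * with EDOM.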
++ */ ++int ++zvol_set_volblocksize(const char *name, uint64_t volblocksize) ++{ ++ zvol_state_t *zv; ++ dmu_tx_t *tx; ++ int error; ++ ++ mutex_enter(&zvol_state_lock); ++ ++ zv = zvol_find_by_name(name); ++ if (zv == NULL) { ++ error = ENXIO; ++ goto out; ++ } ++ ++ if (get_disk_ro(zv->zv_disk) || (zv->zv_flags & ZVOL_RDONLY)) { ++ error = EROFS; ++ goto out; ++ } ++ ++ tx = dmu_tx_create(zv->zv_objset); ++ dmu_tx_hold_bonus(tx, ZVOL_OBJ); ++ error = dmu_tx_assign(tx, TXG_WAIT); ++ if (error) { ++ dmu_tx_abort(tx); ++ } else { ++ error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, ++ volblocksize, 0, tx); ++ if (error == ENOTSUP) ++ error = EBUSY; ++ dmu_tx_commit(tx); ++ if (error == 0) ++ zv->zv_volblocksize = volblocksize; ++ } ++out: ++ mutex_exit(&zvol_state_lock); ++ ++ return (error); ++} ++ ++/* ++ * Replay a TX_WRITE ZIL transaction that didn't get committed ++ * after a system failure ++ */ ++static int ++zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) ++{ ++ objset_t *os = zv->zv_objset; ++ char *data = (char *)(lr + 1); /* data follows lr_write_t */ ++ uint64_t off = lr->lr_offset; ++ uint64_t len = lr->lr_length; ++ dmu_tx_t *tx; ++ int error; ++ ++ if (byteswap) ++ byteswap_uint64_array(lr, sizeof (*lr)); ++ ++ tx = dmu_tx_create(os); ++ dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); ++ error = dmu_tx_assign(tx, TXG_WAIT); ++ if (error) { ++ dmu_tx_abort(tx); ++ } else { ++ dmu_write(os, ZVOL_OBJ, off, len, data, tx); ++ dmu_tx_commit(tx); ++ } ++ ++ return (error); ++} ++ ++static int ++zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) ++{ ++ return (ENOTSUP); ++} ++ ++/* ++ * Callback vectors for replaying records. ++ * Only TX_WRITE is needed for zvol. ++ */ ++zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { ++ (zil_replay_func_t *)zvol_replay_err, /* no such transaction type */ ++ (zil_replay_func_t *)zvol_replay_err, /* TX_CREATE */ ++ (zil_replay_func_t *)zvol_replay_err, /* TX_MKDIR */ ++ (zil_replay_func_t *)zvol_replay_err, /* TX_MKXATTR */ ++ (zil_replay_func_t *)zvol_replay_err, /* TX_SYMLINK */ ++ (zil_replay_func_t *)zvol_replay_err, /* TX_REMOVE */ ++ (zil_replay_func_t *)zvol_replay_err, /* TX_RMDIR */ ++ (zil_replay_func_t *)zvol_replay_err, /* TX_LINK */ ++ (zil_replay_func_t *)zvol_replay_err, /* TX_RENAME */ ++ (zil_replay_func_t *)zvol_replay_write, /* TX_WRITE */ ++ (zil_replay_func_t *)zvol_replay_err, /* TX_TRUNCATE */ ++ (zil_replay_func_t *)zvol_replay_err, /* TX_SETATTR */ ++ (zil_replay_func_t *)zvol_replay_err, /* TX_ACL */ ++}; ++ ++/* ++ * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. ++ * ++ * We store data in the log buffers if it's small enough. ++ * Otherwise we will later flush the data out via dmu_sync(). ++ */ ++ssize_t zvol_immediate_write_sz = 32768; ++ ++static void ++zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, ++ uint64_t offset, uint64_t size, int sync) ++{ ++ uint32_t blocksize = zv->zv_volblocksize; ++ zilog_t *zilog = zv->zv_zilog; ++ boolean_t slogging; ++ ssize_t immediate_write_sz; ++ ++ if (zil_replaying(zilog, tx)) ++ return; ++ ++ immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) ++ ? 0 : zvol_immediate_write_sz; ++ slogging = spa_has_slogs(zilog->zl_spa) && ++ (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); ++ ++ while (size) { ++ itx_t *itx; ++ lr_write_t *lr; ++ ssize_t len; ++ itx_wr_state_t write_state; ++ ++ /* ++ * Unlike zfs_log_write() we can be called with ++ * up to DMU_MAX_ACCESS/2 (5MB) writes. 
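++ * Each chunk logged by the loop below therefore picks one of three
++ * strategies: WR_INDIRECT (sync the block via dmu_sync() and log only a
++ * block pointer) for large aligned writes with no separate log device,
++ * WR_COPIED (copy the data into the itx immediately) for synchronous
++ * writes, and WR_NEED_COPY (defer the copy until the itx is committed)
++ * for everything else.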
++ */ ++ if (blocksize > immediate_write_sz && !slogging && ++ size >= blocksize && offset % blocksize == 0) { ++ write_state = WR_INDIRECT; /* uses dmu_sync */ ++ len = blocksize; ++ } else if (sync) { ++ write_state = WR_COPIED; ++ len = MIN(ZIL_MAX_LOG_DATA, size); ++ } else { ++ write_state = WR_NEED_COPY; ++ len = MIN(ZIL_MAX_LOG_DATA, size); ++ } ++ ++ itx = zil_itx_create(TX_WRITE, sizeof (*lr) + ++ (write_state == WR_COPIED ? len : 0)); ++ lr = (lr_write_t *)&itx->itx_lr; ++ if (write_state == WR_COPIED && dmu_read(zv->zv_objset, ++ ZVOL_OBJ, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) { ++ zil_itx_destroy(itx); ++ itx = zil_itx_create(TX_WRITE, sizeof (*lr)); ++ lr = (lr_write_t *)&itx->itx_lr; ++ write_state = WR_NEED_COPY; ++ } ++ ++ itx->itx_wr_state = write_state; ++ if (write_state == WR_NEED_COPY) ++ itx->itx_sod += len; ++ lr->lr_foid = ZVOL_OBJ; ++ lr->lr_offset = offset; ++ lr->lr_length = len; ++ lr->lr_blkoff = 0; ++ BP_ZERO(&lr->lr_blkptr); ++ ++ itx->itx_private = zv; ++ itx->itx_sync = sync; ++ ++ (void) zil_itx_assign(zilog, itx, tx); ++ ++ offset += len; ++ size -= len; ++ } ++} ++ ++/* ++ * Common write path running under the zvol taskq context. This function ++ * is responsible for copying the request structure data in to the DMU and ++ * signaling the request queue with the result of the copy. ++ */ ++static void ++zvol_write(void *arg) ++{ ++ struct request *req = (struct request *)arg; ++ struct request_queue *q = req->q; ++ zvol_state_t *zv = q->queuedata; ++ uint64_t offset = blk_rq_pos(req) << 9; ++ uint64_t size = blk_rq_bytes(req); ++ int error = 0; ++ dmu_tx_t *tx; ++ rl_t *rl; ++ ++ /* ++ * Annotate this call path with a flag that indicates that it is ++ * unsafe to use KM_SLEEP during memory allocations due to the ++ * potential for a deadlock. KM_PUSHPAGE should be used instead. ++ */ ++ ASSERT(!(current->flags & PF_NOFS)); ++ current->flags |= PF_NOFS; ++ ++ if (req->cmd_flags & VDEV_REQ_FLUSH) ++ zil_commit(zv->zv_zilog, ZVOL_OBJ); ++ ++ /* ++ * Some requests are just for flush and nothing else. ++ */ ++ if (size == 0) { ++ blk_end_request(req, 0, size); ++ goto out; ++ } ++ ++ rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER); ++ ++ tx = dmu_tx_create(zv->zv_objset); ++ dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size); ++ ++ /* This will only fail for ENOSPC */ ++ error = dmu_tx_assign(tx, TXG_WAIT); ++ if (error) { ++ dmu_tx_abort(tx); ++ zfs_range_unlock(rl); ++ blk_end_request(req, -error, size); ++ goto out; ++ } ++ ++ error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx); ++ if (error == 0) ++ zvol_log_write(zv, tx, offset, size, ++ req->cmd_flags & VDEV_REQ_FUA); ++ ++ dmu_tx_commit(tx); ++ zfs_range_unlock(rl); ++ ++ if ((req->cmd_flags & VDEV_REQ_FUA) || ++ zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ++ zil_commit(zv->zv_zilog, ZVOL_OBJ); ++ ++ blk_end_request(req, -error, size); ++out: ++ current->flags &= ~PF_NOFS; ++} ++ ++#ifdef HAVE_BLK_QUEUE_DISCARD ++static void ++zvol_discard(void *arg) ++{ ++ struct request *req = (struct request *)arg; ++ struct request_queue *q = req->q; ++ zvol_state_t *zv = q->queuedata; ++ uint64_t start = blk_rq_pos(req) << 9; ++ uint64_t end = start + blk_rq_bytes(req); ++ int error; ++ rl_t *rl; ++ ++ /* ++ * Annotate this call path with a flag that indicates that it is ++ * unsafe to use KM_SLEEP during memory allocations due to the ++ * potential for a deadlock. KM_PUSHPAGE should be used instead. 
++ */
++ ASSERT(!(current->flags & PF_NOFS));
++ current->flags |= PF_NOFS;
++
++ if (end > zv->zv_volsize) {
++ blk_end_request(req, -EIO, blk_rq_bytes(req));
++ goto out;
++ }
++
++ /*
++ * Align the request to volume block boundaries. If we don't,
++ * then this will force dnode_free_range() to zero out the
++ * unaligned parts, which is slow (read-modify-write) and
++ * useless since we are not freeing any space by doing so.
++ */
++ start = P2ROUNDUP(start, zv->zv_volblocksize);
++ end = P2ALIGN(end, zv->zv_volblocksize);
++
++ if (start >= end) {
++ blk_end_request(req, 0, blk_rq_bytes(req));
++ goto out;
++ }
++
++ rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER);
++
++ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end - start);
++
++ /*
++ * TODO: maybe we should add the operation to the log.
++ */
++
++ zfs_range_unlock(rl);
++
++ blk_end_request(req, -error, blk_rq_bytes(req));
++out:
++ current->flags &= ~PF_NOFS;
++}
++#endif /* HAVE_BLK_QUEUE_DISCARD */
++
++/*
++ * Common read path running under the zvol taskq context. This function
++ * is responsible for copying the requested data out of the DMU and into
++ * a Linux request structure. It then must signal the request queue with
++ * an error code describing the result of the copy.
++ */
++static void
++zvol_read(void *arg)
++{
++ struct request *req = (struct request *)arg;
++ struct request_queue *q = req->q;
++ zvol_state_t *zv = q->queuedata;
++ uint64_t offset = blk_rq_pos(req) << 9;
++ uint64_t size = blk_rq_bytes(req);
++ int error;
++ rl_t *rl;
++
++ if (size == 0) {
++ blk_end_request(req, 0, size);
++ return;
++ }
++
++ rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
++
++ error = dmu_read_req(zv->zv_objset, ZVOL_OBJ, req);
++
++ zfs_range_unlock(rl);
++
++ /* convert checksum errors into IO errors */
++ if (error == ECKSUM)
++ error = EIO;
++
++ blk_end_request(req, -error, size);
++}
++
++/*
++ * The request will be added back to the request queue and retried if
++ * it cannot be immediately dispatched to the taskq for handling.
++ */
++static inline void
++zvol_dispatch(task_func_t func, struct request *req)
++{
++ if (!taskq_dispatch(zvol_taskq, func, (void *)req, TQ_NOSLEEP))
++ blk_requeue_request(req->q, req);
++}
++
++/*
++ * Common request path. Rather than registering a custom make_request()
++ * function we use the generic Linux version. This is done because it allows
++ * us to easily merge read requests which would otherwise be performed
++ * synchronously by the DMU. This is less critical in the write case where
++ * the DMU will perform the correct merging within a transaction group. Using
++ * the generic make_request() also lets us leverage the fact that the
++ * elevator will ensure correct ordering with regard to barrier IOs. On
++ * the downside it means that in the write case we end up doing request
++ * merging twice: once in the elevator and once in the DMU.
++ *
++ * The request handler is called under a spin lock so all the real work
++ * is handed off to be done in the context of the zvol taskq. This function
++ * simply performs basic request sanity checking and hands off the request.
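++ * As a rough illustration, a single small read flows through:
++ *
++ *   zvol_request()                   called with the queue lock held
++ *     -> zvol_dispatch(zvol_read)    hand the request to the zvol taskq
++ *        -> zvol_read()              range lock, dmu_read_req()
++ *           -> blk_end_request()     complete the request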
++ */ ++static void ++zvol_request(struct request_queue *q) ++{ ++ zvol_state_t *zv = q->queuedata; ++ struct request *req; ++ unsigned int size; ++ ++ while ((req = blk_fetch_request(q)) != NULL) { ++ size = blk_rq_bytes(req); ++ ++ if (size != 0 && blk_rq_pos(req) + blk_rq_sectors(req) > ++ get_capacity(zv->zv_disk)) { ++ printk(KERN_INFO ++ "%s: bad access: block=%llu, count=%lu\n", ++ req->rq_disk->disk_name, ++ (long long unsigned)blk_rq_pos(req), ++ (long unsigned)blk_rq_sectors(req)); ++ __blk_end_request(req, -EIO, size); ++ continue; ++ } ++ ++ if (!blk_fs_request(req)) { ++ printk(KERN_INFO "%s: non-fs cmd\n", ++ req->rq_disk->disk_name); ++ __blk_end_request(req, -EIO, size); ++ continue; ++ } ++ ++ switch (rq_data_dir(req)) { ++ case READ: ++ zvol_dispatch(zvol_read, req); ++ break; ++ case WRITE: ++ if (unlikely(get_disk_ro(zv->zv_disk)) || ++ unlikely(zv->zv_flags & ZVOL_RDONLY)) { ++ __blk_end_request(req, -EROFS, size); ++ break; ++ } ++ ++#ifdef HAVE_BLK_QUEUE_DISCARD ++ if (req->cmd_flags & VDEV_REQ_DISCARD) { ++ zvol_dispatch(zvol_discard, req); ++ break; ++ } ++#endif /* HAVE_BLK_QUEUE_DISCARD */ ++ ++ zvol_dispatch(zvol_write, req); ++ break; ++ default: ++ printk(KERN_INFO "%s: unknown cmd: %d\n", ++ req->rq_disk->disk_name, (int)rq_data_dir(req)); ++ __blk_end_request(req, -EIO, size); ++ break; ++ } ++ } ++} ++ ++static void ++zvol_get_done(zgd_t *zgd, int error) ++{ ++ if (zgd->zgd_db) ++ dmu_buf_rele(zgd->zgd_db, zgd); ++ ++ zfs_range_unlock(zgd->zgd_rl); ++ ++ if (error == 0 && zgd->zgd_bp) ++ zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); ++ ++ kmem_free(zgd, sizeof (zgd_t)); ++} ++ ++/* ++ * Get data to generate a TX_WRITE intent log record. ++ */ ++static int ++zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) ++{ ++ zvol_state_t *zv = arg; ++ objset_t *os = zv->zv_objset; ++ uint64_t offset = lr->lr_offset; ++ uint64_t size = lr->lr_length; ++ dmu_buf_t *db; ++ zgd_t *zgd; ++ int error; ++ ++ ASSERT(zio != NULL); ++ ASSERT(size != 0); ++ ++ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_PUSHPAGE); ++ zgd->zgd_zilog = zv->zv_zilog; ++ zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); ++ ++ /* ++ * Write records come in two flavors: immediate and indirect. ++ * For small writes it's cheaper to store the data with the ++ * log record (immediate); for large writes it's cheaper to ++ * sync the data and get a pointer to it (indirect) so that ++ * we don't have to write the data twice. ++ */ ++ if (buf != NULL) { /* immediate write */ ++ error = dmu_read(os, ZVOL_OBJ, offset, size, buf, ++ DMU_READ_NO_PREFETCH); ++ } else { ++ size = zv->zv_volblocksize; ++ offset = P2ALIGN_TYPED(offset, size, uint64_t); ++ error = dmu_buf_hold(os, ZVOL_OBJ, offset, zgd, &db, ++ DMU_READ_NO_PREFETCH); ++ if (error == 0) { ++ zgd->zgd_db = db; ++ zgd->zgd_bp = &lr->lr_blkptr; ++ ++ ASSERT(db != NULL); ++ ASSERT(db->db_offset == offset); ++ ASSERT(db->db_size == size); ++ ++ error = dmu_sync(zio, lr->lr_common.lrc_txg, ++ zvol_get_done, zgd); ++ ++ if (error == 0) ++ return (0); ++ } ++ } ++ ++ zvol_get_done(zgd, error); ++ ++ return (error); ++} ++ ++/* ++ * The zvol_state_t's are inserted in increasing MINOR(dev_t) order. 
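++ * Keeping this list sorted is what allows zvol_find_minor() above to reuse
++ * gaps; for example, if volumes hold minors 0, ZVOL_MINORS and 2*ZVOL_MINORS
++ * and the middle one is removed, the next volume created is assigned the
++ * now-free minor ZVOL_MINORS again.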
++ */ ++static void ++zvol_insert(zvol_state_t *zv_insert) ++{ ++ zvol_state_t *zv = NULL; ++ ++ ASSERT(MUTEX_HELD(&zvol_state_lock)); ++ ASSERT3U(MINOR(zv_insert->zv_dev) & ZVOL_MINOR_MASK, ==, 0); ++ for (zv = list_head(&zvol_state_list); zv != NULL; ++ zv = list_next(&zvol_state_list, zv)) { ++ if (MINOR(zv->zv_dev) > MINOR(zv_insert->zv_dev)) ++ break; ++ } ++ ++ list_insert_before(&zvol_state_list, zv, zv_insert); ++} ++ ++/* ++ * Simply remove the zvol from to list of zvols. ++ */ ++static void ++zvol_remove(zvol_state_t *zv_remove) ++{ ++ ASSERT(MUTEX_HELD(&zvol_state_lock)); ++ list_remove(&zvol_state_list, zv_remove); ++} ++ ++static int ++zvol_first_open(zvol_state_t *zv) ++{ ++ objset_t *os; ++ uint64_t volsize; ++ int error; ++ uint64_t ro; ++ ++ /* lie and say we're read-only */ ++ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os); ++ if (error) ++ return (-error); ++ ++ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); ++ if (error) { ++ dmu_objset_disown(os, zvol_tag); ++ return (-error); ++ } ++ ++ zv->zv_objset = os; ++ error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); ++ if (error) { ++ dmu_objset_disown(os, zvol_tag); ++ return (-error); ++ } ++ ++ set_capacity(zv->zv_disk, volsize >> 9); ++ zv->zv_volsize = volsize; ++ zv->zv_zilog = zil_open(os, zvol_get_data); ++ ++ VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL) == 0); ++ if (ro || dmu_objset_is_snapshot(os)) { ++ set_disk_ro(zv->zv_disk, 1); ++ zv->zv_flags |= ZVOL_RDONLY; ++ } else { ++ set_disk_ro(zv->zv_disk, 0); ++ zv->zv_flags &= ~ZVOL_RDONLY; ++ } ++ ++ return (-error); ++} ++ ++static void ++zvol_last_close(zvol_state_t *zv) ++{ ++ zil_close(zv->zv_zilog); ++ zv->zv_zilog = NULL; ++ ++ dmu_buf_rele(zv->zv_dbuf, zvol_tag); ++ zv->zv_dbuf = NULL; ++ ++ /* ++ * Evict cached data ++ */ ++ if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) && ++ !(zv->zv_flags & ZVOL_RDONLY)) ++ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); ++ (void) dmu_objset_evict_dbufs(zv->zv_objset); ++ ++ dmu_objset_disown(zv->zv_objset, zvol_tag); ++ zv->zv_objset = NULL; ++} ++ ++static int ++zvol_open(struct block_device *bdev, fmode_t flag) ++{ ++ zvol_state_t *zv = bdev->bd_disk->private_data; ++ int error = 0, drop_mutex = 0; ++ ++ /* ++ * If the caller is already holding the mutex do not take it ++ * again, this will happen as part of zvol_create_minor(). ++ * Once add_disk() is called the device is live and the kernel ++ * will attempt to open it to read the partition information. 
++ */ ++ if (!mutex_owned(&zvol_state_lock)) { ++ mutex_enter(&zvol_state_lock); ++ drop_mutex = 1; ++ } ++ ++ ASSERT3P(zv, !=, NULL); ++ ++ if (zv->zv_open_count == 0) { ++ error = zvol_first_open(zv); ++ if (error) ++ goto out_mutex; ++ } ++ ++ if ((flag & FMODE_WRITE) && ++ (get_disk_ro(zv->zv_disk) || (zv->zv_flags & ZVOL_RDONLY))) { ++ error = -EROFS; ++ goto out_open_count; ++ } ++ ++ zv->zv_open_count++; ++ ++out_open_count: ++ if (zv->zv_open_count == 0) ++ zvol_last_close(zv); ++ ++out_mutex: ++ if (drop_mutex) ++ mutex_exit(&zvol_state_lock); ++ ++ check_disk_change(bdev); ++ ++ return (error); ++} ++ ++static int ++zvol_release(struct gendisk *disk, fmode_t mode) ++{ ++ zvol_state_t *zv = disk->private_data; ++ int drop_mutex = 0; ++ ++ if (!mutex_owned(&zvol_state_lock)) { ++ mutex_enter(&zvol_state_lock); ++ drop_mutex = 1; ++ } ++ ++ ASSERT3P(zv, !=, NULL); ++ ASSERT3U(zv->zv_open_count, >, 0); ++ zv->zv_open_count--; ++ if (zv->zv_open_count == 0) ++ zvol_last_close(zv); ++ ++ if (drop_mutex) ++ mutex_exit(&zvol_state_lock); ++ ++ return (0); ++} ++ ++static int ++zvol_ioctl(struct block_device *bdev, fmode_t mode, ++ unsigned int cmd, unsigned long arg) ++{ ++ zvol_state_t *zv = bdev->bd_disk->private_data; ++ int error = 0; ++ ++ if (zv == NULL) ++ return (-ENXIO); ++ ++ switch (cmd) { ++ case BLKFLSBUF: ++ zil_commit(zv->zv_zilog, ZVOL_OBJ); ++ break; ++ case BLKZNAME: ++ error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); ++ break; ++ ++ default: ++ error = -ENOTTY; ++ break; ++ ++ } ++ ++ return (error); ++} ++ ++#ifdef CONFIG_COMPAT ++static int ++zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, ++ unsigned cmd, unsigned long arg) ++{ ++ return zvol_ioctl(bdev, mode, cmd, arg); ++} ++#else ++#define zvol_compat_ioctl NULL ++#endif ++ ++static int zvol_media_changed(struct gendisk *disk) ++{ ++ zvol_state_t *zv = disk->private_data; ++ ++ return zv->zv_changed; ++} ++ ++static int zvol_revalidate_disk(struct gendisk *disk) ++{ ++ zvol_state_t *zv = disk->private_data; ++ ++ zv->zv_changed = 0; ++ set_capacity(zv->zv_disk, zv->zv_volsize >> 9); ++ ++ return 0; ++} ++ ++/* ++ * Provide a simple virtual geometry for legacy compatibility. For devices ++ * smaller than 1 MiB a small head and sector count is used to allow very ++ * tiny devices. For devices over 1 Mib a standard head and sector count ++ * is used to keep the cylinders count reasonable. ++ */ ++static int ++zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) ++{ ++ zvol_state_t *zv = bdev->bd_disk->private_data; ++ sector_t sectors = get_capacity(zv->zv_disk); ++ ++ if (sectors > 2048) { ++ geo->heads = 16; ++ geo->sectors = 63; ++ } else { ++ geo->heads = 2; ++ geo->sectors = 4; ++ } ++ ++ geo->start = 0; ++ geo->cylinders = sectors / (geo->heads * geo->sectors); ++ ++ return 0; ++} ++ ++static struct kobject * ++zvol_probe(dev_t dev, int *part, void *arg) ++{ ++ zvol_state_t *zv; ++ struct kobject *kobj; ++ ++ mutex_enter(&zvol_state_lock); ++ zv = zvol_find_by_dev(dev); ++ kobj = zv ? 
get_disk(zv->zv_disk) : NULL; ++ mutex_exit(&zvol_state_lock); ++ ++ return kobj; ++} ++ ++#ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS ++static struct block_device_operations zvol_ops = { ++ .open = zvol_open, ++ .release = zvol_release, ++ .ioctl = zvol_ioctl, ++ .compat_ioctl = zvol_compat_ioctl, ++ .media_changed = zvol_media_changed, ++ .revalidate_disk = zvol_revalidate_disk, ++ .getgeo = zvol_getgeo, ++ .owner = THIS_MODULE, ++}; ++ ++#else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */ ++ ++static int ++zvol_open_by_inode(struct inode *inode, struct file *file) ++{ ++ return zvol_open(inode->i_bdev, file->f_mode); ++} ++ ++static int ++zvol_release_by_inode(struct inode *inode, struct file *file) ++{ ++ return zvol_release(inode->i_bdev->bd_disk, file->f_mode); ++} ++ ++static int ++zvol_ioctl_by_inode(struct inode *inode, struct file *file, ++ unsigned int cmd, unsigned long arg) ++{ ++ if (file == NULL || inode == NULL) ++ return -EINVAL; ++ return zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg); ++} ++ ++# ifdef CONFIG_COMPAT ++static long ++zvol_compat_ioctl_by_inode(struct file *file, ++ unsigned int cmd, unsigned long arg) ++{ ++ if (file == NULL) ++ return -EINVAL; ++ return zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev, ++ file->f_mode, cmd, arg); ++} ++# else ++# define zvol_compat_ioctl_by_inode NULL ++# endif ++ ++static struct block_device_operations zvol_ops = { ++ .open = zvol_open_by_inode, ++ .release = zvol_release_by_inode, ++ .ioctl = zvol_ioctl_by_inode, ++ .compat_ioctl = zvol_compat_ioctl_by_inode, ++ .media_changed = zvol_media_changed, ++ .revalidate_disk = zvol_revalidate_disk, ++ .getgeo = zvol_getgeo, ++ .owner = THIS_MODULE, ++}; ++#endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */ ++ ++/* ++ * Allocate memory for a new zvol_state_t and setup the required ++ * request queue and generic disk structures for the block device. 
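++ * Setup proceeds in three steps -- allocate the zvol_state_t, create the
++ * request queue with a noop elevator, then alloc_disk() -- and the
++ * out_queue/out_kmem labels unwind those steps in reverse if anything
++ * fails, so the caller sees either a fully constructed zvol_state_t or NULL.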
++ */ ++static zvol_state_t * ++zvol_alloc(dev_t dev, const char *name) ++{ ++ zvol_state_t *zv; ++ int error = 0; ++ ++ zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); ++ if (zv == NULL) ++ goto out; ++ ++ zv->zv_queue = blk_init_queue(zvol_request, &zv->zv_lock); ++ if (zv->zv_queue == NULL) ++ goto out_kmem; ++ ++#ifdef HAVE_ELEVATOR_CHANGE ++ error = elevator_change(zv->zv_queue, "noop"); ++#endif /* HAVE_ELEVATOR_CHANGE */ ++ if (error) { ++ printk("ZFS: Unable to set \"%s\" scheduler for zvol %s: %d\n", ++ "noop", name, error); ++ goto out_queue; ++ } ++ ++#ifdef HAVE_BLK_QUEUE_FLUSH ++ blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA); ++#else ++ blk_queue_ordered(zv->zv_queue, QUEUE_ORDERED_DRAIN, NULL); ++#endif /* HAVE_BLK_QUEUE_FLUSH */ ++ ++ zv->zv_disk = alloc_disk(ZVOL_MINORS); ++ if (zv->zv_disk == NULL) ++ goto out_queue; ++ ++ zv->zv_queue->queuedata = zv; ++ zv->zv_dev = dev; ++ zv->zv_open_count = 0; ++ strlcpy(zv->zv_name, name, MAXNAMELEN); ++ ++ mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); ++ avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, ++ sizeof (rl_t), offsetof(rl_t, r_node)); ++ zv->zv_znode.z_is_zvol = TRUE; ++ ++ spin_lock_init(&zv->zv_lock); ++ list_link_init(&zv->zv_next); ++ ++ zv->zv_disk->major = zvol_major; ++ zv->zv_disk->first_minor = (dev & MINORMASK); ++ zv->zv_disk->fops = &zvol_ops; ++ zv->zv_disk->private_data = zv; ++ zv->zv_disk->queue = zv->zv_queue; ++ snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s%d", ++ ZVOL_DEV_NAME, (dev & MINORMASK)); ++ ++ return zv; ++ ++out_queue: ++ blk_cleanup_queue(zv->zv_queue); ++out_kmem: ++ kmem_free(zv, sizeof (zvol_state_t)); ++out: ++ return NULL; ++} ++ ++/* ++ * Cleanup then free a zvol_state_t which was created by zvol_alloc(). 
++ */ ++static void ++zvol_free(zvol_state_t *zv) ++{ ++ avl_destroy(&zv->zv_znode.z_range_avl); ++ mutex_destroy(&zv->zv_znode.z_range_lock); ++ ++ del_gendisk(zv->zv_disk); ++ blk_cleanup_queue(zv->zv_queue); ++ put_disk(zv->zv_disk); ++ ++ kmem_free(zv, sizeof (zvol_state_t)); ++} ++ ++static int ++__zvol_create_minor(const char *name) ++{ ++ zvol_state_t *zv; ++ objset_t *os; ++ dmu_object_info_t *doi; ++ uint64_t volsize; ++ unsigned minor = 0; ++ int error = 0; ++ ++ ASSERT(MUTEX_HELD(&zvol_state_lock)); ++ ++ zv = zvol_find_by_name(name); ++ if (zv) { ++ error = EEXIST; ++ goto out; ++ } ++ ++ doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); ++ ++ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); ++ if (error) ++ goto out_doi; ++ ++ error = dmu_object_info(os, ZVOL_OBJ, doi); ++ if (error) ++ goto out_dmu_objset_disown; ++ ++ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); ++ if (error) ++ goto out_dmu_objset_disown; ++ ++ error = zvol_find_minor(&minor); ++ if (error) ++ goto out_dmu_objset_disown; ++ ++ zv = zvol_alloc(MKDEV(zvol_major, minor), name); ++ if (zv == NULL) { ++ error = EAGAIN; ++ goto out_dmu_objset_disown; ++ } ++ ++ if (dmu_objset_is_snapshot(os)) ++ zv->zv_flags |= ZVOL_RDONLY; ++ ++ zv->zv_volblocksize = doi->doi_data_block_size; ++ zv->zv_volsize = volsize; ++ zv->zv_objset = os; ++ ++ set_capacity(zv->zv_disk, zv->zv_volsize >> 9); ++ ++ blk_queue_max_hw_sectors(zv->zv_queue, UINT_MAX); ++ blk_queue_max_segments(zv->zv_queue, UINT16_MAX); ++ blk_queue_max_segment_size(zv->zv_queue, UINT_MAX); ++ blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize); ++ blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize); ++#ifdef HAVE_BLK_QUEUE_DISCARD ++ blk_queue_max_discard_sectors(zv->zv_queue, ++ (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); ++ blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize); ++ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue); ++#endif ++#ifdef HAVE_BLK_QUEUE_NONROT ++ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue); ++#endif ++ ++ if (zil_replay_disable) ++ zil_destroy(dmu_objset_zil(os), B_FALSE); ++ else ++ zil_replay(os, zv, zvol_replay_vector); ++ ++out_dmu_objset_disown: ++ dmu_objset_disown(os, zvol_tag); ++ zv->zv_objset = NULL; ++out_doi: ++ kmem_free(doi, sizeof(dmu_object_info_t)); ++out: ++ ++ if (error == 0) { ++ zvol_insert(zv); ++ add_disk(zv->zv_disk); ++ } ++ ++ return (error); ++} ++ ++/* ++ * Create a block device minor node and setup the linkage between it ++ * and the specified volume. Once this function returns the block ++ * device is live and ready for use. ++ */ ++int ++zvol_create_minor(const char *name) ++{ ++ int error; ++ ++ mutex_enter(&zvol_state_lock); ++ error = __zvol_create_minor(name); ++ mutex_exit(&zvol_state_lock); ++ ++ return (error); ++} ++ ++static int ++__zvol_remove_minor(const char *name) ++{ ++ zvol_state_t *zv; ++ ++ ASSERT(MUTEX_HELD(&zvol_state_lock)); ++ ++ zv = zvol_find_by_name(name); ++ if (zv == NULL) ++ return (ENXIO); ++ ++ if (zv->zv_open_count > 0) ++ return (EBUSY); ++ ++ zvol_remove(zv); ++ zvol_free(zv); ++ ++ return (0); ++} ++ ++/* ++ * Remove a block device minor node for the specified volume. 
++ */ ++int ++zvol_remove_minor(const char *name) ++{ ++ int error; ++ ++ mutex_enter(&zvol_state_lock); ++ error = __zvol_remove_minor(name); ++ mutex_exit(&zvol_state_lock); ++ ++ return (error); ++} ++ ++static int ++zvol_create_minors_cb(spa_t *spa, uint64_t dsobj, ++ const char *dsname, void *arg) ++{ ++ if (strchr(dsname, '/') == NULL) ++ return 0; ++ ++ (void) __zvol_create_minor(dsname); ++ return (0); ++} ++ ++/* ++ * Create minors for specified pool, if pool is NULL create minors ++ * for all available pools. ++ */ ++int ++zvol_create_minors(const char *pool) ++{ ++ spa_t *spa = NULL; ++ int error = 0; ++ ++ if (zvol_inhibit_dev) ++ return (0); ++ ++ mutex_enter(&zvol_state_lock); ++ if (pool) { ++ error = dmu_objset_find_spa(NULL, pool, zvol_create_minors_cb, ++ NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); ++ } else { ++ mutex_enter(&spa_namespace_lock); ++ while ((spa = spa_next(spa)) != NULL) { ++ error = dmu_objset_find_spa(NULL, ++ spa_name(spa), zvol_create_minors_cb, NULL, ++ DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); ++ if (error) ++ break; ++ } ++ mutex_exit(&spa_namespace_lock); ++ } ++ mutex_exit(&zvol_state_lock); ++ ++ return error; ++} ++ ++/* ++ * Remove minors for specified pool, if pool is NULL remove all minors. ++ */ ++void ++zvol_remove_minors(const char *pool) ++{ ++ zvol_state_t *zv, *zv_next; ++ char *str; ++ ++ if (zvol_inhibit_dev) ++ return; ++ ++ str = kmem_zalloc(MAXNAMELEN, KM_SLEEP); ++ if (pool) { ++ (void) strncpy(str, pool, strlen(pool)); ++ (void) strcat(str, "/"); ++ } ++ ++ mutex_enter(&zvol_state_lock); ++ for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { ++ zv_next = list_next(&zvol_state_list, zv); ++ ++ if (pool == NULL || !strncmp(str, zv->zv_name, strlen(str))) { ++ zvol_remove(zv); ++ zvol_free(zv); ++ } ++ } ++ mutex_exit(&zvol_state_lock); ++ kmem_free(str, MAXNAMELEN); ++} ++ ++int ++zvol_init(void) ++{ ++ int error; ++ ++ zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_threads, maxclsyspri, ++ zvol_threads, INT_MAX, TASKQ_PREPOPULATE); ++ if (zvol_taskq == NULL) { ++ printk(KERN_INFO "ZFS: taskq_create() failed\n"); ++ return (-ENOMEM); ++ } ++ ++ error = register_blkdev(zvol_major, ZVOL_DRIVER); ++ if (error) { ++ printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); ++ taskq_destroy(zvol_taskq); ++ return (error); ++ } ++ ++ blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS, ++ THIS_MODULE, zvol_probe, NULL, NULL); ++ ++ mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); ++ list_create(&zvol_state_list, sizeof (zvol_state_t), ++ offsetof(zvol_state_t, zv_next)); ++ ++ (void) zvol_create_minors(NULL); ++ ++ return (0); ++} ++ ++void ++zvol_fini(void) ++{ ++ zvol_remove_minors(NULL); ++ blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); ++ unregister_blkdev(zvol_major, ZVOL_DRIVER); ++ taskq_destroy(zvol_taskq); ++ mutex_destroy(&zvol_state_lock); ++ list_destroy(&zvol_state_list); ++} ++ ++module_param(zvol_inhibit_dev, uint, 0644); ++MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); ++ ++module_param(zvol_major, uint, 0444); ++MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); ++ ++module_param(zvol_threads, uint, 0444); ++MODULE_PARM_DESC(zvol_threads, "Number of threads for zvol device"); ++ ++module_param(zvol_max_discard_blocks, ulong, 0444); ++MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard at once"); +diff -uNr linux-3.2.33-go.orig/fs/zfs/zpios/Makefile linux-3.2.33-go/fs/zfs/zpios/Makefile +--- 
linux-3.2.33-go.orig/fs/zfs/zpios/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zpios/Makefile 2012-11-16 23:25:34.376039025 +0100 +@@ -0,0 +1,7 @@ ++MODULE := zpios ++ ++EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) -Wno-unused-but-set-variable -DHAVE_SPL -D_KERNEL -DTEXT_DOMAIN=\"zfs-linux-kernel\" -DNDEBUG ++ ++obj-$(CONFIG_ZFS) := $(MODULE).o ++ ++$(MODULE)-objs += pios.o +diff -uNr linux-3.2.33-go.orig/fs/zfs/zpios/Makefile.in linux-3.2.33-go/fs/zfs/zpios/Makefile.in +--- linux-3.2.33-go.orig/fs/zfs/zpios/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zpios/Makefile.in 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,7 @@ ++MODULE := zpios ++ ++EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ ++ ++obj-$(CONFIG_ZFS) := $(MODULE).o ++ ++$(MODULE)-objs += @top_srcdir@/module/zpios/pios.o +diff -uNr linux-3.2.33-go.orig/fs/zfs/zpios/pios.c linux-3.2.33-go/fs/zfs/zpios/pios.c +--- linux-3.2.33-go.orig/fs/zfs/zpios/pios.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/fs/zfs/zpios/pios.c 2012-11-16 23:25:34.354039278 +0100 +@@ -0,0 +1,1330 @@ ++/*****************************************************************************\ ++ * ZPIOS is a heavily modified version of the original PIOS test code. ++ * It is designed to have the test code running in the Linux kernel ++ * against ZFS while still being flexibly controled from user space. ++ * ++ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * LLNL-CODE-403049 ++ * ++ * Original PIOS Test Code ++ * Copyright (C) 2004 Cluster File Systems, Inc. ++ * Written by Peter Braam ++ * Atul Vidwansa ++ * Milind Dumbare ++ * ++ * This file is part of ZFS on Linux. ++ * For details, see . ++ * ++ * ZPIOS is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * ZPIOS is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with ZPIOS. If not, see . ++\*****************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include "zpios-internal.h" ++ ++ ++static spl_class *zpios_class; ++static spl_device *zpios_device; ++static char *zpios_tag = "zpios_tag"; ++ ++static ++int zpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) ++{ ++ /* This is stack heavy but it should be OK since we are only ++ * making the upcall between tests when the stack is shallow. 
++ */ ++ char id[16], chunk_size[16], region_size[16], thread_count[16]; ++ char region_count[16], offset[16], region_noise[16], chunk_noise[16]; ++ char thread_delay[16], flags[16], result[8]; ++ char *argv[16], *envp[4]; ++ ++ if ((path == NULL) || (strlen(path) == 0)) ++ return -ENOENT; ++ ++ snprintf(id, 15, "%d", run_args->id); ++ snprintf(chunk_size, 15, "%lu", (long unsigned)run_args->chunk_size); ++ snprintf(region_size, 15, "%lu",(long unsigned) run_args->region_size); ++ snprintf(thread_count, 15, "%u", run_args->thread_count); ++ snprintf(region_count, 15, "%u", run_args->region_count); ++ snprintf(offset, 15, "%lu", (long unsigned)run_args->offset); ++ snprintf(region_noise, 15, "%u", run_args->region_noise); ++ snprintf(chunk_noise, 15, "%u", run_args->chunk_noise); ++ snprintf(thread_delay, 15, "%u", run_args->thread_delay); ++ snprintf(flags, 15, "0x%x", run_args->flags); ++ snprintf(result, 7, "%d", rc); ++ ++ /* Passing 15 args to registered pre/post upcall */ ++ argv[0] = path; ++ argv[1] = phase; ++ argv[2] = strlen(run_args->log) ? run_args->log : ""; ++ argv[3] = id; ++ argv[4] = run_args->pool; ++ argv[5] = chunk_size; ++ argv[6] = region_size; ++ argv[7] = thread_count; ++ argv[8] = region_count; ++ argv[9] = offset; ++ argv[10] = region_noise; ++ argv[11] = chunk_noise; ++ argv[12] = thread_delay; ++ argv[13] = flags; ++ argv[14] = result; ++ argv[15] = NULL; ++ ++ /* Passing environment for user space upcall */ ++ envp[0] = "HOME=/"; ++ envp[1] = "TERM=linux"; ++ envp[2] = "PATH=/sbin:/usr/sbin:/bin:/usr/bin"; ++ envp[3] = NULL; ++ ++ return call_usermodehelper(path, argv, envp, 1); ++} ++ ++static uint64_t ++zpios_dmu_object_create(run_args_t *run_args, objset_t *os) ++{ ++ struct dmu_tx *tx; ++ uint64_t obj = 0ULL; ++ int rc; ++ ++ tx = dmu_tx_create(os); ++ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, OBJ_SIZE); ++ rc = dmu_tx_assign(tx, TXG_WAIT); ++ if (rc) { ++ zpios_print(run_args->file, ++ "dmu_tx_assign() failed: %d\n", rc); ++ dmu_tx_abort(tx); ++ return obj; ++ } ++ ++ obj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, ++ DMU_OT_NONE, 0, tx); ++ rc = dmu_object_set_blocksize(os, obj, 128ULL << 10, 0, tx); ++ if (rc) { ++ zpios_print(run_args->file, ++ "dmu_object_set_blocksize() failed: %d\n", rc); ++ dmu_tx_abort(tx); ++ return obj; ++ } ++ ++ dmu_tx_commit(tx); ++ ++ return obj; ++} ++ ++static int ++zpios_dmu_object_free(run_args_t *run_args, objset_t *os, uint64_t obj) ++{ ++ struct dmu_tx *tx; ++ int rc; ++ ++ tx = dmu_tx_create(os); ++ dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); ++ rc = dmu_tx_assign(tx, TXG_WAIT); ++ if (rc) { ++ zpios_print(run_args->file, ++ "dmu_tx_assign() failed: %d\n", rc); ++ dmu_tx_abort(tx); ++ return rc; ++ } ++ ++ rc = dmu_object_free(os, obj, tx); ++ if (rc) { ++ zpios_print(run_args->file, ++ "dmu_object_free() failed: %d\n", rc); ++ dmu_tx_abort(tx); ++ return rc; ++ } ++ ++ dmu_tx_commit(tx); ++ ++ return 0; ++} ++ ++static int ++zpios_dmu_setup(run_args_t *run_args) ++{ ++ zpios_time_t *t = &(run_args->stats.cr_time); ++ objset_t *os; ++ char name[32]; ++ uint64_t obj = 0ULL; ++ int i, rc = 0, rc2; ++ ++ (void)zpios_upcall(run_args->pre, PHASE_PRE_CREATE, run_args, 0); ++ t->start = zpios_timespec_now(); ++ ++ (void)snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id); ++ rc = dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL); ++ if (rc) { ++ zpios_print(run_args->file, "Error dmu_objset_create(%s, ...) 
" ++ "failed: %d\n", name, rc); ++ goto out; ++ } ++ ++ rc = dmu_objset_own(name, DMU_OST_OTHER, 0, zpios_tag, &os); ++ if (rc) { ++ zpios_print(run_args->file, "Error dmu_objset_own(%s, ...) " ++ "failed: %d\n", name, rc); ++ goto out_destroy; ++ } ++ ++ if (!(run_args->flags & DMU_FPP)) { ++ obj = zpios_dmu_object_create(run_args, os); ++ if (obj == 0) { ++ rc = -EBADF; ++ zpios_print(run_args->file, "Error zpios_dmu_" ++ "object_create() failed, %d\n", rc); ++ goto out_destroy; ++ } ++ } ++ ++ for (i = 0; i < run_args->region_count; i++) { ++ zpios_region_t *region; ++ ++ region = &run_args->regions[i]; ++ mutex_init(®ion->lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ if (run_args->flags & DMU_FPP) { ++ /* File per process */ ++ region->obj.os = os; ++ region->obj.obj = zpios_dmu_object_create(run_args, os); ++ ASSERT(region->obj.obj > 0); /* XXX - Handle this */ ++ region->wr_offset = run_args->offset; ++ region->rd_offset = run_args->offset; ++ region->init_offset = run_args->offset; ++ region->max_offset = run_args->offset + ++ run_args->region_size; ++ } else { ++ /* Single shared file */ ++ region->obj.os = os; ++ region->obj.obj = obj; ++ region->wr_offset = run_args->offset * i; ++ region->rd_offset = run_args->offset * i; ++ region->init_offset = run_args->offset * i; ++ region->max_offset = run_args->offset * ++ i + run_args->region_size; ++ } ++ } ++ ++ run_args->os = os; ++out_destroy: ++ if (rc) { ++ rc2 = dmu_objset_destroy(name, B_FALSE); ++ if (rc2) ++ zpios_print(run_args->file, "Error dmu_objset_destroy" ++ "(%s, ...) failed: %d\n", name, rc2); ++ } ++out: ++ t->stop = zpios_timespec_now(); ++ t->delta = zpios_timespec_sub(t->stop, t->start); ++ (void)zpios_upcall(run_args->post, PHASE_POST_CREATE, run_args, rc); ++ ++ return rc; ++} ++ ++static int ++zpios_setup_run(run_args_t **run_args, zpios_cmd_t *kcmd, struct file *file) ++{ ++ run_args_t *ra; ++ int rc, size; ++ ++ size = sizeof(*ra) + kcmd->cmd_region_count * sizeof(zpios_region_t); ++ ++ ra = vmem_zalloc(size, KM_SLEEP); ++ if (ra == NULL) { ++ zpios_print(file, "Unable to vmem_zalloc() %d bytes " ++ "for regions\n", size); ++ return -ENOMEM; ++ } ++ ++ *run_args = ra; ++ strncpy(ra->pool, kcmd->cmd_pool, ZPIOS_NAME_SIZE - 1); ++ strncpy(ra->pre, kcmd->cmd_pre, ZPIOS_PATH_SIZE - 1); ++ strncpy(ra->post, kcmd->cmd_post, ZPIOS_PATH_SIZE - 1); ++ strncpy(ra->log, kcmd->cmd_log, ZPIOS_PATH_SIZE - 1); ++ ra->id = kcmd->cmd_id; ++ ra->chunk_size = kcmd->cmd_chunk_size; ++ ra->thread_count = kcmd->cmd_thread_count; ++ ra->region_count = kcmd->cmd_region_count; ++ ra->region_size = kcmd->cmd_region_size; ++ ra->offset = kcmd->cmd_offset; ++ ra->region_noise = kcmd->cmd_region_noise; ++ ra->chunk_noise = kcmd->cmd_chunk_noise; ++ ra->thread_delay = kcmd->cmd_thread_delay; ++ ra->flags = kcmd->cmd_flags; ++ ra->stats.wr_data = 0; ++ ra->stats.wr_chunks = 0; ++ ra->stats.rd_data = 0; ++ ra->stats.rd_chunks = 0; ++ ra->region_next = 0; ++ ra->file = file; ++ mutex_init(&ra->lock_work, NULL, MUTEX_DEFAULT, NULL); ++ mutex_init(&ra->lock_ctl, NULL, MUTEX_DEFAULT, NULL); ++ ++ (void)zpios_upcall(ra->pre, PHASE_PRE_RUN, ra, 0); ++ ++ rc = zpios_dmu_setup(ra); ++ if (rc) { ++ mutex_destroy(&ra->lock_ctl); ++ mutex_destroy(&ra->lock_work); ++ vmem_free(ra, size); ++ *run_args = NULL; ++ } ++ ++ return rc; ++} ++ ++static int ++zpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, ++ __u32 *chunk_size, zpios_region_t **region, __u32 flags) ++{ ++ int i, j, count = 0; ++ unsigned int random_int; ++ ++ 
get_random_bytes(&random_int, sizeof(unsigned int)); ++ ++ mutex_enter(&run_args->lock_work); ++ i = run_args->region_next; ++ ++ /* XXX: I don't much care for this chunk selection mechansim ++ * there's the potential to burn a lot of time here doing nothing ++ * useful while holding the global lock. This could give some ++ * misleading performance results. I'll fix it latter. ++ */ ++ while (count < run_args->region_count) { ++ __u64 *rw_offset; ++ zpios_time_t *rw_time; ++ ++ j = i % run_args->region_count; ++ *region = &(run_args->regions[j]); ++ ++ if (flags & DMU_WRITE) { ++ rw_offset = &((*region)->wr_offset); ++ rw_time = &((*region)->stats.wr_time); ++ } else { ++ rw_offset = &((*region)->rd_offset); ++ rw_time = &((*region)->stats.rd_time); ++ } ++ ++ /* test if region is fully written */ ++ if (*rw_offset + *chunk_size > (*region)->max_offset) { ++ i++; ++ count++; ++ ++ if (unlikely(rw_time->stop.ts_sec == 0) && ++ unlikely(rw_time->stop.ts_nsec == 0)) ++ rw_time->stop = zpios_timespec_now(); ++ ++ continue; ++ } ++ ++ *offset = *rw_offset; ++ *obj = (*region)->obj; ++ *rw_offset += *chunk_size; ++ ++ /* update ctl structure */ ++ if (run_args->region_noise) { ++ get_random_bytes(&random_int, sizeof(unsigned int)); ++ run_args->region_next += random_int % run_args->region_noise; ++ } else { ++ run_args->region_next++; ++ } ++ ++ mutex_exit(&run_args->lock_work); ++ return 1; ++ } ++ ++ /* nothing left to do */ ++ mutex_exit(&run_args->lock_work); ++ ++ return 0; ++} ++ ++static void ++zpios_remove_objset(run_args_t *run_args) ++{ ++ zpios_time_t *t = &(run_args->stats.rm_time); ++ zpios_region_t *region; ++ char name[32]; ++ int rc = 0, i; ++ ++ (void)zpios_upcall(run_args->pre, PHASE_PRE_REMOVE, run_args, 0); ++ t->start = zpios_timespec_now(); ++ ++ (void)snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id); ++ ++ if (run_args->flags & DMU_REMOVE) { ++ if (run_args->flags & DMU_FPP) { ++ for (i = 0; i < run_args->region_count; i++) { ++ region = &run_args->regions[i]; ++ rc = zpios_dmu_object_free(run_args, ++ region->obj.os, ++ region->obj.obj); ++ if (rc) ++ zpios_print(run_args->file, "Error " ++ "removing object %d, %d\n", ++ (int)region->obj.obj, rc); ++ } ++ } else { ++ region = &run_args->regions[0]; ++ rc = zpios_dmu_object_free(run_args, ++ region->obj.os, ++ region->obj.obj); ++ if (rc) ++ zpios_print(run_args->file, "Error " ++ "removing object %d, %d\n", ++ (int)region->obj.obj, rc); ++ } ++ } ++ ++ dmu_objset_disown(run_args->os, zpios_tag); ++ ++ if (run_args->flags & DMU_REMOVE) { ++ rc = dmu_objset_destroy(name, B_FALSE); ++ if (rc) ++ zpios_print(run_args->file, "Error dmu_objset_destroy" ++ "(%s, ...) 
failed: %d\n", name, rc); ++ } ++ ++ t->stop = zpios_timespec_now(); ++ t->delta = zpios_timespec_sub(t->stop, t->start); ++ (void)zpios_upcall(run_args->post, PHASE_POST_REMOVE, run_args, rc); ++} ++ ++static void ++zpios_cleanup_run(run_args_t *run_args) ++{ ++ int i, size = 0; ++ ++ if (run_args == NULL) ++ return; ++ ++ if (run_args->threads != NULL) { ++ for (i = 0; i < run_args->thread_count; i++) { ++ if (run_args->threads[i]) { ++ mutex_destroy(&run_args->threads[i]->lock); ++ kmem_free(run_args->threads[i], ++ sizeof(thread_data_t)); ++ } ++ } ++ ++ kmem_free(run_args->threads, ++ sizeof(thread_data_t *) * run_args->thread_count); ++ } ++ ++ for (i = 0; i < run_args->region_count; i++) ++ mutex_destroy(&run_args->regions[i].lock); ++ ++ mutex_destroy(&run_args->lock_work); ++ mutex_destroy(&run_args->lock_ctl); ++ size = run_args->region_count * sizeof(zpios_region_t); ++ ++ vmem_free(run_args, sizeof(*run_args) + size); ++} ++ ++static int ++zpios_dmu_write(run_args_t *run_args, objset_t *os, uint64_t object, ++ uint64_t offset, uint64_t size, const void *buf) ++{ ++ struct dmu_tx *tx; ++ int rc, how = TXG_WAIT; ++// int flags = 0; ++ ++ if (run_args->flags & DMU_WRITE_NOWAIT) ++ how = TXG_NOWAIT; ++ ++ while (1) { ++ tx = dmu_tx_create(os); ++ dmu_tx_hold_write(tx, object, offset, size); ++ rc = dmu_tx_assign(tx, how); ++ ++ if (rc) { ++ if (rc == ERESTART && how == TXG_NOWAIT) { ++ dmu_tx_wait(tx); ++ dmu_tx_abort(tx); ++ continue; ++ } ++ zpios_print(run_args->file, ++ "Error in dmu_tx_assign(), %d", rc); ++ dmu_tx_abort(tx); ++ return rc; ++ } ++ break; ++ } ++ ++// if (run_args->flags & DMU_WRITE_ZC) ++// flags |= DMU_WRITE_ZEROCOPY; ++ ++ dmu_write(os, object, offset, size, buf, tx); ++ dmu_tx_commit(tx); ++ ++ return 0; ++} ++ ++static int ++zpios_dmu_read(run_args_t *run_args, objset_t *os, uint64_t object, ++ uint64_t offset, uint64_t size, void *buf) ++{ ++ int flags = 0; ++ ++// if (run_args->flags & DMU_READ_ZC) ++// flags |= DMU_READ_ZEROCOPY; ++ ++ if (run_args->flags & DMU_READ_NOPF) ++ flags |= DMU_READ_NO_PREFETCH; ++ ++ return dmu_read(os, object, offset, size, buf, flags); ++} ++ ++static int ++zpios_thread_main(void *data) ++{ ++ thread_data_t *thr = (thread_data_t *)data; ++ run_args_t *run_args = thr->run_args; ++ zpios_time_t t; ++ dmu_obj_t obj; ++ __u64 offset; ++ __u32 chunk_size; ++ zpios_region_t *region; ++ char *buf; ++ unsigned int random_int; ++ int chunk_noise = run_args->chunk_noise; ++ int chunk_noise_tmp = 0; ++ int thread_delay = run_args->thread_delay; ++ int thread_delay_tmp = 0; ++ int i, rc = 0; ++ ++ if (chunk_noise) { ++ get_random_bytes(&random_int, sizeof(unsigned int)); ++ chunk_noise_tmp = (random_int % (chunk_noise * 2))-chunk_noise; ++ } ++ ++ /* It's OK to vmem_alloc() this memory because it will be copied ++ * in to the slab and pointers to the slab copy will be setup in ++ * the bio when the IO is submitted. This of course is not ideal ++ * since we want a zero-copy IO path if possible. It would be nice ++ * to have direct access to those slab entries. ++ */ ++ chunk_size = run_args->chunk_size + chunk_noise_tmp; ++ buf = (char *)vmem_alloc(chunk_size, KM_SLEEP); ++ ASSERT(buf); ++ ++ /* Trivial data verification pattern for now. 
*/ ++ if (run_args->flags & DMU_VERIFY) ++ memset(buf, 'z', chunk_size); ++ ++ /* Write phase */ ++ mutex_enter(&thr->lock); ++ thr->stats.wr_time.start = zpios_timespec_now(); ++ mutex_exit(&thr->lock); ++ ++ while (zpios_get_work_item(run_args, &obj, &offset, ++ &chunk_size, ®ion, DMU_WRITE)) { ++ if (thread_delay) { ++ get_random_bytes(&random_int, sizeof(unsigned int)); ++ thread_delay_tmp = random_int % thread_delay; ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_timeout(thread_delay_tmp); /* In jiffies */ ++ } ++ ++ t.start = zpios_timespec_now(); ++ rc = zpios_dmu_write(run_args, obj.os, obj.obj, ++ offset, chunk_size, buf); ++ t.stop = zpios_timespec_now(); ++ t.delta = zpios_timespec_sub(t.stop, t.start); ++ ++ if (rc) { ++ zpios_print(run_args->file, "IO error while doing " ++ "dmu_write(): %d\n", rc); ++ break; ++ } ++ ++ mutex_enter(&thr->lock); ++ thr->stats.wr_data += chunk_size; ++ thr->stats.wr_chunks++; ++ thr->stats.wr_time.delta = zpios_timespec_add( ++ thr->stats.wr_time.delta, t.delta); ++ mutex_exit(&thr->lock); ++ ++ mutex_enter(®ion->lock); ++ region->stats.wr_data += chunk_size; ++ region->stats.wr_chunks++; ++ region->stats.wr_time.delta = zpios_timespec_add( ++ region->stats.wr_time.delta, t.delta); ++ ++ /* First time region was accessed */ ++ if (region->init_offset == offset) ++ region->stats.wr_time.start = t.start; ++ ++ mutex_exit(®ion->lock); ++ } ++ ++ mutex_enter(&run_args->lock_ctl); ++ run_args->threads_done++; ++ mutex_exit(&run_args->lock_ctl); ++ ++ mutex_enter(&thr->lock); ++ thr->rc = rc; ++ thr->stats.wr_time.stop = zpios_timespec_now(); ++ mutex_exit(&thr->lock); ++ wake_up(&run_args->waitq); ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule(); ++ ++ /* Check if we should exit */ ++ mutex_enter(&thr->lock); ++ rc = thr->rc; ++ mutex_exit(&thr->lock); ++ if (rc) ++ goto out; ++ ++ /* Read phase */ ++ mutex_enter(&thr->lock); ++ thr->stats.rd_time.start = zpios_timespec_now(); ++ mutex_exit(&thr->lock); ++ ++ while (zpios_get_work_item(run_args, &obj, &offset, ++ &chunk_size, ®ion, DMU_READ)) { ++ if (thread_delay) { ++ get_random_bytes(&random_int, sizeof(unsigned int)); ++ thread_delay_tmp = random_int % thread_delay; ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_timeout(thread_delay_tmp); /* In jiffies */ ++ } ++ ++ if (run_args->flags & DMU_VERIFY) ++ memset(buf, 0, chunk_size); ++ ++ t.start = zpios_timespec_now(); ++ rc = zpios_dmu_read(run_args, obj.os, obj.obj, ++ offset, chunk_size, buf); ++ t.stop = zpios_timespec_now(); ++ t.delta = zpios_timespec_sub(t.stop, t.start); ++ ++ if (rc) { ++ zpios_print(run_args->file, "IO error while doing " ++ "dmu_read(): %d\n", rc); ++ break; ++ } ++ ++ /* Trivial data verification, expensive! 
*/ ++ if (run_args->flags & DMU_VERIFY) { ++ for (i = 0; i < chunk_size; i++) { ++ if (buf[i] != 'z') { ++ zpios_print(run_args->file, ++ "IO verify error: %d/%d/%d\n", ++ (int)obj.obj, (int)offset, ++ (int)chunk_size); ++ break; ++ } ++ } ++ } ++ ++ mutex_enter(&thr->lock); ++ thr->stats.rd_data += chunk_size; ++ thr->stats.rd_chunks++; ++ thr->stats.rd_time.delta = zpios_timespec_add( ++ thr->stats.rd_time.delta, t.delta); ++ mutex_exit(&thr->lock); ++ ++ mutex_enter(®ion->lock); ++ region->stats.rd_data += chunk_size; ++ region->stats.rd_chunks++; ++ region->stats.rd_time.delta = zpios_timespec_add( ++ region->stats.rd_time.delta, t.delta); ++ ++ /* First time region was accessed */ ++ if (region->init_offset == offset) ++ region->stats.rd_time.start = t.start; ++ ++ mutex_exit(®ion->lock); ++ } ++ ++ mutex_enter(&run_args->lock_ctl); ++ run_args->threads_done++; ++ mutex_exit(&run_args->lock_ctl); ++ ++ mutex_enter(&thr->lock); ++ thr->rc = rc; ++ thr->stats.rd_time.stop = zpios_timespec_now(); ++ mutex_exit(&thr->lock); ++ wake_up(&run_args->waitq); ++ ++out: ++ vmem_free(buf, chunk_size); ++ do_exit(0); ++ ++ return rc; /* Unreachable, due to do_exit() */ ++} ++ ++static int ++zpios_thread_done(run_args_t *run_args) ++{ ++ ASSERT(run_args->threads_done <= run_args->thread_count); ++ return (run_args->threads_done == run_args->thread_count); ++} ++ ++static int ++zpios_threads_run(run_args_t *run_args) ++{ ++ struct task_struct *tsk, **tsks; ++ thread_data_t *thr = NULL; ++ zpios_time_t *tt = &(run_args->stats.total_time); ++ zpios_time_t *tw = &(run_args->stats.wr_time); ++ zpios_time_t *tr = &(run_args->stats.rd_time); ++ int i, rc = 0, tc = run_args->thread_count; ++ ++ tsks = kmem_zalloc(sizeof(struct task_struct *) * tc, KM_SLEEP); ++ if (tsks == NULL) { ++ rc = -ENOMEM; ++ goto cleanup2; ++ } ++ ++ run_args->threads = kmem_zalloc(sizeof(thread_data_t *) * tc, KM_SLEEP); ++ if (run_args->threads == NULL) { ++ rc = -ENOMEM; ++ goto cleanup; ++ } ++ ++ init_waitqueue_head(&run_args->waitq); ++ run_args->threads_done = 0; ++ ++ /* Create all the needed threads which will sleep until awoken */ ++ for (i = 0; i < tc; i++) { ++ thr = kmem_zalloc(sizeof(thread_data_t), KM_SLEEP); ++ if (thr == NULL) { ++ rc = -ENOMEM; ++ goto taskerr; ++ } ++ ++ thr->thread_no = i; ++ thr->run_args = run_args; ++ thr->rc = 0; ++ mutex_init(&thr->lock, NULL, MUTEX_DEFAULT, NULL); ++ run_args->threads[i] = thr; ++ ++ tsk = kthread_create(zpios_thread_main, (void *)thr, ++ "%s/%d", "zpios_io", i); ++ if (IS_ERR(tsk)) { ++ rc = -EINVAL; ++ goto taskerr; ++ } ++ ++ tsks[i] = tsk; ++ } ++ ++ tt->start = zpios_timespec_now(); ++ ++ /* Wake up all threads for write phase */ ++ (void)zpios_upcall(run_args->pre, PHASE_PRE_WRITE, run_args, 0); ++ for (i = 0; i < tc; i++) ++ wake_up_process(tsks[i]); ++ ++ /* Wait for write phase to complete */ ++ tw->start = zpios_timespec_now(); ++ wait_event(run_args->waitq, zpios_thread_done(run_args)); ++ tw->stop = zpios_timespec_now(); ++ (void)zpios_upcall(run_args->post, PHASE_POST_WRITE, run_args, rc); ++ ++ for (i = 0; i < tc; i++) { ++ thr = run_args->threads[i]; ++ ++ mutex_enter(&thr->lock); ++ ++ if (!rc && thr->rc) ++ rc = thr->rc; ++ ++ run_args->stats.wr_data += thr->stats.wr_data; ++ run_args->stats.wr_chunks += thr->stats.wr_chunks; ++ mutex_exit(&thr->lock); ++ } ++ ++ if (rc) { ++ /* Wake up all threads and tell them to exit */ ++ for (i = 0; i < tc; i++) { ++ mutex_enter(&thr->lock); ++ thr->rc = rc; ++ mutex_exit(&thr->lock); ++ ++ wake_up_process(tsks[i]); 
++ } ++ goto out; ++ } ++ ++ mutex_enter(&run_args->lock_ctl); ++ ASSERT(run_args->threads_done == run_args->thread_count); ++ run_args->threads_done = 0; ++ mutex_exit(&run_args->lock_ctl); ++ ++ /* Wake up all threads for read phase */ ++ (void)zpios_upcall(run_args->pre, PHASE_PRE_READ, run_args, 0); ++ for (i = 0; i < tc; i++) ++ wake_up_process(tsks[i]); ++ ++ /* Wait for read phase to complete */ ++ tr->start = zpios_timespec_now(); ++ wait_event(run_args->waitq, zpios_thread_done(run_args)); ++ tr->stop = zpios_timespec_now(); ++ (void)zpios_upcall(run_args->post, PHASE_POST_READ, run_args, rc); ++ ++ for (i = 0; i < tc; i++) { ++ thr = run_args->threads[i]; ++ ++ mutex_enter(&thr->lock); ++ ++ if (!rc && thr->rc) ++ rc = thr->rc; ++ ++ run_args->stats.rd_data += thr->stats.rd_data; ++ run_args->stats.rd_chunks += thr->stats.rd_chunks; ++ mutex_exit(&thr->lock); ++ } ++out: ++ tt->stop = zpios_timespec_now(); ++ tt->delta = zpios_timespec_sub(tt->stop, tt->start); ++ tw->delta = zpios_timespec_sub(tw->stop, tw->start); ++ tr->delta = zpios_timespec_sub(tr->stop, tr->start); ++ ++cleanup: ++ kmem_free(tsks, sizeof(struct task_struct *) * tc); ++cleanup2: ++ /* Returns first encountered thread error (if any) */ ++ return rc; ++ ++taskerr: ++ /* Destroy all threads that were created successfully */ ++ for (i = 0; i < tc; i++) ++ if (tsks[i] != NULL) ++ (void) kthread_stop(tsks[i]); ++ ++ goto cleanup; ++} ++ ++static int ++zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, ++ int data_size, void *data) ++{ ++ run_args_t *run_args = { 0 }; ++ zpios_stats_t *stats = (zpios_stats_t *)data; ++ int i, n, m, size, rc; ++ ++ if ((!kcmd->cmd_chunk_size) || (!kcmd->cmd_region_size) || ++ (!kcmd->cmd_thread_count) || (!kcmd->cmd_region_count)) { ++ zpios_print(file, "Invalid chunk_size, region_size, " ++ "thread_count, or region_count, %d\n", -EINVAL); ++ return -EINVAL; ++ } ++ ++ if (!(kcmd->cmd_flags & DMU_WRITE) || ++ !(kcmd->cmd_flags & DMU_READ)) { ++ zpios_print(file, "Invalid flags, minimally DMU_WRITE " ++ "and DMU_READ must be set, %d\n", -EINVAL); ++ return -EINVAL; ++ } ++ ++ if ((kcmd->cmd_flags & (DMU_WRITE_ZC | DMU_READ_ZC)) && ++ (kcmd->cmd_flags & DMU_VERIFY)) { ++ zpios_print(file, "Invalid flags, DMU_*_ZC incompatible " ++ "with DMU_VERIFY, used for performance analysis " ++ "only, %d\n", -EINVAL); ++ return -EINVAL; ++ } ++ ++ /* Opaque data on return contains structs of the following form: ++ * ++ * zpios_stat_t stats[]; ++ * stats[0] = run_args->stats; ++ * stats[1-N] = threads[N]->stats; ++ * stats[N+1-M] = regions[M]->stats; ++ * ++ * Where N is the number of threads, and M is the number of regions. 
++ */ ++ size = (sizeof(zpios_stats_t) + ++ (kcmd->cmd_thread_count * sizeof(zpios_stats_t)) + ++ (kcmd->cmd_region_count * sizeof(zpios_stats_t))); ++ if (data_size < size) { ++ zpios_print(file, "Invalid size, command data buffer " ++ "size too small, (%d < %d)\n", data_size, size); ++ return -ENOSPC; ++ } ++ ++ rc = zpios_setup_run(&run_args, kcmd, file); ++ if (rc) ++ return rc; ++ ++ rc = zpios_threads_run(run_args); ++ zpios_remove_objset(run_args); ++ if (rc) ++ goto cleanup; ++ ++ if (stats) { ++ n = 1; ++ m = 1 + kcmd->cmd_thread_count; ++ stats[0] = run_args->stats; ++ ++ for (i = 0; i < kcmd->cmd_thread_count; i++) ++ stats[n+i] = run_args->threads[i]->stats; ++ ++ for (i = 0; i < kcmd->cmd_region_count; i++) ++ stats[m+i] = run_args->regions[i].stats; ++ } ++ ++cleanup: ++ zpios_cleanup_run(run_args); ++ ++ (void)zpios_upcall(kcmd->cmd_post, PHASE_POST_RUN, run_args, 0); ++ ++ return rc; ++} ++ ++static int ++zpios_open(struct inode *inode, struct file *file) ++{ ++ unsigned int minor = iminor(inode); ++ zpios_info_t *info; ++ ++ if (minor >= ZPIOS_MINORS) ++ return -ENXIO; ++ ++ info = (zpios_info_t *)kmem_alloc(sizeof(*info), KM_SLEEP); ++ if (info == NULL) ++ return -ENOMEM; ++ ++ spin_lock_init(&info->info_lock); ++ info->info_size = ZPIOS_INFO_BUFFER_SIZE; ++ info->info_buffer = (char *)vmem_alloc(ZPIOS_INFO_BUFFER_SIZE,KM_SLEEP); ++ if (info->info_buffer == NULL) { ++ kmem_free(info, sizeof(*info)); ++ return -ENOMEM; ++ } ++ ++ info->info_head = info->info_buffer; ++ file->private_data = (void *)info; ++ ++ return 0; ++} ++ ++static int ++zpios_release(struct inode *inode, struct file *file) ++{ ++ unsigned int minor = iminor(inode); ++ zpios_info_t *info = (zpios_info_t *)file->private_data; ++ ++ if (minor >= ZPIOS_MINORS) ++ return -ENXIO; ++ ++ ASSERT(info); ++ ASSERT(info->info_buffer); ++ ++ vmem_free(info->info_buffer, ZPIOS_INFO_BUFFER_SIZE); ++ kmem_free(info, sizeof(*info)); ++ ++ return 0; ++} ++ ++static int ++zpios_buffer_clear(struct file *file, zpios_cfg_t *kcfg, unsigned long arg) ++{ ++ zpios_info_t *info = (zpios_info_t *)file->private_data; ++ ++ ASSERT(info); ++ ASSERT(info->info_buffer); ++ ++ spin_lock(&info->info_lock); ++ memset(info->info_buffer, 0, info->info_size); ++ info->info_head = info->info_buffer; ++ spin_unlock(&info->info_lock); ++ ++ return 0; ++} ++ ++static int ++zpios_buffer_size(struct file *file, zpios_cfg_t *kcfg, unsigned long arg) ++{ ++ zpios_info_t *info = (zpios_info_t *)file->private_data; ++ char *buf; ++ int min, size, rc = 0; ++ ++ ASSERT(info); ++ ASSERT(info->info_buffer); ++ ++ spin_lock(&info->info_lock); ++ if (kcfg->cfg_arg1 > 0) { ++ ++ size = kcfg->cfg_arg1; ++ buf = (char *)vmem_alloc(size, KM_SLEEP); ++ if (buf == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ /* Zero fill and truncate contents when coping buffer */ ++ min = ((size < info->info_size) ? 
size : info->info_size); ++ memset(buf, 0, size); ++ memcpy(buf, info->info_buffer, min); ++ vmem_free(info->info_buffer, info->info_size); ++ info->info_size = size; ++ info->info_buffer = buf; ++ info->info_head = info->info_buffer; ++ } ++ ++ kcfg->cfg_rc1 = info->info_size; ++ ++ if (copy_to_user((struct zpios_cfg_t __user *)arg, kcfg, sizeof(*kcfg))) ++ rc = -EFAULT; ++out: ++ spin_unlock(&info->info_lock); ++ ++ return rc; ++} ++ ++static int ++zpios_ioctl_cfg(struct file *file, unsigned long arg) ++{ ++ zpios_cfg_t kcfg; ++ int rc = 0; ++ ++ if (copy_from_user(&kcfg, (zpios_cfg_t *)arg, sizeof(kcfg))) ++ return -EFAULT; ++ ++ if (kcfg.cfg_magic != ZPIOS_CFG_MAGIC) { ++ zpios_print(file, "Bad config magic 0x%x != 0x%x\n", ++ kcfg.cfg_magic, ZPIOS_CFG_MAGIC); ++ return -EINVAL; ++ } ++ ++ switch (kcfg.cfg_cmd) { ++ case ZPIOS_CFG_BUFFER_CLEAR: ++ /* cfg_arg1 - Unused ++ * cfg_rc1 - Unused ++ */ ++ rc = zpios_buffer_clear(file, &kcfg, arg); ++ break; ++ case ZPIOS_CFG_BUFFER_SIZE: ++ /* cfg_arg1 - 0 - query size; >0 resize ++ * cfg_rc1 - Set to current buffer size ++ */ ++ rc = zpios_buffer_size(file, &kcfg, arg); ++ break; ++ default: ++ zpios_print(file, "Bad config command %d\n", ++ kcfg.cfg_cmd); ++ rc = -EINVAL; ++ break; ++ } ++ ++ return rc; ++} ++ ++static int ++zpios_ioctl_cmd(struct file *file, unsigned long arg) ++{ ++ zpios_cmd_t *kcmd; ++ void *data = NULL; ++ int rc = -EINVAL; ++ ++ kcmd = kmem_alloc(sizeof(zpios_cmd_t), KM_SLEEP); ++ if (kcmd == NULL) { ++ zpios_print(file, "Unable to kmem_alloc() %ld byte for " ++ "zpios_cmd_t\n", (long int)sizeof(zpios_cmd_t)); ++ return -ENOMEM; ++ } ++ ++ rc = copy_from_user(kcmd, (zpios_cfg_t *)arg, sizeof(zpios_cmd_t)); ++ if (rc) { ++ zpios_print(file, "Unable to copy command structure " ++ "from user to kernel memory, %d\n", rc); ++ goto out_cmd; ++ } ++ ++ if (kcmd->cmd_magic != ZPIOS_CMD_MAGIC) { ++ zpios_print(file, "Bad command magic 0x%x != 0x%x\n", ++ kcmd->cmd_magic, ZPIOS_CFG_MAGIC); ++ rc = -EINVAL; ++ goto out_cmd; ++ } ++ ++ /* Allocate memory for any opaque data the caller needed to pass on */ ++ if (kcmd->cmd_data_size > 0) { ++ data = (void *)vmem_alloc(kcmd->cmd_data_size, KM_SLEEP); ++ if (data == NULL) { ++ zpios_print(file, "Unable to vmem_alloc() %ld " ++ "bytes for data buffer\n", ++ (long)kcmd->cmd_data_size); ++ rc = -ENOMEM; ++ goto out_cmd; ++ } ++ ++ rc = copy_from_user(data, (void *)(arg + offsetof(zpios_cmd_t, ++ cmd_data_str)), kcmd->cmd_data_size); ++ if (rc) { ++ zpios_print(file, "Unable to copy data buffer " ++ "from user to kernel memory, %d\n", rc); ++ goto out_data; ++ } ++ } ++ ++ rc = zpios_do_one_run(file, kcmd, kcmd->cmd_data_size, data); ++ ++ if (data != NULL) { ++ /* If the test failed do not print out the stats */ ++ if (rc) ++ goto out_data; ++ ++ rc = copy_to_user((void *)(arg + offsetof(zpios_cmd_t, ++ cmd_data_str)), data, kcmd->cmd_data_size); ++ if (rc) { ++ zpios_print(file, "Unable to copy data buffer " ++ "from kernel to user memory, %d\n", rc); ++ rc = -EFAULT; ++ } ++ ++out_data: ++ vmem_free(data, kcmd->cmd_data_size); ++ } ++out_cmd: ++ kmem_free(kcmd, sizeof(zpios_cmd_t)); ++ ++ return rc; ++} ++ ++static long ++zpios_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ unsigned int minor = iminor(file->f_dentry->d_inode); ++ int rc = 0; ++ ++ /* Ignore tty ioctls */ ++ if ((cmd & 0xffffff00) == ((int)'T') << 8) ++ return -ENOTTY; ++ ++ if (minor >= ZPIOS_MINORS) ++ return -ENXIO; ++ ++ switch (cmd) { ++ case ZPIOS_CFG: ++ rc = zpios_ioctl_cfg(file, 
arg); ++ break; ++ case ZPIOS_CMD: ++ rc = zpios_ioctl_cmd(file, arg); ++ break; ++ default: ++ zpios_print(file, "Bad ioctl command %d\n", cmd); ++ rc = -EINVAL; ++ break; ++ } ++ ++ return rc; ++} ++ ++#ifdef CONFIG_COMPAT ++/* Compatibility handler for ioctls from 32-bit ELF binaries */ ++static long ++zpios_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ return zpios_unlocked_ioctl(file, cmd, arg); ++} ++#endif /* CONFIG_COMPAT */ ++ ++/* I'm not sure why you would want to write in to this buffer from ++ * user space since its principle use is to pass test status info ++ * back to the user space, but I don't see any reason to prevent it. ++ */ ++static ssize_t ++zpios_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ unsigned int minor = iminor(file->f_dentry->d_inode); ++ zpios_info_t *info = (zpios_info_t *)file->private_data; ++ int rc = 0; ++ ++ if (minor >= ZPIOS_MINORS) ++ return -ENXIO; ++ ++ ASSERT(info); ++ ASSERT(info->info_buffer); ++ ++ spin_lock(&info->info_lock); ++ ++ /* Write beyond EOF */ ++ if (*ppos >= info->info_size) { ++ rc = -EFBIG; ++ goto out; ++ } ++ ++ /* Resize count if beyond EOF */ ++ if (*ppos + count > info->info_size) ++ count = info->info_size - *ppos; ++ ++ if (copy_from_user(info->info_buffer, buf, count)) { ++ rc = -EFAULT; ++ goto out; ++ } ++ ++ *ppos += count; ++ rc = count; ++out: ++ spin_unlock(&info->info_lock); ++ return rc; ++} ++ ++static ssize_t ++zpios_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ unsigned int minor = iminor(file->f_dentry->d_inode); ++ zpios_info_t *info = (zpios_info_t *)file->private_data; ++ int rc = 0; ++ ++ if (minor >= ZPIOS_MINORS) ++ return -ENXIO; ++ ++ ASSERT(info); ++ ASSERT(info->info_buffer); ++ ++ spin_lock(&info->info_lock); ++ ++ /* Read beyond EOF */ ++ if (*ppos >= info->info_size) ++ goto out; ++ ++ /* Resize count if beyond EOF */ ++ if (*ppos + count > info->info_size) ++ count = info->info_size - *ppos; ++ ++ if (copy_to_user(buf, info->info_buffer + *ppos, count)) { ++ rc = -EFAULT; ++ goto out; ++ } ++ ++ *ppos += count; ++ rc = count; ++out: ++ spin_unlock(&info->info_lock); ++ return rc; ++} ++ ++static loff_t zpios_seek(struct file *file, loff_t offset, int origin) ++{ ++ unsigned int minor = iminor(file->f_dentry->d_inode); ++ zpios_info_t *info = (zpios_info_t *)file->private_data; ++ int rc = -EINVAL; ++ ++ if (minor >= ZPIOS_MINORS) ++ return -ENXIO; ++ ++ ASSERT(info); ++ ASSERT(info->info_buffer); ++ ++ spin_lock(&info->info_lock); ++ ++ switch (origin) { ++ case 0: /* SEEK_SET - No-op just do it */ ++ break; ++ case 1: /* SEEK_CUR - Seek from current */ ++ offset = file->f_pos + offset; ++ break; ++ case 2: /* SEEK_END - Seek from end */ ++ offset = info->info_size + offset; ++ break; ++ } ++ ++ if (offset >= 0) { ++ file->f_pos = offset; ++ file->f_version = 0; ++ rc = offset; ++ } ++ ++ spin_unlock(&info->info_lock); ++ ++ return rc; ++} ++ ++static struct cdev zpios_cdev; ++static struct file_operations zpios_fops = { ++ .owner = THIS_MODULE, ++ .open = zpios_open, ++ .release = zpios_release, ++ .unlocked_ioctl = zpios_unlocked_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = zpios_compat_ioctl, ++#endif ++ .read = zpios_read, ++ .write = zpios_write, ++ .llseek = zpios_seek, ++}; ++ ++static int ++zpios_init(void) ++{ ++ dev_t dev; ++ int rc; ++ ++ dev = MKDEV(ZPIOS_MAJOR, 0); ++ if ((rc = register_chrdev_region(dev, ZPIOS_MINORS, ZPIOS_NAME))) ++ goto error; ++ ++ /* Support for 
registering a character driver */ ++ cdev_init(&zpios_cdev, &zpios_fops); ++ zpios_cdev.owner = THIS_MODULE; ++ kobject_set_name(&zpios_cdev.kobj, ZPIOS_NAME); ++ if ((rc = cdev_add(&zpios_cdev, dev, ZPIOS_MINORS))) { ++ printk(KERN_ERR "ZPIOS: Error adding cdev, %d\n", rc); ++ kobject_put(&zpios_cdev.kobj); ++ unregister_chrdev_region(dev, ZPIOS_MINORS); ++ goto error; ++ } ++ ++ /* Support for udev make driver info available in sysfs */ ++ zpios_class = spl_class_create(THIS_MODULE, ZPIOS_NAME); ++ if (IS_ERR(zpios_class)) { ++ rc = PTR_ERR(zpios_class); ++ printk(KERN_ERR "ZPIOS: Error creating zpios class, %d\n", rc); ++ cdev_del(&zpios_cdev); ++ unregister_chrdev_region(dev, ZPIOS_MINORS); ++ goto error; ++ } ++ ++ zpios_device = spl_device_create(zpios_class, NULL, ++ dev, NULL, ZPIOS_NAME); ++ return 0; ++error: ++ printk(KERN_ERR "ZPIOS: Error registering zpios device, %d\n", rc); ++ return rc; ++} ++ ++static int ++zpios_fini(void) ++{ ++ dev_t dev = MKDEV(ZPIOS_MAJOR, 0); ++ ++ spl_device_destroy(zpios_class, zpios_device, dev); ++ spl_class_destroy(zpios_class); ++ cdev_del(&zpios_cdev); ++ unregister_chrdev_region(dev, ZPIOS_MINORS); ++ ++ return 0; ++} ++ ++spl_module_init(zpios_init); ++spl_module_exit(zpios_fini); ++ ++MODULE_AUTHOR("LLNL / Sun"); ++MODULE_DESCRIPTION("Kernel PIOS implementation"); ++MODULE_LICENSE("GPL"); +diff -uNr linux-3.2.33-go.orig/include/spl/fs/fs_subr.h linux-3.2.33-go/include/spl/fs/fs_subr.h +--- linux-3.2.33-go.orig/include/spl/fs/fs_subr.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/fs/fs_subr.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_FS_FS_SUBR_H ++#define _SPL_FS_FS_SUBR_H ++ ++#endif /* SPL_FS_FS_SUBR_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/bitops_compat.h linux-3.2.33-go/include/spl/linux/bitops_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/bitops_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/bitops_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,43 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . 
++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_BITOPS_COMPAT_H ++#define _SPL_BITOPS_COMPAT_H ++ ++#include ++ ++#ifndef HAVE_FLS64 ++ ++static inline int fls64(__u64 x) ++{ ++ __u32 h = x >> 32; ++ if (h) ++ return fls(h) + 32; ++ return fls(x); ++} ++ ++#endif /* HAVE_FLS64 */ ++ ++#endif /* _SPL_BITOPS_COMPAT_H */ ++ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/compiler_compat.h linux-3.2.33-go/include/spl/linux/compiler_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/compiler_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/compiler_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,47 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_COMPILER_COMPAT_H ++#define _SPL_COMPILER_COMPAT_H ++ ++#include ++ ++#ifndef ACCESS_ONCE ++/* ++ * Prevent the compiler from merging or refetching accesses. The compiler ++ * is also forbidden from reordering successive instances of ACCESS_ONCE(), ++ * but only when the compiler is aware of some particular ordering. One way ++ * to make the compiler aware of ordering is to put the two invocations of ++ * ACCESS_ONCE() in different C statements. ++ * ++ * This macro does absolutely -nothing- to prevent the CPU from reordering, ++ * merging, or refetching absolutely anything at any time. Its main intended ++ * use is to mediate communication between process-level code and irq/NMI ++ * handlers, all running on the same CPU. 
++ */ ++/* Taken from 2.6.33.2 */ ++# define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) ++#endif ++ ++#endif /* _SPL_COMPILER_COMPAT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/file_compat.h linux-3.2.33-go/include/spl/linux/file_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/file_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/file_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,93 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_FILE_COMPAT_H ++#define _SPL_FILE_COMPAT_H ++ ++#include ++#ifdef HAVE_FDTABLE_HEADER ++#include ++#endif ++ ++static inline struct file * ++spl_filp_open(const char *name, int flags, int mode, int *err) ++{ ++ struct file *filp = NULL; ++ int rc; ++ ++ filp = filp_open(name, flags, mode); ++ if (IS_ERR(filp)) { ++ rc = PTR_ERR(filp); ++ if (err) ++ *err = rc; ++ filp = NULL; ++ } ++ return filp; ++} ++ ++#define spl_filp_close(f) filp_close(f, NULL) ++#define spl_filp_poff(f) (&(f)->f_pos) ++#define spl_filp_write(fp, b, s, p) (fp)->f_op->write((fp), (b), (s), p) ++ ++#ifdef HAVE_VFS_FSYNC ++# ifdef HAVE_2ARGS_VFS_FSYNC ++# define spl_filp_fsync(fp, sync) vfs_fsync(fp, sync) ++# else ++# define spl_filp_fsync(fp, sync) vfs_fsync(fp, (fp)->f_dentry, sync) ++# endif /* HAVE_2ARGS_VFS_FSYNC */ ++#else ++# include ++# define spl_filp_fsync(fp, sync) file_fsync(fp, (fp)->f_dentry, sync) ++#endif /* HAVE_VFS_FSYNC */ ++ ++#ifdef HAVE_INODE_I_MUTEX ++#define spl_inode_lock(ip) (mutex_lock(&(ip)->i_mutex)) ++#define spl_inode_lock_nested(ip, type) (mutex_lock_nested((&(ip)->i_mutex), \ ++ (type))) ++#define spl_inode_unlock(ip) (mutex_unlock(&(ip)->i_mutex)) ++#else ++#define spl_inode_lock(ip) (down(&(ip)->i_sem)) ++#define spl_inode_unlock(ip) (up(&(ip)->i_sem)) ++#endif /* HAVE_INODE_I_MUTEX */ ++ ++#ifdef HAVE_KERN_PATH_PARENT_HEADER ++# ifndef HAVE_KERN_PATH_PARENT_SYMBOL ++typedef int (*kern_path_parent_t)(const char *, struct nameidata *); ++extern kern_path_parent_t kern_path_parent_fn; ++# define spl_kern_path_parent(path, nd) kern_path_parent_fn(path, nd) ++# else ++# define spl_kern_path_parent(path, nd) kern_path_parent(path, nd) ++# endif /* HAVE_KERN_PATH_PARENT_SYMBOL */ ++#else ++# define spl_kern_path_parent(path, nd) path_lookup(path, LOOKUP_PARENT, nd) ++#endif /* HAVE_KERN_PATH_PARENT_HEADER */ ++ ++#ifdef HAVE_KERN_PATH_LOCKED ++typedef struct dentry * (*kern_path_locked_t)(const char 
*, struct path *); ++extern kern_path_locked_t kern_path_locked_fn; ++# define spl_kern_path_locked(name, path) kern_path_locked_fn(name, path) ++#endif /* HAVE_KERN_PATH_LOCKED */ ++ ++#endif /* SPL_FILE_COMPAT_H */ ++ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/kallsyms_compat.h linux-3.2.33-go/include/spl/linux/kallsyms_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/kallsyms_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/kallsyms_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,43 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_KALLSYMS_COMPAT_H ++#define _SPL_KALLSYMS_COMPAT_H ++ ++#define SYMBOL_POISON ((void*)0xabcddcba) ++ ++#ifdef HAVE_KALLSYMS_LOOKUP_NAME ++ ++#include ++#define spl_kallsyms_lookup_name(name) kallsyms_lookup_name(name) ++ ++#else ++ ++typedef unsigned long (*kallsyms_lookup_name_t)(const char *); ++extern kallsyms_lookup_name_t spl_kallsyms_lookup_name_fn; ++#define spl_kallsyms_lookup_name(name) spl_kallsyms_lookup_name_fn(name) ++ ++#endif /* HAVE_KALLSYMS_LOOKUP_NAME */ ++ ++#endif /* _SPL_KALLSYMS_COMPAT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/list_compat.h linux-3.2.33-go/include/spl/linux/list_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/list_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/list_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,51 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. 
++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_LIST_COMPAT_H ++#define _SPL_LIST_COMPAT_H ++ ++#include ++ ++#ifndef list_for_each_entry_safe_reverse ++ ++/** ++ * list_for_each_entry_safe_reverse ++ * @pos: the type * to use as a loop cursor. ++ * @n: another type * to use as temporary storage ++ * @head: the head for your list. ++ * @member: the name of the list_struct within the struct. ++ * ++ * Iterate backwards over list of given type, safe against removal ++ * of list entry. ++ */ ++#define list_for_each_entry_safe_reverse(pos, n, head, member) \ ++ for (pos = list_entry((head)->prev, typeof(*pos), member), \ ++ n = list_entry(pos->member.prev, typeof(*pos), member); \ ++ &pos->member != (head); \ ++ pos = n, n = list_entry(n->member.prev, typeof(*n), member)) ++ ++#endif /* list_for_each_entry_safe_reverse */ ++ ++#endif /* SPL_LIST_COMPAT_H */ ++ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/math64_compat.h linux-3.2.33-go/include/spl/linux/math64_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/math64_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/math64_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,32 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_MATH64_COMPAT_H ++#define _SPL_MATH64_COMPAT_H ++ ++#ifndef abs64 ++#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; }) ++#endif ++ ++#endif /* _SPL_MATH64_COMPAT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/mm_compat.h linux-3.2.33-go/include/spl/linux/mm_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/mm_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/mm_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,282 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_MM_COMPAT_H ++#define _SPL_MM_COMPAT_H ++ ++#include ++#include ++ ++/* ++ * Linux 2.6.31 API Change. ++ * Individual pages_{min,low,high} moved in to watermark array. ++ */ ++#ifndef min_wmark_pages ++#define min_wmark_pages(z) (z->pages_min) ++#endif ++ ++#ifndef low_wmark_pages ++#define low_wmark_pages(z) (z->pages_low) ++#endif ++ ++#ifndef high_wmark_pages ++#define high_wmark_pages(z) (z->pages_high) ++#endif ++ ++/* ++ * 2.6.37 API compat, ++ * The function invalidate_inodes() is no longer exported by the kernel. ++ * The prototype however is still available which means it is safe ++ * to acquire the symbol's address using spl_kallsyms_lookup_name(). ++ * ++ * 2.6.39 API compat, ++ * As for 2.6.39 invalidate_inodes() was updated to take a second ++ * argument which controls how dirty inodes should be handled. ++ */ ++#if defined(HAVE_INVALIDATE_INODES) || defined(HAVE_INVALIDATE_INODES_CHECK) ++# ifdef HAVE_2ARGS_INVALIDATE_INODES ++# define spl_invalidate_inodes(sb, kd) invalidate_inodes(sb, kd) ++# else ++# define spl_invalidate_inodes(sb, kd) invalidate_inodes(sb) ++# endif /* HAVE_2ARGS_INVALIDATE_INODES */ ++#else ++# ifdef HAVE_2ARGS_INVALIDATE_INODES ++typedef int (*invalidate_inodes_t)(struct super_block *sb, bool kd); ++extern invalidate_inodes_t invalidate_inodes_fn; ++# define spl_invalidate_inodes(sb, kd) invalidate_inodes_fn(sb, kd) ++# else ++typedef int (*invalidate_inodes_t)(struct super_block *sb); ++extern invalidate_inodes_t invalidate_inodes_fn; ++# define spl_invalidate_inodes(sb, kd) invalidate_inodes_fn(sb) ++# endif /* HAVE_2ARGS_INVALIDATE_INODES */ ++#endif /* HAVE_INVALIDATE_INODES || HAVE_INVALIDATE_INODES_CHECK */ ++ ++#if !defined(HAVE_SHRINK_CONTROL_STRUCT) ++struct shrink_control { ++ gfp_t gfp_mask; ++ unsigned long nr_to_scan; ++}; ++#endif /* HAVE_SHRINK_CONTROL_STRUCT */ ++ ++/* ++ * 2.6.xx API compat, ++ * There currently exists no exposed API to partially shrink the dcache. ++ * The expected mechanism to shrink the cache is a registered shrinker ++ * which is called during memory pressure. 
++ */ ++#ifndef HAVE_SHRINK_DCACHE_MEMORY ++# if defined(HAVE_SHRINK_CONTROL_STRUCT) ++typedef int (*shrink_dcache_memory_t)(struct shrinker *, ++ struct shrink_control *); ++extern shrink_dcache_memory_t shrink_dcache_memory_fn; ++# define shrink_dcache_memory(nr, gfp) \ ++({ \ ++ struct shrink_control sc = { .nr_to_scan = nr, .gfp_mask = gfp }; \ ++ int __ret__ = 0; \ ++ \ ++ if (shrink_dcache_memory_fn) \ ++ __ret__ = shrink_dcache_memory_fn(NULL, &sc); \ ++ \ ++ __ret__; \ ++}) ++# elif defined(HAVE_3ARGS_SHRINKER_CALLBACK) ++typedef int (*shrink_dcache_memory_t)(struct shrinker *, int, gfp_t); ++extern shrink_dcache_memory_t shrink_dcache_memory_fn; ++# define shrink_dcache_memory(nr, gfp) \ ++({ \ ++ int __ret__ = 0; \ ++ \ ++ if (shrink_dcache_memory_fn) \ ++ __ret__ = shrink_dcache_memory_fn(NULL, nr, gfp); \ ++ \ ++ __ret__; \ ++}) ++# else ++typedef int (*shrink_dcache_memory_t)(int, gfp_t); ++extern shrink_dcache_memory_t shrink_dcache_memory_fn; ++# define shrink_dcache_memory(nr, gfp) \ ++({ \ ++ int __ret__ = 0; \ ++ \ ++ if (shrink_dcache_memory_fn) \ ++ __ret__ = shrink_dcache_memory_fn(nr, gfp); \ ++ \ ++ __ret__; \ ++}) ++# endif /* HAVE_3ARGS_SHRINKER_CALLBACK */ ++#endif /* HAVE_SHRINK_DCACHE_MEMORY */ ++ ++/* ++ * 2.6.xx API compat, ++ * There currently exists no exposed API to partially shrink the icache. ++ * The expected mechanism to shrink the cache is a registered shrinker ++ * which is called during memory pressure. ++ */ ++#ifndef HAVE_SHRINK_ICACHE_MEMORY ++# if defined(HAVE_SHRINK_CONTROL_STRUCT) ++typedef int (*shrink_icache_memory_t)(struct shrinker *, ++ struct shrink_control *); ++extern shrink_icache_memory_t shrink_icache_memory_fn; ++# define shrink_icache_memory(nr, gfp) \ ++({ \ ++ struct shrink_control sc = { .nr_to_scan = nr, .gfp_mask = gfp }; \ ++ int __ret__ = 0; \ ++ \ ++ if (shrink_icache_memory_fn) \ ++ __ret__ = shrink_icache_memory_fn(NULL, &sc); \ ++ \ ++ __ret__; \ ++}) ++# elif defined(HAVE_3ARGS_SHRINKER_CALLBACK) ++typedef int (*shrink_icache_memory_t)(struct shrinker *, int, gfp_t); ++extern shrink_icache_memory_t shrink_icache_memory_fn; ++# define shrink_icache_memory(nr, gfp) \ ++({ \ ++ int __ret__ = 0; \ ++ \ ++ if (shrink_icache_memory_fn) \ ++ __ret__ = shrink_icache_memory_fn(NULL, nr, gfp); \ ++ \ ++ __ret__; \ ++}) ++# else ++typedef int (*shrink_icache_memory_t)(int, gfp_t); ++extern shrink_icache_memory_t shrink_icache_memory_fn; ++# define shrink_icache_memory(nr, gfp) \ ++({ \ ++ int __ret__ = 0; \ ++ \ ++ if (shrink_icache_memory_fn) \ ++ __ret__ = shrink_icache_memory_fn(nr, gfp); \ ++ \ ++ __ret__; \ ++}) ++# endif /* HAVE_3ARGS_SHRINKER_CALLBACK */ ++#endif /* HAVE_SHRINK_ICACHE_MEMORY */ ++ ++/* ++ * Linux 2.6. - 2.6. Shrinker API Compatibility. 
++ */ ++#ifdef HAVE_SET_SHRINKER ++typedef struct spl_shrinker { ++ struct shrinker *shrinker; ++ shrinker_t fn; ++ int seeks; ++} spl_shrinker_t; ++ ++static inline void ++spl_register_shrinker(spl_shrinker_t *ss) ++{ ++ ss->shrinker = set_shrinker(ss->seeks, ss->fn); ++} ++ ++static inline void ++spl_unregister_shrinker(spl_shrinker_t *ss) ++{ ++ remove_shrinker(ss->shrinker); ++} ++ ++# define SPL_SHRINKER_DECLARE(s, x, y) \ ++ static spl_shrinker_t s = { \ ++ .shrinker = NULL, \ ++ .fn = x, \ ++ .seeks = y \ ++ } ++ ++# define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ ++ static int fn(int, unsigned int) ++# define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ ++static int \ ++fn(int nr_to_scan, unsigned int gfp_mask) \ ++{ \ ++ struct shrink_control sc; \ ++ \ ++ sc.nr_to_scan = nr_to_scan; \ ++ sc.gfp_mask = gfp_mask; \ ++ \ ++ return __ ## fn(NULL, &sc); \ ++} ++ ++#else ++ ++# define spl_register_shrinker(x) register_shrinker(x) ++# define spl_unregister_shrinker(x) unregister_shrinker(x) ++# define SPL_SHRINKER_DECLARE(s, x, y) \ ++ static struct shrinker s = { \ ++ .shrink = x, \ ++ .seeks = y \ ++ } ++ ++/* ++ * Linux 2.6. - 2.6. Shrinker API Compatibility. ++ */ ++# if defined(HAVE_SHRINK_CONTROL_STRUCT) ++# define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ ++ static int fn(struct shrinker *, struct shrink_control *) ++# define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ ++static int \ ++fn(struct shrinker *shrink, struct shrink_control *sc) { \ ++ return __ ## fn(shrink, sc); \ ++} ++ ++/* ++ * Linux 2.6. - 2.6. Shrinker API Compatibility. ++ */ ++# elif defined(HAVE_3ARGS_SHRINKER_CALLBACK) ++# define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ ++ static int fn(struct shrinker *, int, unsigned int) ++# define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ ++static int \ ++fn(struct shrinker *shrink, int nr_to_scan, unsigned int gfp_mask) \ ++{ \ ++ struct shrink_control sc; \ ++ \ ++ sc.nr_to_scan = nr_to_scan; \ ++ sc.gfp_mask = gfp_mask; \ ++ \ ++ return __ ## fn(shrink, &sc); \ ++} ++ ++/* ++ * Linux 2.6. - 2.6. Shrinker API Compatibility. ++ */ ++# else ++# define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ ++ static int fn(int, unsigned int) ++# define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ ++static int \ ++fn(int nr_to_scan, unsigned int gfp_mask) \ ++{ \ ++ struct shrink_control sc; \ ++ \ ++ sc.nr_to_scan = nr_to_scan; \ ++ sc.gfp_mask = gfp_mask; \ ++ \ ++ return __ ## fn(NULL, &sc); \ ++} ++ ++# endif ++#endif /* HAVE_SET_SHRINKER */ ++ ++#endif /* SPL_MM_COMPAT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/module_compat.h linux-3.2.33-go/include/spl/linux/module_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/module_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/module_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,59 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. 
++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_MODULE_COMPAT_H ++#define _SPL_MODULE_COMPAT_H ++ ++#include ++ ++#define spl_module_init(init_fn) \ ++static int \ ++spl_##init_fn(void) \ ++{ \ ++ int rc; \ ++ \ ++ spl_setup(); \ ++ rc = init_fn(); \ ++ \ ++ return rc; \ ++} \ ++ \ ++module_init(spl_##init_fn) ++ ++#define spl_module_exit(exit_fn) \ ++static void \ ++spl_##exit_fn(void) \ ++{ \ ++ int rc; \ ++ \ ++ rc = exit_fn(); \ ++ spl_cleanup(); \ ++ if (rc) \ ++ printk(KERN_ERR "SPL: Failure %d unloading " \ ++ "dependent module\n", rc); \ ++} \ ++ \ ++module_exit(spl_##exit_fn) ++ ++#endif /* _SPL_MODULE_COMPAT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/mutex_compat.h linux-3.2.33-go/include/spl/linux/mutex_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/mutex_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/mutex_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,36 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_MUTEX_COMPAT_H ++#define _SPL_MUTEX_COMPAT_H ++ ++#include ++ ++/* mutex_lock_nested() introduced in 2.6.18 */ ++#ifndef HAVE_MUTEX_LOCK_NESTED ++# define mutex_lock_nested(lock, subclass) mutex_lock(lock) ++#endif /* HAVE_MUTEX_LOCK_NESTED */ ++ ++#endif /* _SPL_MUTEX_COMPAT_H */ ++ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/proc_compat.h linux-3.2.33-go/include/spl/linux/proc_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/proc_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/proc_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,53 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. 
++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_PROC_H ++#define _SPL_PROC_H ++ ++#include ++ ++#ifdef CONFIG_SYSCTL ++#ifdef HAVE_2ARGS_REGISTER_SYSCTL ++#define spl_register_sysctl_table(t, a) register_sysctl_table(t, a) ++#else ++#define spl_register_sysctl_table(t, a) register_sysctl_table(t) ++#endif /* HAVE_2ARGS_REGISTER_SYSCTL */ ++#define spl_unregister_sysctl_table(t) unregister_sysctl_table(t) ++#endif /* CONFIG_SYSCTL */ ++ ++#ifdef HAVE_CTL_NAME ++#define CTL_NAME(cname) .ctl_name = (cname), ++#else ++#define CTL_NAME(cname) ++#endif ++ ++extern struct proc_dir_entry *proc_spl_kstat; ++struct proc_dir_entry *proc_dir_entry_find(struct proc_dir_entry *root, ++ const char *str); ++int proc_dir_entries(struct proc_dir_entry *root); ++ ++int spl_proc_init(void); ++void spl_proc_fini(void); ++ ++#endif /* SPL_PROC_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/rwsem_compat.h linux-3.2.33-go/include/spl/linux/rwsem_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/rwsem_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/rwsem_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,66 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_RWSEM_COMPAT_H ++#define _SPL_RWSEM_COMPAT_H ++ ++#include ++ ++#if defined(RWSEM_SPINLOCK_IS_RAW) ++#define spl_rwsem_lock_irqsave(lk, fl) raw_spin_lock_irqsave(lk, fl) ++#define spl_rwsem_unlock_irqrestore(lk, fl) raw_spin_unlock_irqrestore(lk, fl) ++#define spl_rwsem_trylock_irqsave(lk, fl) raw_spin_trylock_irqsave(lk, fl) ++#else ++#define spl_rwsem_lock_irqsave(lk, fl) spin_lock_irqsave(lk, fl) ++#define spl_rwsem_unlock_irqrestore(lk, fl) spin_unlock_irqrestore(lk, fl) ++#define spl_rwsem_trylock_irqsave(lk, fl) spin_trylock_irqsave(lk, fl) ++#endif /* RWSEM_SPINLOCK_IS_RAW */ ++ ++/* ++ * Prior to Linux 2.6.33 there existed a race condition in rwsem_is_locked(). ++ * The semaphore's activity was checked outside of the wait_lock which ++ * could result in some readers getting the incorrect activity value. ++ * ++ * When a kernel without this fix is detected the SPL takes responsibility ++ * for acquiring the wait_lock to avoid this race. ++ */ ++#if defined(RWSEM_IS_LOCKED_TAKES_WAIT_LOCK) ++#define spl_rwsem_is_locked(rwsem) rwsem_is_locked(rwsem) ++#else ++static inline int ++spl_rwsem_is_locked(struct rw_semaphore *rwsem) ++{ ++ unsigned long flags; ++ int rc = 1; ++ ++ if (spl_rwsem_trylock_irqsave(&rwsem->wait_lock, flags)) { ++ rc = rwsem_is_locked(rwsem); ++ spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, flags); ++ } ++ ++ return (rc); ++} ++#endif /* RWSEM_IS_LOCKED_TAKES_WAIT_LOCK */ ++ ++#endif /* _SPL_RWSEM_COMPAT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/smp_compat.h linux-3.2.33-go/include/spl/linux/smp_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/smp_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/smp_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,40 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_SMP_COMPAT_H ++#define _SPL_SMP_COMPAT_H ++ ++#include ++ ++#ifdef HAVE_3ARGS_ON_EACH_CPU ++ ++#define spl_on_each_cpu(func,info,wait) on_each_cpu(func,info,wait) ++ ++#else ++ ++#define spl_on_each_cpu(func,info,wait) on_each_cpu(func,info,0,wait) ++ ++#endif /* HAVE_3ARGS_ON_EACH_CPU */ ++ ++#endif /* _SPL_SMP_COMPAT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/sysctl_compat.h linux-3.2.33-go/include/spl/linux/sysctl_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/sysctl_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/sysctl_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,96 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SYSCTL_COMPAT_H ++#define _SPL_SYSCTL_COMPAT_H ++ ++#include ++ ++/* proc_handler() / proc_do* API changes ++ * 2.6.x - 2.6.31: 6 args, prototype includes 'struct file *' ++ * 2.6.32 - 2.6.y: 5 args, removed unused 'struct file *' from prototype ++ * ++ * Generic SPL_PROC_HANDLER() macro should be used for correct prototypes. ++ * It will define the following function arguments which can and should be ++ * used with the spl_proc_* helper macros. 
++ * ++ * struct ctl_table *table, ++ * int write, ++ * struct file *filp [2.6.31 and earlier kernels], ++ * void __user *buffer, ++ * size_t *lenp, ++ * loff_t *ppos, ++ */ ++#ifdef HAVE_5ARGS_PROC_HANDLER ++ ++#define SPL_PROC_HANDLER(proc_handler) \ ++static int \ ++proc_handler(struct ctl_table *table, int write, \ ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++ ++#define spl_proc_dostring(table, write, filp, buffer, lenp, ppos) \ ++ proc_dostring(table, write, buffer, lenp, ppos) ++#define spl_proc_dointvec(table, write, filp, buffer, lenp, ppos) \ ++ proc_dointvec(table, write, buffer, lenp, ppos) ++#define spl_proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos) \ ++ proc_dointvec_minmax(table, write, buffer, lenp, ppos) ++#define spl_proc_dointvec_jiffies(table, write, filp, buffer, lenp, ppos) \ ++ proc_dointvec_jiffies(table, write, buffer, lenp, ppos) ++#define spl_proc_dointvec_userhz_jiffies(table,write,filp,buffer,lenp,ppos) \ ++ proc_dointvec_userhz_jiffies(table, write, buffer, lenp, ppos) ++#define spl_proc_dointvec_ms_jiffies(table,write,filp,buffer,lenp,ppos) \ ++ proc_dointvec_ms_jiffies(table, write, buffer, lenp, ppos) ++#define spl_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos) \ ++ proc_doulongvec_minmax(table, write, buffer, lenp, ppos) ++#define spl_proc_doulongvec_ms_jiffies_minmax(table,write,filp,buffer,lenp,ppos)\ ++ proc_doulongvec_ms_jiffies_minmax(table, write, buffer, lenp, ppos) ++ ++#else /* HAVE_5ARGS_PROC_HANDLER */ ++ ++#define SPL_PROC_HANDLER(proc_handler) \ ++static int \ ++proc_handler(struct ctl_table *table, int write, struct file *filp, \ ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++ ++#define spl_proc_dostring(table, write, filp, buffer, lenp, ppos) \ ++ proc_dostring(table, write, filp, buffer, lenp, ppos) ++#define spl_proc_dointvec(table, write, filp, buffer, lenp, ppos) \ ++ proc_dointvec(table, write, filp, buffer, lenp, ppos) ++#define spl_proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos) \ ++ proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos) ++#define spl_proc_dointvec_jiffies(table, write, filp, buffer, lenp, ppos) \ ++ proc_dointvec_jiffies(table, write, filp, buffer, lenp, ppos) ++#define spl_proc_dointvec_userhz_jiffies(table,write,filp,buffer,lenp,ppos) \ ++ proc_dointvec_userhz_jiffies(table, write, filp, buffer, lenp, ppos) ++#define spl_proc_dointvec_ms_jiffies(table, write, filp, buffer, lenp, ppos) \ ++ proc_dointvec_ms_jiffies(table, write, filp, buffer, lenp, ppos) ++#define spl_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos) \ ++ proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos) ++#define spl_proc_doulongvec_ms_jiffies_minmax(table,write,filp,buffer,lenp,ppos) \ ++ proc_doulongvec_ms_jiffies_minmax(table,write,filp,buffer,lenp,ppos) ++ ++ ++#endif /* HAVE_5ARGS_PROC_HANDLER */ ++ ++#endif /* _SPL_SYSCTL_COMPAT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/time_compat.h linux-3.2.33-go/include/spl/linux/time_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/time_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/time_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,45 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
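/*
 * Editorial illustration, not part of the patch: a handler written with
 * SPL_PROC_HANDLER() and the spl_proc_* helpers above compiles against both
 * the six-argument (2.6.31 and earlier) and five-argument (2.6.32 and later)
 * proc_handler prototypes.  proc_dotunable and the ctl_table entry that
 * would reference it are hypothetical.
 */
SPL_PROC_HANDLER(proc_dotunable)
{
        /*
         * 'filp' is a real parameter only on older kernels; on newer ones
         * the token is simply discarded by the spl_proc_dointvec() macro,
         * so the same call site works everywhere.
         */
        return (spl_proc_dointvec(table, write, filp, buffer, lenp, ppos));
}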
++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_TIME_COMPAT_H ++#define _SPL_TIME_COMPAT_H ++ ++#include ++ ++/* timespec_sub() API changes ++ * 2.6.18 - 2.6.x: Inline function provided by linux/time.h ++ */ ++#ifndef HAVE_TIMESPEC_SUB ++static inline struct timespec ++timespec_sub(struct timespec lhs, struct timespec rhs) ++{ ++ struct timespec ts_delta; ++ set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec, ++ lhs.tv_nsec - rhs.tv_nsec); ++ return ts_delta; ++} ++#endif /* HAVE_TIMESPEC_SUB */ ++ ++#endif /* _SPL_TIME_COMPAT_H */ ++ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/uaccess_compat.h linux-3.2.33-go/include/spl/linux/uaccess_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/uaccess_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/uaccess_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,35 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_UACCESS_COMPAT_H ++#define _SPL_UACCESS_COMPAT_H ++ ++#ifdef HAVE_UACCESS_HEADER ++#include ++#else ++#include ++#endif ++ ++#endif /* _SPL_UACCESS_COMPAT_H */ ++ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/workqueue_compat.h linux-3.2.33-go/include/spl/linux/workqueue_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/workqueue_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/workqueue_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,49 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. 
++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_WORKQUEUE_COMPAT_H ++#define _SPL_WORKQUEUE_COMPAT_H ++ ++#include ++#include ++ ++#ifdef HAVE_3ARGS_INIT_WORK ++ ++#define delayed_work work_struct ++ ++#define spl_init_work(wq, cb, d) INIT_WORK((wq), (void *)(cb), \ ++ (void *)(d)) ++#define spl_init_delayed_work(wq,cb,d) INIT_WORK((wq), (void *)(cb), \ ++ (void *)(d)) ++#define spl_get_work_data(d, t, f) (t *)(d) ++ ++#else ++ ++#define spl_init_work(wq, cb, d) INIT_WORK((wq), (void *)(cb)); ++#define spl_init_delayed_work(wq,cb,d) INIT_DELAYED_WORK((wq), (void *)(cb)); ++#define spl_get_work_data(d, t, f) (t *)container_of(d, t, f) ++ ++#endif /* HAVE_3ARGS_INIT_WORK */ ++ ++#endif /* _SPL_WORKQUEUE_COMPAT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/linux/zlib_compat.h linux-3.2.33-go/include/spl/linux/zlib_compat.h +--- linux-3.2.33-go.orig/include/spl/linux/zlib_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/linux/zlib_compat.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,37 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2011 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
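/*
 * Editorial illustration, not part of the patch: the portable pattern for
 * the spl_init_work()/spl_get_work_data() wrappers above is to embed the
 * work_struct in the owning object and recover the object in the callback.
 * On pre-2.6.20 kernels the callback receives the object directly; on newer
 * kernels it receives the work_struct and container_of() is used.
 * my_task_t, my_worker() and my_task_dispatch() are hypothetical.
 */
typedef struct my_task {
        int                     mt_id;
        struct work_struct      mt_work;
} my_task_t;

static void
my_worker(void *data)
{
        my_task_t *mt = spl_get_work_data(data, my_task_t, mt_work);

        printk(KERN_INFO "processing task %d\n", mt->mt_id);
}

static void
my_task_dispatch(my_task_t *mt)
{
        spl_init_work(&mt->mt_work, my_worker, mt);
        schedule_work(&mt->mt_work);
}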
++\*****************************************************************************/ ++ ++#ifndef _SPL_ZLIB_COMPAT_H ++#define _SPL_ZLIB_COMPAT_H ++ ++#include ++ ++#ifdef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ++#define spl_zlib_deflate_workspacesize(wb, ml) \ ++ zlib_deflate_workspacesize(wb, ml) ++#else ++#define spl_zlib_deflate_workspacesize(wb, ml) \ ++ zlib_deflate_workspacesize() ++#endif /* HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */ ++ ++#endif /* SPL_ZLIB_COMPAT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/Makefile linux-3.2.33-go/include/spl/Makefile +--- linux-3.2.33-go.orig/include/spl/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/Makefile 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,515 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. ++# include/Makefile. Generated from Makefile.in by configure. ++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. ++ ++ ++ ++ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/spl ++pkgincludedir = $(includedir)/spl ++pkglibdir = $(libdir)/spl ++pkglibexecdir = $(libexecdir)/spl ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = x86_64-unknown-linux-gnu ++host_triplet = x86_64-unknown-linux-gnu ++target_triplet = x86_64-unknown-linux-gnu ++subdir = include ++DIST_COMMON = $(noinst_HEADERS) $(srcdir)/Makefile.am \ ++ $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = $(top_srcdir)/config/spl-build.m4 \ ++ $(top_srcdir)/config/spl-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/spl_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_$(V)) ++am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_$(V)) ++am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++HEADERS = $(noinst_HEADERS) ++ETAGS = etags ++CTAGS = ctags ++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++ACLOCAL = ${SHELL} 
/root/spl-0.6.0-rc12/config/missing --run aclocal-1.11 ++ALIEN = alien ++ALIEN_VERSION = ++AMTAR = $${TAR-tar} ++AM_DEFAULT_VERBOSITY = 1 ++AR = ar ++AUTOCONF = ${SHELL} /root/spl-0.6.0-rc12/config/missing --run autoconf ++AUTOHEADER = ${SHELL} /root/spl-0.6.0-rc12/config/missing --run autoheader ++AUTOMAKE = ${SHELL} /root/spl-0.6.0-rc12/config/missing --run automake-1.11 ++AWK = gawk ++CC = gcc ++CCDEPMODE = depmode=gcc3 ++CFLAGS = -g -O2 ++CPP = gcc -E ++CPPFLAGS = ++CYGPATH_W = echo ++DEBUG_CFLAGS = -DNDEBUG ++DEBUG_KMEM = _with_debug_kmem ++DEBUG_KMEM_TRACKING = _without_debug_kmem_tracking ++DEBUG_LOG = _with_debug_log ++DEBUG_SPL = _without_debug ++DEFAULT_PACKAGE = tgz ++DEFS = -DHAVE_CONFIG_H ++DEPDIR = .deps ++DLLTOOL = false ++DPKG = dpkg ++DPKGBUILD = dpkg-buildpackage ++DPKGBUILD_VERSION = ++DPKG_VERSION = ++DSYMUTIL = ++DUMPBIN = ++ECHO_C = ++ECHO_N = -n ++ECHO_T = ++EGREP = /bin/grep -E ++EXEEXT = ++FGREP = /bin/grep -F ++GREP = /bin/grep ++HAVE_ALIEN = no ++HAVE_DPKG = no ++HAVE_DPKGBUILD = no ++HAVE_MAKEPKG = ++HAVE_PACMAN = ++HAVE_RPM = yes ++HAVE_RPMBUILD = yes ++INSTALL = /usr/bin/install -c ++INSTALL_DATA = ${INSTALL} -m 644 ++INSTALL_PROGRAM = ${INSTALL} ++INSTALL_SCRIPT = ${INSTALL} ++INSTALL_STRIP_PROGRAM = $(install_sh) -c -s ++KERNELCPPFLAGS = -DHAVE_GPL_ONLY_SYMBOLS -Wstrict-prototypes -DNDEBUG -DDEBUG_LOG -DDEBUG_KMEM ++KERNELMAKE_PARAMS = O=/usr/src/linux-3.6.0-sabayon ++LD = /usr/x86_64-pc-linux-gnu/bin/ld -m elf_x86_64 ++LDFLAGS = ++LIBOBJS = ++LIBS = ++LIBTOOL = $(SHELL) $(top_builddir)/libtool ++LICENSE = GPL ++LINUX = /usr/src/linux-3.2.33-go ++LINUX_OBJ = /usr/src/linux-3.6.0-sabayon ++LINUX_SYMBOLS = NONE ++LINUX_VERSION = 3.6.0-sabayon ++LIPO = ++LN_S = ln -s ++LTLIBOBJS = ++MAINT = # ++MAKEINFO = ${SHELL} /root/spl-0.6.0-rc12/config/missing --run makeinfo ++MAKEPKG = ++MAKEPKG_VERSION = ++MANIFEST_TOOL = : ++MKDIR_P = /bin/mkdir -p ++NM = /usr/bin/nm -B ++NMEDIT = ++OBJDUMP = objdump ++OBJEXT = o ++OTOOL = ++OTOOL64 = ++PACKAGE = spl ++PACKAGE_BUGREPORT = ++PACKAGE_NAME = ++PACKAGE_STRING = ++PACKAGE_TARNAME = ++PACKAGE_URL = ++PACKAGE_VERSION = ++PACMAN = ++PACMAN_VERSION = ++PATH_SEPARATOR = : ++RANLIB = ranlib ++RPM = rpm ++RPMBUILD = rpmbuild ++RPMBUILD_VERSION = 4.10.0 ++RPM_VERSION = 4.10.0 ++SED = /bin/sed ++SET_MAKE = ++SHELL = /bin/sh ++SPL_CONFIG = all ++SPL_META_ALIAS = spl-0.6.0-rc12 ++SPL_META_AUTHOR = ++SPL_META_DATA = ++SPL_META_LT_AGE = ++SPL_META_LT_CURRENT = ++SPL_META_LT_REVISION = ++SPL_META_NAME = spl ++SPL_META_RELEASE = rc12 ++SPL_META_VERSION = 0.6.0 ++STRIP = strip ++VENDOR = gentoo ++VERSION = 0.6.0 ++abs_builddir = /root/spl-0.6.0-rc12/include ++abs_srcdir = /root/spl-0.6.0-rc12/include ++abs_top_builddir = /root/spl-0.6.0-rc12 ++abs_top_srcdir = /root/spl-0.6.0-rc12 ++ac_ct_AR = ar ++ac_ct_CC = gcc ++ac_ct_DUMPBIN = ++am__include = include ++am__leading_dot = . ++am__quote = ++am__tar = $${TAR-tar} chof - "$$tardir" ++am__untar = $${TAR-tar} xf - ++bindir = ${exec_prefix}/bin ++build = x86_64-unknown-linux-gnu ++build_alias = ++build_cpu = x86_64 ++build_os = linux-gnu ++build_vendor = unknown ++builddir = . 
++datadir = ${datarootdir} ++datarootdir = ${prefix}/share ++docdir = ${datarootdir}/doc/${PACKAGE} ++dvidir = ${docdir} ++exec_prefix = ${prefix} ++host = x86_64-unknown-linux-gnu ++host_alias = ++host_cpu = x86_64 ++host_os = linux-gnu ++host_vendor = unknown ++htmldir = ${docdir} ++includedir = ${prefix}/include ++infodir = ${datarootdir}/info ++install_sh = ${SHELL} /root/spl-0.6.0-rc12/config/install-sh ++libdir = ${exec_prefix}/lib ++libexecdir = ${exec_prefix}/libexec ++localedir = ${datarootdir}/locale ++localstatedir = ${prefix}/var ++mandir = ${datarootdir}/man ++mkdir_p = /bin/mkdir -p ++oldincludedir = /usr/include ++pdfdir = ${docdir} ++prefix = /usr/local ++program_transform_name = s,x,x, ++psdir = ${docdir} ++sbindir = ${exec_prefix}/sbin ++sharedstatedir = ${prefix}/com ++srcdir = . ++sysconfdir = ${prefix}/etc ++target = x86_64-unknown-linux-gnu ++target_alias = ++target_cpu = x86_64 ++target_os = linux-gnu ++target_vendor = unknown ++top_build_prefix = ../ ++top_builddir = .. ++top_srcdir = .. ++ ++# All headers are referenced by this top level Makefile.am are ++# noinst_HEADERS because they are not installed in the usual include ++# location. We do not want to be using $includedir for this. ++# Installation is handled by the custom install-data-local rule. ++noinst_HEADERS = $(top_srcdir)/include/*.h \ ++ $(top_srcdir)/include/fs/*.h $(top_srcdir)/include/linux/*.h \ ++ $(top_srcdir)/include/rpc/*.h \ ++ $(top_srcdir)/include/sharefs/*.h \ ++ $(top_srcdir)/include/sys/fm/*.h \ ++ $(top_srcdir)/include/sys/fs/*.h \ ++ $(top_srcdir)/include/sys/sysevent/*.h \ ++ $(top_srcdir)/include/sys/*.h $(top_srcdir)/include/util/*.h \ ++ $(top_srcdir)/include/vm/*.h ++all: all-am ++ ++.SUFFIXES: ++$(srcdir)/Makefile.in: # $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' 
in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: # $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): # $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: CTAGS ++CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-am ++all-am: Makefile $(HEADERS) ++installdirs: ++install: install-am ++install-exec: install-exec-am ++install-data: install-data-am ++uninstall: uninstall-am ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-am ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." ++clean: clean-am ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-am ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-am ++ ++dvi-am: ++ ++html: html-am ++ ++html-am: ++ ++info: info-am ++ ++info-am: ++ ++install-data-am: install-data-local ++ ++install-dvi: install-dvi-am ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-am ++ ++install-html-am: ++ ++install-info: install-info-am ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-am ++ ++install-pdf-am: ++ ++install-ps: install-ps-am ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-am ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-am ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-am ++ ++pdf-am: ++ ++ps: ps-am ++ ++ps-am: ++ ++uninstall-am: uninstall-local ++ ++.MAKE: install-am install-strip ++ ++.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ ++ clean-libtool ctags distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-data-local install-dvi install-dvi-am \ ++ install-exec install-exec-am install-html install-html-am \ ++ install-info install-info-am install-man install-pdf \ ++ install-pdf-am install-ps install-ps-am install-strip \ ++ installcheck installcheck-am installdirs maintainer-clean \ ++ maintainer-clean-generic mostlyclean mostlyclean-generic \ ++ mostlyclean-libtool pdf pdf-am ps ps-am tags uninstall \ ++ uninstall-am uninstall-local ++ ++ ++install-data-local: ++ release=$(SPL_META_VERSION)-$(SPL_META_RELEASE); \ ++ instdest=$(DESTDIR)/usr/src/spl-$$release/$(LINUX_VERSION); \ ++ instfiles=`find . 
-name '*.h'`; \ ++ for instfile in $$instfiles; do \ ++ $(INSTALL) -D $$instfile $$instdest/$$instfile; \ ++ done ++ ++uninstall-local: ++ release=$(SPL_META_VERSION)-$(SPL_META_RELEASE); \ ++ instdest=$(DESTDIR)/usr/src/spl-$$release/$(LINUX_VERSION); \ ++ $(RM) -R $$instdest ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. ++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/spl/Makefile.am linux-3.2.33-go/include/spl/Makefile.am +--- linux-3.2.33-go.orig/include/spl/Makefile.am 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/Makefile.am 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,28 @@ ++# All headers are referenced by this top level Makefile.am are ++# noinst_HEADERS because they are not installed in the usual include ++# location. We do not want to be using $includedir for this. ++# Installation is handled by the custom install-data-local rule. ++noinst_HEADERS = $(top_srcdir)/include/*.h ++noinst_HEADERS += $(top_srcdir)/include/fs/*.h ++noinst_HEADERS += $(top_srcdir)/include/linux/*.h ++noinst_HEADERS += $(top_srcdir)/include/rpc/*.h ++noinst_HEADERS += $(top_srcdir)/include/sharefs/*.h ++noinst_HEADERS += $(top_srcdir)/include/sys/fm/*.h ++noinst_HEADERS += $(top_srcdir)/include/sys/fs/*.h ++noinst_HEADERS += $(top_srcdir)/include/sys/sysevent/*.h ++noinst_HEADERS += $(top_srcdir)/include/sys/*.h ++noinst_HEADERS += $(top_srcdir)/include/util/*.h ++noinst_HEADERS += $(top_srcdir)/include/vm/*.h ++ ++install-data-local: ++ release=$(SPL_META_VERSION)-$(SPL_META_RELEASE); \ ++ instdest=$(DESTDIR)/usr/src/spl-$$release/$(LINUX_VERSION); \ ++ instfiles=`find . -name '*.h'`; \ ++ for instfile in $$instfiles; do \ ++ $(INSTALL) -D $$instfile $$instdest/$$instfile; \ ++ done ++ ++uninstall-local: ++ release=$(SPL_META_VERSION)-$(SPL_META_RELEASE); \ ++ instdest=$(DESTDIR)/usr/src/spl-$$release/$(LINUX_VERSION); \ ++ $(RM) -R $$instdest +diff -uNr linux-3.2.33-go.orig/include/spl/Makefile.in linux-3.2.33-go/include/spl/Makefile.in +--- linux-3.2.33-go.orig/include/spl/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/Makefile.in 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,515 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. ++# @configure_input@ ++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. 
++ ++@SET_MAKE@ ++ ++VPATH = @srcdir@ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/@PACKAGE@ ++pkgincludedir = $(includedir)/@PACKAGE@ ++pkglibdir = $(libdir)/@PACKAGE@ ++pkglibexecdir = $(libexecdir)/@PACKAGE@ ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = @build@ ++host_triplet = @host@ ++target_triplet = @target@ ++subdir = include ++DIST_COMMON = $(noinst_HEADERS) $(srcdir)/Makefile.am \ ++ $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = $(top_srcdir)/config/spl-build.m4 \ ++ $(top_srcdir)/config/spl-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/spl_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_@AM_V@) ++am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_@AM_V@) ++am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++HEADERS = $(noinst_HEADERS) ++ETAGS = etags ++CTAGS = ctags ++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++ACLOCAL = @ACLOCAL@ ++ALIEN = @ALIEN@ ++ALIEN_VERSION = @ALIEN_VERSION@ ++AMTAR = @AMTAR@ ++AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ++AR = @AR@ ++AUTOCONF = @AUTOCONF@ ++AUTOHEADER = @AUTOHEADER@ ++AUTOMAKE = @AUTOMAKE@ ++AWK = @AWK@ ++CC = @CC@ ++CCDEPMODE = @CCDEPMODE@ ++CFLAGS = @CFLAGS@ ++CPP = @CPP@ ++CPPFLAGS = @CPPFLAGS@ ++CYGPATH_W = @CYGPATH_W@ ++DEBUG_CFLAGS = @DEBUG_CFLAGS@ ++DEBUG_KMEM = @DEBUG_KMEM@ ++DEBUG_KMEM_TRACKING = @DEBUG_KMEM_TRACKING@ ++DEBUG_LOG = @DEBUG_LOG@ ++DEBUG_SPL = @DEBUG_SPL@ ++DEFAULT_PACKAGE = @DEFAULT_PACKAGE@ ++DEFS = @DEFS@ ++DEPDIR = @DEPDIR@ ++DLLTOOL = @DLLTOOL@ ++DPKG = @DPKG@ ++DPKGBUILD = @DPKGBUILD@ ++DPKGBUILD_VERSION = @DPKGBUILD_VERSION@ ++DPKG_VERSION = @DPKG_VERSION@ ++DSYMUTIL = @DSYMUTIL@ ++DUMPBIN = @DUMPBIN@ ++ECHO_C = @ECHO_C@ ++ECHO_N = @ECHO_N@ ++ECHO_T = @ECHO_T@ ++EGREP = @EGREP@ ++EXEEXT = @EXEEXT@ ++FGREP = @FGREP@ ++GREP = @GREP@ ++HAVE_ALIEN = @HAVE_ALIEN@ ++HAVE_DPKG = @HAVE_DPKG@ ++HAVE_DPKGBUILD = @HAVE_DPKGBUILD@ ++HAVE_MAKEPKG = @HAVE_MAKEPKG@ ++HAVE_PACMAN = @HAVE_PACMAN@ ++HAVE_RPM = @HAVE_RPM@ ++HAVE_RPMBUILD = @HAVE_RPMBUILD@ ++INSTALL = @INSTALL@ ++INSTALL_DATA = @INSTALL_DATA@ ++INSTALL_PROGRAM = @INSTALL_PROGRAM@ ++INSTALL_SCRIPT = @INSTALL_SCRIPT@ ++INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ ++KERNELCPPFLAGS = @KERNELCPPFLAGS@ ++KERNELMAKE_PARAMS = @KERNELMAKE_PARAMS@ ++LD = @LD@ ++LDFLAGS = @LDFLAGS@ ++LIBOBJS = @LIBOBJS@ ++LIBS = @LIBS@ ++LIBTOOL = @LIBTOOL@ ++LICENSE = @LICENSE@ ++LINUX = @LINUX@ 
++LINUX_OBJ = @LINUX_OBJ@ ++LINUX_SYMBOLS = @LINUX_SYMBOLS@ ++LINUX_VERSION = @LINUX_VERSION@ ++LIPO = @LIPO@ ++LN_S = @LN_S@ ++LTLIBOBJS = @LTLIBOBJS@ ++MAINT = @MAINT@ ++MAKEINFO = @MAKEINFO@ ++MAKEPKG = @MAKEPKG@ ++MAKEPKG_VERSION = @MAKEPKG_VERSION@ ++MANIFEST_TOOL = @MANIFEST_TOOL@ ++MKDIR_P = @MKDIR_P@ ++NM = @NM@ ++NMEDIT = @NMEDIT@ ++OBJDUMP = @OBJDUMP@ ++OBJEXT = @OBJEXT@ ++OTOOL = @OTOOL@ ++OTOOL64 = @OTOOL64@ ++PACKAGE = @PACKAGE@ ++PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ ++PACKAGE_NAME = @PACKAGE_NAME@ ++PACKAGE_STRING = @PACKAGE_STRING@ ++PACKAGE_TARNAME = @PACKAGE_TARNAME@ ++PACKAGE_URL = @PACKAGE_URL@ ++PACKAGE_VERSION = @PACKAGE_VERSION@ ++PACMAN = @PACMAN@ ++PACMAN_VERSION = @PACMAN_VERSION@ ++PATH_SEPARATOR = @PATH_SEPARATOR@ ++RANLIB = @RANLIB@ ++RPM = @RPM@ ++RPMBUILD = @RPMBUILD@ ++RPMBUILD_VERSION = @RPMBUILD_VERSION@ ++RPM_VERSION = @RPM_VERSION@ ++SED = @SED@ ++SET_MAKE = @SET_MAKE@ ++SHELL = @SHELL@ ++SPL_CONFIG = @SPL_CONFIG@ ++SPL_META_ALIAS = @SPL_META_ALIAS@ ++SPL_META_AUTHOR = @SPL_META_AUTHOR@ ++SPL_META_DATA = @SPL_META_DATA@ ++SPL_META_LT_AGE = @SPL_META_LT_AGE@ ++SPL_META_LT_CURRENT = @SPL_META_LT_CURRENT@ ++SPL_META_LT_REVISION = @SPL_META_LT_REVISION@ ++SPL_META_NAME = @SPL_META_NAME@ ++SPL_META_RELEASE = @SPL_META_RELEASE@ ++SPL_META_VERSION = @SPL_META_VERSION@ ++STRIP = @STRIP@ ++VENDOR = @VENDOR@ ++VERSION = @VERSION@ ++abs_builddir = @abs_builddir@ ++abs_srcdir = @abs_srcdir@ ++abs_top_builddir = @abs_top_builddir@ ++abs_top_srcdir = @abs_top_srcdir@ ++ac_ct_AR = @ac_ct_AR@ ++ac_ct_CC = @ac_ct_CC@ ++ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ ++am__include = @am__include@ ++am__leading_dot = @am__leading_dot@ ++am__quote = @am__quote@ ++am__tar = @am__tar@ ++am__untar = @am__untar@ ++bindir = @bindir@ ++build = @build@ ++build_alias = @build_alias@ ++build_cpu = @build_cpu@ ++build_os = @build_os@ ++build_vendor = @build_vendor@ ++builddir = @builddir@ ++datadir = @datadir@ ++datarootdir = @datarootdir@ ++docdir = @docdir@ ++dvidir = @dvidir@ ++exec_prefix = @exec_prefix@ ++host = @host@ ++host_alias = @host_alias@ ++host_cpu = @host_cpu@ ++host_os = @host_os@ ++host_vendor = @host_vendor@ ++htmldir = @htmldir@ ++includedir = @includedir@ ++infodir = @infodir@ ++install_sh = @install_sh@ ++libdir = @libdir@ ++libexecdir = @libexecdir@ ++localedir = @localedir@ ++localstatedir = @localstatedir@ ++mandir = @mandir@ ++mkdir_p = @mkdir_p@ ++oldincludedir = @oldincludedir@ ++pdfdir = @pdfdir@ ++prefix = @prefix@ ++program_transform_name = @program_transform_name@ ++psdir = @psdir@ ++sbindir = @sbindir@ ++sharedstatedir = @sharedstatedir@ ++srcdir = @srcdir@ ++sysconfdir = @sysconfdir@ ++target = @target@ ++target_alias = @target_alias@ ++target_cpu = @target_cpu@ ++target_os = @target_os@ ++target_vendor = @target_vendor@ ++top_build_prefix = @top_build_prefix@ ++top_builddir = @top_builddir@ ++top_srcdir = @top_srcdir@ ++ ++# All headers are referenced by this top level Makefile.am are ++# noinst_HEADERS because they are not installed in the usual include ++# location. We do not want to be using $includedir for this. ++# Installation is handled by the custom install-data-local rule. 
++noinst_HEADERS = $(top_srcdir)/include/*.h \ ++ $(top_srcdir)/include/fs/*.h $(top_srcdir)/include/linux/*.h \ ++ $(top_srcdir)/include/rpc/*.h \ ++ $(top_srcdir)/include/sharefs/*.h \ ++ $(top_srcdir)/include/sys/fm/*.h \ ++ $(top_srcdir)/include/sys/fs/*.h \ ++ $(top_srcdir)/include/sys/sysevent/*.h \ ++ $(top_srcdir)/include/sys/*.h $(top_srcdir)/include/util/*.h \ ++ $(top_srcdir)/include/vm/*.h ++all: all-am ++ ++.SUFFIXES: ++$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: CTAGS ++CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS 
GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-am ++all-am: Makefile $(HEADERS) ++installdirs: ++install: install-am ++install-exec: install-exec-am ++install-data: install-data-am ++uninstall: uninstall-am ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-am ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." 
++clean: clean-am ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-am ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-am ++ ++dvi-am: ++ ++html: html-am ++ ++html-am: ++ ++info: info-am ++ ++info-am: ++ ++install-data-am: install-data-local ++ ++install-dvi: install-dvi-am ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-am ++ ++install-html-am: ++ ++install-info: install-info-am ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-am ++ ++install-pdf-am: ++ ++install-ps: install-ps-am ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-am ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-am ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-am ++ ++pdf-am: ++ ++ps: ps-am ++ ++ps-am: ++ ++uninstall-am: uninstall-local ++ ++.MAKE: install-am install-strip ++ ++.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ ++ clean-libtool ctags distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-data-local install-dvi install-dvi-am \ ++ install-exec install-exec-am install-html install-html-am \ ++ install-info install-info-am install-man install-pdf \ ++ install-pdf-am install-ps install-ps-am install-strip \ ++ installcheck installcheck-am installdirs maintainer-clean \ ++ maintainer-clean-generic mostlyclean mostlyclean-generic \ ++ mostlyclean-libtool pdf pdf-am ps ps-am tags uninstall \ ++ uninstall-am uninstall-local ++ ++ ++install-data-local: ++ release=$(SPL_META_VERSION)-$(SPL_META_RELEASE); \ ++ instdest=$(DESTDIR)/usr/src/spl-$$release/$(LINUX_VERSION); \ ++ instfiles=`find . -name '*.h'`; \ ++ for instfile in $$instfiles; do \ ++ $(INSTALL) -D $$instfile $$instdest/$$instfile; \ ++ done ++ ++uninstall-local: ++ release=$(SPL_META_VERSION)-$(SPL_META_RELEASE); \ ++ instdest=$(DESTDIR)/usr/src/spl-$$release/$(LINUX_VERSION); \ ++ $(RM) -R $$instdest ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. ++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/spl/rpc/types.h linux-3.2.33-go/include/spl/rpc/types.h +--- linux-3.2.33-go.orig/include/spl/rpc/types.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/rpc/types.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,30 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. 
++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_RPC_TYPES_H ++#define _SPL_RPC_TYPES_H ++ ++typedef int bool_t; ++ ++#endif /* SPL_RPC_TYPES_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/rpc/xdr.h linux-3.2.33-go/include/spl/rpc/xdr.h +--- linux-3.2.33-go.orig/include/spl/rpc/xdr.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/rpc/xdr.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,155 @@ ++/*****************************************************************************\ ++ * Copyright (c) 2008 Sun Microsystems, Inc. ++ * Written by Ricardo Correia ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_RPC_XDR_H ++#define _SPL_RPC_XDR_H ++ ++#include ++#include ++ ++/* ++ * XDR enums and types. ++ */ ++enum xdr_op { ++ XDR_ENCODE, ++ XDR_DECODE ++}; ++ ++struct xdr_ops; ++ ++typedef struct { ++ struct xdr_ops *x_ops; /* Also used to let caller know if ++ xdrmem_create() succeeds (sigh..) */ ++ caddr_t x_addr; /* Current buffer addr */ ++ caddr_t x_addr_end; /* End of the buffer */ ++ enum xdr_op x_op; /* Stream direction */ ++} XDR; ++ ++typedef bool_t (*xdrproc_t)(XDR *xdrs, void *ptr); ++ ++struct xdr_ops { ++ bool_t (*xdr_control)(XDR *, int, void *); ++ ++ bool_t (*xdr_char)(XDR *, char *); ++ bool_t (*xdr_u_short)(XDR *, unsigned short *); ++ bool_t (*xdr_u_int)(XDR *, unsigned *); ++ bool_t (*xdr_u_longlong_t)(XDR *, u_longlong_t *); ++ ++ bool_t (*xdr_opaque)(XDR *, caddr_t, const uint_t); ++ bool_t (*xdr_string)(XDR *, char **, const uint_t); ++ bool_t (*xdr_array)(XDR *, caddr_t *, uint_t *, const uint_t, ++ const uint_t, const xdrproc_t); ++}; ++ ++/* ++ * XDR control operator. ++ */ ++#define XDR_GET_BYTES_AVAIL 1 ++ ++struct xdr_bytesrec { ++ bool_t xc_is_last_record; ++ size_t xc_num_avail; ++}; ++ ++/* ++ * XDR functions. ++ */ ++void xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, ++ const enum xdr_op op); ++#define xdr_destroy(xdrs) ((void) 0) /* Currently not needed. If needed later, ++ we'll add it to struct xdr_ops */ ++ ++#define xdr_control(xdrs, req, info) (xdrs)->x_ops->xdr_control((xdrs), \ ++ (req), (info)) ++ ++/* ++ * For precaution, the following are defined as static inlines instead of macros ++ * to get some amount of type safety. ++ * ++ * Also, macros wouldn't work in the case where typecasting is done, because it ++ * must be possible to reference the functions' addresses by these names. 
++ */ ++static inline bool_t xdr_char(XDR *xdrs, char *cp) ++{ ++ return xdrs->x_ops->xdr_char(xdrs, cp); ++} ++ ++static inline bool_t xdr_u_short(XDR *xdrs, unsigned short *usp) ++{ ++ return xdrs->x_ops->xdr_u_short(xdrs, usp); ++} ++ ++static inline bool_t xdr_short(XDR *xdrs, short *sp) ++{ ++ BUILD_BUG_ON(sizeof(short) != 2); ++ return xdrs->x_ops->xdr_u_short(xdrs, (unsigned short *) sp); ++} ++ ++static inline bool_t xdr_u_int(XDR *xdrs, unsigned *up) ++{ ++ return xdrs->x_ops->xdr_u_int(xdrs, up); ++} ++ ++static inline bool_t xdr_int(XDR *xdrs, int *ip) ++{ ++ BUILD_BUG_ON(sizeof(int) != 4); ++ return xdrs->x_ops->xdr_u_int(xdrs, (unsigned *) ip); ++} ++ ++static inline bool_t xdr_u_longlong_t(XDR *xdrs, u_longlong_t *ullp) ++{ ++ return xdrs->x_ops->xdr_u_longlong_t(xdrs, ullp); ++} ++ ++static inline bool_t xdr_longlong_t(XDR *xdrs, longlong_t *llp) ++{ ++ BUILD_BUG_ON(sizeof(longlong_t) != 8); ++ return xdrs->x_ops->xdr_u_longlong_t(xdrs, (u_longlong_t *) llp); ++} ++ ++/* ++ * Fixed-length opaque data. ++ */ ++static inline bool_t xdr_opaque(XDR *xdrs, caddr_t cp, const uint_t cnt) ++{ ++ return xdrs->x_ops->xdr_opaque(xdrs, cp, cnt); ++} ++ ++/* ++ * Variable-length string. ++ * The *sp buffer must have (maxsize + 1) bytes. ++ */ ++static inline bool_t xdr_string(XDR *xdrs, char **sp, const uint_t maxsize) ++{ ++ return xdrs->x_ops->xdr_string(xdrs, sp, maxsize); ++} ++ ++/* ++ * Variable-length arrays. ++ */ ++static inline bool_t xdr_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, ++ const uint_t maxsize, const uint_t elsize, const xdrproc_t elproc) ++{ ++ return xdrs->x_ops->xdr_array(xdrs, arrp, sizep, maxsize, elsize, ++ elproc); ++} ++ ++#endif /* SPL_RPC_XDR_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sharefs/share.h linux-3.2.33-go/include/spl/sharefs/share.h +--- linux-3.2.33-go.orig/include/spl/sharefs/share.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sharefs/share.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
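/*
 * Editorial illustration, not part of the patch: a round trip through the
 * miniature XDR stream API declared above.  A stream is bound to a caller
 * supplied buffer with xdrmem_create(); per the comment on x_ops, a NULL
 * x_ops pointer signals failure, and each xdr_*() helper returns nonzero on
 * success.  The buffer size and encoded value are arbitrary.
 */
static int
xdr_roundtrip_example(void)
{
        char buf[16];
        unsigned in = 42, out = 0;
        XDR xe, xd;

        xdrmem_create(&xe, buf, sizeof (buf), XDR_ENCODE);
        if (xe.x_ops == NULL || !xdr_u_int(&xe, &in))
                return (-1);

        xdrmem_create(&xd, buf, sizeof (buf), XDR_DECODE);
        if (xd.x_ops == NULL || !xdr_u_int(&xd, &out))
                return (-1);

        return (out == 42 ? 0 : -1);
}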
++\*****************************************************************************/ ++ ++#ifndef _SPL_SHARE_H ++#define _SPL_SHARE_H ++ ++#endif /* SPL_SHARE_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/splat-ctl.h linux-3.2.33-go/include/spl/splat-ctl.h +--- linux-3.2.33-go.orig/include/spl/splat-ctl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/splat-ctl.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,111 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPLAT_CTL_H ++#define _SPLAT_CTL_H ++ ++#include ++ ++/* ++ * Contains shared definitions for both user space and kernel space. To ++ * ensure 32-bit/64-bit interoperability over ioctl()'s only types with ++ * fixed sizes can be used. 
++ */ ++#define SPLAT_MAJOR 225 /* XXX - Arbitrary */ ++#define SPLAT_MINORS 1 ++#define SPLAT_NAME "splatctl" ++#define SPLAT_DEV "/dev/splatctl" ++ ++#define SPLAT_NAME_SIZE 20 ++#define SPLAT_DESC_SIZE 60 ++ ++typedef struct splat_user { ++ char name[SPLAT_NAME_SIZE]; /* Short name */ ++ char desc[SPLAT_DESC_SIZE]; /* Short description */ ++ __u32 id; /* Unique numeric id */ ++} splat_user_t; ++ ++#define SPLAT_CFG_MAGIC 0x15263748U ++typedef struct splat_cfg { ++ __u32 cfg_magic; /* Unique magic */ ++ __u32 cfg_cmd; /* Configure command */ ++ __s32 cfg_arg1; /* Configure command arg 1 */ ++ __s32 cfg_rc1; /* Configure response 1 */ ++ union { ++ struct { ++ __u32 size; ++ splat_user_t descs[0]; ++ } splat_subsystems; ++ struct { ++ __u32 size; ++ splat_user_t descs[0]; ++ } splat_tests; ++ } cfg_data; ++} splat_cfg_t; ++ ++#define SPLAT_CMD_MAGIC 0x9daebfc0U ++typedef struct splat_cmd { ++ __u32 cmd_magic; /* Unique magic */ ++ __u32 cmd_subsystem; /* Target subsystem */ ++ __u32 cmd_test; /* Subsystem test */ ++ __u32 cmd_data_size; /* Opaque data size */ ++ char cmd_data_str[0]; /* Opaque data region */ ++} splat_cmd_t; ++ ++/* Valid ioctls */ ++#define SPLAT_CFG _IOWR('f', 101, splat_cfg_t) ++#define SPLAT_CMD _IOWR('f', 102, splat_cmd_t) ++ ++/* Valid configuration commands */ ++#define SPLAT_CFG_BUFFER_CLEAR 0x001 /* Clear text buffer */ ++#define SPLAT_CFG_BUFFER_SIZE 0x002 /* Resize text buffer */ ++#define SPLAT_CFG_SUBSYSTEM_COUNT 0x101 /* Number of subsystem */ ++#define SPLAT_CFG_SUBSYSTEM_LIST 0x102 /* List of N subsystems */ ++#define SPLAT_CFG_TEST_COUNT 0x201 /* Number of tests */ ++#define SPLAT_CFG_TEST_LIST 0x202 /* List of N tests */ ++ ++/* ++ * Valid subsystem and test commands are defined in each subsystem as ++ * SPLAT_SUBSYSTEM_*. We do need to be careful to avoid collisions, the ++ * currently defined subsystems are as follows: ++ */ ++#define SPLAT_SUBSYSTEM_KMEM 0x0100 ++#define SPLAT_SUBSYSTEM_TASKQ 0x0200 ++#define SPLAT_SUBSYSTEM_KRNG 0x0300 ++#define SPLAT_SUBSYSTEM_MUTEX 0x0400 ++#define SPLAT_SUBSYSTEM_CONDVAR 0x0500 ++#define SPLAT_SUBSYSTEM_THREAD 0x0600 ++#define SPLAT_SUBSYSTEM_RWLOCK 0x0700 ++#define SPLAT_SUBSYSTEM_TIME 0x0800 ++#define SPLAT_SUBSYSTEM_VNODE 0x0900 ++#define SPLAT_SUBSYSTEM_KOBJ 0x0a00 ++#define SPLAT_SUBSYSTEM_ATOMIC 0x0b00 ++#define SPLAT_SUBSYSTEM_LIST 0x0c00 ++#define SPLAT_SUBSYSTEM_GENERIC 0x0d00 ++#define SPLAT_SUBSYSTEM_CRED 0x0e00 ++#define SPLAT_SUBSYSTEM_ZLIB 0x0f00 ++#define SPLAT_SUBSYSTEM_LINUX 0x1000 ++#define SPLAT_SUBSYSTEM_UNKNOWN 0xff00 ++ ++#endif /* _SPLAT_CTL_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/spl-ctl.h linux-3.2.33-go/include/spl/spl-ctl.h +--- linux-3.2.33-go.orig/include/spl/spl-ctl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/spl-ctl.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,45 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
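A small user-space sketch of the ioctl interface defined in splat-ctl.h above (illustrative, not part of the patch): it queries the number of registered subsystems through SPLAT_CFG. That the count is returned in cfg_rc1 is an assumption based on the "Configure response 1" field comment, and the header itself must be visible to user space for this to compile.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	splat_cfg_t cfg;
	int fd = open(SPLAT_DEV, O_RDWR);

	if (fd < 0)
		return 1;

	memset(&cfg, 0, sizeof (cfg));
	cfg.cfg_magic = SPLAT_CFG_MAGIC;
	cfg.cfg_cmd = SPLAT_CFG_SUBSYSTEM_COUNT;

	if (ioctl(fd, SPLAT_CFG, &cfg) == 0)
		printf("subsystems: %d\n", cfg.cfg_rc1);	/* count assumed in cfg_rc1 */

	close(fd);
	return 0;
}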
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _DEBUG_CTL_H ++#define _DEBUG_CTL_H ++ ++/* ++ * Contains shared definitions which both the user space ++ * and kernel space portions of splat must agree on. ++ */ ++typedef struct spl_debug_header { ++ int ph_len; ++ int ph_flags; ++ int ph_subsys; ++ int ph_mask; ++ int ph_cpu_id; ++ int ph_sec; ++ long ph_usec; ++ int ph_stack; ++ int ph_pid; ++ int ph_line_num; ++} spl_debug_header_t; ++ ++#endif /* _DEBUG_CTL_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/spl-debug.h linux-3.2.33-go/include/spl/spl-debug.h +--- linux-3.2.33-go.orig/include/spl/spl-debug.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/spl-debug.h 2012-11-16 23:22:32.402192954 +0100 +@@ -0,0 +1,276 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++/* ++ * Available debug functions. These function should be used by any ++ * package which needs to integrate with the SPL log infrastructure. ++ * ++ * SDEBUG() - Log debug message with specified mask. ++ * SDEBUG_LIMIT() - Log just 1 debug message with specified mask. ++ * SWARN() - Log a warning message. ++ * SERROR() - Log an error message. ++ * SEMERG() - Log an emergency error message. ++ * SCONSOLE() - Log a generic message to the console. ++ * ++ * SENTRY - Log entry point to a function. ++ * SEXIT - Log exit point from a function. ++ * SRETURN(x) - Log return from a function. ++ * SGOTO(x, y) - Log goto within a function. 
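++ *
++ * Illustrative usage sketch (assuming DEBUG_LOG is enabled; the example
++ * function below is hypothetical, not part of the original header):
++ *
++ *   static int example(int arg)
++ *   {
++ *           SENTRY;
++ *           if (arg < 0) {
++ *                   SWARN("bad arg %d\n", arg);
++ *                   SRETURN(-EINVAL);
++ *           }
++ *           SDEBUG(SD_INFO, "arg=%d\n", arg);
++ *           SRETURN(0);
++ *   }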
++ */ ++ ++#ifndef _SPL_DEBUG_INTERNAL_H ++#define _SPL_DEBUG_INTERNAL_H ++ ++#include ++#include ++ ++#define SS_UNDEFINED 0x00000001 ++#define SS_ATOMIC 0x00000002 ++#define SS_KOBJ 0x00000004 ++#define SS_VNODE 0x00000008 ++#define SS_TIME 0x00000010 ++#define SS_RWLOCK 0x00000020 ++#define SS_THREAD 0x00000040 ++#define SS_CONDVAR 0x00000080 ++#define SS_MUTEX 0x00000100 ++#define SS_RNG 0x00000200 ++#define SS_TASKQ 0x00000400 ++#define SS_KMEM 0x00000800 ++#define SS_DEBUG 0x00001000 ++#define SS_GENERIC 0x00002000 ++#define SS_PROC 0x00004000 ++#define SS_MODULE 0x00008000 ++#define SS_CRED 0x00010000 ++#define SS_KSTAT 0x00020000 ++#define SS_XDR 0x00040000 ++#define SS_TSD 0x00080000 ++#define SS_ZLIB 0x00100000 ++#define SS_USER1 0x01000000 ++#define SS_USER2 0x02000000 ++#define SS_USER3 0x04000000 ++#define SS_USER4 0x08000000 ++#define SS_USER5 0x10000000 ++#define SS_USER6 0x20000000 ++#define SS_USER7 0x40000000 ++#define SS_USER8 0x80000000 ++#define SS_DEBUG_SUBSYS SS_UNDEFINED ++ ++#define SD_TRACE 0x00000001 ++#define SD_INFO 0x00000002 ++#define SD_WARNING 0x00000004 ++#define SD_ERROR 0x00000008 ++#define SD_EMERG 0x00000010 ++#define SD_CONSOLE 0x00000020 ++#define SD_IOCTL 0x00000040 ++#define SD_DPRINTF 0x00000080 ++#define SD_OTHER 0x00000100 ++#define SD_CANTMASK (SD_ERROR | SD_EMERG | SD_WARNING | SD_CONSOLE) ++ ++/* Debug log support enabled */ ++#ifdef DEBUG_LOG ++ ++#define __SDEBUG(cdls, subsys, mask, format, a...) \ ++do { \ ++ if (((mask) & SD_CANTMASK) != 0 || \ ++ ((spl_debug_mask & (mask)) != 0 && \ ++ (spl_debug_subsys & (subsys)) != 0)) \ ++ spl_debug_msg(cdls, subsys, mask, __FILE__, \ ++ __FUNCTION__, __LINE__, format, ## a); \ ++} while (0) ++ ++#define SDEBUG(mask, format, a...) \ ++ __SDEBUG(NULL, SS_DEBUG_SUBSYS, mask, format, ## a) ++ ++#define __SDEBUG_LIMIT(subsys, mask, format, a...) \ ++do { \ ++ static spl_debug_limit_state_t cdls; \ ++ \ ++ __SDEBUG(&cdls, subsys, mask, format, ## a); \ ++} while (0) ++ ++#define SDEBUG_LIMIT(mask, format, a...) \ ++ __SDEBUG_LIMIT(SS_DEBUG_SUBSYS, mask, format, ## a) ++ ++#define SWARN(fmt, a...) SDEBUG_LIMIT(SD_WARNING, fmt, ## a) ++#define SERROR(fmt, a...) SDEBUG_LIMIT(SD_ERROR, fmt, ## a) ++#define SEMERG(fmt, a...) SDEBUG_LIMIT(SD_EMERG, fmt, ## a) ++#define SCONSOLE(mask, fmt, a...) 
SDEBUG(SD_CONSOLE | (mask), fmt, ## a) ++ ++#define SENTRY SDEBUG(SD_TRACE, "Process entered\n") ++#define SEXIT SDEBUG(SD_TRACE, "Process leaving\n") ++ ++#define SRETURN(rc) \ ++do { \ ++ typeof(rc) RETURN__ret = (rc); \ ++ SDEBUG(SD_TRACE, "Process leaving (rc=%lu : %ld : %lx)\n", \ ++ (long)RETURN__ret, (long)RETURN__ret, (long)RETURN__ret); \ ++ return RETURN__ret; \ ++} while (0) ++ ++#define SGOTO(label, rc) \ ++do { \ ++ long GOTO__ret = (long)(rc); \ ++ SDEBUG(SD_TRACE,"Process leaving via %s (rc=%lu : %ld : %lx)\n",\ ++ #label, (unsigned long)GOTO__ret, (signed long)GOTO__ret, \ ++ (signed long)GOTO__ret); \ ++ goto label; \ ++} while (0) ++ ++typedef struct { ++ unsigned long cdls_next; ++ int cdls_count; ++ long cdls_delay; ++} spl_debug_limit_state_t; ++ ++/* Global debug variables */ ++extern unsigned long spl_debug_subsys; ++extern unsigned long spl_debug_mask; ++extern unsigned long spl_debug_printk; ++extern int spl_debug_mb; ++extern unsigned int spl_debug_binary; ++extern unsigned int spl_debug_catastrophe; ++extern unsigned int spl_debug_panic_on_bug; ++extern char spl_debug_file_path[PATH_MAX]; ++extern unsigned int spl_console_ratelimit; ++extern long spl_console_max_delay; ++extern long spl_console_min_delay; ++extern unsigned int spl_console_backoff; ++extern unsigned int spl_debug_stack; ++ ++/* Exported debug functions */ ++extern int spl_debug_mask2str(char *str, int size, unsigned long mask, int ss); ++extern int spl_debug_str2mask(unsigned long *mask, const char *str, int ss); ++extern unsigned long spl_debug_set_mask(unsigned long mask); ++extern unsigned long spl_debug_get_mask(void); ++extern unsigned long spl_debug_set_subsys(unsigned long mask); ++extern unsigned long spl_debug_get_subsys(void); ++extern int spl_debug_set_mb(int mb); ++extern int spl_debug_get_mb(void); ++extern int spl_debug_dumplog(int flags); ++extern void spl_debug_dumpstack(struct task_struct *tsk); ++extern void spl_debug_bug(char *file, const char *fn, const int line, int fl); ++extern int spl_debug_msg(void *arg, int subsys, int mask, const char *file, ++ const char *fn, const int line, const char *format, ...); ++extern int spl_debug_clear_buffer(void); ++extern int spl_debug_mark_buffer(char *text); ++ ++int spl_debug_init(void); ++void spl_debug_fini(void); ++ ++/* Debug log support disabled */ ++#else /* DEBUG_LOG */ ++ ++#define __SDEBUG(x, y, mask, fmt, a...) ((void)0) ++#define SDEBUG(mask, fmt, a...) ((void)0) ++#define SDEBUG_LIMIT(x, y, fmt, a...) ((void)0) ++#define SWARN(fmt, a...) ((void)0) ++#define SERROR(fmt, a...) ((void)0) ++#define SEMERG(fmt, a...) ((void)0) ++#define SCONSOLE(mask, fmt, a...) 
((void)0) ++ ++#define SENTRY ((void)0) ++#define SEXIT ((void)0) ++#define SRETURN(x) return (x) ++#define SGOTO(x, y) { ((void)(y)); goto x; } ++ ++static inline unsigned long ++spl_debug_set_mask(unsigned long mask) { ++ return (0); ++} ++ ++static inline unsigned long ++spl_debug_get_mask(void) { ++ return (0); ++} ++ ++static inline unsigned long ++spl_debug_set_subsys(unsigned long mask) { ++ return (0); ++} ++ ++static inline unsigned long ++spl_debug_get_subsys(void) { ++ return (0); ++} ++ ++static inline int ++spl_debug_set_mb(int mb) { ++ return (0); ++} ++ ++static inline int ++spl_debug_get_mb(void) { ++ return (0); ++} ++ ++static inline int ++spl_debug_dumplog(int flags) ++{ ++ return (0); ++} ++ ++static inline void ++spl_debug_dumpstack(struct task_struct *tsk) ++{ ++ return; ++} ++ ++static inline void ++spl_debug_bug(char *file, const char *fn, const int line, int fl) ++{ ++ return; ++} ++ ++static inline int ++spl_debug_msg(void *arg, int subsys, int mask, const char *file, ++ const char *fn, const int line, const char *format, ...) ++{ ++ return (0); ++} ++ ++static inline int ++spl_debug_clear_buffer(void) ++{ ++ return (0); ++} ++ ++static inline int ++spl_debug_mark_buffer(char *text) ++{ ++ return (0); ++} ++ ++static inline int ++spl_debug_init(void) { ++ return (0); ++} ++ ++static inline void ++spl_debug_fini(void) { ++ return; ++} ++ ++#endif /* DEBUG_LOG */ ++ ++#endif /* SPL_DEBUG_INTERNAL_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/spl-device.h linux-3.2.33-go/include/spl/spl-device.h +--- linux-3.2.33-go.orig/include/spl/spl-device.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/spl-device.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,90 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_DEVICE_H ++#define _SPL_DEVICE_H ++ ++#include ++ ++/* ++ * Preferred API from 2.6.18 to 2.6.26+ ++ */ ++#ifdef HAVE_DEVICE_CREATE ++ ++typedef struct class spl_class; ++typedef struct device spl_device; ++ ++#define spl_class_create(mod, name) class_create(mod, name) ++#define spl_class_destroy(cls) class_destroy(cls) ++ ++# ifdef HAVE_5ARGS_DEVICE_CREATE ++# define spl_device_create(cls, parent, devt, drvdata, fmt, args...) \ ++ device_create(cls, parent, devt, drvdata, fmt, ## args) ++# else ++# define spl_device_create(cls, parent, devt, drvdata, fmt, args...) 
\ ++ device_create(cls, parent, devt, fmt, ## args) ++# endif ++ ++#define spl_device_destroy(cls, cls_dev, devt) \ ++ device_destroy(cls, devt) ++ ++/* ++ * Preferred API from 2.6.13 to 2.6.17 ++ * Depricated in 2.6.18 ++ * Removed in 2.6.26 ++ */ ++#else ++#ifdef HAVE_CLASS_DEVICE_CREATE ++ ++typedef struct class spl_class; ++typedef struct class_device spl_device; ++ ++#define spl_class_create(mod, name) class_create(mod, name) ++#define spl_class_destroy(cls) class_destroy(cls) ++#define spl_device_create(cls, parent, devt, device, fmt, args...) \ ++ class_device_create(cls, parent, devt, device, fmt, ## args) ++#define spl_device_destroy(cls, cls_dev, devt) \ ++ class_device_unregister(cls_dev) ++ ++/* ++ * Prefered API from 2.6.0 to 2.6.12 ++ * Depricated in 2.6.13 ++ * Removed in 2.6.13 ++ */ ++#else /* Legacy API */ ++ ++typedef struct class_simple spl_class; ++typedef struct class_device spl_class_device; ++ ++#define spl_class_create(mod, name) class_simple_create(mod, name) ++#define spl_class_destroy(cls) class_simple_destroy(cls) ++#define spl_device_create(cls, parent, devt, device, fmt, args...) \ ++ class_simple_device_add(cls, devt, device, fmt, ## args) ++#define spl_device_destroy(cls, cls_dev, devt) \ ++ class_simple_device_remove(devt) ++ ++#endif /* HAVE_CLASS_DEVICE_CREATE */ ++#endif /* HAVE_DEVICE_CREATE */ ++ ++#endif /* _SPL_DEVICE_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/spl-trace.h linux-3.2.33-go/include/spl/spl-trace.h +--- linux-3.2.33-go.orig/include/spl/spl-trace.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/spl-trace.h 2012-11-16 23:22:32.402192954 +0100 +@@ -0,0 +1,132 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
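The spl-device.h wrappers above exist so one call site works across the 2.6.x device-model changes. A rough module-init sketch, not part of the patch; the "example" names, the MKDEV parameters, and the IS_ERR-based error check are assumptions rather than anything the patch prescribes.

static spl_class *example_class;
static spl_device *example_dev;

static int __init example_init(void)
{
	example_class = spl_class_create(THIS_MODULE, "example");
	if (IS_ERR(example_class))
		return PTR_ERR(example_class);

	/* Resolves to device_create()/class_device_create()/... as available. */
	example_dev = spl_device_create(example_class, NULL,
	    MKDEV(SPLAT_MAJOR, 0), NULL, "example%d", 0);
	return 0;
}

static void __exit example_exit(void)
{
	spl_device_destroy(example_class, example_dev, MKDEV(SPLAT_MAJOR, 0));
	spl_class_destroy(example_class);
}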
++\*****************************************************************************/ ++ ++#ifndef _SPL_TRACE_H ++#define _SPL_TRACE_H ++ ++#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT)) ++#define TCD_STOCK_PAGES (TCD_MAX_PAGES) ++#define TRACE_CONSOLE_BUFFER_SIZE 1024 ++ ++#define SPL_DEFAULT_MAX_DELAY (600 * HZ) ++#define SPL_DEFAULT_MIN_DELAY ((HZ + 1) / 2) ++#define SPL_DEFAULT_BACKOFF 2 ++ ++#define DL_NOTHREAD 0x0001 /* Do not create a new thread */ ++#define DL_SINGLE_CPU 0x0002 /* Collect pages from this CPU*/ ++ ++typedef struct dumplog_priv { ++ wait_queue_head_t dp_waitq; ++ pid_t dp_pid; ++ int dp_flags; ++ atomic_t dp_done; ++} dumplog_priv_t; ++ ++/* Three trace data types */ ++typedef enum { ++ TCD_TYPE_PROC, ++ TCD_TYPE_SOFTIRQ, ++ TCD_TYPE_IRQ, ++ TCD_TYPE_MAX ++} tcd_type_t; ++ ++union trace_data_union { ++ struct trace_cpu_data { ++ /* pages with trace records not yet processed by tracefiled */ ++ struct list_head tcd_pages; ++ /* number of pages on ->tcd_pages */ ++ unsigned long tcd_cur_pages; ++ /* Max number of pages allowed on ->tcd_pages */ ++ unsigned long tcd_max_pages; ++ ++ /* ++ * preallocated pages to write trace records into. Pages from ++ * ->tcd_stock_pages are moved to ->tcd_pages by spl_debug_msg(). ++ * ++ * This list is necessary, because on some platforms it's ++ * impossible to perform efficient atomic page allocation in a ++ * non-blockable context. ++ * ++ * Such platforms fill ->tcd_stock_pages "on occasion", when ++ * tracing code is entered in blockable context. ++ * ++ * trace_get_tage_try() tries to get a page from ++ * ->tcd_stock_pages first and resorts to atomic page ++ * allocation only if this queue is empty. ->tcd_stock_pages ++ * is replenished when tracing code is entered in blocking ++ * context (darwin-tracefile.c:trace_get_tcd()). We try to ++ * maintain TCD_STOCK_PAGES (40 by default) pages in this ++ * queue. Atomic allocation is only required if more than ++ * TCD_STOCK_PAGES pagesful are consumed by trace records all ++ * emitted in non-blocking contexts. Which is quite unlikely. ++ */ ++ struct list_head tcd_stock_pages; ++ /* number of pages on ->tcd_stock_pages */ ++ unsigned long tcd_cur_stock_pages; ++ ++ unsigned short tcd_shutting_down; ++ unsigned short tcd_cpu; ++ unsigned short tcd_type; ++ /* The factors to share debug memory. */ ++ unsigned short tcd_pages_factor; ++ ++ /* ++ * This spinlock is needed to workaround the problem of ++ * set_cpus_allowed() being GPL-only. Since we cannot ++ * schedule a thread on a specific CPU when dumping the ++ * pages, we must use the spinlock for mutual exclusion. 
++ */ ++ spinlock_t tcd_lock; ++ unsigned long tcd_lock_flags; ++ } tcd; ++ char __pad[L1_CACHE_ALIGN(sizeof(struct trace_cpu_data))]; ++}; ++ ++extern union trace_data_union (*trace_data[TCD_TYPE_MAX])[NR_CPUS]; ++ ++#define tcd_for_each(tcd, i, j) \ ++ for (i = 0; i < TCD_TYPE_MAX && trace_data[i]; i++) \ ++ for (j = 0, ((tcd) = &(*trace_data[i])[j].tcd); \ ++ j < num_possible_cpus(); j++, (tcd) = &(*trace_data[i])[j].tcd) ++ ++#define tcd_for_each_type_lock(tcd, i, cpu) \ ++ for (i = 0; i < TCD_TYPE_MAX && trace_data[i] && \ ++ (tcd = &(*trace_data[i])[cpu].tcd) && \ ++ trace_lock_tcd(tcd); trace_unlock_tcd(tcd), i++) ++ ++struct trace_page { ++ struct page *page; /* page itself */ ++ struct list_head linkage; /* Used by trace_data_union */ ++ unsigned int used; /* number of bytes used within this page */ ++ unsigned short cpu; /* cpu that owns this page */ ++ unsigned short type; /* type(context) of this page */ ++}; ++ ++struct page_collection { ++ struct list_head pc_pages; ++ spinlock_t pc_lock; ++ int pc_want_daemon_pages; ++}; ++ ++#endif /* SPL_TRACE_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/strings.h linux-3.2.33-go/include/spl/strings.h +--- linux-3.2.33-go.orig/include/spl/strings.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/strings.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_STRINGS_H ++#define _SPL_STRINGS_H ++ ++#endif /* SPL_STRINGS_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/acl.h linux-3.2.33-go/include/spl/sys/acl.h +--- linux-3.2.33-go.orig/include/spl/sys/acl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/acl.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,117 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_ACL_H ++#define _SPL_ACL_H ++ ++#include ++ ++typedef struct ace { ++ uid_t a_who; ++ uint32_t a_access_mask; ++ uint16_t a_flags; ++ uint16_t a_type; ++} ace_t; ++ ++typedef struct ace_object { ++ uid_t a_who; /* uid or gid */ ++ uint32_t a_access_mask; /* read,write,... */ ++ uint16_t a_flags; /* see below */ ++ uint16_t a_type; /* allow or deny */ ++ uint8_t a_obj_type[16]; /* obj type */ ++ uint8_t a_inherit_obj_type[16]; /* inherit obj */ ++} ace_object_t; ++ ++#define MAX_ACL_ENTRIES 1024 ++ ++#define ACE_READ_DATA 0x00000001 ++#define ACE_LIST_DIRECTORY 0x00000001 ++#define ACE_WRITE_DATA 0x00000002 ++#define ACE_ADD_FILE 0x00000002 ++#define ACE_APPEND_DATA 0x00000004 ++#define ACE_ADD_SUBDIRECTORY 0x00000004 ++#define ACE_READ_NAMED_ATTRS 0x00000008 ++#define ACE_WRITE_NAMED_ATTRS 0x00000010 ++#define ACE_EXECUTE 0x00000020 ++#define ACE_DELETE_CHILD 0x00000040 ++#define ACE_READ_ATTRIBUTES 0x00000080 ++#define ACE_WRITE_ATTRIBUTES 0x00000100 ++#define ACE_DELETE 0x00010000 ++#define ACE_READ_ACL 0x00020000 ++#define ACE_WRITE_ACL 0x00040000 ++#define ACE_WRITE_OWNER 0x00080000 ++#define ACE_SYNCHRONIZE 0x00100000 ++ ++#define ACE_FILE_INHERIT_ACE 0x0001 ++#define ACE_DIRECTORY_INHERIT_ACE 0x0002 ++#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 ++#define ACE_INHERIT_ONLY_ACE 0x0008 ++#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 ++#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 ++#define ACE_IDENTIFIER_GROUP 0x0040 ++#define ACE_INHERITED_ACE 0x0080 ++#define ACE_OWNER 0x1000 ++#define ACE_GROUP 0x2000 ++#define ACE_EVERYONE 0x4000 ++ ++#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 ++#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 ++#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 ++#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 ++ ++#define ACL_AUTO_INHERIT 0x0001 ++#define ACL_PROTECTED 0x0002 ++#define ACL_DEFAULTED 0x0004 ++#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED|ACL_DEFAULTED) ++ ++#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 ++#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 ++#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 ++#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 ++#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 ++#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 ++#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A ++#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B ++#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C ++#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D ++#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E ++#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F ++#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 ++ ++#define ACE_ALL_TYPES 0x001F ++ ++#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE|ACE_IDENTIFIER_GROUP) ++ ++#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ++ 
ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ ++ ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ ++ ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ ++ ACE_WRITE_OWNER|ACE_SYNCHRONIZE) ++ ++#define VSA_ACE 0x0010 ++#define VSA_ACECNT 0x0020 ++#define VSA_ACE_ALLTYPES 0x0040 ++#define VSA_ACE_ACLFLAGS 0x0080 ++ ++#endif /* _SPL_ACL_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/acl_impl.h linux-3.2.33-go/include/spl/sys/acl_impl.h +--- linux-3.2.33-go.orig/include/spl/sys/acl_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/acl_impl.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_ACL_IMPL_H ++#define _SPL_ACL_IMPL_H ++ ++#endif /* _SPL_ACL_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/atomic.h linux-3.2.33-go/include/spl/sys/atomic.h +--- linux-3.2.33-go.orig/include/spl/sys/atomic.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/atomic.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,296 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
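For orientation, one entry built from the ace_t structure and ACE_* constants defined in sys/acl.h above: an allow entry granting basic read access to everyone. This is illustrative only; treating a_who as unused for ACE_EVERYONE entries is an assumption, not something the header states.

static const ace_t everyone_read = {
	.a_who		= 0,	/* assumed ignored when ACE_EVERYONE is set */
	.a_access_mask	= ACE_READ_DATA | ACE_READ_ATTRIBUTES | ACE_READ_ACL,
	.a_flags	= ACE_EVERYONE,
	.a_type		= ACE_ACCESS_ALLOWED_ACE_TYPE,
};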
++\*****************************************************************************/ ++ ++#ifndef _SPL_ATOMIC_H ++#define _SPL_ATOMIC_H ++ ++#include ++#include ++#include ++ ++#ifndef HAVE_ATOMIC64_CMPXCHG ++#define atomic64_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n))) ++#endif ++ ++#ifndef HAVE_ATOMIC64_XCHG ++#define atomic64_xchg(v, n) (xchg(&((v)->counter), n)) ++#endif ++ ++/* ++ * Two approaches to atomic operations are implemented each with its ++ * own benefits are drawbacks imposed by the Solaris API. Neither ++ * approach handles the issue of word breaking when using a 64-bit ++ * atomic variable on a 32-bit arch. The Solaris API would need to ++ * add an atomic read call to correctly support this. ++ * ++ * When ATOMIC_SPINLOCK is defined all atomic operations will be ++ * serialized through global spin locks. This is bad for performance ++ * but it does allow a simple generic implementation. ++ * ++ * When ATOMIC_SPINLOCK is not defined the Linux atomic operations ++ * are used. This is safe as long as the core Linux implementation ++ * doesn't change because we are relying on the fact that an atomic ++ * type is really just a uint32 or uint64. If this changes at some ++ * point in the future we need to fall-back to the spin approach. ++ */ ++#ifdef ATOMIC_SPINLOCK ++extern spinlock_t atomic32_lock; ++extern spinlock_t atomic64_lock; ++ ++static __inline__ void ++atomic_inc_32(volatile uint32_t *target) ++{ ++ spin_lock(&atomic32_lock); ++ (*target)++; ++ spin_unlock(&atomic32_lock); ++} ++ ++static __inline__ void ++atomic_dec_32(volatile uint32_t *target) ++{ ++ spin_lock(&atomic32_lock); ++ (*target)--; ++ spin_unlock(&atomic32_lock); ++} ++ ++static __inline__ void ++atomic_add_32(volatile uint32_t *target, int32_t delta) ++{ ++ spin_lock(&atomic32_lock); ++ *target += delta; ++ spin_unlock(&atomic32_lock); ++} ++ ++static __inline__ void ++atomic_sub_32(volatile uint32_t *target, int32_t delta) ++{ ++ spin_lock(&atomic32_lock); ++ *target -= delta; ++ spin_unlock(&atomic32_lock); ++} ++ ++static __inline__ uint32_t ++atomic_inc_32_nv(volatile uint32_t *target) ++{ ++ uint32_t nv; ++ ++ spin_lock(&atomic32_lock); ++ nv = ++(*target); ++ spin_unlock(&atomic32_lock); ++ ++ return nv; ++} ++ ++static __inline__ uint32_t ++atomic_dec_32_nv(volatile uint32_t *target) ++{ ++ uint32_t nv; ++ ++ spin_lock(&atomic32_lock); ++ nv = --(*target); ++ spin_unlock(&atomic32_lock); ++ ++ return nv; ++} ++ ++static __inline__ uint32_t ++atomic_add_32_nv(volatile uint32_t *target, uint32_t delta) ++{ ++ uint32_t nv; ++ ++ spin_lock(&atomic32_lock); ++ *target += delta; ++ nv = *target; ++ spin_unlock(&atomic32_lock); ++ ++ return nv; ++} ++ ++static __inline__ uint32_t ++atomic_sub_32_nv(volatile uint32_t *target, uint32_t delta) ++{ ++ uint32_t nv; ++ ++ spin_lock(&atomic32_lock); ++ *target -= delta; ++ nv = *target; ++ spin_unlock(&atomic32_lock); ++ ++ return nv; ++} ++ ++static __inline__ uint32_t ++atomic_cas_32(volatile uint32_t *target, uint32_t cmp, ++ uint32_t newval) ++{ ++ uint32_t rc; ++ ++ spin_lock(&atomic32_lock); ++ rc = *target; ++ if (*target == cmp) ++ *target = newval; ++ ++ spin_unlock(&atomic32_lock); ++ ++ return rc; ++} ++ ++static __inline__ void ++atomic_inc_64(volatile uint64_t *target) ++{ ++ spin_lock(&atomic64_lock); ++ (*target)++; ++ spin_unlock(&atomic64_lock); ++} ++ ++static __inline__ void ++atomic_dec_64(volatile uint64_t *target) ++{ ++ spin_lock(&atomic64_lock); ++ (*target)--; ++ spin_unlock(&atomic64_lock); ++} ++ ++static __inline__ void 
++atomic_add_64(volatile uint64_t *target, uint64_t delta) ++{ ++ spin_lock(&atomic64_lock); ++ *target += delta; ++ spin_unlock(&atomic64_lock); ++} ++ ++static __inline__ void ++atomic_sub_64(volatile uint64_t *target, uint64_t delta) ++{ ++ spin_lock(&atomic64_lock); ++ *target -= delta; ++ spin_unlock(&atomic64_lock); ++} ++ ++static __inline__ uint64_t ++atomic_inc_64_nv(volatile uint64_t *target) ++{ ++ uint64_t nv; ++ ++ spin_lock(&atomic64_lock); ++ nv = ++(*target); ++ spin_unlock(&atomic64_lock); ++ ++ return nv; ++} ++ ++static __inline__ uint64_t ++atomic_dec_64_nv(volatile uint64_t *target) ++{ ++ uint64_t nv; ++ ++ spin_lock(&atomic64_lock); ++ nv = --(*target); ++ spin_unlock(&atomic64_lock); ++ ++ return nv; ++} ++ ++static __inline__ uint64_t ++atomic_add_64_nv(volatile uint64_t *target, uint64_t delta) ++{ ++ uint64_t nv; ++ ++ spin_lock(&atomic64_lock); ++ *target += delta; ++ nv = *target; ++ spin_unlock(&atomic64_lock); ++ ++ return nv; ++} ++ ++static __inline__ uint64_t ++atomic_sub_64_nv(volatile uint64_t *target, uint64_t delta) ++{ ++ uint64_t nv; ++ ++ spin_lock(&atomic64_lock); ++ *target -= delta; ++ nv = *target; ++ spin_unlock(&atomic64_lock); ++ ++ return nv; ++} ++ ++static __inline__ uint64_t ++atomic_cas_64(volatile uint64_t *target, uint64_t cmp, ++ uint64_t newval) ++{ ++ uint64_t rc; ++ ++ spin_lock(&atomic64_lock); ++ rc = *target; ++ if (*target == cmp) ++ *target = newval; ++ spin_unlock(&atomic64_lock); ++ ++ return rc; ++} ++ ++ ++#else /* ATOMIC_SPINLOCK */ ++ ++#define atomic_inc_32(v) atomic_inc((atomic_t *)(v)) ++#define atomic_dec_32(v) atomic_dec((atomic_t *)(v)) ++#define atomic_add_32(v, i) atomic_add((i), (atomic_t *)(v)) ++#define atomic_sub_32(v, i) atomic_sub((i), (atomic_t *)(v)) ++#define atomic_inc_32_nv(v) atomic_inc_return((atomic_t *)(v)) ++#define atomic_dec_32_nv(v) atomic_dec_return((atomic_t *)(v)) ++#define atomic_add_32_nv(v, i) atomic_add_return((i), (atomic_t *)(v)) ++#define atomic_sub_32_nv(v, i) atomic_sub_return((i), (atomic_t *)(v)) ++#define atomic_cas_32(v, x, y) atomic_cmpxchg((atomic_t *)(v), x, y) ++#define atomic_inc_64(v) atomic64_inc((atomic64_t *)(v)) ++#define atomic_dec_64(v) atomic64_dec((atomic64_t *)(v)) ++#define atomic_add_64(v, i) atomic64_add((i), (atomic64_t *)(v)) ++#define atomic_sub_64(v, i) atomic64_sub((i), (atomic64_t *)(v)) ++#define atomic_inc_64_nv(v) atomic64_inc_return((atomic64_t *)(v)) ++#define atomic_dec_64_nv(v) atomic64_dec_return((atomic64_t *)(v)) ++#define atomic_add_64_nv(v, i) atomic64_add_return((i), (atomic64_t *)(v)) ++#define atomic_sub_64_nv(v, i) atomic64_sub_return((i), (atomic64_t *)(v)) ++#define atomic_cas_64(v, x, y) atomic64_cmpxchg((atomic64_t *)(v), x, y) ++ ++#endif /* ATOMIC_SPINLOCK */ ++ ++#ifdef _LP64 ++static __inline__ void * ++atomic_cas_ptr(volatile void *target, void *cmp, void *newval) ++{ ++ return (void *)atomic_cas_64((volatile uint64_t *)target, ++ (uint64_t)cmp, (uint64_t)newval); ++} ++#else /* _LP64 */ ++static __inline__ void * ++atomic_cas_ptr(volatile void *target, void *cmp, void *newval) ++{ ++ return (void *)atomic_cas_32((volatile uint32_t *)target, ++ (uint32_t)cmp, (uint32_t)newval); ++} ++#endif /* _LP64 */ ++ ++#endif /* _SPL_ATOMIC_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/attr.h linux-3.2.33-go/include/spl/sys/attr.h +--- linux-3.2.33-go.orig/include/spl/sys/attr.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/attr.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ 
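A short sketch of the Solaris-style atomics declared in sys/atomic.h above; it behaves the same whether the ATOMIC_SPINLOCK fallback or the native Linux atomic mapping is compiled in. The counter and function names are illustrative, not from the patch.

static uint64_t example_refs;

static uint64_t take_ref(void)
{
	/* The *_nv variants return the new value after the operation. */
	return atomic_inc_64_nv(&example_refs);
}

static int try_claim(volatile uint32_t *flag)
{
	/* Compare-and-swap 0 -> 1; seeing the old value 0 means we won. */
	return (atomic_cas_32(flag, 0, 1) == 0);
}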
++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_ATTR_H ++#define _SPL_ATTR_H ++ ++#endif /* SPL_ATTR_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/bitmap.h linux-3.2.33-go/include/spl/sys/bitmap.h +--- linux-3.2.33-go.orig/include/spl/sys/bitmap.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/bitmap.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_BITMAP_H ++#define _SPL_BITMAP_H ++ ++#endif /* SPL_BITMAP_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/bootconf.h linux-3.2.33-go/include/spl/sys/bootconf.h +--- linux-3.2.33-go.orig/include/spl/sys/bootconf.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/bootconf.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_BOOTCONF_H ++#define _SPL_BOOTCONF_H ++ ++#endif /* SPL_BOOTCONF_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/bootprops.h linux-3.2.33-go/include/spl/sys/bootprops.h +--- linux-3.2.33-go.orig/include/spl/sys/bootprops.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/bootprops.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_BOOTPROPS_H ++#define _SPL_BOOTPROPS_H ++ ++#endif /* SPL_BOOTPROPS_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/buf.h linux-3.2.33-go/include/spl/sys/buf.h +--- linux-3.2.33-go.orig/include/spl/sys/buf.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/buf.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_BUF_H ++#define _SPL_BUF_H ++ ++#endif /* SPL_BUF_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/byteorder.h linux-3.2.33-go/include/spl/sys/byteorder.h +--- linux-3.2.33-go.orig/include/spl/sys/byteorder.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/byteorder.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,37 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_BYTEORDER_H ++#define _SPL_BYTEORDER_H ++ ++#include ++ ++#define LE_16(x) cpu_to_le16(x) ++#define LE_32(x) cpu_to_le32(x) ++#define LE_64(x) cpu_to_le64(x) ++#define BE_16(x) cpu_to_be16(x) ++#define BE_32(x) cpu_to_be32(x) ++#define BE_64(x) cpu_to_be64(x) ++ ++#endif /* SPL_BYTEORDER_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/callb.h linux-3.2.33-go/include/spl/sys/callb.h +--- linux-3.2.33-go.orig/include/spl/sys/callb.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/callb.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,55 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_CALLB_H ++#define _SPL_CALLB_H ++ ++#include ++#include ++ ++#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp)); ++ ++typedef struct callb_cpr { ++ kmutex_t *cc_lockp; ++} callb_cpr_t; ++ ++#define CALLB_CPR_INIT(cp, lockp, func, name) { \ ++ (cp)->cc_lockp = lockp; \ ++} ++ ++#define CALLB_CPR_SAFE_BEGIN(cp) { \ ++ CALLB_CPR_ASSERT(cp); \ ++} ++ ++#define CALLB_CPR_SAFE_END(cp, lockp) { \ ++ CALLB_CPR_ASSERT(cp); \ ++} ++ ++#define CALLB_CPR_EXIT(cp) { \ ++ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ ++ mutex_exit((cp)->cc_lockp); \ ++} ++ ++#endif /* _SPL_CALLB_H */ ++ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/cmn_err.h linux-3.2.33-go/include/spl/sys/cmn_err.h +--- linux-3.2.33-go.orig/include/spl/sys/cmn_err.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/cmn_err.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,42 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_CMN_ERR_H ++#define _SPL_CMN_ERR_H ++ ++#include ++ ++#define CE_CONT 0 /* continuation */ ++#define CE_NOTE 1 /* notice */ ++#define CE_WARN 2 /* warning */ ++#define CE_PANIC 3 /* panic */ ++#define CE_IGNORE 4 /* print nothing */ ++ ++extern void cmn_err(int, const char *, ...); ++extern void vcmn_err(int, const char *, __va_list); ++extern void vpanic(const char *, __va_list); ++ ++#define fm_panic panic ++ ++#endif /* SPL_CMN_ERR_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/compress.h linux-3.2.33-go/include/spl/sys/compress.h +--- linux-3.2.33-go.orig/include/spl/sys/compress.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/compress.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
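A one-function sketch of the Solaris logging shim declared in cmn_err.h above; the message text and the value parameter are illustrative.

static void warn_bad_value(int value)
{
	/* CE_WARN selects the severity; the format string is printf-like. */
	cmn_err(CE_WARN, "example: unexpected value %d, using default", value);
}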
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_COMPRESS_H ++#define _SPL_COMPRESS_H ++ ++#endif /* SPL_COMPRESS_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/condvar.h linux-3.2.33-go/include/spl/sys/condvar.h +--- linux-3.2.33-go.orig/include/spl/sys/condvar.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/condvar.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,71 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_CONDVAR_H ++#define _SPL_CONDVAR_H ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * The kcondvar_t struct is protected by mutex taken externally before ++ * calling any of the wait/signal funs, and passed into the wait funs. 
++ */ ++#define CV_MAGIC 0x346545f4 ++#define CV_DESTROY 0x346545f5 ++ ++typedef struct { ++ int cv_magic; ++ wait_queue_head_t cv_event; ++ wait_queue_head_t cv_destroy; ++ atomic_t cv_refs; ++ atomic_t cv_waiters; ++ kmutex_t *cv_mutex; ++} kcondvar_t; ++ ++typedef enum { CV_DEFAULT=0, CV_DRIVER } kcv_type_t; ++ ++extern void __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg); ++extern void __cv_destroy(kcondvar_t *cvp); ++extern void __cv_wait(kcondvar_t *cvp, kmutex_t *mp); ++extern void __cv_wait_interruptible(kcondvar_t *cvp, kmutex_t *mp); ++extern clock_t __cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time); ++extern clock_t __cv_timedwait_interruptible(kcondvar_t *cvp, kmutex_t *mp, ++ clock_t exp_time); ++extern void __cv_signal(kcondvar_t *cvp); ++extern void __cv_broadcast(kcondvar_t *cvp); ++ ++#define cv_init(cvp, name, type, arg) __cv_init(cvp, name, type, arg) ++#define cv_destroy(cvp) __cv_destroy(cvp) ++#define cv_wait(cvp, mp) __cv_wait(cvp, mp) ++#define cv_wait_interruptible(cvp, mp) __cv_wait_interruptible(cvp,mp) ++#define cv_timedwait(cvp, mp, t) __cv_timedwait(cvp, mp, t) ++#define cv_timedwait_interruptible(cvp, mp, t) \ ++ __cv_timedwait_interruptible(cvp, mp, t) ++#define cv_signal(cvp) __cv_signal(cvp) ++#define cv_broadcast(cvp) __cv_broadcast(cvp) ++ ++#endif /* _SPL_CONDVAR_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/conf.h linux-3.2.33-go/include/spl/sys/conf.h +--- linux-3.2.33-go.orig/include/spl/sys/conf.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/conf.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_CONF_H ++#define _SPL_CONF_H ++ ++#endif /* SPL_CONF_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/console.h linux-3.2.33-go/include/spl/sys/console.h +--- linux-3.2.33-go.orig/include/spl/sys/console.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/console.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,44 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . 
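The usual wait/signal pattern for the kcondvar_t API declared in condvar.h above, as a sketch rather than anything the patch itself contains. kmutex_t, mutex_enter() and mutex_exit() come from the SPL mutex header, which is outside this excerpt, so treat those names as assumptions; cv_init()/cv_destroy() setup is omitted for brevity.

static kmutex_t example_lock;
static kcondvar_t example_cv;
static int example_ready;

static void example_wait(void)
{
	mutex_enter(&example_lock);
	while (!example_ready)	/* re-check the condition after every wakeup */
		cv_wait(&example_cv, &example_lock);
	mutex_exit(&example_lock);
}

static void example_post(void)
{
	mutex_enter(&example_lock);
	example_ready = 1;
	cv_signal(&example_cv);
	mutex_exit(&example_lock);
}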
++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_CONSOLE_H ++#define _SPL_CONSOLE_H ++ ++void ++console_vprintf(const char *fmt, va_list args) ++{ ++ vprintk(fmt, args); ++} ++ ++void ++console_printf(const char *fmt, ...) ++{ ++ va_list args; ++ ++ va_start(args, fmt); ++ console_vprintf(fmt, args); ++ va_end(args); ++} ++ ++#endif /* _SPL_CONSOLE_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/cpupart.h linux-3.2.33-go/include/spl/sys/cpupart.h +--- linux-3.2.33-go.orig/include/spl/sys/cpupart.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/cpupart.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_CPUPART_H ++#define _SPL_CPUPART_H ++ ++#endif /* SPL_CPUPART_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/cpuvar.h linux-3.2.33-go/include/spl/sys/cpuvar.h +--- linux-3.2.33-go.orig/include/spl/sys/cpuvar.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/cpuvar.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_CPUVAR_H ++#define _SPL_CPUVAR_H ++ ++#endif /* SPL_CPUVAR_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/crc32.h linux-3.2.33-go/include/spl/sys/crc32.h +--- linux-3.2.33-go.orig/include/spl/sys/crc32.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/crc32.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_CRC32_H ++#define _SPL_CRC32_H ++ ++#endif /* SPL_CRC32_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/cred.h linux-3.2.33-go/include/spl/sys/cred.h +--- linux-3.2.33-go.orig/include/spl/sys/cred.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/cred.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,62 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. 
++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_CRED_H ++#define _SPL_CRED_H ++ ++#include ++#include ++#include ++ ++#ifdef HAVE_CRED_STRUCT ++ ++typedef struct cred cred_t; ++ ++#define kcred ((cred_t *)(init_task.cred)) ++#define CRED() ((cred_t *)current_cred()) ++ ++#else ++ ++typedef struct task_struct cred_t; ++ ++#define kcred ((cred_t *)&init_task) ++#define CRED() ((cred_t *)current) ++ ++#endif /* HAVE_CRED_STRUCT */ ++ ++extern void crhold(cred_t *cr); ++extern void crfree(cred_t *cr); ++extern uid_t crgetuid(const cred_t *cr); ++extern uid_t crgetruid(const cred_t *cr); ++extern uid_t crgetsuid(const cred_t *cr); ++extern uid_t crgetfsuid(const cred_t *cr); ++extern gid_t crgetgid(const cred_t *cr); ++extern gid_t crgetrgid(const cred_t *cr); ++extern gid_t crgetsgid(const cred_t *cr); ++extern gid_t crgetfsgid(const cred_t *cr); ++extern int crgetngroups(const cred_t *cr); ++extern gid_t * crgetgroups(const cred_t *cr); ++extern int groupmember(gid_t gid, const cred_t *cr); ++ ++#endif /* _SPL_CRED_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/ctype.h linux-3.2.33-go/include/spl/sys/ctype.h +--- linux-3.2.33-go.orig/include/spl/sys/ctype.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/ctype.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,30 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_CTYPE_H ++#define _SPL_CTYPE_H ++ ++#include ++ ++#endif /* SPL_CTYPE_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/ddi.h linux-3.2.33-go/include/spl/sys/ddi.h +--- linux-3.2.33-go.orig/include/spl/sys/ddi.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/ddi.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
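Illustrative sketch, not part of the diff: how a caller uses the credential shim declared in sys/cred.h above. CRED() yields the calling task's credentials whether or not the kernel has a struct cred, and the crget*()/groupmember() accessors read them; the permission-check helper itself is hypothetical.

static int
ex_may_modify(const cred_t *cr, uid_t owner_uid, gid_t owner_gid)
{
	/* the owner, root, or a member of the owning group may modify */
	if (crgetuid(cr) == 0 || crgetuid(cr) == owner_uid)
		return (1);

	return (groupmember(owner_gid, cr));
}

In practice cr would be CRED() for the current task, or kcred for kernel-internal operations.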
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_DDI_H ++#define _SPL_DDI_H ++ ++#endif /* SPL_DDI_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/debug.h linux-3.2.33-go/include/spl/sys/debug.h +--- linux-3.2.33-go.orig/include/spl/sys/debug.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/debug.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,142 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++/* ++ * Available Solaris debug functions. All of the ASSERT() macros will be ++ * compiled out when NDEBUG is defined, this is the default behavior for ++ * the SPL. To enable assertions use the --enable-debug with configure. ++ * The VERIFY() functions are never compiled out and cannot be disabled. ++ * ++ * PANIC() - Panic the node and print message. ++ * ASSERT() - Assert X is true, if not panic. ++ * ASSERTF() - Assert X is true, if not panic and print message. ++ * ASSERTV() - Wraps a variable declaration which is only used by ASSERT(). ++ * ASSERT3S() - Assert signed X OP Y is true, if not panic. ++ * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. ++ * ASSERT3P() - Assert pointer X OP Y is true, if not panic. ++ * VERIFY() - Verify X is true, if not panic. ++ * VERIFY3S() - Verify signed X OP Y is true, if not panic. ++ * VERIFY3U() - Verify unsigned X OP Y is true, if not panic. ++ * VERIFY3P() - Verify pointer X OP Y is true, if not panic. ++ */ ++ ++#ifndef _SPL_DEBUG_H ++#define _SPL_DEBUG_H ++ ++#include ++ ++#ifdef NDEBUG /* Debugging Disabled */ ++ ++/* Define SPL_DEBUG_STR to make clear which ASSERT definitions are used */ ++#define SPL_DEBUG_STR "" ++ ++#define PANIC(fmt, a...) 
\ ++do { \ ++ printk(KERN_EMERG fmt, ## a); \ ++ spl_debug_bug(__FILE__, __FUNCTION__, __LINE__, 0); \ ++} while (0) ++ ++#define __ASSERT(x) ((void)0) ++#define ASSERT(x) ((void)0) ++#define ASSERTF(x, y, z...) ((void)0) ++#define ASSERTV(x) ++#define VERIFY(cond) \ ++do { \ ++ if (unlikely(!(cond))) \ ++ PANIC("VERIFY(" #cond ") failed\n"); \ ++} while (0) ++ ++#define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE, FMT, CAST) \ ++do { \ ++ if (!((TYPE)(LEFT) OP (TYPE)(RIGHT))) \ ++ PANIC("VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ ++ "failed (" FMT " " #OP " " FMT ")\n", \ ++ CAST (LEFT), CAST (RIGHT)); \ ++} while (0) ++ ++#define VERIFY3S(x,y,z) VERIFY3_IMPL(x, y, z, int64_t, "%lld", (long long)) ++#define VERIFY3U(x,y,z) VERIFY3_IMPL(x, y, z, uint64_t, "%llu", \ ++ (unsigned long long)) ++#define VERIFY3P(x,y,z) VERIFY3_IMPL(x, y, z, uintptr_t, "%p", (void *)) ++ ++#define ASSERT3S(x,y,z) ((void)0) ++#define ASSERT3U(x,y,z) ((void)0) ++#define ASSERT3P(x,y,z) ((void)0) ++ ++#else /* Debugging Enabled */ ++ ++/* Define SPL_DEBUG_STR to make clear which ASSERT definitions are used */ ++#define SPL_DEBUG_STR " (DEBUG mode)" ++ ++#define PANIC(fmt, a...) \ ++do { \ ++ spl_debug_msg(NULL, 0, 0, \ ++ __FILE__, __FUNCTION__, __LINE__, fmt, ## a); \ ++ spl_debug_bug(__FILE__, __FUNCTION__, __LINE__, 0); \ ++} while (0) ++ ++/* ASSERTION that is safe to use within the debug system */ ++#define __ASSERT(cond) \ ++do { \ ++ if (unlikely(!(cond))) { \ ++ printk(KERN_EMERG "ASSERTION(" #cond ") failed\n"); \ ++ BUG(); \ ++ } \ ++} while (0) ++ ++/* ASSERTION that will debug log used outside the debug sysytem */ ++#define ASSERT(cond) \ ++do { \ ++ if (unlikely(!(cond))) \ ++ PANIC("ASSERTION(" #cond ") failed\n"); \ ++} while (0) ++ ++#define ASSERTF(cond, fmt, a...) \ ++do { \ ++ if (unlikely(!(cond))) \ ++ PANIC("ASSERTION(" #cond ") failed: " fmt, ## a); \ ++} while (0) ++ ++#define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE, FMT, CAST) \ ++do { \ ++ if (!((TYPE)(LEFT) OP (TYPE)(RIGHT))) \ ++ PANIC("VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ ++ "failed (" FMT " " #OP " " FMT ")\n", \ ++ CAST (LEFT), CAST (RIGHT)); \ ++} while (0) ++ ++#define VERIFY3S(x,y,z) VERIFY3_IMPL(x, y, z, int64_t, "%lld", (long long)) ++#define VERIFY3U(x,y,z) VERIFY3_IMPL(x, y, z, uint64_t, "%llu", \ ++ (unsigned long long)) ++#define VERIFY3P(x,y,z) VERIFY3_IMPL(x, y, z, uintptr_t, "%p", (void *)) ++ ++#define ASSERT3S(x,y,z) VERIFY3S(x, y, z) ++#define ASSERT3U(x,y,z) VERIFY3U(x, y, z) ++#define ASSERT3P(x,y,z) VERIFY3P(x, y, z) ++ ++#define ASSERTV(x) x ++#define VERIFY(x) ASSERT(x) ++ ++#endif /* NDEBUG */ ++#endif /* SPL_DEBUG_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/dirent.h linux-3.2.33-go/include/spl/sys/dirent.h +--- linux-3.2.33-go.orig/include/spl/sys/dirent.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/dirent.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
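Illustrative sketch, not part of the diff: typical use of the sys/debug.h macros defined above. The ASSERT*() family compiles away when NDEBUG is set (the SPL default), so assertions must not carry side effects and any variable referenced only by assertions is wrapped in ASSERTV(); the VERIFY*() family is never compiled out.

static void
ex_consume(char *buf, size_t len, size_t cap)
{
	ASSERTV(size_t used = len);	/* only referenced from ASSERTs below */

	ASSERT(buf != NULL);		/* disappears in non-debug builds */
	ASSERT3U(used, <=, cap);	/* typed compare, reports both operands */

	VERIFY3P(buf, !=, NULL);	/* checked even in non-debug builds */
}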
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_DIRENT_H ++#define _SPL_DIRENT_H ++ ++#endif /* SPL_DIRENT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/disp.h linux-3.2.33-go/include/spl/sys/disp.h +--- linux-3.2.33-go.orig/include/spl/sys/disp.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/disp.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,33 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_DISP_H ++#define _SPL_DISP_H ++ ++#include ++ ++#define kpreempt_disable() preempt_disable() ++#define kpreempt_enable() preempt_enable() ++ ++#endif /* SPL_DISP_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/dkio.h linux-3.2.33-go/include/spl/sys/dkio.h +--- linux-3.2.33-go.orig/include/spl/sys/dkio.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/dkio.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,38 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. 
++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_DKIO_H ++#define _SPL_DKIO_H ++ ++struct dk_callback { ++ void (*dkc_callback)(void *dkc_cookie, int error); ++ void *dkc_cookie; ++ int dkc_flag; ++}; ++ ++#define DKIOC (0x04 << 8) ++#define DKIOCFLUSHWRITECACHE (DKIOC | 34) ++#define DKIOCTRIM (DKIOC | 35) ++ ++#endif /* _SPL_DKIO_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/dklabel.h linux-3.2.33-go/include/spl/sys/dklabel.h +--- linux-3.2.33-go.orig/include/spl/sys/dklabel.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/dklabel.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_DKLABEL_H ++#define _SPL_DKLABEL_H ++ ++#endif /* _SPL_DKLABEL_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/dnlc.h linux-3.2.33-go/include/spl/sys/dnlc.h +--- linux-3.2.33-go.orig/include/spl/sys/dnlc.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/dnlc.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,46 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. 
++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_DNLC_H ++#define _SPL_DNLC_H ++ ++/* ++ * Reduce the dcache and icache then reap the free'd slabs. Note the ++ * interface takes a reclaim percentage but we don't have easy access to ++ * the total number of entries to calculate the reclaim count. However, ++ * in practice this doesn't need to be even close to correct. We simply ++ * need to reclaim some useful fraction of the cache. The caller can ++ * determine if more needs to be done. ++ */ ++static inline void ++dnlc_reduce_cache(void *reduce_percent) ++{ ++ int nr = (uintptr_t)reduce_percent * 10000; ++ ++ shrink_dcache_memory(nr, GFP_KERNEL); ++ shrink_icache_memory(nr, GFP_KERNEL); ++ kmem_reap(); ++} ++ ++#endif /* SPL_DNLC_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/dumphdr.h linux-3.2.33-go/include/spl/sys/dumphdr.h +--- linux-3.2.33-go.orig/include/spl/sys/dumphdr.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/dumphdr.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_DUMPHDR_H ++#define _SPL_DUMPHDR_H ++ ++#endif /* SPL_DUMPHDR_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/efi_partition.h linux-3.2.33-go/include/spl/sys/efi_partition.h +--- linux-3.2.33-go.orig/include/spl/sys/efi_partition.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/efi_partition.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. 
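Illustrative sketch, not part of the diff: the reclaim percentage is passed through the opaque pointer argument of dnlc_reduce_cache() above rather than being dereferenced, so a caller asking for roughly 10% of the dentry/inode caches to be trimmed looks like this (the wrapper function is hypothetical).

static void
ex_trim_name_caches(void)
{
	/* argument is interpreted as a percentage, not dereferenced */
	dnlc_reduce_cache((void *)(uintptr_t)10);
}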
++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_EFI_PARTITION_H ++#define _SPL_EFI_PARTITION_H ++ ++#endif /* SPL_EFI_PARTITION_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/errno.h linux-3.2.33-go/include/spl/sys/errno.h +--- linux-3.2.33-go.orig/include/spl/sys/errno.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/errno.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_ERRNO_H ++#define _SPL_ERRNO_H ++ ++#endif /* SPL_ERRNO_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/extdirent.h linux-3.2.33-go/include/spl/sys/extdirent.h +--- linux-3.2.33-go.orig/include/spl/sys/extdirent.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/extdirent.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,29 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2010 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_EXTDIRENT_H ++#define _SPL_EXTDIRENT_H ++ ++#define ED_CASE_CONFLICT 0x10 ++ ++#endif /* _SPL_EXTDIRENT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/fcntl.h linux-3.2.33-go/include/spl/sys/fcntl.h +--- linux-3.2.33-go.orig/include/spl/sys/fcntl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/fcntl.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,37 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2010 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_FCNTL_H ++#define _SPL_FCNTL_H ++ ++#include ++ ++#define F_FREESP 11 ++ ++#ifdef CONFIG_64BIT ++typedef struct flock flock64_t; ++#else ++typedef struct flock64 flock64_t; ++#endif /* CONFIG_64BIT */ ++ ++#endif /* _SPL_FCNTL_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/file.h linux-3.2.33-go/include/spl/sys/file.h +--- linux-3.2.33-go.orig/include/spl/sys/file.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/file.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,31 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
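Illustrative sketch, not part of the diff: F_FREESP in sys/fcntl.h above is the Solaris space-management command ("free this byte range") and flock64_t is the descriptor it travels in; ZFS uses this shape for hole punching. The helper only shows how the descriptor is filled; the field names come from the kernel's own struct flock and the function itself is hypothetical.

static void
ex_describe_free_range(flock64_t *bf, loff_t off, loff_t len)
{
	memset(bf, 0, sizeof (*bf));
	bf->l_whence = 0;	/* offsets are absolute (SEEK_SET) */
	bf->l_start = off;	/* first byte to free */
	bf->l_len = len;	/* 0 would mean "to end of file" */
}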
++\*****************************************************************************/ ++ ++#ifndef _SPL_FILE_H ++#define _SPL_FILE_H ++ ++#define FIGNORECASE 0x00080000 ++#define FKIOCTL 0x80000000 ++ ++#endif /* SPL_FILE_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/fm/protocol.h linux-3.2.33-go/include/spl/sys/fm/protocol.h +--- linux-3.2.33-go.orig/include/spl/sys/fm/protocol.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/fm/protocol.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_FM_PROTOCOL_H ++#define _SPL_FM_PROTOCOL_H ++ ++#endif /* _SPL_FM_PROTOCOL_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/fm/util.h linux-3.2.33-go/include/spl/sys/fm/util.h +--- linux-3.2.33-go.orig/include/spl/sys/fm/util.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/fm/util.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_FM_UTIL_H ++#define _SPL_FM_UTIL_H ++ ++#endif /* _SPL_FM_UTIL_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/fs/swapnode.h linux-3.2.33-go/include/spl/sys/fs/swapnode.h +--- linux-3.2.33-go.orig/include/spl/sys/fs/swapnode.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/fs/swapnode.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SWAPNODE_H ++#define _SPL_SWAPNODE_H ++ ++#endif /* SPL_SWAPNODE_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/idmap.h linux-3.2.33-go/include/spl/sys/idmap.h +--- linux-3.2.33-go.orig/include/spl/sys/idmap.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/idmap.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,29 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2010 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_IDMAP_H ++#define _SPL_IDMAP_H ++ ++#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U ++ ++#endif /* SPL_IDMAP_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/int_limits.h linux-3.2.33-go/include/spl/sys/int_limits.h +--- linux-3.2.33-go.orig/include/spl/sys/int_limits.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/int_limits.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_INT_LIMITS_H ++#define _SPL_INT_LIMITS_H ++ ++#endif /* SPL_INT_LIMITS_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/inttypes.h linux-3.2.33-go/include/spl/sys/inttypes.h +--- linux-3.2.33-go.orig/include/spl/sys/inttypes.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/inttypes.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_INTTYPES_H ++#define _SPL_INTTYPES_H ++ ++#endif /* SPL_INTTYPES_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/int_types.h linux-3.2.33-go/include/spl/sys/int_types.h +--- linux-3.2.33-go.orig/include/spl/sys/int_types.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/int_types.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,30 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_INT_TYPES_H ++#define _SPL_INT_TYPES_H ++ ++#include ++ ++#endif /* SPL_INT_TYPES_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/isa_defs.h linux-3.2.33-go/include/spl/sys/isa_defs.h +--- linux-3.2.33-go.orig/include/spl/sys/isa_defs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/isa_defs.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,120 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_ISA_DEFS_H ++#define _SPL_ISA_DEFS_H ++ ++/* x86_64 arch specific defines */ ++#if defined(__x86_64) || defined(__x86_64__) ++ ++#if !defined(__x86_64) ++#define __x86_64 ++#endif ++ ++#if !defined(__amd64) ++#define __amd64 ++#endif ++ ++#if !defined(__x86) ++#define __x86 ++#endif ++ ++#if !defined(_LP64) ++#define _LP64 ++#endif ++ ++/* i386 arch specific defines */ ++#elif defined(__i386) || defined(__i386__) ++ ++#if !defined(__i386) ++#define __i386 ++#endif ++ ++#if !defined(__x86) ++#define __x86 ++#endif ++ ++#if !defined(_ILP32) ++#define _ILP32 ++#endif ++ ++/* powerpc (ppc64) arch specific defines */ ++#elif defined(__powerpc) || defined(__powerpc__) ++ ++#if !defined(__powerpc) ++#define __powerpc ++#endif ++ ++#if !defined(__powerpc__) ++#define __powerpc__ ++#endif ++ ++#if !defined(_LP64) ++#define _LP64 ++#endif ++ ++/* arm arch specific defines */ ++#elif defined(__arm) || defined(__arm__) ++ ++#if !defined(__arm) ++#define __arm ++#endif ++ ++#if !defined(__arm__) ++#define __arm__ ++#endif ++ ++#if defined(__ARMEL__) ++#define _LITTLE_ENDIAN ++#else ++#define _BIG_ENDIAN ++#endif ++ ++#else /* Currently only x86_64, i386, arm, and powerpc arches supported */ ++#error "Unsupported ISA type" ++#endif ++ ++#if defined(_ILP32) && defined(_LP64) ++#error "Both _ILP32 and _LP64 are defined" ++#endif ++ ++#include ++ ++#if defined(__LITTLE_ENDIAN) && !defined(_LITTLE_ENDIAN) ++#define _LITTLE_ENDIAN __LITTLE_ENDIAN ++#endif ++ ++#if defined(__BIG_ENDIAN) && !defined(_BIG_ENDIAN) ++#define _BIG_ENDIAN __BIG_ENDIAN ++#endif ++ ++#if defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) ++#error "Both _LITTLE_ENDIAN and _BIG_ENDIAN are defined" ++#endif ++ ++#if !defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN) ++#error "Neither _LITTLE_ENDIAN or _BIG_ENDIAN are defined" ++#endif ++ ++#endif /* _SPL_ISA_DEFS_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/kidmap.h linux-3.2.33-go/include/spl/sys/kidmap.h +--- linux-3.2.33-go.orig/include/spl/sys/kidmap.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/kidmap.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,30 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
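Illustrative sketch, not part of the diff: Solaris-derived code elsewhere in the stack keys off the macros sys/isa_defs.h establishes above (_LP64/_ILP32, _LITTLE_ENDIAN/_BIG_ENDIAN) rather than their Linux counterparts, for example to pick a data-model or byte-order specific path; the type and macro names here are hypothetical.

#if defined(_LP64)
typedef unsigned long ex_word_t;	/* 64-bit data model */
#else
typedef unsigned int ex_word_t;		/* 32-bit (_ILP32) data model */
#endif

#if defined(_BIG_ENDIAN)
#define	EX_HOST_BIG_ENDIAN	1
#else
#define	EX_HOST_BIG_ENDIAN	0
#endif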
++\*****************************************************************************/ ++ ++#ifndef _SPL_KIDMAP_H ++#define _SPL_KIDMAP_H ++ ++#include ++ ++#endif /* SPL_KIDMAP_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/kmem.h linux-3.2.33-go/include/spl/sys/kmem.h +--- linux-3.2.33-go.orig/include/spl/sys/kmem.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/kmem.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,512 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_KMEM_H ++#define _SPL_KMEM_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Memory allocation interfaces ++ */ ++#define KM_SLEEP GFP_KERNEL /* Can sleep, never fails */ ++#define KM_NOSLEEP GFP_ATOMIC /* Can not sleep, may fail */ ++#define KM_PUSHPAGE (GFP_NOIO | __GFP_HIGH) /* Use reserved memory */ ++#define KM_NODEBUG __GFP_NOWARN /* Suppress warnings */ ++#define KM_FLAGS __GFP_BITS_MASK ++#define KM_VMFLAGS GFP_LEVEL_MASK ++ ++/* ++ * Used internally, the kernel does not need to support this flag ++ */ ++#ifndef __GFP_ZERO ++# define __GFP_ZERO 0x8000 ++#endif ++ ++/* ++ * PF_NOFS is a per-process debug flag which is set in current->flags to ++ * detect when a process is performing an unsafe allocation. All tasks ++ * with PF_NOFS set must strictly use KM_PUSHPAGE for allocations because ++ * if they enter direct reclaim and initiate I/O the may deadlock. ++ * ++ * When debugging is disabled, any incorrect usage will be detected and ++ * a call stack with warning will be printed to the console. The flags ++ * will then be automatically corrected to allow for safe execution. If ++ * debugging is enabled this will be treated as a fatal condition. ++ * ++ * To avoid any risk of conflicting with the existing PF_ flags. The ++ * PF_NOFS bit shadows the rarely used PF_MUTEX_TESTER bit. Only when ++ * CONFIG_RT_MUTEX_TESTER is not set, and we know this bit is unused, ++ * will the PF_NOFS bit be valid. Happily, most existing distributions ++ * ship a kernel with CONFIG_RT_MUTEX_TESTER disabled. 
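Illustrative sketch, not part of the diff: what the PF_NOFS rule above means for callers. Code reachable from the I/O or transaction-sync path allocates with KM_PUSHPAGE so reclaim can never re-enter the filesystem and deadlock, while ordinary contexts use KM_SLEEP, which the nofail wrappers below retry until they succeed. kmem_alloc()/kmem_free() are the interfaces this header defines further down; the surrounding helpers are hypothetical.

/* ordinary process context: sleeping and full direct reclaim are fine */
static void *
ex_alloc_ctl(size_t size)
{
	return (kmem_alloc(size, KM_SLEEP));	/* retried, never returns NULL */
}

/* beneath the write/txg-sync path, i.e. under the PF_NOFS restriction */
static void *
ex_alloc_io(size_t size)
{
	/* GFP_NOIO | __GFP_HIGH: reclaim without issuing new filesystem I/O */
	return (kmem_alloc(size, KM_PUSHPAGE));
}

static void
ex_free(void *buf, size_t size)
{
	kmem_free(buf, size);	/* unlike kfree(), the original size is passed back */
}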
++ */ ++#if !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER) ++# define PF_NOFS PF_MUTEX_TESTER ++ ++static inline void ++sanitize_flags(struct task_struct *p, gfp_t *flags) ++{ ++ if (unlikely((p->flags & PF_NOFS) && (*flags & (__GFP_IO|__GFP_FS)))) { ++# ifdef NDEBUG ++ SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "Fixing allocation for " ++ "task %s (%d) which used GFP flags 0x%x with PF_NOFS set\n", ++ p->comm, p->pid, flags); ++ spl_debug_dumpstack(p); ++ *flags &= ~(__GFP_IO|__GFP_FS); ++# else ++ PANIC("FATAL allocation for task %s (%d) which used GFP " ++ "flags 0x%x with PF_NOFS set\n", p->comm, p->pid, flags); ++# endif /* NDEBUG */ ++ } ++} ++#else ++# define PF_NOFS 0x00000000 ++# define sanitize_flags(p, fl) ((void)0) ++#endif /* !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER) */ ++ ++/* ++ * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as ++ * early as 2.6.32. To avoid this issue when it occurs in upstream kernels ++ * we retry the allocation here as long as it is not __GFP_WAIT (GFP_ATOMIC). ++ * I would prefer the caller handle the failure case cleanly but we are ++ * trying to emulate Solaris and those are not the Solaris semantics. ++ */ ++static inline void * ++kmalloc_nofail(size_t size, gfp_t flags) ++{ ++ void *ptr; ++ ++ sanitize_flags(current, &flags); ++ ++ do { ++ ptr = kmalloc(size, flags); ++ } while (ptr == NULL && (flags & __GFP_WAIT)); ++ ++ return ptr; ++} ++ ++static inline void * ++kzalloc_nofail(size_t size, gfp_t flags) ++{ ++ void *ptr; ++ ++ sanitize_flags(current, &flags); ++ ++ do { ++ ptr = kzalloc(size, flags); ++ } while (ptr == NULL && (flags & __GFP_WAIT)); ++ ++ return ptr; ++} ++ ++static inline void * ++kmalloc_node_nofail(size_t size, gfp_t flags, int node) ++{ ++#ifdef HAVE_KMALLOC_NODE ++ void *ptr; ++ ++ sanitize_flags(current, &flags); ++ ++ do { ++ ptr = kmalloc_node(size, flags, node); ++ } while (ptr == NULL && (flags & __GFP_WAIT)); ++ ++ return ptr; ++#else ++ return kmalloc_nofail(size, flags); ++#endif /* HAVE_KMALLOC_NODE */ ++} ++ ++static inline void * ++vmalloc_nofail(size_t size, gfp_t flags) ++{ ++ void *ptr; ++ ++ sanitize_flags(current, &flags); ++ ++ /* ++ * Retry failed __vmalloc() allocations once every second. The ++ * rational for the delay is that the likely failure modes are: ++ * ++ * 1) The system has completely exhausted memory, in which case ++ * delaying 1 second for the memory reclaim to run is reasonable ++ * to avoid thrashing the system. ++ * 2) The system has memory but has exhausted the small virtual ++ * address space available on 32-bit systems. Retrying the ++ * allocation immediately will only result in spinning on the ++ * virtual address space lock. It is better delay a second and ++ * hope that another process will free some of the address space. ++ * But the bottom line is there is not much we can actually do ++ * since we can never safely return a failure and honor the ++ * Solaris semantics. ++ */ ++ while (1) { ++ ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); ++ if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(HZ); ++ } else { ++ break; ++ } ++ } ++ ++ return ptr; ++} ++ ++static inline void * ++vzalloc_nofail(size_t size, gfp_t flags) ++{ ++ void *ptr; ++ ++ ptr = vmalloc_nofail(size, flags); ++ if (ptr) ++ memset(ptr, 0, (size)); ++ ++ return ptr; ++} ++ ++#ifdef DEBUG_KMEM ++ ++/* ++ * Memory accounting functions to be used only when DEBUG_KMEM is set. 
++ */ ++# ifdef HAVE_ATOMIC64_T ++ ++# define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) ++# define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) ++# define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used) ++# define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size) ++# define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used) ++# define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used) ++# define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used) ++# define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size) ++ ++extern atomic64_t kmem_alloc_used; ++extern unsigned long long kmem_alloc_max; ++extern atomic64_t vmem_alloc_used; ++extern unsigned long long vmem_alloc_max; ++ ++# else /* HAVE_ATOMIC64_T */ ++ ++# define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used) ++# define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used) ++# define kmem_alloc_used_read() atomic_read(&kmem_alloc_used) ++# define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size) ++# define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used) ++# define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used) ++# define vmem_alloc_used_read() atomic_read(&vmem_alloc_used) ++# define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size) ++ ++extern atomic_t kmem_alloc_used; ++extern unsigned long long kmem_alloc_max; ++extern atomic_t vmem_alloc_used; ++extern unsigned long long vmem_alloc_max; ++ ++# endif /* HAVE_ATOMIC64_T */ ++ ++# ifdef DEBUG_KMEM_TRACKING ++/* ++ * DEBUG_KMEM && DEBUG_KMEM_TRACKING ++ * ++ * The maximum level of memory debugging. All memory will be accounted ++ * for and each allocation will be explicitly tracked. Any allocation ++ * which is leaked will be reported on module unload and the exact location ++ * where that memory was allocation will be reported. This level of memory ++ * tracking will have a significant impact on performance and should only ++ * be enabled for debugging. This feature may be enabled by passing ++ * --enable-debug-kmem-tracking to configure. ++ */ ++# define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \ ++ __FUNCTION__, __LINE__, 0, 0) ++# define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\ ++ __FUNCTION__, __LINE__, 0, 0) ++# define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \ ++ __FUNCTION__, __LINE__, 1, nd) ++# define kmem_free(ptr, sz) kmem_free_track((ptr), (sz)) ++ ++# define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \ ++ __FUNCTION__, __LINE__) ++# define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\ ++ __FUNCTION__, __LINE__) ++# define vmem_free(ptr, sz) vmem_free_track((ptr), (sz)) ++ ++extern void *kmem_alloc_track(size_t, int, const char *, int, int, int); ++extern void kmem_free_track(const void *, size_t); ++extern void *vmem_alloc_track(size_t, int, const char *, int); ++extern void vmem_free_track(const void *, size_t); ++ ++# else /* DEBUG_KMEM_TRACKING */ ++/* ++ * DEBUG_KMEM && !DEBUG_KMEM_TRACKING ++ * ++ * The default build will set DEBUG_KEM. This provides basic memory ++ * accounting with little to no impact on performance. When the module ++ * is unloaded in any memory was leaked the total number of leaked bytes ++ * will be reported on the console. To disable this basic accounting ++ * pass the --disable-debug-kmem option to configure. 
++ */ ++# define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \ ++ __FUNCTION__, __LINE__, 0, 0) ++# define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ ++ __FUNCTION__, __LINE__, 0, 0) ++# define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \ ++ __FUNCTION__, __LINE__, 1, nd) ++# define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz)) ++ ++# define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \ ++ __FUNCTION__, __LINE__) ++# define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ ++ __FUNCTION__, __LINE__) ++# define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz)) ++ ++extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int); ++extern void kmem_free_debug(const void *, size_t); ++extern void *vmem_alloc_debug(size_t, int, const char *, int); ++extern void vmem_free_debug(const void *, size_t); ++ ++# endif /* DEBUG_KMEM_TRACKING */ ++#else /* DEBUG_KMEM */ ++/* ++ * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING ++ * ++ * All debugging is disabled. There will be no overhead even for ++ * minimal memory accounting. To enable basic accounting pass the ++ * --enable-debug-kmem option to configure. ++ */ ++# define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl)) ++# define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl)) ++# define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd)) ++# define kmem_free(ptr, sz) ((void)(sz), kfree(ptr)) ++ ++# define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl)) ++# define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl)) ++# define vmem_free(ptr, sz) ((void)(sz), vfree(ptr)) ++ ++#endif /* DEBUG_KMEM */ ++ ++extern int kmem_debugging(void); ++extern char *kmem_vasprintf(const char *fmt, va_list ap); ++extern char *kmem_asprintf(const char *fmt, ...); ++extern char *strdup(const char *str); ++extern void strfree(char *str); ++ ++ ++/* ++ * Slab allocation interfaces. The SPL slab differs from the standard ++ * Linux SLAB or SLUB primarily in that each cache may be backed by slabs ++ * allocated from the physical or virtal memory address space. The virtual ++ * slabs allow for good behavior when allocation large objects of identical ++ * size. This slab implementation also supports both constructors and ++ * destructions which the Linux slab does not. 
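Whichever of the three configurations is selected, consumers see the same Solaris-style interface: kmem_* for kmalloc-backed allocations, vmem_* for vmalloc-backed ones, plus the string helpers declared above. A short usage sketch for illustration; KM_SLEEP is assumed to be the sleeping-allocation flag defined in the portion of this header not shown in this hunk:

        /* Illustration only: small buffers via kmem_*, large ones via vmem_*. */
        static void
        kmem_vmem_example(void)
        {
                char *small, *big, *copy;

                small = kmem_zalloc(64, KM_SLEEP);              /* kmalloc-backed */
                big = vmem_alloc(1024 * 1024, KM_SLEEP);        /* vmalloc-backed */
                copy = strdup("example");

                /* ... use the buffers ... */

                strfree(copy);
                vmem_free(big, 1024 * 1024);
                kmem_free(small, 64);                           /* size must match */
        }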
++ */ ++enum { ++ KMC_BIT_NOTOUCH = 0, /* Don't update ages */ ++ KMC_BIT_NODEBUG = 1, /* Default behavior */ ++ KMC_BIT_NOMAGAZINE = 2, /* XXX: Unsupported */ ++ KMC_BIT_NOHASH = 3, /* XXX: Unsupported */ ++ KMC_BIT_QCACHE = 4, /* XXX: Unsupported */ ++ KMC_BIT_KMEM = 5, /* Use kmem cache */ ++ KMC_BIT_VMEM = 6, /* Use vmem cache */ ++ KMC_BIT_OFFSLAB = 7, /* Objects not on slab */ ++ KMC_BIT_NOEMERGENCY = 8, /* Disable emergency objects */ ++ KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */ ++ KMC_BIT_GROWING = 15, /* Growing in progress */ ++ KMC_BIT_REAPING = 16, /* Reaping in progress */ ++ KMC_BIT_DESTROY = 17, /* Destroy in progress */ ++ KMC_BIT_TOTAL = 18, /* Proc handler helper bit */ ++ KMC_BIT_ALLOC = 19, /* Proc handler helper bit */ ++ KMC_BIT_MAX = 20, /* Proc handler helper bit */ ++}; ++ ++/* kmem move callback return values */ ++typedef enum kmem_cbrc { ++ KMEM_CBRC_YES = 0, /* Object moved */ ++ KMEM_CBRC_NO = 1, /* Object not moved */ ++ KMEM_CBRC_LATER = 2, /* Object not moved, try again later */ ++ KMEM_CBRC_DONT_NEED = 3, /* Neither object is needed */ ++ KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */ ++} kmem_cbrc_t; ++ ++#define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH) ++#define KMC_NODEBUG (1 << KMC_BIT_NODEBUG) ++#define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE) ++#define KMC_NOHASH (1 << KMC_BIT_NOHASH) ++#define KMC_QCACHE (1 << KMC_BIT_QCACHE) ++#define KMC_KMEM (1 << KMC_BIT_KMEM) ++#define KMC_VMEM (1 << KMC_BIT_VMEM) ++#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB) ++#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY) ++#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED) ++#define KMC_GROWING (1 << KMC_BIT_GROWING) ++#define KMC_REAPING (1 << KMC_BIT_REAPING) ++#define KMC_DESTROY (1 << KMC_BIT_DESTROY) ++#define KMC_TOTAL (1 << KMC_BIT_TOTAL) ++#define KMC_ALLOC (1 << KMC_BIT_ALLOC) ++#define KMC_MAX (1 << KMC_BIT_MAX) ++ ++#define KMC_REAP_CHUNK INT_MAX ++#define KMC_DEFAULT_SEEKS 1 ++ ++extern struct list_head spl_kmem_cache_list; ++extern struct rw_semaphore spl_kmem_cache_sem; ++ ++#define SKM_MAGIC 0x2e2e2e2e ++#define SKO_MAGIC 0x20202020 ++#define SKS_MAGIC 0x22222222 ++#define SKC_MAGIC 0x2c2c2c2c ++ ++#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */ ++#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */ ++#define SPL_KMEM_CACHE_OBJ_PER_SLAB 16 /* Target objects per slab */ ++#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 8 /* Minimum objects per slab */ ++#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */ ++ ++#define POINTER_IS_VALID(p) 0 /* Unimplemented */ ++#define POINTER_INVALIDATE(pp) /* Unimplemented */ ++ ++typedef int (*spl_kmem_ctor_t)(void *, void *, int); ++typedef void (*spl_kmem_dtor_t)(void *, void *); ++typedef void (*spl_kmem_reclaim_t)(void *); ++ ++typedef struct spl_kmem_magazine { ++ uint32_t skm_magic; /* Sanity magic */ ++ uint32_t skm_avail; /* Available objects */ ++ uint32_t skm_size; /* Magazine size */ ++ uint32_t skm_refill; /* Batch refill size */ ++ struct spl_kmem_cache *skm_cache; /* Owned by cache */ ++ struct delayed_work skm_work; /* Magazine reclaim work */ ++ unsigned long skm_age; /* Last cache access */ ++ unsigned int skm_cpu; /* Owned by cpu */ ++ void *skm_objs[0]; /* Object pointers */ ++} spl_kmem_magazine_t; ++ ++typedef struct spl_kmem_obj { ++ uint32_t sko_magic; /* Sanity magic */ ++ void *sko_addr; /* Buffer address */ ++ struct spl_kmem_slab *sko_slab; /* Owned by slab */ ++ struct list_head sko_list; /* Free object list linkage */ ++} spl_kmem_obj_t; ++ ++typedef struct spl_kmem_slab { 
++ uint32_t sks_magic; /* Sanity magic */ ++ uint32_t sks_objs; /* Objects per slab */ ++ struct spl_kmem_cache *sks_cache; /* Owned by cache */ ++ struct list_head sks_list; /* Slab list linkage */ ++ struct list_head sks_free_list; /* Free object list */ ++ unsigned long sks_age; /* Last modify jiffie */ ++ uint32_t sks_ref; /* Ref count used objects */ ++} spl_kmem_slab_t; ++ ++typedef struct spl_kmem_alloc { ++ struct spl_kmem_cache *ska_cache; /* Owned by cache */ ++ int ska_flags; /* Allocation flags */ ++ struct delayed_work ska_work; /* Allocation work */ ++} spl_kmem_alloc_t; ++ ++typedef struct spl_kmem_emergency { ++ struct rb_node ske_node; /* Emergency tree linkage */ ++ void *ske_obj; /* Buffer address */ ++} spl_kmem_emergency_t; ++ ++typedef struct spl_kmem_cache { ++ uint32_t skc_magic; /* Sanity magic */ ++ uint32_t skc_name_size; /* Name length */ ++ char *skc_name; /* Name string */ ++ spl_kmem_magazine_t *skc_mag[NR_CPUS]; /* Per-CPU warm cache */ ++ uint32_t skc_mag_size; /* Magazine size */ ++ uint32_t skc_mag_refill; /* Magazine refill count */ ++ spl_kmem_ctor_t skc_ctor; /* Constructor */ ++ spl_kmem_dtor_t skc_dtor; /* Destructor */ ++ spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */ ++ void *skc_private; /* Private data */ ++ void *skc_vmp; /* Unused */ ++ unsigned long skc_flags; /* Flags */ ++ uint32_t skc_obj_size; /* Object size */ ++ uint32_t skc_obj_align; /* Object alignment */ ++ uint32_t skc_slab_objs; /* Objects per slab */ ++ uint32_t skc_slab_size; /* Slab size */ ++ uint32_t skc_delay; /* Slab reclaim interval */ ++ uint32_t skc_reap; /* Slab reclaim count */ ++ atomic_t skc_ref; /* Ref count callers */ ++ struct delayed_work skc_work; /* Slab reclaim work */ ++ struct list_head skc_list; /* List of caches linkage */ ++ struct list_head skc_complete_list;/* Completely alloc'ed */ ++ struct list_head skc_partial_list; /* Partially alloc'ed */ ++ struct rb_root skc_emergency_tree; /* Min sized objects */ ++ spinlock_t skc_lock; /* Cache lock */ ++ wait_queue_head_t skc_waitq; /* Allocation waiters */ ++ uint64_t skc_slab_fail; /* Slab alloc failures */ ++ uint64_t skc_slab_create;/* Slab creates */ ++ uint64_t skc_slab_destroy;/* Slab destroys */ ++ uint64_t skc_slab_total; /* Slab total current */ ++ uint64_t skc_slab_alloc; /* Slab alloc current */ ++ uint64_t skc_slab_max; /* Slab max historic */ ++ uint64_t skc_obj_total; /* Obj total current */ ++ uint64_t skc_obj_alloc; /* Obj alloc current */ ++ uint64_t skc_obj_max; /* Obj max historic */ ++ uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */ ++ uint64_t skc_obj_emergency; /* Obj emergency current */ ++ uint64_t skc_obj_emergency_max; /* Obj emergency max */ ++} spl_kmem_cache_t; ++#define kmem_cache_t spl_kmem_cache_t ++ ++extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size, ++ size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, ++ spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags); ++extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, ++ kmem_cbrc_t (*)(void *, void *, size_t, void *)); ++extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); ++extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); ++extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); ++extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count); ++extern void spl_kmem_reap(void); ++ ++int spl_kmem_init_kallsyms_lookup(void); ++int spl_kmem_init(void); ++void spl_kmem_fini(void); ++ ++#define 
kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) \ ++ spl_kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) ++#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) ++#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) ++#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) ++#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) ++#define kmem_cache_reap_now(skc) \ ++ spl_kmem_cache_reap_now(skc, skc->skc_reap) ++#define kmem_reap() spl_kmem_reap() ++#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \ ++ ((ptr) < (void *)VMALLOC_END)) ++ ++#endif /* _SPL_KMEM_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/kobj.h linux-3.2.33-go/include/spl/sys/kobj.h +--- linux-3.2.33-go.orig/include/spl/sys/kobj.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/kobj.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,42 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_KOBJ_H ++#define _SPL_KOBJ_H ++ ++#include ++ ++typedef struct _buf { ++ vnode_t *vp; ++} _buf_t; ++ ++typedef struct _buf buf_t; ++ ++extern struct _buf *kobj_open_file(const char *name); ++extern void kobj_close_file(struct _buf *file); ++extern int kobj_read_file(struct _buf *file, char *buf, ++ ssize_t size, offset_t off); ++extern int kobj_get_filesize(struct _buf *file, uint64_t *size); ++ ++#endif /* SPL_KOBJ_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/kstat.h linux-3.2.33-go/include/spl/sys/kstat.h +--- linux-3.2.33-go.orig/include/spl/sys/kstat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/kstat.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,194 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. 
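For illustration, a typical consumer of the object-cache interface declared in sys/kmem.h above; foo_t, its constructor/destructor and the KM_SLEEP flag are assumed names, not part of the patch:

        typedef struct foo {
                int foo_value;
        } foo_t;

        /* Constructor/destructor run by the cache, matching spl_kmem_ctor_t
         * and spl_kmem_dtor_t above. */
        static int
        foo_ctor(void *obj, void *priv, int kmflags)
        {
                foo_t *fp = obj;

                fp->foo_value = 0;
                return (0);
        }

        static void
        foo_dtor(void *obj, void *priv)
        {
        }

        static kmem_cache_t *foo_cache;

        static void
        foo_cache_example(void)
        {
                foo_t *fp;

                foo_cache = kmem_cache_create("foo_cache", sizeof (foo_t), 0,
                    foo_ctor, foo_dtor, NULL, NULL, NULL, 0);

                fp = kmem_cache_alloc(foo_cache, KM_SLEEP);
                /* ... use fp ... */
                kmem_cache_free(foo_cache, fp);

                kmem_cache_destroy(foo_cache);
        }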
++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_KSTAT_H ++#define _SPL_KSTAT_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define KSTAT_STRLEN 31 ++ ++/* For reference valid classes are: ++ * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc ++ */ ++ ++#define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */ ++#define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */ ++#define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */ ++#define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */ ++#define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ ++#define KSTAT_TYPE_TXG 5 /* txg sync; ks_ndata >= 1 */ ++#define KSTAT_NUM_TYPES 6 ++ ++#define KSTAT_DATA_CHAR 0 ++#define KSTAT_DATA_INT32 1 ++#define KSTAT_DATA_UINT32 2 ++#define KSTAT_DATA_INT64 3 ++#define KSTAT_DATA_UINT64 4 ++#define KSTAT_DATA_LONG 5 ++#define KSTAT_DATA_ULONG 6 ++#define KSTAT_DATA_STRING 7 ++#define KSTAT_NUM_DATAS 8 ++ ++#define KSTAT_INTR_HARD 0 ++#define KSTAT_INTR_SOFT 1 ++#define KSTAT_INTR_WATCHDOG 2 ++#define KSTAT_INTR_SPURIOUS 3 ++#define KSTAT_INTR_MULTSVC 4 ++#define KSTAT_NUM_INTRS 5 ++ ++#define KSTAT_FLAG_VIRTUAL 0x01 ++#define KSTAT_FLAG_VAR_SIZE 0x02 ++#define KSTAT_FLAG_WRITABLE 0x04 ++#define KSTAT_FLAG_PERSISTENT 0x08 ++#define KSTAT_FLAG_DORMANT 0x10 ++#define KSTAT_FLAG_UNSUPPORTED (KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_WRITABLE | \ ++ KSTAT_FLAG_PERSISTENT | KSTAT_FLAG_DORMANT) ++ ++ ++#define KS_MAGIC 0x9d9d9d9d ++ ++/* Dynamic updates */ ++#define KSTAT_READ 0 ++#define KSTAT_WRITE 1 ++ ++struct kstat_s; ++ ++typedef int kid_t; /* unique kstat id */ ++typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */ ++ ++typedef struct kstat_s { ++ int ks_magic; /* magic value */ ++ kid_t ks_kid; /* unique kstat ID */ ++ hrtime_t ks_crtime; /* creation time */ ++ hrtime_t ks_snaptime; /* last access time */ ++ char ks_module[KSTAT_STRLEN+1]; /* provider module name */ ++ int ks_instance; /* provider module instance */ ++ char ks_name[KSTAT_STRLEN+1]; /* kstat name */ ++ char ks_class[KSTAT_STRLEN+1]; /* kstat class */ ++ uchar_t ks_type; /* kstat data type */ ++ uchar_t ks_flags; /* kstat flags */ ++ void *ks_data; /* kstat type-specific data */ ++ uint_t ks_ndata; /* # of type-specific data records */ ++ size_t ks_data_size; /* size of kstat data section */ ++ struct proc_dir_entry *ks_proc; /* proc linkage */ ++ kstat_update_t *ks_update; /* dynamic updates */ ++ void *ks_private; /* private data */ ++ kmutex_t ks_lock; /* kstat data lock */ ++ struct list_head ks_list; /* kstat linkage */ ++} kstat_t; ++ ++typedef struct kstat_named_s { ++ char name[KSTAT_STRLEN]; /* name of counter */ ++ uchar_t data_type; /* data type */ ++ union { ++ char c[16]; /* 128-bit int */ ++ int32_t i32; /* 32-bit signed int */ ++ uint32_t ui32; /* 32-bit unsigned int */ ++ int64_t i64; /* 64-bit signed int */ ++ uint64_t ui64; /* 64-bit unsigned int */ ++ long l; /* native signed long */ ++ ulong_t ul; /* native unsigned long */ ++ struct { ++ union { ++ char *ptr; /* NULL-term string */ ++ char __pad[8]; /* 64-bit padding */ ++ } addr; ++ 
uint32_t len; /* # bytes for strlen + '\0' */ ++ } string; ++ } value; ++} kstat_named_t; ++ ++#define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) ++#define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) ++ ++typedef struct kstat_intr { ++ uint_t intrs[KSTAT_NUM_INTRS]; ++} kstat_intr_t; ++ ++typedef struct kstat_io { ++ u_longlong_t nread; /* number of bytes read */ ++ u_longlong_t nwritten; /* number of bytes written */ ++ uint_t reads; /* number of read operations */ ++ uint_t writes; /* number of write operations */ ++ hrtime_t wtime; /* cumulative wait (pre-service) time */ ++ hrtime_t wlentime; /* cumulative wait length*time product*/ ++ hrtime_t wlastupdate; /* last time wait queue changed */ ++ hrtime_t rtime; /* cumulative run (service) time */ ++ hrtime_t rlentime; /* cumulative run length*time product */ ++ hrtime_t rlastupdate; /* last time run queue changed */ ++ uint_t wcnt; /* count of elements in wait state */ ++ uint_t rcnt; /* count of elements in run state */ ++} kstat_io_t; ++ ++typedef struct kstat_timer { ++ char name[KSTAT_STRLEN+1]; /* event name */ ++ u_longlong_t num_events; /* number of events */ ++ hrtime_t elapsed_time; /* cumulative elapsed time */ ++ hrtime_t min_time; /* shortest event duration */ ++ hrtime_t max_time; /* longest event duration */ ++ hrtime_t start_time; /* previous event start time */ ++ hrtime_t stop_time; /* previous event stop time */ ++} kstat_timer_t; ++ ++typedef enum kstat_txg_state { ++ TXG_STATE_OPEN = 1, ++ TXG_STATE_QUIESCING = 2, ++ TXG_STATE_SYNCING = 3, ++ TXG_STATE_COMMITTED = 4, ++} kstat_txg_state_t; ++ ++typedef struct kstat_txg { ++ u_longlong_t txg; /* txg id */ ++ kstat_txg_state_t state; /* txg state */ ++ hrtime_t birth; /* birth time stamp */ ++ u_longlong_t nread; /* number of bytes read */ ++ u_longlong_t nwritten; /* number of bytes written */ ++ uint_t reads; /* number of read operations */ ++ uint_t writes; /* number of write operations */ ++ hrtime_t open_time; /* open time */ ++ hrtime_t quiesce_time;/* quiesce time */ ++ hrtime_t sync_time; /* sync time */ ++} kstat_txg_t; ++ ++int spl_kstat_init(void); ++void spl_kstat_fini(void); ++ ++extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, ++ const char *ks_name, const char *ks_class, ++ uchar_t ks_type, uint_t ks_ndata, ++ uchar_t ks_flags); ++extern void __kstat_install(kstat_t *ksp); ++extern void __kstat_delete(kstat_t *ksp); ++ ++#define kstat_create(m,i,n,c,t,s,f) __kstat_create(m,i,n,c,t,s,f) ++#define kstat_install(k) __kstat_install(k) ++#define kstat_delete(k) __kstat_delete(k) ++ ++#endif /* _SPL_KSTAT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/list.h linux-3.2.33-go/include/spl/sys/list.h +--- linux-3.2.33-go.orig/include/spl/sys/list.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/list.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,219 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
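A conventional registration sketch for the named-kstat interface in sys/kstat.h above, for illustration only; the module and class strings are placeholders, and KSTAT_FLAG_VIRTUAL is used so the caller supplies its own ks_data array:

        static kstat_named_t example_kstats[] = {
                { "hits",   KSTAT_DATA_UINT64 },
                { "misses", KSTAT_DATA_UINT64 },
        };

        static kstat_t *example_ksp;

        static void
        example_kstat_init(void)
        {
                example_ksp = kstat_create("example", 0, "stats", "misc",
                    KSTAT_TYPE_NAMED,
                    sizeof (example_kstats) / sizeof (kstat_named_t),
                    KSTAT_FLAG_VIRTUAL);
                if (example_ksp != NULL) {
                        example_ksp->ks_data = example_kstats;
                        kstat_install(example_ksp);
                }
        }

        static void
        example_kstat_fini(void)
        {
                if (example_ksp != NULL)
                        kstat_delete(example_ksp);
        }

A counter is then bumped directly, e.g. example_kstats[0].value.ui64++, and is exported through procfs via the ks_proc entry above.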
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_LIST_H ++#define _SPL_LIST_H ++ ++#include ++#include ++ ++/* ++ * NOTE: I have implemented the Solaris list API in terms of the native ++ * linux API. This has certain advantages in terms of leveraging the linux ++ * list debugging infrastructure, but it also means that the internals of a ++ * list differ slightly than on Solaris. This is not a problem as long as ++ * all callers stick to the published API. The two major differences are: ++ * ++ * 1) A list_node_t is mapped to a linux list_head struct which changes ++ * the name of the list_next/list_prev pointers to next/prev respectively. ++ * ++ * 2) A list_node_t which is not attached to a list on Solaris is denoted ++ * by having its list_next/list_prev pointers set to NULL. Under linux ++ * the next/prev pointers are set to LIST_POISON1 and LIST_POISON2 ++ * respectively. At this moment this only impacts the implementation ++ * of the list_link_init() and list_link_active() functions. ++ */ ++ ++typedef struct list_head list_node_t; ++ ++typedef struct list { ++ size_t list_size; ++ size_t list_offset; ++ list_node_t list_head; ++} list_t; ++ ++#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) ++#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) ++ ++static inline int ++list_is_empty(list_t *list) ++{ ++ return list_empty(&list->list_head); ++} ++ ++static inline void ++list_link_init(list_node_t *node) ++{ ++ node->next = LIST_POISON1; ++ node->prev = LIST_POISON2; ++} ++ ++static inline void ++list_create(list_t *list, size_t size, size_t offset) ++{ ++ ASSERT(list); ++ ASSERT(size > 0); ++ ASSERT(size >= offset + sizeof(list_node_t)); ++ ++ list->list_size = size; ++ list->list_offset = offset; ++ INIT_LIST_HEAD(&list->list_head); ++} ++ ++static inline void ++list_destroy(list_t *list) ++{ ++ ASSERT(list); ++ ASSERT(list_is_empty(list)); ++ ++ list_del(&list->list_head); ++} ++ ++static inline void ++list_insert_head(list_t *list, void *object) ++{ ++ list_add(list_d2l(list, object), &list->list_head); ++} ++ ++static inline void ++list_insert_tail(list_t *list, void *object) ++{ ++ list_add_tail(list_d2l(list, object), &list->list_head); ++} ++ ++static inline void ++list_insert_after(list_t *list, void *object, void *nobject) ++{ ++ if (object == NULL) ++ list_insert_head(list, nobject); ++ else ++ list_add(list_d2l(list, nobject), list_d2l(list, object)); ++} ++ ++static inline void ++list_insert_before(list_t *list, void *object, void *nobject) ++{ ++ if (object == NULL) ++ list_insert_tail(list, nobject); ++ else ++ list_add_tail(list_d2l(list, nobject), list_d2l(list, object)); ++} ++ ++static inline void ++list_remove(list_t *list, void *object) ++{ ++ ASSERT(!list_is_empty(list)); ++ list_del(list_d2l(list, object)); ++} ++ ++static inline void * 
++list_remove_head(list_t *list) ++{ ++ list_node_t *head = list->list_head.next; ++ if (head == &list->list_head) ++ return NULL; ++ ++ list_del(head); ++ return list_object(list, head); ++} ++ ++static inline void * ++list_remove_tail(list_t *list) ++{ ++ list_node_t *tail = list->list_head.prev; ++ if (tail == &list->list_head) ++ return NULL; ++ ++ list_del(tail); ++ return list_object(list, tail); ++} ++ ++static inline void * ++list_head(list_t *list) ++{ ++ if (list_is_empty(list)) ++ return NULL; ++ ++ return list_object(list, list->list_head.next); ++} ++ ++static inline void * ++list_tail(list_t *list) ++{ ++ if (list_is_empty(list)) ++ return NULL; ++ ++ return list_object(list, list->list_head.prev); ++} ++ ++static inline void * ++list_next(list_t *list, void *object) ++{ ++ list_node_t *node = list_d2l(list, object); ++ ++ if (node->next != &list->list_head) ++ return list_object(list, node->next); ++ ++ return NULL; ++} ++ ++static inline void * ++list_prev(list_t *list, void *object) ++{ ++ list_node_t *node = list_d2l(list, object); ++ ++ if (node->prev != &list->list_head) ++ return list_object(list, node->prev); ++ ++ return NULL; ++} ++ ++static inline int ++list_link_active(list_node_t *node) ++{ ++ return (node->next != LIST_POISON1) && (node->prev != LIST_POISON2); ++} ++ ++static inline void ++spl_list_move_tail(list_t *dst, list_t *src) ++{ ++ list_splice_init(&src->list_head, dst->list_head.prev); ++} ++ ++#define list_move_tail(dst, src) spl_list_move_tail(dst, src) ++ ++static inline void ++list_link_replace(list_node_t *old_node, list_node_t *new_node) ++{ ++ ASSERT(list_link_active(old_node)); ++ ASSERT(!list_link_active(new_node)); ++ ++ new_node->next = old_node->next; ++ new_node->prev = old_node->prev; ++ old_node->prev->next = new_node; ++ old_node->next->prev = new_node; ++ list_link_init(old_node); ++} ++ ++#endif /* SPL_LIST_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/mkdev.h linux-3.2.33-go/include/spl/sys/mkdev.h +--- linux-3.2.33-go.orig/include/spl/sys/mkdev.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/mkdev.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
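For illustration, a consumer of the Solaris list API above: the object embeds a list_node_t and list_create() is told the object size and the node's offset. item_t is a hypothetical type, and kmem_zalloc()/kmem_free()/KM_SLEEP come from sys/kmem.h as before:

        typedef struct item {
                int             i_value;
                list_node_t     i_node;
        } item_t;

        static void
        item_list_example(void)
        {
                list_t list;
                item_t *ip;

                list_create(&list, sizeof (item_t), offsetof(item_t, i_node));

                ip = kmem_zalloc(sizeof (item_t), KM_SLEEP);
                list_link_init(&ip->i_node);
                list_insert_tail(&list, ip);

                /* Walk the list Solaris-style; iteration yields objects,
                 * not nodes. */
                for (ip = list_head(&list); ip != NULL;
                    ip = list_next(&list, ip))
                        ip->i_value++;

                while ((ip = list_remove_head(&list)) != NULL)
                        kmem_free(ip, sizeof (item_t));

                list_destroy(&list);
        }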
++\*****************************************************************************/ ++ ++#ifndef _SPL_MKDEV_H ++#define _SPL_MKDEV_H ++ ++#endif /* SPL_MKDEV_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/mntent.h linux-3.2.33-go/include/spl/sys/mntent.h +--- linux-3.2.33-go.orig/include/spl/sys/mntent.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/mntent.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_MNTENT_H ++#define _SPL_MNTENT_H ++ ++#endif /* SPL_MNTENT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/modctl.h linux-3.2.33-go/include/spl/sys/modctl.h +--- linux-3.2.33-go.orig/include/spl/sys/modctl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/modctl.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_MODCTL_H ++#define _SPL_MODCTL_H ++ ++#endif /* SPL_MODCTL_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/mode.h linux-3.2.33-go/include/spl/sys/mode.h +--- linux-3.2.33-go.orig/include/spl/sys/mode.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/mode.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,32 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_MODE_H ++#define _SPL_MODE_H ++ ++#define IFTOVT(mode) vn_mode_to_vtype(mode) ++#define VTTOIF(vtype) vn_vtype_to_mode(vtype) ++#define MAKEIMODE(T, M) (VTTOIF(T) | ((M) & ~S_IFMT)) ++ ++#endif /* SPL_MODE_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/mount.h linux-3.2.33-go/include/spl/sys/mount.h +--- linux-3.2.33-go.orig/include/spl/sys/mount.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/mount.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_MOUNT_H ++#define _SPL_MOUNT_H ++ ++#endif /* SPL_MOUNT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/mutex.h linux-3.2.33-go/include/spl/sys/mutex.h +--- linux-3.2.33-go.orig/include/spl/sys/mutex.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/mutex.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,218 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_MUTEX_H ++#define _SPL_MUTEX_H ++ ++#include ++#include ++#include ++ ++typedef enum { ++ MUTEX_DEFAULT = 0, ++ MUTEX_SPIN = 1, ++ MUTEX_ADAPTIVE = 2 ++} kmutex_type_t; ++ ++#if defined(HAVE_MUTEX_OWNER) && defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) ++ ++/* ++ * We define a 1-field struct rather than a straight typedef to enforce type ++ * safety. 
++ */ ++typedef struct { ++ struct mutex m; ++} kmutex_t; ++ ++static inline kthread_t * ++mutex_owner(kmutex_t *mp) ++{ ++#if defined(HAVE_MUTEX_OWNER_TASK_STRUCT) ++ return ACCESS_ONCE(mp->m.owner); ++#else ++ struct thread_info *owner = ACCESS_ONCE(mp->m.owner); ++ if (owner) ++ return owner->task; ++ ++ return NULL; ++#endif ++} ++ ++#define mutex_owned(mp) (mutex_owner(mp) == current) ++#define MUTEX_HELD(mp) mutex_owned(mp) ++#define MUTEX_NOT_HELD(mp) (!MUTEX_HELD(mp)) ++#undef mutex_init ++#define mutex_init(mp, name, type, ibc) \ ++({ \ ++ static struct lock_class_key __key; \ ++ ASSERT(type == MUTEX_DEFAULT); \ ++ \ ++ __mutex_init(&(mp)->m, #mp, &__key); \ ++}) ++ ++#undef mutex_destroy ++#define mutex_destroy(mp) \ ++({ \ ++ VERIFY3P(mutex_owner(mp), ==, NULL); \ ++}) ++ ++#define mutex_tryenter(mp) mutex_trylock(&(mp)->m) ++#define mutex_enter(mp) \ ++({ \ ++ ASSERT3P(mutex_owner(mp), !=, current); \ ++ mutex_lock(&(mp)->m); \ ++ }) ++#define mutex_exit(mp) mutex_unlock(&(mp)->m) ++ ++#ifdef HAVE_GPL_ONLY_SYMBOLS ++# define mutex_enter_nested(mp, sc) mutex_lock_nested(&(mp)->m, sc) ++#else ++# define mutex_enter_nested(mp, sc) mutex_enter(mp) ++#endif /* HAVE_GPL_ONLY_SYMBOLS */ ++ ++#else /* HAVE_MUTEX_OWNER */ ++ ++typedef struct { ++ struct mutex m_mutex; ++ kthread_t *m_owner; ++} kmutex_t; ++ ++#ifdef HAVE_TASK_CURR ++extern int spl_mutex_spin_max(void); ++#else /* HAVE_TASK_CURR */ ++# define task_curr(owner) 0 ++# define spl_mutex_spin_max() 0 ++#endif /* HAVE_TASK_CURR */ ++ ++#define MUTEX(mp) (&((mp)->m_mutex)) ++ ++static inline void ++spl_mutex_set_owner(kmutex_t *mp) ++{ ++ mp->m_owner = current; ++} ++ ++static inline void ++spl_mutex_clear_owner(kmutex_t *mp) ++{ ++ mp->m_owner = NULL; ++} ++ ++#define mutex_owner(mp) (ACCESS_ONCE((mp)->m_owner)) ++#define mutex_owned(mp) (mutex_owner(mp) == current) ++#define MUTEX_HELD(mp) mutex_owned(mp) ++#define MUTEX_NOT_HELD(mp) (!MUTEX_HELD(mp)) ++ ++/* ++ * The following functions must be a #define and not static inline. ++ * This ensures that the native linux mutex functions (lock/unlock) ++ * will be correctly located in the users code which is important ++ * for the built in kernel lock analysis tools ++ */ ++#undef mutex_init ++#define mutex_init(mp, name, type, ibc) \ ++({ \ ++ static struct lock_class_key __key; \ ++ ASSERT(type == MUTEX_DEFAULT); \ ++ \ ++ __mutex_init(MUTEX(mp), #mp, &__key); \ ++ spl_mutex_clear_owner(mp); \ ++}) ++ ++#undef mutex_destroy ++#define mutex_destroy(mp) \ ++({ \ ++ VERIFY3P(mutex_owner(mp), ==, NULL); \ ++}) ++ ++#define mutex_tryenter(mp) \ ++({ \ ++ int _rc_; \ ++ \ ++ if ((_rc_ = mutex_trylock(MUTEX(mp))) == 1) \ ++ spl_mutex_set_owner(mp); \ ++ \ ++ _rc_; \ ++}) ++ ++/* ++ * Adaptive mutexs assume that the lock may be held by a task running ++ * on a different cpu. The expectation is that the task will drop the ++ * lock before leaving the head of the run queue. So the ideal thing ++ * to do is spin until we acquire the lock and avoid a context switch. ++ * However it is also possible the task holding the lock yields the ++ * processor with out dropping lock. In this case, we know it's going ++ * to be a while so we stop spinning and go to sleep waiting for the ++ * lock to be available. This should strike the optimum balance ++ * between spinning and sleeping waiting for a lock. 
++ */ ++#define mutex_enter(mp) \ ++({ \ ++ kthread_t *_owner_; \ ++ int _rc_, _count_; \ ++ \ ++ _rc_ = 0; \ ++ _count_ = 0; \ ++ _owner_ = mutex_owner(mp); \ ++ ASSERT3P(_owner_, !=, current); \ ++ \ ++ while (_owner_ && task_curr(_owner_) && \ ++ _count_ <= spl_mutex_spin_max()) { \ ++ if ((_rc_ = mutex_trylock(MUTEX(mp)))) \ ++ break; \ ++ \ ++ _count_++; \ ++ } \ ++ \ ++ if (!_rc_) \ ++ mutex_lock(MUTEX(mp)); \ ++ \ ++ spl_mutex_set_owner(mp); \ ++}) ++ ++#define mutex_exit(mp) \ ++({ \ ++ spl_mutex_clear_owner(mp); \ ++ mutex_unlock(MUTEX(mp)); \ ++}) ++ ++#ifdef HAVE_GPL_ONLY_SYMBOLS ++# define mutex_enter_nested(mp, sc) \ ++({ \ ++ mutex_lock_nested(MUTEX(mp), sc); \ ++ spl_mutex_set_owner(mp); \ ++}) ++#else ++# define mutex_enter_nested(mp, sc) \ ++({ \ ++ mutex_enter(mp); \ ++}) ++#endif ++ ++#endif /* HAVE_MUTEX_OWNER */ ++ ++int spl_mutex_init(void); ++void spl_mutex_fini(void); ++ ++#endif /* _SPL_MUTEX_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/note.h linux-3.2.33-go/include/spl/sys/note.h +--- linux-3.2.33-go.orig/include/spl/sys/note.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/note.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_NOTE_H ++#define _SPL_NOTE_H ++ ++#endif /* SPL_NOTE_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/open.h linux-3.2.33-go/include/spl/sys/open.h +--- linux-3.2.33-go.orig/include/spl/sys/open.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/open.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. 
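For illustration, how ZFS-style code consumes the kmutex_t wrapper in sys/mutex.h above; counter_t is a hypothetical type:

        typedef struct counter {
                kmutex_t        c_lock;
                uint64_t        c_value;
        } counter_t;

        static void
        counter_example(counter_t *cp)
        {
                mutex_init(&cp->c_lock, NULL, MUTEX_DEFAULT, NULL);

                mutex_enter(&cp->c_lock);
                ASSERT(MUTEX_HELD(&cp->c_lock));
                cp->c_value++;
                mutex_exit(&cp->c_lock);

                /* Non-blocking attempt; returns non-zero on success. */
                if (mutex_tryenter(&cp->c_lock)) {
                        cp->c_value++;
                        mutex_exit(&cp->c_lock);
                }

                mutex_destroy(&cp->c_lock);
        }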
++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_OPEN_H ++#define _SPL_OPEN_H ++ ++#endif /* SPL_OPEN_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/param.h linux-3.2.33-go/include/spl/sys/param.h +--- linux-3.2.33-go.orig/include/spl/sys/param.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/param.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,36 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_PARAM_H ++#define _SPL_PARAM_H ++ ++#include ++ ++/* Pages to bytes and back */ ++#define ptob(pages) (pages << PAGE_SHIFT) ++#define btop(bytes) (bytes >> PAGE_SHIFT) ++ ++#define MAXUID UINT32_MAX ++ ++#endif /* SPL_PARAM_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/pathname.h linux-3.2.33-go/include/spl/sys/pathname.h +--- linux-3.2.33-go.orig/include/spl/sys/pathname.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/pathname.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,35 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. 
If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_PATHNAME_H ++#define _SPL_PATHNAME_H ++ ++typedef struct pathname { ++ char *pn_buf; /* underlying storage */ ++ char *pn_path; /* remaining pathname */ ++ size_t pn_pathlen; /* remaining length */ ++ size_t pn_bufsize; /* total size of pn_buf */ ++} pathname_t; ++ ++#endif /* SPL_PATHNAME_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/policy.h linux-3.2.33-go/include/spl/sys/policy.h +--- linux-3.2.33-go.orig/include/spl/sys/policy.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/policy.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,47 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_POLICY_H ++#define _SPL_POLICY_H ++ ++#define secpolicy_fs_unmount(c,vfs) (0) ++#define secpolicy_nfs(c) (0) ++#define secpolicy_sys_config(c,co) (0) ++#define secpolicy_zfs(c) (0) ++#define secpolicy_zinject(c) (0) ++#define secpolicy_vnode_setids_setgids(c,id) (0) ++#define secpolicy_vnode_setid_retain(c, sr) (0) ++#define secpolicy_setid_clear(v, c) (0) ++#define secpolicy_vnode_any_access(c,vp,o) (0) ++#define secpolicy_vnode_access2(c,cp,o,m1,m2) (0) ++#define secpolicy_vnode_chown(c,o) (0) ++#define secpolicy_vnode_setdac(c,o) (0) ++#define secpolicy_vnode_remove(c) (0) ++#define secpolicy_vnode_setattr(c,v,a,o,f,func,n) (0) ++#define secpolicy_xvattr(x, o, c, t) (0) ++#define secpolicy_vnode_stky_modify(c) (0) ++#define secpolicy_setid_setsticky_clear(v,a,o,c) (0) ++#define secpolicy_basic_link(c) (0) ++ ++#endif /* SPL_POLICY_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/pool.h linux-3.2.33-go/include/spl/sys/pool.h +--- linux-3.2.33-go.orig/include/spl/sys/pool.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/pool.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,30 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_POOL_H ++#define _SPL_POOL_H ++ ++#include ++ ++#endif /* SPL_POOL_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/priv_impl.h linux-3.2.33-go/include/spl/sys/priv_impl.h +--- linux-3.2.33-go.orig/include/spl/sys/priv_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/priv_impl.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_PRIV_IMPL_H ++#define _SPL_PRIV_IMPL_H ++ ++#endif /* _SPL_PRIV_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/processor.h linux-3.2.33-go/include/spl/sys/processor.h +--- linux-3.2.33-go.orig/include/spl/sys/processor.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/processor.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,32 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_PROCESSOR_H ++#define _SPL_PROCESSOR_H ++ ++#define getcpuid() smp_processor_id() ++ ++typedef int processorid_t; ++ ++#endif /* _SPL_PROCESSOR_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/proc.h linux-3.2.33-go/include/spl/sys/proc.h +--- linux-3.2.33-go.orig/include/spl/sys/proc.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/proc.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_PROC_H ++#define _SPL_PROC_H ++ ++#endif /* SPL_PROC_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/pset.h linux-3.2.33-go/include/spl/sys/pset.h +--- linux-3.2.33-go.orig/include/spl/sys/pset.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/pset.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,38 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_PSET_H ++#define _SPL_PSET_H ++ ++typedef int psetid_t; ++ ++/* special processor set id's */ ++#define PS_NONE -1 ++#define PS_QUERY -2 ++#define PS_MYID -3 ++#define PS_SOFT -4 ++#define PS_HARD -5 ++#define PS_QUERY_TYPE -6 ++ ++#endif /* SPL_PSET_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/random.h linux-3.2.33-go/include/spl/sys/random.h +--- linux-3.2.33-go.orig/include/spl/sys/random.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/random.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,45 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_RANDOM_H ++#define _SPL_RANDOM_H ++ ++#include ++#include ++ ++static __inline__ int ++random_get_bytes(uint8_t *ptr, size_t len) ++{ ++ get_random_bytes((void *)ptr,(int)len); ++ return 0; ++} ++ ++static __inline__ int ++random_get_pseudo_bytes(uint8_t *ptr, size_t len) ++{ ++ get_random_bytes((void *)ptr,(int)len); ++ return 0; ++} ++ ++#endif /* _SPL_RANDOM_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/refstr.h linux-3.2.33-go/include/spl/sys/refstr.h +--- linux-3.2.33-go.orig/include/spl/sys/refstr.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/refstr.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
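Both random helpers above simply pull from the kernel entropy pool via get_random_bytes() and always return 0. A trivial illustrative wrapper, not part of the patch:

        static uint64_t
        random_u64(void)
        {
                uint64_t val;

                VERIFY(random_get_pseudo_bytes((uint8_t *)&val,
                    sizeof (val)) == 0);

                return (val);
        }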
++\*****************************************************************************/ ++ ++#ifndef _SPL_REFSTR_H ++#define _SPL_REFSTR_H ++ ++#endif /* SPL_REFSTR_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/resource.h linux-3.2.33-go/include/spl/sys/resource.h +--- linux-3.2.33-go.orig/include/spl/sys/resource.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/resource.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,30 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_RESOURCE_H ++#define _SPL_RESOURCE_H ++ ++#include ++ ++#endif /* SPL_RESOURCE_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/rwlock.h linux-3.2.33-go/include/spl/sys/rwlock.h +--- linux-3.2.33-go.orig/include/spl/sys/rwlock.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/rwlock.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,214 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_RWLOCK_H ++#define _SPL_RWLOCK_H ++ ++#include ++#include ++#include ++ ++typedef enum { ++ RW_DRIVER = 2, ++ RW_DEFAULT = 4 ++} krw_type_t; ++ ++typedef enum { ++ RW_NONE = 0, ++ RW_WRITER = 1, ++ RW_READER = 2 ++} krw_t; ++ ++typedef struct { ++ struct rw_semaphore rw_rwlock; ++ kthread_t *rw_owner; ++} krwlock_t; ++ ++#define SEM(rwp) ((struct rw_semaphore *)(rwp)) ++ ++static inline void ++spl_rw_set_owner(krwlock_t *rwp) ++{ ++ unsigned long flags; ++ ++ spl_rwsem_lock_irqsave(&SEM(rwp)->wait_lock, flags); ++ rwp->rw_owner = current; ++ spl_rwsem_unlock_irqrestore(&SEM(rwp)->wait_lock, flags); ++} ++ ++static inline void ++spl_rw_clear_owner(krwlock_t *rwp) ++{ ++ unsigned long flags; ++ ++ spl_rwsem_lock_irqsave(&SEM(rwp)->wait_lock, flags); ++ rwp->rw_owner = NULL; ++ spl_rwsem_unlock_irqrestore(&SEM(rwp)->wait_lock, flags); ++} ++ ++static inline kthread_t * ++rw_owner(krwlock_t *rwp) ++{ ++ unsigned long flags; ++ kthread_t *owner; ++ ++ spl_rwsem_lock_irqsave(&SEM(rwp)->wait_lock, flags); ++ owner = rwp->rw_owner; ++ spl_rwsem_unlock_irqrestore(&SEM(rwp)->wait_lock, flags); ++ ++ return owner; ++} ++ ++static inline int ++RW_READ_HELD(krwlock_t *rwp) ++{ ++ return (spl_rwsem_is_locked(SEM(rwp)) && ++ rw_owner(rwp) == NULL); ++} ++ ++static inline int ++RW_WRITE_HELD(krwlock_t *rwp) ++{ ++ return (spl_rwsem_is_locked(SEM(rwp)) && ++ rw_owner(rwp) == current); ++} ++ ++static inline int ++RW_LOCK_HELD(krwlock_t *rwp) ++{ ++ return spl_rwsem_is_locked(SEM(rwp)); ++} ++ ++/* ++ * The following functions must be a #define and not static inline. ++ * This ensures that the native linux semaphore functions (down/up) ++ * will be correctly located in the users code which is important ++ * for the built in kernel lock analysis tools ++ */ ++#define rw_init(rwp, name, type, arg) \ ++({ \ ++ static struct lock_class_key __key; \ ++ \ ++ __init_rwsem(SEM(rwp), #rwp, &__key); \ ++ spl_rw_clear_owner(rwp); \ ++}) ++ ++#define rw_destroy(rwp) \ ++({ \ ++ VERIFY(!RW_LOCK_HELD(rwp)); \ ++}) ++ ++#define rw_tryenter(rwp, rw) \ ++({ \ ++ int _rc_ = 0; \ ++ \ ++ switch (rw) { \ ++ case RW_READER: \ ++ _rc_ = down_read_trylock(SEM(rwp)); \ ++ break; \ ++ case RW_WRITER: \ ++ if ((_rc_ = down_write_trylock(SEM(rwp)))) \ ++ spl_rw_set_owner(rwp); \ ++ break; \ ++ default: \ ++ VERIFY(0); \ ++ } \ ++ _rc_; \ ++}) ++ ++#define rw_enter(rwp, rw) \ ++({ \ ++ switch (rw) { \ ++ case RW_READER: \ ++ down_read(SEM(rwp)); \ ++ break; \ ++ case RW_WRITER: \ ++ down_write(SEM(rwp)); \ ++ spl_rw_set_owner(rwp); \ ++ break; \ ++ default: \ ++ VERIFY(0); \ ++ } \ ++}) ++ ++#define rw_exit(rwp) \ ++({ \ ++ if (RW_WRITE_HELD(rwp)) { \ ++ spl_rw_clear_owner(rwp); \ ++ up_write(SEM(rwp)); \ ++ } else { \ ++ ASSERT(RW_READ_HELD(rwp)); \ ++ up_read(SEM(rwp)); \ ++ } \ ++}) ++ ++#define rw_downgrade(rwp) \ ++({ \ ++ spl_rw_clear_owner(rwp); \ ++ downgrade_write(SEM(rwp)); \ ++}) ++ ++#if defined(CONFIG_RWSEM_GENERIC_SPINLOCK) ++/* ++ * For the generic implementations of rw-semaphores the following is ++ * true. If your semaphore implementation internally represents the ++ * semaphore state differently then special case handling is required. 
++ * - if activity/count is 0 then there are no active readers or writers ++ * - if activity/count is +ve then that is the number of active readers ++ * - if activity/count is -1 then there is one active writer ++ */ ++ ++extern void __up_read_locked(struct rw_semaphore *); ++extern int __down_write_trylock_locked(struct rw_semaphore *); ++ ++#define rw_tryupgrade(rwp) \ ++({ \ ++ unsigned long _flags_; \ ++ int _rc_ = 0; \ ++ \ ++ spl_rwsem_lock_irqsave(&SEM(rwp)->wait_lock, _flags_); \ ++ if ((list_empty(&SEM(rwp)->wait_list)) && \ ++ (SEM(rwp)->activity == 1)) { \ ++ __up_read_locked(SEM(rwp)); \ ++ VERIFY(_rc_ = __down_write_trylock_locked(SEM(rwp))); \ ++ (rwp)->rw_owner = current; \ ++ } \ ++ spl_rwsem_unlock_irqrestore(&SEM(rwp)->wait_lock, _flags_); \ ++ _rc_; \ ++}) ++#else ++/* ++ * rw_tryupgrade() can be implemented correctly but for each supported ++ * arch we will need a custom implementation. For the x86 implementation ++ * it looks like a custom cmpxchg() to atomically check and promote the ++ * rwsem would be safe. For now that's not worth the trouble so in this ++ * case rw_tryupgrade() has just been disabled. ++ */ ++#define rw_tryupgrade(rwp) ({ 0; }) ++#endif ++ ++int spl_rw_init(void); ++void spl_rw_fini(void); ++ ++#endif /* _SPL_RWLOCK_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/sdt.h linux-3.2.33-go/include/spl/sys/sdt.h +--- linux-3.2.33-go.orig/include/spl/sys/sdt.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/sdt.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SDT_H ++#define _SPL_SDT_H ++ ++#endif /* SPL_SDT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/sid.h linux-3.2.33-go/include/spl/sys/sid.h +--- linux-3.2.33-go.orig/include/spl/sys/sid.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/sid.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,61 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SID_H ++#define _SPL_SID_H ++ ++typedef struct ksiddomain { ++ char *kd_name; ++} ksiddomain_t; ++ ++typedef enum ksid_index { ++ KSID_USER, ++ KSID_GROUP, ++ KSID_OWNER, ++ KSID_COUNT ++} ksid_index_t; ++ ++typedef int ksid_t; ++ ++static inline ksiddomain_t * ++ksid_lookupdomain(const char *dom) ++{ ++ ksiddomain_t *kd; ++ int len = strlen(dom); ++ ++ kd = kmem_zalloc(sizeof(ksiddomain_t), KM_SLEEP); ++ kd->kd_name = kmem_zalloc(len + 1, KM_SLEEP); ++ memcpy(kd->kd_name, dom, len); ++ ++ return (kd); ++} ++ ++static inline void ++ksiddomain_rele(ksiddomain_t *ksid) ++{ ++ kmem_free(ksid->kd_name, strlen(ksid->kd_name) + 1); ++ kmem_free(ksid, sizeof(ksiddomain_t)); ++} ++ ++#endif /* _SPL_SID_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/signal.h linux-3.2.33-go/include/spl/sys/signal.h +--- linux-3.2.33-go.orig/include/spl/sys/signal.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/signal.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,50 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SIGNAL_H ++#define _SPL_SIGNAL_H ++ ++#include ++ ++#define FORREAL 0 /* Usual side-effects */ ++#define JUSTLOOKING 1 /* Don't stop the process */ ++ ++/* The "why" argument indicates the allowable side-effects of the call: ++ * ++ * FORREAL: Extract the next pending signal from p_sig into p_cursig; ++ * stop the process if a stop has been requested or if a traced signal ++ * is pending. ++ * ++ * JUSTLOOKING: Don't stop the process, just indicate whether or not ++ * a signal might be pending (FORREAL is needed to tell for sure). 
++ */ ++static __inline__ int ++issig(int why) ++{ ++ ASSERT(why == FORREAL || why == JUSTLOOKING); ++ ++ return signal_pending(current); ++} ++ ++#endif /* SPL_SIGNAL_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/stat.h linux-3.2.33-go/include/spl/sys/stat.h +--- linux-3.2.33-go.orig/include/spl/sys/stat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/stat.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,30 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_STAT_H ++#define _SPL_STAT_H ++ ++#include ++ ++#endif /* SPL_STAT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/stropts.h linux-3.2.33-go/include/spl/sys/stropts.h +--- linux-3.2.33-go.orig/include/spl/sys/stropts.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/stropts.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_STROPTS_H ++#define _SPL_STROPTS_H ++ ++#endif /* SPL_STROPTS_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/sunddi.h linux-3.2.33-go/include/spl/sys/sunddi.h +--- linux-3.2.33-go.orig/include/spl/sys/sunddi.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/sunddi.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,60 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SUNDDI_H ++#define _SPL_SUNDDI_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++typedef int ddi_devid_t; ++ ++#define DDI_DEV_T_NONE ((dev_t)-1) ++#define DDI_DEV_T_ANY ((dev_t)-2) ++#define DI_MAJOR_T_UNKNOWN ((major_t)0) ++ ++#define DDI_PROP_DONTPASS 0x0001 ++#define DDI_PROP_CANSLEEP 0x0002 ++ ++#define DDI_SUCCESS 0 ++#define DDI_FAILURE -1 ++ ++#define ddi_prop_lookup_string(x1,x2,x3,x4,x5) (*x5 = NULL) ++#define ddi_prop_free(x) (void)0 ++#define ddi_root_node() (void)0 ++ ++extern int ddi_strtoul(const char *, char **, int, unsigned long *); ++extern int ddi_strtol(const char *, char **, int, long *); ++extern int ddi_strtoull(const char *, char **, int, unsigned long long *); ++extern int ddi_strtoll(const char *, char **, int, long long *); ++ ++extern int ddi_copyin(const void *from, void *to, size_t len, int flags); ++extern int ddi_copyout(const void *from, void *to, size_t len, int flags); ++ ++#endif /* SPL_SUNDDI_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/sunldi.h linux-3.2.33-go/include/spl/sys/sunldi.h +--- linux-3.2.33-go.orig/include/spl/sys/sunldi.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/sunldi.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,56 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SUNLDI_H ++#define _SPL_SUNLDI_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define SECTOR_SIZE 512 ++ ++typedef struct modlinkage { ++ int ml_rev; ++ struct modlfs *ml_modlfs; ++ struct modldrv *ml_modldrv; ++ major_t ml_major; ++ unsigned ml_minors; ++ void *pad1; ++} modlinkage_t; ++ ++typedef struct ldi_ident { ++ char li_modname[MAXNAMELEN]; ++ dev_t li_dev; ++} *ldi_ident_t; ++ ++typedef struct block_device *ldi_handle_t; ++ ++extern int ldi_ident_from_mod(struct modlinkage *modlp, ldi_ident_t *lip); ++extern void ldi_ident_release(ldi_ident_t li); ++ ++#endif /* SPL_SUNLDI_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/sysdc.h linux-3.2.33-go/include/spl/sys/sysdc.h +--- linux-3.2.33-go.orig/include/spl/sys/sysdc.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/sysdc.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SYSDC_H ++#define _SPL_SYSDC_H ++ ++#endif /* SPL_SYSDC_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/sysevent/eventdefs.h linux-3.2.33-go/include/spl/sys/sysevent/eventdefs.h +--- linux-3.2.33-go.orig/include/spl/sys/sysevent/eventdefs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/sysevent/eventdefs.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . 
++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SYSEVENT_EVENTDEFS_H ++#define _SPL_SYSEVENT_EVENTDEFS_H ++ ++#endif /* _SPL_SYSEVENT_EVENTDEFS_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/sysevent.h linux-3.2.33-go/include/spl/sys/sysevent.h +--- linux-3.2.33-go.orig/include/spl/sys/sysevent.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/sysevent.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SYSEVENT_H ++#define _SPL_SYSEVENT_H ++ ++#endif /* _SPL_SYSEVENT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/sysmacros.h linux-3.2.33-go/include/spl/sys/sysmacros.h +--- linux-3.2.33-go.orig/include/spl/sys/sysmacros.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/sysmacros.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,217 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. 
++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SYSMACROS_H ++#define _SPL_SYSMACROS_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifndef _KERNEL ++#define _KERNEL __KERNEL__ ++#endif ++ ++#define FALSE 0 ++#define TRUE 1 ++ ++#define INT8_MAX (127) ++#define INT8_MIN (-128) ++#define UINT8_MAX (255) ++#define UINT8_MIN (0) ++ ++#define INT16_MAX (32767) ++#define INT16_MIN (-32768) ++#define UINT16_MAX (65535) ++#define UINT16_MIN (0) ++ ++#define INT32_MAX INT_MAX ++#define INT32_MIN INT_MIN ++#define UINT32_MAX UINT_MAX ++#define UINT32_MIN UINT_MIN ++ ++#define INT64_MAX LLONG_MAX ++#define INT64_MIN LLONG_MIN ++#define UINT64_MAX ULLONG_MAX ++#define UINT64_MIN ULLONG_MIN ++ ++#define NBBY 8 ++#define ENOTSUP EOPNOTSUPP ++ ++#define MAXMSGLEN 256 ++#define MAXNAMELEN 256 ++#define MAXPATHLEN PATH_MAX ++#define MAXOFFSET_T LLONG_MAX ++#define MAXBSIZE 8192 ++#define DEV_BSIZE 512 ++#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ ++ ++#define proc_pageout NULL ++#define curproc current ++#define max_ncpus num_possible_cpus() ++#define CPU_SEQID smp_processor_id() ++#define _NOTE(x) ++#define is_system_labeled() 0 ++ ++#ifndef RLIM64_INFINITY ++#define RLIM64_INFINITY (~0ULL) ++#endif ++ ++/* 0..MAX_PRIO-1: Process priority ++ * 0..MAX_RT_PRIO-1: RT priority tasks ++ * MAX_RT_PRIO..MAX_PRIO-1: SCHED_NORMAL tasks ++ * ++ * Treat shim tasks as SCHED_NORMAL tasks ++ */ ++#define minclsyspri (MAX_RT_PRIO) ++#define maxclsyspri (MAX_PRIO-1) ++ ++#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) ++#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) ++ ++/* Missing macros ++ */ ++#define PAGESIZE PAGE_SIZE ++ ++/* from Solaris sys/byteorder.h */ ++#define BSWAP_8(x) ((x) & 0xff) ++#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) ++#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) ++#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) ++ ++/* Map some simple functions. 
++ */ ++#define bzero(ptr,size) memset(ptr,0,size) ++#define bcopy(src,dest,size) memmove(dest,src,size) ++#define bcmp(src,dest,size) memcmp((src), (dest), (size_t)(size)) ++ ++/* Dtrace probes do not exist in the linux kernel */ ++#ifdef DTRACE_PROBE ++#undef DTRACE_PROBE ++#endif /* DTRACE_PROBE */ ++#define DTRACE_PROBE(a) ((void)0) ++ ++#ifdef DTRACE_PROBE1 ++#undef DTRACE_PROBE1 ++#endif /* DTRACE_PROBE1 */ ++#define DTRACE_PROBE1(a, b, c) ((void)0) ++ ++#ifdef DTRACE_PROBE2 ++#undef DTRACE_PROBE2 ++#endif /* DTRACE_PROBE2 */ ++#define DTRACE_PROBE2(a, b, c, d, e) ((void)0) ++ ++#ifdef DTRACE_PROBE3 ++#undef DTRACE_PROBE3 ++#endif /* DTRACE_PROBE3 */ ++#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0) ++ ++#ifdef DTRACE_PROBE4 ++#undef DTRACE_PROBE4 ++#endif /* DTRACE_PROBE4 */ ++#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0) ++ ++/* Missing globals */ ++extern char spl_version[32]; ++extern unsigned long spl_hostid; ++extern char hw_serial[11]; ++ ++/* Missing misc functions */ ++extern int highbit(unsigned long i); ++extern uint32_t zone_get_hostid(void *zone); ++extern void spl_setup(void); ++extern void spl_cleanup(void); ++ ++#define makedevice(maj,min) makedev(maj,min) ++ ++/* common macros */ ++#ifndef MIN ++#define MIN(a, b) ((a) < (b) ? (a) : (b)) ++#endif ++#ifndef MAX ++#define MAX(a, b) ((a) < (b) ? (b) : (a)) ++#endif ++#ifndef ABS ++#define ABS(a) ((a) < 0 ? -(a) : (a)) ++#endif ++#ifndef DIV_ROUND_UP ++#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) ++#endif ++#ifndef roundup ++#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) ++#endif ++ ++/* ++ * Compatibility macros/typedefs needed for Solaris -> Linux port ++ */ ++#define P2ALIGN(x, align) ((x) & -(align)) ++#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) ++#define P2ROUNDUP(x, align) (-(-(x) & -(align))) ++#define P2PHASE(x, align) ((x) & ((align) - 1)) ++#define P2NPHASE(x, align) (-(x) & ((align) - 1)) ++#define ISP2(x) (((x) & ((x) - 1)) == 0) ++#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1))==0) ++#define P2BOUNDARY(off, len, align) \ ++ (((off) ^ ((off) + (len) - 1)) > (align) - 1) ++ ++/* ++ * Typed version of the P2* macros. These macros should be used to ensure ++ * that the result is correctly calculated based on the data type of (x), ++ * which is passed in as the last argument, regardless of the data ++ * type of the alignment. 
For example, if (x) is of type uint64_t, ++ * and we want to round it up to a page boundary using "PAGESIZE" as ++ * the alignment, we can do either ++ * ++ * P2ROUNDUP(x, (uint64_t)PAGESIZE) ++ * or ++ * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) ++ */ ++#define P2ALIGN_TYPED(x, align, type) \ ++ ((type)(x) & -(type)(align)) ++#define P2PHASE_TYPED(x, align, type) \ ++ ((type)(x) & ((type)(align) - 1)) ++#define P2NPHASE_TYPED(x, align, type) \ ++ (-(type)(x) & ((type)(align) - 1)) ++#define P2ROUNDUP_TYPED(x, align, type) \ ++ (-(-(type)(x) & -(type)(align))) ++#define P2END_TYPED(x, align, type) \ ++ (-(~(type)(x) & -(type)(align))) ++#define P2PHASEUP_TYPED(x, align, phase, type) \ ++ ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) ++#define P2CROSS_TYPED(x, y, align, type) \ ++ (((type)(x) ^ (type)(y)) > (type)(align) - 1) ++#define P2SAMEHIGHBIT_TYPED(x, y, type) \ ++ (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) ++ ++#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof) ++ ++/* avoid any possibility of clashing with version */ ++ ++#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) ++#endif ++ ++#endif /* _SPL_SYSMACROS_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/systeminfo.h linux-3.2.33-go/include/spl/sys/systeminfo.h +--- linux-3.2.33-go.orig/include/spl/sys/systeminfo.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/systeminfo.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,37 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SYSTEMINFO_H ++#define _SPL_SYSTEMINFO_H ++ ++#define HW_INVALID_HOSTID 0xFFFFFFFF /* an invalid hostid */ ++#define HW_HOSTID_LEN 11 /* minimum buffer size needed */ ++ /* to hold a decimal or hex */ ++ /* hostid string */ ++ ++/* Supplemental definitions for Linux. */ ++#define HW_HOSTID_PATH "/etc/hostid" /* binary configuration file */ ++#define HW_HOSTID_MASK 0xFFFFFFFF /* significant hostid bits */ ++ ++#endif /* SPL_SYSTEMINFO_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/systm.h linux-3.2.33-go/include/spl/sys/systm.h +--- linux-3.2.33-go.orig/include/spl/sys/systm.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/systm.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,32 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. 
++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SYSTM_H ++#define _SPL_SYSTM_H ++ ++#include ++ ++typedef uintptr_t pc_t; ++ ++#endif /* SPL_SYSTM_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/taskq.h linux-3.2.33-go/include/spl/sys/taskq.h +--- linux-3.2.33-go.orig/include/spl/sys/taskq.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/taskq.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,133 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_TASKQ_H ++#define _SPL_TASKQ_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define TASKQ_NAMELEN 31 ++ ++#define TASKQ_PREPOPULATE 0x00000001 ++#define TASKQ_CPR_SAFE 0x00000002 ++#define TASKQ_DYNAMIC 0x00000004 ++#define TASKQ_THREADS_CPU_PCT 0x00000008 ++#define TASKQ_DC_BATCH 0x00000010 ++ ++typedef unsigned long taskqid_t; ++typedef void (task_func_t)(void *); ++ ++typedef struct taskq_ent { ++ spinlock_t tqent_lock; ++ struct list_head tqent_list; ++ taskqid_t tqent_id; ++ task_func_t *tqent_func; ++ void *tqent_arg; ++ uintptr_t tqent_flags; ++} taskq_ent_t; ++ ++#define TQENT_FLAG_PREALLOC 0x1 ++ ++/* ++ * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as ++ * KM_SLEEP/KM_NOSLEEP. TQ_NOQUEUE/TQ_NOALLOC are set particularly ++ * large so as not to conflict with already used GFP_* defines. 
++ */ ++#define TQ_SLEEP 0x00000000 ++#define TQ_NOSLEEP 0x00000001 ++#define TQ_PUSHPAGE 0x00000002 ++#define TQ_NOQUEUE 0x01000000 ++#define TQ_NOALLOC 0x02000000 ++#define TQ_NEW 0x04000000 ++#define TQ_FRONT 0x08000000 ++#define TQ_ACTIVE 0x80000000 ++ ++typedef struct taskq { ++ spinlock_t tq_lock; /* protects taskq_t */ ++ unsigned long tq_lock_flags; /* interrupt state */ ++ const char *tq_name; /* taskq name */ ++ struct list_head tq_thread_list;/* list of all threads */ ++ struct list_head tq_active_list;/* list of active threads */ ++ int tq_nactive; /* # of active threads */ ++ int tq_nthreads; /* # of total threads */ ++ int tq_pri; /* priority */ ++ int tq_minalloc; /* min task_t pool size */ ++ int tq_maxalloc; /* max task_t pool size */ ++ int tq_nalloc; /* cur task_t pool size */ ++ uint_t tq_flags; /* flags */ ++ taskqid_t tq_next_id; /* next pend/work id */ ++ taskqid_t tq_lowest_id; /* lowest pend/work id */ ++ struct list_head tq_free_list; /* free task_t's */ ++ struct list_head tq_pend_list; /* pending task_t's */ ++ struct list_head tq_prio_list; /* priority pending task_t's */ ++ wait_queue_head_t tq_work_waitq; /* new work waitq */ ++ wait_queue_head_t tq_wait_waitq; /* wait waitq */ ++} taskq_t; ++ ++typedef struct taskq_thread { ++ struct list_head tqt_thread_list; ++ struct list_head tqt_active_list; ++ struct task_struct *tqt_thread; ++ taskq_t *tqt_tq; ++ taskqid_t tqt_id; ++ uintptr_t tqt_flags; ++} taskq_thread_t; ++ ++/* Global system-wide dynamic task queue available for all consumers */ ++extern taskq_t *system_taskq; ++ ++extern taskqid_t __taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); ++extern void __taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); ++extern int __taskq_empty_ent(taskq_ent_t *); ++extern void __taskq_init_ent(taskq_ent_t *); ++extern taskq_t *__taskq_create(const char *, int, pri_t, int, int, uint_t); ++extern void __taskq_destroy(taskq_t *); ++extern void __taskq_wait_id(taskq_t *, taskqid_t); ++extern void __taskq_wait(taskq_t *); ++extern int __taskq_member(taskq_t *, void *); ++ ++int spl_taskq_init(void); ++void spl_taskq_fini(void); ++ ++#define taskq_member(tq, t) __taskq_member(tq, t) ++#define taskq_wait_id(tq, id) __taskq_wait_id(tq, id) ++#define taskq_wait(tq) __taskq_wait(tq) ++#define taskq_dispatch(tq, f, p, fl) __taskq_dispatch(tq, f, p, fl) ++#define taskq_dispatch_ent(tq, f, p, fl, t) __taskq_dispatch_ent(tq, f, p, fl, t) ++#define taskq_empty_ent(t) __taskq_empty_ent(t) ++#define taskq_init_ent(t) __taskq_init_ent(t) ++#define taskq_create(n, th, p, mi, ma, fl) __taskq_create(n, th, p, mi, ma, fl) ++#define taskq_create_proc(n, th, p, mi, ma, pr, fl) \ ++ __taskq_create(n, th, p, mi, ma, fl) ++#define taskq_create_sysdc(n, th, mi, ma, pr, dc, fl) \ ++ __taskq_create(n, th, maxclsyspri, mi, ma, fl) ++#define taskq_destroy(tq) __taskq_destroy(tq) ++ ++#endif /* _SPL_TASKQ_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/thread.h linux-3.2.33-go/include/spl/sys/thread.h +--- linux-3.2.33-go.orig/include/spl/sys/thread.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/thread.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,61 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . 
++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_THREAD_H ++#define _SPL_THREAD_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Thread interfaces ++ */ ++#define TP_MAGIC 0x53535353 ++ ++#define TS_SLEEP TASK_INTERRUPTIBLE ++#define TS_RUN TASK_RUNNING ++#define TS_ZOMB EXIT_ZOMBIE ++#define TS_STOPPED TASK_STOPPED ++ ++typedef void (*thread_func_t)(void *); ++ ++#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ ++ __thread_create(stk, stksize, (thread_func_t)func, \ ++ #func, arg, len, pp, state, pri) ++#define thread_exit() __thread_exit() ++#define thread_join(t) VERIFY(0) ++#define curthread current ++ ++extern kthread_t *__thread_create(caddr_t stk, size_t stksize, ++ thread_func_t func, const char *name, ++ void *args, size_t len, proc_t *pp, ++ int state, pri_t pri); ++extern void __thread_exit(void); ++ ++#endif /* _SPL_THREAD_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/time.h linux-3.2.33-go/include/spl/sys/time.h +--- linux-3.2.33-go.orig/include/spl/sys/time.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/time.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,93 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_TIME_H ++#define _SPL_TIME_H ++ ++/* ++ * Structure returned by gettimeofday(2) system call, ++ * and used in other calls. 
++ */ ++#include ++#include ++#include ++#include ++ ++#if defined(CONFIG_64BIT) ++#define TIME_MAX INT64_MAX ++#define TIME_MIN INT64_MIN ++#else ++#define TIME_MAX INT32_MAX ++#define TIME_MIN INT32_MIN ++#endif ++ ++#define SEC 1 ++#define MILLISEC 1000 ++#define MICROSEC 1000000 ++#define NANOSEC 1000000000 ++ ++/* Already defined in include/linux/time.h */ ++#undef CLOCK_THREAD_CPUTIME_ID ++#undef CLOCK_REALTIME ++#undef CLOCK_MONOTONIC ++#undef CLOCK_PROCESS_CPUTIME_ID ++ ++typedef enum clock_type { ++ __CLOCK_REALTIME0 = 0, /* obsolete; same as CLOCK_REALTIME */ ++ CLOCK_VIRTUAL = 1, /* thread's user-level CPU clock */ ++ CLOCK_THREAD_CPUTIME_ID = 2, /* thread's user+system CPU clock */ ++ CLOCK_REALTIME = 3, /* wall clock */ ++ CLOCK_MONOTONIC = 4, /* high resolution monotonic clock */ ++ CLOCK_PROCESS_CPUTIME_ID = 5, /* process's user+system CPU clock */ ++ CLOCK_HIGHRES = CLOCK_MONOTONIC, /* alternate name */ ++ CLOCK_PROF = CLOCK_THREAD_CPUTIME_ID,/* alternate name */ ++} clock_type_t; ++ ++#define hz \ ++({ \ ++ ASSERT(HZ >= 100 && HZ <= MICROSEC); \ ++ HZ; \ ++}) ++ ++extern void __gethrestime(timestruc_t *); ++extern int __clock_gettime(clock_type_t, timespec_t *); ++extern hrtime_t __gethrtime(void); ++ ++#define gethrestime(ts) __gethrestime(ts) ++#define clock_gettime(fl, tp) __clock_gettime(fl, tp) ++#define gethrtime() __gethrtime() ++ ++static __inline__ time_t ++gethrestime_sec(void) ++{ ++ timestruc_t now; ++ ++ __gethrestime(&now); ++ return now.tv_sec; ++} ++ ++#define TIMESPEC_OVERFLOW(ts) \ ++ ((ts)->tv_sec < TIME_MIN || (ts)->tv_sec > TIME_MAX) ++ ++#endif /* _SPL_TIME_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/timer.h linux-3.2.33-go/include/spl/sys/timer.h +--- linux-3.2.33-go.orig/include/spl/sys/timer.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/timer.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,41 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_TIMER_H ++#define _SPL_TIMER_H ++ ++#include ++#include ++#include ++ ++#define lbolt ((clock_t)jiffies) ++#define lbolt64 ((int64_t)get_jiffies_64()) ++ ++#define ddi_get_lbolt() ((clock_t)jiffies) ++#define ddi_get_lbolt64() ((int64_t)get_jiffies_64()) ++ ++#define delay(ticks) schedule_timeout((long)(ticks)) ++ ++#endif /* _SPL_TIMER_H */ ++ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/t_lock.h linux-3.2.33-go/include/spl/sys/t_lock.h +--- linux-3.2.33-go.orig/include/spl/sys/t_lock.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/t_lock.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,33 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_T_LOCK_H ++#define _SPL_T_LOCK_H ++ ++#include ++#include ++#include ++#include ++ ++#endif /* SPL_T_LOCK_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/tsd.h linux-3.2.33-go/include/spl/sys/tsd.h +--- linux-3.2.33-go.orig/include/spl/sys/tsd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/tsd.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,45 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2010 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_TSD_H ++#define _SPL_TSD_H ++ ++#include ++ ++#define TSD_HASH_TABLE_BITS_DEFAULT 9 ++#define TSD_KEYS_MAX 32768 ++#define DTOR_PID (PID_MAX_LIMIT+1) ++#define PID_KEY (TSD_KEYS_MAX+1) ++ ++typedef void (*dtor_func_t)(void *); ++ ++extern int tsd_set(uint_t, void *); ++extern void *tsd_get(uint_t); ++extern void tsd_create(uint_t *, dtor_func_t); ++extern void tsd_destroy(uint_t *); ++extern void tsd_exit(void); ++ ++int spl_tsd_init(void); ++void spl_tsd_fini(void); ++ ++#endif /* _SPL_TSD_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/types32.h linux-3.2.33-go/include/spl/sys/types32.h +--- linux-3.2.33-go.orig/include/spl/sys/types32.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/types32.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,36 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_TYPES32_H ++#define _SPL_TYPES32_H ++ ++#include ++#include ++ ++typedef uint32_t caddr32_t; ++typedef int32_t daddr32_t; ++typedef int32_t time32_t; ++typedef uint32_t size32_t; ++ ++#endif /* _SPL_TYPES32_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/types.h linux-3.2.33-go/include/spl/sys/types.h +--- linux-3.2.33-go.orig/include/spl/sys/types.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/types.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,91 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. 
++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_TYPES_H ++#define _SPL_TYPES_H ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifndef HAVE_UINTPTR_T ++typedef unsigned long uintptr_t; ++#endif ++ ++#ifndef ULLONG_MAX ++#define ULLONG_MAX (~0ULL) ++#endif ++ ++#ifndef LLONG_MAX ++#define LLONG_MAX ((long long)(~0ULL>>1)) ++#endif ++ ++typedef enum { B_FALSE=0, B_TRUE=1 } boolean_t; ++typedef unsigned long intptr_t; ++typedef unsigned long ulong_t; ++typedef unsigned int uint_t; ++typedef unsigned char uchar_t; ++typedef unsigned long long u_longlong_t; ++typedef unsigned long long u_offset_t; ++typedef unsigned long long rlim64_t; ++typedef long long longlong_t; ++typedef long long offset_t; ++typedef struct task_struct kthread_t; ++typedef struct task_struct proc_t; ++typedef struct vmem { } vmem_t; ++typedef short pri_t; ++typedef struct timespec timestruc_t; /* definition per SVr4 */ ++typedef struct timespec timespec_t; ++typedef longlong_t hrtime_t; ++typedef unsigned short ushort_t; ++typedef u_longlong_t len_t; ++typedef longlong_t diskaddr_t; ++typedef ushort_t o_mode_t; ++typedef uint_t major_t; ++typedef uint_t minor_t; ++typedef ulong_t pfn_t; ++typedef ulong_t pgcnt_t; ++typedef long spgcnt_t; ++typedef short index_t; ++typedef int id_t; ++ ++extern proc_t p0; ++ ++#endif /* _SPL_TYPES_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/u8_textprep.h linux-3.2.33-go/include/spl/sys/u8_textprep.h +--- linux-3.2.33-go.orig/include/spl/sys/u8_textprep.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/u8_textprep.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_U8_TEXTPREP_H ++#define _SPL_U8_TEXTPREP_H ++ ++#endif /* SPL_U8_TEXTPREP_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/uio.h linux-3.2.33-go/include/spl/sys/uio.h +--- linux-3.2.33-go.orig/include/spl/sys/uio.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/uio.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,99 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_UIO_H ++#define _SPL_UIO_H ++ ++#include ++#include ++#include ++ ++typedef struct iovec iovec_t; ++ ++typedef enum uio_rw { ++ UIO_READ = 0, ++ UIO_WRITE = 1, ++} uio_rw_t; ++ ++typedef enum uio_seg { ++ UIO_USERSPACE = 0, ++ UIO_SYSSPACE = 1, ++ UIO_USERISPACE= 2, ++} uio_seg_t; ++ ++typedef struct uio { ++ struct iovec *uio_iov; ++ int uio_iovcnt; ++ offset_t uio_loffset; ++ uio_seg_t uio_segflg; ++ uint16_t uio_fmode; ++ uint16_t uio_extflg; ++ offset_t uio_limit; ++ ssize_t uio_resid; ++} uio_t; ++ ++typedef struct aio_req { ++ uio_t *aio_uio; ++ void *aio_private; ++} aio_req_t; ++ ++typedef enum xuio_type { ++ UIOTYPE_ASYNCIO, ++ UIOTYPE_ZEROCOPY, ++} xuio_type_t; ++ ++ ++#define UIOA_IOV_MAX 16 ++ ++typedef struct uioa_page_s { ++ int uioa_pfncnt; ++ void **uioa_ppp; ++ caddr_t uioa_base; ++ size_t uioa_len; ++} uioa_page_t; ++ ++typedef struct xuio { ++ uio_t xu_uio; ++ enum xuio_type xu_type; ++ union { ++ struct { ++ uint32_t xu_a_state; ++ ssize_t xu_a_mbytes; ++ uioa_page_t *xu_a_lcur; ++ void **xu_a_lppp; ++ void *xu_a_hwst[4]; ++ uioa_page_t xu_a_locked[UIOA_IOV_MAX]; ++ } xu_aio; ++ ++ struct { ++ int xu_zc_rw; ++ void *xu_zc_priv; ++ } xu_zc; ++ } xu_ext; ++} xuio_t; ++ ++#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv ++#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw ++ ++#endif /* SPL_UIO_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/unistd.h linux-3.2.33-go/include/spl/sys/unistd.h +--- linux-3.2.33-go.orig/include/spl/sys/unistd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/unistd.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_UNISTD_H ++#define _SPL_UNISTD_H ++ ++#endif /* SPL_UNISTD_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/utsname.h linux-3.2.33-go/include/spl/sys/utsname.h +--- linux-3.2.33-go.orig/include/spl/sys/utsname.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/utsname.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,34 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_UTSNAME_H ++#define _SPL_UTSNAME_H ++ ++#include ++ ++extern struct new_utsname *__utsname(void); ++ ++#define utsname (*__utsname()) ++ ++#endif /* SPL_UTSNAME_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/va_list.h linux-3.2.33-go/include/spl/sys/va_list.h +--- linux-3.2.33-go.orig/include/spl/sys/va_list.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/va_list.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. 
++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_VA_LIST_H ++#define _SPL_VA_LIST_H ++ ++#endif /* SPL_VA_LIST_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/varargs.h linux-3.2.33-go/include/spl/sys/varargs.h +--- linux-3.2.33-go.orig/include/spl/sys/varargs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/varargs.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,30 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_VARARGS_H ++#define _SPL_VARARGS_H ++ ++#define __va_list va_list ++ ++#endif /* SPL_VARARGS_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/vfs.h linux-3.2.33-go/include/spl/sys/vfs.h +--- linux-3.2.33-go.orig/include/spl/sys/vfs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/vfs.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,51 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_ZFS_H ++#define _SPL_ZFS_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define MAXFIDSZ 64 ++ ++typedef struct spl_fid { ++ union { ++ long fid_pad; ++ struct { ++ ushort_t len; /* length of data in bytes */ ++ char data[MAXFIDSZ];/* data (variable len) */ ++ } _fid; ++ } un; ++} fid_t; ++ ++#define fid_len un._fid.len ++#define fid_data un._fid.data ++ ++#endif /* SPL_ZFS_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/vfs_opreg.h linux-3.2.33-go/include/spl/sys/vfs_opreg.h +--- linux-3.2.33-go.orig/include/spl/sys/vfs_opreg.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/vfs_opreg.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_OPREG_H ++#define _SPL_OPREG_H ++ ++#endif /* SPL_OPREG_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/vmsystm.h linux-3.2.33-go/include/spl/sys/vmsystm.h +--- linux-3.2.33-go.orig/include/spl/sys/vmsystm.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/vmsystm.h 2012-11-16 23:22:32.405192918 +0100 +@@ -0,0 +1,181 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_VMSYSTM_H ++#define _SPL_VMSYSTM_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* These values are loosely coupled with the VM page reclaim. ++ * Linux uses its own heuristics to trigger page reclamation, and ++ * because those interface are difficult to interface with. These ++ * values should only be considered as a rough guide to the system ++ * memory state and not as direct evidence that page reclamation. ++ * is or is not currently in progress. ++ */ ++#define membar_producer() smp_wmb() ++ ++#define physmem num_physpages ++#define freemem nr_free_pages() ++#define availrmem spl_kmem_availrmem() ++ ++extern pgcnt_t minfree; /* Sum of zone->pages_min */ ++extern pgcnt_t desfree; /* Sum of zone->pages_low */ ++extern pgcnt_t lotsfree; /* Sum of zone->pages_high */ ++extern pgcnt_t needfree; /* Always 0 unused in new Solaris */ ++extern pgcnt_t swapfs_minfree; /* Solaris default value */ ++extern pgcnt_t swapfs_reserve; /* Solaris default value */ ++ ++extern vmem_t *heap_arena; /* primary kernel heap arena */ ++extern vmem_t *zio_alloc_arena; /* arena for zio caches */ ++extern vmem_t *zio_arena; /* arena for allocating zio memory */ ++ ++extern pgcnt_t spl_kmem_availrmem(void); ++extern size_t vmem_size(vmem_t *vmp, int typemask); ++ ++/* ++ * The following symbols are available for use within the kernel ++ * itself, and they used to be available in older kernels. But it ++ * looks like they have been removed perhaps due to lack of use. ++ * For our purposes we need them to access the global memory state ++ * of the system, which is even available to user space process ++ * in /proc/meminfo. It's odd to me that there is no kernel API ++ * to get the same information, minimally the proc handler for ++ * the above mentioned /proc/meminfo file would make use of it. 
++ */ ++ ++/* Source linux/fs/proc/mmu.c */ ++#ifndef HAVE_GET_VMALLOC_INFO ++#ifdef CONFIG_MMU ++ ++struct vmalloc_info { ++ unsigned long used; ++ unsigned long largest_chunk; ++}; ++ ++typedef void (*get_vmalloc_info_t)(struct vmalloc_info *); ++extern get_vmalloc_info_t get_vmalloc_info_fn; ++ ++# define VMEM_ALLOC 0x01 ++# define VMEM_FREE 0x02 ++# define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) ++# define get_vmalloc_info(vmi) get_vmalloc_info_fn(vmi) ++#else ++# error "CONFIG_MMU must be defined" ++#endif /* CONFIG_MMU */ ++#endif /* HAVE_GET_VMALLOC_INFO */ ++ ++#ifdef HAVE_PGDAT_HELPERS ++/* Source linux/mm/mmzone.c */ ++# ifndef HAVE_FIRST_ONLINE_PGDAT ++typedef struct pglist_data *(*first_online_pgdat_t)(void); ++extern first_online_pgdat_t first_online_pgdat_fn; ++# define first_online_pgdat() first_online_pgdat_fn() ++# endif /* HAVE_FIRST_ONLINE_PGDAT */ ++ ++# ifndef HAVE_NEXT_ONLINE_PGDAT ++typedef struct pglist_data *(*next_online_pgdat_t)(struct pglist_data *); ++extern next_online_pgdat_t next_online_pgdat_fn; ++# define next_online_pgdat(pgd) next_online_pgdat_fn(pgd) ++# endif /* HAVE_NEXT_ONLINE_PGDAT */ ++ ++# ifndef HAVE_NEXT_ZONE ++typedef struct zone *(*next_zone_t)(struct zone *); ++extern next_zone_t next_zone_fn; ++# define next_zone(zone) next_zone_fn(zone) ++# endif /* HAVE_NEXT_ZONE */ ++ ++#else /* HAVE_PGDAT_HELPERS */ ++ ++# ifndef HAVE_PGDAT_LIST ++extern struct pglist_data *pgdat_list_addr; ++# define pgdat_list pgdat_list_addr ++# endif /* HAVE_PGDAT_LIST */ ++ ++#endif /* HAVE_PGDAT_HELPERS */ ++ ++/* Source linux/mm/vmstat.c */ ++#if defined(NEED_GET_ZONE_COUNTS) && !defined(HAVE_GET_ZONE_COUNTS) ++typedef void (*get_zone_counts_t)(unsigned long *, unsigned long *, ++ unsigned long *); ++extern get_zone_counts_t get_zone_counts_fn; ++# define get_zone_counts(a,i,f) get_zone_counts_fn(a,i,f) ++#endif /* NEED_GET_ZONE_COUNTS && !HAVE_GET_ZONE_COUNTS */ ++ ++typedef enum spl_zone_stat_item { ++ SPL_NR_FREE_PAGES, ++ SPL_NR_INACTIVE, ++ SPL_NR_ACTIVE, ++ SPL_NR_ZONE_STAT_ITEMS ++} spl_zone_stat_item_t; ++ ++extern unsigned long spl_global_page_state(spl_zone_stat_item_t); ++ ++#define xcopyin(from, to, size) copy_from_user(to, from, size) ++#define xcopyout(from, to, size) copy_to_user(to, from, size) ++ ++static __inline__ int ++copyin(const void *from, void *to, size_t len) ++{ ++ /* On error copyin routine returns -1 */ ++ if (xcopyin(from, to, len)) ++ return -1; ++ ++ return 0; ++} ++ ++static __inline__ int ++copyout(const void *from, void *to, size_t len) ++{ ++ /* On error copyout routine returns -1 */ ++ if (xcopyout(from, to, len)) ++ return -1; ++ ++ return 0; ++} ++ ++static __inline__ int ++copyinstr(const void *from, void *to, size_t len, size_t *done) ++{ ++ size_t rc; ++ ++ if (len == 0) ++ return -ENAMETOOLONG; ++ ++ /* XXX: Should return ENAMETOOLONG if 'strlen(from) > len' */ ++ ++ memset(to, 0, len); ++ rc = copyin(from, to, len - 1); ++ if (done != NULL) ++ *done = rc; ++ ++ return 0; ++} ++ ++#endif /* SPL_VMSYSTM_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/vnode.h linux-3.2.33-go/include/spl/sys/vnode.h +--- linux-3.2.33-go.orig/include/spl/sys/vnode.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/vnode.h 2012-11-16 23:22:32.404192930 +0100 +@@ -0,0 +1,213 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. 
++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_VNODE_H ++#define _SPL_VNODE_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Prior to linux-2.6.33 only O_DSYNC semantics were implemented and ++ * they used the O_SYNC flag. As of linux-2.6.33 the this behavior ++ * was properly split in to O_SYNC and O_DSYNC respectively. ++ */ ++#ifndef O_DSYNC ++#define O_DSYNC O_SYNC ++#endif ++ ++#define FREAD 1 ++#define FWRITE 2 ++#define FCREAT O_CREAT ++#define FTRUNC O_TRUNC ++#define FOFFMAX O_LARGEFILE ++#define FSYNC O_SYNC ++#define FDSYNC O_DSYNC ++#define FRSYNC O_SYNC ++#define FEXCL O_EXCL ++#define FDIRECT O_DIRECT ++#define FAPPEND O_APPEND ++ ++#define FNODSYNC 0x10000 /* fsync pseudo flag */ ++#define FNOFOLLOW 0x20000 /* don't follow symlinks */ ++ ++#define F_FREESP 11 /* Free file space */ ++ ++ ++/* ++ * The vnode AT_ flags are mapped to the Linux ATTR_* flags. ++ * This allows them to be used safely with an iattr structure. ++ * The AT_XVATTR flag has been added and mapped to the upper ++ * bit range to avoid conflicting with the standard Linux set. 
++ */ ++#undef AT_UID ++#undef AT_GID ++ ++#define AT_MODE ATTR_MODE ++#define AT_UID ATTR_UID ++#define AT_GID ATTR_GID ++#define AT_SIZE ATTR_SIZE ++#define AT_ATIME ATTR_ATIME ++#define AT_MTIME ATTR_MTIME ++#define AT_CTIME ATTR_CTIME ++ ++#define ATTR_XVATTR (1 << 31) ++#define AT_XVATTR ATTR_XVATTR ++ ++#define ATTR_IATTR_MASK (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_SIZE | \ ++ ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_FILE) ++ ++#define CRCREAT 0x01 ++#define RMFILE 0x02 ++ ++#define B_INVAL 0x01 ++#define B_TRUNC 0x02 ++ ++#define LOOKUP_DIR 0x01 ++#define LOOKUP_XATTR 0x02 ++#define CREATE_XATTR_DIR 0x04 ++#define ATTR_NOACLCHECK 0x20 ++ ++#ifdef HAVE_PATH_IN_NAMEIDATA ++# define nd_dentry path.dentry ++# define nd_mnt path.mnt ++#else ++# define nd_dentry dentry ++# define nd_mnt mnt ++#endif ++ ++typedef enum vtype { ++ VNON = 0, ++ VREG = 1, ++ VDIR = 2, ++ VBLK = 3, ++ VCHR = 4, ++ VLNK = 5, ++ VFIFO = 6, ++ VDOOR = 7, ++ VPROC = 8, ++ VSOCK = 9, ++ VPORT = 10, ++ VBAD = 11 ++} vtype_t; ++ ++typedef struct vattr { ++ enum vtype va_type; /* vnode type */ ++ u_int va_mask; /* attribute bit-mask */ ++ u_short va_mode; /* acc mode */ ++ uid_t va_uid; /* owner uid */ ++ gid_t va_gid; /* owner gid */ ++ long va_fsid; /* fs id */ ++ long va_nodeid; /* node # */ ++ uint32_t va_nlink; /* # links */ ++ uint64_t va_size; /* file size */ ++ struct timespec va_atime; /* last acc */ ++ struct timespec va_mtime; /* last mod */ ++ struct timespec va_ctime; /* last chg */ ++ dev_t va_rdev; /* dev */ ++ uint64_t va_nblocks; /* space used */ ++ uint32_t va_blksize; /* block size */ ++ uint32_t va_seq; /* sequence */ ++ struct dentry *va_dentry; /* dentry to wire */ ++} vattr_t; ++ ++typedef struct vnode { ++ struct file *v_file; ++ kmutex_t v_lock; /* protects vnode fields */ ++ uint_t v_flag; /* vnode flags (see below) */ ++ uint_t v_count; /* reference count */ ++ void *v_data; /* private data for fs */ ++ struct vfs *v_vfsp; /* ptr to containing VFS */ ++ struct stdata *v_stream; /* associated stream */ ++ enum vtype v_type; /* vnode type */ ++ dev_t v_rdev; /* device (VCHR, VBLK) */ ++ gfp_t v_gfp_mask; /* original mapping gfp mask */ ++} vnode_t; ++ ++typedef struct vn_file { ++ int f_fd; /* linux fd for lookup */ ++ struct task_struct *f_task; /* linux task this fd belongs to */ ++ struct file *f_file; /* linux file struct */ ++ atomic_t f_ref; /* ref count */ ++ kmutex_t f_lock; /* struct lock */ ++ loff_t f_offset; /* offset */ ++ vnode_t *f_vnode; /* vnode */ ++ struct list_head f_list; /* list referenced file_t's */ ++} file_t; ++ ++extern vnode_t *vn_alloc(int flag); ++void vn_free(vnode_t *vp); ++extern vtype_t vn_mode_to_vtype(mode_t); ++extern mode_t vn_vtype_to_mode(vtype_t); ++extern int vn_open(const char *path, uio_seg_t seg, int flags, int mode, ++ vnode_t **vpp, int x1, void *x2); ++extern int vn_openat(const char *path, uio_seg_t seg, int flags, int mode, ++ vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd); ++extern int vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len, ++ offset_t off, uio_seg_t seg, int x1, rlim64_t x2, ++ void *x3, ssize_t *residp); ++extern int vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4); ++extern int vn_seek(vnode_t *vp, offset_t o, offset_t *op, void *ct); ++ ++extern int vn_remove(const char *path, uio_seg_t seg, int flags); ++extern int vn_rename(const char *path1, const char *path2, int x1); ++extern int vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4); ++extern int vn_fsync(vnode_t *vp, 
int flags, void *x3, void *x4); ++extern int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag, ++ offset_t offset, void *x6, void *x7); ++extern file_t *vn_getf(int fd); ++extern void vn_releasef(int fd); ++extern int vn_set_pwd(const char *filename); ++ ++int spl_vn_init_kallsyms_lookup(void); ++int spl_vn_init(void); ++void spl_vn_fini(void); ++ ++#define VOP_CLOSE vn_close ++#define VOP_SEEK vn_seek ++#define VOP_GETATTR vn_getattr ++#define VOP_FSYNC vn_fsync ++#define VOP_SPACE vn_space ++#define VOP_PUTPAGE(vp, o, s, f, x1, x2) ((void)0) ++#define vn_is_readonly(vp) 0 ++#define getf vn_getf ++#define releasef vn_releasef ++ ++extern vnode_t *rootdir; ++ ++#endif /* SPL_VNODE_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/zmod.h linux-3.2.33-go/include/spl/sys/zmod.h +--- linux-3.2.33-go.orig/include/spl/sys/zmod.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/zmod.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,69 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * z_compress_level/z_uncompress are nearly identical copies of the ++ * compress2/uncompress functions provided by the official zlib package ++ * available at http://zlib.net/. The only changes made we to slightly ++ * adapt the functions called to match the linux kernel implementation ++ * of zlib. The full zlib license follows: ++ * ++ * zlib.h -- interface of the 'zlib' general purpose compression library ++ * version 1.2.5, April 19th, 2010 ++ * ++ * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
++ * ++ * Jean-loup Gailly ++ * Mark Adler ++\*****************************************************************************/ ++ ++#ifndef _SPL_ZMOD_H ++#define _SPL_ZMOD_H ++ ++#include ++#include ++ ++extern int z_compress_level(void *dest, size_t *destLen, const void *source, ++ size_t sourceLen, int level); ++extern int z_uncompress(void *dest, size_t *destLen, const void *source, ++ size_t sourceLen); ++ ++int spl_zlib_init(void); ++void spl_zlib_fini(void); ++ ++#endif /* SPL_ZMOD_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/sys/zone.h linux-3.2.33-go/include/spl/sys/zone.h +--- linux-3.2.33-go.orig/include/spl/sys/zone.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/sys/zone.h 2012-11-16 23:22:32.406192907 +0100 +@@ -0,0 +1,33 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_ZONE_H ++#define _SPL_ZONE_H ++ ++#include ++ ++#define zone_dataset_visible(x, y) (1) ++#define INGLOBALZONE(z) (1) ++ ++#endif /* SPL_ZONE_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/unistd.h linux-3.2.33-go/include/spl/unistd.h +--- linux-3.2.33-go.orig/include/spl/unistd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/unistd.h 2012-11-16 23:22:32.407192896 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_UNISTD_H ++#define _SPL_UNISTD_H ++ ++#endif /* SPL_UNISTD_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/util/qsort.h linux-3.2.33-go/include/spl/util/qsort.h +--- linux-3.2.33-go.orig/include/spl/util/qsort.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/util/qsort.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,32 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_QSORT_H ++#define _SPL_QSORT_H ++ ++#include ++ ++#define qsort(base, num, size, cmp) sort(base, num, size, cmp, NULL) ++ ++#endif /* SPL_QSORT_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/util/sscanf.h linux-3.2.33-go/include/spl/util/sscanf.h +--- linux-3.2.33-go.orig/include/spl/util/sscanf.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/util/sscanf.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_UTIL_SSCANF_H ++#define _SPL_UTIL_SSCANF_H ++ ++#endif /* SPL_UTIL_SSCAN_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/vm/anon.h linux-3.2.33-go/include/spl/vm/anon.h +--- linux-3.2.33-go.orig/include/spl/vm/anon.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/vm/anon.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_VM_ANON_H ++#define _SPL_VM_ANON_H ++ ++#endif /* SPL_VM_ANON_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/vm/pvn.h linux-3.2.33-go/include/spl/vm/pvn.h +--- linux-3.2.33-go.orig/include/spl/vm/pvn.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/vm/pvn.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,28 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _SPL_VM_PVN_H ++#define _SPL_VM_PVN_H ++ ++#endif /* SPL_VM_PVN_H */ +diff -uNr linux-3.2.33-go.orig/include/spl/vm/seg_kmem.h linux-3.2.33-go/include/spl/vm/seg_kmem.h +--- linux-3.2.33-go.orig/include/spl/vm/seg_kmem.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/spl/vm/seg_kmem.h 2012-11-16 23:22:32.403192942 +0100 +@@ -0,0 +1,30 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPL_SEG_KMEM_H ++#define _SPL_SEG_KMEM_H ++ ++#include ++ ++#endif /* SPL_SEG_KMEM_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/libnvpair.h linux-3.2.33-go/include/zfs/libnvpair.h +--- linux-3.2.33-go.orig/include/zfs/libnvpair.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/libnvpair.h 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,194 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _LIBNVPAIR_H ++#define _LIBNVPAIR_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * All interfaces described in this file are private to Solaris, and ++ * are subject to change at any time and without notice. The public ++ * nvlist/nvpair interfaces, as documented in manpage sections 3NVPAIR, ++ * are all imported from included above. 
++ */ ++ ++extern int nvpair_value_match(nvpair_t *, int, char *, char **); ++extern int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, ++ char **); ++ ++extern void nvlist_print(FILE *, nvlist_t *); ++extern void dump_nvlist(nvlist_t *, int); ++ ++/* ++ * Private nvlist printing interface that allows the caller some control ++ * over output rendering (as opposed to nvlist_print and dump_nvlist). ++ * ++ * Obtain an opaque nvlist_prtctl_t cookie using nvlist_prtctl_alloc ++ * (NULL on failure); on return the cookie is set up for default formatting ++ * and rendering. Quote the cookie in subsequent customisation functions and ++ * then pass the cookie to nvlist_prt to render the nvlist. Finally, ++ * use nvlist_prtctl_free to release the cookie. ++ * ++ * For all nvlist_lookup_xxx and nvlist_lookup_xxx_array functions ++ * we have a corresponding brace of functions that appoint replacement ++ * rendering functions: ++ * ++ * extern void nvlist_prtctl_xxx(nvlist_prtctl_t, ++ * void (*)(nvlist_prtctl_t ctl, void *private, const char *name, ++ * xxxtype value)) ++ * ++ * and ++ * ++ * extern void nvlist_prtctl_xxx_array(nvlist_prtctl_t, ++ * void (*)(nvlist_prtctl_t ctl, void *private, const char *name, ++ * xxxtype value, uint_t count)) ++ * ++ * where xxxtype is the C datatype corresponding to xxx, eg int8_t for "int8" ++ * and char * for "string". The function that is appointed to render the ++ * specified datatype receives as arguments the cookie, the nvlist ++ * member name, the value of that member (or a pointer for array function), ++ * and (for array rendering functions) a count of the number of elements. ++ */ ++ ++typedef struct nvlist_prtctl *nvlist_prtctl_t; /* opaque */ ++ ++enum nvlist_indent_mode { ++ NVLIST_INDENT_ABS, /* Absolute indentation */ ++ NVLIST_INDENT_TABBED /* Indent with tabstops */ ++}; ++ ++extern nvlist_prtctl_t nvlist_prtctl_alloc(void); ++extern void nvlist_prtctl_free(nvlist_prtctl_t); ++extern void nvlist_prt(nvlist_t *, nvlist_prtctl_t); ++ ++/* Output stream */ ++extern void nvlist_prtctl_setdest(nvlist_prtctl_t, FILE *); ++extern FILE *nvlist_prtctl_getdest(nvlist_prtctl_t); ++ ++/* Indentation mode, start indent, indent increment; default tabbed/0/1 */ ++extern void nvlist_prtctl_setindent(nvlist_prtctl_t, enum nvlist_indent_mode, ++ int, int); ++extern void nvlist_prtctl_doindent(nvlist_prtctl_t, int); ++ ++enum nvlist_prtctl_fmt { ++ NVLIST_FMT_MEMBER_NAME, /* name fmt; default "%s = " */ ++ NVLIST_FMT_MEMBER_POSTAMBLE, /* after nvlist member; default "\n" */ ++ NVLIST_FMT_BTWN_ARRAY /* between array members; default " " */ ++}; ++ ++extern void nvlist_prtctl_setfmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, ++ const char *); ++extern void nvlist_prtctl_dofmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, ...); ++ ++/* ++ * Function prototypes for interfaces that appoint a new rendering function ++ * for single-valued nvlist members. ++ * ++ * A replacement function receives arguments as follows: ++ * ++ * nvlist_prtctl_t Print control structure; do not change preferences ++ * for this object from a print callback function. ++ * ++ * void * The function-private cookie argument registered ++ * when the replacement function was appointed. ++ * ++ * nvlist_t * The full nvlist that is being processed. The ++ * rendering function is called to render a single ++ * member (name and value passed as below) but it may ++ * want to reference or incorporate other aspects of ++ * the full nvlist. 
++ * ++ * const char * Member name to render ++ * ++ * valtype Value of the member to render ++ * ++ * The function must return non-zero if it has rendered output for this ++ * member, or 0 if it wants to default to standard rendering for this ++ * one member. ++ */ ++ ++#define NVLIST_PRINTCTL_SVDECL(funcname, valtype) \ ++ extern void funcname(nvlist_prtctl_t, \ ++ int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, valtype), \ ++ void *) ++ ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean, int); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean_value, boolean_t); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_byte, uchar_t); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int8, int8_t); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint8, uint8_t); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int16, int16_t); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint16, uint16_t); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int32, int32_t); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint32, uint32_t); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int64, int64_t); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint64, uint64_t); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_double, double); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_string, char *); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_hrtime, hrtime_t); ++NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_nvlist, nvlist_t *); ++ ++#undef NVLIST_PRINTCTL_SVDECL /* was just for "clarity" above */ ++ ++/* ++ * Function prototypes for interfaces that appoint a new rendering function ++ * for array-valued nvlist members. ++ * ++ * One additional argument is taken: uint_t for the number of array elements ++ * ++ * Return values as above. ++ */ ++#define NVLIST_PRINTCTL_AVDECL(funcname, vtype) \ ++ extern void funcname(nvlist_prtctl_t, \ ++ int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, uint_t), \ ++ void *) ++ ++NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_boolean_array, boolean_t *); ++NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_byte_array, uchar_t *); ++NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int8_array, int8_t *); ++NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint8_array, uint8_t *); ++NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int16_array, int16_t *); ++NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint16_array, uint16_t *); ++NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int32_array, int32_t *); ++NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint32_array, uint32_t *); ++NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int64_array, int64_t *); ++NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint64_array, uint64_t *); ++NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_string_array, char **); ++NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_nvlist_array, nvlist_t **); ++ ++#undef NVLIST_PRINTCTL_AVDECL /* was just for "clarity" above */ ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _LIBNVPAIR_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/libuutil_common.h linux-3.2.33-go/include/zfs/libuutil_common.h +--- linux-3.2.33-go.orig/include/zfs/libuutil_common.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/libuutil_common.h 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,35 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. 
++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _LIBUUTIL_COMMON_H ++#define _LIBUUTIL_COMMON_H ++ ++ ++ ++#include ++#include ++ ++#endif /* _LIBUUTIL_COMMON_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/libuutil.h linux-3.2.33-go/include/zfs/libuutil.h +--- linux-3.2.33-go.orig/include/zfs/libuutil.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/libuutil.h 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,390 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _LIBUUTIL_H ++#define _LIBUUTIL_H ++ ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * Standard flags codes. ++ */ ++#define UU_DEFAULT 0 ++ ++/* ++ * Standard error codes. ++ */ ++#define UU_ERROR_NONE 0 /* no error */ ++#define UU_ERROR_INVALID_ARGUMENT 1 /* invalid argument */ ++#define UU_ERROR_UNKNOWN_FLAG 2 /* passed flag invalid */ ++#define UU_ERROR_NO_MEMORY 3 /* out of memory */ ++#define UU_ERROR_CALLBACK_FAILED 4 /* callback-initiated error */ ++#define UU_ERROR_NOT_SUPPORTED 5 /* operation not supported */ ++#define UU_ERROR_EMPTY 6 /* no value provided */ ++#define UU_ERROR_UNDERFLOW 7 /* value is too small */ ++#define UU_ERROR_OVERFLOW 8 /* value is too value */ ++#define UU_ERROR_INVALID_CHAR 9 /* value contains unexpected char */ ++#define UU_ERROR_INVALID_DIGIT 10 /* value contains digit not in base */ ++ ++#define UU_ERROR_SYSTEM 99 /* underlying system error */ ++#define UU_ERROR_UNKNOWN 100 /* error status not known */ ++ ++/* ++ * Standard program exit codes. ++ */ ++#define UU_EXIT_OK (*(uu_exit_ok())) ++#define UU_EXIT_FATAL (*(uu_exit_fatal())) ++#define UU_EXIT_USAGE (*(uu_exit_usage())) ++ ++/* ++ * Exit status profiles. ++ */ ++#define UU_PROFILE_DEFAULT 0 ++#define UU_PROFILE_LAUNCHER 1 ++ ++/* ++ * Error reporting functions. ++ */ ++uint32_t uu_error(void); ++const char *uu_strerror(uint32_t); ++ ++/* ++ * Program notification functions. 
++ */ ++extern void uu_alt_exit(int); ++extern const char *uu_setpname(char *); ++extern const char *uu_getpname(void); ++/*PRINTFLIKE1*/ ++extern void uu_warn(const char *, ...); ++extern void uu_vwarn(const char *, va_list); ++/*PRINTFLIKE1*/ ++extern void uu_die(const char *, ...) __NORETURN; ++extern void uu_vdie(const char *, va_list) __NORETURN; ++/*PRINTFLIKE2*/ ++extern void uu_xdie(int, const char *, ...) __NORETURN; ++extern void uu_vxdie(int, const char *, va_list) __NORETURN; ++ ++/* ++ * Exit status functions (not to be used directly) ++ */ ++extern int *uu_exit_ok(void); ++extern int *uu_exit_fatal(void); ++extern int *uu_exit_usage(void); ++ ++/* ++ * string->number conversions ++ */ ++extern int uu_strtoint(const char *, void *, size_t, int, int64_t, int64_t); ++extern int uu_strtouint(const char *, void *, size_t, int, uint64_t, uint64_t); ++ ++/* ++ * Debug print facility functions. ++ */ ++typedef struct uu_dprintf uu_dprintf_t; ++ ++typedef enum { ++ UU_DPRINTF_SILENT, ++ UU_DPRINTF_FATAL, ++ UU_DPRINTF_WARNING, ++ UU_DPRINTF_NOTICE, ++ UU_DPRINTF_INFO, ++ UU_DPRINTF_DEBUG ++} uu_dprintf_severity_t; ++ ++extern uu_dprintf_t *uu_dprintf_create(const char *, uu_dprintf_severity_t, ++ uint_t); ++/*PRINTFLIKE3*/ ++extern void uu_dprintf(uu_dprintf_t *, uu_dprintf_severity_t, ++ const char *, ...); ++extern void uu_dprintf_destroy(uu_dprintf_t *); ++extern const char *uu_dprintf_getname(uu_dprintf_t *); ++ ++/* ++ * Identifier test flags and function. ++ */ ++#define UU_NAME_DOMAIN 0x1 /* allow SUNW, or com.sun, prefix */ ++#define UU_NAME_PATH 0x2 /* allow '/'-delimited paths */ ++ ++int uu_check_name(const char *, uint_t); ++ ++/* ++ * File creation functions. ++ */ ++extern int uu_open_tmp(const char *dir, uint_t uflags); ++ ++/* ++ * Convenience functions. ++ */ ++#define UU_NELEM(a) (sizeof (a) / sizeof ((a)[0])) ++ ++/*PRINTFLIKE1*/ ++extern char *uu_msprintf(const char *format, ...); ++extern void *uu_zalloc(size_t); ++extern char *uu_strdup(const char *); ++extern void uu_free(void *); ++ ++extern boolean_t uu_strcaseeq(const char *a, const char *b); ++extern boolean_t uu_streq(const char *a, const char *b); ++extern char *uu_strndup(const char *s, size_t n); ++extern boolean_t uu_strbw(const char *a, const char *b); ++extern void *uu_memdup(const void *buf, size_t sz); ++extern void uu_dump(FILE *out, const char *prefix, const void *buf, size_t len); ++ ++/* ++ * Comparison function type definition. ++ * Developers should be careful in their use of the _private argument. If you ++ * break interface guarantees, you get undefined behavior. ++ */ ++typedef int uu_compare_fn_t(const void *__left, const void *__right, ++ void *__private); ++ ++/* ++ * Walk variant flags. ++ * A data structure need not provide support for all variants and ++ * combinations. Refer to the appropriate documentation. ++ */ ++#define UU_WALK_ROBUST 0x00000001 /* walk can survive removes */ ++#define UU_WALK_REVERSE 0x00000002 /* reverse walk order */ ++ ++#define UU_WALK_PREORDER 0x00000010 /* walk tree in pre-order */ ++#define UU_WALK_POSTORDER 0x00000020 /* walk tree in post-order */ ++ ++/* ++ * Walk callback function return codes. ++ */ ++#define UU_WALK_ERROR -1 ++#define UU_WALK_NEXT 0 ++#define UU_WALK_DONE 1 ++ ++/* ++ * Walk callback function type definition. 
++ */ ++typedef int uu_walk_fn_t(void *_elem, void *_private); ++ ++/* ++ * lists: opaque structures ++ */ ++typedef struct uu_list_pool uu_list_pool_t; ++typedef struct uu_list uu_list_t; ++ ++typedef struct uu_list_node { ++ uintptr_t uln_opaque[2]; ++} uu_list_node_t; ++ ++typedef struct uu_list_walk uu_list_walk_t; ++ ++typedef uintptr_t uu_list_index_t; ++ ++/* ++ * lists: interface ++ * ++ * basic usage: ++ * typedef struct foo { ++ * ... ++ * uu_list_node_t foo_node; ++ * ... ++ * } foo_t; ++ * ++ * static int ++ * foo_compare(void *l_arg, void *r_arg, void *private) ++ * { ++ * foo_t *l = l_arg; ++ * foo_t *r = r_arg; ++ * ++ * if (... l greater than r ...) ++ * return (1); ++ * if (... l less than r ...) ++ * return (-1); ++ * return (0); ++ * } ++ * ++ * ... ++ * // at initialization time ++ * foo_pool = uu_list_pool_create("foo_pool", ++ * sizeof (foo_t), offsetof(foo_t, foo_node), foo_compare, ++ * debugging? 0 : UU_AVL_POOL_DEBUG); ++ * ... ++ */ ++uu_list_pool_t *uu_list_pool_create(const char *, size_t, size_t, ++ uu_compare_fn_t *, uint32_t); ++#define UU_LIST_POOL_DEBUG 0x00000001 ++ ++void uu_list_pool_destroy(uu_list_pool_t *); ++ ++/* ++ * usage: ++ * ++ * foo_t *a; ++ * a = malloc(sizeof(*a)); ++ * uu_list_node_init(a, &a->foo_list, pool); ++ * ... ++ * uu_list_node_fini(a, &a->foo_list, pool); ++ * free(a); ++ */ ++void uu_list_node_init(void *, uu_list_node_t *, uu_list_pool_t *); ++void uu_list_node_fini(void *, uu_list_node_t *, uu_list_pool_t *); ++ ++uu_list_t *uu_list_create(uu_list_pool_t *, void *_parent, uint32_t); ++#define UU_LIST_DEBUG 0x00000001 ++#define UU_LIST_SORTED 0x00000002 /* list is sorted */ ++ ++void uu_list_destroy(uu_list_t *); /* list must be empty */ ++ ++size_t uu_list_numnodes(uu_list_t *); ++ ++void *uu_list_first(uu_list_t *); ++void *uu_list_last(uu_list_t *); ++ ++void *uu_list_next(uu_list_t *, void *); ++void *uu_list_prev(uu_list_t *, void *); ++ ++int uu_list_walk(uu_list_t *, uu_walk_fn_t *, void *, uint32_t); ++ ++uu_list_walk_t *uu_list_walk_start(uu_list_t *, uint32_t); ++void *uu_list_walk_next(uu_list_walk_t *); ++void uu_list_walk_end(uu_list_walk_t *); ++ ++void *uu_list_find(uu_list_t *, void *, void *, uu_list_index_t *); ++void uu_list_insert(uu_list_t *, void *, uu_list_index_t); ++ ++void *uu_list_nearest_next(uu_list_t *, uu_list_index_t); ++void *uu_list_nearest_prev(uu_list_t *, uu_list_index_t); ++ ++void *uu_list_teardown(uu_list_t *, void **); ++ ++void uu_list_remove(uu_list_t *, void *); ++ ++/* ++ * lists: interfaces for non-sorted lists only ++ */ ++int uu_list_insert_before(uu_list_t *, void *_target, void *_elem); ++int uu_list_insert_after(uu_list_t *, void *_target, void *_elem); ++ ++/* ++ * avl trees: opaque structures ++ */ ++typedef struct uu_avl_pool uu_avl_pool_t; ++typedef struct uu_avl uu_avl_t; ++ ++typedef struct uu_avl_node { ++#ifdef _LP64 ++ uintptr_t uan_opaque[3]; ++#else ++ uintptr_t uan_opaque[4]; ++#endif ++} uu_avl_node_t; ++ ++typedef struct uu_avl_walk uu_avl_walk_t; ++ ++typedef uintptr_t uu_avl_index_t; ++ ++/* ++ * avl trees: interface ++ * ++ * basic usage: ++ * typedef struct foo { ++ * ... ++ * uu_avl_node_t foo_node; ++ * ... ++ * } foo_t; ++ * ++ * static int ++ * foo_compare(void *l_arg, void *r_arg, void *private) ++ * { ++ * foo_t *l = l_arg; ++ * foo_t *r = r_arg; ++ * ++ * if (... l greater than r ...) ++ * return (1); ++ * if (... l less than r ...) ++ * return (-1); ++ * return (0); ++ * } ++ * ++ * ... 
++ * // at initialization time ++ * foo_pool = uu_avl_pool_create("foo_pool", ++ * sizeof (foo_t), offsetof(foo_t, foo_node), foo_compare, ++ * debugging? 0 : UU_AVL_POOL_DEBUG); ++ * ... ++ */ ++uu_avl_pool_t *uu_avl_pool_create(const char *, size_t, size_t, ++ uu_compare_fn_t *, uint32_t); ++#define UU_AVL_POOL_DEBUG 0x00000001 ++ ++void uu_avl_pool_destroy(uu_avl_pool_t *); ++ ++/* ++ * usage: ++ * ++ * foo_t *a; ++ * a = malloc(sizeof(*a)); ++ * uu_avl_node_init(a, &a->foo_avl, pool); ++ * ... ++ * uu_avl_node_fini(a, &a->foo_avl, pool); ++ * free(a); ++ */ ++void uu_avl_node_init(void *, uu_avl_node_t *, uu_avl_pool_t *); ++void uu_avl_node_fini(void *, uu_avl_node_t *, uu_avl_pool_t *); ++ ++uu_avl_t *uu_avl_create(uu_avl_pool_t *, void *_parent, uint32_t); ++#define UU_AVL_DEBUG 0x00000001 ++ ++void uu_avl_destroy(uu_avl_t *); /* list must be empty */ ++ ++size_t uu_avl_numnodes(uu_avl_t *); ++ ++void *uu_avl_first(uu_avl_t *); ++void *uu_avl_last(uu_avl_t *); ++ ++void *uu_avl_next(uu_avl_t *, void *); ++void *uu_avl_prev(uu_avl_t *, void *); ++ ++int uu_avl_walk(uu_avl_t *, uu_walk_fn_t *, void *, uint32_t); ++ ++uu_avl_walk_t *uu_avl_walk_start(uu_avl_t *, uint32_t); ++void *uu_avl_walk_next(uu_avl_walk_t *); ++void uu_avl_walk_end(uu_avl_walk_t *); ++ ++void *uu_avl_find(uu_avl_t *, void *, void *, uu_avl_index_t *); ++void uu_avl_insert(uu_avl_t *, void *, uu_avl_index_t); ++ ++void *uu_avl_nearest_next(uu_avl_t *, uu_avl_index_t); ++void *uu_avl_nearest_prev(uu_avl_t *, uu_avl_index_t); ++ ++void *uu_avl_teardown(uu_avl_t *, void **); ++ ++void uu_avl_remove(uu_avl_t *, void *); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _LIBUUTIL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/libuutil_impl.h linux-3.2.33-go/include/zfs/libuutil_impl.h +--- linux-3.2.33-go.orig/include/zfs/libuutil_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/libuutil_impl.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,181 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License, Version 1.0 only ++ * (the "License"). You may not use this file except in compliance ++ * with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2005 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _LIBUUTIL_IMPL_H ++#define _LIBUUTIL_IMPL_H ++ ++ ++ ++#include ++#include ++ ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++void uu_set_error(uint_t); ++ ++ ++/*PRINTFLIKE1*/ ++void uu_panic(const char *format, ...); ++ ++ ++struct uu_dprintf { ++ char *uud_name; ++ uu_dprintf_severity_t uud_severity; ++ uint_t uud_flags; ++}; ++ ++/* ++ * For debugging purposes, libuutil keeps around linked lists of all uu_lists ++ * and uu_avls, along with pointers to their parents. 
These can cause false ++ * negatives when looking for memory leaks, so we encode the pointers by ++ * storing them with swapped endianness; this is not perfect, but it's about ++ * the best we can do without wasting a lot of space. ++ */ ++#ifdef _LP64 ++#define UU_PTR_ENCODE(ptr) BSWAP_64((uintptr_t)(void *)(ptr)) ++#else ++#define UU_PTR_ENCODE(ptr) BSWAP_32((uintptr_t)(void *)(ptr)) ++#endif ++ ++#define UU_PTR_DECODE(ptr) ((void *)UU_PTR_ENCODE(ptr)) ++ ++/* ++ * uu_list structures ++ */ ++typedef struct uu_list_node_impl { ++ struct uu_list_node_impl *uln_next; ++ struct uu_list_node_impl *uln_prev; ++} uu_list_node_impl_t; ++ ++struct uu_list_walk { ++ uu_list_walk_t *ulw_next; ++ uu_list_walk_t *ulw_prev; ++ ++ uu_list_t *ulw_list; ++ int8_t ulw_dir; ++ uint8_t ulw_robust; ++ uu_list_node_impl_t *ulw_next_result; ++}; ++ ++struct uu_list { ++ uintptr_t ul_next_enc; ++ uintptr_t ul_prev_enc; ++ ++ uu_list_pool_t *ul_pool; ++ uintptr_t ul_parent_enc; /* encoded parent pointer */ ++ size_t ul_offset; ++ size_t ul_numnodes; ++ uint8_t ul_debug; ++ uint8_t ul_sorted; ++ uint8_t ul_index; /* mark for uu_list_index_ts */ ++ ++ uu_list_node_impl_t ul_null_node; ++ uu_list_walk_t ul_null_walk; /* for robust walkers */ ++}; ++ ++#define UU_LIST_PTR(ptr) ((uu_list_t *)UU_PTR_DECODE(ptr)) ++ ++#define UU_LIST_POOL_MAXNAME 64 ++ ++struct uu_list_pool { ++ uu_list_pool_t *ulp_next; ++ uu_list_pool_t *ulp_prev; ++ ++ char ulp_name[UU_LIST_POOL_MAXNAME]; ++ size_t ulp_nodeoffset; ++ size_t ulp_objsize; ++ uu_compare_fn_t *ulp_cmp; ++ uint8_t ulp_debug; ++ uint8_t ulp_last_index; ++ pthread_mutex_t ulp_lock; /* protects null_list */ ++ uu_list_t ulp_null_list; ++}; ++ ++/* ++ * uu_avl structures ++ */ ++typedef struct avl_node uu_avl_node_impl_t; ++ ++struct uu_avl_walk { ++ uu_avl_walk_t *uaw_next; ++ uu_avl_walk_t *uaw_prev; ++ ++ uu_avl_t *uaw_avl; ++ void *uaw_next_result; ++ int8_t uaw_dir; ++ uint8_t uaw_robust; ++}; ++ ++struct uu_avl { ++ uintptr_t ua_next_enc; ++ uintptr_t ua_prev_enc; ++ ++ uu_avl_pool_t *ua_pool; ++ uintptr_t ua_parent_enc; ++ uint8_t ua_debug; ++ uint8_t ua_index; /* mark for uu_avl_index_ts */ ++ ++ struct avl_tree ua_tree; ++ uu_avl_walk_t ua_null_walk; ++}; ++ ++#define UU_AVL_PTR(x) ((uu_avl_t *)UU_PTR_DECODE(x)) ++ ++#define UU_AVL_POOL_MAXNAME 64 ++ ++struct uu_avl_pool { ++ uu_avl_pool_t *uap_next; ++ uu_avl_pool_t *uap_prev; ++ ++ char uap_name[UU_AVL_POOL_MAXNAME]; ++ size_t uap_nodeoffset; ++ size_t uap_objsize; ++ uu_compare_fn_t *uap_cmp; ++ uint8_t uap_debug; ++ uint8_t uap_last_index; ++ pthread_mutex_t uap_lock; /* protects null_avl */ ++ uu_avl_t uap_null_avl; ++}; ++ ++/* ++ * atfork() handlers ++ */ ++void uu_avl_lockup(void); ++void uu_avl_release(void); ++ ++void uu_list_lockup(void); ++void uu_list_release(void); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _LIBUUTIL_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/libzfs.h linux-3.2.33-go/include/zfs/libzfs.h +--- linux-3.2.33-go.orig/include/zfs/libzfs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/libzfs.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,745 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. 
++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ * Copyright (c) 2012 by Delphix. All rights reserved. ++ * Copyright (c) 2012, Joyent, Inc. All rights reserved. ++ */ ++ ++#ifndef _LIBZFS_H ++#define _LIBZFS_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * Miscellaneous ZFS constants ++ */ ++#define ZFS_MAXNAMELEN MAXNAMELEN ++#define ZPOOL_MAXNAMELEN MAXNAMELEN ++#define ZFS_MAXPROPLEN MAXPATHLEN ++#define ZPOOL_MAXPROPLEN MAXPATHLEN ++ ++/* ++ * Default device paths ++ */ ++#define DISK_ROOT "/dev" ++#define UDISK_ROOT "/dev/disk" ++ ++#define DEFAULT_IMPORT_PATH_SIZE 8 ++extern char *zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE]; ++ ++/* ++ * libzfs errors ++ */ ++enum { ++ EZFS_NOMEM = 2000, /* out of memory */ ++ EZFS_BADPROP, /* invalid property value */ ++ EZFS_PROPREADONLY, /* cannot set readonly property */ ++ EZFS_PROPTYPE, /* property does not apply to dataset type */ ++ EZFS_PROPNONINHERIT, /* property is not inheritable */ ++ EZFS_PROPSPACE, /* bad quota or reservation */ ++ EZFS_BADTYPE, /* dataset is not of appropriate type */ ++ EZFS_BUSY, /* pool or dataset is busy */ ++ EZFS_EXISTS, /* pool or dataset already exists */ ++ EZFS_NOENT, /* no such pool or dataset */ ++ EZFS_BADSTREAM, /* bad backup stream */ ++ EZFS_DSREADONLY, /* dataset is readonly */ ++ EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */ ++ EZFS_INVALIDNAME, /* invalid dataset name */ ++ EZFS_BADRESTORE, /* unable to restore to destination */ ++ EZFS_BADBACKUP, /* backup failed */ ++ EZFS_BADTARGET, /* bad attach/detach/replace target */ ++ EZFS_NODEVICE, /* no such device in pool */ ++ EZFS_BADDEV, /* invalid device to add */ ++ EZFS_NOREPLICAS, /* no valid replicas */ ++ EZFS_RESILVERING, /* currently resilvering */ ++ EZFS_BADVERSION, /* unsupported version */ ++ EZFS_POOLUNAVAIL, /* pool is currently unavailable */ ++ EZFS_DEVOVERFLOW, /* too many devices in one vdev */ ++ EZFS_BADPATH, /* must be an absolute path */ ++ EZFS_CROSSTARGET, /* rename or clone across pool or dataset */ ++ EZFS_ZONED, /* used improperly in local zone */ ++ EZFS_MOUNTFAILED, /* failed to mount dataset */ ++ EZFS_UMOUNTFAILED, /* failed to unmount dataset */ ++ EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */ ++ EZFS_SHARENFSFAILED, /* share(1M) failed */ ++ EZFS_PERM, /* permission denied */ ++ EZFS_NOSPC, /* out of space */ ++ EZFS_FAULT, /* bad address */ ++ EZFS_IO, /* I/O error */ ++ EZFS_INTR, /* signal received */ ++ EZFS_ISSPARE, /* device is a hot spare */ ++ EZFS_INVALCONFIG, /* invalid vdev configuration */ ++ EZFS_RECURSIVE, /* recursive dependency */ ++ EZFS_NOHISTORY, /* no history object */ ++ EZFS_POOLPROPS, /* couldn't retrieve pool props */ ++ EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */ ++ EZFS_POOL_INVALARG, /* invalid argument for this pool operation */ ++ 
EZFS_NAMETOOLONG, /* dataset name is too long */ ++ EZFS_OPENFAILED, /* open of device failed */ ++ EZFS_NOCAP, /* couldn't get capacity */ ++ EZFS_LABELFAILED, /* write of label failed */ ++ EZFS_BADWHO, /* invalid permission who */ ++ EZFS_BADPERM, /* invalid permission */ ++ EZFS_BADPERMSET, /* invalid permission set name */ ++ EZFS_NODELEGATION, /* delegated administration is disabled */ ++ EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */ ++ EZFS_SHARESMBFAILED, /* failed to share over smb */ ++ EZFS_BADCACHE, /* bad cache file */ ++ EZFS_ISL2CACHE, /* device is for the level 2 ARC */ ++ EZFS_VDEVNOTSUP, /* unsupported vdev type */ ++ EZFS_NOTSUP, /* ops not supported on this dataset */ ++ EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */ ++ EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */ ++ EZFS_REFTAG_RELE, /* snapshot release: tag not found */ ++ EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */ ++ EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */ ++ EZFS_PIPEFAILED, /* pipe create failed */ ++ EZFS_THREADCREATEFAILED, /* thread create failed */ ++ EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ ++ EZFS_SCRUBBING, /* currently scrubbing */ ++ EZFS_NO_SCRUB, /* no active scrub */ ++ EZFS_DIFF, /* general failure of zfs diff */ ++ EZFS_DIFFDATA, /* bad zfs diff data */ ++ EZFS_POOLREADONLY, /* pool is in read-only mode */ ++ EZFS_UNKNOWN ++}; ++ ++/* ++ * The following data structures are all part ++ * of the zfs_allow_t data structure which is ++ * used for printing 'allow' permissions. ++ * It is a linked list of zfs_allow_t's which ++ * then contain avl tree's for user/group/sets/... ++ * and each one of the entries in those trees have ++ * avl tree's for the permissions they belong to and ++ * whether they are local,descendent or local+descendent ++ * permissions. The AVL trees are used primarily for ++ * sorting purposes, but also so that we can quickly find ++ * a given user and or permission. 
++ */ ++typedef struct zfs_perm_node { ++ avl_node_t z_node; ++ char z_pname[MAXPATHLEN]; ++} zfs_perm_node_t; ++ ++typedef struct zfs_allow_node { ++ avl_node_t z_node; ++ char z_key[MAXPATHLEN]; /* name, such as joe */ ++ avl_tree_t z_localdescend; /* local+descendent perms */ ++ avl_tree_t z_local; /* local permissions */ ++ avl_tree_t z_descend; /* descendent permissions */ ++} zfs_allow_node_t; ++ ++typedef struct zfs_allow { ++ struct zfs_allow *z_next; ++ char z_setpoint[MAXPATHLEN]; ++ avl_tree_t z_sets; ++ avl_tree_t z_crperms; ++ avl_tree_t z_user; ++ avl_tree_t z_group; ++ avl_tree_t z_everyone; ++} zfs_allow_t; ++ ++/* ++ * Basic handle types ++ */ ++typedef struct zfs_handle zfs_handle_t; ++typedef struct zpool_handle zpool_handle_t; ++typedef struct libzfs_handle libzfs_handle_t; ++ ++/* ++ * Library initialization ++ */ ++extern libzfs_handle_t *libzfs_init(void); ++extern void libzfs_fini(libzfs_handle_t *); ++ ++extern libzfs_handle_t *zpool_get_handle(zpool_handle_t *); ++extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *); ++ ++extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); ++ ++extern int libzfs_errno(libzfs_handle_t *); ++extern const char *libzfs_error_action(libzfs_handle_t *); ++extern const char *libzfs_error_description(libzfs_handle_t *); ++extern void libzfs_mnttab_init(libzfs_handle_t *); ++extern void libzfs_mnttab_fini(libzfs_handle_t *); ++extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); ++extern int libzfs_mnttab_find(libzfs_handle_t *, const char *, ++ struct mnttab *); ++extern void libzfs_mnttab_add(libzfs_handle_t *, const char *, ++ const char *, const char *); ++extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *); ++ ++/* ++ * Basic handle functions ++ */ ++extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *); ++extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *); ++extern void zpool_close(zpool_handle_t *); ++extern const char *zpool_get_name(zpool_handle_t *); ++extern int zpool_get_state(zpool_handle_t *); ++extern char *zpool_state_to_name(vdev_state_t, vdev_aux_t); ++extern void zpool_free_handles(libzfs_handle_t *); ++ ++/* ++ * Iterate over all active pools in the system. 
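As a minimal consumer-side sketch of the handle and error interfaces above (illustrative only, not from the patch; the pool name "tank" is a placeholder, error handling is abbreviated, and <stdio.h> plus this header are assumed):

static int
open_pool_example(void)
{
	libzfs_handle_t *hdl;
	zpool_handle_t *zhp;

	if ((hdl = libzfs_init()) == NULL)
		return (1);
	libzfs_print_on_error(hdl, B_TRUE);

	/* zpool_open_canfail() can return a handle even for faulted pools. */
	if ((zhp = zpool_open_canfail(hdl, "tank")) == NULL) {
		(void) fprintf(stderr, "%s: %s\n",
		    libzfs_error_action(hdl), libzfs_error_description(hdl));
		libzfs_fini(hdl);
		return (1);
	}

	(void) printf("opened pool %s\n", zpool_get_name(zhp));
	zpool_close(zhp);
	libzfs_fini(hdl);
	return (0);
}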
++ */ ++typedef int (*zpool_iter_f)(zpool_handle_t *, void *); ++extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); ++ ++/* ++ * Functions to create and destroy pools ++ */ ++extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, ++ nvlist_t *, nvlist_t *); ++extern int zpool_destroy(zpool_handle_t *); ++extern int zpool_add(zpool_handle_t *, nvlist_t *); ++ ++typedef struct splitflags { ++ /* do not split, but return the config that would be split off */ ++ int dryrun : 1; ++ ++ /* after splitting, import the pool */ ++ int import : 1; ++} splitflags_t; ++ ++/* ++ * Functions to manipulate pool and vdev state ++ */ ++extern int zpool_scan(zpool_handle_t *, pool_scan_func_t); ++extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); ++extern int zpool_reguid(zpool_handle_t *); ++extern int zpool_reopen(zpool_handle_t *); ++ ++extern int zpool_vdev_online(zpool_handle_t *, const char *, int, ++ vdev_state_t *); ++extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); ++extern int zpool_vdev_attach(zpool_handle_t *, const char *, ++ const char *, nvlist_t *, int); ++extern int zpool_vdev_detach(zpool_handle_t *, const char *); ++extern int zpool_vdev_remove(zpool_handle_t *, const char *); ++extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, ++ splitflags_t); ++ ++extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); ++extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); ++extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); ++ ++extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, ++ boolean_t *, boolean_t *); ++extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, ++ boolean_t *, boolean_t *, boolean_t *); ++extern int zpool_label_disk_wait(char *, int); ++extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *); ++ ++/* ++ * Functions to manage pool properties ++ */ ++extern int zpool_set_prop(zpool_handle_t *, const char *, const char *); ++extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, ++ size_t proplen, zprop_source_t *); ++extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, ++ zprop_source_t *); ++ ++extern const char *zpool_prop_to_name(zpool_prop_t); ++extern const char *zpool_prop_values(zpool_prop_t); ++ ++/* ++ * Pool health statistics. ++ */ ++typedef enum { ++ /* ++ * The following correspond to faults as defined in the (fault.fs.zfs.*) ++ * event namespace. Each is associated with a corresponding message ID. 
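A sketch (not from the patch) of enumerating active pools with zpool_iter(): the callback prints each pool's name and its "health" property. It assumes the callback owns, and therefore closes, the handle it is given, and that <stdio.h> is available.

static int
print_pool_cb(zpool_handle_t *zhp, void *data)
{
	char health[ZFS_MAXPROPLEN];

	(void) data;
	if (zpool_get_prop(zhp, ZPOOL_PROP_HEALTH, health,
	    sizeof (health), NULL) != 0)
		(void) snprintf(health, sizeof (health), "-");

	(void) printf("%-20s %s\n", zpool_get_name(zhp), health);
	zpool_close(zhp);	/* the callback owns the handle in this sketch */
	return (0);		/* non-zero would stop the iteration */
}

/* Later, with an initialized libzfs_handle_t *hdl: */
/*	(void) zpool_iter(hdl, print_pool_cb, NULL); */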
++ */ ++ ZPOOL_STATUS_CORRUPT_CACHE, /* corrupt /kernel/drv/zpool.cache */ ++ ZPOOL_STATUS_MISSING_DEV_R, /* missing device with replicas */ ++ ZPOOL_STATUS_MISSING_DEV_NR, /* missing device with no replicas */ ++ ZPOOL_STATUS_CORRUPT_LABEL_R, /* bad device label with replicas */ ++ ZPOOL_STATUS_CORRUPT_LABEL_NR, /* bad device label with no replicas */ ++ ZPOOL_STATUS_BAD_GUID_SUM, /* sum of device guids didn't match */ ++ ZPOOL_STATUS_CORRUPT_POOL, /* pool metadata is corrupted */ ++ ZPOOL_STATUS_CORRUPT_DATA, /* data errors in user (meta)data */ ++ ZPOOL_STATUS_FAILING_DEV, /* device experiencing errors */ ++ ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */ ++ ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ ++ ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */ ++ ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */ ++ ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ ++ ++ /* ++ * These faults have no corresponding message ID. At the time we are ++ * checking the status, the original reason for the FMA fault (I/O or ++ * checksum errors) has been lost. ++ */ ++ ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ ++ ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ ++ ++ /* ++ * The following are not faults per se, but still an error possibly ++ * requiring administrative attention. There is no corresponding ++ * message ID. ++ */ ++ ZPOOL_STATUS_VERSION_OLDER, /* older on-disk version */ ++ ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ++ ZPOOL_STATUS_OFFLINE_DEV, /* device online */ ++ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ ++ ++ /* ++ * Finally, the following indicates a healthy pool. ++ */ ++ ZPOOL_STATUS_OK ++} zpool_status_t; ++ ++extern zpool_status_t zpool_get_status(zpool_handle_t *, char **); ++extern zpool_status_t zpool_import_status(nvlist_t *, char **); ++extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh); ++ ++/* ++ * Statistics and configuration functions. ++ */ ++extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); ++extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *); ++extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); ++ ++/* ++ * Import and export functions ++ */ ++extern int zpool_export(zpool_handle_t *, boolean_t); ++extern int zpool_export_force(zpool_handle_t *); ++extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, ++ char *altroot); ++extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, ++ nvlist_t *, int); ++ ++/* ++ * Search for pools to import ++ */ ++ ++typedef struct importargs { ++ char **path; /* a list of paths to search */ ++ int paths; /* number of paths to search */ ++ char *poolname; /* name of a pool to find */ ++ uint64_t guid; /* guid of a pool to find */ ++ char *cachefile; /* cachefile to use for import */ ++ int can_be_active : 1; /* can the pool be active? */ ++ int unique : 1; /* does 'poolname' already exist? 
*/ ++ int exists : 1; /* set on return if pool already exists */ ++} importargs_t; ++ ++extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *); ++ ++/* legacy pool search routines */ ++extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **); ++extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, ++ char *, uint64_t); ++ ++/* ++ * Miscellaneous pool functions ++ */ ++struct zfs_cmd; ++ ++extern const char *zfs_history_event_names[LOG_END]; ++ ++extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, ++ boolean_t verbose); ++extern int zpool_upgrade(zpool_handle_t *, uint64_t); ++extern int zpool_get_history(zpool_handle_t *, nvlist_t **); ++extern int zpool_history_unpack(char *, uint64_t, uint64_t *, ++ nvlist_t ***, uint_t *); ++extern void zpool_set_history_str(const char *subcommand, int argc, ++ char **argv, char *history_str); ++extern int zpool_stage_history(libzfs_handle_t *, const char *); ++extern int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, int, int); ++extern int zpool_events_clear(libzfs_handle_t *, int *); ++extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, ++ size_t len); ++extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); ++extern int zpool_get_physpath(zpool_handle_t *, char *, size_t); ++extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, ++ nvlist_t *); ++ ++/* ++ * Basic handle manipulations. These functions do not create or destroy the ++ * underlying datasets, only the references to them. ++ */ ++extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); ++extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *); ++extern void zfs_close(zfs_handle_t *); ++extern zfs_type_t zfs_get_type(const zfs_handle_t *); ++extern const char *zfs_get_name(const zfs_handle_t *); ++extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *); ++ ++/* ++ * Property management functions. Some functions are shared with the kernel, ++ * and are found in sys/fs/zfs.h. 
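A sketch (not from the patch) of the basic dataset-handle lifecycle declared above; "tank/home" is a placeholder name and hdl is a libzfs_handle_t obtained from libzfs_init().

static void
dataset_handle_example(libzfs_handle_t *hdl)
{
	zfs_handle_t *zhp;

	/* ZFS_TYPE_FILESYSTEM restricts the open to filesystem datasets. */
	if ((zhp = zfs_open(hdl, "tank/home", ZFS_TYPE_FILESYSTEM)) == NULL)
		return;	/* libzfs has already recorded the error */

	(void) printf("opened %s\n", zfs_get_name(zhp));
	zfs_close(zhp);
}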
++ */ ++ ++/* ++ * zfs dataset property management ++ */ ++extern const char *zfs_prop_default_string(zfs_prop_t); ++extern uint64_t zfs_prop_default_numeric(zfs_prop_t); ++extern const char *zfs_prop_column_name(zfs_prop_t); ++extern boolean_t zfs_prop_align_right(zfs_prop_t); ++ ++extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, ++ nvlist_t *, uint64_t, zfs_handle_t *, const char *); ++ ++extern const char *zfs_prop_to_name(zfs_prop_t); ++extern int zfs_prop_set(zfs_handle_t *, const char *, const char *); ++extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, ++ zprop_source_t *, char *, size_t, boolean_t); ++extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, ++ boolean_t); ++extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, ++ zprop_source_t *, char *, size_t); ++extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, ++ uint64_t *propvalue); ++extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, ++ char *propbuf, int proplen, boolean_t literal); ++extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, ++ uint64_t *propvalue); ++extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, ++ char *propbuf, int proplen, boolean_t literal); ++extern int zfs_get_snapused_int(zfs_handle_t *firstsnap, zfs_handle_t *lastsnap, ++ uint64_t *usedp); ++extern uint64_t getprop_uint64(zfs_handle_t *, zfs_prop_t, char **); ++extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); ++extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); ++extern const char *zfs_prop_values(zfs_prop_t); ++extern int zfs_prop_is_string(zfs_prop_t prop); ++extern nvlist_t *zfs_get_user_props(zfs_handle_t *); ++extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *); ++extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); ++ ++typedef struct zprop_list { ++ int pl_prop; ++ char *pl_user_prop; ++ struct zprop_list *pl_next; ++ boolean_t pl_all; ++ size_t pl_width; ++ size_t pl_recvd_width; ++ boolean_t pl_fixed; ++} zprop_list_t; ++ ++extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t); ++extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); ++ ++#define ZFS_MOUNTPOINT_NONE "none" ++#define ZFS_MOUNTPOINT_LEGACY "legacy" ++ ++/* ++ * zpool property management ++ */ ++extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **); ++extern const char *zpool_prop_default_string(zpool_prop_t); ++extern uint64_t zpool_prop_default_numeric(zpool_prop_t); ++extern const char *zpool_prop_column_name(zpool_prop_t); ++extern boolean_t zpool_prop_align_right(zpool_prop_t); ++ ++/* ++ * Functions shared by zfs and zpool property management. 
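A sketch (not from the patch) of reading and setting a single dataset property with the functions above; zhp is an open zfs_handle_t and error handling is reduced to return-value checks.

static void
prop_example(zfs_handle_t *zhp)
{
	char value[ZFS_MAXPROPLEN];
	zprop_source_t src;

	/* Current value of "compression", formatted for humans (B_FALSE). */
	if (zfs_prop_get(zhp, ZFS_PROP_COMPRESSION, value, sizeof (value),
	    &src, NULL, 0, B_FALSE) == 0)
		(void) printf("%s=%s\n",
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), value);

	/* Property names are passed as strings when setting. */
	(void) zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_COMPRESSION), "on");
}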
++ */ ++extern int zprop_iter(zprop_func func, void *cb, boolean_t show_all, ++ boolean_t ordered, zfs_type_t type); ++extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, ++ zfs_type_t); ++extern void zprop_free_list(zprop_list_t *); ++ ++#define ZFS_GET_NCOLS 5 ++ ++typedef enum { ++ GET_COL_NONE, ++ GET_COL_NAME, ++ GET_COL_PROPERTY, ++ GET_COL_VALUE, ++ GET_COL_RECVD, ++ GET_COL_SOURCE ++} zfs_get_column_t; ++ ++/* ++ * Functions for printing zfs or zpool properties ++ */ ++typedef struct zprop_get_cbdata { ++ int cb_sources; ++ zfs_get_column_t cb_columns[ZFS_GET_NCOLS]; ++ int cb_colwidths[ZFS_GET_NCOLS + 1]; ++ boolean_t cb_scripted; ++ boolean_t cb_literal; ++ boolean_t cb_first; ++ zprop_list_t *cb_proplist; ++ zfs_type_t cb_type; ++} zprop_get_cbdata_t; ++ ++void zprop_print_one_property(const char *, zprop_get_cbdata_t *, ++ const char *, const char *, zprop_source_t, const char *, ++ const char *); ++ ++/* ++ * Iterator functions. ++ */ ++typedef int (*zfs_iter_f)(zfs_handle_t *, void *); ++extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *); ++extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); ++extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); ++extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); ++extern int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *); ++extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *); ++extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *); ++ ++typedef struct get_all_cb { ++ zfs_handle_t **cb_handles; ++ size_t cb_alloc; ++ size_t cb_used; ++ boolean_t cb_verbose; ++ int (*cb_getone)(zfs_handle_t *, void *); ++} get_all_cb_t; ++ ++void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); ++int libzfs_dataset_cmp(const void *, const void *); ++ ++/* ++ * Functions to create and destroy datasets. ++ */ ++extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, ++ nvlist_t *); ++extern int zfs_create_ancestors(libzfs_handle_t *, const char *); ++extern int zfs_destroy(zfs_handle_t *, boolean_t); ++extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); ++extern int zfs_destroy_snaps_nvl(zfs_handle_t *, nvlist_t *, boolean_t); ++extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); ++extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); ++extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); ++extern int zfs_rename(zfs_handle_t *, const char *, boolean_t, boolean_t); ++ ++typedef struct sendflags { ++ /* print informational messages (ie, -v was specified) */ ++ boolean_t verbose; ++ ++ /* recursive send (ie, -R) */ ++ boolean_t replicate; ++ ++ /* for incrementals, do all intermediate snapshots */ ++ boolean_t doall; ++ ++ /* if dataset is a clone, do incremental from its origin */ ++ boolean_t fromorigin; ++ ++ /* do deduplication */ ++ boolean_t dedup; ++ ++ /* send properties (ie, -p) */ ++ boolean_t props; ++ ++ /* do not send (no-op, ie. -n) */ ++ boolean_t dryrun; ++ ++ /* parsable verbose output (ie. -P) */ ++ boolean_t parsable; ++ ++ /* show progress (ie. 
-v) */ ++ boolean_t progress; ++} sendflags_t; ++ ++typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); ++ ++extern int zfs_send(zfs_handle_t *, const char *, const char *, ++ sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); ++ ++extern int zfs_promote(zfs_handle_t *); ++extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, ++ boolean_t, boolean_t, int, uint64_t, uint64_t); ++extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); ++extern int zfs_get_holds(zfs_handle_t *, nvlist_t **); ++extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *); ++ ++typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, ++ uid_t rid, uint64_t space); ++ ++extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, ++ zfs_userspace_cb_t, void *); ++ ++extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); ++extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); ++ ++typedef struct recvflags { ++ /* print informational messages (ie, -v was specified) */ ++ boolean_t verbose; ++ ++ /* the destination is a prefix, not the exact fs (ie, -d) */ ++ boolean_t isprefix; ++ ++ /* ++ * Only the tail of the sent snapshot path is appended to the ++ * destination to determine the received snapshot name (ie, -e). ++ */ ++ boolean_t istail; ++ ++ /* do not actually do the recv, just check if it would work (ie, -n) */ ++ boolean_t dryrun; ++ ++ /* rollback/destroy filesystems as necessary (eg, -F) */ ++ boolean_t force; ++ ++ /* set "canmount=off" on all modified filesystems */ ++ boolean_t canmountoff; ++ ++ /* byteswap flag is used internally; callers need not specify */ ++ boolean_t byteswap; ++ ++ /* do not mount file systems as they are extracted (private) */ ++ boolean_t nomount; ++} recvflags_t; ++ ++extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t *, ++ int, avl_tree_t *); ++ ++typedef enum diff_flags { ++ ZFS_DIFF_PARSEABLE = 0x1, ++ ZFS_DIFF_TIMESTAMP = 0x2, ++ ZFS_DIFF_CLASSIFY = 0x4 ++} diff_flags_t; ++ ++extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *, ++ int); ++ ++/* ++ * Miscellaneous functions. ++ */ ++extern const char *zfs_type_to_name(zfs_type_t); ++extern void zfs_refresh_properties(zfs_handle_t *); ++extern int zfs_name_valid(const char *, zfs_type_t); ++extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, char *, zfs_type_t); ++extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, ++ zfs_type_t); ++extern int zfs_spa_version(zfs_handle_t *, int *); ++extern int zfs_append_partition(char *path, size_t max_len); ++extern int zfs_resolve_shortname(const char *name, char *path, size_t pathlen); ++extern int zfs_strcmp_pathname(char *name, char *cmp_name, int wholedisk); ++ ++/* ++ * Mount support functions. ++ */ ++extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **); ++extern boolean_t zfs_is_mounted(zfs_handle_t *, char **); ++extern int zfs_mount(zfs_handle_t *, const char *, int); ++extern int zfs_unmount(zfs_handle_t *, const char *, int); ++extern int zfs_unmountall(zfs_handle_t *, int); ++ ++/* ++ * Share support functions. ++ */ ++extern boolean_t zfs_is_shared(zfs_handle_t *); ++extern int zfs_share(zfs_handle_t *); ++extern int zfs_unshare(zfs_handle_t *); ++ ++/* ++ * Protocol-specific share support functions. 
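A sketch (not from the patch) of the common mount-then-share sequence built from the helpers above; zhp is an open filesystem handle, mount options and flags are left empty, and <stdlib.h> is assumed for free().

static int
mount_and_share(zfs_handle_t *zhp)
{
	char *mntpt = NULL;

	if (!zfs_is_mounted(zhp, &mntpt)) {
		if (zfs_mount(zhp, NULL, 0) != 0)
			return (-1);
	} else {
		free(mntpt);	/* zfs_is_mounted() returns the mountpoint */
	}

	/* Shares according to the sharenfs/sharesmb properties, if set. */
	return (zfs_share(zhp));
}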
++ */ ++extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **); ++extern boolean_t zfs_is_shared_smb(zfs_handle_t *, char **); ++extern int zfs_share_nfs(zfs_handle_t *); ++extern int zfs_share_smb(zfs_handle_t *); ++extern int zfs_shareall(zfs_handle_t *); ++extern int zfs_unshare_nfs(zfs_handle_t *, const char *); ++extern int zfs_unshare_smb(zfs_handle_t *, const char *); ++extern int zfs_unshareall_nfs(zfs_handle_t *); ++extern int zfs_unshareall_smb(zfs_handle_t *); ++extern int zfs_unshareall_bypath(zfs_handle_t *, const char *); ++extern int zfs_unshareall(zfs_handle_t *); ++extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, ++ void *, void *, int, zfs_share_op_t); ++ ++/* ++ * Utility function to convert a number to a human-readable form. ++ */ ++extern void zfs_nicenum(uint64_t, char *, size_t); ++extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); ++ ++/* ++ * Utility functions to run an external process. ++ */ ++#define STDOUT_VERBOSE 0x01 ++#define STDERR_VERBOSE 0x02 ++ ++int libzfs_run_process(const char *, char **, int flags); ++int libzfs_load_module(const char *); ++ ++/* ++ * Given a device or file, determine if it is part of a pool. ++ */ ++extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, ++ boolean_t *); ++ ++/* ++ * Label manipulation. ++ */ ++extern int zpool_read_label(int, nvlist_t **); ++extern int zpool_clear_label(int); ++ ++/* ++ * Management interfaces for SMB ACL files ++ */ ++ ++int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); ++int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); ++int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); ++int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); ++ ++/* ++ * Enable and disable datasets within a pool by mounting/unmounting and ++ * sharing/unsharing them. ++ */ ++extern int zpool_enable_datasets(zpool_handle_t *, const char *, int); ++extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); ++ ++/* ++ * Mappings between vdev and FRU. ++ */ ++extern void libzfs_fru_refresh(libzfs_handle_t *); ++extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *); ++extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *); ++extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *, ++ const char *); ++extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *); ++extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _LIBZFS_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/libzfs_impl.h linux-3.2.33-go/include/zfs/libzfs_impl.h +--- linux-3.2.33-go.orig/include/zfs/libzfs_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/libzfs_impl.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,222 @@ ++/* ++ * CDDL HEADER SART ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ */ ++ ++#ifndef _LIBFS_IMPL_H ++#define _LIBFS_IMPL_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#if defined(HAVE_LIBTOPO) ++#include ++#endif /* HAVE_LIBTOPO */ ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#ifdef VERIFY ++#undef VERIFY ++#endif ++#define VERIFY verify ++ ++typedef struct libzfs_fru { ++ char *zf_device; ++ char *zf_fru; ++ struct libzfs_fru *zf_chain; ++ struct libzfs_fru *zf_next; ++} libzfs_fru_t; ++ ++struct libzfs_handle { ++ int libzfs_error; ++ int libzfs_fd; ++ FILE *libzfs_mnttab; ++ FILE *libzfs_sharetab; ++ zpool_handle_t *libzfs_pool_handles; ++ uu_avl_pool_t *libzfs_ns_avlpool; ++ uu_avl_t *libzfs_ns_avl; ++ uint64_t libzfs_ns_gen; ++ int libzfs_desc_active; ++ char libzfs_action[1024]; ++ char libzfs_desc[1024]; ++ char *libzfs_log_str; ++ int libzfs_printerr; ++ int libzfs_storeerr; /* stuff error messages into buffer */ ++ void *libzfs_sharehdl; /* libshare handle */ ++ uint_t libzfs_shareflags; ++ boolean_t libzfs_mnttab_enable; ++ avl_tree_t libzfs_mnttab_cache; ++ int libzfs_pool_iter; ++#if defined(HAVE_LIBTOPO) ++ topo_hdl_t *libzfs_topo_hdl; ++ libzfs_fru_t **libzfs_fru_hash; ++ libzfs_fru_t *libzfs_fru_list; ++#endif /* HAVE_LIBTOPO */ ++ char libzfs_chassis_id[256]; ++}; ++ ++#define ZFSSHARE_MISS 0x01 /* Didn't find entry in cache */ ++ ++struct zfs_handle { ++ libzfs_handle_t *zfs_hdl; ++ zpool_handle_t *zpool_hdl; ++ char zfs_name[ZFS_MAXNAMELEN]; ++ zfs_type_t zfs_type; /* type including snapshot */ ++ zfs_type_t zfs_head_type; /* type excluding snapshot */ ++ dmu_objset_stats_t zfs_dmustats; ++ nvlist_t *zfs_props; ++ nvlist_t *zfs_user_props; ++ nvlist_t *zfs_recvd_props; ++ boolean_t zfs_mntcheck; ++ char *zfs_mntopts; ++ uint8_t *zfs_props_table; ++}; ++ ++/* ++ * This is different from checking zfs_type, because it will also catch ++ * snapshots of volumes. ++ */ ++#define ZFS_IS_VOLUME(zhp) ((zhp)->zfs_head_type == ZFS_TYPE_VOLUME) ++ ++struct zpool_handle { ++ libzfs_handle_t *zpool_hdl; ++ zpool_handle_t *zpool_next; ++ char zpool_name[ZPOOL_MAXNAMELEN]; ++ int zpool_state; ++ size_t zpool_config_size; ++ nvlist_t *zpool_config; ++ nvlist_t *zpool_old_config; ++ nvlist_t *zpool_props; ++ diskaddr_t zpool_start_block; ++}; ++ ++typedef enum { ++ PROTO_NFS = 0, ++ PROTO_SMB = 1, ++ PROTO_END = 2 ++} zfs_share_proto_t; ++ ++/* ++ * The following can be used as a bitmask and any new values ++ * added must preserve that capability. 
++ */ ++typedef enum { ++ SHARED_NOT_SHARED = 0x0, ++ SHARED_NFS = 0x2, ++ SHARED_SMB = 0x4 ++} zfs_share_type_t; ++ ++int zfs_error(libzfs_handle_t *, int, const char *); ++int zfs_error_fmt(libzfs_handle_t *, int, const char *, ...); ++void zfs_error_aux(libzfs_handle_t *, const char *, ...); ++void *zfs_alloc(libzfs_handle_t *, size_t); ++void *zfs_realloc(libzfs_handle_t *, void *, size_t, size_t); ++char *zfs_asprintf(libzfs_handle_t *, const char *, ...); ++char *zfs_strdup(libzfs_handle_t *, const char *); ++int no_memory(libzfs_handle_t *); ++ ++int zfs_standard_error(libzfs_handle_t *, int, const char *); ++int zfs_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); ++int zpool_standard_error(libzfs_handle_t *, int, const char *); ++int zpool_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); ++ ++int get_dependents(libzfs_handle_t *, boolean_t, const char *, char ***, ++ size_t *); ++zfs_handle_t *make_dataset_handle_zc(libzfs_handle_t *, zfs_cmd_t *); ++zfs_handle_t *make_dataset_simple_handle_zc(zfs_handle_t *, zfs_cmd_t *); ++ ++int zprop_parse_value(libzfs_handle_t *, nvpair_t *, int, zfs_type_t, ++ nvlist_t *, char **, uint64_t *, const char *); ++int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, ++ zfs_type_t type); ++ ++/* ++ * Use this changelist_gather() flag to force attempting mounts ++ * on each change node regardless of whether or not it is currently ++ * mounted. ++ */ ++#define CL_GATHER_MOUNT_ALWAYS 1 ++ ++typedef struct prop_changelist prop_changelist_t; ++ ++int zcmd_alloc_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, size_t); ++int zcmd_write_src_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); ++int zcmd_write_conf_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); ++int zcmd_expand_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *); ++int zcmd_read_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t **); ++void zcmd_free_nvlists(zfs_cmd_t *); ++ ++int changelist_prefix(prop_changelist_t *); ++int changelist_postfix(prop_changelist_t *); ++void changelist_rename(prop_changelist_t *, const char *, const char *); ++void changelist_remove(prop_changelist_t *, const char *); ++void changelist_free(prop_changelist_t *); ++prop_changelist_t *changelist_gather(zfs_handle_t *, zfs_prop_t, int, int); ++int changelist_unshare(prop_changelist_t *, zfs_share_proto_t *); ++int changelist_haszonedchild(prop_changelist_t *); ++ ++void remove_mountpoint(zfs_handle_t *); ++int create_parents(libzfs_handle_t *, char *, int); ++boolean_t isa_child_of(const char *dataset, const char *parent); ++ ++zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *); ++ ++int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); ++ ++int zvol_create_link(libzfs_handle_t *, const char *); ++int zvol_remove_link(libzfs_handle_t *, const char *); ++boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); ++ ++int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, ++ boolean_t modifying); ++ ++void namespace_clear(libzfs_handle_t *); ++ ++/* ++ * libshare (sharemgr) interfaces used internally. 
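The changelist helpers declared above are internal to libzfs; the sketch below (not from the patch, with do_change() as a hypothetical stand-in for the actual ioctl) shows the usual gather/prefix/postfix pattern used when a property change may require datasets to be unmounted or unshared and then restored.

static int do_change(zfs_handle_t *, zfs_prop_t);	/* hypothetical */

static int
set_prop_with_changelist(zfs_handle_t *zhp, zfs_prop_t prop)
{
	prop_changelist_t *cl;
	int err;

	if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL)
		return (-1);

	/* Unmount/unshare anything the change will affect... */
	if ((err = changelist_prefix(cl)) == 0) {
		err = do_change(zhp, prop);
		/* ...then remount/reshare it afterwards. */
		(void) changelist_postfix(cl);
	}

	changelist_free(cl);
	return (err);
}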
++ */ ++ ++extern int zfs_init_libshare(libzfs_handle_t *, int); ++extern void zfs_uninit_libshare(libzfs_handle_t *); ++extern int zfs_parse_options(char *, zfs_share_proto_t); ++ ++extern int zfs_unshare_proto(zfs_handle_t *, ++ const char *, zfs_share_proto_t *); ++ ++extern void libzfs_fru_clear(libzfs_handle_t *, boolean_t); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _LIBFS_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/linux/blkdev_compat.h linux-3.2.33-go/include/zfs/linux/blkdev_compat.h +--- linux-3.2.33-go.orig/include/zfs/linux/blkdev_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/linux/blkdev_compat.h 2012-11-16 23:25:34.345039382 +0100 +@@ -0,0 +1,458 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (C) 2011 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * LLNL-CODE-403049. ++ */ ++ ++#ifndef _ZFS_BLKDEV_H ++#define _ZFS_BLKDEV_H ++ ++#include ++#include ++ ++#ifndef HAVE_FMODE_T ++typedef unsigned __bitwise__ fmode_t; ++#endif /* HAVE_FMODE_T */ ++ ++#ifndef HAVE_BLK_FETCH_REQUEST ++static inline struct request * ++blk_fetch_request(struct request_queue *q) ++{ ++ struct request *req; ++ ++ req = elv_next_request(q); ++ if (req) ++ blkdev_dequeue_request(req); ++ ++ return req; ++} ++#endif /* HAVE_BLK_FETCH_REQUEST */ ++ ++#ifndef HAVE_BLK_REQUEUE_REQUEST ++static inline void ++blk_requeue_request(request_queue_t *q, struct request *req) ++{ ++ elv_requeue_request(q, req); ++} ++#endif /* HAVE_BLK_REQUEUE_REQUEST */ ++ ++#ifndef HAVE_BLK_END_REQUEST ++static inline bool ++__blk_end_request(struct request *req, int error, unsigned int nr_bytes) ++{ ++ LIST_HEAD(list); ++ ++ /* ++ * Request has already been dequeued but 2.6.18 version of ++ * end_request() unconditionally dequeues the request so we ++ * add it to a local list to prevent hitting the BUG_ON. ++ */ ++ list_add(&req->queuelist, &list); ++ ++ /* ++ * The old API required the driver to end each segment and not ++ * the entire request. In our case we always need to end the ++ * entire request partial requests are not supported. ++ */ ++ req->hard_cur_sectors = nr_bytes >> 9; ++ end_request(req, ((error == 0) ? 
1 : error)); ++ ++ return 0; ++} ++ ++static inline bool ++blk_end_request(struct request *req, int error, unsigned int nr_bytes) ++{ ++ struct request_queue *q = req->q; ++ bool rc; ++ ++ spin_lock_irq(q->queue_lock); ++ rc = __blk_end_request(req, error, nr_bytes); ++ spin_unlock_irq(q->queue_lock); ++ ++ return rc; ++} ++#else ++# ifdef HAVE_BLK_END_REQUEST_GPL_ONLY ++/* ++ * Define required to avoid conflicting 2.6.29 non-static prototype for a ++ * GPL-only version of the helper. As of 2.6.31 the helper is available ++ * to non-GPL modules and is not explicitly exported GPL-only. ++ */ ++# define __blk_end_request __blk_end_request_x ++# define blk_end_request blk_end_request_x ++ ++static inline bool ++__blk_end_request_x(struct request *req, int error, unsigned int nr_bytes) ++{ ++ /* ++ * The old API required the driver to end each segment and not ++ * the entire request. In our case we always need to end the ++ * entire request partial requests are not supported. ++ */ ++ req->hard_cur_sectors = nr_bytes >> 9; ++ end_request(req, ((error == 0) ? 1 : error)); ++ ++ return 0; ++} ++static inline bool ++blk_end_request_x(struct request *req, int error, unsigned int nr_bytes) ++{ ++ struct request_queue *q = req->q; ++ bool rc; ++ ++ spin_lock_irq(q->queue_lock); ++ rc = __blk_end_request_x(req, error, nr_bytes); ++ spin_unlock_irq(q->queue_lock); ++ ++ return rc; ++} ++# endif /* HAVE_BLK_END_REQUEST_GPL_ONLY */ ++#endif /* HAVE_BLK_END_REQUEST */ ++ ++/* ++ * 2.6.36 API change, ++ * The blk_queue_flush() interface has replaced blk_queue_ordered() ++ * interface. However, while the old interface was available to all the ++ * new one is GPL-only. Thus if the GPL-only version is detected we ++ * implement our own trivial helper compatibility funcion. The hope is ++ * that long term this function will be opened up. ++ */ ++#if defined(HAVE_BLK_QUEUE_FLUSH) && defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY) ++#define blk_queue_flush __blk_queue_flush ++static inline void ++__blk_queue_flush(struct request_queue *q, unsigned int flags) ++{ ++ q->flush_flags = flags & (REQ_FLUSH | REQ_FUA); ++} ++#endif /* HAVE_BLK_QUEUE_FLUSH && HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */ ++ ++#ifndef HAVE_BLK_RQ_POS ++static inline sector_t ++blk_rq_pos(struct request *req) ++{ ++ return req->sector; ++} ++#endif /* HAVE_BLK_RQ_POS */ ++ ++#ifndef HAVE_BLK_RQ_SECTORS ++static inline unsigned int ++blk_rq_sectors(struct request *req) ++{ ++ return req->nr_sectors; ++} ++#endif /* HAVE_BLK_RQ_SECTORS */ ++ ++#if !defined(HAVE_BLK_RQ_BYTES) || defined(HAVE_BLK_RQ_BYTES_GPL_ONLY) ++/* ++ * Define required to avoid conflicting 2.6.29 non-static prototype for a ++ * GPL-only version of the helper. As of 2.6.31 the helper is available ++ * to non-GPL modules in the form of a static inline in the header. ++ */ ++#define blk_rq_bytes __blk_rq_bytes ++static inline unsigned int ++__blk_rq_bytes(struct request *req) ++{ ++ return blk_rq_sectors(req) << 9; ++} ++#endif /* !HAVE_BLK_RQ_BYTES || HAVE_BLK_RQ_BYTES_GPL_ONLY */ ++ ++/* ++ * Most of the blk_* macros were removed in 2.6.36. Ostensibly this was ++ * done to improve readability and allow easier grepping. However, from ++ * a portability stand point the macros are helpful. Therefore the needed ++ * macros are redefined here if they are missing from the kernel. 
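A simplified sketch (not from the patch) of how a driver built on this header can drain its request queue the same way on old and new kernels. zvol_do_io() is hypothetical and the real zvol code is considerably more involved; the locked __blk_end_request() variant is used because the request function is entered with q->queue_lock held.

static int zvol_do_io(void *private, sector_t sector,
    unsigned int bytes);	/* hypothetical I/O helper */

static void
zvol_request_sketch(struct request_queue *q)
{
	struct request *req;

	/* blk_fetch_request() dequeues, natively or via the shim above. */
	while ((req = blk_fetch_request(q)) != NULL) {
		sector_t offset = blk_rq_pos(req);
		unsigned int size = blk_rq_bytes(req);
		int error;

		error = zvol_do_io(q->queuedata, offset, size);
		(void) __blk_end_request(req, error, size);
	}
}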
++ */ ++#ifndef blk_fs_request ++#define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS) ++#endif ++ ++/* ++ * 2.6.27 API change, ++ * The blk_queue_stackable() queue flag was added in 2.6.27 to handle dm ++ * stacking drivers. Prior to this request stacking drivers were detected ++ * by checking (q->request_fn == NULL), for earlier kernels we revert to ++ * this legacy behavior. ++ */ ++#ifndef blk_queue_stackable ++#define blk_queue_stackable(q) ((q)->request_fn == NULL) ++#endif ++ ++/* ++ * 2.6.34 API change, ++ * The blk_queue_max_hw_sectors() function replaces blk_queue_max_sectors(). ++ */ ++#ifndef HAVE_BLK_QUEUE_MAX_HW_SECTORS ++#define blk_queue_max_hw_sectors __blk_queue_max_hw_sectors ++static inline void ++__blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) ++{ ++ blk_queue_max_sectors(q, max_hw_sectors); ++} ++#endif ++ ++/* ++ * 2.6.34 API change, ++ * The blk_queue_max_segments() function consolidates ++ * blk_queue_max_hw_segments() and blk_queue_max_phys_segments(). ++ */ ++#ifndef HAVE_BLK_QUEUE_MAX_SEGMENTS ++#define blk_queue_max_segments __blk_queue_max_segments ++static inline void ++__blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) ++{ ++ blk_queue_max_phys_segments(q, max_segments); ++ blk_queue_max_hw_segments(q, max_segments); ++} ++#endif ++ ++/* ++ * 2.6.30 API change, ++ * The blk_queue_physical_block_size() function was introduced to ++ * indicate the smallest I/O the device can write without incurring ++ * a read-modify-write penalty. For older kernels this is a no-op. ++ */ ++#ifndef HAVE_BLK_QUEUE_PHYSICAL_BLOCK_SIZE ++#define blk_queue_physical_block_size(q, x) ((void)(0)) ++#endif ++ ++/* ++ * 2.6.30 API change, ++ * The blk_queue_io_opt() function was added to indicate the optimal ++ * I/O size for the device. For older kernels this is a no-op. ++ */ ++#ifndef HAVE_BLK_QUEUE_IO_OPT ++#define blk_queue_io_opt(q, x) ((void)(0)) ++#endif ++ ++#ifndef HAVE_GET_DISK_RO ++static inline int ++get_disk_ro(struct gendisk *disk) ++{ ++ int policy = 0; ++ ++ if (disk->part[0]) ++ policy = disk->part[0]->policy; ++ ++ return policy; ++} ++#endif /* HAVE_GET_DISK_RO */ ++ ++#ifndef HAVE_RQ_IS_SYNC ++static inline bool ++rq_is_sync(struct request *req) ++{ ++ return (req->flags & REQ_RW_SYNC); ++} ++#endif /* HAVE_RQ_IS_SYNC */ ++ ++#ifndef HAVE_RQ_FOR_EACH_SEGMENT ++struct req_iterator { ++ int i; ++ struct bio *bio; ++}; ++ ++# define for_each_bio(_bio) \ ++ for (; _bio; _bio = _bio->bi_next) ++ ++# define __rq_for_each_bio(_bio, rq) \ ++ if ((rq->bio)) \ ++ for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) ++ ++# define rq_for_each_segment(bvl, _rq, _iter) \ ++ __rq_for_each_bio(_iter.bio, _rq) \ ++ bio_for_each_segment(bvl, _iter.bio, _iter.i) ++#endif /* HAVE_RQ_FOR_EACH_SEGMENT */ ++ ++/* ++ * Portable helper for correctly setting the FAILFAST flags. The ++ * correct usage has changed 3 times from 2.6.12 to 2.6.38. ++ */ ++static inline void ++bio_set_flags_failfast(struct block_device *bdev, int *flags) ++{ ++#ifdef CONFIG_BUG ++ /* ++ * Disable FAILFAST for loopback devices because of the ++ * following incorrect BUG_ON() in loop_make_request(). ++ * This support is also disabled for md devices because the ++ * test suite layers md devices on top of loopback devices. ++ * This may be removed when the loopback driver is fixed. 
++ * ++ * BUG_ON(!lo || (rw != READ && rw != WRITE)); ++ */ ++ if ((MAJOR(bdev->bd_dev) == LOOP_MAJOR) || ++ (MAJOR(bdev->bd_dev) == MD_MAJOR)) ++ return; ++ ++#ifdef BLOCK_EXT_MAJOR ++ if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) ++ return; ++#endif /* BLOCK_EXT_MAJOR */ ++#endif /* CONFIG_BUG */ ++ ++#ifdef HAVE_BIO_RW_FAILFAST_DTD ++ /* BIO_RW_FAILFAST_* preferred interface from 2.6.28 - 2.6.35 */ ++ *flags |= ++ ((1 << BIO_RW_FAILFAST_DEV) | ++ (1 << BIO_RW_FAILFAST_TRANSPORT) | ++ (1 << BIO_RW_FAILFAST_DRIVER)); ++#else ++# ifdef HAVE_BIO_RW_FAILFAST ++ /* BIO_RW_FAILFAST preferred interface from 2.6.12 - 2.6.27 */ ++ *flags |= (1 << BIO_RW_FAILFAST); ++# else ++# ifdef HAVE_REQ_FAILFAST_MASK ++ /* REQ_FAILFAST_* preferred interface from 2.6.36 - 2.6.xx, ++ * the BIO_* and REQ_* flags were unified under REQ_* flags. */ ++ *flags |= REQ_FAILFAST_MASK; ++# endif /* HAVE_REQ_FAILFAST_MASK */ ++# endif /* HAVE_BIO_RW_FAILFAST */ ++#endif /* HAVE_BIO_RW_FAILFAST_DTD */ ++} ++ ++/* ++ * Maximum disk label length, it may be undefined for some kernels. ++ */ ++#ifndef DISK_NAME_LEN ++#define DISK_NAME_LEN 32 ++#endif /* DISK_NAME_LEN */ ++ ++/* ++ * 2.6.24 API change, ++ * The bio_end_io() prototype changed slightly. These are helper ++ * macro's to ensure the prototype and return value are handled. ++ */ ++#ifdef HAVE_2ARGS_BIO_END_IO_T ++# define BIO_END_IO_PROTO(fn, x, y, z) static void fn(struct bio *x, int z) ++# define BIO_END_IO_RETURN(rc) return ++#else ++# define BIO_END_IO_PROTO(fn, x, y, z) static int fn(struct bio *x, \ ++ unsigned int y, int z) ++# define BIO_END_IO_RETURN(rc) return rc ++#endif /* HAVE_2ARGS_BIO_END_IO_T */ ++ ++/* ++ * 2.6.38 - 2.6.x API, ++ * blkdev_get_by_path() ++ * blkdev_put() ++ * ++ * 2.6.28 - 2.6.37 API, ++ * open_bdev_exclusive() ++ * close_bdev_exclusive() ++ * ++ * 2.6.12 - 2.6.27 API, ++ * open_bdev_excl() ++ * close_bdev_excl() ++ * ++ * Used to exclusively open a block device from within the kernel. ++ */ ++#if defined(HAVE_BLKDEV_GET_BY_PATH) ++# define vdev_bdev_open(path, md, hld) blkdev_get_by_path(path, \ ++ (md) | FMODE_EXCL, hld) ++# define vdev_bdev_close(bdev, md) blkdev_put(bdev, (md) | FMODE_EXCL) ++#elif defined(HAVE_OPEN_BDEV_EXCLUSIVE) ++# define vdev_bdev_open(path, md, hld) open_bdev_exclusive(path, md, hld) ++# define vdev_bdev_close(bdev, md) close_bdev_exclusive(bdev, md) ++#else ++# define vdev_bdev_open(path, md, hld) open_bdev_excl(path, md, hld) ++# define vdev_bdev_close(bdev, md) close_bdev_excl(bdev) ++#endif /* HAVE_BLKDEV_GET_BY_PATH | HAVE_OPEN_BDEV_EXCLUSIVE */ ++ ++/* ++ * 2.6.22 API change ++ * The function invalidate_bdev() lost it's second argument because ++ * it was unused. ++ */ ++#ifdef HAVE_1ARG_INVALIDATE_BDEV ++# define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev) ++#else ++# define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev, 1) ++#endif /* HAVE_1ARG_INVALIDATE_BDEV */ ++ ++/* ++ * 2.6.30 API change ++ * Change to make it explicit there this is the logical block size. ++ */ ++#ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE ++# define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev) ++#else ++# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev) ++#endif ++ ++/* ++ * 2.6.37 API change ++ * The WRITE_FLUSH, WRITE_FUA, and WRITE_FLUSH_FUA flags have been ++ * introduced as a replacement for WRITE_BARRIER. This was done to ++ * allow richer semantics to be expressed to the block layer. It is ++ * the block layers responsibility to choose the correct way to ++ * implement these semantics. 
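A sketch (not from the patch) of exclusively opening a block device through the vdev_bdev_* wrappers above; the same source works whether blkdev_get_by_path(), open_bdev_exclusive() or open_bdev_excl() sits underneath. The holder token is any driver-unique pointer.

static int
probe_logical_block_size(const char *path, void *holder)
{
	struct block_device *bdev;
	int size;

	/* Exclusive open; the wrapper adds FMODE_EXCL where required. */
	bdev = vdev_bdev_open(path, FMODE_READ, holder);
	if (IS_ERR(bdev))
		return ((int)PTR_ERR(bdev));	/* negative errno */

	size = vdev_bdev_block_size(bdev);	/* logical block size */
	vdev_bdev_close(bdev, FMODE_READ);

	return (size);
}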
++ * ++ * The existence of these flags implies that REQ_FLUSH an REQ_FUA are ++ * defined. Thus we can safely define VDEV_REQ_FLUSH and VDEV_REQ_FUA ++ * compatibility macros. ++ */ ++#ifdef WRITE_FLUSH_FUA ++# define VDEV_WRITE_FLUSH_FUA WRITE_FLUSH_FUA ++# define VDEV_REQ_FLUSH REQ_FLUSH ++# define VDEV_REQ_FUA REQ_FUA ++#else ++# define VDEV_WRITE_FLUSH_FUA WRITE_BARRIER ++# define VDEV_REQ_FLUSH REQ_HARDBARRIER ++# define VDEV_REQ_FUA REQ_HARDBARRIER ++#endif ++ ++/* ++ * 2.6.32 API change ++ * Use the normal I/O patch for discards. ++ */ ++#ifdef REQ_DISCARD ++# define VDEV_REQ_DISCARD REQ_DISCARD ++#endif ++ ++/* ++ * 2.6.33 API change ++ * Discard granularity and alignment restrictions may now be set. For ++ * older kernels which do not support this it is safe to skip it. ++ */ ++#ifdef HAVE_DISCARD_GRANULARITY ++static inline void ++blk_queue_discard_granularity(struct request_queue *q, unsigned int dg) ++{ ++ q->limits.discard_granularity = dg; ++} ++#else ++#define blk_queue_discard_granularity(x, dg) ((void)0) ++#endif /* HAVE_DISCARD_GRANULARITY */ ++ ++/* ++ * Default Linux IO Scheduler, ++ * Setting the scheduler to noop will allow the Linux IO scheduler to ++ * still perform front and back merging, while leaving the request ++ * ordering and prioritization to the ZFS IO scheduler. ++ */ ++#define VDEV_SCHEDULER "noop" ++ ++#endif /* _ZFS_BLKDEV_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/linux/dcache_compat.h linux-3.2.33-go/include/zfs/linux/dcache_compat.h +--- linux-3.2.33-go.orig/include/zfs/linux/dcache_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/linux/dcache_compat.h 2012-11-16 23:25:34.345039382 +0100 +@@ -0,0 +1,38 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (C) 2011 Lawrence Livermore National Security, LLC. ++ */ ++ ++#ifndef _ZFS_DCACHE_H ++#define _ZFS_DCACHE_H ++ ++#include ++ ++#define dname(dentry) ((char *)((dentry)->d_name.name)) ++#define dlen(dentry) ((int)((dentry)->d_name.len)) ++ ++#ifndef HAVE_D_MAKE_ROOT ++#define d_make_root(inode) d_alloc_root(inode) ++#endif /* HAVE_D_MAKE_ROOT */ ++ ++#endif /* _ZFS_DCACHE_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/linux/Makefile linux-3.2.33-go/include/zfs/linux/Makefile +--- linux-3.2.33-go.orig/include/zfs/linux/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/linux/Makefile 2012-11-16 23:25:34.345039382 +0100 +@@ -0,0 +1,664 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. ++# include/linux/Makefile. Generated from Makefile.in by configure. 
++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. ++ ++ ++ ++ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/zfs ++pkgincludedir = $(includedir)/zfs ++pkglibdir = $(libdir)/zfs ++pkglibexecdir = $(libexecdir)/zfs ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = x86_64-unknown-linux-gnu ++host_triplet = x86_64-unknown-linux-gnu ++target_triplet = x86_64-unknown-linux-gnu ++subdir = include/linux ++DIST_COMMON = $(am__kernel_HEADERS_DIST) $(libzfs_HEADERS) \ ++ $(srcdir)/Makefile.am $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = \ ++ $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \ ++ $(top_srcdir)/config/kernel-automount.m4 \ ++ $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \ ++ $(top_srcdir)/config/kernel-bdev-logical-size.m4 \ ++ $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \ ++ $(top_srcdir)/config/kernel-bdi.m4 \ ++ $(top_srcdir)/config/kernel-bio-empty-barrier.m4 \ ++ $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \ ++ $(top_srcdir)/config/kernel-bio-failfast.m4 \ ++ $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \ ++ $(top_srcdir)/config/kernel-blk-end-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-fetch-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-discard.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-flush.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \ ++ $(top_srcdir)/config/kernel-blk-requeue-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-pos.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get.m4 \ ++ $(top_srcdir)/config/kernel-check-disk-size-change.m4 \ ++ $(top_srcdir)/config/kernel-clear-inode.m4 \ ++ $(top_srcdir)/config/kernel-commit-metadata.m4 \ ++ $(top_srcdir)/config/kernel-create-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-d-make-root.m4 \ ++ $(top_srcdir)/config/kernel-d-obtain-alias.m4 \ ++ $(top_srcdir)/config/kernel-discard-granularity.m4 \ ++ 
$(top_srcdir)/config/kernel-elevator-change.m4 \ ++ $(top_srcdir)/config/kernel-encode-fh-inode.m4 \ ++ $(top_srcdir)/config/kernel-evict-inode.m4 \ ++ $(top_srcdir)/config/kernel-fallocate.m4 \ ++ $(top_srcdir)/config/kernel-fmode-t.m4 \ ++ $(top_srcdir)/config/kernel-fsync.m4 \ ++ $(top_srcdir)/config/kernel-get-disk-ro.m4 \ ++ $(top_srcdir)/config/kernel-get-gendisk.m4 \ ++ $(top_srcdir)/config/kernel-insert-inode-locked.m4 \ ++ $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \ ++ $(top_srcdir)/config/kernel-kobj-name-len.m4 \ ++ $(top_srcdir)/config/kernel-lookup-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \ ++ $(top_srcdir)/config/kernel-mount-nodev.m4 \ ++ $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \ ++ $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \ ++ $(top_srcdir)/config/kernel-rq-is_sync.m4 \ ++ $(top_srcdir)/config/kernel-security-inode-init.m4 \ ++ $(top_srcdir)/config/kernel-set-nlink.m4 \ ++ $(top_srcdir)/config/kernel-sget-args.m4 \ ++ $(top_srcdir)/config/kernel-show-options.m4 \ ++ $(top_srcdir)/config/kernel-shrink.m4 \ ++ $(top_srcdir)/config/kernel-truncate-range.m4 \ ++ $(top_srcdir)/config/kernel-truncate-setsize.m4 \ ++ $(top_srcdir)/config/kernel-xattr-handler.m4 \ ++ $(top_srcdir)/config/kernel.m4 \ ++ $(top_srcdir)/config/user-arch.m4 \ ++ $(top_srcdir)/config/user-frame-larger-than.m4 \ ++ $(top_srcdir)/config/user-ioctl.m4 \ ++ $(top_srcdir)/config/user-libblkid.m4 \ ++ $(top_srcdir)/config/user-libuuid.m4 \ ++ $(top_srcdir)/config/user-nptl_guard_within_stack.m4 \ ++ $(top_srcdir)/config/user-selinux.m4 \ ++ $(top_srcdir)/config/user-udev.m4 \ ++ $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \ ++ $(top_srcdir)/config/zfs-build.m4 \ ++ $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/zfs_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_$(V)) ++am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_$(V)) ++am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++am__kernel_HEADERS_DIST = $(top_srcdir)/include/linux/dcache_compat.h \ ++ $(top_srcdir)/include/linux/xattr_compat.h \ ++ $(top_srcdir)/include/linux/vfs_compat.h \ ++ $(top_srcdir)/include/linux/blkdev_compat.h ++am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; ++am__vpath_adj = case $$p in \ ++ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ ++ *) f=$$p;; \ ++ esac; ++am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; ++am__install_max = 40 ++am__nobase_strip_setup = \ ++ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` ++am__nobase_strip = \ ++ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" ++am__nobase_list = $(am__nobase_strip_setup); \ ++ for p in $$list; do echo "$$p $$p"; done | \ ++ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ ++ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ ++ if (++n[$$2] == $(am__install_max)) \ ++ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ ++ END { for (dir in files) print dir, files[dir] }' ++am__base_list = \ ++ sed 
'$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ ++ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' ++am__uninstall_files_from_dir = { \ ++ test -z "$$files" \ ++ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ ++ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ ++ $(am__cd) "$$dir" && rm -f $$files; }; \ ++ } ++am__installdirs = "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)" ++HEADERS = $(kernel_HEADERS) $(libzfs_HEADERS) ++ETAGS = etags ++CTAGS = ctags ++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++ACLOCAL = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run aclocal-1.11 ++ALIEN = alien ++ALIEN_VERSION = ++AMTAR = $${TAR-tar} ++AM_DEFAULT_VERBOSITY = 1 ++AR = ar ++AUTOCONF = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run autoconf ++AUTOHEADER = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run autoheader ++AUTOMAKE = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run automake-1.11 ++AWK = gawk ++CC = gcc ++CCAS = gcc ++CCASDEPMODE = depmode=gcc3 ++CCASFLAGS = -g -O2 ++CCDEPMODE = depmode=gcc3 ++CFLAGS = -g -O2 ++CPP = gcc -E ++CPPFLAGS = ++CYGPATH_W = echo ++DEBUG_CFLAGS = -DNDEBUG ++DEBUG_DMU_TX = _without_debug_dmu_tx ++DEBUG_STACKFLAGS = ++DEBUG_ZFS = _without_debug ++DEFAULT_INIT_DIR = ${prefix}/etc/init.d ++DEFAULT_INIT_SCRIPT = gentoo ++DEFAULT_PACKAGE = tgz ++DEFS = -DHAVE_CONFIG_H ++DEPDIR = .deps ++DLLTOOL = false ++DPKG = dpkg ++DPKGBUILD = dpkg-buildpackage ++DPKGBUILD_VERSION = ++DPKG_VERSION = ++DSYMUTIL = ++DUMPBIN = ++ECHO_C = ++ECHO_N = -n ++ECHO_T = ++EGREP = /bin/grep -E ++EXEEXT = ++FGREP = /bin/grep -F ++FRAME_LARGER_THAN = -Wframe-larger-than=1024 ++GREP = /bin/grep ++HAVE_ALIEN = no ++HAVE_DPKG = no ++HAVE_DPKGBUILD = no ++HAVE_MAKEPKG = ++HAVE_PACMAN = ++HAVE_RPM = yes ++HAVE_RPMBUILD = yes ++INSTALL = /usr/bin/install -c ++INSTALL_DATA = ${INSTALL} -m 644 ++INSTALL_PROGRAM = ${INSTALL} ++INSTALL_SCRIPT = ${INSTALL} ++INSTALL_STRIP_PROGRAM = $(install_sh) -c -s ++KERNELCPPFLAGS = -Wno-unused-but-set-variable -DHAVE_SPL -D_KERNEL -DTEXT_DOMAIN=\"zfs-linux-kernel\" -DNDEBUG ++KERNELMAKE_PARAMS = O=/usr/src/linux-3.6.0-sabayon ++LD = /usr/x86_64-pc-linux-gnu/bin/ld -m elf_x86_64 ++LDFLAGS = ++LIBBLKID = ++LIBOBJS = ++LIBS = -luuid -luuid -lz -lz -lz ++LIBSELINUX = ++LIBTOOL = $(SHELL) $(top_builddir)/libtool ++LIBUUID = -luuid ++LINUX = /usr/src/linux-3.2.33-go ++LINUX_OBJ = /usr/src/linux-3.6.0-sabayon ++LINUX_SYMBOLS = NONE ++LINUX_VERSION = 3.6.0-sabayon ++LIPO = ++LN_S = ln -s ++LTLIBOBJS = ++MAINT = # ++MAKEINFO = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run makeinfo ++MAKEPKG = ++MAKEPKG_VERSION = ++MANIFEST_TOOL = : ++MKDIR_P = /bin/mkdir -p ++NM = /usr/bin/nm -B ++NMEDIT = ++NO_UNUSED_BUT_SET_VARIABLE = -Wno-unused-but-set-variable ++OBJDUMP = objdump ++OBJEXT = o ++OTOOL = ++OTOOL64 = ++PACKAGE = zfs ++PACKAGE_BUGREPORT = ++PACKAGE_NAME = ++PACKAGE_STRING = ++PACKAGE_TARNAME = ++PACKAGE_URL = ++PACKAGE_VERSION = ++PACMAN = ++PACMAN_VERSION = ++PATH_SEPARATOR = : ++RANLIB = ranlib ++RPM = rpm ++RPMBUILD = rpmbuild ++RPMBUILD_VERSION = 4.10.0 ++RPM_VERSION = 4.10.0 ++SED = /bin/sed ++SET_MAKE = ++SHELL = /bin/sh ++SPL = /usr/src/linux-3.2.33-go ++SPL_OBJ = /usr/src/linux-3.2.33-go ++SPL_SYMBOLS = NONE ++SPL_VERSION = 0.6.0-rc12 ++STRIP = strip ++TARGET_ASM_DIR = asm-x86_64 ++VENDOR = gentoo ++VERSION = 0.6.0 ++ZFS_CONFIG = all ++ZFS_META_ALIAS = zfs-0.6.0-rc12 ++ZFS_META_AUTHOR = Sun Microsystems/Oracle, Lawrence Livermore National Laboratory ++ZFS_META_DATA = ++ZFS_META_LICENSE = CDDL ++ZFS_META_LT_AGE = 
++ZFS_META_LT_CURRENT = ++ZFS_META_LT_REVISION = ++ZFS_META_NAME = zfs ++ZFS_META_RELEASE = rc12 ++ZFS_META_VERSION = 0.6.0 ++ZLIB = -lz ++abs_builddir = /root/zfs-0.6.0-rc12/include/linux ++abs_srcdir = /root/zfs-0.6.0-rc12/include/linux ++abs_top_builddir = /root/zfs-0.6.0-rc12 ++abs_top_srcdir = /root/zfs-0.6.0-rc12 ++ac_ct_AR = ar ++ac_ct_CC = gcc ++ac_ct_DUMPBIN = ++am__include = include ++am__leading_dot = . ++am__quote = ++am__tar = $${TAR-tar} chof - "$$tardir" ++am__untar = $${TAR-tar} xf - ++bindir = ${exec_prefix}/bin ++build = x86_64-unknown-linux-gnu ++build_alias = ++build_cpu = x86_64 ++build_os = linux-gnu ++build_vendor = unknown ++builddir = . ++datadir = ${datarootdir} ++datarootdir = ${prefix}/share ++docdir = ${datarootdir}/doc/${PACKAGE} ++dvidir = ${docdir} ++exec_prefix = ${prefix} ++host = x86_64-unknown-linux-gnu ++host_alias = ++host_cpu = x86_64 ++host_os = linux-gnu ++host_vendor = unknown ++htmldir = ${docdir} ++includedir = ${prefix}/include ++infodir = ${datarootdir}/info ++install_sh = ${SHELL} /root/zfs-0.6.0-rc12/config/install-sh ++libdir = ${exec_prefix}/lib ++libexecdir = ${exec_prefix}/libexec ++localedir = ${datarootdir}/locale ++localstatedir = ${prefix}/var ++mandir = ${datarootdir}/man ++mkdir_p = /bin/mkdir -p ++oldincludedir = /usr/include ++pdfdir = ${docdir} ++prefix = /usr/local ++program_transform_name = s,x,x, ++psdir = ${docdir} ++sbindir = ${exec_prefix}/sbin ++sharedstatedir = ${prefix}/com ++srcdir = . ++sysconfdir = ${prefix}/etc ++target = x86_64-unknown-linux-gnu ++target_alias = ++target_cpu = x86_64 ++target_os = linux-gnu ++target_vendor = unknown ++top_build_prefix = ../../ ++top_builddir = ../.. ++top_srcdir = ../.. ++udevdir = ${exec_prefix}/lib/udev ++udevruledir = ${udevdir}/rules.d ++COMMON_H = ++KERNEL_H = \ ++ $(top_srcdir)/include/linux/dcache_compat.h \ ++ $(top_srcdir)/include/linux/xattr_compat.h \ ++ $(top_srcdir)/include/linux/vfs_compat.h \ ++ $(top_srcdir)/include/linux/blkdev_compat.h ++ ++USER_H = ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++libzfsdir = $(includedir)/libzfs/linux ++libzfs_HEADERS = $(COMMON_H) $(USER_H) ++#kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/linux ++#kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++all: all-am ++ ++.SUFFIXES: ++$(srcdir)/Makefile.in: # $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/linux/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/linux/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' 
in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: # $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): # $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++install-kernelHEADERS: $(kernel_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(kerneldir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(kerneldir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(kerneldir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(kerneldir)" || exit $$?; \ ++ done ++ ++uninstall-kernelHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(kerneldir)'; $(am__uninstall_files_from_dir) ++install-libzfsHEADERS: $(libzfs_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(libzfsdir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(libzfsdir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libzfsdir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(libzfsdir)" || exit $$?; \ ++ done ++ ++uninstall-libzfsHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(libzfsdir)'; $(am__uninstall_files_from_dir) ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: CTAGS ++CTAGS: $(HEADERS) $(SOURCES) 
$(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-am ++all-am: Makefile $(HEADERS) ++installdirs: ++ for dir in "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)"; do \ ++ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ ++ done ++install: install-am ++install-exec: install-exec-am ++install-data: install-data-am ++uninstall: uninstall-am ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-am ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." 
++clean: clean-am ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-am ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-am ++ ++dvi-am: ++ ++html: html-am ++ ++html-am: ++ ++info: info-am ++ ++info-am: ++ ++install-data-am: install-kernelHEADERS install-libzfsHEADERS ++ ++install-dvi: install-dvi-am ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-am ++ ++install-html-am: ++ ++install-info: install-info-am ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-am ++ ++install-pdf-am: ++ ++install-ps: install-ps-am ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-am ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-am ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-am ++ ++pdf-am: ++ ++ps: ps-am ++ ++ps-am: ++ ++uninstall-am: uninstall-kernelHEADERS uninstall-libzfsHEADERS ++ ++.MAKE: install-am install-strip ++ ++.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ ++ clean-libtool ctags distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-dvi install-dvi-am install-exec \ ++ install-exec-am install-html install-html-am install-info \ ++ install-info-am install-kernelHEADERS install-libzfsHEADERS \ ++ install-man install-pdf install-pdf-am install-ps \ ++ install-ps-am install-strip installcheck installcheck-am \ ++ installdirs maintainer-clean maintainer-clean-generic \ ++ mostlyclean mostlyclean-generic mostlyclean-libtool pdf pdf-am \ ++ ps ps-am tags uninstall uninstall-am uninstall-kernelHEADERS \ ++ uninstall-libzfsHEADERS ++ ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. ++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/zfs/linux/Makefile.am linux-3.2.33-go/include/zfs/linux/Makefile.am +--- linux-3.2.33-go.orig/include/zfs/linux/Makefile.am 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/linux/Makefile.am 2012-11-16 23:25:34.345039382 +0100 +@@ -0,0 +1,21 @@ ++COMMON_H = ++ ++KERNEL_H = \ ++ $(top_srcdir)/include/linux/dcache_compat.h \ ++ $(top_srcdir)/include/linux/xattr_compat.h \ ++ $(top_srcdir)/include/linux/vfs_compat.h \ ++ $(top_srcdir)/include/linux/blkdev_compat.h ++ ++USER_H = ++ ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++ ++if CONFIG_USER ++libzfsdir = $(includedir)/libzfs/linux ++libzfs_HEADERS = $(COMMON_H) $(USER_H) ++endif ++ ++if CONFIG_KERNEL ++kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/linux ++kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++endif +diff -uNr linux-3.2.33-go.orig/include/zfs/linux/Makefile.in linux-3.2.33-go/include/zfs/linux/Makefile.in +--- linux-3.2.33-go.orig/include/zfs/linux/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/linux/Makefile.in 2012-11-16 23:25:34.345039382 +0100 +@@ -0,0 +1,664 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. ++# @configure_input@ ++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. 
++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. ++ ++@SET_MAKE@ ++ ++VPATH = @srcdir@ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/@PACKAGE@ ++pkgincludedir = $(includedir)/@PACKAGE@ ++pkglibdir = $(libdir)/@PACKAGE@ ++pkglibexecdir = $(libexecdir)/@PACKAGE@ ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = @build@ ++host_triplet = @host@ ++target_triplet = @target@ ++subdir = include/linux ++DIST_COMMON = $(am__kernel_HEADERS_DIST) $(libzfs_HEADERS) \ ++ $(srcdir)/Makefile.am $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = \ ++ $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \ ++ $(top_srcdir)/config/kernel-automount.m4 \ ++ $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \ ++ $(top_srcdir)/config/kernel-bdev-logical-size.m4 \ ++ $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \ ++ $(top_srcdir)/config/kernel-bdi.m4 \ ++ $(top_srcdir)/config/kernel-bio-empty-barrier.m4 \ ++ $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \ ++ $(top_srcdir)/config/kernel-bio-failfast.m4 \ ++ $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \ ++ $(top_srcdir)/config/kernel-blk-end-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-fetch-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-discard.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-flush.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \ ++ $(top_srcdir)/config/kernel-blk-requeue-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-pos.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get.m4 \ ++ $(top_srcdir)/config/kernel-check-disk-size-change.m4 \ ++ $(top_srcdir)/config/kernel-clear-inode.m4 \ ++ $(top_srcdir)/config/kernel-commit-metadata.m4 \ ++ $(top_srcdir)/config/kernel-create-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-d-make-root.m4 \ ++ $(top_srcdir)/config/kernel-d-obtain-alias.m4 \ ++ $(top_srcdir)/config/kernel-discard-granularity.m4 \ ++ $(top_srcdir)/config/kernel-elevator-change.m4 \ ++ $(top_srcdir)/config/kernel-encode-fh-inode.m4 \ ++ $(top_srcdir)/config/kernel-evict-inode.m4 \ ++ 
$(top_srcdir)/config/kernel-fallocate.m4 \ ++ $(top_srcdir)/config/kernel-fmode-t.m4 \ ++ $(top_srcdir)/config/kernel-fsync.m4 \ ++ $(top_srcdir)/config/kernel-get-disk-ro.m4 \ ++ $(top_srcdir)/config/kernel-get-gendisk.m4 \ ++ $(top_srcdir)/config/kernel-insert-inode-locked.m4 \ ++ $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \ ++ $(top_srcdir)/config/kernel-kobj-name-len.m4 \ ++ $(top_srcdir)/config/kernel-lookup-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \ ++ $(top_srcdir)/config/kernel-mount-nodev.m4 \ ++ $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \ ++ $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \ ++ $(top_srcdir)/config/kernel-rq-is_sync.m4 \ ++ $(top_srcdir)/config/kernel-security-inode-init.m4 \ ++ $(top_srcdir)/config/kernel-set-nlink.m4 \ ++ $(top_srcdir)/config/kernel-sget-args.m4 \ ++ $(top_srcdir)/config/kernel-show-options.m4 \ ++ $(top_srcdir)/config/kernel-shrink.m4 \ ++ $(top_srcdir)/config/kernel-truncate-range.m4 \ ++ $(top_srcdir)/config/kernel-truncate-setsize.m4 \ ++ $(top_srcdir)/config/kernel-xattr-handler.m4 \ ++ $(top_srcdir)/config/kernel.m4 \ ++ $(top_srcdir)/config/user-arch.m4 \ ++ $(top_srcdir)/config/user-frame-larger-than.m4 \ ++ $(top_srcdir)/config/user-ioctl.m4 \ ++ $(top_srcdir)/config/user-libblkid.m4 \ ++ $(top_srcdir)/config/user-libuuid.m4 \ ++ $(top_srcdir)/config/user-nptl_guard_within_stack.m4 \ ++ $(top_srcdir)/config/user-selinux.m4 \ ++ $(top_srcdir)/config/user-udev.m4 \ ++ $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \ ++ $(top_srcdir)/config/zfs-build.m4 \ ++ $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/zfs_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_@AM_V@) ++am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_@AM_V@) ++am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++am__kernel_HEADERS_DIST = $(top_srcdir)/include/linux/dcache_compat.h \ ++ $(top_srcdir)/include/linux/xattr_compat.h \ ++ $(top_srcdir)/include/linux/vfs_compat.h \ ++ $(top_srcdir)/include/linux/blkdev_compat.h ++am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; ++am__vpath_adj = case $$p in \ ++ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ ++ *) f=$$p;; \ ++ esac; ++am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; ++am__install_max = 40 ++am__nobase_strip_setup = \ ++ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` ++am__nobase_strip = \ ++ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" ++am__nobase_list = $(am__nobase_strip_setup); \ ++ for p in $$list; do echo "$$p $$p"; done | \ ++ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ ++ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ ++ if (++n[$$2] == $(am__install_max)) \ ++ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ ++ END { for (dir in files) print dir, files[dir] }' ++am__base_list = \ ++ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ ++ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' ++am__uninstall_files_from_dir = { \ ++ test -z "$$files" \ ++ || { test ! -d "$$dir" && test ! 
-f "$$dir" && test ! -r "$$dir"; } \ ++ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ ++ $(am__cd) "$$dir" && rm -f $$files; }; \ ++ } ++am__installdirs = "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)" ++HEADERS = $(kernel_HEADERS) $(libzfs_HEADERS) ++ETAGS = etags ++CTAGS = ctags ++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++ACLOCAL = @ACLOCAL@ ++ALIEN = @ALIEN@ ++ALIEN_VERSION = @ALIEN_VERSION@ ++AMTAR = @AMTAR@ ++AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ++AR = @AR@ ++AUTOCONF = @AUTOCONF@ ++AUTOHEADER = @AUTOHEADER@ ++AUTOMAKE = @AUTOMAKE@ ++AWK = @AWK@ ++CC = @CC@ ++CCAS = @CCAS@ ++CCASDEPMODE = @CCASDEPMODE@ ++CCASFLAGS = @CCASFLAGS@ ++CCDEPMODE = @CCDEPMODE@ ++CFLAGS = @CFLAGS@ ++CPP = @CPP@ ++CPPFLAGS = @CPPFLAGS@ ++CYGPATH_W = @CYGPATH_W@ ++DEBUG_CFLAGS = @DEBUG_CFLAGS@ ++DEBUG_DMU_TX = @DEBUG_DMU_TX@ ++DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@ ++DEBUG_ZFS = @DEBUG_ZFS@ ++DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@ ++DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@ ++DEFAULT_PACKAGE = @DEFAULT_PACKAGE@ ++DEFS = @DEFS@ ++DEPDIR = @DEPDIR@ ++DLLTOOL = @DLLTOOL@ ++DPKG = @DPKG@ ++DPKGBUILD = @DPKGBUILD@ ++DPKGBUILD_VERSION = @DPKGBUILD_VERSION@ ++DPKG_VERSION = @DPKG_VERSION@ ++DSYMUTIL = @DSYMUTIL@ ++DUMPBIN = @DUMPBIN@ ++ECHO_C = @ECHO_C@ ++ECHO_N = @ECHO_N@ ++ECHO_T = @ECHO_T@ ++EGREP = @EGREP@ ++EXEEXT = @EXEEXT@ ++FGREP = @FGREP@ ++FRAME_LARGER_THAN = @FRAME_LARGER_THAN@ ++GREP = @GREP@ ++HAVE_ALIEN = @HAVE_ALIEN@ ++HAVE_DPKG = @HAVE_DPKG@ ++HAVE_DPKGBUILD = @HAVE_DPKGBUILD@ ++HAVE_MAKEPKG = @HAVE_MAKEPKG@ ++HAVE_PACMAN = @HAVE_PACMAN@ ++HAVE_RPM = @HAVE_RPM@ ++HAVE_RPMBUILD = @HAVE_RPMBUILD@ ++INSTALL = @INSTALL@ ++INSTALL_DATA = @INSTALL_DATA@ ++INSTALL_PROGRAM = @INSTALL_PROGRAM@ ++INSTALL_SCRIPT = @INSTALL_SCRIPT@ ++INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ ++KERNELCPPFLAGS = @KERNELCPPFLAGS@ ++KERNELMAKE_PARAMS = @KERNELMAKE_PARAMS@ ++LD = @LD@ ++LDFLAGS = @LDFLAGS@ ++LIBBLKID = @LIBBLKID@ ++LIBOBJS = @LIBOBJS@ ++LIBS = @LIBS@ ++LIBSELINUX = @LIBSELINUX@ ++LIBTOOL = @LIBTOOL@ ++LIBUUID = @LIBUUID@ ++LINUX = @LINUX@ ++LINUX_OBJ = @LINUX_OBJ@ ++LINUX_SYMBOLS = @LINUX_SYMBOLS@ ++LINUX_VERSION = @LINUX_VERSION@ ++LIPO = @LIPO@ ++LN_S = @LN_S@ ++LTLIBOBJS = @LTLIBOBJS@ ++MAINT = @MAINT@ ++MAKEINFO = @MAKEINFO@ ++MAKEPKG = @MAKEPKG@ ++MAKEPKG_VERSION = @MAKEPKG_VERSION@ ++MANIFEST_TOOL = @MANIFEST_TOOL@ ++MKDIR_P = @MKDIR_P@ ++NM = @NM@ ++NMEDIT = @NMEDIT@ ++NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@ ++OBJDUMP = @OBJDUMP@ ++OBJEXT = @OBJEXT@ ++OTOOL = @OTOOL@ ++OTOOL64 = @OTOOL64@ ++PACKAGE = @PACKAGE@ ++PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ ++PACKAGE_NAME = @PACKAGE_NAME@ ++PACKAGE_STRING = @PACKAGE_STRING@ ++PACKAGE_TARNAME = @PACKAGE_TARNAME@ ++PACKAGE_URL = @PACKAGE_URL@ ++PACKAGE_VERSION = @PACKAGE_VERSION@ ++PACMAN = @PACMAN@ ++PACMAN_VERSION = @PACMAN_VERSION@ ++PATH_SEPARATOR = @PATH_SEPARATOR@ ++RANLIB = @RANLIB@ ++RPM = @RPM@ ++RPMBUILD = @RPMBUILD@ ++RPMBUILD_VERSION = @RPMBUILD_VERSION@ ++RPM_VERSION = @RPM_VERSION@ ++SED = @SED@ ++SET_MAKE = @SET_MAKE@ ++SHELL = @SHELL@ ++SPL = @SPL@ ++SPL_OBJ = @SPL_OBJ@ ++SPL_SYMBOLS = @SPL_SYMBOLS@ ++SPL_VERSION = @SPL_VERSION@ ++STRIP = @STRIP@ ++TARGET_ASM_DIR = @TARGET_ASM_DIR@ ++VENDOR = @VENDOR@ ++VERSION = @VERSION@ ++ZFS_CONFIG = @ZFS_CONFIG@ ++ZFS_META_ALIAS = @ZFS_META_ALIAS@ ++ZFS_META_AUTHOR = @ZFS_META_AUTHOR@ ++ZFS_META_DATA = @ZFS_META_DATA@ ++ZFS_META_LICENSE = @ZFS_META_LICENSE@ ++ZFS_META_LT_AGE = @ZFS_META_LT_AGE@ ++ZFS_META_LT_CURRENT = @ZFS_META_LT_CURRENT@ 
++ZFS_META_LT_REVISION = @ZFS_META_LT_REVISION@ ++ZFS_META_NAME = @ZFS_META_NAME@ ++ZFS_META_RELEASE = @ZFS_META_RELEASE@ ++ZFS_META_VERSION = @ZFS_META_VERSION@ ++ZLIB = @ZLIB@ ++abs_builddir = @abs_builddir@ ++abs_srcdir = @abs_srcdir@ ++abs_top_builddir = @abs_top_builddir@ ++abs_top_srcdir = @abs_top_srcdir@ ++ac_ct_AR = @ac_ct_AR@ ++ac_ct_CC = @ac_ct_CC@ ++ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ ++am__include = @am__include@ ++am__leading_dot = @am__leading_dot@ ++am__quote = @am__quote@ ++am__tar = @am__tar@ ++am__untar = @am__untar@ ++bindir = @bindir@ ++build = @build@ ++build_alias = @build_alias@ ++build_cpu = @build_cpu@ ++build_os = @build_os@ ++build_vendor = @build_vendor@ ++builddir = @builddir@ ++datadir = @datadir@ ++datarootdir = @datarootdir@ ++docdir = @docdir@ ++dvidir = @dvidir@ ++exec_prefix = @exec_prefix@ ++host = @host@ ++host_alias = @host_alias@ ++host_cpu = @host_cpu@ ++host_os = @host_os@ ++host_vendor = @host_vendor@ ++htmldir = @htmldir@ ++includedir = @includedir@ ++infodir = @infodir@ ++install_sh = @install_sh@ ++libdir = @libdir@ ++libexecdir = @libexecdir@ ++localedir = @localedir@ ++localstatedir = @localstatedir@ ++mandir = @mandir@ ++mkdir_p = @mkdir_p@ ++oldincludedir = @oldincludedir@ ++pdfdir = @pdfdir@ ++prefix = @prefix@ ++program_transform_name = @program_transform_name@ ++psdir = @psdir@ ++sbindir = @sbindir@ ++sharedstatedir = @sharedstatedir@ ++srcdir = @srcdir@ ++sysconfdir = @sysconfdir@ ++target = @target@ ++target_alias = @target_alias@ ++target_cpu = @target_cpu@ ++target_os = @target_os@ ++target_vendor = @target_vendor@ ++top_build_prefix = @top_build_prefix@ ++top_builddir = @top_builddir@ ++top_srcdir = @top_srcdir@ ++udevdir = @udevdir@ ++udevruledir = @udevruledir@ ++COMMON_H = ++KERNEL_H = \ ++ $(top_srcdir)/include/linux/dcache_compat.h \ ++ $(top_srcdir)/include/linux/xattr_compat.h \ ++ $(top_srcdir)/include/linux/vfs_compat.h \ ++ $(top_srcdir)/include/linux/blkdev_compat.h ++ ++USER_H = ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++@CONFIG_USER_TRUE@libzfsdir = $(includedir)/libzfs/linux ++@CONFIG_USER_TRUE@libzfs_HEADERS = $(COMMON_H) $(USER_H) ++@CONFIG_KERNEL_TRUE@kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/linux ++@CONFIG_KERNEL_TRUE@kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++all: all-am ++ ++.SUFFIXES: ++$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/linux/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/linux/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' 
in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++install-kernelHEADERS: $(kernel_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(kerneldir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(kerneldir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(kerneldir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(kerneldir)" || exit $$?; \ ++ done ++ ++uninstall-kernelHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(kerneldir)'; $(am__uninstall_files_from_dir) ++install-libzfsHEADERS: $(libzfs_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(libzfsdir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(libzfsdir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libzfsdir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(libzfsdir)" || exit $$?; \ ++ done ++ ++uninstall-libzfsHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(libzfsdir)'; $(am__uninstall_files_from_dir) ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: 
CTAGS ++CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-am ++all-am: Makefile $(HEADERS) ++installdirs: ++ for dir in "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)"; do \ ++ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ ++ done ++install: install-am ++install-exec: install-exec-am ++install-data: install-data-am ++uninstall: uninstall-am ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-am ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." 
++clean: clean-am ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-am ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-am ++ ++dvi-am: ++ ++html: html-am ++ ++html-am: ++ ++info: info-am ++ ++info-am: ++ ++install-data-am: install-kernelHEADERS install-libzfsHEADERS ++ ++install-dvi: install-dvi-am ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-am ++ ++install-html-am: ++ ++install-info: install-info-am ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-am ++ ++install-pdf-am: ++ ++install-ps: install-ps-am ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-am ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-am ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-am ++ ++pdf-am: ++ ++ps: ps-am ++ ++ps-am: ++ ++uninstall-am: uninstall-kernelHEADERS uninstall-libzfsHEADERS ++ ++.MAKE: install-am install-strip ++ ++.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ ++ clean-libtool ctags distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-dvi install-dvi-am install-exec \ ++ install-exec-am install-html install-html-am install-info \ ++ install-info-am install-kernelHEADERS install-libzfsHEADERS \ ++ install-man install-pdf install-pdf-am install-ps \ ++ install-ps-am install-strip installcheck installcheck-am \ ++ installdirs maintainer-clean maintainer-clean-generic \ ++ mostlyclean mostlyclean-generic mostlyclean-libtool pdf pdf-am \ ++ ps ps-am tags uninstall uninstall-am uninstall-kernelHEADERS \ ++ uninstall-libzfsHEADERS ++ ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. ++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/zfs/linux/vfs_compat.h linux-3.2.33-go/include/zfs/linux/vfs_compat.h +--- linux-3.2.33-go.orig/include/zfs/linux/vfs_compat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/linux/vfs_compat.h 2012-11-16 23:25:34.345039382 +0100 +@@ -0,0 +1,144 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (C) 2011 Lawrence Livermore National Security, LLC. ++ */ ++ ++#ifndef _ZFS_VFS_H ++#define _ZFS_VFS_H ++ ++/* ++ * 2.6.28 API change, ++ * Added insert_inode_locked() helper function, prior to this most callers ++ * used insert_inode_hash(). The older method doesn't check for collisions ++ * in the inode_hashtable but it still acceptible for use. 
++ */ ++#ifndef HAVE_INSERT_INODE_LOCKED ++static inline int ++insert_inode_locked(struct inode *ip) ++{ ++ insert_inode_hash(ip); ++ return (0); ++} ++#endif /* HAVE_INSERT_INODE_LOCKED */ ++ ++/* ++ * 2.6.35 API change, ++ * Add truncate_setsize() if it is not exported by the Linux kernel. ++ * ++ * Truncate the inode and pages associated with the inode. The pages are ++ * unmapped and removed from cache. ++ */ ++#ifndef HAVE_TRUNCATE_SETSIZE ++static inline void ++truncate_setsize(struct inode *ip, loff_t new) ++{ ++ struct address_space *mapping = ip->i_mapping; ++ ++ i_size_write(ip, new); ++ ++ unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); ++ truncate_inode_pages(mapping, new); ++ unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); ++} ++#endif /* HAVE_TRUNCATE_SETSIZE */ ++ ++#if defined(HAVE_BDI) && !defined(HAVE_BDI_SETUP_AND_REGISTER) ++/* ++ * 2.6.34 API change, ++ * Add bdi_setup_and_register() function if not yet provided by kernel. ++ * It is used to quickly initialize and register a BDI for the filesystem. ++ */ ++extern atomic_long_t zfs_bdi_seq; ++ ++static inline int ++bdi_setup_and_register(struct backing_dev_info *bdi,char *name,unsigned int cap) ++{ ++ char tmp[32]; ++ int error; ++ ++ bdi->name = name; ++ bdi->capabilities = cap; ++ error = bdi_init(bdi); ++ if (error) ++ return (error); ++ ++ sprintf(tmp, "%.28s%s", name, "-%d"); ++ error = bdi_register(bdi, NULL, tmp, ++ atomic_long_inc_return(&zfs_bdi_seq)); ++ if (error) { ++ bdi_destroy(bdi); ++ return (error); ++ } ++ ++ return (error); ++} ++#endif /* HAVE_BDI && !HAVE_BDI_SETUP_AND_REGISTER */ ++ ++/* ++ * 3.2-rc1 API change, ++ * Add set_nlink() if it is not exported by the Linux kernel. ++ * ++ * i_nlink is read-only in Linux 3.2, but it can be set directly in ++ * earlier kernels. ++ */ ++#ifndef HAVE_SET_NLINK ++static inline void ++set_nlink(struct inode *inode, unsigned int nlink) ++{ ++ inode->i_nlink = nlink; ++} ++#endif /* HAVE_SET_NLINK */ ++ ++/* ++ * 3.3 API change, ++ * The VFS .create, .mkdir and .mknod callbacks were updated to take a ++ * umode_t type rather than an int. To cleanly handle both definitions ++ * the zpl_umode_t type is introduced and set accordingly. ++ */ ++#ifdef HAVE_MKDIR_UMODE_T ++typedef umode_t zpl_umode_t; ++#else ++typedef int zpl_umode_t; ++#endif ++ ++/* ++ * 3.5 API change, ++ * The clear_inode() function replaces end_writeback() and introduces an ++ * ordering change regarding when the inode_sync_wait() occurs. See the ++ * configure check in config/kernel-clear-inode.m4 for full details. ++ */ ++#if defined(HAVE_EVICT_INODE) && !defined(HAVE_CLEAR_INODE) ++#define clear_inode(ip) end_writeback(ip) ++#endif /* HAVE_EVICT_INODE && !HAVE_CLEAR_INODE */ ++ ++/* ++ * 3.6 API change, ++ * The sget() helper function now takes the mount flags as an argument. 
++ */
++#ifdef HAVE_5ARG_SGET
++#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, fl, mtd)
++#else
++#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, mtd)
++#endif /* HAVE_5ARG_SGET */
++
++#endif /* _ZFS_VFS_H */
+diff -uNr linux-3.2.33-go.orig/include/zfs/linux/xattr_compat.h linux-3.2.33-go/include/zfs/linux/xattr_compat.h
+--- linux-3.2.33-go.orig/include/zfs/linux/xattr_compat.h 1970-01-01 01:00:00.000000000 +0100
++++ linux-3.2.33-go/include/zfs/linux/xattr_compat.h 2012-11-16 23:25:34.345039382 +0100
+@@ -0,0 +1,95 @@
++/*
++ * CDDL HEADER START
++ *
++ * The contents of this file are subject to the terms of the
++ * Common Development and Distribution License (the "License").
++ * You may not use this file except in compliance with the License.
++ *
++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
++ * or http://www.opensolaris.org/os/licensing.
++ * See the License for the specific language governing permissions
++ * and limitations under the License.
++ *
++ * When distributing Covered Code, include this CDDL HEADER in each
++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
++ * If applicable, add the following below this CDDL HEADER, with the
++ * fields enclosed by brackets "[]" replaced with your own identifying
++ * information: Portions Copyright [yyyy] [name of copyright owner]
++ *
++ * CDDL HEADER END
++ */
++
++/*
++ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
++ */
++
++#ifndef _ZFS_XATTR_H
++#define _ZFS_XATTR_H
++
++/*
++ * 2.6.35 API change,
++ * The const keyword was added to the 'struct xattr_handler' in the
++ * generic Linux super_block structure. To handle this we define an
++ * appropriate xattr_handler_t typedef which can be used. This was
++ * the preferred solution because it keeps the code clean and readable.
++ */
++#ifdef HAVE_CONST_XATTR_HANDLER
++typedef const struct xattr_handler xattr_handler_t;
++#else
++typedef struct xattr_handler xattr_handler_t;
++#endif
++
++/*
++ * 2.6.33 API change,
++ * The xattr_handler->get() callback was changed to take a dentry
++ * instead of an inode, and a handler_flags argument was added.
++ */
++#ifdef HAVE_DENTRY_XATTR_GET
++#define ZPL_XATTR_GET_WRAPPER(fn) \
++static int \
++fn(struct dentry *dentry, const char *name, void *buffer, size_t size, \
++ int unused_handler_flags) \
++{ \
++ return __ ## fn(dentry->d_inode, name, buffer, size); \
++}
++#else
++#define ZPL_XATTR_GET_WRAPPER(fn) \
++static int \
++fn(struct inode *ip, const char *name, void *buffer, size_t size) \
++{ \
++ return __ ## fn(ip, name, buffer, size); \
++}
++#endif /* HAVE_DENTRY_XATTR_GET */
++
++/*
++ * 2.6.33 API change,
++ * The xattr_handler->set() callback was changed to take a dentry
++ * instead of an inode, and a handler_flags argument was added.
++ */ ++#ifdef HAVE_DENTRY_XATTR_SET ++#define ZPL_XATTR_SET_WRAPPER(fn) \ ++static int \ ++fn(struct dentry *dentry, const char *name, const void *buffer, \ ++ size_t size, int flags, int unused_handler_flags) \ ++{ \ ++ return __ ## fn(dentry->d_inode, name, buffer, size, flags); \ ++} ++#else ++#define ZPL_XATTR_SET_WRAPPER(fn) \ ++static int \ ++fn(struct inode *ip, const char *name, const void *buffer, \ ++ size_t size, int flags) \ ++{ \ ++ return __ ## fn(ip, name, buffer, size, flags); \ ++} ++#endif /* HAVE_DENTRY_XATTR_SET */ ++ ++#ifdef HAVE_6ARGS_SECURITY_INODE_INIT_SECURITY ++#define zpl_security_inode_init_security(ip, dip, qstr, nm, val, len) \ ++ security_inode_init_security(ip, dip, qstr, nm, val, len) ++#else ++#define zpl_security_inode_init_security(ip, dip, qstr, nm, val, len) \ ++ security_inode_init_security(ip, dip, nm, val, len) ++#endif /* HAVE_6ARGS_SECURITY_INODE_INIT_SECURITY */ ++ ++#endif /* _ZFS_XATTR_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/Makefile linux-3.2.33-go/include/zfs/Makefile +--- linux-3.2.33-go.orig/include/zfs/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/Makefile 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,841 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. ++# include/Makefile. Generated from Makefile.in by configure. ++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. 
++ ++ ++ ++ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/zfs ++pkgincludedir = $(includedir)/zfs ++pkglibdir = $(libdir)/zfs ++pkglibexecdir = $(libexecdir)/zfs ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = x86_64-unknown-linux-gnu ++host_triplet = x86_64-unknown-linux-gnu ++target_triplet = x86_64-unknown-linux-gnu ++subdir = include ++DIST_COMMON = $(am__kernel_HEADERS_DIST) $(am__libzfs_HEADERS_DIST) \ ++ $(srcdir)/Makefile.am $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = \ ++ $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \ ++ $(top_srcdir)/config/kernel-automount.m4 \ ++ $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \ ++ $(top_srcdir)/config/kernel-bdev-logical-size.m4 \ ++ $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \ ++ $(top_srcdir)/config/kernel-bdi.m4 \ ++ $(top_srcdir)/config/kernel-bio-empty-barrier.m4 \ ++ $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \ ++ $(top_srcdir)/config/kernel-bio-failfast.m4 \ ++ $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \ ++ $(top_srcdir)/config/kernel-blk-end-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-fetch-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-discard.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-flush.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \ ++ $(top_srcdir)/config/kernel-blk-requeue-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-pos.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get.m4 \ ++ $(top_srcdir)/config/kernel-check-disk-size-change.m4 \ ++ $(top_srcdir)/config/kernel-clear-inode.m4 \ ++ $(top_srcdir)/config/kernel-commit-metadata.m4 \ ++ $(top_srcdir)/config/kernel-create-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-d-make-root.m4 \ ++ $(top_srcdir)/config/kernel-d-obtain-alias.m4 \ ++ $(top_srcdir)/config/kernel-discard-granularity.m4 \ ++ $(top_srcdir)/config/kernel-elevator-change.m4 \ ++ $(top_srcdir)/config/kernel-encode-fh-inode.m4 \ ++ $(top_srcdir)/config/kernel-evict-inode.m4 \ ++ $(top_srcdir)/config/kernel-fallocate.m4 \ ++ $(top_srcdir)/config/kernel-fmode-t.m4 \ ++ $(top_srcdir)/config/kernel-fsync.m4 \ ++ $(top_srcdir)/config/kernel-get-disk-ro.m4 \ ++ $(top_srcdir)/config/kernel-get-gendisk.m4 \ ++ $(top_srcdir)/config/kernel-insert-inode-locked.m4 \ ++ $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \ ++ $(top_srcdir)/config/kernel-kobj-name-len.m4 \ ++ $(top_srcdir)/config/kernel-lookup-nameidata.m4 \ ++ 
$(top_srcdir)/config/kernel-mkdir-umode-t.m4 \ ++ $(top_srcdir)/config/kernel-mount-nodev.m4 \ ++ $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \ ++ $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \ ++ $(top_srcdir)/config/kernel-rq-is_sync.m4 \ ++ $(top_srcdir)/config/kernel-security-inode-init.m4 \ ++ $(top_srcdir)/config/kernel-set-nlink.m4 \ ++ $(top_srcdir)/config/kernel-sget-args.m4 \ ++ $(top_srcdir)/config/kernel-show-options.m4 \ ++ $(top_srcdir)/config/kernel-shrink.m4 \ ++ $(top_srcdir)/config/kernel-truncate-range.m4 \ ++ $(top_srcdir)/config/kernel-truncate-setsize.m4 \ ++ $(top_srcdir)/config/kernel-xattr-handler.m4 \ ++ $(top_srcdir)/config/kernel.m4 \ ++ $(top_srcdir)/config/user-arch.m4 \ ++ $(top_srcdir)/config/user-frame-larger-than.m4 \ ++ $(top_srcdir)/config/user-ioctl.m4 \ ++ $(top_srcdir)/config/user-libblkid.m4 \ ++ $(top_srcdir)/config/user-libuuid.m4 \ ++ $(top_srcdir)/config/user-nptl_guard_within_stack.m4 \ ++ $(top_srcdir)/config/user-selinux.m4 \ ++ $(top_srcdir)/config/user-udev.m4 \ ++ $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \ ++ $(top_srcdir)/config/zfs-build.m4 \ ++ $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/zfs_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_$(V)) ++am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_$(V)) ++am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ ++ html-recursive info-recursive install-data-recursive \ ++ install-dvi-recursive install-exec-recursive \ ++ install-html-recursive install-info-recursive \ ++ install-pdf-recursive install-ps-recursive install-recursive \ ++ installcheck-recursive installdirs-recursive pdf-recursive \ ++ ps-recursive uninstall-recursive ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++am__kernel_HEADERS_DIST = $(top_srcdir)/include/zfs_comutil.h \ ++ $(top_srcdir)/include/zfs_deleg.h \ ++ $(top_srcdir)/include/zfs_fletcher.h \ ++ $(top_srcdir)/include/zfs_namecheck.h \ ++ $(top_srcdir)/include/zfs_prop.h \ ++ $(top_srcdir)/include/zpios-ctl.h \ ++ $(top_srcdir)/include/zpios-internal.h ++am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; ++am__vpath_adj = case $$p in \ ++ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ ++ *) f=$$p;; \ ++ esac; ++am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; ++am__install_max = 40 ++am__nobase_strip_setup = \ ++ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` ++am__nobase_strip = \ ++ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" ++am__nobase_list = $(am__nobase_strip_setup); \ ++ for p in $$list; do echo "$$p $$p"; done | \ ++ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ ++ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ ++ if (++n[$$2] == $(am__install_max)) \ ++ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ ++ END { for (dir in files) print dir, files[dir] }' ++am__base_list = \ ++ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ ++ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' ++am__uninstall_files_from_dir = { \ ++ test -z "$$files" 
\ ++ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ ++ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ ++ $(am__cd) "$$dir" && rm -f $$files; }; \ ++ } ++am__installdirs = "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)" ++am__libzfs_HEADERS_DIST = $(top_srcdir)/include/zfs_comutil.h \ ++ $(top_srcdir)/include/zfs_deleg.h \ ++ $(top_srcdir)/include/zfs_fletcher.h \ ++ $(top_srcdir)/include/zfs_namecheck.h \ ++ $(top_srcdir)/include/zfs_prop.h \ ++ $(top_srcdir)/include/zpios-ctl.h \ ++ $(top_srcdir)/include/libnvpair.h \ ++ $(top_srcdir)/include/libuutil_common.h \ ++ $(top_srcdir)/include/libuutil.h \ ++ $(top_srcdir)/include/libuutil_impl.h \ ++ $(top_srcdir)/include/libzfs.h \ ++ $(top_srcdir)/include/libzfs_impl.h ++HEADERS = $(kernel_HEADERS) $(libzfs_HEADERS) ++RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ ++ distclean-recursive maintainer-clean-recursive ++AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \ ++ $(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \ ++ distdir ++ETAGS = etags ++CTAGS = ctags ++DIST_SUBDIRS = $(SUBDIRS) ++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++am__relativize = \ ++ dir0=`pwd`; \ ++ sed_first='s,^\([^/]*\)/.*$$,\1,'; \ ++ sed_rest='s,^[^/]*/*,,'; \ ++ sed_last='s,^.*/\([^/]*\)$$,\1,'; \ ++ sed_butlast='s,/*[^/]*$$,,'; \ ++ while test -n "$$dir1"; do \ ++ first=`echo "$$dir1" | sed -e "$$sed_first"`; \ ++ if test "$$first" != "."; then \ ++ if test "$$first" = ".."; then \ ++ dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ ++ dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ ++ else \ ++ first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ ++ if test "$$first2" = "$$first"; then \ ++ dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ ++ else \ ++ dir2="../$$dir2"; \ ++ fi; \ ++ dir0="$$dir0"/"$$first"; \ ++ fi; \ ++ fi; \ ++ dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ ++ done; \ ++ reldir="$$dir2" ++ACLOCAL = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run aclocal-1.11 ++ALIEN = alien ++ALIEN_VERSION = ++AMTAR = $${TAR-tar} ++AM_DEFAULT_VERBOSITY = 1 ++AR = ar ++AUTOCONF = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run autoconf ++AUTOHEADER = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run autoheader ++AUTOMAKE = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run automake-1.11 ++AWK = gawk ++CC = gcc ++CCAS = gcc ++CCASDEPMODE = depmode=gcc3 ++CCASFLAGS = -g -O2 ++CCDEPMODE = depmode=gcc3 ++CFLAGS = -g -O2 ++CPP = gcc -E ++CPPFLAGS = ++CYGPATH_W = echo ++DEBUG_CFLAGS = -DNDEBUG ++DEBUG_DMU_TX = _without_debug_dmu_tx ++DEBUG_STACKFLAGS = ++DEBUG_ZFS = _without_debug ++DEFAULT_INIT_DIR = ${prefix}/etc/init.d ++DEFAULT_INIT_SCRIPT = gentoo ++DEFAULT_PACKAGE = tgz ++DEFS = -DHAVE_CONFIG_H ++DEPDIR = .deps ++DLLTOOL = false ++DPKG = dpkg ++DPKGBUILD = dpkg-buildpackage ++DPKGBUILD_VERSION = ++DPKG_VERSION = ++DSYMUTIL = ++DUMPBIN = ++ECHO_C = ++ECHO_N = -n ++ECHO_T = ++EGREP = /bin/grep -E ++EXEEXT = ++FGREP = /bin/grep -F ++FRAME_LARGER_THAN = -Wframe-larger-than=1024 ++GREP = /bin/grep ++HAVE_ALIEN = no ++HAVE_DPKG = no ++HAVE_DPKGBUILD = no ++HAVE_MAKEPKG = ++HAVE_PACMAN = ++HAVE_RPM = yes ++HAVE_RPMBUILD = yes ++INSTALL = /usr/bin/install -c ++INSTALL_DATA = ${INSTALL} -m 644 ++INSTALL_PROGRAM = ${INSTALL} ++INSTALL_SCRIPT = ${INSTALL} ++INSTALL_STRIP_PROGRAM = $(install_sh) -c -s ++KERNELCPPFLAGS = -Wno-unused-but-set-variable -DHAVE_SPL -D_KERNEL -DTEXT_DOMAIN=\"zfs-linux-kernel\" -DNDEBUG ++KERNELMAKE_PARAMS = O=/usr/src/linux-3.6.0-sabayon ++LD = 
/usr/x86_64-pc-linux-gnu/bin/ld -m elf_x86_64 ++LDFLAGS = ++LIBBLKID = ++LIBOBJS = ++LIBS = -luuid -luuid -lz -lz -lz ++LIBSELINUX = ++LIBTOOL = $(SHELL) $(top_builddir)/libtool ++LIBUUID = -luuid ++LINUX = /usr/src/linux-3.2.33-go ++LINUX_OBJ = /usr/src/linux-3.6.0-sabayon ++LINUX_SYMBOLS = NONE ++LINUX_VERSION = 3.6.0-sabayon ++LIPO = ++LN_S = ln -s ++LTLIBOBJS = ++MAINT = # ++MAKEINFO = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run makeinfo ++MAKEPKG = ++MAKEPKG_VERSION = ++MANIFEST_TOOL = : ++MKDIR_P = /bin/mkdir -p ++NM = /usr/bin/nm -B ++NMEDIT = ++NO_UNUSED_BUT_SET_VARIABLE = -Wno-unused-but-set-variable ++OBJDUMP = objdump ++OBJEXT = o ++OTOOL = ++OTOOL64 = ++PACKAGE = zfs ++PACKAGE_BUGREPORT = ++PACKAGE_NAME = ++PACKAGE_STRING = ++PACKAGE_TARNAME = ++PACKAGE_URL = ++PACKAGE_VERSION = ++PACMAN = ++PACMAN_VERSION = ++PATH_SEPARATOR = : ++RANLIB = ranlib ++RPM = rpm ++RPMBUILD = rpmbuild ++RPMBUILD_VERSION = 4.10.0 ++RPM_VERSION = 4.10.0 ++SED = /bin/sed ++SET_MAKE = ++SHELL = /bin/sh ++SPL = /usr/src/linux-3.2.33-go ++SPL_OBJ = /usr/src/linux-3.2.33-go ++SPL_SYMBOLS = NONE ++SPL_VERSION = 0.6.0-rc12 ++STRIP = strip ++TARGET_ASM_DIR = asm-x86_64 ++VENDOR = gentoo ++VERSION = 0.6.0 ++ZFS_CONFIG = all ++ZFS_META_ALIAS = zfs-0.6.0-rc12 ++ZFS_META_AUTHOR = Sun Microsystems/Oracle, Lawrence Livermore National Laboratory ++ZFS_META_DATA = ++ZFS_META_LICENSE = CDDL ++ZFS_META_LT_AGE = ++ZFS_META_LT_CURRENT = ++ZFS_META_LT_REVISION = ++ZFS_META_NAME = zfs ++ZFS_META_RELEASE = rc12 ++ZFS_META_VERSION = 0.6.0 ++ZLIB = -lz ++abs_builddir = /root/zfs-0.6.0-rc12/include ++abs_srcdir = /root/zfs-0.6.0-rc12/include ++abs_top_builddir = /root/zfs-0.6.0-rc12 ++abs_top_srcdir = /root/zfs-0.6.0-rc12 ++ac_ct_AR = ar ++ac_ct_CC = gcc ++ac_ct_DUMPBIN = ++am__include = include ++am__leading_dot = . ++am__quote = ++am__tar = $${TAR-tar} chof - "$$tardir" ++am__untar = $${TAR-tar} xf - ++bindir = ${exec_prefix}/bin ++build = x86_64-unknown-linux-gnu ++build_alias = ++build_cpu = x86_64 ++build_os = linux-gnu ++build_vendor = unknown ++builddir = . ++datadir = ${datarootdir} ++datarootdir = ${prefix}/share ++docdir = ${datarootdir}/doc/${PACKAGE} ++dvidir = ${docdir} ++exec_prefix = ${prefix} ++host = x86_64-unknown-linux-gnu ++host_alias = ++host_cpu = x86_64 ++host_os = linux-gnu ++host_vendor = unknown ++htmldir = ${docdir} ++includedir = ${prefix}/include ++infodir = ${datarootdir}/info ++install_sh = ${SHELL} /root/zfs-0.6.0-rc12/config/install-sh ++libdir = ${exec_prefix}/lib ++libexecdir = ${exec_prefix}/libexec ++localedir = ${datarootdir}/locale ++localstatedir = ${prefix}/var ++mandir = ${datarootdir}/man ++mkdir_p = /bin/mkdir -p ++oldincludedir = /usr/include ++pdfdir = ${docdir} ++prefix = /usr/local ++program_transform_name = s,x,x, ++psdir = ${docdir} ++sbindir = ${exec_prefix}/sbin ++sharedstatedir = ${prefix}/com ++srcdir = . ++sysconfdir = ${prefix}/etc ++target = x86_64-unknown-linux-gnu ++target_alias = ++target_cpu = x86_64 ++target_os = linux-gnu ++target_vendor = unknown ++top_build_prefix = ../ ++top_builddir = .. ++top_srcdir = .. 
++udevdir = ${exec_prefix}/lib/udev ++udevruledir = ${udevdir}/rules.d ++SUBDIRS = linux sys ++COMMON_H = \ ++ $(top_srcdir)/include/zfs_comutil.h \ ++ $(top_srcdir)/include/zfs_deleg.h \ ++ $(top_srcdir)/include/zfs_fletcher.h \ ++ $(top_srcdir)/include/zfs_namecheck.h \ ++ $(top_srcdir)/include/zfs_prop.h \ ++ $(top_srcdir)/include/zpios-ctl.h ++ ++KERNEL_H = \ ++ $(top_srcdir)/include/zpios-internal.h ++ ++USER_H = \ ++ $(top_srcdir)/include/libnvpair.h \ ++ $(top_srcdir)/include/libuutil_common.h \ ++ $(top_srcdir)/include/libuutil.h \ ++ $(top_srcdir)/include/libuutil_impl.h \ ++ $(top_srcdir)/include/libzfs.h \ ++ $(top_srcdir)/include/libzfs_impl.h ++ ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++libzfsdir = $(includedir)/libzfs ++libzfs_HEADERS = $(COMMON_H) $(USER_H) ++#kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION) ++#kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++all: all-recursive ++ ++.SUFFIXES: ++$(srcdir)/Makefile.in: # $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: # $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): # $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++install-kernelHEADERS: $(kernel_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(kerneldir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(kerneldir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(kerneldir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(kerneldir)" || exit $$?; \ ++ done ++ ++uninstall-kernelHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(kerneldir)'; $(am__uninstall_files_from_dir) ++install-libzfsHEADERS: $(libzfs_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(libzfsdir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(libzfsdir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " 
$(INSTALL_HEADER) $$files '$(DESTDIR)$(libzfsdir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(libzfsdir)" || exit $$?; \ ++ done ++ ++uninstall-libzfsHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(libzfsdir)'; $(am__uninstall_files_from_dir) ++ ++# This directory's subdirectories are mostly independent; you can cd ++# into them and run `make' without going through this Makefile. ++# To change the values of `make' variables: instead of editing Makefiles, ++# (1) if the variable is set in `config.status', edit `config.status' ++# (which will cause the Makefiles to be regenerated when you run `make'); ++# (2) otherwise, pass the desired values on the `make' command line. ++$(RECURSIVE_TARGETS): ++ @fail= failcom='exit 1'; \ ++ for f in x $$MAKEFLAGS; do \ ++ case $$f in \ ++ *=* | --[!k]*);; \ ++ *k*) failcom='fail=yes';; \ ++ esac; \ ++ done; \ ++ dot_seen=no; \ ++ target=`echo $@ | sed s/-recursive//`; \ ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ echo "Making $$target in $$subdir"; \ ++ if test "$$subdir" = "."; then \ ++ dot_seen=yes; \ ++ local_target="$$target-am"; \ ++ else \ ++ local_target="$$target"; \ ++ fi; \ ++ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ ++ || eval $$failcom; \ ++ done; \ ++ if test "$$dot_seen" = "no"; then \ ++ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ ++ fi; test -z "$$fail" ++ ++$(RECURSIVE_CLEAN_TARGETS): ++ @fail= failcom='exit 1'; \ ++ for f in x $$MAKEFLAGS; do \ ++ case $$f in \ ++ *=* | --[!k]*);; \ ++ *k*) failcom='fail=yes';; \ ++ esac; \ ++ done; \ ++ dot_seen=no; \ ++ case "$@" in \ ++ distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ ++ *) list='$(SUBDIRS)' ;; \ ++ esac; \ ++ rev=''; for subdir in $$list; do \ ++ if test "$$subdir" = "."; then :; else \ ++ rev="$$subdir $$rev"; \ ++ fi; \ ++ done; \ ++ rev="$$rev ."; \ ++ target=`echo $@ | sed s/-recursive//`; \ ++ for subdir in $$rev; do \ ++ echo "Making $$target in $$subdir"; \ ++ if test "$$subdir" = "."; then \ ++ local_target="$$target-am"; \ ++ else \ ++ local_target="$$target"; \ ++ fi; \ ++ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ ++ || eval $$failcom; \ ++ done && test -z "$$fail" ++tags-recursive: ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ ++ done ++ctags-recursive: ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ ++ done ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ ++ include_option=--etags-include; \ ++ empty_fix=.; \ ++ else \ ++ include_option=--include; \ ++ empty_fix=; \ ++ fi; \ ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ if test "$$subdir" = .; then :; else \ ++ test ! 
-f $$subdir/TAGS || \ ++ set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ ++ fi; \ ++ done; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: CTAGS ++CTAGS: ctags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++ @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ ++ if test "$$subdir" = .; then :; else \ ++ $(am__make_dryrun) \ ++ || test -d "$(distdir)/$$subdir" \ ++ || $(MKDIR_P) "$(distdir)/$$subdir" \ ++ || exit 1; \ ++ dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ ++ $(am__relativize); \ ++ new_distdir=$$reldir; \ ++ dir1=$$subdir; dir2="$(top_distdir)"; \ ++ $(am__relativize); \ ++ new_top_distdir=$$reldir; \ ++ echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ ++ echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ ++ ($(am__cd) $$subdir && \ ++ $(MAKE) $(AM_MAKEFLAGS) \ ++ top_distdir="$$new_top_distdir" \ ++ distdir="$$new_distdir" \ ++ am__remove_distdir=: \ ++ am__skip_length_check=: \ ++ am__skip_mode_fix=: \ ++ distdir) \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-recursive ++all-am: Makefile $(HEADERS) ++installdirs: installdirs-recursive ++installdirs-am: ++ for dir in "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)"; do \ ++ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ ++ done ++install: install-recursive ++install-exec: install-exec-recursive ++install-data: install-data-recursive ++uninstall: uninstall-recursive ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-recursive ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." 
++clean: clean-recursive ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-recursive ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-recursive ++ ++dvi-am: ++ ++html: html-recursive ++ ++html-am: ++ ++info: info-recursive ++ ++info-am: ++ ++install-data-am: install-kernelHEADERS install-libzfsHEADERS ++ ++install-dvi: install-dvi-recursive ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-recursive ++ ++install-html-am: ++ ++install-info: install-info-recursive ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-recursive ++ ++install-pdf-am: ++ ++install-ps: install-ps-recursive ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-recursive ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-recursive ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-recursive ++ ++pdf-am: ++ ++ps: ps-recursive ++ ++ps-am: ++ ++uninstall-am: uninstall-kernelHEADERS uninstall-libzfsHEADERS ++ ++.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \ ++ install-am install-strip tags-recursive ++ ++.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \ ++ all all-am check check-am clean clean-generic clean-libtool \ ++ ctags ctags-recursive distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-dvi install-dvi-am install-exec \ ++ install-exec-am install-html install-html-am install-info \ ++ install-info-am install-kernelHEADERS install-libzfsHEADERS \ ++ install-man install-pdf install-pdf-am install-ps \ ++ install-ps-am install-strip installcheck installcheck-am \ ++ installdirs installdirs-am maintainer-clean \ ++ maintainer-clean-generic mostlyclean mostlyclean-generic \ ++ mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \ ++ uninstall uninstall-am uninstall-kernelHEADERS \ ++ uninstall-libzfsHEADERS ++ ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. 
++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/zfs/Makefile.am linux-3.2.33-go/include/zfs/Makefile.am +--- linux-3.2.33-go.orig/include/zfs/Makefile.am 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/Makefile.am 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,32 @@ ++SUBDIRS = linux sys ++ ++COMMON_H = \ ++ $(top_srcdir)/include/zfs_comutil.h \ ++ $(top_srcdir)/include/zfs_deleg.h \ ++ $(top_srcdir)/include/zfs_fletcher.h \ ++ $(top_srcdir)/include/zfs_namecheck.h \ ++ $(top_srcdir)/include/zfs_prop.h \ ++ $(top_srcdir)/include/zpios-ctl.h ++ ++KERNEL_H = \ ++ $(top_srcdir)/include/zpios-internal.h ++ ++USER_H = \ ++ $(top_srcdir)/include/libnvpair.h \ ++ $(top_srcdir)/include/libuutil_common.h \ ++ $(top_srcdir)/include/libuutil.h \ ++ $(top_srcdir)/include/libuutil_impl.h \ ++ $(top_srcdir)/include/libzfs.h \ ++ $(top_srcdir)/include/libzfs_impl.h ++ ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++ ++if CONFIG_USER ++libzfsdir = $(includedir)/libzfs ++libzfs_HEADERS = $(COMMON_H) $(USER_H) ++endif ++ ++if CONFIG_KERNEL ++kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION) ++kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++endif +diff -uNr linux-3.2.33-go.orig/include/zfs/Makefile.in linux-3.2.33-go/include/zfs/Makefile.in +--- linux-3.2.33-go.orig/include/zfs/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/Makefile.in 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,841 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. ++# @configure_input@ ++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. 
++ ++@SET_MAKE@ ++ ++VPATH = @srcdir@ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/@PACKAGE@ ++pkgincludedir = $(includedir)/@PACKAGE@ ++pkglibdir = $(libdir)/@PACKAGE@ ++pkglibexecdir = $(libexecdir)/@PACKAGE@ ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = @build@ ++host_triplet = @host@ ++target_triplet = @target@ ++subdir = include ++DIST_COMMON = $(am__kernel_HEADERS_DIST) $(am__libzfs_HEADERS_DIST) \ ++ $(srcdir)/Makefile.am $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = \ ++ $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \ ++ $(top_srcdir)/config/kernel-automount.m4 \ ++ $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \ ++ $(top_srcdir)/config/kernel-bdev-logical-size.m4 \ ++ $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \ ++ $(top_srcdir)/config/kernel-bdi.m4 \ ++ $(top_srcdir)/config/kernel-bio-empty-barrier.m4 \ ++ $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \ ++ $(top_srcdir)/config/kernel-bio-failfast.m4 \ ++ $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \ ++ $(top_srcdir)/config/kernel-blk-end-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-fetch-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-discard.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-flush.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \ ++ $(top_srcdir)/config/kernel-blk-requeue-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-pos.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get.m4 \ ++ $(top_srcdir)/config/kernel-check-disk-size-change.m4 \ ++ $(top_srcdir)/config/kernel-clear-inode.m4 \ ++ $(top_srcdir)/config/kernel-commit-metadata.m4 \ ++ $(top_srcdir)/config/kernel-create-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-d-make-root.m4 \ ++ $(top_srcdir)/config/kernel-d-obtain-alias.m4 \ ++ $(top_srcdir)/config/kernel-discard-granularity.m4 \ ++ $(top_srcdir)/config/kernel-elevator-change.m4 \ ++ $(top_srcdir)/config/kernel-encode-fh-inode.m4 \ ++ $(top_srcdir)/config/kernel-evict-inode.m4 \ ++ $(top_srcdir)/config/kernel-fallocate.m4 \ ++ $(top_srcdir)/config/kernel-fmode-t.m4 \ ++ $(top_srcdir)/config/kernel-fsync.m4 \ ++ $(top_srcdir)/config/kernel-get-disk-ro.m4 \ ++ $(top_srcdir)/config/kernel-get-gendisk.m4 \ ++ $(top_srcdir)/config/kernel-insert-inode-locked.m4 \ ++ $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \ ++ $(top_srcdir)/config/kernel-kobj-name-len.m4 \ ++ $(top_srcdir)/config/kernel-lookup-nameidata.m4 \ ++ 
$(top_srcdir)/config/kernel-mkdir-umode-t.m4 \ ++ $(top_srcdir)/config/kernel-mount-nodev.m4 \ ++ $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \ ++ $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \ ++ $(top_srcdir)/config/kernel-rq-is_sync.m4 \ ++ $(top_srcdir)/config/kernel-security-inode-init.m4 \ ++ $(top_srcdir)/config/kernel-set-nlink.m4 \ ++ $(top_srcdir)/config/kernel-sget-args.m4 \ ++ $(top_srcdir)/config/kernel-show-options.m4 \ ++ $(top_srcdir)/config/kernel-shrink.m4 \ ++ $(top_srcdir)/config/kernel-truncate-range.m4 \ ++ $(top_srcdir)/config/kernel-truncate-setsize.m4 \ ++ $(top_srcdir)/config/kernel-xattr-handler.m4 \ ++ $(top_srcdir)/config/kernel.m4 \ ++ $(top_srcdir)/config/user-arch.m4 \ ++ $(top_srcdir)/config/user-frame-larger-than.m4 \ ++ $(top_srcdir)/config/user-ioctl.m4 \ ++ $(top_srcdir)/config/user-libblkid.m4 \ ++ $(top_srcdir)/config/user-libuuid.m4 \ ++ $(top_srcdir)/config/user-nptl_guard_within_stack.m4 \ ++ $(top_srcdir)/config/user-selinux.m4 \ ++ $(top_srcdir)/config/user-udev.m4 \ ++ $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \ ++ $(top_srcdir)/config/zfs-build.m4 \ ++ $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/zfs_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_@AM_V@) ++am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_@AM_V@) ++am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ ++ html-recursive info-recursive install-data-recursive \ ++ install-dvi-recursive install-exec-recursive \ ++ install-html-recursive install-info-recursive \ ++ install-pdf-recursive install-ps-recursive install-recursive \ ++ installcheck-recursive installdirs-recursive pdf-recursive \ ++ ps-recursive uninstall-recursive ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++am__kernel_HEADERS_DIST = $(top_srcdir)/include/zfs_comutil.h \ ++ $(top_srcdir)/include/zfs_deleg.h \ ++ $(top_srcdir)/include/zfs_fletcher.h \ ++ $(top_srcdir)/include/zfs_namecheck.h \ ++ $(top_srcdir)/include/zfs_prop.h \ ++ $(top_srcdir)/include/zpios-ctl.h \ ++ $(top_srcdir)/include/zpios-internal.h ++am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; ++am__vpath_adj = case $$p in \ ++ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ ++ *) f=$$p;; \ ++ esac; ++am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; ++am__install_max = 40 ++am__nobase_strip_setup = \ ++ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` ++am__nobase_strip = \ ++ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" ++am__nobase_list = $(am__nobase_strip_setup); \ ++ for p in $$list; do echo "$$p $$p"; done | \ ++ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ ++ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ ++ if (++n[$$2] == $(am__install_max)) \ ++ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ ++ END { for (dir in files) print dir, files[dir] }' ++am__base_list = \ ++ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ ++ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' ++am__uninstall_files_from_dir = { \ ++ test -z "$$files" \ ++ || { 
test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ ++ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ ++ $(am__cd) "$$dir" && rm -f $$files; }; \ ++ } ++am__installdirs = "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)" ++am__libzfs_HEADERS_DIST = $(top_srcdir)/include/zfs_comutil.h \ ++ $(top_srcdir)/include/zfs_deleg.h \ ++ $(top_srcdir)/include/zfs_fletcher.h \ ++ $(top_srcdir)/include/zfs_namecheck.h \ ++ $(top_srcdir)/include/zfs_prop.h \ ++ $(top_srcdir)/include/zpios-ctl.h \ ++ $(top_srcdir)/include/libnvpair.h \ ++ $(top_srcdir)/include/libuutil_common.h \ ++ $(top_srcdir)/include/libuutil.h \ ++ $(top_srcdir)/include/libuutil_impl.h \ ++ $(top_srcdir)/include/libzfs.h \ ++ $(top_srcdir)/include/libzfs_impl.h ++HEADERS = $(kernel_HEADERS) $(libzfs_HEADERS) ++RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ ++ distclean-recursive maintainer-clean-recursive ++AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \ ++ $(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \ ++ distdir ++ETAGS = etags ++CTAGS = ctags ++DIST_SUBDIRS = $(SUBDIRS) ++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++am__relativize = \ ++ dir0=`pwd`; \ ++ sed_first='s,^\([^/]*\)/.*$$,\1,'; \ ++ sed_rest='s,^[^/]*/*,,'; \ ++ sed_last='s,^.*/\([^/]*\)$$,\1,'; \ ++ sed_butlast='s,/*[^/]*$$,,'; \ ++ while test -n "$$dir1"; do \ ++ first=`echo "$$dir1" | sed -e "$$sed_first"`; \ ++ if test "$$first" != "."; then \ ++ if test "$$first" = ".."; then \ ++ dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ ++ dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ ++ else \ ++ first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ ++ if test "$$first2" = "$$first"; then \ ++ dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ ++ else \ ++ dir2="../$$dir2"; \ ++ fi; \ ++ dir0="$$dir0"/"$$first"; \ ++ fi; \ ++ fi; \ ++ dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ ++ done; \ ++ reldir="$$dir2" ++ACLOCAL = @ACLOCAL@ ++ALIEN = @ALIEN@ ++ALIEN_VERSION = @ALIEN_VERSION@ ++AMTAR = @AMTAR@ ++AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ++AR = @AR@ ++AUTOCONF = @AUTOCONF@ ++AUTOHEADER = @AUTOHEADER@ ++AUTOMAKE = @AUTOMAKE@ ++AWK = @AWK@ ++CC = @CC@ ++CCAS = @CCAS@ ++CCASDEPMODE = @CCASDEPMODE@ ++CCASFLAGS = @CCASFLAGS@ ++CCDEPMODE = @CCDEPMODE@ ++CFLAGS = @CFLAGS@ ++CPP = @CPP@ ++CPPFLAGS = @CPPFLAGS@ ++CYGPATH_W = @CYGPATH_W@ ++DEBUG_CFLAGS = @DEBUG_CFLAGS@ ++DEBUG_DMU_TX = @DEBUG_DMU_TX@ ++DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@ ++DEBUG_ZFS = @DEBUG_ZFS@ ++DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@ ++DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@ ++DEFAULT_PACKAGE = @DEFAULT_PACKAGE@ ++DEFS = @DEFS@ ++DEPDIR = @DEPDIR@ ++DLLTOOL = @DLLTOOL@ ++DPKG = @DPKG@ ++DPKGBUILD = @DPKGBUILD@ ++DPKGBUILD_VERSION = @DPKGBUILD_VERSION@ ++DPKG_VERSION = @DPKG_VERSION@ ++DSYMUTIL = @DSYMUTIL@ ++DUMPBIN = @DUMPBIN@ ++ECHO_C = @ECHO_C@ ++ECHO_N = @ECHO_N@ ++ECHO_T = @ECHO_T@ ++EGREP = @EGREP@ ++EXEEXT = @EXEEXT@ ++FGREP = @FGREP@ ++FRAME_LARGER_THAN = @FRAME_LARGER_THAN@ ++GREP = @GREP@ ++HAVE_ALIEN = @HAVE_ALIEN@ ++HAVE_DPKG = @HAVE_DPKG@ ++HAVE_DPKGBUILD = @HAVE_DPKGBUILD@ ++HAVE_MAKEPKG = @HAVE_MAKEPKG@ ++HAVE_PACMAN = @HAVE_PACMAN@ ++HAVE_RPM = @HAVE_RPM@ ++HAVE_RPMBUILD = @HAVE_RPMBUILD@ ++INSTALL = @INSTALL@ ++INSTALL_DATA = @INSTALL_DATA@ ++INSTALL_PROGRAM = @INSTALL_PROGRAM@ ++INSTALL_SCRIPT = @INSTALL_SCRIPT@ ++INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ ++KERNELCPPFLAGS = @KERNELCPPFLAGS@ ++KERNELMAKE_PARAMS = @KERNELMAKE_PARAMS@ ++LD = @LD@ ++LDFLAGS = @LDFLAGS@ ++LIBBLKID = @LIBBLKID@ 
++LIBOBJS = @LIBOBJS@ ++LIBS = @LIBS@ ++LIBSELINUX = @LIBSELINUX@ ++LIBTOOL = @LIBTOOL@ ++LIBUUID = @LIBUUID@ ++LINUX = @LINUX@ ++LINUX_OBJ = @LINUX_OBJ@ ++LINUX_SYMBOLS = @LINUX_SYMBOLS@ ++LINUX_VERSION = @LINUX_VERSION@ ++LIPO = @LIPO@ ++LN_S = @LN_S@ ++LTLIBOBJS = @LTLIBOBJS@ ++MAINT = @MAINT@ ++MAKEINFO = @MAKEINFO@ ++MAKEPKG = @MAKEPKG@ ++MAKEPKG_VERSION = @MAKEPKG_VERSION@ ++MANIFEST_TOOL = @MANIFEST_TOOL@ ++MKDIR_P = @MKDIR_P@ ++NM = @NM@ ++NMEDIT = @NMEDIT@ ++NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@ ++OBJDUMP = @OBJDUMP@ ++OBJEXT = @OBJEXT@ ++OTOOL = @OTOOL@ ++OTOOL64 = @OTOOL64@ ++PACKAGE = @PACKAGE@ ++PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ ++PACKAGE_NAME = @PACKAGE_NAME@ ++PACKAGE_STRING = @PACKAGE_STRING@ ++PACKAGE_TARNAME = @PACKAGE_TARNAME@ ++PACKAGE_URL = @PACKAGE_URL@ ++PACKAGE_VERSION = @PACKAGE_VERSION@ ++PACMAN = @PACMAN@ ++PACMAN_VERSION = @PACMAN_VERSION@ ++PATH_SEPARATOR = @PATH_SEPARATOR@ ++RANLIB = @RANLIB@ ++RPM = @RPM@ ++RPMBUILD = @RPMBUILD@ ++RPMBUILD_VERSION = @RPMBUILD_VERSION@ ++RPM_VERSION = @RPM_VERSION@ ++SED = @SED@ ++SET_MAKE = @SET_MAKE@ ++SHELL = @SHELL@ ++SPL = @SPL@ ++SPL_OBJ = @SPL_OBJ@ ++SPL_SYMBOLS = @SPL_SYMBOLS@ ++SPL_VERSION = @SPL_VERSION@ ++STRIP = @STRIP@ ++TARGET_ASM_DIR = @TARGET_ASM_DIR@ ++VENDOR = @VENDOR@ ++VERSION = @VERSION@ ++ZFS_CONFIG = @ZFS_CONFIG@ ++ZFS_META_ALIAS = @ZFS_META_ALIAS@ ++ZFS_META_AUTHOR = @ZFS_META_AUTHOR@ ++ZFS_META_DATA = @ZFS_META_DATA@ ++ZFS_META_LICENSE = @ZFS_META_LICENSE@ ++ZFS_META_LT_AGE = @ZFS_META_LT_AGE@ ++ZFS_META_LT_CURRENT = @ZFS_META_LT_CURRENT@ ++ZFS_META_LT_REVISION = @ZFS_META_LT_REVISION@ ++ZFS_META_NAME = @ZFS_META_NAME@ ++ZFS_META_RELEASE = @ZFS_META_RELEASE@ ++ZFS_META_VERSION = @ZFS_META_VERSION@ ++ZLIB = @ZLIB@ ++abs_builddir = @abs_builddir@ ++abs_srcdir = @abs_srcdir@ ++abs_top_builddir = @abs_top_builddir@ ++abs_top_srcdir = @abs_top_srcdir@ ++ac_ct_AR = @ac_ct_AR@ ++ac_ct_CC = @ac_ct_CC@ ++ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ ++am__include = @am__include@ ++am__leading_dot = @am__leading_dot@ ++am__quote = @am__quote@ ++am__tar = @am__tar@ ++am__untar = @am__untar@ ++bindir = @bindir@ ++build = @build@ ++build_alias = @build_alias@ ++build_cpu = @build_cpu@ ++build_os = @build_os@ ++build_vendor = @build_vendor@ ++builddir = @builddir@ ++datadir = @datadir@ ++datarootdir = @datarootdir@ ++docdir = @docdir@ ++dvidir = @dvidir@ ++exec_prefix = @exec_prefix@ ++host = @host@ ++host_alias = @host_alias@ ++host_cpu = @host_cpu@ ++host_os = @host_os@ ++host_vendor = @host_vendor@ ++htmldir = @htmldir@ ++includedir = @includedir@ ++infodir = @infodir@ ++install_sh = @install_sh@ ++libdir = @libdir@ ++libexecdir = @libexecdir@ ++localedir = @localedir@ ++localstatedir = @localstatedir@ ++mandir = @mandir@ ++mkdir_p = @mkdir_p@ ++oldincludedir = @oldincludedir@ ++pdfdir = @pdfdir@ ++prefix = @prefix@ ++program_transform_name = @program_transform_name@ ++psdir = @psdir@ ++sbindir = @sbindir@ ++sharedstatedir = @sharedstatedir@ ++srcdir = @srcdir@ ++sysconfdir = @sysconfdir@ ++target = @target@ ++target_alias = @target_alias@ ++target_cpu = @target_cpu@ ++target_os = @target_os@ ++target_vendor = @target_vendor@ ++top_build_prefix = @top_build_prefix@ ++top_builddir = @top_builddir@ ++top_srcdir = @top_srcdir@ ++udevdir = @udevdir@ ++udevruledir = @udevruledir@ ++SUBDIRS = linux sys ++COMMON_H = \ ++ $(top_srcdir)/include/zfs_comutil.h \ ++ $(top_srcdir)/include/zfs_deleg.h \ ++ $(top_srcdir)/include/zfs_fletcher.h \ ++ $(top_srcdir)/include/zfs_namecheck.h \ ++ 
$(top_srcdir)/include/zfs_prop.h \ ++ $(top_srcdir)/include/zpios-ctl.h ++ ++KERNEL_H = \ ++ $(top_srcdir)/include/zpios-internal.h ++ ++USER_H = \ ++ $(top_srcdir)/include/libnvpair.h \ ++ $(top_srcdir)/include/libuutil_common.h \ ++ $(top_srcdir)/include/libuutil.h \ ++ $(top_srcdir)/include/libuutil_impl.h \ ++ $(top_srcdir)/include/libzfs.h \ ++ $(top_srcdir)/include/libzfs_impl.h ++ ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++@CONFIG_USER_TRUE@libzfsdir = $(includedir)/libzfs ++@CONFIG_USER_TRUE@libzfs_HEADERS = $(COMMON_H) $(USER_H) ++@CONFIG_KERNEL_TRUE@kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION) ++@CONFIG_KERNEL_TRUE@kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++all: all-recursive ++ ++.SUFFIXES: ++$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++install-kernelHEADERS: $(kernel_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(kerneldir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(kerneldir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(kerneldir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(kerneldir)" || exit $$?; \ ++ done ++ ++uninstall-kernelHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(kerneldir)'; $(am__uninstall_files_from_dir) ++install-libzfsHEADERS: $(libzfs_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(libzfsdir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(libzfsdir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libzfsdir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(libzfsdir)" || exit $$?; \ ++ done ++ 
++uninstall-libzfsHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(libzfsdir)'; $(am__uninstall_files_from_dir) ++ ++# This directory's subdirectories are mostly independent; you can cd ++# into them and run `make' without going through this Makefile. ++# To change the values of `make' variables: instead of editing Makefiles, ++# (1) if the variable is set in `config.status', edit `config.status' ++# (which will cause the Makefiles to be regenerated when you run `make'); ++# (2) otherwise, pass the desired values on the `make' command line. ++$(RECURSIVE_TARGETS): ++ @fail= failcom='exit 1'; \ ++ for f in x $$MAKEFLAGS; do \ ++ case $$f in \ ++ *=* | --[!k]*);; \ ++ *k*) failcom='fail=yes';; \ ++ esac; \ ++ done; \ ++ dot_seen=no; \ ++ target=`echo $@ | sed s/-recursive//`; \ ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ echo "Making $$target in $$subdir"; \ ++ if test "$$subdir" = "."; then \ ++ dot_seen=yes; \ ++ local_target="$$target-am"; \ ++ else \ ++ local_target="$$target"; \ ++ fi; \ ++ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ ++ || eval $$failcom; \ ++ done; \ ++ if test "$$dot_seen" = "no"; then \ ++ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ ++ fi; test -z "$$fail" ++ ++$(RECURSIVE_CLEAN_TARGETS): ++ @fail= failcom='exit 1'; \ ++ for f in x $$MAKEFLAGS; do \ ++ case $$f in \ ++ *=* | --[!k]*);; \ ++ *k*) failcom='fail=yes';; \ ++ esac; \ ++ done; \ ++ dot_seen=no; \ ++ case "$@" in \ ++ distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ ++ *) list='$(SUBDIRS)' ;; \ ++ esac; \ ++ rev=''; for subdir in $$list; do \ ++ if test "$$subdir" = "."; then :; else \ ++ rev="$$subdir $$rev"; \ ++ fi; \ ++ done; \ ++ rev="$$rev ."; \ ++ target=`echo $@ | sed s/-recursive//`; \ ++ for subdir in $$rev; do \ ++ echo "Making $$target in $$subdir"; \ ++ if test "$$subdir" = "."; then \ ++ local_target="$$target-am"; \ ++ else \ ++ local_target="$$target"; \ ++ fi; \ ++ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ ++ || eval $$failcom; \ ++ done && test -z "$$fail" ++tags-recursive: ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ ++ done ++ctags-recursive: ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ ++ done ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ ++ include_option=--etags-include; \ ++ empty_fix=.; \ ++ else \ ++ include_option=--include; \ ++ empty_fix=; \ ++ fi; \ ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ if test "$$subdir" = .; then :; else \ ++ test ! 
-f $$subdir/TAGS || \ ++ set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ ++ fi; \ ++ done; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: CTAGS ++CTAGS: ctags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++ @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ ++ if test "$$subdir" = .; then :; else \ ++ $(am__make_dryrun) \ ++ || test -d "$(distdir)/$$subdir" \ ++ || $(MKDIR_P) "$(distdir)/$$subdir" \ ++ || exit 1; \ ++ dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ ++ $(am__relativize); \ ++ new_distdir=$$reldir; \ ++ dir1=$$subdir; dir2="$(top_distdir)"; \ ++ $(am__relativize); \ ++ new_top_distdir=$$reldir; \ ++ echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ ++ echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ ++ ($(am__cd) $$subdir && \ ++ $(MAKE) $(AM_MAKEFLAGS) \ ++ top_distdir="$$new_top_distdir" \ ++ distdir="$$new_distdir" \ ++ am__remove_distdir=: \ ++ am__skip_length_check=: \ ++ am__skip_mode_fix=: \ ++ distdir) \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-recursive ++all-am: Makefile $(HEADERS) ++installdirs: installdirs-recursive ++installdirs-am: ++ for dir in "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)"; do \ ++ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ ++ done ++install: install-recursive ++install-exec: install-exec-recursive ++install-data: install-data-recursive ++uninstall: uninstall-recursive ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-recursive ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." 
++clean: clean-recursive ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-recursive ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-recursive ++ ++dvi-am: ++ ++html: html-recursive ++ ++html-am: ++ ++info: info-recursive ++ ++info-am: ++ ++install-data-am: install-kernelHEADERS install-libzfsHEADERS ++ ++install-dvi: install-dvi-recursive ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-recursive ++ ++install-html-am: ++ ++install-info: install-info-recursive ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-recursive ++ ++install-pdf-am: ++ ++install-ps: install-ps-recursive ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-recursive ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-recursive ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-recursive ++ ++pdf-am: ++ ++ps: ps-recursive ++ ++ps-am: ++ ++uninstall-am: uninstall-kernelHEADERS uninstall-libzfsHEADERS ++ ++.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \ ++ install-am install-strip tags-recursive ++ ++.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \ ++ all all-am check check-am clean clean-generic clean-libtool \ ++ ctags ctags-recursive distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-dvi install-dvi-am install-exec \ ++ install-exec-am install-html install-html-am install-info \ ++ install-info-am install-kernelHEADERS install-libzfsHEADERS \ ++ install-man install-pdf install-pdf-am install-ps \ ++ install-ps-am install-strip installcheck installcheck-am \ ++ installdirs installdirs-am maintainer-clean \ ++ maintainer-clean-generic mostlyclean mostlyclean-generic \ ++ mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \ ++ uninstall uninstall-am uninstall-kernelHEADERS \ ++ uninstall-libzfsHEADERS ++ ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. ++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/arc.h linux-3.2.33-go/include/zfs/sys/arc.h +--- linux-3.2.33-go.orig/include/zfs/sys/arc.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/arc.h 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,162 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#ifndef _SYS_ARC_H ++#define _SYS_ARC_H ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#include ++#include ++#include ++#include ++ ++typedef struct arc_buf_hdr arc_buf_hdr_t; ++typedef struct arc_buf arc_buf_t; ++typedef struct arc_prune arc_prune_t; ++typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); ++typedef void arc_prune_func_t(int64_t bytes, void *private); ++typedef int arc_evict_func_t(void *private); ++ ++/* generic arc_done_func_t's which you can use */ ++arc_done_func_t arc_bcopy_func; ++arc_done_func_t arc_getbuf_func; ++ ++/* generic arc_prune_func_t wrapper for callbacks */ ++struct arc_prune { ++ arc_prune_func_t *p_pfunc; ++ void *p_private; ++ list_node_t p_node; ++ refcount_t p_refcnt; ++}; ++ ++struct arc_buf { ++ arc_buf_hdr_t *b_hdr; ++ arc_buf_t *b_next; ++ kmutex_t b_evict_lock; ++ krwlock_t b_data_lock; ++ void *b_data; ++ arc_evict_func_t *b_efunc; ++ void *b_private; ++}; ++ ++typedef enum arc_buf_contents { ++ ARC_BUFC_DATA, /* buffer contains data */ ++ ARC_BUFC_METADATA, /* buffer contains metadata */ ++ ARC_BUFC_NUMTYPES ++} arc_buf_contents_t; ++/* ++ * These are the flags we pass into calls to the arc ++ */ ++#define ARC_WAIT (1 << 1) /* perform I/O synchronously */ ++#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */ ++#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ ++#define ARC_CACHED (1 << 4) /* I/O was already in cache */ ++#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */ ++ ++/* ++ * The following breakdows of arc_size exist for kstat only. ++ */ ++typedef enum arc_space_type { ++ ARC_SPACE_DATA, ++ ARC_SPACE_HDRS, ++ ARC_SPACE_L2HDRS, ++ ARC_SPACE_OTHER, ++ ARC_SPACE_NUMTYPES ++} arc_space_type_t; ++ ++void arc_space_consume(uint64_t space, arc_space_type_t type); ++void arc_space_return(uint64_t space, arc_space_type_t type); ++void *arc_data_buf_alloc(uint64_t space); ++void arc_data_buf_free(void *buf, uint64_t space); ++arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, ++ arc_buf_contents_t type); ++arc_buf_t *arc_loan_buf(spa_t *spa, int size); ++void arc_return_buf(arc_buf_t *buf, void *tag); ++void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); ++void arc_buf_add_ref(arc_buf_t *buf, void *tag); ++int arc_buf_remove_ref(arc_buf_t *buf, void *tag); ++int arc_buf_size(arc_buf_t *buf); ++void arc_release(arc_buf_t *buf, void *tag); ++int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, ++ zbookmark_t *zb); ++int arc_released(arc_buf_t *buf); ++int arc_has_callback(arc_buf_t *buf); ++void arc_buf_freeze(arc_buf_t *buf); ++void arc_buf_thaw(arc_buf_t *buf); ++#ifdef ZFS_DEBUG ++int arc_referenced(arc_buf_t *buf); ++#endif ++ ++int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, ++ arc_done_func_t *done, void *private, int priority, int zio_flags, ++ uint32_t *arc_flags, const zbookmark_t *zb); ++int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, ++ arc_done_func_t *done, void *private, int priority, int flags, ++ uint32_t *arc_flags, const zbookmark_t *zb); ++zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ++ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, ++ arc_done_func_t *ready, arc_done_func_t *done, void *private, ++ int priority, int zio_flags, const zbookmark_t *zb); ++ ++arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *private); ++void arc_remove_prune_callback(arc_prune_t *p); ++ ++void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); ++int 
arc_buf_evict(arc_buf_t *buf); ++ ++void arc_adjust_meta(int64_t adjustment, boolean_t may_prune); ++void arc_flush(spa_t *spa); ++void arc_tempreserve_clear(uint64_t reserve); ++int arc_tempreserve_space(uint64_t reserve, uint64_t txg); ++ ++void arc_init(void); ++void arc_fini(void); ++ ++/* ++ * Level 2 ARC ++ */ ++ ++void l2arc_add_vdev(spa_t *spa, vdev_t *vd); ++void l2arc_remove_vdev(vdev_t *vd); ++boolean_t l2arc_vdev_present(vdev_t *vd); ++void l2arc_init(void); ++void l2arc_fini(void); ++void l2arc_start(void); ++void l2arc_stop(void); ++ ++/* Global tunings */ ++extern int zfs_write_limit_shift; ++extern unsigned long zfs_write_limit_max; ++extern kmutex_t zfs_write_limit_lock; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_ARC_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/avl.h linux-3.2.33-go/include/zfs/sys/avl.h +--- linux-3.2.33-go.orig/include/zfs/sys/avl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/avl.h 2012-11-16 23:25:34.339039449 +0100 +@@ -0,0 +1,309 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _AVL_H ++#define _AVL_H ++ ++/* ++ * This is a private header file. Applications should not directly include ++ * this file. ++ */ ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#include ++#include ++ ++/* ++ * This is a generic implemenatation of AVL trees for use in the Solaris kernel. ++ * The interfaces provide an efficient way of implementing an ordered set of ++ * data structures. ++ * ++ * AVL trees provide an alternative to using an ordered linked list. Using AVL ++ * trees will usually be faster, however they requires more storage. An ordered ++ * linked list in general requires 2 pointers in each data structure. The ++ * AVL tree implementation uses 3 pointers. The following chart gives the ++ * approximate performance of operations with the different approaches: ++ * ++ * Operation Link List AVL tree ++ * --------- -------- -------- ++ * lookup O(n) O(log(n)) ++ * ++ * insert 1 node constant constant ++ * ++ * delete 1 node constant between constant and O(log(n)) ++ * ++ * delete all nodes O(n) O(n) ++ * ++ * visit the next ++ * or prev node constant between constant and O(log(n)) ++ * ++ * ++ * The data structure nodes are anchored at an "avl_tree_t" (the equivalent ++ * of a list header) and the individual nodes will have a field of ++ * type "avl_node_t" (corresponding to list pointers). ++ * ++ * The type "avl_index_t" is used to indicate a position in the list for ++ * certain calls. ++ * ++ * The usage scenario is generally: ++ * ++ * 1. 
Create the list/tree with: avl_create() ++ * ++ * followed by any mixture of: ++ * ++ * 2a. Insert nodes with: avl_add(), or avl_find() and avl_insert() ++ * ++ * 2b. Visited elements with: ++ * avl_first() - returns the lowest valued node ++ * avl_last() - returns the highest valued node ++ * AVL_NEXT() - given a node go to next higher one ++ * AVL_PREV() - given a node go to previous lower one ++ * ++ * 2c. Find the node with the closest value either less than or greater ++ * than a given value with avl_nearest(). ++ * ++ * 2d. Remove individual nodes from the list/tree with avl_remove(). ++ * ++ * and finally when the list is being destroyed ++ * ++ * 3. Use avl_destroy_nodes() to quickly process/free up any remaining nodes. ++ * Note that once you use avl_destroy_nodes(), you can no longer ++ * use any routine except avl_destroy_nodes() and avl_destoy(). ++ * ++ * 4. Use avl_destroy() to destroy the AVL tree itself. ++ * ++ * Any locking for multiple thread access is up to the user to provide, just ++ * as is needed for any linked list implementation. ++ */ ++ ++ ++/* ++ * Type used for the root of the AVL tree. ++ */ ++typedef struct avl_tree avl_tree_t; ++ ++/* ++ * The data nodes in the AVL tree must have a field of this type. ++ */ ++typedef struct avl_node avl_node_t; ++ ++/* ++ * An opaque type used to locate a position in the tree where a node ++ * would be inserted. ++ */ ++typedef uintptr_t avl_index_t; ++ ++ ++/* ++ * Direction constants used for avl_nearest(). ++ */ ++#define AVL_BEFORE (0) ++#define AVL_AFTER (1) ++ ++ ++/* ++ * Prototypes ++ * ++ * Where not otherwise mentioned, "void *" arguments are a pointer to the ++ * user data structure which must contain a field of type avl_node_t. ++ * ++ * Also assume the user data structures looks like: ++ * stuct my_type { ++ * ... ++ * avl_node_t my_link; ++ * ... ++ * }; ++ */ ++ ++/* ++ * Initialize an AVL tree. Arguments are: ++ * ++ * tree - the tree to be initialized ++ * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 ++ * -1 for <, 0 for ==, and +1 for > ++ * size - the value of sizeof(struct my_type) ++ * offset - the value of OFFSETOF(struct my_type, my_link) ++ */ ++extern void avl_create(avl_tree_t *tree, ++ int (*compar) (const void *, const void *), size_t size, size_t offset); ++ ++ ++/* ++ * Find a node with a matching value in the tree. Returns the matching node ++ * found. If not found, it returns NULL and then if "where" is not NULL it sets ++ * "where" for use with avl_insert() or avl_nearest(). ++ * ++ * node - node that has the value being looked for ++ * where - position for use with avl_nearest() or avl_insert(), may be NULL ++ */ ++extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where); ++ ++/* ++ * Insert a node into the tree. ++ * ++ * node - the node to insert ++ * where - position as returned from avl_find() ++ */ ++extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where); ++ ++/* ++ * Insert "new_data" in "tree" in the given "direction" either after ++ * or before the data "here". ++ * ++ * This might be usefull for avl clients caching recently accessed ++ * data to avoid doing avl_find() again for insertion. ++ * ++ * new_data - new data to insert ++ * here - existing node in "tree" ++ * direction - either AVL_AFTER or AVL_BEFORE the data "here". ++ */ ++extern void avl_insert_here(avl_tree_t *tree, void *new_data, void *here, ++ int direction); ++ ++ ++/* ++ * Return the first or last valued node in the tree. 
Will return NULL ++ * if the tree is empty. ++ * ++ */ ++extern void *avl_first(avl_tree_t *tree); ++extern void *avl_last(avl_tree_t *tree); ++ ++ ++/* ++ * Return the next or previous valued node in the tree. ++ * AVL_NEXT() will return NULL if at the last node. ++ * AVL_PREV() will return NULL if at the first node. ++ * ++ * node - the node from which the next or previous node is found ++ */ ++#define AVL_NEXT(tree, node) avl_walk(tree, node, AVL_AFTER) ++#define AVL_PREV(tree, node) avl_walk(tree, node, AVL_BEFORE) ++ ++ ++/* ++ * Find the node with the nearest value either greater or less than ++ * the value from a previous avl_find(). Returns the node or NULL if ++ * there isn't a matching one. ++ * ++ * where - position as returned from avl_find() ++ * direction - either AVL_BEFORE or AVL_AFTER ++ * ++ * EXAMPLE get the greatest node that is less than a given value: ++ * ++ * avl_tree_t *tree; ++ * struct my_data look_for_value = {....}; ++ * struct my_data *node; ++ * struct my_data *less; ++ * avl_index_t where; ++ * ++ * node = avl_find(tree, &look_for_value, &where); ++ * if (node != NULL) ++ * less = AVL_PREV(tree, node); ++ * else ++ * less = avl_nearest(tree, where, AVL_BEFORE); ++ */ ++extern void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction); ++ ++ ++/* ++ * Add a single node to the tree. ++ * The node must not be in the tree, and it must not ++ * compare equal to any other node already in the tree. ++ * ++ * node - the node to add ++ */ ++extern void avl_add(avl_tree_t *tree, void *node); ++ ++ ++/* ++ * Remove a single node from the tree. The node must be in the tree. ++ * ++ * node - the node to remove ++ */ ++extern void avl_remove(avl_tree_t *tree, void *node); ++ ++/* ++ * Reinsert a node only if its order has changed relative to its nearest ++ * neighbors. To optimize performance avl_update_lt() checks only the previous ++ * node and avl_update_gt() checks only the next node. Use avl_update_lt() and ++ * avl_update_gt() only if you know the direction in which the order of the ++ * node may change. ++ */ ++extern boolean_t avl_update(avl_tree_t *, void *); ++extern boolean_t avl_update_lt(avl_tree_t *, void *); ++extern boolean_t avl_update_gt(avl_tree_t *, void *); ++ ++/* ++ * Return the number of nodes in the tree ++ */ ++extern ulong_t avl_numnodes(avl_tree_t *tree); ++ ++/* ++ * Return B_TRUE if there are zero nodes in the tree, B_FALSE otherwise. ++ */ ++extern boolean_t avl_is_empty(avl_tree_t *tree); ++ ++/* ++ * Used to destroy any remaining nodes in a tree. The cookie argument should ++ * be initialized to NULL before the first call. Returns a node that has been ++ * removed from the tree and may be free()'d. Returns NULL when the tree is ++ * empty. ++ * ++ * Once you call avl_destroy_nodes(), you can only continuing calling it and ++ * finally avl_destroy(). No other AVL routines will be valid. ++ * ++ * cookie - a "void *" used to save state between calls to avl_destroy_nodes() ++ * ++ * EXAMPLE: ++ * avl_tree_t *tree; ++ * struct my_data *node; ++ * void *cookie; ++ * ++ * cookie = NULL; ++ * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) ++ * free(node); ++ * avl_destroy(tree); ++ */ ++extern void *avl_destroy_nodes(avl_tree_t *tree, void **cookie); ++ ++ ++/* ++ * Final destroy of an AVL tree. 
Arguments are: ++ * ++ * tree - the empty tree to destroy ++ */ ++extern void avl_destroy(avl_tree_t *tree); ++ ++ ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _AVL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/avl_impl.h linux-3.2.33-go/include/zfs/sys/avl_impl.h +--- linux-3.2.33-go.orig/include/zfs/sys/avl_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/avl_impl.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,164 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License, Version 1.0 only ++ * (the "License"). You may not use this file except in compliance ++ * with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2004 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _AVL_IMPL_H ++#define _AVL_IMPL_H ++ ++ ++ ++/* ++ * This is a private header file. Applications should not directly include ++ * this file. ++ */ ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++ ++/* ++ * generic AVL tree implementation for kernel use ++ * ++ * There are 5 pieces of information stored for each node in an AVL tree ++ * ++ * pointer to less than child ++ * pointer to greater than child ++ * a pointer to the parent of this node ++ * an indication [0/1] of which child I am of my parent ++ * a "balance" (-1, 0, +1) indicating which child tree is taller ++ * ++ * Since they only need 3 bits, the last two fields are packed into the ++ * bottom bits of the parent pointer on 64 bit machines to save on space. 
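++ *
++ * Illustrative encoding (editor's sketch, assuming the 64-bit packing and
++ * macros defined below; not part of the original header): a node whose
++ * parent is P, which sits in slot 1 of P's avl_child[] and has balance +1,
++ * stores
++ *
++ *	avl_pcb == (uintptr_t)P | (1 << 2) | (+1 + 1) == (uintptr_t)P | 6
++ *
++ * AVL_XPARENT() masks off the low 3 bits to recover P, AVL_XCHILD()
++ * extracts bit 2, and AVL_XBALANCE() subtracts 1 from bits 0..1.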
++ */ ++ ++#ifndef _LP64 ++ ++struct avl_node { ++ struct avl_node *avl_child[2]; /* left/right children */ ++ struct avl_node *avl_parent; /* this node's parent */ ++ unsigned short avl_child_index; /* my index in parent's avl_child[] */ ++ short avl_balance; /* balance value: -1, 0, +1 */ ++}; ++ ++#define AVL_XPARENT(n) ((n)->avl_parent) ++#define AVL_SETPARENT(n, p) ((n)->avl_parent = (p)) ++ ++#define AVL_XCHILD(n) ((n)->avl_child_index) ++#define AVL_SETCHILD(n, c) ((n)->avl_child_index = (unsigned short)(c)) ++ ++#define AVL_XBALANCE(n) ((n)->avl_balance) ++#define AVL_SETBALANCE(n, b) ((n)->avl_balance = (short)(b)) ++ ++#else /* _LP64 */ ++ ++/* ++ * for 64 bit machines, avl_pcb contains parent pointer, balance and child_index ++ * values packed in the following manner: ++ * ++ * |63 3| 2 |1 0 | ++ * |-------------------------------------|-----------------|-------------| ++ * | avl_parent hi order bits | avl_child_index | avl_balance | ++ * | | | + 1 | ++ * |-------------------------------------|-----------------|-------------| ++ * ++ */ ++struct avl_node { ++ struct avl_node *avl_child[2]; /* left/right children nodes */ ++ uintptr_t avl_pcb; /* parent, child_index, balance */ ++}; ++ ++/* ++ * macros to extract/set fields in avl_pcb ++ * ++ * pointer to the parent of the current node is the high order bits ++ */ ++#define AVL_XPARENT(n) ((struct avl_node *)((n)->avl_pcb & ~7)) ++#define AVL_SETPARENT(n, p) \ ++ ((n)->avl_pcb = (((n)->avl_pcb & 7) | (uintptr_t)(p))) ++ ++/* ++ * index of this node in its parent's avl_child[]: bit #2 ++ */ ++#define AVL_XCHILD(n) (((n)->avl_pcb >> 2) & 1) ++#define AVL_SETCHILD(n, c) \ ++ ((n)->avl_pcb = (uintptr_t)(((n)->avl_pcb & ~4) | ((c) << 2))) ++ ++/* ++ * balance indication for a node, lowest 2 bits. A valid balance is ++ * -1, 0, or +1, and is encoded by adding 1 to the value to get the ++ * unsigned values of 0, 1, 2. ++ */ ++#define AVL_XBALANCE(n) ((int)(((n)->avl_pcb & 3) - 1)) ++#define AVL_SETBALANCE(n, b) \ ++ ((n)->avl_pcb = (uintptr_t)((((n)->avl_pcb & ~3) | ((b) + 1)))) ++ ++#endif /* _LP64 */ ++ ++ ++ ++/* ++ * switch between a node and data pointer for a given tree ++ * the value of "o" is tree->avl_offset ++ */ ++#define AVL_NODE2DATA(n, o) ((void *)((uintptr_t)(n) - (o))) ++#define AVL_DATA2NODE(d, o) ((struct avl_node *)((uintptr_t)(d) + (o))) ++ ++ ++ ++/* ++ * macros used to create/access an avl_index_t ++ */ ++#define AVL_INDEX2NODE(x) ((avl_node_t *)((x) & ~1)) ++#define AVL_INDEX2CHILD(x) ((x) & 1) ++#define AVL_MKINDEX(n, c) ((avl_index_t)(n) | (c)) ++ ++ ++/* ++ * The tree structure. The fields avl_root, avl_compar, and avl_offset come ++ * first since they are needed for avl_find(). We want them to fit into ++ * a single 64 byte cache line to make avl_find() as fast as possible. 
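++ *
++ * For reference (editor's sketch, hypothetical embedding type): these
++ * fields are filled in by avl_create(), e.g.
++ *
++ *	struct my_type { int key; avl_node_t link; };
++ *
++ *	avl_create(&tree, my_compar, sizeof (struct my_type),
++ *	    offsetof(struct my_type, link));
++ *
++ * so that avl_size/avl_offset let the AVL code convert between user data
++ * pointers and the embedded avl_node_t (see AVL_NODE2DATA above).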
++ */ ++struct avl_tree { ++ struct avl_node *avl_root; /* root node in tree */ ++ int (*avl_compar)(const void *, const void *); ++ size_t avl_offset; /* offsetof(type, avl_link_t field) */ ++ ulong_t avl_numnodes; /* number of nodes in the tree */ ++ size_t avl_size; /* sizeof user type struct */ ++}; ++ ++ ++/* ++ * This will only by used via AVL_NEXT() or AVL_PREV() ++ */ ++extern void *avl_walk(struct avl_tree *, void *, int); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _AVL_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/bplist.h linux-3.2.33-go/include/zfs/sys/bplist.h +--- linux-3.2.33-go.orig/include/zfs/sys/bplist.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/bplist.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,57 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_BPLIST_H ++#define _SYS_BPLIST_H ++ ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++typedef struct bplist_entry { ++ blkptr_t bpe_blk; ++ list_node_t bpe_node; ++} bplist_entry_t; ++ ++typedef struct bplist { ++ kmutex_t bpl_lock; ++ list_t bpl_list; ++} bplist_t; ++ ++typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); ++ ++void bplist_create(bplist_t *bpl); ++void bplist_destroy(bplist_t *bpl); ++void bplist_append(bplist_t *bpl, const blkptr_t *bp); ++void bplist_iterate(bplist_t *bpl, bplist_itor_t *func, ++ void *arg, dmu_tx_t *tx); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_BPLIST_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/bpobj.h linux-3.2.33-go/include/zfs/sys/bpobj.h +--- linux-3.2.33-go.orig/include/zfs/sys/bpobj.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/bpobj.h 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,91 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_BPOBJ_H ++#define _SYS_BPOBJ_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++typedef struct bpobj_phys { ++ /* ++ * This is the bonus buffer for the dead lists. The object's ++ * contents is an array of bpo_entries blkptr_t's, representing ++ * a total of bpo_bytes physical space. ++ */ ++ uint64_t bpo_num_blkptrs; ++ uint64_t bpo_bytes; ++ uint64_t bpo_comp; ++ uint64_t bpo_uncomp; ++ uint64_t bpo_subobjs; ++ uint64_t bpo_num_subobjs; ++} bpobj_phys_t; ++ ++#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t)) ++#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t)) ++ ++typedef struct bpobj { ++ kmutex_t bpo_lock; ++ objset_t *bpo_os; ++ uint64_t bpo_object; ++ int bpo_epb; ++ uint8_t bpo_havecomp; ++ uint8_t bpo_havesubobj; ++ bpobj_phys_t *bpo_phys; ++ dmu_buf_t *bpo_dbuf; ++ dmu_buf_t *bpo_cached_dbuf; ++} bpobj_t; ++ ++typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); ++ ++uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); ++void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); ++ ++int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object); ++void bpobj_close(bpobj_t *bpo); ++ ++int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); ++int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *); ++int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp); ++ ++void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); ++void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx); ++ ++int bpobj_space(bpobj_t *bpo, ++ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); ++int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, ++ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_BPOBJ_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dbuf.h linux-3.2.33-go/include/zfs/sys/dbuf.h +--- linux-3.2.33-go.orig/include/zfs/sys/dbuf.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dbuf.h 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,372 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#ifndef _SYS_DBUF_H ++#define _SYS_DBUF_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#define IN_DMU_SYNC 2 ++ ++/* ++ * define flags for dbuf_read ++ */ ++ ++#define DB_RF_MUST_SUCCEED (1 << 0) ++#define DB_RF_CANFAIL (1 << 1) ++#define DB_RF_HAVESTRUCT (1 << 2) ++#define DB_RF_NOPREFETCH (1 << 3) ++#define DB_RF_NEVERWAIT (1 << 4) ++#define DB_RF_CACHED (1 << 5) ++ ++/* ++ * The simplified state transition diagram for dbufs looks like: ++ * ++ * +----> READ ----+ ++ * | | ++ * | V ++ * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) ++ * | ^ ^ ++ * | | | ++ * +----> FILL ----+ | ++ * | | ++ * | | ++ * +--------> NOFILL -------+ ++ */ ++typedef enum dbuf_states { ++ DB_UNCACHED, ++ DB_FILL, ++ DB_NOFILL, ++ DB_READ, ++ DB_CACHED, ++ DB_EVICTING ++} dbuf_states_t; ++ ++struct dnode; ++struct dmu_tx; ++ ++/* ++ * level = 0 means the user data ++ * level = 1 means the single indirect block ++ * etc. ++ */ ++ ++struct dmu_buf_impl; ++ ++typedef enum override_states { ++ DR_NOT_OVERRIDDEN, ++ DR_IN_DMU_SYNC, ++ DR_OVERRIDDEN ++} override_states_t; ++ ++typedef struct dbuf_dirty_record { ++ /* link on our parents dirty list */ ++ list_node_t dr_dirty_node; ++ ++ /* transaction group this data will sync in */ ++ uint64_t dr_txg; ++ ++ /* zio of outstanding write IO */ ++ zio_t *dr_zio; ++ ++ /* pointer back to our dbuf */ ++ struct dmu_buf_impl *dr_dbuf; ++ ++ /* pointer to next dirty record */ ++ struct dbuf_dirty_record *dr_next; ++ ++ /* pointer to parent dirty record */ ++ struct dbuf_dirty_record *dr_parent; ++ ++ union dirty_types { ++ struct dirty_indirect { ++ ++ /* protect access to list */ ++ kmutex_t dr_mtx; ++ ++ /* Our list of dirty children */ ++ list_t dr_children; ++ } di; ++ struct dirty_leaf { ++ ++ /* ++ * dr_data is set when we dirty the buffer ++ * so that we can retain the pointer even if it ++ * gets COW'd in a subsequent transaction group. ++ */ ++ arc_buf_t *dr_data; ++ blkptr_t dr_overridden_by; ++ override_states_t dr_override_state; ++ uint8_t dr_copies; ++ } dl; ++ } dt; ++} dbuf_dirty_record_t; ++ ++typedef struct dmu_buf_impl { ++ /* ++ * The following members are immutable, with the exception of ++ * db.db_data, which is protected by db_mtx. ++ */ ++ ++ /* the publicly visible structure */ ++ dmu_buf_t db; ++ ++ /* the objset we belong to */ ++ struct objset *db_objset; ++ ++ /* ++ * handle to safely access the dnode we belong to (NULL when evicted) ++ */ ++ struct dnode_handle *db_dnode_handle; ++ ++ /* ++ * our parent buffer; if the dnode points to us directly, ++ * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf ++ * only accessed by sync thread ??? ++ * (NULL when evicted) ++ * May change from NULL to non-NULL under the protection of db_mtx ++ * (see dbuf_check_blkptr()) ++ */ ++ struct dmu_buf_impl *db_parent; ++ ++ /* ++ * link for hash table of all dmu_buf_impl_t's ++ */ ++ struct dmu_buf_impl *db_hash_next; ++ ++ /* our block number */ ++ uint64_t db_blkid; ++ ++ /* ++ * Pointer to the blkptr_t which points to us. May be NULL if we ++ * don't have one yet. (NULL when evicted) ++ */ ++ blkptr_t *db_blkptr; ++ ++ /* ++ * Our indirection level. Data buffers have db_level==0. ++ * Indirect buffers which point to data buffers have ++ * db_level==1. etc. Buffers which contain dnodes have ++ * db_level==0, since the dnodes are stored in a file. 
++ */ ++ uint8_t db_level; ++ ++ /* db_mtx protects the members below */ ++ kmutex_t db_mtx; ++ ++ /* ++ * Current state of the buffer ++ */ ++ dbuf_states_t db_state; ++ ++ /* ++ * Refcount accessed by dmu_buf_{hold,rele}. ++ * If nonzero, the buffer can't be destroyed. ++ * Protected by db_mtx. ++ */ ++ refcount_t db_holds; ++ ++ /* buffer holding our data */ ++ arc_buf_t *db_buf; ++ ++ kcondvar_t db_changed; ++ dbuf_dirty_record_t *db_data_pending; ++ ++ /* pointer to most recent dirty record for this buffer */ ++ dbuf_dirty_record_t *db_last_dirty; ++ ++ /* ++ * Our link on the owner dnodes's dn_dbufs list. ++ * Protected by its dn_dbufs_mtx. ++ */ ++ list_node_t db_link; ++ ++ /* Data which is unique to data (leaf) blocks: */ ++ ++ /* stuff we store for the user (see dmu_buf_set_user) */ ++ void *db_user_ptr; ++ void **db_user_data_ptr_ptr; ++ dmu_buf_evict_func_t *db_evict_func; ++ ++ uint8_t db_immediate_evict; ++ uint8_t db_freed_in_flight; ++ ++ uint8_t db_dirtycnt; ++} dmu_buf_impl_t; ++ ++/* Note: the dbuf hash table is exposed only for the mdb module */ ++#define DBUF_MUTEXES 256 ++#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) ++typedef struct dbuf_hash_table { ++ uint64_t hash_table_mask; ++ dmu_buf_impl_t **hash_table; ++ kmutex_t hash_mutexes[DBUF_MUTEXES]; ++} dbuf_hash_table_t; ++ ++ ++uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); ++ ++void dbuf_create_bonus(struct dnode *dn); ++int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx); ++ ++void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); ++ ++dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); ++dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, ++ void *tag); ++int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, ++ void *tag, dmu_buf_impl_t **dbp); ++ ++void dbuf_prefetch(struct dnode *dn, uint64_t blkid); ++ ++void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); ++uint64_t dbuf_refcount(dmu_buf_impl_t *db); ++ ++void dbuf_rele(dmu_buf_impl_t *db, void *tag); ++void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag); ++ ++dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid); ++ ++int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); ++void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); ++void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); ++void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); ++void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); ++void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); ++void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); ++dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); ++arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); ++ ++void dbuf_clear(dmu_buf_impl_t *db); ++void dbuf_evict(dmu_buf_impl_t *db); ++ ++void dbuf_unoverride(dbuf_dirty_record_t *dr); ++void dbuf_sync_list(list_t *list, dmu_tx_t *tx); ++void dbuf_release_bp(dmu_buf_impl_t *db); ++ ++void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, ++ struct dmu_tx *); ++ ++void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); ++ ++#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) ++#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) ++#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) ++#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db))) ++#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db))) ++#define DB_GET_SPA(_spa_p, _db) { \ ++ dnode_t *__dn; \ 
++ DB_DNODE_ENTER(_db); \ ++ __dn = DB_DNODE(_db); \ ++ *(_spa_p) = __dn->dn_objset->os_spa; \ ++ DB_DNODE_EXIT(_db); \ ++} ++#define DB_GET_OBJSET(_os_p, _db) { \ ++ dnode_t *__dn; \ ++ DB_DNODE_ENTER(_db); \ ++ __dn = DB_DNODE(_db); \ ++ *(_os_p) = __dn->dn_objset; \ ++ DB_DNODE_EXIT(_db); \ ++} ++ ++void dbuf_init(void); ++void dbuf_fini(void); ++ ++boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); ++ ++#define DBUF_IS_METADATA(_db) \ ++ (dbuf_is_metadata(_db)) ++ ++#define DBUF_GET_BUFC_TYPE(_db) \ ++ (DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) ++ ++#define DBUF_IS_CACHEABLE(_db) \ ++ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ ++ (DBUF_IS_METADATA(_db) && \ ++ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) ++ ++#define DBUF_IS_L2CACHEABLE(_db) \ ++ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ ++ (DBUF_IS_METADATA(_db) && \ ++ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) ++ ++#ifdef ZFS_DEBUG ++ ++/* ++ * There should be a ## between the string literal and fmt, to make it ++ * clear that we're joining two strings together, but gcc does not ++ * support that preprocessor token. ++ */ ++#define dprintf_dbuf(dbuf, fmt, ...) do { \ ++ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ ++ char __db_buf[32]; \ ++ uint64_t __db_obj = (dbuf)->db.db_object; \ ++ if (__db_obj == DMU_META_DNODE_OBJECT) \ ++ (void) strcpy(__db_buf, "mdn"); \ ++ else \ ++ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ ++ (u_longlong_t)__db_obj); \ ++ dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ ++ "obj=%s lvl=%u blkid=%lld " fmt, \ ++ __db_buf, (dbuf)->db_level, \ ++ (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ ++ } \ ++_NOTE(CONSTCOND) } while (0) ++ ++#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ ++ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ ++ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_PUSHPAGE); \ ++ sprintf_blkptr(__blkbuf, bp); \ ++ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ ++ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ ++ } \ ++_NOTE(CONSTCOND) } while (0) ++ ++#define DBUF_VERIFY(db) dbuf_verify(db) ++ ++#else ++ ++#define dprintf_dbuf(db, fmt, ...) ++#define dprintf_dbuf_bp(db, bp, fmt, ...) ++#define DBUF_VERIFY(db) ++ ++#endif ++ ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DBUF_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/ddt.h linux-3.2.33-go/include/zfs/sys/ddt.h +--- linux-3.2.33-go.orig/include/zfs/sys/ddt.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/ddt.h 2012-11-16 23:25:34.339039449 +0100 +@@ -0,0 +1,246 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#ifndef _SYS_DDT_H ++#define _SYS_DDT_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * On-disk DDT formats, in the desired search order (newest version first). ++ */ ++enum ddt_type { ++ DDT_TYPE_ZAP = 0, ++ DDT_TYPES ++}; ++ ++/* ++ * DDT classes, in the desired search order (highest replication level first). ++ */ ++enum ddt_class { ++ DDT_CLASS_DITTO = 0, ++ DDT_CLASS_DUPLICATE, ++ DDT_CLASS_UNIQUE, ++ DDT_CLASSES ++}; ++ ++#define DDT_TYPE_CURRENT 0 ++ ++#define DDT_COMPRESS_BYTEORDER_MASK 0x80 ++#define DDT_COMPRESS_FUNCTION_MASK 0x7f ++ ++/* ++ * On-disk ddt entry: key (name) and physical storage (value). ++ */ ++typedef struct ddt_key { ++ zio_cksum_t ddk_cksum; /* 256-bit block checksum */ ++ uint64_t ddk_prop; /* LSIZE, PSIZE, compression */ ++} ddt_key_t; ++ ++/* ++ * ddk_prop layout: ++ * ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * | 0 | 0 | 0 | comp | PSIZE | LSIZE | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ */ ++#define DDK_GET_LSIZE(ddk) \ ++ BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) ++#define DDK_SET_LSIZE(ddk, x) \ ++ BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) ++ ++#define DDK_GET_PSIZE(ddk) \ ++ BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) ++#define DDK_SET_PSIZE(ddk, x) \ ++ BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) ++ ++#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8) ++#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x) ++ ++#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) ++ ++typedef struct ddt_phys { ++ dva_t ddp_dva[SPA_DVAS_PER_BP]; ++ uint64_t ddp_refcnt; ++ uint64_t ddp_phys_birth; ++} ddt_phys_t; ++ ++enum ddt_phys_type { ++ DDT_PHYS_DITTO = 0, ++ DDT_PHYS_SINGLE = 1, ++ DDT_PHYS_DOUBLE = 2, ++ DDT_PHYS_TRIPLE = 3, ++ DDT_PHYS_TYPES ++}; ++ ++/* ++ * In-core ddt entry ++ */ ++struct ddt_entry { ++ ddt_key_t dde_key; ++ ddt_phys_t dde_phys[DDT_PHYS_TYPES]; ++ zio_t *dde_lead_zio[DDT_PHYS_TYPES]; ++ void *dde_repair_data; ++ enum ddt_type dde_type; ++ enum ddt_class dde_class; ++ uint8_t dde_loading; ++ uint8_t dde_loaded; ++ kcondvar_t dde_cv; ++ avl_node_t dde_node; ++}; ++ ++/* ++ * In-core ddt ++ */ ++struct ddt { ++ kmutex_t ddt_lock; ++ avl_tree_t ddt_tree; ++ avl_tree_t ddt_repair_tree; ++ enum zio_checksum ddt_checksum; ++ spa_t *ddt_spa; ++ objset_t *ddt_os; ++ uint64_t ddt_stat_object; ++ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; ++ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; ++ ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; ++ ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; ++ avl_node_t ddt_node; ++}; ++ ++/* ++ * In-core and on-disk bookmark for DDT walks ++ */ ++typedef struct ddt_bookmark { ++ uint64_t ddb_class; ++ uint64_t ddb_type; ++ uint64_t ddb_checksum; ++ uint64_t ddb_cursor; ++} ddt_bookmark_t; ++ ++/* ++ * Ops vector to access a specific DDT object type. 
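++ *
++ * Editor's note (illustrative, not part of the original header): this is a
++ * simple vtable; each enum ddt_type supplies one instance (currently only
++ * ddt_zap_ops for DDT_TYPE_ZAP, declared near the end of this file), and
++ * the generic ddt_object_*() helpers presumably dispatch through it,
++ * roughly:
++ *
++ *	const ddt_ops_t *ops = &ddt_zap_ops;
++ *	error = ops->ddt_op_lookup(ddt->ddt_os, object, dde);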
++ */ ++typedef struct ddt_ops { ++ char ddt_op_name[32]; ++ int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, ++ boolean_t prehash); ++ int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); ++ int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde); ++ void (*ddt_op_prefetch)(objset_t *os, uint64_t object, ++ ddt_entry_t *dde); ++ int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde, ++ dmu_tx_t *tx); ++ int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde, ++ dmu_tx_t *tx); ++ int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde, ++ uint64_t *walk); ++ int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); ++} ddt_ops_t; ++ ++#define DDT_NAMELEN 80 ++ ++extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, ++ enum ddt_class class, char *name); ++extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, ++ enum ddt_class class, uint64_t *walk, ddt_entry_t *dde); ++extern int ddt_object_count(ddt_t *ddt, enum ddt_type type, ++ enum ddt_class class, uint64_t *count); ++extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, ++ enum ddt_class class, dmu_object_info_t *); ++extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, ++ enum ddt_class class); ++ ++extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, ++ uint64_t txg); ++extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, ++ const ddt_phys_t *ddp, blkptr_t *bp); ++ ++extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); ++ ++extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); ++extern void ddt_phys_clear(ddt_phys_t *ddp); ++extern void ddt_phys_addref(ddt_phys_t *ddp); ++extern void ddt_phys_decref(ddt_phys_t *ddp); ++extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, ++ uint64_t txg); ++extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); ++extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); ++ ++extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); ++ ++extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); ++extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); ++extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); ++extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); ++extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); ++extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); ++ ++extern uint64_t ddt_get_dedup_dspace(spa_t *spa); ++extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); ++ ++extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ++ ddt_phys_t *ddp_willref); ++extern int ddt_ditto_copies_present(ddt_entry_t *dde); ++ ++extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len); ++extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); ++ ++extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); ++extern void ddt_enter(ddt_t *ddt); ++extern void ddt_exit(ddt_t *ddt); ++extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); ++extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); ++extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); ++ ++extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, ++ const blkptr_t *bp); ++ ++extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); ++extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t 
*dde); ++ ++extern int ddt_entry_compare(const void *x1, const void *x2); ++ ++extern void ddt_create(spa_t *spa); ++extern int ddt_load(spa_t *spa); ++extern void ddt_unload(spa_t *spa); ++extern void ddt_sync(spa_t *spa, uint64_t txg); ++extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); ++extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, ++ enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx); ++ ++extern const ddt_ops_t ddt_zap_ops; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DDT_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dmu.h linux-3.2.33-go/include/zfs/sys/dmu.h +--- linux-3.2.33-go.orig/include/zfs/sys/dmu.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dmu.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,749 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ * Copyright (c) 2012, Joyent, Inc. All rights reserved. ++ */ ++ ++/* Portions Copyright 2010 Robert Milkowski */ ++ ++#ifndef _SYS_DMU_H ++#define _SYS_DMU_H ++ ++/* ++ * This file describes the interface that the DMU provides for its ++ * consumers. ++ * ++ * The DMU also interacts with the SPA. That interface is described in ++ * dmu_spa.h. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct page; ++struct vnode; ++struct spa; ++struct zilog; ++struct zio; ++struct blkptr; ++struct zap_cursor; ++struct dsl_dataset; ++struct dsl_pool; ++struct dnode; ++struct drr_begin; ++struct drr_end; ++struct zbookmark; ++struct spa; ++struct nvlist; ++struct arc_buf; ++struct zio_prop; ++struct sa_handle; ++ ++typedef struct objset objset_t; ++typedef struct dmu_tx dmu_tx_t; ++typedef struct dsl_dir dsl_dir_t; ++ ++typedef enum dmu_object_type { ++ DMU_OT_NONE, ++ /* general: */ ++ DMU_OT_OBJECT_DIRECTORY, /* ZAP */ ++ DMU_OT_OBJECT_ARRAY, /* UINT64 */ ++ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ ++ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ ++ DMU_OT_BPOBJ, /* UINT64 */ ++ DMU_OT_BPOBJ_HDR, /* UINT64 */ ++ /* spa: */ ++ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ ++ DMU_OT_SPACE_MAP, /* UINT64 */ ++ /* zil: */ ++ DMU_OT_INTENT_LOG, /* UINT64 */ ++ /* dmu: */ ++ DMU_OT_DNODE, /* DNODE */ ++ DMU_OT_OBJSET, /* OBJSET */ ++ /* dsl: */ ++ DMU_OT_DSL_DIR, /* UINT64 */ ++ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ ++ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ ++ DMU_OT_DSL_PROPS, /* ZAP */ ++ DMU_OT_DSL_DATASET, /* UINT64 */ ++ /* zpl: */ ++ DMU_OT_ZNODE, /* ZNODE */ ++ DMU_OT_OLDACL, /* Old ACL */ ++ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ ++ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ ++ DMU_OT_MASTER_NODE, /* ZAP */ ++ DMU_OT_UNLINKED_SET, /* ZAP */ ++ /* zvol: */ ++ DMU_OT_ZVOL, /* UINT8 */ ++ DMU_OT_ZVOL_PROP, /* ZAP */ ++ /* other; for testing only! */ ++ DMU_OT_PLAIN_OTHER, /* UINT8 */ ++ DMU_OT_UINT64_OTHER, /* UINT64 */ ++ DMU_OT_ZAP_OTHER, /* ZAP */ ++ /* new object types: */ ++ DMU_OT_ERROR_LOG, /* ZAP */ ++ DMU_OT_SPA_HISTORY, /* UINT8 */ ++ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ ++ DMU_OT_POOL_PROPS, /* ZAP */ ++ DMU_OT_DSL_PERMS, /* ZAP */ ++ DMU_OT_ACL, /* ACL */ ++ DMU_OT_SYSACL, /* SYSACL */ ++ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ ++ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ ++ DMU_OT_NEXT_CLONES, /* ZAP */ ++ DMU_OT_SCAN_QUEUE, /* ZAP */ ++ DMU_OT_USERGROUP_USED, /* ZAP */ ++ DMU_OT_USERGROUP_QUOTA, /* ZAP */ ++ DMU_OT_USERREFS, /* ZAP */ ++ DMU_OT_DDT_ZAP, /* ZAP */ ++ DMU_OT_DDT_STATS, /* ZAP */ ++ DMU_OT_SA, /* System attr */ ++ DMU_OT_SA_MASTER_NODE, /* ZAP */ ++ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ ++ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ ++ DMU_OT_SCAN_XLATE, /* ZAP */ ++ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ ++ DMU_OT_DEADLIST, /* ZAP */ ++ DMU_OT_DEADLIST_HDR, /* UINT64 */ ++ DMU_OT_DSL_CLONES, /* ZAP */ ++ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ ++ DMU_OT_NUMTYPES ++} dmu_object_type_t; ++ ++typedef enum dmu_objset_type { ++ DMU_OST_NONE, ++ DMU_OST_META, ++ DMU_OST_ZFS, ++ DMU_OST_ZVOL, ++ DMU_OST_OTHER, /* For testing only! */ ++ DMU_OST_ANY, /* Be careful! */ ++ DMU_OST_NUMTYPES ++} dmu_objset_type_t; ++ ++void byteswap_uint64_array(void *buf, size_t size); ++void byteswap_uint32_array(void *buf, size_t size); ++void byteswap_uint16_array(void *buf, size_t size); ++void byteswap_uint8_array(void *buf, size_t size); ++void zap_byteswap(void *buf, size_t size); ++void zfs_oldacl_byteswap(void *buf, size_t size); ++void zfs_acl_byteswap(void *buf, size_t size); ++void zfs_znode_byteswap(void *buf, size_t size); ++ ++#define DS_FIND_SNAPSHOTS (1<<0) ++#define DS_FIND_CHILDREN (1<<1) ++ ++/* ++ * The maximum number of bytes that can be accessed as part of one ++ * operation, including metadata. 
++ */ ++#define DMU_MAX_ACCESS (10<<20) /* 10MB */ ++#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ ++ ++#define DMU_USERUSED_OBJECT (-1ULL) ++#define DMU_GROUPUSED_OBJECT (-2ULL) ++#define DMU_DEADLIST_OBJECT (-3ULL) ++ ++/* ++ * artificial blkids for bonus buffer and spill blocks ++ */ ++#define DMU_BONUS_BLKID (-1ULL) ++#define DMU_SPILL_BLKID (-2ULL) ++/* ++ * Public routines to create, destroy, open, and close objsets. ++ */ ++int dmu_objset_hold(const char *name, void *tag, objset_t **osp); ++int dmu_objset_own(const char *name, dmu_objset_type_t type, ++ boolean_t readonly, void *tag, objset_t **osp); ++void dmu_objset_rele(objset_t *os, void *tag); ++void dmu_objset_disown(objset_t *os, void *tag); ++int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); ++ ++int dmu_objset_evict_dbufs(objset_t *os); ++int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, ++ void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); ++int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, ++ uint64_t flags); ++int dmu_objset_destroy(const char *name, boolean_t defer); ++int dmu_snapshots_destroy_nvl(struct nvlist *snaps, boolean_t defer, char *); ++int dmu_objset_snapshot(char *fsname, char *snapname, char *tag, ++ struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd); ++int dmu_objset_rename(const char *name, const char *newname, ++ boolean_t recursive); ++int dmu_objset_find(char *name, int func(const char *, void *), void *arg, ++ int flags); ++void dmu_objset_byteswap(void *buf, size_t size); ++ ++typedef struct dmu_buf { ++ uint64_t db_object; /* object that this buffer is part of */ ++ uint64_t db_offset; /* byte offset in this object */ ++ uint64_t db_size; /* size of buffer in bytes */ ++ void *db_data; /* data in buffer */ ++} dmu_buf_t; ++ ++typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); ++ ++/* ++ * The names of zap entries in the DIRECTORY_OBJECT of the MOS. ++ */ ++#define DMU_POOL_DIRECTORY_OBJECT 1 ++#define DMU_POOL_CONFIG "config" ++#define DMU_POOL_ROOT_DATASET "root_dataset" ++#define DMU_POOL_SYNC_BPOBJ "sync_bplist" ++#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" ++#define DMU_POOL_ERRLOG_LAST "errlog_last" ++#define DMU_POOL_SPARES "spares" ++#define DMU_POOL_DEFLATE "deflate" ++#define DMU_POOL_HISTORY "history" ++#define DMU_POOL_PROPS "pool_props" ++#define DMU_POOL_L2CACHE "l2cache" ++#define DMU_POOL_TMP_USERREFS "tmp_userrefs" ++#define DMU_POOL_DDT "DDT-%s-%s-%s" ++#define DMU_POOL_DDT_STATS "DDT-statistics" ++#define DMU_POOL_CREATION_VERSION "creation_version" ++#define DMU_POOL_SCAN "scan" ++#define DMU_POOL_FREE_BPOBJ "free_bpobj" ++ ++/* ++ * Allocate an object from this objset. The range of object numbers ++ * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. ++ * ++ * The transaction must be assigned to a txg. The newly allocated ++ * object will be "held" in the transaction (ie. you can modify the ++ * newly allocated object in this transaction). ++ * ++ * dmu_object_alloc() chooses an object and returns it in *objectp. ++ * ++ * dmu_object_claim() allocates a specific object number. If that ++ * number is already allocated, it fails and returns EEXIST. ++ * ++ * Return 0 on success, or ENOSPC or EEXIST as specified above. 
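++ *
++ * Illustrative use (editor's sketch; TXG_WAIT comes from sys/txg.h and the
++ * object/bonus types below are only examples, not a prescription):
++ *
++ *	tx = dmu_tx_create(os);
++ *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
++ *	error = dmu_tx_assign(tx, TXG_WAIT);
++ *	if (error) {
++ *		dmu_tx_abort(tx);
++ *		return (error);
++ *	}
++ *	object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
++ *	    DMU_OT_NONE, 0, tx);
++ *	dmu_tx_commit(tx);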
++ */ ++uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, ++ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); ++int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, ++ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); ++int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, ++ int blocksize, dmu_object_type_t bonustype, int bonuslen); ++ ++/* ++ * Free an object from this objset. ++ * ++ * The object's data will be freed as well (ie. you don't need to call ++ * dmu_free(object, 0, -1, tx)). ++ * ++ * The object need not be held in the transaction. ++ * ++ * If there are any holds on this object's buffers (via dmu_buf_hold()), ++ * or tx holds on the object (via dmu_tx_hold_object()), you can not ++ * free it; it fails and returns EBUSY. ++ * ++ * If the object is not allocated, it fails and returns ENOENT. ++ * ++ * Return 0 on success, or EBUSY or ENOENT as specified above. ++ */ ++int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); ++ ++/* ++ * Find the next allocated or free object. ++ * ++ * The objectp parameter is in-out. It will be updated to be the next ++ * object which is allocated. Ignore objects which have not been ++ * modified since txg. ++ * ++ * XXX Can only be called on a objset with no dirty data. ++ * ++ * Returns 0 on success, or ENOENT if there are no more objects. ++ */ ++int dmu_object_next(objset_t *os, uint64_t *objectp, ++ boolean_t hole, uint64_t txg); ++ ++/* ++ * Set the data blocksize for an object. ++ * ++ * The object cannot have any blocks allcated beyond the first. If ++ * the first block is allocated already, the new size must be greater ++ * than the current block size. If these conditions are not met, ++ * ENOTSUP will be returned. ++ * ++ * Returns 0 on success, or EBUSY if there are any holds on the object ++ * contents, or ENOTSUP as described above. ++ */ ++int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, ++ int ibs, dmu_tx_t *tx); ++ ++/* ++ * Set the checksum property on a dnode. The new checksum algorithm will ++ * apply to all newly written blocks; existing blocks will not be affected. ++ */ ++void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, ++ dmu_tx_t *tx); ++ ++/* ++ * Set the compress property on a dnode. The new compression algorithm will ++ * apply to all newly written blocks; existing blocks will not be affected. ++ */ ++void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, ++ dmu_tx_t *tx); ++ ++/* ++ * Decide how to write a block: checksum, compression, number of copies, etc. ++ */ ++#define WP_NOFILL 0x1 ++#define WP_DMU_SYNC 0x2 ++#define WP_SPILL 0x4 ++ ++void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, ++ struct zio_prop *zp); ++/* ++ * The bonus data is accessed more or less like a regular buffer. ++ * You must dmu_bonus_hold() to get the buffer, which will give you a ++ * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus ++ * data. As with any normal buffer, you must call dmu_buf_read() to ++ * read db_data, dmu_buf_will_dirty() before modifying it, and the ++ * object must be held in an assigned transaction before calling ++ * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus ++ * buffer as well. You must release your hold with dmu_buf_rele(). 
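++ *
++ * Illustrative use (editor's sketch; "tag" is any stable pointer that
++ * identifies the hold, and tx is assumed to be assigned already):
++ *
++ *	dmu_buf_t *db;
++ *	error = dmu_bonus_hold(os, object, tag, &db);
++ *	if (error == 0) {
++ *		dmu_buf_will_dirty(db, tx);
++ *		... modify up to db->db_size bytes at db->db_data ...
++ *		dmu_buf_rele(db, tag);
++ *	}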
++ */ ++int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); ++int dmu_bonus_max(void); ++int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); ++int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); ++dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); ++int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); ++ ++/* ++ * Special spill buffer support used by "SA" framework ++ */ ++ ++int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); ++int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags, ++ void *tag, dmu_buf_t **dbp); ++int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); ++ ++/* ++ * Obtain the DMU buffer from the specified object which contains the ++ * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so ++ * that it will remain in memory. You must release the hold with ++ * dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your ++ * hold. You must have a hold on any dmu_buf_t* you pass to the DMU. ++ * ++ * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill ++ * on the returned buffer before reading or writing the buffer's ++ * db_data. The comments for those routines describe what particular ++ * operations are valid after calling them. ++ * ++ * The object number must be a valid, allocated object number. ++ */ ++int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, ++ void *tag, dmu_buf_t **, int flags); ++void dmu_buf_add_ref(dmu_buf_t *db, void* tag); ++void dmu_buf_rele(dmu_buf_t *db, void *tag); ++uint64_t dmu_buf_refcount(dmu_buf_t *db); ++ ++/* ++ * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a ++ * range of an object. A pointer to an array of dmu_buf_t*'s is ++ * returned (in *dbpp). ++ * ++ * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and ++ * frees the array. The hold on the array of buffers MUST be released ++ * with dmu_buf_rele_array. You can NOT release the hold on each buffer ++ * individually with dmu_buf_rele. ++ */ ++int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, ++ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); ++void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); ++ ++/* ++ * Returns NULL on success, or the existing user ptr if it's already ++ * been set. ++ * ++ * user_ptr is for use by the user and can be obtained via dmu_buf_get_user(). ++ * ++ * user_data_ptr_ptr should be NULL, or a pointer to a pointer which ++ * will be set to db->db_data when you are allowed to access it. Note ++ * that db->db_data (the pointer) can change when you do dmu_buf_read(), ++ * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill(). ++ * *user_data_ptr_ptr will be set to the new value when it changes. ++ * ++ * If non-NULL, pageout func will be called when this buffer is being ++ * excised from the cache, so that you can clean up the data structure ++ * pointed to by user_ptr. ++ * ++ * dmu_evict_user() will call the pageout func for all buffers in a ++ * objset with a given pageout func. ++ */ ++void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr, ++ dmu_buf_evict_func_t *pageout_func); ++/* ++ * set_user_ie is the same as set_user, but request immediate eviction ++ * when hold count goes to zero. 
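++ *
++ * Illustrative pattern for either variant (editor's sketch; my_node_t,
++ * mn_data and my_evict_cb are hypothetical names):
++ *
++ *	my_node_t *mn = ...;
++ *	winner = dmu_buf_set_user_ie(db, mn, &mn->mn_data, my_evict_cb);
++ *	if (winner != NULL) {
++ *		... lost the race: free mn and use the existing winner ...
++ *	}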
++ */ ++void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, ++ void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func); ++void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, ++ void *user_ptr, void *user_data_ptr_ptr, ++ dmu_buf_evict_func_t *pageout_func); ++void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); ++ ++/* ++ * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set. ++ */ ++void *dmu_buf_get_user(dmu_buf_t *db); ++ ++/* ++ * Indicate that you are going to modify the buffer's data (db_data). ++ * ++ * The transaction (tx) must be assigned to a txg (ie. you've called ++ * dmu_tx_assign()). The buffer's object must be held in the tx ++ * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). ++ */ ++void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); ++ ++/* ++ * Tells if the given dbuf is freeable. ++ */ ++boolean_t dmu_buf_freeable(dmu_buf_t *); ++ ++/* ++ * You must create a transaction, then hold the objects which you will ++ * (or might) modify as part of this transaction. Then you must assign ++ * the transaction to a transaction group. Once the transaction has ++ * been assigned, you can modify buffers which belong to held objects as ++ * part of this transaction. You can't modify buffers before the ++ * transaction has been assigned; you can't modify buffers which don't ++ * belong to objects which this transaction holds; you can't hold ++ * objects once the transaction has been assigned. You may hold an ++ * object which you are going to free (with dmu_object_free()), but you ++ * don't have to. ++ * ++ * You can abort the transaction before it has been assigned. ++ * ++ * Note that you may hold buffers (with dmu_buf_hold) at any time, ++ * regardless of transaction state. ++ */ ++ ++#define DMU_NEW_OBJECT (-1ULL) ++#define DMU_OBJECT_END (-1ULL) ++ ++dmu_tx_t *dmu_tx_create(objset_t *os); ++void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); ++void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, ++ uint64_t len); ++void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); ++void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); ++void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); ++void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); ++void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); ++void dmu_tx_abort(dmu_tx_t *tx); ++int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); ++void dmu_tx_wait(dmu_tx_t *tx); ++void dmu_tx_commit(dmu_tx_t *tx); ++ ++/* ++ * To register a commit callback, dmu_tx_callback_register() must be called. ++ * ++ * dcb_data is a pointer to caller private data that is passed on as a ++ * callback parameter. The caller is responsible for properly allocating and ++ * freeing it. ++ * ++ * When registering a callback, the transaction must be already created, but ++ * it cannot be committed or aborted. It can be assigned to a txg or not. ++ * ++ * The callback will be called after the transaction has been safely written ++ * to stable storage and will also be called if the dmu_tx is aborted. ++ * If there is any error which prevents the transaction from being committed to ++ * disk, the callback will be called with a value of error != 0. ++ */ ++typedef void dmu_tx_callback_func_t(void *dcb_data, int error); ++ ++void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, ++ void *dcb_data); ++ ++/* ++ * Free up the data blocks for a defined range of a file. 
If size is ++ * zero, the range from offset to end-of-file is freed. ++ */ ++int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, ++ uint64_t size, dmu_tx_t *tx); ++int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, ++ uint64_t size); ++int dmu_free_object(objset_t *os, uint64_t object); ++ ++/* ++ * Convenience functions. ++ * ++ * Canfail routines will return 0 on success, or an errno if there is a ++ * nonrecoverable I/O error. ++ */ ++#define DMU_READ_PREFETCH 0 /* prefetch */ ++#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ ++int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, ++ void *buf, uint32_t flags); ++void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, ++ const void *buf, dmu_tx_t *tx); ++void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, ++ dmu_tx_t *tx); ++#ifdef _KERNEL ++#include ++int dmu_read_req(objset_t *os, uint64_t object, struct request *req); ++int dmu_write_req(objset_t *os, uint64_t object, struct request *req, ++ dmu_tx_t *tx); ++int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); ++int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, ++ dmu_tx_t *tx); ++int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, ++ dmu_tx_t *tx); ++#endif ++struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); ++void dmu_return_arcbuf(struct arc_buf *buf); ++void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, ++ dmu_tx_t *tx); ++int dmu_xuio_init(struct xuio *uio, int niov); ++void dmu_xuio_fini(struct xuio *uio); ++int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, ++ size_t n); ++int dmu_xuio_cnt(struct xuio *uio); ++struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); ++void dmu_xuio_clear(struct xuio *uio, int i); ++void xuio_stat_wbuf_copied(void); ++void xuio_stat_wbuf_nocopy(void); ++ ++extern int zfs_prefetch_disable; ++ ++/* ++ * Asynchronously try to read in the data. ++ */ ++void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, ++ uint64_t len); ++ ++typedef struct dmu_object_info { ++ /* All sizes are in bytes unless otherwise indicated. */ ++ uint32_t doi_data_block_size; ++ uint32_t doi_metadata_block_size; ++ dmu_object_type_t doi_type; ++ dmu_object_type_t doi_bonus_type; ++ uint64_t doi_bonus_size; ++ uint8_t doi_indirection; /* 2 = dnode->indirect->data */ ++ uint8_t doi_checksum; ++ uint8_t doi_compress; ++ uint8_t doi_pad[5]; ++ uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ ++ uint64_t doi_max_offset; ++ uint64_t doi_fill_count; /* number of non-empty blocks */ ++} dmu_object_info_t; ++ ++typedef void arc_byteswap_func_t(void *buf, size_t size); ++ ++typedef struct dmu_object_type_info { ++ arc_byteswap_func_t *ot_byteswap; ++ boolean_t ot_metadata; ++ char *ot_name; ++} dmu_object_type_info_t; ++ ++extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; ++ ++/* ++ * Get information on a DMU object. ++ * ++ * Return 0 on success or ENOENT if object is not allocated. ++ * ++ * If doi is NULL, just indicates whether the object exists. 
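A minimal sketch of the canfail read path using the prefetch flags defined above; reads need no transaction, and the return value is 0 or an errno:

    static int
    example_read(objset_t *os, uint64_t object, uint64_t off, uint64_t size,
        void *buf)
    {
            /* prefetch is the default; pass DMU_READ_NO_PREFETCH to suppress it */
            return (dmu_read(os, object, off, size, buf, DMU_READ_PREFETCH));
    }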
++ */ ++int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); ++void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); ++void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); ++void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, ++ u_longlong_t *nblk512); ++ ++typedef struct dmu_objset_stats { ++ uint64_t dds_num_clones; /* number of clones of this */ ++ uint64_t dds_creation_txg; ++ uint64_t dds_guid; ++ dmu_objset_type_t dds_type; ++ uint8_t dds_is_snapshot; ++ uint8_t dds_inconsistent; ++ char dds_origin[MAXNAMELEN]; ++} dmu_objset_stats_t; ++ ++/* ++ * Get stats on a dataset. ++ */ ++void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); ++ ++/* ++ * Add entries to the nvlist for all the objset's properties. See ++ * zfs_prop_table[] and zfs(1m) for details on the properties. ++ */ ++void dmu_objset_stats(objset_t *os, struct nvlist *nv); ++ ++/* ++ * Get the space usage statistics for statvfs(). ++ * ++ * refdbytes is the amount of space "referenced" by this objset. ++ * availbytes is the amount of space available to this objset, taking ++ * into account quotas & reservations, assuming that no other objsets ++ * use the space first. These values correspond to the 'referenced' and ++ * 'available' properties, described in the zfs(1m) manpage. ++ * ++ * usedobjs and availobjs are the number of objects currently allocated, ++ * and available. ++ */ ++void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, ++ uint64_t *usedobjsp, uint64_t *availobjsp); ++ ++/* ++ * The fsid_guid is a 56-bit ID that can change to avoid collisions. ++ * (Contrast with the ds_guid which is a 64-bit ID that will never ++ * change, so there is a small probability that it will collide.) ++ */ ++uint64_t dmu_objset_fsid_guid(objset_t *os); ++ ++/* ++ * Get the [cm]time for an objset's snapshot dir ++ */ ++timestruc_t dmu_objset_snap_cmtime(objset_t *os); ++ ++int dmu_objset_is_snapshot(objset_t *os); ++ ++extern struct spa *dmu_objset_spa(objset_t *os); ++extern struct zilog *dmu_objset_zil(objset_t *os); ++extern struct dsl_pool *dmu_objset_pool(objset_t *os); ++extern struct dsl_dataset *dmu_objset_ds(objset_t *os); ++extern void dmu_objset_name(objset_t *os, char *buf); ++extern dmu_objset_type_t dmu_objset_type(objset_t *os); ++extern uint64_t dmu_objset_id(objset_t *os); ++extern uint64_t dmu_objset_syncprop(objset_t *os); ++extern uint64_t dmu_objset_logbias(objset_t *os); ++extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, ++ uint64_t *id, uint64_t *offp, boolean_t *case_conflict); ++extern int dmu_snapshot_id(objset_t *os, const char *snapname, uint64_t *idp); ++extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, ++ int maxlen, boolean_t *conflict); ++extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, ++ uint64_t *idp, uint64_t *offp); ++ ++typedef int objset_used_cb_t(dmu_object_type_t bonustype, ++ void *bonus, uint64_t *userp, uint64_t *groupp); ++extern void dmu_objset_register_type(dmu_objset_type_t ost, ++ objset_used_cb_t *cb); ++extern void dmu_objset_set_user(objset_t *os, void *user_ptr); ++extern void *dmu_objset_get_user(objset_t *os); ++ ++/* ++ * Return the txg number for the given assigned transaction. ++ */ ++uint64_t dmu_tx_get_txg(dmu_tx_t *tx); ++ ++/* ++ * Synchronous write. ++ * If a parent zio is provided this function initiates a write on the ++ * provided buffer as a child of the parent zio. 
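A minimal sketch of a statvfs-style space query built on dmu_objset_space() as described above; treating referenced plus available as the filesystem total is one plausible interpretation, not the only one:

    static uint64_t
    example_fs_total_bytes(objset_t *os)
    {
            uint64_t refd, avail, usedobjs, availobjs;

            dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);

            /* 'referenced' plus 'available' is the total a statvfs caller sees */
            return (refd + avail);
    }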
++ * In the absence of a parent zio, the write is completed synchronously. ++ * At write completion, blk is filled with the bp of the written block. ++ * Note that while the data covered by this function will be on stable ++ * storage when the write completes this new data does not become a ++ * permanent part of the file until the associated transaction commits. ++ */ ++ ++/* ++ * {zfs,zvol,ztest}_get_done() args ++ */ ++typedef struct zgd { ++ struct zilog *zgd_zilog; ++ struct blkptr *zgd_bp; ++ dmu_buf_t *zgd_db; ++ struct rl *zgd_rl; ++ void *zgd_private; ++} zgd_t; ++ ++typedef void dmu_sync_cb_t(zgd_t *arg, int error); ++int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); ++ ++/* ++ * Find the next hole or data block in file starting at *off ++ * Return found offset in *off. Return ESRCH for end of file. ++ */ ++int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, ++ uint64_t *off); ++ ++/* ++ * Initial setup and final teardown. ++ */ ++extern void dmu_init(void); ++extern void dmu_fini(void); ++ ++typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, ++ uint64_t object, uint64_t offset, int len); ++void dmu_traverse_objset(objset_t *os, uint64_t txg_start, ++ dmu_traverse_cb_t cb, void *arg); ++ ++int dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, ++ int outfd, struct vnode *vp, offset_t *off); ++int dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorign, ++ uint64_t *sizep); ++ ++typedef struct dmu_recv_cookie { ++ /* ++ * This structure is opaque! ++ * ++ * If logical and real are different, we are recving the stream ++ * into the "real" temporary clone, and then switching it with ++ * the "logical" target. ++ */ ++ struct dsl_dataset *drc_logical_ds; ++ struct dsl_dataset *drc_real_ds; ++ struct drr_begin *drc_drrb; ++ char *drc_tosnap; ++ char *drc_top_ds; ++ boolean_t drc_newfs; ++ boolean_t drc_force; ++ struct avl_tree *drc_guid_to_ds_map; ++} dmu_recv_cookie_t; ++ ++int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *, ++ boolean_t force, objset_t *origin, dmu_recv_cookie_t *); ++int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, ++ int cleanup_fd, uint64_t *action_handlep); ++int dmu_recv_end(dmu_recv_cookie_t *drc); ++ ++int dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, ++ offset_t *off); ++ ++/* CRC64 table */ ++#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ ++extern uint64_t zfs_crc64_table[256]; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DMU_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dmu_impl.h linux-3.2.33-go/include/zfs/sys/dmu_impl.h +--- linux-3.2.33-go.orig/include/zfs/sys/dmu_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dmu_impl.h 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,274 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ * Copyright (c) 2012, Joyent, Inc. All rights reserved. ++ */ ++ ++#ifndef _SYS_DMU_IMPL_H ++#define _SYS_DMU_IMPL_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * This is the locking strategy for the DMU. Numbers in parenthesis are ++ * cases that use that lock order, referenced below: ++ * ++ * ARC is self-contained ++ * bplist is self-contained ++ * refcount is self-contained ++ * txg is self-contained (hopefully!) ++ * zst_lock ++ * zf_rwlock ++ * ++ * XXX try to improve evicting path? ++ * ++ * dp_config_rwlock > os_obj_lock > dn_struct_rwlock > ++ * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs ++ * ++ * dp_config_rwlock ++ * must be held before: everything ++ * protects dd namespace changes ++ * protects property changes globally ++ * held from: ++ * dsl_dir_open/r: ++ * dsl_dir_create_sync/w: ++ * dsl_dir_sync_destroy/w: ++ * dsl_dir_rename_sync/w: ++ * dsl_prop_changed_notify/r: ++ * ++ * os_obj_lock ++ * must be held before: ++ * everything except dp_config_rwlock ++ * protects os_obj_next ++ * held from: ++ * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock ++ * ++ * dn_struct_rwlock ++ * must be held before: ++ * everything except dp_config_rwlock and os_obj_lock ++ * protects structure of dnode (eg. nlevels) ++ * db_blkptr can change when syncing out change to nlevels ++ * dn_maxblkid ++ * dn_nlevels ++ * dn_*blksz* ++ * phys nlevels, maxblkid, physical blkptr_t's (?) ++ * held from: ++ * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch ++ * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) ++ * dmu_tx_count_free: ++ * dbuf_read_impl: db_mtx, dmu_zfetch() ++ * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() ++ * dbuf_new_size: db_mtx ++ * dbuf_dirty: db_mtx ++ * dbuf_findbp: (callers, phys? - the real need) ++ * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?) ++ * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx ++ * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp() ++ * dnode_sync/w (increase_indirection): db_mtx (phys) ++ * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*) ++ * dnode_new_blkid/w: (dn_maxblkid) ++ * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid) ++ * dnode_next_offset: (phys) ++ * ++ * dn_dbufs_mtx ++ * must be held before: ++ * db_mtx, hash_mutexes ++ * protects: ++ * dn_dbufs ++ * dn_evicted ++ * held from: ++ * dmu_evict_user: db_mtx (dn_dbufs) ++ * dbuf_free_range: db_mtx (dn_dbufs) ++ * dbuf_remove_ref: db_mtx, callees: ++ * dbuf_hash_remove: hash_mutexes, db_mtx ++ * dbuf_create: hash_mutexes, db_mtx (dn_dbufs) ++ * dnode_set_blksz: (dn_dbufs) ++ * ++ * hash_mutexes (global) ++ * must be held before: ++ * db_mtx ++ * protects dbuf_hash_table (global) and db_hash_next ++ * held from: ++ * dbuf_find: db_mtx ++ * dbuf_hash_insert: db_mtx ++ * dbuf_hash_remove: db_mtx ++ * ++ * db_mtx (meta-leaf) ++ * must be held before: ++ * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes) ++ * protects: ++ * db_state ++ * db_holds ++ * db_buf ++ * db_changed ++ * db_data_pending ++ * db_dirtied ++ * db_link ++ * db_dirty_node (??) 
++ * db_dirtycnt ++ * db_d.* ++ * db.* ++ * held from: ++ * dbuf_dirty: dn_mtx, dn_dirty_mtx ++ * dbuf_dirty->dsl_dir_willuse_space: dd_lock ++ * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock ++ * dbuf_undirty: dn_dirty_mtx (db_d) ++ * dbuf_write_done: dn_dirty_mtx (db_state) ++ * dbuf_* ++ * dmu_buf_update_user: none (db_d) ++ * dmu_evict_user: none (db_d) (maybe can eliminate) ++ * dbuf_find: none (db_holds) ++ * dbuf_hash_insert: none (db_holds) ++ * dmu_buf_read_array_impl: none (db_state, db_changed) ++ * dmu_sync: none (db_dirty_node, db_d) ++ * dnode_reallocate: none (db) ++ * ++ * dn_mtx (leaf) ++ * protects: ++ * dn_dirty_dbufs ++ * dn_ranges ++ * phys accounting ++ * dn_allocated_txg ++ * dn_free_txg ++ * dn_assigned_txg ++ * dd_assigned_tx ++ * dn_notxholds ++ * dn_dirtyctx ++ * dn_dirtyctx_firstset ++ * (dn_phys copy fields?) ++ * (dn_phys contents?) ++ * held from: ++ * dnode_* ++ * dbuf_dirty: none ++ * dbuf_sync: none (phys accounting) ++ * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs) ++ * dbuf_write_done: none (phys accounting) ++ * dmu_object_info_from_dnode: none (accounting) ++ * dmu_tx_commit: none ++ * dmu_tx_hold_object_impl: none ++ * dmu_tx_try_assign: dn_notxholds(cv) ++ * dmu_tx_unassign: none ++ * ++ * dd_lock ++ * must be held before: ++ * ds_lock ++ * ancestors' dd_lock ++ * protects: ++ * dd_prop_cbs ++ * dd_sync_* ++ * dd_used_bytes ++ * dd_tempreserved ++ * dd_space_towrite ++ * dd_myname ++ * dd_phys accounting? ++ * held from: ++ * dsl_dir_* ++ * dsl_prop_changed_notify: none (dd_prop_cbs) ++ * dsl_prop_register: none (dd_prop_cbs) ++ * dsl_prop_unregister: none (dd_prop_cbs) ++ * dsl_dataset_block_freeable: none (dd_sync_*) ++ * ++ * os_lock (leaf) ++ * protects: ++ * os_dirty_dnodes ++ * os_free_dnodes ++ * os_dnodes ++ * os_downgraded_dbufs ++ * dn_dirtyblksz ++ * dn_dirty_link ++ * held from: ++ * dnode_create: none (os_dnodes) ++ * dnode_destroy: none (os_dnodes) ++ * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes) ++ * dnode_free: none (dn_dirtyblksz, os_*_dnodes) ++ * ++ * ds_lock ++ * protects: ++ * ds_objset ++ * ds_open_refcount ++ * ds_snapname ++ * ds_phys accounting ++ * ds_phys userrefs zapobj ++ * ds_reserved ++ * held from: ++ * dsl_dataset_* ++ * ++ * dr_mtx (leaf) ++ * protects: ++ * dr_children ++ * held from: ++ * dbuf_dirty ++ * dbuf_undirty ++ * dbuf_sync_indirect ++ * dnode_new_blkid ++ */ ++ ++struct objset; ++struct dmu_pool; ++ ++typedef struct dmu_xuio { ++ int next; ++ int cnt; ++ struct arc_buf **bufs; ++ iovec_t *iovp; ++} dmu_xuio_t; ++ ++/* ++ * The list of data whose inclusion in a send stream can be pending from ++ * one call to backup_cb to another. Multiple calls to dump_free() and ++ * dump_freeobjects() can be aggregated into a single DRR_FREE or ++ * DRR_FREEOBJECTS replay record. 
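A minimal sketch of honoring the lock ordering documented earlier in this header (dn_struct_rwlock before db_mtx); dmu_buf_impl_t and its db_mtx field are assumed from dbuf.h, and rw_enter()/mutex_enter() are the usual SPL/Solaris primitives:

    static void
    example_inspect_dbuf(dnode_t *dn, dmu_buf_impl_t *db)
    {
            /* higher-order lock first, leaf-side mutex second */
            rw_enter(&dn->dn_struct_rwlock, RW_READER);
            mutex_enter(&db->db_mtx);

            /* ... examine db_state, db_blkptr, and so on ... */

            mutex_exit(&db->db_mtx);
            rw_exit(&dn->dn_struct_rwlock);
    }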
++ */ ++typedef enum { ++ PENDING_NONE, ++ PENDING_FREE, ++ PENDING_FREEOBJECTS ++} dmu_pendop_t; ++ ++typedef struct dmu_sendarg { ++ list_node_t dsa_link; ++ dmu_replay_record_t *dsa_drr; ++ vnode_t *dsa_vp; ++ int dsa_outfd; ++ proc_t *dsa_proc; ++ offset_t *dsa_off; ++ objset_t *dsa_os; ++ zio_cksum_t dsa_zc; ++ uint64_t dsa_toguid; ++ int dsa_err; ++ dmu_pendop_t dsa_pending_op; ++} dmu_sendarg_t; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DMU_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dmu_objset.h linux-3.2.33-go/include/zfs/sys/dmu_objset.h +--- linux-3.2.33-go.orig/include/zfs/sys/dmu_objset.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dmu_objset.h 2012-11-16 23:25:34.339039449 +0100 +@@ -0,0 +1,182 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* Portions Copyright 2010 Robert Milkowski */ ++ ++#ifndef _SYS_DMU_OBJSET_H ++#define _SYS_DMU_OBJSET_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++extern krwlock_t os_lock; ++ ++struct dsl_dataset; ++struct dmu_tx; ++ ++#define OBJSET_PHYS_SIZE 2048 ++#define OBJSET_OLD_PHYS_SIZE 1024 ++ ++#define OBJSET_BUF_HAS_USERUSED(buf) \ ++ (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE) ++ ++#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) ++ ++typedef struct objset_phys { ++ dnode_phys_t os_meta_dnode; ++ zil_header_t os_zil_header; ++ uint64_t os_type; ++ uint64_t os_flags; ++ char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 - ++ sizeof (zil_header_t) - sizeof (uint64_t)*2]; ++ dnode_phys_t os_userused_dnode; ++ dnode_phys_t os_groupused_dnode; ++} objset_phys_t; ++ ++struct objset { ++ /* Immutable: */ ++ struct dsl_dataset *os_dsl_dataset; ++ spa_t *os_spa; ++ arc_buf_t *os_phys_buf; ++ objset_phys_t *os_phys; ++ /* ++ * The following "special" dnodes have no parent and are exempt from ++ * dnode_move(), but they root their descendents in this objset using ++ * handles anyway, so that all access to dnodes from dbufs consistently ++ * uses handles. 
++ */ ++ dnode_handle_t os_meta_dnode; ++ dnode_handle_t os_userused_dnode; ++ dnode_handle_t os_groupused_dnode; ++ zilog_t *os_zil; ++ ++ /* can change, under dsl_dir's locks: */ ++ uint8_t os_checksum; ++ uint8_t os_compress; ++ uint8_t os_copies; ++ uint8_t os_dedup_checksum; ++ uint8_t os_dedup_verify; ++ uint8_t os_logbias; ++ uint8_t os_primary_cache; ++ uint8_t os_secondary_cache; ++ uint8_t os_sync; ++ ++ /* no lock needed: */ ++ struct dmu_tx *os_synctx; /* XXX sketchy */ ++ blkptr_t *os_rootbp; ++ zil_header_t os_zil_header; ++ list_t os_synced_dnodes; ++ uint64_t os_flags; ++ ++ /* Protected by os_obj_lock */ ++ kmutex_t os_obj_lock; ++ uint64_t os_obj_next; ++ ++ /* Protected by os_lock */ ++ kmutex_t os_lock; ++ list_t os_dirty_dnodes[TXG_SIZE]; ++ list_t os_free_dnodes[TXG_SIZE]; ++ list_t os_dnodes; ++ list_t os_downgraded_dbufs; ++ ++ /* stuff we store for the user */ ++ kmutex_t os_user_ptr_lock; ++ void *os_user_ptr; ++ ++ /* SA layout/attribute registration */ ++ sa_os_t *os_sa; ++}; ++ ++#define DMU_META_OBJSET 0 ++#define DMU_META_DNODE_OBJECT 0 ++#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) ++#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode) ++#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode) ++#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) ++ ++#define DMU_OS_IS_L2CACHEABLE(os) \ ++ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ ++ (os)->os_secondary_cache == ZFS_CACHE_METADATA) ++ ++/* called from zpl */ ++int dmu_objset_hold(const char *name, void *tag, objset_t **osp); ++int dmu_objset_own(const char *name, dmu_objset_type_t type, ++ boolean_t readonly, void *tag, objset_t **osp); ++void dmu_objset_rele(objset_t *os, void *tag); ++void dmu_objset_disown(objset_t *os, void *tag); ++int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); ++ ++int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, ++ void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); ++int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, ++ uint64_t flags); ++int dmu_objset_destroy(const char *name, boolean_t defer); ++int dmu_objset_snapshot(char *fsname, char *snapname, char *tag, ++ struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd); ++void dmu_objset_stats(objset_t *os, nvlist_t *nv); ++void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); ++void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, ++ uint64_t *usedobjsp, uint64_t *availobjsp); ++uint64_t dmu_objset_fsid_guid(objset_t *os); ++int dmu_objset_find(char *name, int func(const char *, void *), void *arg, ++ int flags); ++int dmu_objset_find_spa(spa_t *spa, const char *name, ++ int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); ++int dmu_objset_prefetch(const char *name, void *arg); ++void dmu_objset_byteswap(void *buf, size_t size); ++int dmu_objset_evict_dbufs(objset_t *os); ++timestruc_t dmu_objset_snap_cmtime(objset_t *os); ++ ++/* called from dsl */ ++void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); ++boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); ++objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, ++ blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); ++int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, ++ objset_t **osp); ++void dmu_objset_evict(objset_t *os); ++void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx); ++void 
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); ++boolean_t dmu_objset_userused_enabled(objset_t *os); ++int dmu_objset_userspace_upgrade(objset_t *os); ++boolean_t dmu_objset_userspace_present(objset_t *os); ++ ++void dmu_objset_init(void); ++void dmu_objset_fini(void); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DMU_OBJSET_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dmu_traverse.h linux-3.2.33-go/include/zfs/sys/dmu_traverse.h +--- linux-3.2.33-go.orig/include/zfs/sys/dmu_traverse.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dmu_traverse.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,64 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_DMU_TRAVERSE_H ++#define _SYS_DMU_TRAVERSE_H ++ ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct dnode_phys; ++struct dsl_dataset; ++struct zilog; ++struct arc_buf; ++ ++typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, ++ struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp, ++ void *arg); ++ ++#define TRAVERSE_PRE (1<<0) ++#define TRAVERSE_POST (1<<1) ++#define TRAVERSE_PREFETCH_METADATA (1<<2) ++#define TRAVERSE_PREFETCH_DATA (1<<3) ++#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) ++#define TRAVERSE_HARD (1<<4) ++ ++/* Special traverse error return value to indicate skipping of children */ ++#define TRAVERSE_VISIT_NO_CHILDREN -1 ++ ++int traverse_dataset(struct dsl_dataset *ds, ++ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); ++int traverse_pool(spa_t *spa, ++ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DMU_TRAVERSE_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dmu_tx.h linux-3.2.33-go/include/zfs/sys/dmu_tx.h +--- linux-3.2.33-go.orig/include/zfs/sys/dmu_tx.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dmu_tx.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,176 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. 
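A minimal sketch of the dmu_objset_hold()/dmu_objset_rele() pairing declared above in dmu_objset.h for read-only access to an objset, again assuming the FTAG tag convention from zfs_context.h:

    static int
    example_objset_visit(const char *name)
    {
            objset_t *os;
            int err;

            err = dmu_objset_hold(name, FTAG, &os);
            if (err != 0)
                    return (err);

            /* ... read-only inspection of the objset ... */

            dmu_objset_rele(os, FTAG);
            return (0);
    }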
++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _SYS_DMU_TX_H ++#define _SYS_DMU_TX_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct dmu_buf_impl; ++struct dmu_tx_hold; ++struct dnode_link; ++struct dsl_pool; ++struct dnode; ++struct dsl_dir; ++ ++struct dmu_tx { ++ /* ++ * No synchronization is needed because a tx can only be handled ++ * by one thread. ++ */ ++ list_t tx_holds; /* list of dmu_tx_hold_t */ ++ objset_t *tx_objset; ++ struct dsl_dir *tx_dir; ++ struct dsl_pool *tx_pool; ++ uint64_t tx_txg; ++ uint64_t tx_lastsnap_txg; ++ uint64_t tx_lasttried_txg; ++ txg_handle_t tx_txgh; ++ void *tx_tempreserve_cookie; ++ struct dmu_tx_hold *tx_needassign_txh; ++ list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */ ++ uint8_t tx_anyobj; ++ int tx_err; ++#ifdef DEBUG_DMU_TX ++ uint64_t tx_space_towrite; ++ uint64_t tx_space_tofree; ++ uint64_t tx_space_tooverwrite; ++ uint64_t tx_space_tounref; ++ refcount_t tx_space_written; ++ refcount_t tx_space_freed; ++#endif ++}; ++ ++enum dmu_tx_hold_type { ++ THT_NEWOBJECT, ++ THT_WRITE, ++ THT_BONUS, ++ THT_FREE, ++ THT_ZAP, ++ THT_SPACE, ++ THT_SPILL, ++ THT_NUMTYPES ++}; ++ ++typedef struct dmu_tx_hold { ++ dmu_tx_t *txh_tx; ++ list_node_t txh_node; ++ struct dnode *txh_dnode; ++ uint64_t txh_space_towrite; ++ uint64_t txh_space_tofree; ++ uint64_t txh_space_tooverwrite; ++ uint64_t txh_space_tounref; ++ uint64_t txh_memory_tohold; ++ uint64_t txh_fudge; ++#ifdef DEBUG_DMU_TX ++ enum dmu_tx_hold_type txh_type; ++ uint64_t txh_arg1; ++ uint64_t txh_arg2; ++#endif ++} dmu_tx_hold_t; ++ ++typedef struct dmu_tx_callback { ++ list_node_t dcb_node; /* linked to tx_callbacks list */ ++ dmu_tx_callback_func_t *dcb_func; /* caller function pointer */ ++ void *dcb_data; /* caller private data */ ++} dmu_tx_callback_t; ++ ++/* ++ * Used for dmu tx kstat. ++ */ ++typedef struct dmu_tx_stats { ++ kstat_named_t dmu_tx_assigned; ++ kstat_named_t dmu_tx_delay; ++ kstat_named_t dmu_tx_error; ++ kstat_named_t dmu_tx_suspended; ++ kstat_named_t dmu_tx_group; ++ kstat_named_t dmu_tx_how; ++ kstat_named_t dmu_tx_memory_reserve; ++ kstat_named_t dmu_tx_memory_reclaim; ++ kstat_named_t dmu_tx_memory_inflight; ++ kstat_named_t dmu_tx_dirty_throttle; ++ kstat_named_t dmu_tx_write_limit; ++ kstat_named_t dmu_tx_quota; ++} dmu_tx_stats_t; ++ ++extern dmu_tx_stats_t dmu_tx_stats; ++ ++#define DMU_TX_STAT_INCR(stat, val) \ ++ atomic_add_64(&dmu_tx_stats.stat.value.ui64, (val)); ++#define DMU_TX_STAT_BUMP(stat) \ ++ DMU_TX_STAT_INCR(stat, 1); ++ ++/* ++ * These routines are defined in dmu.h, and are called by the user. 
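A minimal sketch of registering a commit callback through dmu_tx_callback_register(), declared with the other user-facing routines just below and documented back in dmu.h; the example_done_t bookkeeping type is hypothetical:

    typedef struct example_done {
            int     ed_error;
    } example_done_t;

    static void
    example_commit_cb(void *arg, int error)
    {
            example_done_t *ed = arg;

            /* error != 0 means the tx was aborted or never reached stable storage */
            ed->ed_error = error;
    }

    static void
    example_register(dmu_tx_t *tx, example_done_t *ed)
    {
            /* legal once the tx exists, whether or not it has been assigned */
            dmu_tx_callback_register(tx, example_commit_cb, ed);
    }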
++ */ ++dmu_tx_t *dmu_tx_create(objset_t *dd); ++int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); ++void dmu_tx_commit(dmu_tx_t *tx); ++void dmu_tx_abort(dmu_tx_t *tx); ++uint64_t dmu_tx_get_txg(dmu_tx_t *tx); ++void dmu_tx_wait(dmu_tx_t *tx); ++ ++void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, ++ void *dcb_data); ++void dmu_tx_do_callbacks(list_t *cb_list, int error); ++ ++/* ++ * These routines are defined in dmu_spa.h, and are called by the SPA. ++ */ ++extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg); ++ ++/* ++ * These routines are only called by the DMU. ++ */ ++dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); ++int dmu_tx_is_syncing(dmu_tx_t *tx); ++int dmu_tx_private_ok(dmu_tx_t *tx); ++void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object); ++void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta); ++void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); ++int dmu_tx_holds(dmu_tx_t *tx, uint64_t object); ++void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); ++ ++#ifdef DEBUG_DMU_TX ++#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) ++#else ++#define DMU_TX_DIRTY_BUF(tx, db) ++#endif ++ ++void dmu_tx_init(void); ++void dmu_tx_fini(void); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DMU_TX_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dmu_zfetch.h linux-3.2.33-go/include/zfs/sys/dmu_zfetch.h +--- linux-3.2.33-go.orig/include/zfs/sys/dmu_zfetch.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dmu_zfetch.h 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,76 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */ ++ ++#ifndef _DFETCH_H ++#define _DFETCH_H ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++extern unsigned long zfetch_array_rd_sz; ++ ++struct dnode; /* so we can reference dnode */ ++ ++typedef enum zfetch_dirn { ++ ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */ ++ ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */ ++} zfetch_dirn_t; ++ ++typedef struct zstream { ++ uint64_t zst_offset; /* offset of starting block in range */ ++ uint64_t zst_len; /* length of range, in blocks */ ++ zfetch_dirn_t zst_direction; /* direction of prefetch */ ++ uint64_t zst_stride; /* length of stride, in blocks */ ++ uint64_t zst_ph_offset; /* prefetch offset, in blocks */ ++ uint64_t zst_cap; /* prefetch limit (cap), in blocks */ ++ kmutex_t zst_lock; /* protects stream */ ++ clock_t zst_last; /* lbolt of last prefetch */ ++ avl_node_t zst_node; /* embed avl node here */ ++} zstream_t; ++ ++typedef struct zfetch { ++ krwlock_t zf_rwlock; /* protects zfetch structure */ ++ list_t zf_stream; /* AVL tree of zstream_t's */ ++ struct dnode *zf_dnode; /* dnode that owns this zfetch */ ++ uint32_t zf_stream_cnt; /* # of active streams */ ++ uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */ ++} zfetch_t; ++ ++void zfetch_init(void); ++void zfetch_fini(void); ++ ++void dmu_zfetch_init(zfetch_t *, struct dnode *); ++void dmu_zfetch_rele(zfetch_t *); ++void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int); ++ ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _DFETCH_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dnode.h linux-3.2.33-go/include/zfs/sys/dnode.h +--- linux-3.2.33-go.orig/include/zfs/sys/dnode.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dnode.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,329 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_DNODE_H ++#define _SYS_DNODE_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * dnode_hold() flags. ++ */ ++#define DNODE_MUST_BE_ALLOCATED 1 ++#define DNODE_MUST_BE_FREE 2 ++ ++/* ++ * dnode_next_offset() flags. ++ */ ++#define DNODE_FIND_HOLE 1 ++#define DNODE_FIND_BACKWARDS 2 ++#define DNODE_FIND_HAVELOCK 4 ++ ++/* ++ * Fixed constants. 
++ */ ++#define DNODE_SHIFT 9 /* 512 bytes */ ++#define DN_MIN_INDBLKSHIFT 10 /* 1k */ ++#define DN_MAX_INDBLKSHIFT 14 /* 16k */ ++#define DNODE_BLOCK_SHIFT 14 /* 16k */ ++#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ ++#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ ++#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ ++ ++/* ++ * dnode id flags ++ * ++ * Note: a file will never ever have its ++ * ids moved from bonus->spill ++ * and only in a crypto environment would it be on spill ++ */ ++#define DN_ID_CHKED_BONUS 0x1 ++#define DN_ID_CHKED_SPILL 0x2 ++#define DN_ID_OLD_EXIST 0x4 ++#define DN_ID_NEW_EXIST 0x8 ++ ++/* ++ * Derived constants. ++ */ ++#define DNODE_SIZE (1 << DNODE_SHIFT) ++#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) ++#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) ++#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) ++#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1) ++#define DN_KILL_SPILLBLK (1) ++ ++#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) ++#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) ++#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) ++#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) ++ ++/* The +2 here is a cheesy way to round up */ ++#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ ++ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) ++ ++#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ ++ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) ++ ++#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ ++ (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) ++ ++#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) ++ ++struct dmu_buf_impl; ++struct objset; ++struct zio; ++ ++enum dnode_dirtycontext { ++ DN_UNDIRTIED, ++ DN_DIRTY_OPEN, ++ DN_DIRTY_SYNC ++}; ++ ++/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ ++#define DNODE_FLAG_USED_BYTES (1<<0) ++#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) ++ ++/* Does dnode have a SA spill blkptr in bonus? */ ++#define DNODE_FLAG_SPILL_BLKPTR (1<<2) ++ ++typedef struct dnode_phys { ++ uint8_t dn_type; /* dmu_object_type_t */ ++ uint8_t dn_indblkshift; /* ln2(indirect block size) */ ++ uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ ++ uint8_t dn_nblkptr; /* length of dn_blkptr */ ++ uint8_t dn_bonustype; /* type of data in bonus buffer */ ++ uint8_t dn_checksum; /* ZIO_CHECKSUM type */ ++ uint8_t dn_compress; /* ZIO_COMPRESS type */ ++ uint8_t dn_flags; /* DNODE_FLAG_* */ ++ uint16_t dn_datablkszsec; /* data block size in 512b sectors */ ++ uint16_t dn_bonuslen; /* length of dn_bonus */ ++ uint8_t dn_pad2[4]; ++ ++ /* accounting is protected by dn_dirty_mtx */ ++ uint64_t dn_maxblkid; /* largest allocated block ID */ ++ uint64_t dn_used; /* bytes (or sectors) of disk space */ ++ ++ uint64_t dn_pad3[4]; ++ ++ blkptr_t dn_blkptr[1]; ++ uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)]; ++ blkptr_t dn_spill; ++} dnode_phys_t; ++ ++typedef struct dnode { ++ /* ++ * dn_struct_rwlock protects the structure of the dnode, ++ * including the number of levels of indirection (dn_nlevels), ++ * dn_maxblkid, and dn_next_* ++ */ ++ krwlock_t dn_struct_rwlock; ++ ++ /* Our link on dn_objset->os_dnodes list; protected by os_lock. 
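With the spa.h values SPA_BLKPTRSHIFT == 7 (128-byte blkptr_t) and SPA_MINBLOCKSHIFT == 9, which are assumed here, the fixed and derived constants above work out to:

    DNODE_SIZE       = 1 << 9                  = 512 bytes
    DNODES_PER_BLOCK = 1 << (14 - 9)           = 32 dnodes per 16k block
    DN_MAX_NBLKPTR   = (512 - 64) >> 7         = 3 block pointers
    DN_MAX_BONUSLEN  = 512 - 64 - (1 << 7)     = 320 bytes of bonus space
    DN_MAX_LEVELS    = 2 + (64 - 9) / (10 - 7) = 20 (integer division)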
*/ ++ list_node_t dn_link; ++ ++ /* immutable: */ ++ struct objset *dn_objset; ++ uint64_t dn_object; ++ struct dmu_buf_impl *dn_dbuf; ++ struct dnode_handle *dn_handle; ++ dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ ++ ++ /* ++ * Copies of stuff in dn_phys. They're valid in the open ++ * context (eg. even before the dnode is first synced). ++ * Where necessary, these are protected by dn_struct_rwlock. ++ */ ++ dmu_object_type_t dn_type; /* object type */ ++ uint16_t dn_bonuslen; /* bonus length */ ++ uint8_t dn_bonustype; /* bonus type */ ++ uint8_t dn_nblkptr; /* number of blkptrs (immutable) */ ++ uint8_t dn_checksum; /* ZIO_CHECKSUM type */ ++ uint8_t dn_compress; /* ZIO_COMPRESS type */ ++ uint8_t dn_nlevels; ++ uint8_t dn_indblkshift; ++ uint8_t dn_datablkshift; /* zero if blksz not power of 2! */ ++ uint8_t dn_moved; /* Has this dnode been moved? */ ++ uint16_t dn_datablkszsec; /* in 512b sectors */ ++ uint32_t dn_datablksz; /* in bytes */ ++ uint64_t dn_maxblkid; ++ uint8_t dn_next_nblkptr[TXG_SIZE]; ++ uint8_t dn_next_nlevels[TXG_SIZE]; ++ uint8_t dn_next_indblkshift[TXG_SIZE]; ++ uint8_t dn_next_bonustype[TXG_SIZE]; ++ uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */ ++ uint16_t dn_next_bonuslen[TXG_SIZE]; ++ uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ ++ ++ /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */ ++ uint32_t dn_dbufs_count; /* count of dn_dbufs */ ++ ++ /* protected by os_lock: */ ++ list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ ++ ++ /* protected by dn_mtx: */ ++ kmutex_t dn_mtx; ++ list_t dn_dirty_records[TXG_SIZE]; ++ avl_tree_t dn_ranges[TXG_SIZE]; ++ uint64_t dn_allocated_txg; ++ uint64_t dn_free_txg; ++ uint64_t dn_assigned_txg; ++ kcondvar_t dn_notxholds; ++ enum dnode_dirtycontext dn_dirtyctx; ++ uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ ++ ++ /* protected by own devices */ ++ refcount_t dn_tx_holds; ++ refcount_t dn_holds; ++ ++ kmutex_t dn_dbufs_mtx; ++ list_t dn_dbufs; /* descendent dbufs */ ++ ++ /* protected by dn_struct_rwlock */ ++ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ ++ ++ boolean_t dn_have_spill; /* have spill or are spilling */ ++ ++ /* parent IO for current sync write */ ++ zio_t *dn_zio; ++ ++ /* used in syncing context */ ++ uint64_t dn_oldused; /* old phys used bytes */ ++ uint64_t dn_oldflags; /* old phys dn_flags */ ++ uint64_t dn_olduid, dn_oldgid; ++ uint64_t dn_newuid, dn_newgid; ++ int dn_id_flags; ++ ++ /* holds prefetch structure */ ++ struct zfetch dn_zfetch; ++} dnode_t; ++ ++/* ++ * Adds a level of indirection between the dbuf and the dnode to avoid ++ * iterating descendent dbufs in dnode_move(). Handles are not allocated ++ * individually, but as an array of child dnodes in dnode_hold_impl(). ++ */ ++typedef struct dnode_handle { ++ /* Protects dnh_dnode from modification by dnode_move(). 
*/ ++ zrlock_t dnh_zrlock; ++ dnode_t *dnh_dnode; ++} dnode_handle_t; ++ ++typedef struct dnode_children { ++ size_t dnc_count; /* number of children */ ++ dnode_handle_t dnc_children[1]; /* sized dynamically */ ++} dnode_children_t; ++ ++typedef struct free_range { ++ avl_node_t fr_node; ++ uint64_t fr_blkid; ++ uint64_t fr_nblks; ++} free_range_t; ++ ++dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp, ++ uint64_t object, dnode_handle_t *dnh); ++void dnode_special_close(dnode_handle_t *dnh); ++ ++void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); ++void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx); ++void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx); ++ ++int dnode_hold(struct objset *dd, uint64_t object, ++ void *ref, dnode_t **dnp); ++int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, ++ void *ref, dnode_t **dnp); ++boolean_t dnode_add_ref(dnode_t *dn, void *ref); ++void dnode_rele(dnode_t *dn, void *ref); ++void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); ++void dnode_sync(dnode_t *dn, dmu_tx_t *tx); ++void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); ++void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); ++void dnode_free(dnode_t *dn, dmu_tx_t *tx); ++void dnode_byteswap(dnode_phys_t *dnp); ++void dnode_buf_byteswap(void *buf, size_t size); ++void dnode_verify(dnode_t *dn); ++int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); ++uint64_t dnode_current_max_length(dnode_t *dn); ++void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); ++void dnode_clear_range(dnode_t *dn, uint64_t blkid, ++ uint64_t nblks, dmu_tx_t *tx); ++void dnode_diduse_space(dnode_t *dn, int64_t space); ++void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx); ++void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); ++uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); ++void dnode_init(void); ++void dnode_fini(void); ++int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, ++ int minlvl, uint64_t blkfill, uint64_t txg); ++void dnode_evict_dbufs(dnode_t *dn); ++ ++#ifdef ZFS_DEBUG ++ ++/* ++ * There should be a ## between the string literal and fmt, to make it ++ * clear that we're joining two strings together, but that piece of shit ++ * gcc doesn't support that preprocessor token. ++ */ ++#define dprintf_dnode(dn, fmt, ...) do { \ ++ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ ++ char __db_buf[32]; \ ++ uint64_t __db_obj = (dn)->dn_object; \ ++ if (__db_obj == DMU_META_DNODE_OBJECT) \ ++ (void) strcpy(__db_buf, "mdn"); \ ++ else \ ++ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ ++ (u_longlong_t)__db_obj);\ ++ dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \ ++ __db_buf, __VA_ARGS__); \ ++ } \ ++_NOTE(CONSTCOND) } while (0) ++ ++#define DNODE_VERIFY(dn) dnode_verify(dn) ++#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx) ++ ++#else ++ ++#define dprintf_dnode(db, fmt, ...) 
++#define DNODE_VERIFY(dn) ++#define FREE_VERIFY(db, start, end, tx) ++ ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DNODE_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dsl_dataset.h linux-3.2.33-go/include/zfs/sys/dsl_dataset.h +--- linux-3.2.33-go.orig/include/zfs/sys/dsl_dataset.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dsl_dataset.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,293 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ * Copyright (c) 2012, Joyent, Inc. All rights reserved. ++ */ ++ ++#ifndef _SYS_DSL_DATASET_H ++#define _SYS_DSL_DATASET_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct dsl_dataset; ++struct dsl_dir; ++struct dsl_pool; ++ ++#define DS_FLAG_INCONSISTENT (1ULL<<0) ++#define DS_IS_INCONSISTENT(ds) \ ++ ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) ++/* ++ * NB: nopromote can not yet be set, but we want support for it in this ++ * on-disk version, so that we don't need to upgrade for it later. It ++ * will be needed when we implement 'zfs split' (where the split off ++ * clone should not be promoted). ++ */ ++#define DS_FLAG_NOPROMOTE (1ULL<<1) ++ ++/* ++ * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly ++ * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE, ++ * refquota/refreservations). ++ */ ++#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) ++ ++/* ++ * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called ++ * on a dataset. This allows the dataset to be destroyed using 'zfs release'. ++ */ ++#define DS_FLAG_DEFER_DESTROY (1ULL<<3) ++#define DS_IS_DEFER_DESTROY(ds) \ ++ ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY) ++ ++/* ++ * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose ++ * name lookups should be performed case-insensitively. 
++ */ ++#define DS_FLAG_CI_DATASET (1ULL<<16) ++ ++typedef struct dsl_dataset_phys { ++ uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */ ++ uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */ ++ uint64_t ds_prev_snap_txg; ++ uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */ ++ uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */ ++ uint64_t ds_num_children; /* clone/snap children; ==0 for head */ ++ uint64_t ds_creation_time; /* seconds since 1970 */ ++ uint64_t ds_creation_txg; ++ uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */ ++ uint64_t ds_used_bytes; ++ uint64_t ds_compressed_bytes; ++ uint64_t ds_uncompressed_bytes; ++ uint64_t ds_unique_bytes; /* only relevant to snapshots */ ++ /* ++ * The ds_fsid_guid is a 56-bit ID that can change to avoid ++ * collisions. The ds_guid is a 64-bit ID that will never ++ * change, so there is a small probability that it will collide. ++ */ ++ uint64_t ds_fsid_guid; ++ uint64_t ds_guid; ++ uint64_t ds_flags; /* DS_FLAG_* */ ++ blkptr_t ds_bp; ++ uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ ++ uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ ++ uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */ ++ uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */ ++} dsl_dataset_phys_t; ++ ++typedef struct dsl_dataset { ++ /* Immutable: */ ++ struct dsl_dir *ds_dir; ++ dsl_dataset_phys_t *ds_phys; ++ dmu_buf_t *ds_dbuf; ++ uint64_t ds_object; ++ uint64_t ds_fsid_guid; ++ ++ /* only used in syncing context, only valid for non-snapshots: */ ++ struct dsl_dataset *ds_prev; ++ ++ /* has internal locking: */ ++ dsl_deadlist_t ds_deadlist; ++ bplist_t ds_pending_deadlist; ++ ++ /* to protect against multiple concurrent incremental recv */ ++ kmutex_t ds_recvlock; ++ ++ /* protected by lock on pool's dp_dirty_datasets list */ ++ txg_node_t ds_dirty_link; ++ list_node_t ds_synced_link; ++ ++ /* ++ * ds_phys->ds_ is also protected by ds_lock. ++ * Protected by ds_lock: ++ */ ++ kmutex_t ds_lock; ++ objset_t *ds_objset; ++ uint64_t ds_userrefs; ++ ++ /* ++ * ds_owner is protected by the ds_rwlock and the ds_lock ++ */ ++ krwlock_t ds_rwlock; ++ kcondvar_t ds_exclusive_cv; ++ void *ds_owner; ++ ++ /* no locking; only for making guesses */ ++ uint64_t ds_trysnap_txg; ++ ++ /* for objset_open() */ ++ kmutex_t ds_opening_lock; ++ ++ uint64_t ds_reserved; /* cached refreservation */ ++ uint64_t ds_quota; /* cached refquota */ ++ ++ kmutex_t ds_sendstream_lock; ++ list_t ds_sendstreams; ++ ++ /* Protected by ds_lock; keep at end of struct for better locality */ ++ char ds_snapname[MAXNAMELEN]; ++} dsl_dataset_t; ++ ++struct dsl_ds_destroyarg { ++ dsl_dataset_t *ds; /* ds to destroy */ ++ dsl_dataset_t *rm_origin; /* also remove our origin? */ ++ boolean_t is_origin_rm; /* set if removing origin snap */ ++ boolean_t defer; /* destroy -d requested? */ ++ boolean_t releasing; /* destroying due to release? */ ++ boolean_t need_prep; /* do we need to retry due to EBUSY? */ ++}; ++ ++/* ++ * The max length of a temporary tag prefix is the number of hex digits ++ * required to express UINT64_MAX plus one for the hyphen. 
++ */ ++#define MAX_TAG_PREFIX_LEN 17 ++ ++struct dsl_ds_holdarg { ++ dsl_sync_task_group_t *dstg; ++ char *htag; ++ char *snapname; ++ boolean_t recursive; ++ boolean_t gotone; ++ boolean_t temphold; ++ char failed[MAXPATHLEN]; ++}; ++ ++#define dsl_dataset_is_snapshot(ds) \ ++ ((ds)->ds_phys->ds_num_children != 0) ++ ++#define DS_UNIQUE_IS_ACCURATE(ds) \ ++ (((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) ++ ++int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp); ++int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, ++ void *tag, dsl_dataset_t **); ++int dsl_dataset_own(const char *name, boolean_t inconsistentok, ++ void *tag, dsl_dataset_t **dsp); ++int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, ++ boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp); ++void dsl_dataset_name(dsl_dataset_t *ds, char *name); ++void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); ++void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); ++void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag); ++boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, ++ void *tag); ++void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag); ++void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, ++ minor_t minor); ++uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, ++ dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); ++uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, ++ uint64_t flags, dmu_tx_t *tx); ++int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer); ++int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); ++dsl_checkfunc_t dsl_dataset_destroy_check; ++dsl_syncfunc_t dsl_dataset_destroy_sync; ++dsl_checkfunc_t dsl_dataset_snapshot_check; ++dsl_syncfunc_t dsl_dataset_snapshot_sync; ++dsl_syncfunc_t dsl_dataset_user_hold_sync; ++int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); ++int dsl_dataset_promote(const char *name, char *conflsnap); ++int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, ++ boolean_t force); ++int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, ++ boolean_t recursive, boolean_t temphold, int cleanup_fd); ++int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, ++ boolean_t temphold); ++int dsl_dataset_user_release(char *dsname, char *snapname, char *htag, ++ boolean_t recursive); ++int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj, ++ char *htag, boolean_t retry); ++int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp); ++ ++blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); ++void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); ++ ++spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); ++ ++boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds); ++ ++void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); ++ ++void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, ++ dmu_tx_t *tx); ++int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, ++ dmu_tx_t *tx, boolean_t async); ++boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, ++ uint64_t blk_birth); ++uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); ++ ++void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); ++void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv); ++void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat); ++void 
dsl_dataset_space(dsl_dataset_t *ds, ++ uint64_t *refdbytesp, uint64_t *availbytesp, ++ uint64_t *usedobjsp, uint64_t *availobjsp); ++uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); ++int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, ++ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); ++int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last, ++ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); ++boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds); ++ ++int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); ++ ++int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, ++ uint64_t asize, uint64_t inflight, uint64_t *used, ++ uint64_t *ref_rsrv); ++int dsl_dataset_set_quota(const char *dsname, zprop_source_t source, ++ uint64_t quota); ++dsl_syncfunc_t dsl_dataset_set_quota_sync; ++int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, ++ uint64_t reservation); ++ ++int dsl_destroy_inconsistent(const char *dsname, void *arg); ++ ++#ifdef ZFS_DEBUG ++#define dprintf_ds(ds, fmt, ...) do { \ ++ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ ++ char *__ds_name = kmem_alloc(MAXNAMELEN, KM_PUSHPAGE); \ ++ dsl_dataset_name(ds, __ds_name); \ ++ dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \ ++ kmem_free(__ds_name, MAXNAMELEN); \ ++ } \ ++_NOTE(CONSTCOND) } while (0) ++#else ++#define dprintf_ds(dd, fmt, ...) ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DSL_DATASET_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dsl_deadlist.h linux-3.2.33-go/include/zfs/sys/dsl_deadlist.h +--- linux-3.2.33-go.orig/include/zfs/sys/dsl_deadlist.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dsl_deadlist.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,87 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#ifndef _SYS_DSL_DEADLIST_H ++#define _SYS_DSL_DEADLIST_H ++ ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct dmu_buf; ++struct dsl_dataset; ++ ++typedef struct dsl_deadlist_phys { ++ uint64_t dl_used; ++ uint64_t dl_comp; ++ uint64_t dl_uncomp; ++ uint64_t dl_pad[37]; /* pad out to 320b for future expansion */ ++} dsl_deadlist_phys_t; ++ ++typedef struct dsl_deadlist { ++ objset_t *dl_os; ++ uint64_t dl_object; ++ avl_tree_t dl_tree; ++ boolean_t dl_havetree; ++ struct dmu_buf *dl_dbuf; ++ dsl_deadlist_phys_t *dl_phys; ++ kmutex_t dl_lock; ++ ++ /* if it's the old on-disk format: */ ++ bpobj_t dl_bpobj; ++ boolean_t dl_oldfmt; ++} dsl_deadlist_t; ++ ++typedef struct dsl_deadlist_entry { ++ avl_node_t dle_node; ++ uint64_t dle_mintxg; ++ bpobj_t dle_bpobj; ++} dsl_deadlist_entry_t; ++ ++void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); ++void dsl_deadlist_close(dsl_deadlist_t *dl); ++uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx); ++void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx); ++void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx); ++void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); ++void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); ++uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, ++ uint64_t mrs_obj, dmu_tx_t *tx); ++void dsl_deadlist_space(dsl_deadlist_t *dl, ++ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); ++void dsl_deadlist_space_range(dsl_deadlist_t *dl, ++ uint64_t mintxg, uint64_t maxtxg, ++ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); ++void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); ++void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, ++ dmu_tx_t *tx); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DSL_DEADLIST_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dsl_deleg.h linux-3.2.33-go/include/zfs/sys/dsl_deleg.h +--- linux-3.2.33-go.orig/include/zfs/sys/dsl_deleg.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dsl_deleg.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,80 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. 
++ */ ++ ++#ifndef _SYS_DSL_DELEG_H ++#define _SYS_DSL_DELEG_H ++ ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#define ZFS_DELEG_PERM_NONE "" ++#define ZFS_DELEG_PERM_CREATE "create" ++#define ZFS_DELEG_PERM_DESTROY "destroy" ++#define ZFS_DELEG_PERM_SNAPSHOT "snapshot" ++#define ZFS_DELEG_PERM_ROLLBACK "rollback" ++#define ZFS_DELEG_PERM_CLONE "clone" ++#define ZFS_DELEG_PERM_PROMOTE "promote" ++#define ZFS_DELEG_PERM_RENAME "rename" ++#define ZFS_DELEG_PERM_MOUNT "mount" ++#define ZFS_DELEG_PERM_SHARE "share" ++#define ZFS_DELEG_PERM_SEND "send" ++#define ZFS_DELEG_PERM_RECEIVE "receive" ++#define ZFS_DELEG_PERM_ALLOW "allow" ++#define ZFS_DELEG_PERM_USERPROP "userprop" ++#define ZFS_DELEG_PERM_VSCAN "vscan" ++#define ZFS_DELEG_PERM_USERQUOTA "userquota" ++#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" ++#define ZFS_DELEG_PERM_USERUSED "userused" ++#define ZFS_DELEG_PERM_GROUPUSED "groupused" ++#define ZFS_DELEG_PERM_HOLD "hold" ++#define ZFS_DELEG_PERM_RELEASE "release" ++#define ZFS_DELEG_PERM_DIFF "diff" ++ ++/* ++ * Note: the names of properties that are marked delegatable are also ++ * valid delegated permissions ++ */ ++ ++int dsl_deleg_get(const char *ddname, nvlist_t **nvp); ++int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset); ++int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); ++int dsl_deleg_access_impl(struct dsl_dataset *ds, boolean_t descendent, ++ const char *perm, cred_t *cr); ++void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr); ++int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr); ++int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr); ++int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx); ++boolean_t dsl_delegation_on(objset_t *os); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DSL_DELEG_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dsl_dir.h linux-3.2.33-go/include/zfs/sys/dsl_dir.h +--- linux-3.2.33-go.orig/include/zfs/sys/dsl_dir.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dsl_dir.h 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,167 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#ifndef _SYS_DSL_DIR_H ++#define _SYS_DSL_DIR_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct dsl_dataset; ++ ++typedef enum dd_used { ++ DD_USED_HEAD, ++ DD_USED_SNAP, ++ DD_USED_CHILD, ++ DD_USED_CHILD_RSRV, ++ DD_USED_REFRSRV, ++ DD_USED_NUM ++} dd_used_t; ++ ++#define DD_FLAG_USED_BREAKDOWN (1<<0) ++ ++typedef struct dsl_dir_phys { ++ uint64_t dd_creation_time; /* not actually used */ ++ uint64_t dd_head_dataset_obj; ++ uint64_t dd_parent_obj; ++ uint64_t dd_origin_obj; ++ uint64_t dd_child_dir_zapobj; ++ /* ++ * how much space our children are accounting for; for leaf ++ * datasets, == physical space used by fs + snaps ++ */ ++ uint64_t dd_used_bytes; ++ uint64_t dd_compressed_bytes; ++ uint64_t dd_uncompressed_bytes; ++ /* Administrative quota setting */ ++ uint64_t dd_quota; ++ /* Administrative reservation setting */ ++ uint64_t dd_reserved; ++ uint64_t dd_props_zapobj; ++ uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ ++ uint64_t dd_flags; ++ uint64_t dd_used_breakdown[DD_USED_NUM]; ++ uint64_t dd_clones; /* dsl_dir objects */ ++ uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */ ++} dsl_dir_phys_t; ++ ++struct dsl_dir { ++ /* These are immutable; no lock needed: */ ++ uint64_t dd_object; ++ dsl_dir_phys_t *dd_phys; ++ dmu_buf_t *dd_dbuf; ++ dsl_pool_t *dd_pool; ++ ++ /* protected by lock on pool's dp_dirty_dirs list */ ++ txg_node_t dd_dirty_link; ++ ++ /* protected by dp_config_rwlock */ ++ dsl_dir_t *dd_parent; ++ ++ /* Protected by dd_lock */ ++ kmutex_t dd_lock; ++ list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ ++ timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */ ++ uint64_t dd_origin_txg; ++ ++ /* gross estimate of space used by in-flight tx's */ ++ uint64_t dd_tempreserved[TXG_SIZE]; ++ /* amount of space we expect to write; == amount of dirty data */ ++ int64_t dd_space_towrite[TXG_SIZE]; ++ ++ /* protected by dd_lock; keep at end of struct for better locality */ ++ char dd_myname[MAXNAMELEN]; ++}; ++ ++void dsl_dir_close(dsl_dir_t *dd, void *tag); ++int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail); ++int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **, ++ const char **tailp); ++int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, ++ const char *tail, void *tag, dsl_dir_t **); ++void dsl_dir_name(dsl_dir_t *dd, char *buf); ++int dsl_dir_namelen(dsl_dir_t *dd); ++uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, ++ const char *name, dmu_tx_t *tx); ++dsl_checkfunc_t dsl_dir_destroy_check; ++dsl_syncfunc_t dsl_dir_destroy_sync; ++void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); ++uint64_t dsl_dir_space_available(dsl_dir_t *dd, ++ dsl_dir_t *ancestor, int64_t delta, int ondiskonly); ++void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); ++void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); ++int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, ++ uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, ++ dmu_tx_t *tx); ++void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); ++void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); ++void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, ++ int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); ++void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, ++ dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); ++int dsl_dir_set_quota(const char *ddname, zprop_source_t 
source, ++ uint64_t quota); ++int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, ++ uint64_t reservation); ++int dsl_dir_rename(dsl_dir_t *dd, const char *newname); ++int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); ++int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); ++boolean_t dsl_dir_is_clone(dsl_dir_t *dd); ++void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, ++ uint64_t reservation, cred_t *cr, dmu_tx_t *tx); ++void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); ++timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); ++ ++/* internal reserved dir name */ ++#define MOS_DIR_NAME "$MOS" ++#define ORIGIN_DIR_NAME "$ORIGIN" ++#define XLATION_DIR_NAME "$XLATION" ++#define FREE_DIR_NAME "$FREE" ++ ++#ifdef ZFS_DEBUG ++#define dprintf_dd(dd, fmt, ...) do { \ ++ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ ++ char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \ ++ KM_PUSHPAGE); \ ++ dsl_dir_name(dd, __ds_name); \ ++ dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \ ++ kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \ ++ } \ ++_NOTE(CONSTCOND) } while (0) ++#else ++#define dprintf_dd(dd, fmt, ...) ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DSL_DIR_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dsl_pool.h linux-3.2.33-go/include/zfs/sys/dsl_pool.h +--- linux-3.2.33-go.orig/include/zfs/sys/dsl_pool.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dsl_pool.h 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,166 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_DSL_POOL_H ++#define _SYS_DSL_POOL_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct objset; ++struct dsl_dir; ++struct dsl_dataset; ++struct dsl_pool; ++struct dmu_tx; ++struct dsl_scan; ++ ++/* These macros are for indexing into the zfs_all_blkstats_t. 
*/ ++#define DMU_OT_DEFERRED DMU_OT_NONE ++#define DMU_OT_TOTAL DMU_OT_NUMTYPES ++ ++typedef struct zfs_blkstat { ++ uint64_t zb_count; ++ uint64_t zb_asize; ++ uint64_t zb_lsize; ++ uint64_t zb_psize; ++ uint64_t zb_gangs; ++ uint64_t zb_ditto_2_of_2_samevdev; ++ uint64_t zb_ditto_2_of_3_samevdev; ++ uint64_t zb_ditto_3_of_3_samevdev; ++} zfs_blkstat_t; ++ ++typedef struct zfs_all_blkstats { ++ zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1]; ++} zfs_all_blkstats_t; ++ ++typedef struct txg_history { ++ kstat_txg_t th_kstat; ++ vdev_stat_t th_vs1; ++ vdev_stat_t th_vs2; ++ kmutex_t th_lock; ++ list_node_t th_link; ++} txg_history_t; ++ ++typedef struct dsl_pool { ++ /* Immutable */ ++ spa_t *dp_spa; ++ struct objset *dp_meta_objset; ++ struct dsl_dir *dp_root_dir; ++ struct dsl_dir *dp_mos_dir; ++ struct dsl_dir *dp_free_dir; ++ struct dsl_dataset *dp_origin_snap; ++ uint64_t dp_root_dir_obj; ++ struct taskq *dp_iput_taskq; ++ kstat_t *dp_txg_kstat; ++ ++ /* No lock needed - sync context only */ ++ blkptr_t dp_meta_rootbp; ++ list_t dp_synced_datasets; ++ hrtime_t dp_read_overhead; ++ uint64_t dp_throughput; /* bytes per millisec */ ++ uint64_t dp_write_limit; ++ uint64_t dp_tmp_userrefs_obj; ++ bpobj_t dp_free_bpobj; ++ ++ struct dsl_scan *dp_scan; ++ ++ /* Uses dp_lock */ ++ kmutex_t dp_lock; ++ uint64_t dp_space_towrite[TXG_SIZE]; ++ uint64_t dp_tempreserved[TXG_SIZE]; ++ uint64_t dp_txg_history_size; ++ list_t dp_txg_history; ++ ++ ++ /* Has its own locking */ ++ tx_state_t dp_tx; ++ txg_list_t dp_dirty_datasets; ++ txg_list_t dp_dirty_dirs; ++ txg_list_t dp_sync_tasks; ++ ++ /* ++ * Protects administrative changes (properties, namespace) ++ * It is only held for write in syncing context. Therefore ++ * syncing context does not need to ever have it for read, since ++ * nobody else could possibly have it for write. 
++ */ ++ krwlock_t dp_config_rwlock; ++ ++ zfs_all_blkstats_t *dp_blkstats; ++} dsl_pool_t; ++ ++int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); ++void dsl_pool_close(dsl_pool_t *dp); ++dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); ++void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); ++void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg); ++int dsl_pool_sync_context(dsl_pool_t *dp); ++uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); ++uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree); ++int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx); ++void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); ++void dsl_pool_memory_pressure(dsl_pool_t *dp); ++void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); ++void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); ++void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, ++ const blkptr_t *bpp); ++int dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf, ++ arc_done_func_t *done, void *private, int priority, int zio_flags, ++ uint32_t *arc_flags, const zbookmark_t *zb); ++int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, ++ arc_done_func_t *done, void *private, int priority, int zio_flags, ++ uint32_t *arc_flags, const zbookmark_t *zb); ++void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); ++void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); ++void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx); ++ ++taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp); ++ ++extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, ++ const char *tag, uint64_t *now, dmu_tx_t *tx); ++extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, ++ const char *tag, dmu_tx_t *tx); ++extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); ++int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); ++ ++txg_history_t *dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg); ++txg_history_t *dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg); ++void dsl_pool_txg_history_put(txg_history_t *th); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DSL_POOL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dsl_prop.h linux-3.2.33-go/include/zfs/sys/dsl_prop.h +--- linux-3.2.33-go.orig/include/zfs/sys/dsl_prop.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dsl_prop.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,119 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#ifndef _SYS_DSL_PROP_H ++#define _SYS_DSL_PROP_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct dsl_dataset; ++struct dsl_dir; ++ ++/* The callback func may not call into the DMU or DSL! */ ++typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval); ++ ++typedef struct dsl_prop_cb_record { ++ list_node_t cbr_node; /* link on dd_prop_cbs */ ++ struct dsl_dataset *cbr_ds; ++ const char *cbr_propname; ++ dsl_prop_changed_cb_t *cbr_func; ++ void *cbr_arg; ++} dsl_prop_cb_record_t; ++ ++typedef struct dsl_props_arg { ++ nvlist_t *pa_props; ++ zprop_source_t pa_source; ++} dsl_props_arg_t; ++ ++typedef struct dsl_prop_set_arg { ++ const char *psa_name; ++ zprop_source_t psa_source; ++ int psa_intsz; ++ int psa_numints; ++ const void *psa_value; ++ ++ /* ++ * Used to handle the special requirements of the quota and reservation ++ * properties. ++ */ ++ uint64_t psa_effective_value; ++} dsl_prop_setarg_t; ++ ++int dsl_prop_register(struct dsl_dataset *ds, const char *propname, ++ dsl_prop_changed_cb_t *callback, void *cbarg); ++int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, ++ dsl_prop_changed_cb_t *callback, void *cbarg); ++int dsl_prop_numcb(struct dsl_dataset *ds); ++ ++int dsl_prop_get(const char *ddname, const char *propname, ++ int intsz, int numints, void *buf, char *setpoint); ++int dsl_prop_get_integer(const char *ddname, const char *propname, ++ uint64_t *valuep, char *setpoint); ++int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); ++int dsl_prop_get_received(objset_t *os, nvlist_t **nvp); ++int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, ++ int intsz, int numints, void *buf, char *setpoint); ++int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, ++ int intsz, int numints, void *buf, char *setpoint, ++ boolean_t snapshot); ++ ++dsl_syncfunc_t dsl_props_set_sync; ++int dsl_prop_set(const char *ddname, const char *propname, ++ zprop_source_t source, int intsz, int numints, const void *buf); ++int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); ++void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, ++ dmu_tx_t *tx); ++ ++void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, ++ zprop_source_t source, uint64_t *value); ++int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa); ++#ifdef ZFS_DEBUG ++void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa); ++#define DSL_PROP_CHECK_PREDICTION(dd, psa) \ ++ dsl_prop_check_prediction((dd), (psa)) ++#else ++#define DSL_PROP_CHECK_PREDICTION(dd, psa) /* nothing */ ++#endif ++ ++/* flag first receive on or after SPA_VERSION_RECVD_PROPS */ ++boolean_t dsl_prop_get_hasrecvd(objset_t *os); ++void dsl_prop_set_hasrecvd(objset_t *os); ++void dsl_prop_unset_hasrecvd(objset_t *os); ++ ++void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); ++void dsl_prop_nvlist_add_string(nvlist_t *nv, ++ zfs_prop_t prop, const char *value); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DSL_PROP_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dsl_scan.h linux-3.2.33-go/include/zfs/sys/dsl_scan.h +--- linux-3.2.33-go.orig/include/zfs/sys/dsl_scan.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dsl_scan.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,108 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and 
Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_DSL_SCAN_H ++#define _SYS_DSL_SCAN_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct objset; ++struct dsl_dir; ++struct dsl_dataset; ++struct dsl_pool; ++struct dmu_tx; ++ ++/* ++ * All members of this structure must be uint64_t, for byteswap ++ * purposes. ++ */ ++typedef struct dsl_scan_phys { ++ uint64_t scn_func; /* pool_scan_func_t */ ++ uint64_t scn_state; /* dsl_scan_state_t */ ++ uint64_t scn_queue_obj; ++ uint64_t scn_min_txg; ++ uint64_t scn_max_txg; ++ uint64_t scn_cur_min_txg; ++ uint64_t scn_cur_max_txg; ++ uint64_t scn_start_time; ++ uint64_t scn_end_time; ++ uint64_t scn_to_examine; /* total bytes to be scanned */ ++ uint64_t scn_examined; /* bytes scanned so far */ ++ uint64_t scn_to_process; ++ uint64_t scn_processed; ++ uint64_t scn_errors; /* scan I/O error count */ ++ uint64_t scn_ddt_class_max; ++ ddt_bookmark_t scn_ddt_bookmark; ++ zbookmark_t scn_bookmark; ++ uint64_t scn_flags; /* dsl_scan_flags_t */ ++} dsl_scan_phys_t; ++ ++#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t)) ++ ++typedef enum dsl_scan_flags { ++ DSF_VISIT_DS_AGAIN = 1<<0, ++} dsl_scan_flags_t; ++ ++typedef struct dsl_scan { ++ struct dsl_pool *scn_dp; ++ ++ boolean_t scn_pausing; ++ uint64_t scn_restart_txg; ++ uint64_t scn_sync_start_time; ++ zio_t *scn_zio_root; ++ ++ /* for debugging / information */ ++ uint64_t scn_visited_this_txg; ++ ++ dsl_scan_phys_t scn_phys; ++} dsl_scan_t; ++ ++int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); ++void dsl_scan_fini(struct dsl_pool *dp); ++void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); ++int dsl_scan_cancel(struct dsl_pool *); ++int dsl_scan(struct dsl_pool *, pool_scan_func_t); ++void dsl_resilver_restart(struct dsl_pool *, uint64_t txg); ++boolean_t dsl_scan_resilvering(struct dsl_pool *dp); ++boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); ++void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ++ ddt_entry_t *dde, dmu_tx_t *tx); ++void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); ++void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); ++void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, ++ struct dmu_tx *tx); ++boolean_t dsl_scan_active(dsl_scan_t *scn); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DSL_SCAN_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/dsl_synctask.h linux-3.2.33-go/include/zfs/sys/dsl_synctask.h +--- linux-3.2.33-go.orig/include/zfs/sys/dsl_synctask.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/dsl_synctask.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,79 @@ ++/* ++ * CDDL HEADER 
START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_DSL_SYNCTASK_H ++#define _SYS_DSL_SYNCTASK_H ++ ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct dsl_pool; ++ ++typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); ++typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *); ++ ++typedef struct dsl_sync_task { ++ list_node_t dst_node; ++ dsl_checkfunc_t *dst_checkfunc; ++ dsl_syncfunc_t *dst_syncfunc; ++ void *dst_arg1; ++ void *dst_arg2; ++ int dst_err; ++} dsl_sync_task_t; ++ ++typedef struct dsl_sync_task_group { ++ txg_node_t dstg_node; ++ list_t dstg_tasks; ++ struct dsl_pool *dstg_pool; ++ uint64_t dstg_txg; ++ int dstg_err; ++ int dstg_space; ++ boolean_t dstg_nowaiter; ++} dsl_sync_task_group_t; ++ ++dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp); ++void dsl_sync_task_create(dsl_sync_task_group_t *dstg, ++ dsl_checkfunc_t *, dsl_syncfunc_t *, ++ void *arg1, void *arg2, int blocks_modified); ++int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg); ++void dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); ++void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg); ++void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); ++ ++int dsl_sync_task_do(struct dsl_pool *dp, ++ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, ++ void *arg1, void *arg2, int blocks_modified); ++void dsl_sync_task_do_nowait(struct dsl_pool *dp, ++ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, ++ void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_DSL_SYNCTASK_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/efi_partition.h linux-3.2.33-go/include/zfs/sys/efi_partition.h +--- linux-3.2.33-go.orig/include/zfs/sys/efi_partition.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/efi_partition.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,244 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_EFI_PARTITION_H ++#define _SYS_EFI_PARTITION_H ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * GUID Partition Table Header ++ */ ++ ++#define EFI_MIN_LABEL_SIZE 92 ++#define EFI_LABEL_SIZE 512 ++#define LEN_EFI_PAD (EFI_LABEL_SIZE - \ ++ ((5 * sizeof (diskaddr_t)) + \ ++ (7 * sizeof (uint_t)) + \ ++ (8 * sizeof (char)) + \ ++ (1 * (sizeof (struct uuid))))) ++ ++#define EFI_SIGNATURE 0x5452415020494645ULL ++ ++/* EFI Guid Partition Table Header -- little endian on-disk format */ ++typedef struct efi_gpt { ++ uint64_t efi_gpt_Signature; ++ uint_t efi_gpt_Revision; ++ uint_t efi_gpt_HeaderSize; ++ uint_t efi_gpt_HeaderCRC32; ++ uint_t efi_gpt_Reserved1; ++ diskaddr_t efi_gpt_MyLBA; ++ diskaddr_t efi_gpt_AlternateLBA; ++ diskaddr_t efi_gpt_FirstUsableLBA; ++ diskaddr_t efi_gpt_LastUsableLBA; ++ struct uuid efi_gpt_DiskGUID; ++ diskaddr_t efi_gpt_PartitionEntryLBA; ++ uint_t efi_gpt_NumberOfPartitionEntries; ++ uint_t efi_gpt_SizeOfPartitionEntry; ++ uint_t efi_gpt_PartitionEntryArrayCRC32; ++ char efi_gpt_Reserved2[LEN_EFI_PAD]; ++} efi_gpt_t; ++ ++/* EFI Guid Partition Entry Attributes -- little endian format */ ++typedef struct efi_gpe_Attrs { ++ uint32_t PartitionAttrs :16, ++ Reserved2 :16; ++ uint32_t Reserved1 :31, ++ RequiredPartition :1; ++} efi_gpe_Attrs_t; ++ ++/* ++ * 6a96237f-1dd2-11b2-99a6-080020736631 V_UNASSIGNED (not used as such) ++ * 6a82cb45-1dd2-11b2-99a6-080020736631 V_BOOT ++ * 6a85cf4d-1dd2-11b2-99a6-080020736631 V_ROOT ++ * 6a87c46f-1dd2-11b2-99a6-080020736631 V_SWAP ++ * 6a898cc3-1dd2-11b2-99a6-080020736631 V_USR ++ * 6a8b642b-1dd2-11b2-99a6-080020736631 V_BACKUP ++ * 6a8d2ac7-1dd2-11b2-99a6-080020736631 V_STAND (not used) ++ * 6a8ef2e9-1dd2-11b2-99a6-080020736631 V_VAR ++ * 6a90ba39-1dd2-11b2-99a6-080020736631 V_HOME ++ * 6a9283a5-1dd2-11b2-99a6-080020736631 V_ALTSCTR ++ * 6a945a3b-1dd2-11b2-99a6-080020736631 V_CACHE ++ */ ++ ++#define EFI_UNUSED { 0x00000000, 0x0000, 0x0000, 0x00, 0x00, \ ++ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } } ++#define EFI_RESV1 { 0x6a96237f, 0x1dd2, 0x11b2, 0x99, 0xa6, \ ++ { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } ++#define EFI_BOOT { 0x6a82cb45, 0x1dd2, 0x11b2, 0x99, 0xa6, \ ++ { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } ++#define EFI_ROOT { 0x6a85cf4d, 0x1dd2, 0x11b2, 0x99, 0xa6, \ ++ { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } ++#define EFI_SWAP { 0x6a87c46f, 0x1dd2, 0x11b2, 0x99, 0xa6, \ ++ { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } ++#define EFI_USR { 0x6a898cc3, 0x1dd2, 0x11b2, 0x99, 0xa6, \ ++ { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } ++#define EFI_BACKUP { 0x6a8b642b, 0x1dd2, 0x11b2, 0x99, 0xa6, \ ++ { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } ++#define EFI_RESV2 { 0x6a8d2ac7, 0x1dd2, 0x11b2, 0x99, 0xa6, \ ++ { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } ++#define EFI_VAR { 0x6a8ef2e9, 0x1dd2, 0x11b2, 0x99, 0xa6, \ ++ { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } ++#define EFI_HOME { 0x6a90ba39, 0x1dd2, 0x11b2, 0x99, 0xa6, \ ++ { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } ++#define EFI_ALTSCTR { 0x6a9283a5, 0x1dd2, 0x11b2, 0x99, 0xa6, \ ++ { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } ++#define EFI_RESERVED { 0x6a945a3b, 0x1dd2, 0x11b2, 0x99, 0xa6, \ ++ { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 
} } ++#define EFI_SYSTEM { 0xC12A7328, 0xF81F, 0x11d2, 0xBA, 0x4B, \ ++ { 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B } } ++#define EFI_LEGACY_MBR { 0x024DEE41, 0x33E7, 0x11d3, 0x9D, 0x69, \ ++ { 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F } } ++#define EFI_SYMC_PUB { 0x6a9630d1, 0x1dd2, 0x11b2, 0x99, 0xa6, \ ++ { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } ++#define EFI_SYMC_CDS { 0x6a980767, 0x1dd2, 0x11b2, 0x99, 0xa6, \ ++ { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } ++#define EFI_MSFT_RESV { 0xE3C9E316, 0x0B5C, 0x4DB8, 0x81, 0x7D, \ ++ { 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE } } ++#define EFI_DELL_BASIC { 0xebd0a0a2, 0xb9e5, 0x4433, 0x87, 0xc0, \ ++ { 0x68, 0xb6, 0xb7, 0x26, 0x99, 0xc7 } } ++#define EFI_DELL_RAID { 0xa19d880f, 0x05fc, 0x4d3b, 0xa0, 0x06, \ ++ { 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e } } ++#define EFI_DELL_SWAP { 0x0657fd6d, 0xa4ab, 0x43c4, 0x84, 0xe5, \ ++ { 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f } } ++#define EFI_DELL_LVM { 0xe6d6d379, 0xf507, 0x44c2, 0xa2, 0x3c, \ ++ { 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28 } } ++#define EFI_DELL_RESV { 0x8da63339, 0x0007, 0x60c0, 0xc4, 0x36, \ ++ { 0x08, 0x3a, 0xc8, 0x23, 0x09, 0x08 } } ++#define EFI_AAPL_HFS { 0x48465300, 0x0000, 0x11aa, 0xaa, 0x11, \ ++ { 0x00, 0x30, 0x65, 0x43, 0xec, 0xac } } ++#define EFI_AAPL_UFS { 0x55465300, 0x0000, 0x11aa, 0xaa, 0x11, \ ++ { 0x00, 0x30, 0x65, 0x43, 0xec, 0xac } } ++ ++/* minimum # of bytes for partition table entires, per EFI spec */ ++#define EFI_MIN_ARRAY_SIZE (16 * 1024) ++ ++#define EFI_PART_NAME_LEN 36 ++ ++/* size of the "reserved" partition, in blocks */ ++#define EFI_MIN_RESV_SIZE (16 * 1024) ++ ++/* EFI Guid Partition Entry */ ++typedef struct efi_gpe { ++ struct uuid efi_gpe_PartitionTypeGUID; ++ struct uuid efi_gpe_UniquePartitionGUID; ++ diskaddr_t efi_gpe_StartingLBA; ++ diskaddr_t efi_gpe_EndingLBA; ++ efi_gpe_Attrs_t efi_gpe_Attributes; ++ ushort_t efi_gpe_PartitionName[EFI_PART_NAME_LEN]; ++} efi_gpe_t; ++ ++/* ++ * passed to the useful (we hope) routines (efi_alloc_and_read and ++ * efi_write) that take this VTOC-like struct. These routines handle ++ * converting this struct into the EFI struct, generate UUIDs and ++ * checksums, and perform any necessary byte-swapping to the on-disk ++ * format. 
++ */ ++/* Solaris library abstraction for EFI partitons */ ++typedef struct dk_part { ++ diskaddr_t p_start; /* starting LBA */ ++ diskaddr_t p_size; /* size in blocks */ ++ struct uuid p_guid; /* partion type GUID */ ++ ushort_t p_tag; /* converted to part'n type GUID */ ++ ushort_t p_flag; /* attributes */ ++ char p_name[EFI_PART_NAME_LEN]; /* partition name */ ++ struct uuid p_uguid; /* unique partition GUID */ ++ uint_t p_resv[8]; /* future use - set to zero */ ++} dk_part_t; ++ ++/* Solaris library abstraction for an EFI GPT */ ++#define EFI_VERSION102 0x00010002 ++#define EFI_VERSION100 0x00010000 ++#define EFI_VERSION_CURRENT EFI_VERSION100 ++typedef struct dk_gpt { ++ uint_t efi_version; /* set to EFI_VERSION_CURRENT */ ++ uint_t efi_nparts; /* number of partitions below */ ++ uint_t efi_part_size; /* size of each partition entry */ ++ /* efi_part_size is unused */ ++ uint_t efi_lbasize; /* size of block in bytes */ ++ diskaddr_t efi_last_lba; /* last block on the disk */ ++ diskaddr_t efi_first_u_lba; /* first block after labels */ ++ diskaddr_t efi_last_u_lba; /* last block before backup labels */ ++ struct uuid efi_disk_uguid; /* unique disk GUID */ ++ uint_t efi_flags; ++ uint_t efi_reserved1; /* future use - set to zero */ ++ diskaddr_t efi_altern_lba; /* lba of alternate GPT header */ ++ uint_t efi_reserved[12]; /* future use - set to zero */ ++ struct dk_part efi_parts[1]; /* array of partitions */ ++} dk_gpt_t; ++ ++/* possible values for "efi_flags" */ ++#define EFI_GPT_PRIMARY_CORRUPT 0x1 /* primary label corrupt */ ++ ++/* the private ioctl between libefi and the driver */ ++typedef struct dk_efi { ++ diskaddr_t dki_lba; /* starting block */ ++ len_t dki_length; /* length in bytes */ ++ union { ++ efi_gpt_t *_dki_data; ++ uint64_t _dki_data_64; ++ } dki_un; ++#define dki_data dki_un._dki_data ++#define dki_data_64 dki_un._dki_data_64 ++} dk_efi_t; ++ ++struct partition64 { ++ struct uuid p_type; ++ uint_t p_partno; ++ uint_t p_resv1; ++ diskaddr_t p_start; ++ diskaddr_t p_size; ++}; ++ ++/* ++ * Number of EFI partitions ++ */ ++#if defined(__linux__) ++#define EFI_NUMPAR 128 /* Expected by parted-1.8.1 */ ++#else ++#define EFI_NUMPAR 9 ++#endif ++ ++#ifndef _KERNEL ++extern int efi_alloc_and_init(int, uint32_t, struct dk_gpt **); ++extern int efi_alloc_and_read(int, struct dk_gpt **); ++extern int efi_write(int, struct dk_gpt *); ++extern int efi_rescan(int); ++extern void efi_free(struct dk_gpt *); ++extern int efi_type(int); ++extern void efi_err_check(struct dk_gpt *); ++extern int efi_auto_sense(int fd, struct dk_gpt **); ++extern int efi_use_whole_disk(int fd); ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_EFI_PARTITION_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/fm/fs/Makefile linux-3.2.33-go/include/zfs/sys/fm/fs/Makefile +--- linux-3.2.33-go.orig/include/zfs/sys/fm/fs/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/fm/fs/Makefile 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,659 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. ++# include/sys/fm/fs/Makefile. Generated from Makefile.in by configure. ++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. 
++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. ++ ++ ++ ++ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/zfs ++pkgincludedir = $(includedir)/zfs ++pkglibdir = $(libdir)/zfs ++pkglibexecdir = $(libexecdir)/zfs ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = x86_64-unknown-linux-gnu ++host_triplet = x86_64-unknown-linux-gnu ++target_triplet = x86_64-unknown-linux-gnu ++subdir = include/sys/fm/fs ++DIST_COMMON = $(am__kernel_HEADERS_DIST) $(am__libzfs_HEADERS_DIST) \ ++ $(srcdir)/Makefile.am $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = \ ++ $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \ ++ $(top_srcdir)/config/kernel-automount.m4 \ ++ $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \ ++ $(top_srcdir)/config/kernel-bdev-logical-size.m4 \ ++ $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \ ++ $(top_srcdir)/config/kernel-bdi.m4 \ ++ $(top_srcdir)/config/kernel-bio-empty-barrier.m4 \ ++ $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \ ++ $(top_srcdir)/config/kernel-bio-failfast.m4 \ ++ $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \ ++ $(top_srcdir)/config/kernel-blk-end-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-fetch-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-discard.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-flush.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \ ++ $(top_srcdir)/config/kernel-blk-requeue-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-pos.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get.m4 \ ++ $(top_srcdir)/config/kernel-check-disk-size-change.m4 \ ++ $(top_srcdir)/config/kernel-clear-inode.m4 \ ++ $(top_srcdir)/config/kernel-commit-metadata.m4 \ ++ $(top_srcdir)/config/kernel-create-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-d-make-root.m4 \ ++ $(top_srcdir)/config/kernel-d-obtain-alias.m4 \ ++ $(top_srcdir)/config/kernel-discard-granularity.m4 \ ++ $(top_srcdir)/config/kernel-elevator-change.m4 \ ++ $(top_srcdir)/config/kernel-encode-fh-inode.m4 \ ++ $(top_srcdir)/config/kernel-evict-inode.m4 \ ++ $(top_srcdir)/config/kernel-fallocate.m4 \ ++ $(top_srcdir)/config/kernel-fmode-t.m4 \ ++ $(top_srcdir)/config/kernel-fsync.m4 \ ++ $(top_srcdir)/config/kernel-get-disk-ro.m4 \ ++ 
$(top_srcdir)/config/kernel-get-gendisk.m4 \ ++ $(top_srcdir)/config/kernel-insert-inode-locked.m4 \ ++ $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \ ++ $(top_srcdir)/config/kernel-kobj-name-len.m4 \ ++ $(top_srcdir)/config/kernel-lookup-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \ ++ $(top_srcdir)/config/kernel-mount-nodev.m4 \ ++ $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \ ++ $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \ ++ $(top_srcdir)/config/kernel-rq-is_sync.m4 \ ++ $(top_srcdir)/config/kernel-security-inode-init.m4 \ ++ $(top_srcdir)/config/kernel-set-nlink.m4 \ ++ $(top_srcdir)/config/kernel-sget-args.m4 \ ++ $(top_srcdir)/config/kernel-show-options.m4 \ ++ $(top_srcdir)/config/kernel-shrink.m4 \ ++ $(top_srcdir)/config/kernel-truncate-range.m4 \ ++ $(top_srcdir)/config/kernel-truncate-setsize.m4 \ ++ $(top_srcdir)/config/kernel-xattr-handler.m4 \ ++ $(top_srcdir)/config/kernel.m4 \ ++ $(top_srcdir)/config/user-arch.m4 \ ++ $(top_srcdir)/config/user-frame-larger-than.m4 \ ++ $(top_srcdir)/config/user-ioctl.m4 \ ++ $(top_srcdir)/config/user-libblkid.m4 \ ++ $(top_srcdir)/config/user-libuuid.m4 \ ++ $(top_srcdir)/config/user-nptl_guard_within_stack.m4 \ ++ $(top_srcdir)/config/user-selinux.m4 \ ++ $(top_srcdir)/config/user-udev.m4 \ ++ $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \ ++ $(top_srcdir)/config/zfs-build.m4 \ ++ $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/zfs_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_$(V)) ++am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_$(V)) ++am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++am__kernel_HEADERS_DIST = $(top_srcdir)/include/sys/fm/fs/zfs.h ++am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; ++am__vpath_adj = case $$p in \ ++ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ ++ *) f=$$p;; \ ++ esac; ++am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; ++am__install_max = 40 ++am__nobase_strip_setup = \ ++ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` ++am__nobase_strip = \ ++ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" ++am__nobase_list = $(am__nobase_strip_setup); \ ++ for p in $$list; do echo "$$p $$p"; done | \ ++ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ ++ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ ++ if (++n[$$2] == $(am__install_max)) \ ++ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ ++ END { for (dir in files) print dir, files[dir] }' ++am__base_list = \ ++ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ ++ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' ++am__uninstall_files_from_dir = { \ ++ test -z "$$files" \ ++ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! 
-r "$$dir"; } \ ++ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ ++ $(am__cd) "$$dir" && rm -f $$files; }; \ ++ } ++am__installdirs = "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)" ++am__libzfs_HEADERS_DIST = $(top_srcdir)/include/sys/fm/fs/zfs.h ++HEADERS = $(kernel_HEADERS) $(libzfs_HEADERS) ++ETAGS = etags ++CTAGS = ctags ++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++ACLOCAL = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run aclocal-1.11 ++ALIEN = alien ++ALIEN_VERSION = ++AMTAR = $${TAR-tar} ++AM_DEFAULT_VERBOSITY = 1 ++AR = ar ++AUTOCONF = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run autoconf ++AUTOHEADER = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run autoheader ++AUTOMAKE = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run automake-1.11 ++AWK = gawk ++CC = gcc ++CCAS = gcc ++CCASDEPMODE = depmode=gcc3 ++CCASFLAGS = -g -O2 ++CCDEPMODE = depmode=gcc3 ++CFLAGS = -g -O2 ++CPP = gcc -E ++CPPFLAGS = ++CYGPATH_W = echo ++DEBUG_CFLAGS = -DNDEBUG ++DEBUG_DMU_TX = _without_debug_dmu_tx ++DEBUG_STACKFLAGS = ++DEBUG_ZFS = _without_debug ++DEFAULT_INIT_DIR = ${prefix}/etc/init.d ++DEFAULT_INIT_SCRIPT = gentoo ++DEFAULT_PACKAGE = tgz ++DEFS = -DHAVE_CONFIG_H ++DEPDIR = .deps ++DLLTOOL = false ++DPKG = dpkg ++DPKGBUILD = dpkg-buildpackage ++DPKGBUILD_VERSION = ++DPKG_VERSION = ++DSYMUTIL = ++DUMPBIN = ++ECHO_C = ++ECHO_N = -n ++ECHO_T = ++EGREP = /bin/grep -E ++EXEEXT = ++FGREP = /bin/grep -F ++FRAME_LARGER_THAN = -Wframe-larger-than=1024 ++GREP = /bin/grep ++HAVE_ALIEN = no ++HAVE_DPKG = no ++HAVE_DPKGBUILD = no ++HAVE_MAKEPKG = ++HAVE_PACMAN = ++HAVE_RPM = yes ++HAVE_RPMBUILD = yes ++INSTALL = /usr/bin/install -c ++INSTALL_DATA = ${INSTALL} -m 644 ++INSTALL_PROGRAM = ${INSTALL} ++INSTALL_SCRIPT = ${INSTALL} ++INSTALL_STRIP_PROGRAM = $(install_sh) -c -s ++KERNELCPPFLAGS = -Wno-unused-but-set-variable -DHAVE_SPL -D_KERNEL -DTEXT_DOMAIN=\"zfs-linux-kernel\" -DNDEBUG ++KERNELMAKE_PARAMS = O=/usr/src/linux-3.6.0-sabayon ++LD = /usr/x86_64-pc-linux-gnu/bin/ld -m elf_x86_64 ++LDFLAGS = ++LIBBLKID = ++LIBOBJS = ++LIBS = -luuid -luuid -lz -lz -lz ++LIBSELINUX = ++LIBTOOL = $(SHELL) $(top_builddir)/libtool ++LIBUUID = -luuid ++LINUX = /usr/src/linux-3.2.33-go ++LINUX_OBJ = /usr/src/linux-3.6.0-sabayon ++LINUX_SYMBOLS = NONE ++LINUX_VERSION = 3.6.0-sabayon ++LIPO = ++LN_S = ln -s ++LTLIBOBJS = ++MAINT = # ++MAKEINFO = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run makeinfo ++MAKEPKG = ++MAKEPKG_VERSION = ++MANIFEST_TOOL = : ++MKDIR_P = /bin/mkdir -p ++NM = /usr/bin/nm -B ++NMEDIT = ++NO_UNUSED_BUT_SET_VARIABLE = -Wno-unused-but-set-variable ++OBJDUMP = objdump ++OBJEXT = o ++OTOOL = ++OTOOL64 = ++PACKAGE = zfs ++PACKAGE_BUGREPORT = ++PACKAGE_NAME = ++PACKAGE_STRING = ++PACKAGE_TARNAME = ++PACKAGE_URL = ++PACKAGE_VERSION = ++PACMAN = ++PACMAN_VERSION = ++PATH_SEPARATOR = : ++RANLIB = ranlib ++RPM = rpm ++RPMBUILD = rpmbuild ++RPMBUILD_VERSION = 4.10.0 ++RPM_VERSION = 4.10.0 ++SED = /bin/sed ++SET_MAKE = ++SHELL = /bin/sh ++SPL = /usr/src/linux-3.2.33-go ++SPL_OBJ = /usr/src/linux-3.2.33-go ++SPL_SYMBOLS = NONE ++SPL_VERSION = 0.6.0-rc12 ++STRIP = strip ++TARGET_ASM_DIR = asm-x86_64 ++VENDOR = gentoo ++VERSION = 0.6.0 ++ZFS_CONFIG = all ++ZFS_META_ALIAS = zfs-0.6.0-rc12 ++ZFS_META_AUTHOR = Sun Microsystems/Oracle, Lawrence Livermore National Laboratory ++ZFS_META_DATA = ++ZFS_META_LICENSE = CDDL ++ZFS_META_LT_AGE = ++ZFS_META_LT_CURRENT = ++ZFS_META_LT_REVISION = ++ZFS_META_NAME = zfs ++ZFS_META_RELEASE = rc12 ++ZFS_META_VERSION = 0.6.0 ++ZLIB = -lz 
++abs_builddir = /root/zfs-0.6.0-rc12/include/sys/fm/fs ++abs_srcdir = /root/zfs-0.6.0-rc12/include/sys/fm/fs ++abs_top_builddir = /root/zfs-0.6.0-rc12 ++abs_top_srcdir = /root/zfs-0.6.0-rc12 ++ac_ct_AR = ar ++ac_ct_CC = gcc ++ac_ct_DUMPBIN = ++am__include = include ++am__leading_dot = . ++am__quote = ++am__tar = $${TAR-tar} chof - "$$tardir" ++am__untar = $${TAR-tar} xf - ++bindir = ${exec_prefix}/bin ++build = x86_64-unknown-linux-gnu ++build_alias = ++build_cpu = x86_64 ++build_os = linux-gnu ++build_vendor = unknown ++builddir = . ++datadir = ${datarootdir} ++datarootdir = ${prefix}/share ++docdir = ${datarootdir}/doc/${PACKAGE} ++dvidir = ${docdir} ++exec_prefix = ${prefix} ++host = x86_64-unknown-linux-gnu ++host_alias = ++host_cpu = x86_64 ++host_os = linux-gnu ++host_vendor = unknown ++htmldir = ${docdir} ++includedir = ${prefix}/include ++infodir = ${datarootdir}/info ++install_sh = ${SHELL} /root/zfs-0.6.0-rc12/config/install-sh ++libdir = ${exec_prefix}/lib ++libexecdir = ${exec_prefix}/libexec ++localedir = ${datarootdir}/locale ++localstatedir = ${prefix}/var ++mandir = ${datarootdir}/man ++mkdir_p = /bin/mkdir -p ++oldincludedir = /usr/include ++pdfdir = ${docdir} ++prefix = /usr/local ++program_transform_name = s,x,x, ++psdir = ${docdir} ++sbindir = ${exec_prefix}/sbin ++sharedstatedir = ${prefix}/com ++srcdir = . ++sysconfdir = ${prefix}/etc ++target = x86_64-unknown-linux-gnu ++target_alias = ++target_cpu = x86_64 ++target_os = linux-gnu ++target_vendor = unknown ++top_build_prefix = ../../../../ ++top_builddir = ../../../.. ++top_srcdir = ../../../.. ++udevdir = ${exec_prefix}/lib/udev ++udevruledir = ${udevdir}/rules.d ++COMMON_H = \ ++ $(top_srcdir)/include/sys/fm/fs/zfs.h ++ ++KERNEL_H = ++USER_H = ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++libzfsdir = $(includedir)/libzfs/sys/fm/fs ++libzfs_HEADERS = $(COMMON_H) $(USER_H) ++#kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/sys/fm/fs ++#kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++all: all-am ++ ++.SUFFIXES: ++$(srcdir)/Makefile.in: # $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/sys/fm/fs/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/sys/fm/fs/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' 
in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: # $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): # $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++install-kernelHEADERS: $(kernel_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(kerneldir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(kerneldir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(kerneldir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(kerneldir)" || exit $$?; \ ++ done ++ ++uninstall-kernelHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(kerneldir)'; $(am__uninstall_files_from_dir) ++install-libzfsHEADERS: $(libzfs_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(libzfsdir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(libzfsdir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libzfsdir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(libzfsdir)" || exit $$?; \ ++ done ++ ++uninstall-libzfsHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(libzfsdir)'; $(am__uninstall_files_from_dir) ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: CTAGS ++CTAGS: $(HEADERS) $(SOURCES) 
$(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-am ++all-am: Makefile $(HEADERS) ++installdirs: ++ for dir in "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)"; do \ ++ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ ++ done ++install: install-am ++install-exec: install-exec-am ++install-data: install-data-am ++uninstall: uninstall-am ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-am ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." 
++clean: clean-am ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-am ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-am ++ ++dvi-am: ++ ++html: html-am ++ ++html-am: ++ ++info: info-am ++ ++info-am: ++ ++install-data-am: install-kernelHEADERS install-libzfsHEADERS ++ ++install-dvi: install-dvi-am ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-am ++ ++install-html-am: ++ ++install-info: install-info-am ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-am ++ ++install-pdf-am: ++ ++install-ps: install-ps-am ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-am ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-am ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-am ++ ++pdf-am: ++ ++ps: ps-am ++ ++ps-am: ++ ++uninstall-am: uninstall-kernelHEADERS uninstall-libzfsHEADERS ++ ++.MAKE: install-am install-strip ++ ++.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ ++ clean-libtool ctags distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-dvi install-dvi-am install-exec \ ++ install-exec-am install-html install-html-am install-info \ ++ install-info-am install-kernelHEADERS install-libzfsHEADERS \ ++ install-man install-pdf install-pdf-am install-ps \ ++ install-ps-am install-strip installcheck installcheck-am \ ++ installdirs maintainer-clean maintainer-clean-generic \ ++ mostlyclean mostlyclean-generic mostlyclean-libtool pdf pdf-am \ ++ ps ps-am tags uninstall uninstall-am uninstall-kernelHEADERS \ ++ uninstall-libzfsHEADERS ++ ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. ++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/fm/fs/Makefile.am linux-3.2.33-go/include/zfs/sys/fm/fs/Makefile.am +--- linux-3.2.33-go.orig/include/zfs/sys/fm/fs/Makefile.am 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/fm/fs/Makefile.am 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,18 @@ ++COMMON_H = \ ++ $(top_srcdir)/include/sys/fm/fs/zfs.h ++ ++KERNEL_H = ++ ++USER_H = ++ ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++ ++if CONFIG_USER ++libzfsdir = $(includedir)/libzfs/sys/fm/fs ++libzfs_HEADERS = $(COMMON_H) $(USER_H) ++endif ++ ++if CONFIG_KERNEL ++kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/sys/fm/fs ++kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++endif +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/fm/fs/Makefile.in linux-3.2.33-go/include/zfs/sys/fm/fs/Makefile.in +--- linux-3.2.33-go.orig/include/zfs/sys/fm/fs/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/fm/fs/Makefile.in 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,659 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. ++# @configure_input@ ++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. 
++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. ++ ++@SET_MAKE@ ++ ++VPATH = @srcdir@ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/@PACKAGE@ ++pkgincludedir = $(includedir)/@PACKAGE@ ++pkglibdir = $(libdir)/@PACKAGE@ ++pkglibexecdir = $(libexecdir)/@PACKAGE@ ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = @build@ ++host_triplet = @host@ ++target_triplet = @target@ ++subdir = include/sys/fm/fs ++DIST_COMMON = $(am__kernel_HEADERS_DIST) $(am__libzfs_HEADERS_DIST) \ ++ $(srcdir)/Makefile.am $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = \ ++ $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \ ++ $(top_srcdir)/config/kernel-automount.m4 \ ++ $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \ ++ $(top_srcdir)/config/kernel-bdev-logical-size.m4 \ ++ $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \ ++ $(top_srcdir)/config/kernel-bdi.m4 \ ++ $(top_srcdir)/config/kernel-bio-empty-barrier.m4 \ ++ $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \ ++ $(top_srcdir)/config/kernel-bio-failfast.m4 \ ++ $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \ ++ $(top_srcdir)/config/kernel-blk-end-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-fetch-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-discard.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-flush.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \ ++ $(top_srcdir)/config/kernel-blk-requeue-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-pos.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get.m4 \ ++ $(top_srcdir)/config/kernel-check-disk-size-change.m4 \ ++ $(top_srcdir)/config/kernel-clear-inode.m4 \ ++ $(top_srcdir)/config/kernel-commit-metadata.m4 \ ++ $(top_srcdir)/config/kernel-create-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-d-make-root.m4 \ ++ $(top_srcdir)/config/kernel-d-obtain-alias.m4 \ ++ $(top_srcdir)/config/kernel-discard-granularity.m4 \ ++ $(top_srcdir)/config/kernel-elevator-change.m4 \ ++ $(top_srcdir)/config/kernel-encode-fh-inode.m4 \ ++ $(top_srcdir)/config/kernel-evict-inode.m4 \ ++ $(top_srcdir)/config/kernel-fallocate.m4 \ ++ $(top_srcdir)/config/kernel-fmode-t.m4 \ ++ $(top_srcdir)/config/kernel-fsync.m4 \ ++ $(top_srcdir)/config/kernel-get-disk-ro.m4 \ ++ 
$(top_srcdir)/config/kernel-get-gendisk.m4 \ ++ $(top_srcdir)/config/kernel-insert-inode-locked.m4 \ ++ $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \ ++ $(top_srcdir)/config/kernel-kobj-name-len.m4 \ ++ $(top_srcdir)/config/kernel-lookup-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \ ++ $(top_srcdir)/config/kernel-mount-nodev.m4 \ ++ $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \ ++ $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \ ++ $(top_srcdir)/config/kernel-rq-is_sync.m4 \ ++ $(top_srcdir)/config/kernel-security-inode-init.m4 \ ++ $(top_srcdir)/config/kernel-set-nlink.m4 \ ++ $(top_srcdir)/config/kernel-sget-args.m4 \ ++ $(top_srcdir)/config/kernel-show-options.m4 \ ++ $(top_srcdir)/config/kernel-shrink.m4 \ ++ $(top_srcdir)/config/kernel-truncate-range.m4 \ ++ $(top_srcdir)/config/kernel-truncate-setsize.m4 \ ++ $(top_srcdir)/config/kernel-xattr-handler.m4 \ ++ $(top_srcdir)/config/kernel.m4 \ ++ $(top_srcdir)/config/user-arch.m4 \ ++ $(top_srcdir)/config/user-frame-larger-than.m4 \ ++ $(top_srcdir)/config/user-ioctl.m4 \ ++ $(top_srcdir)/config/user-libblkid.m4 \ ++ $(top_srcdir)/config/user-libuuid.m4 \ ++ $(top_srcdir)/config/user-nptl_guard_within_stack.m4 \ ++ $(top_srcdir)/config/user-selinux.m4 \ ++ $(top_srcdir)/config/user-udev.m4 \ ++ $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \ ++ $(top_srcdir)/config/zfs-build.m4 \ ++ $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/zfs_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_@AM_V@) ++am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_@AM_V@) ++am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++am__kernel_HEADERS_DIST = $(top_srcdir)/include/sys/fm/fs/zfs.h ++am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; ++am__vpath_adj = case $$p in \ ++ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ ++ *) f=$$p;; \ ++ esac; ++am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; ++am__install_max = 40 ++am__nobase_strip_setup = \ ++ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` ++am__nobase_strip = \ ++ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" ++am__nobase_list = $(am__nobase_strip_setup); \ ++ for p in $$list; do echo "$$p $$p"; done | \ ++ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ ++ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ ++ if (++n[$$2] == $(am__install_max)) \ ++ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ ++ END { for (dir in files) print dir, files[dir] }' ++am__base_list = \ ++ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ ++ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' ++am__uninstall_files_from_dir = { \ ++ test -z "$$files" \ ++ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! 
-r "$$dir"; } \ ++ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ ++ $(am__cd) "$$dir" && rm -f $$files; }; \ ++ } ++am__installdirs = "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)" ++am__libzfs_HEADERS_DIST = $(top_srcdir)/include/sys/fm/fs/zfs.h ++HEADERS = $(kernel_HEADERS) $(libzfs_HEADERS) ++ETAGS = etags ++CTAGS = ctags ++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++ACLOCAL = @ACLOCAL@ ++ALIEN = @ALIEN@ ++ALIEN_VERSION = @ALIEN_VERSION@ ++AMTAR = @AMTAR@ ++AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ++AR = @AR@ ++AUTOCONF = @AUTOCONF@ ++AUTOHEADER = @AUTOHEADER@ ++AUTOMAKE = @AUTOMAKE@ ++AWK = @AWK@ ++CC = @CC@ ++CCAS = @CCAS@ ++CCASDEPMODE = @CCASDEPMODE@ ++CCASFLAGS = @CCASFLAGS@ ++CCDEPMODE = @CCDEPMODE@ ++CFLAGS = @CFLAGS@ ++CPP = @CPP@ ++CPPFLAGS = @CPPFLAGS@ ++CYGPATH_W = @CYGPATH_W@ ++DEBUG_CFLAGS = @DEBUG_CFLAGS@ ++DEBUG_DMU_TX = @DEBUG_DMU_TX@ ++DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@ ++DEBUG_ZFS = @DEBUG_ZFS@ ++DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@ ++DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@ ++DEFAULT_PACKAGE = @DEFAULT_PACKAGE@ ++DEFS = @DEFS@ ++DEPDIR = @DEPDIR@ ++DLLTOOL = @DLLTOOL@ ++DPKG = @DPKG@ ++DPKGBUILD = @DPKGBUILD@ ++DPKGBUILD_VERSION = @DPKGBUILD_VERSION@ ++DPKG_VERSION = @DPKG_VERSION@ ++DSYMUTIL = @DSYMUTIL@ ++DUMPBIN = @DUMPBIN@ ++ECHO_C = @ECHO_C@ ++ECHO_N = @ECHO_N@ ++ECHO_T = @ECHO_T@ ++EGREP = @EGREP@ ++EXEEXT = @EXEEXT@ ++FGREP = @FGREP@ ++FRAME_LARGER_THAN = @FRAME_LARGER_THAN@ ++GREP = @GREP@ ++HAVE_ALIEN = @HAVE_ALIEN@ ++HAVE_DPKG = @HAVE_DPKG@ ++HAVE_DPKGBUILD = @HAVE_DPKGBUILD@ ++HAVE_MAKEPKG = @HAVE_MAKEPKG@ ++HAVE_PACMAN = @HAVE_PACMAN@ ++HAVE_RPM = @HAVE_RPM@ ++HAVE_RPMBUILD = @HAVE_RPMBUILD@ ++INSTALL = @INSTALL@ ++INSTALL_DATA = @INSTALL_DATA@ ++INSTALL_PROGRAM = @INSTALL_PROGRAM@ ++INSTALL_SCRIPT = @INSTALL_SCRIPT@ ++INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ ++KERNELCPPFLAGS = @KERNELCPPFLAGS@ ++KERNELMAKE_PARAMS = @KERNELMAKE_PARAMS@ ++LD = @LD@ ++LDFLAGS = @LDFLAGS@ ++LIBBLKID = @LIBBLKID@ ++LIBOBJS = @LIBOBJS@ ++LIBS = @LIBS@ ++LIBSELINUX = @LIBSELINUX@ ++LIBTOOL = @LIBTOOL@ ++LIBUUID = @LIBUUID@ ++LINUX = @LINUX@ ++LINUX_OBJ = @LINUX_OBJ@ ++LINUX_SYMBOLS = @LINUX_SYMBOLS@ ++LINUX_VERSION = @LINUX_VERSION@ ++LIPO = @LIPO@ ++LN_S = @LN_S@ ++LTLIBOBJS = @LTLIBOBJS@ ++MAINT = @MAINT@ ++MAKEINFO = @MAKEINFO@ ++MAKEPKG = @MAKEPKG@ ++MAKEPKG_VERSION = @MAKEPKG_VERSION@ ++MANIFEST_TOOL = @MANIFEST_TOOL@ ++MKDIR_P = @MKDIR_P@ ++NM = @NM@ ++NMEDIT = @NMEDIT@ ++NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@ ++OBJDUMP = @OBJDUMP@ ++OBJEXT = @OBJEXT@ ++OTOOL = @OTOOL@ ++OTOOL64 = @OTOOL64@ ++PACKAGE = @PACKAGE@ ++PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ ++PACKAGE_NAME = @PACKAGE_NAME@ ++PACKAGE_STRING = @PACKAGE_STRING@ ++PACKAGE_TARNAME = @PACKAGE_TARNAME@ ++PACKAGE_URL = @PACKAGE_URL@ ++PACKAGE_VERSION = @PACKAGE_VERSION@ ++PACMAN = @PACMAN@ ++PACMAN_VERSION = @PACMAN_VERSION@ ++PATH_SEPARATOR = @PATH_SEPARATOR@ ++RANLIB = @RANLIB@ ++RPM = @RPM@ ++RPMBUILD = @RPMBUILD@ ++RPMBUILD_VERSION = @RPMBUILD_VERSION@ ++RPM_VERSION = @RPM_VERSION@ ++SED = @SED@ ++SET_MAKE = @SET_MAKE@ ++SHELL = @SHELL@ ++SPL = @SPL@ ++SPL_OBJ = @SPL_OBJ@ ++SPL_SYMBOLS = @SPL_SYMBOLS@ ++SPL_VERSION = @SPL_VERSION@ ++STRIP = @STRIP@ ++TARGET_ASM_DIR = @TARGET_ASM_DIR@ ++VENDOR = @VENDOR@ ++VERSION = @VERSION@ ++ZFS_CONFIG = @ZFS_CONFIG@ ++ZFS_META_ALIAS = @ZFS_META_ALIAS@ ++ZFS_META_AUTHOR = @ZFS_META_AUTHOR@ ++ZFS_META_DATA = @ZFS_META_DATA@ ++ZFS_META_LICENSE = @ZFS_META_LICENSE@ ++ZFS_META_LT_AGE = @ZFS_META_LT_AGE@ 
++ZFS_META_LT_CURRENT = @ZFS_META_LT_CURRENT@ ++ZFS_META_LT_REVISION = @ZFS_META_LT_REVISION@ ++ZFS_META_NAME = @ZFS_META_NAME@ ++ZFS_META_RELEASE = @ZFS_META_RELEASE@ ++ZFS_META_VERSION = @ZFS_META_VERSION@ ++ZLIB = @ZLIB@ ++abs_builddir = @abs_builddir@ ++abs_srcdir = @abs_srcdir@ ++abs_top_builddir = @abs_top_builddir@ ++abs_top_srcdir = @abs_top_srcdir@ ++ac_ct_AR = @ac_ct_AR@ ++ac_ct_CC = @ac_ct_CC@ ++ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ ++am__include = @am__include@ ++am__leading_dot = @am__leading_dot@ ++am__quote = @am__quote@ ++am__tar = @am__tar@ ++am__untar = @am__untar@ ++bindir = @bindir@ ++build = @build@ ++build_alias = @build_alias@ ++build_cpu = @build_cpu@ ++build_os = @build_os@ ++build_vendor = @build_vendor@ ++builddir = @builddir@ ++datadir = @datadir@ ++datarootdir = @datarootdir@ ++docdir = @docdir@ ++dvidir = @dvidir@ ++exec_prefix = @exec_prefix@ ++host = @host@ ++host_alias = @host_alias@ ++host_cpu = @host_cpu@ ++host_os = @host_os@ ++host_vendor = @host_vendor@ ++htmldir = @htmldir@ ++includedir = @includedir@ ++infodir = @infodir@ ++install_sh = @install_sh@ ++libdir = @libdir@ ++libexecdir = @libexecdir@ ++localedir = @localedir@ ++localstatedir = @localstatedir@ ++mandir = @mandir@ ++mkdir_p = @mkdir_p@ ++oldincludedir = @oldincludedir@ ++pdfdir = @pdfdir@ ++prefix = @prefix@ ++program_transform_name = @program_transform_name@ ++psdir = @psdir@ ++sbindir = @sbindir@ ++sharedstatedir = @sharedstatedir@ ++srcdir = @srcdir@ ++sysconfdir = @sysconfdir@ ++target = @target@ ++target_alias = @target_alias@ ++target_cpu = @target_cpu@ ++target_os = @target_os@ ++target_vendor = @target_vendor@ ++top_build_prefix = @top_build_prefix@ ++top_builddir = @top_builddir@ ++top_srcdir = @top_srcdir@ ++udevdir = @udevdir@ ++udevruledir = @udevruledir@ ++COMMON_H = \ ++ $(top_srcdir)/include/sys/fm/fs/zfs.h ++ ++KERNEL_H = ++USER_H = ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++@CONFIG_USER_TRUE@libzfsdir = $(includedir)/libzfs/sys/fm/fs ++@CONFIG_USER_TRUE@libzfs_HEADERS = $(COMMON_H) $(USER_H) ++@CONFIG_KERNEL_TRUE@kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/sys/fm/fs ++@CONFIG_KERNEL_TRUE@kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++all: all-am ++ ++.SUFFIXES: ++$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/sys/fm/fs/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/sys/fm/fs/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' 
in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++install-kernelHEADERS: $(kernel_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(kerneldir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(kerneldir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(kerneldir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(kerneldir)" || exit $$?; \ ++ done ++ ++uninstall-kernelHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(kerneldir)'; $(am__uninstall_files_from_dir) ++install-libzfsHEADERS: $(libzfs_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(libzfsdir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(libzfsdir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libzfsdir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(libzfsdir)" || exit $$?; \ ++ done ++ ++uninstall-libzfsHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(libzfsdir)'; $(am__uninstall_files_from_dir) ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: 
CTAGS ++CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-am ++all-am: Makefile $(HEADERS) ++installdirs: ++ for dir in "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)"; do \ ++ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ ++ done ++install: install-am ++install-exec: install-exec-am ++install-data: install-data-am ++uninstall: uninstall-am ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-am ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." 
++clean: clean-am ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-am ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-am ++ ++dvi-am: ++ ++html: html-am ++ ++html-am: ++ ++info: info-am ++ ++info-am: ++ ++install-data-am: install-kernelHEADERS install-libzfsHEADERS ++ ++install-dvi: install-dvi-am ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-am ++ ++install-html-am: ++ ++install-info: install-info-am ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-am ++ ++install-pdf-am: ++ ++install-ps: install-ps-am ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-am ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-am ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-am ++ ++pdf-am: ++ ++ps: ps-am ++ ++ps-am: ++ ++uninstall-am: uninstall-kernelHEADERS uninstall-libzfsHEADERS ++ ++.MAKE: install-am install-strip ++ ++.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ ++ clean-libtool ctags distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-dvi install-dvi-am install-exec \ ++ install-exec-am install-html install-html-am install-info \ ++ install-info-am install-kernelHEADERS install-libzfsHEADERS \ ++ install-man install-pdf install-pdf-am install-ps \ ++ install-ps-am install-strip installcheck installcheck-am \ ++ installdirs maintainer-clean maintainer-clean-generic \ ++ mostlyclean mostlyclean-generic mostlyclean-libtool pdf pdf-am \ ++ ps ps-am tags uninstall uninstall-am uninstall-kernelHEADERS \ ++ uninstall-libzfsHEADERS ++ ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. ++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/fm/fs/zfs.h linux-3.2.33-go/include/zfs/sys/fm/fs/zfs.h +--- linux-3.2.33-go.orig/include/zfs/sys/fm/fs/zfs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/fm/fs/zfs.h 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,115 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */ ++ ++#ifndef _SYS_FM_FS_ZFS_H ++#define _SYS_FM_FS_ZFS_H ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#define ZFS_ERROR_CLASS "fs.zfs" ++ ++#define FM_EREPORT_ZFS_CHECKSUM "checksum" ++#define FM_EREPORT_ZFS_IO "io" ++#define FM_EREPORT_ZFS_DATA "data" ++#define FM_EREPORT_ZFS_DELAY "delay" ++#define FM_EREPORT_ZFS_CONFIG_SYNC "config.sync" ++#define FM_EREPORT_ZFS_POOL "zpool" ++#define FM_EREPORT_ZFS_POOL_DESTROY "zpool.destroy" ++#define FM_EREPORT_ZFS_POOL_REGUID "zpool.reguid" ++#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" ++#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" ++#define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data" ++#define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas" ++#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum" ++#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small" ++#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label" ++#define FM_EREPORT_ZFS_DEVICE_REMOVE "vdev.remove" ++#define FM_EREPORT_ZFS_DEVICE_CLEAR "vdev.clear" ++#define FM_EREPORT_ZFS_DEVICE_CHECK "vdev.check" ++#define FM_EREPORT_ZFS_DEVICE_SPARE "vdev.spare" ++#define FM_EREPORT_ZFS_DEVICE_AUTOEXPAND "vdev.autoexpand" ++#define FM_EREPORT_ZFS_IO_FAILURE "io_failure" ++#define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure" ++#define FM_EREPORT_ZFS_LOG_REPLAY "log_replay" ++#define FM_EREPORT_ZFS_RESILVER_START "resilver.start" ++#define FM_EREPORT_ZFS_RESILVER_FINISH "resilver.finish" ++#define FM_EREPORT_ZFS_SCRUB_START "scrub.start" ++#define FM_EREPORT_ZFS_SCRUB_FINISH "scrub.finish" ++#define FM_EREPORT_ZFS_BOOTFS_VDEV_ATTACH "bootfs.vdev.attach" ++ ++#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" ++#define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode" ++#define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid" ++#define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context" ++#define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid" ++#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" ++#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" ++#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" ++#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" ++#define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state" ++#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" ++#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" ++#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" ++#define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid" ++#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset" ++#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object" ++#define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level" ++#define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid" ++#define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err" ++#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" ++#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" ++#define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags" ++#define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage" ++#define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline" ++#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay" ++#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" ++#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" ++#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" ++#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm" ++#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap" ++#define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges" ++#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap" ++#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS "bad_range_sets" 
++#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears" ++#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits" ++#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" ++#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram" ++#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram" ++ ++#define FM_EREPORT_FAILMODE_WAIT "wait" ++#define FM_EREPORT_FAILMODE_CONTINUE "continue" ++#define FM_EREPORT_FAILMODE_PANIC "panic" ++ ++#define FM_EREPORT_RESOURCE_REMOVED "removed" ++#define FM_EREPORT_RESOURCE_AUTOREPLACE "autoreplace" ++#define FM_EREPORT_RESOURCE_STATECHANGE "statechange" ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_FM_FS_ZFS_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/fm/Makefile linux-3.2.33-go/include/zfs/sys/fm/Makefile +--- linux-3.2.33-go.orig/include/zfs/sys/fm/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/fm/Makefile 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,813 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. ++# include/sys/fm/Makefile. Generated from Makefile.in by configure. ++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. ++ ++ ++ ++ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/zfs ++pkgincludedir = $(includedir)/zfs ++pkglibdir = $(libdir)/zfs ++pkglibexecdir = $(libexecdir)/zfs ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = x86_64-unknown-linux-gnu ++host_triplet = x86_64-unknown-linux-gnu ++target_triplet = x86_64-unknown-linux-gnu ++subdir = include/sys/fm ++DIST_COMMON = $(am__kernel_HEADERS_DIST) $(am__libzfs_HEADERS_DIST) \ ++ $(srcdir)/Makefile.am $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = \ ++ $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \ ++ $(top_srcdir)/config/kernel-automount.m4 \ ++ $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \ ++ $(top_srcdir)/config/kernel-bdev-logical-size.m4 \ ++ $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \ ++ $(top_srcdir)/config/kernel-bdi.m4 \ ++ $(top_srcdir)/config/kernel-bio-empty-barrier.m4 \ ++ $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \ ++ $(top_srcdir)/config/kernel-bio-failfast.m4 \ ++ $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \ ++ 
$(top_srcdir)/config/kernel-blk-end-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-fetch-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-discard.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-flush.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \ ++ $(top_srcdir)/config/kernel-blk-requeue-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-pos.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get.m4 \ ++ $(top_srcdir)/config/kernel-check-disk-size-change.m4 \ ++ $(top_srcdir)/config/kernel-clear-inode.m4 \ ++ $(top_srcdir)/config/kernel-commit-metadata.m4 \ ++ $(top_srcdir)/config/kernel-create-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-d-make-root.m4 \ ++ $(top_srcdir)/config/kernel-d-obtain-alias.m4 \ ++ $(top_srcdir)/config/kernel-discard-granularity.m4 \ ++ $(top_srcdir)/config/kernel-elevator-change.m4 \ ++ $(top_srcdir)/config/kernel-encode-fh-inode.m4 \ ++ $(top_srcdir)/config/kernel-evict-inode.m4 \ ++ $(top_srcdir)/config/kernel-fallocate.m4 \ ++ $(top_srcdir)/config/kernel-fmode-t.m4 \ ++ $(top_srcdir)/config/kernel-fsync.m4 \ ++ $(top_srcdir)/config/kernel-get-disk-ro.m4 \ ++ $(top_srcdir)/config/kernel-get-gendisk.m4 \ ++ $(top_srcdir)/config/kernel-insert-inode-locked.m4 \ ++ $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \ ++ $(top_srcdir)/config/kernel-kobj-name-len.m4 \ ++ $(top_srcdir)/config/kernel-lookup-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \ ++ $(top_srcdir)/config/kernel-mount-nodev.m4 \ ++ $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \ ++ $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \ ++ $(top_srcdir)/config/kernel-rq-is_sync.m4 \ ++ $(top_srcdir)/config/kernel-security-inode-init.m4 \ ++ $(top_srcdir)/config/kernel-set-nlink.m4 \ ++ $(top_srcdir)/config/kernel-sget-args.m4 \ ++ $(top_srcdir)/config/kernel-show-options.m4 \ ++ $(top_srcdir)/config/kernel-shrink.m4 \ ++ $(top_srcdir)/config/kernel-truncate-range.m4 \ ++ $(top_srcdir)/config/kernel-truncate-setsize.m4 \ ++ $(top_srcdir)/config/kernel-xattr-handler.m4 \ ++ $(top_srcdir)/config/kernel.m4 \ ++ $(top_srcdir)/config/user-arch.m4 \ ++ $(top_srcdir)/config/user-frame-larger-than.m4 \ ++ $(top_srcdir)/config/user-ioctl.m4 \ ++ $(top_srcdir)/config/user-libblkid.m4 \ ++ $(top_srcdir)/config/user-libuuid.m4 \ ++ $(top_srcdir)/config/user-nptl_guard_within_stack.m4 \ ++ $(top_srcdir)/config/user-selinux.m4 \ ++ $(top_srcdir)/config/user-udev.m4 \ ++ $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \ ++ $(top_srcdir)/config/zfs-build.m4 \ ++ $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/zfs_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_$(V)) ++am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_$(V)) ++am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ ++ html-recursive info-recursive 
install-data-recursive \ ++ install-dvi-recursive install-exec-recursive \ ++ install-html-recursive install-info-recursive \ ++ install-pdf-recursive install-ps-recursive install-recursive \ ++ installcheck-recursive installdirs-recursive pdf-recursive \ ++ ps-recursive uninstall-recursive ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++am__kernel_HEADERS_DIST = $(top_srcdir)/include/sys/fm/protocol.h \ ++ $(top_srcdir)/include/sys/fm/util.h ++am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; ++am__vpath_adj = case $$p in \ ++ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ ++ *) f=$$p;; \ ++ esac; ++am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; ++am__install_max = 40 ++am__nobase_strip_setup = \ ++ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` ++am__nobase_strip = \ ++ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" ++am__nobase_list = $(am__nobase_strip_setup); \ ++ for p in $$list; do echo "$$p $$p"; done | \ ++ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ ++ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ ++ if (++n[$$2] == $(am__install_max)) \ ++ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ ++ END { for (dir in files) print dir, files[dir] }' ++am__base_list = \ ++ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ ++ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' ++am__uninstall_files_from_dir = { \ ++ test -z "$$files" \ ++ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ ++ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ ++ $(am__cd) "$$dir" && rm -f $$files; }; \ ++ } ++am__installdirs = "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)" ++am__libzfs_HEADERS_DIST = $(top_srcdir)/include/sys/fm/protocol.h \ ++ $(top_srcdir)/include/sys/fm/util.h ++HEADERS = $(kernel_HEADERS) $(libzfs_HEADERS) ++RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ ++ distclean-recursive maintainer-clean-recursive ++AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \ ++ $(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \ ++ distdir ++ETAGS = etags ++CTAGS = ctags ++DIST_SUBDIRS = $(SUBDIRS) ++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++am__relativize = \ ++ dir0=`pwd`; \ ++ sed_first='s,^\([^/]*\)/.*$$,\1,'; \ ++ sed_rest='s,^[^/]*/*,,'; \ ++ sed_last='s,^.*/\([^/]*\)$$,\1,'; \ ++ sed_butlast='s,/*[^/]*$$,,'; \ ++ while test -n "$$dir1"; do \ ++ first=`echo "$$dir1" | sed -e "$$sed_first"`; \ ++ if test "$$first" != "."; then \ ++ if test "$$first" = ".."; then \ ++ dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ ++ dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ ++ else \ ++ first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ ++ if test "$$first2" = "$$first"; then \ ++ dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ ++ else \ ++ dir2="../$$dir2"; \ ++ fi; \ ++ dir0="$$dir0"/"$$first"; \ ++ fi; \ ++ fi; \ ++ dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ ++ done; \ ++ reldir="$$dir2" ++ACLOCAL = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run aclocal-1.11 ++ALIEN = alien ++ALIEN_VERSION = ++AMTAR = $${TAR-tar} ++AM_DEFAULT_VERBOSITY = 1 ++AR = ar ++AUTOCONF = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run autoconf ++AUTOHEADER = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run autoheader ++AUTOMAKE = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run automake-1.11 ++AWK = gawk ++CC = 
gcc ++CCAS = gcc ++CCASDEPMODE = depmode=gcc3 ++CCASFLAGS = -g -O2 ++CCDEPMODE = depmode=gcc3 ++CFLAGS = -g -O2 ++CPP = gcc -E ++CPPFLAGS = ++CYGPATH_W = echo ++DEBUG_CFLAGS = -DNDEBUG ++DEBUG_DMU_TX = _without_debug_dmu_tx ++DEBUG_STACKFLAGS = ++DEBUG_ZFS = _without_debug ++DEFAULT_INIT_DIR = ${prefix}/etc/init.d ++DEFAULT_INIT_SCRIPT = gentoo ++DEFAULT_PACKAGE = tgz ++DEFS = -DHAVE_CONFIG_H ++DEPDIR = .deps ++DLLTOOL = false ++DPKG = dpkg ++DPKGBUILD = dpkg-buildpackage ++DPKGBUILD_VERSION = ++DPKG_VERSION = ++DSYMUTIL = ++DUMPBIN = ++ECHO_C = ++ECHO_N = -n ++ECHO_T = ++EGREP = /bin/grep -E ++EXEEXT = ++FGREP = /bin/grep -F ++FRAME_LARGER_THAN = -Wframe-larger-than=1024 ++GREP = /bin/grep ++HAVE_ALIEN = no ++HAVE_DPKG = no ++HAVE_DPKGBUILD = no ++HAVE_MAKEPKG = ++HAVE_PACMAN = ++HAVE_RPM = yes ++HAVE_RPMBUILD = yes ++INSTALL = /usr/bin/install -c ++INSTALL_DATA = ${INSTALL} -m 644 ++INSTALL_PROGRAM = ${INSTALL} ++INSTALL_SCRIPT = ${INSTALL} ++INSTALL_STRIP_PROGRAM = $(install_sh) -c -s ++KERNELCPPFLAGS = -Wno-unused-but-set-variable -DHAVE_SPL -D_KERNEL -DTEXT_DOMAIN=\"zfs-linux-kernel\" -DNDEBUG ++KERNELMAKE_PARAMS = O=/usr/src/linux-3.6.0-sabayon ++LD = /usr/x86_64-pc-linux-gnu/bin/ld -m elf_x86_64 ++LDFLAGS = ++LIBBLKID = ++LIBOBJS = ++LIBS = -luuid -luuid -lz -lz -lz ++LIBSELINUX = ++LIBTOOL = $(SHELL) $(top_builddir)/libtool ++LIBUUID = -luuid ++LINUX = /usr/src/linux-3.2.33-go ++LINUX_OBJ = /usr/src/linux-3.6.0-sabayon ++LINUX_SYMBOLS = NONE ++LINUX_VERSION = 3.6.0-sabayon ++LIPO = ++LN_S = ln -s ++LTLIBOBJS = ++MAINT = # ++MAKEINFO = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run makeinfo ++MAKEPKG = ++MAKEPKG_VERSION = ++MANIFEST_TOOL = : ++MKDIR_P = /bin/mkdir -p ++NM = /usr/bin/nm -B ++NMEDIT = ++NO_UNUSED_BUT_SET_VARIABLE = -Wno-unused-but-set-variable ++OBJDUMP = objdump ++OBJEXT = o ++OTOOL = ++OTOOL64 = ++PACKAGE = zfs ++PACKAGE_BUGREPORT = ++PACKAGE_NAME = ++PACKAGE_STRING = ++PACKAGE_TARNAME = ++PACKAGE_URL = ++PACKAGE_VERSION = ++PACMAN = ++PACMAN_VERSION = ++PATH_SEPARATOR = : ++RANLIB = ranlib ++RPM = rpm ++RPMBUILD = rpmbuild ++RPMBUILD_VERSION = 4.10.0 ++RPM_VERSION = 4.10.0 ++SED = /bin/sed ++SET_MAKE = ++SHELL = /bin/sh ++SPL = /usr/src/linux-3.2.33-go ++SPL_OBJ = /usr/src/linux-3.2.33-go ++SPL_SYMBOLS = NONE ++SPL_VERSION = 0.6.0-rc12 ++STRIP = strip ++TARGET_ASM_DIR = asm-x86_64 ++VENDOR = gentoo ++VERSION = 0.6.0 ++ZFS_CONFIG = all ++ZFS_META_ALIAS = zfs-0.6.0-rc12 ++ZFS_META_AUTHOR = Sun Microsystems/Oracle, Lawrence Livermore National Laboratory ++ZFS_META_DATA = ++ZFS_META_LICENSE = CDDL ++ZFS_META_LT_AGE = ++ZFS_META_LT_CURRENT = ++ZFS_META_LT_REVISION = ++ZFS_META_NAME = zfs ++ZFS_META_RELEASE = rc12 ++ZFS_META_VERSION = 0.6.0 ++ZLIB = -lz ++abs_builddir = /root/zfs-0.6.0-rc12/include/sys/fm ++abs_srcdir = /root/zfs-0.6.0-rc12/include/sys/fm ++abs_top_builddir = /root/zfs-0.6.0-rc12 ++abs_top_srcdir = /root/zfs-0.6.0-rc12 ++ac_ct_AR = ar ++ac_ct_CC = gcc ++ac_ct_DUMPBIN = ++am__include = include ++am__leading_dot = . ++am__quote = ++am__tar = $${TAR-tar} chof - "$$tardir" ++am__untar = $${TAR-tar} xf - ++bindir = ${exec_prefix}/bin ++build = x86_64-unknown-linux-gnu ++build_alias = ++build_cpu = x86_64 ++build_os = linux-gnu ++build_vendor = unknown ++builddir = . 
++datadir = ${datarootdir} ++datarootdir = ${prefix}/share ++docdir = ${datarootdir}/doc/${PACKAGE} ++dvidir = ${docdir} ++exec_prefix = ${prefix} ++host = x86_64-unknown-linux-gnu ++host_alias = ++host_cpu = x86_64 ++host_os = linux-gnu ++host_vendor = unknown ++htmldir = ${docdir} ++includedir = ${prefix}/include ++infodir = ${datarootdir}/info ++install_sh = ${SHELL} /root/zfs-0.6.0-rc12/config/install-sh ++libdir = ${exec_prefix}/lib ++libexecdir = ${exec_prefix}/libexec ++localedir = ${datarootdir}/locale ++localstatedir = ${prefix}/var ++mandir = ${datarootdir}/man ++mkdir_p = /bin/mkdir -p ++oldincludedir = /usr/include ++pdfdir = ${docdir} ++prefix = /usr/local ++program_transform_name = s,x,x, ++psdir = ${docdir} ++sbindir = ${exec_prefix}/sbin ++sharedstatedir = ${prefix}/com ++srcdir = . ++sysconfdir = ${prefix}/etc ++target = x86_64-unknown-linux-gnu ++target_alias = ++target_cpu = x86_64 ++target_os = linux-gnu ++target_vendor = unknown ++top_build_prefix = ../../../ ++top_builddir = ../../.. ++top_srcdir = ../../.. ++udevdir = ${exec_prefix}/lib/udev ++udevruledir = ${udevdir}/rules.d ++SUBDIRS = fs ++COMMON_H = \ ++ $(top_srcdir)/include/sys/fm/protocol.h \ ++ $(top_srcdir)/include/sys/fm/util.h ++ ++KERNEL_H = ++USER_H = ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++libzfsdir = $(includedir)/libzfs/sys/fm ++libzfs_HEADERS = $(COMMON_H) $(USER_H) ++#kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/sys/fm ++#kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++all: all-recursive ++ ++.SUFFIXES: ++$(srcdir)/Makefile.in: # $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/sys/fm/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/sys/fm/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' 
in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: # $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): # $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++install-kernelHEADERS: $(kernel_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(kerneldir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(kerneldir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(kerneldir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(kerneldir)" || exit $$?; \ ++ done ++ ++uninstall-kernelHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(kerneldir)'; $(am__uninstall_files_from_dir) ++install-libzfsHEADERS: $(libzfs_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(libzfsdir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(libzfsdir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libzfsdir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(libzfsdir)" || exit $$?; \ ++ done ++ ++uninstall-libzfsHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(libzfsdir)'; $(am__uninstall_files_from_dir) ++ ++# This directory's subdirectories are mostly independent; you can cd ++# into them and run `make' without going through this Makefile. ++# To change the values of `make' variables: instead of editing Makefiles, ++# (1) if the variable is set in `config.status', edit `config.status' ++# (which will cause the Makefiles to be regenerated when you run `make'); ++# (2) otherwise, pass the desired values on the `make' command line. 
++$(RECURSIVE_TARGETS): ++ @fail= failcom='exit 1'; \ ++ for f in x $$MAKEFLAGS; do \ ++ case $$f in \ ++ *=* | --[!k]*);; \ ++ *k*) failcom='fail=yes';; \ ++ esac; \ ++ done; \ ++ dot_seen=no; \ ++ target=`echo $@ | sed s/-recursive//`; \ ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ echo "Making $$target in $$subdir"; \ ++ if test "$$subdir" = "."; then \ ++ dot_seen=yes; \ ++ local_target="$$target-am"; \ ++ else \ ++ local_target="$$target"; \ ++ fi; \ ++ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ ++ || eval $$failcom; \ ++ done; \ ++ if test "$$dot_seen" = "no"; then \ ++ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ ++ fi; test -z "$$fail" ++ ++$(RECURSIVE_CLEAN_TARGETS): ++ @fail= failcom='exit 1'; \ ++ for f in x $$MAKEFLAGS; do \ ++ case $$f in \ ++ *=* | --[!k]*);; \ ++ *k*) failcom='fail=yes';; \ ++ esac; \ ++ done; \ ++ dot_seen=no; \ ++ case "$@" in \ ++ distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ ++ *) list='$(SUBDIRS)' ;; \ ++ esac; \ ++ rev=''; for subdir in $$list; do \ ++ if test "$$subdir" = "."; then :; else \ ++ rev="$$subdir $$rev"; \ ++ fi; \ ++ done; \ ++ rev="$$rev ."; \ ++ target=`echo $@ | sed s/-recursive//`; \ ++ for subdir in $$rev; do \ ++ echo "Making $$target in $$subdir"; \ ++ if test "$$subdir" = "."; then \ ++ local_target="$$target-am"; \ ++ else \ ++ local_target="$$target"; \ ++ fi; \ ++ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ ++ || eval $$failcom; \ ++ done && test -z "$$fail" ++tags-recursive: ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ ++ done ++ctags-recursive: ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ ++ done ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ ++ include_option=--etags-include; \ ++ empty_fix=.; \ ++ else \ ++ include_option=--include; \ ++ empty_fix=; \ ++ fi; \ ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ if test "$$subdir" = .; then :; else \ ++ test ! 
-f $$subdir/TAGS || \ ++ set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ ++ fi; \ ++ done; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: CTAGS ++CTAGS: ctags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++ @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ ++ if test "$$subdir" = .; then :; else \ ++ $(am__make_dryrun) \ ++ || test -d "$(distdir)/$$subdir" \ ++ || $(MKDIR_P) "$(distdir)/$$subdir" \ ++ || exit 1; \ ++ dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ ++ $(am__relativize); \ ++ new_distdir=$$reldir; \ ++ dir1=$$subdir; dir2="$(top_distdir)"; \ ++ $(am__relativize); \ ++ new_top_distdir=$$reldir; \ ++ echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ ++ echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ ++ ($(am__cd) $$subdir && \ ++ $(MAKE) $(AM_MAKEFLAGS) \ ++ top_distdir="$$new_top_distdir" \ ++ distdir="$$new_distdir" \ ++ am__remove_distdir=: \ ++ am__skip_length_check=: \ ++ am__skip_mode_fix=: \ ++ distdir) \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-recursive ++all-am: Makefile $(HEADERS) ++installdirs: installdirs-recursive ++installdirs-am: ++ for dir in "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)"; do \ ++ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ ++ done ++install: install-recursive ++install-exec: install-exec-recursive ++install-data: install-data-recursive ++uninstall: uninstall-recursive ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-recursive ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." 
++clean: clean-recursive ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-recursive ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-recursive ++ ++dvi-am: ++ ++html: html-recursive ++ ++html-am: ++ ++info: info-recursive ++ ++info-am: ++ ++install-data-am: install-kernelHEADERS install-libzfsHEADERS ++ ++install-dvi: install-dvi-recursive ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-recursive ++ ++install-html-am: ++ ++install-info: install-info-recursive ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-recursive ++ ++install-pdf-am: ++ ++install-ps: install-ps-recursive ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-recursive ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-recursive ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-recursive ++ ++pdf-am: ++ ++ps: ps-recursive ++ ++ps-am: ++ ++uninstall-am: uninstall-kernelHEADERS uninstall-libzfsHEADERS ++ ++.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \ ++ install-am install-strip tags-recursive ++ ++.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \ ++ all all-am check check-am clean clean-generic clean-libtool \ ++ ctags ctags-recursive distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-dvi install-dvi-am install-exec \ ++ install-exec-am install-html install-html-am install-info \ ++ install-info-am install-kernelHEADERS install-libzfsHEADERS \ ++ install-man install-pdf install-pdf-am install-ps \ ++ install-ps-am install-strip installcheck installcheck-am \ ++ installdirs installdirs-am maintainer-clean \ ++ maintainer-clean-generic mostlyclean mostlyclean-generic \ ++ mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \ ++ uninstall uninstall-am uninstall-kernelHEADERS \ ++ uninstall-libzfsHEADERS ++ ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. ++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/fm/Makefile.am linux-3.2.33-go/include/zfs/sys/fm/Makefile.am +--- linux-3.2.33-go.orig/include/zfs/sys/fm/Makefile.am 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/fm/Makefile.am 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,21 @@ ++SUBDIRS = fs ++ ++COMMON_H = \ ++ $(top_srcdir)/include/sys/fm/protocol.h \ ++ $(top_srcdir)/include/sys/fm/util.h ++ ++KERNEL_H = ++ ++USER_H = ++ ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++ ++if CONFIG_USER ++libzfsdir = $(includedir)/libzfs/sys/fm ++libzfs_HEADERS = $(COMMON_H) $(USER_H) ++endif ++ ++if CONFIG_KERNEL ++kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/sys/fm ++kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++endif +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/fm/Makefile.in linux-3.2.33-go/include/zfs/sys/fm/Makefile.in +--- linux-3.2.33-go.orig/include/zfs/sys/fm/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/fm/Makefile.in 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,813 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. 
++# @configure_input@ ++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. ++ ++@SET_MAKE@ ++ ++VPATH = @srcdir@ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/@PACKAGE@ ++pkgincludedir = $(includedir)/@PACKAGE@ ++pkglibdir = $(libdir)/@PACKAGE@ ++pkglibexecdir = $(libexecdir)/@PACKAGE@ ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = @build@ ++host_triplet = @host@ ++target_triplet = @target@ ++subdir = include/sys/fm ++DIST_COMMON = $(am__kernel_HEADERS_DIST) $(am__libzfs_HEADERS_DIST) \ ++ $(srcdir)/Makefile.am $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = \ ++ $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \ ++ $(top_srcdir)/config/kernel-automount.m4 \ ++ $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \ ++ $(top_srcdir)/config/kernel-bdev-logical-size.m4 \ ++ $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \ ++ $(top_srcdir)/config/kernel-bdi.m4 \ ++ $(top_srcdir)/config/kernel-bio-empty-barrier.m4 \ ++ $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \ ++ $(top_srcdir)/config/kernel-bio-failfast.m4 \ ++ $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \ ++ $(top_srcdir)/config/kernel-blk-end-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-fetch-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-discard.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-flush.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \ ++ $(top_srcdir)/config/kernel-blk-requeue-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-pos.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get.m4 \ ++ $(top_srcdir)/config/kernel-check-disk-size-change.m4 \ ++ $(top_srcdir)/config/kernel-clear-inode.m4 \ ++ $(top_srcdir)/config/kernel-commit-metadata.m4 \ ++ $(top_srcdir)/config/kernel-create-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-d-make-root.m4 \ ++ $(top_srcdir)/config/kernel-d-obtain-alias.m4 \ ++ 
$(top_srcdir)/config/kernel-discard-granularity.m4 \ ++ $(top_srcdir)/config/kernel-elevator-change.m4 \ ++ $(top_srcdir)/config/kernel-encode-fh-inode.m4 \ ++ $(top_srcdir)/config/kernel-evict-inode.m4 \ ++ $(top_srcdir)/config/kernel-fallocate.m4 \ ++ $(top_srcdir)/config/kernel-fmode-t.m4 \ ++ $(top_srcdir)/config/kernel-fsync.m4 \ ++ $(top_srcdir)/config/kernel-get-disk-ro.m4 \ ++ $(top_srcdir)/config/kernel-get-gendisk.m4 \ ++ $(top_srcdir)/config/kernel-insert-inode-locked.m4 \ ++ $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \ ++ $(top_srcdir)/config/kernel-kobj-name-len.m4 \ ++ $(top_srcdir)/config/kernel-lookup-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \ ++ $(top_srcdir)/config/kernel-mount-nodev.m4 \ ++ $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \ ++ $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \ ++ $(top_srcdir)/config/kernel-rq-is_sync.m4 \ ++ $(top_srcdir)/config/kernel-security-inode-init.m4 \ ++ $(top_srcdir)/config/kernel-set-nlink.m4 \ ++ $(top_srcdir)/config/kernel-sget-args.m4 \ ++ $(top_srcdir)/config/kernel-show-options.m4 \ ++ $(top_srcdir)/config/kernel-shrink.m4 \ ++ $(top_srcdir)/config/kernel-truncate-range.m4 \ ++ $(top_srcdir)/config/kernel-truncate-setsize.m4 \ ++ $(top_srcdir)/config/kernel-xattr-handler.m4 \ ++ $(top_srcdir)/config/kernel.m4 \ ++ $(top_srcdir)/config/user-arch.m4 \ ++ $(top_srcdir)/config/user-frame-larger-than.m4 \ ++ $(top_srcdir)/config/user-ioctl.m4 \ ++ $(top_srcdir)/config/user-libblkid.m4 \ ++ $(top_srcdir)/config/user-libuuid.m4 \ ++ $(top_srcdir)/config/user-nptl_guard_within_stack.m4 \ ++ $(top_srcdir)/config/user-selinux.m4 \ ++ $(top_srcdir)/config/user-udev.m4 \ ++ $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \ ++ $(top_srcdir)/config/zfs-build.m4 \ ++ $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/zfs_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_@AM_V@) ++am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_@AM_V@) ++am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ ++ html-recursive info-recursive install-data-recursive \ ++ install-dvi-recursive install-exec-recursive \ ++ install-html-recursive install-info-recursive \ ++ install-pdf-recursive install-ps-recursive install-recursive \ ++ installcheck-recursive installdirs-recursive pdf-recursive \ ++ ps-recursive uninstall-recursive ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++am__kernel_HEADERS_DIST = $(top_srcdir)/include/sys/fm/protocol.h \ ++ $(top_srcdir)/include/sys/fm/util.h ++am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; ++am__vpath_adj = case $$p in \ ++ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ ++ *) f=$$p;; \ ++ esac; ++am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; ++am__install_max = 40 ++am__nobase_strip_setup = \ ++ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` ++am__nobase_strip = \ ++ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" ++am__nobase_list = $(am__nobase_strip_setup); \ ++ for p in $$list; do echo "$$p $$p"; done | \ ++ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ 
.*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ ++ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ ++ if (++n[$$2] == $(am__install_max)) \ ++ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ ++ END { for (dir in files) print dir, files[dir] }' ++am__base_list = \ ++ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ ++ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' ++am__uninstall_files_from_dir = { \ ++ test -z "$$files" \ ++ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ ++ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ ++ $(am__cd) "$$dir" && rm -f $$files; }; \ ++ } ++am__installdirs = "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)" ++am__libzfs_HEADERS_DIST = $(top_srcdir)/include/sys/fm/protocol.h \ ++ $(top_srcdir)/include/sys/fm/util.h ++HEADERS = $(kernel_HEADERS) $(libzfs_HEADERS) ++RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ ++ distclean-recursive maintainer-clean-recursive ++AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \ ++ $(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \ ++ distdir ++ETAGS = etags ++CTAGS = ctags ++DIST_SUBDIRS = $(SUBDIRS) ++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++am__relativize = \ ++ dir0=`pwd`; \ ++ sed_first='s,^\([^/]*\)/.*$$,\1,'; \ ++ sed_rest='s,^[^/]*/*,,'; \ ++ sed_last='s,^.*/\([^/]*\)$$,\1,'; \ ++ sed_butlast='s,/*[^/]*$$,,'; \ ++ while test -n "$$dir1"; do \ ++ first=`echo "$$dir1" | sed -e "$$sed_first"`; \ ++ if test "$$first" != "."; then \ ++ if test "$$first" = ".."; then \ ++ dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ ++ dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ ++ else \ ++ first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ ++ if test "$$first2" = "$$first"; then \ ++ dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ ++ else \ ++ dir2="../$$dir2"; \ ++ fi; \ ++ dir0="$$dir0"/"$$first"; \ ++ fi; \ ++ fi; \ ++ dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ ++ done; \ ++ reldir="$$dir2" ++ACLOCAL = @ACLOCAL@ ++ALIEN = @ALIEN@ ++ALIEN_VERSION = @ALIEN_VERSION@ ++AMTAR = @AMTAR@ ++AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ++AR = @AR@ ++AUTOCONF = @AUTOCONF@ ++AUTOHEADER = @AUTOHEADER@ ++AUTOMAKE = @AUTOMAKE@ ++AWK = @AWK@ ++CC = @CC@ ++CCAS = @CCAS@ ++CCASDEPMODE = @CCASDEPMODE@ ++CCASFLAGS = @CCASFLAGS@ ++CCDEPMODE = @CCDEPMODE@ ++CFLAGS = @CFLAGS@ ++CPP = @CPP@ ++CPPFLAGS = @CPPFLAGS@ ++CYGPATH_W = @CYGPATH_W@ ++DEBUG_CFLAGS = @DEBUG_CFLAGS@ ++DEBUG_DMU_TX = @DEBUG_DMU_TX@ ++DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@ ++DEBUG_ZFS = @DEBUG_ZFS@ ++DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@ ++DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@ ++DEFAULT_PACKAGE = @DEFAULT_PACKAGE@ ++DEFS = @DEFS@ ++DEPDIR = @DEPDIR@ ++DLLTOOL = @DLLTOOL@ ++DPKG = @DPKG@ ++DPKGBUILD = @DPKGBUILD@ ++DPKGBUILD_VERSION = @DPKGBUILD_VERSION@ ++DPKG_VERSION = @DPKG_VERSION@ ++DSYMUTIL = @DSYMUTIL@ ++DUMPBIN = @DUMPBIN@ ++ECHO_C = @ECHO_C@ ++ECHO_N = @ECHO_N@ ++ECHO_T = @ECHO_T@ ++EGREP = @EGREP@ ++EXEEXT = @EXEEXT@ ++FGREP = @FGREP@ ++FRAME_LARGER_THAN = @FRAME_LARGER_THAN@ ++GREP = @GREP@ ++HAVE_ALIEN = @HAVE_ALIEN@ ++HAVE_DPKG = @HAVE_DPKG@ ++HAVE_DPKGBUILD = @HAVE_DPKGBUILD@ ++HAVE_MAKEPKG = @HAVE_MAKEPKG@ ++HAVE_PACMAN = @HAVE_PACMAN@ ++HAVE_RPM = @HAVE_RPM@ ++HAVE_RPMBUILD = @HAVE_RPMBUILD@ ++INSTALL = @INSTALL@ ++INSTALL_DATA = @INSTALL_DATA@ ++INSTALL_PROGRAM = @INSTALL_PROGRAM@ ++INSTALL_SCRIPT = @INSTALL_SCRIPT@ ++INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ ++KERNELCPPFLAGS = @KERNELCPPFLAGS@ ++KERNELMAKE_PARAMS = @KERNELMAKE_PARAMS@ ++LD = 
@LD@ ++LDFLAGS = @LDFLAGS@ ++LIBBLKID = @LIBBLKID@ ++LIBOBJS = @LIBOBJS@ ++LIBS = @LIBS@ ++LIBSELINUX = @LIBSELINUX@ ++LIBTOOL = @LIBTOOL@ ++LIBUUID = @LIBUUID@ ++LINUX = @LINUX@ ++LINUX_OBJ = @LINUX_OBJ@ ++LINUX_SYMBOLS = @LINUX_SYMBOLS@ ++LINUX_VERSION = @LINUX_VERSION@ ++LIPO = @LIPO@ ++LN_S = @LN_S@ ++LTLIBOBJS = @LTLIBOBJS@ ++MAINT = @MAINT@ ++MAKEINFO = @MAKEINFO@ ++MAKEPKG = @MAKEPKG@ ++MAKEPKG_VERSION = @MAKEPKG_VERSION@ ++MANIFEST_TOOL = @MANIFEST_TOOL@ ++MKDIR_P = @MKDIR_P@ ++NM = @NM@ ++NMEDIT = @NMEDIT@ ++NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@ ++OBJDUMP = @OBJDUMP@ ++OBJEXT = @OBJEXT@ ++OTOOL = @OTOOL@ ++OTOOL64 = @OTOOL64@ ++PACKAGE = @PACKAGE@ ++PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ ++PACKAGE_NAME = @PACKAGE_NAME@ ++PACKAGE_STRING = @PACKAGE_STRING@ ++PACKAGE_TARNAME = @PACKAGE_TARNAME@ ++PACKAGE_URL = @PACKAGE_URL@ ++PACKAGE_VERSION = @PACKAGE_VERSION@ ++PACMAN = @PACMAN@ ++PACMAN_VERSION = @PACMAN_VERSION@ ++PATH_SEPARATOR = @PATH_SEPARATOR@ ++RANLIB = @RANLIB@ ++RPM = @RPM@ ++RPMBUILD = @RPMBUILD@ ++RPMBUILD_VERSION = @RPMBUILD_VERSION@ ++RPM_VERSION = @RPM_VERSION@ ++SED = @SED@ ++SET_MAKE = @SET_MAKE@ ++SHELL = @SHELL@ ++SPL = @SPL@ ++SPL_OBJ = @SPL_OBJ@ ++SPL_SYMBOLS = @SPL_SYMBOLS@ ++SPL_VERSION = @SPL_VERSION@ ++STRIP = @STRIP@ ++TARGET_ASM_DIR = @TARGET_ASM_DIR@ ++VENDOR = @VENDOR@ ++VERSION = @VERSION@ ++ZFS_CONFIG = @ZFS_CONFIG@ ++ZFS_META_ALIAS = @ZFS_META_ALIAS@ ++ZFS_META_AUTHOR = @ZFS_META_AUTHOR@ ++ZFS_META_DATA = @ZFS_META_DATA@ ++ZFS_META_LICENSE = @ZFS_META_LICENSE@ ++ZFS_META_LT_AGE = @ZFS_META_LT_AGE@ ++ZFS_META_LT_CURRENT = @ZFS_META_LT_CURRENT@ ++ZFS_META_LT_REVISION = @ZFS_META_LT_REVISION@ ++ZFS_META_NAME = @ZFS_META_NAME@ ++ZFS_META_RELEASE = @ZFS_META_RELEASE@ ++ZFS_META_VERSION = @ZFS_META_VERSION@ ++ZLIB = @ZLIB@ ++abs_builddir = @abs_builddir@ ++abs_srcdir = @abs_srcdir@ ++abs_top_builddir = @abs_top_builddir@ ++abs_top_srcdir = @abs_top_srcdir@ ++ac_ct_AR = @ac_ct_AR@ ++ac_ct_CC = @ac_ct_CC@ ++ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ ++am__include = @am__include@ ++am__leading_dot = @am__leading_dot@ ++am__quote = @am__quote@ ++am__tar = @am__tar@ ++am__untar = @am__untar@ ++bindir = @bindir@ ++build = @build@ ++build_alias = @build_alias@ ++build_cpu = @build_cpu@ ++build_os = @build_os@ ++build_vendor = @build_vendor@ ++builddir = @builddir@ ++datadir = @datadir@ ++datarootdir = @datarootdir@ ++docdir = @docdir@ ++dvidir = @dvidir@ ++exec_prefix = @exec_prefix@ ++host = @host@ ++host_alias = @host_alias@ ++host_cpu = @host_cpu@ ++host_os = @host_os@ ++host_vendor = @host_vendor@ ++htmldir = @htmldir@ ++includedir = @includedir@ ++infodir = @infodir@ ++install_sh = @install_sh@ ++libdir = @libdir@ ++libexecdir = @libexecdir@ ++localedir = @localedir@ ++localstatedir = @localstatedir@ ++mandir = @mandir@ ++mkdir_p = @mkdir_p@ ++oldincludedir = @oldincludedir@ ++pdfdir = @pdfdir@ ++prefix = @prefix@ ++program_transform_name = @program_transform_name@ ++psdir = @psdir@ ++sbindir = @sbindir@ ++sharedstatedir = @sharedstatedir@ ++srcdir = @srcdir@ ++sysconfdir = @sysconfdir@ ++target = @target@ ++target_alias = @target_alias@ ++target_cpu = @target_cpu@ ++target_os = @target_os@ ++target_vendor = @target_vendor@ ++top_build_prefix = @top_build_prefix@ ++top_builddir = @top_builddir@ ++top_srcdir = @top_srcdir@ ++udevdir = @udevdir@ ++udevruledir = @udevruledir@ ++SUBDIRS = fs ++COMMON_H = \ ++ $(top_srcdir)/include/sys/fm/protocol.h \ ++ $(top_srcdir)/include/sys/fm/util.h ++ ++KERNEL_H = ++USER_H = ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) 
$(USER_H) ++@CONFIG_USER_TRUE@libzfsdir = $(includedir)/libzfs/sys/fm ++@CONFIG_USER_TRUE@libzfs_HEADERS = $(COMMON_H) $(USER_H) ++@CONFIG_KERNEL_TRUE@kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/sys/fm ++@CONFIG_KERNEL_TRUE@kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++all: all-recursive ++ ++.SUFFIXES: ++$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/sys/fm/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/sys/fm/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++install-kernelHEADERS: $(kernel_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(kerneldir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(kerneldir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(kerneldir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(kerneldir)" || exit $$?; \ ++ done ++ ++uninstall-kernelHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(kerneldir)'; $(am__uninstall_files_from_dir) ++install-libzfsHEADERS: $(libzfs_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(libzfsdir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(libzfsdir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libzfsdir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(libzfsdir)" || exit $$?; \ ++ done ++ ++uninstall-libzfsHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(libzfsdir)'; $(am__uninstall_files_from_dir) ++ ++# This directory's subdirectories are mostly independent; you can cd ++# into them and run `make' without going through this Makefile. 
++# To change the values of `make' variables: instead of editing Makefiles, ++# (1) if the variable is set in `config.status', edit `config.status' ++# (which will cause the Makefiles to be regenerated when you run `make'); ++# (2) otherwise, pass the desired values on the `make' command line. ++$(RECURSIVE_TARGETS): ++ @fail= failcom='exit 1'; \ ++ for f in x $$MAKEFLAGS; do \ ++ case $$f in \ ++ *=* | --[!k]*);; \ ++ *k*) failcom='fail=yes';; \ ++ esac; \ ++ done; \ ++ dot_seen=no; \ ++ target=`echo $@ | sed s/-recursive//`; \ ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ echo "Making $$target in $$subdir"; \ ++ if test "$$subdir" = "."; then \ ++ dot_seen=yes; \ ++ local_target="$$target-am"; \ ++ else \ ++ local_target="$$target"; \ ++ fi; \ ++ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ ++ || eval $$failcom; \ ++ done; \ ++ if test "$$dot_seen" = "no"; then \ ++ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ ++ fi; test -z "$$fail" ++ ++$(RECURSIVE_CLEAN_TARGETS): ++ @fail= failcom='exit 1'; \ ++ for f in x $$MAKEFLAGS; do \ ++ case $$f in \ ++ *=* | --[!k]*);; \ ++ *k*) failcom='fail=yes';; \ ++ esac; \ ++ done; \ ++ dot_seen=no; \ ++ case "$@" in \ ++ distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ ++ *) list='$(SUBDIRS)' ;; \ ++ esac; \ ++ rev=''; for subdir in $$list; do \ ++ if test "$$subdir" = "."; then :; else \ ++ rev="$$subdir $$rev"; \ ++ fi; \ ++ done; \ ++ rev="$$rev ."; \ ++ target=`echo $@ | sed s/-recursive//`; \ ++ for subdir in $$rev; do \ ++ echo "Making $$target in $$subdir"; \ ++ if test "$$subdir" = "."; then \ ++ local_target="$$target-am"; \ ++ else \ ++ local_target="$$target"; \ ++ fi; \ ++ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ ++ || eval $$failcom; \ ++ done && test -z "$$fail" ++tags-recursive: ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ ++ done ++ctags-recursive: ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ ++ done ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ ++ include_option=--etags-include; \ ++ empty_fix=.; \ ++ else \ ++ include_option=--include; \ ++ empty_fix=; \ ++ fi; \ ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ if test "$$subdir" = .; then :; else \ ++ test ! 
-f $$subdir/TAGS || \ ++ set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ ++ fi; \ ++ done; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: CTAGS ++CTAGS: ctags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++ @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ ++ if test "$$subdir" = .; then :; else \ ++ $(am__make_dryrun) \ ++ || test -d "$(distdir)/$$subdir" \ ++ || $(MKDIR_P) "$(distdir)/$$subdir" \ ++ || exit 1; \ ++ dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ ++ $(am__relativize); \ ++ new_distdir=$$reldir; \ ++ dir1=$$subdir; dir2="$(top_distdir)"; \ ++ $(am__relativize); \ ++ new_top_distdir=$$reldir; \ ++ echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ ++ echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ ++ ($(am__cd) $$subdir && \ ++ $(MAKE) $(AM_MAKEFLAGS) \ ++ top_distdir="$$new_top_distdir" \ ++ distdir="$$new_distdir" \ ++ am__remove_distdir=: \ ++ am__skip_length_check=: \ ++ am__skip_mode_fix=: \ ++ distdir) \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-recursive ++all-am: Makefile $(HEADERS) ++installdirs: installdirs-recursive ++installdirs-am: ++ for dir in "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)"; do \ ++ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ ++ done ++install: install-recursive ++install-exec: install-exec-recursive ++install-data: install-data-recursive ++uninstall: uninstall-recursive ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-recursive ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." 
++clean: clean-recursive ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-recursive ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-recursive ++ ++dvi-am: ++ ++html: html-recursive ++ ++html-am: ++ ++info: info-recursive ++ ++info-am: ++ ++install-data-am: install-kernelHEADERS install-libzfsHEADERS ++ ++install-dvi: install-dvi-recursive ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-recursive ++ ++install-html-am: ++ ++install-info: install-info-recursive ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-recursive ++ ++install-pdf-am: ++ ++install-ps: install-ps-recursive ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-recursive ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-recursive ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-recursive ++ ++pdf-am: ++ ++ps: ps-recursive ++ ++ps-am: ++ ++uninstall-am: uninstall-kernelHEADERS uninstall-libzfsHEADERS ++ ++.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \ ++ install-am install-strip tags-recursive ++ ++.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \ ++ all all-am check check-am clean clean-generic clean-libtool \ ++ ctags ctags-recursive distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-dvi install-dvi-am install-exec \ ++ install-exec-am install-html install-html-am install-info \ ++ install-info-am install-kernelHEADERS install-libzfsHEADERS \ ++ install-man install-pdf install-pdf-am install-ps \ ++ install-ps-am install-strip installcheck installcheck-am \ ++ installdirs installdirs-am maintainer-clean \ ++ maintainer-clean-generic mostlyclean mostlyclean-generic \ ++ mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \ ++ uninstall uninstall-am uninstall-kernelHEADERS \ ++ uninstall-libzfsHEADERS ++ ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. ++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/fm/protocol.h linux-3.2.33-go/include/zfs/sys/fm/protocol.h +--- linux-3.2.33-go.orig/include/zfs/sys/fm/protocol.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/fm/protocol.h 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,367 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#ifndef _SYS_FM_PROTOCOL_H ++#define _SYS_FM_PROTOCOL_H ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#ifdef _KERNEL ++#include ++#include ++#else ++#include ++#include ++#endif ++#include ++ ++/* FM common member names */ ++#define FM_CLASS "class" ++#define FM_VERSION "version" ++ ++/* FM protocol category 1 class names */ ++#define FM_EREPORT_CLASS "ereport" ++#define FM_FAULT_CLASS "fault" ++#define FM_DEFECT_CLASS "defect" ++#define FM_RSRC_CLASS "resource" ++#define FM_LIST_EVENT "list" ++#define FM_IREPORT_CLASS "ireport" ++ ++/* FM list.* event class values */ ++#define FM_LIST_SUSPECT_CLASS FM_LIST_EVENT ".suspect" ++#define FM_LIST_ISOLATED_CLASS FM_LIST_EVENT ".isolated" ++#define FM_LIST_REPAIRED_CLASS FM_LIST_EVENT ".repaired" ++#define FM_LIST_UPDATED_CLASS FM_LIST_EVENT ".updated" ++#define FM_LIST_RESOLVED_CLASS FM_LIST_EVENT ".resolved" ++ ++/* ereport class subcategory values */ ++#define FM_ERROR_CPU "cpu" ++#define FM_ERROR_IO "io" ++ ++/* ereport version and payload member names */ ++#define FM_EREPORT_VERS0 0 ++#define FM_EREPORT_VERSION FM_EREPORT_VERS0 ++ ++/* ereport payload member names */ ++#define FM_EREPORT_DETECTOR "detector" ++#define FM_EREPORT_ENA "ena" ++#define FM_EREPORT_TIME "time" ++ ++/* list.* event payload member names */ ++#define FM_LIST_EVENT_SIZE "list-sz" ++ ++/* ireport.* event payload member names */ ++#define FM_IREPORT_DETECTOR "detector" ++#define FM_IREPORT_UUID "uuid" ++#define FM_IREPORT_PRIORITY "pri" ++#define FM_IREPORT_ATTRIBUTES "attr" ++ ++/* ++ * list.suspect, isolated, updated, repaired and resolved ++ * versions/payload member names. ++ */ ++#define FM_SUSPECT_UUID "uuid" ++#define FM_SUSPECT_DIAG_CODE "code" ++#define FM_SUSPECT_DIAG_TIME "diag-time" ++#define FM_SUSPECT_DE "de" ++#define FM_SUSPECT_FAULT_LIST "fault-list" ++#define FM_SUSPECT_FAULT_SZ "fault-list-sz" ++#define FM_SUSPECT_FAULT_STATUS "fault-status" ++#define FM_SUSPECT_INJECTED "__injected" ++#define FM_SUSPECT_MESSAGE "message" ++#define FM_SUSPECT_RETIRE "retire" ++#define FM_SUSPECT_RESPONSE "response" ++#define FM_SUSPECT_SEVERITY "severity" ++ ++#define FM_SUSPECT_VERS0 0 ++#define FM_SUSPECT_VERSION FM_SUSPECT_VERS0 ++ ++#define FM_SUSPECT_FAULTY 0x1 ++#define FM_SUSPECT_UNUSABLE 0x2 ++#define FM_SUSPECT_NOT_PRESENT 0x4 ++#define FM_SUSPECT_DEGRADED 0x8 ++#define FM_SUSPECT_REPAIRED 0x10 ++#define FM_SUSPECT_REPLACED 0x20 ++#define FM_SUSPECT_ACQUITTED 0x40 ++ ++/* fault event versions and payload member names */ ++#define FM_FAULT_VERS0 0 ++#define FM_FAULT_VERSION FM_FAULT_VERS0 ++ ++#define FM_FAULT_ASRU "asru" ++#define FM_FAULT_FRU "fru" ++#define FM_FAULT_FRU_LABEL "fru-label" ++#define FM_FAULT_CERTAINTY "certainty" ++#define FM_FAULT_RESOURCE "resource" ++#define FM_FAULT_LOCATION "location" ++ ++/* resource event versions and payload member names */ ++#define FM_RSRC_VERS0 0 ++#define FM_RSRC_VERSION FM_RSRC_VERS0 ++#define FM_RSRC_RESOURCE "resource" ++ ++/* resource.fm.asru.* payload member names */ ++#define FM_RSRC_ASRU_UUID "uuid" ++#define FM_RSRC_ASRU_CODE "code" ++#define FM_RSRC_ASRU_FAULTY "faulty" ++#define FM_RSRC_ASRU_REPAIRED "repaired" ++#define FM_RSRC_ASRU_REPLACED "replaced" ++#define FM_RSRC_ASRU_ACQUITTED "acquitted" ++#define FM_RSRC_ASRU_RESOLVED "resolved" ++#define FM_RSRC_ASRU_UNUSABLE "unusable" ++#define FM_RSRC_ASRU_EVENT "event" ++ ++/* resource.fm.xprt.* versions and payload member names */ ++#define FM_RSRC_XPRT_VERS0 0 ++#define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0 ++#define FM_RSRC_XPRT_UUID 
"uuid" ++#define FM_RSRC_XPRT_SUBCLASS "subclass" ++#define FM_RSRC_XPRT_FAULT_STATUS "fault-status" ++#define FM_RSRC_XPRT_FAULT_HAS_ASRU "fault-has-asru" ++ ++/* ++ * FM ENA Format Macros ++ */ ++#define ENA_FORMAT_MASK 0x3 ++#define ENA_FORMAT(ena) ((ena) & ENA_FORMAT_MASK) ++ ++/* ENA format types */ ++#define FM_ENA_FMT0 0 ++#define FM_ENA_FMT1 1 ++#define FM_ENA_FMT2 2 ++ ++/* Format 1 */ ++#define ENA_FMT1_GEN_MASK 0x00000000000003FCull ++#define ENA_FMT1_ID_MASK 0xFFFFFFFFFFFFFC00ull ++#define ENA_FMT1_CPUID_MASK 0x00000000000FFC00ull ++#define ENA_FMT1_TIME_MASK 0xFFFFFFFFFFF00000ull ++#define ENA_FMT1_GEN_SHFT 2 ++#define ENA_FMT1_ID_SHFT 10 ++#define ENA_FMT1_CPUID_SHFT ENA_FMT1_ID_SHFT ++#define ENA_FMT1_TIME_SHFT 20 ++ ++/* Format 2 */ ++#define ENA_FMT2_GEN_MASK 0x00000000000003FCull ++#define ENA_FMT2_ID_MASK 0xFFFFFFFFFFFFFC00ull ++#define ENA_FMT2_TIME_MASK ENA_FMT2_ID_MASK ++#define ENA_FMT2_GEN_SHFT 2 ++#define ENA_FMT2_ID_SHFT 10 ++#define ENA_FMT2_TIME_SHFT ENA_FMT2_ID_SHFT ++ ++/* Common FMRI type names */ ++#define FM_FMRI_AUTHORITY "authority" ++#define FM_FMRI_SCHEME "scheme" ++#define FM_FMRI_SVC_AUTHORITY "svc-authority" ++#define FM_FMRI_FACILITY "facility" ++ ++/* FMRI authority-type member names */ ++#define FM_FMRI_AUTH_CHASSIS "chassis-id" ++#define FM_FMRI_AUTH_PRODUCT_SN "product-sn" ++#define FM_FMRI_AUTH_PRODUCT "product-id" ++#define FM_FMRI_AUTH_DOMAIN "domain-id" ++#define FM_FMRI_AUTH_SERVER "server-id" ++#define FM_FMRI_AUTH_HOST "host-id" ++ ++#define FM_AUTH_VERS0 0 ++#define FM_FMRI_AUTH_VERSION FM_AUTH_VERS0 ++ ++/* scheme name values */ ++#define FM_FMRI_SCHEME_FMD "fmd" ++#define FM_FMRI_SCHEME_DEV "dev" ++#define FM_FMRI_SCHEME_HC "hc" ++#define FM_FMRI_SCHEME_SVC "svc" ++#define FM_FMRI_SCHEME_CPU "cpu" ++#define FM_FMRI_SCHEME_MEM "mem" ++#define FM_FMRI_SCHEME_MOD "mod" ++#define FM_FMRI_SCHEME_PKG "pkg" ++#define FM_FMRI_SCHEME_LEGACY "legacy-hc" ++#define FM_FMRI_SCHEME_ZFS "zfs" ++#define FM_FMRI_SCHEME_SW "sw" ++ ++/* Scheme versions */ ++#define FMD_SCHEME_VERSION0 0 ++#define FM_FMD_SCHEME_VERSION FMD_SCHEME_VERSION0 ++#define DEV_SCHEME_VERSION0 0 ++#define FM_DEV_SCHEME_VERSION DEV_SCHEME_VERSION0 ++#define FM_HC_VERS0 0 ++#define FM_HC_SCHEME_VERSION FM_HC_VERS0 ++#define CPU_SCHEME_VERSION0 0 ++#define CPU_SCHEME_VERSION1 1 ++#define FM_CPU_SCHEME_VERSION CPU_SCHEME_VERSION1 ++#define MEM_SCHEME_VERSION0 0 ++#define FM_MEM_SCHEME_VERSION MEM_SCHEME_VERSION0 ++#define MOD_SCHEME_VERSION0 0 ++#define FM_MOD_SCHEME_VERSION MOD_SCHEME_VERSION0 ++#define PKG_SCHEME_VERSION0 0 ++#define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0 ++#define LEGACY_SCHEME_VERSION0 0 ++#define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0 ++#define SVC_SCHEME_VERSION0 0 ++#define FM_SVC_SCHEME_VERSION SVC_SCHEME_VERSION0 ++#define ZFS_SCHEME_VERSION0 0 ++#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0 ++#define SW_SCHEME_VERSION0 0 ++#define FM_SW_SCHEME_VERSION SW_SCHEME_VERSION0 ++ ++/* hc scheme member names */ ++#define FM_FMRI_HC_SERIAL_ID "serial" ++#define FM_FMRI_HC_PART "part" ++#define FM_FMRI_HC_REVISION "revision" ++#define FM_FMRI_HC_ROOT "hc-root" ++#define FM_FMRI_HC_LIST_SZ "hc-list-sz" ++#define FM_FMRI_HC_LIST "hc-list" ++#define FM_FMRI_HC_SPECIFIC "hc-specific" ++ ++/* facility member names */ ++#define FM_FMRI_FACILITY_NAME "facility-name" ++#define FM_FMRI_FACILITY_TYPE "facility-type" ++ ++/* hc-list version and member names */ ++#define FM_FMRI_HC_NAME "hc-name" ++#define FM_FMRI_HC_ID "hc-id" ++ ++#define HC_LIST_VERSION0 0 
++#define FM_HC_LIST_VERSION HC_LIST_VERSION0 ++ ++/* hc-specific member names */ ++#define FM_FMRI_HC_SPECIFIC_OFFSET "offset" ++#define FM_FMRI_HC_SPECIFIC_PHYSADDR "physaddr" ++ ++/* fmd module scheme member names */ ++#define FM_FMRI_FMD_NAME "mod-name" ++#define FM_FMRI_FMD_VERSION "mod-version" ++ ++/* dev scheme member names */ ++#define FM_FMRI_DEV_ID "devid" ++#define FM_FMRI_DEV_TGTPTLUN0 "target-port-l0id" ++#define FM_FMRI_DEV_PATH "device-path" ++ ++/* pkg scheme member names */ ++#define FM_FMRI_PKG_BASEDIR "pkg-basedir" ++#define FM_FMRI_PKG_INST "pkg-inst" ++#define FM_FMRI_PKG_VERSION "pkg-version" ++ ++/* svc scheme member names */ ++#define FM_FMRI_SVC_NAME "svc-name" ++#define FM_FMRI_SVC_INSTANCE "svc-instance" ++#define FM_FMRI_SVC_CONTRACT_ID "svc-contract-id" ++ ++/* svc-authority member names */ ++#define FM_FMRI_SVC_AUTH_SCOPE "scope" ++#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-fqn" ++ ++/* cpu scheme member names */ ++#define FM_FMRI_CPU_ID "cpuid" ++#define FM_FMRI_CPU_SERIAL_ID "serial" ++#define FM_FMRI_CPU_MASK "cpumask" ++#define FM_FMRI_CPU_VID "cpuvid" ++#define FM_FMRI_CPU_CPUFRU "cpufru" ++#define FM_FMRI_CPU_CACHE_INDEX "cacheindex" ++#define FM_FMRI_CPU_CACHE_WAY "cacheway" ++#define FM_FMRI_CPU_CACHE_BIT "cachebit" ++#define FM_FMRI_CPU_CACHE_TYPE "cachetype" ++ ++#define FM_FMRI_CPU_CACHE_TYPE_L2 0 ++#define FM_FMRI_CPU_CACHE_TYPE_L3 1 ++ ++/* legacy-hc scheme member names */ ++#define FM_FMRI_LEGACY_HC "component" ++#define FM_FMRI_LEGACY_HC_PREFIX FM_FMRI_SCHEME_HC":///" \ ++ FM_FMRI_LEGACY_HC"=" ++ ++/* mem scheme member names */ ++#define FM_FMRI_MEM_UNUM "unum" ++#define FM_FMRI_MEM_SERIAL_ID "serial" ++#define FM_FMRI_MEM_PHYSADDR "physaddr" ++#define FM_FMRI_MEM_MEMCONFIG "memconfig" ++#define FM_FMRI_MEM_OFFSET "offset" ++ ++/* mod scheme member names */ ++#define FM_FMRI_MOD_PKG "mod-pkg" ++#define FM_FMRI_MOD_NAME "mod-name" ++#define FM_FMRI_MOD_ID "mod-id" ++#define FM_FMRI_MOD_DESC "mod-desc" ++ ++/* zfs scheme member names */ ++#define FM_FMRI_ZFS_POOL "pool" ++#define FM_FMRI_ZFS_VDEV "vdev" ++ ++/* sw scheme member names - extra indentation for members of an nvlist */ ++#define FM_FMRI_SW_OBJ "object" ++#define FM_FMRI_SW_OBJ_PATH "path" ++#define FM_FMRI_SW_OBJ_ROOT "root" ++#define FM_FMRI_SW_OBJ_PKG "pkg" ++#define FM_FMRI_SW_SITE "site" ++#define FM_FMRI_SW_SITE_TOKEN "token" ++#define FM_FMRI_SW_SITE_MODULE "module" ++#define FM_FMRI_SW_SITE_FILE "file" ++#define FM_FMRI_SW_SITE_LINE "line" ++#define FM_FMRI_SW_SITE_FUNC "func" ++#define FM_FMRI_SW_CTXT "context" ++#define FM_FMRI_SW_CTXT_ORIGIN "origin" ++#define FM_FMRI_SW_CTXT_EXECNAME "execname" ++#define FM_FMRI_SW_CTXT_PID "pid" ++#define FM_FMRI_SW_CTXT_ZONE "zone" ++#define FM_FMRI_SW_CTXT_CTID "ctid" ++#define FM_FMRI_SW_CTXT_STACK "stack" ++#define FM_NVA_FREE 0 /* free allocator on nvlist_destroy */ ++#define FM_NVA_RETAIN 1 /* keep allocator on nvlist_destroy */ ++ ++extern nv_alloc_t *fm_nva_xcreate(char *, size_t); ++extern void fm_nva_xdestroy(nv_alloc_t *); ++extern nvlist_t *fm_nvlist_create(nv_alloc_t *); ++extern void fm_nvlist_destroy(nvlist_t *, int); ++extern void fm_ereport_set(nvlist_t *, int, const char *, uint64_t, ++ const nvlist_t *, ...); ++extern void fm_payload_set(nvlist_t *, ...); ++extern int i_fm_payload_set(nvlist_t *, const char *, va_list); ++extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *, ++ int, ...); ++extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *, ++ const char *, const char *); 
++extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *); ++extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t, ++ uint8_t *, const char *); ++extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *, ++ const char *, uint64_t); ++extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t); ++extern void fm_fmri_hc_create(nvlist_t *, int, const nvlist_t *, nvlist_t *, ++ nvlist_t *, int, ...); ++ ++extern uint64_t fm_ena_increment(uint64_t); ++extern uint64_t fm_ena_generate(uint64_t, uchar_t); ++extern uint64_t fm_ena_generate_cpu(uint64_t, processorid_t, uchar_t); ++extern uint64_t fm_ena_generation_get(uint64_t); ++extern uchar_t fm_ena_format_get(uint64_t); ++extern uint64_t fm_ena_id_get(uint64_t); ++extern uint64_t fm_ena_time_get(uint64_t); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_FM_PROTOCOL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/fm/util.h linux-3.2.33-go/include/zfs/sys/fm/util.h +--- linux-3.2.33-go.orig/include/zfs/sys/fm/util.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/fm/util.h 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,115 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_FM_UTIL_H ++#define _SYS_FM_UTIL_H ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#include ++ ++/* ++ * Shared user/kernel definitions for class length, error channel name, ++ * and kernel event publisher string. ++ */ ++#define FM_MAX_CLASS 100 ++#define FM_ERROR_CHAN "com.sun:fm:error" ++#define FM_PUB "fm" ++ ++/* ++ * ereport dump device transport support ++ * ++ * Ereports are written out to the dump device at a proscribed offset from the ++ * end, similar to in-transit log messages. The ereports are represented as a ++ * erpt_dump_t header followed by ed_size bytes of packed native nvlist data. ++ * ++ * NOTE: All of these constants and the header must be defined so they have the ++ * same representation for *both* 32-bit and 64-bit producers and consumers. 
++ */ ++#define ERPT_MAGIC 0xf00d4eddU ++#define ERPT_MAX_ERRS 16 ++#define ERPT_DATA_SZ (6 * 1024) ++#define ERPT_EVCH_MAX 256 ++#define ERPT_HIWAT 64 ++ ++typedef struct erpt_dump { ++ uint32_t ed_magic; /* ERPT_MAGIC or zero to indicate end */ ++ uint32_t ed_chksum; /* checksum32() of packed nvlist data */ ++ uint32_t ed_size; /* ereport (nvl) fixed buf size */ ++ uint32_t ed_pad; /* reserved for future use */ ++ hrtime_t ed_hrt_nsec; /* hrtime of this ereport */ ++ hrtime_t ed_hrt_base; /* hrtime sample corresponding to ed_tod_base */ ++ struct { ++ uint64_t sec; /* seconds since gettimeofday() Epoch */ ++ uint64_t nsec; /* nanoseconds past ed_tod_base.sec */ ++ } ed_tod_base; ++} erpt_dump_t; ++ ++#ifdef _KERNEL ++ ++#define ZEVENT_SHUTDOWN 0x1 ++ ++typedef void zevent_cb_t(nvlist_t *, nvlist_t *); ++ ++typedef struct zevent_s { ++ nvlist_t *ev_nvl; /* protected by the zevent_lock */ ++ nvlist_t *ev_detector; /* " */ ++ list_t ev_ze_list; /* " */ ++ list_node_t ev_node; /* " */ ++ zevent_cb_t *ev_cb; /* " */ ++} zevent_t; ++ ++typedef struct zfs_zevent { ++ zevent_t *ze_zevent; /* protected by the zevent_lock */ ++ list_node_t ze_node; /* " */ ++ uint64_t ze_dropped; /* " */ ++} zfs_zevent_t; ++ ++extern void fm_init(void); ++extern void fm_fini(void); ++extern void fm_nvprint(nvlist_t *); ++extern void zfs_zevent_post(nvlist_t *, nvlist_t *, zevent_cb_t *); ++extern void zfs_zevent_drain_all(int *); ++extern int zfs_zevent_fd_hold(int, minor_t *, zfs_zevent_t **); ++extern void zfs_zevent_fd_rele(int); ++extern int zfs_zevent_next(zfs_zevent_t *, nvlist_t **, uint64_t *, uint64_t *); ++extern int zfs_zevent_wait(zfs_zevent_t *); ++extern void zfs_zevent_init(zfs_zevent_t **); ++extern void zfs_zevent_destroy(zfs_zevent_t *); ++ ++#else ++ ++static inline void fm_init(void) { } ++static inline void fm_fini(void) { } ++ ++#endif /* _KERNEL */ ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_FM_UTIL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/fs/Makefile linux-3.2.33-go/include/zfs/sys/fs/Makefile +--- linux-3.2.33-go.orig/include/zfs/sys/fs/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/fs/Makefile 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,659 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. ++# include/sys/fs/Makefile. Generated from Makefile.in by configure. ++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. 
++ ++ ++ ++ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/zfs ++pkgincludedir = $(includedir)/zfs ++pkglibdir = $(libdir)/zfs ++pkglibexecdir = $(libexecdir)/zfs ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = x86_64-unknown-linux-gnu ++host_triplet = x86_64-unknown-linux-gnu ++target_triplet = x86_64-unknown-linux-gnu ++subdir = include/sys/fs ++DIST_COMMON = $(am__kernel_HEADERS_DIST) $(am__libzfs_HEADERS_DIST) \ ++ $(srcdir)/Makefile.am $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = \ ++ $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \ ++ $(top_srcdir)/config/kernel-automount.m4 \ ++ $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \ ++ $(top_srcdir)/config/kernel-bdev-logical-size.m4 \ ++ $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \ ++ $(top_srcdir)/config/kernel-bdi.m4 \ ++ $(top_srcdir)/config/kernel-bio-empty-barrier.m4 \ ++ $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \ ++ $(top_srcdir)/config/kernel-bio-failfast.m4 \ ++ $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \ ++ $(top_srcdir)/config/kernel-blk-end-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-fetch-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-discard.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-flush.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \ ++ $(top_srcdir)/config/kernel-blk-requeue-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-pos.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get.m4 \ ++ $(top_srcdir)/config/kernel-check-disk-size-change.m4 \ ++ $(top_srcdir)/config/kernel-clear-inode.m4 \ ++ $(top_srcdir)/config/kernel-commit-metadata.m4 \ ++ $(top_srcdir)/config/kernel-create-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-d-make-root.m4 \ ++ $(top_srcdir)/config/kernel-d-obtain-alias.m4 \ ++ $(top_srcdir)/config/kernel-discard-granularity.m4 \ ++ $(top_srcdir)/config/kernel-elevator-change.m4 \ ++ $(top_srcdir)/config/kernel-encode-fh-inode.m4 \ ++ $(top_srcdir)/config/kernel-evict-inode.m4 \ ++ $(top_srcdir)/config/kernel-fallocate.m4 \ ++ $(top_srcdir)/config/kernel-fmode-t.m4 \ ++ $(top_srcdir)/config/kernel-fsync.m4 \ ++ $(top_srcdir)/config/kernel-get-disk-ro.m4 \ ++ $(top_srcdir)/config/kernel-get-gendisk.m4 \ ++ $(top_srcdir)/config/kernel-insert-inode-locked.m4 \ ++ $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \ ++ $(top_srcdir)/config/kernel-kobj-name-len.m4 \ ++ $(top_srcdir)/config/kernel-lookup-nameidata.m4 \ 
++ $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \ ++ $(top_srcdir)/config/kernel-mount-nodev.m4 \ ++ $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \ ++ $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \ ++ $(top_srcdir)/config/kernel-rq-is_sync.m4 \ ++ $(top_srcdir)/config/kernel-security-inode-init.m4 \ ++ $(top_srcdir)/config/kernel-set-nlink.m4 \ ++ $(top_srcdir)/config/kernel-sget-args.m4 \ ++ $(top_srcdir)/config/kernel-show-options.m4 \ ++ $(top_srcdir)/config/kernel-shrink.m4 \ ++ $(top_srcdir)/config/kernel-truncate-range.m4 \ ++ $(top_srcdir)/config/kernel-truncate-setsize.m4 \ ++ $(top_srcdir)/config/kernel-xattr-handler.m4 \ ++ $(top_srcdir)/config/kernel.m4 \ ++ $(top_srcdir)/config/user-arch.m4 \ ++ $(top_srcdir)/config/user-frame-larger-than.m4 \ ++ $(top_srcdir)/config/user-ioctl.m4 \ ++ $(top_srcdir)/config/user-libblkid.m4 \ ++ $(top_srcdir)/config/user-libuuid.m4 \ ++ $(top_srcdir)/config/user-nptl_guard_within_stack.m4 \ ++ $(top_srcdir)/config/user-selinux.m4 \ ++ $(top_srcdir)/config/user-udev.m4 \ ++ $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \ ++ $(top_srcdir)/config/zfs-build.m4 \ ++ $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/zfs_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_$(V)) ++am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_$(V)) ++am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++am__kernel_HEADERS_DIST = $(top_srcdir)/include/sys/fs/zfs.h ++am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; ++am__vpath_adj = case $$p in \ ++ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ ++ *) f=$$p;; \ ++ esac; ++am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; ++am__install_max = 40 ++am__nobase_strip_setup = \ ++ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` ++am__nobase_strip = \ ++ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" ++am__nobase_list = $(am__nobase_strip_setup); \ ++ for p in $$list; do echo "$$p $$p"; done | \ ++ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ ++ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ ++ if (++n[$$2] == $(am__install_max)) \ ++ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ ++ END { for (dir in files) print dir, files[dir] }' ++am__base_list = \ ++ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ ++ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' ++am__uninstall_files_from_dir = { \ ++ test -z "$$files" \ ++ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! 
-r "$$dir"; } \ ++ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ ++ $(am__cd) "$$dir" && rm -f $$files; }; \ ++ } ++am__installdirs = "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)" ++am__libzfs_HEADERS_DIST = $(top_srcdir)/include/sys/fs/zfs.h ++HEADERS = $(kernel_HEADERS) $(libzfs_HEADERS) ++ETAGS = etags ++CTAGS = ctags ++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++ACLOCAL = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run aclocal-1.11 ++ALIEN = alien ++ALIEN_VERSION = ++AMTAR = $${TAR-tar} ++AM_DEFAULT_VERBOSITY = 1 ++AR = ar ++AUTOCONF = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run autoconf ++AUTOHEADER = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run autoheader ++AUTOMAKE = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run automake-1.11 ++AWK = gawk ++CC = gcc ++CCAS = gcc ++CCASDEPMODE = depmode=gcc3 ++CCASFLAGS = -g -O2 ++CCDEPMODE = depmode=gcc3 ++CFLAGS = -g -O2 ++CPP = gcc -E ++CPPFLAGS = ++CYGPATH_W = echo ++DEBUG_CFLAGS = -DNDEBUG ++DEBUG_DMU_TX = _without_debug_dmu_tx ++DEBUG_STACKFLAGS = ++DEBUG_ZFS = _without_debug ++DEFAULT_INIT_DIR = ${prefix}/etc/init.d ++DEFAULT_INIT_SCRIPT = gentoo ++DEFAULT_PACKAGE = tgz ++DEFS = -DHAVE_CONFIG_H ++DEPDIR = .deps ++DLLTOOL = false ++DPKG = dpkg ++DPKGBUILD = dpkg-buildpackage ++DPKGBUILD_VERSION = ++DPKG_VERSION = ++DSYMUTIL = ++DUMPBIN = ++ECHO_C = ++ECHO_N = -n ++ECHO_T = ++EGREP = /bin/grep -E ++EXEEXT = ++FGREP = /bin/grep -F ++FRAME_LARGER_THAN = -Wframe-larger-than=1024 ++GREP = /bin/grep ++HAVE_ALIEN = no ++HAVE_DPKG = no ++HAVE_DPKGBUILD = no ++HAVE_MAKEPKG = ++HAVE_PACMAN = ++HAVE_RPM = yes ++HAVE_RPMBUILD = yes ++INSTALL = /usr/bin/install -c ++INSTALL_DATA = ${INSTALL} -m 644 ++INSTALL_PROGRAM = ${INSTALL} ++INSTALL_SCRIPT = ${INSTALL} ++INSTALL_STRIP_PROGRAM = $(install_sh) -c -s ++KERNELCPPFLAGS = -Wno-unused-but-set-variable -DHAVE_SPL -D_KERNEL -DTEXT_DOMAIN=\"zfs-linux-kernel\" -DNDEBUG ++KERNELMAKE_PARAMS = O=/usr/src/linux-3.6.0-sabayon ++LD = /usr/x86_64-pc-linux-gnu/bin/ld -m elf_x86_64 ++LDFLAGS = ++LIBBLKID = ++LIBOBJS = ++LIBS = -luuid -luuid -lz -lz -lz ++LIBSELINUX = ++LIBTOOL = $(SHELL) $(top_builddir)/libtool ++LIBUUID = -luuid ++LINUX = /usr/src/linux-3.2.33-go ++LINUX_OBJ = /usr/src/linux-3.6.0-sabayon ++LINUX_SYMBOLS = NONE ++LINUX_VERSION = 3.6.0-sabayon ++LIPO = ++LN_S = ln -s ++LTLIBOBJS = ++MAINT = # ++MAKEINFO = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run makeinfo ++MAKEPKG = ++MAKEPKG_VERSION = ++MANIFEST_TOOL = : ++MKDIR_P = /bin/mkdir -p ++NM = /usr/bin/nm -B ++NMEDIT = ++NO_UNUSED_BUT_SET_VARIABLE = -Wno-unused-but-set-variable ++OBJDUMP = objdump ++OBJEXT = o ++OTOOL = ++OTOOL64 = ++PACKAGE = zfs ++PACKAGE_BUGREPORT = ++PACKAGE_NAME = ++PACKAGE_STRING = ++PACKAGE_TARNAME = ++PACKAGE_URL = ++PACKAGE_VERSION = ++PACMAN = ++PACMAN_VERSION = ++PATH_SEPARATOR = : ++RANLIB = ranlib ++RPM = rpm ++RPMBUILD = rpmbuild ++RPMBUILD_VERSION = 4.10.0 ++RPM_VERSION = 4.10.0 ++SED = /bin/sed ++SET_MAKE = ++SHELL = /bin/sh ++SPL = /usr/src/linux-3.2.33-go ++SPL_OBJ = /usr/src/linux-3.2.33-go ++SPL_SYMBOLS = NONE ++SPL_VERSION = 0.6.0-rc12 ++STRIP = strip ++TARGET_ASM_DIR = asm-x86_64 ++VENDOR = gentoo ++VERSION = 0.6.0 ++ZFS_CONFIG = all ++ZFS_META_ALIAS = zfs-0.6.0-rc12 ++ZFS_META_AUTHOR = Sun Microsystems/Oracle, Lawrence Livermore National Laboratory ++ZFS_META_DATA = ++ZFS_META_LICENSE = CDDL ++ZFS_META_LT_AGE = ++ZFS_META_LT_CURRENT = ++ZFS_META_LT_REVISION = ++ZFS_META_NAME = zfs ++ZFS_META_RELEASE = rc12 ++ZFS_META_VERSION = 0.6.0 ++ZLIB = -lz 
++abs_builddir = /root/zfs-0.6.0-rc12/include/sys/fs ++abs_srcdir = /root/zfs-0.6.0-rc12/include/sys/fs ++abs_top_builddir = /root/zfs-0.6.0-rc12 ++abs_top_srcdir = /root/zfs-0.6.0-rc12 ++ac_ct_AR = ar ++ac_ct_CC = gcc ++ac_ct_DUMPBIN = ++am__include = include ++am__leading_dot = . ++am__quote = ++am__tar = $${TAR-tar} chof - "$$tardir" ++am__untar = $${TAR-tar} xf - ++bindir = ${exec_prefix}/bin ++build = x86_64-unknown-linux-gnu ++build_alias = ++build_cpu = x86_64 ++build_os = linux-gnu ++build_vendor = unknown ++builddir = . ++datadir = ${datarootdir} ++datarootdir = ${prefix}/share ++docdir = ${datarootdir}/doc/${PACKAGE} ++dvidir = ${docdir} ++exec_prefix = ${prefix} ++host = x86_64-unknown-linux-gnu ++host_alias = ++host_cpu = x86_64 ++host_os = linux-gnu ++host_vendor = unknown ++htmldir = ${docdir} ++includedir = ${prefix}/include ++infodir = ${datarootdir}/info ++install_sh = ${SHELL} /root/zfs-0.6.0-rc12/config/install-sh ++libdir = ${exec_prefix}/lib ++libexecdir = ${exec_prefix}/libexec ++localedir = ${datarootdir}/locale ++localstatedir = ${prefix}/var ++mandir = ${datarootdir}/man ++mkdir_p = /bin/mkdir -p ++oldincludedir = /usr/include ++pdfdir = ${docdir} ++prefix = /usr/local ++program_transform_name = s,x,x, ++psdir = ${docdir} ++sbindir = ${exec_prefix}/sbin ++sharedstatedir = ${prefix}/com ++srcdir = . ++sysconfdir = ${prefix}/etc ++target = x86_64-unknown-linux-gnu ++target_alias = ++target_cpu = x86_64 ++target_os = linux-gnu ++target_vendor = unknown ++top_build_prefix = ../../../ ++top_builddir = ../../.. ++top_srcdir = ../../.. ++udevdir = ${exec_prefix}/lib/udev ++udevruledir = ${udevdir}/rules.d ++COMMON_H = \ ++ $(top_srcdir)/include/sys/fs/zfs.h ++ ++KERNEL_H = ++USER_H = ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++libzfsdir = $(includedir)/libzfs/sys/fs ++libzfs_HEADERS = $(COMMON_H) $(USER_H) ++#kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/sys/fs ++#kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++all: all-am ++ ++.SUFFIXES: ++$(srcdir)/Makefile.in: # $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/sys/fs/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/sys/fs/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' 
in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: # $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): # $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++install-kernelHEADERS: $(kernel_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(kerneldir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(kerneldir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(kerneldir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(kerneldir)" || exit $$?; \ ++ done ++ ++uninstall-kernelHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(kerneldir)'; $(am__uninstall_files_from_dir) ++install-libzfsHEADERS: $(libzfs_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(libzfsdir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(libzfsdir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libzfsdir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(libzfsdir)" || exit $$?; \ ++ done ++ ++uninstall-libzfsHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(libzfsdir)'; $(am__uninstall_files_from_dir) ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: CTAGS ++CTAGS: $(HEADERS) $(SOURCES) 
$(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-am ++all-am: Makefile $(HEADERS) ++installdirs: ++ for dir in "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)"; do \ ++ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ ++ done ++install: install-am ++install-exec: install-exec-am ++install-data: install-data-am ++uninstall: uninstall-am ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-am ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." 
++clean: clean-am ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-am ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-am ++ ++dvi-am: ++ ++html: html-am ++ ++html-am: ++ ++info: info-am ++ ++info-am: ++ ++install-data-am: install-kernelHEADERS install-libzfsHEADERS ++ ++install-dvi: install-dvi-am ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-am ++ ++install-html-am: ++ ++install-info: install-info-am ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-am ++ ++install-pdf-am: ++ ++install-ps: install-ps-am ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-am ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-am ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-am ++ ++pdf-am: ++ ++ps: ps-am ++ ++ps-am: ++ ++uninstall-am: uninstall-kernelHEADERS uninstall-libzfsHEADERS ++ ++.MAKE: install-am install-strip ++ ++.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ ++ clean-libtool ctags distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-dvi install-dvi-am install-exec \ ++ install-exec-am install-html install-html-am install-info \ ++ install-info-am install-kernelHEADERS install-libzfsHEADERS \ ++ install-man install-pdf install-pdf-am install-ps \ ++ install-ps-am install-strip installcheck installcheck-am \ ++ installdirs maintainer-clean maintainer-clean-generic \ ++ mostlyclean mostlyclean-generic mostlyclean-libtool pdf pdf-am \ ++ ps ps-am tags uninstall uninstall-am uninstall-kernelHEADERS \ ++ uninstall-libzfsHEADERS ++ ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. ++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/fs/Makefile.am linux-3.2.33-go/include/zfs/sys/fs/Makefile.am +--- linux-3.2.33-go.orig/include/zfs/sys/fs/Makefile.am 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/fs/Makefile.am 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,18 @@ ++COMMON_H = \ ++ $(top_srcdir)/include/sys/fs/zfs.h ++ ++KERNEL_H = ++ ++USER_H = ++ ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++ ++if CONFIG_USER ++libzfsdir = $(includedir)/libzfs/sys/fs ++libzfs_HEADERS = $(COMMON_H) $(USER_H) ++endif ++ ++if CONFIG_KERNEL ++kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/sys/fs ++kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++endif +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/fs/Makefile.in linux-3.2.33-go/include/zfs/sys/fs/Makefile.in +--- linux-3.2.33-go.orig/include/zfs/sys/fs/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/fs/Makefile.in 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,659 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. ++# @configure_input@ ++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. 
++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. ++ ++@SET_MAKE@ ++ ++VPATH = @srcdir@ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/@PACKAGE@ ++pkgincludedir = $(includedir)/@PACKAGE@ ++pkglibdir = $(libdir)/@PACKAGE@ ++pkglibexecdir = $(libexecdir)/@PACKAGE@ ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = @build@ ++host_triplet = @host@ ++target_triplet = @target@ ++subdir = include/sys/fs ++DIST_COMMON = $(am__kernel_HEADERS_DIST) $(am__libzfs_HEADERS_DIST) \ ++ $(srcdir)/Makefile.am $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = \ ++ $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \ ++ $(top_srcdir)/config/kernel-automount.m4 \ ++ $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \ ++ $(top_srcdir)/config/kernel-bdev-logical-size.m4 \ ++ $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \ ++ $(top_srcdir)/config/kernel-bdi.m4 \ ++ $(top_srcdir)/config/kernel-bio-empty-barrier.m4 \ ++ $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \ ++ $(top_srcdir)/config/kernel-bio-failfast.m4 \ ++ $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \ ++ $(top_srcdir)/config/kernel-blk-end-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-fetch-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-discard.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-flush.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \ ++ $(top_srcdir)/config/kernel-blk-requeue-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-pos.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get.m4 \ ++ $(top_srcdir)/config/kernel-check-disk-size-change.m4 \ ++ $(top_srcdir)/config/kernel-clear-inode.m4 \ ++ $(top_srcdir)/config/kernel-commit-metadata.m4 \ ++ $(top_srcdir)/config/kernel-create-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-d-make-root.m4 \ ++ $(top_srcdir)/config/kernel-d-obtain-alias.m4 \ ++ $(top_srcdir)/config/kernel-discard-granularity.m4 \ ++ $(top_srcdir)/config/kernel-elevator-change.m4 \ ++ $(top_srcdir)/config/kernel-encode-fh-inode.m4 \ ++ $(top_srcdir)/config/kernel-evict-inode.m4 \ ++ $(top_srcdir)/config/kernel-fallocate.m4 \ ++ $(top_srcdir)/config/kernel-fmode-t.m4 \ ++ $(top_srcdir)/config/kernel-fsync.m4 \ ++ $(top_srcdir)/config/kernel-get-disk-ro.m4 \ ++ 
$(top_srcdir)/config/kernel-get-gendisk.m4 \ ++ $(top_srcdir)/config/kernel-insert-inode-locked.m4 \ ++ $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \ ++ $(top_srcdir)/config/kernel-kobj-name-len.m4 \ ++ $(top_srcdir)/config/kernel-lookup-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \ ++ $(top_srcdir)/config/kernel-mount-nodev.m4 \ ++ $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \ ++ $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \ ++ $(top_srcdir)/config/kernel-rq-is_sync.m4 \ ++ $(top_srcdir)/config/kernel-security-inode-init.m4 \ ++ $(top_srcdir)/config/kernel-set-nlink.m4 \ ++ $(top_srcdir)/config/kernel-sget-args.m4 \ ++ $(top_srcdir)/config/kernel-show-options.m4 \ ++ $(top_srcdir)/config/kernel-shrink.m4 \ ++ $(top_srcdir)/config/kernel-truncate-range.m4 \ ++ $(top_srcdir)/config/kernel-truncate-setsize.m4 \ ++ $(top_srcdir)/config/kernel-xattr-handler.m4 \ ++ $(top_srcdir)/config/kernel.m4 \ ++ $(top_srcdir)/config/user-arch.m4 \ ++ $(top_srcdir)/config/user-frame-larger-than.m4 \ ++ $(top_srcdir)/config/user-ioctl.m4 \ ++ $(top_srcdir)/config/user-libblkid.m4 \ ++ $(top_srcdir)/config/user-libuuid.m4 \ ++ $(top_srcdir)/config/user-nptl_guard_within_stack.m4 \ ++ $(top_srcdir)/config/user-selinux.m4 \ ++ $(top_srcdir)/config/user-udev.m4 \ ++ $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \ ++ $(top_srcdir)/config/zfs-build.m4 \ ++ $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/zfs_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_@AM_V@) ++am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_@AM_V@) ++am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++am__kernel_HEADERS_DIST = $(top_srcdir)/include/sys/fs/zfs.h ++am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; ++am__vpath_adj = case $$p in \ ++ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ ++ *) f=$$p;; \ ++ esac; ++am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; ++am__install_max = 40 ++am__nobase_strip_setup = \ ++ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` ++am__nobase_strip = \ ++ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" ++am__nobase_list = $(am__nobase_strip_setup); \ ++ for p in $$list; do echo "$$p $$p"; done | \ ++ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ ++ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ ++ if (++n[$$2] == $(am__install_max)) \ ++ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ ++ END { for (dir in files) print dir, files[dir] }' ++am__base_list = \ ++ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ ++ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' ++am__uninstall_files_from_dir = { \ ++ test -z "$$files" \ ++ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! 
-r "$$dir"; } \ ++ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ ++ $(am__cd) "$$dir" && rm -f $$files; }; \ ++ } ++am__installdirs = "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)" ++am__libzfs_HEADERS_DIST = $(top_srcdir)/include/sys/fs/zfs.h ++HEADERS = $(kernel_HEADERS) $(libzfs_HEADERS) ++ETAGS = etags ++CTAGS = ctags ++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++ACLOCAL = @ACLOCAL@ ++ALIEN = @ALIEN@ ++ALIEN_VERSION = @ALIEN_VERSION@ ++AMTAR = @AMTAR@ ++AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ++AR = @AR@ ++AUTOCONF = @AUTOCONF@ ++AUTOHEADER = @AUTOHEADER@ ++AUTOMAKE = @AUTOMAKE@ ++AWK = @AWK@ ++CC = @CC@ ++CCAS = @CCAS@ ++CCASDEPMODE = @CCASDEPMODE@ ++CCASFLAGS = @CCASFLAGS@ ++CCDEPMODE = @CCDEPMODE@ ++CFLAGS = @CFLAGS@ ++CPP = @CPP@ ++CPPFLAGS = @CPPFLAGS@ ++CYGPATH_W = @CYGPATH_W@ ++DEBUG_CFLAGS = @DEBUG_CFLAGS@ ++DEBUG_DMU_TX = @DEBUG_DMU_TX@ ++DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@ ++DEBUG_ZFS = @DEBUG_ZFS@ ++DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@ ++DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@ ++DEFAULT_PACKAGE = @DEFAULT_PACKAGE@ ++DEFS = @DEFS@ ++DEPDIR = @DEPDIR@ ++DLLTOOL = @DLLTOOL@ ++DPKG = @DPKG@ ++DPKGBUILD = @DPKGBUILD@ ++DPKGBUILD_VERSION = @DPKGBUILD_VERSION@ ++DPKG_VERSION = @DPKG_VERSION@ ++DSYMUTIL = @DSYMUTIL@ ++DUMPBIN = @DUMPBIN@ ++ECHO_C = @ECHO_C@ ++ECHO_N = @ECHO_N@ ++ECHO_T = @ECHO_T@ ++EGREP = @EGREP@ ++EXEEXT = @EXEEXT@ ++FGREP = @FGREP@ ++FRAME_LARGER_THAN = @FRAME_LARGER_THAN@ ++GREP = @GREP@ ++HAVE_ALIEN = @HAVE_ALIEN@ ++HAVE_DPKG = @HAVE_DPKG@ ++HAVE_DPKGBUILD = @HAVE_DPKGBUILD@ ++HAVE_MAKEPKG = @HAVE_MAKEPKG@ ++HAVE_PACMAN = @HAVE_PACMAN@ ++HAVE_RPM = @HAVE_RPM@ ++HAVE_RPMBUILD = @HAVE_RPMBUILD@ ++INSTALL = @INSTALL@ ++INSTALL_DATA = @INSTALL_DATA@ ++INSTALL_PROGRAM = @INSTALL_PROGRAM@ ++INSTALL_SCRIPT = @INSTALL_SCRIPT@ ++INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ ++KERNELCPPFLAGS = @KERNELCPPFLAGS@ ++KERNELMAKE_PARAMS = @KERNELMAKE_PARAMS@ ++LD = @LD@ ++LDFLAGS = @LDFLAGS@ ++LIBBLKID = @LIBBLKID@ ++LIBOBJS = @LIBOBJS@ ++LIBS = @LIBS@ ++LIBSELINUX = @LIBSELINUX@ ++LIBTOOL = @LIBTOOL@ ++LIBUUID = @LIBUUID@ ++LINUX = @LINUX@ ++LINUX_OBJ = @LINUX_OBJ@ ++LINUX_SYMBOLS = @LINUX_SYMBOLS@ ++LINUX_VERSION = @LINUX_VERSION@ ++LIPO = @LIPO@ ++LN_S = @LN_S@ ++LTLIBOBJS = @LTLIBOBJS@ ++MAINT = @MAINT@ ++MAKEINFO = @MAKEINFO@ ++MAKEPKG = @MAKEPKG@ ++MAKEPKG_VERSION = @MAKEPKG_VERSION@ ++MANIFEST_TOOL = @MANIFEST_TOOL@ ++MKDIR_P = @MKDIR_P@ ++NM = @NM@ ++NMEDIT = @NMEDIT@ ++NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@ ++OBJDUMP = @OBJDUMP@ ++OBJEXT = @OBJEXT@ ++OTOOL = @OTOOL@ ++OTOOL64 = @OTOOL64@ ++PACKAGE = @PACKAGE@ ++PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ ++PACKAGE_NAME = @PACKAGE_NAME@ ++PACKAGE_STRING = @PACKAGE_STRING@ ++PACKAGE_TARNAME = @PACKAGE_TARNAME@ ++PACKAGE_URL = @PACKAGE_URL@ ++PACKAGE_VERSION = @PACKAGE_VERSION@ ++PACMAN = @PACMAN@ ++PACMAN_VERSION = @PACMAN_VERSION@ ++PATH_SEPARATOR = @PATH_SEPARATOR@ ++RANLIB = @RANLIB@ ++RPM = @RPM@ ++RPMBUILD = @RPMBUILD@ ++RPMBUILD_VERSION = @RPMBUILD_VERSION@ ++RPM_VERSION = @RPM_VERSION@ ++SED = @SED@ ++SET_MAKE = @SET_MAKE@ ++SHELL = @SHELL@ ++SPL = @SPL@ ++SPL_OBJ = @SPL_OBJ@ ++SPL_SYMBOLS = @SPL_SYMBOLS@ ++SPL_VERSION = @SPL_VERSION@ ++STRIP = @STRIP@ ++TARGET_ASM_DIR = @TARGET_ASM_DIR@ ++VENDOR = @VENDOR@ ++VERSION = @VERSION@ ++ZFS_CONFIG = @ZFS_CONFIG@ ++ZFS_META_ALIAS = @ZFS_META_ALIAS@ ++ZFS_META_AUTHOR = @ZFS_META_AUTHOR@ ++ZFS_META_DATA = @ZFS_META_DATA@ ++ZFS_META_LICENSE = @ZFS_META_LICENSE@ ++ZFS_META_LT_AGE = @ZFS_META_LT_AGE@ 
++ZFS_META_LT_CURRENT = @ZFS_META_LT_CURRENT@ ++ZFS_META_LT_REVISION = @ZFS_META_LT_REVISION@ ++ZFS_META_NAME = @ZFS_META_NAME@ ++ZFS_META_RELEASE = @ZFS_META_RELEASE@ ++ZFS_META_VERSION = @ZFS_META_VERSION@ ++ZLIB = @ZLIB@ ++abs_builddir = @abs_builddir@ ++abs_srcdir = @abs_srcdir@ ++abs_top_builddir = @abs_top_builddir@ ++abs_top_srcdir = @abs_top_srcdir@ ++ac_ct_AR = @ac_ct_AR@ ++ac_ct_CC = @ac_ct_CC@ ++ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ ++am__include = @am__include@ ++am__leading_dot = @am__leading_dot@ ++am__quote = @am__quote@ ++am__tar = @am__tar@ ++am__untar = @am__untar@ ++bindir = @bindir@ ++build = @build@ ++build_alias = @build_alias@ ++build_cpu = @build_cpu@ ++build_os = @build_os@ ++build_vendor = @build_vendor@ ++builddir = @builddir@ ++datadir = @datadir@ ++datarootdir = @datarootdir@ ++docdir = @docdir@ ++dvidir = @dvidir@ ++exec_prefix = @exec_prefix@ ++host = @host@ ++host_alias = @host_alias@ ++host_cpu = @host_cpu@ ++host_os = @host_os@ ++host_vendor = @host_vendor@ ++htmldir = @htmldir@ ++includedir = @includedir@ ++infodir = @infodir@ ++install_sh = @install_sh@ ++libdir = @libdir@ ++libexecdir = @libexecdir@ ++localedir = @localedir@ ++localstatedir = @localstatedir@ ++mandir = @mandir@ ++mkdir_p = @mkdir_p@ ++oldincludedir = @oldincludedir@ ++pdfdir = @pdfdir@ ++prefix = @prefix@ ++program_transform_name = @program_transform_name@ ++psdir = @psdir@ ++sbindir = @sbindir@ ++sharedstatedir = @sharedstatedir@ ++srcdir = @srcdir@ ++sysconfdir = @sysconfdir@ ++target = @target@ ++target_alias = @target_alias@ ++target_cpu = @target_cpu@ ++target_os = @target_os@ ++target_vendor = @target_vendor@ ++top_build_prefix = @top_build_prefix@ ++top_builddir = @top_builddir@ ++top_srcdir = @top_srcdir@ ++udevdir = @udevdir@ ++udevruledir = @udevruledir@ ++COMMON_H = \ ++ $(top_srcdir)/include/sys/fs/zfs.h ++ ++KERNEL_H = ++USER_H = ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++@CONFIG_USER_TRUE@libzfsdir = $(includedir)/libzfs/sys/fs ++@CONFIG_USER_TRUE@libzfs_HEADERS = $(COMMON_H) $(USER_H) ++@CONFIG_KERNEL_TRUE@kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/sys/fs ++@CONFIG_KERNEL_TRUE@kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++all: all-am ++ ++.SUFFIXES: ++$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/sys/fs/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/sys/fs/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' 
in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++install-kernelHEADERS: $(kernel_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(kerneldir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(kerneldir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(kerneldir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(kerneldir)" || exit $$?; \ ++ done ++ ++uninstall-kernelHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(kerneldir)'; $(am__uninstall_files_from_dir) ++install-libzfsHEADERS: $(libzfs_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(libzfsdir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(libzfsdir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libzfsdir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(libzfsdir)" || exit $$?; \ ++ done ++ ++uninstall-libzfsHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(libzfsdir)'; $(am__uninstall_files_from_dir) ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: 
CTAGS ++CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-am ++all-am: Makefile $(HEADERS) ++installdirs: ++ for dir in "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)"; do \ ++ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ ++ done ++install: install-am ++install-exec: install-exec-am ++install-data: install-data-am ++uninstall: uninstall-am ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-am ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." 
++clean: clean-am ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-am ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-am ++ ++dvi-am: ++ ++html: html-am ++ ++html-am: ++ ++info: info-am ++ ++info-am: ++ ++install-data-am: install-kernelHEADERS install-libzfsHEADERS ++ ++install-dvi: install-dvi-am ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-am ++ ++install-html-am: ++ ++install-info: install-info-am ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-am ++ ++install-pdf-am: ++ ++install-ps: install-ps-am ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-am ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-am ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-am ++ ++pdf-am: ++ ++ps: ps-am ++ ++ps-am: ++ ++uninstall-am: uninstall-kernelHEADERS uninstall-libzfsHEADERS ++ ++.MAKE: install-am install-strip ++ ++.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ ++ clean-libtool ctags distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-dvi install-dvi-am install-exec \ ++ install-exec-am install-html install-html-am install-info \ ++ install-info-am install-kernelHEADERS install-libzfsHEADERS \ ++ install-man install-pdf install-pdf-am install-ps \ ++ install-ps-am install-strip installcheck installcheck-am \ ++ installdirs maintainer-clean maintainer-clean-generic \ ++ mostlyclean mostlyclean-generic mostlyclean-libtool pdf pdf-am \ ++ ps ps-am tags uninstall uninstall-am uninstall-kernelHEADERS \ ++ uninstall-libzfsHEADERS ++ ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. ++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/fs/zfs.h linux-3.2.33-go/include/zfs/sys/fs/zfs.h +--- linux-3.2.33-go.orig/include/zfs/sys/fs/zfs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/fs/zfs.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,948 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2012 by Delphix. All rights reserved. ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
++ */ ++ ++/* Portions Copyright 2010 Robert Milkowski */ ++ ++#ifndef _SYS_FS_ZFS_H ++#define _SYS_FS_ZFS_H ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * Types and constants shared between userland and the kernel. ++ */ ++ ++/* ++ * Each dataset can be one of the following types. These constants can be ++ * combined into masks that can be passed to various functions. ++ */ ++typedef enum { ++ ZFS_TYPE_FILESYSTEM = 0x1, ++ ZFS_TYPE_SNAPSHOT = 0x2, ++ ZFS_TYPE_VOLUME = 0x4, ++ ZFS_TYPE_POOL = 0x8 ++} zfs_type_t; ++ ++#define ZFS_TYPE_DATASET \ ++ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) ++ ++#define ZAP_MAXNAMELEN 256 ++#define ZAP_MAXVALUELEN (1024 * 8) ++#define ZAP_OLDMAXVALUELEN 1024 ++ ++/* ++ * Dataset properties are identified by these constants and must be added to ++ * the end of this list to ensure that external consumers are not affected ++ * by the change. If you make any changes to this list, be sure to update ++ * the property table in usr/src/common/zfs/zfs_prop.c. ++ */ ++typedef enum { ++ ZFS_PROP_TYPE, ++ ZFS_PROP_CREATION, ++ ZFS_PROP_USED, ++ ZFS_PROP_AVAILABLE, ++ ZFS_PROP_REFERENCED, ++ ZFS_PROP_COMPRESSRATIO, ++ ZFS_PROP_MOUNTED, ++ ZFS_PROP_ORIGIN, ++ ZFS_PROP_QUOTA, ++ ZFS_PROP_RESERVATION, ++ ZFS_PROP_VOLSIZE, ++ ZFS_PROP_VOLBLOCKSIZE, ++ ZFS_PROP_RECORDSIZE, ++ ZFS_PROP_MOUNTPOINT, ++ ZFS_PROP_SHARENFS, ++ ZFS_PROP_CHECKSUM, ++ ZFS_PROP_COMPRESSION, ++ ZFS_PROP_ATIME, ++ ZFS_PROP_DEVICES, ++ ZFS_PROP_EXEC, ++ ZFS_PROP_SETUID, ++ ZFS_PROP_READONLY, ++ ZFS_PROP_ZONED, ++ ZFS_PROP_SNAPDIR, ++ ZFS_PROP_PRIVATE, /* not exposed to user, temporary */ ++ ZFS_PROP_ACLINHERIT, ++ ZFS_PROP_CREATETXG, /* not exposed to the user */ ++ ZFS_PROP_NAME, /* not exposed to the user */ ++ ZFS_PROP_CANMOUNT, ++ ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ ++ ZFS_PROP_XATTR, ++ ZFS_PROP_NUMCLONES, /* not exposed to the user */ ++ ZFS_PROP_COPIES, ++ ZFS_PROP_VERSION, ++ ZFS_PROP_UTF8ONLY, ++ ZFS_PROP_NORMALIZE, ++ ZFS_PROP_CASE, ++ ZFS_PROP_VSCAN, ++ ZFS_PROP_NBMAND, ++ ZFS_PROP_SHARESMB, ++ ZFS_PROP_REFQUOTA, ++ ZFS_PROP_REFRESERVATION, ++ ZFS_PROP_GUID, ++ ZFS_PROP_PRIMARYCACHE, ++ ZFS_PROP_SECONDARYCACHE, ++ ZFS_PROP_USEDSNAP, ++ ZFS_PROP_USEDDS, ++ ZFS_PROP_USEDCHILD, ++ ZFS_PROP_USEDREFRESERV, ++ ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ++ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ ++ ZFS_PROP_DEFER_DESTROY, ++ ZFS_PROP_USERREFS, ++ ZFS_PROP_LOGBIAS, ++ ZFS_PROP_UNIQUE, /* not exposed to the user */ ++ ZFS_PROP_OBJSETID, /* not exposed to the user */ ++ ZFS_PROP_DEDUP, ++ ZFS_PROP_MLSLABEL, ++ ZFS_PROP_SYNC, ++ ZFS_PROP_REFRATIO, ++ ZFS_PROP_WRITTEN, ++ ZFS_PROP_CLONES, ++ ZFS_NUM_PROPS ++} zfs_prop_t; ++ ++typedef enum { ++ ZFS_PROP_USERUSED, ++ ZFS_PROP_USERQUOTA, ++ ZFS_PROP_GROUPUSED, ++ ZFS_PROP_GROUPQUOTA, ++ ZFS_NUM_USERQUOTA_PROPS ++} zfs_userquota_prop_t; ++ ++extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; ++ ++/* ++ * Pool properties are identified by these constants and must be added to the ++ * end of this list to ensure that external consumers are not affected ++ * by the change. If you make any changes to this list, be sure to update ++ * the property table in usr/src/common/zfs/zpool_prop.c. 
++ */ ++typedef enum { ++ ZPOOL_PROP_NAME, ++ ZPOOL_PROP_SIZE, ++ ZPOOL_PROP_CAPACITY, ++ ZPOOL_PROP_ALTROOT, ++ ZPOOL_PROP_HEALTH, ++ ZPOOL_PROP_GUID, ++ ZPOOL_PROP_VERSION, ++ ZPOOL_PROP_BOOTFS, ++ ZPOOL_PROP_DELEGATION, ++ ZPOOL_PROP_AUTOREPLACE, ++ ZPOOL_PROP_CACHEFILE, ++ ZPOOL_PROP_FAILUREMODE, ++ ZPOOL_PROP_LISTSNAPS, ++ ZPOOL_PROP_AUTOEXPAND, ++ ZPOOL_PROP_DEDUPDITTO, ++ ZPOOL_PROP_DEDUPRATIO, ++ ZPOOL_PROP_FREE, ++ ZPOOL_PROP_ALLOCATED, ++ ZPOOL_PROP_READONLY, ++ ZPOOL_PROP_ASHIFT, ++ ZPOOL_PROP_COMMENT, ++ ZPOOL_PROP_EXPANDSZ, ++ ZPOOL_NUM_PROPS ++} zpool_prop_t; ++ ++/* Small enough to not hog a whole line of printout in zpool(1M). */ ++#define ZPROP_MAX_COMMENT 32 ++ ++#define ZPROP_CONT -2 ++#define ZPROP_INVAL -1 ++ ++#define ZPROP_VALUE "value" ++#define ZPROP_SOURCE "source" ++ ++typedef enum { ++ ZPROP_SRC_NONE = 0x1, ++ ZPROP_SRC_DEFAULT = 0x2, ++ ZPROP_SRC_TEMPORARY = 0x4, ++ ZPROP_SRC_LOCAL = 0x8, ++ ZPROP_SRC_INHERITED = 0x10, ++ ZPROP_SRC_RECEIVED = 0x20 ++} zprop_source_t; ++ ++#define ZPROP_SRC_ALL 0x3f ++ ++#define ZPROP_SOURCE_VAL_RECVD "$recvd" ++#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" ++/* ++ * Dataset flag implemented as a special entry in the props zap object ++ * indicating that the dataset has received properties on or after ++ * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties ++ * just as it did in earlier versions, and thereafter, local properties are ++ * preserved. ++ */ ++#define ZPROP_HAS_RECVD "$hasrecvd" ++ ++typedef enum { ++ ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ ++ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ ++} zprop_errflags_t; ++ ++typedef int (*zprop_func)(int, void *); ++ ++/* ++ * Properties to be set on the root file system of a new pool ++ * are stuffed into their own nvlist, which is then included in ++ * the properties nvlist with the pool properties. ++ */ ++#define ZPOOL_ROOTFS_PROPS "root-props-nvl" ++ ++/* ++ * Dataset property functions shared between libzfs and kernel. ++ */ ++const char *zfs_prop_default_string(zfs_prop_t); ++uint64_t zfs_prop_default_numeric(zfs_prop_t); ++boolean_t zfs_prop_readonly(zfs_prop_t); ++boolean_t zfs_prop_inheritable(zfs_prop_t); ++boolean_t zfs_prop_setonce(zfs_prop_t); ++const char *zfs_prop_to_name(zfs_prop_t); ++zfs_prop_t zfs_name_to_prop(const char *); ++boolean_t zfs_prop_user(const char *); ++boolean_t zfs_prop_userquota(const char *); ++boolean_t zfs_prop_written(const char *); ++int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); ++int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); ++uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); ++boolean_t zfs_prop_valid_for_type(int, zfs_type_t); ++ ++/* ++ * Pool property functions shared between libzfs and kernel. ++ */ ++zpool_prop_t zpool_name_to_prop(const char *); ++const char *zpool_prop_to_name(zpool_prop_t); ++const char *zpool_prop_default_string(zpool_prop_t); ++uint64_t zpool_prop_default_numeric(zpool_prop_t); ++boolean_t zpool_prop_readonly(zpool_prop_t); ++int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); ++int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); ++uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); ++ ++/* ++ * Definitions for the Delegation. 
++ */ ++typedef enum { ++ ZFS_DELEG_WHO_UNKNOWN = 0, ++ ZFS_DELEG_USER = 'u', ++ ZFS_DELEG_USER_SETS = 'U', ++ ZFS_DELEG_GROUP = 'g', ++ ZFS_DELEG_GROUP_SETS = 'G', ++ ZFS_DELEG_EVERYONE = 'e', ++ ZFS_DELEG_EVERYONE_SETS = 'E', ++ ZFS_DELEG_CREATE = 'c', ++ ZFS_DELEG_CREATE_SETS = 'C', ++ ZFS_DELEG_NAMED_SET = 's', ++ ZFS_DELEG_NAMED_SET_SETS = 'S' ++} zfs_deleg_who_type_t; ++ ++typedef enum { ++ ZFS_DELEG_NONE = 0, ++ ZFS_DELEG_PERM_LOCAL = 1, ++ ZFS_DELEG_PERM_DESCENDENT = 2, ++ ZFS_DELEG_PERM_LOCALDESCENDENT = 3, ++ ZFS_DELEG_PERM_CREATE = 4 ++} zfs_deleg_inherit_t; ++ ++#define ZFS_DELEG_PERM_UID "uid" ++#define ZFS_DELEG_PERM_GID "gid" ++#define ZFS_DELEG_PERM_GROUPS "groups" ++ ++#define ZFS_MLSLABEL_DEFAULT "none" ++ ++#define ZFS_SMB_ACL_SRC "src" ++#define ZFS_SMB_ACL_TARGET "target" ++ ++typedef enum { ++ ZFS_CANMOUNT_OFF = 0, ++ ZFS_CANMOUNT_ON = 1, ++ ZFS_CANMOUNT_NOAUTO = 2 ++} zfs_canmount_type_t; ++ ++typedef enum { ++ ZFS_LOGBIAS_LATENCY = 0, ++ ZFS_LOGBIAS_THROUGHPUT = 1 ++} zfs_logbias_op_t; ++ ++typedef enum zfs_share_op { ++ ZFS_SHARE_NFS = 0, ++ ZFS_UNSHARE_NFS = 1, ++ ZFS_SHARE_SMB = 2, ++ ZFS_UNSHARE_SMB = 3 ++} zfs_share_op_t; ++ ++typedef enum zfs_smb_acl_op { ++ ZFS_SMB_ACL_ADD, ++ ZFS_SMB_ACL_REMOVE, ++ ZFS_SMB_ACL_RENAME, ++ ZFS_SMB_ACL_PURGE ++} zfs_smb_acl_op_t; ++ ++typedef enum zfs_cache_type { ++ ZFS_CACHE_NONE = 0, ++ ZFS_CACHE_METADATA = 1, ++ ZFS_CACHE_ALL = 2 ++} zfs_cache_type_t; ++ ++typedef enum { ++ ZFS_SYNC_STANDARD = 0, ++ ZFS_SYNC_ALWAYS = 1, ++ ZFS_SYNC_DISABLED = 2 ++} zfs_sync_type_t; ++ ++typedef enum { ++ ZFS_XATTR_OFF = 0, ++ ZFS_XATTR_DIR = 1, ++ ZFS_XATTR_SA = 2 ++} zfs_xattr_type_t; ++ ++/* ++ * On-disk version number. ++ */ ++#define SPA_VERSION_1 1ULL ++#define SPA_VERSION_2 2ULL ++#define SPA_VERSION_3 3ULL ++#define SPA_VERSION_4 4ULL ++#define SPA_VERSION_5 5ULL ++#define SPA_VERSION_6 6ULL ++#define SPA_VERSION_7 7ULL ++#define SPA_VERSION_8 8ULL ++#define SPA_VERSION_9 9ULL ++#define SPA_VERSION_10 10ULL ++#define SPA_VERSION_11 11ULL ++#define SPA_VERSION_12 12ULL ++#define SPA_VERSION_13 13ULL ++#define SPA_VERSION_14 14ULL ++#define SPA_VERSION_15 15ULL ++#define SPA_VERSION_16 16ULL ++#define SPA_VERSION_17 17ULL ++#define SPA_VERSION_18 18ULL ++#define SPA_VERSION_19 19ULL ++#define SPA_VERSION_20 20ULL ++#define SPA_VERSION_21 21ULL ++#define SPA_VERSION_22 22ULL ++#define SPA_VERSION_23 23ULL ++#define SPA_VERSION_24 24ULL ++#define SPA_VERSION_25 25ULL ++#define SPA_VERSION_26 26ULL ++#define SPA_VERSION_27 27ULL ++#define SPA_VERSION_28 28ULL ++ ++/* ++ * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk ++ * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, ++ * and do the appropriate changes. Also bump the version number in ++ * usr/src/grub/capability. ++ */ ++#define SPA_VERSION SPA_VERSION_28 ++#define SPA_VERSION_STRING "28" ++ ++/* ++ * Symbolic names for the changes that caused a SPA_VERSION switch. ++ * Used in the code when checking for presence or absence of a feature. ++ * Feel free to define multiple symbolic names for each version if there ++ * were multiple changes to on-disk structures during that version. ++ * ++ * NOTE: When checking the current SPA_VERSION in your code, be sure ++ * to use spa_version() since it reports the version of the ++ * last synced uberblock. Checking the in-flight version can ++ * be dangerous in some cases. 
++ */ ++#define SPA_VERSION_INITIAL SPA_VERSION_1 ++#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 ++#define SPA_VERSION_SPARES SPA_VERSION_3 ++#define SPA_VERSION_RAIDZ2 SPA_VERSION_3 ++#define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 ++#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 ++#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 ++#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 ++#define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 ++#define SPA_VERSION_BOOTFS SPA_VERSION_6 ++#define SPA_VERSION_SLOGS SPA_VERSION_7 ++#define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 ++#define SPA_VERSION_FUID SPA_VERSION_9 ++#define SPA_VERSION_REFRESERVATION SPA_VERSION_9 ++#define SPA_VERSION_REFQUOTA SPA_VERSION_9 ++#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 ++#define SPA_VERSION_L2CACHE SPA_VERSION_10 ++#define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 ++#define SPA_VERSION_ORIGIN SPA_VERSION_11 ++#define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 ++#define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 ++#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 ++#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 ++#define SPA_VERSION_USERSPACE SPA_VERSION_15 ++#define SPA_VERSION_STMF_PROP SPA_VERSION_16 ++#define SPA_VERSION_RAIDZ3 SPA_VERSION_17 ++#define SPA_VERSION_USERREFS SPA_VERSION_18 ++#define SPA_VERSION_HOLES SPA_VERSION_19 ++#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 ++#define SPA_VERSION_DEDUP SPA_VERSION_21 ++#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 ++#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 ++#define SPA_VERSION_SA SPA_VERSION_24 ++#define SPA_VERSION_SCAN SPA_VERSION_25 ++#define SPA_VERSION_DIR_CLONES SPA_VERSION_26 ++#define SPA_VERSION_DEADLISTS SPA_VERSION_26 ++#define SPA_VERSION_FAST_SNAP SPA_VERSION_27 ++#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 ++ ++/* ++ * ZPL version - rev'd whenever an incompatible on-disk format change ++ * occurs. This is independent of SPA/DMU/ZAP versioning. You must ++ * also update the version_table[] and help message in zfs_prop.c. ++ * ++ * When changing, be sure to teach GRUB how to read the new format! 
++ * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*} ++ */ ++#define ZPL_VERSION_1 1ULL ++#define ZPL_VERSION_2 2ULL ++#define ZPL_VERSION_3 3ULL ++#define ZPL_VERSION_4 4ULL ++#define ZPL_VERSION_5 5ULL ++#define ZPL_VERSION ZPL_VERSION_5 ++#define ZPL_VERSION_STRING "5" ++ ++#define ZPL_VERSION_INITIAL ZPL_VERSION_1 ++#define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 ++#define ZPL_VERSION_FUID ZPL_VERSION_3 ++#define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 ++#define ZPL_VERSION_SYSATTR ZPL_VERSION_3 ++#define ZPL_VERSION_USERSPACE ZPL_VERSION_4 ++#define ZPL_VERSION_SA ZPL_VERSION_5 ++ ++/* Rewind request information */ ++#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ ++#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ ++#define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ ++#define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ ++#define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ ++#define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ ++#define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ ++ ++typedef struct zpool_rewind_policy { ++ uint32_t zrp_request; /* rewind behavior requested */ ++ uint64_t zrp_maxmeta; /* max acceptable meta-data errors */ ++ uint64_t zrp_maxdata; /* max acceptable data errors */ ++ uint64_t zrp_txg; /* specific txg to load */ ++} zpool_rewind_policy_t; ++ ++/* ++ * The following are configuration names used in the nvlist describing a pool's ++ * configuration. ++ */ ++#define ZPOOL_CONFIG_VERSION "version" ++#define ZPOOL_CONFIG_POOL_NAME "name" ++#define ZPOOL_CONFIG_POOL_STATE "state" ++#define ZPOOL_CONFIG_POOL_TXG "txg" ++#define ZPOOL_CONFIG_POOL_GUID "pool_guid" ++#define ZPOOL_CONFIG_CREATE_TXG "create_txg" ++#define ZPOOL_CONFIG_TOP_GUID "top_guid" ++#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" ++#define ZPOOL_CONFIG_TYPE "type" ++#define ZPOOL_CONFIG_CHILDREN "children" ++#define ZPOOL_CONFIG_ID "id" ++#define ZPOOL_CONFIG_GUID "guid" ++#define ZPOOL_CONFIG_PATH "path" ++#define ZPOOL_CONFIG_DEVID "devid" ++#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" ++#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" ++#define ZPOOL_CONFIG_ASHIFT "ashift" ++#define ZPOOL_CONFIG_ASIZE "asize" ++#define ZPOOL_CONFIG_DTL "DTL" ++#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ ++#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ ++#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" ++#define ZPOOL_CONFIG_ERRCOUNT "error_count" ++#define ZPOOL_CONFIG_NOT_PRESENT "not_present" ++#define ZPOOL_CONFIG_SPARES "spares" ++#define ZPOOL_CONFIG_IS_SPARE "is_spare" ++#define ZPOOL_CONFIG_NPARITY "nparity" ++#define ZPOOL_CONFIG_HOSTID "hostid" ++#define ZPOOL_CONFIG_HOSTNAME "hostname" ++#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" ++#define ZPOOL_CONFIG_UNSPARE "unspare" ++#define ZPOOL_CONFIG_PHYS_PATH "phys_path" ++#define ZPOOL_CONFIG_IS_LOG "is_log" ++#define ZPOOL_CONFIG_L2CACHE "l2cache" ++#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" ++#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" ++#define ZPOOL_CONFIG_IS_HOLE "is_hole" ++#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" ++#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" ++#define ZPOOL_CONFIG_DDT_STATS "ddt_stats" ++#define ZPOOL_CONFIG_SPLIT "splitcfg" ++#define ZPOOL_CONFIG_ORIG_GUID "orig_guid" ++#define ZPOOL_CONFIG_SPLIT_GUID "split_guid" ++#define ZPOOL_CONFIG_SPLIT_LIST "guid_list" ++#define ZPOOL_CONFIG_REMOVING "removing" 
++#define ZPOOL_CONFIG_RESILVERING "resilvering" ++#define ZPOOL_CONFIG_COMMENT "comment" ++#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ ++#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ ++#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ ++#define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ ++#define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ ++/* ++ * The persistent vdev state is stored as separate values rather than a single ++ * 'vdev_state' entry. This is because a device can be in multiple states, such ++ * as offline and degraded. ++ */ ++#define ZPOOL_CONFIG_OFFLINE "offline" ++#define ZPOOL_CONFIG_FAULTED "faulted" ++#define ZPOOL_CONFIG_DEGRADED "degraded" ++#define ZPOOL_CONFIG_REMOVED "removed" ++#define ZPOOL_CONFIG_FRU "fru" ++#define ZPOOL_CONFIG_AUX_STATE "aux_state" ++ ++/* Rewind policy parameters */ ++#define ZPOOL_REWIND_POLICY "rewind-policy" ++#define ZPOOL_REWIND_REQUEST "rewind-request" ++#define ZPOOL_REWIND_REQUEST_TXG "rewind-request-txg" ++#define ZPOOL_REWIND_META_THRESH "rewind-meta-thresh" ++#define ZPOOL_REWIND_DATA_THRESH "rewind-data-thresh" ++ ++/* Rewind data discovered */ ++#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" ++#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" ++#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" ++ ++#define VDEV_TYPE_ROOT "root" ++#define VDEV_TYPE_MIRROR "mirror" ++#define VDEV_TYPE_REPLACING "replacing" ++#define VDEV_TYPE_RAIDZ "raidz" ++#define VDEV_TYPE_DISK "disk" ++#define VDEV_TYPE_FILE "file" ++#define VDEV_TYPE_MISSING "missing" ++#define VDEV_TYPE_HOLE "hole" ++#define VDEV_TYPE_SPARE "spare" ++#define VDEV_TYPE_LOG "log" ++#define VDEV_TYPE_L2CACHE "l2cache" ++ ++/* ++ * This is needed in userland to report the minimum necessary device size. ++ */ ++#define SPA_MINDEVSIZE (64ULL << 20) ++ ++/* ++ * The location of the pool configuration repository, shared between kernel and ++ * userland. ++ */ ++#define ZPOOL_CACHE "/etc/zfs/zpool.cache" ++ ++/* ++ * vdev states are ordered from least to most healthy. ++ * A vdev that's CANT_OPEN or below is considered unusable. ++ */ ++typedef enum vdev_state { ++ VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ ++ VDEV_STATE_CLOSED, /* Not currently open */ ++ VDEV_STATE_OFFLINE, /* Not allowed to open */ ++ VDEV_STATE_REMOVED, /* Explicitly removed from system */ ++ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ ++ VDEV_STATE_FAULTED, /* External request to fault device */ ++ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ ++ VDEV_STATE_HEALTHY /* Presumed good */ ++} vdev_state_t; ++ ++#define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY ++ ++/* ++ * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field ++ * of the vdev stats structure uses these constants to distinguish why. 
++ */ ++typedef enum vdev_aux { ++ VDEV_AUX_NONE, /* no error */ ++ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ ++ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ ++ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ ++ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ ++ VDEV_AUX_TOO_SMALL, /* vdev size is too small */ ++ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ ++ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ ++ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ ++ VDEV_AUX_SPARED, /* hot spare used in another pool */ ++ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ ++ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ ++ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ ++ VDEV_AUX_EXTERNAL, /* external diagnosis */ ++ VDEV_AUX_SPLIT_POOL /* vdev was split off into another pool */ ++} vdev_aux_t; ++ ++/* ++ * pool state. The following states are written to disk as part of the normal ++ * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining ++ * states are software abstractions used at various levels to communicate ++ * pool state. ++ */ ++typedef enum pool_state { ++ POOL_STATE_ACTIVE = 0, /* In active use */ ++ POOL_STATE_EXPORTED, /* Explicitly exported */ ++ POOL_STATE_DESTROYED, /* Explicitly destroyed */ ++ POOL_STATE_SPARE, /* Reserved for hot spare use */ ++ POOL_STATE_L2CACHE, /* Level 2 ARC device */ ++ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ ++ POOL_STATE_UNAVAIL, /* Internal libzfs state */ ++ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ ++} pool_state_t; ++ ++/* ++ * Scan Functions. ++ */ ++typedef enum pool_scan_func { ++ POOL_SCAN_NONE, ++ POOL_SCAN_SCRUB, ++ POOL_SCAN_RESILVER, ++ POOL_SCAN_FUNCS ++} pool_scan_func_t; ++ ++/* ++ * ZIO types. Needed to interpret vdev statistics below. ++ */ ++typedef enum zio_type { ++ ZIO_TYPE_NULL = 0, ++ ZIO_TYPE_READ, ++ ZIO_TYPE_WRITE, ++ ZIO_TYPE_FREE, ++ ZIO_TYPE_CLAIM, ++ ZIO_TYPE_IOCTL, ++ ZIO_TYPES ++} zio_type_t; ++ ++/* ++ * Pool statistics. Note: all fields should be 64-bit because this ++ * is passed between kernel and userland as an nvlist uint64 array. ++ */ ++typedef struct pool_scan_stat { ++ /* values stored on disk */ ++ uint64_t pss_func; /* pool_scan_func_t */ ++ uint64_t pss_state; /* dsl_scan_state_t */ ++ uint64_t pss_start_time; /* scan start time */ ++ uint64_t pss_end_time; /* scan end time */ ++ uint64_t pss_to_examine; /* total bytes to scan */ ++ uint64_t pss_examined; /* total examined bytes */ ++ uint64_t pss_to_process; /* total bytes to process */ ++ uint64_t pss_processed; /* total processed bytes */ ++ uint64_t pss_errors; /* scan errors */ ++ ++ /* values not stored on disk */ ++ uint64_t pss_pass_exam; /* examined bytes per scan pass */ ++ uint64_t pss_pass_start; /* start time of a scan pass */ ++} pool_scan_stat_t; ++ ++typedef enum dsl_scan_state { ++ DSS_NONE, ++ DSS_SCANNING, ++ DSS_FINISHED, ++ DSS_CANCELED, ++ DSS_NUM_STATES ++} dsl_scan_state_t; ++ ++ ++/* ++ * Vdev statistics. Note: all fields should be 64-bit because this ++ * is passed between kernel and userland as an nvlist uint64 array. 
++ */ ++typedef struct vdev_stat { ++ hrtime_t vs_timestamp; /* time since vdev load */ ++ uint64_t vs_state; /* vdev state */ ++ uint64_t vs_aux; /* see vdev_aux_t */ ++ uint64_t vs_alloc; /* space allocated */ ++ uint64_t vs_space; /* total capacity */ ++ uint64_t vs_dspace; /* deflated capacity */ ++ uint64_t vs_rsize; /* replaceable dev size */ ++ uint64_t vs_esize; /* expandable dev size */ ++ uint64_t vs_ops[ZIO_TYPES]; /* operation count */ ++ uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */ ++ uint64_t vs_read_errors; /* read errors */ ++ uint64_t vs_write_errors; /* write errors */ ++ uint64_t vs_checksum_errors; /* checksum errors */ ++ uint64_t vs_self_healed; /* self-healed bytes */ ++ uint64_t vs_scan_removing; /* removing? */ ++ uint64_t vs_scan_processed; /* scan processed bytes */ ++} vdev_stat_t; ++ ++/* ++ * DDT statistics. Note: all fields should be 64-bit because this ++ * is passed between kernel and userland as an nvlist uint64 array. ++ */ ++typedef struct ddt_object { ++ uint64_t ddo_count; /* number of elments in ddt */ ++ uint64_t ddo_dspace; /* size of ddt on disk */ ++ uint64_t ddo_mspace; /* size of ddt in-core */ ++} ddt_object_t; ++ ++typedef struct ddt_stat { ++ uint64_t dds_blocks; /* blocks */ ++ uint64_t dds_lsize; /* logical size */ ++ uint64_t dds_psize; /* physical size */ ++ uint64_t dds_dsize; /* deflated allocated size */ ++ uint64_t dds_ref_blocks; /* referenced blocks */ ++ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ ++ uint64_t dds_ref_psize; /* referenced psize * refcnt */ ++ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ ++} ddt_stat_t; ++ ++typedef struct ddt_histogram { ++ ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ ++} ddt_histogram_t; ++ ++#define ZVOL_DRIVER "zvol" ++#define ZFS_DRIVER "zfs" ++#define ZFS_DEV "/dev/zfs" ++ ++/* general zvol path */ ++#define ZVOL_DIR "/dev" ++ ++#define ZVOL_MAJOR 230 ++#define ZVOL_MINOR_BITS 4 ++#define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1) ++#define ZVOL_MINORS (1 << 4) ++#define ZVOL_DEV_NAME "zd" ++ ++#define ZVOL_PROP_NAME "name" ++#define ZVOL_DEFAULT_BLOCKSIZE 8192 ++ ++/* ++ * /dev/zfs ioctl numbers. 
++ */ ++#define ZFS_IOC ('Z' << 8) ++ ++typedef enum zfs_ioc { ++ ZFS_IOC_POOL_CREATE = ZFS_IOC, ++ ZFS_IOC_POOL_DESTROY, ++ ZFS_IOC_POOL_IMPORT, ++ ZFS_IOC_POOL_EXPORT, ++ ZFS_IOC_POOL_CONFIGS, ++ ZFS_IOC_POOL_STATS, ++ ZFS_IOC_POOL_TRYIMPORT, ++ ZFS_IOC_POOL_SCAN, ++ ZFS_IOC_POOL_FREEZE, ++ ZFS_IOC_POOL_UPGRADE, ++ ZFS_IOC_POOL_GET_HISTORY, ++ ZFS_IOC_VDEV_ADD, ++ ZFS_IOC_VDEV_REMOVE, ++ ZFS_IOC_VDEV_SET_STATE, ++ ZFS_IOC_VDEV_ATTACH, ++ ZFS_IOC_VDEV_DETACH, ++ ZFS_IOC_VDEV_SETPATH, ++ ZFS_IOC_VDEV_SETFRU, ++ ZFS_IOC_OBJSET_STATS, ++ ZFS_IOC_OBJSET_ZPLPROPS, ++ ZFS_IOC_DATASET_LIST_NEXT, ++ ZFS_IOC_SNAPSHOT_LIST_NEXT, ++ ZFS_IOC_SET_PROP, ++ ZFS_IOC_CREATE_MINOR, ++ ZFS_IOC_REMOVE_MINOR, ++ ZFS_IOC_CREATE, ++ ZFS_IOC_DESTROY, ++ ZFS_IOC_ROLLBACK, ++ ZFS_IOC_RENAME, ++ ZFS_IOC_RECV, ++ ZFS_IOC_SEND, ++ ZFS_IOC_INJECT_FAULT, ++ ZFS_IOC_CLEAR_FAULT, ++ ZFS_IOC_INJECT_LIST_NEXT, ++ ZFS_IOC_ERROR_LOG, ++ ZFS_IOC_CLEAR, ++ ZFS_IOC_PROMOTE, ++ ZFS_IOC_DESTROY_SNAPS_NVL, ++ ZFS_IOC_SNAPSHOT, ++ ZFS_IOC_DSOBJ_TO_DSNAME, ++ ZFS_IOC_OBJ_TO_PATH, ++ ZFS_IOC_POOL_SET_PROPS, ++ ZFS_IOC_POOL_GET_PROPS, ++ ZFS_IOC_SET_FSACL, ++ ZFS_IOC_GET_FSACL, ++ ZFS_IOC_SHARE, ++ ZFS_IOC_INHERIT_PROP, ++ ZFS_IOC_SMB_ACL, ++ ZFS_IOC_USERSPACE_ONE, ++ ZFS_IOC_USERSPACE_MANY, ++ ZFS_IOC_USERSPACE_UPGRADE, ++ ZFS_IOC_HOLD, ++ ZFS_IOC_RELEASE, ++ ZFS_IOC_GET_HOLDS, ++ ZFS_IOC_OBJSET_RECVD_PROPS, ++ ZFS_IOC_VDEV_SPLIT, ++ ZFS_IOC_NEXT_OBJ, ++ ZFS_IOC_DIFF, ++ ZFS_IOC_TMP_SNAPSHOT, ++ ZFS_IOC_OBJ_TO_STATS, ++ ZFS_IOC_EVENTS_NEXT, ++ ZFS_IOC_EVENTS_CLEAR, ++ ZFS_IOC_POOL_REGUID, ++ ZFS_IOC_SPACE_WRITTEN, ++ ZFS_IOC_SPACE_SNAPS, ++ ZFS_IOC_POOL_REOPEN, ++ ZFS_IOC_SEND_PROGRESS, ++} zfs_ioc_t; ++ ++/* ++ * zvol ioctl to get dataset name ++ */ ++#define BLKZNAME _IOR(0x12,125,char[ZFS_MAXNAMELEN]) ++ ++/* ++ * Internal SPA load state. Used by FMA diagnosis engine. ++ */ ++typedef enum { ++ SPA_LOAD_NONE, /* no load in progress */ ++ SPA_LOAD_OPEN, /* normal open */ ++ SPA_LOAD_IMPORT, /* import in progress */ ++ SPA_LOAD_TRYIMPORT, /* tryimport in progress */ ++ SPA_LOAD_RECOVER, /* recovery requested */ ++ SPA_LOAD_ERROR /* load failed */ ++} spa_load_state_t; ++ ++/* ++ * Bookmark name values. ++ */ ++#define ZPOOL_ERR_LIST "error list" ++#define ZPOOL_ERR_DATASET "dataset" ++#define ZPOOL_ERR_OBJECT "object" ++ ++#define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) ++ ++/* ++ * The following are names used in the nvlist describing ++ * the pool's history log. ++ */ ++#define ZPOOL_HIST_RECORD "history record" ++#define ZPOOL_HIST_TIME "history time" ++#define ZPOOL_HIST_CMD "history command" ++#define ZPOOL_HIST_WHO "history who" ++#define ZPOOL_HIST_ZONE "history zone" ++#define ZPOOL_HIST_HOST "history hostname" ++#define ZPOOL_HIST_TXG "history txg" ++#define ZPOOL_HIST_INT_EVENT "history internal event" ++#define ZPOOL_HIST_INT_STR "history internal str" ++ ++/* ++ * Flags for ZFS_IOC_VDEV_SET_STATE ++ */ ++#define ZFS_ONLINE_CHECKREMOVE 0x1 ++#define ZFS_ONLINE_UNSPARE 0x2 ++#define ZFS_ONLINE_FORCEFAULT 0x4 ++#define ZFS_ONLINE_EXPAND 0x8 ++#define ZFS_OFFLINE_TEMPORARY 0x1 ++ ++/* ++ * Flags for ZFS_IOC_POOL_IMPORT ++ */ ++#define ZFS_IMPORT_NORMAL 0x0 ++#define ZFS_IMPORT_VERBATIM 0x1 ++#define ZFS_IMPORT_ANY_HOST 0x2 ++#define ZFS_IMPORT_MISSING_LOG 0x4 ++#define ZFS_IMPORT_ONLY 0x8 ++ ++/* ++ * Sysevent payload members. 
ZFS will generate the following sysevents with the ++ * given payloads: ++ * ++ * ESC_ZFS_RESILVER_START ++ * ESC_ZFS_RESILVER_END ++ * ESC_ZFS_POOL_DESTROY ++ * ESC_ZFS_POOL_REGUID ++ * ++ * ZFS_EV_POOL_NAME DATA_TYPE_STRING ++ * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 ++ * ++ * ESC_ZFS_VDEV_REMOVE ++ * ESC_ZFS_VDEV_CLEAR ++ * ESC_ZFS_VDEV_CHECK ++ * ++ * ZFS_EV_POOL_NAME DATA_TYPE_STRING ++ * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 ++ * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) ++ * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 ++ */ ++#define ZFS_EV_POOL_NAME "pool_name" ++#define ZFS_EV_POOL_GUID "pool_guid" ++#define ZFS_EV_VDEV_PATH "vdev_path" ++#define ZFS_EV_VDEV_GUID "vdev_guid" ++ ++/* ++ * Note: This is encoded on-disk, so new events must be added to the ++ * end, and unused events can not be removed. Be sure to edit ++ * libzfs_pool.c: hist_event_table[]. ++ */ ++typedef enum history_internal_events { ++ LOG_NO_EVENT = 0, ++ LOG_POOL_CREATE, ++ LOG_POOL_VDEV_ADD, ++ LOG_POOL_REMOVE, ++ LOG_POOL_DESTROY, ++ LOG_POOL_EXPORT, ++ LOG_POOL_IMPORT, ++ LOG_POOL_VDEV_ATTACH, ++ LOG_POOL_VDEV_REPLACE, ++ LOG_POOL_VDEV_DETACH, ++ LOG_POOL_VDEV_ONLINE, ++ LOG_POOL_VDEV_OFFLINE, ++ LOG_POOL_UPGRADE, ++ LOG_POOL_CLEAR, ++ LOG_POOL_SCAN, ++ LOG_POOL_PROPSET, ++ LOG_DS_CREATE, ++ LOG_DS_CLONE, ++ LOG_DS_DESTROY, ++ LOG_DS_DESTROY_BEGIN, ++ LOG_DS_INHERIT, ++ LOG_DS_PROPSET, ++ LOG_DS_QUOTA, ++ LOG_DS_PERM_UPDATE, ++ LOG_DS_PERM_REMOVE, ++ LOG_DS_PERM_WHO_REMOVE, ++ LOG_DS_PROMOTE, ++ LOG_DS_RECEIVE, ++ LOG_DS_RENAME, ++ LOG_DS_RESERVATION, ++ LOG_DS_REPLAY_INC_SYNC, ++ LOG_DS_REPLAY_FULL_SYNC, ++ LOG_DS_ROLLBACK, ++ LOG_DS_SNAPSHOT, ++ LOG_DS_UPGRADE, ++ LOG_DS_REFQUOTA, ++ LOG_DS_REFRESERV, ++ LOG_POOL_SCAN_DONE, ++ LOG_DS_USER_HOLD, ++ LOG_DS_USER_RELEASE, ++ LOG_POOL_SPLIT, ++ LOG_END ++} history_internal_events_t; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_FS_ZFS_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/Makefile linux-3.2.33-go/include/zfs/sys/Makefile +--- linux-3.2.33-go.orig/include/zfs/sys/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/Makefile 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,1029 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. ++# include/sys/Makefile. Generated from Makefile.in by configure. ++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. 
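The pool_scan_stat_t defined earlier in this header is, as its comment notes, shipped to userland as an nvlist uint64 array, so reporting scrub or resilver progress reduces to simple arithmetic on the examined/to-examine counters. A minimal sketch (not part of the patch): it assumes the values have already been fetched from the kernel (for example through libzfs) and copied into a local struct, and it only mirrors the fields needed for a progress estimate.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the relevant fields of pool_scan_stat_t from sys/fs/zfs.h above. */
struct scan_progress {
	uint64_t pss_func;        /* pool_scan_func_t */
	uint64_t pss_state;       /* dsl_scan_state_t */
	uint64_t pss_to_examine;  /* total bytes to scan */
	uint64_t pss_examined;    /* bytes examined so far */
};

/* Percentage of the scan completed; 0 if there is nothing to examine. */
static double
scan_pct_done(const struct scan_progress *sp)
{
	if (sp->pss_to_examine == 0)
		return (0.0);
	return (100.0 * (double)sp->pss_examined / (double)sp->pss_to_examine);
}

int
main(void)
{
	/* Example values only; real numbers come from the kernel. */
	struct scan_progress sp = {
		1,		/* POOL_SCAN_SCRUB, per the enum above */
		1,		/* DSS_SCANNING, per the enum above */
		1ULL << 40,	/* 1 TiB to examine */
		1ULL << 38	/* 256 GiB examined */
	};
	printf("scrub %.2f%% done\n", scan_pct_done(&sp));
	return (0);
}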
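The BLKZNAME ioctl defined above lets a userland helper ask a zvol block device for the name of the dataset backing it, much as the zvol_id udev helper does when creating /dev/zvol/<pool>/<dataset> links. A minimal sketch, not part of the patch: the device path /dev/zd0 is illustrative, and ZFS_MAXNAMELEN is assumed to be 256 here so the example stands alone instead of including the patched header.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>
#include <unistd.h>

#define ZFS_MAXNAMELEN	256	/* assumed; the real value comes from zfs.h */
#define BLKZNAME	_IOR(0x12, 125, char[ZFS_MAXNAMELEN])	/* as defined above */

int
main(int argc, char **argv)
{
	/* Default device is illustrative; pass a zvol node as argv[1]. */
	const char *dev = (argc > 1) ? argv[1] : "/dev/zd0";
	char name[ZFS_MAXNAMELEN];
	int fd;

	fd = open(dev, O_RDONLY);
	if (fd < 0) {
		perror(dev);
		return (1);
	}
	memset(name, 0, sizeof (name));
	if (ioctl(fd, BLKZNAME, name) < 0) {
		perror("BLKZNAME");
		close(fd);
		return (1);
	}
	printf("%s is backed by dataset %s\n", dev, name);
	close(fd);
	return (0);
}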
++ ++ ++ ++ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/zfs ++pkgincludedir = $(includedir)/zfs ++pkglibdir = $(libdir)/zfs ++pkglibexecdir = $(libexecdir)/zfs ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = x86_64-unknown-linux-gnu ++host_triplet = x86_64-unknown-linux-gnu ++target_triplet = x86_64-unknown-linux-gnu ++subdir = include/sys ++DIST_COMMON = $(am__kernel_HEADERS_DIST) $(am__libzfs_HEADERS_DIST) \ ++ $(srcdir)/Makefile.am $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = \ ++ $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \ ++ $(top_srcdir)/config/kernel-automount.m4 \ ++ $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \ ++ $(top_srcdir)/config/kernel-bdev-logical-size.m4 \ ++ $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \ ++ $(top_srcdir)/config/kernel-bdi.m4 \ ++ $(top_srcdir)/config/kernel-bio-empty-barrier.m4 \ ++ $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \ ++ $(top_srcdir)/config/kernel-bio-failfast.m4 \ ++ $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \ ++ $(top_srcdir)/config/kernel-blk-end-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-fetch-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-discard.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-flush.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \ ++ $(top_srcdir)/config/kernel-blk-requeue-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-pos.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get.m4 \ ++ $(top_srcdir)/config/kernel-check-disk-size-change.m4 \ ++ $(top_srcdir)/config/kernel-clear-inode.m4 \ ++ $(top_srcdir)/config/kernel-commit-metadata.m4 \ ++ $(top_srcdir)/config/kernel-create-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-d-make-root.m4 \ ++ $(top_srcdir)/config/kernel-d-obtain-alias.m4 \ ++ $(top_srcdir)/config/kernel-discard-granularity.m4 \ ++ $(top_srcdir)/config/kernel-elevator-change.m4 \ ++ $(top_srcdir)/config/kernel-encode-fh-inode.m4 \ ++ $(top_srcdir)/config/kernel-evict-inode.m4 \ ++ $(top_srcdir)/config/kernel-fallocate.m4 \ ++ $(top_srcdir)/config/kernel-fmode-t.m4 \ ++ $(top_srcdir)/config/kernel-fsync.m4 \ ++ $(top_srcdir)/config/kernel-get-disk-ro.m4 \ ++ $(top_srcdir)/config/kernel-get-gendisk.m4 \ ++ $(top_srcdir)/config/kernel-insert-inode-locked.m4 \ ++ $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \ ++ $(top_srcdir)/config/kernel-kobj-name-len.m4 \ ++ $(top_srcdir)/config/kernel-lookup-nameidata.m4 \ ++ 
$(top_srcdir)/config/kernel-mkdir-umode-t.m4 \ ++ $(top_srcdir)/config/kernel-mount-nodev.m4 \ ++ $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \ ++ $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \ ++ $(top_srcdir)/config/kernel-rq-is_sync.m4 \ ++ $(top_srcdir)/config/kernel-security-inode-init.m4 \ ++ $(top_srcdir)/config/kernel-set-nlink.m4 \ ++ $(top_srcdir)/config/kernel-sget-args.m4 \ ++ $(top_srcdir)/config/kernel-show-options.m4 \ ++ $(top_srcdir)/config/kernel-shrink.m4 \ ++ $(top_srcdir)/config/kernel-truncate-range.m4 \ ++ $(top_srcdir)/config/kernel-truncate-setsize.m4 \ ++ $(top_srcdir)/config/kernel-xattr-handler.m4 \ ++ $(top_srcdir)/config/kernel.m4 \ ++ $(top_srcdir)/config/user-arch.m4 \ ++ $(top_srcdir)/config/user-frame-larger-than.m4 \ ++ $(top_srcdir)/config/user-ioctl.m4 \ ++ $(top_srcdir)/config/user-libblkid.m4 \ ++ $(top_srcdir)/config/user-libuuid.m4 \ ++ $(top_srcdir)/config/user-nptl_guard_within_stack.m4 \ ++ $(top_srcdir)/config/user-selinux.m4 \ ++ $(top_srcdir)/config/user-udev.m4 \ ++ $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \ ++ $(top_srcdir)/config/zfs-build.m4 \ ++ $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/zfs_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_$(V)) ++am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_$(V)) ++am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ ++ html-recursive info-recursive install-data-recursive \ ++ install-dvi-recursive install-exec-recursive \ ++ install-html-recursive install-info-recursive \ ++ install-pdf-recursive install-ps-recursive install-recursive \ ++ installcheck-recursive installdirs-recursive pdf-recursive \ ++ ps-recursive uninstall-recursive ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++am__kernel_HEADERS_DIST = $(top_srcdir)/include/sys/arc.h \ ++ $(top_srcdir)/include/sys/avl.h \ ++ $(top_srcdir)/include/sys/avl_impl.h \ ++ $(top_srcdir)/include/sys/bplist.h \ ++ $(top_srcdir)/include/sys/bpobj.h \ ++ $(top_srcdir)/include/sys/dbuf.h \ ++ $(top_srcdir)/include/sys/ddt.h \ ++ $(top_srcdir)/include/sys/dmu.h \ ++ $(top_srcdir)/include/sys/dmu_impl.h \ ++ $(top_srcdir)/include/sys/dmu_objset.h \ ++ $(top_srcdir)/include/sys/dmu_traverse.h \ ++ $(top_srcdir)/include/sys/dmu_tx.h \ ++ $(top_srcdir)/include/sys/dmu_zfetch.h \ ++ $(top_srcdir)/include/sys/dnode.h \ ++ $(top_srcdir)/include/sys/dsl_dataset.h \ ++ $(top_srcdir)/include/sys/dsl_deadlist.h \ ++ $(top_srcdir)/include/sys/dsl_deleg.h \ ++ $(top_srcdir)/include/sys/dsl_dir.h \ ++ $(top_srcdir)/include/sys/dsl_pool.h \ ++ $(top_srcdir)/include/sys/dsl_prop.h \ ++ $(top_srcdir)/include/sys/dsl_scan.h \ ++ $(top_srcdir)/include/sys/dsl_synctask.h \ ++ $(top_srcdir)/include/sys/efi_partition.h \ ++ $(top_srcdir)/include/sys/metaslab.h \ ++ $(top_srcdir)/include/sys/metaslab_impl.h \ ++ $(top_srcdir)/include/sys/nvpair.h \ ++ $(top_srcdir)/include/sys/nvpair_impl.h \ ++ $(top_srcdir)/include/sys/refcount.h \ ++ $(top_srcdir)/include/sys/rrwlock.h \ ++ $(top_srcdir)/include/sys/sa.h \ ++ $(top_srcdir)/include/sys/sa_impl.h \ ++ 
$(top_srcdir)/include/sys/spa_boot.h \ ++ $(top_srcdir)/include/sys/space_map.h \ ++ $(top_srcdir)/include/sys/spa.h \ ++ $(top_srcdir)/include/sys/spa_impl.h \ ++ $(top_srcdir)/include/sys/txg.h \ ++ $(top_srcdir)/include/sys/txg_impl.h \ ++ $(top_srcdir)/include/sys/u8_textprep_data.h \ ++ $(top_srcdir)/include/sys/u8_textprep.h \ ++ $(top_srcdir)/include/sys/uberblock.h \ ++ $(top_srcdir)/include/sys/uberblock_impl.h \ ++ $(top_srcdir)/include/sys/uio_impl.h \ ++ $(top_srcdir)/include/sys/unique.h \ ++ $(top_srcdir)/include/sys/uuid.h \ ++ $(top_srcdir)/include/sys/vdev_disk.h \ ++ $(top_srcdir)/include/sys/vdev_file.h \ ++ $(top_srcdir)/include/sys/vdev.h \ ++ $(top_srcdir)/include/sys/vdev_impl.h \ ++ $(top_srcdir)/include/sys/xvattr.h \ ++ $(top_srcdir)/include/sys/zap.h \ ++ $(top_srcdir)/include/sys/zap_impl.h \ ++ $(top_srcdir)/include/sys/zap_leaf.h \ ++ $(top_srcdir)/include/sys/zfs_acl.h \ ++ $(top_srcdir)/include/sys/zfs_context.h \ ++ $(top_srcdir)/include/sys/zfs_ctldir.h \ ++ $(top_srcdir)/include/sys/zfs_debug.h \ ++ $(top_srcdir)/include/sys/zfs_dir.h \ ++ $(top_srcdir)/include/sys/zfs_fuid.h \ ++ $(top_srcdir)/include/sys/zfs_rlock.h \ ++ $(top_srcdir)/include/sys/zfs_sa.h \ ++ $(top_srcdir)/include/sys/zfs_stat.h \ ++ $(top_srcdir)/include/sys/zfs_vfsops.h \ ++ $(top_srcdir)/include/sys/zfs_znode.h \ ++ $(top_srcdir)/include/sys/zfs_vnops.h \ ++ $(top_srcdir)/include/sys/zil.h \ ++ $(top_srcdir)/include/sys/zil_impl.h \ ++ $(top_srcdir)/include/sys/zio_checksum.h \ ++ $(top_srcdir)/include/sys/zio_compress.h \ ++ $(top_srcdir)/include/sys/zio.h \ ++ $(top_srcdir)/include/sys/zio_impl.h \ ++ $(top_srcdir)/include/sys/zrlock.h \ ++ $(top_srcdir)/include/sys/zfs_ioctl.h \ ++ $(top_srcdir)/include/sys/zfs_onexit.h \ ++ ${top_srcdir}/include/sys/zpl.h \ ++ $(top_srcdir)/include/sys/zvol.h ++am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; ++am__vpath_adj = case $$p in \ ++ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ ++ *) f=$$p;; \ ++ esac; ++am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; ++am__install_max = 40 ++am__nobase_strip_setup = \ ++ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` ++am__nobase_strip = \ ++ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" ++am__nobase_list = $(am__nobase_strip_setup); \ ++ for p in $$list; do echo "$$p $$p"; done | \ ++ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ ++ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ ++ if (++n[$$2] == $(am__install_max)) \ ++ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ ++ END { for (dir in files) print dir, files[dir] }' ++am__base_list = \ ++ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ ++ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' ++am__uninstall_files_from_dir = { \ ++ test -z "$$files" \ ++ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! 
-r "$$dir"; } \ ++ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ ++ $(am__cd) "$$dir" && rm -f $$files; }; \ ++ } ++am__installdirs = "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)" ++am__libzfs_HEADERS_DIST = $(top_srcdir)/include/sys/arc.h \ ++ $(top_srcdir)/include/sys/avl.h \ ++ $(top_srcdir)/include/sys/avl_impl.h \ ++ $(top_srcdir)/include/sys/bplist.h \ ++ $(top_srcdir)/include/sys/bpobj.h \ ++ $(top_srcdir)/include/sys/dbuf.h \ ++ $(top_srcdir)/include/sys/ddt.h \ ++ $(top_srcdir)/include/sys/dmu.h \ ++ $(top_srcdir)/include/sys/dmu_impl.h \ ++ $(top_srcdir)/include/sys/dmu_objset.h \ ++ $(top_srcdir)/include/sys/dmu_traverse.h \ ++ $(top_srcdir)/include/sys/dmu_tx.h \ ++ $(top_srcdir)/include/sys/dmu_zfetch.h \ ++ $(top_srcdir)/include/sys/dnode.h \ ++ $(top_srcdir)/include/sys/dsl_dataset.h \ ++ $(top_srcdir)/include/sys/dsl_deadlist.h \ ++ $(top_srcdir)/include/sys/dsl_deleg.h \ ++ $(top_srcdir)/include/sys/dsl_dir.h \ ++ $(top_srcdir)/include/sys/dsl_pool.h \ ++ $(top_srcdir)/include/sys/dsl_prop.h \ ++ $(top_srcdir)/include/sys/dsl_scan.h \ ++ $(top_srcdir)/include/sys/dsl_synctask.h \ ++ $(top_srcdir)/include/sys/efi_partition.h \ ++ $(top_srcdir)/include/sys/metaslab.h \ ++ $(top_srcdir)/include/sys/metaslab_impl.h \ ++ $(top_srcdir)/include/sys/nvpair.h \ ++ $(top_srcdir)/include/sys/nvpair_impl.h \ ++ $(top_srcdir)/include/sys/refcount.h \ ++ $(top_srcdir)/include/sys/rrwlock.h \ ++ $(top_srcdir)/include/sys/sa.h \ ++ $(top_srcdir)/include/sys/sa_impl.h \ ++ $(top_srcdir)/include/sys/spa_boot.h \ ++ $(top_srcdir)/include/sys/space_map.h \ ++ $(top_srcdir)/include/sys/spa.h \ ++ $(top_srcdir)/include/sys/spa_impl.h \ ++ $(top_srcdir)/include/sys/txg.h \ ++ $(top_srcdir)/include/sys/txg_impl.h \ ++ $(top_srcdir)/include/sys/u8_textprep_data.h \ ++ $(top_srcdir)/include/sys/u8_textprep.h \ ++ $(top_srcdir)/include/sys/uberblock.h \ ++ $(top_srcdir)/include/sys/uberblock_impl.h \ ++ $(top_srcdir)/include/sys/uio_impl.h \ ++ $(top_srcdir)/include/sys/unique.h \ ++ $(top_srcdir)/include/sys/uuid.h \ ++ $(top_srcdir)/include/sys/vdev_disk.h \ ++ $(top_srcdir)/include/sys/vdev_file.h \ ++ $(top_srcdir)/include/sys/vdev.h \ ++ $(top_srcdir)/include/sys/vdev_impl.h \ ++ $(top_srcdir)/include/sys/xvattr.h \ ++ $(top_srcdir)/include/sys/zap.h \ ++ $(top_srcdir)/include/sys/zap_impl.h \ ++ $(top_srcdir)/include/sys/zap_leaf.h \ ++ $(top_srcdir)/include/sys/zfs_acl.h \ ++ $(top_srcdir)/include/sys/zfs_context.h \ ++ $(top_srcdir)/include/sys/zfs_ctldir.h \ ++ $(top_srcdir)/include/sys/zfs_debug.h \ ++ $(top_srcdir)/include/sys/zfs_dir.h \ ++ $(top_srcdir)/include/sys/zfs_fuid.h \ ++ $(top_srcdir)/include/sys/zfs_rlock.h \ ++ $(top_srcdir)/include/sys/zfs_sa.h \ ++ $(top_srcdir)/include/sys/zfs_stat.h \ ++ $(top_srcdir)/include/sys/zfs_vfsops.h \ ++ $(top_srcdir)/include/sys/zfs_znode.h \ ++ $(top_srcdir)/include/sys/zfs_vnops.h \ ++ $(top_srcdir)/include/sys/zil.h \ ++ $(top_srcdir)/include/sys/zil_impl.h \ ++ $(top_srcdir)/include/sys/zio_checksum.h \ ++ $(top_srcdir)/include/sys/zio_compress.h \ ++ $(top_srcdir)/include/sys/zio.h \ ++ $(top_srcdir)/include/sys/zio_impl.h \ ++ $(top_srcdir)/include/sys/zrlock.h ++HEADERS = $(kernel_HEADERS) $(libzfs_HEADERS) ++RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ ++ distclean-recursive maintainer-clean-recursive ++AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \ ++ $(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \ ++ distdir ++ETAGS = etags ++CTAGS = ctags ++DIST_SUBDIRS = $(SUBDIRS) ++DISTFILES 
= $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++am__relativize = \ ++ dir0=`pwd`; \ ++ sed_first='s,^\([^/]*\)/.*$$,\1,'; \ ++ sed_rest='s,^[^/]*/*,,'; \ ++ sed_last='s,^.*/\([^/]*\)$$,\1,'; \ ++ sed_butlast='s,/*[^/]*$$,,'; \ ++ while test -n "$$dir1"; do \ ++ first=`echo "$$dir1" | sed -e "$$sed_first"`; \ ++ if test "$$first" != "."; then \ ++ if test "$$first" = ".."; then \ ++ dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ ++ dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ ++ else \ ++ first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ ++ if test "$$first2" = "$$first"; then \ ++ dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ ++ else \ ++ dir2="../$$dir2"; \ ++ fi; \ ++ dir0="$$dir0"/"$$first"; \ ++ fi; \ ++ fi; \ ++ dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ ++ done; \ ++ reldir="$$dir2" ++ACLOCAL = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run aclocal-1.11 ++ALIEN = alien ++ALIEN_VERSION = ++AMTAR = $${TAR-tar} ++AM_DEFAULT_VERBOSITY = 1 ++AR = ar ++AUTOCONF = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run autoconf ++AUTOHEADER = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run autoheader ++AUTOMAKE = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run automake-1.11 ++AWK = gawk ++CC = gcc ++CCAS = gcc ++CCASDEPMODE = depmode=gcc3 ++CCASFLAGS = -g -O2 ++CCDEPMODE = depmode=gcc3 ++CFLAGS = -g -O2 ++CPP = gcc -E ++CPPFLAGS = ++CYGPATH_W = echo ++DEBUG_CFLAGS = -DNDEBUG ++DEBUG_DMU_TX = _without_debug_dmu_tx ++DEBUG_STACKFLAGS = ++DEBUG_ZFS = _without_debug ++DEFAULT_INIT_DIR = ${prefix}/etc/init.d ++DEFAULT_INIT_SCRIPT = gentoo ++DEFAULT_PACKAGE = tgz ++DEFS = -DHAVE_CONFIG_H ++DEPDIR = .deps ++DLLTOOL = false ++DPKG = dpkg ++DPKGBUILD = dpkg-buildpackage ++DPKGBUILD_VERSION = ++DPKG_VERSION = ++DSYMUTIL = ++DUMPBIN = ++ECHO_C = ++ECHO_N = -n ++ECHO_T = ++EGREP = /bin/grep -E ++EXEEXT = ++FGREP = /bin/grep -F ++FRAME_LARGER_THAN = -Wframe-larger-than=1024 ++GREP = /bin/grep ++HAVE_ALIEN = no ++HAVE_DPKG = no ++HAVE_DPKGBUILD = no ++HAVE_MAKEPKG = ++HAVE_PACMAN = ++HAVE_RPM = yes ++HAVE_RPMBUILD = yes ++INSTALL = /usr/bin/install -c ++INSTALL_DATA = ${INSTALL} -m 644 ++INSTALL_PROGRAM = ${INSTALL} ++INSTALL_SCRIPT = ${INSTALL} ++INSTALL_STRIP_PROGRAM = $(install_sh) -c -s ++KERNELCPPFLAGS = -Wno-unused-but-set-variable -DHAVE_SPL -D_KERNEL -DTEXT_DOMAIN=\"zfs-linux-kernel\" -DNDEBUG ++KERNELMAKE_PARAMS = O=/usr/src/linux-3.6.0-sabayon ++LD = /usr/x86_64-pc-linux-gnu/bin/ld -m elf_x86_64 ++LDFLAGS = ++LIBBLKID = ++LIBOBJS = ++LIBS = -luuid -luuid -lz -lz -lz ++LIBSELINUX = ++LIBTOOL = $(SHELL) $(top_builddir)/libtool ++LIBUUID = -luuid ++LINUX = /usr/src/linux-3.2.33-go ++LINUX_OBJ = /usr/src/linux-3.6.0-sabayon ++LINUX_SYMBOLS = NONE ++LINUX_VERSION = 3.6.0-sabayon ++LIPO = ++LN_S = ln -s ++LTLIBOBJS = ++MAINT = # ++MAKEINFO = ${SHELL} /root/zfs-0.6.0-rc12/config/missing --run makeinfo ++MAKEPKG = ++MAKEPKG_VERSION = ++MANIFEST_TOOL = : ++MKDIR_P = /bin/mkdir -p ++NM = /usr/bin/nm -B ++NMEDIT = ++NO_UNUSED_BUT_SET_VARIABLE = -Wno-unused-but-set-variable ++OBJDUMP = objdump ++OBJEXT = o ++OTOOL = ++OTOOL64 = ++PACKAGE = zfs ++PACKAGE_BUGREPORT = ++PACKAGE_NAME = ++PACKAGE_STRING = ++PACKAGE_TARNAME = ++PACKAGE_URL = ++PACKAGE_VERSION = ++PACMAN = ++PACMAN_VERSION = ++PATH_SEPARATOR = : ++RANLIB = ranlib ++RPM = rpm ++RPMBUILD = rpmbuild ++RPMBUILD_VERSION = 4.10.0 ++RPM_VERSION = 4.10.0 ++SED = /bin/sed ++SET_MAKE = ++SHELL = /bin/sh ++SPL = /usr/src/linux-3.2.33-go ++SPL_OBJ = /usr/src/linux-3.2.33-go ++SPL_SYMBOLS = NONE ++SPL_VERSION = 0.6.0-rc12 
++STRIP = strip ++TARGET_ASM_DIR = asm-x86_64 ++VENDOR = gentoo ++VERSION = 0.6.0 ++ZFS_CONFIG = all ++ZFS_META_ALIAS = zfs-0.6.0-rc12 ++ZFS_META_AUTHOR = Sun Microsystems/Oracle, Lawrence Livermore National Laboratory ++ZFS_META_DATA = ++ZFS_META_LICENSE = CDDL ++ZFS_META_LT_AGE = ++ZFS_META_LT_CURRENT = ++ZFS_META_LT_REVISION = ++ZFS_META_NAME = zfs ++ZFS_META_RELEASE = rc12 ++ZFS_META_VERSION = 0.6.0 ++ZLIB = -lz ++abs_builddir = /root/zfs-0.6.0-rc12/include/sys ++abs_srcdir = /root/zfs-0.6.0-rc12/include/sys ++abs_top_builddir = /root/zfs-0.6.0-rc12 ++abs_top_srcdir = /root/zfs-0.6.0-rc12 ++ac_ct_AR = ar ++ac_ct_CC = gcc ++ac_ct_DUMPBIN = ++am__include = include ++am__leading_dot = . ++am__quote = ++am__tar = $${TAR-tar} chof - "$$tardir" ++am__untar = $${TAR-tar} xf - ++bindir = ${exec_prefix}/bin ++build = x86_64-unknown-linux-gnu ++build_alias = ++build_cpu = x86_64 ++build_os = linux-gnu ++build_vendor = unknown ++builddir = . ++datadir = ${datarootdir} ++datarootdir = ${prefix}/share ++docdir = ${datarootdir}/doc/${PACKAGE} ++dvidir = ${docdir} ++exec_prefix = ${prefix} ++host = x86_64-unknown-linux-gnu ++host_alias = ++host_cpu = x86_64 ++host_os = linux-gnu ++host_vendor = unknown ++htmldir = ${docdir} ++includedir = ${prefix}/include ++infodir = ${datarootdir}/info ++install_sh = ${SHELL} /root/zfs-0.6.0-rc12/config/install-sh ++libdir = ${exec_prefix}/lib ++libexecdir = ${exec_prefix}/libexec ++localedir = ${datarootdir}/locale ++localstatedir = ${prefix}/var ++mandir = ${datarootdir}/man ++mkdir_p = /bin/mkdir -p ++oldincludedir = /usr/include ++pdfdir = ${docdir} ++prefix = /usr/local ++program_transform_name = s,x,x, ++psdir = ${docdir} ++sbindir = ${exec_prefix}/sbin ++sharedstatedir = ${prefix}/com ++srcdir = . ++sysconfdir = ${prefix}/etc ++target = x86_64-unknown-linux-gnu ++target_alias = ++target_cpu = x86_64 ++target_os = linux-gnu ++target_vendor = unknown ++top_build_prefix = ../../ ++top_builddir = ../.. ++top_srcdir = ../.. 
++udevdir = ${exec_prefix}/lib/udev ++udevruledir = ${udevdir}/rules.d ++SUBDIRS = fm fs ++COMMON_H = \ ++ $(top_srcdir)/include/sys/arc.h \ ++ $(top_srcdir)/include/sys/avl.h \ ++ $(top_srcdir)/include/sys/avl_impl.h \ ++ $(top_srcdir)/include/sys/bplist.h \ ++ $(top_srcdir)/include/sys/bpobj.h \ ++ $(top_srcdir)/include/sys/dbuf.h \ ++ $(top_srcdir)/include/sys/ddt.h \ ++ $(top_srcdir)/include/sys/dmu.h \ ++ $(top_srcdir)/include/sys/dmu_impl.h \ ++ $(top_srcdir)/include/sys/dmu_objset.h \ ++ $(top_srcdir)/include/sys/dmu_traverse.h \ ++ $(top_srcdir)/include/sys/dmu_tx.h \ ++ $(top_srcdir)/include/sys/dmu_zfetch.h \ ++ $(top_srcdir)/include/sys/dnode.h \ ++ $(top_srcdir)/include/sys/dsl_dataset.h \ ++ $(top_srcdir)/include/sys/dsl_deadlist.h \ ++ $(top_srcdir)/include/sys/dsl_deleg.h \ ++ $(top_srcdir)/include/sys/dsl_dir.h \ ++ $(top_srcdir)/include/sys/dsl_pool.h \ ++ $(top_srcdir)/include/sys/dsl_prop.h \ ++ $(top_srcdir)/include/sys/dsl_scan.h \ ++ $(top_srcdir)/include/sys/dsl_synctask.h \ ++ $(top_srcdir)/include/sys/efi_partition.h \ ++ $(top_srcdir)/include/sys/metaslab.h \ ++ $(top_srcdir)/include/sys/metaslab_impl.h \ ++ $(top_srcdir)/include/sys/nvpair.h \ ++ $(top_srcdir)/include/sys/nvpair_impl.h \ ++ $(top_srcdir)/include/sys/refcount.h \ ++ $(top_srcdir)/include/sys/rrwlock.h \ ++ $(top_srcdir)/include/sys/sa.h \ ++ $(top_srcdir)/include/sys/sa_impl.h \ ++ $(top_srcdir)/include/sys/spa_boot.h \ ++ $(top_srcdir)/include/sys/space_map.h \ ++ $(top_srcdir)/include/sys/spa.h \ ++ $(top_srcdir)/include/sys/spa_impl.h \ ++ $(top_srcdir)/include/sys/txg.h \ ++ $(top_srcdir)/include/sys/txg_impl.h \ ++ $(top_srcdir)/include/sys/u8_textprep_data.h \ ++ $(top_srcdir)/include/sys/u8_textprep.h \ ++ $(top_srcdir)/include/sys/uberblock.h \ ++ $(top_srcdir)/include/sys/uberblock_impl.h \ ++ $(top_srcdir)/include/sys/uio_impl.h \ ++ $(top_srcdir)/include/sys/unique.h \ ++ $(top_srcdir)/include/sys/uuid.h \ ++ $(top_srcdir)/include/sys/vdev_disk.h \ ++ $(top_srcdir)/include/sys/vdev_file.h \ ++ $(top_srcdir)/include/sys/vdev.h \ ++ $(top_srcdir)/include/sys/vdev_impl.h \ ++ $(top_srcdir)/include/sys/xvattr.h \ ++ $(top_srcdir)/include/sys/zap.h \ ++ $(top_srcdir)/include/sys/zap_impl.h \ ++ $(top_srcdir)/include/sys/zap_leaf.h \ ++ $(top_srcdir)/include/sys/zfs_acl.h \ ++ $(top_srcdir)/include/sys/zfs_context.h \ ++ $(top_srcdir)/include/sys/zfs_ctldir.h \ ++ $(top_srcdir)/include/sys/zfs_debug.h \ ++ $(top_srcdir)/include/sys/zfs_dir.h \ ++ $(top_srcdir)/include/sys/zfs_fuid.h \ ++ $(top_srcdir)/include/sys/zfs_rlock.h \ ++ $(top_srcdir)/include/sys/zfs_sa.h \ ++ $(top_srcdir)/include/sys/zfs_stat.h \ ++ $(top_srcdir)/include/sys/zfs_vfsops.h \ ++ $(top_srcdir)/include/sys/zfs_znode.h \ ++ $(top_srcdir)/include/sys/zfs_vnops.h \ ++ $(top_srcdir)/include/sys/zil.h \ ++ $(top_srcdir)/include/sys/zil_impl.h \ ++ $(top_srcdir)/include/sys/zio_checksum.h \ ++ $(top_srcdir)/include/sys/zio_compress.h \ ++ $(top_srcdir)/include/sys/zio.h \ ++ $(top_srcdir)/include/sys/zio_impl.h \ ++ $(top_srcdir)/include/sys/zrlock.h ++ ++KERNEL_H = \ ++ $(top_srcdir)/include/sys/zfs_ioctl.h \ ++ $(top_srcdir)/include/sys/zfs_onexit.h \ ++ ${top_srcdir}/include/sys/zpl.h \ ++ $(top_srcdir)/include/sys/zvol.h ++ ++USER_H = ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++libzfsdir = $(includedir)/libzfs/sys ++libzfs_HEADERS = $(COMMON_H) $(USER_H) ++#kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/sys ++#kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++all: all-recursive ++ 
++.SUFFIXES: ++$(srcdir)/Makefile.in: # $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/sys/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/sys/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: # $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): # $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++install-kernelHEADERS: $(kernel_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(kerneldir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(kerneldir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(kerneldir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(kerneldir)" || exit $$?; \ ++ done ++ ++uninstall-kernelHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(kerneldir)'; $(am__uninstall_files_from_dir) ++install-libzfsHEADERS: $(libzfs_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(libzfsdir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(libzfsdir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libzfsdir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(libzfsdir)" || exit $$?; \ ++ done ++ ++uninstall-libzfsHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(libzfsdir)'; $(am__uninstall_files_from_dir) ++ ++# This directory's subdirectories are mostly independent; you can cd ++# into them and run `make' without going through this Makefile. ++# To change the values of `make' variables: instead of editing Makefiles, ++# (1) if the variable is set in `config.status', edit `config.status' ++# (which will cause the Makefiles to be regenerated when you run `make'); ++# (2) otherwise, pass the desired values on the `make' command line. 
++$(RECURSIVE_TARGETS): ++ @fail= failcom='exit 1'; \ ++ for f in x $$MAKEFLAGS; do \ ++ case $$f in \ ++ *=* | --[!k]*);; \ ++ *k*) failcom='fail=yes';; \ ++ esac; \ ++ done; \ ++ dot_seen=no; \ ++ target=`echo $@ | sed s/-recursive//`; \ ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ echo "Making $$target in $$subdir"; \ ++ if test "$$subdir" = "."; then \ ++ dot_seen=yes; \ ++ local_target="$$target-am"; \ ++ else \ ++ local_target="$$target"; \ ++ fi; \ ++ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ ++ || eval $$failcom; \ ++ done; \ ++ if test "$$dot_seen" = "no"; then \ ++ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ ++ fi; test -z "$$fail" ++ ++$(RECURSIVE_CLEAN_TARGETS): ++ @fail= failcom='exit 1'; \ ++ for f in x $$MAKEFLAGS; do \ ++ case $$f in \ ++ *=* | --[!k]*);; \ ++ *k*) failcom='fail=yes';; \ ++ esac; \ ++ done; \ ++ dot_seen=no; \ ++ case "$@" in \ ++ distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ ++ *) list='$(SUBDIRS)' ;; \ ++ esac; \ ++ rev=''; for subdir in $$list; do \ ++ if test "$$subdir" = "."; then :; else \ ++ rev="$$subdir $$rev"; \ ++ fi; \ ++ done; \ ++ rev="$$rev ."; \ ++ target=`echo $@ | sed s/-recursive//`; \ ++ for subdir in $$rev; do \ ++ echo "Making $$target in $$subdir"; \ ++ if test "$$subdir" = "."; then \ ++ local_target="$$target-am"; \ ++ else \ ++ local_target="$$target"; \ ++ fi; \ ++ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ ++ || eval $$failcom; \ ++ done && test -z "$$fail" ++tags-recursive: ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ ++ done ++ctags-recursive: ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ ++ done ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ ++ include_option=--etags-include; \ ++ empty_fix=.; \ ++ else \ ++ include_option=--include; \ ++ empty_fix=; \ ++ fi; \ ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ if test "$$subdir" = .; then :; else \ ++ test ! 
-f $$subdir/TAGS || \ ++ set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ ++ fi; \ ++ done; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: CTAGS ++CTAGS: ctags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++ @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ ++ if test "$$subdir" = .; then :; else \ ++ $(am__make_dryrun) \ ++ || test -d "$(distdir)/$$subdir" \ ++ || $(MKDIR_P) "$(distdir)/$$subdir" \ ++ || exit 1; \ ++ dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ ++ $(am__relativize); \ ++ new_distdir=$$reldir; \ ++ dir1=$$subdir; dir2="$(top_distdir)"; \ ++ $(am__relativize); \ ++ new_top_distdir=$$reldir; \ ++ echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ ++ echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ ++ ($(am__cd) $$subdir && \ ++ $(MAKE) $(AM_MAKEFLAGS) \ ++ top_distdir="$$new_top_distdir" \ ++ distdir="$$new_distdir" \ ++ am__remove_distdir=: \ ++ am__skip_length_check=: \ ++ am__skip_mode_fix=: \ ++ distdir) \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-recursive ++all-am: Makefile $(HEADERS) ++installdirs: installdirs-recursive ++installdirs-am: ++ for dir in "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)"; do \ ++ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ ++ done ++install: install-recursive ++install-exec: install-exec-recursive ++install-data: install-data-recursive ++uninstall: uninstall-recursive ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-recursive ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." 
++clean: clean-recursive ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-recursive ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-recursive ++ ++dvi-am: ++ ++html: html-recursive ++ ++html-am: ++ ++info: info-recursive ++ ++info-am: ++ ++install-data-am: install-kernelHEADERS install-libzfsHEADERS ++ ++install-dvi: install-dvi-recursive ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-recursive ++ ++install-html-am: ++ ++install-info: install-info-recursive ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-recursive ++ ++install-pdf-am: ++ ++install-ps: install-ps-recursive ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-recursive ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-recursive ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-recursive ++ ++pdf-am: ++ ++ps: ps-recursive ++ ++ps-am: ++ ++uninstall-am: uninstall-kernelHEADERS uninstall-libzfsHEADERS ++ ++.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \ ++ install-am install-strip tags-recursive ++ ++.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \ ++ all all-am check check-am clean clean-generic clean-libtool \ ++ ctags ctags-recursive distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-dvi install-dvi-am install-exec \ ++ install-exec-am install-html install-html-am install-info \ ++ install-info-am install-kernelHEADERS install-libzfsHEADERS \ ++ install-man install-pdf install-pdf-am install-ps \ ++ install-ps-am install-strip installcheck installcheck-am \ ++ installdirs installdirs-am maintainer-clean \ ++ maintainer-clean-generic mostlyclean mostlyclean-generic \ ++ mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \ ++ uninstall uninstall-am uninstall-kernelHEADERS \ ++ uninstall-libzfsHEADERS ++ ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. 
++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/Makefile.am linux-3.2.33-go/include/zfs/sys/Makefile.am +--- linux-3.2.33-go.orig/include/zfs/sys/Makefile.am 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/Makefile.am 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,94 @@ ++SUBDIRS = fm fs ++ ++COMMON_H = \ ++ $(top_srcdir)/include/sys/arc.h \ ++ $(top_srcdir)/include/sys/avl.h \ ++ $(top_srcdir)/include/sys/avl_impl.h \ ++ $(top_srcdir)/include/sys/bplist.h \ ++ $(top_srcdir)/include/sys/bpobj.h \ ++ $(top_srcdir)/include/sys/dbuf.h \ ++ $(top_srcdir)/include/sys/ddt.h \ ++ $(top_srcdir)/include/sys/dmu.h \ ++ $(top_srcdir)/include/sys/dmu_impl.h \ ++ $(top_srcdir)/include/sys/dmu_objset.h \ ++ $(top_srcdir)/include/sys/dmu_traverse.h \ ++ $(top_srcdir)/include/sys/dmu_tx.h \ ++ $(top_srcdir)/include/sys/dmu_zfetch.h \ ++ $(top_srcdir)/include/sys/dnode.h \ ++ $(top_srcdir)/include/sys/dsl_dataset.h \ ++ $(top_srcdir)/include/sys/dsl_deadlist.h \ ++ $(top_srcdir)/include/sys/dsl_deleg.h \ ++ $(top_srcdir)/include/sys/dsl_dir.h \ ++ $(top_srcdir)/include/sys/dsl_pool.h \ ++ $(top_srcdir)/include/sys/dsl_prop.h \ ++ $(top_srcdir)/include/sys/dsl_scan.h \ ++ $(top_srcdir)/include/sys/dsl_synctask.h \ ++ $(top_srcdir)/include/sys/efi_partition.h \ ++ $(top_srcdir)/include/sys/metaslab.h \ ++ $(top_srcdir)/include/sys/metaslab_impl.h \ ++ $(top_srcdir)/include/sys/nvpair.h \ ++ $(top_srcdir)/include/sys/nvpair_impl.h \ ++ $(top_srcdir)/include/sys/refcount.h \ ++ $(top_srcdir)/include/sys/rrwlock.h \ ++ $(top_srcdir)/include/sys/sa.h \ ++ $(top_srcdir)/include/sys/sa_impl.h \ ++ $(top_srcdir)/include/sys/spa_boot.h \ ++ $(top_srcdir)/include/sys/space_map.h \ ++ $(top_srcdir)/include/sys/spa.h \ ++ $(top_srcdir)/include/sys/spa_impl.h \ ++ $(top_srcdir)/include/sys/txg.h \ ++ $(top_srcdir)/include/sys/txg_impl.h \ ++ $(top_srcdir)/include/sys/u8_textprep_data.h \ ++ $(top_srcdir)/include/sys/u8_textprep.h \ ++ $(top_srcdir)/include/sys/uberblock.h \ ++ $(top_srcdir)/include/sys/uberblock_impl.h \ ++ $(top_srcdir)/include/sys/uio_impl.h \ ++ $(top_srcdir)/include/sys/unique.h \ ++ $(top_srcdir)/include/sys/uuid.h \ ++ $(top_srcdir)/include/sys/vdev_disk.h \ ++ $(top_srcdir)/include/sys/vdev_file.h \ ++ $(top_srcdir)/include/sys/vdev.h \ ++ $(top_srcdir)/include/sys/vdev_impl.h \ ++ $(top_srcdir)/include/sys/xvattr.h \ ++ $(top_srcdir)/include/sys/zap.h \ ++ $(top_srcdir)/include/sys/zap_impl.h \ ++ $(top_srcdir)/include/sys/zap_leaf.h \ ++ $(top_srcdir)/include/sys/zfs_acl.h \ ++ $(top_srcdir)/include/sys/zfs_context.h \ ++ $(top_srcdir)/include/sys/zfs_ctldir.h \ ++ $(top_srcdir)/include/sys/zfs_debug.h \ ++ $(top_srcdir)/include/sys/zfs_dir.h \ ++ $(top_srcdir)/include/sys/zfs_fuid.h \ ++ $(top_srcdir)/include/sys/zfs_rlock.h \ ++ $(top_srcdir)/include/sys/zfs_sa.h \ ++ $(top_srcdir)/include/sys/zfs_stat.h \ ++ $(top_srcdir)/include/sys/zfs_vfsops.h \ ++ $(top_srcdir)/include/sys/zfs_znode.h \ ++ $(top_srcdir)/include/sys/zfs_vnops.h \ ++ $(top_srcdir)/include/sys/zil.h \ ++ $(top_srcdir)/include/sys/zil_impl.h \ ++ $(top_srcdir)/include/sys/zio_checksum.h \ ++ $(top_srcdir)/include/sys/zio_compress.h \ ++ $(top_srcdir)/include/sys/zio.h \ ++ $(top_srcdir)/include/sys/zio_impl.h \ ++ $(top_srcdir)/include/sys/zrlock.h ++ ++KERNEL_H = \ ++ $(top_srcdir)/include/sys/zfs_ioctl.h \ ++ $(top_srcdir)/include/sys/zfs_onexit.h \ ++ ${top_srcdir}/include/sys/zpl.h \ ++ $(top_srcdir)/include/sys/zvol.h ++ ++USER_H = ++ ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) 
$(USER_H) ++ ++if CONFIG_USER ++libzfsdir = $(includedir)/libzfs/sys ++libzfs_HEADERS = $(COMMON_H) $(USER_H) ++endif ++ ++if CONFIG_KERNEL ++kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/sys ++kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++endif +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/Makefile.in linux-3.2.33-go/include/zfs/sys/Makefile.in +--- linux-3.2.33-go.orig/include/zfs/sys/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/Makefile.in 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,1029 @@ ++# Makefile.in generated by automake 1.11.6 from Makefile.am. ++# @configure_input@ ++ ++# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, ++# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software ++# Foundation, Inc. ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. ++ ++@SET_MAKE@ ++ ++VPATH = @srcdir@ ++am__make_dryrun = \ ++ { \ ++ am__dry=no; \ ++ case $$MAKEFLAGS in \ ++ *\\[\ \ ]*) \ ++ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ ++ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ ++ *) \ ++ for am__flg in $$MAKEFLAGS; do \ ++ case $$am__flg in \ ++ *=*|--*) ;; \ ++ *n*) am__dry=yes; break;; \ ++ esac; \ ++ done;; \ ++ esac; \ ++ test $$am__dry = yes; \ ++ } ++pkgdatadir = $(datadir)/@PACKAGE@ ++pkgincludedir = $(includedir)/@PACKAGE@ ++pkglibdir = $(libdir)/@PACKAGE@ ++pkglibexecdir = $(libexecdir)/@PACKAGE@ ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) ++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = @build@ ++host_triplet = @host@ ++target_triplet = @target@ ++subdir = include/sys ++DIST_COMMON = $(am__kernel_HEADERS_DIST) $(am__libzfs_HEADERS_DIST) \ ++ $(srcdir)/Makefile.am $(srcdir)/Makefile.in ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = \ ++ $(top_srcdir)/config/always-no-unused-but-set-variable.m4 \ ++ $(top_srcdir)/config/kernel-automount.m4 \ ++ $(top_srcdir)/config/kernel-bdev-block-device-operations.m4 \ ++ $(top_srcdir)/config/kernel-bdev-logical-size.m4 \ ++ $(top_srcdir)/config/kernel-bdi-setup-and-register.m4 \ ++ $(top_srcdir)/config/kernel-bdi.m4 \ ++ $(top_srcdir)/config/kernel-bio-empty-barrier.m4 \ ++ $(top_srcdir)/config/kernel-bio-end-io-t-args.m4 \ ++ $(top_srcdir)/config/kernel-bio-failfast.m4 \ ++ $(top_srcdir)/config/kernel-bio-rw-syncio.m4 \ ++ $(top_srcdir)/config/kernel-blk-end-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-fetch-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-discard.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-flush.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-io-opt.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-hw-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-max-segments.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-nonrot.m4 \ ++ $(top_srcdir)/config/kernel-blk-queue-physical-block-size.m4 \ ++ 
$(top_srcdir)/config/kernel-blk-requeue-request.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-bytes.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-pos.m4 \ ++ $(top_srcdir)/config/kernel-blk-rq-sectors.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get-by-path.m4 \ ++ $(top_srcdir)/config/kernel-blkdev-get.m4 \ ++ $(top_srcdir)/config/kernel-check-disk-size-change.m4 \ ++ $(top_srcdir)/config/kernel-clear-inode.m4 \ ++ $(top_srcdir)/config/kernel-commit-metadata.m4 \ ++ $(top_srcdir)/config/kernel-create-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-d-make-root.m4 \ ++ $(top_srcdir)/config/kernel-d-obtain-alias.m4 \ ++ $(top_srcdir)/config/kernel-discard-granularity.m4 \ ++ $(top_srcdir)/config/kernel-elevator-change.m4 \ ++ $(top_srcdir)/config/kernel-encode-fh-inode.m4 \ ++ $(top_srcdir)/config/kernel-evict-inode.m4 \ ++ $(top_srcdir)/config/kernel-fallocate.m4 \ ++ $(top_srcdir)/config/kernel-fmode-t.m4 \ ++ $(top_srcdir)/config/kernel-fsync.m4 \ ++ $(top_srcdir)/config/kernel-get-disk-ro.m4 \ ++ $(top_srcdir)/config/kernel-get-gendisk.m4 \ ++ $(top_srcdir)/config/kernel-insert-inode-locked.m4 \ ++ $(top_srcdir)/config/kernel-invalidate-bdev-args.m4 \ ++ $(top_srcdir)/config/kernel-kobj-name-len.m4 \ ++ $(top_srcdir)/config/kernel-lookup-nameidata.m4 \ ++ $(top_srcdir)/config/kernel-mkdir-umode-t.m4 \ ++ $(top_srcdir)/config/kernel-mount-nodev.m4 \ ++ $(top_srcdir)/config/kernel-open-bdev-exclusive.m4 \ ++ $(top_srcdir)/config/kernel-rq-for-each_segment.m4 \ ++ $(top_srcdir)/config/kernel-rq-is_sync.m4 \ ++ $(top_srcdir)/config/kernel-security-inode-init.m4 \ ++ $(top_srcdir)/config/kernel-set-nlink.m4 \ ++ $(top_srcdir)/config/kernel-sget-args.m4 \ ++ $(top_srcdir)/config/kernel-show-options.m4 \ ++ $(top_srcdir)/config/kernel-shrink.m4 \ ++ $(top_srcdir)/config/kernel-truncate-range.m4 \ ++ $(top_srcdir)/config/kernel-truncate-setsize.m4 \ ++ $(top_srcdir)/config/kernel-xattr-handler.m4 \ ++ $(top_srcdir)/config/kernel.m4 \ ++ $(top_srcdir)/config/user-arch.m4 \ ++ $(top_srcdir)/config/user-frame-larger-than.m4 \ ++ $(top_srcdir)/config/user-ioctl.m4 \ ++ $(top_srcdir)/config/user-libblkid.m4 \ ++ $(top_srcdir)/config/user-libuuid.m4 \ ++ $(top_srcdir)/config/user-nptl_guard_within_stack.m4 \ ++ $(top_srcdir)/config/user-selinux.m4 \ ++ $(top_srcdir)/config/user-udev.m4 \ ++ $(top_srcdir)/config/user-zlib.m4 $(top_srcdir)/config/user.m4 \ ++ $(top_srcdir)/config/zfs-build.m4 \ ++ $(top_srcdir)/config/zfs-meta.m4 $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++mkinstalldirs = $(install_sh) -d ++CONFIG_HEADER = $(top_builddir)/zfs_config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++AM_V_GEN = $(am__v_GEN_@AM_V@) ++am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) ++am__v_GEN_0 = @echo " GEN " $@; ++AM_V_at = $(am__v_at_@AM_V@) ++am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) ++am__v_at_0 = @ ++SOURCES = ++DIST_SOURCES = ++RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ ++ html-recursive info-recursive install-data-recursive \ ++ install-dvi-recursive install-exec-recursive \ ++ install-html-recursive install-info-recursive \ ++ install-pdf-recursive install-ps-recursive install-recursive \ ++ installcheck-recursive installdirs-recursive pdf-recursive \ ++ ps-recursive uninstall-recursive ++am__can_run_installinfo = \ ++ case $$AM_UPDATE_INFO_DIR in \ ++ n|no|NO) false;; \ ++ *) (install-info --version) >/dev/null 2>&1;; \ ++ esac ++am__kernel_HEADERS_DIST = $(top_srcdir)/include/sys/arc.h \ ++ 
$(top_srcdir)/include/sys/avl.h \ ++ $(top_srcdir)/include/sys/avl_impl.h \ ++ $(top_srcdir)/include/sys/bplist.h \ ++ $(top_srcdir)/include/sys/bpobj.h \ ++ $(top_srcdir)/include/sys/dbuf.h \ ++ $(top_srcdir)/include/sys/ddt.h \ ++ $(top_srcdir)/include/sys/dmu.h \ ++ $(top_srcdir)/include/sys/dmu_impl.h \ ++ $(top_srcdir)/include/sys/dmu_objset.h \ ++ $(top_srcdir)/include/sys/dmu_traverse.h \ ++ $(top_srcdir)/include/sys/dmu_tx.h \ ++ $(top_srcdir)/include/sys/dmu_zfetch.h \ ++ $(top_srcdir)/include/sys/dnode.h \ ++ $(top_srcdir)/include/sys/dsl_dataset.h \ ++ $(top_srcdir)/include/sys/dsl_deadlist.h \ ++ $(top_srcdir)/include/sys/dsl_deleg.h \ ++ $(top_srcdir)/include/sys/dsl_dir.h \ ++ $(top_srcdir)/include/sys/dsl_pool.h \ ++ $(top_srcdir)/include/sys/dsl_prop.h \ ++ $(top_srcdir)/include/sys/dsl_scan.h \ ++ $(top_srcdir)/include/sys/dsl_synctask.h \ ++ $(top_srcdir)/include/sys/efi_partition.h \ ++ $(top_srcdir)/include/sys/metaslab.h \ ++ $(top_srcdir)/include/sys/metaslab_impl.h \ ++ $(top_srcdir)/include/sys/nvpair.h \ ++ $(top_srcdir)/include/sys/nvpair_impl.h \ ++ $(top_srcdir)/include/sys/refcount.h \ ++ $(top_srcdir)/include/sys/rrwlock.h \ ++ $(top_srcdir)/include/sys/sa.h \ ++ $(top_srcdir)/include/sys/sa_impl.h \ ++ $(top_srcdir)/include/sys/spa_boot.h \ ++ $(top_srcdir)/include/sys/space_map.h \ ++ $(top_srcdir)/include/sys/spa.h \ ++ $(top_srcdir)/include/sys/spa_impl.h \ ++ $(top_srcdir)/include/sys/txg.h \ ++ $(top_srcdir)/include/sys/txg_impl.h \ ++ $(top_srcdir)/include/sys/u8_textprep_data.h \ ++ $(top_srcdir)/include/sys/u8_textprep.h \ ++ $(top_srcdir)/include/sys/uberblock.h \ ++ $(top_srcdir)/include/sys/uberblock_impl.h \ ++ $(top_srcdir)/include/sys/uio_impl.h \ ++ $(top_srcdir)/include/sys/unique.h \ ++ $(top_srcdir)/include/sys/uuid.h \ ++ $(top_srcdir)/include/sys/vdev_disk.h \ ++ $(top_srcdir)/include/sys/vdev_file.h \ ++ $(top_srcdir)/include/sys/vdev.h \ ++ $(top_srcdir)/include/sys/vdev_impl.h \ ++ $(top_srcdir)/include/sys/xvattr.h \ ++ $(top_srcdir)/include/sys/zap.h \ ++ $(top_srcdir)/include/sys/zap_impl.h \ ++ $(top_srcdir)/include/sys/zap_leaf.h \ ++ $(top_srcdir)/include/sys/zfs_acl.h \ ++ $(top_srcdir)/include/sys/zfs_context.h \ ++ $(top_srcdir)/include/sys/zfs_ctldir.h \ ++ $(top_srcdir)/include/sys/zfs_debug.h \ ++ $(top_srcdir)/include/sys/zfs_dir.h \ ++ $(top_srcdir)/include/sys/zfs_fuid.h \ ++ $(top_srcdir)/include/sys/zfs_rlock.h \ ++ $(top_srcdir)/include/sys/zfs_sa.h \ ++ $(top_srcdir)/include/sys/zfs_stat.h \ ++ $(top_srcdir)/include/sys/zfs_vfsops.h \ ++ $(top_srcdir)/include/sys/zfs_znode.h \ ++ $(top_srcdir)/include/sys/zfs_vnops.h \ ++ $(top_srcdir)/include/sys/zil.h \ ++ $(top_srcdir)/include/sys/zil_impl.h \ ++ $(top_srcdir)/include/sys/zio_checksum.h \ ++ $(top_srcdir)/include/sys/zio_compress.h \ ++ $(top_srcdir)/include/sys/zio.h \ ++ $(top_srcdir)/include/sys/zio_impl.h \ ++ $(top_srcdir)/include/sys/zrlock.h \ ++ $(top_srcdir)/include/sys/zfs_ioctl.h \ ++ $(top_srcdir)/include/sys/zfs_onexit.h \ ++ ${top_srcdir}/include/sys/zpl.h \ ++ $(top_srcdir)/include/sys/zvol.h ++am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; ++am__vpath_adj = case $$p in \ ++ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ ++ *) f=$$p;; \ ++ esac; ++am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; ++am__install_max = 40 ++am__nobase_strip_setup = \ ++ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` ++am__nobase_strip = \ ++ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" 
++am__nobase_list = $(am__nobase_strip_setup); \ ++ for p in $$list; do echo "$$p $$p"; done | \ ++ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ ++ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ ++ if (++n[$$2] == $(am__install_max)) \ ++ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ ++ END { for (dir in files) print dir, files[dir] }' ++am__base_list = \ ++ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ ++ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' ++am__uninstall_files_from_dir = { \ ++ test -z "$$files" \ ++ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ ++ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ ++ $(am__cd) "$$dir" && rm -f $$files; }; \ ++ } ++am__installdirs = "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)" ++am__libzfs_HEADERS_DIST = $(top_srcdir)/include/sys/arc.h \ ++ $(top_srcdir)/include/sys/avl.h \ ++ $(top_srcdir)/include/sys/avl_impl.h \ ++ $(top_srcdir)/include/sys/bplist.h \ ++ $(top_srcdir)/include/sys/bpobj.h \ ++ $(top_srcdir)/include/sys/dbuf.h \ ++ $(top_srcdir)/include/sys/ddt.h \ ++ $(top_srcdir)/include/sys/dmu.h \ ++ $(top_srcdir)/include/sys/dmu_impl.h \ ++ $(top_srcdir)/include/sys/dmu_objset.h \ ++ $(top_srcdir)/include/sys/dmu_traverse.h \ ++ $(top_srcdir)/include/sys/dmu_tx.h \ ++ $(top_srcdir)/include/sys/dmu_zfetch.h \ ++ $(top_srcdir)/include/sys/dnode.h \ ++ $(top_srcdir)/include/sys/dsl_dataset.h \ ++ $(top_srcdir)/include/sys/dsl_deadlist.h \ ++ $(top_srcdir)/include/sys/dsl_deleg.h \ ++ $(top_srcdir)/include/sys/dsl_dir.h \ ++ $(top_srcdir)/include/sys/dsl_pool.h \ ++ $(top_srcdir)/include/sys/dsl_prop.h \ ++ $(top_srcdir)/include/sys/dsl_scan.h \ ++ $(top_srcdir)/include/sys/dsl_synctask.h \ ++ $(top_srcdir)/include/sys/efi_partition.h \ ++ $(top_srcdir)/include/sys/metaslab.h \ ++ $(top_srcdir)/include/sys/metaslab_impl.h \ ++ $(top_srcdir)/include/sys/nvpair.h \ ++ $(top_srcdir)/include/sys/nvpair_impl.h \ ++ $(top_srcdir)/include/sys/refcount.h \ ++ $(top_srcdir)/include/sys/rrwlock.h \ ++ $(top_srcdir)/include/sys/sa.h \ ++ $(top_srcdir)/include/sys/sa_impl.h \ ++ $(top_srcdir)/include/sys/spa_boot.h \ ++ $(top_srcdir)/include/sys/space_map.h \ ++ $(top_srcdir)/include/sys/spa.h \ ++ $(top_srcdir)/include/sys/spa_impl.h \ ++ $(top_srcdir)/include/sys/txg.h \ ++ $(top_srcdir)/include/sys/txg_impl.h \ ++ $(top_srcdir)/include/sys/u8_textprep_data.h \ ++ $(top_srcdir)/include/sys/u8_textprep.h \ ++ $(top_srcdir)/include/sys/uberblock.h \ ++ $(top_srcdir)/include/sys/uberblock_impl.h \ ++ $(top_srcdir)/include/sys/uio_impl.h \ ++ $(top_srcdir)/include/sys/unique.h \ ++ $(top_srcdir)/include/sys/uuid.h \ ++ $(top_srcdir)/include/sys/vdev_disk.h \ ++ $(top_srcdir)/include/sys/vdev_file.h \ ++ $(top_srcdir)/include/sys/vdev.h \ ++ $(top_srcdir)/include/sys/vdev_impl.h \ ++ $(top_srcdir)/include/sys/xvattr.h \ ++ $(top_srcdir)/include/sys/zap.h \ ++ $(top_srcdir)/include/sys/zap_impl.h \ ++ $(top_srcdir)/include/sys/zap_leaf.h \ ++ $(top_srcdir)/include/sys/zfs_acl.h \ ++ $(top_srcdir)/include/sys/zfs_context.h \ ++ $(top_srcdir)/include/sys/zfs_ctldir.h \ ++ $(top_srcdir)/include/sys/zfs_debug.h \ ++ $(top_srcdir)/include/sys/zfs_dir.h \ ++ $(top_srcdir)/include/sys/zfs_fuid.h \ ++ $(top_srcdir)/include/sys/zfs_rlock.h \ ++ $(top_srcdir)/include/sys/zfs_sa.h \ ++ $(top_srcdir)/include/sys/zfs_stat.h \ ++ $(top_srcdir)/include/sys/zfs_vfsops.h \ ++ $(top_srcdir)/include/sys/zfs_znode.h \ ++ $(top_srcdir)/include/sys/zfs_vnops.h \ ++ 
$(top_srcdir)/include/sys/zil.h \ ++ $(top_srcdir)/include/sys/zil_impl.h \ ++ $(top_srcdir)/include/sys/zio_checksum.h \ ++ $(top_srcdir)/include/sys/zio_compress.h \ ++ $(top_srcdir)/include/sys/zio.h \ ++ $(top_srcdir)/include/sys/zio_impl.h \ ++ $(top_srcdir)/include/sys/zrlock.h ++HEADERS = $(kernel_HEADERS) $(libzfs_HEADERS) ++RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ ++ distclean-recursive maintainer-clean-recursive ++AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \ ++ $(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \ ++ distdir ++ETAGS = etags ++CTAGS = ctags ++DIST_SUBDIRS = $(SUBDIRS) ++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ++am__relativize = \ ++ dir0=`pwd`; \ ++ sed_first='s,^\([^/]*\)/.*$$,\1,'; \ ++ sed_rest='s,^[^/]*/*,,'; \ ++ sed_last='s,^.*/\([^/]*\)$$,\1,'; \ ++ sed_butlast='s,/*[^/]*$$,,'; \ ++ while test -n "$$dir1"; do \ ++ first=`echo "$$dir1" | sed -e "$$sed_first"`; \ ++ if test "$$first" != "."; then \ ++ if test "$$first" = ".."; then \ ++ dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ ++ dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ ++ else \ ++ first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ ++ if test "$$first2" = "$$first"; then \ ++ dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ ++ else \ ++ dir2="../$$dir2"; \ ++ fi; \ ++ dir0="$$dir0"/"$$first"; \ ++ fi; \ ++ fi; \ ++ dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ ++ done; \ ++ reldir="$$dir2" ++ACLOCAL = @ACLOCAL@ ++ALIEN = @ALIEN@ ++ALIEN_VERSION = @ALIEN_VERSION@ ++AMTAR = @AMTAR@ ++AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ ++AR = @AR@ ++AUTOCONF = @AUTOCONF@ ++AUTOHEADER = @AUTOHEADER@ ++AUTOMAKE = @AUTOMAKE@ ++AWK = @AWK@ ++CC = @CC@ ++CCAS = @CCAS@ ++CCASDEPMODE = @CCASDEPMODE@ ++CCASFLAGS = @CCASFLAGS@ ++CCDEPMODE = @CCDEPMODE@ ++CFLAGS = @CFLAGS@ ++CPP = @CPP@ ++CPPFLAGS = @CPPFLAGS@ ++CYGPATH_W = @CYGPATH_W@ ++DEBUG_CFLAGS = @DEBUG_CFLAGS@ ++DEBUG_DMU_TX = @DEBUG_DMU_TX@ ++DEBUG_STACKFLAGS = @DEBUG_STACKFLAGS@ ++DEBUG_ZFS = @DEBUG_ZFS@ ++DEFAULT_INIT_DIR = @DEFAULT_INIT_DIR@ ++DEFAULT_INIT_SCRIPT = @DEFAULT_INIT_SCRIPT@ ++DEFAULT_PACKAGE = @DEFAULT_PACKAGE@ ++DEFS = @DEFS@ ++DEPDIR = @DEPDIR@ ++DLLTOOL = @DLLTOOL@ ++DPKG = @DPKG@ ++DPKGBUILD = @DPKGBUILD@ ++DPKGBUILD_VERSION = @DPKGBUILD_VERSION@ ++DPKG_VERSION = @DPKG_VERSION@ ++DSYMUTIL = @DSYMUTIL@ ++DUMPBIN = @DUMPBIN@ ++ECHO_C = @ECHO_C@ ++ECHO_N = @ECHO_N@ ++ECHO_T = @ECHO_T@ ++EGREP = @EGREP@ ++EXEEXT = @EXEEXT@ ++FGREP = @FGREP@ ++FRAME_LARGER_THAN = @FRAME_LARGER_THAN@ ++GREP = @GREP@ ++HAVE_ALIEN = @HAVE_ALIEN@ ++HAVE_DPKG = @HAVE_DPKG@ ++HAVE_DPKGBUILD = @HAVE_DPKGBUILD@ ++HAVE_MAKEPKG = @HAVE_MAKEPKG@ ++HAVE_PACMAN = @HAVE_PACMAN@ ++HAVE_RPM = @HAVE_RPM@ ++HAVE_RPMBUILD = @HAVE_RPMBUILD@ ++INSTALL = @INSTALL@ ++INSTALL_DATA = @INSTALL_DATA@ ++INSTALL_PROGRAM = @INSTALL_PROGRAM@ ++INSTALL_SCRIPT = @INSTALL_SCRIPT@ ++INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ ++KERNELCPPFLAGS = @KERNELCPPFLAGS@ ++KERNELMAKE_PARAMS = @KERNELMAKE_PARAMS@ ++LD = @LD@ ++LDFLAGS = @LDFLAGS@ ++LIBBLKID = @LIBBLKID@ ++LIBOBJS = @LIBOBJS@ ++LIBS = @LIBS@ ++LIBSELINUX = @LIBSELINUX@ ++LIBTOOL = @LIBTOOL@ ++LIBUUID = @LIBUUID@ ++LINUX = @LINUX@ ++LINUX_OBJ = @LINUX_OBJ@ ++LINUX_SYMBOLS = @LINUX_SYMBOLS@ ++LINUX_VERSION = @LINUX_VERSION@ ++LIPO = @LIPO@ ++LN_S = @LN_S@ ++LTLIBOBJS = @LTLIBOBJS@ ++MAINT = @MAINT@ ++MAKEINFO = @MAKEINFO@ ++MAKEPKG = @MAKEPKG@ ++MAKEPKG_VERSION = @MAKEPKG_VERSION@ ++MANIFEST_TOOL = @MANIFEST_TOOL@ ++MKDIR_P = @MKDIR_P@ ++NM = @NM@ ++NMEDIT 
= @NMEDIT@ ++NO_UNUSED_BUT_SET_VARIABLE = @NO_UNUSED_BUT_SET_VARIABLE@ ++OBJDUMP = @OBJDUMP@ ++OBJEXT = @OBJEXT@ ++OTOOL = @OTOOL@ ++OTOOL64 = @OTOOL64@ ++PACKAGE = @PACKAGE@ ++PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ ++PACKAGE_NAME = @PACKAGE_NAME@ ++PACKAGE_STRING = @PACKAGE_STRING@ ++PACKAGE_TARNAME = @PACKAGE_TARNAME@ ++PACKAGE_URL = @PACKAGE_URL@ ++PACKAGE_VERSION = @PACKAGE_VERSION@ ++PACMAN = @PACMAN@ ++PACMAN_VERSION = @PACMAN_VERSION@ ++PATH_SEPARATOR = @PATH_SEPARATOR@ ++RANLIB = @RANLIB@ ++RPM = @RPM@ ++RPMBUILD = @RPMBUILD@ ++RPMBUILD_VERSION = @RPMBUILD_VERSION@ ++RPM_VERSION = @RPM_VERSION@ ++SED = @SED@ ++SET_MAKE = @SET_MAKE@ ++SHELL = @SHELL@ ++SPL = @SPL@ ++SPL_OBJ = @SPL_OBJ@ ++SPL_SYMBOLS = @SPL_SYMBOLS@ ++SPL_VERSION = @SPL_VERSION@ ++STRIP = @STRIP@ ++TARGET_ASM_DIR = @TARGET_ASM_DIR@ ++VENDOR = @VENDOR@ ++VERSION = @VERSION@ ++ZFS_CONFIG = @ZFS_CONFIG@ ++ZFS_META_ALIAS = @ZFS_META_ALIAS@ ++ZFS_META_AUTHOR = @ZFS_META_AUTHOR@ ++ZFS_META_DATA = @ZFS_META_DATA@ ++ZFS_META_LICENSE = @ZFS_META_LICENSE@ ++ZFS_META_LT_AGE = @ZFS_META_LT_AGE@ ++ZFS_META_LT_CURRENT = @ZFS_META_LT_CURRENT@ ++ZFS_META_LT_REVISION = @ZFS_META_LT_REVISION@ ++ZFS_META_NAME = @ZFS_META_NAME@ ++ZFS_META_RELEASE = @ZFS_META_RELEASE@ ++ZFS_META_VERSION = @ZFS_META_VERSION@ ++ZLIB = @ZLIB@ ++abs_builddir = @abs_builddir@ ++abs_srcdir = @abs_srcdir@ ++abs_top_builddir = @abs_top_builddir@ ++abs_top_srcdir = @abs_top_srcdir@ ++ac_ct_AR = @ac_ct_AR@ ++ac_ct_CC = @ac_ct_CC@ ++ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ ++am__include = @am__include@ ++am__leading_dot = @am__leading_dot@ ++am__quote = @am__quote@ ++am__tar = @am__tar@ ++am__untar = @am__untar@ ++bindir = @bindir@ ++build = @build@ ++build_alias = @build_alias@ ++build_cpu = @build_cpu@ ++build_os = @build_os@ ++build_vendor = @build_vendor@ ++builddir = @builddir@ ++datadir = @datadir@ ++datarootdir = @datarootdir@ ++docdir = @docdir@ ++dvidir = @dvidir@ ++exec_prefix = @exec_prefix@ ++host = @host@ ++host_alias = @host_alias@ ++host_cpu = @host_cpu@ ++host_os = @host_os@ ++host_vendor = @host_vendor@ ++htmldir = @htmldir@ ++includedir = @includedir@ ++infodir = @infodir@ ++install_sh = @install_sh@ ++libdir = @libdir@ ++libexecdir = @libexecdir@ ++localedir = @localedir@ ++localstatedir = @localstatedir@ ++mandir = @mandir@ ++mkdir_p = @mkdir_p@ ++oldincludedir = @oldincludedir@ ++pdfdir = @pdfdir@ ++prefix = @prefix@ ++program_transform_name = @program_transform_name@ ++psdir = @psdir@ ++sbindir = @sbindir@ ++sharedstatedir = @sharedstatedir@ ++srcdir = @srcdir@ ++sysconfdir = @sysconfdir@ ++target = @target@ ++target_alias = @target_alias@ ++target_cpu = @target_cpu@ ++target_os = @target_os@ ++target_vendor = @target_vendor@ ++top_build_prefix = @top_build_prefix@ ++top_builddir = @top_builddir@ ++top_srcdir = @top_srcdir@ ++udevdir = @udevdir@ ++udevruledir = @udevruledir@ ++SUBDIRS = fm fs ++COMMON_H = \ ++ $(top_srcdir)/include/sys/arc.h \ ++ $(top_srcdir)/include/sys/avl.h \ ++ $(top_srcdir)/include/sys/avl_impl.h \ ++ $(top_srcdir)/include/sys/bplist.h \ ++ $(top_srcdir)/include/sys/bpobj.h \ ++ $(top_srcdir)/include/sys/dbuf.h \ ++ $(top_srcdir)/include/sys/ddt.h \ ++ $(top_srcdir)/include/sys/dmu.h \ ++ $(top_srcdir)/include/sys/dmu_impl.h \ ++ $(top_srcdir)/include/sys/dmu_objset.h \ ++ $(top_srcdir)/include/sys/dmu_traverse.h \ ++ $(top_srcdir)/include/sys/dmu_tx.h \ ++ $(top_srcdir)/include/sys/dmu_zfetch.h \ ++ $(top_srcdir)/include/sys/dnode.h \ ++ $(top_srcdir)/include/sys/dsl_dataset.h \ ++ $(top_srcdir)/include/sys/dsl_deadlist.h \ ++ 
$(top_srcdir)/include/sys/dsl_deleg.h \ ++ $(top_srcdir)/include/sys/dsl_dir.h \ ++ $(top_srcdir)/include/sys/dsl_pool.h \ ++ $(top_srcdir)/include/sys/dsl_prop.h \ ++ $(top_srcdir)/include/sys/dsl_scan.h \ ++ $(top_srcdir)/include/sys/dsl_synctask.h \ ++ $(top_srcdir)/include/sys/efi_partition.h \ ++ $(top_srcdir)/include/sys/metaslab.h \ ++ $(top_srcdir)/include/sys/metaslab_impl.h \ ++ $(top_srcdir)/include/sys/nvpair.h \ ++ $(top_srcdir)/include/sys/nvpair_impl.h \ ++ $(top_srcdir)/include/sys/refcount.h \ ++ $(top_srcdir)/include/sys/rrwlock.h \ ++ $(top_srcdir)/include/sys/sa.h \ ++ $(top_srcdir)/include/sys/sa_impl.h \ ++ $(top_srcdir)/include/sys/spa_boot.h \ ++ $(top_srcdir)/include/sys/space_map.h \ ++ $(top_srcdir)/include/sys/spa.h \ ++ $(top_srcdir)/include/sys/spa_impl.h \ ++ $(top_srcdir)/include/sys/txg.h \ ++ $(top_srcdir)/include/sys/txg_impl.h \ ++ $(top_srcdir)/include/sys/u8_textprep_data.h \ ++ $(top_srcdir)/include/sys/u8_textprep.h \ ++ $(top_srcdir)/include/sys/uberblock.h \ ++ $(top_srcdir)/include/sys/uberblock_impl.h \ ++ $(top_srcdir)/include/sys/uio_impl.h \ ++ $(top_srcdir)/include/sys/unique.h \ ++ $(top_srcdir)/include/sys/uuid.h \ ++ $(top_srcdir)/include/sys/vdev_disk.h \ ++ $(top_srcdir)/include/sys/vdev_file.h \ ++ $(top_srcdir)/include/sys/vdev.h \ ++ $(top_srcdir)/include/sys/vdev_impl.h \ ++ $(top_srcdir)/include/sys/xvattr.h \ ++ $(top_srcdir)/include/sys/zap.h \ ++ $(top_srcdir)/include/sys/zap_impl.h \ ++ $(top_srcdir)/include/sys/zap_leaf.h \ ++ $(top_srcdir)/include/sys/zfs_acl.h \ ++ $(top_srcdir)/include/sys/zfs_context.h \ ++ $(top_srcdir)/include/sys/zfs_ctldir.h \ ++ $(top_srcdir)/include/sys/zfs_debug.h \ ++ $(top_srcdir)/include/sys/zfs_dir.h \ ++ $(top_srcdir)/include/sys/zfs_fuid.h \ ++ $(top_srcdir)/include/sys/zfs_rlock.h \ ++ $(top_srcdir)/include/sys/zfs_sa.h \ ++ $(top_srcdir)/include/sys/zfs_stat.h \ ++ $(top_srcdir)/include/sys/zfs_vfsops.h \ ++ $(top_srcdir)/include/sys/zfs_znode.h \ ++ $(top_srcdir)/include/sys/zfs_vnops.h \ ++ $(top_srcdir)/include/sys/zil.h \ ++ $(top_srcdir)/include/sys/zil_impl.h \ ++ $(top_srcdir)/include/sys/zio_checksum.h \ ++ $(top_srcdir)/include/sys/zio_compress.h \ ++ $(top_srcdir)/include/sys/zio.h \ ++ $(top_srcdir)/include/sys/zio_impl.h \ ++ $(top_srcdir)/include/sys/zrlock.h ++ ++KERNEL_H = \ ++ $(top_srcdir)/include/sys/zfs_ioctl.h \ ++ $(top_srcdir)/include/sys/zfs_onexit.h \ ++ ${top_srcdir}/include/sys/zpl.h \ ++ $(top_srcdir)/include/sys/zvol.h ++ ++USER_H = ++EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) ++@CONFIG_USER_TRUE@libzfsdir = $(includedir)/libzfs/sys ++@CONFIG_USER_TRUE@libzfs_HEADERS = $(COMMON_H) $(USER_H) ++@CONFIG_KERNEL_TRUE@kerneldir = /usr/src/zfs-$(ZFS_META_VERSION)-$(ZFS_META_RELEASE)/$(LINUX_VERSION)/sys ++@CONFIG_KERNEL_TRUE@kernel_HEADERS = $(COMMON_H) $(KERNEL_H) ++all: all-recursive ++ ++.SUFFIXES: ++$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) ++ @for dep in $?; do \ ++ case '$(am__configure_deps)' in \ ++ *$$dep*) \ ++ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ ++ && { if test -f $@; then exit 0; else break; fi; }; \ ++ exit 1;; \ ++ esac; \ ++ done; \ ++ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu include/sys/Makefile'; \ ++ $(am__cd) $(top_srcdir) && \ ++ $(AUTOMAKE) --gnu include/sys/Makefile ++.PRECIOUS: Makefile ++Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status ++ @case '$?' 
in \ ++ *config.status*) \ ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ ++ *) \ ++ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ ++ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ ++ esac; ++ ++$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++ ++$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) ++ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ++$(am__aclocal_m4_deps): ++ ++mostlyclean-libtool: ++ -rm -f *.lo ++ ++clean-libtool: ++ -rm -rf .libs _libs ++install-kernelHEADERS: $(kernel_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(kerneldir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(kerneldir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(kerneldir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(kerneldir)" || exit $$?; \ ++ done ++ ++uninstall-kernelHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(kernel_HEADERS)'; test -n "$(kerneldir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(kerneldir)'; $(am__uninstall_files_from_dir) ++install-libzfsHEADERS: $(libzfs_HEADERS) ++ @$(NORMAL_INSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ if test -n "$$list"; then \ ++ echo " $(MKDIR_P) '$(DESTDIR)$(libzfsdir)'"; \ ++ $(MKDIR_P) "$(DESTDIR)$(libzfsdir)" || exit 1; \ ++ fi; \ ++ for p in $$list; do \ ++ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ ++ echo "$$d$$p"; \ ++ done | $(am__base_list) | \ ++ while read files; do \ ++ echo " $(INSTALL_HEADER) $$files '$(DESTDIR)$(libzfsdir)'"; \ ++ $(INSTALL_HEADER) $$files "$(DESTDIR)$(libzfsdir)" || exit $$?; \ ++ done ++ ++uninstall-libzfsHEADERS: ++ @$(NORMAL_UNINSTALL) ++ @list='$(libzfs_HEADERS)'; test -n "$(libzfsdir)" || list=; \ ++ files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \ ++ dir='$(DESTDIR)$(libzfsdir)'; $(am__uninstall_files_from_dir) ++ ++# This directory's subdirectories are mostly independent; you can cd ++# into them and run `make' without going through this Makefile. ++# To change the values of `make' variables: instead of editing Makefiles, ++# (1) if the variable is set in `config.status', edit `config.status' ++# (which will cause the Makefiles to be regenerated when you run `make'); ++# (2) otherwise, pass the desired values on the `make' command line. 
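++# A brief, illustrative example of option (2) above (values are placeholders,
++# not project defaults):
++#   make CFLAGS='-O2 -g'            # override a configure-chosen value
++#   make prefix=/usr/local install  # install under a different prefix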
++$(RECURSIVE_TARGETS): ++ @fail= failcom='exit 1'; \ ++ for f in x $$MAKEFLAGS; do \ ++ case $$f in \ ++ *=* | --[!k]*);; \ ++ *k*) failcom='fail=yes';; \ ++ esac; \ ++ done; \ ++ dot_seen=no; \ ++ target=`echo $@ | sed s/-recursive//`; \ ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ echo "Making $$target in $$subdir"; \ ++ if test "$$subdir" = "."; then \ ++ dot_seen=yes; \ ++ local_target="$$target-am"; \ ++ else \ ++ local_target="$$target"; \ ++ fi; \ ++ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ ++ || eval $$failcom; \ ++ done; \ ++ if test "$$dot_seen" = "no"; then \ ++ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ ++ fi; test -z "$$fail" ++ ++$(RECURSIVE_CLEAN_TARGETS): ++ @fail= failcom='exit 1'; \ ++ for f in x $$MAKEFLAGS; do \ ++ case $$f in \ ++ *=* | --[!k]*);; \ ++ *k*) failcom='fail=yes';; \ ++ esac; \ ++ done; \ ++ dot_seen=no; \ ++ case "$@" in \ ++ distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ ++ *) list='$(SUBDIRS)' ;; \ ++ esac; \ ++ rev=''; for subdir in $$list; do \ ++ if test "$$subdir" = "."; then :; else \ ++ rev="$$subdir $$rev"; \ ++ fi; \ ++ done; \ ++ rev="$$rev ."; \ ++ target=`echo $@ | sed s/-recursive//`; \ ++ for subdir in $$rev; do \ ++ echo "Making $$target in $$subdir"; \ ++ if test "$$subdir" = "."; then \ ++ local_target="$$target-am"; \ ++ else \ ++ local_target="$$target"; \ ++ fi; \ ++ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ ++ || eval $$failcom; \ ++ done && test -z "$$fail" ++tags-recursive: ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ ++ done ++ctags-recursive: ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ ++ done ++ ++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ mkid -fID $$unique ++tags: TAGS ++ ++TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ set x; \ ++ here=`pwd`; \ ++ if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ ++ include_option=--etags-include; \ ++ empty_fix=.; \ ++ else \ ++ include_option=--include; \ ++ empty_fix=; \ ++ fi; \ ++ list='$(SUBDIRS)'; for subdir in $$list; do \ ++ if test "$$subdir" = .; then :; else \ ++ test ! 
-f $$subdir/TAGS || \ ++ set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ ++ fi; \ ++ done; \ ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ shift; \ ++ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ ++ test -n "$$unique" || unique=$$empty_fix; \ ++ if test $$# -gt 0; then \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ "$$@" $$unique; \ ++ else \ ++ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ ++ $$unique; \ ++ fi; \ ++ fi ++ctags: CTAGS ++CTAGS: ctags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ ++ $(TAGS_FILES) $(LISP) ++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ ++ unique=`for i in $$list; do \ ++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ ++ done | \ ++ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ ++ END { if (nonempty) { for (i in files) print i; }; }'`; \ ++ test -z "$(CTAGS_ARGS)$$unique" \ ++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ ++ $$unique ++ ++GTAGS: ++ here=`$(am__cd) $(top_builddir) && pwd` \ ++ && $(am__cd) $(top_srcdir) \ ++ && gtags -i $(GTAGS_ARGS) "$$here" ++ ++distclean-tags: ++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags ++ ++distdir: $(DISTFILES) ++ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ ++ list='$(DISTFILES)'; \ ++ dist_files=`for file in $$list; do echo $$file; done | \ ++ sed -e "s|^$$srcdirstrip/||;t" \ ++ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ ++ case $$dist_files in \ ++ */*) $(MKDIR_P) `echo "$$dist_files" | \ ++ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ ++ sort -u` ;; \ ++ esac; \ ++ for file in $$dist_files; do \ ++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ ++ if test -d $$d/$$file; then \ ++ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ ++ if test -d "$(distdir)/$$file"; then \ ++ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ ++ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ ++ find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ ++ fi; \ ++ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ ++ else \ ++ test -f "$(distdir)/$$file" \ ++ || cp -p $$d/$$file "$(distdir)/$$file" \ ++ || exit 1; \ ++ fi; \ ++ done ++ @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ ++ if test "$$subdir" = .; then :; else \ ++ $(am__make_dryrun) \ ++ || test -d "$(distdir)/$$subdir" \ ++ || $(MKDIR_P) "$(distdir)/$$subdir" \ ++ || exit 1; \ ++ dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ ++ $(am__relativize); \ ++ new_distdir=$$reldir; \ ++ dir1=$$subdir; dir2="$(top_distdir)"; \ ++ $(am__relativize); \ ++ new_top_distdir=$$reldir; \ ++ echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ ++ echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ ++ ($(am__cd) $$subdir && \ ++ $(MAKE) $(AM_MAKEFLAGS) \ ++ top_distdir="$$new_top_distdir" \ ++ distdir="$$new_distdir" \ ++ am__remove_distdir=: \ ++ am__skip_length_check=: \ ++ am__skip_mode_fix=: \ ++ distdir) \ ++ || exit 1; \ ++ fi; \ ++ done ++check-am: all-am ++check: check-recursive ++all-am: Makefile $(HEADERS) ++installdirs: installdirs-recursive ++installdirs-am: ++ for dir in "$(DESTDIR)$(kerneldir)" "$(DESTDIR)$(libzfsdir)"; do \ ++ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ ++ done ++install: install-recursive ++install-exec: install-exec-recursive ++install-data: install-data-recursive ++uninstall: uninstall-recursive ++ ++install-am: all-am ++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am ++ ++installcheck: installcheck-recursive ++install-strip: ++ if test -z '$(STRIP)'; then \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ install; \ ++ else \ ++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ ++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ ++ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ ++ fi ++mostlyclean-generic: ++ ++clean-generic: ++ ++distclean-generic: ++ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) ++ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) ++ ++maintainer-clean-generic: ++ @echo "This command is intended for maintainers to use" ++ @echo "it deletes files that may require special tools to rebuild." 
++clean: clean-recursive ++ ++clean-am: clean-generic clean-libtool mostlyclean-am ++ ++distclean: distclean-recursive ++ -rm -f Makefile ++distclean-am: clean-am distclean-generic distclean-tags ++ ++dvi: dvi-recursive ++ ++dvi-am: ++ ++html: html-recursive ++ ++html-am: ++ ++info: info-recursive ++ ++info-am: ++ ++install-data-am: install-kernelHEADERS install-libzfsHEADERS ++ ++install-dvi: install-dvi-recursive ++ ++install-dvi-am: ++ ++install-exec-am: ++ ++install-html: install-html-recursive ++ ++install-html-am: ++ ++install-info: install-info-recursive ++ ++install-info-am: ++ ++install-man: ++ ++install-pdf: install-pdf-recursive ++ ++install-pdf-am: ++ ++install-ps: install-ps-recursive ++ ++install-ps-am: ++ ++installcheck-am: ++ ++maintainer-clean: maintainer-clean-recursive ++ -rm -f Makefile ++maintainer-clean-am: distclean-am maintainer-clean-generic ++ ++mostlyclean: mostlyclean-recursive ++ ++mostlyclean-am: mostlyclean-generic mostlyclean-libtool ++ ++pdf: pdf-recursive ++ ++pdf-am: ++ ++ps: ps-recursive ++ ++ps-am: ++ ++uninstall-am: uninstall-kernelHEADERS uninstall-libzfsHEADERS ++ ++.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \ ++ install-am install-strip tags-recursive ++ ++.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \ ++ all all-am check check-am clean clean-generic clean-libtool \ ++ ctags ctags-recursive distclean distclean-generic \ ++ distclean-libtool distclean-tags distdir dvi dvi-am html \ ++ html-am info info-am install install-am install-data \ ++ install-data-am install-dvi install-dvi-am install-exec \ ++ install-exec-am install-html install-html-am install-info \ ++ install-info-am install-kernelHEADERS install-libzfsHEADERS \ ++ install-man install-pdf install-pdf-am install-ps \ ++ install-ps-am install-strip installcheck installcheck-am \ ++ installdirs installdirs-am maintainer-clean \ ++ maintainer-clean-generic mostlyclean mostlyclean-generic \ ++ mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \ ++ uninstall uninstall-am uninstall-kernelHEADERS \ ++ uninstall-libzfsHEADERS ++ ++ ++# Tell versions [3.59,3.63) of GNU make to not export all variables. ++# Otherwise a system limit (for SysV at least) may be exceeded. ++.NOEXPORT: +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/metaslab.h linux-3.2.33-go/include/zfs/sys/metaslab.h +--- linux-3.2.33-go.orig/include/zfs/sys/metaslab.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/metaslab.h 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,86 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. 
All rights reserved. ++ */ ++ ++#ifndef _SYS_METASLAB_H ++#define _SYS_METASLAB_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++extern space_map_ops_t *zfs_metaslab_ops; ++ ++extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, ++ uint64_t start, uint64_t size, uint64_t txg); ++extern void metaslab_fini(metaslab_t *msp); ++extern void metaslab_sync(metaslab_t *msp, uint64_t txg); ++extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); ++extern void metaslab_sync_reassess(metaslab_group_t *mg); ++ ++#define METASLAB_HINTBP_FAVOR 0x0 ++#define METASLAB_HINTBP_AVOID 0x1 ++#define METASLAB_GANG_HEADER 0x2 ++#define METASLAB_GANG_CHILD 0x4 ++#define METASLAB_GANG_AVOID 0x8 ++#define METASLAB_FASTWRITE 0x10 ++ ++extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, ++ blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags); ++extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, ++ boolean_t now); ++extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); ++extern void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp); ++extern void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp); ++ ++extern metaslab_class_t *metaslab_class_create(spa_t *spa, ++ space_map_ops_t *ops); ++extern void metaslab_class_destroy(metaslab_class_t *mc); ++extern int metaslab_class_validate(metaslab_class_t *mc); ++ ++extern void metaslab_class_space_update(metaslab_class_t *mc, ++ int64_t alloc_delta, int64_t defer_delta, ++ int64_t space_delta, int64_t dspace_delta); ++extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc); ++extern uint64_t metaslab_class_get_space(metaslab_class_t *mc); ++extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc); ++extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc); ++ ++extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, ++ vdev_t *vd); ++extern void metaslab_group_destroy(metaslab_group_t *mg); ++extern void metaslab_group_activate(metaslab_group_t *mg); ++extern void metaslab_group_passivate(metaslab_group_t *mg); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_METASLAB_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/metaslab_impl.h linux-3.2.33-go/include/zfs/sys/metaslab_impl.h +--- linux-3.2.33-go.orig/include/zfs/sys/metaslab_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/metaslab_impl.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,92 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ */ ++ ++#ifndef _SYS_METASLAB_IMPL_H ++#define _SYS_METASLAB_IMPL_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct metaslab_class { ++ spa_t *mc_spa; ++ metaslab_group_t *mc_rotor; ++ space_map_ops_t *mc_ops; ++ uint64_t mc_aliquot; ++ uint64_t mc_alloc; /* total allocated space */ ++ uint64_t mc_deferred; /* total deferred frees */ ++ uint64_t mc_space; /* total space (alloc + free) */ ++ uint64_t mc_dspace; /* total deflated space */ ++ kmutex_t mc_fastwrite_lock; ++}; ++ ++struct metaslab_group { ++ kmutex_t mg_lock; ++ avl_tree_t mg_metaslab_tree; ++ uint64_t mg_aliquot; ++ uint64_t mg_bonus_area; ++ uint64_t mg_alloc_failures; ++ int64_t mg_bias; ++ int64_t mg_activation_count; ++ metaslab_class_t *mg_class; ++ vdev_t *mg_vd; ++ metaslab_group_t *mg_prev; ++ metaslab_group_t *mg_next; ++}; ++ ++/* ++ * Each metaslab's free space is tracked in space map object in the MOS, ++ * which is only updated in syncing context. Each time we sync a txg, ++ * we append the allocs and frees from that txg to the space map object. ++ * When the txg is done syncing, metaslab_sync_done() updates ms_smo ++ * to ms_smo_syncing. Everything in ms_smo is always safe to allocate. ++ */ ++struct metaslab { ++ kmutex_t ms_lock; /* metaslab lock */ ++ space_map_obj_t ms_smo; /* synced space map object */ ++ space_map_obj_t ms_smo_syncing; /* syncing space map object */ ++ space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */ ++ space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */ ++ space_map_t ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */ ++ space_map_t ms_map; /* in-core free space map */ ++ int64_t ms_deferspace; /* sum of ms_defermap[] space */ ++ uint64_t ms_weight; /* weight vs. others in group */ ++ metaslab_group_t *ms_group; /* metaslab group */ ++ avl_node_t ms_group_node; /* node in metaslab group tree */ ++ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ ++}; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_METASLAB_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/nvpair.h linux-3.2.33-go/include/zfs/sys/nvpair.h +--- linux-3.2.33-go.orig/include/zfs/sys/nvpair.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/nvpair.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,281 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#ifndef _SYS_NVPAIR_H ++#define _SYS_NVPAIR_H ++ ++#include ++#include ++#include ++ ++#if defined(_KERNEL) && !defined(_BOOT) ++#include ++#endif ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++typedef enum { ++ DATA_TYPE_UNKNOWN = 0, ++ DATA_TYPE_BOOLEAN, ++ DATA_TYPE_BYTE, ++ DATA_TYPE_INT16, ++ DATA_TYPE_UINT16, ++ DATA_TYPE_INT32, ++ DATA_TYPE_UINT32, ++ DATA_TYPE_INT64, ++ DATA_TYPE_UINT64, ++ DATA_TYPE_STRING, ++ DATA_TYPE_BYTE_ARRAY, ++ DATA_TYPE_INT16_ARRAY, ++ DATA_TYPE_UINT16_ARRAY, ++ DATA_TYPE_INT32_ARRAY, ++ DATA_TYPE_UINT32_ARRAY, ++ DATA_TYPE_INT64_ARRAY, ++ DATA_TYPE_UINT64_ARRAY, ++ DATA_TYPE_STRING_ARRAY, ++ DATA_TYPE_HRTIME, ++ DATA_TYPE_NVLIST, ++ DATA_TYPE_NVLIST_ARRAY, ++ DATA_TYPE_BOOLEAN_VALUE, ++ DATA_TYPE_INT8, ++ DATA_TYPE_UINT8, ++ DATA_TYPE_BOOLEAN_ARRAY, ++ DATA_TYPE_INT8_ARRAY, ++#if !defined(_KERNEL) ++ DATA_TYPE_UINT8_ARRAY, ++ DATA_TYPE_DOUBLE ++#else ++ DATA_TYPE_UINT8_ARRAY ++#endif ++} data_type_t; ++ ++typedef struct nvpair { ++ int32_t nvp_size; /* size of this nvpair */ ++ int16_t nvp_name_sz; /* length of name string */ ++ int16_t nvp_reserve; /* not used */ ++ int32_t nvp_value_elem; /* number of elements for array types */ ++ data_type_t nvp_type; /* type of value */ ++ /* name string */ ++ /* aligned ptr array for string arrays */ ++ /* aligned array of data for value */ ++} nvpair_t; ++ ++/* nvlist header */ ++typedef struct nvlist { ++ int32_t nvl_version; ++ uint32_t nvl_nvflag; /* persistent flags */ ++ uint64_t nvl_priv; /* ptr to private data if not packed */ ++ uint32_t nvl_flag; ++ int32_t nvl_pad; /* currently not used, for alignment */ ++} nvlist_t; ++ ++/* nvp implementation version */ ++#define NV_VERSION 0 ++ ++/* nvlist pack encoding */ ++#define NV_ENCODE_NATIVE 0 ++#define NV_ENCODE_XDR 1 ++ ++/* nvlist persistent unique name flags, stored in nvl_nvflags */ ++#define NV_UNIQUE_NAME 0x1 ++#define NV_UNIQUE_NAME_TYPE 0x2 ++ ++/* nvlist lookup pairs related flags */ ++#define NV_FLAG_NOENTOK 0x1 ++ ++/* convenience macros */ ++#define NV_ALIGN(x) (((ulong_t)(x) + 7ul) & ~7ul) ++#define NV_ALIGN4(x) (((x) + 3) & ~3) ++ ++#define NVP_SIZE(nvp) ((nvp)->nvp_size) ++#define NVP_NAME(nvp) ((char *)(nvp) + sizeof (nvpair_t)) ++#define NVP_TYPE(nvp) ((nvp)->nvp_type) ++#define NVP_NELEM(nvp) ((nvp)->nvp_value_elem) ++#define NVP_VALUE(nvp) ((char *)(nvp) + NV_ALIGN(sizeof (nvpair_t) \ ++ + (nvp)->nvp_name_sz)) ++ ++#define NVL_VERSION(nvl) ((nvl)->nvl_version) ++#define NVL_SIZE(nvl) ((nvl)->nvl_size) ++#define NVL_FLAG(nvl) ((nvl)->nvl_flag) ++ ++/* NV allocator framework */ ++typedef struct nv_alloc_ops nv_alloc_ops_t; ++ ++typedef struct nv_alloc { ++ const nv_alloc_ops_t *nva_ops; ++ void *nva_arg; ++} nv_alloc_t; ++ ++struct nv_alloc_ops { ++ int (*nv_ao_init)(nv_alloc_t *, __va_list); ++ void (*nv_ao_fini)(nv_alloc_t *); ++ void *(*nv_ao_alloc)(nv_alloc_t *, size_t); ++ void (*nv_ao_free)(nv_alloc_t *, void *, size_t); ++ void (*nv_ao_reset)(nv_alloc_t *); ++}; ++ ++extern const nv_alloc_ops_t *nv_fixed_ops; ++extern nv_alloc_t *nv_alloc_nosleep; ++ ++#if defined(_KERNEL) && !defined(_BOOT) ++extern nv_alloc_t *nv_alloc_sleep; ++#endif ++ ++int nv_alloc_init(nv_alloc_t *, const nv_alloc_ops_t *, /* args */ ...); ++void nv_alloc_reset(nv_alloc_t *); ++void nv_alloc_fini(nv_alloc_t *); ++ ++/* list management */ ++int nvlist_alloc(nvlist_t **, uint_t, int); ++void nvlist_free(nvlist_t *); ++int nvlist_size(nvlist_t *, size_t *, int); ++int nvlist_pack(nvlist_t *, char **, size_t *, int, int); ++int nvlist_unpack(char *, 
size_t, nvlist_t **, int); ++int nvlist_dup(nvlist_t *, nvlist_t **, int); ++int nvlist_merge(nvlist_t *, nvlist_t *, int); ++ ++uint_t nvlist_nvflag(nvlist_t *); ++ ++int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *); ++int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *); ++int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *); ++int nvlist_xdup(nvlist_t *, nvlist_t **, nv_alloc_t *); ++nv_alloc_t *nvlist_lookup_nv_alloc(nvlist_t *); ++ ++int nvlist_add_nvpair(nvlist_t *, nvpair_t *); ++int nvlist_add_boolean(nvlist_t *, const char *); ++int nvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); ++int nvlist_add_byte(nvlist_t *, const char *, uchar_t); ++int nvlist_add_int8(nvlist_t *, const char *, int8_t); ++int nvlist_add_uint8(nvlist_t *, const char *, uint8_t); ++int nvlist_add_int16(nvlist_t *, const char *, int16_t); ++int nvlist_add_uint16(nvlist_t *, const char *, uint16_t); ++int nvlist_add_int32(nvlist_t *, const char *, int32_t); ++int nvlist_add_uint32(nvlist_t *, const char *, uint32_t); ++int nvlist_add_int64(nvlist_t *, const char *, int64_t); ++int nvlist_add_uint64(nvlist_t *, const char *, uint64_t); ++int nvlist_add_string(nvlist_t *, const char *, const char *); ++int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); ++int nvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); ++int nvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t); ++int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); ++int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); ++int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); ++int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); ++int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); ++int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); ++int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); ++int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); ++int nvlist_add_string_array(nvlist_t *, const char *, char *const *, uint_t); ++int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); ++int nvlist_add_hrtime(nvlist_t *, const char *, hrtime_t); ++#if !defined(_KERNEL) ++int nvlist_add_double(nvlist_t *, const char *, double); ++#endif ++ ++int nvlist_remove(nvlist_t *, const char *, data_type_t); ++int nvlist_remove_all(nvlist_t *, const char *); ++int nvlist_remove_nvpair(nvlist_t *, nvpair_t *); ++ ++int nvlist_lookup_boolean(nvlist_t *, const char *); ++int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *); ++int nvlist_lookup_byte(nvlist_t *, const char *, uchar_t *); ++int nvlist_lookup_int8(nvlist_t *, const char *, int8_t *); ++int nvlist_lookup_uint8(nvlist_t *, const char *, uint8_t *); ++int nvlist_lookup_int16(nvlist_t *, const char *, int16_t *); ++int nvlist_lookup_uint16(nvlist_t *, const char *, uint16_t *); ++int nvlist_lookup_int32(nvlist_t *, const char *, int32_t *); ++int nvlist_lookup_uint32(nvlist_t *, const char *, uint32_t *); ++int nvlist_lookup_int64(nvlist_t *, const char *, int64_t *); ++int nvlist_lookup_uint64(nvlist_t *, const char *, uint64_t *); ++int nvlist_lookup_string(nvlist_t *, const char *, char **); ++int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **); ++int nvlist_lookup_boolean_array(nvlist_t *, const char *, ++ boolean_t **, uint_t *); ++int nvlist_lookup_byte_array(nvlist_t *, const char *, uchar_t **, uint_t *); ++int 
nvlist_lookup_int8_array(nvlist_t *, const char *, int8_t **, uint_t *); ++int nvlist_lookup_uint8_array(nvlist_t *, const char *, uint8_t **, uint_t *); ++int nvlist_lookup_int16_array(nvlist_t *, const char *, int16_t **, uint_t *); ++int nvlist_lookup_uint16_array(nvlist_t *, const char *, uint16_t **, uint_t *); ++int nvlist_lookup_int32_array(nvlist_t *, const char *, int32_t **, uint_t *); ++int nvlist_lookup_uint32_array(nvlist_t *, const char *, uint32_t **, uint_t *); ++int nvlist_lookup_int64_array(nvlist_t *, const char *, int64_t **, uint_t *); ++int nvlist_lookup_uint64_array(nvlist_t *, const char *, uint64_t **, uint_t *); ++int nvlist_lookup_string_array(nvlist_t *, const char *, char ***, uint_t *); ++int nvlist_lookup_nvlist_array(nvlist_t *, const char *, ++ nvlist_t ***, uint_t *); ++int nvlist_lookup_hrtime(nvlist_t *, const char *, hrtime_t *); ++int nvlist_lookup_pairs(nvlist_t *, int, ...); ++#if !defined(_KERNEL) ++int nvlist_lookup_double(nvlist_t *, const char *, double *); ++#endif ++ ++int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **); ++int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **, ++ int *, char **); ++boolean_t nvlist_exists(nvlist_t *, const char *); ++boolean_t nvlist_empty(nvlist_t *); ++ ++/* processing nvpair */ ++nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *); ++nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *); ++char *nvpair_name(nvpair_t *); ++data_type_t nvpair_type(nvpair_t *); ++int nvpair_type_is_array(nvpair_t *); ++int nvpair_value_boolean_value(nvpair_t *, boolean_t *); ++int nvpair_value_byte(nvpair_t *, uchar_t *); ++int nvpair_value_int8(nvpair_t *, int8_t *); ++int nvpair_value_uint8(nvpair_t *, uint8_t *); ++int nvpair_value_int16(nvpair_t *, int16_t *); ++int nvpair_value_uint16(nvpair_t *, uint16_t *); ++int nvpair_value_int32(nvpair_t *, int32_t *); ++int nvpair_value_uint32(nvpair_t *, uint32_t *); ++int nvpair_value_int64(nvpair_t *, int64_t *); ++int nvpair_value_uint64(nvpair_t *, uint64_t *); ++int nvpair_value_string(nvpair_t *, char **); ++int nvpair_value_nvlist(nvpair_t *, nvlist_t **); ++int nvpair_value_boolean_array(nvpair_t *, boolean_t **, uint_t *); ++int nvpair_value_byte_array(nvpair_t *, uchar_t **, uint_t *); ++int nvpair_value_int8_array(nvpair_t *, int8_t **, uint_t *); ++int nvpair_value_uint8_array(nvpair_t *, uint8_t **, uint_t *); ++int nvpair_value_int16_array(nvpair_t *, int16_t **, uint_t *); ++int nvpair_value_uint16_array(nvpair_t *, uint16_t **, uint_t *); ++int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *); ++int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t *); ++int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *); ++int nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *); ++int nvpair_value_string_array(nvpair_t *, char ***, uint_t *); ++int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *); ++int nvpair_value_hrtime(nvpair_t *, hrtime_t *); ++#if !defined(_KERNEL) ++int nvpair_value_double(nvpair_t *, double *); ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_NVPAIR_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/nvpair_impl.h linux-3.2.33-go/include/zfs/sys/nvpair_impl.h +--- linux-3.2.33-go.orig/include/zfs/sys/nvpair_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/nvpair_impl.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,73 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * 
Common Development and Distribution License, Version 1.0 only ++ * (the "License"). You may not use this file except in compliance ++ * with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2004 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _NVPAIR_IMPL_H ++#define _NVPAIR_IMPL_H ++ ++ ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#include ++ ++/* ++ * The structures here provided for information and debugging purposes only ++ * may be changed in the future. ++ */ ++ ++/* ++ * implementation linked list for pre-packed data ++ */ ++typedef struct i_nvp i_nvp_t; ++ ++struct i_nvp { ++ union { ++ uint64_t _nvi_align; /* ensure alignment */ ++ struct { ++ i_nvp_t *_nvi_next; /* pointer to next nvpair */ ++ i_nvp_t *_nvi_prev; /* pointer to prev nvpair */ ++ } _nvi; ++ } _nvi_un; ++ nvpair_t nvi_nvp; /* nvpair */ ++}; ++#define nvi_next _nvi_un._nvi._nvi_next ++#define nvi_prev _nvi_un._nvi._nvi_prev ++ ++typedef struct { ++ i_nvp_t *nvp_list; /* linked list of nvpairs */ ++ i_nvp_t *nvp_last; /* last nvpair */ ++ i_nvp_t *nvp_curr; /* current walker nvpair */ ++ nv_alloc_t *nvp_nva; /* pluggable allocator */ ++ uint32_t nvp_stat; /* internal state */ ++} nvpriv_t; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _NVPAIR_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/refcount.h linux-3.2.33-go/include/zfs/sys/refcount.h +--- linux-3.2.33-go.orig/include/zfs/sys/refcount.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/refcount.h 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,107 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_REFCOUNT_H ++#define _SYS_REFCOUNT_H ++ ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * If the reference is held only by the calling function and not any ++ * particular object, use FTAG (which is a string) for the holder_tag. ++ * Otherwise, use the object that holds the reference. 
++ */ ++#define FTAG ((char *)__func__) ++ ++#ifdef ZFS_DEBUG ++typedef struct reference { ++ list_node_t ref_link; ++ void *ref_holder; ++ uint64_t ref_number; ++ uint8_t *ref_removed; ++} reference_t; ++ ++typedef struct refcount { ++ kmutex_t rc_mtx; ++ list_t rc_list; ++ list_t rc_removed; ++ int64_t rc_count; ++ int64_t rc_removed_count; ++} refcount_t; ++ ++/* Note: refcount_t must be initialized with refcount_create() */ ++ ++void refcount_create(refcount_t *rc); ++void refcount_destroy(refcount_t *rc); ++void refcount_destroy_many(refcount_t *rc, uint64_t number); ++int refcount_is_zero(refcount_t *rc); ++int64_t refcount_count(refcount_t *rc); ++int64_t refcount_add(refcount_t *rc, void *holder_tag); ++int64_t refcount_remove(refcount_t *rc, void *holder_tag); ++int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag); ++int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag); ++void refcount_transfer(refcount_t *dst, refcount_t *src); ++ ++void refcount_init(void); ++void refcount_fini(void); ++ ++#else /* ZFS_DEBUG */ ++ ++typedef struct refcount { ++ uint64_t rc_count; ++} refcount_t; ++ ++#define refcount_create(rc) ((rc)->rc_count = 0) ++#define refcount_destroy(rc) ((rc)->rc_count = 0) ++#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0) ++#define refcount_is_zero(rc) ((rc)->rc_count == 0) ++#define refcount_count(rc) ((rc)->rc_count) ++#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1) ++#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1) ++#define refcount_add_many(rc, number, holder) \ ++ atomic_add_64_nv(&(rc)->rc_count, number) ++#define refcount_remove_many(rc, number, holder) \ ++ atomic_add_64_nv(&(rc)->rc_count, -number) ++#define refcount_transfer(dst, src) { \ ++ uint64_t __tmp = (src)->rc_count; \ ++ atomic_add_64(&(src)->rc_count, -__tmp); \ ++ atomic_add_64(&(dst)->rc_count, __tmp); \ ++} ++ ++#define refcount_init() ++#define refcount_fini() ++ ++#endif /* ZFS_DEBUG */ ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_REFCOUNT_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/rrwlock.h linux-3.2.33-go/include/zfs/sys/rrwlock.h +--- linux-3.2.33-go.orig/include/zfs/sys/rrwlock.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/rrwlock.h 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,80 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */ ++ ++#ifndef _SYS_RR_RW_LOCK_H ++#define _SYS_RR_RW_LOCK_H ++ ++ ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#include ++#include ++#include ++ ++/* ++ * A reader-writer lock implementation that allows re-entrant reads, but ++ * still gives writers priority on "new" reads. ++ * ++ * See rrwlock.c for more details about the implementation. ++ * ++ * Fields of the rrwlock_t structure: ++ * - rr_lock: protects modification and reading of rrwlock_t fields ++ * - rr_cv: cv for waking up readers or waiting writers ++ * - rr_writer: thread id of the current writer ++ * - rr_anon_rount: number of active anonymous readers ++ * - rr_linked_rcount: total number of non-anonymous active readers ++ * - rr_writer_wanted: a writer wants the lock ++ */ ++typedef struct rrwlock { ++ kmutex_t rr_lock; ++ kcondvar_t rr_cv; ++ kthread_t *rr_writer; ++ refcount_t rr_anon_rcount; ++ refcount_t rr_linked_rcount; ++ boolean_t rr_writer_wanted; ++} rrwlock_t; ++ ++/* ++ * 'tag' is used in reference counting tracking. The ++ * 'tag' must be the same in a rrw_enter() as in its ++ * corresponding rrw_exit(). ++ */ ++void rrw_init(rrwlock_t *rrl); ++void rrw_destroy(rrwlock_t *rrl); ++void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag); ++void rrw_exit(rrwlock_t *rrl, void *tag); ++boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); ++ ++#define RRW_READ_HELD(x) rrw_held(x, RW_READER) ++#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER) ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_RR_RW_LOCK_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/sa.h linux-3.2.33-go/include/zfs/sys/sa.h +--- linux-3.2.33-go.orig/include/zfs/sys/sa.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/sa.h 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,173 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_SA_H ++#define _SYS_SA_H ++ ++#include ++ ++/* ++ * Currently available byteswap functions. ++ * If it all possible new attributes should used ++ * one of the already defined byteswap functions. ++ * If a new byteswap function is added then the ++ * ZPL/Pool version will need to be bumped. ++ */ ++ ++typedef enum sa_bswap_type { ++ SA_UINT64_ARRAY, ++ SA_UINT32_ARRAY, ++ SA_UINT16_ARRAY, ++ SA_UINT8_ARRAY, ++ SA_ACL, ++} sa_bswap_type_t; ++ ++typedef uint16_t sa_attr_type_t; ++ ++/* ++ * Attribute to register support for. 
++ */ ++typedef struct sa_attr_reg { ++ char *sa_name; /* attribute name */ ++ uint16_t sa_length; ++ sa_bswap_type_t sa_byteswap; /* bswap functon enum */ ++ sa_attr_type_t sa_attr; /* filled in during registration */ ++} sa_attr_reg_t; ++ ++ ++typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t, ++ boolean_t, void *userptr); ++ ++/* ++ * array of attributes to store. ++ * ++ * This array should be treated as opaque/private data. ++ * The SA_BULK_ADD_ATTR() macro should be used for manipulating ++ * the array. ++ * ++ * When sa_replace_all_by_template() is used the attributes ++ * will be stored in the order defined in the array, except that ++ * the attributes may be split between the bonus and the spill buffer ++ * ++ */ ++typedef struct sa_bulk_attr { ++ void *sa_data; ++ sa_data_locator_t *sa_data_func; ++ uint16_t sa_length; ++ sa_attr_type_t sa_attr; ++ /* the following are private to the sa framework */ ++ void *sa_addr; ++ uint16_t sa_buftype; ++ uint16_t sa_size; ++} sa_bulk_attr_t; ++ ++ ++/* ++ * special macro for adding entries for bulk attr support ++ * bulk - sa_bulk_attr_t ++ * count - integer that will be incremented during each add ++ * attr - attribute to manipulate ++ * func - function for accessing data. ++ * data - pointer to data. ++ * len - length of data ++ */ ++ ++#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \ ++{ \ ++ b[idx].sa_attr = attr;\ ++ b[idx].sa_data_func = func; \ ++ b[idx].sa_data = data; \ ++ b[idx++].sa_length = len; \ ++} ++ ++typedef struct sa_os sa_os_t; ++ ++typedef enum sa_handle_type { ++ SA_HDL_SHARED, ++ SA_HDL_PRIVATE ++} sa_handle_type_t; ++ ++struct sa_handle; ++typedef void *sa_lookup_tab_t; ++typedef struct sa_handle sa_handle_t; ++ ++typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx); ++ ++int sa_handle_get(objset_t *, uint64_t, void *userp, ++ sa_handle_type_t, sa_handle_t **); ++int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp, ++ sa_handle_type_t, sa_handle_t **); ++void sa_handle_destroy(sa_handle_t *); ++int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **); ++void sa_buf_rele(dmu_buf_t *, void *); ++int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen); ++int sa_update(sa_handle_t *, sa_attr_type_t, void *buf, ++ uint32_t buflen, dmu_tx_t *); ++int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *); ++int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count); ++int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count); ++int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *); ++int sa_size(sa_handle_t *, sa_attr_type_t, int *); ++int sa_update_from_cb(sa_handle_t *, sa_attr_type_t, ++ uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *); ++void sa_object_info(sa_handle_t *, dmu_object_info_t *); ++void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); ++void sa_update_user(sa_handle_t *, sa_handle_t *); ++void *sa_get_userdata(sa_handle_t *); ++void sa_set_userp(sa_handle_t *, void *); ++dmu_buf_t *sa_get_db(sa_handle_t *); ++uint64_t sa_handle_object(sa_handle_t *); ++boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size); ++void sa_spill_rele(sa_handle_t *); ++void sa_register_update_callback(objset_t *, sa_update_cb_t *); ++int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **); ++void sa_tear_down(objset_t *); ++int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *, ++ int, dmu_tx_t *); ++int sa_replace_all_by_template_locked(sa_handle_t *, 
sa_bulk_attr_t *, ++ int, dmu_tx_t *); ++boolean_t sa_enabled(objset_t *); ++void sa_cache_init(void); ++void sa_cache_fini(void); ++void *sa_spill_alloc(int); ++void sa_spill_free(void *); ++int sa_set_sa_object(objset_t *, uint64_t); ++int sa_hdrsize(void *); ++void sa_handle_lock(sa_handle_t *); ++void sa_handle_unlock(sa_handle_t *); ++ ++#ifdef _KERNEL ++int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *); ++#endif ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_SA_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/sa_impl.h linux-3.2.33-go/include/zfs/sys/sa_impl.h +--- linux-3.2.33-go.orig/include/zfs/sys/sa_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/sa_impl.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,287 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_SA_IMPL_H ++#define _SYS_SA_IMPL_H ++ ++#include ++#include ++#include ++ ++/* ++ * Array of known attributes and their ++ * various characteristics. ++ */ ++typedef struct sa_attr_table { ++ sa_attr_type_t sa_attr; ++ uint8_t sa_registered; ++ uint16_t sa_length; ++ sa_bswap_type_t sa_byteswap; ++ char *sa_name; ++} sa_attr_table_t; ++ ++/* ++ * Zap attribute format for attribute registration ++ * ++ * 64 56 48 40 32 24 16 8 0 ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * | unused | len | bswap | attr num | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * ++ * Zap attribute format for layout information. ++ * ++ * layout information is stored as an array of attribute numbers ++ * The name of the attribute is the layout number (0, 1, 2, ...) ++ * ++ * 16 0 ++ * +---- ---+ ++ * | attr # | ++ * +--------+ ++ * | attr # | ++ * +--- ----+ ++ * ...... 
++ * ++ */ ++ ++#define ATTR_BSWAP(x) BF32_GET(x, 16, 8) ++#define ATTR_LENGTH(x) BF32_GET(x, 24, 16) ++#define ATTR_NUM(x) BF32_GET(x, 0, 16) ++#define ATTR_ENCODE(x, attr, length, bswap) \ ++{ \ ++ BF64_SET(x, 24, 16, length); \ ++ BF64_SET(x, 16, 8, bswap); \ ++ BF64_SET(x, 0, 16, attr); \ ++} ++ ++#define TOC_OFF(x) BF32_GET(x, 0, 23) ++#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1) ++#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4) ++#define TOC_ATTR_ENCODE(x, len_idx, offset) \ ++{ \ ++ BF32_SET(x, 31, 1, 1); \ ++ BF32_SET(x, 24, 7, len_idx); \ ++ BF32_SET(x, 0, 24, offset); \ ++} ++ ++#define SA_LAYOUTS "LAYOUTS" ++#define SA_REGISTRY "REGISTRY" ++ ++/* ++ * Each unique layout will have their own table ++ * sa_lot (layout_table) ++ */ ++typedef struct sa_lot { ++ avl_node_t lot_num_node; ++ avl_node_t lot_hash_node; ++ uint64_t lot_num; ++ uint64_t lot_hash; ++ sa_attr_type_t *lot_attrs; /* array of attr #'s */ ++ uint32_t lot_var_sizes; /* how many aren't fixed size */ ++ uint32_t lot_attr_count; /* total attr count */ ++ list_t lot_idx_tab; /* should be only a couple of entries */ ++ int lot_instance; /* used with lot_hash to identify entry */ ++} sa_lot_t; ++ ++/* index table of offsets */ ++typedef struct sa_idx_tab { ++ list_node_t sa_next; ++ sa_lot_t *sa_layout; ++ uint16_t *sa_variable_lengths; ++ refcount_t sa_refcount; ++ uint32_t *sa_idx_tab; /* array of offsets */ ++} sa_idx_tab_t; ++ ++/* ++ * Since the offset/index information into the actual data ++ * will usually be identical we can share that information with ++ * all handles that have the exact same offsets. ++ * ++ * You would typically only have a large number of different table of ++ * contents if you had a several variable sized attributes. ++ * ++ * Two AVL trees are used to track the attribute layout numbers. ++ * one is keyed by number and will be consulted when a DMU_OT_SA ++ * object is first read. The second tree is keyed by the hash signature ++ * of the attributes and will be consulted when an attribute is added ++ * to determine if we already have an instance of that layout. Both ++ * of these tree's are interconnected. The only difference is that ++ * when an entry is found in the "hash" tree the list of attributes will ++ * need to be compared against the list of attributes you have in hand. ++ * The assumption is that typically attributes will just be updated and ++ * adding a completely new attribute is a very rare operation. ++ */ ++struct sa_os { ++ kmutex_t sa_lock; ++ boolean_t sa_need_attr_registration; ++ boolean_t sa_force_spill; ++ uint64_t sa_master_obj; ++ uint64_t sa_reg_attr_obj; ++ uint64_t sa_layout_attr_obj; ++ int sa_num_attrs; ++ sa_attr_table_t *sa_attr_table; /* private attr table */ ++ sa_update_cb_t *sa_update_cb; ++ avl_tree_t sa_layout_num_tree; /* keyed by layout number */ ++ avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */ ++ int sa_user_table_sz; ++ sa_attr_type_t *sa_user_table; /* user name->attr mapping table */ ++}; ++ ++/* ++ * header for all bonus and spill buffers. ++ * The header has a fixed portion with a variable number ++ * of "lengths" depending on the number of variable sized ++ * attribues which are determined by the "layout number" ++ */ ++ ++#define SA_MAGIC 0x2F505A /* ZFS SA */ ++typedef struct sa_hdr_phys { ++ uint32_t sa_magic; ++ uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */ ++ uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */ ++ /* ... Data follows the lengths. 
*/ ++} sa_hdr_phys_t; ++ ++/* ++ * sa_hdr_phys -> sa_layout_info ++ * ++ * 16 10 0 ++ * +--------+-------+ ++ * | hdrsz |layout | ++ * +--------+-------+ ++ * ++ * Bits 0-10 are the layout number ++ * Bits 11-16 are the size of the header. ++ * The hdrsize is the number * 8 ++ * ++ * For example. ++ * hdrsz of 1 ==> 8 byte header ++ * 2 ==> 16 byte header ++ * ++ */ ++ ++#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) ++#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0) ++#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ ++{ \ ++ BF32_SET_SB(x, 10, 6, 3, 0, size); \ ++ BF32_SET(x, 0, 10, num); \ ++} ++ ++typedef enum sa_buf_type { ++ SA_BONUS = 1, ++ SA_SPILL = 2 ++} sa_buf_type_t; ++ ++typedef enum sa_data_op { ++ SA_LOOKUP, ++ SA_UPDATE, ++ SA_ADD, ++ SA_REPLACE, ++ SA_REMOVE ++} sa_data_op_t; ++ ++/* ++ * Opaque handle used for most sa functions ++ * ++ * This needs to be kept as small as possible. ++ */ ++ ++struct sa_handle { ++ kmutex_t sa_lock; ++ dmu_buf_t *sa_bonus; ++ dmu_buf_t *sa_spill; ++ objset_t *sa_os; ++ void *sa_userp; ++ sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ ++ sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ ++}; ++ ++#define SA_GET_DB(hdl, type) \ ++ (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill) ++ ++#define SA_GET_HDR(hdl, type) \ ++ ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \ ++ type))->db.db_data)) ++ ++#define SA_IDX_TAB_GET(hdl, type) \ ++ (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab) ++ ++#define IS_SA_BONUSTYPE(a) \ ++ ((a == DMU_OT_SA) ? B_TRUE : B_FALSE) ++ ++#define SA_BONUSTYPE_FROM_DB(db) \ ++ (dmu_get_bonustype((dmu_buf_t *)db)) ++ ++#define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t)) ++ ++#define SA_LAYOUT_NUM(x, type) \ ++ ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \ ++ ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x)))) ++ ++ ++#define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length ++ ++#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\ ++ hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \ ++ SA_REGISTERED_LEN(sa, attr)) ++ ++#define SA_SET_HDR(hdr, num, size) \ ++ { \ ++ hdr->sa_magic = SA_MAGIC; \ ++ SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \ ++ } ++ ++#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \ ++ { \ ++ bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \ ++ bulk.sa_buftype = type; \ ++ bulk.sa_addr = \ ++ (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \ ++ (uintptr_t)hdr); \ ++} ++ ++#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \ ++ (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \ ++ (tb->lot_var_sizes > 1 ? 
P2ROUNDUP((tb->lot_var_sizes - 1) * \ ++ sizeof (uint16_t), 8) : 0))) ++ ++int sa_add_impl(sa_handle_t *, sa_attr_type_t, ++ uint32_t, sa_data_locator_t, void *, dmu_tx_t *); ++ ++void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *); ++int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *); ++ ++void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *); ++int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t, ++ uint16_t *, sa_hdr_phys_t *); ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_SA_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/spa_boot.h linux-3.2.33-go/include/zfs/sys/spa_boot.h +--- linux-3.2.33-go.orig/include/zfs/sys/spa_boot.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/spa_boot.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,42 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _SYS_SPA_BOOT_H ++#define _SYS_SPA_BOOT_H ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++extern char *spa_get_bootprop(char *prop); ++extern void spa_free_bootprop(char *prop); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_SPA_BOOT_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/space_map.h linux-3.2.33-go/include/zfs/sys/space_map.h +--- linux-3.2.33-go.orig/include/zfs/sys/space_map.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/space_map.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,179 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */ ++ ++#ifndef _SYS_SPACE_MAP_H ++#define _SYS_SPACE_MAP_H ++ ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++typedef struct space_map_ops space_map_ops_t; ++ ++typedef struct space_map { ++ avl_tree_t sm_root; /* AVL tree of map segments */ ++ uint64_t sm_space; /* sum of all segments in the map */ ++ uint64_t sm_start; /* start of map */ ++ uint64_t sm_size; /* size of map */ ++ uint8_t sm_shift; /* unit shift */ ++ uint8_t sm_pad[3]; /* unused */ ++ uint8_t sm_loaded; /* map loaded? */ ++ uint8_t sm_loading; /* map loading? */ ++ kcondvar_t sm_load_cv; /* map load completion */ ++ space_map_ops_t *sm_ops; /* space map block picker ops vector */ ++ avl_tree_t *sm_pp_root; /* picker-private AVL tree */ ++ void *sm_ppd; /* picker-private data */ ++ kmutex_t *sm_lock; /* pointer to lock that protects map */ ++} space_map_t; ++ ++typedef struct space_seg { ++ avl_node_t ss_node; /* AVL node */ ++ avl_node_t ss_pp_node; /* AVL picker-private node */ ++ uint64_t ss_start; /* starting offset of this segment */ ++ uint64_t ss_end; /* ending offset (non-inclusive) */ ++} space_seg_t; ++ ++typedef struct space_ref { ++ avl_node_t sr_node; /* AVL node */ ++ uint64_t sr_offset; /* offset (start or end) */ ++ int64_t sr_refcnt; /* associated reference count */ ++} space_ref_t; ++ ++typedef struct space_map_obj { ++ uint64_t smo_object; /* on-disk space map object */ ++ uint64_t smo_objsize; /* size of the object */ ++ uint64_t smo_alloc; /* space allocated from the map */ ++} space_map_obj_t; ++ ++struct space_map_ops { ++ void (*smop_load)(space_map_t *sm); ++ void (*smop_unload)(space_map_t *sm); ++ uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size); ++ void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); ++ void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); ++ uint64_t (*smop_max)(space_map_t *sm); ++ boolean_t (*smop_fragmented)(space_map_t *sm); ++}; ++ ++/* ++ * debug entry ++ * ++ * 1 3 10 50 ++ * ,---+--------+------------+---------------------------------. ++ * | 1 | action | syncpass | txg (lower bits) | ++ * `---+--------+------------+---------------------------------' ++ * 63 62 60 59 50 49 0 ++ * ++ * ++ * ++ * non-debug entry ++ * ++ * 1 47 1 15 ++ * ,-----------------------------------------------------------. ++ * | 0 | offset (sm_shift units) | type | run | ++ * `-----------------------------------------------------------' ++ * 63 62 17 16 15 0 ++ */ ++ ++/* All this stuff takes and returns bytes */ ++#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1) ++#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15) ++#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) ++#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) ++#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47) ++#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47) ++#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1) ++#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1) ++ ++#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3) ++#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3) ++ ++#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) ++#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) ++ ++#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) ++#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) ++ ++#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) ++ ++#define SM_ALLOC 0x0 ++#define SM_FREE 0x1 ++ ++/* ++ * The data for a given space map can be kept on blocks of any size. 
++ * Larger blocks entail fewer i/o operations, but they also cause the ++ * DMU to keep more data in-core, and also to waste more i/o bandwidth ++ * when only a few blocks have changed since the last transaction group. ++ * This could use a lot more research, but for now, set the freelist ++ * block size to 4k (2^12). ++ */ ++#define SPACE_MAP_BLOCKSHIFT 12 ++ ++typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size); ++ ++extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size, ++ uint8_t shift, kmutex_t *lp); ++extern void space_map_destroy(space_map_t *sm); ++extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); ++extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); ++extern boolean_t space_map_contains(space_map_t *sm, ++ uint64_t start, uint64_t size); ++extern void space_map_vacate(space_map_t *sm, ++ space_map_func_t *func, space_map_t *mdest); ++extern void space_map_walk(space_map_t *sm, ++ space_map_func_t *func, space_map_t *mdest); ++ ++extern void space_map_load_wait(space_map_t *sm); ++extern int space_map_load(space_map_t *sm, space_map_ops_t *ops, ++ uint8_t maptype, space_map_obj_t *smo, objset_t *os); ++extern void space_map_unload(space_map_t *sm); ++ ++extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size); ++extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size); ++extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size); ++extern uint64_t space_map_maxsize(space_map_t *sm); ++ ++extern void space_map_sync(space_map_t *sm, uint8_t maptype, ++ space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); ++extern void space_map_truncate(space_map_obj_t *smo, ++ objset_t *os, dmu_tx_t *tx); ++ ++extern void space_map_ref_create(avl_tree_t *t); ++extern void space_map_ref_destroy(avl_tree_t *t); ++extern void space_map_ref_add_seg(avl_tree_t *t, ++ uint64_t start, uint64_t end, int64_t refcnt); ++extern void space_map_ref_add_map(avl_tree_t *t, ++ space_map_t *sm, int64_t refcnt); ++extern void space_map_ref_generate_map(avl_tree_t *t, ++ space_map_t *sm, int64_t minref); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_SPACE_MAP_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/spa.h linux-3.2.33-go/include/zfs/sys/spa.h +--- linux-3.2.33-go.orig/include/zfs/sys/spa.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/spa.h 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,718 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ * Copyright 2011 Nexenta Systems, Inc. 
All rights reserved. ++ */ ++ ++#ifndef _SYS_SPA_H ++#define _SYS_SPA_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * Forward references that lots of things need. ++ */ ++typedef struct spa spa_t; ++typedef struct vdev vdev_t; ++typedef struct metaslab metaslab_t; ++typedef struct metaslab_group metaslab_group_t; ++typedef struct metaslab_class metaslab_class_t; ++typedef struct zio zio_t; ++typedef struct zilog zilog_t; ++typedef struct spa_aux_vdev spa_aux_vdev_t; ++typedef struct ddt ddt_t; ++typedef struct ddt_entry ddt_entry_t; ++struct dsl_pool; ++ ++/* ++ * General-purpose 32-bit and 64-bit bitfield encodings. ++ */ ++#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) ++#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) ++#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) ++#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) ++ ++#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) ++#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) ++ ++#define BF32_SET(x, low, len, val) \ ++ ((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len)) ++#define BF64_SET(x, low, len, val) \ ++ ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)) ++ ++#define BF32_GET_SB(x, low, len, shift, bias) \ ++ ((BF32_GET(x, low, len) + (bias)) << (shift)) ++#define BF64_GET_SB(x, low, len, shift, bias) \ ++ ((BF64_GET(x, low, len) + (bias)) << (shift)) ++ ++#define BF32_SET_SB(x, low, len, shift, bias, val) \ ++ BF32_SET(x, low, len, ((val) >> (shift)) - (bias)) ++#define BF64_SET_SB(x, low, len, shift, bias, val) \ ++ BF64_SET(x, low, len, ((val) >> (shift)) - (bias)) ++ ++/* ++ * We currently support nine block sizes, from 512 bytes to 128K. ++ * We could go higher, but the benefits are near-zero and the cost ++ * of COWing a giant block to modify one byte would become excessive. ++ */ ++#define SPA_MINBLOCKSHIFT 9 ++#define SPA_MAXBLOCKSHIFT 17 ++#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) ++#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) ++ ++#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1) ++ ++/* ++ * Size of block to hold the configuration data (a packed nvlist) ++ */ ++#define SPA_CONFIG_BLOCKSIZE (1 << 14) ++ ++/* ++ * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. ++ * The ASIZE encoding should be at least 64 times larger (6 more bits) ++ * to support up to 4-way RAID-Z mirror mode with worst-case gang block ++ * overhead, three DVAs per bp, plus one more bit in case we do anything ++ * else that expands the ASIZE. ++ */ ++#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ ++#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ ++#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ ++ ++/* ++ * All SPA data is represented by 128-bit data virtual addresses (DVAs). ++ * The members of the dva_t should be considered opaque outside the SPA. ++ */ ++typedef struct dva { ++ uint64_t dva_word[2]; ++} dva_t; ++ ++/* ++ * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. ++ */ ++typedef struct zio_cksum { ++ uint64_t zc_word[4]; ++} zio_cksum_t; ++ ++/* ++ * Each block is described by its DVAs, time of birth, checksum, etc. 
++ * The word-by-word, bit-by-bit layout of the blkptr is as follows: ++ * ++ * 64 56 48 40 32 24 16 8 0 ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * 0 | vdev1 | GRID | ASIZE | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * 1 |G| offset1 | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * 2 | vdev2 | GRID | ASIZE | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * 3 |G| offset2 | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * 4 | vdev3 | GRID | ASIZE | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * 5 |G| offset3 | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * 7 | padding | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * 8 | padding | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * 9 | physical birth txg | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * a | logical birth txg | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * b | fill count | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * c | checksum[0] | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * d | checksum[1] | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * e | checksum[2] | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * f | checksum[3] | ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * ++ * Legend: ++ * ++ * vdev virtual device ID ++ * offset offset into virtual device ++ * LSIZE logical size ++ * PSIZE physical size (after compression) ++ * ASIZE allocated size (including RAID-Z parity and gang block headers) ++ * GRID RAID-Z layout information (reserved for future use) ++ * cksum checksum function ++ * comp compression function ++ * G gang block indicator ++ * B byteorder (endianness) ++ * D dedup ++ * X unused ++ * lvl level of indirection ++ * type DMU object type ++ * phys birth txg of block allocation; zero if same as logical birth txg ++ * log. birth transaction group in which the block was logically born ++ * fill count number of non-zero blocks under this bp ++ * checksum[4] 256-bit checksum of the data this bp describes ++ */ ++#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ ++#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ ++ ++typedef struct blkptr { ++ dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ ++ uint64_t blk_prop; /* size, compression, type, etc */ ++ uint64_t blk_pad[2]; /* Extra space for the future */ ++ uint64_t blk_phys_birth; /* txg when block was allocated */ ++ uint64_t blk_birth; /* transaction group at birth */ ++ uint64_t blk_fill; /* fill count */ ++ zio_cksum_t blk_cksum; /* 256-bit checksum */ ++} blkptr_t; ++ ++/* ++ * Macros to get and set fields in a bp or DVA. 
++ */ ++#define DVA_GET_ASIZE(dva) \ ++ BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0) ++#define DVA_SET_ASIZE(dva, x) \ ++ BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x) ++ ++#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) ++#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) ++ ++#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) ++#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) ++ ++#define DVA_GET_OFFSET(dva) \ ++ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) ++#define DVA_SET_OFFSET(dva, x) \ ++ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) ++ ++#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) ++#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) ++ ++#define BP_GET_LSIZE(bp) \ ++ BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) ++#define BP_SET_LSIZE(bp, x) \ ++ BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) ++ ++#define BP_GET_PSIZE(bp) \ ++ BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) ++#define BP_SET_PSIZE(bp, x) \ ++ BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) ++ ++#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) ++#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) ++ ++#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) ++#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) ++ ++#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) ++#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) ++ ++#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) ++#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) ++ ++#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1) ++#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) ++ ++#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) ++#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) ++ ++#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) ++#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) ++ ++#define BP_PHYSICAL_BIRTH(bp) \ ++ ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) ++ ++#define BP_SET_BIRTH(bp, logical, physical) \ ++{ \ ++ (bp)->blk_birth = (logical); \ ++ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ ++} ++ ++#define BP_GET_ASIZE(bp) \ ++ (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ ++ DVA_GET_ASIZE(&(bp)->blk_dva[2])) ++ ++#define BP_GET_UCSIZE(bp) \ ++ ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? 
\ ++ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) ++ ++#define BP_GET_NDVAS(bp) \ ++ (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ ++ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ ++ !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) ++ ++#define BP_COUNT_GANG(bp) \ ++ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ ++ DVA_GET_GANG(&(bp)->blk_dva[1]) + \ ++ DVA_GET_GANG(&(bp)->blk_dva[2])) ++ ++#define DVA_EQUAL(dva1, dva2) \ ++ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ ++ (dva1)->dva_word[0] == (dva2)->dva_word[0]) ++ ++#define BP_EQUAL(bp1, bp2) \ ++ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ ++ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ ++ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ ++ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) ++ ++#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ ++ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ ++ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ ++ ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ ++ ((zc1).zc_word[3] - (zc2).zc_word[3]))) ++ ++#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) ++ ++#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ ++{ \ ++ (zcp)->zc_word[0] = w0; \ ++ (zcp)->zc_word[1] = w1; \ ++ (zcp)->zc_word[2] = w2; \ ++ (zcp)->zc_word[3] = w3; \ ++} ++ ++#define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) ++#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) ++#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) ++ ++/* BP_IS_RAIDZ(bp) assumes no block compression */ ++#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ ++ BP_GET_PSIZE(bp)) ++ ++#define BP_ZERO(bp) \ ++{ \ ++ (bp)->blk_dva[0].dva_word[0] = 0; \ ++ (bp)->blk_dva[0].dva_word[1] = 0; \ ++ (bp)->blk_dva[1].dva_word[0] = 0; \ ++ (bp)->blk_dva[1].dva_word[1] = 0; \ ++ (bp)->blk_dva[2].dva_word[0] = 0; \ ++ (bp)->blk_dva[2].dva_word[1] = 0; \ ++ (bp)->blk_prop = 0; \ ++ (bp)->blk_pad[0] = 0; \ ++ (bp)->blk_pad[1] = 0; \ ++ (bp)->blk_phys_birth = 0; \ ++ (bp)->blk_birth = 0; \ ++ (bp)->blk_fill = 0; \ ++ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ ++} ++ ++/* ++ * Note: the byteorder is either 0 or -1, both of which are palindromes. ++ * This simplifies the endianness handling a bit. ++ */ ++#ifdef _BIG_ENDIAN ++#define ZFS_HOST_BYTEORDER (0ULL) ++#else ++#define ZFS_HOST_BYTEORDER (-1ULL) ++#endif ++ ++#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) ++ ++#define BP_SPRINTF_LEN 320 ++ ++/* ++ * This macro allows code sharing between zfs, libzpool, and mdb. ++ * 'func' is either snprintf() or mdb_snprintf(). ++ * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. 
++ */ ++#define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \ ++{ \ ++ static const char *copyname[] = \ ++ { "zero", "single", "double", "triple" }; \ ++ int size = BP_SPRINTF_LEN; \ ++ int len = 0; \ ++ int copies = 0; \ ++ int d; \ ++ \ ++ if (bp == NULL) { \ ++ len = func(buf + len, size - len, ""); \ ++ } else if (BP_IS_HOLE(bp)) { \ ++ len = func(buf + len, size - len, ""); \ ++ } else { \ ++ for (d = 0; d < BP_GET_NDVAS(bp); d++) { \ ++ const dva_t *dva = &bp->blk_dva[d]; \ ++ if (DVA_IS_VALID(dva)) \ ++ copies++; \ ++ len += func(buf + len, size - len, \ ++ "DVA[%d]=<%llu:%llx:%llx>%c", d, \ ++ (u_longlong_t)DVA_GET_VDEV(dva), \ ++ (u_longlong_t)DVA_GET_OFFSET(dva), \ ++ (u_longlong_t)DVA_GET_ASIZE(dva), \ ++ ws); \ ++ } \ ++ if (BP_IS_GANG(bp) && \ ++ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ ++ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ ++ copies--; \ ++ len += func(buf + len, size - len, \ ++ "[L%llu %s] %s %s %s %s %s %s%c" \ ++ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ ++ "cksum=%llx:%llx:%llx:%llx", \ ++ (u_longlong_t)BP_GET_LEVEL(bp), \ ++ type, \ ++ checksum, \ ++ compress, \ ++ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ ++ BP_IS_GANG(bp) ? "gang" : "contiguous", \ ++ BP_GET_DEDUP(bp) ? "dedup" : "unique", \ ++ copyname[copies], \ ++ ws, \ ++ (u_longlong_t)BP_GET_LSIZE(bp), \ ++ (u_longlong_t)BP_GET_PSIZE(bp), \ ++ (u_longlong_t)bp->blk_birth, \ ++ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ ++ (u_longlong_t)bp->blk_fill, \ ++ ws, \ ++ (u_longlong_t)bp->blk_cksum.zc_word[0], \ ++ (u_longlong_t)bp->blk_cksum.zc_word[1], \ ++ (u_longlong_t)bp->blk_cksum.zc_word[2], \ ++ (u_longlong_t)bp->blk_cksum.zc_word[3]); \ ++ } \ ++ ASSERT(len < size); \ ++} ++ ++#include ++ ++#define BP_GET_BUFC_TYPE(bp) \ ++ (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \ ++ ARC_BUFC_METADATA : ARC_BUFC_DATA); ++ ++typedef enum spa_import_type { ++ SPA_IMPORT_EXISTING, ++ SPA_IMPORT_ASSEMBLE ++} spa_import_type_t; ++ ++/* state manipulation functions */ ++extern int spa_open(const char *pool, spa_t **, void *tag); ++extern int spa_open_rewind(const char *pool, spa_t **, void *tag, ++ nvlist_t *policy, nvlist_t **config); ++extern int spa_get_stats(const char *pool, nvlist_t **config, ++ char *altroot, size_t buflen); ++extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, ++ const char *history_str, nvlist_t *zplprops); ++extern int spa_import_rootpool(char *devpath, char *devid); ++extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, ++ uint64_t flags); ++extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); ++extern int spa_destroy(char *pool); ++extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, ++ boolean_t hardforce); ++extern int spa_reset(char *pool); ++extern void spa_async_request(spa_t *spa, int flag); ++extern void spa_async_unrequest(spa_t *spa, int flag); ++extern void spa_async_suspend(spa_t *spa); ++extern void spa_async_resume(spa_t *spa); ++extern spa_t *spa_inject_addref(char *pool); ++extern void spa_inject_delref(spa_t *spa); ++extern void spa_scan_stat_init(spa_t *spa); ++extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); ++ ++#define SPA_ASYNC_CONFIG_UPDATE 0x01 ++#define SPA_ASYNC_REMOVE 0x02 ++#define SPA_ASYNC_PROBE 0x04 ++#define SPA_ASYNC_RESILVER_DONE 0x08 ++#define SPA_ASYNC_RESILVER 0x10 ++#define SPA_ASYNC_AUTOEXPAND 0x20 ++#define SPA_ASYNC_REMOVE_DONE 0x40 ++#define SPA_ASYNC_REMOVE_STOP 0x80 ++ ++/* ++ * Controls the behavior of spa_vdev_remove(). 
++ */ ++#define SPA_REMOVE_UNSPARE 0x01 ++#define SPA_REMOVE_DONE 0x02 ++ ++/* device manipulation */ ++extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); ++extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, ++ int replacing); ++extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, ++ int replace_done); ++extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); ++extern boolean_t spa_vdev_remove_active(spa_t *spa); ++extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); ++extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); ++extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, ++ nvlist_t *props, boolean_t exp); ++ ++/* spare state (which is global across all pools) */ ++extern void spa_spare_add(vdev_t *vd); ++extern void spa_spare_remove(vdev_t *vd); ++extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); ++extern void spa_spare_activate(vdev_t *vd); ++ ++/* L2ARC state (which is global across all pools) */ ++extern void spa_l2cache_add(vdev_t *vd); ++extern void spa_l2cache_remove(vdev_t *vd); ++extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); ++extern void spa_l2cache_activate(vdev_t *vd); ++extern void spa_l2cache_drop(spa_t *spa); ++ ++/* scanning */ ++extern int spa_scan(spa_t *spa, pool_scan_func_t func); ++extern int spa_scan_stop(spa_t *spa); ++ ++/* spa syncing */ ++extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ ++extern void spa_sync_allpools(void); ++ ++/* ++ * DEFERRED_FREE must be large enough that regular blocks are not ++ * deferred. XXX so can't we change it back to 1? ++ */ ++#define SYNC_PASS_DEFERRED_FREE 2 /* defer frees after this pass */ ++#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ ++#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ ++ ++/* spa namespace global mutex */ ++extern kmutex_t spa_namespace_lock; ++ ++/* ++ * SPA configuration functions in spa_config.c ++ */ ++ ++#define SPA_CONFIG_UPDATE_POOL 0 ++#define SPA_CONFIG_UPDATE_VDEVS 1 ++ ++extern void spa_config_sync(spa_t *, boolean_t, boolean_t); ++extern void spa_config_load(void); ++extern nvlist_t *spa_all_configs(uint64_t *); ++extern void spa_config_set(spa_t *spa, nvlist_t *config); ++extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, ++ int getstats); ++extern void spa_config_update(spa_t *spa, int what); ++ ++/* ++ * Miscellaneous SPA routines in spa_misc.c ++ */ ++ ++/* Namespace manipulation */ ++extern spa_t *spa_lookup(const char *name); ++extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); ++extern void spa_remove(spa_t *spa); ++extern spa_t *spa_next(spa_t *prev); ++ ++/* Refcount functions */ ++extern void spa_open_ref(spa_t *spa, void *tag); ++extern void spa_close(spa_t *spa, void *tag); ++extern boolean_t spa_refcount_zero(spa_t *spa); ++ ++#define SCL_NONE 0x00 ++#define SCL_CONFIG 0x01 ++#define SCL_STATE 0x02 ++#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ ++#define SCL_ALLOC 0x08 ++#define SCL_ZIO 0x10 ++#define SCL_FREE 0x20 ++#define SCL_VDEV 0x40 ++#define SCL_LOCKS 7 ++#define SCL_ALL ((1 << SCL_LOCKS) - 1) ++#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) ++ ++/* Pool configuration locks */ ++extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); ++extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); ++extern void spa_config_exit(spa_t 
*spa, int locks, void *tag); ++extern int spa_config_held(spa_t *spa, int locks, krw_t rw); ++ ++/* Pool vdev add/remove lock */ ++extern uint64_t spa_vdev_enter(spa_t *spa); ++extern uint64_t spa_vdev_config_enter(spa_t *spa); ++extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, ++ int error, char *tag); ++extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); ++ ++/* Pool vdev state change lock */ ++extern void spa_vdev_state_enter(spa_t *spa, int oplock); ++extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); ++ ++/* Log state */ ++typedef enum spa_log_state { ++ SPA_LOG_UNKNOWN = 0, /* unknown log state */ ++ SPA_LOG_MISSING, /* missing log(s) */ ++ SPA_LOG_CLEAR, /* clear the log(s) */ ++ SPA_LOG_GOOD, /* log(s) are good */ ++} spa_log_state_t; ++ ++extern spa_log_state_t spa_get_log_state(spa_t *spa); ++extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); ++extern int spa_offline_log(spa_t *spa); ++ ++/* Log claim callback */ ++extern void spa_claim_notify(zio_t *zio); ++ ++/* Accessor functions */ ++extern boolean_t spa_shutting_down(spa_t *spa); ++extern struct dsl_pool *spa_get_dsl(spa_t *spa); ++extern blkptr_t *spa_get_rootblkptr(spa_t *spa); ++extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); ++extern void spa_altroot(spa_t *, char *, size_t); ++extern int spa_sync_pass(spa_t *spa); ++extern char *spa_name(spa_t *spa); ++extern uint64_t spa_guid(spa_t *spa); ++extern uint64_t spa_load_guid(spa_t *spa); ++extern uint64_t spa_last_synced_txg(spa_t *spa); ++extern uint64_t spa_first_txg(spa_t *spa); ++extern uint64_t spa_syncing_txg(spa_t *spa); ++extern uint64_t spa_version(spa_t *spa); ++extern pool_state_t spa_state(spa_t *spa); ++extern spa_load_state_t spa_load_state(spa_t *spa); ++extern uint64_t spa_freeze_txg(spa_t *spa); ++extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); ++extern uint64_t spa_get_dspace(spa_t *spa); ++extern void spa_update_dspace(spa_t *spa); ++extern uint64_t spa_version(spa_t *spa); ++extern boolean_t spa_deflate(spa_t *spa); ++extern metaslab_class_t *spa_normal_class(spa_t *spa); ++extern metaslab_class_t *spa_log_class(spa_t *spa); ++extern int spa_max_replication(spa_t *spa); ++extern int spa_prev_software_version(spa_t *spa); ++extern int spa_busy(void); ++extern uint8_t spa_get_failmode(spa_t *spa); ++extern boolean_t spa_suspended(spa_t *spa); ++extern uint64_t spa_bootfs(spa_t *spa); ++extern uint64_t spa_delegation(spa_t *spa); ++extern objset_t *spa_meta_objset(spa_t *spa); ++ ++/* Miscellaneous support routines */ ++extern int spa_rename(const char *oldname, const char *newname); ++extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); ++extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); ++extern char *spa_strdup(const char *); ++extern void spa_strfree(char *); ++extern uint64_t spa_get_random(uint64_t range); ++extern uint64_t spa_generate_guid(spa_t *spa); ++extern void sprintf_blkptr(char *buf, const blkptr_t *bp); ++extern void spa_freeze(spa_t *spa); ++extern int spa_change_guid(spa_t *spa); ++extern void spa_upgrade(spa_t *spa, uint64_t version); ++extern void spa_evict_all(void); ++extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, ++ boolean_t l2cache); ++extern boolean_t spa_has_spare(spa_t *, uint64_t guid); ++extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); ++extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); ++extern uint64_t bp_get_dsize(spa_t *spa, const 
blkptr_t *bp); ++extern boolean_t spa_has_slogs(spa_t *spa); ++extern boolean_t spa_is_root(spa_t *spa); ++extern boolean_t spa_writeable(spa_t *spa); ++ ++extern int spa_mode(spa_t *spa); ++extern uint64_t strtonum(const char *str, char **nptr); ++ ++/* history logging */ ++typedef enum history_log_type { ++ LOG_CMD_POOL_CREATE, ++ LOG_CMD_NORMAL, ++ LOG_INTERNAL ++} history_log_type_t; ++ ++typedef struct history_arg { ++ char *ha_history_str; ++ history_log_type_t ha_log_type; ++ history_internal_events_t ha_event; ++ char *ha_zone; ++ uid_t ha_uid; ++} history_arg_t; ++ ++extern char *spa_his_ievent_table[]; ++ ++extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); ++extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, ++ char *his_buf); ++extern int spa_history_log(spa_t *spa, const char *his_buf, ++ history_log_type_t what); ++extern void spa_history_log_internal(history_internal_events_t event, ++ spa_t *spa, dmu_tx_t *tx, const char *fmt, ...); ++extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt); ++ ++/* error handling */ ++struct zbookmark; ++extern void spa_log_error(spa_t *spa, zio_t *zio); ++extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, ++ zio_t *zio, uint64_t stateoroffset, uint64_t length); ++extern void zfs_post_remove(spa_t *spa, vdev_t *vd); ++extern void zfs_post_state_change(spa_t *spa, vdev_t *vd); ++extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); ++extern uint64_t spa_get_errlog_size(spa_t *spa); ++extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); ++extern void spa_errlog_rotate(spa_t *spa); ++extern void spa_errlog_drain(spa_t *spa); ++extern void spa_errlog_sync(spa_t *spa, uint64_t txg); ++extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); ++ ++/* vdev cache */ ++extern void vdev_cache_stat_init(void); ++extern void vdev_cache_stat_fini(void); ++ ++/* Initialization and termination */ ++extern void spa_init(int flags); ++extern void spa_fini(void); ++extern void spa_boot_init(void); ++ ++/* properties */ ++extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); ++extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); ++extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); ++extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); ++ ++/* asynchronous event notification */ ++extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); ++ ++#ifdef ZFS_DEBUG ++#define dprintf_bp(bp, fmt, ...) do { \ ++ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ ++ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_PUSHPAGE); \ ++ sprintf_blkptr(__blkbuf, (bp)); \ ++ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ ++ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ ++ } \ ++_NOTE(CONSTCOND) } while (0) ++#else ++#define dprintf_bp(bp, fmt, ...) ++#endif ++ ++extern boolean_t spa_debug_enabled(spa_t *spa); ++#define spa_dbgmsg(spa, ...) \ ++{ \ ++ if (spa_debug_enabled(spa)) \ ++ zfs_dbgmsg(__VA_ARGS__); \ ++} ++ ++extern int spa_mode_global; /* mode, e.g. 
FREAD | FWRITE */ ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_SPA_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/spa_impl.h linux-3.2.33-go/include/zfs/sys/spa_impl.h +--- linux-3.2.33-go.orig/include/zfs/sys/spa_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/spa_impl.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,240 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2011 by Delphix. All rights reserved. ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ */ ++ ++#ifndef _SYS_SPA_IMPL_H ++#define _SYS_SPA_IMPL_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++typedef struct spa_error_entry { ++ zbookmark_t se_bookmark; ++ char *se_name; ++ avl_node_t se_avl; ++} spa_error_entry_t; ++ ++typedef struct spa_history_phys { ++ uint64_t sh_pool_create_len; /* ending offset of zpool create */ ++ uint64_t sh_phys_max_off; /* physical EOF */ ++ uint64_t sh_bof; /* logical BOF */ ++ uint64_t sh_eof; /* logical EOF */ ++ uint64_t sh_records_lost; /* num of records overwritten */ ++} spa_history_phys_t; ++ ++struct spa_aux_vdev { ++ uint64_t sav_object; /* MOS object for device list */ ++ nvlist_t *sav_config; /* cached device config */ ++ vdev_t **sav_vdevs; /* devices */ ++ int sav_count; /* number devices */ ++ boolean_t sav_sync; /* sync the device list */ ++ nvlist_t **sav_pending; /* pending device additions */ ++ uint_t sav_npending; /* # pending devices */ ++}; ++ ++typedef struct spa_config_lock { ++ kmutex_t scl_lock; ++ kthread_t *scl_writer; ++ int scl_write_wanted; ++ kcondvar_t scl_cv; ++ refcount_t scl_count; ++} spa_config_lock_t; ++ ++typedef struct spa_config_dirent { ++ list_node_t scd_link; ++ char *scd_path; ++} spa_config_dirent_t; ++ ++enum zio_taskq_type { ++ ZIO_TASKQ_ISSUE = 0, ++ ZIO_TASKQ_ISSUE_HIGH, ++ ZIO_TASKQ_INTERRUPT, ++ ZIO_TASKQ_INTERRUPT_HIGH, ++ ZIO_TASKQ_TYPES ++}; ++ ++/* ++ * State machine for the zpool-pooname process. 
The states transitions ++ * are done as follows: ++ * ++ * From To Routine ++ * PROC_NONE -> PROC_CREATED spa_activate() ++ * PROC_CREATED -> PROC_ACTIVE spa_thread() ++ * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate() ++ * PROC_DEACTIVATE -> PROC_GONE spa_thread() ++ * PROC_GONE -> PROC_NONE spa_deactivate() ++ */ ++typedef enum spa_proc_state { ++ SPA_PROC_NONE, /* spa_proc = &p0, no process created */ ++ SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */ ++ SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */ ++ SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */ ++ SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ ++} spa_proc_state_t; ++ ++struct spa { ++ /* ++ * Fields protected by spa_namespace_lock. ++ */ ++ char spa_name[MAXNAMELEN]; /* pool name */ ++ char *spa_comment; /* comment */ ++ avl_node_t spa_avl; /* node in spa_namespace_avl */ ++ nvlist_t *spa_config; /* last synced config */ ++ nvlist_t *spa_config_syncing; /* currently syncing config */ ++ nvlist_t *spa_config_splitting; /* config for splitting */ ++ nvlist_t *spa_load_info; /* info and errors from load */ ++ uint64_t spa_config_txg; /* txg of last config change */ ++ int spa_sync_pass; /* iterate-to-convergence */ ++ pool_state_t spa_state; /* pool state */ ++ int spa_inject_ref; /* injection references */ ++ uint8_t spa_sync_on; /* sync threads are running */ ++ spa_load_state_t spa_load_state; /* current load operation */ ++ uint64_t spa_import_flags; /* import specific flags */ ++ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; ++ dsl_pool_t *spa_dsl_pool; ++ metaslab_class_t *spa_normal_class; /* normal data class */ ++ metaslab_class_t *spa_log_class; /* intent log data class */ ++ uint64_t spa_first_txg; /* first txg after spa_open() */ ++ uint64_t spa_final_txg; /* txg of export/destroy */ ++ uint64_t spa_freeze_txg; /* freeze pool at this txg */ ++ uint64_t spa_load_max_txg; /* best initial ub_txg */ ++ uint64_t spa_claim_max_txg; /* highest claimed birth txg */ ++ timespec_t spa_loaded_ts; /* 1st successful open time */ ++ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ ++ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ ++ vdev_t *spa_root_vdev; /* top-level vdev container */ ++ uint64_t spa_config_guid; /* config pool guid */ ++ uint64_t spa_load_guid; /* spa_load initialized guid */ ++ list_t spa_config_dirty_list; /* vdevs with dirty config */ ++ list_t spa_state_dirty_list; /* vdevs with dirty state */ ++ spa_aux_vdev_t spa_spares; /* hot spares */ ++ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ ++ uint64_t spa_config_object; /* MOS object for pool config */ ++ uint64_t spa_config_generation; /* config generation number */ ++ uint64_t spa_syncing_txg; /* txg currently syncing */ ++ bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ ++ bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ ++ uberblock_t spa_ubsync; /* last synced uberblock */ ++ uberblock_t spa_uberblock; /* current uberblock */ ++ boolean_t spa_extreme_rewind; /* rewind past deferred frees */ ++ uint64_t spa_last_io; /* lbolt of last non-scan I/O */ ++ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ ++ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ ++ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ ++ uint8_t spa_scrub_active; /* active or suspended? 
*/ ++ uint8_t spa_scrub_type; /* type of scrub we're doing */ ++ uint8_t spa_scrub_finished; /* indicator to rotate logs */ ++ uint8_t spa_scrub_started; /* started since last boot */ ++ uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ ++ uint64_t spa_scan_pass_start; /* start time per pass/reboot */ ++ uint64_t spa_scan_pass_exam; /* examined bytes per pass */ ++ kmutex_t spa_async_lock; /* protect async state */ ++ kthread_t *spa_async_thread; /* thread doing async task */ ++ int spa_async_suspended; /* async tasks suspended */ ++ kcondvar_t spa_async_cv; /* wait for thread_exit() */ ++ uint16_t spa_async_tasks; /* async task mask */ ++ char *spa_root; /* alternate root directory */ ++ uint64_t spa_ena; /* spa-wide ereport ENA */ ++ int spa_last_open_failed; /* error if last open failed */ ++ uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ ++ uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ ++ uint64_t spa_load_txg; /* ub txg that loaded */ ++ uint64_t spa_load_txg_ts; /* timestamp from that ub */ ++ uint64_t spa_load_meta_errors; /* verify metadata err count */ ++ uint64_t spa_load_data_errors; /* verify data err count */ ++ uint64_t spa_verify_min_txg; /* start txg of verify scrub */ ++ kmutex_t spa_errlog_lock; /* error log lock */ ++ uint64_t spa_errlog_last; /* last error log object */ ++ uint64_t spa_errlog_scrub; /* scrub error log object */ ++ kmutex_t spa_errlist_lock; /* error list/ereport lock */ ++ avl_tree_t spa_errlist_last; /* last error list */ ++ avl_tree_t spa_errlist_scrub; /* scrub error list */ ++ uint64_t spa_deflate; /* should we deflate? */ ++ uint64_t spa_history; /* history object */ ++ kmutex_t spa_history_lock; /* history lock */ ++ vdev_t *spa_pending_vdev; /* pending vdev additions */ ++ kmutex_t spa_props_lock; /* property lock */ ++ uint64_t spa_pool_props_object; /* object for properties */ ++ uint64_t spa_bootfs; /* default boot filesystem */ ++ uint64_t spa_failmode; /* failure mode for the pool */ ++ uint64_t spa_delegation; /* delegation on/off */ ++ list_t spa_config_list; /* previous cache file(s) */ ++ zio_t *spa_async_zio_root; /* root of all async I/O */ ++ zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ ++ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ ++ kcondvar_t spa_suspend_cv; /* notification of resume */ ++ uint8_t spa_suspended; /* pool is suspended */ ++ uint8_t spa_claiming; /* pool is doing zil_claim() */ ++ boolean_t spa_debug; /* debug enabled? 
*/ ++ boolean_t spa_is_root; /* pool is root */ ++ int spa_minref; /* num refs when first opened */ ++ int spa_mode; /* FREAD | FWRITE */ ++ spa_log_state_t spa_log_state; /* log state */ ++ uint64_t spa_autoexpand; /* lun expansion on/off */ ++ ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ ++ uint64_t spa_ddt_stat_object; /* DDT statistics */ ++ uint64_t spa_dedup_ditto; /* dedup ditto threshold */ ++ uint64_t spa_dedup_checksum; /* default dedup checksum */ ++ uint64_t spa_dspace; /* dspace in normal class */ ++ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ ++ kmutex_t spa_proc_lock; /* protects spa_proc* */ ++ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ ++ spa_proc_state_t spa_proc_state; /* see definition */ ++ proc_t *spa_proc; /* "zpool-poolname" process */ ++ uint64_t spa_did; /* if procp != p0, did of t1 */ ++ boolean_t spa_autoreplace; /* autoreplace set in open */ ++ int spa_vdev_locks; /* locks grabbed */ ++ uint64_t spa_creation_version; /* version at pool creation */ ++ uint64_t spa_prev_software_version; ++ /* ++ * spa_refcnt & spa_config_lock must be the last elements ++ * because refcount_t changes size based on compilation options. ++ * In order for the MDB module to function correctly, the other ++ * fields must remain in the same location. ++ */ ++ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ ++ refcount_t spa_refcount; /* number of opens */ ++}; ++ ++extern char *spa_config_path; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_SPA_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/txg.h linux-3.2.33-go/include/zfs/sys/txg.h +--- linux-3.2.33-go.orig/include/zfs/sys/txg.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/txg.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,140 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */ ++ ++#ifndef _SYS_TXG_H ++#define _SYS_TXG_H ++ ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */ ++#define TXG_SIZE 4 /* next power of 2 */ ++#define TXG_MASK (TXG_SIZE - 1) /* mask for size */ ++#define TXG_INITIAL TXG_SIZE /* initial txg */ ++#define TXG_IDX (txg & TXG_MASK) ++ ++/* Number of txgs worth of frees we defer adding to in-core spacemaps */ ++#define TXG_DEFER_SIZE 2 ++ ++#define TXG_WAIT 1ULL ++#define TXG_NOWAIT 2ULL ++ ++typedef struct tx_cpu tx_cpu_t; ++ ++typedef struct txg_handle { ++ tx_cpu_t *th_cpu; ++ uint64_t th_txg; ++} txg_handle_t; ++ ++typedef struct txg_node { ++ struct txg_node *tn_next[TXG_SIZE]; ++ uint8_t tn_member[TXG_SIZE]; ++} txg_node_t; ++ ++typedef struct txg_list { ++ kmutex_t tl_lock; ++ size_t tl_offset; ++ txg_node_t *tl_head[TXG_SIZE]; ++} txg_list_t; ++ ++struct dsl_pool; ++ ++extern void txg_init(struct dsl_pool *dp, uint64_t txg); ++extern void txg_fini(struct dsl_pool *dp); ++extern void txg_sync_start(struct dsl_pool *dp); ++extern void txg_sync_stop(struct dsl_pool *dp); ++extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); ++extern void txg_rele_to_quiesce(txg_handle_t *txghp); ++extern void txg_rele_to_sync(txg_handle_t *txghp); ++extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); ++ ++/* ++ * Delay the caller by the specified number of ticks or until ++ * the txg closes (whichever comes first). This is intended ++ * to be used to throttle writers when the system nears its ++ * capacity. ++ */ ++extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks); ++ ++/* ++ * Wait until the given transaction group has finished syncing. ++ * Try to make this happen as soon as possible (eg. kick off any ++ * necessary syncs immediately). If txg==0, wait for the currently open ++ * txg to finish syncing. ++ */ ++extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); ++ ++/* ++ * Wait until the given transaction group, or one after it, is ++ * the open transaction group. Try to make this happen as soon ++ * as possible (eg. kick off any necessary syncs immediately). ++ * If txg == 0, wait for the next open txg. ++ */ ++extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg); ++ ++/* ++ * Returns TRUE if we are "backed up" waiting for the syncing ++ * transaction to complete; otherwise returns FALSE. ++ */ ++extern boolean_t txg_stalled(struct dsl_pool *dp); ++ ++/* returns TRUE if someone is waiting for the next txg to sync */ ++extern boolean_t txg_sync_waiting(struct dsl_pool *dp); ++ ++/* ++ * Wait for pending commit callbacks of already-synced transactions to finish ++ * processing. ++ */ ++extern void txg_wait_callbacks(struct dsl_pool *dp); ++ ++/* ++ * Per-txg object lists. 
++ */ ++ ++#define TXG_CLEAN(txg) ((txg) - 1) ++ ++extern void txg_list_create(txg_list_t *tl, size_t offset); ++extern void txg_list_destroy(txg_list_t *tl); ++extern int txg_list_empty(txg_list_t *tl, uint64_t txg); ++extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg); ++extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); ++extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); ++extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); ++extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg); ++extern void *txg_list_head(txg_list_t *tl, uint64_t txg); ++extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg); ++ ++/* Global tuning */ ++extern int zfs_txg_timeout; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_TXG_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/txg_impl.h linux-3.2.33-go/include/zfs/sys/txg_impl.h +--- linux-3.2.33-go.orig/include/zfs/sys/txg_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/txg_impl.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,75 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */
++
++#ifndef _SYS_TXG_IMPL_H
++#define _SYS_TXG_IMPL_H
++
++#include <sys/spa.h>
++#include <sys/txg.h>
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++struct tx_cpu {
++ kmutex_t tc_lock;
++ kcondvar_t tc_cv[TXG_SIZE];
++ uint64_t tc_count[TXG_SIZE];
++ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
++ char tc_pad[16];
++};
++
++typedef struct tx_state {
++ tx_cpu_t *tx_cpu; /* protects right to enter txg */
++ kmutex_t tx_sync_lock; /* protects tx_state_t */
++ uint64_t tx_open_txg; /* currently open txg id */
++ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
++ uint64_t tx_syncing_txg; /* currently syncing txg id */
++ uint64_t tx_synced_txg; /* last synced txg id */
++
++ uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */
++ uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */
++
++ kcondvar_t tx_sync_more_cv;
++ kcondvar_t tx_sync_done_cv;
++ kcondvar_t tx_quiesce_more_cv;
++ kcondvar_t tx_quiesce_done_cv;
++ kcondvar_t tx_timeout_cv;
++ kcondvar_t tx_exit_cv; /* wait for all threads to exit */
++
++ uint8_t tx_threads; /* number of threads */
++ uint8_t tx_exiting; /* set when we're exiting */
++
++ kthread_t *tx_sync_thread;
++ kthread_t *tx_quiesce_thread;
++
++ taskq_t *tx_commit_cb_taskq; /* commit callback taskq */
++} tx_state_t;
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif /* _SYS_TXG_IMPL_H */
+diff -uNr linux-3.2.33-go.orig/include/zfs/sys/u8_textprep_data.h linux-3.2.33-go/include/zfs/sys/u8_textprep_data.h
+--- linux-3.2.33-go.orig/include/zfs/sys/u8_textprep_data.h 1970-01-01 01:00:00.000000000 +0100
++++ linux-3.2.33-go/include/zfs/sys/u8_textprep_data.h 2012-11-16 23:25:34.341039426 +0100
+@@ -0,0 +1,35376 @@
++/*
++ * CDDL HEADER START
++ *
++ * The contents of this file are subject to the terms of the
++ * Common Development and Distribution License (the "License").
++ * You may not use this file except in compliance with the License.
++ *
++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
++ * or http://www.opensolaris.org/os/licensing.
++ * See the License for the specific language governing permissions
++ * and limitations under the License.
++ *
++ * When distributing Covered Code, include this CDDL HEADER in each
++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
++ * If applicable, add the following below this CDDL HEADER, with the
++ * fields enclosed by brackets "[]" replaced with your own identifying
++ * information: Portions Copyright [yyyy] [name of copyright owner]
++ *
++ * CDDL HEADER END
++ */
++/*
++ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
++ * Use is subject to license terms.
++ */
++/*
++ * COPYRIGHT AND PERMISSION NOTICE
++ *
++ * Copyright (c) 1991-2006 Unicode, Inc. All rights reserved. Distributed under
++ * the Terms of Use in http://www.unicode.org/copyright.html.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining
++ * a copy of the Unicode data files and any associated documentation (the
++ * "Data Files") or Unicode software and any associated documentation (the
++ * "Software") to deal in the Data Files or Software without restriction,
++ * including without limitation the rights to use, copy, modify, merge,
++ * publish, distribute, and/or sell copies of the Data Files or Software, and
++ * to permit persons to whom the Data Files or Software are furnished to do so,
++ * provided that (a) the above copyright notice(s) and this permission notice
++ * appear with all copies of the Data Files or Software, (b) both the above
++ * copyright notice(s) and this permission notice appear in associated
++ * documentation, and (c) there is clear notice in each modified Data File or
++ * in the Software as well as in the documentation associated with the Data
++ * File(s) or Software that the data or software has been modified.
++ *
++ * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
++ * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
++ * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
++ * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
++ * CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
++ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
++ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
++ * OF THE DATA FILES OR SOFTWARE.
++ *
++ * Except as contained in this notice, the name of a copyright holder shall not
++ * be used in advertising or otherwise to promote the sale, use or other
++ * dealings in these Data Files or Software without prior written authorization
++ * of the copyright holder.
++ *
++ * Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be
++ * registered in some jurisdictions. All other trademarks and registered
++ * trademarks mentioned herein are the property of their respective owners.
++ */
++/*
++ * This file has been modified by Sun Microsystems, Inc.
++ */
++
++#ifndef _SYS_U8_TEXTPREP_DATA_H
++#define _SYS_U8_TEXTPREP_DATA_H
++
++
++
++#include <sys/types.h>
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++/*
++ * To get to the combining class data, composition mappings, decomposition
++ * mappings, and case conversion mappings of Unicode, the data structures
++ * formulated and their meanings are like the following:
++ *
++ * Each UTF-8 character is seen as a 4-byte entity so that U+0061 (or 0x61 in
++ * UTF-8) would be seen as 0x00 0x00 0x00 0x61. Similarly, U+1D15E would be
++ * 0xF0 0x9D 0x85 0x9E in UTF-8.
++ *
++ * The first byte (MSB) value is an index to the b1_tbl, such as
++ * u8_common_b1_tbl and u8_composition_b1_tbl tables. A b1_tbl has
++ * indices to b2_tbl tables that have indices to b3_tbl. Each b3_tbl has
++ * either indices to b4_tbl or indices to b4_tbl and base values for
++ * displacement calculations later by using the u8_displacement_t type at
++ * below. Each b4_tbl table then has indices to the final tables.
++ * ++ * As an example, if we have a character with code value of U+1D15E which is ++ * 0xF0 0x9D 0x85 0x9E in UTF-8, the target decomposition character bytes ++ * that will be mapped by the mapping procedure would be the ones between ++ * the start_index and the end_index computed as like the following: ++ * ++ * b2_tbl_id = u8_common_b1_tbl[0][0xF0]; ++ * b3_tbl_id = u8_decomp_b2_tbl[0][b2_tbl_id][0x9D]; ++ * b4_tbl_id = u8_decomp_b3_tbl[0][b3_tbl_id][0x85].tbl_id; ++ * b4_base = u8_decomp_b3_tbl[0][b3_tbl_id][0x85].base; ++ * if (b4_tbl_id >= 0x8000) { ++ * b4_tbl_id -= 0x8000; ++ * start_index = u8_decomp_b4_16bit_tbl[0][b4_tbl_id][0x9E]; ++ * end_index = u8_decomp_b4_16bit_tbl[0][b4_tbl_id][0x9E + 1]; ++ * } else { ++ * start_index = u8_decomp_b4_tbl[0][b4_tbl_id][0x9E]; ++ * end_index = u8_decomp_b4_tbl[0][b4_tbl_id][0x9E + 1]; ++ * } ++ * ++ * The start_index and the end_index can be used to retrieve the bytes ++ * possibly of multiple UTF-8 characters from the final tables. ++ * ++ * The "[0]" at the above indicates this is for Unicode Version 3.2.0 data ++ * as of today. Consequently, the "[1]" indicates another Unicode version ++ * data and it is Unicode 5.0.0 as of today. ++ * ++ * The mapping procedures and the data structures are more or less similar or ++ * alike among different mappings. You might want to read the u8_textprep.c ++ * for specific details. ++ * ++ * The tool programs created and used to generate the tables in this file are ++ * saved at PSARC/2007/149/materials/ as tools.tar.gz file. ++ */ ++ ++/* The following is a component type for the b4_tbl vectors. */ ++typedef struct { ++ uint16_t tbl_id; ++ uint16_t base; ++} u8_displacement_t; ++ ++/* ++ * The U8_TBL_ELEMENT_NOT_DEF macro indicates a byte that is not defined or ++ * used. The U8_TBL_ELEMENT_FILLER indicates the end of a UTF-8 character at ++ * the final tables. ++ */ ++#define U8_TBL_ELEMENT_NOT_DEF (0xff) ++#define N_ U8_TBL_ELEMENT_NOT_DEF ++ ++#define U8_TBL_ELEMENT_FILLER (0xf7) ++#define FIL_ U8_TBL_ELEMENT_FILLER ++ ++/* ++ * The common b1_tbl for combining class, decompositions, tolower, and ++ * toupper case conversion mappings. 
++ */ ++static const uchar_t u8_common_b1_tbl[2][256] = { ++ { ++ 0, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 1, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { ++ 0, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 1, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++}; ++ ++static const uchar_t u8_combining_class_b2_tbl[2][2][256] = { ++ { ++ { ++ 0, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 1, 2, 3, 4, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, 5, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { ++ N_, N_, N_, N_, N_, N_, N_, 
N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, 6, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ ++ }, ++ { ++ { ++ 0, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 1, 2, 3, 4, N_, N_, N_, N_, ++ N_, N_, 5, N_, N_, N_, N_, 6, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 7, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, 8, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ ++ }, ++ ++}; ++ ++static const uchar_t u8_combining_class_b3_tbl[2][9][256] = { ++ { ++ { /* Third byte table 0. 
*/ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, 0, 1, N_, N_, ++ N_, N_, 2, N_, N_, N_, 3, 4, ++ N_, 5, N_, 6, 7, 8, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 1. */ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, 9, 10, 11, 12, ++ 13, 14, 15, 16, 17, 18, N_, 19, ++ N_, 20, N_, 21, N_, 22, N_, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 2. */ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 32, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, 33, N_, N_, 34, ++ N_, N_, 35, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 3. 
*/ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, 36, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 4. */ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 37, N_, 38, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 5. */ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, 39, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 40, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 6. 
*/ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, 41, 42, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 7. */ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 8. */ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ }, ++ { ++ { /* Third byte table 0. 
*/ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, 0, 1, N_, N_, ++ N_, N_, 2, N_, N_, N_, 3, 4, ++ 5, 6, N_, 7, 8, 9, N_, 10, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 1. */ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, 11, 12, 13, 14, ++ 15, 16, 17, 18, 19, 20, N_, 21, ++ N_, 22, 23, 24, N_, 25, N_, 26, ++ 27, 28, 29, 30, 31, 32, 33, 34, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 2. */ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 35, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, 36, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, 37, N_, N_, 38, ++ N_, N_, 39, N_, 40, N_, N_, N_, ++ 41, N_, N_, N_, 42, 43, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, 44, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 3. 
*/ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, 45, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 4. */ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 46, N_, 47, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 5. */ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 48, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 6. 
*/ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, 49, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 50, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 7. */ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 51, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { /* Third byte table 8. 
*/ ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, 52, 53, N_, ++ N_, 54, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ }, ++}; ++ ++/* ++ * Unlike other b4_tbl, the b4_tbl for combining class data has ++ * the combining class values not indices to the final tables. ++ */ ++static const uchar_t u8_combining_class_b4_tbl[2][55][256] = { ++ { ++ { /* Fourth byte table 0. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 230, 230, 230, 230, 230, 230, 230, ++ 230, 230, 230, 230, 230, 230, 230, 230, ++ 230, 230, 230, 230, 230, 232, 220, 220, ++ 220, 220, 232, 216, 220, 220, 220, 220, ++ 220, 202, 202, 220, 220, 220, 220, 202, ++ 202, 220, 220, 220, 220, 220, 220, 220, ++ 220, 220, 220, 220, 1, 1, 1, 1, ++ 1, 220, 220, 220, 220, 230, 230, 230, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 1. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 230, 230, 230, 230, 240, 230, 220, ++ 220, 220, 230, 230, 230, 220, 220, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 234, 234, 233, 230, 230, 230, 230, 230, ++ 230, 230, 230, 230, 230, 230, 230, 230, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 2. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 230, 230, 230, 230, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 3. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 220, 230, 230, 230, 230, 220, 230, ++ 230, 230, 222, 220, 230, 230, 230, 230, ++ 230, 230, 0, 220, 220, 220, 220, 220, ++ 230, 230, 220, 230, 230, 222, 228, 230, ++ 10, 11, 12, 13, 14, 15, 16, 17, ++ 18, 19, 0, 20, 21, 22, 0, 23, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 4. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 24, 25, 0, 230, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 5. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 27, 28, 29, 30, 31, ++ 32, 33, 34, 230, 230, 220, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 35, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 6. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 230, 230, ++ 230, 230, 230, 230, 230, 0, 0, 230, ++ 230, 230, 230, 220, 230, 0, 0, 230, ++ 230, 0, 220, 230, 230, 220, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 7. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 36, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 220, 230, 230, 220, 230, 230, 220, ++ 220, 220, 230, 220, 220, 230, 220, 230, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 8. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 230, 220, 230, 220, 230, 220, 230, ++ 220, 230, 230, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 9. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 10. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 230, 220, 230, 230, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 11. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 12. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 13. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 14. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 15. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 16. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 17. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 18. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 19. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 20. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 84, 91, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 21. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 22. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 23. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 9, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 24. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 103, 103, 9, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 25. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 107, 107, 107, 107, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 26. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 118, 118, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 27. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 122, 122, 122, 122, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 28. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 220, 220, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 220, 0, 220, ++ 0, 216, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 29. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 129, 130, 0, 132, 0, 0, 0, ++ 0, 0, 130, 130, 130, 130, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 30. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 130, 0, 230, 230, 9, 0, 230, 230, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 31. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 220, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 32. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 7, ++ 0, 9, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 33. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 9, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 9, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 34. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 9, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 35. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 228, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 36. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 230, 1, 1, 230, 230, 230, 230, ++ 1, 1, 1, 230, 230, 0, 0, 0, ++ 0, 230, 0, 0, 0, 1, 1, 230, ++ 220, 230, 1, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 37. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 218, 228, 232, 222, 224, 224, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 38. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 8, 8, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 39. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 26, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 40. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 230, 230, 230, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 41. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 216, 216, 1, ++ 1, 1, 0, 0, 0, 226, 216, 216, ++ 216, 216, 216, 0, 0, 0, 0, 0, ++ 0, 0, 0, 220, 220, 220, 220, 220, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 42. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 220, 220, 220, 0, 0, 230, 230, 230, ++ 230, 230, 220, 220, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 230, 230, 230, 230, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 43. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 44. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 45. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 46. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 47. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 48. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 49. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 50. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 51. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 52. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 53. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 54. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ }, ++ { ++ { /* Fourth byte table 0. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 230, 230, 230, 230, 230, 230, 230, ++ 230, 230, 230, 230, 230, 230, 230, 230, ++ 230, 230, 230, 230, 230, 232, 220, 220, ++ 220, 220, 232, 216, 220, 220, 220, 220, ++ 220, 202, 202, 220, 220, 220, 220, 202, ++ 202, 220, 220, 220, 220, 220, 220, 220, ++ 220, 220, 220, 220, 1, 1, 1, 1, ++ 1, 220, 220, 220, 220, 230, 230, 230, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 1. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 230, 230, 230, 230, 240, 230, 220, ++ 220, 220, 230, 230, 230, 220, 220, 0, ++ 230, 230, 230, 220, 220, 220, 220, 230, ++ 232, 220, 220, 230, 233, 234, 234, 233, ++ 234, 234, 233, 230, 230, 230, 230, 230, ++ 230, 230, 230, 230, 230, 230, 230, 230, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 2. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 230, 230, 230, 230, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 3. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 220, 230, 230, 230, 230, 220, 230, ++ 230, 230, 222, 220, 230, 230, 230, 230, ++ 230, 230, 220, 220, 220, 220, 220, 220, ++ 230, 230, 220, 230, 230, 222, 228, 230, ++ 10, 11, 12, 13, 14, 15, 16, 17, ++ 18, 19, 19, 20, 21, 22, 0, 23, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 4. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 24, 25, 0, 230, 220, 0, 18, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 5. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 230, 230, 230, 230, 230, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 6. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 27, 28, 29, 30, 31, ++ 32, 33, 34, 230, 230, 220, 220, 230, ++ 230, 230, 230, 230, 220, 230, 230, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 35, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 7. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 230, 230, ++ 230, 230, 230, 230, 230, 0, 0, 230, ++ 230, 230, 230, 220, 230, 0, 0, 230, ++ 230, 0, 220, 230, 230, 220, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 8. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 36, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 220, 230, 230, 220, 230, 230, 220, ++ 220, 220, 230, 220, 220, 230, 220, 230, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 9. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 230, 220, 230, 220, 230, 220, 230, ++ 220, 230, 230, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 10. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 230, 230, 230, 230, 230, ++ 230, 230, 220, 230, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 11. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 12. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 230, 220, 230, 230, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 13. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 14. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 15. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 16. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 17. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 18. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 19. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 20. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 21. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 22. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 84, 91, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 23. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 24. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 25. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 9, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 26. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 9, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 27. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 103, 103, 9, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 28. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 107, 107, 107, 107, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 29. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 118, 118, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 30. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 122, 122, 122, 122, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 31. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 220, 220, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 220, 0, 220, ++ 0, 216, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 32. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 129, 130, 0, 132, 0, 0, 0, ++ 0, 0, 130, 130, 130, 130, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 33. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 130, 0, 230, 230, 9, 0, 230, 230, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 34. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 220, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 35. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 7, ++ 0, 9, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 36. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 230, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 37. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 9, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 9, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 38. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 9, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 230, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 39. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 228, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 40. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 222, 230, 220, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 41. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 230, ++ 220, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 42. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 43. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 9, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 230, 220, 230, 230, 230, ++ 230, 230, 230, 230, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 44. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 230, 220, 230, 230, 230, 230, 230, ++ 230, 230, 220, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 230, 220, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 45. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 230, 1, 1, 230, 230, 230, 230, ++ 1, 1, 1, 230, 230, 0, 0, 0, ++ 0, 230, 0, 0, 0, 1, 1, 230, ++ 220, 230, 1, 1, 220, 220, 220, 220, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 46. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 218, 228, 232, 222, 224, 224, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 47. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 8, 8, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 48. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 9, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 49. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 26, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 50. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 230, 230, 230, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 51. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 220, 0, 230, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 230, 1, 220, 0, 0, 0, 0, 9, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 52. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 216, 216, 1, ++ 1, 1, 0, 0, 0, 226, 216, 216, ++ 216, 216, 216, 0, 0, 0, 0, 0, ++ 0, 0, 0, 220, 220, 220, 220, 220, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 53. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 220, 220, 220, 0, 0, 230, 230, 230, ++ 230, 230, 220, 220, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 230, 230, 230, 230, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { /* Fourth byte table 54. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 230, 230, 230, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ }, ++ }, ++}; ++ ++static const uchar_t u8_composition_b1_tbl[2][256] = { ++ { ++ 0, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { ++ 0, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++}; ++ ++static const uchar_t u8_composition_b2_tbl[2][1][256] = { ++ { ++ { ++ 0, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, 
N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 1, 2, 3, 4, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ ++ }, ++ { ++ { ++ 0, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 1, 2, 3, 4, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ ++ }, ++ ++}; ++ ++static const u8_displacement_t u8_composition_b3_tbl[2][5][256] = { ++ { ++ { /* Third byte table 0. 
*/ ++ { 0x8000, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 0, 2470 }, ++ { 0x8001, 2491 }, { 1, 2871 }, { 2, 2959 }, ++ { 3, 3061 }, { 4, 3212 }, { 5, 3226 }, ++ { N_, 0 }, { 6, 3270 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 0x8002, 3277 }, ++ { 7, 3774 }, { 8, 3949 }, { 9, 4198 }, ++ { N_, 0 }, { 10, 4265 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 11, 4293 }, { 12, 4312 }, { N_, 0 }, ++ { 13, 4326 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 1. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 14, 4347 }, ++ { N_, 0 }, { N_, 0 }, { 15, 4374 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 16, 4391 }, ++ { 17, 4416 }, { 18, 4425 }, { N_, 0 }, ++ { 19, 4451 }, { 20, 4460 }, { 21, 4469 }, ++ { N_, 0 }, { 22, 4503 }, { N_, 0 }, ++ { 23, 4529 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 2. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 24, 4563 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 25, 4572 }, { 26, 4588 }, ++ { 27, 4620 }, { 28, 4666 }, { 0x8003, 4682 }, ++ { 0x8004, 5254 }, { 29, 5616 }, { 30, 5646 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 3. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 31, 5684 }, ++ { 32, 5708 }, { 33, 5732 }, { 34, 5780 }, ++ { 35, 5900 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 4. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 36, 6012 }, { 37, 6241 }, { 38, 6358 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ }, ++ { ++ { /* Third byte table 0. 
*/ ++ { 0x8000, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 0, 2470 }, ++ { 0x8001, 2491 }, { 1, 2871 }, { 2, 2959 }, ++ { 3, 3061 }, { 4, 3212 }, { 5, 3226 }, ++ { N_, 0 }, { 6, 3270 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 0x8002, 3277 }, ++ { 7, 3774 }, { 8, 3949 }, { 9, 4198 }, ++ { N_, 0 }, { 10, 4265 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 11, 4293 }, { 12, 4312 }, { N_, 0 }, ++ { 13, 4326 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 1. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 14, 4347 }, ++ { N_, 0 }, { N_, 0 }, { 15, 4374 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 16, 4391 }, ++ { 17, 4416 }, { 18, 4425 }, { N_, 0 }, ++ { 19, 4451 }, { 20, 4460 }, { 21, 4469 }, ++ { N_, 0 }, { 22, 4503 }, { N_, 0 }, ++ { 23, 4529 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 2. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 24, 4563 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 25, 4572 }, { 26, 4662 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 27, 4671 }, { 28, 4687 }, ++ { 29, 4719 }, { 30, 4765 }, { 0x8003, 4781 }, ++ { 0x8004, 5353 }, { 31, 5715 }, { 32, 5745 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 3. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 33, 5783 }, ++ { 34, 5807 }, { 35, 5831 }, { 36, 5879 }, ++ { 37, 5999 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 4. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 38, 6111 }, { 39, 6340 }, { 40, 6457 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ }, ++}; ++ ++static const uchar_t u8_composition_b4_tbl[2][41][257] = { ++ { ++ { /* Fourth byte table 0. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 1. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 29, 58, 58, 58, 58, ++ 58, 58, 58, 58, 58, 58, 58, 58, ++ 58, 58, 58, 73, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, ++ }, ++ { /* Fourth byte table 2. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 15, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 38, 46, 46, 46, 46, ++ 46, 54, 62, 62, 62, 62, 62, 62, ++ 62, 70, 78, 86, 94, 94, 94, 94, ++ 94, 94, 94, 94, 94, 94, 94, 94, ++ 94, 94, 94, 94, 94, 94, 94, 94, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, ++ }, ++ { /* Fourth byte table 3. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 36, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 108, 144, 144, 144, 144, 144, 144, 144, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, ++ }, ++ { /* Fourth byte table 4. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 7, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, ++ }, ++ { /* Fourth byte table 5. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 7, ++ 14, 22, 30, 30, 30, 30, 30, 37, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, ++ }, ++ { /* Fourth byte table 6. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, ++ }, ++ { /* Fourth byte table 7. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 15, 15, 15, 15, 70, 70, ++ 70, 70, 112, 133, 154, 154, 154, 162, ++ 162, 162, 162, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, ++ }, ++ { /* Fourth byte table 8. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 20, 20, 20, 27, 27, 46, 59, ++ 66, 91, 91, 98, 98, 98, 98, 105, ++ 105, 105, 105, 105, 130, 130, 130, 130, ++ 137, 137, 137, 137, 144, 144, 151, 151, ++ 151, 164, 164, 164, 171, 171, 190, 203, ++ 210, 235, 235, 242, 242, 242, 242, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, ++ }, ++ { /* Fourth byte table 9. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 25, 25, 25, 25, ++ 32, 32, 32, 32, 39, 39, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 53, ++ 53, 53, 53, 53, 53, 53, 53, 53, ++ 53, 53, 53, 53, 53, 53, 53, 53, ++ 53, 53, 53, 53, 53, 53, 53, 53, ++ 53, 53, 53, 53, 53, 60, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, ++ }, ++ { /* Fourth byte table 10. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 21, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, ++ }, ++ { /* Fourth byte table 11. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, ++ }, ++ { /* Fourth byte table 12. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 7, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, ++ }, ++ { /* Fourth byte table 13. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 14, 14, 14, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 14. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 9, 9, 9, 9, 9, 9, 9, ++ 9, 18, 18, 18, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, ++ }, ++ { /* Fourth byte table 15. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, ++ }, ++ { /* Fourth byte table 16. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, ++ }, ++ { /* Fourth byte table 17. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, ++ }, ++ { /* Fourth byte table 18. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 17, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, ++ }, ++ { /* Fourth byte table 19. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, ++ }, ++ { /* Fourth byte table 20. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, ++ }, ++ { /* Fourth byte table 21. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 25, ++ 25, 25, 25, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, ++ }, ++ { /* Fourth byte table 22. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 17, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, ++ }, ++ { /* Fourth byte table 23. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 25, 25, 25, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, ++ }, ++ { /* Fourth byte table 24. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, ++ }, ++ { /* Fourth byte table 25. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 8, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, ++ }, ++ { /* Fourth byte table 26. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 8, 16, 16, 16, 16, ++ 16, 16, 16, 24, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, ++ }, ++ { /* Fourth byte table 27. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 15, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 38, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, ++ }, ++ { /* Fourth byte table 28. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 8, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, ++ }, ++ { /* Fourth byte table 29. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, ++ }, ++ { /* Fourth byte table 30. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 16, ++ 16, 16, 16, 16, 16, 16, 16, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, ++ }, ++ { /* Fourth byte table 31. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 8, 8, 16, 16, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, ++ }, ++ { /* Fourth byte table 32. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 8, 8, 16, 16, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, ++ }, ++ { /* Fourth byte table 33. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 8, 8, 8, 8, ++ 8, 16, 16, 16, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 32, 32, 40, 40, ++ 40, 40, 40, 40, 40, 40, 40, 40, ++ 40, 40, 40, 40, 40, 40, 40, 40, ++ 40, 40, 40, 40, 40, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, ++ }, ++ { /* Fourth byte table 34. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 8, 8, 16, 16, ++ 16, 24, 24, 24, 24, 24, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 40, 40, 40, 48, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 64, 72, 72, 72, 80, ++ 88, 88, 88, 96, 104, 112, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, ++ }, ++ { /* Fourth byte table 35. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 8, 16, 16, 16, 24, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 40, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 56, 56, 56, 56, 56, ++ 56, 64, 72, 72, 80, 80, 80, 80, ++ 80, 80, 80, 88, 96, 104, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, ++ }, ++ { /* Fourth byte table 36. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 9, ++ 9, 9, 9, 9, 18, 18, 27, 27, ++ 36, 36, 45, 45, 54, 54, 63, 63, ++ 72, 72, 81, 81, 90, 90, 99, 99, ++ 108, 108, 117, 117, 117, 126, 126, 135, ++ 135, 144, 144, 144, 144, 144, 144, 144, ++ 161, 161, 161, 178, 178, 178, 195, 195, ++ 195, 212, 212, 212, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, ++ }, ++ { /* Fourth byte table 37. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 18, ++ 18, 18, 18, 18, 27, 27, 36, 36, ++ 45, 45, 54, 54, 63, 63, 72, 72, ++ 81, 81, 90, 90, 99, 99, 108, 108, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, ++ }, ++ { /* Fourth byte table 38. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 9, 9, 9, 18, 18, 27, ++ 27, 36, 36, 36, 36, 36, 36, 36, ++ 53, 53, 53, 70, 70, 70, 87, 87, ++ 87, 104, 104, 104, 121, 121, 121, 121, ++ 121, 121, 121, 121, 121, 121, 121, 121, ++ 121, 121, 121, 121, 121, 121, 121, 121, ++ 130, 139, 148, 157, 157, 157, 157, 157, ++ 157, 157, 157, 157, 157, 157, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, ++ }, ++ { /* Fourth byte table 39. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 40. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ }, ++ { ++ { /* Fourth byte table 0. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 1. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 29, 58, 58, 58, 58, ++ 58, 58, 58, 58, 58, 58, 58, 58, ++ 58, 58, 58, 73, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, ++ }, ++ { /* Fourth byte table 2. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 15, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 38, 46, 46, 46, 46, ++ 46, 54, 62, 62, 62, 62, 62, 62, ++ 62, 70, 78, 86, 94, 94, 94, 94, ++ 94, 94, 94, 94, 94, 94, 94, 94, ++ 94, 94, 94, 94, 94, 94, 94, 94, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, 102, 102, 102, 102, 102, 102, 102, ++ 102, ++ }, ++ { /* Fourth byte table 3. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 36, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 108, 144, 144, 144, 144, 144, 144, 144, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, 151, 151, 151, 151, 151, 151, 151, ++ 151, ++ }, ++ { /* Fourth byte table 4. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 7, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, ++ }, ++ { /* Fourth byte table 5. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 7, ++ 14, 22, 30, 30, 30, 30, 30, 37, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, ++ }, ++ { /* Fourth byte table 6. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, ++ }, ++ { /* Fourth byte table 7. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 15, 15, 15, 15, 70, 70, ++ 70, 70, 112, 133, 154, 154, 154, 162, ++ 162, 162, 162, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, ++ }, ++ { /* Fourth byte table 8. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 20, 20, 20, 27, 27, 46, 59, ++ 66, 91, 91, 98, 98, 98, 98, 105, ++ 105, 105, 105, 105, 130, 130, 130, 130, ++ 137, 137, 137, 137, 144, 144, 151, 151, ++ 151, 164, 164, 164, 171, 171, 190, 203, ++ 210, 235, 235, 242, 242, 242, 242, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, 249, 249, 249, 249, 249, 249, 249, ++ 249, ++ }, ++ { /* Fourth byte table 9. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 25, 25, 25, 25, ++ 32, 32, 32, 32, 39, 39, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 53, ++ 53, 53, 53, 53, 53, 53, 53, 53, ++ 53, 53, 53, 53, 53, 53, 53, 53, ++ 53, 53, 53, 53, 53, 53, 53, 53, ++ 53, 53, 53, 53, 53, 60, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, ++ }, ++ { /* Fourth byte table 10. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 21, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, ++ }, ++ { /* Fourth byte table 11. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, ++ }, ++ { /* Fourth byte table 12. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 7, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, ++ }, ++ { /* Fourth byte table 13. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 14, 14, 14, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 14. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 9, 9, 9, 9, 9, 9, 9, ++ 9, 18, 18, 18, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, ++ }, ++ { /* Fourth byte table 15. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, ++ }, ++ { /* Fourth byte table 16. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, ++ }, ++ { /* Fourth byte table 17. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, ++ }, ++ { /* Fourth byte table 18. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 17, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, ++ }, ++ { /* Fourth byte table 19. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, ++ }, ++ { /* Fourth byte table 20. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, ++ }, ++ { /* Fourth byte table 21. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 25, ++ 25, 25, 25, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, ++ }, ++ { /* Fourth byte table 22. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 17, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, ++ }, ++ { /* Fourth byte table 23. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 25, 25, 25, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 34, 34, ++ 34, ++ }, ++ { /* Fourth byte table 24. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, ++ }, ++ { /* Fourth byte table 25. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 9, 9, ++ 18, 18, 27, 27, 36, 36, 45, 45, ++ 45, 45, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 63, 63, 72, 72, 81, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, ++ }, ++ { /* Fourth byte table 26. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, ++ }, ++ { /* Fourth byte table 27. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 8, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, ++ }, ++ { /* Fourth byte table 28. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 8, 16, 16, 16, 16, ++ 16, 16, 16, 24, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, ++ }, ++ { /* Fourth byte table 29. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 15, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 38, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, ++ }, ++ { /* Fourth byte table 30. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 8, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, ++ }, ++ { /* Fourth byte table 31. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, ++ }, ++ { /* Fourth byte table 32. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 16, ++ 16, 16, 16, 16, 16, 16, 16, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, ++ }, ++ { /* Fourth byte table 33. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 8, 8, 16, 16, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, ++ }, ++ { /* Fourth byte table 34. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 8, 8, 16, 16, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, ++ }, ++ { /* Fourth byte table 35. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 8, 8, 8, 8, ++ 8, 16, 16, 16, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 32, 32, 40, 40, ++ 40, 40, 40, 40, 40, 40, 40, 40, ++ 40, 40, 40, 40, 40, 40, 40, 40, ++ 40, 40, 40, 40, 40, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, ++ }, ++ { /* Fourth byte table 36. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 8, 8, 16, 16, ++ 16, 24, 24, 24, 24, 24, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 40, 40, 40, 48, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 64, 72, 72, 72, 80, ++ 88, 88, 88, 96, 104, 112, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, 120, 120, 120, 120, 120, 120, 120, ++ 120, ++ }, ++ { /* Fourth byte table 37. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 8, 16, 16, 16, 24, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 40, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 56, 56, 56, 56, 56, ++ 56, 64, 72, 72, 80, 80, 80, 80, ++ 80, 80, 80, 88, 96, 104, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, ++ }, ++ { /* Fourth byte table 38. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 9, ++ 9, 9, 9, 9, 18, 18, 27, 27, ++ 36, 36, 45, 45, 54, 54, 63, 63, ++ 72, 72, 81, 81, 90, 90, 99, 99, ++ 108, 108, 117, 117, 117, 126, 126, 135, ++ 135, 144, 144, 144, 144, 144, 144, 144, ++ 161, 161, 161, 178, 178, 178, 195, 195, ++ 195, 212, 212, 212, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, 229, 229, 229, 229, 229, 229, 229, ++ 229, ++ }, ++ { /* Fourth byte table 39. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 18, ++ 18, 18, 18, 18, 27, 27, 36, 36, ++ 45, 45, 54, 54, 63, 63, 72, 72, ++ 81, 81, 90, 90, 99, 99, 108, 108, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, 117, 117, 117, 117, 117, 117, 117, ++ 117, ++ }, ++ { /* Fourth byte table 40. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 9, 9, 9, 18, 18, 27, ++ 27, 36, 36, 36, 36, 36, 36, 36, ++ 53, 53, 53, 70, 70, 70, 87, 87, ++ 87, 104, 104, 104, 121, 121, 121, 121, ++ 121, 121, 121, 121, 121, 121, 121, 121, ++ 121, 121, 121, 121, 121, 121, 121, 121, ++ 130, 139, 148, 157, 157, 157, 157, 157, ++ 157, 157, 157, 157, 157, 157, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, 166, 166, 166, 166, 166, 166, 166, ++ 166, ++ }, ++ }, ++}; ++ ++static const uint16_t u8_composition_b4_16bit_tbl[2][5][257] = { ++ { ++ { /* Fourth byte 16-bit table 0. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 8, 16, 24, ++ 24, 24, 124, 146, 177, 219, 327, 335, ++ 379, 427, 521, 528, 562, 602, 624, 683, ++ 782, 797, 797, 849, 894, 941, 1061, 1076, ++ 1118, 1133, 1193, 1233, 1233, 1233, 1233, 1233, ++ 1233, 1233, 1333, 1355, 1386, 1428, 1536, 1544, ++ 1588, 1643, 1731, 1744, 1778, 1818, 1840, 1899, ++ 1998, 2013, 2013, 2065, 2110, 2164, 2284, 2299, ++ 2348, 2363, 2430, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, ++ }, ++ { /* Fourth byte 16-bit table 1. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 29, 29, 36, 43, 56, ++ 64, 64, 64, 93, 93, 93, 93, 93, ++ 101, 101, 101, 101, 101, 130, 151, 158, ++ 158, 165, 165, 165, 165, 190, 190, 190, ++ 190, 190, 190, 219, 219, 226, 233, 246, ++ 254, 254, 254, 283, 283, 283, 283, 283, ++ 291, 291, 291, 291, 291, 320, 341, 348, ++ 348, 355, 355, 355, 355, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, ++ }, ++ { /* Fourth byte 16-bit table 2. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 49, 49, 49, 49, 77, 77, ++ 112, 112, 160, 160, 160, 160, 160, 160, ++ 188, 188, 196, 196, 196, 196, 237, 237, ++ 237, 237, 272, 272, 272, 280, 280, 288, ++ 288, 288, 344, 344, 344, 344, 372, 372, ++ 414, 414, 469, 469, 469, 469, 469, 469, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, ++ }, ++ { /* Fourth byte 16-bit table 3. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 29, 58, 66, 74, 82, 90, 98, ++ 106, 135, 164, 172, 180, 188, 196, 204, ++ 212, 227, 242, 242, 242, 242, 242, 242, ++ 242, 257, 272, 272, 272, 272, 272, 272, ++ 272, 301, 330, 338, 346, 354, 362, 370, ++ 378, 407, 436, 444, 452, 460, 468, 476, ++ 484, 506, 528, 528, 528, 528, 528, 528, ++ 528, 550, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, ++ }, ++ { /* Fourth byte 16-bit table 4. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 15, 30, 30, 30, 30, 30, 30, ++ 30, 45, 60, 60, 60, 60, 60, 60, ++ 60, 82, 104, 104, 104, 104, 104, 104, ++ 104, 104, 126, 126, 126, 126, 126, 126, ++ 126, 155, 184, 192, 200, 208, 216, 224, ++ 232, 261, 290, 298, 306, 314, 322, 330, ++ 338, 346, 346, 346, 346, 354, 354, 354, ++ 354, 354, 354, 354, 354, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, ++ }, ++ }, ++ { ++ { /* Fourth byte 16-bit table 0. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 8, 16, 24, ++ 24, 24, 124, 146, 177, 219, 327, 335, ++ 379, 427, 521, 528, 562, 602, 624, 683, ++ 782, 797, 797, 849, 894, 941, 1061, 1076, ++ 1118, 1133, 1193, 1233, 1233, 1233, 1233, 1233, ++ 1233, 1233, 1333, 1355, 1386, 1428, 1536, 1544, ++ 1588, 1643, 1731, 1744, 1778, 1818, 1840, 1899, ++ 1998, 2013, 2013, 2065, 2110, 2164, 2284, 2299, ++ 2348, 2363, 2430, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, ++ 2470, ++ }, ++ { /* Fourth byte 16-bit table 1. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 29, 29, 36, 43, 56, ++ 64, 64, 64, 93, 93, 93, 93, 93, ++ 101, 101, 101, 101, 101, 130, 151, 158, ++ 158, 165, 165, 165, 165, 190, 190, 190, ++ 190, 190, 190, 219, 219, 226, 233, 246, ++ 254, 254, 254, 283, 283, 283, 283, 283, ++ 291, 291, 291, 291, 291, 320, 341, 348, ++ 348, 355, 355, 355, 355, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, 380, 380, 380, 380, 380, 380, 380, ++ 380, ++ }, ++ { /* Fourth byte 16-bit table 2. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 49, 49, 49, 49, 77, 77, ++ 112, 112, 160, 160, 160, 160, 160, 160, ++ 188, 188, 196, 196, 196, 196, 237, 237, ++ 237, 237, 272, 272, 272, 280, 280, 288, ++ 288, 288, 344, 344, 344, 344, 372, 372, ++ 414, 414, 469, 469, 469, 469, 469, 469, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, 497, 497, 497, 497, 497, 497, 497, ++ 497, ++ }, ++ { /* Fourth byte 16-bit table 3. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 29, 58, 66, 74, 82, 90, 98, ++ 106, 135, 164, 172, 180, 188, 196, 204, ++ 212, 227, 242, 242, 242, 242, 242, 242, ++ 242, 257, 272, 272, 272, 272, 272, 272, ++ 272, 301, 330, 338, 346, 354, 362, 370, ++ 378, 407, 436, 444, 452, 460, 468, 476, ++ 484, 506, 528, 528, 528, 528, 528, 528, ++ 528, 550, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, 572, 572, 572, 572, 572, 572, 572, ++ 572, ++ }, ++ { /* Fourth byte 16-bit table 4. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 15, 30, 30, 30, 30, 30, 30, ++ 30, 45, 60, 60, 60, 60, 60, 60, ++ 60, 82, 104, 104, 104, 104, 104, 104, ++ 104, 104, 126, 126, 126, 126, 126, 126, ++ 126, 155, 184, 192, 200, 208, 216, 224, ++ 232, 261, 290, 298, 306, 314, 322, 330, ++ 338, 346, 346, 346, 346, 354, 354, 354, ++ 354, 354, 354, 354, 354, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, 362, 362, 362, 362, 362, 362, 362, ++ 362, ++ }, ++ }, ++}; ++ ++static const uchar_t u8_composition_final_tbl[2][6623] = { ++ { ++ 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAE, FIL_, ++ 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xA0, FIL_, ++ 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAF, FIL_, ++ 0x10, 0xCC, 0x86, FIL_, 0xC4, 0x82, FIL_, 0xCC, ++ 0x87, FIL_, 0xC8, 0xA6, FIL_, 0xCC, 0x8F, FIL_, ++ 0xC8, 0x80, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x82, ++ FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x81, FIL_, 0xCC, ++ 0x80, FIL_, 0xC3, 0x80, FIL_, 0xCC, 0x83, FIL_, ++ 0xC3, 0x83, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, ++ 0xA0, FIL_, 0xCC, 0xA5, FIL_, 0xE1, 0xB8, 0x80, ++ FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x82, FIL_, 0xCC, ++ 0x84, FIL_, 0xC4, 0x80, FIL_, 0xCC, 0x88, FIL_, ++ 0xC3, 0x84, FIL_, 0xCC, 0x8A, FIL_, 0xC3, 0x85, ++ FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x84, FIL_, 0xCC, ++ 0x89, FIL_, 0xE1, 0xBA, 0xA2, FIL_, 0xCC, 0x8C, ++ FIL_, 0xC7, 0x8D, FIL_, 0x03, 0xCC, 0x87, FIL_, ++ 0xE1, 0xB8, 0x82, FIL_, 0xCC, 0xB1, FIL_, 0xE1, ++ 0xB8, 0x86, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, ++ 0x84, FIL_, 0x05, 0xCC, 0xA7, FIL_, 0xC3, 0x87, ++ FIL_, 0xCC, 0x81, FIL_, 0xC4, 0x86, FIL_, 0xCC, ++ 0x8C, FIL_, 0xC4, 0x8C, FIL_, 0xCC, 0x87, FIL_, ++ 0xC4, 0x8A, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0x88, ++ FIL_, 0x06, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0x8E, ++ FIL_, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0x90, FIL_, ++ 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x92, FIL_, 0xCC, ++ 0x87, FIL_, 0xE1, 0xB8, 0x8A, FIL_, 0xCC, 0x8C, ++ FIL_, 0xC4, 0x8E, FIL_, 0xCC, 0xA3, FIL_, 0xE1, ++ 0xB8, 0x8C, FIL_, 0x11, 0xCC, 0x80, FIL_, 0xC3, ++ 0x88, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x89, FIL_, ++ 0xCC, 0x82, FIL_, 0xC3, 0x8A, FIL_, 0xCC, 0x88, ++ FIL_, 0xC3, 0x8B, FIL_, 0xCC, 0xA7, FIL_, 0xC8, ++ 0xA8, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x86, FIL_, ++ 0xCC, 0x8F, FIL_, 0xC8, 0x84, FIL_, 0xCC, 0x89, ++ FIL_, 0xE1, 0xBA, 0xBA, FIL_, 0xCC, 0xB0, FIL_, ++ 0xE1, 0xB8, 0x9A, FIL_, 0xCC, 0xAD, FIL_, 0xE1, ++ 0xB8, 0x98, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBA, ++ 0xBC, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0xB8, ++ FIL_, 0xCC, 0x84, FIL_, 0xC4, 0x92, FIL_, 0xCC, ++ 0x86, FIL_, 0xC4, 0x94, FIL_, 0xCC, 0x87, FIL_, ++ 0xC4, 0x96, FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x98, ++ FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x9A, FIL_, 0x01, ++ 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9E, FIL_, 0x07, ++ 0xCC, 0x8C, FIL_, 0xC7, 0xA6, FIL_, 0xCC, 0x87, ++ FIL_, 0xC4, 0xA0, FIL_, 0xCC, 0x84, FIL_, 0xE1, ++ 0xB8, 0xA0, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0x9C, ++ FIL_, 0xCC, 0x81, FIL_, 0xC7, 0xB4, FIL_, 0xCC, ++ 0xA7, 
FIL_, 0xC4, 0xA2, FIL_, 0xCC, 0x86, FIL_, ++ 0xC4, 0x9E, FIL_, 0x07, 0xCC, 0xAE, FIL_, 0xE1, ++ 0xB8, 0xAA, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB8, ++ 0xA2, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA6, ++ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0xA4, FIL_, ++ 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0xA8, FIL_, 0xCC, ++ 0x8C, FIL_, 0xC8, 0x9E, FIL_, 0xCC, 0x82, FIL_, ++ 0xC4, 0xA4, FIL_, 0x0F, 0xCC, 0x84, FIL_, 0xC4, ++ 0xAA, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x8C, FIL_, ++ 0xCC, 0xA8, FIL_, 0xC4, 0xAE, FIL_, 0xCC, 0x83, ++ FIL_, 0xC4, 0xA8, FIL_, 0xCC, 0x88, FIL_, 0xC3, ++ 0x8F, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x8D, FIL_, ++ 0xCC, 0x8F, FIL_, 0xC8, 0x88, FIL_, 0xCC, 0x86, ++ FIL_, 0xC4, 0xAC, FIL_, 0xCC, 0x91, FIL_, 0xC8, ++ 0x8A, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x8F, FIL_, ++ 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x88, FIL_, 0xCC, ++ 0x87, FIL_, 0xC4, 0xB0, FIL_, 0xCC, 0xA3, FIL_, ++ 0xE1, 0xBB, 0x8A, FIL_, 0xCC, 0xB0, FIL_, 0xE1, ++ 0xB8, 0xAC, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x8E, ++ FIL_, 0x01, 0xCC, 0x82, FIL_, 0xC4, 0xB4, FIL_, ++ 0x05, 0xCC, 0x8C, FIL_, 0xC7, 0xA8, FIL_, 0xCC, ++ 0xB1, FIL_, 0xE1, 0xB8, 0xB4, FIL_, 0xCC, 0x81, ++ FIL_, 0xE1, 0xB8, 0xB0, FIL_, 0xCC, 0xA7, FIL_, ++ 0xC4, 0xB6, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, ++ 0xB2, FIL_, 0x06, 0xCC, 0xA7, FIL_, 0xC4, 0xBB, ++ FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0xBD, FIL_, 0xCC, ++ 0xB1, FIL_, 0xE1, 0xB8, 0xBA, FIL_, 0xCC, 0xA3, ++ FIL_, 0xE1, 0xB8, 0xB6, FIL_, 0xCC, 0xAD, FIL_, ++ 0xE1, 0xB8, 0xBC, FIL_, 0xCC, 0x81, FIL_, 0xC4, ++ 0xB9, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xB8, ++ 0xBE, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x82, ++ FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x80, FIL_, ++ 0x09, 0xCC, 0x80, FIL_, 0xC7, 0xB8, FIL_, 0xCC, ++ 0xAD, FIL_, 0xE1, 0xB9, 0x8A, FIL_, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB9, 0x84, FIL_, 0xCC, 0xB1, FIL_, ++ 0xE1, 0xB9, 0x88, FIL_, 0xCC, 0x83, FIL_, 0xC3, ++ 0x91, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x86, ++ FIL_, 0xCC, 0x81, FIL_, 0xC5, 0x83, FIL_, 0xCC, ++ 0xA7, FIL_, 0xC5, 0x85, FIL_, 0xCC, 0x8C, FIL_, ++ 0xC5, 0x87, FIL_, 0x10, 0xCC, 0xA8, FIL_, 0xC7, ++ 0xAA, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x8E, FIL_, ++ 0xCC, 0x80, FIL_, 0xC3, 0x92, FIL_, 0xCC, 0x9B, ++ FIL_, 0xC6, 0xA0, FIL_, 0xCC, 0x8F, FIL_, 0xC8, ++ 0x8C, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x93, FIL_, ++ 0xCC, 0x87, FIL_, 0xC8, 0xAE, FIL_, 0xCC, 0x8C, ++ FIL_, 0xC7, 0x91, FIL_, 0xCC, 0xA3, FIL_, 0xE1, ++ 0xBB, 0x8C, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x94, ++ FIL_, 0xCC, 0x84, FIL_, 0xC5, 0x8C, FIL_, 0xCC, ++ 0x83, FIL_, 0xC3, 0x95, FIL_, 0xCC, 0x86, FIL_, ++ 0xC5, 0x8E, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x96, ++ FIL_, 0xCC, 0x8B, FIL_, 0xC5, 0x90, FIL_, 0xCC, ++ 0x89, FIL_, 0xE1, 0xBB, 0x8E, FIL_, 0x02, 0xCC, ++ 0x87, FIL_, 0xE1, 0xB9, 0x96, FIL_, 0xCC, 0x81, ++ FIL_, 0xE1, 0xB9, 0x94, FIL_, 0x08, 0xCC, 0x91, ++ FIL_, 0xC8, 0x92, FIL_, 0xCC, 0xA7, FIL_, 0xC5, ++ 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0x98, FIL_, ++ 0xCC, 0xB1, FIL_, 0xE1, 0xB9, 0x9E, FIL_, 0xCC, ++ 0xA3, FIL_, 0xE1, 0xB9, 0x9A, FIL_, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB9, 0x98, FIL_, 0xCC, 0x81, FIL_, ++ 0xC5, 0x94, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x90, ++ FIL_, 0x07, 0xCC, 0x81, FIL_, 0xC5, 0x9A, FIL_, ++ 0xCC, 0x82, FIL_, 0xC5, 0x9C, FIL_, 0xCC, 0xA7, ++ FIL_, 0xC5, 0x9E, FIL_, 0xCC, 0x8C, FIL_, 0xC5, ++ 0xA0, FIL_, 0xCC, 0xA6, FIL_, 0xC8, 0x98, FIL_, ++ 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA0, FIL_, 0xCC, ++ 0xA3, FIL_, 0xE1, 0xB9, 0xA2, FIL_, 0x07, 0xCC, ++ 0x8C, FIL_, 0xC5, 0xA4, FIL_, 0xCC, 0xB1, FIL_, ++ 0xE1, 0xB9, 0xAE, FIL_, 0xCC, 0xA6, FIL_, 0xC8, ++ 0x9A, FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0xA2, FIL_, ++ 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xAA, FIL_, 
0xCC, ++ 0xAD, FIL_, 0xE1, 0xB9, 0xB0, FIL_, 0xCC, 0xA3, ++ FIL_, 0xE1, 0xB9, 0xAC, FIL_, 0x13, 0xCC, 0xA8, ++ FIL_, 0xC5, 0xB2, FIL_, 0xCC, 0x83, FIL_, 0xC5, ++ 0xA8, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0xAA, FIL_, ++ 0xCC, 0x81, FIL_, 0xC3, 0x9A, FIL_, 0xCC, 0x86, ++ FIL_, 0xC5, 0xAC, FIL_, 0xCC, 0x8A, FIL_, 0xC5, ++ 0xAE, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x99, FIL_, ++ 0xCC, 0x91, FIL_, 0xC8, 0x96, FIL_, 0xCC, 0x8B, ++ FIL_, 0xC5, 0xB0, FIL_, 0xCC, 0xA4, FIL_, 0xE1, ++ 0xB9, 0xB2, FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB9, ++ 0xB4, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x94, FIL_, ++ 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB6, FIL_, 0xCC, ++ 0x9B, FIL_, 0xC6, 0xAF, FIL_, 0xCC, 0x82, FIL_, ++ 0xC3, 0x9B, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x9C, ++ FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x93, FIL_, 0xCC, ++ 0xA3, FIL_, 0xE1, 0xBB, 0xA4, FIL_, 0xCC, 0x89, ++ FIL_, 0xE1, 0xBB, 0xA6, FIL_, 0x02, 0xCC, 0x83, ++ FIL_, 0xE1, 0xB9, 0xBC, FIL_, 0xCC, 0xA3, FIL_, ++ 0xE1, 0xB9, 0xBE, FIL_, 0x06, 0xCC, 0x82, FIL_, ++ 0xC5, 0xB4, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xBA, ++ 0x84, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xBA, 0x86, ++ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x88, FIL_, ++ 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0x82, FIL_, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBA, 0x80, FIL_, 0x02, 0xCC, ++ 0x87, FIL_, 0xE1, 0xBA, 0x8A, FIL_, 0xCC, 0x88, ++ FIL_, 0xE1, 0xBA, 0x8C, FIL_, 0x09, 0xCC, 0x89, ++ FIL_, 0xE1, 0xBB, 0xB6, FIL_, 0xCC, 0x87, FIL_, ++ 0xE1, 0xBA, 0x8E, FIL_, 0xCC, 0xA3, FIL_, 0xE1, ++ 0xBB, 0xB4, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x9D, ++ FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xB2, FIL_, 0xCC, ++ 0x82, FIL_, 0xC5, 0xB6, FIL_, 0xCC, 0x88, FIL_, ++ 0xC5, 0xB8, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, ++ 0xB2, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xB8, ++ FIL_, 0x06, 0xCC, 0x87, FIL_, 0xC5, 0xBB, FIL_, ++ 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x92, FIL_, 0xCC, ++ 0x8C, FIL_, 0xC5, 0xBD, FIL_, 0xCC, 0xB1, FIL_, ++ 0xE1, 0xBA, 0x94, FIL_, 0xCC, 0x82, FIL_, 0xE1, ++ 0xBA, 0x90, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0xB9, ++ FIL_, 0x10, 0xCC, 0x8C, FIL_, 0xC7, 0x8E, FIL_, ++ 0xCC, 0x8F, FIL_, 0xC8, 0x81, FIL_, 0xCC, 0xA8, ++ FIL_, 0xC4, 0x85, FIL_, 0xCC, 0xA3, FIL_, 0xE1, ++ 0xBA, 0xA1, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x83, ++ FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA3, FIL_, ++ 0xCC, 0x84, FIL_, 0xC4, 0x81, FIL_, 0xCC, 0x91, ++ FIL_, 0xC8, 0x83, FIL_, 0xCC, 0x8A, FIL_, 0xC3, ++ 0xA5, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0xA4, FIL_, ++ 0xCC, 0x83, FIL_, 0xC3, 0xA3, FIL_, 0xCC, 0x82, ++ FIL_, 0xC3, 0xA2, FIL_, 0xCC, 0x81, FIL_, 0xC3, ++ 0xA1, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xA0, FIL_, ++ 0xCC, 0x87, FIL_, 0xC8, 0xA7, FIL_, 0xCC, 0xA5, ++ FIL_, 0xE1, 0xB8, 0x81, FIL_, 0x03, 0xCC, 0xB1, ++ FIL_, 0xE1, 0xB8, 0x87, FIL_, 0xCC, 0xA3, FIL_, ++ 0xE1, 0xB8, 0x85, FIL_, 0xCC, 0x87, FIL_, 0xE1, ++ 0xB8, 0x83, FIL_, 0x05, 0xCC, 0x87, FIL_, 0xC4, ++ 0x8B, FIL_, 0xCC, 0xA7, FIL_, 0xC3, 0xA7, FIL_, ++ 0xCC, 0x82, FIL_, 0xC4, 0x89, FIL_, 0xCC, 0x8C, ++ FIL_, 0xC4, 0x8D, FIL_, 0xCC, 0x81, FIL_, 0xC4, ++ 0x87, FIL_, 0x06, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, ++ 0x93, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x8B, ++ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0x8D, FIL_, ++ 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0x8F, FIL_, 0xCC, ++ 0xA7, FIL_, 0xE1, 0xB8, 0x91, FIL_, 0xCC, 0x8C, ++ FIL_, 0xC4, 0x8F, FIL_, 0x11, 0xCC, 0xA8, FIL_, ++ 0xC4, 0x99, FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x9B, ++ FIL_, 0xCC, 0x87, FIL_, 0xC4, 0x97, FIL_, 0xCC, ++ 0x88, FIL_, 0xC3, 0xAB, FIL_, 0xCC, 0xA3, FIL_, ++ 0xE1, 0xBA, 0xB9, FIL_, 0xCC, 0xB0, FIL_, 0xE1, ++ 0xB8, 0x9B, FIL_, 0xCC, 0x84, FIL_, 0xC4, 0x93, ++ FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x99, FIL_, ++ 0xCC, 0x83, FIL_, 0xE1, 
0xBA, 0xBD, FIL_, 0xCC, ++ 0x86, FIL_, 0xC4, 0x95, FIL_, 0xCC, 0xA7, FIL_, ++ 0xC8, 0xA9, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, ++ 0xBB, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x85, FIL_, ++ 0xCC, 0x81, FIL_, 0xC3, 0xA9, FIL_, 0xCC, 0x91, ++ FIL_, 0xC8, 0x87, FIL_, 0xCC, 0x80, FIL_, 0xC3, ++ 0xA8, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xAA, FIL_, ++ 0x01, 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9F, FIL_, ++ 0x07, 0xCC, 0x86, FIL_, 0xC4, 0x9F, FIL_, 0xCC, ++ 0xA7, FIL_, 0xC4, 0xA3, FIL_, 0xCC, 0x81, FIL_, ++ 0xC7, 0xB5, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0x9D, ++ FIL_, 0xCC, 0x87, FIL_, 0xC4, 0xA1, FIL_, 0xCC, ++ 0x8C, FIL_, 0xC7, 0xA7, FIL_, 0xCC, 0x84, FIL_, ++ 0xE1, 0xB8, 0xA1, FIL_, 0x08, 0xCC, 0x8C, FIL_, ++ 0xC8, 0x9F, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0xA5, ++ FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA7, FIL_, ++ 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0xA3, FIL_, 0xCC, ++ 0xB1, FIL_, 0xE1, 0xBA, 0x96, FIL_, 0xCC, 0xA3, ++ FIL_, 0xE1, 0xB8, 0xA5, FIL_, 0xCC, 0xA7, FIL_, ++ 0xE1, 0xB8, 0xA9, FIL_, 0xCC, 0xAE, FIL_, 0xE1, ++ 0xB8, 0xAB, FIL_, 0x0E, 0xCC, 0x81, FIL_, 0xC3, ++ 0xAD, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xAC, FIL_, ++ 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0x8B, FIL_, 0xCC, ++ 0x8C, FIL_, 0xC7, 0x90, FIL_, 0xCC, 0x89, FIL_, ++ 0xE1, 0xBB, 0x89, FIL_, 0xCC, 0x91, FIL_, 0xC8, ++ 0x8B, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x89, FIL_, ++ 0xCC, 0x82, FIL_, 0xC3, 0xAE, FIL_, 0xCC, 0xB0, ++ FIL_, 0xE1, 0xB8, 0xAD, FIL_, 0xCC, 0xA8, FIL_, ++ 0xC4, 0xAF, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0xAD, ++ FIL_, 0xCC, 0x84, FIL_, 0xC4, 0xAB, FIL_, 0xCC, ++ 0x83, FIL_, 0xC4, 0xA9, FIL_, 0xCC, 0x88, FIL_, ++ 0xC3, 0xAF, FIL_, 0x02, 0xCC, 0x82, FIL_, 0xC4, ++ 0xB5, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0xB0, FIL_, ++ 0x05, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0xB3, FIL_, ++ 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xB1, FIL_, 0xCC, ++ 0xA7, FIL_, 0xC4, 0xB7, FIL_, 0xCC, 0x8C, FIL_, ++ 0xC7, 0xA9, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, ++ 0xB5, FIL_, 0x06, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, ++ 0xB7, FIL_, 0xCC, 0x81, FIL_, 0xC4, 0xBA, FIL_, ++ 0xCC, 0xA7, FIL_, 0xC4, 0xBC, FIL_, 0xCC, 0x8C, ++ FIL_, 0xC4, 0xBE, FIL_, 0xCC, 0xB1, FIL_, 0xE1, ++ 0xB8, 0xBB, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, ++ 0xBD, FIL_, 0x03, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, ++ 0x83, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xBF, ++ FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x81, FIL_, ++ 0x09, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x87, FIL_, ++ 0xCC, 0x83, FIL_, 0xC3, 0xB1, FIL_, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB9, 0x85, FIL_, 0xCC, 0xB1, FIL_, ++ 0xE1, 0xB9, 0x89, FIL_, 0xCC, 0x81, FIL_, 0xC5, ++ 0x84, FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x86, FIL_, ++ 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0x8B, FIL_, 0xCC, ++ 0x8C, FIL_, 0xC5, 0x88, FIL_, 0xCC, 0x80, FIL_, ++ 0xC7, 0xB9, FIL_, 0x10, 0xCC, 0x89, FIL_, 0xE1, ++ 0xBB, 0x8F, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0xB3, ++ FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xB2, FIL_, 0xCC, ++ 0x87, FIL_, 0xC8, 0xAF, FIL_, 0xCC, 0x8F, FIL_, ++ 0xC8, 0x8D, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, ++ 0x8D, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0x8D, FIL_, ++ 0xCC, 0x8C, FIL_, 0xC7, 0x92, FIL_, 0xCC, 0x86, ++ FIL_, 0xC5, 0x8F, FIL_, 0xCC, 0x8B, FIL_, 0xC5, ++ 0x91, FIL_, 0xCC, 0x9B, FIL_, 0xC6, 0xA1, FIL_, ++ 0xCC, 0x91, FIL_, 0xC8, 0x8F, FIL_, 0xCC, 0xA8, ++ FIL_, 0xC7, 0xAB, FIL_, 0xCC, 0x88, FIL_, 0xC3, ++ 0xB6, FIL_, 0xCC, 0x83, FIL_, 0xC3, 0xB5, FIL_, ++ 0xCC, 0x82, FIL_, 0xC3, 0xB4, FIL_, 0x02, 0xCC, ++ 0x87, FIL_, 0xE1, 0xB9, 0x97, FIL_, 0xCC, 0x81, ++ FIL_, 0xE1, 0xB9, 0x95, FIL_, 0x08, 0xCC, 0xB1, ++ FIL_, 0xE1, 0xB9, 0x9F, FIL_, 0xCC, 0x87, FIL_, ++ 0xE1, 0xB9, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC5, ++ 0x95, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x91, FIL_, ++ 0xCC, 
0xA3, FIL_, 0xE1, 0xB9, 0x9B, FIL_, 0xCC, ++ 0x8C, FIL_, 0xC5, 0x99, FIL_, 0xCC, 0x91, FIL_, ++ 0xC8, 0x93, FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x97, ++ FIL_, 0x07, 0xCC, 0xA6, FIL_, 0xC8, 0x99, FIL_, ++ 0xCC, 0x8C, FIL_, 0xC5, 0xA1, FIL_, 0xCC, 0x81, ++ FIL_, 0xC5, 0x9B, FIL_, 0xCC, 0x87, FIL_, 0xE1, ++ 0xB9, 0xA1, FIL_, 0xCC, 0x82, FIL_, 0xC5, 0x9D, ++ FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x9F, FIL_, 0xCC, ++ 0xA3, FIL_, 0xE1, 0xB9, 0xA3, FIL_, 0x08, 0xCC, ++ 0x88, FIL_, 0xE1, 0xBA, 0x97, FIL_, 0xCC, 0xAD, ++ FIL_, 0xE1, 0xB9, 0xB1, FIL_, 0xCC, 0xB1, FIL_, ++ 0xE1, 0xB9, 0xAF, FIL_, 0xCC, 0xA3, FIL_, 0xE1, ++ 0xB9, 0xAD, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0xA5, ++ FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0xA3, FIL_, 0xCC, ++ 0x87, FIL_, 0xE1, 0xB9, 0xAB, FIL_, 0xCC, 0xA6, ++ FIL_, 0xC8, 0x9B, FIL_, 0x13, 0xCC, 0x81, FIL_, ++ 0xC3, 0xBA, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x97, ++ FIL_, 0xCC, 0x83, FIL_, 0xC5, 0xA9, FIL_, 0xCC, ++ 0x8F, FIL_, 0xC8, 0x95, FIL_, 0xCC, 0xA8, FIL_, ++ 0xC5, 0xB3, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xBB, ++ FIL_, 0xCC, 0x88, FIL_, 0xC3, 0xBC, FIL_, 0xCC, ++ 0x80, FIL_, 0xC3, 0xB9, FIL_, 0xCC, 0xA3, FIL_, ++ 0xE1, 0xBB, 0xA5, FIL_, 0xCC, 0xA4, FIL_, 0xE1, ++ 0xB9, 0xB3, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, ++ 0xA7, FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB9, 0xB5, ++ FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB7, FIL_, ++ 0xCC, 0x9B, FIL_, 0xC6, 0xB0, FIL_, 0xCC, 0x84, ++ FIL_, 0xC5, 0xAB, FIL_, 0xCC, 0x8B, FIL_, 0xC5, ++ 0xB1, FIL_, 0xCC, 0x86, FIL_, 0xC5, 0xAD, FIL_, ++ 0xCC, 0x8C, FIL_, 0xC7, 0x94, FIL_, 0xCC, 0x8A, ++ FIL_, 0xC5, 0xAF, FIL_, 0x02, 0xCC, 0x83, FIL_, ++ 0xE1, 0xB9, 0xBD, FIL_, 0xCC, 0xA3, FIL_, 0xE1, ++ 0xB9, 0xBF, FIL_, 0x07, 0xCC, 0x82, FIL_, 0xC5, ++ 0xB5, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0x81, ++ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0x83, FIL_, ++ 0xCC, 0x88, FIL_, 0xE1, 0xBA, 0x85, FIL_, 0xCC, ++ 0xA3, FIL_, 0xE1, 0xBA, 0x89, FIL_, 0xCC, 0x87, ++ FIL_, 0xE1, 0xBA, 0x87, FIL_, 0xCC, 0x8A, FIL_, ++ 0xE1, 0xBA, 0x98, FIL_, 0x02, 0xCC, 0x87, FIL_, ++ 0xE1, 0xBA, 0x8B, FIL_, 0xCC, 0x88, FIL_, 0xE1, ++ 0xBA, 0x8D, FIL_, 0x0A, 0xCC, 0x87, FIL_, 0xE1, ++ 0xBA, 0x8F, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, ++ 0xB9, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xB3, ++ FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0xB7, FIL_, ++ 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xB5, FIL_, 0xCC, ++ 0x82, FIL_, 0xC5, 0xB7, FIL_, 0xCC, 0x84, FIL_, ++ 0xC8, 0xB3, FIL_, 0xCC, 0x8A, FIL_, 0xE1, 0xBA, ++ 0x99, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0xBF, FIL_, ++ 0xCC, 0x81, FIL_, 0xC3, 0xBD, FIL_, 0x06, 0xCC, ++ 0x8C, FIL_, 0xC5, 0xBE, FIL_, 0xCC, 0x87, FIL_, ++ 0xC5, 0xBC, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xBA, ++ 0x95, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x93, ++ FIL_, 0xCC, 0x81, FIL_, 0xC5, 0xBA, FIL_, 0xCC, ++ 0x82, FIL_, 0xE1, 0xBA, 0x91, FIL_, 0x03, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBF, 0xAD, FIL_, 0xCD, 0x82, ++ FIL_, 0xE1, 0xBF, 0x81, FIL_, 0xCC, 0x81, FIL_, ++ 0xCE, 0x85, FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, ++ 0xBA, 0xA8, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBA, ++ 0xAA, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xA4, ++ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xA6, FIL_, ++ 0x01, 0xCC, 0x84, FIL_, 0xC7, 0x9E, FIL_, 0x01, ++ 0xCC, 0x81, FIL_, 0xC7, 0xBA, FIL_, 0x02, 0xCC, ++ 0x84, FIL_, 0xC7, 0xA2, FIL_, 0xCC, 0x81, FIL_, ++ 0xC7, 0xBC, FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, ++ 0xB8, 0x88, FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, ++ 0xBA, 0xBE, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, ++ 0x80, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0x84, ++ FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x82, FIL_, ++ 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xAE, FIL_, ++ 0x04, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0x96, 
FIL_, ++ 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x90, FIL_, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBB, 0x92, FIL_, 0xCC, 0x89, ++ FIL_, 0xE1, 0xBB, 0x94, FIL_, 0x03, 0xCC, 0x84, ++ FIL_, 0xC8, 0xAC, FIL_, 0xCC, 0x81, FIL_, 0xE1, ++ 0xB9, 0x8C, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB9, ++ 0x8E, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAA, ++ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xC7, 0xBE, FIL_, ++ 0x04, 0xCC, 0x80, FIL_, 0xC7, 0x9B, FIL_, 0xCC, ++ 0x84, FIL_, 0xC7, 0x95, FIL_, 0xCC, 0x8C, FIL_, ++ 0xC7, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC7, 0x97, ++ FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA9, ++ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xA7, FIL_, ++ 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xA5, FIL_, 0xCC, ++ 0x83, FIL_, 0xE1, 0xBA, 0xAB, FIL_, 0x01, 0xCC, ++ 0x84, FIL_, 0xC7, 0x9F, FIL_, 0x01, 0xCC, 0x81, ++ FIL_, 0xC7, 0xBB, FIL_, 0x02, 0xCC, 0x84, FIL_, ++ 0xC7, 0xA3, FIL_, 0xCC, 0x81, FIL_, 0xC7, 0xBD, ++ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x89, ++ FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x83, ++ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xBF, FIL_, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0x81, FIL_, 0xCC, ++ 0x83, FIL_, 0xE1, 0xBB, 0x85, FIL_, 0x01, 0xCC, ++ 0x81, FIL_, 0xE1, 0xB8, 0xAF, FIL_, 0x04, 0xCC, ++ 0x83, FIL_, 0xE1, 0xBB, 0x97, FIL_, 0xCC, 0x89, ++ FIL_, 0xE1, 0xBB, 0x95, FIL_, 0xCC, 0x80, FIL_, ++ 0xE1, 0xBB, 0x93, FIL_, 0xCC, 0x81, FIL_, 0xE1, ++ 0xBB, 0x91, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, ++ 0xB9, 0x8D, FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xAD, ++ FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB9, 0x8F, FIL_, ++ 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAB, FIL_, 0x01, ++ 0xCC, 0x81, FIL_, 0xC7, 0xBF, FIL_, 0x04, 0xCC, ++ 0x81, FIL_, 0xC7, 0x98, FIL_, 0xCC, 0x84, FIL_, ++ 0xC7, 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x9A, ++ FIL_, 0xCC, 0x80, FIL_, 0xC7, 0x9C, FIL_, 0x04, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xB0, FIL_, 0xCC, ++ 0x81, FIL_, 0xE1, 0xBA, 0xAE, FIL_, 0xCC, 0x83, ++ FIL_, 0xE1, 0xBA, 0xB4, FIL_, 0xCC, 0x89, FIL_, ++ 0xE1, 0xBA, 0xB2, FIL_, 0x04, 0xCC, 0x80, FIL_, ++ 0xE1, 0xBA, 0xB1, FIL_, 0xCC, 0x83, FIL_, 0xE1, ++ 0xBA, 0xB5, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, ++ 0xAF, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xB3, ++ FIL_, 0x02, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x96, ++ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x94, FIL_, ++ 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x95, FIL_, ++ 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x97, FIL_, 0x02, ++ 0xCC, 0x80, FIL_, 0xE1, 0xB9, 0x90, FIL_, 0xCC, ++ 0x81, FIL_, 0xE1, 0xB9, 0x92, FIL_, 0x02, 0xCC, ++ 0x80, FIL_, 0xE1, 0xB9, 0x91, FIL_, 0xCC, 0x81, ++ FIL_, 0xE1, 0xB9, 0x93, FIL_, 0x01, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB9, 0xA4, FIL_, 0x01, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB9, 0xA5, FIL_, 0x01, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB9, 0xA6, FIL_, 0x01, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB9, 0xA7, FIL_, 0x01, 0xCC, 0x81, ++ FIL_, 0xE1, 0xB9, 0xB8, FIL_, 0x01, 0xCC, 0x81, ++ FIL_, 0xE1, 0xB9, 0xB9, FIL_, 0x01, 0xCC, 0x88, ++ FIL_, 0xE1, 0xB9, 0xBA, FIL_, 0x01, 0xCC, 0x88, ++ FIL_, 0xE1, 0xB9, 0xBB, FIL_, 0x01, 0xCC, 0x87, ++ FIL_, 0xE1, 0xBA, 0x9B, FIL_, 0x05, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBB, 0x9C, FIL_, 0xCC, 0x81, FIL_, ++ 0xE1, 0xBB, 0x9A, FIL_, 0xCC, 0xA3, FIL_, 0xE1, ++ 0xBB, 0xA2, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, ++ 0xA0, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x9E, ++ FIL_, 0x05, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xA1, ++ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x9B, FIL_, ++ 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xA3, FIL_, 0xCC, ++ 0x89, FIL_, 0xE1, 0xBB, 0x9F, FIL_, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBB, 0x9D, FIL_, 0x05, 0xCC, 0x83, ++ FIL_, 0xE1, 0xBB, 0xAE, FIL_, 0xCC, 0xA3, FIL_, ++ 0xE1, 0xBB, 0xB0, FIL_, 0xCC, 0x89, FIL_, 0xE1, ++ 0xBB, 0xAC, FIL_, 0xCC, 
0x81, FIL_, 0xE1, 0xBB, ++ 0xA8, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xAA, ++ FIL_, 0x05, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xB1, ++ FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xAF, FIL_, ++ 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0xAD, FIL_, 0xCC, ++ 0x81, FIL_, 0xE1, 0xBB, 0xA9, FIL_, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBB, 0xAB, FIL_, 0x01, 0xCC, 0x8C, ++ FIL_, 0xC7, 0xAE, FIL_, 0x01, 0xCC, 0x84, FIL_, ++ 0xC7, 0xAC, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, ++ 0xAD, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA0, ++ FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA1, FIL_, ++ 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9C, FIL_, ++ 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9D, FIL_, ++ 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xB0, FIL_, 0x01, ++ 0xCC, 0x84, FIL_, 0xC8, 0xB1, FIL_, 0x01, 0xCC, ++ 0x8C, FIL_, 0xC7, 0xAF, FIL_, 0x07, 0xCC, 0x93, ++ FIL_, 0xE1, 0xBC, 0x88, FIL_, 0xCC, 0x94, FIL_, ++ 0xE1, 0xBC, 0x89, FIL_, 0xCC, 0x81, FIL_, 0xCE, ++ 0x86, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xBC, ++ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBE, 0xBA, FIL_, ++ 0xCC, 0x84, FIL_, 0xE1, 0xBE, 0xB9, FIL_, 0xCC, ++ 0x86, FIL_, 0xE1, 0xBE, 0xB8, FIL_, 0x04, 0xCC, ++ 0x81, FIL_, 0xCE, 0x88, FIL_, 0xCC, 0x94, FIL_, ++ 0xE1, 0xBC, 0x99, FIL_, 0xCC, 0x93, FIL_, 0xE1, ++ 0xBC, 0x98, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, ++ 0x88, FIL_, 0x05, 0xCC, 0x94, FIL_, 0xE1, 0xBC, ++ 0xA9, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x8A, ++ FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x89, FIL_, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBF, 0x8C, FIL_, 0xCC, 0x93, ++ FIL_, 0xE1, 0xBC, 0xA8, FIL_, 0x07, 0xCC, 0x81, ++ FIL_, 0xCE, 0x8A, FIL_, 0xCC, 0x88, FIL_, 0xCE, ++ 0xAA, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0x98, ++ FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xBF, 0x99, FIL_, ++ 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0xB8, FIL_, 0xCC, ++ 0x94, FIL_, 0xE1, 0xBC, 0xB9, FIL_, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBF, 0x9A, FIL_, 0x04, 0xCC, 0x94, ++ FIL_, 0xE1, 0xBD, 0x89, FIL_, 0xCC, 0x80, FIL_, ++ 0xE1, 0xBF, 0xB8, FIL_, 0xCC, 0x81, FIL_, 0xCE, ++ 0x8C, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, 0x88, ++ FIL_, 0x01, 0xCC, 0x94, FIL_, 0xE1, 0xBF, 0xAC, ++ FIL_, 0x06, 0xCC, 0x81, FIL_, 0xCE, 0x8E, FIL_, ++ 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0xA8, FIL_, 0xCC, ++ 0x94, FIL_, 0xE1, 0xBD, 0x99, FIL_, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBF, 0xAA, FIL_, 0xCC, 0x84, FIL_, ++ 0xE1, 0xBF, 0xA9, FIL_, 0xCC, 0x88, FIL_, 0xCE, ++ 0xAB, FIL_, 0x05, 0xCC, 0x80, FIL_, 0xE1, 0xBF, ++ 0xBA, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x8F, FIL_, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xBC, FIL_, 0xCC, ++ 0x94, FIL_, 0xE1, 0xBD, 0xA9, FIL_, 0xCC, 0x93, ++ FIL_, 0xE1, 0xBD, 0xA8, FIL_, 0x01, 0xCD, 0x85, ++ FIL_, 0xE1, 0xBE, 0xB4, FIL_, 0x01, 0xCD, 0x85, ++ FIL_, 0xE1, 0xBF, 0x84, FIL_, 0x08, 0xCC, 0x81, ++ FIL_, 0xCE, 0xAC, FIL_, 0xCC, 0x80, FIL_, 0xE1, ++ 0xBD, 0xB0, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, ++ 0x80, FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0x81, ++ FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBE, 0xB6, FIL_, ++ 0xCC, 0x86, FIL_, 0xE1, 0xBE, 0xB0, FIL_, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0xB3, FIL_, 0xCC, 0x84, ++ FIL_, 0xE1, 0xBE, 0xB1, FIL_, 0x04, 0xCC, 0x81, ++ FIL_, 0xCE, 0xAD, FIL_, 0xCC, 0x94, FIL_, 0xE1, ++ 0xBC, 0x91, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, ++ 0xB2, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0x90, ++ FIL_, 0x06, 0xCC, 0x81, FIL_, 0xCE, 0xAE, FIL_, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xB4, FIL_, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBF, 0x83, FIL_, 0xCD, 0x82, ++ FIL_, 0xE1, 0xBF, 0x86, FIL_, 0xCC, 0x94, FIL_, ++ 0xE1, 0xBC, 0xA1, FIL_, 0xCC, 0x93, FIL_, 0xE1, ++ 0xBC, 0xA0, FIL_, 0x08, 0xCD, 0x82, FIL_, 0xE1, ++ 0xBF, 0x96, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, ++ 0x90, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0xB0, ++ FIL_, 
0xCC, 0x81, FIL_, 0xCE, 0xAF, FIL_, 0xCC, ++ 0x94, FIL_, 0xE1, 0xBC, 0xB1, FIL_, 0xCC, 0x84, ++ FIL_, 0xE1, 0xBF, 0x91, FIL_, 0xCC, 0x88, FIL_, ++ 0xCF, 0x8A, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, ++ 0xB6, FIL_, 0x04, 0xCC, 0x81, FIL_, 0xCF, 0x8C, ++ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xB8, FIL_, ++ 0xCC, 0x93, FIL_, 0xE1, 0xBD, 0x80, FIL_, 0xCC, ++ 0x94, FIL_, 0xE1, 0xBD, 0x81, FIL_, 0x02, 0xCC, ++ 0x93, FIL_, 0xE1, 0xBF, 0xA4, FIL_, 0xCC, 0x94, ++ FIL_, 0xE1, 0xBF, 0xA5, FIL_, 0x08, 0xCC, 0x93, ++ FIL_, 0xE1, 0xBD, 0x90, FIL_, 0xCC, 0x94, FIL_, ++ 0xE1, 0xBD, 0x91, FIL_, 0xCC, 0x86, FIL_, 0xE1, ++ 0xBF, 0xA0, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, ++ 0xA6, FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xBF, 0xA1, ++ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xBA, FIL_, ++ 0xCC, 0x81, FIL_, 0xCF, 0x8D, FIL_, 0xCC, 0x88, ++ FIL_, 0xCF, 0x8B, FIL_, 0x06, 0xCC, 0x94, FIL_, ++ 0xE1, 0xBD, 0xA1, FIL_, 0xCD, 0x85, FIL_, 0xE1, ++ 0xBF, 0xB3, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, ++ 0xBC, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0xB6, ++ FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, 0xA0, FIL_, ++ 0xCC, 0x81, FIL_, 0xCF, 0x8E, FIL_, 0x03, 0xCD, ++ 0x82, FIL_, 0xE1, 0xBF, 0x97, FIL_, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBF, 0x92, FIL_, 0xCC, 0x81, FIL_, ++ 0xCE, 0x90, FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, ++ 0xBF, 0xA2, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xB0, ++ FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0xA7, FIL_, ++ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB4, FIL_, ++ 0x02, 0xCC, 0x88, FIL_, 0xCF, 0x94, FIL_, 0xCC, ++ 0x81, FIL_, 0xCF, 0x93, FIL_, 0x01, 0xCC, 0x88, ++ FIL_, 0xD0, 0x87, FIL_, 0x02, 0xCC, 0x86, FIL_, ++ 0xD3, 0x90, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x92, ++ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x83, FIL_, ++ 0x03, 0xCC, 0x86, FIL_, 0xD3, 0x96, FIL_, 0xCC, ++ 0x80, FIL_, 0xD0, 0x80, FIL_, 0xCC, 0x88, FIL_, ++ 0xD0, 0x81, FIL_, 0x02, 0xCC, 0x88, FIL_, 0xD3, ++ 0x9C, FIL_, 0xCC, 0x86, FIL_, 0xD3, 0x81, FIL_, ++ 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9E, FIL_, 0x04, ++ 0xCC, 0x80, FIL_, 0xD0, 0x8D, FIL_, 0xCC, 0x88, ++ FIL_, 0xD3, 0xA4, FIL_, 0xCC, 0x86, FIL_, 0xD0, ++ 0x99, FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xA2, FIL_, ++ 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x8C, FIL_, 0x01, ++ 0xCC, 0x88, FIL_, 0xD3, 0xA6, FIL_, 0x04, 0xCC, ++ 0x86, FIL_, 0xD0, 0x8E, FIL_, 0xCC, 0x8B, FIL_, ++ 0xD3, 0xB2, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0xB0, ++ FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xAE, FIL_, 0x01, ++ 0xCC, 0x88, FIL_, 0xD3, 0xB4, FIL_, 0x01, 0xCC, ++ 0x88, FIL_, 0xD3, 0xB8, FIL_, 0x01, 0xCC, 0x88, ++ FIL_, 0xD3, 0xAC, FIL_, 0x02, 0xCC, 0x86, FIL_, ++ 0xD3, 0x91, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x93, ++ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x93, FIL_, ++ 0x03, 0xCC, 0x80, FIL_, 0xD1, 0x90, FIL_, 0xCC, ++ 0x88, FIL_, 0xD1, 0x91, FIL_, 0xCC, 0x86, FIL_, ++ 0xD3, 0x97, FIL_, 0x02, 0xCC, 0x88, FIL_, 0xD3, ++ 0x9D, FIL_, 0xCC, 0x86, FIL_, 0xD3, 0x82, FIL_, ++ 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9F, FIL_, 0x04, ++ 0xCC, 0x88, FIL_, 0xD3, 0xA5, FIL_, 0xCC, 0x86, ++ FIL_, 0xD0, 0xB9, FIL_, 0xCC, 0x80, FIL_, 0xD1, ++ 0x9D, FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xA3, FIL_, ++ 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x9C, FIL_, 0x01, ++ 0xCC, 0x88, FIL_, 0xD3, 0xA7, FIL_, 0x04, 0xCC, ++ 0x84, FIL_, 0xD3, 0xAF, FIL_, 0xCC, 0x86, FIL_, ++ 0xD1, 0x9E, FIL_, 0xCC, 0x8B, FIL_, 0xD3, 0xB3, ++ FIL_, 0xCC, 0x88, FIL_, 0xD3, 0xB1, FIL_, 0x01, ++ 0xCC, 0x88, FIL_, 0xD3, 0xB5, FIL_, 0x01, 0xCC, ++ 0x88, FIL_, 0xD3, 0xB9, FIL_, 0x01, 0xCC, 0x88, ++ FIL_, 0xD3, 0xAD, FIL_, 0x01, 0xCC, 0x88, FIL_, ++ 0xD1, 0x97, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, ++ 0xB6, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, 0xB7, ++ FIL_, 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9A, 
FIL_, ++ 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9B, FIL_, 0x01, ++ 0xCC, 0x88, FIL_, 0xD3, 0xAA, FIL_, 0x01, 0xCC, ++ 0x88, FIL_, 0xD3, 0xAB, FIL_, 0x03, 0xD9, 0x94, ++ FIL_, 0xD8, 0xA3, FIL_, 0xD9, 0x93, FIL_, 0xD8, ++ 0xA2, FIL_, 0xD9, 0x95, FIL_, 0xD8, 0xA5, FIL_, ++ 0x01, 0xD9, 0x94, FIL_, 0xD8, 0xA4, FIL_, 0x01, ++ 0xD9, 0x94, FIL_, 0xD8, 0xA6, FIL_, 0x01, 0xD9, ++ 0x94, FIL_, 0xDB, 0x82, FIL_, 0x01, 0xD9, 0x94, ++ FIL_, 0xDB, 0x93, FIL_, 0x01, 0xD9, 0x94, FIL_, ++ 0xDB, 0x80, FIL_, 0x01, 0xE0, 0xA4, 0xBC, FIL_, ++ 0xE0, 0xA4, 0xA9, FIL_, 0x01, 0xE0, 0xA4, 0xBC, ++ FIL_, 0xE0, 0xA4, 0xB1, FIL_, 0x01, 0xE0, 0xA4, ++ 0xBC, FIL_, 0xE0, 0xA4, 0xB4, FIL_, 0x02, 0xE0, ++ 0xA6, 0xBE, FIL_, 0xE0, 0xA7, 0x8B, FIL_, 0xE0, ++ 0xA7, 0x97, FIL_, 0xE0, 0xA7, 0x8C, FIL_, 0x03, ++ 0xE0, 0xAD, 0x97, FIL_, 0xE0, 0xAD, 0x8C, FIL_, ++ 0xE0, 0xAC, 0xBE, FIL_, 0xE0, 0xAD, 0x8B, FIL_, ++ 0xE0, 0xAD, 0x96, FIL_, 0xE0, 0xAD, 0x88, FIL_, ++ 0x01, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAE, 0x94, ++ FIL_, 0x02, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, 0xAF, ++ 0x8A, FIL_, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAF, ++ 0x8C, FIL_, 0x01, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, ++ 0xAF, 0x8B, FIL_, 0x01, 0xE0, 0xB1, 0x96, FIL_, ++ 0xE0, 0xB1, 0x88, FIL_, 0x01, 0xE0, 0xB3, 0x95, ++ FIL_, 0xE0, 0xB3, 0x80, FIL_, 0x03, 0xE0, 0xB3, ++ 0x95, FIL_, 0xE0, 0xB3, 0x87, FIL_, 0xE0, 0xB3, ++ 0x82, FIL_, 0xE0, 0xB3, 0x8A, FIL_, 0xE0, 0xB3, ++ 0x96, FIL_, 0xE0, 0xB3, 0x88, FIL_, 0x01, 0xE0, ++ 0xB3, 0x95, FIL_, 0xE0, 0xB3, 0x8B, FIL_, 0x02, ++ 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8A, FIL_, ++ 0xE0, 0xB5, 0x97, FIL_, 0xE0, 0xB5, 0x8C, FIL_, ++ 0x01, 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8B, ++ FIL_, 0x03, 0xE0, 0xB7, 0x8F, FIL_, 0xE0, 0xB7, ++ 0x9C, FIL_, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, 0xB7, ++ 0x9A, FIL_, 0xE0, 0xB7, 0x9F, FIL_, 0xE0, 0xB7, ++ 0x9E, FIL_, 0x01, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, ++ 0xB7, 0x9D, FIL_, 0x01, 0xE1, 0x80, 0xAE, FIL_, ++ 0xE1, 0x80, 0xA6, FIL_, 0x01, 0xCC, 0x84, FIL_, ++ 0xE1, 0xB8, 0xB8, FIL_, 0x01, 0xCC, 0x84, FIL_, ++ 0xE1, 0xB8, 0xB9, FIL_, 0x01, 0xCC, 0x84, FIL_, ++ 0xE1, 0xB9, 0x9C, FIL_, 0x01, 0xCC, 0x84, FIL_, ++ 0xE1, 0xB9, 0x9D, FIL_, 0x01, 0xCC, 0x87, FIL_, ++ 0xE1, 0xB9, 0xA8, FIL_, 0x01, 0xCC, 0x87, FIL_, ++ 0xE1, 0xB9, 0xA9, FIL_, 0x02, 0xCC, 0x86, FIL_, ++ 0xE1, 0xBA, 0xB6, FIL_, 0xCC, 0x82, FIL_, 0xE1, ++ 0xBA, 0xAC, FIL_, 0x02, 0xCC, 0x86, FIL_, 0xE1, ++ 0xBA, 0xB7, FIL_, 0xCC, 0x82, FIL_, 0xE1, 0xBA, ++ 0xAD, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB, ++ 0x86, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB, ++ 0x87, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB, ++ 0x98, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB, ++ 0x99, FIL_, 0x04, 0xCC, 0x80, FIL_, 0xE1, 0xBC, ++ 0x82, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x84, ++ FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x80, FIL_, ++ 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0x86, FIL_, 0x04, ++ 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0x87, FIL_, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBC, 0x83, FIL_, 0xCC, 0x81, ++ FIL_, 0xE1, 0xBC, 0x85, FIL_, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x81, FIL_, 0x01, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x82, FIL_, 0x01, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x83, FIL_, 0x01, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x84, FIL_, 0x01, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x85, FIL_, 0x01, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x86, FIL_, 0x01, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x87, FIL_, 0x04, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x88, FIL_, 0xCC, 0x80, FIL_, 0xE1, ++ 0xBC, 0x8A, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, ++ 0x8E, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x8C, ++ FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x8D, ++ FIL_, 0xCC, 0x80, FIL_, 
0xE1, 0xBC, 0x8B, FIL_, ++ 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0x8F, FIL_, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0x89, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0x8A, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0x8B, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0x8C, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0x8D, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0x8E, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0x8F, FIL_, 0x02, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBC, 0x92, FIL_, 0xCC, 0x81, ++ FIL_, 0xE1, 0xBC, 0x94, FIL_, 0x02, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBC, 0x93, FIL_, 0xCC, 0x81, FIL_, ++ 0xE1, 0xBC, 0x95, FIL_, 0x02, 0xCC, 0x80, FIL_, ++ 0xE1, 0xBC, 0x9A, FIL_, 0xCC, 0x81, FIL_, 0xE1, ++ 0xBC, 0x9C, FIL_, 0x02, 0xCC, 0x80, FIL_, 0xE1, ++ 0xBC, 0x9B, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, ++ 0x9D, FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBC, ++ 0xA6, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x90, ++ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xA4, FIL_, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xA2, FIL_, 0x04, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xA3, FIL_, 0xCC, ++ 0x81, FIL_, 0xE1, 0xBC, 0xA5, FIL_, 0xCD, 0x82, ++ FIL_, 0xE1, 0xBC, 0xA7, FIL_, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x91, FIL_, 0x01, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x92, FIL_, 0x01, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x93, FIL_, 0x01, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x94, FIL_, 0x01, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x95, FIL_, 0x01, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x96, FIL_, 0x01, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0x97, FIL_, 0x04, 0xCD, 0x82, FIL_, ++ 0xE1, 0xBC, 0xAE, FIL_, 0xCC, 0x81, FIL_, 0xE1, ++ 0xBC, 0xAC, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, ++ 0x98, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xAA, ++ FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xAF, ++ FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x99, FIL_, ++ 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xAD, FIL_, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBC, 0xAB, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0x9A, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0x9B, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0x9C, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0x9D, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0x9E, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0x9F, FIL_, 0x03, 0xCC, ++ 0x81, FIL_, 0xE1, 0xBC, 0xB4, FIL_, 0xCD, 0x82, ++ FIL_, 0xE1, 0xBC, 0xB6, FIL_, 0xCC, 0x80, FIL_, ++ 0xE1, 0xBC, 0xB2, FIL_, 0x03, 0xCC, 0x81, FIL_, ++ 0xE1, 0xBC, 0xB5, FIL_, 0xCD, 0x82, FIL_, 0xE1, ++ 0xBC, 0xB7, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, ++ 0xB3, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xBC, ++ 0xBC, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xBA, ++ FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xBE, FIL_, ++ 0x03, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xBB, FIL_, ++ 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xBF, FIL_, 0xCC, ++ 0x81, FIL_, 0xE1, 0xBC, 0xBD, FIL_, 0x02, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBD, 0x82, FIL_, 0xCC, 0x81, ++ FIL_, 0xE1, 0xBD, 0x84, FIL_, 0x02, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBD, 0x83, FIL_, 0xCC, 0x81, FIL_, ++ 0xE1, 0xBD, 0x85, FIL_, 0x02, 0xCC, 0x81, FIL_, ++ 0xE1, 0xBD, 0x8C, FIL_, 0xCC, 0x80, FIL_, 0xE1, ++ 0xBD, 0x8A, FIL_, 0x02, 0xCC, 0x81, FIL_, 0xE1, ++ 0xBD, 0x8D, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, ++ 0x8B, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xBD, ++ 0x94, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0x96, ++ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x92, FIL_, ++ 0x03, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0x97, FIL_, ++ 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x95, FIL_, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBD, 0x93, FIL_, 0x03, 0xCC, ++ 0x81, FIL_, 0xE1, 0xBD, 0x9D, FIL_, 0xCD, 0x82, ++ FIL_, 0xE1, 0xBD, 0x9F, FIL_, 0xCC, 0x80, FIL_, ++ 0xE1, 0xBD, 0x9B, FIL_, 0x04, 0xCC, 0x81, FIL_, ++ 0xE1, 
0xBD, 0xA4, FIL_, 0xCC, 0x80, FIL_, 0xE1, ++ 0xBD, 0xA2, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, ++ 0xA6, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA0, ++ FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xA7, ++ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0xA5, FIL_, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA1, FIL_, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBD, 0xA3, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0xA2, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0xA3, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0xA4, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0xA5, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0xA6, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0xA7, FIL_, 0x04, 0xCC, ++ 0x81, FIL_, 0xE1, 0xBD, 0xAC, FIL_, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBD, 0xAA, FIL_, 0xCD, 0x82, FIL_, ++ 0xE1, 0xBD, 0xAE, FIL_, 0xCD, 0x85, FIL_, 0xE1, ++ 0xBE, 0xA8, FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, ++ 0xBD, 0xAD, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, ++ 0xA9, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xAF, ++ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xAB, FIL_, ++ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAA, FIL_, ++ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAB, FIL_, ++ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAC, FIL_, ++ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAD, FIL_, ++ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAE, FIL_, ++ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAF, FIL_, ++ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xB2, FIL_, ++ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0x82, FIL_, ++ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB2, FIL_, ++ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xB7, FIL_, ++ 0x03, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0x8F, FIL_, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x8D, FIL_, 0xCC, ++ 0x81, FIL_, 0xE1, 0xBF, 0x8E, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBF, 0x87, FIL_, 0x01, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBF, 0xB7, FIL_, 0x03, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBF, 0x9D, FIL_, 0xCD, 0x82, ++ FIL_, 0xE1, 0xBF, 0x9F, FIL_, 0xCC, 0x81, FIL_, ++ 0xE1, 0xBF, 0x9E, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x86, 0x9A, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x86, 0x9B, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x86, 0xAE, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x87, 0x8D, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x87, 0x8F, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x87, 0x8E, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x88, 0x84, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x88, 0x89, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x88, 0x8C, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x88, 0xA4, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x88, 0xA6, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x89, 0x81, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x89, 0x84, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x89, 0x87, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x89, 0x89, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x89, 0xAD, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x89, 0xA2, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x89, 0xB0, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x89, 0xB1, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x89, 0xB4, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x89, 0xB5, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x89, 0xB8, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x89, 0xB9, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8A, 0x80, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8A, 0x81, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8B, 0xA0, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8B, 0xA1, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8A, 0x84, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8A, 0x85, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8A, 0x88, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8A, 0x89, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8B, 0xA2, FIL_, 0x01, 0xCC, 0xB8, 
FIL_, ++ 0xE2, 0x8B, 0xA3, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8A, 0xAC, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8A, 0xAD, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8A, 0xAE, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8A, 0xAF, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8B, 0xAA, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8B, 0xAB, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8B, 0xAC, FIL_, 0x01, 0xCC, 0xB8, FIL_, ++ 0xE2, 0x8B, 0xAD, FIL_, 0x01, 0xE3, 0x82, 0x99, ++ FIL_, 0xE3, 0x82, 0x94, FIL_, 0x01, 0xE3, 0x82, ++ 0x99, FIL_, 0xE3, 0x81, 0x8C, FIL_, 0x01, 0xE3, ++ 0x82, 0x99, FIL_, 0xE3, 0x81, 0x8E, FIL_, 0x01, ++ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x90, FIL_, ++ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x92, ++ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, ++ 0x94, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, ++ 0x81, 0x96, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, ++ 0xE3, 0x81, 0x98, FIL_, 0x01, 0xE3, 0x82, 0x99, ++ FIL_, 0xE3, 0x81, 0x9A, FIL_, 0x01, 0xE3, 0x82, ++ 0x99, FIL_, 0xE3, 0x81, 0x9C, FIL_, 0x01, 0xE3, ++ 0x82, 0x99, FIL_, 0xE3, 0x81, 0x9E, FIL_, 0x01, ++ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xA0, FIL_, ++ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xA2, ++ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, ++ 0xA5, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, ++ 0x81, 0xA7, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, ++ 0xE3, 0x81, 0xA9, FIL_, 0x02, 0xE3, 0x82, 0x9A, ++ FIL_, 0xE3, 0x81, 0xB1, FIL_, 0xE3, 0x82, 0x99, ++ FIL_, 0xE3, 0x81, 0xB0, FIL_, 0x02, 0xE3, 0x82, ++ 0x9A, FIL_, 0xE3, 0x81, 0xB4, FIL_, 0xE3, 0x82, ++ 0x99, FIL_, 0xE3, 0x81, 0xB3, FIL_, 0x02, 0xE3, ++ 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xB7, FIL_, 0xE3, ++ 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB6, FIL_, 0x02, ++ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB9, FIL_, ++ 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xBA, FIL_, ++ 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xBC, ++ FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xBD, ++ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, ++ 0x9E, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, ++ 0x83, 0xB4, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, ++ 0xE3, 0x82, 0xAC, FIL_, 0x01, 0xE3, 0x82, 0x99, ++ FIL_, 0xE3, 0x82, 0xAE, FIL_, 0x01, 0xE3, 0x82, ++ 0x99, FIL_, 0xE3, 0x82, 0xB0, FIL_, 0x01, 0xE3, ++ 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB2, FIL_, 0x01, ++ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB4, FIL_, ++ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB6, ++ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, ++ 0xB8, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, ++ 0x82, 0xBA, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, ++ 0xE3, 0x82, 0xBC, FIL_, 0x01, 0xE3, 0x82, 0x99, ++ FIL_, 0xE3, 0x82, 0xBE, FIL_, 0x01, 0xE3, 0x82, ++ 0x99, FIL_, 0xE3, 0x83, 0x80, FIL_, 0x01, 0xE3, ++ 0x82, 0x99, FIL_, 0xE3, 0x83, 0x82, FIL_, 0x01, ++ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x85, FIL_, ++ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x87, ++ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, ++ 0x89, FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, ++ 0x83, 0x90, FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, ++ 0x83, 0x91, FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, ++ 0xE3, 0x83, 0x93, FIL_, 0xE3, 0x82, 0x9A, FIL_, ++ 0xE3, 0x83, 0x94, FIL_, 0x02, 0xE3, 0x82, 0x99, ++ FIL_, 0xE3, 0x83, 0x96, FIL_, 0xE3, 0x82, 0x9A, ++ FIL_, 0xE3, 0x83, 0x97, FIL_, 0x02, 0xE3, 0x82, ++ 0x9A, FIL_, 0xE3, 0x83, 0x9A, FIL_, 0xE3, 0x82, ++ 0x99, FIL_, 0xE3, 0x83, 0x99, FIL_, 0x02, 0xE3, ++ 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x9D, FIL_, 0xE3, ++ 0x82, 0x99, FIL_, 0xE3, 0x83, 0x9C, FIL_, 0x01, ++ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0xB7, FIL_, ++ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0xB8, ++ FIL_, 0x01, 0xE3, 0x82, 
0x99, FIL_, 0xE3, 0x83, ++ 0xB9, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, ++ 0x83, 0xBA, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, ++ 0xE3, 0x83, 0xBE, FIL_, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, ++ }, ++ { ++ 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAE, FIL_, ++ 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xA0, FIL_, ++ 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAF, FIL_, ++ 0x10, 0xCC, 0xA5, FIL_, 0xE1, 0xB8, 0x80, FIL_, ++ 0xCC, 0x87, FIL_, 0xC8, 0xA6, FIL_, 0xCC, 0x83, ++ FIL_, 0xC3, 0x83, FIL_, 0xCC, 0x91, FIL_, 0xC8, ++ 0x82, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x80, FIL_, ++ 0xCC, 0x8A, FIL_, 0xC3, 0x85, FIL_, 0xCC, 0x88, ++ FIL_, 0xC3, 0x84, FIL_, 0xCC, 0x89, FIL_, 0xE1, ++ 0xBA, 0xA2, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, ++ 0xA0, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x8D, FIL_, ++ 0xCC, 0x80, FIL_, 0xC3, 0x80, FIL_, 0xCC, 0x81, ++ FIL_, 0xC3, 0x81, FIL_, 0xCC, 0x82, FIL_, 0xC3, ++ 0x82, FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x84, FIL_, ++ 0xCC, 0x86, FIL_, 0xC4, 0x82, FIL_, 0xCC, 0x84, ++ FIL_, 0xC4, 0x80, FIL_, 0x03, 0xCC, 0xB1, FIL_, ++ 0xE1, 0xB8, 0x86, FIL_, 0xCC, 0x87, FIL_, 0xE1, ++ 0xB8, 0x82, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, ++ 0x84, FIL_, 0x05, 0xCC, 0xA7, FIL_, 0xC3, 0x87, ++ FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x8C, FIL_, 0xCC, ++ 0x81, FIL_, 0xC4, 0x86, FIL_, 0xCC, 0x82, FIL_, ++ 0xC4, 0x88, FIL_, 0xCC, 0x87, FIL_, 0xC4, 0x8A, ++ FIL_, 0x06, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0x90, ++ FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x8E, FIL_, 0xCC, ++ 0xB1, FIL_, 0xE1, 0xB8, 0x8E, FIL_, 0xCC, 0xAD, ++ FIL_, 0xE1, 0xB8, 0x92, FIL_, 0xCC, 0xA3, FIL_, ++ 0xE1, 0xB8, 0x8C, FIL_, 0xCC, 0x87, FIL_, 0xE1, ++ 0xB8, 0x8A, FIL_, 0x11, 0xCC, 0x84, FIL_, 0xC4, ++ 0x92, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x94, FIL_, ++ 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0xB8, FIL_, 0xCC, ++ 0x91, FIL_, 0xC8, 0x86, FIL_, 0xCC, 0x82, FIL_, ++ 0xC3, 0x8A, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x84, ++ FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x98, FIL_, ++ 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xBA, FIL_, 0xCC, ++ 0xA7, FIL_, 0xC8, 0xA8, FIL_, 0xCC, 0x8C, FIL_, ++ 0xC4, 0x9A, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x88, ++ FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x98, FIL_, 0xCC, ++ 0x83, FIL_, 0xE1, 0xBA, 0xBC, FIL_, 0xCC, 0x87, ++ FIL_, 0xC4, 0x96, FIL_, 0xCC, 0x81, FIL_, 0xC3, ++ 0x89, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x8B, FIL_, ++ 0xCC, 0xB0, FIL_, 0xE1, 0xB8, 0x9A, FIL_, 0x01, ++ 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9E, FIL_, 0x07, ++ 0xCC, 0x8C, FIL_, 0xC7, 0xA6, FIL_, 0xCC, 0x86, ++ FIL_, 0xC4, 0x9E, FIL_, 0xCC, 0x82, FIL_, 0xC4, ++ 0x9C, FIL_, 0xCC, 0xA7, FIL_, 0xC4, 0xA2, FIL_, ++ 0xCC, 0x84, FIL_, 0xE1, 0xB8, 0xA0, FIL_, 0xCC, ++ 0x81, FIL_, 0xC7, 0xB4, FIL_, 0xCC, 0x87, FIL_, ++ 0xC4, 0xA0, FIL_, 0x07, 0xCC, 0x87, FIL_, 0xE1, ++ 0xB8, 0xA2, FIL_, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, ++ 0xA8, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0xA4, FIL_, ++ 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA6, FIL_, 0xCC, ++ 0x8C, FIL_, 0xC8, 0x9E, FIL_, 0xCC, 0xAE, FIL_, ++ 0xE1, 0xB8, 0xAA, FIL_, 0xCC, 0xA3, FIL_, 0xE1, ++ 0xB8, 0xA4, FIL_, 0x0F, 0xCC, 0xB0, FIL_, 0xE1, ++ 0xB8, 0xAC, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x8F, ++ FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x8C, FIL_, 0xCC, ++ 0x89, FIL_, 0xE1, 0xBB, 0x88, FIL_, 0xCC, 0xA3, ++ FIL_, 0xE1, 0xBB, 0x8A, FIL_, 0xCC, 0x91, FIL_, ++ 0xC8, 0x8A, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x8F, ++ FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x8E, FIL_, 
0xCC, ++ 0x81, FIL_, 0xC3, 0x8D, FIL_, 0xCC, 0x83, FIL_, ++ 0xC4, 0xA8, FIL_, 0xCC, 0x87, FIL_, 0xC4, 0xB0, ++ FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x88, FIL_, 0xCC, ++ 0xA8, FIL_, 0xC4, 0xAE, FIL_, 0xCC, 0x86, FIL_, ++ 0xC4, 0xAC, FIL_, 0xCC, 0x84, FIL_, 0xC4, 0xAA, ++ FIL_, 0x01, 0xCC, 0x82, FIL_, 0xC4, 0xB4, FIL_, ++ 0x05, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xB0, FIL_, ++ 0xCC, 0x8C, FIL_, 0xC7, 0xA8, FIL_, 0xCC, 0xB1, ++ FIL_, 0xE1, 0xB8, 0xB4, FIL_, 0xCC, 0xA7, FIL_, ++ 0xC4, 0xB6, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, ++ 0xB2, FIL_, 0x06, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, ++ 0xB6, FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0xBD, FIL_, ++ 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0xBC, FIL_, 0xCC, ++ 0xB1, FIL_, 0xE1, 0xB8, 0xBA, FIL_, 0xCC, 0xA7, ++ FIL_, 0xC4, 0xBB, FIL_, 0xCC, 0x81, FIL_, 0xC4, ++ 0xB9, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xB8, ++ 0xBE, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x80, ++ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x82, FIL_, ++ 0x09, 0xCC, 0x83, FIL_, 0xC3, 0x91, FIL_, 0xCC, ++ 0x81, FIL_, 0xC5, 0x83, FIL_, 0xCC, 0xA7, FIL_, ++ 0xC5, 0x85, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0x87, ++ FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x84, FIL_, ++ 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x86, FIL_, 0xCC, ++ 0xB1, FIL_, 0xE1, 0xB9, 0x88, FIL_, 0xCC, 0xAD, ++ FIL_, 0xE1, 0xB9, 0x8A, FIL_, 0xCC, 0x80, FIL_, ++ 0xC7, 0xB8, FIL_, 0x10, 0xCC, 0x89, FIL_, 0xE1, ++ 0xBB, 0x8E, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0x8C, ++ FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x94, FIL_, 0xCC, ++ 0x86, FIL_, 0xC5, 0x8E, FIL_, 0xCC, 0x83, FIL_, ++ 0xC3, 0x95, FIL_, 0xCC, 0x8B, FIL_, 0xC5, 0x90, ++ FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x96, FIL_, 0xCC, ++ 0x9B, FIL_, 0xC6, 0xA0, FIL_, 0xCC, 0x91, FIL_, ++ 0xC8, 0x8E, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x91, ++ FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x8C, FIL_, 0xCC, ++ 0xA3, FIL_, 0xE1, 0xBB, 0x8C, FIL_, 0xCC, 0x80, ++ FIL_, 0xC3, 0x92, FIL_, 0xCC, 0xA8, FIL_, 0xC7, ++ 0xAA, FIL_, 0xCC, 0x87, FIL_, 0xC8, 0xAE, FIL_, ++ 0xCC, 0x81, FIL_, 0xC3, 0x93, FIL_, 0x02, 0xCC, ++ 0x87, FIL_, 0xE1, 0xB9, 0x96, FIL_, 0xCC, 0x81, ++ FIL_, 0xE1, 0xB9, 0x94, FIL_, 0x08, 0xCC, 0xA7, ++ FIL_, 0xC5, 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC5, ++ 0x98, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x92, FIL_, ++ 0xCC, 0x8F, FIL_, 0xC8, 0x90, FIL_, 0xCC, 0x81, ++ FIL_, 0xC5, 0x94, FIL_, 0xCC, 0x87, FIL_, 0xE1, ++ 0xB9, 0x98, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB9, ++ 0x9E, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x9A, ++ FIL_, 0x07, 0xCC, 0xA6, FIL_, 0xC8, 0x98, FIL_, ++ 0xCC, 0x81, FIL_, 0xC5, 0x9A, FIL_, 0xCC, 0x82, ++ FIL_, 0xC5, 0x9C, FIL_, 0xCC, 0xA7, FIL_, 0xC5, ++ 0x9E, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0xA0, FIL_, ++ 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA0, FIL_, 0xCC, ++ 0xA3, FIL_, 0xE1, 0xB9, 0xA2, FIL_, 0x07, 0xCC, ++ 0xA6, FIL_, 0xC8, 0x9A, FIL_, 0xCC, 0x87, FIL_, ++ 0xE1, 0xB9, 0xAA, FIL_, 0xCC, 0xA3, FIL_, 0xE1, ++ 0xB9, 0xAC, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB9, ++ 0xAE, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB0, ++ FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0xA2, FIL_, 0xCC, ++ 0x8C, FIL_, 0xC5, 0xA4, FIL_, 0x13, 0xCC, 0x8A, ++ FIL_, 0xC5, 0xAE, FIL_, 0xCC, 0x88, FIL_, 0xC3, ++ 0x9C, FIL_, 0xCC, 0x8B, FIL_, 0xC5, 0xB0, FIL_, ++ 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB6, FIL_, 0xCC, ++ 0xA8, FIL_, 0xC5, 0xB2, FIL_, 0xCC, 0x8C, FIL_, ++ 0xC7, 0x93, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x99, ++ FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x94, FIL_, 0xCC, ++ 0xA3, FIL_, 0xE1, 0xBB, 0xA4, FIL_, 0xCC, 0xA4, ++ FIL_, 0xE1, 0xB9, 0xB2, FIL_, 0xCC, 0x81, FIL_, ++ 0xC3, 0x9A, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x9B, ++ FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB9, 0xB4, FIL_, ++ 0xCC, 0x83, FIL_, 0xC5, 0xA8, FIL_, 0xCC, 0x89, ++ FIL_, 0xE1, 0xBB, 0xA6, 
FIL_, 0xCC, 0x84, FIL_, ++ 0xC5, 0xAA, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x96, ++ FIL_, 0xCC, 0x86, FIL_, 0xC5, 0xAC, FIL_, 0xCC, ++ 0x9B, FIL_, 0xC6, 0xAF, FIL_, 0x02, 0xCC, 0xA3, ++ FIL_, 0xE1, 0xB9, 0xBE, FIL_, 0xCC, 0x83, FIL_, ++ 0xE1, 0xB9, 0xBC, FIL_, 0x06, 0xCC, 0x88, FIL_, ++ 0xE1, 0xBA, 0x84, FIL_, 0xCC, 0x81, FIL_, 0xE1, ++ 0xBA, 0x82, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, ++ 0x80, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x88, ++ FIL_, 0xCC, 0x82, FIL_, 0xC5, 0xB4, FIL_, 0xCC, ++ 0x87, FIL_, 0xE1, 0xBA, 0x86, FIL_, 0x02, 0xCC, ++ 0x88, FIL_, 0xE1, 0xBA, 0x8C, FIL_, 0xCC, 0x87, ++ FIL_, 0xE1, 0xBA, 0x8A, FIL_, 0x09, 0xCC, 0x89, ++ FIL_, 0xE1, 0xBB, 0xB6, FIL_, 0xCC, 0xA3, FIL_, ++ 0xE1, 0xBB, 0xB4, FIL_, 0xCC, 0x80, FIL_, 0xE1, ++ 0xBB, 0xB2, FIL_, 0xCC, 0x88, FIL_, 0xC5, 0xB8, ++ FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x9D, FIL_, 0xCC, ++ 0x83, FIL_, 0xE1, 0xBB, 0xB8, FIL_, 0xCC, 0x87, ++ FIL_, 0xE1, 0xBA, 0x8E, FIL_, 0xCC, 0x84, FIL_, ++ 0xC8, 0xB2, FIL_, 0xCC, 0x82, FIL_, 0xC5, 0xB6, ++ FIL_, 0x06, 0xCC, 0x82, FIL_, 0xE1, 0xBA, 0x90, ++ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x92, FIL_, ++ 0xCC, 0xB1, FIL_, 0xE1, 0xBA, 0x94, FIL_, 0xCC, ++ 0x8C, FIL_, 0xC5, 0xBD, FIL_, 0xCC, 0x87, FIL_, ++ 0xC5, 0xBB, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0xB9, ++ FIL_, 0x10, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0xA1, ++ FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x85, FIL_, 0xCC, ++ 0x81, FIL_, 0xC3, 0xA1, FIL_, 0xCC, 0x82, FIL_, ++ 0xC3, 0xA2, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, ++ 0xA3, FIL_, 0xCC, 0x83, FIL_, 0xC3, 0xA3, FIL_, ++ 0xCC, 0x8C, FIL_, 0xC7, 0x8E, FIL_, 0xCC, 0x8A, ++ FIL_, 0xC3, 0xA5, FIL_, 0xCC, 0x88, FIL_, 0xC3, ++ 0xA4, FIL_, 0xCC, 0x87, FIL_, 0xC8, 0xA7, FIL_, ++ 0xCC, 0x91, FIL_, 0xC8, 0x83, FIL_, 0xCC, 0xA5, ++ FIL_, 0xE1, 0xB8, 0x81, FIL_, 0xCC, 0x84, FIL_, ++ 0xC4, 0x81, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x81, ++ FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x83, FIL_, 0xCC, ++ 0x80, FIL_, 0xC3, 0xA0, FIL_, 0x03, 0xCC, 0xA3, ++ FIL_, 0xE1, 0xB8, 0x85, FIL_, 0xCC, 0x87, FIL_, ++ 0xE1, 0xB8, 0x83, FIL_, 0xCC, 0xB1, FIL_, 0xE1, ++ 0xB8, 0x87, FIL_, 0x05, 0xCC, 0x87, FIL_, 0xC4, ++ 0x8B, FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x8D, FIL_, ++ 0xCC, 0x82, FIL_, 0xC4, 0x89, FIL_, 0xCC, 0x81, ++ FIL_, 0xC4, 0x87, FIL_, 0xCC, 0xA7, FIL_, 0xC3, ++ 0xA7, FIL_, 0x06, 0xCC, 0x87, FIL_, 0xE1, 0xB8, ++ 0x8B, FIL_, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0x91, ++ FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0x8F, FIL_, ++ 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0x8D, FIL_, 0xCC, ++ 0x8C, FIL_, 0xC4, 0x8F, FIL_, 0xCC, 0xAD, FIL_, ++ 0xE1, 0xB8, 0x93, FIL_, 0x11, 0xCC, 0x80, FIL_, ++ 0xC3, 0xA8, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0xA9, ++ FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xAA, FIL_, 0xCC, ++ 0x88, FIL_, 0xC3, 0xAB, FIL_, 0xCC, 0x84, FIL_, ++ 0xC4, 0x93, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x95, ++ FIL_, 0xCC, 0x87, FIL_, 0xC4, 0x97, FIL_, 0xCC, ++ 0xA8, FIL_, 0xC4, 0x99, FIL_, 0xCC, 0x8C, FIL_, ++ 0xC4, 0x9B, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x85, ++ FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x87, FIL_, 0xCC, ++ 0xA3, FIL_, 0xE1, 0xBA, 0xB9, FIL_, 0xCC, 0xA7, ++ FIL_, 0xC8, 0xA9, FIL_, 0xCC, 0x83, FIL_, 0xE1, ++ 0xBA, 0xBD, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, ++ 0xBB, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x99, ++ FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB8, 0x9B, FIL_, ++ 0x01, 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9F, FIL_, ++ 0x07, 0xCC, 0x86, FIL_, 0xC4, 0x9F, FIL_, 0xCC, ++ 0x87, FIL_, 0xC4, 0xA1, FIL_, 0xCC, 0x82, FIL_, ++ 0xC4, 0x9D, FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xB8, ++ 0xA1, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0xA7, FIL_, ++ 0xCC, 0xA7, FIL_, 0xC4, 0xA3, FIL_, 0xCC, 0x81, ++ FIL_, 0xC7, 0xB5, FIL_, 0x08, 0xCC, 0xA7, FIL_, ++ 0xE1, 
0xB8, 0xA9, FIL_, 0xCC, 0xB1, FIL_, 0xE1, ++ 0xBA, 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC8, 0x9F, ++ FIL_, 0xCC, 0xAE, FIL_, 0xE1, 0xB8, 0xAB, FIL_, ++ 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA7, FIL_, 0xCC, ++ 0xA3, FIL_, 0xE1, 0xB8, 0xA5, FIL_, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB8, 0xA3, FIL_, 0xCC, 0x82, FIL_, ++ 0xC4, 0xA5, FIL_, 0x0E, 0xCC, 0x88, FIL_, 0xC3, ++ 0xAF, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x89, ++ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0x8B, FIL_, ++ 0xCC, 0x82, FIL_, 0xC3, 0xAE, FIL_, 0xCC, 0x81, ++ FIL_, 0xC3, 0xAD, FIL_, 0xCC, 0x80, FIL_, 0xC3, ++ 0xAC, FIL_, 0xCC, 0x83, FIL_, 0xC4, 0xA9, FIL_, ++ 0xCC, 0x84, FIL_, 0xC4, 0xAB, FIL_, 0xCC, 0x86, ++ FIL_, 0xC4, 0xAD, FIL_, 0xCC, 0xA8, FIL_, 0xC4, ++ 0xAF, FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB8, 0xAD, ++ FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x90, FIL_, 0xCC, ++ 0x91, FIL_, 0xC8, 0x8B, FIL_, 0xCC, 0x8F, FIL_, ++ 0xC8, 0x89, FIL_, 0x02, 0xCC, 0x8C, FIL_, 0xC7, ++ 0xB0, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0xB5, FIL_, ++ 0x05, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0xB5, FIL_, ++ 0xCC, 0xA7, FIL_, 0xC4, 0xB7, FIL_, 0xCC, 0x8C, ++ FIL_, 0xC7, 0xA9, FIL_, 0xCC, 0x81, FIL_, 0xE1, ++ 0xB8, 0xB1, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, ++ 0xB3, FIL_, 0x06, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, ++ 0xB7, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0xBD, ++ FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0xBB, FIL_, ++ 0xCC, 0xA7, FIL_, 0xC4, 0xBC, FIL_, 0xCC, 0x81, ++ FIL_, 0xC4, 0xBA, FIL_, 0xCC, 0x8C, FIL_, 0xC4, ++ 0xBE, FIL_, 0x03, 0xCC, 0x87, FIL_, 0xE1, 0xB9, ++ 0x81, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x83, ++ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xBF, FIL_, ++ 0x09, 0xCC, 0x80, FIL_, 0xC7, 0xB9, FIL_, 0xCC, ++ 0xAD, FIL_, 0xE1, 0xB9, 0x8B, FIL_, 0xCC, 0x83, ++ FIL_, 0xC3, 0xB1, FIL_, 0xCC, 0x81, FIL_, 0xC5, ++ 0x84, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x87, ++ FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB9, 0x89, FIL_, ++ 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x85, FIL_, 0xCC, ++ 0xA7, FIL_, 0xC5, 0x86, FIL_, 0xCC, 0x8C, FIL_, ++ 0xC5, 0x88, FIL_, 0x10, 0xCC, 0xA3, FIL_, 0xE1, ++ 0xBB, 0x8D, FIL_, 0xCC, 0x87, FIL_, 0xC8, 0xAF, ++ FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xB2, FIL_, 0xCC, ++ 0x91, FIL_, 0xC8, 0x8F, FIL_, 0xCC, 0x89, FIL_, ++ 0xE1, 0xBB, 0x8F, FIL_, 0xCC, 0x88, FIL_, 0xC3, ++ 0xB6, FIL_, 0xCC, 0x83, FIL_, 0xC3, 0xB5, FIL_, ++ 0xCC, 0x81, FIL_, 0xC3, 0xB3, FIL_, 0xCC, 0x8C, ++ FIL_, 0xC7, 0x92, FIL_, 0xCC, 0xA8, FIL_, 0xC7, ++ 0xAB, FIL_, 0xCC, 0x9B, FIL_, 0xC6, 0xA1, FIL_, ++ 0xCC, 0x84, FIL_, 0xC5, 0x8D, FIL_, 0xCC, 0x86, ++ FIL_, 0xC5, 0x8F, FIL_, 0xCC, 0x8B, FIL_, 0xC5, ++ 0x91, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xB4, FIL_, ++ 0xCC, 0x8F, FIL_, 0xC8, 0x8D, FIL_, 0x02, 0xCC, ++ 0x87, FIL_, 0xE1, 0xB9, 0x97, FIL_, 0xCC, 0x81, ++ FIL_, 0xE1, 0xB9, 0x95, FIL_, 0x08, 0xCC, 0x8C, ++ FIL_, 0xC5, 0x99, FIL_, 0xCC, 0xA3, FIL_, 0xE1, ++ 0xB9, 0x9B, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0x95, ++ FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x97, FIL_, 0xCC, ++ 0xB1, FIL_, 0xE1, 0xB9, 0x9F, FIL_, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB9, 0x99, FIL_, 0xCC, 0x91, FIL_, ++ 0xC8, 0x93, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x91, ++ FIL_, 0x07, 0xCC, 0xA7, FIL_, 0xC5, 0x9F, FIL_, ++ 0xCC, 0x82, FIL_, 0xC5, 0x9D, FIL_, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB9, 0xA1, FIL_, 0xCC, 0xA6, FIL_, ++ 0xC8, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0x9B, ++ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0xA3, FIL_, ++ 0xCC, 0x8C, FIL_, 0xC5, 0xA1, FIL_, 0x08, 0xCC, ++ 0xA6, FIL_, 0xC8, 0x9B, FIL_, 0xCC, 0xAD, FIL_, ++ 0xE1, 0xB9, 0xB1, FIL_, 0xCC, 0xB1, FIL_, 0xE1, ++ 0xB9, 0xAF, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, ++ 0xAD, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xAB, ++ FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0xA5, FIL_, 
0xCC, ++ 0xA7, FIL_, 0xC5, 0xA3, FIL_, 0xCC, 0x88, FIL_, ++ 0xE1, 0xBA, 0x97, FIL_, 0x13, 0xCC, 0x8A, FIL_, ++ 0xC5, 0xAF, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x95, ++ FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x94, FIL_, 0xCC, ++ 0x80, FIL_, 0xC3, 0xB9, FIL_, 0xCC, 0x9B, FIL_, ++ 0xC6, 0xB0, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xBB, ++ FIL_, 0xCC, 0x81, FIL_, 0xC3, 0xBA, FIL_, 0xCC, ++ 0x88, FIL_, 0xC3, 0xBC, FIL_, 0xCC, 0x83, FIL_, ++ 0xC5, 0xA9, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, ++ 0xA7, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0xAB, FIL_, ++ 0xCC, 0x86, FIL_, 0xC5, 0xAD, FIL_, 0xCC, 0xAD, ++ FIL_, 0xE1, 0xB9, 0xB7, FIL_, 0xCC, 0x8B, FIL_, ++ 0xC5, 0xB1, FIL_, 0xCC, 0xA8, FIL_, 0xC5, 0xB3, ++ FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x97, FIL_, 0xCC, ++ 0xA4, FIL_, 0xE1, 0xB9, 0xB3, FIL_, 0xCC, 0xA3, ++ FIL_, 0xE1, 0xBB, 0xA5, FIL_, 0xCC, 0xB0, FIL_, ++ 0xE1, 0xB9, 0xB5, FIL_, 0x02, 0xCC, 0x83, FIL_, ++ 0xE1, 0xB9, 0xBD, FIL_, 0xCC, 0xA3, FIL_, 0xE1, ++ 0xB9, 0xBF, FIL_, 0x07, 0xCC, 0x8A, FIL_, 0xE1, ++ 0xBA, 0x98, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xBA, ++ 0x87, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0x83, ++ FIL_, 0xCC, 0x82, FIL_, 0xC5, 0xB5, FIL_, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBA, 0x81, FIL_, 0xCC, 0xA3, ++ FIL_, 0xE1, 0xBA, 0x89, FIL_, 0xCC, 0x88, FIL_, ++ 0xE1, 0xBA, 0x85, FIL_, 0x02, 0xCC, 0x87, FIL_, ++ 0xE1, 0xBA, 0x8B, FIL_, 0xCC, 0x88, FIL_, 0xE1, ++ 0xBA, 0x8D, FIL_, 0x0A, 0xCC, 0x87, FIL_, 0xE1, ++ 0xBA, 0x8F, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, ++ 0xB5, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0xB7, ++ FIL_, 0xCC, 0x8A, FIL_, 0xE1, 0xBA, 0x99, FIL_, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xB3, FIL_, 0xCC, ++ 0x83, FIL_, 0xE1, 0xBB, 0xB9, FIL_, 0xCC, 0x88, ++ FIL_, 0xC3, 0xBF, FIL_, 0xCC, 0x81, FIL_, 0xC3, ++ 0xBD, FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xB3, FIL_, ++ 0xCC, 0x82, FIL_, 0xC5, 0xB7, FIL_, 0x06, 0xCC, ++ 0xB1, FIL_, 0xE1, 0xBA, 0x95, FIL_, 0xCC, 0xA3, ++ FIL_, 0xE1, 0xBA, 0x93, FIL_, 0xCC, 0x82, FIL_, ++ 0xE1, 0xBA, 0x91, FIL_, 0xCC, 0x81, FIL_, 0xC5, ++ 0xBA, FIL_, 0xCC, 0x87, FIL_, 0xC5, 0xBC, FIL_, ++ 0xCC, 0x8C, FIL_, 0xC5, 0xBE, FIL_, 0x03, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBF, 0xAD, FIL_, 0xCD, 0x82, ++ FIL_, 0xE1, 0xBF, 0x81, FIL_, 0xCC, 0x81, FIL_, ++ 0xCE, 0x85, FIL_, 0x04, 0xCC, 0x83, FIL_, 0xE1, ++ 0xBA, 0xAA, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, ++ 0xA4, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA8, ++ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xA6, FIL_, ++ 0x01, 0xCC, 0x84, FIL_, 0xC7, 0x9E, FIL_, 0x01, ++ 0xCC, 0x81, FIL_, 0xC7, 0xBA, FIL_, 0x02, 0xCC, ++ 0x84, FIL_, 0xC7, 0xA2, FIL_, 0xCC, 0x81, FIL_, ++ 0xC7, 0xBC, FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, ++ 0xB8, 0x88, FIL_, 0x04, 0xCC, 0x83, FIL_, 0xE1, ++ 0xBB, 0x84, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, ++ 0x80, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x82, ++ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xBE, FIL_, ++ 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xAE, FIL_, ++ 0x04, 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x90, FIL_, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0x92, FIL_, 0xCC, ++ 0x89, FIL_, 0xE1, 0xBB, 0x94, FIL_, 0xCC, 0x83, ++ FIL_, 0xE1, 0xBB, 0x96, FIL_, 0x03, 0xCC, 0x84, ++ FIL_, 0xC8, 0xAC, FIL_, 0xCC, 0x88, FIL_, 0xE1, ++ 0xB9, 0x8E, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xB9, ++ 0x8C, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAA, ++ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xC7, 0xBE, FIL_, ++ 0x04, 0xCC, 0x80, FIL_, 0xC7, 0x9B, FIL_, 0xCC, ++ 0x84, FIL_, 0xC7, 0x95, FIL_, 0xCC, 0x8C, FIL_, ++ 0xC7, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC7, 0x97, ++ FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xA5, ++ FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBA, 0xAB, FIL_, ++ 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA9, FIL_, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBA, 
0xA7, FIL_, 0x01, 0xCC, ++ 0x84, FIL_, 0xC7, 0x9F, FIL_, 0x01, 0xCC, 0x81, ++ FIL_, 0xC7, 0xBB, FIL_, 0x02, 0xCC, 0x81, FIL_, ++ 0xC7, 0xBD, FIL_, 0xCC, 0x84, FIL_, 0xC7, 0xA3, ++ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x89, ++ FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x83, ++ FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0x85, FIL_, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0x81, FIL_, 0xCC, ++ 0x81, FIL_, 0xE1, 0xBA, 0xBF, FIL_, 0x01, 0xCC, ++ 0x81, FIL_, 0xE1, 0xB8, 0xAF, FIL_, 0x04, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBB, 0x93, FIL_, 0xCC, 0x81, ++ FIL_, 0xE1, 0xBB, 0x91, FIL_, 0xCC, 0x83, FIL_, ++ 0xE1, 0xBB, 0x97, FIL_, 0xCC, 0x89, FIL_, 0xE1, ++ 0xBB, 0x95, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, ++ 0xB9, 0x8D, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB9, ++ 0x8F, FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xAD, FIL_, ++ 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAB, FIL_, 0x01, ++ 0xCC, 0x81, FIL_, 0xC7, 0xBF, FIL_, 0x04, 0xCC, ++ 0x8C, FIL_, 0xC7, 0x9A, FIL_, 0xCC, 0x84, FIL_, ++ 0xC7, 0x96, FIL_, 0xCC, 0x80, FIL_, 0xC7, 0x9C, ++ FIL_, 0xCC, 0x81, FIL_, 0xC7, 0x98, FIL_, 0x04, ++ 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xAE, FIL_, 0xCC, ++ 0x83, FIL_, 0xE1, 0xBA, 0xB4, FIL_, 0xCC, 0x89, ++ FIL_, 0xE1, 0xBA, 0xB2, FIL_, 0xCC, 0x80, FIL_, ++ 0xE1, 0xBA, 0xB0, FIL_, 0x04, 0xCC, 0x83, FIL_, ++ 0xE1, 0xBA, 0xB5, FIL_, 0xCC, 0x80, FIL_, 0xE1, ++ 0xBA, 0xB1, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, ++ 0xAF, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xB3, ++ FIL_, 0x02, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x96, ++ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x94, FIL_, ++ 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x95, FIL_, ++ 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x97, FIL_, 0x02, ++ 0xCC, 0x80, FIL_, 0xE1, 0xB9, 0x90, FIL_, 0xCC, ++ 0x81, FIL_, 0xE1, 0xB9, 0x92, FIL_, 0x02, 0xCC, ++ 0x81, FIL_, 0xE1, 0xB9, 0x93, FIL_, 0xCC, 0x80, ++ FIL_, 0xE1, 0xB9, 0x91, FIL_, 0x01, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB9, 0xA4, FIL_, 0x01, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB9, 0xA5, FIL_, 0x01, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB9, 0xA6, FIL_, 0x01, 0xCC, 0x87, ++ FIL_, 0xE1, 0xB9, 0xA7, FIL_, 0x01, 0xCC, 0x81, ++ FIL_, 0xE1, 0xB9, 0xB8, FIL_, 0x01, 0xCC, 0x81, ++ FIL_, 0xE1, 0xB9, 0xB9, FIL_, 0x01, 0xCC, 0x88, ++ FIL_, 0xE1, 0xB9, 0xBA, FIL_, 0x01, 0xCC, 0x88, ++ FIL_, 0xE1, 0xB9, 0xBB, FIL_, 0x01, 0xCC, 0x87, ++ FIL_, 0xE1, 0xBA, 0x9B, FIL_, 0x05, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBB, 0x9C, FIL_, 0xCC, 0x89, FIL_, ++ 0xE1, 0xBB, 0x9E, FIL_, 0xCC, 0x83, FIL_, 0xE1, ++ 0xBB, 0xA0, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB, ++ 0x9A, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xA2, ++ FIL_, 0x05, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xA1, ++ FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xA3, FIL_, ++ 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x9B, FIL_, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBB, 0x9D, FIL_, 0xCC, 0x89, ++ FIL_, 0xE1, 0xBB, 0x9F, FIL_, 0x05, 0xCC, 0x81, ++ FIL_, 0xE1, 0xBB, 0xA8, FIL_, 0xCC, 0x80, FIL_, ++ 0xE1, 0xBB, 0xAA, FIL_, 0xCC, 0x89, FIL_, 0xE1, ++ 0xBB, 0xAC, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, ++ 0xAE, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xB0, ++ FIL_, 0x05, 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xAB, ++ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0xA9, FIL_, ++ 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xAF, FIL_, 0xCC, ++ 0xA3, FIL_, 0xE1, 0xBB, 0xB1, FIL_, 0xCC, 0x89, ++ FIL_, 0xE1, 0xBB, 0xAD, FIL_, 0x01, 0xCC, 0x8C, ++ FIL_, 0xC7, 0xAE, FIL_, 0x01, 0xCC, 0x84, FIL_, ++ 0xC7, 0xAC, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, ++ 0xAD, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA0, ++ FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA1, FIL_, ++ 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9C, FIL_, ++ 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9D, FIL_, ++ 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xB0, FIL_, 0x01, ++ 0xCC, 
0x84, FIL_, 0xC8, 0xB1, FIL_, 0x01, 0xCC, ++ 0x8C, FIL_, 0xC7, 0xAF, FIL_, 0x07, 0xCC, 0x93, ++ FIL_, 0xE1, 0xBC, 0x88, FIL_, 0xCC, 0x81, FIL_, ++ 0xCE, 0x86, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBE, ++ 0xB8, FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xBE, 0xB9, ++ FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0x89, FIL_, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xBC, FIL_, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBE, 0xBA, FIL_, 0x04, 0xCC, ++ 0x94, FIL_, 0xE1, 0xBC, 0x99, FIL_, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBF, 0x88, FIL_, 0xCC, 0x81, FIL_, ++ 0xCE, 0x88, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, ++ 0x98, FIL_, 0x05, 0xCD, 0x85, FIL_, 0xE1, 0xBF, ++ 0x8C, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x89, FIL_, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x8A, FIL_, 0xCC, ++ 0x93, FIL_, 0xE1, 0xBC, 0xA8, FIL_, 0xCC, 0x94, ++ FIL_, 0xE1, 0xBC, 0xA9, FIL_, 0x07, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBF, 0x9A, FIL_, 0xCC, 0x84, FIL_, ++ 0xE1, 0xBF, 0x99, FIL_, 0xCC, 0x93, FIL_, 0xE1, ++ 0xBC, 0xB8, FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBC, ++ 0xB9, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0x98, ++ FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x8A, FIL_, 0xCC, ++ 0x88, FIL_, 0xCE, 0xAA, FIL_, 0x04, 0xCC, 0x81, ++ FIL_, 0xCE, 0x8C, FIL_, 0xCC, 0x94, FIL_, 0xE1, ++ 0xBD, 0x89, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, ++ 0x88, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0xB8, ++ FIL_, 0x01, 0xCC, 0x94, FIL_, 0xE1, 0xBF, 0xAC, ++ FIL_, 0x06, 0xCC, 0x94, FIL_, 0xE1, 0xBD, 0x99, ++ FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0xA8, FIL_, ++ 0xCC, 0x88, FIL_, 0xCE, 0xAB, FIL_, 0xCC, 0x84, ++ FIL_, 0xE1, 0xBF, 0xA9, FIL_, 0xCC, 0x81, FIL_, ++ 0xCE, 0x8E, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, ++ 0xAA, FIL_, 0x05, 0xCC, 0x93, FIL_, 0xE1, 0xBD, ++ 0xA8, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xBC, ++ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0xBA, FIL_, ++ 0xCC, 0x94, FIL_, 0xE1, 0xBD, 0xA9, FIL_, 0xCC, ++ 0x81, FIL_, 0xCE, 0x8F, FIL_, 0x01, 0xCD, 0x85, ++ FIL_, 0xE1, 0xBE, 0xB4, FIL_, 0x01, 0xCD, 0x85, ++ FIL_, 0xE1, 0xBF, 0x84, FIL_, 0x08, 0xCD, 0x85, ++ FIL_, 0xE1, 0xBE, 0xB3, FIL_, 0xCC, 0x84, FIL_, ++ 0xE1, 0xBE, 0xB1, FIL_, 0xCC, 0x86, FIL_, 0xE1, ++ 0xBE, 0xB0, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, ++ 0xB0, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAC, FIL_, ++ 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0x81, FIL_, 0xCC, ++ 0x93, FIL_, 0xE1, 0xBC, 0x80, FIL_, 0xCD, 0x82, ++ FIL_, 0xE1, 0xBE, 0xB6, FIL_, 0x04, 0xCC, 0x93, ++ FIL_, 0xE1, 0xBC, 0x90, FIL_, 0xCC, 0x80, FIL_, ++ 0xE1, 0xBD, 0xB2, FIL_, 0xCC, 0x94, FIL_, 0xE1, ++ 0xBC, 0x91, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAD, ++ FIL_, 0x06, 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0xA1, ++ FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAE, FIL_, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBF, 0x83, FIL_, 0xCD, 0x82, ++ FIL_, 0xE1, 0xBF, 0x86, FIL_, 0xCC, 0x93, FIL_, ++ 0xE1, 0xBC, 0xA0, FIL_, 0xCC, 0x80, FIL_, 0xE1, ++ 0xBD, 0xB4, FIL_, 0x08, 0xCC, 0x88, FIL_, 0xCF, ++ 0x8A, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAF, FIL_, ++ 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0xB0, FIL_, 0xCC, ++ 0x94, FIL_, 0xE1, 0xBC, 0xB1, FIL_, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBD, 0xB6, FIL_, 0xCC, 0x86, FIL_, ++ 0xE1, 0xBF, 0x90, FIL_, 0xCC, 0x84, FIL_, 0xE1, ++ 0xBF, 0x91, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, ++ 0x96, FIL_, 0x04, 0xCC, 0x93, FIL_, 0xE1, 0xBD, ++ 0x80, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xB8, ++ FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBD, 0x81, FIL_, ++ 0xCC, 0x81, FIL_, 0xCF, 0x8C, FIL_, 0x02, 0xCC, ++ 0x93, FIL_, 0xE1, 0xBF, 0xA4, FIL_, 0xCC, 0x94, ++ FIL_, 0xE1, 0xBF, 0xA5, FIL_, 0x08, 0xCC, 0x81, ++ FIL_, 0xCF, 0x8D, FIL_, 0xCC, 0x94, FIL_, 0xE1, ++ 0xBD, 0x91, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, ++ 0xA6, FIL_, 0xCC, 0x88, FIL_, 0xCF, 0x8B, FIL_, ++ 0xCC, 0x84, FIL_, 0xE1, 0xBF, 0xA1, FIL_, 
0xCC, ++ 0x80, FIL_, 0xE1, 0xBD, 0xBA, FIL_, 0xCC, 0x93, ++ FIL_, 0xE1, 0xBD, 0x90, FIL_, 0xCC, 0x86, FIL_, ++ 0xE1, 0xBF, 0xA0, FIL_, 0x06, 0xCC, 0x80, FIL_, ++ 0xE1, 0xBD, 0xBC, FIL_, 0xCC, 0x94, FIL_, 0xE1, ++ 0xBD, 0xA1, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, ++ 0xA0, FIL_, 0xCC, 0x81, FIL_, 0xCF, 0x8E, FIL_, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB3, FIL_, 0xCD, ++ 0x82, FIL_, 0xE1, 0xBF, 0xB6, FIL_, 0x03, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBF, 0x92, FIL_, 0xCD, 0x82, ++ FIL_, 0xE1, 0xBF, 0x97, FIL_, 0xCC, 0x81, FIL_, ++ 0xCE, 0x90, FIL_, 0x03, 0xCD, 0x82, FIL_, 0xE1, ++ 0xBF, 0xA7, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, ++ 0xA2, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xB0, FIL_, ++ 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB4, FIL_, ++ 0x02, 0xCC, 0x88, FIL_, 0xCF, 0x94, FIL_, 0xCC, ++ 0x81, FIL_, 0xCF, 0x93, FIL_, 0x01, 0xCC, 0x88, ++ FIL_, 0xD0, 0x87, FIL_, 0x02, 0xCC, 0x88, FIL_, ++ 0xD3, 0x92, FIL_, 0xCC, 0x86, FIL_, 0xD3, 0x90, ++ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x83, FIL_, ++ 0x03, 0xCC, 0x88, FIL_, 0xD0, 0x81, FIL_, 0xCC, ++ 0x80, FIL_, 0xD0, 0x80, FIL_, 0xCC, 0x86, FIL_, ++ 0xD3, 0x96, FIL_, 0x02, 0xCC, 0x86, FIL_, 0xD3, ++ 0x81, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x9C, FIL_, ++ 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9E, FIL_, 0x04, ++ 0xCC, 0x84, FIL_, 0xD3, 0xA2, FIL_, 0xCC, 0x88, ++ FIL_, 0xD3, 0xA4, FIL_, 0xCC, 0x86, FIL_, 0xD0, ++ 0x99, FIL_, 0xCC, 0x80, FIL_, 0xD0, 0x8D, FIL_, ++ 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x8C, FIL_, 0x01, ++ 0xCC, 0x88, FIL_, 0xD3, 0xA6, FIL_, 0x04, 0xCC, ++ 0x8B, FIL_, 0xD3, 0xB2, FIL_, 0xCC, 0x88, FIL_, ++ 0xD3, 0xB0, FIL_, 0xCC, 0x86, FIL_, 0xD0, 0x8E, ++ FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xAE, FIL_, 0x01, ++ 0xCC, 0x88, FIL_, 0xD3, 0xB4, FIL_, 0x01, 0xCC, ++ 0x88, FIL_, 0xD3, 0xB8, FIL_, 0x01, 0xCC, 0x88, ++ FIL_, 0xD3, 0xAC, FIL_, 0x02, 0xCC, 0x86, FIL_, ++ 0xD3, 0x91, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x93, ++ FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x93, FIL_, ++ 0x03, 0xCC, 0x80, FIL_, 0xD1, 0x90, FIL_, 0xCC, ++ 0x86, FIL_, 0xD3, 0x97, FIL_, 0xCC, 0x88, FIL_, ++ 0xD1, 0x91, FIL_, 0x02, 0xCC, 0x86, FIL_, 0xD3, ++ 0x82, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x9D, FIL_, ++ 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9F, FIL_, 0x04, ++ 0xCC, 0x86, FIL_, 0xD0, 0xB9, FIL_, 0xCC, 0x88, ++ FIL_, 0xD3, 0xA5, FIL_, 0xCC, 0x84, FIL_, 0xD3, ++ 0xA3, FIL_, 0xCC, 0x80, FIL_, 0xD1, 0x9D, FIL_, ++ 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x9C, FIL_, 0x01, ++ 0xCC, 0x88, FIL_, 0xD3, 0xA7, FIL_, 0x04, 0xCC, ++ 0x8B, FIL_, 0xD3, 0xB3, FIL_, 0xCC, 0x84, FIL_, ++ 0xD3, 0xAF, FIL_, 0xCC, 0x86, FIL_, 0xD1, 0x9E, ++ FIL_, 0xCC, 0x88, FIL_, 0xD3, 0xB1, FIL_, 0x01, ++ 0xCC, 0x88, FIL_, 0xD3, 0xB5, FIL_, 0x01, 0xCC, ++ 0x88, FIL_, 0xD3, 0xB9, FIL_, 0x01, 0xCC, 0x88, ++ FIL_, 0xD3, 0xAD, FIL_, 0x01, 0xCC, 0x88, FIL_, ++ 0xD1, 0x97, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, ++ 0xB6, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, 0xB7, ++ FIL_, 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9A, FIL_, ++ 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9B, FIL_, 0x01, ++ 0xCC, 0x88, FIL_, 0xD3, 0xAA, FIL_, 0x01, 0xCC, ++ 0x88, FIL_, 0xD3, 0xAB, FIL_, 0x03, 0xD9, 0x94, ++ FIL_, 0xD8, 0xA3, FIL_, 0xD9, 0x95, FIL_, 0xD8, ++ 0xA5, FIL_, 0xD9, 0x93, FIL_, 0xD8, 0xA2, FIL_, ++ 0x01, 0xD9, 0x94, FIL_, 0xD8, 0xA4, FIL_, 0x01, ++ 0xD9, 0x94, FIL_, 0xD8, 0xA6, FIL_, 0x01, 0xD9, ++ 0x94, FIL_, 0xDB, 0x82, FIL_, 0x01, 0xD9, 0x94, ++ FIL_, 0xDB, 0x93, FIL_, 0x01, 0xD9, 0x94, FIL_, ++ 0xDB, 0x80, FIL_, 0x01, 0xE0, 0xA4, 0xBC, FIL_, ++ 0xE0, 0xA4, 0xA9, FIL_, 0x01, 0xE0, 0xA4, 0xBC, ++ FIL_, 0xE0, 0xA4, 0xB1, FIL_, 0x01, 0xE0, 0xA4, ++ 0xBC, FIL_, 0xE0, 0xA4, 0xB4, FIL_, 0x02, 0xE0, ++ 0xA6, 0xBE, FIL_, 0xE0, 
0xA7, 0x8B, FIL_, 0xE0, ++ 0xA7, 0x97, FIL_, 0xE0, 0xA7, 0x8C, FIL_, 0x03, ++ 0xE0, 0xAD, 0x96, FIL_, 0xE0, 0xAD, 0x88, FIL_, ++ 0xE0, 0xAC, 0xBE, FIL_, 0xE0, 0xAD, 0x8B, FIL_, ++ 0xE0, 0xAD, 0x97, FIL_, 0xE0, 0xAD, 0x8C, FIL_, ++ 0x01, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAE, 0x94, ++ FIL_, 0x02, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAF, ++ 0x8C, FIL_, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, 0xAF, ++ 0x8A, FIL_, 0x01, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, ++ 0xAF, 0x8B, FIL_, 0x01, 0xE0, 0xB1, 0x96, FIL_, ++ 0xE0, 0xB1, 0x88, FIL_, 0x01, 0xE0, 0xB3, 0x95, ++ FIL_, 0xE0, 0xB3, 0x80, FIL_, 0x03, 0xE0, 0xB3, ++ 0x82, FIL_, 0xE0, 0xB3, 0x8A, FIL_, 0xE0, 0xB3, ++ 0x96, FIL_, 0xE0, 0xB3, 0x88, FIL_, 0xE0, 0xB3, ++ 0x95, FIL_, 0xE0, 0xB3, 0x87, FIL_, 0x01, 0xE0, ++ 0xB3, 0x95, FIL_, 0xE0, 0xB3, 0x8B, FIL_, 0x02, ++ 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8A, FIL_, ++ 0xE0, 0xB5, 0x97, FIL_, 0xE0, 0xB5, 0x8C, FIL_, ++ 0x01, 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8B, ++ FIL_, 0x03, 0xE0, 0xB7, 0x9F, FIL_, 0xE0, 0xB7, ++ 0x9E, FIL_, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, 0xB7, ++ 0x9A, FIL_, 0xE0, 0xB7, 0x8F, FIL_, 0xE0, 0xB7, ++ 0x9C, FIL_, 0x01, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, ++ 0xB7, 0x9D, FIL_, 0x01, 0xE1, 0x80, 0xAE, FIL_, ++ 0xE1, 0x80, 0xA6, FIL_, 0x01, 0xE1, 0xAC, 0xB5, ++ FIL_, 0xE1, 0xAC, 0x86, FIL_, 0x01, 0xE1, 0xAC, ++ 0xB5, FIL_, 0xE1, 0xAC, 0x88, FIL_, 0x01, 0xE1, ++ 0xAC, 0xB5, FIL_, 0xE1, 0xAC, 0x8A, FIL_, 0x01, ++ 0xE1, 0xAC, 0xB5, FIL_, 0xE1, 0xAC, 0x8C, FIL_, ++ 0x01, 0xE1, 0xAC, 0xB5, FIL_, 0xE1, 0xAC, 0x8E, ++ FIL_, 0x01, 0xE1, 0xAC, 0xB5, FIL_, 0xE1, 0xAC, ++ 0x92, FIL_, 0x01, 0xE1, 0xAC, 0xB5, FIL_, 0xE1, ++ 0xAC, 0xBB, FIL_, 0x01, 0xE1, 0xAC, 0xB5, FIL_, ++ 0xE1, 0xAC, 0xBD, FIL_, 0x01, 0xE1, 0xAC, 0xB5, ++ FIL_, 0xE1, 0xAD, 0x80, FIL_, 0x01, 0xE1, 0xAC, ++ 0xB5, FIL_, 0xE1, 0xAD, 0x81, FIL_, 0x01, 0xE1, ++ 0xAC, 0xB5, FIL_, 0xE1, 0xAD, 0x83, FIL_, 0x01, ++ 0xCC, 0x84, FIL_, 0xE1, 0xB8, 0xB8, FIL_, 0x01, ++ 0xCC, 0x84, FIL_, 0xE1, 0xB8, 0xB9, FIL_, 0x01, ++ 0xCC, 0x84, FIL_, 0xE1, 0xB9, 0x9C, FIL_, 0x01, ++ 0xCC, 0x84, FIL_, 0xE1, 0xB9, 0x9D, FIL_, 0x01, ++ 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA8, FIL_, 0x01, ++ 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA9, FIL_, 0x02, ++ 0xCC, 0x86, FIL_, 0xE1, 0xBA, 0xB6, FIL_, 0xCC, ++ 0x82, FIL_, 0xE1, 0xBA, 0xAC, FIL_, 0x02, 0xCC, ++ 0x82, FIL_, 0xE1, 0xBA, 0xAD, FIL_, 0xCC, 0x86, ++ FIL_, 0xE1, 0xBA, 0xB7, FIL_, 0x01, 0xCC, 0x82, ++ FIL_, 0xE1, 0xBB, 0x86, FIL_, 0x01, 0xCC, 0x82, ++ FIL_, 0xE1, 0xBB, 0x87, FIL_, 0x01, 0xCC, 0x82, ++ FIL_, 0xE1, 0xBB, 0x98, FIL_, 0x01, 0xCC, 0x82, ++ FIL_, 0xE1, 0xBB, 0x99, FIL_, 0x04, 0xCD, 0x85, ++ FIL_, 0xE1, 0xBE, 0x80, FIL_, 0xCD, 0x82, FIL_, ++ 0xE1, 0xBC, 0x86, FIL_, 0xCC, 0x80, FIL_, 0xE1, ++ 0xBC, 0x82, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, ++ 0x84, FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBC, ++ 0x87, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x85, ++ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x83, FIL_, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x81, FIL_, 0x01, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x82, FIL_, 0x01, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x83, FIL_, 0x01, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x84, FIL_, 0x01, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x85, FIL_, 0x01, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x86, FIL_, 0x01, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x87, FIL_, 0x04, ++ 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x8C, FIL_, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBC, 0x8A, FIL_, 0xCD, 0x85, ++ FIL_, 0xE1, 0xBE, 0x88, FIL_, 0xCD, 0x82, FIL_, ++ 0xE1, 0xBC, 0x8E, FIL_, 0x04, 0xCC, 0x80, FIL_, ++ 0xE1, 0xBC, 0x8B, FIL_, 0xCD, 0x82, FIL_, 0xE1, ++ 0xBC, 0x8F, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, ++ 0x8D, 
FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x89, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8A, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8B, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8C, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8D, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8E, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8F, ++ FIL_, 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x92, ++ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x94, FIL_, ++ 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x93, FIL_, ++ 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x95, FIL_, 0x02, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x9A, FIL_, 0xCC, ++ 0x81, FIL_, 0xE1, 0xBC, 0x9C, FIL_, 0x02, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBC, 0x9B, FIL_, 0xCC, 0x81, ++ FIL_, 0xE1, 0xBC, 0x9D, FIL_, 0x04, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBC, 0xA2, FIL_, 0xCC, 0x81, FIL_, ++ 0xE1, 0xBC, 0xA4, FIL_, 0xCD, 0x82, FIL_, 0xE1, ++ 0xBC, 0xA6, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, ++ 0x90, FIL_, 0x04, 0xCD, 0x85, FIL_, 0xE1, 0xBE, ++ 0x91, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xA5, ++ FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xA7, FIL_, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xA3, FIL_, 0x01, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x92, FIL_, 0x01, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x93, FIL_, 0x01, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x94, FIL_, 0x01, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x95, FIL_, 0x01, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x96, FIL_, 0x01, ++ 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x97, FIL_, 0x04, ++ 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xAC, FIL_, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBC, 0xAA, FIL_, 0xCD, 0x85, ++ FIL_, 0xE1, 0xBE, 0x98, FIL_, 0xCD, 0x82, FIL_, ++ 0xE1, 0xBC, 0xAE, FIL_, 0x04, 0xCD, 0x82, FIL_, ++ 0xE1, 0xBC, 0xAF, FIL_, 0xCD, 0x85, FIL_, 0xE1, ++ 0xBE, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, ++ 0xAD, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xAB, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9A, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9B, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9C, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9D, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9E, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9F, ++ FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xB4, ++ FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xB2, FIL_, ++ 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xB6, FIL_, 0x03, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xB3, FIL_, 0xCD, ++ 0x82, FIL_, 0xE1, 0xBC, 0xB7, FIL_, 0xCC, 0x81, ++ FIL_, 0xE1, 0xBC, 0xB5, FIL_, 0x03, 0xCC, 0x81, ++ FIL_, 0xE1, 0xBC, 0xBC, FIL_, 0xCC, 0x80, FIL_, ++ 0xE1, 0xBC, 0xBA, FIL_, 0xCD, 0x82, FIL_, 0xE1, ++ 0xBC, 0xBE, FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, ++ 0xBC, 0xBB, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, ++ 0xBF, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xBD, ++ FIL_, 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x82, ++ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x84, FIL_, ++ 0x02, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x85, FIL_, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x83, FIL_, 0x02, ++ 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x8A, FIL_, 0xCC, ++ 0x81, FIL_, 0xE1, 0xBD, 0x8C, FIL_, 0x02, 0xCC, ++ 0x80, FIL_, 0xE1, 0xBD, 0x8B, FIL_, 0xCC, 0x81, ++ FIL_, 0xE1, 0xBD, 0x8D, FIL_, 0x03, 0xCD, 0x82, ++ FIL_, 0xE1, 0xBD, 0x96, FIL_, 0xCC, 0x80, FIL_, ++ 0xE1, 0xBD, 0x92, FIL_, 0xCC, 0x81, FIL_, 0xE1, ++ 0xBD, 0x94, FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, ++ 0xBD, 0x93, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, ++ 0x97, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x95, ++ FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x9B, ++ FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0x9F, FIL_, ++ 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x9D, FIL_, 0x04, ++ 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xA6, FIL_, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0xA0, FIL_, 0xCC, 
0x80, ++ FIL_, 0xE1, 0xBD, 0xA2, FIL_, 0xCC, 0x81, FIL_, ++ 0xE1, 0xBD, 0xA4, FIL_, 0x04, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0xA1, FIL_, 0xCD, 0x82, FIL_, 0xE1, ++ 0xBD, 0xA7, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, ++ 0xA5, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xA3, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA2, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA3, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA4, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA5, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA6, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA7, ++ FIL_, 0x04, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xAA, ++ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0xAC, FIL_, ++ 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xAE, FIL_, 0xCD, ++ 0x85, FIL_, 0xE1, 0xBE, 0xA8, FIL_, 0x04, 0xCD, ++ 0x82, FIL_, 0xE1, 0xBD, 0xAF, FIL_, 0xCC, 0x80, ++ FIL_, 0xE1, 0xBD, 0xAB, FIL_, 0xCD, 0x85, FIL_, ++ 0xE1, 0xBE, 0xA9, FIL_, 0xCC, 0x81, FIL_, 0xE1, ++ 0xBD, 0xAD, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, ++ 0xBE, 0xAA, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, ++ 0xBE, 0xAB, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, ++ 0xBE, 0xAC, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, ++ 0xBE, 0xAD, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, ++ 0xBE, 0xAE, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, ++ 0xBE, 0xAF, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, ++ 0xBE, 0xB2, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, ++ 0xBF, 0x82, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, ++ 0xBF, 0xB2, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, ++ 0xBE, 0xB7, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, ++ 0xBF, 0x8E, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, ++ 0x8D, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0x8F, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0x87, ++ FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB7, ++ FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x9D, ++ FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBF, 0x9E, FIL_, ++ 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0x9F, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x86, 0x9A, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x86, 0x9B, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x86, 0xAE, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x87, 0x8D, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x87, 0x8F, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x87, 0x8E, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0x84, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0x89, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0x8C, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0xA4, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0xA6, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x81, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x84, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x87, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x89, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAD, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xA2, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB0, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB1, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB4, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB5, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB8, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB9, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x80, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x81, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA0, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA1, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x84, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x85, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x88, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x89, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA2, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA3, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 
0x8A, 0xAC, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAD, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAE, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAF, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAA, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAB, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAC, FIL_, 0x01, ++ 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAD, FIL_, 0x01, ++ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0x94, FIL_, ++ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x8C, ++ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, ++ 0x8E, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, ++ 0x81, 0x90, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, ++ 0xE3, 0x81, 0x92, FIL_, 0x01, 0xE3, 0x82, 0x99, ++ FIL_, 0xE3, 0x81, 0x94, FIL_, 0x01, 0xE3, 0x82, ++ 0x99, FIL_, 0xE3, 0x81, 0x96, FIL_, 0x01, 0xE3, ++ 0x82, 0x99, FIL_, 0xE3, 0x81, 0x98, FIL_, 0x01, ++ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x9A, FIL_, ++ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x9C, ++ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, ++ 0x9E, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, ++ 0x81, 0xA0, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, ++ 0xE3, 0x81, 0xA2, FIL_, 0x01, 0xE3, 0x82, 0x99, ++ FIL_, 0xE3, 0x81, 0xA5, FIL_, 0x01, 0xE3, 0x82, ++ 0x99, FIL_, 0xE3, 0x81, 0xA7, FIL_, 0x01, 0xE3, ++ 0x82, 0x99, FIL_, 0xE3, 0x81, 0xA9, FIL_, 0x02, ++ 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xB1, FIL_, ++ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB0, FIL_, ++ 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB3, ++ FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xB4, ++ FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, ++ 0xB6, FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, ++ 0xB7, FIL_, 0x02, 0xE3, 0x82, 0x9A, FIL_, 0xE3, ++ 0x81, 0xBA, FIL_, 0xE3, 0x82, 0x99, FIL_, 0xE3, ++ 0x81, 0xB9, FIL_, 0x02, 0xE3, 0x82, 0x9A, FIL_, ++ 0xE3, 0x81, 0xBD, FIL_, 0xE3, 0x82, 0x99, FIL_, ++ 0xE3, 0x81, 0xBC, FIL_, 0x01, 0xE3, 0x82, 0x99, ++ FIL_, 0xE3, 0x82, 0x9E, FIL_, 0x01, 0xE3, 0x82, ++ 0x99, FIL_, 0xE3, 0x83, 0xB4, FIL_, 0x01, 0xE3, ++ 0x82, 0x99, FIL_, 0xE3, 0x82, 0xAC, FIL_, 0x01, ++ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xAE, FIL_, ++ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB0, ++ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, ++ 0xB2, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, ++ 0x82, 0xB4, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, ++ 0xE3, 0x82, 0xB6, FIL_, 0x01, 0xE3, 0x82, 0x99, ++ FIL_, 0xE3, 0x82, 0xB8, FIL_, 0x01, 0xE3, 0x82, ++ 0x99, FIL_, 0xE3, 0x82, 0xBA, FIL_, 0x01, 0xE3, ++ 0x82, 0x99, FIL_, 0xE3, 0x82, 0xBC, FIL_, 0x01, ++ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xBE, FIL_, ++ 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x80, ++ FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, ++ 0x82, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, ++ 0x83, 0x85, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, ++ 0xE3, 0x83, 0x87, FIL_, 0x01, 0xE3, 0x82, 0x99, ++ FIL_, 0xE3, 0x83, 0x89, FIL_, 0x02, 0xE3, 0x82, ++ 0x99, FIL_, 0xE3, 0x83, 0x90, FIL_, 0xE3, 0x82, ++ 0x9A, FIL_, 0xE3, 0x83, 0x91, FIL_, 0x02, 0xE3, ++ 0x82, 0x99, FIL_, 0xE3, 0x83, 0x93, FIL_, 0xE3, ++ 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x94, FIL_, 0x02, ++ 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x97, FIL_, ++ 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x96, FIL_, ++ 0x02, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x9A, ++ FIL_, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x99, ++ FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, ++ 0x9C, FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x83, ++ 0x9D, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, ++ 0x83, 0xB7, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, ++ 0xE3, 0x83, 0xB8, FIL_, 0x01, 0xE3, 0x82, 0x99, ++ FIL_, 0xE3, 0x83, 0xB9, FIL_, 0x01, 0xE3, 0x82, ++ 0x99, 
FIL_, 0xE3, 0x83, 0xBA, FIL_, 0x01, 0xE3, ++ 0x82, 0x99, FIL_, 0xE3, 0x83, 0xBE, FIL_, ++ }, ++}; ++ ++static const uchar_t u8_decomp_b2_tbl[2][2][256] = { ++ { ++ { ++ 0, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 1, 2, 3, 4, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, 5, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, 6, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, 7, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ ++ }, ++ { ++ { ++ 0, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 1, 2, 3, 4, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, 5, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, 
++ { ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, 6, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, 7, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ ++ }, ++ ++}; ++ ++static const u8_displacement_t u8_decomp_b3_tbl[2][8][256] = { ++ { ++ { /* Third byte table 0. */ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, 
++ { N_, 0 }, { N_, 0 }, { 0, 0 }, ++ { 1, 35 }, { 2, 247 }, { 3, 474 }, ++ { 4, 693 }, { 5, 709 }, { 6, 951 }, ++ { N_, 0 }, { 7, 1139 }, { 8, 1152 }, ++ { N_, 0 }, { 9, 1177 }, { 10, 1199 }, ++ { 11, 1295 }, { 12, 1360 }, { 13, 1405 }, ++ { N_, 0 }, { 14, 1450 }, { N_, 0 }, ++ { N_, 0 }, { 15, 1620 }, { N_, 0 }, ++ { 16, 1624 }, { 17, 1649 }, { N_, 0 }, ++ { 18, 1665 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 1. */ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 19, 1680 }, ++ { 20, 1701 }, { N_, 0 }, { 21, 1757 }, ++ { 22, 1792 }, { 23, 1806 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 24, 1834 }, ++ { 25, 1869 }, { 26, 1876 }, { N_, 0 }, ++ { 27, 1897 }, { N_, 0 }, { 28, 1904 }, ++ { N_, 0 }, { 29, 1942 }, { N_, 0 }, ++ { 30, 1963 }, { 31, 1994 }, { N_, 0 }, ++ { 32, 2000 }, { 33, 2006 }, { 34, 2018 }, ++ { 35, 2021 }, { 36, 2109 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 
N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 2. */ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 37, 2158 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 0x8000, 2165 }, { 0x8001, 2445 }, ++ { 0x8002, 2741 }, { 0x8003, 3029 }, { 0x8004, 3337 }, ++ { 0x8005, 3725 }, { 0x8006, 4053 }, { 0x8007, 4536 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 
N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 3. */ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 38, 4895 }, ++ { 39, 4964 }, { 40, 4999 }, { N_, 0 }, ++ { 41, 5018 }, { 42, 5098 }, { 43, 5230 }, ++ { 44, 5248 }, { 45, 5266 }, { 46, 5326 }, ++ { 47, 5410 }, { 48, 5470 }, { 49, 5518 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 50, 5526 }, { 51, 5596 }, ++ { 52, 5767 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 53, 5810 }, { 54, 5822 }, { N_, 0 }, ++ { 55, 5830 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 56, 5836 }, { 57, 5839 }, { 58, 5842 }, ++ { 59, 6034 }, { 60, 6226 }, { 61, 6418 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 4. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 62, 6484 }, ++ { 63, 6497 }, { 64, 6672 }, { 65, 6770 }, ++ { 66, 6923 }, { 67, 6968 }, { 68, 7160 }, ++ { N_, 0 }, { 0x8008, 7247 }, { 69, 7597 }, ++ { 70, 7773 }, { 71, 7950 }, { 0x8009, 8142 }, ++ { 0x800A, 8919 }, { 72, 9351 }, { 73, 9522 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 5. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 0x800B, 9743 }, ++ { 0x800C, 9999 }, { 0x800D, 10255 }, { 0x800E, 10511 }, ++ { 74, 10767 }, { 75, 10967 }, { N_, 0 }, ++ { N_, 0 }, { 76, 11139 }, { 77, 11303 }, ++ { 78, 11468 }, { 79, 11576 }, { 0x800F, 11740 }, ++ { 0x8010, 12006 }, { 0x8011, 12280 }, { 0x8012, 12546 }, ++ { 80, 12812 }, { 0x8013, 13060 }, { 0x8014, 13348 }, ++ { 81, 13720 }, { 82, 13898 }, { 83, 13933 }, ++ { 84, 14045 }, { 85, 14197 }, { 86, 14347 }, ++ { 87, 14410 }, { 88, 14540 }, { 89, 14729 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 6. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 90, 14829 }, { 91, 14912 }, ++ { 92, 14969 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 93, 14982 }, { 94, 15046 }, { 95, 15109 }, ++ { 96, 15163 }, { 97, 15225 }, { 98, 15282 }, ++ { 99, 15341 }, { 100, 15405 }, { 101, 15469 }, ++ { 102, 15533 }, { 103, 15597 }, { 104, 15681 }, ++ { 105, 15812 }, { 106, 15942 }, { 107, 16072 }, ++ { 108, 16202 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 7. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 0x8015, 16273 }, { 0x8016, 16536 }, ++ { 0x8017, 16799 }, { 0x8018, 17064 }, { 0x8019, 17329 }, ++ { 0x801A, 17601 }, { 0x801B, 17878 }, { 0x801C, 18147 }, ++ { 109, 18419 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ }, ++ { ++ { /* Third byte table 0. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 0, 0 }, ++ { 1, 35 }, { 2, 247 }, { 3, 474 }, ++ { 4, 693 }, { 5, 709 }, { 6, 951 }, ++ { N_, 0 }, { 7, 1139 }, { 8, 1152 }, ++ { N_, 0 }, { 9, 1177 }, { 10, 1199 }, ++ { 11, 1295 }, { 12, 1362 }, { 13, 1407 }, ++ { N_, 0 }, { 14, 1452 }, { N_, 0 }, ++ { N_, 0 }, { 15, 1622 }, { N_, 0 }, ++ { 16, 1626 }, { 17, 1651 }, { N_, 0 }, ++ { 18, 1667 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 1. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 19, 1682 }, ++ { 20, 1703 }, { N_, 0 }, { 21, 1759 }, ++ { 22, 1794 }, { 23, 1808 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 24, 1836 }, ++ { 25, 1871 }, { 26, 1878 }, { N_, 0 }, ++ { 27, 1899 }, { N_, 0 }, { 28, 1906 }, ++ { N_, 0 }, { 29, 1944 }, { N_, 0 }, ++ { 30, 1965 }, { 31, 1996 }, { N_, 0 }, ++ { 32, 2002 }, { 33, 2008 }, { 34, 2020 }, ++ { 35, 2023 }, { 36, 2111 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 2. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 37, 2160 }, ++ { N_, 0 }, { N_, 0 }, { 38, 2167 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 39, 2170 }, { 40, 2226 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 41, 2247 }, { 42, 2268 }, { 43, 2340 }, ++ { N_, 0 }, { 0x8000, 2414 }, { 0x8001, 2694 }, ++ { 0x8002, 2990 }, { 0x8003, 3278 }, { 0x8004, 3586 }, ++ { 0x8005, 3974 }, { 0x8006, 4302 }, { 0x8007, 4785 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 3. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 44, 5144 }, ++ { 45, 5213 }, { 46, 5248 }, { N_, 0 }, ++ { 47, 5273 }, { 48, 5358 }, { 49, 5490 }, ++ { 50, 5508 }, { 51, 5526 }, { 52, 5586 }, ++ { 53, 5670 }, { 54, 5730 }, { 55, 5778 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 56, 5786 }, { 57, 5856 }, ++ { 58, 6027 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 59, 6070 }, { 60, 6082 }, { N_, 0 }, ++ { 61, 6090 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 62, 6096 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 63, 6099 }, { 64, 6102 }, { 65, 6105 }, ++ { 66, 6297 }, { 67, 6489 }, { 68, 6681 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 4. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 69, 6747 }, ++ { 70, 6760 }, { 71, 6935 }, { 72, 7033 }, ++ { 73, 7186 }, { 74, 7231 }, { 75, 7423 }, ++ { N_, 0 }, { 0x8008, 7510 }, { 76, 7891 }, ++ { 77, 8103 }, { 78, 8280 }, { 0x8009, 8482 }, ++ { 0x800A, 9259 }, { 79, 9701 }, { 80, 9872 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 5. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 0x800B, 10106 }, ++ { 0x800C, 10362 }, { 0x800D, 10618 }, { 0x800E, 10874 }, ++ { 81, 11130 }, { 82, 11330 }, { 0x800F, 11566 }, ++ { 83, 11822 }, { 84, 11932 }, { 85, 12096 }, ++ { 86, 12261 }, { 87, 12369 }, { 0x8010, 12533 }, ++ { 0x8011, 12799 }, { 0x8012, 13073 }, { 0x8013, 13339 }, ++ { 88, 13605 }, { 0x8014, 13853 }, { 0x8015, 14141 }, ++ { 89, 14513 }, { 90, 14691 }, { 91, 14746 }, ++ { 92, 14860 }, { 93, 15012 }, { 94, 15162 }, ++ { 95, 15225 }, { 96, 15355 }, { 97, 15544 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 6. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 98, 15644 }, { 99, 15727 }, ++ { 100, 15784 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 101, 15797 }, { 102, 15861 }, { 103, 15924 }, ++ { 104, 15978 }, { 105, 16041 }, { 106, 16098 }, ++ { 107, 16157 }, { 108, 16221 }, { 109, 16285 }, ++ { 110, 16349 }, { 111, 16413 }, { 112, 16501 }, ++ { 113, 16632 }, { 114, 16762 }, { 115, 16892 }, ++ { 116, 17022 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ { /* Third byte table 7. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 0x8016, 17097 }, { 0x8017, 17360 }, ++ { 0x8018, 17623 }, { 0x8019, 17888 }, { 0x801A, 18153 }, ++ { 0x801B, 18425 }, { 0x801C, 18702 }, { 0x801D, 18971 }, ++ { 117, 19243 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, ++ }, ++ }, ++}; ++ ++static const uchar_t u8_decomp_b4_tbl[2][118][257] = { ++ { ++ { /* Fourth byte table 0. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 1, 1, 1, 1, 1, 1, ++ 1, 4, 4, 5, 5, 5, 5, 5, ++ 8, 8, 8, 9, 10, 13, 15, 15, ++ 15, 18, 19, 20, 20, 25, 30, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, ++ }, ++ { /* Fourth byte table 1. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 24, ++ 28, 32, 36, 40, 44, 48, 52, 56, ++ 60, 60, 64, 68, 72, 76, 80, 84, ++ 84, 84, 88, 92, 96, 100, 104, 104, ++ 104, 108, 112, 116, 120, 124, 128, 128, ++ 132, 136, 140, 144, 148, 152, 156, 160, ++ 164, 164, 168, 172, 176, 180, 184, 188, ++ 188, 188, 192, 196, 200, 204, 208, 208, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, ++ }, ++ { /* Fourth byte table 2. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 64, 64, 68, 72, 76, 80, 84, ++ 88, 92, 96, 100, 104, 108, 112, 116, ++ 120, 124, 128, 132, 136, 140, 144, 144, ++ 144, 148, 152, 156, 160, 164, 168, 172, ++ 176, 180, 180, 182, 184, 188, 192, 196, ++ 200, 200, 204, 208, 212, 216, 220, 224, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, ++ }, ++ { /* Fourth byte table 3. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 3, 7, 11, 15, 19, ++ 23, 27, 30, 30, 30, 34, 38, 42, ++ 46, 50, 54, 54, 54, 58, 62, 66, ++ 70, 74, 78, 82, 86, 90, 94, 98, ++ 102, 106, 110, 114, 118, 122, 126, 126, ++ 126, 130, 134, 138, 142, 146, 150, 154, ++ 158, 162, 166, 170, 174, 178, 182, 186, ++ 190, 194, 198, 202, 206, 210, 214, 218, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, ++ }, ++ { /* Fourth byte table 4. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 12, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, ++ }, ++ { /* Fourth byte table 5. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 4, 8, 12, ++ 14, 16, 18, 20, 22, 24, 28, 32, ++ 36, 40, 44, 48, 52, 56, 62, 68, ++ 74, 80, 86, 92, 98, 104, 104, 110, ++ 116, 122, 128, 133, 138, 138, 138, 142, ++ 146, 150, 154, 158, 162, 168, 174, 179, ++ 184, 188, 190, 192, 194, 198, 202, 202, ++ 202, 206, 210, 216, 222, 227, 232, 237, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, ++ }, ++ { /* Fourth byte table 6. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 112, 112, 116, ++ 120, 120, 120, 120, 120, 120, 120, 124, ++ 128, 132, 136, 142, 148, 154, 160, 164, ++ 168, 174, 180, 184, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, ++ }, ++ { /* Fourth byte table 7. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 3, 4, 5, 7, 9, 11, ++ 12, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, ++ }, ++ { /* Fourth byte table 8. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 18, ++ 18, 20, 21, 22, 23, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, ++ }, ++ { /* Fourth byte table 9. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 6, 9, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 17, 17, 17, ++ 17, 17, 17, 20, 20, 20, 20, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, ++ }, ++ { /* Fourth byte table 10. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 3, 14, 19, ++ 22, 27, 32, 37, 37, 42, 42, 47, ++ 52, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 64, 69, 74, 79, 84, ++ 89, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 11. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 5, 10, 15, 20, 25, ++ 25, 27, 29, 31, 41, 51, 53, 55, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, 57, 59, 61, 61, 63, 65, 65, ++ 65, 65, 65, 65, 65, 65, 65, 65, ++ 65, 65, 65, 65, 65, 65, 65, 65, ++ 65, 65, 65, 65, 65, 65, 65, 65, ++ 65, 65, 65, 65, 65, 65, 65, 65, ++ 65, 65, 65, 65, 65, 65, 65, 65, ++ 65, 65, 65, 65, 65, 65, 65, 65, ++ 65, 65, 65, 65, 65, 65, 65, 65, ++ 65, 65, 65, 65, 65, 65, 65, 65, ++ 65, 65, 65, 65, 65, 65, 65, 65, ++ 65, ++ }, ++ { /* Fourth byte table 12. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 10, 15, 15, 15, 15, ++ 20, 20, 20, 20, 20, 25, 30, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 40, 40, 40, 40, 40, 40, ++ 40, 40, 40, 40, 40, 40, 40, 40, ++ 40, 40, 40, 40, 40, 40, 40, 40, ++ 40, 40, 40, 40, 40, 40, 40, 40, ++ 40, 40, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, ++ }, ++ { /* Fourth byte table 13. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 10, 15, 15, 15, 15, ++ 20, 20, 20, 20, 20, 25, 30, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 40, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, ++ }, ++ { /* Fourth byte table 14. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 5, 10, 10, 10, 10, 10, ++ 10, 10, 10, 10, 10, 10, 10, 10, ++ 10, 15, 20, 25, 30, 30, 30, 35, ++ 40, 40, 40, 45, 50, 55, 60, 65, ++ 70, 70, 70, 75, 80, 85, 90, 95, ++ 100, 100, 100, 105, 110, 115, 120, 125, ++ 130, 135, 140, 145, 150, 155, 160, 160, ++ 160, 165, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, ++ }, ++ { /* Fourth byte table 15. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, ++ }, ++ { /* Fourth byte table 16. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 5, 10, 15, 20, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, ++ }, ++ { /* Fourth byte table 17. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 4, 8, ++ 12, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, ++ }, ++ { /* Fourth byte table 18. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 5, 10, 10, 10, 10, 10, ++ 10, 10, 10, 10, 10, 10, 10, 10, ++ 10, 10, 10, 10, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, ++ }, ++ { /* Fourth byte table 19. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 7, 7, 7, 7, 7, 7, ++ 7, 7, 14, 14, 14, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 20. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 14, 21, 28, 35, 42, 49, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, ++ }, ++ { /* Fourth byte table 21. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 21, 28, 28, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, ++ }, ++ { /* Fourth byte table 22. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 7, 7, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, ++ }, ++ { /* Fourth byte table 23. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 7, 14, 21, 21, 21, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, ++ }, ++ { /* Fourth byte table 24. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 7, 7, 14, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 28, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, ++ }, ++ { /* Fourth byte table 25. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, ++ }, ++ { /* Fourth byte table 26. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 7, 14, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 27. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, ++ }, ++ { /* Fourth byte table 28. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 7, 7, 7, 7, 7, 7, ++ 14, 21, 21, 28, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, ++ }, ++ { /* Fourth byte table 29. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 7, 14, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 30. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 7, 7, 14, 24, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, ++ }, ++ { /* Fourth byte table 31. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, ++ }, ++ { /* Fourth byte table 32. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, ++ }, ++ { /* Fourth byte table 33. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 6, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, ++ }, ++ { /* Fourth byte table 34. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, ++ }, ++ { /* Fourth byte table 35. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 14, 14, ++ 14, 14, 14, 21, 21, 21, 21, 21, ++ 28, 28, 28, 28, 28, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 42, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 49, 49, 56, 63, ++ 72, 79, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, ++ }, ++ { /* Fourth byte table 36. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 21, 21, ++ 21, 21, 21, 28, 28, 28, 28, 28, ++ 35, 35, 35, 35, 35, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 42, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, ++ }, ++ { /* Fourth byte table 37. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, ++ }, ++ { /* Fourth byte table 38. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 13, 14, 15, 16, 17, ++ 18, 19, 20, 21, 21, 21, 21, 21, ++ 21, 21, 24, 24, 24, 24, 24, 24, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 28, 30, 33, ++ 33, 33, 33, 33, 33, 33, 33, 33, ++ 34, 34, 34, 34, 40, 49, 49, 55, ++ 64, 64, 64, 64, 64, 66, 66, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, ++ }, ++ { /* Fourth byte table 39. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 2, 4, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 20, 21, 21, 21, 22, 23, 24, ++ 25, 26, 27, 28, 31, 32, 33, 34, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, ++ }, ++ { /* Fourth byte table 40. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 14, 15, 16, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 17, 17, 17, 17, 17, 17, 17, ++ 17, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, ++ }, ++ { /* Fourth byte table 41. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 7, 10, 10, 13, 16, ++ 18, 18, 21, 22, 23, 24, 25, 26, ++ 28, 29, 30, 31, 32, 32, 33, 35, ++ 35, 35, 36, 37, 38, 39, 40, 40, ++ 40, 42, 45, 47, 47, 48, 48, 51, ++ 51, 52, 52, 54, 58, 59, 60, 60, ++ 61, 62, 63, 63, 64, 65, 67, 69, ++ 71, 73, 74, 74, 74, 74, 76, 78, ++ 80, 80, 80, 80, 80, 80, 80, 80, ++ 80, 80, 80, 80, 80, 80, 80, 80, ++ 80, 80, 80, 80, 80, 80, 80, 80, ++ 80, 80, 80, 80, 80, 80, 80, 80, ++ 80, 80, 80, 80, 80, 80, 80, 80, ++ 80, 80, 80, 80, 80, 80, 80, 80, ++ 80, 80, 80, 80, 80, 80, 80, 80, ++ 80, 80, 80, 80, 80, 80, 80, 80, ++ 80, ++ }, ++ { /* Fourth byte table 42. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 3, 3, 3, 4, 5, ++ 6, 7, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 13, 18, 23, 28, ++ 33, 38, 43, 48, 53, 58, 63, 68, ++ 72, 73, 75, 78, 80, 81, 83, 86, ++ 90, 92, 93, 95, 98, 99, 100, 101, ++ 102, 103, 105, 108, 110, 111, 113, 116, ++ 120, 122, 123, 125, 128, 129, 130, 131, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, ++ }, ++ { /* Fourth byte table 43. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 6, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, ++ }, ++ { /* Fourth byte table 44. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 6, 12, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, ++ }, ++ { /* Fourth byte table 45. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 6, 6, 6, ++ 6, 6, 12, 12, 12, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 24, 24, 30, ++ 30, 30, 30, 30, 30, 36, 45, 45, ++ 51, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, ++ }, ++ { /* Fourth byte table 46. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 6, 6, 6, 12, 12, 12, ++ 18, 18, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 28, 28, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 40, 44, ++ 48, 54, 60, 60, 60, 66, 72, 72, ++ 72, 78, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, ++ }, ++ { /* Fourth byte table 47. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 12, 12, 18, 24, 24, ++ 24, 30, 36, 36, 36, 36, 36, 36, ++ 36, 36, 36, 36, 36, 36, 36, 36, ++ 36, 36, 36, 36, 36, 36, 36, 36, ++ 36, 36, 36, 36, 36, 36, 36, 36, ++ 36, 36, 36, 36, 36, 42, 48, 54, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, ++ }, ++ { /* Fourth byte table 48. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 18, 24, 24, 24, 24, ++ 24, 24, 24, 30, 36, 42, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, ++ }, ++ { /* Fourth byte table 49. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 4, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, ++ }, ++ { /* Fourth byte table 50. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 11, 13, 15, 17, 19, 21, ++ 23, 25, 27, 29, 31, 34, 37, 40, ++ 43, 46, 49, 52, 55, 58, 62, 66, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, ++ }, ++ { /* Fourth byte table 51. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 48, 50, 53, 56, 59, 62, 65, 68, ++ 71, 74, 77, 80, 83, 86, 89, 92, ++ 95, 98, 101, 104, 107, 110, 113, 116, ++ 119, 122, 125, 128, 131, 134, 137, 140, ++ 143, 146, 149, 152, 155, 158, 161, 162, ++ 163, 164, 165, 166, 167, 168, 169, 170, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, ++ }, ++ { /* Fourth byte table 52. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 37, 38, 39, ++ 40, 41, 42, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, ++ }, ++ { /* Fourth byte table 53. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, ++ }, ++ { /* Fourth byte table 54. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 3, 5, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, ++ }, ++ { /* Fourth byte table 55. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, ++ }, ++ { /* Fourth byte table 56. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, ++ }, ++ { /* Fourth byte table 57. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, ++ }, ++ { /* Fourth byte table 58. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 117, ++ 120, 123, 126, 129, 132, 135, 138, 141, ++ 144, 147, 150, 153, 156, 159, 162, 165, ++ 168, 171, 174, 177, 180, 183, 186, 189, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, ++ }, ++ { /* Fourth byte table 59. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 117, ++ 120, 123, 126, 129, 132, 135, 138, 141, ++ 144, 147, 150, 153, 156, 159, 162, 165, ++ 168, 171, 174, 177, 180, 183, 186, 189, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, ++ }, ++ { /* Fourth byte table 60. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 117, ++ 120, 123, 126, 129, 132, 135, 138, 141, ++ 144, 147, 150, 153, 156, 159, 162, 165, ++ 168, 171, 174, 177, 180, 183, 186, 189, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, ++ }, ++ { /* Fourth byte table 61. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, ++ }, ++ { /* Fourth byte table 62. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 4, ++ 4, 7, 10, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, ++ }, ++ { /* Fourth byte table 63. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 7, 7, 14, ++ 14, 21, 21, 28, 28, 35, 35, 42, ++ 42, 49, 49, 56, 56, 63, 63, 70, ++ 70, 77, 77, 84, 84, 84, 91, 91, ++ 98, 98, 105, 105, 105, 105, 105, 105, ++ 105, 112, 119, 119, 126, 133, 133, 140, ++ 147, 147, 154, 161, 161, 168, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, ++ }, ++ { /* Fourth byte table 64. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 7, 7, 7, ++ 7, 7, 7, 7, 11, 15, 15, 22, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 35, 35, 42, ++ 42, 49, 49, 56, 56, 63, 63, 70, ++ 70, 77, 77, 84, 84, 91, 91, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, ++ }, ++ { /* Fourth byte table 65. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 7, 14, 14, 14, 21, 21, ++ 28, 28, 35, 35, 35, 35, 35, 35, ++ 35, 42, 49, 49, 56, 63, 63, 70, ++ 77, 77, 84, 91, 91, 98, 105, 105, ++ 105, 105, 105, 105, 105, 105, 105, 105, ++ 105, 105, 105, 105, 105, 105, 105, 105, ++ 105, 105, 105, 105, 105, 112, 112, 112, ++ 119, 126, 133, 140, 140, 140, 140, 147, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, ++ }, ++ { /* Fourth byte table 66. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 6, 9, 12, 15, 18, ++ 21, 24, 27, 30, 33, 36, 39, 42, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, ++ }, ++ { /* Fourth byte table 67. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 117, ++ 120, 123, 126, 129, 132, 135, 138, 141, ++ 144, 147, 150, 153, 156, 159, 162, 165, ++ 168, 171, 174, 177, 180, 183, 186, 189, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, ++ }, ++ { /* Fourth byte table 68. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 45, 45, 45, 48, 51, 54, 57, 60, ++ 63, 66, 69, 72, 75, 78, 81, 84, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, ++ }, ++ { /* Fourth byte table 69. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 15, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 22, 24, 26, 28, 30, 32, ++ 34, 36, 38, 40, 42, 44, 46, 48, ++ 50, 53, 56, 59, 62, 65, 68, 71, ++ 74, 77, 80, 83, 86, 89, 92, 98, ++ 104, 110, 116, 122, 128, 134, 140, 146, ++ 152, 158, 164, 170, 176, 176, 176, 176, ++ 176, 176, 176, 176, 176, 176, 176, 176, ++ 176, 176, 176, 176, 176, 176, 176, 176, ++ 176, 176, 176, 176, 176, 176, 176, 176, ++ 176, 176, 176, 176, 176, 176, 176, 176, ++ 176, 176, 176, 176, 176, 176, 176, 176, ++ 176, 176, 176, 176, 176, 176, 176, 176, ++ 176, 176, 176, 176, 176, 176, 176, 176, ++ 176, 176, 176, 176, 176, 176, 176, 176, ++ 176, ++ }, ++ { /* Fourth byte table 70. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 117, ++ 120, 123, 126, 129, 132, 135, 138, 141, ++ 144, 147, 149, 151, 153, 155, 157, 159, ++ 161, 163, 165, 167, 169, 171, 173, 175, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, ++ }, ++ { /* Fourth byte table 71. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 41, 46, 51, 51, 51, 51, ++ 51, 54, 57, 60, 63, 66, 69, 72, ++ 75, 78, 81, 84, 87, 90, 93, 96, ++ 99, 102, 105, 108, 111, 114, 117, 120, ++ 123, 126, 129, 132, 135, 138, 141, 144, ++ 147, 150, 153, 156, 159, 162, 165, 168, ++ 171, 174, 177, 180, 183, 186, 189, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, ++ }, ++ { /* Fourth byte table 72. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 7, 9, 11, 13, 15, ++ 17, 20, 24, 26, 28, 31, 34, 36, ++ 38, 40, 43, 46, 49, 52, 55, 57, ++ 59, 61, 63, 65, 68, 70, 72, 74, ++ 77, 80, 82, 85, 88, 91, 93, 96, ++ 101, 107, 109, 112, 115, 118, 121, 128, ++ 136, 138, 140, 143, 145, 147, 149, 152, ++ 154, 156, 158, 160, 162, 165, 167, 169, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, ++ }, ++ { /* Fourth byte table 73. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 10, 12, 14, 16, 22, ++ 25, 27, 29, 31, 33, 35, 37, 39, ++ 41, 43, 45, 48, 50, 52, 55, 58, ++ 60, 64, 67, 69, 71, 73, 75, 75, ++ 75, 79, 83, 87, 91, 95, 99, 103, ++ 107, 111, 116, 121, 126, 131, 136, 141, ++ 146, 151, 156, 161, 166, 171, 176, 181, ++ 186, 191, 196, 201, 206, 211, 216, 221, ++ 221, 221, 221, 221, 221, 221, 221, 221, ++ 221, 221, 221, 221, 221, 221, 221, 221, ++ 221, 221, 221, 221, 221, 221, 221, 221, ++ 221, 221, 221, 221, 221, 221, 221, 221, ++ 221, 221, 221, 221, 221, 221, 221, 221, ++ 221, 221, 221, 221, 221, 221, 221, 221, ++ 221, 221, 221, 221, 221, 221, 221, 221, ++ 221, 221, 221, 221, 221, 221, 221, 221, ++ 221, ++ }, ++ { /* Fourth byte table 74. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 56, ++ 56, 60, 60, 64, 64, 64, 68, 72, ++ 76, 80, 84, 88, 92, 96, 100, 104, ++ 104, 108, 108, 112, 112, 112, 116, 120, ++ 120, 120, 120, 124, 128, 132, 136, 136, ++ 136, 140, 144, 148, 152, 156, 160, 164, ++ 168, 172, 176, 180, 184, 188, 192, 196, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, ++ }, ++ { /* Fourth byte table 75. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 164, 168, 172, 172, 172, 172, 172, ++ 172, 172, 172, 172, 172, 172, 172, 172, ++ 172, 172, 172, 172, 172, 172, 172, 172, ++ 172, 172, 172, 172, 172, 172, 172, 172, ++ 172, 172, 172, 172, 172, 172, 172, 172, ++ 172, 172, 172, 172, 172, 172, 172, 172, ++ 172, 172, 172, 172, 172, 172, 172, 172, ++ 172, 172, 172, 172, 172, 172, 172, 172, ++ 172, 172, 172, 172, 172, 172, 172, 172, ++ 172, 172, 172, 172, 172, 172, 172, 172, ++ 172, 172, 172, 172, 172, 172, 172, 172, ++ 172, ++ }, ++ { /* Fourth byte table 76. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 9, 12, 14, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 20, 24, 28, 32, ++ 36, 36, 36, 36, 36, 36, 41, 41, ++ 46, 48, 50, 52, 54, 56, 58, 60, ++ 62, 64, 65, 70, 75, 82, 89, 94, ++ 99, 104, 109, 114, 119, 124, 129, 134, ++ 134, 139, 144, 149, 154, 159, 159, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, ++ }, ++ { /* Fourth byte table 77. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 10, 15, 20, 20, 25, ++ 30, 35, 40, 45, 50, 55, 60, 65, ++ 69, 71, 73, 75, 77, 79, 81, 83, ++ 85, 87, 89, 91, 93, 95, 97, 99, ++ 101, 103, 105, 107, 109, 111, 113, 115, ++ 117, 119, 121, 123, 125, 127, 129, 131, ++ 133, 135, 137, 139, 141, 143, 145, 147, ++ 149, 151, 153, 155, 157, 159, 161, 163, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, ++ }, ++ { /* Fourth byte table 78. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 48, 50, 52, 54, 56, 58, 60, 62, ++ 64, 66, 68, 70, 72, 76, 80, 82, ++ 84, 86, 88, 90, 92, 94, 96, 98, ++ 100, 104, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, ++ }, ++ { /* Fourth byte table 79. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 2, 4, 6, 8, ++ 10, 12, 14, 16, 18, 20, 24, 26, ++ 28, 30, 32, 34, 36, 38, 40, 42, ++ 44, 46, 48, 54, 60, 66, 72, 78, ++ 84, 90, 96, 102, 108, 114, 120, 126, ++ 132, 138, 144, 150, 156, 158, 160, 162, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, ++ }, ++ { /* Fourth byte table 80. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 164, 168, 172, 176, 180, 184, 188, ++ 192, 196, 200, 204, 208, 212, 216, 220, ++ 224, 228, 232, 236, 240, 244, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, ++ }, ++ { /* Fourth byte table 81. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 18, 24, 30, 36, 42, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 54, 60, 68, 76, 84, 92, 100, ++ 108, 116, 122, 155, 170, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, ++ }, ++ { /* Fourth byte table 82. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 5, 8, 9, 10, 11, 12, ++ 13, 14, 17, 20, 23, 26, 29, 32, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, ++ }, ++ { /* Fourth byte table 83. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 15, 15, ++ 15, 15, 18, 21, 24, 27, 28, 29, ++ 30, 31, 34, 35, 35, 36, 37, 38, ++ 39, 42, 43, 44, 45, 46, 49, 52, ++ 53, 54, 55, 56, 57, 58, 59, 60, ++ 60, 61, 62, 63, 64, 64, 64, 64, ++ 64, 67, 71, 74, 74, 77, 77, 80, ++ 84, 87, 91, 94, 98, 101, 105, 108, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, ++ }, ++ { /* Fourth byte table 84. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 6, 10, 14, 18, 22, 26, ++ 30, 34, 38, 42, 46, 50, 52, 54, ++ 56, 58, 60, 62, 64, 66, 68, 70, ++ 72, 74, 76, 78, 80, 82, 84, 86, ++ 88, 90, 92, 94, 96, 98, 100, 102, ++ 104, 106, 108, 110, 112, 114, 116, 118, ++ 120, 122, 124, 126, 128, 130, 132, 134, ++ 136, 138, 140, 142, 144, 146, 148, 150, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, ++ }, ++ { /* Fourth byte table 85. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 48, 50, 52, 54, 56, 58, 60, 62, ++ 64, 66, 68, 70, 72, 74, 76, 78, ++ 80, 82, 84, 86, 88, 90, 92, 94, ++ 96, 98, 100, 102, 104, 106, 112, 118, ++ 124, 130, 136, 142, 146, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, ++ }, ++ { /* Fourth byte table 86. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 1, 2, 3, 4, 5, 6, ++ 7, 8, 9, 10, 11, 12, 13, 14, ++ 15, 16, 17, 18, 19, 20, 21, 22, ++ 23, 24, 25, 26, 27, 28, 29, 30, ++ 31, 32, 33, 34, 35, 36, 37, 38, ++ 39, 40, 41, 42, 43, 44, 45, 46, ++ 47, 48, 49, 50, 51, 52, 53, 54, ++ 55, 56, 57, 58, 59, 60, 61, 62, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, ++ }, ++ { /* Fourth byte table 87. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 34, 37, 40, 43, 46, 49, 52, 55, ++ 58, 61, 64, 67, 70, 73, 76, 79, ++ 82, 85, 88, 91, 94, 97, 100, 103, ++ 106, 109, 112, 115, 118, 121, 124, 127, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, ++ }, ++ { /* Fourth byte table 88. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 117, ++ 120, 123, 126, 129, 132, 135, 138, 141, ++ 144, 147, 150, 153, 156, 159, 162, 165, ++ 168, 171, 174, 177, 180, 183, 186, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, ++ }, ++ { /* Fourth byte table 89. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 3, 6, 9, 12, 15, ++ 18, 18, 18, 21, 24, 27, 30, 33, ++ 36, 36, 36, 39, 42, 45, 48, 51, ++ 54, 54, 54, 57, 60, 63, 63, 63, ++ 63, 65, 67, 69, 72, 74, 76, 79, ++ 79, 82, 85, 88, 91, 94, 97, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, ++ }, ++ { /* Fourth byte table 90. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 9, ++ 18, 31, 44, 57, 70, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, ++ }, ++ { /* Fourth byte table 91. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 9, 18, 31, 44, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, ++ }, ++ { /* Fourth byte table 92. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, ++ }, ++ { /* Fourth byte table 93. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 37, 38, 39, ++ 40, 41, 42, 43, 44, 45, 46, 47, ++ 48, 49, 50, 51, 52, 53, 54, 55, ++ 56, 57, 58, 59, 60, 61, 62, 63, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 94. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 21, 22, ++ 23, 24, 25, 26, 27, 28, 29, 30, ++ 31, 32, 33, 34, 35, 36, 37, 38, ++ 39, 40, 41, 42, 43, 44, 45, 46, ++ 47, 48, 49, 50, 51, 52, 53, 54, ++ 55, 56, 57, 58, 59, 60, 61, 62, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, ++ }, ++ { /* Fourth byte table 95. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 29, 30, ++ 31, 31, 31, 32, 32, 32, 33, 34, ++ 34, 34, 35, 36, 37, 38, 38, 39, ++ 40, 41, 42, 43, 44, 45, 46, 47, ++ 48, 49, 50, 50, 51, 51, 52, 53, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, ++ }, ++ { /* Fourth byte table 96. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 1, 2, 3, 3, 4, 5, ++ 6, 7, 8, 9, 10, 11, 12, 13, ++ 14, 15, 16, 17, 18, 19, 20, 21, ++ 22, 23, 24, 25, 26, 27, 28, 29, ++ 30, 31, 32, 33, 34, 35, 36, 37, ++ 38, 39, 40, 41, 42, 43, 44, 45, ++ 46, 47, 48, 49, 50, 51, 52, 53, ++ 54, 55, 56, 57, 58, 59, 60, 61, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, ++ }, ++ { /* Fourth byte table 97. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 6, ++ 7, 8, 9, 10, 10, 10, 11, 12, ++ 13, 14, 15, 16, 17, 18, 18, 19, ++ 20, 21, 22, 23, 24, 25, 25, 26, ++ 27, 28, 29, 30, 31, 32, 33, 34, ++ 35, 36, 37, 38, 39, 40, 41, 42, ++ 43, 44, 45, 46, 47, 48, 49, 50, ++ 51, 52, 53, 53, 54, 55, 56, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, ++ }, ++ { /* Fourth byte table 98. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 5, 6, ++ 6, 6, 6, 7, 8, 9, 10, 11, ++ 12, 13, 13, 14, 15, 16, 17, 18, ++ 19, 20, 21, 22, 23, 24, 25, 26, ++ 27, 28, 29, 30, 31, 32, 33, 34, ++ 35, 36, 37, 38, 39, 40, 41, 42, ++ 43, 44, 45, 46, 47, 48, 49, 50, ++ 51, 52, 53, 54, 55, 56, 57, 58, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, ++ }, ++ { /* Fourth byte table 99. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 37, 38, 39, ++ 40, 41, 42, 43, 44, 45, 46, 47, ++ 48, 49, 50, 51, 52, 53, 54, 55, ++ 56, 57, 58, 59, 60, 61, 62, 63, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 100. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 37, 38, 39, ++ 40, 41, 42, 43, 44, 45, 46, 47, ++ 48, 49, 50, 51, 52, 53, 54, 55, ++ 56, 57, 58, 59, 60, 61, 62, 63, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 101. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 37, 38, 39, ++ 40, 41, 42, 43, 44, 45, 46, 47, ++ 48, 49, 50, 51, 52, 53, 54, 55, ++ 56, 57, 58, 59, 60, 61, 62, 63, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 102. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 37, 38, 39, ++ 40, 41, 42, 43, 44, 45, 46, 47, ++ 48, 49, 50, 51, 52, 53, 54, 55, ++ 56, 57, 58, 59, 60, 61, 62, 63, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 103. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 36, 36, 36, ++ 36, 38, 40, 42, 44, 46, 48, 50, ++ 52, 54, 56, 58, 60, 62, 64, 66, ++ 68, 70, 72, 74, 76, 78, 80, 82, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, ++ }, ++ { /* Fourth byte table 104. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 5, 7, 9, 11, 13, 15, ++ 17, 19, 21, 23, 25, 27, 29, 31, ++ 33, 35, 37, 39, 41, 43, 45, 47, ++ 49, 51, 53, 55, 58, 60, 62, 64, ++ 66, 68, 70, 72, 74, 76, 78, 80, ++ 82, 84, 86, 88, 90, 92, 94, 96, ++ 98, 100, 102, 104, 106, 108, 110, 112, ++ 114, 116, 118, 120, 123, 125, 127, 129, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, ++ }, ++ { /* Fourth byte table 105. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 45, 47, ++ 49, 51, 53, 55, 57, 59, 61, 63, ++ 65, 67, 69, 71, 73, 75, 77, 79, ++ 81, 83, 85, 87, 89, 91, 93, 95, ++ 97, 99, 101, 103, 105, 107, 110, 112, ++ 114, 116, 118, 120, 122, 124, 126, 128, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, ++ }, ++ { /* Fourth byte table 106. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 33, 35, 37, 39, 41, 43, 45, 47, ++ 49, 51, 53, 55, 57, 59, 61, 63, ++ 65, 67, 69, 71, 73, 75, 77, 79, ++ 81, 83, 85, 87, 89, 91, 93, 95, ++ 98, 100, 102, 104, 106, 108, 110, 112, ++ 114, 116, 118, 120, 122, 124, 126, 128, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, ++ }, ++ { /* Fourth byte table 107. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 21, 23, 25, 27, 29, 31, ++ 33, 35, 37, 39, 41, 43, 45, 47, ++ 49, 51, 53, 55, 57, 59, 61, 63, ++ 65, 67, 69, 71, 73, 75, 77, 79, ++ 81, 83, 86, 88, 90, 92, 94, 96, ++ 98, 100, 102, 104, 106, 108, 110, 112, ++ 114, 116, 118, 120, 122, 124, 126, 128, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, ++ }, ++ { /* Fourth byte table 108. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 9, 11, 13, 15, ++ 17, 19, 21, 21, 21, 21, 21, 22, ++ 23, 24, 25, 26, 27, 28, 29, 30, ++ 31, 32, 33, 34, 35, 36, 37, 38, ++ 39, 40, 41, 42, 43, 44, 45, 46, ++ 47, 48, 49, 50, 51, 52, 53, 54, ++ 55, 56, 57, 58, 59, 60, 61, 62, ++ 63, 64, 65, 66, 67, 68, 69, 70, ++ 71, 71, 71, 71, 71, 71, 71, 71, ++ 71, 71, 71, 71, 71, 71, 71, 71, ++ 71, 71, 71, 71, 71, 71, 71, 71, ++ 71, 71, 71, 71, 71, 71, 71, 71, ++ 71, 71, 71, 71, 71, 71, 71, 71, ++ 71, 71, 71, 71, 71, 71, 71, 71, ++ 71, 71, 71, 71, 71, 71, 71, 71, ++ 71, 71, 71, 71, 71, 71, 71, 71, ++ 71, ++ }, ++ { /* Fourth byte table 109. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 9, 13, 17, 21, 25, 29, ++ 33, 37, 42, 46, 50, 54, 58, 62, ++ 66, 71, 75, 80, 85, 90, 94, 98, ++ 102, 106, 110, 114, 118, 122, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, ++ }, ++ { /* Fourth byte table 110. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 111. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 112. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 113. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 114. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 115. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 116. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 117. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ }, ++ { ++ { /* Fourth byte table 0. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 1, 1, 1, 1, 1, 1, ++ 1, 4, 4, 5, 5, 5, 5, 5, ++ 8, 8, 8, 9, 10, 13, 15, 15, ++ 15, 18, 19, 20, 20, 25, 30, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, ++ }, ++ { /* Fourth byte table 1. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 24, ++ 28, 32, 36, 40, 44, 48, 52, 56, ++ 60, 60, 64, 68, 72, 76, 80, 84, ++ 84, 84, 88, 92, 96, 100, 104, 104, ++ 104, 108, 112, 116, 120, 124, 128, 128, ++ 132, 136, 140, 144, 148, 152, 156, 160, ++ 164, 164, 168, 172, 176, 180, 184, 188, ++ 188, 188, 192, 196, 200, 204, 208, 208, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, ++ }, ++ { /* Fourth byte table 2. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 64, 64, 68, 72, 76, 80, 84, ++ 88, 92, 96, 100, 104, 108, 112, 116, ++ 120, 124, 128, 132, 136, 140, 144, 144, ++ 144, 148, 152, 156, 160, 164, 168, 172, ++ 176, 180, 180, 182, 184, 188, 192, 196, ++ 200, 200, 204, 208, 212, 216, 220, 224, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, 227, 227, 227, 227, 227, 227, 227, ++ 227, ++ }, ++ { /* Fourth byte table 3. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 3, 7, 11, 15, 19, ++ 23, 27, 30, 30, 30, 34, 38, 42, ++ 46, 50, 54, 54, 54, 58, 62, 66, ++ 70, 74, 78, 82, 86, 90, 94, 98, ++ 102, 106, 110, 114, 118, 122, 126, 126, ++ 126, 130, 134, 138, 142, 146, 150, 154, ++ 158, 162, 166, 170, 174, 178, 182, 186, ++ 190, 194, 198, 202, 206, 210, 214, 218, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, 219, 219, 219, 219, 219, 219, 219, ++ 219, ++ }, ++ { /* Fourth byte table 4. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 12, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, ++ }, ++ { /* Fourth byte table 5. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 4, 8, 12, ++ 14, 16, 18, 20, 22, 24, 28, 32, ++ 36, 40, 44, 48, 52, 56, 62, 68, ++ 74, 80, 86, 92, 98, 104, 104, 110, ++ 116, 122, 128, 133, 138, 138, 138, 142, ++ 146, 150, 154, 158, 162, 168, 174, 179, ++ 184, 188, 190, 192, 194, 198, 202, 202, ++ 202, 206, 210, 216, 222, 227, 232, 237, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, 242, 242, 242, 242, 242, 242, 242, ++ 242, ++ }, ++ { /* Fourth byte table 6. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 112, 112, 116, ++ 120, 120, 120, 120, 120, 120, 120, 124, ++ 128, 132, 136, 142, 148, 154, 160, 164, ++ 168, 174, 180, 184, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, 188, 188, 188, 188, 188, 188, 188, ++ 188, ++ }, ++ { /* Fourth byte table 7. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 3, 4, 5, 7, 9, 11, ++ 12, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, ++ }, ++ { /* Fourth byte table 8. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 18, ++ 18, 20, 21, 22, 23, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, ++ }, ++ { /* Fourth byte table 9. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 6, 9, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 17, 17, 17, ++ 17, 17, 17, 20, 20, 20, 20, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, ++ }, ++ { /* Fourth byte table 10. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 3, 14, 19, ++ 22, 27, 32, 37, 37, 42, 42, 47, ++ 52, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 64, 69, 74, 79, 84, ++ 89, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 11. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 5, 10, 15, 20, 25, ++ 25, 27, 29, 31, 41, 51, 53, 55, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, 57, 59, 61, 61, 63, 65, 65, ++ 65, 65, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, 67, 67, 67, 67, 67, 67, 67, ++ 67, ++ }, ++ { /* Fourth byte table 12. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 10, 15, 15, 15, 15, ++ 20, 20, 20, 20, 20, 25, 30, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 40, 40, 40, 40, 40, 40, ++ 40, 40, 40, 40, 40, 40, 40, 40, ++ 40, 40, 40, 40, 40, 40, 40, 40, ++ 40, 40, 40, 40, 40, 40, 40, 40, ++ 40, 40, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, ++ }, ++ { /* Fourth byte table 13. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 10, 15, 15, 15, 15, ++ 20, 20, 20, 20, 20, 25, 30, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 40, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, ++ }, ++ { /* Fourth byte table 14. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 5, 10, 10, 10, 10, 10, ++ 10, 10, 10, 10, 10, 10, 10, 10, ++ 10, 15, 20, 25, 30, 30, 30, 35, ++ 40, 40, 40, 45, 50, 55, 60, 65, ++ 70, 70, 70, 75, 80, 85, 90, 95, ++ 100, 100, 100, 105, 110, 115, 120, 125, ++ 130, 135, 140, 145, 150, 155, 160, 160, ++ 160, 165, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, 170, 170, 170, 170, 170, 170, 170, ++ 170, ++ }, ++ { /* Fourth byte table 15. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, ++ }, ++ { /* Fourth byte table 16. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 5, 10, 15, 20, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, ++ }, ++ { /* Fourth byte table 17. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 4, 8, ++ 12, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, ++ }, ++ { /* Fourth byte table 18. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 5, 10, 10, 10, 10, 10, ++ 10, 10, 10, 10, 10, 10, 10, 10, ++ 10, 10, 10, 10, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, ++ }, ++ { /* Fourth byte table 19. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 7, 7, 7, 7, 7, 7, ++ 7, 7, 14, 14, 14, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 20. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 14, 21, 28, 35, 42, 49, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, ++ }, ++ { /* Fourth byte table 21. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 21, 28, 28, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, ++ }, ++ { /* Fourth byte table 22. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 7, 7, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, ++ }, ++ { /* Fourth byte table 23. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 7, 14, 21, 21, 21, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, ++ }, ++ { /* Fourth byte table 24. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 7, 7, 14, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 28, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, ++ }, ++ { /* Fourth byte table 25. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, ++ }, ++ { /* Fourth byte table 26. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 7, 14, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 27. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, ++ }, ++ { /* Fourth byte table 28. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 7, 7, 7, 7, 7, 7, ++ 14, 21, 21, 28, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, ++ }, ++ { /* Fourth byte table 29. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 7, 14, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 30. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 7, 7, 14, 24, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, 31, 31, 31, 31, 31, 31, 31, ++ 31, ++ }, ++ { /* Fourth byte table 31. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, ++ }, ++ { /* Fourth byte table 32. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, ++ }, ++ { /* Fourth byte table 33. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 6, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, ++ }, ++ { /* Fourth byte table 34. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, ++ }, ++ { /* Fourth byte table 35. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 14, 14, ++ 14, 14, 14, 21, 21, 21, 21, 21, ++ 28, 28, 28, 28, 28, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 42, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 49, 49, 56, 63, ++ 72, 79, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, ++ }, ++ { /* Fourth byte table 36. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 21, 21, ++ 21, 21, 21, 28, 28, 28, 28, 28, ++ 35, 35, 35, 35, 35, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 42, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, 49, 49, 49, 49, 49, 49, 49, ++ 49, ++ }, ++ { /* Fourth byte table 37. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, 7, 7, 7, 7, 7, 7, 7, ++ 7, ++ }, ++ { /* Fourth byte table 38. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, ++ }, ++ { /* Fourth byte table 39. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 7, ++ 7, 14, 14, 21, 21, 28, 28, 35, ++ 35, 35, 35, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 49, 49, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, ++ }, ++ { /* Fourth byte table 40. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 14, 14, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 41. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 1, 3, 4, ++ 4, 5, 6, 8, 9, 10, 11, 12, ++ 13, 14, 15, 16, 16, 17, 19, 20, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 42. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 6, 8, 11, ++ 12, 13, 14, 16, 18, 20, 21, 21, ++ 22, 23, 25, 26, 28, 31, 34, 35, ++ 36, 37, 40, 42, 43, 46, 48, 50, ++ 52, 54, 56, 57, 58, 59, 60, 62, ++ 64, 66, 68, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, ++ }, ++ { /* Fourth byte table 43. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 2, 3, 5, 7, ++ 9, 10, 12, 14, 16, 18, 20, 22, ++ 25, 27, 29, 32, 34, 36, 38, 40, ++ 42, 44, 46, 48, 50, 52, 54, 56, ++ 58, 61, 63, 65, 66, 68, 70, 72, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, ++ }, ++ { /* Fourth byte table 44. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 13, 14, 15, 16, 17, ++ 18, 19, 20, 21, 21, 21, 21, 21, ++ 21, 21, 24, 24, 24, 24, 24, 24, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 28, 30, 33, ++ 33, 33, 33, 33, 33, 33, 33, 33, ++ 34, 34, 34, 34, 40, 49, 49, 55, ++ 64, 64, 64, 64, 64, 66, 66, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, 69, 69, 69, 69, 69, 69, 69, ++ 69, ++ }, ++ { /* Fourth byte table 45. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 2, 4, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 19, 19, ++ 19, 20, 21, 21, 21, 22, 23, 24, ++ 25, 26, 27, 28, 31, 32, 33, 34, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, 35, 35, 35, 35, 35, 35, 35, ++ 35, ++ }, ++ { /* Fourth byte table 46. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 14, 15, 16, 17, ++ 17, 18, 19, 20, 21, 23, 23, 23, ++ 23, 23, 23, 23, 23, 23, 23, 23, ++ 23, 23, 23, 23, 23, 23, 23, 23, ++ 23, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, 25, 25, 25, 25, 25, 25, 25, ++ 25, ++ }, ++ { /* Fourth byte table 47. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 7, 10, 10, 13, 16, ++ 18, 18, 21, 22, 23, 24, 25, 26, ++ 28, 29, 30, 31, 32, 32, 33, 35, ++ 35, 35, 36, 37, 38, 39, 40, 40, ++ 40, 42, 45, 47, 47, 48, 48, 51, ++ 51, 52, 52, 54, 58, 59, 60, 60, ++ 61, 62, 63, 63, 64, 65, 67, 69, ++ 71, 73, 74, 74, 77, 79, 81, 83, ++ 85, 85, 85, 85, 85, 85, 85, 85, ++ 85, 85, 85, 85, 85, 85, 85, 85, ++ 85, 85, 85, 85, 85, 85, 85, 85, ++ 85, 85, 85, 85, 85, 85, 85, 85, ++ 85, 85, 85, 85, 85, 85, 85, 85, ++ 85, 85, 85, 85, 85, 85, 85, 85, ++ 85, 85, 85, 85, 85, 85, 85, 85, ++ 85, 85, 85, 85, 85, 85, 85, 85, ++ 85, ++ }, ++ { /* Fourth byte table 48. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 3, 3, 3, 4, 5, ++ 6, 7, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 13, 18, 23, 28, ++ 33, 38, 43, 48, 53, 58, 63, 68, ++ 72, 73, 75, 78, 80, 81, 83, 86, ++ 90, 92, 93, 95, 98, 99, 100, 101, ++ 102, 103, 105, 108, 110, 111, 113, 116, ++ 120, 122, 123, 125, 128, 129, 130, 131, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, 132, 132, 132, 132, 132, 132, 132, ++ 132, ++ }, ++ { /* Fourth byte table 49. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 6, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, ++ }, ++ { /* Fourth byte table 50. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 6, 12, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, ++ }, ++ { /* Fourth byte table 51. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 6, 6, 6, ++ 6, 6, 12, 12, 12, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 24, 24, 30, ++ 30, 30, 30, 30, 30, 36, 45, 45, ++ 51, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, ++ }, ++ { /* Fourth byte table 52. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 6, 6, 6, 12, 12, 12, ++ 18, 18, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 28, 28, 34, 34, 34, 34, 34, ++ 34, 34, 34, 34, 34, 34, 40, 44, ++ 48, 54, 60, 60, 60, 66, 72, 72, ++ 72, 78, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, ++ }, ++ { /* Fourth byte table 53. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 12, 12, 18, 24, 24, ++ 24, 30, 36, 36, 36, 36, 36, 36, ++ 36, 36, 36, 36, 36, 36, 36, 36, ++ 36, 36, 36, 36, 36, 36, 36, 36, ++ 36, 36, 36, 36, 36, 36, 36, 36, ++ 36, 36, 36, 36, 36, 42, 48, 54, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, ++ }, ++ { /* Fourth byte table 54. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 18, 24, 24, 24, 24, ++ 24, 24, 24, 30, 36, 42, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, ++ }, ++ { /* Fourth byte table 55. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 4, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, ++ }, ++ { /* Fourth byte table 56. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 11, 13, 15, 17, 19, 21, ++ 23, 25, 27, 29, 31, 34, 37, 40, ++ 43, 46, 49, 52, 55, 58, 62, 66, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, ++ }, ++ { /* Fourth byte table 57. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 48, 50, 53, 56, 59, 62, 65, 68, ++ 71, 74, 77, 80, 83, 86, 89, 92, ++ 95, 98, 101, 104, 107, 110, 113, 116, ++ 119, 122, 125, 128, 131, 134, 137, 140, ++ 143, 146, 149, 152, 155, 158, 161, 162, ++ 163, 164, 165, 166, 167, 168, 169, 170, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, ++ }, ++ { /* Fourth byte table 58. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 37, 38, 39, ++ 40, 41, 42, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, 43, 43, 43, 43, 43, 43, 43, ++ 43, ++ }, ++ { /* Fourth byte table 59. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, ++ }, ++ { /* Fourth byte table 60. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 3, 5, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, ++ }, ++ { /* Fourth byte table 61. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, 6, 6, 6, 6, 6, 6, 6, ++ 6, ++ }, ++ { /* Fourth byte table 62. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, ++ }, ++ { /* Fourth byte table 63. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, ++ }, ++ { /* Fourth byte table 64. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, ++ }, ++ { /* Fourth byte table 65. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 117, ++ 120, 123, 126, 129, 132, 135, 138, 141, ++ 144, 147, 150, 153, 156, 159, 162, 165, ++ 168, 171, 174, 177, 180, 183, 186, 189, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, ++ }, ++ { /* Fourth byte table 66. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 117, ++ 120, 123, 126, 129, 132, 135, 138, 141, ++ 144, 147, 150, 153, 156, 159, 162, 165, ++ 168, 171, 174, 177, 180, 183, 186, 189, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, ++ }, ++ { /* Fourth byte table 67. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 117, ++ 120, 123, 126, 129, 132, 135, 138, 141, ++ 144, 147, 150, 153, 156, 159, 162, 165, ++ 168, 171, 174, 177, 180, 183, 186, 189, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, ++ }, ++ { /* Fourth byte table 68. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, ++ }, ++ { /* Fourth byte table 69. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 4, ++ 4, 7, 10, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, ++ }, ++ { /* Fourth byte table 70. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 7, 7, 14, ++ 14, 21, 21, 28, 28, 35, 35, 42, ++ 42, 49, 49, 56, 56, 63, 63, 70, ++ 70, 77, 77, 84, 84, 84, 91, 91, ++ 98, 98, 105, 105, 105, 105, 105, 105, ++ 105, 112, 119, 119, 126, 133, 133, 140, ++ 147, 147, 154, 161, 161, 168, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, 175, 175, 175, 175, 175, 175, 175, ++ 175, ++ }, ++ { /* Fourth byte table 71. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 7, 7, 7, ++ 7, 7, 7, 7, 11, 15, 15, 22, ++ 28, 28, 28, 28, 28, 28, 28, 28, ++ 28, 28, 28, 28, 28, 35, 35, 42, ++ 42, 49, 49, 56, 56, 63, 63, 70, ++ 70, 77, 77, 84, 84, 91, 91, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, 98, 98, 98, 98, 98, 98, 98, ++ 98, ++ }, ++ { /* Fourth byte table 72. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 7, 14, 14, 14, 21, 21, ++ 28, 28, 35, 35, 35, 35, 35, 35, ++ 35, 42, 49, 49, 56, 63, 63, 70, ++ 77, 77, 84, 91, 91, 98, 105, 105, ++ 105, 105, 105, 105, 105, 105, 105, 105, ++ 105, 105, 105, 105, 105, 105, 105, 105, ++ 105, 105, 105, 105, 105, 112, 112, 112, ++ 119, 126, 133, 140, 140, 140, 140, 147, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, 153, 153, 153, 153, 153, 153, 153, ++ 153, ++ }, ++ { /* Fourth byte table 73. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 6, 9, 12, 15, 18, ++ 21, 24, 27, 30, 33, 36, 39, 42, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, 45, 45, 45, 45, 45, 45, 45, ++ 45, ++ }, ++ { /* Fourth byte table 74. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 117, ++ 120, 123, 126, 129, 132, 135, 138, 141, ++ 144, 147, 150, 153, 156, 159, 162, 165, ++ 168, 171, 174, 177, 180, 183, 186, 189, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, 192, 192, 192, 192, 192, 192, 192, ++ 192, ++ }, ++ { /* Fourth byte table 75. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 45, 45, 45, 48, 51, 54, 57, 60, ++ 63, 66, 69, 72, 75, 78, 81, 84, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, ++ }, ++ { /* Fourth byte table 76. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 15, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 23, 25, 27, 29, 31, 33, 35, ++ 37, 39, 41, 43, 45, 47, 49, 51, ++ 53, 56, 59, 62, 65, 68, 71, 74, ++ 77, 80, 83, 86, 89, 92, 95, 101, ++ 107, 113, 119, 125, 131, 137, 143, 149, ++ 155, 161, 167, 173, 179, 194, 206, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, 212, 212, 212, 212, 212, 212, 212, ++ 212, ++ }, ++ { /* Fourth byte table 77. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 117, ++ 120, 123, 126, 129, 132, 135, 138, 141, ++ 144, 147, 149, 151, 153, 155, 157, 159, ++ 161, 163, 165, 167, 169, 171, 173, 175, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, 177, 177, 177, 177, 177, 177, 177, ++ 177, ++ }, ++ { /* Fourth byte table 78. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 41, 46, 51, 53, 56, 58, ++ 61, 64, 67, 70, 73, 76, 79, 82, ++ 85, 88, 91, 94, 97, 100, 103, 106, ++ 109, 112, 115, 118, 121, 124, 127, 130, ++ 133, 136, 139, 142, 145, 148, 151, 154, ++ 157, 160, 163, 166, 169, 172, 175, 178, ++ 181, 184, 187, 190, 193, 196, 199, 202, ++ 202, 202, 202, 202, 202, 202, 202, 202, ++ 202, 202, 202, 202, 202, 202, 202, 202, ++ 202, 202, 202, 202, 202, 202, 202, 202, ++ 202, 202, 202, 202, 202, 202, 202, 202, ++ 202, 202, 202, 202, 202, 202, 202, 202, ++ 202, 202, 202, 202, 202, 202, 202, 202, ++ 202, 202, 202, 202, 202, 202, 202, 202, ++ 202, 202, 202, 202, 202, 202, 202, 202, ++ 202, ++ }, ++ { /* Fourth byte table 79. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 7, 9, 11, 13, 15, ++ 17, 20, 24, 26, 28, 31, 34, 36, ++ 38, 40, 43, 46, 49, 52, 55, 57, ++ 59, 61, 63, 65, 68, 70, 72, 74, ++ 77, 80, 82, 85, 88, 91, 93, 96, ++ 101, 107, 109, 112, 115, 118, 121, 128, ++ 136, 138, 140, 143, 145, 147, 149, 152, ++ 154, 156, 158, 160, 162, 165, 167, 169, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, 171, 171, 171, 171, 171, 171, 171, ++ 171, ++ }, ++ { /* Fourth byte table 80. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 10, 12, 14, 16, 22, ++ 25, 27, 29, 31, 33, 35, 37, 39, ++ 41, 43, 45, 48, 50, 52, 55, 58, ++ 60, 64, 67, 69, 71, 73, 75, 80, ++ 85, 89, 93, 97, 101, 105, 109, 113, ++ 117, 121, 126, 131, 136, 141, 146, 151, ++ 156, 161, 166, 171, 176, 181, 186, 191, ++ 196, 201, 206, 211, 216, 221, 226, 231, ++ 234, 234, 234, 234, 234, 234, 234, 234, ++ 234, 234, 234, 234, 234, 234, 234, 234, ++ 234, 234, 234, 234, 234, 234, 234, 234, ++ 234, 234, 234, 234, 234, 234, 234, 234, ++ 234, 234, 234, 234, 234, 234, 234, 234, ++ 234, 234, 234, 234, 234, 234, 234, 234, ++ 234, 234, 234, 234, 234, 234, 234, 234, ++ 234, 234, 234, 234, 234, 234, 234, 234, ++ 234, ++ }, ++ { /* Fourth byte table 81. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 56, ++ 56, 60, 60, 64, 64, 64, 68, 72, ++ 76, 80, 84, 88, 92, 96, 100, 104, ++ 104, 108, 108, 112, 112, 112, 116, 120, ++ 120, 120, 120, 124, 128, 132, 136, 136, ++ 136, 140, 144, 148, 152, 156, 160, 164, ++ 168, 172, 176, 180, 184, 188, 192, 196, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, 200, 200, 200, 200, 200, 200, 200, ++ 200, ++ }, ++ { /* Fourth byte table 82. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 164, 168, 172, 172, 172, 172, 172, ++ 172, 176, 180, 184, 188, 192, 196, 200, ++ 204, 208, 212, 216, 220, 224, 228, 232, ++ 236, 236, 236, 236, 236, 236, 236, 236, ++ 236, 236, 236, 236, 236, 236, 236, 236, ++ 236, 236, 236, 236, 236, 236, 236, 236, ++ 236, 236, 236, 236, 236, 236, 236, 236, ++ 236, 236, 236, 236, 236, 236, 236, 236, ++ 236, 236, 236, 236, 236, 236, 236, 236, ++ 236, 236, 236, 236, 236, 236, 236, 236, ++ 236, 236, 236, 236, 236, 236, 236, 236, ++ 236, ++ }, ++ { /* Fourth byte table 83. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 65, 70, 75, 79, 83, 87, 92, 97, ++ 102, 106, 110, 110, 110, 110, 110, 110, ++ 110, 110, 110, 110, 110, 110, 110, 110, ++ 110, 110, 110, 110, 110, 110, 110, 110, ++ 110, 110, 110, 110, 110, 110, 110, 110, ++ 110, 110, 110, 110, 110, 110, 110, 110, ++ 110, 110, 110, 110, 110, 110, 110, 110, ++ 110, 110, 110, 110, 110, 110, 110, 110, ++ 110, 110, 110, 110, 110, 110, 110, 110, ++ 110, 110, 110, 110, 110, 110, 110, 110, ++ 110, 110, 110, 110, 110, 110, 110, 110, ++ 110, 110, 110, 110, 110, 110, 110, 110, ++ 110, 110, 110, 110, 110, 110, 110, 110, ++ 110, 110, 110, 110, 110, 110, 110, 110, ++ 110, ++ }, ++ { /* Fourth byte table 84. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 9, 12, 14, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 20, 24, 28, 32, ++ 36, 36, 36, 36, 36, 36, 41, 41, ++ 46, 48, 50, 52, 54, 56, 58, 60, ++ 62, 64, 65, 70, 75, 82, 89, 94, ++ 99, 104, 109, 114, 119, 124, 129, 134, ++ 134, 139, 144, 149, 154, 159, 159, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, ++ }, ++ { /* Fourth byte table 85. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 10, 15, 20, 20, 25, ++ 30, 35, 40, 45, 50, 55, 60, 65, ++ 69, 71, 73, 75, 77, 79, 81, 83, ++ 85, 87, 89, 91, 93, 95, 97, 99, ++ 101, 103, 105, 107, 109, 111, 113, 115, ++ 117, 119, 121, 123, 125, 127, 129, 131, ++ 133, 135, 137, 139, 141, 143, 145, 147, ++ 149, 151, 153, 155, 157, 159, 161, 163, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, 165, 165, 165, 165, 165, 165, 165, ++ 165, ++ }, ++ { /* Fourth byte table 86. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 48, 50, 52, 54, 56, 58, 60, 62, ++ 64, 66, 68, 70, 72, 76, 80, 82, ++ 84, 86, 88, 90, 92, 94, 96, 98, ++ 100, 104, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, 108, 108, 108, 108, 108, 108, 108, ++ 108, ++ }, ++ { /* Fourth byte table 87. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 2, 4, 6, 8, ++ 10, 12, 14, 16, 18, 20, 24, 26, ++ 28, 30, 32, 34, 36, 38, 40, 42, ++ 44, 46, 48, 54, 60, 66, 72, 78, ++ 84, 90, 96, 102, 108, 114, 120, 126, ++ 132, 138, 144, 150, 156, 158, 160, 162, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, 164, 164, 164, 164, 164, 164, 164, ++ 164, ++ }, ++ { /* Fourth byte table 88. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 164, 168, 172, 176, 180, 184, 188, ++ 192, 196, 200, 204, 208, 212, 216, 220, ++ 224, 228, 232, 236, 240, 244, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, 248, 248, 248, 248, 248, 248, 248, ++ 248, ++ }, ++ { /* Fourth byte table 89. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 18, 24, 30, 36, 42, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 54, 60, 68, 76, 84, 92, 100, ++ 108, 116, 122, 155, 170, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, 178, 178, 178, 178, 178, 178, 178, ++ 178, ++ }, ++ { /* Fourth byte table 90. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 4, 7, 8, 9, 10, 11, ++ 14, 17, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 22, 25, 28, 29, 30, 31, 32, ++ 33, 34, 37, 40, 43, 46, 49, 52, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, 55, 55, 55, 55, 55, 55, 55, ++ 55, ++ }, ++ { /* Fourth byte table 91. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 15, 15, ++ 16, 17, 20, 23, 26, 29, 30, 31, ++ 32, 33, 36, 37, 37, 38, 39, 40, ++ 41, 44, 45, 46, 47, 48, 51, 54, ++ 55, 56, 57, 58, 59, 60, 61, 62, ++ 62, 63, 64, 65, 66, 66, 66, 66, ++ 66, 69, 73, 76, 76, 79, 79, 82, ++ 86, 89, 93, 96, 100, 103, 107, 110, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, ++ }, ++ { /* Fourth byte table 92. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 6, 10, 14, 18, 22, 26, ++ 30, 34, 38, 42, 46, 50, 52, 54, ++ 56, 58, 60, 62, 64, 66, 68, 70, ++ 72, 74, 76, 78, 80, 82, 84, 86, ++ 88, 90, 92, 94, 96, 98, 100, 102, ++ 104, 106, 108, 110, 112, 114, 116, 118, ++ 120, 122, 124, 126, 128, 130, 132, 134, ++ 136, 138, 140, 142, 144, 146, 148, 150, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, ++ }, ++ { /* Fourth byte table 93. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 48, 50, 52, 54, 56, 58, 60, 62, ++ 64, 66, 68, 70, 72, 74, 76, 78, ++ 80, 82, 84, 86, 88, 90, 92, 94, ++ 96, 98, 100, 102, 104, 106, 112, 118, ++ 124, 130, 136, 142, 146, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, 150, 150, 150, 150, 150, 150, 150, ++ 150, ++ }, ++ { /* Fourth byte table 94. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 1, 2, 3, 4, 5, 6, ++ 7, 8, 9, 10, 11, 12, 13, 14, ++ 15, 16, 17, 18, 19, 20, 21, 22, ++ 23, 24, 25, 26, 27, 28, 29, 30, ++ 31, 32, 33, 34, 35, 36, 37, 38, ++ 39, 40, 41, 42, 43, 44, 45, 46, ++ 47, 48, 49, 50, 51, 52, 53, 54, ++ 55, 56, 57, 58, 59, 60, 61, 62, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, ++ }, ++ { /* Fourth byte table 95. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 34, 37, 40, 43, 46, 49, 52, 55, ++ 58, 61, 64, 67, 70, 73, 76, 79, ++ 82, 85, 88, 91, 94, 97, 100, 103, ++ 106, 109, 112, 115, 118, 121, 124, 127, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, ++ }, ++ { /* Fourth byte table 96. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 117, ++ 120, 123, 126, 129, 132, 135, 138, 141, ++ 144, 147, 150, 153, 156, 159, 162, 165, ++ 168, 171, 174, 177, 180, 183, 186, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, 189, 189, 189, 189, 189, 189, 189, ++ 189, ++ }, ++ { /* Fourth byte table 97. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 3, 6, 9, 12, 15, ++ 18, 18, 18, 21, 24, 27, 30, 33, ++ 36, 36, 36, 39, 42, 45, 48, 51, ++ 54, 54, 54, 57, 60, 63, 63, 63, ++ 63, 65, 67, 69, 72, 74, 76, 79, ++ 79, 82, 85, 88, 91, 94, 97, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, 100, 100, 100, 100, 100, 100, 100, ++ 100, ++ }, ++ { /* Fourth byte table 98. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 9, ++ 18, 31, 44, 57, 70, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, ++ }, ++ { /* Fourth byte table 99. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 9, 18, 31, 44, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, ++ }, ++ { /* Fourth byte table 100. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, 13, 13, 13, 13, 13, 13, 13, ++ 13, ++ }, ++ { /* Fourth byte table 101. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 37, 38, 39, ++ 40, 41, 42, 43, 44, 45, 46, 47, ++ 48, 49, 50, 51, 52, 53, 54, 55, ++ 56, 57, 58, 59, 60, 61, 62, 63, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 102. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 21, 22, ++ 23, 24, 25, 26, 27, 28, 29, 30, ++ 31, 32, 33, 34, 35, 36, 37, 38, ++ 39, 40, 41, 42, 43, 44, 45, 46, ++ 47, 48, 49, 50, 51, 52, 53, 54, ++ 55, 56, 57, 58, 59, 60, 61, 62, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, ++ }, ++ { /* Fourth byte table 103. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 29, 30, ++ 31, 31, 31, 32, 32, 32, 33, 34, ++ 34, 34, 35, 36, 37, 38, 38, 39, ++ 40, 41, 42, 43, 44, 45, 46, 47, ++ 48, 49, 50, 50, 51, 51, 52, 53, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, ++ }, ++ { /* Fourth byte table 104. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 4, 5, 6, ++ 7, 8, 9, 10, 11, 12, 13, 14, ++ 15, 16, 17, 18, 19, 20, 21, 22, ++ 23, 24, 25, 26, 27, 28, 29, 30, ++ 31, 32, 33, 34, 35, 36, 37, 38, ++ 39, 40, 41, 42, 43, 44, 45, 46, ++ 47, 48, 49, 50, 51, 52, 53, 54, ++ 55, 56, 57, 58, 59, 60, 61, 62, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, ++ }, ++ { /* Fourth byte table 105. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 6, ++ 7, 8, 9, 10, 10, 10, 11, 12, ++ 13, 14, 15, 16, 17, 18, 18, 19, ++ 20, 21, 22, 23, 24, 25, 25, 26, ++ 27, 28, 29, 30, 31, 32, 33, 34, ++ 35, 36, 37, 38, 39, 40, 41, 42, ++ 43, 44, 45, 46, 47, 48, 49, 50, ++ 51, 52, 53, 53, 54, 55, 56, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, ++ }, ++ { /* Fourth byte table 106. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 5, 6, ++ 6, 6, 6, 7, 8, 9, 10, 11, ++ 12, 13, 13, 14, 15, 16, 17, 18, ++ 19, 20, 21, 22, 23, 24, 25, 26, ++ 27, 28, 29, 30, 31, 32, 33, 34, ++ 35, 36, 37, 38, 39, 40, 41, 42, ++ 43, 44, 45, 46, 47, 48, 49, 50, ++ 51, 52, 53, 54, 55, 56, 57, 58, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, 59, 59, 59, 59, 59, 59, 59, ++ 59, ++ }, ++ { /* Fourth byte table 107. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 37, 38, 39, ++ 40, 41, 42, 43, 44, 45, 46, 47, ++ 48, 49, 50, 51, 52, 53, 54, 55, ++ 56, 57, 58, 59, 60, 61, 62, 63, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 108. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 37, 38, 39, ++ 40, 41, 42, 43, 44, 45, 46, 47, ++ 48, 49, 50, 51, 52, 53, 54, 55, ++ 56, 57, 58, 59, 60, 61, 62, 63, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 109. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 37, 38, 39, ++ 40, 41, 42, 43, 44, 45, 46, 47, ++ 48, 49, 50, 51, 52, 53, 54, 55, ++ 56, 57, 58, 59, 60, 61, 62, 63, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 110. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 37, 38, 39, ++ 40, 41, 42, 43, 44, 45, 46, 47, ++ 48, 49, 50, 51, 52, 53, 54, 55, ++ 56, 57, 58, 59, 60, 61, 62, 63, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 111. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 1, 2, 3, 4, 5, 6, 7, ++ 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, ++ 24, 25, 26, 27, 28, 29, 30, 31, ++ 32, 33, 34, 35, 36, 38, 40, 40, ++ 40, 42, 44, 46, 48, 50, 52, 54, ++ 56, 58, 60, 62, 64, 66, 68, 70, ++ 72, 74, 76, 78, 80, 82, 84, 86, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, 88, 88, 88, 88, 88, 88, 88, ++ 88, ++ }, ++ { /* Fourth byte table 112. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 5, 7, 9, 11, 13, 15, ++ 17, 19, 21, 23, 25, 27, 29, 31, ++ 33, 35, 37, 39, 41, 43, 45, 47, ++ 49, 51, 53, 55, 58, 60, 62, 64, ++ 66, 68, 70, 72, 74, 76, 78, 80, ++ 82, 84, 86, 88, 90, 92, 94, 96, ++ 98, 100, 102, 104, 106, 108, 110, 112, ++ 114, 116, 118, 120, 123, 125, 127, 129, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, 131, 131, 131, 131, 131, 131, 131, ++ 131, ++ }, ++ { /* Fourth byte table 113. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 45, 47, ++ 49, 51, 53, 55, 57, 59, 61, 63, ++ 65, 67, 69, 71, 73, 75, 77, 79, ++ 81, 83, 85, 87, 89, 91, 93, 95, ++ 97, 99, 101, 103, 105, 107, 110, 112, ++ 114, 116, 118, 120, 122, 124, 126, 128, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, ++ }, ++ { /* Fourth byte table 114. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 33, 35, 37, 39, 41, 43, 45, 47, ++ 49, 51, 53, 55, 57, 59, 61, 63, ++ 65, 67, 69, 71, 73, 75, 77, 79, ++ 81, 83, 85, 87, 89, 91, 93, 95, ++ 98, 100, 102, 104, 106, 108, 110, 112, ++ 114, 116, 118, 120, 122, 124, 126, 128, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, ++ }, ++ { /* Fourth byte table 115. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 21, 23, 25, 27, 29, 31, ++ 33, 35, 37, 39, 41, 43, 45, 47, ++ 49, 51, 53, 55, 57, 59, 61, 63, ++ 65, 67, 69, 71, 73, 75, 77, 79, ++ 81, 83, 86, 88, 90, 92, 94, 96, ++ 98, 100, 102, 104, 106, 108, 110, 112, ++ 114, 116, 118, 120, 122, 124, 126, 128, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, 130, 130, 130, 130, 130, 130, 130, ++ 130, ++ }, ++ { /* Fourth byte table 116. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 9, 11, 13, 15, ++ 17, 19, 21, 23, 25, 25, 25, 26, ++ 27, 28, 29, 30, 31, 32, 33, 34, ++ 35, 36, 37, 38, 39, 40, 41, 42, ++ 43, 44, 45, 46, 47, 48, 49, 50, ++ 51, 52, 53, 54, 55, 56, 57, 58, ++ 59, 60, 61, 62, 63, 64, 65, 66, ++ 67, 68, 69, 70, 71, 72, 73, 74, ++ 75, 75, 75, 75, 75, 75, 75, 75, ++ 75, 75, 75, 75, 75, 75, 75, 75, ++ 75, 75, 75, 75, 75, 75, 75, 75, ++ 75, 75, 75, 75, 75, 75, 75, 75, ++ 75, 75, 75, 75, 75, 75, 75, 75, ++ 75, 75, 75, 75, 75, 75, 75, 75, ++ 75, 75, 75, 75, 75, 75, 75, 75, ++ 75, 75, 75, 75, 75, 75, 75, 75, ++ 75, ++ }, ++ { /* Fourth byte table 117. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 9, 13, 17, 21, 25, 29, ++ 33, 37, 42, 46, 50, 54, 58, 62, ++ 66, 71, 75, 80, 85, 90, 94, 98, ++ 102, 106, 110, 114, 118, 122, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, 127, 127, 127, 127, 127, 127, 127, ++ 127, ++ }, ++ }, ++}; ++ ++static const uint16_t u8_decomp_b4_16bit_tbl[2][30][257] = { ++ { ++ { /* Fourth byte 16-bit table 0. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 38, 44, 48, 52, 56, 60, 64, ++ 68, 72, 76, 80, 84, 90, 96, 102, ++ 108, 112, 116, 120, 124, 130, 136, 140, ++ 144, 148, 152, 156, 160, 164, 168, 172, ++ 176, 180, 184, 188, 192, 196, 200, 206, ++ 212, 216, 220, 224, 228, 232, 236, 240, ++ 244, 250, 256, 260, 264, 268, 272, 276, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, ++ }, ++ { /* Fourth byte 16-bit table 1. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 54, 60, 66, ++ 72, 78, 84, 90, 96, 100, 104, 108, ++ 112, 116, 120, 124, 128, 134, 140, 144, ++ 148, 152, 156, 160, 164, 170, 176, 182, ++ 188, 194, 200, 204, 208, 212, 216, 220, ++ 224, 228, 232, 236, 240, 244, 248, 252, ++ 256, 262, 268, 274, 280, 284, 288, 292, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, ++ }, ++ { /* Fourth byte 16-bit table 2. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 107, 116, 116, 116, 116, ++ 116, 120, 124, 128, 132, 138, 144, 150, ++ 156, 162, 168, 174, 180, 186, 192, 198, ++ 204, 210, 216, 222, 228, 234, 240, 246, ++ 252, 256, 260, 264, 268, 272, 276, 282, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, ++ }, ++ { /* Fourth byte 16-bit table 3. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 18, 24, 30, 36, 42, ++ 48, 52, 56, 60, 64, 68, 72, 76, ++ 80, 86, 92, 98, 104, 110, 116, 122, ++ 128, 134, 140, 146, 152, 158, 164, 170, ++ 176, 182, 188, 194, 200, 204, 208, 212, ++ 216, 222, 228, 234, 240, 246, 252, 258, ++ 264, 270, 276, 280, 284, 288, 292, 296, ++ 300, 304, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, ++ }, ++ { /* Fourth byte 16-bit table 4. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 17, 24, 31, 38, 45, ++ 52, 57, 62, 69, 76, 83, 90, 97, ++ 104, 109, 114, 121, 128, 135, 142, 142, ++ 142, 147, 152, 159, 166, 173, 180, 180, ++ 180, 185, 190, 197, 204, 211, 218, 225, ++ 232, 237, 242, 249, 256, 263, 270, 277, ++ 284, 289, 294, 301, 308, 315, 322, 329, ++ 336, 341, 346, 353, 360, 367, 374, 381, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, ++ }, ++ { /* Fourth byte 16-bit table 5. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 17, 24, 31, 38, 38, ++ 38, 43, 48, 55, 62, 69, 76, 76, ++ 76, 81, 86, 93, 100, 107, 114, 121, ++ 128, 128, 133, 133, 140, 140, 147, 147, ++ 154, 159, 164, 171, 178, 185, 192, 199, ++ 206, 211, 216, 223, 230, 237, 244, 251, ++ 258, 263, 268, 273, 278, 283, 288, 293, ++ 298, 303, 308, 313, 318, 323, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, ++ }, ++ { /* Fourth byte 16-bit table 6. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 14, 23, 32, 41, 50, 59, ++ 68, 75, 82, 91, 100, 109, 118, 127, ++ 136, 143, 150, 159, 168, 177, 186, 195, ++ 204, 211, 218, 227, 236, 245, 254, 263, ++ 272, 279, 286, 295, 304, 313, 322, 331, ++ 340, 347, 354, 363, 372, 381, 390, 399, ++ 408, 413, 418, 425, 430, 437, 437, 442, ++ 449, 454, 459, 464, 469, 474, 477, 480, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, ++ }, ++ { /* Fourth byte 16-bit table 7. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 14, 21, 26, 33, 33, 38, ++ 45, 50, 55, 60, 65, 70, 82, 94, ++ 106, 111, 116, 123, 130, 130, 130, 135, ++ 142, 147, 152, 157, 162, 162, 174, 186, ++ 198, 203, 208, 215, 222, 227, 232, 237, ++ 244, 249, 254, 259, 264, 269, 280, 291, ++ 293, 293, 293, 300, 305, 312, 312, 317, ++ 324, 329, 334, 339, 344, 349, 356, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, ++ }, ++ { /* Fourth byte 16-bit table 8. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 15, 20, 25, 30, 35, ++ 40, 45, 50, 55, 60, 65, 70, 78, ++ 86, 94, 102, 110, 118, 126, 134, 142, ++ 150, 158, 166, 174, 182, 190, 190, 190, ++ 190, 195, 200, 205, 210, 215, 220, 225, ++ 230, 235, 240, 245, 250, 255, 260, 265, ++ 270, 275, 280, 285, 290, 295, 300, 305, ++ 310, 315, 320, 325, 330, 335, 340, 345, ++ 350, 350, 350, 350, 350, 350, 350, 350, ++ 350, 350, 350, 350, 350, 350, 350, 350, ++ 350, 350, 350, 350, 350, 350, 350, 350, ++ 350, 350, 350, 350, 350, 350, 350, 350, ++ 350, 350, 350, 350, 350, 350, 350, 350, ++ 350, 350, 350, 350, 350, 350, 350, 350, ++ 350, 350, 350, 350, 350, 350, 350, 350, ++ 350, 350, 350, 350, 350, 350, 350, 350, ++ 350, ++ }, ++ { /* Fourth byte 16-bit table 9. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 15, 27, 42, 51, 66, 75, 84, ++ 102, 114, 123, 132, 141, 153, 165, 177, ++ 189, 201, 213, 225, 243, 249, 267, 285, ++ 300, 312, 330, 348, 360, 369, 378, 390, ++ 402, 417, 432, 441, 450, 462, 471, 480, ++ 486, 492, 501, 510, 528, 540, 555, 573, ++ 585, 594, 603, 621, 633, 651, 660, 675, ++ 684, 696, 705, 717, 732, 744, 759, 771, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, ++ }, ++ { /* Fourth byte 16-bit table 10. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 15, 24, 33, 45, 54, 63, 72, ++ 87, 99, 105, 123, 132, 147, 159, 171, ++ 180, 189, 201, 207, 219, 234, 240, 258, ++ 267, 271, 275, 279, 283, 287, 291, 295, ++ 299, 303, 307, 312, 317, 322, 327, 332, ++ 337, 342, 347, 352, 357, 362, 367, 372, ++ 377, 382, 385, 387, 389, 392, 394, 396, ++ 396, 396, 396, 396, 402, 408, 414, 420, ++ 432, 432, 432, 432, 432, 432, 432, 432, ++ 432, 432, 432, 432, 432, 432, 432, 432, ++ 432, 432, 432, 432, 432, 432, 432, 432, ++ 432, 432, 432, 432, 432, 432, 432, 432, ++ 432, 432, 432, 432, 432, 432, 432, 432, ++ 432, 432, 432, 432, 432, 432, 432, 432, ++ 432, 432, 432, 432, 432, 432, 432, 432, ++ 432, 432, 432, 432, 432, 432, 432, 432, ++ 432, ++ }, ++ { /* Fourth byte 16-bit table 11. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 164, 168, 172, 176, 180, 184, 188, ++ 192, 196, 200, 204, 208, 212, 216, 220, ++ 224, 228, 232, 236, 240, 244, 248, 252, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, ++ }, ++ { /* Fourth byte 16-bit table 12. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 164, 168, 172, 176, 180, 184, 188, ++ 192, 196, 200, 204, 208, 212, 216, 220, ++ 224, 228, 232, 236, 240, 244, 248, 252, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, ++ }, ++ { /* Fourth byte 16-bit table 13. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 164, 168, 172, 176, 180, 184, 188, ++ 192, 196, 200, 204, 208, 212, 216, 220, ++ 224, 228, 232, 236, 240, 244, 248, 252, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, ++ }, ++ { /* Fourth byte 16-bit table 14. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 164, 168, 172, 176, 180, 184, 188, ++ 192, 196, 200, 204, 208, 212, 216, 220, ++ 224, 228, 232, 236, 240, 244, 248, 252, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, ++ }, ++ { /* Fourth byte 16-bit table 15. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 18, 24, 30, 34, 38, ++ 42, 46, 50, 54, 58, 62, 66, 70, ++ 74, 78, 82, 86, 90, 94, 98, 102, ++ 106, 110, 114, 118, 122, 126, 130, 134, ++ 138, 142, 146, 150, 154, 158, 162, 166, ++ 170, 174, 178, 182, 186, 190, 194, 198, ++ 202, 206, 210, 214, 218, 222, 226, 230, ++ 234, 238, 242, 246, 250, 254, 258, 262, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, ++ }, ++ { /* Fourth byte 16-bit table 16. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 125, ++ 130, 135, 140, 145, 150, 156, 162, 168, ++ 174, 180, 186, 190, 194, 198, 202, 206, ++ 210, 214, 218, 222, 226, 230, 234, 238, ++ 242, 246, 250, 254, 258, 262, 266, 270, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, ++ }, ++ { /* Fourth byte 16-bit table 17. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 98, 104, 110, 116, 122, 126, 130, 134, ++ 138, 142, 146, 150, 154, 158, 162, 166, ++ 170, 174, 178, 182, 186, 190, 194, 198, ++ 202, 206, 210, 214, 218, 222, 226, 230, ++ 234, 238, 242, 246, 250, 254, 258, 262, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, ++ }, ++ { /* Fourth byte 16-bit table 18. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 130, 136, 140, 144, 148, 152, 156, 160, ++ 164, 168, 172, 176, 180, 184, 188, 192, ++ 196, 200, 204, 210, 216, 222, 226, 230, ++ 234, 238, 242, 246, 250, 254, 258, 262, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, ++ }, ++ { /* Fourth byte 16-bit table 19. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 18, 24, 30, 36, 42, ++ 48, 54, 60, 66, 72, 78, 84, 90, ++ 96, 102, 108, 114, 120, 126, 132, 138, ++ 144, 150, 156, 162, 168, 174, 180, 186, ++ 192, 198, 204, 210, 216, 222, 228, 234, ++ 240, 246, 252, 258, 264, 270, 276, 282, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, ++ }, ++ { /* Fourth byte 16-bit table 20. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 18, 24, 30, 36, 42, ++ 48, 54, 60, 66, 72, 78, 84, 90, ++ 96, 96, 96, 102, 108, 114, 120, 126, ++ 132, 138, 144, 150, 156, 162, 168, 174, ++ 180, 186, 192, 198, 204, 210, 216, 222, ++ 228, 234, 240, 246, 252, 258, 264, 270, ++ 276, 282, 288, 294, 300, 306, 312, 318, ++ 324, 330, 336, 342, 348, 354, 360, 366, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, ++ }, ++ { /* Fourth byte 16-bit table 21. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 17, 21, 25, 29, ++ 33, 37, 41, 45, 49, 53, 58, 62, ++ 66, 70, 74, 79, 83, 87, 91, 96, ++ 100, 104, 108, 112, 116, 121, 125, 129, ++ 133, 137, 141, 145, 149, 153, 157, 161, ++ 165, 169, 173, 177, 181, 185, 189, 193, ++ 197, 201, 205, 209, 213, 218, 222, 226, ++ 230, 235, 239, 243, 247, 251, 255, 259, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, ++ }, ++ { /* Fourth byte 16-bit table 22. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 105, 109, 113, 117, 121, 125, ++ 129, 134, 139, 143, 147, 151, 155, 159, ++ 163, 167, 171, 175, 179, 184, 188, 192, ++ 196, 200, 205, 209, 213, 217, 221, 225, ++ 229, 233, 237, 241, 246, 250, 255, 259, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, ++ }, ++ { /* Fourth byte 16-bit table 23. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 41, 45, 49, 53, 57, 61, ++ 66, 70, 75, 80, 84, 88, 92, 96, ++ 101, 106, 110, 114, 118, 122, 126, 130, ++ 134, 138, 142, 146, 150, 155, 159, 163, ++ 167, 171, 175, 179, 183, 187, 191, 195, ++ 199, 203, 207, 211, 215, 219, 223, 227, ++ 231, 236, 240, 244, 248, 252, 256, 261, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, ++ }, ++ { /* Fourth byte 16-bit table 24. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 45, 49, 53, 57, 61, ++ 65, 69, 73, 77, 81, 85, 89, 93, ++ 97, 101, 105, 109, 113, 117, 122, 126, ++ 130, 134, 138, 142, 147, 151, 155, 159, ++ 163, 167, 171, 175, 179, 184, 188, 192, ++ 196, 201, 205, 209, 213, 217, 221, 225, ++ 230, 235, 240, 244, 249, 253, 257, 261, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, ++ }, ++ { /* Fourth byte 16-bit table 25. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 29, ++ 33, 37, 41, 45, 49, 53, 58, 62, ++ 66, 71, 76, 80, 84, 88, 92, 96, ++ 100, 104, 108, 112, 117, 121, 126, 130, ++ 135, 139, 143, 147, 152, 156, 160, 165, ++ 170, 174, 178, 182, 186, 190, 194, 198, ++ 202, 206, 210, 214, 218, 222, 227, 231, ++ 236, 240, 245, 249, 254, 259, 264, 268, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, ++ }, ++ { /* Fourth byte 16-bit table 26. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 9, 14, 19, 24, 28, 32, ++ 36, 40, 44, 48, 52, 56, 61, 65, ++ 69, 73, 77, 82, 86, 91, 96, 100, ++ 104, 108, 112, 116, 120, 125, 130, 135, ++ 139, 143, 148, 152, 156, 160, 165, 169, ++ 173, 177, 181, 185, 190, 194, 198, 202, ++ 206, 210, 214, 219, 224, 228, 233, 237, ++ 242, 246, 250, 254, 259, 264, 268, 273, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, ++ }, ++ { /* Fourth byte 16-bit table 27. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 9, 13, 17, 21, 25, 29, ++ 34, 39, 44, 49, 53, 57, 61, 65, ++ 69, 73, 77, 81, 85, 89, 93, 97, ++ 102, 106, 110, 114, 118, 122, 126, 130, ++ 134, 138, 142, 146, 150, 155, 160, 165, ++ 169, 173, 177, 181, 186, 190, 195, 199, ++ 203, 208, 213, 217, 221, 225, 229, 233, ++ 237, 241, 245, 249, 253, 257, 261, 265, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, ++ }, ++ { /* Fourth byte 16-bit table 28. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 25, 29, ++ 33, 37, 41, 45, 50, 55, 59, 63, ++ 67, 71, 75, 79, 84, 88, 92, 96, ++ 100, 105, 110, 114, 118, 122, 127, 131, ++ 135, 140, 145, 149, 153, 157, 162, 166, ++ 170, 174, 178, 182, 186, 190, 195, 199, ++ 203, 207, 212, 216, 220, 224, 228, 233, ++ 238, 242, 246, 250, 255, 259, 264, 268, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, ++ }, ++ { /* Fourth byte 16-bit table 29. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ }, ++ { ++ { /* Fourth byte 16-bit table 0. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 38, 44, 48, 52, 56, 60, 64, ++ 68, 72, 76, 80, 84, 90, 96, 102, ++ 108, 112, 116, 120, 124, 130, 136, 140, ++ 144, 148, 152, 156, 160, 164, 168, 172, ++ 176, 180, 184, 188, 192, 196, 200, 206, ++ 212, 216, 220, 224, 228, 232, 236, 240, ++ 244, 250, 256, 260, 264, 268, 272, 276, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, 280, 280, 280, 280, 280, 280, 280, ++ 280, ++ }, ++ { /* Fourth byte 16-bit table 1. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 54, 60, 66, ++ 72, 78, 84, 90, 96, 100, 104, 108, ++ 112, 116, 120, 124, 128, 134, 140, 144, ++ 148, 152, 156, 160, 164, 170, 176, 182, ++ 188, 194, 200, 204, 208, 212, 216, 220, ++ 224, 228, 232, 236, 240, 244, 248, 252, ++ 256, 262, 268, 274, 280, 284, 288, 292, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, 296, 296, 296, 296, 296, 296, 296, ++ 296, ++ }, ++ { /* Fourth byte 16-bit table 2. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 107, 116, 116, 116, 116, ++ 116, 120, 124, 128, 132, 138, 144, 150, ++ 156, 162, 168, 174, 180, 186, 192, 198, ++ 204, 210, 216, 222, 228, 234, 240, 246, ++ 252, 256, 260, 264, 268, 272, 276, 282, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, ++ }, ++ { /* Fourth byte 16-bit table 3. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 18, 24, 30, 36, 42, ++ 48, 52, 56, 60, 64, 68, 72, 76, ++ 80, 86, 92, 98, 104, 110, 116, 122, ++ 128, 134, 140, 146, 152, 158, 164, 170, ++ 176, 182, 188, 194, 200, 204, 208, 212, ++ 216, 222, 228, 234, 240, 246, 252, 258, ++ 264, 270, 276, 280, 284, 288, 292, 296, ++ 300, 304, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, 308, 308, 308, 308, 308, 308, 308, ++ 308, ++ }, ++ { /* Fourth byte 16-bit table 4. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 17, 24, 31, 38, 45, ++ 52, 57, 62, 69, 76, 83, 90, 97, ++ 104, 109, 114, 121, 128, 135, 142, 142, ++ 142, 147, 152, 159, 166, 173, 180, 180, ++ 180, 185, 190, 197, 204, 211, 218, 225, ++ 232, 237, 242, 249, 256, 263, 270, 277, ++ 284, 289, 294, 301, 308, 315, 322, 329, ++ 336, 341, 346, 353, 360, 367, 374, 381, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, 388, 388, 388, 388, 388, 388, 388, ++ 388, ++ }, ++ { /* Fourth byte 16-bit table 5. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 17, 24, 31, 38, 38, ++ 38, 43, 48, 55, 62, 69, 76, 76, ++ 76, 81, 86, 93, 100, 107, 114, 121, ++ 128, 128, 133, 133, 140, 140, 147, 147, ++ 154, 159, 164, 171, 178, 185, 192, 199, ++ 206, 211, 216, 223, 230, 237, 244, 251, ++ 258, 263, 268, 273, 278, 283, 288, 293, ++ 298, 303, 308, 313, 318, 323, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, 328, 328, 328, 328, 328, 328, 328, ++ 328, ++ }, ++ { /* Fourth byte 16-bit table 6. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 7, 14, 23, 32, 41, 50, 59, ++ 68, 75, 82, 91, 100, 109, 118, 127, ++ 136, 143, 150, 159, 168, 177, 186, 195, ++ 204, 211, 218, 227, 236, 245, 254, 263, ++ 272, 279, 286, 295, 304, 313, 322, 331, ++ 340, 347, 354, 363, 372, 381, 390, 399, ++ 408, 413, 418, 425, 430, 437, 437, 442, ++ 449, 454, 459, 464, 469, 474, 477, 480, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, 483, 483, 483, 483, 483, 483, 483, ++ 483, ++ }, ++ { /* Fourth byte 16-bit table 7. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 14, 21, 26, 33, 33, 38, ++ 45, 50, 55, 60, 65, 70, 82, 94, ++ 106, 111, 116, 123, 130, 130, 130, 135, ++ 142, 147, 152, 157, 162, 162, 174, 186, ++ 198, 203, 208, 215, 222, 227, 232, 237, ++ 244, 249, 254, 259, 264, 269, 280, 291, ++ 293, 293, 293, 300, 305, 312, 312, 317, ++ 324, 329, 334, 339, 344, 349, 356, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, 359, 359, 359, 359, 359, 359, 359, ++ 359, ++ }, ++ { /* Fourth byte 16-bit table 8. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 10, 15, 20, 25, 30, 35, ++ 40, 45, 50, 55, 60, 65, 70, 78, ++ 86, 94, 102, 110, 118, 126, 134, 142, ++ 150, 158, 166, 174, 182, 190, 207, 221, ++ 221, 226, 231, 236, 241, 246, 251, 256, ++ 261, 266, 271, 276, 281, 286, 291, 296, ++ 301, 306, 311, 316, 321, 326, 331, 336, ++ 341, 346, 351, 356, 361, 366, 371, 376, ++ 381, 381, 381, 381, 381, 381, 381, 381, ++ 381, 381, 381, 381, 381, 381, 381, 381, ++ 381, 381, 381, 381, 381, 381, 381, 381, ++ 381, 381, 381, 381, 381, 381, 381, 381, ++ 381, 381, 381, 381, 381, 381, 381, 381, ++ 381, 381, 381, 381, 381, 381, 381, 381, ++ 381, 381, 381, 381, 381, 381, 381, 381, ++ 381, 381, 381, 381, 381, 381, 381, 381, ++ 381, ++ }, ++ { /* Fourth byte 16-bit table 9. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 15, 27, 42, 51, 66, 75, 84, ++ 102, 114, 123, 132, 141, 153, 165, 177, ++ 189, 201, 213, 225, 243, 249, 267, 285, ++ 300, 312, 330, 348, 360, 369, 378, 390, ++ 402, 417, 432, 441, 450, 462, 471, 480, ++ 486, 492, 501, 510, 528, 540, 555, 573, ++ 585, 594, 603, 621, 633, 651, 660, 675, ++ 684, 696, 705, 717, 732, 744, 759, 771, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, 777, 777, 777, 777, 777, 777, 777, ++ 777, ++ }, ++ { /* Fourth byte 16-bit table 10. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 15, 24, 33, 45, 54, 63, 72, ++ 87, 99, 105, 123, 132, 147, 159, 171, ++ 180, 189, 201, 207, 219, 234, 240, 258, ++ 267, 271, 275, 279, 283, 287, 291, 295, ++ 299, 303, 307, 312, 317, 322, 327, 332, ++ 337, 342, 347, 352, 357, 362, 367, 372, ++ 377, 382, 385, 387, 389, 392, 394, 396, ++ 398, 401, 404, 406, 412, 418, 424, 430, ++ 442, 442, 442, 442, 442, 442, 442, 442, ++ 442, 442, 442, 442, 442, 442, 442, 442, ++ 442, 442, 442, 442, 442, 442, 442, 442, ++ 442, 442, 442, 442, 442, 442, 442, 442, ++ 442, 442, 442, 442, 442, 442, 442, 442, ++ 442, 442, 442, 442, 442, 442, 442, 442, ++ 442, 442, 442, 442, 442, 442, 442, 442, ++ 442, 442, 442, 442, 442, 442, 442, 442, ++ 442, ++ }, ++ { /* Fourth byte 16-bit table 11. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 164, 168, 172, 176, 180, 184, 188, ++ 192, 196, 200, 204, 208, 212, 216, 220, ++ 224, 228, 232, 236, 240, 244, 248, 252, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, ++ }, ++ { /* Fourth byte 16-bit table 12. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 164, 168, 172, 176, 180, 184, 188, ++ 192, 196, 200, 204, 208, 212, 216, 220, ++ 224, 228, 232, 236, 240, 244, 248, 252, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, ++ }, ++ { /* Fourth byte 16-bit table 13. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 164, 168, 172, 176, 180, 184, 188, ++ 192, 196, 200, 204, 208, 212, 216, 220, ++ 224, 228, 232, 236, 240, 244, 248, 252, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, ++ }, ++ { /* Fourth byte 16-bit table 14. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 164, 168, 172, 176, 180, 184, 188, ++ 192, 196, 200, 204, 208, 212, 216, 220, ++ 224, 228, 232, 236, 240, 244, 248, 252, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, ++ }, ++ { /* Fourth byte 16-bit table 15. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 164, 168, 172, 176, 180, 184, 188, ++ 192, 196, 200, 204, 208, 212, 216, 220, ++ 224, 228, 232, 236, 240, 244, 248, 252, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, 256, 256, 256, 256, 256, 256, 256, ++ 256, ++ }, ++ { /* Fourth byte 16-bit table 16. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 18, 24, 30, 34, 38, ++ 42, 46, 50, 54, 58, 62, 66, 70, ++ 74, 78, 82, 86, 90, 94, 98, 102, ++ 106, 110, 114, 118, 122, 126, 130, 134, ++ 138, 142, 146, 150, 154, 158, 162, 166, ++ 170, 174, 178, 182, 186, 190, 194, 198, ++ 202, 206, 210, 214, 218, 222, 226, 230, ++ 234, 238, 242, 246, 250, 254, 258, 262, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, ++ }, ++ { /* Fourth byte 16-bit table 17. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 125, ++ 130, 135, 140, 145, 150, 156, 162, 168, ++ 174, 180, 186, 190, 194, 198, 202, 206, ++ 210, 214, 218, 222, 226, 230, 234, 238, ++ 242, 246, 250, 254, 258, 262, 266, 270, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, 274, 274, 274, 274, 274, 274, 274, ++ 274, ++ }, ++ { /* Fourth byte 16-bit table 18. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 98, 104, 110, 116, 122, 126, 130, 134, ++ 138, 142, 146, 150, 154, 158, 162, 166, ++ 170, 174, 178, 182, 186, 190, 194, 198, ++ 202, 206, 210, 214, 218, 222, 226, 230, ++ 234, 238, 242, 246, 250, 254, 258, 262, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, ++ }, ++ { /* Fourth byte 16-bit table 19. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 130, 136, 140, 144, 148, 152, 156, 160, ++ 164, 168, 172, 176, 180, 184, 188, 192, ++ 196, 200, 204, 210, 216, 222, 226, 230, ++ 234, 238, 242, 246, 250, 254, 258, 262, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, 266, 266, 266, 266, 266, 266, 266, ++ 266, ++ }, ++ { /* Fourth byte 16-bit table 20. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 18, 24, 30, 36, 42, ++ 48, 54, 60, 66, 72, 78, 84, 90, ++ 96, 102, 108, 114, 120, 126, 132, 138, ++ 144, 150, 156, 162, 168, 174, 180, 186, ++ 192, 198, 204, 210, 216, 222, 228, 234, ++ 240, 246, 252, 258, 264, 270, 276, 282, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, 288, 288, 288, 288, 288, 288, 288, ++ 288, ++ }, ++ { /* Fourth byte 16-bit table 21. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 6, 12, 18, 24, 30, 36, 42, ++ 48, 54, 60, 66, 72, 78, 84, 90, ++ 96, 96, 96, 102, 108, 114, 120, 126, ++ 132, 138, 144, 150, 156, 162, 168, 174, ++ 180, 186, 192, 198, 204, 210, 216, 222, ++ 228, 234, 240, 246, 252, 258, 264, 270, ++ 276, 282, 288, 294, 300, 306, 312, 318, ++ 324, 330, 336, 342, 348, 354, 360, 366, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, 372, 372, 372, 372, 372, 372, 372, ++ 372, ++ }, ++ { /* Fourth byte 16-bit table 22. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 17, 21, 25, 29, ++ 33, 37, 41, 45, 49, 53, 58, 62, ++ 66, 70, 74, 79, 83, 87, 91, 96, ++ 100, 104, 108, 112, 116, 121, 125, 129, ++ 133, 137, 141, 145, 149, 153, 157, 161, ++ 165, 169, 173, 177, 181, 185, 189, 193, ++ 197, 201, 205, 209, 213, 218, 222, 226, ++ 230, 235, 239, 243, 247, 251, 255, 259, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, ++ }, ++ { /* Fourth byte 16-bit table 23. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 105, 109, 113, 117, 121, 125, ++ 129, 134, 139, 143, 147, 151, 155, 159, ++ 163, 167, 171, 175, 179, 184, 188, 192, ++ 196, 200, 205, 209, 213, 217, 221, 225, ++ 229, 233, 237, 241, 246, 250, 255, 259, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, 263, 263, 263, 263, 263, 263, 263, ++ 263, ++ }, ++ { /* Fourth byte 16-bit table 24. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 41, 45, 49, 53, 57, 61, ++ 66, 70, 75, 80, 84, 88, 92, 96, ++ 101, 106, 110, 114, 118, 122, 126, 130, ++ 134, 138, 142, 146, 150, 155, 159, 163, ++ 167, 171, 175, 179, 183, 187, 191, 195, ++ 199, 203, 207, 211, 215, 219, 223, 227, ++ 231, 236, 240, 244, 248, 252, 256, 261, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, ++ }, ++ { /* Fourth byte 16-bit table 25. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 45, 49, 53, 57, 61, ++ 65, 69, 73, 77, 81, 85, 89, 93, ++ 97, 101, 105, 109, 113, 117, 122, 126, ++ 130, 134, 138, 142, 147, 151, 155, 159, ++ 163, 167, 171, 175, 179, 184, 188, 192, ++ 196, 201, 205, 209, 213, 217, 221, 225, ++ 230, 235, 240, 244, 249, 253, 257, 261, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, 265, 265, 265, 265, 265, 265, 265, ++ 265, ++ }, ++ { /* Fourth byte 16-bit table 26. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 29, ++ 33, 37, 41, 45, 49, 53, 58, 62, ++ 66, 71, 76, 80, 84, 88, 92, 96, ++ 100, 104, 108, 112, 117, 121, 126, 130, ++ 135, 139, 143, 147, 152, 156, 160, 165, ++ 170, 174, 178, 182, 186, 190, 194, 198, ++ 202, 206, 210, 214, 218, 222, 227, 231, ++ 236, 240, 245, 249, 254, 259, 264, 268, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, ++ }, ++ { /* Fourth byte 16-bit table 27. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 9, 14, 19, 24, 28, 32, ++ 36, 40, 44, 48, 52, 56, 61, 65, ++ 69, 73, 77, 82, 86, 91, 96, 100, ++ 104, 108, 112, 116, 120, 125, 130, 135, ++ 139, 143, 148, 152, 156, 160, 165, 169, ++ 173, 177, 181, 185, 190, 194, 198, 202, ++ 206, 210, 214, 219, 224, 228, 233, 237, ++ 242, 246, 250, 254, 259, 264, 268, 273, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, 277, 277, 277, 277, 277, 277, 277, ++ 277, ++ }, ++ { /* Fourth byte 16-bit table 28. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 5, 9, 13, 17, 21, 25, 29, ++ 34, 39, 44, 49, 53, 57, 61, 65, ++ 69, 73, 77, 81, 85, 89, 93, 97, ++ 102, 106, 110, 114, 118, 122, 126, 130, ++ 134, 138, 142, 146, 150, 155, 160, 165, ++ 169, 173, 177, 181, 186, 190, 195, 199, ++ 203, 208, 213, 217, 221, 225, 229, 233, ++ 237, 241, 245, 249, 253, 257, 261, 265, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, 269, 269, 269, 269, 269, 269, 269, ++ 269, ++ }, ++ { /* Fourth byte 16-bit table 29. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 25, 29, ++ 33, 37, 41, 45, 50, 55, 59, 63, ++ 67, 71, 75, 79, 84, 88, 92, 96, ++ 100, 105, 110, 114, 118, 122, 127, 131, ++ 135, 140, 145, 149, 153, 157, 162, 166, ++ 170, 174, 178, 182, 186, 190, 195, 199, ++ 203, 207, 212, 216, 220, 224, 228, 233, ++ 238, 242, 246, 250, 255, 259, 264, 268, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, 272, 272, 272, 272, 272, 272, 272, ++ 272, ++ }, ++ }, ++}; ++ ++static const uchar_t u8_decomp_final_tbl[2][19370] = { ++ { ++ 0x20, 0x20, 0xCC, 0x88, 0x61, 0x20, 0xCC, 0x84, ++ 0x32, 0x33, 0x20, 0xCC, 0x81, 0xCE, 0xBC, 0x20, ++ 0xCC, 0xA7, 0x31, 0x6F, 0x31, 0xE2, 0x81, 0x84, ++ 0x34, 0x31, 0xE2, 0x81, 0x84, 0x32, 0x33, 0xE2, ++ 0x81, 0x84, 0x34, 0xF6, 0x41, 0xCC, 0x80, 0xF6, ++ 0x41, 0xCC, 0x81, 0xF6, 0x41, 0xCC, 0x82, 0xF6, ++ 0x41, 0xCC, 0x83, 0xF6, 0x41, 0xCC, 0x88, 0xF6, ++ 0x41, 0xCC, 0x8A, 0xF6, 0x43, 0xCC, 0xA7, 0xF6, ++ 0x45, 0xCC, 0x80, 0xF6, 0x45, 0xCC, 0x81, 0xF6, ++ 0x45, 0xCC, 0x82, 0xF6, 0x45, 0xCC, 0x88, 0xF6, ++ 0x49, 0xCC, 0x80, 0xF6, 0x49, 0xCC, 0x81, 0xF6, ++ 0x49, 0xCC, 0x82, 0xF6, 0x49, 0xCC, 0x88, 0xF6, ++ 0x4E, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x80, 0xF6, ++ 0x4F, 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xF6, ++ 0x4F, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x88, 0xF6, ++ 0x55, 0xCC, 0x80, 0xF6, 0x55, 0xCC, 0x81, 0xF6, ++ 0x55, 0xCC, 0x82, 0xF6, 0x55, 0xCC, 0x88, 0xF6, ++ 0x59, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x80, 0xF6, ++ 0x61, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x82, 0xF6, ++ 0x61, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x88, 0xF6, ++ 0x61, 0xCC, 0x8A, 0xF6, 0x63, 0xCC, 0xA7, 0xF6, ++ 0x65, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x81, 0xF6, ++ 0x65, 0xCC, 0x82, 0xF6, 0x65, 0xCC, 0x88, 0xF6, ++ 0x69, 0xCC, 0x80, 0xF6, 0x69, 0xCC, 0x81, 
0xF6, ++ 0x69, 0xCC, 0x82, 0xF6, 0x69, 0xCC, 0x88, 0xF6, ++ 0x6E, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x80, 0xF6, ++ 0x6F, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, 0xF6, ++ 0x6F, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x88, 0xF6, ++ 0x75, 0xCC, 0x80, 0xF6, 0x75, 0xCC, 0x81, 0xF6, ++ 0x75, 0xCC, 0x82, 0xF6, 0x75, 0xCC, 0x88, 0xF6, ++ 0x79, 0xCC, 0x81, 0xF6, 0x79, 0xCC, 0x88, 0xF6, ++ 0x41, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x84, 0xF6, ++ 0x41, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0x86, 0xF6, ++ 0x41, 0xCC, 0xA8, 0xF6, 0x61, 0xCC, 0xA8, 0xF6, ++ 0x43, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0x81, 0xF6, ++ 0x43, 0xCC, 0x82, 0xF6, 0x63, 0xCC, 0x82, 0xF6, ++ 0x43, 0xCC, 0x87, 0xF6, 0x63, 0xCC, 0x87, 0xF6, ++ 0x43, 0xCC, 0x8C, 0xF6, 0x63, 0xCC, 0x8C, 0xF6, ++ 0x44, 0xCC, 0x8C, 0xF6, 0x64, 0xCC, 0x8C, 0xF6, ++ 0x45, 0xCC, 0x84, 0xF6, 0x65, 0xCC, 0x84, 0xF6, ++ 0x45, 0xCC, 0x86, 0xF6, 0x65, 0xCC, 0x86, 0xF6, ++ 0x45, 0xCC, 0x87, 0xF6, 0x65, 0xCC, 0x87, 0xF6, ++ 0x45, 0xCC, 0xA8, 0xF6, 0x65, 0xCC, 0xA8, 0xF6, ++ 0x45, 0xCC, 0x8C, 0xF6, 0x65, 0xCC, 0x8C, 0xF6, ++ 0x47, 0xCC, 0x82, 0xF6, 0x67, 0xCC, 0x82, 0xF6, ++ 0x47, 0xCC, 0x86, 0xF6, 0x67, 0xCC, 0x86, 0xF6, ++ 0x47, 0xCC, 0x87, 0xF6, 0x67, 0xCC, 0x87, 0xF6, ++ 0x47, 0xCC, 0xA7, 0xF6, 0x67, 0xCC, 0xA7, 0xF6, ++ 0x48, 0xCC, 0x82, 0xF6, 0x68, 0xCC, 0x82, 0xF6, ++ 0x49, 0xCC, 0x83, 0xF6, 0x69, 0xCC, 0x83, 0xF6, ++ 0x49, 0xCC, 0x84, 0xF6, 0x69, 0xCC, 0x84, 0xF6, ++ 0x49, 0xCC, 0x86, 0xF6, 0x69, 0xCC, 0x86, 0xF6, ++ 0x49, 0xCC, 0xA8, 0xF6, 0x69, 0xCC, 0xA8, 0xF6, ++ 0x49, 0xCC, 0x87, 0x49, 0x4A, 0x69, 0x6A, 0xF6, ++ 0x4A, 0xCC, 0x82, 0xF6, 0x6A, 0xCC, 0x82, 0xF6, ++ 0x4B, 0xCC, 0xA7, 0xF6, 0x6B, 0xCC, 0xA7, 0xF6, ++ 0x4C, 0xCC, 0x81, 0xF6, 0x6C, 0xCC, 0x81, 0xF6, ++ 0x4C, 0xCC, 0xA7, 0xF6, 0x6C, 0xCC, 0xA7, 0xF6, ++ 0x4C, 0xCC, 0x8C, 0xF6, 0x6C, 0xCC, 0x8C, 0x4C, ++ 0xC2, 0xB7, 0x6C, 0xC2, 0xB7, 0xF6, 0x4E, 0xCC, ++ 0x81, 0xF6, 0x6E, 0xCC, 0x81, 0xF6, 0x4E, 0xCC, ++ 0xA7, 0xF6, 0x6E, 0xCC, 0xA7, 0xF6, 0x4E, 0xCC, ++ 0x8C, 0xF6, 0x6E, 0xCC, 0x8C, 0xCA, 0xBC, 0x6E, ++ 0xF6, 0x4F, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, 0x84, ++ 0xF6, 0x4F, 0xCC, 0x86, 0xF6, 0x6F, 0xCC, 0x86, ++ 0xF6, 0x4F, 0xCC, 0x8B, 0xF6, 0x6F, 0xCC, 0x8B, ++ 0xF6, 0x52, 0xCC, 0x81, 0xF6, 0x72, 0xCC, 0x81, ++ 0xF6, 0x52, 0xCC, 0xA7, 0xF6, 0x72, 0xCC, 0xA7, ++ 0xF6, 0x52, 0xCC, 0x8C, 0xF6, 0x72, 0xCC, 0x8C, ++ 0xF6, 0x53, 0xCC, 0x81, 0xF6, 0x73, 0xCC, 0x81, ++ 0xF6, 0x53, 0xCC, 0x82, 0xF6, 0x73, 0xCC, 0x82, ++ 0xF6, 0x53, 0xCC, 0xA7, 0xF6, 0x73, 0xCC, 0xA7, ++ 0xF6, 0x53, 0xCC, 0x8C, 0xF6, 0x73, 0xCC, 0x8C, ++ 0xF6, 0x54, 0xCC, 0xA7, 0xF6, 0x74, 0xCC, 0xA7, ++ 0xF6, 0x54, 0xCC, 0x8C, 0xF6, 0x74, 0xCC, 0x8C, ++ 0xF6, 0x55, 0xCC, 0x83, 0xF6, 0x75, 0xCC, 0x83, ++ 0xF6, 0x55, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x84, ++ 0xF6, 0x55, 0xCC, 0x86, 0xF6, 0x75, 0xCC, 0x86, ++ 0xF6, 0x55, 0xCC, 0x8A, 0xF6, 0x75, 0xCC, 0x8A, ++ 0xF6, 0x55, 0xCC, 0x8B, 0xF6, 0x75, 0xCC, 0x8B, ++ 0xF6, 0x55, 0xCC, 0xA8, 0xF6, 0x75, 0xCC, 0xA8, ++ 0xF6, 0x57, 0xCC, 0x82, 0xF6, 0x77, 0xCC, 0x82, ++ 0xF6, 0x59, 0xCC, 0x82, 0xF6, 0x79, 0xCC, 0x82, ++ 0xF6, 0x59, 0xCC, 0x88, 0xF6, 0x5A, 0xCC, 0x81, ++ 0xF6, 0x7A, 0xCC, 0x81, 0xF6, 0x5A, 0xCC, 0x87, ++ 0xF6, 0x7A, 0xCC, 0x87, 0xF6, 0x5A, 0xCC, 0x8C, ++ 0xF6, 0x7A, 0xCC, 0x8C, 0x73, 0xF6, 0x4F, 0xCC, ++ 0x9B, 0xF6, 0x6F, 0xCC, 0x9B, 0xF6, 0x55, 0xCC, ++ 0x9B, 0xF6, 0x75, 0xCC, 0x9B, 0x44, 0x5A, 0xCC, ++ 0x8C, 0x44, 0x7A, 0xCC, 0x8C, 0x64, 0x7A, 0xCC, ++ 0x8C, 0x4C, 0x4A, 0x4C, 0x6A, 0x6C, 0x6A, 0x4E, ++ 0x4A, 0x4E, 0x6A, 0x6E, 0x6A, 0xF6, 0x41, 0xCC, ++ 0x8C, 0xF6, 0x61, 0xCC, 0x8C, 0xF6, 0x49, 0xCC, ++ 0x8C, 0xF6, 0x69, 0xCC, 
0x8C, 0xF6, 0x4F, 0xCC, ++ 0x8C, 0xF6, 0x6F, 0xCC, 0x8C, 0xF6, 0x55, 0xCC, ++ 0x8C, 0xF6, 0x75, 0xCC, 0x8C, 0xF6, 0x55, 0xCC, ++ 0x88, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x88, 0xCC, ++ 0x84, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x81, 0xF6, ++ 0x75, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x55, 0xCC, ++ 0x88, 0xCC, 0x8C, 0xF6, 0x75, 0xCC, 0x88, 0xCC, ++ 0x8C, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x80, 0xF6, ++ 0x75, 0xCC, 0x88, 0xCC, 0x80, 0xF6, 0x41, 0xCC, ++ 0x88, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x88, 0xCC, ++ 0x84, 0xF6, 0x41, 0xCC, 0x87, 0xCC, 0x84, 0xF6, ++ 0x61, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0xC3, 0x86, ++ 0xCC, 0x84, 0xF6, 0xC3, 0xA6, 0xCC, 0x84, 0xF6, ++ 0x47, 0xCC, 0x8C, 0xF6, 0x67, 0xCC, 0x8C, 0xF6, ++ 0x4B, 0xCC, 0x8C, 0xF6, 0x6B, 0xCC, 0x8C, 0xF6, ++ 0x4F, 0xCC, 0xA8, 0xF6, 0x6F, 0xCC, 0xA8, 0xF6, ++ 0x4F, 0xCC, 0xA8, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, ++ 0xA8, 0xCC, 0x84, 0xF6, 0xC6, 0xB7, 0xCC, 0x8C, ++ 0xF6, 0xCA, 0x92, 0xCC, 0x8C, 0xF6, 0x6A, 0xCC, ++ 0x8C, 0x44, 0x5A, 0x44, 0x7A, 0x64, 0x7A, 0xF6, ++ 0x47, 0xCC, 0x81, 0xF6, 0x67, 0xCC, 0x81, 0xF6, ++ 0x4E, 0xCC, 0x80, 0xF6, 0x6E, 0xCC, 0x80, 0xF6, ++ 0x41, 0xCC, 0x8A, 0xCC, 0x81, 0xF6, 0x61, 0xCC, ++ 0x8A, 0xCC, 0x81, 0xF6, 0xC3, 0x86, 0xCC, 0x81, ++ 0xF6, 0xC3, 0xA6, 0xCC, 0x81, 0xF6, 0xC3, 0x98, ++ 0xCC, 0x81, 0xF6, 0xC3, 0xB8, 0xCC, 0x81, 0xF6, ++ 0x41, 0xCC, 0x8F, 0xF6, 0x61, 0xCC, 0x8F, 0xF6, ++ 0x41, 0xCC, 0x91, 0xF6, 0x61, 0xCC, 0x91, 0xF6, ++ 0x45, 0xCC, 0x8F, 0xF6, 0x65, 0xCC, 0x8F, 0xF6, ++ 0x45, 0xCC, 0x91, 0xF6, 0x65, 0xCC, 0x91, 0xF6, ++ 0x49, 0xCC, 0x8F, 0xF6, 0x69, 0xCC, 0x8F, 0xF6, ++ 0x49, 0xCC, 0x91, 0xF6, 0x69, 0xCC, 0x91, 0xF6, ++ 0x4F, 0xCC, 0x8F, 0xF6, 0x6F, 0xCC, 0x8F, 0xF6, ++ 0x4F, 0xCC, 0x91, 0xF6, 0x6F, 0xCC, 0x91, 0xF6, ++ 0x52, 0xCC, 0x8F, 0xF6, 0x72, 0xCC, 0x8F, 0xF6, ++ 0x52, 0xCC, 0x91, 0xF6, 0x72, 0xCC, 0x91, 0xF6, ++ 0x55, 0xCC, 0x8F, 0xF6, 0x75, 0xCC, 0x8F, 0xF6, ++ 0x55, 0xCC, 0x91, 0xF6, 0x75, 0xCC, 0x91, 0xF6, ++ 0x53, 0xCC, 0xA6, 0xF6, 0x73, 0xCC, 0xA6, 0xF6, ++ 0x54, 0xCC, 0xA6, 0xF6, 0x74, 0xCC, 0xA6, 0xF6, ++ 0x48, 0xCC, 0x8C, 0xF6, 0x68, 0xCC, 0x8C, 0xF6, ++ 0x41, 0xCC, 0x87, 0xF6, 0x61, 0xCC, 0x87, 0xF6, ++ 0x45, 0xCC, 0xA7, 0xF6, 0x65, 0xCC, 0xA7, 0xF6, ++ 0x4F, 0xCC, 0x88, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, ++ 0x88, 0xCC, 0x84, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, ++ 0x84, 0xF6, 0x6F, 0xCC, 0x83, 0xCC, 0x84, 0xF6, ++ 0x4F, 0xCC, 0x87, 0xF6, 0x6F, 0xCC, 0x87, 0xF6, ++ 0x4F, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, ++ 0x87, 0xCC, 0x84, 0xF6, 0x59, 0xCC, 0x84, 0xF6, ++ 0x79, 0xCC, 0x84, 0x68, 0xC9, 0xA6, 0x6A, 0x72, ++ 0xC9, 0xB9, 0xC9, 0xBB, 0xCA, 0x81, 0x77, 0x79, ++ 0x20, 0xCC, 0x86, 0x20, 0xCC, 0x87, 0x20, 0xCC, ++ 0x8A, 0x20, 0xCC, 0xA8, 0x20, 0xCC, 0x83, 0x20, ++ 0xCC, 0x8B, 0xC9, 0xA3, 0x6C, 0x73, 0x78, 0xCA, ++ 0x95, 0xF6, 0xCC, 0x80, 0xF6, 0xCC, 0x81, 0xF6, ++ 0xCC, 0x93, 0xF6, 0xCC, 0x88, 0xCC, 0x81, 0xF6, ++ 0xCA, 0xB9, 0x20, 0xCD, 0x85, 0xF6, 0x3B, 0x20, ++ 0xCC, 0x81, 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, ++ 0x20, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0xCE, 0x91, ++ 0xCC, 0x81, 0xF6, 0xC2, 0xB7, 0xF6, 0xCE, 0x95, ++ 0xCC, 0x81, 0xF6, 0xCE, 0x97, 0xCC, 0x81, 0xF6, ++ 0xCE, 0x99, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, ++ 0x81, 0xF6, 0xCE, 0xA5, 0xCC, 0x81, 0xF6, 0xCE, ++ 0xA9, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x88, ++ 0xCC, 0x81, 0xF6, 0xCE, 0x99, 0xCC, 0x88, 0xF6, ++ 0xCE, 0xA5, 0xCC, 0x88, 0xF6, 0xCE, 0xB1, 0xCC, ++ 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x81, 0xF6, 0xCE, ++ 0xB7, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x81, ++ 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6, ++ 0xCE, 0xB9, 0xCC, 0x88, 0xF6, 0xCF, 0x85, 0xCC, ++ 0x88, 
0xF6, 0xCE, 0xBF, 0xCC, 0x81, 0xF6, 0xCF, ++ 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, 0x81, ++ 0xCE, 0xB2, 0xCE, 0xB8, 0xCE, 0xA5, 0xF5, 0x05, ++ 0xCF, 0x92, 0xCC, 0x81, 0xCE, 0xA5, 0xCC, 0x81, ++ 0xF5, 0x05, 0xCF, 0x92, 0xCC, 0x88, 0xCE, 0xA5, ++ 0xCC, 0x88, 0xCF, 0x86, 0xCF, 0x80, 0xCE, 0xBA, ++ 0xCF, 0x81, 0xCF, 0x82, 0xCE, 0x98, 0xCE, 0xB5, ++ 0xF6, 0xD0, 0x95, 0xCC, 0x80, 0xF6, 0xD0, 0x95, ++ 0xCC, 0x88, 0xF6, 0xD0, 0x93, 0xCC, 0x81, 0xF6, ++ 0xD0, 0x86, 0xCC, 0x88, 0xF6, 0xD0, 0x9A, 0xCC, ++ 0x81, 0xF6, 0xD0, 0x98, 0xCC, 0x80, 0xF6, 0xD0, ++ 0xA3, 0xCC, 0x86, 0xF6, 0xD0, 0x98, 0xCC, 0x86, ++ 0xF6, 0xD0, 0xB8, 0xCC, 0x86, 0xF6, 0xD0, 0xB5, ++ 0xCC, 0x80, 0xF6, 0xD0, 0xB5, 0xCC, 0x88, 0xF6, ++ 0xD0, 0xB3, 0xCC, 0x81, 0xF6, 0xD1, 0x96, 0xCC, ++ 0x88, 0xF6, 0xD0, 0xBA, 0xCC, 0x81, 0xF6, 0xD0, ++ 0xB8, 0xCC, 0x80, 0xF6, 0xD1, 0x83, 0xCC, 0x86, ++ 0xF6, 0xD1, 0xB4, 0xCC, 0x8F, 0xF6, 0xD1, 0xB5, ++ 0xCC, 0x8F, 0xF6, 0xD0, 0x96, 0xCC, 0x86, 0xF6, ++ 0xD0, 0xB6, 0xCC, 0x86, 0xF6, 0xD0, 0x90, 0xCC, ++ 0x86, 0xF6, 0xD0, 0xB0, 0xCC, 0x86, 0xF6, 0xD0, ++ 0x90, 0xCC, 0x88, 0xF6, 0xD0, 0xB0, 0xCC, 0x88, ++ 0xF6, 0xD0, 0x95, 0xCC, 0x86, 0xF6, 0xD0, 0xB5, ++ 0xCC, 0x86, 0xF6, 0xD3, 0x98, 0xCC, 0x88, 0xF6, ++ 0xD3, 0x99, 0xCC, 0x88, 0xF6, 0xD0, 0x96, 0xCC, ++ 0x88, 0xF6, 0xD0, 0xB6, 0xCC, 0x88, 0xF6, 0xD0, ++ 0x97, 0xCC, 0x88, 0xF6, 0xD0, 0xB7, 0xCC, 0x88, ++ 0xF6, 0xD0, 0x98, 0xCC, 0x84, 0xF6, 0xD0, 0xB8, ++ 0xCC, 0x84, 0xF6, 0xD0, 0x98, 0xCC, 0x88, 0xF6, ++ 0xD0, 0xB8, 0xCC, 0x88, 0xF6, 0xD0, 0x9E, 0xCC, ++ 0x88, 0xF6, 0xD0, 0xBE, 0xCC, 0x88, 0xF6, 0xD3, ++ 0xA8, 0xCC, 0x88, 0xF6, 0xD3, 0xA9, 0xCC, 0x88, ++ 0xF6, 0xD0, 0xAD, 0xCC, 0x88, 0xF6, 0xD1, 0x8D, ++ 0xCC, 0x88, 0xF6, 0xD0, 0xA3, 0xCC, 0x84, 0xF6, ++ 0xD1, 0x83, 0xCC, 0x84, 0xF6, 0xD0, 0xA3, 0xCC, ++ 0x88, 0xF6, 0xD1, 0x83, 0xCC, 0x88, 0xF6, 0xD0, ++ 0xA3, 0xCC, 0x8B, 0xF6, 0xD1, 0x83, 0xCC, 0x8B, ++ 0xF6, 0xD0, 0xA7, 0xCC, 0x88, 0xF6, 0xD1, 0x87, ++ 0xCC, 0x88, 0xF6, 0xD0, 0xAB, 0xCC, 0x88, 0xF6, ++ 0xD1, 0x8B, 0xCC, 0x88, 0xD5, 0xA5, 0xD6, 0x82, ++ 0xF6, 0xD8, 0xA7, 0xD9, 0x93, 0xF6, 0xD8, 0xA7, ++ 0xD9, 0x94, 0xF6, 0xD9, 0x88, 0xD9, 0x94, 0xF6, ++ 0xD8, 0xA7, 0xD9, 0x95, 0xF6, 0xD9, 0x8A, 0xD9, ++ 0x94, 0xD8, 0xA7, 0xD9, 0xB4, 0xD9, 0x88, 0xD9, ++ 0xB4, 0xDB, 0x87, 0xD9, 0xB4, 0xD9, 0x8A, 0xD9, ++ 0xB4, 0xF6, 0xDB, 0x95, 0xD9, 0x94, 0xF6, 0xDB, ++ 0x81, 0xD9, 0x94, 0xF6, 0xDB, 0x92, 0xD9, 0x94, ++ 0xF6, 0xE0, 0xA4, 0xA8, 0xE0, 0xA4, 0xBC, 0xF6, ++ 0xE0, 0xA4, 0xB0, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, ++ 0xA4, 0xB3, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, ++ 0x95, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x96, ++ 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x97, 0xE0, ++ 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x9C, 0xE0, 0xA4, ++ 0xBC, 0xF6, 0xE0, 0xA4, 0xA1, 0xE0, 0xA4, 0xBC, ++ 0xF6, 0xE0, 0xA4, 0xA2, 0xE0, 0xA4, 0xBC, 0xF6, ++ 0xE0, 0xA4, 0xAB, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, ++ 0xA4, 0xAF, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA7, ++ 0x87, 0xE0, 0xA6, 0xBE, 0xF6, 0xE0, 0xA7, 0x87, ++ 0xE0, 0xA7, 0x97, 0xF6, 0xE0, 0xA6, 0xA1, 0xE0, ++ 0xA6, 0xBC, 0xF6, 0xE0, 0xA6, 0xA2, 0xE0, 0xA6, ++ 0xBC, 0xF6, 0xE0, 0xA6, 0xAF, 0xE0, 0xA6, 0xBC, ++ 0xF6, 0xE0, 0xA8, 0xB2, 0xE0, 0xA8, 0xBC, 0xF6, ++ 0xE0, 0xA8, 0xB8, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, ++ 0xA8, 0x96, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8, ++ 0x97, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8, 0x9C, ++ 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8, 0xAB, 0xE0, ++ 0xA8, 0xBC, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAD, ++ 0x96, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAC, 0xBE, ++ 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAD, 0x97, 0xF6, ++ 0xE0, 0xAC, 0xA1, 0xE0, 0xAC, 0xBC, 0xF6, 
0xE0, ++ 0xAC, 0xA2, 0xE0, 0xAC, 0xBC, 0xF6, 0xE0, 0xAE, ++ 0x92, 0xE0, 0xAF, 0x97, 0xF6, 0xE0, 0xAF, 0x86, ++ 0xE0, 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, 0x87, 0xE0, ++ 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, 0x86, 0xE0, 0xAF, ++ 0x97, 0xF6, 0xE0, 0xB1, 0x86, 0xE0, 0xB1, 0x96, ++ 0xF6, 0xE0, 0xB2, 0xBF, 0xE0, 0xB3, 0x95, 0xF6, ++ 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x95, 0xF6, 0xE0, ++ 0xB3, 0x86, 0xE0, 0xB3, 0x96, 0xF6, 0xE0, 0xB3, ++ 0x86, 0xE0, 0xB3, 0x82, 0xF6, 0xE0, 0xB3, 0x86, ++ 0xE0, 0xB3, 0x82, 0xE0, 0xB3, 0x95, 0xF6, 0xE0, ++ 0xB5, 0x86, 0xE0, 0xB4, 0xBE, 0xF6, 0xE0, 0xB5, ++ 0x87, 0xE0, 0xB4, 0xBE, 0xF6, 0xE0, 0xB5, 0x86, ++ 0xE0, 0xB5, 0x97, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, ++ 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, 0xB7, ++ 0x8F, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, 0xB7, 0x8F, ++ 0xE0, 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, ++ 0xB7, 0x9F, 0xE0, 0xB9, 0x8D, 0xE0, 0xB8, 0xB2, ++ 0xE0, 0xBB, 0x8D, 0xE0, 0xBA, 0xB2, 0xE0, 0xBA, ++ 0xAB, 0xE0, 0xBA, 0x99, 0xE0, 0xBA, 0xAB, 0xE0, ++ 0xBA, 0xA1, 0xE0, 0xBC, 0x8B, 0xF6, 0xE0, 0xBD, ++ 0x82, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x8C, ++ 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x91, 0xE0, ++ 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x96, 0xE0, 0xBE, ++ 0xB7, 0xF6, 0xE0, 0xBD, 0x9B, 0xE0, 0xBE, 0xB7, ++ 0xF6, 0xE0, 0xBD, 0x80, 0xE0, 0xBE, 0xB5, 0xF6, ++ 0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB2, 0xF6, 0xE0, ++ 0xBD, 0xB1, 0xE0, 0xBD, 0xB4, 0xF6, 0xE0, 0xBE, ++ 0xB2, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, 0xB2, 0xE0, ++ 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, 0xBE, ++ 0xB3, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, 0xB3, 0xE0, ++ 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, 0xBD, ++ 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, 0xBE, 0x92, ++ 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0x9C, 0xE0, ++ 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xA1, 0xE0, 0xBE, ++ 0xB7, 0xF6, 0xE0, 0xBE, 0xA6, 0xE0, 0xBE, 0xB7, ++ 0xF6, 0xE0, 0xBE, 0xAB, 0xE0, 0xBE, 0xB7, 0xF6, ++ 0xE0, 0xBE, 0x90, 0xE0, 0xBE, 0xB5, 0xF6, 0xE1, ++ 0x80, 0xA5, 0xE1, 0x80, 0xAE, 0xF6, 0x41, 0xCC, ++ 0xA5, 0xF6, 0x61, 0xCC, 0xA5, 0xF6, 0x42, 0xCC, ++ 0x87, 0xF6, 0x62, 0xCC, 0x87, 0xF6, 0x42, 0xCC, ++ 0xA3, 0xF6, 0x62, 0xCC, 0xA3, 0xF6, 0x42, 0xCC, ++ 0xB1, 0xF6, 0x62, 0xCC, 0xB1, 0xF6, 0x43, 0xCC, ++ 0xA7, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0xA7, 0xCC, ++ 0x81, 0xF6, 0x44, 0xCC, 0x87, 0xF6, 0x64, 0xCC, ++ 0x87, 0xF6, 0x44, 0xCC, 0xA3, 0xF6, 0x64, 0xCC, ++ 0xA3, 0xF6, 0x44, 0xCC, 0xB1, 0xF6, 0x64, 0xCC, ++ 0xB1, 0xF6, 0x44, 0xCC, 0xA7, 0xF6, 0x64, 0xCC, ++ 0xA7, 0xF6, 0x44, 0xCC, 0xAD, 0xF6, 0x64, 0xCC, ++ 0xAD, 0xF6, 0x45, 0xCC, 0x84, 0xCC, 0x80, 0xF6, ++ 0x65, 0xCC, 0x84, 0xCC, 0x80, 0xF6, 0x45, 0xCC, ++ 0x84, 0xCC, 0x81, 0xF6, 0x65, 0xCC, 0x84, 0xCC, ++ 0x81, 0xF6, 0x45, 0xCC, 0xAD, 0xF6, 0x65, 0xCC, ++ 0xAD, 0xF6, 0x45, 0xCC, 0xB0, 0xF6, 0x65, 0xCC, ++ 0xB0, 0xF6, 0x45, 0xCC, 0xA7, 0xCC, 0x86, 0xF6, ++ 0x65, 0xCC, 0xA7, 0xCC, 0x86, 0xF6, 0x46, 0xCC, ++ 0x87, 0xF6, 0x66, 0xCC, 0x87, 0xF6, 0x47, 0xCC, ++ 0x84, 0xF6, 0x67, 0xCC, 0x84, 0xF6, 0x48, 0xCC, ++ 0x87, 0xF6, 0x68, 0xCC, 0x87, 0xF6, 0x48, 0xCC, ++ 0xA3, 0xF6, 0x68, 0xCC, 0xA3, 0xF6, 0x48, 0xCC, ++ 0x88, 0xF6, 0x68, 0xCC, 0x88, 0xF6, 0x48, 0xCC, ++ 0xA7, 0xF6, 0x68, 0xCC, 0xA7, 0xF6, 0x48, 0xCC, ++ 0xAE, 0xF6, 0x68, 0xCC, 0xAE, 0xF6, 0x49, 0xCC, ++ 0xB0, 0xF6, 0x69, 0xCC, 0xB0, 0xF6, 0x49, 0xCC, ++ 0x88, 0xCC, 0x81, 0xF6, 0x69, 0xCC, 0x88, 0xCC, ++ 0x81, 0xF6, 0x4B, 0xCC, 0x81, 0xF6, 0x6B, 0xCC, ++ 0x81, 0xF6, 0x4B, 0xCC, 0xA3, 0xF6, 0x6B, 0xCC, ++ 0xA3, 0xF6, 0x4B, 0xCC, 0xB1, 0xF6, 0x6B, 0xCC, ++ 0xB1, 0xF6, 0x4C, 0xCC, 0xA3, 0xF6, 0x6C, 0xCC, ++ 0xA3, 0xF6, 0x4C, 0xCC, 0xA3, 0xCC, 0x84, 0xF6, ++ 0x6C, 0xCC, 0xA3, 0xCC, 
0x84, 0xF6, 0x4C, 0xCC, ++ 0xB1, 0xF6, 0x6C, 0xCC, 0xB1, 0xF6, 0x4C, 0xCC, ++ 0xAD, 0xF6, 0x6C, 0xCC, 0xAD, 0xF6, 0x4D, 0xCC, ++ 0x81, 0xF6, 0x6D, 0xCC, 0x81, 0xF6, 0x4D, 0xCC, ++ 0x87, 0xF6, 0x6D, 0xCC, 0x87, 0xF6, 0x4D, 0xCC, ++ 0xA3, 0xF6, 0x6D, 0xCC, 0xA3, 0xF6, 0x4E, 0xCC, ++ 0x87, 0xF6, 0x6E, 0xCC, 0x87, 0xF6, 0x4E, 0xCC, ++ 0xA3, 0xF6, 0x6E, 0xCC, 0xA3, 0xF6, 0x4E, 0xCC, ++ 0xB1, 0xF6, 0x6E, 0xCC, 0xB1, 0xF6, 0x4E, 0xCC, ++ 0xAD, 0xF6, 0x6E, 0xCC, 0xAD, 0xF6, 0x4F, 0xCC, ++ 0x83, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x83, 0xCC, ++ 0x81, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, 0x88, 0xF6, ++ 0x6F, 0xCC, 0x83, 0xCC, 0x88, 0xF6, 0x4F, 0xCC, ++ 0x84, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x84, 0xCC, ++ 0x80, 0xF6, 0x4F, 0xCC, 0x84, 0xCC, 0x81, 0xF6, ++ 0x6F, 0xCC, 0x84, 0xCC, 0x81, 0xF6, 0x50, 0xCC, ++ 0x81, 0xF6, 0x70, 0xCC, 0x81, 0xF6, 0x50, 0xCC, ++ 0x87, 0xF6, 0x70, 0xCC, 0x87, 0xF6, 0x52, 0xCC, ++ 0x87, 0xF6, 0x72, 0xCC, 0x87, 0xF6, 0x52, 0xCC, ++ 0xA3, 0xF6, 0x72, 0xCC, 0xA3, 0xF6, 0x52, 0xCC, ++ 0xA3, 0xCC, 0x84, 0xF6, 0x72, 0xCC, 0xA3, 0xCC, ++ 0x84, 0xF6, 0x52, 0xCC, 0xB1, 0xF6, 0x72, 0xCC, ++ 0xB1, 0xF6, 0x53, 0xCC, 0x87, 0xF6, 0x73, 0xCC, ++ 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xF6, 0x73, 0xCC, ++ 0xA3, 0xF6, 0x53, 0xCC, 0x81, 0xCC, 0x87, 0xF6, ++ 0x73, 0xCC, 0x81, 0xCC, 0x87, 0xF6, 0x53, 0xCC, ++ 0x8C, 0xCC, 0x87, 0xF6, 0x73, 0xCC, 0x8C, 0xCC, ++ 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xCC, 0x87, 0xF6, ++ 0x73, 0xCC, 0xA3, 0xCC, 0x87, 0xF6, 0x54, 0xCC, ++ 0x87, 0xF6, 0x74, 0xCC, 0x87, 0xF6, 0x54, 0xCC, ++ 0xA3, 0xF6, 0x74, 0xCC, 0xA3, 0xF6, 0x54, 0xCC, ++ 0xB1, 0xF6, 0x74, 0xCC, 0xB1, 0xF6, 0x54, 0xCC, ++ 0xAD, 0xF6, 0x74, 0xCC, 0xAD, 0xF6, 0x55, 0xCC, ++ 0xA4, 0xF6, 0x75, 0xCC, 0xA4, 0xF6, 0x55, 0xCC, ++ 0xB0, 0xF6, 0x75, 0xCC, 0xB0, 0xF6, 0x55, 0xCC, ++ 0xAD, 0xF6, 0x75, 0xCC, 0xAD, 0xF6, 0x55, 0xCC, ++ 0x83, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x83, 0xCC, ++ 0x81, 0xF6, 0x55, 0xCC, 0x84, 0xCC, 0x88, 0xF6, ++ 0x75, 0xCC, 0x84, 0xCC, 0x88, 0xF6, 0x56, 0xCC, ++ 0x83, 0xF6, 0x76, 0xCC, 0x83, 0xF6, 0x56, 0xCC, ++ 0xA3, 0xF6, 0x76, 0xCC, 0xA3, 0xF6, 0x57, 0xCC, ++ 0x80, 0xF6, 0x77, 0xCC, 0x80, 0xF6, 0x57, 0xCC, ++ 0x81, 0xF6, 0x77, 0xCC, 0x81, 0xF6, 0x57, 0xCC, ++ 0x88, 0xF6, 0x77, 0xCC, 0x88, 0xF6, 0x57, 0xCC, ++ 0x87, 0xF6, 0x77, 0xCC, 0x87, 0xF6, 0x57, 0xCC, ++ 0xA3, 0xF6, 0x77, 0xCC, 0xA3, 0xF6, 0x58, 0xCC, ++ 0x87, 0xF6, 0x78, 0xCC, 0x87, 0xF6, 0x58, 0xCC, ++ 0x88, 0xF6, 0x78, 0xCC, 0x88, 0xF6, 0x59, 0xCC, ++ 0x87, 0xF6, 0x79, 0xCC, 0x87, 0xF6, 0x5A, 0xCC, ++ 0x82, 0xF6, 0x7A, 0xCC, 0x82, 0xF6, 0x5A, 0xCC, ++ 0xA3, 0xF6, 0x7A, 0xCC, 0xA3, 0xF6, 0x5A, 0xCC, ++ 0xB1, 0xF6, 0x7A, 0xCC, 0xB1, 0xF6, 0x68, 0xCC, ++ 0xB1, 0xF6, 0x74, 0xCC, 0x88, 0xF6, 0x77, 0xCC, ++ 0x8A, 0xF6, 0x79, 0xCC, 0x8A, 0x61, 0xCA, 0xBE, ++ 0xF5, 0x05, 0xC5, 0xBF, 0xCC, 0x87, 0x73, 0xCC, ++ 0x87, 0xF6, 0x41, 0xCC, 0xA3, 0xF6, 0x61, 0xCC, ++ 0xA3, 0xF6, 0x41, 0xCC, 0x89, 0xF6, 0x61, 0xCC, ++ 0x89, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x81, 0xF6, ++ 0x61, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x41, 0xCC, ++ 0x82, 0xCC, 0x80, 0xF6, 0x61, 0xCC, 0x82, 0xCC, ++ 0x80, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x89, 0xF6, ++ 0x61, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x41, 0xCC, ++ 0x82, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x82, 0xCC, ++ 0x83, 0xF6, 0x41, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, ++ 0x61, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x41, 0xCC, ++ 0x86, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x86, 0xCC, ++ 0x81, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x80, 0xF6, ++ 0x61, 0xCC, 0x86, 0xCC, 0x80, 0xF6, 0x41, 0xCC, ++ 0x86, 0xCC, 0x89, 0xF6, 0x61, 0xCC, 0x86, 0xCC, ++ 0x89, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x83, 0xF6, ++ 0x61, 
0xCC, 0x86, 0xCC, 0x83, 0xF6, 0x41, 0xCC, ++ 0xA3, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0xA3, 0xCC, ++ 0x86, 0xF6, 0x45, 0xCC, 0xA3, 0xF6, 0x65, 0xCC, ++ 0xA3, 0xF6, 0x45, 0xCC, 0x89, 0xF6, 0x65, 0xCC, ++ 0x89, 0xF6, 0x45, 0xCC, 0x83, 0xF6, 0x65, 0xCC, ++ 0x83, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x81, 0xF6, ++ 0x65, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x45, 0xCC, ++ 0x82, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x82, 0xCC, ++ 0x80, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x89, 0xF6, ++ 0x65, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x45, 0xCC, ++ 0x82, 0xCC, 0x83, 0xF6, 0x65, 0xCC, 0x82, 0xCC, ++ 0x83, 0xF6, 0x45, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, ++ 0x65, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x49, 0xCC, ++ 0x89, 0xF6, 0x69, 0xCC, 0x89, 0xF6, 0x49, 0xCC, ++ 0xA3, 0xF6, 0x69, 0xCC, 0xA3, 0xF6, 0x4F, 0xCC, ++ 0xA3, 0xF6, 0x6F, 0xCC, 0xA3, 0xF6, 0x4F, 0xCC, ++ 0x89, 0xF6, 0x6F, 0xCC, 0x89, 0xF6, 0x4F, 0xCC, ++ 0x82, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, 0xCC, ++ 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x80, 0xF6, ++ 0x6F, 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x4F, 0xCC, ++ 0x82, 0xCC, 0x89, 0xF6, 0x6F, 0xCC, 0x82, 0xCC, ++ 0x89, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x83, 0xF6, ++ 0x6F, 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, ++ 0xA3, 0xCC, 0x82, 0xF6, 0x6F, 0xCC, 0xA3, 0xCC, ++ 0x82, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x81, 0xF6, ++ 0x6F, 0xCC, 0x9B, 0xCC, 0x81, 0xF6, 0x4F, 0xCC, ++ 0x9B, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, ++ 0x80, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x89, 0xF6, ++ 0x6F, 0xCC, 0x9B, 0xCC, 0x89, 0xF6, 0x4F, 0xCC, ++ 0x9B, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, ++ 0x83, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, ++ 0x6F, 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, 0x55, 0xCC, ++ 0xA3, 0xF6, 0x75, 0xCC, 0xA3, 0xF6, 0x55, 0xCC, ++ 0x89, 0xF6, 0x75, 0xCC, 0x89, 0xF6, 0x55, 0xCC, ++ 0x9B, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x9B, 0xCC, ++ 0x81, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x80, 0xF6, ++ 0x75, 0xCC, 0x9B, 0xCC, 0x80, 0xF6, 0x55, 0xCC, ++ 0x9B, 0xCC, 0x89, 0xF6, 0x75, 0xCC, 0x9B, 0xCC, ++ 0x89, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x83, 0xF6, ++ 0x75, 0xCC, 0x9B, 0xCC, 0x83, 0xF6, 0x55, 0xCC, ++ 0x9B, 0xCC, 0xA3, 0xF6, 0x75, 0xCC, 0x9B, 0xCC, ++ 0xA3, 0xF6, 0x59, 0xCC, 0x80, 0xF6, 0x79, 0xCC, ++ 0x80, 0xF6, 0x59, 0xCC, 0xA3, 0xF6, 0x79, 0xCC, ++ 0xA3, 0xF6, 0x59, 0xCC, 0x89, 0xF6, 0x79, 0xCC, ++ 0x89, 0xF6, 0x59, 0xCC, 0x83, 0xF6, 0x79, 0xCC, ++ 0x83, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xF6, 0xCE, ++ 0xB1, 0xCC, 0x94, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, ++ 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCC, ++ 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCC, 0x81, ++ 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCC, 0x81, 0xF6, ++ 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCE, ++ 0xB1, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0x91, ++ 0xCC, 0x93, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xF6, ++ 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, ++ 0x91, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0x91, ++ 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCC, ++ 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCC, 0x93, ++ 0xCD, 0x82, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCD, ++ 0x82, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xF6, 0xCE, ++ 0xB5, 0xCC, 0x94, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, ++ 0xCC, 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x94, 0xCC, ++ 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xCC, 0x81, ++ 0xF6, 0xCE, 0xB5, 0xCC, 0x94, 0xCC, 0x81, 0xF6, ++ 0xCE, 0x95, 0xCC, 0x93, 0xF6, 0xCE, 0x95, 0xCC, ++ 0x94, 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xCC, 0x80, ++ 0xF6, 0xCE, 0x95, 0xCC, 0x94, 0xCC, 0x80, 0xF6, ++ 0xCE, 0x95, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, ++ 0x95, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, ++ 0xCC, 0x93, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xF6, ++ 0xCE, 0xB7, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 
0xCE, ++ 0xB7, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xB7, ++ 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, ++ 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, 0x93, ++ 0xCD, 0x82, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xCD, ++ 0x82, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xF6, 0xCE, ++ 0x97, 0xCC, 0x94, 0xF6, 0xCE, 0x97, 0xCC, 0x93, ++ 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, ++ 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x81, ++ 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x81, 0xF6, ++ 0xCE, 0x97, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCE, ++ 0x97, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xB9, ++ 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, 0xF6, ++ 0xCE, 0xB9, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, ++ 0xB9, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, ++ 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, ++ 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x93, ++ 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, 0xCD, ++ 0x82, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xF6, 0xCE, ++ 0x99, 0xCC, 0x94, 0xF6, 0xCE, 0x99, 0xCC, 0x93, ++ 0xCC, 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x94, 0xCC, ++ 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xCC, 0x81, ++ 0xF6, 0xCE, 0x99, 0xCC, 0x94, 0xCC, 0x81, 0xF6, ++ 0xCE, 0x99, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCE, ++ 0x99, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xBF, ++ 0xCC, 0x93, 0xF6, 0xCE, 0xBF, 0xCC, 0x94, 0xF6, ++ 0xCE, 0xBF, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, ++ 0xBF, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xBF, ++ 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xBF, 0xCC, ++ 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, 0x93, ++ 0xF6, 0xCE, 0x9F, 0xCC, 0x94, 0xF6, 0xCE, 0x9F, ++ 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, ++ 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, 0x93, ++ 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, 0x94, 0xCC, ++ 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xF6, 0xCF, ++ 0x85, 0xCC, 0x94, 0xF6, 0xCF, 0x85, 0xCC, 0x93, ++ 0xCC, 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x94, 0xCC, ++ 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xCC, 0x81, ++ 0xF6, 0xCF, 0x85, 0xCC, 0x94, 0xCC, 0x81, 0xF6, ++ 0xCF, 0x85, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCF, ++ 0x85, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xA5, ++ 0xCC, 0x94, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCC, ++ 0x80, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCC, 0x81, ++ 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCD, 0x82, 0xF6, ++ 0xCF, 0x89, 0xCC, 0x93, 0xF6, 0xCF, 0x89, 0xCC, ++ 0x94, 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCC, 0x80, ++ 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x80, 0xF6, ++ 0xCF, 0x89, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCF, ++ 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCF, 0x89, ++ 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCF, 0x89, 0xCC, ++ 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, ++ 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xF6, 0xCE, 0xA9, ++ 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, ++ 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, ++ 0xCC, 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, ++ 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, 0x82, ++ 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x82, 0xF6, ++ 0xCE, 0xB1, 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, ++ 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x80, 0xF6, 0xCE, ++ 0xB5, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, 0x80, ++ 0xF6, 0xCE, 0xB7, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, ++ 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, 0x81, 0xF6, ++ 0xCE, 0xBF, 0xCC, 0x80, 0xF6, 0xCE, 0xBF, 0xCC, ++ 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x80, 0xF6, 0xCF, ++ 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, 0x80, ++ 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xF6, 0xCE, 0xB1, ++ 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, ++ 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, ++ 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, ++ 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, ++ 0xCC, 0x93, 0xCC, 0x81, 
0xCD, 0x85, 0xF6, 0xCE, ++ 0xB1, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, 0xF6, ++ 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xCD, 0x85, ++ 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCD, 0x82, 0xCD, ++ 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCD, 0x85, ++ 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCD, 0x85, 0xF6, ++ 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xCD, 0x85, ++ 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCC, 0x80, 0xCD, ++ 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x81, ++ 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCC, ++ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, ++ 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, ++ 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, ++ 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, ++ 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x93, ++ 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, ++ 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, ++ 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, ++ 0xB7, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, 0xF6, ++ 0xCE, 0xB7, 0xCC, 0x93, 0xCD, 0x82, 0xCD, 0x85, ++ 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xCD, 0x82, 0xCD, ++ 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCD, 0x85, ++ 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCD, 0x85, 0xF6, ++ 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x80, 0xCD, 0x85, ++ 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x80, 0xCD, ++ 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x81, ++ 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, ++ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, ++ 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, ++ 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCF, 0x89, ++ 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, ++ 0x94, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x93, ++ 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, ++ 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, ++ 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCF, ++ 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, 0xF6, ++ 0xCF, 0x89, 0xCC, 0x93, 0xCD, 0x82, 0xCD, 0x85, ++ 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCD, 0x82, 0xCD, ++ 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, 0x85, ++ 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x85, 0xF6, ++ 0xCE, 0xA9, 0xCC, 0x93, 0xCC, 0x80, 0xCD, 0x85, ++ 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, 0x80, 0xCD, ++ 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCC, 0x81, ++ 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, ++ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, ++ 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, ++ 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, ++ 0xCC, 0x86, 0xF6, 0xCE, 0xB1, 0xCC, 0x84, 0xF6, ++ 0xCE, 0xB1, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, ++ 0xB1, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, 0x81, ++ 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCD, 0x82, 0xF6, ++ 0xCE, 0xB1, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, ++ 0x91, 0xCC, 0x86, 0xF6, 0xCE, 0x91, 0xCC, 0x84, ++ 0xF6, 0xCE, 0x91, 0xCC, 0x80, 0xF6, 0xCE, 0x91, ++ 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCD, 0x85, 0x20, ++ 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0x20, 0xCC, 0x93, ++ 0x20, 0xCD, 0x82, 0xF5, 0x05, 0xC2, 0xA8, 0xCD, ++ 0x82, 0x20, 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, ++ 0xB7, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, ++ 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x81, 0xCD, ++ 0x85, 0xF6, 0xCE, 0xB7, 0xCD, 0x82, 0xF6, 0xCE, ++ 0xB7, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x95, ++ 0xCC, 0x80, 0xF6, 0xCE, 0x95, 0xCC, 0x81, 0xF6, ++ 0xCE, 0x97, 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, ++ 0x81, 0xF6, 0xCE, 0x97, 0xCD, 0x85, 0xF5, 0x06, ++ 0xE1, 0xBE, 0xBF, 0xCC, 0x80, 0x20, 0xCC, 0x93, ++ 0xCC, 0x80, 0xF5, 0x06, 0xE1, 0xBE, 0xBF, 0xCC, ++ 0x81, 0x20, 0xCC, 0x93, 0xCC, 0x81, 0xF5, 0x06, ++ 0xE1, 0xBE, 0xBF, 0xCD, 0x82, 0x20, 0xCC, 0x93, ++ 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x86, 0xF6, ++ 0xCE, 
0xB9, 0xCC, 0x84, 0xF6, 0xCE, 0xB9, 0xCC, ++ 0x88, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, 0x88, ++ 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCD, 0x82, 0xF6, ++ 0xCE, 0xB9, 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, ++ 0x99, 0xCC, 0x86, 0xF6, 0xCE, 0x99, 0xCC, 0x84, ++ 0xF6, 0xCE, 0x99, 0xCC, 0x80, 0xF6, 0xCE, 0x99, ++ 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, 0xCC, ++ 0x80, 0x20, 0xCC, 0x94, 0xCC, 0x80, 0xF5, 0x06, ++ 0xE1, 0xBF, 0xBE, 0xCC, 0x81, 0x20, 0xCC, 0x94, ++ 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, 0xCD, ++ 0x82, 0x20, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCF, ++ 0x85, 0xCC, 0x86, 0xF6, 0xCF, 0x85, 0xCC, 0x84, ++ 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x80, 0xF6, ++ 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0xCF, ++ 0x81, 0xCC, 0x93, 0xF6, 0xCF, 0x81, 0xCC, 0x94, ++ 0xF6, 0xCF, 0x85, 0xCD, 0x82, 0xF6, 0xCF, 0x85, ++ 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, 0xA5, 0xCC, ++ 0x86, 0xF6, 0xCE, 0xA5, 0xCC, 0x84, 0xF6, 0xCE, ++ 0xA5, 0xCC, 0x80, 0xF6, 0xCE, 0xA5, 0xCC, 0x81, ++ 0xF6, 0xCE, 0xA1, 0xCC, 0x94, 0xF5, 0x05, 0xC2, ++ 0xA8, 0xCC, 0x80, 0x20, 0xCC, 0x88, 0xCC, 0x80, ++ 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, 0x20, 0xCC, ++ 0x88, 0xCC, 0x81, 0xF6, 0x60, 0xF6, 0xCF, 0x89, ++ 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCD, ++ 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xCD, 0x85, ++ 0xF6, 0xCF, 0x89, 0xCD, 0x82, 0xF6, 0xCF, 0x89, ++ 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x9F, 0xCC, ++ 0x80, 0xF6, 0xCE, 0x9F, 0xCC, 0x81, 0xF6, 0xCE, ++ 0xA9, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, 0x81, ++ 0xF6, 0xCE, 0xA9, 0xCD, 0x85, 0xF5, 0x03, 0xC2, ++ 0xB4, 0x20, 0xCC, 0x81, 0x20, 0xCC, 0x94, 0xF5, ++ 0x04, 0xE2, 0x80, 0x82, 0x20, 0xF5, 0x04, 0xE2, ++ 0x80, 0x83, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, ++ 0x20, 0x20, 0x20, 0x20, 0xE2, 0x80, 0x90, 0x20, ++ 0xCC, 0xB3, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, ++ 0x20, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0xE2, ++ 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, ++ 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0xE2, 0x80, ++ 0xB5, 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0x21, ++ 0x21, 0x20, 0xCC, 0x85, 0x3F, 0x3F, 0x3F, 0x21, ++ 0x21, 0x3F, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, ++ 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0x20, 0x30, ++ 0x69, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2B, ++ 0xE2, 0x88, 0x92, 0x3D, 0x28, 0x29, 0x6E, 0x30, ++ 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, ++ 0x39, 0x2B, 0xE2, 0x88, 0x92, 0x3D, 0x28, 0x29, ++ 0x52, 0x73, 0x61, 0x2F, 0x63, 0x61, 0x2F, 0x73, ++ 0x43, 0xC2, 0xB0, 0x43, 0x63, 0x2F, 0x6F, 0x63, ++ 0x2F, 0x75, 0xC6, 0x90, 0xC2, 0xB0, 0x46, 0x67, ++ 0x48, 0x48, 0x48, 0x68, 0xC4, 0xA7, 0x49, 0x49, ++ 0x4C, 0x6C, 0x4E, 0x4E, 0x6F, 0x50, 0x51, 0x52, ++ 0x52, 0x52, 0x53, 0x4D, 0x54, 0x45, 0x4C, 0x54, ++ 0x4D, 0x5A, 0xF6, 0xCE, 0xA9, 0x5A, 0xF6, 0x4B, ++ 0xF6, 0x41, 0xCC, 0x8A, 0x42, 0x43, 0x65, 0x45, ++ 0x46, 0x4D, 0x6F, 0xD7, 0x90, 0xD7, 0x91, 0xD7, ++ 0x92, 0xD7, 0x93, 0x69, 0xCE, 0xB3, 0xCE, 0x93, ++ 0xCE, 0xA0, 0xE2, 0x88, 0x91, 0x44, 0x64, 0x65, ++ 0x69, 0x6A, 0x31, 0xE2, 0x81, 0x84, 0x33, 0x32, ++ 0xE2, 0x81, 0x84, 0x33, 0x31, 0xE2, 0x81, 0x84, ++ 0x35, 0x32, 0xE2, 0x81, 0x84, 0x35, 0x33, 0xE2, ++ 0x81, 0x84, 0x35, 0x34, 0xE2, 0x81, 0x84, 0x35, ++ 0x31, 0xE2, 0x81, 0x84, 0x36, 0x35, 0xE2, 0x81, ++ 0x84, 0x36, 0x31, 0xE2, 0x81, 0x84, 0x38, 0x33, ++ 0xE2, 0x81, 0x84, 0x38, 0x35, 0xE2, 0x81, 0x84, ++ 0x38, 0x37, 0xE2, 0x81, 0x84, 0x38, 0x31, 0xE2, ++ 0x81, 0x84, 0x49, 0x49, 0x49, 0x49, 0x49, 0x49, ++ 0x49, 0x56, 0x56, 0x56, 0x49, 0x56, 0x49, 0x49, ++ 0x56, 0x49, 0x49, 0x49, 0x49, 0x58, 0x58, 0x58, ++ 0x49, 0x58, 0x49, 0x49, 0x4C, 0x43, 0x44, 0x4D, ++ 0x69, 0x69, 0x69, 0x69, 0x69, 0x69, 0x69, 
0x76, ++ 0x76, 0x76, 0x69, 0x76, 0x69, 0x69, 0x76, 0x69, ++ 0x69, 0x69, 0x69, 0x78, 0x78, 0x78, 0x69, 0x78, ++ 0x69, 0x69, 0x6C, 0x63, 0x64, 0x6D, 0xF6, 0xE2, ++ 0x86, 0x90, 0xCC, 0xB8, 0xF6, 0xE2, 0x86, 0x92, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x86, 0x94, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x87, 0x90, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x87, 0x94, 0xCC, 0xB8, 0xF6, 0xE2, 0x87, 0x92, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0x83, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x88, 0x88, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x88, 0x8B, 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0xA3, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0xA5, 0xCC, 0xB8, ++ 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, ++ 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, ++ 0x88, 0xAE, 0xE2, 0x88, 0xAE, 0xE2, 0x88, 0xAE, ++ 0xE2, 0x88, 0xAE, 0xE2, 0x88, 0xAE, 0xF6, 0xE2, ++ 0x88, 0xBC, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x83, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x85, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x89, 0x88, 0xCC, 0xB8, 0xF6, 0x3D, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xA1, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x89, 0x8D, 0xCC, 0xB8, 0xF6, 0x3C, ++ 0xCC, 0xB8, 0xF6, 0x3E, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x89, 0xA4, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xA5, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB2, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x89, 0xB3, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x89, 0xB6, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB7, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBA, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x89, 0xBB, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x8A, 0x82, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x83, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x86, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x8A, 0x87, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x8A, 0xA2, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xA8, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xA9, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x8A, 0xAB, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x89, 0xBC, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBD, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x91, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x8A, 0x92, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x8A, 0xB2, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB3, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB4, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x8A, 0xB5, 0xCC, 0xB8, 0xF6, 0xE3, ++ 0x80, 0x88, 0xF6, 0xE3, 0x80, 0x89, 0x31, 0x32, ++ 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x31, ++ 0x30, 0x31, 0x31, 0x31, 0x32, 0x31, 0x33, 0x31, ++ 0x34, 0x31, 0x35, 0x31, 0x36, 0x31, 0x37, 0x31, ++ 0x38, 0x31, 0x39, 0x32, 0x30, 0x28, 0x31, 0x29, ++ 0x28, 0x32, 0x29, 0x28, 0x33, 0x29, 0x28, 0x34, ++ 0x29, 0x28, 0x35, 0x29, 0x28, 0x36, 0x29, 0x28, ++ 0x37, 0x29, 0x28, 0x38, 0x29, 0x28, 0x39, 0x29, ++ 0x28, 0x31, 0x30, 0x29, 0x28, 0x31, 0x31, 0x29, ++ 0x28, 0x31, 0x32, 0x29, 0x28, 0x31, 0x33, 0x29, ++ 0x28, 0x31, 0x34, 0x29, 0x28, 0x31, 0x35, 0x29, ++ 0x28, 0x31, 0x36, 0x29, 0x28, 0x31, 0x37, 0x29, ++ 0x28, 0x31, 0x38, 0x29, 0x28, 0x31, 0x39, 0x29, ++ 0x28, 0x32, 0x30, 0x29, 0x31, 0x2E, 0x32, 0x2E, ++ 0x33, 0x2E, 0x34, 0x2E, 0x35, 0x2E, 0x36, 0x2E, ++ 0x37, 0x2E, 0x38, 0x2E, 0x39, 0x2E, 0x31, 0x30, ++ 0x2E, 0x31, 0x31, 0x2E, 0x31, 0x32, 0x2E, 0x31, ++ 0x33, 0x2E, 0x31, 0x34, 0x2E, 0x31, 0x35, 0x2E, ++ 0x31, 0x36, 0x2E, 0x31, 0x37, 0x2E, 0x31, 0x38, ++ 0x2E, 0x31, 0x39, 0x2E, 0x32, 0x30, 0x2E, 0x28, ++ 0x61, 0x29, 0x28, 0x62, 0x29, 0x28, 0x63, 0x29, ++ 0x28, 0x64, 0x29, 0x28, 0x65, 0x29, 0x28, 0x66, ++ 0x29, 0x28, 0x67, 0x29, 0x28, 0x68, 0x29, 0x28, ++ 0x69, 0x29, 0x28, 0x6A, 0x29, 0x28, 0x6B, 0x29, ++ 0x28, 0x6C, 0x29, 0x28, 0x6D, 0x29, 0x28, 0x6E, ++ 0x29, 0x28, 0x6F, 0x29, 0x28, 0x70, 0x29, 0x28, ++ 0x71, 0x29, 0x28, 0x72, 0x29, 0x28, 0x73, 0x29, ++ 0x28, 0x74, 0x29, 0x28, 0x75, 0x29, 0x28, 0x76, ++ 0x29, 0x28, 0x77, 0x29, 0x28, 0x78, 0x29, 0x28, ++ 0x79, 0x29, 0x28, 0x7A, 0x29, 0x41, 0x42, 0x43, ++ 0x44, 0x45, 0x46, 0x47, 
0x48, 0x49, 0x4A, 0x4B, ++ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, ++ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, ++ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ++ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, ++ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, ++ 0x7A, 0x30, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, ++ 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0x3A, 0x3A, ++ 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0xF6, 0xE2, ++ 0xAB, 0x9D, 0xCC, 0xB8, 0xE6, 0xAF, 0x8D, 0xE9, ++ 0xBE, 0x9F, 0xE4, 0xB8, 0x80, 0xE4, 0xB8, 0xA8, ++ 0xE4, 0xB8, 0xB6, 0xE4, 0xB8, 0xBF, 0xE4, 0xB9, ++ 0x99, 0xE4, 0xBA, 0x85, 0xE4, 0xBA, 0x8C, 0xE4, ++ 0xBA, 0xA0, 0xE4, 0xBA, 0xBA, 0xE5, 0x84, 0xBF, ++ 0xE5, 0x85, 0xA5, 0xE5, 0x85, 0xAB, 0xE5, 0x86, ++ 0x82, 0xE5, 0x86, 0x96, 0xE5, 0x86, 0xAB, 0xE5, ++ 0x87, 0xA0, 0xE5, 0x87, 0xB5, 0xE5, 0x88, 0x80, ++ 0xE5, 0x8A, 0x9B, 0xE5, 0x8B, 0xB9, 0xE5, 0x8C, ++ 0x95, 0xE5, 0x8C, 0x9A, 0xE5, 0x8C, 0xB8, 0xE5, ++ 0x8D, 0x81, 0xE5, 0x8D, 0x9C, 0xE5, 0x8D, 0xA9, ++ 0xE5, 0x8E, 0x82, 0xE5, 0x8E, 0xB6, 0xE5, 0x8F, ++ 0x88, 0xE5, 0x8F, 0xA3, 0xE5, 0x9B, 0x97, 0xE5, ++ 0x9C, 0x9F, 0xE5, 0xA3, 0xAB, 0xE5, 0xA4, 0x82, ++ 0xE5, 0xA4, 0x8A, 0xE5, 0xA4, 0x95, 0xE5, 0xA4, ++ 0xA7, 0xE5, 0xA5, 0xB3, 0xE5, 0xAD, 0x90, 0xE5, ++ 0xAE, 0x80, 0xE5, 0xAF, 0xB8, 0xE5, 0xB0, 0x8F, ++ 0xE5, 0xB0, 0xA2, 0xE5, 0xB0, 0xB8, 0xE5, 0xB1, ++ 0xAE, 0xE5, 0xB1, 0xB1, 0xE5, 0xB7, 0x9B, 0xE5, ++ 0xB7, 0xA5, 0xE5, 0xB7, 0xB1, 0xE5, 0xB7, 0xBE, ++ 0xE5, 0xB9, 0xB2, 0xE5, 0xB9, 0xBA, 0xE5, 0xB9, ++ 0xBF, 0xE5, 0xBB, 0xB4, 0xE5, 0xBB, 0xBE, 0xE5, ++ 0xBC, 0x8B, 0xE5, 0xBC, 0x93, 0xE5, 0xBD, 0x90, ++ 0xE5, 0xBD, 0xA1, 0xE5, 0xBD, 0xB3, 0xE5, 0xBF, ++ 0x83, 0xE6, 0x88, 0x88, 0xE6, 0x88, 0xB6, 0xE6, ++ 0x89, 0x8B, 0xE6, 0x94, 0xAF, 0xE6, 0x94, 0xB4, ++ 0xE6, 0x96, 0x87, 0xE6, 0x96, 0x97, 0xE6, 0x96, ++ 0xA4, 0xE6, 0x96, 0xB9, 0xE6, 0x97, 0xA0, 0xE6, ++ 0x97, 0xA5, 0xE6, 0x9B, 0xB0, 0xE6, 0x9C, 0x88, ++ 0xE6, 0x9C, 0xA8, 0xE6, 0xAC, 0xA0, 0xE6, 0xAD, ++ 0xA2, 0xE6, 0xAD, 0xB9, 0xE6, 0xAE, 0xB3, 0xE6, ++ 0xAF, 0x8B, 0xE6, 0xAF, 0x94, 0xE6, 0xAF, 0x9B, ++ 0xE6, 0xB0, 0x8F, 0xE6, 0xB0, 0x94, 0xE6, 0xB0, ++ 0xB4, 0xE7, 0x81, 0xAB, 0xE7, 0x88, 0xAA, 0xE7, ++ 0x88, 0xB6, 0xE7, 0x88, 0xBB, 0xE7, 0x88, 0xBF, ++ 0xE7, 0x89, 0x87, 0xE7, 0x89, 0x99, 0xE7, 0x89, ++ 0x9B, 0xE7, 0x8A, 0xAC, 0xE7, 0x8E, 0x84, 0xE7, ++ 0x8E, 0x89, 0xE7, 0x93, 0x9C, 0xE7, 0x93, 0xA6, ++ 0xE7, 0x94, 0x98, 0xE7, 0x94, 0x9F, 0xE7, 0x94, ++ 0xA8, 0xE7, 0x94, 0xB0, 0xE7, 0x96, 0x8B, 0xE7, ++ 0x96, 0x92, 0xE7, 0x99, 0xB6, 0xE7, 0x99, 0xBD, ++ 0xE7, 0x9A, 0xAE, 0xE7, 0x9A, 0xBF, 0xE7, 0x9B, ++ 0xAE, 0xE7, 0x9F, 0x9B, 0xE7, 0x9F, 0xA2, 0xE7, ++ 0x9F, 0xB3, 0xE7, 0xA4, 0xBA, 0xE7, 0xA6, 0xB8, ++ 0xE7, 0xA6, 0xBE, 0xE7, 0xA9, 0xB4, 0xE7, 0xAB, ++ 0x8B, 0xE7, 0xAB, 0xB9, 0xE7, 0xB1, 0xB3, 0xE7, ++ 0xB3, 0xB8, 0xE7, 0xBC, 0xB6, 0xE7, 0xBD, 0x91, ++ 0xE7, 0xBE, 0x8A, 0xE7, 0xBE, 0xBD, 0xE8, 0x80, ++ 0x81, 0xE8, 0x80, 0x8C, 0xE8, 0x80, 0x92, 0xE8, ++ 0x80, 0xB3, 0xE8, 0x81, 0xBF, 0xE8, 0x82, 0x89, ++ 0xE8, 0x87, 0xA3, 0xE8, 0x87, 0xAA, 0xE8, 0x87, ++ 0xB3, 0xE8, 0x87, 0xBC, 0xE8, 0x88, 0x8C, 0xE8, ++ 0x88, 0x9B, 0xE8, 0x88, 0x9F, 0xE8, 0x89, 0xAE, ++ 0xE8, 0x89, 0xB2, 0xE8, 0x89, 0xB8, 0xE8, 0x99, ++ 0x8D, 0xE8, 0x99, 0xAB, 0xE8, 0xA1, 0x80, 0xE8, ++ 0xA1, 0x8C, 0xE8, 0xA1, 0xA3, 0xE8, 0xA5, 0xBE, ++ 0xE8, 0xA6, 0x8B, 0xE8, 0xA7, 0x92, 0xE8, 0xA8, ++ 0x80, 0xE8, 0xB0, 0xB7, 0xE8, 0xB1, 0x86, 0xE8, ++ 0xB1, 0x95, 0xE8, 0xB1, 0xB8, 0xE8, 0xB2, 0x9D, ++ 0xE8, 0xB5, 0xA4, 0xE8, 0xB5, 0xB0, 0xE8, 0xB6, ++ 0xB3, 0xE8, 0xBA, 0xAB, 0xE8, 0xBB, 0x8A, 0xE8, ++ 0xBE, 
0x9B, 0xE8, 0xBE, 0xB0, 0xE8, 0xBE, 0xB5, ++ 0xE9, 0x82, 0x91, 0xE9, 0x85, 0x89, 0xE9, 0x87, ++ 0x86, 0xE9, 0x87, 0x8C, 0xE9, 0x87, 0x91, 0xE9, ++ 0x95, 0xB7, 0xE9, 0x96, 0x80, 0xE9, 0x98, 0x9C, ++ 0xE9, 0x9A, 0xB6, 0xE9, 0x9A, 0xB9, 0xE9, 0x9B, ++ 0xA8, 0xE9, 0x9D, 0x91, 0xE9, 0x9D, 0x9E, 0xE9, ++ 0x9D, 0xA2, 0xE9, 0x9D, 0xA9, 0xE9, 0x9F, 0x8B, ++ 0xE9, 0x9F, 0xAD, 0xE9, 0x9F, 0xB3, 0xE9, 0xA0, ++ 0x81, 0xE9, 0xA2, 0xA8, 0xE9, 0xA3, 0x9B, 0xE9, ++ 0xA3, 0x9F, 0xE9, 0xA6, 0x96, 0xE9, 0xA6, 0x99, ++ 0xE9, 0xA6, 0xAC, 0xE9, 0xAA, 0xA8, 0xE9, 0xAB, ++ 0x98, 0xE9, 0xAB, 0x9F, 0xE9, 0xAC, 0xA5, 0xE9, ++ 0xAC, 0xAF, 0xE9, 0xAC, 0xB2, 0xE9, 0xAC, 0xBC, ++ 0xE9, 0xAD, 0x9A, 0xE9, 0xB3, 0xA5, 0xE9, 0xB9, ++ 0xB5, 0xE9, 0xB9, 0xBF, 0xE9, 0xBA, 0xA5, 0xE9, ++ 0xBA, 0xBB, 0xE9, 0xBB, 0x83, 0xE9, 0xBB, 0x8D, ++ 0xE9, 0xBB, 0x91, 0xE9, 0xBB, 0xB9, 0xE9, 0xBB, ++ 0xBD, 0xE9, 0xBC, 0x8E, 0xE9, 0xBC, 0x93, 0xE9, ++ 0xBC, 0xA0, 0xE9, 0xBC, 0xBB, 0xE9, 0xBD, 0x8A, ++ 0xE9, 0xBD, 0x92, 0xE9, 0xBE, 0x8D, 0xE9, 0xBE, ++ 0x9C, 0xE9, 0xBE, 0xA0, 0x20, 0xE3, 0x80, 0x92, ++ 0xE5, 0x8D, 0x81, 0xE5, 0x8D, 0x84, 0xE5, 0x8D, ++ 0x85, 0xF6, 0xE3, 0x81, 0x8B, 0xE3, 0x82, 0x99, ++ 0xF6, 0xE3, 0x81, 0x8D, 0xE3, 0x82, 0x99, 0xF6, ++ 0xE3, 0x81, 0x8F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, ++ 0x81, 0x91, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, ++ 0x93, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x95, ++ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x97, 0xE3, ++ 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x99, 0xE3, 0x82, ++ 0x99, 0xF6, 0xE3, 0x81, 0x9B, 0xE3, 0x82, 0x99, ++ 0xF6, 0xE3, 0x81, 0x9D, 0xE3, 0x82, 0x99, 0xF6, ++ 0xE3, 0x81, 0x9F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, ++ 0x81, 0xA1, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, ++ 0xA4, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA6, ++ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA8, 0xE3, ++ 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, ++ 0x99, 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, 0x9A, ++ 0xF6, 0xE3, 0x81, 0xB2, 0xE3, 0x82, 0x99, 0xF6, ++ 0xE3, 0x81, 0xB2, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, ++ 0x81, 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, ++ 0xB5, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x81, 0xB8, ++ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xB8, 0xE3, ++ 0x82, 0x9A, 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, ++ 0x99, 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, 0x9A, ++ 0xF6, 0xE3, 0x81, 0x86, 0xE3, 0x82, 0x99, 0x20, ++ 0xE3, 0x82, 0x99, 0x20, 0xE3, 0x82, 0x9A, 0xF6, ++ 0xE3, 0x82, 0x9D, 0xE3, 0x82, 0x99, 0xE3, 0x82, ++ 0x88, 0xE3, 0x82, 0x8A, 0xF6, 0xE3, 0x82, 0xAB, ++ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xAD, 0xE3, ++ 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xAF, 0xE3, 0x82, ++ 0x99, 0xF6, 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0x99, ++ 0xF6, 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0x99, 0xF6, ++ 0xE3, 0x82, 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, ++ 0x82, 0xB7, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, ++ 0xB9, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBB, ++ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBD, 0xE3, ++ 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBF, 0xE3, 0x82, ++ 0x99, 0xF6, 0xE3, 0x83, 0x81, 0xE3, 0x82, 0x99, ++ 0xF6, 0xE3, 0x83, 0x84, 0xE3, 0x82, 0x99, 0xF6, ++ 0xE3, 0x83, 0x86, 0xE3, 0x82, 0x99, 0xF6, 0xE3, ++ 0x83, 0x88, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, ++ 0x8F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x8F, ++ 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x83, 0x92, 0xE3, ++ 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x92, 0xE3, 0x82, ++ 0x9A, 0xF6, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x99, ++ 0xF6, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x9A, 0xF6, ++ 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x99, 0xF6, 0xE3, ++ 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x83, ++ 0x9B, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x9B, ++ 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x82, 0xA6, 
0xE3, ++ 0x82, 0x99, 0xF6, 0xE3, 0x83, 0xAF, 0xE3, 0x82, ++ 0x99, 0xF6, 0xE3, 0x83, 0xB0, 0xE3, 0x82, 0x99, ++ 0xF6, 0xE3, 0x83, 0xB1, 0xE3, 0x82, 0x99, 0xF6, ++ 0xE3, 0x83, 0xB2, 0xE3, 0x82, 0x99, 0xF6, 0xE3, ++ 0x83, 0xBD, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xB3, ++ 0xE3, 0x83, 0x88, 0xE1, 0x84, 0x80, 0xE1, 0x84, ++ 0x81, 0xE1, 0x86, 0xAA, 0xE1, 0x84, 0x82, 0xE1, ++ 0x86, 0xAC, 0xE1, 0x86, 0xAD, 0xE1, 0x84, 0x83, ++ 0xE1, 0x84, 0x84, 0xE1, 0x84, 0x85, 0xE1, 0x86, ++ 0xB0, 0xE1, 0x86, 0xB1, 0xE1, 0x86, 0xB2, 0xE1, ++ 0x86, 0xB3, 0xE1, 0x86, 0xB4, 0xE1, 0x86, 0xB5, ++ 0xE1, 0x84, 0x9A, 0xE1, 0x84, 0x86, 0xE1, 0x84, ++ 0x87, 0xE1, 0x84, 0x88, 0xE1, 0x84, 0xA1, 0xE1, ++ 0x84, 0x89, 0xE1, 0x84, 0x8A, 0xE1, 0x84, 0x8B, ++ 0xE1, 0x84, 0x8C, 0xE1, 0x84, 0x8D, 0xE1, 0x84, ++ 0x8E, 0xE1, 0x84, 0x8F, 0xE1, 0x84, 0x90, 0xE1, ++ 0x84, 0x91, 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, ++ 0xE1, 0x85, 0xA2, 0xE1, 0x85, 0xA3, 0xE1, 0x85, ++ 0xA4, 0xE1, 0x85, 0xA5, 0xE1, 0x85, 0xA6, 0xE1, ++ 0x85, 0xA7, 0xE1, 0x85, 0xA8, 0xE1, 0x85, 0xA9, ++ 0xE1, 0x85, 0xAA, 0xE1, 0x85, 0xAB, 0xE1, 0x85, ++ 0xAC, 0xE1, 0x85, 0xAD, 0xE1, 0x85, 0xAE, 0xE1, ++ 0x85, 0xAF, 0xE1, 0x85, 0xB0, 0xE1, 0x85, 0xB1, ++ 0xE1, 0x85, 0xB2, 0xE1, 0x85, 0xB3, 0xE1, 0x85, ++ 0xB4, 0xE1, 0x85, 0xB5, 0xE1, 0x85, 0xA0, 0xE1, ++ 0x84, 0x94, 0xE1, 0x84, 0x95, 0xE1, 0x87, 0x87, ++ 0xE1, 0x87, 0x88, 0xE1, 0x87, 0x8C, 0xE1, 0x87, ++ 0x8E, 0xE1, 0x87, 0x93, 0xE1, 0x87, 0x97, 0xE1, ++ 0x87, 0x99, 0xE1, 0x84, 0x9C, 0xE1, 0x87, 0x9D, ++ 0xE1, 0x87, 0x9F, 0xE1, 0x84, 0x9D, 0xE1, 0x84, ++ 0x9E, 0xE1, 0x84, 0xA0, 0xE1, 0x84, 0xA2, 0xE1, ++ 0x84, 0xA3, 0xE1, 0x84, 0xA7, 0xE1, 0x84, 0xA9, ++ 0xE1, 0x84, 0xAB, 0xE1, 0x84, 0xAC, 0xE1, 0x84, ++ 0xAD, 0xE1, 0x84, 0xAE, 0xE1, 0x84, 0xAF, 0xE1, ++ 0x84, 0xB2, 0xE1, 0x84, 0xB6, 0xE1, 0x85, 0x80, ++ 0xE1, 0x85, 0x87, 0xE1, 0x85, 0x8C, 0xE1, 0x87, ++ 0xB1, 0xE1, 0x87, 0xB2, 0xE1, 0x85, 0x97, 0xE1, ++ 0x85, 0x98, 0xE1, 0x85, 0x99, 0xE1, 0x86, 0x84, ++ 0xE1, 0x86, 0x85, 0xE1, 0x86, 0x88, 0xE1, 0x86, ++ 0x91, 0xE1, 0x86, 0x92, 0xE1, 0x86, 0x94, 0xE1, ++ 0x86, 0x9E, 0xE1, 0x86, 0xA1, 0xE4, 0xB8, 0x80, ++ 0xE4, 0xBA, 0x8C, 0xE4, 0xB8, 0x89, 0xE5, 0x9B, ++ 0x9B, 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, 0xAD, 0xE4, ++ 0xB8, 0x8B, 0xE7, 0x94, 0xB2, 0xE4, 0xB9, 0x99, ++ 0xE4, 0xB8, 0x99, 0xE4, 0xB8, 0x81, 0xE5, 0xA4, ++ 0xA9, 0xE5, 0x9C, 0xB0, 0xE4, 0xBA, 0xBA, 0x28, ++ 0xE1, 0x84, 0x80, 0x29, 0x28, 0xE1, 0x84, 0x82, ++ 0x29, 0x28, 0xE1, 0x84, 0x83, 0x29, 0x28, 0xE1, ++ 0x84, 0x85, 0x29, 0x28, 0xE1, 0x84, 0x86, 0x29, ++ 0x28, 0xE1, 0x84, 0x87, 0x29, 0x28, 0xE1, 0x84, ++ 0x89, 0x29, 0x28, 0xE1, 0x84, 0x8B, 0x29, 0x28, ++ 0xE1, 0x84, 0x8C, 0x29, 0x28, 0xE1, 0x84, 0x8E, ++ 0x29, 0x28, 0xE1, 0x84, 0x8F, 0x29, 0x28, 0xE1, ++ 0x84, 0x90, 0x29, 0x28, 0xE1, 0x84, 0x91, 0x29, ++ 0x28, 0xE1, 0x84, 0x92, 0x29, 0x28, 0xE1, 0x84, ++ 0x80, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x82, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x83, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x85, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x86, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x87, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x89, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x8B, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x8C, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x8E, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x8F, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x90, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x91, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x92, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, ++ 0x8C, 0xE1, 0x85, 0xAE, 
0x29, 0x28, 0xE4, 0xB8, ++ 0x80, 0x29, 0x28, 0xE4, 0xBA, 0x8C, 0x29, 0x28, ++ 0xE4, 0xB8, 0x89, 0x29, 0x28, 0xE5, 0x9B, 0x9B, ++ 0x29, 0x28, 0xE4, 0xBA, 0x94, 0x29, 0x28, 0xE5, ++ 0x85, 0xAD, 0x29, 0x28, 0xE4, 0xB8, 0x83, 0x29, ++ 0x28, 0xE5, 0x85, 0xAB, 0x29, 0x28, 0xE4, 0xB9, ++ 0x9D, 0x29, 0x28, 0xE5, 0x8D, 0x81, 0x29, 0x28, ++ 0xE6, 0x9C, 0x88, 0x29, 0x28, 0xE7, 0x81, 0xAB, ++ 0x29, 0x28, 0xE6, 0xB0, 0xB4, 0x29, 0x28, 0xE6, ++ 0x9C, 0xA8, 0x29, 0x28, 0xE9, 0x87, 0x91, 0x29, ++ 0x28, 0xE5, 0x9C, 0x9F, 0x29, 0x28, 0xE6, 0x97, ++ 0xA5, 0x29, 0x28, 0xE6, 0xA0, 0xAA, 0x29, 0x28, ++ 0xE6, 0x9C, 0x89, 0x29, 0x28, 0xE7, 0xA4, 0xBE, ++ 0x29, 0x28, 0xE5, 0x90, 0x8D, 0x29, 0x28, 0xE7, ++ 0x89, 0xB9, 0x29, 0x28, 0xE8, 0xB2, 0xA1, 0x29, ++ 0x28, 0xE7, 0xA5, 0x9D, 0x29, 0x28, 0xE5, 0x8A, ++ 0xB4, 0x29, 0x28, 0xE4, 0xBB, 0xA3, 0x29, 0x28, ++ 0xE5, 0x91, 0xBC, 0x29, 0x28, 0xE5, 0xAD, 0xA6, ++ 0x29, 0x28, 0xE7, 0x9B, 0xA3, 0x29, 0x28, 0xE4, ++ 0xBC, 0x81, 0x29, 0x28, 0xE8, 0xB3, 0x87, 0x29, ++ 0x28, 0xE5, 0x8D, 0x94, 0x29, 0x28, 0xE7, 0xA5, ++ 0xAD, 0x29, 0x28, 0xE4, 0xBC, 0x91, 0x29, 0x28, ++ 0xE8, 0x87, 0xAA, 0x29, 0x28, 0xE8, 0x87, 0xB3, ++ 0x29, 0x32, 0x31, 0x32, 0x32, 0x32, 0x33, 0x32, ++ 0x34, 0x32, 0x35, 0x32, 0x36, 0x32, 0x37, 0x32, ++ 0x38, 0x32, 0x39, 0x33, 0x30, 0x33, 0x31, 0x33, ++ 0x32, 0x33, 0x33, 0x33, 0x34, 0x33, 0x35, 0xE1, ++ 0x84, 0x80, 0xE1, 0x84, 0x82, 0xE1, 0x84, 0x83, ++ 0xE1, 0x84, 0x85, 0xE1, 0x84, 0x86, 0xE1, 0x84, ++ 0x87, 0xE1, 0x84, 0x89, 0xE1, 0x84, 0x8B, 0xE1, ++ 0x84, 0x8C, 0xE1, 0x84, 0x8E, 0xE1, 0x84, 0x8F, ++ 0xE1, 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, 0x84, ++ 0x92, 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xA1, 0xE1, ++ 0x84, 0x82, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x83, ++ 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x85, 0xE1, 0x85, ++ 0xA1, 0xE1, 0x84, 0x86, 0xE1, 0x85, 0xA1, 0xE1, ++ 0x84, 0x87, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x89, ++ 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x8B, 0xE1, 0x85, ++ 0xA1, 0xE1, 0x84, 0x8C, 0xE1, 0x85, 0xA1, 0xE1, ++ 0x84, 0x8E, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x8F, ++ 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x90, 0xE1, 0x85, ++ 0xA1, 0xE1, 0x84, 0x91, 0xE1, 0x85, 0xA1, 0xE1, ++ 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE4, 0xB8, 0x80, ++ 0xE4, 0xBA, 0x8C, 0xE4, 0xB8, 0x89, 0xE5, 0x9B, ++ 0x9B, 0xE4, 0xBA, 0x94, 0xE5, 0x85, 0xAD, 0xE4, ++ 0xB8, 0x83, 0xE5, 0x85, 0xAB, 0xE4, 0xB9, 0x9D, ++ 0xE5, 0x8D, 0x81, 0xE6, 0x9C, 0x88, 0xE7, 0x81, ++ 0xAB, 0xE6, 0xB0, 0xB4, 0xE6, 0x9C, 0xA8, 0xE9, ++ 0x87, 0x91, 0xE5, 0x9C, 0x9F, 0xE6, 0x97, 0xA5, ++ 0xE6, 0xA0, 0xAA, 0xE6, 0x9C, 0x89, 0xE7, 0xA4, ++ 0xBE, 0xE5, 0x90, 0x8D, 0xE7, 0x89, 0xB9, 0xE8, ++ 0xB2, 0xA1, 0xE7, 0xA5, 0x9D, 0xE5, 0x8A, 0xB4, ++ 0xE7, 0xA7, 0x98, 0xE7, 0x94, 0xB7, 0xE5, 0xA5, ++ 0xB3, 0xE9, 0x81, 0xA9, 0xE5, 0x84, 0xAA, 0xE5, ++ 0x8D, 0xB0, 0xE6, 0xB3, 0xA8, 0xE9, 0xA0, 0x85, ++ 0xE4, 0xBC, 0x91, 0xE5, 0x86, 0x99, 0xE6, 0xAD, ++ 0xA3, 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, 0xAD, 0xE4, ++ 0xB8, 0x8B, 0xE5, 0xB7, 0xA6, 0xE5, 0x8F, 0xB3, ++ 0xE5, 0x8C, 0xBB, 0xE5, 0xAE, 0x97, 0xE5, 0xAD, ++ 0xA6, 0xE7, 0x9B, 0xA3, 0xE4, 0xBC, 0x81, 0xE8, ++ 0xB3, 0x87, 0xE5, 0x8D, 0x94, 0xE5, 0xA4, 0x9C, ++ 0x33, 0x36, 0x33, 0x37, 0x33, 0x38, 0x33, 0x39, ++ 0x34, 0x30, 0x34, 0x31, 0x34, 0x32, 0x34, 0x33, ++ 0x34, 0x34, 0x34, 0x35, 0x34, 0x36, 0x34, 0x37, ++ 0x34, 0x38, 0x34, 0x39, 0x35, 0x30, 0x31, 0xE6, ++ 0x9C, 0x88, 0x32, 0xE6, 0x9C, 0x88, 0x33, 0xE6, ++ 0x9C, 0x88, 0x34, 0xE6, 0x9C, 0x88, 0x35, 0xE6, ++ 0x9C, 0x88, 0x36, 0xE6, 0x9C, 0x88, 0x37, 0xE6, ++ 0x9C, 0x88, 0x38, 0xE6, 0x9C, 0x88, 0x39, 0xE6, ++ 0x9C, 0x88, 0x31, 0x30, 0xE6, 0x9C, 0x88, 0x31, ++ 0x31, 
0xE6, 0x9C, 0x88, 0x31, 0x32, 0xE6, 0x9C, ++ 0x88, 0xE3, 0x82, 0xA2, 0xE3, 0x82, 0xA4, 0xE3, ++ 0x82, 0xA6, 0xE3, 0x82, 0xA8, 0xE3, 0x82, 0xAA, ++ 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0xAD, 0xE3, 0x82, ++ 0xAF, 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0xB3, 0xE3, ++ 0x82, 0xB5, 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0xB9, ++ 0xE3, 0x82, 0xBB, 0xE3, 0x82, 0xBD, 0xE3, 0x82, ++ 0xBF, 0xE3, 0x83, 0x81, 0xE3, 0x83, 0x84, 0xE3, ++ 0x83, 0x86, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x8A, ++ 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0x8C, 0xE3, 0x83, ++ 0x8D, 0xE3, 0x83, 0x8E, 0xE3, 0x83, 0x8F, 0xE3, ++ 0x83, 0x92, 0xE3, 0x83, 0x95, 0xE3, 0x83, 0x98, ++ 0xE3, 0x83, 0x9B, 0xE3, 0x83, 0x9E, 0xE3, 0x83, ++ 0x9F, 0xE3, 0x83, 0xA0, 0xE3, 0x83, 0xA1, 0xE3, ++ 0x83, 0xA2, 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xA6, ++ 0xE3, 0x83, 0xA8, 0xE3, 0x83, 0xA9, 0xE3, 0x83, ++ 0xAA, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xAC, 0xE3, ++ 0x83, 0xAD, 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0xB0, ++ 0xE3, 0x83, 0xB1, 0xE3, 0x83, 0xB2, 0xE3, 0x82, ++ 0xA2, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x9A, 0xE3, ++ 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xA2, ++ 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x95, 0xE3, 0x82, ++ 0xA1, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xB3, 0xE3, ++ 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xA2, ++ 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xBC, 0xE3, 0x83, ++ 0xAB, 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0x8B, 0xE3, ++ 0x83, 0xB3, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, ++ 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xB3, 0xE3, 0x83, ++ 0x81, 0xE3, 0x82, 0xA6, 0xE3, 0x82, 0xA9, 0xE3, ++ 0x83, 0xB3, 0xE3, 0x82, 0xA8, 0xE3, 0x82, 0xB9, ++ 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xBC, 0xE3, 0x83, ++ 0x88, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xA8, 0xE3, ++ 0x83, 0xBC, 0xE3, 0x82, 0xAB, 0xE3, 0x83, 0xBC, ++ 0xE3, 0x82, 0xAA, 0xE3, 0x83, 0xB3, 0xE3, 0x82, ++ 0xB9, 0xE3, 0x82, 0xAA, 0xE3, 0x83, 0xBC, 0xE3, ++ 0x83, 0xA0, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0xA4, ++ 0xE3, 0x83, 0xAA, 0xE3, 0x82, 0xAB, 0xE3, 0x83, ++ 0xA9, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, ++ 0x82, 0xAB, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xAA, ++ 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAB, 0xE3, 0x82, ++ 0x99, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xB3, 0xE3, ++ 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xB3, ++ 0xE3, 0x83, 0x9E, 0xE3, 0x82, 0xAD, 0xE3, 0x82, ++ 0x99, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, ++ 0x82, 0xAD, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x8B, ++ 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAD, 0xE3, 0x83, ++ 0xA5, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xBC, 0xE3, ++ 0x82, 0xAD, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAB, ++ 0xE3, 0x82, 0xBF, 0xE3, 0x82, 0x99, 0xE3, 0x83, ++ 0xBC, 0xE3, 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, ++ 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, 0x82, 0xAF, ++ 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA9, 0xE3, 0x83, ++ 0xA0, 0xE3, 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, ++ 0x83, 0xA1, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, ++ 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xAD, 0xE3, 0x83, ++ 0xAD, 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0x83, 0xE3, ++ 0x83, 0x88, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, ++ 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xA0, 0xE3, 0x82, ++ 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA9, 0xE3, ++ 0x83, 0xA0, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xB3, ++ 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAB, 0xE3, 0x82, ++ 0xBB, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xA4, 0xE3, ++ 0x83, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAD, ++ 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x8D, 0xE3, 0x82, ++ 0xB1, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xB9, 0xE3, ++ 0x82, 0xB3, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x8A, ++ 0xE3, 0x82, 0xB3, 0xE3, 0x83, 0xBC, 0xE3, 0x83, ++ 0x9B, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xB5, 0xE3, ++ 0x82, 0xA4, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAB, ++ 0xE3, 0x82, 0xB5, 0xE3, 0x83, 0xB3, 0xE3, 
0x83, ++ 0x81, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xA0, 0xE3, ++ 0x82, 0xB7, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xB3, ++ 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x82, ++ 0xBB, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x81, 0xE3, ++ 0x82, 0xBB, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, ++ 0xE3, 0x82, 0xBF, 0xE3, 0x82, 0x99, 0xE3, 0x83, ++ 0xBC, 0xE3, 0x82, 0xB9, 0xE3, 0x83, 0x86, 0xE3, ++ 0x82, 0x99, 0xE3, 0x82, 0xB7, 0xE3, 0x83, 0x88, ++ 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, ++ 0x88, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x8A, 0xE3, ++ 0x83, 0x8E, 0xE3, 0x83, 0x8E, 0xE3, 0x83, 0x83, ++ 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x8F, 0xE3, 0x82, ++ 0xA4, 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x8F, 0xE3, ++ 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xBB, ++ 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x83, ++ 0x8F, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, ++ 0x83, 0x84, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x99, ++ 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAC, 0xE3, 0x83, ++ 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, ++ 0x82, 0xA2, 0xE3, 0x82, 0xB9, 0xE3, 0x83, 0x88, ++ 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, ++ 0x9A, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAB, 0xE3, ++ 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xB3, ++ 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x99, 0xE3, 0x83, ++ 0xAB, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0xA1, 0xE3, ++ 0x83, 0xA9, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, ++ 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x95, 0xE3, 0x82, ++ 0xA3, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, ++ 0x83, 0x95, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x83, ++ 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0xA7, 0xE3, 0x83, ++ 0xAB, 0xE3, 0x83, 0x95, 0xE3, 0x83, 0xA9, 0xE3, ++ 0x83, 0xB3, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0xAF, ++ 0xE3, 0x82, 0xBF, 0xE3, 0x83, 0xBC, 0xE3, 0x83, ++ 0xAB, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, ++ 0x82, 0xBD, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x9A, ++ 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0x92, 0xE3, 0x83, ++ 0x98, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x84, 0xE3, ++ 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xB3, ++ 0xE3, 0x82, 0xB9, 0xE3, 0x83, 0x98, 0xE3, 0x82, ++ 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xB7, 0xE3, ++ 0x82, 0x99, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x99, ++ 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xBF, 0xE3, 0x83, ++ 0x9B, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xA4, 0xE3, ++ 0x83, 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x9B, ++ 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, ++ 0x88, 0xE3, 0x83, 0x9B, 0xE3, 0x83, 0xB3, 0xE3, ++ 0x83, 0x9B, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xB3, ++ 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, 0x83, ++ 0x9B, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, ++ 0x83, 0x9B, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xB3, ++ 0xE3, 0x83, 0x9E, 0xE3, 0x82, 0xA4, 0xE3, 0x82, ++ 0xAF, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0x9E, 0xE3, ++ 0x82, 0xA4, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x9E, ++ 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x8F, 0xE3, 0x83, ++ 0x9E, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xAF, 0xE3, ++ 0x83, 0x9E, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xB7, ++ 0xE3, 0x83, 0xA7, 0xE3, 0x83, 0xB3, 0xE3, 0x83, ++ 0x9F, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAD, 0xE3, ++ 0x83, 0xB3, 0xE3, 0x83, 0x9F, 0xE3, 0x83, 0xAA, ++ 0xE3, 0x83, 0x9F, 0xE3, 0x83, 0xAA, 0xE3, 0x83, ++ 0x8F, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, ++ 0x83, 0xAB, 0xE3, 0x83, 0xA1, 0xE3, 0x82, 0xAB, ++ 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA1, 0xE3, 0x82, ++ 0xAB, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x88, 0xE3, ++ 0x83, 0xB3, 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xBC, ++ 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, 0xE3, 0x83, ++ 0xA4, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, ++ 0x82, 0x99, 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xBC, ++ 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xA6, 0xE3, 0x82, ++ 0xA2, 0xE3, 0x83, 0xB3, 
0xE3, 0x83, 0xAA, 0xE3, ++ 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, ++ 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xA9, 0xE3, 0x83, ++ 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, ++ 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xBC, ++ 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x99, 0xE3, 0x83, ++ 0xAB, 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xA0, 0xE3, ++ 0x83, 0xAC, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, ++ 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0x99, 0xE3, 0x83, ++ 0xB3, 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0x83, 0xE3, ++ 0x83, 0x88, 0x30, 0xE7, 0x82, 0xB9, 0x31, 0xE7, ++ 0x82, 0xB9, 0x32, 0xE7, 0x82, 0xB9, 0x33, 0xE7, ++ 0x82, 0xB9, 0x34, 0xE7, 0x82, 0xB9, 0x35, 0xE7, ++ 0x82, 0xB9, 0x36, 0xE7, 0x82, 0xB9, 0x37, 0xE7, ++ 0x82, 0xB9, 0x38, 0xE7, 0x82, 0xB9, 0x39, 0xE7, ++ 0x82, 0xB9, 0x31, 0x30, 0xE7, 0x82, 0xB9, 0x31, ++ 0x31, 0xE7, 0x82, 0xB9, 0x31, 0x32, 0xE7, 0x82, ++ 0xB9, 0x31, 0x33, 0xE7, 0x82, 0xB9, 0x31, 0x34, ++ 0xE7, 0x82, 0xB9, 0x31, 0x35, 0xE7, 0x82, 0xB9, ++ 0x31, 0x36, 0xE7, 0x82, 0xB9, 0x31, 0x37, 0xE7, ++ 0x82, 0xB9, 0x31, 0x38, 0xE7, 0x82, 0xB9, 0x31, ++ 0x39, 0xE7, 0x82, 0xB9, 0x32, 0x30, 0xE7, 0x82, ++ 0xB9, 0x32, 0x31, 0xE7, 0x82, 0xB9, 0x32, 0x32, ++ 0xE7, 0x82, 0xB9, 0x32, 0x33, 0xE7, 0x82, 0xB9, ++ 0x32, 0x34, 0xE7, 0x82, 0xB9, 0x68, 0x50, 0x61, ++ 0x64, 0x61, 0x41, 0x55, 0x62, 0x61, 0x72, 0x6F, ++ 0x56, 0x70, 0x63, 0xE5, 0xB9, 0xB3, 0xE6, 0x88, ++ 0x90, 0xE6, 0x98, 0xAD, 0xE5, 0x92, 0x8C, 0xE5, ++ 0xA4, 0xA7, 0xE6, 0xAD, 0xA3, 0xE6, 0x98, 0x8E, ++ 0xE6, 0xB2, 0xBB, 0xE6, 0xA0, 0xAA, 0xE5, 0xBC, ++ 0x8F, 0xE4, 0xBC, 0x9A, 0xE7, 0xA4, 0xBE, 0x70, ++ 0x41, 0x6E, 0x41, 0xCE, 0xBC, 0x41, 0x6D, 0x41, ++ 0x6B, 0x41, 0x4B, 0x42, 0x4D, 0x42, 0x47, 0x42, ++ 0x63, 0x61, 0x6C, 0x6B, 0x63, 0x61, 0x6C, 0x70, ++ 0x46, 0x6E, 0x46, 0xCE, 0xBC, 0x46, 0xCE, 0xBC, ++ 0x67, 0x6D, 0x67, 0x6B, 0x67, 0x48, 0x7A, 0x6B, ++ 0x48, 0x7A, 0x4D, 0x48, 0x7A, 0x47, 0x48, 0x7A, ++ 0x54, 0x48, 0x7A, 0xCE, 0xBC, 0x6C, 0x6D, 0x6C, ++ 0x64, 0x6C, 0x6B, 0x6C, 0x66, 0x6D, 0x6E, 0x6D, ++ 0xCE, 0xBC, 0x6D, 0x6D, 0x6D, 0x63, 0x6D, 0x6B, ++ 0x6D, 0x6D, 0x6D, 0x32, 0x63, 0x6D, 0x32, 0x6D, ++ 0x32, 0x6B, 0x6D, 0x32, 0x6D, 0x6D, 0x33, 0x63, ++ 0x6D, 0x33, 0x6D, 0x33, 0x6B, 0x6D, 0x33, 0x6D, ++ 0xE2, 0x88, 0x95, 0x73, 0x6D, 0xE2, 0x88, 0x95, ++ 0x73, 0x32, 0x50, 0x61, 0x6B, 0x50, 0x61, 0x4D, ++ 0x50, 0x61, 0x47, 0x50, 0x61, 0x72, 0x61, 0x64, ++ 0x72, 0x61, 0x64, 0xE2, 0x88, 0x95, 0x73, 0x72, ++ 0x61, 0x64, 0xE2, 0x88, 0x95, 0x73, 0x32, 0x70, ++ 0x73, 0x6E, 0x73, 0xCE, 0xBC, 0x73, 0x6D, 0x73, ++ 0x70, 0x56, 0x6E, 0x56, 0xCE, 0xBC, 0x56, 0x6D, ++ 0x56, 0x6B, 0x56, 0x4D, 0x56, 0x70, 0x57, 0x6E, ++ 0x57, 0xCE, 0xBC, 0x57, 0x6D, 0x57, 0x6B, 0x57, ++ 0x4D, 0x57, 0x6B, 0xCE, 0xA9, 0x4D, 0xCE, 0xA9, ++ 0x61, 0x2E, 0x6D, 0x2E, 0x42, 0x71, 0x63, 0x63, ++ 0x63, 0x64, 0x43, 0xE2, 0x88, 0x95, 0x6B, 0x67, ++ 0x43, 0x6F, 0x2E, 0x64, 0x42, 0x47, 0x79, 0x68, ++ 0x61, 0x48, 0x50, 0x69, 0x6E, 0x4B, 0x4B, 0x4B, ++ 0x4D, 0x6B, 0x74, 0x6C, 0x6D, 0x6C, 0x6E, 0x6C, ++ 0x6F, 0x67, 0x6C, 0x78, 0x6D, 0x62, 0x6D, 0x69, ++ 0x6C, 0x6D, 0x6F, 0x6C, 0x50, 0x48, 0x70, 0x2E, ++ 0x6D, 0x2E, 0x50, 0x50, 0x4D, 0x50, 0x52, 0x73, ++ 0x72, 0x53, 0x76, 0x57, 0x62, 0x31, 0xE6, 0x97, ++ 0xA5, 0x32, 0xE6, 0x97, 0xA5, 0x33, 0xE6, 0x97, ++ 0xA5, 0x34, 0xE6, 0x97, 0xA5, 0x35, 0xE6, 0x97, ++ 0xA5, 0x36, 0xE6, 0x97, 0xA5, 0x37, 0xE6, 0x97, ++ 0xA5, 0x38, 0xE6, 0x97, 0xA5, 0x39, 0xE6, 0x97, ++ 0xA5, 0x31, 0x30, 0xE6, 0x97, 0xA5, 0x31, 0x31, ++ 0xE6, 0x97, 0xA5, 0x31, 0x32, 0xE6, 0x97, 0xA5, ++ 0x31, 0x33, 0xE6, 0x97, 0xA5, 0x31, 0x34, 0xE6, ++ 0x97, 0xA5, 0x31, 0x35, 0xE6, 0x97, 0xA5, 0x31, ++ 0x36, 
0xE6, 0x97, 0xA5, 0x31, 0x37, 0xE6, 0x97, ++ 0xA5, 0x31, 0x38, 0xE6, 0x97, 0xA5, 0x31, 0x39, ++ 0xE6, 0x97, 0xA5, 0x32, 0x30, 0xE6, 0x97, 0xA5, ++ 0x32, 0x31, 0xE6, 0x97, 0xA5, 0x32, 0x32, 0xE6, ++ 0x97, 0xA5, 0x32, 0x33, 0xE6, 0x97, 0xA5, 0x32, ++ 0x34, 0xE6, 0x97, 0xA5, 0x32, 0x35, 0xE6, 0x97, ++ 0xA5, 0x32, 0x36, 0xE6, 0x97, 0xA5, 0x32, 0x37, ++ 0xE6, 0x97, 0xA5, 0x32, 0x38, 0xE6, 0x97, 0xA5, ++ 0x32, 0x39, 0xE6, 0x97, 0xA5, 0x33, 0x30, 0xE6, ++ 0x97, 0xA5, 0x33, 0x31, 0xE6, 0x97, 0xA5, 0xF6, ++ 0xE8, 0xB1, 0x88, 0xF6, 0xE6, 0x9B, 0xB4, 0xF6, ++ 0xE8, 0xBB, 0x8A, 0xF6, 0xE8, 0xB3, 0x88, 0xF6, ++ 0xE6, 0xBB, 0x91, 0xF6, 0xE4, 0xB8, 0xB2, 0xF6, ++ 0xE5, 0x8F, 0xA5, 0xF6, 0xE9, 0xBE, 0x9C, 0xF6, ++ 0xE9, 0xBE, 0x9C, 0xF6, 0xE5, 0xA5, 0x91, 0xF6, ++ 0xE9, 0x87, 0x91, 0xF6, 0xE5, 0x96, 0x87, 0xF6, ++ 0xE5, 0xA5, 0x88, 0xF6, 0xE6, 0x87, 0xB6, 0xF6, ++ 0xE7, 0x99, 0xA9, 0xF6, 0xE7, 0xBE, 0x85, 0xF6, ++ 0xE8, 0x98, 0xBF, 0xF6, 0xE8, 0x9E, 0xBA, 0xF6, ++ 0xE8, 0xA3, 0xB8, 0xF6, 0xE9, 0x82, 0x8F, 0xF6, ++ 0xE6, 0xA8, 0x82, 0xF6, 0xE6, 0xB4, 0x9B, 0xF6, ++ 0xE7, 0x83, 0x99, 0xF6, 0xE7, 0x8F, 0x9E, 0xF6, ++ 0xE8, 0x90, 0xBD, 0xF6, 0xE9, 0x85, 0xAA, 0xF6, ++ 0xE9, 0xA7, 0xB1, 0xF6, 0xE4, 0xBA, 0x82, 0xF6, ++ 0xE5, 0x8D, 0xB5, 0xF6, 0xE6, 0xAC, 0x84, 0xF6, ++ 0xE7, 0x88, 0x9B, 0xF6, 0xE8, 0x98, 0xAD, 0xF6, ++ 0xE9, 0xB8, 0x9E, 0xF6, 0xE5, 0xB5, 0x90, 0xF6, ++ 0xE6, 0xBF, 0xAB, 0xF6, 0xE8, 0x97, 0x8D, 0xF6, ++ 0xE8, 0xA5, 0xA4, 0xF6, 0xE6, 0x8B, 0x89, 0xF6, ++ 0xE8, 0x87, 0x98, 0xF6, 0xE8, 0xA0, 0x9F, 0xF6, ++ 0xE5, 0xBB, 0x8A, 0xF6, 0xE6, 0x9C, 0x97, 0xF6, ++ 0xE6, 0xB5, 0xAA, 0xF6, 0xE7, 0x8B, 0xBC, 0xF6, ++ 0xE9, 0x83, 0x8E, 0xF6, 0xE4, 0xBE, 0x86, 0xF6, ++ 0xE5, 0x86, 0xB7, 0xF6, 0xE5, 0x8B, 0x9E, 0xF6, ++ 0xE6, 0x93, 0x84, 0xF6, 0xE6, 0xAB, 0x93, 0xF6, ++ 0xE7, 0x88, 0x90, 0xF6, 0xE7, 0x9B, 0xA7, 0xF6, ++ 0xE8, 0x80, 0x81, 0xF6, 0xE8, 0x98, 0x86, 0xF6, ++ 0xE8, 0x99, 0x9C, 0xF6, 0xE8, 0xB7, 0xAF, 0xF6, ++ 0xE9, 0x9C, 0xB2, 0xF6, 0xE9, 0xAD, 0xAF, 0xF6, ++ 0xE9, 0xB7, 0xBA, 0xF6, 0xE7, 0xA2, 0x8C, 0xF6, ++ 0xE7, 0xA5, 0xBF, 0xF6, 0xE7, 0xB6, 0xA0, 0xF6, ++ 0xE8, 0x8F, 0x89, 0xF6, 0xE9, 0x8C, 0x84, 0xF6, ++ 0xE9, 0xB9, 0xBF, 0xF6, 0xE8, 0xAB, 0x96, 0xF6, ++ 0xE5, 0xA3, 0x9F, 0xF6, 0xE5, 0xBC, 0x84, 0xF6, ++ 0xE7, 0xB1, 0xA0, 0xF6, 0xE8, 0x81, 0xBE, 0xF6, ++ 0xE7, 0x89, 0xA2, 0xF6, 0xE7, 0xA3, 0x8A, 0xF6, ++ 0xE8, 0xB3, 0x82, 0xF6, 0xE9, 0x9B, 0xB7, 0xF6, ++ 0xE5, 0xA3, 0x98, 0xF6, 0xE5, 0xB1, 0xA2, 0xF6, ++ 0xE6, 0xA8, 0x93, 0xF6, 0xE6, 0xB7, 0x9A, 0xF6, ++ 0xE6, 0xBC, 0x8F, 0xF6, 0xE7, 0xB4, 0xAF, 0xF6, ++ 0xE7, 0xB8, 0xB7, 0xF6, 0xE9, 0x99, 0x8B, 0xF6, ++ 0xE5, 0x8B, 0x92, 0xF6, 0xE8, 0x82, 0x8B, 0xF6, ++ 0xE5, 0x87, 0x9C, 0xF6, 0xE5, 0x87, 0x8C, 0xF6, ++ 0xE7, 0xA8, 0x9C, 0xF6, 0xE7, 0xB6, 0xBE, 0xF6, ++ 0xE8, 0x8F, 0xB1, 0xF6, 0xE9, 0x99, 0xB5, 0xF6, ++ 0xE8, 0xAE, 0x80, 0xF6, 0xE6, 0x8B, 0x8F, 0xF6, ++ 0xE6, 0xA8, 0x82, 0xF6, 0xE8, 0xAB, 0xBE, 0xF6, ++ 0xE4, 0xB8, 0xB9, 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, ++ 0xE6, 0x80, 0x92, 0xF6, 0xE7, 0x8E, 0x87, 0xF6, ++ 0xE7, 0x95, 0xB0, 0xF6, 0xE5, 0x8C, 0x97, 0xF6, ++ 0xE7, 0xA3, 0xBB, 0xF6, 0xE4, 0xBE, 0xBF, 0xF6, ++ 0xE5, 0xBE, 0xA9, 0xF6, 0xE4, 0xB8, 0x8D, 0xF6, ++ 0xE6, 0xB3, 0x8C, 0xF6, 0xE6, 0x95, 0xB8, 0xF6, ++ 0xE7, 0xB4, 0xA2, 0xF6, 0xE5, 0x8F, 0x83, 0xF6, ++ 0xE5, 0xA1, 0x9E, 0xF6, 0xE7, 0x9C, 0x81, 0xF6, ++ 0xE8, 0x91, 0x89, 0xF6, 0xE8, 0xAA, 0xAA, 0xF6, ++ 0xE6, 0xAE, 0xBA, 0xF6, 0xE8, 0xBE, 0xB0, 0xF6, ++ 0xE6, 0xB2, 0x88, 0xF6, 0xE6, 0x8B, 0xBE, 0xF6, ++ 0xE8, 0x8B, 0xA5, 0xF6, 0xE6, 0x8E, 0xA0, 0xF6, ++ 0xE7, 0x95, 0xA5, 0xF6, 0xE4, 0xBA, 0xAE, 
0xF6, ++ 0xE5, 0x85, 0xA9, 0xF6, 0xE5, 0x87, 0x89, 0xF6, ++ 0xE6, 0xA2, 0x81, 0xF6, 0xE7, 0xB3, 0xA7, 0xF6, ++ 0xE8, 0x89, 0xAF, 0xF6, 0xE8, 0xAB, 0x92, 0xF6, ++ 0xE9, 0x87, 0x8F, 0xF6, 0xE5, 0x8B, 0xB5, 0xF6, ++ 0xE5, 0x91, 0x82, 0xF6, 0xE5, 0xA5, 0xB3, 0xF6, ++ 0xE5, 0xBB, 0xAC, 0xF6, 0xE6, 0x97, 0x85, 0xF6, ++ 0xE6, 0xBF, 0xBE, 0xF6, 0xE7, 0xA4, 0xAA, 0xF6, ++ 0xE9, 0x96, 0xAD, 0xF6, 0xE9, 0xA9, 0xAA, 0xF6, ++ 0xE9, 0xBA, 0x97, 0xF6, 0xE9, 0xBB, 0x8E, 0xF6, ++ 0xE5, 0x8A, 0x9B, 0xF6, 0xE6, 0x9B, 0x86, 0xF6, ++ 0xE6, 0xAD, 0xB7, 0xF6, 0xE8, 0xBD, 0xA2, 0xF6, ++ 0xE5, 0xB9, 0xB4, 0xF6, 0xE6, 0x86, 0x90, 0xF6, ++ 0xE6, 0x88, 0x80, 0xF6, 0xE6, 0x92, 0x9A, 0xF6, ++ 0xE6, 0xBC, 0xA3, 0xF6, 0xE7, 0x85, 0x89, 0xF6, ++ 0xE7, 0x92, 0x89, 0xF6, 0xE7, 0xA7, 0x8A, 0xF6, ++ 0xE7, 0xB7, 0xB4, 0xF6, 0xE8, 0x81, 0xAF, 0xF6, ++ 0xE8, 0xBC, 0xA6, 0xF6, 0xE8, 0x93, 0xAE, 0xF6, ++ 0xE9, 0x80, 0xA3, 0xF6, 0xE9, 0x8D, 0x8A, 0xF6, ++ 0xE5, 0x88, 0x97, 0xF6, 0xE5, 0x8A, 0xA3, 0xF6, ++ 0xE5, 0x92, 0xBD, 0xF6, 0xE7, 0x83, 0x88, 0xF6, ++ 0xE8, 0xA3, 0x82, 0xF6, 0xE8, 0xAA, 0xAA, 0xF6, ++ 0xE5, 0xBB, 0x89, 0xF6, 0xE5, 0xBF, 0xB5, 0xF6, ++ 0xE6, 0x8D, 0xBB, 0xF6, 0xE6, 0xAE, 0xAE, 0xF6, ++ 0xE7, 0xB0, 0xBE, 0xF6, 0xE7, 0x8D, 0xB5, 0xF6, ++ 0xE4, 0xBB, 0xA4, 0xF6, 0xE5, 0x9B, 0xB9, 0xF6, ++ 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, 0xB6, 0xBA, 0xF6, ++ 0xE6, 0x80, 0x9C, 0xF6, 0xE7, 0x8E, 0xB2, 0xF6, ++ 0xE7, 0x91, 0xA9, 0xF6, 0xE7, 0xBE, 0x9A, 0xF6, ++ 0xE8, 0x81, 0x86, 0xF6, 0xE9, 0x88, 0xB4, 0xF6, ++ 0xE9, 0x9B, 0xB6, 0xF6, 0xE9, 0x9D, 0x88, 0xF6, ++ 0xE9, 0xA0, 0x98, 0xF6, 0xE4, 0xBE, 0x8B, 0xF6, ++ 0xE7, 0xA6, 0xAE, 0xF6, 0xE9, 0x86, 0xB4, 0xF6, ++ 0xE9, 0x9A, 0xB8, 0xF6, 0xE6, 0x83, 0xA1, 0xF6, ++ 0xE4, 0xBA, 0x86, 0xF6, 0xE5, 0x83, 0x9A, 0xF6, ++ 0xE5, 0xAF, 0xAE, 0xF6, 0xE5, 0xB0, 0xBF, 0xF6, ++ 0xE6, 0x96, 0x99, 0xF6, 0xE6, 0xA8, 0x82, 0xF6, ++ 0xE7, 0x87, 0x8E, 0xF6, 0xE7, 0x99, 0x82, 0xF6, ++ 0xE8, 0x93, 0xBC, 0xF6, 0xE9, 0x81, 0xBC, 0xF6, ++ 0xE9, 0xBE, 0x8D, 0xF6, 0xE6, 0x9A, 0x88, 0xF6, ++ 0xE9, 0x98, 0xAE, 0xF6, 0xE5, 0x8A, 0x89, 0xF6, ++ 0xE6, 0x9D, 0xBB, 0xF6, 0xE6, 0x9F, 0xB3, 0xF6, ++ 0xE6, 0xB5, 0x81, 0xF6, 0xE6, 0xBA, 0x9C, 0xF6, ++ 0xE7, 0x90, 0x89, 0xF6, 0xE7, 0x95, 0x99, 0xF6, ++ 0xE7, 0xA1, 0xAB, 0xF6, 0xE7, 0xB4, 0x90, 0xF6, ++ 0xE9, 0xA1, 0x9E, 0xF6, 0xE5, 0x85, 0xAD, 0xF6, ++ 0xE6, 0x88, 0xAE, 0xF6, 0xE9, 0x99, 0xB8, 0xF6, ++ 0xE5, 0x80, 0xAB, 0xF6, 0xE5, 0xB4, 0x99, 0xF6, ++ 0xE6, 0xB7, 0xAA, 0xF6, 0xE8, 0xBC, 0xAA, 0xF6, ++ 0xE5, 0xBE, 0x8B, 0xF6, 0xE6, 0x85, 0x84, 0xF6, ++ 0xE6, 0xA0, 0x97, 0xF6, 0xE7, 0x8E, 0x87, 0xF6, ++ 0xE9, 0x9A, 0x86, 0xF6, 0xE5, 0x88, 0xA9, 0xF6, ++ 0xE5, 0x90, 0x8F, 0xF6, 0xE5, 0xB1, 0xA5, 0xF6, ++ 0xE6, 0x98, 0x93, 0xF6, 0xE6, 0x9D, 0x8E, 0xF6, ++ 0xE6, 0xA2, 0xA8, 0xF6, 0xE6, 0xB3, 0xA5, 0xF6, ++ 0xE7, 0x90, 0x86, 0xF6, 0xE7, 0x97, 0xA2, 0xF6, ++ 0xE7, 0xBD, 0xB9, 0xF6, 0xE8, 0xA3, 0x8F, 0xF6, ++ 0xE8, 0xA3, 0xA1, 0xF6, 0xE9, 0x87, 0x8C, 0xF6, ++ 0xE9, 0x9B, 0xA2, 0xF6, 0xE5, 0x8C, 0xBF, 0xF6, ++ 0xE6, 0xBA, 0xBA, 0xF6, 0xE5, 0x90, 0x9D, 0xF6, ++ 0xE7, 0x87, 0x90, 0xF6, 0xE7, 0x92, 0x98, 0xF6, ++ 0xE8, 0x97, 0xBA, 0xF6, 0xE9, 0x9A, 0xA3, 0xF6, ++ 0xE9, 0xB1, 0x97, 0xF6, 0xE9, 0xBA, 0x9F, 0xF6, ++ 0xE6, 0x9E, 0x97, 0xF6, 0xE6, 0xB7, 0x8B, 0xF6, ++ 0xE8, 0x87, 0xA8, 0xF6, 0xE7, 0xAB, 0x8B, 0xF6, ++ 0xE7, 0xAC, 0xA0, 0xF6, 0xE7, 0xB2, 0x92, 0xF6, ++ 0xE7, 0x8B, 0x80, 0xF6, 0xE7, 0x82, 0x99, 0xF6, ++ 0xE8, 0xAD, 0x98, 0xF6, 0xE4, 0xBB, 0x80, 0xF6, ++ 0xE8, 0x8C, 0xB6, 0xF6, 0xE5, 0x88, 0xBA, 0xF6, ++ 0xE5, 0x88, 0x87, 0xF6, 0xE5, 0xBA, 0xA6, 0xF6, ++ 0xE6, 0x8B, 0x93, 0xF6, 
0xE7, 0xB3, 0x96, 0xF6, ++ 0xE5, 0xAE, 0x85, 0xF6, 0xE6, 0xB4, 0x9E, 0xF6, ++ 0xE6, 0x9A, 0xB4, 0xF6, 0xE8, 0xBC, 0xBB, 0xF6, ++ 0xE8, 0xA1, 0x8C, 0xF6, 0xE9, 0x99, 0x8D, 0xF6, ++ 0xE8, 0xA6, 0x8B, 0xF6, 0xE5, 0xBB, 0x93, 0xF6, ++ 0xE5, 0x85, 0x80, 0xF6, 0xE5, 0x97, 0x80, 0xF6, ++ 0xE5, 0xA1, 0x9A, 0xF6, 0xE6, 0x99, 0xB4, 0xF6, ++ 0xE5, 0x87, 0x9E, 0xF6, 0xE7, 0x8C, 0xAA, 0xF6, ++ 0xE7, 0x9B, 0x8A, 0xF6, 0xE7, 0xA4, 0xBC, 0xF6, ++ 0xE7, 0xA5, 0x9E, 0xF6, 0xE7, 0xA5, 0xA5, 0xF6, ++ 0xE7, 0xA6, 0x8F, 0xF6, 0xE9, 0x9D, 0x96, 0xF6, ++ 0xE7, 0xB2, 0xBE, 0xF6, 0xE7, 0xBE, 0xBD, 0xF6, ++ 0xE8, 0x98, 0x92, 0xF6, 0xE8, 0xAB, 0xB8, 0xF6, ++ 0xE9, 0x80, 0xB8, 0xF6, 0xE9, 0x83, 0xBD, 0xF6, ++ 0xE9, 0xA3, 0xAF, 0xF6, 0xE9, 0xA3, 0xBC, 0xF6, ++ 0xE9, 0xA4, 0xA8, 0xF6, 0xE9, 0xB6, 0xB4, 0xF6, ++ 0xE4, 0xBE, 0xAE, 0xF6, 0xE5, 0x83, 0xA7, 0xF6, ++ 0xE5, 0x85, 0x8D, 0xF6, 0xE5, 0x8B, 0x89, 0xF6, ++ 0xE5, 0x8B, 0xA4, 0xF6, 0xE5, 0x8D, 0x91, 0xF6, ++ 0xE5, 0x96, 0x9D, 0xF6, 0xE5, 0x98, 0x86, 0xF6, ++ 0xE5, 0x99, 0xA8, 0xF6, 0xE5, 0xA1, 0x80, 0xF6, ++ 0xE5, 0xA2, 0xA8, 0xF6, 0xE5, 0xB1, 0xA4, 0xF6, ++ 0xE5, 0xB1, 0xAE, 0xF6, 0xE6, 0x82, 0x94, 0xF6, ++ 0xE6, 0x85, 0xA8, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, ++ 0xE6, 0x87, 0xB2, 0xF6, 0xE6, 0x95, 0x8F, 0xF6, ++ 0xE6, 0x97, 0xA2, 0xF6, 0xE6, 0x9A, 0x91, 0xF6, ++ 0xE6, 0xA2, 0x85, 0xF6, 0xE6, 0xB5, 0xB7, 0xF6, ++ 0xE6, 0xB8, 0x9A, 0xF6, 0xE6, 0xBC, 0xA2, 0xF6, ++ 0xE7, 0x85, 0xAE, 0xF6, 0xE7, 0x88, 0xAB, 0xF6, ++ 0xE7, 0x90, 0xA2, 0xF6, 0xE7, 0xA2, 0x91, 0xF6, ++ 0xE7, 0xA4, 0xBE, 0xF6, 0xE7, 0xA5, 0x89, 0xF6, ++ 0xE7, 0xA5, 0x88, 0xF6, 0xE7, 0xA5, 0x90, 0xF6, ++ 0xE7, 0xA5, 0x96, 0xF6, 0xE7, 0xA5, 0x9D, 0xF6, ++ 0xE7, 0xA6, 0x8D, 0xF6, 0xE7, 0xA6, 0x8E, 0xF6, ++ 0xE7, 0xA9, 0x80, 0xF6, 0xE7, 0xAA, 0x81, 0xF6, ++ 0xE7, 0xAF, 0x80, 0xF6, 0xE7, 0xB7, 0xB4, 0xF6, ++ 0xE7, 0xB8, 0x89, 0xF6, 0xE7, 0xB9, 0x81, 0xF6, ++ 0xE7, 0xBD, 0xB2, 0xF6, 0xE8, 0x80, 0x85, 0xF6, ++ 0xE8, 0x87, 0xAD, 0xF6, 0xE8, 0x89, 0xB9, 0xF6, ++ 0xE8, 0x89, 0xB9, 0xF6, 0xE8, 0x91, 0x97, 0xF6, ++ 0xE8, 0xA4, 0x90, 0xF6, 0xE8, 0xA6, 0x96, 0xF6, ++ 0xE8, 0xAC, 0x81, 0xF6, 0xE8, 0xAC, 0xB9, 0xF6, ++ 0xE8, 0xB3, 0x93, 0xF6, 0xE8, 0xB4, 0x88, 0xF6, ++ 0xE8, 0xBE, 0xB6, 0xF6, 0xE9, 0x80, 0xB8, 0xF6, ++ 0xE9, 0x9B, 0xA3, 0xF6, 0xE9, 0x9F, 0xBF, 0xF6, ++ 0xE9, 0xA0, 0xBB, 0x66, 0x66, 0x66, 0x69, 0x66, ++ 0x6C, 0x66, 0x66, 0x69, 0x66, 0x66, 0x6C, 0x73, ++ 0x74, 0x73, 0x74, 0xD5, 0xB4, 0xD5, 0xB6, 0xD5, ++ 0xB4, 0xD5, 0xA5, 0xD5, 0xB4, 0xD5, 0xAB, 0xD5, ++ 0xBE, 0xD5, 0xB6, 0xD5, 0xB4, 0xD5, 0xAD, 0xF6, ++ 0xD7, 0x99, 0xD6, 0xB4, 0xF6, 0xD7, 0xB2, 0xD6, ++ 0xB7, 0xD7, 0xA2, 0xD7, 0x90, 0xD7, 0x93, 0xD7, ++ 0x94, 0xD7, 0x9B, 0xD7, 0x9C, 0xD7, 0x9D, 0xD7, ++ 0xA8, 0xD7, 0xAA, 0x2B, 0xF6, 0xD7, 0xA9, 0xD7, ++ 0x81, 0xF6, 0xD7, 0xA9, 0xD7, 0x82, 0xF6, 0xD7, ++ 0xA9, 0xD6, 0xBC, 0xD7, 0x81, 0xF6, 0xD7, 0xA9, ++ 0xD6, 0xBC, 0xD7, 0x82, 0xF6, 0xD7, 0x90, 0xD6, ++ 0xB7, 0xF6, 0xD7, 0x90, 0xD6, 0xB8, 0xF6, 0xD7, ++ 0x90, 0xD6, 0xBC, 0xF6, 0xD7, 0x91, 0xD6, 0xBC, ++ 0xF6, 0xD7, 0x92, 0xD6, 0xBC, 0xF6, 0xD7, 0x93, ++ 0xD6, 0xBC, 0xF6, 0xD7, 0x94, 0xD6, 0xBC, 0xF6, ++ 0xD7, 0x95, 0xD6, 0xBC, 0xF6, 0xD7, 0x96, 0xD6, ++ 0xBC, 0xF6, 0xD7, 0x98, 0xD6, 0xBC, 0xF6, 0xD7, ++ 0x99, 0xD6, 0xBC, 0xF6, 0xD7, 0x9A, 0xD6, 0xBC, ++ 0xF6, 0xD7, 0x9B, 0xD6, 0xBC, 0xF6, 0xD7, 0x9C, ++ 0xD6, 0xBC, 0xF6, 0xD7, 0x9E, 0xD6, 0xBC, 0xF6, ++ 0xD7, 0xA0, 0xD6, 0xBC, 0xF6, 0xD7, 0xA1, 0xD6, ++ 0xBC, 0xF6, 0xD7, 0xA3, 0xD6, 0xBC, 0xF6, 0xD7, ++ 0xA4, 0xD6, 0xBC, 0xF6, 0xD7, 0xA6, 0xD6, 0xBC, ++ 0xF6, 0xD7, 0xA7, 0xD6, 0xBC, 0xF6, 0xD7, 0xA8, ++ 0xD6, 
0xBC, 0xF6, 0xD7, 0xA9, 0xD6, 0xBC, 0xF6, ++ 0xD7, 0xAA, 0xD6, 0xBC, 0xF6, 0xD7, 0x95, 0xD6, ++ 0xB9, 0xF6, 0xD7, 0x91, 0xD6, 0xBF, 0xF6, 0xD7, ++ 0x9B, 0xD6, 0xBF, 0xF6, 0xD7, 0xA4, 0xD6, 0xBF, ++ 0xD7, 0x90, 0xD7, 0x9C, 0xD9, 0xB1, 0xD9, 0xB1, ++ 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, 0xBB, ++ 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, 0xBE, ++ 0xDA, 0x80, 0xDA, 0x80, 0xDA, 0x80, 0xDA, 0x80, ++ 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, 0xBA, ++ 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, 0xBF, ++ 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, 0xB9, ++ 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, 0xA4, ++ 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, 0xA6, ++ 0xDA, 0x84, 0xDA, 0x84, 0xDA, 0x84, 0xDA, 0x84, ++ 0xDA, 0x83, 0xDA, 0x83, 0xDA, 0x83, 0xDA, 0x83, ++ 0xDA, 0x86, 0xDA, 0x86, 0xDA, 0x86, 0xDA, 0x86, ++ 0xDA, 0x87, 0xDA, 0x87, 0xDA, 0x87, 0xDA, 0x87, ++ 0xDA, 0x8D, 0xDA, 0x8D, 0xDA, 0x8C, 0xDA, 0x8C, ++ 0xDA, 0x8E, 0xDA, 0x8E, 0xDA, 0x88, 0xDA, 0x88, ++ 0xDA, 0x98, 0xDA, 0x98, 0xDA, 0x91, 0xDA, 0x91, ++ 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, 0xA9, ++ 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, 0xAF, ++ 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, 0xB3, ++ 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, 0xB1, ++ 0xDA, 0xBA, 0xDA, 0xBA, 0xDA, 0xBB, 0xDA, 0xBB, ++ 0xDA, 0xBB, 0xDA, 0xBB, 0xDB, 0x95, 0xD9, 0x94, ++ 0xDB, 0x95, 0xD9, 0x94, 0xDB, 0x81, 0xDB, 0x81, ++ 0xDB, 0x81, 0xDB, 0x81, 0xDA, 0xBE, 0xDA, 0xBE, ++ 0xDA, 0xBE, 0xDA, 0xBE, 0xDB, 0x92, 0xDB, 0x92, ++ 0xDB, 0x92, 0xD9, 0x94, 0xDB, 0x92, 0xD9, 0x94, ++ 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, 0xAD, ++ 0xDB, 0x87, 0xDB, 0x87, 0xDB, 0x86, 0xDB, 0x86, ++ 0xDB, 0x88, 0xDB, 0x88, 0xDB, 0x87, 0xD9, 0xB4, ++ 0xDB, 0x8B, 0xDB, 0x8B, 0xDB, 0x85, 0xDB, 0x85, ++ 0xDB, 0x89, 0xDB, 0x89, 0xDB, 0x90, 0xDB, 0x90, ++ 0xDB, 0x90, 0xDB, 0x90, 0xD9, 0x89, 0xD9, 0x89, ++ 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x8A, ++ 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x8A, 0xD9, 0x94, ++ 0xDB, 0x95, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x95, ++ 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x8A, ++ 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x8A, 0xD9, 0x94, ++ 0xDB, 0x87, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x87, ++ 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x86, 0xD9, 0x8A, ++ 0xD9, 0x94, 0xDB, 0x86, 0xD9, 0x8A, 0xD9, 0x94, ++ 0xDB, 0x88, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x88, ++ 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x90, 0xD9, 0x8A, ++ 0xD9, 0x94, 0xDB, 0x90, 0xD9, 0x8A, 0xD9, 0x94, ++ 0xDB, 0x90, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x89, ++ 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xD9, 0x8A, ++ 0xD9, 0x94, 0xD9, 0x89, 0xDB, 0x8C, 0xDB, 0x8C, ++ 0xDB, 0x8C, 0xDB, 0x8C, 0xD9, 0x8A, 0xD9, 0x94, ++ 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAD, ++ 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, ++ 0xD9, 0x94, 0xD9, 0x89, 0xD9, 0x8A, 0xD9, 0x94, ++ 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, 0xA8, ++ 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, 0xA8, ++ 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x89, 0xD8, 0xA8, ++ 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD8, 0xAA, ++ 0xD8, 0xAD, 0xD8, 0xAA, 0xD8, 0xAE, 0xD8, 0xAA, ++ 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x89, 0xD8, 0xAA, ++ 0xD9, 0x8A, 0xD8, 0xAB, 0xD8, 0xAC, 0xD8, 0xAB, ++ 0xD9, 0x85, 0xD8, 0xAB, 0xD9, 0x89, 0xD8, 0xAB, ++ 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, ++ 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, 0xAD, ++ 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD8, 0xAE, ++ 0xD8, 0xAD, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB3, ++ 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, ++ 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xB5, ++ 0xD8, 0xAD, 0xD8, 0xB5, 0xD9, 0x85, 0xD8, 0xB6, ++ 0xD8, 0xAC, 0xD8, 0xB6, 0xD8, 0xAD, 0xD8, 
0xB6, ++ 0xD8, 0xAE, 0xD8, 0xB6, 0xD9, 0x85, 0xD8, 0xB7, ++ 0xD8, 0xAD, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xB8, ++ 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, 0xB9, ++ 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, 0xBA, ++ 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, 0x81, ++ 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, 0x81, ++ 0xD9, 0x85, 0xD9, 0x81, 0xD9, 0x89, 0xD9, 0x81, ++ 0xD9, 0x8A, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, 0x82, ++ 0xD9, 0x85, 0xD9, 0x82, 0xD9, 0x89, 0xD9, 0x82, ++ 0xD9, 0x8A, 0xD9, 0x83, 0xD8, 0xA7, 0xD9, 0x83, ++ 0xD8, 0xAC, 0xD9, 0x83, 0xD8, 0xAD, 0xD9, 0x83, ++ 0xD8, 0xAE, 0xD9, 0x83, 0xD9, 0x84, 0xD9, 0x83, ++ 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x89, 0xD9, 0x83, ++ 0xD9, 0x8A, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x84, ++ 0xD8, 0xAD, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x84, ++ 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x89, 0xD9, 0x84, ++ 0xD9, 0x8A, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x85, ++ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, 0x85, ++ 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x89, 0xD9, 0x85, ++ 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x86, ++ 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, 0x86, ++ 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x89, 0xD9, 0x86, ++ 0xD9, 0x8A, 0xD9, 0x87, 0xD8, 0xAC, 0xD9, 0x87, ++ 0xD9, 0x85, 0xD9, 0x87, 0xD9, 0x89, 0xD9, 0x87, ++ 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, 0x8A, ++ 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, 0x8A, ++ 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x89, 0xD9, 0x8A, ++ 0xD9, 0x8A, 0xD8, 0xB0, 0xD9, 0xB0, 0xD8, 0xB1, ++ 0xD9, 0xB0, 0xD9, 0x89, 0xD9, 0xB0, 0x20, 0xD9, ++ 0x8C, 0xD9, 0x91, 0x20, 0xD9, 0x8D, 0xD9, 0x91, ++ 0x20, 0xD9, 0x8E, 0xD9, 0x91, 0x20, 0xD9, 0x8F, ++ 0xD9, 0x91, 0x20, 0xD9, 0x90, 0xD9, 0x91, 0x20, ++ 0xD9, 0x91, 0xD9, 0xB0, 0xD9, 0x8A, 0xD9, 0x94, ++ 0xD8, 0xB1, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xB2, ++ 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, ++ 0xD9, 0x94, 0xD9, 0x86, 0xD9, 0x8A, 0xD9, 0x94, ++ 0xD9, 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x8A, ++ 0xD8, 0xA8, 0xD8, 0xB1, 0xD8, 0xA8, 0xD8, 0xB2, ++ 0xD8, 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x86, ++ 0xD8, 0xA8, 0xD9, 0x89, 0xD8, 0xA8, 0xD9, 0x8A, ++ 0xD8, 0xAA, 0xD8, 0xB1, 0xD8, 0xAA, 0xD8, 0xB2, ++ 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x86, ++ 0xD8, 0xAA, 0xD9, 0x89, 0xD8, 0xAA, 0xD9, 0x8A, ++ 0xD8, 0xAB, 0xD8, 0xB1, 0xD8, 0xAB, 0xD8, 0xB2, ++ 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAB, 0xD9, 0x86, ++ 0xD8, 0xAB, 0xD9, 0x89, 0xD8, 0xAB, 0xD9, 0x8A, ++ 0xD9, 0x81, 0xD9, 0x89, 0xD9, 0x81, 0xD9, 0x8A, ++ 0xD9, 0x82, 0xD9, 0x89, 0xD9, 0x82, 0xD9, 0x8A, ++ 0xD9, 0x83, 0xD8, 0xA7, 0xD9, 0x83, 0xD9, 0x84, ++ 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x89, ++ 0xD9, 0x83, 0xD9, 0x8A, 0xD9, 0x84, 0xD9, 0x85, ++ 0xD9, 0x84, 0xD9, 0x89, 0xD9, 0x84, 0xD9, 0x8A, ++ 0xD9, 0x85, 0xD8, 0xA7, 0xD9, 0x85, 0xD9, 0x85, ++ 0xD9, 0x86, 0xD8, 0xB1, 0xD9, 0x86, 0xD8, 0xB2, ++ 0xD9, 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x86, ++ 0xD9, 0x86, 0xD9, 0x89, 0xD9, 0x86, 0xD9, 0x8A, ++ 0xD9, 0x89, 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, 0xB1, ++ 0xD9, 0x8A, 0xD8, 0xB2, 0xD9, 0x8A, 0xD9, 0x85, ++ 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x8A, 0xD9, 0x89, ++ 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x94, ++ 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAD, ++ 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAE, 0xD9, 0x8A, ++ 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x94, ++ 0xD9, 0x87, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, 0xA8, ++ 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, 0xA8, ++ 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x87, 0xD8, 0xAA, ++ 0xD8, 0xAC, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAA, ++ 0xD8, 0xAE, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, ++ 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAC, ++ 0xD8, 0xAD, 0xD8, 0xAC, 
0xD9, 0x85, 0xD8, 0xAD, ++ 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, ++ 0xD8, 0xAC, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB3, ++ 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, ++ 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xB5, ++ 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAE, 0xD8, 0xB5, ++ 0xD9, 0x85, 0xD8, 0xB6, 0xD8, 0xAC, 0xD8, 0xB6, ++ 0xD8, 0xAD, 0xD8, 0xB6, 0xD8, 0xAE, 0xD8, 0xB6, ++ 0xD9, 0x85, 0xD8, 0xB7, 0xD8, 0xAD, 0xD8, 0xB8, ++ 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, 0xB9, ++ 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, 0xBA, ++ 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, 0x81, ++ 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, 0x81, ++ 0xD9, 0x85, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, 0x82, ++ 0xD9, 0x85, 0xD9, 0x83, 0xD8, 0xAC, 0xD9, 0x83, ++ 0xD8, 0xAD, 0xD9, 0x83, 0xD8, 0xAE, 0xD9, 0x83, ++ 0xD9, 0x84, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x84, ++ 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x84, ++ 0xD8, 0xAE, 0xD9, 0x84, 0xD9, 0x85, 0xD9, 0x84, ++ 0xD9, 0x87, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x85, ++ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, 0x85, ++ 0xD9, 0x85, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x86, ++ 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, 0x86, ++ 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9, 0x87, ++ 0xD8, 0xAC, 0xD9, 0x87, 0xD9, 0x85, 0xD9, 0x87, ++ 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, 0x8A, ++ 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, 0x8A, ++ 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x8A, ++ 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x94, ++ 0xD9, 0x87, 0xD8, 0xA8, 0xD9, 0x85, 0xD8, 0xA8, ++ 0xD9, 0x87, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, ++ 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAB, ++ 0xD9, 0x87, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xB3, ++ 0xD9, 0x87, 0xD8, 0xB4, 0xD9, 0x85, 0xD8, 0xB4, ++ 0xD9, 0x87, 0xD9, 0x83, 0xD9, 0x84, 0xD9, 0x83, ++ 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x85, 0xD9, 0x86, ++ 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9, 0x8A, ++ 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x80, ++ 0xD9, 0x8E, 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x8F, ++ 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x90, 0xD9, 0x91, ++ 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, 0x8A, ++ 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, 0x8A, ++ 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x8A, ++ 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, 0x8A, ++ 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, 0x8A, ++ 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, 0x8A, ++ 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x8A, ++ 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, 0x8A, ++ 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x8A, ++ 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, 0x8A, ++ 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD, ++ 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85, ++ 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, 0xB1, ++ 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, 0xB1, ++ 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, 0x8A, ++ 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, 0x8A, ++ 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x8A, ++ 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, 0x8A, ++ 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, 0x8A, ++ 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, 0x8A, ++ 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x8A, ++ 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, 0x8A, ++ 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x8A, ++ 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, 0x8A, ++ 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD, ++ 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85, ++ 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, 0xB1, ++ 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, 0xB1, ++ 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD, ++ 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85, ++ 0xD8, 
0xB3, 0xD9, 0x87, 0xD8, 0xB4, 0xD9, 0x87, ++ 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, 0xAC, ++ 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, 0xD8, 0xAE, ++ 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD, ++ 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB7, 0xD9, 0x85, ++ 0xD8, 0xB8, 0xD9, 0x85, 0xD8, 0xA7, 0xD9, 0x8B, ++ 0xD8, 0xA7, 0xD9, 0x8B, 0xD8, 0xAA, 0xD8, 0xAC, ++ 0xD9, 0x85, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAC, ++ 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, 0xAA, ++ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAA, 0xD8, 0xAE, ++ 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAC, ++ 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAA, ++ 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, 0x85, ++ 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, 0xAD, ++ 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xAD, ++ 0xD9, 0x85, 0xD9, 0x89, 0xD8, 0xB3, 0xD8, 0xAD, ++ 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAC, 0xD8, 0xAD, ++ 0xD8, 0xB3, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xB3, ++ 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, 0x85, ++ 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xAC, ++ 0xD8, 0xB3, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB3, ++ 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB5, 0xD8, 0xAD, ++ 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAD, 0xD8, 0xAD, ++ 0xD8, 0xB5, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB4, ++ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB4, 0xD8, 0xAD, ++ 0xD9, 0x85, 0xD8, 0xB4, 0xD8, 0xAC, 0xD9, 0x8A, ++ 0xD8, 0xB4, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xB4, ++ 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85, ++ 0xD9, 0x85, 0xD8, 0xB4, 0xD9, 0x85, 0xD9, 0x85, ++ 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xB6, ++ 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB6, 0xD8, 0xAE, ++ 0xD9, 0x85, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xAD, ++ 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xB7, ++ 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB7, 0xD9, 0x85, ++ 0xD9, 0x8A, 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, 0x85, ++ 0xD8, 0xB9, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB9, ++ 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, 0x85, ++ 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x85, 0xD9, 0x85, ++ 0xD8, 0xBA, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xBA, ++ 0xD9, 0x85, 0xD9, 0x89, 0xD9, 0x81, 0xD8, 0xAE, ++ 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, 0x85, ++ 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x82, ++ 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x84, 0xD8, 0xAD, ++ 0xD9, 0x85, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x8A, ++ 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x89, 0xD9, 0x84, ++ 0xD8, 0xAC, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAC, ++ 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x85, ++ 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x85, 0xD9, 0x84, ++ 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x84, 0xD9, 0x85, ++ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAC, ++ 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x85, ++ 0xD8, 0xAD, 0xD9, 0x8A, 0xD9, 0x85, 0xD8, 0xAC, ++ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x85, ++ 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, 0x85, ++ 0xD8, 0xAE, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xAC, ++ 0xD8, 0xAE, 0xD9, 0x87, 0xD9, 0x85, 0xD8, 0xAC, ++ 0xD9, 0x87, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x86, ++ 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x86, 0xD8, 0xAD, ++ 0xD9, 0x89, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x85, ++ 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, 0x86, ++ 0xD8, 0xAC, 0xD9, 0x89, 0xD9, 0x86, 0xD9, 0x85, ++ 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x85, 0xD9, 0x89, ++ 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x8A, ++ 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xA8, 0xD8, 0xAE, ++ 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, 0x8A, ++ 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAA, ++ 0xD8, 0xAE, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAE, ++ 0xD9, 0x89, 0xD8, 0xAA, 0xD9, 0x85, 0xD9, 0x8A, ++ 0xD8, 0xAA, 0xD9, 0x85, 0xD9, 0x89, 0xD8, 0xAC, ++ 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, 
0xAD, ++ 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, 0x89, ++ 0xD8, 0xB3, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xB5, ++ 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xB4, 0xD8, 0xAD, ++ 0xD9, 0x8A, 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, 0x8A, ++ 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x84, ++ 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAD, ++ 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, 0x8A, ++ 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x85, ++ 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x82, 0xD9, 0x85, ++ 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAD, 0xD9, 0x8A, ++ 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x84, ++ 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, 0x85, ++ 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x8A, ++ 0xD9, 0x86, 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, 0x85, ++ 0xD8, 0xAE, 0xD9, 0x8A, 0xD9, 0x84, 0xD8, 0xAC, ++ 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x85, ++ 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, 0x86, ++ 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, 0xAD, ++ 0xD9, 0x8A, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x8A, ++ 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x81, ++ 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, 0xAD, ++ 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x85, ++ 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, 0xB5, ++ 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, 0xAE, ++ 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x8A, ++ 0xD8, 0xB5, 0xD9, 0x84, 0xDB, 0x92, 0xD9, 0x82, ++ 0xD9, 0x84, 0xDB, 0x92, 0xD8, 0xA7, 0xD9, 0x84, ++ 0xD9, 0x84, 0xD9, 0x87, 0xD8, 0xA7, 0xD9, 0x83, ++ 0xD8, 0xA8, 0xD8, 0xB1, 0xD9, 0x85, 0xD8, 0xAD, ++ 0xD9, 0x85, 0xD8, 0xAF, 0xD8, 0xB5, 0xD9, 0x84, ++ 0xD8, 0xB9, 0xD9, 0x85, 0xD8, 0xB1, 0xD8, 0xB3, ++ 0xD9, 0x88, 0xD9, 0x84, 0xD8, 0xB9, 0xD9, 0x84, ++ 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x88, 0xD8, 0xB3, ++ 0xD9, 0x84, 0xD9, 0x85, 0xD8, 0xB5, 0xD9, 0x84, ++ 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x84, 0xD9, 0x89, ++ 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9, 0x84, 0xD9, ++ 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9, 0x8A, ++ 0xD9, 0x87, 0x20, 0xD9, 0x88, 0xD8, 0xB3, 0xD9, ++ 0x84, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x84, 0x20, ++ 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x84, ++ 0xD9, 0x87, 0xD8, 0xB1, 0xDB, 0x8C, 0xD8, 0xA7, ++ 0xD9, 0x84, 0x2E, 0x2E, 0xE2, 0x80, 0x94, 0xE2, ++ 0x80, 0x93, 0x5F, 0x5F, 0x28, 0x29, 0x7B, 0x7D, ++ 0xE3, 0x80, 0x94, 0xE3, 0x80, 0x95, 0xE3, 0x80, ++ 0x90, 0xE3, 0x80, 0x91, 0xE3, 0x80, 0x8A, 0xE3, ++ 0x80, 0x8B, 0xE3, 0x80, 0x88, 0xE3, 0x80, 0x89, ++ 0xE3, 0x80, 0x8C, 0xE3, 0x80, 0x8D, 0xE3, 0x80, ++ 0x8E, 0xE3, 0x80, 0x8F, 0x20, 0xCC, 0x85, 0x20, ++ 0xCC, 0x85, 0x20, 0xCC, 0x85, 0x20, 0xCC, 0x85, ++ 0x5F, 0x5F, 0x5F, 0x2C, 0xE3, 0x80, 0x81, 0x2E, ++ 0x3B, 0x3A, 0x3F, 0x21, 0xE2, 0x80, 0x94, 0x28, ++ 0x29, 0x7B, 0x7D, 0xE3, 0x80, 0x94, 0xE3, 0x80, ++ 0x95, 0x23, 0x26, 0x2A, 0x2B, 0x2D, 0x3C, 0x3E, ++ 0x3D, 0x5C, 0x24, 0x25, 0x40, 0x20, 0xD9, 0x8B, ++ 0xD9, 0x80, 0xD9, 0x8B, 0x20, 0xD9, 0x8C, 0x20, ++ 0xD9, 0x8D, 0x20, 0xD9, 0x8E, 0xD9, 0x80, 0xD9, ++ 0x8E, 0x20, 0xD9, 0x8F, 0xD9, 0x80, 0xD9, 0x8F, ++ 0x20, 0xD9, 0x90, 0xD9, 0x80, 0xD9, 0x90, 0x20, ++ 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x91, 0x20, 0xD9, ++ 0x92, 0xD9, 0x80, 0xD9, 0x92, 0xD8, 0xA1, 0xD8, ++ 0xA7, 0xD9, 0x93, 0xD8, 0xA7, 0xD9, 0x93, 0xD8, ++ 0xA7, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, ++ 0x88, 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x94, 0xD8, ++ 0xA7, 0xD9, 0x95, 0xD8, 0xA7, 0xD9, 0x95, 0xD9, ++ 0x8A, 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, ++ 0x8A, 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, ++ 0xA7, 0xD8, 0xA7, 0xD8, 0xA8, 0xD8, 0xA8, 0xD8, ++ 0xA8, 0xD8, 0xA8, 0xD8, 0xA9, 0xD8, 0xA9, 0xD8, ++ 0xAA, 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, ++ 0xAB, 0xD8, 0xAB, 0xD8, 
0xAB, 0xD8, 0xAB, 0xD8, ++ 0xAC, 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, ++ 0xAD, 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, ++ 0xAE, 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, ++ 0xAF, 0xD8, 0xAF, 0xD8, 0xB0, 0xD8, 0xB0, 0xD8, ++ 0xB1, 0xD8, 0xB1, 0xD8, 0xB2, 0xD8, 0xB2, 0xD8, ++ 0xB3, 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, ++ 0xB4, 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, ++ 0xB5, 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, ++ 0xB6, 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, ++ 0xB7, 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, ++ 0xB8, 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, ++ 0xB9, 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, ++ 0xBA, 0xD8, 0xBA, 0xD8, 0xBA, 0xD8, 0xBA, 0xD9, ++ 0x81, 0xD9, 0x81, 0xD9, 0x81, 0xD9, 0x81, 0xD9, ++ 0x82, 0xD9, 0x82, 0xD9, 0x82, 0xD9, 0x82, 0xD9, ++ 0x83, 0xD9, 0x83, 0xD9, 0x83, 0xD9, 0x83, 0xD9, ++ 0x84, 0xD9, 0x84, 0xD9, 0x84, 0xD9, 0x84, 0xD9, ++ 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, ++ 0x86, 0xD9, 0x86, 0xD9, 0x86, 0xD9, 0x86, 0xD9, ++ 0x87, 0xD9, 0x87, 0xD9, 0x87, 0xD9, 0x87, 0xD9, ++ 0x88, 0xD9, 0x88, 0xD9, 0x89, 0xD9, 0x89, 0xD9, ++ 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, ++ 0x84, 0xD8, 0xA7, 0xD9, 0x93, 0xD9, 0x84, 0xD8, ++ 0xA7, 0xD9, 0x93, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, ++ 0x94, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, ++ 0x84, 0xD8, 0xA7, 0xD9, 0x95, 0xD9, 0x84, 0xD8, ++ 0xA7, 0xD9, 0x95, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, ++ 0x84, 0xD8, 0xA7, 0x21, 0x22, 0x23, 0x24, 0x25, ++ 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, ++ 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, ++ 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, ++ 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, ++ 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, ++ 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, ++ 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, ++ 0x5E, 0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, ++ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, ++ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, ++ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, ++ 0x7E, 0xE2, 0xA6, 0x85, 0xE2, 0xA6, 0x86, 0xE3, ++ 0x80, 0x82, 0xE3, 0x80, 0x8C, 0xE3, 0x80, 0x8D, ++ 0xE3, 0x80, 0x81, 0xE3, 0x83, 0xBB, 0xE3, 0x83, ++ 0xB2, 0xE3, 0x82, 0xA1, 0xE3, 0x82, 0xA3, 0xE3, ++ 0x82, 0xA5, 0xE3, 0x82, 0xA7, 0xE3, 0x82, 0xA9, ++ 0xE3, 0x83, 0xA3, 0xE3, 0x83, 0xA5, 0xE3, 0x83, ++ 0xA7, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0xBC, 0xE3, ++ 0x82, 0xA2, 0xE3, 0x82, 0xA4, 0xE3, 0x82, 0xA6, ++ 0xE3, 0x82, 0xA8, 0xE3, 0x82, 0xAA, 0xE3, 0x82, ++ 0xAB, 0xE3, 0x82, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, ++ 0x82, 0xB1, 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0xB5, ++ 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0xB9, 0xE3, 0x82, ++ 0xBB, 0xE3, 0x82, 0xBD, 0xE3, 0x82, 0xBF, 0xE3, ++ 0x83, 0x81, 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x86, ++ 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x8A, 0xE3, 0x83, ++ 0x8B, 0xE3, 0x83, 0x8C, 0xE3, 0x83, 0x8D, 0xE3, ++ 0x83, 0x8E, 0xE3, 0x83, 0x8F, 0xE3, 0x83, 0x92, ++ 0xE3, 0x83, 0x95, 0xE3, 0x83, 0x98, 0xE3, 0x83, ++ 0x9B, 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0x9F, 0xE3, ++ 0x83, 0xA0, 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xA2, ++ 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xA6, 0xE3, 0x83, ++ 0xA8, 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xAA, 0xE3, ++ 0x83, 0xAB, 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xAD, ++ 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0xB3, 0xE3, 0x82, ++ 0x99, 0xE3, 0x82, 0x9A, 0xE1, 0x85, 0xA0, 0xE1, ++ 0x84, 0x80, 0xE1, 0x84, 0x81, 0xE1, 0x86, 0xAA, ++ 0xE1, 0x84, 0x82, 0xE1, 0x86, 0xAC, 0xE1, 0x86, ++ 0xAD, 0xE1, 0x84, 0x83, 0xE1, 0x84, 0x84, 0xE1, ++ 0x84, 0x85, 0xE1, 0x86, 0xB0, 0xE1, 0x86, 0xB1, ++ 0xE1, 0x86, 0xB2, 0xE1, 0x86, 0xB3, 0xE1, 0x86, ++ 0xB4, 
0xE1, 0x86, 0xB5, 0xE1, 0x84, 0x9A, 0xE1, ++ 0x84, 0x86, 0xE1, 0x84, 0x87, 0xE1, 0x84, 0x88, ++ 0xE1, 0x84, 0xA1, 0xE1, 0x84, 0x89, 0xE1, 0x84, ++ 0x8A, 0xE1, 0x84, 0x8B, 0xE1, 0x84, 0x8C, 0xE1, ++ 0x84, 0x8D, 0xE1, 0x84, 0x8E, 0xE1, 0x84, 0x8F, ++ 0xE1, 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, 0x84, ++ 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x85, 0xA2, 0xE1, ++ 0x85, 0xA3, 0xE1, 0x85, 0xA4, 0xE1, 0x85, 0xA5, ++ 0xE1, 0x85, 0xA6, 0xE1, 0x85, 0xA7, 0xE1, 0x85, ++ 0xA8, 0xE1, 0x85, 0xA9, 0xE1, 0x85, 0xAA, 0xE1, ++ 0x85, 0xAB, 0xE1, 0x85, 0xAC, 0xE1, 0x85, 0xAD, ++ 0xE1, 0x85, 0xAE, 0xE1, 0x85, 0xAF, 0xE1, 0x85, ++ 0xB0, 0xE1, 0x85, 0xB1, 0xE1, 0x85, 0xB2, 0xE1, ++ 0x85, 0xB3, 0xE1, 0x85, 0xB4, 0xE1, 0x85, 0xB5, ++ 0xC2, 0xA2, 0xC2, 0xA3, 0xC2, 0xAC, 0x20, 0xCC, ++ 0x84, 0xC2, 0xA6, 0xC2, 0xA5, 0xE2, 0x82, 0xA9, ++ 0xE2, 0x94, 0x82, 0xE2, 0x86, 0x90, 0xE2, 0x86, ++ 0x91, 0xE2, 0x86, 0x92, 0xE2, 0x86, 0x93, 0xE2, ++ 0x96, 0xA0, 0xE2, 0x97, 0x8B, 0xF6, 0xF0, 0x9D, ++ 0x85, 0x97, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, 0xF0, ++ 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, ++ 0xF0, 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, ++ 0xF0, 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x85, ++ 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, ++ 0xAF, 0xF6, 0xF0, 0x9D, 0x85, 0x98, 0xF0, 0x9D, ++ 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xB0, 0xF6, 0xF0, ++ 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, ++ 0x9D, 0x85, 0xB1, 0xF6, 0xF0, 0x9D, 0x85, 0x98, ++ 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xB2, ++ 0xF6, 0xF0, 0x9D, 0x86, 0xB9, 0xF0, 0x9D, 0x85, ++ 0xA5, 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, ++ 0x85, 0xA5, 0xF6, 0xF0, 0x9D, 0x86, 0xB9, 0xF0, ++ 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAE, 0xF6, ++ 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85, 0xA5, ++ 0xF0, 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x86, ++ 0xB9, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, ++ 0xAF, 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, ++ 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAF, 0x41, 0x42, ++ 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, ++ 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, ++ 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, ++ 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, ++ 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, ++ 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, ++ 0x79, 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, ++ 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, ++ 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, ++ 0x57, 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, ++ 0x65, 0x66, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, ++ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, ++ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, ++ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, ++ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, ++ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, ++ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ++ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, ++ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, ++ 0x7A, 0x41, 0x43, 0x44, 0x47, 0x4A, 0x4B, 0x4E, ++ 0x4F, 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57, ++ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x66, ++ 0x68, 0x69, 0x6A, 0x6B, 0x6D, 0x6E, 0x70, 0x71, ++ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, ++ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, ++ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, ++ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, ++ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, ++ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, ++ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, ++ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, ++ 0x45, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D, 
0x4E, ++ 0x4F, 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57, ++ 0x58, 0x59, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, ++ 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, ++ 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ++ 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, 0x45, ++ 0x46, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4F, ++ 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x61, ++ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ++ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, ++ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, ++ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, ++ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, ++ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, ++ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, ++ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, ++ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, ++ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, ++ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, ++ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, ++ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, ++ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ++ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, ++ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, ++ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, ++ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, ++ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, ++ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, ++ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, ++ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, ++ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, ++ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, ++ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, ++ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, ++ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ++ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, ++ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, ++ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, ++ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, ++ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, ++ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, ++ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, ++ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, ++ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, ++ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, ++ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, ++ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, ++ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ++ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, ++ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, ++ 0x7A, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, ++ 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, ++ 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, ++ 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, ++ 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, ++ 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, ++ 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, ++ 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, ++ 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, ++ 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, ++ 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, ++ 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, ++ 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, ++ 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, ++ 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, ++ 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, ++ 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, ++ 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, ++ 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, ++ 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, ++ 0xA5, 0xCE, 0xA6, 0xCE, 
0xA7, 0xCE, 0xA8, 0xCE, ++ 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, ++ 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, ++ 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, ++ 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, ++ 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, ++ 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, ++ 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, ++ 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, ++ 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, ++ 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, ++ 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, ++ 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, ++ 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, ++ 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, ++ 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, ++ 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, ++ 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, ++ 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, ++ 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, ++ 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, ++ 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, ++ 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, ++ 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, ++ 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, ++ 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, ++ 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, ++ 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, ++ 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, ++ 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, ++ 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, ++ 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, ++ 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, ++ 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, ++ 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, ++ 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, ++ 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, ++ 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, ++ 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, ++ 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, ++ 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, ++ 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, ++ 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, ++ 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, ++ 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, ++ 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, ++ 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, ++ 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, ++ 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, ++ 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, ++ 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, ++ 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, ++ 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, ++ 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, 0x80, 0x30, ++ 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, ++ 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, ++ 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, ++ 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, ++ 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, ++ 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, ++ 0x39, 0xF6, 0xE4, 0xB8, 0xBD, 0xF6, 0xE4, 0xB8, ++ 0xB8, 0xF6, 0xE4, 0xB9, 0x81, 0xF6, 0xF0, 0xA0, ++ 0x84, 0xA2, 0xF6, 0xE4, 0xBD, 0xA0, 0xF6, 0xE4, ++ 0xBE, 0xAE, 0xF6, 0xE4, 0xBE, 0xBB, 0xF6, 0xE5, ++ 0x80, 0x82, 0xF6, 0xE5, 0x81, 0xBA, 0xF6, 0xE5, ++ 0x82, 0x99, 0xF6, 0xE5, 0x83, 0xA7, 0xF6, 0xE5, ++ 0x83, 0x8F, 0xF6, 0xE3, 0x92, 0x9E, 0xF6, 0xF0, ++ 0xA0, 0x98, 0xBA, 0xF6, 0xE5, 0x85, 0x8D, 0xF6, ++ 0xE5, 0x85, 0x94, 0xF6, 0xE5, 0x85, 0xA4, 0xF6, ++ 0xE5, 0x85, 0xB7, 0xF6, 0xF0, 0xA0, 0x94, 0x9C, ++ 0xF6, 
0xE3, 0x92, 0xB9, 0xF6, 0xE5, 0x85, 0xA7, ++ 0xF6, 0xE5, 0x86, 0x8D, 0xF6, 0xF0, 0xA0, 0x95, ++ 0x8B, 0xF6, 0xE5, 0x86, 0x97, 0xF6, 0xE5, 0x86, ++ 0xA4, 0xF6, 0xE4, 0xBB, 0x8C, 0xF6, 0xE5, 0x86, ++ 0xAC, 0xF6, 0xE5, 0x86, 0xB5, 0xF6, 0xF0, 0xA9, ++ 0x87, 0x9F, 0xF6, 0xE5, 0x87, 0xB5, 0xF6, 0xE5, ++ 0x88, 0x83, 0xF6, 0xE3, 0x93, 0x9F, 0xF6, 0xE5, ++ 0x88, 0xBB, 0xF6, 0xE5, 0x89, 0x86, 0xF6, 0xE5, ++ 0x89, 0xB2, 0xF6, 0xE5, 0x89, 0xB7, 0xF6, 0xE3, ++ 0x94, 0x95, 0xF6, 0xE5, 0x8B, 0x87, 0xF6, 0xE5, ++ 0x8B, 0x89, 0xF6, 0xE5, 0x8B, 0xA4, 0xF6, 0xE5, ++ 0x8B, 0xBA, 0xF6, 0xE5, 0x8C, 0x85, 0xF6, 0xE5, ++ 0x8C, 0x86, 0xF6, 0xE5, 0x8C, 0x97, 0xF6, 0xE5, ++ 0x8D, 0x89, 0xF6, 0xE5, 0x8D, 0x91, 0xF6, 0xE5, ++ 0x8D, 0x9A, 0xF6, 0xE5, 0x8D, 0xB3, 0xF6, 0xE5, ++ 0x8D, 0xBD, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xE5, ++ 0x8D, 0xBF, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xF0, ++ 0xA0, 0xA8, 0xAC, 0xF6, 0xE7, 0x81, 0xB0, 0xF6, ++ 0xE5, 0x8F, 0x8A, 0xF6, 0xE5, 0x8F, 0x9F, 0xF6, ++ 0xF0, 0xA0, 0xAD, 0xA3, 0xF6, 0xE5, 0x8F, 0xAB, ++ 0xF6, 0xE5, 0x8F, 0xB1, 0xF6, 0xE5, 0x90, 0x86, ++ 0xF6, 0xE5, 0x92, 0x9E, 0xF6, 0xE5, 0x90, 0xB8, ++ 0xF6, 0xE5, 0x91, 0x88, 0xF6, 0xE5, 0x91, 0xA8, ++ 0xF6, 0xE5, 0x92, 0xA2, 0xF6, 0xE5, 0x93, 0xB6, ++ 0xF6, 0xE5, 0x94, 0x90, 0xF6, 0xE5, 0x95, 0x93, ++ 0xF6, 0xE5, 0x95, 0xA3, 0xF6, 0xE5, 0x96, 0x84, ++ 0xF6, 0xE5, 0x96, 0x84, 0xF6, 0xE5, 0x96, 0x99, ++ 0xF6, 0xE5, 0x96, 0xAB, 0xF6, 0xE5, 0x96, 0xB3, ++ 0xF6, 0xE5, 0x97, 0x82, 0xF6, 0xE5, 0x9C, 0x96, ++ 0xF6, 0xE5, 0x98, 0x86, 0xF6, 0xE5, 0x9C, 0x97, ++ 0xF6, 0xE5, 0x99, 0x91, 0xF6, 0xE5, 0x99, 0xB4, ++ 0xF6, 0xE5, 0x88, 0x87, 0xF6, 0xE5, 0xA3, 0xAE, ++ 0xF6, 0xE5, 0x9F, 0x8E, 0xF6, 0xE5, 0x9F, 0xB4, ++ 0xF6, 0xE5, 0xA0, 0x8D, 0xF6, 0xE5, 0x9E, 0x8B, ++ 0xF6, 0xE5, 0xA0, 0xB2, 0xF6, 0xE5, 0xA0, 0xB1, ++ 0xF6, 0xE5, 0xA2, 0xAC, 0xF6, 0xF0, 0xA1, 0x93, ++ 0xA4, 0xF6, 0xE5, 0xA3, 0xB2, 0xF6, 0xE5, 0xA3, ++ 0xB7, 0xF6, 0xE5, 0xA4, 0x86, 0xF6, 0xE5, 0xA4, ++ 0x9A, 0xF6, 0xE5, 0xA4, 0xA2, 0xF6, 0xE5, 0xA5, ++ 0xA2, 0xF6, 0xF0, 0xA1, 0x9A, 0xA8, 0xF6, 0xF0, ++ 0xA1, 0x9B, 0xAA, 0xF6, 0xE5, 0xA7, 0xAC, 0xF6, ++ 0xE5, 0xA8, 0x9B, 0xF6, 0xE5, 0xA8, 0xA7, 0xF6, ++ 0xE5, 0xA7, 0x98, 0xF6, 0xE5, 0xA9, 0xA6, 0xF6, ++ 0xE3, 0x9B, 0xAE, 0xF6, 0xE3, 0x9B, 0xBC, 0xF6, ++ 0xE5, 0xAC, 0x88, 0xF6, 0xE5, 0xAC, 0xBE, 0xF6, ++ 0xE5, 0xAC, 0xBE, 0xF6, 0xF0, 0xA1, 0xA7, 0x88, ++ 0xF6, 0xE5, 0xAF, 0x83, 0xF6, 0xE5, 0xAF, 0x98, ++ 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, 0xAF, 0xB3, ++ 0xF6, 0xF0, 0xA1, 0xAC, 0x98, 0xF6, 0xE5, 0xAF, ++ 0xBF, 0xF6, 0xE5, 0xB0, 0x86, 0xF6, 0xE5, 0xBD, ++ 0x93, 0xF6, 0xE5, 0xB0, 0xA2, 0xF6, 0xE3, 0x9E, ++ 0x81, 0xF6, 0xE5, 0xB1, 0xA0, 0xF6, 0xE5, 0xB1, ++ 0xAE, 0xF6, 0xE5, 0xB3, 0x80, 0xF6, 0xE5, 0xB2, ++ 0x8D, 0xF6, 0xF0, 0xA1, 0xB7, 0xA4, 0xF6, 0xE5, ++ 0xB5, 0x83, 0xF6, 0xF0, 0xA1, 0xB7, 0xA6, 0xF6, ++ 0xE5, 0xB5, 0xAE, 0xF6, 0xE5, 0xB5, 0xAB, 0xF6, ++ 0xE5, 0xB5, 0xBC, 0xF6, 0xE5, 0xB7, 0xA1, 0xF6, ++ 0xE5, 0xB7, 0xA2, 0xF6, 0xE3, 0xA0, 0xAF, 0xF6, ++ 0xE5, 0xB7, 0xBD, 0xF6, 0xE5, 0xB8, 0xA8, 0xF6, ++ 0xE5, 0xB8, 0xBD, 0xF6, 0xE5, 0xB9, 0xA9, 0xF6, ++ 0xE3, 0xA1, 0xA2, 0xF6, 0xF0, 0xA2, 0x86, 0x83, ++ 0xF6, 0xE3, 0xA1, 0xBC, 0xF6, 0xE5, 0xBA, 0xB0, ++ 0xF6, 0xE5, 0xBA, 0xB3, 0xF6, 0xE5, 0xBA, 0xB6, ++ 0xF6, 0xE5, 0xBB, 0x8A, 0xF6, 0xF0, 0xAA, 0x8E, ++ 0x92, 0xF6, 0xE5, 0xBB, 0xBE, 0xF6, 0xF0, 0xA2, ++ 0x8C, 0xB1, 0xF6, 0xF0, 0xA2, 0x8C, 0xB1, 0xF6, ++ 0xE8, 0x88, 0x81, 0xF6, 0xE5, 0xBC, 0xA2, 0xF6, ++ 0xE5, 0xBC, 0xA2, 0xF6, 0xE3, 0xA3, 0x87, 0xF6, ++ 0xF0, 0xA3, 0x8A, 0xB8, 0xF6, 0xF0, 0xA6, 0x87, ++ 0x9A, 0xF6, 0xE5, 0xBD, 0xA2, 0xF6, 0xE5, 
0xBD, ++ 0xAB, 0xF6, 0xE3, 0xA3, 0xA3, 0xF6, 0xE5, 0xBE, ++ 0x9A, 0xF6, 0xE5, 0xBF, 0x8D, 0xF6, 0xE5, 0xBF, ++ 0x97, 0xF6, 0xE5, 0xBF, 0xB9, 0xF6, 0xE6, 0x82, ++ 0x81, 0xF6, 0xE3, 0xA4, 0xBA, 0xF6, 0xE3, 0xA4, ++ 0x9C, 0xF6, 0xE6, 0x82, 0x94, 0xF6, 0xF0, 0xA2, ++ 0x9B, 0x94, 0xF6, 0xE6, 0x83, 0x87, 0xF6, 0xE6, ++ 0x85, 0x88, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6, ++ 0x85, 0x8E, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6, ++ 0x85, 0xBA, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, 0xE6, ++ 0x86, 0xB2, 0xF6, 0xE6, 0x86, 0xA4, 0xF6, 0xE6, ++ 0x86, 0xAF, 0xF6, 0xE6, 0x87, 0x9E, 0xF6, 0xE6, ++ 0x87, 0xB2, 0xF6, 0xE6, 0x87, 0xB6, 0xF6, 0xE6, ++ 0x88, 0x90, 0xF6, 0xE6, 0x88, 0x9B, 0xF6, 0xE6, ++ 0x89, 0x9D, 0xF6, 0xE6, 0x8A, 0xB1, 0xF6, 0xE6, ++ 0x8B, 0x94, 0xF6, 0xE6, 0x8D, 0x90, 0xF6, 0xF0, ++ 0xA2, 0xAC, 0x8C, 0xF6, 0xE6, 0x8C, 0xBD, 0xF6, ++ 0xE6, 0x8B, 0xBC, 0xF6, 0xE6, 0x8D, 0xA8, 0xF6, ++ 0xE6, 0x8E, 0x83, 0xF6, 0xE6, 0x8F, 0xA4, 0xF6, ++ 0xF0, 0xA2, 0xAF, 0xB1, 0xF6, 0xE6, 0x90, 0xA2, ++ 0xF6, 0xE6, 0x8F, 0x85, 0xF6, 0xE6, 0x8E, 0xA9, ++ 0xF6, 0xE3, 0xA8, 0xAE, 0xF6, 0xE6, 0x91, 0xA9, ++ 0xF6, 0xE6, 0x91, 0xBE, 0xF6, 0xE6, 0x92, 0x9D, ++ 0xF6, 0xE6, 0x91, 0xB7, 0xF6, 0xE3, 0xA9, 0xAC, ++ 0xF6, 0xE6, 0x95, 0x8F, 0xF6, 0xE6, 0x95, 0xAC, ++ 0xF6, 0xF0, 0xA3, 0x80, 0x8A, 0xF6, 0xE6, 0x97, ++ 0xA3, 0xF6, 0xE6, 0x9B, 0xB8, 0xF6, 0xE6, 0x99, ++ 0x89, 0xF6, 0xE3, 0xAC, 0x99, 0xF6, 0xE6, 0x9A, ++ 0x91, 0xF6, 0xE3, 0xAC, 0x88, 0xF6, 0xE3, 0xAB, ++ 0xA4, 0xF6, 0xE5, 0x86, 0x92, 0xF6, 0xE5, 0x86, ++ 0x95, 0xF6, 0xE6, 0x9C, 0x80, 0xF6, 0xE6, 0x9A, ++ 0x9C, 0xF6, 0xE8, 0x82, 0xAD, 0xF6, 0xE4, 0x8F, ++ 0x99, 0xF6, 0xE6, 0x9C, 0x97, 0xF6, 0xE6, 0x9C, ++ 0x9B, 0xF6, 0xE6, 0x9C, 0xA1, 0xF6, 0xE6, 0x9D, ++ 0x9E, 0xF6, 0xE6, 0x9D, 0x93, 0xF6, 0xF0, 0xA3, ++ 0x8F, 0x83, 0xF6, 0xE3, 0xAD, 0x89, 0xF6, 0xE6, ++ 0x9F, 0xBA, 0xF6, 0xE6, 0x9E, 0x85, 0xF6, 0xE6, ++ 0xA1, 0x92, 0xF6, 0xE6, 0xA2, 0x85, 0xF6, 0xF0, ++ 0xA3, 0x91, 0xAD, 0xF6, 0xE6, 0xA2, 0x8E, 0xF6, ++ 0xE6, 0xA0, 0x9F, 0xF6, 0xE6, 0xA4, 0x94, 0xF6, ++ 0xE3, 0xAE, 0x9D, 0xF6, 0xE6, 0xA5, 0x82, 0xF6, ++ 0xE6, 0xA6, 0xA3, 0xF6, 0xE6, 0xA7, 0xAA, 0xF6, ++ 0xE6, 0xAA, 0xA8, 0xF6, 0xF0, 0xA3, 0x9A, 0xA3, ++ 0xF6, 0xE6, 0xAB, 0x9B, 0xF6, 0xE3, 0xB0, 0x98, ++ 0xF6, 0xE6, 0xAC, 0xA1, 0xF6, 0xF0, 0xA3, 0xA2, ++ 0xA7, 0xF6, 0xE6, 0xAD, 0x94, 0xF6, 0xE3, 0xB1, ++ 0x8E, 0xF6, 0xE6, 0xAD, 0xB2, 0xF6, 0xE6, 0xAE, ++ 0x9F, 0xF6, 0xE6, 0xAE, 0xBA, 0xF6, 0xE6, 0xAE, ++ 0xBB, 0xF6, 0xF0, 0xA3, 0xAA, 0x8D, 0xF6, 0xF0, ++ 0xA1, 0xB4, 0x8B, 0xF6, 0xF0, 0xA3, 0xAB, 0xBA, ++ 0xF6, 0xE6, 0xB1, 0x8E, 0xF6, 0xF0, 0xA3, 0xB2, ++ 0xBC, 0xF6, 0xE6, 0xB2, 0xBF, 0xF6, 0xE6, 0xB3, ++ 0x8D, 0xF6, 0xE6, 0xB1, 0xA7, 0xF6, 0xE6, 0xB4, ++ 0x96, 0xF6, 0xE6, 0xB4, 0xBE, 0xF6, 0xE6, 0xB5, ++ 0xB7, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, 0xB5, ++ 0xA9, 0xF6, 0xE6, 0xB5, 0xB8, 0xF6, 0xE6, 0xB6, ++ 0x85, 0xF6, 0xF0, 0xA3, 0xB4, 0x9E, 0xF6, 0xE6, ++ 0xB4, 0xB4, 0xF6, 0xE6, 0xB8, 0xAF, 0xF6, 0xE6, ++ 0xB9, 0xAE, 0xF6, 0xE3, 0xB4, 0xB3, 0xF6, 0xE6, ++ 0xBB, 0x8B, 0xF6, 0xE6, 0xBB, 0x87, 0xF6, 0xF0, ++ 0xA3, 0xBB, 0x91, 0xF6, 0xE6, 0xB7, 0xB9, 0xF6, ++ 0xE6, 0xBD, 0xAE, 0xF6, 0xF0, 0xA3, 0xBD, 0x9E, ++ 0xF6, 0xF0, 0xA3, 0xBE, 0x8E, 0xF6, 0xE6, 0xBF, ++ 0x86, 0xF6, 0xE7, 0x80, 0xB9, 0xF6, 0xE7, 0x80, ++ 0x9E, 0xF6, 0xE7, 0x80, 0x9B, 0xF6, 0xE3, 0xB6, ++ 0x96, 0xF6, 0xE7, 0x81, 0x8A, 0xF6, 0xE7, 0x81, ++ 0xBD, 0xF6, 0xE7, 0x81, 0xB7, 0xF6, 0xE7, 0x82, ++ 0xAD, 0xF6, 0xF0, 0xA0, 0x94, 0xA5, 0xF6, 0xE7, ++ 0x85, 0x85, 0xF6, 0xF0, 0xA4, 0x89, 0xA3, 0xF6, ++ 0xE7, 0x86, 0x9C, 0xF6, 0xF0, 0xA4, 0x8E, 0xAB, ++ 0xF6, 0xE7, 0x88, 0xA8, 
0xF6, 0xE7, 0x88, 0xB5, ++ 0xF6, 0xE7, 0x89, 0x90, 0xF6, 0xF0, 0xA4, 0x98, ++ 0x88, 0xF6, 0xE7, 0x8A, 0x80, 0xF6, 0xE7, 0x8A, ++ 0x95, 0xF6, 0xF0, 0xA4, 0x9C, 0xB5, 0xF6, 0xF0, ++ 0xA4, 0xA0, 0x94, 0xF6, 0xE7, 0x8D, 0xBA, 0xF6, ++ 0xE7, 0x8E, 0x8B, 0xF6, 0xE3, 0xBA, 0xAC, 0xF6, ++ 0xE7, 0x8E, 0xA5, 0xF6, 0xE3, 0xBA, 0xB8, 0xF6, ++ 0xE3, 0xBA, 0xB8, 0xF6, 0xE7, 0x91, 0x87, 0xF6, ++ 0xE7, 0x91, 0x9C, 0xF6, 0xE7, 0x91, 0xB1, 0xF6, ++ 0xE7, 0x92, 0x85, 0xF6, 0xE7, 0x93, 0x8A, 0xF6, ++ 0xE3, 0xBC, 0x9B, 0xF6, 0xE7, 0x94, 0xA4, 0xF6, ++ 0xF0, 0xA4, 0xB0, 0xB6, 0xF6, 0xE7, 0x94, 0xBE, ++ 0xF6, 0xF0, 0xA4, 0xB2, 0x92, 0xF6, 0xE7, 0x95, ++ 0xB0, 0xF6, 0xF0, 0xA2, 0x86, 0x9F, 0xF6, 0xE7, ++ 0x98, 0x90, 0xF6, 0xF0, 0xA4, 0xBE, 0xA1, 0xF6, ++ 0xF0, 0xA4, 0xBE, 0xB8, 0xF6, 0xF0, 0xA5, 0x81, ++ 0x84, 0xF6, 0xE3, 0xBF, 0xBC, 0xF6, 0xE4, 0x80, ++ 0x88, 0xF6, 0xE7, 0x9B, 0xB4, 0xF6, 0xF0, 0xA5, ++ 0x83, 0xB3, 0xF6, 0xF0, 0xA5, 0x83, 0xB2, 0xF6, ++ 0xF0, 0xA5, 0x84, 0x99, 0xF6, 0xF0, 0xA5, 0x84, ++ 0xB3, 0xF6, 0xE7, 0x9C, 0x9E, 0xF6, 0xE7, 0x9C, ++ 0x9F, 0xF6, 0xE7, 0x9C, 0x9F, 0xF6, 0xE7, 0x9D, ++ 0x8A, 0xF6, 0xE4, 0x80, 0xB9, 0xF6, 0xE7, 0x9E, ++ 0x8B, 0xF6, 0xE4, 0x81, 0x86, 0xF6, 0xE4, 0x82, ++ 0x96, 0xF6, 0xF0, 0xA5, 0x90, 0x9D, 0xF6, 0xE7, ++ 0xA1, 0x8E, 0xF6, 0xE7, 0xA2, 0x8C, 0xF6, 0xE7, ++ 0xA3, 0x8C, 0xF6, 0xE4, 0x83, 0xA3, 0xF6, 0xF0, ++ 0xA5, 0x98, 0xA6, 0xF6, 0xE7, 0xA5, 0x96, 0xF6, ++ 0xF0, 0xA5, 0x9A, 0x9A, 0xF6, 0xF0, 0xA5, 0x9B, ++ 0x85, 0xF6, 0xE7, 0xA6, 0x8F, 0xF6, 0xE7, 0xA7, ++ 0xAB, 0xF6, 0xE4, 0x84, 0xAF, 0xF6, 0xE7, 0xA9, ++ 0x80, 0xF6, 0xE7, 0xA9, 0x8A, 0xF6, 0xE7, 0xA9, ++ 0x8F, 0xF6, 0xF0, 0xA5, 0xA5, 0xBC, 0xF6, 0xF0, ++ 0xA5, 0xAA, 0xA7, 0xF6, 0xF0, 0xA5, 0xAA, 0xA7, ++ 0xF6, 0xE7, 0xAB, 0xAE, 0xF6, 0xE4, 0x88, 0x82, ++ 0xF6, 0xF0, 0xA5, 0xAE, 0xAB, 0xF6, 0xE7, 0xAF, ++ 0x86, 0xF6, 0xE7, 0xAF, 0x89, 0xF6, 0xE4, 0x88, ++ 0xA7, 0xF6, 0xF0, 0xA5, 0xB2, 0x80, 0xF6, 0xE7, ++ 0xB3, 0x92, 0xF6, 0xE4, 0x8A, 0xA0, 0xF6, 0xE7, ++ 0xB3, 0xA8, 0xF6, 0xE7, 0xB3, 0xA3, 0xF6, 0xE7, ++ 0xB4, 0x80, 0xF6, 0xF0, 0xA5, 0xBE, 0x86, 0xF6, ++ 0xE7, 0xB5, 0xA3, 0xF6, 0xE4, 0x8C, 0x81, 0xF6, ++ 0xE7, 0xB7, 0x87, 0xF6, 0xE7, 0xB8, 0x82, 0xF6, ++ 0xE7, 0xB9, 0x85, 0xF6, 0xE4, 0x8C, 0xB4, 0xF6, ++ 0xF0, 0xA6, 0x88, 0xA8, 0xF6, 0xF0, 0xA6, 0x89, ++ 0x87, 0xF6, 0xE4, 0x8D, 0x99, 0xF6, 0xF0, 0xA6, ++ 0x8B, 0x99, 0xF6, 0xE7, 0xBD, 0xBA, 0xF6, 0xF0, ++ 0xA6, 0x8C, 0xBE, 0xF6, 0xE7, 0xBE, 0x95, 0xF6, ++ 0xE7, 0xBF, 0xBA, 0xF6, 0xE8, 0x80, 0x85, 0xF6, ++ 0xF0, 0xA6, 0x93, 0x9A, 0xF6, 0xF0, 0xA6, 0x94, ++ 0xA3, 0xF6, 0xE8, 0x81, 0xA0, 0xF6, 0xF0, 0xA6, ++ 0x96, 0xA8, 0xF6, 0xE8, 0x81, 0xB0, 0xF6, 0xF0, ++ 0xA3, 0x8D, 0x9F, 0xF6, 0xE4, 0x8F, 0x95, 0xF6, ++ 0xE8, 0x82, 0xB2, 0xF6, 0xE8, 0x84, 0x83, 0xF6, ++ 0xE4, 0x90, 0x8B, 0xF6, 0xE8, 0x84, 0xBE, 0xF6, ++ 0xE5, 0xAA, 0xB5, 0xF6, 0xF0, 0xA6, 0x9E, 0xA7, ++ 0xF6, 0xF0, 0xA6, 0x9E, 0xB5, 0xF6, 0xF0, 0xA3, ++ 0x8E, 0x93, 0xF6, 0xF0, 0xA3, 0x8E, 0x9C, 0xF6, ++ 0xE8, 0x88, 0x81, 0xF6, 0xE8, 0x88, 0x84, 0xF6, ++ 0xE8, 0xBE, 0x9E, 0xF6, 0xE4, 0x91, 0xAB, 0xF6, ++ 0xE8, 0x8A, 0x91, 0xF6, 0xE8, 0x8A, 0x8B, 0xF6, ++ 0xE8, 0x8A, 0x9D, 0xF6, 0xE5, 0x8A, 0xB3, 0xF6, ++ 0xE8, 0x8A, 0xB1, 0xF6, 0xE8, 0x8A, 0xB3, 0xF6, ++ 0xE8, 0x8A, 0xBD, 0xF6, 0xE8, 0x8B, 0xA6, 0xF6, ++ 0xF0, 0xA6, 0xAC, 0xBC, 0xF6, 0xE8, 0x8B, 0xA5, ++ 0xF6, 0xE8, 0x8C, 0x9D, 0xF6, 0xE8, 0x8D, 0xA3, ++ 0xF6, 0xE8, 0x8E, 0xAD, 0xF6, 0xE8, 0x8C, 0xA3, ++ 0xF6, 0xE8, 0x8E, 0xBD, 0xF6, 0xE8, 0x8F, 0xA7, ++ 0xF6, 0xE8, 0x91, 0x97, 0xF6, 0xE8, 0x8D, 0x93, ++ 0xF6, 0xE8, 0x8F, 0x8A, 0xF6, 0xE8, 0x8F, 0x8C, ++ 0xF6, 
0xE8, 0x8F, 0x9C, 0xF6, 0xF0, 0xA6, 0xB0, ++ 0xB6, 0xF6, 0xF0, 0xA6, 0xB5, 0xAB, 0xF6, 0xF0, ++ 0xA6, 0xB3, 0x95, 0xF6, 0xE4, 0x94, 0xAB, 0xF6, ++ 0xE8, 0x93, 0xB1, 0xF6, 0xE8, 0x93, 0xB3, 0xF6, ++ 0xE8, 0x94, 0x96, 0xF6, 0xF0, 0xA7, 0x8F, 0x8A, ++ 0xF6, 0xE8, 0x95, 0xA4, 0xF6, 0xF0, 0xA6, 0xBC, ++ 0xAC, 0xF6, 0xE4, 0x95, 0x9D, 0xF6, 0xE4, 0x95, ++ 0xA1, 0xF6, 0xF0, 0xA6, 0xBE, 0xB1, 0xF6, 0xF0, ++ 0xA7, 0x83, 0x92, 0xF6, 0xE4, 0x95, 0xAB, 0xF6, ++ 0xE8, 0x99, 0x90, 0xF6, 0xE8, 0x99, 0x9C, 0xF6, ++ 0xE8, 0x99, 0xA7, 0xF6, 0xE8, 0x99, 0xA9, 0xF6, ++ 0xE8, 0x9A, 0xA9, 0xF6, 0xE8, 0x9A, 0x88, 0xF6, ++ 0xE8, 0x9C, 0x8E, 0xF6, 0xE8, 0x9B, 0xA2, 0xF6, ++ 0xE8, 0x9D, 0xB9, 0xF6, 0xE8, 0x9C, 0xA8, 0xF6, ++ 0xE8, 0x9D, 0xAB, 0xF6, 0xE8, 0x9E, 0x86, 0xF6, ++ 0xE4, 0x97, 0x97, 0xF6, 0xE8, 0x9F, 0xA1, 0xF6, ++ 0xE8, 0xA0, 0x81, 0xF6, 0xE4, 0x97, 0xB9, 0xF6, ++ 0xE8, 0xA1, 0xA0, 0xF6, 0xE8, 0xA1, 0xA3, 0xF6, ++ 0xF0, 0xA7, 0x99, 0xA7, 0xF6, 0xE8, 0xA3, 0x97, ++ 0xF6, 0xE8, 0xA3, 0x9E, 0xF6, 0xE4, 0x98, 0xB5, ++ 0xF6, 0xE8, 0xA3, 0xBA, 0xF6, 0xE3, 0x92, 0xBB, ++ 0xF6, 0xF0, 0xA7, 0xA2, 0xAE, 0xF6, 0xF0, 0xA7, ++ 0xA5, 0xA6, 0xF6, 0xE4, 0x9A, 0xBE, 0xF6, 0xE4, ++ 0x9B, 0x87, 0xF6, 0xE8, 0xAA, 0xA0, 0xF6, 0xE8, ++ 0xAB, 0xAD, 0xF6, 0xE8, 0xAE, 0x8A, 0xF6, 0xE8, ++ 0xB1, 0x95, 0xF6, 0xF0, 0xA7, 0xB2, 0xA8, 0xF6, ++ 0xE8, 0xB2, 0xAB, 0xF6, 0xE8, 0xB3, 0x81, 0xF6, ++ 0xE8, 0xB4, 0x9B, 0xF6, 0xE8, 0xB5, 0xB7, 0xF6, ++ 0xF0, 0xA7, 0xBC, 0xAF, 0xF6, 0xF0, 0xA0, 0xA0, ++ 0x84, 0xF6, 0xE8, 0xB7, 0x8B, 0xF6, 0xE8, 0xB6, ++ 0xBC, 0xF6, 0xE8, 0xB7, 0xB0, 0xF6, 0xF0, 0xA0, ++ 0xA3, 0x9E, 0xF6, 0xE8, 0xBB, 0x94, 0xF6, 0xE8, ++ 0xBC, 0xB8, 0xF6, 0xF0, 0xA8, 0x97, 0x92, 0xF6, ++ 0xF0, 0xA8, 0x97, 0xAD, 0xF6, 0xE9, 0x82, 0x94, ++ 0xF6, 0xE9, 0x83, 0xB1, 0xF6, 0xE9, 0x84, 0x91, ++ 0xF6, 0xF0, 0xA8, 0x9C, 0xAE, 0xF6, 0xE9, 0x84, ++ 0x9B, 0xF6, 0xE9, 0x88, 0xB8, 0xF6, 0xE9, 0x8B, ++ 0x97, 0xF6, 0xE9, 0x8B, 0x98, 0xF6, 0xE9, 0x89, ++ 0xBC, 0xF6, 0xE9, 0x8F, 0xB9, 0xF6, 0xE9, 0x90, ++ 0x95, 0xF6, 0xF0, 0xA8, 0xAF, 0xBA, 0xF6, 0xE9, ++ 0x96, 0x8B, 0xF6, 0xE4, 0xA6, 0x95, 0xF6, 0xE9, ++ 0x96, 0xB7, 0xF6, 0xF0, 0xA8, 0xB5, 0xB7, 0xF6, ++ 0xE4, 0xA7, 0xA6, 0xF6, 0xE9, 0x9B, 0x83, 0xF6, ++ 0xE5, 0xB6, 0xB2, 0xF6, 0xE9, 0x9C, 0xA3, 0xF6, ++ 0xF0, 0xA9, 0x85, 0x85, 0xF6, 0xF0, 0xA9, 0x88, ++ 0x9A, 0xF6, 0xE4, 0xA9, 0xAE, 0xF6, 0xE4, 0xA9, ++ 0xB6, 0xF6, 0xE9, 0x9F, 0xA0, 0xF6, 0xF0, 0xA9, ++ 0x90, 0x8A, 0xF6, 0xE4, 0xAA, 0xB2, 0xF6, 0xF0, ++ 0xA9, 0x92, 0x96, 0xF6, 0xE9, 0xA0, 0x8B, 0xF6, ++ 0xE9, 0xA0, 0x8B, 0xF6, 0xE9, 0xA0, 0xA9, 0xF6, ++ 0xF0, 0xA9, 0x96, 0xB6, 0xF6, 0xE9, 0xA3, 0xA2, ++ 0xF6, 0xE4, 0xAC, 0xB3, 0xF6, 0xE9, 0xA4, 0xA9, ++ 0xF6, 0xE9, 0xA6, 0xA7, 0xF6, 0xE9, 0xA7, 0x82, ++ 0xF6, 0xE9, 0xA7, 0xBE, 0xF6, 0xE4, 0xAF, 0x8E, ++ 0xF6, 0xF0, 0xA9, 0xAC, 0xB0, 0xF6, 0xE9, 0xAC, ++ 0x92, 0xF6, 0xE9, 0xB1, 0x80, 0xF6, 0xE9, 0xB3, ++ 0xBD, 0xF6, 0xE4, 0xB3, 0x8E, 0xF6, 0xE4, 0xB3, ++ 0xAD, 0xF6, 0xE9, 0xB5, 0xA7, 0xF6, 0xF0, 0xAA, ++ 0x83, 0x8E, 0xF6, 0xE4, 0xB3, 0xB8, 0xF6, 0xF0, ++ 0xAA, 0x84, 0x85, 0xF6, 0xF0, 0xAA, 0x88, 0x8E, ++ 0xF6, 0xF0, 0xAA, 0x8A, 0x91, 0xF6, 0xE9, 0xBA, ++ 0xBB, 0xF6, 0xE4, 0xB5, 0x96, 0xF6, 0xE9, 0xBB, ++ 0xB9, 0xF6, 0xE9, 0xBB, 0xBE, 0xF6, 0xE9, 0xBC, ++ 0x85, 0xF6, 0xE9, 0xBC, 0x8F, 0xF6, 0xE9, 0xBC, ++ 0x96, 0xF6, 0xE9, 0xBC, 0xBB, 0xF6, 0xF0, 0xAA, ++ 0x98, 0x80, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 
++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, ++ }, ++ { ++ 0x20, 0x20, 0xCC, 0x88, 0x61, 0x20, 0xCC, 0x84, ++ 0x32, 0x33, 0x20, 0xCC, 0x81, 0xCE, 0xBC, 0x20, ++ 0xCC, 0xA7, 0x31, 0x6F, 0x31, 0xE2, 0x81, 0x84, ++ 0x34, 0x31, 0xE2, 0x81, 0x84, 0x32, 0x33, 0xE2, ++ 0x81, 0x84, 0x34, 0xF6, 0x41, 0xCC, 0x80, 0xF6, ++ 0x41, 0xCC, 0x81, 0xF6, 0x41, 0xCC, 0x82, 0xF6, ++ 0x41, 0xCC, 0x83, 0xF6, 0x41, 0xCC, 0x88, 0xF6, ++ 0x41, 0xCC, 0x8A, 0xF6, 0x43, 0xCC, 0xA7, 0xF6, ++ 0x45, 0xCC, 0x80, 0xF6, 0x45, 0xCC, 0x81, 0xF6, ++ 0x45, 0xCC, 0x82, 0xF6, 0x45, 0xCC, 0x88, 0xF6, ++ 0x49, 0xCC, 0x80, 0xF6, 0x49, 0xCC, 0x81, 0xF6, ++ 0x49, 0xCC, 0x82, 0xF6, 0x49, 0xCC, 0x88, 0xF6, ++ 0x4E, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x80, 0xF6, ++ 0x4F, 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xF6, ++ 0x4F, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x88, 0xF6, ++ 0x55, 0xCC, 0x80, 0xF6, 0x55, 0xCC, 0x81, 0xF6, ++ 0x55, 0xCC, 0x82, 0xF6, 0x55, 0xCC, 0x88, 0xF6, ++ 0x59, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x80, 0xF6, ++ 0x61, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x82, 0xF6, ++ 0x61, 0xCC, 0x83, 0xF6, 
0x61, 0xCC, 0x88, 0xF6, ++ 0x61, 0xCC, 0x8A, 0xF6, 0x63, 0xCC, 0xA7, 0xF6, ++ 0x65, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x81, 0xF6, ++ 0x65, 0xCC, 0x82, 0xF6, 0x65, 0xCC, 0x88, 0xF6, ++ 0x69, 0xCC, 0x80, 0xF6, 0x69, 0xCC, 0x81, 0xF6, ++ 0x69, 0xCC, 0x82, 0xF6, 0x69, 0xCC, 0x88, 0xF6, ++ 0x6E, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x80, 0xF6, ++ 0x6F, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, 0xF6, ++ 0x6F, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x88, 0xF6, ++ 0x75, 0xCC, 0x80, 0xF6, 0x75, 0xCC, 0x81, 0xF6, ++ 0x75, 0xCC, 0x82, 0xF6, 0x75, 0xCC, 0x88, 0xF6, ++ 0x79, 0xCC, 0x81, 0xF6, 0x79, 0xCC, 0x88, 0xF6, ++ 0x41, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x84, 0xF6, ++ 0x41, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0x86, 0xF6, ++ 0x41, 0xCC, 0xA8, 0xF6, 0x61, 0xCC, 0xA8, 0xF6, ++ 0x43, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0x81, 0xF6, ++ 0x43, 0xCC, 0x82, 0xF6, 0x63, 0xCC, 0x82, 0xF6, ++ 0x43, 0xCC, 0x87, 0xF6, 0x63, 0xCC, 0x87, 0xF6, ++ 0x43, 0xCC, 0x8C, 0xF6, 0x63, 0xCC, 0x8C, 0xF6, ++ 0x44, 0xCC, 0x8C, 0xF6, 0x64, 0xCC, 0x8C, 0xF6, ++ 0x45, 0xCC, 0x84, 0xF6, 0x65, 0xCC, 0x84, 0xF6, ++ 0x45, 0xCC, 0x86, 0xF6, 0x65, 0xCC, 0x86, 0xF6, ++ 0x45, 0xCC, 0x87, 0xF6, 0x65, 0xCC, 0x87, 0xF6, ++ 0x45, 0xCC, 0xA8, 0xF6, 0x65, 0xCC, 0xA8, 0xF6, ++ 0x45, 0xCC, 0x8C, 0xF6, 0x65, 0xCC, 0x8C, 0xF6, ++ 0x47, 0xCC, 0x82, 0xF6, 0x67, 0xCC, 0x82, 0xF6, ++ 0x47, 0xCC, 0x86, 0xF6, 0x67, 0xCC, 0x86, 0xF6, ++ 0x47, 0xCC, 0x87, 0xF6, 0x67, 0xCC, 0x87, 0xF6, ++ 0x47, 0xCC, 0xA7, 0xF6, 0x67, 0xCC, 0xA7, 0xF6, ++ 0x48, 0xCC, 0x82, 0xF6, 0x68, 0xCC, 0x82, 0xF6, ++ 0x49, 0xCC, 0x83, 0xF6, 0x69, 0xCC, 0x83, 0xF6, ++ 0x49, 0xCC, 0x84, 0xF6, 0x69, 0xCC, 0x84, 0xF6, ++ 0x49, 0xCC, 0x86, 0xF6, 0x69, 0xCC, 0x86, 0xF6, ++ 0x49, 0xCC, 0xA8, 0xF6, 0x69, 0xCC, 0xA8, 0xF6, ++ 0x49, 0xCC, 0x87, 0x49, 0x4A, 0x69, 0x6A, 0xF6, ++ 0x4A, 0xCC, 0x82, 0xF6, 0x6A, 0xCC, 0x82, 0xF6, ++ 0x4B, 0xCC, 0xA7, 0xF6, 0x6B, 0xCC, 0xA7, 0xF6, ++ 0x4C, 0xCC, 0x81, 0xF6, 0x6C, 0xCC, 0x81, 0xF6, ++ 0x4C, 0xCC, 0xA7, 0xF6, 0x6C, 0xCC, 0xA7, 0xF6, ++ 0x4C, 0xCC, 0x8C, 0xF6, 0x6C, 0xCC, 0x8C, 0x4C, ++ 0xC2, 0xB7, 0x6C, 0xC2, 0xB7, 0xF6, 0x4E, 0xCC, ++ 0x81, 0xF6, 0x6E, 0xCC, 0x81, 0xF6, 0x4E, 0xCC, ++ 0xA7, 0xF6, 0x6E, 0xCC, 0xA7, 0xF6, 0x4E, 0xCC, ++ 0x8C, 0xF6, 0x6E, 0xCC, 0x8C, 0xCA, 0xBC, 0x6E, ++ 0xF6, 0x4F, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, 0x84, ++ 0xF6, 0x4F, 0xCC, 0x86, 0xF6, 0x6F, 0xCC, 0x86, ++ 0xF6, 0x4F, 0xCC, 0x8B, 0xF6, 0x6F, 0xCC, 0x8B, ++ 0xF6, 0x52, 0xCC, 0x81, 0xF6, 0x72, 0xCC, 0x81, ++ 0xF6, 0x52, 0xCC, 0xA7, 0xF6, 0x72, 0xCC, 0xA7, ++ 0xF6, 0x52, 0xCC, 0x8C, 0xF6, 0x72, 0xCC, 0x8C, ++ 0xF6, 0x53, 0xCC, 0x81, 0xF6, 0x73, 0xCC, 0x81, ++ 0xF6, 0x53, 0xCC, 0x82, 0xF6, 0x73, 0xCC, 0x82, ++ 0xF6, 0x53, 0xCC, 0xA7, 0xF6, 0x73, 0xCC, 0xA7, ++ 0xF6, 0x53, 0xCC, 0x8C, 0xF6, 0x73, 0xCC, 0x8C, ++ 0xF6, 0x54, 0xCC, 0xA7, 0xF6, 0x74, 0xCC, 0xA7, ++ 0xF6, 0x54, 0xCC, 0x8C, 0xF6, 0x74, 0xCC, 0x8C, ++ 0xF6, 0x55, 0xCC, 0x83, 0xF6, 0x75, 0xCC, 0x83, ++ 0xF6, 0x55, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x84, ++ 0xF6, 0x55, 0xCC, 0x86, 0xF6, 0x75, 0xCC, 0x86, ++ 0xF6, 0x55, 0xCC, 0x8A, 0xF6, 0x75, 0xCC, 0x8A, ++ 0xF6, 0x55, 0xCC, 0x8B, 0xF6, 0x75, 0xCC, 0x8B, ++ 0xF6, 0x55, 0xCC, 0xA8, 0xF6, 0x75, 0xCC, 0xA8, ++ 0xF6, 0x57, 0xCC, 0x82, 0xF6, 0x77, 0xCC, 0x82, ++ 0xF6, 0x59, 0xCC, 0x82, 0xF6, 0x79, 0xCC, 0x82, ++ 0xF6, 0x59, 0xCC, 0x88, 0xF6, 0x5A, 0xCC, 0x81, ++ 0xF6, 0x7A, 0xCC, 0x81, 0xF6, 0x5A, 0xCC, 0x87, ++ 0xF6, 0x7A, 0xCC, 0x87, 0xF6, 0x5A, 0xCC, 0x8C, ++ 0xF6, 0x7A, 0xCC, 0x8C, 0x73, 0xF6, 0x4F, 0xCC, ++ 0x9B, 0xF6, 0x6F, 0xCC, 0x9B, 0xF6, 0x55, 0xCC, ++ 0x9B, 0xF6, 0x75, 0xCC, 0x9B, 0x44, 0x5A, 0xCC, ++ 0x8C, 
0x44, 0x7A, 0xCC, 0x8C, 0x64, 0x7A, 0xCC, ++ 0x8C, 0x4C, 0x4A, 0x4C, 0x6A, 0x6C, 0x6A, 0x4E, ++ 0x4A, 0x4E, 0x6A, 0x6E, 0x6A, 0xF6, 0x41, 0xCC, ++ 0x8C, 0xF6, 0x61, 0xCC, 0x8C, 0xF6, 0x49, 0xCC, ++ 0x8C, 0xF6, 0x69, 0xCC, 0x8C, 0xF6, 0x4F, 0xCC, ++ 0x8C, 0xF6, 0x6F, 0xCC, 0x8C, 0xF6, 0x55, 0xCC, ++ 0x8C, 0xF6, 0x75, 0xCC, 0x8C, 0xF6, 0x55, 0xCC, ++ 0x88, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x88, 0xCC, ++ 0x84, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x81, 0xF6, ++ 0x75, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x55, 0xCC, ++ 0x88, 0xCC, 0x8C, 0xF6, 0x75, 0xCC, 0x88, 0xCC, ++ 0x8C, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x80, 0xF6, ++ 0x75, 0xCC, 0x88, 0xCC, 0x80, 0xF6, 0x41, 0xCC, ++ 0x88, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x88, 0xCC, ++ 0x84, 0xF6, 0x41, 0xCC, 0x87, 0xCC, 0x84, 0xF6, ++ 0x61, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0xC3, 0x86, ++ 0xCC, 0x84, 0xF6, 0xC3, 0xA6, 0xCC, 0x84, 0xF6, ++ 0x47, 0xCC, 0x8C, 0xF6, 0x67, 0xCC, 0x8C, 0xF6, ++ 0x4B, 0xCC, 0x8C, 0xF6, 0x6B, 0xCC, 0x8C, 0xF6, ++ 0x4F, 0xCC, 0xA8, 0xF6, 0x6F, 0xCC, 0xA8, 0xF6, ++ 0x4F, 0xCC, 0xA8, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, ++ 0xA8, 0xCC, 0x84, 0xF6, 0xC6, 0xB7, 0xCC, 0x8C, ++ 0xF6, 0xCA, 0x92, 0xCC, 0x8C, 0xF6, 0x6A, 0xCC, ++ 0x8C, 0x44, 0x5A, 0x44, 0x7A, 0x64, 0x7A, 0xF6, ++ 0x47, 0xCC, 0x81, 0xF6, 0x67, 0xCC, 0x81, 0xF6, ++ 0x4E, 0xCC, 0x80, 0xF6, 0x6E, 0xCC, 0x80, 0xF6, ++ 0x41, 0xCC, 0x8A, 0xCC, 0x81, 0xF6, 0x61, 0xCC, ++ 0x8A, 0xCC, 0x81, 0xF6, 0xC3, 0x86, 0xCC, 0x81, ++ 0xF6, 0xC3, 0xA6, 0xCC, 0x81, 0xF6, 0xC3, 0x98, ++ 0xCC, 0x81, 0xF6, 0xC3, 0xB8, 0xCC, 0x81, 0xF6, ++ 0x41, 0xCC, 0x8F, 0xF6, 0x61, 0xCC, 0x8F, 0xF6, ++ 0x41, 0xCC, 0x91, 0xF6, 0x61, 0xCC, 0x91, 0xF6, ++ 0x45, 0xCC, 0x8F, 0xF6, 0x65, 0xCC, 0x8F, 0xF6, ++ 0x45, 0xCC, 0x91, 0xF6, 0x65, 0xCC, 0x91, 0xF6, ++ 0x49, 0xCC, 0x8F, 0xF6, 0x69, 0xCC, 0x8F, 0xF6, ++ 0x49, 0xCC, 0x91, 0xF6, 0x69, 0xCC, 0x91, 0xF6, ++ 0x4F, 0xCC, 0x8F, 0xF6, 0x6F, 0xCC, 0x8F, 0xF6, ++ 0x4F, 0xCC, 0x91, 0xF6, 0x6F, 0xCC, 0x91, 0xF6, ++ 0x52, 0xCC, 0x8F, 0xF6, 0x72, 0xCC, 0x8F, 0xF6, ++ 0x52, 0xCC, 0x91, 0xF6, 0x72, 0xCC, 0x91, 0xF6, ++ 0x55, 0xCC, 0x8F, 0xF6, 0x75, 0xCC, 0x8F, 0xF6, ++ 0x55, 0xCC, 0x91, 0xF6, 0x75, 0xCC, 0x91, 0xF6, ++ 0x53, 0xCC, 0xA6, 0xF6, 0x73, 0xCC, 0xA6, 0xF6, ++ 0x54, 0xCC, 0xA6, 0xF6, 0x74, 0xCC, 0xA6, 0xF6, ++ 0x48, 0xCC, 0x8C, 0xF6, 0x68, 0xCC, 0x8C, 0xF6, ++ 0x41, 0xCC, 0x87, 0xF6, 0x61, 0xCC, 0x87, 0xF6, ++ 0x45, 0xCC, 0xA7, 0xF6, 0x65, 0xCC, 0xA7, 0xF6, ++ 0x4F, 0xCC, 0x88, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, ++ 0x88, 0xCC, 0x84, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, ++ 0x84, 0xF6, 0x6F, 0xCC, 0x83, 0xCC, 0x84, 0xF6, ++ 0x4F, 0xCC, 0x87, 0xF6, 0x6F, 0xCC, 0x87, 0xF6, ++ 0x4F, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, ++ 0x87, 0xCC, 0x84, 0xF6, 0x59, 0xCC, 0x84, 0xF6, ++ 0x79, 0xCC, 0x84, 0x68, 0xC9, 0xA6, 0x6A, 0x72, ++ 0xC9, 0xB9, 0xC9, 0xBB, 0xCA, 0x81, 0x77, 0x79, ++ 0x20, 0xCC, 0x86, 0x20, 0xCC, 0x87, 0x20, 0xCC, ++ 0x8A, 0x20, 0xCC, 0xA8, 0x20, 0xCC, 0x83, 0x20, ++ 0xCC, 0x8B, 0xC9, 0xA3, 0x6C, 0x73, 0x78, 0xCA, ++ 0x95, 0xF6, 0xCC, 0x80, 0xF6, 0xCC, 0x81, 0xF6, ++ 0xCC, 0x93, 0xF6, 0xCC, 0x88, 0xCC, 0x81, 0xF6, ++ 0xCA, 0xB9, 0x20, 0xCD, 0x85, 0xF6, 0x3B, 0x20, ++ 0xCC, 0x81, 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, ++ 0x20, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0xCE, 0x91, ++ 0xCC, 0x81, 0xF6, 0xC2, 0xB7, 0xF6, 0xCE, 0x95, ++ 0xCC, 0x81, 0xF6, 0xCE, 0x97, 0xCC, 0x81, 0xF6, ++ 0xCE, 0x99, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, ++ 0x81, 0xF6, 0xCE, 0xA5, 0xCC, 0x81, 0xF6, 0xCE, ++ 0xA9, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x88, ++ 0xCC, 0x81, 0xF6, 0xCE, 0x99, 0xCC, 0x88, 0xF6, ++ 0xCE, 0xA5, 0xCC, 0x88, 0xF6, 0xCE, 0xB1, 
0xCC, ++ 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x81, 0xF6, 0xCE, ++ 0xB7, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x81, ++ 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6, ++ 0xCE, 0xB9, 0xCC, 0x88, 0xF6, 0xCF, 0x85, 0xCC, ++ 0x88, 0xF6, 0xCE, 0xBF, 0xCC, 0x81, 0xF6, 0xCF, ++ 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, 0x81, ++ 0xCE, 0xB2, 0xCE, 0xB8, 0xCE, 0xA5, 0xF5, 0x05, ++ 0xCF, 0x92, 0xCC, 0x81, 0xCE, 0xA5, 0xCC, 0x81, ++ 0xF5, 0x05, 0xCF, 0x92, 0xCC, 0x88, 0xCE, 0xA5, ++ 0xCC, 0x88, 0xCF, 0x86, 0xCF, 0x80, 0xCE, 0xBA, ++ 0xCF, 0x81, 0xCF, 0x82, 0xCE, 0x98, 0xCE, 0xB5, ++ 0xCE, 0xA3, 0xF6, 0xD0, 0x95, 0xCC, 0x80, 0xF6, ++ 0xD0, 0x95, 0xCC, 0x88, 0xF6, 0xD0, 0x93, 0xCC, ++ 0x81, 0xF6, 0xD0, 0x86, 0xCC, 0x88, 0xF6, 0xD0, ++ 0x9A, 0xCC, 0x81, 0xF6, 0xD0, 0x98, 0xCC, 0x80, ++ 0xF6, 0xD0, 0xA3, 0xCC, 0x86, 0xF6, 0xD0, 0x98, ++ 0xCC, 0x86, 0xF6, 0xD0, 0xB8, 0xCC, 0x86, 0xF6, ++ 0xD0, 0xB5, 0xCC, 0x80, 0xF6, 0xD0, 0xB5, 0xCC, ++ 0x88, 0xF6, 0xD0, 0xB3, 0xCC, 0x81, 0xF6, 0xD1, ++ 0x96, 0xCC, 0x88, 0xF6, 0xD0, 0xBA, 0xCC, 0x81, ++ 0xF6, 0xD0, 0xB8, 0xCC, 0x80, 0xF6, 0xD1, 0x83, ++ 0xCC, 0x86, 0xF6, 0xD1, 0xB4, 0xCC, 0x8F, 0xF6, ++ 0xD1, 0xB5, 0xCC, 0x8F, 0xF6, 0xD0, 0x96, 0xCC, ++ 0x86, 0xF6, 0xD0, 0xB6, 0xCC, 0x86, 0xF6, 0xD0, ++ 0x90, 0xCC, 0x86, 0xF6, 0xD0, 0xB0, 0xCC, 0x86, ++ 0xF6, 0xD0, 0x90, 0xCC, 0x88, 0xF6, 0xD0, 0xB0, ++ 0xCC, 0x88, 0xF6, 0xD0, 0x95, 0xCC, 0x86, 0xF6, ++ 0xD0, 0xB5, 0xCC, 0x86, 0xF6, 0xD3, 0x98, 0xCC, ++ 0x88, 0xF6, 0xD3, 0x99, 0xCC, 0x88, 0xF6, 0xD0, ++ 0x96, 0xCC, 0x88, 0xF6, 0xD0, 0xB6, 0xCC, 0x88, ++ 0xF6, 0xD0, 0x97, 0xCC, 0x88, 0xF6, 0xD0, 0xB7, ++ 0xCC, 0x88, 0xF6, 0xD0, 0x98, 0xCC, 0x84, 0xF6, ++ 0xD0, 0xB8, 0xCC, 0x84, 0xF6, 0xD0, 0x98, 0xCC, ++ 0x88, 0xF6, 0xD0, 0xB8, 0xCC, 0x88, 0xF6, 0xD0, ++ 0x9E, 0xCC, 0x88, 0xF6, 0xD0, 0xBE, 0xCC, 0x88, ++ 0xF6, 0xD3, 0xA8, 0xCC, 0x88, 0xF6, 0xD3, 0xA9, ++ 0xCC, 0x88, 0xF6, 0xD0, 0xAD, 0xCC, 0x88, 0xF6, ++ 0xD1, 0x8D, 0xCC, 0x88, 0xF6, 0xD0, 0xA3, 0xCC, ++ 0x84, 0xF6, 0xD1, 0x83, 0xCC, 0x84, 0xF6, 0xD0, ++ 0xA3, 0xCC, 0x88, 0xF6, 0xD1, 0x83, 0xCC, 0x88, ++ 0xF6, 0xD0, 0xA3, 0xCC, 0x8B, 0xF6, 0xD1, 0x83, ++ 0xCC, 0x8B, 0xF6, 0xD0, 0xA7, 0xCC, 0x88, 0xF6, ++ 0xD1, 0x87, 0xCC, 0x88, 0xF6, 0xD0, 0xAB, 0xCC, ++ 0x88, 0xF6, 0xD1, 0x8B, 0xCC, 0x88, 0xD5, 0xA5, ++ 0xD6, 0x82, 0xF6, 0xD8, 0xA7, 0xD9, 0x93, 0xF6, ++ 0xD8, 0xA7, 0xD9, 0x94, 0xF6, 0xD9, 0x88, 0xD9, ++ 0x94, 0xF6, 0xD8, 0xA7, 0xD9, 0x95, 0xF6, 0xD9, ++ 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0xB4, 0xD9, ++ 0x88, 0xD9, 0xB4, 0xDB, 0x87, 0xD9, 0xB4, 0xD9, ++ 0x8A, 0xD9, 0xB4, 0xF6, 0xDB, 0x95, 0xD9, 0x94, ++ 0xF6, 0xDB, 0x81, 0xD9, 0x94, 0xF6, 0xDB, 0x92, ++ 0xD9, 0x94, 0xF6, 0xE0, 0xA4, 0xA8, 0xE0, 0xA4, ++ 0xBC, 0xF6, 0xE0, 0xA4, 0xB0, 0xE0, 0xA4, 0xBC, ++ 0xF6, 0xE0, 0xA4, 0xB3, 0xE0, 0xA4, 0xBC, 0xF6, ++ 0xE0, 0xA4, 0x95, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, ++ 0xA4, 0x96, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, ++ 0x97, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x9C, ++ 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0xA1, 0xE0, ++ 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0xA2, 0xE0, 0xA4, ++ 0xBC, 0xF6, 0xE0, 0xA4, 0xAB, 0xE0, 0xA4, 0xBC, ++ 0xF6, 0xE0, 0xA4, 0xAF, 0xE0, 0xA4, 0xBC, 0xF6, ++ 0xE0, 0xA7, 0x87, 0xE0, 0xA6, 0xBE, 0xF6, 0xE0, ++ 0xA7, 0x87, 0xE0, 0xA7, 0x97, 0xF6, 0xE0, 0xA6, ++ 0xA1, 0xE0, 0xA6, 0xBC, 0xF6, 0xE0, 0xA6, 0xA2, ++ 0xE0, 0xA6, 0xBC, 0xF6, 0xE0, 0xA6, 0xAF, 0xE0, ++ 0xA6, 0xBC, 0xF6, 0xE0, 0xA8, 0xB2, 0xE0, 0xA8, ++ 0xBC, 0xF6, 0xE0, 0xA8, 0xB8, 0xE0, 0xA8, 0xBC, ++ 0xF6, 0xE0, 0xA8, 0x96, 0xE0, 0xA8, 0xBC, 0xF6, ++ 0xE0, 0xA8, 0x97, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, ++ 0xA8, 0x9C, 0xE0, 0xA8, 
0xBC, 0xF6, 0xE0, 0xA8, ++ 0xAB, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xAD, 0x87, ++ 0xE0, 0xAD, 0x96, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, ++ 0xAC, 0xBE, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAD, ++ 0x97, 0xF6, 0xE0, 0xAC, 0xA1, 0xE0, 0xAC, 0xBC, ++ 0xF6, 0xE0, 0xAC, 0xA2, 0xE0, 0xAC, 0xBC, 0xF6, ++ 0xE0, 0xAE, 0x92, 0xE0, 0xAF, 0x97, 0xF6, 0xE0, ++ 0xAF, 0x86, 0xE0, 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, ++ 0x87, 0xE0, 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, 0x86, ++ 0xE0, 0xAF, 0x97, 0xF6, 0xE0, 0xB1, 0x86, 0xE0, ++ 0xB1, 0x96, 0xF6, 0xE0, 0xB2, 0xBF, 0xE0, 0xB3, ++ 0x95, 0xF6, 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x95, ++ 0xF6, 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x96, 0xF6, ++ 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x82, 0xF6, 0xE0, ++ 0xB3, 0x86, 0xE0, 0xB3, 0x82, 0xE0, 0xB3, 0x95, ++ 0xF6, 0xE0, 0xB5, 0x86, 0xE0, 0xB4, 0xBE, 0xF6, ++ 0xE0, 0xB5, 0x87, 0xE0, 0xB4, 0xBE, 0xF6, 0xE0, ++ 0xB5, 0x86, 0xE0, 0xB5, 0x97, 0xF6, 0xE0, 0xB7, ++ 0x99, 0xE0, 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, 0x99, ++ 0xE0, 0xB7, 0x8F, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, ++ 0xB7, 0x8F, 0xE0, 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, ++ 0x99, 0xE0, 0xB7, 0x9F, 0xE0, 0xB9, 0x8D, 0xE0, ++ 0xB8, 0xB2, 0xE0, 0xBB, 0x8D, 0xE0, 0xBA, 0xB2, ++ 0xE0, 0xBA, 0xAB, 0xE0, 0xBA, 0x99, 0xE0, 0xBA, ++ 0xAB, 0xE0, 0xBA, 0xA1, 0xE0, 0xBC, 0x8B, 0xF6, ++ 0xE0, 0xBD, 0x82, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, ++ 0xBD, 0x8C, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, ++ 0x91, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x96, ++ 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x9B, 0xE0, ++ 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x80, 0xE0, 0xBE, ++ 0xB5, 0xF6, 0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB2, ++ 0xF6, 0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB4, 0xF6, ++ 0xE0, 0xBE, 0xB2, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, ++ 0xB2, 0xE0, 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, ++ 0xE0, 0xBE, 0xB3, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, ++ 0xB3, 0xE0, 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, ++ 0xE0, 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, ++ 0xBE, 0x92, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, ++ 0x9C, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xA1, ++ 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xA6, 0xE0, ++ 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xAB, 0xE0, 0xBE, ++ 0xB7, 0xF6, 0xE0, 0xBE, 0x90, 0xE0, 0xBE, 0xB5, ++ 0xF6, 0xE1, 0x80, 0xA5, 0xE1, 0x80, 0xAE, 0xE1, ++ 0x83, 0x9C, 0xF6, 0xE1, 0xAC, 0x85, 0xE1, 0xAC, ++ 0xB5, 0xF6, 0xE1, 0xAC, 0x87, 0xE1, 0xAC, 0xB5, ++ 0xF6, 0xE1, 0xAC, 0x89, 0xE1, 0xAC, 0xB5, 0xF6, ++ 0xE1, 0xAC, 0x8B, 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, ++ 0xAC, 0x8D, 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, ++ 0x91, 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, 0xBA, ++ 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, 0xBC, 0xE1, ++ 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, 0xBE, 0xE1, 0xAC, ++ 0xB5, 0xF6, 0xE1, 0xAC, 0xBF, 0xE1, 0xAC, 0xB5, ++ 0xF6, 0xE1, 0xAD, 0x82, 0xE1, 0xAC, 0xB5, 0x41, ++ 0xC3, 0x86, 0x42, 0x44, 0x45, 0xC6, 0x8E, 0x47, ++ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, ++ 0xC8, 0xA2, 0x50, 0x52, 0x54, 0x55, 0x57, 0x61, ++ 0xC9, 0x90, 0xC9, 0x91, 0xE1, 0xB4, 0x82, 0x62, ++ 0x64, 0x65, 0xC9, 0x99, 0xC9, 0x9B, 0xC9, 0x9C, ++ 0x67, 0x6B, 0x6D, 0xC5, 0x8B, 0x6F, 0xC9, 0x94, ++ 0xE1, 0xB4, 0x96, 0xE1, 0xB4, 0x97, 0x70, 0x74, ++ 0x75, 0xE1, 0xB4, 0x9D, 0xC9, 0xAF, 0x76, 0xE1, ++ 0xB4, 0xA5, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, ++ 0xCF, 0x86, 0xCF, 0x87, 0x69, 0x72, 0x75, 0x76, ++ 0xCE, 0xB2, 0xCE, 0xB3, 0xCF, 0x81, 0xCF, 0x86, ++ 0xCF, 0x87, 0xD0, 0xBD, 0xC9, 0x92, 0x63, 0xC9, ++ 0x95, 0xC3, 0xB0, 0xC9, 0x9C, 0x66, 0xC9, 0x9F, ++ 0xC9, 0xA1, 0xC9, 0xA5, 0xC9, 0xA8, 0xC9, 0xA9, ++ 0xC9, 0xAA, 0xE1, 0xB5, 0xBB, 0xCA, 0x9D, 0xC9, ++ 0xAD, 0xE1, 0xB6, 0x85, 0xCA, 0x9F, 0xC9, 0xB1, ++ 0xC9, 0xB0, 0xC9, 0xB2, 0xC9, 0xB3, 0xC9, 0xB4, ++ 0xC9, 
0xB5, 0xC9, 0xB8, 0xCA, 0x82, 0xCA, 0x83, ++ 0xC6, 0xAB, 0xCA, 0x89, 0xCA, 0x8A, 0xE1, 0xB4, ++ 0x9C, 0xCA, 0x8B, 0xCA, 0x8C, 0x7A, 0xCA, 0x90, ++ 0xCA, 0x91, 0xCA, 0x92, 0xCE, 0xB8, 0xF6, 0x41, ++ 0xCC, 0xA5, 0xF6, 0x61, 0xCC, 0xA5, 0xF6, 0x42, ++ 0xCC, 0x87, 0xF6, 0x62, 0xCC, 0x87, 0xF6, 0x42, ++ 0xCC, 0xA3, 0xF6, 0x62, 0xCC, 0xA3, 0xF6, 0x42, ++ 0xCC, 0xB1, 0xF6, 0x62, 0xCC, 0xB1, 0xF6, 0x43, ++ 0xCC, 0xA7, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0xA7, ++ 0xCC, 0x81, 0xF6, 0x44, 0xCC, 0x87, 0xF6, 0x64, ++ 0xCC, 0x87, 0xF6, 0x44, 0xCC, 0xA3, 0xF6, 0x64, ++ 0xCC, 0xA3, 0xF6, 0x44, 0xCC, 0xB1, 0xF6, 0x64, ++ 0xCC, 0xB1, 0xF6, 0x44, 0xCC, 0xA7, 0xF6, 0x64, ++ 0xCC, 0xA7, 0xF6, 0x44, 0xCC, 0xAD, 0xF6, 0x64, ++ 0xCC, 0xAD, 0xF6, 0x45, 0xCC, 0x84, 0xCC, 0x80, ++ 0xF6, 0x65, 0xCC, 0x84, 0xCC, 0x80, 0xF6, 0x45, ++ 0xCC, 0x84, 0xCC, 0x81, 0xF6, 0x65, 0xCC, 0x84, ++ 0xCC, 0x81, 0xF6, 0x45, 0xCC, 0xAD, 0xF6, 0x65, ++ 0xCC, 0xAD, 0xF6, 0x45, 0xCC, 0xB0, 0xF6, 0x65, ++ 0xCC, 0xB0, 0xF6, 0x45, 0xCC, 0xA7, 0xCC, 0x86, ++ 0xF6, 0x65, 0xCC, 0xA7, 0xCC, 0x86, 0xF6, 0x46, ++ 0xCC, 0x87, 0xF6, 0x66, 0xCC, 0x87, 0xF6, 0x47, ++ 0xCC, 0x84, 0xF6, 0x67, 0xCC, 0x84, 0xF6, 0x48, ++ 0xCC, 0x87, 0xF6, 0x68, 0xCC, 0x87, 0xF6, 0x48, ++ 0xCC, 0xA3, 0xF6, 0x68, 0xCC, 0xA3, 0xF6, 0x48, ++ 0xCC, 0x88, 0xF6, 0x68, 0xCC, 0x88, 0xF6, 0x48, ++ 0xCC, 0xA7, 0xF6, 0x68, 0xCC, 0xA7, 0xF6, 0x48, ++ 0xCC, 0xAE, 0xF6, 0x68, 0xCC, 0xAE, 0xF6, 0x49, ++ 0xCC, 0xB0, 0xF6, 0x69, 0xCC, 0xB0, 0xF6, 0x49, ++ 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x69, 0xCC, 0x88, ++ 0xCC, 0x81, 0xF6, 0x4B, 0xCC, 0x81, 0xF6, 0x6B, ++ 0xCC, 0x81, 0xF6, 0x4B, 0xCC, 0xA3, 0xF6, 0x6B, ++ 0xCC, 0xA3, 0xF6, 0x4B, 0xCC, 0xB1, 0xF6, 0x6B, ++ 0xCC, 0xB1, 0xF6, 0x4C, 0xCC, 0xA3, 0xF6, 0x6C, ++ 0xCC, 0xA3, 0xF6, 0x4C, 0xCC, 0xA3, 0xCC, 0x84, ++ 0xF6, 0x6C, 0xCC, 0xA3, 0xCC, 0x84, 0xF6, 0x4C, ++ 0xCC, 0xB1, 0xF6, 0x6C, 0xCC, 0xB1, 0xF6, 0x4C, ++ 0xCC, 0xAD, 0xF6, 0x6C, 0xCC, 0xAD, 0xF6, 0x4D, ++ 0xCC, 0x81, 0xF6, 0x6D, 0xCC, 0x81, 0xF6, 0x4D, ++ 0xCC, 0x87, 0xF6, 0x6D, 0xCC, 0x87, 0xF6, 0x4D, ++ 0xCC, 0xA3, 0xF6, 0x6D, 0xCC, 0xA3, 0xF6, 0x4E, ++ 0xCC, 0x87, 0xF6, 0x6E, 0xCC, 0x87, 0xF6, 0x4E, ++ 0xCC, 0xA3, 0xF6, 0x6E, 0xCC, 0xA3, 0xF6, 0x4E, ++ 0xCC, 0xB1, 0xF6, 0x6E, 0xCC, 0xB1, 0xF6, 0x4E, ++ 0xCC, 0xAD, 0xF6, 0x6E, 0xCC, 0xAD, 0xF6, 0x4F, ++ 0xCC, 0x83, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x83, ++ 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, 0x88, ++ 0xF6, 0x6F, 0xCC, 0x83, 0xCC, 0x88, 0xF6, 0x4F, ++ 0xCC, 0x84, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x84, ++ 0xCC, 0x80, 0xF6, 0x4F, 0xCC, 0x84, 0xCC, 0x81, ++ 0xF6, 0x6F, 0xCC, 0x84, 0xCC, 0x81, 0xF6, 0x50, ++ 0xCC, 0x81, 0xF6, 0x70, 0xCC, 0x81, 0xF6, 0x50, ++ 0xCC, 0x87, 0xF6, 0x70, 0xCC, 0x87, 0xF6, 0x52, ++ 0xCC, 0x87, 0xF6, 0x72, 0xCC, 0x87, 0xF6, 0x52, ++ 0xCC, 0xA3, 0xF6, 0x72, 0xCC, 0xA3, 0xF6, 0x52, ++ 0xCC, 0xA3, 0xCC, 0x84, 0xF6, 0x72, 0xCC, 0xA3, ++ 0xCC, 0x84, 0xF6, 0x52, 0xCC, 0xB1, 0xF6, 0x72, ++ 0xCC, 0xB1, 0xF6, 0x53, 0xCC, 0x87, 0xF6, 0x73, ++ 0xCC, 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xF6, 0x73, ++ 0xCC, 0xA3, 0xF6, 0x53, 0xCC, 0x81, 0xCC, 0x87, ++ 0xF6, 0x73, 0xCC, 0x81, 0xCC, 0x87, 0xF6, 0x53, ++ 0xCC, 0x8C, 0xCC, 0x87, 0xF6, 0x73, 0xCC, 0x8C, ++ 0xCC, 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xCC, 0x87, ++ 0xF6, 0x73, 0xCC, 0xA3, 0xCC, 0x87, 0xF6, 0x54, ++ 0xCC, 0x87, 0xF6, 0x74, 0xCC, 0x87, 0xF6, 0x54, ++ 0xCC, 0xA3, 0xF6, 0x74, 0xCC, 0xA3, 0xF6, 0x54, ++ 0xCC, 0xB1, 0xF6, 0x74, 0xCC, 0xB1, 0xF6, 0x54, ++ 0xCC, 0xAD, 0xF6, 0x74, 0xCC, 0xAD, 0xF6, 0x55, ++ 0xCC, 0xA4, 0xF6, 0x75, 0xCC, 0xA4, 0xF6, 0x55, ++ 0xCC, 0xB0, 0xF6, 0x75, 0xCC, 0xB0, 0xF6, 
0x55, ++ 0xCC, 0xAD, 0xF6, 0x75, 0xCC, 0xAD, 0xF6, 0x55, ++ 0xCC, 0x83, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x83, ++ 0xCC, 0x81, 0xF6, 0x55, 0xCC, 0x84, 0xCC, 0x88, ++ 0xF6, 0x75, 0xCC, 0x84, 0xCC, 0x88, 0xF6, 0x56, ++ 0xCC, 0x83, 0xF6, 0x76, 0xCC, 0x83, 0xF6, 0x56, ++ 0xCC, 0xA3, 0xF6, 0x76, 0xCC, 0xA3, 0xF6, 0x57, ++ 0xCC, 0x80, 0xF6, 0x77, 0xCC, 0x80, 0xF6, 0x57, ++ 0xCC, 0x81, 0xF6, 0x77, 0xCC, 0x81, 0xF6, 0x57, ++ 0xCC, 0x88, 0xF6, 0x77, 0xCC, 0x88, 0xF6, 0x57, ++ 0xCC, 0x87, 0xF6, 0x77, 0xCC, 0x87, 0xF6, 0x57, ++ 0xCC, 0xA3, 0xF6, 0x77, 0xCC, 0xA3, 0xF6, 0x58, ++ 0xCC, 0x87, 0xF6, 0x78, 0xCC, 0x87, 0xF6, 0x58, ++ 0xCC, 0x88, 0xF6, 0x78, 0xCC, 0x88, 0xF6, 0x59, ++ 0xCC, 0x87, 0xF6, 0x79, 0xCC, 0x87, 0xF6, 0x5A, ++ 0xCC, 0x82, 0xF6, 0x7A, 0xCC, 0x82, 0xF6, 0x5A, ++ 0xCC, 0xA3, 0xF6, 0x7A, 0xCC, 0xA3, 0xF6, 0x5A, ++ 0xCC, 0xB1, 0xF6, 0x7A, 0xCC, 0xB1, 0xF6, 0x68, ++ 0xCC, 0xB1, 0xF6, 0x74, 0xCC, 0x88, 0xF6, 0x77, ++ 0xCC, 0x8A, 0xF6, 0x79, 0xCC, 0x8A, 0x61, 0xCA, ++ 0xBE, 0xF5, 0x05, 0xC5, 0xBF, 0xCC, 0x87, 0x73, ++ 0xCC, 0x87, 0xF6, 0x41, 0xCC, 0xA3, 0xF6, 0x61, ++ 0xCC, 0xA3, 0xF6, 0x41, 0xCC, 0x89, 0xF6, 0x61, ++ 0xCC, 0x89, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x81, ++ 0xF6, 0x61, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x41, ++ 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x61, 0xCC, 0x82, ++ 0xCC, 0x80, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x89, ++ 0xF6, 0x61, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x41, ++ 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x82, ++ 0xCC, 0x83, 0xF6, 0x41, 0xCC, 0xA3, 0xCC, 0x82, ++ 0xF6, 0x61, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x41, ++ 0xCC, 0x86, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x86, ++ 0xCC, 0x81, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x80, ++ 0xF6, 0x61, 0xCC, 0x86, 0xCC, 0x80, 0xF6, 0x41, ++ 0xCC, 0x86, 0xCC, 0x89, 0xF6, 0x61, 0xCC, 0x86, ++ 0xCC, 0x89, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x83, ++ 0xF6, 0x61, 0xCC, 0x86, 0xCC, 0x83, 0xF6, 0x41, ++ 0xCC, 0xA3, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0xA3, ++ 0xCC, 0x86, 0xF6, 0x45, 0xCC, 0xA3, 0xF6, 0x65, ++ 0xCC, 0xA3, 0xF6, 0x45, 0xCC, 0x89, 0xF6, 0x65, ++ 0xCC, 0x89, 0xF6, 0x45, 0xCC, 0x83, 0xF6, 0x65, ++ 0xCC, 0x83, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x81, ++ 0xF6, 0x65, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x45, ++ 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x82, ++ 0xCC, 0x80, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x89, ++ 0xF6, 0x65, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x45, ++ 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x65, 0xCC, 0x82, ++ 0xCC, 0x83, 0xF6, 0x45, 0xCC, 0xA3, 0xCC, 0x82, ++ 0xF6, 0x65, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x49, ++ 0xCC, 0x89, 0xF6, 0x69, 0xCC, 0x89, 0xF6, 0x49, ++ 0xCC, 0xA3, 0xF6, 0x69, 0xCC, 0xA3, 0xF6, 0x4F, ++ 0xCC, 0xA3, 0xF6, 0x6F, 0xCC, 0xA3, 0xF6, 0x4F, ++ 0xCC, 0x89, 0xF6, 0x6F, 0xCC, 0x89, 0xF6, 0x4F, ++ 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, ++ 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x80, ++ 0xF6, 0x6F, 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x4F, ++ 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x6F, 0xCC, 0x82, ++ 0xCC, 0x89, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x83, ++ 0xF6, 0x6F, 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x4F, ++ 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x6F, 0xCC, 0xA3, ++ 0xCC, 0x82, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x81, ++ 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, 0x81, 0xF6, 0x4F, ++ 0xCC, 0x9B, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x9B, ++ 0xCC, 0x80, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x89, ++ 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, 0x89, 0xF6, 0x4F, ++ 0xCC, 0x9B, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x9B, ++ 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0xA3, ++ 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, 0x55, ++ 0xCC, 0xA3, 0xF6, 0x75, 0xCC, 0xA3, 0xF6, 0x55, ++ 0xCC, 0x89, 0xF6, 0x75, 0xCC, 0x89, 0xF6, 0x55, ++ 0xCC, 0x9B, 0xCC, 0x81, 
0xF6, 0x75, 0xCC, 0x9B, ++ 0xCC, 0x81, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x80, ++ 0xF6, 0x75, 0xCC, 0x9B, 0xCC, 0x80, 0xF6, 0x55, ++ 0xCC, 0x9B, 0xCC, 0x89, 0xF6, 0x75, 0xCC, 0x9B, ++ 0xCC, 0x89, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x83, ++ 0xF6, 0x75, 0xCC, 0x9B, 0xCC, 0x83, 0xF6, 0x55, ++ 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, 0x75, 0xCC, 0x9B, ++ 0xCC, 0xA3, 0xF6, 0x59, 0xCC, 0x80, 0xF6, 0x79, ++ 0xCC, 0x80, 0xF6, 0x59, 0xCC, 0xA3, 0xF6, 0x79, ++ 0xCC, 0xA3, 0xF6, 0x59, 0xCC, 0x89, 0xF6, 0x79, ++ 0xCC, 0x89, 0xF6, 0x59, 0xCC, 0x83, 0xF6, 0x79, ++ 0xCC, 0x83, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xF6, ++ 0xCE, 0xB1, 0xCC, 0x94, 0xF6, 0xCE, 0xB1, 0xCC, ++ 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, ++ 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCC, ++ 0x81, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCC, 0x81, ++ 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xF6, ++ 0xCE, 0xB1, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, ++ 0x91, 0xCC, 0x93, 0xF6, 0xCE, 0x91, 0xCC, 0x94, ++ 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xF6, ++ 0xCE, 0x91, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, ++ 0x91, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0x91, ++ 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCC, ++ 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0x91, 0xCC, 0x94, ++ 0xCD, 0x82, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xF6, ++ 0xCE, 0xB5, 0xCC, 0x94, 0xF6, 0xCE, 0xB5, 0xCC, ++ 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x94, ++ 0xCC, 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xCC, ++ 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x94, 0xCC, 0x81, ++ 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xF6, 0xCE, 0x95, ++ 0xCC, 0x94, 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xCC, ++ 0x80, 0xF6, 0xCE, 0x95, 0xCC, 0x94, 0xCC, 0x80, ++ 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xCC, 0x81, 0xF6, ++ 0xCE, 0x95, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, ++ 0xB7, 0xCC, 0x93, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, ++ 0xF6, 0xCE, 0xB7, 0xCC, 0x93, 0xCC, 0x80, 0xF6, ++ 0xCE, 0xB7, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, ++ 0xB7, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, ++ 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, ++ 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, ++ 0xCD, 0x82, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xF6, ++ 0xCE, 0x97, 0xCC, 0x94, 0xF6, 0xCE, 0x97, 0xCC, ++ 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x94, ++ 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, ++ 0x81, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x81, ++ 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCD, 0x82, 0xF6, ++ 0xCE, 0x97, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, ++ 0xB9, 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, ++ 0xF6, 0xCE, 0xB9, 0xCC, 0x93, 0xCC, 0x80, 0xF6, ++ 0xCE, 0xB9, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, ++ 0xB9, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, ++ 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, ++ 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, ++ 0xCD, 0x82, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xF6, ++ 0xCE, 0x99, 0xCC, 0x94, 0xF6, 0xCE, 0x99, 0xCC, ++ 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x94, ++ 0xCC, 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xCC, ++ 0x81, 0xF6, 0xCE, 0x99, 0xCC, 0x94, 0xCC, 0x81, ++ 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xCD, 0x82, 0xF6, ++ 0xCE, 0x99, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, ++ 0xBF, 0xCC, 0x93, 0xF6, 0xCE, 0xBF, 0xCC, 0x94, ++ 0xF6, 0xCE, 0xBF, 0xCC, 0x93, 0xCC, 0x80, 0xF6, ++ 0xCE, 0xBF, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, ++ 0xBF, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xBF, ++ 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, ++ 0x93, 0xF6, 0xCE, 0x9F, 0xCC, 0x94, 0xF6, 0xCE, ++ 0x9F, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, ++ 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, ++ 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, 0x94, ++ 0xCC, 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xF6, ++ 0xCF, 
0x85, 0xCC, 0x94, 0xF6, 0xCF, 0x85, 0xCC, ++ 0x93, 0xCC, 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x94, ++ 0xCC, 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xCC, ++ 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x94, 0xCC, 0x81, ++ 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xCD, 0x82, 0xF6, ++ 0xCF, 0x85, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, ++ 0xA5, 0xCC, 0x94, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, ++ 0xCC, 0x80, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCC, ++ 0x81, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCD, 0x82, ++ 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xF6, 0xCF, 0x89, ++ 0xCC, 0x94, 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCC, ++ 0x80, 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x80, ++ 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCC, 0x81, 0xF6, ++ 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCF, ++ 0x89, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCF, 0x89, ++ 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xA9, 0xCC, ++ 0x93, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xF6, 0xCE, ++ 0xA9, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, ++ 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, ++ 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, ++ 0xCC, 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, ++ 0x82, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x82, ++ 0xF6, 0xCE, 0xB1, 0xCC, 0x80, 0xF6, 0xCE, 0xB1, ++ 0xCC, 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x80, 0xF6, ++ 0xCE, 0xB5, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, ++ 0x80, 0xF6, 0xCE, 0xB7, 0xCC, 0x81, 0xF6, 0xCE, ++ 0xB9, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, 0x81, ++ 0xF6, 0xCE, 0xBF, 0xCC, 0x80, 0xF6, 0xCE, 0xBF, ++ 0xCC, 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x80, 0xF6, ++ 0xCF, 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, ++ 0x80, 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xF6, 0xCE, ++ 0xB1, 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, ++ 0xCC, 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, ++ 0x93, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, ++ 0xCC, 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, ++ 0xB1, 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, ++ 0xCE, 0xB1, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, ++ 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xCD, ++ 0x85, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCD, 0x82, ++ 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCD, ++ 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCD, 0x85, ++ 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xCD, ++ 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCC, 0x80, ++ 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, ++ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, ++ 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, ++ 0x93, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x91, ++ 0xCC, 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, ++ 0xB7, 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, ++ 0xCC, 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, ++ 0x93, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, ++ 0xCC, 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, ++ 0xB7, 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, ++ 0xCE, 0xB7, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, ++ 0xF6, 0xCE, 0xB7, 0xCC, 0x93, 0xCD, 0x82, 0xCD, ++ 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xCD, 0x82, ++ 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCD, ++ 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCD, 0x85, ++ 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x80, 0xCD, ++ 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x80, ++ 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, ++ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, ++ 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, ++ 0x93, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x97, ++ 0xCC, 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCF, ++ 0x89, 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCF, 0x89, ++ 0xCC, 0x94, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, ++ 0x93, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, ++ 0xCC, 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, ++ 0x89, 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 
0xF6, ++ 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, ++ 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCD, 0x82, 0xCD, ++ 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCD, 0x82, ++ 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, ++ 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x85, ++ 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCC, 0x80, 0xCD, ++ 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, 0x80, ++ 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCC, ++ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, ++ 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, ++ 0x93, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, ++ 0xCC, 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, ++ 0xB1, 0xCC, 0x86, 0xF6, 0xCE, 0xB1, 0xCC, 0x84, ++ 0xF6, 0xCE, 0xB1, 0xCC, 0x80, 0xCD, 0x85, 0xF6, ++ 0xCE, 0xB1, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, ++ 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCD, 0x82, ++ 0xF6, 0xCE, 0xB1, 0xCD, 0x82, 0xCD, 0x85, 0xF6, ++ 0xCE, 0x91, 0xCC, 0x86, 0xF6, 0xCE, 0x91, 0xCC, ++ 0x84, 0xF6, 0xCE, 0x91, 0xCC, 0x80, 0xF6, 0xCE, ++ 0x91, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCD, 0x85, ++ 0x20, 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0x20, 0xCC, ++ 0x93, 0x20, 0xCD, 0x82, 0xF5, 0x05, 0xC2, 0xA8, ++ 0xCD, 0x82, 0x20, 0xCC, 0x88, 0xCD, 0x82, 0xF6, ++ 0xCE, 0xB7, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, ++ 0xB7, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x81, ++ 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCD, 0x82, 0xF6, ++ 0xCE, 0xB7, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, ++ 0x95, 0xCC, 0x80, 0xF6, 0xCE, 0x95, 0xCC, 0x81, ++ 0xF6, 0xCE, 0x97, 0xCC, 0x80, 0xF6, 0xCE, 0x97, ++ 0xCC, 0x81, 0xF6, 0xCE, 0x97, 0xCD, 0x85, 0xF5, ++ 0x06, 0xE1, 0xBE, 0xBF, 0xCC, 0x80, 0x20, 0xCC, ++ 0x93, 0xCC, 0x80, 0xF5, 0x06, 0xE1, 0xBE, 0xBF, ++ 0xCC, 0x81, 0x20, 0xCC, 0x93, 0xCC, 0x81, 0xF5, ++ 0x06, 0xE1, 0xBE, 0xBF, 0xCD, 0x82, 0x20, 0xCC, ++ 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x86, ++ 0xF6, 0xCE, 0xB9, 0xCC, 0x84, 0xF6, 0xCE, 0xB9, ++ 0xCC, 0x88, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, ++ 0x88, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCD, 0x82, ++ 0xF6, 0xCE, 0xB9, 0xCC, 0x88, 0xCD, 0x82, 0xF6, ++ 0xCE, 0x99, 0xCC, 0x86, 0xF6, 0xCE, 0x99, 0xCC, ++ 0x84, 0xF6, 0xCE, 0x99, 0xCC, 0x80, 0xF6, 0xCE, ++ 0x99, 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, ++ 0xCC, 0x80, 0x20, 0xCC, 0x94, 0xCC, 0x80, 0xF5, ++ 0x06, 0xE1, 0xBF, 0xBE, 0xCC, 0x81, 0x20, 0xCC, ++ 0x94, 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, ++ 0xCD, 0x82, 0x20, 0xCC, 0x94, 0xCD, 0x82, 0xF6, ++ 0xCF, 0x85, 0xCC, 0x86, 0xF6, 0xCF, 0x85, 0xCC, ++ 0x84, 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x80, ++ 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6, ++ 0xCF, 0x81, 0xCC, 0x93, 0xF6, 0xCF, 0x81, 0xCC, ++ 0x94, 0xF6, 0xCF, 0x85, 0xCD, 0x82, 0xF6, 0xCF, ++ 0x85, 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, 0xA5, ++ 0xCC, 0x86, 0xF6, 0xCE, 0xA5, 0xCC, 0x84, 0xF6, ++ 0xCE, 0xA5, 0xCC, 0x80, 0xF6, 0xCE, 0xA5, 0xCC, ++ 0x81, 0xF6, 0xCE, 0xA1, 0xCC, 0x94, 0xF5, 0x05, ++ 0xC2, 0xA8, 0xCC, 0x80, 0x20, 0xCC, 0x88, 0xCC, ++ 0x80, 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, 0x20, ++ 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x60, 0xF6, 0xCF, ++ 0x89, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, ++ 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xCD, ++ 0x85, 0xF6, 0xCF, 0x89, 0xCD, 0x82, 0xF6, 0xCF, ++ 0x89, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x9F, ++ 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, 0x81, 0xF6, ++ 0xCE, 0xA9, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, ++ 0x81, 0xF6, 0xCE, 0xA9, 0xCD, 0x85, 0xF5, 0x03, ++ 0xC2, 0xB4, 0x20, 0xCC, 0x81, 0x20, 0xCC, 0x94, ++ 0xF5, 0x04, 0xE2, 0x80, 0x82, 0x20, 0xF5, 0x04, ++ 0xE2, 0x80, 0x83, 0x20, 0x20, 0x20, 0x20, 0x20, ++ 0x20, 0x20, 0x20, 0x20, 0x20, 0xE2, 0x80, 0x90, ++ 0x20, 0xCC, 0xB3, 0x2E, 
0x2E, 0x2E, 0x2E, 0x2E, ++ 0x2E, 0x20, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, ++ 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0xE2, 0x80, ++ 0xB2, 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0xE2, ++ 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, ++ 0x21, 0x21, 0x20, 0xCC, 0x85, 0x3F, 0x3F, 0x3F, ++ 0x21, 0x21, 0x3F, 0xE2, 0x80, 0xB2, 0xE2, 0x80, ++ 0xB2, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0x20, ++ 0x30, 0x69, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, ++ 0x2B, 0xE2, 0x88, 0x92, 0x3D, 0x28, 0x29, 0x6E, ++ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, ++ 0x38, 0x39, 0x2B, 0xE2, 0x88, 0x92, 0x3D, 0x28, ++ 0x29, 0x61, 0x65, 0x6F, 0x78, 0xC9, 0x99, 0x52, ++ 0x73, 0x61, 0x2F, 0x63, 0x61, 0x2F, 0x73, 0x43, ++ 0xC2, 0xB0, 0x43, 0x63, 0x2F, 0x6F, 0x63, 0x2F, ++ 0x75, 0xC6, 0x90, 0xC2, 0xB0, 0x46, 0x67, 0x48, ++ 0x48, 0x48, 0x68, 0xC4, 0xA7, 0x49, 0x49, 0x4C, ++ 0x6C, 0x4E, 0x4E, 0x6F, 0x50, 0x51, 0x52, 0x52, ++ 0x52, 0x53, 0x4D, 0x54, 0x45, 0x4C, 0x54, 0x4D, ++ 0x5A, 0xF6, 0xCE, 0xA9, 0x5A, 0xF6, 0x4B, 0xF6, ++ 0x41, 0xCC, 0x8A, 0x42, 0x43, 0x65, 0x45, 0x46, ++ 0x4D, 0x6F, 0xD7, 0x90, 0xD7, 0x91, 0xD7, 0x92, ++ 0xD7, 0x93, 0x69, 0x46, 0x41, 0x58, 0xCF, 0x80, ++ 0xCE, 0xB3, 0xCE, 0x93, 0xCE, 0xA0, 0xE2, 0x88, ++ 0x91, 0x44, 0x64, 0x65, 0x69, 0x6A, 0x31, 0xE2, ++ 0x81, 0x84, 0x33, 0x32, 0xE2, 0x81, 0x84, 0x33, ++ 0x31, 0xE2, 0x81, 0x84, 0x35, 0x32, 0xE2, 0x81, ++ 0x84, 0x35, 0x33, 0xE2, 0x81, 0x84, 0x35, 0x34, ++ 0xE2, 0x81, 0x84, 0x35, 0x31, 0xE2, 0x81, 0x84, ++ 0x36, 0x35, 0xE2, 0x81, 0x84, 0x36, 0x31, 0xE2, ++ 0x81, 0x84, 0x38, 0x33, 0xE2, 0x81, 0x84, 0x38, ++ 0x35, 0xE2, 0x81, 0x84, 0x38, 0x37, 0xE2, 0x81, ++ 0x84, 0x38, 0x31, 0xE2, 0x81, 0x84, 0x49, 0x49, ++ 0x49, 0x49, 0x49, 0x49, 0x49, 0x56, 0x56, 0x56, ++ 0x49, 0x56, 0x49, 0x49, 0x56, 0x49, 0x49, 0x49, ++ 0x49, 0x58, 0x58, 0x58, 0x49, 0x58, 0x49, 0x49, ++ 0x4C, 0x43, 0x44, 0x4D, 0x69, 0x69, 0x69, 0x69, ++ 0x69, 0x69, 0x69, 0x76, 0x76, 0x76, 0x69, 0x76, ++ 0x69, 0x69, 0x76, 0x69, 0x69, 0x69, 0x69, 0x78, ++ 0x78, 0x78, 0x69, 0x78, 0x69, 0x69, 0x6C, 0x63, ++ 0x64, 0x6D, 0xF6, 0xE2, 0x86, 0x90, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x86, 0x92, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x86, 0x94, 0xCC, 0xB8, 0xF6, 0xE2, 0x87, 0x90, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x87, 0x94, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x87, 0x92, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x88, 0x83, 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0x88, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0x8B, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x88, 0xA3, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x88, 0xA5, 0xCC, 0xB8, 0xE2, 0x88, 0xAB, 0xE2, ++ 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, ++ 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAE, 0xE2, 0x88, ++ 0xAE, 0xE2, 0x88, 0xAE, 0xE2, 0x88, 0xAE, 0xE2, ++ 0x88, 0xAE, 0xF6, 0xE2, 0x88, 0xBC, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x89, 0x83, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x89, 0x85, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x88, ++ 0xCC, 0xB8, 0xF6, 0x3D, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x89, 0xA1, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x8D, ++ 0xCC, 0xB8, 0xF6, 0x3C, 0xCC, 0xB8, 0xF6, 0x3E, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xA4, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x89, 0xA5, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x89, 0xB2, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB3, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB6, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x89, 0xB7, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x89, 0xBA, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBB, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x82, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x8A, 0x83, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x8A, 0x86, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x87, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xA2, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x8A, 0xA8, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x8A, 0xA9, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xAB, ++ 0xCC, 
0xB8, 0xF6, 0xE2, 0x89, 0xBC, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x89, 0xBD, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x8A, 0x91, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x92, ++ 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB2, 0xCC, 0xB8, ++ 0xF6, 0xE2, 0x8A, 0xB3, 0xCC, 0xB8, 0xF6, 0xE2, ++ 0x8A, 0xB4, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB5, ++ 0xCC, 0xB8, 0xF6, 0xE3, 0x80, 0x88, 0xF6, 0xE3, ++ 0x80, 0x89, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, ++ 0x37, 0x38, 0x39, 0x31, 0x30, 0x31, 0x31, 0x31, ++ 0x32, 0x31, 0x33, 0x31, 0x34, 0x31, 0x35, 0x31, ++ 0x36, 0x31, 0x37, 0x31, 0x38, 0x31, 0x39, 0x32, ++ 0x30, 0x28, 0x31, 0x29, 0x28, 0x32, 0x29, 0x28, ++ 0x33, 0x29, 0x28, 0x34, 0x29, 0x28, 0x35, 0x29, ++ 0x28, 0x36, 0x29, 0x28, 0x37, 0x29, 0x28, 0x38, ++ 0x29, 0x28, 0x39, 0x29, 0x28, 0x31, 0x30, 0x29, ++ 0x28, 0x31, 0x31, 0x29, 0x28, 0x31, 0x32, 0x29, ++ 0x28, 0x31, 0x33, 0x29, 0x28, 0x31, 0x34, 0x29, ++ 0x28, 0x31, 0x35, 0x29, 0x28, 0x31, 0x36, 0x29, ++ 0x28, 0x31, 0x37, 0x29, 0x28, 0x31, 0x38, 0x29, ++ 0x28, 0x31, 0x39, 0x29, 0x28, 0x32, 0x30, 0x29, ++ 0x31, 0x2E, 0x32, 0x2E, 0x33, 0x2E, 0x34, 0x2E, ++ 0x35, 0x2E, 0x36, 0x2E, 0x37, 0x2E, 0x38, 0x2E, ++ 0x39, 0x2E, 0x31, 0x30, 0x2E, 0x31, 0x31, 0x2E, ++ 0x31, 0x32, 0x2E, 0x31, 0x33, 0x2E, 0x31, 0x34, ++ 0x2E, 0x31, 0x35, 0x2E, 0x31, 0x36, 0x2E, 0x31, ++ 0x37, 0x2E, 0x31, 0x38, 0x2E, 0x31, 0x39, 0x2E, ++ 0x32, 0x30, 0x2E, 0x28, 0x61, 0x29, 0x28, 0x62, ++ 0x29, 0x28, 0x63, 0x29, 0x28, 0x64, 0x29, 0x28, ++ 0x65, 0x29, 0x28, 0x66, 0x29, 0x28, 0x67, 0x29, ++ 0x28, 0x68, 0x29, 0x28, 0x69, 0x29, 0x28, 0x6A, ++ 0x29, 0x28, 0x6B, 0x29, 0x28, 0x6C, 0x29, 0x28, ++ 0x6D, 0x29, 0x28, 0x6E, 0x29, 0x28, 0x6F, 0x29, ++ 0x28, 0x70, 0x29, 0x28, 0x71, 0x29, 0x28, 0x72, ++ 0x29, 0x28, 0x73, 0x29, 0x28, 0x74, 0x29, 0x28, ++ 0x75, 0x29, 0x28, 0x76, 0x29, 0x28, 0x77, 0x29, ++ 0x28, 0x78, 0x29, 0x28, 0x79, 0x29, 0x28, 0x7A, ++ 0x29, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, ++ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, ++ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, ++ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, ++ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, ++ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, ++ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x30, 0xE2, 0x88, ++ 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, ++ 0x88, 0xAB, 0x3A, 0x3A, 0x3D, 0x3D, 0x3D, 0x3D, ++ 0x3D, 0x3D, 0xF6, 0xE2, 0xAB, 0x9D, 0xCC, 0xB8, ++ 0xE2, 0xB5, 0xA1, 0xE6, 0xAF, 0x8D, 0xE9, 0xBE, ++ 0x9F, 0xE4, 0xB8, 0x80, 0xE4, 0xB8, 0xA8, 0xE4, ++ 0xB8, 0xB6, 0xE4, 0xB8, 0xBF, 0xE4, 0xB9, 0x99, ++ 0xE4, 0xBA, 0x85, 0xE4, 0xBA, 0x8C, 0xE4, 0xBA, ++ 0xA0, 0xE4, 0xBA, 0xBA, 0xE5, 0x84, 0xBF, 0xE5, ++ 0x85, 0xA5, 0xE5, 0x85, 0xAB, 0xE5, 0x86, 0x82, ++ 0xE5, 0x86, 0x96, 0xE5, 0x86, 0xAB, 0xE5, 0x87, ++ 0xA0, 0xE5, 0x87, 0xB5, 0xE5, 0x88, 0x80, 0xE5, ++ 0x8A, 0x9B, 0xE5, 0x8B, 0xB9, 0xE5, 0x8C, 0x95, ++ 0xE5, 0x8C, 0x9A, 0xE5, 0x8C, 0xB8, 0xE5, 0x8D, ++ 0x81, 0xE5, 0x8D, 0x9C, 0xE5, 0x8D, 0xA9, 0xE5, ++ 0x8E, 0x82, 0xE5, 0x8E, 0xB6, 0xE5, 0x8F, 0x88, ++ 0xE5, 0x8F, 0xA3, 0xE5, 0x9B, 0x97, 0xE5, 0x9C, ++ 0x9F, 0xE5, 0xA3, 0xAB, 0xE5, 0xA4, 0x82, 0xE5, ++ 0xA4, 0x8A, 0xE5, 0xA4, 0x95, 0xE5, 0xA4, 0xA7, ++ 0xE5, 0xA5, 0xB3, 0xE5, 0xAD, 0x90, 0xE5, 0xAE, ++ 0x80, 0xE5, 0xAF, 0xB8, 0xE5, 0xB0, 0x8F, 0xE5, ++ 0xB0, 0xA2, 0xE5, 0xB0, 0xB8, 0xE5, 0xB1, 0xAE, ++ 0xE5, 0xB1, 0xB1, 0xE5, 0xB7, 0x9B, 0xE5, 0xB7, ++ 0xA5, 0xE5, 0xB7, 0xB1, 0xE5, 0xB7, 0xBE, 0xE5, ++ 0xB9, 0xB2, 0xE5, 0xB9, 0xBA, 0xE5, 0xB9, 0xBF, ++ 0xE5, 0xBB, 0xB4, 0xE5, 0xBB, 0xBE, 0xE5, 0xBC, ++ 0x8B, 0xE5, 0xBC, 0x93, 0xE5, 0xBD, 0x90, 0xE5, ++ 0xBD, 0xA1, 0xE5, 0xBD, 0xB3, 0xE5, 0xBF, 
0x83, ++ 0xE6, 0x88, 0x88, 0xE6, 0x88, 0xB6, 0xE6, 0x89, ++ 0x8B, 0xE6, 0x94, 0xAF, 0xE6, 0x94, 0xB4, 0xE6, ++ 0x96, 0x87, 0xE6, 0x96, 0x97, 0xE6, 0x96, 0xA4, ++ 0xE6, 0x96, 0xB9, 0xE6, 0x97, 0xA0, 0xE6, 0x97, ++ 0xA5, 0xE6, 0x9B, 0xB0, 0xE6, 0x9C, 0x88, 0xE6, ++ 0x9C, 0xA8, 0xE6, 0xAC, 0xA0, 0xE6, 0xAD, 0xA2, ++ 0xE6, 0xAD, 0xB9, 0xE6, 0xAE, 0xB3, 0xE6, 0xAF, ++ 0x8B, 0xE6, 0xAF, 0x94, 0xE6, 0xAF, 0x9B, 0xE6, ++ 0xB0, 0x8F, 0xE6, 0xB0, 0x94, 0xE6, 0xB0, 0xB4, ++ 0xE7, 0x81, 0xAB, 0xE7, 0x88, 0xAA, 0xE7, 0x88, ++ 0xB6, 0xE7, 0x88, 0xBB, 0xE7, 0x88, 0xBF, 0xE7, ++ 0x89, 0x87, 0xE7, 0x89, 0x99, 0xE7, 0x89, 0x9B, ++ 0xE7, 0x8A, 0xAC, 0xE7, 0x8E, 0x84, 0xE7, 0x8E, ++ 0x89, 0xE7, 0x93, 0x9C, 0xE7, 0x93, 0xA6, 0xE7, ++ 0x94, 0x98, 0xE7, 0x94, 0x9F, 0xE7, 0x94, 0xA8, ++ 0xE7, 0x94, 0xB0, 0xE7, 0x96, 0x8B, 0xE7, 0x96, ++ 0x92, 0xE7, 0x99, 0xB6, 0xE7, 0x99, 0xBD, 0xE7, ++ 0x9A, 0xAE, 0xE7, 0x9A, 0xBF, 0xE7, 0x9B, 0xAE, ++ 0xE7, 0x9F, 0x9B, 0xE7, 0x9F, 0xA2, 0xE7, 0x9F, ++ 0xB3, 0xE7, 0xA4, 0xBA, 0xE7, 0xA6, 0xB8, 0xE7, ++ 0xA6, 0xBE, 0xE7, 0xA9, 0xB4, 0xE7, 0xAB, 0x8B, ++ 0xE7, 0xAB, 0xB9, 0xE7, 0xB1, 0xB3, 0xE7, 0xB3, ++ 0xB8, 0xE7, 0xBC, 0xB6, 0xE7, 0xBD, 0x91, 0xE7, ++ 0xBE, 0x8A, 0xE7, 0xBE, 0xBD, 0xE8, 0x80, 0x81, ++ 0xE8, 0x80, 0x8C, 0xE8, 0x80, 0x92, 0xE8, 0x80, ++ 0xB3, 0xE8, 0x81, 0xBF, 0xE8, 0x82, 0x89, 0xE8, ++ 0x87, 0xA3, 0xE8, 0x87, 0xAA, 0xE8, 0x87, 0xB3, ++ 0xE8, 0x87, 0xBC, 0xE8, 0x88, 0x8C, 0xE8, 0x88, ++ 0x9B, 0xE8, 0x88, 0x9F, 0xE8, 0x89, 0xAE, 0xE8, ++ 0x89, 0xB2, 0xE8, 0x89, 0xB8, 0xE8, 0x99, 0x8D, ++ 0xE8, 0x99, 0xAB, 0xE8, 0xA1, 0x80, 0xE8, 0xA1, ++ 0x8C, 0xE8, 0xA1, 0xA3, 0xE8, 0xA5, 0xBE, 0xE8, ++ 0xA6, 0x8B, 0xE8, 0xA7, 0x92, 0xE8, 0xA8, 0x80, ++ 0xE8, 0xB0, 0xB7, 0xE8, 0xB1, 0x86, 0xE8, 0xB1, ++ 0x95, 0xE8, 0xB1, 0xB8, 0xE8, 0xB2, 0x9D, 0xE8, ++ 0xB5, 0xA4, 0xE8, 0xB5, 0xB0, 0xE8, 0xB6, 0xB3, ++ 0xE8, 0xBA, 0xAB, 0xE8, 0xBB, 0x8A, 0xE8, 0xBE, ++ 0x9B, 0xE8, 0xBE, 0xB0, 0xE8, 0xBE, 0xB5, 0xE9, ++ 0x82, 0x91, 0xE9, 0x85, 0x89, 0xE9, 0x87, 0x86, ++ 0xE9, 0x87, 0x8C, 0xE9, 0x87, 0x91, 0xE9, 0x95, ++ 0xB7, 0xE9, 0x96, 0x80, 0xE9, 0x98, 0x9C, 0xE9, ++ 0x9A, 0xB6, 0xE9, 0x9A, 0xB9, 0xE9, 0x9B, 0xA8, ++ 0xE9, 0x9D, 0x91, 0xE9, 0x9D, 0x9E, 0xE9, 0x9D, ++ 0xA2, 0xE9, 0x9D, 0xA9, 0xE9, 0x9F, 0x8B, 0xE9, ++ 0x9F, 0xAD, 0xE9, 0x9F, 0xB3, 0xE9, 0xA0, 0x81, ++ 0xE9, 0xA2, 0xA8, 0xE9, 0xA3, 0x9B, 0xE9, 0xA3, ++ 0x9F, 0xE9, 0xA6, 0x96, 0xE9, 0xA6, 0x99, 0xE9, ++ 0xA6, 0xAC, 0xE9, 0xAA, 0xA8, 0xE9, 0xAB, 0x98, ++ 0xE9, 0xAB, 0x9F, 0xE9, 0xAC, 0xA5, 0xE9, 0xAC, ++ 0xAF, 0xE9, 0xAC, 0xB2, 0xE9, 0xAC, 0xBC, 0xE9, ++ 0xAD, 0x9A, 0xE9, 0xB3, 0xA5, 0xE9, 0xB9, 0xB5, ++ 0xE9, 0xB9, 0xBF, 0xE9, 0xBA, 0xA5, 0xE9, 0xBA, ++ 0xBB, 0xE9, 0xBB, 0x83, 0xE9, 0xBB, 0x8D, 0xE9, ++ 0xBB, 0x91, 0xE9, 0xBB, 0xB9, 0xE9, 0xBB, 0xBD, ++ 0xE9, 0xBC, 0x8E, 0xE9, 0xBC, 0x93, 0xE9, 0xBC, ++ 0xA0, 0xE9, 0xBC, 0xBB, 0xE9, 0xBD, 0x8A, 0xE9, ++ 0xBD, 0x92, 0xE9, 0xBE, 0x8D, 0xE9, 0xBE, 0x9C, ++ 0xE9, 0xBE, 0xA0, 0x20, 0xE3, 0x80, 0x92, 0xE5, ++ 0x8D, 0x81, 0xE5, 0x8D, 0x84, 0xE5, 0x8D, 0x85, ++ 0xF6, 0xE3, 0x81, 0x8B, 0xE3, 0x82, 0x99, 0xF6, ++ 0xE3, 0x81, 0x8D, 0xE3, 0x82, 0x99, 0xF6, 0xE3, ++ 0x81, 0x8F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, ++ 0x91, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x93, ++ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x95, 0xE3, ++ 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x97, 0xE3, 0x82, ++ 0x99, 0xF6, 0xE3, 0x81, 0x99, 0xE3, 0x82, 0x99, ++ 0xF6, 0xE3, 0x81, 0x9B, 0xE3, 0x82, 0x99, 0xF6, ++ 0xE3, 0x81, 0x9D, 0xE3, 0x82, 0x99, 0xF6, 0xE3, ++ 0x81, 0x9F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, ++ 0xA1, 0xE3, 0x82, 0x99, 
0xF6, 0xE3, 0x81, 0xA4, ++ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA6, 0xE3, ++ 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA8, 0xE3, 0x82, ++ 0x99, 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, 0x99, ++ 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, 0x9A, 0xF6, ++ 0xE3, 0x81, 0xB2, 0xE3, 0x82, 0x99, 0xF6, 0xE3, ++ 0x81, 0xB2, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x81, ++ 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xB5, ++ 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x81, 0xB8, 0xE3, ++ 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xB8, 0xE3, 0x82, ++ 0x9A, 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, 0x99, ++ 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, 0x9A, 0xF6, ++ 0xE3, 0x81, 0x86, 0xE3, 0x82, 0x99, 0x20, 0xE3, ++ 0x82, 0x99, 0x20, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, ++ 0x82, 0x9D, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0x88, ++ 0xE3, 0x82, 0x8A, 0xF6, 0xE3, 0x82, 0xAB, 0xE3, ++ 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xAD, 0xE3, 0x82, ++ 0x99, 0xF6, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, ++ 0xF6, 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0x99, 0xF6, ++ 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0x99, 0xF6, 0xE3, ++ 0x82, 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, ++ 0xB7, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xB9, ++ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBB, 0xE3, ++ 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBD, 0xE3, 0x82, ++ 0x99, 0xF6, 0xE3, 0x82, 0xBF, 0xE3, 0x82, 0x99, ++ 0xF6, 0xE3, 0x83, 0x81, 0xE3, 0x82, 0x99, 0xF6, ++ 0xE3, 0x83, 0x84, 0xE3, 0x82, 0x99, 0xF6, 0xE3, ++ 0x83, 0x86, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, ++ 0x88, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x8F, ++ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x8F, 0xE3, ++ 0x82, 0x9A, 0xF6, 0xE3, 0x83, 0x92, 0xE3, 0x82, ++ 0x99, 0xF6, 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x9A, ++ 0xF6, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x99, 0xF6, ++ 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, ++ 0x83, 0x98, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, ++ 0x98, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x83, 0x9B, ++ 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x9B, 0xE3, ++ 0x82, 0x9A, 0xF6, 0xE3, 0x82, 0xA6, 0xE3, 0x82, ++ 0x99, 0xF6, 0xE3, 0x83, 0xAF, 0xE3, 0x82, 0x99, ++ 0xF6, 0xE3, 0x83, 0xB0, 0xE3, 0x82, 0x99, 0xF6, ++ 0xE3, 0x83, 0xB1, 0xE3, 0x82, 0x99, 0xF6, 0xE3, ++ 0x83, 0xB2, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, ++ 0xBD, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xB3, 0xE3, ++ 0x83, 0x88, 0xE1, 0x84, 0x80, 0xE1, 0x84, 0x81, ++ 0xE1, 0x86, 0xAA, 0xE1, 0x84, 0x82, 0xE1, 0x86, ++ 0xAC, 0xE1, 0x86, 0xAD, 0xE1, 0x84, 0x83, 0xE1, ++ 0x84, 0x84, 0xE1, 0x84, 0x85, 0xE1, 0x86, 0xB0, ++ 0xE1, 0x86, 0xB1, 0xE1, 0x86, 0xB2, 0xE1, 0x86, ++ 0xB3, 0xE1, 0x86, 0xB4, 0xE1, 0x86, 0xB5, 0xE1, ++ 0x84, 0x9A, 0xE1, 0x84, 0x86, 0xE1, 0x84, 0x87, ++ 0xE1, 0x84, 0x88, 0xE1, 0x84, 0xA1, 0xE1, 0x84, ++ 0x89, 0xE1, 0x84, 0x8A, 0xE1, 0x84, 0x8B, 0xE1, ++ 0x84, 0x8C, 0xE1, 0x84, 0x8D, 0xE1, 0x84, 0x8E, ++ 0xE1, 0x84, 0x8F, 0xE1, 0x84, 0x90, 0xE1, 0x84, ++ 0x91, 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, ++ 0x85, 0xA2, 0xE1, 0x85, 0xA3, 0xE1, 0x85, 0xA4, ++ 0xE1, 0x85, 0xA5, 0xE1, 0x85, 0xA6, 0xE1, 0x85, ++ 0xA7, 0xE1, 0x85, 0xA8, 0xE1, 0x85, 0xA9, 0xE1, ++ 0x85, 0xAA, 0xE1, 0x85, 0xAB, 0xE1, 0x85, 0xAC, ++ 0xE1, 0x85, 0xAD, 0xE1, 0x85, 0xAE, 0xE1, 0x85, ++ 0xAF, 0xE1, 0x85, 0xB0, 0xE1, 0x85, 0xB1, 0xE1, ++ 0x85, 0xB2, 0xE1, 0x85, 0xB3, 0xE1, 0x85, 0xB4, ++ 0xE1, 0x85, 0xB5, 0xE1, 0x85, 0xA0, 0xE1, 0x84, ++ 0x94, 0xE1, 0x84, 0x95, 0xE1, 0x87, 0x87, 0xE1, ++ 0x87, 0x88, 0xE1, 0x87, 0x8C, 0xE1, 0x87, 0x8E, ++ 0xE1, 0x87, 0x93, 0xE1, 0x87, 0x97, 0xE1, 0x87, ++ 0x99, 0xE1, 0x84, 0x9C, 0xE1, 0x87, 0x9D, 0xE1, ++ 0x87, 0x9F, 0xE1, 0x84, 0x9D, 0xE1, 0x84, 0x9E, ++ 0xE1, 0x84, 0xA0, 0xE1, 0x84, 0xA2, 0xE1, 0x84, ++ 0xA3, 0xE1, 0x84, 0xA7, 0xE1, 0x84, 0xA9, 0xE1, ++ 0x84, 
0xAB, 0xE1, 0x84, 0xAC, 0xE1, 0x84, 0xAD, ++ 0xE1, 0x84, 0xAE, 0xE1, 0x84, 0xAF, 0xE1, 0x84, ++ 0xB2, 0xE1, 0x84, 0xB6, 0xE1, 0x85, 0x80, 0xE1, ++ 0x85, 0x87, 0xE1, 0x85, 0x8C, 0xE1, 0x87, 0xB1, ++ 0xE1, 0x87, 0xB2, 0xE1, 0x85, 0x97, 0xE1, 0x85, ++ 0x98, 0xE1, 0x85, 0x99, 0xE1, 0x86, 0x84, 0xE1, ++ 0x86, 0x85, 0xE1, 0x86, 0x88, 0xE1, 0x86, 0x91, ++ 0xE1, 0x86, 0x92, 0xE1, 0x86, 0x94, 0xE1, 0x86, ++ 0x9E, 0xE1, 0x86, 0xA1, 0xE4, 0xB8, 0x80, 0xE4, ++ 0xBA, 0x8C, 0xE4, 0xB8, 0x89, 0xE5, 0x9B, 0x9B, ++ 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, 0xAD, 0xE4, 0xB8, ++ 0x8B, 0xE7, 0x94, 0xB2, 0xE4, 0xB9, 0x99, 0xE4, ++ 0xB8, 0x99, 0xE4, 0xB8, 0x81, 0xE5, 0xA4, 0xA9, ++ 0xE5, 0x9C, 0xB0, 0xE4, 0xBA, 0xBA, 0x28, 0xE1, ++ 0x84, 0x80, 0x29, 0x28, 0xE1, 0x84, 0x82, 0x29, ++ 0x28, 0xE1, 0x84, 0x83, 0x29, 0x28, 0xE1, 0x84, ++ 0x85, 0x29, 0x28, 0xE1, 0x84, 0x86, 0x29, 0x28, ++ 0xE1, 0x84, 0x87, 0x29, 0x28, 0xE1, 0x84, 0x89, ++ 0x29, 0x28, 0xE1, 0x84, 0x8B, 0x29, 0x28, 0xE1, ++ 0x84, 0x8C, 0x29, 0x28, 0xE1, 0x84, 0x8E, 0x29, ++ 0x28, 0xE1, 0x84, 0x8F, 0x29, 0x28, 0xE1, 0x84, ++ 0x90, 0x29, 0x28, 0xE1, 0x84, 0x91, 0x29, 0x28, ++ 0xE1, 0x84, 0x92, 0x29, 0x28, 0xE1, 0x84, 0x80, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x82, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x83, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x85, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x86, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x87, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x89, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8B, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8C, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8E, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8F, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x90, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x91, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x92, ++ 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8C, ++ 0xE1, 0x85, 0xAE, 0x29, 0x28, 0xE1, 0x84, 0x8B, ++ 0xE1, 0x85, 0xA9, 0xE1, 0x84, 0x8C, 0xE1, 0x85, ++ 0xA5, 0xE1, 0x86, 0xAB, 0x29, 0x28, 0xE1, 0x84, ++ 0x8B, 0xE1, 0x85, 0xA9, 0xE1, 0x84, 0x92, 0xE1, ++ 0x85, 0xAE, 0x29, 0x28, 0xE4, 0xB8, 0x80, 0x29, ++ 0x28, 0xE4, 0xBA, 0x8C, 0x29, 0x28, 0xE4, 0xB8, ++ 0x89, 0x29, 0x28, 0xE5, 0x9B, 0x9B, 0x29, 0x28, ++ 0xE4, 0xBA, 0x94, 0x29, 0x28, 0xE5, 0x85, 0xAD, ++ 0x29, 0x28, 0xE4, 0xB8, 0x83, 0x29, 0x28, 0xE5, ++ 0x85, 0xAB, 0x29, 0x28, 0xE4, 0xB9, 0x9D, 0x29, ++ 0x28, 0xE5, 0x8D, 0x81, 0x29, 0x28, 0xE6, 0x9C, ++ 0x88, 0x29, 0x28, 0xE7, 0x81, 0xAB, 0x29, 0x28, ++ 0xE6, 0xB0, 0xB4, 0x29, 0x28, 0xE6, 0x9C, 0xA8, ++ 0x29, 0x28, 0xE9, 0x87, 0x91, 0x29, 0x28, 0xE5, ++ 0x9C, 0x9F, 0x29, 0x28, 0xE6, 0x97, 0xA5, 0x29, ++ 0x28, 0xE6, 0xA0, 0xAA, 0x29, 0x28, 0xE6, 0x9C, ++ 0x89, 0x29, 0x28, 0xE7, 0xA4, 0xBE, 0x29, 0x28, ++ 0xE5, 0x90, 0x8D, 0x29, 0x28, 0xE7, 0x89, 0xB9, ++ 0x29, 0x28, 0xE8, 0xB2, 0xA1, 0x29, 0x28, 0xE7, ++ 0xA5, 0x9D, 0x29, 0x28, 0xE5, 0x8A, 0xB4, 0x29, ++ 0x28, 0xE4, 0xBB, 0xA3, 0x29, 0x28, 0xE5, 0x91, ++ 0xBC, 0x29, 0x28, 0xE5, 0xAD, 0xA6, 0x29, 0x28, ++ 0xE7, 0x9B, 0xA3, 0x29, 0x28, 0xE4, 0xBC, 0x81, ++ 0x29, 0x28, 0xE8, 0xB3, 0x87, 0x29, 0x28, 0xE5, ++ 0x8D, 0x94, 0x29, 0x28, 0xE7, 0xA5, 0xAD, 0x29, ++ 0x28, 0xE4, 0xBC, 0x91, 0x29, 0x28, 0xE8, 0x87, ++ 0xAA, 0x29, 0x28, 0xE8, 0x87, 0xB3, 0x29, 0x50, ++ 0x54, 0x45, 0x32, 0x31, 0x32, 0x32, 0x32, 0x33, ++ 0x32, 0x34, 0x32, 0x35, 0x32, 0x36, 0x32, 0x37, ++ 0x32, 0x38, 0x32, 0x39, 0x33, 0x30, 0x33, 0x31, ++ 0x33, 0x32, 0x33, 0x33, 0x33, 0x34, 0x33, 0x35, ++ 0xE1, 0x84, 0x80, 0xE1, 0x84, 0x82, 0xE1, 0x84, ++ 0x83, 0xE1, 0x84, 0x85, 0xE1, 0x84, 0x86, 
0xE1, ++ 0x84, 0x87, 0xE1, 0x84, 0x89, 0xE1, 0x84, 0x8B, ++ 0xE1, 0x84, 0x8C, 0xE1, 0x84, 0x8E, 0xE1, 0x84, ++ 0x8F, 0xE1, 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, ++ 0x84, 0x92, 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xA1, ++ 0xE1, 0x84, 0x82, 0xE1, 0x85, 0xA1, 0xE1, 0x84, ++ 0x83, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x85, 0xE1, ++ 0x85, 0xA1, 0xE1, 0x84, 0x86, 0xE1, 0x85, 0xA1, ++ 0xE1, 0x84, 0x87, 0xE1, 0x85, 0xA1, 0xE1, 0x84, ++ 0x89, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x8B, 0xE1, ++ 0x85, 0xA1, 0xE1, 0x84, 0x8C, 0xE1, 0x85, 0xA1, ++ 0xE1, 0x84, 0x8E, 0xE1, 0x85, 0xA1, 0xE1, 0x84, ++ 0x8F, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x90, 0xE1, ++ 0x85, 0xA1, 0xE1, 0x84, 0x91, 0xE1, 0x85, 0xA1, ++ 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x84, ++ 0x8E, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xB7, 0xE1, ++ 0x84, 0x80, 0xE1, 0x85, 0xA9, 0xE1, 0x84, 0x8C, ++ 0xE1, 0x85, 0xAE, 0xE1, 0x84, 0x8B, 0xE1, 0x85, ++ 0xB4, 0xE1, 0x84, 0x8B, 0xE1, 0x85, 0xAE, 0xE4, ++ 0xB8, 0x80, 0xE4, 0xBA, 0x8C, 0xE4, 0xB8, 0x89, ++ 0xE5, 0x9B, 0x9B, 0xE4, 0xBA, 0x94, 0xE5, 0x85, ++ 0xAD, 0xE4, 0xB8, 0x83, 0xE5, 0x85, 0xAB, 0xE4, ++ 0xB9, 0x9D, 0xE5, 0x8D, 0x81, 0xE6, 0x9C, 0x88, ++ 0xE7, 0x81, 0xAB, 0xE6, 0xB0, 0xB4, 0xE6, 0x9C, ++ 0xA8, 0xE9, 0x87, 0x91, 0xE5, 0x9C, 0x9F, 0xE6, ++ 0x97, 0xA5, 0xE6, 0xA0, 0xAA, 0xE6, 0x9C, 0x89, ++ 0xE7, 0xA4, 0xBE, 0xE5, 0x90, 0x8D, 0xE7, 0x89, ++ 0xB9, 0xE8, 0xB2, 0xA1, 0xE7, 0xA5, 0x9D, 0xE5, ++ 0x8A, 0xB4, 0xE7, 0xA7, 0x98, 0xE7, 0x94, 0xB7, ++ 0xE5, 0xA5, 0xB3, 0xE9, 0x81, 0xA9, 0xE5, 0x84, ++ 0xAA, 0xE5, 0x8D, 0xB0, 0xE6, 0xB3, 0xA8, 0xE9, ++ 0xA0, 0x85, 0xE4, 0xBC, 0x91, 0xE5, 0x86, 0x99, ++ 0xE6, 0xAD, 0xA3, 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, ++ 0xAD, 0xE4, 0xB8, 0x8B, 0xE5, 0xB7, 0xA6, 0xE5, ++ 0x8F, 0xB3, 0xE5, 0x8C, 0xBB, 0xE5, 0xAE, 0x97, ++ 0xE5, 0xAD, 0xA6, 0xE7, 0x9B, 0xA3, 0xE4, 0xBC, ++ 0x81, 0xE8, 0xB3, 0x87, 0xE5, 0x8D, 0x94, 0xE5, ++ 0xA4, 0x9C, 0x33, 0x36, 0x33, 0x37, 0x33, 0x38, ++ 0x33, 0x39, 0x34, 0x30, 0x34, 0x31, 0x34, 0x32, ++ 0x34, 0x33, 0x34, 0x34, 0x34, 0x35, 0x34, 0x36, ++ 0x34, 0x37, 0x34, 0x38, 0x34, 0x39, 0x35, 0x30, ++ 0x31, 0xE6, 0x9C, 0x88, 0x32, 0xE6, 0x9C, 0x88, ++ 0x33, 0xE6, 0x9C, 0x88, 0x34, 0xE6, 0x9C, 0x88, ++ 0x35, 0xE6, 0x9C, 0x88, 0x36, 0xE6, 0x9C, 0x88, ++ 0x37, 0xE6, 0x9C, 0x88, 0x38, 0xE6, 0x9C, 0x88, ++ 0x39, 0xE6, 0x9C, 0x88, 0x31, 0x30, 0xE6, 0x9C, ++ 0x88, 0x31, 0x31, 0xE6, 0x9C, 0x88, 0x31, 0x32, ++ 0xE6, 0x9C, 0x88, 0x48, 0x67, 0x65, 0x72, 0x67, ++ 0x65, 0x56, 0x4C, 0x54, 0x44, 0xE3, 0x82, 0xA2, ++ 0xE3, 0x82, 0xA4, 0xE3, 0x82, 0xA6, 0xE3, 0x82, ++ 0xA8, 0xE3, 0x82, 0xAA, 0xE3, 0x82, 0xAB, 0xE3, ++ 0x82, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0xB1, ++ 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0xB5, 0xE3, 0x82, ++ 0xB7, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xBB, 0xE3, ++ 0x82, 0xBD, 0xE3, 0x82, 0xBF, 0xE3, 0x83, 0x81, ++ 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x86, 0xE3, 0x83, ++ 0x88, 0xE3, 0x83, 0x8A, 0xE3, 0x83, 0x8B, 0xE3, ++ 0x83, 0x8C, 0xE3, 0x83, 0x8D, 0xE3, 0x83, 0x8E, ++ 0xE3, 0x83, 0x8F, 0xE3, 0x83, 0x92, 0xE3, 0x83, ++ 0x95, 0xE3, 0x83, 0x98, 0xE3, 0x83, 0x9B, 0xE3, ++ 0x83, 0x9E, 0xE3, 0x83, 0x9F, 0xE3, 0x83, 0xA0, ++ 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xA2, 0xE3, 0x83, ++ 0xA4, 0xE3, 0x83, 0xA6, 0xE3, 0x83, 0xA8, 0xE3, ++ 0x83, 0xA9, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xAB, ++ 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xAD, 0xE3, 0x83, ++ 0xAF, 0xE3, 0x83, 0xB0, 0xE3, 0x83, 0xB1, 0xE3, ++ 0x83, 0xB2, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0x8F, ++ 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x83, ++ 0x88, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xAB, 0xE3, ++ 0x83, 0x95, 0xE3, 0x82, 0xA1, 0xE3, 0x82, 0xA2, ++ 0xE3, 0x83, 0xB3, 0xE3, 
0x83, 0x98, 0xE3, 0x82, ++ 0x9A, 0xE3, 0x82, 0xA2, 0xE3, 0x82, 0xA2, 0xE3, ++ 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xA4, ++ 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0xB3, 0xE3, 0x82, ++ 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xA4, 0xE3, ++ 0x83, 0xB3, 0xE3, 0x83, 0x81, 0xE3, 0x82, 0xA6, ++ 0xE3, 0x82, 0xA9, 0xE3, 0x83, 0xB3, 0xE3, 0x82, ++ 0xA8, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xAF, 0xE3, ++ 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, ++ 0xE3, 0x82, 0xA8, 0xE3, 0x83, 0xBC, 0xE3, 0x82, ++ 0xAB, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAA, 0xE3, ++ 0x83, 0xB3, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xAA, ++ 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xA0, 0xE3, 0x82, ++ 0xAB, 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xAA, 0xE3, ++ 0x82, 0xAB, 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0x83, ++ 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xAB, 0xE3, 0x83, ++ 0xAD, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xBC, 0xE3, ++ 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAD, ++ 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xAB, 0xE3, 0x82, ++ 0x99, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x9E, 0xE3, ++ 0x82, 0xAD, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xAB, ++ 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xAD, 0xE3, 0x82, ++ 0x99, 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0xBC, 0xE3, ++ 0x82, 0xAD, 0xE3, 0x83, 0xA5, 0xE3, 0x83, 0xAA, ++ 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAD, 0xE3, 0x82, ++ 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xBF, 0xE3, ++ 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAD, ++ 0xE3, 0x83, 0xAD, 0xE3, 0x82, 0xAD, 0xE3, 0x83, ++ 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, 0xE3, ++ 0x83, 0xA9, 0xE3, 0x83, 0xA0, 0xE3, 0x82, 0xAD, ++ 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xA1, 0xE3, 0x83, ++ 0xBC, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, 0xE3, ++ 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xAF, ++ 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, 0x82, ++ 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA9, 0xE3, ++ 0x83, 0xA0, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, ++ 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xA0, 0xE3, 0x83, ++ 0x88, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xAF, 0xE3, ++ 0x83, 0xAB, 0xE3, 0x82, 0xBB, 0xE3, 0x82, 0x99, ++ 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xAD, 0xE3, 0x82, ++ 0xAF, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xBC, 0xE3, ++ 0x83, 0x8D, 0xE3, 0x82, 0xB1, 0xE3, 0x83, 0xBC, ++ 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xB3, 0xE3, 0x83, ++ 0xAB, 0xE3, 0x83, 0x8A, 0xE3, 0x82, 0xB3, 0xE3, ++ 0x83, 0xBC, 0xE3, 0x83, 0x9B, 0xE3, 0x82, 0x9A, ++ 0xE3, 0x82, 0xB5, 0xE3, 0x82, 0xA4, 0xE3, 0x82, ++ 0xAF, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xB5, 0xE3, ++ 0x83, 0xB3, 0xE3, 0x83, 0x81, 0xE3, 0x83, 0xBC, ++ 0xE3, 0x83, 0xA0, 0xE3, 0x82, 0xB7, 0xE3, 0x83, ++ 0xAA, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xAF, 0xE3, ++ 0x82, 0x99, 0xE3, 0x82, 0xBB, 0xE3, 0x83, 0xB3, ++ 0xE3, 0x83, 0x81, 0xE3, 0x82, 0xBB, 0xE3, 0x83, ++ 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xBF, 0xE3, ++ 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xB9, ++ 0xE3, 0x83, 0x86, 0xE3, 0x82, 0x99, 0xE3, 0x82, ++ 0xB7, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, ++ 0x83, 0xAB, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xB3, ++ 0xE3, 0x83, 0x8A, 0xE3, 0x83, 0x8E, 0xE3, 0x83, ++ 0x8E, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, ++ 0x83, 0x8F, 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0x84, ++ 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x9A, 0xE3, 0x83, ++ 0xBC, 0xE3, 0x82, 0xBB, 0xE3, 0x83, 0xB3, 0xE3, ++ 0x83, 0x88, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x9A, ++ 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x84, 0xE3, 0x83, ++ 0x8F, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, ++ 0x83, 0xAC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, ++ 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xA2, 0xE3, 0x82, ++ 0xB9, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, 0xE3, ++ 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xAF, ++ 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, ++ 0x9A, 
0xE3, 0x82, 0xB3, 0xE3, 0x83, 0x92, 0xE3, ++ 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x95, ++ 0xE3, 0x82, 0xA1, 0xE3, 0x83, 0xA9, 0xE3, 0x83, ++ 0x83, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, ++ 0x83, 0x95, 0xE3, 0x82, 0xA3, 0xE3, 0x83, 0xBC, ++ 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x95, 0xE3, 0x82, ++ 0x99, 0xE3, 0x83, 0x83, 0xE3, 0x82, 0xB7, 0xE3, ++ 0x82, 0xA7, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x95, ++ 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xB3, 0xE3, 0x83, ++ 0x98, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0xBF, 0xE3, ++ 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x98, ++ 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xBD, 0xE3, 0x83, ++ 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0x8B, 0xE3, ++ 0x83, 0x92, 0xE3, 0x83, 0x98, 0xE3, 0x83, 0xAB, ++ 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x98, 0xE3, 0x82, ++ 0x9A, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xB9, 0xE3, ++ 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, ++ 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0x99, 0xE3, 0x83, ++ 0x98, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, ++ 0x82, 0xBF, 0xE3, 0x83, 0x9B, 0xE3, 0x82, 0x9A, ++ 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xB3, 0xE3, 0x83, ++ 0x88, 0xE3, 0x83, 0x9B, 0xE3, 0x82, 0x99, 0xE3, ++ 0x83, 0xAB, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x9B, ++ 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x9B, 0xE3, 0x82, ++ 0x9A, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, 0xE3, ++ 0x82, 0x99, 0xE3, 0x83, 0x9B, 0xE3, 0x83, 0xBC, ++ 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x9B, 0xE3, 0x83, ++ 0xBC, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x9E, 0xE3, ++ 0x82, 0xA4, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAD, ++ 0xE3, 0x83, 0x9E, 0xE3, 0x82, 0xA4, 0xE3, 0x83, ++ 0xAB, 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0x83, 0xE3, ++ 0x83, 0x8F, 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0xAB, ++ 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0x9E, 0xE3, 0x83, ++ 0xB3, 0xE3, 0x82, 0xB7, 0xE3, 0x83, 0xA7, 0xE3, ++ 0x83, 0xB3, 0xE3, 0x83, 0x9F, 0xE3, 0x82, 0xAF, ++ 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xB3, 0xE3, 0x83, ++ 0x9F, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0x9F, 0xE3, ++ 0x83, 0xAA, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x99, ++ 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, ++ 0xA1, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, ++ 0x83, 0xA1, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0x99, ++ 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xB3, 0xE3, 0x83, ++ 0xA1, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, ++ 0x83, 0xAB, 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xBC, ++ 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, 0x83, ++ 0xA4, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, ++ 0x83, 0xA6, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xB3, ++ 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0x83, 0xE3, 0x83, ++ 0x88, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xAA, 0xE3, ++ 0x83, 0xA9, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, ++ 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x83, ++ 0xAB, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x95, 0xE3, ++ 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xAC, ++ 0xE3, 0x83, 0xA0, 0xE3, 0x83, 0xAC, 0xE3, 0x83, ++ 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xB1, 0xE3, ++ 0x82, 0x99, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0xAF, ++ 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0x30, 0xE7, ++ 0x82, 0xB9, 0x31, 0xE7, 0x82, 0xB9, 0x32, 0xE7, ++ 0x82, 0xB9, 0x33, 0xE7, 0x82, 0xB9, 0x34, 0xE7, ++ 0x82, 0xB9, 0x35, 0xE7, 0x82, 0xB9, 0x36, 0xE7, ++ 0x82, 0xB9, 0x37, 0xE7, 0x82, 0xB9, 0x38, 0xE7, ++ 0x82, 0xB9, 0x39, 0xE7, 0x82, 0xB9, 0x31, 0x30, ++ 0xE7, 0x82, 0xB9, 0x31, 0x31, 0xE7, 0x82, 0xB9, ++ 0x31, 0x32, 0xE7, 0x82, 0xB9, 0x31, 0x33, 0xE7, ++ 0x82, 0xB9, 0x31, 0x34, 0xE7, 0x82, 0xB9, 0x31, ++ 0x35, 0xE7, 0x82, 0xB9, 0x31, 0x36, 0xE7, 0x82, ++ 0xB9, 0x31, 0x37, 0xE7, 0x82, 0xB9, 0x31, 0x38, ++ 0xE7, 0x82, 0xB9, 0x31, 0x39, 0xE7, 0x82, 0xB9, ++ 0x32, 0x30, 0xE7, 0x82, 0xB9, 0x32, 0x31, 0xE7, ++ 0x82, 0xB9, 0x32, 0x32, 0xE7, 0x82, 0xB9, 
0x32, ++ 0x33, 0xE7, 0x82, 0xB9, 0x32, 0x34, 0xE7, 0x82, ++ 0xB9, 0x68, 0x50, 0x61, 0x64, 0x61, 0x41, 0x55, ++ 0x62, 0x61, 0x72, 0x6F, 0x56, 0x70, 0x63, 0x64, ++ 0x6D, 0x64, 0x6D, 0x32, 0x64, 0x6D, 0x33, 0x49, ++ 0x55, 0xE5, 0xB9, 0xB3, 0xE6, 0x88, 0x90, 0xE6, ++ 0x98, 0xAD, 0xE5, 0x92, 0x8C, 0xE5, 0xA4, 0xA7, ++ 0xE6, 0xAD, 0xA3, 0xE6, 0x98, 0x8E, 0xE6, 0xB2, ++ 0xBB, 0xE6, 0xA0, 0xAA, 0xE5, 0xBC, 0x8F, 0xE4, ++ 0xBC, 0x9A, 0xE7, 0xA4, 0xBE, 0x70, 0x41, 0x6E, ++ 0x41, 0xCE, 0xBC, 0x41, 0x6D, 0x41, 0x6B, 0x41, ++ 0x4B, 0x42, 0x4D, 0x42, 0x47, 0x42, 0x63, 0x61, ++ 0x6C, 0x6B, 0x63, 0x61, 0x6C, 0x70, 0x46, 0x6E, ++ 0x46, 0xCE, 0xBC, 0x46, 0xCE, 0xBC, 0x67, 0x6D, ++ 0x67, 0x6B, 0x67, 0x48, 0x7A, 0x6B, 0x48, 0x7A, ++ 0x4D, 0x48, 0x7A, 0x47, 0x48, 0x7A, 0x54, 0x48, ++ 0x7A, 0xCE, 0xBC, 0x6C, 0x6D, 0x6C, 0x64, 0x6C, ++ 0x6B, 0x6C, 0x66, 0x6D, 0x6E, 0x6D, 0xCE, 0xBC, ++ 0x6D, 0x6D, 0x6D, 0x63, 0x6D, 0x6B, 0x6D, 0x6D, ++ 0x6D, 0x32, 0x63, 0x6D, 0x32, 0x6D, 0x32, 0x6B, ++ 0x6D, 0x32, 0x6D, 0x6D, 0x33, 0x63, 0x6D, 0x33, ++ 0x6D, 0x33, 0x6B, 0x6D, 0x33, 0x6D, 0xE2, 0x88, ++ 0x95, 0x73, 0x6D, 0xE2, 0x88, 0x95, 0x73, 0x32, ++ 0x50, 0x61, 0x6B, 0x50, 0x61, 0x4D, 0x50, 0x61, ++ 0x47, 0x50, 0x61, 0x72, 0x61, 0x64, 0x72, 0x61, ++ 0x64, 0xE2, 0x88, 0x95, 0x73, 0x72, 0x61, 0x64, ++ 0xE2, 0x88, 0x95, 0x73, 0x32, 0x70, 0x73, 0x6E, ++ 0x73, 0xCE, 0xBC, 0x73, 0x6D, 0x73, 0x70, 0x56, ++ 0x6E, 0x56, 0xCE, 0xBC, 0x56, 0x6D, 0x56, 0x6B, ++ 0x56, 0x4D, 0x56, 0x70, 0x57, 0x6E, 0x57, 0xCE, ++ 0xBC, 0x57, 0x6D, 0x57, 0x6B, 0x57, 0x4D, 0x57, ++ 0x6B, 0xCE, 0xA9, 0x4D, 0xCE, 0xA9, 0x61, 0x2E, ++ 0x6D, 0x2E, 0x42, 0x71, 0x63, 0x63, 0x63, 0x64, ++ 0x43, 0xE2, 0x88, 0x95, 0x6B, 0x67, 0x43, 0x6F, ++ 0x2E, 0x64, 0x42, 0x47, 0x79, 0x68, 0x61, 0x48, ++ 0x50, 0x69, 0x6E, 0x4B, 0x4B, 0x4B, 0x4D, 0x6B, ++ 0x74, 0x6C, 0x6D, 0x6C, 0x6E, 0x6C, 0x6F, 0x67, ++ 0x6C, 0x78, 0x6D, 0x62, 0x6D, 0x69, 0x6C, 0x6D, ++ 0x6F, 0x6C, 0x50, 0x48, 0x70, 0x2E, 0x6D, 0x2E, ++ 0x50, 0x50, 0x4D, 0x50, 0x52, 0x73, 0x72, 0x53, ++ 0x76, 0x57, 0x62, 0x56, 0xE2, 0x88, 0x95, 0x6D, ++ 0x41, 0xE2, 0x88, 0x95, 0x6D, 0x31, 0xE6, 0x97, ++ 0xA5, 0x32, 0xE6, 0x97, 0xA5, 0x33, 0xE6, 0x97, ++ 0xA5, 0x34, 0xE6, 0x97, 0xA5, 0x35, 0xE6, 0x97, ++ 0xA5, 0x36, 0xE6, 0x97, 0xA5, 0x37, 0xE6, 0x97, ++ 0xA5, 0x38, 0xE6, 0x97, 0xA5, 0x39, 0xE6, 0x97, ++ 0xA5, 0x31, 0x30, 0xE6, 0x97, 0xA5, 0x31, 0x31, ++ 0xE6, 0x97, 0xA5, 0x31, 0x32, 0xE6, 0x97, 0xA5, ++ 0x31, 0x33, 0xE6, 0x97, 0xA5, 0x31, 0x34, 0xE6, ++ 0x97, 0xA5, 0x31, 0x35, 0xE6, 0x97, 0xA5, 0x31, ++ 0x36, 0xE6, 0x97, 0xA5, 0x31, 0x37, 0xE6, 0x97, ++ 0xA5, 0x31, 0x38, 0xE6, 0x97, 0xA5, 0x31, 0x39, ++ 0xE6, 0x97, 0xA5, 0x32, 0x30, 0xE6, 0x97, 0xA5, ++ 0x32, 0x31, 0xE6, 0x97, 0xA5, 0x32, 0x32, 0xE6, ++ 0x97, 0xA5, 0x32, 0x33, 0xE6, 0x97, 0xA5, 0x32, ++ 0x34, 0xE6, 0x97, 0xA5, 0x32, 0x35, 0xE6, 0x97, ++ 0xA5, 0x32, 0x36, 0xE6, 0x97, 0xA5, 0x32, 0x37, ++ 0xE6, 0x97, 0xA5, 0x32, 0x38, 0xE6, 0x97, 0xA5, ++ 0x32, 0x39, 0xE6, 0x97, 0xA5, 0x33, 0x30, 0xE6, ++ 0x97, 0xA5, 0x33, 0x31, 0xE6, 0x97, 0xA5, 0x67, ++ 0x61, 0x6C, 0xF6, 0xE8, 0xB1, 0x88, 0xF6, 0xE6, ++ 0x9B, 0xB4, 0xF6, 0xE8, 0xBB, 0x8A, 0xF6, 0xE8, ++ 0xB3, 0x88, 0xF6, 0xE6, 0xBB, 0x91, 0xF6, 0xE4, ++ 0xB8, 0xB2, 0xF6, 0xE5, 0x8F, 0xA5, 0xF6, 0xE9, ++ 0xBE, 0x9C, 0xF6, 0xE9, 0xBE, 0x9C, 0xF6, 0xE5, ++ 0xA5, 0x91, 0xF6, 0xE9, 0x87, 0x91, 0xF6, 0xE5, ++ 0x96, 0x87, 0xF6, 0xE5, 0xA5, 0x88, 0xF6, 0xE6, ++ 0x87, 0xB6, 0xF6, 0xE7, 0x99, 0xA9, 0xF6, 0xE7, ++ 0xBE, 0x85, 0xF6, 0xE8, 0x98, 0xBF, 0xF6, 0xE8, ++ 0x9E, 0xBA, 0xF6, 0xE8, 0xA3, 0xB8, 0xF6, 0xE9, ++ 0x82, 0x8F, 0xF6, 0xE6, 
0xA8, 0x82, 0xF6, 0xE6, ++ 0xB4, 0x9B, 0xF6, 0xE7, 0x83, 0x99, 0xF6, 0xE7, ++ 0x8F, 0x9E, 0xF6, 0xE8, 0x90, 0xBD, 0xF6, 0xE9, ++ 0x85, 0xAA, 0xF6, 0xE9, 0xA7, 0xB1, 0xF6, 0xE4, ++ 0xBA, 0x82, 0xF6, 0xE5, 0x8D, 0xB5, 0xF6, 0xE6, ++ 0xAC, 0x84, 0xF6, 0xE7, 0x88, 0x9B, 0xF6, 0xE8, ++ 0x98, 0xAD, 0xF6, 0xE9, 0xB8, 0x9E, 0xF6, 0xE5, ++ 0xB5, 0x90, 0xF6, 0xE6, 0xBF, 0xAB, 0xF6, 0xE8, ++ 0x97, 0x8D, 0xF6, 0xE8, 0xA5, 0xA4, 0xF6, 0xE6, ++ 0x8B, 0x89, 0xF6, 0xE8, 0x87, 0x98, 0xF6, 0xE8, ++ 0xA0, 0x9F, 0xF6, 0xE5, 0xBB, 0x8A, 0xF6, 0xE6, ++ 0x9C, 0x97, 0xF6, 0xE6, 0xB5, 0xAA, 0xF6, 0xE7, ++ 0x8B, 0xBC, 0xF6, 0xE9, 0x83, 0x8E, 0xF6, 0xE4, ++ 0xBE, 0x86, 0xF6, 0xE5, 0x86, 0xB7, 0xF6, 0xE5, ++ 0x8B, 0x9E, 0xF6, 0xE6, 0x93, 0x84, 0xF6, 0xE6, ++ 0xAB, 0x93, 0xF6, 0xE7, 0x88, 0x90, 0xF6, 0xE7, ++ 0x9B, 0xA7, 0xF6, 0xE8, 0x80, 0x81, 0xF6, 0xE8, ++ 0x98, 0x86, 0xF6, 0xE8, 0x99, 0x9C, 0xF6, 0xE8, ++ 0xB7, 0xAF, 0xF6, 0xE9, 0x9C, 0xB2, 0xF6, 0xE9, ++ 0xAD, 0xAF, 0xF6, 0xE9, 0xB7, 0xBA, 0xF6, 0xE7, ++ 0xA2, 0x8C, 0xF6, 0xE7, 0xA5, 0xBF, 0xF6, 0xE7, ++ 0xB6, 0xA0, 0xF6, 0xE8, 0x8F, 0x89, 0xF6, 0xE9, ++ 0x8C, 0x84, 0xF6, 0xE9, 0xB9, 0xBF, 0xF6, 0xE8, ++ 0xAB, 0x96, 0xF6, 0xE5, 0xA3, 0x9F, 0xF6, 0xE5, ++ 0xBC, 0x84, 0xF6, 0xE7, 0xB1, 0xA0, 0xF6, 0xE8, ++ 0x81, 0xBE, 0xF6, 0xE7, 0x89, 0xA2, 0xF6, 0xE7, ++ 0xA3, 0x8A, 0xF6, 0xE8, 0xB3, 0x82, 0xF6, 0xE9, ++ 0x9B, 0xB7, 0xF6, 0xE5, 0xA3, 0x98, 0xF6, 0xE5, ++ 0xB1, 0xA2, 0xF6, 0xE6, 0xA8, 0x93, 0xF6, 0xE6, ++ 0xB7, 0x9A, 0xF6, 0xE6, 0xBC, 0x8F, 0xF6, 0xE7, ++ 0xB4, 0xAF, 0xF6, 0xE7, 0xB8, 0xB7, 0xF6, 0xE9, ++ 0x99, 0x8B, 0xF6, 0xE5, 0x8B, 0x92, 0xF6, 0xE8, ++ 0x82, 0x8B, 0xF6, 0xE5, 0x87, 0x9C, 0xF6, 0xE5, ++ 0x87, 0x8C, 0xF6, 0xE7, 0xA8, 0x9C, 0xF6, 0xE7, ++ 0xB6, 0xBE, 0xF6, 0xE8, 0x8F, 0xB1, 0xF6, 0xE9, ++ 0x99, 0xB5, 0xF6, 0xE8, 0xAE, 0x80, 0xF6, 0xE6, ++ 0x8B, 0x8F, 0xF6, 0xE6, 0xA8, 0x82, 0xF6, 0xE8, ++ 0xAB, 0xBE, 0xF6, 0xE4, 0xB8, 0xB9, 0xF6, 0xE5, ++ 0xAF, 0xA7, 0xF6, 0xE6, 0x80, 0x92, 0xF6, 0xE7, ++ 0x8E, 0x87, 0xF6, 0xE7, 0x95, 0xB0, 0xF6, 0xE5, ++ 0x8C, 0x97, 0xF6, 0xE7, 0xA3, 0xBB, 0xF6, 0xE4, ++ 0xBE, 0xBF, 0xF6, 0xE5, 0xBE, 0xA9, 0xF6, 0xE4, ++ 0xB8, 0x8D, 0xF6, 0xE6, 0xB3, 0x8C, 0xF6, 0xE6, ++ 0x95, 0xB8, 0xF6, 0xE7, 0xB4, 0xA2, 0xF6, 0xE5, ++ 0x8F, 0x83, 0xF6, 0xE5, 0xA1, 0x9E, 0xF6, 0xE7, ++ 0x9C, 0x81, 0xF6, 0xE8, 0x91, 0x89, 0xF6, 0xE8, ++ 0xAA, 0xAA, 0xF6, 0xE6, 0xAE, 0xBA, 0xF6, 0xE8, ++ 0xBE, 0xB0, 0xF6, 0xE6, 0xB2, 0x88, 0xF6, 0xE6, ++ 0x8B, 0xBE, 0xF6, 0xE8, 0x8B, 0xA5, 0xF6, 0xE6, ++ 0x8E, 0xA0, 0xF6, 0xE7, 0x95, 0xA5, 0xF6, 0xE4, ++ 0xBA, 0xAE, 0xF6, 0xE5, 0x85, 0xA9, 0xF6, 0xE5, ++ 0x87, 0x89, 0xF6, 0xE6, 0xA2, 0x81, 0xF6, 0xE7, ++ 0xB3, 0xA7, 0xF6, 0xE8, 0x89, 0xAF, 0xF6, 0xE8, ++ 0xAB, 0x92, 0xF6, 0xE9, 0x87, 0x8F, 0xF6, 0xE5, ++ 0x8B, 0xB5, 0xF6, 0xE5, 0x91, 0x82, 0xF6, 0xE5, ++ 0xA5, 0xB3, 0xF6, 0xE5, 0xBB, 0xAC, 0xF6, 0xE6, ++ 0x97, 0x85, 0xF6, 0xE6, 0xBF, 0xBE, 0xF6, 0xE7, ++ 0xA4, 0xAA, 0xF6, 0xE9, 0x96, 0xAD, 0xF6, 0xE9, ++ 0xA9, 0xAA, 0xF6, 0xE9, 0xBA, 0x97, 0xF6, 0xE9, ++ 0xBB, 0x8E, 0xF6, 0xE5, 0x8A, 0x9B, 0xF6, 0xE6, ++ 0x9B, 0x86, 0xF6, 0xE6, 0xAD, 0xB7, 0xF6, 0xE8, ++ 0xBD, 0xA2, 0xF6, 0xE5, 0xB9, 0xB4, 0xF6, 0xE6, ++ 0x86, 0x90, 0xF6, 0xE6, 0x88, 0x80, 0xF6, 0xE6, ++ 0x92, 0x9A, 0xF6, 0xE6, 0xBC, 0xA3, 0xF6, 0xE7, ++ 0x85, 0x89, 0xF6, 0xE7, 0x92, 0x89, 0xF6, 0xE7, ++ 0xA7, 0x8A, 0xF6, 0xE7, 0xB7, 0xB4, 0xF6, 0xE8, ++ 0x81, 0xAF, 0xF6, 0xE8, 0xBC, 0xA6, 0xF6, 0xE8, ++ 0x93, 0xAE, 0xF6, 0xE9, 0x80, 0xA3, 0xF6, 0xE9, ++ 0x8D, 0x8A, 0xF6, 0xE5, 0x88, 0x97, 0xF6, 0xE5, ++ 0x8A, 0xA3, 0xF6, 0xE5, 0x92, 0xBD, 0xF6, 0xE7, ++ 0x83, 
0x88, 0xF6, 0xE8, 0xA3, 0x82, 0xF6, 0xE8, ++ 0xAA, 0xAA, 0xF6, 0xE5, 0xBB, 0x89, 0xF6, 0xE5, ++ 0xBF, 0xB5, 0xF6, 0xE6, 0x8D, 0xBB, 0xF6, 0xE6, ++ 0xAE, 0xAE, 0xF6, 0xE7, 0xB0, 0xBE, 0xF6, 0xE7, ++ 0x8D, 0xB5, 0xF6, 0xE4, 0xBB, 0xA4, 0xF6, 0xE5, ++ 0x9B, 0xB9, 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, ++ 0xB6, 0xBA, 0xF6, 0xE6, 0x80, 0x9C, 0xF6, 0xE7, ++ 0x8E, 0xB2, 0xF6, 0xE7, 0x91, 0xA9, 0xF6, 0xE7, ++ 0xBE, 0x9A, 0xF6, 0xE8, 0x81, 0x86, 0xF6, 0xE9, ++ 0x88, 0xB4, 0xF6, 0xE9, 0x9B, 0xB6, 0xF6, 0xE9, ++ 0x9D, 0x88, 0xF6, 0xE9, 0xA0, 0x98, 0xF6, 0xE4, ++ 0xBE, 0x8B, 0xF6, 0xE7, 0xA6, 0xAE, 0xF6, 0xE9, ++ 0x86, 0xB4, 0xF6, 0xE9, 0x9A, 0xB8, 0xF6, 0xE6, ++ 0x83, 0xA1, 0xF6, 0xE4, 0xBA, 0x86, 0xF6, 0xE5, ++ 0x83, 0x9A, 0xF6, 0xE5, 0xAF, 0xAE, 0xF6, 0xE5, ++ 0xB0, 0xBF, 0xF6, 0xE6, 0x96, 0x99, 0xF6, 0xE6, ++ 0xA8, 0x82, 0xF6, 0xE7, 0x87, 0x8E, 0xF6, 0xE7, ++ 0x99, 0x82, 0xF6, 0xE8, 0x93, 0xBC, 0xF6, 0xE9, ++ 0x81, 0xBC, 0xF6, 0xE9, 0xBE, 0x8D, 0xF6, 0xE6, ++ 0x9A, 0x88, 0xF6, 0xE9, 0x98, 0xAE, 0xF6, 0xE5, ++ 0x8A, 0x89, 0xF6, 0xE6, 0x9D, 0xBB, 0xF6, 0xE6, ++ 0x9F, 0xB3, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, ++ 0xBA, 0x9C, 0xF6, 0xE7, 0x90, 0x89, 0xF6, 0xE7, ++ 0x95, 0x99, 0xF6, 0xE7, 0xA1, 0xAB, 0xF6, 0xE7, ++ 0xB4, 0x90, 0xF6, 0xE9, 0xA1, 0x9E, 0xF6, 0xE5, ++ 0x85, 0xAD, 0xF6, 0xE6, 0x88, 0xAE, 0xF6, 0xE9, ++ 0x99, 0xB8, 0xF6, 0xE5, 0x80, 0xAB, 0xF6, 0xE5, ++ 0xB4, 0x99, 0xF6, 0xE6, 0xB7, 0xAA, 0xF6, 0xE8, ++ 0xBC, 0xAA, 0xF6, 0xE5, 0xBE, 0x8B, 0xF6, 0xE6, ++ 0x85, 0x84, 0xF6, 0xE6, 0xA0, 0x97, 0xF6, 0xE7, ++ 0x8E, 0x87, 0xF6, 0xE9, 0x9A, 0x86, 0xF6, 0xE5, ++ 0x88, 0xA9, 0xF6, 0xE5, 0x90, 0x8F, 0xF6, 0xE5, ++ 0xB1, 0xA5, 0xF6, 0xE6, 0x98, 0x93, 0xF6, 0xE6, ++ 0x9D, 0x8E, 0xF6, 0xE6, 0xA2, 0xA8, 0xF6, 0xE6, ++ 0xB3, 0xA5, 0xF6, 0xE7, 0x90, 0x86, 0xF6, 0xE7, ++ 0x97, 0xA2, 0xF6, 0xE7, 0xBD, 0xB9, 0xF6, 0xE8, ++ 0xA3, 0x8F, 0xF6, 0xE8, 0xA3, 0xA1, 0xF6, 0xE9, ++ 0x87, 0x8C, 0xF6, 0xE9, 0x9B, 0xA2, 0xF6, 0xE5, ++ 0x8C, 0xBF, 0xF6, 0xE6, 0xBA, 0xBA, 0xF6, 0xE5, ++ 0x90, 0x9D, 0xF6, 0xE7, 0x87, 0x90, 0xF6, 0xE7, ++ 0x92, 0x98, 0xF6, 0xE8, 0x97, 0xBA, 0xF6, 0xE9, ++ 0x9A, 0xA3, 0xF6, 0xE9, 0xB1, 0x97, 0xF6, 0xE9, ++ 0xBA, 0x9F, 0xF6, 0xE6, 0x9E, 0x97, 0xF6, 0xE6, ++ 0xB7, 0x8B, 0xF6, 0xE8, 0x87, 0xA8, 0xF6, 0xE7, ++ 0xAB, 0x8B, 0xF6, 0xE7, 0xAC, 0xA0, 0xF6, 0xE7, ++ 0xB2, 0x92, 0xF6, 0xE7, 0x8B, 0x80, 0xF6, 0xE7, ++ 0x82, 0x99, 0xF6, 0xE8, 0xAD, 0x98, 0xF6, 0xE4, ++ 0xBB, 0x80, 0xF6, 0xE8, 0x8C, 0xB6, 0xF6, 0xE5, ++ 0x88, 0xBA, 0xF6, 0xE5, 0x88, 0x87, 0xF6, 0xE5, ++ 0xBA, 0xA6, 0xF6, 0xE6, 0x8B, 0x93, 0xF6, 0xE7, ++ 0xB3, 0x96, 0xF6, 0xE5, 0xAE, 0x85, 0xF6, 0xE6, ++ 0xB4, 0x9E, 0xF6, 0xE6, 0x9A, 0xB4, 0xF6, 0xE8, ++ 0xBC, 0xBB, 0xF6, 0xE8, 0xA1, 0x8C, 0xF6, 0xE9, ++ 0x99, 0x8D, 0xF6, 0xE8, 0xA6, 0x8B, 0xF6, 0xE5, ++ 0xBB, 0x93, 0xF6, 0xE5, 0x85, 0x80, 0xF6, 0xE5, ++ 0x97, 0x80, 0xF6, 0xE5, 0xA1, 0x9A, 0xF6, 0xE6, ++ 0x99, 0xB4, 0xF6, 0xE5, 0x87, 0x9E, 0xF6, 0xE7, ++ 0x8C, 0xAA, 0xF6, 0xE7, 0x9B, 0x8A, 0xF6, 0xE7, ++ 0xA4, 0xBC, 0xF6, 0xE7, 0xA5, 0x9E, 0xF6, 0xE7, ++ 0xA5, 0xA5, 0xF6, 0xE7, 0xA6, 0x8F, 0xF6, 0xE9, ++ 0x9D, 0x96, 0xF6, 0xE7, 0xB2, 0xBE, 0xF6, 0xE7, ++ 0xBE, 0xBD, 0xF6, 0xE8, 0x98, 0x92, 0xF6, 0xE8, ++ 0xAB, 0xB8, 0xF6, 0xE9, 0x80, 0xB8, 0xF6, 0xE9, ++ 0x83, 0xBD, 0xF6, 0xE9, 0xA3, 0xAF, 0xF6, 0xE9, ++ 0xA3, 0xBC, 0xF6, 0xE9, 0xA4, 0xA8, 0xF6, 0xE9, ++ 0xB6, 0xB4, 0xF6, 0xE4, 0xBE, 0xAE, 0xF6, 0xE5, ++ 0x83, 0xA7, 0xF6, 0xE5, 0x85, 0x8D, 0xF6, 0xE5, ++ 0x8B, 0x89, 0xF6, 0xE5, 0x8B, 0xA4, 0xF6, 0xE5, ++ 0x8D, 0x91, 0xF6, 0xE5, 0x96, 0x9D, 0xF6, 0xE5, ++ 0x98, 0x86, 0xF6, 0xE5, 0x99, 0xA8, 0xF6, 
0xE5, ++ 0xA1, 0x80, 0xF6, 0xE5, 0xA2, 0xA8, 0xF6, 0xE5, ++ 0xB1, 0xA4, 0xF6, 0xE5, 0xB1, 0xAE, 0xF6, 0xE6, ++ 0x82, 0x94, 0xF6, 0xE6, 0x85, 0xA8, 0xF6, 0xE6, ++ 0x86, 0x8E, 0xF6, 0xE6, 0x87, 0xB2, 0xF6, 0xE6, ++ 0x95, 0x8F, 0xF6, 0xE6, 0x97, 0xA2, 0xF6, 0xE6, ++ 0x9A, 0x91, 0xF6, 0xE6, 0xA2, 0x85, 0xF6, 0xE6, ++ 0xB5, 0xB7, 0xF6, 0xE6, 0xB8, 0x9A, 0xF6, 0xE6, ++ 0xBC, 0xA2, 0xF6, 0xE7, 0x85, 0xAE, 0xF6, 0xE7, ++ 0x88, 0xAB, 0xF6, 0xE7, 0x90, 0xA2, 0xF6, 0xE7, ++ 0xA2, 0x91, 0xF6, 0xE7, 0xA4, 0xBE, 0xF6, 0xE7, ++ 0xA5, 0x89, 0xF6, 0xE7, 0xA5, 0x88, 0xF6, 0xE7, ++ 0xA5, 0x90, 0xF6, 0xE7, 0xA5, 0x96, 0xF6, 0xE7, ++ 0xA5, 0x9D, 0xF6, 0xE7, 0xA6, 0x8D, 0xF6, 0xE7, ++ 0xA6, 0x8E, 0xF6, 0xE7, 0xA9, 0x80, 0xF6, 0xE7, ++ 0xAA, 0x81, 0xF6, 0xE7, 0xAF, 0x80, 0xF6, 0xE7, ++ 0xB7, 0xB4, 0xF6, 0xE7, 0xB8, 0x89, 0xF6, 0xE7, ++ 0xB9, 0x81, 0xF6, 0xE7, 0xBD, 0xB2, 0xF6, 0xE8, ++ 0x80, 0x85, 0xF6, 0xE8, 0x87, 0xAD, 0xF6, 0xE8, ++ 0x89, 0xB9, 0xF6, 0xE8, 0x89, 0xB9, 0xF6, 0xE8, ++ 0x91, 0x97, 0xF6, 0xE8, 0xA4, 0x90, 0xF6, 0xE8, ++ 0xA6, 0x96, 0xF6, 0xE8, 0xAC, 0x81, 0xF6, 0xE8, ++ 0xAC, 0xB9, 0xF6, 0xE8, 0xB3, 0x93, 0xF6, 0xE8, ++ 0xB4, 0x88, 0xF6, 0xE8, 0xBE, 0xB6, 0xF6, 0xE9, ++ 0x80, 0xB8, 0xF6, 0xE9, 0x9B, 0xA3, 0xF6, 0xE9, ++ 0x9F, 0xBF, 0xF6, 0xE9, 0xA0, 0xBB, 0xF6, 0xE4, ++ 0xB8, 0xA6, 0xF6, 0xE5, 0x86, 0xB5, 0xF6, 0xE5, ++ 0x85, 0xA8, 0xF6, 0xE4, 0xBE, 0x80, 0xF6, 0xE5, ++ 0x85, 0x85, 0xF6, 0xE5, 0x86, 0x80, 0xF6, 0xE5, ++ 0x8B, 0x87, 0xF6, 0xE5, 0x8B, 0xBA, 0xF6, 0xE5, ++ 0x96, 0x9D, 0xF6, 0xE5, 0x95, 0x95, 0xF6, 0xE5, ++ 0x96, 0x99, 0xF6, 0xE5, 0x97, 0xA2, 0xF6, 0xE5, ++ 0xA1, 0x9A, 0xF6, 0xE5, 0xA2, 0xB3, 0xF6, 0xE5, ++ 0xA5, 0x84, 0xF6, 0xE5, 0xA5, 0x94, 0xF6, 0xE5, ++ 0xA9, 0xA2, 0xF6, 0xE5, 0xAC, 0xA8, 0xF6, 0xE5, ++ 0xBB, 0x92, 0xF6, 0xE5, 0xBB, 0x99, 0xF6, 0xE5, ++ 0xBD, 0xA9, 0xF6, 0xE5, 0xBE, 0xAD, 0xF6, 0xE6, ++ 0x83, 0x98, 0xF6, 0xE6, 0x85, 0x8E, 0xF6, 0xE6, ++ 0x84, 0x88, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, 0xE6, ++ 0x85, 0xA0, 0xF6, 0xE6, 0x87, 0xB2, 0xF6, 0xE6, ++ 0x88, 0xB4, 0xF6, 0xE6, 0x8F, 0x84, 0xF6, 0xE6, ++ 0x90, 0x9C, 0xF6, 0xE6, 0x91, 0x92, 0xF6, 0xE6, ++ 0x95, 0x96, 0xF6, 0xE6, 0x99, 0xB4, 0xF6, 0xE6, ++ 0x9C, 0x97, 0xF6, 0xE6, 0x9C, 0x9B, 0xF6, 0xE6, ++ 0x9D, 0x96, 0xF6, 0xE6, 0xAD, 0xB9, 0xF6, 0xE6, ++ 0xAE, 0xBA, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, ++ 0xBB, 0x9B, 0xF6, 0xE6, 0xBB, 0x8B, 0xF6, 0xE6, ++ 0xBC, 0xA2, 0xF6, 0xE7, 0x80, 0x9E, 0xF6, 0xE7, ++ 0x85, 0xAE, 0xF6, 0xE7, 0x9E, 0xA7, 0xF6, 0xE7, ++ 0x88, 0xB5, 0xF6, 0xE7, 0x8A, 0xAF, 0xF6, 0xE7, ++ 0x8C, 0xAA, 0xF6, 0xE7, 0x91, 0xB1, 0xF6, 0xE7, ++ 0x94, 0x86, 0xF6, 0xE7, 0x94, 0xBB, 0xF6, 0xE7, ++ 0x98, 0x9D, 0xF6, 0xE7, 0x98, 0x9F, 0xF6, 0xE7, ++ 0x9B, 0x8A, 0xF6, 0xE7, 0x9B, 0x9B, 0xF6, 0xE7, ++ 0x9B, 0xB4, 0xF6, 0xE7, 0x9D, 0x8A, 0xF6, 0xE7, ++ 0x9D, 0x80, 0xF6, 0xE7, 0xA3, 0x8C, 0xF6, 0xE7, ++ 0xAA, 0xB1, 0xF6, 0xE7, 0xAF, 0x80, 0xF6, 0xE7, ++ 0xB1, 0xBB, 0xF6, 0xE7, 0xB5, 0x9B, 0xF6, 0xE7, ++ 0xB7, 0xB4, 0xF6, 0xE7, 0xBC, 0xBE, 0xF6, 0xE8, ++ 0x80, 0x85, 0xF6, 0xE8, 0x8D, 0x92, 0xF6, 0xE8, ++ 0x8F, 0xAF, 0xF6, 0xE8, 0x9D, 0xB9, 0xF6, 0xE8, ++ 0xA5, 0x81, 0xF6, 0xE8, 0xA6, 0x86, 0xF6, 0xE8, ++ 0xA6, 0x96, 0xF6, 0xE8, 0xAA, 0xBF, 0xF6, 0xE8, ++ 0xAB, 0xB8, 0xF6, 0xE8, 0xAB, 0x8B, 0xF6, 0xE8, ++ 0xAC, 0x81, 0xF6, 0xE8, 0xAB, 0xBE, 0xF6, 0xE8, ++ 0xAB, 0xAD, 0xF6, 0xE8, 0xAC, 0xB9, 0xF6, 0xE8, ++ 0xAE, 0x8A, 0xF6, 0xE8, 0xB4, 0x88, 0xF6, 0xE8, ++ 0xBC, 0xB8, 0xF6, 0xE9, 0x81, 0xB2, 0xF6, 0xE9, ++ 0x86, 0x99, 0xF6, 0xE9, 0x89, 0xB6, 0xF6, 0xE9, ++ 0x99, 0xBC, 0xF6, 0xE9, 0x9B, 0xA3, 0xF6, 0xE9, ++ 0x9D, 0x96, 0xF6, 0xE9, 
0x9F, 0x9B, 0xF6, 0xE9, ++ 0x9F, 0xBF, 0xF6, 0xE9, 0xA0, 0x8B, 0xF6, 0xE9, ++ 0xA0, 0xBB, 0xF6, 0xE9, 0xAC, 0x92, 0xF6, 0xE9, ++ 0xBE, 0x9C, 0xF6, 0xF0, 0xA2, 0xA1, 0x8A, 0xF6, ++ 0xF0, 0xA2, 0xA1, 0x84, 0xF6, 0xF0, 0xA3, 0x8F, ++ 0x95, 0xF6, 0xE3, 0xAE, 0x9D, 0xF6, 0xE4, 0x80, ++ 0x98, 0xF6, 0xE4, 0x80, 0xB9, 0xF6, 0xF0, 0xA5, ++ 0x89, 0x89, 0xF6, 0xF0, 0xA5, 0xB3, 0x90, 0xF6, ++ 0xF0, 0xA7, 0xBB, 0x93, 0xF6, 0xE9, 0xBD, 0x83, ++ 0xF6, 0xE9, 0xBE, 0x8E, 0x66, 0x66, 0x66, 0x69, ++ 0x66, 0x6C, 0x66, 0x66, 0x69, 0x66, 0x66, 0x6C, ++ 0x73, 0x74, 0x73, 0x74, 0xD5, 0xB4, 0xD5, 0xB6, ++ 0xD5, 0xB4, 0xD5, 0xA5, 0xD5, 0xB4, 0xD5, 0xAB, ++ 0xD5, 0xBE, 0xD5, 0xB6, 0xD5, 0xB4, 0xD5, 0xAD, ++ 0xF6, 0xD7, 0x99, 0xD6, 0xB4, 0xF6, 0xD7, 0xB2, ++ 0xD6, 0xB7, 0xD7, 0xA2, 0xD7, 0x90, 0xD7, 0x93, ++ 0xD7, 0x94, 0xD7, 0x9B, 0xD7, 0x9C, 0xD7, 0x9D, ++ 0xD7, 0xA8, 0xD7, 0xAA, 0x2B, 0xF6, 0xD7, 0xA9, ++ 0xD7, 0x81, 0xF6, 0xD7, 0xA9, 0xD7, 0x82, 0xF6, ++ 0xD7, 0xA9, 0xD6, 0xBC, 0xD7, 0x81, 0xF6, 0xD7, ++ 0xA9, 0xD6, 0xBC, 0xD7, 0x82, 0xF6, 0xD7, 0x90, ++ 0xD6, 0xB7, 0xF6, 0xD7, 0x90, 0xD6, 0xB8, 0xF6, ++ 0xD7, 0x90, 0xD6, 0xBC, 0xF6, 0xD7, 0x91, 0xD6, ++ 0xBC, 0xF6, 0xD7, 0x92, 0xD6, 0xBC, 0xF6, 0xD7, ++ 0x93, 0xD6, 0xBC, 0xF6, 0xD7, 0x94, 0xD6, 0xBC, ++ 0xF6, 0xD7, 0x95, 0xD6, 0xBC, 0xF6, 0xD7, 0x96, ++ 0xD6, 0xBC, 0xF6, 0xD7, 0x98, 0xD6, 0xBC, 0xF6, ++ 0xD7, 0x99, 0xD6, 0xBC, 0xF6, 0xD7, 0x9A, 0xD6, ++ 0xBC, 0xF6, 0xD7, 0x9B, 0xD6, 0xBC, 0xF6, 0xD7, ++ 0x9C, 0xD6, 0xBC, 0xF6, 0xD7, 0x9E, 0xD6, 0xBC, ++ 0xF6, 0xD7, 0xA0, 0xD6, 0xBC, 0xF6, 0xD7, 0xA1, ++ 0xD6, 0xBC, 0xF6, 0xD7, 0xA3, 0xD6, 0xBC, 0xF6, ++ 0xD7, 0xA4, 0xD6, 0xBC, 0xF6, 0xD7, 0xA6, 0xD6, ++ 0xBC, 0xF6, 0xD7, 0xA7, 0xD6, 0xBC, 0xF6, 0xD7, ++ 0xA8, 0xD6, 0xBC, 0xF6, 0xD7, 0xA9, 0xD6, 0xBC, ++ 0xF6, 0xD7, 0xAA, 0xD6, 0xBC, 0xF6, 0xD7, 0x95, ++ 0xD6, 0xB9, 0xF6, 0xD7, 0x91, 0xD6, 0xBF, 0xF6, ++ 0xD7, 0x9B, 0xD6, 0xBF, 0xF6, 0xD7, 0xA4, 0xD6, ++ 0xBF, 0xD7, 0x90, 0xD7, 0x9C, 0xD9, 0xB1, 0xD9, ++ 0xB1, 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, ++ 0xBB, 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, ++ 0xBE, 0xDA, 0x80, 0xDA, 0x80, 0xDA, 0x80, 0xDA, ++ 0x80, 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, ++ 0xBA, 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, ++ 0xBF, 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, ++ 0xB9, 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, ++ 0xA4, 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, ++ 0xA6, 0xDA, 0x84, 0xDA, 0x84, 0xDA, 0x84, 0xDA, ++ 0x84, 0xDA, 0x83, 0xDA, 0x83, 0xDA, 0x83, 0xDA, ++ 0x83, 0xDA, 0x86, 0xDA, 0x86, 0xDA, 0x86, 0xDA, ++ 0x86, 0xDA, 0x87, 0xDA, 0x87, 0xDA, 0x87, 0xDA, ++ 0x87, 0xDA, 0x8D, 0xDA, 0x8D, 0xDA, 0x8C, 0xDA, ++ 0x8C, 0xDA, 0x8E, 0xDA, 0x8E, 0xDA, 0x88, 0xDA, ++ 0x88, 0xDA, 0x98, 0xDA, 0x98, 0xDA, 0x91, 0xDA, ++ 0x91, 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, ++ 0xA9, 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, ++ 0xAF, 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, ++ 0xB3, 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, ++ 0xB1, 0xDA, 0xBA, 0xDA, 0xBA, 0xDA, 0xBB, 0xDA, ++ 0xBB, 0xDA, 0xBB, 0xDA, 0xBB, 0xDB, 0x95, 0xD9, ++ 0x94, 0xDB, 0x95, 0xD9, 0x94, 0xDB, 0x81, 0xDB, ++ 0x81, 0xDB, 0x81, 0xDB, 0x81, 0xDA, 0xBE, 0xDA, ++ 0xBE, 0xDA, 0xBE, 0xDA, 0xBE, 0xDB, 0x92, 0xDB, ++ 0x92, 0xDB, 0x92, 0xD9, 0x94, 0xDB, 0x92, 0xD9, ++ 0x94, 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, ++ 0xAD, 0xDB, 0x87, 0xDB, 0x87, 0xDB, 0x86, 0xDB, ++ 0x86, 0xDB, 0x88, 0xDB, 0x88, 0xDB, 0x87, 0xD9, ++ 0xB4, 0xDB, 0x8B, 0xDB, 0x8B, 0xDB, 0x85, 0xDB, ++ 0x85, 0xDB, 0x89, 0xDB, 0x89, 0xDB, 0x90, 0xDB, ++ 0x90, 0xDB, 0x90, 0xDB, 0x90, 0xD9, 0x89, 0xD9, ++ 0x89, 
0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, ++ 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x8A, 0xD9, ++ 0x94, 0xDB, 0x95, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, ++ 0x95, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x88, 0xD9, ++ 0x8A, 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x8A, 0xD9, ++ 0x94, 0xDB, 0x87, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, ++ 0x87, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x86, 0xD9, ++ 0x8A, 0xD9, 0x94, 0xDB, 0x86, 0xD9, 0x8A, 0xD9, ++ 0x94, 0xDB, 0x88, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, ++ 0x88, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x90, 0xD9, ++ 0x8A, 0xD9, 0x94, 0xDB, 0x90, 0xD9, 0x8A, 0xD9, ++ 0x94, 0xDB, 0x90, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, ++ 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xD9, ++ 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xDB, 0x8C, 0xDB, ++ 0x8C, 0xDB, 0x8C, 0xDB, 0x8C, 0xD9, 0x8A, 0xD9, ++ 0x94, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, ++ 0xAD, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, ++ 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xD9, 0x8A, 0xD9, ++ 0x94, 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, ++ 0xA8, 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, ++ 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x89, 0xD8, ++ 0xA8, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD8, ++ 0xAA, 0xD8, 0xAD, 0xD8, 0xAA, 0xD8, 0xAE, 0xD8, ++ 0xAA, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x89, 0xD8, ++ 0xAA, 0xD9, 0x8A, 0xD8, 0xAB, 0xD8, 0xAC, 0xD8, ++ 0xAB, 0xD9, 0x85, 0xD8, 0xAB, 0xD9, 0x89, 0xD8, ++ 0xAB, 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, ++ 0xAC, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, ++ 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD8, ++ 0xAE, 0xD8, 0xAD, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, ++ 0xB3, 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, ++ 0xB3, 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, ++ 0xB5, 0xD8, 0xAD, 0xD8, 0xB5, 0xD9, 0x85, 0xD8, ++ 0xB6, 0xD8, 0xAC, 0xD8, 0xB6, 0xD8, 0xAD, 0xD8, ++ 0xB6, 0xD8, 0xAE, 0xD8, 0xB6, 0xD9, 0x85, 0xD8, ++ 0xB7, 0xD8, 0xAD, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, ++ 0xB8, 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, ++ 0xB9, 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, ++ 0xBA, 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, ++ 0x81, 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, ++ 0x81, 0xD9, 0x85, 0xD9, 0x81, 0xD9, 0x89, 0xD9, ++ 0x81, 0xD9, 0x8A, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, ++ 0x82, 0xD9, 0x85, 0xD9, 0x82, 0xD9, 0x89, 0xD9, ++ 0x82, 0xD9, 0x8A, 0xD9, 0x83, 0xD8, 0xA7, 0xD9, ++ 0x83, 0xD8, 0xAC, 0xD9, 0x83, 0xD8, 0xAD, 0xD9, ++ 0x83, 0xD8, 0xAE, 0xD9, 0x83, 0xD9, 0x84, 0xD9, ++ 0x83, 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x89, 0xD9, ++ 0x83, 0xD9, 0x8A, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, ++ 0x84, 0xD8, 0xAD, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, ++ 0x84, 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x89, 0xD9, ++ 0x84, 0xD9, 0x8A, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, ++ 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, ++ 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x89, 0xD9, ++ 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, ++ 0x86, 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, ++ 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x89, 0xD9, ++ 0x86, 0xD9, 0x8A, 0xD9, 0x87, 0xD8, 0xAC, 0xD9, ++ 0x87, 0xD9, 0x85, 0xD9, 0x87, 0xD9, 0x89, 0xD9, ++ 0x87, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, ++ 0x8A, 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, ++ 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x89, 0xD9, ++ 0x8A, 0xD9, 0x8A, 0xD8, 0xB0, 0xD9, 0xB0, 0xD8, ++ 0xB1, 0xD9, 0xB0, 0xD9, 0x89, 0xD9, 0xB0, 0x20, ++ 0xD9, 0x8C, 0xD9, 0x91, 0x20, 0xD9, 0x8D, 0xD9, ++ 0x91, 0x20, 0xD9, 0x8E, 0xD9, 0x91, 0x20, 0xD9, ++ 0x8F, 0xD9, 0x91, 0x20, 0xD9, 0x90, 0xD9, 0x91, ++ 0x20, 0xD9, 0x91, 0xD9, 0xB0, 0xD9, 0x8A, 0xD9, ++ 0x94, 0xD8, 0xB1, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, ++ 0xB2, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, ++ 0x8A, 0xD9, 0x94, 0xD9, 0x86, 0xD9, 0x8A, 
0xD9, ++ 0x94, 0xD9, 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, ++ 0x8A, 0xD8, 0xA8, 0xD8, 0xB1, 0xD8, 0xA8, 0xD8, ++ 0xB2, 0xD8, 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, ++ 0x86, 0xD8, 0xA8, 0xD9, 0x89, 0xD8, 0xA8, 0xD9, ++ 0x8A, 0xD8, 0xAA, 0xD8, 0xB1, 0xD8, 0xAA, 0xD8, ++ 0xB2, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, ++ 0x86, 0xD8, 0xAA, 0xD9, 0x89, 0xD8, 0xAA, 0xD9, ++ 0x8A, 0xD8, 0xAB, 0xD8, 0xB1, 0xD8, 0xAB, 0xD8, ++ 0xB2, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAB, 0xD9, ++ 0x86, 0xD8, 0xAB, 0xD9, 0x89, 0xD8, 0xAB, 0xD9, ++ 0x8A, 0xD9, 0x81, 0xD9, 0x89, 0xD9, 0x81, 0xD9, ++ 0x8A, 0xD9, 0x82, 0xD9, 0x89, 0xD9, 0x82, 0xD9, ++ 0x8A, 0xD9, 0x83, 0xD8, 0xA7, 0xD9, 0x83, 0xD9, ++ 0x84, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x83, 0xD9, ++ 0x89, 0xD9, 0x83, 0xD9, 0x8A, 0xD9, 0x84, 0xD9, ++ 0x85, 0xD9, 0x84, 0xD9, 0x89, 0xD9, 0x84, 0xD9, ++ 0x8A, 0xD9, 0x85, 0xD8, 0xA7, 0xD9, 0x85, 0xD9, ++ 0x85, 0xD9, 0x86, 0xD8, 0xB1, 0xD9, 0x86, 0xD8, ++ 0xB2, 0xD9, 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, ++ 0x86, 0xD9, 0x86, 0xD9, 0x89, 0xD9, 0x86, 0xD9, ++ 0x8A, 0xD9, 0x89, 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, ++ 0xB1, 0xD9, 0x8A, 0xD8, 0xB2, 0xD9, 0x8A, 0xD9, ++ 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x8A, 0xD9, ++ 0x89, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, ++ 0x94, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, ++ 0xAD, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAE, 0xD9, ++ 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, ++ 0x94, 0xD9, 0x87, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, ++ 0xA8, 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, ++ 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x87, 0xD8, ++ 0xAA, 0xD8, 0xAC, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, ++ 0xAA, 0xD8, 0xAE, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, ++ 0xAA, 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, ++ 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, ++ 0xAD, 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, ++ 0xAE, 0xD8, 0xAC, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, ++ 0xB3, 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, ++ 0xB3, 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, ++ 0xB5, 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAE, 0xD8, ++ 0xB5, 0xD9, 0x85, 0xD8, 0xB6, 0xD8, 0xAC, 0xD8, ++ 0xB6, 0xD8, 0xAD, 0xD8, 0xB6, 0xD8, 0xAE, 0xD8, ++ 0xB6, 0xD9, 0x85, 0xD8, 0xB7, 0xD8, 0xAD, 0xD8, ++ 0xB8, 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, ++ 0xB9, 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, ++ 0xBA, 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, ++ 0x81, 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, ++ 0x81, 0xD9, 0x85, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, ++ 0x82, 0xD9, 0x85, 0xD9, 0x83, 0xD8, 0xAC, 0xD9, ++ 0x83, 0xD8, 0xAD, 0xD9, 0x83, 0xD8, 0xAE, 0xD9, ++ 0x83, 0xD9, 0x84, 0xD9, 0x83, 0xD9, 0x85, 0xD9, ++ 0x84, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, ++ 0x84, 0xD8, 0xAE, 0xD9, 0x84, 0xD9, 0x85, 0xD9, ++ 0x84, 0xD9, 0x87, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, ++ 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, ++ 0x85, 0xD9, 0x85, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, ++ 0x86, 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, ++ 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9, ++ 0x87, 0xD8, 0xAC, 0xD9, 0x87, 0xD9, 0x85, 0xD9, ++ 0x87, 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, ++ 0x8A, 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, ++ 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, ++ 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, ++ 0x94, 0xD9, 0x87, 0xD8, 0xA8, 0xD9, 0x85, 0xD8, ++ 0xA8, 0xD9, 0x87, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, ++ 0xAA, 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, ++ 0xAB, 0xD9, 0x87, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, ++ 0xB3, 0xD9, 0x87, 0xD8, 0xB4, 0xD9, 0x85, 0xD8, ++ 0xB4, 0xD9, 0x87, 0xD9, 0x83, 0xD9, 0x84, 0xD9, ++ 0x83, 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x85, 0xD9, ++ 0x86, 0xD9, 0x85, 0xD9, 
0x86, 0xD9, 0x87, 0xD9, ++ 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, ++ 0x80, 0xD9, 0x8E, 0xD9, 0x91, 0xD9, 0x80, 0xD9, ++ 0x8F, 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x90, 0xD9, ++ 0x91, 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, ++ 0x8A, 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, ++ 0x8A, 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, ++ 0x8A, 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, ++ 0x8A, 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, ++ 0x8A, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, ++ 0x8A, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, ++ 0x8A, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, ++ 0x8A, 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, ++ 0x8A, 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, ++ 0x8A, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, ++ 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, ++ 0x85, 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, ++ 0xB1, 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, ++ 0xB1, 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, ++ 0x8A, 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, ++ 0x8A, 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, ++ 0x8A, 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, ++ 0x8A, 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, ++ 0x8A, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, ++ 0x8A, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, ++ 0x8A, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, ++ 0x8A, 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, ++ 0x8A, 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, ++ 0x8A, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, ++ 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, ++ 0x85, 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, ++ 0xB1, 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, ++ 0xB1, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, ++ 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, ++ 0x85, 0xD8, 0xB3, 0xD9, 0x87, 0xD8, 0xB4, 0xD9, ++ 0x87, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, ++ 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, 0xD8, ++ 0xAE, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, ++ 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB7, 0xD9, ++ 0x85, 0xD8, 0xB8, 0xD9, 0x85, 0xD8, 0xA7, 0xD9, ++ 0x8B, 0xD8, 0xA7, 0xD9, 0x8B, 0xD8, 0xAA, 0xD8, ++ 0xAC, 0xD9, 0x85, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, ++ 0xAC, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, ++ 0xAA, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAA, 0xD8, ++ 0xAE, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, ++ 0xAC, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, ++ 0xAA, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, ++ 0x85, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, ++ 0xAD, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, ++ 0xAD, 0xD9, 0x85, 0xD9, 0x89, 0xD8, 0xB3, 0xD8, ++ 0xAD, 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAC, 0xD8, ++ 0xAD, 0xD8, 0xB3, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, ++ 0xB3, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, ++ 0x85, 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, ++ 0xAC, 0xD8, 0xB3, 0xD9, 0x85, 0xD9, 0x85, 0xD8, ++ 0xB3, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB5, 0xD8, ++ 0xAD, 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAD, 0xD8, ++ 0xAD, 0xD8, 0xB5, 0xD9, 0x85, 0xD9, 0x85, 0xD8, ++ 0xB4, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB4, 0xD8, ++ 0xAD, 0xD9, 0x85, 0xD8, 0xB4, 0xD8, 0xAC, 0xD9, ++ 0x8A, 0xD8, 0xB4, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, ++ 0xB4, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, ++ 0x85, 0xD9, 0x85, 0xD8, 0xB4, 0xD9, 0x85, 0xD9, ++ 0x85, 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, ++ 0xB6, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB6, 0xD8, ++ 0xAE, 0xD9, 0x85, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, ++ 0xAD, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, ++ 0xB7, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB7, 0xD9, ++ 0x85, 0xD9, 0x8A, 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, ++ 0x85, 0xD8, 0xB9, 0xD9, 0x85, 0xD9, 0x85, 0xD8, ++ 0xB9, 
0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, ++ 0x85, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x85, 0xD9, ++ 0x85, 0xD8, 0xBA, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, ++ 0xBA, 0xD9, 0x85, 0xD9, 0x89, 0xD9, 0x81, 0xD8, ++ 0xAE, 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, ++ 0x85, 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, ++ 0x82, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x84, 0xD8, ++ 0xAD, 0xD9, 0x85, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, ++ 0x8A, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x89, 0xD9, ++ 0x84, 0xD8, 0xAC, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, ++ 0xAC, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, ++ 0x85, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x85, 0xD9, ++ 0x84, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x84, 0xD9, ++ 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, ++ 0xAC, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, ++ 0x85, 0xD8, 0xAD, 0xD9, 0x8A, 0xD9, 0x85, 0xD8, ++ 0xAC, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, ++ 0x85, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, ++ 0x85, 0xD8, 0xAE, 0xD9, 0x85, 0xD9, 0x85, 0xD8, ++ 0xAC, 0xD8, 0xAE, 0xD9, 0x87, 0xD9, 0x85, 0xD8, ++ 0xAC, 0xD9, 0x87, 0xD9, 0x85, 0xD9, 0x85, 0xD9, ++ 0x86, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x86, 0xD8, ++ 0xAD, 0xD9, 0x89, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, ++ 0x85, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, ++ 0x86, 0xD8, 0xAC, 0xD9, 0x89, 0xD9, 0x86, 0xD9, ++ 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x85, 0xD9, ++ 0x89, 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x85, 0xD9, ++ 0x8A, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xA8, 0xD8, ++ 0xAE, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, ++ 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, ++ 0xAA, 0xD8, 0xAE, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, ++ 0xAE, 0xD9, 0x89, 0xD8, 0xAA, 0xD9, 0x85, 0xD9, ++ 0x8A, 0xD8, 0xAA, 0xD9, 0x85, 0xD9, 0x89, 0xD8, ++ 0xAC, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, ++ 0xAD, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, ++ 0x89, 0xD8, 0xB3, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, ++ 0xB5, 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xB4, 0xD8, ++ 0xAD, 0xD9, 0x8A, 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, ++ 0x8A, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, ++ 0x84, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, ++ 0xAD, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, ++ 0x8A, 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, ++ 0x85, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x82, 0xD9, ++ 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAD, 0xD9, ++ 0x8A, 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, ++ 0x84, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, ++ 0x85, 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, ++ 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, ++ 0x85, 0xD8, 0xAE, 0xD9, 0x8A, 0xD9, 0x84, 0xD8, ++ 0xAC, 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x85, 0xD9, ++ 0x85, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, ++ 0x86, 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, ++ 0xAD, 0xD9, 0x8A, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, ++ 0x8A, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, ++ 0x81, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, ++ 0xAD, 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, ++ 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, ++ 0xB5, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, ++ 0xAE, 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, ++ 0x8A, 0xD8, 0xB5, 0xD9, 0x84, 0xDB, 0x92, 0xD9, ++ 0x82, 0xD9, 0x84, 0xDB, 0x92, 0xD8, 0xA7, 0xD9, ++ 0x84, 0xD9, 0x84, 0xD9, 0x87, 0xD8, 0xA7, 0xD9, ++ 0x83, 0xD8, 0xA8, 0xD8, 0xB1, 0xD9, 0x85, 0xD8, ++ 0xAD, 0xD9, 0x85, 0xD8, 0xAF, 0xD8, 0xB5, 0xD9, ++ 0x84, 0xD8, 0xB9, 0xD9, 0x85, 0xD8, 0xB1, 0xD8, ++ 0xB3, 0xD9, 0x88, 0xD9, 0x84, 0xD8, 0xB9, 0xD9, ++ 0x84, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x88, 0xD8, ++ 0xB3, 0xD9, 0x84, 0xD9, 0x85, 0xD8, 0xB5, 0xD9, ++ 0x84, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x84, 0xD9, ++ 0x89, 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9, 
0x84, ++ 0xD9, 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9, ++ 0x8A, 0xD9, 0x87, 0x20, 0xD9, 0x88, 0xD8, 0xB3, ++ 0xD9, 0x84, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x84, ++ 0x20, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, ++ 0x84, 0xD9, 0x87, 0xD8, 0xB1, 0xDB, 0x8C, 0xD8, ++ 0xA7, 0xD9, 0x84, 0x2C, 0xE3, 0x80, 0x81, 0xE3, ++ 0x80, 0x82, 0x3A, 0x3B, 0x21, 0x3F, 0xE3, 0x80, ++ 0x96, 0xE3, 0x80, 0x97, 0x2E, 0x2E, 0x2E, 0x2E, ++ 0x2E, 0xE2, 0x80, 0x94, 0xE2, 0x80, 0x93, 0x5F, ++ 0x5F, 0x28, 0x29, 0x7B, 0x7D, 0xE3, 0x80, 0x94, ++ 0xE3, 0x80, 0x95, 0xE3, 0x80, 0x90, 0xE3, 0x80, ++ 0x91, 0xE3, 0x80, 0x8A, 0xE3, 0x80, 0x8B, 0xE3, ++ 0x80, 0x88, 0xE3, 0x80, 0x89, 0xE3, 0x80, 0x8C, ++ 0xE3, 0x80, 0x8D, 0xE3, 0x80, 0x8E, 0xE3, 0x80, ++ 0x8F, 0x5B, 0x5D, 0x20, 0xCC, 0x85, 0x20, 0xCC, ++ 0x85, 0x20, 0xCC, 0x85, 0x20, 0xCC, 0x85, 0x5F, ++ 0x5F, 0x5F, 0x2C, 0xE3, 0x80, 0x81, 0x2E, 0x3B, ++ 0x3A, 0x3F, 0x21, 0xE2, 0x80, 0x94, 0x28, 0x29, ++ 0x7B, 0x7D, 0xE3, 0x80, 0x94, 0xE3, 0x80, 0x95, ++ 0x23, 0x26, 0x2A, 0x2B, 0x2D, 0x3C, 0x3E, 0x3D, ++ 0x5C, 0x24, 0x25, 0x40, 0x20, 0xD9, 0x8B, 0xD9, ++ 0x80, 0xD9, 0x8B, 0x20, 0xD9, 0x8C, 0x20, 0xD9, ++ 0x8D, 0x20, 0xD9, 0x8E, 0xD9, 0x80, 0xD9, 0x8E, ++ 0x20, 0xD9, 0x8F, 0xD9, 0x80, 0xD9, 0x8F, 0x20, ++ 0xD9, 0x90, 0xD9, 0x80, 0xD9, 0x90, 0x20, 0xD9, ++ 0x91, 0xD9, 0x80, 0xD9, 0x91, 0x20, 0xD9, 0x92, ++ 0xD9, 0x80, 0xD9, 0x92, 0xD8, 0xA1, 0xD8, 0xA7, ++ 0xD9, 0x93, 0xD8, 0xA7, 0xD9, 0x93, 0xD8, 0xA7, ++ 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, 0x88, ++ 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x94, 0xD8, 0xA7, ++ 0xD9, 0x95, 0xD8, 0xA7, 0xD9, 0x95, 0xD9, 0x8A, ++ 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x8A, ++ 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xA7, ++ 0xD8, 0xA7, 0xD8, 0xA8, 0xD8, 0xA8, 0xD8, 0xA8, ++ 0xD8, 0xA8, 0xD8, 0xA9, 0xD8, 0xA9, 0xD8, 0xAA, ++ 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, 0xAB, ++ 0xD8, 0xAB, 0xD8, 0xAB, 0xD8, 0xAB, 0xD8, 0xAC, ++ 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, 0xAD, ++ 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, 0xAE, ++ 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, 0xAF, ++ 0xD8, 0xAF, 0xD8, 0xB0, 0xD8, 0xB0, 0xD8, 0xB1, ++ 0xD8, 0xB1, 0xD8, 0xB2, 0xD8, 0xB2, 0xD8, 0xB3, ++ 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, 0xB4, ++ 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, 0xB5, ++ 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, 0xB6, ++ 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, 0xB7, ++ 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, 0xB8, ++ 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, 0xB9, ++ 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, 0xBA, ++ 0xD8, 0xBA, 0xD8, 0xBA, 0xD8, 0xBA, 0xD9, 0x81, ++ 0xD9, 0x81, 0xD9, 0x81, 0xD9, 0x81, 0xD9, 0x82, ++ 0xD9, 0x82, 0xD9, 0x82, 0xD9, 0x82, 0xD9, 0x83, ++ 0xD9, 0x83, 0xD9, 0x83, 0xD9, 0x83, 0xD9, 0x84, ++ 0xD9, 0x84, 0xD9, 0x84, 0xD9, 0x84, 0xD9, 0x85, ++ 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x86, ++ 0xD9, 0x86, 0xD9, 0x86, 0xD9, 0x86, 0xD9, 0x87, ++ 0xD9, 0x87, 0xD9, 0x87, 0xD9, 0x87, 0xD9, 0x88, ++ 0xD9, 0x88, 0xD9, 0x89, 0xD9, 0x89, 0xD9, 0x8A, ++ 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x84, ++ 0xD8, 0xA7, 0xD9, 0x93, 0xD9, 0x84, 0xD8, 0xA7, ++ 0xD9, 0x93, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x94, ++ 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, 0x84, ++ 0xD8, 0xA7, 0xD9, 0x95, 0xD9, 0x84, 0xD8, 0xA7, ++ 0xD9, 0x95, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x84, ++ 0xD8, 0xA7, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, ++ 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, ++ 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, ++ 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, ++ 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, ++ 0x47, 0x48, 0x49, 0x4A, 
0x4B, 0x4C, 0x4D, 0x4E, ++ 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, ++ 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, ++ 0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, ++ 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, ++ 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ++ 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, ++ 0xE2, 0xA6, 0x85, 0xE2, 0xA6, 0x86, 0xE3, 0x80, ++ 0x82, 0xE3, 0x80, 0x8C, 0xE3, 0x80, 0x8D, 0xE3, ++ 0x80, 0x81, 0xE3, 0x83, 0xBB, 0xE3, 0x83, 0xB2, ++ 0xE3, 0x82, 0xA1, 0xE3, 0x82, 0xA3, 0xE3, 0x82, ++ 0xA5, 0xE3, 0x82, 0xA7, 0xE3, 0x82, 0xA9, 0xE3, ++ 0x83, 0xA3, 0xE3, 0x83, 0xA5, 0xE3, 0x83, 0xA7, ++ 0xE3, 0x83, 0x83, 0xE3, 0x83, 0xBC, 0xE3, 0x82, ++ 0xA2, 0xE3, 0x82, 0xA4, 0xE3, 0x82, 0xA6, 0xE3, ++ 0x82, 0xA8, 0xE3, 0x82, 0xAA, 0xE3, 0x82, 0xAB, ++ 0xE3, 0x82, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x82, ++ 0xB1, 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0xB5, 0xE3, ++ 0x82, 0xB7, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xBB, ++ 0xE3, 0x82, 0xBD, 0xE3, 0x82, 0xBF, 0xE3, 0x83, ++ 0x81, 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x86, 0xE3, ++ 0x83, 0x88, 0xE3, 0x83, 0x8A, 0xE3, 0x83, 0x8B, ++ 0xE3, 0x83, 0x8C, 0xE3, 0x83, 0x8D, 0xE3, 0x83, ++ 0x8E, 0xE3, 0x83, 0x8F, 0xE3, 0x83, 0x92, 0xE3, ++ 0x83, 0x95, 0xE3, 0x83, 0x98, 0xE3, 0x83, 0x9B, ++ 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0x9F, 0xE3, 0x83, ++ 0xA0, 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xA2, 0xE3, ++ 0x83, 0xA4, 0xE3, 0x83, 0xA6, 0xE3, 0x83, 0xA8, ++ 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xAA, 0xE3, 0x83, ++ 0xAB, 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xAD, 0xE3, ++ 0x83, 0xAF, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0x99, ++ 0xE3, 0x82, 0x9A, 0xE1, 0x85, 0xA0, 0xE1, 0x84, ++ 0x80, 0xE1, 0x84, 0x81, 0xE1, 0x86, 0xAA, 0xE1, ++ 0x84, 0x82, 0xE1, 0x86, 0xAC, 0xE1, 0x86, 0xAD, ++ 0xE1, 0x84, 0x83, 0xE1, 0x84, 0x84, 0xE1, 0x84, ++ 0x85, 0xE1, 0x86, 0xB0, 0xE1, 0x86, 0xB1, 0xE1, ++ 0x86, 0xB2, 0xE1, 0x86, 0xB3, 0xE1, 0x86, 0xB4, ++ 0xE1, 0x86, 0xB5, 0xE1, 0x84, 0x9A, 0xE1, 0x84, ++ 0x86, 0xE1, 0x84, 0x87, 0xE1, 0x84, 0x88, 0xE1, ++ 0x84, 0xA1, 0xE1, 0x84, 0x89, 0xE1, 0x84, 0x8A, ++ 0xE1, 0x84, 0x8B, 0xE1, 0x84, 0x8C, 0xE1, 0x84, ++ 0x8D, 0xE1, 0x84, 0x8E, 0xE1, 0x84, 0x8F, 0xE1, ++ 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, 0x84, 0x92, ++ 0xE1, 0x85, 0xA1, 0xE1, 0x85, 0xA2, 0xE1, 0x85, ++ 0xA3, 0xE1, 0x85, 0xA4, 0xE1, 0x85, 0xA5, 0xE1, ++ 0x85, 0xA6, 0xE1, 0x85, 0xA7, 0xE1, 0x85, 0xA8, ++ 0xE1, 0x85, 0xA9, 0xE1, 0x85, 0xAA, 0xE1, 0x85, ++ 0xAB, 0xE1, 0x85, 0xAC, 0xE1, 0x85, 0xAD, 0xE1, ++ 0x85, 0xAE, 0xE1, 0x85, 0xAF, 0xE1, 0x85, 0xB0, ++ 0xE1, 0x85, 0xB1, 0xE1, 0x85, 0xB2, 0xE1, 0x85, ++ 0xB3, 0xE1, 0x85, 0xB4, 0xE1, 0x85, 0xB5, 0xC2, ++ 0xA2, 0xC2, 0xA3, 0xC2, 0xAC, 0x20, 0xCC, 0x84, ++ 0xC2, 0xA6, 0xC2, 0xA5, 0xE2, 0x82, 0xA9, 0xE2, ++ 0x94, 0x82, 0xE2, 0x86, 0x90, 0xE2, 0x86, 0x91, ++ 0xE2, 0x86, 0x92, 0xE2, 0x86, 0x93, 0xE2, 0x96, ++ 0xA0, 0xE2, 0x97, 0x8B, 0xF6, 0xF0, 0x9D, 0x85, ++ 0x97, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, 0xF0, 0x9D, ++ 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, 0xF0, ++ 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, ++ 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x85, 0x98, ++ 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAF, ++ 0xF6, 0xF0, 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, ++ 0xA5, 0xF0, 0x9D, 0x85, 0xB0, 0xF6, 0xF0, 0x9D, ++ 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, ++ 0x85, 0xB1, 0xF6, 0xF0, 0x9D, 0x85, 0x98, 0xF0, ++ 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xB2, 0xF6, ++ 0xF0, 0x9D, 0x86, 0xB9, 0xF0, 0x9D, 0x85, 0xA5, ++ 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85, ++ 0xA5, 0xF6, 0xF0, 0x9D, 0x86, 0xB9, 0xF0, 0x9D, ++ 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAE, 0xF6, 0xF0, ++ 0x9D, 
0x86, 0xBA, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, ++ 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x86, 0xB9, ++ 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAF, ++ 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85, ++ 0xA5, 0xF0, 0x9D, 0x85, 0xAF, 0x41, 0x42, 0x43, ++ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, ++ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, ++ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, ++ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ++ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, ++ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, ++ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, ++ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, ++ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, ++ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, ++ 0x66, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, ++ 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ++ 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, 0x44, ++ 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, ++ 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, ++ 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, 0x62, ++ 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, ++ 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, ++ 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, ++ 0x41, 0x43, 0x44, 0x47, 0x4A, 0x4B, 0x4E, 0x4F, ++ 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, ++ 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x66, 0x68, ++ 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x70, 0x71, ++ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, ++ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, ++ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, ++ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, ++ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, ++ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, ++ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, ++ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, ++ 0x45, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, ++ 0x4F, 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57, ++ 0x58, 0x59, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, ++ 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, ++ 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ++ 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, 0x45, ++ 0x46, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4F, ++ 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x61, ++ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ++ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, ++ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, ++ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, ++ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, ++ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, ++ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, ++ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, ++ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, ++ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, ++ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, ++ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, ++ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, ++ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ++ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, ++ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, ++ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, ++ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, ++ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, ++ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, ++ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, ++ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, ++ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, ++ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, ++ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, ++ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 
0x61, ++ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ++ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, ++ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, ++ 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, ++ 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, ++ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, ++ 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, ++ 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, ++ 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, ++ 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, ++ 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, ++ 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, ++ 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, ++ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ++ 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, ++ 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, ++ 0x7A, 0xC4, 0xB1, 0xC8, 0xB7, 0xCE, 0x91, 0xCE, ++ 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, ++ 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, ++ 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, ++ 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, ++ 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, ++ 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, ++ 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, ++ 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, ++ 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, ++ 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, ++ 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, ++ 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, ++ 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, ++ 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, ++ 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, ++ 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, ++ 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, ++ 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, ++ 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, ++ 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, ++ 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, ++ 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, ++ 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, ++ 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, ++ 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, ++ 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, ++ 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, ++ 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, ++ 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, ++ 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, ++ 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, ++ 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, ++ 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, ++ 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, ++ 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, ++ 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, ++ 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, ++ 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, ++ 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, ++ 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, ++ 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, ++ 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, ++ 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, ++ 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, ++ 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, ++ 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, ++ 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, ++ 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, ++ 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, ++ 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, ++ 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, ++ 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, ++ 0xCE, 0xB7, 0xCE, 0xB8, 
0xCE, 0xB9, 0xCE, 0xBA, ++ 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, ++ 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, ++ 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, ++ 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, ++ 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, ++ 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, ++ 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, ++ 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, ++ 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, ++ 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, ++ 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, ++ 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, ++ 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, ++ 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, ++ 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, ++ 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, ++ 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, ++ 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, ++ 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, ++ 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, ++ 0x81, 0xCF, 0x80, 0xCF, 0x9C, 0xCF, 0x9D, 0x30, ++ 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, ++ 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, ++ 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, ++ 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, ++ 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, ++ 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, ++ 0x39, 0xF6, 0xE4, 0xB8, 0xBD, 0xF6, 0xE4, 0xB8, ++ 0xB8, 0xF6, 0xE4, 0xB9, 0x81, 0xF6, 0xF0, 0xA0, ++ 0x84, 0xA2, 0xF6, 0xE4, 0xBD, 0xA0, 0xF6, 0xE4, ++ 0xBE, 0xAE, 0xF6, 0xE4, 0xBE, 0xBB, 0xF6, 0xE5, ++ 0x80, 0x82, 0xF6, 0xE5, 0x81, 0xBA, 0xF6, 0xE5, ++ 0x82, 0x99, 0xF6, 0xE5, 0x83, 0xA7, 0xF6, 0xE5, ++ 0x83, 0x8F, 0xF6, 0xE3, 0x92, 0x9E, 0xF6, 0xF0, ++ 0xA0, 0x98, 0xBA, 0xF6, 0xE5, 0x85, 0x8D, 0xF6, ++ 0xE5, 0x85, 0x94, 0xF6, 0xE5, 0x85, 0xA4, 0xF6, ++ 0xE5, 0x85, 0xB7, 0xF6, 0xF0, 0xA0, 0x94, 0x9C, ++ 0xF6, 0xE3, 0x92, 0xB9, 0xF6, 0xE5, 0x85, 0xA7, ++ 0xF6, 0xE5, 0x86, 0x8D, 0xF6, 0xF0, 0xA0, 0x95, ++ 0x8B, 0xF6, 0xE5, 0x86, 0x97, 0xF6, 0xE5, 0x86, ++ 0xA4, 0xF6, 0xE4, 0xBB, 0x8C, 0xF6, 0xE5, 0x86, ++ 0xAC, 0xF6, 0xE5, 0x86, 0xB5, 0xF6, 0xF0, 0xA9, ++ 0x87, 0x9F, 0xF6, 0xE5, 0x87, 0xB5, 0xF6, 0xE5, ++ 0x88, 0x83, 0xF6, 0xE3, 0x93, 0x9F, 0xF6, 0xE5, ++ 0x88, 0xBB, 0xF6, 0xE5, 0x89, 0x86, 0xF6, 0xE5, ++ 0x89, 0xB2, 0xF6, 0xE5, 0x89, 0xB7, 0xF6, 0xE3, ++ 0x94, 0x95, 0xF6, 0xE5, 0x8B, 0x87, 0xF6, 0xE5, ++ 0x8B, 0x89, 0xF6, 0xE5, 0x8B, 0xA4, 0xF6, 0xE5, ++ 0x8B, 0xBA, 0xF6, 0xE5, 0x8C, 0x85, 0xF6, 0xE5, ++ 0x8C, 0x86, 0xF6, 0xE5, 0x8C, 0x97, 0xF6, 0xE5, ++ 0x8D, 0x89, 0xF6, 0xE5, 0x8D, 0x91, 0xF6, 0xE5, ++ 0x8D, 0x9A, 0xF6, 0xE5, 0x8D, 0xB3, 0xF6, 0xE5, ++ 0x8D, 0xBD, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xE5, ++ 0x8D, 0xBF, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xF0, ++ 0xA0, 0xA8, 0xAC, 0xF6, 0xE7, 0x81, 0xB0, 0xF6, ++ 0xE5, 0x8F, 0x8A, 0xF6, 0xE5, 0x8F, 0x9F, 0xF6, ++ 0xF0, 0xA0, 0xAD, 0xA3, 0xF6, 0xE5, 0x8F, 0xAB, ++ 0xF6, 0xE5, 0x8F, 0xB1, 0xF6, 0xE5, 0x90, 0x86, ++ 0xF6, 0xE5, 0x92, 0x9E, 0xF6, 0xE5, 0x90, 0xB8, ++ 0xF6, 0xE5, 0x91, 0x88, 0xF6, 0xE5, 0x91, 0xA8, ++ 0xF6, 0xE5, 0x92, 0xA2, 0xF6, 0xE5, 0x93, 0xB6, ++ 0xF6, 0xE5, 0x94, 0x90, 0xF6, 0xE5, 0x95, 0x93, ++ 0xF6, 0xE5, 0x95, 0xA3, 0xF6, 0xE5, 0x96, 0x84, ++ 0xF6, 0xE5, 0x96, 0x84, 0xF6, 0xE5, 0x96, 0x99, ++ 0xF6, 0xE5, 0x96, 0xAB, 0xF6, 0xE5, 0x96, 0xB3, ++ 0xF6, 0xE5, 0x97, 0x82, 0xF6, 0xE5, 0x9C, 0x96, ++ 0xF6, 0xE5, 0x98, 0x86, 0xF6, 0xE5, 0x9C, 0x97, ++ 0xF6, 0xE5, 0x99, 0x91, 0xF6, 0xE5, 0x99, 0xB4, ++ 0xF6, 0xE5, 0x88, 0x87, 0xF6, 0xE5, 0xA3, 0xAE, ++ 0xF6, 
0xE5, 0x9F, 0x8E, 0xF6, 0xE5, 0x9F, 0xB4, ++ 0xF6, 0xE5, 0xA0, 0x8D, 0xF6, 0xE5, 0x9E, 0x8B, ++ 0xF6, 0xE5, 0xA0, 0xB2, 0xF6, 0xE5, 0xA0, 0xB1, ++ 0xF6, 0xE5, 0xA2, 0xAC, 0xF6, 0xF0, 0xA1, 0x93, ++ 0xA4, 0xF6, 0xE5, 0xA3, 0xB2, 0xF6, 0xE5, 0xA3, ++ 0xB7, 0xF6, 0xE5, 0xA4, 0x86, 0xF6, 0xE5, 0xA4, ++ 0x9A, 0xF6, 0xE5, 0xA4, 0xA2, 0xF6, 0xE5, 0xA5, ++ 0xA2, 0xF6, 0xF0, 0xA1, 0x9A, 0xA8, 0xF6, 0xF0, ++ 0xA1, 0x9B, 0xAA, 0xF6, 0xE5, 0xA7, 0xAC, 0xF6, ++ 0xE5, 0xA8, 0x9B, 0xF6, 0xE5, 0xA8, 0xA7, 0xF6, ++ 0xE5, 0xA7, 0x98, 0xF6, 0xE5, 0xA9, 0xA6, 0xF6, ++ 0xE3, 0x9B, 0xAE, 0xF6, 0xE3, 0x9B, 0xBC, 0xF6, ++ 0xE5, 0xAC, 0x88, 0xF6, 0xE5, 0xAC, 0xBE, 0xF6, ++ 0xE5, 0xAC, 0xBE, 0xF6, 0xF0, 0xA1, 0xA7, 0x88, ++ 0xF6, 0xE5, 0xAF, 0x83, 0xF6, 0xE5, 0xAF, 0x98, ++ 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, 0xAF, 0xB3, ++ 0xF6, 0xF0, 0xA1, 0xAC, 0x98, 0xF6, 0xE5, 0xAF, ++ 0xBF, 0xF6, 0xE5, 0xB0, 0x86, 0xF6, 0xE5, 0xBD, ++ 0x93, 0xF6, 0xE5, 0xB0, 0xA2, 0xF6, 0xE3, 0x9E, ++ 0x81, 0xF6, 0xE5, 0xB1, 0xA0, 0xF6, 0xE5, 0xB1, ++ 0xAE, 0xF6, 0xE5, 0xB3, 0x80, 0xF6, 0xE5, 0xB2, ++ 0x8D, 0xF6, 0xF0, 0xA1, 0xB7, 0xA4, 0xF6, 0xE5, ++ 0xB5, 0x83, 0xF6, 0xF0, 0xA1, 0xB7, 0xA6, 0xF6, ++ 0xE5, 0xB5, 0xAE, 0xF6, 0xE5, 0xB5, 0xAB, 0xF6, ++ 0xE5, 0xB5, 0xBC, 0xF6, 0xE5, 0xB7, 0xA1, 0xF6, ++ 0xE5, 0xB7, 0xA2, 0xF6, 0xE3, 0xA0, 0xAF, 0xF6, ++ 0xE5, 0xB7, 0xBD, 0xF6, 0xE5, 0xB8, 0xA8, 0xF6, ++ 0xE5, 0xB8, 0xBD, 0xF6, 0xE5, 0xB9, 0xA9, 0xF6, ++ 0xE3, 0xA1, 0xA2, 0xF6, 0xF0, 0xA2, 0x86, 0x83, ++ 0xF6, 0xE3, 0xA1, 0xBC, 0xF6, 0xE5, 0xBA, 0xB0, ++ 0xF6, 0xE5, 0xBA, 0xB3, 0xF6, 0xE5, 0xBA, 0xB6, ++ 0xF6, 0xE5, 0xBB, 0x8A, 0xF6, 0xF0, 0xAA, 0x8E, ++ 0x92, 0xF6, 0xE5, 0xBB, 0xBE, 0xF6, 0xF0, 0xA2, ++ 0x8C, 0xB1, 0xF6, 0xF0, 0xA2, 0x8C, 0xB1, 0xF6, ++ 0xE8, 0x88, 0x81, 0xF6, 0xE5, 0xBC, 0xA2, 0xF6, ++ 0xE5, 0xBC, 0xA2, 0xF6, 0xE3, 0xA3, 0x87, 0xF6, ++ 0xF0, 0xA3, 0x8A, 0xB8, 0xF6, 0xF0, 0xA6, 0x87, ++ 0x9A, 0xF6, 0xE5, 0xBD, 0xA2, 0xF6, 0xE5, 0xBD, ++ 0xAB, 0xF6, 0xE3, 0xA3, 0xA3, 0xF6, 0xE5, 0xBE, ++ 0x9A, 0xF6, 0xE5, 0xBF, 0x8D, 0xF6, 0xE5, 0xBF, ++ 0x97, 0xF6, 0xE5, 0xBF, 0xB9, 0xF6, 0xE6, 0x82, ++ 0x81, 0xF6, 0xE3, 0xA4, 0xBA, 0xF6, 0xE3, 0xA4, ++ 0x9C, 0xF6, 0xE6, 0x82, 0x94, 0xF6, 0xF0, 0xA2, ++ 0x9B, 0x94, 0xF6, 0xE6, 0x83, 0x87, 0xF6, 0xE6, ++ 0x85, 0x88, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6, ++ 0x85, 0x8E, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6, ++ 0x85, 0xBA, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, 0xE6, ++ 0x86, 0xB2, 0xF6, 0xE6, 0x86, 0xA4, 0xF6, 0xE6, ++ 0x86, 0xAF, 0xF6, 0xE6, 0x87, 0x9E, 0xF6, 0xE6, ++ 0x87, 0xB2, 0xF6, 0xE6, 0x87, 0xB6, 0xF6, 0xE6, ++ 0x88, 0x90, 0xF6, 0xE6, 0x88, 0x9B, 0xF6, 0xE6, ++ 0x89, 0x9D, 0xF6, 0xE6, 0x8A, 0xB1, 0xF6, 0xE6, ++ 0x8B, 0x94, 0xF6, 0xE6, 0x8D, 0x90, 0xF6, 0xF0, ++ 0xA2, 0xAC, 0x8C, 0xF6, 0xE6, 0x8C, 0xBD, 0xF6, ++ 0xE6, 0x8B, 0xBC, 0xF6, 0xE6, 0x8D, 0xA8, 0xF6, ++ 0xE6, 0x8E, 0x83, 0xF6, 0xE6, 0x8F, 0xA4, 0xF6, ++ 0xF0, 0xA2, 0xAF, 0xB1, 0xF6, 0xE6, 0x90, 0xA2, ++ 0xF6, 0xE6, 0x8F, 0x85, 0xF6, 0xE6, 0x8E, 0xA9, ++ 0xF6, 0xE3, 0xA8, 0xAE, 0xF6, 0xE6, 0x91, 0xA9, ++ 0xF6, 0xE6, 0x91, 0xBE, 0xF6, 0xE6, 0x92, 0x9D, ++ 0xF6, 0xE6, 0x91, 0xB7, 0xF6, 0xE3, 0xA9, 0xAC, ++ 0xF6, 0xE6, 0x95, 0x8F, 0xF6, 0xE6, 0x95, 0xAC, ++ 0xF6, 0xF0, 0xA3, 0x80, 0x8A, 0xF6, 0xE6, 0x97, ++ 0xA3, 0xF6, 0xE6, 0x9B, 0xB8, 0xF6, 0xE6, 0x99, ++ 0x89, 0xF6, 0xE3, 0xAC, 0x99, 0xF6, 0xE6, 0x9A, ++ 0x91, 0xF6, 0xE3, 0xAC, 0x88, 0xF6, 0xE3, 0xAB, ++ 0xA4, 0xF6, 0xE5, 0x86, 0x92, 0xF6, 0xE5, 0x86, ++ 0x95, 0xF6, 0xE6, 0x9C, 0x80, 0xF6, 0xE6, 0x9A, ++ 0x9C, 0xF6, 0xE8, 0x82, 0xAD, 0xF6, 0xE4, 0x8F, ++ 0x99, 0xF6, 0xE6, 0x9C, 0x97, 0xF6, 0xE6, 
0x9C, ++ 0x9B, 0xF6, 0xE6, 0x9C, 0xA1, 0xF6, 0xE6, 0x9D, ++ 0x9E, 0xF6, 0xE6, 0x9D, 0x93, 0xF6, 0xF0, 0xA3, ++ 0x8F, 0x83, 0xF6, 0xE3, 0xAD, 0x89, 0xF6, 0xE6, ++ 0x9F, 0xBA, 0xF6, 0xE6, 0x9E, 0x85, 0xF6, 0xE6, ++ 0xA1, 0x92, 0xF6, 0xE6, 0xA2, 0x85, 0xF6, 0xF0, ++ 0xA3, 0x91, 0xAD, 0xF6, 0xE6, 0xA2, 0x8E, 0xF6, ++ 0xE6, 0xA0, 0x9F, 0xF6, 0xE6, 0xA4, 0x94, 0xF6, ++ 0xE3, 0xAE, 0x9D, 0xF6, 0xE6, 0xA5, 0x82, 0xF6, ++ 0xE6, 0xA6, 0xA3, 0xF6, 0xE6, 0xA7, 0xAA, 0xF6, ++ 0xE6, 0xAA, 0xA8, 0xF6, 0xF0, 0xA3, 0x9A, 0xA3, ++ 0xF6, 0xE6, 0xAB, 0x9B, 0xF6, 0xE3, 0xB0, 0x98, ++ 0xF6, 0xE6, 0xAC, 0xA1, 0xF6, 0xF0, 0xA3, 0xA2, ++ 0xA7, 0xF6, 0xE6, 0xAD, 0x94, 0xF6, 0xE3, 0xB1, ++ 0x8E, 0xF6, 0xE6, 0xAD, 0xB2, 0xF6, 0xE6, 0xAE, ++ 0x9F, 0xF6, 0xE6, 0xAE, 0xBA, 0xF6, 0xE6, 0xAE, ++ 0xBB, 0xF6, 0xF0, 0xA3, 0xAA, 0x8D, 0xF6, 0xF0, ++ 0xA1, 0xB4, 0x8B, 0xF6, 0xF0, 0xA3, 0xAB, 0xBA, ++ 0xF6, 0xE6, 0xB1, 0x8E, 0xF6, 0xF0, 0xA3, 0xB2, ++ 0xBC, 0xF6, 0xE6, 0xB2, 0xBF, 0xF6, 0xE6, 0xB3, ++ 0x8D, 0xF6, 0xE6, 0xB1, 0xA7, 0xF6, 0xE6, 0xB4, ++ 0x96, 0xF6, 0xE6, 0xB4, 0xBE, 0xF6, 0xE6, 0xB5, ++ 0xB7, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, 0xB5, ++ 0xA9, 0xF6, 0xE6, 0xB5, 0xB8, 0xF6, 0xE6, 0xB6, ++ 0x85, 0xF6, 0xF0, 0xA3, 0xB4, 0x9E, 0xF6, 0xE6, ++ 0xB4, 0xB4, 0xF6, 0xE6, 0xB8, 0xAF, 0xF6, 0xE6, ++ 0xB9, 0xAE, 0xF6, 0xE3, 0xB4, 0xB3, 0xF6, 0xE6, ++ 0xBB, 0x8B, 0xF6, 0xE6, 0xBB, 0x87, 0xF6, 0xF0, ++ 0xA3, 0xBB, 0x91, 0xF6, 0xE6, 0xB7, 0xB9, 0xF6, ++ 0xE6, 0xBD, 0xAE, 0xF6, 0xF0, 0xA3, 0xBD, 0x9E, ++ 0xF6, 0xF0, 0xA3, 0xBE, 0x8E, 0xF6, 0xE6, 0xBF, ++ 0x86, 0xF6, 0xE7, 0x80, 0xB9, 0xF6, 0xE7, 0x80, ++ 0x9E, 0xF6, 0xE7, 0x80, 0x9B, 0xF6, 0xE3, 0xB6, ++ 0x96, 0xF6, 0xE7, 0x81, 0x8A, 0xF6, 0xE7, 0x81, ++ 0xBD, 0xF6, 0xE7, 0x81, 0xB7, 0xF6, 0xE7, 0x82, ++ 0xAD, 0xF6, 0xF0, 0xA0, 0x94, 0xA5, 0xF6, 0xE7, ++ 0x85, 0x85, 0xF6, 0xF0, 0xA4, 0x89, 0xA3, 0xF6, ++ 0xE7, 0x86, 0x9C, 0xF6, 0xF0, 0xA4, 0x8E, 0xAB, ++ 0xF6, 0xE7, 0x88, 0xA8, 0xF6, 0xE7, 0x88, 0xB5, ++ 0xF6, 0xE7, 0x89, 0x90, 0xF6, 0xF0, 0xA4, 0x98, ++ 0x88, 0xF6, 0xE7, 0x8A, 0x80, 0xF6, 0xE7, 0x8A, ++ 0x95, 0xF6, 0xF0, 0xA4, 0x9C, 0xB5, 0xF6, 0xF0, ++ 0xA4, 0xA0, 0x94, 0xF6, 0xE7, 0x8D, 0xBA, 0xF6, ++ 0xE7, 0x8E, 0x8B, 0xF6, 0xE3, 0xBA, 0xAC, 0xF6, ++ 0xE7, 0x8E, 0xA5, 0xF6, 0xE3, 0xBA, 0xB8, 0xF6, ++ 0xE3, 0xBA, 0xB8, 0xF6, 0xE7, 0x91, 0x87, 0xF6, ++ 0xE7, 0x91, 0x9C, 0xF6, 0xE7, 0x91, 0xB1, 0xF6, ++ 0xE7, 0x92, 0x85, 0xF6, 0xE7, 0x93, 0x8A, 0xF6, ++ 0xE3, 0xBC, 0x9B, 0xF6, 0xE7, 0x94, 0xA4, 0xF6, ++ 0xF0, 0xA4, 0xB0, 0xB6, 0xF6, 0xE7, 0x94, 0xBE, ++ 0xF6, 0xF0, 0xA4, 0xB2, 0x92, 0xF6, 0xE7, 0x95, ++ 0xB0, 0xF6, 0xF0, 0xA2, 0x86, 0x9F, 0xF6, 0xE7, ++ 0x98, 0x90, 0xF6, 0xF0, 0xA4, 0xBE, 0xA1, 0xF6, ++ 0xF0, 0xA4, 0xBE, 0xB8, 0xF6, 0xF0, 0xA5, 0x81, ++ 0x84, 0xF6, 0xE3, 0xBF, 0xBC, 0xF6, 0xE4, 0x80, ++ 0x88, 0xF6, 0xE7, 0x9B, 0xB4, 0xF6, 0xF0, 0xA5, ++ 0x83, 0xB3, 0xF6, 0xF0, 0xA5, 0x83, 0xB2, 0xF6, ++ 0xF0, 0xA5, 0x84, 0x99, 0xF6, 0xF0, 0xA5, 0x84, ++ 0xB3, 0xF6, 0xE7, 0x9C, 0x9E, 0xF6, 0xE7, 0x9C, ++ 0x9F, 0xF6, 0xE7, 0x9C, 0x9F, 0xF6, 0xE7, 0x9D, ++ 0x8A, 0xF6, 0xE4, 0x80, 0xB9, 0xF6, 0xE7, 0x9E, ++ 0x8B, 0xF6, 0xE4, 0x81, 0x86, 0xF6, 0xE4, 0x82, ++ 0x96, 0xF6, 0xF0, 0xA5, 0x90, 0x9D, 0xF6, 0xE7, ++ 0xA1, 0x8E, 0xF6, 0xE7, 0xA2, 0x8C, 0xF6, 0xE7, ++ 0xA3, 0x8C, 0xF6, 0xE4, 0x83, 0xA3, 0xF6, 0xF0, ++ 0xA5, 0x98, 0xA6, 0xF6, 0xE7, 0xA5, 0x96, 0xF6, ++ 0xF0, 0xA5, 0x9A, 0x9A, 0xF6, 0xF0, 0xA5, 0x9B, ++ 0x85, 0xF6, 0xE7, 0xA6, 0x8F, 0xF6, 0xE7, 0xA7, ++ 0xAB, 0xF6, 0xE4, 0x84, 0xAF, 0xF6, 0xE7, 0xA9, ++ 0x80, 0xF6, 0xE7, 0xA9, 0x8A, 0xF6, 0xE7, 0xA9, ++ 0x8F, 0xF6, 0xF0, 0xA5, 
0xA5, 0xBC, 0xF6, 0xF0, ++ 0xA5, 0xAA, 0xA7, 0xF6, 0xF0, 0xA5, 0xAA, 0xA7, ++ 0xF6, 0xE7, 0xAB, 0xAE, 0xF6, 0xE4, 0x88, 0x82, ++ 0xF6, 0xF0, 0xA5, 0xAE, 0xAB, 0xF6, 0xE7, 0xAF, ++ 0x86, 0xF6, 0xE7, 0xAF, 0x89, 0xF6, 0xE4, 0x88, ++ 0xA7, 0xF6, 0xF0, 0xA5, 0xB2, 0x80, 0xF6, 0xE7, ++ 0xB3, 0x92, 0xF6, 0xE4, 0x8A, 0xA0, 0xF6, 0xE7, ++ 0xB3, 0xA8, 0xF6, 0xE7, 0xB3, 0xA3, 0xF6, 0xE7, ++ 0xB4, 0x80, 0xF6, 0xF0, 0xA5, 0xBE, 0x86, 0xF6, ++ 0xE7, 0xB5, 0xA3, 0xF6, 0xE4, 0x8C, 0x81, 0xF6, ++ 0xE7, 0xB7, 0x87, 0xF6, 0xE7, 0xB8, 0x82, 0xF6, ++ 0xE7, 0xB9, 0x85, 0xF6, 0xE4, 0x8C, 0xB4, 0xF6, ++ 0xF0, 0xA6, 0x88, 0xA8, 0xF6, 0xF0, 0xA6, 0x89, ++ 0x87, 0xF6, 0xE4, 0x8D, 0x99, 0xF6, 0xF0, 0xA6, ++ 0x8B, 0x99, 0xF6, 0xE7, 0xBD, 0xBA, 0xF6, 0xF0, ++ 0xA6, 0x8C, 0xBE, 0xF6, 0xE7, 0xBE, 0x95, 0xF6, ++ 0xE7, 0xBF, 0xBA, 0xF6, 0xE8, 0x80, 0x85, 0xF6, ++ 0xF0, 0xA6, 0x93, 0x9A, 0xF6, 0xF0, 0xA6, 0x94, ++ 0xA3, 0xF6, 0xE8, 0x81, 0xA0, 0xF6, 0xF0, 0xA6, ++ 0x96, 0xA8, 0xF6, 0xE8, 0x81, 0xB0, 0xF6, 0xF0, ++ 0xA3, 0x8D, 0x9F, 0xF6, 0xE4, 0x8F, 0x95, 0xF6, ++ 0xE8, 0x82, 0xB2, 0xF6, 0xE8, 0x84, 0x83, 0xF6, ++ 0xE4, 0x90, 0x8B, 0xF6, 0xE8, 0x84, 0xBE, 0xF6, ++ 0xE5, 0xAA, 0xB5, 0xF6, 0xF0, 0xA6, 0x9E, 0xA7, ++ 0xF6, 0xF0, 0xA6, 0x9E, 0xB5, 0xF6, 0xF0, 0xA3, ++ 0x8E, 0x93, 0xF6, 0xF0, 0xA3, 0x8E, 0x9C, 0xF6, ++ 0xE8, 0x88, 0x81, 0xF6, 0xE8, 0x88, 0x84, 0xF6, ++ 0xE8, 0xBE, 0x9E, 0xF6, 0xE4, 0x91, 0xAB, 0xF6, ++ 0xE8, 0x8A, 0x91, 0xF6, 0xE8, 0x8A, 0x8B, 0xF6, ++ 0xE8, 0x8A, 0x9D, 0xF6, 0xE5, 0x8A, 0xB3, 0xF6, ++ 0xE8, 0x8A, 0xB1, 0xF6, 0xE8, 0x8A, 0xB3, 0xF6, ++ 0xE8, 0x8A, 0xBD, 0xF6, 0xE8, 0x8B, 0xA6, 0xF6, ++ 0xF0, 0xA6, 0xAC, 0xBC, 0xF6, 0xE8, 0x8B, 0xA5, ++ 0xF6, 0xE8, 0x8C, 0x9D, 0xF6, 0xE8, 0x8D, 0xA3, ++ 0xF6, 0xE8, 0x8E, 0xAD, 0xF6, 0xE8, 0x8C, 0xA3, ++ 0xF6, 0xE8, 0x8E, 0xBD, 0xF6, 0xE8, 0x8F, 0xA7, ++ 0xF6, 0xE8, 0x91, 0x97, 0xF6, 0xE8, 0x8D, 0x93, ++ 0xF6, 0xE8, 0x8F, 0x8A, 0xF6, 0xE8, 0x8F, 0x8C, ++ 0xF6, 0xE8, 0x8F, 0x9C, 0xF6, 0xF0, 0xA6, 0xB0, ++ 0xB6, 0xF6, 0xF0, 0xA6, 0xB5, 0xAB, 0xF6, 0xF0, ++ 0xA6, 0xB3, 0x95, 0xF6, 0xE4, 0x94, 0xAB, 0xF6, ++ 0xE8, 0x93, 0xB1, 0xF6, 0xE8, 0x93, 0xB3, 0xF6, ++ 0xE8, 0x94, 0x96, 0xF6, 0xF0, 0xA7, 0x8F, 0x8A, ++ 0xF6, 0xE8, 0x95, 0xA4, 0xF6, 0xF0, 0xA6, 0xBC, ++ 0xAC, 0xF6, 0xE4, 0x95, 0x9D, 0xF6, 0xE4, 0x95, ++ 0xA1, 0xF6, 0xF0, 0xA6, 0xBE, 0xB1, 0xF6, 0xF0, ++ 0xA7, 0x83, 0x92, 0xF6, 0xE4, 0x95, 0xAB, 0xF6, ++ 0xE8, 0x99, 0x90, 0xF6, 0xE8, 0x99, 0x9C, 0xF6, ++ 0xE8, 0x99, 0xA7, 0xF6, 0xE8, 0x99, 0xA9, 0xF6, ++ 0xE8, 0x9A, 0xA9, 0xF6, 0xE8, 0x9A, 0x88, 0xF6, ++ 0xE8, 0x9C, 0x8E, 0xF6, 0xE8, 0x9B, 0xA2, 0xF6, ++ 0xE8, 0x9D, 0xB9, 0xF6, 0xE8, 0x9C, 0xA8, 0xF6, ++ 0xE8, 0x9D, 0xAB, 0xF6, 0xE8, 0x9E, 0x86, 0xF6, ++ 0xE4, 0x97, 0x97, 0xF6, 0xE8, 0x9F, 0xA1, 0xF6, ++ 0xE8, 0xA0, 0x81, 0xF6, 0xE4, 0x97, 0xB9, 0xF6, ++ 0xE8, 0xA1, 0xA0, 0xF6, 0xE8, 0xA1, 0xA3, 0xF6, ++ 0xF0, 0xA7, 0x99, 0xA7, 0xF6, 0xE8, 0xA3, 0x97, ++ 0xF6, 0xE8, 0xA3, 0x9E, 0xF6, 0xE4, 0x98, 0xB5, ++ 0xF6, 0xE8, 0xA3, 0xBA, 0xF6, 0xE3, 0x92, 0xBB, ++ 0xF6, 0xF0, 0xA7, 0xA2, 0xAE, 0xF6, 0xF0, 0xA7, ++ 0xA5, 0xA6, 0xF6, 0xE4, 0x9A, 0xBE, 0xF6, 0xE4, ++ 0x9B, 0x87, 0xF6, 0xE8, 0xAA, 0xA0, 0xF6, 0xE8, ++ 0xAB, 0xAD, 0xF6, 0xE8, 0xAE, 0x8A, 0xF6, 0xE8, ++ 0xB1, 0x95, 0xF6, 0xF0, 0xA7, 0xB2, 0xA8, 0xF6, ++ 0xE8, 0xB2, 0xAB, 0xF6, 0xE8, 0xB3, 0x81, 0xF6, ++ 0xE8, 0xB4, 0x9B, 0xF6, 0xE8, 0xB5, 0xB7, 0xF6, ++ 0xF0, 0xA7, 0xBC, 0xAF, 0xF6, 0xF0, 0xA0, 0xA0, ++ 0x84, 0xF6, 0xE8, 0xB7, 0x8B, 0xF6, 0xE8, 0xB6, ++ 0xBC, 0xF6, 0xE8, 0xB7, 0xB0, 0xF6, 0xF0, 0xA0, ++ 0xA3, 0x9E, 0xF6, 0xE8, 0xBB, 0x94, 0xF6, 0xE8, ++ 0xBC, 
0xB8, 0xF6, 0xF0, 0xA8, 0x97, 0x92, 0xF6, ++ 0xF0, 0xA8, 0x97, 0xAD, 0xF6, 0xE9, 0x82, 0x94, ++ 0xF6, 0xE9, 0x83, 0xB1, 0xF6, 0xE9, 0x84, 0x91, ++ 0xF6, 0xF0, 0xA8, 0x9C, 0xAE, 0xF6, 0xE9, 0x84, ++ 0x9B, 0xF6, 0xE9, 0x88, 0xB8, 0xF6, 0xE9, 0x8B, ++ 0x97, 0xF6, 0xE9, 0x8B, 0x98, 0xF6, 0xE9, 0x89, ++ 0xBC, 0xF6, 0xE9, 0x8F, 0xB9, 0xF6, 0xE9, 0x90, ++ 0x95, 0xF6, 0xF0, 0xA8, 0xAF, 0xBA, 0xF6, 0xE9, ++ 0x96, 0x8B, 0xF6, 0xE4, 0xA6, 0x95, 0xF6, 0xE9, ++ 0x96, 0xB7, 0xF6, 0xF0, 0xA8, 0xB5, 0xB7, 0xF6, ++ 0xE4, 0xA7, 0xA6, 0xF6, 0xE9, 0x9B, 0x83, 0xF6, ++ 0xE5, 0xB6, 0xB2, 0xF6, 0xE9, 0x9C, 0xA3, 0xF6, ++ 0xF0, 0xA9, 0x85, 0x85, 0xF6, 0xF0, 0xA9, 0x88, ++ 0x9A, 0xF6, 0xE4, 0xA9, 0xAE, 0xF6, 0xE4, 0xA9, ++ 0xB6, 0xF6, 0xE9, 0x9F, 0xA0, 0xF6, 0xF0, 0xA9, ++ 0x90, 0x8A, 0xF6, 0xE4, 0xAA, 0xB2, 0xF6, 0xF0, ++ 0xA9, 0x92, 0x96, 0xF6, 0xE9, 0xA0, 0x8B, 0xF6, ++ 0xE9, 0xA0, 0x8B, 0xF6, 0xE9, 0xA0, 0xA9, 0xF6, ++ 0xF0, 0xA9, 0x96, 0xB6, 0xF6, 0xE9, 0xA3, 0xA2, ++ 0xF6, 0xE4, 0xAC, 0xB3, 0xF6, 0xE9, 0xA4, 0xA9, ++ 0xF6, 0xE9, 0xA6, 0xA7, 0xF6, 0xE9, 0xA7, 0x82, ++ 0xF6, 0xE9, 0xA7, 0xBE, 0xF6, 0xE4, 0xAF, 0x8E, ++ 0xF6, 0xF0, 0xA9, 0xAC, 0xB0, 0xF6, 0xE9, 0xAC, ++ 0x92, 0xF6, 0xE9, 0xB1, 0x80, 0xF6, 0xE9, 0xB3, ++ 0xBD, 0xF6, 0xE4, 0xB3, 0x8E, 0xF6, 0xE4, 0xB3, ++ 0xAD, 0xF6, 0xE9, 0xB5, 0xA7, 0xF6, 0xF0, 0xAA, ++ 0x83, 0x8E, 0xF6, 0xE4, 0xB3, 0xB8, 0xF6, 0xF0, ++ 0xAA, 0x84, 0x85, 0xF6, 0xF0, 0xAA, 0x88, 0x8E, ++ 0xF6, 0xF0, 0xAA, 0x8A, 0x91, 0xF6, 0xE9, 0xBA, ++ 0xBB, 0xF6, 0xE4, 0xB5, 0x96, 0xF6, 0xE9, 0xBB, ++ 0xB9, 0xF6, 0xE9, 0xBB, 0xBE, 0xF6, 0xE9, 0xBC, ++ 0x85, 0xF6, 0xE9, 0xBC, 0x8F, 0xF6, 0xE9, 0xBC, ++ 0x96, 0xF6, 0xE9, 0xBC, 0xBB, 0xF6, 0xF0, 0xAA, ++ 0x98, 0x80, ++ }, ++}; ++ ++static const uchar_t u8_case_common_b2_tbl[2][2][256] = { ++ { ++ { ++ 0, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, 1, 2, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, 3, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 4, N_, N_, N_, N_, 
N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ ++ }, ++ { ++ { ++ 0, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, 1, 2, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, 3, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ { ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ 4, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ N_, N_, N_, N_, N_, N_, N_, N_, ++ }, ++ ++ }, ++ ++}; ++ ++static const u8_displacement_t u8_tolower_b3_tbl[2][5][256] = { ++ { ++ { /* Third byte table 0. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { 0, 0 }, ++ { 1, 60 }, { 2, 123 }, { 3, 185 }, { 4, 257 }, ++ { 5, 321 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 6, 373 }, { 7, 439 }, ++ { 8, 465 }, { 9, 561 }, { 10, 593 }, { 11, 649 }, ++ { 12, 703 }, { 13, 749 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 1. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 14, 795 }, { 15, 891 }, { 16, 987 }, { 17, 1068 }, ++ { 18, 1155 }, { 19, 1245 }, { 20, 1299 }, { 21, 1386 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 2. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 22, 1443 }, { 23, 1448 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 24, 1496 }, { 25, 1526 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 3. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 26, 1574 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 4. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 27, 1652 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ }, ++ { ++ { /* Third byte table 0. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { 0, 0 }, ++ { 1, 60 }, { 2, 123 }, { 3, 185 }, { 4, 257 }, ++ { 5, 321 }, { 6, 383 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 7, 401 }, { 8, 467 }, ++ { 9, 505 }, { 10, 601 }, { 11, 633 }, { 12, 689 }, ++ { 13, 753 }, { 14, 803 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 1. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 15, 849 }, { 16, 945 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 17, 963 }, { 18, 1059 }, { 19, 1155 }, { 20, 1236 }, ++ { 21, 1323 }, { 22, 1413 }, { 23, 1467 }, { 24, 1554 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 2. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 25, 1611 }, { 26, 1619 }, { 27, 1667 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 28, 1670 }, { 29, 1700 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 30, 1748 }, { 31, 1889 }, { 32, 1911 }, { 33, 2007 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 3. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 34, 2061 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 4. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 35, 2139 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ }, ++}; ++ ++static const uchar_t u8_tolower_b4_tbl[2][36][257] = { ++ { ++ { /* Fourth byte table 0. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 46, 48, 50, 52, 54, 56, 58, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, ++ }, ++ { /* Fourth byte table 1. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 12, 12, 14, 14, 16, ++ 16, 18, 18, 20, 20, 22, 22, 24, ++ 24, 26, 26, 28, 28, 30, 30, 32, ++ 32, 34, 34, 36, 36, 38, 38, 40, ++ 40, 42, 42, 44, 44, 46, 46, 48, ++ 48, 49, 49, 51, 51, 53, 53, 55, ++ 55, 55, 57, 57, 59, 59, 61, 61, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, ++ }, ++ { /* Fourth byte table 2. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 2, 4, 4, 6, 6, ++ 8, 8, 8, 10, 10, 12, 12, 14, ++ 14, 16, 16, 18, 18, 20, 20, 22, ++ 22, 24, 24, 26, 26, 28, 28, 30, ++ 30, 32, 32, 34, 34, 36, 36, 38, ++ 38, 40, 40, 42, 42, 44, 44, 46, ++ 46, 48, 48, 50, 50, 52, 52, 54, ++ 54, 56, 58, 58, 60, 60, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, ++ }, ++ { /* Fourth byte table 3. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 4, 4, 6, 6, 8, ++ 10, 10, 12, 14, 16, 16, 16, 18, ++ 20, 22, 24, 24, 26, 28, 28, 30, ++ 32, 34, 34, 34, 34, 36, 38, 38, ++ 40, 42, 42, 44, 44, 46, 46, 48, ++ 50, 50, 52, 52, 52, 54, 54, 56, ++ 58, 58, 60, 62, 64, 64, 66, 66, ++ 68, 70, 70, 70, 70, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, ++ }, ++ { /* Fourth byte table 4. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 2, 4, 4, ++ 6, 8, 8, 10, 12, 12, 14, 14, ++ 16, 16, 18, 18, 20, 20, 22, 22, ++ 24, 24, 26, 26, 28, 28, 28, 30, ++ 30, 32, 32, 34, 34, 36, 36, 38, ++ 38, 40, 40, 42, 42, 44, 44, 46, ++ 46, 46, 48, 50, 50, 52, 52, 54, ++ 56, 58, 58, 60, 60, 62, 62, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 5. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 12, 12, 14, 14, 16, ++ 16, 18, 18, 20, 20, 22, 22, 24, ++ 24, 26, 26, 28, 28, 30, 30, 32, ++ 32, 34, 34, 36, 36, 38, 38, 40, ++ 40, 42, 42, 44, 44, 46, 46, 48, ++ 48, 50, 50, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, ++ }, ++ { /* Fourth byte table 6. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 2, ++ 2, 4, 6, 8, 8, 10, 10, 12, ++ 14, 14, 16, 18, 20, 22, 24, 26, ++ 28, 30, 32, 34, 36, 38, 40, 42, ++ 44, 46, 48, 48, 50, 52, 54, 56, ++ 58, 60, 62, 64, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, ++ }, ++ { /* Fourth byte table 7. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 12, 12, 14, 14, 16, ++ 16, 18, 18, 20, 20, 22, 22, 24, ++ 24, 24, 24, 24, 24, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, ++ }, ++ { /* Fourth byte table 8. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 48, 50, 52, 54, 56, 58, 60, 62, ++ 64, 66, 68, 70, 72, 74, 76, 78, ++ 80, 82, 84, 86, 88, 90, 92, 94, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 9. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 12, 12, 14, 14, 16, ++ 16, 18, 18, 20, 20, 22, 22, 24, ++ 24, 26, 26, 28, 28, 30, 30, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, ++ }, ++ { /* Fourth byte table 10. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 12, 12, 14, 14, 16, ++ 16, 18, 18, 20, 20, 22, 22, 24, ++ 24, 26, 26, 28, 28, 30, 30, 32, ++ 32, 34, 34, 36, 36, 38, 38, 40, ++ 40, 42, 42, 44, 44, 46, 46, 48, ++ 48, 50, 50, 52, 52, 54, 54, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, ++ }, ++ { /* Fourth byte table 11. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 2, 4, 4, 6, 6, ++ 8, 8, 10, 10, 12, 12, 14, 14, ++ 14, 16, 16, 18, 18, 20, 20, 22, ++ 22, 24, 24, 26, 26, 28, 28, 30, ++ 30, 32, 32, 34, 34, 36, 36, 38, ++ 38, 40, 40, 42, 42, 44, 44, 46, ++ 46, 48, 48, 50, 50, 52, 52, 52, ++ 52, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, ++ }, ++ { /* Fourth byte table 12. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 12, 12, 14, 14, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 18, 20, 22, 24, 26, 28, ++ 30, 32, 34, 36, 38, 40, 42, 44, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, ++ }, ++ { /* Fourth byte table 13. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, ++ }, ++ { /* Fourth byte table 14. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 6, 6, 9, 9, 12, ++ 12, 15, 15, 18, 18, 21, 21, 24, ++ 24, 27, 27, 30, 30, 33, 33, 36, ++ 36, 39, 39, 42, 42, 45, 45, 48, ++ 48, 51, 51, 54, 54, 57, 57, 60, ++ 60, 63, 63, 66, 66, 69, 69, 72, ++ 72, 75, 75, 78, 78, 81, 81, 84, ++ 84, 87, 87, 90, 90, 93, 93, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 15. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 6, 6, 9, 9, 12, ++ 12, 15, 15, 18, 18, 21, 21, 24, ++ 24, 27, 27, 30, 30, 33, 33, 36, ++ 36, 39, 39, 42, 42, 45, 45, 48, ++ 48, 51, 51, 54, 54, 57, 57, 60, ++ 60, 63, 63, 66, 66, 69, 69, 72, ++ 72, 75, 75, 78, 78, 81, 81, 84, ++ 84, 87, 87, 90, 90, 93, 93, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 16. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 6, 6, 9, 9, 12, ++ 12, 15, 15, 18, 18, 21, 21, 24, ++ 24, 27, 27, 30, 30, 33, 33, 33, ++ 33, 33, 33, 33, 33, 33, 33, 33, ++ 33, 36, 36, 39, 39, 42, 42, 45, ++ 45, 48, 48, 51, 51, 54, 54, 57, ++ 57, 60, 60, 63, 63, 66, 66, 69, ++ 69, 72, 72, 75, 75, 78, 78, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, ++ }, ++ { /* Fourth byte table 17. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 6, 6, 9, 9, 12, ++ 12, 15, 15, 18, 18, 21, 21, 24, ++ 24, 27, 27, 30, 30, 33, 33, 36, ++ 36, 39, 39, 42, 42, 45, 45, 48, ++ 48, 51, 51, 54, 54, 57, 57, 60, ++ 60, 63, 63, 66, 66, 69, 69, 72, ++ 72, 75, 75, 78, 78, 81, 81, 84, ++ 84, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, ++ }, ++ { /* Fourth byte table 18. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 27, 30, 33, 36, 39, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 45, 48, 51, 54, 57, 60, 63, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 69, 72, 75, 78, 81, 84, 87, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, ++ }, ++ { /* Fourth byte table 19. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 21, 21, 24, 24, 27, 27, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 33, 36, 39, 42, 45, 48, 51, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, ++ }, ++ { /* Fourth byte table 20. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 75, 78, 81, 84, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, ++ }, ++ { /* Fourth byte table 21. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 18, 21, 24, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 30, 33, 36, 39, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 45, 48, 51, 54, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, ++ }, ++ { /* Fourth byte table 22. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 2, ++ 2, 2, 2, 3, 5, 5, 5, 5, ++ 5, 5, 5, 5, 5, 5, 5, 5, ++ 5, 5, 5, 5, 5, 5, 5, 5, ++ 5, 5, 5, 5, 5, 5, 5, 5, ++ 5, 5, 5, 5, 5, 5, 5, 5, ++ 5, 5, 5, 5, 5, 5, 5, 5, ++ 5, 5, 5, 5, 5, 5, 5, 5, ++ 5, 5, 5, 5, 5, 5, 5, 5, ++ 5, 5, 5, 5, 5, 5, 5, 5, ++ 5, 5, 5, 5, 5, 5, 5, 5, ++ 5, 5, 5, 5, 5, 5, 5, 5, ++ 5, ++ }, ++ { /* Fourth byte table 23. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, ++ }, ++ { /* Fourth byte table 24. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 3, ++ 6, 9, 12, 15, 18, 21, 24, 27, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, ++ }, ++ { /* Fourth byte table 25. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, ++ }, ++ { /* Fourth byte table 26. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 6, 9, 12, 15, 18, ++ 21, 24, 27, 30, 33, 36, 39, 42, ++ 45, 48, 51, 54, 57, 60, 63, 66, ++ 69, 72, 75, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, ++ }, ++ { /* Fourth byte table 27. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, 152, 152, 152, 152, 152, 152, 152, ++ 152, ++ }, ++ { /* Fourth byte table 28. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 29. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 30. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 31. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 32. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 33. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 34. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 35. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ }, ++ { ++ { /* Fourth byte table 0. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 46, 48, 50, 52, 54, 56, 58, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, 60, 60, 60, 60, 60, 60, 60, ++ 60, ++ }, ++ { /* Fourth byte table 1. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 12, 12, 14, 14, 16, ++ 16, 18, 18, 20, 20, 22, 22, 24, ++ 24, 26, 26, 28, 28, 30, 30, 32, ++ 32, 34, 34, 36, 36, 38, 38, 40, ++ 40, 42, 42, 44, 44, 46, 46, 48, ++ 48, 49, 49, 51, 51, 53, 53, 55, ++ 55, 55, 57, 57, 59, 59, 61, 61, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, ++ }, ++ { /* Fourth byte table 2. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 2, 4, 4, 6, 6, ++ 8, 8, 8, 10, 10, 12, 12, 14, ++ 14, 16, 16, 18, 18, 20, 20, 22, ++ 22, 24, 24, 26, 26, 28, 28, 30, ++ 30, 32, 32, 34, 34, 36, 36, 38, ++ 38, 40, 40, 42, 42, 44, 44, 46, ++ 46, 48, 48, 50, 50, 52, 52, 54, ++ 54, 56, 58, 58, 60, 60, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, ++ }, ++ { /* Fourth byte table 3. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 4, 4, 6, 6, 8, ++ 10, 10, 12, 14, 16, 16, 16, 18, ++ 20, 22, 24, 24, 26, 28, 28, 30, ++ 32, 34, 34, 34, 34, 36, 38, 38, ++ 40, 42, 42, 44, 44, 46, 46, 48, ++ 50, 50, 52, 52, 52, 54, 54, 56, ++ 58, 58, 60, 62, 64, 64, 66, 66, ++ 68, 70, 70, 70, 70, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, ++ }, ++ { /* Fourth byte table 4. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 2, 4, 4, ++ 6, 8, 8, 10, 12, 12, 14, 14, ++ 16, 16, 18, 18, 20, 20, 22, 22, ++ 24, 24, 26, 26, 28, 28, 28, 30, ++ 30, 32, 32, 34, 34, 36, 36, 38, ++ 38, 40, 40, 42, 42, 44, 44, 46, ++ 46, 46, 48, 50, 50, 52, 52, 54, ++ 56, 58, 58, 60, 60, 62, 62, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 5. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 12, 12, 14, 14, 16, ++ 16, 18, 18, 20, 20, 22, 22, 24, ++ 24, 26, 26, 28, 28, 30, 30, 32, ++ 32, 34, 34, 36, 36, 38, 38, 40, ++ 40, 42, 42, 44, 44, 46, 46, 48, ++ 48, 50, 50, 52, 52, 52, 52, 52, ++ 52, 52, 52, 55, 57, 57, 59, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, ++ }, ++ { /* Fourth byte table 6. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 2, 4, 6, 8, 10, ++ 10, 12, 12, 14, 14, 16, 16, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, ++ }, ++ { /* Fourth byte table 7. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 2, ++ 2, 4, 6, 8, 8, 10, 10, 12, ++ 14, 14, 16, 18, 20, 22, 24, 26, ++ 28, 30, 32, 34, 36, 38, 40, 42, ++ 44, 46, 48, 48, 50, 52, 54, 56, ++ 58, 60, 62, 64, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, ++ }, ++ { /* Fourth byte table 8. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 12, 12, 14, 14, 16, ++ 16, 18, 18, 20, 20, 22, 22, 24, ++ 24, 24, 24, 24, 24, 26, 26, 26, ++ 28, 28, 30, 32, 32, 32, 34, 36, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, ++ }, ++ { /* Fourth byte table 9. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 48, 50, 52, 54, 56, 58, 60, 62, ++ 64, 66, 68, 70, 72, 74, 76, 78, ++ 80, 82, 84, 86, 88, 90, 92, 94, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 10. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 12, 12, 14, 14, 16, ++ 16, 18, 18, 20, 20, 22, 22, 24, ++ 24, 26, 26, 28, 28, 30, 30, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, ++ }, ++ { /* Fourth byte table 11. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 12, 12, 14, 14, 16, ++ 16, 18, 18, 20, 20, 22, 22, 24, ++ 24, 26, 26, 28, 28, 30, 30, 32, ++ 32, 34, 34, 36, 36, 38, 38, 40, ++ 40, 42, 42, 44, 44, 46, 46, 48, ++ 48, 50, 50, 52, 52, 54, 54, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, ++ }, ++ { /* Fourth byte table 12. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 4, 6, 6, 8, 8, ++ 10, 10, 12, 12, 14, 14, 16, 16, ++ 16, 18, 18, 20, 20, 22, 22, 24, ++ 24, 26, 26, 28, 28, 30, 30, 32, ++ 32, 34, 34, 36, 36, 38, 38, 40, ++ 40, 42, 42, 44, 44, 46, 46, 48, ++ 48, 50, 50, 52, 52, 54, 54, 56, ++ 56, 58, 58, 60, 60, 62, 62, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 13. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 12, 12, 14, 14, 16, ++ 16, 18, 18, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 22, 24, 26, 28, 30, 32, ++ 34, 36, 38, 40, 42, 44, 46, 48, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, ++ }, ++ { /* Fourth byte table 14. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, 46, 46, 46, 46, 46, 46, 46, ++ 46, ++ }, ++ { /* Fourth byte table 15. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 16. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, ++ }, ++ { /* Fourth byte table 17. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 6, 6, 9, 9, 12, ++ 12, 15, 15, 18, 18, 21, 21, 24, ++ 24, 27, 27, 30, 30, 33, 33, 36, ++ 36, 39, 39, 42, 42, 45, 45, 48, ++ 48, 51, 51, 54, 54, 57, 57, 60, ++ 60, 63, 63, 66, 66, 69, 69, 72, ++ 72, 75, 75, 78, 78, 81, 81, 84, ++ 84, 87, 87, 90, 90, 93, 93, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 18. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 6, 6, 9, 9, 12, ++ 12, 15, 15, 18, 18, 21, 21, 24, ++ 24, 27, 27, 30, 30, 33, 33, 36, ++ 36, 39, 39, 42, 42, 45, 45, 48, ++ 48, 51, 51, 54, 54, 57, 57, 60, ++ 60, 63, 63, 66, 66, 69, 69, 72, ++ 72, 75, 75, 78, 78, 81, 81, 84, ++ 84, 87, 87, 90, 90, 93, 93, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 19. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 6, 6, 9, 9, 12, ++ 12, 15, 15, 18, 18, 21, 21, 24, ++ 24, 27, 27, 30, 30, 33, 33, 33, ++ 33, 33, 33, 33, 33, 33, 33, 33, ++ 33, 36, 36, 39, 39, 42, 42, 45, ++ 45, 48, 48, 51, 51, 54, 54, 57, ++ 57, 60, 60, 63, 63, 66, 66, 69, ++ 69, 72, 72, 75, 75, 78, 78, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 81, ++ 81, ++ }, ++ { /* Fourth byte table 20. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 6, 6, 9, 9, 12, ++ 12, 15, 15, 18, 18, 21, 21, 24, ++ 24, 27, 27, 30, 30, 33, 33, 36, ++ 36, 39, 39, 42, 42, 45, 45, 48, ++ 48, 51, 51, 54, 54, 57, 57, 60, ++ 60, 63, 63, 66, 66, 69, 69, 72, ++ 72, 75, 75, 78, 78, 81, 81, 84, ++ 84, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, ++ }, ++ { /* Fourth byte table 21. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 27, 30, 33, 36, 39, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 45, 48, 51, 54, 57, 60, 63, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 69, 72, 75, 78, 81, 84, 87, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, ++ }, ++ { /* Fourth byte table 22. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 21, 21, 24, 24, 27, 27, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 33, 36, 39, 42, 45, 48, 51, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, ++ }, ++ { /* Fourth byte table 23. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 75, 78, 81, 84, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, ++ }, ++ { /* Fourth byte table 24. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 15, 15, ++ 15, 15, 15, 15, 15, 15, 15, 15, ++ 15, 18, 21, 24, 27, 27, 27, 27, ++ 27, 27, 27, 27, 27, 27, 27, 27, ++ 27, 30, 33, 36, 39, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 45, 48, 51, 54, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, 57, 57, 57, 57, 57, 57, 57, ++ 57, ++ }, ++ { /* Fourth byte table 25. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 2, ++ 2, 2, 2, 3, 5, 5, 5, 5, ++ 5, 5, 5, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, ++ }, ++ { /* Fourth byte table 26. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, ++ }, ++ { /* Fourth byte table 27. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, ++ }, ++ { /* Fourth byte table 28. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 3, ++ 6, 9, 12, 15, 18, 21, 24, 27, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, ++ }, ++ { /* Fourth byte table 29. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, ++ }, ++ { /* Fourth byte table 30. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 117, ++ 120, 123, 126, 129, 132, 135, 138, 141, ++ 141, 141, 141, 141, 141, 141, 141, 141, ++ 141, 141, 141, 141, 141, 141, 141, 141, ++ 141, 141, 141, 141, 141, 141, 141, 141, ++ 141, 141, 141, 141, 141, 141, 141, 141, ++ 141, 141, 141, 141, 141, 141, 141, 141, ++ 141, 141, 141, 141, 141, 141, 141, 141, ++ 141, 141, 141, 141, 141, 141, 141, 141, ++ 141, 141, 141, 141, 141, 141, 141, 141, ++ 141, 141, 141, 141, 141, 141, 141, 141, ++ 141, 141, 141, 141, 141, 141, 141, 141, ++ 141, ++ }, ++ { /* Fourth byte table 31. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 5, 8, 10, 10, 10, ++ 13, 13, 16, 16, 19, 19, 19, 19, ++ 19, 19, 19, 19, 19, 19, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, 22, 22, 22, 22, 22, 22, 22, ++ 22, ++ }, ++ { /* Fourth byte table 32. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 6, 6, 9, 9, 12, ++ 12, 15, 15, 18, 18, 21, 21, 24, ++ 24, 27, 27, 30, 30, 33, 33, 36, ++ 36, 39, 39, 42, 42, 45, 45, 48, ++ 48, 51, 51, 54, 54, 57, 57, 60, ++ 60, 63, 63, 66, 66, 69, 69, 72, ++ 72, 75, 75, 78, 78, 81, 81, 84, ++ 84, 87, 87, 90, 90, 93, 93, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 33. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 3, 6, 6, 9, 9, 12, ++ 12, 15, 15, 18, 18, 21, 21, 24, ++ 24, 27, 27, 30, 30, 33, 33, 36, ++ 36, 39, 39, 42, 42, 45, 45, 48, ++ 48, 51, 51, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, ++ }, ++ { /* Fourth byte table 34. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 6, 9, 12, 15, 18, ++ 21, 24, 27, 30, 33, 36, 39, 42, ++ 45, 48, 51, 54, 57, 60, 63, 66, ++ 69, 72, 75, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, ++ }, ++ { /* Fourth byte table 35. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 100, 104, 108, 112, 116, 120, 124, ++ 128, 132, 136, 140, 144, 148, 152, 156, ++ 160, 160, 160, 160, 160, 160, 160, 160, ++ 160, 160, 160, 160, 160, 160, 160, 160, ++ 160, 160, 160, 160, 160, 160, 160, 160, ++ 160, 160, 160, 160, 160, 160, 160, 160, ++ 160, 160, 160, 160, 160, 160, 160, 160, ++ 160, 160, 160, 160, 160, 160, 160, 160, ++ 160, 160, 160, 160, 160, 160, 160, 160, ++ 160, 160, 160, 160, 160, 160, 160, 160, ++ 160, 160, 160, 160, 160, 160, 160, 160, ++ 160, 160, 160, 160, 160, 160, 160, 160, ++ 160, 160, 160, 160, 160, 160, 160, 160, ++ 160, ++ }, ++ }, ++}; ++ ++static const uchar_t u8_tolower_final_tbl[2][2299] = { ++ { ++ 0xC3, 0xA0, 0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3, ++ 0xC3, 0xA4, 0xC3, 0xA5, 0xC3, 0xA6, 0xC3, 0xA7, ++ 0xC3, 0xA8, 0xC3, 0xA9, 0xC3, 0xAA, 0xC3, 0xAB, ++ 0xC3, 0xAC, 0xC3, 0xAD, 0xC3, 0xAE, 0xC3, 0xAF, ++ 0xC3, 0xB0, 0xC3, 0xB1, 0xC3, 0xB2, 0xC3, 0xB3, ++ 0xC3, 0xB4, 0xC3, 0xB5, 0xC3, 0xB6, 0xC3, 0xB8, ++ 0xC3, 0xB9, 0xC3, 0xBA, 0xC3, 0xBB, 0xC3, 0xBC, ++ 0xC3, 0xBD, 0xC3, 0xBE, 0xC4, 0x81, 0xC4, 0x83, ++ 0xC4, 0x85, 0xC4, 0x87, 0xC4, 0x89, 0xC4, 0x8B, ++ 0xC4, 0x8D, 0xC4, 0x8F, 0xC4, 0x91, 0xC4, 0x93, ++ 0xC4, 0x95, 0xC4, 0x97, 0xC4, 0x99, 0xC4, 0x9B, ++ 0xC4, 0x9D, 0xC4, 0x9F, 0xC4, 0xA1, 0xC4, 0xA3, ++ 0xC4, 0xA5, 0xC4, 0xA7, 0xC4, 0xA9, 0xC4, 0xAB, ++ 0xC4, 0xAD, 0xC4, 0xAF, 0x69, 0xC4, 0xB3, 0xC4, ++ 0xB5, 0xC4, 0xB7, 0xC4, 0xBA, 0xC4, 0xBC, 0xC4, ++ 0xBE, 0xC5, 0x80, 0xC5, 0x82, 0xC5, 0x84, 0xC5, ++ 0x86, 0xC5, 0x88, 0xC5, 0x8B, 0xC5, 0x8D, 0xC5, ++ 0x8F, 0xC5, 0x91, 0xC5, 0x93, 0xC5, 0x95, 0xC5, ++ 0x97, 0xC5, 0x99, 0xC5, 0x9B, 0xC5, 0x9D, 0xC5, ++ 0x9F, 0xC5, 0xA1, 0xC5, 0xA3, 0xC5, 0xA5, 0xC5, ++ 0xA7, 0xC5, 0xA9, 0xC5, 0xAB, 0xC5, 0xAD, 0xC5, ++ 0xAF, 0xC5, 0xB1, 0xC5, 0xB3, 0xC5, 0xB5, 0xC5, ++ 0xB7, 0xC3, 0xBF, 0xC5, 0xBA, 0xC5, 0xBC, 0xC5, ++ 0xBE, 0xC9, 0x93, 0xC6, 0x83, 0xC6, 0x85, 0xC9, ++ 0x94, 0xC6, 0x88, 0xC9, 0x96, 0xC9, 0x97, 0xC6, ++ 0x8C, 0xC7, 0x9D, 0xC9, 0x99, 0xC9, 0x9B, 0xC6, ++ 0x92, 0xC9, 0xA0, 0xC9, 0xA3, 0xC9, 0xA9, 0xC9, ++ 0xA8, 0xC6, 0x99, 0xC9, 0xAF, 0xC9, 0xB2, 0xC9, ++ 0xB5, 0xC6, 0xA1, 0xC6, 0xA3, 0xC6, 0xA5, 0xCA, ++ 0x80, 0xC6, 0xA8, 0xCA, 0x83, 0xC6, 0xAD, 0xCA, ++ 0x88, 0xC6, 0xB0, 0xCA, 0x8A, 0xCA, 0x8B, 0xC6, ++ 0xB4, 0xC6, 0xB6, 0xCA, 0x92, 0xC6, 0xB9, 0xC6, ++ 0xBD, 0xC7, 0x86, 0xC7, 0x86, 0xC7, 0x89, 0xC7, ++ 0x89, 0xC7, 0x8C, 0xC7, 0x8C, 0xC7, 0x8E, 0xC7, ++ 0x90, 0xC7, 0x92, 0xC7, 0x94, 0xC7, 0x96, 0xC7, ++ 0x98, 0xC7, 0x9A, 0xC7, 0x9C, 0xC7, 0x9F, 0xC7, ++ 0xA1, 0xC7, 0xA3, 0xC7, 0xA5, 0xC7, 0xA7, 0xC7, ++ 0xA9, 0xC7, 0xAB, 0xC7, 0xAD, 0xC7, 0xAF, 0xC7, ++ 0xB3, 0xC7, 0xB3, 0xC7, 0xB5, 0xC6, 0x95, 0xC6, ++ 0xBF, 0xC7, 0xB9, 0xC7, 0xBB, 0xC7, 0xBD, 0xC7, ++ 0xBF, 0xC8, 0x81, 0xC8, 0x83, 0xC8, 0x85, 0xC8, ++ 0x87, 0xC8, 0x89, 0xC8, 0x8B, 0xC8, 0x8D, 0xC8, ++ 0x8F, 0xC8, 0x91, 0xC8, 0x93, 0xC8, 0x95, 0xC8, ++ 0x97, 0xC8, 0x99, 0xC8, 0x9B, 0xC8, 0x9D, 0xC8, ++ 0x9F, 0xC6, 0x9E, 0xC8, 0xA3, 0xC8, 0xA5, 0xC8, ++ 0xA7, 0xC8, 0xA9, 0xC8, 0xAB, 0xC8, 0xAD, 0xC8, ++ 0xAF, 0xC8, 0xB1, 
0xC8, 0xB3, 0xCE, 0xAC, 0xCE, ++ 0xAD, 0xCE, 0xAE, 0xCE, 0xAF, 0xCF, 0x8C, 0xCF, ++ 0x8D, 0xCF, 0x8E, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, ++ 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, ++ 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, ++ 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, ++ 0xBF, 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x83, 0xCF, ++ 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xCF, ++ 0x88, 0xCF, 0x89, 0xCF, 0x8A, 0xCF, 0x8B, 0xCF, ++ 0x99, 0xCF, 0x9B, 0xCF, 0x9D, 0xCF, 0x9F, 0xCF, ++ 0xA1, 0xCF, 0xA3, 0xCF, 0xA5, 0xCF, 0xA7, 0xCF, ++ 0xA9, 0xCF, 0xAB, 0xCF, 0xAD, 0xCF, 0xAF, 0xCE, ++ 0xB8, 0xD1, 0x90, 0xD1, 0x91, 0xD1, 0x92, 0xD1, ++ 0x93, 0xD1, 0x94, 0xD1, 0x95, 0xD1, 0x96, 0xD1, ++ 0x97, 0xD1, 0x98, 0xD1, 0x99, 0xD1, 0x9A, 0xD1, ++ 0x9B, 0xD1, 0x9C, 0xD1, 0x9D, 0xD1, 0x9E, 0xD1, ++ 0x9F, 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0, ++ 0xB3, 0xD0, 0xB4, 0xD0, 0xB5, 0xD0, 0xB6, 0xD0, ++ 0xB7, 0xD0, 0xB8, 0xD0, 0xB9, 0xD0, 0xBA, 0xD0, ++ 0xBB, 0xD0, 0xBC, 0xD0, 0xBD, 0xD0, 0xBE, 0xD0, ++ 0xBF, 0xD1, 0x80, 0xD1, 0x81, 0xD1, 0x82, 0xD1, ++ 0x83, 0xD1, 0x84, 0xD1, 0x85, 0xD1, 0x86, 0xD1, ++ 0x87, 0xD1, 0x88, 0xD1, 0x89, 0xD1, 0x8A, 0xD1, ++ 0x8B, 0xD1, 0x8C, 0xD1, 0x8D, 0xD1, 0x8E, 0xD1, ++ 0x8F, 0xD1, 0xA1, 0xD1, 0xA3, 0xD1, 0xA5, 0xD1, ++ 0xA7, 0xD1, 0xA9, 0xD1, 0xAB, 0xD1, 0xAD, 0xD1, ++ 0xAF, 0xD1, 0xB1, 0xD1, 0xB3, 0xD1, 0xB5, 0xD1, ++ 0xB7, 0xD1, 0xB9, 0xD1, 0xBB, 0xD1, 0xBD, 0xD1, ++ 0xBF, 0xD2, 0x81, 0xD2, 0x8B, 0xD2, 0x8D, 0xD2, ++ 0x8F, 0xD2, 0x91, 0xD2, 0x93, 0xD2, 0x95, 0xD2, ++ 0x97, 0xD2, 0x99, 0xD2, 0x9B, 0xD2, 0x9D, 0xD2, ++ 0x9F, 0xD2, 0xA1, 0xD2, 0xA3, 0xD2, 0xA5, 0xD2, ++ 0xA7, 0xD2, 0xA9, 0xD2, 0xAB, 0xD2, 0xAD, 0xD2, ++ 0xAF, 0xD2, 0xB1, 0xD2, 0xB3, 0xD2, 0xB5, 0xD2, ++ 0xB7, 0xD2, 0xB9, 0xD2, 0xBB, 0xD2, 0xBD, 0xD2, ++ 0xBF, 0xD3, 0x82, 0xD3, 0x84, 0xD3, 0x86, 0xD3, ++ 0x88, 0xD3, 0x8A, 0xD3, 0x8C, 0xD3, 0x8E, 0xD3, ++ 0x91, 0xD3, 0x93, 0xD3, 0x95, 0xD3, 0x97, 0xD3, ++ 0x99, 0xD3, 0x9B, 0xD3, 0x9D, 0xD3, 0x9F, 0xD3, ++ 0xA1, 0xD3, 0xA3, 0xD3, 0xA5, 0xD3, 0xA7, 0xD3, ++ 0xA9, 0xD3, 0xAB, 0xD3, 0xAD, 0xD3, 0xAF, 0xD3, ++ 0xB1, 0xD3, 0xB3, 0xD3, 0xB5, 0xD3, 0xB9, 0xD4, ++ 0x81, 0xD4, 0x83, 0xD4, 0x85, 0xD4, 0x87, 0xD4, ++ 0x89, 0xD4, 0x8B, 0xD4, 0x8D, 0xD4, 0x8F, 0xD5, ++ 0xA1, 0xD5, 0xA2, 0xD5, 0xA3, 0xD5, 0xA4, 0xD5, ++ 0xA5, 0xD5, 0xA6, 0xD5, 0xA7, 0xD5, 0xA8, 0xD5, ++ 0xA9, 0xD5, 0xAA, 0xD5, 0xAB, 0xD5, 0xAC, 0xD5, ++ 0xAD, 0xD5, 0xAE, 0xD5, 0xAF, 0xD5, 0xB0, 0xD5, ++ 0xB1, 0xD5, 0xB2, 0xD5, 0xB3, 0xD5, 0xB4, 0xD5, ++ 0xB5, 0xD5, 0xB6, 0xD5, 0xB7, 0xD5, 0xB8, 0xD5, ++ 0xB9, 0xD5, 0xBA, 0xD5, 0xBB, 0xD5, 0xBC, 0xD5, ++ 0xBD, 0xD5, 0xBE, 0xD5, 0xBF, 0xD6, 0x80, 0xD6, ++ 0x81, 0xD6, 0x82, 0xD6, 0x83, 0xD6, 0x84, 0xD6, ++ 0x85, 0xD6, 0x86, 0xE1, 0xB8, 0x81, 0xE1, 0xB8, ++ 0x83, 0xE1, 0xB8, 0x85, 0xE1, 0xB8, 0x87, 0xE1, ++ 0xB8, 0x89, 0xE1, 0xB8, 0x8B, 0xE1, 0xB8, 0x8D, ++ 0xE1, 0xB8, 0x8F, 0xE1, 0xB8, 0x91, 0xE1, 0xB8, ++ 0x93, 0xE1, 0xB8, 0x95, 0xE1, 0xB8, 0x97, 0xE1, ++ 0xB8, 0x99, 0xE1, 0xB8, 0x9B, 0xE1, 0xB8, 0x9D, ++ 0xE1, 0xB8, 0x9F, 0xE1, 0xB8, 0xA1, 0xE1, 0xB8, ++ 0xA3, 0xE1, 0xB8, 0xA5, 0xE1, 0xB8, 0xA7, 0xE1, ++ 0xB8, 0xA9, 0xE1, 0xB8, 0xAB, 0xE1, 0xB8, 0xAD, ++ 0xE1, 0xB8, 0xAF, 0xE1, 0xB8, 0xB1, 0xE1, 0xB8, ++ 0xB3, 0xE1, 0xB8, 0xB5, 0xE1, 0xB8, 0xB7, 0xE1, ++ 0xB8, 0xB9, 0xE1, 0xB8, 0xBB, 0xE1, 0xB8, 0xBD, ++ 0xE1, 0xB8, 0xBF, 0xE1, 0xB9, 0x81, 0xE1, 0xB9, ++ 0x83, 0xE1, 0xB9, 0x85, 0xE1, 0xB9, 0x87, 0xE1, ++ 0xB9, 0x89, 0xE1, 0xB9, 0x8B, 0xE1, 0xB9, 0x8D, ++ 0xE1, 0xB9, 0x8F, 0xE1, 0xB9, 0x91, 0xE1, 0xB9, ++ 0x93, 0xE1, 0xB9, 0x95, 0xE1, 0xB9, 0x97, 0xE1, ++ 
0xB9, 0x99, 0xE1, 0xB9, 0x9B, 0xE1, 0xB9, 0x9D, ++ 0xE1, 0xB9, 0x9F, 0xE1, 0xB9, 0xA1, 0xE1, 0xB9, ++ 0xA3, 0xE1, 0xB9, 0xA5, 0xE1, 0xB9, 0xA7, 0xE1, ++ 0xB9, 0xA9, 0xE1, 0xB9, 0xAB, 0xE1, 0xB9, 0xAD, ++ 0xE1, 0xB9, 0xAF, 0xE1, 0xB9, 0xB1, 0xE1, 0xB9, ++ 0xB3, 0xE1, 0xB9, 0xB5, 0xE1, 0xB9, 0xB7, 0xE1, ++ 0xB9, 0xB9, 0xE1, 0xB9, 0xBB, 0xE1, 0xB9, 0xBD, ++ 0xE1, 0xB9, 0xBF, 0xE1, 0xBA, 0x81, 0xE1, 0xBA, ++ 0x83, 0xE1, 0xBA, 0x85, 0xE1, 0xBA, 0x87, 0xE1, ++ 0xBA, 0x89, 0xE1, 0xBA, 0x8B, 0xE1, 0xBA, 0x8D, ++ 0xE1, 0xBA, 0x8F, 0xE1, 0xBA, 0x91, 0xE1, 0xBA, ++ 0x93, 0xE1, 0xBA, 0x95, 0xE1, 0xBA, 0xA1, 0xE1, ++ 0xBA, 0xA3, 0xE1, 0xBA, 0xA5, 0xE1, 0xBA, 0xA7, ++ 0xE1, 0xBA, 0xA9, 0xE1, 0xBA, 0xAB, 0xE1, 0xBA, ++ 0xAD, 0xE1, 0xBA, 0xAF, 0xE1, 0xBA, 0xB1, 0xE1, ++ 0xBA, 0xB3, 0xE1, 0xBA, 0xB5, 0xE1, 0xBA, 0xB7, ++ 0xE1, 0xBA, 0xB9, 0xE1, 0xBA, 0xBB, 0xE1, 0xBA, ++ 0xBD, 0xE1, 0xBA, 0xBF, 0xE1, 0xBB, 0x81, 0xE1, ++ 0xBB, 0x83, 0xE1, 0xBB, 0x85, 0xE1, 0xBB, 0x87, ++ 0xE1, 0xBB, 0x89, 0xE1, 0xBB, 0x8B, 0xE1, 0xBB, ++ 0x8D, 0xE1, 0xBB, 0x8F, 0xE1, 0xBB, 0x91, 0xE1, ++ 0xBB, 0x93, 0xE1, 0xBB, 0x95, 0xE1, 0xBB, 0x97, ++ 0xE1, 0xBB, 0x99, 0xE1, 0xBB, 0x9B, 0xE1, 0xBB, ++ 0x9D, 0xE1, 0xBB, 0x9F, 0xE1, 0xBB, 0xA1, 0xE1, ++ 0xBB, 0xA3, 0xE1, 0xBB, 0xA5, 0xE1, 0xBB, 0xA7, ++ 0xE1, 0xBB, 0xA9, 0xE1, 0xBB, 0xAB, 0xE1, 0xBB, ++ 0xAD, 0xE1, 0xBB, 0xAF, 0xE1, 0xBB, 0xB1, 0xE1, ++ 0xBB, 0xB3, 0xE1, 0xBB, 0xB5, 0xE1, 0xBB, 0xB7, ++ 0xE1, 0xBB, 0xB9, 0xE1, 0xBC, 0x80, 0xE1, 0xBC, ++ 0x81, 0xE1, 0xBC, 0x82, 0xE1, 0xBC, 0x83, 0xE1, ++ 0xBC, 0x84, 0xE1, 0xBC, 0x85, 0xE1, 0xBC, 0x86, ++ 0xE1, 0xBC, 0x87, 0xE1, 0xBC, 0x90, 0xE1, 0xBC, ++ 0x91, 0xE1, 0xBC, 0x92, 0xE1, 0xBC, 0x93, 0xE1, ++ 0xBC, 0x94, 0xE1, 0xBC, 0x95, 0xE1, 0xBC, 0xA0, ++ 0xE1, 0xBC, 0xA1, 0xE1, 0xBC, 0xA2, 0xE1, 0xBC, ++ 0xA3, 0xE1, 0xBC, 0xA4, 0xE1, 0xBC, 0xA5, 0xE1, ++ 0xBC, 0xA6, 0xE1, 0xBC, 0xA7, 0xE1, 0xBC, 0xB0, ++ 0xE1, 0xBC, 0xB1, 0xE1, 0xBC, 0xB2, 0xE1, 0xBC, ++ 0xB3, 0xE1, 0xBC, 0xB4, 0xE1, 0xBC, 0xB5, 0xE1, ++ 0xBC, 0xB6, 0xE1, 0xBC, 0xB7, 0xE1, 0xBD, 0x80, ++ 0xE1, 0xBD, 0x81, 0xE1, 0xBD, 0x82, 0xE1, 0xBD, ++ 0x83, 0xE1, 0xBD, 0x84, 0xE1, 0xBD, 0x85, 0xE1, ++ 0xBD, 0x91, 0xE1, 0xBD, 0x93, 0xE1, 0xBD, 0x95, ++ 0xE1, 0xBD, 0x97, 0xE1, 0xBD, 0xA0, 0xE1, 0xBD, ++ 0xA1, 0xE1, 0xBD, 0xA2, 0xE1, 0xBD, 0xA3, 0xE1, ++ 0xBD, 0xA4, 0xE1, 0xBD, 0xA5, 0xE1, 0xBD, 0xA6, ++ 0xE1, 0xBD, 0xA7, 0xE1, 0xBE, 0x80, 0xE1, 0xBE, ++ 0x81, 0xE1, 0xBE, 0x82, 0xE1, 0xBE, 0x83, 0xE1, ++ 0xBE, 0x84, 0xE1, 0xBE, 0x85, 0xE1, 0xBE, 0x86, ++ 0xE1, 0xBE, 0x87, 0xE1, 0xBE, 0x90, 0xE1, 0xBE, ++ 0x91, 0xE1, 0xBE, 0x92, 0xE1, 0xBE, 0x93, 0xE1, ++ 0xBE, 0x94, 0xE1, 0xBE, 0x95, 0xE1, 0xBE, 0x96, ++ 0xE1, 0xBE, 0x97, 0xE1, 0xBE, 0xA0, 0xE1, 0xBE, ++ 0xA1, 0xE1, 0xBE, 0xA2, 0xE1, 0xBE, 0xA3, 0xE1, ++ 0xBE, 0xA4, 0xE1, 0xBE, 0xA5, 0xE1, 0xBE, 0xA6, ++ 0xE1, 0xBE, 0xA7, 0xE1, 0xBE, 0xB0, 0xE1, 0xBE, ++ 0xB1, 0xE1, 0xBD, 0xB0, 0xE1, 0xBD, 0xB1, 0xE1, ++ 0xBE, 0xB3, 0xE1, 0xBD, 0xB2, 0xE1, 0xBD, 0xB3, ++ 0xE1, 0xBD, 0xB4, 0xE1, 0xBD, 0xB5, 0xE1, 0xBF, ++ 0x83, 0xE1, 0xBF, 0x90, 0xE1, 0xBF, 0x91, 0xE1, ++ 0xBD, 0xB6, 0xE1, 0xBD, 0xB7, 0xE1, 0xBF, 0xA0, ++ 0xE1, 0xBF, 0xA1, 0xE1, 0xBD, 0xBA, 0xE1, 0xBD, ++ 0xBB, 0xE1, 0xBF, 0xA5, 0xE1, 0xBD, 0xB8, 0xE1, ++ 0xBD, 0xB9, 0xE1, 0xBD, 0xBC, 0xE1, 0xBD, 0xBD, ++ 0xE1, 0xBF, 0xB3, 0xCF, 0x89, 0x6B, 0xC3, 0xA5, ++ 0xE2, 0x85, 0xB0, 0xE2, 0x85, 0xB1, 0xE2, 0x85, ++ 0xB2, 0xE2, 0x85, 0xB3, 0xE2, 0x85, 0xB4, 0xE2, ++ 0x85, 0xB5, 0xE2, 0x85, 0xB6, 0xE2, 0x85, 0xB7, ++ 0xE2, 0x85, 0xB8, 0xE2, 0x85, 0xB9, 0xE2, 0x85, ++ 0xBA, 0xE2, 0x85, 0xBB, 0xE2, 0x85, 
0xBC, 0xE2, ++ 0x85, 0xBD, 0xE2, 0x85, 0xBE, 0xE2, 0x85, 0xBF, ++ 0xE2, 0x93, 0x90, 0xE2, 0x93, 0x91, 0xE2, 0x93, ++ 0x92, 0xE2, 0x93, 0x93, 0xE2, 0x93, 0x94, 0xE2, ++ 0x93, 0x95, 0xE2, 0x93, 0x96, 0xE2, 0x93, 0x97, ++ 0xE2, 0x93, 0x98, 0xE2, 0x93, 0x99, 0xE2, 0x93, ++ 0x9A, 0xE2, 0x93, 0x9B, 0xE2, 0x93, 0x9C, 0xE2, ++ 0x93, 0x9D, 0xE2, 0x93, 0x9E, 0xE2, 0x93, 0x9F, ++ 0xE2, 0x93, 0xA0, 0xE2, 0x93, 0xA1, 0xE2, 0x93, ++ 0xA2, 0xE2, 0x93, 0xA3, 0xE2, 0x93, 0xA4, 0xE2, ++ 0x93, 0xA5, 0xE2, 0x93, 0xA6, 0xE2, 0x93, 0xA7, ++ 0xE2, 0x93, 0xA8, 0xE2, 0x93, 0xA9, 0xEF, 0xBD, ++ 0x81, 0xEF, 0xBD, 0x82, 0xEF, 0xBD, 0x83, 0xEF, ++ 0xBD, 0x84, 0xEF, 0xBD, 0x85, 0xEF, 0xBD, 0x86, ++ 0xEF, 0xBD, 0x87, 0xEF, 0xBD, 0x88, 0xEF, 0xBD, ++ 0x89, 0xEF, 0xBD, 0x8A, 0xEF, 0xBD, 0x8B, 0xEF, ++ 0xBD, 0x8C, 0xEF, 0xBD, 0x8D, 0xEF, 0xBD, 0x8E, ++ 0xEF, 0xBD, 0x8F, 0xEF, 0xBD, 0x90, 0xEF, 0xBD, ++ 0x91, 0xEF, 0xBD, 0x92, 0xEF, 0xBD, 0x93, 0xEF, ++ 0xBD, 0x94, 0xEF, 0xBD, 0x95, 0xEF, 0xBD, 0x96, ++ 0xEF, 0xBD, 0x97, 0xEF, 0xBD, 0x98, 0xEF, 0xBD, ++ 0x99, 0xEF, 0xBD, 0x9A, 0xF0, 0x90, 0x90, 0xA8, ++ 0xF0, 0x90, 0x90, 0xA9, 0xF0, 0x90, 0x90, 0xAA, ++ 0xF0, 0x90, 0x90, 0xAB, 0xF0, 0x90, 0x90, 0xAC, ++ 0xF0, 0x90, 0x90, 0xAD, 0xF0, 0x90, 0x90, 0xAE, ++ 0xF0, 0x90, 0x90, 0xAF, 0xF0, 0x90, 0x90, 0xB0, ++ 0xF0, 0x90, 0x90, 0xB1, 0xF0, 0x90, 0x90, 0xB2, ++ 0xF0, 0x90, 0x90, 0xB3, 0xF0, 0x90, 0x90, 0xB4, ++ 0xF0, 0x90, 0x90, 0xB5, 0xF0, 0x90, 0x90, 0xB6, ++ 0xF0, 0x90, 0x90, 0xB7, 0xF0, 0x90, 0x90, 0xB8, ++ 0xF0, 0x90, 0x90, 0xB9, 0xF0, 0x90, 0x90, 0xBA, ++ 0xF0, 0x90, 0x90, 0xBB, 0xF0, 0x90, 0x90, 0xBC, ++ 0xF0, 0x90, 0x90, 0xBD, 0xF0, 0x90, 0x90, 0xBE, ++ 0xF0, 0x90, 0x90, 0xBF, 0xF0, 0x90, 0x91, 0x80, ++ 0xF0, 0x90, 0x91, 0x81, 0xF0, 0x90, 0x91, 0x82, ++ 0xF0, 0x90, 0x91, 0x83, 0xF0, 0x90, 0x91, 0x84, ++ 0xF0, 0x90, 0x91, 0x85, 0xF0, 0x90, 0x91, 0x86, ++ 0xF0, 0x90, 0x91, 0x87, 0xF0, 0x90, 0x91, 0x88, ++ 0xF0, 0x90, 0x91, 0x89, 0xF0, 0x90, 0x91, 0x8A, ++ 0xF0, 0x90, 0x91, 0x8B, 0xF0, 0x90, 0x91, 0x8C, ++ 0xF0, 0x90, 0x91, 0x8D, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 
0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, ++ }, ++ { ++ 0xC3, 0xA0, 0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3, ++ 0xC3, 0xA4, 0xC3, 0xA5, 0xC3, 0xA6, 0xC3, 0xA7, ++ 0xC3, 0xA8, 0xC3, 0xA9, 0xC3, 0xAA, 0xC3, 0xAB, ++ 0xC3, 0xAC, 0xC3, 0xAD, 0xC3, 0xAE, 0xC3, 0xAF, ++ 0xC3, 0xB0, 0xC3, 0xB1, 0xC3, 0xB2, 0xC3, 0xB3, ++ 0xC3, 0xB4, 0xC3, 0xB5, 0xC3, 0xB6, 0xC3, 0xB8, ++ 0xC3, 0xB9, 0xC3, 0xBA, 0xC3, 0xBB, 0xC3, 0xBC, ++ 0xC3, 0xBD, 0xC3, 0xBE, 0xC4, 0x81, 0xC4, 0x83, ++ 0xC4, 0x85, 0xC4, 0x87, 0xC4, 0x89, 0xC4, 0x8B, ++ 0xC4, 0x8D, 0xC4, 0x8F, 0xC4, 0x91, 0xC4, 0x93, ++ 0xC4, 0x95, 0xC4, 0x97, 0xC4, 0x99, 0xC4, 0x9B, ++ 0xC4, 0x9D, 0xC4, 0x9F, 0xC4, 0xA1, 0xC4, 0xA3, ++ 0xC4, 0xA5, 0xC4, 0xA7, 0xC4, 0xA9, 0xC4, 0xAB, ++ 0xC4, 0xAD, 0xC4, 0xAF, 0x69, 0xC4, 0xB3, 0xC4, ++ 0xB5, 0xC4, 0xB7, 0xC4, 0xBA, 0xC4, 0xBC, 0xC4, ++ 0xBE, 0xC5, 0x80, 0xC5, 0x82, 0xC5, 0x84, 0xC5, ++ 0x86, 0xC5, 0x88, 0xC5, 0x8B, 0xC5, 0x8D, 0xC5, ++ 0x8F, 0xC5, 0x91, 0xC5, 0x93, 0xC5, 0x95, 0xC5, ++ 0x97, 0xC5, 0x99, 0xC5, 0x9B, 0xC5, 0x9D, 0xC5, ++ 0x9F, 0xC5, 0xA1, 0xC5, 0xA3, 0xC5, 0xA5, 0xC5, ++ 0xA7, 0xC5, 0xA9, 0xC5, 0xAB, 0xC5, 0xAD, 0xC5, ++ 0xAF, 0xC5, 0xB1, 0xC5, 0xB3, 0xC5, 0xB5, 0xC5, ++ 0xB7, 0xC3, 0xBF, 0xC5, 0xBA, 0xC5, 0xBC, 0xC5, ++ 0xBE, 0xC9, 0x93, 0xC6, 0x83, 0xC6, 0x85, 0xC9, ++ 0x94, 0xC6, 0x88, 0xC9, 0x96, 0xC9, 0x97, 0xC6, ++ 0x8C, 0xC7, 0x9D, 0xC9, 0x99, 0xC9, 0x9B, 0xC6, ++ 0x92, 0xC9, 0xA0, 0xC9, 0xA3, 0xC9, 0xA9, 0xC9, ++ 0xA8, 0xC6, 0x99, 0xC9, 0xAF, 0xC9, 0xB2, 0xC9, ++ 0xB5, 0xC6, 0xA1, 0xC6, 0xA3, 0xC6, 0xA5, 0xCA, ++ 0x80, 0xC6, 0xA8, 0xCA, 0x83, 0xC6, 0xAD, 0xCA, ++ 0x88, 0xC6, 0xB0, 0xCA, 0x8A, 0xCA, 0x8B, 0xC6, ++ 0xB4, 0xC6, 0xB6, 0xCA, 0x92, 0xC6, 0xB9, 0xC6, ++ 0xBD, 0xC7, 0x86, 0xC7, 0x86, 0xC7, 0x89, 0xC7, ++ 0x89, 0xC7, 0x8C, 0xC7, 0x8C, 0xC7, 0x8E, 0xC7, ++ 0x90, 0xC7, 0x92, 0xC7, 0x94, 0xC7, 0x96, 0xC7, ++ 0x98, 0xC7, 0x9A, 0xC7, 0x9C, 0xC7, 0x9F, 0xC7, ++ 0xA1, 0xC7, 0xA3, 0xC7, 0xA5, 0xC7, 0xA7, 0xC7, ++ 0xA9, 0xC7, 0xAB, 0xC7, 0xAD, 0xC7, 0xAF, 0xC7, ++ 0xB3, 0xC7, 0xB3, 0xC7, 0xB5, 0xC6, 0x95, 0xC6, ++ 0xBF, 0xC7, 0xB9, 0xC7, 0xBB, 0xC7, 0xBD, 0xC7, ++ 0xBF, 0xC8, 0x81, 0xC8, 0x83, 0xC8, 0x85, 0xC8, ++ 0x87, 0xC8, 0x89, 0xC8, 0x8B, 0xC8, 0x8D, 0xC8, ++ 0x8F, 0xC8, 0x91, 0xC8, 0x93, 0xC8, 0x95, 0xC8, ++ 0x97, 0xC8, 0x99, 0xC8, 0x9B, 0xC8, 0x9D, 0xC8, ++ 0x9F, 0xC6, 0x9E, 0xC8, 0xA3, 0xC8, 0xA5, 0xC8, ++ 0xA7, 0xC8, 0xA9, 0xC8, 0xAB, 0xC8, 0xAD, 0xC8, ++ 0xAF, 0xC8, 0xB1, 0xC8, 0xB3, 0xE2, 0xB1, 0xA5, ++ 0xC8, 0xBC, 0xC6, 0x9A, 0xE2, 0xB1, 0xA6, 0xC9, ++ 0x82, 0xC6, 0x80, 0xCA, 0x89, 0xCA, 0x8C, 0xC9, ++ 0x87, 0xC9, 0x89, 0xC9, 0x8B, 0xC9, 0x8D, 0xC9, ++ 0x8F, 0xCE, 0xAC, 0xCE, 0xAD, 0xCE, 0xAE, 0xCE, ++ 0xAF, 0xCF, 0x8C, 0xCF, 0x8D, 0xCF, 0x8E, 0xCE, ++ 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, ++ 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, ++ 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, ++ 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, ++ 0x81, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, 0xCF, ++ 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, 0xCF, ++ 0x8A, 0xCF, 0x8B, 0xCF, 0x99, 0xCF, 0x9B, 0xCF, ++ 0x9D, 0xCF, 0x9F, 0xCF, 0xA1, 0xCF, 0xA3, 0xCF, ++ 0xA5, 0xCF, 0xA7, 0xCF, 0xA9, 0xCF, 0xAB, 0xCF, ++ 0xAD, 0xCF, 0xAF, 0xCE, 0xB8, 0xCF, 0xB8, 0xCF, ++ 0xB2, 0xCF, 0xBB, 0xCD, 0xBB, 0xCD, 0xBC, 0xCD, ++ 0xBD, 0xD1, 0x90, 0xD1, 0x91, 0xD1, 0x92, 0xD1, ++ 0x93, 0xD1, 0x94, 0xD1, 0x95, 0xD1, 0x96, 0xD1, ++ 0x97, 0xD1, 0x98, 0xD1, 0x99, 0xD1, 0x9A, 0xD1, ++ 0x9B, 0xD1, 0x9C, 0xD1, 0x9D, 
0xD1, 0x9E, 0xD1, ++ 0x9F, 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0, ++ 0xB3, 0xD0, 0xB4, 0xD0, 0xB5, 0xD0, 0xB6, 0xD0, ++ 0xB7, 0xD0, 0xB8, 0xD0, 0xB9, 0xD0, 0xBA, 0xD0, ++ 0xBB, 0xD0, 0xBC, 0xD0, 0xBD, 0xD0, 0xBE, 0xD0, ++ 0xBF, 0xD1, 0x80, 0xD1, 0x81, 0xD1, 0x82, 0xD1, ++ 0x83, 0xD1, 0x84, 0xD1, 0x85, 0xD1, 0x86, 0xD1, ++ 0x87, 0xD1, 0x88, 0xD1, 0x89, 0xD1, 0x8A, 0xD1, ++ 0x8B, 0xD1, 0x8C, 0xD1, 0x8D, 0xD1, 0x8E, 0xD1, ++ 0x8F, 0xD1, 0xA1, 0xD1, 0xA3, 0xD1, 0xA5, 0xD1, ++ 0xA7, 0xD1, 0xA9, 0xD1, 0xAB, 0xD1, 0xAD, 0xD1, ++ 0xAF, 0xD1, 0xB1, 0xD1, 0xB3, 0xD1, 0xB5, 0xD1, ++ 0xB7, 0xD1, 0xB9, 0xD1, 0xBB, 0xD1, 0xBD, 0xD1, ++ 0xBF, 0xD2, 0x81, 0xD2, 0x8B, 0xD2, 0x8D, 0xD2, ++ 0x8F, 0xD2, 0x91, 0xD2, 0x93, 0xD2, 0x95, 0xD2, ++ 0x97, 0xD2, 0x99, 0xD2, 0x9B, 0xD2, 0x9D, 0xD2, ++ 0x9F, 0xD2, 0xA1, 0xD2, 0xA3, 0xD2, 0xA5, 0xD2, ++ 0xA7, 0xD2, 0xA9, 0xD2, 0xAB, 0xD2, 0xAD, 0xD2, ++ 0xAF, 0xD2, 0xB1, 0xD2, 0xB3, 0xD2, 0xB5, 0xD2, ++ 0xB7, 0xD2, 0xB9, 0xD2, 0xBB, 0xD2, 0xBD, 0xD2, ++ 0xBF, 0xD3, 0x8F, 0xD3, 0x82, 0xD3, 0x84, 0xD3, ++ 0x86, 0xD3, 0x88, 0xD3, 0x8A, 0xD3, 0x8C, 0xD3, ++ 0x8E, 0xD3, 0x91, 0xD3, 0x93, 0xD3, 0x95, 0xD3, ++ 0x97, 0xD3, 0x99, 0xD3, 0x9B, 0xD3, 0x9D, 0xD3, ++ 0x9F, 0xD3, 0xA1, 0xD3, 0xA3, 0xD3, 0xA5, 0xD3, ++ 0xA7, 0xD3, 0xA9, 0xD3, 0xAB, 0xD3, 0xAD, 0xD3, ++ 0xAF, 0xD3, 0xB1, 0xD3, 0xB3, 0xD3, 0xB5, 0xD3, ++ 0xB7, 0xD3, 0xB9, 0xD3, 0xBB, 0xD3, 0xBD, 0xD3, ++ 0xBF, 0xD4, 0x81, 0xD4, 0x83, 0xD4, 0x85, 0xD4, ++ 0x87, 0xD4, 0x89, 0xD4, 0x8B, 0xD4, 0x8D, 0xD4, ++ 0x8F, 0xD4, 0x91, 0xD4, 0x93, 0xD5, 0xA1, 0xD5, ++ 0xA2, 0xD5, 0xA3, 0xD5, 0xA4, 0xD5, 0xA5, 0xD5, ++ 0xA6, 0xD5, 0xA7, 0xD5, 0xA8, 0xD5, 0xA9, 0xD5, ++ 0xAA, 0xD5, 0xAB, 0xD5, 0xAC, 0xD5, 0xAD, 0xD5, ++ 0xAE, 0xD5, 0xAF, 0xD5, 0xB0, 0xD5, 0xB1, 0xD5, ++ 0xB2, 0xD5, 0xB3, 0xD5, 0xB4, 0xD5, 0xB5, 0xD5, ++ 0xB6, 0xD5, 0xB7, 0xD5, 0xB8, 0xD5, 0xB9, 0xD5, ++ 0xBA, 0xD5, 0xBB, 0xD5, 0xBC, 0xD5, 0xBD, 0xD5, ++ 0xBE, 0xD5, 0xBF, 0xD6, 0x80, 0xD6, 0x81, 0xD6, ++ 0x82, 0xD6, 0x83, 0xD6, 0x84, 0xD6, 0x85, 0xD6, ++ 0x86, 0xE2, 0xB4, 0x80, 0xE2, 0xB4, 0x81, 0xE2, ++ 0xB4, 0x82, 0xE2, 0xB4, 0x83, 0xE2, 0xB4, 0x84, ++ 0xE2, 0xB4, 0x85, 0xE2, 0xB4, 0x86, 0xE2, 0xB4, ++ 0x87, 0xE2, 0xB4, 0x88, 0xE2, 0xB4, 0x89, 0xE2, ++ 0xB4, 0x8A, 0xE2, 0xB4, 0x8B, 0xE2, 0xB4, 0x8C, ++ 0xE2, 0xB4, 0x8D, 0xE2, 0xB4, 0x8E, 0xE2, 0xB4, ++ 0x8F, 0xE2, 0xB4, 0x90, 0xE2, 0xB4, 0x91, 0xE2, ++ 0xB4, 0x92, 0xE2, 0xB4, 0x93, 0xE2, 0xB4, 0x94, ++ 0xE2, 0xB4, 0x95, 0xE2, 0xB4, 0x96, 0xE2, 0xB4, ++ 0x97, 0xE2, 0xB4, 0x98, 0xE2, 0xB4, 0x99, 0xE2, ++ 0xB4, 0x9A, 0xE2, 0xB4, 0x9B, 0xE2, 0xB4, 0x9C, ++ 0xE2, 0xB4, 0x9D, 0xE2, 0xB4, 0x9E, 0xE2, 0xB4, ++ 0x9F, 0xE2, 0xB4, 0xA0, 0xE2, 0xB4, 0xA1, 0xE2, ++ 0xB4, 0xA2, 0xE2, 0xB4, 0xA3, 0xE2, 0xB4, 0xA4, ++ 0xE2, 0xB4, 0xA5, 0xE1, 0xB8, 0x81, 0xE1, 0xB8, ++ 0x83, 0xE1, 0xB8, 0x85, 0xE1, 0xB8, 0x87, 0xE1, ++ 0xB8, 0x89, 0xE1, 0xB8, 0x8B, 0xE1, 0xB8, 0x8D, ++ 0xE1, 0xB8, 0x8F, 0xE1, 0xB8, 0x91, 0xE1, 0xB8, ++ 0x93, 0xE1, 0xB8, 0x95, 0xE1, 0xB8, 0x97, 0xE1, ++ 0xB8, 0x99, 0xE1, 0xB8, 0x9B, 0xE1, 0xB8, 0x9D, ++ 0xE1, 0xB8, 0x9F, 0xE1, 0xB8, 0xA1, 0xE1, 0xB8, ++ 0xA3, 0xE1, 0xB8, 0xA5, 0xE1, 0xB8, 0xA7, 0xE1, ++ 0xB8, 0xA9, 0xE1, 0xB8, 0xAB, 0xE1, 0xB8, 0xAD, ++ 0xE1, 0xB8, 0xAF, 0xE1, 0xB8, 0xB1, 0xE1, 0xB8, ++ 0xB3, 0xE1, 0xB8, 0xB5, 0xE1, 0xB8, 0xB7, 0xE1, ++ 0xB8, 0xB9, 0xE1, 0xB8, 0xBB, 0xE1, 0xB8, 0xBD, ++ 0xE1, 0xB8, 0xBF, 0xE1, 0xB9, 0x81, 0xE1, 0xB9, ++ 0x83, 0xE1, 0xB9, 0x85, 0xE1, 0xB9, 0x87, 0xE1, ++ 0xB9, 0x89, 0xE1, 0xB9, 0x8B, 0xE1, 0xB9, 0x8D, ++ 0xE1, 0xB9, 0x8F, 0xE1, 0xB9, 0x91, 0xE1, 0xB9, ++ 0x93, 0xE1, 
0xB9, 0x95, 0xE1, 0xB9, 0x97, 0xE1, ++ 0xB9, 0x99, 0xE1, 0xB9, 0x9B, 0xE1, 0xB9, 0x9D, ++ 0xE1, 0xB9, 0x9F, 0xE1, 0xB9, 0xA1, 0xE1, 0xB9, ++ 0xA3, 0xE1, 0xB9, 0xA5, 0xE1, 0xB9, 0xA7, 0xE1, ++ 0xB9, 0xA9, 0xE1, 0xB9, 0xAB, 0xE1, 0xB9, 0xAD, ++ 0xE1, 0xB9, 0xAF, 0xE1, 0xB9, 0xB1, 0xE1, 0xB9, ++ 0xB3, 0xE1, 0xB9, 0xB5, 0xE1, 0xB9, 0xB7, 0xE1, ++ 0xB9, 0xB9, 0xE1, 0xB9, 0xBB, 0xE1, 0xB9, 0xBD, ++ 0xE1, 0xB9, 0xBF, 0xE1, 0xBA, 0x81, 0xE1, 0xBA, ++ 0x83, 0xE1, 0xBA, 0x85, 0xE1, 0xBA, 0x87, 0xE1, ++ 0xBA, 0x89, 0xE1, 0xBA, 0x8B, 0xE1, 0xBA, 0x8D, ++ 0xE1, 0xBA, 0x8F, 0xE1, 0xBA, 0x91, 0xE1, 0xBA, ++ 0x93, 0xE1, 0xBA, 0x95, 0xE1, 0xBA, 0xA1, 0xE1, ++ 0xBA, 0xA3, 0xE1, 0xBA, 0xA5, 0xE1, 0xBA, 0xA7, ++ 0xE1, 0xBA, 0xA9, 0xE1, 0xBA, 0xAB, 0xE1, 0xBA, ++ 0xAD, 0xE1, 0xBA, 0xAF, 0xE1, 0xBA, 0xB1, 0xE1, ++ 0xBA, 0xB3, 0xE1, 0xBA, 0xB5, 0xE1, 0xBA, 0xB7, ++ 0xE1, 0xBA, 0xB9, 0xE1, 0xBA, 0xBB, 0xE1, 0xBA, ++ 0xBD, 0xE1, 0xBA, 0xBF, 0xE1, 0xBB, 0x81, 0xE1, ++ 0xBB, 0x83, 0xE1, 0xBB, 0x85, 0xE1, 0xBB, 0x87, ++ 0xE1, 0xBB, 0x89, 0xE1, 0xBB, 0x8B, 0xE1, 0xBB, ++ 0x8D, 0xE1, 0xBB, 0x8F, 0xE1, 0xBB, 0x91, 0xE1, ++ 0xBB, 0x93, 0xE1, 0xBB, 0x95, 0xE1, 0xBB, 0x97, ++ 0xE1, 0xBB, 0x99, 0xE1, 0xBB, 0x9B, 0xE1, 0xBB, ++ 0x9D, 0xE1, 0xBB, 0x9F, 0xE1, 0xBB, 0xA1, 0xE1, ++ 0xBB, 0xA3, 0xE1, 0xBB, 0xA5, 0xE1, 0xBB, 0xA7, ++ 0xE1, 0xBB, 0xA9, 0xE1, 0xBB, 0xAB, 0xE1, 0xBB, ++ 0xAD, 0xE1, 0xBB, 0xAF, 0xE1, 0xBB, 0xB1, 0xE1, ++ 0xBB, 0xB3, 0xE1, 0xBB, 0xB5, 0xE1, 0xBB, 0xB7, ++ 0xE1, 0xBB, 0xB9, 0xE1, 0xBC, 0x80, 0xE1, 0xBC, ++ 0x81, 0xE1, 0xBC, 0x82, 0xE1, 0xBC, 0x83, 0xE1, ++ 0xBC, 0x84, 0xE1, 0xBC, 0x85, 0xE1, 0xBC, 0x86, ++ 0xE1, 0xBC, 0x87, 0xE1, 0xBC, 0x90, 0xE1, 0xBC, ++ 0x91, 0xE1, 0xBC, 0x92, 0xE1, 0xBC, 0x93, 0xE1, ++ 0xBC, 0x94, 0xE1, 0xBC, 0x95, 0xE1, 0xBC, 0xA0, ++ 0xE1, 0xBC, 0xA1, 0xE1, 0xBC, 0xA2, 0xE1, 0xBC, ++ 0xA3, 0xE1, 0xBC, 0xA4, 0xE1, 0xBC, 0xA5, 0xE1, ++ 0xBC, 0xA6, 0xE1, 0xBC, 0xA7, 0xE1, 0xBC, 0xB0, ++ 0xE1, 0xBC, 0xB1, 0xE1, 0xBC, 0xB2, 0xE1, 0xBC, ++ 0xB3, 0xE1, 0xBC, 0xB4, 0xE1, 0xBC, 0xB5, 0xE1, ++ 0xBC, 0xB6, 0xE1, 0xBC, 0xB7, 0xE1, 0xBD, 0x80, ++ 0xE1, 0xBD, 0x81, 0xE1, 0xBD, 0x82, 0xE1, 0xBD, ++ 0x83, 0xE1, 0xBD, 0x84, 0xE1, 0xBD, 0x85, 0xE1, ++ 0xBD, 0x91, 0xE1, 0xBD, 0x93, 0xE1, 0xBD, 0x95, ++ 0xE1, 0xBD, 0x97, 0xE1, 0xBD, 0xA0, 0xE1, 0xBD, ++ 0xA1, 0xE1, 0xBD, 0xA2, 0xE1, 0xBD, 0xA3, 0xE1, ++ 0xBD, 0xA4, 0xE1, 0xBD, 0xA5, 0xE1, 0xBD, 0xA6, ++ 0xE1, 0xBD, 0xA7, 0xE1, 0xBE, 0x80, 0xE1, 0xBE, ++ 0x81, 0xE1, 0xBE, 0x82, 0xE1, 0xBE, 0x83, 0xE1, ++ 0xBE, 0x84, 0xE1, 0xBE, 0x85, 0xE1, 0xBE, 0x86, ++ 0xE1, 0xBE, 0x87, 0xE1, 0xBE, 0x90, 0xE1, 0xBE, ++ 0x91, 0xE1, 0xBE, 0x92, 0xE1, 0xBE, 0x93, 0xE1, ++ 0xBE, 0x94, 0xE1, 0xBE, 0x95, 0xE1, 0xBE, 0x96, ++ 0xE1, 0xBE, 0x97, 0xE1, 0xBE, 0xA0, 0xE1, 0xBE, ++ 0xA1, 0xE1, 0xBE, 0xA2, 0xE1, 0xBE, 0xA3, 0xE1, ++ 0xBE, 0xA4, 0xE1, 0xBE, 0xA5, 0xE1, 0xBE, 0xA6, ++ 0xE1, 0xBE, 0xA7, 0xE1, 0xBE, 0xB0, 0xE1, 0xBE, ++ 0xB1, 0xE1, 0xBD, 0xB0, 0xE1, 0xBD, 0xB1, 0xE1, ++ 0xBE, 0xB3, 0xE1, 0xBD, 0xB2, 0xE1, 0xBD, 0xB3, ++ 0xE1, 0xBD, 0xB4, 0xE1, 0xBD, 0xB5, 0xE1, 0xBF, ++ 0x83, 0xE1, 0xBF, 0x90, 0xE1, 0xBF, 0x91, 0xE1, ++ 0xBD, 0xB6, 0xE1, 0xBD, 0xB7, 0xE1, 0xBF, 0xA0, ++ 0xE1, 0xBF, 0xA1, 0xE1, 0xBD, 0xBA, 0xE1, 0xBD, ++ 0xBB, 0xE1, 0xBF, 0xA5, 0xE1, 0xBD, 0xB8, 0xE1, ++ 0xBD, 0xB9, 0xE1, 0xBD, 0xBC, 0xE1, 0xBD, 0xBD, ++ 0xE1, 0xBF, 0xB3, 0xCF, 0x89, 0x6B, 0xC3, 0xA5, ++ 0xE2, 0x85, 0x8E, 0xE2, 0x85, 0xB0, 0xE2, 0x85, ++ 0xB1, 0xE2, 0x85, 0xB2, 0xE2, 0x85, 0xB3, 0xE2, ++ 0x85, 0xB4, 0xE2, 0x85, 0xB5, 0xE2, 0x85, 0xB6, ++ 0xE2, 0x85, 0xB7, 0xE2, 0x85, 0xB8, 0xE2, 0x85, 
++ 0xB9, 0xE2, 0x85, 0xBA, 0xE2, 0x85, 0xBB, 0xE2, ++ 0x85, 0xBC, 0xE2, 0x85, 0xBD, 0xE2, 0x85, 0xBE, ++ 0xE2, 0x85, 0xBF, 0xE2, 0x86, 0x84, 0xE2, 0x93, ++ 0x90, 0xE2, 0x93, 0x91, 0xE2, 0x93, 0x92, 0xE2, ++ 0x93, 0x93, 0xE2, 0x93, 0x94, 0xE2, 0x93, 0x95, ++ 0xE2, 0x93, 0x96, 0xE2, 0x93, 0x97, 0xE2, 0x93, ++ 0x98, 0xE2, 0x93, 0x99, 0xE2, 0x93, 0x9A, 0xE2, ++ 0x93, 0x9B, 0xE2, 0x93, 0x9C, 0xE2, 0x93, 0x9D, ++ 0xE2, 0x93, 0x9E, 0xE2, 0x93, 0x9F, 0xE2, 0x93, ++ 0xA0, 0xE2, 0x93, 0xA1, 0xE2, 0x93, 0xA2, 0xE2, ++ 0x93, 0xA3, 0xE2, 0x93, 0xA4, 0xE2, 0x93, 0xA5, ++ 0xE2, 0x93, 0xA6, 0xE2, 0x93, 0xA7, 0xE2, 0x93, ++ 0xA8, 0xE2, 0x93, 0xA9, 0xE2, 0xB0, 0xB0, 0xE2, ++ 0xB0, 0xB1, 0xE2, 0xB0, 0xB2, 0xE2, 0xB0, 0xB3, ++ 0xE2, 0xB0, 0xB4, 0xE2, 0xB0, 0xB5, 0xE2, 0xB0, ++ 0xB6, 0xE2, 0xB0, 0xB7, 0xE2, 0xB0, 0xB8, 0xE2, ++ 0xB0, 0xB9, 0xE2, 0xB0, 0xBA, 0xE2, 0xB0, 0xBB, ++ 0xE2, 0xB0, 0xBC, 0xE2, 0xB0, 0xBD, 0xE2, 0xB0, ++ 0xBE, 0xE2, 0xB0, 0xBF, 0xE2, 0xB1, 0x80, 0xE2, ++ 0xB1, 0x81, 0xE2, 0xB1, 0x82, 0xE2, 0xB1, 0x83, ++ 0xE2, 0xB1, 0x84, 0xE2, 0xB1, 0x85, 0xE2, 0xB1, ++ 0x86, 0xE2, 0xB1, 0x87, 0xE2, 0xB1, 0x88, 0xE2, ++ 0xB1, 0x89, 0xE2, 0xB1, 0x8A, 0xE2, 0xB1, 0x8B, ++ 0xE2, 0xB1, 0x8C, 0xE2, 0xB1, 0x8D, 0xE2, 0xB1, ++ 0x8E, 0xE2, 0xB1, 0x8F, 0xE2, 0xB1, 0x90, 0xE2, ++ 0xB1, 0x91, 0xE2, 0xB1, 0x92, 0xE2, 0xB1, 0x93, ++ 0xE2, 0xB1, 0x94, 0xE2, 0xB1, 0x95, 0xE2, 0xB1, ++ 0x96, 0xE2, 0xB1, 0x97, 0xE2, 0xB1, 0x98, 0xE2, ++ 0xB1, 0x99, 0xE2, 0xB1, 0x9A, 0xE2, 0xB1, 0x9B, ++ 0xE2, 0xB1, 0x9C, 0xE2, 0xB1, 0x9D, 0xE2, 0xB1, ++ 0x9E, 0xE2, 0xB1, 0xA1, 0xC9, 0xAB, 0xE1, 0xB5, ++ 0xBD, 0xC9, 0xBD, 0xE2, 0xB1, 0xA8, 0xE2, 0xB1, ++ 0xAA, 0xE2, 0xB1, 0xAC, 0xE2, 0xB1, 0xB6, 0xE2, ++ 0xB2, 0x81, 0xE2, 0xB2, 0x83, 0xE2, 0xB2, 0x85, ++ 0xE2, 0xB2, 0x87, 0xE2, 0xB2, 0x89, 0xE2, 0xB2, ++ 0x8B, 0xE2, 0xB2, 0x8D, 0xE2, 0xB2, 0x8F, 0xE2, ++ 0xB2, 0x91, 0xE2, 0xB2, 0x93, 0xE2, 0xB2, 0x95, ++ 0xE2, 0xB2, 0x97, 0xE2, 0xB2, 0x99, 0xE2, 0xB2, ++ 0x9B, 0xE2, 0xB2, 0x9D, 0xE2, 0xB2, 0x9F, 0xE2, ++ 0xB2, 0xA1, 0xE2, 0xB2, 0xA3, 0xE2, 0xB2, 0xA5, ++ 0xE2, 0xB2, 0xA7, 0xE2, 0xB2, 0xA9, 0xE2, 0xB2, ++ 0xAB, 0xE2, 0xB2, 0xAD, 0xE2, 0xB2, 0xAF, 0xE2, ++ 0xB2, 0xB1, 0xE2, 0xB2, 0xB3, 0xE2, 0xB2, 0xB5, ++ 0xE2, 0xB2, 0xB7, 0xE2, 0xB2, 0xB9, 0xE2, 0xB2, ++ 0xBB, 0xE2, 0xB2, 0xBD, 0xE2, 0xB2, 0xBF, 0xE2, ++ 0xB3, 0x81, 0xE2, 0xB3, 0x83, 0xE2, 0xB3, 0x85, ++ 0xE2, 0xB3, 0x87, 0xE2, 0xB3, 0x89, 0xE2, 0xB3, ++ 0x8B, 0xE2, 0xB3, 0x8D, 0xE2, 0xB3, 0x8F, 0xE2, ++ 0xB3, 0x91, 0xE2, 0xB3, 0x93, 0xE2, 0xB3, 0x95, ++ 0xE2, 0xB3, 0x97, 0xE2, 0xB3, 0x99, 0xE2, 0xB3, ++ 0x9B, 0xE2, 0xB3, 0x9D, 0xE2, 0xB3, 0x9F, 0xE2, ++ 0xB3, 0xA1, 0xE2, 0xB3, 0xA3, 0xEF, 0xBD, 0x81, ++ 0xEF, 0xBD, 0x82, 0xEF, 0xBD, 0x83, 0xEF, 0xBD, ++ 0x84, 0xEF, 0xBD, 0x85, 0xEF, 0xBD, 0x86, 0xEF, ++ 0xBD, 0x87, 0xEF, 0xBD, 0x88, 0xEF, 0xBD, 0x89, ++ 0xEF, 0xBD, 0x8A, 0xEF, 0xBD, 0x8B, 0xEF, 0xBD, ++ 0x8C, 0xEF, 0xBD, 0x8D, 0xEF, 0xBD, 0x8E, 0xEF, ++ 0xBD, 0x8F, 0xEF, 0xBD, 0x90, 0xEF, 0xBD, 0x91, ++ 0xEF, 0xBD, 0x92, 0xEF, 0xBD, 0x93, 0xEF, 0xBD, ++ 0x94, 0xEF, 0xBD, 0x95, 0xEF, 0xBD, 0x96, 0xEF, ++ 0xBD, 0x97, 0xEF, 0xBD, 0x98, 0xEF, 0xBD, 0x99, ++ 0xEF, 0xBD, 0x9A, 0xF0, 0x90, 0x90, 0xA8, 0xF0, ++ 0x90, 0x90, 0xA9, 0xF0, 0x90, 0x90, 0xAA, 0xF0, ++ 0x90, 0x90, 0xAB, 0xF0, 0x90, 0x90, 0xAC, 0xF0, ++ 0x90, 0x90, 0xAD, 0xF0, 0x90, 0x90, 0xAE, 0xF0, ++ 0x90, 0x90, 0xAF, 0xF0, 0x90, 0x90, 0xB0, 0xF0, ++ 0x90, 0x90, 0xB1, 0xF0, 0x90, 0x90, 0xB2, 0xF0, ++ 0x90, 0x90, 0xB3, 0xF0, 0x90, 0x90, 0xB4, 0xF0, ++ 0x90, 0x90, 0xB5, 0xF0, 0x90, 0x90, 0xB6, 0xF0, ++ 0x90, 0x90, 0xB7, 0xF0, 0x90, 
0x90, 0xB8, 0xF0, ++ 0x90, 0x90, 0xB9, 0xF0, 0x90, 0x90, 0xBA, 0xF0, ++ 0x90, 0x90, 0xBB, 0xF0, 0x90, 0x90, 0xBC, 0xF0, ++ 0x90, 0x90, 0xBD, 0xF0, 0x90, 0x90, 0xBE, 0xF0, ++ 0x90, 0x90, 0xBF, 0xF0, 0x90, 0x91, 0x80, 0xF0, ++ 0x90, 0x91, 0x81, 0xF0, 0x90, 0x91, 0x82, 0xF0, ++ 0x90, 0x91, 0x83, 0xF0, 0x90, 0x91, 0x84, 0xF0, ++ 0x90, 0x91, 0x85, 0xF0, 0x90, 0x91, 0x86, 0xF0, ++ 0x90, 0x91, 0x87, 0xF0, 0x90, 0x91, 0x88, 0xF0, ++ 0x90, 0x91, 0x89, 0xF0, 0x90, 0x91, 0x8A, 0xF0, ++ 0x90, 0x91, 0x8B, 0xF0, 0x90, 0x91, 0x8C, 0xF0, ++ 0x90, 0x91, 0x8D, 0xF0, 0x90, 0x91, 0x8E, 0xF0, ++ 0x90, 0x91, 0x8F, ++ }, ++}; ++ ++static const u8_displacement_t u8_toupper_b3_tbl[2][5][256] = { ++ { ++ { /* Third byte table 0. */ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 0, 0 }, { 1, 2 }, ++ { 2, 64 }, { 3, 125 }, { 4, 188 }, { 5, 226 }, ++ { 6, 288 }, { 7, 338 }, { 8, 364 }, { N_, 0 }, ++ { N_, 0 }, { 9, 376 }, { 10, 378 }, { 11, 416 }, ++ { 12, 486 }, { 13, 518 }, { 14, 614 }, { 15, 670 }, ++ { 16, 724 }, { 17, 740 }, { 18, 802 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 
0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 1. */ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 19, 816 }, { 20, 912 }, { 21, 1008 }, { 22, 1092 }, ++ { 23, 1179 }, { 24, 1269 }, { 25, 1365 }, { 26, 1448 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 2. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 27, 1469 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { 28, 1517 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 3. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 29, 1595 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 4. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 30, 1673 }, { 31, 1769 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ }, ++ { ++ { /* Third byte table 0. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { 0, 0 }, { 1, 2 }, ++ { 2, 64 }, { 3, 125 }, { 4, 188 }, { 5, 230 }, ++ { 6, 292 }, { 7, 344 }, { 8, 388 }, { N_, 0 }, ++ { N_, 0 }, { 9, 404 }, { 10, 412 }, { 11, 450 }, ++ { 12, 524 }, { 13, 556 }, { 14, 652 }, { 15, 708 }, ++ { 16, 772 }, { 17, 792 }, { 18, 854 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 1. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 19, 868 }, { N_, 0 }, { N_, 0 }, ++ { 20, 871 }, { 21, 967 }, { 22, 1063 }, { 23, 1147 }, ++ { 24, 1234 }, { 25, 1324 }, { 26, 1420 }, { 27, 1503 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 2. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 28, 1524 }, { 29, 1575 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { 30, 1578 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 31, 1656 }, { 32, 1704 }, { 33, 1816 }, { 34, 1912 }, ++ { 35, 1966 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 3. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { 36, 2080 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ { /* Third byte table 4. 
*/ ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { 37, 2158 }, { 38, 2254 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, ++ }, ++ }, ++}; ++ ++static const uchar_t u8_toupper_b4_tbl[2][39][257] = { ++ { ++ { /* Fourth byte table 0. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, ++ }, ++ { /* Fourth byte table 1. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 46, 48, 50, 52, 54, 56, 58, 60, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, ++ }, ++ { /* Fourth byte table 2. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 2, 4, 4, 6, 6, ++ 8, 8, 10, 10, 12, 12, 14, 14, ++ 16, 16, 18, 18, 20, 20, 22, 22, ++ 24, 24, 26, 26, 28, 28, 30, 30, ++ 32, 32, 34, 34, 36, 36, 38, 38, ++ 40, 40, 42, 42, 44, 44, 46, 46, ++ 48, 48, 49, 49, 51, 51, 53, 53, ++ 55, 55, 55, 57, 57, 59, 59, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, ++ }, ++ { /* Fourth byte table 3. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 10, 12, 12, 14, 14, ++ 16, 16, 18, 18, 20, 20, 22, 22, ++ 24, 24, 26, 26, 28, 28, 30, 30, ++ 32, 32, 34, 34, 36, 36, 38, 38, ++ 40, 40, 42, 42, 44, 44, 46, 46, ++ 48, 48, 50, 50, 52, 52, 54, 54, ++ 56, 56, 56, 58, 58, 60, 60, 62, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, ++ }, ++ { /* Fourth byte table 4. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 2, 2, 4, 4, ++ 4, 6, 6, 6, 6, 8, 8, 8, ++ 8, 8, 8, 10, 10, 10, 12, 12, ++ 12, 12, 14, 14, 14, 14, 14, 16, ++ 16, 16, 18, 18, 20, 20, 22, 22, ++ 22, 24, 24, 24, 24, 24, 26, 26, ++ 26, 28, 28, 28, 28, 30, 30, 32, ++ 32, 32, 34, 34, 34, 34, 36, 36, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, ++ }, ++ { /* Fourth byte table 5. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 2, 4, ++ 4, 6, 8, 8, 10, 12, 12, 14, ++ 14, 16, 16, 18, 18, 20, 20, 22, ++ 22, 24, 24, 26, 26, 28, 30, 30, ++ 32, 32, 34, 34, 36, 36, 38, 38, ++ 40, 40, 42, 42, 44, 44, 46, 46, ++ 48, 48, 48, 50, 52, 52, 54, 54, ++ 54, 54, 56, 56, 58, 58, 60, 60, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, ++ }, ++ { /* Fourth byte table 6. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 2, 4, 4, 6, 6, ++ 8, 8, 10, 10, 12, 12, 14, 14, ++ 16, 16, 18, 18, 20, 20, 22, 22, ++ 24, 24, 26, 26, 28, 28, 30, 30, ++ 32, 32, 32, 32, 34, 34, 36, 36, ++ 38, 38, 40, 40, 42, 42, 44, 44, ++ 46, 46, 48, 48, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 50, 50, 50, ++ 50, ++ }, ++ { /* Fourth byte table 7. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 2, 4, 4, 6, ++ 8, 8, 10, 10, 12, 12, 12, 12, ++ 12, 14, 14, 14, 16, 16, 16, 16, ++ 16, 18, 20, 20, 20, 20, 20, 20, ++ 22, 22, 22, 24, 24, 24, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, 26, 26, 26, 26, 26, 26, 26, ++ 26, ++ }, ++ { /* Fourth byte table 8. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 2, 4, 4, 4, 4, ++ 4, 6, 6, 8, 10, 10, 10, 10, ++ 10, 10, 10, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, 12, 12, 12, 12, 12, 12, 12, ++ 12, ++ }, ++ { /* Fourth byte table 9. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, ++ }, ++ { /* Fourth byte table 10. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 2, 4, 6, ++ 8, 8, 10, 12, 14, 16, 18, 20, ++ 22, 24, 26, 28, 30, 32, 34, 36, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, ++ }, ++ { /* Fourth byte table 11. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 30, 32, 34, 34, 34, 34, 36, 38, ++ 38, 38, 40, 40, 42, 42, 44, 44, ++ 46, 46, 48, 48, 50, 50, 52, 52, ++ 54, 54, 56, 56, 58, 58, 60, 60, ++ 62, 64, 66, 68, 68, 68, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, 70, 70, 70, 70, 70, 70, 70, ++ 70, ++ }, ++ { /* Fourth byte table 12. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, ++ }, ++ { /* Fourth byte table 13. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 48, 50, 52, 54, 56, 58, 60, 62, ++ 64, 64, 66, 66, 68, 68, 70, 70, ++ 72, 72, 74, 74, 76, 76, 78, 78, ++ 80, 80, 82, 82, 84, 84, 86, 86, ++ 88, 88, 90, 90, 92, 92, 94, 94, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 14. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 4, 4, 6, 6, ++ 8, 8, 10, 10, 12, 12, 14, 14, ++ 16, 16, 18, 18, 20, 20, 22, 22, ++ 24, 24, 26, 26, 28, 28, 30, 30, ++ 32, 32, 34, 34, 36, 36, 38, 38, ++ 40, 40, 42, 42, 44, 44, 46, 46, ++ 48, 48, 50, 50, 52, 52, 54, 54, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, ++ }, ++ { /* Fourth byte table 15. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 2, 2, 4, 4, 6, ++ 6, 8, 8, 10, 10, 12, 12, 14, ++ 14, 14, 16, 16, 18, 18, 20, 20, ++ 22, 22, 24, 24, 26, 26, 28, 28, ++ 30, 30, 32, 32, 34, 34, 36, 36, ++ 38, 38, 40, 40, 42, 42, 44, 44, ++ 46, 46, 48, 48, 50, 50, 52, 52, ++ 52, 52, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, ++ }, ++ { /* Fourth byte table 16. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 2, 4, 4, 6, 6, ++ 8, 8, 10, 10, 12, 12, 14, 14, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, ++ }, ++ { /* Fourth byte table 17. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 4, 6, 8, 10, 12, ++ 14, 16, 18, 20, 22, 24, 26, 28, ++ 30, 32, 34, 36, 38, 40, 42, 44, ++ 46, 48, 50, 52, 54, 56, 58, 60, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, ++ }, ++ { /* Fourth byte table 18. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, ++ }, ++ { /* Fourth byte table 19. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 3, 6, 6, 9, 9, ++ 12, 12, 15, 15, 18, 18, 21, 21, ++ 24, 24, 27, 27, 30, 30, 33, 33, ++ 36, 36, 39, 39, 42, 42, 45, 45, ++ 48, 48, 51, 51, 54, 54, 57, 57, ++ 60, 60, 63, 63, 66, 66, 69, 69, ++ 72, 72, 75, 75, 78, 78, 81, 81, ++ 84, 84, 87, 87, 90, 90, 93, 93, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 20. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 3, 6, 6, 9, 9, ++ 12, 12, 15, 15, 18, 18, 21, 21, ++ 24, 24, 27, 27, 30, 30, 33, 33, ++ 36, 36, 39, 39, 42, 42, 45, 45, ++ 48, 48, 51, 51, 54, 54, 57, 57, ++ 60, 60, 63, 63, 66, 66, 69, 69, ++ 72, 72, 75, 75, 78, 78, 81, 81, ++ 84, 84, 87, 87, 90, 90, 93, 93, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 21. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 3, 6, 6, 9, 9, ++ 12, 12, 15, 15, 18, 18, 21, 21, ++ 24, 24, 27, 27, 30, 30, 33, 33, ++ 33, 33, 33, 33, 36, 36, 36, 36, ++ 36, 36, 39, 39, 42, 42, 45, 45, ++ 48, 48, 51, 51, 54, 54, 57, 57, ++ 60, 60, 63, 63, 66, 66, 69, 69, ++ 72, 72, 75, 75, 78, 78, 81, 81, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, ++ }, ++ { /* Fourth byte table 22. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 3, 6, 6, 9, 9, ++ 12, 12, 15, 15, 18, 18, 21, 21, ++ 24, 24, 27, 27, 30, 30, 33, 33, ++ 36, 36, 39, 39, 42, 42, 45, 45, ++ 48, 48, 51, 51, 54, 54, 57, 57, ++ 60, 60, 63, 63, 66, 66, 69, 69, ++ 72, 72, 75, 75, 78, 78, 81, 81, ++ 84, 84, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, ++ }, ++ { /* Fourth byte table 23. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 27, 30, 33, 36, 39, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 45, 48, 51, 54, 57, 60, 63, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 69, 72, 75, 78, 81, 84, 87, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, ++ }, ++ { /* Fourth byte table 24. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 21, 21, 24, 24, 27, 27, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 33, 36, 39, 42, 45, 48, 51, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 57, 60, 63, 66, 69, 72, 75, ++ 78, 81, 84, 87, 90, 93, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 25. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 75, 78, 78, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, ++ }, ++ { /* Fourth byte table 26. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 6, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 12, 15, 15, 15, 15, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 27. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, ++ }, ++ { /* Fourth byte table 28. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, ++ }, ++ { /* Fourth byte table 29. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 6, 9, 12, 15, 18, ++ 21, 24, 27, 30, 33, 36, 39, 42, ++ 45, 48, 51, 54, 57, 60, 63, 66, ++ 69, 72, 75, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, ++ }, ++ { /* Fourth byte table 30. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 31. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, ++ }, ++ { /* Fourth byte table 32. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 33. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 34. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 35. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 36. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 37. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ { /* Fourth byte table 38. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, ++ }, ++ }, ++ { ++ { /* Fourth byte table 0. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, ++ }, ++ { /* Fourth byte table 1. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 46, 48, 50, 52, 54, 56, 58, 60, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, ++ }, ++ { /* Fourth byte table 2. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 2, 4, 4, 6, 6, ++ 8, 8, 10, 10, 12, 12, 14, 14, ++ 16, 16, 18, 18, 20, 20, 22, 22, ++ 24, 24, 26, 26, 28, 28, 30, 30, ++ 32, 32, 34, 34, 36, 36, 38, 38, ++ 40, 40, 42, 42, 44, 44, 46, 46, ++ 48, 48, 49, 49, 51, 51, 53, 53, ++ 55, 55, 55, 57, 57, 59, 59, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, 61, 61, 61, 61, 61, 61, 61, ++ 61, ++ }, ++ { /* Fourth byte table 3. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 4, 4, 6, 6, 8, ++ 8, 10, 10, 10, 12, 12, 14, 14, ++ 16, 16, 18, 18, 20, 20, 22, 22, ++ 24, 24, 26, 26, 28, 28, 30, 30, ++ 32, 32, 34, 34, 36, 36, 38, 38, ++ 40, 40, 42, 42, 44, 44, 46, 46, ++ 48, 48, 50, 50, 52, 52, 54, 54, ++ 56, 56, 56, 58, 58, 60, 60, 62, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, 63, 63, 63, 63, 63, 63, 63, ++ 63, ++ }, ++ { /* Fourth byte table 4. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 2, 4, 4, 6, 6, ++ 6, 8, 8, 8, 8, 10, 10, 10, ++ 10, 10, 10, 12, 12, 12, 14, 14, ++ 14, 14, 16, 18, 18, 18, 18, 20, ++ 20, 20, 22, 22, 24, 24, 26, 26, ++ 26, 28, 28, 28, 28, 28, 30, 30, ++ 30, 32, 32, 32, 32, 34, 34, 36, ++ 36, 36, 38, 38, 38, 38, 40, 40, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, ++ }, ++ { /* Fourth byte table 5. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 2, 4, ++ 4, 6, 8, 8, 10, 12, 12, 14, ++ 14, 16, 16, 18, 18, 20, 20, 22, ++ 22, 24, 24, 26, 26, 28, 30, 30, ++ 32, 32, 34, 34, 36, 36, 38, 38, ++ 40, 40, 42, 42, 44, 44, 46, 46, ++ 48, 48, 48, 50, 52, 52, 54, 54, ++ 54, 54, 56, 56, 58, 58, 60, 60, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, ++ }, ++ { /* Fourth byte table 6. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 2, 4, 4, 6, 6, ++ 8, 8, 10, 10, 12, 12, 14, 14, ++ 16, 16, 18, 18, 20, 20, 22, 22, ++ 24, 24, 26, 26, 28, 28, 30, 30, ++ 32, 32, 32, 32, 34, 34, 36, 36, ++ 38, 38, 40, 40, 42, 42, 44, 44, ++ 46, 46, 48, 48, 50, 50, 50, 50, ++ 50, 50, 50, 50, 50, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, 52, 52, 52, 52, 52, 52, 52, ++ 52, ++ }, ++ { /* Fourth byte table 7. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 2, 2, 2, 2, 2, ++ 4, 4, 6, 6, 8, 8, 10, 10, ++ 12, 12, 12, 12, 14, 16, 16, 18, ++ 20, 20, 22, 22, 24, 24, 24, 24, ++ 24, 26, 26, 26, 28, 28, 28, 28, ++ 28, 30, 32, 32, 35, 35, 35, 35, ++ 37, 37, 37, 39, 39, 39, 41, 41, ++ 41, 41, 41, 41, 41, 41, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, 44, 44, 44, 44, 44, 44, 44, ++ 44, ++ }, ++ { /* Fourth byte table 8. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 2, 2, 4, 4, 4, 4, ++ 4, 6, 8, 10, 12, 14, 14, 14, ++ 14, 14, 14, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, ++ }, ++ { /* Fourth byte table 9. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 4, 6, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, 8, 8, 8, 8, 8, 8, 8, ++ 8, ++ }, ++ { /* Fourth byte table 10. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 2, 4, 6, ++ 8, 8, 10, 12, 14, 16, 18, 20, ++ 22, 24, 26, 28, 30, 32, 34, 36, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, 38, 38, 38, 38, 38, 38, 38, ++ 38, ++ }, ++ { /* Fourth byte table 11. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 30, 32, 34, 34, 34, 34, 36, 38, ++ 38, 38, 40, 40, 42, 42, 44, 44, ++ 46, 46, 48, 48, 50, 50, 52, 52, ++ 54, 54, 56, 56, 58, 58, 60, 60, ++ 62, 64, 66, 68, 68, 68, 70, 70, ++ 70, 72, 72, 72, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, 74, 74, 74, 74, 74, 74, 74, ++ 74, ++ }, ++ { /* Fourth byte table 12. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, 32, 32, 32, 32, 32, 32, 32, ++ 32, ++ }, ++ { /* Fourth byte table 13. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 16, 18, 20, 22, 24, 26, 28, 30, ++ 32, 34, 36, 38, 40, 42, 44, 46, ++ 48, 50, 52, 54, 56, 58, 60, 62, ++ 64, 64, 66, 66, 68, 68, 70, 70, ++ 72, 72, 74, 74, 76, 76, 78, 78, ++ 80, 80, 82, 82, 84, 84, 86, 86, ++ 88, 88, 90, 90, 92, 92, 94, 94, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 14. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 4, 4, 6, 6, ++ 8, 8, 10, 10, 12, 12, 14, 14, ++ 16, 16, 18, 18, 20, 20, 22, 22, ++ 24, 24, 26, 26, 28, 28, 30, 30, ++ 32, 32, 34, 34, 36, 36, 38, 38, ++ 40, 40, 42, 42, 44, 44, 46, 46, ++ 48, 48, 50, 50, 52, 52, 54, 54, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, 56, 56, 56, 56, 56, 56, 56, ++ 56, ++ }, ++ { /* Fourth byte table 15. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 2, 2, 4, 4, 6, ++ 6, 8, 8, 10, 10, 12, 12, 14, ++ 16, 16, 18, 18, 20, 20, 22, 22, ++ 24, 24, 26, 26, 28, 28, 30, 30, ++ 32, 32, 34, 34, 36, 36, 38, 38, ++ 40, 40, 42, 42, 44, 44, 46, 46, ++ 48, 48, 50, 50, 52, 52, 54, 54, ++ 56, 56, 58, 58, 60, 60, 62, 62, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ { /* Fourth byte table 16. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 2, 4, 4, 6, 6, ++ 8, 8, 10, 10, 12, 12, 14, 14, ++ 16, 16, 18, 18, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, 20, 20, 20, 20, 20, 20, 20, ++ 20, ++ }, ++ { /* Fourth byte table 17. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 2, 4, 6, 8, 10, 12, ++ 14, 16, 18, 20, 22, 24, 26, 28, ++ 30, 32, 34, 36, 38, 40, 42, 44, ++ 46, 48, 50, 52, 54, 56, 58, 60, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, 62, 62, 62, 62, 62, 62, 62, ++ 62, ++ }, ++ { /* Fourth byte table 18. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 2, 4, 6, 8, 10, 12, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, 14, 14, 14, 14, 14, 14, 14, ++ 14, ++ }, ++ { /* Fourth byte table 19. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, ++ }, ++ { /* Fourth byte table 20. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 3, 6, 6, 9, 9, ++ 12, 12, 15, 15, 18, 18, 21, 21, ++ 24, 24, 27, 27, 30, 30, 33, 33, ++ 36, 36, 39, 39, 42, 42, 45, 45, ++ 48, 48, 51, 51, 54, 54, 57, 57, ++ 60, 60, 63, 63, 66, 66, 69, 69, ++ 72, 72, 75, 75, 78, 78, 81, 81, ++ 84, 84, 87, 87, 90, 90, 93, 93, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 21. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 3, 6, 6, 9, 9, ++ 12, 12, 15, 15, 18, 18, 21, 21, ++ 24, 24, 27, 27, 30, 30, 33, 33, ++ 36, 36, 39, 39, 42, 42, 45, 45, ++ 48, 48, 51, 51, 54, 54, 57, 57, ++ 60, 60, 63, 63, 66, 66, 69, 69, ++ 72, 72, 75, 75, 78, 78, 81, 81, ++ 84, 84, 87, 87, 90, 90, 93, 93, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 22. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 3, 6, 6, 9, 9, ++ 12, 12, 15, 15, 18, 18, 21, 21, ++ 24, 24, 27, 27, 30, 30, 33, 33, ++ 33, 33, 33, 33, 36, 36, 36, 36, ++ 36, 36, 39, 39, 42, 42, 45, 45, ++ 48, 48, 51, 51, 54, 54, 57, 57, ++ 60, 60, 63, 63, 66, 66, 69, 69, ++ 72, 72, 75, 75, 78, 78, 81, 81, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, 84, 84, 84, 84, 84, 84, 84, ++ 84, ++ }, ++ { /* Fourth byte table 23. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 3, 6, 6, 9, 9, ++ 12, 12, 15, 15, 18, 18, 21, 21, ++ 24, 24, 27, 27, 30, 30, 33, 33, ++ 36, 36, 39, 39, 42, 42, 45, 45, ++ 48, 48, 51, 51, 54, 54, 57, 57, ++ 60, 60, 63, 63, 66, 66, 69, 69, ++ 72, 72, 75, 75, 78, 78, 81, 81, ++ 84, 84, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, 87, 87, 87, 87, 87, 87, 87, ++ 87, ++ }, ++ { /* Fourth byte table 24. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 27, 30, 33, 36, 39, 42, 42, ++ 42, 42, 42, 42, 42, 42, 42, 42, ++ 42, 45, 48, 51, 54, 57, 60, 63, ++ 66, 66, 66, 66, 66, 66, 66, 66, ++ 66, 69, 72, 75, 78, 81, 84, 87, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, 90, 90, 90, 90, 90, 90, 90, ++ 90, ++ }, ++ { /* Fourth byte table 25. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 21, 21, 24, 24, 27, 27, ++ 30, 30, 30, 30, 30, 30, 30, 30, ++ 30, 33, 36, 39, 42, 45, 48, 51, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 57, 60, 63, 66, 69, 72, 75, ++ 78, 81, 84, 87, 90, 93, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 26. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 24, 24, 24, 24, 24, 24, 24, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 72, 72, 72, 72, 72, 72, 72, ++ 72, 75, 78, 78, 81, 81, 81, 81, ++ 81, 81, 81, 81, 81, 81, 81, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, 83, 83, 83, 83, 83, 83, 83, ++ 83, ++ }, ++ { /* Fourth byte table 27. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 6, 9, 9, 9, 9, 9, 9, ++ 9, 9, 9, 9, 9, 9, 9, 9, ++ 9, 12, 15, 15, 15, 15, 18, 18, ++ 18, 18, 18, 18, 18, 18, 18, 18, ++ 18, 18, 18, 18, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, 21, 21, 21, 21, 21, 21, 21, ++ 21, ++ }, ++ { /* Fourth byte table 28. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 6, 9, 12, 15, 18, 21, 24, ++ 27, 30, 33, 36, 39, 42, 45, 48, ++ 51, 51, 51, 51, 51, 51, 51, 51, ++ 51, 51, 51, 51, 51, 51, 51, 51, ++ 51, 51, 51, 51, 51, 51, 51, 51, ++ 51, 51, 51, 51, 51, 51, 51, 51, ++ 51, 51, 51, 51, 51, 51, 51, 51, ++ 51, 51, 51, 51, 51, 51, 51, 51, ++ 51, 51, 51, 51, 51, 51, 51, 51, ++ 51, 51, 51, 51, 51, 51, 51, 51, ++ 51, ++ }, ++ { /* Fourth byte table 29. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, ++ }, ++ { /* Fourth byte table 30. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, ++ }, ++ { /* Fourth byte table 31. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, 48, 48, 48, 48, 48, 48, 48, ++ 48, ++ }, ++ { /* Fourth byte table 32. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 93, 93, 96, 96, 96, 96, 98, 100, ++ 100, 103, 103, 106, 106, 109, 109, 109, ++ 109, 109, 109, 109, 109, 109, 109, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, 112, 112, 112, 112, 112, 112, 112, ++ 112, ++ }, ++ { /* Fourth byte table 33. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 3, 6, 6, 9, 9, ++ 12, 12, 15, 15, 18, 18, 21, 21, ++ 24, 24, 27, 27, 30, 30, 33, 33, ++ 36, 36, 39, 39, 42, 42, 45, 45, ++ 48, 48, 51, 51, 54, 54, 57, 57, ++ 60, 60, 63, 63, 66, 66, 69, 69, ++ 72, 72, 75, 75, 78, 78, 81, 81, ++ 84, 84, 87, 87, 90, 90, 93, 93, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 34. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 3, 6, 6, 9, 9, ++ 12, 12, 15, 15, 18, 18, 21, 21, ++ 24, 24, 27, 27, 30, 30, 33, 33, ++ 36, 36, 39, 39, 42, 42, 45, 45, ++ 48, 48, 51, 51, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, 54, 54, 54, 54, 54, 54, 54, ++ 54, ++ }, ++ { /* Fourth byte table 35. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 3, 6, 9, 12, 15, 18, 21, ++ 24, 27, 30, 33, 36, 39, 42, 45, ++ 48, 51, 54, 57, 60, 63, 66, 69, ++ 72, 75, 78, 81, 84, 87, 90, 93, ++ 96, 99, 102, 105, 108, 111, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, 114, 114, 114, 114, 114, 114, 114, ++ 114, ++ }, ++ { /* Fourth byte table 36. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 3, 6, 9, 12, 15, 18, ++ 21, 24, 27, 30, 33, 36, 39, 42, ++ 45, 48, 51, 54, 57, 60, 63, 66, ++ 69, 72, 75, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, 78, 78, 78, 78, 78, 78, 78, ++ 78, ++ }, ++ { /* Fourth byte table 37. */ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 68, 72, 76, 80, 84, 88, 92, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, 96, 96, 96, 96, 96, 96, 96, ++ 96, ++ }, ++ { /* Fourth byte table 38. 
*/ ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 4, 8, 12, 16, 20, 24, 28, ++ 32, 36, 40, 44, 48, 52, 56, 60, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, ++ }, ++ }, ++}; ++ ++static const uchar_t u8_toupper_final_tbl[2][2318] = { ++ { ++ 0xCE, 0x9C, 0xC3, 0x80, 0xC3, 0x81, 0xC3, 0x82, ++ 0xC3, 0x83, 0xC3, 0x84, 0xC3, 0x85, 0xC3, 0x86, ++ 0xC3, 0x87, 0xC3, 0x88, 0xC3, 0x89, 0xC3, 0x8A, ++ 0xC3, 0x8B, 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, ++ 0xC3, 0x8F, 0xC3, 0x90, 0xC3, 0x91, 0xC3, 0x92, ++ 0xC3, 0x93, 0xC3, 0x94, 0xC3, 0x95, 0xC3, 0x96, ++ 0xC3, 0x98, 0xC3, 0x99, 0xC3, 0x9A, 0xC3, 0x9B, ++ 0xC3, 0x9C, 0xC3, 0x9D, 0xC3, 0x9E, 0xC5, 0xB8, ++ 0xC4, 0x80, 0xC4, 0x82, 0xC4, 0x84, 0xC4, 0x86, ++ 0xC4, 0x88, 0xC4, 0x8A, 0xC4, 0x8C, 0xC4, 0x8E, ++ 0xC4, 0x90, 0xC4, 0x92, 0xC4, 0x94, 0xC4, 0x96, ++ 0xC4, 0x98, 0xC4, 0x9A, 0xC4, 0x9C, 0xC4, 0x9E, ++ 0xC4, 0xA0, 0xC4, 0xA2, 0xC4, 0xA4, 0xC4, 0xA6, ++ 0xC4, 0xA8, 0xC4, 0xAA, 0xC4, 0xAC, 0xC4, 0xAE, ++ 0x49, 0xC4, 0xB2, 0xC4, 0xB4, 0xC4, 0xB6, 0xC4, ++ 0xB9, 0xC4, 0xBB, 0xC4, 0xBD, 0xC4, 0xBF, 0xC5, ++ 0x81, 0xC5, 0x83, 0xC5, 0x85, 0xC5, 0x87, 0xC5, ++ 0x8A, 0xC5, 0x8C, 0xC5, 0x8E, 0xC5, 0x90, 0xC5, ++ 0x92, 0xC5, 0x94, 0xC5, 0x96, 0xC5, 0x98, 0xC5, ++ 0x9A, 0xC5, 0x9C, 0xC5, 0x9E, 0xC5, 0xA0, 0xC5, ++ 0xA2, 0xC5, 0xA4, 0xC5, 0xA6, 0xC5, 0xA8, 0xC5, ++ 0xAA, 0xC5, 0xAC, 0xC5, 0xAE, 0xC5, 0xB0, 0xC5, ++ 0xB2, 0xC5, 0xB4, 0xC5, 0xB6, 0xC5, 0xB9, 0xC5, ++ 0xBB, 0xC5, 0xBD, 0x53, 0xC6, 0x82, 0xC6, 0x84, ++ 0xC6, 0x87, 0xC6, 0x8B, 0xC6, 0x91, 0xC7, 0xB6, ++ 0xC6, 0x98, 0xC8, 0xA0, 0xC6, 0xA0, 0xC6, 0xA2, ++ 0xC6, 0xA4, 0xC6, 0xA7, 0xC6, 0xAC, 0xC6, 0xAF, ++ 0xC6, 0xB3, 0xC6, 0xB5, 0xC6, 0xB8, 0xC6, 0xBC, ++ 0xC7, 0xB7, 0xC7, 0x84, 0xC7, 0x84, 0xC7, 0x87, ++ 0xC7, 0x87, 0xC7, 0x8A, 0xC7, 0x8A, 0xC7, 0x8D, ++ 0xC7, 0x8F, 0xC7, 0x91, 0xC7, 0x93, 0xC7, 0x95, ++ 0xC7, 0x97, 0xC7, 0x99, 0xC7, 0x9B, 0xC6, 0x8E, ++ 0xC7, 0x9E, 0xC7, 0xA0, 0xC7, 0xA2, 0xC7, 0xA4, ++ 0xC7, 0xA6, 0xC7, 0xA8, 0xC7, 0xAA, 0xC7, 0xAC, ++ 0xC7, 0xAE, 0xC7, 0xB1, 0xC7, 0xB1, 0xC7, 0xB4, ++ 0xC7, 0xB8, 0xC7, 0xBA, 0xC7, 0xBC, 0xC7, 0xBE, ++ 0xC8, 0x80, 0xC8, 0x82, 0xC8, 0x84, 0xC8, 0x86, ++ 0xC8, 0x88, 0xC8, 0x8A, 0xC8, 0x8C, 0xC8, 0x8E, ++ 0xC8, 0x90, 0xC8, 0x92, 0xC8, 0x94, 0xC8, 0x96, ++ 0xC8, 0x98, 0xC8, 0x9A, 0xC8, 0x9C, 0xC8, 0x9E, ++ 0xC8, 0xA2, 0xC8, 0xA4, 0xC8, 0xA6, 0xC8, 0xA8, ++ 0xC8, 0xAA, 0xC8, 0xAC, 0xC8, 0xAE, 0xC8, 0xB0, ++ 0xC8, 0xB2, 0xC6, 0x81, 0xC6, 0x86, 0xC6, 0x89, ++ 0xC6, 0x8A, 0xC6, 0x8F, 0xC6, 0x90, 0xC6, 0x93, ++ 0xC6, 0x94, 0xC6, 0x97, 0xC6, 0x96, 0xC6, 0x9C, ++ 0xC6, 0x9D, 0xC6, 0x9F, 0xC6, 0xA6, 0xC6, 0xA9, ++ 0xC6, 0xAE, 0xC6, 0xB1, 0xC6, 0xB2, 0xC6, 0xB7, ++ 0xCE, 0x99, 0xCE, 0x86, 0xCE, 0x88, 0xCE, 0x89, ++ 0xCE, 0x8A, 0xCE, 
0x91, 0xCE, 0x92, 0xCE, 0x93, ++ 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, ++ 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, ++ 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, ++ 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0xA3, 0xCE, 0xA3, ++ 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, ++ 0xCE, 0xA8, 0xCE, 0xA9, 0xCE, 0xAA, 0xCE, 0xAB, ++ 0xCE, 0x8C, 0xCE, 0x8E, 0xCE, 0x8F, 0xCE, 0x92, ++ 0xCE, 0x98, 0xCE, 0xA6, 0xCE, 0xA0, 0xCF, 0x98, ++ 0xCF, 0x9A, 0xCF, 0x9C, 0xCF, 0x9E, 0xCF, 0xA0, ++ 0xCF, 0xA2, 0xCF, 0xA4, 0xCF, 0xA6, 0xCF, 0xA8, ++ 0xCF, 0xAA, 0xCF, 0xAC, 0xCF, 0xAE, 0xCE, 0x9A, ++ 0xCE, 0xA1, 0xCE, 0xA3, 0xCE, 0x95, 0xD0, 0x90, ++ 0xD0, 0x91, 0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94, ++ 0xD0, 0x95, 0xD0, 0x96, 0xD0, 0x97, 0xD0, 0x98, ++ 0xD0, 0x99, 0xD0, 0x9A, 0xD0, 0x9B, 0xD0, 0x9C, ++ 0xD0, 0x9D, 0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, ++ 0xD0, 0xA1, 0xD0, 0xA2, 0xD0, 0xA3, 0xD0, 0xA4, ++ 0xD0, 0xA5, 0xD0, 0xA6, 0xD0, 0xA7, 0xD0, 0xA8, ++ 0xD0, 0xA9, 0xD0, 0xAA, 0xD0, 0xAB, 0xD0, 0xAC, ++ 0xD0, 0xAD, 0xD0, 0xAE, 0xD0, 0xAF, 0xD0, 0x80, ++ 0xD0, 0x81, 0xD0, 0x82, 0xD0, 0x83, 0xD0, 0x84, ++ 0xD0, 0x85, 0xD0, 0x86, 0xD0, 0x87, 0xD0, 0x88, ++ 0xD0, 0x89, 0xD0, 0x8A, 0xD0, 0x8B, 0xD0, 0x8C, ++ 0xD0, 0x8D, 0xD0, 0x8E, 0xD0, 0x8F, 0xD1, 0xA0, ++ 0xD1, 0xA2, 0xD1, 0xA4, 0xD1, 0xA6, 0xD1, 0xA8, ++ 0xD1, 0xAA, 0xD1, 0xAC, 0xD1, 0xAE, 0xD1, 0xB0, ++ 0xD1, 0xB2, 0xD1, 0xB4, 0xD1, 0xB6, 0xD1, 0xB8, ++ 0xD1, 0xBA, 0xD1, 0xBC, 0xD1, 0xBE, 0xD2, 0x80, ++ 0xD2, 0x8A, 0xD2, 0x8C, 0xD2, 0x8E, 0xD2, 0x90, ++ 0xD2, 0x92, 0xD2, 0x94, 0xD2, 0x96, 0xD2, 0x98, ++ 0xD2, 0x9A, 0xD2, 0x9C, 0xD2, 0x9E, 0xD2, 0xA0, ++ 0xD2, 0xA2, 0xD2, 0xA4, 0xD2, 0xA6, 0xD2, 0xA8, ++ 0xD2, 0xAA, 0xD2, 0xAC, 0xD2, 0xAE, 0xD2, 0xB0, ++ 0xD2, 0xB2, 0xD2, 0xB4, 0xD2, 0xB6, 0xD2, 0xB8, ++ 0xD2, 0xBA, 0xD2, 0xBC, 0xD2, 0xBE, 0xD3, 0x81, ++ 0xD3, 0x83, 0xD3, 0x85, 0xD3, 0x87, 0xD3, 0x89, ++ 0xD3, 0x8B, 0xD3, 0x8D, 0xD3, 0x90, 0xD3, 0x92, ++ 0xD3, 0x94, 0xD3, 0x96, 0xD3, 0x98, 0xD3, 0x9A, ++ 0xD3, 0x9C, 0xD3, 0x9E, 0xD3, 0xA0, 0xD3, 0xA2, ++ 0xD3, 0xA4, 0xD3, 0xA6, 0xD3, 0xA8, 0xD3, 0xAA, ++ 0xD3, 0xAC, 0xD3, 0xAE, 0xD3, 0xB0, 0xD3, 0xB2, ++ 0xD3, 0xB4, 0xD3, 0xB8, 0xD4, 0x80, 0xD4, 0x82, ++ 0xD4, 0x84, 0xD4, 0x86, 0xD4, 0x88, 0xD4, 0x8A, ++ 0xD4, 0x8C, 0xD4, 0x8E, 0xD4, 0xB1, 0xD4, 0xB2, ++ 0xD4, 0xB3, 0xD4, 0xB4, 0xD4, 0xB5, 0xD4, 0xB6, ++ 0xD4, 0xB7, 0xD4, 0xB8, 0xD4, 0xB9, 0xD4, 0xBA, ++ 0xD4, 0xBB, 0xD4, 0xBC, 0xD4, 0xBD, 0xD4, 0xBE, ++ 0xD4, 0xBF, 0xD5, 0x80, 0xD5, 0x81, 0xD5, 0x82, ++ 0xD5, 0x83, 0xD5, 0x84, 0xD5, 0x85, 0xD5, 0x86, ++ 0xD5, 0x87, 0xD5, 0x88, 0xD5, 0x89, 0xD5, 0x8A, ++ 0xD5, 0x8B, 0xD5, 0x8C, 0xD5, 0x8D, 0xD5, 0x8E, ++ 0xD5, 0x8F, 0xD5, 0x90, 0xD5, 0x91, 0xD5, 0x92, ++ 0xD5, 0x93, 0xD5, 0x94, 0xD5, 0x95, 0xD5, 0x96, ++ 0xE1, 0xB8, 0x80, 0xE1, 0xB8, 0x82, 0xE1, 0xB8, ++ 0x84, 0xE1, 0xB8, 0x86, 0xE1, 0xB8, 0x88, 0xE1, ++ 0xB8, 0x8A, 0xE1, 0xB8, 0x8C, 0xE1, 0xB8, 0x8E, ++ 0xE1, 0xB8, 0x90, 0xE1, 0xB8, 0x92, 0xE1, 0xB8, ++ 0x94, 0xE1, 0xB8, 0x96, 0xE1, 0xB8, 0x98, 0xE1, ++ 0xB8, 0x9A, 0xE1, 0xB8, 0x9C, 0xE1, 0xB8, 0x9E, ++ 0xE1, 0xB8, 0xA0, 0xE1, 0xB8, 0xA2, 0xE1, 0xB8, ++ 0xA4, 0xE1, 0xB8, 0xA6, 0xE1, 0xB8, 0xA8, 0xE1, ++ 0xB8, 0xAA, 0xE1, 0xB8, 0xAC, 0xE1, 0xB8, 0xAE, ++ 0xE1, 0xB8, 0xB0, 0xE1, 0xB8, 0xB2, 0xE1, 0xB8, ++ 0xB4, 0xE1, 0xB8, 0xB6, 0xE1, 0xB8, 0xB8, 0xE1, ++ 0xB8, 0xBA, 0xE1, 0xB8, 0xBC, 0xE1, 0xB8, 0xBE, ++ 0xE1, 0xB9, 0x80, 0xE1, 0xB9, 0x82, 0xE1, 0xB9, ++ 0x84, 0xE1, 0xB9, 0x86, 0xE1, 0xB9, 0x88, 0xE1, ++ 0xB9, 0x8A, 0xE1, 0xB9, 0x8C, 0xE1, 0xB9, 0x8E, ++ 0xE1, 0xB9, 0x90, 0xE1, 0xB9, 0x92, 0xE1, 0xB9, ++ 
0x94, 0xE1, 0xB9, 0x96, 0xE1, 0xB9, 0x98, 0xE1, ++ 0xB9, 0x9A, 0xE1, 0xB9, 0x9C, 0xE1, 0xB9, 0x9E, ++ 0xE1, 0xB9, 0xA0, 0xE1, 0xB9, 0xA2, 0xE1, 0xB9, ++ 0xA4, 0xE1, 0xB9, 0xA6, 0xE1, 0xB9, 0xA8, 0xE1, ++ 0xB9, 0xAA, 0xE1, 0xB9, 0xAC, 0xE1, 0xB9, 0xAE, ++ 0xE1, 0xB9, 0xB0, 0xE1, 0xB9, 0xB2, 0xE1, 0xB9, ++ 0xB4, 0xE1, 0xB9, 0xB6, 0xE1, 0xB9, 0xB8, 0xE1, ++ 0xB9, 0xBA, 0xE1, 0xB9, 0xBC, 0xE1, 0xB9, 0xBE, ++ 0xE1, 0xBA, 0x80, 0xE1, 0xBA, 0x82, 0xE1, 0xBA, ++ 0x84, 0xE1, 0xBA, 0x86, 0xE1, 0xBA, 0x88, 0xE1, ++ 0xBA, 0x8A, 0xE1, 0xBA, 0x8C, 0xE1, 0xBA, 0x8E, ++ 0xE1, 0xBA, 0x90, 0xE1, 0xBA, 0x92, 0xE1, 0xBA, ++ 0x94, 0xE1, 0xB9, 0xA0, 0xE1, 0xBA, 0xA0, 0xE1, ++ 0xBA, 0xA2, 0xE1, 0xBA, 0xA4, 0xE1, 0xBA, 0xA6, ++ 0xE1, 0xBA, 0xA8, 0xE1, 0xBA, 0xAA, 0xE1, 0xBA, ++ 0xAC, 0xE1, 0xBA, 0xAE, 0xE1, 0xBA, 0xB0, 0xE1, ++ 0xBA, 0xB2, 0xE1, 0xBA, 0xB4, 0xE1, 0xBA, 0xB6, ++ 0xE1, 0xBA, 0xB8, 0xE1, 0xBA, 0xBA, 0xE1, 0xBA, ++ 0xBC, 0xE1, 0xBA, 0xBE, 0xE1, 0xBB, 0x80, 0xE1, ++ 0xBB, 0x82, 0xE1, 0xBB, 0x84, 0xE1, 0xBB, 0x86, ++ 0xE1, 0xBB, 0x88, 0xE1, 0xBB, 0x8A, 0xE1, 0xBB, ++ 0x8C, 0xE1, 0xBB, 0x8E, 0xE1, 0xBB, 0x90, 0xE1, ++ 0xBB, 0x92, 0xE1, 0xBB, 0x94, 0xE1, 0xBB, 0x96, ++ 0xE1, 0xBB, 0x98, 0xE1, 0xBB, 0x9A, 0xE1, 0xBB, ++ 0x9C, 0xE1, 0xBB, 0x9E, 0xE1, 0xBB, 0xA0, 0xE1, ++ 0xBB, 0xA2, 0xE1, 0xBB, 0xA4, 0xE1, 0xBB, 0xA6, ++ 0xE1, 0xBB, 0xA8, 0xE1, 0xBB, 0xAA, 0xE1, 0xBB, ++ 0xAC, 0xE1, 0xBB, 0xAE, 0xE1, 0xBB, 0xB0, 0xE1, ++ 0xBB, 0xB2, 0xE1, 0xBB, 0xB4, 0xE1, 0xBB, 0xB6, ++ 0xE1, 0xBB, 0xB8, 0xE1, 0xBC, 0x88, 0xE1, 0xBC, ++ 0x89, 0xE1, 0xBC, 0x8A, 0xE1, 0xBC, 0x8B, 0xE1, ++ 0xBC, 0x8C, 0xE1, 0xBC, 0x8D, 0xE1, 0xBC, 0x8E, ++ 0xE1, 0xBC, 0x8F, 0xE1, 0xBC, 0x98, 0xE1, 0xBC, ++ 0x99, 0xE1, 0xBC, 0x9A, 0xE1, 0xBC, 0x9B, 0xE1, ++ 0xBC, 0x9C, 0xE1, 0xBC, 0x9D, 0xE1, 0xBC, 0xA8, ++ 0xE1, 0xBC, 0xA9, 0xE1, 0xBC, 0xAA, 0xE1, 0xBC, ++ 0xAB, 0xE1, 0xBC, 0xAC, 0xE1, 0xBC, 0xAD, 0xE1, ++ 0xBC, 0xAE, 0xE1, 0xBC, 0xAF, 0xE1, 0xBC, 0xB8, ++ 0xE1, 0xBC, 0xB9, 0xE1, 0xBC, 0xBA, 0xE1, 0xBC, ++ 0xBB, 0xE1, 0xBC, 0xBC, 0xE1, 0xBC, 0xBD, 0xE1, ++ 0xBC, 0xBE, 0xE1, 0xBC, 0xBF, 0xE1, 0xBD, 0x88, ++ 0xE1, 0xBD, 0x89, 0xE1, 0xBD, 0x8A, 0xE1, 0xBD, ++ 0x8B, 0xE1, 0xBD, 0x8C, 0xE1, 0xBD, 0x8D, 0xE1, ++ 0xBD, 0x99, 0xE1, 0xBD, 0x9B, 0xE1, 0xBD, 0x9D, ++ 0xE1, 0xBD, 0x9F, 0xE1, 0xBD, 0xA8, 0xE1, 0xBD, ++ 0xA9, 0xE1, 0xBD, 0xAA, 0xE1, 0xBD, 0xAB, 0xE1, ++ 0xBD, 0xAC, 0xE1, 0xBD, 0xAD, 0xE1, 0xBD, 0xAE, ++ 0xE1, 0xBD, 0xAF, 0xE1, 0xBE, 0xBA, 0xE1, 0xBE, ++ 0xBB, 0xE1, 0xBF, 0x88, 0xE1, 0xBF, 0x89, 0xE1, ++ 0xBF, 0x8A, 0xE1, 0xBF, 0x8B, 0xE1, 0xBF, 0x9A, ++ 0xE1, 0xBF, 0x9B, 0xE1, 0xBF, 0xB8, 0xE1, 0xBF, ++ 0xB9, 0xE1, 0xBF, 0xAA, 0xE1, 0xBF, 0xAB, 0xE1, ++ 0xBF, 0xBA, 0xE1, 0xBF, 0xBB, 0xE1, 0xBE, 0x88, ++ 0xE1, 0xBE, 0x89, 0xE1, 0xBE, 0x8A, 0xE1, 0xBE, ++ 0x8B, 0xE1, 0xBE, 0x8C, 0xE1, 0xBE, 0x8D, 0xE1, ++ 0xBE, 0x8E, 0xE1, 0xBE, 0x8F, 0xE1, 0xBE, 0x98, ++ 0xE1, 0xBE, 0x99, 0xE1, 0xBE, 0x9A, 0xE1, 0xBE, ++ 0x9B, 0xE1, 0xBE, 0x9C, 0xE1, 0xBE, 0x9D, 0xE1, ++ 0xBE, 0x9E, 0xE1, 0xBE, 0x9F, 0xE1, 0xBE, 0xA8, ++ 0xE1, 0xBE, 0xA9, 0xE1, 0xBE, 0xAA, 0xE1, 0xBE, ++ 0xAB, 0xE1, 0xBE, 0xAC, 0xE1, 0xBE, 0xAD, 0xE1, ++ 0xBE, 0xAE, 0xE1, 0xBE, 0xAF, 0xE1, 0xBE, 0xB8, ++ 0xE1, 0xBE, 0xB9, 0xE1, 0xBE, 0xBC, 0xCE, 0x99, ++ 0xE1, 0xBF, 0x8C, 0xE1, 0xBF, 0x98, 0xE1, 0xBF, ++ 0x99, 0xE1, 0xBF, 0xA8, 0xE1, 0xBF, 0xA9, 0xE1, ++ 0xBF, 0xAC, 0xE1, 0xBF, 0xBC, 0xE2, 0x85, 0xA0, ++ 0xE2, 0x85, 0xA1, 0xE2, 0x85, 0xA2, 0xE2, 0x85, ++ 0xA3, 0xE2, 0x85, 0xA4, 0xE2, 0x85, 0xA5, 0xE2, ++ 0x85, 0xA6, 0xE2, 0x85, 0xA7, 0xE2, 0x85, 0xA8, ++ 0xE2, 0x85, 0xA9, 0xE2, 0x85, 0xAA, 
0xE2, 0x85, ++ 0xAB, 0xE2, 0x85, 0xAC, 0xE2, 0x85, 0xAD, 0xE2, ++ 0x85, 0xAE, 0xE2, 0x85, 0xAF, 0xE2, 0x92, 0xB6, ++ 0xE2, 0x92, 0xB7, 0xE2, 0x92, 0xB8, 0xE2, 0x92, ++ 0xB9, 0xE2, 0x92, 0xBA, 0xE2, 0x92, 0xBB, 0xE2, ++ 0x92, 0xBC, 0xE2, 0x92, 0xBD, 0xE2, 0x92, 0xBE, ++ 0xE2, 0x92, 0xBF, 0xE2, 0x93, 0x80, 0xE2, 0x93, ++ 0x81, 0xE2, 0x93, 0x82, 0xE2, 0x93, 0x83, 0xE2, ++ 0x93, 0x84, 0xE2, 0x93, 0x85, 0xE2, 0x93, 0x86, ++ 0xE2, 0x93, 0x87, 0xE2, 0x93, 0x88, 0xE2, 0x93, ++ 0x89, 0xE2, 0x93, 0x8A, 0xE2, 0x93, 0x8B, 0xE2, ++ 0x93, 0x8C, 0xE2, 0x93, 0x8D, 0xE2, 0x93, 0x8E, ++ 0xE2, 0x93, 0x8F, 0xEF, 0xBC, 0xA1, 0xEF, 0xBC, ++ 0xA2, 0xEF, 0xBC, 0xA3, 0xEF, 0xBC, 0xA4, 0xEF, ++ 0xBC, 0xA5, 0xEF, 0xBC, 0xA6, 0xEF, 0xBC, 0xA7, ++ 0xEF, 0xBC, 0xA8, 0xEF, 0xBC, 0xA9, 0xEF, 0xBC, ++ 0xAA, 0xEF, 0xBC, 0xAB, 0xEF, 0xBC, 0xAC, 0xEF, ++ 0xBC, 0xAD, 0xEF, 0xBC, 0xAE, 0xEF, 0xBC, 0xAF, ++ 0xEF, 0xBC, 0xB0, 0xEF, 0xBC, 0xB1, 0xEF, 0xBC, ++ 0xB2, 0xEF, 0xBC, 0xB3, 0xEF, 0xBC, 0xB4, 0xEF, ++ 0xBC, 0xB5, 0xEF, 0xBC, 0xB6, 0xEF, 0xBC, 0xB7, ++ 0xEF, 0xBC, 0xB8, 0xEF, 0xBC, 0xB9, 0xEF, 0xBC, ++ 0xBA, 0xF0, 0x90, 0x90, 0x80, 0xF0, 0x90, 0x90, ++ 0x81, 0xF0, 0x90, 0x90, 0x82, 0xF0, 0x90, 0x90, ++ 0x83, 0xF0, 0x90, 0x90, 0x84, 0xF0, 0x90, 0x90, ++ 0x85, 0xF0, 0x90, 0x90, 0x86, 0xF0, 0x90, 0x90, ++ 0x87, 0xF0, 0x90, 0x90, 0x88, 0xF0, 0x90, 0x90, ++ 0x89, 0xF0, 0x90, 0x90, 0x8A, 0xF0, 0x90, 0x90, ++ 0x8B, 0xF0, 0x90, 0x90, 0x8C, 0xF0, 0x90, 0x90, ++ 0x8D, 0xF0, 0x90, 0x90, 0x8E, 0xF0, 0x90, 0x90, ++ 0x8F, 0xF0, 0x90, 0x90, 0x90, 0xF0, 0x90, 0x90, ++ 0x91, 0xF0, 0x90, 0x90, 0x92, 0xF0, 0x90, 0x90, ++ 0x93, 0xF0, 0x90, 0x90, 0x94, 0xF0, 0x90, 0x90, ++ 0x95, 0xF0, 0x90, 0x90, 0x96, 0xF0, 0x90, 0x90, ++ 0x97, 0xF0, 0x90, 0x90, 0x98, 0xF0, 0x90, 0x90, ++ 0x99, 0xF0, 0x90, 0x90, 0x9A, 0xF0, 0x90, 0x90, ++ 0x9B, 0xF0, 0x90, 0x90, 0x9C, 0xF0, 0x90, 0x90, ++ 0x9D, 0xF0, 0x90, 0x90, 0x9E, 0xF0, 0x90, 0x90, ++ 0x9F, 0xF0, 0x90, 0x90, 0xA0, 0xF0, 0x90, 0x90, ++ 0xA1, 0xF0, 0x90, 0x90, 0xA2, 0xF0, 0x90, 0x90, ++ 0xA3, 0xF0, 0x90, 0x90, 0xA4, 0xF0, 0x90, 0x90, ++ 0xA5, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 
0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, ++ }, ++ { ++ 0xCE, 0x9C, 0xC3, 0x80, 0xC3, 0x81, 0xC3, 0x82, ++ 0xC3, 0x83, 0xC3, 0x84, 0xC3, 0x85, 0xC3, 0x86, ++ 0xC3, 0x87, 0xC3, 0x88, 0xC3, 0x89, 0xC3, 0x8A, ++ 0xC3, 0x8B, 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, ++ 0xC3, 0x8F, 0xC3, 0x90, 0xC3, 0x91, 0xC3, 0x92, ++ 0xC3, 0x93, 0xC3, 0x94, 0xC3, 0x95, 0xC3, 0x96, ++ 0xC3, 0x98, 0xC3, 0x99, 0xC3, 0x9A, 0xC3, 0x9B, ++ 0xC3, 0x9C, 0xC3, 0x9D, 0xC3, 0x9E, 0xC5, 0xB8, ++ 0xC4, 0x80, 0xC4, 0x82, 0xC4, 0x84, 0xC4, 0x86, ++ 0xC4, 0x88, 0xC4, 0x8A, 0xC4, 0x8C, 0xC4, 0x8E, ++ 0xC4, 0x90, 0xC4, 0x92, 0xC4, 0x94, 0xC4, 0x96, ++ 0xC4, 0x98, 0xC4, 0x9A, 0xC4, 0x9C, 0xC4, 0x9E, ++ 0xC4, 0xA0, 0xC4, 0xA2, 0xC4, 0xA4, 0xC4, 0xA6, ++ 0xC4, 0xA8, 0xC4, 0xAA, 0xC4, 0xAC, 0xC4, 0xAE, ++ 0x49, 0xC4, 0xB2, 0xC4, 0xB4, 0xC4, 0xB6, 0xC4, ++ 0xB9, 0xC4, 0xBB, 0xC4, 0xBD, 0xC4, 0xBF, 0xC5, ++ 0x81, 0xC5, 0x83, 0xC5, 0x85, 0xC5, 0x87, 0xC5, ++ 0x8A, 0xC5, 0x8C, 0xC5, 0x8E, 0xC5, 0x90, 0xC5, ++ 0x92, 0xC5, 0x94, 0xC5, 0x96, 0xC5, 0x98, 0xC5, ++ 0x9A, 0xC5, 0x9C, 0xC5, 0x9E, 0xC5, 0xA0, 0xC5, ++ 0xA2, 0xC5, 0xA4, 0xC5, 0xA6, 0xC5, 0xA8, 0xC5, ++ 0xAA, 0xC5, 0xAC, 0xC5, 0xAE, 0xC5, 0xB0, 0xC5, ++ 0xB2, 0xC5, 0xB4, 0xC5, 0xB6, 0xC5, 0xB9, 0xC5, ++ 0xBB, 0xC5, 0xBD, 0x53, 0xC9, 0x83, 0xC6, 0x82, ++ 0xC6, 0x84, 0xC6, 0x87, 0xC6, 0x8B, 0xC6, 0x91, ++ 0xC7, 0xB6, 0xC6, 0x98, 0xC8, 0xBD, 0xC8, 0xA0, ++ 0xC6, 0xA0, 0xC6, 0xA2, 0xC6, 0xA4, 0xC6, 0xA7, ++ 0xC6, 0xAC, 0xC6, 0xAF, 0xC6, 0xB3, 0xC6, 0xB5, ++ 0xC6, 0xB8, 0xC6, 0xBC, 0xC7, 0xB7, 0xC7, 0x84, ++ 0xC7, 0x84, 0xC7, 0x87, 0xC7, 0x87, 0xC7, 0x8A, ++ 0xC7, 0x8A, 0xC7, 0x8D, 0xC7, 0x8F, 0xC7, 0x91, ++ 0xC7, 0x93, 0xC7, 0x95, 0xC7, 0x97, 0xC7, 0x99, ++ 0xC7, 0x9B, 0xC6, 0x8E, 0xC7, 0x9E, 0xC7, 0xA0, ++ 0xC7, 0xA2, 0xC7, 0xA4, 0xC7, 0xA6, 0xC7, 0xA8, ++ 0xC7, 0xAA, 0xC7, 0xAC, 0xC7, 0xAE, 0xC7, 0xB1, ++ 0xC7, 0xB1, 0xC7, 0xB4, 0xC7, 0xB8, 0xC7, 0xBA, ++ 0xC7, 0xBC, 0xC7, 0xBE, 0xC8, 0x80, 0xC8, 0x82, ++ 0xC8, 0x84, 0xC8, 0x86, 0xC8, 0x88, 0xC8, 0x8A, ++ 0xC8, 0x8C, 0xC8, 0x8E, 0xC8, 0x90, 0xC8, 0x92, ++ 0xC8, 0x94, 0xC8, 0x96, 0xC8, 0x98, 0xC8, 0x9A, ++ 0xC8, 0x9C, 0xC8, 0x9E, 0xC8, 0xA2, 0xC8, 0xA4, ++ 0xC8, 0xA6, 0xC8, 0xA8, 0xC8, 0xAA, 0xC8, 0xAC, ++ 0xC8, 0xAE, 0xC8, 0xB0, 0xC8, 0xB2, 0xC8, 0xBB, ++ 0xC9, 0x81, 0xC9, 0x86, 0xC9, 0x88, 0xC9, 0x8A, ++ 0xC9, 0x8C, 0xC9, 0x8E, 0xC6, 0x81, 0xC6, 0x86, ++ 0xC6, 0x89, 0xC6, 0x8A, 0xC6, 0x8F, 0xC6, 0x90, ++ 0xC6, 0x93, 0xC6, 0x94, 0xC6, 0x97, 0xC6, 0x96, ++ 0xE2, 0xB1, 0xA2, 0xC6, 0x9C, 0xC6, 0x9D, 0xC6, ++ 0x9F, 0xE2, 0xB1, 0xA4, 0xC6, 0xA6, 0xC6, 0xA9, ++ 0xC6, 0xAE, 0xC9, 0x84, 0xC6, 0xB1, 0xC6, 0xB2, ++ 0xC9, 0x85, 0xC6, 0xB7, 0xCE, 0x99, 0xCF, 0xBD, ++ 0xCF, 0xBE, 0xCF, 0xBF, 0xCE, 0x86, 0xCE, 0x88, ++ 0xCE, 0x89, 0xCE, 0x8A, 0xCE, 0x91, 0xCE, 0x92, ++ 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, ++ 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A, ++ 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, ++ 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0xA3, ++ 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, ++ 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xCE, 0xAA, ++ 0xCE, 0xAB, 0xCE, 0x8C, 0xCE, 0x8E, 0xCE, 0x8F, ++ 0xCE, 0x92, 0xCE, 0x98, 0xCE, 0xA6, 0xCE, 0xA0, ++ 0xCF, 0x98, 0xCF, 0x9A, 0xCF, 0x9C, 0xCF, 0x9E, ++ 0xCF, 0xA0, 0xCF, 0xA2, 0xCF, 0xA4, 0xCF, 0xA6, ++ 0xCF, 0xA8, 0xCF, 0xAA, 0xCF, 0xAC, 0xCF, 0xAE, ++ 0xCE, 0x9A, 0xCE, 0xA1, 0xCF, 0xB9, 0xCE, 0x95, ++ 0xCF, 0xB7, 0xCF, 0xBA, 0xD0, 0x90, 0xD0, 0x91, ++ 0xD0, 
0x92, 0xD0, 0x93, 0xD0, 0x94, 0xD0, 0x95, ++ 0xD0, 0x96, 0xD0, 0x97, 0xD0, 0x98, 0xD0, 0x99, ++ 0xD0, 0x9A, 0xD0, 0x9B, 0xD0, 0x9C, 0xD0, 0x9D, ++ 0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, 0xD0, 0xA1, ++ 0xD0, 0xA2, 0xD0, 0xA3, 0xD0, 0xA4, 0xD0, 0xA5, ++ 0xD0, 0xA6, 0xD0, 0xA7, 0xD0, 0xA8, 0xD0, 0xA9, ++ 0xD0, 0xAA, 0xD0, 0xAB, 0xD0, 0xAC, 0xD0, 0xAD, ++ 0xD0, 0xAE, 0xD0, 0xAF, 0xD0, 0x80, 0xD0, 0x81, ++ 0xD0, 0x82, 0xD0, 0x83, 0xD0, 0x84, 0xD0, 0x85, ++ 0xD0, 0x86, 0xD0, 0x87, 0xD0, 0x88, 0xD0, 0x89, ++ 0xD0, 0x8A, 0xD0, 0x8B, 0xD0, 0x8C, 0xD0, 0x8D, ++ 0xD0, 0x8E, 0xD0, 0x8F, 0xD1, 0xA0, 0xD1, 0xA2, ++ 0xD1, 0xA4, 0xD1, 0xA6, 0xD1, 0xA8, 0xD1, 0xAA, ++ 0xD1, 0xAC, 0xD1, 0xAE, 0xD1, 0xB0, 0xD1, 0xB2, ++ 0xD1, 0xB4, 0xD1, 0xB6, 0xD1, 0xB8, 0xD1, 0xBA, ++ 0xD1, 0xBC, 0xD1, 0xBE, 0xD2, 0x80, 0xD2, 0x8A, ++ 0xD2, 0x8C, 0xD2, 0x8E, 0xD2, 0x90, 0xD2, 0x92, ++ 0xD2, 0x94, 0xD2, 0x96, 0xD2, 0x98, 0xD2, 0x9A, ++ 0xD2, 0x9C, 0xD2, 0x9E, 0xD2, 0xA0, 0xD2, 0xA2, ++ 0xD2, 0xA4, 0xD2, 0xA6, 0xD2, 0xA8, 0xD2, 0xAA, ++ 0xD2, 0xAC, 0xD2, 0xAE, 0xD2, 0xB0, 0xD2, 0xB2, ++ 0xD2, 0xB4, 0xD2, 0xB6, 0xD2, 0xB8, 0xD2, 0xBA, ++ 0xD2, 0xBC, 0xD2, 0xBE, 0xD3, 0x81, 0xD3, 0x83, ++ 0xD3, 0x85, 0xD3, 0x87, 0xD3, 0x89, 0xD3, 0x8B, ++ 0xD3, 0x8D, 0xD3, 0x80, 0xD3, 0x90, 0xD3, 0x92, ++ 0xD3, 0x94, 0xD3, 0x96, 0xD3, 0x98, 0xD3, 0x9A, ++ 0xD3, 0x9C, 0xD3, 0x9E, 0xD3, 0xA0, 0xD3, 0xA2, ++ 0xD3, 0xA4, 0xD3, 0xA6, 0xD3, 0xA8, 0xD3, 0xAA, ++ 0xD3, 0xAC, 0xD3, 0xAE, 0xD3, 0xB0, 0xD3, 0xB2, ++ 0xD3, 0xB4, 0xD3, 0xB6, 0xD3, 0xB8, 0xD3, 0xBA, ++ 0xD3, 0xBC, 0xD3, 0xBE, 0xD4, 0x80, 0xD4, 0x82, ++ 0xD4, 0x84, 0xD4, 0x86, 0xD4, 0x88, 0xD4, 0x8A, ++ 0xD4, 0x8C, 0xD4, 0x8E, 0xD4, 0x90, 0xD4, 0x92, ++ 0xD4, 0xB1, 0xD4, 0xB2, 0xD4, 0xB3, 0xD4, 0xB4, ++ 0xD4, 0xB5, 0xD4, 0xB6, 0xD4, 0xB7, 0xD4, 0xB8, ++ 0xD4, 0xB9, 0xD4, 0xBA, 0xD4, 0xBB, 0xD4, 0xBC, ++ 0xD4, 0xBD, 0xD4, 0xBE, 0xD4, 0xBF, 0xD5, 0x80, ++ 0xD5, 0x81, 0xD5, 0x82, 0xD5, 0x83, 0xD5, 0x84, ++ 0xD5, 0x85, 0xD5, 0x86, 0xD5, 0x87, 0xD5, 0x88, ++ 0xD5, 0x89, 0xD5, 0x8A, 0xD5, 0x8B, 0xD5, 0x8C, ++ 0xD5, 0x8D, 0xD5, 0x8E, 0xD5, 0x8F, 0xD5, 0x90, ++ 0xD5, 0x91, 0xD5, 0x92, 0xD5, 0x93, 0xD5, 0x94, ++ 0xD5, 0x95, 0xD5, 0x96, 0xE2, 0xB1, 0xA3, 0xE1, ++ 0xB8, 0x80, 0xE1, 0xB8, 0x82, 0xE1, 0xB8, 0x84, ++ 0xE1, 0xB8, 0x86, 0xE1, 0xB8, 0x88, 0xE1, 0xB8, ++ 0x8A, 0xE1, 0xB8, 0x8C, 0xE1, 0xB8, 0x8E, 0xE1, ++ 0xB8, 0x90, 0xE1, 0xB8, 0x92, 0xE1, 0xB8, 0x94, ++ 0xE1, 0xB8, 0x96, 0xE1, 0xB8, 0x98, 0xE1, 0xB8, ++ 0x9A, 0xE1, 0xB8, 0x9C, 0xE1, 0xB8, 0x9E, 0xE1, ++ 0xB8, 0xA0, 0xE1, 0xB8, 0xA2, 0xE1, 0xB8, 0xA4, ++ 0xE1, 0xB8, 0xA6, 0xE1, 0xB8, 0xA8, 0xE1, 0xB8, ++ 0xAA, 0xE1, 0xB8, 0xAC, 0xE1, 0xB8, 0xAE, 0xE1, ++ 0xB8, 0xB0, 0xE1, 0xB8, 0xB2, 0xE1, 0xB8, 0xB4, ++ 0xE1, 0xB8, 0xB6, 0xE1, 0xB8, 0xB8, 0xE1, 0xB8, ++ 0xBA, 0xE1, 0xB8, 0xBC, 0xE1, 0xB8, 0xBE, 0xE1, ++ 0xB9, 0x80, 0xE1, 0xB9, 0x82, 0xE1, 0xB9, 0x84, ++ 0xE1, 0xB9, 0x86, 0xE1, 0xB9, 0x88, 0xE1, 0xB9, ++ 0x8A, 0xE1, 0xB9, 0x8C, 0xE1, 0xB9, 0x8E, 0xE1, ++ 0xB9, 0x90, 0xE1, 0xB9, 0x92, 0xE1, 0xB9, 0x94, ++ 0xE1, 0xB9, 0x96, 0xE1, 0xB9, 0x98, 0xE1, 0xB9, ++ 0x9A, 0xE1, 0xB9, 0x9C, 0xE1, 0xB9, 0x9E, 0xE1, ++ 0xB9, 0xA0, 0xE1, 0xB9, 0xA2, 0xE1, 0xB9, 0xA4, ++ 0xE1, 0xB9, 0xA6, 0xE1, 0xB9, 0xA8, 0xE1, 0xB9, ++ 0xAA, 0xE1, 0xB9, 0xAC, 0xE1, 0xB9, 0xAE, 0xE1, ++ 0xB9, 0xB0, 0xE1, 0xB9, 0xB2, 0xE1, 0xB9, 0xB4, ++ 0xE1, 0xB9, 0xB6, 0xE1, 0xB9, 0xB8, 0xE1, 0xB9, ++ 0xBA, 0xE1, 0xB9, 0xBC, 0xE1, 0xB9, 0xBE, 0xE1, ++ 0xBA, 0x80, 0xE1, 0xBA, 0x82, 0xE1, 0xBA, 0x84, ++ 0xE1, 0xBA, 0x86, 0xE1, 0xBA, 0x88, 0xE1, 0xBA, ++ 0x8A, 0xE1, 0xBA, 0x8C, 0xE1, 0xBA, 0x8E, 
0xE1, ++ 0xBA, 0x90, 0xE1, 0xBA, 0x92, 0xE1, 0xBA, 0x94, ++ 0xE1, 0xB9, 0xA0, 0xE1, 0xBA, 0xA0, 0xE1, 0xBA, ++ 0xA2, 0xE1, 0xBA, 0xA4, 0xE1, 0xBA, 0xA6, 0xE1, ++ 0xBA, 0xA8, 0xE1, 0xBA, 0xAA, 0xE1, 0xBA, 0xAC, ++ 0xE1, 0xBA, 0xAE, 0xE1, 0xBA, 0xB0, 0xE1, 0xBA, ++ 0xB2, 0xE1, 0xBA, 0xB4, 0xE1, 0xBA, 0xB6, 0xE1, ++ 0xBA, 0xB8, 0xE1, 0xBA, 0xBA, 0xE1, 0xBA, 0xBC, ++ 0xE1, 0xBA, 0xBE, 0xE1, 0xBB, 0x80, 0xE1, 0xBB, ++ 0x82, 0xE1, 0xBB, 0x84, 0xE1, 0xBB, 0x86, 0xE1, ++ 0xBB, 0x88, 0xE1, 0xBB, 0x8A, 0xE1, 0xBB, 0x8C, ++ 0xE1, 0xBB, 0x8E, 0xE1, 0xBB, 0x90, 0xE1, 0xBB, ++ 0x92, 0xE1, 0xBB, 0x94, 0xE1, 0xBB, 0x96, 0xE1, ++ 0xBB, 0x98, 0xE1, 0xBB, 0x9A, 0xE1, 0xBB, 0x9C, ++ 0xE1, 0xBB, 0x9E, 0xE1, 0xBB, 0xA0, 0xE1, 0xBB, ++ 0xA2, 0xE1, 0xBB, 0xA4, 0xE1, 0xBB, 0xA6, 0xE1, ++ 0xBB, 0xA8, 0xE1, 0xBB, 0xAA, 0xE1, 0xBB, 0xAC, ++ 0xE1, 0xBB, 0xAE, 0xE1, 0xBB, 0xB0, 0xE1, 0xBB, ++ 0xB2, 0xE1, 0xBB, 0xB4, 0xE1, 0xBB, 0xB6, 0xE1, ++ 0xBB, 0xB8, 0xE1, 0xBC, 0x88, 0xE1, 0xBC, 0x89, ++ 0xE1, 0xBC, 0x8A, 0xE1, 0xBC, 0x8B, 0xE1, 0xBC, ++ 0x8C, 0xE1, 0xBC, 0x8D, 0xE1, 0xBC, 0x8E, 0xE1, ++ 0xBC, 0x8F, 0xE1, 0xBC, 0x98, 0xE1, 0xBC, 0x99, ++ 0xE1, 0xBC, 0x9A, 0xE1, 0xBC, 0x9B, 0xE1, 0xBC, ++ 0x9C, 0xE1, 0xBC, 0x9D, 0xE1, 0xBC, 0xA8, 0xE1, ++ 0xBC, 0xA9, 0xE1, 0xBC, 0xAA, 0xE1, 0xBC, 0xAB, ++ 0xE1, 0xBC, 0xAC, 0xE1, 0xBC, 0xAD, 0xE1, 0xBC, ++ 0xAE, 0xE1, 0xBC, 0xAF, 0xE1, 0xBC, 0xB8, 0xE1, ++ 0xBC, 0xB9, 0xE1, 0xBC, 0xBA, 0xE1, 0xBC, 0xBB, ++ 0xE1, 0xBC, 0xBC, 0xE1, 0xBC, 0xBD, 0xE1, 0xBC, ++ 0xBE, 0xE1, 0xBC, 0xBF, 0xE1, 0xBD, 0x88, 0xE1, ++ 0xBD, 0x89, 0xE1, 0xBD, 0x8A, 0xE1, 0xBD, 0x8B, ++ 0xE1, 0xBD, 0x8C, 0xE1, 0xBD, 0x8D, 0xE1, 0xBD, ++ 0x99, 0xE1, 0xBD, 0x9B, 0xE1, 0xBD, 0x9D, 0xE1, ++ 0xBD, 0x9F, 0xE1, 0xBD, 0xA8, 0xE1, 0xBD, 0xA9, ++ 0xE1, 0xBD, 0xAA, 0xE1, 0xBD, 0xAB, 0xE1, 0xBD, ++ 0xAC, 0xE1, 0xBD, 0xAD, 0xE1, 0xBD, 0xAE, 0xE1, ++ 0xBD, 0xAF, 0xE1, 0xBE, 0xBA, 0xE1, 0xBE, 0xBB, ++ 0xE1, 0xBF, 0x88, 0xE1, 0xBF, 0x89, 0xE1, 0xBF, ++ 0x8A, 0xE1, 0xBF, 0x8B, 0xE1, 0xBF, 0x9A, 0xE1, ++ 0xBF, 0x9B, 0xE1, 0xBF, 0xB8, 0xE1, 0xBF, 0xB9, ++ 0xE1, 0xBF, 0xAA, 0xE1, 0xBF, 0xAB, 0xE1, 0xBF, ++ 0xBA, 0xE1, 0xBF, 0xBB, 0xE1, 0xBE, 0x88, 0xE1, ++ 0xBE, 0x89, 0xE1, 0xBE, 0x8A, 0xE1, 0xBE, 0x8B, ++ 0xE1, 0xBE, 0x8C, 0xE1, 0xBE, 0x8D, 0xE1, 0xBE, ++ 0x8E, 0xE1, 0xBE, 0x8F, 0xE1, 0xBE, 0x98, 0xE1, ++ 0xBE, 0x99, 0xE1, 0xBE, 0x9A, 0xE1, 0xBE, 0x9B, ++ 0xE1, 0xBE, 0x9C, 0xE1, 0xBE, 0x9D, 0xE1, 0xBE, ++ 0x9E, 0xE1, 0xBE, 0x9F, 0xE1, 0xBE, 0xA8, 0xE1, ++ 0xBE, 0xA9, 0xE1, 0xBE, 0xAA, 0xE1, 0xBE, 0xAB, ++ 0xE1, 0xBE, 0xAC, 0xE1, 0xBE, 0xAD, 0xE1, 0xBE, ++ 0xAE, 0xE1, 0xBE, 0xAF, 0xE1, 0xBE, 0xB8, 0xE1, ++ 0xBE, 0xB9, 0xE1, 0xBE, 0xBC, 0xCE, 0x99, 0xE1, ++ 0xBF, 0x8C, 0xE1, 0xBF, 0x98, 0xE1, 0xBF, 0x99, ++ 0xE1, 0xBF, 0xA8, 0xE1, 0xBF, 0xA9, 0xE1, 0xBF, ++ 0xAC, 0xE1, 0xBF, 0xBC, 0xE2, 0x84, 0xB2, 0xE2, ++ 0x85, 0xA0, 0xE2, 0x85, 0xA1, 0xE2, 0x85, 0xA2, ++ 0xE2, 0x85, 0xA3, 0xE2, 0x85, 0xA4, 0xE2, 0x85, ++ 0xA5, 0xE2, 0x85, 0xA6, 0xE2, 0x85, 0xA7, 0xE2, ++ 0x85, 0xA8, 0xE2, 0x85, 0xA9, 0xE2, 0x85, 0xAA, ++ 0xE2, 0x85, 0xAB, 0xE2, 0x85, 0xAC, 0xE2, 0x85, ++ 0xAD, 0xE2, 0x85, 0xAE, 0xE2, 0x85, 0xAF, 0xE2, ++ 0x86, 0x83, 0xE2, 0x92, 0xB6, 0xE2, 0x92, 0xB7, ++ 0xE2, 0x92, 0xB8, 0xE2, 0x92, 0xB9, 0xE2, 0x92, ++ 0xBA, 0xE2, 0x92, 0xBB, 0xE2, 0x92, 0xBC, 0xE2, ++ 0x92, 0xBD, 0xE2, 0x92, 0xBE, 0xE2, 0x92, 0xBF, ++ 0xE2, 0x93, 0x80, 0xE2, 0x93, 0x81, 0xE2, 0x93, ++ 0x82, 0xE2, 0x93, 0x83, 0xE2, 0x93, 0x84, 0xE2, ++ 0x93, 0x85, 0xE2, 0x93, 0x86, 0xE2, 0x93, 0x87, ++ 0xE2, 0x93, 0x88, 0xE2, 0x93, 0x89, 0xE2, 0x93, ++ 0x8A, 0xE2, 0x93, 0x8B, 
0xE2, 0x93, 0x8C, 0xE2, ++ 0x93, 0x8D, 0xE2, 0x93, 0x8E, 0xE2, 0x93, 0x8F, ++ 0xE2, 0xB0, 0x80, 0xE2, 0xB0, 0x81, 0xE2, 0xB0, ++ 0x82, 0xE2, 0xB0, 0x83, 0xE2, 0xB0, 0x84, 0xE2, ++ 0xB0, 0x85, 0xE2, 0xB0, 0x86, 0xE2, 0xB0, 0x87, ++ 0xE2, 0xB0, 0x88, 0xE2, 0xB0, 0x89, 0xE2, 0xB0, ++ 0x8A, 0xE2, 0xB0, 0x8B, 0xE2, 0xB0, 0x8C, 0xE2, ++ 0xB0, 0x8D, 0xE2, 0xB0, 0x8E, 0xE2, 0xB0, 0x8F, ++ 0xE2, 0xB0, 0x90, 0xE2, 0xB0, 0x91, 0xE2, 0xB0, ++ 0x92, 0xE2, 0xB0, 0x93, 0xE2, 0xB0, 0x94, 0xE2, ++ 0xB0, 0x95, 0xE2, 0xB0, 0x96, 0xE2, 0xB0, 0x97, ++ 0xE2, 0xB0, 0x98, 0xE2, 0xB0, 0x99, 0xE2, 0xB0, ++ 0x9A, 0xE2, 0xB0, 0x9B, 0xE2, 0xB0, 0x9C, 0xE2, ++ 0xB0, 0x9D, 0xE2, 0xB0, 0x9E, 0xE2, 0xB0, 0x9F, ++ 0xE2, 0xB0, 0xA0, 0xE2, 0xB0, 0xA1, 0xE2, 0xB0, ++ 0xA2, 0xE2, 0xB0, 0xA3, 0xE2, 0xB0, 0xA4, 0xE2, ++ 0xB0, 0xA5, 0xE2, 0xB0, 0xA6, 0xE2, 0xB0, 0xA7, ++ 0xE2, 0xB0, 0xA8, 0xE2, 0xB0, 0xA9, 0xE2, 0xB0, ++ 0xAA, 0xE2, 0xB0, 0xAB, 0xE2, 0xB0, 0xAC, 0xE2, ++ 0xB0, 0xAD, 0xE2, 0xB0, 0xAE, 0xE2, 0xB1, 0xA0, ++ 0xC8, 0xBA, 0xC8, 0xBE, 0xE2, 0xB1, 0xA7, 0xE2, ++ 0xB1, 0xA9, 0xE2, 0xB1, 0xAB, 0xE2, 0xB1, 0xB5, ++ 0xE2, 0xB2, 0x80, 0xE2, 0xB2, 0x82, 0xE2, 0xB2, ++ 0x84, 0xE2, 0xB2, 0x86, 0xE2, 0xB2, 0x88, 0xE2, ++ 0xB2, 0x8A, 0xE2, 0xB2, 0x8C, 0xE2, 0xB2, 0x8E, ++ 0xE2, 0xB2, 0x90, 0xE2, 0xB2, 0x92, 0xE2, 0xB2, ++ 0x94, 0xE2, 0xB2, 0x96, 0xE2, 0xB2, 0x98, 0xE2, ++ 0xB2, 0x9A, 0xE2, 0xB2, 0x9C, 0xE2, 0xB2, 0x9E, ++ 0xE2, 0xB2, 0xA0, 0xE2, 0xB2, 0xA2, 0xE2, 0xB2, ++ 0xA4, 0xE2, 0xB2, 0xA6, 0xE2, 0xB2, 0xA8, 0xE2, ++ 0xB2, 0xAA, 0xE2, 0xB2, 0xAC, 0xE2, 0xB2, 0xAE, ++ 0xE2, 0xB2, 0xB0, 0xE2, 0xB2, 0xB2, 0xE2, 0xB2, ++ 0xB4, 0xE2, 0xB2, 0xB6, 0xE2, 0xB2, 0xB8, 0xE2, ++ 0xB2, 0xBA, 0xE2, 0xB2, 0xBC, 0xE2, 0xB2, 0xBE, ++ 0xE2, 0xB3, 0x80, 0xE2, 0xB3, 0x82, 0xE2, 0xB3, ++ 0x84, 0xE2, 0xB3, 0x86, 0xE2, 0xB3, 0x88, 0xE2, ++ 0xB3, 0x8A, 0xE2, 0xB3, 0x8C, 0xE2, 0xB3, 0x8E, ++ 0xE2, 0xB3, 0x90, 0xE2, 0xB3, 0x92, 0xE2, 0xB3, ++ 0x94, 0xE2, 0xB3, 0x96, 0xE2, 0xB3, 0x98, 0xE2, ++ 0xB3, 0x9A, 0xE2, 0xB3, 0x9C, 0xE2, 0xB3, 0x9E, ++ 0xE2, 0xB3, 0xA0, 0xE2, 0xB3, 0xA2, 0xE1, 0x82, ++ 0xA0, 0xE1, 0x82, 0xA1, 0xE1, 0x82, 0xA2, 0xE1, ++ 0x82, 0xA3, 0xE1, 0x82, 0xA4, 0xE1, 0x82, 0xA5, ++ 0xE1, 0x82, 0xA6, 0xE1, 0x82, 0xA7, 0xE1, 0x82, ++ 0xA8, 0xE1, 0x82, 0xA9, 0xE1, 0x82, 0xAA, 0xE1, ++ 0x82, 0xAB, 0xE1, 0x82, 0xAC, 0xE1, 0x82, 0xAD, ++ 0xE1, 0x82, 0xAE, 0xE1, 0x82, 0xAF, 0xE1, 0x82, ++ 0xB0, 0xE1, 0x82, 0xB1, 0xE1, 0x82, 0xB2, 0xE1, ++ 0x82, 0xB3, 0xE1, 0x82, 0xB4, 0xE1, 0x82, 0xB5, ++ 0xE1, 0x82, 0xB6, 0xE1, 0x82, 0xB7, 0xE1, 0x82, ++ 0xB8, 0xE1, 0x82, 0xB9, 0xE1, 0x82, 0xBA, 0xE1, ++ 0x82, 0xBB, 0xE1, 0x82, 0xBC, 0xE1, 0x82, 0xBD, ++ 0xE1, 0x82, 0xBE, 0xE1, 0x82, 0xBF, 0xE1, 0x83, ++ 0x80, 0xE1, 0x83, 0x81, 0xE1, 0x83, 0x82, 0xE1, ++ 0x83, 0x83, 0xE1, 0x83, 0x84, 0xE1, 0x83, 0x85, ++ 0xEF, 0xBC, 0xA1, 0xEF, 0xBC, 0xA2, 0xEF, 0xBC, ++ 0xA3, 0xEF, 0xBC, 0xA4, 0xEF, 0xBC, 0xA5, 0xEF, ++ 0xBC, 0xA6, 0xEF, 0xBC, 0xA7, 0xEF, 0xBC, 0xA8, ++ 0xEF, 0xBC, 0xA9, 0xEF, 0xBC, 0xAA, 0xEF, 0xBC, ++ 0xAB, 0xEF, 0xBC, 0xAC, 0xEF, 0xBC, 0xAD, 0xEF, ++ 0xBC, 0xAE, 0xEF, 0xBC, 0xAF, 0xEF, 0xBC, 0xB0, ++ 0xEF, 0xBC, 0xB1, 0xEF, 0xBC, 0xB2, 0xEF, 0xBC, ++ 0xB3, 0xEF, 0xBC, 0xB4, 0xEF, 0xBC, 0xB5, 0xEF, ++ 0xBC, 0xB6, 0xEF, 0xBC, 0xB7, 0xEF, 0xBC, 0xB8, ++ 0xEF, 0xBC, 0xB9, 0xEF, 0xBC, 0xBA, 0xF0, 0x90, ++ 0x90, 0x80, 0xF0, 0x90, 0x90, 0x81, 0xF0, 0x90, ++ 0x90, 0x82, 0xF0, 0x90, 0x90, 0x83, 0xF0, 0x90, ++ 0x90, 0x84, 0xF0, 0x90, 0x90, 0x85, 0xF0, 0x90, ++ 0x90, 0x86, 0xF0, 0x90, 0x90, 0x87, 0xF0, 0x90, ++ 0x90, 0x88, 0xF0, 0x90, 0x90, 0x89, 0xF0, 0x90, ++ 0x90, 
0x8A, 0xF0, 0x90, 0x90, 0x8B, 0xF0, 0x90, ++ 0x90, 0x8C, 0xF0, 0x90, 0x90, 0x8D, 0xF0, 0x90, ++ 0x90, 0x8E, 0xF0, 0x90, 0x90, 0x8F, 0xF0, 0x90, ++ 0x90, 0x90, 0xF0, 0x90, 0x90, 0x91, 0xF0, 0x90, ++ 0x90, 0x92, 0xF0, 0x90, 0x90, 0x93, 0xF0, 0x90, ++ 0x90, 0x94, 0xF0, 0x90, 0x90, 0x95, 0xF0, 0x90, ++ 0x90, 0x96, 0xF0, 0x90, 0x90, 0x97, 0xF0, 0x90, ++ 0x90, 0x98, 0xF0, 0x90, 0x90, 0x99, 0xF0, 0x90, ++ 0x90, 0x9A, 0xF0, 0x90, 0x90, 0x9B, 0xF0, 0x90, ++ 0x90, 0x9C, 0xF0, 0x90, 0x90, 0x9D, 0xF0, 0x90, ++ 0x90, 0x9E, 0xF0, 0x90, 0x90, 0x9F, 0xF0, 0x90, ++ 0x90, 0xA0, 0xF0, 0x90, 0x90, 0xA1, 0xF0, 0x90, ++ 0x90, 0xA2, 0xF0, 0x90, 0x90, 0xA3, 0xF0, 0x90, ++ 0x90, 0xA4, 0xF0, 0x90, 0x90, 0xA5, 0xF0, 0x90, ++ 0x90, 0xA6, 0xF0, 0x90, 0x90, 0xA7, ++ }, ++}; ++ ++#undef N_ ++#undef FIL_ ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_U8_TEXTPREP_DATA_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/u8_textprep.h linux-3.2.33-go/include/zfs/sys/u8_textprep.h +--- linux-3.2.33-go.orig/include/zfs/sys/u8_textprep.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/u8_textprep.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,113 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _SYS_U8_TEXTPREP_H ++#define _SYS_U8_TEXTPREP_H ++ ++ ++ ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * Unicode encoding conversion functions and their macros. ++ */ ++#define UCONV_IN_BIG_ENDIAN 0x0001 ++#define UCONV_OUT_BIG_ENDIAN 0x0002 ++#define UCONV_IN_SYSTEM_ENDIAN 0x0004 ++#define UCONV_OUT_SYSTEM_ENDIAN 0x0008 ++#define UCONV_IN_LITTLE_ENDIAN 0x0010 ++#define UCONV_OUT_LITTLE_ENDIAN 0x0020 ++#define UCONV_IGNORE_NULL 0x0040 ++#define UCONV_IN_ACCEPT_BOM 0x0080 ++#define UCONV_OUT_EMIT_BOM 0x0100 ++ ++extern int uconv_u16tou32(const uint16_t *, size_t *, uint32_t *, size_t *, ++ int); ++extern int uconv_u16tou8(const uint16_t *, size_t *, uchar_t *, size_t *, int); ++extern int uconv_u32tou16(const uint32_t *, size_t *, uint16_t *, size_t *, ++ int); ++extern int uconv_u32tou8(const uint32_t *, size_t *, uchar_t *, size_t *, int); ++extern int uconv_u8tou16(const uchar_t *, size_t *, uint16_t *, size_t *, int); ++extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int); ++ ++/* ++ * UTF-8 text preparation functions and their macros. ++ * ++ * Among the macros defined, U8_CANON_DECOMP, U8_COMPAT_DECOMP, and ++ * U8_CANON_COMP are not public interfaces and must not be used directly ++ * at the flag input argument. 
++ */ ++#define U8_STRCMP_CS (0x00000001) ++#define U8_STRCMP_CI_UPPER (0x00000002) ++#define U8_STRCMP_CI_LOWER (0x00000004) ++ ++#define U8_CANON_DECOMP (0x00000010) ++#define U8_COMPAT_DECOMP (0x00000020) ++#define U8_CANON_COMP (0x00000040) ++ ++#define U8_STRCMP_NFD (U8_CANON_DECOMP) ++#define U8_STRCMP_NFC (U8_CANON_DECOMP | U8_CANON_COMP) ++#define U8_STRCMP_NFKD (U8_COMPAT_DECOMP) ++#define U8_STRCMP_NFKC (U8_COMPAT_DECOMP | U8_CANON_COMP) ++ ++#define U8_TEXTPREP_TOUPPER (U8_STRCMP_CI_UPPER) ++#define U8_TEXTPREP_TOLOWER (U8_STRCMP_CI_LOWER) ++ ++#define U8_TEXTPREP_NFD (U8_STRCMP_NFD) ++#define U8_TEXTPREP_NFC (U8_STRCMP_NFC) ++#define U8_TEXTPREP_NFKD (U8_STRCMP_NFKD) ++#define U8_TEXTPREP_NFKC (U8_STRCMP_NFKC) ++ ++#define U8_TEXTPREP_IGNORE_NULL (0x00010000) ++#define U8_TEXTPREP_IGNORE_INVALID (0x00020000) ++#define U8_TEXTPREP_NOWAIT (0x00040000) ++ ++#define U8_UNICODE_320 (0) ++#define U8_UNICODE_500 (1) ++#define U8_UNICODE_LATEST (U8_UNICODE_500) ++ ++#define U8_VALIDATE_ENTIRE (0x00100000) ++#define U8_VALIDATE_CHECK_ADDITIONAL (0x00200000) ++#define U8_VALIDATE_UCS2_RANGE (0x00400000) ++ ++#define U8_ILLEGAL_CHAR (-1) ++#define U8_OUT_OF_RANGE_CHAR (-2) ++ ++extern int u8_validate(char *, size_t, char **, int, int *); ++extern int u8_strcmp(const char *, const char *, size_t, int, size_t, int *); ++extern size_t u8_textprep_str(char *, size_t *, char *, size_t *, int, size_t, ++ int *); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_U8_TEXTPREP_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/uberblock.h linux-3.2.33-go/include/zfs/sys/uberblock.h +--- linux-3.2.33-go.orig/include/zfs/sys/uberblock.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/uberblock.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,46 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
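For orientation, a minimal sketch of how a caller might combine the comparison flags above with u8_strcmp(); the two sample strings, the use of 0 as "no length limit", and the flag combination are assumptions for illustration only, not taken from the patched sources:

    int err = 0;
    const char *name_a = "Caf\xC3\xA9";    /* "Café", precomposed e-acute */
    const char *name_b = "CAFE\xCC\x81";   /* "CAFÉ", E + combining acute */

    /* Case-insensitive (upper-case folding), NFC-normalized comparison. */
    int cmp = u8_strcmp(name_a, name_b, 0,
        U8_STRCMP_CI_UPPER | U8_STRCMP_NFC, U8_UNICODE_LATEST, &err);
    if (err == 0 && cmp == 0) {
        /* Both names denote the same string under this normalization. */
    }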
++ */ ++ ++#ifndef _SYS_UBERBLOCK_H ++#define _SYS_UBERBLOCK_H ++ ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++typedef struct uberblock uberblock_t; ++ ++extern int uberblock_verify(uberblock_t *ub); ++extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_UBERBLOCK_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/uberblock_impl.h linux-3.2.33-go/include/zfs/sys/uberblock_impl.h +--- linux-3.2.33-go.orig/include/zfs/sys/uberblock_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/uberblock_impl.h 2012-11-16 23:25:34.339039449 +0100 +@@ -0,0 +1,63 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_UBERBLOCK_IMPL_H ++#define _SYS_UBERBLOCK_IMPL_H ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * The uberblock version is incremented whenever an incompatible on-disk ++ * format change is made to the SPA, DMU, or ZAP. ++ * ++ * Note: the first two fields should never be moved. When a storage pool ++ * is opened, the uberblock must be read off the disk before the version ++ * can be checked. If the ub_version field is moved, we may not detect ++ * version mismatch. If the ub_magic field is moved, applications that ++ * expect the magic number in the first word won't work. ++ */ ++#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ ++#define UBERBLOCK_SHIFT 10 /* up to 1K */ ++ ++struct uberblock { ++ uint64_t ub_magic; /* UBERBLOCK_MAGIC */ ++ uint64_t ub_version; /* SPA_VERSION */ ++ uint64_t ub_txg; /* txg of last sync */ ++ uint64_t ub_guid_sum; /* sum of all vdev guids */ ++ uint64_t ub_timestamp; /* UTC time of last sync */ ++ blkptr_t ub_rootbp; /* MOS objset_phys_t */ ++ ++ /* highest SPA_VERSION supported by software that wrote this txg */ ++ uint64_t ub_software_version; ++}; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_UBERBLOCK_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/uio_impl.h linux-3.2.33-go/include/zfs/sys/uio_impl.h +--- linux-3.2.33-go.orig/include/zfs/sys/uio_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/uio_impl.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,49 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. 
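As a hedged illustration of the layout rule spelled out above (ub_magic and ub_version must stay in the first two words so they can be checked before anything else), a consistency check in the spirit of uberblock_verify() could begin like this; the helper name is hypothetical, and real code would also accept a byte-swapped magic for pools written on a foreign-endian host:

    static int
    ub_magic_ok(const uberblock_t *ub)
    {
        /* The magic is read and checked before any other field is trusted. */
        return (ub->ub_magic == UBERBLOCK_MAGIC);  /* 0x00bab10c, "oo-ba-bloc!" */
    }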
++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ ++/* All Rights Reserved */ ++ ++/* ++ * University Copyright- Copyright (c) 1982, 1986, 1988 ++ * The Regents of the University of California ++ * All Rights Reserved ++ * ++ * University Acknowledgment- Portions of this document are derived from ++ * software developed by the University of California, Berkeley, and its ++ * contributors. ++ */ ++ ++#ifndef _SYS_UIO_IMPL_H ++#define _SYS_UIO_IMPL_H ++ ++#include ++ ++extern int uiomove(void *, size_t, enum uio_rw, uio_t *); ++extern void uio_prefaultpages(ssize_t, uio_t *); ++extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *); ++extern void uioskip(uio_t *, size_t); ++ ++#endif /* _SYS_UIO_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/unique.h linux-3.2.33-go/include/zfs/sys/unique.h +--- linux-3.2.33-go.orig/include/zfs/sys/unique.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/unique.h 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,59 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _SYS_UNIQUE_H ++#define _SYS_UNIQUE_H ++ ++ ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* The number of significant bits in each unique value. */ ++#define UNIQUE_BITS 56 ++ ++void unique_init(void); ++void unique_fini(void); ++ ++/* ++ * Return a new unique value (which will not be uniquified against until ++ * it is unique_insert()-ed. ++ */ ++uint64_t unique_create(void); ++ ++/* Return a unique value, which equals the one passed in if possible. */ ++uint64_t unique_insert(uint64_t value); ++ ++/* Indicate that this value no longer needs to be uniquified against. 
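A brief lifecycle sketch for the unique-value API declared above, following the semantics the comments describe (create a candidate, reserve it with unique_insert(), release it with unique_remove()); the variable and its use are hypothetical:

    uint64_t id;

    id = unique_create();     /* candidate 56-bit value (see UNIQUE_BITS), not yet reserved */
    id = unique_insert(id);   /* reserve it; may return a different value if taken */
    /* ... id identifies the consumer for as long as it exists ... */
    unique_remove(id);        /* stop uniquifying against this value */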
*/ ++void unique_remove(uint64_t value); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_UNIQUE_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/uuid.h linux-3.2.33-go/include/zfs/sys/uuid.h +--- linux-3.2.33-go.orig/include/zfs/sys/uuid.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/uuid.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,94 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License, Version 1.0 only ++ * (the "License"). You may not use this file except in compliance ++ * with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _SYS_UUID_H ++#define _SYS_UUID_H ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * The copyright in this file is taken from the original Leach ++ * & Salz UUID specification, from which this implementation ++ * is derived. ++ */ ++ ++/* ++ * Copyright (c) 1990- 1993, 1996 Open Software Foundation, Inc. ++ * Copyright (c) 1989 by Hewlett-Packard Company, Palo Alto, Ca. & ++ * Digital Equipment Corporation, Maynard, Mass. Copyright (c) 1998 ++ * Microsoft. To anyone who acknowledges that this file is provided ++ * "AS IS" without any express or implied warranty: permission to use, ++ * copy, modify, and distribute this file for any purpose is hereby ++ * granted without fee, provided that the above copyright notices and ++ * this notice appears in all source code copies, and that none of the ++ * names of Open Software Foundation, Inc., Hewlett-Packard Company, ++ * or Digital Equipment Corporation be used in advertising or ++ * publicity pertaining to distribution of the software without ++ * specific, written prior permission. Neither Open Software ++ * Foundation, Inc., Hewlett-Packard Company, Microsoft, nor Digital ++ * Equipment Corporation makes any representations about the ++ * suitability of this software for any purpose. 
++ */ ++ ++#include ++#include ++ ++typedef struct { ++ uint8_t nodeID[6]; ++} uuid_node_t; ++ ++/* ++ * The uuid type used throughout when referencing uuids themselves ++ */ ++struct uuid { ++ uint32_t time_low; ++ uint16_t time_mid; ++ uint16_t time_hi_and_version; ++ uint8_t clock_seq_hi_and_reserved; ++ uint8_t clock_seq_low; ++ uint8_t node_addr[6]; ++}; ++ ++#define UUID_PRINTABLE_STRING_LENGTH 37 ++ ++/* ++ * Convert a uuid to/from little-endian format ++ */ ++#define UUID_LE_CONVERT(dest, src) \ ++{ \ ++ (dest) = (src); \ ++ (dest).time_low = LE_32((dest).time_low); \ ++ (dest).time_mid = LE_16((dest).time_mid); \ ++ (dest).time_hi_and_version = LE_16((dest).time_hi_and_version); \ ++} ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_UUID_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/vdev_disk.h linux-3.2.33-go/include/zfs/sys/vdev_disk.h +--- linux-3.2.33-go.orig/include/zfs/sys/vdev_disk.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/vdev_disk.h 2012-11-16 23:25:34.339039449 +0100 +@@ -0,0 +1,45 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * LLNL-CODE-403049. ++ */ ++ ++#ifndef _SYS_VDEV_DISK_H ++#define _SYS_VDEV_DISK_H ++ ++#ifdef _KERNEL ++#include ++ ++typedef struct vdev_disk { ++ ddi_devid_t vd_devid; ++ char *vd_minor; ++ struct block_device *vd_bdev; ++} vdev_disk_t; ++ ++extern int vdev_disk_physio(struct block_device *, caddr_t, ++ size_t, uint64_t, int); ++extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); ++ ++#endif /* _KERNEL */ ++#endif /* _SYS_VDEV_DISK_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/vdev_file.h linux-3.2.33-go/include/zfs/sys/vdev_file.h +--- linux-3.2.33-go.orig/include/zfs/sys/vdev_file.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/vdev_file.h 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,46 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License, Version 1.0 only ++ * (the "License"). You may not use this file except in compliance ++ * with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
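A small usage sketch for the UUID_LE_CONVERT() macro defined above: it copies the whole structure and byte-swaps only the three multi-byte time fields, leaving the single-byte clock_seq_* and node_addr fields untouched. The variable names are illustrative:

    struct uuid native_uuid;   /* filled in elsewhere, in native byte order */
    struct uuid le_copy;

    UUID_LE_CONVERT(le_copy, native_uuid);
    /* le_copy now holds the same UUID with time_low, time_mid and
     * time_hi_and_version stored little-endian. */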
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2005 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _SYS_VDEV_FILE_H ++#define _SYS_VDEV_FILE_H ++ ++ ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++typedef struct vdev_file { ++ vnode_t *vf_vnode; ++} vdev_file_t; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_VDEV_FILE_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/vdev.h linux-3.2.33-go/include/zfs/sys/vdev.h +--- linux-3.2.33-go.orig/include/zfs/sys/vdev.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/vdev.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,162 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2012 by Delphix. All rights reserved. 
++ */ ++ ++#ifndef _SYS_VDEV_H ++#define _SYS_VDEV_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++typedef enum vdev_dtl_type { ++ DTL_MISSING, /* 0% replication: no copies of the data */ ++ DTL_PARTIAL, /* less than 100% replication: some copies missing */ ++ DTL_SCRUB, /* unable to fully repair during scrub/resilver */ ++ DTL_OUTAGE, /* temporarily missing (used to attempt detach) */ ++ DTL_TYPES ++} vdev_dtl_type_t; ++ ++extern int zfs_nocacheflush; ++ ++extern int vdev_open(vdev_t *); ++extern void vdev_open_children(vdev_t *); ++extern boolean_t vdev_uses_zvols(vdev_t *); ++extern int vdev_validate(vdev_t *, boolean_t); ++extern void vdev_close(vdev_t *); ++extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); ++extern void vdev_reopen(vdev_t *); ++extern int vdev_validate_aux(vdev_t *vd); ++extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); ++ ++extern boolean_t vdev_is_bootable(vdev_t *vd); ++extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); ++extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); ++extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, ++ uint64_t txg, uint64_t size); ++extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, ++ uint64_t txg, uint64_t size); ++extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); ++extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, ++ int scrub_done); ++extern boolean_t vdev_dtl_required(vdev_t *vd); ++extern boolean_t vdev_resilver_needed(vdev_t *vd, ++ uint64_t *minp, uint64_t *maxp); ++ ++extern void vdev_hold(vdev_t *); ++extern void vdev_rele(vdev_t *); ++ ++extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); ++extern void vdev_metaslab_fini(vdev_t *vd); ++extern void vdev_metaslab_set_size(vdev_t *); ++extern void vdev_expand(vdev_t *vd, uint64_t txg); ++extern void vdev_split(vdev_t *vd); ++ ++ ++extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); ++extern void vdev_clear_stats(vdev_t *vd); ++extern void vdev_stat_update(zio_t *zio, uint64_t psize); ++extern void vdev_scan_stat_init(vdev_t *vd); ++extern void vdev_propagate_state(vdev_t *vd); ++extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, ++ vdev_aux_t aux); ++ ++extern void vdev_space_update(vdev_t *vd, ++ int64_t alloc_delta, int64_t defer_delta, int64_t space_delta); ++ ++extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); ++ ++extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); ++extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux); ++extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, ++ vdev_state_t *); ++extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); ++extern void vdev_clear(spa_t *spa, vdev_t *vd); ++ ++extern boolean_t vdev_is_dead(vdev_t *vd); ++extern boolean_t vdev_readable(vdev_t *vd); ++extern boolean_t vdev_writeable(vdev_t *vd); ++extern boolean_t vdev_allocatable(vdev_t *vd); ++extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); ++ ++extern void vdev_cache_init(vdev_t *vd); ++extern void vdev_cache_fini(vdev_t *vd); ++extern int vdev_cache_read(zio_t *zio); ++extern void vdev_cache_write(zio_t *zio); ++extern void vdev_cache_purge(vdev_t *vd); ++ ++extern void vdev_queue_init(vdev_t *vd); ++extern void vdev_queue_fini(vdev_t *vd); ++extern zio_t *vdev_queue_io(zio_t *zio); ++extern void vdev_queue_io_done(zio_t *zio); ++ ++extern void vdev_config_dirty(vdev_t *vd); ++extern void 
vdev_config_clean(vdev_t *vd); ++extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, ++ boolean_t); ++ ++extern void vdev_state_dirty(vdev_t *vd); ++extern void vdev_state_clean(vdev_t *vd); ++ ++typedef enum vdev_config_flag { ++ VDEV_CONFIG_SPARE = 1 << 0, ++ VDEV_CONFIG_L2CACHE = 1 << 1, ++ VDEV_CONFIG_REMOVING = 1 << 2 ++} vdev_config_flag_t; ++ ++extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); ++extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, ++ boolean_t getstats, vdev_config_flag_t flags); ++ ++/* ++ * Label routines ++ */ ++struct uberblock; ++extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset); ++extern int vdev_label_number(uint64_t psise, uint64_t offset); ++extern nvlist_t *vdev_label_read_config(vdev_t *vd); ++extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub); ++ ++typedef enum { ++ VDEV_LABEL_CREATE, /* create/add a new device */ ++ VDEV_LABEL_REPLACE, /* replace an existing device */ ++ VDEV_LABEL_SPARE, /* add a new hot spare */ ++ VDEV_LABEL_REMOVE, /* remove an existing device */ ++ VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */ ++ VDEV_LABEL_SPLIT /* generating new label for split-off dev */ ++} vdev_labeltype_t; ++ ++extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_VDEV_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/vdev_impl.h linux-3.2.33-go/include/zfs/sys/vdev_impl.h +--- linux-3.2.33-go.orig/include/zfs/sys/vdev_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/vdev_impl.h 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,334 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2012 by Delphix. All rights reserved. ++ */ ++ ++#ifndef _SYS_VDEV_IMPL_H ++#define _SYS_VDEV_IMPL_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * Virtual device descriptors. ++ * ++ * All storage pool operations go through the virtual device framework, ++ * which provides data replication and I/O scheduling. ++ */ ++ ++/* ++ * Forward declarations that lots of things need. 
++ */ ++typedef struct vdev_queue vdev_queue_t; ++typedef struct vdev_io vdev_io_t; ++typedef struct vdev_cache vdev_cache_t; ++typedef struct vdev_cache_entry vdev_cache_entry_t; ++ ++/* ++ * Virtual device operations ++ */ ++typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, ++ uint64_t *ashift); ++typedef void vdev_close_func_t(vdev_t *vd); ++typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); ++typedef int vdev_io_start_func_t(zio_t *zio); ++typedef void vdev_io_done_func_t(zio_t *zio); ++typedef void vdev_state_change_func_t(vdev_t *vd, int, int); ++typedef void vdev_hold_func_t(vdev_t *vd); ++typedef void vdev_rele_func_t(vdev_t *vd); ++ ++typedef struct vdev_ops { ++ vdev_open_func_t *vdev_op_open; ++ vdev_close_func_t *vdev_op_close; ++ vdev_asize_func_t *vdev_op_asize; ++ vdev_io_start_func_t *vdev_op_io_start; ++ vdev_io_done_func_t *vdev_op_io_done; ++ vdev_state_change_func_t *vdev_op_state_change; ++ vdev_hold_func_t *vdev_op_hold; ++ vdev_rele_func_t *vdev_op_rele; ++ char vdev_op_type[16]; ++ boolean_t vdev_op_leaf; ++} vdev_ops_t; ++ ++/* ++ * Virtual device properties ++ */ ++struct vdev_cache_entry { ++ char *ve_data; ++ uint64_t ve_offset; ++ uint64_t ve_lastused; ++ avl_node_t ve_offset_node; ++ avl_node_t ve_lastused_node; ++ uint32_t ve_hits; ++ uint16_t ve_missed_update; ++ zio_t *ve_fill_io; ++}; ++ ++struct vdev_cache { ++ avl_tree_t vc_offset_tree; ++ avl_tree_t vc_lastused_tree; ++ kmutex_t vc_lock; ++}; ++ ++struct vdev_queue { ++ avl_tree_t vq_deadline_tree; ++ avl_tree_t vq_read_tree; ++ avl_tree_t vq_write_tree; ++ avl_tree_t vq_pending_tree; ++ list_t vq_io_list; ++ kmutex_t vq_lock; ++}; ++ ++struct vdev_io { ++ char vi_buffer[SPA_MAXBLOCKSIZE]; /* Must be first */ ++ list_node_t vi_node; ++}; ++ ++/* ++ * Virtual device descriptor ++ */ ++struct vdev { ++ /* ++ * Common to all vdev types. ++ */ ++ uint64_t vdev_id; /* child number in vdev parent */ ++ uint64_t vdev_guid; /* unique ID for this vdev */ ++ uint64_t vdev_guid_sum; /* self guid + all child guids */ ++ uint64_t vdev_orig_guid; /* orig. guid prior to remove */ ++ uint64_t vdev_asize; /* allocatable device capacity */ ++ uint64_t vdev_min_asize; /* min acceptable asize */ ++ uint64_t vdev_max_asize; /* max acceptable asize */ ++ uint64_t vdev_ashift; /* block alignment shift */ ++ uint64_t vdev_state; /* see VDEV_STATE_* #defines */ ++ uint64_t vdev_prevstate; /* used when reopening a vdev */ ++ vdev_ops_t *vdev_ops; /* vdev operations */ ++ spa_t *vdev_spa; /* spa for this vdev */ ++ void *vdev_tsd; /* type-specific data */ ++ vnode_t *vdev_name_vp; /* vnode for pathname */ ++ vnode_t *vdev_devid_vp; /* vnode for devid */ ++ vdev_t *vdev_top; /* top-level vdev */ ++ vdev_t *vdev_parent; /* parent vdev */ ++ vdev_t **vdev_child; /* array of children */ ++ uint64_t vdev_children; /* number of children */ ++ space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */ ++ vdev_stat_t vdev_stat; /* virtual device statistics */ ++ boolean_t vdev_expanding; /* expand the vdev? */ ++ boolean_t vdev_reopening; /* reopen in progress? */ ++ int vdev_open_error; /* error on last open */ ++ kthread_t *vdev_open_thread; /* thread opening children */ ++ uint64_t vdev_crtxg; /* txg when top-level was added */ ++ ++ /* ++ * Top-level vdev state. 
++ */ ++ uint64_t vdev_ms_array; /* metaslab array object */ ++ uint64_t vdev_ms_shift; /* metaslab size shift */ ++ uint64_t vdev_ms_count; /* number of metaslabs */ ++ metaslab_group_t *vdev_mg; /* metaslab group */ ++ metaslab_t **vdev_ms; /* metaslab array */ ++ uint64_t vdev_pending_fastwrite; /* allocated fastwrites */ ++ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ ++ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ ++ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ ++ boolean_t vdev_remove_wanted; /* async remove wanted? */ ++ boolean_t vdev_probe_wanted; /* async probe wanted? */ ++ uint64_t vdev_removing; /* device is being removed? */ ++ list_node_t vdev_config_dirty_node; /* config dirty list */ ++ list_node_t vdev_state_dirty_node; /* state dirty list */ ++ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ ++ uint64_t vdev_islog; /* is an intent log device */ ++ uint64_t vdev_ishole; /* is a hole in the namespace */ ++ ++ /* ++ * Leaf vdev state. ++ */ ++ uint64_t vdev_psize; /* physical device capacity */ ++ space_map_obj_t vdev_dtl_smo; /* dirty time log space map obj */ ++ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ ++ uint64_t vdev_wholedisk; /* true if this is a whole disk */ ++ uint64_t vdev_offline; /* persistent offline state */ ++ uint64_t vdev_faulted; /* persistent faulted state */ ++ uint64_t vdev_degraded; /* persistent degraded state */ ++ uint64_t vdev_removed; /* persistent removed state */ ++ uint64_t vdev_resilvering; /* persistent resilvering state */ ++ uint64_t vdev_nparity; /* number of parity devices for raidz */ ++ char *vdev_path; /* vdev path (if any) */ ++ char *vdev_devid; /* vdev devid (if any) */ ++ char *vdev_physpath; /* vdev device path (if any) */ ++ char *vdev_fru; /* physical FRU location */ ++ uint64_t vdev_not_present; /* not present during import */ ++ uint64_t vdev_unspare; /* unspare when resilvering done */ ++ hrtime_t vdev_last_try; /* last reopen time */ ++ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ ++ boolean_t vdev_checkremove; /* temporary online test */ ++ boolean_t vdev_forcefault; /* force online fault */ ++ boolean_t vdev_splitting; /* split or repair in progress */ ++ boolean_t vdev_delayed_close; /* delayed device close? */ ++ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ ++ uint8_t vdev_detached; /* device detached? */ ++ uint8_t vdev_cant_read; /* vdev is failing all reads */ ++ uint8_t vdev_cant_write; /* vdev is failing all writes */ ++ uint64_t vdev_isspare; /* was a hot spare */ ++ uint64_t vdev_isl2cache; /* was a l2cache device */ ++ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ ++ vdev_cache_t vdev_cache; /* physical block cache */ ++ spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */ ++ zio_t *vdev_probe_zio; /* root of current probe */ ++ vdev_aux_t vdev_label_aux; /* on-disk aux state */ ++ ++ /* ++ * For DTrace to work in userland (libzpool) context, these fields must ++ * remain at the end of the structure. DTrace will use the kernel's ++ * CTF definition for 'struct vdev', and since the size of a kmutex_t is ++ * larger in userland, the offsets for the rest fields would be ++ * incorrect. 
++ */ ++ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ ++ kmutex_t vdev_stat_lock; /* vdev_stat */ ++ kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ ++}; ++ ++#define VDEV_RAIDZ_MAXPARITY 3 ++ ++#define VDEV_PAD_SIZE (8 << 10) ++/* 2 padding areas (vl_pad1 and vl_pad2) to skip */ ++#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 ++#define VDEV_PHYS_SIZE (112 << 10) ++#define VDEV_UBERBLOCK_RING (128 << 10) ++ ++#define VDEV_UBERBLOCK_SHIFT(vd) \ ++ MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT) ++#define VDEV_UBERBLOCK_COUNT(vd) \ ++ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) ++#define VDEV_UBERBLOCK_OFFSET(vd, n) \ ++ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) ++#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) ++ ++typedef struct vdev_phys { ++ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; ++ zio_eck_t vp_zbt; ++} vdev_phys_t; ++ ++typedef struct vdev_label { ++ char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ ++ char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ ++ vdev_phys_t vl_vdev_phys; /* 112K */ ++ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ ++} vdev_label_t; /* 256K total */ ++ ++/* ++ * vdev_dirty() flags ++ */ ++#define VDD_METASLAB 0x01 ++#define VDD_DTL 0x02 ++ ++/* ++ * Size and offset of embedded boot loader region on each label. ++ * The total size of the first two labels plus the boot area is 4MB. ++ */ ++#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) ++#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ ++ ++/* ++ * Size of label regions at the start and end of each leaf device. ++ */ ++#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) ++#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) ++#define VDEV_LABELS 4 ++ ++#define VDEV_ALLOC_LOAD 0 ++#define VDEV_ALLOC_ADD 1 ++#define VDEV_ALLOC_SPARE 2 ++#define VDEV_ALLOC_L2CACHE 3 ++#define VDEV_ALLOC_ROOTPOOL 4 ++#define VDEV_ALLOC_SPLIT 5 ++#define VDEV_ALLOC_ATTACH 6 ++ ++/* ++ * Allocate or free a vdev ++ */ ++extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, ++ vdev_ops_t *ops); ++extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, ++ vdev_t *parent, uint_t id, int alloctype); ++extern void vdev_free(vdev_t *vd); ++ ++/* ++ * Add or remove children and parents ++ */ ++extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); ++extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); ++extern void vdev_compact_children(vdev_t *pvd); ++extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); ++extern void vdev_remove_parent(vdev_t *cvd); ++ ++/* ++ * vdev sync load and sync ++ */ ++extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd); ++extern boolean_t vdev_log_state_valid(vdev_t *vd); ++extern void vdev_load(vdev_t *vd); ++extern void vdev_sync(vdev_t *vd, uint64_t txg); ++extern void vdev_sync_done(vdev_t *vd, uint64_t txg); ++extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); ++ ++/* ++ * Available vdev types. 
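To make the label-layout macros above concrete, a short worked example; the ashift values are assumptions, everything else follows directly from the definitions in this header and from UBERBLOCK_SHIFT (10) in uberblock_impl.h:

    /* Each on-disk label is 8K pad1 + 8K pad2 + 112K vdev_phys + 128K
     * uberblock ring = 256K, i.e. sizeof (vdev_label_t).
     *
     * For a top-level vdev with vdev_ashift == 12 (4K sectors):
     *   VDEV_UBERBLOCK_SHIFT(vd) = MAX(12, 10)  = 12
     *   VDEV_UBERBLOCK_SIZE(vd)  = 1ULL << 12   = 4K per slot
     *   VDEV_UBERBLOCK_COUNT(vd) = 128K >> 12   = 32 slots in the ring
     * With vdev_ashift == 9 the shift clamps to 10, giving 1K slots and
     * 128 of them.
     *
     * VDEV_LABEL_START_SIZE = 2 * 256K + 3.5M boot area = 4M, matching the
     * "first two labels plus the boot area is 4MB" comment above. */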
++ */ ++extern vdev_ops_t vdev_root_ops; ++extern vdev_ops_t vdev_mirror_ops; ++extern vdev_ops_t vdev_replacing_ops; ++extern vdev_ops_t vdev_raidz_ops; ++extern vdev_ops_t vdev_disk_ops; ++extern vdev_ops_t vdev_file_ops; ++extern vdev_ops_t vdev_missing_ops; ++extern vdev_ops_t vdev_hole_ops; ++extern vdev_ops_t vdev_spare_ops; ++ ++/* ++ * Common size functions ++ */ ++extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); ++extern uint64_t vdev_get_min_asize(vdev_t *vd); ++extern void vdev_set_min_asize(vdev_t *vd); ++ ++/* ++ * zdb uses this tunable, so it must be declared here to make lint happy. ++ */ ++extern int zfs_vdev_cache_size; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_VDEV_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/xvattr.h linux-3.2.33-go/include/zfs/sys/xvattr.h +--- linux-3.2.33-go.orig/include/zfs/sys/xvattr.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/xvattr.h 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,330 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ ++/* All Rights Reserved */ ++ ++/* ++ * University Copyright- Copyright (c) 1982, 1986, 1988 ++ * The Regents of the University of California ++ * All Rights Reserved ++ * ++ * University Acknowledgment- Portions of this document are derived from ++ * software developed by the University of California, Berkeley, and its ++ * contributors. ++ */ ++ ++#ifndef _SYS_XVATTR_H ++#define _SYS_XVATTR_H ++ ++#include ++ ++#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ ++ ++/* ++ * Structure of all optional attributes. ++ */ ++typedef struct xoptattr { ++ timestruc_t xoa_createtime; /* Create time of file */ ++ uint8_t xoa_archive; ++ uint8_t xoa_system; ++ uint8_t xoa_readonly; ++ uint8_t xoa_hidden; ++ uint8_t xoa_nounlink; ++ uint8_t xoa_immutable; ++ uint8_t xoa_appendonly; ++ uint8_t xoa_nodump; ++ uint8_t xoa_opaque; ++ uint8_t xoa_av_quarantined; ++ uint8_t xoa_av_modified; ++ uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ]; ++ uint8_t xoa_reparse; ++ uint64_t xoa_generation; ++ uint8_t xoa_offline; ++ uint8_t xoa_sparse; ++} xoptattr_t; ++ ++/* ++ * The xvattr structure is really a variable length structure that ++ * is made up of: ++ * - The classic vattr_t (xva_vattr) ++ * - a 32 bit quantity (xva_mapsize) that specifies the size of the ++ * attribute bitmaps in 32 bit words. ++ * - A pointer to the returned attribute bitmap (needed because the ++ * previous element, the requested attribute bitmap) is variable lenth. 
++ * - The requested attribute bitmap, which is an array of 32 bit words. ++ * Callers use the XVA_SET_REQ() macro to set the bits corresponding to ++ * the attributes that are being requested. ++ * - The returned attribute bitmap, which is an array of 32 bit words. ++ * File systems that support optional attributes use the XVA_SET_RTN() ++ * macro to set the bits corresponding to the attributes that are being ++ * returned. ++ * - The xoptattr_t structure which contains the attribute values ++ * ++ * xva_mapsize determines how many words in the attribute bitmaps. ++ * Immediately following the attribute bitmaps is the xoptattr_t. ++ * xva_getxoptattr() is used to get the pointer to the xoptattr_t ++ * section. ++ */ ++ ++#define XVA_MAPSIZE 3 /* Size of attr bitmaps */ ++#define XVA_MAGIC 0x78766174 /* Magic # for verification */ ++ ++/* ++ * The xvattr structure is an extensible structure which permits optional ++ * attributes to be requested/returned. File systems may or may not support ++ * optional attributes. They do so at their own discretion but if they do ++ * support optional attributes, they must register the VFSFT_XVATTR feature ++ * so that the optional attributes can be set/retrived. ++ * ++ * The fields of the xvattr structure are: ++ * ++ * xva_vattr - The first element of an xvattr is a legacy vattr structure ++ * which includes the common attributes. If AT_XVATTR is set in the va_mask ++ * then the entire structure is treated as an xvattr. If AT_XVATTR is not ++ * set, then only the xva_vattr structure can be used. ++ * ++ * xva_magic - 0x78766174 (hex for "xvat"). Magic number for verification. ++ * ++ * xva_mapsize - Size of requested and returned attribute bitmaps. ++ * ++ * xva_rtnattrmapp - Pointer to xva_rtnattrmap[]. We need this since the ++ * size of the array before it, xva_reqattrmap[], could change which means ++ * the location of xva_rtnattrmap[] could change. This will allow unbundled ++ * file systems to find the location of xva_rtnattrmap[] when the sizes change. ++ * ++ * xva_reqattrmap[] - Array of requested attributes. Attributes are ++ * represented by a specific bit in a specific element of the attribute ++ * map array. Callers set the bits corresponding to the attributes ++ * that the caller wants to get/set. ++ * ++ * xva_rtnattrmap[] - Array of attributes that the file system was able to ++ * process. Not all file systems support all optional attributes. This map ++ * informs the caller which attributes the underlying file system was able ++ * to set/get. (Same structure as the requested attributes array in terms ++ * of each attribute corresponding to specific bits and array elements.) ++ * ++ * xva_xoptattrs - Structure containing values of optional attributes. ++ * These values are only valid if the corresponding bits in xva_reqattrmap ++ * are set and the underlying file system supports those attributes. ++ */ ++typedef struct xvattr { ++ vattr_t xva_vattr; /* Embedded vattr structure */ ++ uint32_t xva_magic; /* Magic Number */ ++ uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */ ++ uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */ ++ uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */ ++ uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */ ++ xoptattr_t xva_xoptattrs; /* Optional attributes */ ++} xvattr_t; ++ ++/* ++ * Attribute bits used in the extensible attribute's (xva's) attribute ++ * bitmaps. Note that the bitmaps are made up of a variable length number ++ * of 32-bit words. 
The convention is to use XAT{n}_{attrname} where "n" ++ * is the element in the bitmap (starting at 1). This convention is for ++ * the convenience of the maintainer to keep track of which element each ++ * attribute belongs to. ++ * ++ * NOTE THAT CONSUMERS MUST *NOT* USE THE XATn_* DEFINES DIRECTLY. CONSUMERS ++ * MUST USE THE XAT_* DEFINES. ++ */ ++#define XAT0_INDEX 0LL /* Index into bitmap for XAT0 attrs */ ++#define XAT0_CREATETIME 0x00000001 /* Create time of file */ ++#define XAT0_ARCHIVE 0x00000002 /* Archive */ ++#define XAT0_SYSTEM 0x00000004 /* System */ ++#define XAT0_READONLY 0x00000008 /* Readonly */ ++#define XAT0_HIDDEN 0x00000010 /* Hidden */ ++#define XAT0_NOUNLINK 0x00000020 /* Nounlink */ ++#define XAT0_IMMUTABLE 0x00000040 /* immutable */ ++#define XAT0_APPENDONLY 0x00000080 /* appendonly */ ++#define XAT0_NODUMP 0x00000100 /* nodump */ ++#define XAT0_OPAQUE 0x00000200 /* opaque */ ++#define XAT0_AV_QUARANTINED 0x00000400 /* anti-virus quarantine */ ++#define XAT0_AV_MODIFIED 0x00000800 /* anti-virus modified */ ++#define XAT0_AV_SCANSTAMP 0x00001000 /* anti-virus scanstamp */ ++#define XAT0_REPARSE 0x00002000 /* FS reparse point */ ++#define XAT0_GEN 0x00004000 /* object generation number */ ++#define XAT0_OFFLINE 0x00008000 /* offline */ ++#define XAT0_SPARSE 0x00010000 /* sparse */ ++ ++#define XAT0_ALL_ATTRS (XAT0_CREATETIME|XAT0_ARCHIVE|XAT0_SYSTEM| \ ++ XAT0_READONLY|XAT0_HIDDEN|XAT0_NOUNLINK|XAT0_IMMUTABLE|XAT0_APPENDONLY| \ ++ XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED| XAT0_AV_MODIFIED| \ ++ XAT0_AV_SCANSTAMP|XAT0_REPARSE|XATO_GEN|XAT0_OFFLINE|XAT0_SPARSE) ++ ++/* Support for XAT_* optional attributes */ ++#define XVA_MASK 0xffffffff /* Used to mask off 32 bits */ ++#define XVA_SHFT 32 /* Used to shift index */ ++ ++/* ++ * Used to pry out the index and attribute bits from the XAT_* attributes ++ * defined below. Note that we're masking things down to 32 bits then ++ * casting to uint32_t. ++ */ ++#define XVA_INDEX(attr) ((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK)) ++#define XVA_ATTRBIT(attr) ((uint32_t)((attr) & XVA_MASK)) ++ ++/* ++ * The following defines present a "flat namespace" so that consumers don't ++ * need to keep track of which element belongs to which bitmap entry. 
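A short worked example of the index/bit packing that XVA_INDEX() and XVA_ATTRBIT() decompose; the values come from the XAT0_* and XVA_* definitions above, and the variable is hypothetical:

    /* A "flat" attribute packs the bitmap element number into the upper
     * 32 bits and the attribute bit into the lower 32 bits. */
    uint64_t attr = (XAT0_INDEX << XVA_SHFT) | XAT0_READONLY;   /* == 0x8 */

    /* XVA_INDEX(attr)   == 0   -> element 0 of the request/return bitmaps */
    /* XVA_ATTRBIT(attr) == 0x8 -> the XAT0_READONLY bit within element 0  */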
++ * ++ * NOTE THAT THESE MUST NEVER BE OR-ed TOGETHER ++ */ ++#define XAT_CREATETIME ((XAT0_INDEX << XVA_SHFT) | XAT0_CREATETIME) ++#define XAT_ARCHIVE ((XAT0_INDEX << XVA_SHFT) | XAT0_ARCHIVE) ++#define XAT_SYSTEM ((XAT0_INDEX << XVA_SHFT) | XAT0_SYSTEM) ++#define XAT_READONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY) ++#define XAT_HIDDEN ((XAT0_INDEX << XVA_SHFT) | XAT0_HIDDEN) ++#define XAT_NOUNLINK ((XAT0_INDEX << XVA_SHFT) | XAT0_NOUNLINK) ++#define XAT_IMMUTABLE ((XAT0_INDEX << XVA_SHFT) | XAT0_IMMUTABLE) ++#define XAT_APPENDONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_APPENDONLY) ++#define XAT_NODUMP ((XAT0_INDEX << XVA_SHFT) | XAT0_NODUMP) ++#define XAT_OPAQUE ((XAT0_INDEX << XVA_SHFT) | XAT0_OPAQUE) ++#define XAT_AV_QUARANTINED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED) ++#define XAT_AV_MODIFIED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED) ++#define XAT_AV_SCANSTAMP ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP) ++#define XAT_REPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE) ++#define XAT_GEN ((XAT0_INDEX << XVA_SHFT) | XAT0_GEN) ++#define XAT_OFFLINE ((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE) ++#define XAT_SPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE) ++ ++/* ++ * The returned attribute map array (xva_rtnattrmap[]) is located past the ++ * requested attribute map array (xva_reqattrmap[]). Its location changes ++ * when the array sizes change. We use a separate pointer in a known location ++ * (xva_rtnattrmapp) to hold the location of xva_rtnattrmap[]. This is ++ * set in xva_init() ++ */ ++#define XVA_RTNATTRMAP(xvap) ((xvap)->xva_rtnattrmapp) ++ ++/* ++ * XVA_SET_REQ() sets an attribute bit in the proper element in the bitmap ++ * of requested attributes (xva_reqattrmap[]). ++ */ ++#define XVA_SET_REQ(xvap, attr) \ ++ ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ ++ ASSERT((xvap)->xva_magic == XVA_MAGIC); \ ++ (xvap)->xva_reqattrmap[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr) ++/* ++ * XVA_CLR_REQ() clears an attribute bit in the proper element in the bitmap ++ * of requested attributes (xva_reqattrmap[]). ++ */ ++#define XVA_CLR_REQ(xvap, attr) \ ++ ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ ++ ASSERT((xvap)->xva_magic == XVA_MAGIC); \ ++ (xvap)->xva_reqattrmap[XVA_INDEX(attr)] &= ~XVA_ATTRBIT(attr) ++ ++/* ++ * XVA_SET_RTN() sets an attribute bit in the proper element in the bitmap ++ * of returned attributes (xva_rtnattrmap[]). ++ */ ++#define XVA_SET_RTN(xvap, attr) \ ++ ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ ++ ASSERT((xvap)->xva_magic == XVA_MAGIC); \ ++ (XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr) ++ ++/* ++ * XVA_ISSET_REQ() checks the requested attribute bitmap (xva_reqattrmap[]) ++ * to see of the corresponding attribute bit is set. If so, returns non-zero. ++ */ ++#define XVA_ISSET_REQ(xvap, attr) \ ++ ((((xvap)->xva_vattr.va_mask | AT_XVATTR) && \ ++ ((xvap)->xva_magic == XVA_MAGIC) && \ ++ ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? \ ++ ((xvap)->xva_reqattrmap[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0) ++ ++/* ++ * XVA_ISSET_RTN() checks the returned attribute bitmap (xva_rtnattrmap[]) ++ * to see of the corresponding attribute bit is set. If so, returns non-zero. ++ */ ++#define XVA_ISSET_RTN(xvap, attr) \ ++ ((((xvap)->xva_vattr.va_mask | AT_XVATTR) && \ ++ ((xvap)->xva_magic == XVA_MAGIC) && \ ++ ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? 
\ ++ ((XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0) ++ ++/* ++ * Zero out the structure, set the size of the requested/returned bitmaps, ++ * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer ++ * to the returned attributes array. ++ */ ++static inline void ++xva_init(xvattr_t *xvap) ++{ ++ bzero(xvap, sizeof (xvattr_t)); ++ xvap->xva_mapsize = XVA_MAPSIZE; ++ xvap->xva_magic = XVA_MAGIC; ++ xvap->xva_vattr.va_mask = ATTR_XVATTR; ++ xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; ++} ++ ++/* ++ * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t ++ * structure. Otherwise, returns NULL. ++ */ ++static inline xoptattr_t * ++xva_getxoptattr(xvattr_t *xvap) ++{ ++ xoptattr_t *xoap = NULL; ++ if (xvap->xva_vattr.va_mask & AT_XVATTR) ++ xoap = &xvap->xva_xoptattrs; ++ return (xoap); ++} ++ ++#define MODEMASK 07777 /* mode bits plus permission bits */ ++#define PERMMASK 00777 /* permission bits */ ++ ++/* ++ * VOP_ACCESS flags ++ */ ++#define V_ACE_MASK 0x1 /* mask represents NFSv4 ACE permissions */ ++#define V_APPEND 0x2 /* want to do append only check */ ++ ++/* ++ * Structure used on VOP_GETSECATTR and VOP_SETSECATTR operations ++ */ ++ ++typedef struct vsecattr { ++ uint_t vsa_mask; /* See below */ ++ int vsa_aclcnt; /* ACL entry count */ ++ void *vsa_aclentp; /* pointer to ACL entries */ ++ int vsa_dfaclcnt; /* default ACL entry count */ ++ void *vsa_dfaclentp; /* pointer to default ACL entries */ ++ size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */ ++ uint_t vsa_aclflags; /* ACE ACL flags */ ++} vsecattr_t; ++ ++/* vsa_mask values */ ++#define VSA_ACL 0x0001 ++#define VSA_ACLCNT 0x0002 ++#define VSA_DFACL 0x0004 ++#define VSA_DFACLCNT 0x0008 ++#define VSA_ACE 0x0010 ++#define VSA_ACECNT 0x0020 ++#define VSA_ACE_ALLTYPES 0x0040 ++#define VSA_ACE_ACLFLAGS 0x0080 /* get/set ACE ACL flags */ ++ ++#endif /* _SYS_XVATTR_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zap.h linux-3.2.33-go/include/zfs/sys/zap.h +--- linux-3.2.33-go.orig/include/zfs/sys/zap.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zap.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,480 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_ZAP_H ++#define _SYS_ZAP_H ++ ++/* ++ * ZAP - ZFS Attribute Processor ++ * ++ * The ZAP is a module which sits on top of the DMU (Data Management ++ * Unit) and implements a higher-level storage primitive using DMU ++ * objects. Its primary consumer is the ZPL (ZFS Posix Layer). ++ * ++ * A "zapobj" is a DMU object which the ZAP uses to stores attributes. 
++ * Users should use only zap routines to access a zapobj - they should ++ * not access the DMU object directly using DMU routines. ++ * ++ * The attributes stored in a zapobj are name-value pairs. The name is ++ * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including ++ * terminating NULL). The value is an array of integers, which may be ++ * 1, 2, 4, or 8 bytes long. The total space used by the array (number ++ * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes. ++ * Note that an 8-byte integer value can be used to store the location ++ * (object number) of another dmu object (which may be itself a zapobj). ++ * Note that you can use a zero-length attribute to store a single bit ++ * of information - the attribute is present or not. ++ * ++ * The ZAP routines are thread-safe. However, you must observe the ++ * DMU's restriction that a transaction may not be operated on ++ * concurrently. ++ * ++ * Any of the routines that return an int may return an I/O error (EIO ++ * or ECHECKSUM). ++ * ++ * ++ * Implementation / Performance Notes: ++ * ++ * The ZAP is intended to operate most efficiently on attributes with ++ * short (49 bytes or less) names and single 8-byte values, for which ++ * the microzap will be used. The ZAP should be efficient enough so ++ * that the user does not need to cache these attributes. ++ * ++ * The ZAP's locking scheme makes its routines thread-safe. Operations ++ * on different zapobjs will be processed concurrently. Operations on ++ * the same zapobj which only read data will be processed concurrently. ++ * Operations on the same zapobj which modify data will be processed ++ * concurrently when there are many attributes in the zapobj (because ++ * the ZAP uses per-block locking - more than 128 * (number of cpus) ++ * small attributes will suffice). ++ */ ++ ++/* ++ * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C ++ * strings) for the names of attributes, rather than a byte string ++ * bounded by an explicit length. If some day we want to support names ++ * in character sets which have embedded zeros (eg. UTF-16, UTF-32), ++ * we'll have to add routines for using length-bounded strings. ++ */ ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * The matchtype specifies which entry will be accessed. ++ * MT_EXACT: only find an exact match (non-normalized) ++ * MT_FIRST: find the "first" normalized (case and Unicode ++ * form) match; the designated "first" match will not change as long ++ * as the set of entries with this normalization doesn't change ++ * MT_BEST: if there is an exact match, find that, otherwise find the ++ * first normalized match ++ */ ++typedef enum matchtype ++{ ++ MT_EXACT, ++ MT_BEST, ++ MT_FIRST ++} matchtype_t; ++ ++typedef enum zap_flags { ++ /* Use 64-bit hash value (serialized cursors will always use 64-bits) */ ++ ZAP_FLAG_HASH64 = 1 << 0, ++ /* Key is binary, not string (zap_add_uint64() can be used) */ ++ ZAP_FLAG_UINT64_KEY = 1 << 1, ++ /* ++ * First word of key (which must be an array of uint64) is ++ * already randomly distributed. ++ */ ++ ZAP_FLAG_PRE_HASHED_KEY = 1 << 2, ++} zap_flags_t; ++ ++/* ++ * Create a new zapobj with no attributes and return its object number. ++ * MT_EXACT will cause the zap object to only support MT_EXACT lookups, ++ * otherwise any matchtype can be used for lookups. ++ * ++ * normflags specifies what normalization will be done. 
values are: ++ * 0: no normalization (legacy on-disk format, supports MT_EXACT matching ++ * only) ++ * U8_TEXTPREP_TOLOWER: case normalization will be performed. ++ * MT_FIRST/MT_BEST matching will find entries that match without ++ * regard to case (eg. looking for "foo" can find an entry "Foo"). ++ * Eventually, other flags will permit unicode normalization as well. ++ */ ++uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); ++uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); ++uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, ++ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); ++ ++/* ++ * Create a new zapobj with no attributes from the given (unallocated) ++ * object number. ++ */ ++int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); ++int zap_create_claim_norm(objset_t *ds, uint64_t obj, ++ int normflags, dmu_object_type_t ot, ++ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); ++ ++/* ++ * The zapobj passed in must be a valid ZAP object for all of the ++ * following routines. ++ */ ++ ++/* ++ * Destroy this zapobj and all its attributes. ++ * ++ * Frees the object number using dmu_object_free. ++ */ ++int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); ++ ++/* ++ * Manipulate attributes. ++ * ++ * 'integer_size' is in bytes, and must be 1, 2, 4, or 8. ++ */ ++ ++/* ++ * Retrieve the contents of the attribute with the given name. ++ * ++ * If the requested attribute does not exist, the call will fail and ++ * return ENOENT. ++ * ++ * If 'integer_size' is smaller than the attribute's integer size, the ++ * call will fail and return EINVAL. ++ * ++ * If 'integer_size' is equal to or larger than the attribute's integer ++ * size, the call will succeed and return 0. * When converting to a ++ * larger integer size, the integers will be treated as unsigned (ie. no ++ * sign-extension will be performed). ++ * ++ * 'num_integers' is the length (in integers) of 'buf'. ++ * ++ * If the attribute is longer than the buffer, as many integers as will ++ * fit will be transferred to 'buf'. If the entire attribute was not ++ * transferred, the call will return EOVERFLOW. ++ * ++ * If rn_len is nonzero, realname will be set to the name of the found ++ * entry (which may be different from the requested name if matchtype is ++ * not MT_EXACT). ++ * ++ * If normalization_conflictp is not NULL, it will be set if there is ++ * another name with the same case/unicode normalized form. 
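++ *
++ * (Editorial aside, not part of the original comment: a minimal sketch of
++ * looking up a single 8-byte integer attribute by name, where "myattr"
++ * and the surrounding error handling are hypothetical:
++ *
++ *	uint64_t val;
++ *	int err = zap_lookup(os, zapobj, "myattr",
++ *	    sizeof (uint64_t), 1, &val);
++ *	if (err == ENOENT)
++ *		... the attribute does not exist ...
++ * )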
++ */ ++int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, ++ uint64_t integer_size, uint64_t num_integers, void *buf); ++int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, ++ uint64_t integer_size, uint64_t num_integers, void *buf, ++ matchtype_t mt, char *realname, int rn_len, ++ boolean_t *normalization_conflictp); ++int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, ++ int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); ++int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); ++int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, ++ int key_numints); ++ ++int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, ++ int add, uint64_t *towrite, uint64_t *tooverwrite); ++ ++/* ++ * Create an attribute with the given name and value. ++ * ++ * If an attribute with the given name already exists, the call will ++ * fail and return EEXIST. ++ */ ++int zap_add(objset_t *ds, uint64_t zapobj, const char *key, ++ int integer_size, uint64_t num_integers, ++ const void *val, dmu_tx_t *tx); ++int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, ++ int key_numints, int integer_size, uint64_t num_integers, ++ const void *val, dmu_tx_t *tx); ++ ++/* ++ * Set the attribute with the given name to the given value. If an ++ * attribute with the given name does not exist, it will be created. If ++ * an attribute with the given name already exists, the previous value ++ * will be overwritten. The integer_size may be different from the ++ * existing attribute's integer size, in which case the attribute's ++ * integer size will be updated to the new value. ++ */ ++int zap_update(objset_t *ds, uint64_t zapobj, const char *name, ++ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); ++int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, ++ int key_numints, ++ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); ++ ++/* ++ * Get the length (in integers) and the integer size of the specified ++ * attribute. ++ * ++ * If the requested attribute does not exist, the call will fail and ++ * return ENOENT. ++ */ ++int zap_length(objset_t *ds, uint64_t zapobj, const char *name, ++ uint64_t *integer_size, uint64_t *num_integers); ++int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, ++ int key_numints, uint64_t *integer_size, uint64_t *num_integers); ++ ++/* ++ * Remove the specified attribute. ++ * ++ * If the specified attribute does not exist, the call will fail and ++ * return ENOENT. ++ */ ++int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); ++int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, ++ matchtype_t mt, dmu_tx_t *tx); ++int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, ++ int key_numints, dmu_tx_t *tx); ++ ++/* ++ * Returns (in *count) the number of attributes in the specified zap ++ * object. ++ */ ++int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); ++ ++/* ++ * Returns (in name) the name of the entry whose (value & mask) ++ * (za_first_integer) is value, or ENOENT if not found. The string ++ * pointed to by name must be at least 256 bytes long. If mask==0, the ++ * match must be exact (ie, same as mask=-1ULL). ++ */ ++int zap_value_search(objset_t *os, uint64_t zapobj, ++ uint64_t value, uint64_t mask, char *name); ++ ++/* ++ * Transfer all the entries from fromobj into intoobj. 
Only works on ++ * int_size=8 num_integers=1 values. Fails if there are any duplicated ++ * entries. ++ */ ++int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); ++ ++/* Same as zap_join, but set the values to 'value'. */ ++int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, ++ uint64_t value, dmu_tx_t *tx); ++ ++/* Same as zap_join, but add together any duplicated entries. */ ++int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, ++ dmu_tx_t *tx); ++ ++/* ++ * Manipulate entries where the name + value are the "same" (the name is ++ * a stringified version of the value). ++ */ ++int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); ++int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); ++int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); ++int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, ++ dmu_tx_t *tx); ++ ++/* Here the key is an int and the value is a different int. */ ++int zap_add_int_key(objset_t *os, uint64_t obj, ++ uint64_t key, uint64_t value, dmu_tx_t *tx); ++int zap_lookup_int_key(objset_t *os, uint64_t obj, ++ uint64_t key, uint64_t *valuep); ++ ++/* ++ * They name is a stringified version of key; increment its value by ++ * delta. Zero values will be zap_remove()-ed. ++ */ ++int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, ++ dmu_tx_t *tx); ++ ++struct zap; ++struct zap_leaf; ++typedef struct zap_cursor { ++ /* This structure is opaque! */ ++ objset_t *zc_objset; ++ struct zap *zc_zap; ++ struct zap_leaf *zc_leaf; ++ uint64_t zc_zapobj; ++ uint64_t zc_serialized; ++ uint64_t zc_hash; ++ uint32_t zc_cd; ++} zap_cursor_t; ++ ++typedef struct { ++ int za_integer_length; ++ /* ++ * za_normalization_conflict will be set if there are additional ++ * entries with this normalized form (eg, "foo" and "Foo"). ++ */ ++ boolean_t za_normalization_conflict; ++ uint64_t za_num_integers; ++ uint64_t za_first_integer; /* no sign extension for <8byte ints */ ++ char za_name[MAXNAMELEN]; ++} zap_attribute_t; ++ ++/* ++ * The interface for listing all the attributes of a zapobj can be ++ * thought of as cursor moving down a list of the attributes one by ++ * one. The cookie returned by the zap_cursor_serialize routine is ++ * persistent across system calls (and across reboot, even). ++ */ ++ ++/* ++ * Initialize a zap cursor, pointing to the "first" attribute of the ++ * zapobj. You must _fini the cursor when you are done with it. ++ */ ++void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); ++void zap_cursor_fini(zap_cursor_t *zc); ++ ++/* ++ * Get the attribute currently pointed to by the cursor. Returns ++ * ENOENT if at the end of the attributes. ++ */ ++int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za); ++ ++/* ++ * Advance the cursor to the next attribute. ++ */ ++void zap_cursor_advance(zap_cursor_t *zc); ++ ++/* ++ * Get a persistent cookie pointing to the current position of the zap ++ * cursor. The low 4 bits in the cookie are always zero, and thus can ++ * be used as to differentiate a serialized cookie from a different type ++ * of value. The cookie will be less than 2^32 as long as there are ++ * fewer than 2^22 (4.2 million) entries in the zap object. ++ */ ++uint64_t zap_cursor_serialize(zap_cursor_t *zc); ++ ++/* ++ * Advance the cursor to the attribute having the given key. 
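++ *
++ * (Editorial aside, not part of the original comment: a typical iteration
++ * over every attribute of a zapobj with the cursor routines declared
++ * above might look like the following sketch, error handling omitted:
++ *
++ *	zap_cursor_t zc;
++ *	zap_attribute_t za;
++ *
++ *	for (zap_cursor_init(&zc, os, zapobj);
++ *	    zap_cursor_retrieve(&zc, &za) == 0;
++ *	    zap_cursor_advance(&zc))
++ *		... use za.za_name and za.za_first_integer ...
++ *	zap_cursor_fini(&zc);
++ * )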
++ */ ++int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt); ++ ++/* ++ * Initialize a zap cursor pointing to the position recorded by ++ * zap_cursor_serialize (in the "serialized" argument). You can also ++ * use a "serialized" argument of 0 to start at the beginning of the ++ * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to ++ * zap_cursor_init(...).) ++ */ ++void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds, ++ uint64_t zapobj, uint64_t serialized); ++ ++ ++#define ZAP_HISTOGRAM_SIZE 10 ++ ++typedef struct zap_stats { ++ /* ++ * Size of the pointer table (in number of entries). ++ * This is always a power of 2, or zero if it's a microzap. ++ * In general, it should be considerably greater than zs_num_leafs. ++ */ ++ uint64_t zs_ptrtbl_len; ++ ++ uint64_t zs_blocksize; /* size of zap blocks */ ++ ++ /* ++ * The number of blocks used. Note that some blocks may be ++ * wasted because old ptrtbl's and large name/value blocks are ++ * not reused. (Although their space is reclaimed, we don't ++ * reuse those offsets in the object.) ++ */ ++ uint64_t zs_num_blocks; ++ ++ /* ++ * Pointer table values from zap_ptrtbl in the zap_phys_t ++ */ ++ uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */ ++ uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */ ++ uint64_t zs_ptrtbl_zt_blk; /* starting block number */ ++ uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */ ++ uint64_t zs_ptrtbl_zt_shift; /* bits to index it */ ++ ++ /* ++ * Values of the other members of the zap_phys_t ++ */ ++ uint64_t zs_block_type; /* ZBT_HEADER */ ++ uint64_t zs_magic; /* ZAP_MAGIC */ ++ uint64_t zs_num_leafs; /* The number of leaf blocks */ ++ uint64_t zs_num_entries; /* The number of zap entries */ ++ uint64_t zs_salt; /* salt to stir into hash function */ ++ ++ /* ++ * Histograms. For all histograms, the last index ++ * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater ++ * than what can be represented. For example ++ * zs_leafs_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number ++ * of leafs with more than 45 entries. ++ */ ++ ++ /* ++ * zs_leafs_with_n_pointers[n] is the number of leafs with ++ * 2^n pointers to it. ++ */ ++ uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE]; ++ ++ /* ++ * zs_leafs_with_n_entries[n] is the number of leafs with ++ * [n*5, (n+1)*5) entries. In the current implementation, there ++ * can be at most 55 entries in any block, but there may be ++ * fewer if the name or value is large, or the block is not ++ * completely full. ++ */ ++ uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE]; ++ ++ /* ++ * zs_leafs_n_tenths_full[n] is the number of leafs whose ++ * fullness is in the range [n/10, (n+1)/10). ++ */ ++ uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE]; ++ ++ /* ++ * zs_entries_using_n_chunks[n] is the number of entries which ++ * consume n 24-byte chunks. (Note, large names/values only use ++ * one chunk, but contribute to zs_num_blocks_large.) ++ */ ++ uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE]; ++ ++ /* ++ * zs_buckets_with_n_entries[n] is the number of buckets (each ++ * leaf has 64 buckets) with n entries. ++ * zs_buckets_with_n_entries[1] should be very close to ++ * zs_num_entries. ++ */ ++ uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE]; ++} zap_stats_t; ++ ++/* ++ * Get statistics about a ZAP object. Note: you need to be aware of the ++ * internal implementation of the ZAP to correctly interpret some of the ++ * statistics. 
This interface shouldn't be relied on unless you really ++ * know what you're doing. ++ */ ++int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_ZAP_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zap_impl.h linux-3.2.33-go/include/zfs/sys/zap_impl.h +--- linux-3.2.33-go.orig/include/zfs/sys/zap_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zap_impl.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,228 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_ZAP_IMPL_H ++#define _SYS_ZAP_IMPL_H ++ ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++extern int fzap_default_block_shift; ++ ++#define ZAP_MAGIC 0x2F52AB2ABULL ++ ++#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) ++ ++#define MZAP_ENT_LEN 64 ++#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) ++#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT ++#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT) ++ ++#define ZAP_NEED_CD (-1U) ++ ++typedef struct mzap_ent_phys { ++ uint64_t mze_value; ++ uint32_t mze_cd; ++ uint16_t mze_pad; /* in case we want to chain them someday */ ++ char mze_name[MZAP_NAME_LEN]; ++} mzap_ent_phys_t; ++ ++typedef struct mzap_phys { ++ uint64_t mz_block_type; /* ZBT_MICRO */ ++ uint64_t mz_salt; ++ uint64_t mz_normflags; ++ uint64_t mz_pad[5]; ++ mzap_ent_phys_t mz_chunk[1]; ++ /* actually variable size depending on block size */ ++} mzap_phys_t; ++ ++typedef struct mzap_ent { ++ avl_node_t mze_node; ++ int mze_chunkid; ++ uint64_t mze_hash; ++ uint32_t mze_cd; /* copy from mze_phys->mze_cd */ ++} mzap_ent_t; ++ ++#define MZE_PHYS(zap, mze) \ ++ (&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid]) ++ ++/* ++ * The (fat) zap is stored in one object. It is an array of ++ * 1<= 6] [zap_leaf_t] [ptrtbl] ... ++ * ++ */ ++ ++struct dmu_buf; ++struct zap_leaf; ++ ++#define ZBT_LEAF ((1ULL << 63) + 0) ++#define ZBT_HEADER ((1ULL << 63) + 1) ++#define ZBT_MICRO ((1ULL << 63) + 3) ++/* any other values are ptrtbl blocks */ ++ ++/* ++ * the embedded pointer table takes up half a block: ++ * block size / entry size (2^3) / 2 ++ */ ++#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) ++ ++/* ++ * The embedded pointer table starts half-way through the block. 
Since ++ * the pointer table itself is half the block, it starts at (64-bit) ++ * word number (1<zap_f.zap_phys) \ ++ [(idx) + (1<> (64 - (n)))) ++ ++void fzap_byteswap(void *buf, size_t size); ++int fzap_count(zap_t *zap, uint64_t *count); ++int fzap_lookup(zap_name_t *zn, ++ uint64_t integer_size, uint64_t num_integers, void *buf, ++ char *realname, int rn_len, boolean_t *normalization_conflictp); ++void fzap_prefetch(zap_name_t *zn); ++int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, ++ uint64_t *tooverwrite); ++int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, ++ const void *val, dmu_tx_t *tx); ++int fzap_update(zap_name_t *zn, ++ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); ++int fzap_length(zap_name_t *zn, ++ uint64_t *integer_size, uint64_t *num_integers); ++int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); ++int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); ++void fzap_get_stats(zap_t *zap, zap_stats_t *zs); ++void zap_put_leaf(struct zap_leaf *l); ++ ++int fzap_add_cd(zap_name_t *zn, ++ uint64_t integer_size, uint64_t num_integers, ++ const void *val, uint32_t cd, dmu_tx_t *tx); ++void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); ++int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_ZAP_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zap_leaf.h linux-3.2.33-go/include/zfs/sys/zap_leaf.h +--- linux-3.2.33-go.orig/include/zfs/sys/zap_leaf.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zap_leaf.h 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,245 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#ifndef _SYS_ZAP_LEAF_H ++#define _SYS_ZAP_LEAF_H ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct zap; ++struct zap_name; ++struct zap_stats; ++ ++#define ZAP_LEAF_MAGIC 0x2AB1EAF ++ ++/* chunk size = 24 bytes */ ++#define ZAP_LEAF_CHUNKSIZE 24 ++ ++/* ++ * The amount of space available for chunks is: ++ * block size (1<l_bs) - hash entry size (2) * number of hash ++ * entries - header space (2*chunksize) ++ */ ++#define ZAP_LEAF_NUMCHUNKS(l) \ ++ (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \ ++ ZAP_LEAF_CHUNKSIZE - 2) ++ ++/* ++ * The amount of space within the chunk available for the array is: ++ * chunk size - space for type (1) - space for next pointer (2) ++ */ ++#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3) ++ ++#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \ ++ (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES) ++ ++/* ++ * Low water mark: when there are only this many chunks free, start ++ * growing the ptrtbl. Ideally, this should be larger than a ++ * "reasonably-sized" entry. 20 chunks is more than enough for the ++ * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value), ++ * while still being only around 3% for 16k blocks. ++ */ ++#define ZAP_LEAF_LOW_WATER (20) ++ ++/* ++ * The leaf hash table has block size / 2^5 (32) number of entries, ++ * which should be more than enough for the maximum number of entries, ++ * which is less than block size / CHUNKSIZE (24) / minimum number of ++ * chunks per entry (3). ++ */ ++#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5) ++#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l)) ++ ++/* ++ * The chunks start immediately after the hash table. The end of the ++ * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a ++ * chunk_t. ++ */ ++#define ZAP_LEAF_CHUNK(l, idx) \ ++ ((zap_leaf_chunk_t *) \ ++ ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] ++#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry) ++ ++typedef enum zap_chunk_type { ++ ZAP_CHUNK_FREE = 253, ++ ZAP_CHUNK_ENTRY = 252, ++ ZAP_CHUNK_ARRAY = 251, ++ ZAP_CHUNK_TYPE_MAX = 250 ++} zap_chunk_type_t; ++ ++#define ZLF_ENTRIES_CDSORTED (1<<0) ++ ++/* ++ * TAKE NOTE: ++ * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. ++ */ ++typedef struct zap_leaf_phys { ++ struct zap_leaf_header { ++ uint64_t lh_block_type; /* ZBT_LEAF */ ++ uint64_t lh_pad1; ++ uint64_t lh_prefix; /* hash prefix of this leaf */ ++ uint32_t lh_magic; /* ZAP_LEAF_MAGIC */ ++ uint16_t lh_nfree; /* number free chunks */ ++ uint16_t lh_nentries; /* number of entries */ ++ uint16_t lh_prefix_len; /* num bits used to id this */ ++ ++/* above is accessable to zap, below is zap_leaf private */ ++ ++ uint16_t lh_freelist; /* chunk head of free list */ ++ uint8_t lh_flags; /* ZLF_* flags */ ++ uint8_t lh_pad2[11]; ++ } l_hdr; /* 2 24-byte chunks */ ++ ++ /* ++ * The header is followed by a hash table with ++ * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is ++ * followed by an array of ZAP_LEAF_NUMCHUNKS(zap) ++ * zap_leaf_chunk structures. These structures are accessed ++ * with the ZAP_LEAF_CHUNK() macro. 
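++ *
++ * (Editorial aside with a worked example, assuming a 4096-byte leaf block
++ * so that l_bs is 12: ZAP_LEAF_HASH_NUMENTRIES() is then
++ * 1 << (12 - 5) = 128 hash entries, and ZAP_LEAF_NUMCHUNKS() is
++ * (4096 - 2 * 128) / 24 - 2 = 158 chunks of ZAP_LEAF_CHUNKSIZE (24)
++ * bytes each.)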
++ */ ++ ++ uint16_t l_hash[1]; ++} zap_leaf_phys_t; ++ ++typedef union zap_leaf_chunk { ++ struct zap_leaf_entry { ++ uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ ++ uint8_t le_value_intlen; /* size of value's ints */ ++ uint16_t le_next; /* next entry in hash chain */ ++ uint16_t le_name_chunk; /* first chunk of the name */ ++ uint16_t le_name_numints; /* ints in name (incl null) */ ++ uint16_t le_value_chunk; /* first chunk of the value */ ++ uint16_t le_value_numints; /* value length in ints */ ++ uint32_t le_cd; /* collision differentiator */ ++ uint64_t le_hash; /* hash value of the name */ ++ } l_entry; ++ struct zap_leaf_array { ++ uint8_t la_type; /* always ZAP_CHUNK_ARRAY */ ++ uint8_t la_array[ZAP_LEAF_ARRAY_BYTES]; ++ uint16_t la_next; /* next blk or CHAIN_END */ ++ } l_array; ++ struct zap_leaf_free { ++ uint8_t lf_type; /* always ZAP_CHUNK_FREE */ ++ uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES]; ++ uint16_t lf_next; /* next in free list, or CHAIN_END */ ++ } l_free; ++} zap_leaf_chunk_t; ++ ++typedef struct zap_leaf { ++ krwlock_t l_rwlock; ++ uint64_t l_blkid; /* 1< ++#include ++#include ++#endif ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct znode_phys; ++ ++#define ACE_SLOT_CNT 6 ++#define ZFS_ACL_VERSION_INITIAL 0ULL ++#define ZFS_ACL_VERSION_FUID 1ULL ++#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID ++ ++/* ++ * ZFS ACLs are store in various forms. ++ * Files created with ACL version ZFS_ACL_VERSION_INITIAL ++ * will all be created with fixed length ACEs of type ++ * zfs_oldace_t. ++ * ++ * Files with ACL version ZFS_ACL_VERSION_FUID will be created ++ * with various sized ACEs. The abstraction entries will utilize ++ * zfs_ace_hdr_t, normal user/group entries will use zfs_ace_t ++ * and some specialized CIFS ACEs will use zfs_object_ace_t. ++ */ ++ ++/* ++ * All ACEs have a common hdr. For ++ * owner@, group@, and everyone@ this is all ++ * thats needed. ++ */ ++typedef struct zfs_ace_hdr { ++ uint16_t z_type; ++ uint16_t z_flags; ++ uint32_t z_access_mask; ++} zfs_ace_hdr_t; ++ ++typedef zfs_ace_hdr_t zfs_ace_abstract_t; ++ ++/* ++ * Standard ACE ++ */ ++typedef struct zfs_ace { ++ zfs_ace_hdr_t z_hdr; ++ uint64_t z_fuid; ++} zfs_ace_t; ++ ++/* ++ * The following type only applies to ACE_ACCESS_ALLOWED|DENIED_OBJECT_ACE_TYPE ++ * and will only be set/retrieved in a CIFS context. ++ */ ++ ++typedef struct zfs_object_ace { ++ zfs_ace_t z_ace; ++ uint8_t z_object_type[16]; /* object type */ ++ uint8_t z_inherit_type[16]; /* inherited object type */ ++} zfs_object_ace_t; ++ ++typedef struct zfs_oldace { ++ uint32_t z_fuid; /* "who" */ ++ uint32_t z_access_mask; /* access mask */ ++ uint16_t z_flags; /* flags, i.e inheritance */ ++ uint16_t z_type; /* type of entry allow/deny */ ++} zfs_oldace_t; ++ ++typedef struct zfs_acl_phys_v0 { ++ uint64_t z_acl_extern_obj; /* ext acl pieces */ ++ uint32_t z_acl_count; /* Number of ACEs */ ++ uint16_t z_acl_version; /* acl version */ ++ uint16_t z_acl_pad; /* pad */ ++ zfs_oldace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ ++} zfs_acl_phys_v0_t; ++ ++#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT) ++ ++/* ++ * Size of ACL count is always 2 bytes. 
++ * Necessary to for dealing with both V0 ACL and V1 ACL layout ++ */ ++#define ZFS_ACL_COUNT_SIZE (sizeof (uint16_t)) ++ ++typedef struct zfs_acl_phys { ++ uint64_t z_acl_extern_obj; /* ext acl pieces */ ++ uint32_t z_acl_size; /* Number of bytes in ACL */ ++ uint16_t z_acl_version; /* acl version */ ++ uint16_t z_acl_count; /* ace count */ ++ uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ ++} zfs_acl_phys_t; ++ ++typedef struct acl_ops { ++ uint32_t (*ace_mask_get) (void *acep); /* get access mask */ ++ void (*ace_mask_set) (void *acep, ++ uint32_t mask); /* set access mask */ ++ uint16_t (*ace_flags_get) (void *acep); /* get flags */ ++ void (*ace_flags_set) (void *acep, ++ uint16_t flags); /* set flags */ ++ uint16_t (*ace_type_get)(void *acep); /* get type */ ++ void (*ace_type_set)(void *acep, ++ uint16_t type); /* set type */ ++ uint64_t (*ace_who_get)(void *acep); /* get who/fuid */ ++ void (*ace_who_set)(void *acep, ++ uint64_t who); /* set who/fuid */ ++ size_t (*ace_size)(void *acep); /* how big is this ace */ ++ size_t (*ace_abstract_size)(void); /* sizeof abstract entry */ ++ int (*ace_mask_off)(void); /* off of access mask in ace */ ++ int (*ace_data)(void *acep, void **datap); ++ /* ptr to data if any */ ++} acl_ops_t; ++ ++/* ++ * A zfs_acl_t structure is composed of a list of zfs_acl_node_t's. ++ * Each node will have one or more ACEs associated with it. You will ++ * only have multiple nodes during a chmod operation. Normally only ++ * one node is required. ++ */ ++typedef struct zfs_acl_node { ++ list_node_t z_next; /* Next chunk of ACEs */ ++ void *z_acldata; /* pointer into actual ACE(s) */ ++ void *z_allocdata; /* pointer to kmem allocated memory */ ++ size_t z_allocsize; /* Size of blob in bytes */ ++ size_t z_size; /* length of ACL data */ ++ uint64_t z_ace_count; /* number of ACEs in this acl node */ ++ int z_ace_idx; /* ace iterator positioned on */ ++} zfs_acl_node_t; ++ ++typedef struct zfs_acl { ++ uint64_t z_acl_count; /* Number of ACEs */ ++ size_t z_acl_bytes; /* Number of bytes in ACL */ ++ uint_t z_version; /* version of ACL */ ++ void *z_next_ace; /* pointer to next ACE */ ++ uint64_t z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */ ++ zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ ++ list_t z_acl; /* chunks of ACE data */ ++ acl_ops_t *z_ops; /* ACL operations */ ++} zfs_acl_t; ++ ++typedef struct acl_locator_cb { ++ zfs_acl_t *cb_aclp; ++ zfs_acl_node_t *cb_acl_node; ++} zfs_acl_locator_cb_t; ++ ++#define ACL_DATA_ALLOCED 0x1 ++#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) ++ ++struct zfs_fuid_info; ++ ++typedef struct zfs_acl_ids { ++ uint64_t z_fuid; /* file owner fuid */ ++ uint64_t z_fgid; /* file group owner fuid */ ++ uint64_t z_mode; /* mode to set on create */ ++ zfs_acl_t *z_aclp; /* ACL to create with file */ ++ struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */ ++} zfs_acl_ids_t; ++ ++/* ++ * Property values for acl_mode and acl_inherit. ++ * ++ * acl_mode can take discard, noallow, groupmask and passthrough. ++ * whereas acl_inherit has secure instead of groupmask. 
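++ *
++ * (Editorial aside, an assumption not stated in this header: these numeric
++ * values back the aclmode/aclinherit dataset properties, so for example
++ * "zfs set aclinherit=passthrough <dataset>" stores ZFS_ACL_PASSTHROUGH,
++ * and the legacy "secure" setting maps to ZFS_ACL_RESTRICTED.)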
++ */ ++ ++#define ZFS_ACL_DISCARD 0 ++#define ZFS_ACL_NOALLOW 1 ++#define ZFS_ACL_GROUPMASK 2 ++#define ZFS_ACL_PASSTHROUGH 3 ++#define ZFS_ACL_RESTRICTED 4 ++#define ZFS_ACL_PASSTHROUGH_X 5 ++ ++struct znode; ++struct zfs_sb; ++ ++#ifdef _KERNEL ++int zfs_acl_ids_create(struct znode *, int, vattr_t *, ++ cred_t *, vsecattr_t *, zfs_acl_ids_t *); ++void zfs_acl_ids_free(zfs_acl_ids_t *); ++boolean_t zfs_acl_ids_overquota(struct zfs_sb *, zfs_acl_ids_t *); ++int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); ++int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); ++void zfs_acl_rele(void *); ++void zfs_oldace_byteswap(ace_t *, int); ++void zfs_ace_byteswap(void *, size_t, boolean_t); ++extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr); ++extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); ++int zfs_fastaccesschk_execute(struct znode *, cred_t *); ++extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); ++extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); ++extern int zfs_acl_access(struct znode *, int, cred_t *); ++void zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); ++int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); ++int zfs_zaccess_rename(struct znode *, struct znode *, ++ struct znode *, struct znode *, cred_t *cr); ++void zfs_acl_free(zfs_acl_t *); ++int zfs_vsec_2_aclp(struct zfs_sb *, umode_t, vsecattr_t *, cred_t *, ++ struct zfs_fuid_info **, zfs_acl_t **); ++int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *); ++uint64_t zfs_external_acl(struct znode *); ++int zfs_znode_acl_version(struct znode *); ++int zfs_acl_size(struct znode *, int *); ++zfs_acl_t *zfs_acl_alloc(int); ++zfs_acl_node_t *zfs_acl_node_alloc(size_t); ++void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *); ++void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *); ++uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *, ++ uint64_t *, uint64_t, uint64_t); ++int zfs_acl_chown_setattr(struct znode *); ++ ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++#endif /* _SYS_FS_ZFS_ACL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zfs_context.h linux-3.2.33-go/include/zfs/sys/zfs_context.h +--- linux-3.2.33-go.orig/include/zfs/sys/zfs_context.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zfs_context.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,653 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++/* ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
++ */ ++ ++#ifndef _SYS_ZFS_CONTEXT_H ++#define _SYS_ZFS_CONTEXT_H ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#else /* _KERNEL */ ++ ++#define _SYS_MUTEX_H ++#define _SYS_RWLOCK_H ++#define _SYS_CONDVAR_H ++#define _SYS_SYSTM_H ++#define _SYS_DEBUG_H ++#define _SYS_T_LOCK_H ++#define _SYS_VNODE_H ++#define _SYS_VFS_H ++#define _SYS_SUNDDI_H ++#define _SYS_CALLB_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Stack ++ */ ++ ++#define noinline __attribute__((noinline)) ++ ++/* ++ * Debugging ++ */ ++ ++/* ++ * Note that we are not using the debugging levels. ++ */ ++ ++#define CE_CONT 0 /* continuation */ ++#define CE_NOTE 1 /* notice */ ++#define CE_WARN 2 /* warning */ ++#define CE_PANIC 3 /* panic */ ++#define CE_IGNORE 4 /* print nothing */ ++ ++extern int aok; ++ ++/* ++ * ZFS debugging ++ */ ++ ++extern void dprintf_setup(int *argc, char **argv); ++extern void __dprintf(const char *file, const char *func, ++ int line, const char *fmt, ...); ++extern void cmn_err(int, const char *, ...); ++extern void vcmn_err(int, const char *, __va_list); ++extern void panic(const char *, ...); ++extern void vpanic(const char *, __va_list); ++ ++#define fm_panic panic ++ ++/* ++ * DTrace SDT probes have different signatures in userland than they do in ++ * kernel. If they're being used in kernel code, re-define them out of ++ * existence for their counterparts in libzpool. 
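++ *
++ * (Editorial aside, not part of the original comment: with the
++ * redefinitions below, a probe site such as
++ *
++ *	DTRACE_PROBE2(zfs__example, uint64_t, a, uint64_t, b);
++ *
++ * compiles to ((void)0) in the userland libzpool build; the probe name
++ * and arguments here are hypothetical.)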
++ */ ++ ++#ifdef DTRACE_PROBE ++#undef DTRACE_PROBE ++#define DTRACE_PROBE(a) ((void)0) ++#endif /* DTRACE_PROBE */ ++ ++#ifdef DTRACE_PROBE1 ++#undef DTRACE_PROBE1 ++#define DTRACE_PROBE1(a, b, c) ((void)0) ++#endif /* DTRACE_PROBE1 */ ++ ++#ifdef DTRACE_PROBE2 ++#undef DTRACE_PROBE2 ++#define DTRACE_PROBE2(a, b, c, d, e) ((void)0) ++#endif /* DTRACE_PROBE2 */ ++ ++#ifdef DTRACE_PROBE3 ++#undef DTRACE_PROBE3 ++#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0) ++#endif /* DTRACE_PROBE3 */ ++ ++#ifdef DTRACE_PROBE4 ++#undef DTRACE_PROBE4 ++#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0) ++#endif /* DTRACE_PROBE4 */ ++ ++/* ++ * Threads ++ */ ++#define TS_MAGIC 0x72f158ab4261e538ull ++#define TS_RUN 0x00000002 ++#ifdef __linux__ ++#define STACK_SIZE 8192 /* Linux x86 and amd64 */ ++#else ++#define STACK_SIZE 24576 /* Solaris */ ++#endif ++ ++#ifdef NPTL_GUARD_WITHIN_STACK ++#define EXTRA_GUARD_BYTES PAGESIZE ++#else ++#define EXTRA_GUARD_BYTES 0 ++#endif ++ ++/* in libzpool, p0 exists only to have its address taken */ ++typedef struct proc { ++ uintptr_t this_is_never_used_dont_dereference_it; ++} proc_t; ++ ++extern struct proc p0; ++#define curproc (&p0) ++ ++typedef void (*thread_func_t)(void *); ++typedef void (*thread_func_arg_t)(void *); ++typedef pthread_t kt_did_t; ++ ++typedef struct kthread { ++ kt_did_t t_tid; ++ thread_func_t t_func; ++ void * t_arg; ++} kthread_t; ++ ++#define tsd_get(key) pthread_getspecific(key) ++#define tsd_set(key, val) pthread_setspecific(key, val) ++#define curthread zk_thread_current() ++#define thread_exit zk_thread_exit ++#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ ++ zk_thread_create(stk, stksize, (thread_func_t)func, arg, \ ++ len, NULL, state, pri, PTHREAD_CREATE_DETACHED) ++#define thread_join(t) zk_thread_join(t) ++#define newproc(f,a,cid,pri,ctp,pid) (ENOSYS) ++ ++extern kthread_t *zk_thread_current(void); ++extern void zk_thread_exit(void); ++extern kthread_t *zk_thread_create(caddr_t stk, size_t stksize, ++ thread_func_t func, void *arg, size_t len, ++ proc_t *pp, int state, pri_t pri, int detachstate); ++extern void zk_thread_join(kt_did_t tid); ++ ++#define kpreempt_disable() ((void)0) ++#define kpreempt_enable() ((void)0) ++ ++#define PS_NONE -1 ++ ++#define issig(why) (FALSE) ++#define ISSIG(thr, why) (FALSE) ++ ++/* ++ * Mutexes ++ */ ++#define MTX_MAGIC 0x9522f51362a6e326ull ++#define MTX_INIT ((void *)NULL) ++#define MTX_DEST ((void *)-1UL) ++ ++typedef struct kmutex { ++ void *m_owner; ++ uint64_t m_magic; ++ pthread_mutex_t m_lock; ++} kmutex_t; ++ ++#define MUTEX_DEFAULT 0 ++#define MUTEX_HELD(m) ((m)->m_owner == curthread) ++#define MUTEX_NOT_HELD(m) (!MUTEX_HELD(m)) ++ ++extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie); ++extern void mutex_destroy(kmutex_t *mp); ++extern void mutex_enter(kmutex_t *mp); ++extern void mutex_exit(kmutex_t *mp); ++extern int mutex_tryenter(kmutex_t *mp); ++extern void *mutex_owner(kmutex_t *mp); ++extern int mutex_held(kmutex_t *mp); ++ ++/* ++ * RW locks ++ */ ++#define RW_MAGIC 0x4d31fb123648e78aull ++#define RW_INIT ((void *)NULL) ++#define RW_DEST ((void *)-1UL) ++ ++typedef struct krwlock { ++ void *rw_owner; ++ void *rw_wr_owner; ++ uint64_t rw_magic; ++ pthread_rwlock_t rw_lock; ++ uint_t rw_readers; ++} krwlock_t; ++ ++typedef int krw_t; ++ ++#define RW_READER 0 ++#define RW_WRITER 1 ++#define RW_DEFAULT RW_READER ++ ++#define RW_READ_HELD(x) ((x)->rw_readers > 0) ++#define RW_WRITE_HELD(x) ((x)->rw_wr_owner == curthread) ++#define 
RW_LOCK_HELD(x) (RW_READ_HELD(x) || RW_WRITE_HELD(x)) ++ ++extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg); ++extern void rw_destroy(krwlock_t *rwlp); ++extern void rw_enter(krwlock_t *rwlp, krw_t rw); ++extern int rw_tryenter(krwlock_t *rwlp, krw_t rw); ++extern int rw_tryupgrade(krwlock_t *rwlp); ++extern void rw_exit(krwlock_t *rwlp); ++#define rw_downgrade(rwlp) do { } while (0) ++ ++extern uid_t crgetuid(cred_t *cr); ++extern gid_t crgetgid(cred_t *cr); ++extern int crgetngroups(cred_t *cr); ++extern gid_t *crgetgroups(cred_t *cr); ++ ++/* ++ * Condition variables ++ */ ++#define CV_MAGIC 0xd31ea9a83b1b30c4ull ++ ++typedef struct kcondvar { ++ uint64_t cv_magic; ++ pthread_cond_t cv; ++} kcondvar_t; ++ ++#define CV_DEFAULT 0 ++ ++extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); ++extern void cv_destroy(kcondvar_t *cv); ++extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); ++extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); ++extern void cv_signal(kcondvar_t *cv); ++extern void cv_broadcast(kcondvar_t *cv); ++#define cv_timedwait_interruptible(cv, mp, at) cv_timedwait(cv, mp, at) ++#define cv_wait_interruptible(cv, mp) cv_wait(cv, mp) ++ ++/* ++ * kstat creation, installation and deletion ++ */ ++extern kstat_t *kstat_create(char *, int, ++ char *, char *, uchar_t, ulong_t, uchar_t); ++extern void kstat_install(kstat_t *); ++extern void kstat_delete(kstat_t *); ++ ++/* ++ * Kernel memory ++ */ ++#define KM_SLEEP UMEM_NOFAIL ++#define KM_PUSHPAGE KM_SLEEP ++#define KM_NOSLEEP UMEM_DEFAULT ++#define KM_NODEBUG 0x0 ++#define KMC_NODEBUG UMC_NODEBUG ++#define KMC_KMEM 0x0 ++#define KMC_VMEM 0x0 ++#define kmem_alloc(_s, _f) umem_alloc(_s, _f) ++#define kmem_zalloc(_s, _f) umem_zalloc(_s, _f) ++#define kmem_free(_b, _s) umem_free(_b, _s) ++#define vmem_alloc(_s, _f) kmem_alloc(_s, _f) ++#define vmem_zalloc(_s, _f) kmem_zalloc(_s, _f) ++#define vmem_free(_b, _s) kmem_free(_b, _s) ++#define kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \ ++ umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) ++#define kmem_cache_destroy(_c) umem_cache_destroy(_c) ++#define kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f) ++#define kmem_cache_free(_c, _b) umem_cache_free(_c, _b) ++#define kmem_debugging() 0 ++#define kmem_cache_reap_now(_c) /* nothing */ ++#define kmem_cache_set_move(_c, _cb) /* nothing */ ++#define POINTER_INVALIDATE(_pp) /* nothing */ ++#define POINTER_IS_VALID(_p) 0 ++ ++typedef umem_cache_t kmem_cache_t; ++ ++typedef enum kmem_cbrc { ++ KMEM_CBRC_YES, ++ KMEM_CBRC_NO, ++ KMEM_CBRC_LATER, ++ KMEM_CBRC_DONT_NEED, ++ KMEM_CBRC_DONT_KNOW ++} kmem_cbrc_t; ++ ++/* ++ * Task queues ++ */ ++typedef struct taskq taskq_t; ++typedef uintptr_t taskqid_t; ++typedef void (task_func_t)(void *); ++ ++typedef struct taskq_ent { ++ struct taskq_ent *tqent_next; ++ struct taskq_ent *tqent_prev; ++ task_func_t *tqent_func; ++ void *tqent_arg; ++ uintptr_t tqent_flags; ++} taskq_ent_t; ++ ++#define TQENT_FLAG_PREALLOC 0x1 /* taskq_dispatch_ent used */ ++ ++#define TASKQ_PREPOPULATE 0x0001 ++#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ ++#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ ++#define TASKQ_THREADS_CPU_PCT 0x0008 /* Scale # threads by # cpus */ ++#define TASKQ_DC_BATCH 0x0010 /* Mark threads as batch */ ++ ++#define TQ_SLEEP KM_SLEEP /* Can block for memory */ ++#define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */ ++#define TQ_PUSHPAGE KM_PUSHPAGE /* Cannot perform I/O */ 
++#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ ++#define TQ_FRONT 0x08 /* Queue in front */ ++ ++extern taskq_t *system_taskq; ++ ++extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); ++#define taskq_create_proc(a, b, c, d, e, p, f) \ ++ (taskq_create(a, b, c, d, e, f)) ++#define taskq_create_sysdc(a, b, d, e, p, dc, f) \ ++ (taskq_create(a, b, maxclsyspri, d, e, f)) ++extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); ++extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, ++ taskq_ent_t *); ++extern int taskq_empty_ent(taskq_ent_t *); ++extern void taskq_init_ent(taskq_ent_t *); ++extern void taskq_destroy(taskq_t *); ++extern void taskq_wait(taskq_t *); ++extern int taskq_member(taskq_t *, kthread_t *); ++extern void system_taskq_init(void); ++extern void system_taskq_fini(void); ++ ++#define XVA_MAPSIZE 3 ++#define XVA_MAGIC 0x78766174 ++ ++/* ++ * vnodes ++ */ ++typedef struct vnode { ++ uint64_t v_size; ++ int v_fd; ++ char *v_path; ++} vnode_t; ++ ++#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ ++ ++typedef struct xoptattr { ++ timestruc_t xoa_createtime; /* Create time of file */ ++ uint8_t xoa_archive; ++ uint8_t xoa_system; ++ uint8_t xoa_readonly; ++ uint8_t xoa_hidden; ++ uint8_t xoa_nounlink; ++ uint8_t xoa_immutable; ++ uint8_t xoa_appendonly; ++ uint8_t xoa_nodump; ++ uint8_t xoa_settable; ++ uint8_t xoa_opaque; ++ uint8_t xoa_av_quarantined; ++ uint8_t xoa_av_modified; ++ uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ]; ++ uint8_t xoa_reparse; ++ uint8_t xoa_offline; ++ uint8_t xoa_sparse; ++} xoptattr_t; ++ ++typedef struct vattr { ++ uint_t va_mask; /* bit-mask of attributes */ ++ u_offset_t va_size; /* file size in bytes */ ++} vattr_t; ++ ++ ++typedef struct xvattr { ++ vattr_t xva_vattr; /* Embedded vattr structure */ ++ uint32_t xva_magic; /* Magic Number */ ++ uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */ ++ uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */ ++ uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */ ++ uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */ ++ xoptattr_t xva_xoptattrs; /* Optional attributes */ ++} xvattr_t; ++ ++typedef struct vsecattr { ++ uint_t vsa_mask; /* See below */ ++ int vsa_aclcnt; /* ACL entry count */ ++ void *vsa_aclentp; /* pointer to ACL entries */ ++ int vsa_dfaclcnt; /* default ACL entry count */ ++ void *vsa_dfaclentp; /* pointer to default ACL entries */ ++ size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */ ++} vsecattr_t; ++ ++#define AT_TYPE 0x00001 ++#define AT_MODE 0x00002 ++#define AT_UID 0x00004 ++#define AT_GID 0x00008 ++#define AT_FSID 0x00010 ++#define AT_NODEID 0x00020 ++#define AT_NLINK 0x00040 ++#define AT_SIZE 0x00080 ++#define AT_ATIME 0x00100 ++#define AT_MTIME 0x00200 ++#define AT_CTIME 0x00400 ++#define AT_RDEV 0x00800 ++#define AT_BLKSIZE 0x01000 ++#define AT_NBLOCKS 0x02000 ++#define AT_SEQ 0x08000 ++#define AT_XVATTR 0x10000 ++ ++#define CRCREAT 0 ++ ++extern int fop_getattr(vnode_t *vp, vattr_t *vap); ++ ++#define VOP_CLOSE(vp, f, c, o, cr, ct) vn_close(vp) ++#define VOP_PUTPAGE(vp, of, sz, fl, cr, ct) 0 ++#define VOP_GETATTR(vp, vap, fl, cr, ct) fop_getattr((vp), (vap)); ++ ++#define VOP_FSYNC(vp, f, cr, ct) fsync((vp)->v_fd) ++ ++#define VN_RELE(vp) vn_close(vp) ++ ++extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp, ++ int x2, int x3); ++extern int vn_openat(char *path, int x1, int oflags, int mode, vnode_t **vpp, ++ int x2, int x3, 
vnode_t *vp, int fd); ++extern int vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, ++ offset_t offset, int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp); ++extern void vn_close(vnode_t *vp); ++ ++#define vn_remove(path, x1, x2) remove(path) ++#define vn_rename(from, to, seg) rename((from), (to)) ++#define vn_is_readonly(vp) B_FALSE ++ ++extern vnode_t *rootdir; ++ ++#include /* for FREAD, FWRITE, etc */ ++ ++/* ++ * Random stuff ++ */ ++#define ddi_get_lbolt() (gethrtime() >> 23) ++#define ddi_get_lbolt64() (gethrtime() >> 23) ++#define hz 119 /* frequency when using gethrtime() >> 23 for lbolt */ ++ ++extern void delay(clock_t ticks); ++ ++#define gethrestime_sec() time(NULL) ++#define gethrestime(t) \ ++ do {\ ++ (t)->tv_sec = gethrestime_sec();\ ++ (t)->tv_nsec = 0;\ ++ } while (0); ++ ++#define max_ncpus 64 ++ ++#define minclsyspri 60 ++#define maxclsyspri 99 ++ ++#define CPU_SEQID (pthread_self() & (max_ncpus - 1)) ++ ++#define kcred NULL ++#define CRED() NULL ++ ++#define ptob(x) ((x) * PAGESIZE) ++ ++extern uint64_t physmem; ++ ++extern int highbit(ulong_t i); ++extern int random_get_bytes(uint8_t *ptr, size_t len); ++extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); ++ ++extern void kernel_init(int); ++extern void kernel_fini(void); ++ ++struct spa; ++extern void nicenum(uint64_t num, char *buf); ++extern void show_pool_stats(struct spa *); ++ ++typedef struct callb_cpr { ++ kmutex_t *cc_lockp; ++} callb_cpr_t; ++ ++#define CALLB_CPR_INIT(cp, lockp, func, name) { \ ++ (cp)->cc_lockp = lockp; \ ++} ++ ++#define CALLB_CPR_SAFE_BEGIN(cp) { \ ++ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ ++} ++ ++#define CALLB_CPR_SAFE_END(cp, lockp) { \ ++ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ ++} ++ ++#define CALLB_CPR_EXIT(cp) { \ ++ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ ++ mutex_exit((cp)->cc_lockp); \ ++} ++ ++#define zone_dataset_visible(x, y) (1) ++#define INGLOBALZONE(z) (1) ++ ++extern char *kmem_vasprintf(const char *fmt, va_list adx); ++extern char *kmem_asprintf(const char *fmt, ...); ++#define strfree(str) kmem_free((str), strlen(str)+1) ++ ++/* ++ * Hostname information ++ */ ++extern char hw_serial[]; /* for userland-emulated hostid access */ ++extern int ddi_strtoul(const char *str, char **nptr, int base, ++ unsigned long *result); ++ ++extern int ddi_strtoull(const char *str, char **nptr, int base, ++ u_longlong_t *result); ++ ++/* ZFS Boot Related stuff. 
*/ ++ ++struct _buf { ++ intptr_t _fd; ++}; ++ ++struct bootstat { ++ uint64_t st_size; ++}; ++ ++typedef struct ace_object { ++ uid_t a_who; ++ uint32_t a_access_mask; ++ uint16_t a_flags; ++ uint16_t a_type; ++ uint8_t a_obj_type[16]; ++ uint8_t a_inherit_obj_type[16]; ++} ace_object_t; ++ ++ ++#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 ++#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 ++#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 ++#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 ++ ++extern struct _buf *kobj_open_file(char *name); ++extern int kobj_read_file(struct _buf *file, char *buf, unsigned size, ++ unsigned off); ++extern void kobj_close_file(struct _buf *file); ++extern int kobj_get_filesize(struct _buf *file, uint64_t *size); ++extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); ++extern int zfs_secpolicy_rename_perms(const char *from, const char *to, ++ cred_t *cr); ++extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); ++extern zoneid_t getzoneid(void); ++ ++/* SID stuff */ ++typedef struct ksiddomain { ++ uint_t kd_ref; ++ uint_t kd_len; ++ char *kd_name; ++} ksiddomain_t; ++ ++ksiddomain_t *ksid_lookupdomain(const char *); ++void ksiddomain_rele(ksiddomain_t *); ++ ++#define DDI_SLEEP KM_SLEEP ++#define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \ ++ sysevent_post_event(_c, _d, _b, "libzpool", _e, _f) ++ ++#endif /* _KERNEL */ ++ ++#endif /* _SYS_ZFS_CONTEXT_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zfs_ctldir.h linux-3.2.33-go/include/zfs/sys/zfs_ctldir.h +--- linux-3.2.33-go.orig/include/zfs/sys/zfs_ctldir.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zfs_ctldir.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,113 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (C) 2011 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * LLNL-CODE-403049. 
++ * Rewritten for Linux by: ++ * Rohan Puri ++ * Brian Behlendorf ++ */ ++ ++#ifndef _ZFS_CTLDIR_H ++#define _ZFS_CTLDIR_H ++ ++#include ++#include ++#include ++ ++#define ZFS_CTLDIR_NAME ".zfs" ++#define ZFS_SNAPDIR_NAME "snapshot" ++#define ZFS_SHAREDIR_NAME "shares" ++ ++#define zfs_has_ctldir(zdp) \ ++ ((zdp)->z_id == ZTOZSB(zdp)->z_root && \ ++ (ZTOZSB(zdp)->z_ctldir != NULL)) ++#define zfs_show_ctldir(zdp) \ ++ (zfs_has_ctldir(zdp) && \ ++ (ZTOZSB(zdp)->z_show_ctldir)) ++ ++typedef struct { ++ char *se_name; ++ char *se_path; ++ struct inode *se_inode; ++ struct delayed_work se_work; ++ avl_node_t se_node; ++} zfs_snapentry_t; ++ ++/* zfsctl generic functions */ ++extern int snapentry_compare(const void *a, const void *b); ++extern boolean_t zfsctl_is_node(struct inode *ip); ++extern boolean_t zfsctl_is_snapdir(struct inode *ip); ++extern void zfsctl_inode_inactive(struct inode *ip); ++extern void zfsctl_inode_destroy(struct inode *ip); ++extern int zfsctl_create(zfs_sb_t *zsb); ++extern void zfsctl_destroy(zfs_sb_t *zsb); ++extern struct inode *zfsctl_root(znode_t *zp); ++extern int zfsctl_fid(struct inode *ip, fid_t *fidp); ++ ++/* zfsctl '.zfs' functions */ ++extern int zfsctl_root_lookup(struct inode *dip, char *name, ++ struct inode **ipp, int flags, cred_t *cr, int *direntflags, ++ pathname_t *realpnp); ++ ++/* zfsctl '.zfs/snapshot' functions */ ++extern int zfsctl_snapdir_lookup(struct inode *dip, char *name, ++ struct inode **ipp, int flags, cred_t *cr, int *direntflags, ++ pathname_t *realpnp); ++extern int zfsctl_snapdir_rename(struct inode *sdip, char *sname, ++ struct inode *tdip, char *tname, cred_t *cr, int flags); ++extern int zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, ++ int flags); ++extern int zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, ++ struct inode **ipp, cred_t *cr, int flags); ++extern void zfsctl_snapdir_inactive(struct inode *ip); ++extern int zfsctl_unmount_snapshot(zfs_sb_t *zsb, char *name, int flags); ++extern int zfsctl_unmount_snapshots(zfs_sb_t *zsb, int flags, int *count); ++extern int zfsctl_mount_snapshot(struct path *path, int flags); ++extern int zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid, ++ zfs_sb_t **zsb); ++ ++/* zfsctl '.zfs/shares' functions */ ++extern int zfsctl_shares_lookup(struct inode *dip, char *name, ++ struct inode **ipp, int flags, cred_t *cr, int *direntflags, ++ pathname_t *realpnp); ++ ++/* zfsctl_init/fini functions */ ++extern void zfsctl_init(void); ++extern void zfsctl_fini(void); ++ ++/* ++ * These inodes numbers are reserved for the .zfs control directory. ++ * It is important that they be no larger that 48-bits because only ++ * 6 bytes are reserved in the NFS file handle for the object number. ++ * However, they should be as large as possible to avoid conflicts ++ * with the objects which are assigned monotonically by the dmu. 
++ */ ++#define ZFSCTL_INO_ROOT 0x0000FFFFFFFFFFFFULL ++#define ZFSCTL_INO_SHARES 0x0000FFFFFFFFFFFEULL ++#define ZFSCTL_INO_SNAPDIR 0x0000FFFFFFFFFFFDULL ++#define ZFSCTL_INO_SNAPDIRS 0x0000FFFFFFFFFFFCULL ++ ++#define ZFSCTL_EXPIRE_SNAPSHOT 300 ++ ++#endif /* _ZFS_CTLDIR_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zfs_debug.h linux-3.2.33-go/include/zfs/sys/zfs_debug.h +--- linux-3.2.33-go.orig/include/zfs/sys/zfs_debug.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zfs_debug.h 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,81 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_ZFS_DEBUG_H ++#define _SYS_ZFS_DEBUG_H ++ ++#ifndef TRUE ++#define TRUE 1 ++#endif ++ ++#ifndef FALSE ++#define FALSE 0 ++#endif ++ ++/* ++ * ZFS debugging - Always enabled for user space builds. ++ */ ++#if !defined(ZFS_DEBUG) && !defined(_KERNEL) ++#define ZFS_DEBUG ++#endif ++ ++extern int zfs_flags; ++extern int zfs_recover; ++ ++#define ZFS_DEBUG_DPRINTF 0x0001 ++#define ZFS_DEBUG_DBUF_VERIFY 0x0002 ++#define ZFS_DEBUG_DNODE_VERIFY 0x0004 ++#define ZFS_DEBUG_SNAPNAMES 0x0008 ++#define ZFS_DEBUG_MODIFY 0x0010 ++ ++/* ++ * Always log zfs debug messages to the spl debug subsystem as SS_USER1. ++ * When the SPL is configured with debugging enabled these messages will ++ * appear in the internal spl debug log, otherwise they are a no-op. ++ */ ++#if defined(_KERNEL) ++ ++#include ++#define dprintf(...) \ ++ if (zfs_flags & ZFS_DEBUG_DPRINTF) \ ++ __SDEBUG(NULL, SS_USER1, SD_DPRINTF, __VA_ARGS__) ++ ++/* ++ * When zfs is running is user space the debugging is always enabled. ++ * The messages will be printed using the __dprintf() function and ++ * filtered based on the zfs_flags variable. ++ */ ++#else ++#define dprintf(...) \ ++ if (zfs_flags & ZFS_DEBUG_DPRINTF) \ ++ __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__) ++ ++#endif /* _KERNEL */ ++ ++void zfs_panic_recover(const char *fmt, ...); ++#define zfs_dbgmsg(...) dprintf(__VA_ARGS__) ++void zfs_dbgmsg_init(void); ++void zfs_dbgmsg_fini(void); ++ ++#endif /* _SYS_ZFS_DEBUG_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zfs_dir.h linux-3.2.33-go/include/zfs/sys/zfs_dir.h +--- linux-3.2.33-go.orig/include/zfs/sys/zfs_dir.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zfs_dir.h 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,74 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. 
++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _SYS_FS_ZFS_DIR_H ++#define _SYS_FS_ZFS_DIR_H ++ ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* zfs_dirent_lock() flags */ ++#define ZNEW 0x0001 /* entry should not exist */ ++#define ZEXISTS 0x0002 /* entry should exist */ ++#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ ++#define ZXATTR 0x0008 /* we want the xattr dir */ ++#define ZRENAMING 0x0010 /* znode is being renamed */ ++#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ ++#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ ++#define ZHAVELOCK 0x0080 /* z_name_lock is already held */ ++ ++/* mknode flags */ ++#define IS_ROOT_NODE 0x01 /* create a root node */ ++#define IS_XATTR 0x02 /* create an extended attribute node */ ++ ++extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, ++ int, int *, pathname_t *); ++extern void zfs_dirent_unlock(zfs_dirlock_t *); ++extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); ++extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, ++ boolean_t *); ++extern int zfs_dirlook(znode_t *, char *, struct inode **, int, int *, ++ pathname_t *); ++extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, ++ uint_t, znode_t **, zfs_acl_ids_t *); ++extern void zfs_rmnode(znode_t *); ++extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); ++extern boolean_t zfs_dirempty(znode_t *); ++extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); ++extern void zfs_unlinked_drain(zfs_sb_t *); ++extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); ++extern int zfs_get_xattrdir(znode_t *, struct inode **, cred_t *, int); ++extern int zfs_make_xattrdir(znode_t *, vattr_t *, struct inode **, cred_t *); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_FS_ZFS_DIR_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zfs_fuid.h linux-3.2.33-go/include/zfs/sys/zfs_fuid.h +--- linux-3.2.33-go.orig/include/zfs/sys/zfs_fuid.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zfs_fuid.h 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,132 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _SYS_FS_ZFS_FUID_H ++#define _SYS_FS_ZFS_FUID_H ++ ++#ifdef _KERNEL ++#include ++#include ++#include ++#include ++#endif ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++typedef enum { ++ ZFS_OWNER, ++ ZFS_GROUP, ++ ZFS_ACE_USER, ++ ZFS_ACE_GROUP ++} zfs_fuid_type_t; ++ ++/* ++ * Estimate space needed for one more fuid table entry. ++ * for now assume its current size + 1K ++ */ ++#define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) ++ ++#define FUID_INDEX(x) ((x) >> 32) ++#define FUID_RID(x) ((x) & 0xffffffff) ++#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid)) ++/* ++ * FUIDs cause problems for the intent log ++ * we need to replay the creation of the FUID, ++ * but we can't count on the idmapper to be around ++ * and during replay the FUID index may be different than ++ * before. Also, if an ACL has 100 ACEs and 12 different ++ * domains we don't want to log 100 domain strings, but rather ++ * just the unique 12. ++ */ ++ ++/* ++ * The FUIDs in the log will index into ++ * domain string table and the bottom half will be the rid. ++ * Used for mapping ephemeral uid/gid during ACL setting to FUIDs ++ */ ++typedef struct zfs_fuid { ++ list_node_t z_next; ++ uint64_t z_id; /* uid/gid being converted to fuid */ ++ uint64_t z_domidx; /* index in AVL domain table */ ++ uint64_t z_logfuid; /* index for domain in log */ ++} zfs_fuid_t; ++ ++/* list of unique domains */ ++typedef struct zfs_fuid_domain { ++ list_node_t z_next; ++ uint64_t z_domidx; /* AVL tree idx */ ++ const char *z_domain; /* domain string */ ++} zfs_fuid_domain_t; ++ ++/* ++ * FUID information necessary for logging create, setattr, and setacl. 
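The FUID_INDEX, FUID_RID and FUID_ENCODE macros defined above split a 64-bit FUID into a domain-table index in the upper 32 bits and a RID in the lower 32 bits. Below is a minimal userspace round-trip sketch, restating those three macros so it compiles on its own (the sample index and rid values are made up); the zfs_fuid_info structure that the preceding comment introduces follows below.

/* Illustration only: encode a (domain index, rid) pair and split it again. */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define FUID_INDEX(x)         ((x) >> 32)
#define FUID_RID(x)           ((x) & 0xffffffff)
#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid))

int main(void)
{
	uint64_t fuid = FUID_ENCODE(3, 1001);	/* domain index 3, rid 1001 */

	assert(FUID_INDEX(fuid) == 3);
	assert(FUID_RID(fuid) == 1001);
	printf("fuid 0x%016llx -> index %llu, rid %llu\n",
	    (unsigned long long)fuid,
	    (unsigned long long)FUID_INDEX(fuid),
	    (unsigned long long)FUID_RID(fuid));
	return (0);
}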
++ */ ++typedef struct zfs_fuid_info { ++ list_t z_fuids; ++ list_t z_domains; ++ uint64_t z_fuid_owner; ++ uint64_t z_fuid_group; ++ char **z_domain_table; /* Used during replay */ ++ uint32_t z_fuid_cnt; /* How many fuids in z_fuids */ ++ uint32_t z_domain_cnt; /* How many domains */ ++ size_t z_domain_str_sz; /* len of domain strings z_domain list */ ++} zfs_fuid_info_t; ++ ++#ifdef _KERNEL ++struct znode; ++extern uid_t zfs_fuid_map_id(zfs_sb_t *, uint64_t, cred_t *, zfs_fuid_type_t); ++extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t, ++ uint64_t, uint64_t, zfs_fuid_type_t); ++extern void zfs_fuid_destroy(zfs_sb_t *); ++extern uint64_t zfs_fuid_create_cred(zfs_sb_t *, zfs_fuid_type_t, ++ cred_t *, zfs_fuid_info_t **); ++extern uint64_t zfs_fuid_create(zfs_sb_t *, uint64_t, cred_t *, zfs_fuid_type_t, ++ zfs_fuid_info_t **); ++extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, ++ uid_t *uid, uid_t *gid); ++extern zfs_fuid_info_t *zfs_fuid_info_alloc(void); ++extern void zfs_fuid_info_free(zfs_fuid_info_t *); ++extern boolean_t zfs_groupmember(zfs_sb_t *, uint64_t, cred_t *); ++void zfs_fuid_sync(zfs_sb_t *, dmu_tx_t *); ++extern int zfs_fuid_find_by_domain(zfs_sb_t *, const char *domain, ++ char **retdomain, boolean_t addok); ++extern const char *zfs_fuid_find_by_idx(zfs_sb_t *zsb, uint32_t idx); ++extern void zfs_fuid_txhold(zfs_sb_t *zsb, dmu_tx_t *tx); ++#endif ++ ++char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); ++void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *); ++uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *); ++void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_FS_ZFS_FUID_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zfs_ioctl.h linux-3.2.33-go/include/zfs/sys/zfs_ioctl.h +--- linux-3.2.33-go.orig/include/zfs/sys/zfs_ioctl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zfs_ioctl.h 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,346 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_ZFS_IOCTL_H ++#define _SYS_ZFS_IOCTL_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef _KERNEL ++#include ++#endif /* _KERNEL */ ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * Property values for snapdir ++ */ ++#define ZFS_SNAPDIR_HIDDEN 0 ++#define ZFS_SNAPDIR_VISIBLE 1 ++ ++/* ++ * Field manipulation macros for the drr_versioninfo field of the ++ * send stream header. ++ */ ++ ++/* ++ * Header types for zfs send streams. 
++ */ ++typedef enum drr_headertype { ++ DMU_SUBSTREAM = 0x1, ++ DMU_COMPOUNDSTREAM = 0x2 ++} drr_headertype_t; ++ ++#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) ++#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) ++ ++#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) ++#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) ++ ++/* ++ * Feature flags for zfs send streams (flags in drr_versioninfo) ++ */ ++ ++#define DMU_BACKUP_FEATURE_DEDUP (0x1) ++#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2) ++#define DMU_BACKUP_FEATURE_SA_SPILL (0x4) ++ ++/* ++ * Mask of all supported backup features ++ */ ++#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ ++ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL) ++ ++/* Are all features in the given flag word currently supported? */ ++#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) ++ ++/* ++ * The drr_versioninfo field of the dmu_replay_record has the ++ * following layout: ++ * ++ * 64 56 48 40 32 24 16 8 0 ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * | reserved | feature-flags |C|S| ++ * +-------+-------+-------+-------+-------+-------+-------+-------+ ++ * ++ * The low order two bits indicate the header type: SUBSTREAM (0x1) ++ * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: ++ * this field used to be a version number, where the two version types ++ * were 1 and 2. Using two bits for this allows earlier versions of ++ * the code to be able to recognize send streams that don't use any ++ * of the features indicated by feature flags. ++ */ ++ ++#define DMU_BACKUP_MAGIC 0x2F5bacbacULL ++ ++#define DRR_FLAG_CLONE (1<<0) ++#define DRR_FLAG_CI_DATA (1<<1) ++ ++/* ++ * flags in the drr_checksumflags field in the DRR_WRITE and ++ * DRR_WRITE_BYREF blocks ++ */ ++#define DRR_CHECKSUM_DEDUP (1<<0) ++ ++#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) ++ ++/* ++ * zfs ioctl command structure ++ */ ++typedef struct dmu_replay_record { ++ enum { ++ DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, ++ DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, ++ DRR_SPILL, DRR_NUMTYPES ++ } drr_type; ++ uint32_t drr_payloadlen; ++ union { ++ struct drr_begin { ++ uint64_t drr_magic; ++ uint64_t drr_versioninfo; /* was drr_version */ ++ uint64_t drr_creation_time; ++ dmu_objset_type_t drr_type; ++ uint32_t drr_flags; ++ uint64_t drr_toguid; ++ uint64_t drr_fromguid; ++ char drr_toname[MAXNAMELEN]; ++ } drr_begin; ++ struct drr_end { ++ zio_cksum_t drr_checksum; ++ uint64_t drr_toguid; ++ } drr_end; ++ struct drr_object { ++ uint64_t drr_object; ++ dmu_object_type_t drr_type; ++ dmu_object_type_t drr_bonustype; ++ uint32_t drr_blksz; ++ uint32_t drr_bonuslen; ++ uint8_t drr_checksumtype; ++ uint8_t drr_compress; ++ uint8_t drr_pad[6]; ++ uint64_t drr_toguid; ++ /* bonus content follows */ ++ } drr_object; ++ struct drr_freeobjects { ++ uint64_t drr_firstobj; ++ uint64_t drr_numobjs; ++ uint64_t drr_toguid; ++ } drr_freeobjects; ++ struct drr_write { ++ uint64_t drr_object; ++ dmu_object_type_t drr_type; ++ uint32_t drr_pad; ++ uint64_t drr_offset; ++ uint64_t drr_length; ++ uint64_t drr_toguid; ++ uint8_t drr_checksumtype; ++ uint8_t drr_checksumflags; ++ uint8_t drr_pad2[6]; ++ ddt_key_t drr_key; /* deduplication key */ ++ /* content follows */ ++ } drr_write; ++ struct drr_free { ++ uint64_t drr_object; ++ uint64_t drr_offset; ++ uint64_t drr_length; ++ uint64_t drr_toguid; ++ } drr_free; ++ struct drr_write_byref { ++ /* where to put the data */ 
++ uint64_t drr_object; ++ uint64_t drr_offset; ++ uint64_t drr_length; ++ uint64_t drr_toguid; ++ /* where to find the prior copy of the data */ ++ uint64_t drr_refguid; ++ uint64_t drr_refobject; ++ uint64_t drr_refoffset; ++ /* properties of the data */ ++ uint8_t drr_checksumtype; ++ uint8_t drr_checksumflags; ++ uint8_t drr_pad2[6]; ++ ddt_key_t drr_key; /* deduplication key */ ++ } drr_write_byref; ++ struct drr_spill { ++ uint64_t drr_object; ++ uint64_t drr_length; ++ uint64_t drr_toguid; ++ uint64_t drr_pad[4]; /* needed for crypto */ ++ /* spill data follows */ ++ } drr_spill; ++ } drr_u; ++} dmu_replay_record_t; ++ ++/* diff record range types */ ++typedef enum diff_type { ++ DDR_NONE = 0x1, ++ DDR_INUSE = 0x2, ++ DDR_FREE = 0x4 ++} diff_type_t; ++ ++/* ++ * The diff reports back ranges of free or in-use objects. ++ */ ++typedef struct dmu_diff_record { ++ uint64_t ddr_type; ++ uint64_t ddr_first; ++ uint64_t ddr_last; ++} dmu_diff_record_t; ++ ++typedef struct zinject_record { ++ uint64_t zi_objset; ++ uint64_t zi_object; ++ uint64_t zi_start; ++ uint64_t zi_end; ++ uint64_t zi_guid; ++ uint32_t zi_level; ++ uint32_t zi_error; ++ uint64_t zi_type; ++ uint32_t zi_freq; ++ uint32_t zi_failfast; ++ char zi_func[MAXNAMELEN]; ++ uint32_t zi_iotype; ++ int32_t zi_duration; ++ uint64_t zi_timer; ++} zinject_record_t; ++ ++#define ZINJECT_NULL 0x1 ++#define ZINJECT_FLUSH_ARC 0x2 ++#define ZINJECT_UNLOAD_SPA 0x4 ++ ++#define ZEVENT_NONBLOCK 0x1 ++#define ZEVENT_SIZE 1024 ++ ++typedef struct zfs_share { ++ uint64_t z_exportdata; ++ uint64_t z_sharedata; ++ uint64_t z_sharetype; /* 0 = share, 1 = unshare */ ++ uint64_t z_sharemax; /* max length of share string */ ++} zfs_share_t; ++ ++/* ++ * ZFS file systems may behave the usual, POSIX-compliant way, where ++ * name lookups are case-sensitive. They may also be set up so that ++ * all the name lookups are case-insensitive, or so that only some ++ * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. 
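An aside on the drr_versioninfo layout documented earlier in this header: the low two bits hold the stream header type and the next 30 bits hold the feature flags, which is what DMU_GET_STREAM_HDRTYPE and DMU_GET_FEATUREFLAGS extract. A standalone sketch of that decoding, using plain shift-and-mask in place of the BF64_* helpers (the HDRTYPE/FEATURES names are invented for the example); the zfs_case enumeration that the preceding comment introduces follows below.

/*
 * Illustration only: decoding drr_versioninfo with plain shifts, matching
 * the bit layout in the comment above (low 2 bits = header type,
 * next 30 bits = feature flags).
 */
#include <stdint.h>
#include <stdio.h>

#define HDRTYPE(vi)   ((vi) & 0x3ULL)                /* BF64_GET(vi, 0, 2) */
#define FEATURES(vi)  (((vi) >> 2) & 0x3fffffffULL)  /* BF64_GET(vi, 2, 30) */

int main(void)
{
	/* Compound stream (0x2) with the SA_SPILL feature flag (0x4) set. */
	uint64_t vi = 0x2ULL | (0x4ULL << 2);

	printf("header type   = %llu\n", (unsigned long long)HDRTYPE(vi));
	printf("feature flags = 0x%llx\n", (unsigned long long)FEATURES(vi));
	return (0);
}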
++ */ ++typedef enum zfs_case { ++ ZFS_CASE_SENSITIVE, ++ ZFS_CASE_INSENSITIVE, ++ ZFS_CASE_MIXED ++} zfs_case_t; ++ ++typedef struct zfs_cmd { ++ char zc_name[MAXPATHLEN]; ++ char zc_value[MAXPATHLEN * 2]; ++ char zc_string[MAXNAMELEN]; ++ char zc_top_ds[MAXPATHLEN]; ++ uint64_t zc_guid; ++ uint64_t zc_nvlist_conf; /* really (char *) */ ++ uint64_t zc_nvlist_conf_size; ++ uint64_t zc_nvlist_src; /* really (char *) */ ++ uint64_t zc_nvlist_src_size; ++ uint64_t zc_nvlist_dst; /* really (char *) */ ++ uint64_t zc_nvlist_dst_size; ++ uint64_t zc_cookie; ++ uint64_t zc_objset_type; ++ uint64_t zc_perm_action; ++ uint64_t zc_history; /* really (char *) */ ++ uint64_t zc_history_len; ++ uint64_t zc_history_offset; ++ uint64_t zc_obj; ++ uint64_t zc_iflags; /* internal to zfs(7fs) */ ++ zfs_share_t zc_share; ++ dmu_objset_stats_t zc_objset_stats; ++ struct drr_begin zc_begin_record; ++ zinject_record_t zc_inject_record; ++ boolean_t zc_defer_destroy; ++ boolean_t zc_temphold; ++ uint64_t zc_action_handle; ++ int zc_cleanup_fd; ++ uint8_t zc_simple; ++ uint8_t zc_pad[3]; /* alignment */ ++ uint64_t zc_sendobj; ++ uint64_t zc_fromobj; ++ uint64_t zc_createtxg; ++ zfs_stat_t zc_stat; ++} zfs_cmd_t; ++ ++typedef struct zfs_useracct { ++ char zu_domain[256]; ++ uid_t zu_rid; ++ uint32_t zu_pad; ++ uint64_t zu_space; ++} zfs_useracct_t; ++ ++#define ZFSDEV_MAX_MINOR (1 << 16) ++#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1) ++ ++#define ZPOOL_EXPORT_AFTER_SPLIT 0x1 ++ ++#ifdef _KERNEL ++ ++typedef struct zfs_creat { ++ nvlist_t *zct_zplprops; ++ nvlist_t *zct_props; ++} zfs_creat_t; ++ ++extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); ++extern int zfs_secpolicy_rename_perms(const char *from, ++ const char *to, cred_t *cr); ++extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); ++extern int zfs_unmount_snap(const char *, void *); ++ ++enum zfsdev_state_type { ++ ZST_ONEXIT, ++ ZST_ZEVENT, ++ ZST_ALL, ++}; ++ ++typedef struct zfsdev_state { ++ list_node_t zs_next; /* next zfsdev_state_t link */ ++ struct file *zs_file; /* associated file struct */ ++ minor_t zs_minor; /* made up minor number */ ++ void *zs_onexit; /* onexit data */ ++ void *zs_zevent; /* zevent data */ ++} zfsdev_state_t; ++ ++extern void *zfsdev_get_state(minor_t minor, enum zfsdev_state_type which); ++extern minor_t zfsdev_getminor(struct file *filp); ++extern minor_t zfsdev_minor_alloc(void); ++ ++#endif /* _KERNEL */ ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_ZFS_IOCTL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zfs_onexit.h linux-3.2.33-go/include/zfs/sys/zfs_onexit.h +--- linux-3.2.33-go.orig/include/zfs/sys/zfs_onexit.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zfs_onexit.h 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,66 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_ZFS_ONEXIT_H ++#define _SYS_ZFS_ONEXIT_H ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#ifdef _KERNEL ++ ++typedef struct zfs_onexit { ++ kmutex_t zo_lock; ++ list_t zo_actions; ++} zfs_onexit_t; ++ ++typedef struct zfs_onexit_action_node { ++ list_node_t za_link; ++ void (*za_func)(void *); ++ void *za_data; ++} zfs_onexit_action_node_t; ++ ++extern void zfs_onexit_init(zfs_onexit_t **zo); ++extern void zfs_onexit_destroy(zfs_onexit_t *zo); ++ ++#endif ++ ++extern int zfs_onexit_fd_hold(int fd, minor_t *minorp); ++extern void zfs_onexit_fd_rele(int fd); ++extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, ++ uint64_t *action_handle); ++extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, ++ boolean_t fire); ++extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, ++ void **data); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_ZFS_ONEXIT_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zfs_rlock.h linux-3.2.33-go/include/zfs/sys/zfs_rlock.h +--- linux-3.2.33-go.orig/include/zfs/sys/zfs_rlock.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zfs_rlock.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,90 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _SYS_FS_ZFS_RLOCK_H ++#define _SYS_FS_ZFS_RLOCK_H ++ ++ ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#ifdef _KERNEL ++ ++#include ++ ++typedef enum { ++ RL_READER, ++ RL_WRITER, ++ RL_APPEND ++} rl_type_t; ++ ++typedef struct rl { ++ znode_t *r_zp; /* znode this lock applies to */ ++ avl_node_t r_node; /* avl node link */ ++ uint64_t r_off; /* file range offset */ ++ uint64_t r_len; /* file range length */ ++ uint_t r_cnt; /* range reference count in tree */ ++ rl_type_t r_type; /* range type */ ++ kcondvar_t r_wr_cv; /* cv for waiting writers */ ++ kcondvar_t r_rd_cv; /* cv for waiting readers */ ++ uint8_t r_proxy; /* acting for original range */ ++ uint8_t r_write_wanted; /* writer wants to lock this range */ ++ uint8_t r_read_wanted; /* reader wants to lock this range */ ++ list_node_t rl_node; /* used for deferred release */ ++} rl_t; ++ ++/* ++ * Lock a range (offset, length) as either shared (READER) ++ * or exclusive (WRITER or APPEND). 
APPEND is a special type that ++ * is converted to WRITER that specified to lock from the start of the ++ * end of file. zfs_range_lock() returns the range lock structure. ++ */ ++rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type); ++ ++/* ++ * Unlock range and destroy range lock structure. ++ */ ++void zfs_range_unlock(rl_t *rl); ++ ++/* ++ * Reduce range locked as RW_WRITER from whole file to specified range. ++ * Asserts the whole file was previously locked. ++ */ ++void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len); ++ ++/* ++ * AVL comparison function used to compare range locks ++ */ ++int zfs_range_compare(const void *arg1, const void *arg2); ++ ++#endif /* _KERNEL */ ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_FS_ZFS_RLOCK_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zfs_sa.h linux-3.2.33-go/include/zfs/sys/zfs_sa.h +--- linux-3.2.33-go.orig/include/zfs/sys/zfs_sa.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zfs_sa.h 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,150 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _SYS_ZFS_SA_H ++#define _SYS_ZFS_SA_H ++ ++#ifdef _KERNEL ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#endif ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * This is the list of known attributes ++ * to the ZPL. The values of the actual ++ * attributes are not defined by the order ++ * the enums. It is controlled by the attribute ++ * registration mechanism. Two different file system ++ * could have different numeric values for the same ++ * attributes. this list is only used for dereferencing ++ * into the table that will hold the actual numeric value. 
++ */ ++typedef enum zpl_attr { ++ ZPL_ATIME, ++ ZPL_MTIME, ++ ZPL_CTIME, ++ ZPL_CRTIME, ++ ZPL_GEN, ++ ZPL_MODE, ++ ZPL_SIZE, ++ ZPL_PARENT, ++ ZPL_LINKS, ++ ZPL_XATTR, ++ ZPL_RDEV, ++ ZPL_FLAGS, ++ ZPL_UID, ++ ZPL_GID, ++ ZPL_PAD, ++ ZPL_ZNODE_ACL, ++ ZPL_DACL_COUNT, ++ ZPL_SYMLINK, ++ ZPL_SCANSTAMP, ++ ZPL_DACL_ACES, ++ ZPL_DXATTR, ++ ZPL_END ++} zpl_attr_t; ++ ++#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108 ++#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \ ++ sizeof (zfs_acl_phys_t)) ++ ++#define SA_MODE_OFFSET 0 ++#define SA_SIZE_OFFSET 8 ++#define SA_GEN_OFFSET 16 ++#define SA_UID_OFFSET 24 ++#define SA_GID_OFFSET 32 ++#define SA_PARENT_OFFSET 40 ++ ++extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1]; ++extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1]; ++ ++/* ++ * This is a deprecated data structure that only exists for ++ * dealing with file systems create prior to ZPL version 5. ++ */ ++typedef struct znode_phys { ++ uint64_t zp_atime[2]; /* 0 - last file access time */ ++ uint64_t zp_mtime[2]; /* 16 - last file modification time */ ++ uint64_t zp_ctime[2]; /* 32 - last file change time */ ++ uint64_t zp_crtime[2]; /* 48 - creation time */ ++ uint64_t zp_gen; /* 64 - generation (txg of creation) */ ++ uint64_t zp_mode; /* 72 - file mode bits */ ++ uint64_t zp_size; /* 80 - size of file */ ++ uint64_t zp_parent; /* 88 - directory parent (`..') */ ++ uint64_t zp_links; /* 96 - number of links to file */ ++ uint64_t zp_xattr; /* 104 - DMU object for xattrs */ ++ uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ ++ uint64_t zp_flags; /* 120 - persistent flags */ ++ uint64_t zp_uid; /* 128 - file owner */ ++ uint64_t zp_gid; /* 136 - owning group */ ++ uint64_t zp_zap; /* 144 - extra attributes */ ++ uint64_t zp_pad[3]; /* 152 - future */ ++ zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ ++ /* ++ * Data may pad out any remaining bytes in the znode buffer, eg: ++ * ++ * |<---------------------- dnode_phys (512) ------------------------>| ++ * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| ++ * |<---- znode (264) ---->|<---- data (56) ---->| ++ * ++ * At present, we use this space for the following: ++ * - symbolic links ++ * - 32-byte anti-virus scanstamp (regular files only) ++ */ ++} znode_phys_t; ++ ++#ifdef _KERNEL ++ ++#define DXATTR_MAX_ENTRY_SIZE (32768) ++#define DXATTR_MAX_SA_SIZE (SPA_MAXBLOCKSIZE >> 1) ++ ++int zfs_sa_readlink(struct znode *, uio_t *); ++void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *); ++void zfs_sa_get_scanstamp(struct znode *, xvattr_t *); ++void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *); ++int zfs_sa_get_xattr(struct znode *); ++int zfs_sa_set_xattr(struct znode *); ++void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *); ++void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *); ++void zfs_sa_init(void); ++void zfs_sa_fini(void); ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_ZFS_SA_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zfs_stat.h linux-3.2.33-go/include/zfs/sys/zfs_stat.h +--- linux-3.2.33-go.orig/include/zfs/sys/zfs_stat.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zfs_stat.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,56 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. 
++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_FS_ZFS_STAT_H ++#define _SYS_FS_ZFS_STAT_H ++ ++#ifdef _KERNEL ++#include ++#include ++#include ++#endif ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * A limited number of zpl level stats are retrievable ++ * with an ioctl. zfs diff is the current consumer. ++ */ ++typedef struct zfs_stat { ++ uint64_t zs_gen; ++ uint64_t zs_mode; ++ uint64_t zs_links; ++ uint64_t zs_ctime[2]; ++} zfs_stat_t; ++ ++extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, ++ char *buf, int len); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_FS_ZFS_STAT_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zfs_vfsops.h linux-3.2.33-go/include/zfs/sys/zfs_vfsops.h +--- linux-3.2.33-go.orig/include/zfs/sys/zfs_vfsops.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zfs_vfsops.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,193 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#ifndef _SYS_FS_ZFS_VFSOPS_H ++#define _SYS_FS_ZFS_VFSOPS_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct zfs_sb; ++struct znode; ++ ++typedef struct zfs_sb { ++ struct super_block *z_sb; /* generic super_block */ ++ struct backing_dev_info z_bdi; /* generic backing dev info */ ++ struct zfs_sb *z_parent; /* parent fs */ ++ objset_t *z_os; /* objset reference */ ++ uint64_t z_flags; /* super_block flags */ ++ uint64_t z_root; /* id of root znode */ ++ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ ++ uint64_t z_max_blksz; /* maximum block size for files */ ++ uint64_t z_fuid_obj; /* fuid table object number */ ++ uint64_t z_fuid_size; /* fuid table size */ ++ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ ++ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ ++ krwlock_t z_fuid_lock; /* fuid lock */ ++ boolean_t z_fuid_loaded; /* fuid tables are loaded */ ++ boolean_t z_fuid_dirty; /* need to sync fuid table ? */ ++ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ ++ zilog_t *z_log; /* intent log pointer */ ++ uint_t z_acl_inherit; /* acl inheritance behavior */ ++ zfs_case_t z_case; /* case-sense */ ++ boolean_t z_utf8; /* utf8-only */ ++ int z_norm; /* normalization flags */ ++ boolean_t z_atime; /* enable atimes mount option */ ++ boolean_t z_unmounted; /* unmounted */ ++ rrwlock_t z_teardown_lock; ++ krwlock_t z_teardown_inactive_lock; ++ list_t z_all_znodes; /* all znodes in the fs */ ++ uint64_t z_nr_znodes; /* number of znodes in the fs */ ++ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ ++ struct inode *z_ctldir; /* .zfs directory inode */ ++ avl_tree_t z_ctldir_snaps; /* .zfs/snapshot entries */ ++ kmutex_t z_ctldir_lock; /* .zfs ctldir lock */ ++ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ ++ boolean_t z_issnap; /* true if this is a snapshot */ ++ boolean_t z_vscan; /* virus scan on/off */ ++ boolean_t z_use_fuids; /* version allows fuids */ ++ boolean_t z_replay; /* set during ZIL replay */ ++ boolean_t z_use_sa; /* version allow system attributes */ ++ boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */ ++ uint64_t z_version; /* ZPL version */ ++ uint64_t z_shares_dir; /* hidden shares dir */ ++ kmutex_t z_lock; ++ uint64_t z_userquota_obj; ++ uint64_t z_groupquota_obj; ++ uint64_t z_replay_eof; /* New end of file - replay only */ ++ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ ++#define ZFS_OBJ_MTX_SZ 64 ++ kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ ++} zfs_sb_t; ++ ++#define ZFS_SUPER_MAGIC 0x2fc12fc1 ++ ++#define ZSB_XATTR 0x0001 /* Enable user xattrs */ ++ ++/* ++ * Allow a maximum number of links. While ZFS does not internally limit ++ * this the inode->i_nlink member is defined as an unsigned int. To be ++ * safe we use 2^31-1 as the limit. ++ */ ++#define ZFS_LINK_MAX ((1U << 31) - 1U) ++ ++/* ++ * Normal filesystems (those not under .zfs/snapshot) have a total ++ * file ID size limited to 12 bytes (including the length field) due to ++ * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical ++ * reasons, this same limit is being imposed by the Solaris NFSv3 implementation ++ * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It ++ * is not possible to expand beyond 12 bytes without abandoning support ++ * of NFSv2. 
++ * ++ * For normal filesystems, we partition up the available space as follows: ++ * 2 bytes fid length (required) ++ * 6 bytes object number (48 bits) ++ * 4 bytes generation number (32 bits) ++ * ++ * We reserve only 48 bits for the object number, as this is the limit ++ * currently defined and imposed by the DMU. ++ */ ++typedef struct zfid_short { ++ uint16_t zf_len; ++ uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ ++ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ ++} zfid_short_t; ++ ++/* ++ * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes ++ * (including the length field). This makes files under .zfs/snapshot ++ * accessible by NFSv3 and NFSv4, but not NFSv2. ++ * ++ * For files under .zfs/snapshot, we partition up the available space ++ * as follows: ++ * 2 bytes fid length (required) ++ * 6 bytes object number (48 bits) ++ * 4 bytes generation number (32 bits) ++ * 6 bytes objset id (48 bits) ++ * 4 bytes currently just zero (32 bits) ++ * ++ * We reserve only 48 bits for the object number and objset id, as these are ++ * the limits currently defined and imposed by the DMU. ++ */ ++typedef struct zfid_long { ++ zfid_short_t z_fid; ++ uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ ++ uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */ ++} zfid_long_t; ++ ++#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) ++#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) ++ ++extern uint_t zfs_fsyncer_key; ++ ++extern int zfs_suspend_fs(zfs_sb_t *zsb); ++extern int zfs_resume_fs(zfs_sb_t *zsb, const char *osname); ++extern int zfs_userspace_one(zfs_sb_t *zsb, zfs_userquota_prop_t type, ++ const char *domain, uint64_t rid, uint64_t *valuep); ++extern int zfs_userspace_many(zfs_sb_t *zsb, zfs_userquota_prop_t type, ++ uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); ++extern int zfs_set_userquota(zfs_sb_t *zsb, zfs_userquota_prop_t type, ++ const char *domain, uint64_t rid, uint64_t quota); ++extern boolean_t zfs_owner_overquota(zfs_sb_t *zsb, struct znode *, ++ boolean_t isgroup); ++extern boolean_t zfs_fuid_overquota(zfs_sb_t *zsb, boolean_t isgroup, ++ uint64_t fuid); ++extern int zfs_set_version(zfs_sb_t *zsb, uint64_t newvers); ++extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, ++ uint64_t *value); ++extern int zfs_sb_create(const char *name, zfs_sb_t **zsbp); ++extern int zfs_sb_setup(zfs_sb_t *zsb, boolean_t mounting); ++extern void zfs_sb_free(zfs_sb_t *zsb); ++extern int zfs_sb_prune(struct super_block *sb, unsigned long nr_to_scan, ++ int *objects); ++extern int zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting); ++extern int zfs_check_global_label(const char *dsname, const char *hexsl); ++extern boolean_t zfs_is_readonly(zfs_sb_t *zsb); ++ ++extern int zfs_register_callbacks(zfs_sb_t *zsb); ++extern void zfs_unregister_callbacks(zfs_sb_t *zsb); ++extern int zfs_domount(struct super_block *sb, void *data, int silent); ++extern void zfs_preumount(struct super_block *sb); ++extern int zfs_umount(struct super_block *sb); ++extern int zfs_remount(struct super_block *sb, int *flags, char *data); ++extern int zfs_root(zfs_sb_t *zsb, struct inode **ipp); ++extern int zfs_statvfs(struct dentry *dentry, struct kstatfs *statp); ++extern int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_FS_ZFS_VFSOPS_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zfs_vnops.h linux-3.2.33-go/include/zfs/sys/zfs_vnops.h +--- 
linux-3.2.33-go.orig/include/zfs/sys/zfs_vnops.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zfs_vnops.h 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,85 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_FS_ZFS_VNOPS_H ++#define _SYS_FS_ZFS_VNOPS_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++extern int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr); ++extern int zfs_close(struct inode *ip, int flag, cred_t *cr); ++extern int zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr); ++extern int zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr); ++extern int zfs_access(struct inode *ip, int mode, int flag, cred_t *cr); ++extern int zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, ++ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp); ++extern int zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, ++ int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp); ++extern int zfs_remove(struct inode *dip, char *name, cred_t *cr); ++extern int zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, ++ struct inode **ipp, cred_t *cr, int flags, vsecattr_t *vsecp); ++extern int zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, ++ cred_t *cr, int flags); ++extern int zfs_readdir(struct inode *ip, void *dirent, filldir_t filldir, ++ loff_t *pos, cred_t *cr); ++extern int zfs_fsync(struct inode *ip, int syncflag, cred_t *cr); ++extern int zfs_getattr(struct inode *ip, vattr_t *vap, int flag, cred_t *cr); ++extern int zfs_getattr_fast(struct inode *ip, struct kstat *sp); ++extern int zfs_setattr(struct inode *ip, vattr_t *vap, int flag, cred_t *cr); ++extern int zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, ++ char *tnm, cred_t *cr, int flags); ++extern int zfs_symlink(struct inode *dip, char *name, vattr_t *vap, ++ char *link, struct inode **ipp, cred_t *cr, int flags); ++extern int zfs_follow_link(struct dentry *dentry, struct nameidata *nd); ++extern int zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr); ++extern int zfs_link(struct inode *tdip, struct inode *sip, ++ char *name, cred_t *cr); ++extern void zfs_inactive(struct inode *ip); ++extern int zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag, ++ offset_t offset, cred_t *cr); ++extern int zfs_fid(struct inode *ip, fid_t *fidp); ++extern int zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, ++ cred_t *cr); ++extern int zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, ++ cred_t *cr); 
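Looking back at the file-ID layout defined in zfs_vfsops.h above: its comments give the byte packing as obj[i] = obj >> (8 * i) for the 6-byte object number, and the same scheme for the 4-byte generation. A standalone sketch of that packing (the object number and generation values are made up for illustration):

/* Illustration only: fill the zfid_short_t byte arrays from zfs_vfsops.h. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t obj = 0x0000123456789abcULL;	/* sample 48-bit object number */
	uint32_t gen = 7;			/* sample generation number */
	uint8_t zf_object[6], zf_gen[4];
	int i;

	for (i = 0; i < 6; i++)
		zf_object[i] = (uint8_t)(obj >> (8 * i));	/* obj[i] = obj >> (8 * i) */
	for (i = 0; i < 4; i++)
		zf_gen[i] = (uint8_t)(gen >> (8 * i));		/* gen[i] = gen >> (8 * i) */

	for (i = 0; i < 6; i++)
		printf("zf_object[%d] = 0x%02x\n", i, zf_object[i]);
	for (i = 0; i < 4; i++)
		printf("zf_gen[%d] = 0x%02x\n", i, zf_gen[i]);
	return (0);
}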
++extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages); ++extern int zfs_putpage(struct inode *ip, struct page *pp, ++ struct writeback_control *wbc); ++extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, ++ size_t len, unsigned long vm_flags); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_FS_ZFS_VNOPS_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zfs_znode.h linux-3.2.33-go/include/zfs/sys/zfs_znode.h +--- linux-3.2.33-go.orig/include/zfs/sys/zfs_znode.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zfs_znode.h 2012-11-16 23:25:34.338039461 +0100 +@@ -0,0 +1,383 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_FS_ZFS_ZNODE_H ++#define _SYS_FS_ZFS_ZNODE_H ++ ++#ifdef _KERNEL ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#endif ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * Additional file level attributes, that are stored ++ * in the upper half of zp_flags ++ */ ++#define ZFS_READONLY 0x0000000100000000ull ++#define ZFS_HIDDEN 0x0000000200000000ull ++#define ZFS_SYSTEM 0x0000000400000000ull ++#define ZFS_ARCHIVE 0x0000000800000000ull ++#define ZFS_IMMUTABLE 0x0000001000000000ull ++#define ZFS_NOUNLINK 0x0000002000000000ull ++#define ZFS_APPENDONLY 0x0000004000000000ull ++#define ZFS_NODUMP 0x0000008000000000ull ++#define ZFS_OPAQUE 0x0000010000000000ull ++#define ZFS_AV_QUARANTINED 0x0000020000000000ull ++#define ZFS_AV_MODIFIED 0x0000040000000000ull ++#define ZFS_REPARSE 0x0000080000000000ull ++#define ZFS_OFFLINE 0x0000100000000000ull ++#define ZFS_SPARSE 0x0000200000000000ull ++ ++#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \ ++{ \ ++ if (value) \ ++ pflags |= attr; \ ++ else \ ++ pflags &= ~attr; \ ++ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(ZTOZSB(zp)), \ ++ &pflags, sizeof (pflags), tx)); \ ++} ++ ++/* ++ * Define special zfs pflags ++ */ ++#define ZFS_XATTR 0x1 /* is an extended attribute */ ++#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ ++#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ ++#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ ++#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */ ++#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ ++#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ ++#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ ++#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ ++ ++#define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME] ++#define SA_ZPL_MTIME(z) 
z->z_attr_table[ZPL_MTIME] ++#define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME] ++#define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME] ++#define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN] ++#define SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES] ++#define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR] ++#define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK] ++#define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV] ++#define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP] ++#define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID] ++#define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID] ++#define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT] ++#define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS] ++#define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE] ++#define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT] ++#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS] ++#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE] ++#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL] ++#define SA_ZPL_DXATTR(z) z->z_attr_table[ZPL_DXATTR] ++#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD] ++ ++/* ++ * Is ID ephemeral? ++ */ ++#define IS_EPHEMERAL(x) (x > MAXUID) ++ ++/* ++ * Should we use FUIDs? ++ */ ++#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \ ++ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) ++#define USE_SA(version, os) (version >= ZPL_VERSION_SA && \ ++ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA) ++ ++#define MASTER_NODE_OBJ 1 ++ ++/* ++ * Special attributes for master node. ++ * "userquota@" and "groupquota@" are also valid (from ++ * zfs_userquota_prop_prefixes[]). ++ */ ++#define ZFS_FSID "FSID" ++#define ZFS_UNLINKED_SET "DELETE_QUEUE" ++#define ZFS_ROOT_OBJ "ROOT" ++#define ZPL_VERSION_STR "VERSION" ++#define ZFS_FUID_TABLES "FUID" ++#define ZFS_SHARES_DIR "SHARES" ++#define ZFS_SA_ATTRS "SA_ATTRS" ++ ++#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) ++ ++/* Path component length */ ++/* ++ * The generic fs code uses MAXNAMELEN to represent ++ * what the largest component length is. Unfortunately, ++ * this length includes the terminating NULL. ZFS needs ++ * to tell the users via pathconf() and statvfs() what the ++ * true maximum length of a component is, excluding the NULL. ++ */ ++#define ZFS_MAXNAMELEN (MAXNAMELEN - 1) ++ ++/* ++ * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in ++ * the directory entries. On Linux systems this value is already ++ * defined correctly as part of the /usr/include/dirent.h header file. ++ */ ++#ifndef IFTODT ++#define IFTODT(mode) (((mode) & S_IFMT) >> 12) ++#endif ++ ++/* ++ * The directory entry has the type (currently unused on Solaris) in the ++ * top 4 bits, and the object number in the low 48 bits. The "middle" ++ * 12 bits are unused. ++ */ ++#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) ++#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) ++ ++/* ++ * Directory entry locks control access to directory entries. ++ * They are used to protect creates, deletes, and renames. ++ * Each directory znode has a mutex and a list of locked names. 
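The ZFS_DIRENT_TYPE and ZFS_DIRENT_OBJ macros above store a BSD-style DT_* value in the top 4 bits of a directory entry and the object number in the low 48 bits, with IFTODT deriving the DT_* value from the file mode bits. A standalone sketch of packing and unpacking such an entry, with plain shifts standing in for BF64_GET/BF64_SET (the MY_* names are invented for the example); the zfs_dirlock structure that the preceding comment introduces follows below.

/*
 * Illustration only: pack a DT_* type and a 48-bit object number into one
 * 64-bit directory entry value and take it apart again.
 */
#include <stdint.h>
#include <stdio.h>
#include <sys/stat.h>

#define MY_IFTODT(mode)       (((mode) & S_IFMT) >> 12)        /* as defined above */
#define MY_DIRENT(type, obj)  (((uint64_t)(type) << 60) | ((obj) & 0xFFFFFFFFFFFFULL))
#define MY_DIRENT_TYPE(de)    ((de) >> 60)                      /* top 4 bits */
#define MY_DIRENT_OBJ(de)     ((de) & 0xFFFFFFFFFFFFULL)        /* low 48 bits */

int main(void)
{
	uint64_t obj = 0x123456789aULL;		/* sample object number */
	uint64_t de = MY_DIRENT(MY_IFTODT(S_IFREG), obj);

	printf("entry 0x%016llx: type %llu, object 0x%llx\n",
	    (unsigned long long)de,
	    (unsigned long long)MY_DIRENT_TYPE(de),
	    (unsigned long long)MY_DIRENT_OBJ(de));
	return (0);
}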
++ */ ++#ifdef _KERNEL ++typedef struct zfs_dirlock { ++ char *dl_name; /* directory entry being locked */ ++ uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ ++ uint8_t dl_namelock; /* 1 if z_name_lock is NOT held */ ++ uint16_t dl_namesize; /* set if dl_name was allocated */ ++ kcondvar_t dl_cv; /* wait for entry to be unlocked */ ++ struct znode *dl_dzp; /* directory znode */ ++ struct zfs_dirlock *dl_next; /* next in z_dirlocks list */ ++} zfs_dirlock_t; ++ ++typedef struct znode { ++ uint64_t z_id; /* object ID for this znode */ ++ kmutex_t z_lock; /* znode modification lock */ ++ krwlock_t z_parent_lock; /* parent lock for directories */ ++ krwlock_t z_name_lock; /* "master" lock for dirent locks */ ++ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ ++ kmutex_t z_range_lock; /* protects changes to z_range_avl */ ++ avl_tree_t z_range_avl; /* avl tree of file range locks */ ++ uint8_t z_unlinked; /* file has been unlinked */ ++ uint8_t z_atime_dirty; /* atime needs to be synced */ ++ uint8_t z_zn_prefetch; /* Prefetch znodes? */ ++ uint8_t z_moved; /* Has this znode been moved? */ ++ uint_t z_blksz; /* block size in bytes */ ++ uint_t z_seq; /* modification sequence number */ ++ uint64_t z_mapcnt; /* number of pages mapped to file */ ++ uint64_t z_gen; /* generation (cached) */ ++ uint64_t z_size; /* file size (cached) */ ++ uint64_t z_atime[2]; /* atime (cached) */ ++ uint64_t z_links; /* file links (cached) */ ++ uint64_t z_pflags; /* pflags (cached) */ ++ uint64_t z_uid; /* uid fuid (cached) */ ++ uint64_t z_gid; /* gid fuid (cached) */ ++ mode_t z_mode; /* mode (cached) */ ++ uint32_t z_sync_cnt; /* synchronous open count */ ++ kmutex_t z_acl_lock; /* acl data lock */ ++ zfs_acl_t *z_acl_cached; /* cached acl */ ++ krwlock_t z_xattr_lock; /* xattr data lock */ ++ nvlist_t *z_xattr_cached;/* cached xattrs */ ++ list_node_t z_link_node; /* all znodes in fs link */ ++ sa_handle_t *z_sa_hdl; /* handle to sa data */ ++ boolean_t z_is_sa; /* are we native sa? */ ++ boolean_t z_is_zvol; /* are we used by the zvol */ ++ boolean_t z_is_mapped; /* are we mmap'ed */ ++ boolean_t z_is_ctldir; /* are we .zfs entry */ ++ struct inode z_inode; /* generic vfs inode */ ++} znode_t; ++ ++ ++/* ++ * Range locking rules ++ * -------------------- ++ * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole ++ * file range needs to be locked as RL_WRITER. Only then can the pages be ++ * freed etc and zp_size reset. zp_size must be set within range lock. ++ * 2. For writes and punching holes (zfs_write & zfs_space) just the range ++ * being written or freed needs to be locked as RL_WRITER. ++ * Multiple writes at the end of the file must coordinate zp_size updates ++ * to ensure data isn't lost. A compare and swap loop is currently used ++ * to ensure the file size is at least the offset last written. ++ * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being ++ * read needs to be locked as RL_READER. A check against zp_size can then ++ * be made for reading beyond end of file. ++ */ ++ ++/* ++ * Convert between znode pointers and inode pointers ++ */ ++#define ZTOI(znode) (&((znode)->z_inode)) ++#define ITOZ(inode) (container_of((inode), znode_t, z_inode)) ++#define ZTOZSB(znode) ((zfs_sb_t *)(ZTOI(znode)->i_sb->s_fs_info)) ++#define ITOZSB(inode) ((zfs_sb_t *)((inode)->i_sb->s_fs_info)) ++ ++#define S_ISDEV(mode) (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) ++ ++/* ++ * ZFS_ENTER() is called on entry to each ZFS inode and vfs operation. 
++ * ZFS_EXIT() must be called before exitting the vop. ++ * ZFS_VERIFY_ZP() verifies the znode is valid. ++ */ ++#define ZFS_ENTER(zsb) \ ++ { \ ++ rrw_enter(&(zsb)->z_teardown_lock, RW_READER, FTAG); \ ++ if ((zsb)->z_unmounted) { \ ++ ZFS_EXIT(zsb); \ ++ return (EIO); \ ++ } \ ++ } ++ ++#define ZFS_EXIT(zsb) \ ++ { \ ++ rrw_exit(&(zsb)->z_teardown_lock, FTAG); \ ++ tsd_exit(); \ ++ } ++ ++#define ZFS_VERIFY_ZP(zp) \ ++ if ((zp)->z_sa_hdl == NULL) { \ ++ ZFS_EXIT(ZTOZSB(zp)); \ ++ return (EIO); \ ++ } ++ ++/* ++ * Macros for dealing with dmu_buf_hold ++ */ ++#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) ++#define ZFS_OBJ_MUTEX(zsb, obj_num) \ ++ (&(zsb)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) ++#define ZFS_OBJ_HOLD_ENTER(zsb, obj_num) \ ++ mutex_enter(ZFS_OBJ_MUTEX((zsb), (obj_num))) ++#define ZFS_OBJ_HOLD_TRYENTER(zsb, obj_num) \ ++ mutex_tryenter(ZFS_OBJ_MUTEX((zsb), (obj_num))) ++#define ZFS_OBJ_HOLD_EXIT(zsb, obj_num) \ ++ mutex_exit(ZFS_OBJ_MUTEX((zsb), (obj_num))) ++#define ZFS_OBJ_HOLD_OWNED(zsb, obj_num) \ ++ mutex_owned(ZFS_OBJ_MUTEX((zsb), (obj_num))) ++ ++/* ++ * Macros to encode/decode ZFS stored time values from/to struct timespec ++ */ ++#define ZFS_TIME_ENCODE(tp, stmp) \ ++{ \ ++ (stmp)[0] = (uint64_t)(tp)->tv_sec; \ ++ (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ ++} ++ ++#define ZFS_TIME_DECODE(tp, stmp) \ ++{ \ ++ (tp)->tv_sec = (time_t)(stmp)[0]; \ ++ (tp)->tv_nsec = (long)(stmp)[1]; \ ++} ++ ++/* ++ * Timestamp defines ++ */ ++#define ACCESSED (ATTR_ATIME) ++#define STATE_CHANGED (ATTR_CTIME) ++#define CONTENT_MODIFIED (ATTR_MTIME | ATTR_CTIME) ++ ++#define ZFS_ACCESSTIME_STAMP(zsb, zp) \ ++ if ((zsb)->z_atime && !(zfs_is_readonly(zsb))) \ ++ zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE); ++ ++extern int zfs_init_fs(zfs_sb_t *, znode_t **); ++extern void zfs_set_dataprop(objset_t *); ++extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *, ++ dmu_tx_t *tx); ++extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2], ++ uint64_t [2], boolean_t); ++extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); ++extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); ++extern void zfs_znode_init(void); ++extern void zfs_znode_fini(void); ++extern int zfs_zget(zfs_sb_t *, uint64_t, znode_t **); ++extern int zfs_rezget(znode_t *); ++extern void zfs_zinactive(znode_t *); ++extern void zfs_znode_delete(znode_t *, dmu_tx_t *); ++extern void zfs_remove_op_tables(void); ++extern int zfs_create_op_tables(void); ++extern int zfs_sync(struct super_block *, int, cred_t *); ++extern dev_t zfs_cmpldev(uint64_t); ++extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); ++extern int zfs_get_stats(objset_t *os, nvlist_t *nv); ++extern void zfs_znode_dmu_fini(znode_t *); ++extern int zfs_inode_alloc(struct super_block *, struct inode **ip); ++extern void zfs_inode_destroy(struct inode *); ++extern void zfs_inode_update(znode_t *); ++ ++extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, ++ znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *, ++ vattr_t *vap); ++extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, ++ vattr_t *vap); ++extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, ++ znode_t *dzp, char *name, uint64_t foid); ++#define ZFS_NO_OBJECT 0 /* no object id */ ++extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, ++ znode_t *dzp, znode_t *zp, char *name); ++extern void zfs_log_symlink(zilog_t *zilog, 
dmu_tx_t *tx, uint64_t txtype, ++ znode_t *dzp, znode_t *zp, char *name, char *link); ++extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, ++ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); ++extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, ++ znode_t *zp, offset_t off, ssize_t len, int ioflag); ++extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, ++ znode_t *zp, uint64_t off, uint64_t len); ++extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, ++ znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); ++extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, ++ vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); ++extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx); ++extern void zfs_upgrade(zfs_sb_t *zsb, dmu_tx_t *tx); ++extern int zfs_create_share_dir(zfs_sb_t *zsb, dmu_tx_t *tx); ++ ++#if defined(HAVE_UIO_RW) ++extern caddr_t zfs_map_page(page_t *, enum seg_rw); ++extern void zfs_unmap_page(page_t *, caddr_t); ++#endif /* HAVE_UIO_RW */ ++ ++extern zil_get_data_t zfs_get_data; ++extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; ++extern int zfsfstype; ++ ++#endif /* _KERNEL */ ++ ++extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_FS_ZFS_ZNODE_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zil.h linux-3.2.33-go/include/zfs/sys/zil.h +--- linux-3.2.33-go.orig/include/zfs/sys/zil.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zil.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,487 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* Portions Copyright 2010 Robert Milkowski */ ++ ++#ifndef _SYS_ZIL_H ++#define _SYS_ZIL_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * Intent log format: ++ * ++ * Each objset has its own intent log. The log header (zil_header_t) ++ * for objset N's intent log is kept in the Nth object of the SPA's ++ * intent_log objset. The log header points to a chain of log blocks, ++ * each of which contains log records (i.e., transactions) followed by ++ * a log block trailer (zil_trailer_t). The format of a log record ++ * depends on the record (or transaction) type, but all records begin ++ * with a common structure that defines the type, length, and txg. ++ */ ++ ++/* ++ * Intent log header - this on disk structure holds fields to manage ++ * the log. All fields are 64 bit to easily handle cross architectures. 
++ */ ++typedef struct zil_header { ++ uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ ++ uint64_t zh_replay_seq; /* highest replayed sequence number */ ++ blkptr_t zh_log; /* log chain */ ++ uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */ ++ uint64_t zh_flags; /* header flags */ ++ uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */ ++ uint64_t zh_pad[3]; ++} zil_header_t; ++ ++/* ++ * zh_flags bit settings ++ */ ++#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */ ++#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */ ++ ++/* ++ * Log block chaining. ++ * ++ * Log blocks are chained together. Originally they were chained at the ++ * end of the block. For performance reasons the chain was moved to the ++ * beginning of the block which allows writes for only the data being used. ++ * The older position is supported for backwards compatability. ++ * ++ * The zio_eck_t contains a zec_cksum which for the intent log is ++ * the sequence number of this log block. A seq of 0 is invalid. ++ * The zec_cksum is checked by the SPA against the sequence ++ * number passed in the blk_cksum field of the blkptr_t ++ */ ++typedef struct zil_chain { ++ uint64_t zc_pad; ++ blkptr_t zc_next_blk; /* next block in chain */ ++ uint64_t zc_nused; /* bytes in log block used */ ++ zio_eck_t zc_eck; /* block trailer */ ++} zil_chain_t; ++ ++#define ZIL_MIN_BLKSZ 4096ULL ++#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE ++ ++/* ++ * The words of a log block checksum. ++ */ ++#define ZIL_ZC_GUID_0 0 ++#define ZIL_ZC_GUID_1 1 ++#define ZIL_ZC_OBJSET 2 ++#define ZIL_ZC_SEQ 3 ++ ++typedef enum zil_create { ++ Z_FILE, ++ Z_DIR, ++ Z_XATTRDIR, ++} zil_create_t; ++ ++/* ++ * size of xvattr log section. ++ * its composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps ++ * for create time and a single 64 bit integer for all of the attributes, ++ * and 4 64 bit integers (32 bytes) for the scanstamp. ++ * ++ */ ++ ++#define ZIL_XVAT_SIZE(mapsize) \ ++ sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \ ++ (sizeof (uint64_t) * 7) ++ ++/* ++ * Size of ACL in log. The ACE data is padded out to properly align ++ * on 8 byte boundary. 
++ */ ++ ++#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t))) ++ ++/* ++ * Intent log transaction types and record structures ++ */ ++#define TX_CREATE 1 /* Create file */ ++#define TX_MKDIR 2 /* Make directory */ ++#define TX_MKXATTR 3 /* Make XATTR directory */ ++#define TX_SYMLINK 4 /* Create symbolic link to a file */ ++#define TX_REMOVE 5 /* Remove file */ ++#define TX_RMDIR 6 /* Remove directory */ ++#define TX_LINK 7 /* Create hard link to a file */ ++#define TX_RENAME 8 /* Rename a file */ ++#define TX_WRITE 9 /* File write */ ++#define TX_TRUNCATE 10 /* Truncate a file */ ++#define TX_SETATTR 11 /* Set file attributes */ ++#define TX_ACL_V0 12 /* Set old formatted ACL */ ++#define TX_ACL 13 /* Set ACL */ ++#define TX_CREATE_ACL 14 /* create with ACL */ ++#define TX_CREATE_ATTR 15 /* create + attrs */ ++#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */ ++#define TX_MKDIR_ACL 17 /* mkdir with ACL */ ++#define TX_MKDIR_ATTR 18 /* mkdir with attr */ ++#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ ++#define TX_WRITE2 20 /* dmu_sync EALREADY write */ ++#define TX_MAX_TYPE 21 /* Max transaction type */ ++ ++/* ++ * The transactions for mkdir, symlink, remove, rmdir, link, and rename ++ * may have the following bit set, indicating the original request ++ * specified case-insensitive handling of names. ++ */ ++#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ ++ ++/* ++ * Transactions for write, truncate, setattr, acl_v0, and acl can be logged ++ * out of order. For convenience in the code, all such records must have ++ * lr_foid at the same offset. ++ */ ++#define TX_OOO(txtype) \ ++ ((txtype) == TX_WRITE || \ ++ (txtype) == TX_TRUNCATE || \ ++ (txtype) == TX_SETATTR || \ ++ (txtype) == TX_ACL_V0 || \ ++ (txtype) == TX_ACL || \ ++ (txtype) == TX_WRITE2) ++ ++/* ++ * Format of log records. ++ * The fields are carefully defined to allow them to be aligned ++ * and sized the same on sparc & intel architectures. ++ * Each log record has a common structure at the beginning. ++ * ++ * The log record on disk (lrc_seq) holds the sequence number of all log ++ * records which is used to ensure we don't replay the same record. ++ */ ++typedef struct { /* common log record header */ ++ uint64_t lrc_txtype; /* intent log transaction type */ ++ uint64_t lrc_reclen; /* transaction record length */ ++ uint64_t lrc_txg; /* dmu transaction group number */ ++ uint64_t lrc_seq; /* see comment above */ ++} lr_t; ++ ++/* ++ * Common start of all out-of-order record types (TX_OOO() above). ++ */ ++typedef struct { ++ lr_t lr_common; /* common portion of log record */ ++ uint64_t lr_foid; /* object id */ ++} lr_ooo_t; ++ ++/* ++ * Handle option extended vattr attributes. ++ * ++ * Whenever new attributes are added the version number ++ * will need to be updated as will code in ++ * zfs_log.c and zfs_replay.c ++ */ ++typedef struct { ++ uint32_t lr_attr_masksize; /* number of elements in array */ ++ uint32_t lr_attr_bitmap; /* First entry of array */ ++ /* remainder of array and any additional fields */ ++} lr_attr_t; ++ ++/* ++ * log record for creates without optional ACL. ++ * This log record does support optional xvattr_t attributes. 
++ */ ++typedef struct { ++ lr_t lr_common; /* common portion of log record */ ++ uint64_t lr_doid; /* object id of directory */ ++ uint64_t lr_foid; /* object id of created file object */ ++ uint64_t lr_mode; /* mode of object */ ++ uint64_t lr_uid; /* uid of object */ ++ uint64_t lr_gid; /* gid of object */ ++ uint64_t lr_gen; /* generation (txg of creation) */ ++ uint64_t lr_crtime[2]; /* creation time */ ++ uint64_t lr_rdev; /* rdev of object to create */ ++ /* name of object to create follows this */ ++ /* for symlinks, link content follows name */ ++ /* for creates with xvattr data, the name follows the xvattr info */ ++} lr_create_t; ++ ++/* ++ * FUID ACL record will be an array of ACEs from the original ACL. ++ * If this array includes ephemeral IDs, the record will also include ++ * an array of log-specific FUIDs to replace the ephemeral IDs. ++ * Only one copy of each unique domain will be present, so the log-specific ++ * FUIDs will use an index into a compressed domain table. On replay this ++ * information will be used to construct real FUIDs (and bypass idmap, ++ * since it may not be available). ++ */ ++ ++/* ++ * Log record for creates with optional ACL ++ * This log record is also used for recording any FUID ++ * information needed for replaying the create. If the ++ * file doesn't have any actual ACEs then the lr_aclcnt ++ * would be zero. ++ */ ++typedef struct { ++ lr_create_t lr_create; /* common create portion */ ++ uint64_t lr_aclcnt; /* number of ACEs in ACL */ ++ uint64_t lr_domcnt; /* number of unique domains */ ++ uint64_t lr_fuidcnt; /* number of real fuids */ ++ uint64_t lr_acl_bytes; /* number of bytes in ACL */ ++ uint64_t lr_acl_flags; /* ACL flags */ ++ /* lr_acl_bytes number of variable sized ace's follows */ ++ /* if create is also setting xvattr's, then acl data follows xvattr */ ++ /* if ACE FUIDs are needed then they will follow the xvattr_t */ ++ /* Following the FUIDs will be the domain table information. */ ++ /* The FUIDs for the owner and group will be in the lr_create */ ++ /* portion of the record. 
*/ ++ /* name follows ACL data */ ++} lr_acl_create_t; ++ ++typedef struct { ++ lr_t lr_common; /* common portion of log record */ ++ uint64_t lr_doid; /* obj id of directory */ ++ /* name of object to remove follows this */ ++} lr_remove_t; ++ ++typedef struct { ++ lr_t lr_common; /* common portion of log record */ ++ uint64_t lr_doid; /* obj id of directory */ ++ uint64_t lr_link_obj; /* obj id of link */ ++ /* name of object to link follows this */ ++} lr_link_t; ++ ++typedef struct { ++ lr_t lr_common; /* common portion of log record */ ++ uint64_t lr_sdoid; /* obj id of source directory */ ++ uint64_t lr_tdoid; /* obj id of target directory */ ++ /* 2 strings: names of source and destination follow this */ ++} lr_rename_t; ++ ++typedef struct { ++ lr_t lr_common; /* common portion of log record */ ++ uint64_t lr_foid; /* file object to write */ ++ uint64_t lr_offset; /* offset to write to */ ++ uint64_t lr_length; /* user data length to write */ ++ uint64_t lr_blkoff; /* no longer used */ ++ blkptr_t lr_blkptr; /* spa block pointer for replay */ ++ /* write data will follow for small writes */ ++} lr_write_t; ++ ++typedef struct { ++ lr_t lr_common; /* common portion of log record */ ++ uint64_t lr_foid; /* object id of file to truncate */ ++ uint64_t lr_offset; /* offset to truncate from */ ++ uint64_t lr_length; /* length to truncate */ ++} lr_truncate_t; ++ ++typedef struct { ++ lr_t lr_common; /* common portion of log record */ ++ uint64_t lr_foid; /* file object to change attributes */ ++ uint64_t lr_mask; /* mask of attributes to set */ ++ uint64_t lr_mode; /* mode to set */ ++ uint64_t lr_uid; /* uid to set */ ++ uint64_t lr_gid; /* gid to set */ ++ uint64_t lr_size; /* size to set */ ++ uint64_t lr_atime[2]; /* access time */ ++ uint64_t lr_mtime[2]; /* modification time */ ++ /* optional attribute lr_attr_t may be here */ ++} lr_setattr_t; ++ ++typedef struct { ++ lr_t lr_common; /* common portion of log record */ ++ uint64_t lr_foid; /* obj id of file */ ++ uint64_t lr_aclcnt; /* number of acl entries */ ++ /* lr_aclcnt number of ace_t entries follow this */ ++} lr_acl_v0_t; ++ ++typedef struct { ++ lr_t lr_common; /* common portion of log record */ ++ uint64_t lr_foid; /* obj id of file */ ++ uint64_t lr_aclcnt; /* number of ACEs in ACL */ ++ uint64_t lr_domcnt; /* number of unique domains */ ++ uint64_t lr_fuidcnt; /* number of real fuids */ ++ uint64_t lr_acl_bytes; /* number of bytes in ACL */ ++ uint64_t lr_acl_flags; /* ACL flags */ ++ /* lr_acl_bytes number of variable sized ace's follows */ ++} lr_acl_t; ++ ++/* ++ * ZIL structure definitions, interface function prototype and globals. ++ */ ++ ++/* ++ * Writes are handled in three different ways: ++ * ++ * WR_INDIRECT: ++ * In this mode, if we need to commit the write later, then the block ++ * is immediately written into the file system (using dmu_sync), ++ * and a pointer to the block is put into the log record. ++ * When the txg commits the block is linked in. ++ * This saves additionally writing the data into the log record. ++ * There are a few requirements for this to occur: ++ * - write is greater than zfs/zvol_immediate_write_sz ++ * - not using slogs (as slogs are assumed to always be faster ++ * than writing into the main pool) ++ * - the write occupies only one block ++ * WR_COPIED: ++ * If we know we'll immediately be committing the ++ * transaction (FSYNC or FDSYNC), the we allocate a larger ++ * log record here for the data and copy the data in. 
++ * WR_NEED_COPY: ++ * Otherwise we don't allocate a buffer, and *if* we need to ++ * flush the write later then a buffer is allocated and ++ * we retrieve the data using the dmu. ++ */ ++typedef enum { ++ WR_INDIRECT, /* indirect - a large write (dmu_sync() data */ ++ /* and put blkptr in log, rather than actual data) */ ++ WR_COPIED, /* immediate - data is copied into lr_write_t */ ++ WR_NEED_COPY, /* immediate - data needs to be copied if pushed */ ++ WR_NUM_STATES /* number of states */ ++} itx_wr_state_t; ++ ++typedef struct itx { ++ list_node_t itx_node; /* linkage on zl_itx_list */ ++ void *itx_private; /* type-specific opaque data */ ++ itx_wr_state_t itx_wr_state; /* write state */ ++ uint8_t itx_sync; /* synchronous transaction */ ++ uint64_t itx_sod; /* record size on disk */ ++ uint64_t itx_oid; /* object id */ ++ lr_t itx_lr; /* common part of log record */ ++ /* followed by type-specific part of lr_xx_t and its immediate data */ ++} itx_t; ++ ++/* ++ * Used for zil kstat. ++ */ ++typedef struct zil_stats { ++ /* ++ * Number of times a ZIL commit (e.g. fsync) has been requested. ++ */ ++ kstat_named_t zil_commit_count; ++ ++ /* ++ * Number of times the ZIL has been flushed to stable storage. ++ * This is less than zil_commit_count when commits are "merged" ++ * (see the documentation above zil_commit()). ++ */ ++ kstat_named_t zil_commit_writer_count; ++ ++ /* ++ * Number of transactions (reads, writes, renames, etc.) ++ * that have been commited. ++ */ ++ kstat_named_t zil_itx_count; ++ ++ /* ++ * See the documentation for itx_wr_state_t above. ++ * Note that "bytes" accumulates the length of the transactions ++ * (i.e. data), not the actual log record sizes. ++ */ ++ kstat_named_t zil_itx_indirect_count; ++ kstat_named_t zil_itx_indirect_bytes; ++ kstat_named_t zil_itx_copied_count; ++ kstat_named_t zil_itx_copied_bytes; ++ kstat_named_t zil_itx_needcopy_count; ++ kstat_named_t zil_itx_needcopy_bytes; ++ ++ /* ++ * Transactions which have been allocated to the "normal" ++ * (i.e. not slog) storage pool. Note that "bytes" accumulate ++ * the actual log record sizes - which do not include the actual ++ * data in case of indirect writes. ++ */ ++ kstat_named_t zil_itx_metaslab_normal_count; ++ kstat_named_t zil_itx_metaslab_normal_bytes; ++ ++ /* ++ * Transactions which have been allocated to the "slog" storage pool. ++ * If there are no separate log devices, this is the same as the ++ * "normal" pool. 
++ */ ++ kstat_named_t zil_itx_metaslab_slog_count; ++ kstat_named_t zil_itx_metaslab_slog_bytes; ++} zil_stats_t; ++ ++extern zil_stats_t zil_stats; ++ ++#define ZIL_STAT_INCR(stat, val) \ ++ atomic_add_64(&zil_stats.stat.value.ui64, (val)); ++#define ZIL_STAT_BUMP(stat) \ ++ ZIL_STAT_INCR(stat, 1); ++ ++typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, ++ uint64_t txg); ++typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, ++ uint64_t txg); ++typedef int zil_replay_func_t(void *, char *, boolean_t); ++typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); ++ ++extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, ++ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); ++ ++extern void zil_init(void); ++extern void zil_fini(void); ++ ++extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys); ++extern void zil_free(zilog_t *zilog); ++ ++extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); ++extern void zil_close(zilog_t *zilog); ++ ++extern void zil_replay(objset_t *os, void *arg, ++ zil_replay_func_t *replay_func[TX_MAX_TYPE]); ++extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx); ++extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); ++extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); ++ ++extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); ++extern void zil_itx_destroy(itx_t *itx); ++extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); ++ ++extern void zil_commit(zilog_t *zilog, uint64_t oid); ++ ++extern int zil_vdev_offline(const char *osname, void *txarg); ++extern int zil_claim(const char *osname, void *txarg); ++extern int zil_check_log_chain(const char *osname, void *txarg); ++extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); ++extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); ++ ++extern int zil_suspend(zilog_t *zilog); ++extern void zil_resume(zilog_t *zilog); ++ ++extern void zil_add_block(zilog_t *zilog, const blkptr_t *bp); ++extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); ++ ++extern void zil_set_sync(zilog_t *zilog, uint64_t syncval); ++ ++extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); ++ ++extern int zil_replay_disable; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_ZIL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zil_impl.h linux-3.2.33-go/include/zfs/sys/zil_impl.h +--- linux-3.2.33-go.orig/include/zfs/sys/zil_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zil_impl.h 2012-11-16 23:25:34.339039449 +0100 +@@ -0,0 +1,148 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++/* Portions Copyright 2010 Robert Milkowski */ ++ ++#ifndef _SYS_ZIL_IMPL_H ++#define _SYS_ZIL_IMPL_H ++ ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * Log write buffer. ++ */ ++typedef struct lwb { ++ zilog_t *lwb_zilog; /* back pointer to log struct */ ++ blkptr_t lwb_blk; /* on disk address of this log blk */ ++ boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */ ++ int lwb_nused; /* # used bytes in buffer */ ++ int lwb_sz; /* size of block and buffer */ ++ char *lwb_buf; /* log write buffer */ ++ zio_t *lwb_zio; /* zio for this buffer */ ++ dmu_tx_t *lwb_tx; /* tx for log block allocation */ ++ uint64_t lwb_max_txg; /* highest txg in this lwb */ ++ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ ++} lwb_t; ++ ++/* ++ * Intent log transaction lists ++ */ ++typedef struct itxs { ++ list_t i_sync_list; /* list of synchronous itxs */ ++ avl_tree_t i_async_tree; /* tree of foids for async itxs */ ++} itxs_t; ++ ++typedef struct itxg { ++ kmutex_t itxg_lock; /* lock for this structure */ ++ uint64_t itxg_txg; /* txg for this chain */ ++ uint64_t itxg_sod; /* total size on disk for this txg */ ++ itxs_t *itxg_itxs; /* sync and async itxs */ ++} itxg_t; ++ ++/* for async nodes we build up an AVL tree of lists of async itxs per file */ ++typedef struct itx_async_node { ++ uint64_t ia_foid; /* file object id */ ++ list_t ia_list; /* list of async itxs for this foid */ ++ avl_node_t ia_node; /* AVL tree linkage */ ++} itx_async_node_t; ++ ++/* ++ * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs ++ * we've touched so we know which ones need a write cache flush at the end. ++ */ ++typedef struct zil_vdev_node { ++ uint64_t zv_vdev; /* vdev to be flushed */ ++ avl_node_t zv_node; /* AVL tree linkage */ ++} zil_vdev_node_t; ++ ++#define ZIL_PREV_BLKS 16 ++ ++/* ++ * Stable storage intent log management structure. One per dataset. 
++ */ ++struct zilog { ++ kmutex_t zl_lock; /* protects most zilog_t fields */ ++ struct dsl_pool *zl_dmu_pool; /* DSL pool */ ++ spa_t *zl_spa; /* handle for read/write log */ ++ const zil_header_t *zl_header; /* log header buffer */ ++ objset_t *zl_os; /* object set we're logging */ ++ zil_get_data_t *zl_get_data; /* callback to get object content */ ++ zio_t *zl_root_zio; /* log writer root zio */ ++ uint64_t zl_lr_seq; /* on-disk log record sequence number */ ++ uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */ ++ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ ++ uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ ++ uint64_t zl_replaying_seq; /* current replay seq number */ ++ uint32_t zl_suspend; /* log suspend count */ ++ kcondvar_t zl_cv_writer; /* log writer thread completion */ ++ kcondvar_t zl_cv_suspend; /* log suspend completion */ ++ uint8_t zl_suspending; /* log is currently suspending */ ++ uint8_t zl_keep_first; /* keep first log block in destroy */ ++ uint8_t zl_replay; /* replaying records while set */ ++ uint8_t zl_stop_sync; /* for debugging */ ++ uint8_t zl_writer; /* boolean: write setup in progress */ ++ uint8_t zl_logbias; /* latency or throughput */ ++ uint8_t zl_sync; /* synchronous or asynchronous */ ++ int zl_parse_error; /* last zil_parse() error */ ++ uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */ ++ uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */ ++ uint64_t zl_parse_blk_count; /* number of blocks parsed */ ++ uint64_t zl_parse_lr_count; /* number of log records parsed */ ++ uint64_t zl_next_batch; /* next batch number */ ++ uint64_t zl_com_batch; /* committed batch number */ ++ kcondvar_t zl_cv_batch[2]; /* batch condition variables */ ++ itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */ ++ list_t zl_itx_commit_list; /* itx list to be committed */ ++ uint64_t zl_itx_list_sz; /* total size of records on list */ ++ uint64_t zl_cur_used; /* current commit log size used */ ++ list_t zl_lwb_list; /* in-flight log write list */ ++ kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */ ++ avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */ ++ taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */ ++ avl_tree_t zl_bp_tree; /* track bps during log parse */ ++ clock_t zl_replay_time; /* lbolt of when replay started */ ++ uint64_t zl_replay_blks; /* number of log blocks replayed */ ++ zil_header_t zl_old_header; /* debugging aid */ ++ uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ ++ uint_t zl_prev_rotor; /* rotor for zl_prev[] */ ++}; ++ ++typedef struct zil_bp_node { ++ dva_t zn_dva; ++ avl_node_t zn_node; ++} zil_bp_node_t; ++ ++#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \ ++ sizeof (lr_write_t)) ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_ZIL_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zio_checksum.h linux-3.2.33-go/include/zfs/sys/zio_checksum.h +--- linux-3.2.33-go.orig/include/zfs/sys/zio_checksum.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zio_checksum.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,75 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. 
++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_ZIO_CHECKSUM_H ++#define _SYS_ZIO_CHECKSUM_H ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * Signature for checksum functions. ++ */ ++typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); ++ ++/* ++ * Information about each checksum function. ++ */ ++typedef struct zio_checksum_info { ++ zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ ++ int ci_correctable; /* number of correctable bits */ ++ int ci_eck; /* uses zio embedded checksum? */ ++ int ci_dedup; /* strong enough for dedup? */ ++ char *ci_name; /* descriptive name */ ++} zio_checksum_info_t; ++ ++typedef struct zio_bad_cksum { ++ zio_cksum_t zbc_expected; ++ zio_cksum_t zbc_actual; ++ const char *zbc_checksum_name; ++ uint8_t zbc_byteswapped; ++ uint8_t zbc_injected; ++ uint8_t zbc_has_cksum; /* expected/actual valid */ ++} zio_bad_cksum_t; ++ ++extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; ++ ++/* ++ * Checksum routines. ++ */ ++extern zio_checksum_t zio_checksum_SHA256; ++ ++extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, ++ void *data, uint64_t size); ++extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); ++extern enum zio_checksum spa_dedup_checksum(spa_t *spa); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_ZIO_CHECKSUM_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zio_compress.h linux-3.2.33-go/include/zfs/sys/zio_compress.h +--- linux-3.2.33-go.orig/include/zfs/sys/zio_compress.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zio_compress.h 2012-11-16 23:25:34.343039404 +0100 +@@ -0,0 +1,84 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _SYS_ZIO_COMPRESS_H ++#define _SYS_ZIO_COMPRESS_H ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * Common signature for all zio compress/decompress functions. 
++ */ ++typedef size_t zio_compress_func_t(void *src, void *dst, ++ size_t s_len, size_t d_len, int); ++typedef int zio_decompress_func_t(void *src, void *dst, ++ size_t s_len, size_t d_len, int); ++ ++/* ++ * Information about each compression function. ++ */ ++typedef struct zio_compress_info { ++ zio_compress_func_t *ci_compress; /* compression function */ ++ zio_decompress_func_t *ci_decompress; /* decompression function */ ++ int ci_level; /* level parameter */ ++ char *ci_name; /* algorithm name */ ++} zio_compress_info_t; ++ ++extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; ++ ++/* ++ * Compression routines. ++ */ ++extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len, ++ int level); ++extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len, ++ int level); ++extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, ++ int level); ++extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, ++ int level); ++extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len, ++ int level); ++extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, ++ int level); ++ ++/* ++ * Compress and decompress data if necessary. ++ */ ++extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst, ++ size_t s_len); ++extern int zio_decompress_data(enum zio_compress c, void *src, void *dst, ++ size_t s_len, size_t d_len); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_ZIO_COMPRESS_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zio.h linux-3.2.33-go/include/zfs/sys/zio.h +--- linux-3.2.33-go.orig/include/zfs/sys/zio.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zio.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,574 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++/* ++ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. ++ */ ++ ++#ifndef _ZIO_H ++#define _ZIO_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * Embedded checksum ++ */ ++#define ZEC_MAGIC 0x210da7ab10c7a11ULL ++ ++typedef struct zio_eck { ++ uint64_t zec_magic; /* for validation, endianness */ ++ zio_cksum_t zec_cksum; /* 256-bit checksum */ ++} zio_eck_t; ++ ++/* ++ * Gang block headers are self-checksumming and contain an array ++ * of block pointers. 
++ */ ++#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE ++#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ ++ sizeof (zio_eck_t)) / sizeof (blkptr_t)) ++#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ ++ sizeof (zio_eck_t) - \ ++ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ ++ sizeof (uint64_t)) ++ ++typedef struct zio_gbh { ++ blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; ++ uint64_t zg_filler[SPA_GBH_FILLER]; ++ zio_eck_t zg_tail; ++} zio_gbh_phys_t; ++ ++enum zio_checksum { ++ ZIO_CHECKSUM_INHERIT = 0, ++ ZIO_CHECKSUM_ON, ++ ZIO_CHECKSUM_OFF, ++ ZIO_CHECKSUM_LABEL, ++ ZIO_CHECKSUM_GANG_HEADER, ++ ZIO_CHECKSUM_ZILOG, ++ ZIO_CHECKSUM_FLETCHER_2, ++ ZIO_CHECKSUM_FLETCHER_4, ++ ZIO_CHECKSUM_SHA256, ++ ZIO_CHECKSUM_ZILOG2, ++ ZIO_CHECKSUM_FUNCTIONS ++}; ++ ++#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 ++#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON ++ ++#define ZIO_CHECKSUM_MASK 0xffULL ++#define ZIO_CHECKSUM_VERIFY (1 << 8) ++ ++#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 ++#define ZIO_DEDUPDITTO_MIN 100 ++ ++enum zio_compress { ++ ZIO_COMPRESS_INHERIT = 0, ++ ZIO_COMPRESS_ON, ++ ZIO_COMPRESS_OFF, ++ ZIO_COMPRESS_LZJB, ++ ZIO_COMPRESS_EMPTY, ++ ZIO_COMPRESS_GZIP_1, ++ ZIO_COMPRESS_GZIP_2, ++ ZIO_COMPRESS_GZIP_3, ++ ZIO_COMPRESS_GZIP_4, ++ ZIO_COMPRESS_GZIP_5, ++ ZIO_COMPRESS_GZIP_6, ++ ZIO_COMPRESS_GZIP_7, ++ ZIO_COMPRESS_GZIP_8, ++ ZIO_COMPRESS_GZIP_9, ++ ZIO_COMPRESS_ZLE, ++ ZIO_COMPRESS_FUNCTIONS ++}; ++ ++#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB ++#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF ++ ++#define BOOTFS_COMPRESS_VALID(compress) \ ++ ((compress) == ZIO_COMPRESS_LZJB || \ ++ ((compress) == ZIO_COMPRESS_ON && \ ++ ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ ++ (compress) == ZIO_COMPRESS_OFF) ++ ++/* ++ * Default Linux timeout for a sd device. ++ */ ++#define ZIO_DELAY_MAX (30 * MILLISEC) ++ ++#define ZIO_FAILURE_MODE_WAIT 0 ++#define ZIO_FAILURE_MODE_CONTINUE 1 ++#define ZIO_FAILURE_MODE_PANIC 2 ++ ++#define ZIO_PRIORITY_NOW (zio_priority_table[0]) ++#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) ++#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) ++#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3]) ++#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4]) ++#define ZIO_PRIORITY_AGG (zio_priority_table[5]) ++#define ZIO_PRIORITY_FREE (zio_priority_table[6]) ++#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7]) ++#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8]) ++#define ZIO_PRIORITY_RESILVER (zio_priority_table[9]) ++#define ZIO_PRIORITY_SCRUB (zio_priority_table[10]) ++#define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11]) ++#define ZIO_PRIORITY_TABLE_SIZE 12 ++ ++#define ZIO_PIPELINE_CONTINUE 0x100 ++#define ZIO_PIPELINE_STOP 0x101 ++ ++enum zio_flag { ++ /* ++ * Flags inherited by gang, ddt, and vdev children, ++ * and that must be equal for two zios to aggregate ++ */ ++ ZIO_FLAG_DONT_AGGREGATE = 1 << 0, ++ ZIO_FLAG_IO_REPAIR = 1 << 1, ++ ZIO_FLAG_SELF_HEAL = 1 << 2, ++ ZIO_FLAG_RESILVER = 1 << 3, ++ ZIO_FLAG_SCRUB = 1 << 4, ++ ZIO_FLAG_SCAN_THREAD = 1 << 5, ++ ++#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) ++ ++ /* ++ * Flags inherited by ddt, gang, and vdev children. 
++ */ ++ ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */ ++ ZIO_FLAG_SPECULATIVE = 1 << 7, ++ ZIO_FLAG_CONFIG_WRITER = 1 << 8, ++ ZIO_FLAG_DONT_RETRY = 1 << 9, ++ ZIO_FLAG_DONT_CACHE = 1 << 10, ++ ZIO_FLAG_NODATA = 1 << 11, ++ ZIO_FLAG_INDUCE_DAMAGE = 1 << 12, ++ ++#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) ++#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) ++ ++ /* ++ * Flags inherited by vdev children. ++ */ ++ ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */ ++ ZIO_FLAG_PROBE = 1 << 14, ++ ZIO_FLAG_TRYHARD = 1 << 15, ++ ZIO_FLAG_OPTIONAL = 1 << 16, ++ ++#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) ++ ++ /* ++ * Flags not inherited by any children. ++ */ ++ ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */ ++ ZIO_FLAG_DONT_PROPAGATE = 1 << 18, ++ ZIO_FLAG_IO_BYPASS = 1 << 19, ++ ZIO_FLAG_IO_REWRITE = 1 << 20, ++ ZIO_FLAG_RAW = 1 << 21, ++ ZIO_FLAG_GANG_CHILD = 1 << 22, ++ ZIO_FLAG_DDT_CHILD = 1 << 23, ++ ZIO_FLAG_GODFATHER = 1 << 24, ++ ZIO_FLAG_FASTWRITE = 1 << 25 ++}; ++ ++#define ZIO_FLAG_MUSTSUCCEED 0 ++ ++#define ZIO_DDT_CHILD_FLAGS(zio) \ ++ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ ++ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) ++ ++#define ZIO_GANG_CHILD_FLAGS(zio) \ ++ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ ++ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) ++ ++#define ZIO_VDEV_CHILD_FLAGS(zio) \ ++ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ ++ ZIO_FLAG_CANFAIL) ++ ++enum zio_child { ++ ZIO_CHILD_VDEV = 0, ++ ZIO_CHILD_GANG, ++ ZIO_CHILD_DDT, ++ ZIO_CHILD_LOGICAL, ++ ZIO_CHILD_TYPES ++}; ++ ++enum zio_wait_type { ++ ZIO_WAIT_READY = 0, ++ ZIO_WAIT_DONE, ++ ZIO_WAIT_TYPES ++}; ++ ++/* ++ * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent ++ * graveyard) to indicate checksum errors and fragmentation. ++ */ ++#define ECKSUM EBADE ++#define EFRAGS EBADR ++ ++typedef void zio_done_func_t(zio_t *zio); ++ ++extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; ++extern char *zio_type_name[ZIO_TYPES]; ++ ++/* ++ * A bookmark is a four-tuple that uniquely ++ * identifies any block in the pool. By convention, the meta-objset (MOS) ++ * is objset 0, and the meta-dnode is object 0. This covers all blocks ++ * except root blocks and ZIL blocks, which are defined as follows: ++ * ++ * Root blocks (objset_phys_t) are object 0, level -1: . ++ * ZIL blocks are bookmarked . ++ * dmu_sync()ed ZIL data blocks are bookmarked . ++ * ++ * Note: this structure is called a bookmark because its original purpose ++ * was to remember where to resume a pool-wide traverse. ++ * ++ * Note: this structure is passed between userland and the kernel. ++ * Therefore it must not change size or alignment between 32/64 bit ++ * compilation options. 
++ */ ++typedef struct zbookmark { ++ uint64_t zb_objset; ++ uint64_t zb_object; ++ int64_t zb_level; ++ uint64_t zb_blkid; ++} zbookmark_t; ++ ++#define SET_BOOKMARK(zb, objset, object, level, blkid) \ ++{ \ ++ (zb)->zb_objset = objset; \ ++ (zb)->zb_object = object; \ ++ (zb)->zb_level = level; \ ++ (zb)->zb_blkid = blkid; \ ++} ++ ++#define ZB_DESTROYED_OBJSET (-1ULL) ++ ++#define ZB_ROOT_OBJECT (0ULL) ++#define ZB_ROOT_LEVEL (-1LL) ++#define ZB_ROOT_BLKID (0ULL) ++ ++#define ZB_ZIL_OBJECT (0ULL) ++#define ZB_ZIL_LEVEL (-2LL) ++ ++typedef struct zio_prop { ++ enum zio_checksum zp_checksum; ++ enum zio_compress zp_compress; ++ dmu_object_type_t zp_type; ++ uint8_t zp_level; ++ uint8_t zp_copies; ++ uint8_t zp_dedup; ++ uint8_t zp_dedup_verify; ++} zio_prop_t; ++ ++typedef struct zio_cksum_report zio_cksum_report_t; ++ ++typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, ++ const void *good_data); ++typedef void zio_cksum_free_f(void *cbdata, size_t size); ++ ++struct zio_bad_cksum; /* defined in zio_checksum.h */ ++ ++struct zio_cksum_report { ++ struct zio_cksum_report *zcr_next; ++ nvlist_t *zcr_ereport; ++ nvlist_t *zcr_detector; ++ void *zcr_cbdata; ++ size_t zcr_cbinfo; /* passed to zcr_free() */ ++ uint64_t zcr_align; ++ uint64_t zcr_length; ++ zio_cksum_finish_f *zcr_finish; ++ zio_cksum_free_f *zcr_free; ++ ++ /* internal use only */ ++ struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ ++}; ++ ++typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr, ++ void *arg); ++ ++zio_vsd_cksum_report_f zio_vsd_default_cksum_report; ++ ++typedef struct zio_vsd_ops { ++ zio_done_func_t *vsd_free; ++ zio_vsd_cksum_report_f *vsd_cksum_report; ++} zio_vsd_ops_t; ++ ++typedef struct zio_gang_node { ++ zio_gbh_phys_t *gn_gbh; ++ struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; ++} zio_gang_node_t; ++ ++typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, ++ zio_gang_node_t *gn, void *data); ++ ++typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); ++ ++typedef struct zio_transform { ++ void *zt_orig_data; ++ uint64_t zt_orig_size; ++ uint64_t zt_bufsize; ++ zio_transform_func_t *zt_transform; ++ struct zio_transform *zt_next; ++} zio_transform_t; ++ ++typedef int zio_pipe_stage_t(zio_t *zio); ++ ++/* ++ * The io_reexecute flags are distinct from io_flags because the child must ++ * be able to propagate them to the parent. The normal io_flags are local ++ * to the zio, not protected by any lock, and not modifiable by children; ++ * the reexecute flags are protected by io_lock, modifiable by children, ++ * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. 
++ */ ++#define ZIO_REEXECUTE_NOW 0x01 ++#define ZIO_REEXECUTE_SUSPEND 0x02 ++ ++typedef struct zio_link { ++ zio_t *zl_parent; ++ zio_t *zl_child; ++ list_node_t zl_parent_node; ++ list_node_t zl_child_node; ++} zio_link_t; ++ ++struct zio { ++ /* Core information about this I/O */ ++ zbookmark_t io_bookmark; ++ zio_prop_t io_prop; ++ zio_type_t io_type; ++ enum zio_child io_child_type; ++ int io_cmd; ++ uint8_t io_priority; ++ uint8_t io_reexecute; ++ uint8_t io_state[ZIO_WAIT_TYPES]; ++ uint64_t io_txg; ++ spa_t *io_spa; ++ blkptr_t *io_bp; ++ blkptr_t *io_bp_override; ++ blkptr_t io_bp_copy; ++ list_t io_parent_list; ++ list_t io_child_list; ++ zio_link_t *io_walk_link; ++ zio_t *io_logical; ++ zio_transform_t *io_transform_stack; ++ ++ /* Callback info */ ++ zio_done_func_t *io_ready; ++ zio_done_func_t *io_done; ++ void *io_private; ++ int64_t io_prev_space_delta; /* DMU private */ ++ blkptr_t io_bp_orig; ++ ++ /* Data represented by this I/O */ ++ void *io_data; ++ void *io_orig_data; ++ uint64_t io_size; ++ uint64_t io_orig_size; ++ ++ /* Stuff for the vdev stack */ ++ vdev_t *io_vd; ++ void *io_vsd; ++ const zio_vsd_ops_t *io_vsd_ops; ++ ++ uint64_t io_offset; ++ uint64_t io_deadline; ++ avl_node_t io_offset_node; ++ avl_node_t io_deadline_node; ++ avl_tree_t *io_vdev_tree; ++ ++ /* Internal pipeline state */ ++ enum zio_flag io_flags; ++ enum zio_stage io_stage; ++ enum zio_stage io_pipeline; ++ enum zio_flag io_orig_flags; ++ enum zio_stage io_orig_stage; ++ enum zio_stage io_orig_pipeline; ++ uint64_t io_delay; ++ int io_error; ++ int io_child_error[ZIO_CHILD_TYPES]; ++ uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; ++ uint64_t io_child_count; ++ uint64_t io_parent_count; ++ uint64_t *io_stall; ++ zio_t *io_gang_leader; ++ zio_gang_node_t *io_gang_tree; ++ void *io_executor; ++ void *io_waiter; ++ kmutex_t io_lock; ++ kcondvar_t io_cv; ++ ++ /* FMA state */ ++ zio_cksum_report_t *io_cksum_report; ++ uint64_t io_ena; ++ ++ /* Taskq dispatching state */ ++ taskq_ent_t io_tqent; ++}; ++ ++extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, ++ zio_done_func_t *done, void *private, enum zio_flag flags); ++ ++extern zio_t *zio_root(spa_t *spa, ++ zio_done_func_t *done, void *private, enum zio_flag flags); ++ ++extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, ++ uint64_t size, zio_done_func_t *done, void *private, ++ int priority, enum zio_flag flags, const zbookmark_t *zb); ++ ++extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ++ void *data, uint64_t size, const zio_prop_t *zp, ++ zio_done_func_t *ready, zio_done_func_t *done, void *private, ++ int priority, enum zio_flag flags, const zbookmark_t *zb); ++ ++extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ++ void *data, uint64_t size, zio_done_func_t *done, void *private, ++ int priority, enum zio_flag flags, zbookmark_t *zb); ++ ++extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies); ++ ++extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); ++ ++extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, ++ const blkptr_t *bp, ++ zio_done_func_t *done, void *private, enum zio_flag flags); ++ ++extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, ++ zio_done_func_t *done, void *private, int priority, enum zio_flag flags); ++ ++extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, ++ uint64_t size, void *data, int checksum, ++ zio_done_func_t *done, void *private, int 
priority, enum zio_flag flags, ++ boolean_t labels); ++ ++extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, ++ uint64_t size, void *data, int checksum, ++ zio_done_func_t *done, void *private, int priority, enum zio_flag flags, ++ boolean_t labels); ++ ++extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, ++ const blkptr_t *bp, enum zio_flag flags); ++ ++extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, ++ uint64_t size, boolean_t use_slog); ++extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); ++extern void zio_flush(zio_t *zio, vdev_t *vd); ++extern void zio_shrink(zio_t *zio, uint64_t size); ++ ++extern int zio_wait(zio_t *zio); ++extern void zio_nowait(zio_t *zio); ++extern void zio_execute(zio_t *zio); ++extern void zio_interrupt(zio_t *zio); ++ ++extern zio_t *zio_walk_parents(zio_t *cio); ++extern zio_t *zio_walk_children(zio_t *pio); ++extern zio_t *zio_unique_parent(zio_t *cio); ++extern void zio_add_child(zio_t *pio, zio_t *cio); ++ ++extern void *zio_buf_alloc(size_t size); ++extern void zio_buf_free(void *buf, size_t size); ++extern void *zio_data_buf_alloc(size_t size); ++extern void zio_data_buf_free(void *buf, size_t size); ++extern void *zio_vdev_alloc(void); ++extern void zio_vdev_free(void *buf); ++ ++extern void zio_resubmit_stage_async(void *); ++ ++extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, ++ uint64_t offset, void *data, uint64_t size, int type, int priority, ++ enum zio_flag flags, zio_done_func_t *done, void *private); ++ ++extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, ++ void *data, uint64_t size, int type, int priority, ++ enum zio_flag flags, zio_done_func_t *done, void *private); ++ ++extern void zio_vdev_io_bypass(zio_t *zio); ++extern void zio_vdev_io_reissue(zio_t *zio); ++extern void zio_vdev_io_redone(zio_t *zio); ++ ++extern void zio_checksum_verified(zio_t *zio); ++extern int zio_worst_error(int e1, int e2); ++ ++extern enum zio_checksum zio_checksum_select(enum zio_checksum child, ++ enum zio_checksum parent); ++extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, ++ enum zio_checksum child, enum zio_checksum parent); ++extern enum zio_compress zio_compress_select(enum zio_compress child, ++ enum zio_compress parent); ++ ++extern void zio_suspend(spa_t *spa, zio_t *zio); ++extern int zio_resume(spa_t *spa); ++extern void zio_resume_wait(spa_t *spa); ++ ++/* ++ * Initial setup and teardown. 
++ */ ++extern void zio_init(void); ++extern void zio_fini(void); ++ ++/* ++ * Fault injection ++ */ ++struct zinject_record; ++extern uint32_t zio_injection_enabled; ++extern int zio_inject_fault(char *name, int flags, int *id, ++ struct zinject_record *record); ++extern int zio_inject_list_next(int *id, char *name, size_t buflen, ++ struct zinject_record *record); ++extern int zio_clear_fault(int id); ++extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); ++extern int zio_handle_fault_injection(zio_t *zio, int error); ++extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); ++extern int zio_handle_label_injection(zio_t *zio, int error); ++extern void zio_handle_ignored_writes(zio_t *zio); ++ ++/* ++ * Checksum ereport functions ++ */ ++extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio, ++ uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info); ++extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, ++ const void *good_data, const void *bad_data, boolean_t drop_if_identical); ++ ++extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report); ++extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); ++ ++/* If we have the good data in hand, this function can be used */ ++extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, ++ struct zio *zio, uint64_t offset, uint64_t length, ++ const void *good_data, const void *bad_data, struct zio_bad_cksum *info); ++ ++/* Called from spa_sync(), but primarily an injection handler */ ++extern void spa_handle_ignored_writes(spa_t *spa); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _ZIO_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zio_impl.h linux-3.2.33-go/include/zfs/sys/zio_impl.h +--- linux-3.2.33-go.orig/include/zfs/sys/zio_impl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zio_impl.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,175 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */ ++ ++#ifndef _ZIO_IMPL_H ++#define _ZIO_IMPL_H ++ ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * zio pipeline stage definitions ++ */ ++enum zio_stage { ++ ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ ++ ++ ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ ++ ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */ ++ ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */ ++ ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */ ++ ++ ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */ ++ ++ ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */ ++ ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */ ++ ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */ ++ ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */ ++ ++ ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */ ++ ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */ ++ ++ ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */ ++ ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */ ++ ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */ ++ ++ ZIO_STAGE_READY = 1 << 15, /* RWFCI */ ++ ++ ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */ ++ ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */ ++ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */ ++ ++ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */ ++ ++ ZIO_STAGE_DONE = 1 << 20 /* RWFCI */ ++}; ++ ++#define ZIO_INTERLOCK_STAGES \ ++ (ZIO_STAGE_READY | \ ++ ZIO_STAGE_DONE) ++ ++#define ZIO_INTERLOCK_PIPELINE \ ++ ZIO_INTERLOCK_STAGES ++ ++#define ZIO_VDEV_IO_STAGES \ ++ (ZIO_STAGE_VDEV_IO_START | \ ++ ZIO_STAGE_VDEV_IO_DONE | \ ++ ZIO_STAGE_VDEV_IO_ASSESS) ++ ++#define ZIO_VDEV_CHILD_PIPELINE \ ++ (ZIO_VDEV_IO_STAGES | \ ++ ZIO_STAGE_DONE) ++ ++#define ZIO_READ_COMMON_STAGES \ ++ (ZIO_INTERLOCK_STAGES | \ ++ ZIO_VDEV_IO_STAGES | \ ++ ZIO_STAGE_CHECKSUM_VERIFY) ++ ++#define ZIO_READ_PHYS_PIPELINE \ ++ ZIO_READ_COMMON_STAGES ++ ++#define ZIO_READ_PIPELINE \ ++ (ZIO_READ_COMMON_STAGES | \ ++ ZIO_STAGE_READ_BP_INIT) ++ ++#define ZIO_DDT_CHILD_READ_PIPELINE \ ++ ZIO_READ_COMMON_STAGES ++ ++#define ZIO_DDT_READ_PIPELINE \ ++ (ZIO_INTERLOCK_STAGES | \ ++ ZIO_STAGE_READ_BP_INIT | \ ++ ZIO_STAGE_DDT_READ_START | \ ++ ZIO_STAGE_DDT_READ_DONE) ++ ++#define ZIO_WRITE_COMMON_STAGES \ ++ (ZIO_INTERLOCK_STAGES | \ ++ ZIO_VDEV_IO_STAGES | \ ++ ZIO_STAGE_ISSUE_ASYNC | \ ++ ZIO_STAGE_CHECKSUM_GENERATE) ++ ++#define ZIO_WRITE_PHYS_PIPELINE \ ++ ZIO_WRITE_COMMON_STAGES ++ ++#define ZIO_REWRITE_PIPELINE \ ++ (ZIO_WRITE_COMMON_STAGES | \ ++ ZIO_STAGE_WRITE_BP_INIT) ++ ++#define ZIO_WRITE_PIPELINE \ ++ (ZIO_WRITE_COMMON_STAGES | \ ++ ZIO_STAGE_WRITE_BP_INIT | \ ++ ZIO_STAGE_DVA_ALLOCATE) ++ ++#define ZIO_DDT_CHILD_WRITE_PIPELINE \ ++ (ZIO_INTERLOCK_STAGES | \ ++ ZIO_VDEV_IO_STAGES | \ ++ ZIO_STAGE_DVA_ALLOCATE) ++ ++#define ZIO_DDT_WRITE_PIPELINE \ ++ (ZIO_INTERLOCK_STAGES | \ ++ ZIO_STAGE_ISSUE_ASYNC | \ ++ ZIO_STAGE_WRITE_BP_INIT | \ ++ ZIO_STAGE_CHECKSUM_GENERATE | \ ++ ZIO_STAGE_DDT_WRITE) ++ ++#define ZIO_GANG_STAGES \ ++ (ZIO_STAGE_GANG_ASSEMBLE | \ ++ ZIO_STAGE_GANG_ISSUE) ++ ++#define ZIO_FREE_PIPELINE \ ++ (ZIO_INTERLOCK_STAGES | \ ++ ZIO_STAGE_FREE_BP_INIT | \ ++ ZIO_STAGE_DVA_FREE) ++ ++#define ZIO_DDT_FREE_PIPELINE \ ++ (ZIO_INTERLOCK_STAGES | \ ++ ZIO_STAGE_FREE_BP_INIT | \ ++ ZIO_STAGE_ISSUE_ASYNC | \ ++ ZIO_STAGE_DDT_FREE) ++ ++#define ZIO_CLAIM_PIPELINE \ ++ (ZIO_INTERLOCK_STAGES | \ ++ ZIO_STAGE_DVA_CLAIM) ++ ++#define ZIO_IOCTL_PIPELINE \ ++ (ZIO_INTERLOCK_STAGES | \ ++ ZIO_STAGE_VDEV_IO_START | \ ++ ZIO_STAGE_VDEV_IO_ASSESS) ++ ++#define ZIO_BLOCKING_STAGES \ ++ (ZIO_STAGE_DVA_ALLOCATE | \ ++ ZIO_STAGE_DVA_CLAIM | \ ++ ZIO_STAGE_VDEV_IO_START) ++ ++extern void zio_inject_init(void); 
++extern void zio_inject_fini(void); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _ZIO_IMPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zpl.h linux-3.2.33-go/include/zfs/sys/zpl.h +--- linux-3.2.33-go.orig/include/zfs/sys/zpl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zpl.h 2012-11-16 23:25:34.337039473 +0100 +@@ -0,0 +1,90 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2011, Lawrence Livermore National Security, LLC. ++ */ ++ ++#ifndef _SYS_ZPL_H ++#define _SYS_ZPL_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* zpl_inode.c */ ++extern void zpl_vap_init(vattr_t *vap, struct inode *dir, ++ struct dentry *dentry, zpl_umode_t mode, cred_t *cr); ++ ++extern const struct inode_operations zpl_inode_operations; ++extern const struct inode_operations zpl_dir_inode_operations; ++extern const struct inode_operations zpl_symlink_inode_operations; ++extern const struct inode_operations zpl_special_inode_operations; ++ ++/* zpl_file.c */ ++extern ssize_t zpl_read_common(struct inode *ip, const char *buf, ++ size_t len, loff_t pos, uio_seg_t segment, int flags, cred_t *cr); ++extern ssize_t zpl_write_common(struct inode *ip, const char *buf, ++ size_t len, loff_t pos, uio_seg_t segment, int flags, cred_t *cr); ++extern long zpl_fallocate_common(struct inode *ip, int mode, ++ loff_t offset, loff_t len); ++ ++extern const struct address_space_operations zpl_address_space_operations; ++extern const struct file_operations zpl_file_operations; ++extern const struct file_operations zpl_dir_file_operations; ++ ++/* zpl_super.c */ ++extern void zpl_prune_sbs(int64_t bytes_to_scan, void *private); ++ ++typedef struct zpl_mount_data { ++ const char *z_osname; /* Dataset name */ ++ void *z_data; /* Mount options string */ ++} zpl_mount_data_t; ++ ++extern const struct super_operations zpl_super_operations; ++extern const struct export_operations zpl_export_operations; ++extern struct file_system_type zpl_fs_type; ++ ++/* zpl_xattr.c */ ++extern ssize_t zpl_xattr_list(struct dentry *dentry, char *buf, size_t size); ++extern int zpl_xattr_security_init(struct inode *ip, struct inode *dip, ++ const struct qstr *qstr); ++ ++extern xattr_handler_t *zpl_xattr_handlers[]; ++ ++/* zpl_ctldir.c */ ++extern const struct file_operations zpl_fops_root; ++extern const struct inode_operations zpl_ops_root; ++ ++extern const struct file_operations zpl_fops_snapdir; ++extern const struct inode_operations zpl_ops_snapdir; ++#ifdef HAVE_AUTOMOUNT ++extern const struct dentry_operations zpl_dops_snapdirs; ++#else ++extern const struct inode_operations zpl_ops_snapdirs; ++#endif /* HAVE_AUTOMOUNT */ ++ ++extern const struct 
file_operations zpl_fops_shares; ++extern const struct inode_operations zpl_ops_shares; ++ ++#endif /* _SYS_ZPL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zrlock.h linux-3.2.33-go/include/zfs/sys/zrlock.h +--- linux-3.2.33-go.orig/include/zfs/sys/zrlock.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zrlock.h 2012-11-16 23:25:34.342039415 +0100 +@@ -0,0 +1,66 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _SYS_ZRLOCK_H ++#define _SYS_ZRLOCK_H ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++typedef struct zrlock { ++ kmutex_t zr_mtx; ++ volatile int32_t zr_refcount; ++ kcondvar_t zr_cv; ++ uint16_t zr_pad; ++#ifdef ZFS_DEBUG ++ kthread_t *zr_owner; ++ const char *zr_caller; ++#endif ++} zrlock_t; ++ ++extern void zrl_init(zrlock_t *); ++extern void zrl_destroy(zrlock_t *); ++#ifdef ZFS_DEBUG ++#define zrl_add(_z) zrl_add_debug((_z), __func__) ++extern void zrl_add_debug(zrlock_t *, const char *); ++#else ++extern void zrl_add(zrlock_t *); ++#endif ++extern void zrl_remove(zrlock_t *); ++extern int zrl_tryenter(zrlock_t *); ++extern void zrl_exit(zrlock_t *); ++extern int zrl_is_zero(zrlock_t *); ++extern int zrl_is_locked(zrlock_t *); ++#ifdef ZFS_DEBUG ++extern kthread_t *zrl_owner(zrlock_t *); ++#endif ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _SYS_ZRLOCK_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/sys/zvol.h linux-3.2.33-go/include/zfs/sys/zvol.h +--- linux-3.2.33-go.orig/include/zfs/sys/zvol.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/sys/zvol.h 2012-11-16 23:25:34.339039449 +0100 +@@ -0,0 +1,51 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++ ++/* ++ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 
++ */ ++ ++#ifndef _SYS_ZVOL_H ++#define _SYS_ZVOL_H ++ ++#include ++ ++#define ZVOL_OBJ 1ULL ++#define ZVOL_ZAP_OBJ 2ULL ++ ++#ifdef _KERNEL ++ ++extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); ++extern int zvol_check_volblocksize(uint64_t volblocksize); ++extern int zvol_get_stats(objset_t *os, nvlist_t *nv); ++extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); ++extern int zvol_create_minor(const char *); ++extern int zvol_create_minors(const char *); ++extern int zvol_remove_minor(const char *); ++extern void zvol_remove_minors(const char *); ++extern int zvol_set_volsize(const char *, uint64_t); ++extern int zvol_set_volblocksize(const char *, uint64_t); ++ ++extern int zvol_init(void); ++extern void zvol_fini(void); ++ ++#endif /* _KERNEL */ ++#endif /* _SYS_ZVOL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/zfs_comutil.h linux-3.2.33-go/include/zfs/zfs_comutil.h +--- linux-3.2.33-go.orig/include/zfs/zfs_comutil.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/zfs_comutil.h 2012-11-16 23:25:34.344039393 +0100 +@@ -0,0 +1,46 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. ++ */ ++ ++#ifndef _ZFS_COMUTIL_H ++#define _ZFS_COMUTIL_H ++ ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++extern boolean_t zfs_allocatable_devs(nvlist_t *); ++extern void zpool_get_rewind_policy(nvlist_t *, zpool_rewind_policy_t *); ++ ++extern int zfs_zpl_version_map(int spa_version); ++extern int zfs_spa_version_map(int zpl_version); ++extern const char *zfs_history_event_names[LOG_END]; ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _ZFS_COMUTIL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/zfs_deleg.h linux-3.2.33-go/include/zfs/zfs_deleg.h +--- linux-3.2.33-go.orig/include/zfs/zfs_deleg.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/zfs_deleg.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,87 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Copyright 2010 Nexenta Systems, Inc. All rights reserved. ++ */ ++ ++#ifndef _ZFS_DELEG_H ++#define _ZFS_DELEG_H ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#define ZFS_DELEG_SET_NAME_CHR '@' /* set name lead char */ ++#define ZFS_DELEG_FIELD_SEP_CHR '$' /* field separator */ ++ ++/* ++ * Max name length for a delegation attribute ++ */ ++#define ZFS_MAX_DELEG_NAME 128 ++ ++#define ZFS_DELEG_LOCAL 'l' ++#define ZFS_DELEG_DESCENDENT 'd' ++#define ZFS_DELEG_NA '-' ++ ++typedef enum { ++ ZFS_DELEG_NOTE_CREATE, ++ ZFS_DELEG_NOTE_DESTROY, ++ ZFS_DELEG_NOTE_SNAPSHOT, ++ ZFS_DELEG_NOTE_ROLLBACK, ++ ZFS_DELEG_NOTE_CLONE, ++ ZFS_DELEG_NOTE_PROMOTE, ++ ZFS_DELEG_NOTE_RENAME, ++ ZFS_DELEG_NOTE_SEND, ++ ZFS_DELEG_NOTE_RECEIVE, ++ ZFS_DELEG_NOTE_ALLOW, ++ ZFS_DELEG_NOTE_USERPROP, ++ ZFS_DELEG_NOTE_MOUNT, ++ ZFS_DELEG_NOTE_SHARE, ++ ZFS_DELEG_NOTE_USERQUOTA, ++ ZFS_DELEG_NOTE_GROUPQUOTA, ++ ZFS_DELEG_NOTE_USERUSED, ++ ZFS_DELEG_NOTE_GROUPUSED, ++ ZFS_DELEG_NOTE_HOLD, ++ ZFS_DELEG_NOTE_RELEASE, ++ ZFS_DELEG_NOTE_DIFF, ++ ZFS_DELEG_NOTE_NONE ++} zfs_deleg_note_t; ++ ++typedef struct zfs_deleg_perm_tab { ++ char *z_perm; ++ zfs_deleg_note_t z_note; ++} zfs_deleg_perm_tab_t; ++ ++extern zfs_deleg_perm_tab_t zfs_deleg_perm_tab[]; ++ ++int zfs_deleg_verify_nvlist(nvlist_t *nvlist); ++void zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type, ++ char checkflag, void *data); ++const char *zfs_deleg_canonicalize_perm(const char *perm); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _ZFS_DELEG_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/zfs_fletcher.h linux-3.2.33-go/include/zfs/zfs_fletcher.h +--- linux-3.2.33-go.orig/include/zfs/zfs_fletcher.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/zfs_fletcher.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,53 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. 
++ */ ++ ++#ifndef _ZFS_FLETCHER_H ++#define _ZFS_FLETCHER_H ++ ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * fletcher checksum functions ++ */ ++ ++void fletcher_2_native(const void *, uint64_t, zio_cksum_t *); ++void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *); ++void fletcher_4_native(const void *, uint64_t, zio_cksum_t *); ++void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *); ++void fletcher_4_incremental_native(const void *, uint64_t, ++ zio_cksum_t *); ++void fletcher_4_incremental_byteswap(const void *, uint64_t, ++ zio_cksum_t *); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _ZFS_FLETCHER_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/zfs_namecheck.h linux-3.2.33-go/include/zfs/zfs_namecheck.h +--- linux-3.2.33-go.orig/include/zfs/zfs_namecheck.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/zfs_namecheck.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,58 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). ++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _ZFS_NAMECHECK_H ++#define _ZFS_NAMECHECK_H ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++typedef enum { ++ NAME_ERR_LEADING_SLASH, /* name begins with leading slash */ ++ NAME_ERR_EMPTY_COMPONENT, /* name contains an empty component */ ++ NAME_ERR_TRAILING_SLASH, /* name ends with a slash */ ++ NAME_ERR_INVALCHAR, /* invalid character found */ ++ NAME_ERR_MULTIPLE_AT, /* multiple '@' characters found */ ++ NAME_ERR_NOLETTER, /* pool doesn't begin with a letter */ ++ NAME_ERR_RESERVED, /* entire name is reserved */ ++ NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */ ++ NAME_ERR_TOOLONG, /* name is too long */ ++ NAME_ERR_NO_AT, /* permission set is missing '@' */ ++} namecheck_err_t; ++ ++#define ZFS_PERMSET_MAXLEN 64 ++ ++int pool_namecheck(const char *, namecheck_err_t *, char *); ++int dataset_namecheck(const char *, namecheck_err_t *, char *); ++int mountpoint_namecheck(const char *, namecheck_err_t *); ++int snapshot_namecheck(const char *, namecheck_err_t *, char *); ++int permset_namecheck(const char *, namecheck_err_t *, char *); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _ZFS_NAMECHECK_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/zfs_prop.h linux-3.2.33-go/include/zfs/zfs_prop.h +--- linux-3.2.33-go.orig/include/zfs/zfs_prop.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/zfs_prop.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,129 @@ ++/* ++ * CDDL HEADER START ++ * ++ * The contents of this file are subject to the terms of the ++ * Common Development and Distribution License (the "License"). 
++ * You may not use this file except in compliance with the License. ++ * ++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE ++ * or http://www.opensolaris.org/os/licensing. ++ * See the License for the specific language governing permissions ++ * and limitations under the License. ++ * ++ * When distributing Covered Code, include this CDDL HEADER in each ++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. ++ * If applicable, add the following below this CDDL HEADER, with the ++ * fields enclosed by brackets "[]" replaced with your own identifying ++ * information: Portions Copyright [yyyy] [name of copyright owner] ++ * ++ * CDDL HEADER END ++ */ ++/* ++ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. ++ * Use is subject to license terms. ++ */ ++ ++#ifndef _ZFS_PROP_H ++#define _ZFS_PROP_H ++ ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++ * For index types (e.g. compression and checksum), we want the numeric value ++ * in the kernel, but the string value in userland. ++ */ ++typedef enum { ++ PROP_TYPE_NUMBER, /* numeric value */ ++ PROP_TYPE_STRING, /* string value */ ++ PROP_TYPE_INDEX /* numeric value indexed by string */ ++} zprop_type_t; ++ ++typedef enum { ++ PROP_DEFAULT, ++ PROP_READONLY, ++ PROP_INHERIT, ++ /* ++ * ONETIME properties are a sort of conglomeration of READONLY ++ * and INHERIT. They can be set only during object creation, ++ * after that they are READONLY. If not explicitly set during ++ * creation, they can be inherited. ++ */ ++ PROP_ONETIME ++} zprop_attr_t; ++ ++typedef struct zfs_index { ++ const char *pi_name; ++ uint64_t pi_value; ++} zprop_index_t; ++ ++typedef struct { ++ const char *pd_name; /* human-readable property name */ ++ int pd_propnum; /* property number */ ++ zprop_type_t pd_proptype; /* string, boolean, index, number */ ++ const char *pd_strdefault; /* default for strings */ ++ uint64_t pd_numdefault; /* for boolean / index / number */ ++ zprop_attr_t pd_attr; /* default, readonly, inherit */ ++ int pd_types; /* bitfield of valid dataset types */ ++ /* fs | vol | snap; or pool */ ++ const char *pd_values; /* string telling acceptable values */ ++ const char *pd_colname; /* column header for "zfs list" */ ++ boolean_t pd_rightalign; /* column alignment for "zfs list" */ ++ boolean_t pd_visible; /* do we list this property with the */ ++ /* "zfs get" help message */ ++ const zprop_index_t *pd_table; /* for index properties, a table */ ++ /* defining the possible values */ ++ size_t pd_table_size; /* number of entries in pd_table[] */ ++} zprop_desc_t; ++ ++/* ++ * zfs dataset property functions ++ */ ++void zfs_prop_init(void); ++zprop_type_t zfs_prop_get_type(zfs_prop_t); ++boolean_t zfs_prop_delegatable(zfs_prop_t prop); ++zprop_desc_t *zfs_prop_get_table(void); ++ ++/* ++ * zpool property functions ++ */ ++void zpool_prop_init(void); ++zprop_type_t zpool_prop_get_type(zpool_prop_t); ++zprop_desc_t *zpool_prop_get_table(void); ++ ++/* ++ * Common routines to initialize property tables ++ */ ++void zprop_register_impl(int, const char *, zprop_type_t, uint64_t, ++ const char *, zprop_attr_t, int, const char *, const char *, ++ boolean_t, boolean_t, const zprop_index_t *); ++void zprop_register_string(int, const char *, const char *, ++ zprop_attr_t attr, int, const char *, const char *); ++void zprop_register_number(int, const char *, uint64_t, zprop_attr_t, int, ++ const char *, const char *); ++void zprop_register_index(int, const char *, uint64_t, 
zprop_attr_t, int, ++ const char *, const char *, const zprop_index_t *); ++void zprop_register_hidden(int, const char *, zprop_type_t, zprop_attr_t, ++ int, const char *); ++ ++/* ++ * Common routines for zfs and zpool property management ++ */ ++int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t); ++int zprop_name_to_prop(const char *, zfs_type_t); ++int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t); ++int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t); ++uint64_t zprop_random_value(int, uint64_t, zfs_type_t); ++const char *zprop_values(int, zfs_type_t); ++size_t zprop_width(int, boolean_t *, zfs_type_t); ++boolean_t zprop_valid_for_type(int, zfs_type_t); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* _ZFS_PROP_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/zpios-ctl.h linux-3.2.33-go/include/zfs/zpios-ctl.h +--- linux-3.2.33-go.orig/include/zfs/zpios-ctl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/zpios-ctl.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,198 @@ ++/*****************************************************************************\ ++ * ZPIOS is a heavily modified version of the original PIOS test code. ++ * It is designed to have the test code running in the Linux kernel ++ * against ZFS while still being flexibly controled from user space. ++ * ++ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * LLNL-CODE-403049 ++ * ++ * Original PIOS Test Code ++ * Copyright (C) 2004 Cluster File Systems, Inc. ++ * Written by Peter Braam ++ * Atul Vidwansa ++ * Milind Dumbare ++ * ++ * This file is part of ZFS on Linux. ++ * For details, see . ++ * ++ * ZPIOS is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * ZPIOS is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with ZPIOS. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _ZPIOS_CTL_H ++#define _ZPIOS_CTL_H ++ ++/* Contains shared definitions which both the userspace ++ * and kernelspace portions of zpios must agree on. 
++ */ ++#ifndef _KERNEL ++#include ++#endif ++ ++#define ZPIOS_MAJOR 232 /* XXX - Arbitrary */ ++#define ZPIOS_MINORS 1 ++#define ZPIOS_NAME "zpios" ++#define ZPIOS_DEV "/dev/zpios" ++ ++#define DMU_IO 0x01 ++ ++#define DMU_WRITE 0x0001 ++#define DMU_READ 0x0002 ++#define DMU_VERIFY 0x0004 ++#define DMU_REMOVE 0x0008 ++#define DMU_FPP 0x0010 ++#define DMU_WRITE_ZC 0x0020 /* Incompatible w/DMU_VERIFY */ ++#define DMU_READ_ZC 0x0040 /* Incompatible w/DMU_VERIFY */ ++#define DMU_WRITE_NOWAIT 0x0080 ++#define DMU_READ_NOPF 0x0100 ++ ++#define ZPIOS_NAME_SIZE 16 ++#define ZPIOS_PATH_SIZE 128 ++ ++#define PHASE_PRE_RUN "pre-run" ++#define PHASE_PRE_CREATE "pre-create" ++#define PHASE_PRE_WRITE "pre-write" ++#define PHASE_PRE_READ "pre-read" ++#define PHASE_PRE_REMOVE "pre-remove" ++#define PHASE_POST_RUN "post-run" ++#define PHASE_POST_CREATE "post-create" ++#define PHASE_POST_WRITE "post-write" ++#define PHASE_POST_READ "post-read" ++#define PHASE_POST_REMOVE "post-remove" ++ ++#define ZPIOS_CFG_MAGIC 0x87237190U ++typedef struct zpios_cfg { ++ uint32_t cfg_magic; /* Unique magic */ ++ int32_t cfg_cmd; /* Config command */ ++ int32_t cfg_arg1; /* Config command arg 1 */ ++ int32_t cfg_rc1; /* Config response 1 */ ++} zpios_cfg_t; ++ ++typedef struct zpios_timespec { ++ uint32_t ts_sec; ++ uint32_t ts_nsec; ++} zpios_timespec_t; ++ ++typedef struct zpios_time { ++ zpios_timespec_t start; ++ zpios_timespec_t stop; ++ zpios_timespec_t delta; ++} zpios_time_t; ++ ++typedef struct zpios_stats { ++ zpios_time_t total_time; ++ zpios_time_t cr_time; ++ zpios_time_t rm_time; ++ zpios_time_t wr_time; ++ zpios_time_t rd_time; ++ uint64_t wr_data; ++ uint64_t wr_chunks; ++ uint64_t rd_data; ++ uint64_t rd_chunks; ++} zpios_stats_t; ++ ++#define ZPIOS_CMD_MAGIC 0x49715385U ++typedef struct zpios_cmd { ++ uint32_t cmd_magic; /* Unique magic */ ++ uint32_t cmd_id; /* Run ID */ ++ char cmd_pool[ZPIOS_NAME_SIZE]; /* Pool name */ ++ uint64_t cmd_chunk_size; /* Chunk size */ ++ uint32_t cmd_thread_count; /* Thread count */ ++ uint32_t cmd_region_count; /* Region count */ ++ uint64_t cmd_region_size; /* Region size */ ++ uint64_t cmd_offset; /* Region offset */ ++ uint32_t cmd_region_noise; /* Region noise */ ++ uint32_t cmd_chunk_noise; /* Chunk noise */ ++ uint32_t cmd_thread_delay; /* Thread delay */ ++ uint32_t cmd_flags; /* Test flags */ ++ char cmd_pre[ZPIOS_PATH_SIZE]; /* Pre-exec hook */ ++ char cmd_post[ZPIOS_PATH_SIZE]; /* Post-exec hook */ ++ char cmd_log[ZPIOS_PATH_SIZE]; /* Requested log dir */ ++ uint64_t cmd_data_size; /* Opaque data size */ ++ char cmd_data_str[0]; /* Opaque data region */ ++} zpios_cmd_t; ++ ++/* Valid ioctls */ ++#define ZPIOS_CFG _IOWR('f', 101, zpios_cfg_t) ++#define ZPIOS_CMD _IOWR('f', 102, zpios_cmd_t) ++ ++/* Valid configuration commands */ ++#define ZPIOS_CFG_BUFFER_CLEAR 0x001 /* Clear text buffer */ ++#define ZPIOS_CFG_BUFFER_SIZE 0x002 /* Resize text buffer */ ++ ++#ifndef NSEC_PER_SEC ++#define NSEC_PER_SEC 1000000000L ++#endif ++ ++static inline ++void zpios_timespec_normalize(zpios_timespec_t *ts, uint32_t sec, uint32_t nsec) ++{ ++ while (nsec >= NSEC_PER_SEC) { ++ nsec -= NSEC_PER_SEC; ++ sec++; ++ } ++ while (nsec < 0) { ++ nsec += NSEC_PER_SEC; ++ sec--; ++ } ++ ts->ts_sec = sec; ++ ts->ts_nsec = nsec; ++} ++ ++static inline ++zpios_timespec_t zpios_timespec_add(zpios_timespec_t lhs, zpios_timespec_t rhs) ++{ ++ zpios_timespec_t ts_delta; ++ zpios_timespec_normalize(&ts_delta, lhs.ts_sec + rhs.ts_sec, ++ lhs.ts_nsec + rhs.ts_nsec); ++ return ts_delta; ++} ++ 
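The zpios_timespec helpers above carry whole seconds out of the nanosecond field so that per-phase timings can be summed safely. Below is a minimal userspace sketch of that behaviour; it is not part of the patch, and ts_add() plus the restated struct are illustrative stand-ins for the real zpios_timespec_add()/zpios_timespec_normalize() pair.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

typedef struct {
	uint32_t ts_sec;
	uint32_t ts_nsec;
} zpios_timespec_t;

/* Same idea as zpios_timespec_add() + zpios_timespec_normalize():
 * add both fields, then carry whole seconds out of ts_nsec. */
static zpios_timespec_t ts_add(zpios_timespec_t a, zpios_timespec_t b)
{
	uint64_t nsec = (uint64_t)a.ts_nsec + b.ts_nsec;
	zpios_timespec_t r = { a.ts_sec + b.ts_sec, 0 };

	while (nsec >= NSEC_PER_SEC) {
		nsec -= NSEC_PER_SEC;
		r.ts_sec++;
	}
	r.ts_nsec = (uint32_t)nsec;
	return r;
}

int main(void)
{
	zpios_timespec_t wr = { 1, 700000000 };	/* 1.7 s write phase */
	zpios_timespec_t rd = { 2, 600000000 };	/* 2.6 s read phase  */
	zpios_timespec_t total = ts_add(wr, rd);

	/* Prints "4.300000000": the extra second was carried over. */
	printf("%u.%09u\n", total.ts_sec, total.ts_nsec);
	return 0;
}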
++static inline ++zpios_timespec_t zpios_timespec_sub(zpios_timespec_t lhs, zpios_timespec_t rhs) ++{ ++ zpios_timespec_t ts_delta; ++ zpios_timespec_normalize(&ts_delta, lhs.ts_sec - rhs.ts_sec, ++ lhs.ts_nsec - rhs.ts_nsec); ++ return ts_delta; ++} ++ ++#ifdef _KERNEL ++ ++static inline ++zpios_timespec_t zpios_timespec_now(void) ++{ ++ zpios_timespec_t zts_now; ++ struct timespec ts_now; ++ ++ ts_now = current_kernel_time(); ++ zts_now.ts_sec = ts_now.tv_sec; ++ zts_now.ts_nsec = ts_now.tv_nsec; ++ ++ return zts_now; ++} ++ ++#else ++ ++static inline ++double zpios_timespec_to_double(zpios_timespec_t ts) ++{ ++ return ((double)(ts.ts_sec) + ++ ((double)(ts.ts_nsec) / (double)(NSEC_PER_SEC))); ++} ++ ++#endif /* _KERNEL */ ++ ++#endif /* _ZPIOS_CTL_H */ +diff -uNr linux-3.2.33-go.orig/include/zfs/zpios-internal.h linux-3.2.33-go/include/zfs/zpios-internal.h +--- linux-3.2.33-go.orig/include/zfs/zpios-internal.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/include/zfs/zpios-internal.h 2012-11-16 23:25:34.336039485 +0100 +@@ -0,0 +1,138 @@ ++/*****************************************************************************\ ++ * ZPIOS is a heavily modified version of the original PIOS test code. ++ * It is designed to have the test code running in the Linux kernel ++ * against ZFS while still being flexibly controled from user space. ++ * ++ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * LLNL-CODE-403049 ++ * ++ * Original PIOS Test Code ++ * Copyright (C) 2004 Cluster File Systems, Inc. ++ * Written by Peter Braam ++ * Atul Vidwansa ++ * Milind Dumbare ++ * ++ * This file is part of ZFS on Linux. ++ * For details, see . ++ * ++ * ZPIOS is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * ZPIOS is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with ZPIOS. If not, see . 
++\*****************************************************************************/ ++ ++#ifndef _ZPIOS_INTERNAL_H ++#define _ZPIOS_INTERNAL_H ++ ++#include "zpios-ctl.h" ++ ++#define OBJ_SIZE 64 ++ ++struct run_args; ++ ++typedef struct dmu_obj { ++ objset_t *os; ++ uint64_t obj; ++} dmu_obj_t; ++ ++/* thread doing the IO data */ ++typedef struct thread_data { ++ struct run_args *run_args; ++ int thread_no; ++ int rc; ++ zpios_stats_t stats; ++ kmutex_t lock; ++} thread_data_t; ++ ++/* region for IO data */ ++typedef struct zpios_region { ++ __u64 wr_offset; ++ __u64 rd_offset; ++ __u64 init_offset; ++ __u64 max_offset; ++ dmu_obj_t obj; ++ zpios_stats_t stats; ++ kmutex_t lock; ++} zpios_region_t; ++ ++/* arguments for one run */ ++typedef struct run_args { ++ /* Config args */ ++ int id; ++ char pool[ZPIOS_NAME_SIZE]; ++ __u64 chunk_size; ++ __u32 thread_count; ++ __u32 region_count; ++ __u64 region_size; ++ __u64 offset; ++ __u32 region_noise; ++ __u32 chunk_noise; ++ __u32 thread_delay; ++ __u32 flags; ++ char pre[ZPIOS_PATH_SIZE]; ++ char post[ZPIOS_PATH_SIZE]; ++ char log[ZPIOS_PATH_SIZE]; ++ ++ /* Control data */ ++ objset_t *os; ++ wait_queue_head_t waitq; ++ volatile uint64_t threads_done; ++ kmutex_t lock_work; ++ kmutex_t lock_ctl; ++ __u32 region_next; ++ ++ /* Results data */ ++ struct file *file; ++ zpios_stats_t stats; ++ ++ thread_data_t **threads; ++ zpios_region_t regions[0]; /* Must be last element */ ++} run_args_t; ++ ++#define ZPIOS_INFO_BUFFER_SIZE 65536 ++#define ZPIOS_INFO_BUFFER_REDZONE 1024 ++ ++typedef struct zpios_info { ++ spinlock_t info_lock; ++ int info_size; ++ char *info_buffer; ++ char *info_head; /* Internal kernel use only */ ++} zpios_info_t; ++ ++#define zpios_print(file, format, args...) \ ++({ zpios_info_t *_info_ = (zpios_info_t *)file->private_data; \ ++ int _rc_; \ ++ \ ++ ASSERT(_info_); \ ++ ASSERT(_info_->info_buffer); \ ++ \ ++ spin_lock(&_info_->info_lock); \ ++ \ ++ /* Don't allow the kernel to start a write in the red zone */ \ ++ if ((int)(_info_->info_head - _info_->info_buffer) > \ ++ (_info_->info_size - ZPIOS_INFO_BUFFER_REDZONE)) { \ ++ _rc_ = -EOVERFLOW; \ ++ } else { \ ++ _rc_ = sprintf(_info_->info_head, format, args); \ ++ if (_rc_ >= 0) \ ++ _info_->info_head += _rc_; \ ++ } \ ++ \ ++ spin_unlock(&_info_->info_lock); \ ++ _rc_; \ ++}) ++ ++#define zpios_vprint(file, test, format, args...) 
\ ++ zpios_print(file, "%*s: " format, ZPIOS_NAME_SIZE, test, args) ++ ++#endif /* _ZPIOS_INTERNAL_H */ +diff -uNr linux-3.2.33-go.orig/Kconfig linux-3.2.33-go/Kconfig +--- linux-3.2.33-go.orig/Kconfig 2012-11-16 23:15:13.037410874 +0100 ++++ linux-3.2.33-go/Kconfig 2012-11-16 23:22:32.423192712 +0100 +@@ -9,3 +9,4 @@ + option env="SRCARCH" + + source "arch/$SRCARCH/Kconfig" ++source "spl/Kconfig" +diff -uNr linux-3.2.33-go.orig/Makefile linux-3.2.33-go/Makefile +--- linux-3.2.33-go.orig/Makefile 2012-11-16 23:15:13.028410977 +0100 ++++ linux-3.2.33-go/Makefile 2012-11-16 23:22:32.430192633 +0100 +@@ -708,7 +708,7 @@ + + + ifeq ($(KBUILD_EXTMOD),) +-core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ ++core-y += kernel/ mm/ spl/ fs/ ipc/ security/ crypto/ block/ + + vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ + $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ +diff -uNr linux-3.2.33-go.orig/spl/Kbuild linux-3.2.33-go/spl/Kbuild +--- linux-3.2.33-go.orig/spl/Kbuild 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/Kbuild 2012-11-16 23:22:32.422192724 +0100 +@@ -0,0 +1,7 @@ ++SPL_MODULE_CFLAGS = -I$(srctree)/include/spl ++SPL_MODULE_CFLAGS += -include $(srctree)/spl_config.h ++export SPL_MODULE_CFLAGS ++ ++obj-$(CONFIG_SPL) := ++obj-$(CONFIG_SPL) += spl/ ++obj-$(CONFIG_SPL) += splat/ +diff -uNr linux-3.2.33-go.orig/spl/Kconfig linux-3.2.33-go/spl/Kconfig +--- linux-3.2.33-go.orig/spl/Kconfig 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/Kconfig 2012-11-16 23:22:32.421192736 +0100 +@@ -0,0 +1,10 @@ ++config SPL ++ tristate "Solaris Porting Layer (SPL)" ++ help ++ This is the SPL library from the ZFS On Linux project. ++ ++ See http://zfsonlinux.org/ ++ ++ To compile this library as a module, choose M here. ++ ++ If unsure, say N. +diff -uNr linux-3.2.33-go.orig/spl/Makefile linux-3.2.33-go/spl/Makefile +--- linux-3.2.33-go.orig/spl/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/Makefile 2012-11-16 23:22:32.411192851 +0100 +@@ -0,0 +1,50 @@ ++subdir-m += spl ++subdir-m += splat ++ ++INSTALL=/usr/bin/install ++ ++SPL_MODULE_CFLAGS = -I/root/spl-0.6.0-rc12/include ++SPL_MODULE_CFLAGS += -include /root/spl-0.6.0-rc12/spl_config.h ++export SPL_MODULE_CFLAGS ++ ++modules: ++ $(MAKE) -C /usr/src/linux-3.6.0-sabayon SUBDIRS=`pwd` O=/usr/src/linux-3.6.0-sabayon CONFIG_SPL=m $@ ++ ++clean: ++ @# Only cleanup the kernel build directories when CONFIG_KERNEL ++ @# is defined. This indicates that kernel modules should be built. 
++# $(MAKE) -C /usr/src/linux-3.6.0-sabayon SUBDIRS=`pwd` O=/usr/src/linux-3.6.0-sabayon $@ ++ ++ if [ -f 'NONE' ]; then $(RM) 'NONE'; fi ++ if [ -f Module.markers ]; then $(RM) Module.markers; fi ++ ++modules_install: ++ @# Install the kernel modules ++ $(MAKE) -C /usr/src/linux-3.6.0-sabayon SUBDIRS=`pwd` \ ++ INSTALL_MOD_PATH=$(DESTDIR) \ ++ INSTALL_MOD_DIR=addon/spl $@ ++ @# Remove extraneous build products when packaging ++ if [ -n "$(DESTDIR)" ]; then \ ++ find $(DESTDIR)/lib/modules/3.6.0-sabayon \ ++ -name 'modules.*' | xargs $(RM); \ ++ fi ++ sysmap=$(DESTDIR)/boot/System.map-3.6.0-sabayon; \ ++ if [ -f $$sysmap ]; then \ ++ depmod -ae -F $$sysmap 3.6.0-sabayon; \ ++ fi ++ ++modules_uninstall: ++ @# Uninstall the kernel modules ++ $(RM) -R $(DESTDIR)/lib/modules/3.6.0-sabayon/addon/spl ++ ++distdir: ++ list='$(subdir-m)'; for subdir in $$list; do \ ++ (find ../module/$$subdir -name '*.c' -o -name '*.h' |\ ++ xargs /bin/cp -t $$distdir/$$subdir); \ ++ done ++ ++distclean maintainer-clean: clean ++install: modules_install ++uninstall: modules_uninstall ++all: modules ++check: +diff -uNr linux-3.2.33-go.orig/spl/Makefile.in linux-3.2.33-go/spl/Makefile.in +--- linux-3.2.33-go.orig/spl/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/Makefile.in 2012-11-16 23:22:32.411192851 +0100 +@@ -0,0 +1,50 @@ ++subdir-m += spl ++subdir-m += splat ++ ++INSTALL=/usr/bin/install ++ ++SPL_MODULE_CFLAGS = -I@abs_top_srcdir@/include ++SPL_MODULE_CFLAGS += -include @abs_top_builddir@/spl_config.h ++export SPL_MODULE_CFLAGS ++ ++modules: ++ $(MAKE) -C @LINUX_OBJ@ SUBDIRS=`pwd` @KERNELMAKE_PARAMS@ CONFIG_SPL=m $@ ++ ++clean: ++ @# Only cleanup the kernel build directories when CONFIG_KERNEL ++ @# is defined. This indicates that kernel modules should be built. 
++@CONFIG_KERNEL_TRUE@ $(MAKE) -C @LINUX_OBJ@ SUBDIRS=`pwd` @KERNELMAKE_PARAMS@ $@ ++ ++ if [ -f '@LINUX_SYMBOLS@' ]; then $(RM) '@LINUX_SYMBOLS@'; fi ++ if [ -f Module.markers ]; then $(RM) Module.markers; fi ++ ++modules_install: ++ @# Install the kernel modules ++ $(MAKE) -C @LINUX_OBJ@ SUBDIRS=`pwd` \ ++ INSTALL_MOD_PATH=$(DESTDIR) \ ++ INSTALL_MOD_DIR=addon/spl $@ ++ @# Remove extraneous build products when packaging ++ if [ -n "$(DESTDIR)" ]; then \ ++ find $(DESTDIR)/lib/modules/@LINUX_VERSION@ \ ++ -name 'modules.*' | xargs $(RM); \ ++ fi ++ sysmap=$(DESTDIR)/boot/System.map-@LINUX_VERSION@; \ ++ if [ -f $$sysmap ]; then \ ++ depmod -ae -F $$sysmap @LINUX_VERSION@; \ ++ fi ++ ++modules_uninstall: ++ @# Uninstall the kernel modules ++ $(RM) -R $(DESTDIR)/lib/modules/@LINUX_VERSION@/addon/spl ++ ++distdir: ++ list='$(subdir-m)'; for subdir in $$list; do \ ++ (find @top_srcdir@/module/$$subdir -name '*.c' -o -name '*.h' |\ ++ xargs /bin/cp -t $$distdir/$$subdir); \ ++ done ++ ++distclean maintainer-clean: clean ++install: modules_install ++uninstall: modules_uninstall ++all: modules ++check: +diff -uNr linux-3.2.33-go.orig/spl/spl/Makefile linux-3.2.33-go/spl/spl/Makefile +--- linux-3.2.33-go.orig/spl/spl/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/Makefile 2012-11-16 23:22:32.415192803 +0100 +@@ -0,0 +1,27 @@ ++# Makefile.in for spl kernel module ++ ++MODULE := spl ++EXTRA_CFLAGS = $(SPL_MODULE_CFLAGS) -DHAVE_GPL_ONLY_SYMBOLS -Wstrict-prototypes -DNDEBUG -DDEBUG_LOG -DDEBUG_KMEM ++ ++# Solaris porting layer module ++obj-$(CONFIG_SPL) := $(MODULE).o ++ ++$(MODULE)-objs += spl-debug.o ++$(MODULE)-objs += spl-proc.o ++$(MODULE)-objs += spl-kmem.o ++$(MODULE)-objs += spl-thread.o ++$(MODULE)-objs += spl-taskq.o ++$(MODULE)-objs += spl-rwlock.o ++$(MODULE)-objs += spl-vnode.o ++$(MODULE)-objs += spl-err.o ++$(MODULE)-objs += spl-time.o ++$(MODULE)-objs += spl-kobj.o ++$(MODULE)-objs += spl-generic.o ++$(MODULE)-objs += spl-atomic.o ++$(MODULE)-objs += spl-mutex.o ++$(MODULE)-objs += spl-kstat.o ++$(MODULE)-objs += spl-condvar.o ++$(MODULE)-objs += spl-xdr.o ++$(MODULE)-objs += spl-cred.o ++$(MODULE)-objs += spl-tsd.o ++$(MODULE)-objs += spl-zlib.o +diff -uNr linux-3.2.33-go.orig/spl/spl/Makefile.in linux-3.2.33-go/spl/spl/Makefile.in +--- linux-3.2.33-go.orig/spl/spl/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/Makefile.in 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,27 @@ ++# Makefile.in for spl kernel module ++ ++MODULE := spl ++EXTRA_CFLAGS = $(SPL_MODULE_CFLAGS) @KERNELCPPFLAGS@ ++ ++# Solaris porting layer module ++obj-$(CONFIG_SPL) := $(MODULE).o ++ ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-debug.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-proc.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-kmem.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-thread.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-taskq.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-rwlock.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-vnode.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-err.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-time.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-kobj.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-generic.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-atomic.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-mutex.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-kstat.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-condvar.o ++$(MODULE)-objs += 
@top_srcdir@/module/spl/spl-xdr.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-cred.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-tsd.o ++$(MODULE)-objs += @top_srcdir@/module/spl/spl-zlib.o +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-atomic.c linux-3.2.33-go/spl/spl/spl-atomic.c +--- linux-3.2.33-go.orig/spl/spl/spl-atomic.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-atomic.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,42 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Atomic Implementation. ++\*****************************************************************************/ ++ ++#include ++ ++#ifdef DEBUG_SUBSYSTEM ++#undef DEBUG_SUBSYSTEM ++#endif ++ ++#define DEBUG_SUBSYSTEM S_ATOMIC ++ ++#ifdef ATOMIC_SPINLOCK ++/* Global atomic lock declarations */ ++DEFINE_SPINLOCK(atomic32_lock); ++DEFINE_SPINLOCK(atomic64_lock); ++ ++EXPORT_SYMBOL(atomic32_lock); ++EXPORT_SYMBOL(atomic64_lock); ++#endif /* ATOMIC_SPINLOCK */ +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-condvar.c linux-3.2.33-go/spl/spl/spl-condvar.c +--- linux-3.2.33-go.orig/spl/spl/spl-condvar.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-condvar.c 2012-11-16 23:22:32.411192851 +0100 +@@ -0,0 +1,255 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Credential Implementation. ++\*****************************************************************************/ ++ ++#include ++#include ++ ++#ifdef SS_DEBUG_SUBSYS ++#undef SS_DEBUG_SUBSYS ++#endif ++ ++#define SS_DEBUG_SUBSYS SS_CONDVAR ++ ++void ++__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) ++{ ++ int flags = KM_SLEEP; ++ ++ SENTRY; ++ ASSERT(cvp); ++ ASSERT(name == NULL); ++ ASSERT(type == CV_DEFAULT); ++ ASSERT(arg == NULL); ++ ++ cvp->cv_magic = CV_MAGIC; ++ init_waitqueue_head(&cvp->cv_event); ++ init_waitqueue_head(&cvp->cv_destroy); ++ atomic_set(&cvp->cv_waiters, 0); ++ atomic_set(&cvp->cv_refs, 1); ++ cvp->cv_mutex = NULL; ++ ++ /* We may be called when there is a non-zero preempt_count or ++ * interrupts are disabled is which case we must not sleep. ++ */ ++ if (current_thread_info()->preempt_count || irqs_disabled()) ++ flags = KM_NOSLEEP; ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(__cv_init); ++ ++static int ++cv_destroy_wakeup(kcondvar_t *cvp) ++{ ++ if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) { ++ ASSERT(cvp->cv_mutex == NULL); ++ ASSERT(!waitqueue_active(&cvp->cv_event)); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++void ++__cv_destroy(kcondvar_t *cvp) ++{ ++ SENTRY; ++ ASSERT(cvp); ++ ASSERT(cvp->cv_magic == CV_MAGIC); ++ ++ cvp->cv_magic = CV_DESTROY; ++ atomic_dec(&cvp->cv_refs); ++ ++ /* Block until all waiters are woken and references dropped. */ ++ while (cv_destroy_wakeup(cvp) == 0) ++ wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1); ++ ++ ASSERT3P(cvp->cv_mutex, ==, NULL); ++ ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0); ++ ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0); ++ ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0); ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(__cv_destroy); ++ ++static void ++cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state) ++{ ++ DEFINE_WAIT(wait); ++ SENTRY; ++ ++ ASSERT(cvp); ++ ASSERT(mp); ++ ASSERT(cvp->cv_magic == CV_MAGIC); ++ ASSERT(mutex_owned(mp)); ++ atomic_inc(&cvp->cv_refs); ++ ++ if (cvp->cv_mutex == NULL) ++ cvp->cv_mutex = mp; ++ ++ /* Ensure the same mutex is used by all callers */ ++ ASSERT(cvp->cv_mutex == mp); ++ ++ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); ++ atomic_inc(&cvp->cv_waiters); ++ ++ /* Mutex should be dropped after prepare_to_wait() this ++ * ensures we're linked in to the waiters list and avoids the ++ * race where 'cvp->cv_waiters > 0' but the list is empty. */ ++ mutex_exit(mp); ++ schedule(); ++ mutex_enter(mp); ++ ++ /* No more waiters a different mutex could be used */ ++ if (atomic_dec_and_test(&cvp->cv_waiters)) { ++ cvp->cv_mutex = NULL; ++ wake_up(&cvp->cv_destroy); ++ } ++ ++ finish_wait(&cvp->cv_event, &wait); ++ atomic_dec(&cvp->cv_refs); ++ ++ SEXIT; ++} ++ ++void ++__cv_wait(kcondvar_t *cvp, kmutex_t *mp) ++{ ++ cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE); ++} ++EXPORT_SYMBOL(__cv_wait); ++ ++void ++__cv_wait_interruptible(kcondvar_t *cvp, kmutex_t *mp) ++{ ++ cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE); ++} ++EXPORT_SYMBOL(__cv_wait_interruptible); ++ ++/* 'expire_time' argument is an absolute wall clock time in jiffies. ++ * Return value is time left (expire_time - now) or -1 if timeout occurred. 
++ */ ++static clock_t ++__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, ++ clock_t expire_time, int state) ++{ ++ DEFINE_WAIT(wait); ++ clock_t time_left; ++ SENTRY; ++ ++ ASSERT(cvp); ++ ASSERT(mp); ++ ASSERT(cvp->cv_magic == CV_MAGIC); ++ ASSERT(mutex_owned(mp)); ++ atomic_inc(&cvp->cv_refs); ++ ++ if (cvp->cv_mutex == NULL) ++ cvp->cv_mutex = mp; ++ ++ /* Ensure the same mutex is used by all callers */ ++ ASSERT(cvp->cv_mutex == mp); ++ ++ /* XXX - Does not handle jiffie wrap properly */ ++ time_left = expire_time - jiffies; ++ if (time_left <= 0) { ++ atomic_dec(&cvp->cv_refs); ++ SRETURN(-1); ++ } ++ ++ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); ++ atomic_inc(&cvp->cv_waiters); ++ ++ /* Mutex should be dropped after prepare_to_wait() this ++ * ensures we're linked in to the waiters list and avoids the ++ * race where 'cvp->cv_waiters > 0' but the list is empty. */ ++ mutex_exit(mp); ++ time_left = schedule_timeout(time_left); ++ mutex_enter(mp); ++ ++ /* No more waiters a different mutex could be used */ ++ if (atomic_dec_and_test(&cvp->cv_waiters)) { ++ cvp->cv_mutex = NULL; ++ wake_up(&cvp->cv_destroy); ++ } ++ ++ finish_wait(&cvp->cv_event, &wait); ++ atomic_dec(&cvp->cv_refs); ++ ++ SRETURN(time_left > 0 ? time_left : -1); ++} ++ ++clock_t ++__cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) ++{ ++ return __cv_timedwait_common(cvp, mp, exp_time, TASK_UNINTERRUPTIBLE); ++} ++EXPORT_SYMBOL(__cv_timedwait); ++ ++clock_t ++__cv_timedwait_interruptible(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) ++{ ++ return __cv_timedwait_common(cvp, mp, exp_time, TASK_INTERRUPTIBLE); ++} ++EXPORT_SYMBOL(__cv_timedwait_interruptible); ++ ++void ++__cv_signal(kcondvar_t *cvp) ++{ ++ SENTRY; ++ ASSERT(cvp); ++ ASSERT(cvp->cv_magic == CV_MAGIC); ++ atomic_inc(&cvp->cv_refs); ++ ++ /* All waiters are added with WQ_FLAG_EXCLUSIVE so only one ++ * waiter will be set runable with each call to wake_up(). ++ * Additionally wake_up() holds a spin_lock assoicated with ++ * the wait queue to ensure we don't race waking up processes. */ ++ if (atomic_read(&cvp->cv_waiters) > 0) ++ wake_up(&cvp->cv_event); ++ ++ atomic_dec(&cvp->cv_refs); ++ SEXIT; ++} ++EXPORT_SYMBOL(__cv_signal); ++ ++void ++__cv_broadcast(kcondvar_t *cvp) ++{ ++ SENTRY; ++ ASSERT(cvp); ++ ASSERT(cvp->cv_magic == CV_MAGIC); ++ atomic_inc(&cvp->cv_refs); ++ ++ /* Wake_up_all() will wake up all waiters even those which ++ * have the WQ_FLAG_EXCLUSIVE flag set. */ ++ if (atomic_read(&cvp->cv_waiters) > 0) ++ wake_up_all(&cvp->cv_event); ++ ++ atomic_dec(&cvp->cv_refs); ++ SEXIT; ++} ++EXPORT_SYMBOL(__cv_broadcast); +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-cred.c linux-3.2.33-go/spl/spl/spl-cred.c +--- linux-3.2.33-go.orig/spl/spl/spl-cred.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-cred.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,282 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Credential Implementation. ++\*****************************************************************************/ ++ ++#include ++ ++#ifdef DEBUG_SUBSYSTEM ++#undef DEBUG_SUBSYSTEM ++#endif ++ ++#define DEBUG_SUBSYSTEM S_CRED ++ ++#ifdef HAVE_GROUPS_SEARCH ++/* Symbol may be exported by custom kernel patch */ ++#define cr_groups_search(gi, grp) groups_search(gi, grp) ++#else ++/* Implementation from 2.6.30 kernel */ ++static int ++cr_groups_search(const struct group_info *group_info, gid_t grp) ++{ ++ unsigned int left, right; ++ ++ if (!group_info) ++ return 0; ++ ++ left = 0; ++ right = group_info->ngroups; ++ while (left < right) { ++ unsigned int mid = (left+right)/2; ++ int cmp = grp - GROUP_AT(group_info, mid); ++ if (cmp > 0) ++ left = mid + 1; ++ else if (cmp < 0) ++ right = mid; ++ else ++ return 1; ++ } ++ return 0; ++} ++#endif ++ ++#ifdef HAVE_CRED_STRUCT ++ ++/* ++ * As of 2.6.29 a clean credential API appears in the linux kernel. ++ * We attempt to layer the Solaris API on top of the linux API. ++ */ ++ ++/* Hold a reference on the credential and group info */ ++void ++crhold(cred_t *cr) ++{ ++ (void)get_cred((const cred_t *)cr); ++ (void)get_group_info(cr->group_info); ++} ++ ++/* Free a reference on the credential and group info */ ++void ++crfree(cred_t *cr) ++{ ++ put_group_info(cr->group_info); ++ put_cred((const cred_t *)cr); ++} ++ ++/* Return the number of supplemental groups */ ++int ++crgetngroups(const cred_t *cr) ++{ ++ struct group_info *gi; ++ int rc; ++ ++ gi = get_group_info(cr->group_info); ++ rc = gi->ngroups; ++ put_group_info(gi); ++ ++ return rc; ++} ++ ++/* ++ * Return an array of supplemental gids. The returned address is safe ++ * to use as long as the caller has taken a reference with crhold(). ++ * The caller is responsible for releasing the reference with crfree(). ++ */ ++gid_t * ++crgetgroups(const cred_t *cr) ++{ ++ struct group_info *gi; ++ gid_t *gids; ++ ++ gi = get_group_info(cr->group_info); ++ gids = gi->blocks[0]; ++ put_group_info(gi); ++ ++ return gids; ++} ++ ++/* Check if the passed gid is available is in supplied credential. */ ++int ++groupmember(gid_t gid, const cred_t *cr) ++{ ++ struct group_info *gi; ++ int rc; ++ ++ gi = get_group_info(cr->group_info); ++ rc = cr_groups_search(cr->group_info, gid); ++ put_group_info(gi); ++ ++ return rc; ++} ++ ++#else /* HAVE_CRED_STRUCT */ ++ ++/* ++ * Until very recently all credential information was embedded in ++ * the linux task struct. For this reason to simulate a Solaris ++ * cred_t we need to pass the entire task structure around. 
++ */ ++ ++/* Hold a reference on the credential and group info */ ++void crhold(cred_t *cr) { } ++ ++/* Free a reference on the credential and group info */ ++void crfree(cred_t *cr) { } ++ ++/* Return the number of supplemental groups */ ++int ++crgetngroups(const cred_t *cr) ++{ ++ int lock, rc; ++ ++ lock = (cr != current); ++ if (lock) ++ task_lock((struct task_struct *)cr); ++ ++ get_group_info(cr->group_info); ++ rc = cr->group_info->ngroups; ++ put_group_info(cr->group_info); ++ ++ if (lock) ++ task_unlock((struct task_struct *)cr); ++ ++ return rc; ++} ++ ++/* ++ * Return an array of supplemental gids. The returned address is safe ++ * to use as long as the caller has taken a reference with crhold(). ++ * The caller is responsible for releasing the reference with crfree(). ++ */ ++gid_t * ++crgetgroups(const cred_t *cr) ++{ ++ gid_t *gids; ++ int lock; ++ ++ lock = (cr != current); ++ if (lock) ++ task_lock((struct task_struct *)cr); ++ ++ get_group_info(cr->group_info); ++ gids = cr->group_info->blocks[0]; ++ put_group_info(cr->group_info); ++ ++ if (lock) ++ task_unlock((struct task_struct *)cr); ++ ++ return gids; ++} ++ ++/* Check if the passed gid is available is in supplied credential. */ ++int ++groupmember(gid_t gid, const cred_t *cr) ++{ ++ int lock, rc; ++ ++ lock = (cr != current); ++ if (lock) ++ task_lock((struct task_struct *)cr); ++ ++ get_group_info(cr->group_info); ++ rc = cr_groups_search(cr->group_info, gid); ++ put_group_info(cr->group_info); ++ ++ if (lock) ++ task_unlock((struct task_struct *)cr); ++ ++ return rc; ++} ++ ++#endif /* HAVE_CRED_STRUCT */ ++ ++/* Return the effective user id */ ++uid_t ++crgetuid(const cred_t *cr) ++{ ++ return cr->euid; ++} ++ ++/* Return the real user id */ ++uid_t ++crgetruid(const cred_t *cr) ++{ ++ return cr->uid; ++} ++ ++/* Return the saved user id */ ++uid_t ++crgetsuid(const cred_t *cr) ++{ ++ return cr->suid; ++} ++ ++/* Return the filesystem user id */ ++uid_t ++crgetfsuid(const cred_t *cr) ++{ ++ return cr->fsuid; ++} ++ ++/* Return the effective group id */ ++gid_t ++crgetgid(const cred_t *cr) ++{ ++ return cr->egid; ++} ++ ++/* Return the real group id */ ++gid_t ++crgetrgid(const cred_t *cr) ++{ ++ return cr->gid; ++} ++ ++/* Return the saved group id */ ++gid_t ++crgetsgid(const cred_t *cr) ++{ ++ return cr->sgid; ++} ++ ++/* Return the filesystem group id */ ++gid_t ++crgetfsgid(const cred_t *cr) ++{ ++ return cr->fsgid; ++} ++ ++EXPORT_SYMBOL(crhold); ++EXPORT_SYMBOL(crfree); ++EXPORT_SYMBOL(crgetuid); ++EXPORT_SYMBOL(crgetruid); ++EXPORT_SYMBOL(crgetsuid); ++EXPORT_SYMBOL(crgetfsuid); ++EXPORT_SYMBOL(crgetgid); ++EXPORT_SYMBOL(crgetrgid); ++EXPORT_SYMBOL(crgetsgid); ++EXPORT_SYMBOL(crgetfsgid); ++EXPORT_SYMBOL(crgetngroups); ++EXPORT_SYMBOL(crgetgroups); ++EXPORT_SYMBOL(groupmember); +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-debug.c linux-3.2.33-go/spl/spl/spl-debug.c +--- linux-3.2.33-go.orig/spl/spl/spl-debug.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-debug.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,1254 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Debug Implementation. ++\*****************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef SS_DEBUG_SUBSYS ++#undef SS_DEBUG_SUBSYS ++#endif ++ ++#define SS_DEBUG_SUBSYS SS_DEBUG ++ ++/* Debug log support enabled */ ++#ifdef DEBUG_LOG ++ ++unsigned long spl_debug_subsys = ~0; ++EXPORT_SYMBOL(spl_debug_subsys); ++module_param(spl_debug_subsys, ulong, 0644); ++MODULE_PARM_DESC(spl_debug_subsys, "Subsystem debugging level mask."); ++ ++unsigned long spl_debug_mask = SD_CANTMASK; ++EXPORT_SYMBOL(spl_debug_mask); ++module_param(spl_debug_mask, ulong, 0644); ++MODULE_PARM_DESC(spl_debug_mask, "Debugging level mask."); ++ ++unsigned long spl_debug_printk = SD_CANTMASK; ++EXPORT_SYMBOL(spl_debug_printk); ++module_param(spl_debug_printk, ulong, 0644); ++MODULE_PARM_DESC(spl_debug_printk, "Console printk level mask."); ++ ++int spl_debug_mb = -1; ++EXPORT_SYMBOL(spl_debug_mb); ++module_param(spl_debug_mb, int, 0644); ++MODULE_PARM_DESC(spl_debug_mb, "Total debug buffer size."); ++ ++unsigned int spl_debug_binary = 1; ++EXPORT_SYMBOL(spl_debug_binary); ++ ++unsigned int spl_debug_catastrophe; ++EXPORT_SYMBOL(spl_debug_catastrophe); ++ ++unsigned int spl_debug_panic_on_bug = 0; ++EXPORT_SYMBOL(spl_debug_panic_on_bug); ++module_param(spl_debug_panic_on_bug, uint, 0644); ++MODULE_PARM_DESC(spl_debug_panic_on_bug, "Panic on BUG"); ++ ++static char spl_debug_file_name[PATH_MAX]; ++char spl_debug_file_path[PATH_MAX] = "/tmp/spl-log"; ++ ++unsigned int spl_console_ratelimit = 1; ++EXPORT_SYMBOL(spl_console_ratelimit); ++ ++long spl_console_max_delay; ++EXPORT_SYMBOL(spl_console_max_delay); ++ ++long spl_console_min_delay; ++EXPORT_SYMBOL(spl_console_min_delay); ++ ++unsigned int spl_console_backoff = SPL_DEFAULT_BACKOFF; ++EXPORT_SYMBOL(spl_console_backoff); ++ ++unsigned int spl_debug_stack; ++EXPORT_SYMBOL(spl_debug_stack); ++ ++static int spl_panic_in_progress; ++ ++union trace_data_union (*trace_data[TCD_TYPE_MAX])[NR_CPUS] __cacheline_aligned; ++char *trace_console_buffers[NR_CPUS][3]; ++struct rw_semaphore trace_sem; ++atomic_t trace_tage_allocated = ATOMIC_INIT(0); ++ ++static int spl_debug_dump_all_pages(dumplog_priv_t *dp, char *); ++static void trace_fini(void); ++ ++ ++/* Memory percentage breakdown by type */ ++static unsigned int pages_factor[TCD_TYPE_MAX] = { ++ 80, /* 80% pages for TCD_TYPE_PROC */ ++ 10, /* 10% pages for TCD_TYPE_SOFTIRQ */ ++ 10 /* 10% pages for TCD_TYPE_IRQ */ ++}; ++ ++const char * ++spl_debug_subsys2str(int subsys) ++{ ++ switch (subsys) { ++ default: ++ return NULL; ++ case SS_UNDEFINED: ++ return "undefined"; ++ case SS_ATOMIC: ++ return 
"atomic"; ++ case SS_KOBJ: ++ return "kobj"; ++ case SS_VNODE: ++ return "vnode"; ++ case SS_TIME: ++ return "time"; ++ case SS_RWLOCK: ++ return "rwlock"; ++ case SS_THREAD: ++ return "thread"; ++ case SS_CONDVAR: ++ return "condvar"; ++ case SS_MUTEX: ++ return "mutex"; ++ case SS_RNG: ++ return "rng"; ++ case SS_TASKQ: ++ return "taskq"; ++ case SS_KMEM: ++ return "kmem"; ++ case SS_DEBUG: ++ return "debug"; ++ case SS_GENERIC: ++ return "generic"; ++ case SS_PROC: ++ return "proc"; ++ case SS_MODULE: ++ return "module"; ++ case SS_CRED: ++ return "cred"; ++ case SS_KSTAT: ++ return "kstat"; ++ case SS_XDR: ++ return "xdr"; ++ case SS_TSD: ++ return "tsd"; ++ case SS_ZLIB: ++ return "zlib"; ++ case SS_USER1: ++ return "user1"; ++ case SS_USER2: ++ return "user2"; ++ case SS_USER3: ++ return "user3"; ++ case SS_USER4: ++ return "user4"; ++ case SS_USER5: ++ return "user5"; ++ case SS_USER6: ++ return "user6"; ++ case SS_USER7: ++ return "user7"; ++ case SS_USER8: ++ return "user8"; ++ } ++} ++ ++const char * ++spl_debug_dbg2str(int debug) ++{ ++ switch (debug) { ++ default: ++ return NULL; ++ case SD_TRACE: ++ return "trace"; ++ case SD_INFO: ++ return "info"; ++ case SD_WARNING: ++ return "warning"; ++ case SD_ERROR: ++ return "error"; ++ case SD_EMERG: ++ return "emerg"; ++ case SD_CONSOLE: ++ return "console"; ++ case SD_IOCTL: ++ return "ioctl"; ++ case SD_DPRINTF: ++ return "dprintf"; ++ case SD_OTHER: ++ return "other"; ++ } ++} ++ ++int ++spl_debug_mask2str(char *str, int size, unsigned long mask, int is_subsys) ++{ ++ const char *(*fn)(int bit) = is_subsys ? spl_debug_subsys2str : ++ spl_debug_dbg2str; ++ const char *token; ++ int i, bit, len = 0; ++ ++ if (mask == 0) { /* "0" */ ++ if (size > 0) ++ str[0] = '0'; ++ len = 1; ++ } else { /* space-separated tokens */ ++ for (i = 0; i < 32; i++) { ++ bit = 1 << i; ++ ++ if ((mask & bit) == 0) ++ continue; ++ ++ token = fn(bit); ++ if (token == NULL) /* unused bit */ ++ continue; ++ ++ if (len > 0) { /* separator? */ ++ if (len < size) ++ str[len] = ' '; ++ len++; ++ } ++ ++ while (*token != 0) { ++ if (len < size) ++ str[len] = *token; ++ token++; ++ len++; ++ } ++ } ++ } ++ ++ /* terminate 'str' */ ++ if (len < size) ++ str[len] = 0; ++ else ++ str[size - 1] = 0; ++ ++ return len; ++} ++ ++static int ++spl_debug_token2mask(int *mask, const char *str, int len, int is_subsys) ++{ ++ const char *(*fn)(int bit) = is_subsys ? spl_debug_subsys2str : ++ spl_debug_dbg2str; ++ const char *token; ++ int i, j, bit; ++ ++ /* match against known tokens */ ++ for (i = 0; i < 32; i++) { ++ bit = 1 << i; ++ ++ token = fn(bit); ++ if (token == NULL) /* unused? 
*/ ++ continue; ++ ++ /* strcasecmp */ ++ for (j = 0; ; j++) { ++ if (j == len) { /* end of token */ ++ if (token[j] == 0) { ++ *mask = bit; ++ return 0; ++ } ++ break; ++ } ++ ++ if (token[j] == 0) ++ break; ++ ++ if (str[j] == token[j]) ++ continue; ++ ++ if (str[j] < 'A' || 'Z' < str[j]) ++ break; ++ ++ if (str[j] - 'A' + 'a' != token[j]) ++ break; ++ } ++ } ++ ++ return -EINVAL; /* no match */ ++} ++ ++int ++spl_debug_str2mask(unsigned long *mask, const char *str, int is_subsys) ++{ ++ char op = 0; ++ int m = 0, matched, n, t; ++ ++ /* Allow a number for backwards compatibility */ ++ for (n = strlen(str); n > 0; n--) ++ if (!isspace(str[n-1])) ++ break; ++ matched = n; ++ ++ if ((t = sscanf(str, "%i%n", &m, &matched)) >= 1 && matched == n) { ++ *mask = m; ++ return 0; ++ } ++ ++ /* must be a list of debug tokens or numbers separated by ++ * whitespace and optionally an operator ('+' or '-'). If an operator ++ * appears first in , '*mask' is used as the starting point ++ * (relative), otherwise 0 is used (absolute). An operator applies to ++ * all following tokens up to the next operator. */ ++ matched = 0; ++ while (*str != 0) { ++ while (isspace(*str)) /* skip whitespace */ ++ str++; ++ ++ if (*str == 0) ++ break; ++ ++ if (*str == '+' || *str == '-') { ++ op = *str++; ++ ++ /* op on first token == relative */ ++ if (!matched) ++ m = *mask; ++ ++ while (isspace(*str)) /* skip whitespace */ ++ str++; ++ ++ if (*str == 0) /* trailing op */ ++ return -EINVAL; ++ } ++ ++ /* find token length */ ++ for (n = 0; str[n] != 0 && !isspace(str[n]); n++); ++ ++ /* match token */ ++ if (spl_debug_token2mask(&t, str, n, is_subsys) != 0) ++ return -EINVAL; ++ ++ matched = 1; ++ if (op == '-') ++ m &= ~t; ++ else ++ m |= t; ++ ++ str += n; ++ } ++ ++ if (!matched) ++ return -EINVAL; ++ ++ *mask = m; ++ return 0; ++} ++ ++static void ++spl_debug_dumplog_internal(dumplog_priv_t *dp) ++{ ++ void *journal_info; ++ ++ journal_info = current->journal_info; ++ current->journal_info = NULL; ++ ++ snprintf(spl_debug_file_name, sizeof(spl_debug_file_path) - 1, ++ "%s.%ld.%ld", spl_debug_file_path, ++ get_seconds(), (long)dp->dp_pid); ++ printk("SPL: Dumping log to %s\n", spl_debug_file_name); ++ spl_debug_dump_all_pages(dp, spl_debug_file_name); ++ ++ current->journal_info = journal_info; ++} ++ ++static int ++spl_debug_dumplog_thread(void *arg) ++{ ++ dumplog_priv_t *dp = (dumplog_priv_t *)arg; ++ ++ spl_debug_dumplog_internal(dp); ++ atomic_set(&dp->dp_done, 1); ++ wake_up(&dp->dp_waitq); ++ complete_and_exit(NULL, 0); ++ ++ return 0; /* Unreachable */ ++} ++ ++/* When flag is set do not use a new thread for the debug dump */ ++int ++spl_debug_dumplog(int flags) ++{ ++ struct task_struct *tsk; ++ dumplog_priv_t dp; ++ ++ init_waitqueue_head(&dp.dp_waitq); ++ dp.dp_pid = current->pid; ++ dp.dp_flags = flags; ++ atomic_set(&dp.dp_done, 0); ++ ++ if (dp.dp_flags & DL_NOTHREAD) { ++ spl_debug_dumplog_internal(&dp); ++ } else { ++ ++ tsk = kthread_create(spl_debug_dumplog_thread,(void *)&dp,"spl_debug"); ++ if (tsk == NULL) ++ return -ENOMEM; ++ ++ wake_up_process(tsk); ++ wait_event(dp.dp_waitq, atomic_read(&dp.dp_done)); ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(spl_debug_dumplog); ++ ++static char * ++trace_get_console_buffer(void) ++{ ++ int cpu = get_cpu(); ++ int idx; ++ ++ if (in_irq()) { ++ idx = 0; ++ } else if (in_softirq()) { ++ idx = 1; ++ } else { ++ idx = 2; ++ } ++ ++ return trace_console_buffers[cpu][idx]; ++} ++ ++static void ++trace_put_console_buffer(char *buffer) ++{ ++ put_cpu(); ++} ++ 
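For reference, a minimal user-space sketch of the token syntax accepted by spl_debug_str2mask() above: a bare token list sets the mask absolutely, a leading '+' or '-' edits the current mask, and an operator applies to every following token until the next operator. The token table, the bit values, and the str2mask() helper below are hypothetical stand-ins and are not part of this patch.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static const struct { const char *name; unsigned long bit; } tokens[] = {
    { "trace", 0x1 }, { "info", 0x2 }, { "warning", 0x4 }, { "error", 0x8 },
};
#define NTOKENS (sizeof(tokens) / sizeof(tokens[0]))

/* Simplified parser: '+'/'-' edit the current mask, a bare list replaces it. */
static int str2mask(unsigned long *mask, const char *str)
{
    unsigned long m = 0;
    int matched = 0;
    char op = 0;

    while (*str) {
        size_t i, n;

        while (isspace((unsigned char)*str))
            str++;
        if (*str == '\0')
            break;
        if (*str == '+' || *str == '-') {
            op = *str++;
            if (!matched)          /* operator seen first: start from *mask */
                m = *mask;
            continue;
        }
        for (n = 0; str[n] != '\0' && !isspace((unsigned char)str[n]); n++)
            ;
        for (i = 0; i < NTOKENS; i++) {
            if (strlen(tokens[i].name) == n &&
                strncmp(tokens[i].name, str, n) == 0) {
                if (op == '-')
                    m &= ~tokens[i].bit;
                else
                    m |= tokens[i].bit;
                matched = 1;
                break;
            }
        }
        if (i == NTOKENS)
            return -1;             /* unknown token */
        str += n;
    }
    if (!matched)
        return -1;
    *mask = m;
    return 0;
}

int main(void)
{
    unsigned long mask = 0;

    str2mask(&mask, "warning error");  /* absolute: mask becomes 0xc */
    printf("0x%lx\n", mask);
    str2mask(&mask, "+trace");         /* relative: mask becomes 0xd */
    printf("0x%lx\n", mask);
    str2mask(&mask, "-warning");       /* relative: mask becomes 0x9 */
    printf("0x%lx\n", mask);
    return 0;
}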
++static int ++trace_lock_tcd(struct trace_cpu_data *tcd) ++{ ++ __ASSERT(tcd->tcd_type < TCD_TYPE_MAX); ++ ++ spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags); ++ ++ return 1; ++} ++ ++static void ++trace_unlock_tcd(struct trace_cpu_data *tcd) ++{ ++ __ASSERT(tcd->tcd_type < TCD_TYPE_MAX); ++ ++ spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags); ++} ++ ++static struct trace_cpu_data * ++trace_get_tcd(void) ++{ ++ int cpu; ++ struct trace_cpu_data *tcd; ++ ++ cpu = get_cpu(); ++ if (in_irq()) ++ tcd = &(*trace_data[TCD_TYPE_IRQ])[cpu].tcd; ++ else if (in_softirq()) ++ tcd = &(*trace_data[TCD_TYPE_SOFTIRQ])[cpu].tcd; ++ else ++ tcd = &(*trace_data[TCD_TYPE_PROC])[cpu].tcd; ++ ++ trace_lock_tcd(tcd); ++ ++ return tcd; ++} ++ ++static void ++trace_put_tcd (struct trace_cpu_data *tcd) ++{ ++ trace_unlock_tcd(tcd); ++ ++ put_cpu(); ++} ++ ++static void ++trace_set_debug_header(struct spl_debug_header *header, int subsys, ++ int mask, const int line, unsigned long stack) ++{ ++ struct timeval tv; ++ ++ do_gettimeofday(&tv); ++ ++ header->ph_subsys = subsys; ++ header->ph_mask = mask; ++ header->ph_cpu_id = smp_processor_id(); ++ header->ph_sec = (__u32)tv.tv_sec; ++ header->ph_usec = tv.tv_usec; ++ header->ph_stack = stack; ++ header->ph_pid = current->pid; ++ header->ph_line_num = line; ++ ++ return; ++} ++ ++static void ++trace_print_to_console(struct spl_debug_header *hdr, int mask, const char *buf, ++ int len, const char *file, const char *fn) ++{ ++ char *prefix = "SPL", *ptype = NULL; ++ ++ if ((mask & SD_EMERG) != 0) { ++ prefix = "SPLError"; ++ ptype = KERN_EMERG; ++ } else if ((mask & SD_ERROR) != 0) { ++ prefix = "SPLError"; ++ ptype = KERN_ERR; ++ } else if ((mask & SD_WARNING) != 0) { ++ prefix = "SPL"; ++ ptype = KERN_WARNING; ++ } else if ((mask & (SD_CONSOLE | spl_debug_printk)) != 0) { ++ prefix = "SPL"; ++ ptype = KERN_INFO; ++ } ++ ++ if ((mask & SD_CONSOLE) != 0) { ++ printk("%s%s: %.*s", ptype, prefix, len, buf); ++ } else { ++ printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, ++ hdr->ph_pid, hdr->ph_stack, file, ++ hdr->ph_line_num, fn, len, buf); ++ } ++ ++ return; ++} ++ ++static int ++trace_max_debug_mb(void) ++{ ++ return MAX(512, ((num_physpages >> (20 - PAGE_SHIFT)) * 80) / 100); ++} ++ ++static struct trace_page * ++tage_alloc(int gfp) ++{ ++ struct page *page; ++ struct trace_page *tage; ++ ++ page = alloc_pages(gfp | __GFP_NOWARN, 0); ++ if (page == NULL) ++ return NULL; ++ ++ tage = kmalloc(sizeof(*tage), gfp); ++ if (tage == NULL) { ++ __free_pages(page, 0); ++ return NULL; ++ } ++ ++ tage->page = page; ++ atomic_inc(&trace_tage_allocated); ++ ++ return tage; ++} ++ ++static void ++tage_free(struct trace_page *tage) ++{ ++ __ASSERT(tage != NULL); ++ __ASSERT(tage->page != NULL); ++ ++ __free_pages(tage->page, 0); ++ kfree(tage); ++ atomic_dec(&trace_tage_allocated); ++} ++ ++static struct trace_page * ++tage_from_list(struct list_head *list) ++{ ++ return list_entry(list, struct trace_page, linkage); ++} ++ ++static void ++tage_to_tail(struct trace_page *tage, struct list_head *queue) ++{ ++ __ASSERT(tage != NULL); ++ __ASSERT(queue != NULL); ++ ++ list_move_tail(&tage->linkage, queue); ++} ++ ++/* try to return a page that has 'len' bytes left at the end */ ++static struct trace_page * ++trace_get_tage_try(struct trace_cpu_data *tcd, unsigned long len) ++{ ++ struct trace_page *tage; ++ ++ if (tcd->tcd_cur_pages > 0) { ++ __ASSERT(!list_empty(&tcd->tcd_pages)); ++ tage = tage_from_list(tcd->tcd_pages.prev); ++ if (tage->used + len <= 
PAGE_SIZE) ++ return tage; ++ } ++ ++ if (tcd->tcd_cur_pages < tcd->tcd_max_pages) { ++ if (tcd->tcd_cur_stock_pages > 0) { ++ tage = tage_from_list(tcd->tcd_stock_pages.prev); ++ tcd->tcd_cur_stock_pages--; ++ list_del_init(&tage->linkage); ++ } else { ++ tage = tage_alloc(GFP_ATOMIC); ++ if (tage == NULL) { ++ printk(KERN_WARNING ++ "failure to allocate a tage (%ld)\n", ++ tcd->tcd_cur_pages); ++ return NULL; ++ } ++ } ++ ++ tage->used = 0; ++ tage->cpu = smp_processor_id(); ++ tage->type = tcd->tcd_type; ++ list_add_tail(&tage->linkage, &tcd->tcd_pages); ++ tcd->tcd_cur_pages++; ++ ++ return tage; ++ } ++ ++ return NULL; ++} ++ ++/* return a page that has 'len' bytes left at the end */ ++static struct trace_page * ++trace_get_tage(struct trace_cpu_data *tcd, unsigned long len) ++{ ++ struct trace_page *tage; ++ ++ __ASSERT(len <= PAGE_SIZE); ++ ++ tage = trace_get_tage_try(tcd, len); ++ if (tage) ++ return tage; ++ ++ if (tcd->tcd_cur_pages > 0) { ++ tage = tage_from_list(tcd->tcd_pages.next); ++ tage->used = 0; ++ tage_to_tail(tage, &tcd->tcd_pages); ++ } ++ ++ return tage; ++} ++ ++int ++spl_debug_msg(void *arg, int subsys, int mask, const char *file, ++ const char *fn, const int line, const char *format, ...) ++{ ++ spl_debug_limit_state_t *cdls = arg; ++ struct trace_cpu_data *tcd = NULL; ++ struct spl_debug_header header = { 0, }; ++ struct trace_page *tage; ++ /* string_buf is used only if tcd != NULL, and is always set then */ ++ char *string_buf = NULL; ++ char *debug_buf; ++ int known_size; ++ int needed = 85; /* average message length */ ++ int max_nob; ++ va_list ap; ++ int i; ++ ++ if (subsys == 0) ++ subsys = SS_DEBUG_SUBSYS; ++ ++ if (mask == 0) ++ mask = SD_EMERG; ++ ++ if (strchr(file, '/')) ++ file = strrchr(file, '/') + 1; ++ ++ tcd = trace_get_tcd(); ++ trace_set_debug_header(&header, subsys, mask, line, 0); ++ if (tcd == NULL) ++ goto console; ++ ++ if (tcd->tcd_shutting_down) { ++ trace_put_tcd(tcd); ++ tcd = NULL; ++ goto console; ++ } ++ ++ known_size = strlen(file) + 1; ++ if (fn) ++ known_size += strlen(fn) + 1; ++ ++ if (spl_debug_binary) ++ known_size += sizeof(header); ++ ++ /* '2' used because vsnprintf returns real size required for output ++ * _without_ terminating NULL. 
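The message-formatting loop that follows relies on vsnprintf() returning the length the output would need, excluding the terminating NUL, which is why two passes are always enough. Below is a user-space sketch of the same two-pass idiom; format_message() and the 16-byte initial guess are hypothetical stand-ins for the in-kernel trace-page handling and the 85-byte average used above.

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static char *format_message(const char *fmt, ...)
{
    int needed = 16;               /* optimistic first guess */
    char *buf = NULL;
    va_list ap;
    int i, len;

    for (i = 0; i < 2; i++) {
        char *tmp = realloc(buf, needed + 1);

        if (tmp == NULL) {
            free(buf);
            return NULL;
        }
        buf = tmp;

        va_start(ap, fmt);
        len = vsnprintf(buf, needed + 1, fmt, ap);
        va_end(ap);

        if (len <= needed)         /* it fit, including the terminating NUL */
            break;
        needed = len;              /* second pass gets exactly enough room */
    }
    return buf;
}

int main(void)
{
    char *msg = format_message("pid %d: %s", 1234,
        "a message longer than the initial sixteen byte guess");

    if (msg != NULL)
        puts(msg);
    free(msg);
    return 0;
}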
*/ ++ for (i = 0; i < 2; i++) { ++ tage = trace_get_tage(tcd, needed + known_size + 1); ++ if (tage == NULL) { ++ if (needed + known_size > PAGE_SIZE) ++ mask |= SD_ERROR; ++ ++ trace_put_tcd(tcd); ++ tcd = NULL; ++ goto console; ++ } ++ ++ string_buf = (char *)page_address(tage->page) + ++ tage->used + known_size; ++ ++ max_nob = PAGE_SIZE - tage->used - known_size; ++ if (max_nob <= 0) { ++ printk(KERN_EMERG "negative max_nob: %i\n", max_nob); ++ mask |= SD_ERROR; ++ trace_put_tcd(tcd); ++ tcd = NULL; ++ goto console; ++ } ++ ++ needed = 0; ++ if (format) { ++ va_start(ap, format); ++ needed += vsnprintf(string_buf, max_nob, format, ap); ++ va_end(ap); ++ } ++ ++ if (needed < max_nob) ++ break; ++ } ++ ++ header.ph_len = known_size + needed; ++ debug_buf = (char *)page_address(tage->page) + tage->used; ++ ++ if (spl_debug_binary) { ++ memcpy(debug_buf, &header, sizeof(header)); ++ tage->used += sizeof(header); ++ debug_buf += sizeof(header); ++ } ++ ++ strcpy(debug_buf, file); ++ tage->used += strlen(file) + 1; ++ debug_buf += strlen(file) + 1; ++ ++ if (fn) { ++ strcpy(debug_buf, fn); ++ tage->used += strlen(fn) + 1; ++ debug_buf += strlen(fn) + 1; ++ } ++ ++ __ASSERT(debug_buf == string_buf); ++ ++ tage->used += needed; ++ __ASSERT (tage->used <= PAGE_SIZE); ++ ++console: ++ if ((mask & spl_debug_printk) == 0) { ++ /* no console output requested */ ++ if (tcd != NULL) ++ trace_put_tcd(tcd); ++ return 1; ++ } ++ ++ if (cdls != NULL) { ++ if (spl_console_ratelimit && cdls->cdls_next != 0 && ++ !time_before(cdls->cdls_next, jiffies)) { ++ /* skipping a console message */ ++ cdls->cdls_count++; ++ if (tcd != NULL) ++ trace_put_tcd(tcd); ++ return 1; ++ } ++ ++ if (time_before(cdls->cdls_next + spl_console_max_delay + ++ (10 * HZ), jiffies)) { ++ /* last timeout was a long time ago */ ++ cdls->cdls_delay /= spl_console_backoff * 4; ++ } else { ++ cdls->cdls_delay *= spl_console_backoff; ++ ++ if (cdls->cdls_delay < spl_console_min_delay) ++ cdls->cdls_delay = spl_console_min_delay; ++ else if (cdls->cdls_delay > spl_console_max_delay) ++ cdls->cdls_delay = spl_console_max_delay; ++ } ++ ++ /* ensure cdls_next is never zero after it's been seen */ ++ cdls->cdls_next = (jiffies + cdls->cdls_delay) | 1; ++ } ++ ++ if (tcd != NULL) { ++ trace_print_to_console(&header, mask, string_buf, needed, file, fn); ++ trace_put_tcd(tcd); ++ } else { ++ string_buf = trace_get_console_buffer(); ++ ++ needed = 0; ++ if (format != NULL) { ++ va_start(ap, format); ++ needed += vsnprintf(string_buf, ++ TRACE_CONSOLE_BUFFER_SIZE, format, ap); ++ va_end(ap); ++ } ++ trace_print_to_console(&header, mask, ++ string_buf, needed, file, fn); ++ ++ trace_put_console_buffer(string_buf); ++ } ++ ++ if (cdls != NULL && cdls->cdls_count != 0) { ++ string_buf = trace_get_console_buffer(); ++ ++ needed = snprintf(string_buf, TRACE_CONSOLE_BUFFER_SIZE, ++ "Skipped %d previous similar message%s\n", ++ cdls->cdls_count, (cdls->cdls_count > 1) ? "s" : ""); ++ ++ trace_print_to_console(&header, mask, ++ string_buf, needed, file, fn); ++ ++ trace_put_console_buffer(string_buf); ++ cdls->cdls_count = 0; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(spl_debug_msg); ++ ++/* Do the collect_pages job on a single CPU: assumes that all other ++ * CPUs have been stopped during a panic. If this isn't true for ++ * some arch, this will have to be implemented separately in each arch. 
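The cdls rate-limit bookkeeping above grows the hold-off delay by a backoff factor while a message stays noisy, relaxes it again after a quiet period, and only counts suppressed messages so they can be summarized later. A user-space sketch of that state machine follows; the ratelimit struct, the RL_* constants, and the integer time variable are stand-ins for the cdls fields, the SPL_DEFAULT_* delays, and jiffies.

#include <stdio.h>

struct ratelimit {
    long next;     /* earliest time the next message may go out (0 = never seen) */
    long delay;    /* current hold-off interval */
    int  skipped;  /* messages suppressed since the last one shown */
};

#define RL_MIN      1
#define RL_MAX      64
#define RL_BACKOFF  2

static int ratelimit_ok(struct ratelimit *rl, long now)
{
    if (rl->next != 0 && now < rl->next) {
        rl->skipped++;                     /* suppress, but remember we did */
        return 0;
    }

    if (now > rl->next + RL_MAX + 10)
        rl->delay /= RL_BACKOFF * 4;       /* long quiet period: relax */
    else
        rl->delay *= RL_BACKOFF;           /* still noisy: back off harder */

    if (rl->delay < RL_MIN)
        rl->delay = RL_MIN;
    else if (rl->delay > RL_MAX)
        rl->delay = RL_MAX;

    /* '| 1' keeps next non-zero, so "never seen" stays distinguishable */
    rl->next = (now + rl->delay) | 1;
    return 1;
}

int main(void)
{
    struct ratelimit rl = { 0, 0, 0 };
    long t;

    for (t = 0; t < 40; t++) {
        if (!ratelimit_ok(&rl, t))
            continue;
        if (rl.skipped)
            printf("t=%2ld: skipped %d similar messages\n", t, rl.skipped);
        rl.skipped = 0;
        printf("t=%2ld: message shown, next delay %ld\n", t, rl.delay);
    }
    return 0;
}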
++ */ ++static void ++collect_pages_from_single_cpu(struct page_collection *pc) ++{ ++ struct trace_cpu_data *tcd; ++ int i, j; ++ ++ tcd_for_each(tcd, i, j) { ++ list_splice_init(&tcd->tcd_pages, &pc->pc_pages); ++ tcd->tcd_cur_pages = 0; ++ } ++} ++ ++static void ++collect_pages_on_all_cpus(struct page_collection *pc) ++{ ++ struct trace_cpu_data *tcd; ++ int i, cpu; ++ ++ spin_lock(&pc->pc_lock); ++ for_each_possible_cpu(cpu) { ++ tcd_for_each_type_lock(tcd, i, cpu) { ++ list_splice_init(&tcd->tcd_pages, &pc->pc_pages); ++ tcd->tcd_cur_pages = 0; ++ } ++ } ++ spin_unlock(&pc->pc_lock); ++} ++ ++static void ++collect_pages(dumplog_priv_t *dp, struct page_collection *pc) ++{ ++ INIT_LIST_HEAD(&pc->pc_pages); ++ ++ if (spl_panic_in_progress || dp->dp_flags & DL_SINGLE_CPU) ++ collect_pages_from_single_cpu(pc); ++ else ++ collect_pages_on_all_cpus(pc); ++} ++ ++static void ++put_pages_back_on_all_cpus(struct page_collection *pc) ++{ ++ struct trace_cpu_data *tcd; ++ struct list_head *cur_head; ++ struct trace_page *tage; ++ struct trace_page *tmp; ++ int i, cpu; ++ ++ spin_lock(&pc->pc_lock); ++ ++ for_each_possible_cpu(cpu) { ++ tcd_for_each_type_lock(tcd, i, cpu) { ++ cur_head = tcd->tcd_pages.next; ++ ++ list_for_each_entry_safe(tage, tmp, &pc->pc_pages, ++ linkage) { ++ if (tage->cpu != cpu || tage->type != i) ++ continue; ++ ++ tage_to_tail(tage, cur_head); ++ tcd->tcd_cur_pages++; ++ } ++ } ++ } ++ ++ spin_unlock(&pc->pc_lock); ++} ++ ++static void ++put_pages_back(struct page_collection *pc) ++{ ++ if (!spl_panic_in_progress) ++ put_pages_back_on_all_cpus(pc); ++} ++ ++static int ++spl_debug_dump_all_pages(dumplog_priv_t *dp, char *filename) ++{ ++ struct page_collection pc; ++ struct file *filp; ++ struct trace_page *tage; ++ struct trace_page *tmp; ++ mm_segment_t oldfs; ++ int rc = 0; ++ ++ down_write(&trace_sem); ++ ++ filp = spl_filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, ++ 0600, &rc); ++ if (filp == NULL) { ++ if (rc != -EEXIST) ++ printk(KERN_ERR "SPL: Can't open %s for dump: %d\n", ++ filename, rc); ++ goto out; ++ } ++ ++ spin_lock_init(&pc.pc_lock); ++ collect_pages(dp, &pc); ++ if (list_empty(&pc.pc_pages)) { ++ rc = 0; ++ goto close; ++ } ++ ++ oldfs = get_fs(); ++ set_fs(get_ds()); ++ ++ list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { ++ rc = spl_filp_write(filp, page_address(tage->page), ++ tage->used, spl_filp_poff(filp)); ++ if (rc != (int)tage->used) { ++ printk(KERN_WARNING "SPL: Wanted to write %u " ++ "but wrote %d\n", tage->used, rc); ++ put_pages_back(&pc); ++ __ASSERT(list_empty(&pc.pc_pages)); ++ break; ++ } ++ list_del(&tage->linkage); ++ tage_free(tage); ++ } ++ ++ set_fs(oldfs); ++ ++ rc = spl_filp_fsync(filp, 1); ++ if (rc) ++ printk(KERN_ERR "SPL: Unable to sync: %d\n", rc); ++ close: ++ spl_filp_close(filp); ++ out: ++ up_write(&trace_sem); ++ ++ return rc; ++} ++ ++static void ++spl_debug_flush_pages(void) ++{ ++ dumplog_priv_t dp; ++ struct page_collection pc; ++ struct trace_page *tage; ++ struct trace_page *tmp; ++ ++ spin_lock_init(&pc.pc_lock); ++ init_waitqueue_head(&dp.dp_waitq); ++ dp.dp_pid = current->pid; ++ dp.dp_flags = 0; ++ atomic_set(&dp.dp_done, 0); ++ ++ collect_pages(&dp, &pc); ++ list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { ++ list_del(&tage->linkage); ++ tage_free(tage); ++ } ++} ++ ++unsigned long ++spl_debug_set_mask(unsigned long mask) { ++ spl_debug_mask = mask; ++ return 0; ++} ++EXPORT_SYMBOL(spl_debug_set_mask); ++ ++unsigned long ++spl_debug_get_mask(void) { ++ return spl_debug_mask; 
++} ++EXPORT_SYMBOL(spl_debug_get_mask); ++ ++unsigned long ++spl_debug_set_subsys(unsigned long subsys) { ++ spl_debug_subsys = subsys; ++ return 0; ++} ++EXPORT_SYMBOL(spl_debug_set_subsys); ++ ++unsigned long ++spl_debug_get_subsys(void) { ++ return spl_debug_subsys; ++} ++EXPORT_SYMBOL(spl_debug_get_subsys); ++ ++int ++spl_debug_set_mb(int mb) ++{ ++ int i, j, pages; ++ int limit = trace_max_debug_mb(); ++ struct trace_cpu_data *tcd; ++ ++ if (mb < num_possible_cpus()) { ++ printk(KERN_ERR "SPL: Refusing to set debug buffer size to " ++ "%dMB - lower limit is %d\n", mb, num_possible_cpus()); ++ return -EINVAL; ++ } ++ ++ if (mb > limit) { ++ printk(KERN_ERR "SPL: Refusing to set debug buffer size to " ++ "%dMB - upper limit is %d\n", mb, limit); ++ return -EINVAL; ++ } ++ ++ mb /= num_possible_cpus(); ++ pages = mb << (20 - PAGE_SHIFT); ++ ++ down_write(&trace_sem); ++ ++ tcd_for_each(tcd, i, j) ++ tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100; ++ ++ up_write(&trace_sem); ++ ++ return 0; ++} ++EXPORT_SYMBOL(spl_debug_set_mb); ++ ++int ++spl_debug_get_mb(void) ++{ ++ int i, j; ++ struct trace_cpu_data *tcd; ++ int total_pages = 0; ++ ++ down_read(&trace_sem); ++ ++ tcd_for_each(tcd, i, j) ++ total_pages += tcd->tcd_max_pages; ++ ++ up_read(&trace_sem); ++ ++ return (total_pages >> (20 - PAGE_SHIFT)) + 1; ++} ++EXPORT_SYMBOL(spl_debug_get_mb); ++ ++void spl_debug_dumpstack(struct task_struct *tsk) ++{ ++ extern void show_task(struct task_struct *); ++ ++ if (tsk == NULL) ++ tsk = current; ++ ++ printk("SPL: Showing stack for process %d\n", tsk->pid); ++ dump_stack(); ++} ++EXPORT_SYMBOL(spl_debug_dumpstack); ++ ++void spl_debug_bug(char *file, const char *func, const int line, int flags) ++{ ++ spl_debug_catastrophe = 1; ++ spl_debug_msg(NULL, 0, SD_EMERG, file, func, line, "SPL PANIC\n"); ++ ++ if (in_interrupt()) ++ panic("SPL PANIC in interrupt.\n"); ++ ++ if (in_atomic() || irqs_disabled()) ++ flags |= DL_NOTHREAD; ++ ++ /* Ensure all debug pages and dumped by current cpu */ ++ if (spl_debug_panic_on_bug) ++ spl_panic_in_progress = 1; ++ ++ spl_debug_dumpstack(NULL); ++ spl_debug_dumplog(flags); ++ ++ if (spl_debug_panic_on_bug) ++ panic("SPL PANIC"); ++ ++ set_task_state(current, TASK_UNINTERRUPTIBLE); ++ while (1) ++ schedule(); ++} ++EXPORT_SYMBOL(spl_debug_bug); ++ ++int ++spl_debug_clear_buffer(void) ++{ ++ spl_debug_flush_pages(); ++ return 0; ++} ++EXPORT_SYMBOL(spl_debug_clear_buffer); ++ ++int ++spl_debug_mark_buffer(char *text) ++{ ++ SDEBUG(SD_WARNING, "*************************************\n"); ++ SDEBUG(SD_WARNING, "DEBUG MARKER: %s\n", text); ++ SDEBUG(SD_WARNING, "*************************************\n"); ++ ++ return 0; ++} ++EXPORT_SYMBOL(spl_debug_mark_buffer); ++ ++static int ++trace_init(int max_pages) ++{ ++ struct trace_cpu_data *tcd; ++ int i, j; ++ ++ init_rwsem(&trace_sem); ++ ++ /* initialize trace_data */ ++ memset(trace_data, 0, sizeof(trace_data)); ++ for (i = 0; i < TCD_TYPE_MAX; i++) { ++ trace_data[i] = kmalloc(sizeof(union trace_data_union) * ++ NR_CPUS, GFP_KERNEL); ++ if (trace_data[i] == NULL) ++ goto out; ++ } ++ ++ tcd_for_each(tcd, i, j) { ++ spin_lock_init(&tcd->tcd_lock); ++ tcd->tcd_pages_factor = pages_factor[i]; ++ tcd->tcd_type = i; ++ tcd->tcd_cpu = j; ++ INIT_LIST_HEAD(&tcd->tcd_pages); ++ INIT_LIST_HEAD(&tcd->tcd_stock_pages); ++ tcd->tcd_cur_pages = 0; ++ tcd->tcd_cur_stock_pages = 0; ++ tcd->tcd_max_pages = (max_pages * pages_factor[i]) / 100; ++ tcd->tcd_shutting_down = 0; ++ } ++ ++ for (i = 0; i < num_possible_cpus(); 
i++) { ++ for (j = 0; j < 3; j++) { ++ trace_console_buffers[i][j] = ++ kmalloc(TRACE_CONSOLE_BUFFER_SIZE, ++ GFP_KERNEL); ++ ++ if (trace_console_buffers[i][j] == NULL) ++ goto out; ++ } ++ } ++ ++ return 0; ++out: ++ trace_fini(); ++ printk(KERN_ERR "SPL: Insufficient memory for debug logs\n"); ++ return -ENOMEM; ++} ++ ++int ++spl_debug_init(void) ++{ ++ int rc, max = spl_debug_mb; ++ ++ spl_console_max_delay = SPL_DEFAULT_MAX_DELAY; ++ spl_console_min_delay = SPL_DEFAULT_MIN_DELAY; ++ ++ /* If spl_debug_mb is set to an invalid value or uninitialized ++ * then just make the total buffers smp_num_cpus TCD_MAX_PAGES */ ++ if (max > (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5 || ++ max >= 512 || max < 0) { ++ max = TCD_MAX_PAGES; ++ } else { ++ max = (max / num_online_cpus()) << (20 - PAGE_SHIFT); ++ } ++ ++ rc = trace_init(max); ++ if (rc) ++ return rc; ++ ++ return rc; ++} ++ ++static void ++trace_cleanup_on_all_cpus(void) ++{ ++ struct trace_cpu_data *tcd; ++ struct trace_page *tage; ++ struct trace_page *tmp; ++ int i, cpu; ++ ++ for_each_possible_cpu(cpu) { ++ tcd_for_each_type_lock(tcd, i, cpu) { ++ tcd->tcd_shutting_down = 1; ++ ++ list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, ++ linkage) { ++ list_del(&tage->linkage); ++ tage_free(tage); ++ } ++ tcd->tcd_cur_pages = 0; ++ } ++ } ++} ++ ++static void ++trace_fini(void) ++{ ++ int i, j; ++ ++ trace_cleanup_on_all_cpus(); ++ ++ for (i = 0; i < num_possible_cpus(); i++) { ++ for (j = 0; j < 3; j++) { ++ if (trace_console_buffers[i][j] != NULL) { ++ kfree(trace_console_buffers[i][j]); ++ trace_console_buffers[i][j] = NULL; ++ } ++ } ++ } ++ ++ for (i = 0; i < TCD_TYPE_MAX && trace_data[i] != NULL; i++) { ++ kfree(trace_data[i]); ++ trace_data[i] = NULL; ++ } ++} ++ ++void ++spl_debug_fini(void) ++{ ++ trace_fini(); ++} ++ ++#endif /* DEBUG_LOG */ +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-err.c linux-3.2.33-go/spl/spl/spl-err.c +--- linux-3.2.33-go.orig/spl/spl/spl-err.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-err.c 2012-11-16 23:22:32.411192851 +0100 +@@ -0,0 +1,82 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Error Implementation. 
++\*****************************************************************************/ ++ ++#include ++#include ++#include ++ ++#ifdef SS_DEBUG_SUBSYS ++#undef SS_DEBUG_SUBSYS ++#endif ++ ++#define SS_DEBUG_SUBSYS SS_GENERIC ++ ++#ifdef DEBUG_LOG ++static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" }; ++static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" }; ++#endif ++ ++void ++vpanic(const char *fmt, va_list ap) ++{ ++ char msg[MAXMSGLEN]; ++ ++ vsnprintf(msg, MAXMSGLEN - 1, fmt, ap); ++ PANIC("%s", msg); ++} /* vpanic() */ ++EXPORT_SYMBOL(vpanic); ++ ++void ++vcmn_err(int ce, const char *fmt, va_list ap) ++{ ++ char msg[MAXMSGLEN]; ++ ++ if (ce == CE_PANIC) ++ vpanic(fmt, ap); ++ ++ if (ce != CE_NOTE) { ++ vsnprintf(msg, MAXMSGLEN - 1, fmt, ap); ++ ++ if (fmt[0] == '!') ++ SDEBUG(SD_INFO, "%s%s%s", ++ ce_prefix[ce], msg, ce_suffix[ce]); ++ else ++ SERROR("%s%s%s", ce_prefix[ce], msg, ce_suffix[ce]); ++ } ++} /* vcmn_err() */ ++EXPORT_SYMBOL(vcmn_err); ++ ++void ++cmn_err(int ce, const char *fmt, ...) ++{ ++ va_list ap; ++ ++ va_start(ap, fmt); ++ vcmn_err(ce, fmt, ap); ++ va_end(ap); ++} /* cmn_err() */ ++EXPORT_SYMBOL(cmn_err); ++ +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-generic.c linux-3.2.33-go/spl/spl/spl-generic.c +--- linux-3.2.33-go.orig/spl/spl/spl-generic.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-generic.c 2012-11-16 23:22:32.411192851 +0100 +@@ -0,0 +1,742 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Generic Implementation. 
++\*****************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef SS_DEBUG_SUBSYS ++#undef SS_DEBUG_SUBSYS ++#endif ++ ++#define SS_DEBUG_SUBSYS SS_GENERIC ++ ++char spl_version[32] = "SPL v" SPL_META_VERSION "-" SPL_META_RELEASE; ++EXPORT_SYMBOL(spl_version); ++ ++unsigned long spl_hostid = HW_INVALID_HOSTID; ++EXPORT_SYMBOL(spl_hostid); ++module_param(spl_hostid, ulong, 0644); ++MODULE_PARM_DESC(spl_hostid, "The system hostid."); ++ ++char hw_serial[HW_HOSTID_LEN] = ""; ++EXPORT_SYMBOL(hw_serial); ++ ++proc_t p0 = { 0 }; ++EXPORT_SYMBOL(p0); ++ ++#ifndef HAVE_KALLSYMS_LOOKUP_NAME ++kallsyms_lookup_name_t spl_kallsyms_lookup_name_fn = SYMBOL_POISON; ++#endif ++ ++int ++highbit(unsigned long i) ++{ ++ register int h = 1; ++ SENTRY; ++ ++ if (i == 0) ++ SRETURN(0); ++#if BITS_PER_LONG == 64 ++ if (i & 0xffffffff00000000ul) { ++ h += 32; i >>= 32; ++ } ++#endif ++ if (i & 0xffff0000) { ++ h += 16; i >>= 16; ++ } ++ if (i & 0xff00) { ++ h += 8; i >>= 8; ++ } ++ if (i & 0xf0) { ++ h += 4; i >>= 4; ++ } ++ if (i & 0xc) { ++ h += 2; i >>= 2; ++ } ++ if (i & 0x2) { ++ h += 1; ++ } ++ SRETURN(h); ++} ++EXPORT_SYMBOL(highbit); ++ ++#if BITS_PER_LONG == 32 ++/* ++ * Support 64/64 => 64 division on a 32-bit platform. While the kernel ++ * provides a div64_u64() function for this we do not use it because the ++ * implementation is flawed. There are cases which return incorrect ++ * results as late as linux-2.6.35. Until this is fixed upstream the ++ * spl must provide its own implementation. ++ * ++ * This implementation is a slightly modified version of the algorithm ++ * proposed by the book 'Hacker's Delight'. The original source can be ++ * found here and is available for use without restriction. ++ * ++ * http://www.hackersdelight.org/HDcode/newCode/divDouble.c ++ */ ++ ++/* ++ * Calculate number of leading of zeros for a 64-bit value. ++ */ ++static int ++nlz64(uint64_t x) { ++ register int n = 0; ++ ++ if (x == 0) ++ return 64; ++ ++ if (x <= 0x00000000FFFFFFFFULL) {n = n + 32; x = x << 32;} ++ if (x <= 0x0000FFFFFFFFFFFFULL) {n = n + 16; x = x << 16;} ++ if (x <= 0x00FFFFFFFFFFFFFFULL) {n = n + 8; x = x << 8;} ++ if (x <= 0x0FFFFFFFFFFFFFFFULL) {n = n + 4; x = x << 4;} ++ if (x <= 0x3FFFFFFFFFFFFFFFULL) {n = n + 2; x = x << 2;} ++ if (x <= 0x7FFFFFFFFFFFFFFFULL) {n = n + 1;} ++ ++ return n; ++} ++ ++/* ++ * Newer kernels have a div_u64() function but we define our own ++ * to simplify portibility between kernel versions. ++ */ ++static inline uint64_t ++__div_u64(uint64_t u, uint32_t v) ++{ ++ (void) do_div(u, v); ++ return u; ++} ++ ++/* ++ * Implementation of 64-bit unsigned division for 32-bit machines. ++ * ++ * First the procedure takes care of the case in which the divisor is a ++ * 32-bit quantity. There are two subcases: (1) If the left half of the ++ * dividend is less than the divisor, one execution of do_div() is all that ++ * is required (overflow is not possible). (2) Otherwise it does two ++ * divisions, using the grade school method. ++ */ ++uint64_t ++__udivdi3(uint64_t u, uint64_t v) ++{ ++ uint64_t u0, u1, v1, q0, q1, k; ++ int n; ++ ++ if (v >> 32 == 0) { // If v < 2**32: ++ if (u >> 32 < v) { // If u/v cannot overflow, ++ return __div_u64(u, v); // just do one division. 
++ } else { // If u/v would overflow: ++ u1 = u >> 32; // Break u into two halves. ++ u0 = u & 0xFFFFFFFF; ++ q1 = __div_u64(u1, v); // First quotient digit. ++ k = u1 - q1 * v; // First remainder, < v. ++ u0 += (k << 32); ++ q0 = __div_u64(u0, v); // Seconds quotient digit. ++ return (q1 << 32) + q0; ++ } ++ } else { // If v >= 2**32: ++ n = nlz64(v); // 0 <= n <= 31. ++ v1 = (v << n) >> 32; // Normalize divisor, MSB is 1. ++ u1 = u >> 1; // To ensure no overflow. ++ q1 = __div_u64(u1, v1); // Get quotient from ++ q0 = (q1 << n) >> 31; // Undo normalization and ++ // division of u by 2. ++ if (q0 != 0) // Make q0 correct or ++ q0 = q0 - 1; // too small by 1. ++ if ((u - q0 * v) >= v) ++ q0 = q0 + 1; // Now q0 is correct. ++ ++ return q0; ++ } ++} ++EXPORT_SYMBOL(__udivdi3); ++ ++/* ++ * Implementation of 64-bit signed division for 32-bit machines. ++ */ ++int64_t ++__divdi3(int64_t u, int64_t v) ++{ ++ int64_t q, t; ++ q = __udivdi3(abs64(u), abs64(v)); ++ t = (u ^ v) >> 63; // If u, v have different ++ return (q ^ t) - t; // signs, negate q. ++} ++EXPORT_SYMBOL(__divdi3); ++ ++/* ++ * Implementation of 64-bit unsigned modulo for 32-bit machines. ++ */ ++uint64_t ++__umoddi3(uint64_t dividend, uint64_t divisor) ++{ ++ return (dividend - (divisor * __udivdi3(dividend, divisor))); ++} ++EXPORT_SYMBOL(__umoddi3); ++ ++#if defined(__arm) || defined(__arm__) ++/* ++ * Implementation of 64-bit (un)signed division for 32-bit arm machines. ++ * ++ * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned) ++ * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1}, ++ * and the remainder in {r2, r3}. The return type is specifically left ++ * set to 'void' to ensure the compiler does not overwrite these registers ++ * during the return. All results are in registers as per ABI ++ */ ++void ++__aeabi_uldivmod(uint64_t u, uint64_t v) ++{ ++ uint64_t res; ++ uint64_t mod; ++ ++ res = __udivdi3(u, v); ++ mod = __umoddi3(u, v); ++ { ++ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); ++ register uint32_t r1 asm("r1") = (res >> 32); ++ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); ++ register uint32_t r3 asm("r3") = (mod >> 32); ++ ++ asm volatile("" ++ : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */ ++ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ ++ ++ return; /* r0; */ ++ } ++} ++EXPORT_SYMBOL(__aeabi_uldivmod); ++ ++void ++__aeabi_ldivmod(int64_t u, int64_t v) ++{ ++ int64_t res; ++ uint64_t mod; ++ ++ res = __divdi3(u, v); ++ mod = __umoddi3(u, v); ++ { ++ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); ++ register uint32_t r1 asm("r1") = (res >> 32); ++ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); ++ register uint32_t r3 asm("r3") = (mod >> 32); ++ ++ asm volatile("" ++ : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */ ++ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ ++ ++ return; /* r0; */ ++ } ++} ++EXPORT_SYMBOL(__aeabi_ldivmod); ++#endif /* __arm || __arm__ */ ++#endif /* BITS_PER_LONG */ ++ ++/* NOTE: The strtoxx behavior is solely based on my reading of the Solaris ++ * ddi_strtol(9F) man page. I have not verified the behavior of these ++ * functions against their Solaris counterparts. It is possible that I ++ * may have misinterpreted the man page or the man page is incorrect. 
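As a sanity check of the "grade school" case described above (the divisor fits in 32 bits but the high half of the dividend does not divide trivially), the following user-space sketch reproduces the two-digit step and compares it against native 64-bit division. udiv64_by_32() and the test values are illustrative only, and plain '/' stands in for __div_u64().

#include <inttypes.h>
#include <stdio.h>

static uint64_t udiv64_by_32(uint64_t u, uint32_t v)
{
    uint64_t u1 = u >> 32;             /* high half of the dividend */
    uint64_t u0 = u & 0xFFFFFFFFULL;   /* low half of the dividend */
    uint64_t q1, q0, k;

    if (u1 < v)                        /* a single division cannot overflow */
        return u / v;

    q1 = u1 / v;                       /* first quotient digit */
    k  = u1 - q1 * v;                  /* first remainder, always < v */
    u0 += k << 32;
    q0 = u0 / v;                       /* second quotient digit */

    return (q1 << 32) + q0;
}

int main(void)
{
    uint64_t u = 0xFEDCBA9876543210ULL;
    uint32_t v = 0x89ABCDEFu;

    printf("algorithm: %" PRIu64 "\n", udiv64_by_32(u, v));
    printf("native:    %" PRIu64 "\n", u / v);   /* should match */
    return 0;
}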
++ */ ++int ddi_strtoul(const char *, char **, int, unsigned long *); ++int ddi_strtol(const char *, char **, int, long *); ++int ddi_strtoull(const char *, char **, int, unsigned long long *); ++int ddi_strtoll(const char *, char **, int, long long *); ++ ++#define define_ddi_strtoux(type, valtype) \ ++int ddi_strtou##type(const char *str, char **endptr, \ ++ int base, valtype *result) \ ++{ \ ++ valtype last_value, value = 0; \ ++ char *ptr = (char *)str; \ ++ int flag = 1, digit; \ ++ \ ++ if (strlen(ptr) == 0) \ ++ return EINVAL; \ ++ \ ++ /* Auto-detect base based on prefix */ \ ++ if (!base) { \ ++ if (str[0] == '0') { \ ++ if (tolower(str[1])=='x' && isxdigit(str[2])) { \ ++ base = 16; /* hex */ \ ++ ptr += 2; \ ++ } else if (str[1] >= '0' && str[1] < 8) { \ ++ base = 8; /* octal */ \ ++ ptr += 1; \ ++ } else { \ ++ return EINVAL; \ ++ } \ ++ } else { \ ++ base = 10; /* decimal */ \ ++ } \ ++ } \ ++ \ ++ while (1) { \ ++ if (isdigit(*ptr)) \ ++ digit = *ptr - '0'; \ ++ else if (isalpha(*ptr)) \ ++ digit = tolower(*ptr) - 'a' + 10; \ ++ else \ ++ break; \ ++ \ ++ if (digit >= base) \ ++ break; \ ++ \ ++ last_value = value; \ ++ value = value * base + digit; \ ++ if (last_value > value) /* Overflow */ \ ++ return ERANGE; \ ++ \ ++ flag = 1; \ ++ ptr++; \ ++ } \ ++ \ ++ if (flag) \ ++ *result = value; \ ++ \ ++ if (endptr) \ ++ *endptr = (char *)(flag ? ptr : str); \ ++ \ ++ return 0; \ ++} \ ++ ++#define define_ddi_strtox(type, valtype) \ ++int ddi_strto##type(const char *str, char **endptr, \ ++ int base, valtype *result) \ ++{ \ ++ int rc; \ ++ \ ++ if (*str == '-') { \ ++ rc = ddi_strtou##type(str + 1, endptr, base, result); \ ++ if (!rc) { \ ++ if (*endptr == str + 1) \ ++ *endptr = (char *)str; \ ++ else \ ++ *result = -*result; \ ++ } \ ++ } else { \ ++ rc = ddi_strtou##type(str, endptr, base, result); \ ++ } \ ++ \ ++ return rc; \ ++} ++ ++define_ddi_strtoux(l, unsigned long) ++define_ddi_strtox(l, long) ++define_ddi_strtoux(ll, unsigned long long) ++define_ddi_strtox(ll, long long) ++ ++EXPORT_SYMBOL(ddi_strtoul); ++EXPORT_SYMBOL(ddi_strtol); ++EXPORT_SYMBOL(ddi_strtoll); ++EXPORT_SYMBOL(ddi_strtoull); ++ ++int ++ddi_copyin(const void *from, void *to, size_t len, int flags) ++{ ++ /* Fake ioctl() issued by kernel, 'from' is a kernel address */ ++ if (flags & FKIOCTL) { ++ memcpy(to, from, len); ++ return 0; ++ } ++ ++ return copyin(from, to, len); ++} ++EXPORT_SYMBOL(ddi_copyin); ++ ++int ++ddi_copyout(const void *from, void *to, size_t len, int flags) ++{ ++ /* Fake ioctl() issued by kernel, 'from' is a kernel address */ ++ if (flags & FKIOCTL) { ++ memcpy(to, from, len); ++ return 0; ++ } ++ ++ return copyout(from, to, len); ++} ++EXPORT_SYMBOL(ddi_copyout); ++ ++#ifndef HAVE_PUT_TASK_STRUCT ++/* ++ * This is only a stub function which should never be used. The SPL should ++ * never be putting away the last reference on a task structure so this will ++ * not be called. However, we still need to define it so the module does not ++ * have undefined symbol at load time. That all said if this impossible ++ * thing does somehow happen PANIC immediately so we know about it. 
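The base auto-detection implemented by define_ddi_strtoux() above (a "0x" prefix selects hex, a leading '0' selects octal, anything else decimal) matches the convention libc's strtoul(3) applies with base 0, so a small user-space illustration can lean on it. This only demonstrates the convention and expected results, not the SPL functions themselves.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const char *inputs[] = { "0x1f", "037", "31" };
    int i;

    /* All three spellings should parse to the same value, 31. */
    for (i = 0; i < 3; i++) {
        char *end;
        unsigned long val = strtoul(inputs[i], &end, 0);

        printf("%-6s -> %lu (stopped at \"%s\")\n", inputs[i], val, end);
    }
    return 0;
}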
++ */ ++void ++__put_task_struct(struct task_struct *t) ++{ ++ PANIC("Unexpectly put last reference on task %d\n", (int)t->pid); ++} ++EXPORT_SYMBOL(__put_task_struct); ++#endif /* HAVE_PUT_TASK_STRUCT */ ++ ++struct new_utsname *__utsname(void) ++{ ++#ifdef HAVE_INIT_UTSNAME ++ return init_utsname(); ++#else ++ return &system_utsname; ++#endif ++} ++EXPORT_SYMBOL(__utsname); ++ ++ ++/* ++ * Read the unique system identifier from the /etc/hostid file. ++ * ++ * The behavior of /usr/bin/hostid on Linux systems with the ++ * regular eglibc and coreutils is: ++ * ++ * 1. Generate the value if the /etc/hostid file does not exist ++ * or if the /etc/hostid file is less than four bytes in size. ++ * ++ * 2. If the /etc/hostid file is at least 4 bytes, then return ++ * the first four bytes [0..3] in native endian order. ++ * ++ * 3. Always ignore bytes [4..] if they exist in the file. ++ * ++ * Only the first four bytes are significant, even on systems that ++ * have a 64-bit word size. ++ * ++ * See: ++ * ++ * eglibc: sysdeps/unix/sysv/linux/gethostid.c ++ * coreutils: src/hostid.c ++ * ++ * Notes: ++ * ++ * The /etc/hostid file on Solaris is a text file that often reads: ++ * ++ * # DO NOT EDIT ++ * "0123456789" ++ * ++ * Directly copying this file to Linux results in a constant ++ * hostid of 4f442023 because the default comment constitutes ++ * the first four bytes of the file. ++ * ++ */ ++ ++char *spl_hostid_path = HW_HOSTID_PATH; ++module_param(spl_hostid_path, charp, 0444); ++MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)"); ++ ++static int ++hostid_read(void) ++{ ++ int result; ++ uint64_t size; ++ struct _buf *file; ++ unsigned long hostid = 0; ++ ++ file = kobj_open_file(spl_hostid_path); ++ ++ if (file == (struct _buf *)-1) ++ return -1; ++ ++ result = kobj_get_filesize(file, &size); ++ ++ if (result != 0) { ++ printk(KERN_WARNING ++ "SPL: kobj_get_filesize returned %i on %s\n", ++ result, spl_hostid_path); ++ kobj_close_file(file); ++ return -2; ++ } ++ ++ if (size < sizeof(HW_HOSTID_MASK)) { ++ printk(KERN_WARNING ++ "SPL: Ignoring the %s file because it is %llu bytes; " ++ "expecting %lu bytes instead.\n", spl_hostid_path, ++ size, (unsigned long)sizeof(HW_HOSTID_MASK)); ++ kobj_close_file(file); ++ return -3; ++ } ++ ++ /* Read directly into the variable like eglibc does. */ ++ /* Short reads are okay; native behavior is preserved. */ ++ result = kobj_read_file(file, (char *)&hostid, sizeof(hostid), 0); ++ ++ if (result < 0) { ++ printk(KERN_WARNING ++ "SPL: kobj_read_file returned %i on %s\n", ++ result, spl_hostid_path); ++ kobj_close_file(file); ++ return -4; ++ } ++ ++ /* Mask down to 32 bits like coreutils does. */ ++ spl_hostid = hostid & HW_HOSTID_MASK; ++ kobj_close_file(file); ++ return 0; ++} ++ ++#define GET_HOSTID_CMD \ ++ "exec 0/proc/sys/kernel/spl/hostid " \ ++ " 2>/dev/null; " \ ++ "hostid" ++ ++static int ++hostid_exec(void) ++{ ++ char *argv[] = { "/bin/sh", ++ "-c", ++ GET_HOSTID_CMD, ++ NULL }; ++ char *envp[] = { "HOME=/", ++ "TERM=linux", ++ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", ++ NULL }; ++ int rc; ++ ++ /* Doing address resolution in the kernel is tricky and just ++ * not a good idea in general. So to set the proper 'hw_serial' ++ * use the usermodehelper support to ask '/bin/sh' to run ++ * '/usr/bin/hostid' and redirect the result to /proc/sys/spl/hostid ++ * for us to use. It's a horrific solution but it will do for now. 
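To make the /etc/hostid convention described above concrete (only the first four bytes count, read in native endian order and reduced to 32 bits), here is a small user-space reader. The literal path and the printf formatting stand in for spl_hostid_path and the HW_HOSTID_MASK handling in hostid_read().

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint32_t hostid = 0;
    FILE *f = fopen("/etc/hostid", "rb");

    if (f == NULL) {
        perror("/etc/hostid");   /* hostid(1) would generate a value instead */
        return 1;
    }
    /* Read the first four bytes straight into the variable, native endian;
     * anything after them is ignored, matching hostid_read() above. */
    if (fread(&hostid, 1, sizeof(hostid), f) < sizeof(hostid)) {
        fprintf(stderr, "file is shorter than four bytes, ignoring it\n");
        fclose(f);
        return 1;
    }
    fclose(f);

    /* The kernel code additionally masks with HW_HOSTID_MASK (32 bits). */
    printf("hostid: %08" PRIx32 "\n", hostid);
    return 0;
}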
++ */ ++ rc = call_usermodehelper(argv[0], argv, envp, 1); ++ if (rc) ++ printk("SPL: Failed user helper '%s %s %s', rc = %d\n", ++ argv[0], argv[1], argv[2], rc); ++ ++ return rc; ++} ++ ++uint32_t ++zone_get_hostid(void *zone) ++{ ++ static int first = 1; ++ unsigned long hostid; ++ int rc; ++ ++ /* Only the global zone is supported */ ++ ASSERT(zone == NULL); ++ ++ if (first) { ++ first = 0; ++ ++ /* ++ * Get the hostid if it was not passed as a module parameter. ++ * Try reading the /etc/hostid file directly, and then fall ++ * back to calling the /usr/bin/hostid utility. ++ */ ++ if ((spl_hostid == HW_INVALID_HOSTID) && ++ (rc = hostid_read()) && (rc = hostid_exec())) ++ return HW_INVALID_HOSTID; ++ ++ printk(KERN_NOTICE "SPL: using hostid 0x%08x\n", ++ (unsigned int) spl_hostid); ++ } ++ ++ if (ddi_strtoul(hw_serial, NULL, HW_HOSTID_LEN-1, &hostid) != 0) ++ return HW_INVALID_HOSTID; ++ ++ return (uint32_t)hostid; ++} ++EXPORT_SYMBOL(zone_get_hostid); ++ ++#ifndef HAVE_KALLSYMS_LOOKUP_NAME ++/* ++ * The kallsyms_lookup_name() kernel function is not an exported symbol in ++ * Linux 2.6.19 through 2.6.32 inclusive. ++ * ++ * This function replaces the functionality by performing an upcall to user ++ * space where /proc/kallsyms is consulted for the requested address. ++ * ++ */ ++ ++#define GET_KALLSYMS_ADDR_CMD \ ++ "exec 0/proc/sys/kernel/spl/kallsyms_lookup_name " \ ++ " 2>/dev/null; " \ ++ "awk '{ if ( $3 == \"kallsyms_lookup_name\" ) { print $1 } }' " \ ++ " /proc/kallsyms " ++ ++static int ++set_kallsyms_lookup_name(void) ++{ ++ char *argv[] = { "/bin/sh", ++ "-c", ++ GET_KALLSYMS_ADDR_CMD, ++ NULL }; ++ char *envp[] = { "HOME=/", ++ "TERM=linux", ++ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", ++ NULL }; ++ int rc; ++ ++ rc = call_usermodehelper(argv[0], argv, envp, 1); ++ if (rc) ++ printk("SPL: Failed user helper '%s %s %s', rc = %d\n", ++ argv[0], argv[1], argv[2], rc); ++ ++ return rc; ++} ++#endif ++ ++static int ++__init spl_init(void) ++{ ++ int rc = 0; ++ ++ if ((rc = spl_debug_init())) ++ return rc; ++ ++ if ((rc = spl_kmem_init())) ++ SGOTO(out1, rc); ++ ++ if ((rc = spl_mutex_init())) ++ SGOTO(out2, rc); ++ ++ if ((rc = spl_rw_init())) ++ SGOTO(out3, rc); ++ ++ if ((rc = spl_taskq_init())) ++ SGOTO(out4, rc); ++ ++ if ((rc = spl_vn_init())) ++ SGOTO(out5, rc); ++ ++ if ((rc = spl_proc_init())) ++ SGOTO(out6, rc); ++ ++ if ((rc = spl_kstat_init())) ++ SGOTO(out7, rc); ++ ++ if ((rc = spl_tsd_init())) ++ SGOTO(out8, rc); ++ ++ if ((rc = spl_zlib_init())) ++ SGOTO(out9, rc); ++ ++#ifndef HAVE_KALLSYMS_LOOKUP_NAME ++ if ((rc = set_kallsyms_lookup_name())) ++ SGOTO(out10, rc = -EADDRNOTAVAIL); ++#endif /* HAVE_KALLSYMS_LOOKUP_NAME */ ++ ++ if ((rc = spl_kmem_init_kallsyms_lookup())) ++ SGOTO(out10, rc); ++ ++ if ((rc = spl_vn_init_kallsyms_lookup())) ++ SGOTO(out10, rc); ++ ++ printk(KERN_NOTICE "SPL: Loaded module v%s-%s%s\n", SPL_META_VERSION, ++ SPL_META_RELEASE, SPL_DEBUG_STR); ++ SRETURN(rc); ++out10: ++ spl_zlib_fini(); ++out9: ++ spl_tsd_fini(); ++out8: ++ spl_kstat_fini(); ++out7: ++ spl_proc_fini(); ++out6: ++ spl_vn_fini(); ++out5: ++ spl_taskq_fini(); ++out4: ++ spl_rw_fini(); ++out3: ++ spl_mutex_fini(); ++out2: ++ spl_kmem_fini(); ++out1: ++ spl_debug_fini(); ++ ++ printk(KERN_NOTICE "SPL: Failed to Load Solaris Porting Layer " ++ "v%s-%s%s, rc = %d\n", SPL_META_VERSION, SPL_META_RELEASE, ++ SPL_DEBUG_STR, rc); ++ return rc; ++} ++ ++static void ++spl_fini(void) ++{ ++ SENTRY; ++ ++ printk(KERN_NOTICE "SPL: Unloaded module v%s-%s%s\n", ++ SPL_META_VERSION, 
SPL_META_RELEASE, SPL_DEBUG_STR); ++ spl_zlib_fini(); ++ spl_tsd_fini(); ++ spl_kstat_fini(); ++ spl_proc_fini(); ++ spl_vn_fini(); ++ spl_taskq_fini(); ++ spl_rw_fini(); ++ spl_mutex_fini(); ++ spl_kmem_fini(); ++ spl_debug_fini(); ++} ++ ++/* Called when a dependent module is loaded */ ++void ++spl_setup(void) ++{ ++ int rc; ++ ++ /* ++ * At module load time the pwd is set to '/' on a Solaris system. ++ * On a Linux system will be set to whatever directory the caller ++ * was in when executing insmod/modprobe. ++ */ ++ rc = vn_set_pwd("/"); ++ if (rc) ++ printk("SPL: Warning unable to set pwd to '/': %d\n", rc); ++} ++EXPORT_SYMBOL(spl_setup); ++ ++/* Called when a dependent module is unloaded */ ++void ++spl_cleanup(void) ++{ ++} ++EXPORT_SYMBOL(spl_cleanup); ++ ++module_init(spl_init); ++module_exit(spl_fini); ++ ++MODULE_AUTHOR("Lawrence Livermore National Labs"); ++MODULE_DESCRIPTION("Solaris Porting Layer"); ++MODULE_LICENSE("GPL"); +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-kmem.c linux-3.2.33-go/spl/spl/spl-kmem.c +--- linux-3.2.33-go.orig/spl/spl/spl-kmem.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-kmem.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,2440 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Kmem Implementation. ++\*****************************************************************************/ ++ ++#include ++#include ++ ++#ifdef SS_DEBUG_SUBSYS ++#undef SS_DEBUG_SUBSYS ++#endif ++ ++#define SS_DEBUG_SUBSYS SS_KMEM ++ ++/* ++ * The minimum amount of memory measured in pages to be free at all ++ * times on the system. This is similar to Linux's zone->pages_min ++ * multiplied by the number of zones and is sized based on that. ++ */ ++pgcnt_t minfree = 0; ++EXPORT_SYMBOL(minfree); ++ ++/* ++ * The desired amount of memory measured in pages to be free at all ++ * times on the system. This is similar to Linux's zone->pages_low ++ * multiplied by the number of zones and is sized based on that. ++ * Assuming all zones are being used roughly equally, when we drop ++ * below this threshold asynchronous page reclamation is triggered. ++ */ ++pgcnt_t desfree = 0; ++EXPORT_SYMBOL(desfree); ++ ++/* ++ * When above this amount of memory measures in pages the system is ++ * determined to have enough free memory. This is similar to Linux's ++ * zone->pages_high multiplied by the number of zones and is sized based ++ * on that. 
Assuming all zones are being used roughly equally, when ++ * asynchronous page reclamation reaches this threshold it stops. ++ */ ++pgcnt_t lotsfree = 0; ++EXPORT_SYMBOL(lotsfree); ++ ++/* Unused always 0 in this implementation */ ++pgcnt_t needfree = 0; ++EXPORT_SYMBOL(needfree); ++ ++pgcnt_t swapfs_minfree = 0; ++EXPORT_SYMBOL(swapfs_minfree); ++ ++pgcnt_t swapfs_reserve = 0; ++EXPORT_SYMBOL(swapfs_reserve); ++ ++vmem_t *heap_arena = NULL; ++EXPORT_SYMBOL(heap_arena); ++ ++vmem_t *zio_alloc_arena = NULL; ++EXPORT_SYMBOL(zio_alloc_arena); ++ ++vmem_t *zio_arena = NULL; ++EXPORT_SYMBOL(zio_arena); ++ ++#ifndef HAVE_GET_VMALLOC_INFO ++get_vmalloc_info_t get_vmalloc_info_fn = SYMBOL_POISON; ++EXPORT_SYMBOL(get_vmalloc_info_fn); ++#endif /* HAVE_GET_VMALLOC_INFO */ ++ ++#ifdef HAVE_PGDAT_HELPERS ++# ifndef HAVE_FIRST_ONLINE_PGDAT ++first_online_pgdat_t first_online_pgdat_fn = SYMBOL_POISON; ++EXPORT_SYMBOL(first_online_pgdat_fn); ++# endif /* HAVE_FIRST_ONLINE_PGDAT */ ++ ++# ifndef HAVE_NEXT_ONLINE_PGDAT ++next_online_pgdat_t next_online_pgdat_fn = SYMBOL_POISON; ++EXPORT_SYMBOL(next_online_pgdat_fn); ++# endif /* HAVE_NEXT_ONLINE_PGDAT */ ++ ++# ifndef HAVE_NEXT_ZONE ++next_zone_t next_zone_fn = SYMBOL_POISON; ++EXPORT_SYMBOL(next_zone_fn); ++# endif /* HAVE_NEXT_ZONE */ ++ ++#else /* HAVE_PGDAT_HELPERS */ ++ ++# ifndef HAVE_PGDAT_LIST ++struct pglist_data *pgdat_list_addr = SYMBOL_POISON; ++EXPORT_SYMBOL(pgdat_list_addr); ++# endif /* HAVE_PGDAT_LIST */ ++ ++#endif /* HAVE_PGDAT_HELPERS */ ++ ++#ifdef NEED_GET_ZONE_COUNTS ++# ifndef HAVE_GET_ZONE_COUNTS ++get_zone_counts_t get_zone_counts_fn = SYMBOL_POISON; ++EXPORT_SYMBOL(get_zone_counts_fn); ++# endif /* HAVE_GET_ZONE_COUNTS */ ++ ++unsigned long ++spl_global_page_state(spl_zone_stat_item_t item) ++{ ++ unsigned long active; ++ unsigned long inactive; ++ unsigned long free; ++ ++ get_zone_counts(&active, &inactive, &free); ++ switch (item) { ++ case SPL_NR_FREE_PAGES: return free; ++ case SPL_NR_INACTIVE: return inactive; ++ case SPL_NR_ACTIVE: return active; ++ default: ASSERT(0); /* Unsupported */ ++ } ++ ++ return 0; ++} ++#else ++# ifdef HAVE_GLOBAL_PAGE_STATE ++unsigned long ++spl_global_page_state(spl_zone_stat_item_t item) ++{ ++ unsigned long pages = 0; ++ ++ switch (item) { ++ case SPL_NR_FREE_PAGES: ++# ifdef HAVE_ZONE_STAT_ITEM_NR_FREE_PAGES ++ pages += global_page_state(NR_FREE_PAGES); ++# endif ++ break; ++ case SPL_NR_INACTIVE: ++# ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE ++ pages += global_page_state(NR_INACTIVE); ++# endif ++# ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_ANON ++ pages += global_page_state(NR_INACTIVE_ANON); ++# endif ++# ifdef HAVE_ZONE_STAT_ITEM_NR_INACTIVE_FILE ++ pages += global_page_state(NR_INACTIVE_FILE); ++# endif ++ break; ++ case SPL_NR_ACTIVE: ++# ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE ++ pages += global_page_state(NR_ACTIVE); ++# endif ++# ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_ANON ++ pages += global_page_state(NR_ACTIVE_ANON); ++# endif ++# ifdef HAVE_ZONE_STAT_ITEM_NR_ACTIVE_FILE ++ pages += global_page_state(NR_ACTIVE_FILE); ++# endif ++ break; ++ default: ++ ASSERT(0); /* Unsupported */ ++ } ++ ++ return pages; ++} ++# else ++# error "Both global_page_state() and get_zone_counts() unavailable" ++# endif /* HAVE_GLOBAL_PAGE_STATE */ ++#endif /* NEED_GET_ZONE_COUNTS */ ++EXPORT_SYMBOL(spl_global_page_state); ++ ++#if !defined(HAVE_INVALIDATE_INODES) && !defined(HAVE_INVALIDATE_INODES_CHECK) ++invalidate_inodes_t invalidate_inodes_fn = SYMBOL_POISON; ++EXPORT_SYMBOL(invalidate_inodes_fn); ++#endif /* 
!HAVE_INVALIDATE_INODES && !HAVE_INVALIDATE_INODES_CHECK */ ++ ++#ifndef HAVE_SHRINK_DCACHE_MEMORY ++shrink_dcache_memory_t shrink_dcache_memory_fn = SYMBOL_POISON; ++EXPORT_SYMBOL(shrink_dcache_memory_fn); ++#endif /* HAVE_SHRINK_DCACHE_MEMORY */ ++ ++#ifndef HAVE_SHRINK_ICACHE_MEMORY ++shrink_icache_memory_t shrink_icache_memory_fn = SYMBOL_POISON; ++EXPORT_SYMBOL(shrink_icache_memory_fn); ++#endif /* HAVE_SHRINK_ICACHE_MEMORY */ ++ ++pgcnt_t ++spl_kmem_availrmem(void) ++{ ++ /* The amount of easily available memory */ ++ return (spl_global_page_state(SPL_NR_FREE_PAGES) + ++ spl_global_page_state(SPL_NR_INACTIVE)); ++} ++EXPORT_SYMBOL(spl_kmem_availrmem); ++ ++size_t ++vmem_size(vmem_t *vmp, int typemask) ++{ ++ struct vmalloc_info vmi; ++ size_t size = 0; ++ ++ ASSERT(vmp == NULL); ++ ASSERT(typemask & (VMEM_ALLOC | VMEM_FREE)); ++ ++ get_vmalloc_info(&vmi); ++ if (typemask & VMEM_ALLOC) ++ size += (size_t)vmi.used; ++ ++ if (typemask & VMEM_FREE) ++ size += (size_t)(VMALLOC_TOTAL - vmi.used); ++ ++ return size; ++} ++EXPORT_SYMBOL(vmem_size); ++ ++int ++kmem_debugging(void) ++{ ++ return 0; ++} ++EXPORT_SYMBOL(kmem_debugging); ++ ++#ifndef HAVE_KVASPRINTF ++/* Simplified asprintf. */ ++char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap) ++{ ++ unsigned int len; ++ char *p; ++ va_list aq; ++ ++ va_copy(aq, ap); ++ len = vsnprintf(NULL, 0, fmt, aq); ++ va_end(aq); ++ ++ p = kmalloc(len+1, gfp); ++ if (!p) ++ return NULL; ++ ++ vsnprintf(p, len+1, fmt, ap); ++ ++ return p; ++} ++EXPORT_SYMBOL(kvasprintf); ++#endif /* HAVE_KVASPRINTF */ ++ ++char * ++kmem_vasprintf(const char *fmt, va_list ap) ++{ ++ va_list aq; ++ char *ptr; ++ ++ do { ++ va_copy(aq, ap); ++ ptr = kvasprintf(GFP_KERNEL, fmt, aq); ++ va_end(aq); ++ } while (ptr == NULL); ++ ++ return ptr; ++} ++EXPORT_SYMBOL(kmem_vasprintf); ++ ++char * ++kmem_asprintf(const char *fmt, ...) ++{ ++ va_list ap; ++ char *ptr; ++ ++ do { ++ va_start(ap, fmt); ++ ptr = kvasprintf(GFP_KERNEL, fmt, ap); ++ va_end(ap); ++ } while (ptr == NULL); ++ ++ return ptr; ++} ++EXPORT_SYMBOL(kmem_asprintf); ++ ++static char * ++__strdup(const char *str, int flags) ++{ ++ char *ptr; ++ int n; ++ ++ n = strlen(str); ++ ptr = kmalloc_nofail(n + 1, flags); ++ if (ptr) ++ memcpy(ptr, str, n + 1); ++ ++ return ptr; ++} ++ ++char * ++strdup(const char *str) ++{ ++ return __strdup(str, KM_SLEEP); ++} ++EXPORT_SYMBOL(strdup); ++ ++void ++strfree(char *str) ++{ ++ kfree(str); ++} ++EXPORT_SYMBOL(strfree); ++ ++/* ++ * Memory allocation interfaces and debugging for basic kmem_* ++ * and vmem_* style memory allocation. When DEBUG_KMEM is enabled ++ * the SPL will keep track of the total memory allocated, and ++ * report any memory leaked when the module is unloaded. ++ */ ++#ifdef DEBUG_KMEM ++ ++/* Shim layer memory accounting */ ++# ifdef HAVE_ATOMIC64_T ++atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); ++unsigned long long kmem_alloc_max = 0; ++atomic64_t vmem_alloc_used = ATOMIC64_INIT(0); ++unsigned long long vmem_alloc_max = 0; ++# else /* HAVE_ATOMIC64_T */ ++atomic_t kmem_alloc_used = ATOMIC_INIT(0); ++unsigned long long kmem_alloc_max = 0; ++atomic_t vmem_alloc_used = ATOMIC_INIT(0); ++unsigned long long vmem_alloc_max = 0; ++# endif /* HAVE_ATOMIC64_T */ ++ ++EXPORT_SYMBOL(kmem_alloc_used); ++EXPORT_SYMBOL(kmem_alloc_max); ++EXPORT_SYMBOL(vmem_alloc_used); ++EXPORT_SYMBOL(vmem_alloc_max); ++ ++/* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked ++ * but also the location of every alloc and free. 
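The tracking scheme sketched in the comment above (a per-allocation debug record holding address, size, function, and line so the matching free can find it, with everything still outstanding reported as a leak) can be illustrated with a small user-space allocator wrapper. A single linked list stands in for the kmem_table hash buckets and kmem_list; all names below are hypothetical and not part of this patch.

#include <stdio.h>
#include <stdlib.h>

struct tracked {
    struct tracked *next;
    void *addr;
    size_t size;
    const char *func;
    int line;
};

static struct tracked *track_list;
static size_t bytes_used;

#define t_malloc(sz) track_malloc((sz), __func__, __LINE__)

static void *track_malloc(size_t size, const char *func, int line)
{
    struct tracked *t = malloc(sizeof(*t));
    void *p = malloc(size);

    if (t == NULL || p == NULL) {
        free(t);
        free(p);
        return NULL;
    }
    /* Record who allocated what, keyed by the returned address. */
    t->addr = p;
    t->size = size;
    t->func = func;
    t->line = line;
    t->next = track_list;
    track_list = t;
    bytes_used += size;
    return p;
}

static void track_free(void *p)
{
    struct tracked **tp;

    for (tp = &track_list; *tp != NULL; tp = &(*tp)->next) {
        if ((*tp)->addr == p) {
            struct tracked *t = *tp;

            *tp = t->next;          /* drop the record matching this free */
            bytes_used -= t->size;
            free(t);
            free(p);
            return;
        }
    }
    fprintf(stderr, "free of untracked pointer %p\n", p);
}

static void track_report(void)
{
    struct tracked *t;

    for (t = track_list; t != NULL; t = t->next)
        fprintf(stderr, "leak: %zu bytes from %s:%d\n",
                t->size, t->func, t->line);
    fprintf(stderr, "%zu bytes still allocated\n", bytes_used);
}

int main(void)
{
    void *a = t_malloc(32);
    void *b = t_malloc(64);   /* never freed, reported by track_report() */

    track_free(a);
    track_report();
    (void)b;
    return 0;
}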
When the SPL module is ++ * unloaded a list of all leaked addresses and where they were allocated ++ * will be dumped to the console. Enabling this feature has a significant ++ * impact on performance but it makes finding memory leaks straight forward. ++ * ++ * Not surprisingly with debugging enabled the xmem_locks are very highly ++ * contended particularly on xfree(). If we want to run with this detailed ++ * debugging enabled for anything other than debugging we need to minimize ++ * the contention by moving to a lock per xmem_table entry model. ++ */ ++# ifdef DEBUG_KMEM_TRACKING ++ ++# define KMEM_HASH_BITS 10 ++# define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) ++ ++# define VMEM_HASH_BITS 10 ++# define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS) ++ ++typedef struct kmem_debug { ++ struct hlist_node kd_hlist; /* Hash node linkage */ ++ struct list_head kd_list; /* List of all allocations */ ++ void *kd_addr; /* Allocation pointer */ ++ size_t kd_size; /* Allocation size */ ++ const char *kd_func; /* Allocation function */ ++ int kd_line; /* Allocation line */ ++} kmem_debug_t; ++ ++spinlock_t kmem_lock; ++struct hlist_head kmem_table[KMEM_TABLE_SIZE]; ++struct list_head kmem_list; ++ ++spinlock_t vmem_lock; ++struct hlist_head vmem_table[VMEM_TABLE_SIZE]; ++struct list_head vmem_list; ++ ++EXPORT_SYMBOL(kmem_lock); ++EXPORT_SYMBOL(kmem_table); ++EXPORT_SYMBOL(kmem_list); ++ ++EXPORT_SYMBOL(vmem_lock); ++EXPORT_SYMBOL(vmem_table); ++EXPORT_SYMBOL(vmem_list); ++ ++static kmem_debug_t * ++kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr) ++{ ++ struct hlist_head *head; ++ struct hlist_node *node; ++ struct kmem_debug *p; ++ unsigned long flags; ++ SENTRY; ++ ++ spin_lock_irqsave(lock, flags); ++ ++ head = &table[hash_ptr(addr, bits)]; ++ hlist_for_each_entry_rcu(p, node, head, kd_hlist) { ++ if (p->kd_addr == addr) { ++ hlist_del_init(&p->kd_hlist); ++ list_del_init(&p->kd_list); ++ spin_unlock_irqrestore(lock, flags); ++ return p; ++ } ++ } ++ ++ spin_unlock_irqrestore(lock, flags); ++ ++ SRETURN(NULL); ++} ++ ++void * ++kmem_alloc_track(size_t size, int flags, const char *func, int line, ++ int node_alloc, int node) ++{ ++ void *ptr = NULL; ++ kmem_debug_t *dptr; ++ unsigned long irq_flags; ++ SENTRY; ++ ++ /* Function may be called with KM_NOSLEEP so failure is possible */ ++ dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t), ++ flags & ~__GFP_ZERO); ++ ++ if (unlikely(dptr == NULL)) { ++ SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug " ++ "kmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n", ++ sizeof(kmem_debug_t), flags, func, line, ++ kmem_alloc_used_read(), kmem_alloc_max); ++ } else { ++ /* ++ * Marked unlikely because we should never be doing this, ++ * we tolerate to up 2 pages but a single page is best. ++ */ ++ if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) { ++ SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "large " ++ "kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n", ++ (unsigned long long) size, flags, func, line, ++ kmem_alloc_used_read(), kmem_alloc_max); ++ spl_debug_dumpstack(NULL); ++ } ++ ++ /* ++ * We use __strdup() below because the string pointed to by ++ * __FUNCTION__ might not be available by the time we want ++ * to print it since the module might have been unloaded. ++ * This can only fail in the KM_NOSLEEP case. 
++ */ ++ dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO); ++ if (unlikely(dptr->kd_func == NULL)) { ++ kfree(dptr); ++ SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, ++ "debug __strdup() at %s:%d failed (%lld/%llu)\n", ++ func, line, kmem_alloc_used_read(), kmem_alloc_max); ++ goto out; ++ } ++ ++ /* Use the correct allocator */ ++ if (node_alloc) { ++ ASSERT(!(flags & __GFP_ZERO)); ++ ptr = kmalloc_node_nofail(size, flags, node); ++ } else if (flags & __GFP_ZERO) { ++ ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO); ++ } else { ++ ptr = kmalloc_nofail(size, flags); ++ } ++ ++ if (unlikely(ptr == NULL)) { ++ kfree(dptr->kd_func); ++ kfree(dptr); ++ SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "kmem_alloc" ++ "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", ++ (unsigned long long) size, flags, func, line, ++ kmem_alloc_used_read(), kmem_alloc_max); ++ goto out; ++ } ++ ++ kmem_alloc_used_add(size); ++ if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) ++ kmem_alloc_max = kmem_alloc_used_read(); ++ ++ INIT_HLIST_NODE(&dptr->kd_hlist); ++ INIT_LIST_HEAD(&dptr->kd_list); ++ ++ dptr->kd_addr = ptr; ++ dptr->kd_size = size; ++ dptr->kd_line = line; ++ ++ spin_lock_irqsave(&kmem_lock, irq_flags); ++ hlist_add_head_rcu(&dptr->kd_hlist, ++ &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); ++ list_add_tail(&dptr->kd_list, &kmem_list); ++ spin_unlock_irqrestore(&kmem_lock, irq_flags); ++ ++ SDEBUG_LIMIT(SD_INFO, ++ "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n", ++ (unsigned long long) size, flags, func, line, ptr, ++ kmem_alloc_used_read(), kmem_alloc_max); ++ } ++out: ++ SRETURN(ptr); ++} ++EXPORT_SYMBOL(kmem_alloc_track); ++ ++void ++kmem_free_track(const void *ptr, size_t size) ++{ ++ kmem_debug_t *dptr; ++ SENTRY; ++ ++ ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, ++ (unsigned long long) size); ++ ++ dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr); ++ ++ /* Must exist in hash due to kmem_alloc() */ ++ ASSERT(dptr); ++ ++ /* Size must match */ ++ ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " ++ "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size, ++ (unsigned long long) size, dptr->kd_func, dptr->kd_line); ++ ++ kmem_alloc_used_sub(size); ++ SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr, ++ (unsigned long long) size, kmem_alloc_used_read(), ++ kmem_alloc_max); ++ ++ kfree(dptr->kd_func); ++ ++ memset(dptr, 0x5a, sizeof(kmem_debug_t)); ++ kfree(dptr); ++ ++ memset(ptr, 0x5a, size); ++ kfree(ptr); ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(kmem_free_track); ++ ++void * ++vmem_alloc_track(size_t size, int flags, const char *func, int line) ++{ ++ void *ptr = NULL; ++ kmem_debug_t *dptr; ++ unsigned long irq_flags; ++ SENTRY; ++ ++ ASSERT(flags & KM_SLEEP); ++ ++ /* Function may be called with KM_NOSLEEP so failure is possible */ ++ dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t), ++ flags & ~__GFP_ZERO); ++ if (unlikely(dptr == NULL)) { ++ SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "debug " ++ "vmem_alloc(%ld, 0x%x) at %s:%d failed (%lld/%llu)\n", ++ sizeof(kmem_debug_t), flags, func, line, ++ vmem_alloc_used_read(), vmem_alloc_max); ++ } else { ++ /* ++ * We use __strdup() below because the string pointed to by ++ * __FUNCTION__ might not be available by the time we want ++ * to print it, since the module might have been unloaded. ++ * This can never fail because we have already asserted ++ * that flags is KM_SLEEP. 
++ */ ++ dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO); ++ if (unlikely(dptr->kd_func == NULL)) { ++ kfree(dptr); ++ SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, ++ "debug __strdup() at %s:%d failed (%lld/%llu)\n", ++ func, line, vmem_alloc_used_read(), vmem_alloc_max); ++ goto out; ++ } ++ ++ /* Use the correct allocator */ ++ if (flags & __GFP_ZERO) { ++ ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO); ++ } else { ++ ptr = vmalloc_nofail(size, flags); ++ } ++ ++ if (unlikely(ptr == NULL)) { ++ kfree(dptr->kd_func); ++ kfree(dptr); ++ SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, "vmem_alloc" ++ "(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", ++ (unsigned long long) size, flags, func, line, ++ vmem_alloc_used_read(), vmem_alloc_max); ++ goto out; ++ } ++ ++ vmem_alloc_used_add(size); ++ if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) ++ vmem_alloc_max = vmem_alloc_used_read(); ++ ++ INIT_HLIST_NODE(&dptr->kd_hlist); ++ INIT_LIST_HEAD(&dptr->kd_list); ++ ++ dptr->kd_addr = ptr; ++ dptr->kd_size = size; ++ dptr->kd_line = line; ++ ++ spin_lock_irqsave(&vmem_lock, irq_flags); ++ hlist_add_head_rcu(&dptr->kd_hlist, ++ &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]); ++ list_add_tail(&dptr->kd_list, &vmem_list); ++ spin_unlock_irqrestore(&vmem_lock, irq_flags); ++ ++ SDEBUG_LIMIT(SD_INFO, ++ "vmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n", ++ (unsigned long long) size, flags, func, line, ++ ptr, vmem_alloc_used_read(), vmem_alloc_max); ++ } ++out: ++ SRETURN(ptr); ++} ++EXPORT_SYMBOL(vmem_alloc_track); ++ ++void ++vmem_free_track(const void *ptr, size_t size) ++{ ++ kmem_debug_t *dptr; ++ SENTRY; ++ ++ ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, ++ (unsigned long long) size); ++ ++ dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr); ++ ++ /* Must exist in hash due to vmem_alloc() */ ++ ASSERT(dptr); ++ ++ /* Size must match */ ++ ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " ++ "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size, ++ (unsigned long long) size, dptr->kd_func, dptr->kd_line); ++ ++ vmem_alloc_used_sub(size); ++ SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr, ++ (unsigned long long) size, vmem_alloc_used_read(), ++ vmem_alloc_max); ++ ++ kfree(dptr->kd_func); ++ ++ memset(dptr, 0x5a, sizeof(kmem_debug_t)); ++ kfree(dptr); ++ ++ memset(ptr, 0x5a, size); ++ vfree(ptr); ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(vmem_free_track); ++ ++# else /* DEBUG_KMEM_TRACKING */ ++ ++void * ++kmem_alloc_debug(size_t size, int flags, const char *func, int line, ++ int node_alloc, int node) ++{ ++ void *ptr; ++ SENTRY; ++ ++ /* ++ * Marked unlikely because we should never be doing this, ++ * we tolerate to up 2 pages but a single page is best. 
++ */ ++ if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) { ++ SDEBUG(SD_CONSOLE | SD_WARNING, ++ "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n", ++ (unsigned long long) size, flags, func, line, ++ kmem_alloc_used_read(), kmem_alloc_max); ++ dump_stack(); ++ } ++ ++ /* Use the correct allocator */ ++ if (node_alloc) { ++ ASSERT(!(flags & __GFP_ZERO)); ++ ptr = kmalloc_node_nofail(size, flags, node); ++ } else if (flags & __GFP_ZERO) { ++ ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO)); ++ } else { ++ ptr = kmalloc_nofail(size, flags); ++ } ++ ++ if (unlikely(ptr == NULL)) { ++ SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, ++ "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", ++ (unsigned long long) size, flags, func, line, ++ kmem_alloc_used_read(), kmem_alloc_max); ++ } else { ++ kmem_alloc_used_add(size); ++ if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) ++ kmem_alloc_max = kmem_alloc_used_read(); ++ ++ SDEBUG_LIMIT(SD_INFO, ++ "kmem_alloc(%llu, 0x%x) at %s:%d = %p (%lld/%llu)\n", ++ (unsigned long long) size, flags, func, line, ptr, ++ kmem_alloc_used_read(), kmem_alloc_max); ++ } ++ ++ SRETURN(ptr); ++} ++EXPORT_SYMBOL(kmem_alloc_debug); ++ ++void ++kmem_free_debug(const void *ptr, size_t size) ++{ ++ SENTRY; ++ ++ ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, ++ (unsigned long long) size); ++ ++ kmem_alloc_used_sub(size); ++ SDEBUG_LIMIT(SD_INFO, "kmem_free(%p, %llu) (%lld/%llu)\n", ptr, ++ (unsigned long long) size, kmem_alloc_used_read(), ++ kmem_alloc_max); ++ kfree(ptr); ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(kmem_free_debug); ++ ++void * ++vmem_alloc_debug(size_t size, int flags, const char *func, int line) ++{ ++ void *ptr; ++ SENTRY; ++ ++ ASSERT(flags & KM_SLEEP); ++ ++ /* Use the correct allocator */ ++ if (flags & __GFP_ZERO) { ++ ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO)); ++ } else { ++ ptr = vmalloc_nofail(size, flags); ++ } ++ ++ if (unlikely(ptr == NULL)) { ++ SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, ++ "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", ++ (unsigned long long) size, flags, func, line, ++ vmem_alloc_used_read(), vmem_alloc_max); ++ } else { ++ vmem_alloc_used_add(size); ++ if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) ++ vmem_alloc_max = vmem_alloc_used_read(); ++ ++ SDEBUG_LIMIT(SD_INFO, "vmem_alloc(%llu, 0x%x) = %p " ++ "(%lld/%llu)\n", (unsigned long long) size, flags, ptr, ++ vmem_alloc_used_read(), vmem_alloc_max); ++ } ++ ++ SRETURN(ptr); ++} ++EXPORT_SYMBOL(vmem_alloc_debug); ++ ++void ++vmem_free_debug(const void *ptr, size_t size) ++{ ++ SENTRY; ++ ++ ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, ++ (unsigned long long) size); ++ ++ vmem_alloc_used_sub(size); ++ SDEBUG_LIMIT(SD_INFO, "vmem_free(%p, %llu) (%lld/%llu)\n", ptr, ++ (unsigned long long) size, vmem_alloc_used_read(), ++ vmem_alloc_max); ++ vfree(ptr); ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(vmem_free_debug); ++ ++# endif /* DEBUG_KMEM_TRACKING */ ++#endif /* DEBUG_KMEM */ ++ ++/* ++ * Slab allocation interfaces ++ * ++ * While the Linux slab implementation was inspired by the Solaris ++ * implementation I cannot use it to emulate the Solaris APIs. I ++ * require two features which are not provided by the Linux slab. ++ * ++ * 1) Constructors AND destructors. Recent versions of the Linux ++ * kernel have removed support for destructors. This is a deal ++ * breaker for the SPL which contains particularly expensive ++ * initializers for mutex's, condition variables, etc. 
We also ++ * require a minimal level of cleanup for these data types unlike ++ * many Linux data type which do need to be explicitly destroyed. ++ * ++ * 2) Virtual address space backed slab. Callers of the Solaris slab ++ * expect it to work well for both small are very large allocations. ++ * Because of memory fragmentation the Linux slab which is backed ++ * by kmalloc'ed memory performs very badly when confronted with ++ * large numbers of large allocations. Basing the slab on the ++ * virtual address space removes the need for contiguous pages ++ * and greatly improve performance for large allocations. ++ * ++ * For these reasons, the SPL has its own slab implementation with ++ * the needed features. It is not as highly optimized as either the ++ * Solaris or Linux slabs, but it should get me most of what is ++ * needed until it can be optimized or obsoleted by another approach. ++ * ++ * One serious concern I do have about this method is the relatively ++ * small virtual address space on 32bit arches. This will seriously ++ * constrain the size of the slab caches and their performance. ++ * ++ * XXX: Improve the partial slab list by carefully maintaining a ++ * strict ordering of fullest to emptiest slabs based on ++ * the slab reference count. This guarantees the when freeing ++ * slabs back to the system we need only linearly traverse the ++ * last N slabs in the list to discover all the freeable slabs. ++ * ++ * XXX: NUMA awareness for optionally allocating memory close to a ++ * particular core. This can be advantageous if you know the slab ++ * object will be short lived and primarily accessed from one core. ++ * ++ * XXX: Slab coloring may also yield performance improvements and would ++ * be desirable to implement. ++ */ ++ ++struct list_head spl_kmem_cache_list; /* List of caches */ ++struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ ++ ++static int spl_cache_flush(spl_kmem_cache_t *skc, ++ spl_kmem_magazine_t *skm, int flush); ++ ++SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker); ++SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, ++ spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS); ++ ++static void * ++kv_alloc(spl_kmem_cache_t *skc, int size, int flags) ++{ ++ void *ptr; ++ ++ ASSERT(ISP2(size)); ++ ++ if (skc->skc_flags & KMC_KMEM) ++ ptr = (void *)__get_free_pages(flags, get_order(size)); ++ else ++ ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); ++ ++ /* Resulting allocated memory will be page aligned */ ++ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); ++ ++ return ptr; ++} ++ ++static void ++kv_free(spl_kmem_cache_t *skc, void *ptr, int size) ++{ ++ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); ++ ASSERT(ISP2(size)); ++ ++ /* ++ * The Linux direct reclaim path uses this out of band value to ++ * determine if forward progress is being made. Normally this is ++ * incremented by kmem_freepages() which is part of the various ++ * Linux slab implementations. However, since we are using none ++ * of that infrastructure we are responsible for incrementing it. ++ */ ++ if (current->reclaim_state) ++ current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; ++ ++ if (skc->skc_flags & KMC_KMEM) ++ free_pages((unsigned long)ptr, get_order(size)); ++ else ++ vfree(ptr); ++} ++ ++/* ++ * Required space for each aligned sks. ++ */ ++static inline uint32_t ++spl_sks_size(spl_kmem_cache_t *skc) ++{ ++ return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t), ++ skc->skc_obj_align, uint32_t); ++} ++ ++/* ++ * Required space for each aligned object. 
++ */ ++static inline uint32_t ++spl_obj_size(spl_kmem_cache_t *skc) ++{ ++ uint32_t align = skc->skc_obj_align; ++ ++ return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + ++ P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t); ++} ++ ++/* ++ * Lookup the spl_kmem_object_t for an object given that object. ++ */ ++static inline spl_kmem_obj_t * ++spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) ++{ ++ return obj + P2ROUNDUP_TYPED(skc->skc_obj_size, ++ skc->skc_obj_align, uint32_t); ++} ++ ++/* ++ * Required space for each offslab object taking in to account alignment ++ * restrictions and the power-of-two requirement of kv_alloc(). ++ */ ++static inline uint32_t ++spl_offslab_size(spl_kmem_cache_t *skc) ++{ ++ return 1UL << (highbit(spl_obj_size(skc)) + 1); ++} ++ ++/* ++ * It's important that we pack the spl_kmem_obj_t structure and the ++ * actual objects in to one large address space to minimize the number ++ * of calls to the allocator. It is far better to do a few large ++ * allocations and then subdivide it ourselves. Now which allocator ++ * we use requires balancing a few trade offs. ++ * ++ * For small objects we use kmem_alloc() because as long as you are ++ * only requesting a small number of pages (ideally just one) its cheap. ++ * However, when you start requesting multiple pages with kmem_alloc() ++ * it gets increasingly expensive since it requires contiguous pages. ++ * For this reason we shift to vmem_alloc() for slabs of large objects ++ * which removes the need for contiguous pages. We do not use ++ * vmem_alloc() in all cases because there is significant locking ++ * overhead in __get_vm_area_node(). This function takes a single ++ * global lock when acquiring an available virtual address range which ++ * serializes all vmem_alloc()'s for all slab caches. Using slightly ++ * different allocation functions for small and large objects should ++ * give us the best of both worlds. ++ * ++ * KMC_ONSLAB KMC_OFFSLAB ++ * ++ * +------------------------+ +-----------------+ ++ * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+ ++ * | skc_obj_size <-+ | | +-----------------+ | | ++ * | spl_kmem_obj_t | | | | ++ * | skc_obj_size <---+ | +-----------------+ | | ++ * | spl_kmem_obj_t | | | skc_obj_size | <-+ | ++ * | ... 
v | | spl_kmem_obj_t | | ++ * +------------------------+ +-----------------+ v ++ */ ++static spl_kmem_slab_t * ++spl_slab_alloc(spl_kmem_cache_t *skc, int flags) ++{ ++ spl_kmem_slab_t *sks; ++ spl_kmem_obj_t *sko, *n; ++ void *base, *obj; ++ uint32_t obj_size, offslab_size = 0; ++ int i, rc = 0; ++ ++ base = kv_alloc(skc, skc->skc_slab_size, flags); ++ if (base == NULL) ++ SRETURN(NULL); ++ ++ sks = (spl_kmem_slab_t *)base; ++ sks->sks_magic = SKS_MAGIC; ++ sks->sks_objs = skc->skc_slab_objs; ++ sks->sks_age = jiffies; ++ sks->sks_cache = skc; ++ INIT_LIST_HEAD(&sks->sks_list); ++ INIT_LIST_HEAD(&sks->sks_free_list); ++ sks->sks_ref = 0; ++ obj_size = spl_obj_size(skc); ++ ++ if (skc->skc_flags & KMC_OFFSLAB) ++ offslab_size = spl_offslab_size(skc); ++ ++ for (i = 0; i < sks->sks_objs; i++) { ++ if (skc->skc_flags & KMC_OFFSLAB) { ++ obj = kv_alloc(skc, offslab_size, flags); ++ if (!obj) ++ SGOTO(out, rc = -ENOMEM); ++ } else { ++ obj = base + spl_sks_size(skc) + (i * obj_size); ++ } ++ ++ ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); ++ sko = spl_sko_from_obj(skc, obj); ++ sko->sko_addr = obj; ++ sko->sko_magic = SKO_MAGIC; ++ sko->sko_slab = sks; ++ INIT_LIST_HEAD(&sko->sko_list); ++ list_add_tail(&sko->sko_list, &sks->sks_free_list); ++ } ++ ++ list_for_each_entry(sko, &sks->sks_free_list, sko_list) ++ if (skc->skc_ctor) ++ skc->skc_ctor(sko->sko_addr, skc->skc_private, flags); ++out: ++ if (rc) { ++ if (skc->skc_flags & KMC_OFFSLAB) ++ list_for_each_entry_safe(sko, n, &sks->sks_free_list, ++ sko_list) ++ kv_free(skc, sko->sko_addr, offslab_size); ++ ++ kv_free(skc, base, skc->skc_slab_size); ++ sks = NULL; ++ } ++ ++ SRETURN(sks); ++} ++ ++/* ++ * Remove a slab from complete or partial list, it must be called with ++ * the 'skc->skc_lock' held but the actual free must be performed ++ * outside the lock to prevent deadlocking on vmem addresses. ++ */ ++static void ++spl_slab_free(spl_kmem_slab_t *sks, ++ struct list_head *sks_list, struct list_head *sko_list) ++{ ++ spl_kmem_cache_t *skc; ++ SENTRY; ++ ++ ASSERT(sks->sks_magic == SKS_MAGIC); ++ ASSERT(sks->sks_ref == 0); ++ ++ skc = sks->sks_cache; ++ ASSERT(skc->skc_magic == SKC_MAGIC); ++ ASSERT(spin_is_locked(&skc->skc_lock)); ++ ++ /* ++ * Update slab/objects counters in the cache, then remove the ++ * slab from the skc->skc_partial_list. Finally add the slab ++ * and all its objects in to the private work lists where the ++ * destructors will be called and the memory freed to the system. ++ */ ++ skc->skc_obj_total -= sks->sks_objs; ++ skc->skc_slab_total--; ++ list_del(&sks->sks_list); ++ list_add(&sks->sks_list, sks_list); ++ list_splice_init(&sks->sks_free_list, sko_list); ++ ++ SEXIT; ++} ++ ++/* ++ * Traverses all the partial slabs attached to a cache and free those ++ * which which are currently empty, and have not been touched for ++ * skc_delay seconds to avoid thrashing. The count argument is ++ * passed to optionally cap the number of slabs reclaimed, a count ++ * of zero means try and reclaim everything. When flag is set we ++ * always free an available slab regardless of age. ++ */ ++static void ++spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag) ++{ ++ spl_kmem_slab_t *sks, *m; ++ spl_kmem_obj_t *sko, *n; ++ LIST_HEAD(sks_list); ++ LIST_HEAD(sko_list); ++ uint32_t size = 0; ++ int i = 0; ++ SENTRY; ++ ++ /* ++ * Move empty slabs and objects which have not been touched in ++ * skc_delay seconds on to private lists to be freed outside ++ * the spin lock. 
This delay time is important to avoid thrashing ++ * however when flag is set the delay will not be used. ++ */ ++ spin_lock(&skc->skc_lock); ++ list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){ ++ /* ++ * All empty slabs are at the end of skc->skc_partial_list, ++ * therefore once a non-empty slab is found we can stop ++ * scanning. Additionally, stop when reaching the target ++ * reclaim 'count' if a non-zero threshold is given. ++ */ ++ if ((sks->sks_ref > 0) || (count && i >= count)) ++ break; ++ ++ if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) { ++ spl_slab_free(sks, &sks_list, &sko_list); ++ i++; ++ } ++ } ++ spin_unlock(&skc->skc_lock); ++ ++ /* ++ * The following two loops ensure all the object destructors are ++ * run, any offslab objects are freed, and the slabs themselves ++ * are freed. This is all done outside the skc->skc_lock since ++ * this allows the destructor to sleep, and allows us to perform ++ * a conditional reschedule when a freeing a large number of ++ * objects and slabs back to the system. ++ */ ++ if (skc->skc_flags & KMC_OFFSLAB) ++ size = spl_offslab_size(skc); ++ ++ list_for_each_entry_safe(sko, n, &sko_list, sko_list) { ++ ASSERT(sko->sko_magic == SKO_MAGIC); ++ ++ if (skc->skc_dtor) ++ skc->skc_dtor(sko->sko_addr, skc->skc_private); ++ ++ if (skc->skc_flags & KMC_OFFSLAB) ++ kv_free(skc, sko->sko_addr, size); ++ ++ cond_resched(); ++ } ++ ++ list_for_each_entry_safe(sks, m, &sks_list, sks_list) { ++ ASSERT(sks->sks_magic == SKS_MAGIC); ++ kv_free(skc, sks, skc->skc_slab_size); ++ cond_resched(); ++ } ++ ++ SEXIT; ++} ++ ++static spl_kmem_emergency_t * ++spl_emergency_search(struct rb_root *root, void *obj) ++{ ++ struct rb_node *node = root->rb_node; ++ spl_kmem_emergency_t *ske; ++ unsigned long address = (unsigned long)obj; ++ ++ while (node) { ++ ske = container_of(node, spl_kmem_emergency_t, ske_node); ++ ++ if (address < (unsigned long)ske->ske_obj) ++ node = node->rb_left; ++ else if (address > (unsigned long)ske->ske_obj) ++ node = node->rb_right; ++ else ++ return ske; ++ } ++ ++ return NULL; ++} ++ ++static int ++spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) ++{ ++ struct rb_node **new = &(root->rb_node), *parent = NULL; ++ spl_kmem_emergency_t *ske_tmp; ++ unsigned long address = (unsigned long)ske->ske_obj; ++ ++ while (*new) { ++ ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); ++ ++ parent = *new; ++ if (address < (unsigned long)ske_tmp->ske_obj) ++ new = &((*new)->rb_left); ++ else if (address > (unsigned long)ske_tmp->ske_obj) ++ new = &((*new)->rb_right); ++ else ++ return 0; ++ } ++ ++ rb_link_node(&ske->ske_node, parent, new); ++ rb_insert_color(&ske->ske_node, root); ++ ++ return 1; ++} ++ ++/* ++ * Allocate a single emergency object and track it in a red black tree. 
++ */ ++static int ++spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) ++{ ++ spl_kmem_emergency_t *ske; ++ int empty; ++ SENTRY; ++ ++ /* Last chance use a partial slab if one now exists */ ++ spin_lock(&skc->skc_lock); ++ empty = list_empty(&skc->skc_partial_list); ++ spin_unlock(&skc->skc_lock); ++ if (!empty) ++ SRETURN(-EEXIST); ++ ++ ske = kmalloc(sizeof(*ske), flags); ++ if (ske == NULL) ++ SRETURN(-ENOMEM); ++ ++ ske->ske_obj = kmalloc(skc->skc_obj_size, flags); ++ if (ske->ske_obj == NULL) { ++ kfree(ske); ++ SRETURN(-ENOMEM); ++ } ++ ++ spin_lock(&skc->skc_lock); ++ empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); ++ if (likely(empty)) { ++ skc->skc_obj_total++; ++ skc->skc_obj_emergency++; ++ if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) ++ skc->skc_obj_emergency_max = skc->skc_obj_emergency; ++ } ++ spin_unlock(&skc->skc_lock); ++ ++ if (unlikely(!empty)) { ++ kfree(ske->ske_obj); ++ kfree(ske); ++ SRETURN(-EINVAL); ++ } ++ ++ if (skc->skc_ctor) ++ skc->skc_ctor(ske->ske_obj, skc->skc_private, flags); ++ ++ *obj = ske->ske_obj; ++ ++ SRETURN(0); ++} ++ ++/* ++ * Locate the passed object in the red black tree and free it. ++ */ ++static int ++spl_emergency_free(spl_kmem_cache_t *skc, void *obj) ++{ ++ spl_kmem_emergency_t *ske; ++ SENTRY; ++ ++ spin_lock(&skc->skc_lock); ++ ske = spl_emergency_search(&skc->skc_emergency_tree, obj); ++ if (likely(ske)) { ++ rb_erase(&ske->ske_node, &skc->skc_emergency_tree); ++ skc->skc_obj_emergency--; ++ skc->skc_obj_total--; ++ } ++ spin_unlock(&skc->skc_lock); ++ ++ if (unlikely(ske == NULL)) ++ SRETURN(-ENOENT); ++ ++ if (skc->skc_dtor) ++ skc->skc_dtor(ske->ske_obj, skc->skc_private); ++ ++ kfree(ske->ske_obj); ++ kfree(ske); ++ ++ SRETURN(0); ++} ++ ++/* ++ * Called regularly on all caches to age objects out of the magazines ++ * which have not been access in skc->skc_delay seconds. This prevents ++ * idle magazines from holding memory which might be better used by ++ * other caches or parts of the system. The delay is present to ++ * prevent thrashing the magazine. ++ */ ++static void ++spl_magazine_age(void *data) ++{ ++ spl_kmem_magazine_t *skm = ++ spl_get_work_data(data, spl_kmem_magazine_t, skm_work.work); ++ spl_kmem_cache_t *skc = skm->skm_cache; ++ ++ ASSERT(skm->skm_magic == SKM_MAGIC); ++ ASSERT(skc->skc_magic == SKC_MAGIC); ++ ASSERT(skc->skc_mag[skm->skm_cpu] == skm); ++ ++ if (skm->skm_avail > 0 && ++ time_after(jiffies, skm->skm_age + skc->skc_delay * HZ)) ++ (void)spl_cache_flush(skc, skm, skm->skm_refill); ++ ++ if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)) ++ schedule_delayed_work_on(skm->skm_cpu, &skm->skm_work, ++ skc->skc_delay / 3 * HZ); ++} ++ ++/* ++ * Called regularly to keep a downward pressure on the size of idle ++ * magazines and to release free slabs from the cache. This function ++ * never calls the registered reclaim function, that only occurs ++ * under memory pressure or with a direct call to spl_kmem_reap(). ++ */ ++static void ++spl_cache_age(void *data) ++{ ++ spl_kmem_cache_t *skc = ++ spl_get_work_data(data, spl_kmem_cache_t, skc_work.work); ++ ++ ASSERT(skc->skc_magic == SKC_MAGIC); ++ spl_slab_reclaim(skc, skc->skc_reap, 0); ++ ++ if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)) ++ schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ); ++} ++ ++/* ++ * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. ++ * When on-slab we want to target SPL_KMEM_CACHE_OBJ_PER_SLAB. 
However, ++ * for very small objects we may end up with more than this so as not ++ * to waste space in the minimal allocation of a single page. Also for ++ * very large objects we may use as few as SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN, ++ * lower than this and we will fail. ++ */ ++static int ++spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) ++{ ++ uint32_t sks_size, obj_size, max_size; ++ ++ if (skc->skc_flags & KMC_OFFSLAB) { ++ *objs = SPL_KMEM_CACHE_OBJ_PER_SLAB; ++ *size = sizeof(spl_kmem_slab_t); ++ } else { ++ sks_size = spl_sks_size(skc); ++ obj_size = spl_obj_size(skc); ++ ++ if (skc->skc_flags & KMC_KMEM) ++ max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE; ++ else ++ max_size = (32 * 1024 * 1024); ++ ++ /* Power of two sized slab */ ++ for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) { ++ *objs = (*size - sks_size) / obj_size; ++ if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB) ++ SRETURN(0); ++ } ++ ++ /* ++ * Unable to satisfy target objects per slab, fall back to ++ * allocating a maximally sized slab and assuming it can ++ * contain the minimum objects count use it. If not fail. ++ */ ++ *size = max_size; ++ *objs = (*size - sks_size) / obj_size; ++ if (*objs >= SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN) ++ SRETURN(0); ++ } ++ ++ SRETURN(-ENOSPC); ++} ++ ++/* ++ * Make a guess at reasonable per-cpu magazine size based on the size of ++ * each object and the cost of caching N of them in each magazine. Long ++ * term this should really adapt based on an observed usage heuristic. ++ */ ++static int ++spl_magazine_size(spl_kmem_cache_t *skc) ++{ ++ uint32_t obj_size = spl_obj_size(skc); ++ int size; ++ SENTRY; ++ ++ /* Per-magazine sizes below assume a 4Kib page size */ ++ if (obj_size > (PAGE_SIZE * 256)) ++ size = 4; /* Minimum 4Mib per-magazine */ ++ else if (obj_size > (PAGE_SIZE * 32)) ++ size = 16; /* Minimum 2Mib per-magazine */ ++ else if (obj_size > (PAGE_SIZE)) ++ size = 64; /* Minimum 256Kib per-magazine */ ++ else if (obj_size > (PAGE_SIZE / 4)) ++ size = 128; /* Minimum 128Kib per-magazine */ ++ else ++ size = 256; ++ ++ SRETURN(size); ++} ++ ++/* ++ * Allocate a per-cpu magazine to associate with a specific core. ++ */ ++static spl_kmem_magazine_t * ++spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) ++{ ++ spl_kmem_magazine_t *skm; ++ int size = sizeof(spl_kmem_magazine_t) + ++ sizeof(void *) * skc->skc_mag_size; ++ SENTRY; ++ ++ skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu)); ++ if (skm) { ++ skm->skm_magic = SKM_MAGIC; ++ skm->skm_avail = 0; ++ skm->skm_size = skc->skc_mag_size; ++ skm->skm_refill = skc->skc_mag_refill; ++ skm->skm_cache = skc; ++ spl_init_delayed_work(&skm->skm_work, spl_magazine_age, skm); ++ skm->skm_age = jiffies; ++ skm->skm_cpu = cpu; ++ } ++ ++ SRETURN(skm); ++} ++ ++/* ++ * Free a per-cpu magazine associated with a specific core. ++ */ ++static void ++spl_magazine_free(spl_kmem_magazine_t *skm) ++{ ++ int size = sizeof(spl_kmem_magazine_t) + ++ sizeof(void *) * skm->skm_size; ++ ++ SENTRY; ++ ASSERT(skm->skm_magic == SKM_MAGIC); ++ ASSERT(skm->skm_avail == 0); ++ ++ kmem_free(skm, size); ++ SEXIT; ++} ++ ++/* ++ * Create all pre-cpu magazines of reasonable sizes. 
++ */ ++static int ++spl_magazine_create(spl_kmem_cache_t *skc) ++{ ++ int i; ++ SENTRY; ++ ++ skc->skc_mag_size = spl_magazine_size(skc); ++ skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; ++ ++ for_each_online_cpu(i) { ++ skc->skc_mag[i] = spl_magazine_alloc(skc, i); ++ if (!skc->skc_mag[i]) { ++ for (i--; i >= 0; i--) ++ spl_magazine_free(skc->skc_mag[i]); ++ ++ SRETURN(-ENOMEM); ++ } ++ } ++ ++ /* Only after everything is allocated schedule magazine work */ ++ for_each_online_cpu(i) ++ schedule_delayed_work_on(i, &skc->skc_mag[i]->skm_work, ++ skc->skc_delay / 3 * HZ); ++ ++ SRETURN(0); ++} ++ ++/* ++ * Destroy all pre-cpu magazines. ++ */ ++static void ++spl_magazine_destroy(spl_kmem_cache_t *skc) ++{ ++ spl_kmem_magazine_t *skm; ++ int i; ++ SENTRY; ++ ++ for_each_online_cpu(i) { ++ skm = skc->skc_mag[i]; ++ (void)spl_cache_flush(skc, skm, skm->skm_avail); ++ spl_magazine_free(skm); ++ } ++ ++ SEXIT; ++} ++ ++/* ++ * Create a object cache based on the following arguments: ++ * name cache name ++ * size cache object size ++ * align cache object alignment ++ * ctor cache object constructor ++ * dtor cache object destructor ++ * reclaim cache object reclaim ++ * priv cache private data for ctor/dtor/reclaim ++ * vmp unused must be NULL ++ * flags ++ * KMC_NOTOUCH Disable cache object aging (unsupported) ++ * KMC_NODEBUG Disable debugging (unsupported) ++ * KMC_NOMAGAZINE Disable magazine (unsupported) ++ * KMC_NOHASH Disable hashing (unsupported) ++ * KMC_QCACHE Disable qcache (unsupported) ++ * KMC_KMEM Force kmem backed cache ++ * KMC_VMEM Force vmem backed cache ++ * KMC_OFFSLAB Locate objects off the slab ++ */ ++spl_kmem_cache_t * ++spl_kmem_cache_create(char *name, size_t size, size_t align, ++ spl_kmem_ctor_t ctor, ++ spl_kmem_dtor_t dtor, ++ spl_kmem_reclaim_t reclaim, ++ void *priv, void *vmp, int flags) ++{ ++ spl_kmem_cache_t *skc; ++ int rc, kmem_flags = KM_SLEEP; ++ SENTRY; ++ ++ ASSERTF(!(flags & KMC_NOMAGAZINE), "Bad KMC_NOMAGAZINE (%x)\n", flags); ++ ASSERTF(!(flags & KMC_NOHASH), "Bad KMC_NOHASH (%x)\n", flags); ++ ASSERTF(!(flags & KMC_QCACHE), "Bad KMC_QCACHE (%x)\n", flags); ++ ASSERT(vmp == NULL); ++ ++ /* We may be called when there is a non-zero preempt_count or ++ * interrupts are disabled is which case we must not sleep. ++ */ ++ if (current_thread_info()->preempt_count || irqs_disabled()) ++ kmem_flags = KM_NOSLEEP; ++ ++ /* Allocate memory for a new cache an initialize it. Unfortunately, ++ * this usually ends up being a large allocation of ~32k because ++ * we need to allocate enough memory for the worst case number of ++ * cpus in the magazine, skc_mag[NR_CPUS]. 
Because of this we ++ * explicitly pass KM_NODEBUG to suppress the kmem warning */ ++ skc = (spl_kmem_cache_t *)kmem_zalloc(sizeof(*skc), ++ kmem_flags | KM_NODEBUG); ++ if (skc == NULL) ++ SRETURN(NULL); ++ ++ skc->skc_magic = SKC_MAGIC; ++ skc->skc_name_size = strlen(name) + 1; ++ skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, kmem_flags); ++ if (skc->skc_name == NULL) { ++ kmem_free(skc, sizeof(*skc)); ++ SRETURN(NULL); ++ } ++ strncpy(skc->skc_name, name, skc->skc_name_size); ++ ++ skc->skc_ctor = ctor; ++ skc->skc_dtor = dtor; ++ skc->skc_reclaim = reclaim; ++ skc->skc_private = priv; ++ skc->skc_vmp = vmp; ++ skc->skc_flags = flags; ++ skc->skc_obj_size = size; ++ skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; ++ skc->skc_delay = SPL_KMEM_CACHE_DELAY; ++ skc->skc_reap = SPL_KMEM_CACHE_REAP; ++ atomic_set(&skc->skc_ref, 0); ++ ++ INIT_LIST_HEAD(&skc->skc_list); ++ INIT_LIST_HEAD(&skc->skc_complete_list); ++ INIT_LIST_HEAD(&skc->skc_partial_list); ++ skc->skc_emergency_tree = RB_ROOT; ++ spin_lock_init(&skc->skc_lock); ++ init_waitqueue_head(&skc->skc_waitq); ++ skc->skc_slab_fail = 0; ++ skc->skc_slab_create = 0; ++ skc->skc_slab_destroy = 0; ++ skc->skc_slab_total = 0; ++ skc->skc_slab_alloc = 0; ++ skc->skc_slab_max = 0; ++ skc->skc_obj_total = 0; ++ skc->skc_obj_alloc = 0; ++ skc->skc_obj_max = 0; ++ skc->skc_obj_deadlock = 0; ++ skc->skc_obj_emergency = 0; ++ skc->skc_obj_emergency_max = 0; ++ ++ if (align) { ++ VERIFY(ISP2(align)); ++ VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); /* Min alignment */ ++ VERIFY3U(align, <=, PAGE_SIZE); /* Max alignment */ ++ skc->skc_obj_align = align; ++ } ++ ++ /* If none passed select a cache type based on object size */ ++ if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) { ++ if (spl_obj_size(skc) < (PAGE_SIZE / 8)) ++ skc->skc_flags |= KMC_KMEM; ++ else ++ skc->skc_flags |= KMC_VMEM; ++ } ++ ++ rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size); ++ if (rc) ++ SGOTO(out, rc); ++ ++ rc = spl_magazine_create(skc); ++ if (rc) ++ SGOTO(out, rc); ++ ++ spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc); ++ schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ); ++ ++ down_write(&spl_kmem_cache_sem); ++ list_add_tail(&skc->skc_list, &spl_kmem_cache_list); ++ up_write(&spl_kmem_cache_sem); ++ ++ SRETURN(skc); ++out: ++ kmem_free(skc->skc_name, skc->skc_name_size); ++ kmem_free(skc, sizeof(*skc)); ++ SRETURN(NULL); ++} ++EXPORT_SYMBOL(spl_kmem_cache_create); ++ ++/* ++ * Register a move callback to for cache defragmentation. ++ * XXX: Unimplemented but harmless to stub out for now. ++ */ ++void ++spl_kmem_cache_set_move(spl_kmem_cache_t *skc, ++ kmem_cbrc_t (move)(void *, void *, size_t, void *)) ++{ ++ ASSERT(move != NULL); ++} ++EXPORT_SYMBOL(spl_kmem_cache_set_move); ++ ++/* ++ * Destroy a cache and all objects associated with the cache. 
++ */ ++void ++spl_kmem_cache_destroy(spl_kmem_cache_t *skc) ++{ ++ DECLARE_WAIT_QUEUE_HEAD(wq); ++ int i; ++ SENTRY; ++ ++ ASSERT(skc->skc_magic == SKC_MAGIC); ++ ++ down_write(&spl_kmem_cache_sem); ++ list_del_init(&skc->skc_list); ++ up_write(&spl_kmem_cache_sem); ++ ++ /* Cancel any and wait for any pending delayed work */ ++ VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); ++ cancel_delayed_work_sync(&skc->skc_work); ++ for_each_online_cpu(i) ++ cancel_delayed_work_sync(&skc->skc_mag[i]->skm_work); ++ ++ flush_scheduled_work(); ++ ++ /* Wait until all current callers complete, this is mainly ++ * to catch the case where a low memory situation triggers a ++ * cache reaping action which races with this destroy. */ ++ wait_event(wq, atomic_read(&skc->skc_ref) == 0); ++ ++ spl_magazine_destroy(skc); ++ spl_slab_reclaim(skc, 0, 1); ++ spin_lock(&skc->skc_lock); ++ ++ /* Validate there are no objects in use and free all the ++ * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */ ++ ASSERT3U(skc->skc_slab_alloc, ==, 0); ++ ASSERT3U(skc->skc_obj_alloc, ==, 0); ++ ASSERT3U(skc->skc_slab_total, ==, 0); ++ ASSERT3U(skc->skc_obj_total, ==, 0); ++ ASSERT3U(skc->skc_obj_emergency, ==, 0); ++ ASSERT(list_empty(&skc->skc_complete_list)); ++ ++ kmem_free(skc->skc_name, skc->skc_name_size); ++ spin_unlock(&skc->skc_lock); ++ ++ kmem_free(skc, sizeof(*skc)); ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(spl_kmem_cache_destroy); ++ ++/* ++ * Allocate an object from a slab attached to the cache. This is used to ++ * repopulate the per-cpu magazine caches in batches when they run low. ++ */ ++static void * ++spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) ++{ ++ spl_kmem_obj_t *sko; ++ ++ ASSERT(skc->skc_magic == SKC_MAGIC); ++ ASSERT(sks->sks_magic == SKS_MAGIC); ++ ASSERT(spin_is_locked(&skc->skc_lock)); ++ ++ sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); ++ ASSERT(sko->sko_magic == SKO_MAGIC); ++ ASSERT(sko->sko_addr != NULL); ++ ++ /* Remove from sks_free_list */ ++ list_del_init(&sko->sko_list); ++ ++ sks->sks_age = jiffies; ++ sks->sks_ref++; ++ skc->skc_obj_alloc++; ++ ++ /* Track max obj usage statistics */ ++ if (skc->skc_obj_alloc > skc->skc_obj_max) ++ skc->skc_obj_max = skc->skc_obj_alloc; ++ ++ /* Track max slab usage statistics */ ++ if (sks->sks_ref == 1) { ++ skc->skc_slab_alloc++; ++ ++ if (skc->skc_slab_alloc > skc->skc_slab_max) ++ skc->skc_slab_max = skc->skc_slab_alloc; ++ } ++ ++ return sko->sko_addr; ++} ++ ++/* ++ * Generic slab allocation function to run by the global work queues. ++ * It is responsible for allocating a new slab, linking it in to the list ++ * of partial slabs, and then waking any waiters. ++ */ ++static void ++spl_cache_grow_work(void *data) ++{ ++ spl_kmem_alloc_t *ska = ++ spl_get_work_data(data, spl_kmem_alloc_t, ska_work.work); ++ spl_kmem_cache_t *skc = ska->ska_cache; ++ spl_kmem_slab_t *sks; ++ ++ sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG); ++ spin_lock(&skc->skc_lock); ++ if (sks) { ++ skc->skc_slab_total++; ++ skc->skc_obj_total += sks->sks_objs; ++ list_add_tail(&sks->sks_list, &skc->skc_partial_list); ++ } ++ ++ atomic_dec(&skc->skc_ref); ++ clear_bit(KMC_BIT_GROWING, &skc->skc_flags); ++ clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); ++ wake_up_all(&skc->skc_waitq); ++ spin_unlock(&skc->skc_lock); ++ ++ kfree(ska); ++} ++ ++/* ++ * Returns non-zero when a new slab should be available. 
++ */ ++static int ++spl_cache_grow_wait(spl_kmem_cache_t *skc) ++{ ++ return !test_bit(KMC_BIT_GROWING, &skc->skc_flags); ++} ++ ++static int ++spl_cache_reclaim_wait(void *word) ++{ ++ schedule(); ++ return 0; ++} ++ ++/* ++ * No available objects on any slabs, create a new slab. ++ */ ++static int ++spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) ++{ ++ int remaining, rc; ++ SENTRY; ++ ++ ASSERT(skc->skc_magic == SKC_MAGIC); ++ might_sleep(); ++ *obj = NULL; ++ ++ /* ++ * Before allocating a new slab wait for any reaping to complete and ++ * then return so the local magazine can be rechecked for new objects. ++ */ ++ if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { ++ rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING, ++ spl_cache_reclaim_wait, TASK_UNINTERRUPTIBLE); ++ SRETURN(rc ? rc : -EAGAIN); ++ } ++ ++ /* ++ * This is handled by dispatching a work request to the global work ++ * queue. This allows us to asynchronously allocate a new slab while ++ * retaining the ability to safely fall back to a smaller synchronous ++ * allocations to ensure forward progress is always maintained. ++ */ ++ if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { ++ spl_kmem_alloc_t *ska; ++ ++ ska = kmalloc(sizeof(*ska), flags); ++ if (ska == NULL) { ++ clear_bit(KMC_BIT_GROWING, &skc->skc_flags); ++ wake_up_all(&skc->skc_waitq); ++ SRETURN(-ENOMEM); ++ } ++ ++ atomic_inc(&skc->skc_ref); ++ ska->ska_cache = skc; ++ ska->ska_flags = flags; ++ spl_init_delayed_work(&ska->ska_work, spl_cache_grow_work, ska); ++ schedule_delayed_work(&ska->ska_work, 0); ++ } ++ ++ /* ++ * The goal here is to only detect the rare case where a virtual slab ++ * allocation has deadlocked. We must be careful to minimize the use ++ * of emergency objects which are more expensive to track. Therefore, ++ * we set a very long timeout for the asynchronous allocation and if ++ * the timeout is reached the cache is flagged as deadlocked. From ++ * this point only new emergency objects will be allocated until the ++ * asynchronous allocation completes and clears the deadlocked flag. ++ */ ++ if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) { ++ rc = spl_emergency_alloc(skc, flags, obj); ++ } else { ++ remaining = wait_event_timeout(skc->skc_waitq, ++ spl_cache_grow_wait(skc), HZ); ++ ++ if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) { ++ spin_lock(&skc->skc_lock); ++ if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { ++ set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); ++ skc->skc_obj_deadlock++; ++ } ++ spin_unlock(&skc->skc_lock); ++ } ++ ++ rc = -ENOMEM; ++ } ++ ++ SRETURN(rc); ++} ++ ++/* ++ * Refill a per-cpu magazine with objects from the slabs for this cache. ++ * Ideally the magazine can be repopulated using existing objects which have ++ * been released, however if we are unable to locate enough free objects new ++ * slabs of objects will be created. On success NULL is returned, otherwise ++ * the address of a single emergency object is returned for use by the caller. 
++ */ ++static void * ++spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) ++{ ++ spl_kmem_slab_t *sks; ++ int count = 0, rc, refill; ++ void *obj = NULL; ++ SENTRY; ++ ++ ASSERT(skc->skc_magic == SKC_MAGIC); ++ ASSERT(skm->skm_magic == SKM_MAGIC); ++ ++ refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); ++ spin_lock(&skc->skc_lock); ++ ++ while (refill > 0) { ++ /* No slabs available we may need to grow the cache */ ++ if (list_empty(&skc->skc_partial_list)) { ++ spin_unlock(&skc->skc_lock); ++ ++ local_irq_enable(); ++ rc = spl_cache_grow(skc, flags, &obj); ++ local_irq_disable(); ++ ++ /* Emergency object for immediate use by caller */ ++ if (rc == 0 && obj != NULL) ++ SRETURN(obj); ++ ++ if (rc) ++ SGOTO(out, rc); ++ ++ /* Rescheduled to different CPU skm is not local */ ++ if (skm != skc->skc_mag[smp_processor_id()]) ++ SGOTO(out, rc); ++ ++ /* Potentially rescheduled to the same CPU but ++ * allocations may have occurred from this CPU while ++ * we were sleeping so recalculate max refill. */ ++ refill = MIN(refill, skm->skm_size - skm->skm_avail); ++ ++ spin_lock(&skc->skc_lock); ++ continue; ++ } ++ ++ /* Grab the next available slab */ ++ sks = list_entry((&skc->skc_partial_list)->next, ++ spl_kmem_slab_t, sks_list); ++ ASSERT(sks->sks_magic == SKS_MAGIC); ++ ASSERT(sks->sks_ref < sks->sks_objs); ++ ASSERT(!list_empty(&sks->sks_free_list)); ++ ++ /* Consume as many objects as needed to refill the requested ++ * cache. We must also be careful not to overfill it. */ ++ while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) { ++ ASSERT(skm->skm_avail < skm->skm_size); ++ ASSERT(count < skm->skm_size); ++ skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks); ++ } ++ ++ /* Move slab to skc_complete_list when full */ ++ if (sks->sks_ref == sks->sks_objs) { ++ list_del(&sks->sks_list); ++ list_add(&sks->sks_list, &skc->skc_complete_list); ++ } ++ } ++ ++ spin_unlock(&skc->skc_lock); ++out: ++ SRETURN(NULL); ++} ++ ++/* ++ * Release an object back to the slab from which it came. ++ */ ++static void ++spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) ++{ ++ spl_kmem_slab_t *sks = NULL; ++ spl_kmem_obj_t *sko = NULL; ++ SENTRY; ++ ++ ASSERT(skc->skc_magic == SKC_MAGIC); ++ ASSERT(spin_is_locked(&skc->skc_lock)); ++ ++ sko = spl_sko_from_obj(skc, obj); ++ ASSERT(sko->sko_magic == SKO_MAGIC); ++ sks = sko->sko_slab; ++ ASSERT(sks->sks_magic == SKS_MAGIC); ++ ASSERT(sks->sks_cache == skc); ++ list_add(&sko->sko_list, &sks->sks_free_list); ++ ++ sks->sks_age = jiffies; ++ sks->sks_ref--; ++ skc->skc_obj_alloc--; ++ ++ /* Move slab to skc_partial_list when no longer full. Slabs ++ * are added to the head to keep the partial list is quasi-full ++ * sorted order. Fuller at the head, emptier at the tail. */ ++ if (sks->sks_ref == (sks->sks_objs - 1)) { ++ list_del(&sks->sks_list); ++ list_add(&sks->sks_list, &skc->skc_partial_list); ++ } ++ ++ /* Move empty slabs to the end of the partial list so ++ * they can be easily found and freed during reclamation. */ ++ if (sks->sks_ref == 0) { ++ list_del(&sks->sks_list); ++ list_add_tail(&sks->sks_list, &skc->skc_partial_list); ++ skc->skc_slab_alloc--; ++ } ++ ++ SEXIT; ++} ++ ++/* ++ * Release a batch of objects from a per-cpu magazine back to their ++ * respective slabs. This occurs when we exceed the magazine size, ++ * are under memory pressure, when the cache is idle, or during ++ * cache cleanup. The flush argument contains the number of entries ++ * to remove from the magazine. 
++ */ ++static int ++spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) ++{ ++ int i, count = MIN(flush, skm->skm_avail); ++ SENTRY; ++ ++ ASSERT(skc->skc_magic == SKC_MAGIC); ++ ASSERT(skm->skm_magic == SKM_MAGIC); ++ ++ /* ++ * XXX: Currently we simply return objects from the magazine to ++ * the slabs in fifo order. The ideal thing to do from a memory ++ * fragmentation standpoint is to cheaply determine the set of ++ * objects in the magazine which will result in the largest ++ * number of free slabs if released from the magazine. ++ */ ++ spin_lock(&skc->skc_lock); ++ for (i = 0; i < count; i++) ++ spl_cache_shrink(skc, skm->skm_objs[i]); ++ ++ skm->skm_avail -= count; ++ memmove(skm->skm_objs, &(skm->skm_objs[count]), ++ sizeof(void *) * skm->skm_avail); ++ ++ spin_unlock(&skc->skc_lock); ++ ++ SRETURN(count); ++} ++ ++/* ++ * Allocate an object from the per-cpu magazine, or if the magazine ++ * is empty directly allocate from a slab and repopulate the magazine. ++ */ ++void * ++spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) ++{ ++ spl_kmem_magazine_t *skm; ++ unsigned long irq_flags; ++ void *obj = NULL; ++ SENTRY; ++ ++ ASSERT(skc->skc_magic == SKC_MAGIC); ++ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); ++ ASSERT(flags & KM_SLEEP); ++ atomic_inc(&skc->skc_ref); ++ local_irq_save(irq_flags); ++ ++restart: ++ /* Safe to update per-cpu structure without lock, but ++ * in the restart case we must be careful to reacquire ++ * the local magazine since this may have changed ++ * when we need to grow the cache. */ ++ skm = skc->skc_mag[smp_processor_id()]; ++ ASSERTF(skm->skm_magic == SKM_MAGIC, "%x != %x: %s/%p/%p %x/%x/%x\n", ++ skm->skm_magic, SKM_MAGIC, skc->skc_name, skc, skm, ++ skm->skm_size, skm->skm_refill, skm->skm_avail); ++ ++ if (likely(skm->skm_avail)) { ++ /* Object available in CPU cache, use it */ ++ obj = skm->skm_objs[--skm->skm_avail]; ++ skm->skm_age = jiffies; ++ } else { ++ obj = spl_cache_refill(skc, skm, flags); ++ if (obj == NULL) ++ SGOTO(restart, obj = NULL); ++ } ++ ++ local_irq_restore(irq_flags); ++ ASSERT(obj); ++ ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); ++ ++ /* Pre-emptively migrate object to CPU L1 cache */ ++ prefetchw(obj); ++ atomic_dec(&skc->skc_ref); ++ ++ SRETURN(obj); ++} ++EXPORT_SYMBOL(spl_kmem_cache_alloc); ++ ++/* ++ * Free an object back to the local per-cpu magazine, there is no ++ * guarantee that this is the same magazine the object was originally ++ * allocated from. We may need to flush entire from the magazine ++ * back to the slabs to make space. ++ */ ++void ++spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) ++{ ++ spl_kmem_magazine_t *skm; ++ unsigned long flags; ++ SENTRY; ++ ++ ASSERT(skc->skc_magic == SKC_MAGIC); ++ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); ++ atomic_inc(&skc->skc_ref); ++ ++ /* ++ * Only virtual slabs may have emergency objects and these objects ++ * are guaranteed to have physical addresses. They must be removed ++ * from the tree of emergency objects and the freed. ++ */ ++ if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) ++ SGOTO(out, spl_emergency_free(skc, obj)); ++ ++ local_irq_save(flags); ++ ++ /* Safe to update per-cpu structure without lock, but ++ * no remote memory allocation tracking is being performed ++ * it is entirely possible to allocate an object from one ++ * CPU cache and return it to another. 
*/ ++ skm = skc->skc_mag[smp_processor_id()]; ++ ASSERT(skm->skm_magic == SKM_MAGIC); ++ ++ /* Per-CPU cache full, flush it to make space */ ++ if (unlikely(skm->skm_avail >= skm->skm_size)) ++ (void)spl_cache_flush(skc, skm, skm->skm_refill); ++ ++ /* Available space in cache, use it */ ++ skm->skm_objs[skm->skm_avail++] = obj; ++ ++ local_irq_restore(flags); ++out: ++ atomic_dec(&skc->skc_ref); ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(spl_kmem_cache_free); ++ ++/* ++ * The generic shrinker function for all caches. Under Linux a shrinker ++ * may not be tightly coupled with a slab cache. In fact Linux always ++ * systematically tries calling all registered shrinker callbacks which ++ * report that they contain unused objects. Because of this we only ++ * register one shrinker function in the shim layer for all slab caches. ++ * We always attempt to shrink all caches when this generic shrinker ++ * is called. The shrinker should return the number of free objects ++ * in the cache when called with nr_to_scan == 0 but not attempt to ++ * free any objects. When nr_to_scan > 0 it is a request that nr_to_scan ++ * objects should be freed, which differs from Solaris semantics. ++ * Solaris semantics are to free all available objects which may (and ++ * probably will) be more objects than the requested nr_to_scan. ++ */ ++static int ++__spl_kmem_cache_generic_shrinker(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ spl_kmem_cache_t *skc; ++ int unused = 0; ++ ++ down_read(&spl_kmem_cache_sem); ++ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { ++ if (sc->nr_to_scan) ++ spl_kmem_cache_reap_now(skc, ++ MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1)); ++ ++ /* ++ * Presume everything alloc'ed in reclaimable, this ensures ++ * we are called again with nr_to_scan > 0 so can try and ++ * reclaim. The exact number is not important either so ++ * we forgo taking this already highly contented lock. ++ */ ++ unused += skc->skc_obj_alloc; ++ } ++ up_read(&spl_kmem_cache_sem); ++ ++ return (unused * sysctl_vfs_cache_pressure) / 100; ++} ++ ++SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker); ++ ++/* ++ * Call the registered reclaim function for a cache. Depending on how ++ * many and which objects are released it may simply repopulate the ++ * local magazine which will then need to age-out. Objects which cannot ++ * fit in the magazine we will be released back to their slabs which will ++ * also need to age out before being release. This is all just best ++ * effort and we do not want to thrash creating and destroying slabs. ++ */ ++void ++spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) ++{ ++ SENTRY; ++ ++ ASSERT(skc->skc_magic == SKC_MAGIC); ++ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); ++ ++ /* Prevent concurrent cache reaping when contended */ ++ if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) { ++ SEXIT; ++ return; ++ } ++ ++ atomic_inc(&skc->skc_ref); ++ ++ /* ++ * When a reclaim function is available it may be invoked repeatedly ++ * until at least a single slab can be freed. This ensures that we ++ * do free memory back to the system. This helps minimize the chance ++ * of an OOM event when the bulk of memory is used by the slab. ++ * ++ * When free slabs are already available the reclaim callback will be ++ * skipped. Additionally, if no forward progress is detected despite ++ * a reclaim function the cache will be skipped to avoid deadlock. 
++ * ++ * Longer term this would be the correct place to add the code which ++ * repacks the slabs in order minimize fragmentation. ++ */ ++ if (skc->skc_reclaim) { ++ uint64_t objects = UINT64_MAX; ++ int do_reclaim; ++ ++ do { ++ spin_lock(&skc->skc_lock); ++ do_reclaim = ++ (skc->skc_slab_total > 0) && ++ ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) && ++ (skc->skc_obj_alloc < objects); ++ ++ objects = skc->skc_obj_alloc; ++ spin_unlock(&skc->skc_lock); ++ ++ if (do_reclaim) ++ skc->skc_reclaim(skc->skc_private); ++ ++ } while (do_reclaim); ++ } ++ ++ /* Reclaim from the cache, ignoring it's age and delay. */ ++ spl_slab_reclaim(skc, count, 1); ++ clear_bit(KMC_BIT_REAPING, &skc->skc_flags); ++ smp_mb__after_clear_bit(); ++ wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); ++ ++ atomic_dec(&skc->skc_ref); ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(spl_kmem_cache_reap_now); ++ ++/* ++ * Reap all free slabs from all registered caches. ++ */ ++void ++spl_kmem_reap(void) ++{ ++ struct shrink_control sc; ++ ++ sc.nr_to_scan = KMC_REAP_CHUNK; ++ sc.gfp_mask = GFP_KERNEL; ++ ++ __spl_kmem_cache_generic_shrinker(NULL, &sc); ++} ++EXPORT_SYMBOL(spl_kmem_reap); ++ ++#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) ++static char * ++spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) ++{ ++ int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; ++ int i, flag = 1; ++ ++ ASSERT(str != NULL && len >= 17); ++ memset(str, 0, len); ++ ++ /* Check for a fully printable string, and while we are at ++ * it place the printable characters in the passed buffer. */ ++ for (i = 0; i < size; i++) { ++ str[i] = ((char *)(kd->kd_addr))[i]; ++ if (isprint(str[i])) { ++ continue; ++ } else { ++ /* Minimum number of printable characters found ++ * to make it worthwhile to print this as ascii. */ ++ if (i > min) ++ break; ++ ++ flag = 0; ++ break; ++ } ++ } ++ ++ if (!flag) { ++ sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", ++ *((uint8_t *)kd->kd_addr), ++ *((uint8_t *)kd->kd_addr + 2), ++ *((uint8_t *)kd->kd_addr + 4), ++ *((uint8_t *)kd->kd_addr + 6), ++ *((uint8_t *)kd->kd_addr + 8), ++ *((uint8_t *)kd->kd_addr + 10), ++ *((uint8_t *)kd->kd_addr + 12), ++ *((uint8_t *)kd->kd_addr + 14)); ++ } ++ ++ return str; ++} ++ ++static int ++spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) ++{ ++ int i; ++ SENTRY; ++ ++ spin_lock_init(lock); ++ INIT_LIST_HEAD(list); ++ ++ for (i = 0; i < size; i++) ++ INIT_HLIST_HEAD(&kmem_table[i]); ++ ++ SRETURN(0); ++} ++ ++static void ++spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) ++{ ++ unsigned long flags; ++ kmem_debug_t *kd; ++ char str[17]; ++ SENTRY; ++ ++ spin_lock_irqsave(lock, flags); ++ if (!list_empty(list)) ++ printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", ++ "size", "data", "func", "line"); ++ ++ list_for_each_entry(kd, list, kd_list) ++ printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, ++ (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), ++ kd->kd_func, kd->kd_line); ++ ++ spin_unlock_irqrestore(lock, flags); ++ SEXIT; ++} ++#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ ++#define spl_kmem_init_tracking(list, lock, size) ++#define spl_kmem_fini_tracking(list, lock) ++#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ ++ ++static void ++spl_kmem_init_globals(void) ++{ ++ struct zone *zone; ++ ++ /* For now all zones are includes, it may be wise to restrict ++ * this to normal and highmem zones if we see problems. 
*/ ++ for_each_zone(zone) { ++ ++ if (!populated_zone(zone)) ++ continue; ++ ++ minfree += min_wmark_pages(zone); ++ desfree += low_wmark_pages(zone); ++ lotsfree += high_wmark_pages(zone); ++ } ++ ++ /* Solaris default values */ ++ swapfs_minfree = MAX(2*1024*1024 >> PAGE_SHIFT, physmem >> 3); ++ swapfs_reserve = MIN(4*1024*1024 >> PAGE_SHIFT, physmem >> 4); ++} ++ ++/* ++ * Called at module init when it is safe to use spl_kallsyms_lookup_name() ++ */ ++int ++spl_kmem_init_kallsyms_lookup(void) ++{ ++#ifndef HAVE_GET_VMALLOC_INFO ++ get_vmalloc_info_fn = (get_vmalloc_info_t) ++ spl_kallsyms_lookup_name("get_vmalloc_info"); ++ if (!get_vmalloc_info_fn) { ++ printk(KERN_ERR "Error: Unknown symbol get_vmalloc_info\n"); ++ return -EFAULT; ++ } ++#endif /* HAVE_GET_VMALLOC_INFO */ ++ ++#ifdef HAVE_PGDAT_HELPERS ++# ifndef HAVE_FIRST_ONLINE_PGDAT ++ first_online_pgdat_fn = (first_online_pgdat_t) ++ spl_kallsyms_lookup_name("first_online_pgdat"); ++ if (!first_online_pgdat_fn) { ++ printk(KERN_ERR "Error: Unknown symbol first_online_pgdat\n"); ++ return -EFAULT; ++ } ++# endif /* HAVE_FIRST_ONLINE_PGDAT */ ++ ++# ifndef HAVE_NEXT_ONLINE_PGDAT ++ next_online_pgdat_fn = (next_online_pgdat_t) ++ spl_kallsyms_lookup_name("next_online_pgdat"); ++ if (!next_online_pgdat_fn) { ++ printk(KERN_ERR "Error: Unknown symbol next_online_pgdat\n"); ++ return -EFAULT; ++ } ++# endif /* HAVE_NEXT_ONLINE_PGDAT */ ++ ++# ifndef HAVE_NEXT_ZONE ++ next_zone_fn = (next_zone_t) ++ spl_kallsyms_lookup_name("next_zone"); ++ if (!next_zone_fn) { ++ printk(KERN_ERR "Error: Unknown symbol next_zone\n"); ++ return -EFAULT; ++ } ++# endif /* HAVE_NEXT_ZONE */ ++ ++#else /* HAVE_PGDAT_HELPERS */ ++ ++# ifndef HAVE_PGDAT_LIST ++ pgdat_list_addr = *(struct pglist_data **) ++ spl_kallsyms_lookup_name("pgdat_list"); ++ if (!pgdat_list_addr) { ++ printk(KERN_ERR "Error: Unknown symbol pgdat_list\n"); ++ return -EFAULT; ++ } ++# endif /* HAVE_PGDAT_LIST */ ++#endif /* HAVE_PGDAT_HELPERS */ ++ ++#if defined(NEED_GET_ZONE_COUNTS) && !defined(HAVE_GET_ZONE_COUNTS) ++ get_zone_counts_fn = (get_zone_counts_t) ++ spl_kallsyms_lookup_name("get_zone_counts"); ++ if (!get_zone_counts_fn) { ++ printk(KERN_ERR "Error: Unknown symbol get_zone_counts\n"); ++ return -EFAULT; ++ } ++#endif /* NEED_GET_ZONE_COUNTS && !HAVE_GET_ZONE_COUNTS */ ++ ++ /* ++ * It is now safe to initialize the global tunings which rely on ++ * the use of the for_each_zone() macro. This macro in turns ++ * depends on the *_pgdat symbols which are now available. 
++ */ ++ spl_kmem_init_globals(); ++ ++#if !defined(HAVE_INVALIDATE_INODES) && !defined(HAVE_INVALIDATE_INODES_CHECK) ++ invalidate_inodes_fn = (invalidate_inodes_t) ++ spl_kallsyms_lookup_name("invalidate_inodes"); ++ if (!invalidate_inodes_fn) { ++ printk(KERN_ERR "Error: Unknown symbol invalidate_inodes\n"); ++ return -EFAULT; ++ } ++#endif /* !HAVE_INVALIDATE_INODES && !HAVE_INVALIDATE_INODES_CHECK */ ++ ++#ifndef HAVE_SHRINK_DCACHE_MEMORY ++ /* When shrink_dcache_memory_fn == NULL support is disabled */ ++ shrink_dcache_memory_fn = (shrink_dcache_memory_t) ++ spl_kallsyms_lookup_name("shrink_dcache_memory"); ++#endif /* HAVE_SHRINK_DCACHE_MEMORY */ ++ ++#ifndef HAVE_SHRINK_ICACHE_MEMORY ++ /* When shrink_icache_memory_fn == NULL support is disabled */ ++ shrink_icache_memory_fn = (shrink_icache_memory_t) ++ spl_kallsyms_lookup_name("shrink_icache_memory"); ++#endif /* HAVE_SHRINK_ICACHE_MEMORY */ ++ ++ return 0; ++} ++ ++int ++spl_kmem_init(void) ++{ ++ int rc = 0; ++ SENTRY; ++ ++ init_rwsem(&spl_kmem_cache_sem); ++ INIT_LIST_HEAD(&spl_kmem_cache_list); ++ ++ spl_register_shrinker(&spl_kmem_cache_shrinker); ++ ++#ifdef DEBUG_KMEM ++ kmem_alloc_used_set(0); ++ vmem_alloc_used_set(0); ++ ++ spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); ++ spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE); ++#endif ++ SRETURN(rc); ++} ++ ++void ++spl_kmem_fini(void) ++{ ++#ifdef DEBUG_KMEM ++ /* Display all unreclaimed memory addresses, including the ++ * allocation size and the first few bytes of what's located ++ * at that address to aid in debugging. Performance is not ++ * a serious concern here since it is module unload time. */ ++ if (kmem_alloc_used_read() != 0) ++ SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, ++ "kmem leaked %ld/%ld bytes\n", ++ kmem_alloc_used_read(), kmem_alloc_max); ++ ++ ++ if (vmem_alloc_used_read() != 0) ++ SDEBUG_LIMIT(SD_CONSOLE | SD_WARNING, ++ "vmem leaked %ld/%ld bytes\n", ++ vmem_alloc_used_read(), vmem_alloc_max); ++ ++ spl_kmem_fini_tracking(&kmem_list, &kmem_lock); ++ spl_kmem_fini_tracking(&vmem_list, &vmem_lock); ++#endif /* DEBUG_KMEM */ ++ SENTRY; ++ ++ spl_unregister_shrinker(&spl_kmem_cache_shrinker); ++ ++ SEXIT; ++} +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-kobj.c linux-3.2.33-go/spl/spl/spl-kobj.c +--- linux-3.2.33-go.orig/spl/spl/spl-kobj.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-kobj.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,93 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Kobj Implementation. ++\*****************************************************************************/ ++ ++#include ++#include ++ ++#ifdef SS_DEBUG_SUBSYS ++#undef SS_DEBUG_SUBSYS ++#endif ++ ++#define SS_DEBUG_SUBSYS SS_KOBJ ++ ++struct _buf * ++kobj_open_file(const char *name) ++{ ++ struct _buf *file; ++ vnode_t *vp; ++ int rc; ++ SENTRY; ++ ++ file = kmalloc(sizeof(_buf_t), GFP_KERNEL); ++ if (file == NULL) ++ SRETURN((_buf_t *)-1UL); ++ ++ if ((rc = vn_open(name, UIO_SYSSPACE, FREAD, 0644, &vp, 0, 0))) { ++ kfree(file); ++ SRETURN((_buf_t *)-1UL); ++ } ++ ++ file->vp = vp; ++ ++ SRETURN(file); ++} /* kobj_open_file() */ ++EXPORT_SYMBOL(kobj_open_file); ++ ++void ++kobj_close_file(struct _buf *file) ++{ ++ SENTRY; ++ VOP_CLOSE(file->vp, 0, 0, 0, 0, 0); ++ kfree(file); ++ SEXIT; ++} /* kobj_close_file() */ ++EXPORT_SYMBOL(kobj_close_file); ++ ++int ++kobj_read_file(struct _buf *file, char *buf, ssize_t size, offset_t off) ++{ ++ SENTRY; ++ SRETURN(vn_rdwr(UIO_READ, file->vp, buf, size, off, ++ UIO_SYSSPACE, 0, RLIM64_INFINITY, 0, NULL)); ++} /* kobj_read_file() */ ++EXPORT_SYMBOL(kobj_read_file); ++ ++int ++kobj_get_filesize(struct _buf *file, uint64_t *size) ++{ ++ vattr_t vap; ++ int rc; ++ SENTRY; ++ ++ rc = VOP_GETATTR(file->vp, &vap, 0, 0, NULL); ++ if (rc) ++ SRETURN(rc); ++ ++ *size = vap.va_size; ++ ++ SRETURN(rc); ++} /* kobj_get_filesize() */ ++EXPORT_SYMBOL(kobj_get_filesize); +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-kstat.c linux-3.2.33-go/spl/spl/spl-kstat.c +--- linux-3.2.33-go.orig/spl/spl/spl-kstat.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-kstat.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,549 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Kstat Implementation. 
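/*
 * Usage sketch (illustrative only, not part of the diff above): how a
 * caller might read a whole file through the kobj_* interface exported
 * by spl-kobj.c.  The helper name, the path argument, and the error
 * handling policy are assumptions made for the example;
 * kmem_alloc()/kmem_free() are the SPL allocators used elsewhere in
 * this patch.
 */
static int
example_kobj_slurp(const char *path)
{
	struct _buf *file;
	uint64_t size;
	char *buf;
	int rc;

	file = kobj_open_file(path);
	if (file == (struct _buf *)-1UL)	/* open failure sentinel */
		return (-ENOENT);

	rc = kobj_get_filesize(file, &size);
	if (rc == 0) {
		buf = kmem_alloc(size, KM_SLEEP);
		rc = kobj_read_file(file, buf, size, 0);
		/* ... consume buf[0 .. size) here ... */
		kmem_free(buf, size);
		/* assumption: a negative return indicates a read error */
		if (rc >= 0)
			rc = 0;
	}

	kobj_close_file(file);
	return (rc);
}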
++\*****************************************************************************/ ++ ++#include ++#include ++#include ++ ++#ifdef SS_DEBUG_SUBSYS ++#undef SS_DEBUG_SUBSYS ++#endif ++ ++#define SS_DEBUG_SUBSYS SS_KSTAT ++ ++static spinlock_t kstat_lock; ++static struct list_head kstat_list; ++static kid_t kstat_id; ++ ++static void ++kstat_seq_show_headers(struct seq_file *f) ++{ ++ kstat_t *ksp = (kstat_t *)f->private; ++ ASSERT(ksp->ks_magic == KS_MAGIC); ++ ++ seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n", ++ ksp->ks_kid, ksp->ks_type, ksp->ks_flags, ++ ksp->ks_ndata, (int)ksp->ks_data_size, ++ ksp->ks_crtime, ksp->ks_snaptime); ++ ++ switch (ksp->ks_type) { ++ case KSTAT_TYPE_RAW: ++ seq_printf(f, "raw data"); ++ break; ++ case KSTAT_TYPE_NAMED: ++ seq_printf(f, "%-31s %-4s %s\n", ++ "name", "type", "data"); ++ break; ++ case KSTAT_TYPE_INTR: ++ seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n", ++ "hard", "soft", "watchdog", ++ "spurious", "multsvc"); ++ break; ++ case KSTAT_TYPE_IO: ++ seq_printf(f, ++ "%-8s %-8s %-8s %-8s %-8s %-8s " ++ "%-8s %-8s %-8s %-8s %-8s %-8s\n", ++ "nread", "nwritten", "reads", "writes", ++ "wtime", "wlentime", "wupdate", ++ "rtime", "rlentime", "rupdate", ++ "wcnt", "rcnt"); ++ break; ++ case KSTAT_TYPE_TIMER: ++ seq_printf(f, ++ "%-31s %-8s " ++ "%-8s %-8s %-8s %-8s %-8s\n", ++ "name", "events", "elapsed", ++ "min", "max", "start", "stop"); ++ break; ++ case KSTAT_TYPE_TXG: ++ seq_printf(f, ++ "%-8s %-5s %-13s %-12s %-12s %-8s %-8s " ++ "%-12s %-12s %-12s\n", ++ "txg", "state", "birth", ++ "nread", "nwritten", "reads", "writes", ++ "otime", "qtime", "stime"); ++ break; ++ default: ++ PANIC("Undefined kstat type %d\n", ksp->ks_type); ++ } ++} ++ ++static int ++kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l) ++{ ++ int i, j; ++ ++ for (i = 0; ; i++) { ++ seq_printf(f, "%03x:", i); ++ ++ for (j = 0; j < 16; j++) { ++ if (i * 16 + j >= l) { ++ seq_printf(f, "\n"); ++ goto out; ++ } ++ ++ seq_printf(f, " %02x", (unsigned char)p[i * 16 + j]); ++ } ++ seq_printf(f, "\n"); ++ } ++out: ++ return 0; ++} ++ ++static int ++kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp) ++{ ++ seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type); ++ ++ switch (knp->data_type) { ++ case KSTAT_DATA_CHAR: ++ knp->value.c[15] = '\0'; /* NULL terminate */ ++ seq_printf(f, "%-16s", knp->value.c); ++ break; ++ /* XXX - We need to be more careful able what tokens are ++ * used for each arch, for now this is correct for x86_64. 
++ */ ++ case KSTAT_DATA_INT32: ++ seq_printf(f, "%d", knp->value.i32); ++ break; ++ case KSTAT_DATA_UINT32: ++ seq_printf(f, "%u", knp->value.ui32); ++ break; ++ case KSTAT_DATA_INT64: ++ seq_printf(f, "%lld", (signed long long)knp->value.i64); ++ break; ++ case KSTAT_DATA_UINT64: ++ seq_printf(f, "%llu", (unsigned long long)knp->value.ui64); ++ break; ++ case KSTAT_DATA_LONG: ++ seq_printf(f, "%ld", knp->value.l); ++ break; ++ case KSTAT_DATA_ULONG: ++ seq_printf(f, "%lu", knp->value.ul); ++ break; ++ case KSTAT_DATA_STRING: ++ KSTAT_NAMED_STR_PTR(knp) ++ [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0'; ++ seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp)); ++ break; ++ default: ++ PANIC("Undefined kstat data type %d\n", knp->data_type); ++ } ++ ++ seq_printf(f, "\n"); ++ ++ return 0; ++} ++ ++static int ++kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip) ++{ ++ seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n", ++ kip->intrs[KSTAT_INTR_HARD], ++ kip->intrs[KSTAT_INTR_SOFT], ++ kip->intrs[KSTAT_INTR_WATCHDOG], ++ kip->intrs[KSTAT_INTR_SPURIOUS], ++ kip->intrs[KSTAT_INTR_MULTSVC]); ++ ++ return 0; ++} ++ ++static int ++kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip) ++{ ++ seq_printf(f, ++ "%-8llu %-8llu %-8u %-8u %-8lld %-8lld " ++ "%-8lld %-8lld %-8lld %-8lld %-8u %-8u\n", ++ kip->nread, kip->nwritten, ++ kip->reads, kip->writes, ++ kip->wtime, kip->wlentime, kip->wlastupdate, ++ kip->rtime, kip->wlentime, kip->rlastupdate, ++ kip->wcnt, kip->rcnt); ++ ++ return 0; ++} ++ ++static int ++kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp) ++{ ++ seq_printf(f, ++ "%-31s %-8llu %-8lld %-8lld %-8lld %-8lld %-8lld\n", ++ ktp->name, ktp->num_events, ktp->elapsed_time, ++ ktp->min_time, ktp->max_time, ++ ktp->start_time, ktp->stop_time); ++ ++ return 0; ++} ++ ++static int ++kstat_seq_show_txg(struct seq_file *f, kstat_txg_t *ktp) ++{ ++ char state; ++ ++ switch (ktp->state) { ++ case TXG_STATE_OPEN: state = 'O'; break; ++ case TXG_STATE_QUIESCING: state = 'Q'; break; ++ case TXG_STATE_SYNCING: state = 'S'; break; ++ case TXG_STATE_COMMITTED: state = 'C'; break; ++ default: state = '?'; break; ++ } ++ ++ seq_printf(f, ++ "%-8llu %-5c %-13llu %-12llu %-12llu %-8u %-8u " ++ "%12lld %12lld %12lld\n", ktp->txg, state, ktp->birth, ++ ktp->nread, ktp->nwritten, ktp->reads, ktp->writes, ++ ktp->open_time, ktp->quiesce_time, ktp->sync_time); ++ return 0; ++} ++ ++static int ++kstat_seq_show(struct seq_file *f, void *p) ++{ ++ kstat_t *ksp = (kstat_t *)f->private; ++ int rc = 0; ++ ++ ASSERT(ksp->ks_magic == KS_MAGIC); ++ ++ switch (ksp->ks_type) { ++ case KSTAT_TYPE_RAW: ++ ASSERT(ksp->ks_ndata == 1); ++ rc = kstat_seq_show_raw(f, ksp->ks_data, ++ ksp->ks_data_size); ++ break; ++ case KSTAT_TYPE_NAMED: ++ rc = kstat_seq_show_named(f, (kstat_named_t *)p); ++ break; ++ case KSTAT_TYPE_INTR: ++ rc = kstat_seq_show_intr(f, (kstat_intr_t *)p); ++ break; ++ case KSTAT_TYPE_IO: ++ rc = kstat_seq_show_io(f, (kstat_io_t *)p); ++ break; ++ case KSTAT_TYPE_TIMER: ++ rc = kstat_seq_show_timer(f, (kstat_timer_t *)p); ++ break; ++ case KSTAT_TYPE_TXG: ++ rc = kstat_seq_show_txg(f, (kstat_txg_t *)p); ++ break; ++ default: ++ PANIC("Undefined kstat type %d\n", ksp->ks_type); ++ } ++ ++ return rc; ++} ++ ++int ++kstat_default_update(kstat_t *ksp, int rw) ++{ ++ ASSERT(ksp != NULL); ++ return 0; ++} ++ ++static void * ++kstat_seq_data_addr(kstat_t *ksp, loff_t n) ++{ ++ void *rc = NULL; ++ SENTRY; ++ ++ switch (ksp->ks_type) { ++ case KSTAT_TYPE_RAW: ++ rc = ksp->ks_data; ++ break; ++ case KSTAT_TYPE_NAMED: ++ rc = 
ksp->ks_data + n * sizeof(kstat_named_t); ++ break; ++ case KSTAT_TYPE_INTR: ++ rc = ksp->ks_data + n * sizeof(kstat_intr_t); ++ break; ++ case KSTAT_TYPE_IO: ++ rc = ksp->ks_data + n * sizeof(kstat_io_t); ++ break; ++ case KSTAT_TYPE_TIMER: ++ rc = ksp->ks_data + n * sizeof(kstat_timer_t); ++ break; ++ case KSTAT_TYPE_TXG: ++ rc = ksp->ks_data + n * sizeof(kstat_txg_t); ++ break; ++ default: ++ PANIC("Undefined kstat type %d\n", ksp->ks_type); ++ } ++ ++ SRETURN(rc); ++} ++ ++static void * ++kstat_seq_start(struct seq_file *f, loff_t *pos) ++{ ++ loff_t n = *pos; ++ kstat_t *ksp = (kstat_t *)f->private; ++ ASSERT(ksp->ks_magic == KS_MAGIC); ++ SENTRY; ++ ++ mutex_enter(&ksp->ks_lock); ++ ++ /* Dynamically update kstat, on error existing kstats are used */ ++ (void) ksp->ks_update(ksp, KSTAT_READ); ++ ++ ksp->ks_snaptime = gethrtime(); ++ ++ if (!n) ++ kstat_seq_show_headers(f); ++ ++ if (n >= ksp->ks_ndata) ++ SRETURN(NULL); ++ ++ SRETURN(kstat_seq_data_addr(ksp, n)); ++} ++ ++static void * ++kstat_seq_next(struct seq_file *f, void *p, loff_t *pos) ++{ ++ kstat_t *ksp = (kstat_t *)f->private; ++ ASSERT(ksp->ks_magic == KS_MAGIC); ++ SENTRY; ++ ++ ++*pos; ++ if (*pos >= ksp->ks_ndata) ++ SRETURN(NULL); ++ ++ SRETURN(kstat_seq_data_addr(ksp, *pos)); ++} ++ ++static void ++kstat_seq_stop(struct seq_file *f, void *v) ++{ ++ kstat_t *ksp = (kstat_t *)f->private; ++ ASSERT(ksp->ks_magic == KS_MAGIC); ++ ++ mutex_exit(&ksp->ks_lock); ++} ++ ++static struct seq_operations kstat_seq_ops = { ++ .show = kstat_seq_show, ++ .start = kstat_seq_start, ++ .next = kstat_seq_next, ++ .stop = kstat_seq_stop, ++}; ++ ++static int ++proc_kstat_open(struct inode *inode, struct file *filp) ++{ ++ struct seq_file *f; ++ int rc; ++ ++ rc = seq_open(filp, &kstat_seq_ops); ++ if (rc) ++ return rc; ++ ++ f = filp->private_data; ++ f->private = PDE(inode)->data; ++ ++ return rc; ++} ++ ++static struct file_operations proc_kstat_operations = { ++ .open = proc_kstat_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++kstat_t * ++__kstat_create(const char *ks_module, int ks_instance, const char *ks_name, ++ const char *ks_class, uchar_t ks_type, uint_t ks_ndata, ++ uchar_t ks_flags) ++{ ++ kstat_t *ksp; ++ ++ ASSERT(ks_module); ++ ASSERT(ks_instance == 0); ++ ASSERT(ks_name); ++ ASSERT(!(ks_flags & KSTAT_FLAG_UNSUPPORTED)); ++ ++ if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) ++ ASSERT(ks_ndata == 1); ++ ++ ksp = kmem_zalloc(sizeof(*ksp), KM_SLEEP); ++ if (ksp == NULL) ++ return ksp; ++ ++ spin_lock(&kstat_lock); ++ ksp->ks_kid = kstat_id; ++ kstat_id++; ++ spin_unlock(&kstat_lock); ++ ++ ksp->ks_magic = KS_MAGIC; ++ mutex_init(&ksp->ks_lock, NULL, MUTEX_DEFAULT, NULL); ++ INIT_LIST_HEAD(&ksp->ks_list); ++ ++ ksp->ks_crtime = gethrtime(); ++ ksp->ks_snaptime = ksp->ks_crtime; ++ strncpy(ksp->ks_module, ks_module, KSTAT_STRLEN); ++ ksp->ks_instance = ks_instance; ++ strncpy(ksp->ks_name, ks_name, KSTAT_STRLEN); ++ strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN); ++ ksp->ks_type = ks_type; ++ ksp->ks_flags = ks_flags; ++ ksp->ks_update = kstat_default_update; ++ ksp->ks_private = NULL; ++ ++ switch (ksp->ks_type) { ++ case KSTAT_TYPE_RAW: ++ ksp->ks_ndata = 1; ++ ksp->ks_data_size = ks_ndata; ++ break; ++ case KSTAT_TYPE_NAMED: ++ ksp->ks_ndata = ks_ndata; ++ ksp->ks_data_size = ks_ndata * sizeof(kstat_named_t); ++ break; ++ case KSTAT_TYPE_INTR: ++ ksp->ks_ndata = ks_ndata; ++ ksp->ks_data_size = ks_ndata * sizeof(kstat_intr_t); ++ break; ++ case KSTAT_TYPE_IO: ++ 
ksp->ks_ndata = ks_ndata; ++ ksp->ks_data_size = ks_ndata * sizeof(kstat_io_t); ++ break; ++ case KSTAT_TYPE_TIMER: ++ ksp->ks_ndata = ks_ndata; ++ ksp->ks_data_size = ks_ndata * sizeof(kstat_timer_t); ++ break; ++ case KSTAT_TYPE_TXG: ++ ksp->ks_ndata = ks_ndata; ++ ksp->ks_data_size = ks_ndata * sizeof(kstat_timer_t); ++ break; ++ default: ++ PANIC("Undefined kstat type %d\n", ksp->ks_type); ++ } ++ ++ if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) { ++ ksp->ks_data = NULL; ++ } else { ++ ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_SLEEP); ++ if (ksp->ks_data == NULL) { ++ kmem_free(ksp, sizeof(*ksp)); ++ ksp = NULL; ++ } ++ } ++ ++ return ksp; ++} ++EXPORT_SYMBOL(__kstat_create); ++ ++void ++__kstat_install(kstat_t *ksp) ++{ ++ struct proc_dir_entry *de_module, *de_name; ++ kstat_t *tmp; ++ int rc = 0; ++ SENTRY; ++ ++ spin_lock(&kstat_lock); ++ ++ /* Item may only be added to the list once */ ++ list_for_each_entry(tmp, &kstat_list, ks_list) { ++ if (tmp == ksp) { ++ spin_unlock(&kstat_lock); ++ SGOTO(out, rc = -EEXIST); ++ } ++ } ++ ++ list_add_tail(&ksp->ks_list, &kstat_list); ++ spin_unlock(&kstat_lock); ++ ++ de_module = proc_dir_entry_find(proc_spl_kstat, ksp->ks_module); ++ if (de_module == NULL) { ++ de_module = proc_mkdir(ksp->ks_module, proc_spl_kstat); ++ if (de_module == NULL) ++ SGOTO(out, rc = -EUNATCH); ++ } ++ ++ de_name = create_proc_entry(ksp->ks_name, 0444, de_module); ++ if (de_name == NULL) ++ SGOTO(out, rc = -EUNATCH); ++ ++ mutex_enter(&ksp->ks_lock); ++ ksp->ks_proc = de_name; ++ de_name->proc_fops = &proc_kstat_operations; ++ de_name->data = (void *)ksp; ++ mutex_exit(&ksp->ks_lock); ++out: ++ if (rc) { ++ spin_lock(&kstat_lock); ++ list_del_init(&ksp->ks_list); ++ spin_unlock(&kstat_lock); ++ } ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(__kstat_install); ++ ++void ++__kstat_delete(kstat_t *ksp) ++{ ++ struct proc_dir_entry *de_module; ++ ++ spin_lock(&kstat_lock); ++ list_del_init(&ksp->ks_list); ++ spin_unlock(&kstat_lock); ++ ++ if (ksp->ks_proc) { ++ de_module = ksp->ks_proc->parent; ++ remove_proc_entry(ksp->ks_name, de_module); ++ ++ /* Remove top level module directory if it's empty */ ++ if (proc_dir_entries(de_module) == 0) ++ remove_proc_entry(de_module->name, de_module->parent); ++ } ++ ++ if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) ++ kmem_free(ksp->ks_data, ksp->ks_data_size); ++ ++ mutex_destroy(&ksp->ks_lock); ++ kmem_free(ksp, sizeof(*ksp)); ++ ++ return; ++} ++EXPORT_SYMBOL(__kstat_delete); ++ ++int ++spl_kstat_init(void) ++{ ++ SENTRY; ++ spin_lock_init(&kstat_lock); ++ INIT_LIST_HEAD(&kstat_list); ++ kstat_id = 0; ++ SRETURN(0); ++} ++ ++void ++spl_kstat_fini(void) ++{ ++ SENTRY; ++ ASSERT(list_empty(&kstat_list)); ++ SEXIT; ++} ++ +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-mutex.c linux-3.2.33-go/spl/spl/spl-mutex.c +--- linux-3.2.33-go.orig/spl/spl/spl-mutex.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-mutex.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,77 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
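/*
 * Usage sketch (illustrative only, not part of the diff above): publishing
 * a one-entry named kstat through __kstat_create()/__kstat_install() and
 * tearing it down with __kstat_delete().  The "example"/"stats" module and
 * name strings and the "hits" counter are made up for illustration.
 */
static kstat_t *example_ksp;

static int
example_kstat_publish(void)
{
	kstat_named_t *kn;

	example_ksp = __kstat_create("example", 0, "stats", "misc",
	    KSTAT_TYPE_NAMED, 1, 0);
	if (example_ksp == NULL)
		return (-ENOMEM);

	kn = (kstat_named_t *)example_ksp->ks_data;
	strncpy(kn->name, "hits", KSTAT_STRLEN);
	kn->data_type = KSTAT_DATA_UINT64;
	kn->value.ui64 = 0;

	/* Appears under /proc/spl/kstat/example/stats once installed. */
	__kstat_install(example_ksp);

	return (0);
}

static void
example_kstat_retire(void)
{
	if (example_ksp != NULL)
		__kstat_delete(example_ksp);
}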
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Mutex Implementation. ++\*****************************************************************************/ ++ ++#include ++ ++#ifdef DEBUG_SUBSYSTEM ++#undef DEBUG_SUBSYSTEM ++#endif ++ ++#define DEBUG_SUBSYSTEM S_MUTEX ++ ++/* ++ * While a standard mutex implementation has been available in the kernel ++ * for quite some time. It was not until 2.6.29 and latter kernels that ++ * adaptive mutexs were embraced and integrated with the scheduler. This ++ * brought a significant performance improvement, but just as importantly ++ * it added a lock owner to the generic mutex outside CONFIG_DEBUG_MUTEXES ++ * builds. This is critical for correctly supporting the mutex_owner() ++ * Solaris primitive. When the owner is available we use a pure Linux ++ * mutex implementation. When the owner is not available we still use ++ * Linux mutexs as a base but also reserve space for an owner field right ++ * after the mutex structure. ++ * ++ * In the case when HAVE_MUTEX_OWNER is not defined your code may ++ * still me able to leverage adaptive mutexs. As long as the task_curr() ++ * symbol is exported this code will provide a poor mans adaptive mutex ++ * implementation. However, this is not required and if the symbol is ++ * unavailable we provide a standard mutex. ++ */ ++ ++#if !defined(HAVE_MUTEX_OWNER) || !defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES) ++#ifdef HAVE_TASK_CURR ++/* ++ * mutex_spin_max = { 0, -1, 1-MAX_INT } ++ * 0: Never spin when trying to acquire lock ++ * -1: Spin until acquired or holder yields without dropping lock ++ * 1-MAX_INT: Spin for N attempts before sleeping for lock ++ */ ++int mutex_spin_max = 0; ++module_param(mutex_spin_max, int, 0644); ++MODULE_PARM_DESC(mutex_spin_max, "Spin a maximum of N times to acquire lock"); ++ ++int ++spl_mutex_spin_max(void) ++{ ++ return mutex_spin_max; ++} ++EXPORT_SYMBOL(spl_mutex_spin_max); ++ ++#endif /* HAVE_TASK_CURR */ ++#endif /* !HAVE_MUTEX_OWNER */ ++ ++int spl_mutex_init(void) { return 0; } ++void spl_mutex_fini(void) { } +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-proc.c linux-3.2.33-go/spl/spl/spl-proc.c +--- linux-3.2.33-go.orig/spl/spl/spl-proc.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-proc.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,1219 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
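/*
 * Illustrative sketch only - not the SPL implementation (the real adaptive
 * logic lives in the mutex headers): one way spl_mutex_spin_max() and
 * task_curr() could drive a spin-then-sleep acquire when the kernel does
 * not expose the mutex owner.  The explicit owner bookkeeping shown here
 * is an assumption made purely for the example.
 */
static void
example_adaptive_enter(struct mutex *mp, struct task_struct **owner)
{
	int i, max = spl_mutex_spin_max();

	for (i = 0; (max == -1) || (i < max); i++) {
		if (mutex_trylock(mp))
			goto acquired;

		/* Stop spinning once the holder is no longer on a CPU. */
		if ((*owner == NULL) || !task_curr(*owner))
			break;

		cpu_relax();
	}

	mutex_lock(mp);			/* fall back to sleeping */
acquired:
	*owner = current;
}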
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Proc Implementation. ++\*****************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef SS_DEBUG_SUBSYS ++#undef SS_DEBUG_SUBSYS ++#endif ++ ++#define SS_DEBUG_SUBSYS SS_PROC ++ ++#ifdef DEBUG_KMEM ++static unsigned long table_min = 0; ++static unsigned long table_max = ~0; ++#endif ++ ++#ifdef CONFIG_SYSCTL ++static struct ctl_table_header *spl_header = NULL; ++#endif /* CONFIG_SYSCTL */ ++ ++static struct proc_dir_entry *proc_spl = NULL; ++#ifdef DEBUG_KMEM ++static struct proc_dir_entry *proc_spl_kmem = NULL; ++static struct proc_dir_entry *proc_spl_kmem_slab = NULL; ++#endif /* DEBUG_KMEM */ ++struct proc_dir_entry *proc_spl_kstat = NULL; ++ ++#ifdef HAVE_CTL_NAME ++#ifdef HAVE_CTL_UNNUMBERED ++ ++#define CTL_SPL CTL_UNNUMBERED ++#define CTL_SPL_DEBUG CTL_UNNUMBERED ++#define CTL_SPL_VM CTL_UNNUMBERED ++#define CTL_SPL_MUTEX CTL_UNNUMBERED ++#define CTL_SPL_KMEM CTL_UNNUMBERED ++#define CTL_SPL_KSTAT CTL_UNNUMBERED ++ ++#define CTL_VERSION CTL_UNNUMBERED /* Version */ ++#define CTL_HOSTID CTL_UNNUMBERED /* Host id by /usr/bin/hostid */ ++#define CTL_HW_SERIAL CTL_UNNUMBERED /* HW serial number by hostid */ ++#define CTL_KALLSYMS CTL_UNNUMBERED /* kallsyms_lookup_name addr */ ++ ++#define CTL_DEBUG_SUBSYS CTL_UNNUMBERED /* Debug subsystem */ ++#define CTL_DEBUG_MASK CTL_UNNUMBERED /* Debug mask */ ++#define CTL_DEBUG_PRINTK CTL_UNNUMBERED /* All messages to console */ ++#define CTL_DEBUG_MB CTL_UNNUMBERED /* Debug buffer size */ ++#define CTL_DEBUG_BINARY CTL_UNNUMBERED /* Binary data in buffer */ ++#define CTL_DEBUG_CATASTROPHE CTL_UNNUMBERED /* Set if BUG'd or panic'd */ ++#define CTL_DEBUG_PANIC_ON_BUG CTL_UNNUMBERED /* Should panic on BUG */ ++#define CTL_DEBUG_PATH CTL_UNNUMBERED /* Dump log location */ ++#define CTL_DEBUG_DUMP CTL_UNNUMBERED /* Dump debug buffer to file */ ++#define CTL_DEBUG_FORCE_BUG CTL_UNNUMBERED /* Hook to force a BUG */ ++#define CTL_DEBUG_STACK_SIZE CTL_UNNUMBERED /* Max observed stack size */ ++ ++#define CTL_CONSOLE_RATELIMIT CTL_UNNUMBERED /* Ratelimit console messages */ ++#define CTL_CONSOLE_MAX_DELAY_CS CTL_UNNUMBERED /* Max delay skip messages */ ++#define CTL_CONSOLE_MIN_DELAY_CS CTL_UNNUMBERED /* Init delay skip messages */ ++#define CTL_CONSOLE_BACKOFF CTL_UNNUMBERED /* Delay increase factor */ ++ ++#define CTL_VM_MINFREE CTL_UNNUMBERED /* Minimum free memory */ ++#define CTL_VM_DESFREE CTL_UNNUMBERED /* Desired free memory */ ++#define CTL_VM_LOTSFREE CTL_UNNUMBERED /* Lots of free memory */ ++#define CTL_VM_NEEDFREE CTL_UNNUMBERED /* Need free memory */ ++#define CTL_VM_SWAPFS_MINFREE CTL_UNNUMBERED /* Minimum swapfs memory */ ++#define CTL_VM_SWAPFS_RESERVE CTL_UNNUMBERED /* Reserved swapfs memory */ ++#define 
CTL_VM_AVAILRMEM CTL_UNNUMBERED /* Easily available memory */ ++#define CTL_VM_FREEMEM CTL_UNNUMBERED /* Free memory */ ++#define CTL_VM_PHYSMEM CTL_UNNUMBERED /* Total physical memory */ ++ ++#ifdef DEBUG_KMEM ++#define CTL_KMEM_KMEMUSED CTL_UNNUMBERED /* Alloc'd kmem bytes */ ++#define CTL_KMEM_KMEMMAX CTL_UNNUMBERED /* Max alloc'd by kmem bytes */ ++#define CTL_KMEM_VMEMUSED CTL_UNNUMBERED /* Alloc'd vmem bytes */ ++#define CTL_KMEM_VMEMMAX CTL_UNNUMBERED /* Max alloc'd by vmem bytes */ ++#define CTL_KMEM_SLAB_KMEMTOTAL CTL_UNNUMBERED /* Total kmem slab size */ ++#define CTL_KMEM_SLAB_KMEMALLOC CTL_UNNUMBERED /* Alloc'd kmem slab size */ ++#define CTL_KMEM_SLAB_KMEMMAX CTL_UNNUMBERED /* Max kmem slab size */ ++#define CTL_KMEM_SLAB_VMEMTOTAL CTL_UNNUMBERED /* Total vmem slab size */ ++#define CTL_KMEM_SLAB_VMEMALLOC CTL_UNNUMBERED /* Alloc'd vmem slab size */ ++#define CTL_KMEM_SLAB_VMEMMAX CTL_UNNUMBERED /* Max vmem slab size */ ++#endif ++ ++#else /* HAVE_CTL_UNNUMBERED */ ++ ++enum { ++ CTL_SPL = 0x87, ++ CTL_SPL_DEBUG = 0x88, ++ CTL_SPL_VM = 0x89, ++ CTL_SPL_MUTEX = 0x90, ++ CTL_SPL_KMEM = 0x91, ++ CTL_SPL_KSTAT = 0x92, ++}; ++ ++enum { ++ CTL_VERSION = 1, /* Version */ ++ CTL_HOSTID, /* Host id reported by /usr/bin/hostid */ ++ CTL_HW_SERIAL, /* Hardware serial number from hostid */ ++ CTL_KALLSYMS, /* Address of kallsyms_lookup_name */ ++ ++#ifdef DEBUG_LOG ++ CTL_DEBUG_SUBSYS, /* Debug subsystem */ ++ CTL_DEBUG_MASK, /* Debug mask */ ++ CTL_DEBUG_PRINTK, /* Force all messages to console */ ++ CTL_DEBUG_MB, /* Debug buffer size */ ++ CTL_DEBUG_BINARY, /* Include binary data in buffer */ ++ CTL_DEBUG_CATASTROPHE, /* Set if we have BUG'd or panic'd */ ++ CTL_DEBUG_PANIC_ON_BUG, /* Set if we should panic on BUG */ ++ CTL_DEBUG_PATH, /* Dump log location */ ++ CTL_DEBUG_DUMP, /* Dump debug buffer to file */ ++ CTL_DEBUG_FORCE_BUG, /* Hook to force a BUG */ ++ CTL_DEBUG_STACK_SIZE, /* Max observed stack size */ ++#endif ++ ++ CTL_CONSOLE_RATELIMIT, /* Ratelimit console messages */ ++ CTL_CONSOLE_MAX_DELAY_CS, /* Max delay which we skip messages */ ++ CTL_CONSOLE_MIN_DELAY_CS, /* Init delay which we skip messages */ ++ CTL_CONSOLE_BACKOFF, /* Delay increase factor */ ++ ++ CTL_VM_MINFREE, /* Minimum free memory threshold */ ++ CTL_VM_DESFREE, /* Desired free memory threshold */ ++ CTL_VM_LOTSFREE, /* Lots of free memory threshold */ ++ CTL_VM_NEEDFREE, /* Need free memory deficit */ ++ CTL_VM_SWAPFS_MINFREE, /* Minimum swapfs memory */ ++ CTL_VM_SWAPFS_RESERVE, /* Reserved swapfs memory */ ++ CTL_VM_AVAILRMEM, /* Easily available memory */ ++ CTL_VM_FREEMEM, /* Free memory */ ++ CTL_VM_PHYSMEM, /* Total physical memory */ ++ ++#ifdef DEBUG_KMEM ++ CTL_KMEM_KMEMUSED, /* Alloc'd kmem bytes */ ++ CTL_KMEM_KMEMMAX, /* Max alloc'd by kmem bytes */ ++ CTL_KMEM_VMEMUSED, /* Alloc'd vmem bytes */ ++ CTL_KMEM_VMEMMAX, /* Max alloc'd by vmem bytes */ ++ CTL_KMEM_SLAB_KMEMTOTAL, /* Total kmem slab size */ ++ CTL_KMEM_SLAB_KMEMALLOC, /* Alloc'd kmem slab size */ ++ CTL_KMEM_SLAB_KMEMMAX, /* Max kmem slab size */ ++ CTL_KMEM_SLAB_VMEMTOTAL, /* Total vmem slab size */ ++ CTL_KMEM_SLAB_VMEMALLOC, /* Alloc'd vmem slab size */ ++ CTL_KMEM_SLAB_VMEMMAX, /* Max vmem slab size */ ++#endif ++}; ++#endif /* HAVE_CTL_UNNUMBERED */ ++#endif /* HAVE_CTL_NAME */ ++ ++static int ++proc_copyin_string(char *kbuffer, int kbuffer_size, ++ const char *ubuffer, int ubuffer_size) ++{ ++ int size; ++ ++ if (ubuffer_size > kbuffer_size) ++ return -EOVERFLOW; ++ ++ if (copy_from_user((void *)kbuffer, (void *)ubuffer, 
ubuffer_size)) ++ return -EFAULT; ++ ++ /* strip trailing whitespace */ ++ size = strnlen(kbuffer, ubuffer_size); ++ while (size-- >= 0) ++ if (!isspace(kbuffer[size])) ++ break; ++ ++ /* empty string */ ++ if (size < 0) ++ return -EINVAL; ++ ++ /* no space to terminate */ ++ if (size == kbuffer_size) ++ return -EOVERFLOW; ++ ++ kbuffer[size + 1] = 0; ++ return 0; ++} ++ ++static int ++proc_copyout_string(char *ubuffer, int ubuffer_size, ++ const char *kbuffer, char *append) ++{ ++ /* NB if 'append' != NULL, it's a single character to append to the ++ * copied out string - usually "\n", for /proc entries and ++ * (i.e. a terminating zero byte) for sysctl entries ++ */ ++ int size = MIN(strlen(kbuffer), ubuffer_size); ++ ++ if (copy_to_user(ubuffer, kbuffer, size)) ++ return -EFAULT; ++ ++ if (append != NULL && size < ubuffer_size) { ++ if (copy_to_user(ubuffer + size, append, 1)) ++ return -EFAULT; ++ ++ size++; ++ } ++ ++ return size; ++} ++ ++#ifdef DEBUG_LOG ++SPL_PROC_HANDLER(proc_dobitmasks) ++{ ++ unsigned long *mask = table->data; ++ int is_subsys = (mask == &spl_debug_subsys) ? 1 : 0; ++ int is_printk = (mask == &spl_debug_printk) ? 1 : 0; ++ int size = 512, rc; ++ char *str; ++ SENTRY; ++ ++ str = kmem_alloc(size, KM_SLEEP); ++ if (str == NULL) ++ SRETURN(-ENOMEM); ++ ++ if (write) { ++ rc = proc_copyin_string(str, size, buffer, *lenp); ++ if (rc < 0) ++ SRETURN(rc); ++ ++ rc = spl_debug_str2mask(mask, str, is_subsys); ++ /* Always print BUG/ASSERT to console, so keep this mask */ ++ if (is_printk) ++ *mask |= SD_EMERG; ++ ++ *ppos += *lenp; ++ } else { ++ rc = spl_debug_mask2str(str, size, *mask, is_subsys); ++ if (*ppos >= rc) ++ rc = 0; ++ else ++ rc = proc_copyout_string(buffer, *lenp, ++ str + *ppos, "\n"); ++ if (rc >= 0) { ++ *lenp = rc; ++ *ppos += rc; ++ } ++ } ++ ++ kmem_free(str, size); ++ SRETURN(rc); ++} ++ ++SPL_PROC_HANDLER(proc_debug_mb) ++{ ++ char str[32]; ++ int rc, len; ++ SENTRY; ++ ++ if (write) { ++ rc = proc_copyin_string(str, sizeof(str), buffer, *lenp); ++ if (rc < 0) ++ SRETURN(rc); ++ ++ rc = spl_debug_set_mb(simple_strtoul(str, NULL, 0)); ++ *ppos += *lenp; ++ } else { ++ len = snprintf(str, sizeof(str), "%d", spl_debug_get_mb()); ++ if (*ppos >= len) ++ rc = 0; ++ else ++ rc = proc_copyout_string(buffer,*lenp,str+*ppos,"\n"); ++ ++ if (rc >= 0) { ++ *lenp = rc; ++ *ppos += rc; ++ } ++ } ++ ++ SRETURN(rc); ++} ++ ++SPL_PROC_HANDLER(proc_dump_kernel) ++{ ++ SENTRY; ++ ++ if (write) { ++ spl_debug_dumplog(0); ++ *ppos += *lenp; ++ } else { ++ *lenp = 0; ++ } ++ ++ SRETURN(0); ++} ++ ++SPL_PROC_HANDLER(proc_force_bug) ++{ ++ SENTRY; ++ ++ if (write) ++ PANIC("Crashing due to forced panic\n"); ++ else ++ *lenp = 0; ++ ++ SRETURN(0); ++} ++ ++SPL_PROC_HANDLER(proc_console_max_delay_cs) ++{ ++ int rc, max_delay_cs; ++ struct ctl_table dummy = *table; ++ long d; ++ SENTRY; ++ ++ dummy.data = &max_delay_cs; ++ dummy.proc_handler = &proc_dointvec; ++ ++ if (write) { ++ max_delay_cs = 0; ++ rc = spl_proc_dointvec(&dummy,write,filp,buffer,lenp,ppos); ++ if (rc < 0) ++ SRETURN(rc); ++ ++ if (max_delay_cs <= 0) ++ SRETURN(-EINVAL); ++ ++ d = (max_delay_cs * HZ) / 100; ++ if (d == 0 || d < spl_console_min_delay) ++ SRETURN(-EINVAL); ++ ++ spl_console_max_delay = d; ++ } else { ++ max_delay_cs = (spl_console_max_delay * 100) / HZ; ++ rc = spl_proc_dointvec(&dummy,write,filp,buffer,lenp,ppos); ++ } ++ ++ SRETURN(rc); ++} ++ ++SPL_PROC_HANDLER(proc_console_min_delay_cs) ++{ ++ int rc, min_delay_cs; ++ struct ctl_table dummy = *table; ++ long d; ++ SENTRY; ++ ++ 
dummy.data = &min_delay_cs; ++ dummy.proc_handler = &proc_dointvec; ++ ++ if (write) { ++ min_delay_cs = 0; ++ rc = spl_proc_dointvec(&dummy,write,filp,buffer,lenp,ppos); ++ if (rc < 0) ++ SRETURN(rc); ++ ++ if (min_delay_cs <= 0) ++ SRETURN(-EINVAL); ++ ++ d = (min_delay_cs * HZ) / 100; ++ if (d == 0 || d > spl_console_max_delay) ++ SRETURN(-EINVAL); ++ ++ spl_console_min_delay = d; ++ } else { ++ min_delay_cs = (spl_console_min_delay * 100) / HZ; ++ rc = spl_proc_dointvec(&dummy,write,filp,buffer,lenp,ppos); ++ } ++ ++ SRETURN(rc); ++} ++ ++SPL_PROC_HANDLER(proc_console_backoff) ++{ ++ int rc, backoff; ++ struct ctl_table dummy = *table; ++ SENTRY; ++ ++ dummy.data = &backoff; ++ dummy.proc_handler = &proc_dointvec; ++ ++ if (write) { ++ backoff = 0; ++ rc = spl_proc_dointvec(&dummy,write,filp,buffer,lenp,ppos); ++ if (rc < 0) ++ SRETURN(rc); ++ ++ if (backoff <= 0) ++ SRETURN(-EINVAL); ++ ++ spl_console_backoff = backoff; ++ } else { ++ backoff = spl_console_backoff; ++ rc = spl_proc_dointvec(&dummy,write,filp,buffer,lenp,ppos); ++ } ++ ++ SRETURN(rc); ++} ++#endif /* DEBUG_LOG */ ++ ++#ifdef DEBUG_KMEM ++SPL_PROC_HANDLER(proc_domemused) ++{ ++ int rc = 0; ++ unsigned long min = 0, max = ~0, val; ++ struct ctl_table dummy = *table; ++ SENTRY; ++ ++ dummy.data = &val; ++ dummy.proc_handler = &proc_dointvec; ++ dummy.extra1 = &min; ++ dummy.extra2 = &max; ++ ++ if (write) { ++ *ppos += *lenp; ++ } else { ++# ifdef HAVE_ATOMIC64_T ++ val = atomic64_read((atomic64_t *)table->data); ++# else ++ val = atomic_read((atomic_t *)table->data); ++# endif /* HAVE_ATOMIC64_T */ ++ rc = spl_proc_doulongvec_minmax(&dummy, write, filp, ++ buffer, lenp, ppos); ++ } ++ ++ SRETURN(rc); ++} ++ ++SPL_PROC_HANDLER(proc_doslab) ++{ ++ int rc = 0; ++ unsigned long min = 0, max = ~0, val = 0, mask; ++ struct ctl_table dummy = *table; ++ spl_kmem_cache_t *skc; ++ SENTRY; ++ ++ dummy.data = &val; ++ dummy.proc_handler = &proc_dointvec; ++ dummy.extra1 = &min; ++ dummy.extra2 = &max; ++ ++ if (write) { ++ *ppos += *lenp; ++ } else { ++ down_read(&spl_kmem_cache_sem); ++ mask = (unsigned long)table->data; ++ ++ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { ++ ++ /* Only use slabs of the correct kmem/vmem type */ ++ if (!(skc->skc_flags & mask)) ++ continue; ++ ++ /* Sum the specified field for selected slabs */ ++ switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) { ++ case KMC_TOTAL: ++ val += skc->skc_slab_size * skc->skc_slab_total; ++ break; ++ case KMC_ALLOC: ++ val += skc->skc_obj_size * skc->skc_obj_alloc; ++ break; ++ case KMC_MAX: ++ val += skc->skc_obj_size * skc->skc_obj_max; ++ break; ++ } ++ } ++ ++ up_read(&spl_kmem_cache_sem); ++ rc = spl_proc_doulongvec_minmax(&dummy, write, filp, ++ buffer, lenp, ppos); ++ } ++ ++ SRETURN(rc); ++} ++#endif /* DEBUG_KMEM */ ++ ++SPL_PROC_HANDLER(proc_dohostid) ++{ ++ int len, rc = 0; ++ char *end, str[32]; ++ SENTRY; ++ ++ if (write) { ++ /* We can't use spl_proc_doulongvec_minmax() in the write ++ * case here because hostid while a hex value has no ++ * leading 0x which confuses the helper function. 
*/ ++ rc = proc_copyin_string(str, sizeof(str), buffer, *lenp); ++ if (rc < 0) ++ SRETURN(rc); ++ ++ spl_hostid = simple_strtoul(str, &end, 16); ++ if (str == end) ++ SRETURN(-EINVAL); ++ ++ (void) snprintf(hw_serial, HW_HOSTID_LEN, "%lu", spl_hostid); ++ hw_serial[HW_HOSTID_LEN - 1] = '\0'; ++ *ppos += *lenp; ++ } else { ++ len = snprintf(str, sizeof(str), "%lx", spl_hostid); ++ if (*ppos >= len) ++ rc = 0; ++ else ++ rc = proc_copyout_string(buffer,*lenp,str+*ppos,"\n"); ++ ++ if (rc >= 0) { ++ *lenp = rc; ++ *ppos += rc; ++ } ++ } ++ ++ SRETURN(rc); ++} ++ ++#ifndef HAVE_KALLSYMS_LOOKUP_NAME ++SPL_PROC_HANDLER(proc_dokallsyms_lookup_name) ++{ ++ int len, rc = 0; ++ char *end, str[32]; ++ SENTRY; ++ ++ if (write) { ++ /* This may only be set once at module load time */ ++ if (spl_kallsyms_lookup_name_fn != SYMBOL_POISON) ++ SRETURN(-EEXIST); ++ ++ /* We can't use spl_proc_doulongvec_minmax() in the write ++ * case here because the address while a hex value has no ++ * leading 0x which confuses the helper function. */ ++ rc = proc_copyin_string(str, sizeof(str), buffer, *lenp); ++ if (rc < 0) ++ SRETURN(rc); ++ ++ spl_kallsyms_lookup_name_fn = ++ (kallsyms_lookup_name_t)simple_strtoul(str, &end, 16); ++ if (str == end) ++ SRETURN(-EINVAL); ++ ++ *ppos += *lenp; ++ } else { ++ len = snprintf(str, sizeof(str), "%lx", ++ (unsigned long)spl_kallsyms_lookup_name_fn); ++ if (*ppos >= len) ++ rc = 0; ++ else ++ rc = proc_copyout_string(buffer,*lenp,str+*ppos,"\n"); ++ ++ if (rc >= 0) { ++ *lenp = rc; ++ *ppos += rc; ++ } ++ } ++ ++ SRETURN(rc); ++} ++#endif /* HAVE_KALLSYMS_LOOKUP_NAME */ ++ ++SPL_PROC_HANDLER(proc_doavailrmem) ++{ ++ int len, rc = 0; ++ char str[32]; ++ SENTRY; ++ ++ if (write) { ++ *ppos += *lenp; ++ } else { ++ len = snprintf(str, sizeof(str), "%lu", ++ (unsigned long)availrmem); ++ if (*ppos >= len) ++ rc = 0; ++ else ++ rc = proc_copyout_string(buffer,*lenp,str+*ppos,"\n"); ++ ++ if (rc >= 0) { ++ *lenp = rc; ++ *ppos += rc; ++ } ++ } ++ ++ SRETURN(rc); ++} ++ ++SPL_PROC_HANDLER(proc_dofreemem) ++{ ++ int len, rc = 0; ++ char str[32]; ++ SENTRY; ++ ++ if (write) { ++ *ppos += *lenp; ++ } else { ++ len = snprintf(str, sizeof(str), "%lu", (unsigned long)freemem); ++ if (*ppos >= len) ++ rc = 0; ++ else ++ rc = proc_copyout_string(buffer,*lenp,str+*ppos,"\n"); ++ ++ if (rc >= 0) { ++ *lenp = rc; ++ *ppos += rc; ++ } ++ } ++ ++ SRETURN(rc); ++} ++ ++#ifdef DEBUG_KMEM ++static void ++slab_seq_show_headers(struct seq_file *f) ++{ ++ seq_printf(f, ++ "--------------------- cache ----------" ++ "--------------------------------------------- " ++ "----- slab ------ " ++ "---- object ----- " ++ "--- emergency ---\n"); ++ seq_printf(f, ++ "name " ++ " flags size alloc slabsize objsize " ++ "total alloc max " ++ "total alloc max " ++ "dlock alloc max\n"); ++} ++ ++static int ++slab_seq_show(struct seq_file *f, void *p) ++{ ++ spl_kmem_cache_t *skc = p; ++ ++ ASSERT(skc->skc_magic == SKC_MAGIC); ++ ++ spin_lock(&skc->skc_lock); ++ seq_printf(f, "%-36s ", skc->skc_name); ++ seq_printf(f, "0x%05lx %9lu %9lu %8u %8u " ++ "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n", ++ (long unsigned)skc->skc_flags, ++ (long unsigned)(skc->skc_slab_size * skc->skc_slab_total), ++ (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc), ++ (unsigned)skc->skc_slab_size, ++ (unsigned)skc->skc_obj_size, ++ (long unsigned)skc->skc_slab_total, ++ (long unsigned)skc->skc_slab_alloc, ++ (long unsigned)skc->skc_slab_max, ++ (long unsigned)skc->skc_obj_total, ++ (long unsigned)skc->skc_obj_alloc, ++ (long 
unsigned)skc->skc_obj_max, ++ (long unsigned)skc->skc_obj_deadlock, ++ (long unsigned)skc->skc_obj_emergency, ++ (long unsigned)skc->skc_obj_emergency_max); ++ ++ spin_unlock(&skc->skc_lock); ++ ++ return 0; ++} ++ ++static void * ++slab_seq_start(struct seq_file *f, loff_t *pos) ++{ ++ struct list_head *p; ++ loff_t n = *pos; ++ SENTRY; ++ ++ down_read(&spl_kmem_cache_sem); ++ if (!n) ++ slab_seq_show_headers(f); ++ ++ p = spl_kmem_cache_list.next; ++ while (n--) { ++ p = p->next; ++ if (p == &spl_kmem_cache_list) ++ SRETURN(NULL); ++ } ++ ++ SRETURN(list_entry(p, spl_kmem_cache_t, skc_list)); ++} ++ ++static void * ++slab_seq_next(struct seq_file *f, void *p, loff_t *pos) ++{ ++ spl_kmem_cache_t *skc = p; ++ SENTRY; ++ ++ ++*pos; ++ SRETURN((skc->skc_list.next == &spl_kmem_cache_list) ? ++ NULL : list_entry(skc->skc_list.next,spl_kmem_cache_t,skc_list)); ++} ++ ++static void ++slab_seq_stop(struct seq_file *f, void *v) ++{ ++ up_read(&spl_kmem_cache_sem); ++} ++ ++static struct seq_operations slab_seq_ops = { ++ .show = slab_seq_show, ++ .start = slab_seq_start, ++ .next = slab_seq_next, ++ .stop = slab_seq_stop, ++}; ++ ++static int ++proc_slab_open(struct inode *inode, struct file *filp) ++{ ++ return seq_open(filp, &slab_seq_ops); ++} ++ ++static struct file_operations proc_slab_operations = { ++ .open = proc_slab_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++#endif /* DEBUG_KMEM */ ++ ++#ifdef DEBUG_LOG ++static struct ctl_table spl_debug_table[] = { ++ { ++ CTL_NAME (CTL_DEBUG_SUBSYS) ++ .procname = "subsystem", ++ .data = &spl_debug_subsys, ++ .maxlen = sizeof(unsigned long), ++ .mode = 0644, ++ .proc_handler = &proc_dobitmasks ++ }, ++ { ++ CTL_NAME (CTL_DEBUG_MASK) ++ .procname = "mask", ++ .data = &spl_debug_mask, ++ .maxlen = sizeof(unsigned long), ++ .mode = 0644, ++ .proc_handler = &proc_dobitmasks ++ }, ++ { ++ CTL_NAME (CTL_DEBUG_PRINTK) ++ .procname = "printk", ++ .data = &spl_debug_printk, ++ .maxlen = sizeof(unsigned long), ++ .mode = 0644, ++ .proc_handler = &proc_dobitmasks ++ }, ++ { ++ CTL_NAME (CTL_DEBUG_MB) ++ .procname = "mb", ++ .mode = 0644, ++ .proc_handler = &proc_debug_mb, ++ }, ++ { ++ CTL_NAME (CTL_DEBUG_BINARY) ++ .procname = "binary", ++ .data = &spl_debug_binary, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ CTL_NAME (CTL_DEBUG_CATASTROPHE) ++ .procname = "catastrophe", ++ .data = &spl_debug_catastrophe, ++ .maxlen = sizeof(int), ++ .mode = 0444, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ CTL_NAME (CTL_DEBUG_PANIC_ON_BUG) ++ .procname = "panic_on_bug", ++ .data = &spl_debug_panic_on_bug, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { ++ CTL_NAME (CTL_DEBUG_PATH) ++ .procname = "path", ++ .data = spl_debug_file_path, ++ .maxlen = sizeof(spl_debug_file_path), ++ .mode = 0644, ++ .proc_handler = &proc_dostring, ++ }, ++ { ++ CTL_NAME (CTL_DEBUG_DUMP) ++ .procname = "dump", ++ .mode = 0200, ++ .proc_handler = &proc_dump_kernel, ++ }, ++ { CTL_NAME (CTL_DEBUG_FORCE_BUG) ++ .procname = "force_bug", ++ .mode = 0200, ++ .proc_handler = &proc_force_bug, ++ }, ++ { ++ CTL_NAME (CTL_CONSOLE_RATELIMIT) ++ .procname = "console_ratelimit", ++ .data = &spl_console_ratelimit, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ CTL_NAME (CTL_CONSOLE_MAX_DELAY_CS) ++ .procname = "console_max_delay_centisecs", ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_console_max_delay_cs, ++ }, 
++ { ++ CTL_NAME (CTL_CONSOLE_MIN_DELAY_CS) ++ .procname = "console_min_delay_centisecs", ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_console_min_delay_cs, ++ }, ++ { ++ CTL_NAME (CTL_CONSOLE_BACKOFF) ++ .procname = "console_backoff", ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_console_backoff, ++ }, ++ { ++ CTL_NAME (CTL_DEBUG_STACK_SIZE) ++ .procname = "stack_max", ++ .data = &spl_debug_stack, ++ .maxlen = sizeof(int), ++ .mode = 0444, ++ .proc_handler = &proc_dointvec, ++ }, ++ {0}, ++}; ++#endif /* DEBUG_LOG */ ++ ++static struct ctl_table spl_vm_table[] = { ++ { ++ CTL_NAME (CTL_VM_MINFREE) ++ .procname = "minfree", ++ .data = &minfree, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ CTL_NAME (CTL_VM_DESFREE) ++ .procname = "desfree", ++ .data = &desfree, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ CTL_NAME (CTL_VM_LOTSFREE) ++ .procname = "lotsfree", ++ .data = &lotsfree, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ CTL_NAME (CTL_VM_NEEDFREE) ++ .procname = "needfree", ++ .data = &needfree, ++ .maxlen = sizeof(int), ++ .mode = 0444, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ CTL_NAME (CTL_VM_SWAPFS_MINFREE) ++ .procname = "swapfs_minfree", ++ .data = &swapfs_minfree, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ CTL_NAME (CTL_VM_SWAPFS_RESERVE) ++ .procname = "swapfs_reserve", ++ .data = &swapfs_reserve, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ CTL_NAME (CTL_VM_AVAILRMEM) ++ .procname = "availrmem", ++ .mode = 0444, ++ .proc_handler = &proc_doavailrmem, ++ }, ++ { ++ CTL_NAME (CTL_VM_FREEMEM) ++ .procname = "freemem", ++ .data = (void *)2, ++ .maxlen = sizeof(int), ++ .mode = 0444, ++ .proc_handler = &proc_dofreemem, ++ }, ++ { ++ CTL_NAME (CTL_VM_PHYSMEM) ++ .procname = "physmem", ++ .data = &physmem, ++ .maxlen = sizeof(int), ++ .mode = 0444, ++ .proc_handler = &proc_dointvec, ++ }, ++ {0}, ++}; ++ ++#ifdef DEBUG_KMEM ++static struct ctl_table spl_kmem_table[] = { ++ { ++ CTL_NAME (CTL_KMEM_KMEMUSED) ++ .procname = "kmem_used", ++ .data = &kmem_alloc_used, ++# ifdef HAVE_ATOMIC64_T ++ .maxlen = sizeof(atomic64_t), ++# else ++ .maxlen = sizeof(atomic_t), ++# endif /* HAVE_ATOMIC64_T */ ++ .mode = 0444, ++ .proc_handler = &proc_domemused, ++ }, ++ { ++ CTL_NAME (CTL_KMEM_KMEMMAX) ++ .procname = "kmem_max", ++ .data = &kmem_alloc_max, ++ .maxlen = sizeof(unsigned long), ++ .extra1 = &table_min, ++ .extra2 = &table_max, ++ .mode = 0444, ++ .proc_handler = &proc_doulongvec_minmax, ++ }, ++ { ++ CTL_NAME (CTL_KMEM_VMEMUSED) ++ .procname = "vmem_used", ++ .data = &vmem_alloc_used, ++# ifdef HAVE_ATOMIC64_T ++ .maxlen = sizeof(atomic64_t), ++# else ++ .maxlen = sizeof(atomic_t), ++# endif /* HAVE_ATOMIC64_T */ ++ .mode = 0444, ++ .proc_handler = &proc_domemused, ++ }, ++ { ++ CTL_NAME (CTL_KMEM_VMEMMAX) ++ .procname = "vmem_max", ++ .data = &vmem_alloc_max, ++ .maxlen = sizeof(unsigned long), ++ .extra1 = &table_min, ++ .extra2 = &table_max, ++ .mode = 0444, ++ .proc_handler = &proc_doulongvec_minmax, ++ }, ++ { ++ CTL_NAME (CTL_KMEM_SLAB_KMEMTOTAL) ++ .procname = "slab_kmem_total", ++ .data = (void *)(KMC_KMEM | KMC_TOTAL), ++ .maxlen = sizeof(unsigned long), ++ .extra1 = &table_min, ++ .extra2 = &table_max, ++ .mode = 0444, ++ .proc_handler = &proc_doslab, ++ }, ++ { ++ CTL_NAME 
(CTL_KMEM_SLAB_KMEMALLOC) ++ .procname = "slab_kmem_alloc", ++ .data = (void *)(KMC_KMEM | KMC_ALLOC), ++ .maxlen = sizeof(unsigned long), ++ .extra1 = &table_min, ++ .extra2 = &table_max, ++ .mode = 0444, ++ .proc_handler = &proc_doslab, ++ }, ++ { ++ CTL_NAME (CTL_KMEM_SLAB_KMEMMAX) ++ .procname = "slab_kmem_max", ++ .data = (void *)(KMC_KMEM | KMC_MAX), ++ .maxlen = sizeof(unsigned long), ++ .extra1 = &table_min, ++ .extra2 = &table_max, ++ .mode = 0444, ++ .proc_handler = &proc_doslab, ++ }, ++ { ++ CTL_NAME (CTL_KMEM_SLAB_VMEMTOTAL) ++ .procname = "slab_vmem_total", ++ .data = (void *)(KMC_VMEM | KMC_TOTAL), ++ .maxlen = sizeof(unsigned long), ++ .extra1 = &table_min, ++ .extra2 = &table_max, ++ .mode = 0444, ++ .proc_handler = &proc_doslab, ++ }, ++ { ++ CTL_NAME (CTL_KMEM_SLAB_VMEMALLOC) ++ .procname = "slab_vmem_alloc", ++ .data = (void *)(KMC_VMEM | KMC_ALLOC), ++ .maxlen = sizeof(unsigned long), ++ .extra1 = &table_min, ++ .extra2 = &table_max, ++ .mode = 0444, ++ .proc_handler = &proc_doslab, ++ }, ++ { ++ CTL_NAME (CTL_KMEM_SLAB_VMEMMAX) ++ .procname = "slab_vmem_max", ++ .data = (void *)(KMC_VMEM | KMC_MAX), ++ .maxlen = sizeof(unsigned long), ++ .extra1 = &table_min, ++ .extra2 = &table_max, ++ .mode = 0444, ++ .proc_handler = &proc_doslab, ++ }, ++ {0}, ++}; ++#endif /* DEBUG_KMEM */ ++ ++static struct ctl_table spl_kstat_table[] = { ++ {0}, ++}; ++ ++static struct ctl_table spl_table[] = { ++ /* NB No .strategy entries have been provided since ++ * sysctl(8) prefers to go via /proc for portability. ++ */ ++ { ++ CTL_NAME (CTL_VERSION) ++ .procname = "version", ++ .data = spl_version, ++ .maxlen = sizeof(spl_version), ++ .mode = 0444, ++ .proc_handler = &proc_dostring, ++ }, ++ { ++ CTL_NAME (CTL_HOSTID) ++ .procname = "hostid", ++ .data = &spl_hostid, ++ .maxlen = sizeof(unsigned long), ++ .mode = 0644, ++ .proc_handler = &proc_dohostid, ++ }, ++ { ++ CTL_NAME (CTL_HW_SERIAL) ++ .procname = "hw_serial", ++ .data = hw_serial, ++ .maxlen = sizeof(hw_serial), ++ .mode = 0444, ++ .proc_handler = &proc_dostring, ++ }, ++#ifndef HAVE_KALLSYMS_LOOKUP_NAME ++ { ++ CTL_NAME (CTL_KALLSYMS) ++ .procname = "kallsyms_lookup_name", ++ .data = &spl_kallsyms_lookup_name_fn, ++ .maxlen = sizeof(unsigned long), ++ .mode = 0644, ++ .proc_handler = &proc_dokallsyms_lookup_name, ++ }, ++#endif ++#ifdef DEBUG_LOG ++ { ++ CTL_NAME (CTL_SPL_DEBUG) ++ .procname = "debug", ++ .mode = 0555, ++ .child = spl_debug_table, ++ }, ++#endif ++ { ++ CTL_NAME (CTL_SPL_VM) ++ .procname = "vm", ++ .mode = 0555, ++ .child = spl_vm_table, ++ }, ++#ifdef DEBUG_KMEM ++ { ++ CTL_NAME (CTL_SPL_KMEM) ++ .procname = "kmem", ++ .mode = 0555, ++ .child = spl_kmem_table, ++ }, ++#endif ++ { ++ CTL_NAME (CTL_SPL_KSTAT) ++ .procname = "kstat", ++ .mode = 0555, ++ .child = spl_kstat_table, ++ }, ++ { 0 }, ++}; ++ ++static struct ctl_table spl_dir[] = { ++ { ++ CTL_NAME (CTL_SPL) ++ .procname = "spl", ++ .mode = 0555, ++ .child = spl_table, ++ }, ++ { 0 } ++}; ++ ++static struct ctl_table spl_root[] = { ++ { ++ CTL_NAME (CTL_KERN) ++ .procname = "kernel", ++ .mode = 0555, ++ .child = spl_dir, ++ }, ++ { 0 } ++}; ++ ++static int ++proc_dir_entry_match(int len, const char *name, struct proc_dir_entry *de) ++{ ++ if (de->namelen != len) ++ return 0; ++ ++ return !memcmp(name, de->name, len); ++} ++ ++struct proc_dir_entry * ++proc_dir_entry_find(struct proc_dir_entry *root, const char *str) ++{ ++ struct proc_dir_entry *de; ++ ++ for (de = root->subdir; de; de = de->next) ++ if (proc_dir_entry_match(strlen(str), str, de)) ++ 
return de; ++ ++ return NULL; ++} ++ ++int ++proc_dir_entries(struct proc_dir_entry *root) ++{ ++ struct proc_dir_entry *de; ++ int i = 0; ++ ++ for (de = root->subdir; de; de = de->next) ++ i++; ++ ++ return i; ++} ++ ++int ++spl_proc_init(void) ++{ ++ int rc = 0; ++ SENTRY; ++ ++#ifdef CONFIG_SYSCTL ++ spl_header = spl_register_sysctl_table(spl_root, 0); ++ if (spl_header == NULL) ++ SRETURN(-EUNATCH); ++#endif /* CONFIG_SYSCTL */ ++ ++ proc_spl = proc_mkdir("spl", NULL); ++ if (proc_spl == NULL) ++ SGOTO(out, rc = -EUNATCH); ++ ++#ifdef DEBUG_KMEM ++ proc_spl_kmem = proc_mkdir("kmem", proc_spl); ++ if (proc_spl_kmem == NULL) ++ SGOTO(out, rc = -EUNATCH); ++ ++ proc_spl_kmem_slab = create_proc_entry("slab", 0444, proc_spl_kmem); ++ if (proc_spl_kmem_slab == NULL) ++ SGOTO(out, rc = -EUNATCH); ++ ++ proc_spl_kmem_slab->proc_fops = &proc_slab_operations; ++#endif /* DEBUG_KMEM */ ++ ++ proc_spl_kstat = proc_mkdir("kstat", proc_spl); ++ if (proc_spl_kstat == NULL) ++ SGOTO(out, rc = -EUNATCH); ++out: ++ if (rc) { ++ remove_proc_entry("kstat", proc_spl); ++#ifdef DEBUG_KMEM ++ remove_proc_entry("slab", proc_spl_kmem); ++ remove_proc_entry("kmem", proc_spl); ++#endif ++ remove_proc_entry("spl", NULL); ++#ifdef CONFIG_SYSCTL ++ spl_unregister_sysctl_table(spl_header); ++#endif /* CONFIG_SYSCTL */ ++ } ++ ++ SRETURN(rc); ++} ++ ++void ++spl_proc_fini(void) ++{ ++ SENTRY; ++ ++ remove_proc_entry("kstat", proc_spl); ++#ifdef DEBUG_KMEM ++ remove_proc_entry("slab", proc_spl_kmem); ++ remove_proc_entry("kmem", proc_spl); ++#endif ++ remove_proc_entry("spl", NULL); ++ ++#ifdef CONFIG_SYSCTL ++ ASSERT(spl_header != NULL); ++ spl_unregister_sysctl_table(spl_header); ++#endif /* CONFIG_SYSCTL */ ++ ++ SEXIT; ++} +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-rwlock.c linux-3.2.33-go/spl/spl/spl-rwlock.c +--- linux-3.2.33-go.orig/spl/spl/spl-rwlock.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-rwlock.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,96 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Reader/Writer Lock Implementation. 
++\*****************************************************************************/ ++ ++#include ++ ++#ifdef DEBUG_SUBSYSTEM ++#undef DEBUG_SUBSYSTEM ++#endif ++ ++#define DEBUG_SUBSYSTEM S_RWLOCK ++ ++#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK ++ ++/* ++ * From lib/rwsem-spinlock.c but modified such that the caller is ++ * responsible for acquiring and dropping the sem->wait_lock. ++ */ ++struct rwsem_waiter { ++ struct list_head list; ++ struct task_struct *task; ++ unsigned int flags; ++#define RWSEM_WAITING_FOR_READ 0x00000001 ++#define RWSEM_WAITING_FOR_WRITE 0x00000002 ++}; ++ ++/* wake a single writer */ ++static struct rw_semaphore * ++__rwsem_wake_one_writer_locked(struct rw_semaphore *sem) ++{ ++ struct rwsem_waiter *waiter; ++ struct task_struct *tsk; ++ ++ sem->activity = -1; ++ ++ waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); ++ list_del(&waiter->list); ++ ++ tsk = waiter->task; ++ smp_mb(); ++ waiter->task = NULL; ++ wake_up_process(tsk); ++ put_task_struct(tsk); ++ return sem; ++} ++ ++/* release a read lock on the semaphore */ ++void ++__up_read_locked(struct rw_semaphore *sem) ++{ ++ if (--sem->activity == 0 && !list_empty(&sem->wait_list)) ++ (void)__rwsem_wake_one_writer_locked(sem); ++} ++EXPORT_SYMBOL(__up_read_locked); ++ ++/* trylock for writing -- returns 1 if successful, 0 if contention */ ++int ++__down_write_trylock_locked(struct rw_semaphore *sem) ++{ ++ int ret = 0; ++ ++ if (sem->activity == 0 && list_empty(&sem->wait_list)) { ++ sem->activity = -1; ++ ret = 1; ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL(__down_write_trylock_locked); ++ ++#endif ++ ++int spl_rw_init(void) { return 0; } ++void spl_rw_fini(void) { } +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-taskq.c linux-3.2.33-go/spl/spl/spl-taskq.c +--- linux-3.2.33-go.orig/spl/spl/spl-taskq.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-taskq.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,703 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Task Queue Implementation. 
++\*****************************************************************************/ ++ ++#include ++#include ++#include ++ ++#ifdef SS_DEBUG_SUBSYS ++#undef SS_DEBUG_SUBSYS ++#endif ++ ++#define SS_DEBUG_SUBSYS SS_TASKQ ++ ++/* Global system-wide dynamic task queue available for all consumers */ ++taskq_t *system_taskq; ++EXPORT_SYMBOL(system_taskq); ++ ++static int ++task_km_flags(uint_t flags) ++{ ++ if (flags & TQ_NOSLEEP) ++ return KM_NOSLEEP; ++ ++ if (flags & TQ_PUSHPAGE) ++ return KM_PUSHPAGE; ++ ++ return KM_SLEEP; ++} ++ ++/* ++ * NOTE: Must be called with tq->tq_lock held, returns a list_t which ++ * is not attached to the free, work, or pending taskq lists. ++ */ ++static taskq_ent_t * ++task_alloc(taskq_t *tq, uint_t flags) ++{ ++ taskq_ent_t *t; ++ int count = 0; ++ SENTRY; ++ ++ ASSERT(tq); ++ ASSERT(spin_is_locked(&tq->tq_lock)); ++retry: ++ /* Acquire taskq_ent_t's from free list if available */ ++ if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) { ++ t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); ++ ++ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); ++ ++ list_del_init(&t->tqent_list); ++ SRETURN(t); ++ } ++ ++ /* Free list is empty and memory allocations are prohibited */ ++ if (flags & TQ_NOALLOC) ++ SRETURN(NULL); ++ ++ /* Hit maximum taskq_ent_t pool size */ ++ if (tq->tq_nalloc >= tq->tq_maxalloc) { ++ if (flags & TQ_NOSLEEP) ++ SRETURN(NULL); ++ ++ /* ++ * Sleep periodically polling the free list for an available ++ * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed ++ * but we cannot block forever waiting for an taskq_entq_t to ++ * show up in the free list, otherwise a deadlock can happen. ++ * ++ * Therefore, we need to allocate a new task even if the number ++ * of allocated tasks is above tq->tq_maxalloc, but we still ++ * end up delaying the task allocation by one second, thereby ++ * throttling the task dispatch rate. ++ */ ++ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); ++ schedule_timeout(HZ / 100); ++ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); ++ if (count < 100) ++ SGOTO(retry, count++); ++ } ++ ++ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); ++ t = kmem_alloc(sizeof(taskq_ent_t), task_km_flags(flags)); ++ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); ++ ++ if (t) { ++ taskq_init_ent(t); ++ tq->tq_nalloc++; ++ } ++ ++ SRETURN(t); ++} ++ ++/* ++ * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t ++ * to already be removed from the free, work, or pending taskq lists. ++ */ ++static void ++task_free(taskq_t *tq, taskq_ent_t *t) ++{ ++ SENTRY; ++ ++ ASSERT(tq); ++ ASSERT(t); ++ ASSERT(spin_is_locked(&tq->tq_lock)); ++ ASSERT(list_empty(&t->tqent_list)); ++ ++ kmem_free(t, sizeof(taskq_ent_t)); ++ tq->tq_nalloc--; ++ ++ SEXIT; ++} ++ ++/* ++ * NOTE: Must be called with tq->tq_lock held, either destroys the ++ * taskq_ent_t if too many exist or moves it to the free list for later use. ++ */ ++static void ++task_done(taskq_t *tq, taskq_ent_t *t) ++{ ++ SENTRY; ++ ASSERT(tq); ++ ASSERT(t); ++ ASSERT(spin_is_locked(&tq->tq_lock)); ++ ++ list_del_init(&t->tqent_list); ++ ++ if (tq->tq_nalloc <= tq->tq_minalloc) { ++ t->tqent_id = 0; ++ t->tqent_func = NULL; ++ t->tqent_arg = NULL; ++ t->tqent_flags = 0; ++ ++ list_add_tail(&t->tqent_list, &tq->tq_free_list); ++ } else { ++ task_free(tq, t); ++ } ++ ++ SEXIT; ++} ++ ++/* ++ * As tasks are submitted to the task queue they are assigned a ++ * monotonically increasing taskqid and added to the tail of the pending ++ * list. 
As worker threads become available the tasks are removed from ++ * the head of the pending or priority list, giving preference to the ++ * priority list. The tasks are then removed from their respective ++ * list, and the taskq_thread servicing the task is added to the active ++ * list, preserving the order using the serviced task's taskqid. ++ * Finally, as tasks complete the taskq_thread servicing the task is ++ * removed from the active list. This means that the pending task and ++ * active taskq_thread lists are always kept sorted by taskqid. Thus the ++ * lowest outstanding incomplete taskqid can be determined simply by ++ * checking the min taskqid for each head item on the pending, priority, ++ * and active taskq_thread list. This value is stored in ++ * tq->tq_lowest_id and only updated to the new lowest id when the ++ * previous lowest id completes. All taskqids lower than ++ * tq->tq_lowest_id must have completed. It is also possible larger ++ * taskqid's have completed because they may be processed in parallel by ++ * several worker threads. However, this is not a problem because the ++ * behavior of taskq_wait_id() is to block until all previously ++ * submitted taskqid's have completed. ++ * ++ * XXX: Taskqid_t wrapping is not handled. However, taskqid_t's are ++ * 64-bit values so even if a taskq is processing 2^24 (16,777,216) ++ * taskqid_ts per second it will still take 2^40 seconds, 34,865 years, ++ * before the wrap occurs. I can live with that for now. ++ */ ++static int ++taskq_wait_check(taskq_t *tq, taskqid_t id) ++{ ++ int rc; ++ ++ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); ++ rc = (id < tq->tq_lowest_id); ++ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); ++ ++ SRETURN(rc); ++} ++ ++void ++__taskq_wait_id(taskq_t *tq, taskqid_t id) ++{ ++ SENTRY; ++ ASSERT(tq); ++ ++ wait_event(tq->tq_wait_waitq, taskq_wait_check(tq, id)); ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(__taskq_wait_id); ++ ++void ++__taskq_wait(taskq_t *tq) ++{ ++ taskqid_t id; ++ SENTRY; ++ ASSERT(tq); ++ ++ /* Wait for the largest outstanding taskqid */ ++ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); ++ id = tq->tq_next_id - 1; ++ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); ++ ++ __taskq_wait_id(tq, id); ++ ++ SEXIT; ++ ++} ++EXPORT_SYMBOL(__taskq_wait); ++ ++int ++__taskq_member(taskq_t *tq, void *t) ++{ ++ struct list_head *l; ++ taskq_thread_t *tqt; ++ SENTRY; ++ ++ ASSERT(tq); ++ ASSERT(t); ++ ++ list_for_each(l, &tq->tq_thread_list) { ++ tqt = list_entry(l, taskq_thread_t, tqt_thread_list); ++ if (tqt->tqt_thread == (struct task_struct *)t) ++ SRETURN(1); ++ } ++ ++ SRETURN(0); ++} ++EXPORT_SYMBOL(__taskq_member); ++ ++taskqid_t ++__taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) ++{ ++ taskq_ent_t *t; ++ taskqid_t rc = 0; ++ SENTRY; ++ ++ ASSERT(tq); ++ ASSERT(func); ++ ++ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); ++ ++ /* Taskq being destroyed and all tasks drained */ ++ if (!(tq->tq_flags & TQ_ACTIVE)) ++ SGOTO(out, rc = 0); ++ ++ /* Do not queue the task unless there is idle thread for it */ ++ ASSERT(tq->tq_nactive <= tq->tq_nthreads); ++ if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) ++ SGOTO(out, rc = 0); ++ ++ if ((t = task_alloc(tq, flags)) == NULL) ++ SGOTO(out, rc = 0); ++ ++ spin_lock(&t->tqent_lock); ++ ++ /* Queue to the priority list instead of the pending list */ ++ if (flags & TQ_FRONT) ++ list_add_tail(&t->tqent_list, &tq->tq_prio_list); ++ else ++ list_add_tail(&t->tqent_list, &tq->tq_pend_list); 
++ ++ t->tqent_id = rc = tq->tq_next_id; ++ tq->tq_next_id++; ++ t->tqent_func = func; ++ t->tqent_arg = arg; ++ ++ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); ++ ++ spin_unlock(&t->tqent_lock); ++ ++ wake_up(&tq->tq_work_waitq); ++out: ++ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); ++ SRETURN(rc); ++} ++EXPORT_SYMBOL(__taskq_dispatch); ++ ++void ++__taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, ++ taskq_ent_t *t) ++{ ++ SENTRY; ++ ++ ASSERT(tq); ++ ASSERT(func); ++ ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC)); ++ ++ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); ++ ++ /* Taskq being destroyed and all tasks drained */ ++ if (!(tq->tq_flags & TQ_ACTIVE)) { ++ t->tqent_id = 0; ++ goto out; ++ } ++ ++ spin_lock(&t->tqent_lock); ++ ++ /* ++ * Mark it as a prealloc'd task. This is important ++ * to ensure that we don't free it later. ++ */ ++ t->tqent_flags |= TQENT_FLAG_PREALLOC; ++ ++ /* Queue to the priority list instead of the pending list */ ++ if (flags & TQ_FRONT) ++ list_add_tail(&t->tqent_list, &tq->tq_prio_list); ++ else ++ list_add_tail(&t->tqent_list, &tq->tq_pend_list); ++ ++ t->tqent_id = tq->tq_next_id; ++ tq->tq_next_id++; ++ t->tqent_func = func; ++ t->tqent_arg = arg; ++ ++ spin_unlock(&t->tqent_lock); ++ ++ wake_up(&tq->tq_work_waitq); ++out: ++ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); ++ SEXIT; ++} ++EXPORT_SYMBOL(__taskq_dispatch_ent); ++ ++int ++__taskq_empty_ent(taskq_ent_t *t) ++{ ++ return list_empty(&t->tqent_list); ++} ++EXPORT_SYMBOL(__taskq_empty_ent); ++ ++void ++__taskq_init_ent(taskq_ent_t *t) ++{ ++ spin_lock_init(&t->tqent_lock); ++ INIT_LIST_HEAD(&t->tqent_list); ++ t->tqent_id = 0; ++ t->tqent_func = NULL; ++ t->tqent_arg = NULL; ++ t->tqent_flags = 0; ++} ++EXPORT_SYMBOL(__taskq_init_ent); ++ ++/* ++ * Returns the lowest incomplete taskqid_t. The taskqid_t may ++ * be queued on the pending list, on the priority list, or on ++ * the work list currently being handled, but it is not 100% ++ * complete yet. ++ */ ++static taskqid_t ++taskq_lowest_id(taskq_t *tq) ++{ ++ taskqid_t lowest_id = tq->tq_next_id; ++ taskq_ent_t *t; ++ taskq_thread_t *tqt; ++ SENTRY; ++ ++ ASSERT(tq); ++ ASSERT(spin_is_locked(&tq->tq_lock)); ++ ++ if (!list_empty(&tq->tq_pend_list)) { ++ t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list); ++ lowest_id = MIN(lowest_id, t->tqent_id); ++ } ++ ++ if (!list_empty(&tq->tq_prio_list)) { ++ t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list); ++ lowest_id = MIN(lowest_id, t->tqent_id); ++ } ++ ++ if (!list_empty(&tq->tq_active_list)) { ++ tqt = list_entry(tq->tq_active_list.next, taskq_thread_t, ++ tqt_active_list); ++ ASSERT(tqt->tqt_id != 0); ++ lowest_id = MIN(lowest_id, tqt->tqt_id); ++ } ++ ++ SRETURN(lowest_id); ++} ++ ++/* ++ * Insert a task into a list keeping the list sorted by increasing ++ * taskqid. 
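++ * For illustration (hypothetical ids, a sketch of the behaviour below):
++ *   active list, head to tail:  [id 3] [id 7] [id 9]
++ *   a thread picking up taskqid 5 is linked after [id 3], giving
++ *   [id 3] [id 5] [id 7] [id 9]
++ * so the head of tq_active_list always carries the lowest in-flight
++ * taskqid consulted by taskq_lowest_id().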
++ */ ++static void ++taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt) ++{ ++ taskq_thread_t *w; ++ struct list_head *l; ++ ++ SENTRY; ++ ASSERT(tq); ++ ASSERT(tqt); ++ ASSERT(spin_is_locked(&tq->tq_lock)); ++ ++ list_for_each_prev(l, &tq->tq_active_list) { ++ w = list_entry(l, taskq_thread_t, tqt_active_list); ++ if (w->tqt_id < tqt->tqt_id) { ++ list_add(&tqt->tqt_active_list, l); ++ break; ++ } ++ } ++ if (l == &tq->tq_active_list) ++ list_add(&tqt->tqt_active_list, &tq->tq_active_list); ++ ++ SEXIT; ++} ++ ++static int ++taskq_thread(void *args) ++{ ++ DECLARE_WAITQUEUE(wait, current); ++ sigset_t blocked; ++ taskq_thread_t *tqt = args; ++ taskq_t *tq; ++ taskq_ent_t *t; ++ struct list_head *pend_list; ++ SENTRY; ++ ++ ASSERT(tqt); ++ tq = tqt->tqt_tq; ++ current->flags |= PF_NOFREEZE; ++ ++ sigfillset(&blocked); ++ sigprocmask(SIG_BLOCK, &blocked, NULL); ++ flush_signals(current); ++ ++ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); ++ tq->tq_nthreads++; ++ wake_up(&tq->tq_wait_waitq); ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ while (!kthread_should_stop()) { ++ ++ if (list_empty(&tq->tq_pend_list) && ++ list_empty(&tq->tq_prio_list)) { ++ add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); ++ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); ++ schedule(); ++ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); ++ remove_wait_queue(&tq->tq_work_waitq, &wait); ++ } else { ++ __set_current_state(TASK_RUNNING); ++ } ++ ++ ++ if (!list_empty(&tq->tq_prio_list)) ++ pend_list = &tq->tq_prio_list; ++ else if (!list_empty(&tq->tq_pend_list)) ++ pend_list = &tq->tq_pend_list; ++ else ++ pend_list = NULL; ++ ++ if (pend_list) { ++ t = list_entry(pend_list->next, taskq_ent_t, tqent_list); ++ list_del_init(&t->tqent_list); ++ ++ /* In order to support recursively dispatching a ++ * preallocated taskq_ent_t, tqent_id must be ++ * stored prior to executing tqent_func. */ ++ tqt->tqt_id = t->tqent_id; ++ ++ /* We must store a copy of the flags prior to ++ * servicing the task (servicing a prealloc'd task ++ * returns the ownership of the tqent back to ++ * the caller of taskq_dispatch). Thus, ++ * tqent_flags _may_ change within the call. */ ++ tqt->tqt_flags = t->tqent_flags; ++ ++ taskq_insert_in_order(tq, tqt); ++ tq->tq_nactive++; ++ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); ++ ++ /* Perform the requested task */ ++ t->tqent_func(t->tqent_arg); ++ ++ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); ++ tq->tq_nactive--; ++ list_del_init(&tqt->tqt_active_list); ++ ++ /* For prealloc'd tasks, we don't free anything. 
*/ ++ if ((tq->tq_flags & TASKQ_DYNAMIC) || ++ !(tqt->tqt_flags & TQENT_FLAG_PREALLOC)) ++ task_done(tq, t); ++ ++ /* When the current lowest outstanding taskqid is ++ * done calculate the new lowest outstanding id */ ++ if (tq->tq_lowest_id == tqt->tqt_id) { ++ tq->tq_lowest_id = taskq_lowest_id(tq); ++ ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id); ++ } ++ ++ tqt->tqt_id = 0; ++ tqt->tqt_flags = 0; ++ wake_up_all(&tq->tq_wait_waitq); ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ tq->tq_nthreads--; ++ list_del_init(&tqt->tqt_thread_list); ++ kmem_free(tqt, sizeof(taskq_thread_t)); ++ ++ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); ++ ++ SRETURN(0); ++} ++ ++taskq_t * ++__taskq_create(const char *name, int nthreads, pri_t pri, ++ int minalloc, int maxalloc, uint_t flags) ++{ ++ taskq_t *tq; ++ taskq_thread_t *tqt; ++ int rc = 0, i, j = 0; ++ SENTRY; ++ ++ ASSERT(name != NULL); ++ ASSERT(pri <= maxclsyspri); ++ ASSERT(minalloc >= 0); ++ ASSERT(maxalloc <= INT_MAX); ++ ASSERT(!(flags & (TASKQ_CPR_SAFE | TASKQ_DYNAMIC))); /* Unsupported */ ++ ++ /* Scale the number of threads using nthreads as a percentage */ ++ if (flags & TASKQ_THREADS_CPU_PCT) { ++ ASSERT(nthreads <= 100); ++ ASSERT(nthreads >= 0); ++ nthreads = MIN(nthreads, 100); ++ nthreads = MAX(nthreads, 0); ++ nthreads = MAX((num_online_cpus() * nthreads) / 100, 1); ++ } ++ ++ tq = kmem_alloc(sizeof(*tq), KM_PUSHPAGE); ++ if (tq == NULL) ++ SRETURN(NULL); ++ ++ spin_lock_init(&tq->tq_lock); ++ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); ++ INIT_LIST_HEAD(&tq->tq_thread_list); ++ INIT_LIST_HEAD(&tq->tq_active_list); ++ tq->tq_name = name; ++ tq->tq_nactive = 0; ++ tq->tq_nthreads = 0; ++ tq->tq_pri = pri; ++ tq->tq_minalloc = minalloc; ++ tq->tq_maxalloc = maxalloc; ++ tq->tq_nalloc = 0; ++ tq->tq_flags = (flags | TQ_ACTIVE); ++ tq->tq_next_id = 1; ++ tq->tq_lowest_id = 1; ++ INIT_LIST_HEAD(&tq->tq_free_list); ++ INIT_LIST_HEAD(&tq->tq_pend_list); ++ INIT_LIST_HEAD(&tq->tq_prio_list); ++ init_waitqueue_head(&tq->tq_work_waitq); ++ init_waitqueue_head(&tq->tq_wait_waitq); ++ ++ if (flags & TASKQ_PREPOPULATE) ++ for (i = 0; i < minalloc; i++) ++ task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW)); ++ ++ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); ++ ++ for (i = 0; i < nthreads; i++) { ++ tqt = kmem_alloc(sizeof(*tqt), KM_PUSHPAGE); ++ INIT_LIST_HEAD(&tqt->tqt_thread_list); ++ INIT_LIST_HEAD(&tqt->tqt_active_list); ++ tqt->tqt_tq = tq; ++ tqt->tqt_id = 0; ++ ++ tqt->tqt_thread = kthread_create(taskq_thread, tqt, ++ "%s/%d", name, i); ++ if (tqt->tqt_thread) { ++ list_add(&tqt->tqt_thread_list, &tq->tq_thread_list); ++ kthread_bind(tqt->tqt_thread, i % num_online_cpus()); ++ set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(pri)); ++ wake_up_process(tqt->tqt_thread); ++ j++; ++ } else { ++ kmem_free(tqt, sizeof(taskq_thread_t)); ++ rc = 1; ++ } ++ } ++ ++ /* Wait for all threads to be started before potential destroy */ ++ wait_event(tq->tq_wait_waitq, tq->tq_nthreads == j); ++ ++ if (rc) { ++ __taskq_destroy(tq); ++ tq = NULL; ++ } ++ ++ SRETURN(tq); ++} ++EXPORT_SYMBOL(__taskq_create); ++ ++void ++__taskq_destroy(taskq_t *tq) ++{ ++ struct task_struct *thread; ++ taskq_thread_t *tqt; ++ taskq_ent_t *t; ++ SENTRY; ++ ++ ASSERT(tq); ++ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); ++ tq->tq_flags &= ~TQ_ACTIVE; ++ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); ++ ++ /* TQ_ACTIVE cleared prevents new tasks being added to pending */ ++ 
__taskq_wait(tq); ++ ++ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); ++ ++ /* ++ * Signal each thread to exit and block until it does. Each thread ++ * is responsible for removing itself from the list and freeing its ++ * taskq_thread_t. This allows for idle threads to opt to remove ++ * themselves from the taskq. They can be recreated as needed. ++ */ ++ while (!list_empty(&tq->tq_thread_list)) { ++ tqt = list_entry(tq->tq_thread_list.next, ++ taskq_thread_t, tqt_thread_list); ++ thread = tqt->tqt_thread; ++ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); ++ ++ kthread_stop(thread); ++ ++ spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags); ++ } ++ ++ while (!list_empty(&tq->tq_free_list)) { ++ t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); ++ ++ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); ++ ++ list_del_init(&t->tqent_list); ++ task_free(tq, t); ++ } ++ ++ ASSERT(tq->tq_nthreads == 0); ++ ASSERT(tq->tq_nalloc == 0); ++ ASSERT(list_empty(&tq->tq_thread_list)); ++ ASSERT(list_empty(&tq->tq_active_list)); ++ ASSERT(list_empty(&tq->tq_free_list)); ++ ASSERT(list_empty(&tq->tq_pend_list)); ++ ASSERT(list_empty(&tq->tq_prio_list)); ++ ++ spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags); ++ ++ kmem_free(tq, sizeof(taskq_t)); ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(__taskq_destroy); ++ ++int ++spl_taskq_init(void) ++{ ++ SENTRY; ++ ++ /* Solaris creates a dynamic taskq of up to 64 threads, however in ++ * a Linux environment 1 thread per-core is usually about right */ ++ system_taskq = taskq_create("spl_system_taskq", num_online_cpus(), ++ minclsyspri, 4, 512, TASKQ_PREPOPULATE); ++ if (system_taskq == NULL) ++ SRETURN(1); ++ ++ SRETURN(0); ++} ++ ++void ++spl_taskq_fini(void) ++{ ++ SENTRY; ++ taskq_destroy(system_taskq); ++ SEXIT; ++} +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-thread.c linux-3.2.33-go/spl/spl/spl-thread.c +--- linux-3.2.33-go.orig/spl/spl/spl-thread.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-thread.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,139 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Thread Implementation. 
++\*****************************************************************************/ ++ ++#include ++#include ++#include ++#include ++ ++#ifdef SS_DEBUG_SUBSYS ++#undef SS_DEBUG_SUBSYS ++#endif ++ ++#define SS_DEBUG_SUBSYS SS_THREAD ++ ++/* ++ * Thread interfaces ++ */ ++typedef struct thread_priv_s { ++ unsigned long tp_magic; /* Magic */ ++ int tp_name_size; /* Name size */ ++ char *tp_name; /* Name (without _thread suffix) */ ++ void (*tp_func)(void *); /* Registered function */ ++ void *tp_args; /* Args to be passed to function */ ++ size_t tp_len; /* Len to be passed to function */ ++ int tp_state; /* State to start thread at */ ++ pri_t tp_pri; /* Priority to start threat at */ ++} thread_priv_t; ++ ++static int ++thread_generic_wrapper(void *arg) ++{ ++ thread_priv_t *tp = (thread_priv_t *)arg; ++ void (*func)(void *); ++ void *args; ++ ++ ASSERT(tp->tp_magic == TP_MAGIC); ++ func = tp->tp_func; ++ args = tp->tp_args; ++ set_current_state(tp->tp_state); ++ set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri)); ++ kmem_free(tp->tp_name, tp->tp_name_size); ++ kmem_free(tp, sizeof(thread_priv_t)); ++ ++ if (func) ++ func(args); ++ ++ return 0; ++} ++ ++void ++__thread_exit(void) ++{ ++ SENTRY; ++ SEXIT; ++ tsd_exit(); ++ complete_and_exit(NULL, 0); ++ /* Unreachable */ ++} ++EXPORT_SYMBOL(__thread_exit); ++ ++/* thread_create() may block forever if it cannot create a thread or ++ * allocate memory. This is preferable to returning a NULL which Solaris ++ * style callers likely never check for... since it can't fail. */ ++kthread_t * ++__thread_create(caddr_t stk, size_t stksize, thread_func_t func, ++ const char *name, void *args, size_t len, proc_t *pp, ++ int state, pri_t pri) ++{ ++ thread_priv_t *tp; ++ struct task_struct *tsk; ++ char *p; ++ SENTRY; ++ ++ /* Option pp is simply ignored */ ++ /* Variable stack size unsupported */ ++ ASSERT(stk == NULL); ++ ++ tp = kmem_alloc(sizeof(thread_priv_t), KM_PUSHPAGE); ++ if (tp == NULL) ++ SRETURN(NULL); ++ ++ tp->tp_magic = TP_MAGIC; ++ tp->tp_name_size = strlen(name) + 1; ++ ++ tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE); ++ if (tp->tp_name == NULL) { ++ kmem_free(tp, sizeof(thread_priv_t)); ++ SRETURN(NULL); ++ } ++ ++ strncpy(tp->tp_name, name, tp->tp_name_size); ++ ++ /* Strip trailing "_thread" from passed name which will be the func ++ * name since the exposed API has no parameter for passing a name. ++ */ ++ p = strstr(tp->tp_name, "_thread"); ++ if (p) ++ p[0] = '\0'; ++ ++ tp->tp_func = func; ++ tp->tp_args = args; ++ tp->tp_len = len; ++ tp->tp_state = state; ++ tp->tp_pri = pri; ++ ++ tsk = kthread_create(thread_generic_wrapper, (void *)tp, ++ "%s", tp->tp_name); ++ if (IS_ERR(tsk)) { ++ SERROR("Failed to create thread: %ld\n", PTR_ERR(tsk)); ++ SRETURN(NULL); ++ } ++ ++ wake_up_process(tsk); ++ SRETURN((kthread_t *)tsk); ++} ++EXPORT_SYMBOL(__thread_create); +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-time.c linux-3.2.33-go/spl/spl/spl-time.c +--- linux-3.2.33-go.orig/spl/spl/spl-time.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-time.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,93 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . 
++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Time Implementation. ++\*****************************************************************************/ ++ ++#include ++#include ++ ++#ifdef HAVE_MONOTONIC_CLOCK ++extern unsigned long long monotonic_clock(void); ++#endif ++ ++#ifdef DEBUG_SUBSYSTEM ++#undef DEBUG_SUBSYSTEM ++#endif ++ ++#define DEBUG_SUBSYSTEM S_TIME ++ ++void ++__gethrestime(timestruc_t *ts) ++{ ++ struct timeval tv; ++ ++ do_gettimeofday(&tv); ++ ts->tv_sec = tv.tv_sec; ++ ts->tv_nsec = tv.tv_usec * NSEC_PER_USEC; ++} ++EXPORT_SYMBOL(__gethrestime); ++ ++/* Use monotonic_clock() by default. It's faster and is available on older ++ * kernels, but few architectures have them, so we must fallback to ++ * do_posix_clock_monotonic_gettime(). ++ */ ++hrtime_t ++__gethrtime(void) { ++#ifdef HAVE_MONOTONIC_CLOCK ++ unsigned long long res = monotonic_clock(); ++ ++ /* Deal with signed/unsigned mismatch */ ++ return (hrtime_t)(res & ~(1ULL << 63)); ++#else ++ struct timespec ts; ++ ++ do_posix_clock_monotonic_gettime(&ts); ++ return (((hrtime_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec); ++#endif ++} ++EXPORT_SYMBOL(__gethrtime); ++ ++/* set_normalized_timespec() API changes ++ * 2.6.0 - 2.6.15: Inline function provided by linux/time.h ++ * 2.6.16 - 2.6.25: Function prototype defined but not exported ++ * 2.6.26 - 2.6.x: Function defined and exported ++ */ ++#if !defined(HAVE_SET_NORMALIZED_TIMESPEC_INLINE) && \ ++ !defined(HAVE_SET_NORMALIZED_TIMESPEC_EXPORT) ++void ++set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) ++{ ++ while (nsec >= NSEC_PER_SEC) { ++ nsec -= NSEC_PER_SEC; ++ ++sec; ++ } ++ while (nsec < 0) { ++ nsec += NSEC_PER_SEC; ++ --sec; ++ } ++ ts->tv_sec = sec; ++ ts->tv_nsec = nsec; ++} ++EXPORT_SYMBOL(set_normalized_timespec); ++#endif +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-tsd.c linux-3.2.33-go/spl/spl/spl-tsd.c +--- linux-3.2.33-go.orig/spl/spl/spl-tsd.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-tsd.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,641 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2010 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. 
++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Thread Specific Data Implementation. ++ * ++ * Thread specific data has implemented using a hash table, this avoids ++ * the need to add a member to the task structure and allows maximum ++ * portability between kernels. This implementation has been optimized ++ * to keep the tsd_set() and tsd_get() times as small as possible. ++ * ++ * The majority of the entries in the hash table are for specific tsd ++ * entries. These entries are hashed by the product of their key and ++ * pid because by design the key and pid are guaranteed to be unique. ++ * Their product also has the desirable properly that it will be uniformly ++ * distributed over the hash bins providing neither the pid nor key is zero. ++ * Under linux the zero pid is always the init process and thus won't be ++ * used, and this implementation is careful to never to assign a zero key. ++ * By default the hash table is sized to 512 bins which is expected to ++ * be sufficient for light to moderate usage of thread specific data. ++ * ++ * The hash table contains two additional type of entries. They first ++ * type is entry is called a 'key' entry and it is added to the hash during ++ * tsd_create(). It is used to store the address of the destructor function ++ * and it is used as an anchor point. All tsd entries which use the same ++ * key will be linked to this entry. This is used during tsd_destory() to ++ * quickly call the destructor function for all tsd associated with the key. ++ * The 'key' entry may be looked up with tsd_hash_search() by passing the ++ * key you wish to lookup and DTOR_PID constant as the pid. ++ * ++ * The second type of entry is called a 'pid' entry and it is added to the ++ * hash the first time a process set a key. The 'pid' entry is also used ++ * as an anchor and all tsd for the process will be linked to it. This ++ * list is using during tsd_exit() to ensure all registered destructors ++ * are run for the process. The 'pid' entry may be looked up with ++ * tsd_hash_search() by passing the PID_KEY constant as the key, and ++ * the process pid. Note that tsd_exit() is called by thread_exit() ++ * so if your using the Solaris thread API you should not need to call ++ * tsd_exit() directly. 
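++ *
++ * A minimal usage sketch (illustrative only; my_key, my_dtor and
++ * my_data are placeholder names, not part of this file):
++ *
++ *   static uint_t my_key = 0;        key must start out as 0
++ *   tsd_create(&my_key, my_dtor);    register key and destructor once
++ *   tsd_set(my_key, my_data);        per thread, store a value
++ *   my_data = tsd_get(my_key);       per thread, fetch it back
++ *   tsd_destroy(&my_key);            runs my_dtor for every thread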
++ * ++\*****************************************************************************/ ++ ++#include ++#include ++#include ++#include ++ ++#ifdef DEBUG_SUBSYSTEM ++#undef DEBUG_SUBSYSTEM ++#endif ++ ++#define DEBUG_SUBSYSTEM SS_TSD ++#define DEBUG_SUBSYSTEM SS_TSD ++ ++typedef struct tsd_hash_bin { ++ spinlock_t hb_lock; ++ struct hlist_head hb_head; ++} tsd_hash_bin_t; ++ ++typedef struct tsd_hash_table { ++ spinlock_t ht_lock; ++ uint_t ht_bits; ++ uint_t ht_key; ++ tsd_hash_bin_t *ht_bins; ++} tsd_hash_table_t; ++ ++typedef struct tsd_hash_entry { ++ uint_t he_key; ++ pid_t he_pid; ++ dtor_func_t he_dtor; ++ void *he_value; ++ struct hlist_node he_list; ++ struct list_head he_key_list; ++ struct list_head he_pid_list; ++} tsd_hash_entry_t; ++ ++static tsd_hash_table_t *tsd_hash_table = NULL; ++ ++ ++/* ++ * tsd_hash_search - searches hash table for tsd_hash_entry ++ * @table: hash table ++ * @key: search key ++ * @pid: search pid ++ */ ++static tsd_hash_entry_t * ++tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid) ++{ ++ struct hlist_node *node; ++ tsd_hash_entry_t *entry; ++ tsd_hash_bin_t *bin; ++ ulong_t hash; ++ SENTRY; ++ ++ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits); ++ bin = &table->ht_bins[hash]; ++ spin_lock(&bin->hb_lock); ++ hlist_for_each_entry(entry, node, &bin->hb_head, he_list) { ++ if ((entry->he_key == key) && (entry->he_pid == pid)) { ++ spin_unlock(&bin->hb_lock); ++ SRETURN(entry); ++ } ++ } ++ ++ spin_unlock(&bin->hb_lock); ++ SRETURN(NULL); ++} ++ ++/* ++ * tsd_hash_dtor - call the destructor and free all entries on the list ++ * @work: list of hash entries ++ * ++ * For a list of entries which have all already been removed from the ++ * hash call their registered destructor then free the associated memory. ++ */ ++static void ++tsd_hash_dtor(struct hlist_head *work) ++{ ++ tsd_hash_entry_t *entry; ++ SENTRY; ++ ++ while (!hlist_empty(work)) { ++ entry = hlist_entry(work->first, tsd_hash_entry_t, he_list); ++ hlist_del(&entry->he_list); ++ ++ if (entry->he_dtor && entry->he_pid != DTOR_PID) ++ entry->he_dtor(entry->he_value); ++ ++ kmem_free(entry, sizeof(tsd_hash_entry_t)); ++ } ++ ++ SEXIT; ++} ++ ++/* ++ * tsd_hash_add - adds an entry to hash table ++ * @table: hash table ++ * @key: search key ++ * @pid: search pid ++ * ++ * The caller is responsible for ensuring the unique key/pid do not ++ * already exist in the hash table. This possible because all entries ++ * are thread specific thus a concurrent thread will never attempt to ++ * add this key/pid. Because multiple bins must be checked to add ++ * links to the dtor and pid entries the entire table is locked. 
++ */ ++static int ++tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value) ++{ ++ tsd_hash_entry_t *entry, *dtor_entry, *pid_entry; ++ tsd_hash_bin_t *bin; ++ ulong_t hash; ++ int rc = 0; ++ SENTRY; ++ ++ ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL); ++ ++ /* New entry allocate structure, set value, and add to hash */ ++ entry = kmem_alloc(sizeof(tsd_hash_entry_t), KM_PUSHPAGE); ++ if (entry == NULL) ++ SRETURN(ENOMEM); ++ ++ entry->he_key = key; ++ entry->he_pid = pid; ++ entry->he_value = value; ++ INIT_HLIST_NODE(&entry->he_list); ++ INIT_LIST_HEAD(&entry->he_key_list); ++ INIT_LIST_HEAD(&entry->he_pid_list); ++ ++ spin_lock(&table->ht_lock); ++ ++ /* Destructor entry must exist for all valid keys */ ++ dtor_entry = tsd_hash_search(table, entry->he_key, DTOR_PID); ++ ASSERT3P(dtor_entry, !=, NULL); ++ entry->he_dtor = dtor_entry->he_dtor; ++ ++ /* Process entry must exist for all valid processes */ ++ pid_entry = tsd_hash_search(table, PID_KEY, entry->he_pid); ++ ASSERT3P(pid_entry, !=, NULL); ++ ++ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits); ++ bin = &table->ht_bins[hash]; ++ spin_lock(&bin->hb_lock); ++ ++ /* Add to the hash, key, and pid lists */ ++ hlist_add_head(&entry->he_list, &bin->hb_head); ++ list_add(&entry->he_key_list, &dtor_entry->he_key_list); ++ list_add(&entry->he_pid_list, &pid_entry->he_pid_list); ++ ++ spin_unlock(&bin->hb_lock); ++ spin_unlock(&table->ht_lock); ++ ++ SRETURN(rc); ++} ++ ++/* ++ * tsd_hash_add_key - adds a destructor entry to the hash table ++ * @table: hash table ++ * @keyp: search key ++ * @dtor: key destructor ++ * ++ * For every unique key there is a single entry in the hash which is used ++ * as anchor. All other thread specific entries for this key are linked ++ * to this anchor via the 'he_key_list' list head. On return they keyp ++ * will be set to the next available key for the hash table. 
++ */ ++static int ++tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor) ++{ ++ tsd_hash_entry_t *tmp_entry, *entry; ++ tsd_hash_bin_t *bin; ++ ulong_t hash; ++ int keys_checked = 0; ++ SENTRY; ++ ++ ASSERT3P(table, !=, NULL); ++ ++ /* Allocate entry to be used as a destructor for this key */ ++ entry = kmem_alloc(sizeof(tsd_hash_entry_t), KM_PUSHPAGE); ++ if (entry == NULL) ++ SRETURN(ENOMEM); ++ ++ /* Determine next available key value */ ++ spin_lock(&table->ht_lock); ++ do { ++ /* Limited to TSD_KEYS_MAX concurrent unique keys */ ++ if (table->ht_key++ > TSD_KEYS_MAX) ++ table->ht_key = 1; ++ ++ /* Ensure failure when all TSD_KEYS_MAX keys are in use */ ++ if (keys_checked++ >= TSD_KEYS_MAX) { ++ spin_unlock(&table->ht_lock); ++ SRETURN(ENOENT); ++ } ++ ++ tmp_entry = tsd_hash_search(table, table->ht_key, DTOR_PID); ++ } while (tmp_entry); ++ ++ /* Add destructor entry in to hash table */ ++ entry->he_key = *keyp = table->ht_key; ++ entry->he_pid = DTOR_PID; ++ entry->he_dtor = dtor; ++ entry->he_value = NULL; ++ INIT_HLIST_NODE(&entry->he_list); ++ INIT_LIST_HEAD(&entry->he_key_list); ++ INIT_LIST_HEAD(&entry->he_pid_list); ++ ++ hash = hash_long((ulong_t)*keyp * (ulong_t)DTOR_PID, table->ht_bits); ++ bin = &table->ht_bins[hash]; ++ spin_lock(&bin->hb_lock); ++ ++ hlist_add_head(&entry->he_list, &bin->hb_head); ++ ++ spin_unlock(&bin->hb_lock); ++ spin_unlock(&table->ht_lock); ++ ++ SRETURN(0); ++} ++ ++/* ++ * tsd_hash_add_pid - adds a process entry to the hash table ++ * @table: hash table ++ * @pid: search pid ++ * ++ * For every process these is a single entry in the hash which is used ++ * as anchor. All other thread specific entries for this process are ++ * linked to this anchor via the 'he_pid_list' list head. ++ */ ++static int ++tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid) ++{ ++ tsd_hash_entry_t *entry; ++ tsd_hash_bin_t *bin; ++ ulong_t hash; ++ SENTRY; ++ ++ /* Allocate entry to be used as the process reference */ ++ entry = kmem_alloc(sizeof(tsd_hash_entry_t), KM_PUSHPAGE); ++ if (entry == NULL) ++ SRETURN(ENOMEM); ++ ++ spin_lock(&table->ht_lock); ++ entry->he_key = PID_KEY; ++ entry->he_pid = pid; ++ entry->he_dtor = NULL; ++ entry->he_value = NULL; ++ INIT_HLIST_NODE(&entry->he_list); ++ INIT_LIST_HEAD(&entry->he_key_list); ++ INIT_LIST_HEAD(&entry->he_pid_list); ++ ++ hash = hash_long((ulong_t)PID_KEY * (ulong_t)pid, table->ht_bits); ++ bin = &table->ht_bins[hash]; ++ spin_lock(&bin->hb_lock); ++ ++ hlist_add_head(&entry->he_list, &bin->hb_head); ++ ++ spin_unlock(&bin->hb_lock); ++ spin_unlock(&table->ht_lock); ++ ++ SRETURN(0); ++} ++ ++/* ++ * tsd_hash_del - delete an entry from hash table, key, and pid lists ++ * @table: hash table ++ * @key: search key ++ * @pid: search pid ++ */ ++static void ++tsd_hash_del(tsd_hash_table_t *table, tsd_hash_entry_t *entry) ++{ ++ SENTRY; ++ ++ ASSERT(spin_is_locked(&table->ht_lock)); ++ hlist_del(&entry->he_list); ++ list_del_init(&entry->he_key_list); ++ list_del_init(&entry->he_pid_list); ++ ++ SEXIT; ++} ++ ++/* ++ * tsd_hash_table_init - allocate a hash table ++ * @bits: hash table size ++ * ++ * A hash table with 2^bits bins will be created, it may not be resized ++ * after the fact and must be free'd with tsd_hash_table_fini(). 
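++ *
++ * For example, bits = 9 yields 1 << 9 == 512 bins, the default table
++ * size mentioned in the header comment above; spl_tsd_init() below
++ * passes TSD_HASH_TABLE_BITS_DEFAULT for this purpose.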
++ */ ++static tsd_hash_table_t * ++tsd_hash_table_init(uint_t bits) ++{ ++ tsd_hash_table_t *table; ++ int hash, size = (1 << bits); ++ SENTRY; ++ ++ table = kmem_zalloc(sizeof(tsd_hash_table_t), KM_SLEEP); ++ if (table == NULL) ++ SRETURN(NULL); ++ ++ table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size, ++ KM_SLEEP | KM_NODEBUG); ++ if (table->ht_bins == NULL) { ++ kmem_free(table, sizeof(tsd_hash_table_t)); ++ SRETURN(NULL); ++ } ++ ++ for (hash = 0; hash < size; hash++) { ++ spin_lock_init(&table->ht_bins[hash].hb_lock); ++ INIT_HLIST_HEAD(&table->ht_bins[hash].hb_head); ++ } ++ ++ spin_lock_init(&table->ht_lock); ++ table->ht_bits = bits; ++ table->ht_key = 1; ++ ++ SRETURN(table); ++} ++ ++/* ++ * tsd_hash_table_fini - free a hash table ++ * @table: hash table ++ * ++ * Free a hash table allocated by tsd_hash_table_init(). If the hash ++ * table is not empty this function will call the proper destructor for ++ * all remaining entries before freeing the memory used by those entries. ++ */ ++static void ++tsd_hash_table_fini(tsd_hash_table_t *table) ++{ ++ HLIST_HEAD(work); ++ tsd_hash_bin_t *bin; ++ tsd_hash_entry_t *entry; ++ int size, i; ++ SENTRY; ++ ++ ASSERT3P(table, !=, NULL); ++ spin_lock(&table->ht_lock); ++ for (i = 0, size = (1 << table->ht_bits); i < size; i++) { ++ bin = &table->ht_bins[i]; ++ spin_lock(&bin->hb_lock); ++ while (!hlist_empty(&bin->hb_head)) { ++ entry = hlist_entry(bin->hb_head.first, ++ tsd_hash_entry_t, he_list); ++ tsd_hash_del(table, entry); ++ hlist_add_head(&entry->he_list, &work); ++ } ++ spin_unlock(&bin->hb_lock); ++ } ++ spin_unlock(&table->ht_lock); ++ ++ tsd_hash_dtor(&work); ++ kmem_free(table->ht_bins, sizeof(tsd_hash_bin_t)*(1<ht_bits)); ++ kmem_free(table, sizeof(tsd_hash_table_t)); ++ ++ SEXIT; ++} ++ ++/* ++ * tsd_set - set thread specific data ++ * @key: lookup key ++ * @value: value to set ++ * ++ * Caller must prevent racing tsd_create() or tsd_destroy(), protected ++ * from racing tsd_get() or tsd_set() because it is thread specific. ++ * This function has been optimized to be fast for the update case. ++ * When setting the tsd initially it will be slower due to additional ++ * required locking and potential memory allocations. ++ */ ++int ++tsd_set(uint_t key, void *value) ++{ ++ tsd_hash_table_t *table; ++ tsd_hash_entry_t *entry; ++ pid_t pid; ++ int rc; ++ SENTRY; ++ ++ table = tsd_hash_table; ++ pid = curthread->pid; ++ ASSERT3P(table, !=, NULL); ++ ++ if ((key == 0) || (key > TSD_KEYS_MAX)) ++ SRETURN(EINVAL); ++ ++ /* Entry already exists in hash table update value */ ++ entry = tsd_hash_search(table, key, pid); ++ if (entry) { ++ entry->he_value = value; ++ SRETURN(0); ++ } ++ ++ /* Add a process entry to the hash if not yet exists */ ++ entry = tsd_hash_search(table, PID_KEY, pid); ++ if (entry == NULL) { ++ rc = tsd_hash_add_pid(table, pid); ++ if (rc) ++ SRETURN(rc); ++ } ++ ++ rc = tsd_hash_add(table, key, pid, value); ++ SRETURN(rc); ++} ++EXPORT_SYMBOL(tsd_set); ++ ++/* ++ * tsd_get - get thread specific data ++ * @key: lookup key ++ * ++ * Caller must prevent racing tsd_create() or tsd_destroy(). This ++ * implementation is designed to be fast and scalable, it does not ++ * lock the entire table only a single hash bin. 
++ */ ++void * ++tsd_get(uint_t key) ++{ ++ tsd_hash_entry_t *entry; ++ SENTRY; ++ ++ ASSERT3P(tsd_hash_table, !=, NULL); ++ ++ if ((key == 0) || (key > TSD_KEYS_MAX)) ++ SRETURN(NULL); ++ ++ entry = tsd_hash_search(tsd_hash_table, key, curthread->pid); ++ if (entry == NULL) ++ SRETURN(NULL); ++ ++ SRETURN(entry->he_value); ++} ++EXPORT_SYMBOL(tsd_get); ++ ++/* ++ * tsd_create - create thread specific data key ++ * @keyp: lookup key address ++ * @dtor: destructor called during tsd_destroy() or tsd_exit() ++ * ++ * Provided key must be set to 0 or it assumed to be already in use. ++ * The dtor is allowed to be NULL in which case no additional cleanup ++ * for the data is performed during tsd_destroy() or tsd_exit(). ++ * ++ * Caller must prevent racing tsd_set() or tsd_get(), this function is ++ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). ++ */ ++void ++tsd_create(uint_t *keyp, dtor_func_t dtor) ++{ ++ SENTRY; ++ ++ ASSERT3P(keyp, !=, NULL); ++ if (*keyp) { ++ SEXIT; ++ return; ++ } ++ ++ (void)tsd_hash_add_key(tsd_hash_table, keyp, dtor); ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(tsd_create); ++ ++/* ++ * tsd_destroy - destroy thread specific data ++ * @keyp: lookup key address ++ * ++ * Destroys the thread specific data on all threads which use this key. ++ * ++ * Caller must prevent racing tsd_set() or tsd_get(), this function is ++ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). ++ */ ++void ++tsd_destroy(uint_t *keyp) ++{ ++ HLIST_HEAD(work); ++ tsd_hash_table_t *table; ++ tsd_hash_entry_t *dtor_entry, *entry; ++ SENTRY; ++ ++ table = tsd_hash_table; ++ ASSERT3P(table, !=, NULL); ++ ++ spin_lock(&table->ht_lock); ++ dtor_entry = tsd_hash_search(table, *keyp, DTOR_PID); ++ if (dtor_entry == NULL) { ++ spin_unlock(&table->ht_lock); ++ SEXIT; ++ return; ++ } ++ ++ /* ++ * All threads which use this key must be linked off of the ++ * DTOR_PID entry. They are removed from the hash table and ++ * linked in to a private working list to be destroyed. ++ */ ++ while (!list_empty(&dtor_entry->he_key_list)) { ++ entry = list_entry(dtor_entry->he_key_list.next, ++ tsd_hash_entry_t, he_key_list); ++ ASSERT3U(dtor_entry->he_key, ==, entry->he_key); ++ ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor); ++ tsd_hash_del(table, entry); ++ hlist_add_head(&entry->he_list, &work); ++ } ++ ++ tsd_hash_del(table, dtor_entry); ++ hlist_add_head(&dtor_entry->he_list, &work); ++ spin_unlock(&table->ht_lock); ++ ++ tsd_hash_dtor(&work); ++ *keyp = 0; ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(tsd_destroy); ++ ++/* ++ * tsd_exit - destroys all thread specific data for this thread ++ * ++ * Destroys all the thread specific data for this thread. ++ * ++ * Caller must prevent racing tsd_set() or tsd_get(), this function is ++ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). ++ */ ++void ++tsd_exit(void) ++{ ++ HLIST_HEAD(work); ++ tsd_hash_table_t *table; ++ tsd_hash_entry_t *pid_entry, *entry; ++ SENTRY; ++ ++ table = tsd_hash_table; ++ ASSERT3P(table, !=, NULL); ++ ++ spin_lock(&table->ht_lock); ++ pid_entry = tsd_hash_search(table, PID_KEY, curthread->pid); ++ if (pid_entry == NULL) { ++ spin_unlock(&table->ht_lock); ++ SEXIT; ++ return; ++ } ++ ++ /* ++ * All keys associated with this pid must be linked off of the ++ * PID_KEY entry. They are removed from the hash table and ++ * linked in to a private working to be destroyed. 
++ */ ++ while (!list_empty(&pid_entry->he_pid_list)) { ++ entry = list_entry(pid_entry->he_pid_list.next, ++ tsd_hash_entry_t, he_pid_list); ++ ASSERT3U(pid_entry->he_pid, ==, entry->he_pid); ++ tsd_hash_del(table, entry); ++ hlist_add_head(&entry->he_list, &work); ++ } ++ ++ tsd_hash_del(table, pid_entry); ++ hlist_add_head(&pid_entry->he_list, &work); ++ spin_unlock(&table->ht_lock); ++ ++ tsd_hash_dtor(&work); ++ ++ SEXIT; ++} ++EXPORT_SYMBOL(tsd_exit); ++ ++int ++spl_tsd_init(void) ++{ ++ SENTRY; ++ ++ tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT); ++ if (tsd_hash_table == NULL) ++ SRETURN(1); ++ ++ SRETURN(0); ++} ++ ++void ++spl_tsd_fini(void) ++{ ++ SENTRY; ++ tsd_hash_table_fini(tsd_hash_table); ++ tsd_hash_table = NULL; ++ SEXIT; ++} +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-vnode.c linux-3.2.33-go/spl/spl/spl-vnode.c +--- linux-3.2.33-go.orig/spl/spl/spl-vnode.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-vnode.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,1047 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) Vnode Implementation. 
++\*****************************************************************************/ ++ ++#include ++#include ++#include ++ ++#ifdef SS_DEBUG_SUBSYS ++#undef SS_DEBUG_SUBSYS ++#endif ++ ++#define SS_DEBUG_SUBSYS SS_VNODE ++ ++vnode_t *rootdir = (vnode_t *)0xabcd1234; ++EXPORT_SYMBOL(rootdir); ++ ++static spl_kmem_cache_t *vn_cache; ++static spl_kmem_cache_t *vn_file_cache; ++ ++static DEFINE_SPINLOCK(vn_file_lock); ++static LIST_HEAD(vn_file_list); ++ ++#ifdef HAVE_KERN_PATH_PARENT_HEADER ++#ifndef HAVE_KERN_PATH_PARENT_SYMBOL ++kern_path_parent_t kern_path_parent_fn = SYMBOL_POISON; ++EXPORT_SYMBOL(kern_path_parent_fn); ++#endif /* HAVE_KERN_PATH_PARENT_SYMBOL */ ++#endif /* HAVE_KERN_PATH_PARENT_HEADER */ ++ ++#ifdef HAVE_KERN_PATH_LOCKED ++kern_path_locked_t kern_path_locked_fn = SYMBOL_POISON; ++#endif /* HAVE_KERN_PATH_LOCKED */ ++ ++vtype_t ++vn_mode_to_vtype(mode_t mode) ++{ ++ if (S_ISREG(mode)) ++ return VREG; ++ ++ if (S_ISDIR(mode)) ++ return VDIR; ++ ++ if (S_ISCHR(mode)) ++ return VCHR; ++ ++ if (S_ISBLK(mode)) ++ return VBLK; ++ ++ if (S_ISFIFO(mode)) ++ return VFIFO; ++ ++ if (S_ISLNK(mode)) ++ return VLNK; ++ ++ if (S_ISSOCK(mode)) ++ return VSOCK; ++ ++ if (S_ISCHR(mode)) ++ return VCHR; ++ ++ return VNON; ++} /* vn_mode_to_vtype() */ ++EXPORT_SYMBOL(vn_mode_to_vtype); ++ ++mode_t ++vn_vtype_to_mode(vtype_t vtype) ++{ ++ if (vtype == VREG) ++ return S_IFREG; ++ ++ if (vtype == VDIR) ++ return S_IFDIR; ++ ++ if (vtype == VCHR) ++ return S_IFCHR; ++ ++ if (vtype == VBLK) ++ return S_IFBLK; ++ ++ if (vtype == VFIFO) ++ return S_IFIFO; ++ ++ if (vtype == VLNK) ++ return S_IFLNK; ++ ++ if (vtype == VSOCK) ++ return S_IFSOCK; ++ ++ return VNON; ++} /* vn_vtype_to_mode() */ ++EXPORT_SYMBOL(vn_vtype_to_mode); ++ ++vnode_t * ++vn_alloc(int flag) ++{ ++ vnode_t *vp; ++ SENTRY; ++ ++ vp = kmem_cache_alloc(vn_cache, flag); ++ if (vp != NULL) { ++ vp->v_file = NULL; ++ vp->v_type = 0; ++ } ++ ++ SRETURN(vp); ++} /* vn_alloc() */ ++EXPORT_SYMBOL(vn_alloc); ++ ++void ++vn_free(vnode_t *vp) ++{ ++ SENTRY; ++ kmem_cache_free(vn_cache, vp); ++ SEXIT; ++} /* vn_free() */ ++EXPORT_SYMBOL(vn_free); ++ ++int ++vn_open(const char *path, uio_seg_t seg, int flags, int mode, ++ vnode_t **vpp, int x1, void *x2) ++{ ++ struct file *fp; ++ struct kstat stat; ++ int rc, saved_umask = 0; ++ gfp_t saved_gfp; ++ vnode_t *vp; ++ SENTRY; ++ ++ ASSERT(flags & (FWRITE | FREAD)); ++ ASSERT(seg == UIO_SYSSPACE); ++ ASSERT(vpp); ++ *vpp = NULL; ++ ++ if (!(flags & FCREAT) && (flags & FWRITE)) ++ flags |= FEXCL; ++ ++ /* Note for filp_open() the two low bits must be remapped to mean: ++ * 01 - read-only -> 00 read-only ++ * 10 - write-only -> 01 write-only ++ * 11 - read-write -> 10 read-write ++ */ ++ flags--; ++ ++ if (flags & FCREAT) ++ saved_umask = xchg(¤t->fs->umask, 0); ++ ++ fp = filp_open(path, flags, mode); ++ ++ if (flags & FCREAT) ++ (void)xchg(¤t->fs->umask, saved_umask); ++ ++ if (IS_ERR(fp)) ++ SRETURN(-PTR_ERR(fp)); ++ ++ rc = vfs_getattr(fp->f_vfsmnt, fp->f_dentry, &stat); ++ if (rc) { ++ filp_close(fp, 0); ++ SRETURN(-rc); ++ } ++ ++ vp = vn_alloc(KM_SLEEP); ++ if (!vp) { ++ filp_close(fp, 0); ++ SRETURN(ENOMEM); ++ } ++ ++ saved_gfp = mapping_gfp_mask(fp->f_mapping); ++ mapping_set_gfp_mask(fp->f_mapping, saved_gfp & ~(__GFP_IO|__GFP_FS)); ++ ++ mutex_enter(&vp->v_lock); ++ vp->v_type = vn_mode_to_vtype(stat.mode); ++ vp->v_file = fp; ++ vp->v_gfp_mask = saved_gfp; ++ *vpp = vp; ++ mutex_exit(&vp->v_lock); ++ ++ SRETURN(0); ++} /* vn_open() */ ++EXPORT_SYMBOL(vn_open); ++ ++int ++vn_openat(const 
char *path, uio_seg_t seg, int flags, int mode, ++ vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd) ++{ ++ char *realpath; ++ int len, rc; ++ SENTRY; ++ ++ ASSERT(vp == rootdir); ++ ++ len = strlen(path) + 2; ++ realpath = kmalloc(len, GFP_KERNEL); ++ if (!realpath) ++ SRETURN(ENOMEM); ++ ++ (void)snprintf(realpath, len, "/%s", path); ++ rc = vn_open(realpath, seg, flags, mode, vpp, x1, x2); ++ kfree(realpath); ++ ++ SRETURN(rc); ++} /* vn_openat() */ ++EXPORT_SYMBOL(vn_openat); ++ ++int ++vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len, offset_t off, ++ uio_seg_t seg, int ioflag, rlim64_t x2, void *x3, ssize_t *residp) ++{ ++ loff_t offset; ++ mm_segment_t saved_fs; ++ struct file *fp; ++ int rc; ++ SENTRY; ++ ++ ASSERT(uio == UIO_WRITE || uio == UIO_READ); ++ ASSERT(vp); ++ ASSERT(vp->v_file); ++ ASSERT(seg == UIO_SYSSPACE); ++ ASSERT((ioflag & ~FAPPEND) == 0); ++ ASSERT(x2 == RLIM64_INFINITY); ++ ++ fp = vp->v_file; ++ ++ offset = off; ++ if (ioflag & FAPPEND) ++ offset = fp->f_pos; ++ ++ /* Writable user data segment must be briefly increased for this ++ * process so we can use the user space read call paths to write ++ * in to memory allocated by the kernel. */ ++ saved_fs = get_fs(); ++ set_fs(get_ds()); ++ ++ if (uio & UIO_WRITE) ++ rc = vfs_write(fp, addr, len, &offset); ++ else ++ rc = vfs_read(fp, addr, len, &offset); ++ ++ set_fs(saved_fs); ++ fp->f_pos = offset; ++ ++ if (rc < 0) ++ SRETURN(-rc); ++ ++ if (residp) { ++ *residp = len - rc; ++ } else { ++ if (rc != len) ++ SRETURN(EIO); ++ } ++ ++ SRETURN(0); ++} /* vn_rdwr() */ ++EXPORT_SYMBOL(vn_rdwr); ++ ++int ++vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4) ++{ ++ int rc; ++ SENTRY; ++ ++ ASSERT(vp); ++ ASSERT(vp->v_file); ++ ++ mapping_set_gfp_mask(vp->v_file->f_mapping, vp->v_gfp_mask); ++ rc = filp_close(vp->v_file, 0); ++ vn_free(vp); ++ ++ SRETURN(-rc); ++} /* vn_close() */ ++EXPORT_SYMBOL(vn_close); ++ ++/* vn_seek() does not actually seek it only performs bounds checking on the ++ * proposed seek. We perform minimal checking and allow vn_rdwr() to catch ++ * anything more serious. */ ++int ++vn_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, void *ct) ++{ ++ return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); ++} ++EXPORT_SYMBOL(vn_seek); ++ ++#ifdef HAVE_KERN_PATH_LOCKED ++/* Based on do_unlinkat() from linux/fs/namei.c */ ++int ++vn_remove(const char *path, uio_seg_t seg, int flags) ++{ ++ struct dentry *dentry; ++ struct path parent; ++ struct inode *inode = NULL; ++ int rc = 0; ++ SENTRY; ++ ++ ASSERT(seg == UIO_SYSSPACE); ++ ASSERT(flags == RMFILE); ++ ++ dentry = spl_kern_path_locked(path, &parent); ++ rc = PTR_ERR(dentry); ++ if (!IS_ERR(dentry)) { ++ if (parent.dentry->d_name.name[parent.dentry->d_name.len]) ++ SGOTO(slashes, rc = 0); ++ ++ inode = dentry->d_inode; ++ if (!inode) ++ SGOTO(slashes, rc = 0); ++ ++ if (inode) ++ ihold(inode); ++ ++ rc = vfs_unlink(parent.dentry->d_inode, dentry); ++exit1: ++ dput(dentry); ++ } ++ ++ spl_inode_unlock(parent.dentry->d_inode); ++ if (inode) ++ iput(inode); /* truncate the inode here */ ++ ++ path_put(&parent); ++ SRETURN(-rc); ++ ++slashes: ++ rc = !dentry->d_inode ? -ENOENT : ++ S_ISDIR(dentry->d_inode->i_mode) ? 
-EISDIR : -ENOTDIR; ++ SGOTO(exit1, rc); ++} /* vn_remove() */ ++EXPORT_SYMBOL(vn_remove); ++ ++/* Based on do_rename() from linux/fs/namei.c */ ++int ++vn_rename(const char *oldname, const char *newname, int x1) ++{ ++ struct dentry *old_dir, *new_dir; ++ struct dentry *old_dentry, *new_dentry; ++ struct dentry *trap; ++ struct path old_parent, new_parent; ++ int rc = 0; ++ SENTRY; ++ ++ old_dentry = spl_kern_path_locked(oldname, &old_parent); ++ if (IS_ERR(old_dentry)) ++ SGOTO(exit, rc = PTR_ERR(old_dentry)); ++ ++ spl_inode_unlock(old_parent.dentry->d_inode); ++ ++ new_dentry = spl_kern_path_locked(newname, &new_parent); ++ if (IS_ERR(new_dentry)) ++ SGOTO(exit2, rc = PTR_ERR(new_dentry)); ++ ++ spl_inode_unlock(new_parent.dentry->d_inode); ++ ++ rc = -EXDEV; ++ if (old_parent.mnt != new_parent.mnt) ++ SGOTO(exit3, rc); ++ ++ old_dir = old_parent.dentry; ++ new_dir = new_parent.dentry; ++ trap = lock_rename(new_dir, old_dir); ++ ++ /* source should not be ancestor of target */ ++ rc = -EINVAL; ++ if (old_dentry == trap) ++ SGOTO(exit4, rc); ++ ++ /* target should not be an ancestor of source */ ++ rc = -ENOTEMPTY; ++ if (new_dentry == trap) ++ SGOTO(exit4, rc); ++ ++ /* source must exist */ ++ rc = -ENOENT; ++ if (!old_dentry->d_inode) ++ SGOTO(exit4, rc); ++ ++ /* unless the source is a directory trailing slashes give -ENOTDIR */ ++ if (!S_ISDIR(old_dentry->d_inode->i_mode)) { ++ rc = -ENOTDIR; ++ if (old_dentry->d_name.name[old_dentry->d_name.len]) ++ SGOTO(exit4, rc); ++ if (new_dentry->d_name.name[new_dentry->d_name.len]) ++ SGOTO(exit4, rc); ++ } ++ ++#ifdef HAVE_4ARGS_VFS_RENAME ++ rc = vfs_rename(old_dir->d_inode, old_dentry, ++ new_dir->d_inode, new_dentry); ++#else ++ rc = vfs_rename(old_dir->d_inode, old_dentry, oldnd.nd_mnt, ++ new_dir->d_inode, new_dentry, newnd.nd_mnt); ++#endif /* HAVE_4ARGS_VFS_RENAME */ ++exit4: ++ unlock_rename(new_dir, old_dir); ++exit3: ++ dput(new_dentry); ++ path_put(&new_parent); ++exit2: ++ dput(old_dentry); ++ path_put(&old_parent); ++exit: ++ SRETURN(-rc); ++} ++EXPORT_SYMBOL(vn_rename); ++ ++#else ++static struct dentry * ++vn_lookup_hash(struct nameidata *nd) ++{ ++ return lookup_one_len((const char *)nd->last.name, ++ nd->nd_dentry, nd->last.len); ++} /* lookup_hash() */ ++ ++static void ++vn_path_release(struct nameidata *nd) ++{ ++ dput(nd->nd_dentry); ++ mntput(nd->nd_mnt); ++} ++ ++/* Modified do_unlinkat() from linux/fs/namei.c, only uses exported symbols */ ++int ++vn_remove(const char *path, uio_seg_t seg, int flags) ++{ ++ struct dentry *dentry; ++ struct nameidata nd; ++ struct inode *inode = NULL; ++ int rc = 0; ++ SENTRY; ++ ++ ASSERT(seg == UIO_SYSSPACE); ++ ASSERT(flags == RMFILE); ++ ++ rc = spl_kern_path_parent(path, &nd); ++ if (rc) ++ SGOTO(exit, rc); ++ ++ rc = -EISDIR; ++ if (nd.last_type != LAST_NORM) ++ SGOTO(exit1, rc); ++ ++ spl_inode_lock_nested(nd.nd_dentry->d_inode, I_MUTEX_PARENT); ++ dentry = vn_lookup_hash(&nd); ++ rc = PTR_ERR(dentry); ++ if (!IS_ERR(dentry)) { ++ /* Why not before? 
Because we want correct rc value */ ++ if (nd.last.name[nd.last.len]) ++ SGOTO(slashes, rc); ++ ++ inode = dentry->d_inode; ++ if (inode) ++ atomic_inc(&inode->i_count); ++#ifdef HAVE_2ARGS_VFS_UNLINK ++ rc = vfs_unlink(nd.nd_dentry->d_inode, dentry); ++#else ++ rc = vfs_unlink(nd.nd_dentry->d_inode, dentry, nd.nd_mnt); ++#endif /* HAVE_2ARGS_VFS_UNLINK */ ++exit2: ++ dput(dentry); ++ } ++ ++ spl_inode_unlock(nd.nd_dentry->d_inode); ++ if (inode) ++ iput(inode); /* truncate the inode here */ ++exit1: ++ vn_path_release(&nd); ++exit: ++ SRETURN(-rc); ++ ++slashes: ++ rc = !dentry->d_inode ? -ENOENT : ++ S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR; ++ SGOTO(exit2, rc); ++} /* vn_remove() */ ++EXPORT_SYMBOL(vn_remove); ++ ++/* Modified do_rename() from linux/fs/namei.c, only uses exported symbols */ ++int ++vn_rename(const char *oldname, const char *newname, int x1) ++{ ++ struct dentry *old_dir, *new_dir; ++ struct dentry *old_dentry, *new_dentry; ++ struct dentry *trap; ++ struct nameidata oldnd, newnd; ++ int rc = 0; ++ SENTRY; ++ ++ rc = spl_kern_path_parent(oldname, &oldnd); ++ if (rc) ++ SGOTO(exit, rc); ++ ++ rc = spl_kern_path_parent(newname, &newnd); ++ if (rc) ++ SGOTO(exit1, rc); ++ ++ rc = -EXDEV; ++ if (oldnd.nd_mnt != newnd.nd_mnt) ++ SGOTO(exit2, rc); ++ ++ old_dir = oldnd.nd_dentry; ++ rc = -EBUSY; ++ if (oldnd.last_type != LAST_NORM) ++ SGOTO(exit2, rc); ++ ++ new_dir = newnd.nd_dentry; ++ if (newnd.last_type != LAST_NORM) ++ SGOTO(exit2, rc); ++ ++ trap = lock_rename(new_dir, old_dir); ++ ++ old_dentry = vn_lookup_hash(&oldnd); ++ ++ rc = PTR_ERR(old_dentry); ++ if (IS_ERR(old_dentry)) ++ SGOTO(exit3, rc); ++ ++ /* source must exist */ ++ rc = -ENOENT; ++ if (!old_dentry->d_inode) ++ SGOTO(exit4, rc); ++ ++ /* unless the source is a directory trailing slashes give -ENOTDIR */ ++ if (!S_ISDIR(old_dentry->d_inode->i_mode)) { ++ rc = -ENOTDIR; ++ if (oldnd.last.name[oldnd.last.len]) ++ SGOTO(exit4, rc); ++ if (newnd.last.name[newnd.last.len]) ++ SGOTO(exit4, rc); ++ } ++ ++ /* source should not be ancestor of target */ ++ rc = -EINVAL; ++ if (old_dentry == trap) ++ SGOTO(exit4, rc); ++ ++ new_dentry = vn_lookup_hash(&newnd); ++ rc = PTR_ERR(new_dentry); ++ if (IS_ERR(new_dentry)) ++ SGOTO(exit4, rc); ++ ++ /* target should not be an ancestor of source */ ++ rc = -ENOTEMPTY; ++ if (new_dentry == trap) ++ SGOTO(exit5, rc); ++ ++#ifdef HAVE_4ARGS_VFS_RENAME ++ rc = vfs_rename(old_dir->d_inode, old_dentry, ++ new_dir->d_inode, new_dentry); ++#else ++ rc = vfs_rename(old_dir->d_inode, old_dentry, oldnd.nd_mnt, ++ new_dir->d_inode, new_dentry, newnd.nd_mnt); ++#endif /* HAVE_4ARGS_VFS_RENAME */ ++exit5: ++ dput(new_dentry); ++exit4: ++ dput(old_dentry); ++exit3: ++ unlock_rename(new_dir, old_dir); ++exit2: ++ vn_path_release(&newnd); ++exit1: ++ vn_path_release(&oldnd); ++exit: ++ SRETURN(-rc); ++} ++EXPORT_SYMBOL(vn_rename); ++#endif /* HAVE_KERN_PATH_LOCKED */ ++ ++int ++vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4) ++{ ++ struct file *fp; ++ struct kstat stat; ++ int rc; ++ SENTRY; ++ ++ ASSERT(vp); ++ ASSERT(vp->v_file); ++ ASSERT(vap); ++ ++ fp = vp->v_file; ++ ++ rc = vfs_getattr(fp->f_vfsmnt, fp->f_dentry, &stat); ++ if (rc) ++ SRETURN(-rc); ++ ++ vap->va_type = vn_mode_to_vtype(stat.mode); ++ vap->va_mode = stat.mode; ++ vap->va_uid = stat.uid; ++ vap->va_gid = stat.gid; ++ vap->va_fsid = 0; ++ vap->va_nodeid = stat.ino; ++ vap->va_nlink = stat.nlink; ++ vap->va_size = stat.size; ++ vap->va_blksize = stat.blksize; ++ vap->va_atime = 
stat.atime; ++ vap->va_mtime = stat.mtime; ++ vap->va_ctime = stat.ctime; ++ vap->va_rdev = stat.rdev; ++ vap->va_nblocks = stat.blocks; ++ ++ SRETURN(0); ++} ++EXPORT_SYMBOL(vn_getattr); ++ ++int vn_fsync(vnode_t *vp, int flags, void *x3, void *x4) ++{ ++ int datasync = 0; ++ SENTRY; ++ ++ ASSERT(vp); ++ ASSERT(vp->v_file); ++ ++ if (flags & FDSYNC) ++ datasync = 1; ++ ++ SRETURN(-spl_filp_fsync(vp->v_file, datasync)); ++} /* vn_fsync() */ ++EXPORT_SYMBOL(vn_fsync); ++ ++int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag, ++ offset_t offset, void *x6, void *x7) ++{ ++ int error = EOPNOTSUPP; ++ SENTRY; ++ ++ if (cmd != F_FREESP || bfp->l_whence != 0) ++ SRETURN(EOPNOTSUPP); ++ ++ ASSERT(vp); ++ ASSERT(vp->v_file); ++ ASSERT(bfp->l_start >= 0 && bfp->l_len > 0); ++ ++#ifdef FALLOC_FL_PUNCH_HOLE ++ if (vp->v_file->f_op->fallocate) { ++ error = -vp->v_file->f_op->fallocate(vp->v_file, ++ FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, ++ bfp->l_start, bfp->l_len); ++ if (!error) ++ SRETURN(0); ++ } ++#endif ++ ++#ifdef HAVE_INODE_TRUNCATE_RANGE ++ if (vp->v_file->f_dentry && vp->v_file->f_dentry->d_inode && ++ vp->v_file->f_dentry->d_inode->i_op && ++ vp->v_file->f_dentry->d_inode->i_op->truncate_range) { ++ off_t end = bfp->l_start + bfp->l_len; ++ /* ++ * Judging from the code in shmem_truncate_range(), ++ * it seems the kernel expects the end offset to be ++ * inclusive and aligned to the end of a page. ++ */ ++ if (end % PAGE_SIZE != 0) { ++ end &= ~(off_t)(PAGE_SIZE - 1); ++ if (end <= bfp->l_start) ++ SRETURN(0); ++ } ++ --end; ++ ++ vp->v_file->f_dentry->d_inode->i_op->truncate_range( ++ vp->v_file->f_dentry->d_inode, ++ bfp->l_start, end ++ ); ++ SRETURN(0); ++ } ++#endif ++ ++ SRETURN(error); ++} ++EXPORT_SYMBOL(vn_space); ++ ++/* Function must be called while holding the vn_file_lock */ ++static file_t * ++file_find(int fd) ++{ ++ file_t *fp; ++ ++ ASSERT(spin_is_locked(&vn_file_lock)); ++ ++ list_for_each_entry(fp, &vn_file_list, f_list) { ++ if (fd == fp->f_fd && fp->f_task == current) { ++ ASSERT(atomic_read(&fp->f_ref) != 0); ++ return fp; ++ } ++ } ++ ++ return NULL; ++} /* file_find() */ ++ ++file_t * ++vn_getf(int fd) ++{ ++ struct kstat stat; ++ struct file *lfp; ++ file_t *fp; ++ vnode_t *vp; ++ int rc = 0; ++ SENTRY; ++ ++ /* Already open just take an extra reference */ ++ spin_lock(&vn_file_lock); ++ ++ fp = file_find(fd); ++ if (fp) { ++ atomic_inc(&fp->f_ref); ++ spin_unlock(&vn_file_lock); ++ SRETURN(fp); ++ } ++ ++ spin_unlock(&vn_file_lock); ++ ++ /* File was not yet opened create the object and setup */ ++ fp = kmem_cache_alloc(vn_file_cache, KM_SLEEP); ++ if (fp == NULL) ++ SGOTO(out, rc); ++ ++ mutex_enter(&fp->f_lock); ++ ++ fp->f_fd = fd; ++ fp->f_task = current; ++ fp->f_offset = 0; ++ atomic_inc(&fp->f_ref); ++ ++ lfp = fget(fd); ++ if (lfp == NULL) ++ SGOTO(out_mutex, rc); ++ ++ vp = vn_alloc(KM_SLEEP); ++ if (vp == NULL) ++ SGOTO(out_fget, rc); ++ ++ if (vfs_getattr(lfp->f_vfsmnt, lfp->f_dentry, &stat)) ++ SGOTO(out_vnode, rc); ++ ++ mutex_enter(&vp->v_lock); ++ vp->v_type = vn_mode_to_vtype(stat.mode); ++ vp->v_file = lfp; ++ mutex_exit(&vp->v_lock); ++ ++ fp->f_vnode = vp; ++ fp->f_file = lfp; ++ ++ /* Put it on the tracking list */ ++ spin_lock(&vn_file_lock); ++ list_add(&fp->f_list, &vn_file_list); ++ spin_unlock(&vn_file_lock); ++ ++ mutex_exit(&fp->f_lock); ++ SRETURN(fp); ++ ++out_vnode: ++ vn_free(vp); ++out_fget: ++ fput(lfp); ++out_mutex: ++ mutex_exit(&fp->f_lock); ++ kmem_cache_free(vn_file_cache, fp); ++out: ++ SRETURN(NULL); ++} /* 
getf() */ ++EXPORT_SYMBOL(getf); ++ ++static void releasef_locked(file_t *fp) ++{ ++ ASSERT(fp->f_file); ++ ASSERT(fp->f_vnode); ++ ++ /* Unlinked from list, no refs, safe to free outside mutex */ ++ fput(fp->f_file); ++ vn_free(fp->f_vnode); ++ ++ kmem_cache_free(vn_file_cache, fp); ++} ++ ++void ++vn_releasef(int fd) ++{ ++ file_t *fp; ++ SENTRY; ++ ++ spin_lock(&vn_file_lock); ++ fp = file_find(fd); ++ if (fp) { ++ atomic_dec(&fp->f_ref); ++ if (atomic_read(&fp->f_ref) > 0) { ++ spin_unlock(&vn_file_lock); ++ SEXIT; ++ return; ++ } ++ ++ list_del(&fp->f_list); ++ releasef_locked(fp); ++ } ++ spin_unlock(&vn_file_lock); ++ ++ SEXIT; ++ return; ++} /* releasef() */ ++EXPORT_SYMBOL(releasef); ++ ++#ifndef HAVE_SET_FS_PWD ++# ifdef HAVE_2ARGS_SET_FS_PWD ++/* Used from 2.6.25 - 2.6.31+ */ ++void ++set_fs_pwd(struct fs_struct *fs, struct path *path) ++{ ++ struct path old_pwd; ++ ++# ifdef HAVE_FS_STRUCT_SPINLOCK ++ spin_lock(&fs->lock); ++ old_pwd = fs->pwd; ++ fs->pwd = *path; ++ path_get(path); ++ spin_unlock(&fs->lock); ++# else ++ write_lock(&fs->lock); ++ old_pwd = fs->pwd; ++ fs->pwd = *path; ++ path_get(path); ++ write_unlock(&fs->lock); ++# endif /* HAVE_FS_STRUCT_SPINLOCK */ ++ ++ if (old_pwd.dentry) ++ path_put(&old_pwd); ++} ++# else ++/* Used from 2.6.11 - 2.6.24 */ ++void ++set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, struct dentry *dentry) ++{ ++ struct dentry *old_pwd; ++ struct vfsmount *old_pwdmnt; ++ ++ write_lock(&fs->lock); ++ old_pwd = fs->pwd; ++ old_pwdmnt = fs->pwdmnt; ++ fs->pwdmnt = mntget(mnt); ++ fs->pwd = dget(dentry); ++ write_unlock(&fs->lock); ++ ++ if (old_pwd) { ++ dput(old_pwd); ++ mntput(old_pwdmnt); ++ } ++} ++# endif /* HAVE_2ARGS_SET_FS_PWD */ ++#endif /* HAVE_SET_FS_PWD */ ++ ++int ++vn_set_pwd(const char *filename) ++{ ++#if defined(HAVE_2ARGS_SET_FS_PWD) && defined(HAVE_USER_PATH_DIR) ++ struct path path; ++#else ++ struct nameidata nd; ++#endif /* HAVE_2ARGS_SET_FS_PWD */ ++ mm_segment_t saved_fs; ++ int rc; ++ SENTRY; ++ ++ /* ++ * user_path_dir() and __user_walk() both expect 'filename' to be ++ * a user space address so we must briefly increase the data segment ++ * size to ensure strncpy_from_user() does not fail with -EFAULT. 
++ */ ++ saved_fs = get_fs(); ++ set_fs(get_ds()); ++ ++#ifdef HAVE_2ARGS_SET_FS_PWD ++# ifdef HAVE_USER_PATH_DIR ++ rc = user_path_dir(filename, &path); ++ if (rc) ++ SGOTO(out, rc); ++ ++ rc = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); ++ if (rc) ++ SGOTO(dput_and_out, rc); ++ ++ set_fs_pwd(current->fs, &path); ++ ++dput_and_out: ++ path_put(&path); ++# else ++ rc = __user_walk(filename, ++ LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_CHDIR, &nd); ++ if (rc) ++ SGOTO(out, rc); ++ ++ rc = vfs_permission(&nd, MAY_EXEC); ++ if (rc) ++ SGOTO(dput_and_out, rc); ++ ++ set_fs_pwd(current->fs, &nd.path); ++ ++dput_and_out: ++ path_put(&nd.path); ++# endif /* HAVE_USER_PATH_DIR */ ++#else ++ rc = __user_walk(filename, ++ LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_CHDIR, &nd); ++ if (rc) ++ SGOTO(out, rc); ++ ++ rc = vfs_permission(&nd, MAY_EXEC); ++ if (rc) ++ SGOTO(dput_and_out, rc); ++ ++ set_fs_pwd(current->fs, nd.nd_mnt, nd.nd_dentry); ++ ++dput_and_out: ++ vn_path_release(&nd); ++#endif /* HAVE_2ARGS_SET_FS_PWD */ ++out: ++ set_fs(saved_fs); ++ ++ SRETURN(-rc); ++} /* vn_set_pwd() */ ++EXPORT_SYMBOL(vn_set_pwd); ++ ++static int ++vn_cache_constructor(void *buf, void *cdrarg, int kmflags) ++{ ++ struct vnode *vp = buf; ++ ++ mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL); ++ ++ return (0); ++} /* vn_cache_constructor() */ ++ ++static void ++vn_cache_destructor(void *buf, void *cdrarg) ++{ ++ struct vnode *vp = buf; ++ ++ mutex_destroy(&vp->v_lock); ++} /* vn_cache_destructor() */ ++ ++static int ++vn_file_cache_constructor(void *buf, void *cdrarg, int kmflags) ++{ ++ file_t *fp = buf; ++ ++ atomic_set(&fp->f_ref, 0); ++ mutex_init(&fp->f_lock, NULL, MUTEX_DEFAULT, NULL); ++ INIT_LIST_HEAD(&fp->f_list); ++ ++ return (0); ++} /* file_cache_constructor() */ ++ ++static void ++vn_file_cache_destructor(void *buf, void *cdrarg) ++{ ++ file_t *fp = buf; ++ ++ mutex_destroy(&fp->f_lock); ++} /* vn_file_cache_destructor() */ ++ ++int spl_vn_init_kallsyms_lookup(void) ++{ ++#ifdef HAVE_KERN_PATH_PARENT_HEADER ++#ifndef HAVE_KERN_PATH_PARENT_SYMBOL ++ kern_path_parent_fn = (kern_path_parent_t) ++ spl_kallsyms_lookup_name("kern_path_parent"); ++ if (!kern_path_parent_fn) { ++ printk(KERN_ERR "Error: Unknown symbol kern_path_parent\n"); ++ return -EFAULT; ++ } ++#endif /* HAVE_KERN_PATH_PARENT_SYMBOL */ ++#endif /* HAVE_KERN_PATH_PARENT_HEADER */ ++ ++#ifdef HAVE_KERN_PATH_LOCKED ++ kern_path_locked_fn = (kern_path_locked_t) ++ spl_kallsyms_lookup_name("kern_path_locked"); ++ if (!kern_path_locked_fn) { ++ printk(KERN_ERR "Error: Unknown symbol kern_path_locked\n"); ++ return -EFAULT; ++ } ++#endif ++ ++ return (0); ++} ++ ++int ++spl_vn_init(void) ++{ ++ SENTRY; ++ vn_cache = kmem_cache_create("spl_vn_cache", ++ sizeof(struct vnode), 64, ++ vn_cache_constructor, ++ vn_cache_destructor, ++ NULL, NULL, NULL, KMC_KMEM); ++ ++ vn_file_cache = kmem_cache_create("spl_vn_file_cache", ++ sizeof(file_t), 64, ++ vn_file_cache_constructor, ++ vn_file_cache_destructor, ++ NULL, NULL, NULL, KMC_KMEM); ++ SRETURN(0); ++} /* vn_init() */ ++ ++void ++spl_vn_fini(void) ++{ ++ file_t *fp, *next_fp; ++ int leaked = 0; ++ SENTRY; ++ ++ spin_lock(&vn_file_lock); ++ ++ list_for_each_entry_safe(fp, next_fp, &vn_file_list, f_list) { ++ list_del(&fp->f_list); ++ releasef_locked(fp); ++ leaked++; ++ } ++ ++ spin_unlock(&vn_file_lock); ++ ++ if (leaked > 0) ++ SWARN("Warning %d files leaked\n", leaked); ++ ++ kmem_cache_destroy(vn_file_cache); ++ kmem_cache_destroy(vn_cache); ++ ++ SEXIT; ++ return; ++} /* vn_fini() */ 
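The vnode shim above is easiest to read from the caller's side. The following is an illustrative sketch only (editor's example, not part of the patch): a hypothetical in-kernel consumer that already holds an open file descriptor for the current task and stats it through the SPL wrappers, assuming the usual SPL vnode headers are in scope.

	/* Hypothetical helper: stat an already-open fd via the SPL vnode layer. */
	static int
	example_stat_fd(int fd, vattr_t *vap)
	{
		file_t *fp;
		int rc;

		/* Find (or create) the tracking entry for this fd and take a reference */
		fp = vn_getf(fd);
		if (fp == NULL)
			return (EBADF);

		/* vn_getattr() returns 0 on success or a positive errno on failure */
		rc = vn_getattr(fp->f_vnode, vap, 0, NULL, NULL);

		/* Drop the reference; the entry is freed on the last release */
		vn_releasef(fd);

		return (rc);
	}

The vn_getf()/vn_releasef() pairing matters because spl_vn_fini() walks vn_file_list at module unload and reports any entry still present as a leaked file.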
+diff -uNr linux-3.2.33-go.orig/spl/spl/spl-xdr.c linux-3.2.33-go/spl/spl/spl-xdr.c +--- linux-3.2.33-go.orig/spl/spl/spl-xdr.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-xdr.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,525 @@ ++/*****************************************************************************\ ++ * Copyright (c) 2008-2010 Sun Microsystems, Inc. ++ * Written by Ricardo Correia ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting Layer (SPL) XDR Implementation. ++\*****************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef SS_DEBUG_SUBSYS ++#undef SS_DEBUG_SUBSYS ++#endif ++ ++#define SS_DEBUG_SUBSYS SS_XDR ++ ++/* ++ * SPL's XDR mem implementation. ++ * ++ * This is used by libnvpair to serialize/deserialize the name-value pair data ++ * structures into byte arrays in a well-defined and portable manner. ++ * ++ * These data structures are used by the DMU/ZFS to flexibly manipulate various ++ * information in memory and later serialize it/deserialize it to disk. ++ * Examples of usages include the pool configuration, lists of pool and dataset ++ * properties, etc. ++ * ++ * Reference documentation for the XDR representation and XDR operations can be ++ * found in RFC 1832 and xdr(3), respectively. ++ * ++ * === Implementation shortcomings === ++ * ++ * It is assumed that the following C types have the following sizes: ++ * ++ * char/unsigned char: 1 byte ++ * short/unsigned short: 2 bytes ++ * int/unsigned int: 4 bytes ++ * longlong_t/u_longlong_t: 8 bytes ++ * ++ * The C standard allows these types to be larger (and in the case of ints, ++ * shorter), so if that is the case on some compiler/architecture, the build ++ * will fail (on purpose). ++ * ++ * If someone wants to fix the code to work properly on such environments, then: ++ * ++ * 1) Preconditions should be added to xdrmem_enc functions to make sure the ++ * caller doesn't pass arguments which exceed the expected range. ++ * 2) Functions which take signed integers should be changed to properly do ++ * sign extension. ++ * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger ++ * problems than this implementation. ++ * ++ * It is also assumed that: ++ * ++ * 1) Chars have 8 bits. ++ * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned ++ * memcpy, memset and memcmp. ++ * 3) Arrays passed to xdr_array() are packed and the compiler/architecture ++ * supports element-sized-aligned memory accesses. ++ * 4) Negative integers are natively stored in two's complement binary ++ * representation. ++ * ++ * No checks are done for the 4 assumptions above, though. 
++ * ++ * === Caller expectations === ++ * ++ * Existing documentation does not describe the semantics of XDR operations very ++ * well. Therefore, some assumptions about failure semantics will be made and ++ * will be described below: ++ * ++ * 1) If any encoding operation fails (e.g., due to lack of buffer space), the ++ * the stream should be considered valid only up to the encoding operation ++ * previous to the one that first failed. However, the stream size as returned ++ * by xdr_control() cannot be considered to be strictly correct (it may be ++ * bigger). ++ * ++ * Putting it another way, if there is an encoding failure it's undefined ++ * whether anything is added to the stream in that operation and therefore ++ * neither xdr_control() nor future encoding operations on the same stream can ++ * be relied upon to produce correct results. ++ * ++ * 2) If a decoding operation fails, it's undefined whether anything will be ++ * decoded into passed buffers/pointers during that operation, or what the ++ * values on those buffers will look like. ++ * ++ * Future decoding operations on the same stream will also have similar ++ * undefined behavior. ++ * ++ * 3) When the first decoding operation fails it is OK to trust the results of ++ * previous decoding operations on the same stream, as long as the caller ++ * expects a failure to be possible (e.g. due to end-of-stream). ++ * ++ * However, this is highly discouraged because the caller should know the ++ * stream size and should be coded to expect any decoding failure to be data ++ * corruption due to hardware, accidental or even malicious causes, which should ++ * be handled gracefully in all cases. ++ * ++ * In very rare situations where there are strong reasons to believe the data ++ * can be trusted to be valid and non-tampered with, then the caller may assume ++ * a decoding failure to be a bug (e.g. due to mismatched data types) and may ++ * fail non-gracefully. ++ * ++ * 4) Non-zero padding bytes will cause the decoding operation to fail. ++ * ++ * 5) Zero bytes on string types will also cause the decoding operation to fail. ++ * ++ * 6) It is assumed that either the pointer to the stream buffer given by the ++ * caller is 32-bit aligned or the architecture supports non-32-bit-aligned int ++ * memory accesses. ++ * ++ * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap. ++ * ++ * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user ++ * space or MMIO space), the computer may explode. 
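++ * ++ * (Editorial illustration, not part of the original header: as a concrete example of rule 4 above, encoding the 3-byte string "abc" with xdr_string() emits a 4-byte big-endian length 0x00000003, the bytes 'a' 'b' 'c', and a single zero pad byte to reach the next 4-byte boundary, 8 bytes in total; a decoder must reject the stream if that pad byte is non-zero.)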
++ */ ++ ++static struct xdr_ops xdrmem_encode_ops; ++static struct xdr_ops xdrmem_decode_ops; ++ ++void ++xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, ++ const enum xdr_op op) ++{ ++ switch (op) { ++ case XDR_ENCODE: ++ xdrs->x_ops = &xdrmem_encode_ops; ++ break; ++ case XDR_DECODE: ++ xdrs->x_ops = &xdrmem_decode_ops; ++ break; ++ default: ++ SWARN("Invalid op value: %d\n", op); ++ xdrs->x_ops = NULL; /* Let the caller know we failed */ ++ return; ++ } ++ ++ xdrs->x_op = op; ++ xdrs->x_addr = addr; ++ xdrs->x_addr_end = addr + size; ++ ++ if (xdrs->x_addr_end < xdrs->x_addr) { ++ SWARN("Overflow while creating xdrmem: %p, %u\n", addr, size); ++ xdrs->x_ops = NULL; ++ } ++} ++EXPORT_SYMBOL(xdrmem_create); ++ ++static bool_t ++xdrmem_control(XDR *xdrs, int req, void *info) ++{ ++ struct xdr_bytesrec *rec = (struct xdr_bytesrec *) info; ++ ++ if (req != XDR_GET_BYTES_AVAIL) { ++ SWARN("Called with unknown request: %d\n", req); ++ return FALSE; ++ } ++ ++ rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */ ++ rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr; ++ ++ return TRUE; ++} ++ ++static bool_t ++xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) ++{ ++ uint_t size = roundup(cnt, 4); ++ uint_t pad; ++ ++ if (size < cnt) ++ return FALSE; /* Integer overflow */ ++ ++ if (xdrs->x_addr > xdrs->x_addr_end) ++ return FALSE; ++ ++ if (xdrs->x_addr_end - xdrs->x_addr < size) ++ return FALSE; ++ ++ memcpy(xdrs->x_addr, cp, cnt); ++ ++ xdrs->x_addr += cnt; ++ ++ pad = size - cnt; ++ if (pad > 0) { ++ memset(xdrs->x_addr, 0, pad); ++ xdrs->x_addr += pad; ++ } ++ ++ return TRUE; ++} ++ ++static bool_t ++xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) ++{ ++ static uint32_t zero = 0; ++ uint_t size = roundup(cnt, 4); ++ uint_t pad; ++ ++ if (size < cnt) ++ return FALSE; /* Integer overflow */ ++ ++ if (xdrs->x_addr > xdrs->x_addr_end) ++ return FALSE; ++ ++ if (xdrs->x_addr_end - xdrs->x_addr < size) ++ return FALSE; ++ ++ memcpy(cp, xdrs->x_addr, cnt); ++ xdrs->x_addr += cnt; ++ ++ pad = size - cnt; ++ if (pad > 0) { ++ /* An inverted memchr() would be useful here... */ ++ if (memcmp(&zero, xdrs->x_addr, pad) != 0) ++ return FALSE; ++ ++ xdrs->x_addr += pad; ++ } ++ ++ return TRUE; ++} ++ ++static bool_t ++xdrmem_enc_uint32(XDR *xdrs, uint32_t val) ++{ ++ if (xdrs->x_addr + sizeof(uint32_t) > xdrs->x_addr_end) ++ return FALSE; ++ ++ *((uint32_t *) xdrs->x_addr) = cpu_to_be32(val); ++ ++ xdrs->x_addr += sizeof(uint32_t); ++ ++ return TRUE; ++} ++ ++static bool_t ++xdrmem_dec_uint32(XDR *xdrs, uint32_t *val) ++{ ++ if (xdrs->x_addr + sizeof(uint32_t) > xdrs->x_addr_end) ++ return FALSE; ++ ++ *val = be32_to_cpu(*((uint32_t *) xdrs->x_addr)); ++ ++ xdrs->x_addr += sizeof(uint32_t); ++ ++ return TRUE; ++} ++ ++static bool_t ++xdrmem_enc_char(XDR *xdrs, char *cp) ++{ ++ uint32_t val; ++ ++ BUILD_BUG_ON(sizeof(char) != 1); ++ val = *((unsigned char *) cp); ++ ++ return xdrmem_enc_uint32(xdrs, val); ++} ++ ++static bool_t ++xdrmem_dec_char(XDR *xdrs, char *cp) ++{ ++ uint32_t val; ++ ++ BUILD_BUG_ON(sizeof(char) != 1); ++ ++ if (!xdrmem_dec_uint32(xdrs, &val)) ++ return FALSE; ++ ++ /* ++ * If any of the 3 other bytes are non-zero then val will be greater ++ * than 0xff and we fail because according to the RFC, this block does ++ * not have a char encoded in it. 
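++ * For example (editorial illustration): the four encoded bytes 00 00 00 41 decode to the char 'A', while 00 00 01 41 yields val 0x141 and is rejected.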
++ */ ++ if (val > 0xff) ++ return FALSE; ++ ++ *((unsigned char *) cp) = val; ++ ++ return TRUE; ++} ++ ++static bool_t ++xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp) ++{ ++ BUILD_BUG_ON(sizeof(unsigned short) != 2); ++ ++ return xdrmem_enc_uint32(xdrs, *usp); ++} ++ ++static bool_t ++xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp) ++{ ++ uint32_t val; ++ ++ BUILD_BUG_ON(sizeof(unsigned short) != 2); ++ ++ if (!xdrmem_dec_uint32(xdrs, &val)) ++ return FALSE; ++ ++ /* ++ * Short ints are not in the RFC, but we assume similar logic as in ++ * xdrmem_dec_char(). ++ */ ++ if (val > 0xffff) ++ return FALSE; ++ ++ *usp = val; ++ ++ return TRUE; ++} ++ ++static bool_t ++xdrmem_enc_uint(XDR *xdrs, unsigned *up) ++{ ++ BUILD_BUG_ON(sizeof(unsigned) != 4); ++ ++ return xdrmem_enc_uint32(xdrs, *up); ++} ++ ++static bool_t ++xdrmem_dec_uint(XDR *xdrs, unsigned *up) ++{ ++ BUILD_BUG_ON(sizeof(unsigned) != 4); ++ ++ return xdrmem_dec_uint32(xdrs, (uint32_t *) up); ++} ++ ++static bool_t ++xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp) ++{ ++ BUILD_BUG_ON(sizeof(u_longlong_t) != 8); ++ ++ if (!xdrmem_enc_uint32(xdrs, *ullp >> 32)) ++ return FALSE; ++ ++ return xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff); ++} ++ ++static bool_t ++xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp) ++{ ++ uint32_t low, high; ++ ++ BUILD_BUG_ON(sizeof(u_longlong_t) != 8); ++ ++ if (!xdrmem_dec_uint32(xdrs, &high)) ++ return FALSE; ++ if (!xdrmem_dec_uint32(xdrs, &low)) ++ return FALSE; ++ ++ *ullp = ((u_longlong_t) high << 32) | low; ++ ++ return TRUE; ++} ++ ++static bool_t ++xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, ++ const uint_t elsize, const xdrproc_t elproc) ++{ ++ uint_t i; ++ caddr_t addr = *arrp; ++ ++ if (*sizep > maxsize || *sizep > UINT_MAX / elsize) ++ return FALSE; ++ ++ if (!xdrmem_enc_uint(xdrs, sizep)) ++ return FALSE; ++ ++ for (i = 0; i < *sizep; i++) { ++ if (!elproc(xdrs, addr)) ++ return FALSE; ++ addr += elsize; ++ } ++ ++ return TRUE; ++} ++ ++static bool_t ++xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, ++ const uint_t elsize, const xdrproc_t elproc) ++{ ++ uint_t i, size; ++ bool_t alloc = FALSE; ++ caddr_t addr; ++ ++ if (!xdrmem_dec_uint(xdrs, sizep)) ++ return FALSE; ++ ++ size = *sizep; ++ ++ if (size > maxsize || size > UINT_MAX / elsize) ++ return FALSE; ++ ++ /* ++ * The Solaris man page says: "If *arrp is NULL when decoding, ++ * xdr_array() allocates memory and *arrp points to it". 
++ */ ++ if (*arrp == NULL) { ++ BUILD_BUG_ON(sizeof(uint_t) > sizeof(size_t)); ++ ++ *arrp = kmem_alloc(size * elsize, KM_NOSLEEP); ++ if (*arrp == NULL) ++ return FALSE; ++ ++ alloc = TRUE; ++ } ++ ++ addr = *arrp; ++ ++ for (i = 0; i < size; i++) { ++ if (!elproc(xdrs, addr)) { ++ if (alloc) ++ kmem_free(*arrp, size * elsize); ++ return FALSE; ++ } ++ addr += elsize; ++ } ++ ++ return TRUE; ++} ++ ++static bool_t ++xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize) ++{ ++ size_t slen = strlen(*sp); ++ uint_t len; ++ ++ if (slen > maxsize) ++ return FALSE; ++ ++ len = slen; ++ ++ if (!xdrmem_enc_uint(xdrs, &len)) ++ return FALSE; ++ ++ return xdrmem_enc_bytes(xdrs, *sp, len); ++} ++ ++static bool_t ++xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize) ++{ ++ uint_t size; ++ bool_t alloc = FALSE; ++ ++ if (!xdrmem_dec_uint(xdrs, &size)) ++ return FALSE; ++ ++ if (size > maxsize || size > UINT_MAX - 1) ++ return FALSE; ++ ++ /* ++ * Solaris man page: "If *sp is NULL when decoding, xdr_string() ++ * allocates memory and *sp points to it". ++ */ ++ if (*sp == NULL) { ++ BUILD_BUG_ON(sizeof(uint_t) > sizeof(size_t)); ++ ++ *sp = kmem_alloc(size + 1, KM_NOSLEEP); ++ if (*sp == NULL) ++ return FALSE; ++ ++ alloc = TRUE; ++ } ++ ++ if (!xdrmem_dec_bytes(xdrs, *sp, size)) ++ goto fail; ++ ++ if (memchr(*sp, 0, size) != NULL) ++ goto fail; ++ ++ (*sp)[size] = '\0'; ++ ++ return TRUE; ++ ++fail: ++ if (alloc) ++ kmem_free(*sp, size + 1); ++ ++ return FALSE; ++} ++ ++static struct xdr_ops xdrmem_encode_ops = { ++ .xdr_control = xdrmem_control, ++ .xdr_char = xdrmem_enc_char, ++ .xdr_u_short = xdrmem_enc_ushort, ++ .xdr_u_int = xdrmem_enc_uint, ++ .xdr_u_longlong_t = xdrmem_enc_ulonglong, ++ .xdr_opaque = xdrmem_enc_bytes, ++ .xdr_string = xdr_enc_string, ++ .xdr_array = xdr_enc_array ++}; ++ ++static struct xdr_ops xdrmem_decode_ops = { ++ .xdr_control = xdrmem_control, ++ .xdr_char = xdrmem_dec_char, ++ .xdr_u_short = xdrmem_dec_ushort, ++ .xdr_u_int = xdrmem_dec_uint, ++ .xdr_u_longlong_t = xdrmem_dec_ulonglong, ++ .xdr_opaque = xdrmem_dec_bytes, ++ .xdr_string = xdr_dec_string, ++ .xdr_array = xdr_dec_array ++}; ++ +diff -uNr linux-3.2.33-go.orig/spl/spl/spl-zlib.c linux-3.2.33-go/spl/spl/spl-zlib.c +--- linux-3.2.33-go.orig/spl/spl/spl-zlib.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/spl/spl-zlib.c 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,225 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++ ***************************************************************************** ++ * z_compress_level/z_uncompress are nearly identical copies of the ++ * compress2/uncompress functions provided by the official zlib package ++ * available at http://zlib.net/. The only changes made we to slightly ++ * adapt the functions called to match the linux kernel implementation ++ * of zlib. The full zlib license follows: ++ * ++ * zlib.h -- interface of the 'zlib' general purpose compression library ++ * version 1.2.5, April 19th, 2010 ++ * ++ * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ * ++ * Jean-loup Gailly ++ * Mark Adler ++\*****************************************************************************/ ++ ++ ++#include ++#include ++#include ++ ++#ifdef DEBUG_SUBSYSTEM ++#undef DEBUG_SUBSYSTEM ++#endif ++ ++#define DEBUG_SUBSYSTEM SS_ZLIB ++ ++static spl_kmem_cache_t *zlib_workspace_cache; ++ ++/* ++ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc ++ * and vfree for every call. Using a kmem_cache also has the advantage ++ * that improves the odds that the memory used will be local to this cpu. ++ * To further improve things it might be wise to create a dedicated per-cpu ++ * workspace for use. This would take some additional care because we then ++ * must disable preemption around the critical section, and verify that ++ * zlib_deflate* and zlib_inflate* never internally call schedule(). ++ */ ++static void * ++zlib_workspace_alloc(int flags) ++{ ++ return kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS)); ++} ++ ++static void ++zlib_workspace_free(void *workspace) ++{ ++ kmem_cache_free(zlib_workspace_cache, workspace); ++} ++ ++/* ++ * Compresses the source buffer into the destination buffer. The level ++ * parameter has the same meaning as in deflateInit. sourceLen is the byte ++ * length of the source buffer. Upon entry, destLen is the total size of the ++ * destination buffer, which must be at least 0.1% larger than sourceLen plus ++ * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer. ++ * ++ * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough ++ * memory, Z_BUF_ERROR if there was not enough room in the output buffer, ++ * Z_STREAM_ERROR if the level parameter is invalid. 
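++ * ++ * Illustrative caller-side sketch (editor's example, not part of the original source): with a source buffer src of slen bytes, size the destination as dlen = slen + (slen >> 9) + 12, allocate it with kmem_alloc(dlen, KM_SLEEP), and call z_compress_level(dst, &dlen, src, slen, level); on Z_OK, dlen has been updated to the compressed length.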
++ */ ++int ++z_compress_level(void *dest, size_t *destLen, const void *source, ++ size_t sourceLen, int level) ++{ ++ z_stream stream; ++ int err; ++ ++ stream.next_in = (Byte *)source; ++ stream.avail_in = (uInt)sourceLen; ++ stream.next_out = dest; ++ stream.avail_out = (uInt)*destLen; ++ ++ if ((size_t)stream.avail_out != *destLen) ++ return Z_BUF_ERROR; ++ ++ stream.workspace = zlib_workspace_alloc(KM_SLEEP); ++ if (!stream.workspace) ++ return Z_MEM_ERROR; ++ ++ err = zlib_deflateInit(&stream, level); ++ if (err != Z_OK) { ++ zlib_workspace_free(stream.workspace); ++ return err; ++ } ++ ++ err = zlib_deflate(&stream, Z_FINISH); ++ if (err != Z_STREAM_END) { ++ zlib_deflateEnd(&stream); ++ zlib_workspace_free(stream.workspace); ++ return err == Z_OK ? Z_BUF_ERROR : err; ++ } ++ *destLen = stream.total_out; ++ ++ err = zlib_deflateEnd(&stream); ++ zlib_workspace_free(stream.workspace); ++ ++ return err; ++} ++EXPORT_SYMBOL(z_compress_level); ++ ++/* ++ * Decompresses the source buffer into the destination buffer. sourceLen is ++ * the byte length of the source buffer. Upon entry, destLen is the total ++ * size of the destination buffer, which must be large enough to hold the ++ * entire uncompressed data. (The size of the uncompressed data must have ++ * been saved previously by the compressor and transmitted to the decompressor ++ * by some mechanism outside the scope of this compression library.) ++ * Upon exit, destLen is the actual size of the compressed buffer. ++ * This function can be used to decompress a whole file at once if the ++ * input file is mmap'ed. ++ * ++ * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not ++ * enough memory, Z_BUF_ERROR if there was not enough room in the output ++ * buffer, or Z_DATA_ERROR if the input data was corrupted. 
++ */ ++int ++z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen) ++{ ++ z_stream stream; ++ int err; ++ ++ stream.next_in = (Byte *)source; ++ stream.avail_in = (uInt)sourceLen; ++ stream.next_out = dest; ++ stream.avail_out = (uInt)*destLen; ++ ++ if ((size_t)stream.avail_out != *destLen) ++ return Z_BUF_ERROR; ++ ++ stream.workspace = zlib_workspace_alloc(KM_SLEEP); ++ if (!stream.workspace) ++ return Z_MEM_ERROR; ++ ++ err = zlib_inflateInit(&stream); ++ if (err != Z_OK) { ++ zlib_workspace_free(stream.workspace); ++ return err; ++ } ++ ++ err = zlib_inflate(&stream, Z_FINISH); ++ if (err != Z_STREAM_END) { ++ zlib_inflateEnd(&stream); ++ zlib_workspace_free(stream.workspace); ++ ++ if (err == Z_NEED_DICT || ++ (err == Z_BUF_ERROR && stream.avail_in == 0)) ++ return Z_DATA_ERROR; ++ ++ return err; ++ } ++ *destLen = stream.total_out; ++ ++ err = zlib_inflateEnd(&stream); ++ zlib_workspace_free(stream.workspace); ++ ++ return err; ++} ++EXPORT_SYMBOL(z_uncompress); ++ ++int ++spl_zlib_init(void) ++{ ++ int size; ++ SENTRY; ++ ++ size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), ++ zlib_inflate_workspacesize()); ++ ++ zlib_workspace_cache = kmem_cache_create( ++ "spl_zlib_workspace_cache", ++ size, 0, NULL, NULL, NULL, NULL, NULL, ++ KMC_VMEM | KMC_NOEMERGENCY); ++ if (!zlib_workspace_cache) ++ SRETURN(1); ++ ++ SRETURN(0); ++} ++ ++void ++spl_zlib_fini(void) ++{ ++ SENTRY; ++ kmem_cache_destroy(zlib_workspace_cache); ++ zlib_workspace_cache = NULL; ++ SEXIT; ++} +diff -uNr linux-3.2.33-go.orig/spl/splat/Makefile linux-3.2.33-go/spl/splat/Makefile +--- linux-3.2.33-go.orig/spl/splat/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/Makefile 2012-11-16 23:22:32.419192759 +0100 +@@ -0,0 +1,25 @@ ++# Makefile.in for splat kernel module ++ ++MODULE := splat ++EXTRA_CFLAGS = $(SPL_MODULE_CFLAGS) -DHAVE_GPL_ONLY_SYMBOLS -Wstrict-prototypes -DNDEBUG -DDEBUG_LOG -DDEBUG_KMEM ++ ++# Solaris Porting LAyer Tests ++obj-$(CONFIG_SPL) := $(MODULE).o ++ ++$(MODULE)-objs += splat-ctl.o ++$(MODULE)-objs += splat-kmem.o ++$(MODULE)-objs += splat-taskq.o ++$(MODULE)-objs += splat-random.o ++$(MODULE)-objs += splat-mutex.o ++$(MODULE)-objs += splat-condvar.o ++$(MODULE)-objs += splat-thread.o ++$(MODULE)-objs += splat-rwlock.o ++$(MODULE)-objs += splat-time.o ++$(MODULE)-objs += splat-vnode.o ++$(MODULE)-objs += splat-kobj.o ++$(MODULE)-objs += splat-atomic.o ++$(MODULE)-objs += splat-list.o ++$(MODULE)-objs += splat-generic.o ++$(MODULE)-objs += splat-cred.o ++$(MODULE)-objs += splat-zlib.o ++$(MODULE)-objs += splat-linux.o +diff -uNr linux-3.2.33-go.orig/spl/splat/Makefile.in linux-3.2.33-go/spl/splat/Makefile.in +--- linux-3.2.33-go.orig/spl/splat/Makefile.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/Makefile.in 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,25 @@ ++# Makefile.in for splat kernel module ++ ++MODULE := splat ++EXTRA_CFLAGS = $(SPL_MODULE_CFLAGS) @KERNELCPPFLAGS@ ++ ++# Solaris Porting LAyer Tests ++obj-$(CONFIG_SPL) := $(MODULE).o ++ ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-ctl.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-kmem.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-taskq.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-random.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-mutex.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-condvar.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-thread.o ++$(MODULE)-objs += 
@top_srcdir@/module/splat/splat-rwlock.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-time.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-vnode.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-kobj.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-atomic.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-list.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-generic.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-cred.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-zlib.o ++$(MODULE)-objs += @top_srcdir@/module/splat/splat-linux.o +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-atomic.c linux-3.2.33-go/spl/splat/splat-atomic.c +--- linux-3.2.33-go.orig/spl/splat/splat-atomic.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-atomic.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,227 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Atomic Tests. 
++\*****************************************************************************/ ++ ++#include ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_ATOMIC_NAME "atomic" ++#define SPLAT_ATOMIC_DESC "Kernel Atomic Tests" ++ ++#define SPLAT_ATOMIC_TEST1_ID 0x0b01 ++#define SPLAT_ATOMIC_TEST1_NAME "64-bit" ++#define SPLAT_ATOMIC_TEST1_DESC "Validate 64-bit atomic ops" ++ ++#define SPLAT_ATOMIC_TEST_MAGIC 0x43435454UL ++#define SPLAT_ATOMIC_INIT_VALUE 10000000UL ++ ++typedef enum { ++ SPLAT_ATOMIC_INC_64 = 0, ++ SPLAT_ATOMIC_DEC_64 = 1, ++ SPLAT_ATOMIC_ADD_64 = 2, ++ SPLAT_ATOMIC_SUB_64 = 3, ++ SPLAT_ATOMIC_ADD_64_NV = 4, ++ SPLAT_ATOMIC_SUB_64_NV = 5, ++ SPLAT_ATOMIC_COUNT_64 = 6 ++} atomic_op_t; ++ ++typedef struct atomic_priv { ++ unsigned long ap_magic; ++ struct file *ap_file; ++ struct mutex ap_lock; ++ wait_queue_head_t ap_waitq; ++ volatile uint64_t ap_atomic; ++ volatile uint64_t ap_atomic_exited; ++ atomic_op_t ap_op; ++ ++} atomic_priv_t; ++ ++static void ++splat_atomic_work(void *priv) ++{ ++ atomic_priv_t *ap; ++ atomic_op_t op; ++ int i; ++ ++ ap = (atomic_priv_t *)priv; ++ ASSERT(ap->ap_magic == SPLAT_ATOMIC_TEST_MAGIC); ++ ++ mutex_lock(&ap->ap_lock); ++ op = ap->ap_op; ++ wake_up(&ap->ap_waitq); ++ mutex_unlock(&ap->ap_lock); ++ ++ splat_vprint(ap->ap_file, SPLAT_ATOMIC_TEST1_NAME, ++ "Thread %d successfully started: %lu/%lu\n", op, ++ (long unsigned)ap->ap_atomic, ++ (long unsigned)ap->ap_atomic_exited); ++ ++ for (i = 0; i < SPLAT_ATOMIC_INIT_VALUE / 10; i++) { ++ ++ /* Periodically sleep to mix up the ordering */ ++ if ((i % (SPLAT_ATOMIC_INIT_VALUE / 100)) == 0) { ++ splat_vprint(ap->ap_file, SPLAT_ATOMIC_TEST1_NAME, ++ "Thread %d sleeping: %lu/%lu\n", op, ++ (long unsigned)ap->ap_atomic, ++ (long unsigned)ap->ap_atomic_exited); ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(HZ / 100); ++ } ++ ++ switch (op) { ++ case SPLAT_ATOMIC_INC_64: ++ atomic_inc_64(&ap->ap_atomic); ++ break; ++ case SPLAT_ATOMIC_DEC_64: ++ atomic_dec_64(&ap->ap_atomic); ++ break; ++ case SPLAT_ATOMIC_ADD_64: ++ atomic_add_64(&ap->ap_atomic, 3); ++ break; ++ case SPLAT_ATOMIC_SUB_64: ++ atomic_sub_64(&ap->ap_atomic, 3); ++ break; ++ case SPLAT_ATOMIC_ADD_64_NV: ++ atomic_add_64_nv(&ap->ap_atomic, 5); ++ break; ++ case SPLAT_ATOMIC_SUB_64_NV: ++ atomic_sub_64_nv(&ap->ap_atomic, 5); ++ break; ++ default: ++ PANIC("Undefined op %d\n", op); ++ } ++ } ++ ++ atomic_inc_64(&ap->ap_atomic_exited); ++ ++ splat_vprint(ap->ap_file, SPLAT_ATOMIC_TEST1_NAME, ++ "Thread %d successfully exited: %lu/%lu\n", op, ++ (long unsigned)ap->ap_atomic, ++ (long unsigned)ap->ap_atomic_exited); ++ ++ wake_up(&ap->ap_waitq); ++ thread_exit(); ++} ++ ++static int ++splat_atomic_test1_cond(atomic_priv_t *ap, int started) ++{ ++ return (ap->ap_atomic_exited == started); ++} ++ ++static int ++splat_atomic_test1(struct file *file, void *arg) ++{ ++ atomic_priv_t ap; ++ DEFINE_WAIT(wait); ++ kthread_t *thr; ++ int i, rc = 0; ++ ++ ap.ap_magic = SPLAT_ATOMIC_TEST_MAGIC; ++ ap.ap_file = file; ++ mutex_init(&ap.ap_lock); ++ init_waitqueue_head(&ap.ap_waitq); ++ ap.ap_atomic = SPLAT_ATOMIC_INIT_VALUE; ++ ap.ap_atomic_exited = 0; ++ ++ for (i = 0; i < SPLAT_ATOMIC_COUNT_64; i++) { ++ mutex_lock(&ap.ap_lock); ++ ap.ap_op = i; ++ ++ thr = (kthread_t *)thread_create(NULL, 0, splat_atomic_work, ++ &ap, 0, &p0, TS_RUN, ++ minclsyspri); ++ if (thr == NULL) { ++ rc = -ESRCH; ++ mutex_unlock(&ap.ap_lock); ++ break; ++ } ++ ++ /* Prepare to wait, the new thread will wake us once it ++ * has made a copy of the unique private 
passed data */ ++ prepare_to_wait(&ap.ap_waitq, &wait, TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&ap.ap_lock); ++ schedule(); ++ } ++ ++ wait_event(ap.ap_waitq, splat_atomic_test1_cond(&ap, i)); ++ ++ if (rc) { ++ splat_vprint(file, SPLAT_ATOMIC_TEST1_NAME, "Only started " ++ "%d/%d test threads\n", i, SPLAT_ATOMIC_COUNT_64); ++ return rc; ++ } ++ ++ if (ap.ap_atomic != SPLAT_ATOMIC_INIT_VALUE) { ++ splat_vprint(file, SPLAT_ATOMIC_TEST1_NAME, ++ "Final value %lu does not match initial value %lu\n", ++ (long unsigned)ap.ap_atomic, SPLAT_ATOMIC_INIT_VALUE); ++ return -EINVAL; ++ } ++ ++ splat_vprint(file, SPLAT_ATOMIC_TEST1_NAME, ++ "Success initial and final values match, %lu == %lu\n", ++ (long unsigned)ap.ap_atomic, SPLAT_ATOMIC_INIT_VALUE); ++ ++ return 0; ++} ++ ++splat_subsystem_t * ++splat_atomic_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_ATOMIC_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_ATOMIC_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_ATOMIC; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_ATOMIC_TEST1_NAME, SPLAT_ATOMIC_TEST1_DESC, ++ SPLAT_ATOMIC_TEST1_ID, splat_atomic_test1); ++ ++ return sub; ++} ++ ++void ++splat_atomic_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ SPLAT_TEST_FINI(sub, SPLAT_ATOMIC_TEST1_ID); ++ ++ kfree(sub); ++} ++ ++int ++splat_atomic_id(void) { ++ return SPLAT_SUBSYSTEM_ATOMIC; ++} +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-condvar.c linux-3.2.33-go/spl/splat/splat-condvar.c +--- linux-3.2.33-go.orig/spl/splat/splat-condvar.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-condvar.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,479 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Condition Variable Tests. 
++\*****************************************************************************/ ++ ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_CONDVAR_NAME "condvar" ++#define SPLAT_CONDVAR_DESC "Kernel Condition Variable Tests" ++ ++#define SPLAT_CONDVAR_TEST1_ID 0x0501 ++#define SPLAT_CONDVAR_TEST1_NAME "signal1" ++#define SPLAT_CONDVAR_TEST1_DESC "Wake a single thread, cv_wait()/cv_signal()" ++ ++#define SPLAT_CONDVAR_TEST2_ID 0x0502 ++#define SPLAT_CONDVAR_TEST2_NAME "broadcast1" ++#define SPLAT_CONDVAR_TEST2_DESC "Wake all threads, cv_wait()/cv_broadcast()" ++ ++#define SPLAT_CONDVAR_TEST3_ID 0x0503 ++#define SPLAT_CONDVAR_TEST3_NAME "signal2" ++#define SPLAT_CONDVAR_TEST3_DESC "Wake a single thread, cv_wait_timeout()/cv_signal()" ++ ++#define SPLAT_CONDVAR_TEST4_ID 0x0504 ++#define SPLAT_CONDVAR_TEST4_NAME "broadcast2" ++#define SPLAT_CONDVAR_TEST4_DESC "Wake all threads, cv_wait_timeout()/cv_broadcast()" ++ ++#define SPLAT_CONDVAR_TEST5_ID 0x0505 ++#define SPLAT_CONDVAR_TEST5_NAME "timeout" ++#define SPLAT_CONDVAR_TEST5_DESC "Timeout thread, cv_wait_timeout()" ++ ++#define SPLAT_CONDVAR_TEST_MAGIC 0x115599DDUL ++#define SPLAT_CONDVAR_TEST_NAME "condvar_test" ++#define SPLAT_CONDVAR_TEST_COUNT 8 ++ ++typedef struct condvar_priv { ++ unsigned long cv_magic; ++ struct file *cv_file; ++ kcondvar_t cv_condvar; ++ kmutex_t cv_mtx; ++} condvar_priv_t; ++ ++typedef struct condvar_thr { ++ int ct_id; ++ const char *ct_name; ++ condvar_priv_t *ct_cvp; ++ int ct_rc; ++} condvar_thr_t; ++ ++int ++splat_condvar_test12_thread(void *arg) ++{ ++ condvar_thr_t *ct = (condvar_thr_t *)arg; ++ condvar_priv_t *cv = ct->ct_cvp; ++ char name[16]; ++ ++ ASSERT(cv->cv_magic == SPLAT_CONDVAR_TEST_MAGIC); ++ snprintf(name, sizeof(name),"%s%d",SPLAT_CONDVAR_TEST_NAME,ct->ct_id); ++ daemonize(name); ++ ++ mutex_enter(&cv->cv_mtx); ++ splat_vprint(cv->cv_file, ct->ct_name, ++ "%s thread sleeping with %d waiters\n", ++ name, atomic_read(&cv->cv_condvar.cv_waiters)); ++ cv_wait(&cv->cv_condvar, &cv->cv_mtx); ++ splat_vprint(cv->cv_file, ct->ct_name, ++ "%s thread woken %d waiters remain\n", ++ name, atomic_read(&cv->cv_condvar.cv_waiters)); ++ mutex_exit(&cv->cv_mtx); ++ ++ return 0; ++} ++ ++static int ++splat_condvar_test1(struct file *file, void *arg) ++{ ++ int i, count = 0, rc = 0; ++ long pids[SPLAT_CONDVAR_TEST_COUNT]; ++ condvar_thr_t ct[SPLAT_CONDVAR_TEST_COUNT]; ++ condvar_priv_t cv; ++ ++ cv.cv_magic = SPLAT_CONDVAR_TEST_MAGIC; ++ cv.cv_file = file; ++ mutex_init(&cv.cv_mtx, SPLAT_CONDVAR_TEST_NAME, MUTEX_DEFAULT, NULL); ++ cv_init(&cv.cv_condvar, NULL, CV_DEFAULT, NULL); ++ ++ /* Create some threads, the exact number isn't important just as ++ * long as we know how many we managed to create and should expect. 
*/ ++ for (i = 0; i < SPLAT_CONDVAR_TEST_COUNT; i++) { ++ ct[i].ct_cvp = &cv; ++ ct[i].ct_id = i; ++ ct[i].ct_name = SPLAT_CONDVAR_TEST1_NAME; ++ ct[i].ct_rc = 0; ++ ++ pids[i] = kernel_thread(splat_condvar_test12_thread, &ct[i], 0); ++ if (pids[i] >= 0) ++ count++; ++ } ++ ++ /* Wait until all threads are waiting on the condition variable */ ++ while (atomic_read(&cv.cv_condvar.cv_waiters) != count) ++ schedule(); ++ ++ /* Wake a single thread at a time, wait until it exits */ ++ for (i = 1; i <= count; i++) { ++ cv_signal(&cv.cv_condvar); ++ ++ while (atomic_read(&cv.cv_condvar.cv_waiters) > (count - i)) ++ schedule(); ++ ++ /* Correct behavior 1 thread woken */ ++ if (atomic_read(&cv.cv_condvar.cv_waiters) == (count - i)) ++ continue; ++ ++ splat_vprint(file, SPLAT_CONDVAR_TEST1_NAME, "Attempted to " ++ "wake %d thread but work %d threads woke\n", ++ 1, count - atomic_read(&cv.cv_condvar.cv_waiters)); ++ rc = -EINVAL; ++ break; ++ } ++ ++ if (!rc) ++ splat_vprint(file, SPLAT_CONDVAR_TEST1_NAME, "Correctly woke " ++ "%d sleeping threads %d at a time\n", count, 1); ++ ++ /* Wait until that last nutex is dropped */ ++ while (mutex_owner(&cv.cv_mtx)) ++ schedule(); ++ ++ /* Wake everything for the failure case */ ++ cv_broadcast(&cv.cv_condvar); ++ cv_destroy(&cv.cv_condvar); ++ mutex_destroy(&cv.cv_mtx); ++ ++ return rc; ++} ++ ++static int ++splat_condvar_test2(struct file *file, void *arg) ++{ ++ int i, count = 0, rc = 0; ++ long pids[SPLAT_CONDVAR_TEST_COUNT]; ++ condvar_thr_t ct[SPLAT_CONDVAR_TEST_COUNT]; ++ condvar_priv_t cv; ++ ++ cv.cv_magic = SPLAT_CONDVAR_TEST_MAGIC; ++ cv.cv_file = file; ++ mutex_init(&cv.cv_mtx, SPLAT_CONDVAR_TEST_NAME, MUTEX_DEFAULT, NULL); ++ cv_init(&cv.cv_condvar, NULL, CV_DEFAULT, NULL); ++ ++ /* Create some threads, the exact number isn't important just as ++ * long as we know how many we managed to create and should expect. */ ++ for (i = 0; i < SPLAT_CONDVAR_TEST_COUNT; i++) { ++ ct[i].ct_cvp = &cv; ++ ct[i].ct_id = i; ++ ct[i].ct_name = SPLAT_CONDVAR_TEST2_NAME; ++ ct[i].ct_rc = 0; ++ ++ pids[i] = kernel_thread(splat_condvar_test12_thread, &ct[i], 0); ++ if (pids[i] > 0) ++ count++; ++ } ++ ++ /* Wait until all threads are waiting on the condition variable */ ++ while (atomic_read(&cv.cv_condvar.cv_waiters) != count) ++ schedule(); ++ ++ /* Wake all threads waiting on the condition variable */ ++ cv_broadcast(&cv.cv_condvar); ++ ++ /* Wait until all threads have exited */ ++ while ((atomic_read(&cv.cv_condvar.cv_waiters) > 0) || mutex_owner(&cv.cv_mtx)) ++ schedule(); ++ ++ splat_vprint(file, SPLAT_CONDVAR_TEST2_NAME, "Correctly woke all " ++ "%d sleeping threads at once\n", count); ++ ++ /* Wake everything for the failure case */ ++ cv_destroy(&cv.cv_condvar); ++ mutex_destroy(&cv.cv_mtx); ++ ++ return rc; ++} ++ ++int ++splat_condvar_test34_thread(void *arg) ++{ ++ condvar_thr_t *ct = (condvar_thr_t *)arg; ++ condvar_priv_t *cv = ct->ct_cvp; ++ char name[16]; ++ clock_t rc; ++ ++ ASSERT(cv->cv_magic == SPLAT_CONDVAR_TEST_MAGIC); ++ snprintf(name, sizeof(name), "%s%d", SPLAT_CONDVAR_TEST_NAME, ct->ct_id); ++ daemonize(name); ++ ++ mutex_enter(&cv->cv_mtx); ++ splat_vprint(cv->cv_file, ct->ct_name, ++ "%s thread sleeping with %d waiters\n", ++ name, atomic_read(&cv->cv_condvar.cv_waiters)); ++ ++ /* Sleep no longer than 3 seconds, for this test we should ++ * actually never sleep that long without being woken up. 
*/ ++ rc = cv_timedwait(&cv->cv_condvar, &cv->cv_mtx, lbolt + HZ * 3); ++ if (rc == -1) { ++ ct->ct_rc = -ETIMEDOUT; ++ splat_vprint(cv->cv_file, ct->ct_name, "%s thread timed out, " ++ "should have been woken\n", name); ++ } else { ++ splat_vprint(cv->cv_file, ct->ct_name, ++ "%s thread woken %d waiters remain\n", ++ name, atomic_read(&cv->cv_condvar.cv_waiters)); ++ } ++ ++ mutex_exit(&cv->cv_mtx); ++ ++ return 0; ++} ++ ++static int ++splat_condvar_test3(struct file *file, void *arg) ++{ ++ int i, count = 0, rc = 0; ++ long pids[SPLAT_CONDVAR_TEST_COUNT]; ++ condvar_thr_t ct[SPLAT_CONDVAR_TEST_COUNT]; ++ condvar_priv_t cv; ++ ++ cv.cv_magic = SPLAT_CONDVAR_TEST_MAGIC; ++ cv.cv_file = file; ++ mutex_init(&cv.cv_mtx, SPLAT_CONDVAR_TEST_NAME, MUTEX_DEFAULT, NULL); ++ cv_init(&cv.cv_condvar, NULL, CV_DEFAULT, NULL); ++ ++ /* Create some threads, the exact number isn't important just as ++ * long as we know how many we managed to create and should expect. */ ++ for (i = 0; i < SPLAT_CONDVAR_TEST_COUNT; i++) { ++ ct[i].ct_cvp = &cv; ++ ct[i].ct_id = i; ++ ct[i].ct_name = SPLAT_CONDVAR_TEST3_NAME; ++ ct[i].ct_rc = 0; ++ ++ pids[i] = kernel_thread(splat_condvar_test34_thread, &ct[i], 0); ++ if (pids[i] >= 0) ++ count++; ++ } ++ ++ /* Wait until all threads are waiting on the condition variable */ ++ while (atomic_read(&cv.cv_condvar.cv_waiters) != count) ++ schedule(); ++ ++ /* Wake a single thread at a time, wait until it exits */ ++ for (i = 1; i <= count; i++) { ++ cv_signal(&cv.cv_condvar); ++ ++ while (atomic_read(&cv.cv_condvar.cv_waiters) > (count - i)) ++ schedule(); ++ ++ /* Correct behavior 1 thread woken */ ++ if (atomic_read(&cv.cv_condvar.cv_waiters) == (count - i)) ++ continue; ++ ++ splat_vprint(file, SPLAT_CONDVAR_TEST3_NAME, "Attempted to " ++ "wake %d thread but work %d threads woke\n", ++ 1, count - atomic_read(&cv.cv_condvar.cv_waiters)); ++ rc = -EINVAL; ++ break; ++ } ++ ++ /* Validate no waiting thread timed out early */ ++ for (i = 0; i < count; i++) ++ if (ct[i].ct_rc) ++ rc = ct[i].ct_rc; ++ ++ if (!rc) ++ splat_vprint(file, SPLAT_CONDVAR_TEST3_NAME, "Correctly woke " ++ "%d sleeping threads %d at a time\n", count, 1); ++ ++ /* Wait until that last nutex is dropped */ ++ while (mutex_owner(&cv.cv_mtx)) ++ schedule(); ++ ++ /* Wake everything for the failure case */ ++ cv_broadcast(&cv.cv_condvar); ++ cv_destroy(&cv.cv_condvar); ++ mutex_destroy(&cv.cv_mtx); ++ ++ return rc; ++} ++ ++static int ++splat_condvar_test4(struct file *file, void *arg) ++{ ++ int i, count = 0, rc = 0; ++ long pids[SPLAT_CONDVAR_TEST_COUNT]; ++ condvar_thr_t ct[SPLAT_CONDVAR_TEST_COUNT]; ++ condvar_priv_t cv; ++ ++ cv.cv_magic = SPLAT_CONDVAR_TEST_MAGIC; ++ cv.cv_file = file; ++ mutex_init(&cv.cv_mtx, SPLAT_CONDVAR_TEST_NAME, MUTEX_DEFAULT, NULL); ++ cv_init(&cv.cv_condvar, NULL, CV_DEFAULT, NULL); ++ ++ /* Create some threads, the exact number isn't important just as ++ * long as we know how many we managed to create and should expect. 
*/ ++ for (i = 0; i < SPLAT_CONDVAR_TEST_COUNT; i++) { ++ ct[i].ct_cvp = &cv; ++ ct[i].ct_id = i; ++ ct[i].ct_name = SPLAT_CONDVAR_TEST3_NAME; ++ ct[i].ct_rc = 0; ++ ++ pids[i] = kernel_thread(splat_condvar_test34_thread, &ct[i], 0); ++ if (pids[i] >= 0) ++ count++; ++ } ++ ++ /* Wait until all threads are waiting on the condition variable */ ++ while (atomic_read(&cv.cv_condvar.cv_waiters) != count) ++ schedule(); ++ ++ /* Wake a single thread at a time, wait until it exits */ ++ for (i = 1; i <= count; i++) { ++ cv_signal(&cv.cv_condvar); ++ ++ while (atomic_read(&cv.cv_condvar.cv_waiters) > (count - i)) ++ schedule(); ++ ++ /* Correct behavior 1 thread woken */ ++ if (atomic_read(&cv.cv_condvar.cv_waiters) == (count - i)) ++ continue; ++ ++ splat_vprint(file, SPLAT_CONDVAR_TEST3_NAME, "Attempted to " ++ "wake %d thread but work %d threads woke\n", ++ 1, count - atomic_read(&cv.cv_condvar.cv_waiters)); ++ rc = -EINVAL; ++ break; ++ } ++ ++ /* Validate no waiting thread timed out early */ ++ for (i = 0; i < count; i++) ++ if (ct[i].ct_rc) ++ rc = ct[i].ct_rc; ++ ++ if (!rc) ++ splat_vprint(file, SPLAT_CONDVAR_TEST3_NAME, "Correctly woke " ++ "%d sleeping threads %d at a time\n", count, 1); ++ ++ /* Wait until that last nutex is dropped */ ++ while (mutex_owner(&cv.cv_mtx)) ++ schedule(); ++ ++ /* Wake everything for the failure case */ ++ cv_broadcast(&cv.cv_condvar); ++ cv_destroy(&cv.cv_condvar); ++ mutex_destroy(&cv.cv_mtx); ++ ++ return rc; ++} ++ ++static int ++splat_condvar_test5(struct file *file, void *arg) ++{ ++ kcondvar_t condvar; ++ kmutex_t mtx; ++ clock_t time_left, time_before, time_after, time_delta; ++ int64_t whole_delta; ++ int32_t remain_delta; ++ int rc = 0; ++ ++ mutex_init(&mtx, SPLAT_CONDVAR_TEST_NAME, MUTEX_DEFAULT, NULL); ++ cv_init(&condvar, NULL, CV_DEFAULT, NULL); ++ ++ splat_vprint(file, SPLAT_CONDVAR_TEST5_NAME, "Thread going to sleep for " ++ "%d second and expecting to be woken by timeout\n", 1); ++ ++ /* Allow a 1 second timeout, plenty long to validate correctness. 
*/ ++ time_before = lbolt; ++ mutex_enter(&mtx); ++ time_left = cv_timedwait(&condvar, &mtx, lbolt + HZ); ++ mutex_exit(&mtx); ++ time_after = lbolt; ++ time_delta = time_after - time_before; /* XXX - Handle jiffie wrap */ ++ whole_delta = time_delta; ++ remain_delta = do_div(whole_delta, HZ); ++ ++ if (time_left == -1) { ++ if (time_delta >= HZ) { ++ splat_vprint(file, SPLAT_CONDVAR_TEST5_NAME, ++ "Thread correctly timed out and was asleep " ++ "for %d.%d seconds (%d second min)\n", ++ (int)whole_delta, remain_delta, 1); ++ } else { ++ splat_vprint(file, SPLAT_CONDVAR_TEST5_NAME, ++ "Thread correctly timed out but was only " ++ "asleep for %d.%d seconds (%d second " ++ "min)\n", (int)whole_delta, remain_delta, 1); ++ rc = -ETIMEDOUT; ++ } ++ } else { ++ splat_vprint(file, SPLAT_CONDVAR_TEST5_NAME, ++ "Thread exited after only %d.%d seconds, it " ++ "did not hit the %d second timeout\n", ++ (int)whole_delta, remain_delta, 1); ++ rc = -ETIMEDOUT; ++ } ++ ++ cv_destroy(&condvar); ++ mutex_destroy(&mtx); ++ ++ return rc; ++} ++ ++splat_subsystem_t * ++splat_condvar_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_CONDVAR_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_CONDVAR_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_CONDVAR; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_CONDVAR_TEST1_NAME, SPLAT_CONDVAR_TEST1_DESC, ++ SPLAT_CONDVAR_TEST1_ID, splat_condvar_test1); ++ SPLAT_TEST_INIT(sub, SPLAT_CONDVAR_TEST2_NAME, SPLAT_CONDVAR_TEST2_DESC, ++ SPLAT_CONDVAR_TEST2_ID, splat_condvar_test2); ++ SPLAT_TEST_INIT(sub, SPLAT_CONDVAR_TEST3_NAME, SPLAT_CONDVAR_TEST3_DESC, ++ SPLAT_CONDVAR_TEST3_ID, splat_condvar_test3); ++ SPLAT_TEST_INIT(sub, SPLAT_CONDVAR_TEST4_NAME, SPLAT_CONDVAR_TEST4_DESC, ++ SPLAT_CONDVAR_TEST4_ID, splat_condvar_test4); ++ SPLAT_TEST_INIT(sub, SPLAT_CONDVAR_TEST5_NAME, SPLAT_CONDVAR_TEST5_DESC, ++ SPLAT_CONDVAR_TEST5_ID, splat_condvar_test5); ++ ++ return sub; ++} ++ ++void ++splat_condvar_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ SPLAT_TEST_FINI(sub, SPLAT_CONDVAR_TEST5_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_CONDVAR_TEST4_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_CONDVAR_TEST3_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_CONDVAR_TEST2_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_CONDVAR_TEST1_ID); ++ ++ kfree(sub); ++} ++ ++int ++splat_condvar_id(void) { ++ return SPLAT_SUBSYSTEM_CONDVAR; ++} +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-cred.c linux-3.2.33-go/spl/splat/splat-cred.c +--- linux-3.2.33-go.orig/spl/splat/splat-cred.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-cred.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,250 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Credential Tests. ++\*****************************************************************************/ ++ ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_CRED_NAME "cred" ++#define SPLAT_CRED_DESC "Kernel Cred Tests" ++ ++#define SPLAT_CRED_TEST1_ID 0x0e01 ++#define SPLAT_CRED_TEST1_NAME "cred" ++#define SPLAT_CRED_TEST1_DESC "Task Credential Test" ++ ++#define SPLAT_CRED_TEST2_ID 0x0e02 ++#define SPLAT_CRED_TEST2_NAME "kcred" ++#define SPLAT_CRED_TEST2_DESC "Kernel Credential Test" ++ ++#define SPLAT_CRED_TEST3_ID 0x0e03 ++#define SPLAT_CRED_TEST3_NAME "groupmember" ++#define SPLAT_CRED_TEST3_DESC "Group Member Test" ++ ++#define GROUP_STR_SIZE 128 ++#define GROUP_STR_REDZONE 16 ++ ++static int ++splat_cred_test1(struct file *file, void *arg) ++{ ++ char str[GROUP_STR_SIZE]; ++ uid_t uid, ruid, suid; ++ gid_t gid, rgid, sgid, *groups; ++ int ngroups, i, count = 0; ++ ++ uid = crgetuid(CRED()); ++ ruid = crgetruid(CRED()); ++ suid = crgetsuid(CRED()); ++ ++ gid = crgetgid(CRED()); ++ rgid = crgetrgid(CRED()); ++ sgid = crgetsgid(CRED()); ++ ++ crhold(CRED()); ++ ngroups = crgetngroups(CRED()); ++ groups = crgetgroups(CRED()); ++ ++ memset(str, 0, GROUP_STR_SIZE); ++ for (i = 0; i < ngroups; i++) { ++ count += sprintf(str + count, "%d ", groups[i]); ++ ++ if (count > (GROUP_STR_SIZE - GROUP_STR_REDZONE)) { ++ splat_vprint(file, SPLAT_CRED_TEST1_NAME, ++ "Failed too many group entries for temp " ++ "buffer: %d, %s\n", ngroups, str); ++ return -ENOSPC; ++ } ++ } ++ ++ crfree(CRED()); ++ ++ splat_vprint(file, SPLAT_CRED_TEST1_NAME, ++ "uid: %d ruid: %d suid: %d " ++ "gid: %d rgid: %d sgid: %d\n", ++ uid, ruid, suid, gid, rgid, sgid); ++ splat_vprint(file, SPLAT_CRED_TEST1_NAME, ++ "ngroups: %d groups: %s\n", ngroups, str); ++ ++ if (uid || ruid || suid || gid || rgid || sgid) { ++ splat_vprint(file, SPLAT_CRED_TEST1_NAME, ++ "Failed expected all uids+gids to be %d\n", 0); ++ return -EIDRM; ++ } ++ ++ if (ngroups > NGROUPS_MAX) { ++ splat_vprint(file, SPLAT_CRED_TEST1_NAME, ++ "Failed ngroups must not exceed NGROUPS_MAX: " ++ "%d > %d\n", ngroups, NGROUPS_MAX); ++ return -EIDRM; ++ } ++ ++ splat_vprint(file, SPLAT_CRED_TEST1_NAME, ++ "Success sane CRED(): %d\n", 0); ++ ++ return 0; ++} /* splat_cred_test1() */ ++ ++static int ++splat_cred_test2(struct file *file, void *arg) ++{ ++ char str[GROUP_STR_SIZE]; ++ uid_t uid, ruid, suid; ++ gid_t gid, rgid, sgid, *groups; ++ int ngroups, i, count = 0; ++ ++ uid = crgetuid(kcred); ++ ruid = crgetruid(kcred); ++ suid = crgetsuid(kcred); ++ ++ gid = crgetgid(kcred); ++ rgid = crgetrgid(kcred); ++ sgid = crgetsgid(kcred); ++ ++ crhold(kcred); ++ ngroups = crgetngroups(kcred); ++ groups = crgetgroups(kcred); ++ ++ memset(str, 0, GROUP_STR_SIZE); ++ for (i = 0; i < ngroups; i++) { ++ count += sprintf(str + 
count, "%d ", groups[i]); ++ ++ if (count > (GROUP_STR_SIZE - GROUP_STR_REDZONE)) { ++ splat_vprint(file, SPLAT_CRED_TEST2_NAME, ++ "Failed too many group entries for temp " ++ "buffer: %d, %s\n", ngroups, str); ++ return -ENOSPC; ++ } ++ } ++ ++ crfree(kcred); ++ ++ splat_vprint(file, SPLAT_CRED_TEST2_NAME, ++ "uid: %d ruid: %d suid: %d " ++ "gid: %d rgid: %d sgid: %d\n", ++ uid, ruid, suid, gid, rgid, sgid); ++ splat_vprint(file, SPLAT_CRED_TEST2_NAME, ++ "ngroups: %d groups: %s\n", ngroups, str); ++ ++ if (uid || ruid || suid || gid || rgid || sgid) { ++ splat_vprint(file, SPLAT_CRED_TEST2_NAME, ++ "Failed expected all uids+gids to be %d\n", 0); ++ return -EIDRM; ++ } ++ ++ if (ngroups > NGROUPS_MAX) { ++ splat_vprint(file, SPLAT_CRED_TEST2_NAME, ++ "Failed ngroups must not exceed NGROUPS_MAX: " ++ "%d > %d\n", ngroups, NGROUPS_MAX); ++ return -EIDRM; ++ } ++ ++ splat_vprint(file, SPLAT_CRED_TEST2_NAME, ++ "Success sane kcred: %d\n", 0); ++ ++ return 0; ++} /* splat_cred_test2() */ ++ ++/* ++ * On most/all systems it can be expected that a task with root ++ * permissions also is a member of the root group, Since the ++ * test suite is always run as root we check first that CRED() is ++ * a member of the root group, and secondly that it is not a member ++ * of our fake group. This test will break is someone happens to ++ * create group number NGROUPS_MAX-1 and then added root to it. ++ */ ++static int ++splat_cred_test3(struct file *file, void *arg) ++{ ++ gid_t root_gid, fake_gid; ++ int rc; ++ ++ root_gid = 0; ++ fake_gid = NGROUPS_MAX-1; ++ ++ rc = groupmember(root_gid, CRED()); ++ if (!rc) { ++ splat_vprint(file, SPLAT_CRED_TEST3_NAME, ++ "Failed root git %d expected to be member " ++ "of CRED() groups: %d\n", root_gid, rc); ++ return -EIDRM; ++ } ++ ++ rc = groupmember(fake_gid, CRED()); ++ if (rc) { ++ splat_vprint(file, SPLAT_CRED_TEST3_NAME, ++ "Failed fake git %d expected not to be member " ++ "of CRED() groups: %d\n", fake_gid, rc); ++ return -EIDRM; ++ } ++ ++ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Success root gid " ++ "is a member of the expected groups: %d\n", rc); ++ ++ return rc; ++} /* splat_cred_test3() */ ++ ++splat_subsystem_t * ++splat_cred_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_CRED_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_CRED_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_CRED; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_CRED_TEST1_NAME, SPLAT_CRED_TEST1_DESC, ++ SPLAT_CRED_TEST1_ID, splat_cred_test1); ++ SPLAT_TEST_INIT(sub, SPLAT_CRED_TEST2_NAME, SPLAT_CRED_TEST2_DESC, ++ SPLAT_CRED_TEST2_ID, splat_cred_test2); ++ SPLAT_TEST_INIT(sub, SPLAT_CRED_TEST3_NAME, SPLAT_CRED_TEST3_DESC, ++ SPLAT_CRED_TEST3_ID, splat_cred_test3); ++ ++ return sub; ++} /* splat_cred_init() */ ++ ++void ++splat_cred_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ ++ SPLAT_TEST_FINI(sub, SPLAT_CRED_TEST3_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_CRED_TEST2_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_CRED_TEST1_ID); ++ ++ kfree(sub); ++} /* splat_cred_fini() */ ++ ++int ++splat_cred_id(void) ++{ ++ return SPLAT_SUBSYSTEM_CRED; ++} /* splat_cred_id() */ +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-ctl.c linux-3.2.33-go/spl/splat/splat-ctl.c +--- linux-3.2.33-go.orig/spl/splat/splat-ctl.c 1970-01-01 01:00:00.000000000 
+0100 ++++ linux-3.2.33-go/spl/splat/splat-ctl.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,723 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Test Control Interface. ++ * ++ * The 'splat' (Solaris Porting LAyer Tests) module is designed as a ++ * framework which runs various in kernel regression tests to validate ++ * the SPL primitives honor the Solaris ABI. ++ * ++ * The splat module is constructed of various splat_* source files each ++ * of which contain regression tests for a particular subsystem. For ++ * example, the splat_kmem.c file contains all the tests for validating ++ * the kmem interfaces have been implemented correctly. When the splat ++ * module is loaded splat_*_init() will be called for each subsystems ++ * tests. It is the responsibility of splat_*_init() to register all ++ * the tests for this subsystem using the SPLAT_TEST_INIT() macro. ++ * Similarly splat_*_fini() is called when the splat module is removed ++ * and is responsible for unregistering its tests via the SPLAT_TEST_FINI ++ * macro. Once a test is registered it can then be run with an ioctl() ++ * call which specifies the subsystem and test to be run. The provided ++ * splat command line tool can be used to display all available ++ * subsystems and tests. It can also be used to run the full suite ++ * of regression tests or particular tests. 
++\*****************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "splat-internal.h" ++ ++static spl_class *splat_class; ++static spl_device *splat_device; ++static struct list_head splat_module_list; ++static spinlock_t splat_module_lock; ++ ++static int ++splat_open(struct inode *inode, struct file *file) ++{ ++ unsigned int minor = iminor(inode); ++ splat_info_t *info; ++ ++ if (minor >= SPLAT_MINORS) ++ return -ENXIO; ++ ++ info = (splat_info_t *)kmalloc(sizeof(*info), GFP_KERNEL); ++ if (info == NULL) ++ return -ENOMEM; ++ ++ mutex_init(&info->info_lock); ++ info->info_size = SPLAT_INFO_BUFFER_SIZE; ++ info->info_buffer = (char *)vmalloc(SPLAT_INFO_BUFFER_SIZE); ++ if (info->info_buffer == NULL) { ++ kfree(info); ++ return -ENOMEM; ++ } ++ memset(info->info_buffer, 0, info->info_size); ++ ++ info->info_head = info->info_buffer; ++ file->private_data = (void *)info; ++ ++ splat_print(file, "%s\n", spl_version); ++ ++ return 0; ++} ++ ++static int ++splat_release(struct inode *inode, struct file *file) ++{ ++ unsigned int minor = iminor(inode); ++ splat_info_t *info = (splat_info_t *)file->private_data; ++ ++ if (minor >= SPLAT_MINORS) ++ return -ENXIO; ++ ++ ASSERT(info); ++ ASSERT(info->info_buffer); ++ ++ mutex_destroy(&info->info_lock); ++ vfree(info->info_buffer); ++ kfree(info); ++ ++ return 0; ++} ++ ++static int ++splat_buffer_clear(struct file *file, splat_cfg_t *kcfg, unsigned long arg) ++{ ++ splat_info_t *info = (splat_info_t *)file->private_data; ++ ++ ASSERT(info); ++ ASSERT(info->info_buffer); ++ ++ mutex_lock(&info->info_lock); ++ memset(info->info_buffer, 0, info->info_size); ++ info->info_head = info->info_buffer; ++ mutex_unlock(&info->info_lock); ++ ++ return 0; ++} ++ ++static int ++splat_buffer_size(struct file *file, splat_cfg_t *kcfg, unsigned long arg) ++{ ++ splat_info_t *info = (splat_info_t *)file->private_data; ++ char *buf; ++ int min, size, rc = 0; ++ ++ ASSERT(info); ++ ASSERT(info->info_buffer); ++ ++ mutex_lock(&info->info_lock); ++ if (kcfg->cfg_arg1 > 0) { ++ ++ size = kcfg->cfg_arg1; ++ buf = (char *)vmalloc(size); ++ if (buf == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ /* Zero fill and truncate contents when coping buffer */ ++ min = ((size < info->info_size) ? 
size : info->info_size); ++ memset(buf, 0, size); ++ memcpy(buf, info->info_buffer, min); ++ vfree(info->info_buffer); ++ info->info_size = size; ++ info->info_buffer = buf; ++ info->info_head = info->info_buffer; ++ } ++ ++ kcfg->cfg_rc1 = info->info_size; ++ ++ if (copy_to_user((struct splat_cfg_t __user *)arg, kcfg, sizeof(*kcfg))) ++ rc = -EFAULT; ++out: ++ mutex_unlock(&info->info_lock); ++ ++ return rc; ++} ++ ++ ++static splat_subsystem_t * ++splat_subsystem_find(int id) { ++ splat_subsystem_t *sub; ++ ++ spin_lock(&splat_module_lock); ++ list_for_each_entry(sub, &splat_module_list, subsystem_list) { ++ if (id == sub->desc.id) { ++ spin_unlock(&splat_module_lock); ++ return sub; ++ } ++ } ++ spin_unlock(&splat_module_lock); ++ ++ return NULL; ++} ++ ++static int ++splat_subsystem_count(splat_cfg_t *kcfg, unsigned long arg) ++{ ++ splat_subsystem_t *sub; ++ int i = 0; ++ ++ spin_lock(&splat_module_lock); ++ list_for_each_entry(sub, &splat_module_list, subsystem_list) ++ i++; ++ ++ spin_unlock(&splat_module_lock); ++ kcfg->cfg_rc1 = i; ++ ++ if (copy_to_user((struct splat_cfg_t __user *)arg, kcfg, sizeof(*kcfg))) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int ++splat_subsystem_list(splat_cfg_t *kcfg, unsigned long arg) ++{ ++ splat_subsystem_t *sub; ++ splat_cfg_t *tmp; ++ int size, i = 0; ++ ++ /* Structure will be sized large enough for N subsystem entries ++ * which is passed in by the caller. On exit the number of ++ * entries filled in with valid subsystems will be stored in ++ * cfg_rc1. If the caller does not provide enough entries ++ * for all subsystems we will truncate the list to avoid overrun. ++ */ ++ size = sizeof(*tmp) + kcfg->cfg_data.splat_subsystems.size * ++ sizeof(splat_user_t); ++ tmp = kmalloc(size, GFP_KERNEL); ++ if (tmp == NULL) ++ return -ENOMEM; ++ ++ /* Local 'tmp' is used as the structure copied back to user space */ ++ memset(tmp, 0, size); ++ memcpy(tmp, kcfg, sizeof(*kcfg)); ++ ++ spin_lock(&splat_module_lock); ++ list_for_each_entry(sub, &splat_module_list, subsystem_list) { ++ strncpy(tmp->cfg_data.splat_subsystems.descs[i].name, ++ sub->desc.name, SPLAT_NAME_SIZE); ++ strncpy(tmp->cfg_data.splat_subsystems.descs[i].desc, ++ sub->desc.desc, SPLAT_DESC_SIZE); ++ tmp->cfg_data.splat_subsystems.descs[i].id = sub->desc.id; ++ ++ /* Truncate list if we are about to overrun alloc'ed memory */ ++ if ((i++) == kcfg->cfg_data.splat_subsystems.size) ++ break; ++ } ++ spin_unlock(&splat_module_lock); ++ tmp->cfg_rc1 = i; ++ ++ if (copy_to_user((struct splat_cfg_t __user *)arg, tmp, size)) { ++ kfree(tmp); ++ return -EFAULT; ++ } ++ ++ kfree(tmp); ++ return 0; ++} ++ ++static int ++splat_test_count(splat_cfg_t *kcfg, unsigned long arg) ++{ ++ splat_subsystem_t *sub; ++ splat_test_t *test; ++ int i = 0; ++ ++ /* Subsystem ID passed as arg1 */ ++ sub = splat_subsystem_find(kcfg->cfg_arg1); ++ if (sub == NULL) ++ return -EINVAL; ++ ++ spin_lock(&(sub->test_lock)); ++ list_for_each_entry(test, &(sub->test_list), test_list) ++ i++; ++ ++ spin_unlock(&(sub->test_lock)); ++ kcfg->cfg_rc1 = i; ++ ++ if (copy_to_user((struct splat_cfg_t __user *)arg, kcfg, sizeof(*kcfg))) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int ++splat_test_list(splat_cfg_t *kcfg, unsigned long arg) ++{ ++ splat_subsystem_t *sub; ++ splat_test_t *test; ++ splat_cfg_t *tmp; ++ int size, i = 0; ++ ++ /* Subsystem ID passed as arg1 */ ++ sub = splat_subsystem_find(kcfg->cfg_arg1); ++ if (sub == NULL) ++ return -EINVAL; ++ ++ /* Structure will be sized large enough for N test entries ++ 
* which is passed in by the caller. On exit the number of ++ * entries filled in with valid tests will be stored in ++ * cfg_rc1. If the caller does not provide enough entries ++ * for all tests we will truncate the list to avoid overrun. ++ */ ++ size = sizeof(*tmp)+kcfg->cfg_data.splat_tests.size*sizeof(splat_user_t); ++ tmp = kmalloc(size, GFP_KERNEL); ++ if (tmp == NULL) ++ return -ENOMEM; ++ ++ /* Local 'tmp' is used as the structure copied back to user space */ ++ memset(tmp, 0, size); ++ memcpy(tmp, kcfg, sizeof(*kcfg)); ++ ++ spin_lock(&(sub->test_lock)); ++ list_for_each_entry(test, &(sub->test_list), test_list) { ++ strncpy(tmp->cfg_data.splat_tests.descs[i].name, ++ test->desc.name, SPLAT_NAME_SIZE); ++ strncpy(tmp->cfg_data.splat_tests.descs[i].desc, ++ test->desc.desc, SPLAT_DESC_SIZE); ++ tmp->cfg_data.splat_tests.descs[i].id = test->desc.id; ++ ++ /* Truncate list if we are about to overrun alloc'ed memory */ ++ if ((i++) == kcfg->cfg_data.splat_tests.size) ++ break; ++ } ++ spin_unlock(&(sub->test_lock)); ++ tmp->cfg_rc1 = i; ++ ++ if (copy_to_user((struct splat_cfg_t __user *)arg, tmp, size)) { ++ kfree(tmp); ++ return -EFAULT; ++ } ++ ++ kfree(tmp); ++ return 0; ++} ++ ++static int ++splat_validate(struct file *file, splat_subsystem_t *sub, int cmd, void *arg) ++{ ++ splat_test_t *test; ++ ++ spin_lock(&(sub->test_lock)); ++ list_for_each_entry(test, &(sub->test_list), test_list) { ++ if (test->desc.id == cmd) { ++ spin_unlock(&(sub->test_lock)); ++ return test->test(file, arg); ++ } ++ } ++ spin_unlock(&(sub->test_lock)); ++ ++ return -EINVAL; ++} ++ ++static int ++splat_ioctl_cfg(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ splat_cfg_t kcfg; ++ int rc = 0; ++ ++ /* User and kernel space agree about arg size */ ++ if (_IOC_SIZE(cmd) != sizeof(kcfg)) ++ return -EBADMSG; ++ ++ if (copy_from_user(&kcfg, (splat_cfg_t *)arg, sizeof(kcfg))) ++ return -EFAULT; ++ ++ if (kcfg.cfg_magic != SPLAT_CFG_MAGIC) { ++ splat_print(file, "Bad config magic 0x%x != 0x%x\n", ++ kcfg.cfg_magic, SPLAT_CFG_MAGIC); ++ return -EINVAL; ++ } ++ ++ switch (kcfg.cfg_cmd) { ++ case SPLAT_CFG_BUFFER_CLEAR: ++ /* cfg_arg1 - Unused ++ * cfg_rc1 - Unused ++ */ ++ rc = splat_buffer_clear(file, &kcfg, arg); ++ break; ++ case SPLAT_CFG_BUFFER_SIZE: ++ /* cfg_arg1 - 0 - query size; >0 resize ++ * cfg_rc1 - Set to current buffer size ++ */ ++ rc = splat_buffer_size(file, &kcfg, arg); ++ break; ++ case SPLAT_CFG_SUBSYSTEM_COUNT: ++ /* cfg_arg1 - Unused ++ * cfg_rc1 - Set to number of subsystems ++ */ ++ rc = splat_subsystem_count(&kcfg, arg); ++ break; ++ case SPLAT_CFG_SUBSYSTEM_LIST: ++ /* cfg_arg1 - Unused ++ * cfg_rc1 - Set to number of subsystems ++ * cfg_data.splat_subsystems - Set with subsystems ++ */ ++ rc = splat_subsystem_list(&kcfg, arg); ++ break; ++ case SPLAT_CFG_TEST_COUNT: ++ /* cfg_arg1 - Set to a target subsystem ++ * cfg_rc1 - Set to number of tests ++ */ ++ rc = splat_test_count(&kcfg, arg); ++ break; ++ case SPLAT_CFG_TEST_LIST: ++ /* cfg_arg1 - Set to a target subsystem ++ * cfg_rc1 - Set to number of tests ++ * cfg_data.splat_subsystems - Populated with tests ++ */ ++ rc = splat_test_list(&kcfg, arg); ++ break; ++ default: ++ splat_print(file, "Bad config command %d\n", ++ kcfg.cfg_cmd); ++ rc = -EINVAL; ++ break; ++ } ++ ++ return rc; ++} ++ ++static int ++splat_ioctl_cmd(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ splat_subsystem_t *sub; ++ splat_cmd_t kcmd; ++ int rc = -EINVAL; ++ void *data = NULL; ++ ++ /* User and kernel space agree about arg 
size */ ++ if (_IOC_SIZE(cmd) != sizeof(kcmd)) ++ return -EBADMSG; ++ ++ if (copy_from_user(&kcmd, (splat_cfg_t *)arg, sizeof(kcmd))) ++ return -EFAULT; ++ ++ if (kcmd.cmd_magic != SPLAT_CMD_MAGIC) { ++ splat_print(file, "Bad command magic 0x%x != 0x%x\n", ++ kcmd.cmd_magic, SPLAT_CMD_MAGIC); ++ return -EINVAL; ++ } ++ ++ /* Allocate memory for any opaque data the caller needed to pass on */ ++ if (kcmd.cmd_data_size > 0) { ++ data = (void *)kmalloc(kcmd.cmd_data_size, GFP_KERNEL); ++ if (data == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(data, (void *)(arg + offsetof(splat_cmd_t, ++ cmd_data_str)), kcmd.cmd_data_size)) { ++ kfree(data); ++ return -EFAULT; ++ } ++ } ++ ++ sub = splat_subsystem_find(kcmd.cmd_subsystem); ++ if (sub != NULL) ++ rc = splat_validate(file, sub, kcmd.cmd_test, data); ++ else ++ rc = -EINVAL; ++ ++ if (data != NULL) ++ kfree(data); ++ ++ return rc; ++} ++ ++static long ++splat_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ unsigned int minor = iminor(file->f_dentry->d_inode); ++ int rc = 0; ++ ++ /* Ignore tty ioctls */ ++ if ((cmd & 0xffffff00) == ((int)'T') << 8) ++ return -ENOTTY; ++ ++ if (minor >= SPLAT_MINORS) ++ return -ENXIO; ++ ++ switch (cmd) { ++ case SPLAT_CFG: ++ rc = splat_ioctl_cfg(file, cmd, arg); ++ break; ++ case SPLAT_CMD: ++ rc = splat_ioctl_cmd(file, cmd, arg); ++ break; ++ default: ++ splat_print(file, "Bad ioctl command %d\n", cmd); ++ rc = -EINVAL; ++ break; ++ } ++ ++ return rc; ++} ++ ++#ifdef CONFIG_COMPAT ++/* Compatibility handler for ioctls from 32-bit ELF binaries */ ++static long ++splat_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ return splat_unlocked_ioctl(file, cmd, arg); ++} ++#endif /* CONFIG_COMPAT */ ++ ++/* I'm not sure why you would want to write into this buffer from ++ * user space since its principal use is to pass test status info ++ * back to the user space, but I don't see any reason to prevent it. 
++ */ ++static ssize_t splat_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ unsigned int minor = iminor(file->f_dentry->d_inode); ++ splat_info_t *info = (splat_info_t *)file->private_data; ++ int rc = 0; ++ ++ if (minor >= SPLAT_MINORS) ++ return -ENXIO; ++ ++ ASSERT(info); ++ ASSERT(info->info_buffer); ++ ++ mutex_lock(&info->info_lock); ++ ++ /* Write beyond EOF */ ++ if (*ppos >= info->info_size) { ++ rc = -EFBIG; ++ goto out; ++ } ++ ++ /* Resize count if beyond EOF */ ++ if (*ppos + count > info->info_size) ++ count = info->info_size - *ppos; ++ ++ if (copy_from_user(info->info_buffer, buf, count)) { ++ rc = -EFAULT; ++ goto out; ++ } ++ ++ *ppos += count; ++ rc = count; ++out: ++ mutex_unlock(&info->info_lock); ++ return rc; ++} ++ ++static ssize_t splat_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ unsigned int minor = iminor(file->f_dentry->d_inode); ++ splat_info_t *info = (splat_info_t *)file->private_data; ++ int rc = 0; ++ ++ if (minor >= SPLAT_MINORS) ++ return -ENXIO; ++ ++ ASSERT(info); ++ ASSERT(info->info_buffer); ++ ++ mutex_lock(&info->info_lock); ++ ++ /* Read beyond EOF */ ++ if (*ppos >= info->info_size) ++ goto out; ++ ++ /* Resize count if beyond EOF */ ++ if (*ppos + count > info->info_size) ++ count = info->info_size - *ppos; ++ ++ if (copy_to_user(buf, info->info_buffer + *ppos, count)) { ++ rc = -EFAULT; ++ goto out; ++ } ++ ++ *ppos += count; ++ rc = count; ++out: ++ mutex_unlock(&info->info_lock); ++ return rc; ++} ++ ++static loff_t splat_seek(struct file *file, loff_t offset, int origin) ++{ ++ unsigned int minor = iminor(file->f_dentry->d_inode); ++ splat_info_t *info = (splat_info_t *)file->private_data; ++ int rc = -EINVAL; ++ ++ if (minor >= SPLAT_MINORS) ++ return -ENXIO; ++ ++ ASSERT(info); ++ ASSERT(info->info_buffer); ++ ++ mutex_lock(&info->info_lock); ++ ++ switch (origin) { ++ case 0: /* SEEK_SET - No-op just do it */ ++ break; ++ case 1: /* SEEK_CUR - Seek from current */ ++ offset = file->f_pos + offset; ++ break; ++ case 2: /* SEEK_END - Seek from end */ ++ offset = info->info_size + offset; ++ break; ++ } ++ ++ if (offset >= 0) { ++ file->f_pos = offset; ++ file->f_version = 0; ++ rc = offset; ++ } ++ ++ mutex_unlock(&info->info_lock); ++ ++ return rc; ++} ++ ++static struct cdev splat_cdev; ++static struct file_operations splat_fops = { ++ .owner = THIS_MODULE, ++ .open = splat_open, ++ .release = splat_release, ++ .unlocked_ioctl = splat_unlocked_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = splat_compat_ioctl, ++#endif ++ .read = splat_read, ++ .write = splat_write, ++ .llseek = splat_seek, ++}; ++ ++static int ++splat_init(void) ++{ ++ dev_t dev; ++ int rc; ++ ++ spin_lock_init(&splat_module_lock); ++ INIT_LIST_HEAD(&splat_module_list); ++ ++ SPLAT_SUBSYSTEM_INIT(kmem); ++ SPLAT_SUBSYSTEM_INIT(taskq); ++ SPLAT_SUBSYSTEM_INIT(krng); ++ SPLAT_SUBSYSTEM_INIT(mutex); ++ SPLAT_SUBSYSTEM_INIT(condvar); ++ SPLAT_SUBSYSTEM_INIT(thread); ++ SPLAT_SUBSYSTEM_INIT(rwlock); ++ SPLAT_SUBSYSTEM_INIT(time); ++ SPLAT_SUBSYSTEM_INIT(vnode); ++ SPLAT_SUBSYSTEM_INIT(kobj); ++ SPLAT_SUBSYSTEM_INIT(atomic); ++ SPLAT_SUBSYSTEM_INIT(list); ++ SPLAT_SUBSYSTEM_INIT(generic); ++ SPLAT_SUBSYSTEM_INIT(cred); ++ SPLAT_SUBSYSTEM_INIT(zlib); ++ SPLAT_SUBSYSTEM_INIT(linux); ++ ++ dev = MKDEV(SPLAT_MAJOR, 0); ++ if ((rc = register_chrdev_region(dev, SPLAT_MINORS, SPLAT_NAME))) ++ goto error; ++ ++ /* Support for registering a character driver */ ++ cdev_init(&splat_cdev, &splat_fops); ++ 
splat_cdev.owner = THIS_MODULE; ++ kobject_set_name(&splat_cdev.kobj, SPLAT_NAME); ++ if ((rc = cdev_add(&splat_cdev, dev, SPLAT_MINORS))) { ++ printk(KERN_ERR "SPLAT: Error adding cdev, %d\n", rc); ++ kobject_put(&splat_cdev.kobj); ++ unregister_chrdev_region(dev, SPLAT_MINORS); ++ goto error; ++ } ++ ++ /* Support for udev make driver info available in sysfs */ ++ splat_class = spl_class_create(THIS_MODULE, "splat"); ++ if (IS_ERR(splat_class)) { ++ rc = PTR_ERR(splat_class); ++ printk(KERN_ERR "SPLAT: Error creating splat class, %d\n", rc); ++ cdev_del(&splat_cdev); ++ unregister_chrdev_region(dev, SPLAT_MINORS); ++ goto error; ++ } ++ ++ splat_device = spl_device_create(splat_class, NULL, ++ MKDEV(SPLAT_MAJOR, 0), ++ NULL, SPLAT_NAME); ++ ++ printk(KERN_INFO "SPLAT: Loaded module v%s-%s%s\n", ++ SPL_META_VERSION, SPL_META_RELEASE, SPL_DEBUG_STR); ++ return 0; ++error: ++ printk(KERN_ERR "SPLAT: Error registering splat device, %d\n", rc); ++ return rc; ++} ++ ++static int ++splat_fini(void) ++{ ++ dev_t dev = MKDEV(SPLAT_MAJOR, 0); ++ ++ spl_device_destroy(splat_class, splat_device, dev); ++ spl_class_destroy(splat_class); ++ cdev_del(&splat_cdev); ++ unregister_chrdev_region(dev, SPLAT_MINORS); ++ ++ SPLAT_SUBSYSTEM_FINI(linux); ++ SPLAT_SUBSYSTEM_FINI(zlib); ++ SPLAT_SUBSYSTEM_FINI(cred); ++ SPLAT_SUBSYSTEM_FINI(generic); ++ SPLAT_SUBSYSTEM_FINI(list); ++ SPLAT_SUBSYSTEM_FINI(atomic); ++ SPLAT_SUBSYSTEM_FINI(kobj); ++ SPLAT_SUBSYSTEM_FINI(vnode); ++ SPLAT_SUBSYSTEM_FINI(time); ++ SPLAT_SUBSYSTEM_FINI(rwlock); ++ SPLAT_SUBSYSTEM_FINI(thread); ++ SPLAT_SUBSYSTEM_FINI(condvar); ++ SPLAT_SUBSYSTEM_FINI(mutex); ++ SPLAT_SUBSYSTEM_FINI(krng); ++ SPLAT_SUBSYSTEM_FINI(taskq); ++ SPLAT_SUBSYSTEM_FINI(kmem); ++ ++ ASSERT(list_empty(&splat_module_list)); ++ printk(KERN_INFO "SPLAT: Unloaded module v%s-%s%s\n", ++ SPL_META_VERSION, SPL_META_RELEASE, SPL_DEBUG_STR); ++ ++ return 0; ++} ++ ++spl_module_init(splat_init); ++spl_module_exit(splat_fini); ++ ++MODULE_AUTHOR("Lawrence Livermore National Labs"); ++MODULE_DESCRIPTION("Solaris Porting LAyer Tests"); ++MODULE_LICENSE("GPL"); +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-generic.c linux-3.2.33-go/spl/splat/splat-generic.c +--- linux-3.2.33-go.orig/spl/splat/splat-generic.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-generic.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,366 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . 
++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Generic Tests. ++\*****************************************************************************/ ++ ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_GENERIC_NAME "generic" ++#define SPLAT_GENERIC_DESC "Kernel Generic Tests" ++ ++#define SPLAT_GENERIC_TEST1_ID 0x0d01 ++#define SPLAT_GENERIC_TEST1_NAME "ddi_strtoul" ++#define SPLAT_GENERIC_TEST1_DESC "ddi_strtoul Test" ++ ++#define SPLAT_GENERIC_TEST2_ID 0x0d02 ++#define SPLAT_GENERIC_TEST2_NAME "ddi_strtol" ++#define SPLAT_GENERIC_TEST2_DESC "ddi_strtol Test" ++ ++#define SPLAT_GENERIC_TEST3_ID 0x0d03 ++#define SPLAT_GENERIC_TEST3_NAME "ddi_strtoull" ++#define SPLAT_GENERIC_TEST3_DESC "ddi_strtoull Test" ++ ++#define SPLAT_GENERIC_TEST4_ID 0x0d04 ++#define SPLAT_GENERIC_TEST4_NAME "ddi_strtoll" ++#define SPLAT_GENERIC_TEST4_DESC "ddi_strtoll Test" ++ ++# define SPLAT_GENERIC_TEST5_ID 0x0d05 ++# define SPLAT_GENERIC_TEST5_NAME "udivdi3" ++# define SPLAT_GENERIC_TEST5_DESC "Unsigned Div-64 Test" ++ ++# define SPLAT_GENERIC_TEST6_ID 0x0d06 ++# define SPLAT_GENERIC_TEST6_NAME "divdi3" ++# define SPLAT_GENERIC_TEST6_DESC "Signed Div-64 Test" ++ ++#define STR_POS "123456789" ++#define STR_NEG "-123456789" ++#define STR_BASE "0xabcdef" ++#define STR_RANGE_MAX "10000000000000000" ++#define STR_RANGE_MIN "-10000000000000000" ++#define STR_INVAL1 "12345U" ++#define STR_INVAL2 "invald" ++ ++#define VAL_POS 123456789 ++#define VAL_NEG -123456789 ++#define VAL_BASE 0xabcdef ++#define VAL_INVAL1 12345U ++ ++#define define_generic_msg_strtox(type, valtype) \ ++static void \ ++generic_msg_strto##type(struct file *file, char *msg, int rc, int *err, \ ++ const char *s, valtype d, char *endptr) \ ++{ \ ++ splat_vprint(file, SPLAT_GENERIC_TEST1_NAME, \ ++ "%s (%d) %s: %s == %lld, 0x%p\n", \ ++ rc ? 
"Fail" : "Pass", *err, msg, s, \ ++ (unsigned long long)d, endptr); \ ++ *err = rc; \ ++} ++ ++define_generic_msg_strtox(ul, unsigned long); ++define_generic_msg_strtox(l, long); ++define_generic_msg_strtox(ull, unsigned long long); ++define_generic_msg_strtox(ll, long long); ++ ++#define define_splat_generic_test_strtox(type, valtype) \ ++static int \ ++splat_generic_test_strto##type(struct file *file, void *arg) \ ++{ \ ++ int rc, rc1, rc2, rc3, rc4, rc5, rc6, rc7; \ ++ char str[20], *endptr; \ ++ valtype r; \ ++ \ ++ /* Positive value: expect success */ \ ++ r = 0; \ ++ rc = 1; \ ++ endptr = NULL; \ ++ rc1 = ddi_strto##type(STR_POS, &endptr, 10, &r); \ ++ if (rc1 == 0 && r == VAL_POS && endptr && *endptr == '\0') \ ++ rc = 0; \ ++ \ ++ generic_msg_strto##type(file, "positive", rc , &rc1, \ ++ STR_POS, r, endptr); \ ++ \ ++ /* Negative value: expect success */ \ ++ r = 0; \ ++ rc = 1; \ ++ endptr = NULL; \ ++ strcpy(str, STR_NEG); \ ++ rc2 = ddi_strto##type(str, &endptr, 10, &r); \ ++ if (#type[0] == 'u') { \ ++ if (rc2 == 0 && r == 0 && endptr == str) \ ++ rc = 0; \ ++ } else { \ ++ if (rc2 == 0 && r == VAL_NEG && \ ++ endptr && *endptr == '\0') \ ++ rc = 0; \ ++ } \ ++ \ ++ generic_msg_strto##type(file, "negative", rc, &rc2, \ ++ STR_NEG, r, endptr); \ ++ \ ++ /* Non decimal base: expect sucess */ \ ++ r = 0; \ ++ rc = 1; \ ++ endptr = NULL; \ ++ rc3 = ddi_strto##type(STR_BASE, &endptr, 0, &r); \ ++ if (rc3 == 0 && r == VAL_BASE && endptr && *endptr == '\0') \ ++ rc = 0; \ ++ \ ++ generic_msg_strto##type(file, "base", rc, &rc3, \ ++ STR_BASE, r, endptr); \ ++ \ ++ /* Max out of range: failure expected, r unchanged */ \ ++ r = 0; \ ++ rc = 1; \ ++ endptr = NULL; \ ++ rc4 = ddi_strto##type(STR_RANGE_MAX, &endptr, 16, &r); \ ++ if (rc4 == ERANGE && r == 0 && endptr == NULL) \ ++ rc = 0; \ ++ \ ++ generic_msg_strto##type(file, "max", rc, &rc4, \ ++ STR_RANGE_MAX, r, endptr); \ ++ \ ++ /* Min out of range: failure expected, r unchanged */ \ ++ r = 0; \ ++ rc = 1; \ ++ endptr = NULL; \ ++ strcpy(str, STR_RANGE_MIN); \ ++ rc5 = ddi_strto##type(str, &endptr, 16, &r); \ ++ if (#type[0] == 'u') { \ ++ if (rc5 == 0 && r == 0 && endptr == str) \ ++ rc = 0; \ ++ } else { \ ++ if (rc5 == ERANGE && r == 0 && endptr == NULL) \ ++ rc = 0; \ ++ } \ ++ \ ++ generic_msg_strto##type(file, "min", rc, &rc5, \ ++ STR_RANGE_MIN, r, endptr); \ ++ \ ++ /* Invalid string: success expected, endptr == 'U' */ \ ++ r = 0; \ ++ rc = 1; \ ++ endptr = NULL; \ ++ rc6 = ddi_strto##type(STR_INVAL1, &endptr, 10, &r); \ ++ if (rc6 == 0 && r == VAL_INVAL1 && endptr && *endptr == 'U') \ ++ rc = 0; \ ++ \ ++ generic_msg_strto##type(file, "invalid", rc, &rc6, \ ++ STR_INVAL1, r, endptr); \ ++ \ ++ /* Invalid string: failure expected, endptr == str */ \ ++ r = 0; \ ++ rc = 1; \ ++ endptr = NULL; \ ++ strcpy(str, STR_INVAL2); \ ++ rc7 = ddi_strto##type(str, &endptr, 10, &r); \ ++ if (rc7 == 0 && r == 0 && endptr == str) \ ++ rc = 0; \ ++ \ ++ generic_msg_strto##type(file, "invalid", rc, &rc7, \ ++ STR_INVAL2, r, endptr); \ ++ \ ++ return (rc1 || rc2 || rc3 || rc4 || rc5 || rc6 || rc7) ? \ ++ -EINVAL : 0; \ ++} ++ ++define_splat_generic_test_strtox(ul, unsigned long); ++define_splat_generic_test_strtox(l, long); ++define_splat_generic_test_strtox(ull, unsigned long long); ++define_splat_generic_test_strtox(ll, long long); ++ ++/* ++ * The entries in the table are used in all combinations and the ++ * return value is checked to ensure it is range. On 32-bit ++ * systems __udivdi3 will be invoked for the 64-bit division. 
++ * On 64-bit system the native 64-bit divide will be used so ++ * __udivdi3 isn't used but we might as well stil run the test. ++ */ ++static int ++splat_generic_test_udivdi3(struct file *file, void *arg) ++{ ++ const uint64_t tabu[] = { ++ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ++ 10, 11, 12, 13, 14, 15, 16, 1000, 2003, ++ 32765, 32766, 32767, 32768, 32769, 32760, ++ 65533, 65534, 65535, 65536, 65537, 65538, ++ 0x7ffffffeULL, 0x7fffffffULL, 0x80000000ULL, 0x80000001ULL, ++ 0x7000000000000000ULL, 0x7000000080000000ULL, 0x7000000080000001ULL, ++ 0x7fffffffffffffffULL, 0x7fffffff8fffffffULL, 0x7fffffff8ffffff1ULL, ++ 0x7fffffff00000000ULL, 0x7fffffff80000000ULL, 0x7fffffff00000001ULL, ++ 0x8000000000000000ULL, 0x8000000080000000ULL, 0x8000000080000001ULL, ++ 0xc000000000000000ULL, 0xc000000080000000ULL, 0xc000000080000001ULL, ++ 0xfffffffffffffffdULL, 0xfffffffffffffffeULL, 0xffffffffffffffffULL, ++ }; ++ uint64_t uu, vu, qu, ru; ++ int n, i, j, errors = 0; ++ ++ splat_vprint(file, SPLAT_GENERIC_TEST5_NAME, "%s", ++ "Testing unsigned 64-bit division.\n"); ++ n = sizeof(tabu) / sizeof(tabu[0]); ++ for (i = 0; i < n; i++) { ++ for (j = 1; j < n; j++) { ++ uu = tabu[i]; ++ vu = tabu[j]; ++ qu = uu / vu; /* __udivdi3 */ ++ ru = uu - qu * vu; ++ if (qu > uu || ru >= vu) { ++ splat_vprint(file, SPLAT_GENERIC_TEST5_NAME, ++ "%016llx/%016llx != %016llx rem %016llx\n", ++ uu, vu, qu, ru); ++ errors++; ++ } ++ } ++ } ++ ++ if (errors) { ++ splat_vprint(file, SPLAT_GENERIC_TEST5_NAME, ++ "Failed %d/%d tests\n", errors, n * (n - 1)); ++ return -ERANGE; ++ } ++ ++ splat_vprint(file, SPLAT_GENERIC_TEST5_NAME, ++ "Passed all %d tests\n", n * (n - 1)); ++ ++ return 0; ++} ++ ++/* ++ * The entries the table are used in all combinations, with + and - signs ++ * preceding them. The return value is checked to ensure it is range. ++ * On 32-bit systems __divdi3 will be invoked for the 64-bit division. ++ * On 64-bit system the native 64-bit divide will be used so __divdi3 ++ * isn't used but we might as well stil run the test. ++ */ ++static int ++splat_generic_test_divdi3(struct file *file, void *arg) ++{ ++ const int64_t tabs[] = { ++ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ++ 10, 11, 12, 13, 14, 15, 16, 1000, 2003, ++ 32765, 32766, 32767, 32768, 32769, 32760, ++ 65533, 65534, 65535, 65536, 65537, 65538, ++ 0x7ffffffeLL, 0x7fffffffLL, 0x80000000LL, 0x80000001LL, ++ 0x7000000000000000LL, 0x7000000080000000LL, 0x7000000080000001LL, ++ 0x7fffffffffffffffLL, 0x7fffffff8fffffffLL, 0x7fffffff8ffffff1LL, ++ 0x7fffffff00000000LL, 0x7fffffff80000000LL, 0x7fffffff00000001LL, ++ 0x0123456789abcdefLL, 0x00000000abcdef01LL, 0x0000000012345678LL, ++#if BITS_PER_LONG == 32 ++ 0x8000000000000000LL, 0x8000000080000000LL, 0x8000000080000001LL, ++#endif ++ }; ++ int64_t u, v, q, r; ++ int n, i, j, k, errors = 0; ++ ++ splat_vprint(file, SPLAT_GENERIC_TEST6_NAME, "%s", ++ "Testing signed 64-bit division.\n"); ++ n = sizeof(tabs) / sizeof(tabs[0]); ++ for (i = 0; i < n; i++) { ++ for (j = 1; j < n; j++) { ++ for (k = 0; k <= 3; k++) { ++ u = (k & 1) ? -tabs[i] : tabs[i]; ++ v = (k >= 2) ? 
-tabs[j] : tabs[j]; ++ ++ q = u / v; /* __divdi3 */ ++ r = u - q * v; ++ if (abs64(q) > abs64(u) || ++ abs64(r) >= abs64(v) || ++ (r != 0 && (r ^ u) < 0)) { ++ splat_vprint(file, ++ SPLAT_GENERIC_TEST6_NAME, ++ "%016llx/%016llx != %016llx " ++ "rem %016llx\n", u, v, q, r); ++ errors++; ++ } ++ } ++ } ++ } ++ ++ if (errors) { ++ splat_vprint(file, SPLAT_GENERIC_TEST6_NAME, ++ "Failed %d/%d tests\n", errors, n * (n - 1)); ++ return -ERANGE; ++ } ++ ++ splat_vprint(file, SPLAT_GENERIC_TEST6_NAME, ++ "Passed all %d tests\n", n * (n - 1)); ++ ++ return 0; ++} ++ ++splat_subsystem_t * ++splat_generic_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_GENERIC_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_GENERIC_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_GENERIC; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_GENERIC_TEST1_NAME, SPLAT_GENERIC_TEST1_DESC, ++ SPLAT_GENERIC_TEST1_ID, splat_generic_test_strtoul); ++ SPLAT_TEST_INIT(sub, SPLAT_GENERIC_TEST2_NAME, SPLAT_GENERIC_TEST2_DESC, ++ SPLAT_GENERIC_TEST2_ID, splat_generic_test_strtol); ++ SPLAT_TEST_INIT(sub, SPLAT_GENERIC_TEST3_NAME, SPLAT_GENERIC_TEST3_DESC, ++ SPLAT_GENERIC_TEST3_ID, splat_generic_test_strtoull); ++ SPLAT_TEST_INIT(sub, SPLAT_GENERIC_TEST4_NAME, SPLAT_GENERIC_TEST4_DESC, ++ SPLAT_GENERIC_TEST4_ID, splat_generic_test_strtoll); ++ SPLAT_TEST_INIT(sub, SPLAT_GENERIC_TEST5_NAME, SPLAT_GENERIC_TEST5_DESC, ++ SPLAT_GENERIC_TEST5_ID, splat_generic_test_udivdi3); ++ SPLAT_TEST_INIT(sub, SPLAT_GENERIC_TEST6_NAME, SPLAT_GENERIC_TEST6_DESC, ++ SPLAT_GENERIC_TEST6_ID, splat_generic_test_divdi3); ++ ++ return sub; ++} ++ ++void ++splat_generic_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ ++ SPLAT_TEST_FINI(sub, SPLAT_GENERIC_TEST6_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_GENERIC_TEST5_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_GENERIC_TEST4_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_GENERIC_TEST3_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_GENERIC_TEST2_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_GENERIC_TEST1_ID); ++ ++ kfree(sub); ++} ++ ++int ++splat_generic_id(void) ++{ ++ return SPLAT_SUBSYSTEM_GENERIC; ++} +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-internal.h linux-3.2.33-go/spl/splat/splat-internal.h +--- linux-3.2.33-go.orig/spl/splat/splat-internal.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-internal.h 2012-11-16 23:22:32.410192863 +0100 +@@ -0,0 +1,218 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++\*****************************************************************************/ ++ ++#ifndef _SPLAT_INTERNAL_H ++#define _SPLAT_INTERNAL_H ++ ++#include "spl-device.h" ++#include "spl-debug.h" ++#include "splat-ctl.h" ++ ++#define SPLAT_SUBSYSTEM_INIT(type) \ ++({ splat_subsystem_t *_sub_; \ ++ \ ++ _sub_ = (splat_subsystem_t *)splat_##type##_init(); \ ++ if (_sub_ == NULL) { \ ++ printk(KERN_ERR "splat: Error initializing: " #type "\n"); \ ++ } else { \ ++ spin_lock(&splat_module_lock); \ ++ list_add_tail(&(_sub_->subsystem_list), \ ++ &splat_module_list); \ ++ spin_unlock(&splat_module_lock); \ ++ } \ ++}) ++ ++#define SPLAT_SUBSYSTEM_FINI(type) \ ++({ splat_subsystem_t *_sub_, *_tmp_; \ ++ int _id_, _flag_ = 0; \ ++ \ ++ _id_ = splat_##type##_id(); \ ++ spin_lock(&splat_module_lock); \ ++ list_for_each_entry_safe(_sub_, _tmp_, &splat_module_list, \ ++ subsystem_list) { \ ++ if (_sub_->desc.id == _id_) { \ ++ list_del_init(&(_sub_->subsystem_list)); \ ++ spin_unlock(&splat_module_lock); \ ++ splat_##type##_fini(_sub_); \ ++ spin_lock(&splat_module_lock); \ ++ _flag_ = 1; \ ++ } \ ++ } \ ++ spin_unlock(&splat_module_lock); \ ++ \ ++ if (!_flag_) \ ++ printk(KERN_ERR "splat: Error finalizing: " #type "\n"); \ ++}) ++ ++#define SPLAT_TEST_INIT(sub, n, d, tid, func) \ ++({ splat_test_t *_test_; \ ++ \ ++ _test_ = (splat_test_t *)kmalloc(sizeof(*_test_), GFP_KERNEL); \ ++ if (_test_ == NULL) { \ ++ printk(KERN_ERR "splat: Error initializing: " n "/" #tid" \n");\ ++ } else { \ ++ memset(_test_, 0, sizeof(*_test_)); \ ++ strncpy(_test_->desc.name, n, SPLAT_NAME_SIZE-1); \ ++ strncpy(_test_->desc.desc, d, SPLAT_DESC_SIZE-1); \ ++ _test_->desc.id = tid; \ ++ _test_->test = func; \ ++ INIT_LIST_HEAD(&(_test_->test_list)); \ ++ spin_lock(&((sub)->test_lock)); \ ++ list_add_tail(&(_test_->test_list),&((sub)->test_list));\ ++ spin_unlock(&((sub)->test_lock)); \ ++ } \ ++}) ++ ++#define SPLAT_TEST_FINI(sub, tid) \ ++({ splat_test_t *_test_, *_tmp_; \ ++ int _flag_ = 0; \ ++ \ ++ spin_lock(&((sub)->test_lock)); \ ++ list_for_each_entry_safe(_test_, _tmp_, \ ++ &((sub)->test_list), test_list) { \ ++ if (_test_->desc.id == tid) { \ ++ list_del_init(&(_test_->test_list)); \ ++ _flag_ = 1; \ ++ } \ ++ } \ ++ spin_unlock(&((sub)->test_lock)); \ ++ \ ++ if (!_flag_) \ ++ printk(KERN_ERR "splat: Error finalizing: " #tid "\n"); \ ++}) ++ ++typedef int (*splat_test_func_t)(struct file *, void *); ++ ++typedef struct splat_test { ++ struct list_head test_list; ++ splat_user_t desc; ++ splat_test_func_t test; ++} splat_test_t; ++ ++typedef struct splat_subsystem { ++ struct list_head subsystem_list;/* List had to chain entries */ ++ splat_user_t desc; ++ spinlock_t test_lock; ++ struct list_head test_list; ++} splat_subsystem_t; ++ ++#define SPLAT_INFO_BUFFER_SIZE 65536 ++#define SPLAT_INFO_BUFFER_REDZONE 256 ++ ++typedef struct splat_info { ++ struct mutex info_lock; ++ int info_size; ++ char *info_buffer; ++ char *info_head; /* Internal kernel use only */ ++} splat_info_t; ++ ++#define sym2str(sym) (char *)(#sym) ++ ++#define splat_print(file, format, args...) 
\ ++({ splat_info_t *_info_ = (splat_info_t *)file->private_data; \ ++ int _rc_; \ ++ \ ++ ASSERT(_info_); \ ++ ASSERT(_info_->info_buffer); \ ++ \ ++ mutex_lock(&_info_->info_lock); \ ++ \ ++ /* Don't allow the kernel to start a write in the red zone */ \ ++ if ((int)(_info_->info_head - _info_->info_buffer) > \ ++ (SPLAT_INFO_BUFFER_SIZE - SPLAT_INFO_BUFFER_REDZONE)) { \ ++ _rc_ = -EOVERFLOW; \ ++ } else { \ ++ _rc_ = sprintf(_info_->info_head, format, args); \ ++ if (_rc_ >= 0) \ ++ _info_->info_head += _rc_; \ ++ } \ ++ \ ++ mutex_unlock(&_info_->info_lock); \ ++ _rc_; \ ++}) ++ ++#define splat_vprint(file, test, format, args...) \ ++ splat_print(file, "%*s: " format, SPLAT_NAME_SIZE, test, args) ++ ++#define splat_locked_test(lock, test) \ ++({ \ ++ int _rc_; \ ++ spin_lock(lock); \ ++ _rc_ = (test) ? 1 : 0; \ ++ spin_unlock(lock); \ ++ _rc_; \ ++}) ++ ++splat_subsystem_t *splat_condvar_init(void); ++splat_subsystem_t *splat_kmem_init(void); ++splat_subsystem_t *splat_mutex_init(void); ++splat_subsystem_t *splat_krng_init(void); ++splat_subsystem_t *splat_rwlock_init(void); ++splat_subsystem_t *splat_taskq_init(void); ++splat_subsystem_t *splat_thread_init(void); ++splat_subsystem_t *splat_time_init(void); ++splat_subsystem_t *splat_vnode_init(void); ++splat_subsystem_t *splat_kobj_init(void); ++splat_subsystem_t *splat_atomic_init(void); ++splat_subsystem_t *splat_list_init(void); ++splat_subsystem_t *splat_generic_init(void); ++splat_subsystem_t *splat_cred_init(void); ++splat_subsystem_t *splat_zlib_init(void); ++splat_subsystem_t *splat_linux_init(void); ++ ++void splat_condvar_fini(splat_subsystem_t *); ++void splat_kmem_fini(splat_subsystem_t *); ++void splat_mutex_fini(splat_subsystem_t *); ++void splat_krng_fini(splat_subsystem_t *); ++void splat_rwlock_fini(splat_subsystem_t *); ++void splat_taskq_fini(splat_subsystem_t *); ++void splat_thread_fini(splat_subsystem_t *); ++void splat_time_fini(splat_subsystem_t *); ++void splat_vnode_fini(splat_subsystem_t *); ++void splat_kobj_fini(splat_subsystem_t *); ++void splat_atomic_fini(splat_subsystem_t *); ++void splat_list_fini(splat_subsystem_t *); ++void splat_generic_fini(splat_subsystem_t *); ++void splat_cred_fini(splat_subsystem_t *); ++void splat_zlib_fini(splat_subsystem_t *); ++void splat_linux_fini(splat_subsystem_t *); ++ ++int splat_condvar_id(void); ++int splat_kmem_id(void); ++int splat_mutex_id(void); ++int splat_krng_id(void); ++int splat_rwlock_id(void); ++int splat_taskq_id(void); ++int splat_thread_id(void); ++int splat_time_id(void); ++int splat_vnode_id(void); ++int splat_kobj_id(void); ++int splat_atomic_id(void); ++int splat_list_id(void); ++int splat_generic_id(void); ++int splat_cred_id(void); ++int splat_zlib_id(void); ++int splat_linux_id(void); ++ ++#endif /* _SPLAT_INTERNAL_H */ +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-kmem.c linux-3.2.33-go/spl/splat/splat-kmem.c +--- linux-3.2.33-go.orig/spl/splat/splat-kmem.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-kmem.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,1333 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . 
++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Kmem Tests. ++\*****************************************************************************/ ++ ++#include ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_KMEM_NAME "kmem" ++#define SPLAT_KMEM_DESC "Kernel Malloc/Slab Tests" ++ ++#define SPLAT_KMEM_TEST1_ID 0x0101 ++#define SPLAT_KMEM_TEST1_NAME "kmem_alloc" ++#define SPLAT_KMEM_TEST1_DESC "Memory allocation test (kmem_alloc)" ++ ++#define SPLAT_KMEM_TEST2_ID 0x0102 ++#define SPLAT_KMEM_TEST2_NAME "kmem_zalloc" ++#define SPLAT_KMEM_TEST2_DESC "Memory allocation test (kmem_zalloc)" ++ ++#define SPLAT_KMEM_TEST3_ID 0x0103 ++#define SPLAT_KMEM_TEST3_NAME "vmem_alloc" ++#define SPLAT_KMEM_TEST3_DESC "Memory allocation test (vmem_alloc)" ++ ++#define SPLAT_KMEM_TEST4_ID 0x0104 ++#define SPLAT_KMEM_TEST4_NAME "vmem_zalloc" ++#define SPLAT_KMEM_TEST4_DESC "Memory allocation test (vmem_zalloc)" ++ ++#define SPLAT_KMEM_TEST5_ID 0x0105 ++#define SPLAT_KMEM_TEST5_NAME "slab_small" ++#define SPLAT_KMEM_TEST5_DESC "Slab ctor/dtor test (small)" ++ ++#define SPLAT_KMEM_TEST6_ID 0x0106 ++#define SPLAT_KMEM_TEST6_NAME "slab_large" ++#define SPLAT_KMEM_TEST6_DESC "Slab ctor/dtor test (large)" ++ ++#define SPLAT_KMEM_TEST7_ID 0x0107 ++#define SPLAT_KMEM_TEST7_NAME "slab_align" ++#define SPLAT_KMEM_TEST7_DESC "Slab alignment test" ++ ++#define SPLAT_KMEM_TEST8_ID 0x0108 ++#define SPLAT_KMEM_TEST8_NAME "slab_reap" ++#define SPLAT_KMEM_TEST8_DESC "Slab reaping test" ++ ++#define SPLAT_KMEM_TEST9_ID 0x0109 ++#define SPLAT_KMEM_TEST9_NAME "slab_age" ++#define SPLAT_KMEM_TEST9_DESC "Slab aging test" ++ ++#define SPLAT_KMEM_TEST10_ID 0x010a ++#define SPLAT_KMEM_TEST10_NAME "slab_lock" ++#define SPLAT_KMEM_TEST10_DESC "Slab locking test" ++ ++#if 0 ++#define SPLAT_KMEM_TEST11_ID 0x010b ++#define SPLAT_KMEM_TEST11_NAME "slab_overcommit" ++#define SPLAT_KMEM_TEST11_DESC "Slab memory overcommit test" ++#endif ++ ++#define SPLAT_KMEM_TEST12_ID 0x010c ++#define SPLAT_KMEM_TEST12_NAME "vmem_size" ++#define SPLAT_KMEM_TEST12_DESC "Memory zone test" ++ ++#define SPLAT_KMEM_TEST13_ID 0x010d ++#define SPLAT_KMEM_TEST13_NAME "slab_reclaim" ++#define SPLAT_KMEM_TEST13_DESC "Slab direct memory reclaim test" ++ ++#define SPLAT_KMEM_ALLOC_COUNT 10 ++#define SPLAT_VMEM_ALLOC_COUNT 10 ++ ++ ++static int ++splat_kmem_test1(struct file *file, void *arg) ++{ ++ void *ptr[SPLAT_KMEM_ALLOC_COUNT]; ++ int size = PAGE_SIZE; ++ int i, count, rc = 0; ++ ++ while ((!rc) && (size <= (PAGE_SIZE * 32))) { ++ count = 0; ++ ++ for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) { ++ ptr[i] = kmem_alloc(size, KM_SLEEP | KM_NODEBUG); ++ if (ptr[i]) ++ count++; ++ } ++ ++ for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) ++ if (ptr[i]) ++ kmem_free(ptr[i], size); ++ ++ splat_vprint(file, SPLAT_KMEM_TEST1_NAME, ++ "%d byte allocations, %d/%d successful\n", ++ size, 
count, SPLAT_KMEM_ALLOC_COUNT); ++ if (count != SPLAT_KMEM_ALLOC_COUNT) ++ rc = -ENOMEM; ++ ++ size *= 2; ++ } ++ ++ return rc; ++} ++ ++static int ++splat_kmem_test2(struct file *file, void *arg) ++{ ++ void *ptr[SPLAT_KMEM_ALLOC_COUNT]; ++ int size = PAGE_SIZE; ++ int i, j, count, rc = 0; ++ ++ while ((!rc) && (size <= (PAGE_SIZE * 32))) { ++ count = 0; ++ ++ for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) { ++ ptr[i] = kmem_zalloc(size, KM_SLEEP | KM_NODEBUG); ++ if (ptr[i]) ++ count++; ++ } ++ ++ /* Ensure buffer has been zero filled */ ++ for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) { ++ for (j = 0; j < size; j++) { ++ if (((char *)ptr[i])[j] != '\0') { ++ splat_vprint(file,SPLAT_KMEM_TEST2_NAME, ++ "%d-byte allocation was " ++ "not zeroed\n", size); ++ rc = -EFAULT; ++ } ++ } ++ } ++ ++ for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) ++ if (ptr[i]) ++ kmem_free(ptr[i], size); ++ ++ splat_vprint(file, SPLAT_KMEM_TEST2_NAME, ++ "%d byte allocations, %d/%d successful\n", ++ size, count, SPLAT_KMEM_ALLOC_COUNT); ++ if (count != SPLAT_KMEM_ALLOC_COUNT) ++ rc = -ENOMEM; ++ ++ size *= 2; ++ } ++ ++ return rc; ++} ++ ++static int ++splat_kmem_test3(struct file *file, void *arg) ++{ ++ void *ptr[SPLAT_VMEM_ALLOC_COUNT]; ++ int size = PAGE_SIZE; ++ int i, count, rc = 0; ++ ++ while ((!rc) && (size <= (PAGE_SIZE * 1024))) { ++ count = 0; ++ ++ for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { ++ ptr[i] = vmem_alloc(size, KM_SLEEP); ++ if (ptr[i]) ++ count++; ++ } ++ ++ for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) ++ if (ptr[i]) ++ vmem_free(ptr[i], size); ++ ++ splat_vprint(file, SPLAT_KMEM_TEST3_NAME, ++ "%d byte allocations, %d/%d successful\n", ++ size, count, SPLAT_VMEM_ALLOC_COUNT); ++ if (count != SPLAT_VMEM_ALLOC_COUNT) ++ rc = -ENOMEM; ++ ++ size *= 2; ++ } ++ ++ return rc; ++} ++ ++static int ++splat_kmem_test4(struct file *file, void *arg) ++{ ++ void *ptr[SPLAT_VMEM_ALLOC_COUNT]; ++ int size = PAGE_SIZE; ++ int i, j, count, rc = 0; ++ ++ while ((!rc) && (size <= (PAGE_SIZE * 1024))) { ++ count = 0; ++ ++ for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { ++ ptr[i] = vmem_zalloc(size, KM_SLEEP); ++ if (ptr[i]) ++ count++; ++ } ++ ++ /* Ensure buffer has been zero filled */ ++ for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { ++ for (j = 0; j < size; j++) { ++ if (((char *)ptr[i])[j] != '\0') { ++ splat_vprint(file, SPLAT_KMEM_TEST4_NAME, ++ "%d-byte allocation was " ++ "not zeroed\n", size); ++ rc = -EFAULT; ++ } ++ } ++ } ++ ++ for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) ++ if (ptr[i]) ++ vmem_free(ptr[i], size); ++ ++ splat_vprint(file, SPLAT_KMEM_TEST4_NAME, ++ "%d byte allocations, %d/%d successful\n", ++ size, count, SPLAT_VMEM_ALLOC_COUNT); ++ if (count != SPLAT_VMEM_ALLOC_COUNT) ++ rc = -ENOMEM; ++ ++ size *= 2; ++ } ++ ++ return rc; ++} ++ ++#define SPLAT_KMEM_TEST_MAGIC 0x004488CCUL ++#define SPLAT_KMEM_CACHE_NAME "kmem_test" ++#define SPLAT_KMEM_OBJ_COUNT 1024 ++#define SPLAT_KMEM_OBJ_RECLAIM 1000 /* objects */ ++#define SPLAT_KMEM_THREADS 32 ++ ++#define KCP_FLAG_READY 0x01 ++ ++typedef struct kmem_cache_data { ++ unsigned long kcd_magic; ++ struct list_head kcd_node; ++ int kcd_flag; ++ char kcd_buf[0]; ++} kmem_cache_data_t; ++ ++typedef struct kmem_cache_thread { ++ spinlock_t kct_lock; ++ int kct_id; ++ struct list_head kct_list; ++} kmem_cache_thread_t; ++ ++typedef struct kmem_cache_priv { ++ unsigned long kcp_magic; ++ struct file *kcp_file; ++ kmem_cache_t *kcp_cache; ++ spinlock_t kcp_lock; ++ wait_queue_head_t kcp_ctl_waitq; ++ wait_queue_head_t kcp_thr_waitq; ++ int kcp_flags; 
++ int kcp_kct_count; ++ kmem_cache_thread_t *kcp_kct[SPLAT_KMEM_THREADS]; ++ int kcp_size; ++ int kcp_align; ++ int kcp_count; ++ int kcp_alloc; ++ int kcp_rc; ++} kmem_cache_priv_t; ++ ++static kmem_cache_priv_t * ++splat_kmem_cache_test_kcp_alloc(struct file *file, char *name, ++ int size, int align, int alloc) ++{ ++ kmem_cache_priv_t *kcp; ++ ++ kcp = kmem_zalloc(sizeof(kmem_cache_priv_t), KM_SLEEP); ++ if (!kcp) ++ return NULL; ++ ++ kcp->kcp_magic = SPLAT_KMEM_TEST_MAGIC; ++ kcp->kcp_file = file; ++ kcp->kcp_cache = NULL; ++ spin_lock_init(&kcp->kcp_lock); ++ init_waitqueue_head(&kcp->kcp_ctl_waitq); ++ init_waitqueue_head(&kcp->kcp_thr_waitq); ++ kcp->kcp_flags = 0; ++ kcp->kcp_kct_count = -1; ++ kcp->kcp_size = size; ++ kcp->kcp_align = align; ++ kcp->kcp_count = 0; ++ kcp->kcp_alloc = alloc; ++ kcp->kcp_rc = 0; ++ ++ return kcp; ++} ++ ++static void ++splat_kmem_cache_test_kcp_free(kmem_cache_priv_t *kcp) ++{ ++ kmem_free(kcp, sizeof(kmem_cache_priv_t)); ++} ++ ++static kmem_cache_thread_t * ++splat_kmem_cache_test_kct_alloc(kmem_cache_priv_t *kcp, int id) ++{ ++ kmem_cache_thread_t *kct; ++ ++ ASSERTF(id < SPLAT_KMEM_THREADS, "id=%d\n", id); ++ ASSERT(kcp->kcp_kct[id] == NULL); ++ ++ kct = kmem_zalloc(sizeof(kmem_cache_thread_t), KM_SLEEP); ++ if (!kct) ++ return NULL; ++ ++ spin_lock_init(&kct->kct_lock); ++ kct->kct_id = id; ++ INIT_LIST_HEAD(&kct->kct_list); ++ ++ spin_lock(&kcp->kcp_lock); ++ kcp->kcp_kct[id] = kct; ++ spin_unlock(&kcp->kcp_lock); ++ ++ return kct; ++} ++ ++static void ++splat_kmem_cache_test_kct_free(kmem_cache_priv_t *kcp, ++ kmem_cache_thread_t *kct) ++{ ++ spin_lock(&kcp->kcp_lock); ++ kcp->kcp_kct[kct->kct_id] = NULL; ++ spin_unlock(&kcp->kcp_lock); ++ ++ kmem_free(kct, sizeof(kmem_cache_thread_t)); ++} ++ ++static void ++splat_kmem_cache_test_kcd_free(kmem_cache_priv_t *kcp, ++ kmem_cache_thread_t *kct) ++{ ++ kmem_cache_data_t *kcd; ++ ++ spin_lock(&kct->kct_lock); ++ while (!list_empty(&kct->kct_list)) { ++ kcd = list_entry(kct->kct_list.next, ++ kmem_cache_data_t, kcd_node); ++ list_del(&kcd->kcd_node); ++ spin_unlock(&kct->kct_lock); ++ ++ kmem_cache_free(kcp->kcp_cache, kcd); ++ ++ spin_lock(&kct->kct_lock); ++ } ++ spin_unlock(&kct->kct_lock); ++} ++ ++static int ++splat_kmem_cache_test_kcd_alloc(kmem_cache_priv_t *kcp, ++ kmem_cache_thread_t *kct, int count) ++{ ++ kmem_cache_data_t *kcd; ++ int i; ++ ++ for (i = 0; i < count; i++) { ++ kcd = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP); ++ if (kcd == NULL) { ++ splat_kmem_cache_test_kcd_free(kcp, kct); ++ return -ENOMEM; ++ } ++ ++ spin_lock(&kct->kct_lock); ++ list_add_tail(&kcd->kcd_node, &kct->kct_list); ++ spin_unlock(&kct->kct_lock); ++ } ++ ++ return 0; ++} ++ ++static void ++splat_kmem_cache_test_debug(struct file *file, char *name, ++ kmem_cache_priv_t *kcp) ++{ ++ int j; ++ ++ splat_vprint(file, name, ++ "%s cache objects %d, slabs %u/%u objs %u/%u mags ", ++ kcp->kcp_cache->skc_name, kcp->kcp_count, ++ (unsigned)kcp->kcp_cache->skc_slab_alloc, ++ (unsigned)kcp->kcp_cache->skc_slab_total, ++ (unsigned)kcp->kcp_cache->skc_obj_alloc, ++ (unsigned)kcp->kcp_cache->skc_obj_total); ++ ++ for_each_online_cpu(j) ++ splat_print(file, "%u/%u ", ++ kcp->kcp_cache->skc_mag[j]->skm_avail, ++ kcp->kcp_cache->skc_mag[j]->skm_size); ++ ++ splat_print(file, "%s\n", ""); ++} ++ ++static int ++splat_kmem_cache_test_constructor(void *ptr, void *priv, int flags) ++{ ++ kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)priv; ++ kmem_cache_data_t *kcd = (kmem_cache_data_t *)ptr; ++ ++ if (kcd && kcp) { ++ 
kcd->kcd_magic = kcp->kcp_magic; ++ INIT_LIST_HEAD(&kcd->kcd_node); ++ kcd->kcd_flag = 1; ++ memset(kcd->kcd_buf, 0xaa, kcp->kcp_size - (sizeof *kcd)); ++ kcp->kcp_count++; ++ } ++ ++ return 0; ++} ++ ++static void ++splat_kmem_cache_test_destructor(void *ptr, void *priv) ++{ ++ kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)priv; ++ kmem_cache_data_t *kcd = (kmem_cache_data_t *)ptr; ++ ++ if (kcd && kcp) { ++ kcd->kcd_magic = 0; ++ kcd->kcd_flag = 0; ++ memset(kcd->kcd_buf, 0xbb, kcp->kcp_size - (sizeof *kcd)); ++ kcp->kcp_count--; ++ } ++ ++ return; ++} ++ ++/* ++ * Generic reclaim function which assumes that all objects may ++ * be reclaimed at any time. We free a small percentage of the ++ * objects linked off the kcp or kct[] every time we are called. ++ */ ++static void ++splat_kmem_cache_test_reclaim(void *priv) ++{ ++ kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)priv; ++ kmem_cache_thread_t *kct; ++ kmem_cache_data_t *kcd; ++ LIST_HEAD(reclaim); ++ int i, count; ++ ++ ASSERT(kcp->kcp_magic == SPLAT_KMEM_TEST_MAGIC); ++ ++ /* For each kct thread reclaim some objects */ ++ spin_lock(&kcp->kcp_lock); ++ for (i = 0; i < SPLAT_KMEM_THREADS; i++) { ++ kct = kcp->kcp_kct[i]; ++ if (!kct) ++ continue; ++ ++ spin_unlock(&kcp->kcp_lock); ++ spin_lock(&kct->kct_lock); ++ ++ count = SPLAT_KMEM_OBJ_RECLAIM; ++ while (count > 0 && !list_empty(&kct->kct_list)) { ++ kcd = list_entry(kct->kct_list.next, ++ kmem_cache_data_t, kcd_node); ++ list_del(&kcd->kcd_node); ++ list_add(&kcd->kcd_node, &reclaim); ++ count--; ++ } ++ ++ spin_unlock(&kct->kct_lock); ++ spin_lock(&kcp->kcp_lock); ++ } ++ spin_unlock(&kcp->kcp_lock); ++ ++ /* Freed outside the spin lock */ ++ while (!list_empty(&reclaim)) { ++ kcd = list_entry(reclaim.next, kmem_cache_data_t, kcd_node); ++ list_del(&kcd->kcd_node); ++ kmem_cache_free(kcp->kcp_cache, kcd); ++ } ++ ++ return; ++} ++ ++static int ++splat_kmem_cache_test_threads(kmem_cache_priv_t *kcp, int threads) ++{ ++ int rc; ++ ++ spin_lock(&kcp->kcp_lock); ++ rc = (kcp->kcp_kct_count == threads); ++ spin_unlock(&kcp->kcp_lock); ++ ++ return rc; ++} ++ ++static int ++splat_kmem_cache_test_flags(kmem_cache_priv_t *kcp, int flags) ++{ ++ int rc; ++ ++ spin_lock(&kcp->kcp_lock); ++ rc = (kcp->kcp_flags & flags); ++ spin_unlock(&kcp->kcp_lock); ++ ++ return rc; ++} ++ ++static void ++splat_kmem_cache_test_thread(void *arg) ++{ ++ kmem_cache_priv_t *kcp = (kmem_cache_priv_t *)arg; ++ kmem_cache_thread_t *kct; ++ int rc = 0, id; ++ ++ ASSERT(kcp->kcp_magic == SPLAT_KMEM_TEST_MAGIC); ++ ++ /* Assign thread ids */ ++ spin_lock(&kcp->kcp_lock); ++ if (kcp->kcp_kct_count == -1) ++ kcp->kcp_kct_count = 0; ++ ++ id = kcp->kcp_kct_count; ++ kcp->kcp_kct_count++; ++ spin_unlock(&kcp->kcp_lock); ++ ++ kct = splat_kmem_cache_test_kct_alloc(kcp, id); ++ if (!kct) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ /* Wait for all threads to have started and report they are ready */ ++ if (kcp->kcp_kct_count == SPLAT_KMEM_THREADS) ++ wake_up(&kcp->kcp_ctl_waitq); ++ ++ wait_event(kcp->kcp_thr_waitq, ++ splat_kmem_cache_test_flags(kcp, KCP_FLAG_READY)); ++ ++ /* Create and destroy objects */ ++ rc = splat_kmem_cache_test_kcd_alloc(kcp, kct, kcp->kcp_alloc); ++ splat_kmem_cache_test_kcd_free(kcp, kct); ++out: ++ if (kct) ++ splat_kmem_cache_test_kct_free(kcp, kct); ++ ++ spin_lock(&kcp->kcp_lock); ++ if (!kcp->kcp_rc) ++ kcp->kcp_rc = rc; ++ ++ if ((--kcp->kcp_kct_count) == 0) ++ wake_up(&kcp->kcp_ctl_waitq); ++ ++ spin_unlock(&kcp->kcp_lock); ++ ++ thread_exit(); ++} ++ ++static int 
++splat_kmem_cache_test(struct file *file, void *arg, char *name, ++ int size, int align, int flags) ++{ ++ kmem_cache_priv_t *kcp; ++ kmem_cache_data_t *kcd = NULL; ++ int rc = 0, max; ++ ++ kcp = splat_kmem_cache_test_kcp_alloc(file, name, size, align, 0); ++ if (!kcp) { ++ splat_vprint(file, name, "Unable to create '%s'\n", "kcp"); ++ return -ENOMEM; ++ } ++ ++ kcp->kcp_cache = ++ kmem_cache_create(SPLAT_KMEM_CACHE_NAME, ++ kcp->kcp_size, kcp->kcp_align, ++ splat_kmem_cache_test_constructor, ++ splat_kmem_cache_test_destructor, ++ NULL, kcp, NULL, flags); ++ if (!kcp->kcp_cache) { ++ splat_vprint(file, name, ++ "Unable to create '%s'\n", ++ SPLAT_KMEM_CACHE_NAME); ++ rc = -ENOMEM; ++ goto out_free; ++ } ++ ++ kcd = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP); ++ if (!kcd) { ++ splat_vprint(file, name, ++ "Unable to allocate from '%s'\n", ++ SPLAT_KMEM_CACHE_NAME); ++ rc = -EINVAL; ++ goto out_free; ++ } ++ ++ if (!kcd->kcd_flag) { ++ splat_vprint(file, name, ++ "Failed to run contructor for '%s'\n", ++ SPLAT_KMEM_CACHE_NAME); ++ rc = -EINVAL; ++ goto out_free; ++ } ++ ++ if (kcd->kcd_magic != kcp->kcp_magic) { ++ splat_vprint(file, name, ++ "Failed to pass private data to constructor " ++ "for '%s'\n", SPLAT_KMEM_CACHE_NAME); ++ rc = -EINVAL; ++ goto out_free; ++ } ++ ++ max = kcp->kcp_count; ++ kmem_cache_free(kcp->kcp_cache, kcd); ++ ++ /* Destroy the entire cache which will force destructors to ++ * run and we can verify one was called for every object */ ++ kmem_cache_destroy(kcp->kcp_cache); ++ if (kcp->kcp_count) { ++ splat_vprint(file, name, ++ "Failed to run destructor on all slab objects " ++ "for '%s'\n", SPLAT_KMEM_CACHE_NAME); ++ rc = -EINVAL; ++ } ++ ++ splat_kmem_cache_test_kcp_free(kcp); ++ splat_vprint(file, name, ++ "Successfully ran ctors/dtors for %d elements in '%s'\n", ++ max, SPLAT_KMEM_CACHE_NAME); ++ ++ return rc; ++ ++out_free: ++ if (kcd) ++ kmem_cache_free(kcp->kcp_cache, kcd); ++ ++ if (kcp->kcp_cache) ++ kmem_cache_destroy(kcp->kcp_cache); ++ ++ splat_kmem_cache_test_kcp_free(kcp); ++ ++ return rc; ++} ++ ++static int ++splat_kmem_cache_thread_test(struct file *file, void *arg, char *name, ++ int size, int alloc, int max_time) ++{ ++ kmem_cache_priv_t *kcp; ++ kthread_t *thr; ++ struct timespec start, stop, delta; ++ char cache_name[32]; ++ int i, rc = 0; ++ ++ kcp = splat_kmem_cache_test_kcp_alloc(file, name, size, 0, alloc); ++ if (!kcp) { ++ splat_vprint(file, name, "Unable to create '%s'\n", "kcp"); ++ return -ENOMEM; ++ } ++ ++ (void)snprintf(cache_name, 32, "%s-%d-%d", ++ SPLAT_KMEM_CACHE_NAME, size, alloc); ++ kcp->kcp_cache = ++ kmem_cache_create(cache_name, kcp->kcp_size, 0, ++ splat_kmem_cache_test_constructor, ++ splat_kmem_cache_test_destructor, ++ splat_kmem_cache_test_reclaim, ++ kcp, NULL, 0); ++ if (!kcp->kcp_cache) { ++ splat_vprint(file, name, "Unable to create '%s'\n", cache_name); ++ rc = -ENOMEM; ++ goto out_kcp; ++ } ++ ++ start = current_kernel_time(); ++ ++ for (i = 0; i < SPLAT_KMEM_THREADS; i++) { ++ thr = thread_create(NULL, 0, ++ splat_kmem_cache_test_thread, ++ kcp, 0, &p0, TS_RUN, minclsyspri); ++ if (thr == NULL) { ++ rc = -ESRCH; ++ goto out_cache; ++ } ++ } ++ ++ /* Sleep until all threads have started, then set the ready ++ * flag and wake them all up for maximum concurrency. 
*/ ++ wait_event(kcp->kcp_ctl_waitq, ++ splat_kmem_cache_test_threads(kcp, SPLAT_KMEM_THREADS)); ++ ++ spin_lock(&kcp->kcp_lock); ++ kcp->kcp_flags |= KCP_FLAG_READY; ++ spin_unlock(&kcp->kcp_lock); ++ wake_up_all(&kcp->kcp_thr_waitq); ++ ++ /* Sleep until all thread have finished */ ++ wait_event(kcp->kcp_ctl_waitq, splat_kmem_cache_test_threads(kcp, 0)); ++ ++ stop = current_kernel_time(); ++ delta = timespec_sub(stop, start); ++ ++ splat_vprint(file, name, ++ "%-22s %2ld.%09ld\t" ++ "%lu/%lu/%lu\t%lu/%lu/%lu\n", ++ kcp->kcp_cache->skc_name, ++ delta.tv_sec, delta.tv_nsec, ++ (unsigned long)kcp->kcp_cache->skc_slab_total, ++ (unsigned long)kcp->kcp_cache->skc_slab_max, ++ (unsigned long)(kcp->kcp_alloc * ++ SPLAT_KMEM_THREADS / ++ SPL_KMEM_CACHE_OBJ_PER_SLAB), ++ (unsigned long)kcp->kcp_cache->skc_obj_total, ++ (unsigned long)kcp->kcp_cache->skc_obj_max, ++ (unsigned long)(kcp->kcp_alloc * ++ SPLAT_KMEM_THREADS)); ++ ++ if (delta.tv_sec >= max_time) ++ rc = -ETIME; ++ ++ if (!rc && kcp->kcp_rc) ++ rc = kcp->kcp_rc; ++ ++out_cache: ++ kmem_cache_destroy(kcp->kcp_cache); ++out_kcp: ++ splat_kmem_cache_test_kcp_free(kcp); ++ return rc; ++} ++ ++/* Validate small object cache behavior for dynamic/kmem/vmem caches */ ++static int ++splat_kmem_test5(struct file *file, void *arg) ++{ ++ char *name = SPLAT_KMEM_TEST5_NAME; ++ int rc; ++ ++ rc = splat_kmem_cache_test(file, arg, name, 128, 0, 0); ++ if (rc) ++ return rc; ++ ++ rc = splat_kmem_cache_test(file, arg, name, 128, 0, KMC_KMEM); ++ if (rc) ++ return rc; ++ ++ return splat_kmem_cache_test(file, arg, name, 128, 0, KMC_VMEM); ++} ++ ++/* ++ * Validate large object cache behavior for dynamic/kmem/vmem caches ++ */ ++static int ++splat_kmem_test6(struct file *file, void *arg) ++{ ++ char *name = SPLAT_KMEM_TEST6_NAME; ++ int rc; ++ ++ rc = splat_kmem_cache_test(file, arg, name, 256*1024, 0, 0); ++ if (rc) ++ return rc; ++ ++ rc = splat_kmem_cache_test(file, arg, name, 64*1024, 0, KMC_KMEM); ++ if (rc) ++ return rc; ++ ++ return splat_kmem_cache_test(file, arg, name, 1024*1024, 0, KMC_VMEM); ++} ++ ++/* ++ * Validate object alignment cache behavior for caches ++ */ ++static int ++splat_kmem_test7(struct file *file, void *arg) ++{ ++ char *name = SPLAT_KMEM_TEST7_NAME; ++ int i, rc; ++ ++ for (i = SPL_KMEM_CACHE_ALIGN; i <= PAGE_SIZE; i *= 2) { ++ rc = splat_kmem_cache_test(file, arg, name, 157, i, 0); ++ if (rc) ++ return rc; ++ } ++ ++ return rc; ++} ++ ++/* ++ * Validate kmem_cache_reap() by requesting the slab cache free any objects ++ * it can. For a few reasons this may not immediately result in more free ++ * memory even if objects are freed. First off, due to fragmentation we ++ * may not be able to reclaim any slabs. Secondly, even if we do we fully ++ * clear some slabs we will not want to immediately reclaim all of them ++ * because we may contend with cache allocations and thrash. What we want ++ * to see is the slab size decrease more gradually as it becomes clear they ++ * will not be needed. This should be achievable in less than a minute. ++ * If it takes longer than this something has gone wrong. 
++ */ ++static int ++splat_kmem_test8(struct file *file, void *arg) ++{ ++ kmem_cache_priv_t *kcp; ++ kmem_cache_thread_t *kct; ++ int i, rc = 0; ++ ++ kcp = splat_kmem_cache_test_kcp_alloc(file, SPLAT_KMEM_TEST8_NAME, ++ 256, 0, 0); ++ if (!kcp) { ++ splat_vprint(file, SPLAT_KMEM_TEST8_NAME, ++ "Unable to create '%s'\n", "kcp"); ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ kcp->kcp_cache = ++ kmem_cache_create(SPLAT_KMEM_CACHE_NAME, kcp->kcp_size, 0, ++ splat_kmem_cache_test_constructor, ++ splat_kmem_cache_test_destructor, ++ splat_kmem_cache_test_reclaim, ++ kcp, NULL, 0); ++ if (!kcp->kcp_cache) { ++ splat_vprint(file, SPLAT_KMEM_TEST8_NAME, ++ "Unable to create '%s'\n", SPLAT_KMEM_CACHE_NAME); ++ rc = -ENOMEM; ++ goto out_kcp; ++ } ++ ++ kct = splat_kmem_cache_test_kct_alloc(kcp, 0); ++ if (!kct) { ++ splat_vprint(file, SPLAT_KMEM_TEST8_NAME, ++ "Unable to create '%s'\n", "kct"); ++ rc = -ENOMEM; ++ goto out_cache; ++ } ++ ++ rc = splat_kmem_cache_test_kcd_alloc(kcp, kct, SPLAT_KMEM_OBJ_COUNT); ++ if (rc) { ++ splat_vprint(file, SPLAT_KMEM_TEST8_NAME, "Unable to " ++ "allocate from '%s'\n", SPLAT_KMEM_CACHE_NAME); ++ goto out_kct; ++ } ++ ++ for (i = 0; i < 60; i++) { ++ kmem_cache_reap_now(kcp->kcp_cache); ++ splat_kmem_cache_test_debug(file, SPLAT_KMEM_TEST8_NAME, kcp); ++ ++ if (kcp->kcp_cache->skc_obj_total == 0) ++ break; ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(HZ); ++ } ++ ++ if (kcp->kcp_cache->skc_obj_total == 0) { ++ splat_vprint(file, SPLAT_KMEM_TEST8_NAME, ++ "Successfully created %d objects " ++ "in cache %s and reclaimed them\n", ++ SPLAT_KMEM_OBJ_COUNT, SPLAT_KMEM_CACHE_NAME); ++ } else { ++ splat_vprint(file, SPLAT_KMEM_TEST8_NAME, ++ "Failed to reclaim %u/%d objects from cache %s\n", ++ (unsigned)kcp->kcp_cache->skc_obj_total, ++ SPLAT_KMEM_OBJ_COUNT, SPLAT_KMEM_CACHE_NAME); ++ rc = -ENOMEM; ++ } ++ ++ /* Cleanup our mess (for failure case of time expiring) */ ++ splat_kmem_cache_test_kcd_free(kcp, kct); ++out_kct: ++ splat_kmem_cache_test_kct_free(kcp, kct); ++out_cache: ++ kmem_cache_destroy(kcp->kcp_cache); ++out_kcp: ++ splat_kmem_cache_test_kcp_free(kcp); ++out: ++ return rc; ++} ++ ++/* Test cache aging, we have allocated a large number of objects thus ++ * creating a large number of slabs and then free'd them all. However, ++ * since there should be little memory pressure at the moment those ++ * slabs have not been freed. What we want to see is the slab size ++ * decrease gradually as it becomes clear they will not be be needed. ++ * This should be achievable in less than minute. If it takes longer ++ * than this something has gone wrong. 
++ */ ++static int ++splat_kmem_test9(struct file *file, void *arg) ++{ ++ kmem_cache_priv_t *kcp; ++ kmem_cache_thread_t *kct; ++ int i, rc = 0, count = SPLAT_KMEM_OBJ_COUNT * 128; ++ ++ kcp = splat_kmem_cache_test_kcp_alloc(file, SPLAT_KMEM_TEST9_NAME, ++ 256, 0, 0); ++ if (!kcp) { ++ splat_vprint(file, SPLAT_KMEM_TEST9_NAME, ++ "Unable to create '%s'\n", "kcp"); ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ kcp->kcp_cache = ++ kmem_cache_create(SPLAT_KMEM_CACHE_NAME, kcp->kcp_size, 0, ++ splat_kmem_cache_test_constructor, ++ splat_kmem_cache_test_destructor, ++ NULL, kcp, NULL, 0); ++ if (!kcp->kcp_cache) { ++ splat_vprint(file, SPLAT_KMEM_TEST9_NAME, ++ "Unable to create '%s'\n", SPLAT_KMEM_CACHE_NAME); ++ rc = -ENOMEM; ++ goto out_kcp; ++ } ++ ++ kct = splat_kmem_cache_test_kct_alloc(kcp, 0); ++ if (!kct) { ++ splat_vprint(file, SPLAT_KMEM_TEST8_NAME, ++ "Unable to create '%s'\n", "kct"); ++ rc = -ENOMEM; ++ goto out_cache; ++ } ++ ++ rc = splat_kmem_cache_test_kcd_alloc(kcp, kct, count); ++ if (rc) { ++ splat_vprint(file, SPLAT_KMEM_TEST9_NAME, "Unable to " ++ "allocate from '%s'\n", SPLAT_KMEM_CACHE_NAME); ++ goto out_kct; ++ } ++ ++ splat_kmem_cache_test_kcd_free(kcp, kct); ++ ++ for (i = 0; i < 60; i++) { ++ splat_kmem_cache_test_debug(file, SPLAT_KMEM_TEST9_NAME, kcp); ++ ++ if (kcp->kcp_cache->skc_obj_total == 0) ++ break; ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(HZ); ++ } ++ ++ if (kcp->kcp_cache->skc_obj_total == 0) { ++ splat_vprint(file, SPLAT_KMEM_TEST9_NAME, ++ "Successfully created %d objects " ++ "in cache %s and reclaimed them\n", ++ count, SPLAT_KMEM_CACHE_NAME); ++ } else { ++ splat_vprint(file, SPLAT_KMEM_TEST9_NAME, ++ "Failed to reclaim %u/%d objects from cache %s\n", ++ (unsigned)kcp->kcp_cache->skc_obj_total, count, ++ SPLAT_KMEM_CACHE_NAME); ++ rc = -ENOMEM; ++ } ++ ++out_kct: ++ splat_kmem_cache_test_kct_free(kcp, kct); ++out_cache: ++ kmem_cache_destroy(kcp->kcp_cache); ++out_kcp: ++ splat_kmem_cache_test_kcp_free(kcp); ++out: ++ return rc; ++} ++ ++/* ++ * This test creates N threads with a shared kmem cache. They then all ++ * concurrently allocate and free from the cache to stress the locking and ++ * concurrent cache performance. If any one test takes longer than 5 ++ * seconds to complete it is treated as a failure and may indicate a ++ * performance regression. On my test system no one test takes more ++ * than 1 second to complete so a 5x slowdown likely a problem. ++ */ ++static int ++splat_kmem_test10(struct file *file, void *arg) ++{ ++ uint64_t size, alloc, rc = 0; ++ ++ for (size = 32; size <= 1024*1024; size *= 2) { ++ ++ splat_vprint(file, SPLAT_KMEM_TEST10_NAME, "%-22s %s", "name", ++ "time (sec)\tslabs \tobjs \thash\n"); ++ splat_vprint(file, SPLAT_KMEM_TEST10_NAME, "%-22s %s", "", ++ " \ttot/max/calc\ttot/max/calc\n"); ++ ++ for (alloc = 1; alloc <= 1024; alloc *= 2) { ++ ++ /* Skip tests which exceed available memory. We ++ * leverage availrmem here for some extra testing */ ++ if (size * alloc * SPLAT_KMEM_THREADS > availrmem / 2) ++ continue; ++ ++ rc = splat_kmem_cache_thread_test(file, arg, ++ SPLAT_KMEM_TEST10_NAME, size, alloc, 5); ++ if (rc) ++ break; ++ } ++ } ++ ++ return rc; ++} ++ ++#if 0 ++/* ++ * This test creates N threads with a shared kmem cache which overcommits ++ * memory by 4x. This makes it impossible for the slab to satify the ++ * thread requirements without having its reclaim hook run which will ++ * free objects back for use. 
This behavior is triggered by the linum VM ++ * detecting a low memory condition on the node and invoking the shrinkers. ++ * This should allow all the threads to complete while avoiding deadlock ++ * and for the most part out of memory events. This is very tough on the ++ * system so it is possible the test app may get oom'ed. This particular ++ * test has proven troublesome on 32-bit archs with limited virtual ++ * address space so it only run on 64-bit systems. ++ */ ++static int ++splat_kmem_test11(struct file *file, void *arg) ++{ ++ uint64_t size, alloc, rc; ++ ++ size = 8 * 1024; ++ alloc = ((4 * physmem * PAGE_SIZE) / size) / SPLAT_KMEM_THREADS; ++ ++ splat_vprint(file, SPLAT_KMEM_TEST11_NAME, "%-22s %s", "name", ++ "time (sec)\tslabs \tobjs \thash\n"); ++ splat_vprint(file, SPLAT_KMEM_TEST11_NAME, "%-22s %s", "", ++ " \ttot/max/calc\ttot/max/calc\n"); ++ ++ rc = splat_kmem_cache_thread_test(file, arg, ++ SPLAT_KMEM_TEST11_NAME, size, alloc, 60); ++ ++ return rc; ++} ++#endif ++ ++/* ++ * Check vmem_size() behavior by acquiring the alloc/free/total vmem ++ * space, then allocate a known buffer size from vmem space. We can ++ * then check that vmem_size() values were updated properly with in ++ * a fairly small tolerence. The tolerance is important because we ++ * are not the only vmem consumer on the system. Other unrelated ++ * allocations might occur during the small test window. The vmem ++ * allocation itself may also add in a little extra private space to ++ * the buffer. Finally, verify total space always remains unchanged. ++ */ ++static int ++splat_kmem_test12(struct file *file, void *arg) ++{ ++ size_t alloc1, free1, total1; ++ size_t alloc2, free2, total2; ++ int size = 8*1024*1024; ++ void *ptr; ++ ++ alloc1 = vmem_size(NULL, VMEM_ALLOC); ++ free1 = vmem_size(NULL, VMEM_FREE); ++ total1 = vmem_size(NULL, VMEM_ALLOC | VMEM_FREE); ++ splat_vprint(file, SPLAT_KMEM_TEST12_NAME, "Vmem alloc=%lu " ++ "free=%lu total=%lu\n", (unsigned long)alloc1, ++ (unsigned long)free1, (unsigned long)total1); ++ ++ splat_vprint(file, SPLAT_KMEM_TEST12_NAME, "Alloc %d bytes\n", size); ++ ptr = vmem_alloc(size, KM_SLEEP); ++ if (!ptr) { ++ splat_vprint(file, SPLAT_KMEM_TEST12_NAME, ++ "Failed to alloc %d bytes\n", size); ++ return -ENOMEM; ++ } ++ ++ alloc2 = vmem_size(NULL, VMEM_ALLOC); ++ free2 = vmem_size(NULL, VMEM_FREE); ++ total2 = vmem_size(NULL, VMEM_ALLOC | VMEM_FREE); ++ splat_vprint(file, SPLAT_KMEM_TEST12_NAME, "Vmem alloc=%lu " ++ "free=%lu total=%lu\n", (unsigned long)alloc2, ++ (unsigned long)free2, (unsigned long)total2); ++ ++ splat_vprint(file, SPLAT_KMEM_TEST12_NAME, "Free %d bytes\n", size); ++ vmem_free(ptr, size); ++ if (alloc2 < (alloc1 + size - (size / 100)) || ++ alloc2 > (alloc1 + size + (size / 100))) { ++ splat_vprint(file, SPLAT_KMEM_TEST12_NAME, "Failed " ++ "VMEM_ALLOC size: %lu != %lu+%d (+/- 1%%)\n", ++ (unsigned long)alloc2,(unsigned long)alloc1,size); ++ return -ERANGE; ++ } ++ ++ if (free2 < (free1 - size - (size / 100)) || ++ free2 > (free1 - size + (size / 100))) { ++ splat_vprint(file, SPLAT_KMEM_TEST12_NAME, "Failed " ++ "VMEM_FREE size: %lu != %lu-%d (+/- 1%%)\n", ++ (unsigned long)free2, (unsigned long)free1, size); ++ return -ERANGE; ++ } ++ ++ if (total1 != total2) { ++ splat_vprint(file, SPLAT_KMEM_TEST12_NAME, "Failed " ++ "VMEM_ALLOC | VMEM_FREE not constant: " ++ "%lu != %lu\n", (unsigned long)total2, ++ (unsigned long)total1); ++ return -ERANGE; ++ } ++ ++ splat_vprint(file, SPLAT_KMEM_TEST12_NAME, ++ "VMEM_ALLOC within tolerance: ~%ld%% 
(%ld/%d)\n", ++ (long)abs(alloc1 + (long)size - alloc2) * 100 / (long)size, ++ (long)abs(alloc1 + (long)size - alloc2), size); ++ splat_vprint(file, SPLAT_KMEM_TEST12_NAME, ++ "VMEM_FREE within tolerance: ~%ld%% (%ld/%d)\n", ++ (long)abs((free1 - (long)size) - free2) * 100 / (long)size, ++ (long)abs((free1 - (long)size) - free2), size); ++ ++ return 0; ++} ++ ++typedef struct dummy_page { ++ struct list_head dp_list; ++ char dp_pad[PAGE_SIZE - sizeof(struct list_head)]; ++} dummy_page_t; ++ ++/* ++ * This test is designed to verify that direct reclaim is functioning as ++ * expected. We allocate a large number of objects thus creating a large ++ * number of slabs. We then apply memory pressure and expect that the ++ * direct reclaim path can easily recover those slabs. The registered ++ * reclaim function will free the objects and the slab shrinker will call ++ * it repeatedly until at least a single slab can be freed. ++ * ++ * Note it may not be possible to reclaim every last slab via direct reclaim ++ * without a failure because the shrinker_rwsem may be contended. For this ++ * reason, quickly reclaiming 3/4 of the slabs is considered a success. ++ * ++ * This should all be possible within 10 seconds. For reference, on a ++ * system with 2G of memory this test takes roughly 0.2 seconds to run. ++ * It may take longer on larger memory systems but should still easily ++ * complete in the alloted 10 seconds. ++ */ ++static int ++splat_kmem_test13(struct file *file, void *arg) ++{ ++ kmem_cache_priv_t *kcp; ++ kmem_cache_thread_t *kct; ++ dummy_page_t *dp; ++ struct list_head list; ++ struct timespec start, delta = { 0, 0 }; ++ int size, count, slabs, fails = 0; ++ int i, rc = 0, max_time = 10; ++ ++ size = 128 * 1024; ++ count = ((physmem * PAGE_SIZE) / 4 / size); ++ ++ kcp = splat_kmem_cache_test_kcp_alloc(file, SPLAT_KMEM_TEST13_NAME, ++ size, 0, 0); ++ if (!kcp) { ++ splat_vprint(file, SPLAT_KMEM_TEST13_NAME, ++ "Unable to create '%s'\n", "kcp"); ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ kcp->kcp_cache = ++ kmem_cache_create(SPLAT_KMEM_CACHE_NAME, kcp->kcp_size, 0, ++ splat_kmem_cache_test_constructor, ++ splat_kmem_cache_test_destructor, ++ splat_kmem_cache_test_reclaim, ++ kcp, NULL, 0); ++ if (!kcp->kcp_cache) { ++ splat_vprint(file, SPLAT_KMEM_TEST13_NAME, ++ "Unable to create '%s'\n", SPLAT_KMEM_CACHE_NAME); ++ rc = -ENOMEM; ++ goto out_kcp; ++ } ++ ++ kct = splat_kmem_cache_test_kct_alloc(kcp, 0); ++ if (!kct) { ++ splat_vprint(file, SPLAT_KMEM_TEST13_NAME, ++ "Unable to create '%s'\n", "kct"); ++ rc = -ENOMEM; ++ goto out_cache; ++ } ++ ++ rc = splat_kmem_cache_test_kcd_alloc(kcp, kct, count); ++ if (rc) { ++ splat_vprint(file, SPLAT_KMEM_TEST13_NAME, "Unable to " ++ "allocate from '%s'\n", SPLAT_KMEM_CACHE_NAME); ++ goto out_kct; ++ } ++ ++ i = 0; ++ slabs = kcp->kcp_cache->skc_slab_total; ++ INIT_LIST_HEAD(&list); ++ start = current_kernel_time(); ++ ++ /* Apply memory pressure */ ++ while (kcp->kcp_cache->skc_slab_total > (slabs >> 2)) { ++ ++ if ((i % 10000) == 0) ++ splat_kmem_cache_test_debug( ++ file, SPLAT_KMEM_TEST13_NAME, kcp); ++ ++ delta = timespec_sub(current_kernel_time(), start); ++ if (delta.tv_sec >= max_time) { ++ splat_vprint(file, SPLAT_KMEM_TEST13_NAME, ++ "Failed to reclaim 3/4 of cache in %ds, " ++ "%u/%u slabs remain\n", max_time, ++ (unsigned)kcp->kcp_cache->skc_slab_total, ++ slabs); ++ rc = -ETIME; ++ break; ++ } ++ ++ dp = (dummy_page_t *)__get_free_page(GFP_KERNEL | __GFP_NORETRY); ++ if (!dp) { ++ fails++; ++ splat_vprint(file, 
SPLAT_KMEM_TEST13_NAME, ++ "Failed (%d) to allocate page with %u " ++ "slabs still in the cache\n", fails, ++ (unsigned)kcp->kcp_cache->skc_slab_total); ++ continue; ++ } ++ ++ list_add(&dp->dp_list, &list); ++ i++; ++ } ++ ++ if (rc == 0) ++ splat_vprint(file, SPLAT_KMEM_TEST13_NAME, ++ "Successfully created %u slabs and with %d alloc " ++ "failures reclaimed 3/4 of them in %d.%03ds\n", ++ slabs, fails, ++ (int)delta.tv_sec, (int)delta.tv_nsec / 1000000); ++ ++ /* Release memory pressure pages */ ++ while (!list_empty(&list)) { ++ dp = list_entry(list.next, dummy_page_t, dp_list); ++ list_del_init(&dp->dp_list); ++ free_page((unsigned long)dp); ++ } ++ ++ /* Release remaining kmem cache objects */ ++ splat_kmem_cache_test_kcd_free(kcp, kct); ++out_kct: ++ splat_kmem_cache_test_kct_free(kcp, kct); ++out_cache: ++ kmem_cache_destroy(kcp->kcp_cache); ++out_kcp: ++ splat_kmem_cache_test_kcp_free(kcp); ++out: ++ return rc; ++} ++ ++splat_subsystem_t * ++splat_kmem_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_KMEM_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_KMEM_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_KMEM; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST1_NAME, SPLAT_KMEM_TEST1_DESC, ++ SPLAT_KMEM_TEST1_ID, splat_kmem_test1); ++ SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST2_NAME, SPLAT_KMEM_TEST2_DESC, ++ SPLAT_KMEM_TEST2_ID, splat_kmem_test2); ++ SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST3_NAME, SPLAT_KMEM_TEST3_DESC, ++ SPLAT_KMEM_TEST3_ID, splat_kmem_test3); ++ SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST4_NAME, SPLAT_KMEM_TEST4_DESC, ++ SPLAT_KMEM_TEST4_ID, splat_kmem_test4); ++ SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST5_NAME, SPLAT_KMEM_TEST5_DESC, ++ SPLAT_KMEM_TEST5_ID, splat_kmem_test5); ++ SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST6_NAME, SPLAT_KMEM_TEST6_DESC, ++ SPLAT_KMEM_TEST6_ID, splat_kmem_test6); ++ SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST7_NAME, SPLAT_KMEM_TEST7_DESC, ++ SPLAT_KMEM_TEST7_ID, splat_kmem_test7); ++ SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST8_NAME, SPLAT_KMEM_TEST8_DESC, ++ SPLAT_KMEM_TEST8_ID, splat_kmem_test8); ++ SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST9_NAME, SPLAT_KMEM_TEST9_DESC, ++ SPLAT_KMEM_TEST9_ID, splat_kmem_test9); ++ SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST10_NAME, SPLAT_KMEM_TEST10_DESC, ++ SPLAT_KMEM_TEST10_ID, splat_kmem_test10); ++#if 0 ++ SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST11_NAME, SPLAT_KMEM_TEST11_DESC, ++ SPLAT_KMEM_TEST11_ID, splat_kmem_test11); ++#endif ++ SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST12_NAME, SPLAT_KMEM_TEST12_DESC, ++ SPLAT_KMEM_TEST12_ID, splat_kmem_test12); ++ SPLAT_TEST_INIT(sub, SPLAT_KMEM_TEST13_NAME, SPLAT_KMEM_TEST13_DESC, ++ SPLAT_KMEM_TEST13_ID, splat_kmem_test13); ++ ++ return sub; ++} ++ ++void ++splat_kmem_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST13_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST12_ID); ++#if 0 ++ SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST11_ID); ++#endif ++ SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST10_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST9_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST8_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST7_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST6_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST5_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST4_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST3_ID); ++ 
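/* Editor's note (illustrative aside, not part of the original patch):
++ * every SPLAT subsystem follows the same pattern -- tests are registered
++ * with SPLAT_TEST_INIT() in *_init() and unregistered here in reverse
++ * order with SPLAT_TEST_FINI().  A hypothetical extra test would be wired
++ * in the same way, e.g.
++ *
++ *   SPLAT_TEST_INIT(sub, "test14", "New test", SPLAT_KMEM_TEST14_ID,
++ *                   splat_kmem_test14);       (added in splat_kmem_init)
++ *   SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST14_ID);     (added first here)
++ *
++ * where the name, description and SPLAT_KMEM_TEST14_ID are invented for
++ * illustration only.
++ */ ++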
SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST2_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_KMEM_TEST1_ID); ++ ++ kfree(sub); ++} ++ ++int ++splat_kmem_id(void) { ++ return SPLAT_SUBSYSTEM_KMEM; ++} +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-kobj.c linux-3.2.33-go/spl/splat/splat-kobj.c +--- linux-3.2.33-go.orig/spl/splat/splat-kobj.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-kobj.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,166 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Kobj Tests. ++\*****************************************************************************/ ++ ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_KOBJ_NAME "kobj" ++#define SPLAT_KOBJ_DESC "Kernel Kobj Tests" ++ ++#define SPLAT_KOBJ_TEST1_ID 0x0a01 ++#define SPLAT_KOBJ_TEST1_NAME "open" ++#define SPLAT_KOBJ_TEST1_DESC "Kobj Open/Close Test" ++ ++#define SPLAT_KOBJ_TEST2_ID 0x0a02 ++#define SPLAT_KOBJ_TEST2_NAME "size/read" ++#define SPLAT_KOBJ_TEST2_DESC "Kobj Size/Read Test" ++ ++#define SPLAT_KOBJ_TEST_FILE "/etc/fstab" ++ ++static int ++splat_kobj_test1(struct file *file, void *arg) ++{ ++ struct _buf *f; ++ ++ f = kobj_open_file(SPLAT_KOBJ_TEST_FILE); ++ if (f == (struct _buf *)-1) { ++ splat_vprint(file, SPLAT_KOBJ_TEST1_NAME, "Failed to open " ++ "test file: %s\n", SPLAT_KOBJ_TEST_FILE); ++ return -ENOENT; ++ } ++ ++ kobj_close_file(f); ++ splat_vprint(file, SPLAT_KOBJ_TEST1_NAME, "Successfully opened and " ++ "closed test file: %s\n", SPLAT_KOBJ_TEST_FILE); ++ ++ return 0; ++} /* splat_kobj_test1() */ ++ ++static int ++splat_kobj_test2(struct file *file, void *arg) ++{ ++ struct _buf *f; ++ char *buf; ++ uint64_t size; ++ int rc; ++ ++ f = kobj_open_file(SPLAT_KOBJ_TEST_FILE); ++ if (f == (struct _buf *)-1) { ++ splat_vprint(file, SPLAT_KOBJ_TEST2_NAME, "Failed to open " ++ "test file: %s\n", SPLAT_KOBJ_TEST_FILE); ++ return -ENOENT; ++ } ++ ++ rc = kobj_get_filesize(f, &size); ++ if (rc) { ++ splat_vprint(file, SPLAT_KOBJ_TEST2_NAME, "Failed stat of " ++ "test file: %s (%d)\n", SPLAT_KOBJ_TEST_FILE, rc); ++ goto out; ++ } ++ ++ buf = kmalloc(size + 1, GFP_KERNEL); ++ if (!buf) { ++ rc = -ENOMEM; ++ splat_vprint(file, SPLAT_KOBJ_TEST2_NAME, "Failed to alloc " ++ "%lld bytes for tmp buffer (%d)\n", ++ (long long)size, rc); ++ goto out; ++ } ++ ++ memset(buf, 0, size + 1); ++ rc = kobj_read_file(f, buf, size, 0); ++ if (rc < 0) { ++ splat_vprint(file, 
SPLAT_KOBJ_TEST2_NAME, "Failed read of " ++ "test file: %s (%d)\n", SPLAT_KOBJ_TEST_FILE, rc); ++ goto out2; ++ } ++ ++ /* Validate we read as many bytes as expected based on the stat. This ++ * isn't a perfect test since we didn't create the file however it is ++ * pretty unlikely there are garbage characters in your /etc/fstab */ ++ if (size != (uint64_t)strlen(buf)) { ++ rc = -EFBIG; ++ splat_vprint(file, SPLAT_KOBJ_TEST2_NAME, "Stat'ed size " ++ "(%lld) does not match number of bytes read " ++ "(%lld)\n", (long long)size, ++ (long long)strlen(buf)); ++ goto out2; ++ } ++ ++ rc = 0; ++ splat_vprint(file, SPLAT_KOBJ_TEST2_NAME, "\n%s\n", buf); ++ splat_vprint(file, SPLAT_KOBJ_TEST2_NAME, "Successfully stat'ed " ++ "and read expected number of bytes (%lld) from test " ++ "file: %s\n", (long long)size, SPLAT_KOBJ_TEST_FILE); ++out2: ++ kfree(buf); ++out: ++ kobj_close_file(f); ++ ++ return rc; ++} /* splat_kobj_test2() */ ++ ++splat_subsystem_t * ++splat_kobj_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_KOBJ_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_KOBJ_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_KOBJ; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_KOBJ_TEST1_NAME, SPLAT_KOBJ_TEST1_DESC, ++ SPLAT_KOBJ_TEST1_ID, splat_kobj_test1); ++ SPLAT_TEST_INIT(sub, SPLAT_KOBJ_TEST2_NAME, SPLAT_KOBJ_TEST2_DESC, ++ SPLAT_KOBJ_TEST2_ID, splat_kobj_test2); ++ ++ return sub; ++} /* splat_kobj_init() */ ++ ++void ++splat_kobj_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ ++ SPLAT_TEST_FINI(sub, SPLAT_KOBJ_TEST2_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_KOBJ_TEST1_ID); ++ ++ kfree(sub); ++} /* splat_kobj_fini() */ ++ ++int ++splat_kobj_id(void) ++{ ++ return SPLAT_SUBSYSTEM_KOBJ; ++} /* splat_kobj_id() */ +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-linux.c linux-3.2.33-go/spl/splat/splat-linux.c +--- linux-3.2.33-go.orig/spl/splat/splat-linux.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-linux.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,242 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2011 Lawrence Livermore National Security, LLC. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Kernel Compatibility Tests. 
++\*****************************************************************************/ ++ ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_LINUX_NAME "linux" ++#define SPLAT_LINUX_DESC "Kernel Compatibility Tests" ++ ++#define SPLAT_LINUX_TEST1_ID 0x1001 ++#define SPLAT_LINUX_TEST1_NAME "shrink_dcache" ++#define SPLAT_LINUX_TEST1_DESC "Shrink dcache test" ++ ++#define SPLAT_LINUX_TEST2_ID 0x1002 ++#define SPLAT_LINUX_TEST2_NAME "shrink_icache" ++#define SPLAT_LINUX_TEST2_DESC "Shrink icache test" ++ ++#define SPLAT_LINUX_TEST3_ID 0x1003 ++#define SPLAT_LINUX_TEST3_NAME "shrinker" ++#define SPLAT_LINUX_TEST3_DESC "Shrinker test" ++ ++ ++/* ++ * Attempt to shrink the dcache memory. This is simply a functional ++ * to ensure we can correctly call the shrinker. We don't check that ++ * the cache actually decreased because we have no control over what ++ * else may be running on the system. This avoid false positives. ++ */ ++static int ++splat_linux_test1(struct file *file, void *arg) ++{ ++ int remain_before; ++ int remain_after; ++ ++ remain_before = shrink_dcache_memory(0, GFP_KERNEL); ++ remain_after = shrink_dcache_memory(KMC_REAP_CHUNK, GFP_KERNEL); ++ ++ splat_vprint(file, SPLAT_LINUX_TEST1_NAME, ++ "Shrink dcache memory, remain %d -> %d\n", ++ remain_before, remain_after); ++ ++ return 0; ++} ++ ++/* ++ * Attempt to shrink the icache memory. This is simply a functional ++ * to ensure we can correctly call the shrinker. We don't check that ++ * the cache actually decreased because we have no control over what ++ * else may be running on the system. This avoid false positives. ++ */ ++static int ++splat_linux_test2(struct file *file, void *arg) ++{ ++ int remain_before; ++ int remain_after; ++ ++ remain_before = shrink_icache_memory(0, GFP_KERNEL); ++ remain_after = shrink_icache_memory(KMC_REAP_CHUNK, GFP_KERNEL); ++ ++ splat_vprint(file, SPLAT_LINUX_TEST2_NAME, ++ "Shrink icache memory, remain %d -> %d\n", ++ remain_before, remain_after); ++ ++ return 0; ++} ++ ++SPL_SHRINKER_CALLBACK_FWD_DECLARE(splat_linux_shrinker_fn); ++SPL_SHRINKER_DECLARE(splat_linux_shrinker, splat_linux_shrinker_fn, 1); ++static unsigned long splat_linux_shrinker_size = 0; ++static struct file *splat_linux_shrinker_file = NULL; ++ ++static int ++__splat_linux_shrinker_fn(struct shrinker *shrink, struct shrink_control *sc) ++{ ++ static int failsafe = 0; ++ ++ if (sc->nr_to_scan) { ++ splat_linux_shrinker_size = splat_linux_shrinker_size - ++ MIN(sc->nr_to_scan, splat_linux_shrinker_size); ++ ++ splat_vprint(splat_linux_shrinker_file, SPLAT_LINUX_TEST3_NAME, ++ "Reclaimed %lu objects, size now %lu\n", ++ sc->nr_to_scan, splat_linux_shrinker_size); ++ } else { ++ splat_vprint(splat_linux_shrinker_file, SPLAT_LINUX_TEST3_NAME, ++ "Cache size is %lu\n", splat_linux_shrinker_size); ++ } ++ ++ /* Far more calls than expected abort drop_slab as a failsafe */ ++ if ((++failsafe % 1000) == 0) { ++ splat_vprint(splat_linux_shrinker_file, SPLAT_LINUX_TEST3_NAME, ++ "Far more calls than expected (%d), size now %lu\n", ++ failsafe, splat_linux_shrinker_size); ++ return -1; ++ } ++ ++ return (int)splat_linux_shrinker_size; ++} ++ ++SPL_SHRINKER_CALLBACK_WRAPPER(splat_linux_shrinker_fn); ++ ++#define DROP_SLAB_CMD \ ++ "exec 0/proc/sys/vm/drop_caches " \ ++ " 2>/dev/null; " \ ++ "echo 2" ++ ++static int ++splat_linux_drop_slab(struct file *file) ++{ ++ char *argv[] = { "/bin/sh", ++ "-c", ++ DROP_SLAB_CMD, ++ NULL }; ++ char *envp[] = { "HOME=/", ++ "TERM=linux", ++ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", ++ NULL }; ++ 
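/* Editor's note (descriptive comment, not part of the original patch):
++ * this helper shells out via call_usermodehelper() so that the value 2
++ * gets written to /proc/sys/vm/drop_caches -- the in-kernel equivalent
++ * of running, from userspace:
++ *
++ *   echo 2 > /proc/sys/vm/drop_caches
++ *
++ * which asks the kernel to drop reclaimable slab objects and therefore
++ * invokes every registered shrinker, including the test shrinker set up
++ * by splat_linux_test3() below.
++ */ ++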
int rc; ++ ++ rc = call_usermodehelper(argv[0], argv, envp, 1); ++ if (rc) ++ splat_vprint(file, SPLAT_LINUX_TEST3_NAME, ++ "Failed user helper '%s %s %s', rc = %d\n", ++ argv[0], argv[1], argv[2], rc); ++ ++ return rc; ++} ++ ++/* ++ * Verify correct shrinker functionality by registering a shrinker ++ * with the required compatibility macros. We then use a simulated ++ * cache and force the systems caches to be dropped. The shrinker ++ * should be repeatedly called until it reports that the cache is ++ * empty. It is then cleanly unregistered and correct behavior is ++ * verified. There are now four slightly different supported shrinker ++ * API and this test ensures the compatibility code is correct. ++ */ ++static int ++splat_linux_test3(struct file *file, void *arg) ++{ ++ int rc = -EINVAL; ++ ++ /* ++ * Globals used by the shrinker, it is not safe to run this ++ * test concurrently this is a safe assumption for SPLAT tests. ++ * Regardless we do some minimal checking a bail if concurrent ++ * use is detected. ++ */ ++ if (splat_linux_shrinker_size || splat_linux_shrinker_file) { ++ splat_vprint(file, SPLAT_LINUX_TEST3_NAME, ++ "Failed due to concurrent shrinker test, rc = %d\n", rc); ++ return (rc); ++ } ++ ++ splat_linux_shrinker_size = 1024; ++ splat_linux_shrinker_file = file; ++ ++ spl_register_shrinker(&splat_linux_shrinker); ++ rc = splat_linux_drop_slab(file); ++ if (rc) ++ goto out; ++ ++ if (splat_linux_shrinker_size != 0) { ++ splat_vprint(file, SPLAT_LINUX_TEST3_NAME, ++ "Failed cache was not shrunk to 0, size now %lu", ++ splat_linux_shrinker_size); ++ rc = -EDOM; ++ } ++out: ++ spl_unregister_shrinker(&splat_linux_shrinker); ++ ++ splat_linux_shrinker_size = 0; ++ splat_linux_shrinker_file = NULL; ++ ++ return rc; ++} ++ ++splat_subsystem_t * ++splat_linux_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_LINUX_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_LINUX_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_LINUX; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_LINUX_TEST1_NAME, SPLAT_LINUX_TEST1_DESC, ++ SPLAT_LINUX_TEST1_ID, splat_linux_test1); ++ SPLAT_TEST_INIT(sub, SPLAT_LINUX_TEST2_NAME, SPLAT_LINUX_TEST2_DESC, ++ SPLAT_LINUX_TEST2_ID, splat_linux_test2); ++ SPLAT_TEST_INIT(sub, SPLAT_LINUX_TEST3_NAME, SPLAT_LINUX_TEST3_DESC, ++ SPLAT_LINUX_TEST3_ID, splat_linux_test3); ++ ++ return sub; ++} ++ ++void ++splat_linux_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ SPLAT_TEST_FINI(sub, SPLAT_LINUX_TEST3_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_LINUX_TEST2_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_LINUX_TEST1_ID); ++ ++ kfree(sub); ++} ++ ++int ++splat_linux_id(void) { ++ return SPLAT_SUBSYSTEM_LINUX; ++} +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-list.c linux-3.2.33-go/spl/splat/splat-list.c +--- linux-3.2.33-go.orig/spl/splat/splat-list.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-list.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,475 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . 
++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) List Tests. ++\*****************************************************************************/ ++ ++#include ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_LIST_NAME "list" ++#define SPLAT_LIST_DESC "Kernel List Tests" ++ ++#define SPLAT_LIST_TEST1_ID 0x0c01 ++#define SPLAT_LIST_TEST1_NAME "create/destroy" ++#define SPLAT_LIST_TEST1_DESC "Create/destroy Test" ++ ++#define SPLAT_LIST_TEST2_ID 0x0c02 ++#define SPLAT_LIST_TEST2_NAME "ins/rm head" ++#define SPLAT_LIST_TEST2_DESC "Insert/remove head Test" ++ ++#define SPLAT_LIST_TEST3_ID 0x0c03 ++#define SPLAT_LIST_TEST3_NAME "ins/rm tail" ++#define SPLAT_LIST_TEST3_DESC "Insert/remove tail Test" ++ ++#define SPLAT_LIST_TEST4_ID 0x0c04 ++#define SPLAT_LIST_TEST4_NAME "insert_after" ++#define SPLAT_LIST_TEST4_DESC "Insert_after Test" ++ ++#define SPLAT_LIST_TEST5_ID 0x0c05 ++#define SPLAT_LIST_TEST5_NAME "insert_before" ++#define SPLAT_LIST_TEST5_DESC "Insert_before Test" ++ ++#define SPLAT_LIST_TEST6_ID 0x0c06 ++#define SPLAT_LIST_TEST6_NAME "remove" ++#define SPLAT_LIST_TEST6_DESC "Remove Test" ++ ++#define SPLAT_LIST_TEST7_ID 0x0c7 ++#define SPLAT_LIST_TEST7_NAME "active" ++#define SPLAT_LIST_TEST7_DESC "Active Test" ++ ++/* It is important that li_node is not the first element, this ++ * ensures the list_d2l/list_object macros are working correctly. */ ++typedef struct list_item { ++ int li_data; ++ list_node_t li_node; ++} list_item_t; ++ ++#define LIST_ORDER_STACK 0 ++#define LIST_ORDER_QUEUE 1 ++ ++static int ++splat_list_test1(struct file *file, void *arg) ++{ ++ list_t list; ++ ++ splat_vprint(file, SPLAT_LIST_TEST1_NAME, "Creating list\n%s", ""); ++ list_create(&list, sizeof(list_item_t), offsetof(list_item_t, li_node)); ++ ++ if (!list_is_empty(&list)) { ++ splat_vprint(file, SPLAT_LIST_TEST1_NAME, ++ "New list NOT empty%s\n", ""); ++ /* list_destroy() intentionally skipped to avoid assert */ ++ return -EEXIST; ++ } ++ ++ splat_vprint(file, SPLAT_LIST_TEST1_NAME, "Destroying list\n%s", ""); ++ list_destroy(&list); ++ ++ /* Validate the list has been destroyed */ ++ if (list_link_active(&list.list_head)) { ++ splat_vprint(file, SPLAT_LIST_TEST1_NAME, ++ "Destroyed list still active%s", ""); ++ return -EIO; ++ } ++ ++ return 0; ++} ++ ++static int ++splat_list_validate(list_t *list, int size, int order, int mult) ++{ ++ list_item_t *li; ++ int i; ++ ++ /* Walk all items in list from head to verify stack or queue ++ * ordering. We bound the for loop by size+1 to ensure that ++ * we still terminate if there is list corruption. 
We also ++ * intentionally make things a little more complex than they ++ * need to be by using list_head/list_next for queues, and ++ * list_tail/list_prev for stacks. This is simply done for ++ * coverage and to ensure these function are working right. ++ */ ++ for (i = 0, li = (order ? list_head(list) : list_tail(list)); ++ i < size + 1 && li != NULL; ++ i++, li = (order ? list_next(list, li) : list_prev(list, li))) ++ if (li->li_data != i * mult) ++ return -EIDRM; ++ ++ if (i != size) ++ return -E2BIG; ++ ++ return 0; ++} ++ ++static int ++splat_list_test2(struct file *file, void *arg) ++{ ++ list_t list; ++ list_item_t *li; ++ int i, list_size = 8, rc = 0; ++ ++ splat_vprint(file, SPLAT_LIST_TEST2_NAME, "Creating list\n%s", ""); ++ list_create(&list, sizeof(list_item_t), offsetof(list_item_t, li_node)); ++ ++ /* Insert all items at the list head to form a stack */ ++ splat_vprint(file, SPLAT_LIST_TEST2_NAME, ++ "Adding %d items to list head\n", list_size); ++ for (i = 0; i < list_size; i++) { ++ li = kmem_alloc(sizeof(list_item_t), KM_SLEEP); ++ if (li == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ list_link_init(&li->li_node); ++ li->li_data = i; ++ list_insert_head(&list, li); ++ } ++ ++ splat_vprint(file, SPLAT_LIST_TEST2_NAME, ++ "Validating %d item list is a stack\n", list_size); ++ rc = splat_list_validate(&list, list_size, LIST_ORDER_STACK, 1); ++ if (rc) ++ splat_vprint(file, SPLAT_LIST_TEST2_NAME, ++ "List validation failed, %d\n", rc); ++out: ++ /* Remove all items */ ++ splat_vprint(file, SPLAT_LIST_TEST2_NAME, ++ "Removing %d items from list head\n", list_size); ++ while ((li = list_remove_head(&list))) ++ kmem_free(li, sizeof(list_item_t)); ++ ++ splat_vprint(file, SPLAT_LIST_TEST2_NAME, "Destroying list\n%s", ""); ++ list_destroy(&list); ++ ++ return rc; ++} ++ ++static int ++splat_list_test3(struct file *file, void *arg) ++{ ++ list_t list; ++ list_item_t *li; ++ int i, list_size = 8, rc = 0; ++ ++ splat_vprint(file, SPLAT_LIST_TEST3_NAME, "Creating list\n%s", ""); ++ list_create(&list, sizeof(list_item_t), offsetof(list_item_t, li_node)); ++ ++ /* Insert all items at the list tail to form a queue */ ++ splat_vprint(file, SPLAT_LIST_TEST3_NAME, ++ "Adding %d items to list tail\n", list_size); ++ for (i = 0; i < list_size; i++) { ++ li = kmem_alloc(sizeof(list_item_t), KM_SLEEP); ++ if (li == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ list_link_init(&li->li_node); ++ li->li_data = i; ++ list_insert_tail(&list, li); ++ } ++ ++ splat_vprint(file, SPLAT_LIST_TEST3_NAME, ++ "Validating %d item list is a queue\n", list_size); ++ rc = splat_list_validate(&list, list_size, LIST_ORDER_QUEUE, 1); ++ if (rc) ++ splat_vprint(file, SPLAT_LIST_TEST3_NAME, ++ "List validation failed, %d\n", rc); ++out: ++ /* Remove all items */ ++ splat_vprint(file, SPLAT_LIST_TEST3_NAME, ++ "Removing %d items from list tail\n", list_size); ++ while ((li = list_remove_tail(&list))) ++ kmem_free(li, sizeof(list_item_t)); ++ ++ splat_vprint(file, SPLAT_LIST_TEST3_NAME, "Destroying list\n%s", ""); ++ list_destroy(&list); ++ ++ return rc; ++} ++ ++static int ++splat_list_test4(struct file *file, void *arg) ++{ ++ list_t list; ++ list_item_t *li_new, *li_last = NULL; ++ int i, list_size = 8, rc = 0; ++ ++ splat_vprint(file, SPLAT_LIST_TEST4_NAME, "Creating list\n%s", ""); ++ list_create(&list, sizeof(list_item_t), offsetof(list_item_t, li_node)); ++ ++ /* Insert all items after the last item to form a queue */ ++ splat_vprint(file, SPLAT_LIST_TEST4_NAME, ++ "Adding %d items each after the last 
item\n", list_size); ++ for (i = 0; i < list_size; i++) { ++ li_new = kmem_alloc(sizeof(list_item_t), KM_SLEEP); ++ if (li_new == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ list_link_init(&li_new->li_node); ++ li_new->li_data = i; ++ list_insert_after(&list, li_last, li_new); ++ li_last = li_new; ++ } ++ ++ splat_vprint(file, SPLAT_LIST_TEST4_NAME, ++ "Validating %d item list is a queue\n", list_size); ++ rc = splat_list_validate(&list, list_size, LIST_ORDER_QUEUE, 1); ++ if (rc) ++ splat_vprint(file, SPLAT_LIST_TEST4_NAME, ++ "List validation failed, %d\n", rc); ++out: ++ /* Remove all items */ ++ splat_vprint(file, SPLAT_LIST_TEST4_NAME, ++ "Removing %d items from list tail\n", list_size); ++ while ((li_new = list_remove_head(&list))) ++ kmem_free(li_new, sizeof(list_item_t)); ++ ++ splat_vprint(file, SPLAT_LIST_TEST4_NAME, "Destroying list\n%s", ""); ++ list_destroy(&list); ++ ++ return rc; ++} ++ ++static int ++splat_list_test5(struct file *file, void *arg) ++{ ++ list_t list; ++ list_item_t *li_new, *li_last = NULL; ++ int i, list_size = 8, rc = 0; ++ ++ splat_vprint(file, SPLAT_LIST_TEST5_NAME, "Creating list\n%s", ""); ++ list_create(&list, sizeof(list_item_t), offsetof(list_item_t, li_node)); ++ ++ /* Insert all items before the last item to form a stack */ ++ splat_vprint(file, SPLAT_LIST_TEST5_NAME, ++ "Adding %d items each before the last item\n", list_size); ++ for (i = 0; i < list_size; i++) { ++ li_new = kmem_alloc(sizeof(list_item_t), KM_SLEEP); ++ if (li_new == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ list_link_init(&li_new->li_node); ++ li_new->li_data = i; ++ list_insert_before(&list, li_last, li_new); ++ li_last = li_new; ++ } ++ ++ splat_vprint(file, SPLAT_LIST_TEST5_NAME, ++ "Validating %d item list is a queue\n", list_size); ++ rc = splat_list_validate(&list, list_size, LIST_ORDER_STACK, 1); ++ if (rc) ++ splat_vprint(file, SPLAT_LIST_TEST5_NAME, ++ "List validation failed, %d\n", rc); ++out: ++ /* Remove all items */ ++ splat_vprint(file, SPLAT_LIST_TEST5_NAME, ++ "Removing %d items from list tail\n", list_size); ++ while ((li_new = list_remove_tail(&list))) ++ kmem_free(li_new, sizeof(list_item_t)); ++ ++ splat_vprint(file, SPLAT_LIST_TEST5_NAME, "Destroying list\n%s", ""); ++ list_destroy(&list); ++ ++ return rc; ++} ++ ++static int ++splat_list_test6(struct file *file, void *arg) ++{ ++ list_t list; ++ list_item_t *li, *li_prev; ++ int i, list_size = 8, rc = 0; ++ ++ splat_vprint(file, SPLAT_LIST_TEST6_NAME, "Creating list\n%s", ""); ++ list_create(&list, sizeof(list_item_t), offsetof(list_item_t, li_node)); ++ ++ /* Insert all items at the list tail to form a queue */ ++ splat_vprint(file, SPLAT_LIST_TEST6_NAME, ++ "Adding %d items to list tail\n", list_size); ++ for (i = 0; i < list_size; i++) { ++ li = kmem_alloc(sizeof(list_item_t), KM_SLEEP); ++ if (li == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ list_link_init(&li->li_node); ++ li->li_data = i; ++ list_insert_tail(&list, li); ++ } ++ ++ /* Remove all odd items from the queue */ ++ splat_vprint(file, SPLAT_LIST_TEST6_NAME, ++ "Removing %d odd items from the list\n", list_size >> 1); ++ for (li = list_head(&list); li != NULL; li = list_next(&list, li)) { ++ if (li->li_data % 2 == 1) { ++ li_prev = list_prev(&list, li); ++ list_remove(&list, li); ++ kmem_free(li, sizeof(list_item_t)); ++ li = li_prev; ++ } ++ } ++ ++ splat_vprint(file, SPLAT_LIST_TEST6_NAME, "Validating %d item " ++ "list is a queue of only even elements\n", list_size / 2); ++ rc = splat_list_validate(&list, list_size / 2, 
LIST_ORDER_QUEUE, 2); ++ if (rc) ++ splat_vprint(file, SPLAT_LIST_TEST6_NAME, ++ "List validation failed, %d\n", rc); ++out: ++ /* Remove all items */ ++ splat_vprint(file, SPLAT_LIST_TEST6_NAME, ++ "Removing %d items from list tail\n", list_size / 2); ++ while ((li = list_remove_tail(&list))) ++ kmem_free(li, sizeof(list_item_t)); ++ ++ splat_vprint(file, SPLAT_LIST_TEST6_NAME, "Destroying list\n%s", ""); ++ list_destroy(&list); ++ ++ return rc; ++} ++ ++static int ++splat_list_test7(struct file *file, void *arg) ++{ ++ list_t list; ++ list_item_t *li; ++ int rc = 0; ++ ++ splat_vprint(file, SPLAT_LIST_TEST7_NAME, "Creating list\n%s", ""); ++ list_create(&list, sizeof(list_item_t), offsetof(list_item_t, li_node)); ++ ++ li = kmem_alloc(sizeof(list_item_t), KM_SLEEP); ++ if (li == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ /* Validate newly initialized node is inactive */ ++ splat_vprint(file, SPLAT_LIST_TEST7_NAME, "Init list node\n%s", ""); ++ list_link_init(&li->li_node); ++ if (list_link_active(&li->li_node)) { ++ splat_vprint(file, SPLAT_LIST_TEST7_NAME, "Newly initialized " ++ "list node should inactive %p/%p\n", ++ li->li_node.prev, li->li_node.next); ++ rc = -EINVAL; ++ goto out_li; ++ } ++ ++ /* Validate node is active when linked in to a list */ ++ splat_vprint(file, SPLAT_LIST_TEST7_NAME, "Insert list node\n%s", ""); ++ list_insert_head(&list, li); ++ if (!list_link_active(&li->li_node)) { ++ splat_vprint(file, SPLAT_LIST_TEST7_NAME, "List node " ++ "inserted in list should be active %p/%p\n", ++ li->li_node.prev, li->li_node.next); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ /* Validate node is inactive when removed from list */ ++ splat_vprint(file, SPLAT_LIST_TEST7_NAME, "Remove list node\n%s", ""); ++ list_remove(&list, li); ++ if (list_link_active(&li->li_node)) { ++ splat_vprint(file, SPLAT_LIST_TEST7_NAME, "List node " ++ "removed from list should be inactive %p/%p\n", ++ li->li_node.prev, li->li_node.next); ++ rc = -EINVAL; ++ } ++out_li: ++ kmem_free(li, sizeof(list_item_t)); ++out: ++ /* Remove all items */ ++ while ((li = list_remove_head(&list))) ++ kmem_free(li, sizeof(list_item_t)); ++ ++ splat_vprint(file, SPLAT_LIST_TEST7_NAME, "Destroying list\n%s", ""); ++ list_destroy(&list); ++ ++ return rc; ++} ++ ++splat_subsystem_t * ++splat_list_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_LIST_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_LIST_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_LIST; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_LIST_TEST1_NAME, SPLAT_LIST_TEST1_DESC, ++ SPLAT_LIST_TEST1_ID, splat_list_test1); ++ SPLAT_TEST_INIT(sub, SPLAT_LIST_TEST2_NAME, SPLAT_LIST_TEST2_DESC, ++ SPLAT_LIST_TEST2_ID, splat_list_test2); ++ SPLAT_TEST_INIT(sub, SPLAT_LIST_TEST3_NAME, SPLAT_LIST_TEST3_DESC, ++ SPLAT_LIST_TEST3_ID, splat_list_test3); ++ SPLAT_TEST_INIT(sub, SPLAT_LIST_TEST4_NAME, SPLAT_LIST_TEST4_DESC, ++ SPLAT_LIST_TEST4_ID, splat_list_test4); ++ SPLAT_TEST_INIT(sub, SPLAT_LIST_TEST5_NAME, SPLAT_LIST_TEST5_DESC, ++ SPLAT_LIST_TEST5_ID, splat_list_test5); ++ SPLAT_TEST_INIT(sub, SPLAT_LIST_TEST6_NAME, SPLAT_LIST_TEST6_DESC, ++ SPLAT_LIST_TEST6_ID, splat_list_test6); ++ SPLAT_TEST_INIT(sub, SPLAT_LIST_TEST7_NAME, SPLAT_LIST_TEST7_DESC, ++ SPLAT_LIST_TEST7_ID, splat_list_test7); ++ ++ return 
sub; ++} ++ ++void ++splat_list_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ ++ SPLAT_TEST_FINI(sub, SPLAT_LIST_TEST7_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_LIST_TEST6_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_LIST_TEST5_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_LIST_TEST4_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_LIST_TEST3_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_LIST_TEST2_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_LIST_TEST1_ID); ++ ++ kfree(sub); ++} ++ ++int ++splat_list_id(void) ++{ ++ return SPLAT_SUBSYSTEM_LIST; ++} +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-mutex.c linux-3.2.33-go/spl/splat/splat-mutex.c +--- linux-3.2.33-go.orig/spl/splat/splat-mutex.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-mutex.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,439 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Mutex Tests. 
++\*****************************************************************************/ ++ ++#include ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_MUTEX_NAME "mutex" ++#define SPLAT_MUTEX_DESC "Kernel Mutex Tests" ++ ++#define SPLAT_MUTEX_TEST1_ID 0x0401 ++#define SPLAT_MUTEX_TEST1_NAME "tryenter" ++#define SPLAT_MUTEX_TEST1_DESC "Validate mutex_tryenter() correctness" ++ ++#define SPLAT_MUTEX_TEST2_ID 0x0402 ++#define SPLAT_MUTEX_TEST2_NAME "race" ++#define SPLAT_MUTEX_TEST2_DESC "Many threads entering/exiting the mutex" ++ ++#define SPLAT_MUTEX_TEST3_ID 0x0403 ++#define SPLAT_MUTEX_TEST3_NAME "owned" ++#define SPLAT_MUTEX_TEST3_DESC "Validate mutex_owned() correctness" ++ ++#define SPLAT_MUTEX_TEST4_ID 0x0404 ++#define SPLAT_MUTEX_TEST4_NAME "owner" ++#define SPLAT_MUTEX_TEST4_DESC "Validate mutex_owner() correctness" ++ ++#define SPLAT_MUTEX_TEST_MAGIC 0x115599DDUL ++#define SPLAT_MUTEX_TEST_NAME "mutex_test" ++#define SPLAT_MUTEX_TEST_TASKQ "mutex_taskq" ++#define SPLAT_MUTEX_TEST_COUNT 128 ++ ++typedef struct mutex_priv { ++ unsigned long mp_magic; ++ struct file *mp_file; ++ kmutex_t mp_mtx; ++ int mp_rc; ++ int mp_rc2; ++} mutex_priv_t; ++ ++static void ++splat_mutex_test1_func(void *arg) ++{ ++ mutex_priv_t *mp = (mutex_priv_t *)arg; ++ ASSERT(mp->mp_magic == SPLAT_MUTEX_TEST_MAGIC); ++ ++ if (mutex_tryenter(&mp->mp_mtx)) { ++ mp->mp_rc = 0; ++ mutex_exit(&mp->mp_mtx); ++ } else { ++ mp->mp_rc = -EBUSY; ++ } ++} ++ ++static int ++splat_mutex_test1(struct file *file, void *arg) ++{ ++ mutex_priv_t *mp; ++ taskq_t *tq; ++ int id, rc = 0; ++ ++ mp = (mutex_priv_t *)kmalloc(sizeof(*mp), GFP_KERNEL); ++ if (mp == NULL) ++ return -ENOMEM; ++ ++ tq = taskq_create(SPLAT_MUTEX_TEST_TASKQ, 1, maxclsyspri, ++ 50, INT_MAX, TASKQ_PREPOPULATE); ++ if (tq == NULL) { ++ rc = -ENOMEM; ++ goto out2; ++ } ++ ++ mp->mp_magic = SPLAT_MUTEX_TEST_MAGIC; ++ mp->mp_file = file; ++ mutex_init(&mp->mp_mtx, SPLAT_MUTEX_TEST_NAME, MUTEX_DEFAULT, NULL); ++ mutex_enter(&mp->mp_mtx); ++ ++ /* ++ * Schedule a task function which will try and acquire the mutex via ++ * mutex_tryenter() while it's held. This should fail and the task ++ * function will indicate this status in the passed private data. ++ */ ++ mp->mp_rc = -EINVAL; ++ id = taskq_dispatch(tq, splat_mutex_test1_func, mp, TQ_SLEEP); ++ if (id == 0) { ++ mutex_exit(&mp->mp_mtx); ++ splat_vprint(file, SPLAT_MUTEX_TEST1_NAME, "%s", ++ "taskq_dispatch() failed\n"); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ taskq_wait_id(tq, id); ++ mutex_exit(&mp->mp_mtx); ++ ++ /* Task function successfully acquired mutex, very bad! */ ++ if (mp->mp_rc != -EBUSY) { ++ splat_vprint(file, SPLAT_MUTEX_TEST1_NAME, ++ "mutex_trylock() incorrectly succeeded when " ++ "the mutex was held, %d/%d\n", id, mp->mp_rc); ++ rc = -EINVAL; ++ goto out; ++ } else { ++ splat_vprint(file, SPLAT_MUTEX_TEST1_NAME, "%s", ++ "mutex_trylock() correctly failed when " ++ "the mutex was held\n"); ++ } ++ ++ /* ++ * Schedule a task function which will try and acquire the mutex via ++ * mutex_tryenter() while it is not held. This should succeed and ++ * can be verified by checking the private data. ++ */ ++ mp->mp_rc = -EINVAL; ++ id = taskq_dispatch(tq, splat_mutex_test1_func, mp, TQ_SLEEP); ++ if (id == 0) { ++ splat_vprint(file, SPLAT_MUTEX_TEST1_NAME, "%s", ++ "taskq_dispatch() failed\n"); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ taskq_wait_id(tq, id); ++ ++ /* Task function failed to acquire mutex, very bad! 
*/ ++ if (mp->mp_rc != 0) { ++ splat_vprint(file, SPLAT_MUTEX_TEST1_NAME, ++ "mutex_trylock() incorrectly failed when " ++ "the mutex was not held, %d/%d\n", id, mp->mp_rc); ++ rc = -EINVAL; ++ } else { ++ splat_vprint(file, SPLAT_MUTEX_TEST1_NAME, "%s", ++ "mutex_trylock() correctly succeeded " ++ "when the mutex was not held\n"); ++ } ++out: ++ taskq_destroy(tq); ++ mutex_destroy(&(mp->mp_mtx)); ++out2: ++ kfree(mp); ++ return rc; ++} ++ ++static void ++splat_mutex_test2_func(void *arg) ++{ ++ mutex_priv_t *mp = (mutex_priv_t *)arg; ++ int rc; ++ ASSERT(mp->mp_magic == SPLAT_MUTEX_TEST_MAGIC); ++ ++ /* Read the value before sleeping and write it after we wake up to ++ * maximize the chance of a race if mutexs are not working properly */ ++ mutex_enter(&mp->mp_mtx); ++ rc = mp->mp_rc; ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(HZ / 100); /* 1/100 of a second */ ++ VERIFY(mp->mp_rc == rc); ++ mp->mp_rc = rc + 1; ++ mutex_exit(&mp->mp_mtx); ++} ++ ++static int ++splat_mutex_test2(struct file *file, void *arg) ++{ ++ mutex_priv_t *mp; ++ taskq_t *tq; ++ int i, rc = 0; ++ ++ mp = (mutex_priv_t *)kmalloc(sizeof(*mp), GFP_KERNEL); ++ if (mp == NULL) ++ return -ENOMEM; ++ ++ /* Create several threads allowing tasks to race with each other */ ++ tq = taskq_create(SPLAT_MUTEX_TEST_TASKQ, num_online_cpus(), ++ maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); ++ if (tq == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ mp->mp_magic = SPLAT_MUTEX_TEST_MAGIC; ++ mp->mp_file = file; ++ mutex_init(&(mp->mp_mtx), SPLAT_MUTEX_TEST_NAME, MUTEX_DEFAULT, NULL); ++ mp->mp_rc = 0; ++ ++ /* ++ * Schedule N work items to the work queue each of which enters the ++ * mutex, sleeps briefly, then exits the mutex. On a multiprocessor ++ * box these work items will be handled by all available CPUs. The ++ * task function checks to ensure the tracked shared variable is ++ * always only incremented by one. Additionally, the mutex itself ++ * is instrumented such that if any two processors are in the ++ * critical region at the same time the system will panic. If the ++ * mutex is implemented right this will never happy, that's a pass. 
++ */ ++ for (i = 0; i < SPLAT_MUTEX_TEST_COUNT; i++) { ++ if (!taskq_dispatch(tq, splat_mutex_test2_func, mp, TQ_SLEEP)) { ++ splat_vprint(file, SPLAT_MUTEX_TEST2_NAME, ++ "Failed to queue task %d\n", i); ++ rc = -EINVAL; ++ } ++ } ++ ++ taskq_wait(tq); ++ ++ if (mp->mp_rc == SPLAT_MUTEX_TEST_COUNT) { ++ splat_vprint(file, SPLAT_MUTEX_TEST2_NAME, "%d racing threads " ++ "correctly entered/exited the mutex %d times\n", ++ num_online_cpus(), mp->mp_rc); ++ } else { ++ splat_vprint(file, SPLAT_MUTEX_TEST2_NAME, "%d racing threads " ++ "only processed %d/%d mutex work items\n", ++ num_online_cpus(),mp->mp_rc,SPLAT_MUTEX_TEST_COUNT); ++ rc = -EINVAL; ++ } ++ ++ taskq_destroy(tq); ++ mutex_destroy(&(mp->mp_mtx)); ++out: ++ kfree(mp); ++ return rc; ++} ++ ++static void ++splat_mutex_owned(void *priv) ++{ ++ mutex_priv_t *mp = (mutex_priv_t *)priv; ++ ++ ASSERT(mp->mp_magic == SPLAT_MUTEX_TEST_MAGIC); ++ mp->mp_rc = mutex_owned(&mp->mp_mtx); ++ mp->mp_rc2 = MUTEX_HELD(&mp->mp_mtx); ++} ++ ++static int ++splat_mutex_test3(struct file *file, void *arg) ++{ ++ mutex_priv_t mp; ++ taskq_t *tq; ++ int rc = 0; ++ ++ mp.mp_magic = SPLAT_MUTEX_TEST_MAGIC; ++ mp.mp_file = file; ++ mutex_init(&mp.mp_mtx, SPLAT_MUTEX_TEST_NAME, MUTEX_DEFAULT, NULL); ++ ++ if ((tq = taskq_create(SPLAT_MUTEX_TEST_NAME, 1, maxclsyspri, ++ 50, INT_MAX, TASKQ_PREPOPULATE)) == NULL) { ++ splat_vprint(file, SPLAT_MUTEX_TEST3_NAME, "Taskq '%s' " ++ "create failed\n", SPLAT_MUTEX_TEST3_NAME); ++ return -EINVAL; ++ } ++ ++ mutex_enter(&mp.mp_mtx); ++ ++ /* Mutex should be owned by current */ ++ if (!mutex_owned(&mp.mp_mtx)) { ++ splat_vprint(file, SPLAT_MUTEX_TEST3_NAME, "Unowned mutex " ++ "should be owned by pid %d\n", current->pid); ++ rc = -EINVAL; ++ goto out_exit; ++ } ++ ++ if (taskq_dispatch(tq, splat_mutex_owned, &mp, TQ_SLEEP) == 0) { ++ splat_vprint(file, SPLAT_MUTEX_TEST3_NAME, "Failed to " ++ "dispatch function '%s' to taskq\n", ++ sym2str(splat_mutex_owned)); ++ rc = -EINVAL; ++ goto out_exit; ++ } ++ taskq_wait(tq); ++ ++ /* Mutex should not be owned which checked from a different thread */ ++ if (mp.mp_rc || mp.mp_rc2) { ++ splat_vprint(file, SPLAT_MUTEX_TEST3_NAME, "Mutex owned by " ++ "pid %d not by taskq\n", current->pid); ++ rc = -EINVAL; ++ goto out_exit; ++ } ++ ++ mutex_exit(&mp.mp_mtx); ++ ++ /* Mutex should not be owned by current */ ++ if (mutex_owned(&mp.mp_mtx)) { ++ splat_vprint(file, SPLAT_MUTEX_TEST3_NAME, "Mutex owned by " ++ "pid %d it should be unowned\b", current->pid); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ if (taskq_dispatch(tq, splat_mutex_owned, &mp, TQ_SLEEP) == 0) { ++ splat_vprint(file, SPLAT_MUTEX_TEST3_NAME, "Failed to " ++ "dispatch function '%s' to taskq\n", ++ sym2str(splat_mutex_owned)); ++ rc = -EINVAL; ++ goto out; ++ } ++ taskq_wait(tq); ++ ++ /* Mutex should be owned by no one */ ++ if (mp.mp_rc || mp.mp_rc2) { ++ splat_vprint(file, SPLAT_MUTEX_TEST3_NAME, "Mutex owned by " ++ "no one, %d/%d disagrees\n", mp.mp_rc, mp.mp_rc2); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ splat_vprint(file, SPLAT_MUTEX_TEST3_NAME, "%s", ++ "Correct mutex_owned() behavior\n"); ++ goto out; ++out_exit: ++ mutex_exit(&mp.mp_mtx); ++out: ++ mutex_destroy(&mp.mp_mtx); ++ taskq_destroy(tq); ++ ++ return rc; ++} ++ ++static int ++splat_mutex_test4(struct file *file, void *arg) ++{ ++ kmutex_t mtx; ++ kthread_t *owner; ++ int rc = 0; ++ ++ mutex_init(&mtx, SPLAT_MUTEX_TEST_NAME, MUTEX_DEFAULT, NULL); ++ ++ /* ++ * Verify mutex owner is cleared after being dropped. 
Depending ++ * on how you build your kernel this behavior changes, ensure the ++ * SPL mutex implementation is properly detecting this. ++ */ ++ mutex_enter(&mtx); ++ msleep(100); ++ mutex_exit(&mtx); ++ if (MUTEX_HELD(&mtx)) { ++ splat_vprint(file, SPLAT_MUTEX_TEST4_NAME, "Mutex should " ++ "not be held, bit is by %p\n", mutex_owner(&mtx)); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ mutex_enter(&mtx); ++ ++ /* Mutex should be owned by current */ ++ owner = mutex_owner(&mtx); ++ if (current != owner) { ++ splat_vprint(file, SPLAT_MUTEX_TEST4_NAME, "Mutex should " ++ "be owned by pid %d but is owned by pid %d\n", ++ current->pid, owner ? owner->pid : -1); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ mutex_exit(&mtx); ++ ++ /* Mutex should not be owned by any task */ ++ owner = mutex_owner(&mtx); ++ if (owner) { ++ splat_vprint(file, SPLAT_MUTEX_TEST4_NAME, "Mutex should not " ++ "be owned but is owned by pid %d\n", owner->pid); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ splat_vprint(file, SPLAT_MUTEX_TEST3_NAME, "%s", ++ "Correct mutex_owner() behavior\n"); ++out: ++ mutex_destroy(&mtx); ++ ++ return rc; ++} ++ ++splat_subsystem_t * ++splat_mutex_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_MUTEX_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_MUTEX_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_MUTEX; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_MUTEX_TEST1_NAME, SPLAT_MUTEX_TEST1_DESC, ++ SPLAT_MUTEX_TEST1_ID, splat_mutex_test1); ++ SPLAT_TEST_INIT(sub, SPLAT_MUTEX_TEST2_NAME, SPLAT_MUTEX_TEST2_DESC, ++ SPLAT_MUTEX_TEST2_ID, splat_mutex_test2); ++ SPLAT_TEST_INIT(sub, SPLAT_MUTEX_TEST3_NAME, SPLAT_MUTEX_TEST3_DESC, ++ SPLAT_MUTEX_TEST3_ID, splat_mutex_test3); ++ SPLAT_TEST_INIT(sub, SPLAT_MUTEX_TEST4_NAME, SPLAT_MUTEX_TEST4_DESC, ++ SPLAT_MUTEX_TEST4_ID, splat_mutex_test4); ++ ++ return sub; ++} ++ ++void ++splat_mutex_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ SPLAT_TEST_FINI(sub, SPLAT_MUTEX_TEST4_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_MUTEX_TEST3_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_MUTEX_TEST2_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_MUTEX_TEST1_ID); ++ ++ kfree(sub); ++} ++ ++int ++splat_mutex_id(void) { ++ return SPLAT_SUBSYSTEM_MUTEX; ++} +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-random.c linux-3.2.33-go/spl/splat/splat-random.c +--- linux-3.2.33-go.orig/spl/splat/splat-random.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-random.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,130 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. 
++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Random Number Generator Tests. ++\*****************************************************************************/ ++ ++#include ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_KRNG_NAME "krng" ++#define SPLAT_KRNG_DESC "Kernel Random Number Generator Tests" ++ ++#define SPLAT_KRNG_TEST1_ID 0x0301 ++#define SPLAT_KRNG_TEST1_NAME "freq" ++#define SPLAT_KRNG_TEST1_DESC "Frequency Test" ++ ++#define KRNG_NUM_BITS 1048576 ++#define KRNG_NUM_BYTES (KRNG_NUM_BITS >> 3) ++#define KRNG_NUM_BITS_DIV2 (KRNG_NUM_BITS >> 1) ++#define KRNG_ERROR_RANGE 2097 ++ ++/* Random Number Generator Tests ++ There can be meny more tests on quality of the ++ random number generator. For now we are only ++ testing the frequency of particular bits. ++ We could also test consecutive sequences, ++ randomness within a particular block, etc. ++ but is probably not necessary for our purposes */ ++ ++static int ++splat_krng_test1(struct file *file, void *arg) ++{ ++ uint8_t *buf; ++ int i, j, diff, num = 0, rc = 0; ++ ++ buf = kmalloc(sizeof(*buf) * KRNG_NUM_BYTES, GFP_KERNEL); ++ if (buf == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ memset(buf, 0, sizeof(*buf) * KRNG_NUM_BYTES); ++ ++ /* Always succeeds */ ++ random_get_pseudo_bytes(buf, sizeof(uint8_t) * KRNG_NUM_BYTES); ++ ++ for (i = 0; i < KRNG_NUM_BYTES; i++) { ++ uint8_t tmp = buf[i]; ++ for (j = 0; j < 8; j++) { ++ uint8_t tmp2 = ((tmp >> j) & 0x01); ++ if (tmp2 == 1) { ++ num++; ++ } ++ } ++ } ++ ++ kfree(buf); ++ ++ diff = KRNG_NUM_BITS_DIV2 - num; ++ if (diff < 0) ++ diff *= -1; ++ ++ splat_print(file, "Test 1 Number of ones: %d\n", num); ++ splat_print(file, "Test 1 Difference from expected: %d Allowed: %d\n", ++ diff, KRNG_ERROR_RANGE); ++ ++ if (diff > KRNG_ERROR_RANGE) ++ rc = -ERANGE; ++out: ++ return rc; ++} ++ ++splat_subsystem_t * ++splat_krng_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_KRNG_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_KRNG_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_KRNG; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_KRNG_TEST1_NAME, SPLAT_KRNG_TEST1_DESC, ++ SPLAT_KRNG_TEST1_ID, splat_krng_test1); ++ ++ return sub; ++} ++ ++void ++splat_krng_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ ++ SPLAT_TEST_FINI(sub, SPLAT_KRNG_TEST1_ID); ++ ++ kfree(sub); ++} ++ ++int ++splat_krng_id(void) { ++ return SPLAT_SUBSYSTEM_KRNG; ++} +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-rwlock.c linux-3.2.33-go/spl/splat/splat-rwlock.c +--- linux-3.2.33-go.orig/spl/splat/splat-rwlock.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-rwlock.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,678 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. 
++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Read/Writer Lock Tests. ++\*****************************************************************************/ ++ ++#include ++#include ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_RWLOCK_NAME "rwlock" ++#define SPLAT_RWLOCK_DESC "Kernel RW Lock Tests" ++ ++#define SPLAT_RWLOCK_TEST1_ID 0x0701 ++#define SPLAT_RWLOCK_TEST1_NAME "N-rd/1-wr" ++#define SPLAT_RWLOCK_TEST1_DESC "Multiple readers one writer" ++ ++#define SPLAT_RWLOCK_TEST2_ID 0x0702 ++#define SPLAT_RWLOCK_TEST2_NAME "0-rd/N-wr" ++#define SPLAT_RWLOCK_TEST2_DESC "Multiple writers" ++ ++#define SPLAT_RWLOCK_TEST3_ID 0x0703 ++#define SPLAT_RWLOCK_TEST3_NAME "held" ++#define SPLAT_RWLOCK_TEST3_DESC "RW_{LOCK|READ|WRITE}_HELD" ++ ++#define SPLAT_RWLOCK_TEST4_ID 0x0704 ++#define SPLAT_RWLOCK_TEST4_NAME "tryenter" ++#define SPLAT_RWLOCK_TEST4_DESC "Tryenter" ++ ++#define SPLAT_RWLOCK_TEST5_ID 0x0705 ++#define SPLAT_RWLOCK_TEST5_NAME "rw_downgrade" ++#define SPLAT_RWLOCK_TEST5_DESC "Write downgrade" ++ ++#define SPLAT_RWLOCK_TEST6_ID 0x0706 ++#define SPLAT_RWLOCK_TEST6_NAME "rw_tryupgrade" ++#define SPLAT_RWLOCK_TEST6_DESC "Read upgrade" ++ ++#define SPLAT_RWLOCK_TEST_MAGIC 0x115599DDUL ++#define SPLAT_RWLOCK_TEST_NAME "rwlock_test" ++#define SPLAT_RWLOCK_TEST_TASKQ "rwlock_taskq" ++#define SPLAT_RWLOCK_TEST_COUNT 8 ++ ++#define SPLAT_RWLOCK_RELEASE_INIT 0 ++#define SPLAT_RWLOCK_RELEASE_WR 1 ++#define SPLAT_RWLOCK_RELEASE_RD 2 ++ ++typedef struct rw_priv { ++ unsigned long rw_magic; ++ struct file *rw_file; ++ krwlock_t rw_rwlock; ++ spinlock_t rw_lock; ++ wait_queue_head_t rw_waitq; ++ int rw_completed; ++ int rw_holders; ++ int rw_waiters; ++ int rw_release; ++ int rw_rc; ++ krw_t rw_type; ++} rw_priv_t; ++ ++typedef struct rw_thr { ++ const char *rwt_name; ++ rw_priv_t *rwt_rwp; ++ int rwt_id; ++} rw_thr_t; ++ ++void splat_init_rw_priv(rw_priv_t *rwp, struct file *file) ++{ ++ rwp->rw_magic = SPLAT_RWLOCK_TEST_MAGIC; ++ rwp->rw_file = file; ++ rw_init(&rwp->rw_rwlock, SPLAT_RWLOCK_TEST_NAME, RW_DEFAULT, NULL); ++ spin_lock_init(&rwp->rw_lock); ++ init_waitqueue_head(&rwp->rw_waitq); ++ rwp->rw_completed = 0; ++ rwp->rw_holders = 0; ++ rwp->rw_waiters = 0; ++ rwp->rw_release = SPLAT_RWLOCK_RELEASE_INIT; ++ rwp->rw_rc = 0; ++ rwp->rw_type = 0; ++} ++ ++static int ++splat_rwlock_wr_thr(void *arg) ++{ ++ rw_thr_t *rwt = (rw_thr_t *)arg; ++ rw_priv_t *rwp = rwt->rwt_rwp; ++ uint8_t rnd; ++ char name[16]; ++ ++ ASSERT(rwp->rw_magic == SPLAT_RWLOCK_TEST_MAGIC); ++ snprintf(name, sizeof(name), "rwlock_wr_thr%d", 
rwt->rwt_id); ++ daemonize(name); ++ get_random_bytes((void *)&rnd, 1); ++ msleep((unsigned int)rnd); ++ ++ splat_vprint(rwp->rw_file, rwt->rwt_name, ++ "%s trying to acquire rwlock (%d holding/%d waiting)\n", ++ name, rwp->rw_holders, rwp->rw_waiters); ++ spin_lock(&rwp->rw_lock); ++ rwp->rw_waiters++; ++ spin_unlock(&rwp->rw_lock); ++ rw_enter(&rwp->rw_rwlock, RW_WRITER); ++ ++ spin_lock(&rwp->rw_lock); ++ rwp->rw_waiters--; ++ rwp->rw_holders++; ++ spin_unlock(&rwp->rw_lock); ++ splat_vprint(rwp->rw_file, rwt->rwt_name, ++ "%s acquired rwlock (%d holding/%d waiting)\n", ++ name, rwp->rw_holders, rwp->rw_waiters); ++ ++ /* Wait for control thread to signal we can release the write lock */ ++ wait_event_interruptible(rwp->rw_waitq, splat_locked_test(&rwp->rw_lock, ++ rwp->rw_release == SPLAT_RWLOCK_RELEASE_WR)); ++ ++ spin_lock(&rwp->rw_lock); ++ rwp->rw_completed++; ++ rwp->rw_holders--; ++ spin_unlock(&rwp->rw_lock); ++ splat_vprint(rwp->rw_file, rwt->rwt_name, ++ "%s dropped rwlock (%d holding/%d waiting)\n", ++ name, rwp->rw_holders, rwp->rw_waiters); ++ ++ rw_exit(&rwp->rw_rwlock); ++ ++ return 0; ++} ++ ++static int ++splat_rwlock_rd_thr(void *arg) ++{ ++ rw_thr_t *rwt = (rw_thr_t *)arg; ++ rw_priv_t *rwp = rwt->rwt_rwp; ++ uint8_t rnd; ++ char name[16]; ++ ++ ASSERT(rwp->rw_magic == SPLAT_RWLOCK_TEST_MAGIC); ++ snprintf(name, sizeof(name), "rwlock_rd_thr%d", rwt->rwt_id); ++ daemonize(name); ++ get_random_bytes((void *)&rnd, 1); ++ msleep((unsigned int)rnd); ++ ++ /* Don't try and take the semaphore until after someone has it */ ++ wait_event_interruptible(rwp->rw_waitq, splat_locked_test(&rwp->rw_lock, ++ rwp->rw_holders > 0)); ++ ++ splat_vprint(rwp->rw_file, rwt->rwt_name, ++ "%s trying to acquire rwlock (%d holding/%d waiting)\n", ++ name, rwp->rw_holders, rwp->rw_waiters); ++ spin_lock(&rwp->rw_lock); ++ rwp->rw_waiters++; ++ spin_unlock(&rwp->rw_lock); ++ rw_enter(&rwp->rw_rwlock, RW_READER); ++ ++ spin_lock(&rwp->rw_lock); ++ rwp->rw_waiters--; ++ rwp->rw_holders++; ++ spin_unlock(&rwp->rw_lock); ++ splat_vprint(rwp->rw_file, rwt->rwt_name, ++ "%s acquired rwlock (%d holding/%d waiting)\n", ++ name, rwp->rw_holders, rwp->rw_waiters); ++ ++ /* Wait for control thread to signal we can release the read lock */ ++ wait_event_interruptible(rwp->rw_waitq, splat_locked_test(&rwp->rw_lock, ++ rwp->rw_release == SPLAT_RWLOCK_RELEASE_RD)); ++ ++ spin_lock(&rwp->rw_lock); ++ rwp->rw_completed++; ++ rwp->rw_holders--; ++ spin_unlock(&rwp->rw_lock); ++ splat_vprint(rwp->rw_file, rwt->rwt_name, ++ "%s dropped rwlock (%d holding/%d waiting)\n", ++ name, rwp->rw_holders, rwp->rw_waiters); ++ ++ rw_exit(&rwp->rw_rwlock); ++ ++ return 0; ++} ++ ++static int ++splat_rwlock_test1(struct file *file, void *arg) ++{ ++ int i, count = 0, rc = 0; ++ long pids[SPLAT_RWLOCK_TEST_COUNT]; ++ rw_thr_t rwt[SPLAT_RWLOCK_TEST_COUNT]; ++ rw_priv_t *rwp; ++ ++ rwp = (rw_priv_t *)kmalloc(sizeof(*rwp), GFP_KERNEL); ++ if (rwp == NULL) ++ return -ENOMEM; ++ ++ splat_init_rw_priv(rwp, file); ++ ++ /* Create some threads, the exact number isn't important just as ++ * long as we know how many we managed to create and should expect. 
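The hold/wait accounting driven here (one writer inside the lock, readers queued behind it, then released together) has a compact userspace analogue; the sketch below uses POSIX rwlocks and hypothetical names, not the SPL krwlock API:

/* Userspace sketch: writer takes the lock first, readers queue behind it,
 * then enter concurrently once the writer drops it.
 * Build with: cc -pthread rw.c */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define NREADERS 7

static pthread_rwlock_t rwl = PTHREAD_RWLOCK_INITIALIZER;
static int holders;    /* readers currently inside the lock */

static void *reader(void *arg)
{
    long id = (long)arg;
    pthread_rwlock_rdlock(&rwl);   /* blocks while the writer holds it */
    int h = __atomic_add_fetch(&holders, 1, __ATOMIC_SEQ_CST);
    printf("reader %ld in (%d holding)\n", id, h);
    usleep(100000);
    __atomic_sub_fetch(&holders, 1, __ATOMIC_SEQ_CST);
    pthread_rwlock_unlock(&rwl);
    return NULL;
}

int main(void)
{
    pthread_t t[NREADERS];

    pthread_rwlock_wrlock(&rwl);   /* writer holds the lock first */
    for (long i = 0; i < NREADERS; i++)
        pthread_create(&t[i], NULL, reader, (void *)i);
    sleep(1);                      /* readers are now queued behind the writer */
    printf("releasing writer; readers may enter concurrently\n");
    pthread_rwlock_unlock(&rwl);

    for (int i = 0; i < NREADERS; i++)
        pthread_join(t[i], NULL);
    return 0;
}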
*/ ++ ++ ++ ++ for (i = 0; i < SPLAT_RWLOCK_TEST_COUNT; i++) { ++ rwt[i].rwt_rwp = rwp; ++ rwt[i].rwt_id = i; ++ rwt[i].rwt_name = SPLAT_RWLOCK_TEST1_NAME; ++ ++ /* The first thread will be the writer */ ++ if (i == 0) ++ pids[i] = kernel_thread(splat_rwlock_wr_thr, &rwt[i], 0); ++ else ++ pids[i] = kernel_thread(splat_rwlock_rd_thr, &rwt[i], 0); ++ ++ if (pids[i] >= 0) ++ count++; ++ } ++ ++ /* Wait for the writer */ ++ while (splat_locked_test(&rwp->rw_lock, rwp->rw_holders == 0)) { ++ wake_up_interruptible(&rwp->rw_waitq); ++ msleep(100); ++ } ++ ++ /* Wait for 'count-1' readers */ ++ while (splat_locked_test(&rwp->rw_lock, rwp->rw_waiters < count - 1)) { ++ wake_up_interruptible(&rwp->rw_waitq); ++ msleep(100); ++ } ++ ++ /* Verify there is only one lock holder */ ++ if (splat_locked_test(&rwp->rw_lock, rwp->rw_holders) != 1) { ++ splat_vprint(file, SPLAT_RWLOCK_TEST1_NAME, "Only 1 holder " ++ "expected for rwlock (%d holding/%d waiting)\n", ++ rwp->rw_holders, rwp->rw_waiters); ++ rc = -EINVAL; ++ } ++ ++ /* Verify 'count-1' readers */ ++ if (splat_locked_test(&rwp->rw_lock, rwp->rw_waiters != count - 1)) { ++ splat_vprint(file, SPLAT_RWLOCK_TEST1_NAME, "Only %d waiters " ++ "expected for rwlock (%d holding/%d waiting)\n", ++ count - 1, rwp->rw_holders, rwp->rw_waiters); ++ rc = -EINVAL; ++ } ++ ++ /* Signal the writer to release, allows readers to acquire */ ++ spin_lock(&rwp->rw_lock); ++ rwp->rw_release = SPLAT_RWLOCK_RELEASE_WR; ++ wake_up_interruptible(&rwp->rw_waitq); ++ spin_unlock(&rwp->rw_lock); ++ ++ /* Wait for 'count-1' readers to hold the lock */ ++ while (splat_locked_test(&rwp->rw_lock, rwp->rw_holders < count - 1)) { ++ wake_up_interruptible(&rwp->rw_waitq); ++ msleep(100); ++ } ++ ++ /* Verify there are 'count-1' readers */ ++ if (splat_locked_test(&rwp->rw_lock, rwp->rw_holders != count - 1)) { ++ splat_vprint(file, SPLAT_RWLOCK_TEST1_NAME, "Only %d holders " ++ "expected for rwlock (%d holding/%d waiting)\n", ++ count - 1, rwp->rw_holders, rwp->rw_waiters); ++ rc = -EINVAL; ++ } ++ ++ /* Release 'count-1' readers */ ++ spin_lock(&rwp->rw_lock); ++ rwp->rw_release = SPLAT_RWLOCK_RELEASE_RD; ++ wake_up_interruptible(&rwp->rw_waitq); ++ spin_unlock(&rwp->rw_lock); ++ ++ /* Wait for the test to complete */ ++ while (splat_locked_test(&rwp->rw_lock, ++ rwp->rw_holders>0 || rwp->rw_waiters>0)) ++ msleep(100); ++ ++ rw_destroy(&(rwp->rw_rwlock)); ++ kfree(rwp); ++ ++ return rc; ++} ++ ++static void ++splat_rwlock_test2_func(void *arg) ++{ ++ rw_priv_t *rwp = (rw_priv_t *)arg; ++ int rc; ++ ASSERT(rwp->rw_magic == SPLAT_RWLOCK_TEST_MAGIC); ++ ++ /* Read the value before sleeping and write it after we wake up to ++ * maximize the chance of a race if rwlocks are not working properly */ ++ rw_enter(&rwp->rw_rwlock, RW_WRITER); ++ rc = rwp->rw_rc; ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(HZ / 100); /* 1/100 of a second */ ++ VERIFY(rwp->rw_rc == rc); ++ rwp->rw_rc = rc + 1; ++ rw_exit(&rwp->rw_rwlock); ++} ++ ++static int ++splat_rwlock_test2(struct file *file, void *arg) ++{ ++ rw_priv_t *rwp; ++ taskq_t *tq; ++ int i, rc = 0, tq_count = 256; ++ ++ rwp = (rw_priv_t *)kmalloc(sizeof(*rwp), GFP_KERNEL); ++ if (rwp == NULL) ++ return -ENOMEM; ++ ++ splat_init_rw_priv(rwp, file); ++ ++ /* Create several threads allowing tasks to race with each other */ ++ tq = taskq_create(SPLAT_RWLOCK_TEST_TASKQ, num_online_cpus(), ++ maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); ++ if (tq == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ /* ++ * Schedule N work items to 
the work queue each of which enters the ++ * writer rwlock, sleeps briefly, then exits the writer rwlock. On a ++ * multiprocessor box these work items will be handled by all available ++ * CPUs. The task function checks to ensure the tracked shared variable ++ * is always only incremented by one. Additionally, the rwlock itself ++ * is instrumented such that if any two processors are in the ++ * critical region at the same time the system will panic. If the ++ * rwlock is implemented right this will never happy, that's a pass. ++ */ ++ for (i = 0; i < tq_count; i++) { ++ if (!taskq_dispatch(tq,splat_rwlock_test2_func,rwp,TQ_SLEEP)) { ++ splat_vprint(file, SPLAT_RWLOCK_TEST2_NAME, ++ "Failed to queue task %d\n", i); ++ rc = -EINVAL; ++ } ++ } ++ ++ taskq_wait(tq); ++ ++ if (rwp->rw_rc == tq_count) { ++ splat_vprint(file, SPLAT_RWLOCK_TEST2_NAME, "%d racing threads " ++ "correctly entered/exited the rwlock %d times\n", ++ num_online_cpus(), rwp->rw_rc); ++ } else { ++ splat_vprint(file, SPLAT_RWLOCK_TEST2_NAME, "%d racing threads " ++ "only processed %d/%d w rwlock work items\n", ++ num_online_cpus(), rwp->rw_rc, tq_count); ++ rc = -EINVAL; ++ } ++ ++ taskq_destroy(tq); ++ rw_destroy(&(rwp->rw_rwlock)); ++out: ++ kfree(rwp); ++ return rc; ++} ++ ++#define splat_rwlock_test3_helper(rwp,rex1,rex2,wex1,wex2,held_func,rc) \ ++do { \ ++ int result, _rc1_, _rc2_, _rc3_, _rc4_; \ ++ \ ++ rc = 0; \ ++ rw_enter(&(rwp)->rw_rwlock, RW_READER); \ ++ _rc1_ = ((result = held_func(&(rwp)->rw_rwlock)) != rex1); \ ++ splat_vprint(file, SPLAT_RWLOCK_TEST3_NAME, "%s" #held_func \ ++ " returned %d (expected %d) when RW_READER\n", \ ++ _rc1_ ? "Fail " : "", result, rex1); \ ++ rw_exit(&(rwp)->rw_rwlock); \ ++ _rc2_ = ((result = held_func(&(rwp)->rw_rwlock)) != rex2); \ ++ splat_vprint(file, SPLAT_RWLOCK_TEST3_NAME, "%s" #held_func \ ++ " returned %d (expected %d) when !RW_READER\n", \ ++ _rc2_ ? "Fail " : "", result, rex2); \ ++ \ ++ rw_enter(&(rwp)->rw_rwlock, RW_WRITER); \ ++ _rc3_ = ((result = held_func(&(rwp)->rw_rwlock)) != wex1); \ ++ splat_vprint(file, SPLAT_RWLOCK_TEST3_NAME, "%s" #held_func \ ++ " returned %d (expected %d) when RW_WRITER\n", \ ++ _rc3_ ? "Fail " : "", result, wex1); \ ++ rw_exit(&(rwp)->rw_rwlock); \ ++ _rc4_ = ((result = held_func(&(rwp)->rw_rwlock)) != wex2); \ ++ splat_vprint(file, SPLAT_RWLOCK_TEST3_NAME, "%s" #held_func \ ++ " returned %d (expected %d) when !RW_WRITER\n", \ ++ _rc4_ ? "Fail " : "", result, wex2); \ ++ \ ++ rc = ((_rc1_ || _rc2_ || _rc3_ || _rc4_) ? -EINVAL : 0); \ ++} while(0); ++ ++static int ++splat_rwlock_test3(struct file *file, void *arg) ++{ ++ rw_priv_t *rwp; ++ int rc1, rc2, rc3; ++ ++ rwp = (rw_priv_t *)kmalloc(sizeof(*rwp), GFP_KERNEL); ++ if (rwp == NULL) ++ return -ENOMEM; ++ ++ splat_init_rw_priv(rwp, file); ++ ++ splat_rwlock_test3_helper(rwp, 1, 0, 1, 0, RW_LOCK_HELD, rc1); ++ splat_rwlock_test3_helper(rwp, 1, 0, 0, 0, RW_READ_HELD, rc2); ++ splat_rwlock_test3_helper(rwp, 0, 0, 1, 0, RW_WRITE_HELD, rc3); ++ ++ rw_destroy(&rwp->rw_rwlock); ++ kfree(rwp); ++ ++ return ((rc1 || rc2 || rc3) ? 
-EINVAL : 0); ++} ++ ++static void ++splat_rwlock_test4_func(void *arg) ++{ ++ rw_priv_t *rwp = (rw_priv_t *)arg; ++ ASSERT(rwp->rw_magic == SPLAT_RWLOCK_TEST_MAGIC); ++ ++ if (rw_tryenter(&rwp->rw_rwlock, rwp->rw_type)) { ++ rwp->rw_rc = 0; ++ rw_exit(&rwp->rw_rwlock); ++ } else { ++ rwp->rw_rc = -EBUSY; ++ } ++} ++ ++static char * ++splat_rwlock_test4_name(krw_t type) ++{ ++ switch (type) { ++ case RW_NONE: return "RW_NONE"; ++ case RW_WRITER: return "RW_WRITER"; ++ case RW_READER: return "RW_READER"; ++ } ++ ++ return NULL; ++} ++ ++static int ++splat_rwlock_test4_type(taskq_t *tq, rw_priv_t *rwp, int expected_rc, ++ krw_t holder_type, krw_t try_type) ++{ ++ int id, rc = 0; ++ ++ /* Schedule a task function which will try and acquire the rwlock ++ * using type try_type while the rwlock is being held as holder_type. ++ * The result must match expected_rc for the test to pass */ ++ rwp->rw_rc = -EINVAL; ++ rwp->rw_type = try_type; ++ ++ if (holder_type == RW_WRITER || holder_type == RW_READER) ++ rw_enter(&rwp->rw_rwlock, holder_type); ++ ++ id = taskq_dispatch(tq, splat_rwlock_test4_func, rwp, TQ_SLEEP); ++ if (id == 0) { ++ splat_vprint(rwp->rw_file, SPLAT_RWLOCK_TEST4_NAME, "%s", ++ "taskq_dispatch() failed\n"); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ taskq_wait_id(tq, id); ++ ++ if (rwp->rw_rc != expected_rc) ++ rc = -EINVAL; ++ ++ splat_vprint(rwp->rw_file, SPLAT_RWLOCK_TEST4_NAME, ++ "%srw_tryenter(%s) returned %d (expected %d) when %s\n", ++ rc ? "Fail " : "", splat_rwlock_test4_name(try_type), ++ rwp->rw_rc, expected_rc, ++ splat_rwlock_test4_name(holder_type)); ++out: ++ if (holder_type == RW_WRITER || holder_type == RW_READER) ++ rw_exit(&rwp->rw_rwlock); ++ ++ return rc; ++} ++ ++static int ++splat_rwlock_test4(struct file *file, void *arg) ++{ ++ rw_priv_t *rwp; ++ taskq_t *tq; ++ int rc = 0, rc1, rc2, rc3, rc4, rc5, rc6; ++ ++ rwp = (rw_priv_t *)kmalloc(sizeof(*rwp), GFP_KERNEL); ++ if (rwp == NULL) ++ return -ENOMEM; ++ ++ tq = taskq_create(SPLAT_RWLOCK_TEST_TASKQ, 1, maxclsyspri, ++ 50, INT_MAX, TASKQ_PREPOPULATE); ++ if (tq == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ splat_init_rw_priv(rwp, file); ++ ++ /* Validate all combinations of rw_tryenter() contention */ ++ rc1 = splat_rwlock_test4_type(tq, rwp, -EBUSY, RW_WRITER, RW_WRITER); ++ rc2 = splat_rwlock_test4_type(tq, rwp, -EBUSY, RW_WRITER, RW_READER); ++ rc3 = splat_rwlock_test4_type(tq, rwp, -EBUSY, RW_READER, RW_WRITER); ++ rc4 = splat_rwlock_test4_type(tq, rwp, 0, RW_READER, RW_READER); ++ rc5 = splat_rwlock_test4_type(tq, rwp, 0, RW_NONE, RW_WRITER); ++ rc6 = splat_rwlock_test4_type(tq, rwp, 0, RW_NONE, RW_READER); ++ ++ if (rc1 || rc2 || rc3 || rc4 || rc5 || rc6) ++ rc = -EINVAL; ++ ++ taskq_destroy(tq); ++out: ++ rw_destroy(&(rwp->rw_rwlock)); ++ kfree(rwp); ++ ++ return rc; ++} ++ ++static int ++splat_rwlock_test5(struct file *file, void *arg) ++{ ++ rw_priv_t *rwp; ++ int rc = -EINVAL; ++ ++ rwp = (rw_priv_t *)kmalloc(sizeof(*rwp), GFP_KERNEL); ++ if (rwp == NULL) ++ return -ENOMEM; ++ ++ splat_init_rw_priv(rwp, file); ++ ++ rw_enter(&rwp->rw_rwlock, RW_WRITER); ++ if (!RW_WRITE_HELD(&rwp->rw_rwlock)) { ++ splat_vprint(file, SPLAT_RWLOCK_TEST5_NAME, ++ "rwlock should be write lock: %d\n", ++ RW_WRITE_HELD(&rwp->rw_rwlock)); ++ goto out; ++ } ++ ++ rw_downgrade(&rwp->rw_rwlock); ++ if (!RW_READ_HELD(&rwp->rw_rwlock)) { ++ splat_vprint(file, SPLAT_RWLOCK_TEST5_NAME, ++ "rwlock should be read lock: %d\n", ++ RW_READ_HELD(&rwp->rw_rwlock)); ++ goto out; ++ } ++ ++ rc = 0; ++ splat_vprint(file, 
SPLAT_RWLOCK_TEST5_NAME, "%s", ++ "rwlock properly downgraded\n"); ++out: ++ rw_exit(&rwp->rw_rwlock); ++ rw_destroy(&rwp->rw_rwlock); ++ kfree(rwp); ++ ++ return rc; ++} ++ ++static int ++splat_rwlock_test6(struct file *file, void *arg) ++{ ++ rw_priv_t *rwp; ++ int rc; ++ ++ rwp = (rw_priv_t *)kmalloc(sizeof(*rwp), GFP_KERNEL); ++ if (rwp == NULL) ++ return -ENOMEM; ++ ++ splat_init_rw_priv(rwp, file); ++ ++ rw_enter(&rwp->rw_rwlock, RW_READER); ++ if (!RW_READ_HELD(&rwp->rw_rwlock)) { ++ splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, ++ "rwlock should be read lock: %d\n", ++ RW_READ_HELD(&rwp->rw_rwlock)); ++ rc = -ENOLCK; ++ goto out; ++ } ++ ++#if defined(CONFIG_RWSEM_GENERIC_SPINLOCK) ++ /* With one reader upgrade should never fail. */ ++ rc = rw_tryupgrade(&rwp->rw_rwlock); ++ if (!rc) { ++ splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, ++ "rwlock failed upgrade from reader: %d\n", ++ RW_READ_HELD(&rwp->rw_rwlock)); ++ rc = -ENOLCK; ++ goto out; ++ } ++ ++ if (RW_READ_HELD(&rwp->rw_rwlock) || !RW_WRITE_HELD(&rwp->rw_rwlock)) { ++ splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "rwlock should " ++ "have 0 (not %d) reader and 1 (not %d) writer\n", ++ RW_READ_HELD(&rwp->rw_rwlock), ++ RW_WRITE_HELD(&rwp->rw_rwlock)); ++ goto out; ++ } ++ ++ rc = 0; ++ splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "%s", ++ "rwlock properly upgraded\n"); ++#else ++ rc = 0; ++ splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "%s", ++ "rw_tryupgrade() is disabled for this arch\n"); ++#endif ++out: ++ rw_exit(&rwp->rw_rwlock); ++ rw_destroy(&rwp->rw_rwlock); ++ kfree(rwp); ++ ++ return rc; ++} ++ ++splat_subsystem_t * ++splat_rwlock_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_RWLOCK_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_RWLOCK_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_RWLOCK; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_RWLOCK_TEST1_NAME, SPLAT_RWLOCK_TEST1_DESC, ++ SPLAT_RWLOCK_TEST1_ID, splat_rwlock_test1); ++ SPLAT_TEST_INIT(sub, SPLAT_RWLOCK_TEST2_NAME, SPLAT_RWLOCK_TEST2_DESC, ++ SPLAT_RWLOCK_TEST2_ID, splat_rwlock_test2); ++ SPLAT_TEST_INIT(sub, SPLAT_RWLOCK_TEST3_NAME, SPLAT_RWLOCK_TEST3_DESC, ++ SPLAT_RWLOCK_TEST3_ID, splat_rwlock_test3); ++ SPLAT_TEST_INIT(sub, SPLAT_RWLOCK_TEST4_NAME, SPLAT_RWLOCK_TEST4_DESC, ++ SPLAT_RWLOCK_TEST4_ID, splat_rwlock_test4); ++ SPLAT_TEST_INIT(sub, SPLAT_RWLOCK_TEST5_NAME, SPLAT_RWLOCK_TEST5_DESC, ++ SPLAT_RWLOCK_TEST5_ID, splat_rwlock_test5); ++ SPLAT_TEST_INIT(sub, SPLAT_RWLOCK_TEST6_NAME, SPLAT_RWLOCK_TEST6_DESC, ++ SPLAT_RWLOCK_TEST6_ID, splat_rwlock_test6); ++ ++ return sub; ++} ++ ++void ++splat_rwlock_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST6_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST5_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST4_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST3_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST2_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST1_ID); ++ kfree(sub); ++} ++ ++int ++splat_rwlock_id(void) { ++ return SPLAT_SUBSYSTEM_RWLOCK; ++} +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-taskq.c linux-3.2.33-go/spl/splat/splat-taskq.c +--- linux-3.2.33-go.orig/spl/splat/splat-taskq.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-taskq.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,1163 
@@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Task Queue Tests. ++\*****************************************************************************/ ++ ++#include ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_TASKQ_NAME "taskq" ++#define SPLAT_TASKQ_DESC "Kernel Task Queue Tests" ++ ++#define SPLAT_TASKQ_TEST1_ID 0x0201 ++#define SPLAT_TASKQ_TEST1_NAME "single" ++#define SPLAT_TASKQ_TEST1_DESC "Single task queue, single task" ++ ++#define SPLAT_TASKQ_TEST2_ID 0x0202 ++#define SPLAT_TASKQ_TEST2_NAME "multiple" ++#define SPLAT_TASKQ_TEST2_DESC "Multiple task queues, multiple tasks" ++ ++#define SPLAT_TASKQ_TEST3_ID 0x0203 ++#define SPLAT_TASKQ_TEST3_NAME "system" ++#define SPLAT_TASKQ_TEST3_DESC "System task queue, multiple tasks" ++ ++#define SPLAT_TASKQ_TEST4_ID 0x0204 ++#define SPLAT_TASKQ_TEST4_NAME "wait" ++#define SPLAT_TASKQ_TEST4_DESC "Multiple task waiting" ++ ++#define SPLAT_TASKQ_TEST5_ID 0x0205 ++#define SPLAT_TASKQ_TEST5_NAME "order" ++#define SPLAT_TASKQ_TEST5_DESC "Correct task ordering" ++ ++#define SPLAT_TASKQ_TEST6_ID 0x0206 ++#define SPLAT_TASKQ_TEST6_NAME "front" ++#define SPLAT_TASKQ_TEST6_DESC "Correct ordering with TQ_FRONT flag" ++ ++#define SPLAT_TASKQ_TEST7_ID 0x0207 ++#define SPLAT_TASKQ_TEST7_NAME "recurse" ++#define SPLAT_TASKQ_TEST7_DESC "Single task queue, recursive dispatch" ++ ++#define SPLAT_TASKQ_TEST8_ID 0x0208 ++#define SPLAT_TASKQ_TEST8_NAME "contention" ++#define SPLAT_TASKQ_TEST8_DESC "1 queue, 100 threads, 131072 tasks" ++ ++#define SPLAT_TASKQ_ORDER_MAX 8 ++#define SPLAT_TASKQ_DEPTH_MAX 16 ++ ++ ++typedef struct splat_taskq_arg { ++ int flag; ++ int id; ++ atomic_t count; ++ int order[SPLAT_TASKQ_ORDER_MAX]; ++ unsigned int depth; ++ taskq_t *tq; ++ taskq_ent_t *tqe; ++ spinlock_t lock; ++ struct file *file; ++ const char *name; ++} splat_taskq_arg_t; ++ ++typedef struct splat_taskq_id { ++ int id; ++ splat_taskq_arg_t *arg; ++} splat_taskq_id_t; ++ ++/* ++ * Create a taskq, queue a task, wait until task completes, ensure ++ * task ran properly, cleanup taskq. 
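The create/dispatch/wait/destroy sequence described above is the core loop all of the taskq tests share. A condensed sketch of that lifecycle, using only the taskq calls that already appear in this patch (kernel/SPL context assumed, so this is illustrative rather than a standalone program; the function and variable names are made up):

/* Condensed sketch of the taskq lifecycle exercised by the tests below. */
static void hello_task(void *arg)
{
    int *done = arg;
    *done = 1;                              /* observable side effect */
}

static int taskq_lifecycle_sketch(void)
{
    taskq_t *tq;
    taskqid_t id;
    int done = 0;

    tq = taskq_create("sketch", 1, maxclsyspri, 50, INT_MAX,
                      TASKQ_PREPOPULATE);   /* one worker thread */
    if (tq == NULL)
        return -EINVAL;

    id = taskq_dispatch(tq, hello_task, &done, TQ_SLEEP);
    if (id == 0) {                          /* id of 0 means dispatch failed */
        taskq_destroy(tq);
        return -EINVAL;
    }

    taskq_wait(tq);                         /* block until all tasks finish */
    taskq_destroy(tq);

    return done ? 0 : -EINVAL;
}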
++ */ ++static void ++splat_taskq_test13_func(void *arg) ++{ ++ splat_taskq_arg_t *tq_arg = (splat_taskq_arg_t *)arg; ++ ++ ASSERT(tq_arg); ++ splat_vprint(tq_arg->file, SPLAT_TASKQ_TEST1_NAME, ++ "Taskq '%s' function '%s' setting flag\n", ++ tq_arg->name, sym2str(splat_taskq_test13_func)); ++ tq_arg->flag = 1; ++} ++ ++static int ++splat_taskq_test1_impl(struct file *file, void *arg, boolean_t prealloc) ++{ ++ taskq_t *tq; ++ taskqid_t id; ++ splat_taskq_arg_t tq_arg; ++ taskq_ent_t tqe; ++ ++ taskq_init_ent(&tqe); ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST1_NAME, ++ "Taskq '%s' creating (%s dispatch)\n", ++ SPLAT_TASKQ_TEST1_NAME, ++ prealloc ? "prealloc" : "dynamic"); ++ if ((tq = taskq_create(SPLAT_TASKQ_TEST1_NAME, 1, maxclsyspri, ++ 50, INT_MAX, TASKQ_PREPOPULATE)) == NULL) { ++ splat_vprint(file, SPLAT_TASKQ_TEST1_NAME, ++ "Taskq '%s' create failed\n", ++ SPLAT_TASKQ_TEST1_NAME); ++ return -EINVAL; ++ } ++ ++ tq_arg.flag = 0; ++ tq_arg.id = 0; ++ tq_arg.file = file; ++ tq_arg.name = SPLAT_TASKQ_TEST1_NAME; ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST1_NAME, ++ "Taskq '%s' function '%s' dispatching\n", ++ tq_arg.name, sym2str(splat_taskq_test13_func)); ++ if (prealloc) { ++ taskq_dispatch_ent(tq, splat_taskq_test13_func, ++ &tq_arg, TQ_SLEEP, &tqe); ++ id = tqe.tqent_id; ++ } else { ++ id = taskq_dispatch(tq, splat_taskq_test13_func, ++ &tq_arg, TQ_SLEEP); ++ } ++ ++ if (id == 0) { ++ splat_vprint(file, SPLAT_TASKQ_TEST1_NAME, ++ "Taskq '%s' function '%s' dispatch failed\n", ++ tq_arg.name, sym2str(splat_taskq_test13_func)); ++ taskq_destroy(tq); ++ return -EINVAL; ++ } ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST1_NAME, "Taskq '%s' waiting\n", ++ tq_arg.name); ++ taskq_wait(tq); ++ splat_vprint(file, SPLAT_TASKQ_TEST1_NAME, "Taskq '%s' destroying\n", ++ tq_arg.name); ++ ++ taskq_destroy(tq); ++ ++ return (tq_arg.flag) ? 0 : -EINVAL; ++} ++ ++static int ++splat_taskq_test1(struct file *file, void *arg) ++{ ++ int rc; ++ ++ rc = splat_taskq_test1_impl(file, arg, B_FALSE); ++ if (rc) ++ return rc; ++ ++ rc = splat_taskq_test1_impl(file, arg, B_TRUE); ++ ++ return rc; ++} ++ ++/* ++ * Create multiple taskq's, each with multiple tasks, wait until ++ * all tasks complete, ensure all tasks ran properly and in the ++ * correct order. Run order must be the same as the order submitted ++ * because we only have 1 thread per taskq. Finally cleanup the taskq. 
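The expected per-queue result checked further down, flag == i * 2 + 1, follows directly from dispatch order on a single-threaded queue: the flag starts at the queue index i, the first function doubles it, and the second adds one. A trivial userspace check of that arithmetic, purely for illustration:

/* Worked check of the per-queue arithmetic: i -> 2i -> 2i + 1, e.g. 3 -> 6 -> 7. */
#include <assert.h>

int main(void)
{
    for (int i = 0; i < 8; i++) {
        int flag = i;
        flag *= 2;              /* mirrors the first dispatched function  */
        flag += 1;              /* mirrors the second dispatched function */
        assert(flag == i * 2 + 1);
    }
    return 0;
}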
++ */ ++static void ++splat_taskq_test2_func1(void *arg) ++{ ++ splat_taskq_arg_t *tq_arg = (splat_taskq_arg_t *)arg; ++ ++ ASSERT(tq_arg); ++ splat_vprint(tq_arg->file, SPLAT_TASKQ_TEST2_NAME, ++ "Taskq '%s/%d' function '%s' flag = %d = %d * 2\n", ++ tq_arg->name, tq_arg->id, ++ sym2str(splat_taskq_test2_func1), ++ tq_arg->flag * 2, tq_arg->flag); ++ tq_arg->flag *= 2; ++} ++ ++static void ++splat_taskq_test2_func2(void *arg) ++{ ++ splat_taskq_arg_t *tq_arg = (splat_taskq_arg_t *)arg; ++ ++ ASSERT(tq_arg); ++ splat_vprint(tq_arg->file, SPLAT_TASKQ_TEST2_NAME, ++ "Taskq '%s/%d' function '%s' flag = %d = %d + 1\n", ++ tq_arg->name, tq_arg->id, ++ sym2str(splat_taskq_test2_func2), ++ tq_arg->flag + 1, tq_arg->flag); ++ tq_arg->flag += 1; ++} ++ ++#define TEST2_TASKQS 8 ++#define TEST2_THREADS_PER_TASKQ 1 ++ ++static int ++splat_taskq_test2_impl(struct file *file, void *arg, boolean_t prealloc) { ++ taskq_t *tq[TEST2_TASKQS] = { NULL }; ++ taskqid_t id; ++ splat_taskq_arg_t tq_args[TEST2_TASKQS]; ++ taskq_ent_t *func1_tqes = NULL; ++ taskq_ent_t *func2_tqes = NULL; ++ int i, rc = 0; ++ ++ func1_tqes = kmalloc(sizeof(*func1_tqes) * TEST2_TASKQS, GFP_KERNEL); ++ if (func1_tqes == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ func2_tqes = kmalloc(sizeof(*func2_tqes) * TEST2_TASKQS, GFP_KERNEL); ++ if (func2_tqes == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ for (i = 0; i < TEST2_TASKQS; i++) { ++ taskq_init_ent(&func1_tqes[i]); ++ taskq_init_ent(&func2_tqes[i]); ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST2_NAME, ++ "Taskq '%s/%d' creating (%s dispatch)\n", ++ SPLAT_TASKQ_TEST2_NAME, i, ++ prealloc ? "prealloc" : "dynamic"); ++ if ((tq[i] = taskq_create(SPLAT_TASKQ_TEST2_NAME, ++ TEST2_THREADS_PER_TASKQ, ++ maxclsyspri, 50, INT_MAX, ++ TASKQ_PREPOPULATE)) == NULL) { ++ splat_vprint(file, SPLAT_TASKQ_TEST2_NAME, ++ "Taskq '%s/%d' create failed\n", ++ SPLAT_TASKQ_TEST2_NAME, i); ++ rc = -EINVAL; ++ break; ++ } ++ ++ tq_args[i].flag = i; ++ tq_args[i].id = i; ++ tq_args[i].file = file; ++ tq_args[i].name = SPLAT_TASKQ_TEST2_NAME; ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST2_NAME, ++ "Taskq '%s/%d' function '%s' dispatching\n", ++ tq_args[i].name, tq_args[i].id, ++ sym2str(splat_taskq_test2_func1)); ++ if (prealloc) { ++ taskq_dispatch_ent(tq[i], splat_taskq_test2_func1, ++ &tq_args[i], TQ_SLEEP, &func1_tqes[i]); ++ id = func1_tqes[i].tqent_id; ++ } else { ++ id = taskq_dispatch(tq[i], splat_taskq_test2_func1, ++ &tq_args[i], TQ_SLEEP); ++ } ++ ++ if (id == 0) { ++ splat_vprint(file, SPLAT_TASKQ_TEST2_NAME, ++ "Taskq '%s/%d' function '%s' dispatch " ++ "failed\n", tq_args[i].name, tq_args[i].id, ++ sym2str(splat_taskq_test2_func1)); ++ rc = -EINVAL; ++ break; ++ } ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST2_NAME, ++ "Taskq '%s/%d' function '%s' dispatching\n", ++ tq_args[i].name, tq_args[i].id, ++ sym2str(splat_taskq_test2_func2)); ++ if (prealloc) { ++ taskq_dispatch_ent(tq[i], splat_taskq_test2_func2, ++ &tq_args[i], TQ_SLEEP, &func2_tqes[i]); ++ id = func2_tqes[i].tqent_id; ++ } else { ++ id = taskq_dispatch(tq[i], splat_taskq_test2_func2, ++ &tq_args[i], TQ_SLEEP); ++ } ++ ++ if (id == 0) { ++ splat_vprint(file, SPLAT_TASKQ_TEST2_NAME, "Taskq " ++ "'%s/%d' function '%s' dispatch failed\n", ++ tq_args[i].name, tq_args[i].id, ++ sym2str(splat_taskq_test2_func2)); ++ rc = -EINVAL; ++ break; ++ } ++ } ++ ++ /* When rc is set we're effectively just doing cleanup here, so ++ * ignore new errors in that case. They just cause noise. 
*/ ++ for (i = 0; i < TEST2_TASKQS; i++) { ++ if (tq[i] != NULL) { ++ splat_vprint(file, SPLAT_TASKQ_TEST2_NAME, ++ "Taskq '%s/%d' waiting\n", ++ tq_args[i].name, tq_args[i].id); ++ taskq_wait(tq[i]); ++ splat_vprint(file, SPLAT_TASKQ_TEST2_NAME, ++ "Taskq '%s/%d; destroying\n", ++ tq_args[i].name, tq_args[i].id); ++ ++ taskq_destroy(tq[i]); ++ ++ if (!rc && tq_args[i].flag != ((i * 2) + 1)) { ++ splat_vprint(file, SPLAT_TASKQ_TEST2_NAME, ++ "Taskq '%s/%d' processed tasks " ++ "out of order; %d != %d\n", ++ tq_args[i].name, tq_args[i].id, ++ tq_args[i].flag, i * 2 + 1); ++ rc = -EINVAL; ++ } else { ++ splat_vprint(file, SPLAT_TASKQ_TEST2_NAME, ++ "Taskq '%s/%d' processed tasks " ++ "in the correct order; %d == %d\n", ++ tq_args[i].name, tq_args[i].id, ++ tq_args[i].flag, i * 2 + 1); ++ } ++ } ++ } ++out: ++ if (func1_tqes) ++ kfree(func1_tqes); ++ ++ if (func2_tqes) ++ kfree(func2_tqes); ++ ++ return rc; ++} ++ ++static int ++splat_taskq_test2(struct file *file, void *arg) { ++ int rc; ++ ++ rc = splat_taskq_test2_impl(file, arg, B_FALSE); ++ if (rc) ++ return rc; ++ ++ rc = splat_taskq_test2_impl(file, arg, B_TRUE); ++ ++ return rc; ++} ++ ++/* ++ * Use the global system task queue with a single task, wait until task ++ * completes, ensure task ran properly. ++ */ ++static int ++splat_taskq_test3_impl(struct file *file, void *arg, boolean_t prealloc) ++{ ++ taskqid_t id; ++ splat_taskq_arg_t tq_arg; ++ taskq_ent_t tqe; ++ ++ taskq_init_ent(&tqe); ++ ++ tq_arg.flag = 0; ++ tq_arg.id = 0; ++ tq_arg.file = file; ++ tq_arg.name = SPLAT_TASKQ_TEST3_NAME; ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST3_NAME, ++ "Taskq '%s' function '%s' %s dispatch\n", ++ tq_arg.name, sym2str(splat_taskq_test13_func), ++ prealloc ? "prealloc" : "dynamic"); ++ if (prealloc) { ++ taskq_dispatch_ent(system_taskq, splat_taskq_test13_func, ++ &tq_arg, TQ_SLEEP, &tqe); ++ id = tqe.tqent_id; ++ } else { ++ id = taskq_dispatch(system_taskq, splat_taskq_test13_func, ++ &tq_arg, TQ_SLEEP); ++ } ++ ++ if (id == 0) { ++ splat_vprint(file, SPLAT_TASKQ_TEST3_NAME, ++ "Taskq '%s' function '%s' dispatch failed\n", ++ tq_arg.name, sym2str(splat_taskq_test13_func)); ++ return -EINVAL; ++ } ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST3_NAME, "Taskq '%s' waiting\n", ++ tq_arg.name); ++ taskq_wait(system_taskq); ++ ++ return (tq_arg.flag) ? 0 : -EINVAL; ++} ++ ++static int ++splat_taskq_test3(struct file *file, void *arg) ++{ ++ int rc; ++ ++ rc = splat_taskq_test3_impl(file, arg, B_FALSE); ++ if (rc) ++ return rc; ++ ++ rc = splat_taskq_test3_impl(file, arg, B_TRUE); ++ ++ return rc; ++} ++ ++/* ++ * Create a taskq and dispatch a large number of tasks to the queue. ++ * Then use taskq_wait() to block until all the tasks complete, then ++ * cross check that all the tasks ran by checking tg_arg->count which ++ * is incremented in the task function. Finally cleanup the taskq. ++ * ++ * First we try with a large 'maxalloc' value, then we try with a small one. ++ * We should not drop tasks when TQ_SLEEP is used in taskq_dispatch(), even ++ * if the number of pending tasks is above maxalloc. 
++ */ ++static void ++splat_taskq_test4_func(void *arg) ++{ ++ splat_taskq_arg_t *tq_arg = (splat_taskq_arg_t *)arg; ++ ASSERT(tq_arg); ++ ++ atomic_inc(&tq_arg->count); ++} ++ ++static int ++splat_taskq_test4_common(struct file *file, void *arg, int minalloc, ++ int maxalloc, int nr_tasks, boolean_t prealloc) ++{ ++ taskq_t *tq; ++ taskqid_t id; ++ splat_taskq_arg_t tq_arg; ++ taskq_ent_t *tqes; ++ int i, j, rc = 0; ++ ++ tqes = kmalloc(sizeof(*tqes) * nr_tasks, GFP_KERNEL); ++ if (tqes == NULL) ++ return -ENOMEM; ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST4_NAME, ++ "Taskq '%s' creating (%s dispatch) (%d/%d/%d)\n", ++ SPLAT_TASKQ_TEST4_NAME, ++ prealloc ? "prealloc" : "dynamic", ++ minalloc, maxalloc, nr_tasks); ++ if ((tq = taskq_create(SPLAT_TASKQ_TEST4_NAME, 1, maxclsyspri, ++ minalloc, maxalloc, TASKQ_PREPOPULATE)) == NULL) { ++ splat_vprint(file, SPLAT_TASKQ_TEST4_NAME, ++ "Taskq '%s' create failed\n", ++ SPLAT_TASKQ_TEST4_NAME); ++ rc = -EINVAL; ++ goto out_free; ++ } ++ ++ tq_arg.file = file; ++ tq_arg.name = SPLAT_TASKQ_TEST4_NAME; ++ ++ for (i = 1; i <= nr_tasks; i *= 2) { ++ atomic_set(&tq_arg.count, 0); ++ splat_vprint(file, SPLAT_TASKQ_TEST4_NAME, ++ "Taskq '%s' function '%s' dispatched %d times\n", ++ tq_arg.name, sym2str(splat_taskq_test4_func), i); ++ ++ for (j = 0; j < i; j++) { ++ taskq_init_ent(&tqes[j]); ++ ++ if (prealloc) { ++ taskq_dispatch_ent(tq, splat_taskq_test4_func, ++ &tq_arg, TQ_SLEEP, &tqes[j]); ++ id = tqes[j].tqent_id; ++ } else { ++ id = taskq_dispatch(tq, splat_taskq_test4_func, ++ &tq_arg, TQ_SLEEP); ++ } ++ ++ if (id == 0) { ++ splat_vprint(file, SPLAT_TASKQ_TEST4_NAME, ++ "Taskq '%s' function '%s' dispatch " ++ "%d failed\n", tq_arg.name, ++ sym2str(splat_taskq_test4_func), j); ++ rc = -EINVAL; ++ goto out; ++ } ++ } ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST4_NAME, "Taskq '%s' " ++ "waiting for %d dispatches\n", tq_arg.name, i); ++ taskq_wait(tq); ++ splat_vprint(file, SPLAT_TASKQ_TEST4_NAME, "Taskq '%s' " ++ "%d/%d dispatches finished\n", tq_arg.name, ++ atomic_read(&tq_arg.count), i); ++ if (atomic_read(&tq_arg.count) != i) { ++ rc = -ERANGE; ++ goto out; ++ ++ } ++ } ++out: ++ splat_vprint(file, SPLAT_TASKQ_TEST4_NAME, "Taskq '%s' destroying\n", ++ tq_arg.name); ++ taskq_destroy(tq); ++ ++out_free: ++ kfree(tqes); ++ ++ return rc; ++} ++ ++static int ++splat_taskq_test4_impl(struct file *file, void *arg, boolean_t prealloc) ++{ ++ int rc; ++ ++ rc = splat_taskq_test4_common(file, arg, 50, INT_MAX, 1024, prealloc); ++ if (rc) ++ return rc; ++ ++ rc = splat_taskq_test4_common(file, arg, 1, 1, 32, prealloc); ++ ++ return rc; ++} ++ ++static int ++splat_taskq_test4(struct file *file, void *arg) ++{ ++ int rc; ++ ++ rc = splat_taskq_test4_impl(file, arg, B_FALSE); ++ if (rc) ++ return rc; ++ ++ rc = splat_taskq_test4_impl(file, arg, B_TRUE); ++ ++ return rc; ++} ++ ++/* ++ * Create a taskq and dispatch a specific sequence of tasks carefully ++ * crafted to validate the order in which tasks are processed. When ++ * there are multiple worker threads each thread will process the ++ * next pending task as soon as it completes its current task. This ++ * means that tasks do not strictly complete in order in which they ++ * were dispatched (increasing task id). This is fine but we need to ++ * verify that taskq_wait_id() blocks until the passed task id and all ++ * lower task ids complete. We do this by dispatching the following ++ * specific sequence of tasks each of which block for N time units. 
++ * We then use taskq_wait_id() to unblock at specific task id and ++ * verify the only the expected task ids have completed and in the ++ * correct order. The two cases of interest are: ++ * ++ * 1) Task ids larger than the waited for task id can run and ++ * complete as long as there is an available worker thread. ++ * 2) All task ids lower than the waited one must complete before ++ * unblocking even if the waited task id itself has completed. ++ * ++ * The following table shows each task id and how they will be ++ * scheduled. Each rows represent one time unit and each column ++ * one of the three worker threads. The places taskq_wait_id() ++ * must unblock for a specific id are identified as well as the ++ * task ids which must have completed and their order. ++ * ++ * +-----+ <--- taskq_wait_id(tq, 8) unblocks ++ * | | Required Completion Order: 1,2,4,5,3,8,6,7 ++ * +-----+ | ++ * | | | ++ * | | +-----+ ++ * | | | 8 | ++ * | | +-----+ <--- taskq_wait_id(tq, 3) unblocks ++ * | | 7 | | Required Completion Order: 1,2,4,5,3 ++ * | +-----+ | ++ * | 6 | | | ++ * +-----+ | | ++ * | | 5 | | ++ * | +-----+ | ++ * | 4 | | | ++ * +-----+ | | ++ * | 1 | 2 | 3 | ++ * +-----+-----+-----+ ++ * ++ */ ++static void ++splat_taskq_test5_func(void *arg) ++{ ++ splat_taskq_id_t *tq_id = (splat_taskq_id_t *)arg; ++ splat_taskq_arg_t *tq_arg = tq_id->arg; ++ int factor; ++ ++ /* Delays determined by above table */ ++ switch (tq_id->id) { ++ default: factor = 0; break; ++ case 1: case 8: factor = 1; break; ++ case 2: case 4: case 5: factor = 2; break; ++ case 6: case 7: factor = 4; break; ++ case 3: factor = 5; break; ++ } ++ ++ msleep(factor * 100); ++ splat_vprint(tq_arg->file, tq_arg->name, ++ "Taskqid %d complete for taskq '%s'\n", ++ tq_id->id, tq_arg->name); ++ ++ spin_lock(&tq_arg->lock); ++ tq_arg->order[tq_arg->flag] = tq_id->id; ++ tq_arg->flag++; ++ spin_unlock(&tq_arg->lock); ++} ++ ++static int ++splat_taskq_test_order(splat_taskq_arg_t *tq_arg, int *order) ++{ ++ int i, j; ++ ++ for (i = 0; i < SPLAT_TASKQ_ORDER_MAX; i++) { ++ if (tq_arg->order[i] != order[i]) { ++ splat_vprint(tq_arg->file, tq_arg->name, ++ "Taskq '%s' incorrect completion " ++ "order\n", tq_arg->name); ++ splat_vprint(tq_arg->file, tq_arg->name, ++ "%s", "Expected { "); ++ ++ for (j = 0; j < SPLAT_TASKQ_ORDER_MAX; j++) ++ splat_print(tq_arg->file, "%d ", order[j]); ++ ++ splat_print(tq_arg->file, "%s", "}\n"); ++ splat_vprint(tq_arg->file, tq_arg->name, ++ "%s", "Got { "); ++ ++ for (j = 0; j < SPLAT_TASKQ_ORDER_MAX; j++) ++ splat_print(tq_arg->file, "%d ", ++ tq_arg->order[j]); ++ ++ splat_print(tq_arg->file, "%s", "}\n"); ++ return -EILSEQ; ++ } ++ } ++ ++ splat_vprint(tq_arg->file, tq_arg->name, ++ "Taskq '%s' validated correct completion order\n", ++ tq_arg->name); ++ ++ return 0; ++} ++ ++static int ++splat_taskq_test5_impl(struct file *file, void *arg, boolean_t prealloc) ++{ ++ taskq_t *tq; ++ taskqid_t id; ++ splat_taskq_id_t tq_id[SPLAT_TASKQ_ORDER_MAX]; ++ splat_taskq_arg_t tq_arg; ++ int order1[SPLAT_TASKQ_ORDER_MAX] = { 1,2,4,5,3,0,0,0 }; ++ int order2[SPLAT_TASKQ_ORDER_MAX] = { 1,2,4,5,3,8,6,7 }; ++ taskq_ent_t tqes[SPLAT_TASKQ_ORDER_MAX]; ++ int i, rc = 0; ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST5_NAME, ++ "Taskq '%s' creating (%s dispatch)\n", ++ SPLAT_TASKQ_TEST5_NAME, ++ prealloc ? 
"prealloc" : "dynamic"); ++ if ((tq = taskq_create(SPLAT_TASKQ_TEST5_NAME, 3, maxclsyspri, ++ 50, INT_MAX, TASKQ_PREPOPULATE)) == NULL) { ++ splat_vprint(file, SPLAT_TASKQ_TEST5_NAME, ++ "Taskq '%s' create failed\n", ++ SPLAT_TASKQ_TEST5_NAME); ++ return -EINVAL; ++ } ++ ++ tq_arg.flag = 0; ++ memset(&tq_arg.order, 0, sizeof(int) * SPLAT_TASKQ_ORDER_MAX); ++ spin_lock_init(&tq_arg.lock); ++ tq_arg.file = file; ++ tq_arg.name = SPLAT_TASKQ_TEST5_NAME; ++ ++ for (i = 0; i < SPLAT_TASKQ_ORDER_MAX; i++) { ++ taskq_init_ent(&tqes[i]); ++ ++ tq_id[i].id = i + 1; ++ tq_id[i].arg = &tq_arg; ++ ++ if (prealloc) { ++ taskq_dispatch_ent(tq, splat_taskq_test5_func, ++ &tq_id[i], TQ_SLEEP, &tqes[i]); ++ id = tqes[i].tqent_id; ++ } else { ++ id = taskq_dispatch(tq, splat_taskq_test5_func, ++ &tq_id[i], TQ_SLEEP); ++ } ++ ++ if (id == 0) { ++ splat_vprint(file, SPLAT_TASKQ_TEST5_NAME, ++ "Taskq '%s' function '%s' dispatch failed\n", ++ tq_arg.name, sym2str(splat_taskq_test5_func)); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ if (tq_id[i].id != id) { ++ splat_vprint(file, SPLAT_TASKQ_TEST5_NAME, ++ "Taskq '%s' expected taskqid %d got %d\n", ++ tq_arg.name, (int)tq_id[i].id, (int)id); ++ rc = -EINVAL; ++ goto out; ++ } ++ } ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST5_NAME, "Taskq '%s' " ++ "waiting for taskqid %d completion\n", tq_arg.name, 3); ++ taskq_wait_id(tq, 3); ++ if ((rc = splat_taskq_test_order(&tq_arg, order1))) ++ goto out; ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST5_NAME, "Taskq '%s' " ++ "waiting for taskqid %d completion\n", tq_arg.name, 8); ++ taskq_wait_id(tq, 8); ++ rc = splat_taskq_test_order(&tq_arg, order2); ++ ++out: ++ splat_vprint(file, SPLAT_TASKQ_TEST5_NAME, ++ "Taskq '%s' destroying\n", tq_arg.name); ++ taskq_destroy(tq); ++ ++ return rc; ++} ++ ++static int ++splat_taskq_test5(struct file *file, void *arg) ++{ ++ int rc; ++ ++ rc = splat_taskq_test5_impl(file, arg, B_FALSE); ++ if (rc) ++ return rc; ++ ++ rc = splat_taskq_test5_impl(file, arg, B_TRUE); ++ ++ return rc; ++} ++ ++/* ++ * Create a single task queue with three threads. Dispatch 8 tasks, ++ * setting TQ_FRONT on only the last three. Sleep after ++ * dispatching tasks 1-3 to ensure they will run and hold the threads ++ * busy while we dispatch the remaining tasks. Verify that tasks 6-8 ++ * run before task 4-5. ++ * ++ * The following table shows each task id and how they will be ++ * scheduled. Each rows represent one time unit and each column ++ * one of the three worker threads. 
++ * ++ * +-----+ ++ * | | ++ * +-----+ | ++ * | | 5 +-----+ ++ * | | | | ++ * | +-----| | ++ * | 4 | | | ++ * +-----+ | 8 | ++ * | | | | ++ * | | 7 +-----+ ++ * | | | | ++ * | |-----+ | ++ * | 6 | | | ++ * +-----+ | | ++ * | | | | ++ * | 1 | 2 | 3 | ++ * +-----+-----+-----+ ++ * ++ */ ++static void ++splat_taskq_test6_func(void *arg) ++{ ++ splat_taskq_id_t *tq_id = (splat_taskq_id_t *)arg; ++ splat_taskq_arg_t *tq_arg = tq_id->arg; ++ int factor; ++ ++ /* Delays determined by above table */ ++ switch (tq_id->id) { ++ default: factor = 0; break; ++ case 1: factor = 2; break; ++ case 2: case 4: case 5: factor = 4; break; ++ case 6: case 7: case 8: factor = 5; break; ++ case 3: factor = 6; break; ++ } ++ ++ msleep(factor * 100); ++ ++ splat_vprint(tq_arg->file, tq_arg->name, ++ "Taskqid %d complete for taskq '%s'\n", ++ tq_id->id, tq_arg->name); ++ ++ spin_lock(&tq_arg->lock); ++ tq_arg->order[tq_arg->flag] = tq_id->id; ++ tq_arg->flag++; ++ spin_unlock(&tq_arg->lock); ++} ++ ++static int ++splat_taskq_test6_impl(struct file *file, void *arg, boolean_t prealloc) ++{ ++ taskq_t *tq; ++ taskqid_t id; ++ splat_taskq_id_t tq_id[SPLAT_TASKQ_ORDER_MAX]; ++ splat_taskq_arg_t tq_arg; ++ int order[SPLAT_TASKQ_ORDER_MAX] = { 1,2,3,6,7,8,4,5 }; ++ taskq_ent_t tqes[SPLAT_TASKQ_ORDER_MAX]; ++ int i, rc = 0; ++ uint_t tflags; ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST6_NAME, ++ "Taskq '%s' creating (%s dispatch)\n", ++ SPLAT_TASKQ_TEST6_NAME, ++ prealloc ? "prealloc" : "dynamic"); ++ if ((tq = taskq_create(SPLAT_TASKQ_TEST6_NAME, 3, maxclsyspri, ++ 50, INT_MAX, TASKQ_PREPOPULATE)) == NULL) { ++ splat_vprint(file, SPLAT_TASKQ_TEST6_NAME, ++ "Taskq '%s' create failed\n", ++ SPLAT_TASKQ_TEST6_NAME); ++ return -EINVAL; ++ } ++ ++ tq_arg.flag = 0; ++ memset(&tq_arg.order, 0, sizeof(int) * SPLAT_TASKQ_ORDER_MAX); ++ spin_lock_init(&tq_arg.lock); ++ tq_arg.file = file; ++ tq_arg.name = SPLAT_TASKQ_TEST6_NAME; ++ ++ for (i = 0; i < SPLAT_TASKQ_ORDER_MAX; i++) { ++ taskq_init_ent(&tqes[i]); ++ ++ tq_id[i].id = i + 1; ++ tq_id[i].arg = &tq_arg; ++ tflags = TQ_SLEEP; ++ if (i > 4) ++ tflags |= TQ_FRONT; ++ ++ if (prealloc) { ++ taskq_dispatch_ent(tq, splat_taskq_test6_func, ++ &tq_id[i], tflags, &tqes[i]); ++ id = tqes[i].tqent_id; ++ } else { ++ id = taskq_dispatch(tq, splat_taskq_test6_func, ++ &tq_id[i], tflags); ++ } ++ ++ if (id == 0) { ++ splat_vprint(file, SPLAT_TASKQ_TEST6_NAME, ++ "Taskq '%s' function '%s' dispatch failed\n", ++ tq_arg.name, sym2str(splat_taskq_test6_func)); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ if (tq_id[i].id != id) { ++ splat_vprint(file, SPLAT_TASKQ_TEST6_NAME, ++ "Taskq '%s' expected taskqid %d got %d\n", ++ tq_arg.name, (int)tq_id[i].id, (int)id); ++ rc = -EINVAL; ++ goto out; ++ } ++ /* Sleep to let tasks 1-3 start executing. 
*/ ++ if ( i == 2 ) ++ msleep(100); ++ } ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST6_NAME, "Taskq '%s' " ++ "waiting for taskqid %d completion\n", tq_arg.name, ++ SPLAT_TASKQ_ORDER_MAX); ++ taskq_wait_id(tq, SPLAT_TASKQ_ORDER_MAX); ++ rc = splat_taskq_test_order(&tq_arg, order); ++ ++out: ++ splat_vprint(file, SPLAT_TASKQ_TEST6_NAME, ++ "Taskq '%s' destroying\n", tq_arg.name); ++ taskq_destroy(tq); ++ ++ return rc; ++} ++ ++static int ++splat_taskq_test6(struct file *file, void *arg) ++{ ++ int rc; ++ ++ rc = splat_taskq_test6_impl(file, arg, B_FALSE); ++ if (rc) ++ return rc; ++ ++ rc = splat_taskq_test6_impl(file, arg, B_TRUE); ++ ++ return rc; ++} ++ ++static void ++splat_taskq_test7_func(void *arg) ++{ ++ splat_taskq_arg_t *tq_arg = (splat_taskq_arg_t *)arg; ++ taskqid_t id; ++ ++ ASSERT(tq_arg); ++ ++ if (tq_arg->depth >= SPLAT_TASKQ_DEPTH_MAX) ++ return; ++ ++ tq_arg->depth++; ++ ++ splat_vprint(tq_arg->file, SPLAT_TASKQ_TEST7_NAME, ++ "Taskq '%s' function '%s' dispatching (depth = %u)\n", ++ tq_arg->name, sym2str(splat_taskq_test7_func), ++ tq_arg->depth); ++ ++ if (tq_arg->tqe) { ++ VERIFY(taskq_empty_ent(tq_arg->tqe)); ++ taskq_dispatch_ent(tq_arg->tq, splat_taskq_test7_func, ++ tq_arg, TQ_SLEEP, tq_arg->tqe); ++ id = tq_arg->tqe->tqent_id; ++ } else { ++ id = taskq_dispatch(tq_arg->tq, splat_taskq_test7_func, ++ tq_arg, TQ_SLEEP); ++ } ++ ++ if (id == 0) { ++ splat_vprint(tq_arg->file, SPLAT_TASKQ_TEST7_NAME, ++ "Taskq '%s' function '%s' dispatch failed " ++ "(depth = %u)\n", tq_arg->name, ++ sym2str(splat_taskq_test7_func), tq_arg->depth); ++ tq_arg->flag = -EINVAL; ++ return; ++ } ++} ++ ++static int ++splat_taskq_test7_impl(struct file *file, void *arg, boolean_t prealloc) ++{ ++ taskq_t *tq; ++ taskq_ent_t tqe; ++ splat_taskq_arg_t tq_arg; ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST7_NAME, ++ "Taskq '%s' creating (%s dispatch)\n", ++ SPLAT_TASKQ_TEST7_NAME, ++ prealloc ? "prealloc" : "dynamic"); ++ if ((tq = taskq_create(SPLAT_TASKQ_TEST7_NAME, 1, maxclsyspri, ++ 50, INT_MAX, TASKQ_PREPOPULATE)) == NULL) { ++ splat_vprint(file, SPLAT_TASKQ_TEST7_NAME, ++ "Taskq '%s' create failed\n", ++ SPLAT_TASKQ_TEST7_NAME); ++ return -EINVAL; ++ } ++ ++ tq_arg.depth = 0; ++ tq_arg.flag = 0; ++ tq_arg.id = 0; ++ tq_arg.file = file; ++ tq_arg.name = SPLAT_TASKQ_TEST7_NAME; ++ tq_arg.tq = tq; ++ ++ if (prealloc) { ++ taskq_init_ent(&tqe); ++ tq_arg.tqe = &tqe; ++ } else { ++ tq_arg.tqe = NULL; ++ } ++ ++ splat_taskq_test7_func(&tq_arg); ++ ++ if (tq_arg.flag == 0) { ++ splat_vprint(file, SPLAT_TASKQ_TEST7_NAME, ++ "Taskq '%s' waiting\n", tq_arg.name); ++ taskq_wait_id(tq, SPLAT_TASKQ_DEPTH_MAX); ++ } ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST7_NAME, ++ "Taskq '%s' destroying\n", tq_arg.name); ++ taskq_destroy(tq); ++ ++ return tq_arg.depth == SPLAT_TASKQ_DEPTH_MAX ? 0 : -EINVAL; ++} ++ ++static int ++splat_taskq_test7(struct file *file, void *arg) ++{ ++ int rc; ++ ++ rc = splat_taskq_test7_impl(file, arg, B_FALSE); ++ if (rc) ++ return rc; ++ ++ rc = splat_taskq_test7_impl(file, arg, B_TRUE); ++ ++ return rc; ++} ++ ++/* ++ * Create a taskq with 100 threads and dispatch a huge number of trivial ++ * tasks to generate contention on tq->tq_lock. This test should always ++ * pass. The purpose is to provide a benchmark for measuring the ++ * effectiveness of taskq optimizations. 
++ */ ++static void ++splat_taskq_test8_func(void *arg) ++{ ++ splat_taskq_arg_t *tq_arg = (splat_taskq_arg_t *)arg; ++ ASSERT(tq_arg); ++ ++ atomic_inc(&tq_arg->count); ++} ++ ++#define TEST8_NUM_TASKS 0x20000 ++#define TEST8_THREADS_PER_TASKQ 100 ++ ++static int ++splat_taskq_test8_common(struct file *file, void *arg, int minalloc, ++ int maxalloc) ++{ ++ taskq_t *tq; ++ taskqid_t id; ++ splat_taskq_arg_t tq_arg; ++ taskq_ent_t **tqes; ++ int i, j, rc = 0; ++ ++ tqes = vmalloc(sizeof(*tqes) * TEST8_NUM_TASKS); ++ if (tqes == NULL) ++ return -ENOMEM; ++ memset(tqes, 0, sizeof(*tqes) * TEST8_NUM_TASKS); ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST8_NAME, ++ "Taskq '%s' creating (%d/%d/%d)\n", ++ SPLAT_TASKQ_TEST8_NAME, ++ minalloc, maxalloc, TEST8_NUM_TASKS); ++ if ((tq = taskq_create(SPLAT_TASKQ_TEST8_NAME, TEST8_THREADS_PER_TASKQ, ++ maxclsyspri, minalloc, maxalloc, ++ TASKQ_PREPOPULATE)) == NULL) { ++ splat_vprint(file, SPLAT_TASKQ_TEST8_NAME, ++ "Taskq '%s' create failed\n", ++ SPLAT_TASKQ_TEST8_NAME); ++ rc = -EINVAL; ++ goto out_free; ++ } ++ ++ tq_arg.file = file; ++ tq_arg.name = SPLAT_TASKQ_TEST8_NAME; ++ ++ atomic_set(&tq_arg.count, 0); ++ for (i = 0; i < TEST8_NUM_TASKS; i++) { ++ tqes[i] = kmalloc(sizeof(taskq_ent_t), GFP_KERNEL); ++ if (tqes[i] == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ taskq_init_ent(tqes[i]); ++ ++ taskq_dispatch_ent(tq, splat_taskq_test8_func, ++ &tq_arg, TQ_SLEEP, tqes[i]); ++ ++ id = tqes[i]->tqent_id; ++ ++ if (id == 0) { ++ splat_vprint(file, SPLAT_TASKQ_TEST8_NAME, ++ "Taskq '%s' function '%s' dispatch " ++ "%d failed\n", tq_arg.name, ++ sym2str(splat_taskq_test8_func), i); ++ rc = -EINVAL; ++ goto out; ++ } ++ } ++ ++ splat_vprint(file, SPLAT_TASKQ_TEST8_NAME, "Taskq '%s' " ++ "waiting for %d dispatches\n", tq_arg.name, ++ TEST8_NUM_TASKS); ++ taskq_wait(tq); ++ splat_vprint(file, SPLAT_TASKQ_TEST8_NAME, "Taskq '%s' " ++ "%d/%d dispatches finished\n", tq_arg.name, ++ atomic_read(&tq_arg.count), TEST8_NUM_TASKS); ++ ++ if (atomic_read(&tq_arg.count) != TEST8_NUM_TASKS) ++ rc = -ERANGE; ++ ++out: ++ splat_vprint(file, SPLAT_TASKQ_TEST8_NAME, "Taskq '%s' destroying\n", ++ tq_arg.name); ++ taskq_destroy(tq); ++out_free: ++ for (j = 0; j < TEST8_NUM_TASKS && tqes[j] != NULL; j++) ++ kfree(tqes[j]); ++ vfree(tqes); ++ ++ return rc; ++} ++ ++static int ++splat_taskq_test8(struct file *file, void *arg) ++{ ++ int rc; ++ ++ rc = splat_taskq_test8_common(file, arg, 1, 100); ++ ++ return rc; ++} ++ ++splat_subsystem_t * ++splat_taskq_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_TASKQ_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_TASKQ_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_TASKQ; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_TASKQ_TEST1_NAME, SPLAT_TASKQ_TEST1_DESC, ++ SPLAT_TASKQ_TEST1_ID, splat_taskq_test1); ++ SPLAT_TEST_INIT(sub, SPLAT_TASKQ_TEST2_NAME, SPLAT_TASKQ_TEST2_DESC, ++ SPLAT_TASKQ_TEST2_ID, splat_taskq_test2); ++ SPLAT_TEST_INIT(sub, SPLAT_TASKQ_TEST3_NAME, SPLAT_TASKQ_TEST3_DESC, ++ SPLAT_TASKQ_TEST3_ID, splat_taskq_test3); ++ SPLAT_TEST_INIT(sub, SPLAT_TASKQ_TEST4_NAME, SPLAT_TASKQ_TEST4_DESC, ++ SPLAT_TASKQ_TEST4_ID, splat_taskq_test4); ++ SPLAT_TEST_INIT(sub, SPLAT_TASKQ_TEST5_NAME, SPLAT_TASKQ_TEST5_DESC, ++ SPLAT_TASKQ_TEST5_ID, splat_taskq_test5); ++ 
SPLAT_TEST_INIT(sub, SPLAT_TASKQ_TEST6_NAME, SPLAT_TASKQ_TEST6_DESC, ++ SPLAT_TASKQ_TEST6_ID, splat_taskq_test6); ++ SPLAT_TEST_INIT(sub, SPLAT_TASKQ_TEST7_NAME, SPLAT_TASKQ_TEST7_DESC, ++ SPLAT_TASKQ_TEST7_ID, splat_taskq_test7); ++ SPLAT_TEST_INIT(sub, SPLAT_TASKQ_TEST8_NAME, SPLAT_TASKQ_TEST8_DESC, ++ SPLAT_TASKQ_TEST8_ID, splat_taskq_test8); ++ ++ return sub; ++} ++ ++void ++splat_taskq_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ SPLAT_TEST_FINI(sub, SPLAT_TASKQ_TEST8_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_TASKQ_TEST7_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_TASKQ_TEST6_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_TASKQ_TEST5_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_TASKQ_TEST4_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_TASKQ_TEST3_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_TASKQ_TEST2_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_TASKQ_TEST1_ID); ++ ++ kfree(sub); ++} ++ ++int ++splat_taskq_id(void) { ++ return SPLAT_SUBSYSTEM_TASKQ; ++} +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-thread.c linux-3.2.33-go/spl/splat/splat-thread.c +--- linux-3.2.33-go.orig/spl/splat/splat-thread.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-thread.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,386 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Thread Tests. 
++\*****************************************************************************/ ++ ++#include ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_THREAD_NAME "thread" ++#define SPLAT_THREAD_DESC "Kernel Thread Tests" ++ ++#define SPLAT_THREAD_TEST1_ID 0x0601 ++#define SPLAT_THREAD_TEST1_NAME "create" ++#define SPLAT_THREAD_TEST1_DESC "Validate thread creation" ++ ++#define SPLAT_THREAD_TEST2_ID 0x0602 ++#define SPLAT_THREAD_TEST2_NAME "exit" ++#define SPLAT_THREAD_TEST2_DESC "Validate thread exit" ++ ++#define SPLAT_THREAD_TEST3_ID 0x6003 ++#define SPLAT_THREAD_TEST3_NAME "tsd" ++#define SPLAT_THREAD_TEST3_DESC "Validate thread specific data" ++ ++#define SPLAT_THREAD_TEST_MAGIC 0x4488CC00UL ++#define SPLAT_THREAD_TEST_KEYS 32 ++#define SPLAT_THREAD_TEST_THREADS 16 ++ ++typedef struct thread_priv { ++ unsigned long tp_magic; ++ struct file *tp_file; ++ spinlock_t tp_lock; ++ wait_queue_head_t tp_waitq; ++ uint_t tp_keys[SPLAT_THREAD_TEST_KEYS]; ++ int tp_rc; ++ int tp_count; ++ int tp_dtor_count; ++} thread_priv_t; ++ ++static int ++splat_thread_rc(thread_priv_t *tp, int rc) ++{ ++ int ret; ++ ++ spin_lock(&tp->tp_lock); ++ ret = (tp->tp_rc == rc); ++ spin_unlock(&tp->tp_lock); ++ ++ return ret; ++} ++ ++static int ++splat_thread_count(thread_priv_t *tp, int count) ++{ ++ int ret; ++ ++ spin_lock(&tp->tp_lock); ++ ret = (tp->tp_count == count); ++ spin_unlock(&tp->tp_lock); ++ ++ return ret; ++} ++ ++static void ++splat_thread_work1(void *priv) ++{ ++ thread_priv_t *tp = (thread_priv_t *)priv; ++ ++ spin_lock(&tp->tp_lock); ++ ASSERT(tp->tp_magic == SPLAT_THREAD_TEST_MAGIC); ++ tp->tp_rc = 1; ++ wake_up(&tp->tp_waitq); ++ spin_unlock(&tp->tp_lock); ++ ++ thread_exit(); ++} ++ ++static int ++splat_thread_test1(struct file *file, void *arg) ++{ ++ thread_priv_t tp; ++ kthread_t *thr; ++ ++ tp.tp_magic = SPLAT_THREAD_TEST_MAGIC; ++ tp.tp_file = file; ++ spin_lock_init(&tp.tp_lock); ++ init_waitqueue_head(&tp.tp_waitq); ++ tp.tp_rc = 0; ++ ++ thr = (kthread_t *)thread_create(NULL, 0, splat_thread_work1, &tp, 0, ++ &p0, TS_RUN, minclsyspri); ++ /* Must never fail under Solaris, but we check anyway since this ++ * can happen in the linux SPL, we may want to change this behavior */ ++ if (thr == NULL) ++ return -ESRCH; ++ ++ /* Sleep until the thread sets tp.tp_rc == 1 */ ++ wait_event(tp.tp_waitq, splat_thread_rc(&tp, 1)); ++ ++ splat_vprint(file, SPLAT_THREAD_TEST1_NAME, "%s", ++ "Thread successfully started properly\n"); ++ return 0; ++} ++ ++static void ++splat_thread_work2(void *priv) ++{ ++ thread_priv_t *tp = (thread_priv_t *)priv; ++ ++ spin_lock(&tp->tp_lock); ++ ASSERT(tp->tp_magic == SPLAT_THREAD_TEST_MAGIC); ++ tp->tp_rc = 1; ++ wake_up(&tp->tp_waitq); ++ spin_unlock(&tp->tp_lock); ++ ++ thread_exit(); ++ ++ /* The following code is unreachable when thread_exit() is ++ * working properly, which is exactly what we're testing */ ++ spin_lock(&tp->tp_lock); ++ tp->tp_rc = 2; ++ wake_up(&tp->tp_waitq); ++ spin_unlock(&tp->tp_lock); ++} ++ ++static int ++splat_thread_test2(struct file *file, void *arg) ++{ ++ thread_priv_t tp; ++ kthread_t *thr; ++ int rc = 0; ++ ++ tp.tp_magic = SPLAT_THREAD_TEST_MAGIC; ++ tp.tp_file = file; ++ spin_lock_init(&tp.tp_lock); ++ init_waitqueue_head(&tp.tp_waitq); ++ tp.tp_rc = 0; ++ ++ thr = (kthread_t *)thread_create(NULL, 0, splat_thread_work2, &tp, 0, ++ &p0, TS_RUN, minclsyspri); ++ /* Must never fail under Solaris, but we check anyway since this ++ * can happen in the linux SPL, we may want to change this behavior */ ++ if (thr == NULL) ++ 
return -ESRCH; ++ ++ /* Sleep until the thread sets tp.tp_rc == 1 */ ++ wait_event(tp.tp_waitq, splat_thread_rc(&tp, 1)); ++ ++ /* Sleep until the thread sets tp.tp_rc == 2, or until we hit ++ * the timeout. If thread exit is working properly we should ++ * hit the timeout and never see to.tp_rc == 2. */ ++ rc = wait_event_timeout(tp.tp_waitq, splat_thread_rc(&tp, 2), HZ / 10); ++ if (rc > 0) { ++ rc = -EINVAL; ++ splat_vprint(file, SPLAT_THREAD_TEST2_NAME, "%s", ++ "Thread did not exit properly at thread_exit()\n"); ++ } else { ++ splat_vprint(file, SPLAT_THREAD_TEST2_NAME, "%s", ++ "Thread successfully exited at thread_exit()\n"); ++ } ++ ++ return rc; ++} ++ ++static void ++splat_thread_work3_common(thread_priv_t *tp) ++{ ++ ulong_t rnd; ++ int i, rc = 0; ++ ++ /* set a unique value for each key using a random value */ ++ get_random_bytes((void *)&rnd, 4); ++ for (i = 0; i < SPLAT_THREAD_TEST_KEYS; i++) ++ tsd_set(tp->tp_keys[i], (void *)(i + rnd)); ++ ++ /* verify the unique value for each key */ ++ for (i = 0; i < SPLAT_THREAD_TEST_KEYS; i++) ++ if (tsd_get(tp->tp_keys[i]) != (void *)(i + rnd)) ++ rc = -EINVAL; ++ ++ /* set the value to thread_priv_t for use by the destructor */ ++ for (i = 0; i < SPLAT_THREAD_TEST_KEYS; i++) ++ tsd_set(tp->tp_keys[i], (void *)tp); ++ ++ spin_lock(&tp->tp_lock); ++ if (rc && !tp->tp_rc) ++ tp->tp_rc = rc; ++ ++ tp->tp_count++; ++ wake_up_all(&tp->tp_waitq); ++ spin_unlock(&tp->tp_lock); ++} ++ ++static void ++splat_thread_work3_wait(void *priv) ++{ ++ thread_priv_t *tp = (thread_priv_t *)priv; ++ ++ ASSERT(tp->tp_magic == SPLAT_THREAD_TEST_MAGIC); ++ splat_thread_work3_common(tp); ++ wait_event(tp->tp_waitq, splat_thread_count(tp, 0)); ++ thread_exit(); ++} ++ ++static void ++splat_thread_work3_exit(void *priv) ++{ ++ thread_priv_t *tp = (thread_priv_t *)priv; ++ ++ ASSERT(tp->tp_magic == SPLAT_THREAD_TEST_MAGIC); ++ splat_thread_work3_common(tp); ++ thread_exit(); ++} ++ ++static void ++splat_thread_dtor3(void *priv) ++{ ++ thread_priv_t *tp = (thread_priv_t *)priv; ++ ++ ASSERT(tp->tp_magic == SPLAT_THREAD_TEST_MAGIC); ++ spin_lock(&tp->tp_lock); ++ tp->tp_dtor_count++; ++ spin_unlock(&tp->tp_lock); ++} ++ ++/* ++ * Create threads which set and verify SPLAT_THREAD_TEST_KEYS number of ++ * keys. These threads may then exit by calling thread_exit() which calls ++ * tsd_exit() resulting in all their thread specific data being reclaimed. ++ * Alternately, the thread may block in which case the thread specific ++ * data will be reclaimed as part of tsd_destroy(). In either case all ++ * thread specific data must be reclaimed, this is verified by ensuring ++ * the registered destructor is called the correct number of times. ++ */ ++static int ++splat_thread_test3(struct file *file, void *arg) ++{ ++ int i, rc = 0, expected, wait_count = 0, exit_count = 0; ++ thread_priv_t tp; ++ ++ tp.tp_magic = SPLAT_THREAD_TEST_MAGIC; ++ tp.tp_file = file; ++ spin_lock_init(&tp.tp_lock); ++ init_waitqueue_head(&tp.tp_waitq); ++ tp.tp_rc = 0; ++ tp.tp_count = 0; ++ tp.tp_dtor_count = 0; ++ ++ for (i = 0; i < SPLAT_THREAD_TEST_KEYS; i++) { ++ tp.tp_keys[i] = 0; ++ tsd_create(&tp.tp_keys[i], splat_thread_dtor3); ++ } ++ ++ /* Start tsd wait threads */ ++ for (i = 0; i < SPLAT_THREAD_TEST_THREADS; i++) { ++ if (thread_create(NULL, 0, splat_thread_work3_wait, ++ &tp, 0, &p0, TS_RUN, minclsyspri)) ++ wait_count++; ++ } ++ ++ /* All wait threads have setup their tsd and are blocking. 
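++ * Each of them registered SPLAT_THREAD_TEST_KEYS (32) values, so no destructor should have run yet; the checks below expect 32 * exit_count destructor calls once the exit threads have called thread_exit(), and 32 * (exit_count + wait_count) after tsd_destroy() reclaims the keys of the still-blocked wait threads.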
*/ ++ wait_event(tp.tp_waitq, splat_thread_count(&tp, wait_count)); ++ ++ if (tp.tp_dtor_count != 0) { ++ splat_vprint(file, SPLAT_THREAD_TEST3_NAME, ++ "Prematurely ran %d tsd destructors\n", tp.tp_dtor_count); ++ if (!rc) ++ rc = -ERANGE; ++ } ++ ++ /* Start tsd exit threads */ ++ for (i = 0; i < SPLAT_THREAD_TEST_THREADS; i++) { ++ if (thread_create(NULL, 0, splat_thread_work3_exit, ++ &tp, 0, &p0, TS_RUN, minclsyspri)) ++ exit_count++; ++ } ++ ++ /* All exit threads verified tsd and are in the process of exiting */ ++ wait_event(tp.tp_waitq,splat_thread_count(&tp, wait_count+exit_count)); ++ msleep(500); ++ ++ expected = (SPLAT_THREAD_TEST_KEYS * exit_count); ++ if (tp.tp_dtor_count != expected) { ++ splat_vprint(file, SPLAT_THREAD_TEST3_NAME, ++ "Expected %d exit tsd destructors but saw %d\n", ++ expected, tp.tp_dtor_count); ++ if (!rc) ++ rc = -ERANGE; ++ } ++ ++ /* Destroy all keys and associated tsd in blocked threads */ ++ for (i = 0; i < SPLAT_THREAD_TEST_KEYS; i++) ++ tsd_destroy(&tp.tp_keys[i]); ++ ++ expected = (SPLAT_THREAD_TEST_KEYS * (exit_count + wait_count)); ++ if (tp.tp_dtor_count != expected) { ++ splat_vprint(file, SPLAT_THREAD_TEST3_NAME, ++ "Expected %d wait+exit tsd destructors but saw %d\n", ++ expected, tp.tp_dtor_count); ++ if (!rc) ++ rc = -ERANGE; ++ } ++ ++ /* Release the remaining wait threads, sleep briefly while they exit */ ++ spin_lock(&tp.tp_lock); ++ tp.tp_count = 0; ++ wake_up_all(&tp.tp_waitq); ++ spin_unlock(&tp.tp_lock); ++ msleep(500); ++ ++ if (tp.tp_rc) { ++ splat_vprint(file, SPLAT_THREAD_TEST3_NAME, ++ "Thread tsd_get()/tsd_set() error %d\n", tp.tp_rc); ++ if (!rc) ++ rc = tp.tp_rc; ++ } else if (!rc) { ++ splat_vprint(file, SPLAT_THREAD_TEST3_NAME, "%s", ++ "Thread specific data verified\n"); ++ } ++ ++ return rc; ++} ++ ++splat_subsystem_t * ++splat_thread_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_THREAD_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_THREAD_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_THREAD; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_THREAD_TEST1_NAME, SPLAT_THREAD_TEST1_DESC, ++ SPLAT_THREAD_TEST1_ID, splat_thread_test1); ++ SPLAT_TEST_INIT(sub, SPLAT_THREAD_TEST2_NAME, SPLAT_THREAD_TEST2_DESC, ++ SPLAT_THREAD_TEST2_ID, splat_thread_test2); ++ SPLAT_TEST_INIT(sub, SPLAT_THREAD_TEST3_NAME, SPLAT_THREAD_TEST3_DESC, ++ SPLAT_THREAD_TEST3_ID, splat_thread_test3); ++ ++ return sub; ++} ++ ++void ++splat_thread_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ SPLAT_TEST_FINI(sub, SPLAT_THREAD_TEST3_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_THREAD_TEST2_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_THREAD_TEST1_ID); ++ ++ kfree(sub); ++} ++ ++int ++splat_thread_id(void) { ++ return SPLAT_SUBSYSTEM_THREAD; ++} +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-time.c linux-3.2.33-go/spl/splat/splat-time.c +--- linux-3.2.33-go.orig/spl/splat/splat-time.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-time.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,117 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. 
++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Time Tests. ++\*****************************************************************************/ ++ ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_TIME_NAME "time" ++#define SPLAT_TIME_DESC "Kernel Time Tests" ++ ++#define SPLAT_TIME_TEST1_ID 0x0801 ++#define SPLAT_TIME_TEST1_NAME "time1" ++#define SPLAT_TIME_TEST1_DESC "HZ Test" ++ ++#define SPLAT_TIME_TEST2_ID 0x0802 ++#define SPLAT_TIME_TEST2_NAME "time2" ++#define SPLAT_TIME_TEST2_DESC "Monotonic Test" ++ ++static int ++splat_time_test1(struct file *file, void *arg) ++{ ++ int myhz = hz; ++ splat_vprint(file, SPLAT_TIME_TEST1_NAME, "hz is %d\n", myhz); ++ return 0; ++} ++ ++static int ++splat_time_test2(struct file *file, void *arg) ++{ ++ hrtime_t tm1, tm2; ++ int i; ++ ++ tm1 = gethrtime(); ++ splat_vprint(file, SPLAT_TIME_TEST2_NAME, "time is %lld\n", tm1); ++ ++ for(i = 0; i < 100; i++) { ++ tm2 = gethrtime(); ++ splat_vprint(file, SPLAT_TIME_TEST2_NAME, "time is %lld\n", tm2); ++ ++ if(tm1 > tm2) { ++ splat_print(file, "%s: gethrtime() is not giving " ++ "monotonically increasing values\n", ++ SPLAT_TIME_TEST2_NAME); ++ return 1; ++ } ++ tm1 = tm2; ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(10); ++ } ++ ++ return 0; ++} ++ ++splat_subsystem_t * ++splat_time_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_TIME_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_TIME_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_TIME; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_TIME_TEST1_NAME, SPLAT_TIME_TEST1_DESC, ++ SPLAT_TIME_TEST1_ID, splat_time_test1); ++ SPLAT_TEST_INIT(sub, SPLAT_TIME_TEST2_NAME, SPLAT_TIME_TEST2_DESC, ++ SPLAT_TIME_TEST2_ID, splat_time_test2); ++ ++ return sub; ++} ++ ++void ++splat_time_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ ++ SPLAT_TEST_FINI(sub, SPLAT_TIME_TEST2_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_TIME_TEST1_ID); ++ ++ kfree(sub); ++} ++ ++int ++splat_time_id(void) ++{ ++ return SPLAT_SUBSYSTEM_TIME; ++} +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-vnode.c linux-3.2.33-go/spl/splat/splat-vnode.c +--- linux-3.2.33-go.orig/spl/splat/splat-vnode.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-vnode.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,445 @@ ++/*****************************************************************************\ ++ * 
Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Vnode Tests. ++\*****************************************************************************/ ++ ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_VNODE_NAME "vnode" ++#define SPLAT_VNODE_DESC "Kernel Vnode Tests" ++ ++#define SPLAT_VNODE_TEST1_ID 0x0901 ++#define SPLAT_VNODE_TEST1_NAME "vn_open" ++#define SPLAT_VNODE_TEST1_DESC "Vn_open Test" ++ ++#define SPLAT_VNODE_TEST2_ID 0x0902 ++#define SPLAT_VNODE_TEST2_NAME "vn_openat" ++#define SPLAT_VNODE_TEST2_DESC "Vn_openat Test" ++ ++#define SPLAT_VNODE_TEST3_ID 0x0903 ++#define SPLAT_VNODE_TEST3_NAME "vn_rdwr" ++#define SPLAT_VNODE_TEST3_DESC "Vn_rdwrt Test" ++ ++#define SPLAT_VNODE_TEST4_ID 0x0904 ++#define SPLAT_VNODE_TEST4_NAME "vn_rename" ++#define SPLAT_VNODE_TEST4_DESC "Vn_rename Test" ++ ++#define SPLAT_VNODE_TEST5_ID 0x0905 ++#define SPLAT_VNODE_TEST5_NAME "vn_getattr" ++#define SPLAT_VNODE_TEST5_DESC "Vn_getattr Test" ++ ++#define SPLAT_VNODE_TEST6_ID 0x0906 ++#define SPLAT_VNODE_TEST6_NAME "vn_sync" ++#define SPLAT_VNODE_TEST6_DESC "Vn_sync Test" ++ ++#define SPLAT_VNODE_TEST_FILE "/etc/fstab" ++#define SPLAT_VNODE_TEST_FILE_AT "etc/fstab" ++#define SPLAT_VNODE_TEST_FILE_RW "/tmp/spl.vnode.tmp" ++#define SPLAT_VNODE_TEST_FILE_RW1 "/tmp/spl.vnode.tmp.1" ++#define SPLAT_VNODE_TEST_FILE_RW2 "/tmp/spl.vnode.tmp.2" ++ ++static int ++splat_vnode_user_cmd(struct file *file, void *arg, ++ char *name, char *cmd) ++{ ++ char sh_path[] = "/bin/sh"; ++ char *argv[] = { sh_path, ++ "-c", ++ cmd, ++ NULL }; ++ char *envp[] = { "HOME=/", ++ "TERM=linux", ++ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", ++ NULL }; ++ int rc; ++ ++ rc = call_usermodehelper(sh_path, argv, envp, 1); ++ if (rc) { ++ splat_vprint(file, name, ++ "Failed command: %s %s %s (%d)\n", ++ argv[0], argv[1], cmd, rc); ++ return -EPERM; ++ } ++ ++ return 0; ++} ++ ++static int ++splat_vnode_unlink_all(struct file *file, void *arg, char *name) ++{ ++ char *cmds[] = { "rm -f " SPLAT_VNODE_TEST_FILE_RW, ++ "rm -f " SPLAT_VNODE_TEST_FILE_RW1, ++ "rm -f " SPLAT_VNODE_TEST_FILE_RW2, ++ NULL }; ++ int i = 0, rc = 0; ++ ++ while (cmds[i] != NULL) { ++ if ((rc = splat_vnode_user_cmd(file, arg, name, cmds[i]))) ++ return rc; ++ ++ i++; ++ } ++ ++ return rc; ++} ++ ++static int ++splat_vnode_test1(struct file *file, void *arg) ++{ ++ vnode_t *vp; ++ int rc; ++ ++ if ((rc = vn_open(SPLAT_VNODE_TEST_FILE, UIO_SYSSPACE, ++ FREAD, 0644, &vp, 0, 0))) { ++ splat_vprint(file, SPLAT_VNODE_TEST1_NAME, ++ "Failed 
to vn_open test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE, rc); ++ return -rc; ++ } ++ ++ rc = VOP_CLOSE(vp, 0, 0, 0, 0, 0); ++ ++ if (rc) { ++ splat_vprint(file, SPLAT_VNODE_TEST1_NAME, ++ "Failed to vn_close test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE, rc); ++ return -rc; ++ } ++ ++ splat_vprint(file, SPLAT_VNODE_TEST1_NAME, "Successfully vn_open'ed " ++ "and vn_closed test file: %s\n", SPLAT_VNODE_TEST_FILE); ++ ++ return -rc; ++} /* splat_vnode_test1() */ ++ ++static int ++splat_vnode_test2(struct file *file, void *arg) ++{ ++ vnode_t *vp; ++ int rc; ++ ++ if ((rc = vn_openat(SPLAT_VNODE_TEST_FILE_AT, UIO_SYSSPACE, ++ FREAD, 0644, &vp, 0, 0, rootdir, 0))) { ++ splat_vprint(file, SPLAT_VNODE_TEST2_NAME, ++ "Failed to vn_openat test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE, rc); ++ return -rc; ++ } ++ ++ rc = VOP_CLOSE(vp, 0, 0, 0, 0, 0); ++ ++ if (rc) { ++ splat_vprint(file, SPLAT_VNODE_TEST2_NAME, ++ "Failed to vn_close test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE, rc); ++ return -rc; ++ } ++ ++ splat_vprint(file, SPLAT_VNODE_TEST2_NAME, "Successfully vn_openat'ed " ++ "and vn_closed test file: %s\n", SPLAT_VNODE_TEST_FILE); ++ ++ return -rc; ++} /* splat_vnode_test2() */ ++ ++static int ++splat_vnode_test3(struct file *file, void *arg) ++{ ++ vnode_t *vp; ++ char buf1[32] = "SPL VNode Interface Test File\n"; ++ char buf2[32] = ""; ++ int rc; ++ ++ if ((rc = splat_vnode_unlink_all(file, arg, SPLAT_VNODE_TEST3_NAME))) ++ return rc; ++ ++ if ((rc = vn_open(SPLAT_VNODE_TEST_FILE_RW, UIO_SYSSPACE, ++ FWRITE | FREAD | FCREAT | FEXCL, ++ 0644, &vp, 0, 0))) { ++ splat_vprint(file, SPLAT_VNODE_TEST3_NAME, ++ "Failed to vn_open test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE_RW, rc); ++ return -rc; ++ } ++ ++ rc = vn_rdwr(UIO_WRITE, vp, buf1, strlen(buf1), 0, ++ UIO_SYSSPACE, 0, RLIM64_INFINITY, 0, NULL); ++ if (rc) { ++ splat_vprint(file, SPLAT_VNODE_TEST3_NAME, ++ "Failed vn_rdwr write of test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE_RW, rc); ++ goto out; ++ } ++ ++ rc = vn_rdwr(UIO_READ, vp, buf2, strlen(buf1), 0, ++ UIO_SYSSPACE, 0, RLIM64_INFINITY, 0, NULL); ++ if (rc) { ++ splat_vprint(file, SPLAT_VNODE_TEST3_NAME, ++ "Failed vn_rdwr read of test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE_RW, rc); ++ goto out; ++ } ++ ++ if (strncmp(buf1, buf2, strlen(buf1))) { ++ rc = EINVAL; ++ splat_vprint(file, SPLAT_VNODE_TEST3_NAME, ++ "Failed strncmp data written does not match " ++ "data read\nWrote: %sRead: %s\n", buf1, buf2); ++ goto out; ++ } ++ ++ rc = 0; ++ splat_vprint(file, SPLAT_VNODE_TEST3_NAME, "Wrote: %s", buf1); ++ splat_vprint(file, SPLAT_VNODE_TEST3_NAME, "Read: %s", buf2); ++ splat_vprint(file, SPLAT_VNODE_TEST3_NAME, "Successfully wrote and " ++ "read expected data pattern to test file: %s\n", ++ SPLAT_VNODE_TEST_FILE_RW); ++ ++out: ++ VOP_CLOSE(vp, 0, 0, 0, 0, 0); ++ vn_remove(SPLAT_VNODE_TEST_FILE_RW, UIO_SYSSPACE, RMFILE); ++ ++ return -rc; ++} /* splat_vnode_test3() */ ++ ++static int ++splat_vnode_test4(struct file *file, void *arg) ++{ ++ vnode_t *vp; ++ char buf1[32] = "SPL VNode Interface Test File\n"; ++ char buf2[32] = ""; ++ int rc; ++ ++ if ((rc = splat_vnode_unlink_all(file, arg, SPLAT_VNODE_TEST4_NAME))) ++ return rc; ++ ++ if ((rc = vn_open(SPLAT_VNODE_TEST_FILE_RW1, UIO_SYSSPACE, ++ FWRITE | FREAD | FCREAT | FEXCL, 0644, &vp, 0, 0))) { ++ splat_vprint(file, SPLAT_VNODE_TEST4_NAME, ++ "Failed to vn_open test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE_RW1, rc); ++ goto out; ++ } ++ ++ rc = vn_rdwr(UIO_WRITE, vp, buf1, strlen(buf1), 0, ++ UIO_SYSSPACE, 0, 
RLIM64_INFINITY, 0, NULL); ++ if (rc) { ++ splat_vprint(file, SPLAT_VNODE_TEST4_NAME, ++ "Failed vn_rdwr write of test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE_RW1, rc); ++ goto out2; ++ } ++ ++ VOP_CLOSE(vp, 0, 0, 0, 0, 0); ++ ++ rc = vn_rename(SPLAT_VNODE_TEST_FILE_RW1,SPLAT_VNODE_TEST_FILE_RW2,0); ++ if (rc) { ++ splat_vprint(file, SPLAT_VNODE_TEST4_NAME, "Failed vn_rename " ++ "%s -> %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE_RW1, ++ SPLAT_VNODE_TEST_FILE_RW2, rc); ++ goto out; ++ } ++ ++ if ((rc = vn_open(SPLAT_VNODE_TEST_FILE_RW2, UIO_SYSSPACE, ++ FREAD | FEXCL, 0644, &vp, 0, 0))) { ++ splat_vprint(file, SPLAT_VNODE_TEST4_NAME, ++ "Failed to vn_open test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE_RW2, rc); ++ goto out; ++ } ++ ++ rc = vn_rdwr(UIO_READ, vp, buf2, strlen(buf1), 0, ++ UIO_SYSSPACE, 0, RLIM64_INFINITY, 0, NULL); ++ if (rc) { ++ splat_vprint(file, SPLAT_VNODE_TEST4_NAME, ++ "Failed vn_rdwr read of test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE_RW2, rc); ++ goto out2; ++ } ++ ++ if (strncmp(buf1, buf2, strlen(buf1))) { ++ rc = EINVAL; ++ splat_vprint(file, SPLAT_VNODE_TEST4_NAME, ++ "Failed strncmp data written does not match " ++ "data read\nWrote: %sRead: %s\n", buf1, buf2); ++ goto out2; ++ } ++ ++ rc = 0; ++ splat_vprint(file, SPLAT_VNODE_TEST4_NAME, "Wrote to %s: %s", ++ SPLAT_VNODE_TEST_FILE_RW1, buf1); ++ splat_vprint(file, SPLAT_VNODE_TEST4_NAME, "Read from %s: %s", ++ SPLAT_VNODE_TEST_FILE_RW2, buf2); ++ splat_vprint(file, SPLAT_VNODE_TEST4_NAME, "Successfully renamed " ++ "test file %s -> %s and verified data pattern\n", ++ SPLAT_VNODE_TEST_FILE_RW1, SPLAT_VNODE_TEST_FILE_RW2); ++out2: ++ VOP_CLOSE(vp, 0, 0, 0, 0, 0); ++out: ++ vn_remove(SPLAT_VNODE_TEST_FILE_RW1, UIO_SYSSPACE, RMFILE); ++ vn_remove(SPLAT_VNODE_TEST_FILE_RW2, UIO_SYSSPACE, RMFILE); ++ ++ return -rc; ++} /* splat_vnode_test4() */ ++ ++static int ++splat_vnode_test5(struct file *file, void *arg) ++{ ++ vnode_t *vp; ++ vattr_t vap; ++ int rc; ++ ++ if ((rc = vn_open(SPLAT_VNODE_TEST_FILE, UIO_SYSSPACE, ++ FREAD, 0644, &vp, 0, 0))) { ++ splat_vprint(file, SPLAT_VNODE_TEST5_NAME, ++ "Failed to vn_open test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE, rc); ++ return -rc; ++ } ++ ++ rc = VOP_GETATTR(vp, &vap, 0, 0, NULL); ++ if (rc) { ++ splat_vprint(file, SPLAT_VNODE_TEST5_NAME, ++ "Failed to vn_getattr test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE, rc); ++ goto out; ++ } ++ ++ if (vap.va_type != VREG) { ++ rc = EINVAL; ++ splat_vprint(file, SPLAT_VNODE_TEST5_NAME, ++ "Failed expected regular file type " ++ "(%d != VREG): %s (%d)\n", vap.va_type, ++ SPLAT_VNODE_TEST_FILE, rc); ++ goto out; ++ } ++ ++ splat_vprint(file, SPLAT_VNODE_TEST1_NAME, "Successfully " ++ "vn_getattr'ed test file: %s\n", SPLAT_VNODE_TEST_FILE); ++ ++out: ++ VOP_CLOSE(vp, 0, 0, 0, 0, 0); ++ ++ return -rc; ++} /* splat_vnode_test5() */ ++ ++static int ++splat_vnode_test6(struct file *file, void *arg) ++{ ++ vnode_t *vp; ++ char buf[32] = "SPL VNode Interface Test File\n"; ++ int rc; ++ ++ if ((rc = splat_vnode_unlink_all(file, arg, SPLAT_VNODE_TEST6_NAME))) ++ return rc; ++ ++ if ((rc = vn_open(SPLAT_VNODE_TEST_FILE_RW, UIO_SYSSPACE, ++ FWRITE | FCREAT | FEXCL, 0644, &vp, 0, 0))) { ++ splat_vprint(file, SPLAT_VNODE_TEST6_NAME, ++ "Failed to vn_open test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE_RW, rc); ++ return -rc; ++ } ++ ++ rc = vn_rdwr(UIO_WRITE, vp, buf, strlen(buf), 0, ++ UIO_SYSSPACE, 0, RLIM64_INFINITY, 0, NULL); ++ if (rc) { ++ splat_vprint(file, SPLAT_VNODE_TEST6_NAME, ++ "Failed vn_rdwr write of test file: %s (%d)\n", 
++ SPLAT_VNODE_TEST_FILE_RW, rc); ++ goto out; ++ } ++ ++ rc = vn_fsync(vp, 0, 0, 0); ++ if (rc) { ++ splat_vprint(file, SPLAT_VNODE_TEST6_NAME, ++ "Failed vn_fsync of test file: %s (%d)\n", ++ SPLAT_VNODE_TEST_FILE_RW, rc); ++ goto out; ++ } ++ ++ rc = 0; ++ splat_vprint(file, SPLAT_VNODE_TEST6_NAME, "Successfully " ++ "fsync'ed test file %s\n", SPLAT_VNODE_TEST_FILE_RW); ++out: ++ VOP_CLOSE(vp, 0, 0, 0, 0, 0); ++ vn_remove(SPLAT_VNODE_TEST_FILE_RW, UIO_SYSSPACE, RMFILE); ++ ++ return -rc; ++} /* splat_vnode_test6() */ ++ ++splat_subsystem_t * ++splat_vnode_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_VNODE_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_VNODE_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_VNODE; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_VNODE_TEST1_NAME, SPLAT_VNODE_TEST1_DESC, ++ SPLAT_VNODE_TEST1_ID, splat_vnode_test1); ++ SPLAT_TEST_INIT(sub, SPLAT_VNODE_TEST2_NAME, SPLAT_VNODE_TEST2_DESC, ++ SPLAT_VNODE_TEST2_ID, splat_vnode_test2); ++ SPLAT_TEST_INIT(sub, SPLAT_VNODE_TEST3_NAME, SPLAT_VNODE_TEST3_DESC, ++ SPLAT_VNODE_TEST3_ID, splat_vnode_test3); ++ SPLAT_TEST_INIT(sub, SPLAT_VNODE_TEST4_NAME, SPLAT_VNODE_TEST4_DESC, ++ SPLAT_VNODE_TEST4_ID, splat_vnode_test4); ++ SPLAT_TEST_INIT(sub, SPLAT_VNODE_TEST5_NAME, SPLAT_VNODE_TEST5_DESC, ++ SPLAT_VNODE_TEST5_ID, splat_vnode_test5); ++ SPLAT_TEST_INIT(sub, SPLAT_VNODE_TEST6_NAME, SPLAT_VNODE_TEST6_DESC, ++ SPLAT_VNODE_TEST6_ID, splat_vnode_test6); ++ ++ return sub; ++} /* splat_vnode_init() */ ++ ++void ++splat_vnode_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ ++ SPLAT_TEST_FINI(sub, SPLAT_VNODE_TEST6_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_VNODE_TEST5_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_VNODE_TEST4_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_VNODE_TEST3_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_VNODE_TEST2_ID); ++ SPLAT_TEST_FINI(sub, SPLAT_VNODE_TEST1_ID); ++ ++ kfree(sub); ++} /* splat_vnode_fini() */ ++ ++int ++splat_vnode_id(void) ++{ ++ return SPLAT_SUBSYSTEM_VNODE; ++} /* splat_vnode_id() */ +diff -uNr linux-3.2.33-go.orig/spl/splat/splat-zlib.c linux-3.2.33-go/spl/splat/splat-zlib.c +--- linux-3.2.33-go.orig/spl/splat/splat-zlib.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl/splat/splat-zlib.c 2012-11-16 23:22:32.409192874 +0100 +@@ -0,0 +1,165 @@ ++/*****************************************************************************\ ++ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. ++ * Copyright (C) 2007 The Regents of the University of California. ++ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). ++ * Written by Brian Behlendorf . ++ * UCRL-CODE-235197 ++ * ++ * This file is part of the SPL, Solaris Porting Layer. ++ * For details, see . ++ * ++ * The SPL is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * The SPL is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * for more details. 
++ * ++ * You should have received a copy of the GNU General Public License along ++ * with the SPL. If not, see . ++ ***************************************************************************** ++ * Solaris Porting LAyer Tests (SPLAT) Zlib Compression Tests. ++\*****************************************************************************/ ++ ++#include ++#include ++#include ++#include "splat-internal.h" ++ ++#define SPLAT_ZLIB_NAME "zlib" ++#define SPLAT_ZLIB_DESC "Zlib Compression Tests" ++ ++#define SPLAT_ZLIB_TEST1_ID 0x0f01 ++#define SPLAT_ZLIB_TEST1_NAME "compress/uncompress" ++#define SPLAT_ZLIB_TEST1_DESC "Compress/Uncompress Test" ++ ++#define BUFFER_SIZE (128 * 1024) ++ ++static int ++splat_zlib_test1_check(struct file *file, void *src, void *dst, void *chk, ++ int level) ++{ ++ size_t dst_len = BUFFER_SIZE; ++ size_t chk_len = BUFFER_SIZE; ++ int rc; ++ ++ memset(dst, 0, BUFFER_SIZE); ++ memset(chk, 0, BUFFER_SIZE); ++ ++ rc = z_compress_level(dst, &dst_len, src, BUFFER_SIZE, level); ++ if (rc != Z_OK) { ++ splat_vprint(file, SPLAT_ZLIB_TEST1_NAME, ++ "Failed level %d z_compress_level(), %d\n", level, rc); ++ return -EINVAL; ++ } ++ ++ rc = z_uncompress(chk, &chk_len, dst, dst_len); ++ if (rc != Z_OK) { ++ splat_vprint(file, SPLAT_ZLIB_TEST1_NAME, ++ "Failed level %d z_uncompress(), %d\n", level, rc); ++ return -EINVAL; ++ } ++ ++ rc = memcmp(src, chk, BUFFER_SIZE); ++ if (rc) { ++ splat_vprint(file, SPLAT_ZLIB_TEST1_NAME, ++ "Failed level %d memcmp()), %d\n", level, rc); ++ return -EINVAL; ++ } ++ ++ splat_vprint(file, SPLAT_ZLIB_TEST1_NAME, ++ "Passed level %d, compressed %d bytes to %d bytes\n", ++ level, BUFFER_SIZE, (int)dst_len); ++ ++ return 0; ++} ++ ++/* ++ * Compress a buffer, uncompress the newly compressed buffer, then ++ * compare it to the original. Do this for all 9 compression levels. ++ */ ++static int ++splat_zlib_test1(struct file *file, void *arg) ++{ ++ void *src = NULL, *dst = NULL, *chk = NULL; ++ int i, rc, level; ++ ++ src = vmalloc(BUFFER_SIZE); ++ if (src == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ dst = vmalloc(BUFFER_SIZE); ++ if (dst == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ chk = vmalloc(BUFFER_SIZE); ++ if (chk == NULL) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ /* Source buffer is a repeating 1024 byte random pattern. 
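++ * Only the first 1 KiB is filled with random bytes; it is then copied into the remaining 127 slots of the 128 KiB buffer, so every zlib level tested below has redundant input it can actually compress.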
*/ ++ random_get_pseudo_bytes(src, sizeof(uint8_t) * 1024); ++ for (i = 1; i < 128; i++) ++ memcpy(src + (i * 1024), src, 1024); ++ ++ for (level = 1; level <= 9; level++) ++ if ((rc = splat_zlib_test1_check(file, src, dst, chk, level))) ++ break; ++out: ++ if (src) ++ vfree(src); ++ ++ if (dst) ++ vfree(dst); ++ ++ if (chk) ++ vfree(chk); ++ ++ return rc; ++} ++ ++splat_subsystem_t * ++splat_zlib_init(void) ++{ ++ splat_subsystem_t *sub; ++ ++ sub = kmalloc(sizeof(*sub), GFP_KERNEL); ++ if (sub == NULL) ++ return NULL; ++ ++ memset(sub, 0, sizeof(*sub)); ++ strncpy(sub->desc.name, SPLAT_ZLIB_NAME, SPLAT_NAME_SIZE); ++ strncpy(sub->desc.desc, SPLAT_ZLIB_DESC, SPLAT_DESC_SIZE); ++ INIT_LIST_HEAD(&sub->subsystem_list); ++ INIT_LIST_HEAD(&sub->test_list); ++ spin_lock_init(&sub->test_lock); ++ sub->desc.id = SPLAT_SUBSYSTEM_ZLIB; ++ ++ SPLAT_TEST_INIT(sub, SPLAT_ZLIB_TEST1_NAME, SPLAT_ZLIB_TEST1_DESC, ++ SPLAT_ZLIB_TEST1_ID, splat_zlib_test1); ++ ++ return sub; ++} ++ ++void ++splat_zlib_fini(splat_subsystem_t *sub) ++{ ++ ASSERT(sub); ++ ++ SPLAT_TEST_FINI(sub, SPLAT_ZLIB_TEST1_ID); ++ ++ kfree(sub); ++} ++ ++int ++splat_zlib_id(void) { ++ return SPLAT_SUBSYSTEM_ZLIB; ++} +diff -uNr linux-3.2.33-go.orig/spl_config.h linux-3.2.33-go/spl_config.h +--- linux-3.2.33-go.orig/spl_config.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/spl_config.h 2012-11-16 23:22:32.412192839 +0100 +@@ -0,0 +1,289 @@ ++/* spl_config.h. Generated from spl_config.h.in by configure. */ ++/* spl_config.h.in. Generated from configure.ac by autoheader. */ ++ ++/* Atomic types use spinlocks */ ++/* #undef ATOMIC_SPINLOCK */ ++ ++/* Define to 1 to enable basic kmem accounting */ ++#define DEBUG_KMEM 1 ++ ++/* Define to 1 to enable detailed kmem tracking */ ++/* #undef DEBUG_KMEM_TRACKING */ ++ ++/* Define to 1 to enable basic debug logging */ ++#define DEBUG_LOG 1 ++ ++/* invalidate_inodes() wants 2 args */ ++#define HAVE_2ARGS_INVALIDATE_INODES 1 ++ ++/* register_sysctl_table() wants 2 args */ ++/* #undef HAVE_2ARGS_REGISTER_SYSCTL */ ++ ++/* set_fs_pwd() wants 2 args */ ++#define HAVE_2ARGS_SET_FS_PWD 1 ++ ++/* vfs_fsync() wants 2 args */ ++#define HAVE_2ARGS_VFS_FSYNC 1 ++ ++/* vfs_unlink() wants 2 args */ ++#define HAVE_2ARGS_VFS_UNLINK 1 ++ ++/* zlib_deflate_workspacesize() wants 2 args */ ++#define HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE 1 ++ ++/* INIT_WORK wants 3 args */ ++/* #undef HAVE_3ARGS_INIT_WORK */ ++ ++/* on_each_cpu wants 3 args */ ++#define HAVE_3ARGS_ON_EACH_CPU 1 ++ ++/* shrinker callback wants 3 args */ ++/* #undef HAVE_3ARGS_SHRINKER_CALLBACK */ ++ ++/* vfs_rename() wants 4 args */ ++#define HAVE_4ARGS_VFS_RENAME 1 ++ ++/* device_create wants 5 args */ ++#define HAVE_5ARGS_DEVICE_CREATE 1 ++ ++/* proc_handler() wants 5 args */ ++#define HAVE_5ARGS_PROC_HANDLER 1 ++ ++/* kernel defines atomic64_cmpxchg */ ++/* #undef HAVE_ATOMIC64_CMPXCHG */ ++ ++/* kernel defines atomic64_t */ ++#define HAVE_ATOMIC64_T 1 ++ ++/* kernel defines atomic64_xchg */ ++#define HAVE_ATOMIC64_XCHG 1 ++ ++/* class_device_create() is available */ ++/* #undef HAVE_CLASS_DEVICE_CREATE */ ++ ++/* struct cred exists */ ++#define HAVE_CRED_STRUCT 1 ++ ++/* struct ctl_table has ctl_name */ ++/* #undef HAVE_CTL_NAME */ ++ ++/* unnumbered sysctl support exists */ ++/* #undef HAVE_CTL_UNNUMBERED */ ++ ++/* device_create() is available */ ++#define HAVE_DEVICE_CREATE 1 ++ ++/* Define to 1 if you have the header file. 
*/ ++#define HAVE_DLFCN_H 1 ++ ++/* first_online_pgdat() is available */ ++#define HAVE_FIRST_ONLINE_PGDAT 1 ++ ++/* fls64() is available */ ++#define HAVE_FLS64 1 ++ ++/* struct fs_struct uses spinlock_t */ ++#define HAVE_FS_STRUCT_SPINLOCK 1 ++ ++/* get_vmalloc_info() is available */ ++/* #undef HAVE_GET_VMALLOC_INFO */ ++ ++/* get_zone_counts() is available */ ++/* #undef HAVE_GET_ZONE_COUNTS */ ++ ++/* global_page_state() is available */ ++#define HAVE_GLOBAL_PAGE_STATE 1 ++ ++/* groups_search() is available */ ++#define HAVE_GROUPS_SEARCH 1 ++ ++/* init_utsname() is available */ ++#define HAVE_INIT_UTSNAME 1 ++ ++/* struct inode has i_mutex */ ++#define HAVE_INODE_I_MUTEX 1 ++ ++/* truncate_range() inode operation is available */ ++/* #undef HAVE_INODE_TRUNCATE_RANGE */ ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_INTTYPES_H 1 ++ ++/* invalidate_inodes() is available */ ++/* #undef HAVE_INVALIDATE_INODES */ ++ ++/* invalidate_inodes_check() is available */ ++/* #undef HAVE_INVALIDATE_INODES_CHECK */ ++ ++/* kallsyms_lookup_name() is available */ ++#define HAVE_KALLSYMS_LOOKUP_NAME 1 ++ ++/* kern_path_locked() is available */ ++/* #undef HAVE_KERN_PATH_LOCKED */ ++ ++/* kern_path_parent() is available */ ++#define HAVE_KERN_PATH_PARENT_HEADER 1 ++ ++/* kern_path_parent() is available */ ++/* #undef HAVE_KERN_PATH_PARENT_SYMBOL */ ++ ++/* kmalloc_node() is available */ ++#define HAVE_KMALLOC_NODE 1 ++ ++/* kvasprintf() is available */ ++#define HAVE_KVASPRINTF 1 ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_MEMORY_H 1 ++ ++/* monotonic_clock() is available */ ++/* #undef HAVE_MONOTONIC_CLOCK */ ++ ++/* mutex_lock_nested() is available */ ++#define HAVE_MUTEX_LOCK_NESTED 1 ++ ++/* struct mutex has owner */ ++#define HAVE_MUTEX_OWNER 1 ++ ++/* struct mutex owner is a task_struct */ ++#define HAVE_MUTEX_OWNER_TASK_STRUCT 1 ++ ++/* next_online_pgdat() is available */ ++#define HAVE_NEXT_ONLINE_PGDAT 1 ++ ++/* next_zone() is available */ ++#define HAVE_NEXT_ZONE 1 ++ ++/* struct path used in struct nameidata */ ++#define HAVE_PATH_IN_NAMEIDATA 1 ++ ++/* pgdat helpers are available */ ++#define HAVE_PGDAT_HELPERS 1 ++ ++/* pgdat_list is available */ ++/* #undef HAVE_PGDAT_LIST */ ++ ++/* __put_task_struct() is available */ ++#define HAVE_PUT_TASK_STRUCT 1 ++ ++/* set_fs_pwd() is available */ ++#define HAVE_SET_FS_PWD 1 ++ ++/* set_normalized_timespec() is available as export */ ++#define HAVE_SET_NORMALIZED_TIMESPEC_EXPORT 1 ++ ++/* set_normalized_timespec() is available as inline */ ++#define HAVE_SET_NORMALIZED_TIMESPEC_INLINE 1 ++ ++/* set_shrinker() available */ ++/* #undef HAVE_SET_SHRINKER */ ++ ++/* struct shrink_control exists */ ++#define HAVE_SHRINK_CONTROL_STRUCT 1 ++ ++/* shrink_dcache_memory() is available */ ++/* #undef HAVE_SHRINK_DCACHE_MEMORY */ ++ ++/* shrink_icache_memory() is available */ ++/* #undef HAVE_SHRINK_ICACHE_MEMORY */ ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_STDINT_H 1 ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_STDLIB_H 1 ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_STRINGS_H 1 ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_STRING_H 1 ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_SYS_STAT_H 1 ++ ++/* Define to 1 if you have the header file. 
*/ ++#define HAVE_SYS_TYPES_H 1 ++ ++/* task_curr() is available */ ++#define HAVE_TASK_CURR 1 ++ ++/* timespec_sub() is available */ ++#define HAVE_TIMESPEC_SUB 1 ++ ++/* linux/uaccess.h exists */ ++#define HAVE_UACCESS_HEADER 1 ++ ++/* kernel defines uintptr_t */ ++#define HAVE_UINTPTR_T 1 ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_UNISTD_H 1 ++ ++/* user_path_dir() is available */ ++#define HAVE_USER_PATH_DIR 1 ++ ++/* vfs_fsync() is available */ ++#define HAVE_VFS_FSYNC 1 ++ ++/* Page state NR_ACTIVE is available */ ++/* #undef HAVE_ZONE_STAT_ITEM_NR_ACTIVE */ ++ ++/* Page state NR_ACTIVE_ANON is available */ ++#define HAVE_ZONE_STAT_ITEM_NR_ACTIVE_ANON 1 ++ ++/* Page state NR_ACTIVE_FILE is available */ ++#define HAVE_ZONE_STAT_ITEM_NR_ACTIVE_FILE 1 ++ ++/* Page state NR_FREE_PAGES is available */ ++#define HAVE_ZONE_STAT_ITEM_NR_FREE_PAGES 1 ++ ++/* Page state NR_INACTIVE is available */ ++/* #undef HAVE_ZONE_STAT_ITEM_NR_INACTIVE */ ++ ++/* Page state NR_INACTIVE_ANON is available */ ++#define HAVE_ZONE_STAT_ITEM_NR_INACTIVE_ANON 1 ++ ++/* Page state NR_INACTIVE_FILE is available */ ++#define HAVE_ZONE_STAT_ITEM_NR_INACTIVE_FILE 1 ++ ++/* Define to the sub-directory in which libtool stores uninstalled libraries. ++ */ ++#define LT_OBJDIR ".libs/" ++ ++/* get_zone_counts() is needed */ ++/* #undef NEED_GET_ZONE_COUNTS */ ++ ++/* rwsem_is_locked() acquires sem->wait_lock */ ++/* #undef RWSEM_IS_LOCKED_TAKES_WAIT_LOCK */ ++ ++/* struct rw_semaphore member wait_lock is raw_spinlock_t */ ++#define RWSEM_SPINLOCK_IS_RAW 1 ++ ++/* Define the project alias string. */ ++#define SPL_META_ALIAS "spl-0.6.0-rc12" ++ ++/* Define the project author. */ ++/* #undef SPL_META_AUTHOR */ ++ ++/* Define the project release date. */ ++/* #undef SPL_META_DATA */ ++ ++/* Define the libtool library 'age' version information. */ ++/* #undef SPL_META_LT_AGE */ ++ ++/* Define the libtool library 'current' version information. */ ++/* #undef SPL_META_LT_CURRENT */ ++ ++/* Define the libtool library 'revision' version information. */ ++/* #undef SPL_META_LT_REVISION */ ++ ++/* Define the project name. */ ++#define SPL_META_NAME "spl" ++ ++/* Define the project release. */ ++#define SPL_META_RELEASE "rc12" ++ ++/* Define the project version. */ ++#define SPL_META_VERSION "0.6.0" ++ +diff -uNr linux-3.2.33-go.orig/zfs_config.h linux-3.2.33-go/zfs_config.h +--- linux-3.2.33-go.orig/zfs_config.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/zfs_config.h 2012-11-16 23:25:34.356039255 +0100 +@@ -0,0 +1,317 @@ ++/* zfs_config.h. Generated from zfs_config.h.in by configure. */ ++/* zfs_config.h.in. Generated from configure.ac by autoheader. 
*/ ++ ++/* Define to 1 to enabled dmu tx validation */ ++/* #undef DEBUG_DMU_TX */ ++ ++/* invalidate_bdev() wants 1 arg */ ++#define HAVE_1ARG_INVALIDATE_BDEV 1 ++ ++/* bio_end_io_t wants 2 args */ ++#define HAVE_2ARGS_BIO_END_IO_T 1 ++ ++/* blkdev_get() wants 3 args */ ++#define HAVE_3ARG_BLKDEV_GET 1 ++ ++/* sget() wants 5 args */ ++#define HAVE_5ARG_SGET 1 ++ ++/* security_inode_init_security wants 6 args */ ++/* #undef HAVE_6ARGS_SECURITY_INODE_INIT_SECURITY */ ++ ++/* dops->automount() exists */ ++#define HAVE_AUTOMOUNT 1 ++ ++/* struct block_device_operations use bdevs */ ++#define HAVE_BDEV_BLOCK_DEVICE_OPERATIONS 1 ++ ++/* bdev_logical_block_size() is available */ ++#define HAVE_BDEV_LOGICAL_BLOCK_SIZE 1 ++ ++/* struct super_block has s_bdi */ ++#define HAVE_BDI 1 ++ ++/* bdi_setup_and_register() is available */ ++#define HAVE_BDI_SETUP_AND_REGISTER 1 ++ ++/* bio_empy_barrier() is defined */ ++/* #undef HAVE_BIO_EMPTY_BARRIER */ ++ ++/* REQ_FAILFAST_MASK is defined */ ++#define HAVE_BIO_REQ_FAILFAST_MASK 1 ++ ++/* BIO_RW_FAILFAST is defined */ ++/* #undef HAVE_BIO_RW_FAILFAST */ ++ ++/* BIO_RW_FAILFAST_* are defined */ ++/* #undef HAVE_BIO_RW_FAILFAST_DTD */ ++ ++/* BIO_RW_SYNC is defined */ ++/* #undef HAVE_BIO_RW_SYNC */ ++ ++/* BIO_RW_SYNCIO is defined */ ++/* #undef HAVE_BIO_RW_SYNCIO */ ++ ++/* blkdev_get_by_path() is available */ ++#define HAVE_BLKDEV_GET_BY_PATH 1 ++ ++/* blk_end_request() is available */ ++#define HAVE_BLK_END_REQUEST 1 ++ ++/* blk_end_request() is GPL-only */ ++/* #undef HAVE_BLK_END_REQUEST_GPL_ONLY */ ++ ++/* blk_fetch_request() is available */ ++#define HAVE_BLK_FETCH_REQUEST 1 ++ ++/* blk_queue_discard() is available */ ++#define HAVE_BLK_QUEUE_DISCARD 1 ++ ++/* blk_queue_flush() is available */ ++#define HAVE_BLK_QUEUE_FLUSH 1 ++ ++/* blk_queue_flush() is GPL-only */ ++/* #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */ ++ ++/* blk_queue_io_opt() is available */ ++#define HAVE_BLK_QUEUE_IO_OPT 1 ++ ++/* blk_queue_max_hw_sectors() is available */ ++#define HAVE_BLK_QUEUE_MAX_HW_SECTORS 1 ++ ++/* blk_queue_max_segments() is available */ ++#define HAVE_BLK_QUEUE_MAX_SEGMENTS 1 ++ ++/* blk_queue_nonrot() is available */ ++#define HAVE_BLK_QUEUE_NONROT 1 ++ ++/* blk_queue_physical_block_size() is available */ ++#define HAVE_BLK_QUEUE_PHYSICAL_BLOCK_SIZE 1 ++ ++/* blk_requeue_request() is available */ ++#define HAVE_BLK_REQUEUE_REQUEST 1 ++ ++/* blk_rq_bytes() is available */ ++#define HAVE_BLK_RQ_BYTES 1 ++ ++/* blk_rq_bytes() is GPL-only */ ++/* #undef HAVE_BLK_RQ_BYTES_GPL_ONLY */ ++ ++/* blk_rq_pos() is available */ ++#define HAVE_BLK_RQ_POS 1 ++ ++/* blk_rq_sectors() is available */ ++#define HAVE_BLK_RQ_SECTORS 1 ++ ++/* security_inode_init_security wants callback */ ++#define HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY 1 ++ ++/* check_disk_size_change() is available */ ++#define HAVE_CHECK_DISK_SIZE_CHANGE 1 ++ ++/* clear_inode() is available */ ++#define HAVE_CLEAR_INODE 1 ++ ++/* eops->commit_metadata() exists */ ++#define HAVE_COMMIT_METADATA 1 ++ ++/* super_block uses const struct xattr_hander */ ++#define HAVE_CONST_XATTR_HANDLER 1 ++ ++/* iops->create() operation takes nameidata */ ++/* #undef HAVE_CREATE_NAMEIDATA */ ++ ++/* xattr_handler->get() wants dentry */ ++#define HAVE_DENTRY_XATTR_GET 1 ++ ++/* xattr_handler->set() wants dentry */ ++#define HAVE_DENTRY_XATTR_SET 1 ++ ++/* ql->discard_granularity is available */ ++#define HAVE_DISCARD_GRANULARITY 1 ++ ++/* Define to 1 if you have the header file. 
*/ ++#define HAVE_DLFCN_H 1 ++ ++/* d_make_root() is available */ ++#define HAVE_D_MAKE_ROOT 1 ++ ++/* d_obtain_alias() is available */ ++#define HAVE_D_OBTAIN_ALIAS 1 ++ ++/* elevator_change() is available */ ++#define HAVE_ELEVATOR_CHANGE 1 ++ ++/* eops->encode_fh() wants child and parent inodes */ ++#define HAVE_ENCODE_FH_WITH_INODE 1 ++ ++/* sops->evict_inode() exists */ ++#define HAVE_EVICT_INODE 1 ++ ++/* fops->fallocate() exists */ ++#define HAVE_FILE_FALLOCATE 1 ++ ++/* kernel defines fmode_t */ ++#define HAVE_FMODE_T 1 ++ ++/* sops->free_cached_objects() exists */ ++#define HAVE_FREE_CACHED_OBJECTS 1 ++ ++/* fops->fsync() with range */ ++#define HAVE_FSYNC_RANGE 1 ++ ++/* fops->fsync() without dentry */ ++/* #undef HAVE_FSYNC_WITHOUT_DENTRY */ ++ ++/* fops->fsync() with dentry */ ++/* #undef HAVE_FSYNC_WITH_DENTRY */ ++ ++/* blk_disk_ro() is available */ ++#define HAVE_GET_DISK_RO 1 ++ ++/* get_gendisk() is available */ ++#define HAVE_GET_GENDISK 1 ++ ++/* Define to 1 if licensed under the GPL */ ++/* #undef HAVE_GPL_ONLY_SYMBOLS */ ++ ++/* fops->fallocate() exists */ ++/* #undef HAVE_INODE_FALLOCATE */ ++ ++/* iops->truncate_range() exists */ ++/* #undef HAVE_INODE_TRUNCATE_RANGE */ ++ ++/* insert_inode_locked() is available */ ++#define HAVE_INSERT_INODE_LOCKED 1 ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_INTTYPES_H 1 ++ ++/* result=stropts.h Define to 1 if ioctl() defined in */ ++/* #undef HAVE_IOCTL_IN_STROPTS_H */ ++ ++/* Define to 1 if ioctl() defined in */ ++#define HAVE_IOCTL_IN_SYS_IOCTL_H 1 ++ ++/* Define to 1 if ioctl() defined in */ ++/* #undef HAVE_IOCTL_IN_UNISTD_H */ ++ ++/* kernel defines KOBJ_NAME_LEN */ ++/* #undef HAVE_KOBJ_NAME_LEN */ ++ ++/* Define if you have libblkid */ ++/* #undef HAVE_LIBBLKID */ ++ ++/* Define if you have selinux */ ++/* #undef HAVE_LIBSELINUX */ ++ ++/* Define if you have libuuid */ ++#define HAVE_LIBUUID 1 ++ ++/* Define to 1 if you have the `z' library (-lz). */ ++#define HAVE_LIBZ 1 ++ ++/* iops->lookup() operation takes nameidata */ ++/* #undef HAVE_LOOKUP_NAMEIDATA */ ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_MEMORY_H 1 ++ ++/* iops->create()/mkdir()/mknod() take umode_t */ ++#define HAVE_MKDIR_UMODE_T 1 ++ ++/* mount_nodev() is available */ ++#define HAVE_MOUNT_NODEV 1 ++ ++/* sops->nr_cached_objects() exists */ ++#define HAVE_NR_CACHED_OBJECTS 1 ++ ++/* open_bdev_exclusive() is available */ ++/* #undef HAVE_OPEN_BDEV_EXCLUSIVE */ ++ ++/* REQ_SYNC is defined */ ++#define HAVE_REQ_SYNC 1 ++ ++/* rq_for_each_segment() is available */ ++#define HAVE_RQ_FOR_EACH_SEGMENT 1 ++ ++/* rq_is_sync() is available */ ++#define HAVE_RQ_IS_SYNC 1 ++ ++/* set_nlink() is available */ ++#define HAVE_SET_NLINK 1 ++ ++/* sops->show_options() with dentry */ ++#define HAVE_SHOW_OPTIONS_WITH_DENTRY 1 ++ ++/* struct super_block has s_shrink */ ++#define HAVE_SHRINK 1 ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_STDINT_H 1 ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_STDLIB_H 1 ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_STRINGS_H 1 ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_STRING_H 1 ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_SYS_STAT_H 1 ++ ++/* Define to 1 if you have the header file. */ ++#define HAVE_SYS_TYPES_H 1 ++ ++/* truncate_setsize() is available */ ++#define HAVE_TRUNCATE_SETSIZE 1 ++ ++/* Define to 1 if you have the header file. 
*/ ++#define HAVE_UNISTD_H 1 ++ ++/* Define if you have zlib */ ++#define HAVE_ZLIB 1 ++ ++/* Define to the sub-directory in which libtool stores uninstalled libraries. ++ */ ++#define LT_OBJDIR ".libs/" ++ ++/* Define to 1 if NPTL threading implementation includes guard area in stack ++ allocation */ ++/* #undef NPTL_GUARD_WITHIN_STACK */ ++ ++/* zfs debugging enabled */ ++/* #undef ZFS_DEBUG */ ++ ++/* Define the project alias string. */ ++#define ZFS_META_ALIAS "zfs-0.6.0-rc12" ++ ++/* Define the project author. */ ++#define ZFS_META_AUTHOR "Sun Microsystems/Oracle, Lawrence Livermore National Laboratory" ++ ++/* Define the project release date. */ ++/* #undef ZFS_META_DATA */ ++ ++/* Define the project license. */ ++#define ZFS_META_LICENSE "CDDL" ++ ++/* Define the libtool library 'age' version information. */ ++/* #undef ZFS_META_LT_AGE */ ++ ++/* Define the libtool library 'current' version information. */ ++/* #undef ZFS_META_LT_CURRENT */ ++ ++/* Define the libtool library 'revision' version information. */ ++/* #undef ZFS_META_LT_REVISION */ ++ ++/* Define the project name. */ ++#define ZFS_META_NAME "zfs" ++ ++/* Define the project release. */ ++#define ZFS_META_RELEASE "rc12" ++ ++/* Define the project version. */ ++#define ZFS_META_VERSION "0.6.0" ++ diff --git a/3.2.34/lschlv2.patch b/3.2.34/lschlv2.patch new file mode 100644 index 0000000..40ef6be --- /dev/null +++ b/3.2.34/lschlv2.patch @@ -0,0 +1,256 @@ +--- a/arch/arm/mach-kirkwood/include/mach/system.h ++++ b/arch/arm/mach-kirkwood/include/mach/system.h +@@ -9,6 +9,8 @@ + #ifndef __ASM_ARCH_SYSTEM_H + #define __ASM_ARCH_SYSTEM_H + ++#include ++#include + #include + + static inline void arch_idle(void) +--- a/arch/arm/mach-kirkwood/Kconfig ++++ b/arch/arm/mach-kirkwood/Kconfig +@@ -87,6 +87,12 @@ + Say 'Y' here if you want your kernel to support the + HP t5325 Thin Client. + ++config MACH_LINKSTATION_CHLV2 ++ bool "Buffalo LS-CHLv2 Series" ++ help ++ Say 'Y' here if you want your kernel to support the ++ Buffalo LS-CHLv2 Series. ++ + endmenu + + endif +--- a/arch/arm/mach-kirkwood/lschlv2-setup.c ++++ b/arch/arm/mach-kirkwood/lschlv2-setup.c +@@ -0,0 +1,210 @@ ++/* ++ * arch/arm/mach-kirkwood/lschlv2-setup.c ++ * ++ * Buffalo LS Kirkwood Series Setup ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * License version 2. This program is licensed "as is" without any ++ * warranty of any kind, whether express or implied. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "include/mach/system.h" ++#include ++#include "common.h" ++#include "mpp.h" ++ ++/***************************************************************************** ++ * 512KB SPI Flash on BOOT Device ++ ****************************************************************************/ ++static struct mtd_partition lschlv2_partitions[] = { ++ { ++ .name = "u-boot", ++ .offset = 0x00000, ++ .size = 0x70000, ++ .mask_flags = MTD_WRITEABLE, ++ }, ++ { ++ .name = "u-boot env", ++ .offset = MTDPART_OFS_APPEND, ++ .size = 0x10000, ++ } ++}; ++ ++static struct flash_platform_data lschlv2_spi_slave_data = { ++ .type = "m25p40", ++ .parts = lschlv2_partitions, ++ .nr_parts = ARRAY_SIZE(lschlv2_partitions), ++}; ++ ++static struct spi_board_info __initdata lschlv2_spi_slave_info[] = { ++ { ++ .modalias = "m25p80", ++ .platform_data = &lschlv2_spi_slave_data, ++ .irq = -1, ++ .max_speed_hz = 20000000, ++ .bus_num = 0, ++ .chip_select = 0, ++ } ++}; ++ ++static struct mv643xx_eth_platform_data lschlv2_ge00_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(0), ++}; ++ ++static struct mv643xx_eth_platform_data lschlv2_ge01_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(8), ++}; ++ ++static unsigned int lschlv2_mpp_config[] __initdata = { ++ MPP10_GPO, /* HDD Power */ ++ MPP11_GPIO, /* USB Vbus Power */ ++ MPP18_GPO, /* FAN High on:0, off:1 */ ++ MPP19_GPO, /* FAN Low on:0, off:1 */ ++ MPP36_GPIO, /* FUNC LED */ ++ MPP37_GPIO, /* ALARM LED */ ++ MPP38_GPIO, /* INFO LED */ ++ MPP39_GPIO, /* POWER LED */ ++ MPP40_GPIO, /* FAN LOCK */ ++ MPP41_GPIO, /* FUNC SW */ ++ MPP42_GPIO, /* POWER SW */ ++ MPP43_GPIO, /* POWER AUTO SW */ ++ MPP48_GPIO, /* FUNC RED LED */ ++ MPP49_GPIO, /* UART EN */ ++ 0 ++}; ++ ++static struct mv_sata_platform_data lschlv2_sata_data = { ++ .n_ports = 1, ++}; ++ ++static struct gpio_led lschlv2_led_pins[] = { ++ { ++ .name = "func", ++ .gpio = 36, ++ .active_low = 1, ++ }, ++ { ++ .name = "alarm", ++ .gpio = 37, ++ .active_low = 1, ++ }, ++ { ++ .name = "info", ++ .gpio = 38, ++ .active_low = 1, ++ }, ++ { ++ .name = "power", ++ .gpio = 39, ++ .default_trigger = "default-on", ++ .active_low = 1, ++ }, ++ { ++ .name = "func2", ++ .gpio = 48, ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_led_platform_data lschlv2_led_data = { ++ .leds = lschlv2_led_pins, ++ .num_leds = ARRAY_SIZE(lschlv2_led_pins), ++}; ++ ++static struct platform_device lschlv2_leds = { ++ .name = "leds-gpio", ++ .id = -1, ++ .dev = { ++ .platform_data = &lschlv2_led_data, ++ } ++}; ++ ++#define LSCHLv2_GPIO_USB_VBUS_EN 11 ++#define LSCHLv2_GPIO_KEY_FUNC 41 ++ ++static struct gpio_keys_button lschlv2_buttons[] = { ++ { ++ .code = KEY_OPTION, ++ .gpio = LSCHLv2_GPIO_KEY_FUNC, ++ .desc = "Function Button", ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_keys_platform_data lschlv2_button_data = { ++ .buttons = lschlv2_buttons, ++ .nbuttons = ARRAY_SIZE(lschlv2_buttons), ++}; ++ ++static struct platform_device lschlv2_button_device = { ++ .name = "gpio-keys", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lschlv2_button_data, ++ }, ++}; ++ ++static void lschlv2_power_off(void) ++{ ++ arch_reset(0, NULL); ++} ++ ++static void __init lschlv2_init(void) ++{ ++ /* ++ * Basic setup. Needs to be called early. 
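++ * kirkwood_init() registers the common Kirkwood SoC devices and ++ * kirkwood_mpp_conf() applies the board's pin-multiplexing table above before ++ * the GPIO-backed USB power enable, LEDs, buttons and power-off hook are set up.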
++ */ ++ kirkwood_init(); ++ kirkwood_mpp_conf(lschlv2_mpp_config); ++ ++ kirkwood_uart0_init(); ++ ++ if (gpio_request(LSCHLv2_GPIO_USB_VBUS_EN, "USB Power Enable") != 0 || ++ gpio_direction_output(LSCHLv2_GPIO_USB_VBUS_EN, 1) != 0) ++ printk(KERN_ERR "can't set up USB Power Enable\n"); ++ kirkwood_ehci_init(); ++ ++ kirkwood_ge00_init(&lschlv2_ge00_data); ++ kirkwood_ge01_init(&lschlv2_ge01_data); ++ ++ kirkwood_sata_init(&lschlv2_sata_data); ++ ++ kirkwood_spi_init(); ++ ++ platform_device_register(&lschlv2_leds); ++ platform_device_register(&lschlv2_button_device); ++ ++ spi_register_board_info(lschlv2_spi_slave_info, ++ ARRAY_SIZE(lschlv2_spi_slave_info)); ++ ++ /* register power-off method */ ++ pm_power_off = lschlv2_power_off; ++ ++ pr_info("%s: finished\n", __func__); ++} ++ ++ ++ ++MACHINE_START(LINKSTATION_CHLV2, "Buffalo Linkstation LS-CHLv2") ++ .atag_offset = 0x100, ++ .init_machine = lschlv2_init, ++ .map_io = kirkwood_map_io, ++ .init_early = kirkwood_init_early, ++ .init_irq = kirkwood_init_irq, ++ .timer = &kirkwood_timer, ++MACHINE_END +--- a/arch/arm/mach-kirkwood/Makefile ++++ b/arch/arm/mach-kirkwood/Makefile +@@ -20,3 +20,4 @@ + obj-$(CONFIG_MACH_T5325) += t5325-setup.o ++obj-$(CONFIG_MACH_LINKSTATION_CHLV2) += lschlv2-setup.o + + obj-$(CONFIG_CPU_IDLE) += cpuidle.o +--- a/arch/arm/mach-kirkwood/common.c ++++ b/arch/arm/mach-kirkwood/common.c +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + #include "common.h" + + /***************************************************************************** diff --git a/3.2.34/net-netfilter-IFWLOG-2.6.35-buildfix.patch b/3.2.34/net-netfilter-IFWLOG-2.6.35-buildfix.patch new file mode 100644 index 0000000..99d4d06 --- /dev/null +++ b/3.2.34/net-netfilter-IFWLOG-2.6.35-buildfix.patch @@ -0,0 +1,32 @@ +--- linux-2.6.35-rc6-git-mnb0.1/net/ipv4/netfilter/ipt_IFWLOG.c.orig 2010-07-30 21:17:30.000000000 +0300 ++++ linux-2.6.35-rc6-git-mnb0.1/net/ipv4/netfilter/ipt_IFWLOG.c 2010-07-31 13:46:33.834611944 +0300 +@@ -135,7 +135,7 @@ static void ipt_IFWLOG_packet(const stru + } + + static unsigned int ipt_IFWLOG_target(struct sk_buff *skb, +- const struct xt_target_param *target_param) ++ const struct xt_action_param *target_param) + { + const struct ipt_IFWLOG_info *info = target_param->targinfo; + +@@ -144,17 +144,17 @@ static unsigned int ipt_IFWLOG_target(st + return IPT_CONTINUE; + } + +-static bool ipt_IFWLOG_checkentry(const struct xt_tgchk_param *tgchk_param) ++static int ipt_IFWLOG_checkentry(const struct xt_tgchk_param *tgchk_param) + { + const struct ipt_IFWLOG_info *info = tgchk_param->targinfo; + + if (info->prefix[sizeof(info->prefix)-1] != '\0') { + DEBUGP("IFWLOG: prefix term %i\n", + info->prefix[sizeof(info->prefix)-1]); +- return false; ++ return -EINVAL; + } + +- return true; ++ return 0; + } + + static struct xt_target ipt_IFWLOG = { diff --git a/3.2.34/net-netfilter-IFWLOG-2.6.37-buildfix.patch b/3.2.34/net-netfilter-IFWLOG-2.6.37-buildfix.patch new file mode 100644 index 0000000..0ae95aa --- /dev/null +++ b/3.2.34/net-netfilter-IFWLOG-2.6.37-buildfix.patch @@ -0,0 +1,15 @@ + + net/ipv4/netfilter/ipt_IFWLOG.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- linux-2.6.37-rc3-git1-tmb0.3/net/ipv4/netfilter/ipt_IFWLOG.c.orig 2010-11-24 21:58:36.000000000 +0200 ++++ linux-2.6.37-rc3-git1-tmb0.3/net/ipv4/netfilter/ipt_IFWLOG.c 2010-11-25 13:08:55.719379646 +0200 +@@ -141,7 +141,7 @@ static unsigned int ipt_IFWLOG_target(st + + ipt_IFWLOG_packet(skb, target_param->in, target_param->out, info); + +- 
return IPT_CONTINUE; ++ return XT_CONTINUE; + } + + static int ipt_IFWLOG_checkentry(const struct xt_tgchk_param *tgchk_param) diff --git a/3.2.34/net-netfilter-IFWLOG-mdv.patch b/3.2.34/net-netfilter-IFWLOG-mdv.patch new file mode 100644 index 0000000..e5b9c92 --- /dev/null +++ b/3.2.34/net-netfilter-IFWLOG-mdv.patch @@ -0,0 +1,264 @@ +ipt_IFWLOG: Mandriva changes + +This patch holds all the Mandriva changes done in ipt_IFWLOG +netfilter module. + +This work is mostly done by Thomas Backlund, Herton R. Krzesinski +and Luiz Fernando N. Capitulino. + +Signed-off-by: Luiz Fernando N. Capitulino +Signed-off-by: Herton Ronaldo Krzesinski + +--- + include/linux/netfilter_ipv4/Kbuild | 1 + include/linux/netfilter_ipv4/ipt_IFWLOG.h | 23 +++++- + net/ipv4/netfilter/ipt_IFWLOG.c | 108 +++++++++++++++--------------- + 3 files changed, 77 insertions(+), 55 deletions(-) + +diff -p -up linux-2.6.28/include/linux/netfilter_ipv4/ipt_IFWLOG.h.orig linux-2.6.28/include/linux/netfilter_ipv4/ipt_IFWLOG.h +--- linux-2.6.28/include/linux/netfilter_ipv4/ipt_IFWLOG.h.orig 2008-12-12 10:55:07.000000000 -0500 ++++ linux-2.6.28/include/linux/netfilter_ipv4/ipt_IFWLOG.h 2008-12-12 10:56:30.000000000 -0500 +@@ -1,10 +1,25 @@ +-#ifndef _IPT_IFWLOG_H +-#define _IPT_IFWLOG_H ++#ifndef _LINUX_IPT_IFWLOG_H ++#define _LINUX_IPT_IFWLOG_H + + #ifndef NETLINK_IFWLOG +-#define NETLINK_IFWLOG 19 ++#define NETLINK_IFWLOG 20 + #endif + ++#ifndef __KERNEL__ ++/* Multicast groups - backwards compatiblility for userspace */ ++#define IFWLOG_NLGRP_NONE 0x00000000 ++#define IFWLOG_NLGRP_DEF 0x00000001 /* default message group */ ++#endif ++ ++enum { ++ IFWLOGNLGRP_NONE, ++#define IFWLOGNLGRP_NONE IFWLOGNLGRP_NONE ++ IFWLOGNLGRP_DEF, ++#define IFWLOGNLGRP_DEF IFWLOGNLGRP_DEF ++ __IFWLOGNLGRP_MAX ++}; ++#define IFWLOGNLGRP_MAX (__IFWLOGNLGRP_MAX - 1) ++ + #define PREFSIZ 32 + + struct nl_msg { /* Netlink message */ +@@ -23,4 +38,4 @@ struct ipt_IFWLOG_info { + char prefix[PREFSIZ]; + }; + +-#endif /* _IPT_IFWLOG_H */ ++#endif /* _LINUX_IPT_IFWLOG_H */ +diff -p -up linux-2.6.28/net/ipv4/netfilter/ipt_IFWLOG.c.orig linux-2.6.28/net/ipv4/netfilter/ipt_IFWLOG.c +--- linux-2.6.28/net/ipv4/netfilter/ipt_IFWLOG.c.orig 2008-12-12 10:55:07.000000000 -0500 ++++ linux-2.6.28/net/ipv4/netfilter/ipt_IFWLOG.c 2008-12-12 10:57:16.000000000 -0500 +@@ -4,6 +4,14 @@ + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
++ * ++ * 2007-10-10 Thomas Backlund : build fixes for 2.6.22.9 ++ * 2007-11-11 Herton Krzesinski : build fixes for 2.6.24-rc ++ * 2007-12-03 Luiz Capitulino : v1.1 ++ * - Better multicast group usage ++ * - Coding style fixes ++ * - Do not return -EINVAL by default in ipt_ifwlog_init() ++ * - Minor refinements + */ + + #include +@@ -19,12 +27,10 @@ + #include + + #include ++#include + #include + #include + +-MODULE_LICENSE("GPL"); +-MODULE_AUTHOR("Samir Bellabes "); +-MODULE_DESCRIPTION("Interactive firewall logging and module"); + + #if 0 + #define DEBUGP PRINTR +@@ -36,44 +42,41 @@ MODULE_DESCRIPTION("Interactive firewall + + static struct sock *nl; + +-#define GROUP 10 +- + /* send struct to userspace */ +-static void send_packet(struct nl_msg msg) ++static void send_packet(const struct nl_msg *msg) + { + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; ++ unsigned int size; + +- skb = alloc_skb(NLMSG_SPACE(sizeof(struct nl_msg)), GFP_ATOMIC); ++ size = NLMSG_SPACE(sizeof(*msg)); ++ skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + PRINTR(KERN_WARNING "IFWLOG: OOM can't allocate skb\n"); +- return ; ++ return; + } + +- nlh = NLMSG_PUT(skb, 0, 0, 0, sizeof(struct nl_msg) - sizeof(*nlh)); ++ nlh = NLMSG_PUT(skb, 0, 0, 0, size - sizeof(*nlh)); + +- memcpy(NLMSG_DATA(nlh), (const void*)&msg, sizeof(struct nl_msg)); ++ memcpy(NLMSG_DATA(nlh), (const void *) msg, sizeof(*msg)); + + NETLINK_CB(skb).pid = 0; /* from kernel */ +- NETLINK_CB(skb).dst_pid = 0; /* multicast */ +- NETLINK_CB(skb).dst_group = 10; ++ NETLINK_CB(skb).dst_group = IFWLOGNLGRP_DEF; + + if (nl) { + DEBUGP(KERN_WARNING + "IFWLOG: nlmsg_len=%ld\nnlmsg_type=%d nlmsg_flags=%d\nnlmsg_seq=%ld nlmsg_pid = %ld\n", + (long)nlh->nlmsg_len, nlh->nlmsg_type, nlh->nlmsg_flags, + (long)nlh->nlmsg_seq, (long)nlh->nlmsg_pid); +- DEBUGP(KERN_WARNING "prefix : %s\n", msg.prefix); ++ DEBUGP(KERN_WARNING "prefix : %s\n", msg->prefix); + +- netlink_broadcast(nl, skb, 0, 10, GFP_ATOMIC); +- return ; ++ netlink_broadcast(nl, skb, 0, IFWLOGNLGRP_DEF, GFP_ATOMIC); ++ return; + } + +- nlmsg_failure: +- if (skb) +- kfree_skb(skb); +- PRINTR(KERN_WARNING "IFWLOG: Error sending netlink packet\n"); +- return ; ++nlmsg_failure: ++ kfree_skb(skb); ++ PRINTR(KERN_WARNING "IFWLOG: Error sending netlink packet\n"); + } + + /* fill struct for userspace */ +@@ -128,73 +131,76 @@ static void ipt_IFWLOG_packet(const stru + do_gettimeofday((struct timeval *)&tv); + msg.timestamp_sec = tv.tv_sec; + +- send_packet(msg); ++ send_packet(&msg); + } + +-static unsigned int ipt_IFWLOG_target(struct sk_buff **pskb, +- const struct net_device *in, +- const struct net_device *out, +- unsigned int hooknum, +- const void *targinfo, +- void *userinfo) ++static unsigned int ipt_IFWLOG_target(struct sk_buff *skb, ++ const struct xt_target_param *target_param) + { +- const struct ipt_IFWLOG_info *info = targinfo; ++ const struct ipt_IFWLOG_info *info = target_param->targinfo; + +- ipt_IFWLOG_packet(*pskb, in, out, info); ++ ipt_IFWLOG_packet(skb, target_param->in, target_param->out, info); + + return IPT_CONTINUE; + } + +-static int ipt_IFWLOG_checkentry(const char *tablename, +- const struct ipt_entry *e, +- void *targinfo, +- unsigned int targinfosize, +- unsigned int hook_mask) ++static bool ipt_IFWLOG_checkentry(const struct xt_tgchk_param *tgchk_param) + { +- const struct ipt_IFWLOG_info *info = targinfo; ++ const struct ipt_IFWLOG_info *info = tgchk_param->targinfo; + + if (info->prefix[sizeof(info->prefix)-1] != '\0') { + DEBUGP("IFWLOG: prefix term %i\n", + 
info->prefix[sizeof(info->prefix)-1]); +- return 0; ++ return false; + } + +- return 1; ++ return true; + } + +-static struct ipt_target ipt_IFWLOG = { ++static struct xt_target ipt_IFWLOG = { + .name = "IFWLOG", ++ .family = AF_INET, + .target = ipt_IFWLOG_target, + .targetsize = sizeof(struct ipt_IFWLOG_info), + .checkentry = ipt_IFWLOG_checkentry, + .me = THIS_MODULE, + }; + +-static int __init init(void) ++static int __init ipt_ifwlog_init(void) + { +- nl = (struct sock*) netlink_kernel_create(NETLINK_IFWLOG, GROUP, NULL, THIS_MODULE); +- if (!nl) { +- PRINTR(KERN_WARNING "IFWLOG: cannot create netlink socket\n"); +- return -EINVAL; +- } ++ int err; + +- if (ipt_register_target(&ipt_IFWLOG)) { ++ nl = netlink_kernel_create(&init_net, NETLINK_IFWLOG, IFWLOGNLGRP_MAX, ++ NULL, NULL, THIS_MODULE); ++ if (!nl) { ++ PRINTR(KERN_WARNING "IFWLOG: cannot create netlink socket\n"); ++ return -ENOMEM; ++ } ++ ++ err = xt_register_target(&ipt_IFWLOG); ++ if (err) { + if (nl && nl->sk_socket) + sock_release(nl->sk_socket); +- return -EINVAL; ++ return err; + } + + PRINTR(KERN_INFO "IFWLOG: register target\n"); + return 0; + } + +-static void __exit fini(void) ++static void __exit ipt_ifwlog_fini(void) + { + if (nl && nl->sk_socket) +- sock_release(nl->sk_socket); ++ sock_release(nl->sk_socket); + PRINTR(KERN_INFO "IFWLOG: unregister target\n"); +- ipt_unregister_target(&ipt_IFWLOG); ++ xt_unregister_target(&ipt_IFWLOG); + } + +-module_init(init); +-module_exit(fini); ++module_init(ipt_ifwlog_init); ++module_exit(ipt_ifwlog_fini); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Samir Bellabes "); ++MODULE_AUTHOR("Luiz Capitulino "); ++MODULE_DESCRIPTION("Interactive firewall logging and module"); ++MODULE_VERSION("v1.1"); +--- linux/include/linux/netfilter_ipv4/Kbuild.net-netfilter-IFWLOG-mdv.orig 2012-05-21 01:29:13.000000000 +0300 ++++ linux/include/linux/netfilter_ipv4/Kbuild 2012-05-26 01:27:24.743139430 +0300 +@@ -2,6 +2,7 @@ header-y += ip_queue.h + header-y += ip_tables.h + header-y += ipt_CLUSTERIP.h + header-y += ipt_ECN.h ++header-y += ipt_IFWLOG.h + header-y += ipt_LOG.h + header-y += ipt_REJECT.h + header-y += ipt_TTL.h diff --git a/3.2.34/net-netfilter-IFWLOG.patch b/3.2.34/net-netfilter-IFWLOG.patch new file mode 100644 index 0000000..6efe89a --- /dev/null +++ b/3.2.34/net-netfilter-IFWLOG.patch @@ -0,0 +1,269 @@ +--- + include/linux/netfilter_ipv4/ipt_IFWLOG.h | 26 +++ + net/ipv4/netfilter/Kconfig | 11 + + net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_IFWLOG.c | 200 ++++++++++++++++++++++++++++++ + 4 files changed, 238 insertions(+) + +--- /dev/null ++++ b/net/ipv4/netfilter/ipt_IFWLOG.c +@@ -0,0 +1,200 @@ ++/* Interactive Firewall for Mandriva ++ * Samir Bellabes ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Samir Bellabes "); ++MODULE_DESCRIPTION("Interactive firewall logging and module"); ++ ++#if 0 ++#define DEBUGP PRINTR ++#else ++#define DEBUGP(format, args...) ++#endif ++ ++#define PRINTR(format, args...) 
do { if(net_ratelimit()) printk(format, ##args); } while(0) ++ ++static struct sock *nl; ++ ++#define GROUP 10 ++ ++/* send struct to userspace */ ++static void send_packet(struct nl_msg msg) ++{ ++ struct sk_buff *skb = NULL; ++ struct nlmsghdr *nlh; ++ ++ skb = alloc_skb(NLMSG_SPACE(sizeof(struct nl_msg)), GFP_ATOMIC); ++ if (!skb) { ++ PRINTR(KERN_WARNING "IFWLOG: OOM can't allocate skb\n"); ++ return ; ++ } ++ ++ nlh = NLMSG_PUT(skb, 0, 0, 0, sizeof(struct nl_msg) - sizeof(*nlh)); ++ ++ memcpy(NLMSG_DATA(nlh), (const void*)&msg, sizeof(struct nl_msg)); ++ ++ NETLINK_CB(skb).pid = 0; /* from kernel */ ++ NETLINK_CB(skb).dst_pid = 0; /* multicast */ ++ NETLINK_CB(skb).dst_group = 10; ++ ++ if (nl) { ++ DEBUGP(KERN_WARNING ++ "IFWLOG: nlmsg_len=%ld\nnlmsg_type=%d nlmsg_flags=%d\nnlmsg_seq=%ld nlmsg_pid = %ld\n", ++ (long)nlh->nlmsg_len, nlh->nlmsg_type, nlh->nlmsg_flags, ++ (long)nlh->nlmsg_seq, (long)nlh->nlmsg_pid); ++ DEBUGP(KERN_WARNING "prefix : %s\n", msg.prefix); ++ ++ netlink_broadcast(nl, skb, 0, 10, GFP_ATOMIC); ++ return ; ++ } ++ ++ nlmsg_failure: ++ if (skb) ++ kfree_skb(skb); ++ PRINTR(KERN_WARNING "IFWLOG: Error sending netlink packet\n"); ++ return ; ++} ++ ++/* fill struct for userspace */ ++static void ipt_IFWLOG_packet(const struct sk_buff *skb, ++ const struct net_device *in, ++ const struct net_device *out, ++ const struct ipt_IFWLOG_info *info) ++{ ++ struct iphdr iph; ++ struct tcphdr tcph; ++ struct udphdr udph; ++ struct nl_msg msg; ++ struct iphdr _iph, *ih; ++ struct timeval tv; ++ ++ memset(&msg, 0, sizeof(struct nl_msg)); ++ ++ ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); ++ if (ih == NULL) { ++ PRINTR(KERN_WARNING "IFWLOG: skb truncated"); ++ return; ++ } ++ ++ /* save interface name */ ++ if (in) ++ strcpy(msg.indev_name, in->name); ++ if (out) ++ strcpy(msg.outdev_name, out->name); ++ ++ /* save log-prefix */ ++ strcpy(msg.prefix, info->prefix); ++ ++ /* save ip header */ ++ skb_copy_bits(skb, 0, &iph, sizeof(iph)); ++ memcpy(&msg.ip, &iph, sizeof(struct iphdr)); ++ ++ /* save transport header */ ++ switch (iph.protocol){ ++ case IPPROTO_TCP: ++ skb_copy_bits(skb, iph.ihl*4 , &tcph, sizeof(tcph)); ++ memcpy(&msg.h.th, &tcph, sizeof(struct tcphdr)); ++ break; ++ case IPPROTO_UDP: ++ skb_copy_bits(skb, iph.ihl*4 , &udph, sizeof(udph)); ++ memcpy(&msg.h.uh, &udph, sizeof(struct udphdr)); ++ break; ++ default: ++ break; ++ } ++ ++ /* save timetamp */ ++ do_gettimeofday((struct timeval *)&tv); ++ msg.timestamp_sec = tv.tv_sec; ++ ++ send_packet(msg); ++} ++ ++static unsigned int ipt_IFWLOG_target(struct sk_buff **pskb, ++ const struct net_device *in, ++ const struct net_device *out, ++ unsigned int hooknum, ++ const void *targinfo, ++ void *userinfo) ++{ ++ const struct ipt_IFWLOG_info *info = targinfo; ++ ++ ipt_IFWLOG_packet(*pskb, in, out, info); ++ ++ return IPT_CONTINUE; ++} ++ ++static int ipt_IFWLOG_checkentry(const char *tablename, ++ const struct ipt_entry *e, ++ void *targinfo, ++ unsigned int targinfosize, ++ unsigned int hook_mask) ++{ ++ const struct ipt_IFWLOG_info *info = targinfo; ++ ++ if (info->prefix[sizeof(info->prefix)-1] != '\0') { ++ DEBUGP("IFWLOG: prefix term %i\n", ++ info->prefix[sizeof(info->prefix)-1]); ++ return 0; ++ } ++ ++ return 1; ++} ++ ++static struct ipt_target ipt_IFWLOG = { ++ .name = "IFWLOG", ++ .target = ipt_IFWLOG_target, ++ .targetsize = sizeof(struct ipt_IFWLOG_info), ++ .checkentry = ipt_IFWLOG_checkentry, ++ .me = THIS_MODULE, ++}; ++ ++static int __init init(void) ++{ ++ nl = (struct sock*) 
netlink_kernel_create(NETLINK_IFWLOG, GROUP, NULL, THIS_MODULE); ++ if (!nl) { ++ PRINTR(KERN_WARNING "IFWLOG: cannot create netlink socket\n"); ++ return -EINVAL; ++ } ++ ++ if (ipt_register_target(&ipt_IFWLOG)) { ++ if (nl && nl->sk_socket) ++ sock_release(nl->sk_socket); ++ return -EINVAL; ++ } ++ ++ PRINTR(KERN_INFO "IFWLOG: register target\n"); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ if (nl && nl->sk_socket) ++ sock_release(nl->sk_socket); ++ PRINTR(KERN_INFO "IFWLOG: unregister target\n"); ++ ipt_unregister_target(&ipt_IFWLOG); ++} ++ ++module_init(init); ++module_exit(fini); +--- a/net/ipv4/netfilter/Kconfig ++++ b/net/ipv4/netfilter/Kconfig +@@ -331,6 +331,17 @@ config IP_NF_TARGET_TTL + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_HL. + ++config IP_NF_TARGET_IFWLOG ++ tristate 'IFWLOG target support' ++ depends on IP_NF_IPTABLES ++ help ++ This option adds a `IFWLOG' target, which is used by ++ Interactive Firewall for sending informations to a userspace ++ daemon ++ ++ If you want to compile it as a module, say M here and read ++ Documentation/modules.txt. If unsure, say `N'. ++ + # raw + specific targets + config IP_NF_RAW + tristate 'raw table support (required for NOTRACK/TRACE)' +--- /dev/null ++++ b/include/linux/netfilter_ipv4/ipt_IFWLOG.h +@@ -0,0 +1,26 @@ ++#ifndef _IPT_IFWLOG_H ++#define _IPT_IFWLOG_H ++ ++#ifndef NETLINK_IFWLOG ++#define NETLINK_IFWLOG 19 ++#endif ++ ++#define PREFSIZ 32 ++ ++struct nl_msg { /* Netlink message */ ++ long timestamp_sec; /* time packet */ ++ char indev_name[IFNAMSIZ]; /* name of the ingoing interface */ ++ char outdev_name[IFNAMSIZ]; /* name of the outgoing interface */ ++ unsigned char prefix[PREFSIZ]; /* informations on the logging reason */ ++ struct iphdr ip; ++ union { ++ struct tcphdr th; ++ struct udphdr uh; ++ } h; ++}; ++ ++struct ipt_IFWLOG_info { ++ char prefix[PREFSIZ]; ++}; ++ ++#endif /* _IPT_IFWLOG_H */ +--- linux/net/ipv4/netfilter/Makefile.net-netfilter-IFWLOG.orig 2012-05-21 01:29:13.000000000 +0300 ++++ linux/net/ipv4/netfilter/Makefile 2012-05-26 01:23:57.511514194 +0300 +@@ -53,6 +53,7 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ip + + # targets + obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o ++obj-$(CONFIG_IP_NF_TARGET_IFWLOG) += ipt_IFWLOG.o + obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o + obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o + obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o diff --git a/3.2.34/net-netfilter-psd-2.6.35-buildfix.patch b/3.2.34/net-netfilter-psd-2.6.35-buildfix.patch new file mode 100644 index 0000000..218031c --- /dev/null +++ b/3.2.34/net-netfilter-psd-2.6.35-buildfix.patch @@ -0,0 +1,11 @@ +--- linux-2.6.35-rc6-git-mnb0.1/net/ipv4/netfilter/ipt_psd.c.orig 2010-07-30 21:17:30.000000000 +0300 ++++ linux-2.6.35-rc6-git-mnb0.1/net/ipv4/netfilter/ipt_psd.c 2010-07-31 13:29:00.623601957 +0300 +@@ -98,7 +98,7 @@ static inline int hashfunc(struct in_add + + static bool + ipt_psd_match(const struct sk_buff *pskb, +- const struct xt_match_param *match_param) ++ struct xt_action_param *match_param) + { + struct iphdr *ip_hdr; + struct tcphdr *tcp_hdr; diff --git a/3.2.34/net-netfilter-psd-mdv.patch b/3.2.34/net-netfilter-psd-mdv.patch new file mode 100644 index 0000000..68884aa --- /dev/null +++ b/3.2.34/net-netfilter-psd-mdv.patch @@ -0,0 +1,235 @@ +ipt_psd: Mandriva changes + +This patch holds all the Mandriva changes done in ipt_psd +netfilter module. + +Most of the time they're just upgrades to match with new +API in the kernel. 
+ +This work is mostly done by Thomas Backlund, Herton R. +Krzesinski and Luiz Fernando N. Capitulino. + +Signed-off-by: Luiz Fernando N. Capitulino +Signed-off-by: Herton Ronaldo Krzesinski + +--- + include/linux/netfilter_ipv4/Kbuild | 1 + net/ipv4/netfilter/Kconfig | 8 ++ + net/ipv4/netfilter/ipt_psd.c | 113 ++++++++++++++---------------------- + 3 files changed, 55 insertions(+), 67 deletions(-) + +diff -p -up linux-2.6.28/net/ipv4/netfilter/ipt_psd.c.orig linux-2.6.28/net/ipv4/netfilter/ipt_psd.c +--- linux-2.6.28/net/ipv4/netfilter/ipt_psd.c.orig 2008-12-12 11:03:05.000000000 -0500 ++++ linux-2.6.28/net/ipv4/netfilter/ipt_psd.c 2008-12-12 11:04:03.000000000 -0500 +@@ -1,21 +1,24 @@ + /* +- This is a module which is used for PSD (portscan detection) +- Derived from scanlogd v2.1 written by Solar Designer +- and LOG target module. +- +- Copyright (C) 2000,2001 astaro AG +- +- This file is distributed under the terms of the GNU General Public +- License (GPL). Copies of the GPL can be obtained from: +- ftp://prep.ai.mit.edu/pub/gnu/GPL +- +- 2000-05-04 Markus Hennig : initial +- 2000-08-18 Dennis Koslowski : first release +- 2000-12-01 Dennis Koslowski : UDP scans detection added +- 2001-01-02 Dennis Koslowski : output modified +- 2001-02-04 Jan Rekorajski : converted from target to match +- 2004-05-05 Martijn Lievaart : ported to 2.6 +-*/ ++ * This is a module which is used for PSD (portscan detection) ++ * Derived from scanlogd v2.1 written by Solar Designer ++ * and LOG target module. ++ * ++ * Copyright (C) 2000,2001 astaro AG ++ * ++ * This file is distributed under the terms of the GNU General Public ++ * License (GPL). Copies of the GPL can be obtained from: ++ * ftp://prep.ai.mit.edu/pub/gnu/GPL ++ * ++ * 2000-05-04 Markus Hennig : initial ++ * 2000-08-18 Dennis Koslowski : first release ++ * 2000-12-01 Dennis Koslowski : UDP scans detection added ++ * 2001-01-02 Dennis Koslowski : output modified ++ * 2001-02-04 Jan Rekorajski : converted from target to match ++ * 2004-05-05 Martijn Lievaart : ported to 2.6 ++ * 2007-10-10 Thomas Backlund : 2.6.22 update ++ * 2007-11-14 Luiz Capitulino : 2.6.22 API usage fixes ++ * 2007-11-26 Herton Ronaldo Krzesinski : switch xt_match->match to bool ++ */ + + #include + #include +@@ -54,7 +57,7 @@ struct port { + */ + struct host { + struct host *next; /* Next entry with the same hash */ +- clock_t timestamp; /* Last update time */ ++ unsigned long timestamp; /* Last update time */ + struct in_addr src_addr; /* Source address */ + struct in_addr dest_addr; /* Destination address */ + unsigned short src_port; /* Source port */ +@@ -93,33 +96,29 @@ static inline int hashfunc(struct in_add + return hash & (HASH_SIZE - 1); + } + +-static int ++static bool + ipt_psd_match(const struct sk_buff *pskb, +- const struct net_device *in, +- const struct net_device *out, +- const void *matchinfo, +- int offset, +- int *hotdrop) ++ const struct xt_match_param *match_param) + { + struct iphdr *ip_hdr; + struct tcphdr *tcp_hdr; + struct in_addr addr; + u_int16_t src_port,dest_port; + u_int8_t tcp_flags, proto; +- clock_t now; ++ unsigned long now; + struct host *curr, *last, **head; + int hash, index, count; + + /* Parameters from userspace */ +- const struct ipt_psd_info *psdinfo = matchinfo; ++ const struct ipt_psd_info *psdinfo = match_param->matchinfo; + + /* IP header */ +- ip_hdr = pskb->nh.iph; ++ ip_hdr = ipip_hdr(pskb); + + /* Sanity check */ + if (ntohs(ip_hdr->frag_off) & IP_OFFSET) { + DEBUGP("PSD: sanity check failed\n"); +- return 0; ++ return 
false; + } + + /* TCP or UDP ? */ +@@ -127,7 +126,7 @@ ipt_psd_match(const struct sk_buff *pskb + + if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { + DEBUGP("PSD: protocol not supported\n"); +- return 0; ++ return false; + } + + /* Get the source address, source & destination ports, and TCP flags */ +@@ -151,7 +150,7 @@ ipt_psd_match(const struct sk_buff *pskb + * them spoof us. [DHCP needs this feature - HW] */ + if (!addr.s_addr) { + DEBUGP("PSD: spoofed source address (0.0.0.0)\n"); +- return 0; ++ return false; + } + + /* Use jiffies here not to depend on someone setting the time while we're +@@ -298,46 +297,26 @@ ipt_psd_match(const struct sk_buff *pskb + + out_no_match: + spin_unlock(&state.lock); +- return 0; ++ return false; + + out_match: + spin_unlock(&state.lock); +- return 1; ++ DEBUGP("PSD: Dropping packets from "NIPQUAD_FMT" \n", ++ NIPQUAD(curr->src_addr.s_addr)); ++ return true; + } + +-static int ipt_psd_checkentry(const char *tablename, +- const struct ipt_ip *e, +- void *matchinfo, +- unsigned int matchsize, +- unsigned int hook_mask) +-{ +-/* const struct ipt_psd_info *psdinfo = targinfo;*/ +- +- /* we accept TCP only */ +-/* if (e->ip.proto != IPPROTO_TCP) { */ +-/* DEBUGP("PSD: specified protocol may be TCP only\n"); */ +-/* return 0; */ +-/* } */ +- +- if (matchsize != IPT_ALIGN(sizeof(struct ipt_psd_info))) { +- DEBUGP("PSD: matchsize %u != %u\n", +- matchsize, +- IPT_ALIGN(sizeof(struct ipt_psd_info))); +- return 0; +- } +- +- return 1; +-} +- +-static struct ipt_match ipt_psd_reg = { +- .name = "psd", +- .match = ipt_psd_match, +- .checkentry = ipt_psd_checkentry, +- .me = THIS_MODULE }; ++static struct xt_match ipt_psd_reg = { ++ .name = "psd", ++ .family = AF_INET, ++ .match = ipt_psd_match, ++ .matchsize = sizeof(struct ipt_psd_info), ++ .me = THIS_MODULE ++}; + +-static int __init init(void) ++static int __init ipt_psd_init(void) + { +- if (ipt_register_match(&ipt_psd_reg)) ++ if (xt_register_match(&ipt_psd_reg)) + return -EINVAL; + + memset(&state, 0, sizeof(state)); +@@ -348,11 +327,11 @@ static int __init init(void) + return 0; + } + +-static void __exit fini(void) ++static void __exit ipt_psd_fini(void) + { +- ipt_unregister_match(&ipt_psd_reg); ++ xt_unregister_match(&ipt_psd_reg); + printk("netfilter PSD unloaded - (c) astaro AG\n"); + } + +-module_init(init); +-module_exit(fini); ++module_init(ipt_psd_init); ++module_exit(ipt_psd_fini); +--- a/net/ipv4/netfilter/Kconfig ++++ b/net/ipv4/netfilter/Kconfig +@@ -322,6 +322,14 @@ + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_HL. + ++config IP_NF_MATCH_PSD ++ tristate 'Port scanner detection support' ++ depends on NETFILTER_ADVANCED ++ help ++ Module used for PSD (portscan detection). ++ ++ To compile it as a module, choose M here. If unsure, say N. 
++ + config IP_NF_TARGET_IFWLOG + tristate 'IFWLOG target support' + depends on IP_NF_IPTABLES +--- linux/include/linux/netfilter_ipv4/Kbuild.net-netfilter-psd-mdv.orig 2012-05-26 01:28:56.000000000 +0300 ++++ linux/include/linux/netfilter_ipv4/Kbuild 2012-05-26 01:30:21.493540796 +0300 +@@ -11,6 +11,7 @@ + header-y += ipt_addrtype.h + header-y += ipt_ah.h + header-y += ipt_ecn.h ++header-y += ipt_psd.h + header-y += ipt_realm.h + header-y += ipt_ttl.h + header-y += nf_nat.h diff --git a/3.2.34/net-netfilter-psd.patch b/3.2.34/net-netfilter-psd.patch new file mode 100644 index 0000000..c8ad7a9 --- /dev/null +++ b/3.2.34/net-netfilter-psd.patch @@ -0,0 +1,420 @@ +--- + include/linux/netfilter_ipv4/ipt_psd.h | 40 +++ + net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_psd.c | 358 +++++++++++++++++++++++++++++++++ + 3 files changed, 399 insertions(+) + +--- /dev/null ++++ b/net/ipv4/netfilter/ipt_psd.c +@@ -0,0 +1,358 @@ ++/* ++ This is a module which is used for PSD (portscan detection) ++ Derived from scanlogd v2.1 written by Solar Designer ++ and LOG target module. ++ ++ Copyright (C) 2000,2001 astaro AG ++ ++ This file is distributed under the terms of the GNU General Public ++ License (GPL). Copies of the GPL can be obtained from: ++ ftp://prep.ai.mit.edu/pub/gnu/GPL ++ ++ 2000-05-04 Markus Hennig : initial ++ 2000-08-18 Dennis Koslowski : first release ++ 2000-12-01 Dennis Koslowski : UDP scans detection added ++ 2001-01-02 Dennis Koslowski : output modified ++ 2001-02-04 Jan Rekorajski : converted from target to match ++ 2004-05-05 Martijn Lievaart : ported to 2.6 ++*/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#if 0 ++#define DEBUGP printk ++#else ++#define DEBUGP(format, args...) ++#endif ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Dennis Koslowski "); ++ ++#define HF_DADDR_CHANGING 0x01 ++#define HF_SPORT_CHANGING 0x02 ++#define HF_TOS_CHANGING 0x04 ++#define HF_TTL_CHANGING 0x08 ++ ++/* ++ * Information we keep per each target port ++ */ ++struct port { ++ u_int16_t number; /* port number */ ++ u_int8_t proto; /* protocol number */ ++ u_int8_t and_flags; /* tcp ANDed flags */ ++ u_int8_t or_flags; /* tcp ORed flags */ ++}; ++ ++/* ++ * Information we keep per each source address. ++ */ ++struct host { ++ struct host *next; /* Next entry with the same hash */ ++ clock_t timestamp; /* Last update time */ ++ struct in_addr src_addr; /* Source address */ ++ struct in_addr dest_addr; /* Destination address */ ++ unsigned short src_port; /* Source port */ ++ int count; /* Number of ports in the list */ ++ int weight; /* Total weight of ports in the list */ ++ struct port ports[SCAN_MAX_COUNT - 1]; /* List of ports */ ++ unsigned char tos; /* TOS */ ++ unsigned char ttl; /* TTL */ ++ unsigned char flags; /* HF_ flags bitmask */ ++}; ++ ++/* ++ * State information. ++ */ ++static struct { ++ spinlock_t lock; ++ struct host list[LIST_SIZE]; /* List of source addresses */ ++ struct host *hash[HASH_SIZE]; /* Hash: pointers into the list */ ++ int index; /* Oldest entry to be replaced */ ++} state; ++ ++/* ++ * Convert an IP address into a hash table index. 
++ */ ++static inline int hashfunc(struct in_addr addr) ++{ ++ unsigned int value; ++ int hash; ++ ++ value = addr.s_addr; ++ hash = 0; ++ do { ++ hash ^= value; ++ } while ((value >>= HASH_LOG)); ++ ++ return hash & (HASH_SIZE - 1); ++} ++ ++static int ++ipt_psd_match(const struct sk_buff *pskb, ++ const struct net_device *in, ++ const struct net_device *out, ++ const void *matchinfo, ++ int offset, ++ int *hotdrop) ++{ ++ struct iphdr *ip_hdr; ++ struct tcphdr *tcp_hdr; ++ struct in_addr addr; ++ u_int16_t src_port,dest_port; ++ u_int8_t tcp_flags, proto; ++ clock_t now; ++ struct host *curr, *last, **head; ++ int hash, index, count; ++ ++ /* Parameters from userspace */ ++ const struct ipt_psd_info *psdinfo = matchinfo; ++ ++ /* IP header */ ++ ip_hdr = pskb->nh.iph; ++ ++ /* Sanity check */ ++ if (ntohs(ip_hdr->frag_off) & IP_OFFSET) { ++ DEBUGP("PSD: sanity check failed\n"); ++ return 0; ++ } ++ ++ /* TCP or UDP ? */ ++ proto = ip_hdr->protocol; ++ ++ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { ++ DEBUGP("PSD: protocol not supported\n"); ++ return 0; ++ } ++ ++ /* Get the source address, source & destination ports, and TCP flags */ ++ ++ addr.s_addr = ip_hdr->saddr; ++ ++ tcp_hdr = (struct tcphdr*)((u_int32_t *)ip_hdr + ip_hdr->ihl); ++ ++ /* Yep, it´s dirty */ ++ src_port = tcp_hdr->source; ++ dest_port = tcp_hdr->dest; ++ ++ if (proto == IPPROTO_TCP) { ++ tcp_flags = *((u_int8_t*)tcp_hdr + 13); ++ } ++ else { ++ tcp_flags = 0x00; ++ } ++ ++ /* We're using IP address 0.0.0.0 for a special purpose here, so don't let ++ * them spoof us. [DHCP needs this feature - HW] */ ++ if (!addr.s_addr) { ++ DEBUGP("PSD: spoofed source address (0.0.0.0)\n"); ++ return 0; ++ } ++ ++ /* Use jiffies here not to depend on someone setting the time while we're ++ * running; we need to be careful with possible return value overflows. */ ++ now = jiffies; ++ ++ spin_lock(&state.lock); ++ ++ /* Do we know this source address already? */ ++ count = 0; ++ last = NULL; ++ if ((curr = *(head = &state.hash[hash = hashfunc(addr)]))) ++ do { ++ if (curr->src_addr.s_addr == addr.s_addr) break; ++ count++; ++ if (curr->next) last = curr; ++ } while ((curr = curr->next)); ++ ++ if (curr) { ++ ++ /* We know this address, and the entry isn't too old. Update it. */ ++ if (now - curr->timestamp <= (psdinfo->delay_threshold*HZ)/100 && ++ time_after_eq(now, curr->timestamp)) { ++ ++ /* Just update the appropriate list entry if we've seen this port already */ ++ for (index = 0; index < curr->count; index++) { ++ if (curr->ports[index].number == dest_port) { ++ curr->ports[index].proto = proto; ++ curr->ports[index].and_flags &= tcp_flags; ++ curr->ports[index].or_flags |= tcp_flags; ++ goto out_no_match; ++ } ++ } ++ ++ /* TCP/ACK and/or TCP/RST to a new port? This could be an outgoing connection. */ ++ if (proto == IPPROTO_TCP && (tcp_hdr->ack || tcp_hdr->rst)) ++ goto out_no_match; ++ ++ /* Packet to a new port, and not TCP/ACK: update the timestamp */ ++ curr->timestamp = now; ++ ++ /* Logged this scan already? Then drop the packet. 
*/ ++ if (curr->weight >= psdinfo->weight_threshold) ++ goto out_match; ++ ++ /* Specify if destination address, source port, TOS or TTL are not fixed */ ++ if (curr->dest_addr.s_addr != ip_hdr->daddr) ++ curr->flags |= HF_DADDR_CHANGING; ++ if (curr->src_port != src_port) ++ curr->flags |= HF_SPORT_CHANGING; ++ if (curr->tos != ip_hdr->tos) ++ curr->flags |= HF_TOS_CHANGING; ++ if (curr->ttl != ip_hdr->ttl) ++ curr->flags |= HF_TTL_CHANGING; ++ ++ /* Update the total weight */ ++ curr->weight += (ntohs(dest_port) < 1024) ? ++ psdinfo->lo_ports_weight : psdinfo->hi_ports_weight; ++ ++ /* Got enough destination ports to decide that this is a scan? */ ++ /* Then log it and drop the packet. */ ++ if (curr->weight >= psdinfo->weight_threshold) ++ goto out_match; ++ ++ /* Remember the new port */ ++ if (curr->count < SCAN_MAX_COUNT) { ++ curr->ports[curr->count].number = dest_port; ++ curr->ports[curr->count].proto = proto; ++ curr->ports[curr->count].and_flags = tcp_flags; ++ curr->ports[curr->count].or_flags = tcp_flags; ++ curr->count++; ++ } ++ ++ goto out_no_match; ++ } ++ ++ /* We know this address, but the entry is outdated. Mark it unused, and ++ * remove from the hash table. We'll allocate a new entry instead since ++ * this one might get re-used too soon. */ ++ curr->src_addr.s_addr = 0; ++ if (last) ++ last->next = last->next->next; ++ else if (*head) ++ *head = (*head)->next; ++ last = NULL; ++ } ++ ++ /* We don't need an ACK from a new source address */ ++ if (proto == IPPROTO_TCP && tcp_hdr->ack) ++ goto out_no_match; ++ ++ /* Got too many source addresses with the same hash value? Then remove the ++ * oldest one from the hash table, so that they can't take too much of our ++ * CPU time even with carefully chosen spoofed IP addresses. */ ++ if (count >= HASH_MAX && last) last->next = NULL; ++ ++ /* We're going to re-use the oldest list entry, so remove it from the hash ++ * table first (if it is really already in use, and isn't removed from the ++ * hash table already because of the HASH_MAX check above). */ ++ ++ /* First, find it */ ++ if (state.list[state.index].src_addr.s_addr) ++ head = &state.hash[hashfunc(state.list[state.index].src_addr)]; ++ else ++ head = &last; ++ last = NULL; ++ if ((curr = *head)) ++ do { ++ if (curr == &state.list[state.index]) break; ++ last = curr; ++ } while ((curr = curr->next)); ++ ++ /* Then, remove it */ ++ if (curr) { ++ if (last) ++ last->next = last->next->next; ++ else if (*head) ++ *head = (*head)->next; ++ } ++ ++ /* Get our list entry */ ++ curr = &state.list[state.index++]; ++ if (state.index >= LIST_SIZE) state.index = 0; ++ ++ /* Link it into the hash table */ ++ head = &state.hash[hash]; ++ curr->next = *head; ++ *head = curr; ++ ++ /* And fill in the fields */ ++ curr->timestamp = now; ++ curr->src_addr = addr; ++ curr->dest_addr.s_addr = ip_hdr->daddr; ++ curr->src_port = src_port; ++ curr->count = 1; ++ curr->weight = (ntohs(dest_port) < 1024) ? 
++ psdinfo->lo_ports_weight : psdinfo->hi_ports_weight; ++ curr->ports[0].number = dest_port; ++ curr->ports[0].proto = proto; ++ curr->ports[0].and_flags = tcp_flags; ++ curr->ports[0].or_flags = tcp_flags; ++ curr->tos = ip_hdr->tos; ++ curr->ttl = ip_hdr->ttl; ++ ++out_no_match: ++ spin_unlock(&state.lock); ++ return 0; ++ ++out_match: ++ spin_unlock(&state.lock); ++ return 1; ++} ++ ++static int ipt_psd_checkentry(const char *tablename, ++ const struct ipt_ip *e, ++ void *matchinfo, ++ unsigned int matchsize, ++ unsigned int hook_mask) ++{ ++/* const struct ipt_psd_info *psdinfo = targinfo;*/ ++ ++ /* we accept TCP only */ ++/* if (e->ip.proto != IPPROTO_TCP) { */ ++/* DEBUGP("PSD: specified protocol may be TCP only\n"); */ ++/* return 0; */ ++/* } */ ++ ++ if (matchsize != IPT_ALIGN(sizeof(struct ipt_psd_info))) { ++ DEBUGP("PSD: matchsize %u != %u\n", ++ matchsize, ++ IPT_ALIGN(sizeof(struct ipt_psd_info))); ++ return 0; ++ } ++ ++ return 1; ++} ++ ++static struct ipt_match ipt_psd_reg = { ++ .name = "psd", ++ .match = ipt_psd_match, ++ .checkentry = ipt_psd_checkentry, ++ .me = THIS_MODULE }; ++ ++static int __init init(void) ++{ ++ if (ipt_register_match(&ipt_psd_reg)) ++ return -EINVAL; ++ ++ memset(&state, 0, sizeof(state)); ++ ++ spin_lock_init(&(state.lock)); ++ ++ printk("netfilter PSD loaded - (c) astaro AG\n"); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ ipt_unregister_match(&ipt_psd_reg); ++ printk("netfilter PSD unloaded - (c) astaro AG\n"); ++} ++ ++module_init(init); ++module_exit(fini); +--- /dev/null ++++ b/include/linux/netfilter_ipv4/ipt_psd.h +@@ -0,0 +1,40 @@ ++#ifndef _IPT_PSD_H ++#define _IPT_PSD_H ++ ++#include ++#include ++ ++/* ++ * High port numbers have a lower weight to reduce the frequency of false ++ * positives, such as from passive mode FTP transfers. ++ */ ++#define PORT_WEIGHT_PRIV 3 ++#define PORT_WEIGHT_HIGH 1 ++ ++/* ++ * Port scan detection thresholds: at least COUNT ports need to be scanned ++ * from the same source, with no longer than DELAY ticks between ports. ++ */ ++#define SCAN_MIN_COUNT 7 ++#define SCAN_MAX_COUNT (SCAN_MIN_COUNT * PORT_WEIGHT_PRIV) ++#define SCAN_WEIGHT_THRESHOLD SCAN_MAX_COUNT ++#define SCAN_DELAY_THRESHOLD (300) /* old usage of HZ here was erroneously and broke under uml */ ++ ++/* ++ * Keep track of up to LIST_SIZE source addresses, using a hash table of ++ * HASH_SIZE entries for faster lookups, but limiting hash collisions to ++ * HASH_MAX source addresses per the same hash value. 
++ */ ++#define LIST_SIZE 0x100 ++#define HASH_LOG 9 ++#define HASH_SIZE (1 << HASH_LOG) ++#define HASH_MAX 0x10 ++ ++struct ipt_psd_info { ++ unsigned int weight_threshold; ++ unsigned int delay_threshold; ++ unsigned short lo_ports_weight; ++ unsigned short hi_ports_weight; ++}; ++ ++#endif /*_IPT_PSD_H*/ +--- a/net/ipv4/netfilter/Makefile ++++ b/net/ipv4/netfilter/Makefile +@@ -49,6 +49,7 @@ + + # matches + obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o ++obj-$(CONFIG_IP_NF_MATCH_PSD) += ipt_psd.o + obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o + + # targets diff --git a/3.2.34/netfilter-implement-rfc-1123-for-ftp-conntrack.patch b/3.2.34/netfilter-implement-rfc-1123-for-ftp-conntrack.patch new file mode 100644 index 0000000..30cae8c --- /dev/null +++ b/3.2.34/netfilter-implement-rfc-1123-for-ftp-conntrack.patch @@ -0,0 +1,190 @@ +From: Jeff Mahoney +Subject: netfilter: Implement RFC 1123 for FTP conntrack +References: bnc#466279 bnc#681639 +Patch-mainline: Submitted via http://bugzilla.netfilter.org/show_bug.cgi?id=574 23 Jan 2011 + + The FTP conntrack code currently only accepts the following format for + the 227 response for PASV: + 227 Entering Passive Mode (148,100,81,40,31,161). + + It doesn't accept the following format from an obscure server: + 227 Data transfer will passively listen to 67,218,99,134,50,144 + + From RFC 1123: + The format of the 227 reply to a PASV command is not + well standardized. In particular, an FTP client cannot + assume that the parentheses shown on page 40 of RFC-959 + will be present (and in fact, Figure 3 on page 43 omits + them). Therefore, a User-FTP program that interprets + the PASV reply must scan the reply for the first digit + of the host and port numbers. + + This patch adds support for the RFC 1123 clarification by: + - Allowing a search filter to specify NUL as the terminator so that + try_number will return successfully if the array of numbers has been + filled when an unexpected character is encountered. + - Using space as the separator for the 227 reply and then scanning for + the first digit of the number sequence. The number sequence is parsed + out using the existing try_rfc959 but with a NUL terminator. 
+ + Tracked in: https://bugzilla.novell.com/show_bug.cgi?id=466279 + +Reported-by: Mark Post +Signed-off-by: Jeff Mahoney +--- + net/netfilter/nf_conntrack_ftp.c | 73 ++++++++++++++++++++++++++++----------- + 1 file changed, 54 insertions(+), 19 deletions(-) + +--- a/net/netfilter/nf_conntrack_ftp.c ++++ b/net/netfilter/nf_conntrack_ftp.c +@@ -53,10 +53,14 @@ unsigned int (*nf_nat_ftp_hook)(struct s + struct nf_conntrack_expect *exp); + EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); + +-static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char); +-static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char); ++static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, ++ char, unsigned int *); ++static int try_rfc1123(const char *, size_t, struct nf_conntrack_man *, ++ char, unsigned int *); ++static int try_eprt(const char *, size_t, struct nf_conntrack_man *, ++ char, unsigned int *); + static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, +- char); ++ char, unsigned int *); + + static struct ftp_search { + const char *pattern; +@@ -64,7 +68,7 @@ static struct ftp_search { + char skip; + char term; + enum nf_ct_ftp_type ftptype; +- int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); ++ int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *); + } search[IP_CT_DIR_MAX][2] = { + [IP_CT_DIR_ORIGINAL] = { + { +@@ -88,10 +92,8 @@ static struct ftp_search { + { + .pattern = "227 ", + .plen = sizeof("227 ") - 1, +- .skip = '(', +- .term = ')', + .ftptype = NF_CT_FTP_PASV, +- .getnum = try_rfc959, ++ .getnum = try_rfc1123, + }, + { + .pattern = "229 ", +@@ -130,8 +132,9 @@ static int try_number(const char *data, + i++; + else { + /* Unexpected character; true if it's the +- terminator and we're finished. */ +- if (*data == term && i == array_size - 1) ++ terminator (or we don't care about one) ++ and we're finished. */ ++ if ((*data == term || !term) && i == array_size - 1) + return len; + + pr_debug("Char %u (got %u nums) `%u' unexpected\n", +@@ -146,7 +149,8 @@ static int try_number(const char *data, + + /* Returns 0, or length of numbers: 192,168,1,1,5,6 */ + static int try_rfc959(const char *data, size_t dlen, +- struct nf_conntrack_man *cmd, char term) ++ struct nf_conntrack_man *cmd, char term, ++ unsigned int *offset) + { + int length; + u_int32_t array[6]; +@@ -161,6 +165,33 @@ static int try_rfc959(const char *data, + return length; + } + ++/* ++ * From RFC 1123: ++ * The format of the 227 reply to a PASV command is not ++ * well standardized. In particular, an FTP client cannot ++ * assume that the parentheses shown on page 40 of RFC-959 ++ * will be present (and in fact, Figure 3 on page 43 omits ++ * them). Therefore, a User-FTP program that interprets ++ * the PASV reply must scan the reply for the first digit ++ * of the host and port numbers. 
++ */ ++static int try_rfc1123(const char *data, size_t dlen, ++ struct nf_conntrack_man *cmd, char term, ++ unsigned int *offset) ++{ ++ int i; ++ for (i = 0; i < dlen; i++) ++ if (isdigit(data[i])) ++ break; ++ ++ if (i == dlen) ++ return 0; ++ ++ *offset += i; ++ ++ return try_rfc959(data + i, dlen - i, cmd, 0, offset); ++} ++ + /* Grab port: number up to delimiter */ + static int get_port(const char *data, int start, size_t dlen, char delim, + __be16 *port) +@@ -189,7 +220,7 @@ static int get_port(const char *data, in + + /* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */ + static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, +- char term) ++ char term, unsigned int *offset) + { + char delim; + int length; +@@ -237,7 +268,8 @@ static int try_eprt(const char *data, si + + /* Returns 0, or length of numbers: |||6446| */ + static int try_epsv_response(const char *data, size_t dlen, +- struct nf_conntrack_man *cmd, char term) ++ struct nf_conntrack_man *cmd, char term, ++ unsigned int *offset) + { + char delim; + +@@ -259,9 +291,10 @@ static int find_pattern(const char *data + unsigned int *numlen, + struct nf_conntrack_man *cmd, + int (*getnum)(const char *, size_t, +- struct nf_conntrack_man *, char)) ++ struct nf_conntrack_man *, char, ++ unsigned int *)) + { +- size_t i; ++ size_t i = plen; + + pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); + if (dlen == 0) +@@ -291,16 +324,18 @@ static int find_pattern(const char *data + pr_debug("Pattern matches!\n"); + /* Now we've found the constant string, try to skip + to the 'skip' character */ +- for (i = plen; data[i] != skip; i++) +- if (i == dlen - 1) return -1; ++ if (skip) { ++ for (i = plen; data[i] != skip; i++) ++ if (i == dlen - 1) return -1; + +- /* Skip over the last character */ +- i++; ++ /* Skip over the last character */ ++ i++; ++ } + + pr_debug("Skipped up to `%c'!\n", skip); + + *numoff = i; +- *numlen = getnum(data + i, dlen - i, cmd, term); ++ *numlen = getnum(data + i, dlen - i, cmd, term, numoff); + if (!*numlen) + return -1; + diff --git a/3.2.34/netfilter-ip_conntrack_slp.patch b/3.2.34/netfilter-ip_conntrack_slp.patch new file mode 100644 index 0000000..ff72d85 --- /dev/null +++ b/3.2.34/netfilter-ip_conntrack_slp.patch @@ -0,0 +1,185 @@ +From: Jiri Bohac +Subject: connection tracking helper for SLP +References: fate#301134 +Patch-mainline: Not yet + +A simple connection tracking helper for SLP. Marks replies to a +SLP broadcast query as ESTABLISHED to allow them to pass through the +firewall. + +Signed-off-by: Jiri Bohac + +--- + net/netfilter/Kconfig | 15 ++++ + net/netfilter/Makefile | 1 + net/netfilter/nf_conntrack_slp.c | 131 +++++++++++++++++++++++++++++++++++++++ + 3 files changed, 147 insertions(+) + +--- a/net/netfilter/Kconfig ++++ b/net/netfilter/Kconfig +@@ -290,6 +290,21 @@ config NF_CONNTRACK_TFTP + + To compile it as a module, choose M here. If unsure, say N. + ++config NF_CONNTRACK_SLP ++ tristate "SLP protocol support" ++ depends on NF_CONNTRACK ++ depends on NETFILTER_ADVANCED ++ help ++ SLP queries are sometimes sent as broadcast messages from an ++ unprivileged port and responded to with unicast messages to the ++ same port. This make them hard to firewall properly because connection ++ tracking doesn't deal with broadcasts. This helper tracks locally ++ originating broadcast SLP queries and the corresponding ++ responses. It relies on correct IP address configuration, specifically ++ netmask and broadcast address. 
++ ++ To compile it as a module, choose M here. If unsure, say N. ++ + config NF_CT_NETLINK + tristate 'Connection tracking netlink interface' + select NETFILTER_NETLINK +--- a/net/netfilter/Makefile ++++ b/net/netfilter/Makefile +@@ -36,6 +36,7 @@ obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_co + obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o + obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o + obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o ++obj-$(CONFIG_NF_CONNTRACK_SLP) += nf_conntrack_slp.o + + # transparent proxy support + obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o +--- /dev/null ++++ b/net/netfilter/nf_conntrack_slp.c +@@ -0,0 +1,131 @@ ++/* ++ * NetBIOS name service broadcast connection tracking helper ++ * ++ * (c) 2007 Jiri Bohac ++ * (c) 2005 Patrick McHardy ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++/* ++ * This helper tracks locally originating NetBIOS name service ++ * requests by issuing permanent expectations (valid until ++ * timing out) matching all reply connections from the ++ * destination network. The only NetBIOS specific thing is ++ * actually the port number. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#define SLP_PORT 427 ++ ++MODULE_AUTHOR("Jiri Bohac "); ++MODULE_DESCRIPTION("SLP broadcast connection tracking helper"); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS("ip_conntrack_slp"); ++ ++static unsigned int timeout __read_mostly = 3; ++module_param(timeout, uint, 0400); ++MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); ++ ++static int help(struct sk_buff *skb, unsigned int protoff, ++ struct nf_conn *ct, enum ip_conntrack_info ctinfo) ++{ ++ struct nf_conntrack_expect *exp; ++ struct rtable *rt = skb_rtable(skb); ++ struct in_device *in_dev; ++ __be32 mask = 0; ++ __be32 src = 0; ++ ++ /* we're only interested in locally generated packets */ ++ if (skb->sk == NULL) ++ goto out; ++ if (rt == NULL || !(rt->rt_flags & (RTCF_MULTICAST|RTCF_BROADCAST))) ++ goto out; ++ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) ++ goto out; ++ ++ rcu_read_lock(); ++ in_dev = __in_dev_get_rcu(rt->dst.dev); ++ if (in_dev != NULL) { ++ for_primary_ifa(in_dev) { ++ /* this is a hack as slp uses multicast we can't match ++ * the destination address to some broadcast address. So ++ * just take the first one. 
Better would be to install ++ * expectations for all addresses */ ++ mask = ifa->ifa_mask; ++ src = ifa->ifa_broadcast; ++ break; ++ } endfor_ifa(in_dev); ++ } ++ rcu_read_unlock(); ++ ++ if (mask == 0 || src == 0) ++ goto out; ++ ++ exp = nf_ct_expect_alloc(ct); ++ if (exp == NULL) ++ goto out; ++ ++ exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; ++ exp->tuple.src.u3.ip = src; ++ exp->tuple.src.u.udp.port = htons(SLP_PORT); ++ ++ exp->mask.src.u3.ip = mask; ++ exp->mask.src.u.udp.port = htons(0xFFFF); ++ ++ exp->expectfn = NULL; ++ exp->flags = NF_CT_EXPECT_PERMANENT; ++ exp->class = NF_CT_EXPECT_CLASS_DEFAULT; ++ exp->helper = NULL; ++ ++ nf_ct_expect_related(exp); ++ nf_ct_expect_put(exp); ++ ++ nf_ct_refresh(ct, skb, timeout * HZ); ++out: ++ return NF_ACCEPT; ++} ++ ++static struct nf_conntrack_expect_policy exp_policy = { ++ .max_expected = 1, ++}; ++ ++static struct nf_conntrack_helper helper __read_mostly = { ++ .name = "slp", ++ .tuple.src.l3num = AF_INET, ++ .tuple.src.u.udp.port = __constant_htons(SLP_PORT), ++ .tuple.dst.protonum = IPPROTO_UDP, ++ .me = THIS_MODULE, ++ .help = help, ++ .expect_policy = &exp_policy, ++}; ++ ++static int __init nf_conntrack_slp_init(void) ++{ ++ exp_policy.timeout = timeout; ++ return nf_conntrack_helper_register(&helper); ++} ++ ++static void __exit nf_conntrack_slp_fini(void) ++{ ++ nf_conntrack_helper_unregister(&helper); ++} ++ ++module_init(nf_conntrack_slp_init); ++module_exit(nf_conntrack_slp_fini); diff --git a/3.2.34/patches.suse/0002-btrfs-Introduce-btrfs_get_maps_dev.patch b/3.2.34/patches.suse/0002-btrfs-Introduce-btrfs_get_maps_dev.patch new file mode 100644 index 0000000..3dd6a2b --- /dev/null +++ b/3.2.34/patches.suse/0002-btrfs-Introduce-btrfs_get_maps_dev.patch @@ -0,0 +1,39 @@ +From c83e5a977a2510de872d48a4d3bebc94dac0ed8f Mon Sep 17 00:00:00 2001 +From: Mark Fasheh +Date: Fri, 13 May 2011 16:01:39 -0700 +Subject: [PATCH 2/2] btrfs: Introduce btrfs_get_maps_dev() +References: bnc#672923 +Patch-mainline: Never + +Use this to return the subvolume superblock in proc instead of the global +superblock which is automatically taken today. This fixes a userspace +breakage where discrepancies between the devices two would confuse software +such as lsof. 
+ +Signed-off-by: Mark Fasheh +--- + fs/btrfs/super.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -1301,6 +1301,11 @@ static void btrfs_fs_dirty_inode(struct + "error %d\n", btrfs_ino(inode), ret); + } + ++static dev_t btrfs_get_maps_dev(struct inode *inode) ++{ ++ return BTRFS_I(inode)->root->anon_dev; ++} ++ + static const struct super_operations btrfs_super_ops = { + .drop_inode = btrfs_drop_inode, + .evict_inode = btrfs_evict_inode, +@@ -1315,6 +1320,7 @@ static const struct super_operations btr + .remount_fs = btrfs_remount, + .freeze_fs = btrfs_freeze, + .unfreeze_fs = btrfs_unfreeze, ++ .get_maps_dev = btrfs_get_maps_dev, + }; + + static const struct file_operations btrfs_ctl_fops = { diff --git a/3.2.34/patches.suse/btrfs-0900-add-allocator-tracepoints.patch b/3.2.34/patches.suse/btrfs-0900-add-allocator-tracepoints.patch new file mode 100644 index 0000000..6a14034 --- /dev/null +++ b/3.2.34/patches.suse/btrfs-0900-add-allocator-tracepoints.patch @@ -0,0 +1,304 @@ +From: Josef Bacik +Date: Thu, 10 Nov 2011 08:29:20 -0500 +Patch-mainline: pending +References: FATE#306586 +Subject: [PATCH] Btrfs: add allocator tracepoints + +I used these tracepoints when figuring out what the cluster stuff was doing, so +add them to mainline in case we need to profile this stuff again. Thanks, + +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +--- + fs/btrfs/extent-tree.c | 9 ++ + fs/btrfs/free-space-cache.c | 11 ++ + include/trace/events/btrfs.h | 173 +++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 192 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -5128,6 +5128,8 @@ + ins->objectid = 0; + ins->offset = 0; + ++ trace_find_free_extent(orig_root, num_bytes, empty_size, data); ++ + space_info = __find_space_info(root->fs_info, data); + if (!space_info) { + printk(KERN_ERR "No space info for %llu\n", data); +@@ -5313,6 +5315,8 @@ + if (offset) { + /* we have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); ++ trace_btrfs_reserve_extent_cluster(root, ++ block_group, search_start, num_bytes); + goto checks; + } + +@@ -5359,6 +5363,9 @@ + if (offset) { + /* we found one, proceed */ + spin_unlock(&last_ptr->refill_lock); ++ trace_btrfs_reserve_extent_cluster(root, ++ block_group, search_start, ++ num_bytes); + goto checks; + } + } else if (!cached && loop > LOOP_CACHING_NOWAIT +@@ -5439,6 +5446,8 @@ + ins->objectid = search_start; + ins->offset = num_bytes; + ++ trace_btrfs_reserve_extent(orig_root, block_group, ++ search_start, num_bytes); + if (offset < search_start) + btrfs_add_free_space(used_block_group, offset, + search_start - offset); +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -2346,6 +2346,8 @@ + &entry->offset_index, 1); + BUG_ON(ret); + ++ trace_btrfs_setup_cluster(block_group, cluster, ++ total_found * block_group->sectorsize, 1); + return 0; + } + +@@ -2368,6 +2370,7 @@ + u64 window_free; + u64 max_extent; + u64 max_gap = 128 * 1024; ++ u64 total_size = 0; + + entry = tree_search_offset(ctl, offset, 0, 1); + if (!entry) +@@ -2444,11 +2447,12 @@ + rb_erase(&entry->offset_index, &ctl->free_space_offset); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 0); ++ total_size += entry->bytes; + BUG_ON(ret); + } while (node && entry != last); + + cluster->max_size = max_extent; +- ++ trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); + return 0; + } + +@@ -2552,6 +2556,9 @@ + goto out; + } 
+ ++ trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, ++ min_bytes); ++ + ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, + bytes, min_bytes); + if (ret) +@@ -2567,6 +2574,8 @@ + list_add_tail(&cluster->block_group_list, + &block_group->cluster_list); + cluster->block_group = block_group; ++ } else { ++ trace_btrfs_failed_cluster_setup(block_group); + } + out: + spin_unlock(&cluster->lock); +--- a/include/trace/events/btrfs.h ++++ b/include/trace/events/btrfs.h +@@ -16,6 +16,8 @@ + struct btrfs_delayed_tree_ref; + struct btrfs_delayed_data_ref; + struct btrfs_delayed_ref_head; ++struct btrfs_block_group_cache; ++struct btrfs_free_cluster; + struct map_lookup; + struct extent_buffer; + +@@ -44,6 +46,15 @@ + obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ + (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" + ++#define BTRFS_GROUP_FLAGS \ ++ { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ ++ { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ ++ { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \ ++ { BTRFS_BLOCK_GROUP_RAID0, "RAID0"}, \ ++ { BTRFS_BLOCK_GROUP_RAID1, "RAID1"}, \ ++ { BTRFS_BLOCK_GROUP_DUP, "DUP"}, \ ++ { BTRFS_BLOCK_GROUP_RAID10, "RAID10"} ++ + TRACE_EVENT(btrfs_transaction_commit, + + TP_PROTO(struct btrfs_root *root), +@@ -661,6 +672,168 @@ + TP_ARGS(root, start, len) + ); + ++TRACE_EVENT(find_free_extent, ++ ++ TP_PROTO(struct btrfs_root *root, u64 num_bytes, u64 empty_size, ++ u64 data), ++ ++ TP_ARGS(root, num_bytes, empty_size, data), ++ ++ TP_STRUCT__entry( ++ __field( u64, root_objectid ) ++ __field( u64, num_bytes ) ++ __field( u64, empty_size ) ++ __field( u64, data ) ++ ), ++ ++ TP_fast_assign( ++ __entry->root_objectid = root->root_key.objectid; ++ __entry->num_bytes = num_bytes; ++ __entry->empty_size = empty_size; ++ __entry->data = data; ++ ), ++ ++ TP_printk("root = %Lu(%s), len = %Lu, empty_size = %Lu, " ++ "flags = %Lu(%s)", show_root_type(__entry->root_objectid), ++ __entry->num_bytes, __entry->empty_size, __entry->data, ++ __print_flags((unsigned long)__entry->data, "|", ++ BTRFS_GROUP_FLAGS)) ++); ++ ++DECLARE_EVENT_CLASS(btrfs__reserve_extent, ++ ++ TP_PROTO(struct btrfs_root *root, ++ struct btrfs_block_group_cache *block_group, u64 start, ++ u64 len), ++ ++ TP_ARGS(root, block_group, start, len), ++ ++ TP_STRUCT__entry( ++ __field( u64, root_objectid ) ++ __field( u64, bg_objectid ) ++ __field( u64, flags ) ++ __field( u64, start ) ++ __field( u64, len ) ++ ), ++ ++ TP_fast_assign( ++ __entry->root_objectid = root->root_key.objectid; ++ __entry->bg_objectid = block_group->key.objectid; ++ __entry->flags = block_group->flags; ++ __entry->start = start; ++ __entry->len = len; ++ ), ++ ++ TP_printk("root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), " ++ "start = %Lu, len = %Lu", ++ show_root_type(__entry->root_objectid), __entry->bg_objectid, ++ __entry->flags, __print_flags((unsigned long)__entry->flags, ++ "|", BTRFS_GROUP_FLAGS), ++ __entry->start, __entry->len) ++); ++ ++DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, ++ ++ TP_PROTO(struct btrfs_root *root, ++ struct btrfs_block_group_cache *block_group, u64 start, ++ u64 len), ++ ++ TP_ARGS(root, block_group, start, len) ++); ++ ++DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, ++ ++ TP_PROTO(struct btrfs_root *root, ++ struct btrfs_block_group_cache *block_group, u64 start, ++ u64 len), ++ ++ TP_ARGS(root, block_group, start, len) ++); ++ ++TRACE_EVENT(btrfs_find_cluster, ++ ++ TP_PROTO(struct btrfs_block_group_cache *block_group, u64 start, 
++ u64 bytes, u64 empty_size, u64 min_bytes), ++ ++ TP_ARGS(block_group, start, bytes, empty_size, min_bytes), ++ ++ TP_STRUCT__entry( ++ __field( u64, bg_objectid ) ++ __field( u64, flags ) ++ __field( u64, start ) ++ __field( u64, bytes ) ++ __field( u64, empty_size ) ++ __field( u64, min_bytes ) ++ ), ++ ++ TP_fast_assign( ++ __entry->bg_objectid = block_group->key.objectid; ++ __entry->flags = block_group->flags; ++ __entry->start = start; ++ __entry->bytes = bytes; ++ __entry->empty_size = empty_size; ++ __entry->min_bytes = min_bytes; ++ ), ++ ++ TP_printk("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu," ++ " empty_size = %Lu, min_bytes = %Lu", __entry->bg_objectid, ++ __entry->flags, ++ __print_flags((unsigned long)__entry->flags, "|", ++ BTRFS_GROUP_FLAGS), __entry->start, ++ __entry->bytes, __entry->empty_size, __entry->min_bytes) ++); ++ ++TRACE_EVENT(btrfs_failed_cluster_setup, ++ ++ TP_PROTO(struct btrfs_block_group_cache *block_group), ++ ++ TP_ARGS(block_group), ++ ++ TP_STRUCT__entry( ++ __field( u64, bg_objectid ) ++ ), ++ ++ TP_fast_assign( ++ __entry->bg_objectid = block_group->key.objectid; ++ ), ++ ++ TP_printk("block_group = %Lu", __entry->bg_objectid) ++); ++ ++TRACE_EVENT(btrfs_setup_cluster, ++ ++ TP_PROTO(struct btrfs_block_group_cache *block_group, ++ struct btrfs_free_cluster *cluster, u64 size, int bitmap), ++ ++ TP_ARGS(block_group, cluster, size, bitmap), ++ ++ TP_STRUCT__entry( ++ __field( u64, bg_objectid ) ++ __field( u64, flags ) ++ __field( u64, start ) ++ __field( u64, max_size ) ++ __field( u64, size ) ++ __field( int, bitmap ) ++ ), ++ ++ TP_fast_assign( ++ __entry->bg_objectid = block_group->key.objectid; ++ __entry->flags = block_group->flags; ++ __entry->start = cluster->window_start; ++ __entry->max_size = cluster->max_size; ++ __entry->size = size; ++ __entry->bitmap = bitmap; ++ ), ++ ++ TP_printk("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, " ++ "size = %Lu, max_size = %Lu, bitmap = %d", ++ __entry->bg_objectid, ++ __entry->flags, ++ __print_flags((unsigned long)__entry->flags, "|", ++ BTRFS_GROUP_FLAGS), __entry->start, ++ __entry->size, __entry->max_size, __entry->bitmap) ++); ++ + #endif /* _TRACE_BTRFS_H */ + + /* This part must be outside protection */ diff --git a/3.2.34/patches.suse/btrfs-8001-rewrite-btrfs_trim_block_group.patch b/3.2.34/patches.suse/btrfs-8001-rewrite-btrfs_trim_block_group.patch new file mode 100644 index 0000000..9d858c2 --- /dev/null +++ b/3.2.34/patches.suse/btrfs-8001-rewrite-btrfs_trim_block_group.patch @@ -0,0 +1,299 @@ +From 033eea6d488471c7262b377e066ecf9eea85d5b1 Mon Sep 17 00:00:00 2001 +From: Li Zefan +Date: Thu, 17 Nov 2011 15:26:17 +0800 +Patch-mainline: pending +References: FATE#306586 +Subject: [PATCH] Btrfs: rewrite btrfs_trim_block_group() + +There are various bugs in block group trimming: + +- It may trim from offset smaller than user-specified offset. +- It may trim beyond user-specified range. +- It may leak free space for extents smaller than specified minlen. +- It may truncate the last trimmed extent thus leak free space. +- With mixed extents+bitmaps, some extents may not be trimmed. +- With mixed extents+bitmaps, some bitmaps may not be trimmed (even +none will be trimmed). Even for those trimmed, not all the free space +in the bitmaps will be trimmed. + +I rewrite btrfs_trim_block_group() and break it into two functions. +One is to trim extents only, and the other is to trim bitmaps only. 
+ +Signed-off-by: Li Zefan +Signed-off-by: David Sterba +--- + fs/btrfs/free-space-cache.c | 233 ++++++++++++++++++++++++++++++-------------- + 1 file changed, 163 insertions(+), 70 deletions(-) + +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -2586,17 +2586,57 @@ void btrfs_init_free_cluster(struct btrf + cluster->block_group = NULL; + } + +-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, +- u64 *trimmed, u64 start, u64 end, u64 minlen) ++static int do_trimming(struct btrfs_block_group_cache *block_group, ++ u64 *total_trimmed, u64 start, u64 bytes, ++ u64 reserved_start, u64 reserved_bytes) + { +- struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; +- struct btrfs_free_space *entry = NULL; ++ struct btrfs_space_info *space_info = block_group->space_info; + struct btrfs_fs_info *fs_info = block_group->fs_info; +- u64 bytes = 0; +- u64 actually_trimmed; +- int ret = 0; ++ int ret; ++ int update = 0; ++ u64 trimmed = 0; ++ ++ spin_lock(&space_info->lock); ++ spin_lock(&block_group->lock); ++ if (!block_group->ro) { ++ block_group->reserved += reserved_bytes; ++ space_info->bytes_reserved += reserved_bytes; ++ update = 1; ++ } ++ spin_unlock(&block_group->lock); ++ spin_unlock(&space_info->lock); + +- *trimmed = 0; ++ ret = btrfs_error_discard_extent(fs_info->extent_root, ++ start, bytes, &trimmed); ++ if (!ret) ++ *total_trimmed += trimmed; ++ ++ btrfs_add_free_space(block_group, reserved_start, reserved_bytes); ++ ++ if (update) { ++ spin_lock(&space_info->lock); ++ spin_lock(&block_group->lock); ++ if (block_group->ro) ++ space_info->bytes_readonly += reserved_bytes; ++ block_group->reserved -= reserved_bytes; ++ space_info->bytes_reserved -= reserved_bytes; ++ spin_unlock(&space_info->lock); ++ spin_unlock(&block_group->lock); ++ } ++ ++ return ret; ++} ++ ++static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, ++ u64 *total_trimmed, u64 start, u64 end, u64 minlen) ++{ ++ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; ++ struct btrfs_free_space *entry; ++ struct rb_node *node; ++ int ret = 0; ++ u64 extent_start; ++ u64 extent_bytes; ++ u64 bytes; + + while (start < end) { + spin_lock(&ctl->tree_lock); +@@ -2607,81 +2647,118 @@ int btrfs_trim_block_group(struct btrfs_ + } + + entry = tree_search_offset(ctl, start, 0, 1); +- if (!entry) +- entry = tree_search_offset(ctl, +- offset_to_bitmap(ctl, start), +- 1, 1); +- +- if (!entry || entry->offset >= end) { ++ if (!entry) { + spin_unlock(&ctl->tree_lock); + break; + } + +- if (entry->bitmap) { +- ret = search_bitmap(ctl, entry, &start, &bytes); +- if (!ret) { +- if (start >= end) { +- spin_unlock(&ctl->tree_lock); +- break; +- } +- bytes = min(bytes, end - start); +- bitmap_clear_bits(ctl, entry, start, bytes); +- if (entry->bytes == 0) +- free_bitmap(ctl, entry); +- } else { +- start = entry->offset + BITS_PER_BITMAP * +- block_group->sectorsize; ++ /* skip bitmaps */ ++ while (entry->bitmap) { ++ node = rb_next(&entry->offset_index); ++ if (!node) { + spin_unlock(&ctl->tree_lock); +- ret = 0; +- continue; ++ goto out; + } +- } else { +- start = entry->offset; +- bytes = min(entry->bytes, end - start); +- unlink_free_space(ctl, entry); +- kmem_cache_free(btrfs_free_space_cachep, entry); ++ entry = rb_entry(node, struct btrfs_free_space, ++ offset_index); ++ } ++ ++ if (entry->offset >= end) { ++ spin_unlock(&ctl->tree_lock); ++ break; + } + ++ extent_start = entry->offset; ++ extent_bytes = entry->bytes; ++ start = max(start, extent_start); ++ 
bytes = min(extent_start + extent_bytes, end) - start; ++ if (bytes < minlen) { ++ spin_unlock(&ctl->tree_lock); ++ goto next; ++ } ++ ++ unlink_free_space(ctl, entry); ++ kmem_cache_free(btrfs_free_space_cachep, entry); ++ + spin_unlock(&ctl->tree_lock); + +- if (bytes >= minlen) { +- struct btrfs_space_info *space_info; +- int update = 0; +- +- space_info = block_group->space_info; +- spin_lock(&space_info->lock); +- spin_lock(&block_group->lock); +- if (!block_group->ro) { +- block_group->reserved += bytes; +- space_info->bytes_reserved += bytes; +- update = 1; +- } +- spin_unlock(&block_group->lock); +- spin_unlock(&space_info->lock); ++ ret = do_trimming(block_group, total_trimmed, start, bytes, ++ extent_start, extent_bytes); ++ if (ret) ++ break; ++next: ++ start += bytes; + +- ret = btrfs_error_discard_extent(fs_info->extent_root, +- start, +- bytes, +- &actually_trimmed); +- +- btrfs_add_free_space(block_group, start, bytes); +- if (update) { +- spin_lock(&space_info->lock); +- spin_lock(&block_group->lock); +- if (block_group->ro) +- space_info->bytes_readonly += bytes; +- block_group->reserved -= bytes; +- space_info->bytes_reserved -= bytes; +- spin_unlock(&space_info->lock); +- spin_unlock(&block_group->lock); +- } ++ if (fatal_signal_pending(current)) { ++ ret = -ERESTARTSYS; ++ break; ++ } ++ ++ cond_resched(); ++ } ++out: ++ return ret; ++} ++ ++static int trim_bitmaps(struct btrfs_block_group_cache *block_group, ++ u64 *total_trimmed, u64 start, u64 end, u64 minlen) ++{ ++ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; ++ struct btrfs_free_space *entry; ++ int ret = 0; ++ int ret2; ++ u64 bytes; ++ u64 offset = offset_to_bitmap(ctl, start); ++ ++ while (offset < end) { ++ bool next_bitmap = false; ++ ++ spin_lock(&ctl->tree_lock); + +- if (ret) +- break; +- *trimmed += actually_trimmed; ++ if (ctl->free_space < minlen) { ++ spin_unlock(&ctl->tree_lock); ++ break; ++ } ++ ++ entry = tree_search_offset(ctl, offset, 1, 0); ++ if (!entry) { ++ spin_unlock(&ctl->tree_lock); ++ next_bitmap = true; ++ goto next; ++ } ++ ++ bytes = minlen; ++ ret2 = search_bitmap(ctl, entry, &start, &bytes); ++ if (ret2 || start >= end) { ++ spin_unlock(&ctl->tree_lock); ++ next_bitmap = true; ++ goto next; ++ } ++ ++ bytes = min(bytes, end - start); ++ if (bytes < minlen) { ++ spin_unlock(&ctl->tree_lock); ++ goto next; ++ } ++ ++ bitmap_clear_bits(ctl, entry, start, bytes); ++ if (entry->bytes == 0) ++ free_bitmap(ctl, entry); ++ ++ spin_unlock(&ctl->tree_lock); ++ ++ ret = do_trimming(block_group, total_trimmed, start, bytes, ++ start, bytes); ++ if (ret) ++ break; ++next: ++ if (next_bitmap) { ++ offset += BITS_PER_BITMAP * ctl->unit; ++ } else { ++ start += bytes; ++ if (start >= offset + BITS_PER_BITMAP * ctl->unit) ++ offset += BITS_PER_BITMAP * ctl->unit; + } +- start += bytes; +- bytes = 0; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; +@@ -2693,6 +2770,22 @@ int btrfs_trim_block_group(struct btrfs_ + + return ret; + } ++ ++int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, ++ u64 *trimmed, u64 start, u64 end, u64 minlen) ++{ ++ int ret; ++ ++ *trimmed = 0; ++ ++ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); ++ if (ret) ++ return ret; ++ ++ ret = trim_bitmaps(block_group, trimmed, start, end, minlen); ++ ++ return ret; ++} + + /* + * Find the left-most item in the cache tree, and then return the diff --git a/3.2.34/patches.suse/btrfs-8007-lock-and-disable-irq-during-space-alloc.patch 
b/3.2.34/patches.suse/btrfs-8007-lock-and-disable-irq-during-space-alloc.patch new file mode 100644 index 0000000..900cba4 --- /dev/null +++ b/3.2.34/patches.suse/btrfs-8007-lock-and-disable-irq-during-space-alloc.patch @@ -0,0 +1,40 @@ +From e2049e28add8f8fbfa8680fcf5fc49fa3b713ceb Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Tue, 22 Nov 2011 18:05:48 +0100 +Patch-mainline: pending +References: FATE#306586 bnc#730103 +Subject: [PATCH] btrfs: lock and disable irq during space alloc + +This is a workaround. + +Signed-off-by: Jeff Mahoney +Signed-off-by: David Sterba +--- + fs/btrfs/free-space-cache.c | 4 ++-- + 1 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c +index 7807276..e49c1cd 100644 +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -2102,7 +2102,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 bytes_search = bytes + empty_size; + u64 ret = 0; + +- spin_lock(&ctl->tree_lock); ++ spin_lock_irq(&ctl->tree_lock); + entry = find_free_space(ctl, &offset, &bytes_search); + if (!entry) + goto out; +@@ -2123,7 +2123,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + } + + out: +- spin_unlock(&ctl->tree_lock); ++ spin_unlock_irq(&ctl->tree_lock); + + return ret; + } +-- +1.7.6 + diff --git a/3.2.34/patches.suse/btrfs-8013-sector-size-check-during-mount.patch b/3.2.34/patches.suse/btrfs-8013-sector-size-check-during-mount.patch new file mode 100644 index 0000000..2017ed3 --- /dev/null +++ b/3.2.34/patches.suse/btrfs-8013-sector-size-check-during-mount.patch @@ -0,0 +1,43 @@ +From: Keith Mannthey +Date: Tue, 29 Nov 2011 17:44:12 -0800 +Patch-mainline: pending +References: FATE#306586 bnc#724620 +Subject: [PATCH] Sector Size check during Mount + +Gracefully fail when trying to mount a BTRFS file system that has a +sectorsize smaller than PAGE_SIZE. + +On PPC it is possible to build a FS while using a 4k PAGE_SIZE kernel +then boot into a 64K PAGE_SIZE kernel. Presently open_ctree fails in an +endless loop and hangs the machine in this situation. + +My debugging has show this Sector size < Page size to be a non trivial +situation and a graceful exit from the situation would be nice for the +time being. 
+ +Signed-off-by: Keith Mannthey +Signed-off-by: David Sterba +--- + fs/btrfs/disk-io.c | 6 ++++++ + 1 files changed, 6 insertions(+), 0 deletions(-) + +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 94abc25..1cbfa75 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -2230,6 +2230,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, + goto fail_sb_buffer; + } + ++ if (sectorsize < PAGE_SIZE) { ++ printk(KERN_WARNING "btrfs: Incompatible sector size " ++ "found on %s\n", sb->s_id); ++ goto fail_sb_buffer; ++ } ++ + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_sys_array(tree_root); + mutex_unlock(&fs_info->chunk_mutex); +-- +1.7.6.233.gd79bc + diff --git a/3.2.34/patches.suse/btrfs-8014-add-new-ioctl-to-determine-size-of-compressed-.patch b/3.2.34/patches.suse/btrfs-8014-add-new-ioctl-to-determine-size-of-compressed-.patch new file mode 100644 index 0000000..bd92a74 --- /dev/null +++ b/3.2.34/patches.suse/btrfs-8014-add-new-ioctl-to-determine-size-of-compressed-.patch @@ -0,0 +1,158 @@ +From: David Sterba +Date: Tue, 28 Jun 2011 12:38:06 +0200 +Patch-mainline: pending +References: FATE#306586 +Subject: [PATCH] btrfs: add new ioctl to determine size of compressed file + +Go through all extents of a file in a given [start,end) range and sum +for: +* regular extent: ->block_len, size is already rounded up to blocks +* inline extents: length rounded up to 512 + +The range is start inclusive / end exclusive. For whole a file pass +0 and (u64)-1. + +The values returned are number of occupied 512B sectors for uncompressed +and compressed size and can be easily compared to determine rough +compression ratio of the given file range. + +Based on implementation from Ulrich Hecht, +http://comments.gmane.org/gmane.comp.file-systems.btrfs/6253 + +Signed-off-by: David Sterba +--- + fs/btrfs/ioctl.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + fs/btrfs/ioctl.h | 12 ++++++++ + 2 files changed, 95 insertions(+), 0 deletions(-) + +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index c04f02c..91e6ab8 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -2972,6 +2972,86 @@ static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) + return 0; + } + ++/* ++ * Returns the compressed size of an inode in 512 byte blocks. ++ * Count the on-disk space used by extents starting in range [start, end), ++ * inline data are rounded up to sector, ie. 512. ++ * ++ * The range is start inclusive and end exclusive so it can be used to ++ * determine compressed size of a given extent by its start and start of the ++ * next extent easily, without counting length. 
++ * Whole file is specified as start = 0, end = (u64)-1 ++ */ ++static long btrfs_ioctl_compr_size(struct file *file, void __user *argp) ++{ ++ struct inode *inode = fdentry(file)->d_inode; ++ struct btrfs_ioctl_compr_size_args compr_args; ++ u64 len; ++ u64 compressed_size = 0; ++ u64 size = 0; ++ u64 offset = 0; ++ ++ if (S_ISDIR(inode->i_mode)) ++ return -EISDIR; ++ ++ if (copy_from_user(&compr_args, argp, ++ sizeof(struct btrfs_ioctl_compr_size_args))) ++ return -EFAULT; ++ ++ if (compr_args.start > compr_args.end) ++ return -EINVAL; ++ ++ mutex_lock(&inode->i_mutex); ++ ++ offset = compr_args.start; ++ if (inode->i_size > compr_args.end) ++ len = compr_args.end; ++ else ++ len = inode->i_size; ++ ++ /* ++ * do any pending delalloc/csum calc on inode, one way or ++ * another, and lock file content ++ */ ++ btrfs_wait_ordered_range(inode, compr_args.start, len); ++ ++ lock_extent(&BTRFS_I(inode)->io_tree, compr_args.start, len, GFP_NOFS); ++ ++ while (offset < len) { ++ struct extent_map *em; ++ ++ em = btrfs_get_extent(inode, NULL, 0, offset, 1, 0); ++ if (IS_ERR_OR_NULL(em)) ++ goto error; ++ if (em->block_len != (u64)-1) { ++ compressed_size += em->block_len; ++ size += ALIGN(em->len, inode->i_sb->s_blocksize); ++ } else if (em->block_start == EXTENT_MAP_INLINE) { ++ compressed_size += ALIGN(em->len, 512); ++ size += ALIGN(em->len, 512); ++ } ++ offset += em->len; ++ free_extent_map(em); ++ } ++ unlock_extent(&BTRFS_I(inode)->io_tree, compr_args.start, len, GFP_NOFS); ++ mutex_unlock(&inode->i_mutex); ++ ++ compr_args.size = size >> 9; ++ compr_args.compressed_size = compressed_size >> 9; ++ ++ if (copy_to_user(argp, &compr_args, ++ sizeof(struct btrfs_ioctl_compr_size_args))) ++ return -EFAULT; ++ ++ return 0; ++ ++error: ++ unlock_extent(&BTRFS_I(inode)->io_tree, compr_args.start, len, GFP_NOFS); ++ mutex_unlock(&inode->i_mutex); ++ ++ return -EIO; ++} ++ + static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, + void __user *arg) + { +@@ -3110,6 +3190,8 @@ long btrfs_ioctl(struct file *file, unsigned int + return btrfs_ioctl_scrub_cancel(root, argp); + case BTRFS_IOC_SCRUB_PROGRESS: + return btrfs_ioctl_scrub_progress(root, argp); ++ case BTRFS_IOC_COMPR_SIZE: ++ return btrfs_ioctl_compr_size(file, argp); + } + + return -ENOTTY; +diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h +--- a/fs/btrfs/ioctl.h ++++ b/fs/btrfs/ioctl.h +@@ -217,6 +217,16 @@ struct btrfs_ioctl_logical_ino_args { + __u64 inodes; + }; + ++struct btrfs_ioctl_compr_size_args { ++ /* Range start, inclusive */ ++ __u64 start; /* in */ ++ /* Range end, exclusive */ ++ __u64 end; /* in */ ++ __u64 size; /* out */ ++ __u64 compressed_size; /* out */ ++ __u64 reserved[2]; ++}; ++ + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ + struct btrfs_ioctl_vol_args) + #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ +@@ -276,5 +286,7 @@ struct btrfs_ioctl_logical_ino_args { + struct btrfs_ioctl_ino_path_args) + #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ + struct btrfs_ioctl_ino_path_args) ++#define BTRFS_IOC_COMPR_SIZE _IOR(BTRFS_IOCTL_MAGIC, 51, \ ++ struct btrfs_ioctl_compr_size_args) + + #endif +-- +1.7.7.3 + diff --git a/3.2.34/patches.suse/btrfs-8015-make-lzo-the-default-compression-scheme.patch b/3.2.34/patches.suse/btrfs-8015-make-lzo-the-default-compression-scheme.patch new file mode 100644 index 0000000..1a647aa --- /dev/null +++ b/3.2.34/patches.suse/btrfs-8015-make-lzo-the-default-compression-scheme.patch @@ -0,0 +1,68 @@ +From: Li Zefan +Date: Thu, 26 May 2011 11:39:03 
+0800 +Patch-mainline: pending +References: FATE#306586 +Subject: [PATCH] Btrfs: make lzo the default compression scheme + +As the lzo compression feature has been established for quite +a while, we are now ready to replace zlib with lzo as the default +compression scheme. + +Signed-off-by: Li Zefan +Signed-off-by: David Sterba +--- + fs/btrfs/disk-io.c | 2 +- + fs/btrfs/ioctl.c | 2 +- + fs/btrfs/super.c | 8 ++++---- + 3 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 94abc25..7ea0cdd 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -2095,7 +2095,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, + * In the long term, we'll store the compression type in the super + * block, and it'll be used for per file compression control. + */ +- fs_info->compress_type = BTRFS_COMPRESS_ZLIB; ++ fs_info->compress_type = BTRFS_COMPRESS_LZO; + + ret = btrfs_parse_options(tree_root, options); + if (ret) { +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index a90e749..d9c2ba6 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -992,7 +992,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, + unsigned long ra_index = 0; + int ret; + int defrag_count = 0; +- int compress_type = BTRFS_COMPRESS_ZLIB; ++ int compress_type = BTRFS_COMPRESS_LZO; + int extent_thresh = range->extent_thresh; + int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + int cluster = max_cluster; +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c +index 8bd9d6d..b6b5bd7 100644 +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -270,12 +270,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) + case Opt_compress_type: + if (token == Opt_compress || + token == Opt_compress_force || +- strcmp(args[0].from, "zlib") == 0) { +- compress_type = "zlib"; +- info->compress_type = BTRFS_COMPRESS_ZLIB; +- } else if (strcmp(args[0].from, "lzo") == 0) { ++ strcmp(args[0].from, "lzo") == 0) { + compress_type = "lzo"; + info->compress_type = BTRFS_COMPRESS_LZO; ++ } else if (strcmp(args[0].from, "zlib") == 0) { ++ compress_type = "zlib"; ++ info->compress_type = BTRFS_COMPRESS_ZLIB; + } else { + ret = -EINVAL; + goto out; +-- +1.7.6 + diff --git a/3.2.34/patches.suse/btrfs-8024-workaround-for-cleaner-deadlock.patch b/3.2.34/patches.suse/btrfs-8024-workaround-for-cleaner-deadlock.patch new file mode 100644 index 0000000..c6767c4 --- /dev/null +++ b/3.2.34/patches.suse/btrfs-8024-workaround-for-cleaner-deadlock.patch @@ -0,0 +1,32 @@ +From: David Sterba +Date: Thu, 15 Dec 2011 02:10:55 +0100 +Patch-mainline: pending +References: FATE#306586 +Subject: [PATCH] btrfs: workaround for cleaner deadlock + +Signed-off-by: David Sterba +--- + fs/btrfs/disk-io.c | 2 ++ + 1 files changed, 2 insertions(+), 0 deletions(-) + +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 3f9d555..12d785b 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1572,11 +1572,13 @@ static int cleaner_kthread(void *arg) + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + + if (!(root->fs_info->sb->s_flags & MS_RDONLY) && ++ down_read_trylock(&root->fs_info->sb->s_umount) && + mutex_trylock(&root->fs_info->cleaner_mutex)) { + btrfs_run_delayed_iputs(root); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + btrfs_run_defrag_inodes(root->fs_info); ++ up_read(&root->fs_info->sb->s_umount); + } + + if (freezing(current)) { +-- +1.7.7.3 + diff --git 
a/3.2.34/patches.suse/btrfs-8025-update-global-block_rsv-when-creating-a-new-bl.patch b/3.2.34/patches.suse/btrfs-8025-update-global-block_rsv-when-creating-a-new-bl.patch new file mode 100644 index 0000000..13c4ebc --- /dev/null +++ b/3.2.34/patches.suse/btrfs-8025-update-global-block_rsv-when-creating-a-new-bl.patch @@ -0,0 +1,61 @@ +From: Li Zefan +Date: Wed, 7 Dec 2011 13:12:59 +0800 +Patch-mainline: pending +References: FATE#306586 +Subject: [PATCH] Btrfs: update global block_rsv when creating a new block + group + +A bug was triggered while using seed device: + + # mkfs.btrfs /dev/loop1 + # btrfstune -S 1 /dev/loop1 + # mount -o /dev/loop1 /mnt + # btrfs dev add /dev/loop2 /mnt + +btrfs: block rsv returned -28 +------------[ cut here ]------------ +WARNING: at fs/btrfs/extent-tree.c:5969 btrfs_alloc_free_block+0x166/0x396 [btrfs]() +... +Call Trace: +... +[] btrfs_cow_block+0x101/0x147 [btrfs] +[] btrfs_search_slot+0x1b8/0x55f [btrfs] +[] btrfs_insert_empty_items+0x42/0x7f [btrfs] +[] btrfs_insert_item+0x40/0x7e [btrfs] +[] btrfs_make_block_group+0x243/0x2aa [btrfs] +[] __btrfs_alloc_chunk+0x672/0x70e [btrfs] +[] init_first_rw_device+0x77/0x13c [btrfs] +[] btrfs_init_new_device+0x664/0x9fd [btrfs] +[] btrfs_ioctl+0x694/0xdbe [btrfs] +[] do_vfs_ioctl+0x496/0x4cc +[] sys_ioctl+0x33/0x4f +[] sysenter_do_call+0x12/0x38 +---[ end trace 906adac595facc7d ]--- + +Since seed device is readonly, there's no usable space in the filesystem. +Afterwards we add a sprout device to it, and the kernel creates a METADATA +block group and a SYSTEM block group where comes free space we can reserve, +but we still get revervation failure because the global block_rsv hasn't +been updated accordingly. + +Signed-off-by: Li Zefan +Signed-off-by: David Sterba +--- + fs/btrfs/extent-tree.c | 1 + + 1 files changed, 1 insertions(+), 0 deletions(-) + +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 8861572..a80efb5 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -7476,6 +7476,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + &cache->space_info); + BUG_ON(ret); ++ update_global_block_rsv(root->fs_info); + + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_readonly += cache->bytes_super; +-- +1.7.6.233.gd79bc + diff --git a/3.2.34/patches.suse/btrfs-8026-fix-possible-deadlock-when-opening-a-seed-devi.patch b/3.2.34/patches.suse/btrfs-8026-fix-possible-deadlock-when-opening-a-seed-devi.patch new file mode 100644 index 0000000..cc134cb --- /dev/null +++ b/3.2.34/patches.suse/btrfs-8026-fix-possible-deadlock-when-opening-a-seed-devi.patch @@ -0,0 +1,84 @@ +From: Li Zefan +Date: Wed, 7 Dec 2011 13:13:26 +0800 +Patch-mainline: pending +References: FATE#306586 +Subject: [PATCH] Btrfs: fix possible deadlock when opening a seed device + +The correct lock order is uuid_mutex -> volume_mutex -> chunk_mutex, +but when we mount a filesystem which has backing seed devices, we have +this lock chain: + + open_ctree() + lock(chunk_mutex); + read_chunk_tree(); + read_one_dev(); + open_seed_devices(); + lock(uuid_mutex); + +and then we hit a lockdep splat. 
+ +Signed-off-by: Li Zefan +Signed-off-by: David Sterba +--- + fs/btrfs/disk-io.c | 2 -- + fs/btrfs/volumes.c | 9 +++++++-- + 2 files changed, 7 insertions(+), 4 deletions(-) + +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index d0bc3c5..beb1d19 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -2287,9 +2287,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, + (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), + BTRFS_UUID_SIZE); + +- mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_chunk_tree(chunk_root); +- mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", + sb->s_id); +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index d136915..fc94228 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -4264,7 +4264,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) + struct btrfs_fs_devices *fs_devices; + int ret; + +- mutex_lock(&uuid_mutex); ++ BUG_ON(!mutex_is_locked(&uuid_mutex)); + + fs_devices = root->fs_info->fs_devices->seed; + while (fs_devices) { +@@ -4302,7 +4302,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) + fs_devices->seed = root->fs_info->fs_devices->seed; + root->fs_info->fs_devices->seed = fs_devices; + out: +- mutex_unlock(&uuid_mutex); + return ret; + } + +@@ -4459,6 +4458,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) + if (!path) + return -ENOMEM; + ++ mutex_lock(&uuid_mutex); ++ lock_chunks(root); ++ + /* first we search for all of the device items, and then we + * read in all of the chunk items. This way we can create chunk + * mappings that reference all of the devices that are afound +@@ -4509,6 +4511,9 @@ again: + } + ret = 0; + error: ++ unlock_chunks(root); ++ mutex_unlock(&uuid_mutex); ++ + btrfs_free_path(path); + return ret; + } +-- +1.7.6.233.gd79bc + diff --git a/3.2.34/patches.suse/btrfs-allow-cross-subvolume-file-clone.patch b/3.2.34/patches.suse/btrfs-allow-cross-subvolume-file-clone.patch new file mode 100644 index 0000000..70d3975 --- /dev/null +++ b/3.2.34/patches.suse/btrfs-allow-cross-subvolume-file-clone.patch @@ -0,0 +1,47 @@ +From: David Sterba +Date: Mon, 1 Aug 2011 18:11:57 +0200 +Subject: [PATCH] btrfs: allow cross-subvolume file clone +Reference: bnc#698540 +Patch-mainline: pending + +Lift the EXDEV condition and allow different root trees for files being +cloned, then pass source inode's root when searching for extents. + +Signed-off-by: David Sterba +--- + fs/btrfs/ioctl.c | 7 ++++--- + 1 files changed, 4 insertions(+), 3 deletions(-) + +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index 0b980af..58eb0ef 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -2183,7 +2183,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + goto out_fput; + + ret = -EXDEV; +- if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) ++ if (src->i_sb != inode->i_sb) + goto out_fput; + + ret = -ENOMEM; +@@ -2247,13 +2247,14 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + * note the key will change type as we walk through the + * tree. 
+ */ +- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, ++ 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { +- ret = btrfs_next_leaf(root, path); ++ ret = btrfs_next_leaf(BTRFS_I(src)->root, path); + if (ret < 0) + goto out; + if (ret > 0) +-- +1.7.6 + diff --git a/3.2.34/series b/3.2.34/series new file mode 100644 index 0000000..92b8362 --- /dev/null +++ b/3.2.34/series @@ -0,0 +1,68 @@ +bump/1021_linux-3.2.22.patch +bump/1022_linux-3.2.23.patch +bump/1023_linux-3.2.24.patch +bump/1024_linux-3.2.25.patch +bump/1025_linux-3.2.26.patch +bump/1026_linux-3.2.27.patch +bump/1027_linux-3.2.28.patch +bump/1028_linux-3.2.29.patch +bump/1029_linux-3.2.30.patch +bump/1030_linux-3.2.31.patch +bump/1031_linux-3.2.32.patch +bump/1032_linux-3.2.33.patch +bump/1033_linux-3.2.34.patch + + +0001-block-prepare-I-O-context-code-for-BFQ-v5-for-3.2.patch +0002-block-cgroups-kconfig-build-bits-for-BFQ-v5-3.2.patch +0003-block-introduce-the-BFQ-v5-I-O-sched-for-3.2.patch + +0001-AppArmor-compatibility-patch-for-v5-network-controll.patch +0002-AppArmor-compatibility-patch-for-v5-interface.patch +0003-AppArmor-Allow-dfa-backward-compatibility-with-broke.patch + +cloneconfig.patch +kbuild-compress-kernel-modules-on-installation.patch +ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch +colored-printk-3.2.33.patch +910-kobject_uevent.patch +911-kobject_add_broadcast_uevent.patch + +linux-2.6-x86-tune-generic.patch +hz-432-kconfig-option.patch +hz-864-kconfig-option.patch + +Add_CONFIG_VFAT_FS_DUALNAMES_option.patch +linux-2.6-defaults-fat-utf8.patch +aufs3-standalone-3.2.patch +accessfs-3.2-0.26.patch +wrapfs-v3.2.2-45-ga5296eb.patch + +imqmq-3.2.patch + +vserver-3.2.34-vs2.3.2.15.patch +uksm-0.1.2.1-for-v3.2.ge.31.patch +kernel-3.4.0-layer7-2.22.patch +net-netfilter-IFWLOG.patch +net-netfilter-IFWLOG-mdv.patch +net-netfilter-IFWLOG-2.6.35-buildfix.patch +net-netfilter-IFWLOG-2.6.37-buildfix.patch +net-netfilter-psd.patch +net-netfilter-psd-mdv.patch +net-netfilter-psd-2.6.35-buildfix.patch +netfilter-implement-rfc-1123-for-ftp-conntrack.patch +netfilter-ip_conntrack_slp.patch + +kernel-3.2-lsxhl.patch +kernel-3.2-lsproduo.patch +kernel-3.2-lsql.patch +v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-VL.patch +v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-WVL.patch +lschlv2.patch + +3rd-3rdparty-1.0-tree.patch +3rd-3rdparty-merge.patch +3rd-3rdparty-netatop-0.1.1.patch +3rd-3rdparty-button_hotplug-0.4.1.patch +3rd-3rdparty-gpio_button_hotplug-0.1.patch +3rd-3rdparty-gpio_event_drv-0.1.patch \ No newline at end of file diff --git a/3.2.34/uksm-0.1.2.1-for-v3.2.ge.31.patch b/3.2.34/uksm-0.1.2.1-for-v3.2.ge.31.patch new file mode 100644 index 0000000..9036595 --- /dev/null +++ b/3.2.34/uksm-0.1.2.1-for-v3.2.ge.31.patch @@ -0,0 +1,7032 @@ +diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX +index 5481c8b..7141876 100644 +--- a/Documentation/vm/00-INDEX ++++ b/Documentation/vm/00-INDEX +@@ -14,6 +14,8 @@ hwpoison.txt + - explains what hwpoison is + ksm.txt + - how to use the Kernel Samepage Merging feature. ++uksm.txt ++ - Introduction to Ultra KSM + locking + - info on how locking and synchronization is done in the Linux vm code. 
+ map_hugetlb.c +diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt +new file mode 100644 +index 0000000..d4aaae8 +--- /dev/null ++++ b/Documentation/vm/uksm.txt +@@ -0,0 +1,56 @@ ++The Ultra Kernel Samepage Merging feature ++---------------------------------------------- ++/* ++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia ++ * ++ * This is an improvement upon KSM. Some basic data structures and routines ++ * are borrowed from ksm.c . ++ * ++ * Its new features: ++ * 1. Full system scan: ++ * It automatically scans all user processes' anonymous VMAs. Kernel-user ++ * interaction to submit a memory area to KSM is no longer needed. ++ * ++ * 2. Rich area detection: ++ * It automatically detects rich areas containing abundant duplicated ++ * pages based. Rich areas are given a full scan speed. Poor areas are ++ * sampled at a reasonable speed with very low CPU consumption. ++ * ++ * 3. Ultra Per-page scan speed improvement: ++ * A new hash algorithm is proposed. As a result, on a machine with ++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it ++ * can scan memory areas that does not contain duplicated pages at speed of ++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of ++ * 477MB/sec ~ 923MB/sec. ++ * ++ * 4. Thrashing area avoidance: ++ * Thrashing area(an VMA that has frequent Ksm page break-out) can be ++ * filtered out. My benchmark shows it's more efficient than KSM's per-page ++ * hash value based volatile page detection. ++ * ++ * ++ * 5. Misc changes upon KSM: ++ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page ++ * comparison. It's much faster than default C version on x86. ++ * * rmap_item now has an struct *page member to loosely cache a ++ * address-->page mapping, which reduces too much time-costly ++ * follow_page(). ++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. ++ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ ++ * ksm is needed for this case. ++ * ++ * 6. Full Zero Page consideration(contributed by Figo Zhang) ++ * Now uksmd consider full zero pages as special pages and merge them to an ++ * special unswappable uksm zero page. ++ */ ++ ++ChangeLog: ++ ++2012-05-05 The creation of this Doc ++2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up. ++2012-05-28 UKSM 0.1.1.2 bug fix release ++2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2 ++2012-07-2 UKSM 0.1.2-beta2 ++2012-07-10 UKSM 0.1.2-beta3 ++2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization. ++2012-10-13 UKSM 0.1.2.1 Bug fixes. +diff --git a/fs/exec.c b/fs/exec.c +index 160cd2f..ae68311 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -19,7 +19,7 @@ + * current->executable is only used by the procfs. This allows a dispatch + * table to check for several different types of binary formats. We keep + * trying until we recognize the file or we run out of supported binary +- * formats. ++ * formats. + */ + + #include +@@ -55,6 +55,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -85,7 +86,7 @@ int __register_binfmt(struct linux_binfmt * fmt, int insert) + insert ? 
list_add(&fmt->lh, &formats) : + list_add_tail(&fmt->lh, &formats); + write_unlock(&binfmt_lock); +- return 0; ++ return 0; + } + + EXPORT_SYMBOL(__register_binfmt); +@@ -1169,7 +1170,7 @@ void setup_new_exec(struct linux_binprm * bprm) + group */ + + current->self_exec_id++; +- ++ + flush_signal_handlers(current, 0); + flush_old_files(current->files); + } +@@ -1264,8 +1265,8 @@ int check_unsafe_exec(struct linux_binprm *bprm) + return res; + } + +-/* +- * Fill the binprm structure from the inode. ++/* ++ * Fill the binprm structure from the inode. + * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes + * + * This may be called multiple times for binary chains (scripts for example). +diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c +index 80e4645..33f9e9b 100644 +--- a/fs/proc/meminfo.c ++++ b/fs/proc/meminfo.c +@@ -87,6 +87,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) + "SUnreclaim: %8lu kB\n" + "KernelStack: %8lu kB\n" + "PageTables: %8lu kB\n" ++#ifdef CONFIG_UKSM ++ "KsmZeroPages: %8lu kB\n" ++#endif + #ifdef CONFIG_QUICKLIST + "Quicklists: %8lu kB\n" + #endif +@@ -146,6 +149,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) + K(global_page_state(NR_SLAB_UNRECLAIMABLE)), + global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, + K(global_page_state(NR_PAGETABLE)), ++#ifdef CONFIG_UKSM ++ K(global_page_state(NR_UKSM_ZERO_PAGES)), ++#endif + #ifdef CONFIG_QUICKLIST + K(quicklist_total_size()), + #endif +diff --git a/include/linux/ksm.h b/include/linux/ksm.h +index 3319a69..f4edf33 100644 +--- a/include/linux/ksm.h ++++ b/include/linux/ksm.h +@@ -22,21 +22,6 @@ struct page *ksm_does_need_to_copy(struct page *page, + #ifdef CONFIG_KSM + int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags); +-int __ksm_enter(struct mm_struct *mm); +-void __ksm_exit(struct mm_struct *mm); +- +-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +-{ +- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) +- return __ksm_enter(mm); +- return 0; +-} +- +-static inline void ksm_exit(struct mm_struct *mm) +-{ +- if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) +- __ksm_exit(mm); +-} + + /* + * A KSM page is one of those write-protected "shared pages" or "merged pages" +@@ -90,6 +75,33 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg); + void ksm_migrate_page(struct page *newpage, struct page *oldpage); + ++#ifdef CONFIG_KSM_LEGACY ++int __ksm_enter(struct mm_struct *mm); ++void __ksm_exit(struct mm_struct *mm); ++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++{ ++ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) ++ return __ksm_enter(mm); ++ return 0; ++} ++ ++static inline void ksm_exit(struct mm_struct *mm) ++{ ++ if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) ++ __ksm_exit(mm); ++} ++ ++#elif defined(CONFIG_UKSM) ++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++{ ++ return 0; ++} ++ ++static inline void ksm_exit(struct mm_struct *mm) ++{ ++} ++#endif /* !CONFIG_UKSM */ ++ + #else /* !CONFIG_KSM */ + + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +@@ -142,4 +154,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) + #endif /* CONFIG_MMU */ + #endif /* !CONFIG_KSM */ + ++#include ++ + #endif /* __LINUX_KSM_H */ +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 
5b42f1b..7a09663 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -253,6 +253,9 @@ struct vm_area_struct { + #ifdef CONFIG_NUMA + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ + #endif ++#ifdef CONFIG_UKSM ++ struct vma_slot *uksm_vma_slot; ++#endif + }; + + struct core_thread { +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 25842b6..d2b8dba 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -116,6 +116,9 @@ enum zone_stat_item { + NUMA_OTHER, /* allocation from other node */ + #endif + NR_ANON_TRANSPARENT_HUGEPAGES, ++#ifdef CONFIG_UKSM ++ NR_UKSM_ZERO_PAGES, ++#endif + NR_VM_ZONE_STAT_ITEMS }; + + /* +@@ -360,7 +363,7 @@ struct zone { + ZONE_PADDING(_pad1_) + + /* Fields commonly accessed by the page reclaim scanner */ +- spinlock_t lru_lock; ++ spinlock_t lru_lock; + struct zone_lru { + struct list_head list; + } lru[NR_LRU_LISTS]; +@@ -745,7 +748,7 @@ static inline int is_normal_idx(enum zone_type idx) + } + + /** +- * is_highmem - helper function to quickly check if a struct zone is a ++ * is_highmem - helper function to quickly check if a struct zone is a + * highmem zone or not. This is an attempt to keep references + * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. + * @zone - pointer to struct zone variable +diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h +new file mode 100644 +index 0000000..6780fdb +--- /dev/null ++++ b/include/linux/sradix-tree.h +@@ -0,0 +1,77 @@ ++#ifndef _LINUX_SRADIX_TREE_H ++#define _LINUX_SRADIX_TREE_H ++ ++ ++#define INIT_SRADIX_TREE(root, mask) \ ++do { \ ++ (root)->height = 0; \ ++ (root)->gfp_mask = (mask); \ ++ (root)->rnode = NULL; \ ++} while (0) ++ ++#define ULONG_BITS (sizeof(unsigned long) * 8) ++#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) ++//#define SRADIX_TREE_MAP_SHIFT 6 ++//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) ++//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) ++ ++struct sradix_tree_node { ++ unsigned int height; /* Height from the bottom */ ++ unsigned int count; ++ unsigned int fulls; /* Number of full sublevel trees */ ++ struct sradix_tree_node *parent; ++ void *stores[0]; ++}; ++ ++/* A simple radix tree implementation */ ++struct sradix_tree_root { ++ unsigned int height; ++ struct sradix_tree_node *rnode; ++ ++ /* Where found to have available empty stores in its sublevels */ ++ struct sradix_tree_node *enter_node; ++ unsigned int shift; ++ unsigned int stores_size; ++ unsigned int mask; ++ unsigned long min; /* The first hole index */ ++ unsigned long num; ++ //unsigned long *height_to_maxindex; ++ ++ /* How the node is allocated and freed. 
*/ ++ struct sradix_tree_node *(*alloc)(void); ++ void (*free)(struct sradix_tree_node *node); ++ ++ /* When a new node is added and removed */ ++ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); ++ void (*assign)(struct sradix_tree_node *node, unsigned index, void *item); ++ void (*rm)(struct sradix_tree_node *node, unsigned offset); ++}; ++ ++struct sradix_tree_path { ++ struct sradix_tree_node *node; ++ int offset; ++}; ++ ++static inline ++void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) ++{ ++ root->height = 0; ++ root->rnode = NULL; ++ root->shift = shift; ++ root->stores_size = 1UL << shift; ++ root->mask = root->stores_size - 1; ++} ++ ++ ++extern void *sradix_tree_next(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index, ++ int (*iter)(void *, unsigned long)); ++ ++extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); ++ ++extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index); ++ ++extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); ++ ++#endif /* _LINUX_SRADIX_TREE_H */ +diff --git a/include/linux/uksm.h b/include/linux/uksm.h +new file mode 100644 +index 0000000..361eee2 +--- /dev/null ++++ b/include/linux/uksm.h +@@ -0,0 +1,145 @@ ++#ifndef __LINUX_UKSM_H ++#define __LINUX_UKSM_H ++/* ++ * Memory merging support. ++ * ++ * This code enables dynamic sharing of identical pages found in different ++ * memory areas, even if they are not shared by fork(). ++ */ ++ ++/* if !CONFIG_UKSM this file should not be compiled at all. */ ++#ifdef CONFIG_UKSM ++ ++#include ++#include ++#include ++#include ++#include ++ ++extern unsigned long zero_pfn __read_mostly; ++extern unsigned long uksm_zero_pfn __read_mostly; ++extern struct page *empty_uksm_zero_page; ++ ++/* must be done before linked to mm */ ++extern void uksm_vma_add_new(struct vm_area_struct *vma); ++extern void uksm_remove_vma(struct vm_area_struct *vma); ++ ++#define UKSM_SLOT_NEED_SORT (1 << 0) ++#define UKSM_SLOT_NEED_RERAND (1 << 1) ++#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ ++#define UKSM_SLOT_FUL_SCANNED (1 << 3) ++#define UKSM_SLOT_IN_UKSM (1 << 4) ++ ++struct vma_slot { ++ struct sradix_tree_node *snode; ++ unsigned long sindex; ++ ++ struct list_head slot_list; ++ unsigned long fully_scanned_round; ++ unsigned long dedup_num; ++ unsigned long pages_scanned; ++ unsigned long last_scanned; ++ unsigned long pages_to_scan; ++ struct scan_rung *rung; ++ struct page **rmap_list_pool; ++ unsigned int *pool_counts; ++ unsigned long pool_size; ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ unsigned long ctime_j; ++ unsigned long pages; ++ unsigned long flags; ++ unsigned long pages_cowed; /* pages cowed this round */ ++ unsigned long pages_merged; /* pages merged this round */ ++ unsigned long pages_bemerged; ++ ++ /* when it has page merged in this eval round */ ++ struct list_head dedup_list; ++}; ++ ++static inline void uksm_unmap_zero_page(pte_t pte) ++{ ++ if (pte_pfn(pte) == uksm_zero_pfn) ++ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); ++} ++ ++static inline void uksm_map_zero_page(pte_t pte) ++{ ++ if (pte_pfn(pte) == uksm_zero_pfn) ++ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); ++} ++ ++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) ++{ ++ if (vma->uksm_vma_slot && PageKsm(page)) ++ 
vma->uksm_vma_slot->pages_cowed++; ++} ++ ++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) ++{ ++ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) ++ vma->uksm_vma_slot->pages_cowed++; ++} ++ ++static inline int uksm_flags_can_scan(unsigned long vm_flags) ++{ ++ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | ++ VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | ++ VM_NONLINEAR | VM_MIXEDMAP | VM_SAO | ++ VM_SHARED | VM_MAYSHARE | VM_GROWSUP ++ | VM_GROWSDOWN)); ++} ++ ++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) ++{ ++ if (uksm_flags_can_scan(*vm_flags_p)) ++ *vm_flags_p |= VM_MERGEABLE; ++} ++ ++/* ++ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will ++ * be removed when uksm zero page patch is stable enough. ++ */ ++static inline void uksm_bugon_zeropage(pte_t pte) ++{ ++ BUG_ON(pte_pfn(pte) == uksm_zero_pfn); ++} ++#else ++static inline void uksm_vma_add_new(struct vm_area_struct *vma) ++{ ++} ++ ++static inline void uksm_remove_vma(struct vm_area_struct *vma) ++{ ++} ++ ++static inline void uksm_unmap_zero_page(pte_t pte) ++{ ++} ++ ++static inline void uksm_map_zero_page(pte_t pte) ++{ ++} ++ ++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) ++{ ++} ++ ++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) ++{ ++} ++ ++static inline int uksm_flags_can_scan(unsigned long vm_flags) ++{ ++ return 0; ++} ++ ++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) ++{ ++} ++ ++static inline void uksm_bugon_zeropage(pte_t pte) ++{ ++} ++#endif /* !CONFIG_UKSM */ ++#endif /* __LINUX_UKSM_H */ +diff --git a/kernel/fork.c b/kernel/fork.c +index 222457a..cd9137e 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -358,7 +358,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) + goto fail_nomem; + charge = len; + } +- tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); ++ tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!tmp) + goto fail_nomem; + *tmp = *mpnt; +@@ -410,7 +410,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; +- ++ uksm_vma_add_new(tmp); + mm->map_count++; + retval = copy_page_range(mm, oldmm, mpnt); + +diff --git a/lib/Makefile b/lib/Makefile +index a4da283..5ac75a7 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) + endif + + lib-y := ctype.o string.o vsprintf.o cmdline.o \ +- rbtree.o radix-tree.o dump_stack.o timerqueue.o\ ++ rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\ + idr.o int_sqrt.o extable.o prio_tree.o \ + sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ + proportions.o prio_heap.o ratelimit.o show_mem.o \ +diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c +new file mode 100644 +index 0000000..8d06329 +--- /dev/null ++++ b/lib/sradix-tree.c +@@ -0,0 +1,476 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) ++{ ++ return node->fulls == root->stores_size || ++ (node->height == 1 && node->count == root->stores_size); ++} ++ ++/* ++ * Extend a sradix tree so it can store key @index. 
++ */ ++static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) ++{ ++ struct sradix_tree_node *node; ++ unsigned int height; ++ ++ if (unlikely(root->rnode == NULL)) { ++ if (!(node = root->alloc())) ++ return -ENOMEM; ++ ++ node->height = 1; ++ root->rnode = node; ++ root->height = 1; ++ } ++ ++ /* Figure out what the height should be. */ ++ height = root->height; ++ index >>= root->shift * height; ++ ++ while (index) { ++ index >>= root->shift; ++ height++; ++ } ++ ++ while (height > root->height) { ++ unsigned int newheight; ++ if (!(node = root->alloc())) ++ return -ENOMEM; ++ ++ /* Increase the height. */ ++ node->stores[0] = root->rnode; ++ root->rnode->parent = node; ++ if (root->extend) ++ root->extend(node, root->rnode); ++ ++ newheight = root->height + 1; ++ node->height = newheight; ++ node->count = 1; ++ if (sradix_node_full(root, root->rnode)) ++ node->fulls = 1; ++ ++ root->rnode = node; ++ root->height = newheight; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Search the next item from the current node, that is not NULL ++ * and can satify root->iter(). ++ */ ++void *sradix_tree_next(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index, ++ int (*iter)(void *item, unsigned long height)) ++{ ++ unsigned long offset; ++ void *item; ++ ++ if (unlikely(node == NULL)) { ++ node = root->rnode; ++ for (offset = 0; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (unlikely(offset >= root->stores_size)) ++ return NULL; ++ ++ if (node->height == 1) ++ return item; ++ else ++ goto go_down; ++ } ++ ++ while (node) { ++ offset = (index & root->mask) + 1; ++ for (;offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (offset < root->stores_size) ++ break; ++ ++ node = node->parent; ++ index >>= root->shift; ++ } ++ ++ if (!node) ++ return NULL; ++ ++ while (node->height > 1) { ++go_down: ++ node = item; ++ for (offset = 0; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (unlikely(offset >= root->stores_size)) ++ return NULL; ++ } ++ ++ BUG_ON(offset > root->stores_size); ++ ++ return item; ++} ++ ++/* ++ * Blindly insert the item to the tree. Typically, we reuse the ++ * first empty store item. 
++ */ ++int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) ++{ ++ unsigned long index; ++ unsigned int height; ++ struct sradix_tree_node *node, *tmp = NULL; ++ int offset, offset_saved; ++ void **store = NULL; ++ int error, i, j, shift; ++ ++go_on: ++ index = root->min; ++ ++ if (root->enter_node && !sradix_node_full(root, root->enter_node)) { ++ node = root->enter_node; ++ BUG_ON((index >> (root->shift * root->height))); ++ } else { ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height)) ++ || sradix_node_full(root, node)) { ++ error = sradix_tree_extend(root, index); ++ if (error) ++ return error; ++ ++ node = root->rnode; ++ } ++ } ++ ++ ++ height = node->height; ++ shift = (height - 1) * root->shift; ++ offset = (index >> shift) & root->mask; ++ while (shift > 0) { ++ offset_saved = offset; ++ for (; offset < root->stores_size; offset++) { ++ store = &node->stores[offset]; ++ tmp = *store; ++ ++ if (!tmp || !sradix_node_full(root, tmp)) ++ break; ++ } ++ BUG_ON(offset >= root->stores_size); ++ ++ if (offset != offset_saved) { ++ index += (offset - offset_saved) << shift; ++ index &= ~((1UL << shift) - 1); ++ } ++ ++ if (!tmp) { ++ if (!(tmp = root->alloc())) ++ return -ENOMEM; ++ ++ tmp->height = shift / root->shift; ++ *store = tmp; ++ tmp->parent = node; ++ node->count++; ++// if (root->extend) ++// root->extend(node, tmp); ++ } ++ ++ node = tmp; ++ shift -= root->shift; ++ offset = (index >> shift) & root->mask; ++ } ++ ++ BUG_ON(node->height != 1); ++ ++ ++ store = &node->stores[offset]; ++ for (i = 0, j = 0; ++ j < root->stores_size - node->count && ++ i < root->stores_size - offset && j < num; i++) { ++ if (!store[i]) { ++ store[i] = item[j]; ++ if (root->assign) ++ root->assign(node, index + i, item[j]); ++ j++; ++ } ++ } ++ ++ node->count += j; ++ root->num += j; ++ num -= j; ++ ++ while (sradix_node_full(root, node)) { ++ node = node->parent; ++ if (!node) ++ break; ++ ++ node->fulls++; ++ } ++ ++ if (unlikely(!node)) { ++ /* All nodes are full */ ++ root->min = 1 << (root->height * root->shift); ++ root->enter_node = NULL; ++ } else { ++ root->min = index + i - 1; ++ root->min |= (1UL << (node->height - 1)) - 1; ++ root->min++; ++ root->enter_node = node; ++ } ++ ++ if (num) { ++ item += j; ++ goto go_on; ++ } ++ ++ return 0; ++} ++ ++ ++/** ++ * sradix_tree_shrink - shrink height of a sradix tree to minimal ++ * @root sradix tree root ++ * ++ */ ++static inline void sradix_tree_shrink(struct sradix_tree_root *root) ++{ ++ /* try to shrink tree height */ ++ while (root->height > 1) { ++ struct sradix_tree_node *to_free = root->rnode; ++ ++ /* ++ * The candidate node has more than one child, or its child ++ * is not at the leftmost store, we cannot shrink. 
++ */ ++ if (to_free->count != 1 || !to_free->stores[0]) ++ break; ++ ++ root->rnode = to_free->stores[0]; ++ root->rnode->parent = NULL; ++ root->height--; ++ if (unlikely(root->enter_node == to_free)) { ++ root->enter_node = NULL; ++ } ++ root->free(to_free); ++ } ++} ++ ++/* ++ * Del the item on the known leaf node and index ++ */ ++void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index) ++{ ++ unsigned int offset; ++ struct sradix_tree_node *start, *end; ++ ++ BUG_ON(node->height != 1); ++ ++ start = node; ++ while (node && !(--node->count)) ++ node = node->parent; ++ ++ end = node; ++ if (!node) { ++ root->rnode = NULL; ++ root->height = 0; ++ root->min = 0; ++ root->num = 0; ++ root->enter_node = NULL; ++ } else { ++ offset = (index >> (root->shift * (node->height - 1))) & root->mask; ++ if (root->rm) ++ root->rm(node, offset); ++ node->stores[offset] = NULL; ++ root->num--; ++ if (root->min > index) { ++ root->min = index; ++ root->enter_node = node; ++ } ++ } ++ ++ if (start != end) { ++ do { ++ node = start; ++ start = start->parent; ++ if (unlikely(root->enter_node == node)) ++ root->enter_node = end; ++ root->free(node); ++ } while (start != end); ++ ++ /* ++ * Note that shrink may free "end", so enter_node still need to ++ * be checked inside. ++ */ ++ sradix_tree_shrink(root); ++ } else if (node->count == root->stores_size - 1) { ++ /* It WAS a full leaf node. Update the ancestors */ ++ node = node->parent; ++ while (node) { ++ node->fulls--; ++ if (node->fulls != root->stores_size - 1) ++ break; ++ ++ node = node->parent; ++ } ++ } ++} ++ ++void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node; ++ int shift; ++ ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height))) ++ return NULL; ++ ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ node = node->stores[offset]; ++ if (!node) ++ return NULL; ++ ++ shift -= root->shift; ++ } while (shift >= 0); ++ ++ return node; ++} ++ ++/* ++ * Return the item if it exists, otherwise create it in place ++ * and return the created item. ++ */ ++void *sradix_tree_lookup_create(struct sradix_tree_root *root, ++ unsigned long index, void *(*item_alloc)(void)) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node, *tmp; ++ void *item; ++ int shift, error; ++ ++ if (root->rnode == NULL || (index >> (root->shift * root->height))) { ++ if (item_alloc) { ++ error = sradix_tree_extend(root, index); ++ if (error) ++ return NULL; ++ } else { ++ return NULL; ++ } ++ } ++ ++ node = root->rnode; ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ if (!node->stores[offset]) { ++ if (!(tmp = root->alloc())) ++ return NULL; ++ ++ tmp->height = shift / root->shift; ++ node->stores[offset] = tmp; ++ tmp->parent = node; ++ node->count++; ++ node = tmp; ++ } else { ++ node = node->stores[offset]; ++ } ++ ++ shift -= root->shift; ++ } while (shift > 0); ++ ++ BUG_ON(node->height != 1); ++ offset = index & root->mask; ++ if (node->stores[offset]) { ++ return node->stores[offset]; ++ } else if (item_alloc) { ++ if (!(item = item_alloc())) ++ return NULL; ++ ++ node->stores[offset] = item; ++ ++ /* ++ * NOTE: we do NOT call root->assign here, since this item is ++ * newly created by us having no meaning. 
Caller can call this
++		 * if it's necessary to do so.
++		 */
++
++		node->count++;
++		root->num++;
++
++		while (sradix_node_full(root, node)) {
++			node = node->parent;
++			if (!node)
++				break;
++
++			node->fulls++;
++		}
++
++		if (unlikely(!node)) {
++			/* All nodes are full */
++			root->min = 1 << (root->height * root->shift);
++		} else {
++			if (root->min == index) {
++				root->min |= (1UL << (node->height - 1)) - 1;
++				root->min++;
++				root->enter_node = node;
++			}
++		}
++
++		return item;
++	} else {
++		return NULL;
++	}
++
++}
++
++int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
++{
++	unsigned int height, offset;
++	struct sradix_tree_node *node;
++	int shift;
++
++	node = root->rnode;
++	if (node == NULL || (index >> (root->shift * root->height)))
++		return -ENOENT;
++
++	height = root->height;
++	shift = (height - 1) * root->shift;
++
++	do {
++		offset = (index >> shift) & root->mask;
++		node = node->stores[offset];
++		if (!node)
++			return -ENOENT;
++
++		shift -= root->shift;
++	} while (shift > 0);
++
++	offset = index & root->mask;
++	if (!node->stores[offset])
++		return -ENOENT;
++
++	sradix_tree_delete_from_leaf(root, node, index);
++
++	return 0;
++}
+diff --git a/mm/Kconfig b/mm/Kconfig
+index 011b110..b766090 100644
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -239,6 +239,32 @@ config KSM
+ 	  See Documentation/vm/ksm.txt for more information: KSM is inactive
+ 	  until a program has madvised that an area is MADV_MERGEABLE, and
+ 	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
++choice
++	prompt "Choose UKSM/KSM strategy"
++	default UKSM
++	depends on KSM
++	help
++	  This option allows you to select a UKSM/KSM strategy.
++
++config UKSM
++	bool "Ultra-KSM for page merging"
++	depends on KSM
++	help
++	  UKSM is inspired by the Linux kernel project KSM (Kernel Samepage
++	  Merging), but with a fundamentally rewritten core algorithm. With
++	  this advanced algorithm, UKSM can transparently scan all anonymously
++	  mapped user space applications with significantly improved scan speed
++	  and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit
++	  from UKSM. UKSM now has its first stable release and its first
++	  real-world enterprise user. For more information, please visit its
++	  project page (www.kerneldedup.org).
++
++config KSM_LEGACY
++	bool "Legacy KSM implementation"
++	depends on KSM
++	help
++	  The legacy KSM implementation from Red Hat.
++endchoice + + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" +diff --git a/mm/Makefile b/mm/Makefile +index 50ec00e..c551bae 100644 +--- a/mm/Makefile ++++ b/mm/Makefile +@@ -34,7 +34,8 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o + obj-$(CONFIG_SLOB) += slob.o + obj-$(CONFIG_COMPACTION) += compaction.o + obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +-obj-$(CONFIG_KSM) += ksm.o ++obj-$(CONFIG_KSM_LEGACY) += ksm.o ++obj-$(CONFIG_UKSM) += uksm.o + obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o + obj-$(CONFIG_SLAB) += slab.o + obj-$(CONFIG_SLUB) += slub.o +diff --git a/mm/memory.c b/mm/memory.c +index 70f5daf..861bcbc 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -112,6 +112,37 @@ __setup("norandmaps", disable_randmaps); + unsigned long zero_pfn __read_mostly; + unsigned long highest_memmap_pfn __read_mostly; + ++#ifdef CONFIG_UKSM ++unsigned long uksm_zero_pfn __read_mostly; ++struct page *empty_uksm_zero_page; ++ ++static int __init setup_uksm_zero_page(void) ++{ ++ unsigned long addr; ++ addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0); ++ if (!addr) ++ panic("Oh boy, that early out of memory?"); ++ ++ empty_uksm_zero_page = virt_to_page((void *) addr); ++ SetPageReserved(empty_uksm_zero_page); ++ ++ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); ++ ++ return 0; ++} ++core_initcall(setup_uksm_zero_page); ++ ++static inline int is_uksm_zero_pfn(unsigned long pfn) ++{ ++ return pfn == uksm_zero_pfn; ++} ++#else ++static inline int is_uksm_zero_pfn(unsigned long pfn) ++{ ++ return 0; ++} ++#endif ++ + /* + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() + */ +@@ -123,6 +154,7 @@ static int __init init_zero_pfn(void) + core_initcall(init_zero_pfn); + + ++ + #if defined(SPLIT_RSS_COUNTING) + + static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +@@ -739,8 +771,10 @@ static inline int is_cow_mapping(vm_flags_t flags) + #ifndef is_zero_pfn + static inline int is_zero_pfn(unsigned long pfn) + { +- return pfn == zero_pfn; ++ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); + } ++#else ++#define is_zero_pfn(pfn) (is_zero_pfn(pfn) || is_uksm_zero_pfn(pfn)) + #endif + + #ifndef my_zero_pfn +@@ -917,6 +951,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; ++ ++ /* Should return NULL in vm_normal_page() */ ++ uksm_bugon_zeropage(pte); ++ } else { ++ uksm_map_zero_page(pte); + } + + out_set_pte: +@@ -1152,8 +1191,10 @@ again: + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + tlb_remove_tlb_entry(tlb, pte, addr); +- if (unlikely(!page)) ++ if (unlikely(!page)) { ++ uksm_unmap_zero_page(ptent); + continue; ++ } + if (unlikely(details) && details->nonlinear_vma + && linear_page_index(details->nonlinear_vma, + addr) != page->index) +@@ -1645,7 +1686,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); + +- /* ++ /* + * Require read or write permissions. + * If FOLL_FORCE is set, we only require the "MAY" flags. 
+ */ +@@ -1692,7 +1733,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + page = vm_normal_page(vma, start, *pte); + if (!page) { + if (!(gup_flags & FOLL_DUMP) && +- is_zero_pfn(pte_pfn(*pte))) ++ (is_zero_pfn(pte_pfn(*pte)))) + page = pte_page(*pte); + else { + pte_unmap(pte); +@@ -2452,8 +2493,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo + clear_page(kaddr); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(dst); +- } else ++ } else { + copy_user_highpage(dst, src, va, vma); ++ uksm_cow_page(vma, src); ++ } + } + + /* +@@ -2651,6 +2694,7 @@ gotten: + new_page = alloc_zeroed_user_highpage_movable(vma, address); + if (!new_page) + goto oom; ++ uksm_cow_pte(vma, orig_pte); + } else { + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) +@@ -2672,8 +2716,11 @@ gotten: + dec_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); + } +- } else ++ uksm_bugon_zeropage(orig_pte); ++ } else { ++ uksm_unmap_zero_page(orig_pte); + inc_mm_counter_fast(mm, MM_ANONPAGES); ++ } + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); +diff --git a/mm/mmap.c b/mm/mmap.c +index eae90af..e723d3a 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -65,7 +66,7 @@ static void unmap_region(struct mm_struct *mm, + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes +- * ++ * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes +@@ -236,6 +237,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) + removed_exe_file_vma(vma->vm_mm); + } + mpol_put(vma_policy(vma)); ++ uksm_remove_vma(vma); + kmem_cache_free(vm_area_cachep, vma); + return next; + } +@@ -500,9 +502,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, + long adjust_next = 0; + int remove_next = 0; + ++/* ++ * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is ++ * acquired ++ */ ++ uksm_remove_vma(vma); ++ + if (next && !insert) { + struct vm_area_struct *exporter = NULL; + ++ uksm_remove_vma(next); + if (end >= next->vm_end) { + /* + * vma expands, overlapping all the next, and +@@ -578,10 +587,10 @@ again: remove_next = 1 + (end > next->vm_end); + if (adjust_next) + vma_prio_tree_remove(next, root); + } +- + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; ++ + if (adjust_next) { + next->vm_start += adjust_next << PAGE_SHIFT; + next->vm_pgoff += adjust_next; +@@ -634,10 +643,15 @@ again: remove_next = 1 + (end > next->vm_end); + */ + if (remove_next == 2) { + next = vma->vm_next; ++ uksm_remove_vma(next); + goto again; + } ++ } else { ++ if (next && !insert) ++ uksm_vma_add_new(next); + } + ++ uksm_vma_add_new(vma); + validate_mm(mm); + + return 0; +@@ -992,6 +1006,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + ++ /* If uksm is enabled, we add VM_MERGABLE to new VMAs. 
*/ ++ uksm_vm_flags_mod(&vm_flags); ++ + if (flags & MAP_LOCKED) + if (!can_do_mlock()) + return -EPERM; +@@ -1315,6 +1332,7 @@ munmap_back: + + vma_link(mm, vma, prev, rb_link, rb_parent); + file = vma->vm_file; ++ uksm_vma_add_new(vma); + + /* Once vma denies write, undo our temporary denial count */ + if (correct_wcount) +@@ -1341,6 +1359,7 @@ unmap_and_free_vma: + unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); + charged = 0; + free_vma: ++ uksm_remove_vma(vma); + kmem_cache_free(vm_area_cachep, vma); + unacct_error: + if (charged) +@@ -1416,7 +1435,7 @@ full_search: + addr = vma->vm_end; + } + } +-#endif ++#endif + + void arch_unmap_area(struct mm_struct *mm, unsigned long addr) + { +@@ -1978,6 +1997,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + else + err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + ++ uksm_vma_add_new(new); ++ + /* Success. */ + if (!err) + return 0; +@@ -2147,6 +2168,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) + return error; + + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; ++ uksm_vm_flags_mod(&flags); + + error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (error & ~PAGE_MASK) +@@ -2215,6 +2237,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) + vma->vm_flags = flags; + vma->vm_page_prot = vm_get_page_prot(flags); + vma_link(mm, vma, prev, rb_link, rb_parent); ++ uksm_vma_add_new(vma); + out: + perf_event_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; +@@ -2238,6 +2261,12 @@ void exit_mmap(struct mm_struct *mm) + /* mm's last user has gone, and its about to be pulled down */ + mmu_notifier_release(mm); + ++ /* ++ * Taking write lock on mmap_sem does not harm others, ++ * but it's crucial for uksm to avoid races. ++ */ ++ down_write(&mm->mmap_sem); ++ + if (mm->locked_vm) { + vma = mm->mmap; + while (vma) { +@@ -2271,6 +2300,11 @@ void exit_mmap(struct mm_struct *mm) + while (vma) + vma = remove_vma(vma); + ++ mm->mmap = NULL; ++ mm->mm_rb = RB_ROOT; ++ mm->mmap_cache = NULL; ++ up_write(&mm->mmap_sem); ++ + BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + } + +@@ -2362,6 +2396,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + vma_link(mm, new_vma, prev, rb_link, rb_parent); ++ uksm_vma_add_new(new_vma); + } + } + return new_vma; +@@ -2467,10 +2502,10 @@ int install_special_mapping(struct mm_struct *mm, + ret = insert_vm_struct(mm, vma); + if (ret) + goto out; +- + mm->total_vm += len >> PAGE_SHIFT; + + perf_event_mmap(vma); ++ uksm_vma_add_new(vma); + + return 0; + +diff --git a/mm/rmap.c b/mm/rmap.c +index a4fd368..f11b505 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -972,9 +972,9 @@ void page_move_anon_rmap(struct page *page, + + /** + * __page_set_anon_rmap - set up new anonymous rmap +- * @page: Page to add to rmap ++ * @page: Page to add to rmap + * @vma: VM area to add page to. +- * @address: User virtual address of the mapping ++ * @address: User virtual address of the mapping + * @exclusive: the page is exclusively owned by the current process + */ + static void __page_set_anon_rmap(struct page *page, +diff --git a/mm/uksm.c b/mm/uksm.c +new file mode 100644 +index 0000000..967c755 +--- /dev/null ++++ b/mm/uksm.c +@@ -0,0 +1,5616 @@ ++/* ++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia ++ * ++ * This is an improvement upon KSM. Some basic data structures and routines ++ * are borrowed from ksm.c . 
++ * ++ * Its new features: ++ * 1. Full system scan: ++ * It automatically scans all user processes' anonymous VMAs. Kernel-user ++ * interaction to submit a memory area to KSM is no longer needed. ++ * ++ * 2. Rich area detection: ++ * It automatically detects rich areas containing abundant duplicated ++ * pages based. Rich areas are given a full scan speed. Poor areas are ++ * sampled at a reasonable speed with very low CPU consumption. ++ * ++ * 3. Ultra Per-page scan speed improvement: ++ * A new hash algorithm is proposed. As a result, on a machine with ++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it ++ * can scan memory areas that does not contain duplicated pages at speed of ++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of ++ * 477MB/sec ~ 923MB/sec. ++ * ++ * 4. Thrashing area avoidance: ++ * Thrashing area(an VMA that has frequent Ksm page break-out) can be ++ * filtered out. My benchmark shows it's more efficient than KSM's per-page ++ * hash value based volatile page detection. ++ * ++ * ++ * 5. Misc changes upon KSM: ++ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page ++ * comparison. It's much faster than default C version on x86. ++ * * rmap_item now has an struct *page member to loosely cache a ++ * address-->page mapping, which reduces too much time-costly ++ * follow_page(). ++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. ++ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ ++ * ksm is needed for this case. ++ * ++ * 6. Full Zero Page consideration(contributed by Figo Zhang) ++ * Now uksmd consider full zero pages as special pages and merge them to an ++ * special unswappable uksm zero page. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "internal.h" ++ ++#ifdef CONFIG_X86 ++#undef memcmp ++ ++#ifdef CONFIG_X86_32 ++#define memcmp memcmpx86_32 ++/* ++ * Compare 4-byte-aligned address s1 and s2, with length n ++ */ ++int memcmpx86_32(void *s1, void *s2, size_t n) ++{ ++ size_t num = n / 4; ++ register int res; ++ ++ __asm__ __volatile__ ++ ( ++ "testl %3,%3\n\t" ++ "repe; cmpsd\n\t" ++ "je 1f\n\t" ++ "sbbl %0,%0\n\t" ++ "orl $1,%0\n" ++ "1:" ++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) ++ : "0" (0) ++ : "cc"); ++ ++ return res; ++} ++ ++/* ++ * Check the page is all zero ? 
++ */ ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned char same; ++ ++ len /= 4; ++ ++ __asm__ __volatile__ ++ ("repe; scasl;" ++ "sete %0" ++ : "=qm" (same), "+D" (s1), "+c" (len) ++ : "a" (0) ++ : "cc"); ++ ++ return same; ++} ++ ++ ++#elif defined(CONFIG_X86_64) ++#define memcmp memcmpx86_64 ++/* ++ * Compare 8-byte-aligned address s1 and s2, with length n ++ */ ++int memcmpx86_64(void *s1, void *s2, size_t n) ++{ ++ size_t num = n / 8; ++ register int res; ++ ++ __asm__ __volatile__ ++ ( ++ "testq %q3,%q3\n\t" ++ "repe; cmpsq\n\t" ++ "je 1f\n\t" ++ "sbbq %q0,%q0\n\t" ++ "orq $1,%q0\n" ++ "1:" ++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) ++ : "0" (0) ++ : "cc"); ++ ++ return res; ++} ++ ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned char same; ++ ++ len /= 8; ++ ++ __asm__ __volatile__ ++ ("repe; scasq;" ++ "sete %0" ++ : "=qm" (same), "+D" (s1), "+c" (len) ++ : "a" (0) ++ : "cc"); ++ ++ return same; ++} ++ ++#endif ++#else ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned long *src = s1; ++ int i; ++ ++ len /= sizeof(*src); ++ ++ for (i = 0; i < len; i++) { ++ if (src[i]) ++ return 0; ++ } ++ ++ return 1; ++} ++#endif ++ ++#define U64_MAX (~((u64)0)) ++#define UKSM_RUNG_ROUND_FINISHED (1 << 0) ++#define TIME_RATIO_SCALE 10000 ++ ++#define SLOT_TREE_NODE_SHIFT 8 ++#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) ++struct slot_tree_node { ++ unsigned long size; ++ struct sradix_tree_node snode; ++ void *stores[SLOT_TREE_NODE_STORE_SIZE]; ++}; ++ ++static struct kmem_cache *slot_tree_node_cachep; ++ ++static struct sradix_tree_node *slot_tree_node_alloc(void) ++{ ++ struct slot_tree_node *p; ++ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL); ++ if (!p) ++ return NULL; ++ ++ return &p->snode; ++} ++ ++static void slot_tree_node_free(struct sradix_tree_node *node) ++{ ++ struct slot_tree_node *p; ++ ++ p = container_of(node, struct slot_tree_node, snode); ++ kmem_cache_free(slot_tree_node_cachep, p); ++} ++ ++static void slot_tree_node_extend(struct sradix_tree_node *parent, ++ struct sradix_tree_node *child) ++{ ++ struct slot_tree_node *p, *c; ++ ++ p = container_of(parent, struct slot_tree_node, snode); ++ c = container_of(child, struct slot_tree_node, snode); ++ ++ p->size += c->size; ++} ++ ++void slot_tree_node_assign(struct sradix_tree_node *node, ++ unsigned index, void *item) ++{ ++ struct vma_slot *slot = item; ++ struct slot_tree_node *cur; ++ ++ slot->snode = node; ++ slot->sindex = index; ++ ++ while (node) { ++ cur = container_of(node, struct slot_tree_node, snode); ++ cur->size += slot->pages; ++ node = node->parent; ++ } ++} ++ ++void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset) ++{ ++ struct vma_slot *slot; ++ struct slot_tree_node *cur; ++ unsigned long pages; ++ ++ if (node->height == 1) { ++ slot = node->stores[offset]; ++ pages = slot->pages; ++ } else { ++ cur = container_of(node->stores[offset], ++ struct slot_tree_node, snode); ++ pages = cur->size; ++ } ++ ++ while (node) { ++ cur = container_of(node, struct slot_tree_node, snode); ++ cur->size -= pages; ++ node = node->parent; ++ } ++} ++ ++unsigned long slot_iter_index; ++int slot_iter(void *item, unsigned long height) ++{ ++ struct slot_tree_node *node; ++ struct vma_slot *slot; ++ ++ if (height == 1) { ++ slot = item; ++ if (slot_iter_index < slot->pages) { ++ /*in this one*/ ++ return 1; ++ } else { ++ slot_iter_index -= slot->pages; ++ return 0; ++ } ++ ++ } else { ++ node = 
container_of(item, struct slot_tree_node, snode); ++ if (slot_iter_index < node->size) { ++ /*in this one*/ ++ return 1; ++ } else { ++ slot_iter_index -= node->size; ++ return 0; ++ } ++ } ++} ++ ++ ++static inline void slot_tree_init_root(struct sradix_tree_root *root) ++{ ++ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); ++ root->alloc = slot_tree_node_alloc; ++ root->free = slot_tree_node_free; ++ root->extend = slot_tree_node_extend; ++ root->assign = slot_tree_node_assign; ++ root->rm = slot_tree_node_rm; ++} ++ ++void slot_tree_init(void) ++{ ++ slot_tree_node_cachep = kmem_cache_create("slot_tree_node", ++ sizeof(struct slot_tree_node), 0, ++ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, ++ NULL); ++} ++ ++ ++/* Each rung of this ladder is a list of VMAs having a same scan ratio */ ++struct scan_rung { ++ //struct list_head scanned_list; ++ struct sradix_tree_root vma_root; ++ struct sradix_tree_root vma_root2; ++ ++ struct vma_slot *current_scan; ++ unsigned long current_offset; ++ ++ /* ++ * The initial value for current_offset, it should loop over ++ * [0~ step - 1] to let all slot have its chance to be scanned. ++ */ ++ unsigned long offset_init; ++ unsigned long step; /* dynamic step for current_offset */ ++ unsigned int flags; ++ unsigned long pages_to_scan; ++ //unsigned long fully_scanned_slots; ++ /* ++ * a little bit tricky - if cpu_time_ratio > 0, then the value is the ++ * the cpu time ratio it can spend in rung_i for every scan ++ * period. if < 0, then it is the cpu time ratio relative to the ++ * max cpu percentage user specified. Both in unit of ++ * 1/TIME_RATIO_SCALE ++ */ ++ int cpu_ratio; ++ ++ /* ++ * How long it will take for all slots in this rung to be fully ++ * scanned? If it's zero, we don't care about the cover time: ++ * it's fully scanned. ++ */ ++ unsigned int cover_msecs; ++ //unsigned long vma_num; ++ //unsigned long pages; /* Sum of all slot's pages in rung */ ++}; ++ ++/** ++ * node of either the stable or unstale rbtree ++ * ++ */ ++struct tree_node { ++ struct rb_node node; /* link in the main (un)stable rbtree */ ++ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ ++ u32 hash; ++ unsigned long count; /* TODO: merged with sub_root */ ++ struct list_head all_list; /* all tree nodes in stable/unstable tree */ ++}; ++ ++/** ++ * struct stable_node - node of the stable rbtree ++ * @node: rb node of this ksm page in the stable tree ++ * @hlist: hlist head of rmap_items using this ksm page ++ * @kpfn: page frame number of this ksm page ++ */ ++struct stable_node { ++ struct rb_node node; /* link in sub-rbtree */ ++ struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */ ++ struct hlist_head hlist; ++ unsigned long kpfn; ++ u32 hash_max; /* if ==0 then it's not been calculated yet */ ++ struct list_head all_list; /* in a list for all stable nodes */ ++}; ++ ++/** ++ * struct node_vma - group rmap_items linked in a same stable ++ * node together. 
++ */ ++struct node_vma { ++ union { ++ struct vma_slot *slot; ++ unsigned long key; /* slot is used as key sorted on hlist */ ++ }; ++ struct hlist_node hlist; ++ struct hlist_head rmap_hlist; ++ struct stable_node *head; ++}; ++ ++/** ++ * struct rmap_item - reverse mapping item for virtual addresses ++ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list ++ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree ++ * @mm: the memory structure this rmap_item is pointing into ++ * @address: the virtual address this rmap_item tracks (+ flags in low bits) ++ * @node: rb node of this rmap_item in the unstable tree ++ * @head: pointer to stable_node heading this list in the stable tree ++ * @hlist: link into hlist of rmap_items hanging off that stable_node ++ */ ++struct rmap_item { ++ struct vma_slot *slot; ++ struct page *page; ++ unsigned long address; /* + low bits used for flags below */ ++ unsigned long hash_round; ++ unsigned long entry_index; ++ union { ++ struct {/* when in unstable tree */ ++ struct rb_node node; ++ struct tree_node *tree_node; ++ u32 hash_max; ++ }; ++ struct { /* when in stable tree */ ++ struct node_vma *head; ++ struct hlist_node hlist; ++ struct anon_vma *anon_vma; ++ }; ++ }; ++} __attribute__((aligned(4))); ++ ++struct rmap_list_entry { ++ union { ++ struct rmap_item *item; ++ unsigned long addr; ++ }; ++ /* lowest bit is used for is_addr tag */ ++} __attribute__((aligned(4))); /* 4 aligned to fit in to pages*/ ++ ++ ++/* Basic data structure definition ends */ ++ ++ ++/* ++ * Flags for rmap_item to judge if it's listed in the stable/unstable tree. ++ * The flags use the low bits of rmap_item.address ++ */ ++#define UNSTABLE_FLAG 0x1 ++#define STABLE_FLAG 0x2 ++#define get_rmap_addr(x) ((x)->address & PAGE_MASK) ++ ++/* ++ * rmap_list_entry helpers ++ */ ++#define IS_ADDR_FLAG 1 ++#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) ++#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) ++#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) ++ ++ ++/* ++ * High speed caches for frequently allocated and freed structs ++ */ ++static struct kmem_cache *rmap_item_cache; ++static struct kmem_cache *stable_node_cache; ++static struct kmem_cache *node_vma_cache; ++static struct kmem_cache *vma_slot_cache; ++static struct kmem_cache *tree_node_cache; ++#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ ++ sizeof(struct __struct), __alignof__(struct __struct),\ ++ (__flags), NULL) ++ ++/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ ++#define SCAN_LADDER_SIZE 4 ++static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; ++ ++/* The evaluation rounds uksmd has finished */ ++static unsigned long long uksm_eval_round = 1; ++ ++/* ++ * we add 1 to this var when we consider we should rebuild the whole ++ * unstable tree. ++ */ ++static unsigned long uksm_hash_round = 1; ++ ++/* ++ * How many times the whole memory is scanned. 
++ */
++static unsigned long long fully_scanned_round = 1;
++
++/* The total number of virtual pages of all vma slots */
++static u64 uksm_pages_total;
++
++/* The number of pages that have been scanned since startup */
++static u64 uksm_pages_scanned;
++
++static u64 scanned_virtual_pages;
++
++/* The number of pages that have been scanned since the last encode_benefit call */
++static u64 uksm_pages_scanned_last;
++
++/* If the scanned number is too large, we encode it here */
++static u64 pages_scanned_stored;
++
++static unsigned long pages_scanned_base;
++
++/* The number of nodes in the stable tree */
++static unsigned long uksm_pages_shared;
++
++/* The number of page slots additionally sharing those nodes */
++static unsigned long uksm_pages_sharing;
++
++/* The number of nodes in the unstable tree */
++static unsigned long uksm_pages_unshared;
++
++/*
++ * Milliseconds ksmd should sleep between scans,
++ * >= 100ms to be consistent with
++ * scan_time_to_sleep_msec()
++ */
++static unsigned int uksm_sleep_jiffies;
++
++/* The real value for the uksmd next sleep */
++static unsigned int uksm_sleep_real;
++
++/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
++static unsigned int uksm_sleep_saved;
++
++/* Max percentage of cpu utilization ksmd can take to scan in one batch */
++static unsigned int uksm_max_cpu_percentage;
++
++static int uksm_cpu_governor;
++
++static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
++
++struct uksm_cpu_preset_s {
++	int cpu_ratio[SCAN_LADDER_SIZE];
++	unsigned int cover_msecs[SCAN_LADDER_SIZE];
++	unsigned int max_cpu; /* percentage */
++};
++
++struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
++	{ {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
++	{ {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
++	{ {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
++	{ {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
++};
++
++/* The default value for uksm_ema_page_time if it's not initialized */
++#define UKSM_PAGE_TIME_DEFAULT	500
++
++/* Cost to scan one page, as an exponential moving average, in nsecs */
++static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
++
++/* The exponential moving average alpha weight, in percentage. */
++#define EMA_ALPHA	20
++
++/*
++ * The threshold used to filter out thrashing areas.
++ * If it == 0, filtering is disabled, otherwise it's the percentage upper bound
++ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
++ * will be considered as having a zero duplication ratio.
++ */
++static unsigned int uksm_thrash_threshold = 50;
++
++/* How much dedup ratio is considered to be abundant */
++static unsigned int uksm_abundant_threshold = 10;
++
++/* All slots having merged pages in this eval round. */
++struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
++
++/* How many times the ksmd has slept since startup */
++static unsigned long long uksm_sleep_times;
++
++#define UKSM_RUN_STOP	0
++#define UKSM_RUN_MERGE	1
++static unsigned int uksm_run = 1;
++
++static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
++static DEFINE_MUTEX(uksm_thread_mutex);
++
++/*
++ * List vma_slot_new is for newly created vma_slot waiting to be added by
++ * ksmd. If one cannot be added (e.g. because it's too small), it's moved to
++ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
++ * VMA has been removed/freed.
++ */ ++struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); ++struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); ++struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); ++static DEFINE_SPINLOCK(vma_slot_list_lock); ++ ++/* The unstable tree heads */ ++static struct rb_root root_unstable_tree = RB_ROOT; ++ ++/* ++ * All tree_nodes are in a list to be freed at once when unstable tree is ++ * freed after each scan round. ++ */ ++static struct list_head unstable_tree_node_list = ++ LIST_HEAD_INIT(unstable_tree_node_list); ++ ++/* List contains all stable nodes */ ++static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); ++ ++/* ++ * When the hash strength is changed, the stable tree must be delta_hashed and ++ * re-structured. We use two set of below structs to speed up the ++ * re-structuring of stable tree. ++ */ ++static struct list_head ++stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), ++ LIST_HEAD_INIT(stable_tree_node_list[1])}; ++ ++static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; ++static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; ++static struct rb_root *root_stable_treep = &root_stable_tree[0]; ++static unsigned long stable_tree_index; ++ ++/* The hash strength needed to hash a full page */ ++#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) ++ ++/* The hash strength needed for loop-back hashing */ ++#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) ++ ++/* The random offsets in a page */ ++static u32 *random_nums; ++ ++/* The hash strength */ ++static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; ++ ++/* The delta value each time the hash strength increases or decreases */ ++static unsigned long hash_strength_delta; ++#define HASH_STRENGTH_DELTA_MAX 5 ++ ++/* The time we have saved due to random_sample_hash */ ++static u64 rshash_pos; ++ ++/* The time we have wasted due to hash collision */ ++static u64 rshash_neg; ++ ++struct uksm_benefit { ++ u64 pos; ++ u64 neg; ++ u64 scanned; ++ unsigned long base; ++} benefit; ++ ++/* ++ * The relative cost of memcmp, compared to 1 time unit of random sample ++ * hash, this value is tested when ksm module is initialized ++ */ ++static unsigned long memcmp_cost; ++ ++static unsigned long rshash_neg_cont_zero; ++static unsigned long rshash_cont_obscure; ++ ++/* The possible states of hash strength adjustment heuristic */ ++enum rshash_states { ++ RSHASH_STILL, ++ RSHASH_TRYUP, ++ RSHASH_TRYDOWN, ++ RSHASH_NEW, ++ RSHASH_PRE_STILL, ++}; ++ ++/* The possible direction we are about to adjust hash strength */ ++enum rshash_direct { ++ GO_UP, ++ GO_DOWN, ++ OBSCURE, ++ STILL, ++}; ++ ++/* random sampling hash state machine */ ++static struct { ++ enum rshash_states state; ++ enum rshash_direct pre_direct; ++ u8 below_count; ++ /* Keep a lookup window of size 5, iff above_count/below_count > 3 ++ * in this window we stop trying. 
++ */ ++ u8 lookup_window_index; ++ u64 stable_benefit; ++ unsigned long turn_point_down; ++ unsigned long turn_benefit_down; ++ unsigned long turn_point_up; ++ unsigned long turn_benefit_up; ++ unsigned long stable_point; ++} rshash_state; ++ ++/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ ++static u32 *zero_hash_table; ++ ++static inline struct node_vma *alloc_node_vma(void) ++{ ++ struct node_vma *node_vma; ++ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL); ++ if (node_vma) { ++ INIT_HLIST_HEAD(&node_vma->rmap_hlist); ++ INIT_HLIST_NODE(&node_vma->hlist); ++ } ++ return node_vma; ++} ++ ++static inline void free_node_vma(struct node_vma *node_vma) ++{ ++ kmem_cache_free(node_vma_cache, node_vma); ++} ++ ++ ++static inline struct vma_slot *alloc_vma_slot(void) ++{ ++ struct vma_slot *slot; ++ ++ /* ++ * In case ksm is not initialized by now. ++ * Oops, we need to consider the call site of uksm_init() in the future. ++ */ ++ if (!vma_slot_cache) ++ return NULL; ++ ++ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL); ++ if (slot) { ++ INIT_LIST_HEAD(&slot->slot_list); ++ INIT_LIST_HEAD(&slot->dedup_list); ++ slot->flags |= UKSM_SLOT_NEED_RERAND; ++ } ++ return slot; ++} ++ ++static inline void free_vma_slot(struct vma_slot *vma_slot) ++{ ++ kmem_cache_free(vma_slot_cache, vma_slot); ++} ++ ++ ++ ++static inline struct rmap_item *alloc_rmap_item(void) ++{ ++ struct rmap_item *rmap_item; ++ ++ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); ++ if (rmap_item) { ++ /* bug on lowest bit is not clear for flag use */ ++ BUG_ON(is_addr(rmap_item)); ++ } ++ return rmap_item; ++} ++ ++static inline void free_rmap_item(struct rmap_item *rmap_item) ++{ ++ rmap_item->slot = NULL; /* debug safety */ ++ kmem_cache_free(rmap_item_cache, rmap_item); ++} ++ ++static inline struct stable_node *alloc_stable_node(void) ++{ ++ struct stable_node *node; ++ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC); ++ if (!node) ++ return NULL; ++ ++ INIT_HLIST_HEAD(&node->hlist); ++ list_add(&node->all_list, &stable_node_list); ++ return node; ++} ++ ++static inline void free_stable_node(struct stable_node *stable_node) ++{ ++ list_del(&stable_node->all_list); ++ kmem_cache_free(stable_node_cache, stable_node); ++} ++ ++static inline struct tree_node *alloc_tree_node(struct list_head *list) ++{ ++ struct tree_node *node; ++ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC); ++ if (!node) ++ return NULL; ++ ++ list_add(&node->all_list, list); ++ return node; ++} ++ ++static inline void free_tree_node(struct tree_node *node) ++{ ++ list_del(&node->all_list); ++ kmem_cache_free(tree_node_cache, node); ++} ++ ++static void uksm_drop_anon_vma(struct rmap_item *rmap_item) ++{ ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ ++ put_anon_vma(anon_vma); ++} ++ ++ ++/** ++ * Remove a stable node from stable_tree, may unlink from its tree_node and ++ * may remove its parent tree_node if no other stable node is pending. ++ * ++ * @stable_node The node need to be removed ++ * @unlink_rb Will this node be unlinked from the rbtree? ++ * @remove_tree_ node Will its tree_node be removed if empty? 
++ */ ++static void remove_node_from_stable_tree(struct stable_node *stable_node, ++ int unlink_rb, int remove_tree_node) ++{ ++ struct node_vma *node_vma; ++ struct rmap_item *rmap_item; ++ struct hlist_node *hlist, *rmap_hlist, *n; ++ ++ if (!hlist_empty(&stable_node->hlist)) { ++ hlist_for_each_entry_safe(node_vma, hlist, n, ++ &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, rmap_hlist, ++ &node_vma->rmap_hlist, hlist) { ++ uksm_pages_sharing--; ++ ++ uksm_drop_anon_vma(rmap_item); ++ rmap_item->address &= PAGE_MASK; ++ } ++ free_node_vma(node_vma); ++ cond_resched(); ++ } ++ ++ /* the last one is counted as shared */ ++ uksm_pages_shared--; ++ uksm_pages_sharing++; ++ } ++ ++ if (stable_node->tree_node && unlink_rb) { ++ rb_erase(&stable_node->node, ++ &stable_node->tree_node->sub_root); ++ ++ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && ++ remove_tree_node) { ++ rb_erase(&stable_node->tree_node->node, ++ root_stable_treep); ++ free_tree_node(stable_node->tree_node); ++ } else { ++ stable_node->tree_node->count--; ++ } ++ } ++ ++ free_stable_node(stable_node); ++} ++ ++ ++/* ++ * get_uksm_page: checks if the page indicated by the stable node ++ * is still its ksm page, despite having held no reference to it. ++ * In which case we can trust the content of the page, and it ++ * returns the gotten page; but if the page has now been zapped, ++ * remove the stale node from the stable tree and return NULL. ++ * ++ * You would expect the stable_node to hold a reference to the ksm page. ++ * But if it increments the page's count, swapping out has to wait for ++ * ksmd to come around again before it can free the page, which may take ++ * seconds or even minutes: much too unresponsive. So instead we use a ++ * "keyhole reference": access to the ksm page from the stable node peeps ++ * out through its keyhole to see if that page still holds the right key, ++ * pointing back to this stable node. This relies on freeing a PageAnon ++ * page to reset its page->mapping to NULL, and relies on no other use of ++ * a page to put something that might look like our key in page->mapping. ++ * ++ * include/linux/pagemap.h page_cache_get_speculative() is a good reference, ++ * but this is different - made simpler by uksm_thread_mutex being held, but ++ * interesting for assuming that no other use of the struct page could ever ++ * put our expected_mapping into page->mapping (or a field of the union which ++ * coincides with page->mapping). The RCU calls are not for KSM at all, but ++ * to keep the page_count protocol described with page_cache_get_speculative. ++ * ++ * Note: it is possible that get_uksm_page() will return NULL one moment, ++ * then page the next, if the page is in between page_freeze_refs() and ++ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page ++ * is on its way to being freed; but it is an anomaly to bear in mind. ++ * ++ * @unlink_rb: if the removal of this node will firstly unlink from ++ * its rbtree. stable_node_reinsert will prevent this when restructuring the ++ * node from its old tree. ++ * ++ * @remove_tree_node: if this is the last one of its tree_node, will the ++ * tree_node be freed ? If we are inserting stable node, this tree_node may ++ * be reused, so don't free it. 
++ */ ++static struct page *get_uksm_page(struct stable_node *stable_node, ++ int unlink_rb, int remove_tree_node) ++{ ++ struct page *page; ++ void *expected_mapping; ++ ++ page = pfn_to_page(stable_node->kpfn); ++ expected_mapping = (void *)stable_node + ++ (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); ++ rcu_read_lock(); ++ if (page->mapping != expected_mapping) ++ goto stale; ++ if (!get_page_unless_zero(page)) ++ goto stale; ++ if (page->mapping != expected_mapping) { ++ put_page(page); ++ goto stale; ++ } ++ rcu_read_unlock(); ++ return page; ++stale: ++ rcu_read_unlock(); ++ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); ++ ++ return NULL; ++} ++ ++/* ++ * Removing rmap_item from stable or unstable tree. ++ * This function will clean the information from the stable/unstable tree. ++ */ ++static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ++{ ++ if (rmap_item->address & STABLE_FLAG) { ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct page *page; ++ ++ node_vma = rmap_item->head; ++ stable_node = node_vma->head; ++ page = get_uksm_page(stable_node, 1, 1); ++ if (!page) ++ goto out; ++ ++ /* ++ * page lock is needed because it's racing with ++ * try_to_unmap_ksm(), etc. ++ */ ++ lock_page(page); ++ hlist_del(&rmap_item->hlist); ++ ++ if (hlist_empty(&node_vma->rmap_hlist)) { ++ hlist_del(&node_vma->hlist); ++ free_node_vma(node_vma); ++ } ++ unlock_page(page); ++ ++ put_page(page); ++ if (hlist_empty(&stable_node->hlist)) { ++ /* do NOT call remove_node_from_stable_tree() here, ++ * it's possible for a forked rmap_item not in ++ * stable tree while the in-tree rmap_items were ++ * deleted. ++ */ ++ uksm_pages_shared--; ++ } else ++ uksm_pages_sharing--; ++ ++ ++ uksm_drop_anon_vma(rmap_item); ++ } else if (rmap_item->address & UNSTABLE_FLAG) { ++ if (rmap_item->hash_round == uksm_hash_round) { ++ ++ rb_erase(&rmap_item->node, ++ &rmap_item->tree_node->sub_root); ++ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { ++ rb_erase(&rmap_item->tree_node->node, ++ &root_unstable_tree); ++ ++ free_tree_node(rmap_item->tree_node); ++ } else ++ rmap_item->tree_node->count--; ++ } ++ uksm_pages_unshared--; ++ } ++ ++ rmap_item->address &= PAGE_MASK; ++ rmap_item->hash_max = 0; ++ ++out: ++ cond_resched(); /* we're called from many long loops */ ++} ++ ++static inline int slot_in_uksm(struct vma_slot *slot) ++{ ++ return list_empty(&slot->slot_list); ++} ++ ++/* ++ * Test if the mm is exiting ++ */ ++static inline bool uksm_test_exit(struct mm_struct *mm) ++{ ++ return atomic_read(&mm->mm_users) == 0; ++} ++ ++/** ++ * Need to do two things: ++ * 1. check if slot was moved to del list ++ * 2. make sure the mmap_sem is manipulated under valid vma. ++ * ++ * My concern here is that in some cases, this may make ++ * vma_slot_list_lock() waiters to serialized further by some ++ * sem->wait_lock, can this really be expensive? ++ * ++ * ++ * @return ++ * 0: if successfully locked mmap_sem ++ * -ENOENT: this slot was moved to del list ++ * -EBUSY: vma lock failed ++ */ ++static int try_down_read_slot_mmap_sem(struct vma_slot *slot) ++{ ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ struct rw_semaphore *sem; ++ ++ spin_lock(&vma_slot_list_lock); ++ ++ /* the slot_list was removed and inited from new list, when it enters ++ * uksm_list. 
If now it's not empty, then it must be moved to del list ++ */ ++ if (!slot_in_uksm(slot)) { ++ spin_unlock(&vma_slot_list_lock); ++ return -ENOENT; ++ } ++ ++ BUG_ON(slot->pages != vma_pages(slot->vma)); ++ /* Ok, vma still valid */ ++ vma = slot->vma; ++ mm = vma->vm_mm; ++ sem = &mm->mmap_sem; ++ ++ if (uksm_test_exit(mm)) { ++ spin_unlock(&vma_slot_list_lock); ++ return -ENOENT; ++ } ++ ++ if (down_read_trylock(sem)) { ++ spin_unlock(&vma_slot_list_lock); ++ return 0; ++ } ++ ++ spin_unlock(&vma_slot_list_lock); ++ return -EBUSY; ++} ++ ++static inline unsigned long ++vma_page_address(struct page *page, struct vm_area_struct *vma) ++{ ++ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); ++ unsigned long address; ++ ++ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); ++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { ++ /* page should be within @vma mapping range */ ++ return -EFAULT; ++ } ++ return address; ++} ++ ++ ++/* return 0 on success with the item's mmap_sem locked */ ++static inline int get_mergeable_page_lock_mmap(struct rmap_item *item) ++{ ++ struct mm_struct *mm; ++ struct vma_slot *slot = item->slot; ++ int err = -EINVAL; ++ ++ struct page *page; ++ ++ /* ++ * try_down_read_slot_mmap_sem() returns non-zero if the slot ++ * has been removed by uksm_remove_vma(). ++ */ ++ if (try_down_read_slot_mmap_sem(slot)) ++ return -EBUSY; ++ ++ mm = slot->vma->vm_mm; ++ ++ if (uksm_test_exit(mm)) ++ goto failout_up; ++ ++ page = item->page; ++ rcu_read_lock(); ++ if (!get_page_unless_zero(page)) { ++ rcu_read_unlock(); ++ goto failout_up; ++ } ++ ++ /* No need to consider huge page here. */ ++ if (item->slot->vma->anon_vma != page_anon_vma(page) || ++ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) { ++ /* ++ * TODO: ++ * should we release this item becase of its stale page ++ * mapping? ++ */ ++ put_page(page); ++ rcu_read_unlock(); ++ goto failout_up; ++ } ++ rcu_read_unlock(); ++ return 0; ++ ++failout_up: ++ up_read(&mm->mmap_sem); ++ return err; ++} ++ ++/* ++ * What kind of VMA is considered ? ++ */ ++static inline int vma_can_enter(struct vm_area_struct *vma) ++{ ++ return uksm_flags_can_scan(vma->vm_flags); ++} ++ ++/* ++ * Called whenever a fresh new vma is created A new vma_slot. ++ * is created and inserted into a global list Must be called. ++ * after vma is inserted to its mm . ++ */ ++void uksm_vma_add_new(struct vm_area_struct *vma) ++{ ++ struct vma_slot *slot; ++ ++ if (!vma_can_enter(vma)) { ++ vma->uksm_vma_slot = NULL; ++ return; ++ } ++ ++ slot = alloc_vma_slot(); ++ if (!slot) { ++ vma->uksm_vma_slot = NULL; ++ return; ++ } ++ ++ vma->uksm_vma_slot = slot; ++ vma->vm_flags |= VM_MERGEABLE; ++ slot->vma = vma; ++ slot->mm = vma->vm_mm; ++ slot->ctime_j = jiffies; ++ slot->pages = vma_pages(vma); ++ spin_lock(&vma_slot_list_lock); ++ list_add_tail(&slot->slot_list, &vma_slot_new); ++ spin_unlock(&vma_slot_list_lock); ++} ++ ++/* ++ * Called after vma is unlinked from its mm ++ */ ++void uksm_remove_vma(struct vm_area_struct *vma) ++{ ++ struct vma_slot *slot; ++ ++ if (!vma->uksm_vma_slot) ++ return; ++ ++ slot = vma->uksm_vma_slot; ++ spin_lock(&vma_slot_list_lock); ++ if (slot_in_uksm(slot)) { ++ /** ++ * This slot has been added by ksmd, so move to the del list ++ * waiting ksmd to free it. ++ */ ++ list_add_tail(&slot->slot_list, &vma_slot_del); ++ } else { ++ /** ++ * It's still on new list. It's ok to free slot directly. 
++ */ ++ list_del(&slot->slot_list); ++ free_vma_slot(slot); ++ } ++ spin_unlock(&vma_slot_list_lock); ++ vma->uksm_vma_slot = NULL; ++} ++ ++/* 32/3 < they < 32/2 */ ++#define shiftl 8 ++#define shiftr 12 ++ ++#define HASH_FROM_TO(from, to) \ ++for (index = from; index < to; index++) { \ ++ pos = random_nums[index]; \ ++ hash += key[pos]; \ ++ hash += (hash << shiftl); \ ++ hash ^= (hash >> shiftr); \ ++} ++ ++ ++#define HASH_FROM_DOWN_TO(from, to) \ ++for (index = from - 1; index >= to; index--) { \ ++ hash ^= (hash >> shiftr); \ ++ hash ^= (hash >> (shiftr*2)); \ ++ hash -= (hash << shiftl); \ ++ hash += (hash << (shiftl*2)); \ ++ pos = random_nums[index]; \ ++ hash -= key[pos]; \ ++} ++ ++/* ++ * The main random sample hash function. ++ */ ++static u32 random_sample_hash(void *addr, u32 hash_strength) ++{ ++ u32 hash = 0xdeadbeef; ++ int index, pos, loop = hash_strength; ++ u32 *key = (u32 *)addr; ++ ++ if (loop > HASH_STRENGTH_FULL) ++ loop = HASH_STRENGTH_FULL; ++ ++ HASH_FROM_TO(0, loop); ++ ++ if (hash_strength > HASH_STRENGTH_FULL) { ++ loop = hash_strength - HASH_STRENGTH_FULL; ++ HASH_FROM_TO(0, loop); ++ } ++ ++ return hash; ++} ++ ++ ++/** ++ * It's used when hash strength is adjusted ++ * ++ * @addr The page's virtual address ++ * @from The original hash strength ++ * @to The hash strength changed to ++ * @hash The hash value generated with "from" hash value ++ * ++ * return the hash value ++ */ ++static u32 delta_hash(void *addr, int from, int to, u32 hash) ++{ ++ u32 *key = (u32 *)addr; ++ int index, pos; /* make sure they are int type */ ++ ++ if (to > from) { ++ if (from >= HASH_STRENGTH_FULL) { ++ from -= HASH_STRENGTH_FULL; ++ to -= HASH_STRENGTH_FULL; ++ HASH_FROM_TO(from, to); ++ } else if (to <= HASH_STRENGTH_FULL) { ++ HASH_FROM_TO(from, to); ++ } else { ++ HASH_FROM_TO(from, HASH_STRENGTH_FULL); ++ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); ++ } ++ } else { ++ if (from <= HASH_STRENGTH_FULL) { ++ HASH_FROM_DOWN_TO(from, to); ++ } else if (to >= HASH_STRENGTH_FULL) { ++ from -= HASH_STRENGTH_FULL; ++ to -= HASH_STRENGTH_FULL; ++ HASH_FROM_DOWN_TO(from, to); ++ } else { ++ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); ++ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); ++ } ++ } ++ ++ return hash; ++} ++ ++ ++ ++ ++#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) ++ ++/** ++ * ++ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round ++ * has finished. ++ * ++ * return 0 if no page has been scanned since last call, 1 otherwise. 
++ */ ++static inline int encode_benefit(void) ++{ ++ u64 scanned_delta, pos_delta, neg_delta; ++ unsigned long base = benefit.base; ++ ++ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; ++ ++ if (!scanned_delta) ++ return 0; ++ ++ scanned_delta >>= base; ++ pos_delta = rshash_pos >> base; ++ neg_delta = rshash_neg >> base; ++ ++ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || ++ CAN_OVERFLOW_U64(benefit.neg, neg_delta) || ++ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { ++ benefit.scanned >>= 1; ++ benefit.neg >>= 1; ++ benefit.pos >>= 1; ++ benefit.base++; ++ scanned_delta >>= 1; ++ pos_delta >>= 1; ++ neg_delta >>= 1; ++ } ++ ++ benefit.pos += pos_delta; ++ benefit.neg += neg_delta; ++ benefit.scanned += scanned_delta; ++ ++ BUG_ON(!benefit.scanned); ++ ++ rshash_pos = rshash_neg = 0; ++ uksm_pages_scanned_last = uksm_pages_scanned; ++ ++ return 1; ++} ++ ++static inline void reset_benefit(void) ++{ ++ benefit.pos = 0; ++ benefit.neg = 0; ++ benefit.base = 0; ++ benefit.scanned = 0; ++} ++ ++static inline void inc_rshash_pos(unsigned long delta) ++{ ++ if (CAN_OVERFLOW_U64(rshash_pos, delta)) ++ encode_benefit(); ++ ++ rshash_pos += delta; ++} ++ ++static inline void inc_rshash_neg(unsigned long delta) ++{ ++ if (CAN_OVERFLOW_U64(rshash_neg, delta)) ++ encode_benefit(); ++ ++ rshash_neg += delta; ++} ++ ++ ++static inline u32 page_hash(struct page *page, unsigned long hash_strength, ++ int cost_accounting) ++{ ++ u32 val; ++ unsigned long delta; ++ ++ void *addr = kmap_atomic(page, KM_USER0); ++ ++ val = random_sample_hash(addr, hash_strength); ++ kunmap_atomic(addr, KM_USER0); ++ ++ if (cost_accounting) { ++ if (HASH_STRENGTH_FULL > hash_strength) ++ delta = HASH_STRENGTH_FULL - hash_strength; ++ else ++ delta = 0; ++ ++ inc_rshash_pos(delta); ++ } ++ ++ return val; ++} ++ ++static int memcmp_pages(struct page *page1, struct page *page2, ++ int cost_accounting) ++{ ++ char *addr1, *addr2; ++ int ret; ++ ++ addr1 = kmap_atomic(page1, KM_USER0); ++ addr2 = kmap_atomic(page2, KM_USER1); ++ ret = memcmp(addr1, addr2, PAGE_SIZE); ++ kunmap_atomic(addr2, KM_USER1); ++ kunmap_atomic(addr1, KM_USER0); ++ ++ if (cost_accounting) ++ inc_rshash_neg(memcmp_cost); ++ ++ return ret; ++} ++ ++static inline int pages_identical(struct page *page1, struct page *page2) ++{ ++ return !memcmp_pages(page1, page2, 0); ++} ++ ++static inline int is_page_full_zero(struct page *page) ++{ ++ char *addr; ++ int ret; ++ ++ addr = kmap_atomic(page, KM_USER0); ++ ret = is_full_zero(addr, PAGE_SIZE); ++ kunmap_atomic(addr, KM_USER0); ++ ++ return ret; ++} ++ ++static int write_protect_page(struct vm_area_struct *vma, struct page *page, ++ pte_t *orig_pte, pte_t *old_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ unsigned long addr; ++ pte_t *ptep; ++ spinlock_t *ptl; ++ int swapped; ++ int err = -EFAULT; ++ ++ addr = page_address_in_vma(page, vma); ++ if (addr == -EFAULT) ++ goto out; ++ ++ BUG_ON(PageTransCompound(page)); ++ ptep = page_check_address(page, mm, addr, &ptl, 0); ++ if (!ptep) ++ goto out; ++ ++ if (old_pte) ++ *old_pte = *ptep; ++ ++ if (pte_write(*ptep) || pte_dirty(*ptep)) { ++ pte_t entry; ++ ++ swapped = PageSwapCache(page); ++ flush_cache_page(vma, addr, page_to_pfn(page)); ++ /* ++ * Ok this is tricky, when get_user_pages_fast() run it doesnt ++ * take any lock, therefore the check that we are going to make ++ * with the pagecount against the mapcount is racey and ++ * O_DIRECT can happen right after the check. 
++ * So we clear the pte and flush the tlb before the check ++ * this assure us that no O_DIRECT can happen after the check ++ * or in the middle of the check. ++ */ ++ entry = ptep_clear_flush(vma, addr, ptep); ++ /* ++ * Check that no O_DIRECT or similar I/O is in progress on the ++ * page ++ */ ++ if (page_mapcount(page) + 1 + swapped != page_count(page)) { ++ set_pte_at(mm, addr, ptep, entry); ++ goto out_unlock; ++ } ++ if (pte_dirty(entry)) ++ set_page_dirty(page); ++ entry = pte_mkclean(pte_wrprotect(entry)); ++ set_pte_at_notify(mm, addr, ptep, entry); ++ } ++ *orig_pte = *ptep; ++ err = 0; ++ ++out_unlock: ++ pte_unmap_unlock(ptep, ptl); ++out: ++ return err; ++} ++ ++#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */ ++#define MERGE_ERR_COLLI 2 /* there is a collision */ ++#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ ++#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ ++ ++ ++/** ++ * replace_page - replace page in vma by new ksm page ++ * @vma: vma that holds the pte pointing to page ++ * @page: the page we are replacing by kpage ++ * @kpage: the ksm page we replace page by ++ * @orig_pte: the original value of the pte ++ * ++ * Returns 0 on success, MERGE_ERR_PGERR on failure. ++ */ ++static int replace_page(struct vm_area_struct *vma, struct page *page, ++ struct page *kpage, pte_t orig_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *ptep; ++ spinlock_t *ptl; ++ pte_t entry; ++ ++ unsigned long addr; ++ int err = MERGE_ERR_PGERR; ++ ++ addr = page_address_in_vma(page, vma); ++ if (addr == -EFAULT) ++ goto out; ++ ++ pgd = pgd_offset(mm, addr); ++ if (!pgd_present(*pgd)) ++ goto out; ++ ++ pud = pud_offset(pgd, addr); ++ if (!pud_present(*pud)) ++ goto out; ++ ++ pmd = pmd_offset(pud, addr); ++ BUG_ON(pmd_trans_huge(*pmd)); ++ if (!pmd_present(*pmd)) ++ goto out; ++ ++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ if (!pte_same(*ptep, orig_pte)) { ++ pte_unmap_unlock(ptep, ptl); ++ goto out; ++ } ++ ++ flush_cache_page(vma, addr, pte_pfn(*ptep)); ++ ptep_clear_flush(vma, addr, ptep); ++ entry = mk_pte(kpage, vma->vm_page_prot); ++ ++ /* special treatment is needed for zero_page */ ++ if ((page_to_pfn(kpage) == uksm_zero_pfn) || ++ (page_to_pfn(kpage) == zero_pfn)) ++ entry = pte_mkspecial(entry); ++ else { ++ get_page(kpage); ++ page_add_anon_rmap(kpage, vma, addr); ++ } ++ ++ set_pte_at_notify(mm, addr, ptep, entry); ++ ++ page_remove_rmap(page); ++ if (!page_mapped(page)) ++ try_to_free_swap(page); ++ put_page(page); ++ ++ pte_unmap_unlock(ptep, ptl); ++ err = 0; ++out: ++ return err; ++} ++ ++ ++/** ++ * Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. The ++ * zero hash value at HASH_STRENGTH_MAX is used to indicated that its ++ * hash_max member has not been calculated. 
++ * ++ * @page The page needs to be hashed ++ * @hash_old The hash value calculated with current hash strength ++ * ++ * return the new hash value calculated at HASH_STRENGTH_MAX ++ */ ++static inline u32 page_hash_max(struct page *page, u32 hash_old) ++{ ++ u32 hash_max = 0; ++ void *addr; ++ ++ addr = kmap_atomic(page, KM_USER0); ++ hash_max = delta_hash(addr, hash_strength, ++ HASH_STRENGTH_MAX, hash_old); ++ ++ kunmap_atomic(addr, KM_USER0); ++ ++ if (!hash_max) ++ hash_max = 1; ++ ++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); ++ return hash_max; ++} ++ ++/* ++ * We compare the hash again, to ensure that it is really a hash collision ++ * instead of being caused by page write. ++ */ ++static inline int check_collision(struct rmap_item *rmap_item, ++ u32 hash) ++{ ++ int err; ++ struct page *page = rmap_item->page; ++ ++ /* if this rmap_item has already been hash_maxed, then the collision ++ * must appears in the second-level rbtree search. In this case we check ++ * if its hash_max value has been changed. Otherwise, the collision ++ * happens in the first-level rbtree search, so we check against it's ++ * current hash value. ++ */ ++ if (rmap_item->hash_max) { ++ inc_rshash_neg(memcmp_cost); ++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); ++ ++ if (rmap_item->hash_max == page_hash_max(page, hash)) ++ err = MERGE_ERR_COLLI; ++ else ++ err = MERGE_ERR_CHANGED; ++ } else { ++ inc_rshash_neg(memcmp_cost + hash_strength); ++ ++ if (page_hash(page, hash_strength, 0) == hash) ++ err = MERGE_ERR_COLLI; ++ else ++ err = MERGE_ERR_CHANGED; ++ } ++ ++ return err; ++} ++ ++static struct page *page_trans_compound_anon(struct page *page) ++{ ++ if (PageTransCompound(page)) { ++ struct page *head = compound_trans_head(page); ++ /* ++ * head may actually be splitted and freed from under ++ * us but it's ok here. ++ */ ++ if (PageAnon(head)) ++ return head; ++ } ++ return NULL; ++} ++ ++static int page_trans_compound_anon_split(struct page *page) ++{ ++ int ret = 0; ++ struct page *transhuge_head = page_trans_compound_anon(page); ++ if (transhuge_head) { ++ /* Get the reference on the head to split it. */ ++ if (get_page_unless_zero(transhuge_head)) { ++ /* ++ * Recheck we got the reference while the head ++ * was still anonymous. ++ */ ++ if (PageAnon(transhuge_head)) ++ ret = split_huge_page(transhuge_head); ++ else ++ /* ++ * Retry later if split_huge_page run ++ * from under us. ++ */ ++ ret = 1; ++ put_page(transhuge_head); ++ } else ++ /* Retry later if split_huge_page run from under us. */ ++ ret = 1; ++ } ++ return ret; ++} ++ ++/** ++ * Try to merge a rmap_item.page with a kpage in stable node. kpage must ++ * already be a ksm page. ++ * ++ * @return 0 if the pages were merged, -EFAULT otherwise. ++ */ ++static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, ++ struct page *kpage, u32 hash) ++{ ++ struct vm_area_struct *vma = rmap_item->slot->vma; ++ struct mm_struct *mm = vma->vm_mm; ++ pte_t orig_pte = __pte(0); ++ int err = MERGE_ERR_PGERR; ++ struct page *page; ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ page = rmap_item->page; ++ ++ if (page == kpage) { /* ksm page forked */ ++ err = 0; ++ goto out; ++ } ++ ++ if (PageTransCompound(page) && page_trans_compound_anon_split(page)) ++ goto out; ++ BUG_ON(PageTransCompound(page)); ++ ++ if (!PageAnon(page) || !PageKsm(kpage)) ++ goto out; ++ ++ /* ++ * We need the page lock to read a stable PageSwapCache in ++ * write_protect_page(). 
We use trylock_page() instead of ++ * lock_page() because we don't want to wait here - we ++ * prefer to continue scanning and merging different pages, ++ * then come back to this page when it is unlocked. ++ */ ++ if (!trylock_page(page)) ++ goto out; ++ /* ++ * If this anonymous page is mapped only here, its pte may need ++ * to be write-protected. If it's mapped elsewhere, all of its ++ * ptes are necessarily already write-protected. But in either ++ * case, we need to lock and check page_count is not raised. ++ */ ++ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { ++ if (pages_identical(page, kpage)) ++ err = replace_page(vma, page, kpage, orig_pte); ++ else ++ err = check_collision(rmap_item, hash); ++ } ++ ++ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { ++ munlock_vma_page(page); ++ if (!PageMlocked(kpage)) { ++ unlock_page(page); ++ lock_page(kpage); ++ mlock_vma_page(kpage); ++ page = kpage; /* for final unlock */ ++ } ++ } ++ ++ unlock_page(page); ++out: ++ return err; ++} ++ ++ ++ ++/** ++ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance ++ * to restore a page mapping that has been changed in try_to_merge_two_pages. ++ * ++ * @return 0 on success. ++ */ ++static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, ++ pte_t orig_pte, pte_t wprt_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *ptep; ++ spinlock_t *ptl; ++ ++ int err = -EFAULT; ++ ++ pgd = pgd_offset(mm, addr); ++ if (!pgd_present(*pgd)) ++ goto out; ++ ++ pud = pud_offset(pgd, addr); ++ if (!pud_present(*pud)) ++ goto out; ++ ++ pmd = pmd_offset(pud, addr); ++ if (!pmd_present(*pmd)) ++ goto out; ++ ++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ if (!pte_same(*ptep, wprt_pte)) { ++ /* already copied, let it be */ ++ pte_unmap_unlock(ptep, ptl); ++ goto out; ++ } ++ ++ /* ++ * Good boy, still here. When we still get the ksm page, it does not ++ * return to the free page pool, there is no way that a pte was changed ++ * to other page and gets back to this page. And remind that ksm page ++ * do not reuse in do_wp_page(). So it's safe to restore the original ++ * pte. ++ */ ++ flush_cache_page(vma, addr, pte_pfn(*ptep)); ++ ptep_clear_flush(vma, addr, ptep); ++ set_pte_at_notify(mm, addr, ptep, orig_pte); ++ ++ pte_unmap_unlock(ptep, ptl); ++ err = 0; ++out: ++ return err; ++} ++ ++/** ++ * try_to_merge_two_pages() - take two identical pages and prepare ++ * them to be merged into one page(rmap_item->page) ++ * ++ * @return 0 if we successfully merged two identical pages into ++ * one ksm page. MERGE_ERR_COLLI if it's only a hash collision ++ * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been ++ * changed since it's hashed. MERGE_ERR_PGERR otherwise. 
++ * ++ */ ++static int try_to_merge_two_pages(struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ u32 hash) ++{ ++ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); ++ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); ++ struct vm_area_struct *vma1 = rmap_item->slot->vma; ++ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; ++ struct page *page = rmap_item->page; ++ struct page *tree_page = tree_rmap_item->page; ++ int err = MERGE_ERR_PGERR; ++ struct address_space *saved_mapping; ++ ++ ++ if (rmap_item->page == tree_rmap_item->page) ++ goto out; ++ ++ if (PageTransCompound(page) && page_trans_compound_anon_split(page)) ++ goto out; ++ BUG_ON(PageTransCompound(page)); ++ ++ if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page)) ++ goto out; ++ BUG_ON(PageTransCompound(tree_page)); ++ ++ if (!PageAnon(page) || !PageAnon(tree_page)) ++ goto out; ++ ++ if (!trylock_page(page)) ++ goto out; ++ ++ ++ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { ++ unlock_page(page); ++ goto out; ++ } ++ ++ /* ++ * While we hold page lock, upgrade page from ++ * PageAnon+anon_vma to PageKsm+NULL stable_node: ++ * stable_tree_insert() will update stable_node. ++ */ ++ saved_mapping = page->mapping; ++ set_page_stable_node(page, NULL); ++ mark_page_accessed(page); ++ unlock_page(page); ++ ++ if (!trylock_page(tree_page)) ++ goto restore_out; ++ ++ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if (pages_identical(page, tree_page)) { ++ err = replace_page(vma2, tree_page, page, wprt_pte2); ++ if (err) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if ((vma2->vm_flags & VM_LOCKED)) { ++ munlock_vma_page(tree_page); ++ if (!PageMlocked(page)) { ++ unlock_page(tree_page); ++ lock_page(page); ++ mlock_vma_page(page); ++ tree_page = page; /* for final unlock */ ++ } ++ } ++ ++ unlock_page(tree_page); ++ ++ goto out; /* success */ ++ ++ } else { ++ if (tree_rmap_item->hash_max && ++ tree_rmap_item->hash_max == rmap_item->hash_max) { ++ err = MERGE_ERR_COLLI_MAX; ++ } else if (page_hash(page, hash_strength, 0) == ++ page_hash(tree_page, hash_strength, 0)) { ++ inc_rshash_neg(memcmp_cost + hash_strength * 2); ++ err = MERGE_ERR_COLLI; ++ } else { ++ err = MERGE_ERR_CHANGED; ++ } ++ ++ unlock_page(tree_page); ++ } ++ ++restore_out: ++ lock_page(page); ++ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), ++ orig_pte1, wprt_pte1)) ++ page->mapping = saved_mapping; ++ ++ unlock_page(page); ++out: ++ return err; ++} ++ ++static inline int hash_cmp(u32 new_val, u32 node_val) ++{ ++ if (new_val > node_val) ++ return 1; ++ else if (new_val < node_val) ++ return -1; ++ else ++ return 0; ++} ++ ++static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) ++{ ++ u32 hash_max = item->hash_max; ++ ++ if (!hash_max) { ++ hash_max = page_hash_max(item->page, hash); ++ ++ item->hash_max = hash_max; ++ } ++ ++ return hash_max; ++} ++ ++ ++ ++/** ++ * stable_tree_search() - search the stable tree for a page ++ * ++ * @item: the rmap_item we are comparing with ++ * @hash: the hash value of this item->page already calculated ++ * ++ * @return the page we have found, NULL otherwise. The page returned has ++ * been gotten. 
++ */ ++static struct page *stable_tree_search(struct rmap_item *item, u32 hash) ++{ ++ struct rb_node *node = root_stable_treep->rb_node; ++ struct tree_node *tree_node; ++ unsigned long hash_max; ++ struct page *page = item->page; ++ struct stable_node *stable_node; ++ ++ stable_node = page_stable_node(page); ++ if (stable_node) { ++ /* ksm page forked, that is ++ * if (PageKsm(page) && !in_stable_tree(rmap_item)) ++ * it's actually gotten once outside. ++ */ ++ get_page(page); ++ return page; ++ } ++ ++ while (node) { ++ int cmp; ++ ++ tree_node = rb_entry(node, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) ++ node = node->rb_left; ++ else if (cmp > 0) ++ node = node->rb_right; ++ else ++ break; ++ } ++ ++ if (!node) ++ return NULL; ++ ++ if (tree_node->count == 1) { ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ BUG_ON(!stable_node); ++ ++ goto get_page_out; ++ } ++ ++ /* ++ * ok, we have to search the second ++ * level subtree, hash the page to a ++ * full strength. ++ */ ++ node = tree_node->sub_root.rb_node; ++ BUG_ON(!node); ++ hash_max = rmap_item_hash_max(item, hash); ++ ++ while (node) { ++ int cmp; ++ ++ stable_node = rb_entry(node, struct stable_node, node); ++ ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ if (cmp < 0) ++ node = node->rb_left; ++ else if (cmp > 0) ++ node = node->rb_right; ++ else ++ goto get_page_out; ++ } ++ ++ return NULL; ++ ++get_page_out: ++ page = get_uksm_page(stable_node, 1, 1); ++ return page; ++} ++ ++static int try_merge_rmap_item(struct rmap_item *item, ++ struct page *kpage, ++ struct page *tree_page) ++{ ++ spinlock_t *ptl; ++ pte_t *ptep; ++ unsigned long addr; ++ struct vm_area_struct *vma = item->slot->vma; ++ ++ addr = get_rmap_addr(item); ++ ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0); ++ if (!ptep) ++ return 0; ++ ++ if (pte_write(*ptep)) { ++ /* has changed, abort! */ ++ pte_unmap_unlock(ptep, ptl); ++ return 0; ++ } ++ ++ get_page(tree_page); ++ page_add_anon_rmap(tree_page, vma, addr); ++ ++ flush_cache_page(vma, addr, pte_pfn(*ptep)); ++ ptep_clear_flush(vma, addr, ptep); ++ set_pte_at_notify(vma->vm_mm, addr, ptep, ++ mk_pte(tree_page, vma->vm_page_prot)); ++ ++ page_remove_rmap(kpage); ++ put_page(kpage); ++ ++ pte_unmap_unlock(ptep, ptl); ++ ++ return 1; ++} ++ ++/** ++ * try_to_merge_with_stable_page() - when two rmap_items need to be inserted ++ * into stable tree, the page was found to be identical to a stable ksm page, ++ * this is the last chance we can merge them into one. ++ * ++ * @item1: the rmap_item holding the page which we wanted to insert ++ * into stable tree. 
++ * @item2: the other rmap_item we found when unstable tree search ++ * @oldpage: the page currently mapped by the two rmap_items ++ * @tree_page: the page we found identical in stable tree node ++ * @success1: return if item1 is successfully merged ++ * @success2: return if item2 is successfully merged ++ */ ++static void try_merge_with_stable(struct rmap_item *item1, ++ struct rmap_item *item2, ++ struct page **kpage, ++ struct page *tree_page, ++ int *success1, int *success2) ++{ ++ struct vm_area_struct *vma1 = item1->slot->vma; ++ struct vm_area_struct *vma2 = item2->slot->vma; ++ *success1 = 0; ++ *success2 = 0; ++ ++ if (unlikely(*kpage == tree_page)) { ++ /* I don't think this can really happen */ ++ printk(KERN_WARNING "UKSM: unexpected condition detected in " ++ "try_merge_with_stable() -- *kpage == tree_page !\n"); ++ *success1 = 1; ++ *success2 = 1; ++ return; ++ } ++ ++ if (!PageAnon(*kpage) || !PageKsm(*kpage)) ++ goto failed; ++ ++ if (!trylock_page(tree_page)) ++ goto failed; ++ ++ /* If the oldpage is still ksm and still pointed ++ * to in the right place, and still write protected, ++ * we are confident it's not changed, no need to ++ * memcmp anymore. ++ * be ware, we cannot take nested pte locks, ++ * deadlock risk. ++ */ ++ if (!try_merge_rmap_item(item1, *kpage, tree_page)) ++ goto unlock_failed; ++ ++ /* ok, then vma2, remind that pte1 already set */ ++ if (!try_merge_rmap_item(item2, *kpage, tree_page)) ++ goto success_1; ++ ++ *success2 = 1; ++success_1: ++ *success1 = 1; ++ ++ ++ if ((*success1 && vma1->vm_flags & VM_LOCKED) || ++ (*success2 && vma2->vm_flags & VM_LOCKED)) { ++ munlock_vma_page(*kpage); ++ if (!PageMlocked(tree_page)) ++ mlock_vma_page(tree_page); ++ } ++ ++ /* ++ * We do not need oldpage any more in the caller, so can break the lock ++ * now. ++ */ ++ unlock_page(*kpage); ++ *kpage = tree_page; /* Get unlocked outside. 
*/ ++ return; ++ ++unlock_failed: ++ unlock_page(tree_page); ++failed: ++ return; ++} ++ ++static inline void stable_node_hash_max(struct stable_node *node, ++ struct page *page, u32 hash) ++{ ++ u32 hash_max = node->hash_max; ++ ++ if (!hash_max) { ++ hash_max = page_hash_max(page, hash); ++ node->hash_max = hash_max; ++ } ++} ++ ++static inline ++struct stable_node *new_stable_node(struct tree_node *tree_node, ++ struct page *kpage, u32 hash_max) ++{ ++ struct stable_node *new_stable_node; ++ ++ new_stable_node = alloc_stable_node(); ++ if (!new_stable_node) ++ return NULL; ++ ++ new_stable_node->kpfn = page_to_pfn(kpage); ++ new_stable_node->hash_max = hash_max; ++ new_stable_node->tree_node = tree_node; ++ set_page_stable_node(kpage, new_stable_node); ++ ++ return new_stable_node; ++} ++ ++static inline ++struct stable_node *first_level_insert(struct tree_node *tree_node, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ struct page **kpage, u32 hash, ++ int *success1, int *success2) ++{ ++ int cmp; ++ struct page *tree_page; ++ u32 hash_max = 0; ++ struct stable_node *stable_node, *new_snode; ++ struct rb_node *parent = NULL, **new; ++ ++ /* this tree node contains no sub-tree yet */ ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ cmp = memcmp_pages(*kpage, tree_page, 1); ++ if (!cmp) { ++ try_merge_with_stable(rmap_item, tree_rmap_item, kpage, ++ tree_page, success1, success2); ++ put_page(tree_page); ++ if (!*success1 && !*success2) ++ goto failed; ++ ++ return stable_node; ++ ++ } else { ++ /* ++ * collision in first level try to create a subtree. ++ * A new node need to be created. ++ */ ++ put_page(tree_page); ++ ++ stable_node_hash_max(stable_node, tree_page, ++ tree_node->hash); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ parent = &stable_node->node; ++ if (cmp < 0) { ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ new = &parent->rb_right; ++ } else { ++ goto failed; ++ } ++ } ++ ++ } else { ++ /* the only stable_node deleted, we reuse its tree_node. 
++ */ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++ new_snode = new_stable_node(tree_node, *kpage, hash_max); ++ if (!new_snode) ++ goto failed; ++ ++ rb_link_node(&new_snode->node, parent, new); ++ rb_insert_color(&new_snode->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ ++ return new_snode; ++ ++failed: ++ return NULL; ++} ++ ++static inline ++struct stable_node *stable_subtree_insert(struct tree_node *tree_node, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ struct page **kpage, u32 hash, ++ int *success1, int *success2) ++{ ++ struct page *tree_page; ++ u32 hash_max; ++ struct stable_node *stable_node, *new_snode; ++ struct rb_node *parent, **new; ++ ++research: ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ BUG_ON(!*new); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ while (*new) { ++ int cmp; ++ ++ stable_node = rb_entry(*new, struct stable_node, node); ++ ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else { ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ cmp = memcmp_pages(*kpage, tree_page, 1); ++ if (!cmp) { ++ try_merge_with_stable(rmap_item, ++ tree_rmap_item, kpage, ++ tree_page, success1, success2); ++ ++ put_page(tree_page); ++ if (!*success1 && !*success2) ++ goto failed; ++ /* ++ * successfully merged with a stable ++ * node ++ */ ++ return stable_node; ++ } else { ++ put_page(tree_page); ++ goto failed; ++ } ++ } else { ++ /* ++ * stable node may be deleted, ++ * and subtree maybe ++ * restructed, cannot ++ * continue, research it. ++ */ ++ if (tree_node->count) { ++ goto research; ++ } else { ++ /* reuse the tree node*/ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ } ++ } ++ } ++ ++ new_snode = new_stable_node(tree_node, *kpage, hash_max); ++ if (!new_snode) ++ goto failed; ++ ++ rb_link_node(&new_snode->node, parent, new); ++ rb_insert_color(&new_snode->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ ++ return new_snode; ++ ++failed: ++ return NULL; ++} ++ ++ ++/** ++ * stable_tree_insert() - try to insert a merged page in unstable tree to ++ * the stable tree ++ * ++ * @kpage: the page need to be inserted ++ * @hash: the current hash of this page ++ * @rmap_item: the rmap_item being scanned ++ * @tree_rmap_item: the rmap_item found on unstable tree ++ * @success1: return if rmap_item is merged ++ * @success2: return if tree_rmap_item is merged ++ * ++ * @return the stable_node on stable tree if at least one ++ * rmap_item is inserted into stable tree, NULL ++ * otherwise. 
++ */ ++static struct stable_node * ++stable_tree_insert(struct page **kpage, u32 hash, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ int *success1, int *success2) ++{ ++ struct rb_node **new = &root_stable_treep->rb_node; ++ struct rb_node *parent = NULL; ++ struct stable_node *stable_node; ++ struct tree_node *tree_node; ++ u32 hash_max = 0; ++ ++ *success1 = *success2 = 0; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ if (tree_node->count == 1) { ++ stable_node = first_level_insert(tree_node, rmap_item, ++ tree_rmap_item, kpage, ++ hash, success1, success2); ++ } else { ++ stable_node = stable_subtree_insert(tree_node, ++ rmap_item, tree_rmap_item, kpage, ++ hash, success1, success2); ++ } ++ } else { ++ ++ /* no tree node found */ ++ tree_node = alloc_tree_node(stable_tree_node_listp); ++ if (!tree_node) { ++ stable_node = NULL; ++ goto out; ++ } ++ ++ stable_node = new_stable_node(tree_node, *kpage, hash_max); ++ if (!stable_node) { ++ free_tree_node(tree_node); ++ goto out; ++ } ++ ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, root_stable_treep); ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ ++ rb_link_node(&stable_node->node, parent, new); ++ rb_insert_color(&stable_node->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ } ++ ++out: ++ return stable_node; ++} ++ ++ ++/** ++ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem ++ * ++ * @return 0 on success, -EBUSY if unable to lock the mmap_sem, ++ * -EINVAL if the page mapping has been changed. ++ */ ++static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) ++{ ++ int err; ++ ++ err = get_mergeable_page_lock_mmap(tree_rmap_item); ++ ++ if (err == -EINVAL) { ++ /* its page map has been changed, remove it */ ++ remove_rmap_item_from_tree(tree_rmap_item); ++ } ++ ++ /* The page is gotten and mmap_sem is locked now. */ ++ return err; ++} ++ ++ ++/** ++ * unstable_tree_search_insert() - search an unstable tree rmap_item with the ++ * same hash value. 
Get its page and trylock the mmap_sem ++ */ ++static inline ++struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, ++ u32 hash) ++ ++{ ++ struct rb_node **new = &root_unstable_tree.rb_node; ++ struct rb_node *parent = NULL; ++ struct tree_node *tree_node; ++ u32 hash_max; ++ struct rmap_item *tree_rmap_item; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ /* got the tree_node */ ++ if (tree_node->count == 1) { ++ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, ++ struct rmap_item, node); ++ BUG_ON(!tree_rmap_item); ++ ++ goto get_page_out; ++ } ++ ++ /* well, search the collision subtree */ ++ new = &tree_node->sub_root.rb_node; ++ BUG_ON(!*new); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ ++ while (*new) { ++ int cmp; ++ ++ tree_rmap_item = rb_entry(*new, struct rmap_item, ++ node); ++ ++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); ++ parent = *new; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto get_page_out; ++ } ++ } else { ++ /* alloc a new tree_node */ ++ tree_node = alloc_tree_node(&unstable_tree_node_list); ++ if (!tree_node) ++ return NULL; ++ ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, &root_unstable_tree); ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++ /* did not found even in sub-tree */ ++ rmap_item->tree_node = tree_node; ++ rmap_item->address |= UNSTABLE_FLAG; ++ rmap_item->hash_round = uksm_hash_round; ++ rb_link_node(&rmap_item->node, parent, new); ++ rb_insert_color(&rmap_item->node, &tree_node->sub_root); ++ ++ uksm_pages_unshared++; ++ return NULL; ++ ++get_page_out: ++ if (tree_rmap_item->page == rmap_item->page) ++ return NULL; ++ ++ if (get_tree_rmap_item_page(tree_rmap_item)) ++ return NULL; ++ ++ return tree_rmap_item; ++} ++ ++static void hold_anon_vma(struct rmap_item *rmap_item, ++ struct anon_vma *anon_vma) ++{ ++ rmap_item->anon_vma = anon_vma; ++ get_anon_vma(anon_vma); ++} ++ ++ ++/** ++ * stable_tree_append() - append a rmap_item to a stable node. Deduplication ++ * ratio statistics is done in this function. 
++ * ++ */ ++static void stable_tree_append(struct rmap_item *rmap_item, ++ struct stable_node *stable_node, int logdedup) ++{ ++ struct node_vma *node_vma = NULL, *new_node_vma; ++ struct hlist_node *hlist = NULL, *cont_p = NULL; ++ unsigned long key = (unsigned long)rmap_item->slot; ++ unsigned long factor = rmap_item->slot->rung->step; ++ ++ BUG_ON(!stable_node); ++ rmap_item->address |= STABLE_FLAG; ++ ++ if (hlist_empty(&stable_node->hlist)) { ++ uksm_pages_shared++; ++ goto node_vma_new; ++ } else { ++ uksm_pages_sharing++; ++ } ++ ++ hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) { ++ if (node_vma->key >= key) ++ break; ++ ++ if (logdedup) { ++ node_vma->slot->pages_bemerged += factor; ++ if (list_empty(&node_vma->slot->dedup_list)) ++ list_add(&node_vma->slot->dedup_list, ++ &vma_slot_dedup); ++ } ++ } ++ ++ if (node_vma) { ++ if (node_vma->key == key) { ++ cont_p = hlist->next; ++ goto node_vma_ok; ++ } else if (node_vma->key > key) { ++ cont_p = hlist; ++ } ++ } ++ ++node_vma_new: ++ /* no same vma already in node, alloc a new node_vma */ ++ new_node_vma = alloc_node_vma(); ++ BUG_ON(!new_node_vma); ++ new_node_vma->head = stable_node; ++ new_node_vma->slot = rmap_item->slot; ++ ++ if (!node_vma) { ++ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); ++ } else if (node_vma->key != key) { ++ if (node_vma->key < key) ++ hlist_add_after(&node_vma->hlist, &new_node_vma->hlist); ++ else { ++ hlist_add_before(&new_node_vma->hlist, ++ &node_vma->hlist); ++ } ++ ++ } ++ node_vma = new_node_vma; ++ ++node_vma_ok: /* ok, ready to add to the list */ ++ rmap_item->head = node_vma; ++ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); ++ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); ++ if (logdedup) { ++ rmap_item->slot->pages_merged++; ++ if (cont_p) { ++ hlist_for_each_entry_continue(node_vma, ++ cont_p, hlist) { ++ node_vma->slot->pages_bemerged += factor; ++ if (list_empty(&node_vma->slot->dedup_list)) ++ list_add(&node_vma->slot->dedup_list, ++ &vma_slot_dedup); ++ } ++ } ++ } ++} ++ ++/* ++ * We use break_ksm to break COW on a ksm page: it's a stripped down ++ * ++ * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) ++ * put_page(page); ++ * ++ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, ++ * in case the application has unmapped and remapped mm,addr meanwhile. ++ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP ++ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. ++ */ ++static int break_ksm(struct vm_area_struct *vma, unsigned long addr) ++{ ++ struct page *page; ++ int ret = 0; ++ ++ do { ++ cond_resched(); ++ page = follow_page(vma, addr, FOLL_GET); ++ if (IS_ERR_OR_NULL(page)) ++ break; ++ if (PageKsm(page)) { ++ ret = handle_mm_fault(vma->vm_mm, vma, addr, ++ FAULT_FLAG_WRITE); ++ } else ++ ret = VM_FAULT_WRITE; ++ put_page(page); ++ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); ++ /* ++ * We must loop because handle_mm_fault() may back out if there's ++ * any difficulty e.g. if pte accessed bit gets updated concurrently. ++ * ++ * VM_FAULT_WRITE is what we have been hoping for: it indicates that ++ * COW has been broken, even if the vma does not permit VM_WRITE; ++ * but note that a concurrent fault might break PageKsm for us. ++ * ++ * VM_FAULT_SIGBUS could occur if we race with truncation of the ++ * backing file, which also invalidates anonymous pages: that's ++ * okay, that truncation will have unmapped the PageKsm for us. 
++ * ++ * VM_FAULT_OOM: at the time of writing (late July 2009), setting ++ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the ++ * current task has TIF_MEMDIE set, and will be OOM killed on return ++ * to user; and ksmd, having no mm, would never be chosen for that. ++ * ++ * But if the mm is in a limited mem_cgroup, then the fault may fail ++ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and ++ * even ksmd can fail in this way - though it's usually breaking ksm ++ * just to undo a merge it made a moment before, so unlikely to oom. ++ * ++ * That's a pity: we might therefore have more kernel pages allocated ++ * than we're counting as nodes in the stable tree; but uksm_do_scan ++ * will retry to break_cow on each pass, so should recover the page ++ * in due course. The important thing is to not let VM_MERGEABLE ++ * be cleared while any such pages might remain in the area. ++ */ ++ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; ++} ++ ++static void break_cow(struct rmap_item *rmap_item) ++{ ++ struct vm_area_struct *vma = rmap_item->slot->vma; ++ struct mm_struct *mm = vma->vm_mm; ++ unsigned long addr = get_rmap_addr(rmap_item); ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ break_ksm(vma, addr); ++out: ++ return; ++} ++ ++/* ++ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather ++ * than check every pte of a given vma, the locking doesn't quite work for ++ * that - an rmap_item is assigned to the stable tree after inserting ksm ++ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing ++ * rmap_items from parent to child at fork time (so as not to waste time ++ * if exit comes before the next scan reaches it). ++ * ++ * Similarly, although we'd like to remove rmap_items (so updating counts ++ * and freeing memory) when unmerging an area, it's easier to leave that ++ * to the next pass of ksmd - consider, for example, how ksmd might be ++ * in cmp_and_merge_page on one of the rmap_items we would be removing. 
++ */ ++inline int unmerge_uksm_pages(struct vm_area_struct *vma, ++ unsigned long start, unsigned long end) ++{ ++ unsigned long addr; ++ int err = 0; ++ ++ for (addr = start; addr < end && !err; addr += PAGE_SIZE) { ++ if (uksm_test_exit(vma->vm_mm)) ++ break; ++ if (signal_pending(current)) ++ err = -ERESTARTSYS; ++ else ++ err = break_ksm(vma, addr); ++ } ++ return err; ++} ++ ++static inline void inc_uksm_pages_scanned(void) ++{ ++ u64 delta; ++ ++ ++ if (uksm_pages_scanned == U64_MAX) { ++ encode_benefit(); ++ ++ delta = uksm_pages_scanned >> pages_scanned_base; ++ ++ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { ++ pages_scanned_stored >>= 1; ++ delta >>= 1; ++ pages_scanned_base++; ++ } ++ ++ pages_scanned_stored += delta; ++ ++ uksm_pages_scanned = uksm_pages_scanned_last = 0; ++ } ++ ++ uksm_pages_scanned++; ++} ++ ++static inline int find_zero_page_hash(int strength, u32 hash) ++{ ++ return (zero_hash_table[strength] == hash); ++} ++ ++static ++int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) ++{ ++ struct page *zero_page = empty_uksm_zero_page; ++ struct mm_struct *mm = vma->vm_mm; ++ pte_t orig_pte = __pte(0); ++ int err = -EFAULT; ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ if (PageTransCompound(page) && page_trans_compound_anon_split(page)) ++ goto out; ++ BUG_ON(PageTransCompound(page)); ++ ++ if (!PageAnon(page)) ++ goto out; ++ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (write_protect_page(vma, page, &orig_pte, 0) == 0) { ++ if (is_page_full_zero(page)) ++ err = replace_page(vma, page, zero_page, orig_pte); ++ } ++ ++ unlock_page(page); ++out: ++ return err; ++} ++ ++/* ++ * cmp_and_merge_page() - first see if page can be merged into the stable ++ * tree; if not, compare hash to previous and if it's the same, see if page ++ * can be inserted into the unstable tree, or merged with a page already there ++ * and both transferred to the stable tree. ++ * ++ * @page: the page that we are searching identical page to. ++ * @rmap_item: the reverse mapping into the virtual address of this page ++ */ ++static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) ++{ ++ struct rmap_item *tree_rmap_item; ++ struct page *page; ++ struct page *kpage = NULL; ++ u32 hash_max; ++ int err; ++ unsigned int success1, success2; ++ struct stable_node *snode; ++ int cmp; ++ struct rb_node *parent = NULL, **new; ++ ++ remove_rmap_item_from_tree(rmap_item); ++ page = rmap_item->page; ++ ++ /* We first start with searching the page inside the stable tree */ ++ kpage = stable_tree_search(rmap_item, hash); ++ if (kpage) { ++ err = try_to_merge_with_uksm_page(rmap_item, kpage, ++ hash); ++ if (!err) { ++ /* ++ * The page was successfully merged, add ++ * its rmap_item to the stable tree. ++ * page lock is needed because it's ++ * racing with try_to_unmap_ksm(), etc. ++ */ ++ lock_page(kpage); ++ snode = page_stable_node(kpage); ++ stable_tree_append(rmap_item, snode, 1); ++ unlock_page(kpage); ++ put_page(kpage); ++ return; /* success */ ++ } ++ put_page(kpage); ++ ++ /* ++ * if it's a collision and it has been search in sub-rbtree ++ * (hash_max != 0), we want to abort, because if it is ++ * successfully merged in unstable tree, the collision trends to ++ * happen again. 
++ */ ++ if (err == MERGE_ERR_COLLI && rmap_item->hash_max) ++ return; ++ } ++ ++ tree_rmap_item = ++ unstable_tree_search_insert(rmap_item, hash); ++ if (tree_rmap_item) { ++ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); ++ /* ++ * As soon as we merge this page, we want to remove the ++ * rmap_item of the page we have merged with from the unstable ++ * tree, and insert it instead as new node in the stable tree. ++ */ ++ if (!err) { ++ kpage = page; ++ remove_rmap_item_from_tree(tree_rmap_item); ++ lock_page(kpage); ++ snode = stable_tree_insert(&kpage, hash, ++ rmap_item, tree_rmap_item, ++ &success1, &success2); ++ ++ /* ++ * Do not log dedup for tree item, it's not counted as ++ * scanned in this round. ++ */ ++ if (success2) ++ stable_tree_append(tree_rmap_item, snode, 0); ++ ++ /* ++ * The order of these two stable append is important: ++ * we are scanning rmap_item. ++ */ ++ if (success1) ++ stable_tree_append(rmap_item, snode, 1); ++ ++ /* ++ * The original kpage may be unlocked inside ++ * stable_tree_insert() already. This page ++ * should be unlocked before doing ++ * break_cow(). ++ */ ++ unlock_page(kpage); ++ ++ if (!success1) ++ break_cow(rmap_item); ++ ++ if (!success2) ++ break_cow(tree_rmap_item); ++ ++ } else if (err == MERGE_ERR_COLLI) { ++ BUG_ON(tree_rmap_item->tree_node->count > 1); ++ ++ rmap_item_hash_max(tree_rmap_item, ++ tree_rmap_item->tree_node->hash); ++ ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); ++ parent = &tree_rmap_item->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto put_up_out; ++ ++ rmap_item->tree_node = tree_rmap_item->tree_node; ++ rmap_item->address |= UNSTABLE_FLAG; ++ rmap_item->hash_round = uksm_hash_round; ++ rb_link_node(&rmap_item->node, parent, new); ++ rb_insert_color(&rmap_item->node, ++ &tree_rmap_item->tree_node->sub_root); ++ rmap_item->tree_node->count++; ++ } else { ++ /* ++ * either one of the page has changed or they collide ++ * at the max hash, we consider them as ill items. 
++ */ ++ remove_rmap_item_from_tree(tree_rmap_item); ++ } ++put_up_out: ++ put_page(tree_rmap_item->page); ++ up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); ++ } ++} ++ ++ ++ ++ ++static inline unsigned long get_pool_index(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; ++ if (pool_index >= slot->pool_size) ++ BUG(); ++ return pool_index; ++} ++ ++static inline unsigned long index_page_offset(unsigned long index) ++{ ++ return offset_in_page(sizeof(struct rmap_list_entry *) * index); ++} ++ ++static inline ++struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, ++ unsigned long index, int need_alloc) ++{ ++ unsigned long pool_index; ++ struct page *page; ++ void *addr; ++ ++ ++ pool_index = get_pool_index(slot, index); ++ if (!slot->rmap_list_pool[pool_index]) { ++ if (!need_alloc) ++ return NULL; ++ ++ page = alloc_page(GFP_KERNEL | __GFP_ZERO); ++ if (!page) ++ return NULL; ++ ++ slot->rmap_list_pool[pool_index] = page; ++ } ++ ++ addr = kmap(slot->rmap_list_pool[pool_index]); ++ addr += index_page_offset(index); ++ ++ return addr; ++} ++ ++static inline void put_rmap_list_entry(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ kunmap(slot->rmap_list_pool[pool_index]); ++} ++ ++static inline int entry_is_new(struct rmap_list_entry *entry) ++{ ++ return !entry->item; ++} ++ ++static inline unsigned long get_index_orig_addr(struct vma_slot *slot, ++ unsigned long index) ++{ ++ return slot->vma->vm_start + (index << PAGE_SHIFT); ++} ++ ++static inline unsigned long get_entry_address(struct rmap_list_entry *entry) ++{ ++ unsigned long addr; ++ ++ if (is_addr(entry->addr)) ++ addr = get_clean_addr(entry->addr); ++ else if (entry->item) ++ addr = get_rmap_addr(entry->item); ++ else ++ BUG(); ++ ++ return addr; ++} ++ ++static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) ++{ ++ if (is_addr(entry->addr)) ++ return NULL; ++ ++ return entry->item; ++} ++ ++static inline void inc_rmap_list_pool_count(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ slot->pool_counts[pool_index]++; ++} ++ ++static inline void dec_rmap_list_pool_count(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ BUG_ON(!slot->pool_counts[pool_index]); ++ slot->pool_counts[pool_index]--; ++} ++ ++static inline int entry_has_rmap(struct rmap_list_entry *entry) ++{ ++ return !is_addr(entry->addr) && entry->item; ++} ++ ++static inline void swap_entries(struct rmap_list_entry *entry1, ++ unsigned long index1, ++ struct rmap_list_entry *entry2, ++ unsigned long index2) ++{ ++ struct rmap_list_entry tmp; ++ ++ /* swapping two new entries is meaningless */ ++ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); ++ ++ tmp = *entry1; ++ *entry1 = *entry2; ++ *entry2 = tmp; ++ ++ if (entry_has_rmap(entry1)) ++ entry1->item->entry_index = index1; ++ ++ if (entry_has_rmap(entry2)) ++ entry2->item->entry_index = index2; ++ ++ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { ++ inc_rmap_list_pool_count(entry1->item->slot, index1); ++ dec_rmap_list_pool_count(entry1->item->slot, index2); ++ } else if 
(!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { ++ inc_rmap_list_pool_count(entry2->item->slot, index2); ++ dec_rmap_list_pool_count(entry2->item->slot, index1); ++ } ++} ++ ++static inline void free_entry_item(struct rmap_list_entry *entry) ++{ ++ unsigned long index; ++ struct rmap_item *item; ++ ++ if (!is_addr(entry->addr)) { ++ BUG_ON(!entry->item); ++ item = entry->item; ++ entry->addr = get_rmap_addr(item); ++ set_is_addr(entry->addr); ++ index = item->entry_index; ++ remove_rmap_item_from_tree(item); ++ dec_rmap_list_pool_count(item->slot, index); ++ free_rmap_item(item); ++ } ++} ++ ++static inline int pool_entry_boundary(unsigned long index) ++{ ++ unsigned long linear_addr; ++ ++ linear_addr = sizeof(struct rmap_list_entry *) * index; ++ return index && !offset_in_page(linear_addr); ++} ++ ++static inline void try_free_last_pool(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ if (slot->rmap_list_pool[pool_index] && ++ !slot->pool_counts[pool_index]) { ++ __free_page(slot->rmap_list_pool[pool_index]); ++ slot->rmap_list_pool[pool_index] = NULL; ++ slot->flags |= UKSM_SLOT_NEED_SORT; ++ } ++ ++} ++ ++static inline unsigned long vma_item_index(struct vm_area_struct *vma, ++ struct rmap_item *item) ++{ ++ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; ++} ++ ++static int within_same_pool(struct vma_slot *slot, ++ unsigned long i, unsigned long j) ++{ ++ unsigned long pool_i, pool_j; ++ ++ pool_i = get_pool_index(slot, i); ++ pool_j = get_pool_index(slot, j); ++ ++ return (pool_i == pool_j); ++} ++ ++static void sort_rmap_entry_list(struct vma_slot *slot) ++{ ++ unsigned long i, j; ++ struct rmap_list_entry *entry, *swap_entry; ++ ++ entry = get_rmap_list_entry(slot, 0, 0); ++ for (i = 0; i < slot->pages; ) { ++ ++ if (!entry) ++ goto skip_whole_pool; ++ ++ if (entry_is_new(entry)) ++ goto next_entry; ++ ++ if (is_addr(entry->addr)) { ++ entry->addr = 0; ++ goto next_entry; ++ } ++ ++ j = vma_item_index(slot->vma, entry->item); ++ if (j == i) ++ goto next_entry; ++ ++ if (within_same_pool(slot, i, j)) ++ swap_entry = entry + j - i; ++ else ++ swap_entry = get_rmap_list_entry(slot, j, 1); ++ ++ swap_entries(entry, i, swap_entry, j); ++ if (!within_same_pool(slot, i, j)) ++ put_rmap_list_entry(slot, j); ++ continue; ++ ++skip_whole_pool: ++ i += PAGE_SIZE / sizeof(*entry); ++ if (i < slot->pages) ++ entry = get_rmap_list_entry(slot, i, 0); ++ continue; ++ ++next_entry: ++ if (i >= slot->pages - 1 || ++ !within_same_pool(slot, i, i + 1)) { ++ put_rmap_list_entry(slot, i); ++ if (i + 1 < slot->pages) ++ entry = get_rmap_list_entry(slot, i + 1, 0); ++ } else ++ entry++; ++ i++; ++ continue; ++ } ++ ++ /* free empty pool entries which contain no rmap_item */ ++ /* CAN be simplied to based on only pool_counts when bug freed !!!!! 
*/ ++ for (i = 0; i < slot->pool_size; i++) { ++ unsigned char has_rmap; ++ void *addr; ++ ++ if (!slot->rmap_list_pool[i]) ++ continue; ++ ++ has_rmap = 0; ++ addr = kmap(slot->rmap_list_pool[i]); ++ BUG_ON(!addr); ++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { ++ entry = (struct rmap_list_entry *)addr + j; ++ if (is_addr(entry->addr)) ++ continue; ++ if (!entry->item) ++ continue; ++ has_rmap = 1; ++ } ++ kunmap(slot->rmap_list_pool[i]); ++ if (!has_rmap) { ++ BUG_ON(slot->pool_counts[i]); ++ __free_page(slot->rmap_list_pool[i]); ++ slot->rmap_list_pool[i] = NULL; ++ } ++ } ++ ++ slot->flags &= ~UKSM_SLOT_NEED_SORT; ++} ++ ++/* ++ * vma_fully_scanned() - if all the pages in this slot have been scanned. ++ */ ++static inline int vma_fully_scanned(struct vma_slot *slot) ++{ ++ return slot->pages_scanned == slot->pages; ++} ++ ++/** ++ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to ++ * its random permutation. This function is embedded with the random ++ * permutation index management code. ++ */ ++static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) ++{ ++ unsigned long rand_range, addr, swap_index, scan_index; ++ struct rmap_item *item = NULL; ++ struct rmap_list_entry *scan_entry, *swap_entry = NULL; ++ struct page *page; ++ ++ scan_index = swap_index = slot->pages_scanned % slot->pages; ++ ++ if (pool_entry_boundary(scan_index)) ++ try_free_last_pool(slot, scan_index - 1); ++ ++ if (vma_fully_scanned(slot)) { ++ if (slot->flags & UKSM_SLOT_NEED_SORT) ++ slot->flags |= UKSM_SLOT_NEED_RERAND; ++ else ++ slot->flags &= ~UKSM_SLOT_NEED_RERAND; ++ if (slot->flags & UKSM_SLOT_NEED_SORT) ++ sort_rmap_entry_list(slot); ++ } ++ ++ scan_entry = get_rmap_list_entry(slot, scan_index, 1); ++ if (!scan_entry) ++ return NULL; ++ ++ if (entry_is_new(scan_entry)) { ++ scan_entry->addr = get_index_orig_addr(slot, scan_index); ++ set_is_addr(scan_entry->addr); ++ } ++ ++ if (slot->flags & UKSM_SLOT_NEED_RERAND) { ++ rand_range = slot->pages - scan_index; ++ BUG_ON(!rand_range); ++ swap_index = scan_index + (random32() % rand_range); ++ } ++ ++ if (swap_index != scan_index) { ++ swap_entry = get_rmap_list_entry(slot, swap_index, 1); ++ if (entry_is_new(swap_entry)) { ++ swap_entry->addr = get_index_orig_addr(slot, ++ swap_index); ++ set_is_addr(swap_entry->addr); ++ } ++ swap_entries(scan_entry, scan_index, swap_entry, swap_index); ++ } ++ ++ addr = get_entry_address(scan_entry); ++ item = get_entry_item(scan_entry); ++ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); ++ ++ page = follow_page(slot->vma, addr, FOLL_GET); ++ if (IS_ERR_OR_NULL(page)) ++ goto nopage; ++ ++ if (!PageAnon(page) && !page_trans_compound_anon(page)) ++ goto putpage; ++ ++ /*check is zero_page pfn or uksm_zero_page*/ ++ if ((page_to_pfn(page) == zero_pfn) ++ || (page_to_pfn(page) == uksm_zero_pfn)) ++ goto putpage; ++ ++ flush_anon_page(slot->vma, page, addr); ++ flush_dcache_page(page); ++ ++ ++ *hash = page_hash(page, hash_strength, 1); ++ inc_uksm_pages_scanned(); ++ /*if the page content all zero, re-map to zero-page*/ ++ if (find_zero_page_hash(hash_strength, *hash)) { ++ if (!cmp_and_merge_zero_page(slot->vma, page)) { ++ slot->pages_merged++; ++ __inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); ++ dec_mm_counter(slot->mm, MM_ANONPAGES); ++ ++ /* For full-zero pages, no need to create rmap item */ ++ goto putpage; ++ } else { ++ inc_rshash_neg(memcmp_cost / 2); ++ } ++ } ++ ++ if (!item) { ++ item = alloc_rmap_item(); ++ if (item) { ++ /* It has already 
been zeroed */ ++ item->slot = slot; ++ item->address = addr; ++ item->entry_index = scan_index; ++ scan_entry->item = item; ++ inc_rmap_list_pool_count(slot, scan_index); ++ } else ++ goto putpage; ++ } ++ ++ BUG_ON(item->slot != slot); ++ /* the page may have changed */ ++ item->page = page; ++ put_rmap_list_entry(slot, scan_index); ++ if (swap_entry) ++ put_rmap_list_entry(slot, swap_index); ++ return item; ++ ++putpage: ++ put_page(page); ++ page = NULL; ++nopage: ++ /* no page, store addr back and free rmap_item if possible */ ++ free_entry_item(scan_entry); ++ put_rmap_list_entry(slot, scan_index); ++ if (swap_entry) ++ put_rmap_list_entry(slot, swap_index); ++ return NULL; ++} ++ ++static inline int in_stable_tree(struct rmap_item *rmap_item) ++{ ++ return rmap_item->address & STABLE_FLAG; ++} ++ ++/** ++ * scan_vma_one_page() - scan the next page in a vma_slot. Called with ++ * mmap_sem locked. ++ */ ++static noinline void scan_vma_one_page(struct vma_slot *slot) ++{ ++ u32 hash; ++ struct mm_struct *mm; ++ struct rmap_item *rmap_item = NULL; ++ struct vm_area_struct *vma = slot->vma; ++ ++ mm = vma->vm_mm; ++ BUG_ON(!mm); ++ BUG_ON(!slot); ++ ++ rmap_item = get_next_rmap_item(slot, &hash); ++ if (!rmap_item) ++ goto out1; ++ ++ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) ++ goto out2; ++ ++ cmp_and_merge_page(rmap_item, hash); ++out2: ++ put_page(rmap_item->page); ++out1: ++ slot->pages_scanned++; ++ if (slot->fully_scanned_round != fully_scanned_round) ++ scanned_virtual_pages++; ++ ++ if (vma_fully_scanned(slot)) ++ slot->fully_scanned_round = fully_scanned_round; ++} ++ ++static inline unsigned long rung_get_pages(struct scan_rung *rung) ++{ ++ struct slot_tree_node *node; ++ ++ if (!rung->vma_root.rnode) ++ return 0; ++ ++ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); ++ ++ return node->size; ++} ++ ++#define RUNG_SAMPLED_MIN 3 ++ ++static inline ++void uksm_calc_rung_step(struct scan_rung *rung, ++ unsigned long page_time, unsigned long ratio) ++{ ++ unsigned long sampled, pages; ++ ++ /* will be fully scanned ? */ ++ if (!rung->cover_msecs) { ++ rung->step = 1; ++ return; ++ } ++ ++ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) ++ * ratio / page_time; ++ ++ /* ++ * Before we finsish a scan round and expensive per-round jobs, ++ * we need to have a chance to estimate the per page time. So ++ * the sampled number can not be too small. 
++ */ ++ if (sampled < RUNG_SAMPLED_MIN) ++ sampled = RUNG_SAMPLED_MIN; ++ ++ pages = rung_get_pages(rung); ++ if (likely(pages > sampled)) ++ rung->step = pages / sampled; ++ else ++ rung->step = 1; ++} ++ ++static inline int step_need_recalc(struct scan_rung *rung) ++{ ++ unsigned long pages, stepmax; ++ ++ pages = rung_get_pages(rung); ++ stepmax = pages / RUNG_SAMPLED_MIN; ++ ++ return pages && (rung->step > pages || ++ (stepmax && rung->step > stepmax)); ++} ++ ++static inline ++void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) ++{ ++ struct vma_slot *slot; ++ ++ if (finished) ++ rung->flags |= UKSM_RUNG_ROUND_FINISHED; ++ ++ if (step_recalc || step_need_recalc(rung)) { ++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); ++ BUG_ON(step_need_recalc(rung)); ++ } ++ ++ slot_iter_index = random32() % rung->step; ++ BUG_ON(!rung->vma_root.rnode); ++ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); ++ BUG_ON(!slot); ++ ++ rung->current_scan = slot; ++ rung->current_offset = slot_iter_index; ++} ++ ++static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) ++{ ++ return &slot->rung->vma_root; ++} ++ ++/* ++ * return if resetted. ++ */ ++static int advance_current_scan(struct scan_rung *rung) ++{ ++ unsigned short n; ++ struct vma_slot *slot, *next = NULL; ++ ++ BUG_ON(!rung->vma_root.num); ++ ++ slot = rung->current_scan; ++ n = (slot->pages - rung->current_offset) % rung->step; ++ slot_iter_index = rung->step - n; ++ next = sradix_tree_next(&rung->vma_root, slot->snode, ++ slot->sindex, slot_iter); ++ ++ if (next) { ++ rung->current_offset = slot_iter_index; ++ rung->current_scan = next; ++ return 0; ++ } else { ++ reset_current_scan(rung, 1, 0); ++ return 1; ++ } ++} ++ ++static inline void rung_rm_slot(struct vma_slot *slot) ++{ ++ struct scan_rung *rung = slot->rung; ++ struct sradix_tree_root *root; ++ ++ if (rung->current_scan == slot) ++ advance_current_scan(rung); ++ ++ root = slot_get_root(slot); ++ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); ++ slot->snode = NULL; ++ if (step_need_recalc(rung)) { ++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); ++ BUG_ON(step_need_recalc(rung)); ++ } ++ ++ /* In case advance_current_scan loop back to this slot again */ ++ if (rung->vma_root.num && rung->current_scan == slot) ++ reset_current_scan(slot->rung, 1, 0); ++} ++ ++static inline void rung_add_new_slots(struct scan_rung *rung, ++ struct vma_slot **slots, unsigned long num) ++{ ++ int err; ++ struct vma_slot *slot; ++ unsigned long i; ++ struct sradix_tree_root *root = &rung->vma_root; ++ ++ err = sradix_tree_enter(root, (void **)slots, num); ++ BUG_ON(err); ++ ++ for (i = 0; i < num; i++) { ++ slot = slots[i]; ++ slot->rung = rung; ++ BUG_ON(vma_fully_scanned(slot)); ++ } ++ ++ if (rung->vma_root.num == num) ++ reset_current_scan(rung, 0, 1); ++} ++ ++static inline int rung_add_one_slot(struct scan_rung *rung, ++ struct vma_slot *slot) ++{ ++ int err; ++ ++ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); ++ if (err) ++ return err; ++ ++ slot->rung = rung; ++ if (rung->vma_root.num == 1) ++ reset_current_scan(rung, 0, 1); ++ ++ return 0; ++} ++ ++/* ++ * Return true if the slot is deleted from its rung. 
++ */ ++static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) ++{ ++ struct scan_rung *old_rung = slot->rung; ++ int err; ++ ++ if (old_rung == rung) ++ return 0; ++ ++ rung_rm_slot(slot); ++ err = rung_add_one_slot(rung, slot); ++ if (err) { ++ err = rung_add_one_slot(old_rung, slot); ++ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ ++ } ++ ++ return 1; ++} ++ ++static inline int vma_rung_up(struct vma_slot *slot) ++{ ++ struct scan_rung *rung; ++ ++ rung = slot->rung; ++ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) ++ rung++; ++ ++ return vma_rung_enter(slot, rung); ++} ++ ++static inline int vma_rung_down(struct vma_slot *slot) ++{ ++ struct scan_rung *rung; ++ ++ rung = slot->rung; ++ if (slot->rung != &uksm_scan_ladder[0]) ++ rung--; ++ ++ return vma_rung_enter(slot, rung); ++} ++ ++/** ++ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. ++ */ ++static unsigned long cal_dedup_ratio(struct vma_slot *slot) ++{ ++ unsigned long ret; ++ ++ BUG_ON(slot->pages_scanned == slot->last_scanned); ++ ++ ret = slot->pages_merged; ++ ++ /* Thrashing area filtering */ ++ if (ret && uksm_thrash_threshold) { ++ if (slot->pages_cowed * 100 / slot->pages_merged ++ > uksm_thrash_threshold) { ++ ret = 0; ++ } else { ++ ret = slot->pages_merged - slot->pages_cowed; ++ } ++ } ++ ++ return ret; ++} ++ ++/** ++ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. ++ */ ++static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) ++{ ++ unsigned long ret; ++ unsigned long pages_scanned; ++ ++ pages_scanned = slot->pages_scanned; ++ if (!pages_scanned) { ++ if (uksm_thrash_threshold) ++ return 0; ++ else ++ pages_scanned = slot->pages_scanned; ++ } ++ ++ ret = slot->pages_bemerged * 100 / pages_scanned; ++ ++ /* Thrashing area filtering */ ++ if (ret && uksm_thrash_threshold) { ++ if (slot->pages_cowed * 100 / slot->pages_bemerged ++ > uksm_thrash_threshold) { ++ ret = 0; ++ } else { ++ ret = slot->pages_bemerged - slot->pages_cowed; ++ } ++ } ++ ++ return ret; ++} ++ ++/** ++ * stable_node_reinsert() - When the hash_strength has been adjusted, the ++ * stable tree need to be restructured, this is the function re-inserting the ++ * stable node. 
++ */ ++static inline void stable_node_reinsert(struct stable_node *new_node, ++ struct page *page, ++ struct rb_root *root_treep, ++ struct list_head *tree_node_listp, ++ u32 hash) ++{ ++ struct rb_node **new = &root_treep->rb_node; ++ struct rb_node *parent = NULL; ++ struct stable_node *stable_node; ++ struct tree_node *tree_node; ++ struct page *tree_page; ++ int cmp; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ /* find a stable tree node with same first level hash value */ ++ stable_node_hash_max(new_node, page, hash); ++ if (tree_node->count == 1) { ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ stable_node_hash_max(stable_node, ++ tree_page, hash); ++ put_page(tree_page); ++ ++ /* prepare for stable node insertion */ ++ ++ cmp = hash_cmp(new_node->hash_max, ++ stable_node->hash_max); ++ parent = &stable_node->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto failed; ++ ++ goto add_node; ++ } else { ++ /* the only stable_node deleted, the tree node ++ * was not deleted. ++ */ ++ goto tree_node_reuse; ++ } ++ } ++ ++ /* well, search the collision subtree */ ++ new = &tree_node->sub_root.rb_node; ++ parent = NULL; ++ BUG_ON(!*new); ++ while (*new) { ++ int cmp; ++ ++ stable_node = rb_entry(*new, struct stable_node, node); ++ ++ cmp = hash_cmp(new_node->hash_max, ++ stable_node->hash_max); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else { ++ /* oh, no, still a collision */ ++ goto failed; ++ } ++ } ++ ++ goto add_node; ++ } ++ ++ /* no tree node found */ ++ tree_node = alloc_tree_node(tree_node_listp); ++ if (!tree_node) { ++ printk(KERN_ERR "UKSM: memory allocation error!\n"); ++ goto failed; ++ } else { ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, root_treep); ++ ++tree_node_reuse: ++ /* prepare for stable node insertion */ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++add_node: ++ rb_link_node(&new_node->node, parent, new); ++ rb_insert_color(&new_node->node, &tree_node->sub_root); ++ new_node->tree_node = tree_node; ++ tree_node->count++; ++ return; ++ ++failed: ++ /* This can only happen when two nodes have collided ++ * in two levels. ++ */ ++ new_node->tree_node = NULL; ++ return; ++} ++ ++static inline void free_all_tree_nodes(struct list_head *list) ++{ ++ struct tree_node *node, *tmp; ++ ++ list_for_each_entry_safe(node, tmp, list, all_list) { ++ free_tree_node(node); ++ } ++} ++ ++/** ++ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash ++ * strength to the current hash_strength. It re-structures the hole tree. 
++ */ ++static inline void stable_tree_delta_hash(u32 prev_hash_strength) ++{ ++ struct stable_node *node, *tmp; ++ struct rb_root *root_new_treep; ++ struct list_head *new_tree_node_listp; ++ ++ stable_tree_index = (stable_tree_index + 1) % 2; ++ root_new_treep = &root_stable_tree[stable_tree_index]; ++ new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; ++ *root_new_treep = RB_ROOT; ++ BUG_ON(!list_empty(new_tree_node_listp)); ++ ++ /* ++ * we need to be safe, the node could be removed by get_uksm_page() ++ */ ++ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { ++ void *addr; ++ struct page *node_page; ++ u32 hash; ++ ++ /* ++ * We are completely re-structuring the stable nodes to a new ++ * stable tree. We don't want to touch the old tree unlinks and ++ * old tree_nodes. The old tree_nodes will be freed at once. ++ */ ++ node_page = get_uksm_page(node, 0, 0); ++ if (!node_page) ++ continue; ++ ++ if (node->tree_node) { ++ hash = node->tree_node->hash; ++ ++ addr = kmap_atomic(node_page, KM_USER0); ++ ++ hash = delta_hash(addr, prev_hash_strength, ++ hash_strength, hash); ++ kunmap_atomic(addr, KM_USER0); ++ } else { ++ /* ++ *it was not inserted to rbtree due to collision in last ++ *round scan. ++ */ ++ hash = page_hash(node_page, hash_strength, 0); ++ } ++ ++ stable_node_reinsert(node, node_page, root_new_treep, ++ new_tree_node_listp, hash); ++ put_page(node_page); ++ } ++ ++ root_stable_treep = root_new_treep; ++ free_all_tree_nodes(stable_tree_node_listp); ++ BUG_ON(!list_empty(stable_tree_node_listp)); ++ stable_tree_node_listp = new_tree_node_listp; ++} ++ ++static inline void inc_hash_strength(unsigned long delta) ++{ ++ hash_strength += 1 << delta; ++ if (hash_strength > HASH_STRENGTH_MAX) ++ hash_strength = HASH_STRENGTH_MAX; ++} ++ ++static inline void dec_hash_strength(unsigned long delta) ++{ ++ unsigned long change = 1 << delta; ++ ++ if (hash_strength <= change + 1) ++ hash_strength = 1; ++ else ++ hash_strength -= change; ++} ++ ++static inline void inc_hash_strength_delta(void) ++{ ++ hash_strength_delta++; ++ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) ++ hash_strength_delta = HASH_STRENGTH_DELTA_MAX; ++} ++ ++/* ++static inline unsigned long get_current_neg_ratio(void) ++{ ++ if (!rshash_pos || rshash_neg > rshash_pos) ++ return 100; ++ ++ return div64_u64(100 * rshash_neg , rshash_pos); ++} ++*/ ++ ++static inline unsigned long get_current_neg_ratio(void) ++{ ++ u64 pos = benefit.pos; ++ u64 neg = benefit.neg; ++ ++ if (!neg) ++ return 0; ++ ++ if (!pos || neg > pos) ++ return 100; ++ ++ if (neg > div64_u64(U64_MAX, 100)) ++ pos = div64_u64(pos, 100); ++ else ++ neg *= 100; ++ ++ return div64_u64(neg, pos); ++} ++ ++static inline unsigned long get_current_benefit(void) ++{ ++ u64 pos = benefit.pos; ++ u64 neg = benefit.neg; ++ u64 scanned = benefit.scanned; ++ ++ if (neg > pos) ++ return 0; ++ ++ return div64_u64((pos - neg), scanned); ++} ++ ++static inline int judge_rshash_direction(void) ++{ ++ u64 current_neg_ratio, stable_benefit; ++ u64 current_benefit, delta = 0; ++ int ret = STILL; ++ ++ /* Try to probe a value after the boot, and in case the system ++ are still for a long time. 
*/ ++ if ((fully_scanned_round & 0xFFULL) == 10) { ++ ret = OBSCURE; ++ goto out; ++ } ++ ++ current_neg_ratio = get_current_neg_ratio(); ++ ++ if (current_neg_ratio == 0) { ++ rshash_neg_cont_zero++; ++ if (rshash_neg_cont_zero > 2) ++ return GO_DOWN; ++ else ++ return STILL; ++ } ++ rshash_neg_cont_zero = 0; ++ ++ if (current_neg_ratio > 90) { ++ ret = GO_UP; ++ goto out; ++ } ++ ++ current_benefit = get_current_benefit(); ++ stable_benefit = rshash_state.stable_benefit; ++ ++ if (!stable_benefit) { ++ ret = OBSCURE; ++ goto out; ++ } ++ ++ if (current_benefit > stable_benefit) ++ delta = current_benefit - stable_benefit; ++ else if (current_benefit < stable_benefit) ++ delta = stable_benefit - current_benefit; ++ ++ delta = div64_u64(100 * delta , stable_benefit); ++ ++ if (delta > 50) { ++ rshash_cont_obscure++; ++ if (rshash_cont_obscure > 2) ++ return OBSCURE; ++ else ++ return STILL; ++ } ++ ++out: ++ rshash_cont_obscure = 0; ++ return ret; ++} ++ ++/** ++ * rshash_adjust() - The main function to control the random sampling state ++ * machine for hash strength adapting. ++ * ++ * return true if hash_strength has changed. ++ */ ++static inline int rshash_adjust(void) ++{ ++ unsigned long prev_hash_strength = hash_strength; ++ ++ if (!encode_benefit()) ++ return 0; ++ ++ switch (rshash_state.state) { ++ case RSHASH_STILL: ++ switch (judge_rshash_direction()) { ++ case GO_UP: ++ if (rshash_state.pre_direct == GO_DOWN) ++ hash_strength_delta = 0; ++ ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.pre_direct = GO_UP; ++ break; ++ ++ case GO_DOWN: ++ if (rshash_state.pre_direct == GO_UP) ++ hash_strength_delta = 0; ++ ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.pre_direct = GO_DOWN; ++ break; ++ ++ case OBSCURE: ++ rshash_state.stable_point = hash_strength; ++ rshash_state.turn_point_down = hash_strength; ++ rshash_state.turn_point_up = hash_strength; ++ rshash_state.turn_benefit_down = get_current_benefit(); ++ rshash_state.turn_benefit_up = get_current_benefit(); ++ rshash_state.lookup_window_index = 0; ++ rshash_state.state = RSHASH_TRYDOWN; ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ break; ++ ++ case STILL: ++ break; ++ default: ++ BUG(); ++ } ++ break; ++ ++ case RSHASH_TRYDOWN: ++ if (rshash_state.lookup_window_index++ % 5 == 0) ++ rshash_state.below_count = 0; ++ ++ if (get_current_benefit() < rshash_state.stable_benefit) ++ rshash_state.below_count++; ++ else if (get_current_benefit() > ++ rshash_state.turn_benefit_down) { ++ rshash_state.turn_point_down = hash_strength; ++ rshash_state.turn_benefit_down = get_current_benefit(); ++ } ++ ++ if (rshash_state.below_count >= 3 || ++ judge_rshash_direction() == GO_UP || ++ hash_strength == 1) { ++ hash_strength = rshash_state.stable_point; ++ hash_strength_delta = 0; ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.lookup_window_index = 0; ++ rshash_state.state = RSHASH_TRYUP; ++ hash_strength_delta = 0; ++ } else { ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ } ++ break; ++ ++ case RSHASH_TRYUP: ++ if (rshash_state.lookup_window_index++ % 5 == 0) ++ rshash_state.below_count = 0; ++ ++ if (get_current_benefit() < rshash_state.turn_benefit_down) ++ rshash_state.below_count++; ++ else if (get_current_benefit() > rshash_state.turn_benefit_up) 
{ ++ rshash_state.turn_point_up = hash_strength; ++ rshash_state.turn_benefit_up = get_current_benefit(); ++ } ++ ++ if (rshash_state.below_count >= 3 || ++ judge_rshash_direction() == GO_DOWN || ++ hash_strength == HASH_STRENGTH_MAX) { ++ hash_strength = rshash_state.turn_benefit_up > ++ rshash_state.turn_benefit_down ? ++ rshash_state.turn_point_up : ++ rshash_state.turn_point_down; ++ ++ rshash_state.state = RSHASH_PRE_STILL; ++ } else { ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ } ++ ++ break; ++ ++ case RSHASH_NEW: ++ case RSHASH_PRE_STILL: ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.state = RSHASH_STILL; ++ hash_strength_delta = 0; ++ break; ++ default: ++ BUG(); ++ } ++ ++ /* rshash_neg = rshash_pos = 0; */ ++ reset_benefit(); ++ ++ if (prev_hash_strength != hash_strength) ++ stable_tree_delta_hash(prev_hash_strength); ++ ++ return prev_hash_strength != hash_strength; ++} ++ ++/** ++ * round_update_ladder() - The main function to do update of all the ++ * adjustments whenever a scan round is finished. ++ */ ++static noinline void round_update_ladder(void) ++{ ++ int i; ++ unsigned long dedup; ++ struct vma_slot *slot, *tmp_slot; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; ++ } ++ ++ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { ++ ++ /* slot may be rung_rm_slot() when mm exits */ ++ if (slot->snode) { ++ dedup = cal_dedup_ratio_old(slot); ++ if (dedup && dedup >= uksm_abundant_threshold) ++ vma_rung_up(slot); ++ } ++ ++ slot->pages_bemerged = 0; ++ slot->pages_cowed = 0; ++ ++ list_del_init(&slot->dedup_list); ++ } ++} ++ ++static void uksm_del_vma_slot(struct vma_slot *slot) ++{ ++ int i, j; ++ struct rmap_list_entry *entry; ++ ++ if (slot->snode) { ++ /* ++ * In case it just failed when entering the rung, it's not ++ * necessary. 
++ */ ++ rung_rm_slot(slot); ++ } ++ ++ if (!list_empty(&slot->dedup_list)) ++ list_del(&slot->dedup_list); ++ ++ if (!slot->rmap_list_pool || !slot->pool_counts) { ++ /* In case it OOMed in uksm_vma_enter() */ ++ goto out; ++ } ++ ++ for (i = 0; i < slot->pool_size; i++) { ++ void *addr; ++ ++ if (!slot->rmap_list_pool[i]) ++ continue; ++ ++ addr = kmap(slot->rmap_list_pool[i]); ++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { ++ entry = (struct rmap_list_entry *)addr + j; ++ if (is_addr(entry->addr)) ++ continue; ++ if (!entry->item) ++ continue; ++ ++ remove_rmap_item_from_tree(entry->item); ++ free_rmap_item(entry->item); ++ slot->pool_counts[i]--; ++ } ++ BUG_ON(slot->pool_counts[i]); ++ kunmap(slot->rmap_list_pool[i]); ++ __free_page(slot->rmap_list_pool[i]); ++ } ++ kfree(slot->rmap_list_pool); ++ kfree(slot->pool_counts); ++ ++out: ++ slot->rung = NULL; ++ BUG_ON(uksm_pages_total < slot->pages); ++ if (slot->flags & UKSM_SLOT_IN_UKSM) ++ uksm_pages_total -= slot->pages; ++ ++ if (slot->fully_scanned_round == fully_scanned_round) ++ scanned_virtual_pages -= slot->pages; ++ else ++ scanned_virtual_pages -= slot->pages_scanned; ++ free_vma_slot(slot); ++} ++ ++ ++#define SPIN_LOCK_PERIOD 32 ++static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; ++static inline void cleanup_vma_slots(void) ++{ ++ struct vma_slot *slot; ++ int i; ++ ++ i = 0; ++ spin_lock(&vma_slot_list_lock); ++ while (!list_empty(&vma_slot_del)) { ++ slot = list_entry(vma_slot_del.next, ++ struct vma_slot, slot_list); ++ list_del(&slot->slot_list); ++ cleanup_slots[i++] = slot; ++ if (i == SPIN_LOCK_PERIOD) { ++ spin_unlock(&vma_slot_list_lock); ++ while (--i >= 0) ++ uksm_del_vma_slot(cleanup_slots[i]); ++ i = 0; ++ spin_lock(&vma_slot_list_lock); ++ } ++ } ++ spin_unlock(&vma_slot_list_lock); ++ ++ while (--i >= 0) ++ uksm_del_vma_slot(cleanup_slots[i]); ++} ++ ++/* ++*exponential moving average formula ++*/ ++static inline unsigned long ema(unsigned long curr, unsigned long last_ema) ++{ ++ /* ++ * For a very high burst, even the ema cannot work well, a false very ++ * high per-page time estimation can result in feedback in very high ++ * overhead of context switch and rung update -- this will then lead ++ * to higher per-page time, this may not converge. ++ * ++ * Instead, we try to approach this value in a binary manner. ++ */ ++ if (curr > last_ema * 10) ++ return last_ema * 2; ++ ++ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100; ++} ++ ++/* ++ * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to ++ * nanoseconds based on current uksm_sleep_jiffies. ++ */ ++static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio) ++{ ++ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) / ++ (TIME_RATIO_SCALE - ratio) * ratio; ++} ++ ++ ++static inline unsigned long rung_real_ratio(int cpu_time_ratio) ++{ ++ unsigned long ret; ++ ++ BUG_ON(!cpu_time_ratio); ++ ++ if (cpu_time_ratio > 0) ++ ret = cpu_time_ratio; ++ else ++ ret = (unsigned long)(-cpu_time_ratio) * ++ uksm_max_cpu_percentage / 100UL; ++ ++ return ret ?
ret : 1; ++} ++ ++static noinline void uksm_calc_scan_pages(void) ++{ ++ struct scan_rung *ladder = uksm_scan_ladder; ++ unsigned long sleep_usecs, nsecs; ++ unsigned long ratio; ++ int i; ++ unsigned long per_page; ++ ++ if (uksm_ema_page_time > 100000 || ++ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) ++ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; ++ ++ per_page = uksm_ema_page_time; ++ BUG_ON(!per_page); ++ ++ /* ++ * For every 8 eval round, we try to probe a uksm_sleep_jiffies value ++ * based on saved user input. ++ */ ++ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) ++ uksm_sleep_jiffies = uksm_sleep_saved; ++ ++ /* We require a rung scan at least 1 page in a period. */ ++ nsecs = per_page; ++ ratio = rung_real_ratio(ladder[0].cpu_ratio); ++ if (cpu_ratio_to_nsec(ratio) < nsecs) { ++ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio ++ / NSEC_PER_USEC; ++ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ ratio = rung_real_ratio(ladder[i].cpu_ratio); ++ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / ++ per_page; ++ BUG_ON(!ladder[i].pages_to_scan); ++ uksm_calc_rung_step(&ladder[i], per_page, ratio); ++ } ++} ++ ++/* ++ * From the scan time of this round (ns) to next expected min sleep time ++ * (ms), be careful of the possible overflows. ratio is taken from ++ * rung_real_ratio() ++ */ ++static inline ++unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) ++{ ++ scan_time >>= 20; /* to msec level now */ ++ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); ++ ++ return (unsigned int) ((unsigned long) scan_time * ++ (TIME_RATIO_SCALE - ratio) / ratio); ++} ++ ++#define __round_mask(x, y) ((__typeof__(x))((y)-1)) ++#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) ++ ++static inline unsigned long vma_pool_size(struct vma_slot *slot) ++{ ++ return round_up(sizeof(struct rmap_list_entry) * slot->pages, ++ PAGE_SIZE) >> PAGE_SHIFT; ++} ++ ++static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) ++{ ++ struct scan_rung *rung; ++ unsigned long pool_size, i; ++ struct vma_slot *slot; ++ int failed; ++ ++ rung = &uksm_scan_ladder[0]; ++ ++ failed = 0; ++ for (i = 0; i < num; i++) { ++ slot = slots[i]; ++ ++ pool_size = vma_pool_size(slot); ++ slot->rmap_list_pool = kzalloc(sizeof(struct page *) * ++ pool_size, GFP_KERNEL); ++ if (!slot->rmap_list_pool) ++ break; ++ ++ slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size, ++ GFP_KERNEL); ++ if (!slot->pool_counts) { ++ kfree(slot->rmap_list_pool); ++ break; ++ } ++ ++ slot->pool_size = pool_size; ++ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); ++ slot->flags |= UKSM_SLOT_IN_UKSM; ++ uksm_pages_total += slot->pages; ++ } ++ ++ if (i) ++ rung_add_new_slots(rung, slots, i); ++ ++ return; ++} ++ ++static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; ++ ++static void uksm_enter_all_slots(void) ++{ ++ struct vma_slot *slot; ++ unsigned long index; ++ struct list_head empty_vma_list; ++ int i; ++ ++ i = 0; ++ index = 0; ++ INIT_LIST_HEAD(&empty_vma_list); ++ ++ spin_lock(&vma_slot_list_lock); ++ while (!list_empty(&vma_slot_new)) { ++ slot = list_entry(vma_slot_new.next, ++ struct vma_slot, slot_list); ++ ++ if (!slot->vma->anon_vma) { ++ list_move(&slot->slot_list, &empty_vma_list); ++ } else if (vma_can_enter(slot->vma)) { ++ batch_slots[index++] = slot; ++ list_del_init(&slot->slot_list); ++ } else { ++ list_move(&slot->slot_list, &vma_slot_noadd); ++ } ++ ++ if 
(++i == SPIN_LOCK_PERIOD || ++ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { ++ spin_unlock(&vma_slot_list_lock); ++ ++ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { ++ uksm_vma_enter(batch_slots, index); ++ index = 0; ++ } ++ i = 0; ++ cond_resched(); ++ spin_lock(&vma_slot_list_lock); ++ } ++ } ++ ++ list_splice(&empty_vma_list, &vma_slot_new); ++ ++ spin_unlock(&vma_slot_list_lock); ++ ++ if (index) ++ uksm_vma_enter(batch_slots, index); ++ ++} ++ ++static inline int rung_round_finished(struct scan_rung *rung) ++{ ++ return rung->flags & UKSM_RUNG_ROUND_FINISHED; ++} ++ ++static inline void judge_slot(struct vma_slot *slot) ++{ ++ struct scan_rung *rung = slot->rung; ++ unsigned long dedup; ++ int deleted; ++ ++ dedup = cal_dedup_ratio(slot); ++ if (vma_fully_scanned(slot) && uksm_thrash_threshold) ++ deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]); ++ else if (dedup && dedup >= uksm_abundant_threshold) ++ deleted = vma_rung_up(slot); ++ else ++ deleted = vma_rung_down(slot); ++ ++ slot->pages_merged = 0; ++ slot->pages_cowed = 0; ++ ++ if (vma_fully_scanned(slot)) ++ slot->pages_scanned = 0; ++ ++ slot->last_scanned = slot->pages_scanned; ++ ++ /* If its deleted in above, then rung was already advanced. */ ++ if (!deleted) ++ advance_current_scan(rung); ++} ++ ++ ++static inline int hash_round_finished(void) ++{ ++ if (scanned_virtual_pages > (uksm_pages_total >> 2)) { ++ scanned_virtual_pages = 0; ++ if (uksm_pages_scanned) ++ fully_scanned_round++; ++ ++ return 1; ++ } else { ++ return 0; ++ } ++} ++ ++#define UKSM_MMSEM_BATCH 5 ++/** ++ * uksm_do_scan() - the main worker function. ++ */ ++static noinline void uksm_do_scan(void) ++{ ++ struct vma_slot *slot, *iter; ++ struct mm_struct *busy_mm; ++ unsigned char round_finished, all_rungs_emtpy; ++ int i, err, mmsem_batch; ++ unsigned long pcost; ++ long long delta_exec; ++ unsigned long vpages, max_cpu_ratio; ++ unsigned long long start_time, end_time, scan_time; ++ unsigned int expected_jiffies; ++ ++ might_sleep(); ++ ++ vpages = 0; ++ ++ start_time = task_sched_runtime(current); ++ max_cpu_ratio = 0; ++ mmsem_batch = 0; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE;) { ++ struct scan_rung *rung = &uksm_scan_ladder[i]; ++ unsigned long ratio; ++ ++ if (!rung->pages_to_scan) { ++ i++; ++ continue; ++ } ++ ++ if (!rung->vma_root.num) { ++ rung->pages_to_scan = 0; ++ i++; ++ continue; ++ } ++ ++ ratio = rung_real_ratio(rung->cpu_ratio); ++ if (ratio > max_cpu_ratio) ++ max_cpu_ratio = ratio; ++ ++ /* ++ * Do not consider rung_round_finished() here, just used up the ++ * rung->pages_to_scan quota. 
++ */ ++ while (rung->pages_to_scan && rung->vma_root.num && ++ likely(!freezing(current))) { ++ int reset = 0; ++ ++ slot = rung->current_scan; ++ ++ BUG_ON(vma_fully_scanned(slot)); ++ ++ if (mmsem_batch) { ++ err = 0; ++ } else { ++ err = try_down_read_slot_mmap_sem(slot); ++ } ++ ++ if (err == -ENOENT) { ++rm_slot: ++ rung_rm_slot(slot); ++ continue; ++ } ++ ++ busy_mm = slot->mm; ++ ++ if (err == -EBUSY) { ++ /* skip other vmas on the same mm */ ++ do { ++ reset = advance_current_scan(rung); ++ iter = rung->current_scan; ++ if (iter->vma->vm_mm != busy_mm) ++ break; ++ } while (!reset); ++ ++ if (iter->vma->vm_mm != busy_mm) { ++ continue; ++ } else { ++ /* scan round finished */ ++ break; ++ } ++ } ++ ++ BUG_ON(!vma_can_enter(slot->vma)); ++ if (uksm_test_exit(slot->vma->vm_mm)) { ++ mmsem_batch = 0; ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ goto rm_slot; ++ } ++ ++ if (mmsem_batch) ++ mmsem_batch--; ++ else ++ mmsem_batch = UKSM_MMSEM_BATCH; ++ ++ /* Ok, we have taken the mmap_sem, ready to scan */ ++ scan_vma_one_page(slot); ++ rung->pages_to_scan--; ++ vpages++; ++ ++ if (rung->current_offset + rung->step > slot->pages - 1 ++ || vma_fully_scanned(slot)) { ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ judge_slot(slot); ++ mmsem_batch = 0; ++ } else { ++ rung->current_offset += rung->step; ++ if (!mmsem_batch) ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ } ++ ++ cond_resched(); ++ } ++ ++ if (mmsem_batch) { ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ mmsem_batch = 0; ++ } ++ ++ if (freezing(current)) ++ break; ++ ++ cond_resched(); ++ } ++ end_time = task_sched_runtime(current); ++ delta_exec = end_time - start_time; ++ ++ if (freezing(current)) ++ return; ++ ++ cleanup_vma_slots(); ++ uksm_enter_all_slots(); ++ ++ round_finished = 1; ++ all_rungs_emtpy = 1; ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ struct scan_rung *rung = &uksm_scan_ladder[i]; ++ ++ if (rung->vma_root.num) { ++ all_rungs_emtpy = 0; ++ if (!rung_round_finished(rung)) ++ round_finished = 0; ++ } ++ } ++ ++ if (all_rungs_emtpy) ++ round_finished = 0; ++ ++ if (round_finished) { ++ round_update_ladder(); ++ uksm_eval_round++; ++ ++ if (hash_round_finished() && rshash_adjust()) { ++ /* Reset the unstable root iff hash strength changed */ ++ uksm_hash_round++; ++ root_unstable_tree = RB_ROOT; ++ free_all_tree_nodes(&unstable_tree_node_list); ++ } ++ ++ /* ++ * A number of pages can hang around indefinitely on per-cpu ++ * pagevecs, raised page count preventing write_protect_page ++ * from merging them. Though it doesn't really matter much, ++ * it is puzzling to see some stuck in pages_volatile until ++ * other activity jostles them out, and they also prevented ++ * LTP's KSM test from succeeding deterministically; so drain ++ * them here (here rather than on entry to uksm_do_scan(), ++ * so we don't IPI too often when pages_to_scan is set low).
++ */ ++ lru_add_drain_all(); ++ } ++ ++ ++ if (vpages && delta_exec > 0) { ++ pcost = (unsigned long) delta_exec / vpages; ++ if (likely(uksm_ema_page_time)) ++ uksm_ema_page_time = ema(pcost, uksm_ema_page_time); ++ else ++ uksm_ema_page_time = pcost; ++ } ++ ++ uksm_calc_scan_pages(); ++ uksm_sleep_real = uksm_sleep_jiffies; ++ /* in case of radical cpu bursts, apply the upper bound */ ++ end_time = task_sched_runtime(current); ++ if (max_cpu_ratio && end_time > start_time) { ++ scan_time = end_time - start_time; ++ expected_jiffies = msecs_to_jiffies( ++ scan_time_to_sleep(scan_time, max_cpu_ratio)); ++ ++ if (expected_jiffies > uksm_sleep_real) ++ uksm_sleep_real = expected_jiffies; ++ ++ /* We have a 1 second up bound for responsiveness. */ ++ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) ++ uksm_sleep_real = msecs_to_jiffies(1000); ++ } ++ ++ return; ++} ++ ++static int ksmd_should_run(void) ++{ ++ return uksm_run & UKSM_RUN_MERGE; ++} ++ ++static int uksm_scan_thread(void *nothing) ++{ ++ set_freezable(); ++ set_user_nice(current, 5); ++ ++ while (!kthread_should_stop()) { ++ mutex_lock(&uksm_thread_mutex); ++ if (ksmd_should_run()) { ++ uksm_do_scan(); ++ } ++ mutex_unlock(&uksm_thread_mutex); ++ ++ try_to_freeze(); ++ ++ if (ksmd_should_run()) { ++ schedule_timeout_interruptible(uksm_sleep_real); ++ uksm_sleep_times++; ++ } else { ++ wait_event_freezable(uksm_thread_wait, ++ ksmd_should_run() || kthread_should_stop()); ++ } ++ } ++ return 0; ++} ++ ++int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, ++ unsigned long *vm_flags) ++{ ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct rmap_item *rmap_item; ++ struct hlist_node *hlist, *rmap_hlist; ++ unsigned int mapcount = page_mapcount(page); ++ int referenced = 0; ++ int search_new_forks = 0; ++ unsigned long address; ++ ++ VM_BUG_ON(!PageKsm(page)); ++ VM_BUG_ON(!PageLocked(page)); ++ ++ stable_node = page_stable_node(page); ++ if (!stable_node) ++ return 0; ++ ++ ++again: ++ hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, rmap_hlist, ++ &node_vma->rmap_hlist, hlist) { ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ struct anon_vma_chain *vmac; ++ struct vm_area_struct *vma; ++ ++ anon_vma_lock(anon_vma); ++ list_for_each_entry(vmac, &anon_vma->head, ++ same_anon_vma) { ++ vma = vmac->vma; ++ address = get_rmap_addr(rmap_item); ++ ++ if (address < vma->vm_start || ++ address >= vma->vm_end) ++ continue; ++ /* ++ * Initially we examine only the vma which ++ * covers this rmap_item; but later, if there ++ * is still work to do, we examine covering ++ * vmas in other mms: in case they were forked ++ * from the original since ksmd passed. 
++ */ ++ if ((rmap_item->slot->vma == vma) == ++ search_new_forks) ++ continue; ++ ++ if (memcg && ++ !mm_match_cgroup(vma->vm_mm, memcg)) ++ continue; ++ ++ referenced += ++ page_referenced_one(page, vma, ++ address, &mapcount, vm_flags); ++ if (!search_new_forks || !mapcount) ++ break; ++ } ++ ++ anon_vma_unlock(anon_vma); ++ if (!mapcount) ++ goto out; ++ } ++ } ++ if (!search_new_forks++) ++ goto again; ++out: ++ return referenced; ++} ++ ++int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) ++{ ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct hlist_node *hlist, *rmap_hlist; ++ struct rmap_item *rmap_item; ++ int ret = SWAP_AGAIN; ++ int search_new_forks = 0; ++ unsigned long address; ++ ++ VM_BUG_ON(!PageKsm(page)); ++ VM_BUG_ON(!PageLocked(page)); ++ ++ stable_node = page_stable_node(page); ++ if (!stable_node) ++ return SWAP_FAIL; ++again: ++ hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, rmap_hlist, ++ &node_vma->rmap_hlist, hlist) { ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ struct anon_vma_chain *vmac; ++ struct vm_area_struct *vma; ++ ++ anon_vma_lock(anon_vma); ++ list_for_each_entry(vmac, &anon_vma->head, ++ same_anon_vma) { ++ vma = vmac->vma; ++ address = get_rmap_addr(rmap_item); ++ ++ if (address < vma->vm_start || ++ address >= vma->vm_end) ++ continue; ++ /* ++ * Initially we examine only the vma which ++ * covers this rmap_item; but later, if there ++ * is still work to do, we examine covering ++ * vmas in other mms: in case they were forked ++ * from the original since ksmd passed. ++ */ ++ if ((rmap_item->slot->vma == vma) == ++ search_new_forks) ++ continue; ++ ++ ret = try_to_unmap_one(page, vma, ++ address, flags); ++ if (ret != SWAP_AGAIN || !page_mapped(page)) { ++ anon_vma_unlock(anon_vma); ++ goto out; ++ } ++ } ++ anon_vma_unlock(anon_vma); ++ } ++ } ++ if (!search_new_forks++) ++ goto again; ++out: ++ return ret; ++} ++ ++#ifdef CONFIG_MIGRATION ++int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, ++ struct vm_area_struct *, unsigned long, void *), void *arg) ++{ ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct hlist_node *hlist, *rmap_hlist; ++ struct rmap_item *rmap_item; ++ int ret = SWAP_AGAIN; ++ int search_new_forks = 0; ++ unsigned long address; ++ ++ VM_BUG_ON(!PageKsm(page)); ++ VM_BUG_ON(!PageLocked(page)); ++ ++ stable_node = page_stable_node(page); ++ if (!stable_node) ++ return ret; ++again: ++ hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, rmap_hlist, ++ &node_vma->rmap_hlist, hlist) { ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ struct anon_vma_chain *vmac; ++ struct vm_area_struct *vma; ++ ++ anon_vma_lock(anon_vma); ++ list_for_each_entry(vmac, &anon_vma->head, ++ same_anon_vma) { ++ vma = vmac->vma; ++ address = get_rmap_addr(rmap_item); ++ ++ if (address < vma->vm_start || ++ address >= vma->vm_end) ++ continue; ++ ++ if ((rmap_item->slot->vma == vma) == ++ search_new_forks) ++ continue; ++ ++ ret = rmap_one(page, vma, address, arg); ++ if (ret != SWAP_AGAIN) { ++ anon_vma_unlock(anon_vma); ++ goto out; ++ } ++ } ++ anon_vma_unlock(anon_vma); ++ } ++ } ++ if (!search_new_forks++) ++ goto again; ++out: ++ return ret; ++} ++ ++/* Common ksm interface but may be specific to uksm */ ++void ksm_migrate_page(struct page *newpage, struct page *oldpage) ++{ ++ struct stable_node *stable_node; ++ ++ VM_BUG_ON(!PageLocked(oldpage)); ++ 
VM_BUG_ON(!PageLocked(newpage)); ++ VM_BUG_ON(newpage->mapping != oldpage->mapping); ++ ++ stable_node = page_stable_node(newpage); ++ if (stable_node) { ++ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); ++ stable_node->kpfn = page_to_pfn(newpage); ++ } ++} ++#endif /* CONFIG_MIGRATION */ ++ ++#ifdef CONFIG_MEMORY_HOTREMOVE ++static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, ++ unsigned long end_pfn) ++{ ++ struct rb_node *node; ++ ++ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { ++ struct stable_node *stable_node; ++ ++ stable_node = rb_entry(node, struct stable_node, node); ++ if (stable_node->kpfn >= start_pfn && ++ stable_node->kpfn < end_pfn) ++ return stable_node; ++ } ++ return NULL; ++} ++ ++static int uksm_memory_callback(struct notifier_block *self, ++ unsigned long action, void *arg) ++{ ++ struct memory_notify *mn = arg; ++ struct stable_node *stable_node; ++ ++ switch (action) { ++ case MEM_GOING_OFFLINE: ++ /* ++ * Keep it very simple for now: just lock out ksmd and ++ * MADV_UNMERGEABLE while any memory is going offline. ++ * mutex_lock_nested() is necessary because lockdep was alarmed ++ * that here we take uksm_thread_mutex inside notifier chain ++ * mutex, and later take notifier chain mutex inside ++ * uksm_thread_mutex to unlock it. But that's safe because both ++ * are inside mem_hotplug_mutex. ++ */ ++ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); ++ break; ++ ++ case MEM_OFFLINE: ++ /* ++ * Most of the work is done by page migration; but there might ++ * be a few stable_nodes left over, still pointing to struct ++ * pages which have been offlined: prune those from the tree. ++ */ ++ while ((stable_node = uksm_check_stable_tree(mn->start_pfn, ++ mn->start_pfn + mn->nr_pages)) != NULL) ++ remove_node_from_stable_tree(stable_node, 1, 1); ++ /* fallthrough */ ++ ++ case MEM_CANCEL_OFFLINE: ++ mutex_unlock(&uksm_thread_mutex); ++ break; ++ } ++ return NOTIFY_OK; ++} ++#endif /* CONFIG_MEMORY_HOTREMOVE */ ++ ++#ifdef CONFIG_SYSFS ++/* ++ * This all compiles without CONFIG_SYSFS, but is a waste of space. 
++ */ ++ ++#define UKSM_ATTR_RO(_name) \ ++ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) ++#define UKSM_ATTR(_name) \ ++ static struct kobj_attribute _name##_attr = \ ++ __ATTR(_name, 0644, _name##_show, _name##_store) ++ ++static ssize_t max_cpu_percentage_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_max_cpu_percentage); ++} ++ ++static ssize_t max_cpu_percentage_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned long max_cpu_percentage; ++ int err; ++ ++ err = strict_strtoul(buf, 10, &max_cpu_percentage); ++ if (err || max_cpu_percentage > 100) ++ return -EINVAL; ++ ++ if (max_cpu_percentage == 100) ++ max_cpu_percentage = 99; ++ else if (max_cpu_percentage < 10) ++ max_cpu_percentage = 10; ++ ++ uksm_max_cpu_percentage = max_cpu_percentage; ++ ++ return count; ++} ++UKSM_ATTR(max_cpu_percentage); ++ ++static ssize_t sleep_millisecs_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); ++} ++ ++static ssize_t sleep_millisecs_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned long msecs; ++ int err; ++ ++ err = strict_strtoul(buf, 10, &msecs); ++ if (err || msecs > MSEC_PER_SEC) ++ return -EINVAL; ++ ++ uksm_sleep_jiffies = msecs_to_jiffies(msecs); ++ uksm_sleep_saved = uksm_sleep_jiffies; ++ ++ return count; ++} ++UKSM_ATTR(sleep_millisecs); ++ ++ ++static ssize_t cpu_governor_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); ++ int i; ++ ++ buf[0] = '\0'; ++ for (i = 0; i < n ; i++) { ++ if (uksm_cpu_governor == i) ++ strcat(buf, "["); ++ ++ strcat(buf, uksm_cpu_governor_str[i]); ++ ++ if (uksm_cpu_governor == i) ++ strcat(buf, "]"); ++ ++ strcat(buf, " "); ++ } ++ strcat(buf, "\n"); ++ ++ return strlen(buf); ++} ++ ++static inline void init_performance_values(void) ++{ ++ int i; ++ struct scan_rung *rung; ++ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; ++ ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = uksm_scan_ladder + i; ++ rung->cpu_ratio = preset->cpu_ratio[i]; ++ rung->cover_msecs = preset->cover_msecs[i]; ++ } ++ ++ uksm_max_cpu_percentage = preset->max_cpu; ++} ++ ++static ssize_t cpu_governor_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); ++ ++ for (n--; n >=0 ; n--) { ++ if (!strncmp(buf, uksm_cpu_governor_str[n], ++ strlen(uksm_cpu_governor_str[n]))) ++ break; ++ } ++ ++ if (n < 0) ++ return -EINVAL; ++ else ++ uksm_cpu_governor = n; ++ ++ init_performance_values(); ++ ++ return count; ++} ++UKSM_ATTR(cpu_governor); ++ ++static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_run); ++} ++ ++static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = strict_strtoul(buf, 10, &flags); ++ if (err || flags > UINT_MAX) ++ return -EINVAL; ++ if (flags > UKSM_RUN_MERGE) ++ return -EINVAL; ++ ++ mutex_lock(&uksm_thread_mutex); ++ if (uksm_run != flags) { ++ uksm_run = flags; ++ } ++ mutex_unlock(&uksm_thread_mutex); ++ ++ if (flags & UKSM_RUN_MERGE) ++ wake_up_interruptible(&uksm_thread_wait); ++ ++ 
return count; ++} ++UKSM_ATTR(run); ++ ++static ssize_t abundant_threshold_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_abundant_threshold); ++} ++ ++static ssize_t abundant_threshold_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = strict_strtoul(buf, 10, &flags); ++ if (err || flags > 99) ++ return -EINVAL; ++ ++ uksm_abundant_threshold = flags; ++ ++ return count; ++} ++UKSM_ATTR(abundant_threshold); ++ ++static ssize_t thrash_threshold_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_thrash_threshold); ++} ++ ++static ssize_t thrash_threshold_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = strict_strtoul(buf, 10, &flags); ++ if (err || flags > 99) ++ return -EINVAL; ++ ++ uksm_thrash_threshold = flags; ++ ++ return count; ++} ++UKSM_ATTR(thrash_threshold); ++ ++static ssize_t cpu_ratios_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int i, size; ++ struct scan_rung *rung; ++ char *p = buf; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ if (rung->cpu_ratio > 0) ++ size = sprintf(p, "%d ", rung->cpu_ratio); ++ else ++ size = sprintf(p, "MAX/%d ", ++ TIME_RATIO_SCALE / -rung->cpu_ratio); ++ ++ p += size; ++ } ++ ++ *p++ = '\n'; ++ *p = '\0'; ++ ++ return p - buf; ++} ++ ++static ssize_t cpu_ratios_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int i, cpuratios[SCAN_LADDER_SIZE], err; ++ unsigned long value; ++ struct scan_rung *rung; ++ char *p, *end = NULL; ++ ++ p = kzalloc(count, GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ memcpy(p, buf, count); ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ if (i != SCAN_LADDER_SIZE -1) { ++ end = strchr(p, ' '); ++ if (!end) ++ return -EINVAL; ++ ++ *end = '\0'; ++ } ++ ++ if (strstr(p, "MAX/")) { ++ p = strchr(p, '/') + 1; ++ err = strict_strtoul(p, 10, &value); ++ if (err || value > TIME_RATIO_SCALE || !value) ++ return -EINVAL; ++ ++ cpuratios[i] = - (int) (TIME_RATIO_SCALE / value); ++ } else { ++ err = strict_strtoul(p, 10, &value); ++ if (err || value > TIME_RATIO_SCALE || !value) ++ return -EINVAL; ++ ++ cpuratios[i] = value; ++ } ++ ++ p = end + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ rung->cpu_ratio = cpuratios[i]; ++ } ++ ++ return count; ++} ++UKSM_ATTR(cpu_ratios); ++ ++static ssize_t eval_intervals_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int i, size; ++ struct scan_rung *rung; ++ char *p = buf; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ size = sprintf(p, "%u ", rung->cover_msecs); ++ p += size; ++ } ++ ++ *p++ = '\n'; ++ *p = '\0'; ++ ++ return p - buf; ++} ++ ++static ssize_t eval_intervals_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int i, err; ++ unsigned long values[SCAN_LADDER_SIZE]; ++ struct scan_rung *rung; ++ char *p, *end = NULL; ++ ++ p = kzalloc(count, GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ memcpy(p, buf, count); ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ if (i != SCAN_LADDER_SIZE -1) { ++ end = strchr(p, ' '); ++ if (!end) ++ return -EINVAL; ++ ++ *end = '\0'; ++ } ++ ++ err = 
strict_strtoul(p, 10, &values[i]); ++ if (err) ++ return -EINVAL; ++ ++ p = end + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ rung->cover_msecs = values[i]; ++ } ++ ++ return count; ++} ++UKSM_ATTR(eval_intervals); ++ ++static ssize_t ema_per_page_time_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_ema_page_time); ++} ++UKSM_ATTR_RO(ema_per_page_time); ++ ++static ssize_t pages_shared_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_shared); ++} ++UKSM_ATTR_RO(pages_shared); ++ ++static ssize_t pages_sharing_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_sharing); ++} ++UKSM_ATTR_RO(pages_sharing); ++ ++static ssize_t pages_unshared_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_unshared); ++} ++UKSM_ATTR_RO(pages_unshared); ++ ++static ssize_t full_scans_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%llu\n", fully_scanned_round); ++} ++UKSM_ATTR_RO(full_scans); ++ ++static ssize_t pages_scanned_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ unsigned long base = 0; ++ u64 delta, ret; ++ ++ if (pages_scanned_stored) { ++ base = pages_scanned_base; ++ ret = pages_scanned_stored; ++ delta = uksm_pages_scanned >> base; ++ if (CAN_OVERFLOW_U64(ret, delta)) { ++ ret >>= 1; ++ delta >>= 1; ++ base++; ++ ret += delta; ++ } ++ } else { ++ ret = uksm_pages_scanned; ++ } ++ ++ while (ret > ULONG_MAX) { ++ ret >>= 1; ++ base++; ++ } ++ ++ if (base) ++ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); ++ else ++ return sprintf(buf, "%lu\n", (unsigned long)ret); ++} ++UKSM_ATTR_RO(pages_scanned); ++ ++static ssize_t hash_strength_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", hash_strength); ++} ++UKSM_ATTR_RO(hash_strength); ++ ++static ssize_t sleep_times_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%llu\n", uksm_sleep_times); ++} ++UKSM_ATTR_RO(sleep_times); ++ ++ ++static struct attribute *uksm_attrs[] = { ++ &max_cpu_percentage_attr.attr, ++ &sleep_millisecs_attr.attr, ++ &cpu_governor_attr.attr, ++ &run_attr.attr, ++ &ema_per_page_time_attr.attr, ++ &pages_shared_attr.attr, ++ &pages_sharing_attr.attr, ++ &pages_unshared_attr.attr, ++ &full_scans_attr.attr, ++ &pages_scanned_attr.attr, ++ &hash_strength_attr.attr, ++ &sleep_times_attr.attr, ++ &thrash_threshold_attr.attr, ++ &abundant_threshold_attr.attr, ++ &cpu_ratios_attr.attr, ++ &eval_intervals_attr.attr, ++ NULL, ++}; ++ ++static struct attribute_group uksm_attr_group = { ++ .attrs = uksm_attrs, ++ .name = "uksm", ++}; ++#endif /* CONFIG_SYSFS */ ++ ++static inline void init_scan_ladder(void) ++{ ++ int i; ++ struct scan_rung *rung; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = uksm_scan_ladder + i; ++ slot_tree_init_root(&rung->vma_root); ++ } ++ ++ init_performance_values(); ++ uksm_calc_scan_pages(); ++} ++ ++static inline int cal_positive_negative_costs(void) ++{ ++ struct page *p1, *p2; ++ unsigned char *addr1, *addr2; ++ unsigned long i, time_start, hash_cost; ++ unsigned long loopnum = 0; ++ ++ /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. 
*/ ++ volatile u32 hash; ++ volatile int ret; ++ ++ p1 = alloc_page(GFP_KERNEL); ++ if (!p1) ++ return -ENOMEM; ++ ++ p2 = alloc_page(GFP_KERNEL); ++ if (!p2) ++ return -ENOMEM; ++ ++ addr1 = kmap_atomic(p1, KM_USER0); ++ addr2 = kmap_atomic(p2, KM_USER1); ++ memset(addr1, random32(), PAGE_SIZE); ++ memcpy(addr2, addr1, PAGE_SIZE); ++ ++ /* make sure that the two pages differ in last byte */ ++ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; ++ kunmap_atomic(addr2, KM_USER1); ++ kunmap_atomic(addr1, KM_USER0); ++ ++ time_start = jiffies; ++ while (jiffies - time_start < 100) { ++ for (i = 0; i < 100; i++) ++ hash = page_hash(p1, HASH_STRENGTH_FULL, 0); ++ loopnum += 100; ++ } ++ hash_cost = (jiffies - time_start); ++ ++ time_start = jiffies; ++ for (i = 0; i < loopnum; i++) ++ ret = pages_identical(p1, p2); ++ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); ++ memcmp_cost /= hash_cost; ++ printk(KERN_INFO "UKSM: relative memcmp_cost = %lu " ++ "hash=%u cmp_ret=%d.\n", ++ memcmp_cost, hash, ret); ++ ++ __free_page(p1); ++ __free_page(p2); ++ return 0; ++} ++ ++static int init_zeropage_hash_table(void) ++{ ++ struct page *page; ++ char *addr; ++ int i; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ addr = kmap_atomic(page, KM_USER0); ++ memset(addr, 0, PAGE_SIZE); ++ kunmap_atomic(addr, KM_USER0); ++ ++ zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32), ++ GFP_KERNEL); ++ if (!zero_hash_table) ++ return -ENOMEM; ++ ++ for (i = 0; i < HASH_STRENGTH_MAX; i++) ++ zero_hash_table[i] = page_hash(page, i, 0); ++ ++ __free_page(page); ++ ++ return 0; ++} ++ ++static inline int init_random_sampling(void) ++{ ++ unsigned long i; ++ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); ++ if (!random_nums) ++ return -ENOMEM; ++ ++ for (i = 0; i < HASH_STRENGTH_FULL; i++) ++ random_nums[i] = i; ++ ++ for (i = 0; i < HASH_STRENGTH_FULL; i++) { ++ unsigned long rand_range, swap_index, tmp; ++ ++ rand_range = HASH_STRENGTH_FULL - i; ++ swap_index = i + random32() % rand_range; ++ tmp = random_nums[i]; ++ random_nums[i] = random_nums[swap_index]; ++ random_nums[swap_index] = tmp; ++ } ++ ++ rshash_state.state = RSHASH_NEW; ++ rshash_state.below_count = 0; ++ rshash_state.lookup_window_index = 0; ++ ++ return cal_positive_negative_costs(); ++} ++ ++static int __init uksm_slab_init(void) ++{ ++ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); ++ if (!rmap_item_cache) ++ goto out; ++ ++ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); ++ if (!stable_node_cache) ++ goto out_free1; ++ ++ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); ++ if (!node_vma_cache) ++ goto out_free2; ++ ++ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); ++ if (!vma_slot_cache) ++ goto out_free3; ++ ++ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); ++ if (!tree_node_cache) ++ goto out_free4; ++ ++ return 0; ++ ++out_free4: ++ kmem_cache_destroy(vma_slot_cache); ++out_free3: ++ kmem_cache_destroy(node_vma_cache); ++out_free2: ++ kmem_cache_destroy(stable_node_cache); ++out_free1: ++ kmem_cache_destroy(rmap_item_cache); ++out: ++ return -ENOMEM; ++} ++ ++static void __init uksm_slab_free(void) ++{ ++ kmem_cache_destroy(stable_node_cache); ++ kmem_cache_destroy(rmap_item_cache); ++ kmem_cache_destroy(node_vma_cache); ++ kmem_cache_destroy(vma_slot_cache); ++ kmem_cache_destroy(tree_node_cache); ++} ++ ++/* Common interface to ksm, different to it. 
*/ ++int ksm_madvise(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, int advice, unsigned long *vm_flags) ++{ ++ int err; ++ ++ switch (advice) { ++ case MADV_MERGEABLE: ++ return 0; /* just ignore the advice */ ++ ++ case MADV_UNMERGEABLE: ++ if (!(*vm_flags & VM_MERGEABLE)) ++ return 0; /* just ignore the advice */ ++ ++ if (vma->anon_vma) { ++ err = unmerge_uksm_pages(vma, start, end); ++ if (err) ++ return err; ++ } ++ ++ uksm_remove_vma(vma); ++ *vm_flags &= ~VM_MERGEABLE; ++ break; ++ } ++ ++ return 0; ++} ++ ++/* Common interface to ksm, actually the same. */ ++struct page *ksm_does_need_to_copy(struct page *page, ++ struct vm_area_struct *vma, unsigned long address) ++{ ++ struct page *new_page; ++ ++ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); ++ if (new_page) { ++ copy_user_highpage(new_page, page, address, vma); ++ ++ SetPageDirty(new_page); ++ __SetPageUptodate(new_page); ++ SetPageSwapBacked(new_page); ++ __set_page_locked(new_page); ++ ++ if (page_evictable(new_page, vma)) ++ lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); ++ else ++ add_page_to_unevictable_list(new_page); ++ } ++ ++ return new_page; ++} ++ ++static int __init uksm_init(void) ++{ ++ struct task_struct *uksm_thread; ++ int err; ++ ++ uksm_sleep_jiffies = msecs_to_jiffies(100); ++ uksm_sleep_saved = uksm_sleep_jiffies; ++ ++ slot_tree_init(); ++ init_scan_ladder(); ++ ++ ++ err = init_random_sampling(); ++ if (err) ++ goto out_free2; ++ ++ err = uksm_slab_init(); ++ if (err) ++ goto out_free1; ++ ++ err = init_zeropage_hash_table(); ++ if (err) ++ goto out_free0; ++ ++ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); ++ if (IS_ERR(uksm_thread)) { ++ printk(KERN_ERR "uksm: creating kthread failed\n"); ++ err = PTR_ERR(uksm_thread); ++ goto out_free; ++ } ++ ++#ifdef CONFIG_SYSFS ++ err = sysfs_create_group(mm_kobj, &uksm_attr_group); ++ if (err) { ++ printk(KERN_ERR "uksm: register sysfs failed\n"); ++ kthread_stop(uksm_thread); ++ goto out_free; ++ } ++#else ++ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ ++ ++#endif /* CONFIG_SYSFS */ ++ ++#ifdef CONFIG_MEMORY_HOTREMOVE ++ /* ++ * Choose a high priority since the callback takes uksm_thread_mutex: ++ * later callbacks could only be taking locks which nest within that. 
++ */ ++ hotplug_memory_notifier(uksm_memory_callback, 100); ++#endif ++ return 0; ++ ++out_free: ++ kfree(zero_hash_table); ++out_free0: ++ uksm_slab_free(); ++out_free1: ++ kfree(random_nums); ++out_free2: ++ kfree(uksm_scan_ladder); ++ return err; ++} ++ ++#ifdef MODULE ++module_init(uksm_init) ++#else ++late_initcall(uksm_init); ++#endif ++ +diff --git a/mm/vmstat.c b/mm/vmstat.c +index 8fd603b..63d43d9 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -719,6 +719,9 @@ const char * const vmstat_text[] = { + "numa_other", + #endif + "nr_anon_transparent_hugepages", ++#ifdef CONFIG_UKSM ++ "nr_uksm_zero_pages", ++#endif + "nr_dirty_threshold", + "nr_dirty_background_threshold", + diff --git a/3.2.34/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-VL.patch b/3.2.34/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-VL.patch new file mode 100644 index 0000000..a26d2b1 --- /dev/null +++ b/3.2.34/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-VL.patch @@ -0,0 +1,381 @@ +diff -uNr linux-3.2.33-go.orig/arch/arm/mach-kirkwood/Kconfig linux-3.2.33-go/arch/arm/mach-kirkwood/Kconfig +--- linux-3.2.33-go.orig/arch/arm/mach-kirkwood/Kconfig 2012-11-14 21:20:22.326388580 +0100 ++++ linux-3.2.33-go/arch/arm/mach-kirkwood/Kconfig 2012-11-14 21:21:02.353908681 +0100 +@@ -136,6 +136,12 @@ + Say 'Y' here if you want your kernel to support the + Buffalo LS-XHL Series. + ++config MACH_LSVL ++ bool "Buffalo LS-VL Series" ++ help ++ Say 'Y' here if you want your kernel to support the ++ Buffalo LS-VL Series. ++ + endmenu + + endif +diff -uNr linux-3.2.33-go.orig/arch/arm/mach-kirkwood/lsvl-setup.c linux-3.2.33-go/arch/arm/mach-kirkwood/lsvl-setup.c +--- linux-3.2.33-go.orig/arch/arm/mach-kirkwood/lsvl-setup.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/arch/arm/mach-kirkwood/lsvl-setup.c 2012-11-14 21:22:54.158568343 +0100 +@@ -0,0 +1,340 @@ ++/* ++ * arch/arm/mach-kirkwood/lsvl-setup.c ++ * ++ * Buffalo LS-VL Series Setup ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * License version 2. This program is licensed "as is" without any ++ * warranty of any kind, whether express or implied. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "common.h" ++#include "mpp.h" ++ ++/***************************************************************************** ++ * 512KB SPI Flash on BOOT Device ++ ****************************************************************************/ ++static struct mtd_partition lsvl_partitions[] = { ++ { ++ .name = "u-boot", ++ .size = 0x80000, ++ .offset = 0x00000, ++ .mask_flags = MTD_WRITEABLE, /* force read-only */ ++ } ++}; ++ ++static struct flash_platform_data lsvl_spi_slave_data = { ++ .type = "m25p40-nonjedec", ++ .parts = lsvl_partitions, ++ .nr_parts = ARRAY_SIZE(lsvl_partitions), ++}; ++ ++static struct spi_board_info __initdata lsvl_spi_slave_info[] = { ++ { ++ .modalias = "m25p80", ++ .platform_data = &lsvl_spi_slave_data, ++ .irq = -1, ++ .max_speed_hz = 20000000, ++ .bus_num = 0, ++ .chip_select = 0, ++ } ++}; ++ ++/***************************************************************************** ++ * Ethernet ++ ****************************************************************************/ ++static struct mv643xx_eth_platform_data lsvl_ge00_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(0), ++}; ++ ++/***************************************************************************** ++ * SATA ++ ****************************************************************************/ ++static struct mv_sata_platform_data lsvl_sata_data = { ++ .n_ports = 1, ++}; ++ ++/***************************************************************************** ++ * LEDs attached to GPIO ++ ****************************************************************************/ ++#define LSVL_GPIO_LED_ALARM 36 ++#define LSVL_GPIO_LED_FUNC_RED 37 ++#define LSVL_GPIO_LED_INFO 38 ++#define LSVL_GPIO_LED_FUNC_BLUE 39 ++#define LSVL_GPIO_LED_PWR 40 ++ ++static struct gpio_led lsvl_led_pins[] = { ++ { ++ .name = "alarm:red", ++ .gpio = LSVL_GPIO_LED_ALARM, ++ }, ++ { ++ .name = "func:red:bottom", ++ .gpio = LSVL_GPIO_LED_FUNC_RED, ++ }, ++ { ++ .name = "info:amber", ++ .gpio = LSVL_GPIO_LED_INFO, ++ }, ++ { ++ .name = "func:blue:bottom", ++ .gpio = LSVL_GPIO_LED_FUNC_BLUE, ++ }, ++ ++ { ++ .name = "power:blue", ++ .default_trigger = "default-on", ++ .gpio = LSVL_GPIO_LED_PWR, ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_led_platform_data lsvl_led_data = { ++ .leds = lsvl_led_pins, ++ .num_leds = ARRAY_SIZE(lsvl_led_pins), ++}; ++ ++static struct platform_device lsvl_leds = { ++ .name = "leds-gpio", ++ .id = -1, ++ .dev = { ++ .platform_data = &lsvl_led_data, ++ } ++}; ++ ++/***************************************************************************** ++ * General Setup ++ ****************************************************************************/ ++#define LSVL_GPIO_HDD_POWER 8 ++#define LSVL_GPIO_USB_POWER 12 ++ ++/***************************************************************************** ++ * GPIO Attached Keys ++ ****************************************************************************/ ++/*#define LSVL_GPIO_KEY_FUNC 45 ++#define LSVL_GPIO_KEY_POWER 46 ++#define LSVL_GPIO_KEY_AUTOPOWER 47 ++#define LSVL_SW_POWER 0x00 ++#define LSVL_SW_AUTOPOWER 0x01 ++#define LSVL_SW_FUNC 0x02 ++ ++static struct gpio_keys_button lsvl_buttons[] = { ++ { ++ .type = EV_SW, ++ .code = LSVL_SW_POWER, ++ .gpio = LSVL_GPIO_KEY_POWER, ++ .desc = "Power-on Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSVL_SW_AUTOPOWER, 
++ .gpio = LSVL_GPIO_KEY_AUTOPOWER, ++ .desc = "Power-auto Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSVL_SW_FUNC, ++ .gpio = LSVL_GPIO_KEY_FUNC, ++ .desc = "Function Button", ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_keys_platform_data lsvl_button_data = { ++ .buttons = lsvl_buttons, ++ .nbuttons = ARRAY_SIZE(lsvl_buttons), ++}; ++ ++static struct platform_device lsvl_button_device = { ++ .name = "gpio-keys", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lsvl_button_data, ++ }, ++}; ++*/ ++ ++/***************************************************************************** ++ * GPIO Fan ++ ****************************************************************************/ ++#define LSVL_GPIO_FAN_HIGH 16 ++#define LSVL_GPIO_FAN_LOW 17 ++#define LSVL_GPIO_FAN_LOCK 43 ++ ++static struct gpio_fan_alarm lsvl_alarm = { ++ .gpio = LSVL_GPIO_FAN_LOCK, ++}; ++ ++static struct gpio_fan_speed lsvl_speeds[] = { ++ { ++ .rpm = 0, ++ .ctrl_val = 3, ++ }, { ++ .rpm = 1500, ++ .ctrl_val = 1, ++ }, { ++ .rpm = 3250, ++ .ctrl_val = 2, ++ }, { ++ .rpm = 5000, ++ .ctrl_val = 0, ++ } ++}; ++ ++static int lsvl_gpio_list[] = { ++ LSVL_GPIO_FAN_HIGH, LSVL_GPIO_FAN_LOW, ++}; ++ ++static struct gpio_fan_platform_data lsvl_fan_data = { ++ .num_ctrl = ARRAY_SIZE(lsvl_gpio_list), ++ .ctrl = lsvl_gpio_list, ++ .alarm = &lsvl_alarm, ++ .num_speed = ARRAY_SIZE(lsvl_speeds), ++ .speed = lsvl_speeds, ++}; ++ ++static struct platform_device lsvl_fan_device = { ++ .name = "gpio-fan", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lsvl_fan_data, ++ }, ++}; ++ ++/***************************************************************************** ++ * GPIO Data ++ ****************************************************************************/ ++ ++static unsigned int lsvl_mpp_config[] __initdata = { ++ MPP0_NF_IO2, ++ MPP1_NF_IO3, ++ MPP2_NF_IO4, ++ MPP3_NF_IO5, ++ MPP4_NF_IO6, ++ MPP5_NF_IO7, ++ MPP6_SYSRST_OUTn, ++ MPP7_SPI_SCn, ++ MPP8_GPIO, /* HDD Power */ ++ MPP9_GPIO, ++ MPP10_UART0_TXD, ++ MPP11_UART0_RXD, ++ MPP12_GPO, /* USB VBUS EN */ ++ MPP13_GPIO, ++ MPP14_GPIO, ++ MPP15_GPIO, ++ MPP16_GPIO, /* FAN HIGH: on:0, off:1 */ ++ MPP17_GPIO, /* FAN LOW: on:0, off:1 */ ++ MPP18_NF_IO0, ++ MPP19_NF_IO1, ++ MPP20_GPIO, ++ MPP21_GPIO, ++ MPP22_GPIO, ++ MPP23_GPIO, ++ MPP24_GPIO, ++ MPP25_GPIO, ++ MPP26_GPIO, ++ MPP27_GPIO, ++ MPP28_GPIO, ++ MPP29_GPIO, ++ MPP30_GPIO, ++ MPP31_GPIO, ++ MPP32_GPIO, ++ MPP33_GPO, ++ MPP34_GPIO, ++ MPP35_GPIO, ++ MPP36_GPIO, /* ALARM LED */ ++ MPP37_GPIO, /* FUNC RED LED */ ++ MPP38_GPIO, /* INFO LED */ ++ MPP39_GPIO, /* FUNC LED */ ++ MPP40_GPIO, /* POWER LED */ ++ MPP41_GPIO, ++ MPP42_GPIO, ++ MPP43_GPIO, /* FAN LOCK */ ++ MPP44_GPIO, ++ MPP45_GPIO, /* FUNC SW */ ++ MPP46_GPIO, /* POWER SW */ ++ MPP47_GPIO, /* POWER AUTO SW */ ++ MPP48_GPIO, /* UART EN */ ++ MPP49_GPIO, ++ 0 ++}; ++ ++/***************************************************************************** ++ * LS-VL specific power off method: reboot ++ ****************************************************************************/ ++/* ++ * On the LS-VL, the shutdown process is following: ++ * - Userland monitors key events until the power switch goes to off position ++ * - The board reboots ++ * - U-boot starts and goes into an idle mode waiting for the user ++ * to move the switch to ON position ++ * ++ */ ++ ++static void lsvl_power_off(void) ++{ ++ arm_machine_restart('h', NULL); ++} ++ ++static void __init lsvl_init(void) ++{ ++ /* ++ * Basic setup. 
Needs to be called early. ++ */ ++ kirkwood_init(); ++ kirkwood_mpp_conf(lsvl_mpp_config); ++ ++ /* ++ * Configure peripherals. ++ */ ++ kirkwood_uart0_init(); ++ kirkwood_ehci_init(); ++ kirkwood_ge00_init(&lsvl_ge00_data); ++ kirkwood_sata_init(&lsvl_sata_data); ++ kirkwood_spi_init(); ++ ++ platform_device_register(&lsvl_leds); ++// platform_device_register(&lsvl_button_device); ++ platform_device_register(&lsvl_fan_device); ++ ++ spi_register_board_info(lsvl_spi_slave_info, ++ ARRAY_SIZE(lsvl_spi_slave_info)); ++ ++ /* usb power on */ ++ gpio_set_value(LSVL_GPIO_USB_POWER, 1); ++ ++ /* register power-off method */ ++ pm_power_off = lsvl_power_off; ++ ++ pr_info("%s: finished\n", __func__); ++} ++ ++MACHINE_START(LSVL, "Buffalo LS-VL Series") ++ .atag_offset = 0x100, ++ .init_machine = lsvl_init, ++ .map_io = kirkwood_map_io, ++ .init_early = kirkwood_init_early, ++ .init_irq = kirkwood_init_irq, ++ .timer = &kirkwood_timer, ++MACHINE_END ++ +diff -uNr linux-3.2.33-go.orig/arch/arm/mach-kirkwood/Makefile linux-3.2.33-go/arch/arm/mach-kirkwood/Makefile +--- linux-3.2.33-go.orig/arch/arm/mach-kirkwood/Makefile 2012-11-14 21:20:22.326388580 +0100 ++++ linux-3.2.33-go/arch/arm/mach-kirkwood/Makefile 2012-11-14 21:22:20.882968794 +0100 +@@ -19,5 +19,6 @@ + obj-$(CONFIG_MACH_NET5BIG_V2) += netxbig_v2-setup.o lacie_v2-common.o + obj-$(CONFIG_MACH_T5325) += t5325-setup.o + obj-$(CONFIG_MACH_LSXHL) += lsxhl-setup.o ++obj-$(CONFIG_MACH_LSVL) += lsvl-setup.o + + obj-$(CONFIG_CPU_IDLE) += cpuidle.o +diff -uNr linux-3.2.33-go.orig/arch/arm/tools/mach-types linux-3.2.33-go/arch/arm/tools/mach-types +--- linux-3.2.33-go.orig/arch/arm/tools/mach-types 2012-11-14 21:20:22.348388327 +0100 ++++ linux-3.2.33-go/arch/arm/tools/mach-types 2012-11-14 21:21:02.356908648 +0100 +@@ -118,6 +118,7 @@ + omap_osk MACH_OMAP_OSK OMAP_OSK 515 + tosa MACH_TOSA TOSA 520 + avila MACH_AVILA AVILA 526 ++lsvl MACH_LSVL LSVL 5277 + edb9302 MACH_EDB9302 EDB9302 538 + husky MACH_HUSKY HUSKY 543 + shepherd MACH_SHEPHERD SHEPHERD 545 diff --git a/3.2.34/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-WVL.patch b/3.2.34/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-WVL.patch new file mode 100644 index 0000000..7d272d7 --- /dev/null +++ b/3.2.34/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-WVL.patch @@ -0,0 +1,538 @@ +diff -uNr linux-3.2.34-go.orig/arch/arm/mach-kirkwood/Kconfig linux-3.2.34-go/arch/arm/mach-kirkwood/Kconfig +--- linux-3.2.34-go.orig/arch/arm/mach-kirkwood/Kconfig 2012-11-19 21:03:42.654743005 +0100 ++++ linux-3.2.34-go/arch/arm/mach-kirkwood/Kconfig 2012-11-19 21:04:02.744505974 +0100 +@@ -148,6 +148,12 @@ + Say 'Y' here if you want your kernel to support the + Buffalo LS-CHLv2 Series. + ++config MACH_LSWVL ++ bool "Buffalo LS-WVL Series" ++ help ++ Say 'Y' here if you want your kernel to support the ++ Buffalo LS-WVL/E-AP NAS ++ + endmenu + + endif +diff -uNr linux-3.2.34-go.orig/arch/arm/mach-kirkwood/lswvl-setup.c linux-3.2.34-go/arch/arm/mach-kirkwood/lswvl-setup.c +--- linux-3.2.34-go.orig/arch/arm/mach-kirkwood/lswvl-setup.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-go/arch/arm/mach-kirkwood/lswvl-setup.c 2012-11-19 21:04:02.745505962 +0100 +@@ -0,0 +1,366 @@ ++/* ++ * arch/arm/mach-kirkwood/lswvl-setup.c ++ * ++ * Buffalo LS-WVL Series Setup ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * License version 2. This program is licensed "as is" without any ++ * warranty of any kind, whether express or implied. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "common.h" ++#include "mpp.h" ++ ++ ++/***************************************************************************** ++ * 512MB NAND Flash on Device bus CS0 ++ ****************************************************************************/ ++static struct mtd_partition lswvl_nand_parts[] = { ++ { ++ .name = "boot", ++ .offset = 0, ++ .size = 16 * 1024 * 1024, ++ }, { ++ .name = "rootfs", ++ .offset = MTDPART_OFS_NXTBLK, ++ .size = 488 * 1024 * 1024, ++ }, { ++ .name = "reserve", ++ .offset = MTDPART_OFS_NXTBLK, ++ .size = MTDPART_SIZ_FULL, ++ }, ++}; ++ ++/***************************************************************************** ++ * 512KB NOR Flash on BOOT Device ++ ****************************************************************************/ ++static struct mtd_partition lswvl_partitions[] = { ++ { ++ .name = "u-boot", ++ .size = 0x80000, ++ .offset = 0x00000, ++ .mask_flags = MTD_WRITEABLE, /* force read-only */ ++ }, ++}; ++ ++static struct flash_platform_data lswvl_spi_slave_data = { ++ .parts = lswvl_partitions, ++ .nr_parts = ARRAY_SIZE(lswvl_partitions), ++}; ++ ++static struct spi_board_info __initdata lswvl_spi_slave_info[] = { ++ { ++ .modalias = "m25p80", ++ .platform_data = &lswvl_spi_slave_data, ++ .irq = -1, ++ .max_speed_hz = 20000000, ++ .bus_num = 0, ++ .chip_select = 0, ++ }, ++}; ++ ++/***************************************************************************** ++ * Ethernet ++ ****************************************************************************/ ++static struct mv643xx_eth_platform_data lswvl_ge00_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(0), ++}; ++ ++/***************************************************************************** ++ * SATA ++ ****************************************************************************/ ++static struct mv_sata_platform_data lswvl_sata_data = { ++ .n_ports = 2, ++}; ++ ++/***************************************************************************** ++ * LEDs attached to GPIO ++ ****************************************************************************/ ++#define LSWVL_GPIO_LED_HDDERR0 34 ++#define LSWVL_GPIO_LED_HDDERR1 35 ++#define LSWVL_GPIO_LED_ALARM 36 ++#define LSWVL_GPIO_LED_FUNC_RED 37 ++#define LSWVL_GPIO_LED_INFO 38 ++#define LSWVL_GPIO_LED_FUNC_BLUE 39 ++#define LSWVL_GPIO_LED_PWR 40 ++ ++static struct gpio_led lswvl_led_pins[] = { ++ { ++ .name = "lswvl:hdderr:0", ++ .gpio = LSWVL_GPIO_LED_HDDERR0, ++ }, { ++ .name = "lswvl:hdderr:1", ++ .gpio = LSWVL_GPIO_LED_HDDERR1, ++ }, { ++ .name = "lswvl:alarm:red", ++ .gpio = LSWVL_GPIO_LED_ALARM, ++ }, { ++ .name = "lswvl:func:red", ++ .gpio = LSWVL_GPIO_LED_FUNC_RED, ++ }, { ++ .name = "lswvl:info:amber", ++ .gpio = LSWVL_GPIO_LED_INFO, ++ }, { ++ .name = "lswvl:func:blue", ++ .gpio = LSWVL_GPIO_LED_FUNC_BLUE, ++ }, { ++ .name = "lswvl:power:blue", ++ .default_trigger = "default-on", ++ .gpio = LSWVL_GPIO_LED_PWR, ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_led_platform_data lswvl_led_data = { ++ .leds = lswvl_led_pins, ++ .num_leds = ARRAY_SIZE(lswvl_led_pins), ++}; ++ ++static struct platform_device lswvl_leds = { ++ .name = "leds-gpio", ++ .id = -1, ++ .dev = { ++ .platform_data = &lswvl_led_data, ++ } ++}; ++ ++/***************************************************************************** ++ * General Setup ++ 
****************************************************************************/ ++#define LSWVL_GPIO_HDD0_POWER 8 ++#define LSWVL_GPIO_HDD1_POWER 9 ++#define LSWVL_GPIO_USB_POWER 12 ++ ++/***************************************************************************** ++ * GPIO Attached Keys ++ ****************************************************************************/ ++#define LSWVL_GPIO_KEY_FUNC 45 ++#define LSWVL_GPIO_KEY_POWER 46 ++#define LSWVL_GPIO_KEY_AUTOPOWER 47 ++#define LSWVL_SW_POWER 0x00 ++#define LSWVL_SW_AUTOPOWER 0x01 ++#define LSWVL_SW_FUNC 0x02 ++ ++static struct gpio_keys_button lswvl_buttons[] = { ++ { ++ .type = EV_KEY, ++ .code = BTN_1, ++ .gpio = LSWVL_GPIO_KEY_POWER, ++ .desc = "power-on", ++ .active_low = 1, ++ }, { ++ .type = EV_KEY, ++ .code = BTN_2, ++ .gpio = LSWVL_GPIO_KEY_AUTOPOWER, ++ .desc = "power-auto", ++ .active_low = 1, ++ }, { ++ .type = EV_KEY, ++ .code = BTN_0, ++ .gpio = LSWVL_GPIO_KEY_FUNC, ++ .desc = "function", ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_keys_platform_data lswvl_button_data = { ++ .buttons = lswvl_buttons, ++ .nbuttons = ARRAY_SIZE(lswvl_buttons), ++}; ++ ++static struct platform_device lswvl_button_device = { ++ .name = "gpio-keys", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lswvl_button_data, ++ }, ++}; ++ ++/***************************************************************************** ++ * GPIO Fan ++ ****************************************************************************/ ++#define LSWVL_GPIO_FAN_HIGH 16 ++#define LSWVL_GPIO_FAN_LOW 17 ++#define LSWVL_GPIO_FAN_LOCK 43 ++ ++static struct gpio_fan_alarm lswvl_alarm = { ++ .gpio = LSWVL_GPIO_FAN_LOCK, ++}; ++ ++static struct gpio_fan_speed lswvl_speeds[] = { ++ { ++ .rpm = 0, ++ .ctrl_val = 3, ++ }, { ++ .rpm = 1500, ++ .ctrl_val = 1, ++ }, { ++ .rpm = 3250, ++ .ctrl_val = 2, ++ }, { ++ .rpm = 5000, ++ .ctrl_val = 0, ++ } ++}; ++ ++static int lswvl_gpio_list[] = { ++ LSWVL_GPIO_FAN_HIGH, LSWVL_GPIO_FAN_LOW, ++}; ++ ++static struct gpio_fan_platform_data lswvl_fan_data = { ++ .num_ctrl = ARRAY_SIZE(lswvl_gpio_list), ++ .ctrl = lswvl_gpio_list, ++ .alarm = &lswvl_alarm, ++ .num_speed = ARRAY_SIZE(lswvl_speeds), ++ .speed = lswvl_speeds, ++}; ++ ++static struct platform_device lswvl_fan_device = { ++ .name = "gpio-fan", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lswvl_fan_data, ++ }, ++}; ++ ++/***************************************************************************** ++ * GPIO Data ++ ****************************************************************************/ ++ ++static unsigned int lswvl_mpp_config[] __initdata = { ++ MPP0_NF_IO2, ++ MPP1_NF_IO3, ++ MPP2_NF_IO4, ++ MPP3_NF_IO5, ++ MPP4_NF_IO6, ++ MPP5_NF_IO7, ++ MPP6_SYSRST_OUTn, ++ MPP7_SPI_SCn, ++ MPP8_GPIO, /* HDD Power */ ++ MPP9_GPIO, /* HDD Power */ ++ MPP10_UART0_TXD, ++ MPP11_UART0_RXD, ++ MPP12_GPO, /* USB VBUS EN */ ++ MPP13_GPIO, ++ MPP14_GPIO, ++ MPP15_GPIO, ++ MPP16_GPIO, /* FAN HIGH: on:0, off:1 */ ++ MPP17_GPIO, /* FAN LOW: on:0, off:1 */ ++ MPP18_NF_IO0, ++ MPP19_NF_IO1, ++ MPP20_GPIO, ++ MPP21_GPIO, ++ MPP22_GPIO, ++ MPP23_GPIO, ++ MPP24_GPIO, ++ MPP25_GPIO, ++ MPP26_GPIO, ++ MPP27_GPIO, ++ MPP28_GPIO, ++ MPP29_GPIO, ++ MPP30_GPIO, ++ MPP31_GPIO, ++ MPP32_GPIO, ++ MPP33_GPO, ++ MPP34_GPIO, /*HDD ERROR LED 0*/ ++ MPP35_GPIO, /*HDD ERROR LED 1*/ ++ MPP36_GPIO, /* ALARM LED */ ++ MPP37_GPIO, /* FUNC RED LED */ ++ MPP38_GPIO, /* INFO LED */ ++ MPP39_GPIO, /* FUNC LED */ ++ MPP40_GPIO, /* POWER LED */ ++ MPP41_GPIO, ++ MPP42_GPIO, ++ MPP43_GPIO, 
/* FAN LOCK */ ++ MPP44_GPIO, ++ MPP45_GPIO, /* FUNC SW */ ++ MPP46_GPIO, /* POWER SW */ ++ MPP47_GPIO, /* POWER AUTO SW */ ++ MPP48_GPIO, /* UART EN */ ++ MPP49_GPIO, ++ 0 ++}; ++ ++/***************************************************************************** ++ * LS-WVL specific power off method: reboot ++ ****************************************************************************/ ++/* ++ * On the LS-WVL, the shutdown process is following: ++ * - Userland monitors key events until the power switch goes to off position ++ * - The board reboots ++ * - U-boot starts and goes into an idle mode waiting for the user ++ * to move the switch to ON position ++ * ++ */ ++ ++static void lswvl_power_off(void) ++{ ++ arm_machine_restart('h', NULL); //kirkwood_restart('h', NULL); ++} ++ ++static void __init lswvl_init(void) ++{ ++ /* ++ * Basic setup. Needs to be called early. ++ */ ++ kirkwood_init(); ++ kirkwood_mpp_conf(lswvl_mpp_config); ++ ++ /* ++ * Configure peripherals. ++ */ ++ kirkwood_ge00_init(&lswvl_ge00_data); ++ kirkwood_uart0_init(); ++ kirkwood_uart1_init(); ++ kirkwood_ehci_init(); ++ kirkwood_sata_init(&lswvl_sata_data); ++ ++ spi_register_board_info(lswvl_spi_slave_info, ++ ARRAY_SIZE(lswvl_spi_slave_info)); ++ kirkwood_spi_init(); ++ kirkwood_nand_init(ARRAY_AND_SIZE(lswvl_nand_parts), 25); ++ ++ platform_device_register(&lswvl_leds); ++ platform_device_register(&lswvl_button_device); ++ platform_device_register(&lswvl_fan_device); ++ ++ /* usb power on */ ++ gpio_set_value(LSWVL_GPIO_USB_POWER, 1); ++ ++ /* register power-off method */ ++ pm_power_off = lswvl_power_off; ++ ++ pr_info("%s: finished\n", __func__); ++} ++ ++MACHINE_START(LSWVL, "Buffalo LS-WVL Series") ++ .atag_offset = 0x100, ++ .map_io = kirkwood_map_io, ++ .init_early = kirkwood_init_early, ++ .init_irq = kirkwood_init_irq, ++ .timer = &kirkwood_timer, ++ .init_machine = lswvl_init, ++ // .restart = kirkwood_restart, ++MACHINE_END ++ +diff -uNr linux-3.2.34-go.orig/arch/arm/mach-kirkwood/Makefile linux-3.2.34-go/arch/arm/mach-kirkwood/Makefile +--- linux-3.2.34-go.orig/arch/arm/mach-kirkwood/Makefile 2012-11-19 21:03:42.653743017 +0100 ++++ linux-3.2.34-go/arch/arm/mach-kirkwood/Makefile 2012-11-19 21:04:42.686036907 +0100 +@@ -21,5 +21,6 @@ + obj-$(CONFIG_MACH_LINKSTATION_CHLV2) += lschlv2-setup.o + obj-$(CONFIG_MACH_LSXHL) += lsxhl-setup.o + obj-$(CONFIG_MACH_LSVL) += lsvl-setup.o ++obj-$(CONFIG_MACH_LSWVL) += lswvl-setup.o + + obj-$(CONFIG_CPU_IDLE) += cpuidle.o +diff -uNr linux-3.2.34-go.orig/arch/arm/plat-orion/mpp.c linux-3.2.34-go/arch/arm/plat-orion/mpp.c +--- linux-3.2.34-go.orig/arch/arm/plat-orion/mpp.c 2012-11-19 21:03:42.766741717 +0100 ++++ linux-3.2.34-go/arch/arm/plat-orion/mpp.c 2012-11-19 21:04:02.747505938 +0100 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + + /* Address of the ith MPP control register */ + static __init unsigned long mpp_ctrl_addr(unsigned int i, +@@ -75,3 +76,37 @@ + } + printk("\n"); + } ++ ++#ifdef CONFIG_MACH_LSWVL ++ ++static u32 boot_mpp_value = 0x21111111; ++/* ++ * change MPP[3:1] to SPI mode ++ */ ++void lswvl_setup_spi_mpp(void) ++{ ++ u32 spival = 0; ++ u32 bootval = 0; ++ ++ spival = 0x00002220; ++ boot_mpp_value = bootval = readl(mpp_ctrl_addr(0, DEV_BUS_VIRT_BASE)); ++ bootval &= 0xffff000f; ++ writel(spival | bootval, mpp_ctrl_addr(0, DEV_BUS_VIRT_BASE)); ++} ++ ++/* ++ * change back MPP[3:1] to default configuration ++ */ ++void lswvl_reset_mpp(void) ++{ ++ u32 spival = 0; ++ u32 bootval = 0; ++ ++ spival = readl(mpp_ctrl_addr(0, 
DEV_BUS_VIRT_BASE)); ++ spival &= 0xffff000f; ++ bootval = boot_mpp_value & ~0xffff000f; ++ writel(spival | bootval, mpp_ctrl_addr(0, DEV_BUS_VIRT_BASE)); ++} ++ ++#endif ++ +diff -uNr linux-3.2.34-go.orig/arch/arm/tools/mach-types linux-3.2.34-go/arch/arm/tools/mach-types +--- linux-3.2.34-go.orig/arch/arm/tools/mach-types 2012-11-19 21:03:42.675742765 +0100 ++++ linux-3.2.34-go/arch/arm/tools/mach-types 2012-11-19 21:22:29.653445807 +0100 +@@ -119,6 +119,7 @@ + tosa MACH_TOSA TOSA 520 + avila MACH_AVILA AVILA 526 + lsvl MACH_LSVL LSVL 5277 ++lswvl MACH_LSWVL LSWVL 5278 + edb9302 MACH_EDB9302 EDB9302 538 + husky MACH_HUSKY HUSKY 543 + shepherd MACH_SHEPHERD SHEPHERD 545 +diff -uNr linux-3.2.34-go.orig/drivers/spi/spi-orion.c linux-3.2.34-go/drivers/spi/spi-orion.c +--- linux-3.2.34-go.orig/drivers/spi/spi-orion.c 2012-11-19 21:03:41.809752734 +0100 ++++ linux-3.2.34-go/drivers/spi/spi-orion.c 2012-11-19 21:20:55.123558883 +0100 +@@ -19,6 +19,12 @@ + #include + #include + #include ++#include ++ ++#ifdef CONFIG_MACH_LSWVL ++void lswvl_setup_spi_mpp(void); ++void lswvl_reset_mpp(void); ++#endif + + #define DRIVER_NAME "orion_spi" + +@@ -141,6 +147,9 @@ + unsigned int bits_per_word = spi->bits_per_word; + int rc; + ++#ifdef CONFIG_MACH_LSWVL ++ lswvl_setup_spi_mpp(); ++#endif + orion_spi = spi_master_get_devdata(spi->master); + + if ((t != NULL) && t->speed_hz) +@@ -153,15 +162,37 @@ + if (rc) + return rc; + ++#ifdef CONFIG_MACH_LSWVL ++ rc = orion_spi_set_transfer_size(orion_spi, bits_per_word); ++ lswvl_reset_mpp(); ++ return rc; ++#else + return orion_spi_set_transfer_size(orion_spi, bits_per_word); ++#endif + } + + static void orion_spi_set_cs(struct orion_spi *orion_spi, int enable) + { + if (enable) ++#ifdef CONFIG_MACH_LSWVL ++ { ++ lswvl_setup_spi_mpp(); ++ udelay(1); ++ orion_spi_setbits(orion_spi, ORION_SPI_IF_CTRL_REG, 0x1); ++ } ++#else + orion_spi_setbits(orion_spi, ORION_SPI_IF_CTRL_REG, 0x1); ++#endif + else + orion_spi_clrbits(orion_spi, ORION_SPI_IF_CTRL_REG, 0x1); ++#ifdef CONFIG_MACH_LSWVL ++ { ++ orion_spi_clrbits(orion_spi, ORION_SPI_IF_CTRL_REG, 0x1); ++ lswvl_reset_mpp(); ++ } ++#else ++ orion_spi_clrbits(orion_spi, ORION_SPI_IF_CTRL_REG, 0x1); ++#endif + } + + static inline int orion_spi_wait_till_ready(struct orion_spi *orion_spi) +@@ -361,8 +392,17 @@ + + /* Fix ac timing if required. 
*/ + if (orion_spi->spi_info->enable_clock_fix) ++#ifdef CONFIG_MACH_LSWVL ++ { ++ lswvl_setup_spi_mpp(); ++ orion_spi_setbits(orion_spi, ORION_SPI_IF_CONFIG_REG, ++ (1 << 14)); ++ lswvl_reset_mpp(); ++ } ++#else + orion_spi_setbits(orion_spi, ORION_SPI_IF_CONFIG_REG, + (1 << 14)); ++#endif + + if ((spi->max_speed_hz == 0) + || (spi->max_speed_hz > orion_spi->max_speed)) diff --git a/3.2.34/vserver-3.2.34-vs2.3.2.15.patch b/3.2.34/vserver-3.2.34-vs2.3.2.15.patch new file mode 100644 index 0000000..7fdd459 --- /dev/null +++ b/3.2.34/vserver-3.2.34-vs2.3.2.15.patch @@ -0,0 +1,26125 @@ +diff -NurpP --minimal linux-3.2.34/Documentation/vserver/debug.txt linux-3.2.34-vs2.3.2.15/Documentation/vserver/debug.txt +--- linux-3.2.34/Documentation/vserver/debug.txt 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/Documentation/vserver/debug.txt 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,154 @@ ++ ++debug_cvirt: ++ ++ 2 4 "vx_map_tgid: %p/%llx: %d -> %d" ++ "vx_rmap_tgid: %p/%llx: %d -> %d" ++ ++debug_dlim: ++ ++ 0 1 "ALLOC (%p,#%d)%c inode (%d)" ++ "FREE (%p,#%d)%c inode" ++ 1 2 "ALLOC (%p,#%d)%c %lld bytes (%d)" ++ "FREE (%p,#%d)%c %lld bytes" ++ 2 4 "ADJUST: %lld,%lld on %ld,%ld [mult=%d]" ++ 3 8 "ext3_has_free_blocks(%p): %lu<%lu+1, %c, %u!=%u r=%d" ++ "ext3_has_free_blocks(%p): free=%lu, root=%lu" ++ "rcu_free_dl_info(%p)" ++ 4 10 "alloc_dl_info(%p,%d) = %p" ++ "dealloc_dl_info(%p)" ++ "get_dl_info(%p[#%d.%d])" ++ "put_dl_info(%p[#%d.%d])" ++ 5 20 "alloc_dl_info(%p,%d)*" ++ 6 40 "__hash_dl_info: %p[#%d]" ++ "__unhash_dl_info: %p[#%d]" ++ 7 80 "locate_dl_info(%p,#%d) = %p" ++ ++debug_misc: ++ ++ 0 1 "destroy_dqhash: %p [#0x%08x] c=%d" ++ "new_dqhash: %p [#0x%08x]" ++ "vroot[%d]_clr_dev: dev=%p[%lu,%d:%d]" ++ "vroot[%d]_get_real_bdev: dev=%p[%lu,%d:%d]" ++ "vroot[%d]_set_dev: dev=%p[%lu,%d:%d]" ++ "vroot_get_real_bdev not set" ++ 1 2 "cow_break_link(»%s«)" ++ "temp copy »%s«" ++ 2 4 "dentry_open(new): %p" ++ "dentry_open(old): %p" ++ "lookup_create(new): %p" ++ "old path »%s«" ++ "path_lookup(old): %d" ++ "vfs_create(new): %d" ++ "vfs_rename: %d" ++ "vfs_sendfile: %d" ++ 3 8 "fput(new_file=%p[#%d])" ++ "fput(old_file=%p[#%d])" ++ 4 10 "vx_info_kill(%p[#%d],%d,%d) = %d" ++ "vx_info_kill(%p[#%d],%d,%d)*" ++ 5 20 "vs_reboot(%p[#%d],%d)" ++ 6 40 "dropping task %p[#%u,%u] for %p[#%u,%u]" ++ ++debug_net: ++ ++ 2 4 "nx_addr_conflict(%p,%p) %d.%d,%d.%d" ++ 3 8 "inet_bind(%p) %d.%d.%d.%d, %d.%d.%d.%d, %d.%d.%d.%d" ++ "inet_bind(%p)* %p,%p;%lx %d.%d.%d.%d" ++ 4 10 "ip_route_connect(%p) %p,%p;%lx" ++ 5 20 "__addr_in_socket(%p,%d.%d.%d.%d) %p:%d.%d.%d.%d %p;%lx" ++ 6 40 "sk,egf: %p [#%d] (from %d)" ++ "sk,egn: %p [#%d] (from %d)" ++ "sk,req: %p [#%d] (from %d)" ++ "sk: %p [#%d] (from %d)" ++ "tw: %p [#%d] (from %d)" ++ 7 80 "__sock_recvmsg: %p[%p,%p,%p;%d]:%d/%d" ++ "__sock_sendmsg: %p[%p,%p,%p;%d]:%d/%d" ++ ++debug_nid: ++ ++ 0 1 "__lookup_nx_info(#%u): %p[#%u]" ++ "alloc_nx_info(%d) = %p" ++ "create_nx_info(%d) (dynamic rejected)" ++ "create_nx_info(%d) = %p (already there)" ++ "create_nx_info(%d) = %p (new)" ++ "dealloc_nx_info(%p)" ++ 1 2 "alloc_nx_info(%d)*" ++ "create_nx_info(%d)*" ++ 2 4 "get_nx_info(%p[#%d.%d])" ++ "put_nx_info(%p[#%d.%d])" ++ 3 8 "claim_nx_info(%p[#%d.%d.%d]) %p" ++ "clr_nx_info(%p[#%d.%d])" ++ "init_nx_info(%p[#%d.%d])" ++ "release_nx_info(%p[#%d.%d.%d]) %p" ++ "set_nx_info(%p[#%d.%d])" ++ 4 10 "__hash_nx_info: %p[#%d]" ++ "__nx_dynamic_id: [#%d]" ++ "__unhash_nx_info: %p[#%d.%d.%d]" ++ 5 20 "moved task %p into nxi:%p[#%d]" ++ "nx_migrate_task(%p,%p[#%d.%d.%d])" 
++ "task_get_nx_info(%p)" ++ 6 40 "nx_clear_persistent(%p[#%d])" ++ ++debug_quota: ++ ++ 0 1 "quota_sync_dqh(%p,%d) discard inode %p" ++ 1 2 "quota_sync_dqh(%p,%d)" ++ "sync_dquots(%p,%d)" ++ "sync_dquots_dqh(%p,%d)" ++ 3 8 "do_quotactl(%p,%d,cmd=%d,id=%d,%p)" ++ ++debug_switch: ++ ++ 0 1 "vc: VCMD_%02d_%d[%d], %d,%p [%d,%d,%x,%x]" ++ 1 2 "vc: VCMD_%02d_%d[%d] = %08lx(%ld) [%d,%d]" ++ 4 10 "%s: (%s %s) returned %s with %d" ++ ++debug_tag: ++ ++ 7 80 "dx_parse_tag(»%s«): %d:#%d" ++ "dx_propagate_tag(%p[#%lu.%d]): %d,%d" ++ ++debug_xid: ++ ++ 0 1 "__lookup_vx_info(#%u): %p[#%u]" ++ "alloc_vx_info(%d) = %p" ++ "alloc_vx_info(%d)*" ++ "create_vx_info(%d) (dynamic rejected)" ++ "create_vx_info(%d) = %p (already there)" ++ "create_vx_info(%d) = %p (new)" ++ "dealloc_vx_info(%p)" ++ "loc_vx_info(%d) = %p (found)" ++ "loc_vx_info(%d) = %p (new)" ++ "loc_vx_info(%d) = %p (not available)" ++ 1 2 "create_vx_info(%d)*" ++ "loc_vx_info(%d)*" ++ 2 4 "get_vx_info(%p[#%d.%d])" ++ "put_vx_info(%p[#%d.%d])" ++ 3 8 "claim_vx_info(%p[#%d.%d.%d]) %p" ++ "clr_vx_info(%p[#%d.%d])" ++ "init_vx_info(%p[#%d.%d])" ++ "release_vx_info(%p[#%d.%d.%d]) %p" ++ "set_vx_info(%p[#%d.%d])" ++ 4 10 "__hash_vx_info: %p[#%d]" ++ "__unhash_vx_info: %p[#%d.%d.%d]" ++ "__vx_dynamic_id: [#%d]" ++ 5 20 "enter_vx_info(%p[#%d],%p) %p[#%d,%p]" ++ "leave_vx_info(%p[#%d,%p]) %p[#%d,%p]" ++ "moved task %p into vxi:%p[#%d]" ++ "task_get_vx_info(%p)" ++ "vx_migrate_task(%p,%p[#%d.%d])" ++ 6 40 "vx_clear_persistent(%p[#%d])" ++ "vx_exit_init(%p[#%d],%p[#%d,%d,%d])" ++ "vx_set_init(%p[#%d],%p[#%d,%d,%d])" ++ "vx_set_persistent(%p[#%d])" ++ "vx_set_reaper(%p[#%d],%p[#%d,%d])" ++ 7 80 "vx_child_reaper(%p[#%u,%u]) = %p[#%u,%u]" ++ ++ ++debug_limit: ++ ++ n 2^n "vx_acc_cres[%5d,%s,%2d]: %5d%s" ++ "vx_cres_avail[%5d,%s,%2d]: %5ld > %5d + %5d" ++ ++ m 2^m "vx_acc_page[%5d,%s,%2d]: %5d%s" ++ "vx_acc_pages[%5d,%s,%2d]: %5d += %5d" ++ "vx_pages_avail[%5d,%s,%2d]: %5ld > %5d + %5d" +diff -NurpP --minimal linux-3.2.34/arch/alpha/Kconfig linux-3.2.34-vs2.3.2.15/arch/alpha/Kconfig +--- linux-3.2.34/arch/alpha/Kconfig 2012-01-09 16:13:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/alpha/Kconfig 2011-12-05 19:33:02.000000000 +0100 +@@ -665,6 +665,8 @@ config DUMMY_CONSOLE + depends on VGA_HOSE + default y + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.2.34/arch/alpha/kernel/entry.S linux-3.2.34-vs2.3.2.15/arch/alpha/kernel/entry.S +--- linux-3.2.34/arch/alpha/kernel/entry.S 2010-10-21 13:06:45.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/arch/alpha/kernel/entry.S 2011-12-05 19:33:02.000000000 +0100 +@@ -860,24 +860,15 @@ sys_getxgid: + .globl sys_getxpid + .ent sys_getxpid + sys_getxpid: ++ lda $sp, -16($sp) ++ stq $26, 0($sp) + .prologue 0 +- ldq $2, TI_TASK($8) + +- /* See linux/kernel/timer.c sys_getppid for discussion +- about this loop. 
*/ +- ldq $3, TASK_GROUP_LEADER($2) +- ldq $4, TASK_REAL_PARENT($3) +- ldl $0, TASK_TGID($2) +-1: ldl $1, TASK_TGID($4) +-#ifdef CONFIG_SMP +- mov $4, $5 +- mb +- ldq $3, TASK_GROUP_LEADER($2) +- ldq $4, TASK_REAL_PARENT($3) +- cmpeq $4, $5, $5 +- beq $5, 1b +-#endif +- stq $1, 80($sp) ++ lda $16, 96($sp) ++ jsr $26, do_getxpid ++ ldq $26, 0($sp) ++ ++ lda $sp, 16($sp) + ret + .end sys_getxpid + +diff -NurpP --minimal linux-3.2.34/arch/alpha/kernel/ptrace.c linux-3.2.34-vs2.3.2.15/arch/alpha/kernel/ptrace.c +--- linux-3.2.34/arch/alpha/kernel/ptrace.c 2011-01-05 21:48:40.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/alpha/kernel/ptrace.c 2011-12-05 19:33:02.000000000 +0100 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + + #include + #include +diff -NurpP --minimal linux-3.2.34/arch/alpha/kernel/systbls.S linux-3.2.34-vs2.3.2.15/arch/alpha/kernel/systbls.S +--- linux-3.2.34/arch/alpha/kernel/systbls.S 2012-01-09 16:13:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/alpha/kernel/systbls.S 2011-12-05 19:33:02.000000000 +0100 +@@ -446,7 +446,7 @@ sys_call_table: + .quad sys_stat64 /* 425 */ + .quad sys_lstat64 + .quad sys_fstat64 +- .quad sys_ni_syscall /* sys_vserver */ ++ .quad sys_vserver /* sys_vserver */ + .quad sys_ni_syscall /* sys_mbind */ + .quad sys_ni_syscall /* sys_get_mempolicy */ + .quad sys_ni_syscall /* sys_set_mempolicy */ +diff -NurpP --minimal linux-3.2.34/arch/alpha/kernel/traps.c linux-3.2.34-vs2.3.2.15/arch/alpha/kernel/traps.c +--- linux-3.2.34/arch/alpha/kernel/traps.c 2010-10-21 13:06:46.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/arch/alpha/kernel/traps.c 2011-12-05 19:33:02.000000000 +0100 +@@ -183,7 +183,8 @@ die_if_kernel(char * str, struct pt_regs + #ifdef CONFIG_SMP + printk("CPU %d ", hard_smp_processor_id()); + #endif +- printk("%s(%d): %s %ld\n", current->comm, task_pid_nr(current), str, err); ++ printk("%s(%d[#%u]): %s %ld\n", current->comm, ++ task_pid_nr(current), current->xid, str, err); + dik_show_regs(regs, r9_15); + add_taint(TAINT_DIE); + dik_show_trace((unsigned long *)(regs+1)); +diff -NurpP --minimal linux-3.2.34/arch/arm/Kconfig linux-3.2.34-vs2.3.2.15/arch/arm/Kconfig +--- linux-3.2.34/arch/arm/Kconfig 2012-11-18 18:42:07.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/arm/Kconfig 2012-10-22 12:59:45.000000000 +0200 +@@ -2252,6 +2252,8 @@ source "fs/Kconfig" + + source "arch/arm/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.2.34/arch/arm/kernel/calls.S linux-3.2.34-vs2.3.2.15/arch/arm/kernel/calls.S +--- linux-3.2.34/arch/arm/kernel/calls.S 2012-01-09 16:13:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/arm/kernel/calls.S 2011-12-05 19:33:02.000000000 +0100 +@@ -322,7 +322,7 @@ + /* 310 */ CALL(sys_request_key) + CALL(sys_keyctl) + CALL(ABI(sys_semtimedop, sys_oabi_semtimedop)) +-/* vserver */ CALL(sys_ni_syscall) ++ CALL(sys_vserver) + CALL(sys_ioprio_set) + /* 315 */ CALL(sys_ioprio_get) + CALL(sys_inotify_init) +diff -NurpP --minimal linux-3.2.34/arch/arm/kernel/process.c linux-3.2.34-vs2.3.2.15/arch/arm/kernel/process.c +--- linux-3.2.34/arch/arm/kernel/process.c 2012-11-18 18:42:07.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/arm/kernel/process.c 2012-08-13 12:40:51.000000000 +0200 +@@ -322,7 +322,8 @@ void __show_regs(struct pt_regs *regs) + void show_regs(struct pt_regs * regs) + { + printk("\n"); +- printk("Pid: %d, comm: %20s\n", task_pid_nr(current), current->comm); ++ printk("Pid: %d[#%u], comm: %20s\n", 
++ task_pid_nr(current), current->xid, current->comm); + __show_regs(regs); + dump_stack(); + } +diff -NurpP --minimal linux-3.2.34/arch/arm/kernel/traps.c linux-3.2.34-vs2.3.2.15/arch/arm/kernel/traps.c +--- linux-3.2.34/arch/arm/kernel/traps.c 2012-11-18 18:42:07.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/arm/kernel/traps.c 2012-10-22 12:59:46.000000000 +0200 +@@ -244,8 +244,8 @@ static int __die(const char *str, int er + + print_modules(); + __show_regs(regs); +- printk(KERN_EMERG "Process %.*s (pid: %d, stack limit = 0x%p)\n", +- TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1); ++ printk(KERN_EMERG "Process %.*s (pid: %d:#%u, stack limit = 0x%p)\n", ++ TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), tsk->xid, thread + 1); + + if (!user_mode(regs) || in_interrupt()) { + dump_mem(KERN_EMERG, "Stack: ", regs->ARM_sp, +diff -NurpP --minimal linux-3.2.34/arch/cris/Kconfig linux-3.2.34-vs2.3.2.15/arch/cris/Kconfig +--- linux-3.2.34/arch/cris/Kconfig 2012-01-09 16:14:01.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/cris/Kconfig 2011-12-05 19:33:02.000000000 +0100 +@@ -678,6 +678,8 @@ source "drivers/staging/Kconfig" + + source "arch/cris/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.2.34/arch/frv/kernel/kernel_thread.S linux-3.2.34-vs2.3.2.15/arch/frv/kernel/kernel_thread.S +--- linux-3.2.34/arch/frv/kernel/kernel_thread.S 2008-12-25 00:26:37.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/frv/kernel/kernel_thread.S 2011-12-05 19:33:02.000000000 +0100 +@@ -37,7 +37,7 @@ kernel_thread: + + # start by forking the current process, but with shared VM + setlos.p #__NR_clone,gr7 ; syscall number +- ori gr10,#CLONE_VM,gr8 ; first syscall arg [clone_flags] ++ ori gr10,#CLONE_KT,gr8 ; first syscall arg [clone_flags] + sethi.p #0xe4e4,gr9 ; second syscall arg [newsp] + setlo #0xe4e4,gr9 + setlos.p #0,gr10 ; third syscall arg [parent_tidptr] +diff -NurpP --minimal linux-3.2.34/arch/h8300/Kconfig linux-3.2.34-vs2.3.2.15/arch/h8300/Kconfig +--- linux-3.2.34/arch/h8300/Kconfig 2012-01-09 16:14:01.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/h8300/Kconfig 2011-12-05 19:33:02.000000000 +0100 +@@ -213,6 +213,8 @@ source "fs/Kconfig" + + source "arch/h8300/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.2.34/arch/ia64/Kconfig linux-3.2.34-vs2.3.2.15/arch/ia64/Kconfig +--- linux-3.2.34/arch/ia64/Kconfig 2012-01-09 16:14:01.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/ia64/Kconfig 2011-12-05 19:33:02.000000000 +0100 +@@ -657,6 +657,8 @@ source "fs/Kconfig" + + source "arch/ia64/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.2.34/arch/ia64/kernel/entry.S linux-3.2.34-vs2.3.2.15/arch/ia64/kernel/entry.S +--- linux-3.2.34/arch/ia64/kernel/entry.S 2012-11-18 18:42:08.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/ia64/kernel/entry.S 2012-05-21 18:54:41.000000000 +0200 +@@ -1714,7 +1714,7 @@ sys_call_table: + data8 sys_mq_notify + data8 sys_mq_getsetattr + data8 sys_kexec_load +- data8 sys_ni_syscall // reserved for vserver ++ data8 sys_vserver + data8 sys_waitid // 1270 + data8 sys_add_key + data8 sys_request_key +diff -NurpP --minimal linux-3.2.34/arch/ia64/kernel/process.c linux-3.2.34-vs2.3.2.15/arch/ia64/kernel/process.c +--- linux-3.2.34/arch/ia64/kernel/process.c 2011-03-15 18:06:39.000000000 +0100 ++++ 
linux-3.2.34-vs2.3.2.15/arch/ia64/kernel/process.c 2011-12-05 19:33:02.000000000 +0100 +@@ -109,8 +109,8 @@ show_regs (struct pt_regs *regs) + unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; + + print_modules(); +- printk("\nPid: %d, CPU %d, comm: %20s\n", task_pid_nr(current), +- smp_processor_id(), current->comm); ++ printk("\nPid: %d[#%u], CPU %d, comm: %20s\n", task_pid_nr(current), ++ current->xid, smp_processor_id(), current->comm); + printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s (%s)\n", + regs->cr_ipsr, regs->cr_ifs, ip, print_tainted(), + init_utsname()->release); +diff -NurpP --minimal linux-3.2.34/arch/ia64/kernel/ptrace.c linux-3.2.34-vs2.3.2.15/arch/ia64/kernel/ptrace.c +--- linux-3.2.34/arch/ia64/kernel/ptrace.c 2011-01-05 21:48:59.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/ia64/kernel/ptrace.c 2011-12-05 19:33:02.000000000 +0100 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + #include + #include +diff -NurpP --minimal linux-3.2.34/arch/ia64/kernel/traps.c linux-3.2.34-vs2.3.2.15/arch/ia64/kernel/traps.c +--- linux-3.2.34/arch/ia64/kernel/traps.c 2010-07-07 18:31:01.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/arch/ia64/kernel/traps.c 2011-12-05 19:33:02.000000000 +0100 +@@ -59,8 +59,9 @@ die (const char *str, struct pt_regs *re + put_cpu(); + + if (++die.lock_owner_depth < 3) { +- printk("%s[%d]: %s %ld [%d]\n", +- current->comm, task_pid_nr(current), str, err, ++die_counter); ++ printk("%s[%d[#%u]]: %s %ld [%d]\n", ++ current->comm, task_pid_nr(current), current->xid, ++ str, err, ++die_counter); + if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) + != NOTIFY_STOP) + show_regs(regs); +@@ -323,8 +324,9 @@ handle_fpu_swa (int fp_fault, struct pt_ + if ((last.count & 15) < 5 && (ia64_fetchadd(1, &last.count, acq) & 15) < 5) { + last.time = current_jiffies + 5 * HZ; + printk(KERN_WARNING +- "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n", +- current->comm, task_pid_nr(current), regs->cr_iip + ia64_psr(regs)->ri, isr); ++ "%s(%d[#%u]): floating-point assist fault at ip %016lx, isr %016lx\n", ++ current->comm, task_pid_nr(current), current->xid, ++ regs->cr_iip + ia64_psr(regs)->ri, isr); + } + } + } +diff -NurpP --minimal linux-3.2.34/arch/m32r/kernel/traps.c linux-3.2.34-vs2.3.2.15/arch/m32r/kernel/traps.c +--- linux-3.2.34/arch/m32r/kernel/traps.c 2011-10-24 18:44:58.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/arch/m32r/kernel/traps.c 2011-12-05 19:33:02.000000000 +0100 +@@ -196,8 +196,9 @@ static void show_registers(struct pt_reg + } else { + printk("SPI: %08lx\n", sp); + } +- printk("Process %s (pid: %d, process nr: %d, stackpage=%08lx)", +- current->comm, task_pid_nr(current), 0xffff & i, 4096+(unsigned long)current); ++ printk("Process %s (pid: %d[#%u], process nr: %d, stackpage=%08lx)", ++ current->comm, task_pid_nr(current), current->xid, ++ 0xffff & i, 4096+(unsigned long)current); + + /* + * When in-kernel, we also print out the stack and code at the +diff -NurpP --minimal linux-3.2.34/arch/m68k/Kconfig linux-3.2.34-vs2.3.2.15/arch/m68k/Kconfig +--- linux-3.2.34/arch/m68k/Kconfig 2012-01-09 16:14:03.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/m68k/Kconfig 2011-12-05 19:33:02.000000000 +0100 +@@ -135,6 +135,8 @@ source "fs/Kconfig" + + source "arch/m68k/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.2.34/arch/mips/Kconfig linux-3.2.34-vs2.3.2.15/arch/mips/Kconfig +--- linux-3.2.34/arch/mips/Kconfig 2012-01-09 
16:14:04.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/mips/Kconfig 2011-12-05 19:33:02.000000000 +0100 +@@ -2478,6 +2478,8 @@ source "fs/Kconfig" + + source "arch/mips/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.2.34/arch/mips/kernel/ptrace.c linux-3.2.34-vs2.3.2.15/arch/mips/kernel/ptrace.c +--- linux-3.2.34/arch/mips/kernel/ptrace.c 2011-07-22 11:17:36.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/arch/mips/kernel/ptrace.c 2011-12-05 19:33:02.000000000 +0100 +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -263,6 +264,9 @@ long arch_ptrace(struct task_struct *chi + void __user *datavp = (void __user *) data; + unsigned long __user *datalp = (void __user *) data; + ++ if (!vx_check(vx_task_xid(child), VS_WATCH_P | VS_IDENT)) ++ goto out; ++ + switch (request) { + /* when I and D space are separate, these will need to be fixed. */ + case PTRACE_PEEKTEXT: /* read word at location addr. */ +diff -NurpP --minimal linux-3.2.34/arch/mips/kernel/scall32-o32.S linux-3.2.34-vs2.3.2.15/arch/mips/kernel/scall32-o32.S +--- linux-3.2.34/arch/mips/kernel/scall32-o32.S 2012-01-09 16:14:05.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/mips/kernel/scall32-o32.S 2011-12-05 19:33:02.000000000 +0100 +@@ -523,7 +523,7 @@ einval: li v0, -ENOSYS + sys sys_mq_timedreceive 5 + sys sys_mq_notify 2 /* 4275 */ + sys sys_mq_getsetattr 3 +- sys sys_ni_syscall 0 /* sys_vserver */ ++ sys sys_vserver 3 + sys sys_waitid 5 + sys sys_ni_syscall 0 /* available, was setaltroot */ + sys sys_add_key 5 /* 4280 */ +diff -NurpP --minimal linux-3.2.34/arch/mips/kernel/scall64-64.S linux-3.2.34-vs2.3.2.15/arch/mips/kernel/scall64-64.S +--- linux-3.2.34/arch/mips/kernel/scall64-64.S 2012-01-09 16:14:05.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/mips/kernel/scall64-64.S 2011-12-05 19:33:02.000000000 +0100 +@@ -362,7 +362,7 @@ sys_call_table: + PTR sys_mq_timedreceive + PTR sys_mq_notify + PTR sys_mq_getsetattr /* 5235 */ +- PTR sys_ni_syscall /* sys_vserver */ ++ PTR sys_vserver + PTR sys_waitid + PTR sys_ni_syscall /* available, was setaltroot */ + PTR sys_add_key +diff -NurpP --minimal linux-3.2.34/arch/mips/kernel/scall64-n32.S linux-3.2.34-vs2.3.2.15/arch/mips/kernel/scall64-n32.S +--- linux-3.2.34/arch/mips/kernel/scall64-n32.S 2012-01-09 16:14:05.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/mips/kernel/scall64-n32.S 2011-12-05 19:33:02.000000000 +0100 +@@ -361,7 +361,7 @@ EXPORT(sysn32_call_table) + PTR compat_sys_mq_timedreceive + PTR compat_sys_mq_notify + PTR compat_sys_mq_getsetattr +- PTR sys_ni_syscall /* 6240, sys_vserver */ ++ PTR sys32_vserver /* 6240 */ + PTR compat_sys_waitid + PTR sys_ni_syscall /* available, was setaltroot */ + PTR sys_add_key +diff -NurpP --minimal linux-3.2.34/arch/mips/kernel/scall64-o32.S linux-3.2.34-vs2.3.2.15/arch/mips/kernel/scall64-o32.S +--- linux-3.2.34/arch/mips/kernel/scall64-o32.S 2012-01-09 16:14:05.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/mips/kernel/scall64-o32.S 2011-12-05 19:33:02.000000000 +0100 +@@ -480,7 +480,7 @@ sys_call_table: + PTR compat_sys_mq_timedreceive + PTR compat_sys_mq_notify /* 4275 */ + PTR compat_sys_mq_getsetattr +- PTR sys_ni_syscall /* sys_vserver */ ++ PTR sys32_vserver + PTR sys_32_waitid + PTR sys_ni_syscall /* available, was setaltroot */ + PTR sys_add_key /* 4280 */ +diff -NurpP --minimal linux-3.2.34/arch/mips/kernel/traps.c linux-3.2.34-vs2.3.2.15/arch/mips/kernel/traps.c +--- 
linux-3.2.34/arch/mips/kernel/traps.c 2012-01-09 16:14:05.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/mips/kernel/traps.c 2011-12-05 19:33:02.000000000 +0100 +@@ -343,9 +343,10 @@ void show_registers(struct pt_regs *regs + + __show_regs(regs); + print_modules(); +- printk("Process %s (pid: %d, threadinfo=%p, task=%p, tls=%0*lx)\n", +- current->comm, current->pid, current_thread_info(), current, +- field, current_thread_info()->tp_value); ++ printk("Process %s (pid: %d:#%u, threadinfo=%p, task=%p, tls=%0*lx)\n", ++ current->comm, task_pid_nr(current), current->xid, ++ current_thread_info(), current, ++ field, current_thread_info()->tp_value); + if (cpu_has_userlocal) { + unsigned long tls; + +diff -NurpP --minimal linux-3.2.34/arch/parisc/Kconfig linux-3.2.34-vs2.3.2.15/arch/parisc/Kconfig +--- linux-3.2.34/arch/parisc/Kconfig 2012-01-09 16:14:05.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/parisc/Kconfig 2011-12-05 19:33:02.000000000 +0100 +@@ -278,6 +278,8 @@ source "fs/Kconfig" + + source "arch/parisc/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.2.34/arch/parisc/kernel/syscall_table.S linux-3.2.34-vs2.3.2.15/arch/parisc/kernel/syscall_table.S +--- linux-3.2.34/arch/parisc/kernel/syscall_table.S 2011-10-24 18:45:00.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/arch/parisc/kernel/syscall_table.S 2011-12-05 19:33:02.000000000 +0100 +@@ -361,7 +361,7 @@ + ENTRY_COMP(mbind) /* 260 */ + ENTRY_COMP(get_mempolicy) + ENTRY_COMP(set_mempolicy) +- ENTRY_SAME(ni_syscall) /* 263: reserved for vserver */ ++ ENTRY_DIFF(vserver) + ENTRY_SAME(add_key) + ENTRY_SAME(request_key) /* 265 */ + ENTRY_SAME(keyctl) +diff -NurpP --minimal linux-3.2.34/arch/parisc/kernel/traps.c linux-3.2.34-vs2.3.2.15/arch/parisc/kernel/traps.c +--- linux-3.2.34/arch/parisc/kernel/traps.c 2011-10-24 18:45:00.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/arch/parisc/kernel/traps.c 2011-12-05 19:33:02.000000000 +0100 +@@ -236,8 +236,9 @@ void die_if_kernel(char *str, struct pt_ + if (err == 0) + return; /* STFU */ + +- printk(KERN_CRIT "%s (pid %d): %s (code %ld) at " RFMT "\n", +- current->comm, task_pid_nr(current), str, err, regs->iaoq[0]); ++ printk(KERN_CRIT "%s (pid %d:#%u): %s (code %ld) at " RFMT "\n", ++ current->comm, task_pid_nr(current), current->xid, ++ str, err, regs->iaoq[0]); + #ifdef PRINT_USER_FAULTS + /* XXX for debugging only */ + show_regs(regs); +@@ -270,8 +271,8 @@ void die_if_kernel(char *str, struct pt_ + pdc_console_restart(); + + if (err) +- printk(KERN_CRIT "%s (pid %d): %s (code %ld)\n", +- current->comm, task_pid_nr(current), str, err); ++ printk(KERN_CRIT "%s (pid %d:#%u): %s (code %ld)\n", ++ current->comm, task_pid_nr(current), current->xid, str, err); + + /* Wot's wrong wif bein' racy? 
*/ + if (current->thread.flags & PARISC_KERNEL_DEATH) { +diff -NurpP --minimal linux-3.2.34/arch/parisc/mm/fault.c linux-3.2.34-vs2.3.2.15/arch/parisc/mm/fault.c +--- linux-3.2.34/arch/parisc/mm/fault.c 2010-08-02 16:52:06.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/arch/parisc/mm/fault.c 2011-12-05 19:33:02.000000000 +0100 +@@ -237,8 +237,9 @@ bad_area: + + #ifdef PRINT_USER_FAULTS + printk(KERN_DEBUG "\n"); +- printk(KERN_DEBUG "do_page_fault() pid=%d command='%s' type=%lu address=0x%08lx\n", +- task_pid_nr(tsk), tsk->comm, code, address); ++ printk(KERN_DEBUG "do_page_fault() pid=%d:#%u " ++ "command='%s' type=%lu address=0x%08lx\n", ++ task_pid_nr(tsk), tsk->xid, tsk->comm, code, address); + if (vma) { + printk(KERN_DEBUG "vm_start = 0x%08lx, vm_end = 0x%08lx\n", + vma->vm_start, vma->vm_end); +diff -NurpP --minimal linux-3.2.34/arch/powerpc/Kconfig linux-3.2.34-vs2.3.2.15/arch/powerpc/Kconfig +--- linux-3.2.34/arch/powerpc/Kconfig 2012-01-09 16:14:05.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/powerpc/Kconfig 2011-12-05 19:33:02.000000000 +0100 +@@ -960,6 +960,8 @@ source "lib/Kconfig" + + source "arch/powerpc/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + config KEYS_COMPAT +diff -NurpP --minimal linux-3.2.34/arch/powerpc/include/asm/unistd.h linux-3.2.34-vs2.3.2.15/arch/powerpc/include/asm/unistd.h +--- linux-3.2.34/arch/powerpc/include/asm/unistd.h 2012-01-09 16:14:05.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/powerpc/include/asm/unistd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -275,7 +275,7 @@ + #endif + #define __NR_rtas 255 + #define __NR_sys_debug_setcontext 256 +-/* Number 257 is reserved for vserver */ ++#define __NR_vserver 257 + #define __NR_migrate_pages 258 + #define __NR_mbind 259 + #define __NR_get_mempolicy 260 +diff -NurpP --minimal linux-3.2.34/arch/powerpc/kernel/process.c linux-3.2.34-vs2.3.2.15/arch/powerpc/kernel/process.c +--- linux-3.2.34/arch/powerpc/kernel/process.c 2012-11-18 18:42:08.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/powerpc/kernel/process.c 2012-10-22 12:59:46.000000000 +0200 +@@ -640,8 +640,9 @@ void show_regs(struct pt_regs * regs) + #else + printk("DAR: "REG", DSISR: %08lx\n", regs->dar, regs->dsisr); + #endif +- printk("TASK = %p[%d] '%s' THREAD: %p", +- current, task_pid_nr(current), current->comm, task_thread_info(current)); ++ printk("TASK = %p[%d,#%u] '%s' THREAD: %p", ++ current, task_pid_nr(current), current->xid, ++ current->comm, task_thread_info(current)); + + #ifdef CONFIG_SMP + printk(" CPU: %d", raw_smp_processor_id()); +diff -NurpP --minimal linux-3.2.34/arch/powerpc/kernel/traps.c linux-3.2.34-vs2.3.2.15/arch/powerpc/kernel/traps.c +--- linux-3.2.34/arch/powerpc/kernel/traps.c 2012-11-18 18:42:08.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/powerpc/kernel/traps.c 2012-10-22 12:59:46.000000000 +0200 +@@ -1083,8 +1083,9 @@ void nonrecoverable_exception(struct pt_ + + void trace_syscall(struct pt_regs *regs) + { +- printk("Task: %p(%d), PC: %08lX/%08lX, Syscall: %3ld, Result: %s%ld %s\n", +- current, task_pid_nr(current), regs->nip, regs->link, regs->gpr[0], ++ printk("Task: %p(%d[#%u]), PC: %08lX/%08lX, Syscall: %3ld, Result: %s%ld %s\n", ++ current, task_pid_nr(current), current->xid, ++ regs->nip, regs->link, regs->gpr[0], + regs->ccr&0x10000000?"Error=":"", regs->gpr[3], print_tainted()); + } + +diff -NurpP --minimal linux-3.2.34/arch/s390/Kconfig linux-3.2.34-vs2.3.2.15/arch/s390/Kconfig +--- linux-3.2.34/arch/s390/Kconfig 2012-11-18 18:42:08.000000000 +0100 ++++ 
linux-3.2.34-vs2.3.2.15/arch/s390/Kconfig 2012-04-24 16:50:48.000000000 +0200 +@@ -643,6 +643,8 @@ source "fs/Kconfig" + + source "arch/s390/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.2.34/arch/s390/include/asm/tlb.h linux-3.2.34-vs2.3.2.15/arch/s390/include/asm/tlb.h +--- linux-3.2.34/arch/s390/include/asm/tlb.h 2012-11-18 18:42:08.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/s390/include/asm/tlb.h 2012-04-24 16:50:48.000000000 +0200 +@@ -24,6 +24,7 @@ + #include + #include + #include ++ + #include + #include + #include +diff -NurpP --minimal linux-3.2.34/arch/s390/include/asm/unistd.h linux-3.2.34-vs2.3.2.15/arch/s390/include/asm/unistd.h +--- linux-3.2.34/arch/s390/include/asm/unistd.h 2012-01-09 16:14:06.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/s390/include/asm/unistd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -202,7 +202,7 @@ + #define __NR_clock_gettime (__NR_timer_create+6) + #define __NR_clock_getres (__NR_timer_create+7) + #define __NR_clock_nanosleep (__NR_timer_create+8) +-/* Number 263 is reserved for vserver */ ++#define __NR_vserver 263 + #define __NR_statfs64 265 + #define __NR_fstatfs64 266 + #define __NR_remap_file_pages 267 +diff -NurpP --minimal linux-3.2.34/arch/s390/kernel/ptrace.c linux-3.2.34-vs2.3.2.15/arch/s390/kernel/ptrace.c +--- linux-3.2.34/arch/s390/kernel/ptrace.c 2012-11-18 18:42:08.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/s390/kernel/ptrace.c 2012-03-14 10:25:26.000000000 +0100 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + #include + #include + #include +diff -NurpP --minimal linux-3.2.34/arch/s390/kernel/syscalls.S linux-3.2.34-vs2.3.2.15/arch/s390/kernel/syscalls.S +--- linux-3.2.34/arch/s390/kernel/syscalls.S 2012-01-09 16:14:06.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/s390/kernel/syscalls.S 2011-12-05 19:33:02.000000000 +0100 +@@ -271,7 +271,7 @@ SYSCALL(sys_clock_settime,sys_clock_sett + SYSCALL(sys_clock_gettime,sys_clock_gettime,sys32_clock_gettime_wrapper) /* 260 */ + SYSCALL(sys_clock_getres,sys_clock_getres,sys32_clock_getres_wrapper) + SYSCALL(sys_clock_nanosleep,sys_clock_nanosleep,sys32_clock_nanosleep_wrapper) +-NI_SYSCALL /* reserved for vserver */ ++SYSCALL(sys_vserver,sys_vserver,sys32_vserver) + SYSCALL(sys_s390_fadvise64_64,sys_ni_syscall,sys32_fadvise64_64_wrapper) + SYSCALL(sys_statfs64,sys_statfs64,compat_sys_statfs64_wrapper) + SYSCALL(sys_fstatfs64,sys_fstatfs64,compat_sys_fstatfs64_wrapper) +diff -NurpP --minimal linux-3.2.34/arch/sh/Kconfig linux-3.2.34-vs2.3.2.15/arch/sh/Kconfig +--- linux-3.2.34/arch/sh/Kconfig 2012-01-09 16:14:07.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/sh/Kconfig 2011-12-05 19:33:02.000000000 +0100 +@@ -901,6 +901,8 @@ source "fs/Kconfig" + + source "arch/sh/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.2.34/arch/sh/kernel/irq.c linux-3.2.34-vs2.3.2.15/arch/sh/kernel/irq.c +--- linux-3.2.34/arch/sh/kernel/irq.c 2011-07-22 11:17:41.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/arch/sh/kernel/irq.c 2011-12-05 19:33:02.000000000 +0100 +@@ -14,6 +14,7 @@ + #include + #include + #include ++// #include + #include + #include + #include +diff -NurpP --minimal linux-3.2.34/arch/sparc/Kconfig linux-3.2.34-vs2.3.2.15/arch/sparc/Kconfig +--- linux-3.2.34/arch/sparc/Kconfig 2012-11-18 18:42:08.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/sparc/Kconfig 2012-06-14 
20:45:24.000000000 +0200 +@@ -598,6 +598,8 @@ source "fs/Kconfig" + + source "arch/sparc/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.2.34/arch/sparc/include/asm/unistd.h linux-3.2.34-vs2.3.2.15/arch/sparc/include/asm/unistd.h +--- linux-3.2.34/arch/sparc/include/asm/unistd.h 2012-01-09 16:14:07.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/sparc/include/asm/unistd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -335,7 +335,7 @@ + #define __NR_timer_getoverrun 264 + #define __NR_timer_delete 265 + #define __NR_timer_create 266 +-/* #define __NR_vserver 267 Reserved for VSERVER */ ++#define __NR_vserver 267 + #define __NR_io_setup 268 + #define __NR_io_destroy 269 + #define __NR_io_submit 270 +diff -NurpP --minimal linux-3.2.34/arch/sparc/kernel/systbls_32.S linux-3.2.34-vs2.3.2.15/arch/sparc/kernel/systbls_32.S +--- linux-3.2.34/arch/sparc/kernel/systbls_32.S 2012-01-09 16:14:09.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/sparc/kernel/systbls_32.S 2011-12-05 19:33:02.000000000 +0100 +@@ -70,7 +70,7 @@ sys_call_table: + /*250*/ .long sys_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_ni_syscall + /*255*/ .long sys_sync_file_range, sys_clock_settime, sys_clock_gettime, sys_clock_getres, sys_clock_nanosleep + /*260*/ .long sys_sched_getaffinity, sys_sched_setaffinity, sys_timer_settime, sys_timer_gettime, sys_timer_getoverrun +-/*265*/ .long sys_timer_delete, sys_timer_create, sys_nis_syscall, sys_io_setup, sys_io_destroy ++/*265*/ .long sys_timer_delete, sys_timer_create, sys_vserver, sys_io_setup, sys_io_destroy + /*270*/ .long sys_io_submit, sys_io_cancel, sys_io_getevents, sys_mq_open, sys_mq_unlink + /*275*/ .long sys_mq_timedsend, sys_mq_timedreceive, sys_mq_notify, sys_mq_getsetattr, sys_waitid + /*280*/ .long sys_tee, sys_add_key, sys_request_key, sys_keyctl, sys_openat +diff -NurpP --minimal linux-3.2.34/arch/sparc/kernel/systbls_64.S linux-3.2.34-vs2.3.2.15/arch/sparc/kernel/systbls_64.S +--- linux-3.2.34/arch/sparc/kernel/systbls_64.S 2012-11-18 18:42:08.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/sparc/kernel/systbls_64.S 2012-06-14 20:45:24.000000000 +0200 +@@ -71,7 +71,7 @@ sys_call_table32: + /*250*/ .word sys_mremap, compat_sys_sysctl, sys32_getsid, sys_fdatasync, sys_nis_syscall + .word sys32_sync_file_range, compat_sys_clock_settime, compat_sys_clock_gettime, compat_sys_clock_getres, sys32_clock_nanosleep + /*260*/ .word compat_sys_sched_getaffinity, compat_sys_sched_setaffinity, sys32_timer_settime, compat_sys_timer_gettime, sys_timer_getoverrun +- .word sys_timer_delete, compat_sys_timer_create, sys_ni_syscall, compat_sys_io_setup, sys_io_destroy ++ .word sys_timer_delete, compat_sys_timer_create, sys32_vserver, compat_sys_io_setup, sys_io_destroy + /*270*/ .word sys32_io_submit, sys_io_cancel, compat_sys_io_getevents, sys32_mq_open, sys_mq_unlink + .word compat_sys_mq_timedsend, compat_sys_mq_timedreceive, compat_sys_mq_notify, compat_sys_mq_getsetattr, compat_sys_waitid + /*280*/ .word sys32_tee, sys_add_key, sys_request_key, compat_sys_keyctl, compat_sys_openat +@@ -148,7 +148,7 @@ sys_call_table: + /*250*/ .word sys_64_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nis_syscall + .word sys_sync_file_range, sys_clock_settime, sys_clock_gettime, sys_clock_getres, sys_clock_nanosleep + /*260*/ .word sys_sched_getaffinity, sys_sched_setaffinity, sys_timer_settime, sys_timer_gettime, sys_timer_getoverrun +- .word sys_timer_delete, sys_timer_create, 
sys_ni_syscall, sys_io_setup, sys_io_destroy ++ .word sys_timer_delete, sys_timer_create, sys_vserver, sys_io_setup, sys_io_destroy + /*270*/ .word sys_io_submit, sys_io_cancel, sys_io_getevents, sys_mq_open, sys_mq_unlink + .word sys_mq_timedsend, sys_mq_timedreceive, sys_mq_notify, sys_mq_getsetattr, sys_waitid + /*280*/ .word sys_tee, sys_add_key, sys_request_key, sys_keyctl, sys_openat +diff -NurpP --minimal linux-3.2.34/arch/um/Kconfig.rest linux-3.2.34-vs2.3.2.15/arch/um/Kconfig.rest +--- linux-3.2.34/arch/um/Kconfig.rest 2012-01-09 16:14:09.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/um/Kconfig.rest 2011-12-05 19:33:02.000000000 +0100 +@@ -12,6 +12,8 @@ source "arch/um/Kconfig.net" + + source "fs/Kconfig" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.2.34/arch/um/include/shared/kern_constants.h linux-3.2.34-vs2.3.2.15/arch/um/include/shared/kern_constants.h +--- linux-3.2.34/arch/um/include/shared/kern_constants.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/um/include/shared/kern_constants.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1 @@ ++#include "../../../../include/generated/asm-offsets.h" +diff -NurpP --minimal linux-3.2.34/arch/um/include/shared/user_constants.h linux-3.2.34-vs2.3.2.15/arch/um/include/shared/user_constants.h +--- linux-3.2.34/arch/um/include/shared/user_constants.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/um/include/shared/user_constants.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,40 @@ ++/* ++ * DO NOT MODIFY. ++ * ++ * This file was generated by arch/um/Makefile ++ * ++ */ ++ ++#define HOST_SC_CR2 176 /* offsetof(struct sigcontext, cr2) # */ ++#define HOST_SC_ERR 152 /* offsetof(struct sigcontext, err) # */ ++#define HOST_SC_TRAPNO 160 /* offsetof(struct sigcontext, trapno) # */ ++#define HOST_FP_SIZE 64 /* sizeof(struct _fpstate) / sizeof(unsigned long) # */ ++#define HOST_RBX 5 /* RBX # */ ++#define HOST_RCX 11 /* RCX # */ ++#define HOST_RDI 14 /* RDI # */ ++#define HOST_RSI 13 /* RSI # */ ++#define HOST_RDX 12 /* RDX # */ ++#define HOST_RBP 4 /* RBP # */ ++#define HOST_RAX 10 /* RAX # */ ++#define HOST_R8 9 /* R8 # */ ++#define HOST_R9 8 /* R9 # */ ++#define HOST_R10 7 /* R10 # */ ++#define HOST_R11 6 /* R11 # */ ++#define HOST_R12 3 /* R12 # */ ++#define HOST_R13 2 /* R13 # */ ++#define HOST_R14 1 /* R14 # */ ++#define HOST_R15 0 /* R15 # */ ++#define HOST_ORIG_RAX 15 /* ORIG_RAX # */ ++#define HOST_CS 17 /* CS # */ ++#define HOST_SS 20 /* SS # */ ++#define HOST_EFLAGS 18 /* EFLAGS # */ ++#define HOST_IP 16 /* RIP # */ ++#define HOST_SP 19 /* RSP # */ ++#define UM_FRAME_SIZE 216 /* sizeof(struct user_regs_struct) # */ ++#define UM_POLLIN 1 /* POLLIN # */ ++#define UM_POLLPRI 2 /* POLLPRI # */ ++#define UM_POLLOUT 4 /* POLLOUT # */ ++#define UM_PROT_READ 1 /* PROT_READ # */ ++#define UM_PROT_WRITE 2 /* PROT_WRITE # */ ++#define UM_PROT_EXEC 4 /* PROT_EXEC # */ ++ +diff -NurpP --minimal linux-3.2.34/arch/x86/Kconfig linux-3.2.34-vs2.3.2.15/arch/x86/Kconfig +--- linux-3.2.34/arch/x86/Kconfig 2012-01-09 16:14:10.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/x86/Kconfig 2011-12-15 01:11:29.000000000 +0100 +@@ -2170,6 +2170,8 @@ source "fs/Kconfig" + + source "arch/x86/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.2.34/arch/x86/ia32/ia32entry.S linux-3.2.34-vs2.3.2.15/arch/x86/ia32/ia32entry.S +--- 
linux-3.2.34/arch/x86/ia32/ia32entry.S 2012-01-09 16:14:10.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/x86/ia32/ia32entry.S 2011-12-05 19:33:02.000000000 +0100 +@@ -776,7 +776,7 @@ ia32_sys_call_table: + .quad sys_tgkill /* 270 */ + .quad compat_sys_utimes + .quad sys32_fadvise64_64 +- .quad quiet_ni_syscall /* sys_vserver */ ++ .quad sys32_vserver + .quad sys_mbind + .quad compat_sys_get_mempolicy /* 275 */ + .quad sys_set_mempolicy +diff -NurpP --minimal linux-3.2.34/arch/x86/include/asm/unistd_64.h linux-3.2.34-vs2.3.2.15/arch/x86/include/asm/unistd_64.h +--- linux-3.2.34/arch/x86/include/asm/unistd_64.h 2012-01-09 16:14:11.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/x86/include/asm/unistd_64.h 2011-12-05 19:33:02.000000000 +0100 +@@ -535,7 +535,7 @@ __SYSCALL(__NR_tgkill, sys_tgkill) + #define __NR_utimes 235 + __SYSCALL(__NR_utimes, sys_utimes) + #define __NR_vserver 236 +-__SYSCALL(__NR_vserver, sys_ni_syscall) ++__SYSCALL(__NR_vserver, sys_vserver) + #define __NR_mbind 237 + __SYSCALL(__NR_mbind, sys_mbind) + #define __NR_set_mempolicy 238 +diff -NurpP --minimal linux-3.2.34/arch/x86/kernel/syscall_table_32.S linux-3.2.34-vs2.3.2.15/arch/x86/kernel/syscall_table_32.S +--- linux-3.2.34/arch/x86/kernel/syscall_table_32.S 2012-01-09 16:14:11.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/arch/x86/kernel/syscall_table_32.S 2011-12-05 19:33:02.000000000 +0100 +@@ -272,7 +272,7 @@ ENTRY(sys_call_table) + .long sys_tgkill /* 270 */ + .long sys_utimes + .long sys_fadvise64_64 +- .long sys_ni_syscall /* sys_vserver */ ++ .long sys_vserver + .long sys_mbind + .long sys_get_mempolicy + .long sys_set_mempolicy +diff -NurpP --minimal linux-3.2.34/drivers/block/Kconfig linux-3.2.34-vs2.3.2.15/drivers/block/Kconfig +--- linux-3.2.34/drivers/block/Kconfig 2011-10-24 18:45:08.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/drivers/block/Kconfig 2011-12-05 19:33:02.000000000 +0100 +@@ -288,6 +288,13 @@ config BLK_DEV_CRYPTOLOOP + + source "drivers/block/drbd/Kconfig" + ++config BLK_DEV_VROOT ++ tristate "Virtual Root device support" ++ depends on QUOTACTL ++ ---help--- ++ Saying Y here will allow you to use quota/fs ioctls on a shared ++ partition within a virtual server without compromising security. 
++ + config BLK_DEV_NBD + tristate "Network block device support" + depends on NET +diff -NurpP --minimal linux-3.2.34/drivers/block/Makefile linux-3.2.34-vs2.3.2.15/drivers/block/Makefile +--- linux-3.2.34/drivers/block/Makefile 2011-07-22 11:17:44.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/drivers/block/Makefile 2011-12-05 19:33:02.000000000 +0100 +@@ -34,6 +34,7 @@ obj-$(CONFIG_VIODASD) += viodasd.o + obj-$(CONFIG_BLK_DEV_SX8) += sx8.o + obj-$(CONFIG_BLK_DEV_UB) += ub.o + obj-$(CONFIG_BLK_DEV_HD) += hd.o ++obj-$(CONFIG_BLK_DEV_VROOT) += vroot.o + + obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o + obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/ +diff -NurpP --minimal linux-3.2.34/drivers/block/loop.c linux-3.2.34-vs2.3.2.15/drivers/block/loop.c +--- linux-3.2.34/drivers/block/loop.c 2012-01-09 16:14:15.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/drivers/block/loop.c 2012-01-09 16:19:31.000000000 +0100 +@@ -77,6 +77,7 @@ + #include + #include + #include ++#include + + #include + +@@ -868,6 +869,7 @@ static int loop_set_fd(struct loop_devic + lo->lo_blocksize = lo_blocksize; + lo->lo_device = bdev; + lo->lo_flags = lo_flags; ++ lo->lo_xid = vx_current_xid(); + lo->lo_backing_file = file; + lo->transfer = transfer_none; + lo->ioctl = NULL; +@@ -1000,6 +1002,7 @@ static int loop_clr_fd(struct loop_devic + lo->lo_sizelimit = 0; + lo->lo_encrypt_key_size = 0; + lo->lo_thread = NULL; ++ lo->lo_xid = 0; + memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); + memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); + memset(lo->lo_file_name, 0, LO_NAME_SIZE); +@@ -1041,7 +1044,7 @@ loop_set_status(struct loop_device *lo, + + if (lo->lo_encrypt_key_size && + lo->lo_key_owner != uid && +- !capable(CAP_SYS_ADMIN)) ++ !vx_capable(CAP_SYS_ADMIN, VXC_ADMIN_CLOOP)) + return -EPERM; + if (lo->lo_state != Lo_bound) + return -ENXIO; +@@ -1131,7 +1134,8 @@ loop_get_status(struct loop_device *lo, + memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE); + info->lo_encrypt_type = + lo->lo_encryption ? lo->lo_encryption->number : 0; +- if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) { ++ if (lo->lo_encrypt_key_size && ++ vx_capable(CAP_SYS_ADMIN, VXC_ADMIN_CLOOP)) { + info->lo_encrypt_key_size = lo->lo_encrypt_key_size; + memcpy(info->lo_encrypt_key, lo->lo_encrypt_key, + lo->lo_encrypt_key_size); +@@ -1491,6 +1495,11 @@ static int lo_open(struct block_device * + goto out; + } + ++ if (!vx_check(lo->lo_xid, VS_IDENT|VS_HOSTID|VS_ADMIN_P)) { ++ err = -EACCES; ++ goto out; ++ } ++ + mutex_lock(&lo->lo_ctl_mutex); + lo->lo_refcnt++; + mutex_unlock(&lo->lo_ctl_mutex); +diff -NurpP --minimal linux-3.2.34/drivers/block/vroot.c linux-3.2.34-vs2.3.2.15/drivers/block/vroot.c +--- linux-3.2.34/drivers/block/vroot.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/drivers/block/vroot.c 2011-12-07 00:05:16.000000000 +0100 +@@ -0,0 +1,291 @@ ++/* ++ * linux/drivers/block/vroot.c ++ * ++ * written by Herbert Pötzl, 9/11/2002 ++ * ported to 2.6.10 by Herbert Pötzl, 30/12/2004 ++ * ++ * based on the loop.c code by Theodore Ts'o. ++ * ++ * Copyright (C) 2002-2007 by Herbert Pötzl. ++ * Redistribution of this file is permitted under the ++ * GNU General Public License. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++ ++static int max_vroot = 8; ++ ++static struct vroot_device *vroot_dev; ++static struct gendisk **disks; ++ ++ ++static int vroot_set_dev( ++ struct vroot_device *vr, ++ struct block_device *bdev, ++ unsigned int arg) ++{ ++ struct block_device *real_bdev; ++ struct file *file; ++ struct inode *inode; ++ int error; ++ ++ error = -EBUSY; ++ if (vr->vr_state != Vr_unbound) ++ goto out; ++ ++ error = -EBADF; ++ file = fget(arg); ++ if (!file) ++ goto out; ++ ++ error = -EINVAL; ++ inode = file->f_dentry->d_inode; ++ ++ ++ if (S_ISBLK(inode->i_mode)) { ++ real_bdev = inode->i_bdev; ++ vr->vr_device = real_bdev; ++ __iget(real_bdev->bd_inode); ++ } else ++ goto out_fput; ++ ++ vxdprintk(VXD_CBIT(misc, 0), ++ "vroot[%d]_set_dev: dev=" VXF_DEV, ++ vr->vr_number, VXD_DEV(real_bdev)); ++ ++ vr->vr_state = Vr_bound; ++ error = 0; ++ ++ out_fput: ++ fput(file); ++ out: ++ return error; ++} ++ ++static int vroot_clr_dev( ++ struct vroot_device *vr, ++ struct block_device *bdev) ++{ ++ struct block_device *real_bdev; ++ ++ if (vr->vr_state != Vr_bound) ++ return -ENXIO; ++ if (vr->vr_refcnt > 1) /* we needed one fd for the ioctl */ ++ return -EBUSY; ++ ++ real_bdev = vr->vr_device; ++ ++ vxdprintk(VXD_CBIT(misc, 0), ++ "vroot[%d]_clr_dev: dev=" VXF_DEV, ++ vr->vr_number, VXD_DEV(real_bdev)); ++ ++ bdput(real_bdev); ++ vr->vr_state = Vr_unbound; ++ vr->vr_device = NULL; ++ return 0; ++} ++ ++ ++static int vr_ioctl(struct block_device *bdev, fmode_t mode, ++ unsigned int cmd, unsigned long arg) ++{ ++ struct vroot_device *vr = bdev->bd_disk->private_data; ++ int err; ++ ++ down(&vr->vr_ctl_mutex); ++ switch (cmd) { ++ case VROOT_SET_DEV: ++ err = vroot_set_dev(vr, bdev, arg); ++ break; ++ case VROOT_CLR_DEV: ++ err = vroot_clr_dev(vr, bdev); ++ break; ++ default: ++ err = -EINVAL; ++ break; ++ } ++ up(&vr->vr_ctl_mutex); ++ return err; ++} ++ ++static int vr_open(struct block_device *bdev, fmode_t mode) ++{ ++ struct vroot_device *vr = bdev->bd_disk->private_data; ++ ++ down(&vr->vr_ctl_mutex); ++ vr->vr_refcnt++; ++ up(&vr->vr_ctl_mutex); ++ return 0; ++} ++ ++static int vr_release(struct gendisk *disk, fmode_t mode) ++{ ++ struct vroot_device *vr = disk->private_data; ++ ++ down(&vr->vr_ctl_mutex); ++ --vr->vr_refcnt; ++ up(&vr->vr_ctl_mutex); ++ return 0; ++} ++ ++static struct block_device_operations vr_fops = { ++ .owner = THIS_MODULE, ++ .open = vr_open, ++ .release = vr_release, ++ .ioctl = vr_ioctl, ++}; ++ ++static void vroot_make_request(struct request_queue *q, struct bio *bio) ++{ ++ printk("vroot_make_request %p, %p\n", q, bio); ++ bio_io_error(bio); ++} ++ ++struct block_device *__vroot_get_real_bdev(struct block_device *bdev) ++{ ++ struct inode *inode = bdev->bd_inode; ++ struct vroot_device *vr; ++ struct block_device *real_bdev; ++ int minor = iminor(inode); ++ ++ vr = &vroot_dev[minor]; ++ real_bdev = vr->vr_device; ++ ++ vxdprintk(VXD_CBIT(misc, 0), ++ "vroot[%d]_get_real_bdev: dev=" VXF_DEV, ++ vr->vr_number, VXD_DEV(real_bdev)); ++ ++ if (vr->vr_state != Vr_bound) ++ return ERR_PTR(-ENXIO); ++ ++ __iget(real_bdev->bd_inode); ++ return real_bdev; ++} ++ ++ ++ ++/* ++ * And now the modules code and kernel interface. 
++ */ ++ ++module_param(max_vroot, int, 0); ++ ++MODULE_PARM_DESC(max_vroot, "Maximum number of vroot devices (1-256)"); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS_BLOCKDEV_MAJOR(VROOT_MAJOR); ++ ++MODULE_AUTHOR ("Herbert Pötzl"); ++MODULE_DESCRIPTION ("Virtual Root Device Mapper"); ++ ++ ++int __init vroot_init(void) ++{ ++ int err, i; ++ ++ if (max_vroot < 1 || max_vroot > 256) { ++ max_vroot = MAX_VROOT_DEFAULT; ++ printk(KERN_WARNING "vroot: invalid max_vroot " ++ "(must be between 1 and 256), " ++ "using default (%d)\n", max_vroot); ++ } ++ ++ if (register_blkdev(VROOT_MAJOR, "vroot")) ++ return -EIO; ++ ++ err = -ENOMEM; ++ vroot_dev = kmalloc(max_vroot * sizeof(struct vroot_device), GFP_KERNEL); ++ if (!vroot_dev) ++ goto out_mem1; ++ memset(vroot_dev, 0, max_vroot * sizeof(struct vroot_device)); ++ ++ disks = kmalloc(max_vroot * sizeof(struct gendisk *), GFP_KERNEL); ++ if (!disks) ++ goto out_mem2; ++ ++ for (i = 0; i < max_vroot; i++) { ++ disks[i] = alloc_disk(1); ++ if (!disks[i]) ++ goto out_mem3; ++ disks[i]->queue = blk_alloc_queue(GFP_KERNEL); ++ if (!disks[i]->queue) ++ goto out_mem3; ++ blk_queue_make_request(disks[i]->queue, vroot_make_request); ++ } ++ ++ for (i = 0; i < max_vroot; i++) { ++ struct vroot_device *vr = &vroot_dev[i]; ++ struct gendisk *disk = disks[i]; ++ ++ memset(vr, 0, sizeof(*vr)); ++ sema_init(&vr->vr_ctl_mutex, 1); ++ vr->vr_number = i; ++ disk->major = VROOT_MAJOR; ++ disk->first_minor = i; ++ disk->fops = &vr_fops; ++ sprintf(disk->disk_name, "vroot%d", i); ++ disk->private_data = vr; ++ } ++ ++ err = register_vroot_grb(&__vroot_get_real_bdev); ++ if (err) ++ goto out_mem3; ++ ++ for (i = 0; i < max_vroot; i++) ++ add_disk(disks[i]); ++ printk(KERN_INFO "vroot: loaded (max %d devices)\n", max_vroot); ++ return 0; ++ ++out_mem3: ++ while (i--) ++ put_disk(disks[i]); ++ kfree(disks); ++out_mem2: ++ kfree(vroot_dev); ++out_mem1: ++ unregister_blkdev(VROOT_MAJOR, "vroot"); ++ printk(KERN_ERR "vroot: ran out of memory\n"); ++ return err; ++} ++ ++void vroot_exit(void) ++{ ++ int i; ++ ++ if (unregister_vroot_grb(&__vroot_get_real_bdev)) ++ printk(KERN_WARNING "vroot: cannot unregister grb\n"); ++ ++ for (i = 0; i < max_vroot; i++) { ++ del_gendisk(disks[i]); ++ put_disk(disks[i]); ++ } ++ unregister_blkdev(VROOT_MAJOR, "vroot"); ++ ++ kfree(disks); ++ kfree(vroot_dev); ++} ++ ++module_init(vroot_init); ++module_exit(vroot_exit); ++ ++#ifndef MODULE ++ ++static int __init max_vroot_setup(char *str) ++{ ++ max_vroot = simple_strtol(str, NULL, 0); ++ return 1; ++} ++ ++__setup("max_vroot=", max_vroot_setup); ++ ++#endif ++ +diff -NurpP --minimal linux-3.2.34/drivers/infiniband/Kconfig linux-3.2.34-vs2.3.2.15/drivers/infiniband/Kconfig +--- linux-3.2.34/drivers/infiniband/Kconfig 2011-07-22 11:17:45.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/drivers/infiniband/Kconfig 2012-02-15 03:26:22.000000000 +0100 +@@ -39,7 +39,7 @@ config INFINIBAND_USER_MEM + config INFINIBAND_ADDR_TRANS + bool + depends on INET +- depends on !(INFINIBAND = y && IPV6 = m) ++ depends on !(INFINIBAND = y && IPV6 = y) + default y + + source "drivers/infiniband/hw/mthca/Kconfig" +diff -NurpP --minimal linux-3.2.34/drivers/infiniband/core/addr.c linux-3.2.34-vs2.3.2.15/drivers/infiniband/core/addr.c +--- linux-3.2.34/drivers/infiniband/core/addr.c 2012-01-09 16:14:19.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/drivers/infiniband/core/addr.c 2011-12-05 19:33:02.000000000 +0100 +@@ -255,7 +255,7 @@ static int addr6_resolve(struct sockaddr + + if (ipv6_addr_any(&fl6.saddr)) { + ret = 
ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev, +- &fl6.daddr, 0, &fl6.saddr); ++ &fl6.daddr, 0, &fl6.saddr, NULL); + if (ret) + goto put; + +diff -NurpP --minimal linux-3.2.34/drivers/md/dm-ioctl.c linux-3.2.34-vs2.3.2.15/drivers/md/dm-ioctl.c +--- linux-3.2.34/drivers/md/dm-ioctl.c 2012-11-18 18:42:11.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/drivers/md/dm-ioctl.c 2012-03-14 10:24:05.000000000 +0100 +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + + #include + +@@ -106,7 +107,8 @@ static struct hash_cell *__get_name_cell + unsigned int h = hash_str(str); + + list_for_each_entry (hc, _name_buckets + h, name_list) +- if (!strcmp(hc->name, str)) { ++ if (vx_check(dm_get_xid(hc->md), VS_WATCH_P | VS_IDENT) && ++ !strcmp(hc->name, str)) { + dm_get(hc->md); + return hc; + } +@@ -120,7 +122,8 @@ static struct hash_cell *__get_uuid_cell + unsigned int h = hash_str(str); + + list_for_each_entry (hc, _uuid_buckets + h, uuid_list) +- if (!strcmp(hc->uuid, str)) { ++ if (vx_check(dm_get_xid(hc->md), VS_WATCH_P | VS_IDENT) && ++ !strcmp(hc->uuid, str)) { + dm_get(hc->md); + return hc; + } +@@ -131,13 +134,15 @@ static struct hash_cell *__get_uuid_cell + static struct hash_cell *__get_dev_cell(uint64_t dev) + { + struct mapped_device *md; +- struct hash_cell *hc; ++ struct hash_cell *hc = NULL; + + md = dm_get_md(huge_decode_dev(dev)); + if (!md) + return NULL; + +- hc = dm_get_mdptr(md); ++ if (vx_check(dm_get_xid(md), VS_WATCH_P | VS_IDENT)) ++ hc = dm_get_mdptr(md); ++ + if (!hc) { + dm_put(md); + return NULL; +@@ -445,6 +450,9 @@ typedef int (*ioctl_fn)(struct dm_ioctl + + static int remove_all(struct dm_ioctl *param, size_t param_size) + { ++ if (!vx_check(0, VS_ADMIN)) ++ return -EPERM; ++ + dm_hash_remove_all(1); + param->data_size = 0; + return 0; +@@ -492,6 +500,8 @@ static int list_devices(struct dm_ioctl + */ + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_entry (hc, _name_buckets + i, name_list) { ++ if (!vx_check(dm_get_xid(hc->md), VS_WATCH_P | VS_IDENT)) ++ continue; + needed += sizeof(struct dm_name_list); + needed += strlen(hc->name) + 1; + needed += ALIGN_MASK; +@@ -515,6 +525,8 @@ static int list_devices(struct dm_ioctl + */ + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_entry (hc, _name_buckets + i, name_list) { ++ if (!vx_check(dm_get_xid(hc->md), VS_WATCH_P | VS_IDENT)) ++ continue; + if (old_nl) + old_nl->next = (uint32_t) ((void *) nl - + (void *) old_nl); +@@ -1615,8 +1627,8 @@ static int ctl_ioctl(uint command, struc + ioctl_fn fn = NULL; + size_t input_param_size; + +- /* only root can play with this */ +- if (!capable(CAP_SYS_ADMIN)) ++ /* only root and certain contexts can play with this */ ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_ADMIN_MAPPER)) + return -EACCES; + + if (_IOC_TYPE(command) != DM_IOCTL) +diff -NurpP --minimal linux-3.2.34/drivers/md/dm.c linux-3.2.34-vs2.3.2.15/drivers/md/dm.c +--- linux-3.2.34/drivers/md/dm.c 2012-11-18 18:42:12.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/drivers/md/dm.c 2012-10-22 12:59:48.000000000 +0200 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + +@@ -132,6 +133,7 @@ struct mapped_device { + rwlock_t map_lock; + atomic_t holders; + atomic_t open_count; ++ xid_t xid; + + unsigned long flags; + +@@ -344,6 +346,7 @@ int dm_deleting_md(struct mapped_device + static int dm_blk_open(struct block_device *bdev, fmode_t mode) + { + struct mapped_device *md; ++ int ret = -ENXIO; + + spin_lock(&_minor_lock); + +@@ -352,18 +355,19 @@ static int dm_blk_open(struct block_devi + 
goto out; + + if (test_bit(DMF_FREEING, &md->flags) || +- dm_deleting_md(md)) { +- md = NULL; ++ dm_deleting_md(md)) ++ goto out; ++ ++ ret = -EACCES; ++ if (!vx_check(md->xid, VS_IDENT|VS_HOSTID)) + goto out; +- } + + dm_get(md); + atomic_inc(&md->open_count); +- ++ ret = 0; + out: + spin_unlock(&_minor_lock); +- +- return md ? 0 : -ENXIO; ++ return ret; + } + + static int dm_blk_close(struct gendisk *disk, fmode_t mode) +@@ -584,6 +588,14 @@ int dm_set_geometry(struct mapped_device + return 0; + } + ++/* ++ * Get the xid associated with a dm device ++ */ ++xid_t dm_get_xid(struct mapped_device *md) ++{ ++ return md->xid; ++} ++ + /*----------------------------------------------------------------- + * CRUD START: + * A more elegant soln is in the works that uses the queue +@@ -1870,6 +1882,7 @@ static struct mapped_device *alloc_dev(i + INIT_LIST_HEAD(&md->uevent_list); + spin_lock_init(&md->uevent_lock); + ++ md->xid = vx_current_xid(); + md->queue = blk_alloc_queue(GFP_KERNEL); + if (!md->queue) + goto bad_queue; +diff -NurpP --minimal linux-3.2.34/drivers/md/dm.h linux-3.2.34-vs2.3.2.15/drivers/md/dm.h +--- linux-3.2.34/drivers/md/dm.h 2012-01-09 16:14:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/drivers/md/dm.h 2011-12-05 19:33:02.000000000 +0100 +@@ -41,6 +41,8 @@ struct dm_dev_internal { + struct dm_table; + struct dm_md_mempools; + ++xid_t dm_get_xid(struct mapped_device *md); ++ + /*----------------------------------------------------------------- + * Internal table functions. + *---------------------------------------------------------------*/ +diff -NurpP --minimal linux-3.2.34/drivers/net/tun.c linux-3.2.34-vs2.3.2.15/drivers/net/tun.c +--- linux-3.2.34/drivers/net/tun.c 2012-11-18 18:42:14.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/drivers/net/tun.c 2012-09-01 11:10:32.000000000 +0200 +@@ -64,6 +64,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -121,6 +122,7 @@ struct tun_struct { + unsigned int flags; + uid_t owner; + gid_t group; ++ nid_t nid; + + struct net_device *dev; + u32 set_features; +@@ -909,6 +911,7 @@ static void tun_setup(struct net_device + + tun->owner = -1; + tun->group = -1; ++ tun->nid = current->nid; + + dev->ethtool_ops = &tun_ethtool_ops; + dev->destructor = tun_free_netdev; +@@ -1059,7 +1062,7 @@ static int tun_set_iff(struct net *net, + + if (((tun->owner != -1 && cred->euid != tun->owner) || + (tun->group != -1 && !in_egroup_p(tun->group))) && +- !capable(CAP_NET_ADMIN)) ++ !cap_raised(current_cap(), CAP_NET_ADMIN)) + return -EPERM; + err = security_tun_dev_attach(tun->socket.sk); + if (err < 0) +@@ -1073,7 +1076,7 @@ static int tun_set_iff(struct net *net, + char *name; + unsigned long flags = 0; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!nx_capable(CAP_NET_ADMIN, NXC_TUN_CREATE)) + return -EPERM; + err = security_tun_dev_create(); + if (err < 0) +@@ -1141,6 +1144,9 @@ static int tun_set_iff(struct net *net, + + sk->sk_destruct = tun_sock_destruct; + ++ if (!nx_check(tun->nid, VS_IDENT | VS_HOSTID | VS_ADMIN_P)) ++ return -EPERM; ++ + err = tun_attach(tun, file); + if (err < 0) + goto failed; +@@ -1324,6 +1330,16 @@ static long __tun_chr_ioctl(struct file + tun_debug(KERN_INFO, tun, "group set to %d\n", tun->group); + break; + ++ case TUNSETNID: ++ if (!capable(CAP_CONTEXT)) ++ return -EPERM; ++ ++ /* Set nid owner of the device */ ++ tun->nid = (nid_t) arg; ++ ++ tun_debug(KERN_INFO, tun, "nid owner set to %u\n", tun->nid); ++ break; ++ + case TUNSETLINK: + /* Only allow setting the type when the interface is 
down */ + if (tun->dev->flags & IFF_UP) { +diff -NurpP --minimal linux-3.2.34/drivers/tty/sysrq.c linux-3.2.34-vs2.3.2.15/drivers/tty/sysrq.c +--- linux-3.2.34/drivers/tty/sysrq.c 2011-05-22 16:17:44.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/drivers/tty/sysrq.c 2011-12-05 19:33:02.000000000 +0100 +@@ -41,6 +41,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -395,6 +396,21 @@ static struct sysrq_key_op sysrq_unrt_op + .enable_mask = SYSRQ_ENABLE_RTNICE, + }; + ++ ++#ifdef CONFIG_VSERVER_DEBUG ++static void sysrq_handle_vxinfo(int key) ++{ ++ dump_vx_info_inactive((key == 'x') ? 0 : 1); ++} ++ ++static struct sysrq_key_op sysrq_showvxinfo_op = { ++ .handler = sysrq_handle_vxinfo, ++ .help_msg = "conteXt", ++ .action_msg = "Show Context Info", ++ .enable_mask = SYSRQ_ENABLE_DUMP, ++}; ++#endif ++ + /* Key Operations table and lock */ + static DEFINE_SPINLOCK(sysrq_key_table_lock); + +@@ -449,7 +465,11 @@ static struct sysrq_key_op *sysrq_key_ta + NULL, /* v */ + &sysrq_showstate_blocked_op, /* w */ + /* x: May be registered on ppc/powerpc for xmon */ ++#ifdef CONFIG_VSERVER_DEBUG ++ &sysrq_showvxinfo_op, /* x */ ++#else + NULL, /* x */ ++#endif + /* y: May be registered on sparc64 for global register dump */ + NULL, /* y */ + &sysrq_ftrace_dump_op, /* z */ +@@ -464,6 +484,8 @@ static int sysrq_key_table_key2index(int + retval = key - '0'; + else if ((key >= 'a') && (key <= 'z')) + retval = key + 10 - 'a'; ++ else if ((key >= 'A') && (key <= 'Z')) ++ retval = key + 10 - 'A'; + else + retval = -1; + return retval; +diff -NurpP --minimal linux-3.2.34/drivers/tty/tty_io.c linux-3.2.34-vs2.3.2.15/drivers/tty/tty_io.c +--- linux-3.2.34/drivers/tty/tty_io.c 2012-01-09 16:14:48.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/drivers/tty/tty_io.c 2011-12-05 19:33:02.000000000 +0100 +@@ -105,6 +105,7 @@ + + #include + #include ++#include + + #undef TTY_DEBUG_HANGUP + +@@ -2080,7 +2081,8 @@ static int tiocsti(struct tty_struct *tt + char ch, mbz = 0; + struct tty_ldisc *ld; + +- if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) ++ if (((current->signal->tty != tty) && ++ !vx_capable(CAP_SYS_ADMIN, VXC_TIOCSTI))) + return -EPERM; + if (get_user(ch, p)) + return -EFAULT; +@@ -2368,6 +2370,7 @@ static int tiocspgrp(struct tty_struct * + return -ENOTTY; + if (get_user(pgrp_nr, p)) + return -EFAULT; ++ pgrp_nr = vx_rmap_pid(pgrp_nr); + if (pgrp_nr < 0) + return -EINVAL; + rcu_read_lock(); +diff -NurpP --minimal linux-3.2.34/fs/attr.c linux-3.2.34-vs2.3.2.15/fs/attr.c +--- linux-3.2.34/fs/attr.c 2012-11-18 18:42:20.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/attr.c 2012-06-14 20:45:24.000000000 +0200 +@@ -14,6 +14,9 @@ + #include + #include + #include ++#include ++#include ++#include + + /** + * inode_change_ok - check if attribute changes to an inode are allowed +@@ -74,6 +77,10 @@ int inode_change_ok(const struct inode * + return -EPERM; + } + ++ /* check for inode tag permission */ ++ if (dx_permission(inode, MAY_WRITE)) ++ return -EACCES; ++ + return 0; + } + EXPORT_SYMBOL(inode_change_ok); +@@ -144,6 +151,8 @@ void setattr_copy(struct inode *inode, c + inode->i_uid = attr->ia_uid; + if (ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; ++ if ((ia_valid & ATTR_TAG) && IS_TAGGED(inode)) ++ inode->i_tag = attr->ia_tag; + if (ia_valid & ATTR_ATIME) + inode->i_atime = timespec_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); +@@ -171,7 +180,8 @@ int notify_change(struct dentry * dentry + struct timespec now; + unsigned int ia_valid = attr->ia_valid; + +- if 
(ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { ++ if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ++ ATTR_TAG | ATTR_TIMES_SET)) { + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } +diff -NurpP --minimal linux-3.2.34/fs/block_dev.c linux-3.2.34-vs2.3.2.15/fs/block_dev.c +--- linux-3.2.34/fs/block_dev.c 2012-11-18 18:42:20.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/block_dev.c 2012-06-14 20:45:24.000000000 +0200 +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + #include + #include "internal.h" + +@@ -563,6 +564,7 @@ struct block_device *bdget(dev_t dev) + bdev->bd_invalidated = 0; + inode->i_mode = S_IFBLK; + inode->i_rdev = dev; ++ inode->i_mdev = dev; + inode->i_bdev = bdev; + inode->i_data.a_ops = &def_blk_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); +@@ -609,6 +611,11 @@ EXPORT_SYMBOL(bdput); + static struct block_device *bd_acquire(struct inode *inode) + { + struct block_device *bdev; ++ dev_t mdev; ++ ++ if (!vs_map_blkdev(inode->i_rdev, &mdev, DATTR_OPEN)) ++ return NULL; ++ inode->i_mdev = mdev; + + spin_lock(&bdev_lock); + bdev = inode->i_bdev; +@@ -619,7 +626,7 @@ static struct block_device *bd_acquire(s + } + spin_unlock(&bdev_lock); + +- bdev = bdget(inode->i_rdev); ++ bdev = bdget(mdev); + if (bdev) { + spin_lock(&bdev_lock); + if (!inode->i_bdev) { +diff -NurpP --minimal linux-3.2.34/fs/btrfs/ctree.h linux-3.2.34-vs2.3.2.15/fs/btrfs/ctree.h +--- linux-3.2.34/fs/btrfs/ctree.h 2012-11-18 18:42:20.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/btrfs/ctree.h 2012-05-15 18:16:52.000000000 +0200 +@@ -643,11 +643,14 @@ struct btrfs_inode_item { + /* modification sequence number for NFS */ + __le64 sequence; + ++ __le16 tag; + /* + * a little future expansion, for more than this we can + * just grow the inode item and version it + */ +- __le64 reserved[4]; ++ __le16 reserved16; ++ __le32 reserved32; ++ __le64 reserved[3]; + struct btrfs_timespec atime; + struct btrfs_timespec ctime; + struct btrfs_timespec mtime; +@@ -1414,6 +1417,8 @@ struct btrfs_ioctl_defrag_range_args { + #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) + #define BTRFS_MOUNT_RECOVERY (1 << 18) + ++#define BTRFS_MOUNT_TAGGED (1 << 24) ++ + #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) + #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) + #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ +@@ -1621,6 +1626,7 @@ BTRFS_SETGET_FUNCS(inode_block_group, st + BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); + BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); + BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); ++BTRFS_SETGET_FUNCS(inode_tag, struct btrfs_inode_item, tag, 16); + BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); + BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); + BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); +@@ -1674,6 +1680,10 @@ BTRFS_SETGET_FUNCS(extent_flags, struct + + BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); + ++#define BTRFS_INODE_IXUNLINK (1 << 24) ++#define BTRFS_INODE_BARRIER (1 << 25) ++#define BTRFS_INODE_COW (1 << 26) ++ + + BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +@@ -2730,6 +2740,7 @@ extern const struct dentry_operations bt + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + void btrfs_update_iflags(struct inode *inode); + void btrfs_inherit_iflags(struct inode *inode, struct inode 
*dir); ++int btrfs_sync_flags(struct inode *inode, int, int); + int btrfs_defrag_file(struct inode *inode, struct file *file, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_pages); +diff -NurpP --minimal linux-3.2.34/fs/btrfs/disk-io.c linux-3.2.34-vs2.3.2.15/fs/btrfs/disk-io.c +--- linux-3.2.34/fs/btrfs/disk-io.c 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/btrfs/disk-io.c 2012-08-13 12:40:51.000000000 +0200 +@@ -2104,6 +2104,9 @@ struct btrfs_root *open_ctree(struct sup + goto fail_alloc; + } + ++ if (btrfs_test_opt(tree_root, TAGGED)) ++ sb->s_flags |= MS_TAGGED; ++ + features = btrfs_super_incompat_flags(disk_super) & + ~BTRFS_FEATURE_INCOMPAT_SUPP; + if (features) { +diff -NurpP --minimal linux-3.2.34/fs/btrfs/inode.c linux-3.2.34-vs2.3.2.15/fs/btrfs/inode.c +--- linux-3.2.34/fs/btrfs/inode.c 2012-01-09 16:14:53.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/btrfs/inode.c 2012-01-09 16:19:51.000000000 +0100 +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + #include "compat.h" + #include "ctree.h" + #include "disk-io.h" +@@ -2332,6 +2333,8 @@ static void btrfs_read_locked_inode(stru + struct btrfs_key location; + int maybe_acls; + u32 rdev; ++ uid_t uid; ++ gid_t gid; + int ret; + bool filled = false; + +@@ -2359,8 +2362,13 @@ static void btrfs_read_locked_inode(stru + struct btrfs_inode_item); + inode->i_mode = btrfs_inode_mode(leaf, inode_item); + set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); +- inode->i_uid = btrfs_inode_uid(leaf, inode_item); +- inode->i_gid = btrfs_inode_gid(leaf, inode_item); ++ ++ uid = btrfs_inode_uid(leaf, inode_item); ++ gid = btrfs_inode_gid(leaf, inode_item); ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, ++ btrfs_inode_tag(leaf, inode_item)); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + + tspec = btrfs_inode_atime(inode_item); +@@ -2438,8 +2446,14 @@ static void fill_inode_item(struct btrfs + struct btrfs_inode_item *item, + struct inode *inode) + { +- btrfs_set_inode_uid(leaf, item, inode->i_uid); +- btrfs_set_inode_gid(leaf, item, inode->i_gid); ++ uid_t uid = TAGINO_UID(DX_TAG(inode), inode->i_uid, inode->i_tag); ++ gid_t gid = TAGINO_GID(DX_TAG(inode), inode->i_gid, inode->i_tag); ++ ++ btrfs_set_inode_uid(leaf, item, uid); ++ btrfs_set_inode_gid(leaf, item, gid); ++#ifdef CONFIG_TAGGING_INTERN ++ btrfs_set_inode_tag(leaf, item, inode->i_tag); ++#endif + btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); +@@ -7377,11 +7391,13 @@ static const struct inode_operations btr + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, ++ .sync_flags = btrfs_sync_flags, + .get_acl = btrfs_get_acl, + }; + static const struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, ++ .sync_flags = btrfs_sync_flags, + .get_acl = btrfs_get_acl, + }; + +diff -NurpP --minimal linux-3.2.34/fs/btrfs/ioctl.c linux-3.2.34-vs2.3.2.15/fs/btrfs/ioctl.c +--- linux-3.2.34/fs/btrfs/ioctl.c 2012-01-09 16:14:53.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/btrfs/ioctl.c 2012-01-09 16:19:31.000000000 +0100 +@@ -71,10 +71,13 @@ static unsigned int btrfs_flags_to_ioctl + { + unsigned int iflags = 0; + +- if (flags & BTRFS_INODE_SYNC) +- iflags |= 
FS_SYNC_FL; + if (flags & BTRFS_INODE_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; ++ if (flags & BTRFS_INODE_IXUNLINK) ++ iflags |= FS_IXUNLINK_FL; ++ ++ if (flags & BTRFS_INODE_SYNC) ++ iflags |= FS_SYNC_FL; + if (flags & BTRFS_INODE_APPEND) + iflags |= FS_APPEND_FL; + if (flags & BTRFS_INODE_NODUMP) +@@ -91,28 +94,78 @@ static unsigned int btrfs_flags_to_ioctl + else if (flags & BTRFS_INODE_NOCOMPRESS) + iflags |= FS_NOCOMP_FL; + ++ if (flags & BTRFS_INODE_BARRIER) ++ iflags |= FS_BARRIER_FL; ++ if (flags & BTRFS_INODE_COW) ++ iflags |= FS_COW_FL; + return iflags; + } + + /* +- * Update inode->i_flags based on the btrfs internal flags. ++ * Update inode->i_(v)flags based on the btrfs internal flags. + */ + void btrfs_update_iflags(struct inode *inode) + { + struct btrfs_inode *ip = BTRFS_I(inode); + +- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | ++ S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); + +- if (ip->flags & BTRFS_INODE_SYNC) +- inode->i_flags |= S_SYNC; + if (ip->flags & BTRFS_INODE_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; ++ if (ip->flags & BTRFS_INODE_IXUNLINK) ++ inode->i_flags |= S_IXUNLINK; ++ ++ if (ip->flags & BTRFS_INODE_SYNC) ++ inode->i_flags |= S_SYNC; + if (ip->flags & BTRFS_INODE_APPEND) + inode->i_flags |= S_APPEND; + if (ip->flags & BTRFS_INODE_NOATIME) + inode->i_flags |= S_NOATIME; + if (ip->flags & BTRFS_INODE_DIRSYNC) + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (ip->flags & BTRFS_INODE_BARRIER) ++ inode->i_vflags |= V_BARRIER; ++ if (ip->flags & BTRFS_INODE_COW) ++ inode->i_vflags |= V_COW; ++} ++ ++/* ++ * Update btrfs internal flags from inode->i_(v)flags. ++ */ ++void btrfs_update_flags(struct inode *inode) ++{ ++ struct btrfs_inode *ip = BTRFS_I(inode); ++ ++ unsigned int flags = inode->i_flags; ++ unsigned int vflags = inode->i_vflags; ++ ++ ip->flags &= ~(BTRFS_INODE_SYNC | BTRFS_INODE_APPEND | ++ BTRFS_INODE_IMMUTABLE | BTRFS_INODE_IXUNLINK | ++ BTRFS_INODE_NOATIME | BTRFS_INODE_DIRSYNC | ++ BTRFS_INODE_BARRIER | BTRFS_INODE_COW); ++ ++ if (flags & S_IMMUTABLE) ++ ip->flags |= BTRFS_INODE_IMMUTABLE; ++ if (flags & S_IXUNLINK) ++ ip->flags |= BTRFS_INODE_IXUNLINK; ++ ++ if (flags & S_SYNC) ++ ip->flags |= BTRFS_INODE_SYNC; ++ if (flags & S_APPEND) ++ ip->flags |= BTRFS_INODE_APPEND; ++ if (flags & S_NOATIME) ++ ip->flags |= BTRFS_INODE_NOATIME; ++ if (flags & S_DIRSYNC) ++ ip->flags |= BTRFS_INODE_DIRSYNC; ++ ++ if (vflags & V_BARRIER) ++ ip->flags |= BTRFS_INODE_BARRIER; ++ if (vflags & V_COW) ++ ip->flags |= BTRFS_INODE_COW; + } + + /* +@@ -128,6 +181,7 @@ void btrfs_inherit_iflags(struct inode * + return; + + flags = BTRFS_I(dir)->flags; ++ flags &= ~BTRFS_INODE_BARRIER; + + if (flags & BTRFS_INODE_NOCOMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; +@@ -143,6 +197,30 @@ void btrfs_inherit_iflags(struct inode * + btrfs_update_iflags(inode); + } + ++int btrfs_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ struct btrfs_inode *ip = BTRFS_I(inode); ++ struct btrfs_root *root = ip->root; ++ struct btrfs_trans_handle *trans; ++ int ret; ++ ++ trans = btrfs_join_transaction(root); ++ BUG_ON(!trans); ++ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ btrfs_update_flags(inode); ++ ++ ret = btrfs_update_inode(trans, root, inode); ++ BUG_ON(ret); ++ ++ btrfs_update_iflags(inode); ++ inode->i_ctime = CURRENT_TIME; ++ btrfs_end_transaction(trans, root); ++ ++ return 0; ++} ++ + static int 
btrfs_ioctl_getflags(struct file *file, void __user *arg) + { + struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); +@@ -194,7 +272,8 @@ static int btrfs_ioctl_setflags(struct f + + flags = btrfs_mask_flags(inode->i_mode, flags); + oldflags = btrfs_flags_to_ioctl(ip->flags); +- if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { ++ if ((flags ^ oldflags) & (FS_APPEND_FL | ++ FS_IMMUTABLE_FL | FS_IXUNLINK_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + ret = -EPERM; + goto out_unlock; +@@ -205,14 +284,19 @@ static int btrfs_ioctl_setflags(struct f + if (ret) + goto out_unlock; + +- if (flags & FS_SYNC_FL) +- ip->flags |= BTRFS_INODE_SYNC; +- else +- ip->flags &= ~BTRFS_INODE_SYNC; + if (flags & FS_IMMUTABLE_FL) + ip->flags |= BTRFS_INODE_IMMUTABLE; + else + ip->flags &= ~BTRFS_INODE_IMMUTABLE; ++ if (flags & FS_IXUNLINK_FL) ++ ip->flags |= BTRFS_INODE_IXUNLINK; ++ else ++ ip->flags &= ~BTRFS_INODE_IXUNLINK; ++ ++ if (flags & FS_SYNC_FL) ++ ip->flags |= BTRFS_INODE_SYNC; ++ else ++ ip->flags &= ~BTRFS_INODE_SYNC; + if (flags & FS_APPEND_FL) + ip->flags |= BTRFS_INODE_APPEND; + else +diff -NurpP --minimal linux-3.2.34/fs/btrfs/super.c linux-3.2.34-vs2.3.2.15/fs/btrfs/super.c +--- linux-3.2.34/fs/btrfs/super.c 2012-01-09 16:14:53.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/btrfs/super.c 2012-01-09 16:19:31.000000000 +0100 +@@ -165,7 +165,8 @@ enum { + Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, + Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, +- Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, ++ Opt_inode_cache, Opt_no_space_cache, Opt_recovery, ++ Opt_tag, Opt_notag, Opt_tagid, Opt_err, + }; + + static match_table_t tokens = { +@@ -200,6 +201,9 @@ static match_table_t tokens = { + {Opt_inode_cache, "inode_cache"}, + {Opt_no_space_cache, "nospace_cache"}, + {Opt_recovery, "recovery"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, ++ {Opt_tagid, "tagid=%u"}, + {Opt_err, NULL}, + }; + +@@ -398,6 +402,22 @@ int btrfs_parse_options(struct btrfs_roo + printk(KERN_INFO "btrfs: enabling auto recovery"); + btrfs_set_opt(info->mount_opt, RECOVERY); + break; ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ printk(KERN_INFO "btrfs: use tagging\n"); ++ btrfs_set_opt(info->mount_opt, TAGGED); ++ break; ++ case Opt_notag: ++ printk(KERN_INFO "btrfs: disabled tagging\n"); ++ btrfs_clear_opt(info->mount_opt, TAGGED); ++ break; ++#endif ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ btrfs_set_opt(info->mount_opt, TAGGED); ++ break; ++#endif + case Opt_err: + printk(KERN_INFO "btrfs: unrecognized mount option " + "'%s'\n", p); +@@ -985,6 +1005,12 @@ static int btrfs_remount(struct super_bl + if (ret) + return -EINVAL; + ++ if (btrfs_test_opt(root, TAGGED) && !(sb->s_flags & MS_TAGGED)) { ++ printk("btrfs: %s: tagging not permitted on remount.\n", ++ sb->s_id); ++ return -EINVAL; ++ } ++ + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + +diff -NurpP --minimal linux-3.2.34/fs/char_dev.c linux-3.2.34-vs2.3.2.15/fs/char_dev.c +--- linux-3.2.34/fs/char_dev.c 2011-03-15 18:07:31.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/char_dev.c 2011-12-05 19:33:02.000000000 +0100 +@@ -21,6 +21,8 @@ + #include + #include + #include ++#include ++#include + + #include "internal.h" + +@@ -371,14 +373,21 @@ static int chrdev_open(struct inode *ino + struct cdev *p; + struct cdev *new = NULL; + int ret = 0; ++ dev_t mdev; ++ ++ if (!vs_map_chrdev(inode->i_rdev, &mdev, 
DATTR_OPEN)) ++ return -EPERM; ++ inode->i_mdev = mdev; + + spin_lock(&cdev_lock); + p = inode->i_cdev; + if (!p) { + struct kobject *kobj; + int idx; ++ + spin_unlock(&cdev_lock); +- kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); ++ ++ kobj = kobj_lookup(cdev_map, mdev, &idx); + if (!kobj) + return -ENXIO; + new = container_of(kobj, struct cdev, kobj); +diff -NurpP --minimal linux-3.2.34/fs/dcache.c linux-3.2.34-vs2.3.2.15/fs/dcache.c +--- linux-3.2.34/fs/dcache.c 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/dcache.c 2012-10-22 12:59:51.000000000 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + #include "internal.h" + + /* +@@ -539,6 +540,8 @@ int d_invalidate(struct dentry * dentry) + spin_lock(&dentry->d_lock); + } + ++ vx_dentry_dec(dentry); ++ + /* + * Somebody else still using it? + * +@@ -568,6 +571,7 @@ EXPORT_SYMBOL(d_invalidate); + static inline void __dget_dlock(struct dentry *dentry) + { + dentry->d_count++; ++ vx_dentry_inc(dentry); + } + + static inline void __dget(struct dentry *dentry) +@@ -1196,6 +1200,9 @@ struct dentry *__d_alloc(struct super_bl + struct dentry *dentry; + char *dname; + ++ if (!vx_dentry_avail(1)) ++ return NULL; ++ + dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); + if (!dentry) + return NULL; +@@ -1218,6 +1225,7 @@ struct dentry *__d_alloc(struct super_bl + + dentry->d_count = 1; + dentry->d_flags = 0; ++ vx_dentry_inc(dentry); + spin_lock_init(&dentry->d_lock); + seqcount_init(&dentry->d_seq); + dentry->d_inode = NULL; +@@ -1876,6 +1884,7 @@ struct dentry *__d_lookup(struct dentry + } + + dentry->d_count++; ++ vx_dentry_inc(dentry); + found = dentry; + spin_unlock(&dentry->d_lock); + break; +diff -NurpP --minimal linux-3.2.34/fs/devpts/inode.c linux-3.2.34-vs2.3.2.15/fs/devpts/inode.c +--- linux-3.2.34/fs/devpts/inode.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/devpts/inode.c 2011-12-05 21:23:19.000000000 +0100 +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + #define DEVPTS_DEFAULT_MODE 0600 + /* +@@ -36,6 +37,20 @@ + #define DEVPTS_DEFAULT_PTMX_MODE 0000 + #define PTMX_MINOR 2 + ++static int devpts_permission(struct inode *inode, int mask) ++{ ++ int ret = -EACCES; ++ ++ /* devpts is xid tagged */ ++ if (vx_check((xid_t)inode->i_tag, VS_WATCH_P | VS_IDENT)) ++ ret = generic_permission(inode, mask); ++ return ret; ++} ++ ++static struct inode_operations devpts_file_inode_operations = { ++ .permission = devpts_permission, ++}; ++ + extern int pty_limit; /* Config limit on Unix98 ptys */ + static DEFINE_MUTEX(allocated_ptys_lock); + +@@ -263,6 +278,34 @@ static int devpts_show_options(struct se + return 0; + } + ++static int devpts_filter(struct dentry *de) ++{ ++ xid_t xid = 0; ++ ++ /* devpts is xid tagged */ ++ if (de && de->d_inode) ++ xid = (xid_t)de->d_inode->i_tag; ++#ifdef CONFIG_VSERVER_WARN_DEVPTS ++ else ++ vxwprintk_task(1, "devpts " VS_Q("%.*s") " without inode.", ++ de->d_name.len, de->d_name.name); ++#endif ++ return vx_check(xid, VS_WATCH_P | VS_IDENT); ++} ++ ++static int devpts_readdir(struct file * filp, void * dirent, filldir_t filldir) ++{ ++ return dcache_readdir_filter(filp, dirent, filldir, devpts_filter); ++} ++ ++static struct file_operations devpts_dir_operations = { ++ .open = dcache_dir_open, ++ .release = dcache_dir_close, ++ .llseek = dcache_dir_lseek, ++ .read = generic_read_dir, ++ .readdir = devpts_readdir, ++}; ++ + static const struct super_operations devpts_sops = { + .statfs = simple_statfs, + .remount_fs = 
devpts_remount, +@@ -306,8 +349,10 @@ devpts_fill_super(struct super_block *s, + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; + inode->i_op = &simple_dir_inode_operations; +- inode->i_fop = &simple_dir_operations; ++ inode->i_fop = &devpts_dir_operations; + set_nlink(inode, 2); ++ /* devpts is xid tagged */ ++ inode->i_tag = (tag_t)vx_current_xid(); + + s->s_root = d_alloc_root(inode); + if (s->s_root) +@@ -494,6 +539,9 @@ int devpts_pty_new(struct inode *ptmx_in + inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + init_special_inode(inode, S_IFCHR|opts->mode, device); ++ /* devpts is xid tagged */ ++ inode->i_tag = (tag_t)vx_current_xid(); ++ inode->i_op = &devpts_file_inode_operations; + inode->i_private = tty; + tty->driver_data = inode; + +diff -NurpP --minimal linux-3.2.34/fs/ext2/balloc.c linux-3.2.34-vs2.3.2.15/fs/ext2/balloc.c +--- linux-3.2.34/fs/ext2/balloc.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext2/balloc.c 2011-12-05 19:33:02.000000000 +0100 +@@ -701,7 +701,6 @@ ext2_try_to_allocate(struct super_block + start = 0; + end = EXT2_BLOCKS_PER_GROUP(sb); + } +- + BUG_ON(start > EXT2_BLOCKS_PER_GROUP(sb)); + + repeat: +diff -NurpP --minimal linux-3.2.34/fs/ext2/ext2.h linux-3.2.34-vs2.3.2.15/fs/ext2/ext2.h +--- linux-3.2.34/fs/ext2/ext2.h 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext2/ext2.h 2011-12-05 19:33:02.000000000 +0100 +@@ -126,6 +126,7 @@ extern void ext2_set_inode_flags(struct + extern void ext2_get_inode_flags(struct ext2_inode_info *); + extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); ++extern int ext2_sync_flags(struct inode *, int, int); + + /* ioctl.c */ + extern long ext2_ioctl(struct file *, unsigned int, unsigned long); +diff -NurpP --minimal linux-3.2.34/fs/ext2/file.c linux-3.2.34-vs2.3.2.15/fs/ext2/file.c +--- linux-3.2.34/fs/ext2/file.c 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/ext2/file.c 2011-12-05 19:33:02.000000000 +0100 +@@ -104,4 +104,5 @@ const struct inode_operations ext2_file_ + .setattr = ext2_setattr, + .get_acl = ext2_get_acl, + .fiemap = ext2_fiemap, ++ .sync_flags = ext2_sync_flags, + }; +diff -NurpP --minimal linux-3.2.34/fs/ext2/ialloc.c linux-3.2.34-vs2.3.2.15/fs/ext2/ialloc.c +--- linux-3.2.34/fs/ext2/ialloc.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext2/ialloc.c 2011-12-05 19:33:02.000000000 +0100 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include "ext2.h" + #include "xattr.h" + #include "acl.h" +@@ -549,6 +550,7 @@ got: + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; ++ inode->i_tag = dx_current_fstag(sb); + } else + inode_init_owner(inode, dir, mode); + +diff -NurpP --minimal linux-3.2.34/fs/ext2/inode.c linux-3.2.34-vs2.3.2.15/fs/ext2/inode.c +--- linux-3.2.34/fs/ext2/inode.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext2/inode.c 2011-12-05 21:24:12.000000000 +0100 +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + #include "ext2.h" + #include "acl.h" + #include "xip.h" +@@ -1167,7 +1168,7 @@ static void ext2_truncate_blocks(struct + return; + if (ext2_inode_is_fast_symlink(inode)) + return; +- if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) ++ if (IS_APPEND(inode) || IS_IXORUNLINK(inode)) + return; + 
__ext2_truncate_blocks(inode, offset); + } +@@ -1258,36 +1259,61 @@ void ext2_set_inode_flags(struct inode * + { + unsigned int flags = EXT2_I(inode)->i_flags; + +- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | ++ S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); ++ ++ ++ if (flags & EXT2_IMMUTABLE_FL) ++ inode->i_flags |= S_IMMUTABLE; ++ if (flags & EXT2_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; ++ + if (flags & EXT2_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT2_APPEND_FL) + inode->i_flags |= S_APPEND; +- if (flags & EXT2_IMMUTABLE_FL) +- inode->i_flags |= S_IMMUTABLE; + if (flags & EXT2_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT2_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & EXT2_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & EXT2_COW_FL) ++ inode->i_vflags |= V_COW; + } + + /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ + void ext2_get_inode_flags(struct ext2_inode_info *ei) + { + unsigned int flags = ei->vfs_inode.i_flags; ++ unsigned int vflags = ei->vfs_inode.i_vflags; ++ ++ ei->i_flags &= ~(EXT2_SYNC_FL | EXT2_APPEND_FL | ++ EXT2_IMMUTABLE_FL | EXT2_IXUNLINK_FL | ++ EXT2_NOATIME_FL | EXT2_DIRSYNC_FL | ++ EXT2_BARRIER_FL | EXT2_COW_FL); ++ ++ if (flags & S_IMMUTABLE) ++ ei->i_flags |= EXT2_IMMUTABLE_FL; ++ if (flags & S_IXUNLINK) ++ ei->i_flags |= EXT2_IXUNLINK_FL; + +- ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL| +- EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT2_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT2_APPEND_FL; +- if (flags & S_IMMUTABLE) +- ei->i_flags |= EXT2_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT2_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT2_DIRSYNC_FL; ++ ++ if (vflags & V_BARRIER) ++ ei->i_flags |= EXT2_BARRIER_FL; ++ if (vflags & V_COW) ++ ei->i_flags |= EXT2_COW_FL; + } + + struct inode *ext2_iget (struct super_block *sb, unsigned long ino) +@@ -1297,6 +1323,8 @@ struct inode *ext2_iget (struct super_bl + struct ext2_inode *raw_inode; + struct inode *inode; + long ret = -EIO; ++ uid_t uid; ++ gid_t gid; + int n; + + inode = iget_locked(sb, ino); +@@ -1315,12 +1343,16 @@ struct inode *ext2_iget (struct super_bl + } + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); +- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); +- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); ++ uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); ++ gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (!(test_opt (inode->i_sb, NO_UID32))) { +- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; +- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; ++ uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; ++ gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, ++ le16_to_cpu(raw_inode->i_raw_tag)); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); +@@ -1418,8 +1450,8 @@ static int __ext2_write_inode(struct ino + struct ext2_inode_info *ei = EXT2_I(inode); + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; +- uid_t uid = inode->i_uid; +- gid_t gid = inode->i_gid; ++ uid_t uid = 
TAGINO_UID(DX_TAG(inode), inode->i_uid, inode->i_tag); ++ gid_t gid = TAGINO_GID(DX_TAG(inode), inode->i_gid, inode->i_tag); + struct buffer_head * bh; + struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh); + int n; +@@ -1455,6 +1487,9 @@ static int __ext2_write_inode(struct ino + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } ++#ifdef CONFIG_TAGGING_INTERN ++ raw_inode->i_raw_tag = cpu_to_le16(inode->i_tag); ++#endif + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(inode->i_size); + raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); +@@ -1535,7 +1570,8 @@ int ext2_setattr(struct dentry *dentry, + if (is_quota_modification(inode, iattr)) + dquot_initialize(inode); + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || +- (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { ++ (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) || ++ (iattr->ia_valid & ATTR_TAG && iattr->ia_tag != inode->i_tag)) { + error = dquot_transfer(inode, iattr); + if (error) + return error; +diff -NurpP --minimal linux-3.2.34/fs/ext2/ioctl.c linux-3.2.34-vs2.3.2.15/fs/ext2/ioctl.c +--- linux-3.2.34/fs/ext2/ioctl.c 2011-05-22 16:17:51.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/ext2/ioctl.c 2011-12-05 19:33:02.000000000 +0100 +@@ -17,6 +17,16 @@ + #include + + ++int ext2_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ ext2_get_inode_flags(EXT2_I(inode)); ++ inode->i_ctime = CURRENT_TIME_SEC; ++ mark_inode_dirty(inode); ++ return 0; ++} ++ + long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_dentry->d_inode; +@@ -51,6 +61,11 @@ long ext2_ioctl(struct file *filp, unsig + + flags = ext2_mask_flags(inode->i_mode, flags); + ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } ++ + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { +@@ -66,7 +81,9 @@ long ext2_ioctl(struct file *filp, unsig + * + * This test looks nicer. 
Thanks to Pauline Middelink + */ +- if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { ++ if ((oldflags & EXT2_IMMUTABLE_FL) || ++ ((flags ^ oldflags) & (EXT2_APPEND_FL | ++ EXT2_IMMUTABLE_FL | EXT2_IXUNLINK_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + ret = -EPERM; +@@ -74,7 +91,7 @@ long ext2_ioctl(struct file *filp, unsig + } + } + +- flags = flags & EXT2_FL_USER_MODIFIABLE; ++ flags &= EXT2_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE; + ei->i_flags = flags; + mutex_unlock(&inode->i_mutex); +diff -NurpP --minimal linux-3.2.34/fs/ext2/namei.c linux-3.2.34-vs2.3.2.15/fs/ext2/namei.c +--- linux-3.2.34/fs/ext2/namei.c 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/ext2/namei.c 2011-12-05 19:33:02.000000000 +0100 +@@ -32,6 +32,7 @@ + + #include + #include ++#include + #include "ext2.h" + #include "xattr.h" + #include "acl.h" +@@ -73,6 +74,7 @@ static struct dentry *ext2_lookup(struct + (unsigned long) ino); + return ERR_PTR(-EIO); + } ++ dx_propagate_tag(nd, inode); + } + return d_splice_alias(inode, dentry); + } +@@ -408,6 +410,7 @@ const struct inode_operations ext2_dir_i + .removexattr = generic_removexattr, + #endif + .setattr = ext2_setattr, ++ .sync_flags = ext2_sync_flags, + .get_acl = ext2_get_acl, + }; + +diff -NurpP --minimal linux-3.2.34/fs/ext2/super.c linux-3.2.34-vs2.3.2.15/fs/ext2/super.c +--- linux-3.2.34/fs/ext2/super.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext2/super.c 2011-12-05 19:33:02.000000000 +0100 +@@ -394,7 +394,8 @@ enum { + Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, + Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, + Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, +- Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation ++ Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation, ++ Opt_tag, Opt_notag, Opt_tagid + }; + + static const match_table_t tokens = { +@@ -422,6 +423,9 @@ static const match_table_t tokens = { + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_xip, "xip"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, ++ {Opt_tagid, "tagid=%u"}, + {Opt_grpquota, "grpquota"}, + {Opt_ignore, "noquota"}, + {Opt_quota, "quota"}, +@@ -492,6 +496,20 @@ static int parse_options(char *options, + case Opt_nouid32: + set_opt (sbi->s_mount_opt, NO_UID32); + break; ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ set_opt (sbi->s_mount_opt, TAGGED); ++ break; ++ case Opt_notag: ++ clear_opt (sbi->s_mount_opt, TAGGED); ++ break; ++#endif ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ set_opt (sbi->s_mount_opt, TAGGED); ++ break; ++#endif + case Opt_nocheck: + clear_opt (sbi->s_mount_opt, CHECK); + break; +@@ -850,6 +868,8 @@ static int ext2_fill_super(struct super_ + if (!parse_options((char *) data, sb)) + goto failed_mount; + ++ if (EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_TAGGED) ++ sb->s_flags |= MS_TAGGED; + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? + MS_POSIXACL : 0); +@@ -1224,6 +1244,14 @@ static int ext2_remount (struct super_bl + goto restore_opts; + } + ++ if ((sbi->s_mount_opt & EXT2_MOUNT_TAGGED) && ++ !(sb->s_flags & MS_TAGGED)) { ++ printk("EXT2-fs: %s: tagging not permitted on remount.\n", ++ sb->s_id); ++ err = -EINVAL; ++ goto restore_opts; ++ } ++ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? 
MS_POSIXACL : 0); + +diff -NurpP --minimal linux-3.2.34/fs/ext3/file.c linux-3.2.34-vs2.3.2.15/fs/ext3/file.c +--- linux-3.2.34/fs/ext3/file.c 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/ext3/file.c 2011-12-05 19:33:02.000000000 +0100 +@@ -80,5 +80,6 @@ const struct inode_operations ext3_file_ + #endif + .get_acl = ext3_get_acl, + .fiemap = ext3_fiemap, ++ .sync_flags = ext3_sync_flags, + }; + +diff -NurpP --minimal linux-3.2.34/fs/ext3/ialloc.c linux-3.2.34-vs2.3.2.15/fs/ext3/ialloc.c +--- linux-3.2.34/fs/ext3/ialloc.c 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext3/ialloc.c 2012-06-14 20:45:24.000000000 +0200 +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -496,6 +497,7 @@ got: + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; ++ inode->i_tag = dx_current_fstag(sb); + } else + inode_init_owner(inode, dir, mode); + +diff -NurpP --minimal linux-3.2.34/fs/ext3/inode.c linux-3.2.34-vs2.3.2.15/fs/ext3/inode.c +--- linux-3.2.34/fs/ext3/inode.c 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext3/inode.c 2012-10-22 12:59:51.000000000 +0200 +@@ -38,6 +38,7 @@ + #include + #include + #include ++#include + #include + #include "xattr.h" + #include "acl.h" +@@ -2852,36 +2853,60 @@ void ext3_set_inode_flags(struct inode * + { + unsigned int flags = EXT3_I(inode)->i_flags; + +- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | ++ S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); ++ ++ if (flags & EXT3_IMMUTABLE_FL) ++ inode->i_flags |= S_IMMUTABLE; ++ if (flags & EXT3_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; ++ + if (flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT3_APPEND_FL) + inode->i_flags |= S_APPEND; +- if (flags & EXT3_IMMUTABLE_FL) +- inode->i_flags |= S_IMMUTABLE; + if (flags & EXT3_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT3_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & EXT3_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & EXT3_COW_FL) ++ inode->i_vflags |= V_COW; + } + + /* Propagate flags from i_flags to EXT3_I(inode)->i_flags */ + void ext3_get_inode_flags(struct ext3_inode_info *ei) + { + unsigned int flags = ei->vfs_inode.i_flags; ++ unsigned int vflags = ei->vfs_inode.i_vflags; ++ ++ ei->i_flags &= ~(EXT3_SYNC_FL | EXT3_APPEND_FL | ++ EXT3_IMMUTABLE_FL | EXT3_IXUNLINK_FL | ++ EXT3_NOATIME_FL | EXT3_DIRSYNC_FL | ++ EXT3_BARRIER_FL | EXT3_COW_FL); ++ ++ if (flags & S_IMMUTABLE) ++ ei->i_flags |= EXT3_IMMUTABLE_FL; ++ if (flags & S_IXUNLINK) ++ ei->i_flags |= EXT3_IXUNLINK_FL; + +- ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL| +- EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT3_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT3_APPEND_FL; +- if (flags & S_IMMUTABLE) +- ei->i_flags |= EXT3_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT3_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT3_DIRSYNC_FL; ++ ++ if (vflags & V_BARRIER) ++ ei->i_flags |= EXT3_BARRIER_FL; ++ if (vflags & V_COW) ++ ei->i_flags |= EXT3_COW_FL; + } + + struct inode *ext3_iget(struct super_block *sb, unsigned long ino) +@@ -2895,6 +2920,8 @@ struct inode *ext3_iget(struct super_blo + transaction_t *transaction; + long ret; + int block; ++ uid_t uid; ++ gid_t gid; + + inode = iget_locked(sb, ino); + if (!inode) +@@ -2911,12 
+2938,16 @@ struct inode *ext3_iget(struct super_blo + bh = iloc.bh; + raw_inode = ext3_raw_inode(&iloc); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); +- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); +- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); ++ uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); ++ gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if(!(test_opt (inode->i_sb, NO_UID32))) { +- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; +- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; ++ uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; ++ gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, ++ le16_to_cpu(raw_inode->i_raw_tag)); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); +@@ -3071,6 +3102,8 @@ static int ext3_do_update_inode(handle_t + struct ext3_inode *raw_inode = ext3_raw_inode(iloc); + struct ext3_inode_info *ei = EXT3_I(inode); + struct buffer_head *bh = iloc->bh; ++ uid_t uid = TAGINO_UID(DX_TAG(inode), inode->i_uid, inode->i_tag); ++ gid_t gid = TAGINO_GID(DX_TAG(inode), inode->i_gid, inode->i_tag); + int err = 0, rc, block; + int need_datasync = 0; + __le32 disksize; +@@ -3087,29 +3120,32 @@ again: + ext3_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if(!(test_opt(inode->i_sb, NO_UID32))) { +- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); +- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); ++ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid)); ++ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid)); + /* + * Fix up interoperability with old kernels. 
Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if(!ei->i_dtime) { + raw_inode->i_uid_high = +- cpu_to_le16(high_16_bits(inode->i_uid)); ++ cpu_to_le16(high_16_bits(uid)); + raw_inode->i_gid_high = +- cpu_to_le16(high_16_bits(inode->i_gid)); ++ cpu_to_le16(high_16_bits(gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = +- cpu_to_le16(fs_high2lowuid(inode->i_uid)); ++ cpu_to_le16(fs_high2lowuid(uid)); + raw_inode->i_gid_low = +- cpu_to_le16(fs_high2lowgid(inode->i_gid)); ++ cpu_to_le16(fs_high2lowgid(gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } ++#ifdef CONFIG_TAGGING_INTERN ++ raw_inode->i_raw_tag = cpu_to_le16(inode->i_tag); ++#endif + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + disksize = cpu_to_le32(ei->i_disksize); + if (disksize != raw_inode->i_size) { +@@ -3278,7 +3314,8 @@ int ext3_setattr(struct dentry *dentry, + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || +- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { ++ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid) || ++ (ia_valid & ATTR_TAG && attr->ia_tag != inode->i_tag)) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, +@@ -3300,6 +3337,8 @@ int ext3_setattr(struct dentry *dentry, + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; ++ if ((attr->ia_valid & ATTR_TAG) && IS_TAGGED(inode)) ++ inode->i_tag = attr->ia_tag; + error = ext3_mark_inode_dirty(handle, inode); + ext3_journal_stop(handle); + } +diff -NurpP --minimal linux-3.2.34/fs/ext3/ioctl.c linux-3.2.34-vs2.3.2.15/fs/ext3/ioctl.c +--- linux-3.2.34/fs/ext3/ioctl.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext3/ioctl.c 2011-12-05 19:33:02.000000000 +0100 +@@ -8,6 +8,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -17,6 +18,34 @@ + #include + #include + ++ ++int ext3_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ handle_t *handle = NULL; ++ struct ext3_iloc iloc; ++ int err; ++ ++ handle = ext3_journal_start(inode, 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ err = ext3_reserve_inode_write(handle, inode, &iloc); ++ if (err) ++ goto flags_err; ++ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ ext3_get_inode_flags(EXT3_I(inode)); ++ inode->i_ctime = CURRENT_TIME_SEC; ++ ++ err = ext3_mark_iloc_dirty(handle, inode, &iloc); ++flags_err: ++ ext3_journal_stop(handle); ++ return err; ++} ++ + long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_dentry->d_inode; +@@ -50,6 +79,11 @@ long ext3_ioctl(struct file *filp, unsig + + flags = ext3_mask_flags(inode->i_mode, flags); + ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } ++ + mutex_lock(&inode->i_mutex); + + /* Is it quota file? Do not allow user to mess with it */ +@@ -68,7 +102,9 @@ long ext3_ioctl(struct file *filp, unsig + * + * This test looks nicer. 
Thanks to Pauline Middelink + */ +- if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { ++ if ((oldflags & EXT3_IMMUTABLE_FL) || ++ ((flags ^ oldflags) & (EXT3_APPEND_FL | ++ EXT3_IMMUTABLE_FL | EXT3_IXUNLINK_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } +@@ -93,7 +129,7 @@ long ext3_ioctl(struct file *filp, unsig + if (err) + goto flags_err; + +- flags = flags & EXT3_FL_USER_MODIFIABLE; ++ flags &= EXT3_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; + ei->i_flags = flags; + +diff -NurpP --minimal linux-3.2.34/fs/ext3/namei.c linux-3.2.34-vs2.3.2.15/fs/ext3/namei.c +--- linux-3.2.34/fs/ext3/namei.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext3/namei.c 2011-12-05 19:33:02.000000000 +0100 +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + #include + + #include "namei.h" +@@ -925,6 +926,7 @@ restart: + ll_rw_block(READ | REQ_META | REQ_PRIO, + 1, &bh); + } ++ dx_propagate_tag(nd, inode); + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; +@@ -2535,6 +2537,7 @@ const struct inode_operations ext3_dir_i + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, + #endif ++ .sync_flags = ext3_sync_flags, + .get_acl = ext3_get_acl, + }; + +diff -NurpP --minimal linux-3.2.34/fs/ext3/super.c linux-3.2.34-vs2.3.2.15/fs/ext3/super.c +--- linux-3.2.34/fs/ext3/super.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext3/super.c 2011-12-05 19:33:02.000000000 +0100 +@@ -831,7 +831,8 @@ enum { + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, +- Opt_resize, Opt_usrquota, Opt_grpquota ++ Opt_resize, Opt_usrquota, Opt_grpquota, ++ Opt_tag, Opt_notag, Opt_tagid + }; + + static const match_table_t tokens = { +@@ -888,6 +889,9 @@ static const match_table_t tokens = { + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_resize, "resize"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, ++ {Opt_tagid, "tagid=%u"}, + {Opt_err, NULL}, + }; + +@@ -1040,6 +1044,20 @@ static int parse_options (char *options, + case Opt_nouid32: + set_opt (sbi->s_mount_opt, NO_UID32); + break; ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ set_opt (sbi->s_mount_opt, TAGGED); ++ break; ++ case Opt_notag: ++ clear_opt (sbi->s_mount_opt, TAGGED); ++ break; ++#endif ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ set_opt (sbi->s_mount_opt, TAGGED); ++ break; ++#endif + case Opt_nocheck: + clear_opt (sbi->s_mount_opt, CHECK); + break; +@@ -1738,6 +1756,9 @@ static int ext3_fill_super (struct super + NULL, 0)) + goto failed_mount; + ++ if (EXT3_SB(sb)->s_mount_opt & EXT3_MOUNT_TAGGED) ++ sb->s_flags |= MS_TAGGED; ++ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + +@@ -2619,6 +2640,14 @@ static int ext3_remount (struct super_bl + if (test_opt(sb, ABORT)) + ext3_abort(sb, __func__, "Abort forced by user"); + ++ if ((sbi->s_mount_opt & EXT3_MOUNT_TAGGED) && ++ !(sb->s_flags & MS_TAGGED)) { ++ printk("EXT3-fs: %s: tagging not permitted on remount.\n", ++ sb->s_id); ++ err = -EINVAL; ++ goto restore_opts; ++ } ++ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? 
MS_POSIXACL : 0); + +diff -NurpP --minimal linux-3.2.34/fs/ext4/ext4.h linux-3.2.34-vs2.3.2.15/fs/ext4/ext4.h +--- linux-3.2.34/fs/ext4/ext4.h 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext4/ext4.h 2012-08-13 12:40:51.000000000 +0200 +@@ -373,8 +373,12 @@ struct flex_groups { + #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ + #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ ++#define EXT4_IXUNLINK_FL 0x08000000 /* Immutable invert on unlink */ + #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + ++#define EXT4_BARRIER_FL 0x04000000 /* Barrier for chroot() */ ++#define EXT4_COW_FL 0x20000000 /* Copy on Write marker */ ++ + #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ + #define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ + +@@ -634,7 +638,8 @@ struct ext4_inode { + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ +- __u32 l_i_reserved2; ++ __le16 l_i_tag; /* Context Tag */ ++ __u16 l_i_reserved2; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ +@@ -752,6 +757,7 @@ do { \ + #define i_gid_low i_gid + #define i_uid_high osd2.linux2.l_i_uid_high + #define i_gid_high osd2.linux2.l_i_gid_high ++#define i_raw_tag osd2.linux2.l_i_tag + #define i_reserved2 osd2.linux2.l_i_reserved2 + + #elif defined(__GNU__) +@@ -928,6 +934,7 @@ struct ext4_inode_info { + #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ + #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ + #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ ++#define EXT4_MOUNT_TAGGED 0x40000 /* Enable Context Tags */ + #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ + #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ + #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +@@ -2269,6 +2276,7 @@ extern int ext4_map_blocks(handle_t *han + struct ext4_map_blocks *map, int flags); + extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); ++extern int ext4_sync_flags(struct inode *, int, int); + /* move_extent.c */ + extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, +diff -NurpP --minimal linux-3.2.34/fs/ext4/file.c linux-3.2.34-vs2.3.2.15/fs/ext4/file.c +--- linux-3.2.34/fs/ext4/file.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext4/file.c 2011-12-05 19:33:02.000000000 +0100 +@@ -258,5 +258,6 @@ const struct inode_operations ext4_file_ + #endif + .get_acl = ext4_get_acl, + .fiemap = ext4_fiemap, ++ .sync_flags = ext4_sync_flags, + }; + +diff -NurpP --minimal linux-3.2.34/fs/ext4/ialloc.c linux-3.2.34-vs2.3.2.15/fs/ext4/ialloc.c +--- linux-3.2.34/fs/ext4/ialloc.c 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext4/ialloc.c 2012-08-13 12:40:51.000000000 +0200 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + + #include "ext4.h" +@@ -860,6 +861,7 @@ got: + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; ++ inode->i_tag = dx_current_fstag(sb); + } else + inode_init_owner(inode, dir, mode); + +diff -NurpP --minimal linux-3.2.34/fs/ext4/inode.c linux-3.2.34-vs2.3.2.15/fs/ext4/inode.c +--- linux-3.2.34/fs/ext4/inode.c 2012-11-18 18:42:21.000000000 +0100 
++++ linux-3.2.34-vs2.3.2.15/fs/ext4/inode.c 2012-10-22 12:59:51.000000000 +0200 +@@ -38,6 +38,7 @@ + #include + #include + #include ++#include + + #include "ext4_jbd2.h" + #include "xattr.h" +@@ -3697,41 +3698,64 @@ void ext4_set_inode_flags(struct inode * + { + unsigned int flags = EXT4_I(inode)->i_flags; + +- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | ++ S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); ++ ++ if (flags & EXT4_IMMUTABLE_FL) ++ inode->i_flags |= S_IMMUTABLE; ++ if (flags & EXT4_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; ++ + if (flags & EXT4_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT4_APPEND_FL) + inode->i_flags |= S_APPEND; +- if (flags & EXT4_IMMUTABLE_FL) +- inode->i_flags |= S_IMMUTABLE; + if (flags & EXT4_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT4_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & EXT4_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & EXT4_COW_FL) ++ inode->i_vflags |= V_COW; + } + + /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ + void ext4_get_inode_flags(struct ext4_inode_info *ei) + { +- unsigned int vfs_fl; ++ unsigned int vfs_fl, vfs_vf; + unsigned long old_fl, new_fl; + + do { + vfs_fl = ei->vfs_inode.i_flags; ++ vfs_vf = ei->vfs_inode.i_vflags; + old_fl = ei->i_flags; + new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| + EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| +- EXT4_DIRSYNC_FL); ++ EXT4_DIRSYNC_FL|EXT4_BARRIER_FL| ++ EXT4_COW_FL); ++ ++ if (vfs_fl & S_IMMUTABLE) ++ new_fl |= EXT4_IMMUTABLE_FL; ++ if (vfs_fl & S_IXUNLINK) ++ new_fl |= EXT4_IXUNLINK_FL; ++ + if (vfs_fl & S_SYNC) + new_fl |= EXT4_SYNC_FL; + if (vfs_fl & S_APPEND) + new_fl |= EXT4_APPEND_FL; +- if (vfs_fl & S_IMMUTABLE) +- new_fl |= EXT4_IMMUTABLE_FL; + if (vfs_fl & S_NOATIME) + new_fl |= EXT4_NOATIME_FL; + if (vfs_fl & S_DIRSYNC) + new_fl |= EXT4_DIRSYNC_FL; ++ ++ if (vfs_vf & V_BARRIER) ++ new_fl |= EXT4_BARRIER_FL; ++ if (vfs_vf & V_COW) ++ new_fl |= EXT4_COW_FL; + } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); + } + +@@ -3767,6 +3791,8 @@ struct inode *ext4_iget(struct super_blo + journal_t *journal = EXT4_SB(sb)->s_journal; + long ret; + int block; ++ uid_t uid; ++ gid_t gid; + + inode = iget_locked(sb, ino); + if (!inode) +@@ -3782,12 +3808,16 @@ struct inode *ext4_iget(struct super_blo + goto bad_inode; + raw_inode = ext4_raw_inode(&iloc); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); +- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); +- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); ++ uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); ++ gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (!(test_opt(inode->i_sb, NO_UID32))) { +- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; +- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; ++ uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; ++ gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, ++ le16_to_cpu(raw_inode->i_raw_tag)); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); + + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ +@@ -4006,6 +4036,8 @@ static int ext4_do_update_inode(handle_t + struct ext4_inode *raw_inode = ext4_raw_inode(iloc); + struct ext4_inode_info *ei = EXT4_I(inode); + struct buffer_head 
*bh = iloc->bh; ++ uid_t uid = TAGINO_UID(DX_TAG(inode), inode->i_uid, inode->i_tag); ++ gid_t gid = TAGINO_GID(DX_TAG(inode), inode->i_gid, inode->i_tag); + int err = 0, rc, block; + int need_datasync = 0; + +@@ -4017,29 +4049,32 @@ static int ext4_do_update_inode(handle_t + ext4_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if (!(test_opt(inode->i_sb, NO_UID32))) { +- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); +- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); ++ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid)); ++ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid)); + /* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if (!ei->i_dtime) { + raw_inode->i_uid_high = +- cpu_to_le16(high_16_bits(inode->i_uid)); ++ cpu_to_le16(high_16_bits(uid)); + raw_inode->i_gid_high = +- cpu_to_le16(high_16_bits(inode->i_gid)); ++ cpu_to_le16(high_16_bits(gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = +- cpu_to_le16(fs_high2lowuid(inode->i_uid)); ++ cpu_to_le16(fs_high2lowuid(uid)); + raw_inode->i_gid_low = +- cpu_to_le16(fs_high2lowgid(inode->i_gid)); ++ cpu_to_le16(fs_high2lowgid(gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } ++#ifdef CONFIG_TAGGING_INTERN ++ raw_inode->i_raw_tag = cpu_to_le16(inode->i_tag); ++#endif + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + + EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); +@@ -4228,7 +4263,8 @@ int ext4_setattr(struct dentry *dentry, + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || +- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { ++ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid) || ++ (ia_valid & ATTR_TAG && attr->ia_tag != inode->i_tag)) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, +@@ -4250,6 +4286,8 @@ int ext4_setattr(struct dentry *dentry, + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; ++ if ((attr->ia_valid & ATTR_TAG) && IS_TAGGED(inode)) ++ inode->i_tag = attr->ia_tag; + error = ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + } +diff -NurpP --minimal linux-3.2.34/fs/ext4/ioctl.c linux-3.2.34-vs2.3.2.15/fs/ext4/ioctl.c +--- linux-3.2.34/fs/ext4/ioctl.c 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext4/ioctl.c 2012-06-14 20:45:24.000000000 +0200 +@@ -14,10 +14,39 @@ + #include + #include + #include ++#include + #include + #include "ext4_jbd2.h" + #include "ext4.h" + ++ ++int ext4_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ handle_t *handle = NULL; ++ struct ext4_iloc iloc; ++ int err; ++ ++ handle = ext4_journal_start(inode, 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_SYNC(inode)) ++ ext4_handle_sync(handle); ++ err = ext4_reserve_inode_write(handle, inode, &iloc); ++ if (err) ++ goto flags_err; ++ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ ext4_get_inode_flags(EXT4_I(inode)); ++ inode->i_ctime = ext4_current_time(inode); ++ ++ err = ext4_mark_iloc_dirty(handle, inode, &iloc); ++flags_err: ++ ext4_journal_stop(handle); ++ return err; ++} ++ + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_dentry->d_inode; +@@ -51,6 +80,11 @@ long ext4_ioctl(struct file *filp, 
unsig + + flags = ext4_mask_flags(inode->i_mode, flags); + ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } ++ + err = -EPERM; + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ +@@ -68,7 +102,9 @@ long ext4_ioctl(struct file *filp, unsig + * + * This test looks nicer. Thanks to Pauline Middelink + */ +- if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { ++ if ((oldflags & EXT4_IMMUTABLE_FL) || ++ ((flags ^ oldflags) & (EXT4_APPEND_FL | ++ EXT4_IMMUTABLE_FL | EXT4_IXUNLINK_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } +diff -NurpP --minimal linux-3.2.34/fs/ext4/namei.c linux-3.2.34-vs2.3.2.15/fs/ext4/namei.c +--- linux-3.2.34/fs/ext4/namei.c 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext4/namei.c 2012-10-22 12:59:51.000000000 +0200 +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include + #include "ext4.h" + #include "ext4_jbd2.h" + +@@ -925,6 +926,7 @@ restart: + ll_rw_block(READ | REQ_META | REQ_PRIO, + 1, &bh); + } ++ dx_propagate_tag(nd, inode); + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; +@@ -2591,6 +2593,7 @@ const struct inode_operations ext4_dir_i + #endif + .get_acl = ext4_get_acl, + .fiemap = ext4_fiemap, ++ .sync_flags = ext4_sync_flags, + }; + + const struct inode_operations ext4_special_inode_operations = { +diff -NurpP --minimal linux-3.2.34/fs/ext4/super.c linux-3.2.34-vs2.3.2.15/fs/ext4/super.c +--- linux-3.2.34/fs/ext4/super.c 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ext4/super.c 2012-09-16 18:25:50.000000000 +0200 +@@ -1336,6 +1336,7 @@ enum { + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, ++ Opt_tag, Opt_notag, Opt_tagid + }; + + static const match_table_t tokens = { +@@ -1411,6 +1412,9 @@ static const match_table_t tokens = { + {Opt_init_itable, "init_itable=%u"}, + {Opt_init_itable, "init_itable"}, + {Opt_noinit_itable, "noinit_itable"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, ++ {Opt_tagid, "tagid=%u"}, + {Opt_err, NULL}, + }; + +@@ -1579,6 +1583,20 @@ static int parse_options(char *options, + case Opt_nouid32: + set_opt(sb, NO_UID32); + break; ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ set_opt(sb, TAGGED); ++ break; ++ case Opt_notag: ++ clear_opt(sb, TAGGED); ++ break; ++#endif ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ set_opt(sb, TAGGED); ++ break; ++#endif + case Opt_debug: + set_opt(sb, DEBUG); + break; +@@ -3376,6 +3394,9 @@ static int ext4_fill_super(struct super_ + } + } + ++ if (EXT4_SB(sb)->s_mount_opt & EXT4_MOUNT_TAGGED) ++ sb->s_flags |= MS_TAGGED; ++ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + +@@ -4538,6 +4559,14 @@ static int ext4_remount(struct super_blo + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) + ext4_abort(sb, "Abort forced by user"); + ++ if ((sbi->s_mount_opt & EXT4_MOUNT_TAGGED) && ++ !(sb->s_flags & MS_TAGGED)) { ++ printk("EXT4-fs: %s: tagging not permitted on remount.\n", ++ sb->s_id); ++ err = -EINVAL; ++ goto restore_opts; ++ } ++ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? 
MS_POSIXACL : 0); + +diff -NurpP --minimal linux-3.2.34/fs/fcntl.c linux-3.2.34-vs2.3.2.15/fs/fcntl.c +--- linux-3.2.34/fs/fcntl.c 2011-05-22 16:17:52.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/fcntl.c 2011-12-05 19:33:02.000000000 +0100 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -103,6 +104,8 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldf + + if (tofree) + filp_close(tofree, files); ++ else ++ vx_openfd_inc(newfd); /* fd was unused */ + + return newfd; + +@@ -447,6 +450,8 @@ SYSCALL_DEFINE3(fcntl, unsigned int, fd, + filp = fget_raw(fd); + if (!filp) + goto out; ++ if (!vx_files_avail(1)) ++ goto out; + + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { +diff -NurpP --minimal linux-3.2.34/fs/file.c linux-3.2.34-vs2.3.2.15/fs/file.c +--- linux-3.2.34/fs/file.c 2011-05-22 16:17:52.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/file.c 2011-12-05 19:33:02.000000000 +0100 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + struct fdtable_defer { + spinlock_t lock; +@@ -359,6 +360,8 @@ struct files_struct *dup_fd(struct files + struct file *f = *old_fds++; + if (f) { + get_file(f); ++ /* TODO: sum it first for check and performance */ ++ vx_openfd_inc(open_files - i); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet +@@ -466,6 +469,7 @@ repeat: + else + FD_CLR(fd, fdt->close_on_exec); + error = fd; ++ vx_openfd_inc(fd); + #if 1 + /* Sanity check */ + if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { +diff -NurpP --minimal linux-3.2.34/fs/file_table.c linux-3.2.34-vs2.3.2.15/fs/file_table.c +--- linux-3.2.34/fs/file_table.c 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/file_table.c 2011-12-05 19:33:02.000000000 +0100 +@@ -24,6 +24,8 @@ + #include + #include + #include ++#include ++#include + + #include + +@@ -135,6 +137,8 @@ struct file *get_empty_filp(void) + spin_lock_init(&f->f_lock); + eventpoll_init_file(f); + /* f->f_version: 0 */ ++ f->f_xid = vx_current_xid(); ++ vx_files_inc(f); + return f; + + over: +@@ -253,6 +257,8 @@ static void __fput(struct file *file) + } + fops_put(file->f_op); + put_pid(file->f_owner.pid); ++ vx_files_dec(file); ++ file->f_xid = 0; + file_sb_list_del(file); + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_dec(inode); +@@ -383,6 +389,8 @@ void put_filp(struct file *file) + { + if (atomic_long_dec_and_test(&file->f_count)) { + security_file_free(file); ++ vx_files_dec(file); ++ file->f_xid = 0; + file_sb_list_del(file); + file_free(file); + } +diff -NurpP --minimal linux-3.2.34/fs/fs_struct.c linux-3.2.34-vs2.3.2.15/fs/fs_struct.c +--- linux-3.2.34/fs/fs_struct.c 2011-03-15 18:07:31.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/fs_struct.c 2011-12-05 19:33:02.000000000 +0100 +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + #include "internal.h" + + static inline void path_get_longterm(struct path *path) +@@ -96,6 +97,7 @@ void free_fs_struct(struct fs_struct *fs + { + path_put_longterm(&fs->root); + path_put_longterm(&fs->pwd); ++ atomic_dec(&vs_global_fs); + kmem_cache_free(fs_cachep, fs); + } + +@@ -135,6 +137,7 @@ struct fs_struct *copy_fs_struct(struct + fs->pwd = old->pwd; + path_get_longterm(&fs->pwd); + spin_unlock(&old->lock); ++ atomic_inc(&vs_global_fs); + } + return fs; + } +diff -NurpP --minimal linux-3.2.34/fs/gfs2/file.c linux-3.2.34-vs2.3.2.15/fs/gfs2/file.c +--- linux-3.2.34/fs/gfs2/file.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/gfs2/file.c 
2011-12-05 19:33:02.000000000 +0100 +@@ -143,6 +143,9 @@ static const u32 fsflags_to_gfs2[32] = { + [7] = GFS2_DIF_NOATIME, + [12] = GFS2_DIF_EXHASH, + [14] = GFS2_DIF_INHERIT_JDATA, ++ [27] = GFS2_DIF_IXUNLINK, ++ [26] = GFS2_DIF_BARRIER, ++ [29] = GFS2_DIF_COW, + }; + + static const u32 gfs2_to_fsflags[32] = { +@@ -152,6 +155,9 @@ static const u32 gfs2_to_fsflags[32] = { + [gfs2fl_NoAtime] = FS_NOATIME_FL, + [gfs2fl_ExHash] = FS_INDEX_FL, + [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, ++ [gfs2fl_IXUnlink] = FS_IXUNLINK_FL, ++ [gfs2fl_Barrier] = FS_BARRIER_FL, ++ [gfs2fl_Cow] = FS_COW_FL, + }; + + static int gfs2_get_flags(struct file *filp, u32 __user *ptr) +@@ -182,12 +188,18 @@ void gfs2_set_inode_flags(struct inode * + { + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int flags = inode->i_flags; ++ unsigned int vflags = inode->i_vflags; ++ ++ flags &= ~(S_IMMUTABLE | S_IXUNLINK | ++ S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC | S_NOSEC); + +- flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC); + if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode)) + inode->i_flags |= S_NOSEC; + if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) + flags |= S_IMMUTABLE; ++ if (ip->i_diskflags & GFS2_DIF_IXUNLINK) ++ flags |= S_IXUNLINK; ++ + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) + flags |= S_APPEND; + if (ip->i_diskflags & GFS2_DIF_NOATIME) +@@ -195,6 +207,43 @@ void gfs2_set_inode_flags(struct inode * + if (ip->i_diskflags & GFS2_DIF_SYNC) + flags |= S_SYNC; + inode->i_flags = flags; ++ ++ vflags &= ~(V_BARRIER | V_COW); ++ ++ if (ip->i_diskflags & GFS2_DIF_BARRIER) ++ vflags |= V_BARRIER; ++ if (ip->i_diskflags & GFS2_DIF_COW) ++ vflags |= V_COW; ++ inode->i_vflags = vflags; ++} ++ ++void gfs2_get_inode_flags(struct inode *inode) ++{ ++ struct gfs2_inode *ip = GFS2_I(inode); ++ unsigned int flags = inode->i_flags; ++ unsigned int vflags = inode->i_vflags; ++ ++ ip->i_diskflags &= ~(GFS2_DIF_APPENDONLY | ++ GFS2_DIF_NOATIME | GFS2_DIF_SYNC | ++ GFS2_DIF_IMMUTABLE | GFS2_DIF_IXUNLINK | ++ GFS2_DIF_BARRIER | GFS2_DIF_COW); ++ ++ if (flags & S_IMMUTABLE) ++ ip->i_diskflags |= GFS2_DIF_IMMUTABLE; ++ if (flags & S_IXUNLINK) ++ ip->i_diskflags |= GFS2_DIF_IXUNLINK; ++ ++ if (flags & S_APPEND) ++ ip->i_diskflags |= GFS2_DIF_APPENDONLY; ++ if (flags & S_NOATIME) ++ ip->i_diskflags |= GFS2_DIF_NOATIME; ++ if (flags & S_SYNC) ++ ip->i_diskflags |= GFS2_DIF_SYNC; ++ ++ if (vflags & V_BARRIER) ++ ip->i_diskflags |= GFS2_DIF_BARRIER; ++ if (vflags & V_COW) ++ ip->i_diskflags |= GFS2_DIF_COW; + } + + /* Flags that can be set by user space */ +@@ -306,6 +355,37 @@ static int gfs2_set_flags(struct file *f + return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); + } + ++int gfs2_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ struct gfs2_inode *ip = GFS2_I(inode); ++ struct gfs2_sbd *sdp = GFS2_SB(inode); ++ struct buffer_head *bh; ++ struct gfs2_holder gh; ++ int error; ++ ++ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ++ if (error) ++ return error; ++ error = gfs2_trans_begin(sdp, RES_DINODE, 0); ++ if (error) ++ goto out; ++ error = gfs2_meta_inode_buffer(ip, &bh); ++ if (error) ++ goto out_trans_end; ++ gfs2_trans_add_bh(ip->i_gl, bh, 1); ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ gfs2_get_inode_flags(inode); ++ gfs2_dinode_out(ip, bh->b_data); ++ brelse(bh); ++ gfs2_set_aops(inode); ++out_trans_end: ++ gfs2_trans_end(sdp); ++out: ++ gfs2_glock_dq_uninit(&gh); ++ return error; ++} ++ + static long gfs2_ioctl(struct file *filp, unsigned int cmd, 
unsigned long arg) + { + switch(cmd) { +diff -NurpP --minimal linux-3.2.34/fs/gfs2/inode.h linux-3.2.34-vs2.3.2.15/fs/gfs2/inode.h +--- linux-3.2.34/fs/gfs2/inode.h 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/gfs2/inode.h 2011-12-05 19:33:02.000000000 +0100 +@@ -120,6 +120,7 @@ extern const struct file_operations gfs2 + extern const struct file_operations gfs2_dir_fops_nolock; + + extern void gfs2_set_inode_flags(struct inode *inode); ++extern int gfs2_sync_flags(struct inode *inode, int flags, int vflags); + + #ifdef CONFIG_GFS2_FS_LOCKING_DLM + extern const struct file_operations gfs2_file_fops; +diff -NurpP --minimal linux-3.2.34/fs/inode.c linux-3.2.34-vs2.3.2.15/fs/inode.c +--- linux-3.2.34/fs/inode.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/inode.c 2011-12-05 19:33:02.000000000 +0100 +@@ -26,6 +26,7 @@ + #include + #include + #include /* for inode_has_buffers */ ++#include + #include "internal.h" + + /* +@@ -137,6 +138,9 @@ int inode_init_always(struct super_block + struct address_space *const mapping = &inode->i_data; + + inode->i_sb = sb; ++ ++ /* essential because of inode slab reuse */ ++ inode->i_tag = 0; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); +@@ -158,6 +162,7 @@ int inode_init_always(struct super_block + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_rdev = 0; ++ inode->i_mdev = 0; + inode->dirtied_when = 0; + + if (security_inode_alloc(inode)) +@@ -399,6 +404,8 @@ void __insert_inode_hash(struct inode *i + } + EXPORT_SYMBOL(__insert_inode_hash); + ++EXPORT_SYMBOL_GPL(__iget); ++ + /** + * __remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash +@@ -1626,9 +1633,11 @@ void init_special_inode(struct inode *in + if (S_ISCHR(mode)) { + inode->i_fop = &def_chr_fops; + inode->i_rdev = rdev; ++ inode->i_mdev = rdev; + } else if (S_ISBLK(mode)) { + inode->i_fop = &def_blk_fops; + inode->i_rdev = rdev; ++ inode->i_mdev = rdev; + } else if (S_ISFIFO(mode)) + inode->i_fop = &def_fifo_fops; + else if (S_ISSOCK(mode)) +@@ -1657,6 +1666,7 @@ void inode_init_owner(struct inode *inod + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; ++ inode->i_tag = dx_current_fstag(inode->i_sb); + } + EXPORT_SYMBOL(inode_init_owner); + +diff -NurpP --minimal linux-3.2.34/fs/ioctl.c linux-3.2.34-vs2.3.2.15/fs/ioctl.c +--- linux-3.2.34/fs/ioctl.c 2011-05-22 16:17:52.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/ioctl.c 2011-12-05 19:33:02.000000000 +0100 +@@ -15,6 +15,9 @@ + #include + #include + #include ++#include ++#include ++#include + + #include + +diff -NurpP --minimal linux-3.2.34/fs/ioprio.c linux-3.2.34-vs2.3.2.15/fs/ioprio.c +--- linux-3.2.34/fs/ioprio.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ioprio.c 2011-12-05 19:33:02.000000000 +0100 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + int set_task_ioprio(struct task_struct *task, int ioprio) + { +@@ -120,6 +121,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, + else + pgrp = find_vpid(who); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ++ if (!vx_check(p->xid, VS_ADMIN_P | VS_IDENT)) ++ continue; + ret = set_task_ioprio(p, ioprio); + if (ret) + break; +@@ -209,6 +212,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, + else + pgrp = find_vpid(who); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ++ if (!vx_check(p->xid, VS_ADMIN_P | VS_IDENT)) ++ continue; + tmpio = get_task_ioprio(p); + if (tmpio < 0) + continue; +diff -NurpP --minimal 
linux-3.2.34/fs/jfs/file.c linux-3.2.34-vs2.3.2.15/fs/jfs/file.c +--- linux-3.2.34/fs/jfs/file.c 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/jfs/file.c 2011-12-05 19:33:02.000000000 +0100 +@@ -109,7 +109,8 @@ int jfs_setattr(struct dentry *dentry, s + if (is_quota_modification(inode, iattr)) + dquot_initialize(inode); + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || +- (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { ++ (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) || ++ (iattr->ia_valid & ATTR_TAG && iattr->ia_tag != inode->i_tag)) { + rc = dquot_transfer(inode, iattr); + if (rc) + return rc; +@@ -142,6 +143,7 @@ const struct inode_operations jfs_file_i + #ifdef CONFIG_JFS_POSIX_ACL + .get_acl = jfs_get_acl, + #endif ++ .sync_flags = jfs_sync_flags, + }; + + const struct file_operations jfs_file_operations = { +diff -NurpP --minimal linux-3.2.34/fs/jfs/ioctl.c linux-3.2.34-vs2.3.2.15/fs/jfs/ioctl.c +--- linux-3.2.34/fs/jfs/ioctl.c 2011-05-22 16:17:52.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/jfs/ioctl.c 2011-12-05 19:33:02.000000000 +0100 +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -52,6 +53,16 @@ static long jfs_map_ext2(unsigned long f + } + + ++int jfs_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ jfs_get_inode_flags(JFS_IP(inode)); ++ inode->i_ctime = CURRENT_TIME_SEC; ++ mark_inode_dirty(inode); ++ return 0; ++} ++ + long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_dentry->d_inode; +@@ -85,6 +96,11 @@ long jfs_ioctl(struct file *filp, unsign + if (!S_ISDIR(inode->i_mode)) + flags &= ~JFS_DIRSYNC_FL; + ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } ++ + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + err = -EPERM; +@@ -102,8 +118,8 @@ long jfs_ioctl(struct file *filp, unsign + * the relevant capability. 
+ */ + if ((oldflags & JFS_IMMUTABLE_FL) || +- ((flags ^ oldflags) & +- (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { ++ ((flags ^ oldflags) & (JFS_APPEND_FL | ++ JFS_IMMUTABLE_FL | JFS_IXUNLINK_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + err = -EPERM; +@@ -111,7 +127,7 @@ long jfs_ioctl(struct file *filp, unsign + } + } + +- flags = flags & JFS_FL_USER_MODIFIABLE; ++ flags &= JFS_FL_USER_MODIFIABLE; + flags |= oldflags & ~JFS_FL_USER_MODIFIABLE; + jfs_inode->mode2 = flags; + +diff -NurpP --minimal linux-3.2.34/fs/jfs/jfs_dinode.h linux-3.2.34-vs2.3.2.15/fs/jfs/jfs_dinode.h +--- linux-3.2.34/fs/jfs/jfs_dinode.h 2008-12-25 00:26:37.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/jfs/jfs_dinode.h 2011-12-05 19:33:02.000000000 +0100 +@@ -161,9 +161,13 @@ struct dinode { + + #define JFS_APPEND_FL 0x01000000 /* writes to file may only append */ + #define JFS_IMMUTABLE_FL 0x02000000 /* Immutable file */ ++#define JFS_IXUNLINK_FL 0x08000000 /* Immutable invert on unlink */ + +-#define JFS_FL_USER_VISIBLE 0x03F80000 +-#define JFS_FL_USER_MODIFIABLE 0x03F80000 ++#define JFS_BARRIER_FL 0x04000000 /* Barrier for chroot() */ ++#define JFS_COW_FL 0x20000000 /* Copy on Write marker */ ++ ++#define JFS_FL_USER_VISIBLE 0x07F80000 ++#define JFS_FL_USER_MODIFIABLE 0x07F80000 + #define JFS_FL_INHERIT 0x03C80000 + + /* These are identical to EXT[23]_IOC_GETFLAGS/SETFLAGS */ +diff -NurpP --minimal linux-3.2.34/fs/jfs/jfs_filsys.h linux-3.2.34-vs2.3.2.15/fs/jfs/jfs_filsys.h +--- linux-3.2.34/fs/jfs/jfs_filsys.h 2008-12-25 00:26:37.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/jfs/jfs_filsys.h 2011-12-05 19:33:02.000000000 +0100 +@@ -263,6 +263,7 @@ + #define JFS_NAME_MAX 255 + #define JFS_PATH_MAX BPSIZE + ++#define JFS_TAGGED 0x00800000 /* Context Tagging */ + + /* + * file system state (superblock state) +diff -NurpP --minimal linux-3.2.34/fs/jfs/jfs_imap.c linux-3.2.34-vs2.3.2.15/fs/jfs/jfs_imap.c +--- linux-3.2.34/fs/jfs/jfs_imap.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/jfs/jfs_imap.c 2011-12-05 19:33:02.000000000 +0100 +@@ -46,6 +46,7 @@ + #include + #include + #include ++#include + + #include "jfs_incore.h" + #include "jfs_inode.h" +@@ -3058,6 +3059,8 @@ static int copy_from_dinode(struct dinod + { + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); ++ uid_t uid; ++ gid_t gid; + + jfs_ip->fileset = le32_to_cpu(dip->di_fileset); + jfs_ip->mode2 = le32_to_cpu(dip->di_mode); +@@ -3078,14 +3081,18 @@ static int copy_from_dinode(struct dinod + } + set_nlink(ip, le32_to_cpu(dip->di_nlink)); + +- jfs_ip->saved_uid = le32_to_cpu(dip->di_uid); ++ uid = le32_to_cpu(dip->di_uid); ++ gid = le32_to_cpu(dip->di_gid); ++ ip->i_tag = INOTAG_TAG(DX_TAG(ip), uid, gid, 0); ++ ++ jfs_ip->saved_uid = INOTAG_UID(DX_TAG(ip), uid, gid); + if (sbi->uid == -1) + ip->i_uid = jfs_ip->saved_uid; + else { + ip->i_uid = sbi->uid; + } + +- jfs_ip->saved_gid = le32_to_cpu(dip->di_gid); ++ jfs_ip->saved_gid = INOTAG_GID(DX_TAG(ip), uid, gid); + if (sbi->gid == -1) + ip->i_gid = jfs_ip->saved_gid; + else { +@@ -3150,14 +3157,12 @@ static void copy_to_dinode(struct dinode + dip->di_size = cpu_to_le64(ip->i_size); + dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks)); + dip->di_nlink = cpu_to_le32(ip->i_nlink); +- if (sbi->uid == -1) +- dip->di_uid = cpu_to_le32(ip->i_uid); +- else +- dip->di_uid = cpu_to_le32(jfs_ip->saved_uid); +- if (sbi->gid == -1) +- dip->di_gid = cpu_to_le32(ip->i_gid); +- else +- dip->di_gid = 
cpu_to_le32(jfs_ip->saved_gid); ++ ++ dip->di_uid = cpu_to_le32(TAGINO_UID(DX_TAG(ip), ++ (sbi->uid == -1) ? ip->i_uid : jfs_ip->saved_uid, ip->i_tag)); ++ dip->di_gid = cpu_to_le32(TAGINO_GID(DX_TAG(ip), ++ (sbi->gid == -1) ? ip->i_gid : jfs_ip->saved_gid, ip->i_tag)); ++ + jfs_get_inode_flags(jfs_ip); + /* + * mode2 is only needed for storing the higher order bits. +diff -NurpP --minimal linux-3.2.34/fs/jfs/jfs_inode.c linux-3.2.34-vs2.3.2.15/fs/jfs/jfs_inode.c +--- linux-3.2.34/fs/jfs/jfs_inode.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/jfs/jfs_inode.c 2011-12-05 19:33:02.000000000 +0100 +@@ -18,6 +18,7 @@ + + #include + #include ++#include + #include "jfs_incore.h" + #include "jfs_inode.h" + #include "jfs_filsys.h" +@@ -30,29 +31,46 @@ void jfs_set_inode_flags(struct inode *i + { + unsigned int flags = JFS_IP(inode)->mode2; + +- inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | +- S_NOATIME | S_DIRSYNC | S_SYNC); ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | ++ S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); + + if (flags & JFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; ++ if (flags & JFS_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; ++ ++ if (flags & JFS_SYNC_FL) ++ inode->i_flags |= S_SYNC; + if (flags & JFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & JFS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & JFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +- if (flags & JFS_SYNC_FL) +- inode->i_flags |= S_SYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & JFS_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & JFS_COW_FL) ++ inode->i_vflags |= V_COW; + } + + void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip) + { + unsigned int flags = jfs_ip->vfs_inode.i_flags; ++ unsigned int vflags = jfs_ip->vfs_inode.i_vflags; ++ ++ jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_IXUNLINK_FL | ++ JFS_APPEND_FL | JFS_NOATIME_FL | ++ JFS_DIRSYNC_FL | JFS_SYNC_FL | ++ JFS_BARRIER_FL | JFS_COW_FL); + +- jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL | +- JFS_DIRSYNC_FL | JFS_SYNC_FL); + if (flags & S_IMMUTABLE) + jfs_ip->mode2 |= JFS_IMMUTABLE_FL; ++ if (flags & S_IXUNLINK) ++ jfs_ip->mode2 |= JFS_IXUNLINK_FL; ++ + if (flags & S_APPEND) + jfs_ip->mode2 |= JFS_APPEND_FL; + if (flags & S_NOATIME) +@@ -61,6 +79,11 @@ void jfs_get_inode_flags(struct jfs_inod + jfs_ip->mode2 |= JFS_DIRSYNC_FL; + if (flags & S_SYNC) + jfs_ip->mode2 |= JFS_SYNC_FL; ++ ++ if (vflags & V_BARRIER) ++ jfs_ip->mode2 |= JFS_BARRIER_FL; ++ if (vflags & V_COW) ++ jfs_ip->mode2 |= JFS_COW_FL; + } + + /* +diff -NurpP --minimal linux-3.2.34/fs/jfs/jfs_inode.h linux-3.2.34-vs2.3.2.15/fs/jfs/jfs_inode.h +--- linux-3.2.34/fs/jfs/jfs_inode.h 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/jfs/jfs_inode.h 2011-12-05 19:33:02.000000000 +0100 +@@ -39,6 +39,7 @@ extern struct dentry *jfs_fh_to_dentry(s + extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); + extern void jfs_set_inode_flags(struct inode *); ++extern int jfs_sync_flags(struct inode *, int, int); + extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); + extern int jfs_setattr(struct dentry *, struct iattr *); + +diff -NurpP --minimal linux-3.2.34/fs/jfs/namei.c linux-3.2.34-vs2.3.2.15/fs/jfs/namei.c +--- linux-3.2.34/fs/jfs/namei.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/jfs/namei.c 2011-12-05 19:33:02.000000000 +0100 +@@ -22,6 +22,7 @@ + #include + #include + 
#include ++#include + #include "jfs_incore.h" + #include "jfs_superblock.h" + #include "jfs_inode.h" +@@ -1474,6 +1475,7 @@ static struct dentry *jfs_lookup(struct + jfs_err("jfs_lookup: iget failed on inum %d", (uint)inum); + } + ++ dx_propagate_tag(nd, ip); + return d_splice_alias(ip, dentry); + } + +@@ -1538,6 +1540,7 @@ const struct inode_operations jfs_dir_in + #ifdef CONFIG_JFS_POSIX_ACL + .get_acl = jfs_get_acl, + #endif ++ .sync_flags = jfs_sync_flags, + }; + + const struct file_operations jfs_dir_operations = { +diff -NurpP --minimal linux-3.2.34/fs/jfs/super.c linux-3.2.34-vs2.3.2.15/fs/jfs/super.c +--- linux-3.2.34/fs/jfs/super.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/jfs/super.c 2011-12-05 19:33:02.000000000 +0100 +@@ -198,7 +198,8 @@ static void jfs_put_super(struct super_b + enum { + Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize, + Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota, +- Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask ++ Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask, ++ Opt_tag, Opt_notag, Opt_tagid + }; + + static const match_table_t tokens = { +@@ -208,6 +209,10 @@ static const match_table_t tokens = { + {Opt_resize, "resize=%u"}, + {Opt_resize_nosize, "resize"}, + {Opt_errors, "errors=%s"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, ++ {Opt_tagid, "tagid=%u"}, ++ {Opt_tag, "tagxid"}, + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_usrquota, "usrquota"}, +@@ -342,6 +347,20 @@ static int parse_options(char *options, + } + break; + } ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ *flag |= JFS_TAGGED; ++ break; ++ case Opt_notag: ++ *flag &= JFS_TAGGED; ++ break; ++#endif ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ *flag |= JFS_TAGGED; ++ break; ++#endif + default: + printk("jfs: Unrecognized mount option \"%s\" " + " or missing value\n", p); +@@ -373,6 +392,12 @@ static int jfs_remount(struct super_bloc + return -EINVAL; + } + ++ if ((flag & JFS_TAGGED) && !(sb->s_flags & MS_TAGGED)) { ++ printk(KERN_ERR "JFS: %s: tagging not permitted on remount.\n", ++ sb->s_id); ++ return -EINVAL; ++ } ++ + if (newLVSize) { + if (sb->s_flags & MS_RDONLY) { + printk(KERN_ERR +@@ -455,6 +480,9 @@ static int jfs_fill_super(struct super_b + #ifdef CONFIG_JFS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; + #endif ++ /* map mount option tagxid */ ++ if (sbi->flag & JFS_TAGGED) ++ sb->s_flags |= MS_TAGGED; + + if (newLVSize) { + printk(KERN_ERR "resize option for remount only\n"); +diff -NurpP --minimal linux-3.2.34/fs/libfs.c linux-3.2.34-vs2.3.2.15/fs/libfs.c +--- linux-3.2.34/fs/libfs.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/libfs.c 2011-12-05 19:33:02.000000000 +0100 +@@ -135,7 +135,8 @@ static inline unsigned char dt_type(stru + * both impossible due to the lock on directory. 
+ */ + +-int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) ++static inline int do_dcache_readdir_filter(struct file *filp, ++ void *dirent, filldir_t filldir, int (*filter)(struct dentry *dentry)) + { + struct dentry *dentry = filp->f_path.dentry; + struct dentry *cursor = filp->private_data; +@@ -166,6 +167,8 @@ int dcache_readdir(struct file * filp, v + for (p=q->next; p != &dentry->d_subdirs; p=p->next) { + struct dentry *next; + next = list_entry(p, struct dentry, d_u.d_child); ++ if (filter && !filter(next)) ++ continue; + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (!simple_positive(next)) { + spin_unlock(&next->d_lock); +@@ -192,6 +195,17 @@ int dcache_readdir(struct file * filp, v + return 0; + } + ++int dcache_readdir(struct file *filp, void *dirent, filldir_t filldir) ++{ ++ return do_dcache_readdir_filter(filp, dirent, filldir, NULL); ++} ++ ++int dcache_readdir_filter(struct file *filp, void *dirent, filldir_t filldir, ++ int (*filter)(struct dentry *)) ++{ ++ return do_dcache_readdir_filter(filp, dirent, filldir, filter); ++} ++ + ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) + { + return -EISDIR; +@@ -977,6 +991,7 @@ EXPORT_SYMBOL(dcache_dir_close); + EXPORT_SYMBOL(dcache_dir_lseek); + EXPORT_SYMBOL(dcache_dir_open); + EXPORT_SYMBOL(dcache_readdir); ++EXPORT_SYMBOL(dcache_readdir_filter); + EXPORT_SYMBOL(generic_read_dir); + EXPORT_SYMBOL(mount_pseudo); + EXPORT_SYMBOL(simple_write_begin); +diff -NurpP --minimal linux-3.2.34/fs/locks.c linux-3.2.34-vs2.3.2.15/fs/locks.c +--- linux-3.2.34/fs/locks.c 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/locks.c 2012-08-13 12:40:51.000000000 +0200 +@@ -126,6 +126,8 @@ + #include + #include + #include ++#include ++#include + + #include + +@@ -184,11 +186,17 @@ static void locks_init_lock_heads(struct + /* Allocate an empty lock structure. 
*/ + struct file_lock *locks_alloc_lock(void) + { +- struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL); ++ struct file_lock *fl; + +- if (fl) +- locks_init_lock_heads(fl); ++ if (!vx_locks_avail(1)) ++ return NULL; + ++ fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL); ++ ++ if (fl) { ++ locks_init_lock_heads(fl); ++ fl->fl_xid = -1; ++ } + return fl; + } + EXPORT_SYMBOL_GPL(locks_alloc_lock); +@@ -216,6 +224,7 @@ void locks_free_lock(struct file_lock *f + BUG_ON(!list_empty(&fl->fl_block)); + BUG_ON(!list_empty(&fl->fl_link)); + ++ vx_locks_dec(fl); + locks_release_private(fl); + kmem_cache_free(filelock_cache, fl); + } +@@ -225,6 +234,7 @@ void locks_init_lock(struct file_lock *f + { + memset(fl, 0, sizeof(struct file_lock)); + locks_init_lock_heads(fl); ++ fl->fl_xid = -1; + } + + EXPORT_SYMBOL(locks_init_lock); +@@ -265,6 +275,7 @@ void locks_copy_lock(struct file_lock *n + new->fl_file = fl->fl_file; + new->fl_ops = fl->fl_ops; + new->fl_lmops = fl->fl_lmops; ++ new->fl_xid = fl->fl_xid; + + locks_copy_private(new, fl); + } +@@ -303,6 +314,11 @@ static int flock_make_lock(struct file * + fl->fl_flags = FL_FLOCK; + fl->fl_type = type; + fl->fl_end = OFFSET_MAX; ++ ++ vxd_assert(filp->f_xid == vx_current_xid(), ++ "f_xid(%d) == current(%d)", filp->f_xid, vx_current_xid()); ++ fl->fl_xid = filp->f_xid; ++ vx_locks_inc(fl); + + *lock = fl; + return 0; +@@ -452,6 +468,7 @@ static int lease_init(struct file *filp, + + fl->fl_owner = current->files; + fl->fl_pid = current->tgid; ++ fl->fl_xid = vx_current_xid(); + + fl->fl_file = filp; + fl->fl_flags = FL_LEASE; +@@ -471,6 +488,11 @@ static struct file_lock *lease_alloc(str + if (fl == NULL) + return ERR_PTR(error); + ++ fl->fl_xid = vx_current_xid(); ++ if (filp) ++ vxd_assert(filp->f_xid == fl->fl_xid, ++ "f_xid(%d) == fl_xid(%d)", filp->f_xid, fl->fl_xid); ++ vx_locks_inc(fl); + error = lease_init(filp, type, fl); + if (error) { + locks_free_lock(fl); +@@ -773,6 +795,7 @@ static int flock_lock_file(struct file * + lock_flocks(); + } + ++ new_fl->fl_xid = -1; + find_conflict: + for_each_lock(inode, before) { + struct file_lock *fl = *before; +@@ -793,6 +816,7 @@ find_conflict: + goto out; + locks_copy_lock(new_fl, request); + locks_insert_lock(before, new_fl); ++ vx_locks_inc(new_fl); + new_fl = NULL; + error = 0; + +@@ -803,7 +827,8 @@ out: + return error; + } + +-static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) ++static int __posix_lock_file(struct inode *inode, struct file_lock *request, ++ struct file_lock *conflock, xid_t xid) + { + struct file_lock *fl; + struct file_lock *new_fl = NULL; +@@ -813,6 +838,8 @@ static int __posix_lock_file(struct inod + struct file_lock **before; + int error, added = 0; + ++ vxd_assert(xid == vx_current_xid(), ++ "xid(%d) == current(%d)", xid, vx_current_xid()); + /* + * We may need two file_lock structures for this operation, + * so we get them in advance to avoid races. 
+@@ -823,7 +850,11 @@ static int __posix_lock_file(struct inod + (request->fl_type != F_UNLCK || + request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { + new_fl = locks_alloc_lock(); ++ new_fl->fl_xid = xid; ++ vx_locks_inc(new_fl); + new_fl2 = locks_alloc_lock(); ++ new_fl2->fl_xid = xid; ++ vx_locks_inc(new_fl2); + } + + lock_flocks(); +@@ -1022,7 +1053,8 @@ static int __posix_lock_file(struct inod + int posix_lock_file(struct file *filp, struct file_lock *fl, + struct file_lock *conflock) + { +- return __posix_lock_file(filp->f_path.dentry->d_inode, fl, conflock); ++ return __posix_lock_file(filp->f_path.dentry->d_inode, ++ fl, conflock, filp->f_xid); + } + EXPORT_SYMBOL(posix_lock_file); + +@@ -1112,7 +1144,7 @@ int locks_mandatory_area(int read_write, + fl.fl_end = offset + count - 1; + + for (;;) { +- error = __posix_lock_file(inode, &fl, NULL); ++ error = __posix_lock_file(inode, &fl, NULL, filp->f_xid); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); +@@ -1407,6 +1439,7 @@ int generic_add_lease(struct file *filp, + goto out; + + locks_insert_lock(before, lease); ++ vx_locks_inc(lease); + return 0; + + out: +@@ -1847,6 +1880,11 @@ int fcntl_setlk(unsigned int fd, struct + if (file_lock == NULL) + return -ENOLCK; + ++ vxd_assert(filp->f_xid == vx_current_xid(), ++ "f_xid(%d) == current(%d)", filp->f_xid, vx_current_xid()); ++ file_lock->fl_xid = filp->f_xid; ++ vx_locks_inc(file_lock); ++ + /* + * This might block, so we do it before checking the inode. + */ +@@ -1965,6 +2003,11 @@ int fcntl_setlk64(unsigned int fd, struc + if (file_lock == NULL) + return -ENOLCK; + ++ vxd_assert(filp->f_xid == vx_current_xid(), ++ "f_xid(%d) == current(%d)", filp->f_xid, vx_current_xid()); ++ file_lock->fl_xid = filp->f_xid; ++ vx_locks_inc(file_lock); ++ + /* + * This might block, so we do it before checking the inode. 
+ */ +@@ -2230,8 +2273,11 @@ static int locks_show(struct seq_file *f + + lock_get_status(f, fl, *((loff_t *)f->private), ""); + +- list_for_each_entry(bfl, &fl->fl_block, fl_block) ++ list_for_each_entry(bfl, &fl->fl_block, fl_block) { ++ if (!vx_check(fl->fl_xid, VS_WATCH_P | VS_IDENT)) ++ continue; + lock_get_status(f, bfl, *((loff_t *)f->private), " ->"); ++ } + + return 0; + } +diff -NurpP --minimal linux-3.2.34/fs/namei.c linux-3.2.34-vs2.3.2.15/fs/namei.c +--- linux-3.2.34/fs/namei.c 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/namei.c 2012-03-23 14:48:48.000000000 +0100 +@@ -33,6 +33,14 @@ + #include + #include + #include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include + #include + + #include "internal.h" +@@ -222,6 +230,89 @@ static int check_acl(struct inode *inode + return -EAGAIN; + } + ++static inline int dx_barrier(const struct inode *inode) ++{ ++ if (IS_BARRIER(inode) && !vx_check(0, VS_ADMIN | VS_WATCH)) { ++ vxwprintk_task(1, "did hit the barrier."); ++ return 1; ++ } ++ return 0; ++} ++ ++static int __dx_permission(const struct inode *inode, int mask) ++{ ++ if (dx_barrier(inode)) ++ return -EACCES; ++ ++ if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) { ++ /* devpts is xid tagged */ ++ if (S_ISDIR(inode->i_mode) || ++ vx_check((xid_t)inode->i_tag, VS_IDENT | VS_WATCH_P)) ++ return 0; ++ ++ /* just pretend we didn't find anything */ ++ return -ENOENT; ++ } ++ else if (inode->i_sb->s_magic == PROC_SUPER_MAGIC) { ++ struct proc_dir_entry *de = PDE(inode); ++ ++ if (de && !vx_hide_check(0, de->vx_flags)) ++ goto out; ++ ++ if ((mask & (MAY_WRITE | MAY_APPEND))) { ++ struct pid *pid; ++ struct task_struct *tsk; ++ ++ if (vx_check(0, VS_ADMIN | VS_WATCH_P) || ++ vx_flags(VXF_STATE_SETUP, 0)) ++ return 0; ++ ++ pid = PROC_I(inode)->pid; ++ if (!pid) ++ goto out; ++ ++ rcu_read_lock(); ++ tsk = pid_task(pid, PIDTYPE_PID); ++ vxdprintk(VXD_CBIT(tag, 0), "accessing %p[#%u]", ++ tsk, (tsk ? vx_task_xid(tsk) : 0)); ++ if (tsk && ++ vx_check(vx_task_xid(tsk), VS_IDENT | VS_WATCH_P)) { ++ rcu_read_unlock(); ++ return 0; ++ } ++ rcu_read_unlock(); ++ } ++ else { ++ /* FIXME: Should we block some entries here? */ ++ return 0; ++ } ++ } ++ else { ++ if (dx_notagcheck(inode->i_sb) || ++ dx_check(inode->i_tag, DX_HOSTID | DX_ADMIN | DX_WATCH | ++ DX_IDENT)) ++ return 0; ++ } ++ ++out: ++ return -EACCES; ++} ++ ++int dx_permission(const struct inode *inode, int mask) ++{ ++ int ret = __dx_permission(inode, mask); ++ if (unlikely(ret)) { ++#ifndef CONFIG_VSERVER_WARN_DEVPTS ++ if (inode->i_sb->s_magic != DEVPTS_SUPER_MAGIC) ++#endif ++ vxwprintk_task(1, ++ "denied [0x%x] access to inode %s:%p[#%d,%lu]", ++ mask, inode->i_sb->s_id, inode, inode->i_tag, ++ inode->i_ino); ++ } ++ return ret; ++} ++ + /* + * This does the basic permission checking + */ +@@ -357,10 +448,14 @@ int inode_permission(struct inode *inode + /* + * Nobody gets write access to an immutable file. + */ +- if (IS_IMMUTABLE(inode)) ++ if (IS_IMMUTABLE(inode) && !IS_COW(inode)) + return -EACCES; + } + ++ retval = dx_permission(inode, mask); ++ if (retval) ++ return retval; ++ + retval = do_inode_permission(inode, mask); + if (retval) + return retval; +@@ -1037,7 +1132,8 @@ static void follow_dotdot(struct nameida + + if (nd->path.dentry == nd->root.dentry && + nd->path.mnt == nd->root.mnt) { +- break; ++ /* for sane '/' avoid follow_mount() */ ++ return; + } + if (nd->path.dentry != nd->path.mnt->mnt_root) { + /* rare case of legitimate dget_parent()... 
*/ +@@ -1148,6 +1244,9 @@ static int do_lookup(struct nameidata *n + } + if (unlikely(d_need_lookup(dentry))) + goto unlazy; ++ ++ /* FIXME: check dx permission */ ++ + path->mnt = mnt; + path->dentry = dentry; + if (unlikely(!__follow_mount_rcu(nd, path, inode))) +@@ -1209,6 +1308,8 @@ retry: + } + } + ++ /* FIXME: check dx permission */ ++ + path->mnt = mnt; + path->dentry = dentry; + err = follow_managed(path, nd->flags); +@@ -1903,7 +2004,7 @@ static int may_delete(struct inode *dir, + if (IS_APPEND(dir)) + return -EPERM; + if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)|| +- IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) ++ IS_IXORUNLINK(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + return -EPERM; + if (isdir) { + if (!S_ISDIR(victim->d_inode->i_mode)) +@@ -1983,19 +2084,25 @@ int vfs_create(struct inode *dir, struct + { + int error = may_create(dir, dentry); + +- if (error) ++ if (error) { ++ vxdprintk(VXD_CBIT(misc, 3), "may_create failed with %d", error); + return error; ++ } + + if (!dir->i_op->create) + return -EACCES; /* shouldn't it be ENOSYS? */ + mode &= S_IALLUGO; + mode |= S_IFREG; + error = security_inode_create(dir, dentry, mode); +- if (error) ++ if (error) { ++ vxdprintk(VXD_CBIT(misc, 3), "security_inode_create failed with %d", error); + return error; ++ } + error = dir->i_op->create(dir, dentry, mode, nd); + if (!error) + fsnotify_create(dir, dentry); ++ else ++ vxdprintk(VXD_CBIT(misc, 3), "i_op->create failed with %d", error); + return error; + } + +@@ -2030,6 +2137,15 @@ static int may_open(struct path *path, i + break; + } + ++#ifdef CONFIG_VSERVER_COWBL ++ if (IS_COW(inode) && ++ ((flag & O_ACCMODE) != O_RDONLY)) { ++ if (IS_COW_LINK(inode)) ++ return -EMLINK; ++ inode->i_flags &= ~(S_IXUNLINK|S_IMMUTABLE); ++ mark_inode_dirty(inode); ++ } ++#endif + error = inode_permission(inode, acc_mode); + if (error) + return error; +@@ -2254,6 +2370,16 @@ ok: + } + common: + error = may_open(&nd->path, acc_mode, open_flag); ++#ifdef CONFIG_VSERVER_COWBL ++ if (error == -EMLINK) { ++ struct dentry *dentry; ++ dentry = cow_break_link(pathname); ++ if (IS_ERR(dentry)) ++ error = PTR_ERR(dentry); ++ else ++ dput(dentry); ++ } ++#endif + if (error) + goto exit; + filp = nameidata_to_filp(nd); +@@ -2296,6 +2422,7 @@ static struct file *path_openat(int dfd, + struct path path; + int error; + ++restart: + filp = get_empty_filp(); + if (!filp) + return ERR_PTR(-ENFILE); +@@ -2333,6 +2460,17 @@ static struct file *path_openat(int dfd, + filp = do_last(nd, &path, op, pathname); + put_link(nd, &link, cookie); + } ++ ++#ifdef CONFIG_VSERVER_COWBL ++ if (filp == ERR_PTR(-EMLINK)) { ++ if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) ++ path_put(&nd->root); ++ if (base) ++ fput(base); ++ release_open_intent(nd); ++ goto restart; ++ } ++#endif + out: + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) + path_put(&nd->root); +@@ -2422,6 +2560,11 @@ struct dentry *kern_path_create(int dfd, + goto fail; + } + *path = nd.path; ++ vxdprintk(VXD_CBIT(misc, 3), "kern_path_create path.dentry = %p (%.*s), dentry = %p (%.*s), d_inode = %p", ++ path->dentry, path->dentry->d_name.len, ++ path->dentry->d_name.name, dentry, ++ dentry->d_name.len, dentry->d_name.name, ++ path->dentry->d_inode); + return dentry; + eexist: + dput(dentry); +@@ -2903,7 +3046,7 @@ int vfs_link(struct dentry *old_dentry, + /* + * A link to an append-only or immutable file cannot be created. 
+ */ +- if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) ++ if (IS_APPEND(inode) || IS_IXORUNLINK(inode)) + return -EPERM; + if (!dir->i_op->link) + return -EPERM; +@@ -3284,6 +3427,227 @@ int vfs_follow_link(struct nameidata *nd + return __vfs_follow_link(nd, link); + } + ++ ++#ifdef CONFIG_VSERVER_COWBL ++ ++static inline ++long do_cow_splice(struct file *in, struct file *out, size_t len) ++{ ++ loff_t ppos = 0; ++ ++ return do_splice_direct(in, &ppos, out, len, 0); ++} ++ ++struct dentry *cow_break_link(const char *pathname) ++{ ++ int ret, mode, pathlen, redo = 0; ++ struct nameidata old_nd, dir_nd; ++ struct path old_path, dir_path; ++ struct dentry *dir, *old_dentry, *new_dentry = NULL; ++ struct file *old_file; ++ struct file *new_file; ++ char *to, *path, pad='\251'; ++ loff_t size; ++ ++ vxdprintk(VXD_CBIT(misc, 1), ++ "cow_break_link(" VS_Q("%s") ")", pathname); ++ path = kmalloc(PATH_MAX, GFP_KERNEL); ++ ret = -ENOMEM; ++ if (!path) ++ goto out; ++ ++ /* old_nd will have refs to dentry and mnt */ ++ ret = do_path_lookup(AT_FDCWD, pathname, LOOKUP_FOLLOW, &old_nd); ++ vxdprintk(VXD_CBIT(misc, 2), ++ "do_path_lookup(old): %d [r=%d]", ++ ret, mnt_get_count(old_nd.path.mnt)); ++ if (ret < 0) ++ goto out_free_path; ++ ++ old_path = old_nd.path; ++ old_dentry = old_path.dentry; ++ mode = old_dentry->d_inode->i_mode; ++ ++ to = d_path(&old_path, path, PATH_MAX-2); ++ pathlen = strlen(to); ++ vxdprintk(VXD_CBIT(misc, 2), ++ "old path " VS_Q("%s") " [%p:" VS_Q("%.*s") ":%d]", to, ++ old_dentry, ++ old_dentry->d_name.len, old_dentry->d_name.name, ++ old_dentry->d_name.len); ++ ++ to[pathlen + 1] = 0; ++retry: ++ new_dentry = NULL; ++ to[pathlen] = pad--; ++ ret = -ELOOP; ++ if (pad <= '\240') ++ goto out_rel_old; ++ ++ vxdprintk(VXD_CBIT(misc, 1), "temp copy " VS_Q("%s"), to); ++ /* dir_nd will have refs to dentry and mnt */ ++ ret = do_path_lookup(AT_FDCWD, to, ++ LOOKUP_PARENT | LOOKUP_OPEN | LOOKUP_CREATE, &dir_nd); ++ vxdprintk(VXD_CBIT(misc, 2), "do_path_lookup(new): %d", ret); ++ if (ret < 0) ++ goto retry; ++ ++ /* this puppy downs the dir inode mutex if successful */ ++ new_dentry = kern_path_create(AT_FDCWD, to, &dir_path, 0); ++ if (!new_dentry || IS_ERR(new_dentry)) { ++ path_put(&dir_nd.path); ++ vxdprintk(VXD_CBIT(misc, 2), ++ "kern_path_create(new) failed with %ld", ++ PTR_ERR(new_dentry)); ++ goto retry; ++ } ++ path_put(&dir_path); ++ vxdprintk(VXD_CBIT(misc, 2), ++ "kern_path_create(new): %p [" VS_Q("%.*s") ":%d]", ++ new_dentry, ++ new_dentry->d_name.len, new_dentry->d_name.name, ++ new_dentry->d_name.len); ++ ++ dir = dir_nd.path.dentry; ++ ++ ret = vfs_create(dir->d_inode, new_dentry, mode, &dir_nd); ++ vxdprintk(VXD_CBIT(misc, 2), ++ "vfs_create(new): %d", ret); ++ if (ret == -EEXIST) { ++ mutex_unlock(&dir->d_inode->i_mutex); ++ path_put(&dir_nd.path); ++ dput(new_dentry); ++ goto retry; ++ } ++ else if (ret < 0) ++ goto out_unlock_new; ++ ++ /* drop out early, ret passes ENOENT */ ++ ret = -ENOENT; ++ if ((redo = d_unhashed(old_dentry))) ++ goto out_unlock_new; ++ ++ path_get(&old_path); ++ /* this one cleans up the dentry/mnt in case of failure */ ++ old_file = dentry_open(old_dentry, old_path.mnt, ++ O_RDONLY, current_cred()); ++ vxdprintk(VXD_CBIT(misc, 2), ++ "dentry_open(old): %p", old_file); ++ if (IS_ERR(old_file)) { ++ ret = PTR_ERR(old_file); ++ goto out_unlock_new; ++ } ++ ++ dget(new_dentry); ++ mntget(old_path.mnt); ++ /* this one cleans up the dentry/mnt in case of failure */ ++ new_file = dentry_open(new_dentry, old_path.mnt, ++ O_WRONLY, current_cred()); 
++ vxdprintk(VXD_CBIT(misc, 2), ++ "dentry_open(new): %p", new_file); ++ if (IS_ERR(new_file)) { ++ ret = PTR_ERR(new_file); ++ goto out_fput_old; ++ } ++ ++ size = i_size_read(old_file->f_dentry->d_inode); ++ ret = do_cow_splice(old_file, new_file, size); ++ vxdprintk(VXD_CBIT(misc, 2), "do_splice_direct: %d", ret); ++ if (ret < 0) { ++ goto out_fput_both; ++ } else if (ret < size) { ++ ret = -ENOSPC; ++ goto out_fput_both; ++ } else { ++ struct inode *old_inode = old_dentry->d_inode; ++ struct inode *new_inode = new_dentry->d_inode; ++ struct iattr attr = { ++ .ia_uid = old_inode->i_uid, ++ .ia_gid = old_inode->i_gid, ++ .ia_valid = ATTR_UID | ATTR_GID ++ }; ++ ++ setattr_copy(new_inode, &attr); ++ mark_inode_dirty(new_inode); ++ } ++ ++ mutex_lock(&old_dentry->d_inode->i_sb->s_vfs_rename_mutex); ++ ++ /* drop out late */ ++ ret = -ENOENT; ++ if ((redo = d_unhashed(old_dentry))) ++ goto out_unlock; ++ ++ vxdprintk(VXD_CBIT(misc, 2), ++ "vfs_rename: [" VS_Q("%*s") ":%d] -> [" VS_Q("%*s") ":%d]", ++ new_dentry->d_name.len, new_dentry->d_name.name, ++ new_dentry->d_name.len, ++ old_dentry->d_name.len, old_dentry->d_name.name, ++ old_dentry->d_name.len); ++ ret = vfs_rename(dir_nd.path.dentry->d_inode, new_dentry, ++ old_dentry->d_parent->d_inode, old_dentry); ++ vxdprintk(VXD_CBIT(misc, 2), "vfs_rename: %d", ret); ++ ++out_unlock: ++ mutex_unlock(&old_dentry->d_inode->i_sb->s_vfs_rename_mutex); ++ ++out_fput_both: ++ vxdprintk(VXD_CBIT(misc, 3), ++ "fput(new_file=%p[#%ld])", new_file, ++ atomic_long_read(&new_file->f_count)); ++ fput(new_file); ++ ++out_fput_old: ++ vxdprintk(VXD_CBIT(misc, 3), ++ "fput(old_file=%p[#%ld])", old_file, ++ atomic_long_read(&old_file->f_count)); ++ fput(old_file); ++ ++out_unlock_new: ++ mutex_unlock(&dir->d_inode->i_mutex); ++ if (!ret) ++ goto out_redo; ++ ++ /* error path cleanup */ ++ vfs_unlink(dir->d_inode, new_dentry); ++ ++out_redo: ++ if (!redo) ++ goto out_rel_both; ++ /* lookup dentry once again */ ++ /* old_nd.path is freed as old_path in out_rel_old */ ++ ret = do_path_lookup(AT_FDCWD, pathname, LOOKUP_FOLLOW, &old_nd); ++ if (ret) ++ goto out_rel_both; ++ ++ dput(new_dentry); ++ new_dentry = old_nd.path.dentry; ++ vxdprintk(VXD_CBIT(misc, 2), ++ "do_path_lookup(redo): %p [" VS_Q("%.*s") ":%d]", ++ new_dentry, ++ new_dentry->d_name.len, new_dentry->d_name.name, ++ new_dentry->d_name.len); ++ dget(new_dentry); ++ ++out_rel_both: ++ path_put(&dir_nd.path); ++out_rel_old: ++ path_put(&old_path); ++out_free_path: ++ kfree(path); ++out: ++ if (ret) { ++ dput(new_dentry); ++ new_dentry = ERR_PTR(ret); ++ } ++ vxdprintk(VXD_CBIT(misc, 3), ++ "cow_break_link returning with %p [r=%d]", ++ new_dentry, mnt_get_count(old_nd.path.mnt)); ++ return new_dentry; ++} ++ ++#endif ++ + /* get the link contents into pagecache */ + static char *page_getlink(struct dentry * dentry, struct page **ppage) + { +diff -NurpP --minimal linux-3.2.34/fs/namespace.c linux-3.2.34-vs2.3.2.15/fs/namespace.c +--- linux-3.2.34/fs/namespace.c 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/namespace.c 2012-06-14 20:45:24.000000000 +0200 +@@ -31,6 +31,11 @@ + #include + #include + #include ++#include ++#include ++#include ++#include ++#include + #include + #include + #include "pnode.h" +@@ -679,6 +684,10 @@ vfs_kern_mount(struct file_system_type * + if (!type) + return ERR_PTR(-ENODEV); + ++ if ((type->fs_flags & FS_BINARY_MOUNTDATA) && ++ !vx_capable(CAP_SYS_ADMIN, VXC_BINARY_MOUNT)) ++ return ERR_PTR(-EPERM); ++ + mnt = alloc_vfsmnt(name); + if (!mnt) + return 
ERR_PTR(-ENOMEM); +@@ -724,6 +733,7 @@ static struct vfsmount *clone_mnt(struct + mnt->mnt_root = dget(root); + mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_parent = mnt; ++ mnt->mnt_tag = old->mnt_tag; + + if (flag & CL_SLAVE) { + list_add(&mnt->mnt_slave, &old->mnt_slave_list); +@@ -852,6 +862,31 @@ static inline void mangle(struct seq_fil + seq_escape(m, s, " \t\n\\"); + } + ++static int mnt_is_reachable(struct vfsmount *mnt) ++{ ++ struct path root; ++ struct dentry *point; ++ int ret; ++ ++ if (mnt == mnt->mnt_ns->root) ++ return 1; ++ ++ br_read_lock(vfsmount_lock); ++ root = current->fs->root; ++ point = root.dentry; ++ ++ while ((mnt != mnt->mnt_parent) && (mnt != root.mnt)) { ++ point = mnt->mnt_mountpoint; ++ mnt = mnt->mnt_parent; ++ } ++ ++ ret = (mnt == root.mnt) && is_subdir(point, root.dentry); ++ ++ br_read_unlock(vfsmount_lock); ++ ++ return ret; ++} ++ + /* + * Simple .show_options callback for filesystems which don't want to + * implement more complex mount option showing. +@@ -954,6 +989,8 @@ static int show_sb_opts(struct seq_file + { MS_SYNCHRONOUS, ",sync" }, + { MS_DIRSYNC, ",dirsync" }, + { MS_MANDLOCK, ",mand" }, ++ { MS_TAGGED, ",tag" }, ++ { MS_NOTAGCHECK, ",notagcheck" }, + { 0, NULL } + }; + const struct proc_fs_info *fs_infop; +@@ -1000,16 +1037,26 @@ static int show_vfsmnt(struct seq_file * + int err = 0; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + +- if (mnt->mnt_sb->s_op->show_devname) { +- err = mnt->mnt_sb->s_op->show_devname(m, mnt); +- if (err) +- goto out; ++ if (vx_flags(VXF_HIDE_MOUNT, 0)) ++ return SEQ_SKIP; ++ if (!mnt_is_reachable(mnt) && !vx_check(0, VS_WATCH_P)) ++ return SEQ_SKIP; ++ ++ if (!vx_check(0, VS_ADMIN|VS_WATCH) && ++ mnt == current->fs->root.mnt) { ++ seq_puts(m, "/dev/root / "); + } else { +- mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); ++ if (mnt->mnt_sb->s_op->show_devname) { ++ err = mnt->mnt_sb->s_op->show_devname(m, mnt); ++ if (err) ++ goto out; ++ } else { ++ mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); ++ } ++ seq_putc(m, ' '); ++ seq_path(m, &mnt_path, " \t\n\\"); ++ seq_putc(m, ' '); + } +- seq_putc(m, ' '); +- seq_path(m, &mnt_path, " \t\n\\"); +- seq_putc(m, ' '); + show_type(m, mnt->mnt_sb); + seq_puts(m, __mnt_is_readonly(mnt) ? 
" ro" : " rw"); + err = show_sb_opts(m, mnt->mnt_sb); +@@ -1039,6 +1086,11 @@ static int show_mountinfo(struct seq_fil + struct path root = p->root; + int err = 0; + ++ if (vx_flags(VXF_HIDE_MOUNT, 0)) ++ return SEQ_SKIP; ++ if (!mnt_is_reachable(mnt) && !vx_check(0, VS_WATCH_P)) ++ return SEQ_SKIP; ++ + seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, + MAJOR(sb->s_dev), MINOR(sb->s_dev)); + if (sb->s_op->show_path) +@@ -1104,22 +1156,32 @@ static int show_vfsstat(struct seq_file + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + int err = 0; + +- /* device */ +- if (mnt->mnt_sb->s_op->show_devname) { +- seq_puts(m, "device "); +- err = mnt->mnt_sb->s_op->show_devname(m, mnt); ++ if (vx_flags(VXF_HIDE_MOUNT, 0)) ++ return SEQ_SKIP; ++ if (!mnt_is_reachable(mnt) && !vx_check(0, VS_WATCH_P)) ++ return SEQ_SKIP; ++ ++ if (!vx_check(0, VS_ADMIN|VS_WATCH) && ++ mnt == current->fs->root.mnt) { ++ seq_puts(m, "device /dev/root mounted on / "); + } else { +- if (mnt->mnt_devname) { ++ /* device */ ++ if (mnt->mnt_sb->s_op->show_devname) { + seq_puts(m, "device "); +- mangle(m, mnt->mnt_devname); +- } else +- seq_puts(m, "no device"); +- } ++ err = mnt->mnt_sb->s_op->show_devname(m, mnt); ++ } else { ++ if (mnt->mnt_devname) { ++ seq_puts(m, "device "); ++ mangle(m, mnt->mnt_devname); ++ } else ++ seq_puts(m, "no device"); ++ } + +- /* mount point */ +- seq_puts(m, " mounted on "); +- seq_path(m, &mnt_path, " \t\n\\"); +- seq_putc(m, ' '); ++ /* mount point */ ++ seq_puts(m, " mounted on "); ++ seq_path(m, &mnt_path, " \t\n\\"); ++ seq_putc(m, ' '); ++ } + + /* file system type */ + seq_puts(m, "with fstype "); +@@ -1379,7 +1441,7 @@ SYSCALL_DEFINE2(umount, char __user *, n + goto dput_and_out; + + retval = -EPERM; +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_SECURE_MOUNT)) + goto dput_and_out; + + retval = do_umount(path.mnt, flags); +@@ -1405,7 +1467,7 @@ SYSCALL_DEFINE1(oldumount, char __user * + + static int mount_is_safe(struct path *path) + { +- if (capable(CAP_SYS_ADMIN)) ++ if (vx_capable(CAP_SYS_ADMIN, VXC_SECURE_MOUNT)) + return 0; + return -EPERM; + #ifdef notyet +@@ -1715,7 +1777,7 @@ static int do_change_type(struct path *p + int type; + int err = 0; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_NAMESPACE)) + return -EPERM; + + if (path->dentry != path->mnt->mnt_root) +@@ -1731,6 +1793,7 @@ static int do_change_type(struct path *p + if (err) + goto out_unlock; + } ++ // mnt->mnt_flags = mnt_flags; + + br_write_lock(vfsmount_lock); + for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) +@@ -1746,12 +1809,14 @@ static int do_change_type(struct path *p + * do loopback mount. + */ + static int do_loopback(struct path *path, char *old_name, +- int recurse) ++ tag_t tag, unsigned long flags, int mnt_flags) + { + LIST_HEAD(umount_list); + struct path old_path; + struct vfsmount *mnt = NULL; + int err = mount_is_safe(path); ++ int recurse = flags & MS_REC; ++ + if (err) + return err; + if (!old_name || !*old_name) +@@ -1817,12 +1882,12 @@ static int change_mount_flags(struct vfs + * on it - tough luck. 
+ */ + static int do_remount(struct path *path, int flags, int mnt_flags, +- void *data) ++ void *data, xid_t xid) + { + int err; + struct super_block *sb = path->mnt->mnt_sb; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_SECURE_REMOUNT)) + return -EPERM; + + if (!check_mnt(path->mnt)) +@@ -1870,7 +1935,7 @@ static int do_move_mount(struct path *pa + struct path old_path, parent_path; + struct vfsmount *p; + int err = 0; +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_SECURE_MOUNT)) + return -EPERM; + if (!old_name || !*old_name) + return -EINVAL; +@@ -2021,7 +2086,7 @@ static int do_new_mount(struct path *pat + return -EINVAL; + + /* we need capabilities... */ +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_SECURE_MOUNT)) + return -EPERM; + + mnt = do_kern_mount(type, flags, name, data); +@@ -2290,6 +2355,7 @@ long do_mount(char *dev_name, char *dir_ + struct path path; + int retval = 0; + int mnt_flags = 0; ++ tag_t tag = 0; + + /* Discard magic */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) +@@ -2317,6 +2383,12 @@ long do_mount(char *dev_name, char *dir_ + if (!(flags & MS_NOATIME)) + mnt_flags |= MNT_RELATIME; + ++ if (dx_parse_tag(data_page, &tag, 1, &mnt_flags, &flags)) { ++ /* FIXME: bind and re-mounts get the tag flag? */ ++ if (flags & (MS_BIND|MS_REMOUNT)) ++ flags |= MS_TAGID; ++ } ++ + /* Separate the per-mountpoint flags */ + if (flags & MS_NOSUID) + mnt_flags |= MNT_NOSUID; +@@ -2333,15 +2405,17 @@ long do_mount(char *dev_name, char *dir_ + if (flags & MS_RDONLY) + mnt_flags |= MNT_READONLY; + ++ if (!capable(CAP_SYS_ADMIN)) ++ mnt_flags |= MNT_NODEV; + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | + MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | + MS_STRICTATIME); + + if (flags & MS_REMOUNT) + retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, +- data_page); ++ data_page, tag); + else if (flags & MS_BIND) +- retval = do_loopback(&path, dev_name, flags & MS_REC); ++ retval = do_loopback(&path, dev_name, tag, flags, mnt_flags); + else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + retval = do_change_type(&path, flags); + else if (flags & MS_MOVE) +@@ -2441,6 +2515,7 @@ static struct mnt_namespace *dup_mnt_ns( + q = next_mnt(q, new_ns->root); + } + up_write(&namespace_sem); ++ atomic_inc(&vs_global_mnt_ns); + + if (rootmnt) + mntput(rootmnt); +@@ -2612,9 +2687,10 @@ SYSCALL_DEFINE2(pivot_root, const char _ + goto out3; + + error = -EINVAL; +- if (IS_MNT_SHARED(old.mnt) || ++ if ((IS_MNT_SHARED(old.mnt) || + IS_MNT_SHARED(new.mnt->mnt_parent) || +- IS_MNT_SHARED(root.mnt->mnt_parent)) ++ IS_MNT_SHARED(root.mnt->mnt_parent)) && ++ !vx_flags(VXF_STATE_SETUP, 0)) + goto out4; + if (!check_mnt(root.mnt) || !check_mnt(new.mnt)) + goto out4; +@@ -2746,6 +2822,7 @@ void put_mnt_ns(struct mnt_namespace *ns + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + release_mounts(&umount_list); ++ atomic_dec(&vs_global_mnt_ns); + kfree(ns); + } + EXPORT_SYMBOL(put_mnt_ns); +diff -NurpP --minimal linux-3.2.34/fs/nfs/client.c linux-3.2.34-vs2.3.2.15/fs/nfs/client.c +--- linux-3.2.34/fs/nfs/client.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/nfs/client.c 2011-12-05 19:33:02.000000000 +0100 +@@ -779,6 +779,9 @@ static int nfs_init_server_rpcclient(str + if (server->flags & NFS_MOUNT_SOFT) + server->client->cl_softrtry = 1; + ++ server->client->cl_tag = 0; ++ if (server->flags & NFS_MOUNT_TAGGED) ++ server->client->cl_tag = 1; + return 
0; + } + +@@ -953,6 +956,10 @@ static void nfs_server_set_fsinfo(struct + server->acdirmin = server->acdirmax = 0; + } + ++ /* FIXME: needs fsinfo ++ if (server->flags & NFS_MOUNT_TAGGED) ++ sb->s_flags |= MS_TAGGED; */ ++ + server->maxfilesize = fsinfo->maxfilesize; + + server->time_delta = fsinfo->time_delta; +diff -NurpP --minimal linux-3.2.34/fs/nfs/dir.c linux-3.2.34-vs2.3.2.15/fs/nfs/dir.c +--- linux-3.2.34/fs/nfs/dir.c 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/nfs/dir.c 2012-09-16 18:25:50.000000000 +0200 +@@ -35,6 +35,7 @@ + #include + #include + #include ++#include + + #include "delegation.h" + #include "iostat.h" +@@ -1311,6 +1312,7 @@ static struct dentry *nfs_lookup(struct + if (IS_ERR(res)) + goto out_unblock_sillyrename; + ++ dx_propagate_tag(nd, inode); + no_entry: + res = d_materialise_unique(dentry, inode); + if (res != NULL) { +diff -NurpP --minimal linux-3.2.34/fs/nfs/inode.c linux-3.2.34-vs2.3.2.15/fs/nfs/inode.c +--- linux-3.2.34/fs/nfs/inode.c 2012-11-18 18:42:21.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/nfs/inode.c 2012-10-22 12:59:52.000000000 +0200 +@@ -38,6 +38,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -273,6 +274,8 @@ nfs_fhget(struct super_block *sb, struct + if (inode->i_state & I_NEW) { + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long now = jiffies; ++ uid_t uid; ++ gid_t gid; + + /* We set i_ino for the few things that still rely on it, + * such as stat(2) */ +@@ -321,8 +324,8 @@ nfs_fhget(struct super_block *sb, struct + inode->i_version = 0; + inode->i_size = 0; + clear_nlink(inode); +- inode->i_uid = -2; +- inode->i_gid = -2; ++ uid = -2; ++ gid = -2; + inode->i_blocks = 0; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + +@@ -359,13 +362,13 @@ nfs_fhget(struct super_block *sb, struct + else if (nfs_server_capable(inode, NFS_CAP_NLINK)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + if (fattr->valid & NFS_ATTR_FATTR_OWNER) +- inode->i_uid = fattr->uid; ++ uid = fattr->uid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (fattr->valid & NFS_ATTR_FATTR_GROUP) +- inode->i_gid = fattr->gid; ++ gid = fattr->gid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS +@@ -378,6 +381,11 @@ nfs_fhget(struct super_block *sb, struct + */ + inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, 0); ++ /* maybe fattr->xid someday */ ++ + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + nfsi->access_cache = RB_ROOT; +@@ -494,6 +502,8 @@ void nfs_setattr_update_inode(struct ino + inode->i_uid = attr->ia_uid; + if ((attr->ia_valid & ATTR_GID) != 0) + inode->i_gid = attr->ia_gid; ++ if ((attr->ia_valid & ATTR_TAG) && IS_TAGGED(inode)) ++ inode->i_tag = attr->ia_tag; + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + spin_unlock(&inode->i_lock); + } +@@ -943,6 +953,9 @@ static int nfs_check_inode_attributes(st + struct nfs_inode *nfsi = NFS_I(inode); + loff_t cur_size, new_isize; + unsigned long invalid = 0; ++ uid_t uid; ++ gid_t gid; ++ tag_t tag; + + + /* Has the inode gone and changed behind our back? 
*/ +@@ -966,13 +979,18 @@ static int nfs_check_inode_attributes(st + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } + ++ uid = INOTAG_UID(DX_TAG(inode), fattr->uid, fattr->gid); ++ gid = INOTAG_GID(DX_TAG(inode), fattr->uid, fattr->gid); ++ tag = INOTAG_TAG(DX_TAG(inode), fattr->uid, fattr->gid, 0); ++ + /* Have any file permissions changed? */ + if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; +- if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) ++ if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && uid != fattr->uid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; +- if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) ++ if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && gid != fattr->gid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; ++ /* maybe check for tag too? */ + + /* Has the link count changed? */ + if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) +@@ -1207,6 +1225,9 @@ static int nfs_update_inode(struct inode + unsigned long invalid = 0; + unsigned long now = jiffies; + unsigned long save_cache_validity; ++ uid_t uid; ++ gid_t gid; ++ tag_t tag; + + dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", + __func__, inode->i_sb->s_id, inode->i_ino, +@@ -1314,6 +1335,9 @@ static int nfs_update_inode(struct inode + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + ++ uid = TAGINO_UID(DX_TAG(inode), inode->i_uid, inode->i_tag); ++ gid = TAGINO_GID(DX_TAG(inode), inode->i_gid, inode->i_tag); ++ tag = inode->i_tag; + + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); +@@ -1335,9 +1359,9 @@ static int nfs_update_inode(struct inode + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { +- if (inode->i_uid != fattr->uid) { ++ if (uid != fattr->uid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; +- inode->i_uid = fattr->uid; ++ uid = fattr->uid; + } + } else if (server->caps & NFS_CAP_OWNER) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +@@ -1346,9 +1370,9 @@ static int nfs_update_inode(struct inode + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { +- if (inode->i_gid != fattr->gid) { ++ if (gid != fattr->gid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; +- inode->i_gid = fattr->gid; ++ gid = fattr->gid; + } + } else if (server->caps & NFS_CAP_OWNER_GROUP) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +@@ -1356,6 +1380,10 @@ static int nfs_update_inode(struct inode + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, tag); ++ + if (fattr->valid & NFS_ATTR_FATTR_NLINK) { + if (inode->i_nlink != fattr->nlink) { + invalid |= NFS_INO_INVALID_ATTR; +diff -NurpP --minimal linux-3.2.34/fs/nfs/nfs3xdr.c linux-3.2.34-vs2.3.2.15/fs/nfs/nfs3xdr.c +--- linux-3.2.34/fs/nfs/nfs3xdr.c 2011-03-15 18:07:32.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/nfs/nfs3xdr.c 2011-12-05 19:33:02.000000000 +0100 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + #include "internal.h" + + #define NFSDBG_FACILITY NFSDBG_XDR +@@ -562,7 +563,8 @@ static __be32 
*xdr_decode_nfstime3(__be3 + * set_mtime mtime; + * }; + */ +-static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) ++static void encode_sattr3(struct xdr_stream *xdr, ++ const struct iattr *attr, int tag) + { + u32 nbytes; + __be32 *p; +@@ -594,15 +596,19 @@ static void encode_sattr3(struct xdr_str + } else + *p++ = xdr_zero; + +- if (attr->ia_valid & ATTR_UID) { ++ if (attr->ia_valid & ATTR_UID || ++ (tag && (attr->ia_valid & ATTR_TAG))) { + *p++ = xdr_one; +- *p++ = cpu_to_be32(attr->ia_uid); ++ *p++ = cpu_to_be32(TAGINO_UID(tag, ++ attr->ia_uid, attr->ia_tag)); + } else + *p++ = xdr_zero; + +- if (attr->ia_valid & ATTR_GID) { ++ if (attr->ia_valid & ATTR_GID || ++ (tag && (attr->ia_valid & ATTR_TAG))) { + *p++ = xdr_one; +- *p++ = cpu_to_be32(attr->ia_gid); ++ *p++ = cpu_to_be32(TAGINO_GID(tag, ++ attr->ia_gid, attr->ia_tag)); + } else + *p++ = xdr_zero; + +@@ -878,7 +884,7 @@ static void nfs3_xdr_enc_setattr3args(st + const struct nfs3_sattrargs *args) + { + encode_nfs_fh3(xdr, args->fh); +- encode_sattr3(xdr, args->sattr); ++ encode_sattr3(xdr, args->sattr, req->rq_task->tk_client->cl_tag); + encode_sattrguard3(xdr, args); + } + +@@ -1028,13 +1034,13 @@ static void nfs3_xdr_enc_write3args(stru + * }; + */ + static void encode_createhow3(struct xdr_stream *xdr, +- const struct nfs3_createargs *args) ++ const struct nfs3_createargs *args, int tag) + { + encode_uint32(xdr, args->createmode); + switch (args->createmode) { + case NFS3_CREATE_UNCHECKED: + case NFS3_CREATE_GUARDED: +- encode_sattr3(xdr, args->sattr); ++ encode_sattr3(xdr, args->sattr, tag); + break; + case NFS3_CREATE_EXCLUSIVE: + encode_createverf3(xdr, args->verifier); +@@ -1049,7 +1055,7 @@ static void nfs3_xdr_enc_create3args(str + const struct nfs3_createargs *args) + { + encode_diropargs3(xdr, args->fh, args->name, args->len); +- encode_createhow3(xdr, args); ++ encode_createhow3(xdr, args, req->rq_task->tk_client->cl_tag); + } + + /* +@@ -1065,7 +1071,7 @@ static void nfs3_xdr_enc_mkdir3args(stru + const struct nfs3_mkdirargs *args) + { + encode_diropargs3(xdr, args->fh, args->name, args->len); +- encode_sattr3(xdr, args->sattr); ++ encode_sattr3(xdr, args->sattr, req->rq_task->tk_client->cl_tag); + } + + /* +@@ -1082,9 +1088,9 @@ static void nfs3_xdr_enc_mkdir3args(stru + * }; + */ + static void encode_symlinkdata3(struct xdr_stream *xdr, +- const struct nfs3_symlinkargs *args) ++ const struct nfs3_symlinkargs *args, int tag) + { +- encode_sattr3(xdr, args->sattr); ++ encode_sattr3(xdr, args->sattr, tag); + encode_nfspath3(xdr, args->pages, args->pathlen); + } + +@@ -1093,7 +1099,7 @@ static void nfs3_xdr_enc_symlink3args(st + const struct nfs3_symlinkargs *args) + { + encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); +- encode_symlinkdata3(xdr, args); ++ encode_symlinkdata3(xdr, args, req->rq_task->tk_client->cl_tag); + } + + /* +@@ -1121,24 +1127,24 @@ static void nfs3_xdr_enc_symlink3args(st + * }; + */ + static void encode_devicedata3(struct xdr_stream *xdr, +- const struct nfs3_mknodargs *args) ++ const struct nfs3_mknodargs *args, int tag) + { +- encode_sattr3(xdr, args->sattr); ++ encode_sattr3(xdr, args->sattr, tag); + encode_specdata3(xdr, args->rdev); + } + + static void encode_mknoddata3(struct xdr_stream *xdr, +- const struct nfs3_mknodargs *args) ++ const struct nfs3_mknodargs *args, int tag) + { + encode_ftype3(xdr, args->type); + switch (args->type) { + case NF3CHR: + case NF3BLK: +- encode_devicedata3(xdr, args); ++ encode_devicedata3(xdr, args, tag); + break; 
+ case NF3SOCK: + case NF3FIFO: +- encode_sattr3(xdr, args->sattr); ++ encode_sattr3(xdr, args->sattr, tag); + break; + case NF3REG: + case NF3DIR: +@@ -1153,7 +1159,7 @@ static void nfs3_xdr_enc_mknod3args(stru + const struct nfs3_mknodargs *args) + { + encode_diropargs3(xdr, args->fh, args->name, args->len); +- encode_mknoddata3(xdr, args); ++ encode_mknoddata3(xdr, args, req->rq_task->tk_client->cl_tag); + } + + /* +diff -NurpP --minimal linux-3.2.34/fs/nfs/super.c linux-3.2.34-vs2.3.2.15/fs/nfs/super.c +--- linux-3.2.34/fs/nfs/super.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/nfs/super.c 2012-11-18 21:11:16.000000000 +0100 +@@ -53,6 +53,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -87,6 +88,7 @@ enum { + Opt_sharecache, Opt_nosharecache, + Opt_resvport, Opt_noresvport, + Opt_fscache, Opt_nofscache, ++ Opt_tag, Opt_notag, + + /* Mount options that take integer arguments */ + Opt_port, +@@ -100,6 +102,7 @@ enum { + Opt_mountvers, + Opt_nfsvers, + Opt_minorversion, ++ Opt_tagid, + + /* Mount options that take string arguments */ + Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, +@@ -180,6 +183,10 @@ static const match_table_t nfs_mount_opt + { Opt_fscache_uniq, "fsc=%s" }, + { Opt_local_lock, "local_lock=%s" }, + ++ { Opt_tag, "tag" }, ++ { Opt_notag, "notag" }, ++ { Opt_tagid, "tagid=%u" }, ++ + { Opt_err, NULL } + }; + +@@ -650,6 +657,7 @@ static void nfs_show_mount_options(struc + { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, + { NFS_MOUNT_UNSHARED, ",nosharecache", "" }, + { NFS_MOUNT_NORESVPORT, ",noresvport", "" }, ++ { NFS_MOUNT_TAGGED, ",tag", "" }, + { 0, NULL, NULL } + }; + const struct proc_nfs_info *nfs_infop; +@@ -1217,6 +1225,14 @@ static int nfs_parse_mount_options(char + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ mnt->flags |= NFS_MOUNT_TAGGED; ++ break; ++ case Opt_notag: ++ mnt->flags &= ~NFS_MOUNT_TAGGED; ++ break; ++#endif + + /* + * options that take numeric values +@@ -1323,6 +1339,12 @@ static int nfs_parse_mount_options(char + goto out_invalid_value; + mnt->minorversion = option; + break; ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ nfs_data.flags |= NFS_MOUNT_TAGGED; ++ break; ++#endif + + /* + * options that take text values +diff -NurpP --minimal linux-3.2.34/fs/nfsd/auth.c linux-3.2.34-vs2.3.2.15/fs/nfsd/auth.c +--- linux-3.2.34/fs/nfsd/auth.c 2010-02-25 11:52:05.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/nfsd/auth.c 2011-12-05 19:33:02.000000000 +0100 +@@ -1,6 +1,7 @@ + /* Copyright (C) 1995, 1996 Olaf Kirch */ + + #include ++#include + #include "nfsd.h" + #include "auth.h" + +@@ -36,6 +37,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, + + new->fsuid = rqstp->rq_cred.cr_uid; + new->fsgid = rqstp->rq_cred.cr_gid; ++ /* FIXME: this desperately needs a tag :) ++ new->xid = (xid_t)INOTAG_TAG(DX_TAG_NFSD, cred.cr_uid, cred.cr_gid, 0); ++ */ + + rqgi = rqstp->rq_cred.cr_group_info; + +diff -NurpP --minimal linux-3.2.34/fs/nfsd/nfs3xdr.c linux-3.2.34-vs2.3.2.15/fs/nfsd/nfs3xdr.c +--- linux-3.2.34/fs/nfsd/nfs3xdr.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/nfsd/nfs3xdr.c 2012-05-15 18:16:52.000000000 +0200 +@@ -7,6 +7,7 @@ + */ + + #include ++#include + #include "xdr3.h" + #include "auth.h" + +@@ -95,6 +96,8 @@ static __be32 * + decode_sattr3(__be32 *p, struct iattr *iap) + { + u32 tmp; ++ uid_t uid = 0; ++ gid_t gid = 0; + + iap->ia_valid = 0; + +@@ -104,12 +107,15 @@ 
decode_sattr3(__be32 *p, struct iattr *i + } + if (*p++) { + iap->ia_valid |= ATTR_UID; +- iap->ia_uid = ntohl(*p++); ++ uid = ntohl(*p++); + } + if (*p++) { + iap->ia_valid |= ATTR_GID; +- iap->ia_gid = ntohl(*p++); ++ gid = ntohl(*p++); + } ++ iap->ia_uid = INOTAG_UID(DX_TAG_NFSD, uid, gid); ++ iap->ia_gid = INOTAG_GID(DX_TAG_NFSD, uid, gid); ++ iap->ia_tag = INOTAG_TAG(DX_TAG_NFSD, uid, gid, 0); + if (*p++) { + u64 newsize; + +@@ -165,8 +171,12 @@ encode_fattr3(struct svc_rqst *rqstp, __ + *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); + *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) stat->nlink); +- *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); +- *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); ++ *p++ = htonl((u32) nfsd_ruid(rqstp, ++ TAGINO_UID(0 /* FIXME: DX_TAG(dentry->d_inode) */, ++ stat->uid, stat->tag))); ++ *p++ = htonl((u32) nfsd_rgid(rqstp, ++ TAGINO_GID(0 /* FIXME: DX_TAG(dentry->d_inode) */, ++ stat->gid, stat->tag))); + if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { + p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); + } else { +diff -NurpP --minimal linux-3.2.34/fs/nfsd/nfs4xdr.c linux-3.2.34-vs2.3.2.15/fs/nfsd/nfs4xdr.c +--- linux-3.2.34/fs/nfsd/nfs4xdr.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/nfsd/nfs4xdr.c 2012-08-13 12:40:51.000000000 +0200 +@@ -46,6 +46,7 @@ + #include + #include + #include ++#include + + #include "idmap.h" + #include "acl.h" +@@ -2328,14 +2329,18 @@ out_acl: + WRITE32(stat.nlink); + } + if (bmval1 & FATTR4_WORD1_OWNER) { +- status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen); ++ status = nfsd4_encode_user(rqstp, ++ TAGINO_UID(DX_TAG(dentry->d_inode), ++ stat.uid, stat.tag), &p, &buflen); + if (status == nfserr_resource) + goto out_resource; + if (status) + goto out; + } + if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { +- status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen); ++ status = nfsd4_encode_group(rqstp, ++ TAGINO_GID(DX_TAG(dentry->d_inode), ++ stat.gid, stat.tag), &p, &buflen); + if (status == nfserr_resource) + goto out_resource; + if (status) +diff -NurpP --minimal linux-3.2.34/fs/nfsd/nfsxdr.c linux-3.2.34-vs2.3.2.15/fs/nfsd/nfsxdr.c +--- linux-3.2.34/fs/nfsd/nfsxdr.c 2011-05-22 16:17:53.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/nfsd/nfsxdr.c 2011-12-05 19:33:02.000000000 +0100 +@@ -6,6 +6,7 @@ + + #include "xdr.h" + #include "auth.h" ++#include + + #define NFSDDBG_FACILITY NFSDDBG_XDR + +@@ -88,6 +89,8 @@ static __be32 * + decode_sattr(__be32 *p, struct iattr *iap) + { + u32 tmp, tmp1; ++ uid_t uid = 0; ++ gid_t gid = 0; + + iap->ia_valid = 0; + +@@ -101,12 +104,15 @@ decode_sattr(__be32 *p, struct iattr *ia + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_UID; +- iap->ia_uid = tmp; ++ uid = tmp; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_GID; +- iap->ia_gid = tmp; ++ gid = tmp; + } ++ iap->ia_uid = INOTAG_UID(DX_TAG_NFSD, uid, gid); ++ iap->ia_gid = INOTAG_GID(DX_TAG_NFSD, uid, gid); ++ iap->ia_tag = INOTAG_TAG(DX_TAG_NFSD, uid, gid, 0); + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_SIZE; + iap->ia_size = tmp; +@@ -151,8 +157,10 @@ encode_fattr(struct svc_rqst *rqstp, __b + *p++ = htonl(nfs_ftypes[type >> 12]); + *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) stat->nlink); +- *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); +- *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); ++ *p++ = htonl((u32) nfsd_ruid(rqstp, ++ TAGINO_UID(DX_TAG(dentry->d_inode), stat->uid, stat->tag))); ++ *p++ = htonl((u32) 
nfsd_rgid(rqstp, ++ TAGINO_GID(DX_TAG(dentry->d_inode), stat->gid, stat->tag))); + + if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { + *p++ = htonl(NFS_MAXPATHLEN); +diff -NurpP --minimal linux-3.2.34/fs/ocfs2/dlmglue.c linux-3.2.34-vs2.3.2.15/fs/ocfs2/dlmglue.c +--- linux-3.2.34/fs/ocfs2/dlmglue.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ocfs2/dlmglue.c 2011-12-05 19:33:02.000000000 +0100 +@@ -2047,6 +2047,7 @@ static void __ocfs2_stuff_meta_lvb(struc + lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); + lvb->lvb_iuid = cpu_to_be32(inode->i_uid); + lvb->lvb_igid = cpu_to_be32(inode->i_gid); ++ lvb->lvb_itag = cpu_to_be16(inode->i_tag); + lvb->lvb_imode = cpu_to_be16(inode->i_mode); + lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); + lvb->lvb_iatime_packed = +@@ -2097,6 +2098,7 @@ static void ocfs2_refresh_inode_from_lvb + + inode->i_uid = be32_to_cpu(lvb->lvb_iuid); + inode->i_gid = be32_to_cpu(lvb->lvb_igid); ++ inode->i_tag = be16_to_cpu(lvb->lvb_itag); + inode->i_mode = be16_to_cpu(lvb->lvb_imode); + set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); + ocfs2_unpack_timespec(&inode->i_atime, +diff -NurpP --minimal linux-3.2.34/fs/ocfs2/dlmglue.h linux-3.2.34-vs2.3.2.15/fs/ocfs2/dlmglue.h +--- linux-3.2.34/fs/ocfs2/dlmglue.h 2010-10-21 13:07:50.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/ocfs2/dlmglue.h 2011-12-05 19:33:02.000000000 +0100 +@@ -46,7 +46,8 @@ struct ocfs2_meta_lvb { + __be16 lvb_inlink; + __be32 lvb_iattr; + __be32 lvb_igeneration; +- __be32 lvb_reserved2; ++ __be16 lvb_itag; ++ __be16 lvb_reserved2; + }; + + #define OCFS2_QINFO_LVB_VERSION 1 +diff -NurpP --minimal linux-3.2.34/fs/ocfs2/file.c linux-3.2.34-vs2.3.2.15/fs/ocfs2/file.c +--- linux-3.2.34/fs/ocfs2/file.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ocfs2/file.c 2012-08-13 12:40:51.000000000 +0200 +@@ -1123,7 +1123,7 @@ int ocfs2_setattr(struct dentry *dentry, + attr->ia_valid &= ~ATTR_SIZE; + + #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ +- | ATTR_GID | ATTR_UID | ATTR_MODE) ++ | ATTR_GID | ATTR_UID | ATTR_TAG | ATTR_MODE) + if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) + return 0; + +diff -NurpP --minimal linux-3.2.34/fs/ocfs2/inode.c linux-3.2.34-vs2.3.2.15/fs/ocfs2/inode.c +--- linux-3.2.34/fs/ocfs2/inode.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ocfs2/inode.c 2011-12-05 19:33:02.000000000 +0100 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + #include + +@@ -78,11 +79,13 @@ void ocfs2_set_inode_flags(struct inode + { + unsigned int flags = OCFS2_I(inode)->ip_attr; + +- inode->i_flags &= ~(S_IMMUTABLE | ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | + S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); + + if (flags & OCFS2_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; ++ if (flags & OCFS2_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; + + if (flags & OCFS2_SYNC_FL) + inode->i_flags |= S_SYNC; +@@ -92,25 +95,44 @@ void ocfs2_set_inode_flags(struct inode + inode->i_flags |= S_NOATIME; + if (flags & OCFS2_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & OCFS2_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & OCFS2_COW_FL) ++ inode->i_vflags |= V_COW; + } + + /* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */ + void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi) + { + unsigned int flags = oi->vfs_inode.i_flags; ++ unsigned int vflags = oi->vfs_inode.i_vflags; ++ ++ oi->ip_attr &= ~(OCFS2_SYNC_FL | 
OCFS2_APPEND_FL | ++ OCFS2_IMMUTABLE_FL | OCFS2_IXUNLINK_FL | ++ OCFS2_NOATIME_FL | OCFS2_DIRSYNC_FL | ++ OCFS2_BARRIER_FL | OCFS2_COW_FL); ++ ++ if (flags & S_IMMUTABLE) ++ oi->ip_attr |= OCFS2_IMMUTABLE_FL; ++ if (flags & S_IXUNLINK) ++ oi->ip_attr |= OCFS2_IXUNLINK_FL; + +- oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL| +- OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL); + if (flags & S_SYNC) + oi->ip_attr |= OCFS2_SYNC_FL; + if (flags & S_APPEND) + oi->ip_attr |= OCFS2_APPEND_FL; +- if (flags & S_IMMUTABLE) +- oi->ip_attr |= OCFS2_IMMUTABLE_FL; + if (flags & S_NOATIME) + oi->ip_attr |= OCFS2_NOATIME_FL; + if (flags & S_DIRSYNC) + oi->ip_attr |= OCFS2_DIRSYNC_FL; ++ ++ if (vflags & V_BARRIER) ++ oi->ip_attr |= OCFS2_BARRIER_FL; ++ if (vflags & V_COW) ++ oi->ip_attr |= OCFS2_COW_FL; + } + + struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) +@@ -241,6 +263,8 @@ void ocfs2_populate_inode(struct inode * + struct super_block *sb; + struct ocfs2_super *osb; + int use_plocks = 1; ++ uid_t uid; ++ gid_t gid; + + sb = inode->i_sb; + osb = OCFS2_SB(sb); +@@ -269,8 +293,12 @@ void ocfs2_populate_inode(struct inode * + inode->i_generation = le32_to_cpu(fe->i_generation); + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + inode->i_mode = le16_to_cpu(fe->i_mode); +- inode->i_uid = le32_to_cpu(fe->i_uid); +- inode->i_gid = le32_to_cpu(fe->i_gid); ++ uid = le32_to_cpu(fe->i_uid); ++ gid = le32_to_cpu(fe->i_gid); ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, ++ /* le16_to_cpu(raw_inode->i_raw_tag)i */ 0); + + /* Fast symlinks will have i_size but no allocated clusters. */ + if (S_ISLNK(inode->i_mode) && !fe->i_clusters) +diff -NurpP --minimal linux-3.2.34/fs/ocfs2/inode.h linux-3.2.34-vs2.3.2.15/fs/ocfs2/inode.h +--- linux-3.2.34/fs/ocfs2/inode.h 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ocfs2/inode.h 2011-12-05 19:33:02.000000000 +0100 +@@ -154,6 +154,7 @@ struct buffer_head *ocfs2_bread(struct i + + void ocfs2_set_inode_flags(struct inode *inode); + void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); ++int ocfs2_sync_flags(struct inode *inode, int, int); + + static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) + { +diff -NurpP --minimal linux-3.2.34/fs/ocfs2/ioctl.c linux-3.2.34-vs2.3.2.15/fs/ocfs2/ioctl.c +--- linux-3.2.34/fs/ocfs2/ioctl.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ocfs2/ioctl.c 2011-12-05 19:33:02.000000000 +0100 +@@ -78,7 +78,41 @@ static int ocfs2_get_inode_attr(struct i + return status; + } + +-static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, ++int ocfs2_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); ++ struct buffer_head *bh = NULL; ++ handle_t *handle = NULL; ++ int status; ++ ++ status = ocfs2_inode_lock(inode, &bh, 1); ++ if (status < 0) { ++ mlog_errno(status); ++ return status; ++ } ++ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); ++ if (IS_ERR(handle)) { ++ status = PTR_ERR(handle); ++ mlog_errno(status); ++ goto bail_unlock; ++ } ++ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ ocfs2_get_inode_flags(OCFS2_I(inode)); ++ ++ status = ocfs2_mark_inode_dirty(handle, inode, bh); ++ if (status < 0) ++ mlog_errno(status); ++ ++ ocfs2_commit_trans(osb, handle); ++bail_unlock: ++ ocfs2_inode_unlock(inode, 1); ++ brelse(bh); ++ return status; ++} 
++ ++int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, + unsigned mask) + { + struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode); +@@ -103,6 +137,11 @@ static int ocfs2_set_inode_attr(struct i + if (!S_ISDIR(inode->i_mode)) + flags &= ~OCFS2_DIRSYNC_FL; + ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ goto bail_unlock; ++ } ++ + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); +@@ -881,6 +920,7 @@ bail: + return status; + } + ++ + long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_path.dentry->d_inode; +diff -NurpP --minimal linux-3.2.34/fs/ocfs2/namei.c linux-3.2.34-vs2.3.2.15/fs/ocfs2/namei.c +--- linux-3.2.34/fs/ocfs2/namei.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ocfs2/namei.c 2011-12-05 19:33:02.000000000 +0100 +@@ -41,6 +41,7 @@ + #include + #include + #include ++#include + + #include + +@@ -475,6 +476,7 @@ static int __ocfs2_mknod_locked(struct i + struct ocfs2_dinode *fe = NULL; + struct ocfs2_extent_list *fel; + u16 feat; ++ tag_t tag; + + *new_fe_bh = NULL; + +@@ -512,8 +514,11 @@ static int __ocfs2_mknod_locked(struct i + fe->i_suballoc_loc = cpu_to_le64(suballoc_loc); + fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); + fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); +- fe->i_uid = cpu_to_le32(inode->i_uid); +- fe->i_gid = cpu_to_le32(inode->i_gid); ++ ++ tag = dx_current_fstag(osb->sb); ++ fe->i_uid = cpu_to_le32(TAGINO_UID(DX_TAG(inode), inode->i_uid, tag)); ++ fe->i_gid = cpu_to_le32(TAGINO_GID(DX_TAG(inode), inode->i_gid, tag)); ++ inode->i_tag = tag; + fe->i_mode = cpu_to_le16(inode->i_mode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); +diff -NurpP --minimal linux-3.2.34/fs/ocfs2/ocfs2.h linux-3.2.34-vs2.3.2.15/fs/ocfs2/ocfs2.h +--- linux-3.2.34/fs/ocfs2/ocfs2.h 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ocfs2/ocfs2.h 2011-12-05 19:33:02.000000000 +0100 +@@ -272,6 +272,7 @@ enum ocfs2_mount_options + writes */ + OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ + OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ ++ OCFS2_MOUNT_TAGGED = 1 << 15, /* use tagging */ + }; + + #define OCFS2_OSB_SOFT_RO 0x0001 +diff -NurpP --minimal linux-3.2.34/fs/ocfs2/ocfs2_fs.h linux-3.2.34-vs2.3.2.15/fs/ocfs2/ocfs2_fs.h +--- linux-3.2.34/fs/ocfs2/ocfs2_fs.h 2011-05-22 16:17:53.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/ocfs2/ocfs2_fs.h 2011-12-05 19:33:02.000000000 +0100 +@@ -266,6 +266,11 @@ + #define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ + #define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ + ++#define OCFS2_IXUNLINK_FL FS_IXUNLINK_FL /* Immutable invert on unlink */ ++ ++#define OCFS2_BARRIER_FL FS_BARRIER_FL /* Barrier for chroot() */ ++#define OCFS2_COW_FL FS_COW_FL /* Copy on Write marker */ ++ + #define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ + #define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ + +diff -NurpP --minimal linux-3.2.34/fs/ocfs2/super.c linux-3.2.34-vs2.3.2.15/fs/ocfs2/super.c +--- linux-3.2.34/fs/ocfs2/super.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/ocfs2/super.c 2011-12-05 19:33:02.000000000 +0100 +@@ -185,6 +185,7 @@ enum { + Opt_coherency_full, + Opt_resv_level, + Opt_dir_resv_level, ++ Opt_tag, Opt_notag, Opt_tagid, + Opt_err, + }; + +@@ 
-216,6 +217,9 @@ static const match_table_t tokens = { + {Opt_coherency_full, "coherency=full"}, + {Opt_resv_level, "resv_level=%u"}, + {Opt_dir_resv_level, "dir_resv_level=%u"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, ++ {Opt_tagid, "tagid=%u"}, + {Opt_err, NULL} + }; + +@@ -663,6 +667,13 @@ static int ocfs2_remount(struct super_bl + goto out; + } + ++ if ((osb->s_mount_opt & OCFS2_MOUNT_TAGGED) != ++ (parsed_options.mount_opt & OCFS2_MOUNT_TAGGED)) { ++ ret = -EINVAL; ++ mlog(ML_ERROR, "Cannot change tagging on remount\n"); ++ goto out; ++ } ++ + /* We're going to/from readonly mode. */ + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + /* Disable quota accounting before remounting RO */ +@@ -1178,6 +1189,9 @@ static int ocfs2_fill_super(struct super + + ocfs2_complete_mount_recovery(osb); + ++ if (osb->s_mount_opt & OCFS2_MOUNT_TAGGED) ++ sb->s_flags |= MS_TAGGED; ++ + if (ocfs2_mount_local(osb)) + snprintf(nodestr, sizeof(nodestr), "local"); + else +@@ -1507,6 +1521,20 @@ static int ocfs2_parse_options(struct su + option < OCFS2_MAX_RESV_LEVEL) + mopt->dir_resv_level = option; + break; ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ mopt->mount_opt |= OCFS2_MOUNT_TAGGED; ++ break; ++ case Opt_notag: ++ mopt->mount_opt &= ~OCFS2_MOUNT_TAGGED; ++ break; ++#endif ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ mopt->mount_opt |= OCFS2_MOUNT_TAGGED; ++ break; ++#endif + default: + mlog(ML_ERROR, + "Unrecognized mount option \"%s\" " +diff -NurpP --minimal linux-3.2.34/fs/open.c linux-3.2.34-vs2.3.2.15/fs/open.c +--- linux-3.2.34/fs/open.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/open.c 2012-09-16 18:25:50.000000000 +0200 +@@ -30,6 +30,11 @@ + #include + #include + #include ++#include ++#include ++#include ++#include ++#include + + #include "internal.h" + +@@ -74,6 +79,12 @@ static long do_sys_truncate(const char _ + error = user_path(pathname, &path); + if (error) + goto out; ++ ++#ifdef CONFIG_VSERVER_COWBL ++ error = cow_check_and_break(&path); ++ if (error) ++ goto dput_and_out; ++#endif + inode = path.dentry->d_inode; + + /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ +@@ -489,6 +500,10 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, cons + + error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); + if (!error) { ++#ifdef CONFIG_VSERVER_COWBL ++ error = cow_check_and_break(&path); ++ if (!error) ++#endif + error = chmod_common(&path, mode); + path_put(&path); + } +@@ -509,11 +524,11 @@ static int chown_common(struct path *pat + newattrs.ia_valid = ATTR_CTIME; + if (user != (uid_t) -1) { + newattrs.ia_valid |= ATTR_UID; +- newattrs.ia_uid = user; ++ newattrs.ia_uid = dx_map_uid(user); + } + if (group != (gid_t) -1) { + newattrs.ia_valid |= ATTR_GID; +- newattrs.ia_gid = group; ++ newattrs.ia_gid = dx_map_gid(group); + } + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= +@@ -538,6 +553,10 @@ SYSCALL_DEFINE3(chown, const char __user + error = mnt_want_write(path.mnt); + if (error) + goto out_release; ++#ifdef CONFIG_VSERVER_COWBL ++ error = cow_check_and_break(&path); ++ if (!error) ++#endif + error = chown_common(&path, user, group); + mnt_drop_write(path.mnt); + out_release: +@@ -565,6 +584,10 @@ SYSCALL_DEFINE5(fchownat, int, dfd, cons + error = mnt_want_write(path.mnt); + if (error) + goto out_release; ++#ifdef CONFIG_VSERVER_COWBL ++ error = cow_check_and_break(&path); ++ if (!error) ++#endif + error = chown_common(&path, user, group); + mnt_drop_write(path.mnt); + out_release: +@@ -584,6 +607,10 @@ 
SYSCALL_DEFINE3(lchown, const char __use + error = mnt_want_write(path.mnt); + if (error) + goto out_release; ++#ifdef CONFIG_VSERVER_COWBL ++ error = cow_check_and_break(&path); ++ if (!error) ++#endif + error = chown_common(&path, user, group); + mnt_drop_write(path.mnt); + out_release: +@@ -839,6 +866,7 @@ static void __put_unused_fd(struct files + __FD_CLR(fd, fdt->open_fds); + if (fd < files->next_fd) + files->next_fd = fd; ++ vx_openfd_dec(fd); + } + + void put_unused_fd(unsigned int fd) +diff -NurpP --minimal linux-3.2.34/fs/proc/array.c linux-3.2.34-vs2.3.2.15/fs/proc/array.c +--- linux-3.2.34/fs/proc/array.c 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/proc/array.c 2011-12-05 19:33:02.000000000 +0100 +@@ -81,6 +81,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -170,6 +172,9 @@ static inline void task_state(struct seq + rcu_read_lock(); + ppid = pid_alive(p) ? + task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; ++ if (unlikely(vx_current_initpid(p->pid))) ++ ppid = 0; ++ + tpid = 0; + if (pid_alive(p)) { + struct task_struct *tracer = ptrace_parent(p); +@@ -287,7 +292,7 @@ static inline void task_sig(struct seq_f + } + + static void render_cap_t(struct seq_file *m, const char *header, +- kernel_cap_t *a) ++ struct vx_info *vxi, kernel_cap_t *a) + { + unsigned __capi; + +@@ -312,10 +317,11 @@ static inline void task_cap(struct seq_f + cap_bset = cred->cap_bset; + rcu_read_unlock(); + +- render_cap_t(m, "CapInh:\t", &cap_inheritable); +- render_cap_t(m, "CapPrm:\t", &cap_permitted); +- render_cap_t(m, "CapEff:\t", &cap_effective); +- render_cap_t(m, "CapBnd:\t", &cap_bset); ++ /* FIXME: maybe move the p->vx_info masking to __task_cred() ? */ ++ render_cap_t(m, "CapInh:\t", p->vx_info, &cap_inheritable); ++ render_cap_t(m, "CapPrm:\t", p->vx_info, &cap_permitted); ++ render_cap_t(m, "CapEff:\t", p->vx_info, &cap_effective); ++ render_cap_t(m, "CapBnd:\t", p->vx_info, &cap_bset); + } + + static inline void task_context_switch_counts(struct seq_file *m, +@@ -337,6 +343,42 @@ static void task_cpus_allowed(struct seq + seq_putc(m, '\n'); + } + ++int proc_pid_nsproxy(struct seq_file *m, struct pid_namespace *ns, ++ struct pid *pid, struct task_struct *task) ++{ ++ seq_printf(m, "Proxy:\t%p(%c)\n" ++ "Count:\t%u\n" ++ "uts:\t%p(%c)\n" ++ "ipc:\t%p(%c)\n" ++ "mnt:\t%p(%c)\n" ++ "pid:\t%p(%c)\n" ++ "net:\t%p(%c)\n", ++ task->nsproxy, ++ (task->nsproxy == init_task.nsproxy ? 'I' : '-'), ++ atomic_read(&task->nsproxy->count), ++ task->nsproxy->uts_ns, ++ (task->nsproxy->uts_ns == init_task.nsproxy->uts_ns ? 'I' : '-'), ++ task->nsproxy->ipc_ns, ++ (task->nsproxy->ipc_ns == init_task.nsproxy->ipc_ns ? 'I' : '-'), ++ task->nsproxy->mnt_ns, ++ (task->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns ? 'I' : '-'), ++ task->nsproxy->pid_ns, ++ (task->nsproxy->pid_ns == init_task.nsproxy->pid_ns ? 'I' : '-'), ++ task->nsproxy->net_ns, ++ (task->nsproxy->net_ns == init_task.nsproxy->net_ns ? 
'I' : '-')); ++ return 0; ++} ++ ++void task_vs_id(struct seq_file *m, struct task_struct *task) ++{ ++ if (task_vx_flags(task, VXF_HIDE_VINFO, 0)) ++ return; ++ ++ seq_printf(m, "VxID: %d\n", vx_task_xid(task)); ++ seq_printf(m, "NxID: %d\n", nx_task_nid(task)); ++} ++ ++ + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) + { +@@ -353,6 +395,7 @@ int proc_pid_status(struct seq_file *m, + task_cap(m, task); + task_cpus_allowed(m, task); + cpuset_task_status_allowed(m, task); ++ task_vs_id(m, task); + task_context_switch_counts(m, task); + return 0; + } +@@ -462,6 +505,17 @@ static int do_task_stat(struct seq_file + /* convert nsec -> ticks */ + start_time = nsec_to_clock_t(start_time); + ++ /* fixup start time for virt uptime */ ++ if (vx_flags(VXF_VIRT_UPTIME, 0)) { ++ unsigned long long bias = ++ current->vx_info->cvirt.bias_clock; ++ ++ if (start_time > bias) ++ start_time -= bias; ++ else ++ start_time = 0; ++ } ++ + seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ + %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ + %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", +diff -NurpP --minimal linux-3.2.34/fs/proc/base.c linux-3.2.34-vs2.3.2.15/fs/proc/base.c +--- linux-3.2.34/fs/proc/base.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/proc/base.c 2012-02-15 03:26:56.000000000 +0100 +@@ -83,6 +83,8 @@ + #include + #include + #include ++#include ++#include + #ifdef CONFIG_HARDWALL + #include + #endif +@@ -1021,11 +1023,16 @@ static ssize_t oom_adjust_write(struct f + goto err_task_lock; + } + +- if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { ++ if (oom_adjust < task->signal->oom_adj && ++ !vx_capable(CAP_SYS_RESOURCE, VXC_OOM_ADJUST)) { + err = -EACCES; + goto err_sighand; + } + ++ /* prevent guest processes from circumventing the oom killer */ ++ if (vx_current_xid() && (oom_adjust == OOM_DISABLE)) ++ oom_adjust = OOM_ADJUST_MIN; ++ + /* + * Warn that /proc/pid/oom_adj is deprecated, see + * Documentation/feature-removal-schedule.txt. +@@ -1180,7 +1187,7 @@ static ssize_t proc_loginuid_write(struc + ssize_t length; + uid_t loginuid; + +- if (!capable(CAP_AUDIT_CONTROL)) ++ if (!vx_capable(CAP_AUDIT_CONTROL, VXC_AUDIT_CONTROL)) + return -EPERM; + + rcu_read_lock(); +@@ -1627,6 +1634,8 @@ struct inode *proc_pid_make_inode(struct + inode->i_gid = cred->egid; + rcu_read_unlock(); + } ++ /* procfs is xid tagged */ ++ inode->i_tag = (tag_t)vx_task_xid(task); + security_task_to_inode(task, inode); + + out: +@@ -1663,6 +1672,8 @@ int pid_getattr(struct vfsmount *mnt, st + + /* dentry stuff */ + ++static unsigned name_to_int(struct dentry *dentry); ++ + /* + * Exceptional case: normally we are not allowed to unhash a busy + * directory. In this case, however, we can do it - no aliasing problems +@@ -1691,6 +1702,12 @@ int pid_revalidate(struct dentry *dentry + task = get_proc_task(inode); + + if (task) { ++ unsigned pid = name_to_int(dentry); ++ ++ if (pid != ~0U && pid != vx_map_pid(task->pid)) { ++ put_task_struct(task); ++ goto drop; ++ } + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + rcu_read_lock(); +@@ -1707,6 +1724,7 @@ int pid_revalidate(struct dentry *dentry + put_task_struct(task); + return 1; + } ++drop: + d_drop(dentry); + return 0; + } +@@ -2196,6 +2214,13 @@ static struct dentry *proc_pident_lookup + if (!task) + goto out_no_task; + ++ /* TODO: maybe we can come up with a generic approach? 
*/ ++ if (task_vx_flags(task, VXF_HIDE_VINFO, 0) && ++ (dentry->d_name.len == 5) && ++ (!memcmp(dentry->d_name.name, "vinfo", 5) || ++ !memcmp(dentry->d_name.name, "ninfo", 5))) ++ goto out; ++ + /* + * Yes, it does not scale. And it should not. Don't add + * new entries into /proc// without very good reasons. +@@ -2581,7 +2606,7 @@ out_iput: + static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) + { + struct dentry *error; +- struct task_struct *task = get_proc_task(dir); ++ struct task_struct *task = get_proc_task_real(dir); + const struct pid_entry *p, *last; + + error = ERR_PTR(-ENOENT); +@@ -2688,6 +2713,9 @@ static int proc_pid_personality(struct s + static const struct file_operations proc_task_operations; + static const struct inode_operations proc_task_inode_operations; + ++extern int proc_pid_vx_info(struct task_struct *, char *); ++extern int proc_pid_nx_info(struct task_struct *, char *); ++ + static const struct pid_entry tgid_base_stuff[] = { + DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), +@@ -2751,6 +2779,8 @@ static const struct pid_entry tgid_base_ + #ifdef CONFIG_CGROUPS + REG("cgroup", S_IRUGO, proc_cgroup_operations), + #endif ++ INF("vinfo", S_IRUGO, proc_pid_vx_info), ++ INF("ninfo", S_IRUGO, proc_pid_nx_info), + INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +@@ -2770,6 +2800,7 @@ static const struct pid_entry tgid_base_ + #ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), + #endif ++ ONE("nsproxy", S_IRUGO, proc_pid_nsproxy), + }; + + static int proc_tgid_base_readdir(struct file * filp, +@@ -2962,7 +2993,7 @@ retry: + iter.task = NULL; + pid = find_ge_pid(iter.tgid, ns); + if (pid) { +- iter.tgid = pid_nr_ns(pid, ns); ++ iter.tgid = pid_unmapped_nr_ns(pid, ns); + iter.task = pid_task(pid, PIDTYPE_PID); + /* What we to know is if the pid we have find is the + * pid of a thread_group_leader. 
Testing for task +@@ -2992,7 +3023,7 @@ static int proc_pid_fill_cache(struct fi + struct tgid_iter iter) + { + char name[PROC_NUMBUF]; +- int len = snprintf(name, sizeof(name), "%d", iter.tgid); ++ int len = snprintf(name, sizeof(name), "%d", vx_map_tgid(iter.tgid)); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_pid_instantiate, iter.task, NULL); + } +@@ -3009,7 +3040,7 @@ int proc_pid_readdir(struct file * filp, + goto out_no_task; + nr = filp->f_pos - FIRST_PROCESS_ENTRY; + +- reaper = get_proc_task(filp->f_path.dentry->d_inode); ++ reaper = get_proc_task_real(filp->f_path.dentry->d_inode); + if (!reaper) + goto out_no_task; + +@@ -3026,6 +3057,8 @@ int proc_pid_readdir(struct file * filp, + iter.task; + iter.tgid += 1, iter = next_tgid(ns, iter)) { + filp->f_pos = iter.tgid + TGID_OFFSET; ++ if (!vx_proc_task_visible(iter.task)) ++ continue; + if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { + put_task_struct(iter.task); + goto out; +@@ -3179,6 +3212,8 @@ static struct dentry *proc_task_lookup(s + tid = name_to_int(dentry); + if (tid == ~0U) + goto out; ++ if (vx_current_initpid(tid)) ++ goto out; + + ns = dentry->d_sb->s_fs_info; + rcu_read_lock(); +diff -NurpP --minimal linux-3.2.34/fs/proc/generic.c linux-3.2.34-vs2.3.2.15/fs/proc/generic.c +--- linux-3.2.34/fs/proc/generic.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/proc/generic.c 2011-12-05 19:33:02.000000000 +0100 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + + #include "internal.h" +@@ -424,11 +425,15 @@ struct dentry *proc_lookup_de(struct pro + for (de = de->subdir; de ; de = de->next) { + if (de->namelen != dentry->d_name.len) + continue; ++ if (!vx_hide_check(0, de->vx_flags)) ++ continue; + if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { + pde_get(de); + spin_unlock(&proc_subdir_lock); + error = -EINVAL; + inode = proc_get_inode(dir->i_sb, de); ++ /* generic proc entries belong to the host */ ++ inode->i_tag = 0; + goto out_unlock; + } + } +@@ -506,6 +511,8 @@ int proc_readdir_de(struct proc_dir_entr + + /* filldir passes info to user space */ + pde_get(de); ++ if (!vx_hide_check(0, de->vx_flags)) ++ goto skip; + spin_unlock(&proc_subdir_lock); + if (filldir(dirent, de->name, de->namelen, filp->f_pos, + de->low_ino, de->mode >> 12) < 0) { +@@ -513,6 +520,7 @@ int proc_readdir_de(struct proc_dir_entr + goto out; + } + spin_lock(&proc_subdir_lock); ++ skip: + filp->f_pos++; + next = de->next; + pde_put(de); +@@ -626,6 +634,7 @@ static struct proc_dir_entry *__proc_cre + ent->nlink = nlink; + atomic_set(&ent->count, 1); + ent->pde_users = 0; ++ ent->vx_flags = IATTR_PROC_DEFAULT; + spin_lock_init(&ent->pde_unload_lock); + ent->pde_unload_completion = NULL; + INIT_LIST_HEAD(&ent->pde_openers); +@@ -649,7 +658,8 @@ struct proc_dir_entry *proc_symlink(cons + kfree(ent->data); + kfree(ent); + ent = NULL; +- } ++ } else ++ ent->vx_flags = IATTR_PROC_SYMLINK; + } else { + kfree(ent); + ent = NULL; +diff -NurpP --minimal linux-3.2.34/fs/proc/inode.c linux-3.2.34-vs2.3.2.15/fs/proc/inode.c +--- linux-3.2.34/fs/proc/inode.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/proc/inode.c 2011-12-05 19:33:02.000000000 +0100 +@@ -442,6 +442,8 @@ struct inode *proc_get_inode(struct supe + inode->i_uid = de->uid; + inode->i_gid = de->gid; + } ++ if (de->vx_flags) ++ PROC_I(inode)->vx_flags = de->vx_flags; + if (de->size) + inode->i_size = de->size; + if (de->nlink) +diff -NurpP --minimal linux-3.2.34/fs/proc/internal.h 
linux-3.2.34-vs2.3.2.15/fs/proc/internal.h +--- linux-3.2.34/fs/proc/internal.h 2011-07-22 11:18:06.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/proc/internal.h 2011-12-05 19:33:02.000000000 +0100 +@@ -10,6 +10,7 @@ + */ + + #include ++#include + + extern struct proc_dir_entry proc_root; + #ifdef CONFIG_PROC_SYSCTL +@@ -51,6 +52,9 @@ extern int proc_pid_status(struct seq_fi + struct pid *pid, struct task_struct *task); + extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); ++extern int proc_pid_nsproxy(struct seq_file *m, struct pid_namespace *ns, ++ struct pid *pid, struct task_struct *task); ++ + extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); + + extern const struct file_operations proc_maps_operations; +@@ -76,11 +80,16 @@ static inline struct pid *proc_pid(struc + return PROC_I(inode)->pid; + } + +-static inline struct task_struct *get_proc_task(struct inode *inode) ++static inline struct task_struct *get_proc_task_real(struct inode *inode) + { + return get_pid_task(proc_pid(inode), PIDTYPE_PID); + } + ++static inline struct task_struct *get_proc_task(struct inode *inode) ++{ ++ return vx_get_proc_task(inode, proc_pid(inode)); ++} ++ + static inline int proc_fd(struct inode *inode) + { + return PROC_I(inode)->fd; +diff -NurpP --minimal linux-3.2.34/fs/proc/loadavg.c linux-3.2.34-vs2.3.2.15/fs/proc/loadavg.c +--- linux-3.2.34/fs/proc/loadavg.c 2009-09-10 15:26:23.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/proc/loadavg.c 2011-12-05 19:33:02.000000000 +0100 +@@ -12,15 +12,27 @@ + + static int loadavg_proc_show(struct seq_file *m, void *v) + { ++ unsigned long running; ++ unsigned int threads; + unsigned long avnrun[3]; + + get_avenrun(avnrun, FIXED_1/200, 0); + ++ if (vx_flags(VXF_VIRT_LOAD, 0)) { ++ struct vx_info *vxi = current_vx_info(); ++ ++ running = atomic_read(&vxi->cvirt.nr_running); ++ threads = atomic_read(&vxi->cvirt.nr_threads); ++ } else { ++ running = nr_running(); ++ threads = nr_threads; ++ } ++ + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), + LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), + LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), +- nr_running(), nr_threads, ++ running, threads, + task_active_pid_ns(current)->last_pid); + return 0; + } +diff -NurpP --minimal linux-3.2.34/fs/proc/meminfo.c linux-3.2.34-vs2.3.2.15/fs/proc/meminfo.c +--- linux-3.2.34/fs/proc/meminfo.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/proc/meminfo.c 2011-12-15 01:11:32.000000000 +0100 +@@ -39,7 +39,8 @@ static int meminfo_proc_show(struct seq_ + allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100) + total_swap_pages; + +- cached = global_page_state(NR_FILE_PAGES) - ++ cached = vx_flags(VXF_VIRT_MEM, 0) ? 
++ vx_vsi_cached(&i) : global_page_state(NR_FILE_PAGES) - + total_swapcache_pages - i.bufferram; + if (cached < 0) + cached = 0; +diff -NurpP --minimal linux-3.2.34/fs/proc/root.c linux-3.2.34-vs2.3.2.15/fs/proc/root.c +--- linux-3.2.34/fs/proc/root.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/proc/root.c 2012-01-09 16:19:31.000000000 +0100 +@@ -18,9 +18,14 @@ + #include + #include + #include ++#include + + #include "internal.h" + ++struct proc_dir_entry *proc_virtual; ++ ++extern void proc_vx_init(void); ++ + static int proc_test_super(struct super_block *sb, void *data) + { + return sb->s_fs_info == data; +@@ -123,6 +128,7 @@ void __init proc_root_init(void) + #endif + proc_mkdir("bus", NULL); + proc_sys_init(); ++ proc_vx_init(); + } + + static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat +@@ -190,6 +196,7 @@ struct proc_dir_entry proc_root = { + .proc_iops = &proc_root_inode_operations, + .proc_fops = &proc_root_operations, + .parent = &proc_root, ++ .vx_flags = IATTR_ADMIN | IATTR_WATCH, + .name = "/proc", + }; + +diff -NurpP --minimal linux-3.2.34/fs/proc/stat.c linux-3.2.34-vs2.3.2.15/fs/proc/stat.c +--- linux-3.2.34/fs/proc/stat.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/proc/stat.c 2012-11-06 18:08:24.000000000 +0100 +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -72,6 +73,10 @@ static int show_stat(struct seq_file *p, + irq = softirq = steal = cputime64_zero; + guest = guest_nice = cputime64_zero; + getboottime(&boottime); ++ ++ if (vx_flags(VXF_VIRT_UPTIME, 0)) ++ vx_vsi_boottime(&boottime); ++ + jif = boottime.tv_sec; + + for_each_possible_cpu(i) { +diff -NurpP --minimal linux-3.2.34/fs/proc/uptime.c linux-3.2.34-vs2.3.2.15/fs/proc/uptime.c +--- linux-3.2.34/fs/proc/uptime.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/proc/uptime.c 2012-01-26 09:03:19.000000000 +0100 +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + #include + + static int uptime_proc_show(struct seq_file *m, void *v) +@@ -25,6 +26,10 @@ static int uptime_proc_show(struct seq_f + nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; ++ ++ if (vx_flags(VXF_VIRT_UPTIME, 0)) ++ vx_vsi_uptime(&uptime, &idle); ++ + seq_printf(m, "%lu.%02lu %lu.%02lu\n", + (unsigned long) uptime.tv_sec, + (uptime.tv_nsec / (NSEC_PER_SEC / 100)), +diff -NurpP --minimal linux-3.2.34/fs/quota/dquot.c linux-3.2.34-vs2.3.2.15/fs/quota/dquot.c +--- linux-3.2.34/fs/quota/dquot.c 2011-07-22 11:18:06.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/quota/dquot.c 2011-12-05 19:33:02.000000000 +0100 +@@ -1548,6 +1548,9 @@ int __dquot_alloc_space(struct inode *in + int reserve = flags & DQUOT_SPACE_RESERVE; + int nofail = flags & DQUOT_SPACE_NOFAIL; + ++ if ((ret = dl_alloc_space(inode, number))) ++ return ret; ++ + /* + * First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex +@@ -1602,6 +1605,9 @@ int dquot_alloc_inode(const struct inode + int cnt, ret = 0; + char warntype[MAXQUOTAS]; + ++ if ((ret = dl_alloc_inode(inode))) ++ return ret; ++ + /* First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) +@@ -1672,6 +1678,8 @@ void __dquot_free_space(struct inode *in + char warntype[MAXQUOTAS]; + int reserve = flags & DQUOT_SPACE_RESERVE; + 
++ dl_free_space(inode, number); ++ + /* First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) { +@@ -1710,6 +1718,8 @@ void dquot_free_inode(const struct inode + unsigned int cnt; + char warntype[MAXQUOTAS]; + ++ dl_free_inode(inode); ++ + /* First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) +diff -NurpP --minimal linux-3.2.34/fs/quota/quota.c linux-3.2.34-vs2.3.2.15/fs/quota/quota.c +--- linux-3.2.34/fs/quota/quota.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/quota/quota.c 2011-12-05 19:33:02.000000000 +0100 +@@ -8,6 +8,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -38,7 +39,7 @@ static int check_quotactl_permission(str + break; + /*FALLTHROUGH*/ + default: +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_QUOTA_CTL)) + return -EPERM; + } + +@@ -293,6 +294,46 @@ static int do_quotactl(struct super_bloc + } + } + ++#if defined(CONFIG_BLK_DEV_VROOT) || defined(CONFIG_BLK_DEV_VROOT_MODULE) ++ ++#include ++#include ++#include ++#include ++#include ++ ++static vroot_grb_func *vroot_get_real_bdev = NULL; ++ ++static DEFINE_SPINLOCK(vroot_grb_lock); ++ ++int register_vroot_grb(vroot_grb_func *func) { ++ int ret = -EBUSY; ++ ++ spin_lock(&vroot_grb_lock); ++ if (!vroot_get_real_bdev) { ++ vroot_get_real_bdev = func; ++ ret = 0; ++ } ++ spin_unlock(&vroot_grb_lock); ++ return ret; ++} ++EXPORT_SYMBOL(register_vroot_grb); ++ ++int unregister_vroot_grb(vroot_grb_func *func) { ++ int ret = -EINVAL; ++ ++ spin_lock(&vroot_grb_lock); ++ if (vroot_get_real_bdev) { ++ vroot_get_real_bdev = NULL; ++ ret = 0; ++ } ++ spin_unlock(&vroot_grb_lock); ++ return ret; ++} ++EXPORT_SYMBOL(unregister_vroot_grb); ++ ++#endif ++ + /* + * look up a superblock on which quota ops will be performed + * - use the name of a block device to find the superblock thereon +@@ -310,6 +351,22 @@ static struct super_block *quotactl_bloc + putname(tmp); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); ++#if defined(CONFIG_BLK_DEV_VROOT) || defined(CONFIG_BLK_DEV_VROOT_MODULE) ++ if (bdev && bdev->bd_inode && ++ imajor(bdev->bd_inode) == VROOT_MAJOR) { ++ struct block_device *bdnew = (void *)-EINVAL; ++ ++ if (vroot_get_real_bdev) ++ bdnew = vroot_get_real_bdev(bdev); ++ else ++ vxdprintk(VXD_CBIT(misc, 0), ++ "vroot_get_real_bdev not set"); ++ bdput(bdev); ++ if (IS_ERR(bdnew)) ++ return ERR_PTR(PTR_ERR(bdnew)); ++ bdev = bdnew; ++ } ++#endif + sb = get_super(bdev); + bdput(bdev); + if (!sb) +diff -NurpP --minimal linux-3.2.34/fs/reiserfs/file.c linux-3.2.34-vs2.3.2.15/fs/reiserfs/file.c +--- linux-3.2.34/fs/reiserfs/file.c 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/reiserfs/file.c 2011-12-05 19:33:02.000000000 +0100 +@@ -319,5 +319,6 @@ const struct inode_operations reiserfs_f + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, ++ .sync_flags = reiserfs_sync_flags, + .get_acl = reiserfs_get_acl, + }; +diff -NurpP --minimal linux-3.2.34/fs/reiserfs/inode.c linux-3.2.34-vs2.3.2.15/fs/reiserfs/inode.c +--- linux-3.2.34/fs/reiserfs/inode.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/reiserfs/inode.c 2012-11-06 18:08:24.000000000 +0100 +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + int reiserfs_commit_write(struct file *f, struct 
page *page, + unsigned from, unsigned to); +@@ -1131,6 +1132,8 @@ static void init_inode(struct inode *ino + struct buffer_head *bh; + struct item_head *ih; + __u32 rdev; ++ uid_t uid; ++ gid_t gid; + //int version = ITEM_VERSION_1; + + bh = PATH_PLAST_BUFFER(path); +@@ -1151,12 +1154,13 @@ static void init_inode(struct inode *ino + (struct stat_data_v1 *)B_I_PITEM(bh, ih); + unsigned long blocks; + ++ uid = sd_v1_uid(sd); ++ gid = sd_v1_gid(sd); ++ + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + set_inode_sd_version(inode, STAT_DATA_V1); + inode->i_mode = sd_v1_mode(sd); + set_nlink(inode, sd_v1_nlink(sd)); +- inode->i_uid = sd_v1_uid(sd); +- inode->i_gid = sd_v1_gid(sd); + inode->i_size = sd_v1_size(sd); + inode->i_atime.tv_sec = sd_v1_atime(sd); + inode->i_mtime.tv_sec = sd_v1_mtime(sd); +@@ -1198,11 +1202,12 @@ static void init_inode(struct inode *ino + // (directories and symlinks) + struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); + ++ uid = sd_v2_uid(sd); ++ gid = sd_v2_gid(sd); ++ + inode->i_mode = sd_v2_mode(sd); + set_nlink(inode, sd_v2_nlink(sd)); +- inode->i_uid = sd_v2_uid(sd); + inode->i_size = sd_v2_size(sd); +- inode->i_gid = sd_v2_gid(sd); + inode->i_mtime.tv_sec = sd_v2_mtime(sd); + inode->i_atime.tv_sec = sd_v2_atime(sd); + inode->i_ctime.tv_sec = sd_v2_ctime(sd); +@@ -1232,6 +1237,10 @@ static void init_inode(struct inode *ino + sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); + } + ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, 0); ++ + pathrelse(path); + if (S_ISREG(inode->i_mode)) { + inode->i_op = &reiserfs_file_inode_operations; +@@ -1254,13 +1263,15 @@ static void init_inode(struct inode *ino + static void inode2sd(void *sd, struct inode *inode, loff_t size) + { + struct stat_data *sd_v2 = (struct stat_data *)sd; ++ uid_t uid = TAGINO_UID(DX_TAG(inode), inode->i_uid, inode->i_tag); ++ gid_t gid = TAGINO_GID(DX_TAG(inode), inode->i_gid, inode->i_tag); + __u16 flags; + ++ set_sd_v2_uid(sd_v2, uid); ++ set_sd_v2_gid(sd_v2, gid); + set_sd_v2_mode(sd_v2, inode->i_mode); + set_sd_v2_nlink(sd_v2, inode->i_nlink); +- set_sd_v2_uid(sd_v2, inode->i_uid); + set_sd_v2_size(sd_v2, size); +- set_sd_v2_gid(sd_v2, inode->i_gid); + set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); + set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); + set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); +@@ -2872,14 +2883,19 @@ int reiserfs_commit_write(struct file *f + void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) + { + if (reiserfs_attrs(inode->i_sb)) { +- if (sd_attrs & REISERFS_SYNC_FL) +- inode->i_flags |= S_SYNC; +- else +- inode->i_flags &= ~S_SYNC; + if (sd_attrs & REISERFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; ++ if (sd_attrs & REISERFS_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; ++ else ++ inode->i_flags &= ~S_IXUNLINK; ++ ++ if (sd_attrs & REISERFS_SYNC_FL) ++ inode->i_flags |= S_SYNC; ++ else ++ inode->i_flags &= ~S_SYNC; + if (sd_attrs & REISERFS_APPEND_FL) + inode->i_flags |= S_APPEND; + else +@@ -2892,6 +2908,15 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, + REISERFS_I(inode)->i_flags |= i_nopack_mask; + else + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; ++ ++ if (sd_attrs & REISERFS_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ else ++ inode->i_vflags &= ~V_BARRIER; ++ if (sd_attrs & REISERFS_COW_FL) ++ inode->i_vflags |= V_COW; ++ else ++ inode->i_vflags &= ~V_COW; + } + } + +@@ -2902,6 
+2927,11 @@ void i_attrs_to_sd_attrs(struct inode *i + *sd_attrs |= REISERFS_IMMUTABLE_FL; + else + *sd_attrs &= ~REISERFS_IMMUTABLE_FL; ++ if (inode->i_flags & S_IXUNLINK) ++ *sd_attrs |= REISERFS_IXUNLINK_FL; ++ else ++ *sd_attrs &= ~REISERFS_IXUNLINK_FL; ++ + if (inode->i_flags & S_SYNC) + *sd_attrs |= REISERFS_SYNC_FL; + else +@@ -2914,6 +2944,15 @@ void i_attrs_to_sd_attrs(struct inode *i + *sd_attrs |= REISERFS_NOTAIL_FL; + else + *sd_attrs &= ~REISERFS_NOTAIL_FL; ++ ++ if (inode->i_vflags & V_BARRIER) ++ *sd_attrs |= REISERFS_BARRIER_FL; ++ else ++ *sd_attrs &= ~REISERFS_BARRIER_FL; ++ if (inode->i_vflags & V_COW) ++ *sd_attrs |= REISERFS_COW_FL; ++ else ++ *sd_attrs &= ~REISERFS_COW_FL; + } + } + +@@ -3159,7 +3198,8 @@ int reiserfs_setattr(struct dentry *dent + } + + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || +- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { ++ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid) || ++ (ia_valid & ATTR_TAG && attr->ia_tag != inode->i_tag)) { + struct reiserfs_transaction_handle th; + int jbegin_count = + 2 * +@@ -3188,6 +3228,9 @@ int reiserfs_setattr(struct dentry *dent + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; ++ if ((attr->ia_valid & ATTR_TAG) && ++ IS_TAGGED(inode)) ++ inode->i_tag = attr->ia_tag; + mark_inode_dirty(inode); + error = journal_end(&th, inode->i_sb, jbegin_count); + if (error) +diff -NurpP --minimal linux-3.2.34/fs/reiserfs/ioctl.c linux-3.2.34-vs2.3.2.15/fs/reiserfs/ioctl.c +--- linux-3.2.34/fs/reiserfs/ioctl.c 2011-05-22 16:17:53.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/reiserfs/ioctl.c 2011-12-05 19:33:02.000000000 +0100 +@@ -11,6 +11,21 @@ + #include + #include + ++ ++int reiserfs_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ __u16 sd_attrs = 0; ++ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ ++ i_attrs_to_sd_attrs(inode, &sd_attrs); ++ REISERFS_I(inode)->i_attrs = sd_attrs; ++ inode->i_ctime = CURRENT_TIME_SEC; ++ mark_inode_dirty(inode); ++ return 0; ++} ++ + /* + * reiserfs_ioctl - handler for ioctl for inode + * supported commands: +@@ -22,7 +37,7 @@ + long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_path.dentry->d_inode; +- unsigned int flags; ++ unsigned int flags, oldflags; + int err = 0; + + reiserfs_write_lock(inode->i_sb); +@@ -47,6 +62,7 @@ long reiserfs_ioctl(struct file *filp, u + + flags = REISERFS_I(inode)->i_attrs; + i_attrs_to_sd_attrs(inode, (__u16 *) & flags); ++ flags &= REISERFS_FL_USER_VISIBLE; + err = put_user(flags, (int __user *)arg); + break; + case REISERFS_IOC_SETFLAGS:{ +@@ -67,6 +83,10 @@ long reiserfs_ioctl(struct file *filp, u + err = -EFAULT; + goto setflags_out; + } ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } + /* + * Is it quota file? 
Do not allow user to mess with it + */ +@@ -91,6 +111,10 @@ long reiserfs_ioctl(struct file *filp, u + goto setflags_out; + } + } ++ ++ oldflags = REISERFS_I(inode)->i_attrs; ++ flags &= REISERFS_FL_USER_MODIFIABLE; ++ flags |= oldflags & ~REISERFS_FL_USER_MODIFIABLE; + sd_attrs_to_i_attrs(flags, inode); + REISERFS_I(inode)->i_attrs = flags; + inode->i_ctime = CURRENT_TIME_SEC; +diff -NurpP --minimal linux-3.2.34/fs/reiserfs/namei.c linux-3.2.34-vs2.3.2.15/fs/reiserfs/namei.c +--- linux-3.2.34/fs/reiserfs/namei.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/reiserfs/namei.c 2011-12-05 19:33:02.000000000 +0100 +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + #define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); } + #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); +@@ -362,6 +363,7 @@ static struct dentry *reiserfs_lookup(st + if (retval == IO_ERROR) { + return ERR_PTR(-EIO); + } ++ dx_propagate_tag(nd, inode); + + return d_splice_alias(inode, dentry); + } +diff -NurpP --minimal linux-3.2.34/fs/reiserfs/super.c linux-3.2.34-vs2.3.2.15/fs/reiserfs/super.c +--- linux-3.2.34/fs/reiserfs/super.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/reiserfs/super.c 2012-01-18 02:58:07.000000000 +0100 +@@ -903,6 +903,14 @@ static int reiserfs_parse_options(struct + {"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, + {"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, + #endif ++#ifndef CONFIG_TAGGING_NONE ++ {"tagxid",.setmask = 1 << REISERFS_TAGGED}, ++ {"tag",.setmask = 1 << REISERFS_TAGGED}, ++ {"notag",.clrmask = 1 << REISERFS_TAGGED}, ++#endif ++#ifdef CONFIG_PROPAGATE ++ {"tag",.arg_required = 'T',.values = NULL}, ++#endif + #ifdef CONFIG_REISERFS_FS_POSIX_ACL + {"acl",.setmask = 1 << REISERFS_POSIXACL}, + {"noacl",.clrmask = 1 << REISERFS_POSIXACL}, +@@ -1213,6 +1221,14 @@ static int reiserfs_remount(struct super + handle_quota_files(s, qf_names, &qfmt); + #endif + ++ if ((mount_options & (1 << REISERFS_TAGGED)) && ++ !(s->s_flags & MS_TAGGED)) { ++ reiserfs_warning(s, "super-vs01", ++ "reiserfs: tagging not permitted on remount."); ++ err = -EINVAL; ++ goto out_err; ++ } ++ + handle_attrs(s); + + /* Add options that are safe here */ +@@ -1696,6 +1712,10 @@ static int reiserfs_fill_super(struct su + goto error; + } + ++ /* map mount option tagxid */ ++ if (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_TAGGED)) ++ s->s_flags |= MS_TAGGED; ++ + rs = SB_DISK_SUPER_BLOCK(s); + /* Let's do basic sanity check to verify that underlying device is not + smaller than the filesystem. 
If the check fails then abort and scream, +diff -NurpP --minimal linux-3.2.34/fs/reiserfs/xattr.c linux-3.2.34-vs2.3.2.15/fs/reiserfs/xattr.c +--- linux-3.2.34/fs/reiserfs/xattr.c 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/reiserfs/xattr.c 2011-12-05 19:33:02.000000000 +0100 +@@ -40,6 +40,7 @@ + #include + #include + #include ++#include + #include + #include + #include +diff -NurpP --minimal linux-3.2.34/fs/stat.c linux-3.2.34-vs2.3.2.15/fs/stat.c +--- linux-3.2.34/fs/stat.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/stat.c 2012-10-22 12:59:52.000000000 +0200 +@@ -26,6 +26,7 @@ void generic_fillattr(struct inode *inod + stat->nlink = inode->i_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; ++ stat->tag = inode->i_tag; + stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); + stat->atime = inode->i_atime; +diff -NurpP --minimal linux-3.2.34/fs/statfs.c linux-3.2.34-vs2.3.2.15/fs/statfs.c +--- linux-3.2.34/fs/statfs.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/statfs.c 2011-12-05 19:33:02.000000000 +0100 +@@ -7,6 +7,8 @@ + #include + #include + #include ++#include ++#include + + static int flags_by_mnt(int mnt_flags) + { +@@ -59,6 +61,8 @@ int statfs_by_dentry(struct dentry *dent + retval = dentry->d_sb->s_op->statfs(dentry, buf); + if (retval == 0 && buf->f_frsize == 0) + buf->f_frsize = buf->f_bsize; ++ if (!vx_check(0, VS_ADMIN|VS_WATCH)) ++ vx_vsi_statfs(dentry->d_sb, buf); + return retval; + } + +diff -NurpP --minimal linux-3.2.34/fs/super.c linux-3.2.34-vs2.3.2.15/fs/super.c +--- linux-3.2.34/fs/super.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/super.c 2012-06-14 20:45:24.000000000 +0200 +@@ -32,6 +32,9 @@ + #include + #include + #include ++#include ++#include ++#include + #include "internal.h" + + +@@ -1100,6 +1103,13 @@ mount_fs(struct file_system_type *type, + WARN_ON(sb->s_bdi == &default_backing_dev_info); + sb->s_flags |= MS_BORN; + ++ error = -EPERM; ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_BINARY_MOUNT) && ++ !sb->s_bdev && ++ (sb->s_magic != PROC_SUPER_MAGIC) && ++ (sb->s_magic != DEVPTS_SUPER_MAGIC)) ++ goto out_sb; ++ + error = security_sb_kern_mount(sb, flags, secdata); + if (error) + goto out_sb; +diff -NurpP --minimal linux-3.2.34/fs/sysfs/mount.c linux-3.2.34-vs2.3.2.15/fs/sysfs/mount.c +--- linux-3.2.34/fs/sysfs/mount.c 2011-07-22 11:18:06.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/sysfs/mount.c 2011-12-05 19:33:02.000000000 +0100 +@@ -47,7 +47,7 @@ static int sysfs_fill_super(struct super + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; +- sb->s_magic = SYSFS_MAGIC; ++ sb->s_magic = SYSFS_SUPER_MAGIC; + sb->s_op = &sysfs_ops; + sb->s_time_gran = 1; + +diff -NurpP --minimal linux-3.2.34/fs/utimes.c linux-3.2.34-vs2.3.2.15/fs/utimes.c +--- linux-3.2.34/fs/utimes.c 2011-05-22 16:17:54.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/utimes.c 2011-12-05 19:33:02.000000000 +0100 +@@ -8,6 +8,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + +@@ -52,12 +54,18 @@ static int utimes_common(struct path *pa + { + int error; + struct iattr newattrs; +- struct inode *inode = path->dentry->d_inode; ++ struct inode *inode; + + error = mnt_want_write(path->mnt); + if (error) + goto out; + ++ error = cow_check_and_break(path); ++ if (error) ++ goto mnt_drop_write_and_out; ++ ++ inode = path->dentry->d_inode; ++ + if (times && times[0].tv_nsec == UTIME_NOW && + times[1].tv_nsec == UTIME_NOW) + 
times = NULL; +diff -NurpP --minimal linux-3.2.34/fs/xattr.c linux-3.2.34-vs2.3.2.15/fs/xattr.c +--- linux-3.2.34/fs/xattr.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/xattr.c 2011-12-05 19:33:02.000000000 +0100 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include + + +@@ -50,7 +51,7 @@ xattr_permission(struct inode *inode, co + * The trusted.* namespace can only be accessed by privileged users. + */ + if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_FS_TRUSTED)) + return (mask & MAY_WRITE) ? -EPERM : -ENODATA; + return 0; + } +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_dinode.h linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_dinode.h +--- linux-3.2.34/fs/xfs/xfs_dinode.h 2011-10-24 18:45:31.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_dinode.h 2011-12-05 19:33:02.000000000 +0100 +@@ -51,7 +51,9 @@ typedef struct xfs_dinode { + __be32 di_nlink; /* number of links to file */ + __be16 di_projid_lo; /* lower part of owner's project id */ + __be16 di_projid_hi; /* higher part owner's project id */ +- __u8 di_pad[6]; /* unused, zeroed space */ ++ __u8 di_pad[2]; /* unused, zeroed space */ ++ __be16 di_tag; /* context tagging */ ++ __be16 di_vflags; /* vserver specific flags */ + __be16 di_flushiter; /* incremented on flush */ + xfs_timestamp_t di_atime; /* time last accessed */ + xfs_timestamp_t di_mtime; /* time last modified */ +@@ -184,6 +186,8 @@ static inline void xfs_dinode_put_rdev(s + #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ + #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ + #define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ ++#define XFS_DIFLAG_IXUNLINK_BIT 15 /* Immutable inver on unlink */ ++ + #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) + #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) + #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) +@@ -199,6 +203,7 @@ static inline void xfs_dinode_put_rdev(s + #define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) + #define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) + #define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) ++#define XFS_DIFLAG_IXUNLINK (1 << XFS_DIFLAG_IXUNLINK_BIT) + + #ifdef CONFIG_XFS_RT + #define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) +@@ -211,6 +216,10 @@ static inline void xfs_dinode_put_rdev(s + XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ + XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ + XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ +- XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) ++ XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM | \ ++ XFS_DIFLAG_IXUNLINK) ++ ++#define XFS_DIVFLAG_BARRIER 0x01 ++#define XFS_DIVFLAG_COW 0x02 + + #endif /* __XFS_DINODE_H__ */ +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_fs.h linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_fs.h +--- linux-3.2.34/fs/xfs/xfs_fs.h 2011-10-24 18:45:31.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_fs.h 2011-12-05 19:33:02.000000000 +0100 +@@ -67,6 +67,9 @@ struct fsxattr { + #define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ + #define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ + #define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ ++#define XFS_XFLAG_IXUNLINK 0x00008000 /* immutable invert on unlink */ 
++#define XFS_XFLAG_BARRIER 0x10000000 /* chroot() barrier */ ++#define XFS_XFLAG_COW 0x20000000 /* copy on write mark */ + #define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ + + /* +@@ -302,7 +305,8 @@ typedef struct xfs_bstat { + #define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_forkoff; /* inode fork offset in bytes */ + __u16 bs_projid_hi; /* higher part of project id */ +- unsigned char bs_pad[10]; /* pad space, unused */ ++ unsigned char bs_pad[8]; /* pad space, unused */ ++ __u16 bs_tag; /* context tagging */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_ialloc.c linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_ialloc.c +--- linux-3.2.34/fs/xfs/xfs_ialloc.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_ialloc.c 2011-12-05 19:33:02.000000000 +0100 +@@ -37,7 +37,6 @@ + #include "xfs_error.h" + #include "xfs_bmap.h" + +- + /* + * Allocation group level functions. + */ +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_inode.c linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_inode.c +--- linux-3.2.34/fs/xfs/xfs_inode.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_inode.c 2011-12-15 01:11:32.000000000 +0100 +@@ -236,6 +236,7 @@ xfs_inotobp( + return 0; + } + ++#include + + /* + * This routine is called to map an inode to the buffer containing +@@ -634,15 +635,25 @@ xfs_iformat_btree( + STATIC void + xfs_dinode_from_disk( + xfs_icdinode_t *to, +- xfs_dinode_t *from) ++ xfs_dinode_t *from, ++ int tagged) + { ++ uint32_t uid, gid, tag; ++ + to->di_magic = be16_to_cpu(from->di_magic); + to->di_mode = be16_to_cpu(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = be16_to_cpu(from->di_onlink); +- to->di_uid = be32_to_cpu(from->di_uid); +- to->di_gid = be32_to_cpu(from->di_gid); ++ ++ uid = be32_to_cpu(from->di_uid); ++ gid = be32_to_cpu(from->di_gid); ++ tag = be16_to_cpu(from->di_tag); ++ ++ to->di_uid = INOTAG_UID(tagged, uid, gid); ++ to->di_gid = INOTAG_GID(tagged, uid, gid); ++ to->di_tag = INOTAG_TAG(tagged, uid, gid, tag); ++ + to->di_nlink = be32_to_cpu(from->di_nlink); + to->di_projid_lo = be16_to_cpu(from->di_projid_lo); + to->di_projid_hi = be16_to_cpu(from->di_projid_hi); +@@ -664,21 +675,26 @@ xfs_dinode_from_disk( + to->di_dmevmask = be32_to_cpu(from->di_dmevmask); + to->di_dmstate = be16_to_cpu(from->di_dmstate); + to->di_flags = be16_to_cpu(from->di_flags); ++ to->di_vflags = be16_to_cpu(from->di_vflags); + to->di_gen = be32_to_cpu(from->di_gen); + } + + void + xfs_dinode_to_disk( + xfs_dinode_t *to, +- xfs_icdinode_t *from) ++ xfs_icdinode_t *from, ++ int tagged) + { + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = cpu_to_be16(from->di_onlink); +- to->di_uid = cpu_to_be32(from->di_uid); +- to->di_gid = cpu_to_be32(from->di_gid); ++ ++ to->di_uid = cpu_to_be32(TAGINO_UID(tagged, from->di_uid, from->di_tag)); ++ to->di_gid = cpu_to_be32(TAGINO_GID(tagged, from->di_gid, from->di_tag)); ++ to->di_tag = cpu_to_be16(TAGINO_TAG(tagged, from->di_tag)); ++ + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); +@@ -700,12 +716,14 @@ xfs_dinode_to_disk( + to->di_dmevmask = 
cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); ++ to->di_vflags = cpu_to_be16(from->di_vflags); + to->di_gen = cpu_to_be32(from->di_gen); + } + + STATIC uint + _xfs_dic2xflags( +- __uint16_t di_flags) ++ __uint16_t di_flags, ++ __uint16_t di_vflags) + { + uint flags = 0; + +@@ -716,6 +734,8 @@ _xfs_dic2xflags( + flags |= XFS_XFLAG_PREALLOC; + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= XFS_XFLAG_IMMUTABLE; ++ if (di_flags & XFS_DIFLAG_IXUNLINK) ++ flags |= XFS_XFLAG_IXUNLINK; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= XFS_XFLAG_APPEND; + if (di_flags & XFS_DIFLAG_SYNC) +@@ -740,6 +760,10 @@ _xfs_dic2xflags( + flags |= XFS_XFLAG_FILESTREAM; + } + ++ if (di_vflags & XFS_DIVFLAG_BARRIER) ++ flags |= FS_BARRIER_FL; ++ if (di_vflags & XFS_DIVFLAG_COW) ++ flags |= FS_COW_FL; + return flags; + } + +@@ -749,7 +773,7 @@ xfs_ip2xflags( + { + xfs_icdinode_t *dic = &ip->i_d; + +- return _xfs_dic2xflags(dic->di_flags) | ++ return _xfs_dic2xflags(dic->di_flags, dic->di_vflags) | + (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); + } + +@@ -757,7 +781,8 @@ uint + xfs_dic2xflags( + xfs_dinode_t *dip) + { +- return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | ++ return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), ++ be16_to_cpu(dip->di_vflags)) | + (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); + } + +@@ -790,7 +815,6 @@ xfs_iread( + if (error) + return error; + dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); +- + /* + * If we got something that isn't an inode it means someone + * (nfs or dmi) has a stale handle. +@@ -813,7 +837,8 @@ xfs_iread( + * Otherwise, just get the truly permanent information. + */ + if (dip->di_mode) { +- xfs_dinode_from_disk(&ip->i_d, dip); ++ xfs_dinode_from_disk(&ip->i_d, dip, ++ mp->m_flags & XFS_MOUNT_TAGGED); + error = xfs_iformat(ip, dip); + if (error) { + #ifdef DEBUG +@@ -1008,6 +1033,7 @@ xfs_ialloc( + ASSERT(ip->i_d.di_nlink == nlink); + ip->i_d.di_uid = current_fsuid(); + ip->i_d.di_gid = current_fsgid(); ++ ip->i_d.di_tag = current_fstag(&ip->i_vnode); + xfs_set_projid(ip, prid); + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + +@@ -1068,6 +1094,7 @@ xfs_ialloc( + ip->i_d.di_dmevmask = 0; + ip->i_d.di_dmstate = 0; + ip->i_d.di_flags = 0; ++ ip->i_d.di_vflags = 0; + flags = XFS_ILOG_CORE; + switch (mode & S_IFMT) { + case S_IFIFO: +@@ -1842,6 +1869,7 @@ xfs_ifree( + } + ip->i_d.di_mode = 0; /* mark incore inode as free */ + ip->i_d.di_flags = 0; ++ ip->i_d.di_vflags = 0; + ip->i_d.di_dmevmask = 0; + ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ + ip->i_df.if_ext_max = +@@ -2723,7 +2751,8 @@ xfs_iflush_int( + * because if the inode is dirty at all the core must + * be. 
+ */ +- xfs_dinode_to_disk(dip, &ip->i_d); ++ xfs_dinode_to_disk(dip, &ip->i_d, ++ mp->m_flags & XFS_MOUNT_TAGGED); + + /* Wrap, we never let the log put out DI_MAX_FLUSH */ + if (ip->i_d.di_flushiter == DI_MAX_FLUSH) +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_inode.h linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_inode.h +--- linux-3.2.34/fs/xfs/xfs_inode.h 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_inode.h 2011-12-15 01:11:32.000000000 +0100 +@@ -135,7 +135,9 @@ typedef struct xfs_icdinode { + __uint32_t di_nlink; /* number of links to file */ + __uint16_t di_projid_lo; /* lower part of owner's project id */ + __uint16_t di_projid_hi; /* higher part of owner's project id */ +- __uint8_t di_pad[6]; /* unused, zeroed space */ ++ __uint8_t di_pad[2]; /* unused, zeroed space */ ++ __uint16_t di_tag; /* context tagging */ ++ __uint16_t di_vflags; /* vserver specific flags */ + __uint16_t di_flushiter; /* incremented on flush */ + xfs_ictimestamp_t di_atime; /* time last accessed */ + xfs_ictimestamp_t di_mtime; /* time last modified */ +@@ -536,7 +538,7 @@ int xfs_itobp(struct xfs_mount *, struc + int xfs_iread(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, uint); + void xfs_dinode_to_disk(struct xfs_dinode *, +- struct xfs_icdinode *); ++ struct xfs_icdinode *, int); + void xfs_idestroy_fork(struct xfs_inode *, int); + void xfs_idata_realloc(struct xfs_inode *, int, int); + void xfs_iroot_realloc(struct xfs_inode *, int, int); +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_ioctl.c linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_ioctl.c +--- linux-3.2.34/fs/xfs/xfs_ioctl.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_ioctl.c 2011-12-05 19:33:02.000000000 +0100 +@@ -28,7 +28,7 @@ + #include "xfs_bmap_btree.h" + #include "xfs_dinode.h" + #include "xfs_inode.h" +-#include "xfs_ioctl.h" ++// #include "xfs_ioctl.h" + #include "xfs_rtalloc.h" + #include "xfs_itable.h" + #include "xfs_error.h" +@@ -748,6 +748,10 @@ xfs_merge_ioc_xflags( + xflags |= XFS_XFLAG_IMMUTABLE; + else + xflags &= ~XFS_XFLAG_IMMUTABLE; ++ if (flags & FS_IXUNLINK_FL) ++ xflags |= XFS_XFLAG_IXUNLINK; ++ else ++ xflags &= ~XFS_XFLAG_IXUNLINK; + if (flags & FS_APPEND_FL) + xflags |= XFS_XFLAG_APPEND; + else +@@ -776,6 +780,8 @@ xfs_di2lxflags( + + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; ++ if (di_flags & XFS_DIFLAG_IXUNLINK) ++ flags |= FS_IXUNLINK_FL; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= FS_APPEND_FL; + if (di_flags & XFS_DIFLAG_SYNC) +@@ -836,6 +842,8 @@ xfs_set_diflags( + di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & XFS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; ++ if (xflags & XFS_XFLAG_IXUNLINK) ++ di_flags |= XFS_DIFLAG_IXUNLINK; + if (xflags & XFS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (xflags & XFS_XFLAG_SYNC) +@@ -878,6 +886,10 @@ xfs_diflags_to_linux( + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; ++ if (xflags & XFS_XFLAG_IXUNLINK) ++ inode->i_flags |= S_IXUNLINK; ++ else ++ inode->i_flags &= ~S_IXUNLINK; + if (xflags & XFS_XFLAG_APPEND) + inode->i_flags |= S_APPEND; + else +@@ -1370,10 +1382,18 @@ xfs_file_ioctl( + case XFS_IOC_FSGETXATTRA: + return xfs_ioc_fsgetxattr(ip, 1, arg); + case XFS_IOC_FSSETXATTR: ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -XFS_ERROR(EACCES); ++ } + return xfs_ioc_fssetxattr(ip, filp, arg); + case XFS_IOC_GETXFLAGS: + return xfs_ioc_getxflags(ip, arg); + case XFS_IOC_SETXFLAGS: ++ if 
(IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -XFS_ERROR(EACCES); ++ } + return xfs_ioc_setxflags(ip, filp, arg); + + case XFS_IOC_FSSETDM: { +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_ioctl.h linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_ioctl.h +--- linux-3.2.34/fs/xfs/xfs_ioctl.h 2011-10-24 18:45:31.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_ioctl.h 2011-12-05 19:33:02.000000000 +0100 +@@ -70,6 +70,12 @@ xfs_handle_to_dentry( + void __user *uhandle, + u32 hlen); + ++extern int ++xfs_sync_flags( ++ struct inode *inode, ++ int flags, ++ int vflags); ++ + extern long + xfs_file_ioctl( + struct file *filp, +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_iops.c linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_iops.c +--- linux-3.2.34/fs/xfs/xfs_iops.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_iops.c 2011-12-05 19:33:02.000000000 +0100 +@@ -30,6 +30,7 @@ + #include "xfs_bmap_btree.h" + #include "xfs_dinode.h" + #include "xfs_inode.h" ++#include "xfs_ioctl.h" + #include "xfs_bmap.h" + #include "xfs_rtalloc.h" + #include "xfs_error.h" +@@ -49,6 +50,7 @@ + #include + #include + #include ++#include + + /* + * Bring the timestamps in the XFS inode uptodate. +@@ -474,6 +476,7 @@ xfs_vn_getattr( + stat->nlink = ip->i_d.di_nlink; + stat->uid = ip->i_d.di_uid; + stat->gid = ip->i_d.di_gid; ++ stat->tag = ip->i_d.di_tag; + stat->ino = ip->i_ino; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; +@@ -1039,6 +1042,7 @@ static const struct inode_operations xfs + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .fiemap = xfs_vn_fiemap, ++ .sync_flags = xfs_sync_flags, + }; + + static const struct inode_operations xfs_dir_inode_operations = { +@@ -1064,6 +1068,7 @@ static const struct inode_operations xfs + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, ++ .sync_flags = xfs_sync_flags, + }; + + static const struct inode_operations xfs_dir_ci_inode_operations = { +@@ -1113,6 +1118,10 @@ xfs_diflags_to_iflags( + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; ++ if (ip->i_d.di_flags & XFS_DIFLAG_IXUNLINK) ++ inode->i_flags |= S_IXUNLINK; ++ else ++ inode->i_flags &= ~S_IXUNLINK; + if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + inode->i_flags |= S_APPEND; + else +@@ -1125,6 +1134,15 @@ xfs_diflags_to_iflags( + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; ++ ++ if (ip->i_d.di_vflags & XFS_DIVFLAG_BARRIER) ++ inode->i_vflags |= V_BARRIER; ++ else ++ inode->i_vflags &= ~V_BARRIER; ++ if (ip->i_d.di_vflags & XFS_DIVFLAG_COW) ++ inode->i_vflags |= V_COW; ++ else ++ inode->i_vflags &= ~V_COW; + } + + /* +@@ -1156,6 +1174,7 @@ xfs_setup_inode( + set_nlink(inode, ip->i_d.di_nlink); + inode->i_uid = ip->i_d.di_uid; + inode->i_gid = ip->i_d.di_gid; ++ inode->i_tag = ip->i_d.di_tag; + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_itable.c linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_itable.c +--- linux-3.2.34/fs/xfs/xfs_itable.c 2011-05-22 16:17:54.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_itable.c 2011-12-05 19:33:02.000000000 +0100 +@@ -98,6 +98,7 @@ xfs_bulkstat_one_int( + buf->bs_mode = dic->di_mode; + buf->bs_uid = dic->di_uid; + buf->bs_gid = dic->di_gid; ++ buf->bs_tag = dic->di_tag; + buf->bs_size = dic->di_size; + + /* +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_linux.h linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_linux.h +--- linux-3.2.34/fs/xfs/xfs_linux.h 2011-10-24 
18:45:31.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_linux.h 2011-12-05 19:33:02.000000000 +0100 +@@ -121,6 +121,7 @@ + + #define current_cpu() (raw_smp_processor_id()) + #define current_pid() (current->pid) ++#define current_fstag(vp) (dx_current_fstag((vp)->i_sb)) + #define current_test_flags(f) (current->flags & (f)) + #define current_set_flags_nested(sp, f) \ + (*(sp) = current->flags, current->flags |= (f)) +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_log_recover.c linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_log_recover.c +--- linux-3.2.34/fs/xfs/xfs_log_recover.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_log_recover.c 2012-11-18 21:11:16.000000000 +0100 +@@ -2344,7 +2344,8 @@ xlog_recover_inode_pass2( + } + + /* The core is in in-core format */ +- xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); ++ xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr, ++ mp->m_flags & XFS_MOUNT_TAGGED); + + /* the rest is in on-disk format */ + if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_mount.h linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_mount.h +--- linux-3.2.34/fs/xfs/xfs_mount.h 2011-10-24 18:45:31.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_mount.h 2011-12-05 19:33:02.000000000 +0100 +@@ -249,6 +249,7 @@ typedef struct xfs_mount { + allocator */ + #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ + ++#define XFS_MOUNT_TAGGED (1ULL << 31) /* context tagging */ + + /* + * Default minimum read and write sizes. +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_super.c linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_super.c +--- linux-3.2.34/fs/xfs/xfs_super.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_super.c 2012-01-09 16:19:31.000000000 +0100 +@@ -113,6 +113,9 @@ mempool_t *xfs_ioend_pool; + #define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ + #define MNTOPT_DISCARD "discard" /* Discard unused blocks */ + #define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ ++#define MNTOPT_TAGXID "tagxid" /* context tagging for inodes */ ++#define MNTOPT_TAGGED "tag" /* context tagging for inodes */ ++#define MNTOPT_NOTAGTAG "notag" /* do not use context tagging */ + + /* + * Table driven mount option parser. +@@ -121,10 +124,14 @@ mempool_t *xfs_ioend_pool; + * in the future, too. 
+ */ + enum { ++ Opt_tag, Opt_notag, + Opt_barrier, Opt_nobarrier, Opt_err + }; + + static const match_table_t tokens = { ++ {Opt_tag, "tagxid"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_err, NULL} +@@ -374,6 +381,19 @@ xfs_parseargs( + } else if (!strcmp(this_char, "irixsgid")) { + xfs_warn(mp, + "irixsgid is now a sysctl(2) variable, option is deprecated."); ++#ifndef CONFIG_TAGGING_NONE ++ } else if (!strcmp(this_char, MNTOPT_TAGGED)) { ++ mp->m_flags |= XFS_MOUNT_TAGGED; ++ } else if (!strcmp(this_char, MNTOPT_NOTAGTAG)) { ++ mp->m_flags &= ~XFS_MOUNT_TAGGED; ++ } else if (!strcmp(this_char, MNTOPT_TAGXID)) { ++ mp->m_flags |= XFS_MOUNT_TAGGED; ++#endif ++#ifdef CONFIG_PROPAGATE ++ } else if (!strcmp(this_char, MNTOPT_TAGGED)) { ++ /* use value */ ++ mp->m_flags |= XFS_MOUNT_TAGGED; ++#endif + } else { + xfs_warn(mp, "unknown mount option [%s].", this_char); + return EINVAL; +@@ -1138,6 +1158,16 @@ xfs_fs_remount( + case Opt_nobarrier: + mp->m_flags &= ~XFS_MOUNT_BARRIER; + break; ++ case Opt_tag: ++ if (!(sb->s_flags & MS_TAGGED)) { ++ printk(KERN_INFO ++ "XFS: %s: tagging not permitted on remount.\n", ++ sb->s_id); ++ return -EINVAL; ++ } ++ break; ++ case Opt_notag: ++ break; + default: + /* + * Logically we would return an error here to prevent +@@ -1353,6 +1383,9 @@ xfs_fs_fill_super( + if (error) + goto out_free_sb; + ++ if (mp->m_flags & XFS_MOUNT_TAGGED) ++ sb->s_flags |= MS_TAGGED; ++ + /* + * we must configure the block size in the superblock before we run the + * full mount process as the mount process can lookup and cache inodes. +diff -NurpP --minimal linux-3.2.34/fs/xfs/xfs_vnodeops.c linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_vnodeops.c +--- linux-3.2.34/fs/xfs/xfs_vnodeops.c 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/fs/xfs/xfs_vnodeops.c 2012-03-14 10:19:18.000000000 +0100 +@@ -106,6 +106,77 @@ xfs_readlink_bmap( + return error; + } + ++ ++STATIC void ++xfs_get_inode_flags( ++ xfs_inode_t *ip) ++{ ++ struct inode *inode = VFS_I(ip); ++ unsigned int flags = inode->i_flags; ++ unsigned int vflags = inode->i_vflags; ++ ++ if (flags & S_IMMUTABLE) ++ ip->i_d.di_flags |= XFS_DIFLAG_IMMUTABLE; ++ else ++ ip->i_d.di_flags &= ~XFS_DIFLAG_IMMUTABLE; ++ if (flags & S_IXUNLINK) ++ ip->i_d.di_flags |= XFS_DIFLAG_IXUNLINK; ++ else ++ ip->i_d.di_flags &= ~XFS_DIFLAG_IXUNLINK; ++ ++ if (vflags & V_BARRIER) ++ ip->i_d.di_vflags |= XFS_DIVFLAG_BARRIER; ++ else ++ ip->i_d.di_vflags &= ~XFS_DIVFLAG_BARRIER; ++ if (vflags & V_COW) ++ ip->i_d.di_vflags |= XFS_DIVFLAG_COW; ++ else ++ ip->i_d.di_vflags &= ~XFS_DIVFLAG_COW; ++} ++ ++int ++xfs_sync_flags( ++ struct inode *inode, ++ int flags, ++ int vflags) ++{ ++ struct xfs_inode *ip = XFS_I(inode); ++ struct xfs_mount *mp = ip->i_mount; ++ struct xfs_trans *tp; ++ unsigned int lock_flags = 0; ++ int code; ++ ++ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); ++ code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); ++ if (code) ++ goto error_out; ++ ++ xfs_ilock(ip, XFS_ILOCK_EXCL); ++ xfs_trans_ijoin(tp, ip, 0); ++ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ xfs_get_inode_flags(ip); ++ ++ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); ++ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); ++ ++ XFS_STATS_INC(xs_ig_attrchg); ++ ++ if (mp->m_flags & XFS_MOUNT_WSYNC) ++ xfs_trans_set_sync(tp); ++ code = xfs_trans_commit(tp, 0); ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ return code; ++ ++error_out: ++ xfs_trans_cancel(tp, 0); ++ if 
(lock_flags) ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ return code; ++} ++ ++ + int + xfs_readlink( + xfs_inode_t *ip, +diff -NurpP --minimal linux-3.2.34/include/linux/Kbuild linux-3.2.34-vs2.3.2.15/include/linux/Kbuild +--- linux-3.2.34/include/linux/Kbuild 2012-11-18 18:42:22.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/Kbuild 2012-08-13 12:40:51.000000000 +0200 +@@ -17,6 +17,7 @@ header-y += netfilter_bridge/ + header-y += netfilter_ipv4/ + header-y += netfilter_ipv6/ + header-y += usb/ ++header-y += vserver/ + header-y += wimax/ + + objhdr-y += version.h +diff -NurpP --minimal linux-3.2.34/include/linux/capability.h linux-3.2.34-vs2.3.2.15/include/linux/capability.h +--- linux-3.2.34/include/linux/capability.h 2012-01-09 16:14:56.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/capability.h 2011-12-05 19:33:02.000000000 +0100 +@@ -280,6 +280,7 @@ struct cpu_vfs_cap_data { + arbitrary SCSI commands */ + /* Allow setting encryption key on loopback filesystem */ + /* Allow setting zone reclaim policy */ ++/* Allow the selection of a security context */ + + #define CAP_SYS_ADMIN 21 + +@@ -363,7 +364,12 @@ struct cpu_vfs_cap_data { + + #define CAP_LAST_CAP CAP_WAKE_ALARM + +-#define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) ++/* Allow context manipulations */ ++/* Allow changing context info on files */ ++ ++#define CAP_CONTEXT 63 ++ ++#define cap_valid(x) ((x) >= 0 && ((x) <= CAP_LAST_CAP || (x) == CAP_CONTEXT)) + + /* + * Bit location of each capability (used by user-space library and kernel) +diff -NurpP --minimal linux-3.2.34/include/linux/cred.h linux-3.2.34-vs2.3.2.15/include/linux/cred.h +--- linux-3.2.34/include/linux/cred.h 2011-10-24 18:45:31.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/linux/cred.h 2011-12-05 19:33:02.000000000 +0100 +@@ -156,6 +156,7 @@ extern void exit_creds(struct task_struc + extern int copy_creds(struct task_struct *, unsigned long); + extern const struct cred *get_task_cred(struct task_struct *); + extern struct cred *cred_alloc_blank(void); ++extern struct cred *__prepare_creds(const struct cred *); + extern struct cred *prepare_creds(void); + extern struct cred *prepare_exec_creds(void); + extern int commit_creds(struct cred *); +@@ -209,6 +210,31 @@ static inline void validate_process_cred + } + #endif + ++static inline void set_cred_subscribers(struct cred *cred, int n) ++{ ++#ifdef CONFIG_DEBUG_CREDENTIALS ++ atomic_set(&cred->subscribers, n); ++#endif ++} ++ ++static inline int read_cred_subscribers(const struct cred *cred) ++{ ++#ifdef CONFIG_DEBUG_CREDENTIALS ++ return atomic_read(&cred->subscribers); ++#else ++ return 0; ++#endif ++} ++ ++static inline void alter_cred_subscribers(const struct cred *_cred, int n) ++{ ++#ifdef CONFIG_DEBUG_CREDENTIALS ++ struct cred *cred = (struct cred *) _cred; ++ ++ atomic_add(n, &cred->subscribers); ++#endif ++} ++ + /** + * get_new_cred - Get a reference on a new set of credentials + * @cred: The new credentials to reference +diff -NurpP --minimal linux-3.2.34/include/linux/devpts_fs.h linux-3.2.34-vs2.3.2.15/include/linux/devpts_fs.h +--- linux-3.2.34/include/linux/devpts_fs.h 2008-12-25 00:26:37.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/devpts_fs.h 2011-12-05 19:33:02.000000000 +0100 +@@ -45,5 +45,4 @@ static inline void devpts_pty_kill(struc + + #endif + +- + #endif /* _LINUX_DEVPTS_FS_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/ext2_fs.h linux-3.2.34-vs2.3.2.15/include/linux/ext2_fs.h +--- linux-3.2.34/include/linux/ext2_fs.h 2012-01-09 
16:14:56.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/ext2_fs.h 2011-12-05 19:33:02.000000000 +0100 +@@ -190,8 +190,12 @@ struct ext2_group_desc + #define EXT2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */ + #define EXT2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */ + #define EXT2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ ++#define EXT2_IXUNLINK_FL FS_IXUNLINK_FL /* Immutable invert on unlink */ + #define EXT2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ + ++#define EXT2_BARRIER_FL FS_BARRIER_FL /* Barrier for chroot() */ ++#define EXT2_COW_FL FS_COW_FL /* Copy on Write marker */ ++ + #define EXT2_FL_USER_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ + #define EXT2_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ + +@@ -275,7 +279,8 @@ struct ext2_inode { + __u16 i_pad1; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ +- __u32 l_i_reserved2; ++ __le16 l_i_tag; /* Context Tag */ ++ __u16 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ +@@ -304,6 +309,7 @@ struct ext2_inode { + #define i_gid_low i_gid + #define i_uid_high osd2.linux2.l_i_uid_high + #define i_gid_high osd2.linux2.l_i_gid_high ++#define i_raw_tag osd2.linux2.l_i_tag + #define i_reserved2 osd2.linux2.l_i_reserved2 + #endif + +@@ -348,6 +354,7 @@ struct ext2_inode { + #define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ + #define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ + #define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ ++#define EXT2_MOUNT_TAGGED (1<<24) /* Enable Context Tags */ + + + #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt +diff -NurpP --minimal linux-3.2.34/include/linux/ext3_fs.h linux-3.2.34-vs2.3.2.15/include/linux/ext3_fs.h +--- linux-3.2.34/include/linux/ext3_fs.h 2012-01-09 16:14:56.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/ext3_fs.h 2011-12-05 19:33:02.000000000 +0100 +@@ -173,10 +173,14 @@ struct ext3_group_desc + #define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ ++#define EXT3_IXUNLINK_FL 0x08000000 /* Immutable invert on unlink */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +-#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ ++#define EXT3_BARRIER_FL 0x04000000 /* Barrier for chroot() */ ++#define EXT3_COW_FL 0x20000000 /* Copy on Write marker */ ++ ++#define EXT3_FL_USER_VISIBLE 0x0103DFFF /* User visible flags */ ++#define EXT3_FL_USER_MODIFIABLE 0x010380FF /* User modifiable flags */ + + /* Flags that should be inherited by new inodes from their parent. 
*/ + #define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ +@@ -312,7 +316,8 @@ struct ext3_inode { + __u16 i_pad1; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ +- __u32 l_i_reserved2; ++ __le16 l_i_tag; /* Context Tag */ ++ __u16 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ +@@ -343,6 +348,7 @@ struct ext3_inode { + #define i_gid_low i_gid + #define i_uid_high osd2.linux2.l_i_uid_high + #define i_gid_high osd2.linux2.l_i_gid_high ++#define i_raw_tag osd2.linux2.l_i_tag + #define i_reserved2 osd2.linux2.l_i_reserved2 + + #elif defined(__GNU__) +@@ -405,6 +411,7 @@ struct ext3_inode { + #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ + #define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write + * error in ordered mode */ ++#define EXT3_MOUNT_TAGGED (1<<24) /* Enable Context Tags */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -918,6 +925,7 @@ extern void ext3_get_inode_flags(struct + extern void ext3_set_aops(struct inode *inode); + extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); ++extern int ext3_sync_flags(struct inode *, int, int); + + /* ioctl.c */ + extern long ext3_ioctl(struct file *, unsigned int, unsigned long); +diff -NurpP --minimal linux-3.2.34/include/linux/fs.h linux-3.2.34-vs2.3.2.15/include/linux/fs.h +--- linux-3.2.34/include/linux/fs.h 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/fs.h 2012-08-13 12:40:51.000000000 +0200 +@@ -210,6 +210,9 @@ struct inodes_stat_t { + #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ + #define MS_I_VERSION (1<<23) /* Update inode I_version field */ + #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ ++#define MS_TAGGED (1<<25) /* use generic inode tagging */ ++#define MS_TAGID (1<<26) /* use specific tag for this mount */ ++#define MS_NOTAGCHECK (1<<27) /* don't check tags */ + #define MS_NOSEC (1<<28) + #define MS_BORN (1<<29) + #define MS_ACTIVE (1<<30) +@@ -241,6 +244,14 @@ struct inodes_stat_t { + #define S_IMA 1024 /* Inode has an associated IMA struct */ + #define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ + #define S_NOSEC 4096 /* no suid or xattr security attributes */ ++#define S_IXUNLINK 8192 /* Immutable Invert on unlink */ ++ ++/* Linux-VServer related Inode flags */ ++ ++#define V_VALID 1 ++#define V_XATTR 2 ++#define V_BARRIER 4 /* Barrier for chroot() */ ++#define V_COW 8 /* Copy on Write */ + + /* + * Note that nosuid etc flags are inode-specific: setting some file-system +@@ -263,12 +274,15 @@ struct inodes_stat_t { + #define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ + ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) + #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) +-#define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) +-#define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION) ++#define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) ++#define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION) ++#define IS_TAGGED(inode) __IS_FLG(inode, MS_TAGGED) + + #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) + #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) + #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) ++#define IS_IXUNLINK(inode) ((inode)->i_flags & S_IXUNLINK) ++#define IS_IXORUNLINK(inode) ((IS_IXUNLINK(inode) ? 
S_IMMUTABLE : 0) ^ IS_IMMUTABLE(inode)) + #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) + + #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) +@@ -279,6 +293,16 @@ struct inodes_stat_t { + #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) + #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) + ++#define IS_BARRIER(inode) (S_ISDIR((inode)->i_mode) && ((inode)->i_vflags & V_BARRIER)) ++ ++#ifdef CONFIG_VSERVER_COWBL ++# define IS_COW(inode) (IS_IXUNLINK(inode) && IS_IMMUTABLE(inode)) ++# define IS_COW_LINK(inode) (S_ISREG((inode)->i_mode) && ((inode)->i_nlink > 1)) ++#else ++# define IS_COW(inode) (0) ++# define IS_COW_LINK(inode) (0) ++#endif ++ + /* the read-only stuff doesn't really belong here, but any other place is + probably as bad and I don't want to create yet another include file. */ + +@@ -364,11 +388,14 @@ struct inodes_stat_t { + #define FS_EXTENT_FL 0x00080000 /* Extents */ + #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ + #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ ++#define FS_IXUNLINK_FL 0x08000000 /* Immutable invert on unlink */ + #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ + +-#define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +-#define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ ++#define FS_BARRIER_FL 0x04000000 /* Barrier for chroot() */ ++#define FS_COW_FL 0x20000000 /* Copy on Write marker */ + ++#define FS_FL_USER_VISIBLE 0x0103DFFF /* User visible flags */ ++#define FS_FL_USER_MODIFIABLE 0x010380FF /* User modifiable flags */ + + #define SYNC_FILE_RANGE_WAIT_BEFORE 1 + #define SYNC_FILE_RANGE_WRITE 2 +@@ -449,6 +476,7 @@ typedef void (dio_iodone_t)(struct kiocb + #define ATTR_KILL_PRIV (1 << 14) + #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ + #define ATTR_TIMES_SET (1 << 16) ++#define ATTR_TAG (1 << 17) + + /* + * This is the Inode Attributes structure, used for notify_change(). It +@@ -464,6 +492,7 @@ struct iattr { + umode_t ia_mode; + uid_t ia_uid; + gid_t ia_gid; ++ tag_t ia_tag; + loff_t ia_size; + struct timespec ia_atime; + struct timespec ia_mtime; +@@ -477,6 +506,9 @@ struct iattr { + struct file *ia_file; + }; + ++#define ATTR_FLAG_BARRIER 512 /* Barrier for chroot() */ ++#define ATTR_FLAG_IXUNLINK 1024 /* Immutable invert on unlink */ ++ + /* + * Includes for diskquotas. 
+ */ +@@ -755,7 +787,9 @@ struct inode { + unsigned short i_opflags; + uid_t i_uid; + gid_t i_gid; +- unsigned int i_flags; ++ tag_t i_tag; ++ unsigned short i_flags; ++ unsigned short i_vflags; + + #ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *i_acl; +@@ -784,6 +818,7 @@ struct inode { + unsigned int __i_nlink; + }; + dev_t i_rdev; ++ dev_t i_mdev; + struct timespec i_atime; + struct timespec i_mtime; + struct timespec i_ctime; +@@ -921,12 +956,12 @@ static inline void i_size_write(struct i + + static inline unsigned iminor(const struct inode *inode) + { +- return MINOR(inode->i_rdev); ++ return MINOR(inode->i_mdev); + } + + static inline unsigned imajor(const struct inode *inode) + { +- return MAJOR(inode->i_rdev); ++ return MAJOR(inode->i_mdev); + } + + extern struct block_device *I_BDEV(struct inode *inode); +@@ -993,6 +1028,7 @@ struct file { + loff_t f_pos; + struct fown_struct f_owner; + const struct cred *f_cred; ++ xid_t f_xid; + struct file_ra_state f_ra; + + u64 f_version; +@@ -1140,6 +1176,7 @@ struct file_lock { + struct file *fl_file; + loff_t fl_start; + loff_t fl_end; ++ xid_t fl_xid; + + struct fasync_struct * fl_fasync; /* for lease break notifications */ + /* for lease breaks: */ +@@ -1645,6 +1682,7 @@ struct inode_operations { + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr) (struct dentry *, char *, size_t); + int (*removexattr) (struct dentry *, const char *); ++ int (*sync_flags) (struct inode *, int, int); + void (*truncate_range)(struct inode *, loff_t, loff_t); + int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, + u64 len); +@@ -1664,6 +1702,7 @@ extern ssize_t vfs_readv(struct file *, + unsigned long, loff_t *); + extern ssize_t vfs_writev(struct file *, const struct iovec __user *, + unsigned long, loff_t *); ++ssize_t vfs_sendfile(struct file *, struct file *, loff_t *, size_t, loff_t); + + struct super_operations { + struct inode *(*alloc_inode)(struct super_block *sb); +@@ -2552,6 +2591,7 @@ extern int dcache_dir_open(struct inode + extern int dcache_dir_close(struct inode *, struct file *); + extern loff_t dcache_dir_lseek(struct file *, loff_t, int); + extern int dcache_readdir(struct file *, void *, filldir_t); ++extern int dcache_readdir_filter(struct file *, void *, filldir_t, int (*)(struct dentry *)); + extern int simple_setattr(struct dentry *, struct iattr *); + extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); + extern int simple_statfs(struct dentry *, struct kstatfs *); +diff -NurpP --minimal linux-3.2.34/include/linux/gfs2_ondisk.h linux-3.2.34-vs2.3.2.15/include/linux/gfs2_ondisk.h +--- linux-3.2.34/include/linux/gfs2_ondisk.h 2010-07-07 18:31:55.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/linux/gfs2_ondisk.h 2011-12-05 19:33:02.000000000 +0100 +@@ -211,6 +211,9 @@ enum { + gfs2fl_NoAtime = 7, + gfs2fl_Sync = 8, + gfs2fl_System = 9, ++ gfs2fl_IXUnlink = 16, ++ gfs2fl_Barrier = 17, ++ gfs2fl_Cow = 18, + gfs2fl_TruncInProg = 29, + gfs2fl_InheritDirectio = 30, + gfs2fl_InheritJdata = 31, +@@ -227,6 +230,9 @@ enum { + #define GFS2_DIF_NOATIME 0x00000080 + #define GFS2_DIF_SYNC 0x00000100 + #define GFS2_DIF_SYSTEM 0x00000200 /* New in gfs2 */ ++#define GFS2_DIF_IXUNLINK 0x00010000 ++#define GFS2_DIF_BARRIER 0x00020000 ++#define GFS2_DIF_COW 0x00040000 + #define GFS2_DIF_TRUNC_IN_PROG 0x20000000 /* New in gfs2 */ + #define GFS2_DIF_INHERIT_DIRECTIO 0x40000000 + #define GFS2_DIF_INHERIT_JDATA 0x80000000 +diff -NurpP --minimal 
linux-3.2.34/include/linux/if_tun.h linux-3.2.34-vs2.3.2.15/include/linux/if_tun.h +--- linux-3.2.34/include/linux/if_tun.h 2010-08-02 16:52:54.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/linux/if_tun.h 2011-12-05 19:33:02.000000000 +0100 +@@ -53,6 +53,7 @@ + #define TUNDETACHFILTER _IOW('T', 214, struct sock_fprog) + #define TUNGETVNETHDRSZ _IOR('T', 215, int) + #define TUNSETVNETHDRSZ _IOW('T', 216, int) ++#define TUNSETNID _IOW('T', 217, int) + + /* TUNSETIFF ifr flags */ + #define IFF_TUN 0x0001 +diff -NurpP --minimal linux-3.2.34/include/linux/init_task.h linux-3.2.34-vs2.3.2.15/include/linux/init_task.h +--- linux-3.2.34/include/linux/init_task.h 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/init_task.h 2012-08-13 12:42:19.000000000 +0200 +@@ -211,6 +211,10 @@ extern struct task_group root_task_group + INIT_TRACE_RECURSION \ + INIT_TASK_RCU_PREEMPT(tsk) \ + INIT_CPUSET_SEQ \ ++ .xid = 0, \ ++ .vx_info = NULL, \ ++ .nid = 0, \ ++ .nx_info = NULL, \ + } + + +diff -NurpP --minimal linux-3.2.34/include/linux/ipc.h linux-3.2.34-vs2.3.2.15/include/linux/ipc.h +--- linux-3.2.34/include/linux/ipc.h 2009-12-03 20:02:55.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/ipc.h 2011-12-05 19:33:02.000000000 +0100 +@@ -91,6 +91,7 @@ struct kern_ipc_perm + key_t key; + uid_t uid; + gid_t gid; ++ xid_t xid; + uid_t cuid; + gid_t cgid; + mode_t mode; +diff -NurpP --minimal linux-3.2.34/include/linux/ipc_namespace.h linux-3.2.34-vs2.3.2.15/include/linux/ipc_namespace.h +--- linux-3.2.34/include/linux/ipc_namespace.h 2011-10-24 18:45:32.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/linux/ipc_namespace.h 2011-12-05 19:33:02.000000000 +0100 +@@ -101,7 +101,8 @@ static inline int mq_init_ns(struct ipc_ + + #if defined(CONFIG_IPC_NS) + extern struct ipc_namespace *copy_ipcs(unsigned long flags, +- struct task_struct *tsk); ++ struct ipc_namespace *old_ns, ++ struct user_namespace *user_ns); + static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) + { + if (ns) +@@ -112,12 +113,13 @@ static inline struct ipc_namespace *get_ + extern void put_ipc_ns(struct ipc_namespace *ns); + #else + static inline struct ipc_namespace *copy_ipcs(unsigned long flags, +- struct task_struct *tsk) ++ struct ipc_namespace *old_ns, ++ struct user_namespace *user_ns) + { + if (flags & CLONE_NEWIPC) + return ERR_PTR(-EINVAL); + +- return tsk->nsproxy->ipc_ns; ++ return old_ns; + } + + static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) +diff -NurpP --minimal linux-3.2.34/include/linux/loop.h linux-3.2.34-vs2.3.2.15/include/linux/loop.h +--- linux-3.2.34/include/linux/loop.h 2012-01-09 16:14:58.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/loop.h 2011-12-05 19:33:02.000000000 +0100 +@@ -45,6 +45,7 @@ struct loop_device { + struct loop_func_table *lo_encryption; + __u32 lo_init[2]; + uid_t lo_key_owner; /* Who set the key */ ++ xid_t lo_xid; + int (*ioctl)(struct loop_device *, int cmd, + unsigned long arg); + +diff -NurpP --minimal linux-3.2.34/include/linux/magic.h linux-3.2.34-vs2.3.2.15/include/linux/magic.h +--- linux-3.2.34/include/linux/magic.h 2012-01-09 16:14:58.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/magic.h 2011-12-05 19:33:02.000000000 +0100 +@@ -3,7 +3,7 @@ + + #define ADFS_SUPER_MAGIC 0xadf5 + #define AFFS_SUPER_MAGIC 0xadff +-#define AFS_SUPER_MAGIC 0x5346414F ++#define AFS_SUPER_MAGIC 0x5346414F + #define AUTOFS_SUPER_MAGIC 0x0187 + #define CODA_SUPER_MAGIC 0x73757245 + #define CRAMFS_MAGIC 
0x28cd3d45 /* some random number */ +@@ -41,6 +41,7 @@ + #define NFS_SUPER_MAGIC 0x6969 + #define OPENPROM_SUPER_MAGIC 0x9fa1 + #define PROC_SUPER_MAGIC 0x9fa0 ++#define DEVPTS_SUPER_MAGIC 0x1cd1 + #define QNX4_SUPER_MAGIC 0x002f /* qnx4 fs detection */ + + #define REISERFS_SUPER_MAGIC 0x52654973 /* used by gcc */ +diff -NurpP --minimal linux-3.2.34/include/linux/major.h linux-3.2.34-vs2.3.2.15/include/linux/major.h +--- linux-3.2.34/include/linux/major.h 2009-09-10 15:26:25.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/linux/major.h 2011-12-05 19:33:02.000000000 +0100 +@@ -15,6 +15,7 @@ + #define HD_MAJOR IDE0_MAJOR + #define PTY_SLAVE_MAJOR 3 + #define TTY_MAJOR 4 ++#define VROOT_MAJOR 4 + #define TTYAUX_MAJOR 5 + #define LP_MAJOR 6 + #define VCS_MAJOR 7 +diff -NurpP --minimal linux-3.2.34/include/linux/memcontrol.h linux-3.2.34-vs2.3.2.15/include/linux/memcontrol.h +--- linux-3.2.34/include/linux/memcontrol.h 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/memcontrol.h 2012-01-26 08:52:10.000000000 +0100 +@@ -85,6 +85,13 @@ extern struct mem_cgroup *try_get_mem_cg + extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); + extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm); + ++extern u64 mem_cgroup_res_read_u64(struct mem_cgroup *mem, int member); ++extern u64 mem_cgroup_memsw_read_u64(struct mem_cgroup *mem, int member); ++ ++extern s64 mem_cgroup_stat_read_cache(struct mem_cgroup *mem); ++extern s64 mem_cgroup_stat_read_anon(struct mem_cgroup *mem); ++extern s64 mem_cgroup_stat_read_mapped(struct mem_cgroup *mem); ++ + static inline + int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) + { +diff -NurpP --minimal linux-3.2.34/include/linux/mm_types.h linux-3.2.34-vs2.3.2.15/include/linux/mm_types.h +--- linux-3.2.34/include/linux/mm_types.h 2012-01-09 16:14:58.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/mm_types.h 2011-12-05 19:33:02.000000000 +0100 +@@ -344,6 +344,7 @@ struct mm_struct { + + /* Architecture-specific MM context */ + mm_context_t context; ++ struct vx_info *mm_vx_info; + + /* Swap token stuff */ + /* +diff -NurpP --minimal linux-3.2.34/include/linux/mmzone.h linux-3.2.34-vs2.3.2.15/include/linux/mmzone.h +--- linux-3.2.34/include/linux/mmzone.h 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/mmzone.h 2012-08-13 12:40:51.000000000 +0200 +@@ -675,6 +675,13 @@ typedef struct pglist_data { + __pgdat->node_start_pfn + __pgdat->node_spanned_pages;\ + }) + ++#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) ++ ++#define node_end_pfn(nid) ({\ ++ pg_data_t *__pgdat = NODE_DATA(nid);\ ++ __pgdat->node_start_pfn + __pgdat->node_spanned_pages;\ ++}) ++ + #include + + extern struct mutex zonelists_mutex; +diff -NurpP --minimal linux-3.2.34/include/linux/mount.h linux-3.2.34-vs2.3.2.15/include/linux/mount.h +--- linux-3.2.34/include/linux/mount.h 2011-10-24 18:45:32.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/linux/mount.h 2011-12-05 19:33:02.000000000 +0100 +@@ -52,6 +52,9 @@ struct mnt_pcp { + int mnt_writers; + }; + ++#define MNT_TAGID 0x10000 ++#define MNT_NOTAG 0x20000 ++ + struct vfsmount { + struct list_head mnt_hash; + struct vfsmount *mnt_parent; /* fs we are mounted on */ +@@ -86,6 +89,7 @@ struct vfsmount { + int mnt_expiry_mark; /* true if marked for expiry */ + int mnt_pinned; + int mnt_ghosts; ++ tag_t mnt_tag; /* tagging used for vfsmount */ + }; + + struct file; /* forward dec */ +diff -NurpP 
--minimal linux-3.2.34/include/linux/net.h linux-3.2.34-vs2.3.2.15/include/linux/net.h +--- linux-3.2.34/include/linux/net.h 2011-07-22 11:18:11.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/linux/net.h 2011-12-05 19:33:02.000000000 +0100 +@@ -72,6 +72,7 @@ struct net; + #define SOCK_NOSPACE 2 + #define SOCK_PASSCRED 3 + #define SOCK_PASSSEC 4 ++#define SOCK_USER_SOCKET 5 + + #ifndef ARCH_HAS_SOCKET_TYPES + /** +diff -NurpP --minimal linux-3.2.34/include/linux/netdevice.h linux-3.2.34-vs2.3.2.15/include/linux/netdevice.h +--- linux-3.2.34/include/linux/netdevice.h 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/netdevice.h 2012-10-22 12:59:52.000000000 +0200 +@@ -1641,6 +1641,7 @@ extern void netdev_resync_ops(struct ne + + extern struct net_device *dev_get_by_index(struct net *net, int ifindex); + extern struct net_device *__dev_get_by_index(struct net *net, int ifindex); ++extern struct net_device *dev_get_by_index_real_rcu(struct net *net, int ifindex); + extern struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); + extern int dev_restart(struct net_device *dev); + #ifdef CONFIG_NETPOLL_TRAP +diff -NurpP --minimal linux-3.2.34/include/linux/nfs_mount.h linux-3.2.34-vs2.3.2.15/include/linux/nfs_mount.h +--- linux-3.2.34/include/linux/nfs_mount.h 2011-01-05 21:50:31.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/nfs_mount.h 2011-12-05 19:33:02.000000000 +0100 +@@ -63,7 +63,8 @@ struct nfs_mount_data { + #define NFS_MOUNT_SECFLAVOUR 0x2000 /* 5 */ + #define NFS_MOUNT_NORDIRPLUS 0x4000 /* 5 */ + #define NFS_MOUNT_UNSHARED 0x8000 /* 5 */ +-#define NFS_MOUNT_FLAGMASK 0xFFFF ++#define NFS_MOUNT_TAGGED 0x10000 /* context tagging */ ++#define NFS_MOUNT_FLAGMASK 0x1FFFF + + /* The following are for internal use only */ + #define NFS_MOUNT_LOOKUP_CACHE_NONEG 0x10000 +diff -NurpP --minimal linux-3.2.34/include/linux/nsproxy.h linux-3.2.34-vs2.3.2.15/include/linux/nsproxy.h +--- linux-3.2.34/include/linux/nsproxy.h 2011-10-24 18:45:32.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/linux/nsproxy.h 2011-12-05 19:33:02.000000000 +0100 +@@ -3,6 +3,7 @@ + + #include + #include ++#include + + struct mnt_namespace; + struct uts_namespace; +@@ -63,6 +64,7 @@ static inline struct nsproxy *task_nspro + } + + int copy_namespaces(unsigned long flags, struct task_struct *tsk); ++struct nsproxy *copy_nsproxy(struct nsproxy *orig); + void exit_task_namespaces(struct task_struct *tsk); + void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); + void free_nsproxy(struct nsproxy *ns); +@@ -70,16 +72,26 @@ int unshare_nsproxy_namespaces(unsigned + struct fs_struct *); + int __init nsproxy_cache_init(void); + +-static inline void put_nsproxy(struct nsproxy *ns) ++#define get_nsproxy(n) __get_nsproxy(n, __FILE__, __LINE__) ++ ++static inline void __get_nsproxy(struct nsproxy *ns, ++ const char *_file, int _line) + { +- if (atomic_dec_and_test(&ns->count)) { +- free_nsproxy(ns); +- } ++ vxlprintk(VXD_CBIT(space, 0), "get_nsproxy(%p[%u])", ++ ns, atomic_read(&ns->count), _file, _line); ++ atomic_inc(&ns->count); + } + +-static inline void get_nsproxy(struct nsproxy *ns) ++#define put_nsproxy(n) __put_nsproxy(n, __FILE__, __LINE__) ++ ++static inline void __put_nsproxy(struct nsproxy *ns, ++ const char *_file, int _line) + { +- atomic_inc(&ns->count); ++ vxlprintk(VXD_CBIT(space, 0), "put_nsproxy(%p[%u])", ++ ns, atomic_read(&ns->count), _file, _line); ++ if (atomic_dec_and_test(&ns->count)) { ++ free_nsproxy(ns); ++ } + } + + 
#endif +diff -NurpP --minimal linux-3.2.34/include/linux/pid.h linux-3.2.34-vs2.3.2.15/include/linux/pid.h +--- linux-3.2.34/include/linux/pid.h 2011-07-22 11:18:11.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/linux/pid.h 2011-12-05 19:33:02.000000000 +0100 +@@ -8,7 +8,8 @@ enum pid_type + PIDTYPE_PID, + PIDTYPE_PGID, + PIDTYPE_SID, +- PIDTYPE_MAX ++ PIDTYPE_MAX, ++ PIDTYPE_REALPID + }; + + /* +@@ -171,6 +172,7 @@ static inline pid_t pid_nr(struct pid *p + } + + pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns); ++pid_t pid_unmapped_nr_ns(struct pid *pid, struct pid_namespace *ns); + pid_t pid_vnr(struct pid *pid); + + #define do_each_pid_task(pid, type, task) \ +diff -NurpP --minimal linux-3.2.34/include/linux/proc_fs.h linux-3.2.34-vs2.3.2.15/include/linux/proc_fs.h +--- linux-3.2.34/include/linux/proc_fs.h 2011-10-24 18:45:32.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/linux/proc_fs.h 2011-12-05 19:33:02.000000000 +0100 +@@ -54,6 +54,7 @@ struct proc_dir_entry { + nlink_t nlink; + uid_t uid; + gid_t gid; ++ int vx_flags; + loff_t size; + const struct inode_operations *proc_iops; + /* +@@ -252,12 +253,18 @@ extern const struct proc_ns_operations n + extern const struct proc_ns_operations utsns_operations; + extern const struct proc_ns_operations ipcns_operations; + ++struct vx_info; ++struct nx_info; ++ + union proc_op { + int (*proc_get_link)(struct inode *, struct path *); + int (*proc_read)(struct task_struct *task, char *page); + int (*proc_show)(struct seq_file *m, + struct pid_namespace *ns, struct pid *pid, + struct task_struct *task); ++ int (*proc_vs_read)(char *page); ++ int (*proc_vxi_read)(struct vx_info *vxi, char *page); ++ int (*proc_nxi_read)(struct nx_info *nxi, char *page); + }; + + struct ctl_table_header; +@@ -265,6 +272,7 @@ struct ctl_table; + + struct proc_inode { + struct pid *pid; ++ int vx_flags; + int fd; + union proc_op op; + struct proc_dir_entry *pde; +diff -NurpP --minimal linux-3.2.34/include/linux/quotaops.h linux-3.2.34-vs2.3.2.15/include/linux/quotaops.h +--- linux-3.2.34/include/linux/quotaops.h 2012-01-09 16:14:58.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/quotaops.h 2011-12-05 19:33:02.000000000 +0100 +@@ -8,6 +8,7 @@ + #define _LINUX_QUOTAOPS_ + + #include ++#include + + #define DQUOT_SPACE_WARN 0x1 + #define DQUOT_SPACE_RESERVE 0x2 +@@ -204,11 +205,12 @@ static inline void dquot_drop(struct ino + + static inline int dquot_alloc_inode(const struct inode *inode) + { +- return 0; ++ return dl_alloc_inode(inode); + } + + static inline void dquot_free_inode(const struct inode *inode) + { ++ dl_free_inode(inode); + } + + static inline int dquot_transfer(struct inode *inode, struct iattr *iattr) +@@ -219,6 +221,10 @@ static inline int dquot_transfer(struct + static inline int __dquot_alloc_space(struct inode *inode, qsize_t number, + int flags) + { ++ int ret = 0; ++ ++ if ((ret = dl_alloc_space(inode, number))) ++ return ret; + if (!(flags & DQUOT_SPACE_RESERVE)) + inode_add_bytes(inode, number); + return 0; +@@ -229,6 +235,7 @@ static inline void __dquot_free_space(st + { + if (!(flags & DQUOT_SPACE_RESERVE)) + inode_sub_bytes(inode, number); ++ dl_free_space(inode, number); + } + + static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) +diff -NurpP --minimal linux-3.2.34/include/linux/reboot.h linux-3.2.34-vs2.3.2.15/include/linux/reboot.h +--- linux-3.2.34/include/linux/reboot.h 2011-10-24 18:45:32.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/linux/reboot.h 2011-12-05 
19:33:02.000000000 +0100 +@@ -33,6 +33,7 @@ + #define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4 + #define LINUX_REBOOT_CMD_SW_SUSPEND 0xD000FCE2 + #define LINUX_REBOOT_CMD_KEXEC 0x45584543 ++#define LINUX_REBOOT_CMD_OOM 0xDEADBEEF + + + #ifdef __KERNEL__ +diff -NurpP --minimal linux-3.2.34/include/linux/reiserfs_fs.h linux-3.2.34-vs2.3.2.15/include/linux/reiserfs_fs.h +--- linux-3.2.34/include/linux/reiserfs_fs.h 2011-10-24 18:45:32.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/linux/reiserfs_fs.h 2011-12-05 19:33:02.000000000 +0100 +@@ -976,6 +976,11 @@ struct stat_data_v1 { + #define REISERFS_COMPR_FL FS_COMPR_FL + #define REISERFS_NOTAIL_FL FS_NOTAIL_FL + ++/* unfortunately reiserfs sdattr is only 16 bit */ ++#define REISERFS_IXUNLINK_FL (FS_IXUNLINK_FL >> 16) ++#define REISERFS_BARRIER_FL (FS_BARRIER_FL >> 16) ++#define REISERFS_COW_FL (FS_COW_FL >> 16) ++ + /* persistent flags that file inherits from the parent directory */ + #define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL | \ + REISERFS_SYNC_FL | \ +@@ -985,6 +990,9 @@ struct stat_data_v1 { + REISERFS_COMPR_FL | \ + REISERFS_NOTAIL_FL ) + ++#define REISERFS_FL_USER_VISIBLE 0x80FF ++#define REISERFS_FL_USER_MODIFIABLE 0x80FF ++ + /* Stat Data on disk (reiserfs version of UFS disk inode minus the + address blocks) */ + struct stat_data { +@@ -2073,6 +2081,7 @@ static inline void reiserfs_update_sd(st + void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); + void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs); + int reiserfs_setattr(struct dentry *dentry, struct iattr *attr); ++int reiserfs_sync_flags(struct inode *inode, int, int); + + int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); + +diff -NurpP --minimal linux-3.2.34/include/linux/reiserfs_fs_sb.h linux-3.2.34-vs2.3.2.15/include/linux/reiserfs_fs_sb.h +--- linux-3.2.34/include/linux/reiserfs_fs_sb.h 2010-02-25 11:52:07.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/reiserfs_fs_sb.h 2011-12-05 19:33:02.000000000 +0100 +@@ -476,6 +476,7 @@ enum reiserfs_mount_options { + REISERFS_EXPOSE_PRIVROOT, + REISERFS_BARRIER_NONE, + REISERFS_BARRIER_FLUSH, ++ REISERFS_TAGGED, + + /* Actions on error */ + REISERFS_ERROR_PANIC, +diff -NurpP --minimal linux-3.2.34/include/linux/sched.h linux-3.2.34-vs2.3.2.15/include/linux/sched.h +--- linux-3.2.34/include/linux/sched.h 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/sched.h 2012-08-13 12:40:51.000000000 +0200 +@@ -1407,6 +1407,14 @@ struct task_struct { + #endif + seccomp_t seccomp; + ++/* vserver context data */ ++ struct vx_info *vx_info; ++ struct nx_info *nx_info; ++ ++ xid_t xid; ++ nid_t nid; ++ tag_t tag; ++ + /* Thread group tracking */ + u32 parent_exec_id; + u32 self_exec_id; +@@ -1655,6 +1663,11 @@ struct pid_namespace; + pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + struct pid_namespace *ns); + ++#include ++#include ++#include ++#include ++ + static inline pid_t task_pid_nr(struct task_struct *tsk) + { + return tsk->pid; +@@ -1668,7 +1681,8 @@ static inline pid_t task_pid_nr_ns(struc + + static inline pid_t task_pid_vnr(struct task_struct *tsk) + { +- return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); ++ // return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); ++ return vx_map_pid(__task_pid_nr_ns(tsk, PIDTYPE_PID, NULL)); + } + + +@@ -1681,7 +1695,7 @@ pid_t task_tgid_nr_ns(struct task_struct + + static inline pid_t task_tgid_vnr(struct task_struct *tsk) + { +- return pid_vnr(task_tgid(tsk)); ++ return 
vx_map_tgid(pid_vnr(task_tgid(tsk))); + } + + +diff -NurpP --minimal linux-3.2.34/include/linux/shmem_fs.h linux-3.2.34-vs2.3.2.15/include/linux/shmem_fs.h +--- linux-3.2.34/include/linux/shmem_fs.h 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/shmem_fs.h 2012-01-26 08:52:10.000000000 +0100 +@@ -8,6 +8,9 @@ + + /* inode in-kernel data */ + ++#define TMPFS_SUPER_MAGIC 0x01021994 ++ ++ + struct shmem_inode_info { + spinlock_t lock; + unsigned long flags; +diff -NurpP --minimal linux-3.2.34/include/linux/stat.h linux-3.2.34-vs2.3.2.15/include/linux/stat.h +--- linux-3.2.34/include/linux/stat.h 2008-12-25 00:26:37.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/stat.h 2011-12-05 19:33:02.000000000 +0100 +@@ -66,6 +66,7 @@ struct kstat { + unsigned int nlink; + uid_t uid; + gid_t gid; ++ tag_t tag; + dev_t rdev; + loff_t size; + struct timespec atime; +diff -NurpP --minimal linux-3.2.34/include/linux/sunrpc/auth.h linux-3.2.34-vs2.3.2.15/include/linux/sunrpc/auth.h +--- linux-3.2.34/include/linux/sunrpc/auth.h 2011-10-24 18:45:32.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/linux/sunrpc/auth.h 2011-12-05 19:33:02.000000000 +0100 +@@ -25,6 +25,7 @@ + struct auth_cred { + uid_t uid; + gid_t gid; ++ tag_t tag; + struct group_info *group_info; + unsigned char machine_cred : 1; + }; +diff -NurpP --minimal linux-3.2.34/include/linux/sunrpc/clnt.h linux-3.2.34-vs2.3.2.15/include/linux/sunrpc/clnt.h +--- linux-3.2.34/include/linux/sunrpc/clnt.h 2012-01-09 16:14:58.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/sunrpc/clnt.h 2011-12-05 19:33:02.000000000 +0100 +@@ -50,7 +50,8 @@ struct rpc_clnt { + unsigned int cl_softrtry : 1,/* soft timeouts */ + cl_discrtry : 1,/* disconnect before retry */ + cl_autobind : 1,/* use getport() */ +- cl_chatty : 1;/* be verbose */ ++ cl_chatty : 1,/* be verbose */ ++ cl_tag : 1;/* context tagging */ + + struct rpc_rtt * cl_rtt; /* RTO estimator data */ + const struct rpc_timeout *cl_timeout; /* Timeout strategy */ +diff -NurpP --minimal linux-3.2.34/include/linux/syscalls.h linux-3.2.34-vs2.3.2.15/include/linux/syscalls.h +--- linux-3.2.34/include/linux/syscalls.h 2012-01-09 16:14:58.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/syscalls.h 2011-12-05 19:33:02.000000000 +0100 +@@ -483,6 +483,8 @@ asmlinkage long sys_symlink(const char _ + asmlinkage long sys_unlink(const char __user *pathname); + asmlinkage long sys_rename(const char __user *oldname, + const char __user *newname); ++asmlinkage long sys_copyfile(const char __user *from, const char __user *to, ++ umode_t mode); + asmlinkage long sys_chmod(const char __user *filename, mode_t mode); + asmlinkage long sys_fchmod(unsigned int fd, mode_t mode); + +diff -NurpP --minimal linux-3.2.34/include/linux/sysctl.h linux-3.2.34-vs2.3.2.15/include/linux/sysctl.h +--- linux-3.2.34/include/linux/sysctl.h 2012-01-09 16:14:58.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/sysctl.h 2011-12-05 19:33:02.000000000 +0100 +@@ -60,6 +60,7 @@ enum + CTL_ABI=9, /* Binary emulation */ + CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_ARLAN=254, /* arlan wireless driver */ ++ CTL_VSERVER=4242, /* Linux-VServer debug */ + CTL_S390DBF=5677, /* s390 debug */ + CTL_SUNRPC=7249, /* sunrpc debug */ + CTL_PM=9899, /* frv power management */ +@@ -94,6 +95,7 @@ enum + + KERN_PANIC=15, /* int: panic timeout */ + KERN_REALROOTDEV=16, /* real root device to mount after initrd */ ++ KERN_VSHELPER=17, /* string: path to vshelper policy agent */ + + 
KERN_SPARC_REBOOT=21, /* reboot command on Sparc */ + KERN_CTLALTDEL=22, /* int: allow ctl-alt-del to reboot */ +diff -NurpP --minimal linux-3.2.34/include/linux/sysfs.h linux-3.2.34-vs2.3.2.15/include/linux/sysfs.h +--- linux-3.2.34/include/linux/sysfs.h 2012-01-09 16:14:58.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/sysfs.h 2011-12-05 19:33:02.000000000 +0100 +@@ -19,6 +19,8 @@ + #include + #include + ++#define SYSFS_SUPER_MAGIC 0x62656572 ++ + struct kobject; + struct module; + enum kobj_ns_type; +diff -NurpP --minimal linux-3.2.34/include/linux/time.h linux-3.2.34-vs2.3.2.15/include/linux/time.h +--- linux-3.2.34/include/linux/time.h 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/time.h 2012-10-22 12:59:52.000000000 +0200 +@@ -281,6 +281,9 @@ static __always_inline void timespec_add + a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); + a->tv_nsec = ns; + } ++ ++#include ++ + #endif /* __KERNEL__ */ + + #define NFDBITS __NFDBITS +diff -NurpP --minimal linux-3.2.34/include/linux/types.h linux-3.2.34-vs2.3.2.15/include/linux/types.h +--- linux-3.2.34/include/linux/types.h 2012-01-09 16:14:59.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/types.h 2011-12-05 19:33:02.000000000 +0100 +@@ -40,6 +40,9 @@ typedef __kernel_uid32_t uid_t; + typedef __kernel_gid32_t gid_t; + typedef __kernel_uid16_t uid16_t; + typedef __kernel_gid16_t gid16_t; ++typedef unsigned int xid_t; ++typedef unsigned int nid_t; ++typedef unsigned int tag_t; + + typedef unsigned long uintptr_t; + +diff -NurpP --minimal linux-3.2.34/include/linux/utsname.h linux-3.2.34-vs2.3.2.15/include/linux/utsname.h +--- linux-3.2.34/include/linux/utsname.h 2012-01-09 16:14:59.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/utsname.h 2011-12-05 19:33:02.000000000 +0100 +@@ -62,7 +62,8 @@ static inline void get_uts_ns(struct uts + } + + extern struct uts_namespace *copy_utsname(unsigned long flags, +- struct task_struct *tsk); ++ struct uts_namespace *old_ns, ++ struct user_namespace *user_ns); + extern void free_uts_ns(struct kref *kref); + + static inline void put_uts_ns(struct uts_namespace *ns) +@@ -79,12 +80,13 @@ static inline void put_uts_ns(struct uts + } + + static inline struct uts_namespace *copy_utsname(unsigned long flags, +- struct task_struct *tsk) ++ struct uts_namespace *old_ns, ++ struct user_namespace *user_ns) + { + if (flags & CLONE_NEWUTS) + return ERR_PTR(-EINVAL); + +- return tsk->nsproxy->uts_ns; ++ return old_ns; + } + #endif + +diff -NurpP --minimal linux-3.2.34/include/linux/vroot.h linux-3.2.34-vs2.3.2.15/include/linux/vroot.h +--- linux-3.2.34/include/linux/vroot.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vroot.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,51 @@ ++ ++/* ++ * include/linux/vroot.h ++ * ++ * written by Herbert Pötzl, 9/11/2002 ++ * ported to 2.6 by Herbert Pötzl, 30/12/2004 ++ * ++ * Copyright (C) 2002-2007 by Herbert Pötzl. ++ * Redistribution of this file is permitted under the ++ * GNU General Public License. 
++ */ ++ ++#ifndef _LINUX_VROOT_H ++#define _LINUX_VROOT_H ++ ++ ++#ifdef __KERNEL__ ++ ++/* Possible states of device */ ++enum { ++ Vr_unbound, ++ Vr_bound, ++}; ++ ++struct vroot_device { ++ int vr_number; ++ int vr_refcnt; ++ ++ struct semaphore vr_ctl_mutex; ++ struct block_device *vr_device; ++ int vr_state; ++}; ++ ++ ++typedef struct block_device *(vroot_grb_func)(struct block_device *); ++ ++extern int register_vroot_grb(vroot_grb_func *); ++extern int unregister_vroot_grb(vroot_grb_func *); ++ ++#endif /* __KERNEL__ */ ++ ++#define MAX_VROOT_DEFAULT 8 ++ ++/* ++ * IOCTL commands --- we will commandeer 0x56 ('V') ++ */ ++ ++#define VROOT_SET_DEV 0x5600 ++#define VROOT_CLR_DEV 0x5601 ++ ++#endif /* _LINUX_VROOT_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vs_base.h linux-3.2.34-vs2.3.2.15/include/linux/vs_base.h +--- linux-3.2.34/include/linux/vs_base.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_base.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,10 @@ ++#ifndef _VS_BASE_H ++#define _VS_BASE_H ++ ++#include "vserver/base.h" ++#include "vserver/check.h" ++#include "vserver/debug.h" ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_context.h linux-3.2.34-vs2.3.2.15/include/linux/vs_context.h +--- linux-3.2.34/include/linux/vs_context.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_context.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,242 @@ ++#ifndef _VS_CONTEXT_H ++#define _VS_CONTEXT_H ++ ++#include "vserver/base.h" ++#include "vserver/check.h" ++#include "vserver/context.h" ++#include "vserver/history.h" ++#include "vserver/debug.h" ++ ++#include ++ ++ ++#define get_vx_info(i) __get_vx_info(i, __FILE__, __LINE__, __HERE__) ++ ++static inline struct vx_info *__get_vx_info(struct vx_info *vxi, ++ const char *_file, int _line, void *_here) ++{ ++ if (!vxi) ++ return NULL; ++ ++ vxlprintk(VXD_CBIT(xid, 2), "get_vx_info(%p[#%d.%d])", ++ vxi, vxi ? vxi->vx_id : 0, ++ vxi ? atomic_read(&vxi->vx_usecnt) : 0, ++ _file, _line); ++ __vxh_get_vx_info(vxi, _here); ++ ++ atomic_inc(&vxi->vx_usecnt); ++ return vxi; ++} ++ ++ ++extern void free_vx_info(struct vx_info *); ++ ++#define put_vx_info(i) __put_vx_info(i, __FILE__, __LINE__, __HERE__) ++ ++static inline void __put_vx_info(struct vx_info *vxi, ++ const char *_file, int _line, void *_here) ++{ ++ if (!vxi) ++ return; ++ ++ vxlprintk(VXD_CBIT(xid, 2), "put_vx_info(%p[#%d.%d])", ++ vxi, vxi ? vxi->vx_id : 0, ++ vxi ? atomic_read(&vxi->vx_usecnt) : 0, ++ _file, _line); ++ __vxh_put_vx_info(vxi, _here); ++ ++ if (atomic_dec_and_test(&vxi->vx_usecnt)) ++ free_vx_info(vxi); ++} ++ ++ ++#define init_vx_info(p, i) \ ++ __init_vx_info(p, i, __FILE__, __LINE__, __HERE__) ++ ++static inline void __init_vx_info(struct vx_info **vxp, struct vx_info *vxi, ++ const char *_file, int _line, void *_here) ++{ ++ if (vxi) { ++ vxlprintk(VXD_CBIT(xid, 3), ++ "init_vx_info(%p[#%d.%d])", ++ vxi, vxi ? vxi->vx_id : 0, ++ vxi ? atomic_read(&vxi->vx_usecnt) : 0, ++ _file, _line); ++ __vxh_init_vx_info(vxi, vxp, _here); ++ ++ atomic_inc(&vxi->vx_usecnt); ++ } ++ *vxp = vxi; ++} ++ ++ ++#define set_vx_info(p, i) \ ++ __set_vx_info(p, i, __FILE__, __LINE__, __HERE__) ++ ++static inline void __set_vx_info(struct vx_info **vxp, struct vx_info *vxi, ++ const char *_file, int _line, void *_here) ++{ ++ struct vx_info *vxo; ++ ++ if (!vxi) ++ return; ++ ++ vxlprintk(VXD_CBIT(xid, 3), "set_vx_info(%p[#%d.%d])", ++ vxi, vxi ? 
vxi->vx_id : 0, ++ vxi ? atomic_read(&vxi->vx_usecnt) : 0, ++ _file, _line); ++ __vxh_set_vx_info(vxi, vxp, _here); ++ ++ atomic_inc(&vxi->vx_usecnt); ++ vxo = xchg(vxp, vxi); ++ BUG_ON(vxo); ++} ++ ++ ++#define clr_vx_info(p) __clr_vx_info(p, __FILE__, __LINE__, __HERE__) ++ ++static inline void __clr_vx_info(struct vx_info **vxp, ++ const char *_file, int _line, void *_here) ++{ ++ struct vx_info *vxo; ++ ++ vxo = xchg(vxp, NULL); ++ if (!vxo) ++ return; ++ ++ vxlprintk(VXD_CBIT(xid, 3), "clr_vx_info(%p[#%d.%d])", ++ vxo, vxo ? vxo->vx_id : 0, ++ vxo ? atomic_read(&vxo->vx_usecnt) : 0, ++ _file, _line); ++ __vxh_clr_vx_info(vxo, vxp, _here); ++ ++ if (atomic_dec_and_test(&vxo->vx_usecnt)) ++ free_vx_info(vxo); ++} ++ ++ ++#define claim_vx_info(v, p) \ ++ __claim_vx_info(v, p, __FILE__, __LINE__, __HERE__) ++ ++static inline void __claim_vx_info(struct vx_info *vxi, ++ struct task_struct *task, ++ const char *_file, int _line, void *_here) ++{ ++ vxlprintk(VXD_CBIT(xid, 3), "claim_vx_info(%p[#%d.%d.%d]) %p", ++ vxi, vxi ? vxi->vx_id : 0, ++ vxi ? atomic_read(&vxi->vx_usecnt) : 0, ++ vxi ? atomic_read(&vxi->vx_tasks) : 0, ++ task, _file, _line); ++ __vxh_claim_vx_info(vxi, task, _here); ++ ++ atomic_inc(&vxi->vx_tasks); ++} ++ ++ ++extern void unhash_vx_info(struct vx_info *); ++ ++#define release_vx_info(v, p) \ ++ __release_vx_info(v, p, __FILE__, __LINE__, __HERE__) ++ ++static inline void __release_vx_info(struct vx_info *vxi, ++ struct task_struct *task, ++ const char *_file, int _line, void *_here) ++{ ++ vxlprintk(VXD_CBIT(xid, 3), "release_vx_info(%p[#%d.%d.%d]) %p", ++ vxi, vxi ? vxi->vx_id : 0, ++ vxi ? atomic_read(&vxi->vx_usecnt) : 0, ++ vxi ? atomic_read(&vxi->vx_tasks) : 0, ++ task, _file, _line); ++ __vxh_release_vx_info(vxi, task, _here); ++ ++ might_sleep(); ++ ++ if (atomic_dec_and_test(&vxi->vx_tasks)) ++ unhash_vx_info(vxi); ++} ++ ++ ++#define task_get_vx_info(p) \ ++ __task_get_vx_info(p, __FILE__, __LINE__, __HERE__) ++ ++static inline struct vx_info *__task_get_vx_info(struct task_struct *p, ++ const char *_file, int _line, void *_here) ++{ ++ struct vx_info *vxi; ++ ++ task_lock(p); ++ vxlprintk(VXD_CBIT(xid, 5), "task_get_vx_info(%p)", ++ p, _file, _line); ++ vxi = __get_vx_info(p->vx_info, _file, _line, _here); ++ task_unlock(p); ++ return vxi; ++} ++ ++ ++static inline void __wakeup_vx_info(struct vx_info *vxi) ++{ ++ if (waitqueue_active(&vxi->vx_wait)) ++ wake_up_interruptible(&vxi->vx_wait); ++} ++ ++ ++#define enter_vx_info(v, s) __enter_vx_info(v, s, __FILE__, __LINE__) ++ ++static inline void __enter_vx_info(struct vx_info *vxi, ++ struct vx_info_save *vxis, const char *_file, int _line) ++{ ++ vxlprintk(VXD_CBIT(xid, 5), "enter_vx_info(%p[#%d],%p) %p[#%d,%p]", ++ vxi, vxi ? vxi->vx_id : 0, vxis, current, ++ current->xid, current->vx_info, _file, _line); ++ vxis->vxi = xchg(&current->vx_info, vxi); ++ vxis->xid = current->xid; ++ current->xid = vxi ?
vxi->vx_id : 0; ++} ++ ++#define leave_vx_info(s) __leave_vx_info(s, __FILE__, __LINE__) ++ ++static inline void __leave_vx_info(struct vx_info_save *vxis, ++ const char *_file, int _line) ++{ ++ vxlprintk(VXD_CBIT(xid, 5), "leave_vx_info(%p[#%d,%p]) %p[#%d,%p]", ++ vxis, vxis->xid, vxis->vxi, current, ++ current->xid, current->vx_info, _file, _line); ++ (void)xchg(&current->vx_info, vxis->vxi); ++ current->xid = vxis->xid; ++} ++ ++ ++static inline void __enter_vx_admin(struct vx_info_save *vxis) ++{ ++ vxis->vxi = xchg(&current->vx_info, NULL); ++ vxis->xid = xchg(&current->xid, (xid_t)0); ++} ++ ++static inline void __leave_vx_admin(struct vx_info_save *vxis) ++{ ++ (void)xchg(&current->xid, vxis->xid); ++ (void)xchg(&current->vx_info, vxis->vxi); ++} ++ ++#define task_is_init(p) \ ++ __task_is_init(p, __FILE__, __LINE__, __HERE__) ++ ++static inline int __task_is_init(struct task_struct *p, ++ const char *_file, int _line, void *_here) ++{ ++ int is_init = is_global_init(p); ++ ++ task_lock(p); ++ if (p->vx_info) ++ is_init = p->vx_info->vx_initpid == p->pid; ++ task_unlock(p); ++ return is_init; ++} ++ ++extern void exit_vx_info(struct task_struct *, int); ++extern void exit_vx_info_early(struct task_struct *, int); ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_cowbl.h linux-3.2.34-vs2.3.2.15/include/linux/vs_cowbl.h +--- linux-3.2.34/include/linux/vs_cowbl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_cowbl.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,48 @@ ++#ifndef _VS_COWBL_H ++#define _VS_COWBL_H ++ ++#include ++#include ++#include ++#include ++ ++extern struct dentry *cow_break_link(const char *pathname); ++ ++static inline int cow_check_and_break(struct path *path) ++{ ++ struct inode *inode = path->dentry->d_inode; ++ int error = 0; ++ ++ /* do we need this check?
*/ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ ++ if (IS_COW(inode)) { ++ if (IS_COW_LINK(inode)) { ++ struct dentry *new_dentry, *old_dentry = path->dentry; ++ char *pp, *buf; ++ ++ buf = kmalloc(PATH_MAX, GFP_KERNEL); ++ if (!buf) { ++ return -ENOMEM; ++ } ++ pp = d_path(path, buf, PATH_MAX); ++ new_dentry = cow_break_link(pp); ++ kfree(buf); ++ if (!IS_ERR(new_dentry)) { ++ path->dentry = new_dentry; ++ dput(old_dentry); ++ } else ++ error = PTR_ERR(new_dentry); ++ } else { ++ inode->i_flags &= ~(S_IXUNLINK | S_IMMUTABLE); ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty(inode); ++ } ++ } ++ return error; ++} ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_cvirt.h linux-3.2.34-vs2.3.2.15/include/linux/vs_cvirt.h +--- linux-3.2.34/include/linux/vs_cvirt.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_cvirt.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,50 @@ ++#ifndef _VS_CVIRT_H ++#define _VS_CVIRT_H ++ ++#include "vserver/cvirt.h" ++#include "vserver/context.h" ++#include "vserver/base.h" ++#include "vserver/check.h" ++#include "vserver/debug.h" ++ ++ ++static inline void vx_activate_task(struct task_struct *p) ++{ ++ struct vx_info *vxi; ++ ++ if ((vxi = p->vx_info)) { ++ vx_update_load(vxi); ++ atomic_inc(&vxi->cvirt.nr_running); ++ } ++} ++ ++static inline void vx_deactivate_task(struct task_struct *p) ++{ ++ struct vx_info *vxi; ++ ++ if ((vxi = p->vx_info)) { ++ vx_update_load(vxi); ++ atomic_dec(&vxi->cvirt.nr_running); ++ } ++} ++ ++static inline void vx_uninterruptible_inc(struct task_struct *p) ++{ ++ struct vx_info *vxi; ++ ++ if ((vxi = p->vx_info)) ++ atomic_inc(&vxi->cvirt.nr_uninterruptible); ++} ++ ++static inline void vx_uninterruptible_dec(struct task_struct *p) ++{ ++ struct vx_info *vxi; ++ ++ if ((vxi = p->vx_info)) ++ atomic_dec(&vxi->cvirt.nr_uninterruptible); ++} ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_device.h linux-3.2.34-vs2.3.2.15/include/linux/vs_device.h +--- linux-3.2.34/include/linux/vs_device.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_device.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,45 @@ ++#ifndef _VS_DEVICE_H ++#define _VS_DEVICE_H ++ ++#include "vserver/base.h" ++#include "vserver/device.h" ++#include "vserver/debug.h" ++ ++ ++#ifdef CONFIG_VSERVER_DEVICE ++ ++int vs_map_device(struct vx_info *, dev_t, dev_t *, umode_t); ++ ++#define vs_device_perm(v, d, m, p) \ ++ ((vs_map_device(current_vx_info(), d, NULL, m) & (p)) == (p)) ++ ++#else ++ ++static inline ++int vs_map_device(struct vx_info *vxi, ++ dev_t device, dev_t *target, umode_t mode) ++{ ++ if (target) ++ *target = device; ++ return ~0; ++} ++ ++#define vs_device_perm(v, d, m, p) ((p) == (p)) ++ ++#endif ++ ++ ++#define vs_map_chrdev(d, t, p) \ ++ ((vs_map_device(current_vx_info(), d, t, S_IFCHR) & (p)) == (p)) ++#define vs_map_blkdev(d, t, p) \ ++ ((vs_map_device(current_vx_info(), d, t, S_IFBLK) & (p)) == (p)) ++ ++#define vs_chrdev_perm(d, p) \ ++ vs_device_perm(current_vx_info(), d, S_IFCHR, p) ++#define vs_blkdev_perm(d, p) \ ++ vs_device_perm(current_vx_info(), d, S_IFBLK, p) ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_dlimit.h linux-3.2.34-vs2.3.2.15/include/linux/vs_dlimit.h +--- linux-3.2.34/include/linux/vs_dlimit.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_dlimit.h 
2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,215 @@ ++#ifndef _VS_DLIMIT_H ++#define _VS_DLIMIT_H ++ ++#include ++ ++#include "vserver/dlimit.h" ++#include "vserver/base.h" ++#include "vserver/debug.h" ++ ++ ++#define get_dl_info(i) __get_dl_info(i, __FILE__, __LINE__) ++ ++static inline struct dl_info *__get_dl_info(struct dl_info *dli, ++ const char *_file, int _line) ++{ ++ if (!dli) ++ return NULL; ++ vxlprintk(VXD_CBIT(dlim, 4), "get_dl_info(%p[#%d.%d])", ++ dli, dli ? dli->dl_tag : 0, ++ dli ? atomic_read(&dli->dl_usecnt) : 0, ++ _file, _line); ++ atomic_inc(&dli->dl_usecnt); ++ return dli; ++} ++ ++ ++#define free_dl_info(i) \ ++ call_rcu(&(i)->dl_rcu, rcu_free_dl_info) ++ ++#define put_dl_info(i) __put_dl_info(i, __FILE__, __LINE__) ++ ++static inline void __put_dl_info(struct dl_info *dli, ++ const char *_file, int _line) ++{ ++ if (!dli) ++ return; ++ vxlprintk(VXD_CBIT(dlim, 4), "put_dl_info(%p[#%d.%d])", ++ dli, dli ? dli->dl_tag : 0, ++ dli ? atomic_read(&dli->dl_usecnt) : 0, ++ _file, _line); ++ if (atomic_dec_and_test(&dli->dl_usecnt)) ++ free_dl_info(dli); ++} ++ ++ ++#define __dlimit_char(d) ((d) ? '*' : ' ') ++ ++static inline int __dl_alloc_space(struct super_block *sb, ++ tag_t tag, dlsize_t nr, const char *file, int line) ++{ ++ struct dl_info *dli = NULL; ++ int ret = 0; ++ ++ if (nr == 0) ++ goto out; ++ dli = locate_dl_info(sb, tag); ++ if (!dli) ++ goto out; ++ ++ spin_lock(&dli->dl_lock); ++ ret = (dli->dl_space_used + nr > dli->dl_space_total); ++ if (!ret) ++ dli->dl_space_used += nr; ++ spin_unlock(&dli->dl_lock); ++ put_dl_info(dli); ++out: ++ vxlprintk(VXD_CBIT(dlim, 1), ++ "ALLOC (%p,#%d)%c %lld bytes (%d)", ++ sb, tag, __dlimit_char(dli), (long long)nr, ++ ret, file, line); ++ return ret ? -ENOSPC : 0; ++} ++ ++static inline void __dl_free_space(struct super_block *sb, ++ tag_t tag, dlsize_t nr, const char *_file, int _line) ++{ ++ struct dl_info *dli = NULL; ++ ++ if (nr == 0) ++ goto out; ++ dli = locate_dl_info(sb, tag); ++ if (!dli) ++ goto out; ++ ++ spin_lock(&dli->dl_lock); ++ if (dli->dl_space_used > nr) ++ dli->dl_space_used -= nr; ++ else ++ dli->dl_space_used = 0; ++ spin_unlock(&dli->dl_lock); ++ put_dl_info(dli); ++out: ++ vxlprintk(VXD_CBIT(dlim, 1), ++ "FREE (%p,#%d)%c %lld bytes", ++ sb, tag, __dlimit_char(dli), (long long)nr, ++ _file, _line); ++} ++ ++static inline int __dl_alloc_inode(struct super_block *sb, ++ tag_t tag, const char *_file, int _line) ++{ ++ struct dl_info *dli; ++ int ret = 0; ++ ++ dli = locate_dl_info(sb, tag); ++ if (!dli) ++ goto out; ++ ++ spin_lock(&dli->dl_lock); ++ dli->dl_inodes_used++; ++ ret = (dli->dl_inodes_used > dli->dl_inodes_total); ++ spin_unlock(&dli->dl_lock); ++ put_dl_info(dli); ++out: ++ vxlprintk(VXD_CBIT(dlim, 0), ++ "ALLOC (%p,#%d)%c inode (%d)", ++ sb, tag, __dlimit_char(dli), ret, _file, _line); ++ return ret ? 
-ENOSPC : 0; ++} ++ ++static inline void __dl_free_inode(struct super_block *sb, ++ tag_t tag, const char *_file, int _line) ++{ ++ struct dl_info *dli; ++ ++ dli = locate_dl_info(sb, tag); ++ if (!dli) ++ goto out; ++ ++ spin_lock(&dli->dl_lock); ++ if (dli->dl_inodes_used > 1) ++ dli->dl_inodes_used--; ++ else ++ dli->dl_inodes_used = 0; ++ spin_unlock(&dli->dl_lock); ++ put_dl_info(dli); ++out: ++ vxlprintk(VXD_CBIT(dlim, 0), ++ "FREE (%p,#%d)%c inode", ++ sb, tag, __dlimit_char(dli), _file, _line); ++} ++ ++static inline void __dl_adjust_block(struct super_block *sb, tag_t tag, ++ unsigned long long *free_blocks, unsigned long long *root_blocks, ++ const char *_file, int _line) ++{ ++ struct dl_info *dli; ++ uint64_t broot, bfree; ++ ++ dli = locate_dl_info(sb, tag); ++ if (!dli) ++ return; ++ ++ spin_lock(&dli->dl_lock); ++ broot = (dli->dl_space_total - ++ (dli->dl_space_total >> 10) * dli->dl_nrlmult) ++ >> sb->s_blocksize_bits; ++ bfree = (dli->dl_space_total - dli->dl_space_used) ++ >> sb->s_blocksize_bits; ++ spin_unlock(&dli->dl_lock); ++ ++ vxlprintk(VXD_CBIT(dlim, 2), ++ "ADJUST: %lld,%lld on %lld,%lld [mult=%d]", ++ (long long)bfree, (long long)broot, ++ *free_blocks, *root_blocks, dli->dl_nrlmult, ++ _file, _line); ++ if (free_blocks) { ++ if (*free_blocks > bfree) ++ *free_blocks = bfree; ++ } ++ if (root_blocks) { ++ if (*root_blocks > broot) ++ *root_blocks = broot; ++ } ++ put_dl_info(dli); ++} ++ ++#define dl_prealloc_space(in, bytes) \ ++ __dl_alloc_space((in)->i_sb, (in)->i_tag, (dlsize_t)(bytes), \ ++ __FILE__, __LINE__ ) ++ ++#define dl_alloc_space(in, bytes) \ ++ __dl_alloc_space((in)->i_sb, (in)->i_tag, (dlsize_t)(bytes), \ ++ __FILE__, __LINE__ ) ++ ++#define dl_reserve_space(in, bytes) \ ++ __dl_alloc_space((in)->i_sb, (in)->i_tag, (dlsize_t)(bytes), \ ++ __FILE__, __LINE__ ) ++ ++#define dl_claim_space(in, bytes) (0) ++ ++#define dl_release_space(in, bytes) \ ++ __dl_free_space((in)->i_sb, (in)->i_tag, (dlsize_t)(bytes), \ ++ __FILE__, __LINE__ ) ++ ++#define dl_free_space(in, bytes) \ ++ __dl_free_space((in)->i_sb, (in)->i_tag, (dlsize_t)(bytes), \ ++ __FILE__, __LINE__ ) ++ ++ ++ ++#define dl_alloc_inode(in) \ ++ __dl_alloc_inode((in)->i_sb, (in)->i_tag, __FILE__, __LINE__ ) ++ ++#define dl_free_inode(in) \ ++ __dl_free_inode((in)->i_sb, (in)->i_tag, __FILE__, __LINE__ ) ++ ++ ++#define dl_adjust_block(sb, tag, fb, rb) \ ++ __dl_adjust_block(sb, tag, fb, rb, __FILE__, __LINE__ ) ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_inet.h linux-3.2.34-vs2.3.2.15/include/linux/vs_inet.h +--- linux-3.2.34/include/linux/vs_inet.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_inet.h 2012-02-15 03:03:53.000000000 +0100 +@@ -0,0 +1,353 @@ ++#ifndef _VS_INET_H ++#define _VS_INET_H ++ ++#include "vserver/base.h" ++#include "vserver/network.h" ++#include "vserver/debug.h" ++ ++#define IPI_LOOPBACK htonl(INADDR_LOOPBACK) ++ ++#define NXAV4(a) NIPQUAD((a)->ip[0]), NIPQUAD((a)->ip[1]), \ ++ NIPQUAD((a)->mask), (a)->type ++#define NXAV4_FMT "[" NIPQUAD_FMT "-" NIPQUAD_FMT "/" NIPQUAD_FMT ":%04x]" ++ ++#define NIPQUAD(addr) \ ++ ((unsigned char *)&addr)[0], \ ++ ((unsigned char *)&addr)[1], \ ++ ((unsigned char *)&addr)[2], \ ++ ((unsigned char *)&addr)[3] ++ ++#define NIPQUAD_FMT "%u.%u.%u.%u" ++ ++ ++static inline ++int v4_addr_match(struct nx_addr_v4 *nxa, __be32 addr, uint16_t tmask) ++{ ++ __be32 ip = nxa->ip[0].s_addr; ++ __be32 mask = nxa->mask.s_addr; ++ __be32 bcast = ip | 
~mask; ++ int ret = 0; ++ ++ switch (nxa->type & tmask) { ++ case NXA_TYPE_MASK: ++ ret = (ip == (addr & mask)); ++ break; ++ case NXA_TYPE_ADDR: ++ ret = 3; ++ if (addr == ip) ++ break; ++ /* fall through to broadcast */ ++ case NXA_MOD_BCAST: ++ ret = ((tmask & NXA_MOD_BCAST) && (addr == bcast)); ++ break; ++ case NXA_TYPE_RANGE: ++ ret = ((nxa->ip[0].s_addr <= addr) && ++ (nxa->ip[1].s_addr > addr)); ++ break; ++ case NXA_TYPE_ANY: ++ ret = 2; ++ break; ++ } ++ ++ vxdprintk(VXD_CBIT(net, 0), ++ "v4_addr_match(%p" NXAV4_FMT "," NIPQUAD_FMT ",%04x) = %d", ++ nxa, NXAV4(nxa), NIPQUAD(addr), tmask, ret); ++ return ret; ++} ++ ++static inline ++int v4_addr_in_nx_info(struct nx_info *nxi, __be32 addr, uint16_t tmask) ++{ ++ struct nx_addr_v4 *nxa; ++ int ret = 1; ++ ++ if (!nxi) ++ goto out; ++ ++ ret = 2; ++ /* allow 127.0.0.1 when remapping lback */ ++ if ((tmask & NXA_LOOPBACK) && ++ (addr == IPI_LOOPBACK) && ++ nx_info_flags(nxi, NXF_LBACK_REMAP, 0)) ++ goto out; ++ ret = 3; ++ /* check for lback address */ ++ if ((tmask & NXA_MOD_LBACK) && ++ (nxi->v4_lback.s_addr == addr)) ++ goto out; ++ ret = 4; ++ /* check for broadcast address */ ++ if ((tmask & NXA_MOD_BCAST) && ++ (nxi->v4_bcast.s_addr == addr)) ++ goto out; ++ ret = 5; ++ /* check for v4 addresses */ ++ for (nxa = &nxi->v4; nxa; nxa = nxa->next) ++ if (v4_addr_match(nxa, addr, tmask)) ++ goto out; ++ ret = 0; ++out: ++ vxdprintk(VXD_CBIT(net, 0), ++ "v4_addr_in_nx_info(%p[#%u]," NIPQUAD_FMT ",%04x) = %d", ++ nxi, nxi ? nxi->nx_id : 0, NIPQUAD(addr), tmask, ret); ++ return ret; ++} ++ ++static inline ++int v4_nx_addr_match(struct nx_addr_v4 *nxa, struct nx_addr_v4 *addr, uint16_t mask) ++{ ++ /* FIXME: needs full range checks */ ++ return v4_addr_match(nxa, addr->ip[0].s_addr, mask); ++} ++ ++static inline ++int v4_nx_addr_in_nx_info(struct nx_info *nxi, struct nx_addr_v4 *nxa, uint16_t mask) ++{ ++ struct nx_addr_v4 *ptr; ++ ++ for (ptr = &nxi->v4; ptr; ptr = ptr->next) ++ if (v4_nx_addr_match(ptr, nxa, mask)) ++ return 1; ++ return 0; ++} ++ ++#include ++ ++/* ++ * Check if a given address matches for a socket ++ * ++ * nxi: the socket's nx_info if any ++ * addr: to be verified address ++ */ ++static inline ++int v4_sock_addr_match ( ++ struct nx_info *nxi, ++ struct inet_sock *inet, ++ __be32 addr) ++{ ++ __be32 saddr = inet->inet_rcv_saddr; ++ __be32 bcast = nxi ? 
nxi->v4_bcast.s_addr : INADDR_BROADCAST; ++ ++ if (addr && (saddr == addr || bcast == addr)) ++ return 1; ++ if (!saddr) ++ return v4_addr_in_nx_info(nxi, addr, NXA_MASK_BIND); ++ return 0; ++} ++ ++ ++/* inet related checks and helpers */ ++ ++ ++struct in_ifaddr; ++struct net_device; ++struct sock; ++ ++#ifdef CONFIG_INET ++ ++#include ++#include ++#include ++#include ++ ++ ++int dev_in_nx_info(struct net_device *, struct nx_info *); ++int v4_dev_in_nx_info(struct net_device *, struct nx_info *); ++int nx_v4_addr_conflict(struct nx_info *, struct nx_info *); ++ ++ ++/* ++ * check if address is covered by socket ++ * ++ * sk: the socket to check against ++ * addr: the address in question (must be != 0) ++ */ ++ ++static inline ++int __v4_addr_match_socket(const struct sock *sk, struct nx_addr_v4 *nxa) ++{ ++ struct nx_info *nxi = sk->sk_nx_info; ++ __be32 saddr = sk_rcv_saddr(sk); ++ ++ vxdprintk(VXD_CBIT(net, 5), ++ "__v4_addr_in_socket(%p," NXAV4_FMT ") %p:" NIPQUAD_FMT " %p;%lx", ++ sk, NXAV4(nxa), nxi, NIPQUAD(saddr), sk->sk_socket, ++ (sk->sk_socket?sk->sk_socket->flags:0)); ++ ++ if (saddr) { /* direct address match */ ++ return v4_addr_match(nxa, saddr, -1); ++ } else if (nxi) { /* match against nx_info */ ++ return v4_nx_addr_in_nx_info(nxi, nxa, -1); ++ } else { /* unrestricted any socket */ ++ return 1; ++ } ++} ++ ++ ++ ++static inline ++int nx_dev_visible(struct nx_info *nxi, struct net_device *dev) ++{ ++ vxdprintk(VXD_CBIT(net, 1), ++ "nx_dev_visible(%p[#%u],%p " VS_Q("%s") ") %d", ++ nxi, nxi ? nxi->nx_id : 0, dev, dev->name, ++ nxi ? dev_in_nx_info(dev, nxi) : 0); ++ ++ if (!nx_info_flags(nxi, NXF_HIDE_NETIF, 0)) ++ return 1; ++ if (dev_in_nx_info(dev, nxi)) ++ return 1; ++ return 0; ++} ++ ++ ++static inline ++int v4_ifa_in_nx_info(struct in_ifaddr *ifa, struct nx_info *nxi) ++{ ++ if (!nxi) ++ return 1; ++ if (!ifa) ++ return 0; ++ return v4_addr_in_nx_info(nxi, ifa->ifa_local, NXA_MASK_SHOW); ++} ++ ++static inline ++int nx_v4_ifa_visible(struct nx_info *nxi, struct in_ifaddr *ifa) ++{ ++ vxdprintk(VXD_CBIT(net, 1), "nx_v4_ifa_visible(%p[#%u],%p) %d", ++ nxi, nxi ? nxi->nx_id : 0, ifa, ++ nxi ? v4_ifa_in_nx_info(ifa, nxi) : 0); ++ ++ if (!nx_info_flags(nxi, NXF_HIDE_NETIF, 0)) ++ return 1; ++ if (v4_ifa_in_nx_info(ifa, nxi)) ++ return 1; ++ return 0; ++} ++ ++ ++struct nx_v4_sock_addr { ++ __be32 saddr; /* Address used for validation */ ++ __be32 baddr; /* Address used for socket bind */ ++}; ++ ++static inline ++int v4_map_sock_addr(struct inet_sock *inet, struct sockaddr_in *addr, ++ struct nx_v4_sock_addr *nsa) ++{ ++ struct sock *sk = &inet->sk; ++ struct nx_info *nxi = sk->sk_nx_info; ++ __be32 saddr = addr->sin_addr.s_addr; ++ __be32 baddr = saddr; ++ ++ vxdprintk(VXD_CBIT(net, 3), ++ "inet_bind(%p)* %p,%p;%lx " NIPQUAD_FMT, ++ sk, sk->sk_nx_info, sk->sk_socket, ++ (sk->sk_socket ? 
sk->sk_socket->flags : 0), ++ NIPQUAD(saddr)); ++ ++ if (nxi) { ++ if (saddr == INADDR_ANY) { ++ if (nx_info_flags(nxi, NXF_SINGLE_IP, 0)) ++ baddr = nxi->v4.ip[0].s_addr; ++ } else if (saddr == IPI_LOOPBACK) { ++ if (nx_info_flags(nxi, NXF_LBACK_REMAP, 0)) ++ baddr = nxi->v4_lback.s_addr; ++ } else if (!ipv4_is_multicast(saddr) || ++ !nx_info_ncaps(nxi, NXC_MULTICAST)) { ++ /* normal address bind */ ++ if (!v4_addr_in_nx_info(nxi, saddr, NXA_MASK_BIND)) ++ return -EADDRNOTAVAIL; ++ } ++ } ++ ++ vxdprintk(VXD_CBIT(net, 3), ++ "inet_bind(%p) " NIPQUAD_FMT ", " NIPQUAD_FMT, ++ sk, NIPQUAD(saddr), NIPQUAD(baddr)); ++ ++ nsa->saddr = saddr; ++ nsa->baddr = baddr; ++ return 0; ++} ++ ++static inline ++void v4_set_sock_addr(struct inet_sock *inet, struct nx_v4_sock_addr *nsa) ++{ ++ inet->inet_saddr = nsa->baddr; ++ inet->inet_rcv_saddr = nsa->baddr; ++} ++ ++ ++/* ++ * helper to simplify inet_lookup_listener ++ * ++ * nxi: the socket's nx_info if any ++ * addr: to be verified address ++ * saddr: socket address ++ */ ++static inline int v4_inet_addr_match ( ++ struct nx_info *nxi, ++ __be32 addr, ++ __be32 saddr) ++{ ++ if (addr && (saddr == addr)) ++ return 1; ++ if (!saddr) ++ return nxi ? v4_addr_in_nx_info(nxi, addr, NXA_MASK_BIND) : 1; ++ return 0; ++} ++ ++static inline __be32 nx_map_sock_lback(struct nx_info *nxi, __be32 addr) ++{ ++ if (nx_info_flags(nxi, NXF_HIDE_LBACK, 0) && ++ (addr == nxi->v4_lback.s_addr)) ++ return IPI_LOOPBACK; ++ return addr; ++} ++ ++static inline ++int nx_info_has_v4(struct nx_info *nxi) ++{ ++ if (!nxi) ++ return 1; ++ if (NX_IPV4(nxi)) ++ return 1; ++ if (nx_info_flags(nxi, NXF_LBACK_REMAP, 0)) ++ return 1; ++ return 0; ++} ++ ++#else /* CONFIG_INET */ ++ ++static inline ++int nx_dev_visible(struct nx_info *n, struct net_device *d) ++{ ++ return 1; ++} ++ ++static inline ++int nx_v4_addr_conflict(struct nx_info *n, uint32_t a, const struct sock *s) ++{ ++ return 1; ++} ++ ++static inline ++int v4_ifa_in_nx_info(struct in_ifaddr *a, struct nx_info *n) ++{ ++ return 1; ++} ++ ++static inline ++int nx_info_has_v4(struct nx_info *nxi) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_INET */ ++ ++#define current_nx_info_has_v4() \ ++ nx_info_has_v4(current_nx_info()) ++ ++#else ++// #warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_inet6.h linux-3.2.34-vs2.3.2.15/include/linux/vs_inet6.h +--- linux-3.2.34/include/linux/vs_inet6.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_inet6.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,246 @@ ++#ifndef _VS_INET6_H ++#define _VS_INET6_H ++ ++#include "vserver/base.h" ++#include "vserver/network.h" ++#include "vserver/debug.h" ++ ++#include ++ ++#define NXAV6(a) &(a)->ip, &(a)->mask, (a)->prefix, (a)->type ++#define NXAV6_FMT "[%pI6/%pI6/%d:%04x]" ++ ++ ++#ifdef CONFIG_IPV6 ++ ++static inline ++int v6_addr_match(struct nx_addr_v6 *nxa, ++ const struct in6_addr *addr, uint16_t mask) ++{ ++ int ret = 0; ++ ++ switch (nxa->type & mask) { ++ case NXA_TYPE_MASK: ++ ret = ipv6_masked_addr_cmp(&nxa->ip, &nxa->mask, addr); ++ break; ++ case NXA_TYPE_ADDR: ++ ret = ipv6_addr_equal(&nxa->ip, addr); ++ break; ++ case NXA_TYPE_ANY: ++ ret = 1; ++ break; ++ } ++ vxdprintk(VXD_CBIT(net, 0), ++ "v6_addr_match(%p" NXAV6_FMT ",%pI6,%04x) = %d", ++ nxa, NXAV6(nxa), addr, mask, ret); ++ return ret; ++} ++ ++static inline ++int v6_addr_in_nx_info(struct nx_info *nxi, ++ const struct in6_addr *addr, uint16_t mask) ++{ ++ struct nx_addr_v6 *nxa; ++ int ret = 1; ++ ++ if 
(!nxi) ++ goto out; ++ for (nxa = &nxi->v6; nxa; nxa = nxa->next) ++ if (v6_addr_match(nxa, addr, mask)) ++ goto out; ++ ret = 0; ++out: ++ vxdprintk(VXD_CBIT(net, 0), ++ "v6_addr_in_nx_info(%p[#%u],%pI6,%04x) = %d", ++ nxi, nxi ? nxi->nx_id : 0, addr, mask, ret); ++ return ret; ++} ++ ++static inline ++int v6_nx_addr_match(struct nx_addr_v6 *nxa, struct nx_addr_v6 *addr, uint16_t mask) ++{ ++ /* FIXME: needs full range checks */ ++ return v6_addr_match(nxa, &addr->ip, mask); ++} ++ ++static inline ++int v6_nx_addr_in_nx_info(struct nx_info *nxi, struct nx_addr_v6 *nxa, uint16_t mask) ++{ ++ struct nx_addr_v6 *ptr; ++ ++ for (ptr = &nxi->v6; ptr; ptr = ptr->next) ++ if (v6_nx_addr_match(ptr, nxa, mask)) ++ return 1; ++ return 0; ++} ++ ++ ++/* ++ * Check if a given address matches for a socket ++ * ++ * nxi: the socket's nx_info if any ++ * addr: to be verified address ++ */ ++static inline ++int v6_sock_addr_match ( ++ struct nx_info *nxi, ++ struct inet_sock *inet, ++ struct in6_addr *addr) ++{ ++ struct sock *sk = &inet->sk; ++ struct in6_addr *saddr = inet6_rcv_saddr(sk); ++ ++ if (!ipv6_addr_any(addr) && ++ ipv6_addr_equal(saddr, addr)) ++ return 1; ++ if (ipv6_addr_any(saddr)) ++ return v6_addr_in_nx_info(nxi, addr, -1); ++ return 0; ++} ++ ++/* ++ * check if address is covered by socket ++ * ++ * sk: the socket to check against ++ * addr: the address in question (must be != 0) ++ */ ++ ++static inline ++int __v6_addr_match_socket(const struct sock *sk, struct nx_addr_v6 *nxa) ++{ ++ struct nx_info *nxi = sk->sk_nx_info; ++ struct in6_addr *saddr = inet6_rcv_saddr(sk); ++ ++ vxdprintk(VXD_CBIT(net, 5), ++ "__v6_addr_in_socket(%p," NXAV6_FMT ") %p:%pI6 %p;%lx", ++ sk, NXAV6(nxa), nxi, saddr, sk->sk_socket, ++ (sk->sk_socket?sk->sk_socket->flags:0)); ++ ++ if (!ipv6_addr_any(saddr)) { /* direct address match */ ++ return v6_addr_match(nxa, saddr, -1); ++ } else if (nxi) { /* match against nx_info */ ++ return v6_nx_addr_in_nx_info(nxi, nxa, -1); ++ } else { /* unrestricted any socket */ ++ return 1; ++ } ++} ++ ++ ++/* inet related checks and helpers */ ++ ++ ++struct in_ifaddr; ++struct net_device; ++struct sock; ++ ++ ++#include ++#include ++#include ++ ++ ++int dev_in_nx_info(struct net_device *, struct nx_info *); ++int v6_dev_in_nx_info(struct net_device *, struct nx_info *); ++int nx_v6_addr_conflict(struct nx_info *, struct nx_info *); ++ ++ ++ ++static inline ++int v6_ifa_in_nx_info(struct inet6_ifaddr *ifa, struct nx_info *nxi) ++{ ++ if (!nxi) ++ return 1; ++ if (!ifa) ++ return 0; ++ return v6_addr_in_nx_info(nxi, &ifa->addr, -1); ++} ++ ++static inline ++int nx_v6_ifa_visible(struct nx_info *nxi, struct inet6_ifaddr *ifa) ++{ ++ vxdprintk(VXD_CBIT(net, 1), "nx_v6_ifa_visible(%p[#%u],%p) %d", ++ nxi, nxi ? nxi->nx_id : 0, ifa, ++ nxi ? 
v6_ifa_in_nx_info(ifa, nxi) : 0); ++ ++ if (!nx_info_flags(nxi, NXF_HIDE_NETIF, 0)) ++ return 1; ++ if (v6_ifa_in_nx_info(ifa, nxi)) ++ return 1; ++ return 0; ++} ++ ++ ++struct nx_v6_sock_addr { ++ struct in6_addr saddr; /* Address used for validation */ ++ struct in6_addr baddr; /* Address used for socket bind */ ++}; ++ ++static inline ++int v6_map_sock_addr(struct inet_sock *inet, struct sockaddr_in6 *addr, ++ struct nx_v6_sock_addr *nsa) ++{ ++ // struct sock *sk = &inet->sk; ++ // struct nx_info *nxi = sk->sk_nx_info; ++ struct in6_addr saddr = addr->sin6_addr; ++ struct in6_addr baddr = saddr; ++ ++ nsa->saddr = saddr; ++ nsa->baddr = baddr; ++ return 0; ++} ++ ++static inline ++void v6_set_sock_addr(struct inet_sock *inet, struct nx_v6_sock_addr *nsa) ++{ ++ // struct sock *sk = &inet->sk; ++ // struct in6_addr *saddr = inet6_rcv_saddr(sk); ++ ++ // *saddr = nsa->baddr; ++ // inet->inet_saddr = nsa->baddr; ++} ++ ++static inline ++int nx_info_has_v6(struct nx_info *nxi) ++{ ++ if (!nxi) ++ return 1; ++ if (NX_IPV6(nxi)) ++ return 1; ++ return 0; ++} ++ ++#else /* CONFIG_IPV6 */ ++ ++static inline ++int nx_v6_dev_visible(struct nx_info *n, struct net_device *d) ++{ ++ return 1; ++} ++ ++ ++static inline ++int nx_v6_addr_conflict(struct nx_info *n, uint32_t a, const struct sock *s) ++{ ++ return 1; ++} ++ ++static inline ++int v6_ifa_in_nx_info(struct in_ifaddr *a, struct nx_info *n) ++{ ++ return 1; ++} ++ ++static inline ++int nx_info_has_v6(struct nx_info *nxi) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_IPV6 */ ++ ++#define current_nx_info_has_v6() \ ++ nx_info_has_v6(current_nx_info()) ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_limit.h linux-3.2.34-vs2.3.2.15/include/linux/vs_limit.h +--- linux-3.2.34/include/linux/vs_limit.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_limit.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,140 @@ ++#ifndef _VS_LIMIT_H ++#define _VS_LIMIT_H ++ ++#include "vserver/limit.h" ++#include "vserver/base.h" ++#include "vserver/context.h" ++#include "vserver/debug.h" ++#include "vserver/context.h" ++#include "vserver/limit_int.h" ++ ++ ++#define vx_acc_cres(v, d, p, r) \ ++ __vx_acc_cres(v, r, d, p, __FILE__, __LINE__) ++ ++#define vx_acc_cres_cond(x, d, p, r) \ ++ __vx_acc_cres(((x) == vx_current_xid()) ? current_vx_info() : 0, \ ++ r, d, p, __FILE__, __LINE__) ++ ++ ++#define vx_add_cres(v, a, p, r) \ ++ __vx_add_cres(v, r, a, p, __FILE__, __LINE__) ++#define vx_sub_cres(v, a, p, r) vx_add_cres(v, -(a), p, r) ++ ++#define vx_add_cres_cond(x, a, p, r) \ ++ __vx_add_cres(((x) == vx_current_xid()) ? 
current_vx_info() : 0, \ ++ r, a, p, __FILE__, __LINE__) ++#define vx_sub_cres_cond(x, a, p, r) vx_add_cres_cond(x, -(a), p, r) ++ ++ ++/* process and file limits */ ++ ++#define vx_nproc_inc(p) \ ++ vx_acc_cres((p)->vx_info, 1, p, RLIMIT_NPROC) ++ ++#define vx_nproc_dec(p) \ ++ vx_acc_cres((p)->vx_info,-1, p, RLIMIT_NPROC) ++ ++#define vx_files_inc(f) \ ++ vx_acc_cres_cond((f)->f_xid, 1, f, RLIMIT_NOFILE) ++ ++#define vx_files_dec(f) \ ++ vx_acc_cres_cond((f)->f_xid,-1, f, RLIMIT_NOFILE) ++ ++#define vx_locks_inc(l) \ ++ vx_acc_cres_cond((l)->fl_xid, 1, l, RLIMIT_LOCKS) ++ ++#define vx_locks_dec(l) \ ++ vx_acc_cres_cond((l)->fl_xid,-1, l, RLIMIT_LOCKS) ++ ++#define vx_openfd_inc(f) \ ++ vx_acc_cres(current_vx_info(), 1, (void *)(long)(f), VLIMIT_OPENFD) ++ ++#define vx_openfd_dec(f) \ ++ vx_acc_cres(current_vx_info(),-1, (void *)(long)(f), VLIMIT_OPENFD) ++ ++ ++#define vx_cres_avail(v, n, r) \ ++ __vx_cres_avail(v, r, n, __FILE__, __LINE__) ++ ++ ++#define vx_nproc_avail(n) \ ++ vx_cres_avail(current_vx_info(), n, RLIMIT_NPROC) ++ ++#define vx_files_avail(n) \ ++ vx_cres_avail(current_vx_info(), n, RLIMIT_NOFILE) ++ ++#define vx_locks_avail(n) \ ++ vx_cres_avail(current_vx_info(), n, RLIMIT_LOCKS) ++ ++#define vx_openfd_avail(n) \ ++ vx_cres_avail(current_vx_info(), n, VLIMIT_OPENFD) ++ ++ ++/* dentry limits */ ++ ++#define vx_dentry_inc(d) do { \ ++ if ((d)->d_count == 1) \ ++ vx_acc_cres(current_vx_info(), 1, d, VLIMIT_DENTRY); \ ++ } while (0) ++ ++#define vx_dentry_dec(d) do { \ ++ if ((d)->d_count == 0) \ ++ vx_acc_cres(current_vx_info(),-1, d, VLIMIT_DENTRY); \ ++ } while (0) ++ ++#define vx_dentry_avail(n) \ ++ vx_cres_avail(current_vx_info(), n, VLIMIT_DENTRY) ++ ++ ++/* socket limits */ ++ ++#define vx_sock_inc(s) \ ++ vx_acc_cres((s)->sk_vx_info, 1, s, VLIMIT_NSOCK) ++ ++#define vx_sock_dec(s) \ ++ vx_acc_cres((s)->sk_vx_info,-1, s, VLIMIT_NSOCK) ++ ++#define vx_sock_avail(n) \ ++ vx_cres_avail(current_vx_info(), n, VLIMIT_NSOCK) ++ ++ ++/* ipc resource limits */ ++ ++#define vx_ipcmsg_add(v, u, a) \ ++ vx_add_cres(v, a, u, RLIMIT_MSGQUEUE) ++ ++#define vx_ipcmsg_sub(v, u, a) \ ++ vx_sub_cres(v, a, u, RLIMIT_MSGQUEUE) ++ ++#define vx_ipcmsg_avail(v, a) \ ++ vx_cres_avail(v, a, RLIMIT_MSGQUEUE) ++ ++ ++#define vx_ipcshm_add(v, k, a) \ ++ vx_add_cres(v, a, (void *)(long)(k), VLIMIT_SHMEM) ++ ++#define vx_ipcshm_sub(v, k, a) \ ++ vx_sub_cres(v, a, (void *)(long)(k), VLIMIT_SHMEM) ++ ++#define vx_ipcshm_avail(v, a) \ ++ vx_cres_avail(v, a, VLIMIT_SHMEM) ++ ++ ++#define vx_semary_inc(a) \ ++ vx_acc_cres(current_vx_info(), 1, a, VLIMIT_SEMARY) ++ ++#define vx_semary_dec(a) \ ++ vx_acc_cres(current_vx_info(), -1, a, VLIMIT_SEMARY) ++ ++ ++#define vx_nsems_add(a,n) \ ++ vx_add_cres(current_vx_info(), n, a, VLIMIT_NSEMS) ++ ++#define vx_nsems_sub(a,n) \ ++ vx_sub_cres(current_vx_info(), n, a, VLIMIT_NSEMS) ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_network.h linux-3.2.34-vs2.3.2.15/include/linux/vs_network.h +--- linux-3.2.34/include/linux/vs_network.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_network.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,169 @@ ++#ifndef _NX_VS_NETWORK_H ++#define _NX_VS_NETWORK_H ++ ++#include "vserver/context.h" ++#include "vserver/network.h" ++#include "vserver/base.h" ++#include "vserver/check.h" ++#include "vserver/debug.h" ++ ++#include ++ ++ ++#define get_nx_info(i) __get_nx_info(i, __FILE__, __LINE__) ++ ++static inline struct nx_info 
*__get_nx_info(struct nx_info *nxi, ++ const char *_file, int _line) ++{ ++ if (!nxi) ++ return NULL; ++ ++ vxlprintk(VXD_CBIT(nid, 2), "get_nx_info(%p[#%d.%d])", ++ nxi, nxi ? nxi->nx_id : 0, ++ nxi ? atomic_read(&nxi->nx_usecnt) : 0, ++ _file, _line); ++ ++ atomic_inc(&nxi->nx_usecnt); ++ return nxi; ++} ++ ++ ++extern void free_nx_info(struct nx_info *); ++ ++#define put_nx_info(i) __put_nx_info(i, __FILE__, __LINE__) ++ ++static inline void __put_nx_info(struct nx_info *nxi, const char *_file, int _line) ++{ ++ if (!nxi) ++ return; ++ ++ vxlprintk(VXD_CBIT(nid, 2), "put_nx_info(%p[#%d.%d])", ++ nxi, nxi ? nxi->nx_id : 0, ++ nxi ? atomic_read(&nxi->nx_usecnt) : 0, ++ _file, _line); ++ ++ if (atomic_dec_and_test(&nxi->nx_usecnt)) ++ free_nx_info(nxi); ++} ++ ++ ++#define init_nx_info(p, i) __init_nx_info(p, i, __FILE__, __LINE__) ++ ++static inline void __init_nx_info(struct nx_info **nxp, struct nx_info *nxi, ++ const char *_file, int _line) ++{ ++ if (nxi) { ++ vxlprintk(VXD_CBIT(nid, 3), ++ "init_nx_info(%p[#%d.%d])", ++ nxi, nxi ? nxi->nx_id : 0, ++ nxi ? atomic_read(&nxi->nx_usecnt) : 0, ++ _file, _line); ++ ++ atomic_inc(&nxi->nx_usecnt); ++ } ++ *nxp = nxi; ++} ++ ++ ++#define set_nx_info(p, i) __set_nx_info(p, i, __FILE__, __LINE__) ++ ++static inline void __set_nx_info(struct nx_info **nxp, struct nx_info *nxi, ++ const char *_file, int _line) ++{ ++ struct nx_info *nxo; ++ ++ if (!nxi) ++ return; ++ ++ vxlprintk(VXD_CBIT(nid, 3), "set_nx_info(%p[#%d.%d])", ++ nxi, nxi ? nxi->nx_id : 0, ++ nxi ? atomic_read(&nxi->nx_usecnt) : 0, ++ _file, _line); ++ ++ atomic_inc(&nxi->nx_usecnt); ++ nxo = xchg(nxp, nxi); ++ BUG_ON(nxo); ++} ++ ++#define clr_nx_info(p) __clr_nx_info(p, __FILE__, __LINE__) ++ ++static inline void __clr_nx_info(struct nx_info **nxp, ++ const char *_file, int _line) ++{ ++ struct nx_info *nxo; ++ ++ nxo = xchg(nxp, NULL); ++ if (!nxo) ++ return; ++ ++ vxlprintk(VXD_CBIT(nid, 3), "clr_nx_info(%p[#%d.%d])", ++ nxo, nxo ? nxo->nx_id : 0, ++ nxo ? atomic_read(&nxo->nx_usecnt) : 0, ++ _file, _line); ++ ++ if (atomic_dec_and_test(&nxo->nx_usecnt)) ++ free_nx_info(nxo); ++} ++ ++ ++#define claim_nx_info(v, p) __claim_nx_info(v, p, __FILE__, __LINE__) ++ ++static inline void __claim_nx_info(struct nx_info *nxi, ++ struct task_struct *task, const char *_file, int _line) ++{ ++ vxlprintk(VXD_CBIT(nid, 3), "claim_nx_info(%p[#%d.%d.%d]) %p", ++ nxi, nxi ? nxi->nx_id : 0, ++ nxi?atomic_read(&nxi->nx_usecnt):0, ++ nxi?atomic_read(&nxi->nx_tasks):0, ++ task, _file, _line); ++ ++ atomic_inc(&nxi->nx_tasks); ++} ++ ++ ++extern void unhash_nx_info(struct nx_info *); ++ ++#define release_nx_info(v, p) __release_nx_info(v, p, __FILE__, __LINE__) ++ ++static inline void __release_nx_info(struct nx_info *nxi, ++ struct task_struct *task, const char *_file, int _line) ++{ ++ vxlprintk(VXD_CBIT(nid, 3), "release_nx_info(%p[#%d.%d.%d]) %p", ++ nxi, nxi ? nxi->nx_id : 0, ++ nxi ? atomic_read(&nxi->nx_usecnt) : 0, ++ nxi ? 
atomic_read(&nxi->nx_tasks) : 0, ++ task, _file, _line); ++ ++ might_sleep(); ++ ++ if (atomic_dec_and_test(&nxi->nx_tasks)) ++ unhash_nx_info(nxi); ++} ++ ++ ++#define task_get_nx_info(i) __task_get_nx_info(i, __FILE__, __LINE__) ++ ++static __inline__ struct nx_info *__task_get_nx_info(struct task_struct *p, ++ const char *_file, int _line) ++{ ++ struct nx_info *nxi; ++ ++ task_lock(p); ++ vxlprintk(VXD_CBIT(nid, 5), "task_get_nx_info(%p)", ++ p, _file, _line); ++ nxi = __get_nx_info(p->nx_info, _file, _line); ++ task_unlock(p); ++ return nxi; ++} ++ ++ ++static inline void exit_nx_info(struct task_struct *p) ++{ ++ if (p->nx_info) ++ release_nx_info(p->nx_info, p); ++} ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_pid.h linux-3.2.34-vs2.3.2.15/include/linux/vs_pid.h +--- linux-3.2.34/include/linux/vs_pid.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_pid.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,50 @@ ++#ifndef _VS_PID_H ++#define _VS_PID_H ++ ++#include "vserver/base.h" ++#include "vserver/check.h" ++#include "vserver/context.h" ++#include "vserver/debug.h" ++#include "vserver/pid.h" ++#include ++ ++ ++#define VXF_FAKE_INIT (VXF_INFO_INIT | VXF_STATE_INIT) ++ ++static inline ++int vx_proc_task_visible(struct task_struct *task) ++{ ++ if ((task->pid == 1) && ++ !vx_flags(VXF_FAKE_INIT, VXF_FAKE_INIT)) ++ /* show a blend through init */ ++ goto visible; ++ if (vx_check(vx_task_xid(task), VS_WATCH | VS_IDENT)) ++ goto visible; ++ return 0; ++visible: ++ return 1; ++} ++ ++#define find_task_by_real_pid(pid) find_task_by_pid_ns(pid, &init_pid_ns) ++ ++ ++static inline ++struct task_struct *vx_get_proc_task(struct inode *inode, struct pid *pid) ++{ ++ struct task_struct *task = get_pid_task(pid, PIDTYPE_PID); ++ ++ if (task && !vx_proc_task_visible(task)) { ++ vxdprintk(VXD_CBIT(misc, 6), ++ "dropping task (get) %p[#%u,%u] for %p[#%u,%u]", ++ task, task->xid, task->pid, ++ current, current->xid, current->pid); ++ put_task_struct(task); ++ task = NULL; ++ } ++ return task; ++} ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_sched.h linux-3.2.34-vs2.3.2.15/include/linux/vs_sched.h +--- linux-3.2.34/include/linux/vs_sched.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_sched.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,40 @@ ++#ifndef _VS_SCHED_H ++#define _VS_SCHED_H ++ ++#include "vserver/base.h" ++#include "vserver/context.h" ++#include "vserver/sched.h" ++ ++ ++#define MAX_PRIO_BIAS 20 ++#define MIN_PRIO_BIAS -20 ++ ++static inline ++int vx_adjust_prio(struct task_struct *p, int prio, int max_user) ++{ ++ struct vx_info *vxi = p->vx_info; ++ ++ if (vxi) ++ prio += vx_cpu(vxi, sched_pc).prio_bias; ++ return prio; ++} ++ ++static inline void vx_account_user(struct vx_info *vxi, ++ cputime_t cputime, int nice) ++{ ++ if (!vxi) ++ return; ++ vx_cpu(vxi, sched_pc).user_ticks += cputime; ++} ++ ++static inline void vx_account_system(struct vx_info *vxi, ++ cputime_t cputime, int idle) ++{ ++ if (!vxi) ++ return; ++ vx_cpu(vxi, sched_pc).sys_ticks += cputime; ++} ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_socket.h linux-3.2.34-vs2.3.2.15/include/linux/vs_socket.h +--- linux-3.2.34/include/linux/vs_socket.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_socket.h 2011-12-05 19:33:02.000000000 +0100 +@@ 
-0,0 +1,67 @@ ++#ifndef _VS_SOCKET_H ++#define _VS_SOCKET_H ++ ++#include "vserver/debug.h" ++#include "vserver/base.h" ++#include "vserver/cacct.h" ++#include "vserver/context.h" ++#include "vserver/tag.h" ++ ++ ++/* socket accounting */ ++ ++#include ++ ++static inline int vx_sock_type(int family) ++{ ++ switch (family) { ++ case PF_UNSPEC: ++ return VXA_SOCK_UNSPEC; ++ case PF_UNIX: ++ return VXA_SOCK_UNIX; ++ case PF_INET: ++ return VXA_SOCK_INET; ++ case PF_INET6: ++ return VXA_SOCK_INET6; ++ case PF_PACKET: ++ return VXA_SOCK_PACKET; ++ default: ++ return VXA_SOCK_OTHER; ++ } ++} ++ ++#define vx_acc_sock(v, f, p, s) \ ++ __vx_acc_sock(v, f, p, s, __FILE__, __LINE__) ++ ++static inline void __vx_acc_sock(struct vx_info *vxi, ++ int family, int pos, int size, char *file, int line) ++{ ++ if (vxi) { ++ int type = vx_sock_type(family); ++ ++ atomic_long_inc(&vxi->cacct.sock[type][pos].count); ++ atomic_long_add(size, &vxi->cacct.sock[type][pos].total); ++ } ++} ++ ++#define vx_sock_recv(sk, s) \ ++ vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 0, s) ++#define vx_sock_send(sk, s) \ ++ vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 1, s) ++#define vx_sock_fail(sk, s) \ ++ vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 2, s) ++ ++ ++#define sock_vx_init(s) do { \ ++ (s)->sk_xid = 0; \ ++ (s)->sk_vx_info = NULL; \ ++ } while (0) ++ ++#define sock_nx_init(s) do { \ ++ (s)->sk_nid = 0; \ ++ (s)->sk_nx_info = NULL; \ ++ } while (0) ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_tag.h linux-3.2.34-vs2.3.2.15/include/linux/vs_tag.h +--- linux-3.2.34/include/linux/vs_tag.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_tag.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,47 @@ ++#ifndef _VS_TAG_H ++#define _VS_TAG_H ++ ++#include ++ ++/* check conditions */ ++ ++#define DX_ADMIN 0x0001 ++#define DX_WATCH 0x0002 ++#define DX_HOSTID 0x0008 ++ ++#define DX_IDENT 0x0010 ++ ++#define DX_ARG_MASK 0x0010 ++ ++ ++#define dx_task_tag(t) ((t)->tag) ++ ++#define dx_current_tag() dx_task_tag(current) ++ ++#define dx_check(c, m) __dx_check(dx_current_tag(), c, m) ++ ++#define dx_weak_check(c, m) ((m) ? 
dx_check(c, m) : 1) ++ ++ ++/* ++ * check current context for ADMIN/WATCH and ++ * optionally against supplied argument ++ */ ++static inline int __dx_check(tag_t cid, tag_t id, unsigned int mode) ++{ ++ if (mode & DX_ARG_MASK) { ++ if ((mode & DX_IDENT) && (id == cid)) ++ return 1; ++ } ++ return (((mode & DX_ADMIN) && (cid == 0)) || ++ ((mode & DX_WATCH) && (cid == 1)) || ++ ((mode & DX_HOSTID) && (id == 0))); ++} ++ ++struct inode; ++int dx_permission(const struct inode *inode, int mask); ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vs_time.h linux-3.2.34-vs2.3.2.15/include/linux/vs_time.h +--- linux-3.2.34/include/linux/vs_time.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vs_time.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,19 @@ ++#ifndef _VS_TIME_H ++#define _VS_TIME_H ++ ++ ++/* time faking stuff */ ++ ++#ifdef CONFIG_VSERVER_VTIME ++ ++extern void vx_adjust_timespec(struct timespec *ts); ++extern int vx_settimeofday(const struct timespec *ts); ++ ++#else ++#define vx_adjust_timespec(t) do { } while (0) ++#define vx_settimeofday(t) do_settimeofday(t) ++#endif ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/Kbuild linux-3.2.34-vs2.3.2.15/include/linux/vserver/Kbuild +--- linux-3.2.34/include/linux/vserver/Kbuild 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/Kbuild 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,8 @@ ++ ++header-y += context_cmd.h network_cmd.h space_cmd.h \ ++ cacct_cmd.h cvirt_cmd.h limit_cmd.h dlimit_cmd.h \ ++ inode_cmd.h tag_cmd.h sched_cmd.h signal_cmd.h \ ++ debug_cmd.h device_cmd.h ++ ++header-y += switch.h network.h monitor.h inode.h device.h ++ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/base.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/base.h +--- linux-3.2.34/include/linux/vserver/base.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/base.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,178 @@ ++#ifndef _VX_BASE_H ++#define _VX_BASE_H ++ ++ ++/* context state changes */ ++ ++enum { ++ VSC_STARTUP = 1, ++ VSC_SHUTDOWN, ++ ++ VSC_NETUP, ++ VSC_NETDOWN, ++}; ++ ++ ++ ++#define vx_task_xid(t) ((t)->xid) ++ ++#define vx_current_xid() vx_task_xid(current) ++ ++#define current_vx_info() (current->vx_info) ++ ++ ++#define nx_task_nid(t) ((t)->nid) ++ ++#define nx_current_nid() nx_task_nid(current) ++ ++#define current_nx_info() (current->nx_info) ++ ++ ++/* generic flag merging */ ++ ++#define vs_check_flags(v, m, f) (((v) & (m)) ^ (f)) ++ ++#define vs_mask_flags(v, f, m) (((v) & ~(m)) | ((f) & (m))) ++ ++#define vs_mask_mask(v, f, m) (((v) & ~(m)) | ((v) & (f) & (m))) ++ ++#define vs_check_bit(v, n) ((v) & (1LL << (n))) ++ ++ ++/* context flags */ ++ ++#define __vx_flags(v) ((v) ? (v)->vx_flags : 0) ++ ++#define vx_current_flags() __vx_flags(current_vx_info()) ++ ++#define vx_info_flags(v, m, f) \ ++ vs_check_flags(__vx_flags(v), m, f) ++ ++#define task_vx_flags(t, m, f) \ ++ ((t) && vx_info_flags((t)->vx_info, m, f)) ++ ++#define vx_flags(m, f) vx_info_flags(current_vx_info(), m, f) ++ ++ ++/* context caps */ ++ ++#define __vx_ccaps(v) ((v) ? (v)->vx_ccaps : 0) ++ ++#define vx_current_ccaps() __vx_ccaps(current_vx_info()) ++ ++#define vx_info_ccaps(v, c) (__vx_ccaps(v) & (c)) ++ ++#define vx_ccaps(c) vx_info_ccaps(current_vx_info(), (c)) ++ ++ ++ ++/* network flags */ ++ ++#define __nx_flags(n) ((n) ? 
(n)->nx_flags : 0) ++ ++#define nx_current_flags() __nx_flags(current_nx_info()) ++ ++#define nx_info_flags(n, m, f) \ ++ vs_check_flags(__nx_flags(n), m, f) ++ ++#define task_nx_flags(t, m, f) \ ++ ((t) && nx_info_flags((t)->nx_info, m, f)) ++ ++#define nx_flags(m, f) nx_info_flags(current_nx_info(), m, f) ++ ++ ++/* network caps */ ++ ++#define __nx_ncaps(n) ((n) ? (n)->nx_ncaps : 0) ++ ++#define nx_current_ncaps() __nx_ncaps(current_nx_info()) ++ ++#define nx_info_ncaps(n, c) (__nx_ncaps(n) & (c)) ++ ++#define nx_ncaps(c) nx_info_ncaps(current_nx_info(), c) ++ ++ ++/* context mask capabilities */ ++ ++#define __vx_mcaps(v) ((v) ? (v)->vx_ccaps >> 32UL : ~0 ) ++ ++#define vx_info_mcaps(v, c) (__vx_mcaps(v) & (c)) ++ ++#define vx_mcaps(c) vx_info_mcaps(current_vx_info(), c) ++ ++ ++/* context bcap mask */ ++ ++#define __vx_bcaps(v) ((v)->vx_bcaps) ++ ++#define vx_current_bcaps() __vx_bcaps(current_vx_info()) ++ ++ ++/* mask given bcaps */ ++ ++#define vx_info_mbcaps(v, c) ((v) ? cap_intersect(__vx_bcaps(v), c) : c) ++ ++#define vx_mbcaps(c) vx_info_mbcaps(current_vx_info(), c) ++ ++ ++/* masked cap_bset */ ++ ++#define vx_info_cap_bset(v) vx_info_mbcaps(v, current->cap_bset) ++ ++#define vx_current_cap_bset() vx_info_cap_bset(current_vx_info()) ++ ++#if 0 ++#define vx_info_mbcap(v, b) \ ++ (!vx_info_flags(v, VXF_STATE_SETUP, 0) ? \ ++ vx_info_bcaps(v, b) : (b)) ++ ++#define task_vx_mbcap(t, b) \ ++ vx_info_mbcap((t)->vx_info, (t)->b) ++ ++#define vx_mbcap(b) task_vx_mbcap(current, b) ++#endif ++ ++#define vx_cap_raised(v, c, f) cap_raised(vx_info_mbcaps(v, c), f) ++ ++#define vx_capable(b, c) (capable(b) || \ ++ (cap_raised(current_cap(), b) && vx_ccaps(c))) ++ ++#define vx_ns_capable(n, b, c) (ns_capable(n, b) || \ ++ (cap_raised(current_cap(), b) && vx_ccaps(c))) ++ ++#define nx_capable(b, c) (capable(b) || \ ++ (cap_raised(current_cap(), b) && nx_ncaps(c))) ++ ++#define vx_task_initpid(t, n) \ ++ ((t)->vx_info && \ ++ ((t)->vx_info->vx_initpid == (n))) ++ ++#define vx_current_initpid(n) vx_task_initpid(current, n) ++ ++ ++/* context unshare mask */ ++ ++#define __vx_umask(v) ((v)->vx_umask) ++ ++#define vx_current_umask() __vx_umask(current_vx_info()) ++ ++#define vx_can_unshare(b, f) (capable(b) || \ ++ (cap_raised(current_cap(), b) && \ ++ !((f) & ~vx_current_umask()))) ++ ++ ++#define __vx_wmask(v) ((v)->vx_wmask) ++ ++#define vx_current_wmask() __vx_wmask(current_vx_info()) ++ ++ ++#define __vx_state(v) ((v) ? ((v)->vx_state) : 0) ++ ++#define vx_info_state(v, m) (__vx_state(v) & (m)) ++ ++ ++#define __nx_state(n) ((n) ? 
((n)->nx_state) : 0) ++ ++#define nx_info_state(n, m) (__nx_state(n) & (m)) ++ ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/cacct.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/cacct.h +--- linux-3.2.34/include/linux/vserver/cacct.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/cacct.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,15 @@ ++#ifndef _VX_CACCT_H ++#define _VX_CACCT_H ++ ++ ++enum sock_acc_field { ++ VXA_SOCK_UNSPEC = 0, ++ VXA_SOCK_UNIX, ++ VXA_SOCK_INET, ++ VXA_SOCK_INET6, ++ VXA_SOCK_PACKET, ++ VXA_SOCK_OTHER, ++ VXA_SOCK_SIZE /* array size */ ++}; ++ ++#endif /* _VX_CACCT_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/cacct_cmd.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/cacct_cmd.h +--- linux-3.2.34/include/linux/vserver/cacct_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/cacct_cmd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,23 @@ ++#ifndef _VX_CACCT_CMD_H ++#define _VX_CACCT_CMD_H ++ ++ ++/* virtual host info name commands */ ++ ++#define VCMD_sock_stat VC_CMD(VSTAT, 5, 0) ++ ++struct vcmd_sock_stat_v0 { ++ uint32_t field; ++ uint32_t count[3]; ++ uint64_t total[3]; ++}; ++ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++extern int vc_sock_stat(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CACCT_CMD_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/cacct_def.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/cacct_def.h +--- linux-3.2.34/include/linux/vserver/cacct_def.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/cacct_def.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,43 @@ ++#ifndef _VX_CACCT_DEF_H ++#define _VX_CACCT_DEF_H ++ ++#include ++#include ++ ++ ++struct _vx_sock_acc { ++ atomic_long_t count; ++ atomic_long_t total; ++}; ++ ++/* context sub struct */ ++ ++struct _vx_cacct { ++ struct _vx_sock_acc sock[VXA_SOCK_SIZE][3]; ++ atomic_t slab[8]; ++ atomic_t page[6][8]; ++}; ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ ++static inline void __dump_vx_cacct(struct _vx_cacct *cacct) ++{ ++ int i, j; ++ ++ printk("\t_vx_cacct:"); ++ for (i = 0; i < 6; i++) { ++ struct _vx_sock_acc *ptr = cacct->sock[i]; ++ ++ printk("\t [%d] =", i); ++ for (j = 0; j < 3; j++) { ++ printk(" [%d] = %8lu, %8lu", j, ++ atomic_long_read(&ptr[j].count), ++ atomic_long_read(&ptr[j].total)); ++ } ++ printk("\n"); ++ } ++} ++ ++#endif ++ ++#endif /* _VX_CACCT_DEF_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/cacct_int.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/cacct_int.h +--- linux-3.2.34/include/linux/vserver/cacct_int.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/cacct_int.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,21 @@ ++#ifndef _VX_CACCT_INT_H ++#define _VX_CACCT_INT_H ++ ++ ++#ifdef __KERNEL__ ++ ++static inline ++unsigned long vx_sock_count(struct _vx_cacct *cacct, int type, int pos) ++{ ++ return atomic_long_read(&cacct->sock[type][pos].count); ++} ++ ++ ++static inline ++unsigned long vx_sock_total(struct _vx_cacct *cacct, int type, int pos) ++{ ++ return atomic_long_read(&cacct->sock[type][pos].total); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CACCT_INT_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/check.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/check.h +--- linux-3.2.34/include/linux/vserver/check.h 1970-01-01 01:00:00.000000000 +0100 ++++ 
linux-3.2.34-vs2.3.2.15/include/linux/vserver/check.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,89 @@ ++#ifndef _VS_CHECK_H ++#define _VS_CHECK_H ++ ++ ++#define MAX_S_CONTEXT 65535 /* Arbitrary limit */ ++ ++#ifdef CONFIG_VSERVER_DYNAMIC_IDS ++#define MIN_D_CONTEXT 49152 /* dynamic contexts start here */ ++#else ++#define MIN_D_CONTEXT 65536 ++#endif ++ ++/* check conditions */ ++ ++#define VS_ADMIN 0x0001 ++#define VS_WATCH 0x0002 ++#define VS_HIDE 0x0004 ++#define VS_HOSTID 0x0008 ++ ++#define VS_IDENT 0x0010 ++#define VS_EQUIV 0x0020 ++#define VS_PARENT 0x0040 ++#define VS_CHILD 0x0080 ++ ++#define VS_ARG_MASK 0x00F0 ++ ++#define VS_DYNAMIC 0x0100 ++#define VS_STATIC 0x0200 ++ ++#define VS_ATR_MASK 0x0F00 ++ ++#ifdef CONFIG_VSERVER_PRIVACY ++#define VS_ADMIN_P (0) ++#define VS_WATCH_P (0) ++#else ++#define VS_ADMIN_P VS_ADMIN ++#define VS_WATCH_P VS_WATCH ++#endif ++ ++#define VS_HARDIRQ 0x1000 ++#define VS_SOFTIRQ 0x2000 ++#define VS_IRQ 0x4000 ++ ++#define VS_IRQ_MASK 0xF000 ++ ++#include ++ ++/* ++ * check current context for ADMIN/WATCH and ++ * optionally against supplied argument ++ */ ++static inline int __vs_check(int cid, int id, unsigned int mode) ++{ ++ if (mode & VS_ARG_MASK) { ++ if ((mode & VS_IDENT) && (id == cid)) ++ return 1; ++ } ++ if (mode & VS_ATR_MASK) { ++ if ((mode & VS_DYNAMIC) && ++ (id >= MIN_D_CONTEXT) && ++ (id <= MAX_S_CONTEXT)) ++ return 1; ++ if ((mode & VS_STATIC) && ++ (id > 1) && (id < MIN_D_CONTEXT)) ++ return 1; ++ } ++ if (mode & VS_IRQ_MASK) { ++ if ((mode & VS_IRQ) && unlikely(in_interrupt())) ++ return 1; ++ if ((mode & VS_HARDIRQ) && unlikely(in_irq())) ++ return 1; ++ if ((mode & VS_SOFTIRQ) && unlikely(in_softirq())) ++ return 1; ++ } ++ return (((mode & VS_ADMIN) && (cid == 0)) || ++ ((mode & VS_WATCH) && (cid == 1)) || ++ ((mode & VS_HOSTID) && (id == 0))); ++} ++ ++#define vx_check(c, m) __vs_check(vx_current_xid(), c, (m) | VS_IRQ) ++ ++#define vx_weak_check(c, m) ((m) ? vx_check(c, m) : 1) ++ ++ ++#define nx_check(c, m) __vs_check(nx_current_nid(), c, m) ++ ++#define nx_weak_check(c, m) ((m) ? 
nx_check(c, m) : 1) ++ ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/context.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/context.h +--- linux-3.2.34/include/linux/vserver/context.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/context.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,188 @@ ++#ifndef _VX_CONTEXT_H ++#define _VX_CONTEXT_H ++ ++#include ++#include ++ ++ ++/* context flags */ ++ ++#define VXF_INFO_SCHED 0x00000002 ++#define VXF_INFO_NPROC 0x00000004 ++#define VXF_INFO_PRIVATE 0x00000008 ++ ++#define VXF_INFO_INIT 0x00000010 ++#define VXF_INFO_HIDE 0x00000020 ++#define VXF_INFO_ULIMIT 0x00000040 ++#define VXF_INFO_NSPACE 0x00000080 ++ ++#define VXF_SCHED_HARD 0x00000100 ++#define VXF_SCHED_PRIO 0x00000200 ++#define VXF_SCHED_PAUSE 0x00000400 ++ ++#define VXF_VIRT_MEM 0x00010000 ++#define VXF_VIRT_UPTIME 0x00020000 ++#define VXF_VIRT_CPU 0x00040000 ++#define VXF_VIRT_LOAD 0x00080000 ++#define VXF_VIRT_TIME 0x00100000 ++ ++#define VXF_HIDE_MOUNT 0x01000000 ++/* was VXF_HIDE_NETIF 0x02000000 */ ++#define VXF_HIDE_VINFO 0x04000000 ++ ++#define VXF_STATE_SETUP (1ULL << 32) ++#define VXF_STATE_INIT (1ULL << 33) ++#define VXF_STATE_ADMIN (1ULL << 34) ++ ++#define VXF_SC_HELPER (1ULL << 36) ++#define VXF_REBOOT_KILL (1ULL << 37) ++#define VXF_PERSISTENT (1ULL << 38) ++ ++#define VXF_FORK_RSS (1ULL << 48) ++#define VXF_PROLIFIC (1ULL << 49) ++ ++#define VXF_IGNEG_NICE (1ULL << 52) ++ ++#define VXF_ONE_TIME (0x0007ULL << 32) ++ ++#define VXF_INIT_SET (VXF_STATE_SETUP | VXF_STATE_INIT | VXF_STATE_ADMIN) ++ ++ ++/* context migration */ ++ ++#define VXM_SET_INIT 0x00000001 ++#define VXM_SET_REAPER 0x00000002 ++ ++/* context caps */ ++ ++#define VXC_SET_UTSNAME 0x00000001 ++#define VXC_SET_RLIMIT 0x00000002 ++#define VXC_FS_SECURITY 0x00000004 ++#define VXC_FS_TRUSTED 0x00000008 ++#define VXC_TIOCSTI 0x00000010 ++ ++/* was VXC_RAW_ICMP 0x00000100 */ ++#define VXC_SYSLOG 0x00001000 ++#define VXC_OOM_ADJUST 0x00002000 ++#define VXC_AUDIT_CONTROL 0x00004000 ++ ++#define VXC_SECURE_MOUNT 0x00010000 ++#define VXC_SECURE_REMOUNT 0x00020000 ++#define VXC_BINARY_MOUNT 0x00040000 ++ ++#define VXC_QUOTA_CTL 0x00100000 ++#define VXC_ADMIN_MAPPER 0x00200000 ++#define VXC_ADMIN_CLOOP 0x00400000 ++ ++#define VXC_KTHREAD 0x01000000 ++#define VXC_NAMESPACE 0x02000000 ++ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++ ++#include "limit_def.h" ++#include "sched_def.h" ++#include "cvirt_def.h" ++#include "cacct_def.h" ++#include "device_def.h" ++ ++#define VX_SPACES 2 ++ ++struct _vx_info_pc { ++ struct _vx_sched_pc sched_pc; ++ struct _vx_cvirt_pc cvirt_pc; ++}; ++ ++struct _vx_space { ++ unsigned long vx_nsmask; /* assignment mask */ ++ struct nsproxy *vx_nsproxy; /* private namespaces */ ++ struct fs_struct *vx_fs; /* private namespace fs */ ++ const struct cred *vx_cred; /* task credentials */ ++}; ++ ++struct vx_info { ++ struct hlist_node vx_hlist; /* linked list of contexts */ ++ xid_t vx_id; /* context id */ ++ atomic_t vx_usecnt; /* usage count */ ++ atomic_t vx_tasks; /* tasks count */ ++ struct vx_info *vx_parent; /* parent context */ ++ int vx_state; /* context state */ ++ ++ struct _vx_space space[VX_SPACES]; /* namespace store */ ++ ++ uint64_t vx_flags; /* context flags */ ++ uint64_t vx_ccaps; /* context caps (vserver) */ ++ uint64_t vx_umask; /* unshare mask (guest) */ ++ uint64_t vx_wmask; /* warn mask (guest) */ ++ kernel_cap_t vx_bcaps; /* bounding caps (system) */ ++ ++ struct task_struct *vx_reaper; /* guest 
reaper process */ ++ pid_t vx_initpid; /* PID of guest init */ ++ int64_t vx_badness_bias; /* OOM points bias */ ++ ++ struct _vx_limit limit; /* vserver limits */ ++ struct _vx_sched sched; /* vserver scheduler */ ++ struct _vx_cvirt cvirt; /* virtual/bias stuff */ ++ struct _vx_cacct cacct; /* context accounting */ ++ ++ struct _vx_device dmap; /* default device map targets */ ++ ++#ifndef CONFIG_SMP ++ struct _vx_info_pc info_pc; /* per cpu data */ ++#else ++ struct _vx_info_pc *ptr_pc; /* per cpu array */ ++#endif ++ ++ wait_queue_head_t vx_wait; /* context exit waitqueue */ ++ int reboot_cmd; /* last sys_reboot() cmd */ ++ int exit_code; /* last process exit code */ ++ ++ char vx_name[65]; /* vserver name */ ++}; ++ ++#ifndef CONFIG_SMP ++#define vx_ptr_pc(vxi) (&(vxi)->info_pc) ++#define vx_per_cpu(vxi, v, id) vx_ptr_pc(vxi)->v ++#else ++#define vx_ptr_pc(vxi) ((vxi)->ptr_pc) ++#define vx_per_cpu(vxi, v, id) per_cpu_ptr(vx_ptr_pc(vxi), id)->v ++#endif ++ ++#define vx_cpu(vxi, v) vx_per_cpu(vxi, v, smp_processor_id()) ++ ++ ++struct vx_info_save { ++ struct vx_info *vxi; ++ xid_t xid; ++}; ++ ++ ++/* status flags */ ++ ++#define VXS_HASHED 0x0001 ++#define VXS_PAUSED 0x0010 ++#define VXS_SHUTDOWN 0x0100 ++#define VXS_HELPER 0x1000 ++#define VXS_RELEASED 0x8000 ++ ++ ++extern void claim_vx_info(struct vx_info *, struct task_struct *); ++extern void release_vx_info(struct vx_info *, struct task_struct *); ++ ++extern struct vx_info *lookup_vx_info(int); ++extern struct vx_info *lookup_or_create_vx_info(int); ++ ++extern int get_xid_list(int, unsigned int *, int); ++extern int xid_is_hashed(xid_t); ++ ++extern int vx_migrate_task(struct task_struct *, struct vx_info *, int); ++ ++extern long vs_state_change(struct vx_info *, unsigned int); ++ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CONTEXT_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/context_cmd.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/context_cmd.h +--- linux-3.2.34/include/linux/vserver/context_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/context_cmd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,162 @@ ++#ifndef _VX_CONTEXT_CMD_H ++#define _VX_CONTEXT_CMD_H ++ ++ ++/* vinfo commands */ ++ ++#define VCMD_task_xid VC_CMD(VINFO, 1, 0) ++ ++#ifdef __KERNEL__ ++extern int vc_task_xid(uint32_t); ++ ++#endif /* __KERNEL__ */ ++ ++#define VCMD_vx_info VC_CMD(VINFO, 5, 0) ++ ++struct vcmd_vx_info_v0 { ++ uint32_t xid; ++ uint32_t initpid; ++ /* more to come */ ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_vx_info(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++#define VCMD_ctx_stat VC_CMD(VSTAT, 0, 0) ++ ++struct vcmd_ctx_stat_v0 { ++ uint32_t usecnt; ++ uint32_t tasks; ++ /* more to come */ ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_ctx_stat(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++/* context commands */ ++ ++#define VCMD_ctx_create_v0 VC_CMD(VPROC, 1, 0) ++#define VCMD_ctx_create VC_CMD(VPROC, 1, 1) ++ ++struct vcmd_ctx_create { ++ uint64_t flagword; ++}; ++ ++#define VCMD_ctx_migrate_v0 VC_CMD(PROCMIG, 1, 0) ++#define VCMD_ctx_migrate VC_CMD(PROCMIG, 1, 1) ++ ++struct vcmd_ctx_migrate { ++ uint64_t flagword; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_ctx_create(uint32_t, void __user *); ++extern int vc_ctx_migrate(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* flag commands */ ++ ++#define VCMD_get_cflags VC_CMD(FLAGS, 1, 0) ++#define VCMD_set_cflags VC_CMD(FLAGS, 2, 0) ++ ++struct 
vcmd_ctx_flags_v0 { ++ uint64_t flagword; ++ uint64_t mask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_cflags(struct vx_info *, void __user *); ++extern int vc_set_cflags(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* context caps commands */ ++ ++#define VCMD_get_ccaps VC_CMD(FLAGS, 3, 1) ++#define VCMD_set_ccaps VC_CMD(FLAGS, 4, 1) ++ ++struct vcmd_ctx_caps_v1 { ++ uint64_t ccaps; ++ uint64_t cmask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_ccaps(struct vx_info *, void __user *); ++extern int vc_set_ccaps(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* bcaps commands */ ++ ++#define VCMD_get_bcaps VC_CMD(FLAGS, 9, 0) ++#define VCMD_set_bcaps VC_CMD(FLAGS, 10, 0) ++ ++struct vcmd_bcaps { ++ uint64_t bcaps; ++ uint64_t bmask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_bcaps(struct vx_info *, void __user *); ++extern int vc_set_bcaps(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* umask commands */ ++ ++#define VCMD_get_umask VC_CMD(FLAGS, 13, 0) ++#define VCMD_set_umask VC_CMD(FLAGS, 14, 0) ++ ++struct vcmd_umask { ++ uint64_t umask; ++ uint64_t mask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_umask(struct vx_info *, void __user *); ++extern int vc_set_umask(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* wmask commands */ ++ ++#define VCMD_get_wmask VC_CMD(FLAGS, 15, 0) ++#define VCMD_set_wmask VC_CMD(FLAGS, 16, 0) ++ ++struct vcmd_wmask { ++ uint64_t wmask; ++ uint64_t mask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_wmask(struct vx_info *, void __user *); ++extern int vc_set_wmask(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* OOM badness */ ++ ++#define VCMD_get_badness VC_CMD(MEMCTRL, 5, 0) ++#define VCMD_set_badness VC_CMD(MEMCTRL, 6, 0) ++ ++struct vcmd_badness_v0 { ++ int64_t bias; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_badness(struct vx_info *, void __user *); ++extern int vc_set_badness(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CONTEXT_CMD_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/cvirt.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/cvirt.h +--- linux-3.2.34/include/linux/vserver/cvirt.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/cvirt.h 2012-04-24 00:44:36.000000000 +0200 +@@ -0,0 +1,22 @@ ++#ifndef _VX_CVIRT_H ++#define _VX_CVIRT_H ++ ++ ++#ifdef __KERNEL__ ++ ++struct timespec; ++ ++void vx_vsi_boottime(struct timespec *); ++ ++void vx_vsi_uptime(struct timespec *, struct timespec *); ++ ++ ++struct vx_info; ++ ++void vx_update_load(struct vx_info *); ++ ++ ++int vx_do_syslog(int, char __user *, int); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CVIRT_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/cvirt_cmd.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/cvirt_cmd.h +--- linux-3.2.34/include/linux/vserver/cvirt_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/cvirt_cmd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,53 @@ ++#ifndef _VX_CVIRT_CMD_H ++#define _VX_CVIRT_CMD_H ++ ++ ++/* virtual host info name commands */ ++ ++#define VCMD_set_vhi_name VC_CMD(VHOST, 1, 0) ++#define VCMD_get_vhi_name VC_CMD(VHOST, 2, 0) ++ ++struct vcmd_vhi_name_v0 { ++ uint32_t field; ++ char name[65]; ++}; ++ ++ ++enum vhi_name_field { ++ VHIN_CONTEXT = 0, ++ VHIN_SYSNAME, ++ VHIN_NODENAME, ++ VHIN_RELEASE, ++ VHIN_VERSION, ++ VHIN_MACHINE, ++ VHIN_DOMAINNAME, ++}; ++ ++ 
++#ifdef __KERNEL__ ++ ++#include ++ ++extern int vc_set_vhi_name(struct vx_info *, void __user *); ++extern int vc_get_vhi_name(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++#define VCMD_virt_stat VC_CMD(VSTAT, 3, 0) ++ ++struct vcmd_virt_stat_v0 { ++ uint64_t offset; ++ uint64_t uptime; ++ uint32_t nr_threads; ++ uint32_t nr_running; ++ uint32_t nr_uninterruptible; ++ uint32_t nr_onhold; ++ uint32_t nr_forks; ++ uint32_t load[3]; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_virt_stat(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CVIRT_CMD_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/cvirt_def.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/cvirt_def.h +--- linux-3.2.34/include/linux/vserver/cvirt_def.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/cvirt_def.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,80 @@ ++#ifndef _VX_CVIRT_DEF_H ++#define _VX_CVIRT_DEF_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++ ++struct _vx_usage_stat { ++ uint64_t user; ++ uint64_t nice; ++ uint64_t system; ++ uint64_t softirq; ++ uint64_t irq; ++ uint64_t idle; ++ uint64_t iowait; ++}; ++ ++struct _vx_syslog { ++ wait_queue_head_t log_wait; ++ spinlock_t logbuf_lock; /* lock for the log buffer */ ++ ++ unsigned long log_start; /* next char to be read by syslog() */ ++ unsigned long con_start; /* next char to be sent to consoles */ ++ unsigned long log_end; /* most-recently-written-char + 1 */ ++ unsigned long logged_chars; /* #chars since last read+clear operation */ ++ ++ char log_buf[1024]; ++}; ++ ++ ++/* context sub struct */ ++ ++struct _vx_cvirt { ++ atomic_t nr_threads; /* number of current threads */ ++ atomic_t nr_running; /* number of running threads */ ++ atomic_t nr_uninterruptible; /* number of uninterruptible threads */ ++ ++ atomic_t nr_onhold; /* processes on hold */ ++ uint32_t onhold_last; /* jiffies when put on hold */ ++ ++ struct timespec bias_ts; /* time offset to the host */ ++ struct timespec bias_idle; ++ struct timespec bias_uptime; /* context creation point */ ++ uint64_t bias_clock; /* offset in clock_t */ ++ ++ spinlock_t load_lock; /* lock for the load averages */ ++ atomic_t load_updates; /* nr of load updates done so far */ ++ uint32_t load_last; /* last time load was calculated */ ++ uint32_t load[3]; /* load averages 1,5,15 */ ++ ++ atomic_t total_forks; /* number of forks so far */ ++ ++ struct _vx_syslog syslog; ++}; ++ ++struct _vx_cvirt_pc { ++ struct _vx_usage_stat cpustat; ++}; ++ ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ ++static inline void __dump_vx_cvirt(struct _vx_cvirt *cvirt) ++{ ++ printk("\t_vx_cvirt:\n"); ++ printk("\t threads: %4d, %4d, %4d, %4d\n", ++ atomic_read(&cvirt->nr_threads), ++ atomic_read(&cvirt->nr_running), ++ atomic_read(&cvirt->nr_uninterruptible), ++ atomic_read(&cvirt->nr_onhold)); ++ /* add rest here */ ++ printk("\t total_forks = %d\n", atomic_read(&cvirt->total_forks)); ++} ++ ++#endif ++ ++#endif /* _VX_CVIRT_DEF_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/debug.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/debug.h +--- linux-3.2.34/include/linux/vserver/debug.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/debug.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,145 @@ ++#ifndef _VX_DEBUG_H ++#define _VX_DEBUG_H ++ ++ ++#define VXD_CBIT(n, m) (vs_debug_ ## n & (1 << (m))) ++#define VXD_CMIN(n, m) (vs_debug_ ## n > (m)) ++#define VXD_MASK(n, m) 
(vs_debug_ ## n & (m)) ++ ++#define VXD_DEV(d) (d), (d)->bd_inode->i_ino, \ ++ imajor((d)->bd_inode), iminor((d)->bd_inode) ++#define VXF_DEV "%p[%lu,%d:%d]" ++ ++#if defined(CONFIG_QUOTES_UTF8) ++#define VS_Q_LQM "\xc2\xbb" ++#define VS_Q_RQM "\xc2\xab" ++#elif defined(CONFIG_QUOTES_ASCII) ++#define VS_Q_LQM "\x27" ++#define VS_Q_RQM "\x27" ++#else ++#define VS_Q_LQM "\xbb" ++#define VS_Q_RQM "\xab" ++#endif ++ ++#define VS_Q(f) VS_Q_LQM f VS_Q_RQM ++ ++ ++#define vxd_path(p) \ ++ ({ static char _buffer[PATH_MAX]; \ ++ d_path(p, _buffer, sizeof(_buffer)); }) ++ ++#define vxd_cond_path(n) \ ++ ((n) ? vxd_path(&(n)->path) : "" ) ++ ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ ++extern unsigned int vs_debug_switch; ++extern unsigned int vs_debug_xid; ++extern unsigned int vs_debug_nid; ++extern unsigned int vs_debug_tag; ++extern unsigned int vs_debug_net; ++extern unsigned int vs_debug_limit; ++extern unsigned int vs_debug_cres; ++extern unsigned int vs_debug_dlim; ++extern unsigned int vs_debug_quota; ++extern unsigned int vs_debug_cvirt; ++extern unsigned int vs_debug_space; ++extern unsigned int vs_debug_perm; ++extern unsigned int vs_debug_misc; ++ ++ ++#define VX_LOGLEVEL "vxD: " ++#define VX_PROC_FMT "%p: " ++#define VX_PROCESS current ++ ++#define vxdprintk(c, f, x...) \ ++ do { \ ++ if (c) \ ++ printk(VX_LOGLEVEL VX_PROC_FMT f "\n", \ ++ VX_PROCESS , ##x); \ ++ } while (0) ++ ++#define vxlprintk(c, f, x...) \ ++ do { \ ++ if (c) \ ++ printk(VX_LOGLEVEL f " @%s:%d\n", x); \ ++ } while (0) ++ ++#define vxfprintk(c, f, x...) \ ++ do { \ ++ if (c) \ ++ printk(VX_LOGLEVEL f " %s@%s:%d\n", x); \ ++ } while (0) ++ ++ ++struct vx_info; ++ ++void dump_vx_info(struct vx_info *, int); ++void dump_vx_info_inactive(int); ++ ++#else /* CONFIG_VSERVER_DEBUG */ ++ ++#define vs_debug_switch 0 ++#define vs_debug_xid 0 ++#define vs_debug_nid 0 ++#define vs_debug_tag 0 ++#define vs_debug_net 0 ++#define vs_debug_limit 0 ++#define vs_debug_cres 0 ++#define vs_debug_dlim 0 ++#define vs_debug_quota 0 ++#define vs_debug_cvirt 0 ++#define vs_debug_space 0 ++#define vs_debug_perm 0 ++#define vs_debug_misc 0 ++ ++#define vxdprintk(x...) do { } while (0) ++#define vxlprintk(x...) do { } while (0) ++#define vxfprintk(x...) do { } while (0) ++ ++#endif /* CONFIG_VSERVER_DEBUG */ ++ ++ ++#ifdef CONFIG_VSERVER_WARN ++ ++#define VX_WARNLEVEL KERN_WARNING "vxW: " ++#define VX_WARN_TASK "[" VS_Q("%s") ",%u:#%u|%u|%u] " ++#define VX_WARN_XID "[xid #%u] " ++#define VX_WARN_NID "[nid #%u] " ++#define VX_WARN_TAG "[tag #%u] " ++ ++#define vxwprintk(c, f, x...) \ ++ do { \ ++ if (c) \ ++ printk(VX_WARNLEVEL f "\n", ##x); \ ++ } while (0) ++ ++#else /* CONFIG_VSERVER_WARN */ ++ ++#define vxwprintk(x...) do { } while (0) ++ ++#endif /* CONFIG_VSERVER_WARN */ ++ ++#define vxwprintk_task(c, f, x...) \ ++ vxwprintk(c, VX_WARN_TASK f, \ ++ current->comm, current->pid, \ ++ current->xid, current->nid, current->tag, ##x) ++#define vxwprintk_xid(c, f, x...) \ ++ vxwprintk(c, VX_WARN_XID f, current->xid, x) ++#define vxwprintk_nid(c, f, x...) \ ++ vxwprintk(c, VX_WARN_NID f, current->nid, x) ++#define vxwprintk_tag(c, f, x...) \ ++ vxwprintk(c, VX_WARN_TAG f, current->tag, x) ++ ++#ifdef CONFIG_VSERVER_DEBUG ++#define vxd_assert_lock(l) assert_spin_locked(l) ++#define vxd_assert(c, f, x...) vxlprintk(!(c), \ ++ "assertion [" f "] failed.", ##x, __FILE__, __LINE__) ++#else ++#define vxd_assert_lock(l) do { } while (0) ++#define vxd_assert(c, f, x...) 
do { } while (0) ++#endif ++ ++ ++#endif /* _VX_DEBUG_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/debug_cmd.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/debug_cmd.h +--- linux-3.2.34/include/linux/vserver/debug_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/debug_cmd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,58 @@ ++#ifndef _VX_DEBUG_CMD_H ++#define _VX_DEBUG_CMD_H ++ ++ ++/* debug commands */ ++ ++#define VCMD_dump_history VC_CMD(DEBUG, 1, 0) ++ ++#define VCMD_read_history VC_CMD(DEBUG, 5, 0) ++#define VCMD_read_monitor VC_CMD(DEBUG, 6, 0) ++ ++struct vcmd_read_history_v0 { ++ uint32_t index; ++ uint32_t count; ++ char __user *data; ++}; ++ ++struct vcmd_read_monitor_v0 { ++ uint32_t index; ++ uint32_t count; ++ char __user *data; ++}; ++ ++ ++#ifdef __KERNEL__ ++ ++#ifdef CONFIG_COMPAT ++ ++#include ++ ++struct vcmd_read_history_v0_x32 { ++ uint32_t index; ++ uint32_t count; ++ compat_uptr_t data_ptr; ++}; ++ ++struct vcmd_read_monitor_v0_x32 { ++ uint32_t index; ++ uint32_t count; ++ compat_uptr_t data_ptr; ++}; ++ ++#endif /* CONFIG_COMPAT */ ++ ++extern int vc_dump_history(uint32_t); ++ ++extern int vc_read_history(uint32_t, void __user *); ++extern int vc_read_monitor(uint32_t, void __user *); ++ ++#ifdef CONFIG_COMPAT ++ ++extern int vc_read_history_x32(uint32_t, void __user *); ++extern int vc_read_monitor_x32(uint32_t, void __user *); ++ ++#endif /* CONFIG_COMPAT */ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_DEBUG_CMD_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/device.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/device.h +--- linux-3.2.34/include/linux/vserver/device.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/device.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,15 @@ ++#ifndef _VX_DEVICE_H ++#define _VX_DEVICE_H ++ ++ ++#define DATTR_CREATE 0x00000001 ++#define DATTR_OPEN 0x00000002 ++ ++#define DATTR_REMAP 0x00000010 ++ ++#define DATTR_MASK 0x00000013 ++ ++ ++#else /* _VX_DEVICE_H */ ++#warning duplicate inclusion ++#endif /* _VX_DEVICE_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/device_cmd.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/device_cmd.h +--- linux-3.2.34/include/linux/vserver/device_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/device_cmd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,44 @@ ++#ifndef _VX_DEVICE_CMD_H ++#define _VX_DEVICE_CMD_H ++ ++ ++/* device vserver commands */ ++ ++#define VCMD_set_mapping VC_CMD(DEVICE, 1, 0) ++#define VCMD_unset_mapping VC_CMD(DEVICE, 2, 0) ++ ++struct vcmd_set_mapping_v0 { ++ const char __user *device; ++ const char __user *target; ++ uint32_t flags; ++}; ++ ++ ++#ifdef __KERNEL__ ++ ++#ifdef CONFIG_COMPAT ++ ++#include ++ ++struct vcmd_set_mapping_v0_x32 { ++ compat_uptr_t device_ptr; ++ compat_uptr_t target_ptr; ++ uint32_t flags; ++}; ++ ++#endif /* CONFIG_COMPAT */ ++ ++#include ++ ++extern int vc_set_mapping(struct vx_info *, void __user *); ++extern int vc_unset_mapping(struct vx_info *, void __user *); ++ ++#ifdef CONFIG_COMPAT ++ ++extern int vc_set_mapping_x32(struct vx_info *, void __user *); ++extern int vc_unset_mapping_x32(struct vx_info *, void __user *); ++ ++#endif /* CONFIG_COMPAT */ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_DEVICE_CMD_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/device_def.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/device_def.h +--- 
linux-3.2.34/include/linux/vserver/device_def.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/device_def.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,17 @@ ++#ifndef _VX_DEVICE_DEF_H ++#define _VX_DEVICE_DEF_H ++ ++#include ++ ++struct vx_dmap_target { ++ dev_t target; ++ uint32_t flags; ++}; ++ ++struct _vx_device { ++#ifdef CONFIG_VSERVER_DEVICE ++ struct vx_dmap_target targets[2]; ++#endif ++}; ++ ++#endif /* _VX_DEVICE_DEF_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/dlimit.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/dlimit.h +--- linux-3.2.34/include/linux/vserver/dlimit.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/dlimit.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,54 @@ ++#ifndef _VX_DLIMIT_H ++#define _VX_DLIMIT_H ++ ++#include "switch.h" ++ ++ ++#ifdef __KERNEL__ ++ ++/* keep in sync with CDLIM_INFINITY */ ++ ++#define DLIM_INFINITY (~0ULL) ++ ++#include ++#include ++ ++struct super_block; ++ ++struct dl_info { ++ struct hlist_node dl_hlist; /* linked list of contexts */ ++ struct rcu_head dl_rcu; /* the rcu head */ ++ tag_t dl_tag; /* context tag */ ++ atomic_t dl_usecnt; /* usage count */ ++ atomic_t dl_refcnt; /* reference count */ ++ ++ struct super_block *dl_sb; /* associated superblock */ ++ ++ spinlock_t dl_lock; /* protect the values */ ++ ++ unsigned long long dl_space_used; /* used space in bytes */ ++ unsigned long long dl_space_total; /* maximum space in bytes */ ++ unsigned long dl_inodes_used; /* used inodes */ ++ unsigned long dl_inodes_total; /* maximum inodes */ ++ ++ unsigned int dl_nrlmult; /* non root limit mult */ ++}; ++ ++struct rcu_head; ++ ++extern void rcu_free_dl_info(struct rcu_head *); ++extern void unhash_dl_info(struct dl_info *); ++ ++extern struct dl_info *locate_dl_info(struct super_block *, tag_t); ++ ++ ++struct kstatfs; ++ ++extern void vx_vsi_statfs(struct super_block *, struct kstatfs *); ++ ++typedef uint64_t dlsize_t; ++ ++#endif /* __KERNEL__ */ ++#else /* _VX_DLIMIT_H */ ++#warning duplicate inclusion ++#endif /* _VX_DLIMIT_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/dlimit_cmd.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/dlimit_cmd.h +--- linux-3.2.34/include/linux/vserver/dlimit_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/dlimit_cmd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,109 @@ ++#ifndef _VX_DLIMIT_CMD_H ++#define _VX_DLIMIT_CMD_H ++ ++ ++/* dlimit vserver commands */ ++ ++#define VCMD_add_dlimit VC_CMD(DLIMIT, 1, 0) ++#define VCMD_rem_dlimit VC_CMD(DLIMIT, 2, 0) ++ ++#define VCMD_set_dlimit VC_CMD(DLIMIT, 5, 0) ++#define VCMD_get_dlimit VC_CMD(DLIMIT, 6, 0) ++ ++struct vcmd_ctx_dlimit_base_v0 { ++ const char __user *name; ++ uint32_t flags; ++}; ++ ++struct vcmd_ctx_dlimit_v0 { ++ const char __user *name; ++ uint32_t space_used; /* used space in kbytes */ ++ uint32_t space_total; /* maximum space in kbytes */ ++ uint32_t inodes_used; /* used inodes */ ++ uint32_t inodes_total; /* maximum inodes */ ++ uint32_t reserved; /* reserved for root in % */ ++ uint32_t flags; ++}; ++ ++#define CDLIM_UNSET ((uint32_t)0UL) ++#define CDLIM_INFINITY ((uint32_t)~0UL) ++#define CDLIM_KEEP ((uint32_t)~1UL) ++ ++#define DLIME_UNIT 0 ++#define DLIME_KILO 1 ++#define DLIME_MEGA 2 ++#define DLIME_GIGA 3 ++ ++#define DLIMF_SHIFT 0x10 ++ ++#define DLIMS_USED 0 ++#define DLIMS_TOTAL 2 ++ ++static inline ++uint64_t dlimit_space_32to64(uint32_t val, uint32_t flags, 
int shift) ++{ ++ int exp = (flags & DLIMF_SHIFT) ? ++ (flags >> shift) & DLIME_GIGA : DLIME_KILO; ++ return ((uint64_t)val) << (10 * exp); ++} ++ ++static inline ++uint32_t dlimit_space_64to32(uint64_t val, uint32_t *flags, int shift) ++{ ++ int exp = 0; ++ ++ if (*flags & DLIMF_SHIFT) { ++ while (val > (1LL << 32) && (exp < 3)) { ++ val >>= 10; ++ exp++; ++ } ++ *flags &= ~(DLIME_GIGA << shift); ++ *flags |= exp << shift; ++ } else ++ val >>= 10; ++ return val; ++} ++ ++#ifdef __KERNEL__ ++ ++#ifdef CONFIG_COMPAT ++ ++#include ++ ++struct vcmd_ctx_dlimit_base_v0_x32 { ++ compat_uptr_t name_ptr; ++ uint32_t flags; ++}; ++ ++struct vcmd_ctx_dlimit_v0_x32 { ++ compat_uptr_t name_ptr; ++ uint32_t space_used; /* used space in kbytes */ ++ uint32_t space_total; /* maximum space in kbytes */ ++ uint32_t inodes_used; /* used inodes */ ++ uint32_t inodes_total; /* maximum inodes */ ++ uint32_t reserved; /* reserved for root in % */ ++ uint32_t flags; ++}; ++ ++#endif /* CONFIG_COMPAT */ ++ ++#include ++ ++extern int vc_add_dlimit(uint32_t, void __user *); ++extern int vc_rem_dlimit(uint32_t, void __user *); ++ ++extern int vc_set_dlimit(uint32_t, void __user *); ++extern int vc_get_dlimit(uint32_t, void __user *); ++ ++#ifdef CONFIG_COMPAT ++ ++extern int vc_add_dlimit_x32(uint32_t, void __user *); ++extern int vc_rem_dlimit_x32(uint32_t, void __user *); ++ ++extern int vc_set_dlimit_x32(uint32_t, void __user *); ++extern int vc_get_dlimit_x32(uint32_t, void __user *); ++ ++#endif /* CONFIG_COMPAT */ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_DLIMIT_CMD_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/global.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/global.h +--- linux-3.2.34/include/linux/vserver/global.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/global.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,19 @@ ++#ifndef _VX_GLOBAL_H ++#define _VX_GLOBAL_H ++ ++ ++extern atomic_t vx_global_ctotal; ++extern atomic_t vx_global_cactive; ++ ++extern atomic_t nx_global_ctotal; ++extern atomic_t nx_global_cactive; ++ ++extern atomic_t vs_global_nsproxy; ++extern atomic_t vs_global_fs; ++extern atomic_t vs_global_mnt_ns; ++extern atomic_t vs_global_uts_ns; ++extern atomic_t vs_global_user_ns; ++extern atomic_t vs_global_pid_ns; ++ ++ ++#endif /* _VX_GLOBAL_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/history.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/history.h +--- linux-3.2.34/include/linux/vserver/history.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/history.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,197 @@ ++#ifndef _VX_HISTORY_H ++#define _VX_HISTORY_H ++ ++ ++enum { ++ VXH_UNUSED = 0, ++ VXH_THROW_OOPS = 1, ++ ++ VXH_GET_VX_INFO, ++ VXH_PUT_VX_INFO, ++ VXH_INIT_VX_INFO, ++ VXH_SET_VX_INFO, ++ VXH_CLR_VX_INFO, ++ VXH_CLAIM_VX_INFO, ++ VXH_RELEASE_VX_INFO, ++ VXH_ALLOC_VX_INFO, ++ VXH_DEALLOC_VX_INFO, ++ VXH_HASH_VX_INFO, ++ VXH_UNHASH_VX_INFO, ++ VXH_LOC_VX_INFO, ++ VXH_LOOKUP_VX_INFO, ++ VXH_CREATE_VX_INFO, ++}; ++ ++struct _vxhe_vxi { ++ struct vx_info *ptr; ++ unsigned xid; ++ unsigned usecnt; ++ unsigned tasks; ++}; ++ ++struct _vxhe_set_clr { ++ void *data; ++}; ++ ++struct _vxhe_loc_lookup { ++ unsigned arg; ++}; ++ ++struct _vx_hist_entry { ++ void *loc; ++ unsigned short seq; ++ unsigned short type; ++ struct _vxhe_vxi vxi; ++ union { ++ struct _vxhe_set_clr sc; ++ struct _vxhe_loc_lookup ll; ++ }; ++}; ++ ++#ifdef CONFIG_VSERVER_HISTORY ++ ++extern 
unsigned volatile int vxh_active; ++ ++struct _vx_hist_entry *vxh_advance(void *loc); ++ ++ ++static inline ++void __vxh_copy_vxi(struct _vx_hist_entry *entry, struct vx_info *vxi) ++{ ++ entry->vxi.ptr = vxi; ++ if (vxi) { ++ entry->vxi.usecnt = atomic_read(&vxi->vx_usecnt); ++ entry->vxi.tasks = atomic_read(&vxi->vx_tasks); ++ entry->vxi.xid = vxi->vx_id; ++ } ++} ++ ++ ++#define __HERE__ current_text_addr() ++ ++#define __VXH_BODY(__type, __data, __here) \ ++ struct _vx_hist_entry *entry; \ ++ \ ++ preempt_disable(); \ ++ entry = vxh_advance(__here); \ ++ __data; \ ++ entry->type = __type; \ ++ preempt_enable(); ++ ++ ++ /* pass vxi only */ ++ ++#define __VXH_SMPL \ ++ __vxh_copy_vxi(entry, vxi) ++ ++static inline ++void __vxh_smpl(struct vx_info *vxi, int __type, void *__here) ++{ ++ __VXH_BODY(__type, __VXH_SMPL, __here) ++} ++ ++ /* pass vxi and data (void *) */ ++ ++#define __VXH_DATA \ ++ __vxh_copy_vxi(entry, vxi); \ ++ entry->sc.data = data ++ ++static inline ++void __vxh_data(struct vx_info *vxi, void *data, ++ int __type, void *__here) ++{ ++ __VXH_BODY(__type, __VXH_DATA, __here) ++} ++ ++ /* pass vxi and arg (long) */ ++ ++#define __VXH_LONG \ ++ __vxh_copy_vxi(entry, vxi); \ ++ entry->ll.arg = arg ++ ++static inline ++void __vxh_long(struct vx_info *vxi, long arg, ++ int __type, void *__here) ++{ ++ __VXH_BODY(__type, __VXH_LONG, __here) ++} ++ ++ ++static inline ++void __vxh_throw_oops(void *__here) ++{ ++ __VXH_BODY(VXH_THROW_OOPS, {}, __here); ++ /* prevent further acquisition */ ++ vxh_active = 0; ++} ++ ++ ++#define vxh_throw_oops() __vxh_throw_oops(__HERE__); ++ ++#define __vxh_get_vx_info(v, h) __vxh_smpl(v, VXH_GET_VX_INFO, h); ++#define __vxh_put_vx_info(v, h) __vxh_smpl(v, VXH_PUT_VX_INFO, h); ++ ++#define __vxh_init_vx_info(v, d, h) \ ++ __vxh_data(v, d, VXH_INIT_VX_INFO, h); ++#define __vxh_set_vx_info(v, d, h) \ ++ __vxh_data(v, d, VXH_SET_VX_INFO, h); ++#define __vxh_clr_vx_info(v, d, h) \ ++ __vxh_data(v, d, VXH_CLR_VX_INFO, h); ++ ++#define __vxh_claim_vx_info(v, d, h) \ ++ __vxh_data(v, d, VXH_CLAIM_VX_INFO, h); ++#define __vxh_release_vx_info(v, d, h) \ ++ __vxh_data(v, d, VXH_RELEASE_VX_INFO, h); ++ ++#define vxh_alloc_vx_info(v) \ ++ __vxh_smpl(v, VXH_ALLOC_VX_INFO, __HERE__); ++#define vxh_dealloc_vx_info(v) \ ++ __vxh_smpl(v, VXH_DEALLOC_VX_INFO, __HERE__); ++ ++#define vxh_hash_vx_info(v) \ ++ __vxh_smpl(v, VXH_HASH_VX_INFO, __HERE__); ++#define vxh_unhash_vx_info(v) \ ++ __vxh_smpl(v, VXH_UNHASH_VX_INFO, __HERE__); ++ ++#define vxh_loc_vx_info(v, l) \ ++ __vxh_long(v, l, VXH_LOC_VX_INFO, __HERE__); ++#define vxh_lookup_vx_info(v, l) \ ++ __vxh_long(v, l, VXH_LOOKUP_VX_INFO, __HERE__); ++#define vxh_create_vx_info(v, l) \ ++ __vxh_long(v, l, VXH_CREATE_VX_INFO, __HERE__); ++ ++extern void vxh_dump_history(void); ++ ++ ++#else /* CONFIG_VSERVER_HISTORY */ ++ ++#define __HERE__ 0 ++ ++#define vxh_throw_oops() do { } while (0) ++ ++#define __vxh_get_vx_info(v, h) do { } while (0) ++#define __vxh_put_vx_info(v, h) do { } while (0) ++ ++#define __vxh_init_vx_info(v, d, h) do { } while (0) ++#define __vxh_set_vx_info(v, d, h) do { } while (0) ++#define __vxh_clr_vx_info(v, d, h) do { } while (0) ++ ++#define __vxh_claim_vx_info(v, d, h) do { } while (0) ++#define __vxh_release_vx_info(v, d, h) do { } while (0) ++ ++#define vxh_alloc_vx_info(v) do { } while (0) ++#define vxh_dealloc_vx_info(v) do { } while (0) ++ ++#define vxh_hash_vx_info(v) do { } while (0) ++#define vxh_unhash_vx_info(v) do { } while (0) ++ ++#define vxh_loc_vx_info(v, l) do { } while (0) 
++#define vxh_lookup_vx_info(v, l) do { } while (0) ++#define vxh_create_vx_info(v, l) do { } while (0) ++ ++#define vxh_dump_history() do { } while (0) ++ ++ ++#endif /* CONFIG_VSERVER_HISTORY */ ++ ++#endif /* _VX_HISTORY_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/inode.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/inode.h +--- linux-3.2.34/include/linux/vserver/inode.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/inode.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,39 @@ ++#ifndef _VX_INODE_H ++#define _VX_INODE_H ++ ++ ++#define IATTR_TAG 0x01000000 ++ ++#define IATTR_ADMIN 0x00000001 ++#define IATTR_WATCH 0x00000002 ++#define IATTR_HIDE 0x00000004 ++#define IATTR_FLAGS 0x00000007 ++ ++#define IATTR_BARRIER 0x00010000 ++#define IATTR_IXUNLINK 0x00020000 ++#define IATTR_IMMUTABLE 0x00040000 ++#define IATTR_COW 0x00080000 ++ ++#ifdef __KERNEL__ ++ ++ ++#ifdef CONFIG_VSERVER_PROC_SECURE ++#define IATTR_PROC_DEFAULT ( IATTR_ADMIN | IATTR_HIDE ) ++#define IATTR_PROC_SYMLINK ( IATTR_ADMIN ) ++#else ++#define IATTR_PROC_DEFAULT ( IATTR_ADMIN ) ++#define IATTR_PROC_SYMLINK ( IATTR_ADMIN ) ++#endif ++ ++#define vx_hide_check(c, m) (((m) & IATTR_HIDE) ? vx_check(c, m) : 1) ++ ++#endif /* __KERNEL__ */ ++ ++/* inode ioctls */ ++ ++#define FIOC_GETXFLG _IOR('x', 5, long) ++#define FIOC_SETXFLG _IOW('x', 6, long) ++ ++#else /* _VX_INODE_H */ ++#warning duplicate inclusion ++#endif /* _VX_INODE_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/inode_cmd.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/inode_cmd.h +--- linux-3.2.34/include/linux/vserver/inode_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/inode_cmd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,59 @@ ++#ifndef _VX_INODE_CMD_H ++#define _VX_INODE_CMD_H ++ ++ ++/* inode vserver commands */ ++ ++#define VCMD_get_iattr VC_CMD(INODE, 1, 1) ++#define VCMD_set_iattr VC_CMD(INODE, 2, 1) ++ ++#define VCMD_fget_iattr VC_CMD(INODE, 3, 0) ++#define VCMD_fset_iattr VC_CMD(INODE, 4, 0) ++ ++struct vcmd_ctx_iattr_v1 { ++ const char __user *name; ++ uint32_t tag; ++ uint32_t flags; ++ uint32_t mask; ++}; ++ ++struct vcmd_ctx_fiattr_v0 { ++ uint32_t tag; ++ uint32_t flags; ++ uint32_t mask; ++}; ++ ++ ++#ifdef __KERNEL__ ++ ++ ++#ifdef CONFIG_COMPAT ++ ++#include ++ ++struct vcmd_ctx_iattr_v1_x32 { ++ compat_uptr_t name_ptr; ++ uint32_t tag; ++ uint32_t flags; ++ uint32_t mask; ++}; ++ ++#endif /* CONFIG_COMPAT */ ++ ++#include ++ ++extern int vc_get_iattr(void __user *); ++extern int vc_set_iattr(void __user *); ++ ++extern int vc_fget_iattr(uint32_t, void __user *); ++extern int vc_fset_iattr(uint32_t, void __user *); ++ ++#ifdef CONFIG_COMPAT ++ ++extern int vc_get_iattr_x32(void __user *); ++extern int vc_set_iattr_x32(void __user *); ++ ++#endif /* CONFIG_COMPAT */ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_INODE_CMD_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/limit.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/limit.h +--- linux-3.2.34/include/linux/vserver/limit.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/limit.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,71 @@ ++#ifndef _VX_LIMIT_H ++#define _VX_LIMIT_H ++ ++#define VLIMIT_NSOCK 16 ++#define VLIMIT_OPENFD 17 ++#define VLIMIT_ANON 18 ++#define VLIMIT_SHMEM 19 ++#define VLIMIT_SEMARY 20 ++#define VLIMIT_NSEMS 21 ++#define VLIMIT_DENTRY 22 ++#define VLIMIT_MAPPED 23 ++ ++ ++#ifdef __KERNEL__ ++ 
++#define VLIM_NOCHECK ((1L << VLIMIT_DENTRY) | (1L << RLIMIT_RSS)) ++ ++/* keep in sync with CRLIM_INFINITY */ ++ ++#define VLIM_INFINITY (~0ULL) ++ ++#include ++#include ++ ++#ifndef RLIM_INFINITY ++#warning RLIM_INFINITY is undefined ++#endif ++ ++#define __rlim_val(l, r, v) ((l)->res[r].v) ++ ++#define __rlim_soft(l, r) __rlim_val(l, r, soft) ++#define __rlim_hard(l, r) __rlim_val(l, r, hard) ++ ++#define __rlim_rcur(l, r) __rlim_val(l, r, rcur) ++#define __rlim_rmin(l, r) __rlim_val(l, r, rmin) ++#define __rlim_rmax(l, r) __rlim_val(l, r, rmax) ++ ++#define __rlim_lhit(l, r) __rlim_val(l, r, lhit) ++#define __rlim_hit(l, r) atomic_inc(&__rlim_lhit(l, r)) ++ ++typedef atomic_long_t rlim_atomic_t; ++typedef unsigned long rlim_t; ++ ++#define __rlim_get(l, r) atomic_long_read(&__rlim_rcur(l, r)) ++#define __rlim_set(l, r, v) atomic_long_set(&__rlim_rcur(l, r), v) ++#define __rlim_inc(l, r) atomic_long_inc(&__rlim_rcur(l, r)) ++#define __rlim_dec(l, r) atomic_long_dec(&__rlim_rcur(l, r)) ++#define __rlim_add(l, r, v) atomic_long_add(v, &__rlim_rcur(l, r)) ++#define __rlim_sub(l, r, v) atomic_long_sub(v, &__rlim_rcur(l, r)) ++ ++ ++#if (RLIM_INFINITY == VLIM_INFINITY) ++#define VX_VLIM(r) ((long long)(long)(r)) ++#define VX_RLIM(v) ((rlim_t)(v)) ++#else ++#define VX_VLIM(r) (((r) == RLIM_INFINITY) \ ++ ? VLIM_INFINITY : (long long)(r)) ++#define VX_RLIM(v) (((v) == VLIM_INFINITY) \ ++ ? RLIM_INFINITY : (rlim_t)(v)) ++#endif ++ ++struct sysinfo; ++ ++void vx_vsi_meminfo(struct sysinfo *); ++void vx_vsi_swapinfo(struct sysinfo *); ++long vx_vsi_cached(struct sysinfo *); ++ ++#define NUM_LIMITS 24 ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_LIMIT_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/limit_cmd.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/limit_cmd.h +--- linux-3.2.34/include/linux/vserver/limit_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/limit_cmd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,71 @@ ++#ifndef _VX_LIMIT_CMD_H ++#define _VX_LIMIT_CMD_H ++ ++ ++/* rlimit vserver commands */ ++ ++#define VCMD_get_rlimit VC_CMD(RLIMIT, 1, 0) ++#define VCMD_set_rlimit VC_CMD(RLIMIT, 2, 0) ++#define VCMD_get_rlimit_mask VC_CMD(RLIMIT, 3, 0) ++#define VCMD_reset_hits VC_CMD(RLIMIT, 7, 0) ++#define VCMD_reset_minmax VC_CMD(RLIMIT, 9, 0) ++ ++struct vcmd_ctx_rlimit_v0 { ++ uint32_t id; ++ uint64_t minimum; ++ uint64_t softlimit; ++ uint64_t maximum; ++}; ++ ++struct vcmd_ctx_rlimit_mask_v0 { ++ uint32_t minimum; ++ uint32_t softlimit; ++ uint32_t maximum; ++}; ++ ++#define VCMD_rlimit_stat VC_CMD(VSTAT, 1, 0) ++ ++struct vcmd_rlimit_stat_v0 { ++ uint32_t id; ++ uint32_t hits; ++ uint64_t value; ++ uint64_t minimum; ++ uint64_t maximum; ++}; ++ ++#define CRLIM_UNSET (0ULL) ++#define CRLIM_INFINITY (~0ULL) ++#define CRLIM_KEEP (~1ULL) ++ ++#ifdef __KERNEL__ ++ ++#ifdef CONFIG_IA32_EMULATION ++ ++struct vcmd_ctx_rlimit_v0_x32 { ++ uint32_t id; ++ uint64_t minimum; ++ uint64_t softlimit; ++ uint64_t maximum; ++} __attribute__ ((packed)); ++ ++#endif /* CONFIG_IA32_EMULATION */ ++ ++#include ++ ++extern int vc_get_rlimit_mask(uint32_t, void __user *); ++extern int vc_get_rlimit(struct vx_info *, void __user *); ++extern int vc_set_rlimit(struct vx_info *, void __user *); ++extern int vc_reset_hits(struct vx_info *, void __user *); ++extern int vc_reset_minmax(struct vx_info *, void __user *); ++ ++extern int vc_rlimit_stat(struct vx_info *, void __user *); ++ ++#ifdef CONFIG_IA32_EMULATION ++ ++extern int 
vc_get_rlimit_x32(struct vx_info *, void __user *); ++extern int vc_set_rlimit_x32(struct vx_info *, void __user *); ++ ++#endif /* CONFIG_IA32_EMULATION */ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_LIMIT_CMD_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/limit_def.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/limit_def.h +--- linux-3.2.34/include/linux/vserver/limit_def.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/limit_def.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,47 @@ ++#ifndef _VX_LIMIT_DEF_H ++#define _VX_LIMIT_DEF_H ++ ++#include ++#include ++ ++#include "limit.h" ++ ++ ++struct _vx_res_limit { ++ rlim_t soft; /* Context soft limit */ ++ rlim_t hard; /* Context hard limit */ ++ ++ rlim_atomic_t rcur; /* Current value */ ++ rlim_t rmin; /* Context minimum */ ++ rlim_t rmax; /* Context maximum */ ++ ++ atomic_t lhit; /* Limit hits */ ++}; ++ ++/* context sub struct */ ++ ++struct _vx_limit { ++ struct _vx_res_limit res[NUM_LIMITS]; ++}; ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ ++static inline void __dump_vx_limit(struct _vx_limit *limit) ++{ ++ int i; ++ ++ printk("\t_vx_limit:"); ++ for (i = 0; i < NUM_LIMITS; i++) { ++ printk("\t [%2d] = %8lu %8lu/%8lu, %8ld/%8ld, %8d\n", ++ i, (unsigned long)__rlim_get(limit, i), ++ (unsigned long)__rlim_rmin(limit, i), ++ (unsigned long)__rlim_rmax(limit, i), ++ (long)__rlim_soft(limit, i), ++ (long)__rlim_hard(limit, i), ++ atomic_read(&__rlim_lhit(limit, i))); ++ } ++} ++ ++#endif ++ ++#endif /* _VX_LIMIT_DEF_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/limit_int.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/limit_int.h +--- linux-3.2.34/include/linux/vserver/limit_int.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/limit_int.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,198 @@ ++#ifndef _VX_LIMIT_INT_H ++#define _VX_LIMIT_INT_H ++ ++#include "context.h" ++ ++#ifdef __KERNEL__ ++ ++#define VXD_RCRES_COND(r) VXD_CBIT(cres, r) ++#define VXD_RLIMIT_COND(r) VXD_CBIT(limit, r) ++ ++extern const char *vlimit_name[NUM_LIMITS]; ++ ++static inline void __vx_acc_cres(struct vx_info *vxi, ++ int res, int dir, void *_data, char *_file, int _line) ++{ ++ if (VXD_RCRES_COND(res)) ++ vxlprintk(1, "vx_acc_cres[%5d,%s,%2d]: %5ld%s (%p)", ++ (vxi ? vxi->vx_id : -1), vlimit_name[res], res, ++ (vxi ? (long)__rlim_get(&vxi->limit, res) : 0), ++ (dir > 0) ? "++" : "--", _data, _file, _line); ++ if (!vxi) ++ return; ++ ++ if (dir > 0) ++ __rlim_inc(&vxi->limit, res); ++ else ++ __rlim_dec(&vxi->limit, res); ++} ++ ++static inline void __vx_add_cres(struct vx_info *vxi, ++ int res, int amount, void *_data, char *_file, int _line) ++{ ++ if (VXD_RCRES_COND(res)) ++ vxlprintk(1, "vx_add_cres[%5d,%s,%2d]: %5ld += %5d (%p)", ++ (vxi ? vxi->vx_id : -1), vlimit_name[res], res, ++ (vxi ? 
(long)__rlim_get(&vxi->limit, res) : 0), ++ amount, _data, _file, _line); ++ if (amount == 0) ++ return; ++ if (!vxi) ++ return; ++ __rlim_add(&vxi->limit, res, amount); ++} ++ ++static inline ++int __vx_cres_adjust_max(struct _vx_limit *limit, int res, rlim_t value) ++{ ++ int cond = (value > __rlim_rmax(limit, res)); ++ ++ if (cond) ++ __rlim_rmax(limit, res) = value; ++ return cond; ++} ++ ++static inline ++int __vx_cres_adjust_min(struct _vx_limit *limit, int res, rlim_t value) ++{ ++ int cond = (value < __rlim_rmin(limit, res)); ++ ++ if (cond) ++ __rlim_rmin(limit, res) = value; ++ return cond; ++} ++ ++static inline ++void __vx_cres_fixup(struct _vx_limit *limit, int res, rlim_t value) ++{ ++ if (!__vx_cres_adjust_max(limit, res, value)) ++ __vx_cres_adjust_min(limit, res, value); ++} ++ ++ ++/* return values: ++ +1 ... no limit hit ++ -1 ... over soft limit ++ 0 ... over hard limit */ ++ ++static inline int __vx_cres_avail(struct vx_info *vxi, ++ int res, int num, char *_file, int _line) ++{ ++ struct _vx_limit *limit; ++ rlim_t value; ++ ++ if (VXD_RLIMIT_COND(res)) ++ vxlprintk(1, "vx_cres_avail[%5d,%s,%2d]: %5ld/%5ld > %5ld + %5d", ++ (vxi ? vxi->vx_id : -1), vlimit_name[res], res, ++ (vxi ? (long)__rlim_soft(&vxi->limit, res) : -1), ++ (vxi ? (long)__rlim_hard(&vxi->limit, res) : -1), ++ (vxi ? (long)__rlim_get(&vxi->limit, res) : 0), ++ num, _file, _line); ++ if (!vxi) ++ return 1; ++ ++ limit = &vxi->limit; ++ value = __rlim_get(limit, res); ++ ++ if (!__vx_cres_adjust_max(limit, res, value)) ++ __vx_cres_adjust_min(limit, res, value); ++ ++ if (num == 0) ++ return 1; ++ ++ if (__rlim_soft(limit, res) == RLIM_INFINITY) ++ return -1; ++ if (value + num <= __rlim_soft(limit, res)) ++ return -1; ++ ++ if (__rlim_hard(limit, res) == RLIM_INFINITY) ++ return 1; ++ if (value + num <= __rlim_hard(limit, res)) ++ return 1; ++ ++ __rlim_hit(limit, res); ++ return 0; ++} ++ ++ ++static const int VLA_RSS[] = { RLIMIT_RSS, VLIMIT_ANON, VLIMIT_MAPPED, 0 }; ++ ++static inline ++rlim_t __vx_cres_array_sum(struct _vx_limit *limit, const int *array) ++{ ++ rlim_t value, sum = 0; ++ int res; ++ ++ while ((res = *array++)) { ++ value = __rlim_get(limit, res); ++ __vx_cres_fixup(limit, res, value); ++ sum += value; ++ } ++ return sum; ++} ++ ++static inline ++rlim_t __vx_cres_array_fixup(struct _vx_limit *limit, const int *array) ++{ ++ rlim_t value = __vx_cres_array_sum(limit, array + 1); ++ int res = *array; ++ ++ if (value == __rlim_get(limit, res)) ++ return value; ++ ++ __rlim_set(limit, res, value); ++ /* now adjust min/max */ ++ if (!__vx_cres_adjust_max(limit, res, value)) ++ __vx_cres_adjust_min(limit, res, value); ++ ++ return value; ++} ++ ++static inline int __vx_cres_array_avail(struct vx_info *vxi, ++ const int *array, int num, char *_file, int _line) ++{ ++ struct _vx_limit *limit; ++ rlim_t value = 0; ++ int res; ++ ++ if (num == 0) ++ return 1; ++ if (!vxi) ++ return 1; ++ ++ limit = &vxi->limit; ++ res = *array; ++ value = __vx_cres_array_sum(limit, array + 1); ++ ++ __rlim_set(limit, res, value); ++ __vx_cres_fixup(limit, res, value); ++ ++ return __vx_cres_avail(vxi, res, num, _file, _line); ++} ++ ++ ++static inline void vx_limit_fixup(struct _vx_limit *limit, int id) ++{ ++ rlim_t value; ++ int res; ++ ++ /* complex resources first */ ++ if ((id < 0) || (id == RLIMIT_RSS)) ++ __vx_cres_array_fixup(limit, VLA_RSS); ++ ++ for (res = 0; res < NUM_LIMITS; res++) { ++ if ((id > 0) && (res != id)) ++ continue; ++ ++ value = __rlim_get(limit, res); ++ __vx_cres_fixup(limit, res, 
value); ++ ++ /* not supposed to happen, maybe warn? */ ++ if (__rlim_rmax(limit, res) > __rlim_hard(limit, res)) ++ __rlim_rmax(limit, res) = __rlim_hard(limit, res); ++ } ++} ++ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_LIMIT_INT_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/monitor.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/monitor.h +--- linux-3.2.34/include/linux/vserver/monitor.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/monitor.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,96 @@ ++#ifndef _VX_MONITOR_H ++#define _VX_MONITOR_H ++ ++#include ++ ++enum { ++ VXM_UNUSED = 0, ++ ++ VXM_SYNC = 0x10, ++ ++ VXM_UPDATE = 0x20, ++ VXM_UPDATE_1, ++ VXM_UPDATE_2, ++ ++ VXM_RQINFO_1 = 0x24, ++ VXM_RQINFO_2, ++ ++ VXM_ACTIVATE = 0x40, ++ VXM_DEACTIVATE, ++ VXM_IDLE, ++ ++ VXM_HOLD = 0x44, ++ VXM_UNHOLD, ++ ++ VXM_MIGRATE = 0x48, ++ VXM_RESCHED, ++ ++ /* all other bits are flags */ ++ VXM_SCHED = 0x80, ++}; ++ ++struct _vxm_update_1 { ++ uint32_t tokens_max; ++ uint32_t fill_rate; ++ uint32_t interval; ++}; ++ ++struct _vxm_update_2 { ++ uint32_t tokens_min; ++ uint32_t fill_rate; ++ uint32_t interval; ++}; ++ ++struct _vxm_rqinfo_1 { ++ uint16_t running; ++ uint16_t onhold; ++ uint16_t iowait; ++ uint16_t uintr; ++ uint32_t idle_tokens; ++}; ++ ++struct _vxm_rqinfo_2 { ++ uint32_t norm_time; ++ uint32_t idle_time; ++ uint32_t idle_skip; ++}; ++ ++struct _vxm_sched { ++ uint32_t tokens; ++ uint32_t norm_time; ++ uint32_t idle_time; ++}; ++ ++struct _vxm_task { ++ uint16_t pid; ++ uint16_t state; ++}; ++ ++struct _vxm_event { ++ uint32_t jif; ++ union { ++ uint32_t seq; ++ uint32_t sec; ++ }; ++ union { ++ uint32_t tokens; ++ uint32_t nsec; ++ struct _vxm_task tsk; ++ }; ++}; ++ ++struct _vx_mon_entry { ++ uint16_t type; ++ uint16_t xid; ++ union { ++ struct _vxm_event ev; ++ struct _vxm_sched sd; ++ struct _vxm_update_1 u1; ++ struct _vxm_update_2 u2; ++ struct _vxm_rqinfo_1 q1; ++ struct _vxm_rqinfo_2 q2; ++ }; ++}; ++ ++ ++#endif /* _VX_MONITOR_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/network.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/network.h +--- linux-3.2.34/include/linux/vserver/network.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/network.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,148 @@ ++#ifndef _VX_NETWORK_H ++#define _VX_NETWORK_H ++ ++#include ++ ++ ++#define MAX_N_CONTEXT 65535 /* Arbitrary limit */ ++ ++ ++/* network flags */ ++ ++#define NXF_INFO_PRIVATE 0x00000008 ++ ++#define NXF_SINGLE_IP 0x00000100 ++#define NXF_LBACK_REMAP 0x00000200 ++#define NXF_LBACK_ALLOW 0x00000400 ++ ++#define NXF_HIDE_NETIF 0x02000000 ++#define NXF_HIDE_LBACK 0x04000000 ++ ++#define NXF_STATE_SETUP (1ULL << 32) ++#define NXF_STATE_ADMIN (1ULL << 34) ++ ++#define NXF_SC_HELPER (1ULL << 36) ++#define NXF_PERSISTENT (1ULL << 38) ++ ++#define NXF_ONE_TIME (0x0005ULL << 32) ++ ++ ++#define NXF_INIT_SET (__nxf_init_set()) ++ ++static inline uint64_t __nxf_init_set(void) { ++ return NXF_STATE_ADMIN ++#ifdef CONFIG_VSERVER_AUTO_LBACK ++ | NXF_LBACK_REMAP ++ | NXF_HIDE_LBACK ++#endif ++#ifdef CONFIG_VSERVER_AUTO_SINGLE ++ | NXF_SINGLE_IP ++#endif ++ | NXF_HIDE_NETIF; ++} ++ ++ ++/* network caps */ ++ ++#define NXC_TUN_CREATE 0x00000001 ++ ++#define NXC_RAW_ICMP 0x00000100 ++ ++#define NXC_MULTICAST 0x00001000 ++ ++ ++/* address types */ ++ ++#define NXA_TYPE_IPV4 0x0001 ++#define NXA_TYPE_IPV6 0x0002 ++ ++#define NXA_TYPE_NONE 0x0000 ++#define NXA_TYPE_ANY 0x00FF 
++ ++#define NXA_TYPE_ADDR 0x0010 ++#define NXA_TYPE_MASK 0x0020 ++#define NXA_TYPE_RANGE 0x0040 ++ ++#define NXA_MASK_ALL (NXA_TYPE_ADDR | NXA_TYPE_MASK | NXA_TYPE_RANGE) ++ ++#define NXA_MOD_BCAST 0x0100 ++#define NXA_MOD_LBACK 0x0200 ++ ++#define NXA_LOOPBACK 0x1000 ++ ++#define NXA_MASK_BIND (NXA_MASK_ALL | NXA_MOD_BCAST | NXA_MOD_LBACK) ++#define NXA_MASK_SHOW (NXA_MASK_ALL | NXA_LOOPBACK) ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct nx_addr_v4 { ++ struct nx_addr_v4 *next; ++ struct in_addr ip[2]; ++ struct in_addr mask; ++ uint16_t type; ++ uint16_t flags; ++}; ++ ++struct nx_addr_v6 { ++ struct nx_addr_v6 *next; ++ struct in6_addr ip; ++ struct in6_addr mask; ++ uint32_t prefix; ++ uint16_t type; ++ uint16_t flags; ++}; ++ ++struct nx_info { ++ struct hlist_node nx_hlist; /* linked list of nxinfos */ ++ nid_t nx_id; /* vnet id */ ++ atomic_t nx_usecnt; /* usage count */ ++ atomic_t nx_tasks; /* tasks count */ ++ int nx_state; /* context state */ ++ ++ uint64_t nx_flags; /* network flag word */ ++ uint64_t nx_ncaps; /* network capabilities */ ++ ++ struct in_addr v4_lback; /* Loopback address */ ++ struct in_addr v4_bcast; /* Broadcast address */ ++ struct nx_addr_v4 v4; /* First/Single ipv4 address */ ++#ifdef CONFIG_IPV6 ++ struct nx_addr_v6 v6; /* First/Single ipv6 address */ ++#endif ++ char nx_name[65]; /* network context name */ ++}; ++ ++ ++/* status flags */ ++ ++#define NXS_HASHED 0x0001 ++#define NXS_SHUTDOWN 0x0100 ++#define NXS_RELEASED 0x8000 ++ ++extern struct nx_info *lookup_nx_info(int); ++ ++extern int get_nid_list(int, unsigned int *, int); ++extern int nid_is_hashed(nid_t); ++ ++extern int nx_migrate_task(struct task_struct *, struct nx_info *); ++ ++extern long vs_net_change(struct nx_info *, unsigned int); ++ ++struct sock; ++ ++ ++#define NX_IPV4(n) ((n)->v4.type != NXA_TYPE_NONE) ++#ifdef CONFIG_IPV6 ++#define NX_IPV6(n) ((n)->v6.type != NXA_TYPE_NONE) ++#else ++#define NX_IPV6(n) (0) ++#endif ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_NETWORK_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/network_cmd.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/network_cmd.h +--- linux-3.2.34/include/linux/vserver/network_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/network_cmd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,164 @@ ++#ifndef _VX_NETWORK_CMD_H ++#define _VX_NETWORK_CMD_H ++ ++ ++/* vinfo commands */ ++ ++#define VCMD_task_nid VC_CMD(VINFO, 2, 0) ++ ++#ifdef __KERNEL__ ++extern int vc_task_nid(uint32_t); ++ ++#endif /* __KERNEL__ */ ++ ++#define VCMD_nx_info VC_CMD(VINFO, 6, 0) ++ ++struct vcmd_nx_info_v0 { ++ uint32_t nid; ++ /* more to come */ ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_nx_info(struct nx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++#include ++#include ++ ++#define VCMD_net_create_v0 VC_CMD(VNET, 1, 0) ++#define VCMD_net_create VC_CMD(VNET, 1, 1) ++ ++struct vcmd_net_create { ++ uint64_t flagword; ++}; ++ ++#define VCMD_net_migrate VC_CMD(NETMIG, 1, 0) ++ ++#define VCMD_net_add VC_CMD(NETALT, 1, 0) ++#define VCMD_net_remove VC_CMD(NETALT, 2, 0) ++ ++struct vcmd_net_addr_v0 { ++ uint16_t type; ++ uint16_t count; ++ struct in_addr ip[4]; ++ struct in_addr mask[4]; ++}; ++ ++#define VCMD_net_add_ipv4_v1 VC_CMD(NETALT, 1, 1) ++#define VCMD_net_rem_ipv4_v1 VC_CMD(NETALT, 2, 1) ++ ++struct vcmd_net_addr_ipv4_v1 { ++ uint16_t type; ++ uint16_t flags; ++ struct in_addr ip; ++ struct in_addr mask; ++}; ++ 
++#define VCMD_net_add_ipv4 VC_CMD(NETALT, 1, 2) ++#define VCMD_net_rem_ipv4 VC_CMD(NETALT, 2, 2) ++ ++struct vcmd_net_addr_ipv4_v2 { ++ uint16_t type; ++ uint16_t flags; ++ struct in_addr ip; ++ struct in_addr ip2; ++ struct in_addr mask; ++}; ++ ++#define VCMD_net_add_ipv6 VC_CMD(NETALT, 3, 1) ++#define VCMD_net_remove_ipv6 VC_CMD(NETALT, 4, 1) ++ ++struct vcmd_net_addr_ipv6_v1 { ++ uint16_t type; ++ uint16_t flags; ++ uint32_t prefix; ++ struct in6_addr ip; ++ struct in6_addr mask; ++}; ++ ++#define VCMD_add_match_ipv4 VC_CMD(NETALT, 5, 0) ++#define VCMD_get_match_ipv4 VC_CMD(NETALT, 6, 0) ++ ++struct vcmd_match_ipv4_v0 { ++ uint16_t type; ++ uint16_t flags; ++ uint16_t parent; ++ uint16_t prefix; ++ struct in_addr ip; ++ struct in_addr ip2; ++ struct in_addr mask; ++}; ++ ++#define VCMD_add_match_ipv6 VC_CMD(NETALT, 7, 0) ++#define VCMD_get_match_ipv6 VC_CMD(NETALT, 8, 0) ++ ++struct vcmd_match_ipv6_v0 { ++ uint16_t type; ++ uint16_t flags; ++ uint16_t parent; ++ uint16_t prefix; ++ struct in6_addr ip; ++ struct in6_addr ip2; ++ struct in6_addr mask; ++}; ++ ++ ++#ifdef __KERNEL__ ++extern int vc_net_create(uint32_t, void __user *); ++extern int vc_net_migrate(struct nx_info *, void __user *); ++ ++extern int vc_net_add(struct nx_info *, void __user *); ++extern int vc_net_remove(struct nx_info *, void __user *); ++ ++extern int vc_net_add_ipv4_v1(struct nx_info *, void __user *); ++extern int vc_net_add_ipv4(struct nx_info *, void __user *); ++ ++extern int vc_net_rem_ipv4_v1(struct nx_info *, void __user *); ++extern int vc_net_rem_ipv4(struct nx_info *, void __user *); ++ ++extern int vc_net_add_ipv6(struct nx_info *, void __user *); ++extern int vc_net_remove_ipv6(struct nx_info *, void __user *); ++ ++extern int vc_add_match_ipv4(struct nx_info *, void __user *); ++extern int vc_get_match_ipv4(struct nx_info *, void __user *); ++ ++extern int vc_add_match_ipv6(struct nx_info *, void __user *); ++extern int vc_get_match_ipv6(struct nx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* flag commands */ ++ ++#define VCMD_get_nflags VC_CMD(FLAGS, 5, 0) ++#define VCMD_set_nflags VC_CMD(FLAGS, 6, 0) ++ ++struct vcmd_net_flags_v0 { ++ uint64_t flagword; ++ uint64_t mask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_nflags(struct nx_info *, void __user *); ++extern int vc_set_nflags(struct nx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* network caps commands */ ++ ++#define VCMD_get_ncaps VC_CMD(FLAGS, 7, 0) ++#define VCMD_set_ncaps VC_CMD(FLAGS, 8, 0) ++ ++struct vcmd_net_caps_v0 { ++ uint64_t ncaps; ++ uint64_t cmask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_ncaps(struct nx_info *, void __user *); ++extern int vc_set_ncaps(struct nx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CONTEXT_CMD_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/percpu.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/percpu.h +--- linux-3.2.34/include/linux/vserver/percpu.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/percpu.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,14 @@ ++#ifndef _VX_PERCPU_H ++#define _VX_PERCPU_H ++ ++#include "cvirt_def.h" ++#include "sched_def.h" ++ ++struct _vx_percpu { ++ struct _vx_cvirt_pc cvirt; ++ struct _vx_sched_pc sched; ++}; ++ ++#define PERCPU_PERCTX (sizeof(struct _vx_percpu)) ++ ++#endif /* _VX_PERCPU_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/pid.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/pid.h +--- 
linux-3.2.34/include/linux/vserver/pid.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/pid.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,51 @@ ++#ifndef _VSERVER_PID_H ++#define _VSERVER_PID_H ++ ++/* pid faking stuff */ ++ ++#define vx_info_map_pid(v, p) \ ++ __vx_info_map_pid((v), (p), __func__, __FILE__, __LINE__) ++#define vx_info_map_tgid(v,p) vx_info_map_pid(v,p) ++#define vx_map_pid(p) vx_info_map_pid(current_vx_info(), p) ++#define vx_map_tgid(p) vx_map_pid(p) ++ ++static inline int __vx_info_map_pid(struct vx_info *vxi, int pid, ++ const char *func, const char *file, int line) ++{ ++ if (vx_info_flags(vxi, VXF_INFO_INIT, 0)) { ++ vxfprintk(VXD_CBIT(cvirt, 2), ++ "vx_map_tgid: %p/%llx: %d -> %d", ++ vxi, (long long)vxi->vx_flags, pid, ++ (pid && pid == vxi->vx_initpid) ? 1 : pid, ++ func, file, line); ++ if (pid == 0) ++ return 0; ++ if (pid == vxi->vx_initpid) ++ return 1; ++ } ++ return pid; ++} ++ ++#define vx_info_rmap_pid(v, p) \ ++ __vx_info_rmap_pid((v), (p), __func__, __FILE__, __LINE__) ++#define vx_rmap_pid(p) vx_info_rmap_pid(current_vx_info(), p) ++#define vx_rmap_tgid(p) vx_rmap_pid(p) ++ ++static inline int __vx_info_rmap_pid(struct vx_info *vxi, int pid, ++ const char *func, const char *file, int line) ++{ ++ if (vx_info_flags(vxi, VXF_INFO_INIT, 0)) { ++ vxfprintk(VXD_CBIT(cvirt, 2), ++ "vx_rmap_tgid: %p/%llx: %d -> %d", ++ vxi, (long long)vxi->vx_flags, pid, ++ (pid == 1) ? vxi->vx_initpid : pid, ++ func, file, line); ++ if ((pid == 1) && vxi->vx_initpid) ++ return vxi->vx_initpid; ++ if (pid == vxi->vx_initpid) ++ return ~0U; ++ } ++ return pid; ++} ++ ++#endif +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/sched.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/sched.h +--- linux-3.2.34/include/linux/vserver/sched.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/sched.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,23 @@ ++#ifndef _VX_SCHED_H ++#define _VX_SCHED_H ++ ++ ++#ifdef __KERNEL__ ++ ++struct timespec; ++ ++void vx_vsi_uptime(struct timespec *, struct timespec *); ++ ++ ++struct vx_info; ++ ++void vx_update_load(struct vx_info *); ++ ++ ++void vx_update_sched_param(struct _vx_sched *sched, ++ struct _vx_sched_pc *sched_pc); ++ ++#endif /* __KERNEL__ */ ++#else /* _VX_SCHED_H */ ++#warning duplicate inclusion ++#endif /* _VX_SCHED_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/sched_cmd.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/sched_cmd.h +--- linux-3.2.34/include/linux/vserver/sched_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/sched_cmd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,21 @@ ++#ifndef _VX_SCHED_CMD_H ++#define _VX_SCHED_CMD_H ++ ++ ++struct vcmd_prio_bias { ++ int32_t cpu_id; ++ int32_t prio_bias; ++}; ++ ++#define VCMD_set_prio_bias VC_CMD(SCHED, 4, 0) ++#define VCMD_get_prio_bias VC_CMD(SCHED, 5, 0) ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++extern int vc_set_prio_bias(struct vx_info *, void __user *); ++extern int vc_get_prio_bias(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_SCHED_CMD_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/sched_def.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/sched_def.h +--- linux-3.2.34/include/linux/vserver/sched_def.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/sched_def.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,38 @@ ++#ifndef 
_VX_SCHED_DEF_H ++#define _VX_SCHED_DEF_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++ ++/* context sub struct */ ++ ++struct _vx_sched { ++ int prio_bias; /* bias offset for priority */ ++ ++ cpumask_t update; /* CPUs which should update */ ++}; ++ ++struct _vx_sched_pc { ++ int prio_bias; /* bias offset for priority */ ++ ++ uint64_t user_ticks; /* token tick events */ ++ uint64_t sys_ticks; /* token tick events */ ++ uint64_t hold_ticks; /* token ticks paused */ ++}; ++ ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ ++static inline void __dump_vx_sched(struct _vx_sched *sched) ++{ ++ printk("\t_vx_sched:\n"); ++ printk("\t priority = %4d\n", sched->prio_bias); ++} ++ ++#endif ++ ++#endif /* _VX_SCHED_DEF_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/signal.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/signal.h +--- linux-3.2.34/include/linux/vserver/signal.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/signal.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,14 @@ ++#ifndef _VX_SIGNAL_H ++#define _VX_SIGNAL_H ++ ++ ++#ifdef __KERNEL__ ++ ++struct vx_info; ++ ++int vx_info_kill(struct vx_info *, int, int); ++ ++#endif /* __KERNEL__ */ ++#else /* _VX_SIGNAL_H */ ++#warning duplicate inclusion ++#endif /* _VX_SIGNAL_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/signal_cmd.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/signal_cmd.h +--- linux-3.2.34/include/linux/vserver/signal_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/signal_cmd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,43 @@ ++#ifndef _VX_SIGNAL_CMD_H ++#define _VX_SIGNAL_CMD_H ++ ++ ++/* signalling vserver commands */ ++ ++#define VCMD_ctx_kill VC_CMD(PROCTRL, 1, 0) ++#define VCMD_wait_exit VC_CMD(EVENT, 99, 0) ++ ++struct vcmd_ctx_kill_v0 { ++ int32_t pid; ++ int32_t sig; ++}; ++ ++struct vcmd_wait_exit_v0 { ++ int32_t reboot_cmd; ++ int32_t exit_code; ++}; ++ ++#ifdef __KERNEL__ ++ ++extern int vc_ctx_kill(struct vx_info *, void __user *); ++extern int vc_wait_exit(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++/* process alteration commands */ ++ ++#define VCMD_get_pflags VC_CMD(PROCALT, 5, 0) ++#define VCMD_set_pflags VC_CMD(PROCALT, 6, 0) ++ ++struct vcmd_pflags_v0 { ++ uint32_t flagword; ++ uint32_t mask; ++}; ++ ++#ifdef __KERNEL__ ++ ++extern int vc_get_pflags(uint32_t pid, void __user *); ++extern int vc_set_pflags(uint32_t pid, void __user *); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_SIGNAL_CMD_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/space.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/space.h +--- linux-3.2.34/include/linux/vserver/space.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/space.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,12 @@ ++#ifndef _VX_SPACE_H ++#define _VX_SPACE_H ++ ++#include ++ ++struct vx_info; ++ ++int vx_set_space(struct vx_info *vxi, unsigned long mask, unsigned index); ++ ++#else /* _VX_SPACE_H */ ++#warning duplicate inclusion ++#endif /* _VX_SPACE_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/space_cmd.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/space_cmd.h +--- linux-3.2.34/include/linux/vserver/space_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/space_cmd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,38 @@ ++#ifndef _VX_SPACE_CMD_H ++#define _VX_SPACE_CMD_H ++ ++ ++#define 
VCMD_enter_space_v0 VC_CMD(PROCALT, 1, 0) ++#define VCMD_enter_space_v1 VC_CMD(PROCALT, 1, 1) ++#define VCMD_enter_space VC_CMD(PROCALT, 1, 2) ++ ++#define VCMD_set_space_v0 VC_CMD(PROCALT, 3, 0) ++#define VCMD_set_space_v1 VC_CMD(PROCALT, 3, 1) ++#define VCMD_set_space VC_CMD(PROCALT, 3, 2) ++ ++#define VCMD_get_space_mask_v0 VC_CMD(PROCALT, 4, 0) ++ ++#define VCMD_get_space_mask VC_CMD(VSPACE, 0, 1) ++#define VCMD_get_space_default VC_CMD(VSPACE, 1, 0) ++ ++ ++struct vcmd_space_mask_v1 { ++ uint64_t mask; ++}; ++ ++struct vcmd_space_mask_v2 { ++ uint64_t mask; ++ uint32_t index; ++}; ++ ++ ++#ifdef __KERNEL__ ++ ++extern int vc_enter_space_v1(struct vx_info *, void __user *); ++extern int vc_set_space_v1(struct vx_info *, void __user *); ++extern int vc_enter_space(struct vx_info *, void __user *); ++extern int vc_set_space(struct vx_info *, void __user *); ++extern int vc_get_space_mask(void __user *, int); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_SPACE_CMD_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/switch.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/switch.h +--- linux-3.2.34/include/linux/vserver/switch.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/switch.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,98 @@ ++#ifndef _VX_SWITCH_H ++#define _VX_SWITCH_H ++ ++#include ++ ++ ++#define VC_CATEGORY(c) (((c) >> 24) & 0x3F) ++#define VC_COMMAND(c) (((c) >> 16) & 0xFF) ++#define VC_VERSION(c) ((c) & 0xFFF) ++ ++#define VC_CMD(c, i, v) ((((VC_CAT_ ## c) & 0x3F) << 24) \ ++ | (((i) & 0xFF) << 16) | ((v) & 0xFFF)) ++ ++/* ++ ++ Syscall Matrix V2.8 ++ ++ |VERSION|CREATE |MODIFY |MIGRATE|CONTROL|EXPERIM| |SPECIAL|SPECIAL| ++ |STATS |DESTROY|ALTER |CHANGE |LIMIT |TEST | | | | ++ |INFO |SETUP | |MOVE | | | | | | ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ SYSTEM |VERSION|VSETUP |VHOST | | | | |DEVICE | | ++ HOST | 00| 01| 02| 03| 04| 05| | 06| 07| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ CPU | |VPROC |PROCALT|PROCMIG|PROCTRL| | |SCHED. 
| | ++ PROCESS| 08| 09| 10| 11| 12| 13| | 14| 15| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ MEMORY | | | | |MEMCTRL| | |SWAP | | ++ | 16| 17| 18| 19| 20| 21| | 22| 23| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ NETWORK| |VNET |NETALT |NETMIG |NETCTL | | |SERIAL | | ++ | 24| 25| 26| 27| 28| 29| | 30| 31| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ DISK | | | |TAGMIG |DLIMIT | | |INODE | | ++ VFS | 32| 33| 34| 35| 36| 37| | 38| 39| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ OTHER |VSTAT | | | | | | |VINFO | | ++ | 40| 41| 42| 43| 44| 45| | 46| 47| ++ =======+=======+=======+=======+=======+=======+=======+ +=======+=======+ ++ SPECIAL|EVENT | | | |FLAGS | | |VSPACE | | ++ | 48| 49| 50| 51| 52| 53| | 54| 55| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ SPECIAL|DEBUG | | | |RLIMIT |SYSCALL| | |COMPAT | ++ | 56| 57| 58| 59| 60|TEST 61| | 62| 63| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ ++*/ ++ ++#define VC_CAT_VERSION 0 ++ ++#define VC_CAT_VSETUP 1 ++#define VC_CAT_VHOST 2 ++ ++#define VC_CAT_DEVICE 6 ++ ++#define VC_CAT_VPROC 9 ++#define VC_CAT_PROCALT 10 ++#define VC_CAT_PROCMIG 11 ++#define VC_CAT_PROCTRL 12 ++ ++#define VC_CAT_SCHED 14 ++#define VC_CAT_MEMCTRL 20 ++ ++#define VC_CAT_VNET 25 ++#define VC_CAT_NETALT 26 ++#define VC_CAT_NETMIG 27 ++#define VC_CAT_NETCTRL 28 ++ ++#define VC_CAT_TAGMIG 35 ++#define VC_CAT_DLIMIT 36 ++#define VC_CAT_INODE 38 ++ ++#define VC_CAT_VSTAT 40 ++#define VC_CAT_VINFO 46 ++#define VC_CAT_EVENT 48 ++ ++#define VC_CAT_FLAGS 52 ++#define VC_CAT_VSPACE 54 ++#define VC_CAT_DEBUG 56 ++#define VC_CAT_RLIMIT 60 ++ ++#define VC_CAT_SYSTEST 61 ++#define VC_CAT_COMPAT 63 ++ ++/* query version */ ++ ++#define VCMD_get_version VC_CMD(VERSION, 0, 0) ++#define VCMD_get_vci VC_CMD(VERSION, 1, 0) ++ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++#endif /* __KERNEL__ */ ++ ++#endif /* _VX_SWITCH_H */ ++ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/tag.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/tag.h +--- linux-3.2.34/include/linux/vserver/tag.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/tag.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,143 @@ ++#ifndef _DX_TAG_H ++#define _DX_TAG_H ++ ++#include ++ ++ ++#define DX_TAG(in) (IS_TAGGED(in)) ++ ++ ++#ifdef CONFIG_TAG_NFSD ++#define DX_TAG_NFSD 1 ++#else ++#define DX_TAG_NFSD 0 ++#endif ++ ++ ++#ifdef CONFIG_TAGGING_NONE ++ ++#define MAX_UID 0xFFFFFFFF ++#define MAX_GID 0xFFFFFFFF ++ ++#define INOTAG_TAG(cond, uid, gid, tag) (0) ++ ++#define TAGINO_UID(cond, uid, tag) (uid) ++#define TAGINO_GID(cond, gid, tag) (gid) ++ ++#endif ++ ++ ++#ifdef CONFIG_TAGGING_GID16 ++ ++#define MAX_UID 0xFFFFFFFF ++#define MAX_GID 0x0000FFFF ++ ++#define INOTAG_TAG(cond, uid, gid, tag) \ ++ ((cond) ? (((gid) >> 16) & 0xFFFF) : 0) ++ ++#define TAGINO_UID(cond, uid, tag) (uid) ++#define TAGINO_GID(cond, gid, tag) \ ++ ((cond) ? (((gid) & 0xFFFF) | ((tag) << 16)) : (gid)) ++ ++#endif ++ ++ ++#ifdef CONFIG_TAGGING_ID24 ++ ++#define MAX_UID 0x00FFFFFF ++#define MAX_GID 0x00FFFFFF ++ ++#define INOTAG_TAG(cond, uid, gid, tag) \ ++ ((cond) ? ((((uid) >> 16) & 0xFF00) | (((gid) >> 24) & 0xFF)) : 0) ++ ++#define TAGINO_UID(cond, uid, tag) \ ++ ((cond) ? (((uid) & 0xFFFFFF) | (((tag) & 0xFF00) << 16)) : (uid)) ++#define TAGINO_GID(cond, gid, tag) \ ++ ((cond) ? 
(((gid) & 0xFFFFFF) | (((tag) & 0x00FF) << 24)) : (gid)) ++ ++#endif ++ ++ ++#ifdef CONFIG_TAGGING_UID16 ++ ++#define MAX_UID 0x0000FFFF ++#define MAX_GID 0xFFFFFFFF ++ ++#define INOTAG_TAG(cond, uid, gid, tag) \ ++ ((cond) ? (((uid) >> 16) & 0xFFFF) : 0) ++ ++#define TAGINO_UID(cond, uid, tag) \ ++ ((cond) ? (((uid) & 0xFFFF) | ((tag) << 16)) : (uid)) ++#define TAGINO_GID(cond, gid, tag) (gid) ++ ++#endif ++ ++ ++#ifdef CONFIG_TAGGING_INTERN ++ ++#define MAX_UID 0xFFFFFFFF ++#define MAX_GID 0xFFFFFFFF ++ ++#define INOTAG_TAG(cond, uid, gid, tag) \ ++ ((cond) ? (tag) : 0) ++ ++#define TAGINO_UID(cond, uid, tag) (uid) ++#define TAGINO_GID(cond, gid, tag) (gid) ++ ++#endif ++ ++ ++#ifndef CONFIG_TAGGING_NONE ++#define dx_current_fstag(sb) \ ++ ((sb)->s_flags & MS_TAGGED ? dx_current_tag() : 0) ++#else ++#define dx_current_fstag(sb) (0) ++#endif ++ ++#ifndef CONFIG_TAGGING_INTERN ++#define TAGINO_TAG(cond, tag) (0) ++#else ++#define TAGINO_TAG(cond, tag) ((cond) ? (tag) : 0) ++#endif ++ ++#define INOTAG_UID(cond, uid, gid) \ ++ ((cond) ? ((uid) & MAX_UID) : (uid)) ++#define INOTAG_GID(cond, uid, gid) \ ++ ((cond) ? ((gid) & MAX_GID) : (gid)) ++ ++ ++static inline uid_t dx_map_uid(uid_t uid) ++{ ++ if ((uid > MAX_UID) && (uid != -1)) ++ uid = -2; ++ return (uid & MAX_UID); ++} ++ ++static inline gid_t dx_map_gid(gid_t gid) ++{ ++ if ((gid > MAX_GID) && (gid != -1)) ++ gid = -2; ++ return (gid & MAX_GID); ++} ++ ++struct peer_tag { ++ int32_t xid; ++ int32_t nid; ++}; ++ ++#define dx_notagcheck(sb) ((sb) && ((sb)->s_flags & MS_NOTAGCHECK)) ++ ++int dx_parse_tag(char *string, tag_t *tag, int remove, int *mnt_flags, ++ unsigned long *flags); ++ ++#ifdef CONFIG_PROPAGATE ++ ++void __dx_propagate_tag(struct nameidata *nd, struct inode *inode); ++ ++#define dx_propagate_tag(n, i) __dx_propagate_tag(n, i) ++ ++#else ++#define dx_propagate_tag(n, i) do { } while (0) ++#endif ++ ++#endif /* _DX_TAG_H */ +diff -NurpP --minimal linux-3.2.34/include/linux/vserver/tag_cmd.h linux-3.2.34-vs2.3.2.15/include/linux/vserver/tag_cmd.h +--- linux-3.2.34/include/linux/vserver/tag_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/linux/vserver/tag_cmd.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,22 @@ ++#ifndef _VX_TAG_CMD_H ++#define _VX_TAG_CMD_H ++ ++ ++/* vinfo commands */ ++ ++#define VCMD_task_tag VC_CMD(VINFO, 3, 0) ++ ++#ifdef __KERNEL__ ++extern int vc_task_tag(uint32_t); ++ ++#endif /* __KERNEL__ */ ++ ++/* context commands */ ++ ++#define VCMD_tag_migrate VC_CMD(TAGMIG, 1, 0) ++ ++#ifdef __KERNEL__ ++extern int vc_tag_migrate(uint32_t); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_TAG_CMD_H */ +diff -NurpP --minimal linux-3.2.34/include/net/addrconf.h linux-3.2.34-vs2.3.2.15/include/net/addrconf.h +--- linux-3.2.34/include/net/addrconf.h 2012-01-09 16:14:59.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/net/addrconf.h 2011-12-05 19:33:02.000000000 +0100 +@@ -80,7 +80,8 @@ extern int ipv6_dev_get_saddr(struct n + struct net_device *dev, + const struct in6_addr *daddr, + unsigned int srcprefs, +- struct in6_addr *saddr); ++ struct in6_addr *saddr, ++ struct nx_info *nxi); + extern int ipv6_get_lladdr(struct net_device *dev, + struct in6_addr *addr, + unsigned char banned_flags); +diff -NurpP --minimal linux-3.2.34/include/net/af_unix.h linux-3.2.34-vs2.3.2.15/include/net/af_unix.h +--- linux-3.2.34/include/net/af_unix.h 2011-07-22 11:18:11.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/net/af_unix.h 2011-12-05 19:33:02.000000000 +0100 +@@ -4,6 +4,7 @@ + 
#include + #include + #include ++#include + #include + + extern void unix_inflight(struct file *fp); +diff -NurpP --minimal linux-3.2.34/include/net/inet_timewait_sock.h linux-3.2.34-vs2.3.2.15/include/net/inet_timewait_sock.h +--- linux-3.2.34/include/net/inet_timewait_sock.h 2012-01-09 16:14:59.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/net/inet_timewait_sock.h 2011-12-05 19:33:02.000000000 +0100 +@@ -112,6 +112,10 @@ struct inet_timewait_sock { + #define tw_net __tw_common.skc_net + #define tw_daddr __tw_common.skc_daddr + #define tw_rcv_saddr __tw_common.skc_rcv_saddr ++#define tw_xid __tw_common.skc_xid ++#define tw_vx_info __tw_common.skc_vx_info ++#define tw_nid __tw_common.skc_nid ++#define tw_nx_info __tw_common.skc_nx_info + int tw_timeout; + volatile unsigned char tw_substate; + unsigned char tw_rcv_wscale; +diff -NurpP --minimal linux-3.2.34/include/net/ip6_route.h linux-3.2.34-vs2.3.2.15/include/net/ip6_route.h +--- linux-3.2.34/include/net/ip6_route.h 2011-07-22 11:18:11.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/include/net/ip6_route.h 2011-12-05 19:33:02.000000000 +0100 +@@ -86,7 +86,8 @@ extern int ip6_route_get_saddr(struct + struct rt6_info *rt, + const struct in6_addr *daddr, + unsigned int prefs, +- struct in6_addr *saddr); ++ struct in6_addr *saddr, ++ struct nx_info *nxi); + + extern struct rt6_info *rt6_lookup(struct net *net, + const struct in6_addr *daddr, +diff -NurpP --minimal linux-3.2.34/include/net/route.h linux-3.2.34-vs2.3.2.15/include/net/route.h +--- linux-3.2.34/include/net/route.h 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/net/route.h 2012-03-01 21:39:38.000000000 +0100 +@@ -202,6 +202,9 @@ static inline void ip_rt_put(struct rtab + dst_release(&rt->dst); + } + ++#include ++#include ++ + #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) + + extern const __u8 ip_tos2prio[16]; +@@ -253,6 +256,9 @@ static inline void ip_route_connect_init + protocol, flow_flags, dst, src, dport, sport); + } + ++extern struct rtable *ip_v4_find_src(struct net *net, struct nx_info *, ++ struct flowi4 *); ++ + static inline struct rtable *ip_route_connect(struct flowi4 *fl4, + __be32 dst, __be32 src, u32 tos, + int oif, u8 protocol, +@@ -261,11 +267,25 @@ static inline struct rtable *ip_route_co + { + struct net *net = sock_net(sk); + struct rtable *rt; ++ struct nx_info *nx_info = current_nx_info(); + + ip_route_connect_init(fl4, dst, src, tos, oif, protocol, + sport, dport, sk, can_sleep); + +- if (!dst || !src) { ++ if (sk) ++ nx_info = sk->sk_nx_info; ++ ++ vxdprintk(VXD_CBIT(net, 4), ++ "ip_route_connect(%p) %p,%p;%lx", ++ sk, nx_info, sk->sk_socket, ++ (sk->sk_socket?sk->sk_socket->flags:0)); ++ ++ rt = ip_v4_find_src(net, nx_info, fl4); ++ if (IS_ERR(rt)) ++ return rt; ++ ip_rt_put(rt); ++ ++ if (!fl4->daddr || !fl4->saddr) { + rt = __ip_route_output_key(net, fl4); + if (IS_ERR(rt)) + return rt; +diff -NurpP --minimal linux-3.2.34/include/net/sock.h linux-3.2.34-vs2.3.2.15/include/net/sock.h +--- linux-3.2.34/include/net/sock.h 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/include/net/sock.h 2012-10-22 12:59:52.000000000 +0200 +@@ -149,6 +149,10 @@ struct sock_common { + #ifdef CONFIG_NET_NS + struct net *skc_net; + #endif ++ xid_t skc_xid; ++ struct vx_info *skc_vx_info; ++ nid_t skc_nid; ++ struct nx_info *skc_nx_info; + /* + * fields between dontcopy_begin/dontcopy_end + * are not copied in sock_copy() +@@ -257,6 +261,10 @@ struct sock { + #define sk_bind_node __sk_common.skc_bind_node + #define sk_prot 
__sk_common.skc_prot + #define sk_net __sk_common.skc_net ++#define sk_xid __sk_common.skc_xid ++#define sk_vx_info __sk_common.skc_vx_info ++#define sk_nid __sk_common.skc_nid ++#define sk_nx_info __sk_common.skc_nx_info + socket_lock_t sk_lock; + struct sk_buff_head sk_receive_queue; + /* +diff -NurpP --minimal linux-3.2.34/init/Kconfig linux-3.2.34-vs2.3.2.15/init/Kconfig +--- linux-3.2.34/init/Kconfig 2012-01-09 16:14:59.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/init/Kconfig 2011-12-05 19:33:02.000000000 +0100 +@@ -574,6 +574,7 @@ config HAVE_UNSTABLE_SCHED_CLOCK + menuconfig CGROUPS + boolean "Control Group support" + depends on EVENTFD ++ default y + help + This option adds support for grouping sets of processes together, for + use with process control subsystems such as Cpusets, CFS, memory +@@ -802,6 +803,7 @@ config IPC_NS + config USER_NS + bool "User namespace (EXPERIMENTAL)" + depends on EXPERIMENTAL ++ depends on VSERVER_DISABLED + default y + help + This allows containers, i.e. vservers, to use user namespaces +diff -NurpP --minimal linux-3.2.34/init/main.c linux-3.2.34-vs2.3.2.15/init/main.c +--- linux-3.2.34/init/main.c 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/init/main.c 2012-06-14 20:45:24.000000000 +0200 +@@ -68,6 +68,7 @@ + #include + #include + #include ++#include + + #include + #include +diff -NurpP --minimal linux-3.2.34/ipc/mqueue.c linux-3.2.34-vs2.3.2.15/ipc/mqueue.c +--- linux-3.2.34/ipc/mqueue.c 2012-01-09 16:14:59.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/ipc/mqueue.c 2012-01-09 16:19:31.000000000 +0100 +@@ -33,6 +33,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include "util.h" +@@ -66,6 +68,7 @@ struct mqueue_inode_info { + struct sigevent notify; + struct pid* notify_owner; + struct user_struct *user; /* user who created, for accounting */ ++ struct vx_info *vxi; + struct sock *notify_sock; + struct sk_buff *notify_cookie; + +@@ -128,6 +131,7 @@ static struct inode *mqueue_get_inode(st + if (S_ISREG(mode)) { + struct mqueue_inode_info *info; + struct task_struct *p = current; ++ struct vx_info *vxi = p->vx_info; + unsigned long mq_bytes, mq_msg_tblsz; + + inode->i_fop = &mqueue_file_operations; +@@ -141,6 +145,7 @@ static struct inode *mqueue_get_inode(st + info->notify_owner = NULL; + info->qsize = 0; + info->user = NULL; /* set when all is ok */ ++ info->vxi = NULL; + memset(&info->attr, 0, sizeof(info->attr)); + info->attr.mq_maxmsg = ipc_ns->mq_msg_max; + info->attr.mq_msgsize = ipc_ns->mq_msgsize_max; +@@ -158,17 +163,20 @@ static struct inode *mqueue_get_inode(st + + spin_lock(&mq_lock); + if (u->mq_bytes + mq_bytes < u->mq_bytes || +- u->mq_bytes + mq_bytes > task_rlimit(p, RLIMIT_MSGQUEUE)) { ++ u->mq_bytes + mq_bytes > task_rlimit(p, RLIMIT_MSGQUEUE) || ++ !vx_ipcmsg_avail(vxi, mq_bytes)) { + spin_unlock(&mq_lock); + /* mqueue_evict_inode() releases info->messages */ + ret = -EMFILE; + goto out_inode; + } + u->mq_bytes += mq_bytes; ++ vx_ipcmsg_add(vxi, u, mq_bytes); + spin_unlock(&mq_lock); + + /* all is ok */ + info->user = get_uid(u); ++ info->vxi = get_vx_info(vxi); + } else if (S_ISDIR(mode)) { + inc_nlink(inode); + /* Some things misbehave if size == 0 on a directory */ +@@ -278,8 +286,11 @@ static void mqueue_evict_inode(struct in + + info->attr.mq_msgsize); + user = info->user; + if (user) { ++ struct vx_info *vxi = info->vxi; ++ + spin_lock(&mq_lock); + user->mq_bytes -= mq_bytes; ++ vx_ipcmsg_sub(vxi, user, mq_bytes); + /* + * get_ns_from_inode() ensures that the + * (ipc_ns = 
sb->s_fs_info) is either a valid ipc_ns +@@ -289,6 +300,7 @@ static void mqueue_evict_inode(struct in + if (ipc_ns) + ipc_ns->mq_queues_count--; + spin_unlock(&mq_lock); ++ put_vx_info(vxi); + free_uid(user); + } + if (ipc_ns) +diff -NurpP --minimal linux-3.2.34/ipc/msg.c linux-3.2.34-vs2.3.2.15/ipc/msg.c +--- linux-3.2.34/ipc/msg.c 2011-05-22 16:17:59.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/ipc/msg.c 2011-12-05 19:33:02.000000000 +0100 +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -190,6 +191,7 @@ static int newque(struct ipc_namespace * + + msq->q_perm.mode = msgflg & S_IRWXUGO; + msq->q_perm.key = key; ++ msq->q_perm.xid = vx_current_xid(); + + msq->q_perm.security = NULL; + retval = security_msg_queue_alloc(msq); +diff -NurpP --minimal linux-3.2.34/ipc/namespace.c linux-3.2.34-vs2.3.2.15/ipc/namespace.c +--- linux-3.2.34/ipc/namespace.c 2011-07-22 11:18:12.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/ipc/namespace.c 2011-12-05 19:33:02.000000000 +0100 +@@ -13,11 +13,12 @@ + #include + #include + #include ++#include ++#include + + #include "util.h" + +-static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, +- struct ipc_namespace *old_ns) ++static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns) + { + struct ipc_namespace *ns; + int err; +@@ -46,19 +47,18 @@ static struct ipc_namespace *create_ipc_ + ipcns_notify(IPCNS_CREATED); + register_ipcns_notifier(ns); + +- ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); ++ ns->user_ns = get_user_ns(user_ns); + + return ns; + } + + struct ipc_namespace *copy_ipcs(unsigned long flags, +- struct task_struct *tsk) ++ struct ipc_namespace *old_ns, ++ struct user_namespace *user_ns) + { +- struct ipc_namespace *ns = tsk->nsproxy->ipc_ns; +- + if (!(flags & CLONE_NEWIPC)) +- return get_ipc_ns(ns); +- return create_ipc_ns(tsk, ns); ++ return get_ipc_ns(old_ns); ++ return create_ipc_ns(user_ns); + } + + /* +diff -NurpP --minimal linux-3.2.34/ipc/sem.c linux-3.2.34-vs2.3.2.15/ipc/sem.c +--- linux-3.2.34/ipc/sem.c 2012-01-09 16:14:59.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/ipc/sem.c 2011-12-05 19:33:02.000000000 +0100 +@@ -86,6 +86,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include "util.h" +@@ -306,6 +308,7 @@ static int newary(struct ipc_namespace * + + sma->sem_perm.mode = (semflg & S_IRWXUGO); + sma->sem_perm.key = key; ++ sma->sem_perm.xid = vx_current_xid(); + + sma->sem_perm.security = NULL; + retval = security_sem_alloc(sma); +@@ -321,6 +324,9 @@ static int newary(struct ipc_namespace * + return id; + } + ns->used_sems += nsems; ++ /* FIXME: obsoleted? */ ++ vx_semary_inc(sma); ++ vx_nsems_add(sma, nsems); + + sma->sem_base = (struct sem *) &sma[1]; + +@@ -770,6 +776,9 @@ static void freeary(struct ipc_namespace + + wake_up_sem_queue_do(&tasks); + ns->used_sems -= sma->sem_nsems; ++ /* FIXME: obsoleted? 
*/ ++ vx_nsems_sub(sma, sma->sem_nsems); ++ vx_semary_dec(sma); + security_sem_free(sma); + ipc_rcu_putref(sma); + } +diff -NurpP --minimal linux-3.2.34/ipc/shm.c linux-3.2.34-vs2.3.2.15/ipc/shm.c +--- linux-3.2.34/ipc/shm.c 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/ipc/shm.c 2012-01-26 08:52:10.000000000 +0100 +@@ -39,6 +39,8 @@ + #include + #include + #include ++#include ++#include + + #include + +@@ -187,7 +189,12 @@ static void shm_open(struct vm_area_stru + */ + static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) + { +- ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ struct vx_info *vxi = lookup_vx_info(shp->shm_perm.xid); ++ int numpages = (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ ++ vx_ipcshm_sub(vxi, shp, numpages); ++ ns->shm_tot -= numpages; ++ + shm_rmid(ns, shp); + shm_unlock(shp); + if (!is_file_hugepages(shp->shm_file)) +@@ -197,6 +204,7 @@ static void shm_destroy(struct ipc_names + shp->mlock_user); + fput (shp->shm_file); + security_shm_free(shp); ++ put_vx_info(vxi); + ipc_rcu_putref(shp); + } + +@@ -462,11 +470,15 @@ static int newseg(struct ipc_namespace * + if (ns->shm_tot + numpages > ns->shm_ctlall) + return -ENOSPC; + ++ if (!vx_ipcshm_avail(current_vx_info(), numpages)) ++ return -ENOSPC; ++ + shp = ipc_rcu_alloc(sizeof(*shp)); + if (!shp) + return -ENOMEM; + + shp->shm_perm.key = key; ++ shp->shm_perm.xid = vx_current_xid(); + shp->shm_perm.mode = (shmflg & S_IRWXUGO); + shp->mlock_user = NULL; + +@@ -521,6 +533,7 @@ static int newseg(struct ipc_namespace * + ns->shm_tot += numpages; + error = shp->shm_perm.id; + shm_unlock(shp); ++ vx_ipcshm_add(current_vx_info(), key, numpages); + return error; + + no_id: +diff -NurpP --minimal linux-3.2.34/kernel/Makefile linux-3.2.34-vs2.3.2.15/kernel/Makefile +--- linux-3.2.34/kernel/Makefile 2012-01-09 16:14:59.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/Makefile 2011-12-05 19:33:02.000000000 +0100 +@@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg + CFLAGS_REMOVE_irq_work.o = -pg + endif + ++obj-y += vserver/ + obj-$(CONFIG_FREEZER) += freezer.o + obj-$(CONFIG_PROFILING) += profile.o + obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o +diff -NurpP --minimal linux-3.2.34/kernel/capability.c linux-3.2.34-vs2.3.2.15/kernel/capability.c +--- linux-3.2.34/kernel/capability.c 2012-01-09 16:14:59.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/capability.c 2011-12-05 19:33:02.000000000 +0100 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -116,6 +117,7 @@ static int cap_validate_magic(cap_user_h + return 0; + } + ++ + /* + * The only thing that can change the capabilities of the current + * process is the current process. 
As such, we can't be in this code +@@ -340,6 +342,8 @@ bool has_capability_noaudit(struct task_ + return (ret == 0); + } + ++#include ++ + /** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for +diff -NurpP --minimal linux-3.2.34/kernel/compat.c linux-3.2.34-vs2.3.2.15/kernel/compat.c +--- linux-3.2.34/kernel/compat.c 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/compat.c 2012-06-14 20:45:24.000000000 +0200 +@@ -1002,7 +1002,7 @@ asmlinkage long compat_sys_stime(compat_ + if (err) + return err; + +- do_settimeofday(&tv); ++ vx_settimeofday(&tv); + return 0; + } + +diff -NurpP --minimal linux-3.2.34/kernel/cred.c linux-3.2.34-vs2.3.2.15/kernel/cred.c +--- linux-3.2.34/kernel/cred.c 2012-11-18 18:42:23.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/cred.c 2012-04-16 12:14:54.000000000 +0200 +@@ -61,31 +61,6 @@ struct cred init_cred = { + #endif + }; + +-static inline void set_cred_subscribers(struct cred *cred, int n) +-{ +-#ifdef CONFIG_DEBUG_CREDENTIALS +- atomic_set(&cred->subscribers, n); +-#endif +-} +- +-static inline int read_cred_subscribers(const struct cred *cred) +-{ +-#ifdef CONFIG_DEBUG_CREDENTIALS +- return atomic_read(&cred->subscribers); +-#else +- return 0; +-#endif +-} +- +-static inline void alter_cred_subscribers(const struct cred *_cred, int n) +-{ +-#ifdef CONFIG_DEBUG_CREDENTIALS +- struct cred *cred = (struct cred *) _cred; +- +- atomic_add(n, &cred->subscribers); +-#endif +-} +- + /* + * Dispose of the shared task group credentials + */ +@@ -281,21 +256,16 @@ error: + * + * Call commit_creds() or abort_creds() to clean up. + */ +-struct cred *prepare_creds(void) ++struct cred *__prepare_creds(const struct cred *old) + { +- struct task_struct *task = current; +- const struct cred *old; + struct cred *new; + +- validate_process_creds(); +- + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + + kdebug("prepare_creds() alloc %p", new); + +- old = task->cred; + memcpy(new, old, sizeof(struct cred)); + + atomic_set(&new->usage, 1); +@@ -322,6 +292,13 @@ error: + abort_creds(new); + return NULL; + } ++ ++struct cred *prepare_creds(void) ++{ ++ validate_process_creds(); ++ ++ return __prepare_creds(current->cred); ++} + EXPORT_SYMBOL(prepare_creds); + + /* +diff -NurpP --minimal linux-3.2.34/kernel/exit.c linux-3.2.34-vs2.3.2.15/kernel/exit.c +--- linux-3.2.34/kernel/exit.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/exit.c 2012-10-22 12:59:52.000000000 +0200 +@@ -48,6 +48,10 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + #include + #include + #include +@@ -480,9 +484,11 @@ static void close_files(struct files_str + filp_close(file, files); + cond_resched(); + } ++ vx_openfd_dec(i); + } + i++; + set >>= 1; ++ cond_resched(); + } + } + } +@@ -987,6 +993,9 @@ NORET_TYPE void do_exit(long code) + */ + ptrace_put_breakpoints(tsk); + ++ /* needs to stay before exit_notify() */ ++ exit_vx_info_early(tsk, code); ++ + exit_notify(tsk, group_dead); + #ifdef CONFIG_NUMA + task_lock(tsk); +@@ -1017,6 +1026,10 @@ NORET_TYPE void do_exit(long code) + + validate_creds_for_do_exit(tsk); + ++ /* needs to stay after exit_notify() */ ++ exit_vx_info(tsk, code); ++ exit_nx_info(tsk); ++ + preempt_disable(); + exit_rcu(); + +@@ -1038,6 +1051,7 @@ NORET_TYPE void do_exit(long code) + /* causes final put_task_struct in finish_task_switch(). 
*/ + tsk->state = TASK_DEAD; + schedule(); ++ printk("bad task: %p [%lx]\n", current, current->state); + BUG(); + /* Avoid "noreturn function does return". */ + for (;;) +diff -NurpP --minimal linux-3.2.34/kernel/fork.c linux-3.2.34-vs2.3.2.15/kernel/fork.c +--- linux-3.2.34/kernel/fork.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/fork.c 2012-08-13 12:40:51.000000000 +0200 +@@ -68,6 +68,9 @@ + #include + #include + #include ++#include ++#include ++#include + + #include + #include +@@ -167,6 +170,8 @@ void free_task(struct task_struct *tsk) + account_kernel_stack(tsk->stack, -1); + free_thread_info(tsk->stack); + rt_mutex_debug_task_free(tsk); ++ clr_vx_info(&tsk->vx_info); ++ clr_nx_info(&tsk->nx_info); + ftrace_graph_exit_task(tsk); + free_task_struct(tsk); + } +@@ -503,6 +508,7 @@ static struct mm_struct *mm_init(struct + if (likely(!mm_alloc_pgd(mm))) { + mm->def_flags = 0; + mmu_notifier_mm_init(mm); ++ set_vx_info(&mm->mm_vx_info, p->vx_info); + return mm; + } + +@@ -540,6 +546,7 @@ void __mmdrop(struct mm_struct *mm) + #ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); + #endif ++ clr_vx_info(&mm->mm_vx_info); + free_mm(mm); + } + EXPORT_SYMBOL_GPL(__mmdrop); +@@ -727,6 +734,7 @@ struct mm_struct *dup_mm(struct task_str + goto fail_nomem; + + memcpy(mm, oldmm, sizeof(*mm)); ++ mm->mm_vx_info = NULL; + mm_init_cpumask(mm); + + /* Initializing for Swap token stuff */ +@@ -770,6 +778,7 @@ fail_nocontext: + * If init_new_context() failed, we cannot use mmput() to free the mm + * because it calls destroy_context() + */ ++ clr_vx_info(&mm->mm_vx_info); + mm_free_pgd(mm); + free_mm(mm); + return NULL; +@@ -1058,6 +1067,8 @@ static struct task_struct *copy_process( + int retval; + struct task_struct *p; + int cgroup_callbacks_done = 0; ++ struct vx_info *vxi; ++ struct nx_info *nxi; + + if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) + return ERR_PTR(-EINVAL); +@@ -1104,7 +1115,12 @@ static struct task_struct *copy_process( + DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); + DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); + #endif ++ init_vx_info(&p->vx_info, current_vx_info()); ++ init_nx_info(&p->nx_info, current_nx_info()); ++ + retval = -EAGAIN; ++ if (!vx_nproc_avail(1)) ++ goto bad_fork_free; + if (atomic_read(&p->real_cred->user->processes) >= + task_rlimit(p, RLIMIT_NPROC)) { + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && +@@ -1377,6 +1393,18 @@ static struct task_struct *copy_process( + + total_forks++; + spin_unlock(¤t->sighand->siglock); ++ ++ /* p is copy of current */ ++ vxi = p->vx_info; ++ if (vxi) { ++ claim_vx_info(vxi, p); ++ atomic_inc(&vxi->cvirt.nr_threads); ++ atomic_inc(&vxi->cvirt.total_forks); ++ vx_nproc_inc(p); ++ } ++ nxi = p->nx_info; ++ if (nxi) ++ claim_nx_info(nxi, p); + write_unlock_irq(&tasklist_lock); + proc_fork_connector(p); + cgroup_post_fork(p); +diff -NurpP --minimal linux-3.2.34/kernel/kthread.c linux-3.2.34-vs2.3.2.15/kernel/kthread.c +--- linux-3.2.34/kernel/kthread.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/kthread.c 2011-12-05 19:33:02.000000000 +0100 +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + + static DEFINE_SPINLOCK(kthread_create_lock); +diff -NurpP --minimal linux-3.2.34/kernel/nsproxy.c linux-3.2.34-vs2.3.2.15/kernel/nsproxy.c +--- linux-3.2.34/kernel/nsproxy.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/nsproxy.c 2011-12-05 19:33:02.000000000 +0100 +@@ -20,6 +20,8 @@ + #include + 
#include + #include ++#include ++#include + #include + #include + #include +@@ -46,8 +48,11 @@ static inline struct nsproxy *create_nsp + struct nsproxy *nsproxy; + + nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); +- if (nsproxy) ++ if (nsproxy) { + atomic_set(&nsproxy->count, 1); ++ atomic_inc(&vs_global_nsproxy); ++ } ++ vxdprintk(VXD_CBIT(space, 2), "create_nsproxy = %p[1]", nsproxy); + return nsproxy; + } + +@@ -56,8 +61,11 @@ static inline struct nsproxy *create_nsp + * Return the newly created nsproxy. Do not attach this to the task, + * leave it to the caller to do proper locking and attach it to task. + */ +-static struct nsproxy *create_new_namespaces(unsigned long flags, +- struct task_struct *tsk, struct fs_struct *new_fs) ++static struct nsproxy *unshare_namespaces(unsigned long flags, ++ struct nsproxy *orig, ++ struct fs_struct *new_fs, ++ struct user_namespace *new_user, ++ struct pid_namespace *new_pid) + { + struct nsproxy *new_nsp; + int err; +@@ -66,31 +74,31 @@ static struct nsproxy *create_new_namesp + if (!new_nsp) + return ERR_PTR(-ENOMEM); + +- new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); ++ new_nsp->mnt_ns = copy_mnt_ns(flags, orig->mnt_ns, new_fs); + if (IS_ERR(new_nsp->mnt_ns)) { + err = PTR_ERR(new_nsp->mnt_ns); + goto out_ns; + } + +- new_nsp->uts_ns = copy_utsname(flags, tsk); ++ new_nsp->uts_ns = copy_utsname(flags, orig->uts_ns, new_user); + if (IS_ERR(new_nsp->uts_ns)) { + err = PTR_ERR(new_nsp->uts_ns); + goto out_uts; + } + +- new_nsp->ipc_ns = copy_ipcs(flags, tsk); ++ new_nsp->ipc_ns = copy_ipcs(flags, orig->ipc_ns, new_user); + if (IS_ERR(new_nsp->ipc_ns)) { + err = PTR_ERR(new_nsp->ipc_ns); + goto out_ipc; + } + +- new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); ++ new_nsp->pid_ns = copy_pid_ns(flags, new_pid); + if (IS_ERR(new_nsp->pid_ns)) { + err = PTR_ERR(new_nsp->pid_ns); + goto out_pid; + } + +- new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); ++ new_nsp->net_ns = copy_net_ns(flags, orig->net_ns); + if (IS_ERR(new_nsp->net_ns)) { + err = PTR_ERR(new_nsp->net_ns); + goto out_net; +@@ -115,6 +123,40 @@ out_ns: + return ERR_PTR(err); + } + ++static struct nsproxy *create_new_namespaces(unsigned long flags, ++ struct task_struct *tsk, struct fs_struct *new_fs) ++{ ++ return unshare_namespaces(flags, tsk->nsproxy, ++ new_fs, task_cred_xxx(tsk, user)->user_ns, ++ task_active_pid_ns(tsk)); ++} ++ ++/* ++ * copies the nsproxy, setting refcount to 1, and grabbing a ++ * reference to all contained namespaces. ++ */ ++struct nsproxy *copy_nsproxy(struct nsproxy *orig) ++{ ++ struct nsproxy *ns = create_nsproxy(); ++ ++ if (ns) { ++ memcpy(ns, orig, sizeof(struct nsproxy)); ++ atomic_set(&ns->count, 1); ++ ++ if (ns->mnt_ns) ++ get_mnt_ns(ns->mnt_ns); ++ if (ns->uts_ns) ++ get_uts_ns(ns->uts_ns); ++ if (ns->ipc_ns) ++ get_ipc_ns(ns->ipc_ns); ++ if (ns->pid_ns) ++ get_pid_ns(ns->pid_ns); ++ if (ns->net_ns) ++ get_net(ns->net_ns); ++ } ++ return ns; ++} ++ + /* + * called from clone. This now handles copy for nsproxy and all + * namespaces therein. 
+@@ -122,9 +164,12 @@ out_ns: + int copy_namespaces(unsigned long flags, struct task_struct *tsk) + { + struct nsproxy *old_ns = tsk->nsproxy; +- struct nsproxy *new_ns; ++ struct nsproxy *new_ns = NULL; + int err = 0; + ++ vxdprintk(VXD_CBIT(space, 7), "copy_namespaces(0x%08lx,%p[%p])", ++ flags, tsk, old_ns); ++ + if (!old_ns) + return 0; + +@@ -134,7 +179,7 @@ int copy_namespaces(unsigned long flags, + CLONE_NEWPID | CLONE_NEWNET))) + return 0; + +- if (!capable(CAP_SYS_ADMIN)) { ++ if (!vx_can_unshare(CAP_SYS_ADMIN, flags)) { + err = -EPERM; + goto out; + } +@@ -161,6 +206,9 @@ int copy_namespaces(unsigned long flags, + + out: + put_nsproxy(old_ns); ++ vxdprintk(VXD_CBIT(space, 3), ++ "copy_namespaces(0x%08lx,%p[%p]) = %d [%p]", ++ flags, tsk, old_ns, err, new_ns); + return err; + } + +@@ -174,7 +222,9 @@ void free_nsproxy(struct nsproxy *ns) + put_ipc_ns(ns->ipc_ns); + if (ns->pid_ns) + put_pid_ns(ns->pid_ns); +- put_net(ns->net_ns); ++ if (ns->net_ns) ++ put_net(ns->net_ns); ++ atomic_dec(&vs_global_nsproxy); + kmem_cache_free(nsproxy_cachep, ns); + } + +@@ -187,11 +237,15 @@ int unshare_nsproxy_namespaces(unsigned + { + int err = 0; + ++ vxdprintk(VXD_CBIT(space, 4), ++ "unshare_nsproxy_namespaces(0x%08lx,[%p])", ++ unshare_flags, current->nsproxy); ++ + if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWNET))) + return 0; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_can_unshare(CAP_SYS_ADMIN, unshare_flags)) + return -EPERM; + + *new_nsp = create_new_namespaces(unshare_flags, current, +diff -NurpP --minimal linux-3.2.34/kernel/pid.c linux-3.2.34-vs2.3.2.15/kernel/pid.c +--- linux-3.2.34/kernel/pid.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/pid.c 2011-12-05 19:43:14.000000000 +0100 +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + + #define pid_hashfn(nr, ns) \ + hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) +@@ -342,7 +343,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); + + struct pid *find_vpid(int nr) + { +- return find_pid_ns(nr, current->nsproxy->pid_ns); ++ return find_pid_ns(vx_rmap_pid(nr), current->nsproxy->pid_ns); + } + EXPORT_SYMBOL_GPL(find_vpid); + +@@ -402,6 +403,9 @@ void transfer_pid(struct task_struct *ol + struct task_struct *pid_task(struct pid *pid, enum pid_type type) + { + struct task_struct *result = NULL; ++ ++ if (type == PIDTYPE_REALPID) ++ type = PIDTYPE_PID; + if (pid) { + struct hlist_node *first; + first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), +@@ -421,7 +425,7 @@ struct task_struct *find_task_by_pid_ns( + rcu_lockdep_assert(rcu_read_lock_held(), + "find_task_by_pid_ns() needs rcu_read_lock()" + " protection"); +- return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); ++ return pid_task(find_pid_ns(vx_rmap_pid(nr), ns), PIDTYPE_PID); + } + + struct task_struct *find_task_by_vpid(pid_t vnr) +@@ -465,7 +469,7 @@ struct pid *find_get_pid(pid_t nr) + } + EXPORT_SYMBOL_GPL(find_get_pid); + +-pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) ++pid_t pid_unmapped_nr_ns(struct pid *pid, struct pid_namespace *ns) + { + struct upid *upid; + pid_t nr = 0; +@@ -478,6 +482,11 @@ pid_t pid_nr_ns(struct pid *pid, struct + return nr; + } + ++pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) ++{ ++ return vx_map_pid(pid_unmapped_nr_ns(pid, ns)); ++} ++ + pid_t pid_vnr(struct pid *pid) + { + return pid_nr_ns(pid, current->nsproxy->pid_ns); +diff -NurpP --minimal linux-3.2.34/kernel/pid_namespace.c linux-3.2.34-vs2.3.2.15/kernel/pid_namespace.c +--- 
linux-3.2.34/kernel/pid_namespace.c 2011-05-22 16:17:59.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/kernel/pid_namespace.c 2011-12-05 19:33:02.000000000 +0100 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + + #define BITS_PER_PAGE (PAGE_SIZE*8) + +@@ -88,6 +89,7 @@ static struct pid_namespace *create_pid_ + goto out_free_map; + + kref_init(&ns->kref); ++ atomic_inc(&vs_global_pid_ns); + ns->level = level; + ns->parent = get_pid_ns(parent_pid_ns); + +@@ -119,6 +121,7 @@ static void destroy_pid_namespace(struct + + for (i = 0; i < PIDMAP_ENTRIES; i++) + kfree(ns->pidmap[i].page); ++ atomic_dec(&vs_global_pid_ns); + kmem_cache_free(pid_ns_cachep, ns); + } + +diff -NurpP --minimal linux-3.2.34/kernel/posix-timers.c linux-3.2.34-vs2.3.2.15/kernel/posix-timers.c +--- linux-3.2.34/kernel/posix-timers.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/posix-timers.c 2011-12-05 19:44:00.000000000 +0100 +@@ -47,6 +47,7 @@ + #include + #include + #include ++#include + + /* + * Management arrays for POSIX timers. Timers are kept in slab memory +@@ -340,6 +341,7 @@ int posix_timer_event(struct k_itimer *t + { + struct task_struct *task; + int shared, ret = -1; ++ + /* + * FIXME: if ->sigq is queued we can race with + * dequeue_signal()->do_schedule_next_timer(). +@@ -356,10 +358,18 @@ int posix_timer_event(struct k_itimer *t + rcu_read_lock(); + task = pid_task(timr->it_pid, PIDTYPE_PID); + if (task) { ++ struct vx_info_save vxis; ++ struct vx_info *vxi; ++ ++ vxi = get_vx_info(task->vx_info); ++ enter_vx_info(vxi, &vxis); + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); + ret = send_sigqueue(timr->sigq, task, shared); ++ leave_vx_info(&vxis); ++ put_vx_info(vxi); + } + rcu_read_unlock(); ++ + /* If we failed to send the signal the timer stops. 
*/ + return ret > 0; + } +diff -NurpP --minimal linux-3.2.34/kernel/printk.c linux-3.2.34-vs2.3.2.15/kernel/printk.c +--- linux-3.2.34/kernel/printk.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/printk.c 2011-12-15 01:11:33.000000000 +0100 +@@ -41,6 +41,7 @@ + #include + #include + #include ++#include + + #include + +@@ -314,7 +315,7 @@ static int check_syslog_permissions(int + return 0; + + if (syslog_action_restricted(type)) { +- if (capable(CAP_SYSLOG)) ++ if (vx_capable(CAP_SYSLOG, VXC_SYSLOG)) + return 0; + /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ + if (capable(CAP_SYS_ADMIN)) { +@@ -344,12 +345,9 @@ int do_syslog(int type, char __user *buf + if (error) + return error; + +- switch (type) { +- case SYSLOG_ACTION_CLOSE: /* Close log */ +- break; +- case SYSLOG_ACTION_OPEN: /* Open log */ +- break; +- case SYSLOG_ACTION_READ: /* Read from log */ ++ if ((type == SYSLOG_ACTION_READ) || ++ (type == SYSLOG_ACTION_READ_ALL) || ++ (type == SYSLOG_ACTION_READ_CLEAR)) { + error = -EINVAL; + if (!buf || len < 0) + goto out; +@@ -360,6 +358,16 @@ int do_syslog(int type, char __user *buf + error = -EFAULT; + goto out; + } ++ } ++ if (!vx_check(0, VS_ADMIN|VS_WATCH)) ++ return vx_do_syslog(type, buf, len); ++ ++ switch (type) { ++ case SYSLOG_ACTION_CLOSE: /* Close log */ ++ break; ++ case SYSLOG_ACTION_OPEN: /* Open log */ ++ break; ++ case SYSLOG_ACTION_READ: /* Read from log */ + error = wait_event_interruptible(log_wait, + (log_start - log_end)); + if (error) +@@ -386,16 +394,6 @@ int do_syslog(int type, char __user *buf + /* FALL THRU */ + /* Read last kernel messages */ + case SYSLOG_ACTION_READ_ALL: +- error = -EINVAL; +- if (!buf || len < 0) +- goto out; +- error = 0; +- if (!len) +- goto out; +- if (!access_ok(VERIFY_WRITE, buf, len)) { +- error = -EFAULT; +- goto out; +- } + count = len; + if (count > log_buf_len) + count = log_buf_len; +diff -NurpP --minimal linux-3.2.34/kernel/ptrace.c linux-3.2.34-vs2.3.2.15/kernel/ptrace.c +--- linux-3.2.34/kernel/ptrace.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/ptrace.c 2012-01-09 16:19:31.000000000 +0100 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -209,6 +210,11 @@ ok: + dumpable = get_dumpable(task->mm); + if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) + return -EPERM; ++ if (!vx_check(task->xid, VS_ADMIN_P|VS_WATCH_P|VS_IDENT)) ++ return -EPERM; ++ if (!vx_check(task->xid, VS_IDENT) && ++ !task_vx_flags(task, VXF_STATE_ADMIN, 0)) ++ return -EACCES; + + return security_ptrace_access_check(task, mode); + } +diff -NurpP --minimal linux-3.2.34/kernel/sched.c linux-3.2.34-vs2.3.2.15/kernel/sched.c +--- linux-3.2.34/kernel/sched.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/sched.c 2012-10-22 12:59:52.000000000 +0200 +@@ -72,6 +72,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -3460,9 +3462,17 @@ EXPORT_SYMBOL(avenrun); /* should be rem + */ + void get_avenrun(unsigned long *loads, unsigned long offset, int shift) + { +- loads[0] = (avenrun[0] + offset) << shift; +- loads[1] = (avenrun[1] + offset) << shift; +- loads[2] = (avenrun[2] + offset) << shift; ++ if (vx_flags(VXF_VIRT_LOAD, 0)) { ++ struct vx_info *vxi = current_vx_info(); ++ ++ loads[0] = (vxi->cvirt.load[0] + offset) << shift; ++ loads[1] = (vxi->cvirt.load[1] + offset) << shift; ++ loads[2] = (vxi->cvirt.load[2] + offset) << shift; ++ } else { ++ loads[0] = (avenrun[0] + offset) 
<< shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++ } + } + + static long calc_load_fold_active(struct rq *this_rq) +@@ -4054,16 +4064,19 @@ void account_user_time(struct task_struc + cputime_t cputime_scaled) + { + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; ++ struct vx_info *vxi = p->vx_info; /* p is _always_ current */ + cputime64_t tmp; ++ int nice = (TASK_NICE(p) > 0); + + /* Add user time to process. */ + p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); ++ vx_account_user(vxi, cputime, nice); + account_group_user_time(p, cputime); + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); +- if (TASK_NICE(p) > 0) ++ if (nice) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +@@ -4115,10 +4128,12 @@ void __account_system_time(struct task_s + cputime_t cputime_scaled, cputime64_t *target_cputime64) + { + cputime64_t tmp = cputime_to_cputime64(cputime); ++ struct vx_info *vxi = p->vx_info; /* p is _always_ current */ + + /* Add system time to process. */ + p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); ++ vx_account_system(vxi, cputime, 0 /* do we have idle time? */); + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ +@@ -5324,7 +5339,7 @@ SYSCALL_DEFINE1(nice, int, increment) + nice = 19; + + if (increment < 0 && !can_nice(current, nice)) +- return -EPERM; ++ return vx_flags(VXF_IGNEG_NICE, 0) ? 0 : -EPERM; + + retval = security_task_setnice(current, nice); + if (retval) +diff -NurpP --minimal linux-3.2.34/kernel/sched_fair.c linux-3.2.34-vs2.3.2.15/kernel/sched_fair.c +--- linux-3.2.34/kernel/sched_fair.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/sched_fair.c 2012-08-13 12:40:51.000000000 +0200 +@@ -1014,6 +1014,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st + __enqueue_entity(cfs_rq, se); + se->on_rq = 1; + ++ if (entity_is_task(se)) ++ vx_activate_task(task_of(se)); + if (cfs_rq->nr_running == 1) { + list_add_leaf_cfs_rq(cfs_rq); + check_enqueue_throttle(cfs_rq); +@@ -1094,6 +1096,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, st + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->on_rq = 0; ++ if (entity_is_task(se)) ++ vx_deactivate_task(task_of(se)); + update_cfs_load(cfs_rq, 0); + account_entity_dequeue(cfs_rq, se); + +diff -NurpP --minimal linux-3.2.34/kernel/signal.c linux-3.2.34-vs2.3.2.15/kernel/signal.c +--- linux-3.2.34/kernel/signal.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/signal.c 2012-05-15 18:16:52.000000000 +0200 +@@ -28,6 +28,8 @@ + #include + #include + #include ++#include ++#include + #define CREATE_TRACE_POINTS + #include + +@@ -789,9 +791,18 @@ static int check_kill_permission(int sig + struct pid *sid; + int error; + ++ vxdprintk(VXD_CBIT(misc, 7), ++ "check_kill_permission(%d,%p,%p[#%u,%u])", ++ sig, info, t, vx_task_xid(t), t->pid); ++ + if (!valid_signal(sig)) + return -EINVAL; + ++/* FIXME: needed? if so, why? 
++ if ((info != SEND_SIG_NOINFO) && ++ (is_si_special(info) || !si_fromuser(info))) ++ goto skip; */ ++ + if (!si_fromuser(info)) + return 0; + +@@ -815,6 +826,20 @@ static int check_kill_permission(int sig + } + } + ++ error = -EPERM; ++ if (t->pid == 1 && current->xid) ++ return error; ++ ++ error = -ESRCH; ++ /* FIXME: we shouldn't return ESRCH ever, to avoid ++ loops, maybe ENOENT or EACCES? */ ++ if (!vx_check(vx_task_xid(t), VS_WATCH_P | VS_IDENT)) { ++ vxdprintk(current->xid || VXD_CBIT(misc, 7), ++ "signal %d[%p] xid mismatch %p[#%u,%u] xid=#%u", ++ sig, info, t, vx_task_xid(t), t->pid, current->xid); ++ return error; ++ } ++/* skip: */ + return security_task_kill(t, info, sig, 0); + } + +@@ -1319,7 +1344,7 @@ int kill_pid_info(int sig, struct siginf + rcu_read_lock(); + retry: + p = pid_task(pid, PIDTYPE_PID); +- if (p) { ++ if (p && vx_check(vx_task_xid(p), VS_IDENT)) { + error = group_send_sig_info(sig, info, p); + if (unlikely(error == -ESRCH)) + /* +@@ -1369,7 +1394,7 @@ int kill_pid_info_as_cred(int sig, struc + + rcu_read_lock(); + p = pid_task(pid, PIDTYPE_PID); +- if (!p) { ++ if (!p || !vx_check(vx_task_xid(p), VS_IDENT)) { + ret = -ESRCH; + goto out_unlock; + } +@@ -1421,8 +1446,10 @@ static int kill_something_info(int sig, + struct task_struct * p; + + for_each_process(p) { +- if (task_pid_vnr(p) > 1 && +- !same_thread_group(p, current)) { ++ if (vx_check(vx_task_xid(p), VS_ADMIN|VS_IDENT) && ++ task_pid_vnr(p) > 1 && ++ !same_thread_group(p, current) && ++ !vx_current_initpid(p->pid)) { + int err = group_send_sig_info(sig, info, p); + ++count; + if (err != -EPERM) +@@ -2264,6 +2291,11 @@ relock: + !sig_kernel_only(signr)) + continue; + ++ /* virtual init is protected against user signals */ ++ if ((info->si_code == SI_USER) && ++ vx_current_initpid(current->pid)) ++ continue; ++ + if (sig_kernel_stop(signr)) { + /* + * The default action is to stop all threads in +diff -NurpP --minimal linux-3.2.34/kernel/softirq.c linux-3.2.34-vs2.3.2.15/kernel/softirq.c +--- linux-3.2.34/kernel/softirq.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/softirq.c 2011-12-05 19:33:02.000000000 +0100 +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + + #define CREATE_TRACE_POINTS + #include +diff -NurpP --minimal linux-3.2.34/kernel/sys.c linux-3.2.34-vs2.3.2.15/kernel/sys.c +--- linux-3.2.34/kernel/sys.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/sys.c 2012-11-06 18:08:24.000000000 +0100 +@@ -45,6 +45,7 @@ + #include + #include + #include ++#include + + #include + /* Move somewhere else to avoid recompiling? 
*/ +@@ -155,7 +156,10 @@ static int set_one_prio(struct task_stru + goto out; + } + if (niceval < task_nice(p) && !can_nice(p, niceval)) { +- error = -EACCES; ++ if (vx_flags(VXF_IGNEG_NICE, 0)) ++ error = 0; ++ else ++ error = -EACCES; + goto out; + } + no_nice = security_task_setnice(p, niceval); +@@ -205,6 +209,8 @@ SYSCALL_DEFINE3(setpriority, int, which, + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ++ if (!vx_check(p->xid, VS_ADMIN_P | VS_IDENT)) ++ continue; + error = set_one_prio(p, niceval, error); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; +@@ -268,6 +274,8 @@ SYSCALL_DEFINE2(getpriority, int, which, + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ++ if (!vx_check(p->xid, VS_ADMIN_P | VS_IDENT)) ++ continue; + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; +@@ -419,6 +427,8 @@ EXPORT_SYMBOL_GPL(kernel_power_off); + + static DEFINE_MUTEX(reboot_mutex); + ++long vs_reboot(unsigned int, void __user *); ++ + /* + * Reboot system call: for obvious reasons only root may call it, + * and even root needs to set up some magic numbers in the registers +@@ -451,6 +461,9 @@ SYSCALL_DEFINE4(reboot, int, magic1, int + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + ++ if (!vx_check(0, VS_ADMIN|VS_WATCH)) ++ return vs_reboot(cmd, arg); ++ + mutex_lock(&reboot_mutex); + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: +@@ -1276,7 +1289,8 @@ SYSCALL_DEFINE2(sethostname, char __user + int errno; + char tmp[__NEW_UTS_LEN]; + +- if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) ++ if (!vx_ns_capable(current->nsproxy->uts_ns->user_ns, ++ CAP_SYS_ADMIN, VXC_SET_UTSNAME)) + return -EPERM; + + if (len < 0 || len > __NEW_UTS_LEN) +@@ -1327,7 +1341,8 @@ SYSCALL_DEFINE2(setdomainname, char __us + int errno; + char tmp[__NEW_UTS_LEN]; + +- if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) ++ if (!vx_ns_capable(current->nsproxy->uts_ns->user_ns, ++ CAP_SYS_ADMIN, VXC_SET_UTSNAME)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; +@@ -1446,7 +1461,7 @@ int do_prlimit(struct task_struct *tsk, + /* Keep the capable check against init_user_ns until + cgroups can contain all limits */ + if (new_rlim->rlim_max > rlim->rlim_max && +- !capable(CAP_SYS_RESOURCE)) ++ !vx_capable(CAP_SYS_RESOURCE, VXC_SET_RLIMIT)) + retval = -EPERM; + if (!retval) + retval = security_task_setrlimit(tsk->group_leader, +@@ -1500,7 +1515,8 @@ static int check_prlimit_permission(stru + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + return 0; +- if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) ++ if (vx_ns_capable(tcred->user->user_ns, ++ CAP_SYS_RESOURCE, VXC_SET_RLIMIT)) + return 0; + + return -EPERM; +diff -NurpP --minimal linux-3.2.34/kernel/sysctl.c linux-3.2.34-vs2.3.2.15/kernel/sysctl.c +--- linux-3.2.34/kernel/sysctl.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/sysctl.c 2012-04-16 12:14:54.000000000 +0200 +@@ -76,6 +76,7 @@ + #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) + #include + #endif ++extern char vshelper_path[]; + #ifdef CONFIG_CHR_DEV_SG + #include + #endif +@@ -572,6 +573,13 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dostring, + }, + #endif ++ { ++ .procname = "vshelper", ++ .data = &vshelper_path, ++ .maxlen = 256, ++ .mode = 0644, ++ .proc_handler = &proc_dostring, ++ }, + #ifdef CONFIG_CHR_DEV_SG + { + .procname = 
"sg-big-buff", +diff -NurpP --minimal linux-3.2.34/kernel/sysctl_binary.c linux-3.2.34-vs2.3.2.15/kernel/sysctl_binary.c +--- linux-3.2.34/kernel/sysctl_binary.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/sysctl_binary.c 2012-01-09 16:19:31.000000000 +0100 +@@ -73,6 +73,7 @@ static const struct bin_table bin_kern_t + + { CTL_INT, KERN_PANIC, "panic" }, + { CTL_INT, KERN_REALROOTDEV, "real-root-dev" }, ++ { CTL_STR, KERN_VSHELPER, "vshelper" }, + + { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" }, + { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" }, +diff -NurpP --minimal linux-3.2.34/kernel/time/timekeeping.c linux-3.2.34-vs2.3.2.15/kernel/time/timekeeping.c +--- linux-3.2.34/kernel/time/timekeeping.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/time/timekeeping.c 2012-11-06 18:08:24.000000000 +0100 +@@ -253,6 +253,7 @@ void getnstimeofday(struct timespec *ts) + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); ++ vx_adjust_timespec(ts); + } + + EXPORT_SYMBOL(getnstimeofday); +diff -NurpP --minimal linux-3.2.34/kernel/time.c linux-3.2.34-vs2.3.2.15/kernel/time.c +--- linux-3.2.34/kernel/time.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/time.c 2011-12-05 19:33:02.000000000 +0100 +@@ -92,7 +92,7 @@ SYSCALL_DEFINE1(stime, time_t __user *, + if (err) + return err; + +- do_settimeofday(&tv); ++ vx_settimeofday(&tv); + return 0; + } + +@@ -177,7 +177,7 @@ int do_sys_settimeofday(const struct tim + /* SMP safe, again the code in arch/foo/time.c should + * globally block out interrupts when it runs. + */ +- return do_settimeofday(tv); ++ return vx_settimeofday(tv); + } + return 0; + } +diff -NurpP --minimal linux-3.2.34/kernel/timer.c linux-3.2.34-vs2.3.2.15/kernel/timer.c +--- linux-3.2.34/kernel/timer.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/timer.c 2012-11-06 18:08:24.000000000 +0100 +@@ -40,6 +40,10 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + #include + #include +@@ -1338,12 +1342,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, sec + + #endif + +-#ifndef __alpha__ +- +-/* +- * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this +- * should be moved into arch/i386 instead? +- */ + + /** + * sys_getpid - return the thread group id of the current process +@@ -1372,10 +1370,23 @@ SYSCALL_DEFINE0(getppid) + rcu_read_lock(); + pid = task_tgid_vnr(rcu_dereference(current->real_parent)); + rcu_read_unlock(); ++ return vx_map_pid(pid); ++} + +- return pid; ++#ifdef __alpha__ ++ ++/* ++ * The Alpha uses getxpid, getxuid, and getxgid instead. 
++ */ ++ ++asmlinkage long do_getxpid(long *ppid) ++{ ++ *ppid = sys_getppid(); ++ return sys_getpid(); + } + ++#else /* _alpha_ */ ++ + SYSCALL_DEFINE0(getuid) + { + /* Only we change this so SMP safe */ +diff -NurpP --minimal linux-3.2.34/kernel/user_namespace.c linux-3.2.34-vs2.3.2.15/kernel/user_namespace.c +--- linux-3.2.34/kernel/user_namespace.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/user_namespace.c 2011-12-05 19:33:02.000000000 +0100 +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + static struct kmem_cache *user_ns_cachep __read_mostly; + +@@ -33,6 +34,7 @@ int create_user_ns(struct cred *new) + return -ENOMEM; + + kref_init(&ns->kref); ++ atomic_inc(&vs_global_user_ns); + + for (n = 0; n < UIDHASH_SZ; ++n) + INIT_HLIST_HEAD(ns->uidhash_table + n); +@@ -81,6 +83,8 @@ void free_user_ns(struct kref *kref) + struct user_namespace *ns = + container_of(kref, struct user_namespace, kref); + ++ /* FIXME: maybe move into destroyer? */ ++ atomic_dec(&vs_global_user_ns); + INIT_WORK(&ns->destroyer, free_user_ns_work); + schedule_work(&ns->destroyer); + } +diff -NurpP --minimal linux-3.2.34/kernel/utsname.c linux-3.2.34-vs2.3.2.15/kernel/utsname.c +--- linux-3.2.34/kernel/utsname.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/utsname.c 2011-12-05 19:33:02.000000000 +0100 +@@ -16,14 +16,17 @@ + #include + #include + #include ++#include + + static struct uts_namespace *create_uts_ns(void) + { + struct uts_namespace *uts_ns; + + uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); +- if (uts_ns) ++ if (uts_ns) { + kref_init(&uts_ns->kref); ++ atomic_inc(&vs_global_uts_ns); ++ } + return uts_ns; + } + +@@ -32,8 +35,8 @@ static struct uts_namespace *create_uts_ + * @old_ns: namespace to clone + * Return NULL on error (failure to kmalloc), new ns otherwise + */ +-static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, +- struct uts_namespace *old_ns) ++static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns, ++ struct user_namespace *old_user) + { + struct uts_namespace *ns; + +@@ -43,7 +46,7 @@ static struct uts_namespace *clone_uts_n + + down_read(&uts_sem); + memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); +- ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); ++ ns->user_ns = get_user_ns(old_user); + up_read(&uts_sem); + return ns; + } +@@ -55,9 +58,9 @@ static struct uts_namespace *clone_uts_n + * versa. 
+ */
+ struct uts_namespace *copy_utsname(unsigned long flags,
+- struct task_struct *tsk)
++ struct uts_namespace *old_ns,
++ struct user_namespace *user_ns)
+ {
+- struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
+ struct uts_namespace *new_ns;
+
+ BUG_ON(!old_ns);
+@@ -66,7 +69,7 @@ struct uts_namespace *copy_utsname(unsig
+ if (!(flags & CLONE_NEWUTS))
+ return old_ns;
+
+- new_ns = clone_uts_ns(tsk, old_ns);
++ new_ns = clone_uts_ns(old_ns, user_ns);
+
+ put_uts_ns(old_ns);
+ return new_ns;
+@@ -78,6 +81,7 @@ void free_uts_ns(struct kref *kref)
+
+ ns = container_of(kref, struct uts_namespace, kref);
+ put_user_ns(ns->user_ns);
++ atomic_dec(&vs_global_uts_ns);
+ kfree(ns);
+ }
+
+diff -NurpP --minimal linux-3.2.34/kernel/vserver/Kconfig linux-3.2.34-vs2.3.2.15/kernel/vserver/Kconfig
+--- linux-3.2.34/kernel/vserver/Kconfig 1970-01-01 01:00:00.000000000 +0100
++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/Kconfig 2011-12-15 01:52:48.000000000 +0100
+@@ -0,0 +1,224 @@
++#
++# Linux VServer configuration
++#
++
++menu "Linux VServer"
++
++config VSERVER_AUTO_LBACK
++ bool "Automatically Assign Loopback IP"
++ default y
++ help
++ Automatically assign a guest-specific loopback
++ IP and add it to the kernel network stack on
++ startup.
++
++config VSERVER_AUTO_SINGLE
++ bool "Automatic Single IP Special Casing"
++ depends on EXPERIMENTAL
++ default y
++ help
++ This allows network contexts with a single IP to
++ automatically remap 0.0.0.0 bindings to that IP,
++ avoiding further network checks and improving
++ performance.
++
++ (note: such guests do not allow changing the IP
++ on the fly and do not show loopback addresses)
++
++config VSERVER_COWBL
++ bool "Enable COW Immutable Link Breaking"
++ default y
++ help
++ This enables the COW (Copy-On-Write) link break code.
++ It allows you to treat unified files like normal files
++ when writing to them (which will implicitly break the
++ link and create a copy of the unified file).
++
++config VSERVER_VTIME
++ bool "Enable Virtualized Guest Time"
++ depends on EXPERIMENTAL
++ default n
++ help
++ This enables per-guest time offsets to allow for
++ adjusting the system clock individually per guest.
++ This adds some overhead to the time functions and
++ therefore should not be enabled without good reason.
++
++config VSERVER_DEVICE
++ bool "Enable Guest Device Mapping"
++ depends on EXPERIMENTAL
++ default n
++ help
++ This enables generic device remapping.
++
++config VSERVER_PROC_SECURE
++ bool "Enable Proc Security"
++ depends on PROC_FS
++ default y
++ help
++ This configures ProcFS security to initially hide
++ non-process entries for all contexts except the main and
++ spectator context (i.e. for all guests), which is a secure
++ default.
++
++ (note: on 1.2x the entries were visible by default)
++
++choice
++ prompt "Persistent Inode Tagging"
++ default TAGGING_ID24
++ help
++ This adds persistent context information to filesystems
++ mounted with the tagxid option. Tagging is a requirement
++ for per-context disk limits and per-context quota.
++
++
++config TAGGING_NONE
++ bool "Disabled"
++ help
++ do not store per-context information in inodes.
++
++config TAGGING_UID16
++ bool "UID16/GID32"
++ help
++ reduces UID to 16 bit, but leaves GID at 32 bit.
++
++config TAGGING_GID16
++ bool "UID32/GID16"
++ help
++ reduces GID to 16 bit, but leaves UID at 32 bit.
++
++config TAGGING_ID24
++ bool "UID24/GID24"
++ help
++ uses the upper 8 bits of UID and GID for XID tagging,
++ which leaves 24 bits for UID/GID each and should be
++ more than sufficient for normal use.
++
++config TAGGING_INTERN
++ bool "UID32/GID32"
++ help
++ This uses otherwise reserved inode fields in the on-disk
++ representation, which limits its use to a few
++ filesystems (currently ext2 and ext3).
++
++endchoice
++
++config TAG_NFSD
++ bool "Tag NFSD User Auth and Files"
++ default n
++ help
++ Enable this if you want the in-kernel NFS
++ server to use the tagging specified above
++ (this will require patched clients too).
++
++config VSERVER_PRIVACY
++ bool "Honor Privacy Aspects of Guests"
++ default n
++ help
++ When enabled, most context checks will disallow
++ access to structures assigned to a specific context,
++ like ptys or loop devices.
++
++config VSERVER_CONTEXTS
++ int "Maximum number of Contexts (1-65533)" if EMBEDDED
++ range 1 65533
++ default "768" if 64BIT
++ default "256"
++ help
++ This setting will optimize certain data structures
++ and memory allocations according to the expected
++ maximum.
++
++ note: this is not a strict upper limit.
++
++config VSERVER_WARN
++ bool "VServer Warnings"
++ default y
++ help
++ This enables various runtime warnings, which will
++ notify about potential manipulation attempts or
++ resource shortage. It is generally considered a
++ good idea to have this enabled.
++
++config VSERVER_WARN_DEVPTS
++ bool "VServer DevPTS Warnings"
++ depends on VSERVER_WARN
++ default y
++ help
++ This enables DevPTS-related warnings, issued when a
++ process inside a context tries to look up or access
++ a dynamic pts from the host or a different context.
++
++config VSERVER_DEBUG
++ bool "VServer Debugging Code"
++ default n
++ help
++ Set this to yes if you want to be able to activate
++ debugging output at runtime. It adds a very small
++ overhead to all vserver-related functions and
++ increases the kernel size by about 20k.
++
++config VSERVER_HISTORY
++ bool "VServer History Tracing"
++ depends on VSERVER_DEBUG
++ default n
++ help
++ Set this to yes if you want to record the history of
++ linux-vserver activities, so they can be replayed in
++ the event of a kernel panic or oops.
++
++config VSERVER_HISTORY_SIZE
++ int "Per-CPU History Size (32-65536)"
++ depends on VSERVER_HISTORY
++ range 32 65536
++ default 64
++ help
++ This allows you to specify the number of entries in
++ the per-CPU history buffer.
++
++choice
++ prompt "Quotes used in debug and warn messages"
++ default QUOTES_ISO8859
++
++config QUOTES_ISO8859
++ bool "Extended ASCII (ISO 8859) angle quotes"
++ help
++ This uses the extended ASCII characters \xbb
++ and \xab for quoting file and process names.
++
++config QUOTES_UTF8
++ bool "UTF-8 angle quotes"
++ help
++ This uses the UTF-8 sequences for angle
++ quotes to quote file and process names.
++
++config QUOTES_ASCII
++ bool "ASCII single quotes"
++ help
++ This uses the ASCII single quote character
++ (\x27) to quote file and process names.
++ ++endchoice ++ ++endmenu ++ ++ ++config VSERVER ++ bool ++ default y ++ select NAMESPACES ++ select UTS_NS ++ select IPC_NS ++# select USER_NS ++ select SYSVIPC ++ ++config VSERVER_SECURITY ++ bool ++ depends on SECURITY ++ default y ++ select SECURITY_CAPABILITIES ++ ++config VSERVER_DISABLED ++ bool ++ default n ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/Makefile linux-3.2.34-vs2.3.2.15/kernel/vserver/Makefile +--- linux-3.2.34/kernel/vserver/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/Makefile 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,18 @@ ++# ++# Makefile for the Linux vserver routines. ++# ++ ++ ++obj-y += vserver.o ++ ++vserver-y := switch.o context.o space.o sched.o network.o inode.o \ ++ limit.o cvirt.o cacct.o signal.o helper.o init.o \ ++ dlimit.o tag.o ++ ++vserver-$(CONFIG_INET) += inet.o ++vserver-$(CONFIG_PROC_FS) += proc.o ++vserver-$(CONFIG_VSERVER_DEBUG) += sysctl.o debug.o ++vserver-$(CONFIG_VSERVER_HISTORY) += history.o ++vserver-$(CONFIG_VSERVER_MONITOR) += monitor.o ++vserver-$(CONFIG_VSERVER_DEVICE) += device.o ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/cacct.c linux-3.2.34-vs2.3.2.15/kernel/vserver/cacct.c +--- linux-3.2.34/kernel/vserver/cacct.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/cacct.c 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,42 @@ ++/* ++ * linux/kernel/vserver/cacct.c ++ * ++ * Virtual Server: Context Accounting ++ * ++ * Copyright (C) 2006-2007 Herbert Pötzl ++ * ++ * V0.01 added accounting stats ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++ ++int vc_sock_stat(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_sock_stat_v0 vc_data; ++ int j, field; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ field = vc_data.field; ++ if ((field < 0) || (field >= VXA_SOCK_SIZE)) ++ return -EINVAL; ++ ++ for (j = 0; j < 3; j++) { ++ vc_data.count[j] = vx_sock_count(&vxi->cacct, field, j); ++ vc_data.total[j] = vx_sock_total(&vxi->cacct, field, j); ++ } ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/cacct_init.h linux-3.2.34-vs2.3.2.15/kernel/vserver/cacct_init.h +--- linux-3.2.34/kernel/vserver/cacct_init.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/cacct_init.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,25 @@ ++ ++ ++static inline void vx_info_init_cacct(struct _vx_cacct *cacct) ++{ ++ int i, j; ++ ++ ++ for (i = 0; i < VXA_SOCK_SIZE; i++) { ++ for (j = 0; j < 3; j++) { ++ atomic_long_set(&cacct->sock[i][j].count, 0); ++ atomic_long_set(&cacct->sock[i][j].total, 0); ++ } ++ } ++ for (i = 0; i < 8; i++) ++ atomic_set(&cacct->slab[i], 0); ++ for (i = 0; i < 5; i++) ++ for (j = 0; j < 4; j++) ++ atomic_set(&cacct->page[i][j], 0); ++} ++ ++static inline void vx_info_exit_cacct(struct _vx_cacct *cacct) ++{ ++ return; ++} ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/cacct_proc.h linux-3.2.34-vs2.3.2.15/kernel/vserver/cacct_proc.h +--- linux-3.2.34/kernel/vserver/cacct_proc.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/cacct_proc.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,53 @@ ++#ifndef _VX_CACCT_PROC_H ++#define _VX_CACCT_PROC_H ++ ++#include ++ ++ ++#define VX_SOCKA_TOP \ ++ "Type\t recv #/bytes\t\t send #/bytes\t\t fail #/bytes\n" ++ ++static inline int 
vx_info_proc_cacct(struct _vx_cacct *cacct, char *buffer) ++{ ++ int i, j, length = 0; ++ static char *type[VXA_SOCK_SIZE] = { ++ "UNSPEC", "UNIX", "INET", "INET6", "PACKET", "OTHER" ++ }; ++ ++ length += sprintf(buffer + length, VX_SOCKA_TOP); ++ for (i = 0; i < VXA_SOCK_SIZE; i++) { ++ length += sprintf(buffer + length, "%s:", type[i]); ++ for (j = 0; j < 3; j++) { ++ length += sprintf(buffer + length, ++ "\t%10lu/%-10lu", ++ vx_sock_count(cacct, i, j), ++ vx_sock_total(cacct, i, j)); ++ } ++ buffer[length++] = '\n'; ++ } ++ ++ length += sprintf(buffer + length, "\n"); ++ length += sprintf(buffer + length, ++ "slab:\t %8u %8u %8u %8u\n", ++ atomic_read(&cacct->slab[1]), ++ atomic_read(&cacct->slab[4]), ++ atomic_read(&cacct->slab[0]), ++ atomic_read(&cacct->slab[2])); ++ ++ length += sprintf(buffer + length, "\n"); ++ for (i = 0; i < 5; i++) { ++ length += sprintf(buffer + length, ++ "page[%d]: %8u %8u %8u %8u\t %8u %8u %8u %8u\n", i, ++ atomic_read(&cacct->page[i][0]), ++ atomic_read(&cacct->page[i][1]), ++ atomic_read(&cacct->page[i][2]), ++ atomic_read(&cacct->page[i][3]), ++ atomic_read(&cacct->page[i][4]), ++ atomic_read(&cacct->page[i][5]), ++ atomic_read(&cacct->page[i][6]), ++ atomic_read(&cacct->page[i][7])); ++ } ++ return length; ++} ++ ++#endif /* _VX_CACCT_PROC_H */ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/context.c linux-3.2.34-vs2.3.2.15/kernel/vserver/context.c +--- linux-3.2.34/kernel/vserver/context.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/context.c 2012-06-27 05:01:29.000000000 +0200 +@@ -0,0 +1,1119 @@ ++/* ++ * linux/kernel/vserver/context.c ++ * ++ * Virtual Server: Context Support ++ * ++ * Copyright (C) 2003-2011 Herbert Pötzl ++ * ++ * V0.01 context helper ++ * V0.02 vx_ctx_kill syscall command ++ * V0.03 replaced context_info calls ++ * V0.04 redesign of struct (de)alloc ++ * V0.05 rlimit basic implementation ++ * V0.06 task_xid and info commands ++ * V0.07 context flags and caps ++ * V0.08 switch to RCU based hash ++ * V0.09 revert to non RCU for now ++ * V0.10 and back to working RCU hash ++ * V0.11 and back to locking again ++ * V0.12 referenced context store ++ * V0.13 separate per cpu data ++ * V0.14 changed vcmds to vxi arg ++ * V0.15 added context stat ++ * V0.16 have __create claim() the vxi ++ * V0.17 removed older and legacy stuff ++ * V0.18 added user credentials ++ * V0.19 added warn mask ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#include "cvirt_init.h" ++#include "cacct_init.h" ++#include "limit_init.h" ++#include "sched_init.h" ++ ++ ++atomic_t vx_global_ctotal = ATOMIC_INIT(0); ++atomic_t vx_global_cactive = ATOMIC_INIT(0); ++ ++ ++/* now inactive context structures */ ++ ++static struct hlist_head vx_info_inactive = HLIST_HEAD_INIT; ++ ++static DEFINE_SPINLOCK(vx_info_inactive_lock); ++ ++ ++/* __alloc_vx_info() ++ ++ * allocate an initialized vx_info struct ++ * doesn't make it visible (hash) */ ++ ++static struct vx_info *__alloc_vx_info(xid_t xid) ++{ ++ struct vx_info *new = NULL; ++ int cpu, index; ++ ++ vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid); ++ ++ /* would this benefit from a slab cache? 
*/ ++ new = kmalloc(sizeof(struct vx_info), GFP_KERNEL); ++ if (!new) ++ return 0; ++ ++ memset(new, 0, sizeof(struct vx_info)); ++#ifdef CONFIG_SMP ++ new->ptr_pc = alloc_percpu(struct _vx_info_pc); ++ if (!new->ptr_pc) ++ goto error; ++#endif ++ new->vx_id = xid; ++ INIT_HLIST_NODE(&new->vx_hlist); ++ atomic_set(&new->vx_usecnt, 0); ++ atomic_set(&new->vx_tasks, 0); ++ new->vx_parent = NULL; ++ new->vx_state = 0; ++ init_waitqueue_head(&new->vx_wait); ++ ++ /* prepare reaper */ ++ get_task_struct(init_pid_ns.child_reaper); ++ new->vx_reaper = init_pid_ns.child_reaper; ++ new->vx_badness_bias = 0; ++ ++ /* rest of init goes here */ ++ vx_info_init_limit(&new->limit); ++ vx_info_init_sched(&new->sched); ++ vx_info_init_cvirt(&new->cvirt); ++ vx_info_init_cacct(&new->cacct); ++ ++ /* per cpu data structures */ ++ for_each_possible_cpu(cpu) { ++ vx_info_init_sched_pc( ++ &vx_per_cpu(new, sched_pc, cpu), cpu); ++ vx_info_init_cvirt_pc( ++ &vx_per_cpu(new, cvirt_pc, cpu), cpu); ++ } ++ ++ new->vx_flags = VXF_INIT_SET; ++ new->vx_bcaps = CAP_FULL_SET; // maybe ~CAP_SETPCAP ++ new->vx_ccaps = 0; ++ new->vx_umask = 0; ++ new->vx_wmask = 0; ++ ++ new->reboot_cmd = 0; ++ new->exit_code = 0; ++ ++ // preconfig spaces ++ for (index = 0; index < VX_SPACES; index++) { ++ struct _vx_space *space = &new->space[index]; ++ ++ // filesystem ++ spin_lock(&init_fs.lock); ++ init_fs.users++; ++ spin_unlock(&init_fs.lock); ++ space->vx_fs = &init_fs; ++ ++ /* FIXME: do we want defaults? */ ++ // space->vx_real_cred = 0; ++ // space->vx_cred = 0; ++ } ++ ++ ++ vxdprintk(VXD_CBIT(xid, 0), ++ "alloc_vx_info(%d) = %p", xid, new); ++ vxh_alloc_vx_info(new); ++ atomic_inc(&vx_global_ctotal); ++ return new; ++#ifdef CONFIG_SMP ++error: ++ kfree(new); ++ return 0; ++#endif ++} ++ ++/* __dealloc_vx_info() ++ ++ * final disposal of vx_info */ ++ ++static void __dealloc_vx_info(struct vx_info *vxi) ++{ ++#ifdef CONFIG_VSERVER_WARN ++ struct vx_info_save vxis; ++ int cpu; ++#endif ++ vxdprintk(VXD_CBIT(xid, 0), ++ "dealloc_vx_info(%p)", vxi); ++ vxh_dealloc_vx_info(vxi); ++ ++#ifdef CONFIG_VSERVER_WARN ++ enter_vx_info(vxi, &vxis); ++ vx_info_exit_limit(&vxi->limit); ++ vx_info_exit_sched(&vxi->sched); ++ vx_info_exit_cvirt(&vxi->cvirt); ++ vx_info_exit_cacct(&vxi->cacct); ++ ++ for_each_possible_cpu(cpu) { ++ vx_info_exit_sched_pc( ++ &vx_per_cpu(vxi, sched_pc, cpu), cpu); ++ vx_info_exit_cvirt_pc( ++ &vx_per_cpu(vxi, cvirt_pc, cpu), cpu); ++ } ++ leave_vx_info(&vxis); ++#endif ++ ++ vxi->vx_id = -1; ++ vxi->vx_state |= VXS_RELEASED; ++ ++#ifdef CONFIG_SMP ++ free_percpu(vxi->ptr_pc); ++#endif ++ kfree(vxi); ++ atomic_dec(&vx_global_ctotal); ++} ++ ++static void __shutdown_vx_info(struct vx_info *vxi) ++{ ++ struct nsproxy *nsproxy; ++ struct fs_struct *fs; ++ struct cred *cred; ++ int index, kill; ++ ++ might_sleep(); ++ ++ vxi->vx_state |= VXS_SHUTDOWN; ++ vs_state_change(vxi, VSC_SHUTDOWN); ++ ++ for (index = 0; index < VX_SPACES; index++) { ++ struct _vx_space *space = &vxi->space[index]; ++ ++ nsproxy = xchg(&space->vx_nsproxy, NULL); ++ if (nsproxy) ++ put_nsproxy(nsproxy); ++ ++ fs = xchg(&space->vx_fs, NULL); ++ spin_lock(&fs->lock); ++ kill = !--fs->users; ++ spin_unlock(&fs->lock); ++ if (kill) ++ free_fs_struct(fs); ++ ++ cred = (struct cred *)xchg(&space->vx_cred, NULL); ++ if (cred) ++ abort_creds(cred); ++ } ++} ++ ++/* exported stuff */ ++ ++void free_vx_info(struct vx_info *vxi) ++{ ++ unsigned long flags; ++ unsigned index; ++ ++ /* check for reference counts first */ ++ 
BUG_ON(atomic_read(&vxi->vx_usecnt)); ++ BUG_ON(atomic_read(&vxi->vx_tasks)); ++ ++ /* context must not be hashed */ ++ BUG_ON(vx_info_state(vxi, VXS_HASHED)); ++ ++ /* context shutdown is mandatory */ ++ BUG_ON(!vx_info_state(vxi, VXS_SHUTDOWN)); ++ ++ /* spaces check */ ++ for (index = 0; index < VX_SPACES; index++) { ++ struct _vx_space *space = &vxi->space[index]; ++ ++ BUG_ON(space->vx_nsproxy); ++ BUG_ON(space->vx_fs); ++ // BUG_ON(space->vx_real_cred); ++ // BUG_ON(space->vx_cred); ++ } ++ ++ spin_lock_irqsave(&vx_info_inactive_lock, flags); ++ hlist_del(&vxi->vx_hlist); ++ spin_unlock_irqrestore(&vx_info_inactive_lock, flags); ++ ++ __dealloc_vx_info(vxi); ++} ++ ++ ++/* hash table for vx_info hash */ ++ ++#define VX_HASH_SIZE 13 ++ ++static struct hlist_head vx_info_hash[VX_HASH_SIZE] = ++ { [0 ... VX_HASH_SIZE-1] = HLIST_HEAD_INIT }; ++ ++static DEFINE_SPINLOCK(vx_info_hash_lock); ++ ++ ++static inline unsigned int __hashval(xid_t xid) ++{ ++ return (xid % VX_HASH_SIZE); ++} ++ ++ ++ ++/* __hash_vx_info() ++ ++ * add the vxi to the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline void __hash_vx_info(struct vx_info *vxi) ++{ ++ struct hlist_head *head; ++ ++ vxd_assert_lock(&vx_info_hash_lock); ++ vxdprintk(VXD_CBIT(xid, 4), ++ "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id); ++ vxh_hash_vx_info(vxi); ++ ++ /* context must not be hashed */ ++ BUG_ON(vx_info_state(vxi, VXS_HASHED)); ++ ++ vxi->vx_state |= VXS_HASHED; ++ head = &vx_info_hash[__hashval(vxi->vx_id)]; ++ hlist_add_head(&vxi->vx_hlist, head); ++ atomic_inc(&vx_global_cactive); ++} ++ ++/* __unhash_vx_info() ++ ++ * remove the vxi from the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline void __unhash_vx_info(struct vx_info *vxi) ++{ ++ unsigned long flags; ++ ++ vxd_assert_lock(&vx_info_hash_lock); ++ vxdprintk(VXD_CBIT(xid, 4), ++ "__unhash_vx_info: %p[#%d.%d.%d]", vxi, vxi->vx_id, ++ atomic_read(&vxi->vx_usecnt), atomic_read(&vxi->vx_tasks)); ++ vxh_unhash_vx_info(vxi); ++ ++ /* context must be hashed */ ++ BUG_ON(!vx_info_state(vxi, VXS_HASHED)); ++ /* but without tasks */ ++ BUG_ON(atomic_read(&vxi->vx_tasks)); ++ ++ vxi->vx_state &= ~VXS_HASHED; ++ hlist_del_init(&vxi->vx_hlist); ++ spin_lock_irqsave(&vx_info_inactive_lock, flags); ++ hlist_add_head(&vxi->vx_hlist, &vx_info_inactive); ++ spin_unlock_irqrestore(&vx_info_inactive_lock, flags); ++ atomic_dec(&vx_global_cactive); ++} ++ ++ ++/* __lookup_vx_info() ++ ++ * requires the hash_lock to be held ++ * doesn't increment the vx_refcnt */ ++ ++static inline struct vx_info *__lookup_vx_info(xid_t xid) ++{ ++ struct hlist_head *head = &vx_info_hash[__hashval(xid)]; ++ struct hlist_node *pos; ++ struct vx_info *vxi; ++ ++ vxd_assert_lock(&vx_info_hash_lock); ++ hlist_for_each(pos, head) { ++ vxi = hlist_entry(pos, struct vx_info, vx_hlist); ++ ++ if (vxi->vx_id == xid) ++ goto found; ++ } ++ vxi = NULL; ++found: ++ vxdprintk(VXD_CBIT(xid, 0), ++ "__lookup_vx_info(#%u): %p[#%u]", ++ xid, vxi, vxi ? 
vxi->vx_id : 0); ++ vxh_lookup_vx_info(vxi, xid); ++ return vxi; ++} ++ ++ ++/* __create_vx_info() ++ ++ * create the requested context ++ * get(), claim() and hash it */ ++ ++static struct vx_info *__create_vx_info(int id) ++{ ++ struct vx_info *new, *vxi = NULL; ++ ++ vxdprintk(VXD_CBIT(xid, 1), "create_vx_info(%d)*", id); ++ ++ if (!(new = __alloc_vx_info(id))) ++ return ERR_PTR(-ENOMEM); ++ ++ /* required to make dynamic xids unique */ ++ spin_lock(&vx_info_hash_lock); ++ ++ /* static context requested */ ++ if ((vxi = __lookup_vx_info(id))) { ++ vxdprintk(VXD_CBIT(xid, 0), ++ "create_vx_info(%d) = %p (already there)", id, vxi); ++ if (vx_info_flags(vxi, VXF_STATE_SETUP, 0)) ++ vxi = ERR_PTR(-EBUSY); ++ else ++ vxi = ERR_PTR(-EEXIST); ++ goto out_unlock; ++ } ++ /* new context */ ++ vxdprintk(VXD_CBIT(xid, 0), ++ "create_vx_info(%d) = %p (new)", id, new); ++ claim_vx_info(new, NULL); ++ __hash_vx_info(get_vx_info(new)); ++ vxi = new, new = NULL; ++ ++out_unlock: ++ spin_unlock(&vx_info_hash_lock); ++ vxh_create_vx_info(IS_ERR(vxi) ? NULL : vxi, id); ++ if (new) ++ __dealloc_vx_info(new); ++ return vxi; ++} ++ ++ ++/* exported stuff */ ++ ++ ++void unhash_vx_info(struct vx_info *vxi) ++{ ++ spin_lock(&vx_info_hash_lock); ++ __unhash_vx_info(vxi); ++ spin_unlock(&vx_info_hash_lock); ++ __shutdown_vx_info(vxi); ++ __wakeup_vx_info(vxi); ++} ++ ++ ++/* lookup_vx_info() ++ ++ * search for a vx_info and get() it ++ * negative id means current */ ++ ++struct vx_info *lookup_vx_info(int id) ++{ ++ struct vx_info *vxi = NULL; ++ ++ if (id < 0) { ++ vxi = get_vx_info(current_vx_info()); ++ } else if (id > 1) { ++ spin_lock(&vx_info_hash_lock); ++ vxi = get_vx_info(__lookup_vx_info(id)); ++ spin_unlock(&vx_info_hash_lock); ++ } ++ return vxi; ++} ++ ++/* xid_is_hashed() ++ ++ * verify that xid is still hashed */ ++ ++int xid_is_hashed(xid_t xid) ++{ ++ int hashed; ++ ++ spin_lock(&vx_info_hash_lock); ++ hashed = (__lookup_vx_info(xid) != NULL); ++ spin_unlock(&vx_info_hash_lock); ++ return hashed; ++} ++ ++#ifdef CONFIG_PROC_FS ++ ++/* get_xid_list() ++ ++ * get a subset of hashed xids for proc ++ * assumes size is at least one */ ++ ++int get_xid_list(int index, unsigned int *xids, int size) ++{ ++ int hindex, nr_xids = 0; ++ ++ /* only show current and children */ ++ if (!vx_check(0, VS_ADMIN | VS_WATCH)) { ++ if (index > 0) ++ return 0; ++ xids[nr_xids] = vx_current_xid(); ++ return 1; ++ } ++ ++ for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) { ++ struct hlist_head *head = &vx_info_hash[hindex]; ++ struct hlist_node *pos; ++ ++ spin_lock(&vx_info_hash_lock); ++ hlist_for_each(pos, head) { ++ struct vx_info *vxi; ++ ++ if (--index > 0) ++ continue; ++ ++ vxi = hlist_entry(pos, struct vx_info, vx_hlist); ++ xids[nr_xids] = vxi->vx_id; ++ if (++nr_xids >= size) { ++ spin_unlock(&vx_info_hash_lock); ++ goto out; ++ } ++ } ++ /* keep the lock time short */ ++ spin_unlock(&vx_info_hash_lock); ++ } ++out: ++ return nr_xids; ++} ++#endif ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ ++void dump_vx_info_inactive(int level) ++{ ++ struct hlist_node *entry, *next; ++ ++ hlist_for_each_safe(entry, next, &vx_info_inactive) { ++ struct vx_info *vxi = ++ list_entry(entry, struct vx_info, vx_hlist); ++ ++ dump_vx_info(vxi, level); ++ } ++} ++ ++#endif ++ ++#if 0 ++int vx_migrate_user(struct task_struct *p, struct vx_info *vxi) ++{ ++ struct user_struct *new_user, *old_user; ++ ++ if (!p || !vxi) ++ BUG(); ++ ++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0)) ++ return -EACCES; ++ ++ new_user = alloc_uid(vxi->vx_id, 
p->uid); ++ if (!new_user) ++ return -ENOMEM; ++ ++ old_user = p->user; ++ if (new_user != old_user) { ++ atomic_inc(&new_user->processes); ++ atomic_dec(&old_user->processes); ++ p->user = new_user; ++ } ++ free_uid(old_user); ++ return 0; ++} ++#endif ++ ++#if 0 ++void vx_mask_cap_bset(struct vx_info *vxi, struct task_struct *p) ++{ ++ // p->cap_effective &= vxi->vx_cap_bset; ++ p->cap_effective = ++ cap_intersect(p->cap_effective, vxi->cap_bset); ++ // p->cap_inheritable &= vxi->vx_cap_bset; ++ p->cap_inheritable = ++ cap_intersect(p->cap_inheritable, vxi->cap_bset); ++ // p->cap_permitted &= vxi->vx_cap_bset; ++ p->cap_permitted = ++ cap_intersect(p->cap_permitted, vxi->cap_bset); ++} ++#endif ++ ++ ++#include ++#include ++ ++static int vx_openfd_task(struct task_struct *tsk) ++{ ++ struct files_struct *files = tsk->files; ++ struct fdtable *fdt; ++ const unsigned long *bptr; ++ int count, total; ++ ++ /* no rcu_read_lock() because of spin_lock() */ ++ spin_lock(&files->file_lock); ++ fdt = files_fdtable(files); ++ bptr = fdt->open_fds->fds_bits; ++ count = fdt->max_fds / (sizeof(unsigned long) * 8); ++ for (total = 0; count > 0; count--) { ++ if (*bptr) ++ total += hweight_long(*bptr); ++ bptr++; ++ } ++ spin_unlock(&files->file_lock); ++ return total; ++} ++ ++ ++/* for *space compatibility */ ++ ++asmlinkage long sys_unshare(unsigned long); ++ ++/* ++ * migrate task to new context ++ * gets vxi, puts old_vxi on change ++ * optionally unshares namespaces (hack) ++ */ ++ ++int vx_migrate_task(struct task_struct *p, struct vx_info *vxi, int unshare) ++{ ++ struct vx_info *old_vxi; ++ int ret = 0; ++ ++ if (!p || !vxi) ++ BUG(); ++ ++ vxdprintk(VXD_CBIT(xid, 5), ++ "vx_migrate_task(%p,%p[#%d.%d])", p, vxi, ++ vxi->vx_id, atomic_read(&vxi->vx_usecnt)); ++ ++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0) && ++ !vx_info_flags(vxi, VXF_STATE_SETUP, 0)) ++ return -EACCES; ++ ++ if (vx_info_state(vxi, VXS_SHUTDOWN)) ++ return -EFAULT; ++ ++ old_vxi = task_get_vx_info(p); ++ if (old_vxi == vxi) ++ goto out; ++ ++// if (!(ret = vx_migrate_user(p, vxi))) { ++ { ++ int openfd; ++ ++ task_lock(p); ++ openfd = vx_openfd_task(p); ++ ++ if (old_vxi) { ++ atomic_dec(&old_vxi->cvirt.nr_threads); ++ atomic_dec(&old_vxi->cvirt.nr_running); ++ __rlim_dec(&old_vxi->limit, RLIMIT_NPROC); ++ /* FIXME: what about the struct files here? */ ++ __rlim_sub(&old_vxi->limit, VLIMIT_OPENFD, openfd); ++ /* account for the executable */ ++ __rlim_dec(&old_vxi->limit, VLIMIT_DENTRY); ++ } ++ atomic_inc(&vxi->cvirt.nr_threads); ++ atomic_inc(&vxi->cvirt.nr_running); ++ __rlim_inc(&vxi->limit, RLIMIT_NPROC); ++ /* FIXME: what about the struct files here? 
*/ ++ __rlim_add(&vxi->limit, VLIMIT_OPENFD, openfd); ++ /* account for the executable */ ++ __rlim_inc(&vxi->limit, VLIMIT_DENTRY); ++ ++ if (old_vxi) { ++ release_vx_info(old_vxi, p); ++ clr_vx_info(&p->vx_info); ++ } ++ claim_vx_info(vxi, p); ++ set_vx_info(&p->vx_info, vxi); ++ p->xid = vxi->vx_id; ++ ++ vxdprintk(VXD_CBIT(xid, 5), ++ "moved task %p into vxi:%p[#%d]", ++ p, vxi, vxi->vx_id); ++ ++ // vx_mask_cap_bset(vxi, p); ++ task_unlock(p); ++ ++ /* hack for *spaces to provide compatibility */ ++ if (unshare) { ++ struct nsproxy *old_nsp, *new_nsp; ++ ++ ret = unshare_nsproxy_namespaces( ++ CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER, ++ &new_nsp, NULL); ++ if (ret) ++ goto out; ++ ++ old_nsp = xchg(&p->nsproxy, new_nsp); ++ vx_set_space(vxi, ++ CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER, 0); ++ put_nsproxy(old_nsp); ++ } ++ } ++out: ++ put_vx_info(old_vxi); ++ return ret; ++} ++ ++int vx_set_reaper(struct vx_info *vxi, struct task_struct *p) ++{ ++ struct task_struct *old_reaper; ++ struct vx_info *reaper_vxi; ++ ++ if (!vxi) ++ return -EINVAL; ++ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_set_reaper(%p[#%d],%p[#%d,%d])", ++ vxi, vxi->vx_id, p, p->xid, p->pid); ++ ++ old_reaper = vxi->vx_reaper; ++ if (old_reaper == p) ++ return 0; ++ ++ reaper_vxi = task_get_vx_info(p); ++ if (reaper_vxi && reaper_vxi != vxi) { ++ vxwprintk(1, ++ "Unsuitable reaper [" VS_Q("%s") ",%u:#%u] " ++ "for [xid #%u]", ++ p->comm, p->pid, p->xid, vx_current_xid()); ++ goto out; ++ } ++ ++ /* set new child reaper */ ++ get_task_struct(p); ++ vxi->vx_reaper = p; ++ put_task_struct(old_reaper); ++out: ++ put_vx_info(reaper_vxi); ++ return 0; ++} ++ ++int vx_set_init(struct vx_info *vxi, struct task_struct *p) ++{ ++ if (!vxi) ++ return -EINVAL; ++ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_set_init(%p[#%d],%p[#%d,%d,%d])", ++ vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid); ++ ++ vxi->vx_flags &= ~VXF_STATE_INIT; ++ // vxi->vx_initpid = p->tgid; ++ vxi->vx_initpid = p->pid; ++ return 0; ++} ++ ++void vx_exit_init(struct vx_info *vxi, struct task_struct *p, int code) ++{ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_exit_init(%p[#%d],%p[#%d,%d,%d])", ++ vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid); ++ ++ vxi->exit_code = code; ++ vxi->vx_initpid = 0; ++} ++ ++ ++void vx_set_persistent(struct vx_info *vxi) ++{ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_set_persistent(%p[#%d])", vxi, vxi->vx_id); ++ ++ get_vx_info(vxi); ++ claim_vx_info(vxi, NULL); ++} ++ ++void vx_clear_persistent(struct vx_info *vxi) ++{ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_clear_persistent(%p[#%d])", vxi, vxi->vx_id); ++ ++ release_vx_info(vxi, NULL); ++ put_vx_info(vxi); ++} ++ ++void vx_update_persistent(struct vx_info *vxi) ++{ ++ if (vx_info_flags(vxi, VXF_PERSISTENT, 0)) ++ vx_set_persistent(vxi); ++ else ++ vx_clear_persistent(vxi); ++} ++ ++ ++/* task must be current or locked */ ++ ++void exit_vx_info(struct task_struct *p, int code) ++{ ++ struct vx_info *vxi = p->vx_info; ++ ++ if (vxi) { ++ atomic_dec(&vxi->cvirt.nr_threads); ++ vx_nproc_dec(p); ++ ++ vxi->exit_code = code; ++ release_vx_info(vxi, p); ++ } ++} ++ ++void exit_vx_info_early(struct task_struct *p, int code) ++{ ++ struct vx_info *vxi = p->vx_info; ++ ++ if (vxi) { ++ if (vxi->vx_initpid == p->pid) ++ vx_exit_init(vxi, p, code); ++ if (vxi->vx_reaper == p) ++ vx_set_reaper(vxi, init_pid_ns.child_reaper); ++ } ++} ++ ++ ++/* vserver syscall commands below here */ ++ ++/* taks xid and vx_info functions */ ++ ++#include ++ ++ ++int vc_task_xid(uint32_t id) ++{ ++ xid_t xid; ++ ++ if (id) { ++ 
struct task_struct *tsk; ++ ++ rcu_read_lock(); ++ tsk = find_task_by_real_pid(id); ++ xid = (tsk) ? tsk->xid : -ESRCH; ++ rcu_read_unlock(); ++ } else ++ xid = vx_current_xid(); ++ return xid; ++} ++ ++ ++int vc_vx_info(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_vx_info_v0 vc_data; ++ ++ vc_data.xid = vxi->vx_id; ++ vc_data.initpid = vxi->vx_initpid; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++int vc_ctx_stat(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_stat_v0 vc_data; ++ ++ vc_data.usecnt = atomic_read(&vxi->vx_usecnt); ++ vc_data.tasks = atomic_read(&vxi->vx_tasks); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++/* context functions */ ++ ++int vc_ctx_create(uint32_t xid, void __user *data) ++{ ++ struct vcmd_ctx_create vc_data = { .flagword = VXF_INIT_SET }; ++ struct vx_info *new_vxi; ++ int ret; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ if ((xid > MAX_S_CONTEXT) || (xid < 2)) ++ return -EINVAL; ++ ++ new_vxi = __create_vx_info(xid); ++ if (IS_ERR(new_vxi)) ++ return PTR_ERR(new_vxi); ++ ++ /* initial flags */ ++ new_vxi->vx_flags = vc_data.flagword; ++ ++ ret = -ENOEXEC; ++ if (vs_state_change(new_vxi, VSC_STARTUP)) ++ goto out; ++ ++ ret = vx_migrate_task(current, new_vxi, (!data)); ++ if (ret) ++ goto out; ++ ++ /* return context id on success */ ++ ret = new_vxi->vx_id; ++ ++ /* get a reference for persistent contexts */ ++ if ((vc_data.flagword & VXF_PERSISTENT)) ++ vx_set_persistent(new_vxi); ++out: ++ release_vx_info(new_vxi, NULL); ++ put_vx_info(new_vxi); ++ return ret; ++} ++ ++ ++int vc_ctx_migrate(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_migrate vc_data = { .flagword = 0 }; ++ int ret; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = vx_migrate_task(current, vxi, 0); ++ if (ret) ++ return ret; ++ if (vc_data.flagword & VXM_SET_INIT) ++ ret = vx_set_init(vxi, current); ++ if (ret) ++ return ret; ++ if (vc_data.flagword & VXM_SET_REAPER) ++ ret = vx_set_reaper(vxi, current); ++ return ret; ++} ++ ++ ++int vc_get_cflags(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_flags_v0 vc_data; ++ ++ vc_data.flagword = vxi->vx_flags; ++ ++ /* special STATE flag handling */ ++ vc_data.mask = vs_mask_flags(~0ULL, vxi->vx_flags, VXF_ONE_TIME); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_cflags(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_flags_v0 vc_data; ++ uint64_t mask, trigger; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ /* special STATE flag handling */ ++ mask = vs_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME); ++ trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword); ++ ++ if (vxi == current_vx_info()) { ++ /* if (trigger & VXF_STATE_SETUP) ++ vx_mask_cap_bset(vxi, current); */ ++ if (trigger & VXF_STATE_INIT) { ++ int ret; ++ ++ ret = vx_set_init(vxi, current); ++ if (ret) ++ return ret; ++ ret = vx_set_reaper(vxi, current); ++ if (ret) ++ return ret; ++ } ++ } ++ ++ vxi->vx_flags = vs_mask_flags(vxi->vx_flags, ++ vc_data.flagword, mask); ++ if (trigger & VXF_PERSISTENT) ++ vx_update_persistent(vxi); ++ ++ return 0; ++} ++ ++ ++static inline uint64_t caps_from_cap_t(kernel_cap_t c) ++{ ++ uint64_t v = c.cap[0] | ((uint64_t)c.cap[1] << 32); ++ ++ // 
printk("caps_from_cap_t(%08x:%08x) = %016llx\n", c.cap[1], c.cap[0], v); ++ return v; ++} ++ ++static inline kernel_cap_t cap_t_from_caps(uint64_t v) ++{ ++ kernel_cap_t c = __cap_empty_set; ++ ++ c.cap[0] = v & 0xFFFFFFFF; ++ c.cap[1] = (v >> 32) & 0xFFFFFFFF; ++ ++ // printk("cap_t_from_caps(%016llx) = %08x:%08x\n", v, c.cap[1], c.cap[0]); ++ return c; ++} ++ ++ ++static int do_get_caps(struct vx_info *vxi, uint64_t *bcaps, uint64_t *ccaps) ++{ ++ if (bcaps) ++ *bcaps = caps_from_cap_t(vxi->vx_bcaps); ++ if (ccaps) ++ *ccaps = vxi->vx_ccaps; ++ ++ return 0; ++} ++ ++int vc_get_ccaps(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_caps_v1 vc_data; ++ int ret; ++ ++ ret = do_get_caps(vxi, NULL, &vc_data.ccaps); ++ if (ret) ++ return ret; ++ vc_data.cmask = ~0ULL; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int do_set_caps(struct vx_info *vxi, ++ uint64_t bcaps, uint64_t bmask, uint64_t ccaps, uint64_t cmask) ++{ ++ uint64_t bcold = caps_from_cap_t(vxi->vx_bcaps); ++ ++#if 0 ++ printk("do_set_caps(%16llx, %16llx, %16llx, %16llx)\n", ++ bcaps, bmask, ccaps, cmask); ++#endif ++ vxi->vx_bcaps = cap_t_from_caps( ++ vs_mask_flags(bcold, bcaps, bmask)); ++ vxi->vx_ccaps = vs_mask_flags(vxi->vx_ccaps, ccaps, cmask); ++ ++ return 0; ++} ++ ++int vc_set_ccaps(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_caps_v1 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_caps(vxi, 0, 0, vc_data.ccaps, vc_data.cmask); ++} ++ ++int vc_get_bcaps(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_bcaps vc_data; ++ int ret; ++ ++ ret = do_get_caps(vxi, &vc_data.bcaps, NULL); ++ if (ret) ++ return ret; ++ vc_data.bmask = ~0ULL; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_bcaps(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_bcaps vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_caps(vxi, vc_data.bcaps, vc_data.bmask, 0, 0); ++} ++ ++ ++int vc_get_umask(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_umask vc_data; ++ ++ vc_data.umask = vxi->vx_umask; ++ vc_data.mask = ~0ULL; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_umask(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_umask vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ vxi->vx_umask = vs_mask_flags(vxi->vx_umask, ++ vc_data.umask, vc_data.mask); ++ return 0; ++} ++ ++ ++int vc_get_wmask(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_wmask vc_data; ++ ++ vc_data.wmask = vxi->vx_wmask; ++ vc_data.mask = ~0ULL; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_wmask(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_wmask vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ vxi->vx_wmask = vs_mask_flags(vxi->vx_wmask, ++ vc_data.wmask, vc_data.mask); ++ return 0; ++} ++ ++ ++int vc_get_badness(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_badness_v0 vc_data; ++ ++ vc_data.bias = vxi->vx_badness_bias; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_badness(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_badness_v0 vc_data; ++ ++ if 
(copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ vxi->vx_badness_bias = vc_data.bias; ++ return 0; ++} ++ ++#include ++ ++EXPORT_SYMBOL_GPL(free_vx_info); ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/cvirt.c linux-3.2.34-vs2.3.2.15/kernel/vserver/cvirt.c +--- linux-3.2.34/kernel/vserver/cvirt.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/cvirt.c 2012-04-24 00:43:39.000000000 +0200 +@@ -0,0 +1,313 @@ ++/* ++ * linux/kernel/vserver/cvirt.c ++ * ++ * Virtual Server: Context Virtualization ++ * ++ * Copyright (C) 2004-2007 Herbert Pötzl ++ * ++ * V0.01 broken out from limit.c ++ * V0.02 added utsname stuff ++ * V0.03 changed vcmds to vxi arg ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++ ++void vx_vsi_boottime(struct timespec *boottime) ++{ ++ struct vx_info *vxi = current_vx_info(); ++ ++ set_normalized_timespec(boottime, ++ boottime->tv_sec + vxi->cvirt.bias_uptime.tv_sec, ++ boottime->tv_nsec + vxi->cvirt.bias_uptime.tv_nsec); ++ return; ++} ++ ++void vx_vsi_uptime(struct timespec *uptime, struct timespec *idle) ++{ ++ struct vx_info *vxi = current_vx_info(); ++ ++ set_normalized_timespec(uptime, ++ uptime->tv_sec - vxi->cvirt.bias_uptime.tv_sec, ++ uptime->tv_nsec - vxi->cvirt.bias_uptime.tv_nsec); ++ if (!idle) ++ return; ++ set_normalized_timespec(idle, ++ idle->tv_sec - vxi->cvirt.bias_idle.tv_sec, ++ idle->tv_nsec - vxi->cvirt.bias_idle.tv_nsec); ++ return; ++} ++ ++uint64_t vx_idle_jiffies(void) ++{ ++ return init_task.utime + init_task.stime; ++} ++ ++ ++ ++static inline uint32_t __update_loadavg(uint32_t load, ++ int wsize, int delta, int n) ++{ ++ unsigned long long calc, prev; ++ ++ /* just set it to n */ ++ if (unlikely(delta >= wsize)) ++ return (n << FSHIFT); ++ ++ calc = delta * n; ++ calc <<= FSHIFT; ++ prev = (wsize - delta); ++ prev *= load; ++ calc += prev; ++ do_div(calc, wsize); ++ return calc; ++} ++ ++ ++void vx_update_load(struct vx_info *vxi) ++{ ++ uint32_t now, last, delta; ++ unsigned int nr_running, nr_uninterruptible; ++ unsigned int total; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&vxi->cvirt.load_lock, flags); ++ ++ now = jiffies; ++ last = vxi->cvirt.load_last; ++ delta = now - last; ++ ++ if (delta < 5*HZ) ++ goto out; ++ ++ nr_running = atomic_read(&vxi->cvirt.nr_running); ++ nr_uninterruptible = atomic_read(&vxi->cvirt.nr_uninterruptible); ++ total = nr_running + nr_uninterruptible; ++ ++ vxi->cvirt.load[0] = __update_loadavg(vxi->cvirt.load[0], ++ 60*HZ, delta, total); ++ vxi->cvirt.load[1] = __update_loadavg(vxi->cvirt.load[1], ++ 5*60*HZ, delta, total); ++ vxi->cvirt.load[2] = __update_loadavg(vxi->cvirt.load[2], ++ 15*60*HZ, delta, total); ++ ++ vxi->cvirt.load_last = now; ++out: ++ atomic_inc(&vxi->cvirt.load_updates); ++ spin_unlock_irqrestore(&vxi->cvirt.load_lock, flags); ++} ++ ++ ++/* ++ * Commands to do_syslog: ++ * ++ * 0 -- Close the log. Currently a NOP. ++ * 1 -- Open the log. Currently a NOP. ++ * 2 -- Read from the log. ++ * 3 -- Read all messages remaining in the ring buffer. ++ * 4 -- Read and clear all messages remaining in the ring buffer ++ * 5 -- Clear ring buffer. 
++ * 6 -- Disable printk's to console ++ * 7 -- Enable printk's to console ++ * 8 -- Set level of messages printed to console ++ * 9 -- Return number of unread characters in the log buffer ++ * 10 -- Return size of the log buffer ++ */ ++int vx_do_syslog(int type, char __user *buf, int len) ++{ ++ int error = 0; ++ int do_clear = 0; ++ struct vx_info *vxi = current_vx_info(); ++ struct _vx_syslog *log; ++ ++ if (!vxi) ++ return -EINVAL; ++ log = &vxi->cvirt.syslog; ++ ++ switch (type) { ++ case 0: /* Close log */ ++ case 1: /* Open log */ ++ break; ++ case 2: /* Read from log */ ++ error = wait_event_interruptible(log->log_wait, ++ (log->log_start - log->log_end)); ++ if (error) ++ break; ++ spin_lock_irq(&log->logbuf_lock); ++ spin_unlock_irq(&log->logbuf_lock); ++ break; ++ case 4: /* Read/clear last kernel messages */ ++ do_clear = 1; ++ /* fall through */ ++ case 3: /* Read last kernel messages */ ++ return 0; ++ ++ case 5: /* Clear ring buffer */ ++ return 0; ++ ++ case 6: /* Disable logging to console */ ++ case 7: /* Enable logging to console */ ++ case 8: /* Set level of messages printed to console */ ++ break; ++ ++ case 9: /* Number of chars in the log buffer */ ++ return 0; ++ case 10: /* Size of the log buffer */ ++ return 0; ++ default: ++ error = -EINVAL; ++ break; ++ } ++ return error; ++} ++ ++ ++/* virtual host info names */ ++ ++static char *vx_vhi_name(struct vx_info *vxi, int id) ++{ ++ struct nsproxy *nsproxy; ++ struct uts_namespace *uts; ++ ++ if (id == VHIN_CONTEXT) ++ return vxi->vx_name; ++ ++ nsproxy = vxi->space[0].vx_nsproxy; ++ if (!nsproxy) ++ return NULL; ++ ++ uts = nsproxy->uts_ns; ++ if (!uts) ++ return NULL; ++ ++ switch (id) { ++ case VHIN_SYSNAME: ++ return uts->name.sysname; ++ case VHIN_NODENAME: ++ return uts->name.nodename; ++ case VHIN_RELEASE: ++ return uts->name.release; ++ case VHIN_VERSION: ++ return uts->name.version; ++ case VHIN_MACHINE: ++ return uts->name.machine; ++ case VHIN_DOMAINNAME: ++ return uts->name.domainname; ++ default: ++ return NULL; ++ } ++ return NULL; ++} ++ ++int vc_set_vhi_name(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_vhi_name_v0 vc_data; ++ char *name; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ name = vx_vhi_name(vxi, vc_data.field); ++ if (!name) ++ return -EINVAL; ++ ++ memcpy(name, vc_data.name, 65); ++ return 0; ++} ++ ++int vc_get_vhi_name(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_vhi_name_v0 vc_data; ++ char *name; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ name = vx_vhi_name(vxi, vc_data.field); ++ if (!name) ++ return -EINVAL; ++ ++ memcpy(vc_data.name, name, 65); ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++int vc_virt_stat(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_virt_stat_v0 vc_data; ++ struct _vx_cvirt *cvirt = &vxi->cvirt; ++ struct timespec uptime; ++ ++ do_posix_clock_monotonic_gettime(&uptime); ++ set_normalized_timespec(&uptime, ++ uptime.tv_sec - cvirt->bias_uptime.tv_sec, ++ uptime.tv_nsec - cvirt->bias_uptime.tv_nsec); ++ ++ vc_data.offset = timespec_to_ns(&cvirt->bias_ts); ++ vc_data.uptime = timespec_to_ns(&uptime); ++ vc_data.nr_threads = atomic_read(&cvirt->nr_threads); ++ vc_data.nr_running = atomic_read(&cvirt->nr_running); ++ vc_data.nr_uninterruptible = atomic_read(&cvirt->nr_uninterruptible); ++ vc_data.nr_onhold = atomic_read(&cvirt->nr_onhold); ++ vc_data.nr_forks = atomic_read(&cvirt->total_forks); ++ 
vc_data.load[0] = cvirt->load[0]; ++ vc_data.load[1] = cvirt->load[1]; ++ vc_data.load[2] = cvirt->load[2]; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++#ifdef CONFIG_VSERVER_VTIME ++ ++/* virtualized time base */ ++ ++void vx_adjust_timespec(struct timespec *ts) ++{ ++ struct vx_info *vxi; ++ ++ if (!vx_flags(VXF_VIRT_TIME, 0)) ++ return; ++ ++ vxi = current_vx_info(); ++ ts->tv_sec += vxi->cvirt.bias_ts.tv_sec; ++ ts->tv_nsec += vxi->cvirt.bias_ts.tv_nsec; ++ ++ if (ts->tv_nsec >= NSEC_PER_SEC) { ++ ts->tv_sec++; ++ ts->tv_nsec -= NSEC_PER_SEC; ++ } else if (ts->tv_nsec < 0) { ++ ts->tv_sec--; ++ ts->tv_nsec += NSEC_PER_SEC; ++ } ++} ++ ++int vx_settimeofday(const struct timespec *ts) ++{ ++ struct timespec ats, delta; ++ struct vx_info *vxi; ++ ++ if (!vx_flags(VXF_VIRT_TIME, 0)) ++ return do_settimeofday(ts); ++ ++ getnstimeofday(&ats); ++ delta = timespec_sub(*ts, ats); ++ ++ vxi = current_vx_info(); ++ vxi->cvirt.bias_ts = timespec_add(vxi->cvirt.bias_ts, delta); ++ return 0; ++} ++ ++#endif ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/cvirt_init.h linux-3.2.34-vs2.3.2.15/kernel/vserver/cvirt_init.h +--- linux-3.2.34/kernel/vserver/cvirt_init.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/cvirt_init.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,70 @@ ++ ++ ++extern uint64_t vx_idle_jiffies(void); ++ ++static inline void vx_info_init_cvirt(struct _vx_cvirt *cvirt) ++{ ++ uint64_t idle_jiffies = vx_idle_jiffies(); ++ uint64_t nsuptime; ++ ++ do_posix_clock_monotonic_gettime(&cvirt->bias_uptime); ++ nsuptime = (unsigned long long)cvirt->bias_uptime.tv_sec ++ * NSEC_PER_SEC + cvirt->bias_uptime.tv_nsec; ++ cvirt->bias_clock = nsec_to_clock_t(nsuptime); ++ cvirt->bias_ts.tv_sec = 0; ++ cvirt->bias_ts.tv_nsec = 0; ++ ++ jiffies_to_timespec(idle_jiffies, &cvirt->bias_idle); ++ atomic_set(&cvirt->nr_threads, 0); ++ atomic_set(&cvirt->nr_running, 0); ++ atomic_set(&cvirt->nr_uninterruptible, 0); ++ atomic_set(&cvirt->nr_onhold, 0); ++ ++ spin_lock_init(&cvirt->load_lock); ++ cvirt->load_last = jiffies; ++ atomic_set(&cvirt->load_updates, 0); ++ cvirt->load[0] = 0; ++ cvirt->load[1] = 0; ++ cvirt->load[2] = 0; ++ atomic_set(&cvirt->total_forks, 0); ++ ++ spin_lock_init(&cvirt->syslog.logbuf_lock); ++ init_waitqueue_head(&cvirt->syslog.log_wait); ++ cvirt->syslog.log_start = 0; ++ cvirt->syslog.log_end = 0; ++ cvirt->syslog.con_start = 0; ++ cvirt->syslog.logged_chars = 0; ++} ++ ++static inline ++void vx_info_init_cvirt_pc(struct _vx_cvirt_pc *cvirt_pc, int cpu) ++{ ++ // cvirt_pc->cpustat = { 0 }; ++} ++ ++static inline void vx_info_exit_cvirt(struct _vx_cvirt *cvirt) ++{ ++#ifdef CONFIG_VSERVER_WARN ++ int value; ++#endif ++ vxwprintk_xid((value = atomic_read(&cvirt->nr_threads)), ++ "!!! cvirt: %p[nr_threads] = %d on exit.", ++ cvirt, value); ++ vxwprintk_xid((value = atomic_read(&cvirt->nr_running)), ++ "!!! cvirt: %p[nr_running] = %d on exit.", ++ cvirt, value); ++ vxwprintk_xid((value = atomic_read(&cvirt->nr_uninterruptible)), ++ "!!! cvirt: %p[nr_uninterruptible] = %d on exit.", ++ cvirt, value); ++ vxwprintk_xid((value = atomic_read(&cvirt->nr_onhold)), ++ "!!! 
cvirt: %p[nr_onhold] = %d on exit.", ++ cvirt, value); ++ return; ++} ++ ++static inline ++void vx_info_exit_cvirt_pc(struct _vx_cvirt_pc *cvirt_pc, int cpu) ++{ ++ return; ++} ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/cvirt_proc.h linux-3.2.34-vs2.3.2.15/kernel/vserver/cvirt_proc.h +--- linux-3.2.34/kernel/vserver/cvirt_proc.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/cvirt_proc.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,135 @@ ++#ifndef _VX_CVIRT_PROC_H ++#define _VX_CVIRT_PROC_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++ ++static inline ++int vx_info_proc_nsproxy(struct nsproxy *nsproxy, char *buffer) ++{ ++ struct mnt_namespace *ns; ++ struct uts_namespace *uts; ++ struct ipc_namespace *ipc; ++ struct path path; ++ char *pstr, *root; ++ int length = 0; ++ ++ if (!nsproxy) ++ goto out; ++ ++ length += sprintf(buffer + length, ++ "NSProxy:\t%p [%p,%p,%p]\n", ++ nsproxy, nsproxy->mnt_ns, ++ nsproxy->uts_ns, nsproxy->ipc_ns); ++ ++ ns = nsproxy->mnt_ns; ++ if (!ns) ++ goto skip_ns; ++ ++ pstr = kmalloc(PATH_MAX, GFP_KERNEL); ++ if (!pstr) ++ goto skip_ns; ++ ++ path.mnt = ns->root; ++ path.dentry = ns->root->mnt_root; ++ root = d_path(&path, pstr, PATH_MAX - 2); ++ length += sprintf(buffer + length, ++ "Namespace:\t%p [#%u]\n" ++ "RootPath:\t%s\n", ++ ns, atomic_read(&ns->count), ++ root); ++ kfree(pstr); ++skip_ns: ++ ++ uts = nsproxy->uts_ns; ++ if (!uts) ++ goto skip_uts; ++ ++ length += sprintf(buffer + length, ++ "SysName:\t%.*s\n" ++ "NodeName:\t%.*s\n" ++ "Release:\t%.*s\n" ++ "Version:\t%.*s\n" ++ "Machine:\t%.*s\n" ++ "DomainName:\t%.*s\n", ++ __NEW_UTS_LEN, uts->name.sysname, ++ __NEW_UTS_LEN, uts->name.nodename, ++ __NEW_UTS_LEN, uts->name.release, ++ __NEW_UTS_LEN, uts->name.version, ++ __NEW_UTS_LEN, uts->name.machine, ++ __NEW_UTS_LEN, uts->name.domainname); ++skip_uts: ++ ++ ipc = nsproxy->ipc_ns; ++ if (!ipc) ++ goto skip_ipc; ++ ++ length += sprintf(buffer + length, ++ "SEMS:\t\t%d %d %d %d %d\n" ++ "MSG:\t\t%d %d %d\n" ++ "SHM:\t\t%lu %lu %d %d\n", ++ ipc->sem_ctls[0], ipc->sem_ctls[1], ++ ipc->sem_ctls[2], ipc->sem_ctls[3], ++ ipc->used_sems, ++ ipc->msg_ctlmax, ipc->msg_ctlmnb, ipc->msg_ctlmni, ++ (unsigned long)ipc->shm_ctlmax, ++ (unsigned long)ipc->shm_ctlall, ++ ipc->shm_ctlmni, ipc->shm_tot); ++skip_ipc: ++out: ++ return length; ++} ++ ++ ++#include ++ ++#define LOAD_INT(x) ((x) >> FSHIFT) ++#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100) ++ ++static inline ++int vx_info_proc_cvirt(struct _vx_cvirt *cvirt, char *buffer) ++{ ++ int length = 0; ++ int a, b, c; ++ ++ length += sprintf(buffer + length, ++ "BiasUptime:\t%lu.%02lu\n", ++ (unsigned long)cvirt->bias_uptime.tv_sec, ++ (cvirt->bias_uptime.tv_nsec / (NSEC_PER_SEC / 100))); ++ ++ a = cvirt->load[0] + (FIXED_1 / 200); ++ b = cvirt->load[1] + (FIXED_1 / 200); ++ c = cvirt->load[2] + (FIXED_1 / 200); ++ length += sprintf(buffer + length, ++ "nr_threads:\t%d\n" ++ "nr_running:\t%d\n" ++ "nr_unintr:\t%d\n" ++ "nr_onhold:\t%d\n" ++ "load_updates:\t%d\n" ++ "loadavg:\t%d.%02d %d.%02d %d.%02d\n" ++ "total_forks:\t%d\n", ++ atomic_read(&cvirt->nr_threads), ++ atomic_read(&cvirt->nr_running), ++ atomic_read(&cvirt->nr_uninterruptible), ++ atomic_read(&cvirt->nr_onhold), ++ atomic_read(&cvirt->load_updates), ++ LOAD_INT(a), LOAD_FRAC(a), ++ LOAD_INT(b), LOAD_FRAC(b), ++ LOAD_INT(c), LOAD_FRAC(c), ++ atomic_read(&cvirt->total_forks)); ++ return length; ++} ++ ++static inline ++int vx_info_proc_cvirt_pc(struct _vx_cvirt_pc *cvirt_pc, 
++ char *buffer, int cpu) ++{ ++ int length = 0; ++ return length; ++} ++ ++#endif /* _VX_CVIRT_PROC_H */ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/debug.c linux-3.2.34-vs2.3.2.15/kernel/vserver/debug.c +--- linux-3.2.34/kernel/vserver/debug.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/debug.c 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,32 @@ ++/* ++ * kernel/vserver/debug.c ++ * ++ * Copyright (C) 2005-2007 Herbert Pötzl ++ * ++ * V0.01 vx_info dump support ++ * ++ */ ++ ++#include ++ ++#include ++ ++ ++void dump_vx_info(struct vx_info *vxi, int level) ++{ ++ printk("vx_info %p[#%d, %d.%d, %4x]\n", vxi, vxi->vx_id, ++ atomic_read(&vxi->vx_usecnt), ++ atomic_read(&vxi->vx_tasks), ++ vxi->vx_state); ++ if (level > 0) { ++ __dump_vx_limit(&vxi->limit); ++ __dump_vx_sched(&vxi->sched); ++ __dump_vx_cvirt(&vxi->cvirt); ++ __dump_vx_cacct(&vxi->cacct); ++ } ++ printk("---\n"); ++} ++ ++ ++EXPORT_SYMBOL_GPL(dump_vx_info); ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/device.c linux-3.2.34-vs2.3.2.15/kernel/vserver/device.c +--- linux-3.2.34/kernel/vserver/device.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/device.c 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,443 @@ ++/* ++ * linux/kernel/vserver/device.c ++ * ++ * Linux-VServer: Device Support ++ * ++ * Copyright (C) 2006 Herbert Pötzl ++ * Copyright (C) 2007 Daniel Hokka Zakrisson ++ * ++ * V0.01 device mapping basics ++ * V0.02 added defaults ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#define DMAP_HASH_BITS 4 ++ ++ ++struct vs_mapping { ++ union { ++ struct hlist_node hlist; ++ struct list_head list; ++ } u; ++#define dm_hlist u.hlist ++#define dm_list u.list ++ xid_t xid; ++ dev_t device; ++ struct vx_dmap_target target; ++}; ++ ++ ++static struct hlist_head dmap_main_hash[1 << DMAP_HASH_BITS]; ++ ++static DEFINE_SPINLOCK(dmap_main_hash_lock); ++ ++static struct vx_dmap_target dmap_defaults[2] = { ++ { .flags = DATTR_OPEN }, ++ { .flags = DATTR_OPEN }, ++}; ++ ++ ++struct kmem_cache *dmap_cachep __read_mostly; ++ ++int __init dmap_cache_init(void) ++{ ++ dmap_cachep = kmem_cache_create("dmap_cache", ++ sizeof(struct vs_mapping), 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ return 0; ++} ++ ++__initcall(dmap_cache_init); ++ ++ ++static inline unsigned int __hashval(dev_t dev, int bits) ++{ ++ return hash_long((unsigned long)dev, bits); ++} ++ ++ ++/* __hash_mapping() ++ * add the mapping to the hash table ++ */ ++static inline void __hash_mapping(struct vx_info *vxi, struct vs_mapping *vdm) ++{ ++ spinlock_t *hash_lock = &dmap_main_hash_lock; ++ struct hlist_head *head, *hash = dmap_main_hash; ++ int device = vdm->device; ++ ++ spin_lock(hash_lock); ++ vxdprintk(VXD_CBIT(misc, 8), "__hash_mapping: %p[#%d] %08x:%08x", ++ vxi, vxi ? 
vxi->vx_id : 0, device, vdm->target.target); ++ ++ head = &hash[__hashval(device, DMAP_HASH_BITS)]; ++ hlist_add_head(&vdm->dm_hlist, head); ++ spin_unlock(hash_lock); ++} ++ ++ ++static inline int __mode_to_default(umode_t mode) ++{ ++ switch (mode) { ++ case S_IFBLK: ++ return 0; ++ case S_IFCHR: ++ return 1; ++ default: ++ BUG(); ++ } ++} ++ ++ ++/* __set_default() ++ * set a default ++ */ ++static inline void __set_default(struct vx_info *vxi, umode_t mode, ++ struct vx_dmap_target *vdmt) ++{ ++ spinlock_t *hash_lock = &dmap_main_hash_lock; ++ spin_lock(hash_lock); ++ ++ if (vxi) ++ vxi->dmap.targets[__mode_to_default(mode)] = *vdmt; ++ else ++ dmap_defaults[__mode_to_default(mode)] = *vdmt; ++ ++ ++ spin_unlock(hash_lock); ++ ++ vxdprintk(VXD_CBIT(misc, 8), "__set_default: %p[#%u] %08x %04x", ++ vxi, vxi ? vxi->vx_id : 0, vdmt->target, vdmt->flags); ++} ++ ++ ++/* __remove_default() ++ * remove a default ++ */ ++static inline int __remove_default(struct vx_info *vxi, umode_t mode) ++{ ++ spinlock_t *hash_lock = &dmap_main_hash_lock; ++ spin_lock(hash_lock); ++ ++ if (vxi) ++ vxi->dmap.targets[__mode_to_default(mode)].flags = 0; ++ else /* remove == reset */ ++ dmap_defaults[__mode_to_default(mode)].flags = DATTR_OPEN | mode; ++ ++ spin_unlock(hash_lock); ++ return 0; ++} ++ ++ ++/* __find_mapping() ++ * find a mapping in the hash table ++ * ++ * caller must hold hash_lock ++ */ ++static inline int __find_mapping(xid_t xid, dev_t device, umode_t mode, ++ struct vs_mapping **local, struct vs_mapping **global) ++{ ++ struct hlist_head *hash = dmap_main_hash; ++ struct hlist_head *head = &hash[__hashval(device, DMAP_HASH_BITS)]; ++ struct hlist_node *pos; ++ struct vs_mapping *vdm; ++ ++ *local = NULL; ++ if (global) ++ *global = NULL; ++ ++ hlist_for_each(pos, head) { ++ vdm = hlist_entry(pos, struct vs_mapping, dm_hlist); ++ ++ if ((vdm->device == device) && ++ !((vdm->target.flags ^ mode) & S_IFMT)) { ++ if (vdm->xid == xid) { ++ *local = vdm; ++ return 1; ++ } else if (global && vdm->xid == 0) ++ *global = vdm; ++ } ++ } ++ ++ if (global && *global) ++ return 0; ++ else ++ return -ENOENT; ++} ++ ++ ++/* __lookup_mapping() ++ * find a mapping and store the result in target and flags ++ */ ++static inline int __lookup_mapping(struct vx_info *vxi, ++ dev_t device, dev_t *target, int *flags, umode_t mode) ++{ ++ spinlock_t *hash_lock = &dmap_main_hash_lock; ++ struct vs_mapping *vdm, *global; ++ struct vx_dmap_target *vdmt; ++ int ret = 0; ++ xid_t xid = vxi->vx_id; ++ int index; ++ ++ spin_lock(hash_lock); ++ if (__find_mapping(xid, device, mode, &vdm, &global) > 0) { ++ ret = 1; ++ vdmt = &vdm->target; ++ goto found; ++ } ++ ++ index = __mode_to_default(mode); ++ if (vxi && vxi->dmap.targets[index].flags) { ++ ret = 2; ++ vdmt = &vxi->dmap.targets[index]; ++ } else if (global) { ++ ret = 3; ++ vdmt = &global->target; ++ goto found; ++ } else { ++ ret = 4; ++ vdmt = &dmap_defaults[index]; ++ } ++ ++found: ++ if (target && (vdmt->flags & DATTR_REMAP)) ++ *target = vdmt->target; ++ else if (target) ++ *target = device; ++ if (flags) ++ *flags = vdmt->flags; ++ ++ spin_unlock(hash_lock); ++ ++ return ret; ++} ++ ++ ++/* __remove_mapping() ++ * remove a mapping from the hash table ++ */ ++static inline int __remove_mapping(struct vx_info *vxi, dev_t device, ++ umode_t mode) ++{ ++ spinlock_t *hash_lock = &dmap_main_hash_lock; ++ struct vs_mapping *vdm = NULL; ++ int ret = 0; ++ ++ spin_lock(hash_lock); ++ ++ ret = __find_mapping((vxi ? 
vxi->vx_id : 0), device, mode, &vdm, ++ NULL); ++ vxdprintk(VXD_CBIT(misc, 8), "__remove_mapping: %p[#%d] %08x %04x", ++ vxi, vxi ? vxi->vx_id : 0, device, mode); ++ if (ret < 0) ++ goto out; ++ hlist_del(&vdm->dm_hlist); ++ ++out: ++ spin_unlock(hash_lock); ++ if (vdm) ++ kmem_cache_free(dmap_cachep, vdm); ++ return ret; ++} ++ ++ ++ ++int vs_map_device(struct vx_info *vxi, ++ dev_t device, dev_t *target, umode_t mode) ++{ ++ int ret, flags = DATTR_MASK; ++ ++ if (!vxi) { ++ if (target) ++ *target = device; ++ goto out; ++ } ++ ret = __lookup_mapping(vxi, device, target, &flags, mode); ++ vxdprintk(VXD_CBIT(misc, 8), "vs_map_device: %08x target: %08x flags: %04x mode: %04x mapped=%d", ++ device, target ? *target : 0, flags, mode, ret); ++out: ++ return (flags & DATTR_MASK); ++} ++ ++ ++ ++static int do_set_mapping(struct vx_info *vxi, ++ dev_t device, dev_t target, int flags, umode_t mode) ++{ ++ if (device) { ++ struct vs_mapping *new; ++ ++ new = kmem_cache_alloc(dmap_cachep, GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ INIT_HLIST_NODE(&new->dm_hlist); ++ new->device = device; ++ new->target.target = target; ++ new->target.flags = flags | mode; ++ new->xid = (vxi ? vxi->vx_id : 0); ++ ++ vxdprintk(VXD_CBIT(misc, 8), "do_set_mapping: %08x target: %08x flags: %04x", device, target, flags); ++ __hash_mapping(vxi, new); ++ } else { ++ struct vx_dmap_target new = { ++ .target = target, ++ .flags = flags | mode, ++ }; ++ __set_default(vxi, mode, &new); ++ } ++ return 0; ++} ++ ++ ++static int do_unset_mapping(struct vx_info *vxi, ++ dev_t device, dev_t target, int flags, umode_t mode) ++{ ++ int ret = -EINVAL; ++ ++ if (device) { ++ ret = __remove_mapping(vxi, device, mode); ++ if (ret < 0) ++ goto out; ++ } else { ++ ret = __remove_default(vxi, mode); ++ if (ret < 0) ++ goto out; ++ } ++ ++out: ++ return ret; ++} ++ ++ ++static inline int __user_device(const char __user *name, dev_t *dev, ++ umode_t *mode) ++{ ++ struct nameidata nd; ++ int ret; ++ ++ if (!name) { ++ *dev = 0; ++ return 0; ++ } ++ ret = user_lpath(name, &nd.path); ++ if (ret) ++ return ret; ++ if (nd.path.dentry->d_inode) { ++ *dev = nd.path.dentry->d_inode->i_rdev; ++ *mode = nd.path.dentry->d_inode->i_mode; ++ } ++ path_put(&nd.path); ++ return 0; ++} ++ ++static inline int __mapping_mode(dev_t device, dev_t target, ++ umode_t device_mode, umode_t target_mode, umode_t *mode) ++{ ++ if (device) ++ *mode = device_mode & S_IFMT; ++ else if (target) ++ *mode = target_mode & S_IFMT; ++ else ++ return -EINVAL; ++ ++ /* if both given, device and target mode have to match */ ++ if (device && target && ++ ((device_mode ^ target_mode) & S_IFMT)) ++ return -EINVAL; ++ return 0; ++} ++ ++ ++static inline int do_mapping(struct vx_info *vxi, const char __user *device_path, ++ const char __user *target_path, int flags, int set) ++{ ++ dev_t device = ~0, target = ~0; ++ umode_t device_mode = 0, target_mode = 0, mode; ++ int ret; ++ ++ ret = __user_device(device_path, &device, &device_mode); ++ if (ret) ++ return ret; ++ ret = __user_device(target_path, &target, &target_mode); ++ if (ret) ++ return ret; ++ ++ ret = __mapping_mode(device, target, ++ device_mode, target_mode, &mode); ++ if (ret) ++ return ret; ++ ++ if (set) ++ return do_set_mapping(vxi, device, target, ++ flags, mode); ++ else ++ return do_unset_mapping(vxi, device, target, ++ flags, mode); ++} ++ ++ ++int vc_set_mapping(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_set_mapping_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ 
return -EFAULT; ++ ++ return do_mapping(vxi, vc_data.device, vc_data.target, ++ vc_data.flags, 1); ++} ++ ++int vc_unset_mapping(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_set_mapping_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_mapping(vxi, vc_data.device, vc_data.target, ++ vc_data.flags, 0); ++} ++ ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_set_mapping_x32(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_set_mapping_v0_x32 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_mapping(vxi, compat_ptr(vc_data.device_ptr), ++ compat_ptr(vc_data.target_ptr), vc_data.flags, 1); ++} ++ ++int vc_unset_mapping_x32(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_set_mapping_v0_x32 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_mapping(vxi, compat_ptr(vc_data.device_ptr), ++ compat_ptr(vc_data.target_ptr), vc_data.flags, 0); ++} ++ ++#endif /* CONFIG_COMPAT */ ++ ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/dlimit.c linux-3.2.34-vs2.3.2.15/kernel/vserver/dlimit.c +--- linux-3.2.34/kernel/vserver/dlimit.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/dlimit.c 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,531 @@ ++/* ++ * linux/kernel/vserver/dlimit.c ++ * ++ * Virtual Server: Context Disk Limits ++ * ++ * Copyright (C) 2004-2009 Herbert Pötzl ++ * ++ * V0.01 initial version ++ * V0.02 compat32 splitup ++ * V0.03 extended interface ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++// #include ++ ++#include ++ ++/* __alloc_dl_info() ++ ++ * allocate an initialized dl_info struct ++ * doesn't make it visible (hash) */ ++ ++static struct dl_info *__alloc_dl_info(struct super_block *sb, tag_t tag) ++{ ++ struct dl_info *new = NULL; ++ ++ vxdprintk(VXD_CBIT(dlim, 5), ++ "alloc_dl_info(%p,%d)*", sb, tag); ++ ++ /* would this benefit from a slab cache? 
*/ ++ new = kmalloc(sizeof(struct dl_info), GFP_KERNEL); ++ if (!new) ++ return 0; ++ ++ memset(new, 0, sizeof(struct dl_info)); ++ new->dl_tag = tag; ++ new->dl_sb = sb; ++ // INIT_RCU_HEAD(&new->dl_rcu); ++ INIT_HLIST_NODE(&new->dl_hlist); ++ spin_lock_init(&new->dl_lock); ++ atomic_set(&new->dl_refcnt, 0); ++ atomic_set(&new->dl_usecnt, 0); ++ ++ /* rest of init goes here */ ++ ++ vxdprintk(VXD_CBIT(dlim, 4), ++ "alloc_dl_info(%p,%d) = %p", sb, tag, new); ++ return new; ++} ++ ++/* __dealloc_dl_info() ++ ++ * final disposal of dl_info */ ++ ++static void __dealloc_dl_info(struct dl_info *dli) ++{ ++ vxdprintk(VXD_CBIT(dlim, 4), ++ "dealloc_dl_info(%p)", dli); ++ ++ dli->dl_hlist.next = LIST_POISON1; ++ dli->dl_tag = -1; ++ dli->dl_sb = 0; ++ ++ BUG_ON(atomic_read(&dli->dl_usecnt)); ++ BUG_ON(atomic_read(&dli->dl_refcnt)); ++ ++ kfree(dli); ++} ++ ++ ++/* hash table for dl_info hash */ ++ ++#define DL_HASH_SIZE 13 ++ ++struct hlist_head dl_info_hash[DL_HASH_SIZE]; ++ ++static DEFINE_SPINLOCK(dl_info_hash_lock); ++ ++ ++static inline unsigned int __hashval(struct super_block *sb, tag_t tag) ++{ ++ return ((tag ^ (unsigned long)sb) % DL_HASH_SIZE); ++} ++ ++ ++ ++/* __hash_dl_info() ++ ++ * add the dli to the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline void __hash_dl_info(struct dl_info *dli) ++{ ++ struct hlist_head *head; ++ ++ vxdprintk(VXD_CBIT(dlim, 6), ++ "__hash_dl_info: %p[#%d]", dli, dli->dl_tag); ++ get_dl_info(dli); ++ head = &dl_info_hash[__hashval(dli->dl_sb, dli->dl_tag)]; ++ hlist_add_head_rcu(&dli->dl_hlist, head); ++} ++ ++/* __unhash_dl_info() ++ ++ * remove the dli from the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline void __unhash_dl_info(struct dl_info *dli) ++{ ++ vxdprintk(VXD_CBIT(dlim, 6), ++ "__unhash_dl_info: %p[#%d]", dli, dli->dl_tag); ++ hlist_del_rcu(&dli->dl_hlist); ++ put_dl_info(dli); ++} ++ ++ ++/* __lookup_dl_info() ++ ++ * requires the rcu_read_lock() ++ * doesn't increment the dl_refcnt */ ++ ++static inline struct dl_info *__lookup_dl_info(struct super_block *sb, tag_t tag) ++{ ++ struct hlist_head *head = &dl_info_hash[__hashval(sb, tag)]; ++ struct hlist_node *pos; ++ struct dl_info *dli; ++ ++ hlist_for_each_entry_rcu(dli, pos, head, dl_hlist) { ++ ++ if (dli->dl_tag == tag && dli->dl_sb == sb) { ++ return dli; ++ } ++ } ++ return NULL; ++} ++ ++ ++struct dl_info *locate_dl_info(struct super_block *sb, tag_t tag) ++{ ++ struct dl_info *dli; ++ ++ rcu_read_lock(); ++ dli = get_dl_info(__lookup_dl_info(sb, tag)); ++ vxdprintk(VXD_CBIT(dlim, 7), ++ "locate_dl_info(%p,#%d) = %p", sb, tag, dli); ++ rcu_read_unlock(); ++ return dli; ++} ++ ++void rcu_free_dl_info(struct rcu_head *head) ++{ ++ struct dl_info *dli = container_of(head, struct dl_info, dl_rcu); ++ int usecnt, refcnt; ++ ++ BUG_ON(!dli || !head); ++ ++ usecnt = atomic_read(&dli->dl_usecnt); ++ BUG_ON(usecnt < 0); ++ ++ refcnt = atomic_read(&dli->dl_refcnt); ++ BUG_ON(refcnt < 0); ++ ++ vxdprintk(VXD_CBIT(dlim, 3), ++ "rcu_free_dl_info(%p)", dli); ++ if (!usecnt) ++ __dealloc_dl_info(dli); ++ else ++ printk("!!! 
rcu didn't free\n"); ++} ++ ++ ++ ++ ++static int do_addrem_dlimit(uint32_t id, const char __user *name, ++ uint32_t flags, int add) ++{ ++ struct path path; ++ int ret; ++ ++ ret = user_lpath(name, &path); ++ if (!ret) { ++ struct super_block *sb; ++ struct dl_info *dli; ++ ++ ret = -EINVAL; ++ if (!path.dentry->d_inode) ++ goto out_release; ++ if (!(sb = path.dentry->d_inode->i_sb)) ++ goto out_release; ++ ++ if (add) { ++ dli = __alloc_dl_info(sb, id); ++ spin_lock(&dl_info_hash_lock); ++ ++ ret = -EEXIST; ++ if (__lookup_dl_info(sb, id)) ++ goto out_unlock; ++ __hash_dl_info(dli); ++ dli = NULL; ++ } else { ++ spin_lock(&dl_info_hash_lock); ++ dli = __lookup_dl_info(sb, id); ++ ++ ret = -ESRCH; ++ if (!dli) ++ goto out_unlock; ++ __unhash_dl_info(dli); ++ } ++ ret = 0; ++ out_unlock: ++ spin_unlock(&dl_info_hash_lock); ++ if (add && dli) ++ __dealloc_dl_info(dli); ++ out_release: ++ path_put(&path); ++ } ++ return ret; ++} ++ ++int vc_add_dlimit(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_base_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_addrem_dlimit(id, vc_data.name, vc_data.flags, 1); ++} ++ ++int vc_rem_dlimit(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_base_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_addrem_dlimit(id, vc_data.name, vc_data.flags, 0); ++} ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_add_dlimit_x32(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_base_v0_x32 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_addrem_dlimit(id, ++ compat_ptr(vc_data.name_ptr), vc_data.flags, 1); ++} ++ ++int vc_rem_dlimit_x32(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_base_v0_x32 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_addrem_dlimit(id, ++ compat_ptr(vc_data.name_ptr), vc_data.flags, 0); ++} ++ ++#endif /* CONFIG_COMPAT */ ++ ++ ++static inline ++int do_set_dlimit(uint32_t id, const char __user *name, ++ uint32_t space_used, uint32_t space_total, ++ uint32_t inodes_used, uint32_t inodes_total, ++ uint32_t reserved, uint32_t flags) ++{ ++ struct path path; ++ int ret; ++ ++ ret = user_lpath(name, &path); ++ if (!ret) { ++ struct super_block *sb; ++ struct dl_info *dli; ++ ++ ret = -EINVAL; ++ if (!path.dentry->d_inode) ++ goto out_release; ++ if (!(sb = path.dentry->d_inode->i_sb)) ++ goto out_release; ++ ++ /* sanity checks */ ++ if ((reserved != CDLIM_KEEP && ++ reserved > 100) || ++ (inodes_used != CDLIM_KEEP && ++ inodes_used > inodes_total) || ++ (space_used != CDLIM_KEEP && ++ space_used > space_total)) ++ goto out_release; ++ ++ ret = -ESRCH; ++ dli = locate_dl_info(sb, id); ++ if (!dli) ++ goto out_release; ++ ++ spin_lock(&dli->dl_lock); ++ ++ if (inodes_used != CDLIM_KEEP) ++ dli->dl_inodes_used = inodes_used; ++ if (inodes_total != CDLIM_KEEP) ++ dli->dl_inodes_total = inodes_total; ++ if (space_used != CDLIM_KEEP) ++ dli->dl_space_used = dlimit_space_32to64( ++ space_used, flags, DLIMS_USED); ++ ++ if (space_total == CDLIM_INFINITY) ++ dli->dl_space_total = DLIM_INFINITY; ++ else if (space_total != CDLIM_KEEP) ++ dli->dl_space_total = dlimit_space_32to64( ++ space_total, flags, DLIMS_TOTAL); ++ ++ if (reserved != CDLIM_KEEP) ++ dli->dl_nrlmult = (1 << 10) * (100 - reserved) / 100; ++ ++ spin_unlock(&dli->dl_lock); ++ ++ put_dl_info(dli); ++ ret = 0; ++ ++ out_release: ++ 
path_put(&path); ++ } ++ return ret; ++} ++ ++int vc_set_dlimit(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_dlimit(id, vc_data.name, ++ vc_data.space_used, vc_data.space_total, ++ vc_data.inodes_used, vc_data.inodes_total, ++ vc_data.reserved, vc_data.flags); ++} ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_set_dlimit_x32(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_v0_x32 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_dlimit(id, compat_ptr(vc_data.name_ptr), ++ vc_data.space_used, vc_data.space_total, ++ vc_data.inodes_used, vc_data.inodes_total, ++ vc_data.reserved, vc_data.flags); ++} ++ ++#endif /* CONFIG_COMPAT */ ++ ++ ++static inline ++int do_get_dlimit(uint32_t id, const char __user *name, ++ uint32_t *space_used, uint32_t *space_total, ++ uint32_t *inodes_used, uint32_t *inodes_total, ++ uint32_t *reserved, uint32_t *flags) ++{ ++ struct path path; ++ int ret; ++ ++ ret = user_lpath(name, &path); ++ if (!ret) { ++ struct super_block *sb; ++ struct dl_info *dli; ++ ++ ret = -EINVAL; ++ if (!path.dentry->d_inode) ++ goto out_release; ++ if (!(sb = path.dentry->d_inode->i_sb)) ++ goto out_release; ++ ++ ret = -ESRCH; ++ dli = locate_dl_info(sb, id); ++ if (!dli) ++ goto out_release; ++ ++ spin_lock(&dli->dl_lock); ++ *inodes_used = dli->dl_inodes_used; ++ *inodes_total = dli->dl_inodes_total; ++ ++ *space_used = dlimit_space_64to32( ++ dli->dl_space_used, flags, DLIMS_USED); ++ ++ if (dli->dl_space_total == DLIM_INFINITY) ++ *space_total = CDLIM_INFINITY; ++ else ++ *space_total = dlimit_space_64to32( ++ dli->dl_space_total, flags, DLIMS_TOTAL); ++ ++ *reserved = 100 - ((dli->dl_nrlmult * 100 + 512) >> 10); ++ spin_unlock(&dli->dl_lock); ++ ++ put_dl_info(dli); ++ ret = -EFAULT; ++ ++ ret = 0; ++ out_release: ++ path_put(&path); ++ } ++ return ret; ++} ++ ++ ++int vc_get_dlimit(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_v0 vc_data; ++ int ret; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = do_get_dlimit(id, vc_data.name, ++ &vc_data.space_used, &vc_data.space_total, ++ &vc_data.inodes_used, &vc_data.inodes_total, ++ &vc_data.reserved, &vc_data.flags); ++ if (ret) ++ return ret; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_get_dlimit_x32(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_v0_x32 vc_data; ++ int ret; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = do_get_dlimit(id, compat_ptr(vc_data.name_ptr), ++ &vc_data.space_used, &vc_data.space_total, ++ &vc_data.inodes_used, &vc_data.inodes_total, ++ &vc_data.reserved, &vc_data.flags); ++ if (ret) ++ return ret; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++#endif /* CONFIG_COMPAT */ ++ ++ ++void vx_vsi_statfs(struct super_block *sb, struct kstatfs *buf) ++{ ++ struct dl_info *dli; ++ __u64 blimit, bfree, bavail; ++ __u32 ifree; ++ ++ dli = locate_dl_info(sb, dx_current_tag()); ++ if (!dli) ++ return; ++ ++ spin_lock(&dli->dl_lock); ++ if (dli->dl_inodes_total == (unsigned long)DLIM_INFINITY) ++ goto no_ilim; ++ ++ /* reduce max inodes available to limit */ ++ if (buf->f_files > dli->dl_inodes_total) ++ buf->f_files = dli->dl_inodes_total; ++ ++ ifree = dli->dl_inodes_total - 
dli->dl_inodes_used; ++ /* reduce free inodes to min */ ++ if (ifree < buf->f_ffree) ++ buf->f_ffree = ifree; ++ ++no_ilim: ++ if (dli->dl_space_total == DLIM_INFINITY) ++ goto no_blim; ++ ++ blimit = dli->dl_space_total >> sb->s_blocksize_bits; ++ ++ if (dli->dl_space_total < dli->dl_space_used) ++ bfree = 0; ++ else ++ bfree = (dli->dl_space_total - dli->dl_space_used) ++ >> sb->s_blocksize_bits; ++ ++ bavail = ((dli->dl_space_total >> 10) * dli->dl_nrlmult); ++ if (bavail < dli->dl_space_used) ++ bavail = 0; ++ else ++ bavail = (bavail - dli->dl_space_used) ++ >> sb->s_blocksize_bits; ++ ++ /* reduce max space available to limit */ ++ if (buf->f_blocks > blimit) ++ buf->f_blocks = blimit; ++ ++ /* reduce free space to min */ ++ if (bfree < buf->f_bfree) ++ buf->f_bfree = bfree; ++ ++ /* reduce avail space to min */ ++ if (bavail < buf->f_bavail) ++ buf->f_bavail = bavail; ++ ++no_blim: ++ spin_unlock(&dli->dl_lock); ++ put_dl_info(dli); ++ ++ return; ++} ++ ++#include ++ ++EXPORT_SYMBOL_GPL(locate_dl_info); ++EXPORT_SYMBOL_GPL(rcu_free_dl_info); ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/helper.c linux-3.2.34-vs2.3.2.15/kernel/vserver/helper.c +--- linux-3.2.34/kernel/vserver/helper.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/helper.c 2012-09-16 18:26:07.000000000 +0200 +@@ -0,0 +1,229 @@ ++/* ++ * linux/kernel/vserver/helper.c ++ * ++ * Virtual Context Support ++ * ++ * Copyright (C) 2004-2007 Herbert Pötzl ++ * ++ * V0.01 basic helper ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++ ++char vshelper_path[255] = "/sbin/vshelper"; ++ ++static int vshelper_init(struct subprocess_info *info, struct cred *new_cred) ++{ ++ current->flags &= ~PF_THREAD_BOUND; ++ return 0; ++} ++ ++static int do_vshelper(char *name, char *argv[], char *envp[], int sync) ++{ ++ int ret; ++ ++ if ((ret = call_usermodehelper_fns(name, argv, envp, ++ sync ? UMH_WAIT_PROC : UMH_WAIT_EXEC, ++ vshelper_init, NULL, NULL))) { ++ printk(KERN_WARNING "%s: (%s %s) returned %s with %d\n", ++ name, argv[1], argv[2], ++ sync ? "sync" : "async", ret); ++ } ++ vxdprintk(VXD_CBIT(switch, 4), ++ "%s: (%s %s) returned %s with %d", ++ name, argv[1], argv[2], sync ? "sync" : "async", ret); ++ return ret; ++} ++ ++/* ++ * vshelper path is set via /proc/sys ++ * invoked by vserver sys_reboot(), with ++ * the following arguments ++ * ++ * argv [0] = vshelper_path; ++ * argv [1] = action: "restart", "halt", "poweroff", ... 
++ * argv [2] = context identifier ++ * ++ * envp [*] = type-specific parameters ++ */ ++ ++long vs_reboot_helper(struct vx_info *vxi, int cmd, void __user *arg) ++{ ++ char id_buf[8], cmd_buf[16]; ++ char uid_buf[16], pid_buf[16]; ++ int ret; ++ ++ char *argv[] = {vshelper_path, NULL, id_buf, 0}; ++ char *envp[] = {"HOME=/", "TERM=linux", ++ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", ++ uid_buf, pid_buf, cmd_buf, 0}; ++ ++ if (vx_info_state(vxi, VXS_HELPER)) ++ return -EAGAIN; ++ vxi->vx_state |= VXS_HELPER; ++ ++ snprintf(id_buf, sizeof(id_buf), "%d", vxi->vx_id); ++ ++ snprintf(cmd_buf, sizeof(cmd_buf), "VS_CMD=%08x", cmd); ++ snprintf(uid_buf, sizeof(uid_buf), "VS_UID=%d", current_uid()); ++ snprintf(pid_buf, sizeof(pid_buf), "VS_PID=%d", current->pid); ++ ++ switch (cmd) { ++ case LINUX_REBOOT_CMD_RESTART: ++ argv[1] = "restart"; ++ break; ++ ++ case LINUX_REBOOT_CMD_HALT: ++ argv[1] = "halt"; ++ break; ++ ++ case LINUX_REBOOT_CMD_POWER_OFF: ++ argv[1] = "poweroff"; ++ break; ++ ++ case LINUX_REBOOT_CMD_SW_SUSPEND: ++ argv[1] = "swsusp"; ++ break; ++ ++ case LINUX_REBOOT_CMD_OOM: ++ argv[1] = "oom"; ++ break; ++ ++ default: ++ vxi->vx_state &= ~VXS_HELPER; ++ return 0; ++ } ++ ++ ret = do_vshelper(vshelper_path, argv, envp, 0); ++ vxi->vx_state &= ~VXS_HELPER; ++ __wakeup_vx_info(vxi); ++ return (ret) ? -EPERM : 0; ++} ++ ++ ++long vs_reboot(unsigned int cmd, void __user *arg) ++{ ++ struct vx_info *vxi = current_vx_info(); ++ long ret = 0; ++ ++ vxdprintk(VXD_CBIT(misc, 5), ++ "vs_reboot(%p[#%d],%u)", ++ vxi, vxi ? vxi->vx_id : 0, cmd); ++ ++ ret = vs_reboot_helper(vxi, cmd, arg); ++ if (ret) ++ return ret; ++ ++ vxi->reboot_cmd = cmd; ++ if (vx_info_flags(vxi, VXF_REBOOT_KILL, 0)) { ++ switch (cmd) { ++ case LINUX_REBOOT_CMD_RESTART: ++ case LINUX_REBOOT_CMD_HALT: ++ case LINUX_REBOOT_CMD_POWER_OFF: ++ vx_info_kill(vxi, 0, SIGKILL); ++ vx_info_kill(vxi, 1, SIGKILL); ++ default: ++ break; ++ } ++ } ++ return 0; ++} ++ ++long vs_oom_action(unsigned int cmd) ++{ ++ struct vx_info *vxi = current_vx_info(); ++ long ret = 0; ++ ++ vxdprintk(VXD_CBIT(misc, 5), ++ "vs_oom_action(%p[#%d],%u)", ++ vxi, vxi ? 
vxi->vx_id : 0, cmd); ++ ++ ret = vs_reboot_helper(vxi, cmd, NULL); ++ if (ret) ++ return ret; ++ ++ vxi->reboot_cmd = cmd; ++ if (vx_info_flags(vxi, VXF_REBOOT_KILL, 0)) { ++ vx_info_kill(vxi, 0, SIGKILL); ++ vx_info_kill(vxi, 1, SIGKILL); ++ } ++ return 0; ++} ++ ++/* ++ * argv [0] = vshelper_path; ++ * argv [1] = action: "startup", "shutdown" ++ * argv [2] = context identifier ++ * ++ * envp [*] = type-specific parameters ++ */ ++ ++long vs_state_change(struct vx_info *vxi, unsigned int cmd) ++{ ++ char id_buf[8], cmd_buf[16]; ++ char *argv[] = {vshelper_path, NULL, id_buf, 0}; ++ char *envp[] = {"HOME=/", "TERM=linux", ++ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", cmd_buf, 0}; ++ ++ if (!vx_info_flags(vxi, VXF_SC_HELPER, 0)) ++ return 0; ++ ++ snprintf(id_buf, sizeof(id_buf), "%d", vxi->vx_id); ++ snprintf(cmd_buf, sizeof(cmd_buf), "VS_CMD=%08x", cmd); ++ ++ switch (cmd) { ++ case VSC_STARTUP: ++ argv[1] = "startup"; ++ break; ++ case VSC_SHUTDOWN: ++ argv[1] = "shutdown"; ++ break; ++ default: ++ return 0; ++ } ++ ++ return do_vshelper(vshelper_path, argv, envp, 1); ++} ++ ++ ++/* ++ * argv [0] = vshelper_path; ++ * argv [1] = action: "netup", "netdown" ++ * argv [2] = context identifier ++ * ++ * envp [*] = type-specific parameters ++ */ ++ ++long vs_net_change(struct nx_info *nxi, unsigned int cmd) ++{ ++ char id_buf[8], cmd_buf[16]; ++ char *argv[] = {vshelper_path, NULL, id_buf, 0}; ++ char *envp[] = {"HOME=/", "TERM=linux", ++ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", cmd_buf, 0}; ++ ++ if (!nx_info_flags(nxi, NXF_SC_HELPER, 0)) ++ return 0; ++ ++ snprintf(id_buf, sizeof(id_buf), "%d", nxi->nx_id); ++ snprintf(cmd_buf, sizeof(cmd_buf), "VS_CMD=%08x", cmd); ++ ++ switch (cmd) { ++ case VSC_NETUP: ++ argv[1] = "netup"; ++ break; ++ case VSC_NETDOWN: ++ argv[1] = "netdown"; ++ break; ++ default: ++ return 0; ++ } ++ ++ return do_vshelper(vshelper_path, argv, envp, 1); ++} ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/history.c linux-3.2.34-vs2.3.2.15/kernel/vserver/history.c +--- linux-3.2.34/kernel/vserver/history.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/history.c 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,258 @@ ++/* ++ * kernel/vserver/history.c ++ * ++ * Virtual Context History Backtrace ++ * ++ * Copyright (C) 2004-2007 Herbert Pötzl ++ * ++ * V0.01 basic structure ++ * V0.02 hash/unhash and trace ++ * V0.03 preemption fixes ++ * ++ */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++ ++#ifdef CONFIG_VSERVER_HISTORY ++#define VXH_SIZE CONFIG_VSERVER_HISTORY_SIZE ++#else ++#define VXH_SIZE 64 ++#endif ++ ++struct _vx_history { ++ unsigned int counter; ++ ++ struct _vx_hist_entry entry[VXH_SIZE + 1]; ++}; ++ ++ ++DEFINE_PER_CPU(struct _vx_history, vx_history_buffer); ++ ++unsigned volatile int vxh_active = 1; ++ ++static atomic_t sequence = ATOMIC_INIT(0); ++ ++ ++/* vxh_advance() ++ ++ * requires disabled preemption */ ++ ++struct _vx_hist_entry *vxh_advance(void *loc) ++{ ++ unsigned int cpu = smp_processor_id(); ++ struct _vx_history *hist = &per_cpu(vx_history_buffer, cpu); ++ struct _vx_hist_entry *entry; ++ unsigned int index; ++ ++ index = vxh_active ? 
(hist->counter++ % VXH_SIZE) : VXH_SIZE; ++ entry = &hist->entry[index]; ++ ++ entry->seq = atomic_inc_return(&sequence); ++ entry->loc = loc; ++ return entry; ++} ++ ++EXPORT_SYMBOL_GPL(vxh_advance); ++ ++ ++#define VXH_LOC_FMTS "(#%04x,*%d):%p" ++ ++#define VXH_LOC_ARGS(e) (e)->seq, cpu, (e)->loc ++ ++ ++#define VXH_VXI_FMTS "%p[#%d,%d.%d]" ++ ++#define VXH_VXI_ARGS(e) (e)->vxi.ptr, \ ++ (e)->vxi.ptr ? (e)->vxi.xid : 0, \ ++ (e)->vxi.ptr ? (e)->vxi.usecnt : 0, \ ++ (e)->vxi.ptr ? (e)->vxi.tasks : 0 ++ ++void vxh_dump_entry(struct _vx_hist_entry *e, unsigned cpu) ++{ ++ switch (e->type) { ++ case VXH_THROW_OOPS: ++ printk( VXH_LOC_FMTS " oops \n", VXH_LOC_ARGS(e)); ++ break; ++ ++ case VXH_GET_VX_INFO: ++ case VXH_PUT_VX_INFO: ++ printk( VXH_LOC_FMTS " %s_vx_info " VXH_VXI_FMTS "\n", ++ VXH_LOC_ARGS(e), ++ (e->type == VXH_GET_VX_INFO) ? "get" : "put", ++ VXH_VXI_ARGS(e)); ++ break; ++ ++ case VXH_INIT_VX_INFO: ++ case VXH_SET_VX_INFO: ++ case VXH_CLR_VX_INFO: ++ printk( VXH_LOC_FMTS " %s_vx_info " VXH_VXI_FMTS " @%p\n", ++ VXH_LOC_ARGS(e), ++ (e->type == VXH_INIT_VX_INFO) ? "init" : ++ ((e->type == VXH_SET_VX_INFO) ? "set" : "clr"), ++ VXH_VXI_ARGS(e), e->sc.data); ++ break; ++ ++ case VXH_CLAIM_VX_INFO: ++ case VXH_RELEASE_VX_INFO: ++ printk( VXH_LOC_FMTS " %s_vx_info " VXH_VXI_FMTS " @%p\n", ++ VXH_LOC_ARGS(e), ++ (e->type == VXH_CLAIM_VX_INFO) ? "claim" : "release", ++ VXH_VXI_ARGS(e), e->sc.data); ++ break; ++ ++ case VXH_ALLOC_VX_INFO: ++ case VXH_DEALLOC_VX_INFO: ++ printk( VXH_LOC_FMTS " %s_vx_info " VXH_VXI_FMTS "\n", ++ VXH_LOC_ARGS(e), ++ (e->type == VXH_ALLOC_VX_INFO) ? "alloc" : "dealloc", ++ VXH_VXI_ARGS(e)); ++ break; ++ ++ case VXH_HASH_VX_INFO: ++ case VXH_UNHASH_VX_INFO: ++ printk( VXH_LOC_FMTS " __%s_vx_info " VXH_VXI_FMTS "\n", ++ VXH_LOC_ARGS(e), ++ (e->type == VXH_HASH_VX_INFO) ? "hash" : "unhash", ++ VXH_VXI_ARGS(e)); ++ break; ++ ++ case VXH_LOC_VX_INFO: ++ case VXH_LOOKUP_VX_INFO: ++ case VXH_CREATE_VX_INFO: ++ printk( VXH_LOC_FMTS " __%s_vx_info [#%d] -> " VXH_VXI_FMTS "\n", ++ VXH_LOC_ARGS(e), ++ (e->type == VXH_CREATE_VX_INFO) ? "create" : ++ ((e->type == VXH_LOC_VX_INFO) ? "loc" : "lookup"), ++ e->ll.arg, VXH_VXI_ARGS(e)); ++ break; ++ } ++} ++ ++static void __vxh_dump_history(void) ++{ ++ unsigned int i, cpu; ++ ++ printk("History:\tSEQ: %8x\tNR_CPUS: %d\n", ++ atomic_read(&sequence), NR_CPUS); ++ ++ for (i = 0; i < VXH_SIZE; i++) { ++ for_each_online_cpu(cpu) { ++ struct _vx_history *hist = ++ &per_cpu(vx_history_buffer, cpu); ++ unsigned int index = (hist->counter - i) % VXH_SIZE; ++ struct _vx_hist_entry *entry = &hist->entry[index]; ++ ++ vxh_dump_entry(entry, cpu); ++ } ++ } ++} ++ ++void vxh_dump_history(void) ++{ ++ vxh_active = 0; ++#ifdef CONFIG_SMP ++ local_irq_enable(); ++ smp_send_stop(); ++ local_irq_disable(); ++#endif ++ __vxh_dump_history(); ++} ++ ++ ++/* vserver syscall commands below here */ ++ ++ ++int vc_dump_history(uint32_t id) ++{ ++ vxh_active = 0; ++ __vxh_dump_history(); ++ vxh_active = 1; ++ ++ return 0; ++} ++ ++ ++int do_read_history(struct __user _vx_hist_entry *data, ++ int cpu, uint32_t *index, uint32_t *count) ++{ ++ int pos, ret = 0; ++ struct _vx_history *hist = &per_cpu(vx_history_buffer, cpu); ++ int end = hist->counter; ++ int start = end - VXH_SIZE + 2; ++ int idx = *index; ++ ++ /* special case: get current pos */ ++ if (!*count) { ++ *index = end; ++ return 0; ++ } ++ ++ /* have we lost some data? 
*/ ++ if (idx < start) ++ idx = start; ++ ++ for (pos = 0; (pos < *count) && (idx < end); pos++, idx++) { ++ struct _vx_hist_entry *entry = ++ &hist->entry[idx % VXH_SIZE]; ++ ++ /* send entry to userspace */ ++ ret = copy_to_user(&data[pos], entry, sizeof(*entry)); ++ if (ret) ++ break; ++ } ++ /* save new index and count */ ++ *index = idx; ++ *count = pos; ++ return ret ? ret : (*index < end); ++} ++ ++int vc_read_history(uint32_t id, void __user *data) ++{ ++ struct vcmd_read_history_v0 vc_data; ++ int ret; ++ ++ if (id >= NR_CPUS) ++ return -EINVAL; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = do_read_history((struct __user _vx_hist_entry *)vc_data.data, ++ id, &vc_data.index, &vc_data.count); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return ret; ++} ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_read_history_x32(uint32_t id, void __user *data) ++{ ++ struct vcmd_read_history_v0_x32 vc_data; ++ int ret; ++ ++ if (id >= NR_CPUS) ++ return -EINVAL; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = do_read_history((struct __user _vx_hist_entry *) ++ compat_ptr(vc_data.data_ptr), ++ id, &vc_data.index, &vc_data.count); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return ret; ++} ++ ++#endif /* CONFIG_COMPAT */ ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/inet.c linux-3.2.34-vs2.3.2.15/kernel/vserver/inet.c +--- linux-3.2.34/kernel/vserver/inet.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/inet.c 2011-12-15 01:33:09.000000000 +0100 +@@ -0,0 +1,226 @@ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++int nx_v4_addr_conflict(struct nx_info *nxi1, struct nx_info *nxi2) ++{ ++ int ret = 0; ++ ++ if (!nxi1 || !nxi2 || nxi1 == nxi2) ++ ret = 1; ++ else { ++ struct nx_addr_v4 *ptr; ++ ++ for (ptr = &nxi1->v4; ptr; ptr = ptr->next) { ++ if (v4_nx_addr_in_nx_info(nxi2, ptr, -1)) { ++ ret = 1; ++ break; ++ } ++ } ++ } ++ ++ vxdprintk(VXD_CBIT(net, 2), ++ "nx_v4_addr_conflict(%p,%p): %d", ++ nxi1, nxi2, ret); ++ ++ return ret; ++} ++ ++ ++#ifdef CONFIG_IPV6 ++ ++int nx_v6_addr_conflict(struct nx_info *nxi1, struct nx_info *nxi2) ++{ ++ int ret = 0; ++ ++ if (!nxi1 || !nxi2 || nxi1 == nxi2) ++ ret = 1; ++ else { ++ struct nx_addr_v6 *ptr; ++ ++ for (ptr = &nxi1->v6; ptr; ptr = ptr->next) { ++ if (v6_nx_addr_in_nx_info(nxi2, ptr, -1)) { ++ ret = 1; ++ break; ++ } ++ } ++ } ++ ++ vxdprintk(VXD_CBIT(net, 2), ++ "nx_v6_addr_conflict(%p,%p): %d", ++ nxi1, nxi2, ret); ++ ++ return ret; ++} ++ ++#endif ++ ++int v4_dev_in_nx_info(struct net_device *dev, struct nx_info *nxi) ++{ ++ struct in_device *in_dev; ++ struct in_ifaddr **ifap; ++ struct in_ifaddr *ifa; ++ int ret = 0; ++ ++ if (!dev) ++ goto out; ++ in_dev = in_dev_get(dev); ++ if (!in_dev) ++ goto out; ++ ++ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; ++ ifap = &ifa->ifa_next) { ++ if (v4_addr_in_nx_info(nxi, ifa->ifa_local, NXA_MASK_SHOW)) { ++ ret = 1; ++ break; ++ } ++ } ++ in_dev_put(in_dev); ++out: ++ return ret; ++} ++ ++ ++#ifdef CONFIG_IPV6 ++ ++int v6_dev_in_nx_info(struct net_device *dev, struct nx_info *nxi) ++{ ++ struct inet6_dev *in_dev; ++ struct inet6_ifaddr *ifa; ++ int ret = 0; ++ ++ if (!dev) ++ goto out; ++ in_dev = in6_dev_get(dev); ++ if (!in_dev) ++ goto out; ++ ++ // for (ifap = &in_dev->addr_list; (ifa = *ifap) != NULL; ++ list_for_each_entry(ifa, &in_dev->addr_list, if_list) { ++ 
if (v6_addr_in_nx_info(nxi, &ifa->addr, -1)) { ++ ret = 1; ++ break; ++ } ++ } ++ in6_dev_put(in_dev); ++out: ++ return ret; ++} ++ ++#endif ++ ++int dev_in_nx_info(struct net_device *dev, struct nx_info *nxi) ++{ ++ int ret = 1; ++ ++ if (!nxi) ++ goto out; ++ if (nxi->v4.type && v4_dev_in_nx_info(dev, nxi)) ++ goto out; ++#ifdef CONFIG_IPV6 ++ ret = 2; ++ if (nxi->v6.type && v6_dev_in_nx_info(dev, nxi)) ++ goto out; ++#endif ++ ret = 0; ++out: ++ vxdprintk(VXD_CBIT(net, 3), ++ "dev_in_nx_info(%p,%p[#%d]) = %d", ++ dev, nxi, nxi ? nxi->nx_id : 0, ret); ++ return ret; ++} ++ ++struct rtable *ip_v4_find_src(struct net *net, struct nx_info *nxi, ++ struct flowi4 *fl4) ++{ ++ struct rtable *rt; ++ ++ if (!nxi) ++ return NULL; ++ ++ /* FIXME: handle lback only case */ ++ if (!NX_IPV4(nxi)) ++ return ERR_PTR(-EPERM); ++ ++ vxdprintk(VXD_CBIT(net, 4), ++ "ip_v4_find_src(%p[#%u]) " NIPQUAD_FMT " -> " NIPQUAD_FMT, ++ nxi, nxi ? nxi->nx_id : 0, ++ NIPQUAD(fl4->saddr), NIPQUAD(fl4->daddr)); ++ ++ /* single IP is unconditional */ ++ if (nx_info_flags(nxi, NXF_SINGLE_IP, 0) && ++ (fl4->saddr == INADDR_ANY)) ++ fl4->saddr = nxi->v4.ip[0].s_addr; ++ ++ if (fl4->saddr == INADDR_ANY) { ++ struct nx_addr_v4 *ptr; ++ __be32 found = 0; ++ ++ rt = __ip_route_output_key(net, fl4); ++ if (!IS_ERR(rt)) { ++ found = fl4->saddr; ++ ip_rt_put(rt); ++ vxdprintk(VXD_CBIT(net, 4), ++ "ip_v4_find_src(%p[#%u]) rok[%u]: " NIPQUAD_FMT, ++ nxi, nxi ? nxi->nx_id : 0, fl4->flowi4_oif, NIPQUAD(found)); ++ if (v4_addr_in_nx_info(nxi, found, NXA_MASK_BIND)) ++ goto found; ++ } ++ ++ for (ptr = &nxi->v4; ptr; ptr = ptr->next) { ++ __be32 primary = ptr->ip[0].s_addr; ++ __be32 mask = ptr->mask.s_addr; ++ __be32 neta = primary & mask; ++ ++ vxdprintk(VXD_CBIT(net, 4), "ip_v4_find_src(%p[#%u]) chk: " ++ NIPQUAD_FMT "/" NIPQUAD_FMT "/" NIPQUAD_FMT, ++ nxi, nxi ? nxi->nx_id : 0, NIPQUAD(primary), ++ NIPQUAD(mask), NIPQUAD(neta)); ++ if ((found & mask) != neta) ++ continue; ++ ++ fl4->saddr = primary; ++ rt = __ip_route_output_key(net, fl4); ++ vxdprintk(VXD_CBIT(net, 4), ++ "ip_v4_find_src(%p[#%u]) rok[%u]: " NIPQUAD_FMT, ++ nxi, nxi ? nxi->nx_id : 0, fl4->flowi4_oif, NIPQUAD(primary)); ++ if (!IS_ERR(rt)) { ++ found = fl4->saddr; ++ ip_rt_put(rt); ++ if (found == primary) ++ goto found; ++ } ++ } ++ /* still no source ip? */ ++ found = ipv4_is_loopback(fl4->daddr) ++ ? 
IPI_LOOPBACK : nxi->v4.ip[0].s_addr; ++ found: ++ /* assign src ip to flow */ ++ fl4->saddr = found; ++ ++ } else { ++ if (!v4_addr_in_nx_info(nxi, fl4->saddr, NXA_MASK_BIND)) ++ return ERR_PTR(-EPERM); ++ } ++ ++ if (nx_info_flags(nxi, NXF_LBACK_REMAP, 0)) { ++ if (ipv4_is_loopback(fl4->daddr)) ++ fl4->daddr = nxi->v4_lback.s_addr; ++ if (ipv4_is_loopback(fl4->saddr)) ++ fl4->saddr = nxi->v4_lback.s_addr; ++ } else if (ipv4_is_loopback(fl4->daddr) && ++ !nx_info_flags(nxi, NXF_LBACK_ALLOW, 0)) ++ return ERR_PTR(-EPERM); ++ ++ return NULL; ++} ++ ++EXPORT_SYMBOL_GPL(ip_v4_find_src); ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/init.c linux-3.2.34-vs2.3.2.15/kernel/vserver/init.c +--- linux-3.2.34/kernel/vserver/init.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/init.c 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,45 @@ ++/* ++ * linux/kernel/init.c ++ * ++ * Virtual Server Init ++ * ++ * Copyright (C) 2004-2007 Herbert Pötzl ++ * ++ * V0.01 basic structure ++ * ++ */ ++ ++#include ++ ++int vserver_register_sysctl(void); ++void vserver_unregister_sysctl(void); ++ ++ ++static int __init init_vserver(void) ++{ ++ int ret = 0; ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ vserver_register_sysctl(); ++#endif ++ return ret; ++} ++ ++ ++static void __exit exit_vserver(void) ++{ ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ vserver_unregister_sysctl(); ++#endif ++ return; ++} ++ ++/* FIXME: GFP_ZONETYPES gone ++long vx_slab[GFP_ZONETYPES]; */ ++long vx_area; ++ ++ ++module_init(init_vserver); ++module_exit(exit_vserver); ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/inode.c linux-3.2.34-vs2.3.2.15/kernel/vserver/inode.c +--- linux-3.2.34/kernel/vserver/inode.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/inode.c 2011-12-06 23:56:11.000000000 +0100 +@@ -0,0 +1,437 @@ ++/* ++ * linux/kernel/vserver/inode.c ++ * ++ * Virtual Server: File System Support ++ * ++ * Copyright (C) 2004-2007 Herbert Pötzl ++ * ++ * V0.01 separated from vcontext V0.05 ++ * V0.02 moved to tag (instead of xid) ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++ ++static int __vc_get_iattr(struct inode *in, uint32_t *tag, uint32_t *flags, uint32_t *mask) ++{ ++ struct proc_dir_entry *entry; ++ ++ if (!in || !in->i_sb) ++ return -ESRCH; ++ ++ *flags = IATTR_TAG ++ | (IS_IMMUTABLE(in) ? IATTR_IMMUTABLE : 0) ++ | (IS_IXUNLINK(in) ? IATTR_IXUNLINK : 0) ++ | (IS_BARRIER(in) ? IATTR_BARRIER : 0) ++ | (IS_COW(in) ? IATTR_COW : 0); ++ *mask = IATTR_IXUNLINK | IATTR_IMMUTABLE | IATTR_COW; ++ ++ if (S_ISDIR(in->i_mode)) ++ *mask |= IATTR_BARRIER; ++ ++ if (IS_TAGGED(in)) { ++ *tag = in->i_tag; ++ *mask |= IATTR_TAG; ++ } ++ ++ switch (in->i_sb->s_magic) { ++ case PROC_SUPER_MAGIC: ++ entry = PROC_I(in)->pde; ++ ++ /* check for specific inodes? 
*/ ++ if (entry) ++ *mask |= IATTR_FLAGS; ++ if (entry) ++ *flags |= (entry->vx_flags & IATTR_FLAGS); ++ else ++ *flags |= (PROC_I(in)->vx_flags & IATTR_FLAGS); ++ break; ++ ++ case DEVPTS_SUPER_MAGIC: ++ *tag = in->i_tag; ++ *mask |= IATTR_TAG; ++ break; ++ ++ default: ++ break; ++ } ++ return 0; ++} ++ ++int vc_get_iattr(void __user *data) ++{ ++ struct path path; ++ struct vcmd_ctx_iattr_v1 vc_data = { .tag = -1 }; ++ int ret; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = user_lpath(vc_data.name, &path); ++ if (!ret) { ++ ret = __vc_get_iattr(path.dentry->d_inode, ++ &vc_data.tag, &vc_data.flags, &vc_data.mask); ++ path_put(&path); ++ } ++ if (ret) ++ return ret; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ ret = -EFAULT; ++ return ret; ++} ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_get_iattr_x32(void __user *data) ++{ ++ struct path path; ++ struct vcmd_ctx_iattr_v1_x32 vc_data = { .tag = -1 }; ++ int ret; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = user_lpath(compat_ptr(vc_data.name_ptr), &path); ++ if (!ret) { ++ ret = __vc_get_iattr(path.dentry->d_inode, ++ &vc_data.tag, &vc_data.flags, &vc_data.mask); ++ path_put(&path); ++ } ++ if (ret) ++ return ret; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ ret = -EFAULT; ++ return ret; ++} ++ ++#endif /* CONFIG_COMPAT */ ++ ++ ++int vc_fget_iattr(uint32_t fd, void __user *data) ++{ ++ struct file *filp; ++ struct vcmd_ctx_fiattr_v0 vc_data = { .tag = -1 }; ++ int ret; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ filp = fget(fd); ++ if (!filp || !filp->f_dentry || !filp->f_dentry->d_inode) ++ return -EBADF; ++ ++ ret = __vc_get_iattr(filp->f_dentry->d_inode, ++ &vc_data.tag, &vc_data.flags, &vc_data.mask); ++ ++ fput(filp); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ ret = -EFAULT; ++ return ret; ++} ++ ++ ++static int __vc_set_iattr(struct dentry *de, uint32_t *tag, uint32_t *flags, uint32_t *mask) ++{ ++ struct inode *in = de->d_inode; ++ int error = 0, is_proc = 0, has_tag = 0; ++ struct iattr attr = { 0 }; ++ ++ if (!in || !in->i_sb) ++ return -ESRCH; ++ ++ is_proc = (in->i_sb->s_magic == PROC_SUPER_MAGIC); ++ if ((*mask & IATTR_FLAGS) && !is_proc) ++ return -EINVAL; ++ ++ has_tag = IS_TAGGED(in) || ++ (in->i_sb->s_magic == DEVPTS_SUPER_MAGIC); ++ if ((*mask & IATTR_TAG) && !has_tag) ++ return -EINVAL; ++ ++ mutex_lock(&in->i_mutex); ++ if (*mask & IATTR_TAG) { ++ attr.ia_tag = *tag; ++ attr.ia_valid |= ATTR_TAG; ++ } ++ ++ if (*mask & IATTR_FLAGS) { ++ struct proc_dir_entry *entry = PROC_I(in)->pde; ++ unsigned int iflags = PROC_I(in)->vx_flags; ++ ++ iflags = (iflags & ~(*mask & IATTR_FLAGS)) ++ | (*flags & IATTR_FLAGS); ++ PROC_I(in)->vx_flags = iflags; ++ if (entry) ++ entry->vx_flags = iflags; ++ } ++ ++ if (*mask & (IATTR_IMMUTABLE | IATTR_IXUNLINK | ++ IATTR_BARRIER | IATTR_COW)) { ++ int iflags = in->i_flags; ++ int vflags = in->i_vflags; ++ ++ if (*mask & IATTR_IMMUTABLE) { ++ if (*flags & IATTR_IMMUTABLE) ++ iflags |= S_IMMUTABLE; ++ else ++ iflags &= ~S_IMMUTABLE; ++ } ++ if (*mask & IATTR_IXUNLINK) { ++ if (*flags & IATTR_IXUNLINK) ++ iflags |= S_IXUNLINK; ++ else ++ iflags &= ~S_IXUNLINK; ++ } ++ if (S_ISDIR(in->i_mode) && (*mask & IATTR_BARRIER)) { ++ if (*flags & IATTR_BARRIER) ++ vflags |= V_BARRIER; ++ else ++ vflags &= ~V_BARRIER; ++ } ++ if (S_ISREG(in->i_mode) && (*mask & IATTR_COW)) { ++ if (*flags & IATTR_COW) ++ vflags |= V_COW; ++ else ++ vflags &= 
~V_COW; ++ } ++ if (in->i_op && in->i_op->sync_flags) { ++ error = in->i_op->sync_flags(in, iflags, vflags); ++ if (error) ++ goto out; ++ } ++ } ++ ++ if (attr.ia_valid) { ++ if (in->i_op && in->i_op->setattr) ++ error = in->i_op->setattr(de, &attr); ++ else { ++ error = inode_change_ok(in, &attr); ++ if (!error) { ++ setattr_copy(in, &attr); ++ mark_inode_dirty(in); ++ } ++ } ++ } ++ ++out: ++ mutex_unlock(&in->i_mutex); ++ return error; ++} ++ ++int vc_set_iattr(void __user *data) ++{ ++ struct path path; ++ struct vcmd_ctx_iattr_v1 vc_data; ++ int ret; ++ ++ if (!capable(CAP_LINUX_IMMUTABLE)) ++ return -EPERM; ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = user_lpath(vc_data.name, &path); ++ if (!ret) { ++ ret = __vc_set_iattr(path.dentry, ++ &vc_data.tag, &vc_data.flags, &vc_data.mask); ++ path_put(&path); ++ } ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ ret = -EFAULT; ++ return ret; ++} ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_set_iattr_x32(void __user *data) ++{ ++ struct path path; ++ struct vcmd_ctx_iattr_v1_x32 vc_data; ++ int ret; ++ ++ if (!capable(CAP_LINUX_IMMUTABLE)) ++ return -EPERM; ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = user_lpath(compat_ptr(vc_data.name_ptr), &path); ++ if (!ret) { ++ ret = __vc_set_iattr(path.dentry, ++ &vc_data.tag, &vc_data.flags, &vc_data.mask); ++ path_put(&path); ++ } ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ ret = -EFAULT; ++ return ret; ++} ++ ++#endif /* CONFIG_COMPAT */ ++ ++int vc_fset_iattr(uint32_t fd, void __user *data) ++{ ++ struct file *filp; ++ struct vcmd_ctx_fiattr_v0 vc_data; ++ int ret; ++ ++ if (!capable(CAP_LINUX_IMMUTABLE)) ++ return -EPERM; ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ filp = fget(fd); ++ if (!filp || !filp->f_dentry || !filp->f_dentry->d_inode) ++ return -EBADF; ++ ++ ret = __vc_set_iattr(filp->f_dentry, &vc_data.tag, ++ &vc_data.flags, &vc_data.mask); ++ ++ fput(filp); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return ret; ++} ++ ++ ++enum { Opt_notagcheck, Opt_tag, Opt_notag, Opt_tagid, Opt_err }; ++ ++static match_table_t tokens = { ++ {Opt_notagcheck, "notagcheck"}, ++#ifdef CONFIG_PROPAGATE ++ {Opt_notag, "notag"}, ++ {Opt_tag, "tag"}, ++ {Opt_tagid, "tagid=%u"}, ++#endif ++ {Opt_err, NULL} ++}; ++ ++ ++static void __dx_parse_remove(char *string, char *opt) ++{ ++ char *p = strstr(string, opt); ++ char *q = p; ++ ++ if (p) { ++ while (*q != '\0' && *q != ',') ++ q++; ++ while (*q) ++ *p++ = *q++; ++ while (*p) ++ *p++ = '\0'; ++ } ++} ++ ++int dx_parse_tag(char *string, tag_t *tag, int remove, int *mnt_flags, ++ unsigned long *flags) ++{ ++ int set = 0; ++ substring_t args[MAX_OPT_ARGS]; ++ int token; ++ char *s, *p, *opts; ++#if defined(CONFIG_PROPAGATE) || defined(CONFIG_VSERVER_DEBUG) ++ int option = 0; ++#endif ++ ++ if (!string) ++ return 0; ++ s = kstrdup(string, GFP_KERNEL | GFP_ATOMIC); ++ if (!s) ++ return 0; ++ ++ opts = s; ++ while ((p = strsep(&opts, ",")) != NULL) { ++ token = match_token(p, tokens, args); ++ ++ switch (token) { ++#ifdef CONFIG_PROPAGATE ++ case Opt_tag: ++ if (tag) ++ *tag = 0; ++ if (remove) ++ __dx_parse_remove(s, "tag"); ++ *mnt_flags |= MNT_TAGID; ++ set |= MNT_TAGID; ++ break; ++ case Opt_notag: ++ if (remove) ++ __dx_parse_remove(s, "notag"); ++ *mnt_flags |= MNT_NOTAG; ++ set |= MNT_NOTAG; ++ break; ++ case Opt_tagid: ++ if (tag && !match_int(args, &option)) ++ *tag = option; ++ if 
(remove) ++ __dx_parse_remove(s, "tagid"); ++ *mnt_flags |= MNT_TAGID; ++ set |= MNT_TAGID; ++ break; ++#endif /* CONFIG_PROPAGATE */ ++ case Opt_notagcheck: ++ if (remove) ++ __dx_parse_remove(s, "notagcheck"); ++ *flags |= MS_NOTAGCHECK; ++ set |= MS_NOTAGCHECK; ++ break; ++ } ++ vxdprintk(VXD_CBIT(tag, 7), ++ "dx_parse_tag(" VS_Q("%s") "): %d:#%d", ++ p, token, option); ++ } ++ if (set) ++ strcpy(string, s); ++ kfree(s); ++ return set; ++} ++ ++#ifdef CONFIG_PROPAGATE ++ ++void __dx_propagate_tag(struct nameidata *nd, struct inode *inode) ++{ ++ tag_t new_tag = 0; ++ struct vfsmount *mnt; ++ int propagate; ++ ++ if (!nd) ++ return; ++ mnt = nd->path.mnt; ++ if (!mnt) ++ return; ++ ++ propagate = (mnt->mnt_flags & MNT_TAGID); ++ if (propagate) ++ new_tag = mnt->mnt_tag; ++ ++ vxdprintk(VXD_CBIT(tag, 7), ++ "dx_propagate_tag(%p[#%lu.%d]): %d,%d", ++ inode, inode->i_ino, inode->i_tag, ++ new_tag, (propagate) ? 1 : 0); ++ ++ if (propagate) ++ inode->i_tag = new_tag; ++} ++ ++#include ++ ++EXPORT_SYMBOL_GPL(__dx_propagate_tag); ++ ++#endif /* CONFIG_PROPAGATE */ ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/limit.c linux-3.2.34-vs2.3.2.15/kernel/vserver/limit.c +--- linux-3.2.34/kernel/vserver/limit.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/limit.c 2012-08-13 15:49:40.000000000 +0200 +@@ -0,0 +1,345 @@ ++/* ++ * linux/kernel/vserver/limit.c ++ * ++ * Virtual Server: Context Limits ++ * ++ * Copyright (C) 2004-2010 Herbert Pötzl ++ * ++ * V0.01 broken out from vcontext V0.05 ++ * V0.02 changed vcmds to vxi arg ++ * V0.03 added memory cgroup support ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++ ++const char *vlimit_name[NUM_LIMITS] = { ++ [RLIMIT_CPU] = "CPU", ++ [RLIMIT_NPROC] = "NPROC", ++ [RLIMIT_NOFILE] = "NOFILE", ++ [RLIMIT_LOCKS] = "LOCKS", ++ [RLIMIT_SIGPENDING] = "SIGP", ++ [RLIMIT_MSGQUEUE] = "MSGQ", ++ ++ [VLIMIT_NSOCK] = "NSOCK", ++ [VLIMIT_OPENFD] = "OPENFD", ++ [VLIMIT_SHMEM] = "SHMEM", ++ [VLIMIT_DENTRY] = "DENTRY", ++}; ++ ++EXPORT_SYMBOL_GPL(vlimit_name); ++ ++#define MASK_ENTRY(x) (1 << (x)) ++ ++const struct vcmd_ctx_rlimit_mask_v0 vlimit_mask = { ++ /* minimum */ ++ 0 ++ , /* softlimit */ ++ 0 ++ , /* maximum */ ++ MASK_ENTRY( RLIMIT_NPROC ) | ++ MASK_ENTRY( RLIMIT_NOFILE ) | ++ MASK_ENTRY( RLIMIT_LOCKS ) | ++ MASK_ENTRY( RLIMIT_MSGQUEUE ) | ++ ++ MASK_ENTRY( VLIMIT_NSOCK ) | ++ MASK_ENTRY( VLIMIT_OPENFD ) | ++ MASK_ENTRY( VLIMIT_SHMEM ) | ++ MASK_ENTRY( VLIMIT_DENTRY ) | ++ 0 ++}; ++ /* accounting only */ ++uint32_t account_mask = ++ MASK_ENTRY( VLIMIT_SEMARY ) | ++ MASK_ENTRY( VLIMIT_NSEMS ) | ++ MASK_ENTRY( VLIMIT_MAPPED ) | ++ 0; ++ ++ ++static int is_valid_vlimit(int id) ++{ ++ uint32_t mask = vlimit_mask.minimum | ++ vlimit_mask.softlimit | vlimit_mask.maximum; ++ return mask & (1 << id); ++} ++ ++static int is_accounted_vlimit(int id) ++{ ++ if (is_valid_vlimit(id)) ++ return 1; ++ return account_mask & (1 << id); ++} ++ ++ ++static inline uint64_t vc_get_soft(struct vx_info *vxi, int id) ++{ ++ rlim_t limit = __rlim_soft(&vxi->limit, id); ++ return VX_VLIM(limit); ++} ++ ++static inline uint64_t vc_get_hard(struct vx_info *vxi, int id) ++{ ++ rlim_t limit = __rlim_hard(&vxi->limit, id); ++ return VX_VLIM(limit); ++} ++ ++static int do_get_rlimit(struct vx_info *vxi, uint32_t id, ++ uint64_t *minimum, uint64_t *softlimit, uint64_t *maximum) ++{ ++ if (!is_valid_vlimit(id)) ++ return -EINVAL; ++ ++ if (minimum) ++ *minimum = CRLIM_UNSET; ++ if (softlimit) 
++ *softlimit = vc_get_soft(vxi, id); ++ if (maximum) ++ *maximum = vc_get_hard(vxi, id); ++ return 0; ++} ++ ++int vc_get_rlimit(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_rlimit_v0 vc_data; ++ int ret; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = do_get_rlimit(vxi, vc_data.id, ++ &vc_data.minimum, &vc_data.softlimit, &vc_data.maximum); ++ if (ret) ++ return ret; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int do_set_rlimit(struct vx_info *vxi, uint32_t id, ++ uint64_t minimum, uint64_t softlimit, uint64_t maximum) ++{ ++ if (!is_valid_vlimit(id)) ++ return -EINVAL; ++ ++ if (maximum != CRLIM_KEEP) ++ __rlim_hard(&vxi->limit, id) = VX_RLIM(maximum); ++ if (softlimit != CRLIM_KEEP) ++ __rlim_soft(&vxi->limit, id) = VX_RLIM(softlimit); ++ ++ /* clamp soft limit */ ++ if (__rlim_soft(&vxi->limit, id) > __rlim_hard(&vxi->limit, id)) ++ __rlim_soft(&vxi->limit, id) = __rlim_hard(&vxi->limit, id); ++ ++ return 0; ++} ++ ++int vc_set_rlimit(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_rlimit_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_rlimit(vxi, vc_data.id, ++ vc_data.minimum, vc_data.softlimit, vc_data.maximum); ++} ++ ++#ifdef CONFIG_IA32_EMULATION ++ ++int vc_set_rlimit_x32(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_rlimit_v0_x32 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_rlimit(vxi, vc_data.id, ++ vc_data.minimum, vc_data.softlimit, vc_data.maximum); ++} ++ ++int vc_get_rlimit_x32(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_rlimit_v0_x32 vc_data; ++ int ret; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = do_get_rlimit(vxi, vc_data.id, ++ &vc_data.minimum, &vc_data.softlimit, &vc_data.maximum); ++ if (ret) ++ return ret; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++#endif /* CONFIG_IA32_EMULATION */ ++ ++ ++int vc_get_rlimit_mask(uint32_t id, void __user *data) ++{ ++ if (copy_to_user(data, &vlimit_mask, sizeof(vlimit_mask))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++static inline void vx_reset_hits(struct _vx_limit *limit) ++{ ++ int lim; ++ ++ for (lim = 0; lim < NUM_LIMITS; lim++) { ++ atomic_set(&__rlim_lhit(limit, lim), 0); ++ } ++} ++ ++int vc_reset_hits(struct vx_info *vxi, void __user *data) ++{ ++ vx_reset_hits(&vxi->limit); ++ return 0; ++} ++ ++static inline void vx_reset_minmax(struct _vx_limit *limit) ++{ ++ rlim_t value; ++ int lim; ++ ++ for (lim = 0; lim < NUM_LIMITS; lim++) { ++ value = __rlim_get(limit, lim); ++ __rlim_rmax(limit, lim) = value; ++ __rlim_rmin(limit, lim) = value; ++ } ++} ++ ++int vc_reset_minmax(struct vx_info *vxi, void __user *data) ++{ ++ vx_reset_minmax(&vxi->limit); ++ return 0; ++} ++ ++ ++int vc_rlimit_stat(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_rlimit_stat_v0 vc_data; ++ struct _vx_limit *limit = &vxi->limit; ++ int id; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ id = vc_data.id; ++ if (!is_accounted_vlimit(id)) ++ return -EINVAL; ++ ++ vx_limit_fixup(limit, id); ++ vc_data.hits = atomic_read(&__rlim_lhit(limit, id)); ++ vc_data.value = __rlim_get(limit, id); ++ vc_data.minimum = __rlim_rmin(limit, id); ++ vc_data.maximum = __rlim_rmax(limit, id); ++ ++ if (copy_to_user(data, &vc_data, 
sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++void vx_vsi_meminfo(struct sysinfo *val) ++{ ++#ifdef CONFIG_CGROUP_MEM_RES_CTLR ++ struct mem_cgroup *mcg; ++ u64 res_limit, res_usage; ++ ++ rcu_read_lock(); ++ mcg = mem_cgroup_from_task(current); ++ if (!mcg) ++ goto out; ++ ++ res_limit = mem_cgroup_res_read_u64(mcg, RES_LIMIT); ++ res_usage = mem_cgroup_res_read_u64(mcg, RES_USAGE); ++ ++ if (res_limit != RESOURCE_MAX) ++ val->totalram = (res_limit >> PAGE_SHIFT); ++ val->freeram = val->totalram - (res_usage >> PAGE_SHIFT); ++ val->bufferram = 0; ++ val->totalhigh = 0; ++ val->freehigh = 0; ++out: ++ rcu_read_unlock(); ++#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ ++ return; ++} ++ ++void vx_vsi_swapinfo(struct sysinfo *val) ++{ ++#ifdef CONFIG_CGROUP_MEM_RES_CTLR ++#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP ++ struct mem_cgroup *mcg; ++ u64 res_limit, res_usage, memsw_limit, memsw_usage; ++ s64 swap_limit, swap_usage; ++ ++ rcu_read_lock(); ++ mcg = mem_cgroup_from_task(current); ++ if (!mcg) ++ goto out; ++ ++ res_limit = mem_cgroup_res_read_u64(mcg, RES_LIMIT); ++ res_usage = mem_cgroup_res_read_u64(mcg, RES_USAGE); ++ memsw_limit = mem_cgroup_memsw_read_u64(mcg, RES_LIMIT); ++ memsw_usage = mem_cgroup_memsw_read_u64(mcg, RES_USAGE); ++ ++ /* memory unlimited */ ++ if (res_limit == RESOURCE_MAX) ++ goto out; ++ ++ swap_limit = memsw_limit - res_limit; ++ /* we have a swap limit? */ ++ if (memsw_limit != RESOURCE_MAX) ++ val->totalswap = swap_limit >> PAGE_SHIFT; ++ ++ /* calculate swap part */ ++ swap_usage = (memsw_usage > res_usage) ? ++ memsw_usage - res_usage : 0; ++ ++ /* total shown minus usage gives free swap */ ++ val->freeswap = (swap_usage < swap_limit) ? ++ val->totalswap - (swap_usage >> PAGE_SHIFT) : 0; ++out: ++ rcu_read_unlock(); ++#else /* !CONFIG_CGROUP_MEM_RES_CTLR_SWAP */ ++ val->totalswap = 0; ++ val->freeswap = 0; ++#endif /* !CONFIG_CGROUP_MEM_RES_CTLR_SWAP */ ++#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ ++ return; ++} ++ ++long vx_vsi_cached(struct sysinfo *val) ++{ ++ long cache = 0; ++#ifdef CONFIG_CGROUP_MEM_RES_CTLR ++ struct mem_cgroup *mcg; ++ ++ rcu_read_lock(); ++ mcg = mem_cgroup_from_task(current); ++ if (!mcg) ++ goto out; ++ ++ cache = mem_cgroup_stat_read_cache(mcg); ++out: ++ rcu_read_unlock(); ++#endif ++ return cache; ++} ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/limit_init.h linux-3.2.34-vs2.3.2.15/kernel/vserver/limit_init.h +--- linux-3.2.34/kernel/vserver/limit_init.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/limit_init.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,31 @@ ++ ++ ++static inline void vx_info_init_limit(struct _vx_limit *limit) ++{ ++ int lim; ++ ++ for (lim = 0; lim < NUM_LIMITS; lim++) { ++ __rlim_soft(limit, lim) = RLIM_INFINITY; ++ __rlim_hard(limit, lim) = RLIM_INFINITY; ++ __rlim_set(limit, lim, 0); ++ atomic_set(&__rlim_lhit(limit, lim), 0); ++ __rlim_rmin(limit, lim) = 0; ++ __rlim_rmax(limit, lim) = 0; ++ } ++} ++ ++static inline void vx_info_exit_limit(struct _vx_limit *limit) ++{ ++ rlim_t value; ++ int lim; ++ ++ for (lim = 0; lim < NUM_LIMITS; lim++) { ++ if ((1 << lim) & VLIM_NOCHECK) ++ continue; ++ value = __rlim_get(limit, lim); ++ vxwprintk_xid(value, ++ "!!! 
limit: %p[%s,%d] = %ld on exit.", ++ limit, vlimit_name[lim], lim, (long)value); ++ } ++} ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/limit_proc.h linux-3.2.34-vs2.3.2.15/kernel/vserver/limit_proc.h +--- linux-3.2.34/kernel/vserver/limit_proc.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/limit_proc.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,57 @@ ++#ifndef _VX_LIMIT_PROC_H ++#define _VX_LIMIT_PROC_H ++ ++#include ++ ++ ++#define VX_LIMIT_FMT ":\t%8ld\t%8ld/%8ld\t%8lld/%8lld\t%6d\n" ++#define VX_LIMIT_TOP \ ++ "Limit\t current\t min/max\t\t soft/hard\t\thits\n" ++ ++#define VX_LIMIT_ARG(r) \ ++ (unsigned long)__rlim_get(limit, r), \ ++ (unsigned long)__rlim_rmin(limit, r), \ ++ (unsigned long)__rlim_rmax(limit, r), \ ++ VX_VLIM(__rlim_soft(limit, r)), \ ++ VX_VLIM(__rlim_hard(limit, r)), \ ++ atomic_read(&__rlim_lhit(limit, r)) ++ ++static inline int vx_info_proc_limit(struct _vx_limit *limit, char *buffer) ++{ ++ vx_limit_fixup(limit, -1); ++ return sprintf(buffer, VX_LIMIT_TOP ++ "PROC" VX_LIMIT_FMT ++ "VM" VX_LIMIT_FMT ++ "VML" VX_LIMIT_FMT ++ "RSS" VX_LIMIT_FMT ++ "ANON" VX_LIMIT_FMT ++ "RMAP" VX_LIMIT_FMT ++ "FILES" VX_LIMIT_FMT ++ "OFD" VX_LIMIT_FMT ++ "LOCKS" VX_LIMIT_FMT ++ "SOCK" VX_LIMIT_FMT ++ "MSGQ" VX_LIMIT_FMT ++ "SHM" VX_LIMIT_FMT ++ "SEMA" VX_LIMIT_FMT ++ "SEMS" VX_LIMIT_FMT ++ "DENT" VX_LIMIT_FMT, ++ VX_LIMIT_ARG(RLIMIT_NPROC), ++ VX_LIMIT_ARG(RLIMIT_AS), ++ VX_LIMIT_ARG(RLIMIT_MEMLOCK), ++ VX_LIMIT_ARG(RLIMIT_RSS), ++ VX_LIMIT_ARG(VLIMIT_ANON), ++ VX_LIMIT_ARG(VLIMIT_MAPPED), ++ VX_LIMIT_ARG(RLIMIT_NOFILE), ++ VX_LIMIT_ARG(VLIMIT_OPENFD), ++ VX_LIMIT_ARG(RLIMIT_LOCKS), ++ VX_LIMIT_ARG(VLIMIT_NSOCK), ++ VX_LIMIT_ARG(RLIMIT_MSGQUEUE), ++ VX_LIMIT_ARG(VLIMIT_SHMEM), ++ VX_LIMIT_ARG(VLIMIT_SEMARY), ++ VX_LIMIT_ARG(VLIMIT_NSEMS), ++ VX_LIMIT_ARG(VLIMIT_DENTRY)); ++} ++ ++#endif /* _VX_LIMIT_PROC_H */ ++ ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/network.c linux-3.2.34-vs2.3.2.15/kernel/vserver/network.c +--- linux-3.2.34/kernel/vserver/network.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/network.c 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,912 @@ ++/* ++ * linux/kernel/vserver/network.c ++ * ++ * Virtual Server: Network Support ++ * ++ * Copyright (C) 2003-2007 Herbert Pötzl ++ * ++ * V0.01 broken out from vcontext V0.05 ++ * V0.02 cleaned up implementation ++ * V0.03 added equiv nx commands ++ * V0.04 switch to RCU based hash ++ * V0.05 and back to locking again ++ * V0.06 changed vcmds to nxi arg ++ * V0.07 have __create claim() the nxi ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++ ++atomic_t nx_global_ctotal = ATOMIC_INIT(0); ++atomic_t nx_global_cactive = ATOMIC_INIT(0); ++ ++static struct kmem_cache *nx_addr_v4_cachep = NULL; ++static struct kmem_cache *nx_addr_v6_cachep = NULL; ++ ++ ++static int __init init_network(void) ++{ ++ nx_addr_v4_cachep = kmem_cache_create("nx_v4_addr_cache", ++ sizeof(struct nx_addr_v4), 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ nx_addr_v6_cachep = kmem_cache_create("nx_v6_addr_cache", ++ sizeof(struct nx_addr_v6), 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ return 0; ++} ++ ++ ++/* __alloc_nx_addr_v4() */ ++ ++static inline struct nx_addr_v4 *__alloc_nx_addr_v4(void) ++{ ++ struct nx_addr_v4 *nxa = kmem_cache_alloc( ++ nx_addr_v4_cachep, GFP_KERNEL); ++ ++ if (!IS_ERR(nxa)) ++ memset(nxa, 0, sizeof(*nxa)); ++ return nxa; ++} ++ ++/* __dealloc_nx_addr_v4() */ ++ ++static inline void 
__dealloc_nx_addr_v4(struct nx_addr_v4 *nxa) ++{ ++ kmem_cache_free(nx_addr_v4_cachep, nxa); ++} ++ ++/* __dealloc_nx_addr_v4_all() */ ++ ++static inline void __dealloc_nx_addr_v4_all(struct nx_addr_v4 *nxa) ++{ ++ while (nxa) { ++ struct nx_addr_v4 *next = nxa->next; ++ ++ __dealloc_nx_addr_v4(nxa); ++ nxa = next; ++ } ++} ++ ++ ++#ifdef CONFIG_IPV6 ++ ++/* __alloc_nx_addr_v6() */ ++ ++static inline struct nx_addr_v6 *__alloc_nx_addr_v6(void) ++{ ++ struct nx_addr_v6 *nxa = kmem_cache_alloc( ++ nx_addr_v6_cachep, GFP_KERNEL); ++ ++ if (!IS_ERR(nxa)) ++ memset(nxa, 0, sizeof(*nxa)); ++ return nxa; ++} ++ ++/* __dealloc_nx_addr_v6() */ ++ ++static inline void __dealloc_nx_addr_v6(struct nx_addr_v6 *nxa) ++{ ++ kmem_cache_free(nx_addr_v6_cachep, nxa); ++} ++ ++/* __dealloc_nx_addr_v6_all() */ ++ ++static inline void __dealloc_nx_addr_v6_all(struct nx_addr_v6 *nxa) ++{ ++ while (nxa) { ++ struct nx_addr_v6 *next = nxa->next; ++ ++ __dealloc_nx_addr_v6(nxa); ++ nxa = next; ++ } ++} ++ ++#endif /* CONFIG_IPV6 */ ++ ++/* __alloc_nx_info() ++ ++ * allocate an initialized nx_info struct ++ * doesn't make it visible (hash) */ ++ ++static struct nx_info *__alloc_nx_info(nid_t nid) ++{ ++ struct nx_info *new = NULL; ++ ++ vxdprintk(VXD_CBIT(nid, 1), "alloc_nx_info(%d)*", nid); ++ ++ /* would this benefit from a slab cache? */ ++ new = kmalloc(sizeof(struct nx_info), GFP_KERNEL); ++ if (!new) ++ return 0; ++ ++ memset(new, 0, sizeof(struct nx_info)); ++ new->nx_id = nid; ++ INIT_HLIST_NODE(&new->nx_hlist); ++ atomic_set(&new->nx_usecnt, 0); ++ atomic_set(&new->nx_tasks, 0); ++ new->nx_state = 0; ++ ++ new->nx_flags = NXF_INIT_SET; ++ ++ /* rest of init goes here */ ++ ++ new->v4_lback.s_addr = htonl(INADDR_LOOPBACK); ++ new->v4_bcast.s_addr = htonl(INADDR_BROADCAST); ++ ++ vxdprintk(VXD_CBIT(nid, 0), ++ "alloc_nx_info(%d) = %p", nid, new); ++ atomic_inc(&nx_global_ctotal); ++ return new; ++} ++ ++/* __dealloc_nx_info() ++ ++ * final disposal of nx_info */ ++ ++static void __dealloc_nx_info(struct nx_info *nxi) ++{ ++ vxdprintk(VXD_CBIT(nid, 0), ++ "dealloc_nx_info(%p)", nxi); ++ ++ nxi->nx_hlist.next = LIST_POISON1; ++ nxi->nx_id = -1; ++ ++ BUG_ON(atomic_read(&nxi->nx_usecnt)); ++ BUG_ON(atomic_read(&nxi->nx_tasks)); ++ ++ __dealloc_nx_addr_v4_all(nxi->v4.next); ++ ++ nxi->nx_state |= NXS_RELEASED; ++ kfree(nxi); ++ atomic_dec(&nx_global_ctotal); ++} ++ ++static void __shutdown_nx_info(struct nx_info *nxi) ++{ ++ nxi->nx_state |= NXS_SHUTDOWN; ++ vs_net_change(nxi, VSC_NETDOWN); ++} ++ ++/* exported stuff */ ++ ++void free_nx_info(struct nx_info *nxi) ++{ ++ /* context shutdown is mandatory */ ++ BUG_ON(nxi->nx_state != NXS_SHUTDOWN); ++ ++ /* context must not be hashed */ ++ BUG_ON(nxi->nx_state & NXS_HASHED); ++ ++ BUG_ON(atomic_read(&nxi->nx_usecnt)); ++ BUG_ON(atomic_read(&nxi->nx_tasks)); ++ ++ __dealloc_nx_info(nxi); ++} ++ ++ ++void __nx_set_lback(struct nx_info *nxi) ++{ ++ int nid = nxi->nx_id; ++ __be32 lback = htonl(INADDR_LOOPBACK ^ ((nid & 0xFFFF) << 8)); ++ ++ nxi->v4_lback.s_addr = lback; ++} ++ ++extern int __nx_inet_add_lback(__be32 addr); ++extern int __nx_inet_del_lback(__be32 addr); ++ ++ ++/* hash table for nx_info hash */ ++ ++#define NX_HASH_SIZE 13 ++ ++struct hlist_head nx_info_hash[NX_HASH_SIZE]; ++ ++static DEFINE_SPINLOCK(nx_info_hash_lock); ++ ++ ++static inline unsigned int __hashval(nid_t nid) ++{ ++ return (nid % NX_HASH_SIZE); ++} ++ ++ ++ ++/* __hash_nx_info() ++ ++ * add the nxi to the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline 
void __hash_nx_info(struct nx_info *nxi) ++{ ++ struct hlist_head *head; ++ ++ vxd_assert_lock(&nx_info_hash_lock); ++ vxdprintk(VXD_CBIT(nid, 4), ++ "__hash_nx_info: %p[#%d]", nxi, nxi->nx_id); ++ ++ /* context must not be hashed */ ++ BUG_ON(nx_info_state(nxi, NXS_HASHED)); ++ ++ nxi->nx_state |= NXS_HASHED; ++ head = &nx_info_hash[__hashval(nxi->nx_id)]; ++ hlist_add_head(&nxi->nx_hlist, head); ++ atomic_inc(&nx_global_cactive); ++} ++ ++/* __unhash_nx_info() ++ ++ * remove the nxi from the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline void __unhash_nx_info(struct nx_info *nxi) ++{ ++ vxd_assert_lock(&nx_info_hash_lock); ++ vxdprintk(VXD_CBIT(nid, 4), ++ "__unhash_nx_info: %p[#%d.%d.%d]", nxi, nxi->nx_id, ++ atomic_read(&nxi->nx_usecnt), atomic_read(&nxi->nx_tasks)); ++ ++ /* context must be hashed */ ++ BUG_ON(!nx_info_state(nxi, NXS_HASHED)); ++ /* but without tasks */ ++ BUG_ON(atomic_read(&nxi->nx_tasks)); ++ ++ nxi->nx_state &= ~NXS_HASHED; ++ hlist_del(&nxi->nx_hlist); ++ atomic_dec(&nx_global_cactive); ++} ++ ++ ++/* __lookup_nx_info() ++ ++ * requires the hash_lock to be held ++ * doesn't increment the nx_refcnt */ ++ ++static inline struct nx_info *__lookup_nx_info(nid_t nid) ++{ ++ struct hlist_head *head = &nx_info_hash[__hashval(nid)]; ++ struct hlist_node *pos; ++ struct nx_info *nxi; ++ ++ vxd_assert_lock(&nx_info_hash_lock); ++ hlist_for_each(pos, head) { ++ nxi = hlist_entry(pos, struct nx_info, nx_hlist); ++ ++ if (nxi->nx_id == nid) ++ goto found; ++ } ++ nxi = NULL; ++found: ++ vxdprintk(VXD_CBIT(nid, 0), ++ "__lookup_nx_info(#%u): %p[#%u]", ++ nid, nxi, nxi ? nxi->nx_id : 0); ++ return nxi; ++} ++ ++ ++/* __create_nx_info() ++ ++ * create the requested context ++ * get(), claim() and hash it */ ++ ++static struct nx_info *__create_nx_info(int id) ++{ ++ struct nx_info *new, *nxi = NULL; ++ ++ vxdprintk(VXD_CBIT(nid, 1), "create_nx_info(%d)*", id); ++ ++ if (!(new = __alloc_nx_info(id))) ++ return ERR_PTR(-ENOMEM); ++ ++ /* required to make dynamic xids unique */ ++ spin_lock(&nx_info_hash_lock); ++ ++ /* static context requested */ ++ if ((nxi = __lookup_nx_info(id))) { ++ vxdprintk(VXD_CBIT(nid, 0), ++ "create_nx_info(%d) = %p (already there)", id, nxi); ++ if (nx_info_flags(nxi, NXF_STATE_SETUP, 0)) ++ nxi = ERR_PTR(-EBUSY); ++ else ++ nxi = ERR_PTR(-EEXIST); ++ goto out_unlock; ++ } ++ /* new context */ ++ vxdprintk(VXD_CBIT(nid, 0), ++ "create_nx_info(%d) = %p (new)", id, new); ++ claim_nx_info(new, NULL); ++ __nx_set_lback(new); ++ __hash_nx_info(get_nx_info(new)); ++ nxi = new, new = NULL; ++ ++out_unlock: ++ spin_unlock(&nx_info_hash_lock); ++ if (new) ++ __dealloc_nx_info(new); ++ return nxi; ++} ++ ++ ++ ++/* exported stuff */ ++ ++ ++void unhash_nx_info(struct nx_info *nxi) ++{ ++ __shutdown_nx_info(nxi); ++ spin_lock(&nx_info_hash_lock); ++ __unhash_nx_info(nxi); ++ spin_unlock(&nx_info_hash_lock); ++} ++ ++/* lookup_nx_info() ++ ++ * search for a nx_info and get() it ++ * negative id means current */ ++ ++struct nx_info *lookup_nx_info(int id) ++{ ++ struct nx_info *nxi = NULL; ++ ++ if (id < 0) { ++ nxi = get_nx_info(current_nx_info()); ++ } else if (id > 1) { ++ spin_lock(&nx_info_hash_lock); ++ nxi = get_nx_info(__lookup_nx_info(id)); ++ spin_unlock(&nx_info_hash_lock); ++ } ++ return nxi; ++} ++ ++/* nid_is_hashed() ++ ++ * verify that nid is still hashed */ ++ ++int nid_is_hashed(nid_t nid) ++{ ++ int hashed; ++ ++ spin_lock(&nx_info_hash_lock); ++ hashed = (__lookup_nx_info(nid) != NULL); ++ 
spin_unlock(&nx_info_hash_lock); ++ return hashed; ++} ++ ++ ++#ifdef CONFIG_PROC_FS ++ ++/* get_nid_list() ++ ++ * get a subset of hashed nids for proc ++ * assumes size is at least one */ ++ ++int get_nid_list(int index, unsigned int *nids, int size) ++{ ++ int hindex, nr_nids = 0; ++ ++ /* only show current and children */ ++ if (!nx_check(0, VS_ADMIN | VS_WATCH)) { ++ if (index > 0) ++ return 0; ++ nids[nr_nids] = nx_current_nid(); ++ return 1; ++ } ++ ++ for (hindex = 0; hindex < NX_HASH_SIZE; hindex++) { ++ struct hlist_head *head = &nx_info_hash[hindex]; ++ struct hlist_node *pos; ++ ++ spin_lock(&nx_info_hash_lock); ++ hlist_for_each(pos, head) { ++ struct nx_info *nxi; ++ ++ if (--index > 0) ++ continue; ++ ++ nxi = hlist_entry(pos, struct nx_info, nx_hlist); ++ nids[nr_nids] = nxi->nx_id; ++ if (++nr_nids >= size) { ++ spin_unlock(&nx_info_hash_lock); ++ goto out; ++ } ++ } ++ /* keep the lock time short */ ++ spin_unlock(&nx_info_hash_lock); ++ } ++out: ++ return nr_nids; ++} ++#endif ++ ++ ++/* ++ * migrate task to new network ++ * gets nxi, puts old_nxi on change ++ */ ++ ++int nx_migrate_task(struct task_struct *p, struct nx_info *nxi) ++{ ++ struct nx_info *old_nxi; ++ int ret = 0; ++ ++ if (!p || !nxi) ++ BUG(); ++ ++ vxdprintk(VXD_CBIT(nid, 5), ++ "nx_migrate_task(%p,%p[#%d.%d.%d])", ++ p, nxi, nxi->nx_id, ++ atomic_read(&nxi->nx_usecnt), ++ atomic_read(&nxi->nx_tasks)); ++ ++ if (nx_info_flags(nxi, NXF_INFO_PRIVATE, 0) && ++ !nx_info_flags(nxi, NXF_STATE_SETUP, 0)) ++ return -EACCES; ++ ++ if (nx_info_state(nxi, NXS_SHUTDOWN)) ++ return -EFAULT; ++ ++ /* maybe disallow this completely? */ ++ old_nxi = task_get_nx_info(p); ++ if (old_nxi == nxi) ++ goto out; ++ ++ task_lock(p); ++ if (old_nxi) ++ clr_nx_info(&p->nx_info); ++ claim_nx_info(nxi, p); ++ set_nx_info(&p->nx_info, nxi); ++ p->nid = nxi->nx_id; ++ task_unlock(p); ++ ++ vxdprintk(VXD_CBIT(nid, 5), ++ "moved task %p into nxi:%p[#%d]", ++ p, nxi, nxi->nx_id); ++ ++ if (old_nxi) ++ release_nx_info(old_nxi, p); ++ ret = 0; ++out: ++ put_nx_info(old_nxi); ++ return ret; ++} ++ ++ ++void nx_set_persistent(struct nx_info *nxi) ++{ ++ vxdprintk(VXD_CBIT(nid, 6), ++ "nx_set_persistent(%p[#%d])", nxi, nxi->nx_id); ++ ++ get_nx_info(nxi); ++ claim_nx_info(nxi, NULL); ++} ++ ++void nx_clear_persistent(struct nx_info *nxi) ++{ ++ vxdprintk(VXD_CBIT(nid, 6), ++ "nx_clear_persistent(%p[#%d])", nxi, nxi->nx_id); ++ ++ release_nx_info(nxi, NULL); ++ put_nx_info(nxi); ++} ++ ++void nx_update_persistent(struct nx_info *nxi) ++{ ++ if (nx_info_flags(nxi, NXF_PERSISTENT, 0)) ++ nx_set_persistent(nxi); ++ else ++ nx_clear_persistent(nxi); ++} ++ ++/* vserver syscall commands below here */ ++ ++/* taks nid and nx_info functions */ ++ ++#include ++ ++ ++int vc_task_nid(uint32_t id) ++{ ++ nid_t nid; ++ ++ if (id) { ++ struct task_struct *tsk; ++ ++ rcu_read_lock(); ++ tsk = find_task_by_real_pid(id); ++ nid = (tsk) ? 
tsk->nid : -ESRCH; ++ rcu_read_unlock(); ++ } else ++ nid = nx_current_nid(); ++ return nid; ++} ++ ++ ++int vc_nx_info(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_nx_info_v0 vc_data; ++ ++ vc_data.nid = nxi->nx_id; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++/* network functions */ ++ ++int vc_net_create(uint32_t nid, void __user *data) ++{ ++ struct vcmd_net_create vc_data = { .flagword = NXF_INIT_SET }; ++ struct nx_info *new_nxi; ++ int ret; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ if ((nid > MAX_S_CONTEXT) || (nid < 2)) ++ return -EINVAL; ++ ++ new_nxi = __create_nx_info(nid); ++ if (IS_ERR(new_nxi)) ++ return PTR_ERR(new_nxi); ++ ++ /* initial flags */ ++ new_nxi->nx_flags = vc_data.flagword; ++ ++ ret = -ENOEXEC; ++ if (vs_net_change(new_nxi, VSC_NETUP)) ++ goto out; ++ ++ ret = nx_migrate_task(current, new_nxi); ++ if (ret) ++ goto out; ++ ++ /* return context id on success */ ++ ret = new_nxi->nx_id; ++ ++ /* get a reference for persistent contexts */ ++ if ((vc_data.flagword & NXF_PERSISTENT)) ++ nx_set_persistent(new_nxi); ++out: ++ release_nx_info(new_nxi, NULL); ++ put_nx_info(new_nxi); ++ return ret; ++} ++ ++ ++int vc_net_migrate(struct nx_info *nxi, void __user *data) ++{ ++ return nx_migrate_task(current, nxi); ++} ++ ++ ++ ++int do_add_v4_addr(struct nx_info *nxi, __be32 ip, __be32 ip2, __be32 mask, ++ uint16_t type, uint16_t flags) ++{ ++ struct nx_addr_v4 *nxa = &nxi->v4; ++ ++ if (NX_IPV4(nxi)) { ++ /* locate last entry */ ++ for (; nxa->next; nxa = nxa->next); ++ nxa->next = __alloc_nx_addr_v4(); ++ nxa = nxa->next; ++ ++ if (IS_ERR(nxa)) ++ return PTR_ERR(nxa); ++ } ++ ++ if (nxi->v4.next) ++ /* remove single ip for ip list */ ++ nxi->nx_flags &= ~NXF_SINGLE_IP; ++ ++ nxa->ip[0].s_addr = ip; ++ nxa->ip[1].s_addr = ip2; ++ nxa->mask.s_addr = mask; ++ nxa->type = type; ++ nxa->flags = flags; ++ return 0; ++} ++ ++int do_remove_v4_addr(struct nx_info *nxi, __be32 ip, __be32 ip2, __be32 mask, ++ uint16_t type, uint16_t flags) ++{ ++ struct nx_addr_v4 *nxa = &nxi->v4; ++ ++ switch (type) { ++/* case NXA_TYPE_ADDR: ++ break; */ ++ ++ case NXA_TYPE_ANY: ++ __dealloc_nx_addr_v4_all(xchg(&nxa->next, NULL)); ++ memset(nxa, 0, sizeof(*nxa)); ++ break; ++ ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++ ++int vc_net_add(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_v0 vc_data; ++ int index, ret = 0; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ switch (vc_data.type) { ++ case NXA_TYPE_IPV4: ++ if ((vc_data.count < 1) || (vc_data.count > 4)) ++ return -EINVAL; ++ ++ index = 0; ++ while (index < vc_data.count) { ++ ret = do_add_v4_addr(nxi, vc_data.ip[index].s_addr, 0, ++ vc_data.mask[index].s_addr, NXA_TYPE_ADDR, 0); ++ if (ret) ++ return ret; ++ index++; ++ } ++ ret = index; ++ break; ++ ++ case NXA_TYPE_IPV4|NXA_MOD_BCAST: ++ nxi->v4_bcast = vc_data.ip[0]; ++ ret = 1; ++ break; ++ ++ case NXA_TYPE_IPV4|NXA_MOD_LBACK: ++ nxi->v4_lback = vc_data.ip[0]; ++ ret = 1; ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ return ret; ++} ++ ++int vc_net_remove(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_v0 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ switch (vc_data.type) { ++ case NXA_TYPE_ANY: ++ __dealloc_nx_addr_v4_all(xchg(&nxi->v4.next, NULL)); ++ memset(&nxi->v4, 0, sizeof(nxi->v4)); ++ break; 
++ ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++ ++int vc_net_add_ipv4_v1(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_ipv4_v1 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ switch (vc_data.type) { ++ case NXA_TYPE_ADDR: ++ case NXA_TYPE_MASK: ++ return do_add_v4_addr(nxi, vc_data.ip.s_addr, 0, ++ vc_data.mask.s_addr, vc_data.type, vc_data.flags); ++ ++ case NXA_TYPE_ADDR | NXA_MOD_BCAST: ++ nxi->v4_bcast = vc_data.ip; ++ break; ++ ++ case NXA_TYPE_ADDR | NXA_MOD_LBACK: ++ nxi->v4_lback = vc_data.ip; ++ break; ++ ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++int vc_net_add_ipv4(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_ipv4_v2 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ switch (vc_data.type) { ++ case NXA_TYPE_ADDR: ++ case NXA_TYPE_MASK: ++ case NXA_TYPE_RANGE: ++ return do_add_v4_addr(nxi, vc_data.ip.s_addr, vc_data.ip2.s_addr, ++ vc_data.mask.s_addr, vc_data.type, vc_data.flags); ++ ++ case NXA_TYPE_ADDR | NXA_MOD_BCAST: ++ nxi->v4_bcast = vc_data.ip; ++ break; ++ ++ case NXA_TYPE_ADDR | NXA_MOD_LBACK: ++ nxi->v4_lback = vc_data.ip; ++ break; ++ ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++int vc_net_rem_ipv4_v1(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_ipv4_v1 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_remove_v4_addr(nxi, vc_data.ip.s_addr, 0, ++ vc_data.mask.s_addr, vc_data.type, vc_data.flags); ++} ++ ++int vc_net_rem_ipv4(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_ipv4_v2 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_remove_v4_addr(nxi, vc_data.ip.s_addr, vc_data.ip2.s_addr, ++ vc_data.mask.s_addr, vc_data.type, vc_data.flags); ++} ++ ++#ifdef CONFIG_IPV6 ++ ++int do_add_v6_addr(struct nx_info *nxi, ++ struct in6_addr *ip, struct in6_addr *mask, ++ uint32_t prefix, uint16_t type, uint16_t flags) ++{ ++ struct nx_addr_v6 *nxa = &nxi->v6; ++ ++ if (NX_IPV6(nxi)) { ++ /* locate last entry */ ++ for (; nxa->next; nxa = nxa->next); ++ nxa->next = __alloc_nx_addr_v6(); ++ nxa = nxa->next; ++ ++ if (IS_ERR(nxa)) ++ return PTR_ERR(nxa); ++ } ++ ++ nxa->ip = *ip; ++ nxa->mask = *mask; ++ nxa->prefix = prefix; ++ nxa->type = type; ++ nxa->flags = flags; ++ return 0; ++} ++ ++ ++int vc_net_add_ipv6(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_ipv6_v1 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ switch (vc_data.type) { ++ case NXA_TYPE_ADDR: ++ memset(&vc_data.mask, ~0, sizeof(vc_data.mask)); ++ /* fallthrough */ ++ case NXA_TYPE_MASK: ++ return do_add_v6_addr(nxi, &vc_data.ip, &vc_data.mask, ++ vc_data.prefix, vc_data.type, vc_data.flags); ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++int vc_net_remove_ipv6(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_ipv6_v1 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ switch (vc_data.type) { ++ case NXA_TYPE_ANY: ++ __dealloc_nx_addr_v6_all(xchg(&nxi->v6.next, NULL)); ++ memset(&nxi->v6, 0, sizeof(nxi->v6)); ++ break; ++ ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++#endif /* CONFIG_IPV6 */ ++ ++ ++int vc_get_nflags(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_flags_v0 vc_data; ++ ++ 
vc_data.flagword = nxi->nx_flags; ++ ++ /* special STATE flag handling */ ++ vc_data.mask = vs_mask_flags(~0ULL, nxi->nx_flags, NXF_ONE_TIME); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_nflags(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_flags_v0 vc_data; ++ uint64_t mask, trigger; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ /* special STATE flag handling */ ++ mask = vs_mask_mask(vc_data.mask, nxi->nx_flags, NXF_ONE_TIME); ++ trigger = (mask & nxi->nx_flags) ^ (mask & vc_data.flagword); ++ ++ nxi->nx_flags = vs_mask_flags(nxi->nx_flags, ++ vc_data.flagword, mask); ++ if (trigger & NXF_PERSISTENT) ++ nx_update_persistent(nxi); ++ ++ return 0; ++} ++ ++int vc_get_ncaps(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_caps_v0 vc_data; ++ ++ vc_data.ncaps = nxi->nx_ncaps; ++ vc_data.cmask = ~0ULL; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_ncaps(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_caps_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ nxi->nx_ncaps = vs_mask_flags(nxi->nx_ncaps, ++ vc_data.ncaps, vc_data.cmask); ++ return 0; ++} ++ ++ ++#include ++ ++module_init(init_network); ++ ++EXPORT_SYMBOL_GPL(free_nx_info); ++EXPORT_SYMBOL_GPL(unhash_nx_info); ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/proc.c linux-3.2.34-vs2.3.2.15/kernel/vserver/proc.c +--- linux-3.2.34/kernel/vserver/proc.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/proc.c 2011-12-06 23:19:52.000000000 +0100 +@@ -0,0 +1,1103 @@ ++/* ++ * linux/kernel/vserver/proc.c ++ * ++ * Virtual Context Support ++ * ++ * Copyright (C) 2003-2011 Herbert Pötzl ++ * ++ * V0.01 basic structure ++ * V0.02 adaptation vs1.3.0 ++ * V0.03 proc permissions ++ * V0.04 locking/generic ++ * V0.05 next generation procfs ++ * V0.06 inode validation ++ * V0.07 generic rewrite vid ++ * V0.08 remove inode type ++ * V0.09 added u/wmask info ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "cvirt_proc.h" ++#include "cacct_proc.h" ++#include "limit_proc.h" ++#include "sched_proc.h" ++#include "vci_config.h" ++ ++ ++static inline char *print_cap_t(char *buffer, kernel_cap_t *c) ++{ ++ unsigned __capi; ++ ++ CAP_FOR_EACH_U32(__capi) { ++ buffer += sprintf(buffer, "%08x", ++ c->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); ++ } ++ return buffer; ++} ++ ++ ++static struct proc_dir_entry *proc_virtual; ++ ++static struct proc_dir_entry *proc_virtnet; ++ ++ ++/* first the actual feeds */ ++ ++ ++static int proc_vci(char *buffer) ++{ ++ return sprintf(buffer, ++ "VCIVersion:\t%04x:%04x\n" ++ "VCISyscall:\t%d\n" ++ "VCIKernel:\t%08x\n", ++ VCI_VERSION >> 16, ++ VCI_VERSION & 0xFFFF, ++ __NR_vserver, ++ vci_kernel_config()); ++} ++ ++static int proc_virtual_info(char *buffer) ++{ ++ return proc_vci(buffer); ++} ++ ++static int proc_virtual_status(char *buffer) ++{ ++ return sprintf(buffer, ++ "#CTotal:\t%d\n" ++ "#CActive:\t%d\n" ++ "#NSProxy:\t%d\t%d %d %d %d %d %d\n" ++ "#InitTask:\t%d\t%d %d\n", ++ atomic_read(&vx_global_ctotal), ++ atomic_read(&vx_global_cactive), ++ atomic_read(&vs_global_nsproxy), ++ atomic_read(&vs_global_fs), ++ atomic_read(&vs_global_mnt_ns), ++ atomic_read(&vs_global_uts_ns), ++ atomic_read(&nr_ipc_ns), ++ 
atomic_read(&vs_global_user_ns), ++ atomic_read(&vs_global_pid_ns), ++ atomic_read(&init_task.usage), ++ atomic_read(&init_task.nsproxy->count), ++ init_task.fs->users); ++} ++ ++ ++int proc_vxi_info(struct vx_info *vxi, char *buffer) ++{ ++ int length; ++ ++ length = sprintf(buffer, ++ "ID:\t%d\n" ++ "Info:\t%p\n" ++ "Init:\t%d\n" ++ "OOM:\t%lld\n", ++ vxi->vx_id, ++ vxi, ++ vxi->vx_initpid, ++ vxi->vx_badness_bias); ++ return length; ++} ++ ++int proc_vxi_status(struct vx_info *vxi, char *buffer) ++{ ++ char *orig = buffer; ++ ++ buffer += sprintf(buffer, ++ "UseCnt:\t%d\n" ++ "Tasks:\t%d\n" ++ "Flags:\t%016llx\n", ++ atomic_read(&vxi->vx_usecnt), ++ atomic_read(&vxi->vx_tasks), ++ (unsigned long long)vxi->vx_flags); ++ ++ buffer += sprintf(buffer, "BCaps:\t"); ++ buffer = print_cap_t(buffer, &vxi->vx_bcaps); ++ buffer += sprintf(buffer, "\n"); ++ ++ buffer += sprintf(buffer, ++ "CCaps:\t%016llx\n" ++ "Umask:\t%16llx\n" ++ "Wmask:\t%16llx\n" ++ "Spaces:\t%08lx %08lx\n", ++ (unsigned long long)vxi->vx_ccaps, ++ (unsigned long long)vxi->vx_umask, ++ (unsigned long long)vxi->vx_wmask, ++ vxi->space[0].vx_nsmask, vxi->space[1].vx_nsmask); ++ return buffer - orig; ++} ++ ++int proc_vxi_limit(struct vx_info *vxi, char *buffer) ++{ ++ return vx_info_proc_limit(&vxi->limit, buffer); ++} ++ ++int proc_vxi_sched(struct vx_info *vxi, char *buffer) ++{ ++ int cpu, length; ++ ++ length = vx_info_proc_sched(&vxi->sched, buffer); ++ for_each_online_cpu(cpu) { ++ length += vx_info_proc_sched_pc( ++ &vx_per_cpu(vxi, sched_pc, cpu), ++ buffer + length, cpu); ++ } ++ return length; ++} ++ ++int proc_vxi_nsproxy0(struct vx_info *vxi, char *buffer) ++{ ++ return vx_info_proc_nsproxy(vxi->space[0].vx_nsproxy, buffer); ++} ++ ++int proc_vxi_nsproxy1(struct vx_info *vxi, char *buffer) ++{ ++ return vx_info_proc_nsproxy(vxi->space[1].vx_nsproxy, buffer); ++} ++ ++int proc_vxi_cvirt(struct vx_info *vxi, char *buffer) ++{ ++ int cpu, length; ++ ++ vx_update_load(vxi); ++ length = vx_info_proc_cvirt(&vxi->cvirt, buffer); ++ for_each_online_cpu(cpu) { ++ length += vx_info_proc_cvirt_pc( ++ &vx_per_cpu(vxi, cvirt_pc, cpu), ++ buffer + length, cpu); ++ } ++ return length; ++} ++ ++int proc_vxi_cacct(struct vx_info *vxi, char *buffer) ++{ ++ return vx_info_proc_cacct(&vxi->cacct, buffer); ++} ++ ++ ++static int proc_virtnet_info(char *buffer) ++{ ++ return proc_vci(buffer); ++} ++ ++static int proc_virtnet_status(char *buffer) ++{ ++ return sprintf(buffer, ++ "#CTotal:\t%d\n" ++ "#CActive:\t%d\n", ++ atomic_read(&nx_global_ctotal), ++ atomic_read(&nx_global_cactive)); ++} ++ ++int proc_nxi_info(struct nx_info *nxi, char *buffer) ++{ ++ struct nx_addr_v4 *v4a; ++#ifdef CONFIG_IPV6 ++ struct nx_addr_v6 *v6a; ++#endif ++ int length, i; ++ ++ length = sprintf(buffer, ++ "ID:\t%d\n" ++ "Info:\t%p\n" ++ "Bcast:\t" NIPQUAD_FMT "\n" ++ "Lback:\t" NIPQUAD_FMT "\n", ++ nxi->nx_id, ++ nxi, ++ NIPQUAD(nxi->v4_bcast.s_addr), ++ NIPQUAD(nxi->v4_lback.s_addr)); ++ ++ if (!NX_IPV4(nxi)) ++ goto skip_v4; ++ for (i = 0, v4a = &nxi->v4; v4a; i++, v4a = v4a->next) ++ length += sprintf(buffer + length, "%d:\t" NXAV4_FMT "\n", ++ i, NXAV4(v4a)); ++skip_v4: ++#ifdef CONFIG_IPV6 ++ if (!NX_IPV6(nxi)) ++ goto skip_v6; ++ for (i = 0, v6a = &nxi->v6; v6a; i++, v6a = v6a->next) ++ length += sprintf(buffer + length, "%d:\t" NXAV6_FMT "\n", ++ i, NXAV6(v6a)); ++skip_v6: ++#endif ++ return length; ++} ++ ++int proc_nxi_status(struct nx_info *nxi, char *buffer) ++{ ++ int length; ++ ++ length = sprintf(buffer, ++ "UseCnt:\t%d\n" ++ "Tasks:\t%d\n" ++ 
"Flags:\t%016llx\n" ++ "NCaps:\t%016llx\n", ++ atomic_read(&nxi->nx_usecnt), ++ atomic_read(&nxi->nx_tasks), ++ (unsigned long long)nxi->nx_flags, ++ (unsigned long long)nxi->nx_ncaps); ++ return length; ++} ++ ++ ++ ++/* here the inode helpers */ ++ ++struct vs_entry { ++ int len; ++ char *name; ++ mode_t mode; ++ struct inode_operations *iop; ++ struct file_operations *fop; ++ union proc_op op; ++}; ++ ++static struct inode *vs_proc_make_inode(struct super_block *sb, struct vs_entry *p) ++{ ++ struct inode *inode = new_inode(sb); ++ ++ if (!inode) ++ goto out; ++ ++ inode->i_mode = p->mode; ++ if (p->iop) ++ inode->i_op = p->iop; ++ if (p->fop) ++ inode->i_fop = p->fop; ++ ++ set_nlink(inode, (p->mode & S_IFDIR) ? 2 : 1); ++ inode->i_flags |= S_IMMUTABLE; ++ ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_tag = 0; ++out: ++ return inode; ++} ++ ++static struct dentry *vs_proc_instantiate(struct inode *dir, ++ struct dentry *dentry, int id, void *ptr) ++{ ++ struct vs_entry *p = ptr; ++ struct inode *inode = vs_proc_make_inode(dir->i_sb, p); ++ struct dentry *error = ERR_PTR(-EINVAL); ++ ++ if (!inode) ++ goto out; ++ ++ PROC_I(inode)->op = p->op; ++ PROC_I(inode)->fd = id; ++ d_add(dentry, inode); ++ error = NULL; ++out: ++ return error; ++} ++ ++/* Lookups */ ++ ++typedef struct dentry *instantiate_t(struct inode *, struct dentry *, int, void *); ++ ++/* ++ * Fill a directory entry. ++ * ++ * If possible create the dcache entry and derive our inode number and ++ * file type from dcache entry. ++ * ++ * Since all of the proc inode numbers are dynamically generated, the inode ++ * numbers do not exist until the inode is cache. This means creating the ++ * the dcache entry in readdir is necessary to keep the inode numbers ++ * reported by readdir in sync with the inode numbers reported ++ * by stat. 
++ */ ++static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, ++ char *name, int len, instantiate_t instantiate, int id, void *ptr) ++{ ++ struct dentry *child, *dir = filp->f_dentry; ++ struct inode *inode; ++ struct qstr qname; ++ ino_t ino = 0; ++ unsigned type = DT_UNKNOWN; ++ ++ qname.name = name; ++ qname.len = len; ++ qname.hash = full_name_hash(name, len); ++ ++ child = d_lookup(dir, &qname); ++ if (!child) { ++ struct dentry *new; ++ new = d_alloc(dir, &qname); ++ if (new) { ++ child = instantiate(dir->d_inode, new, id, ptr); ++ if (child) ++ dput(new); ++ else ++ child = new; ++ } ++ } ++ if (!child || IS_ERR(child) || !child->d_inode) ++ goto end_instantiate; ++ inode = child->d_inode; ++ if (inode) { ++ ino = inode->i_ino; ++ type = inode->i_mode >> 12; ++ } ++ dput(child); ++end_instantiate: ++ if (!ino) ++ ino = find_inode_number(dir, &qname); ++ if (!ino) ++ ino = 1; ++ return filldir(dirent, name, len, filp->f_pos, ino, type); ++} ++ ++ ++ ++/* get and revalidate vx_info/xid */ ++ ++static inline ++struct vx_info *get_proc_vx_info(struct inode *inode) ++{ ++ return lookup_vx_info(PROC_I(inode)->fd); ++} ++ ++static int proc_xid_revalidate(struct dentry *dentry, struct nameidata *nd) ++{ ++ struct inode *inode = dentry->d_inode; ++ xid_t xid = PROC_I(inode)->fd; ++ ++ if (!xid || xid_is_hashed(xid)) ++ return 1; ++ d_drop(dentry); ++ return 0; ++} ++ ++ ++/* get and revalidate nx_info/nid */ ++ ++static int proc_nid_revalidate(struct dentry *dentry, struct nameidata *nd) ++{ ++ struct inode *inode = dentry->d_inode; ++ nid_t nid = PROC_I(inode)->fd; ++ ++ if (!nid || nid_is_hashed(nid)) ++ return 1; ++ d_drop(dentry); ++ return 0; ++} ++ ++ ++ ++#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024) ++ ++static ssize_t proc_vs_info_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct inode *inode = file->f_dentry->d_inode; ++ unsigned long page; ++ ssize_t length = 0; ++ ++ if (count > PROC_BLOCK_SIZE) ++ count = PROC_BLOCK_SIZE; ++ ++ /* fade that out as soon as stable */ ++ WARN_ON(PROC_I(inode)->fd); ++ ++ if (!(page = __get_free_page(GFP_KERNEL))) ++ return -ENOMEM; ++ ++ BUG_ON(!PROC_I(inode)->op.proc_vs_read); ++ length = PROC_I(inode)->op.proc_vs_read((char *)page); ++ ++ if (length >= 0) ++ length = simple_read_from_buffer(buf, count, ppos, ++ (char *)page, length); ++ ++ free_page(page); ++ return length; ++} ++ ++static ssize_t proc_vx_info_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct inode *inode = file->f_dentry->d_inode; ++ struct vx_info *vxi = NULL; ++ xid_t xid = PROC_I(inode)->fd; ++ unsigned long page; ++ ssize_t length = 0; ++ ++ if (count > PROC_BLOCK_SIZE) ++ count = PROC_BLOCK_SIZE; ++ ++ /* fade that out as soon as stable */ ++ WARN_ON(!xid); ++ vxi = lookup_vx_info(xid); ++ if (!vxi) ++ goto out; ++ ++ length = -ENOMEM; ++ if (!(page = __get_free_page(GFP_KERNEL))) ++ goto out_put; ++ ++ BUG_ON(!PROC_I(inode)->op.proc_vxi_read); ++ length = PROC_I(inode)->op.proc_vxi_read(vxi, (char *)page); ++ ++ if (length >= 0) ++ length = simple_read_from_buffer(buf, count, ppos, ++ (char *)page, length); ++ ++ free_page(page); ++out_put: ++ put_vx_info(vxi); ++out: ++ return length; ++} ++ ++static ssize_t proc_nx_info_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct inode *inode = file->f_dentry->d_inode; ++ struct nx_info *nxi = NULL; ++ nid_t nid = PROC_I(inode)->fd; ++ unsigned long page; ++ ssize_t length = 0; ++ ++ if (count > 
PROC_BLOCK_SIZE) ++ count = PROC_BLOCK_SIZE; ++ ++ /* fade that out as soon as stable */ ++ WARN_ON(!nid); ++ nxi = lookup_nx_info(nid); ++ if (!nxi) ++ goto out; ++ ++ length = -ENOMEM; ++ if (!(page = __get_free_page(GFP_KERNEL))) ++ goto out_put; ++ ++ BUG_ON(!PROC_I(inode)->op.proc_nxi_read); ++ length = PROC_I(inode)->op.proc_nxi_read(nxi, (char *)page); ++ ++ if (length >= 0) ++ length = simple_read_from_buffer(buf, count, ppos, ++ (char *)page, length); ++ ++ free_page(page); ++out_put: ++ put_nx_info(nxi); ++out: ++ return length; ++} ++ ++ ++ ++/* here comes the lower level */ ++ ++ ++#define NOD(NAME, MODE, IOP, FOP, OP) { \ ++ .len = sizeof(NAME) - 1, \ ++ .name = (NAME), \ ++ .mode = MODE, \ ++ .iop = IOP, \ ++ .fop = FOP, \ ++ .op = OP, \ ++} ++ ++ ++#define DIR(NAME, MODE, OTYPE) \ ++ NOD(NAME, (S_IFDIR | (MODE)), \ ++ &proc_ ## OTYPE ## _inode_operations, \ ++ &proc_ ## OTYPE ## _file_operations, { } ) ++ ++#define INF(NAME, MODE, OTYPE) \ ++ NOD(NAME, (S_IFREG | (MODE)), NULL, \ ++ &proc_vs_info_file_operations, \ ++ { .proc_vs_read = &proc_##OTYPE } ) ++ ++#define VINF(NAME, MODE, OTYPE) \ ++ NOD(NAME, (S_IFREG | (MODE)), NULL, \ ++ &proc_vx_info_file_operations, \ ++ { .proc_vxi_read = &proc_##OTYPE } ) ++ ++#define NINF(NAME, MODE, OTYPE) \ ++ NOD(NAME, (S_IFREG | (MODE)), NULL, \ ++ &proc_nx_info_file_operations, \ ++ { .proc_nxi_read = &proc_##OTYPE } ) ++ ++ ++static struct file_operations proc_vs_info_file_operations = { ++ .read = proc_vs_info_read, ++}; ++ ++static struct file_operations proc_vx_info_file_operations = { ++ .read = proc_vx_info_read, ++}; ++ ++static struct dentry_operations proc_xid_dentry_operations = { ++ .d_revalidate = proc_xid_revalidate, ++}; ++ ++static struct vs_entry vx_base_stuff[] = { ++ VINF("info", S_IRUGO, vxi_info), ++ VINF("status", S_IRUGO, vxi_status), ++ VINF("limit", S_IRUGO, vxi_limit), ++ VINF("sched", S_IRUGO, vxi_sched), ++ VINF("nsproxy", S_IRUGO, vxi_nsproxy0), ++ VINF("nsproxy1",S_IRUGO, vxi_nsproxy1), ++ VINF("cvirt", S_IRUGO, vxi_cvirt), ++ VINF("cacct", S_IRUGO, vxi_cacct), ++ {} ++}; ++ ++ ++ ++ ++static struct dentry *proc_xid_instantiate(struct inode *dir, ++ struct dentry *dentry, int id, void *ptr) ++{ ++ dentry->d_op = &proc_xid_dentry_operations; ++ return vs_proc_instantiate(dir, dentry, id, ptr); ++} ++ ++static struct dentry *proc_xid_lookup(struct inode *dir, ++ struct dentry *dentry, struct nameidata *nd) ++{ ++ struct vs_entry *p = vx_base_stuff; ++ struct dentry *error = ERR_PTR(-ENOENT); ++ ++ for (; p->name; p++) { ++ if (p->len != dentry->d_name.len) ++ continue; ++ if (!memcmp(dentry->d_name.name, p->name, p->len)) ++ break; ++ } ++ if (!p->name) ++ goto out; ++ ++ error = proc_xid_instantiate(dir, dentry, PROC_I(dir)->fd, p); ++out: ++ return error; ++} ++ ++static int proc_xid_readdir(struct file *filp, ++ void *dirent, filldir_t filldir) ++{ ++ struct dentry *dentry = filp->f_dentry; ++ struct inode *inode = dentry->d_inode; ++ struct vs_entry *p = vx_base_stuff; ++ int size = sizeof(vx_base_stuff) / sizeof(struct vs_entry); ++ int pos, index; ++ u64 ino; ++ ++ pos = filp->f_pos; ++ switch (pos) { ++ case 0: ++ ino = inode->i_ino; ++ if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ case 1: ++ ino = parent_ino(dentry); ++ if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ default: ++ index = pos - 2; ++ if (index >= size) ++ goto out; ++ for (p += index; p->name; p++) { ++ if (proc_fill_cache(filp, 
dirent, filldir, p->name, p->len, ++ vs_proc_instantiate, PROC_I(inode)->fd, p)) ++ goto out; ++ pos++; ++ } ++ } ++out: ++ filp->f_pos = pos; ++ return 1; ++} ++ ++ ++ ++static struct file_operations proc_nx_info_file_operations = { ++ .read = proc_nx_info_read, ++}; ++ ++static struct dentry_operations proc_nid_dentry_operations = { ++ .d_revalidate = proc_nid_revalidate, ++}; ++ ++static struct vs_entry nx_base_stuff[] = { ++ NINF("info", S_IRUGO, nxi_info), ++ NINF("status", S_IRUGO, nxi_status), ++ {} ++}; ++ ++ ++static struct dentry *proc_nid_instantiate(struct inode *dir, ++ struct dentry *dentry, int id, void *ptr) ++{ ++ dentry->d_op = &proc_nid_dentry_operations; ++ return vs_proc_instantiate(dir, dentry, id, ptr); ++} ++ ++static struct dentry *proc_nid_lookup(struct inode *dir, ++ struct dentry *dentry, struct nameidata *nd) ++{ ++ struct vs_entry *p = nx_base_stuff; ++ struct dentry *error = ERR_PTR(-ENOENT); ++ ++ for (; p->name; p++) { ++ if (p->len != dentry->d_name.len) ++ continue; ++ if (!memcmp(dentry->d_name.name, p->name, p->len)) ++ break; ++ } ++ if (!p->name) ++ goto out; ++ ++ error = proc_nid_instantiate(dir, dentry, PROC_I(dir)->fd, p); ++out: ++ return error; ++} ++ ++static int proc_nid_readdir(struct file *filp, ++ void *dirent, filldir_t filldir) ++{ ++ struct dentry *dentry = filp->f_dentry; ++ struct inode *inode = dentry->d_inode; ++ struct vs_entry *p = nx_base_stuff; ++ int size = sizeof(nx_base_stuff) / sizeof(struct vs_entry); ++ int pos, index; ++ u64 ino; ++ ++ pos = filp->f_pos; ++ switch (pos) { ++ case 0: ++ ino = inode->i_ino; ++ if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ case 1: ++ ino = parent_ino(dentry); ++ if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ default: ++ index = pos - 2; ++ if (index >= size) ++ goto out; ++ for (p += index; p->name; p++) { ++ if (proc_fill_cache(filp, dirent, filldir, p->name, p->len, ++ vs_proc_instantiate, PROC_I(inode)->fd, p)) ++ goto out; ++ pos++; ++ } ++ } ++out: ++ filp->f_pos = pos; ++ return 1; ++} ++ ++ ++#define MAX_MULBY10 ((~0U - 9) / 10) ++ ++static inline int atovid(const char *str, int len) ++{ ++ int vid, c; ++ ++ vid = 0; ++ while (len-- > 0) { ++ c = *str - '0'; ++ str++; ++ if (c > 9) ++ return -1; ++ if (vid >= MAX_MULBY10) ++ return -1; ++ vid *= 10; ++ vid += c; ++ if (!vid) ++ return -1; ++ } ++ return vid; ++} ++ ++/* now the upper level (virtual) */ ++ ++ ++static struct file_operations proc_xid_file_operations = { ++ .read = generic_read_dir, ++ .readdir = proc_xid_readdir, ++}; ++ ++static struct inode_operations proc_xid_inode_operations = { ++ .lookup = proc_xid_lookup, ++}; ++ ++static struct vs_entry vx_virtual_stuff[] = { ++ INF("info", S_IRUGO, virtual_info), ++ INF("status", S_IRUGO, virtual_status), ++ DIR(NULL, S_IRUGO | S_IXUGO, xid), ++}; ++ ++ ++static struct dentry *proc_virtual_lookup(struct inode *dir, ++ struct dentry *dentry, struct nameidata *nd) ++{ ++ struct vs_entry *p = vx_virtual_stuff; ++ struct dentry *error = ERR_PTR(-ENOENT); ++ int id = 0; ++ ++ for (; p->name; p++) { ++ if (p->len != dentry->d_name.len) ++ continue; ++ if (!memcmp(dentry->d_name.name, p->name, p->len)) ++ break; ++ } ++ if (p->name) ++ goto instantiate; ++ ++ id = atovid(dentry->d_name.name, dentry->d_name.len); ++ if ((id < 0) || !xid_is_hashed(id)) ++ goto out; ++ ++instantiate: ++ error = proc_xid_instantiate(dir, dentry, id, p); ++out: ++ return error; ++} ++ ++static struct 
file_operations proc_nid_file_operations = { ++ .read = generic_read_dir, ++ .readdir = proc_nid_readdir, ++}; ++ ++static struct inode_operations proc_nid_inode_operations = { ++ .lookup = proc_nid_lookup, ++}; ++ ++static struct vs_entry nx_virtnet_stuff[] = { ++ INF("info", S_IRUGO, virtnet_info), ++ INF("status", S_IRUGO, virtnet_status), ++ DIR(NULL, S_IRUGO | S_IXUGO, nid), ++}; ++ ++ ++static struct dentry *proc_virtnet_lookup(struct inode *dir, ++ struct dentry *dentry, struct nameidata *nd) ++{ ++ struct vs_entry *p = nx_virtnet_stuff; ++ struct dentry *error = ERR_PTR(-ENOENT); ++ int id = 0; ++ ++ for (; p->name; p++) { ++ if (p->len != dentry->d_name.len) ++ continue; ++ if (!memcmp(dentry->d_name.name, p->name, p->len)) ++ break; ++ } ++ if (p->name) ++ goto instantiate; ++ ++ id = atovid(dentry->d_name.name, dentry->d_name.len); ++ if ((id < 0) || !nid_is_hashed(id)) ++ goto out; ++ ++instantiate: ++ error = proc_nid_instantiate(dir, dentry, id, p); ++out: ++ return error; ++} ++ ++ ++#define PROC_MAXVIDS 32 ++ ++int proc_virtual_readdir(struct file *filp, ++ void *dirent, filldir_t filldir) ++{ ++ struct dentry *dentry = filp->f_dentry; ++ struct inode *inode = dentry->d_inode; ++ struct vs_entry *p = vx_virtual_stuff; ++ int size = sizeof(vx_virtual_stuff) / sizeof(struct vs_entry); ++ int pos, index; ++ unsigned int xid_array[PROC_MAXVIDS]; ++ char buf[PROC_NUMBUF]; ++ unsigned int nr_xids, i; ++ u64 ino; ++ ++ pos = filp->f_pos; ++ switch (pos) { ++ case 0: ++ ino = inode->i_ino; ++ if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ case 1: ++ ino = parent_ino(dentry); ++ if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ default: ++ index = pos - 2; ++ if (index >= size) ++ goto entries; ++ for (p += index; p->name; p++) { ++ if (proc_fill_cache(filp, dirent, filldir, p->name, p->len, ++ vs_proc_instantiate, 0, p)) ++ goto out; ++ pos++; ++ } ++ entries: ++ index = pos - size; ++ p = &vx_virtual_stuff[size - 1]; ++ nr_xids = get_xid_list(index, xid_array, PROC_MAXVIDS); ++ for (i = 0; i < nr_xids; i++) { ++ int n, xid = xid_array[i]; ++ unsigned int j = PROC_NUMBUF; ++ ++ n = xid; ++ do ++ buf[--j] = '0' + (n % 10); ++ while (n /= 10); ++ ++ if (proc_fill_cache(filp, dirent, filldir, ++ buf + j, PROC_NUMBUF - j, ++ vs_proc_instantiate, xid, p)) ++ goto out; ++ pos++; ++ } ++ } ++out: ++ filp->f_pos = pos; ++ return 0; ++} ++ ++static int proc_virtual_getattr(struct vfsmount *mnt, ++ struct dentry *dentry, struct kstat *stat) ++{ ++ struct inode *inode = dentry->d_inode; ++ ++ generic_fillattr(inode, stat); ++ stat->nlink = 2 + atomic_read(&vx_global_cactive); ++ return 0; ++} ++ ++static struct file_operations proc_virtual_dir_operations = { ++ .read = generic_read_dir, ++ .readdir = proc_virtual_readdir, ++}; ++ ++static struct inode_operations proc_virtual_dir_inode_operations = { ++ .getattr = proc_virtual_getattr, ++ .lookup = proc_virtual_lookup, ++}; ++ ++ ++ ++ ++ ++int proc_virtnet_readdir(struct file *filp, ++ void *dirent, filldir_t filldir) ++{ ++ struct dentry *dentry = filp->f_dentry; ++ struct inode *inode = dentry->d_inode; ++ struct vs_entry *p = nx_virtnet_stuff; ++ int size = sizeof(nx_virtnet_stuff) / sizeof(struct vs_entry); ++ int pos, index; ++ unsigned int nid_array[PROC_MAXVIDS]; ++ char buf[PROC_NUMBUF]; ++ unsigned int nr_nids, i; ++ u64 ino; ++ ++ pos = filp->f_pos; ++ switch (pos) { ++ case 0: ++ ino = inode->i_ino; ++ if (filldir(dirent, ".", 1, 
pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ case 1: ++ ino = parent_ino(dentry); ++ if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ default: ++ index = pos - 2; ++ if (index >= size) ++ goto entries; ++ for (p += index; p->name; p++) { ++ if (proc_fill_cache(filp, dirent, filldir, p->name, p->len, ++ vs_proc_instantiate, 0, p)) ++ goto out; ++ pos++; ++ } ++ entries: ++ index = pos - size; ++ p = &nx_virtnet_stuff[size - 1]; ++ nr_nids = get_nid_list(index, nid_array, PROC_MAXVIDS); ++ for (i = 0; i < nr_nids; i++) { ++ int n, nid = nid_array[i]; ++ unsigned int j = PROC_NUMBUF; ++ ++ n = nid; ++ do ++ buf[--j] = '0' + (n % 10); ++ while (n /= 10); ++ ++ if (proc_fill_cache(filp, dirent, filldir, ++ buf + j, PROC_NUMBUF - j, ++ vs_proc_instantiate, nid, p)) ++ goto out; ++ pos++; ++ } ++ } ++out: ++ filp->f_pos = pos; ++ return 0; ++} ++ ++static int proc_virtnet_getattr(struct vfsmount *mnt, ++ struct dentry *dentry, struct kstat *stat) ++{ ++ struct inode *inode = dentry->d_inode; ++ ++ generic_fillattr(inode, stat); ++ stat->nlink = 2 + atomic_read(&nx_global_cactive); ++ return 0; ++} ++ ++static struct file_operations proc_virtnet_dir_operations = { ++ .read = generic_read_dir, ++ .readdir = proc_virtnet_readdir, ++}; ++ ++static struct inode_operations proc_virtnet_dir_inode_operations = { ++ .getattr = proc_virtnet_getattr, ++ .lookup = proc_virtnet_lookup, ++}; ++ ++ ++ ++void proc_vx_init(void) ++{ ++ struct proc_dir_entry *ent; ++ ++ ent = proc_mkdir("virtual", 0); ++ if (ent) { ++ ent->proc_fops = &proc_virtual_dir_operations; ++ ent->proc_iops = &proc_virtual_dir_inode_operations; ++ } ++ proc_virtual = ent; ++ ++ ent = proc_mkdir("virtnet", 0); ++ if (ent) { ++ ent->proc_fops = &proc_virtnet_dir_operations; ++ ent->proc_iops = &proc_virtnet_dir_inode_operations; ++ } ++ proc_virtnet = ent; ++} ++ ++ ++ ++ ++/* per pid info */ ++ ++ ++int proc_pid_vx_info(struct task_struct *p, char *buffer) ++{ ++ struct vx_info *vxi; ++ char *orig = buffer; ++ ++ buffer += sprintf(buffer, "XID:\t%d\n", vx_task_xid(p)); ++ ++ vxi = task_get_vx_info(p); ++ if (!vxi) ++ goto out; ++ ++ buffer += sprintf(buffer, "BCaps:\t"); ++ buffer = print_cap_t(buffer, &vxi->vx_bcaps); ++ buffer += sprintf(buffer, "\n"); ++ buffer += sprintf(buffer, "CCaps:\t%016llx\n", ++ (unsigned long long)vxi->vx_ccaps); ++ buffer += sprintf(buffer, "CFlags:\t%016llx\n", ++ (unsigned long long)vxi->vx_flags); ++ buffer += sprintf(buffer, "CIPid:\t%d\n", vxi->vx_initpid); ++ ++ put_vx_info(vxi); ++out: ++ return buffer - orig; ++} ++ ++ ++int proc_pid_nx_info(struct task_struct *p, char *buffer) ++{ ++ struct nx_info *nxi; ++ struct nx_addr_v4 *v4a; ++#ifdef CONFIG_IPV6 ++ struct nx_addr_v6 *v6a; ++#endif ++ char *orig = buffer; ++ int i; ++ ++ buffer += sprintf(buffer, "NID:\t%d\n", nx_task_nid(p)); ++ ++ nxi = task_get_nx_info(p); ++ if (!nxi) ++ goto out; ++ ++ buffer += sprintf(buffer, "NCaps:\t%016llx\n", ++ (unsigned long long)nxi->nx_ncaps); ++ buffer += sprintf(buffer, "NFlags:\t%016llx\n", ++ (unsigned long long)nxi->nx_flags); ++ ++ buffer += sprintf(buffer, ++ "V4Root[bcast]:\t" NIPQUAD_FMT "\n", ++ NIPQUAD(nxi->v4_bcast.s_addr)); ++ buffer += sprintf (buffer, ++ "V4Root[lback]:\t" NIPQUAD_FMT "\n", ++ NIPQUAD(nxi->v4_lback.s_addr)); ++ if (!NX_IPV4(nxi)) ++ goto skip_v4; ++ for (i = 0, v4a = &nxi->v4; v4a; i++, v4a = v4a->next) ++ buffer += sprintf(buffer, "V4Root[%d]:\t" NXAV4_FMT "\n", ++ i, NXAV4(v4a)); ++skip_v4: ++#ifdef 
CONFIG_IPV6 ++ if (!NX_IPV6(nxi)) ++ goto skip_v6; ++ for (i = 0, v6a = &nxi->v6; v6a; i++, v6a = v6a->next) ++ buffer += sprintf(buffer, "V6Root[%d]:\t" NXAV6_FMT "\n", ++ i, NXAV6(v6a)); ++skip_v6: ++#endif ++ put_nx_info(nxi); ++out: ++ return buffer - orig; ++} ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/sched.c linux-3.2.34-vs2.3.2.15/kernel/vserver/sched.c +--- linux-3.2.34/kernel/vserver/sched.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/sched.c 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,82 @@ ++/* ++ * linux/kernel/vserver/sched.c ++ * ++ * Virtual Server: Scheduler Support ++ * ++ * Copyright (C) 2004-2010 Herbert Pötzl ++ * ++ * V0.01 adapted Sam Vilains version to 2.6.3 ++ * V0.02 removed legacy interface ++ * V0.03 changed vcmds to vxi arg ++ * V0.04 removed older and legacy interfaces ++ * V0.05 removed scheduler code/commands ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include ++ ++ ++void vx_update_sched_param(struct _vx_sched *sched, ++ struct _vx_sched_pc *sched_pc) ++{ ++ sched_pc->prio_bias = sched->prio_bias; ++} ++ ++static int do_set_prio_bias(struct vx_info *vxi, struct vcmd_prio_bias *data) ++{ ++ int cpu; ++ ++ if (data->prio_bias > MAX_PRIO_BIAS) ++ data->prio_bias = MAX_PRIO_BIAS; ++ if (data->prio_bias < MIN_PRIO_BIAS) ++ data->prio_bias = MIN_PRIO_BIAS; ++ ++ if (data->cpu_id != ~0) { ++ vxi->sched.update = cpumask_of_cpu(data->cpu_id); ++ cpus_and(vxi->sched.update, cpu_online_map, ++ vxi->sched.update); ++ } else ++ vxi->sched.update = cpu_online_map; ++ ++ for_each_cpu_mask(cpu, vxi->sched.update) ++ vx_update_sched_param(&vxi->sched, ++ &vx_per_cpu(vxi, sched_pc, cpu)); ++ return 0; ++} ++ ++int vc_set_prio_bias(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_prio_bias vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_prio_bias(vxi, &vc_data); ++} ++ ++int vc_get_prio_bias(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_prio_bias vc_data; ++ struct _vx_sched_pc *pcd; ++ int cpu; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ cpu = vc_data.cpu_id; ++ ++ if (!cpu_possible(cpu)) ++ return -EINVAL; ++ ++ pcd = &vx_per_cpu(vxi, sched_pc, cpu); ++ vc_data.prio_bias = pcd->prio_bias; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/sched_init.h linux-3.2.34-vs2.3.2.15/kernel/vserver/sched_init.h +--- linux-3.2.34/kernel/vserver/sched_init.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/sched_init.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,27 @@ ++ ++static inline void vx_info_init_sched(struct _vx_sched *sched) ++{ ++ /* scheduling; hard code starting values as constants */ ++ sched->prio_bias = 0; ++} ++ ++static inline ++void vx_info_init_sched_pc(struct _vx_sched_pc *sched_pc, int cpu) ++{ ++ sched_pc->prio_bias = 0; ++ ++ sched_pc->user_ticks = 0; ++ sched_pc->sys_ticks = 0; ++ sched_pc->hold_ticks = 0; ++} ++ ++static inline void vx_info_exit_sched(struct _vx_sched *sched) ++{ ++ return; ++} ++ ++static inline ++void vx_info_exit_sched_pc(struct _vx_sched_pc *sched_pc, int cpu) ++{ ++ return; ++} +diff -NurpP --minimal linux-3.2.34/kernel/vserver/sched_proc.h linux-3.2.34-vs2.3.2.15/kernel/vserver/sched_proc.h +--- linux-3.2.34/kernel/vserver/sched_proc.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/sched_proc.h 
2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,32 @@ ++#ifndef _VX_SCHED_PROC_H ++#define _VX_SCHED_PROC_H ++ ++ ++static inline ++int vx_info_proc_sched(struct _vx_sched *sched, char *buffer) ++{ ++ int length = 0; ++ ++ length += sprintf(buffer, ++ "PrioBias:\t%8d\n", ++ sched->prio_bias); ++ return length; ++} ++ ++static inline ++int vx_info_proc_sched_pc(struct _vx_sched_pc *sched_pc, ++ char *buffer, int cpu) ++{ ++ int length = 0; ++ ++ length += sprintf(buffer + length, ++ "cpu %d: %lld %lld %lld", cpu, ++ (unsigned long long)sched_pc->user_ticks, ++ (unsigned long long)sched_pc->sys_ticks, ++ (unsigned long long)sched_pc->hold_ticks); ++ length += sprintf(buffer + length, ++ " %d\n", sched_pc->prio_bias); ++ return length; ++} ++ ++#endif /* _VX_SCHED_PROC_H */ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/signal.c linux-3.2.34-vs2.3.2.15/kernel/vserver/signal.c +--- linux-3.2.34/kernel/vserver/signal.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/signal.c 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,134 @@ ++/* ++ * linux/kernel/vserver/signal.c ++ * ++ * Virtual Server: Signal Support ++ * ++ * Copyright (C) 2003-2007 Herbert Pötzl ++ * ++ * V0.01 broken out from vcontext V0.05 ++ * V0.02 changed vcmds to vxi arg ++ * V0.03 adjusted siginfo for kill ++ * ++ */ ++ ++#include ++ ++#include ++#include ++#include ++ ++ ++int vx_info_kill(struct vx_info *vxi, int pid, int sig) ++{ ++ int retval, count = 0; ++ struct task_struct *p; ++ struct siginfo *sip = SEND_SIG_PRIV; ++ ++ retval = -ESRCH; ++ vxdprintk(VXD_CBIT(misc, 4), ++ "vx_info_kill(%p[#%d],%d,%d)*", ++ vxi, vxi->vx_id, pid, sig); ++ read_lock(&tasklist_lock); ++ switch (pid) { ++ case 0: ++ case -1: ++ for_each_process(p) { ++ int err = 0; ++ ++ if (vx_task_xid(p) != vxi->vx_id || p->pid <= 1 || ++ (pid && vxi->vx_initpid == p->pid)) ++ continue; ++ ++ err = group_send_sig_info(sig, sip, p); ++ ++count; ++ if (err != -EPERM) ++ retval = err; ++ } ++ break; ++ ++ case 1: ++ if (vxi->vx_initpid) { ++ pid = vxi->vx_initpid; ++ /* for now, only SIGINT to private init ... */ ++ if (!vx_info_flags(vxi, VXF_STATE_ADMIN, 0) && ++ /* ... 
as long as there are tasks left */ ++ (atomic_read(&vxi->vx_tasks) > 1)) ++ sig = SIGINT; ++ } ++ /* fallthrough */ ++ default: ++ rcu_read_lock(); ++ p = find_task_by_real_pid(pid); ++ rcu_read_unlock(); ++ if (p) { ++ if (vx_task_xid(p) == vxi->vx_id) ++ retval = group_send_sig_info(sig, sip, p); ++ } ++ break; ++ } ++ read_unlock(&tasklist_lock); ++ vxdprintk(VXD_CBIT(misc, 4), ++ "vx_info_kill(%p[#%d],%d,%d,%ld) = %d", ++ vxi, vxi->vx_id, pid, sig, (long)sip, retval); ++ return retval; ++} ++ ++int vc_ctx_kill(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_kill_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ /* special check to allow guest shutdown */ ++ if (!vx_info_flags(vxi, VXF_STATE_ADMIN, 0) && ++ /* forbid killall pid=0 when init is present */ ++ (((vc_data.pid < 1) && vxi->vx_initpid) || ++ (vc_data.pid > 1))) ++ return -EACCES; ++ ++ return vx_info_kill(vxi, vc_data.pid, vc_data.sig); ++} ++ ++ ++static int __wait_exit(struct vx_info *vxi) ++{ ++ DECLARE_WAITQUEUE(wait, current); ++ int ret = 0; ++ ++ add_wait_queue(&vxi->vx_wait, &wait); ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++wait: ++ if (vx_info_state(vxi, ++ VXS_SHUTDOWN | VXS_HASHED | VXS_HELPER) == VXS_SHUTDOWN) ++ goto out; ++ if (signal_pending(current)) { ++ ret = -ERESTARTSYS; ++ goto out; ++ } ++ schedule(); ++ goto wait; ++ ++out: ++ set_current_state(TASK_RUNNING); ++ remove_wait_queue(&vxi->vx_wait, &wait); ++ return ret; ++} ++ ++ ++ ++int vc_wait_exit(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_wait_exit_v0 vc_data; ++ int ret; ++ ++ ret = __wait_exit(vxi); ++ vc_data.reboot_cmd = vxi->reboot_cmd; ++ vc_data.exit_code = vxi->exit_code; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ ret = -EFAULT; ++ return ret; ++} ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/space.c linux-3.2.34-vs2.3.2.15/kernel/vserver/space.c +--- linux-3.2.34/kernel/vserver/space.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/space.c 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,435 @@ ++/* ++ * linux/kernel/vserver/space.c ++ * ++ * Virtual Server: Context Space Support ++ * ++ * Copyright (C) 2003-2010 Herbert Pötzl ++ * ++ * V0.01 broken out from context.c 0.07 ++ * V0.02 added task locking for namespace ++ * V0.03 broken out vx_enter_namespace ++ * V0.04 added *space support and commands ++ * V0.05 added credential support ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++atomic_t vs_global_nsproxy = ATOMIC_INIT(0); ++atomic_t vs_global_fs = ATOMIC_INIT(0); ++atomic_t vs_global_mnt_ns = ATOMIC_INIT(0); ++atomic_t vs_global_uts_ns = ATOMIC_INIT(0); ++atomic_t vs_global_user_ns = ATOMIC_INIT(0); ++atomic_t vs_global_pid_ns = ATOMIC_INIT(0); ++ ++ ++/* namespace functions */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++ ++static const struct vcmd_space_mask_v1 space_mask_v0 = { ++ .mask = CLONE_FS | ++ CLONE_NEWNS | ++#ifdef CONFIG_UTS_NS ++ CLONE_NEWUTS | ++#endif ++#ifdef CONFIG_IPC_NS ++ CLONE_NEWIPC | ++#endif ++#ifdef CONFIG_USER_NS ++ CLONE_NEWUSER | ++#endif ++ 0 ++}; ++ ++static const struct vcmd_space_mask_v1 space_mask = { ++ .mask = CLONE_FS | ++ CLONE_NEWNS | ++#ifdef CONFIG_UTS_NS ++ CLONE_NEWUTS | ++#endif ++#ifdef CONFIG_IPC_NS ++ CLONE_NEWIPC | ++#endif ++#ifdef CONFIG_USER_NS ++ CLONE_NEWUSER | ++#endif ++#ifdef CONFIG_PID_NS ++ CLONE_NEWPID | ++#endif ++#ifdef CONFIG_NET_NS ++ CLONE_NEWNET | 
++#endif ++ 0 ++}; ++ ++static const struct vcmd_space_mask_v1 default_space_mask = { ++ .mask = CLONE_FS | ++ CLONE_NEWNS | ++#ifdef CONFIG_UTS_NS ++ CLONE_NEWUTS | ++#endif ++#ifdef CONFIG_IPC_NS ++ CLONE_NEWIPC | ++#endif ++#ifdef CONFIG_USER_NS ++ CLONE_NEWUSER | ++#endif ++#ifdef CONFIG_PID_NS ++// CLONE_NEWPID | ++#endif ++ 0 ++}; ++ ++/* ++ * build a new nsproxy mix ++ * assumes that both proxies are 'const' ++ * does not touch nsproxy refcounts ++ * will hold a reference on the result. ++ */ ++ ++struct nsproxy *vs_mix_nsproxy(struct nsproxy *old_nsproxy, ++ struct nsproxy *new_nsproxy, unsigned long mask) ++{ ++ struct mnt_namespace *old_ns; ++ struct uts_namespace *old_uts; ++ struct ipc_namespace *old_ipc; ++#ifdef CONFIG_PID_NS ++ struct pid_namespace *old_pid; ++#endif ++#ifdef CONFIG_NET_NS ++ struct net *old_net; ++#endif ++ struct nsproxy *nsproxy; ++ ++ nsproxy = copy_nsproxy(old_nsproxy); ++ if (!nsproxy) ++ goto out; ++ ++ if (mask & CLONE_NEWNS) { ++ old_ns = nsproxy->mnt_ns; ++ nsproxy->mnt_ns = new_nsproxy->mnt_ns; ++ if (nsproxy->mnt_ns) ++ get_mnt_ns(nsproxy->mnt_ns); ++ } else ++ old_ns = NULL; ++ ++ if (mask & CLONE_NEWUTS) { ++ old_uts = nsproxy->uts_ns; ++ nsproxy->uts_ns = new_nsproxy->uts_ns; ++ if (nsproxy->uts_ns) ++ get_uts_ns(nsproxy->uts_ns); ++ } else ++ old_uts = NULL; ++ ++ if (mask & CLONE_NEWIPC) { ++ old_ipc = nsproxy->ipc_ns; ++ nsproxy->ipc_ns = new_nsproxy->ipc_ns; ++ if (nsproxy->ipc_ns) ++ get_ipc_ns(nsproxy->ipc_ns); ++ } else ++ old_ipc = NULL; ++ ++#ifdef CONFIG_PID_NS ++ if (mask & CLONE_NEWPID) { ++ old_pid = nsproxy->pid_ns; ++ nsproxy->pid_ns = new_nsproxy->pid_ns; ++ if (nsproxy->pid_ns) ++ get_pid_ns(nsproxy->pid_ns); ++ } else ++ old_pid = NULL; ++#endif ++#ifdef CONFIG_NET_NS ++ if (mask & CLONE_NEWNET) { ++ old_net = nsproxy->net_ns; ++ nsproxy->net_ns = new_nsproxy->net_ns; ++ if (nsproxy->net_ns) ++ get_net(nsproxy->net_ns); ++ } else ++ old_net = NULL; ++#endif ++ if (old_ns) ++ put_mnt_ns(old_ns); ++ if (old_uts) ++ put_uts_ns(old_uts); ++ if (old_ipc) ++ put_ipc_ns(old_ipc); ++#ifdef CONFIG_PID_NS ++ if (old_pid) ++ put_pid_ns(old_pid); ++#endif ++#ifdef CONFIG_NET_NS ++ if (old_net) ++ put_net(old_net); ++#endif ++out: ++ return nsproxy; ++} ++ ++ ++/* ++ * merge two nsproxy structs into a new one. ++ * will hold a reference on the result. ++ */ ++ ++static inline ++struct nsproxy *__vs_merge_nsproxy(struct nsproxy *old, ++ struct nsproxy *proxy, unsigned long mask) ++{ ++ struct nsproxy null_proxy = { .mnt_ns = NULL }; ++ ++ if (!proxy) ++ return NULL; ++ ++ if (mask) { ++ /* vs_mix_nsproxy returns with reference */ ++ return vs_mix_nsproxy(old ? 
old : &null_proxy, ++ proxy, mask); ++ } ++ get_nsproxy(proxy); ++ return proxy; ++} ++ ++ ++int vx_enter_space(struct vx_info *vxi, unsigned long mask, unsigned index) ++{ ++ struct nsproxy *proxy, *proxy_cur, *proxy_new; ++ struct fs_struct *fs_cur, *fs = NULL; ++ struct _vx_space *space; ++ int ret, kill = 0; ++ ++ vxdprintk(VXD_CBIT(space, 8), "vx_enter_space(%p[#%u],0x%08lx,%d)", ++ vxi, vxi->vx_id, mask, index); ++ ++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0)) ++ return -EACCES; ++ ++ if (index >= VX_SPACES) ++ return -EINVAL; ++ ++ space = &vxi->space[index]; ++ ++ if (!mask) ++ mask = space->vx_nsmask; ++ ++ if ((mask & space->vx_nsmask) != mask) ++ return -EINVAL; ++ ++ if (mask & CLONE_FS) { ++ fs = copy_fs_struct(space->vx_fs); ++ if (!fs) ++ return -ENOMEM; ++ } ++ proxy = space->vx_nsproxy; ++ ++ vxdprintk(VXD_CBIT(space, 9), ++ "vx_enter_space(%p[#%u],0x%08lx,%d) -> (%p,%p)", ++ vxi, vxi->vx_id, mask, index, proxy, fs); ++ ++ task_lock(current); ++ fs_cur = current->fs; ++ ++ if (mask & CLONE_FS) { ++ spin_lock(&fs_cur->lock); ++ current->fs = fs; ++ kill = !--fs_cur->users; ++ spin_unlock(&fs_cur->lock); ++ } ++ ++ proxy_cur = current->nsproxy; ++ get_nsproxy(proxy_cur); ++ task_unlock(current); ++ ++ if (kill) ++ free_fs_struct(fs_cur); ++ ++ proxy_new = __vs_merge_nsproxy(proxy_cur, proxy, mask); ++ if (IS_ERR(proxy_new)) { ++ ret = PTR_ERR(proxy_new); ++ goto out_put; ++ } ++ ++ proxy_new = xchg(¤t->nsproxy, proxy_new); ++ ++ if (mask & CLONE_NEWUSER) { ++ struct cred *cred; ++ ++ vxdprintk(VXD_CBIT(space, 10), ++ "vx_enter_space(%p[#%u],%p) cred (%p,%p)", ++ vxi, vxi->vx_id, space->vx_cred, ++ current->real_cred, current->cred); ++ ++ if (space->vx_cred) { ++ cred = __prepare_creds(space->vx_cred); ++ if (cred) ++ commit_creds(cred); ++ } ++ } ++ ++ ret = 0; ++ ++ if (proxy_new) ++ put_nsproxy(proxy_new); ++out_put: ++ if (proxy_cur) ++ put_nsproxy(proxy_cur); ++ return ret; ++} ++ ++ ++int vx_set_space(struct vx_info *vxi, unsigned long mask, unsigned index) ++{ ++ struct nsproxy *proxy_vxi, *proxy_cur, *proxy_new; ++ struct fs_struct *fs_vxi, *fs; ++ struct _vx_space *space; ++ int ret, kill = 0; ++ ++ vxdprintk(VXD_CBIT(space, 8), "vx_set_space(%p[#%u],0x%08lx,%d)", ++ vxi, vxi->vx_id, mask, index); ++ ++ if ((mask & space_mask.mask) != mask) ++ return -EINVAL; ++ ++ if (index >= VX_SPACES) ++ return -EINVAL; ++ ++ space = &vxi->space[index]; ++ ++ proxy_vxi = space->vx_nsproxy; ++ fs_vxi = space->vx_fs; ++ ++ if (mask & CLONE_FS) { ++ fs = copy_fs_struct(current->fs); ++ if (!fs) ++ return -ENOMEM; ++ } ++ ++ task_lock(current); ++ ++ if (mask & CLONE_FS) { ++ spin_lock(&fs_vxi->lock); ++ space->vx_fs = fs; ++ kill = !--fs_vxi->users; ++ spin_unlock(&fs_vxi->lock); ++ } ++ ++ proxy_cur = current->nsproxy; ++ get_nsproxy(proxy_cur); ++ task_unlock(current); ++ ++ if (kill) ++ free_fs_struct(fs_vxi); ++ ++ proxy_new = __vs_merge_nsproxy(proxy_vxi, proxy_cur, mask); ++ if (IS_ERR(proxy_new)) { ++ ret = PTR_ERR(proxy_new); ++ goto out_put; ++ } ++ ++ proxy_new = xchg(&space->vx_nsproxy, proxy_new); ++ space->vx_nsmask |= mask; ++ ++ if (mask & CLONE_NEWUSER) { ++ struct cred *cred; ++ ++ vxdprintk(VXD_CBIT(space, 10), ++ "vx_set_space(%p[#%u],%p) cred (%p,%p)", ++ vxi, vxi->vx_id, space->vx_cred, ++ current->real_cred, current->cred); ++ ++ cred = prepare_creds(); ++ cred = (struct cred *)xchg(&space->vx_cred, cred); ++ if (cred) ++ abort_creds(cred); ++ } ++ ++ ret = 0; ++ ++ if (proxy_new) ++ put_nsproxy(proxy_new); ++out_put: ++ if (proxy_cur) ++ 
put_nsproxy(proxy_cur); ++ return ret; ++} ++ ++ ++int vc_enter_space_v1(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_space_mask_v1 vc_data = { .mask = 0 }; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return vx_enter_space(vxi, vc_data.mask, 0); ++} ++ ++int vc_enter_space(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_space_mask_v2 vc_data = { .mask = 0 }; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ if (vc_data.index >= VX_SPACES) ++ return -EINVAL; ++ ++ return vx_enter_space(vxi, vc_data.mask, vc_data.index); ++} ++ ++int vc_set_space_v1(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_space_mask_v1 vc_data = { .mask = 0 }; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return vx_set_space(vxi, vc_data.mask, 0); ++} ++ ++int vc_set_space(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_space_mask_v2 vc_data = { .mask = 0 }; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ if (vc_data.index >= VX_SPACES) ++ return -EINVAL; ++ ++ return vx_set_space(vxi, vc_data.mask, vc_data.index); ++} ++ ++int vc_get_space_mask(void __user *data, int type) ++{ ++ const struct vcmd_space_mask_v1 *mask; ++ ++ if (type == 0) ++ mask = &space_mask_v0; ++ else if (type == 1) ++ mask = &space_mask; ++ else ++ mask = &default_space_mask; ++ ++ vxdprintk(VXD_CBIT(space, 10), ++ "vc_get_space_mask(%d) = %08llx", type, mask->mask); ++ ++ if (copy_to_user(data, mask, sizeof(*mask))) ++ return -EFAULT; ++ return 0; ++} ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/switch.c linux-3.2.34-vs2.3.2.15/kernel/vserver/switch.c +--- linux-3.2.34/kernel/vserver/switch.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/switch.c 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,556 @@ ++/* ++ * linux/kernel/vserver/switch.c ++ * ++ * Virtual Server: Syscall Switch ++ * ++ * Copyright (C) 2003-2011 Herbert Pötzl ++ * ++ * V0.01 syscall switch ++ * V0.02 added signal to context ++ * V0.03 added rlimit functions ++ * V0.04 added iattr, task/xid functions ++ * V0.05 added debug/history stuff ++ * V0.06 added compat32 layer ++ * V0.07 vcmd args and perms ++ * V0.08 added status commands ++ * V0.09 added tag commands ++ * V0.10 added oom bias ++ * V0.11 added device commands ++ * V0.12 added warn mask ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "vci_config.h" ++ ++ ++static inline ++int vc_get_version(uint32_t id) ++{ ++ return VCI_VERSION; ++} ++ ++static inline ++int vc_get_vci(uint32_t id) ++{ ++ return vci_kernel_config(); ++} ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++ ++#ifdef CONFIG_COMPAT ++#define __COMPAT(name, id, data, compat) \ ++ (compat) ? name ## _x32(id, data) : name(id, data) ++#define __COMPAT_NO_ID(name, data, compat) \ ++ (compat) ? 
name ## _x32(data) : name(data) ++#else ++#define __COMPAT(name, id, data, compat) \ ++ name(id, data) ++#define __COMPAT_NO_ID(name, data, compat) \ ++ name(data) ++#endif ++ ++ ++static inline ++long do_vcmd(uint32_t cmd, uint32_t id, ++ struct vx_info *vxi, struct nx_info *nxi, ++ void __user *data, int compat) ++{ ++ switch (cmd) { ++ ++ case VCMD_get_version: ++ return vc_get_version(id); ++ case VCMD_get_vci: ++ return vc_get_vci(id); ++ ++ case VCMD_task_xid: ++ return vc_task_xid(id); ++ case VCMD_vx_info: ++ return vc_vx_info(vxi, data); ++ ++ case VCMD_task_nid: ++ return vc_task_nid(id); ++ case VCMD_nx_info: ++ return vc_nx_info(nxi, data); ++ ++ case VCMD_task_tag: ++ return vc_task_tag(id); ++ ++ case VCMD_set_space_v1: ++ return vc_set_space_v1(vxi, data); ++ /* this is version 2 */ ++ case VCMD_set_space: ++ return vc_set_space(vxi, data); ++ ++ case VCMD_get_space_mask_v0: ++ return vc_get_space_mask(data, 0); ++ /* this is version 1 */ ++ case VCMD_get_space_mask: ++ return vc_get_space_mask(data, 1); ++ ++ case VCMD_get_space_default: ++ return vc_get_space_mask(data, -1); ++ ++ case VCMD_set_umask: ++ return vc_set_umask(vxi, data); ++ ++ case VCMD_get_umask: ++ return vc_get_umask(vxi, data); ++ ++ case VCMD_set_wmask: ++ return vc_set_wmask(vxi, data); ++ ++ case VCMD_get_wmask: ++ return vc_get_wmask(vxi, data); ++#ifdef CONFIG_IA32_EMULATION ++ case VCMD_get_rlimit: ++ return __COMPAT(vc_get_rlimit, vxi, data, compat); ++ case VCMD_set_rlimit: ++ return __COMPAT(vc_set_rlimit, vxi, data, compat); ++#else ++ case VCMD_get_rlimit: ++ return vc_get_rlimit(vxi, data); ++ case VCMD_set_rlimit: ++ return vc_set_rlimit(vxi, data); ++#endif ++ case VCMD_get_rlimit_mask: ++ return vc_get_rlimit_mask(id, data); ++ case VCMD_reset_hits: ++ return vc_reset_hits(vxi, data); ++ case VCMD_reset_minmax: ++ return vc_reset_minmax(vxi, data); ++ ++ case VCMD_get_vhi_name: ++ return vc_get_vhi_name(vxi, data); ++ case VCMD_set_vhi_name: ++ return vc_set_vhi_name(vxi, data); ++ ++ case VCMD_ctx_stat: ++ return vc_ctx_stat(vxi, data); ++ case VCMD_virt_stat: ++ return vc_virt_stat(vxi, data); ++ case VCMD_sock_stat: ++ return vc_sock_stat(vxi, data); ++ case VCMD_rlimit_stat: ++ return vc_rlimit_stat(vxi, data); ++ ++ case VCMD_set_cflags: ++ return vc_set_cflags(vxi, data); ++ case VCMD_get_cflags: ++ return vc_get_cflags(vxi, data); ++ ++ /* this is version 1 */ ++ case VCMD_set_ccaps: ++ return vc_set_ccaps(vxi, data); ++ /* this is version 1 */ ++ case VCMD_get_ccaps: ++ return vc_get_ccaps(vxi, data); ++ case VCMD_set_bcaps: ++ return vc_set_bcaps(vxi, data); ++ case VCMD_get_bcaps: ++ return vc_get_bcaps(vxi, data); ++ ++ case VCMD_set_badness: ++ return vc_set_badness(vxi, data); ++ case VCMD_get_badness: ++ return vc_get_badness(vxi, data); ++ ++ case VCMD_set_nflags: ++ return vc_set_nflags(nxi, data); ++ case VCMD_get_nflags: ++ return vc_get_nflags(nxi, data); ++ ++ case VCMD_set_ncaps: ++ return vc_set_ncaps(nxi, data); ++ case VCMD_get_ncaps: ++ return vc_get_ncaps(nxi, data); ++ ++ case VCMD_set_prio_bias: ++ return vc_set_prio_bias(vxi, data); ++ case VCMD_get_prio_bias: ++ return vc_get_prio_bias(vxi, data); ++ case VCMD_add_dlimit: ++ return __COMPAT(vc_add_dlimit, id, data, compat); ++ case VCMD_rem_dlimit: ++ return __COMPAT(vc_rem_dlimit, id, data, compat); ++ case VCMD_set_dlimit: ++ return __COMPAT(vc_set_dlimit, id, data, compat); ++ case VCMD_get_dlimit: ++ return __COMPAT(vc_get_dlimit, id, data, compat); ++ ++ case VCMD_ctx_kill: ++ return vc_ctx_kill(vxi, 
data); ++ ++ case VCMD_wait_exit: ++ return vc_wait_exit(vxi, data); ++ ++ case VCMD_get_iattr: ++ return __COMPAT_NO_ID(vc_get_iattr, data, compat); ++ case VCMD_set_iattr: ++ return __COMPAT_NO_ID(vc_set_iattr, data, compat); ++ ++ case VCMD_fget_iattr: ++ return vc_fget_iattr(id, data); ++ case VCMD_fset_iattr: ++ return vc_fset_iattr(id, data); ++ ++ case VCMD_enter_space_v0: ++ return vc_enter_space_v1(vxi, NULL); ++ case VCMD_enter_space_v1: ++ return vc_enter_space_v1(vxi, data); ++ /* this is version 2 */ ++ case VCMD_enter_space: ++ return vc_enter_space(vxi, data); ++ ++ case VCMD_ctx_create_v0: ++ return vc_ctx_create(id, NULL); ++ case VCMD_ctx_create: ++ return vc_ctx_create(id, data); ++ case VCMD_ctx_migrate_v0: ++ return vc_ctx_migrate(vxi, NULL); ++ case VCMD_ctx_migrate: ++ return vc_ctx_migrate(vxi, data); ++ ++ case VCMD_net_create_v0: ++ return vc_net_create(id, NULL); ++ case VCMD_net_create: ++ return vc_net_create(id, data); ++ case VCMD_net_migrate: ++ return vc_net_migrate(nxi, data); ++ ++ case VCMD_tag_migrate: ++ return vc_tag_migrate(id); ++ ++ case VCMD_net_add: ++ return vc_net_add(nxi, data); ++ case VCMD_net_remove: ++ return vc_net_remove(nxi, data); ++ ++ case VCMD_net_add_ipv4_v1: ++ return vc_net_add_ipv4_v1(nxi, data); ++ /* this is version 2 */ ++ case VCMD_net_add_ipv4: ++ return vc_net_add_ipv4(nxi, data); ++ ++ case VCMD_net_rem_ipv4_v1: ++ return vc_net_rem_ipv4_v1(nxi, data); ++ /* this is version 2 */ ++ case VCMD_net_rem_ipv4: ++ return vc_net_rem_ipv4(nxi, data); ++#ifdef CONFIG_IPV6 ++ case VCMD_net_add_ipv6: ++ return vc_net_add_ipv6(nxi, data); ++ case VCMD_net_remove_ipv6: ++ return vc_net_remove_ipv6(nxi, data); ++#endif ++/* case VCMD_add_match_ipv4: ++ return vc_add_match_ipv4(nxi, data); ++ case VCMD_get_match_ipv4: ++ return vc_get_match_ipv4(nxi, data); ++#ifdef CONFIG_IPV6 ++ case VCMD_add_match_ipv6: ++ return vc_add_match_ipv6(nxi, data); ++ case VCMD_get_match_ipv6: ++ return vc_get_match_ipv6(nxi, data); ++#endif */ ++ ++#ifdef CONFIG_VSERVER_DEVICE ++ case VCMD_set_mapping: ++ return __COMPAT(vc_set_mapping, vxi, data, compat); ++ case VCMD_unset_mapping: ++ return __COMPAT(vc_unset_mapping, vxi, data, compat); ++#endif ++#ifdef CONFIG_VSERVER_HISTORY ++ case VCMD_dump_history: ++ return vc_dump_history(id); ++ case VCMD_read_history: ++ return __COMPAT(vc_read_history, id, data, compat); ++#endif ++ default: ++ vxwprintk_task(1, "unimplemented VCMD_%02d_%d[%d]", ++ VC_CATEGORY(cmd), VC_COMMAND(cmd), VC_VERSION(cmd)); ++ } ++ return -ENOSYS; ++} ++ ++ ++#define __VCMD(vcmd, _perm, _args, _flags) \ ++ case VCMD_ ## vcmd: perm = _perm; \ ++ args = _args; flags = _flags; break ++ ++ ++#define VCA_NONE 0x00 ++#define VCA_VXI 0x01 ++#define VCA_NXI 0x02 ++ ++#define VCF_NONE 0x00 ++#define VCF_INFO 0x01 ++#define VCF_ADMIN 0x02 ++#define VCF_ARES 0x06 /* includes admin */ ++#define VCF_SETUP 0x08 ++ ++#define VCF_ZIDOK 0x10 /* zero id okay */ ++ ++ ++static inline ++long do_vserver(uint32_t cmd, uint32_t id, void __user *data, int compat) ++{ ++ long ret; ++ int permit = -1, state = 0; ++ int perm = -1, args = 0, flags = 0; ++ struct vx_info *vxi = NULL; ++ struct nx_info *nxi = NULL; ++ ++ switch (cmd) { ++ /* unpriviledged commands */ ++ __VCMD(get_version, 0, VCA_NONE, 0); ++ __VCMD(get_vci, 0, VCA_NONE, 0); ++ __VCMD(get_rlimit_mask, 0, VCA_NONE, 0); ++ __VCMD(get_space_mask_v0,0, VCA_NONE, 0); ++ __VCMD(get_space_mask, 0, VCA_NONE, 0); ++ __VCMD(get_space_default,0, VCA_NONE, 0); ++ ++ /* info commands */ ++ __VCMD(task_xid, 
2, VCA_NONE, 0); ++ __VCMD(reset_hits, 2, VCA_VXI, 0); ++ __VCMD(reset_minmax, 2, VCA_VXI, 0); ++ __VCMD(vx_info, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_bcaps, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_ccaps, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_cflags, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_umask, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_wmask, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_badness, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_vhi_name, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_rlimit, 3, VCA_VXI, VCF_INFO); ++ ++ __VCMD(ctx_stat, 3, VCA_VXI, VCF_INFO); ++ __VCMD(virt_stat, 3, VCA_VXI, VCF_INFO); ++ __VCMD(sock_stat, 3, VCA_VXI, VCF_INFO); ++ __VCMD(rlimit_stat, 3, VCA_VXI, VCF_INFO); ++ ++ __VCMD(task_nid, 2, VCA_NONE, 0); ++ __VCMD(nx_info, 3, VCA_NXI, VCF_INFO); ++ __VCMD(get_ncaps, 3, VCA_NXI, VCF_INFO); ++ __VCMD(get_nflags, 3, VCA_NXI, VCF_INFO); ++ ++ __VCMD(task_tag, 2, VCA_NONE, 0); ++ ++ __VCMD(get_iattr, 2, VCA_NONE, 0); ++ __VCMD(fget_iattr, 2, VCA_NONE, 0); ++ __VCMD(get_dlimit, 3, VCA_NONE, VCF_INFO); ++ __VCMD(get_prio_bias, 3, VCA_VXI, VCF_INFO); ++ ++ /* lower admin commands */ ++ __VCMD(wait_exit, 4, VCA_VXI, VCF_INFO); ++ __VCMD(ctx_create_v0, 5, VCA_NONE, 0); ++ __VCMD(ctx_create, 5, VCA_NONE, 0); ++ __VCMD(ctx_migrate_v0, 5, VCA_VXI, VCF_ADMIN); ++ __VCMD(ctx_migrate, 5, VCA_VXI, VCF_ADMIN); ++ __VCMD(enter_space_v0, 5, VCA_VXI, VCF_ADMIN); ++ __VCMD(enter_space_v1, 5, VCA_VXI, VCF_ADMIN); ++ __VCMD(enter_space, 5, VCA_VXI, VCF_ADMIN); ++ ++ __VCMD(net_create_v0, 5, VCA_NONE, 0); ++ __VCMD(net_create, 5, VCA_NONE, 0); ++ __VCMD(net_migrate, 5, VCA_NXI, VCF_ADMIN); ++ ++ __VCMD(tag_migrate, 5, VCA_NONE, VCF_ADMIN); ++ ++ /* higher admin commands */ ++ __VCMD(ctx_kill, 6, VCA_VXI, VCF_ARES); ++ __VCMD(set_space_v1, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_space, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ ++ __VCMD(set_ccaps, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_bcaps, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_cflags, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_umask, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_wmask, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_badness, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ ++ __VCMD(set_vhi_name, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_rlimit, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_prio_bias, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ ++ __VCMD(set_ncaps, 7, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_nflags, 7, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_add, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_remove, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_add_ipv4_v1, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_rem_ipv4_v1, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_add_ipv4, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_rem_ipv4, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++#ifdef CONFIG_IPV6 ++ __VCMD(net_add_ipv6, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_remove_ipv6, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++#endif ++ __VCMD(set_iattr, 7, VCA_NONE, 0); ++ __VCMD(fset_iattr, 7, VCA_NONE, 0); ++ __VCMD(set_dlimit, 7, VCA_NONE, VCF_ARES); ++ __VCMD(add_dlimit, 8, VCA_NONE, VCF_ARES); ++ __VCMD(rem_dlimit, 8, VCA_NONE, VCF_ARES); ++ ++#ifdef CONFIG_VSERVER_DEVICE ++ __VCMD(set_mapping, 8, VCA_VXI, VCF_ARES|VCF_ZIDOK); ++ __VCMD(unset_mapping, 8, VCA_VXI, VCF_ARES|VCF_ZIDOK); ++#endif ++ /* debug level admin commands */ ++#ifdef CONFIG_VSERVER_HISTORY ++ __VCMD(dump_history, 9, VCA_NONE, 0); ++ __VCMD(read_history, 9, VCA_NONE, 0); ++#endif ++ ++ default: ++ perm = -1; ++ } ++ ++ 
vxdprintk(VXD_CBIT(switch, 0), ++ "vc: VCMD_%02d_%d[%d], %d,%p [%d,%d,%x,%x]", ++ VC_CATEGORY(cmd), VC_COMMAND(cmd), ++ VC_VERSION(cmd), id, data, compat, ++ perm, args, flags); ++ ++ ret = -ENOSYS; ++ if (perm < 0) ++ goto out; ++ ++ state = 1; ++ if (!capable(CAP_CONTEXT)) ++ goto out; ++ ++ state = 2; ++ /* moved here from the individual commands */ ++ ret = -EPERM; ++ if ((perm > 1) && !capable(CAP_SYS_ADMIN)) ++ goto out; ++ ++ state = 3; ++ /* vcmd involves resource management */ ++ ret = -EPERM; ++ if ((flags & VCF_ARES) && !capable(CAP_SYS_RESOURCE)) ++ goto out; ++ ++ state = 4; ++ /* various legacy exceptions */ ++ switch (cmd) { ++ /* will go away when spectator is a cap */ ++ case VCMD_ctx_migrate_v0: ++ case VCMD_ctx_migrate: ++ if (id == 1) { ++ current->xid = 1; ++ ret = 1; ++ goto out; ++ } ++ break; ++ ++ /* will go away when spectator is a cap */ ++ case VCMD_net_migrate: ++ if (id == 1) { ++ current->nid = 1; ++ ret = 1; ++ goto out; ++ } ++ break; ++ } ++ ++ /* vcmds are fine by default */ ++ permit = 1; ++ ++ /* admin type vcmds require admin ... */ ++ if (flags & VCF_ADMIN) ++ permit = vx_check(0, VS_ADMIN) ? 1 : 0; ++ ++ /* ... but setup type vcmds override that */ ++ if (!permit && (flags & VCF_SETUP)) ++ permit = vx_flags(VXF_STATE_SETUP, 0) ? 2 : 0; ++ ++ state = 5; ++ ret = -EPERM; ++ if (!permit) ++ goto out; ++ ++ state = 6; ++ if (!id && (flags & VCF_ZIDOK)) ++ goto skip_id; ++ ++ ret = -ESRCH; ++ if (args & VCA_VXI) { ++ vxi = lookup_vx_info(id); ++ if (!vxi) ++ goto out; ++ ++ if ((flags & VCF_ADMIN) && ++ /* special case kill for shutdown */ ++ (cmd != VCMD_ctx_kill) && ++ /* can context be administrated? */ ++ !vx_info_flags(vxi, VXF_STATE_ADMIN, 0)) { ++ ret = -EACCES; ++ goto out_vxi; ++ } ++ } ++ state = 7; ++ if (args & VCA_NXI) { ++ nxi = lookup_nx_info(id); ++ if (!nxi) ++ goto out_vxi; ++ ++ if ((flags & VCF_ADMIN) && ++ /* can context be administrated? 
*/ ++ !nx_info_flags(nxi, NXF_STATE_ADMIN, 0)) { ++ ret = -EACCES; ++ goto out_nxi; ++ } ++ } ++skip_id: ++ state = 8; ++ ret = do_vcmd(cmd, id, vxi, nxi, data, compat); ++ ++out_nxi: ++ if ((args & VCA_NXI) && nxi) ++ put_nx_info(nxi); ++out_vxi: ++ if ((args & VCA_VXI) && vxi) ++ put_vx_info(vxi); ++out: ++ vxdprintk(VXD_CBIT(switch, 1), ++ "vc: VCMD_%02d_%d[%d] = %08lx(%ld) [%d,%d]", ++ VC_CATEGORY(cmd), VC_COMMAND(cmd), ++ VC_VERSION(cmd), ret, ret, state, permit); ++ return ret; ++} ++ ++asmlinkage long ++sys_vserver(uint32_t cmd, uint32_t id, void __user *data) ++{ ++ return do_vserver(cmd, id, data, 0); ++} ++ ++#ifdef CONFIG_COMPAT ++ ++asmlinkage long ++sys32_vserver(uint32_t cmd, uint32_t id, void __user *data) ++{ ++ return do_vserver(cmd, id, data, 1); ++} ++ ++#endif /* CONFIG_COMPAT */ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/sysctl.c linux-3.2.34-vs2.3.2.15/kernel/vserver/sysctl.c +--- linux-3.2.34/kernel/vserver/sysctl.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/sysctl.c 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,247 @@ ++/* ++ * kernel/vserver/sysctl.c ++ * ++ * Virtual Context Support ++ * ++ * Copyright (C) 2004-2007 Herbert Pötzl ++ * ++ * V0.01 basic structure ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++enum { ++ CTL_DEBUG_ERROR = 0, ++ CTL_DEBUG_SWITCH = 1, ++ CTL_DEBUG_XID, ++ CTL_DEBUG_NID, ++ CTL_DEBUG_TAG, ++ CTL_DEBUG_NET, ++ CTL_DEBUG_LIMIT, ++ CTL_DEBUG_CRES, ++ CTL_DEBUG_DLIM, ++ CTL_DEBUG_QUOTA, ++ CTL_DEBUG_CVIRT, ++ CTL_DEBUG_SPACE, ++ CTL_DEBUG_PERM, ++ CTL_DEBUG_MISC, ++}; ++ ++ ++unsigned int vs_debug_switch = 0; ++unsigned int vs_debug_xid = 0; ++unsigned int vs_debug_nid = 0; ++unsigned int vs_debug_tag = 0; ++unsigned int vs_debug_net = 0; ++unsigned int vs_debug_limit = 0; ++unsigned int vs_debug_cres = 0; ++unsigned int vs_debug_dlim = 0; ++unsigned int vs_debug_quota = 0; ++unsigned int vs_debug_cvirt = 0; ++unsigned int vs_debug_space = 0; ++unsigned int vs_debug_perm = 0; ++unsigned int vs_debug_misc = 0; ++ ++ ++static struct ctl_table_header *vserver_table_header; ++static ctl_table vserver_root_table[]; ++ ++ ++void vserver_register_sysctl(void) ++{ ++ if (!vserver_table_header) { ++ vserver_table_header = register_sysctl_table(vserver_root_table); ++ } ++ ++} ++ ++void vserver_unregister_sysctl(void) ++{ ++ if (vserver_table_header) { ++ unregister_sysctl_table(vserver_table_header); ++ vserver_table_header = NULL; ++ } ++} ++ ++ ++static int proc_dodebug(ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ char tmpbuf[20], *p, c; ++ unsigned int value; ++ size_t left, len; ++ ++ if ((*ppos && !write) || !*lenp) { ++ *lenp = 0; ++ return 0; ++ } ++ ++ left = *lenp; ++ ++ if (write) { ++ if (!access_ok(VERIFY_READ, buffer, left)) ++ return -EFAULT; ++ p = (char *)buffer; ++ while (left && __get_user(c, p) >= 0 && isspace(c)) ++ left--, p++; ++ if (!left) ++ goto done; ++ ++ if (left > sizeof(tmpbuf) - 1) ++ return -EINVAL; ++ if (copy_from_user(tmpbuf, p, left)) ++ return -EFAULT; ++ tmpbuf[left] = '\0'; ++ ++ for (p = tmpbuf, value = 0; '0' <= *p && *p <= '9'; p++, left--) ++ value = 10 * value + (*p - '0'); ++ if (*p && !isspace(*p)) ++ return -EINVAL; ++ while (left && isspace(*p)) ++ left--, p++; ++ *(unsigned int *)table->data = value; ++ } else { ++ if (!access_ok(VERIFY_WRITE, buffer, left)) ++ return -EFAULT; ++ len = sprintf(tmpbuf, "%d", *(unsigned int *)table->data); ++ if (len > left) ++ len = left; ++ if 
(__copy_to_user(buffer, tmpbuf, len)) ++ return -EFAULT; ++ if ((left -= len) > 0) { ++ if (put_user('\n', (char *)buffer + len)) ++ return -EFAULT; ++ left--; ++ } ++ } ++ ++done: ++ *lenp -= left; ++ *ppos += *lenp; ++ return 0; ++} ++ ++static int zero; ++ ++#define CTL_ENTRY(ctl, name) \ ++ { \ ++ .procname = #name, \ ++ .data = &vs_ ## name, \ ++ .maxlen = sizeof(int), \ ++ .mode = 0644, \ ++ .proc_handler = &proc_dodebug, \ ++ .extra1 = &zero, \ ++ .extra2 = &zero, \ ++ } ++ ++static ctl_table vserver_debug_table[] = { ++ CTL_ENTRY(CTL_DEBUG_SWITCH, debug_switch), ++ CTL_ENTRY(CTL_DEBUG_XID, debug_xid), ++ CTL_ENTRY(CTL_DEBUG_NID, debug_nid), ++ CTL_ENTRY(CTL_DEBUG_TAG, debug_tag), ++ CTL_ENTRY(CTL_DEBUG_NET, debug_net), ++ CTL_ENTRY(CTL_DEBUG_LIMIT, debug_limit), ++ CTL_ENTRY(CTL_DEBUG_CRES, debug_cres), ++ CTL_ENTRY(CTL_DEBUG_DLIM, debug_dlim), ++ CTL_ENTRY(CTL_DEBUG_QUOTA, debug_quota), ++ CTL_ENTRY(CTL_DEBUG_CVIRT, debug_cvirt), ++ CTL_ENTRY(CTL_DEBUG_SPACE, debug_space), ++ CTL_ENTRY(CTL_DEBUG_PERM, debug_perm), ++ CTL_ENTRY(CTL_DEBUG_MISC, debug_misc), ++ { 0 } ++}; ++ ++static ctl_table vserver_root_table[] = { ++ { ++ .procname = "vserver", ++ .mode = 0555, ++ .child = vserver_debug_table ++ }, ++ { 0 } ++}; ++ ++ ++static match_table_t tokens = { ++ { CTL_DEBUG_SWITCH, "switch=%x" }, ++ { CTL_DEBUG_XID, "xid=%x" }, ++ { CTL_DEBUG_NID, "nid=%x" }, ++ { CTL_DEBUG_TAG, "tag=%x" }, ++ { CTL_DEBUG_NET, "net=%x" }, ++ { CTL_DEBUG_LIMIT, "limit=%x" }, ++ { CTL_DEBUG_CRES, "cres=%x" }, ++ { CTL_DEBUG_DLIM, "dlim=%x" }, ++ { CTL_DEBUG_QUOTA, "quota=%x" }, ++ { CTL_DEBUG_CVIRT, "cvirt=%x" }, ++ { CTL_DEBUG_SPACE, "space=%x" }, ++ { CTL_DEBUG_PERM, "perm=%x" }, ++ { CTL_DEBUG_MISC, "misc=%x" }, ++ { CTL_DEBUG_ERROR, NULL } ++}; ++ ++#define HANDLE_CASE(id, name, val) \ ++ case CTL_DEBUG_ ## id: \ ++ vs_debug_ ## name = val; \ ++ printk("vs_debug_" #name "=0x%x\n", val); \ ++ break ++ ++ ++static int __init vs_debug_setup(char *str) ++{ ++ char *p; ++ int token; ++ ++ printk("vs_debug_setup(%s)\n", str); ++ while ((p = strsep(&str, ",")) != NULL) { ++ substring_t args[MAX_OPT_ARGS]; ++ unsigned int value; ++ ++ if (!*p) ++ continue; ++ ++ token = match_token(p, tokens, args); ++ value = (token > 0) ? 
simple_strtoul(args[0].from, NULL, 0) : 0; ++ ++ switch (token) { ++ HANDLE_CASE(SWITCH, switch, value); ++ HANDLE_CASE(XID, xid, value); ++ HANDLE_CASE(NID, nid, value); ++ HANDLE_CASE(TAG, tag, value); ++ HANDLE_CASE(NET, net, value); ++ HANDLE_CASE(LIMIT, limit, value); ++ HANDLE_CASE(CRES, cres, value); ++ HANDLE_CASE(DLIM, dlim, value); ++ HANDLE_CASE(QUOTA, quota, value); ++ HANDLE_CASE(CVIRT, cvirt, value); ++ HANDLE_CASE(SPACE, space, value); ++ HANDLE_CASE(PERM, perm, value); ++ HANDLE_CASE(MISC, misc, value); ++ default: ++ return -EINVAL; ++ break; ++ } ++ } ++ return 1; ++} ++ ++__setup("vsdebug=", vs_debug_setup); ++ ++ ++ ++EXPORT_SYMBOL_GPL(vs_debug_switch); ++EXPORT_SYMBOL_GPL(vs_debug_xid); ++EXPORT_SYMBOL_GPL(vs_debug_nid); ++EXPORT_SYMBOL_GPL(vs_debug_net); ++EXPORT_SYMBOL_GPL(vs_debug_limit); ++EXPORT_SYMBOL_GPL(vs_debug_cres); ++EXPORT_SYMBOL_GPL(vs_debug_dlim); ++EXPORT_SYMBOL_GPL(vs_debug_quota); ++EXPORT_SYMBOL_GPL(vs_debug_cvirt); ++EXPORT_SYMBOL_GPL(vs_debug_space); ++EXPORT_SYMBOL_GPL(vs_debug_perm); ++EXPORT_SYMBOL_GPL(vs_debug_misc); ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/tag.c linux-3.2.34-vs2.3.2.15/kernel/vserver/tag.c +--- linux-3.2.34/kernel/vserver/tag.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/tag.c 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,63 @@ ++/* ++ * linux/kernel/vserver/tag.c ++ * ++ * Virtual Server: Shallow Tag Space ++ * ++ * Copyright (C) 2007 Herbert Pötzl ++ * ++ * V0.01 basic implementation ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++ ++ ++int dx_migrate_task(struct task_struct *p, tag_t tag) ++{ ++ if (!p) ++ BUG(); ++ ++ vxdprintk(VXD_CBIT(tag, 5), ++ "dx_migrate_task(%p[#%d],#%d)", p, p->tag, tag); ++ ++ task_lock(p); ++ p->tag = tag; ++ task_unlock(p); ++ ++ vxdprintk(VXD_CBIT(tag, 5), ++ "moved task %p into [#%d]", p, tag); ++ return 0; ++} ++ ++/* vserver syscall commands below here */ ++ ++/* taks xid and vx_info functions */ ++ ++ ++int vc_task_tag(uint32_t id) ++{ ++ tag_t tag; ++ ++ if (id) { ++ struct task_struct *tsk; ++ rcu_read_lock(); ++ tsk = find_task_by_real_pid(id); ++ tag = (tsk) ? 
tsk->tag : -ESRCH; ++ rcu_read_unlock(); ++ } else ++ tag = dx_current_tag(); ++ return tag; ++} ++ ++ ++int vc_tag_migrate(uint32_t tag) ++{ ++ return dx_migrate_task(current, tag & 0xFFFF); ++} ++ ++ +diff -NurpP --minimal linux-3.2.34/kernel/vserver/vci_config.h linux-3.2.34-vs2.3.2.15/kernel/vserver/vci_config.h +--- linux-3.2.34/kernel/vserver/vci_config.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/kernel/vserver/vci_config.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,76 @@ ++ ++/* interface version */ ++ ++#define VCI_VERSION 0x00020308 ++ ++ ++enum { ++ VCI_KCBIT_NO_DYNAMIC = 0, ++ ++ VCI_KCBIT_PROC_SECURE = 4, ++ /* VCI_KCBIT_HARDCPU = 5, */ ++ /* VCI_KCBIT_IDLELIMIT = 6, */ ++ /* VCI_KCBIT_IDLETIME = 7, */ ++ ++ VCI_KCBIT_COWBL = 8, ++ VCI_KCBIT_FULLCOWBL = 9, ++ VCI_KCBIT_SPACES = 10, ++ VCI_KCBIT_NETV2 = 11, ++ VCI_KCBIT_MEMCG = 12, ++ ++ VCI_KCBIT_DEBUG = 16, ++ VCI_KCBIT_HISTORY = 20, ++ VCI_KCBIT_TAGGED = 24, ++ VCI_KCBIT_PPTAG = 28, ++ ++ VCI_KCBIT_MORE = 31, ++}; ++ ++ ++static inline uint32_t vci_kernel_config(void) ++{ ++ return ++ (1 << VCI_KCBIT_NO_DYNAMIC) | ++ ++ /* configured features */ ++#ifdef CONFIG_VSERVER_PROC_SECURE ++ (1 << VCI_KCBIT_PROC_SECURE) | ++#endif ++#ifdef CONFIG_VSERVER_COWBL ++ (1 << VCI_KCBIT_COWBL) | ++ (1 << VCI_KCBIT_FULLCOWBL) | ++#endif ++ (1 << VCI_KCBIT_SPACES) | ++ (1 << VCI_KCBIT_NETV2) | ++#ifdef CONFIG_CGROUP_MEM_RES_CTLR ++ (1 << VCI_KCBIT_MEMCG) | ++#endif ++ ++ /* debug options */ ++#ifdef CONFIG_VSERVER_DEBUG ++ (1 << VCI_KCBIT_DEBUG) | ++#endif ++#ifdef CONFIG_VSERVER_HISTORY ++ (1 << VCI_KCBIT_HISTORY) | ++#endif ++ ++ /* inode context tagging */ ++#if defined(CONFIG_TAGGING_NONE) ++ (0 << VCI_KCBIT_TAGGED) | ++#elif defined(CONFIG_TAGGING_UID16) ++ (1 << VCI_KCBIT_TAGGED) | ++#elif defined(CONFIG_TAGGING_GID16) ++ (2 << VCI_KCBIT_TAGGED) | ++#elif defined(CONFIG_TAGGING_ID24) ++ (3 << VCI_KCBIT_TAGGED) | ++#elif defined(CONFIG_TAGGING_INTERN) ++ (4 << VCI_KCBIT_TAGGED) | ++#elif defined(CONFIG_TAGGING_RUNTIME) ++ (5 << VCI_KCBIT_TAGGED) | ++#else ++ (7 << VCI_KCBIT_TAGGED) | ++#endif ++ (1 << VCI_KCBIT_PPTAG) | ++ 0; ++} ++ +diff -NurpP --minimal linux-3.2.34/mm/memcontrol.c linux-3.2.34-vs2.3.2.15/mm/memcontrol.c +--- linux-3.2.34/mm/memcontrol.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/mm/memcontrol.c 2012-06-14 20:45:24.000000000 +0200 +@@ -766,6 +766,31 @@ struct mem_cgroup *mem_cgroup_from_task( + struct mem_cgroup, css); + } + ++u64 mem_cgroup_res_read_u64(struct mem_cgroup *mem, int member) ++{ ++ return res_counter_read_u64(&mem->res, member); ++} ++ ++u64 mem_cgroup_memsw_read_u64(struct mem_cgroup *mem, int member) ++{ ++ return res_counter_read_u64(&mem->memsw, member); ++} ++ ++s64 mem_cgroup_stat_read_cache(struct mem_cgroup *mem) ++{ ++ return mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); ++} ++ ++s64 mem_cgroup_stat_read_anon(struct mem_cgroup *mem) ++{ ++ return mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); ++} ++ ++s64 mem_cgroup_stat_read_mapped(struct mem_cgroup *mem) ++{ ++ return mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); ++} ++ + struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) + { + struct mem_cgroup *memcg = NULL; +diff -NurpP --minimal linux-3.2.34/mm/oom_kill.c linux-3.2.34-vs2.3.2.15/mm/oom_kill.c +--- linux-3.2.34/mm/oom_kill.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/mm/oom_kill.c 2012-01-09 16:19:31.000000000 +0100 +@@ -33,6 +33,8 @@ + #include + #include + #include ++#include ++#include 
+ + int sysctl_panic_on_oom; + int sysctl_oom_kill_allocating_task; +@@ -148,11 +150,18 @@ struct task_struct *find_lock_task_mm(st + static bool oom_unkillable_task(struct task_struct *p, + const struct mem_cgroup *mem, const nodemask_t *nodemask) + { +- if (is_global_init(p)) ++ unsigned xid = vx_current_xid(); ++ ++ /* skip the init task, global and per guest */ ++ if (task_is_init(p)) + return true; + if (p->flags & PF_KTHREAD) + return true; + ++ /* skip other guest and host processes if oom in guest */ ++ if (xid && vx_task_xid(p) != xid) ++ return true; ++ + /* When mem_cgroup_out_of_memory() and p is not member of the group */ + if (mem && !task_in_mem_cgroup(p, mem)) + return true; +@@ -440,8 +449,8 @@ static int oom_kill_task(struct task_str + /* mm cannot be safely dereferenced after task_unlock(p) */ + mm = p->mm; + +- pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", +- task_pid_nr(p), p->comm, K(p->mm->total_vm), ++ pr_err("Killed process %d:#%u (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", ++ task_pid_nr(p), p->xid, p->comm, K(p->mm->total_vm), + K(get_mm_counter(p->mm, MM_ANONPAGES)), + K(get_mm_counter(p->mm, MM_FILEPAGES))); + task_unlock(p); +@@ -499,8 +508,8 @@ static int oom_kill_process(struct task_ + } + + task_lock(p); +- pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", +- message, task_pid_nr(p), p->comm, points); ++ pr_err("%s: Kill process %d:#%u (%s) score %d or sacrifice child\n", ++ message, task_pid_nr(p), p->xid, p->comm, points); + task_unlock(p); + + /* +@@ -601,6 +610,8 @@ int unregister_oom_notifier(struct notif + } + EXPORT_SYMBOL_GPL(unregister_oom_notifier); + ++long vs_oom_action(unsigned int); ++ + /* + * Try to acquire the OOM killer lock for the zones in zonelist. 
Returns zero + * if a parallel OOM killing is already taking place that includes a zone in +@@ -759,7 +770,12 @@ retry: + if (!p) { + dump_header(NULL, gfp_mask, order, NULL, mpol_mask); + read_unlock(&tasklist_lock); +- panic("Out of memory and no killable processes...\n"); ++ ++ /* avoid panic for guest OOM */ ++ if (current->xid) ++ vs_oom_action(LINUX_REBOOT_CMD_OOM); ++ else ++ panic("Out of memory and no killable processes...\n"); + } + + if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, +diff -NurpP --minimal linux-3.2.34/mm/page_alloc.c linux-3.2.34-vs2.3.2.15/mm/page_alloc.c +--- linux-3.2.34/mm/page_alloc.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/mm/page_alloc.c 2012-10-22 12:59:53.000000000 +0200 +@@ -57,6 +57,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -2527,6 +2529,9 @@ void si_meminfo(struct sysinfo *val) + val->totalhigh = totalhigh_pages; + val->freehigh = nr_free_highpages(); + val->mem_unit = PAGE_SIZE; ++ ++ if (vx_flags(VXF_VIRT_MEM, 0)) ++ vx_vsi_meminfo(val); + } + + EXPORT_SYMBOL(si_meminfo); +@@ -2547,6 +2552,9 @@ void si_meminfo_node(struct sysinfo *val + val->freehigh = 0; + #endif + val->mem_unit = PAGE_SIZE; ++ ++ if (vx_flags(VXF_VIRT_MEM, 0)) ++ vx_vsi_meminfo(val); + } + #endif + +diff -NurpP --minimal linux-3.2.34/mm/pgtable-generic.c linux-3.2.34-vs2.3.2.15/mm/pgtable-generic.c +--- linux-3.2.34/mm/pgtable-generic.c 2011-03-15 18:07:42.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/mm/pgtable-generic.c 2011-12-05 19:33:02.000000000 +0100 +@@ -6,6 +6,8 @@ + * Copyright (C) 2010 Linus Torvalds + */ + ++#include ++ + #include + #include + #include +diff -NurpP --minimal linux-3.2.34/mm/shmem.c linux-3.2.34-vs2.3.2.15/mm/shmem.c +--- linux-3.2.34/mm/shmem.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/mm/shmem.c 2012-11-06 18:08:24.000000000 +0100 +@@ -1461,7 +1461,7 @@ static int shmem_statfs(struct dentry *d + { + struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); + +- buf->f_type = TMPFS_MAGIC; ++ buf->f_type = TMPFS_SUPER_MAGIC; + buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_namelen = NAME_MAX; + if (sbinfo->max_blocks) { +@@ -2220,7 +2220,7 @@ int shmem_fill_super(struct super_block + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; +- sb->s_magic = TMPFS_MAGIC; ++ sb->s_magic = TMPFS_SUPER_MAGIC; + sb->s_op = &shmem_ops; + sb->s_time_gran = 1; + #ifdef CONFIG_TMPFS_XATTR +diff -NurpP --minimal linux-3.2.34/mm/slab.c linux-3.2.34-vs2.3.2.15/mm/slab.c +--- linux-3.2.34/mm/slab.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/mm/slab.c 2012-10-22 12:59:53.000000000 +0200 +@@ -411,6 +411,8 @@ static void kmem_list3_init(struct kmem_ + #define STATS_INC_FREEMISS(x) do { } while (0) + #endif + ++#include "slab_vs.h" ++ + #if DEBUG + + /* +@@ -3400,6 +3402,7 @@ retry: + + obj = slab_get_obj(cachep, slabp, nodeid); + check_slabp(cachep, slabp); ++ vx_slab_alloc(cachep, flags); + l3->free_objects--; + /* move slabp to correct slabp list: */ + list_del(&slabp->list); +@@ -3477,6 +3480,7 @@ __cache_alloc_node(struct kmem_cache *ca + /* ___cache_alloc_node can fall back to other nodes */ + ptr = ____cache_alloc_node(cachep, flags, nodeid); + out: ++ vx_slab_alloc(cachep, flags); + local_irq_restore(save_flags); + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); + kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, +@@ -3664,6 +3668,7 @@ static inline 
void __cache_free(struct k + check_irq_off(); + kmemleak_free_recursive(objp, cachep->flags); + objp = cache_free_debugcheck(cachep, objp, caller); ++ vx_slab_free(cachep); + + kmemcheck_slab_free(cachep, objp, obj_size(cachep)); + +diff -NurpP --minimal linux-3.2.34/mm/slab_vs.h linux-3.2.34-vs2.3.2.15/mm/slab_vs.h +--- linux-3.2.34/mm/slab_vs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/mm/slab_vs.h 2011-12-05 19:33:02.000000000 +0100 +@@ -0,0 +1,29 @@ ++ ++#include ++ ++#include ++ ++static inline ++void vx_slab_alloc(struct kmem_cache *cachep, gfp_t flags) ++{ ++ int what = gfp_zone(cachep->gfpflags); ++ struct vx_info *vxi = current_vx_info(); ++ ++ if (!vxi) ++ return; ++ ++ atomic_add(cachep->buffer_size, &vxi->cacct.slab[what]); ++} ++ ++static inline ++void vx_slab_free(struct kmem_cache *cachep) ++{ ++ int what = gfp_zone(cachep->gfpflags); ++ struct vx_info *vxi = current_vx_info(); ++ ++ if (!vxi) ++ return; ++ ++ atomic_sub(cachep->buffer_size, &vxi->cacct.slab[what]); ++} ++ +diff -NurpP --minimal linux-3.2.34/mm/swapfile.c linux-3.2.34-vs2.3.2.15/mm/swapfile.c +--- linux-3.2.34/mm/swapfile.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/mm/swapfile.c 2012-06-22 19:03:16.000000000 +0200 +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); +@@ -1751,6 +1752,16 @@ static int swap_show(struct seq_file *sw + + if (si == SEQ_START_TOKEN) { + seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); ++ if (vx_flags(VXF_VIRT_MEM, 0)) { ++ struct sysinfo si; ++ ++ vx_vsi_swapinfo(&si); ++ if (si.totalswap < (1 << 10)) ++ return 0; ++ seq_printf(swap, "%s\t\t\t\t\t%s\t%lu\t%lu\t%d\n", ++ "hdv0", "partition", si.totalswap >> 10, ++ (si.totalswap - si.freeswap) >> 10, -1); ++ } + return 0; + } + +@@ -2170,6 +2181,8 @@ void si_swapinfo(struct sysinfo *val) + val->freeswap = nr_swap_pages + nr_to_be_unused; + val->totalswap = total_swap_pages + nr_to_be_unused; + spin_unlock(&swap_lock); ++ if (vx_flags(VXF_VIRT_MEM, 0)) ++ vx_vsi_swapinfo(val); + } + + /* +diff -NurpP --minimal linux-3.2.34/net/bridge/br_multicast.c linux-3.2.34-vs2.3.2.15/net/bridge/br_multicast.c +--- linux-3.2.34/net/bridge/br_multicast.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/bridge/br_multicast.c 2012-05-15 18:16:52.000000000 +0200 +@@ -445,7 +445,7 @@ static struct sk_buff *br_ip6_multicast_ + ip6h->hop_limit = 1; + ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); + if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, +- &ip6h->saddr)) { ++ &ip6h->saddr, NULL)) { + kfree_skb(skb); + return NULL; + } +diff -NurpP --minimal linux-3.2.34/net/core/dev.c linux-3.2.34-vs2.3.2.15/net/core/dev.c +--- linux-3.2.34/net/core/dev.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/core/dev.c 2012-11-18 21:11:16.000000000 +0100 +@@ -127,6 +127,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -623,7 +624,8 @@ struct net_device *__dev_get_by_name(str + struct hlist_head *head = dev_name_hash(net, name); + + hlist_for_each_entry(dev, p, head, name_hlist) +- if (!strncmp(dev->name, name, IFNAMSIZ)) ++ if (!strncmp(dev->name, name, IFNAMSIZ) && ++ nx_dev_visible(current_nx_info(), dev)) + return dev; + + return NULL; +@@ -649,7 +651,8 @@ struct net_device *dev_get_by_name_rcu(s + struct hlist_head *head = dev_name_hash(net, name); + + hlist_for_each_entry_rcu(dev, p, head, 
name_hlist) +- if (!strncmp(dev->name, name, IFNAMSIZ)) ++ if (!strncmp(dev->name, name, IFNAMSIZ) && ++ nx_dev_visible(current_nx_info(), dev)) + return dev; + + return NULL; +@@ -700,7 +703,8 @@ struct net_device *__dev_get_by_index(st + struct hlist_head *head = dev_index_hash(net, ifindex); + + hlist_for_each_entry(dev, p, head, index_hlist) +- if (dev->ifindex == ifindex) ++ if ((dev->ifindex == ifindex) && ++ nx_dev_visible(current_nx_info(), dev)) + return dev; + + return NULL; +@@ -718,7 +722,7 @@ EXPORT_SYMBOL(__dev_get_by_index); + * about locking. The caller must hold RCU lock. + */ + +-struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) ++struct net_device *dev_get_by_index_real_rcu(struct net *net, int ifindex) + { + struct hlist_node *p; + struct net_device *dev; +@@ -730,6 +734,16 @@ struct net_device *dev_get_by_index_rcu( + + return NULL; + } ++EXPORT_SYMBOL(dev_get_by_index_real_rcu); ++ ++struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) ++{ ++ struct net_device *dev = dev_get_by_index_real_rcu(net, ifindex); ++ ++ if (nx_dev_visible(current_nx_info(), dev)) ++ return dev; ++ return NULL; ++} + EXPORT_SYMBOL(dev_get_by_index_rcu); + + +@@ -778,7 +792,8 @@ struct net_device *dev_getbyhwaddr_rcu(s + + for_each_netdev_rcu(net, dev) + if (dev->type == type && +- !memcmp(dev->dev_addr, ha, dev->addr_len)) ++ !memcmp(dev->dev_addr, ha, dev->addr_len) && ++ nx_dev_visible(current_nx_info(), dev)) + return dev; + + return NULL; +@@ -790,9 +805,11 @@ struct net_device *__dev_getfirstbyhwtyp + struct net_device *dev; + + ASSERT_RTNL(); +- for_each_netdev(net, dev) +- if (dev->type == type) ++ for_each_netdev(net, dev) { ++ if ((dev->type == type) && ++ nx_dev_visible(current_nx_info(), dev)) + return dev; ++ } + + return NULL; + } +@@ -910,6 +927,8 @@ static int __dev_alloc_name(struct net * + continue; + if (i < 0 || i >= max_netdevices) + continue; ++ if (!nx_dev_visible(current_nx_info(), d)) ++ continue; + + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, IFNAMSIZ, name, i); +@@ -4071,6 +4090,8 @@ static int dev_ifconf(struct net *net, c + + total = 0; + for_each_netdev(net, dev) { ++ if (!nx_dev_visible(current_nx_info(), dev)) ++ continue; + for (i = 0; i < NPROTO; i++) { + if (gifconf_list[i]) { + int done; +@@ -4173,6 +4194,10 @@ static void dev_seq_printf_stats(struct + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + ++ /* device visible inside network context? 
*/ ++ if (!nx_dev_visible(current_nx_info(), dev)) ++ return; ++ + seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " + "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", + dev->name, stats->rx_bytes, stats->rx_packets, +diff -NurpP --minimal linux-3.2.34/net/core/rtnetlink.c linux-3.2.34-vs2.3.2.15/net/core/rtnetlink.c +--- linux-3.2.34/net/core/rtnetlink.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/core/rtnetlink.c 2012-11-18 21:11:16.000000000 +0100 +@@ -1076,6 +1076,8 @@ static int rtnl_dump_ifinfo(struct sk_bu + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; ++ if (!nx_dev_visible(skb->sk->sk_nx_info, dev)) ++ continue; + if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, 0, +@@ -1958,6 +1960,9 @@ void rtmsg_ifinfo(int type, struct net_d + int err = -ENOBUFS; + size_t if_info_size; + ++ if (!nx_dev_visible(current_nx_info(), dev)) ++ return; ++ + skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL); + if (skb == NULL) + goto errout; +diff -NurpP --minimal linux-3.2.34/net/core/sock.c linux-3.2.34-vs2.3.2.15/net/core/sock.c +--- linux-3.2.34/net/core/sock.c 2012-11-18 18:42:24.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/core/sock.c 2012-10-22 12:59:53.000000000 +0200 +@@ -127,6 +127,10 @@ + #include + + #include ++#include ++#include ++#include ++#include + + #include + +@@ -1067,6 +1071,8 @@ static struct sock *sk_prot_alloc(struct + goto out_free_sec; + sk_tx_queue_clear(sk); + } ++ sock_vx_init(sk); ++ sock_nx_init(sk); + + return sk; + +@@ -1166,6 +1172,11 @@ static void __sk_free(struct sock *sk) + put_cred(sk->sk_peer_cred); + put_pid(sk->sk_peer_pid); + put_net(sock_net(sk)); ++ vx_sock_dec(sk); ++ clr_vx_info(&sk->sk_vx_info); ++ sk->sk_xid = -1; ++ clr_nx_info(&sk->sk_nx_info); ++ sk->sk_nid = -1; + sk_prot_free(sk->sk_prot_creator, sk); + } + +@@ -1213,6 +1224,8 @@ struct sock *sk_clone(const struct sock + + /* SANITY */ + get_net(sock_net(newsk)); ++ sock_vx_init(newsk); ++ sock_nx_init(newsk); + sk_node_init(&newsk->sk_node); + sock_lock_init(newsk); + bh_lock_sock(newsk); +@@ -1269,6 +1282,12 @@ struct sock *sk_clone(const struct sock + smp_wmb(); + atomic_set(&newsk->sk_refcnt, 2); + ++ set_vx_info(&newsk->sk_vx_info, sk->sk_vx_info); ++ newsk->sk_xid = sk->sk_xid; ++ vx_sock_inc(newsk); ++ set_nx_info(&newsk->sk_nx_info, sk->sk_nx_info); ++ newsk->sk_nid = sk->sk_nid; ++ + /* + * Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that +@@ -2019,6 +2038,12 @@ void sock_init_data(struct socket *sock, + + sk->sk_stamp = ktime_set(-1L, 0); + ++ set_vx_info(&sk->sk_vx_info, current_vx_info()); ++ sk->sk_xid = vx_current_xid(); ++ vx_sock_inc(sk); ++ set_nx_info(&sk->sk_nx_info, current_nx_info()); ++ sk->sk_nid = nx_current_nid(); ++ + /* + * Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.txt for details) +diff -NurpP --minimal linux-3.2.34/net/ipv4/af_inet.c linux-3.2.34-vs2.3.2.15/net/ipv4/af_inet.c +--- linux-3.2.34/net/ipv4/af_inet.c 2012-01-09 16:15:03.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/af_inet.c 2012-02-15 03:03:53.000000000 +0100 +@@ -117,6 +117,7 @@ + #ifdef CONFIG_IP_MROUTE + #include + #endif ++#include + + + /* The inetsw table contains everything that inet_create needs to +@@ -326,9 +327,13 @@ lookup_protocol: + } + + err = -EPERM; ++ if ((protocol == IPPROTO_ICMP) && ++ 
nx_capable(CAP_NET_RAW, NXC_RAW_ICMP)) ++ goto override; ++ + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + goto out_rcu_unlock; +- ++override: + err = -EAFNOSUPPORT; + if (!inet_netns_ok(net, protocol)) + goto out_rcu_unlock; +@@ -452,6 +457,7 @@ int inet_bind(struct socket *sock, struc + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); ++ struct nx_v4_sock_addr nsa; + unsigned short snum; + int chk_addr_ret; + int err; +@@ -475,7 +481,11 @@ int inet_bind(struct socket *sock, struc + goto out; + } + +- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); ++ err = v4_map_sock_addr(inet, addr, &nsa); ++ if (err) ++ goto out; ++ ++ chk_addr_ret = inet_addr_type(sock_net(sk), nsa.saddr); + + /* Not specified by any standard per-se, however it breaks too + * many applications when removed. It is unfortunate since +@@ -487,7 +497,7 @@ int inet_bind(struct socket *sock, struc + err = -EADDRNOTAVAIL; + if (!sysctl_ip_nonlocal_bind && + !(inet->freebind || inet->transparent) && +- addr->sin_addr.s_addr != htonl(INADDR_ANY) && ++ nsa.saddr != htonl(INADDR_ANY) && + chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && + chk_addr_ret != RTN_BROADCAST) +@@ -512,7 +522,7 @@ int inet_bind(struct socket *sock, struc + if (sk->sk_state != TCP_CLOSE || inet->inet_num) + goto out_release_sock; + +- inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; ++ v4_set_sock_addr(inet, &nsa); + if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) + inet->inet_saddr = 0; /* Use device */ + +@@ -715,11 +725,13 @@ int inet_getname(struct socket *sock, st + peer == 1)) + return -ENOTCONN; + sin->sin_port = inet->inet_dport; +- sin->sin_addr.s_addr = inet->inet_daddr; ++ sin->sin_addr.s_addr = ++ nx_map_sock_lback(sk->sk_nx_info, inet->inet_daddr); + } else { + __be32 addr = inet->inet_rcv_saddr; + if (!addr) + addr = inet->inet_saddr; ++ addr = nx_map_sock_lback(sk->sk_nx_info, addr); + sin->sin_port = inet->inet_sport; + sin->sin_addr.s_addr = addr; + } +diff -NurpP --minimal linux-3.2.34/net/ipv4/arp.c linux-3.2.34-vs2.3.2.15/net/ipv4/arp.c +--- linux-3.2.34/net/ipv4/arp.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/arp.c 2012-03-01 21:39:38.000000000 +0100 +@@ -1333,6 +1333,7 @@ static void arp_format_neigh_entry(struc + struct net_device *dev = n->dev; + int hatype = dev->type; + ++ /* FIXME: check for network context */ + read_lock(&n->lock); + /* Convert hardware address to XX:XX:XX:XX ... form. */ + #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +@@ -1364,6 +1365,7 @@ static void arp_format_pneigh_entry(stru + int hatype = dev ? dev->type : 0; + char tbuf[16]; + ++ /* FIXME: check for network context */ + sprintf(tbuf, "%pI4", n->key); + seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", + tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", +diff -NurpP --minimal linux-3.2.34/net/ipv4/devinet.c linux-3.2.34-vs2.3.2.15/net/ipv4/devinet.c +--- linux-3.2.34/net/ipv4/devinet.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/devinet.c 2012-02-07 03:14:01.000000000 +0100 +@@ -518,6 +518,7 @@ struct in_device *inetdev_by_index(struc + } + EXPORT_SYMBOL(inetdev_by_index); + ++ + /* Called only from RTNL semaphored context. No locks. 
*/ + + struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, +@@ -759,6 +760,8 @@ int devinet_ioctl(struct net *net, unsig + + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) { ++ struct nx_info *nxi = current_nx_info(); ++ + if (tryaddrmatch) { + /* Matthias Andree */ + /* compare label and address (4.4BSD style) */ +@@ -767,6 +770,8 @@ int devinet_ioctl(struct net *net, unsig + This is checked above. */ + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + ifap = &ifa->ifa_next) { ++ if (!nx_v4_ifa_visible(nxi, ifa)) ++ continue; + if (!strcmp(ifr.ifr_name, ifa->ifa_label) && + sin_orig.sin_addr.s_addr == + ifa->ifa_local) { +@@ -779,9 +784,12 @@ int devinet_ioctl(struct net *net, unsig + comparing just the label */ + if (!ifa) { + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; +- ifap = &ifa->ifa_next) ++ ifap = &ifa->ifa_next) { ++ if (!nx_v4_ifa_visible(nxi, ifa)) ++ continue; + if (!strcmp(ifr.ifr_name, ifa->ifa_label)) + break; ++ } + } + } + +@@ -934,6 +942,8 @@ static int inet_gifconf(struct net_devic + goto out; + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { ++ if (!nx_v4_ifa_visible(current_nx_info(), ifa)) ++ continue; + if (!buf) { + done += sizeof(ifr); + continue; +@@ -1294,6 +1304,7 @@ static int inet_dump_ifaddr(struct sk_bu + struct net_device *dev; + struct in_device *in_dev; + struct in_ifaddr *ifa; ++ struct sock *sk = skb->sk; + struct hlist_head *head; + struct hlist_node *node; + +@@ -1316,6 +1327,8 @@ static int inet_dump_ifaddr(struct sk_bu + + for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; + ifa = ifa->ifa_next, ip_idx++) { ++ if (sk && !nx_v4_ifa_visible(sk->sk_nx_info, ifa)) ++ continue; + if (ip_idx < s_ip_idx) + continue; + if (inet_fill_ifaddr(skb, ifa, +diff -NurpP --minimal linux-3.2.34/net/ipv4/fib_trie.c linux-3.2.34-vs2.3.2.15/net/ipv4/fib_trie.c +--- linux-3.2.34/net/ipv4/fib_trie.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/fib_trie.c 2012-06-14 20:45:24.000000000 +0200 +@@ -2557,6 +2557,7 @@ static int fib_route_seq_show(struct seq + || fa->fa_type == RTN_MULTICAST) + continue; + ++ /* FIXME: check for network context? 
*/ + if (fi) + seq_printf(seq, + "%s\t%08X\t%08X\t%04X\t%d\t%u\t" +diff -NurpP --minimal linux-3.2.34/net/ipv4/inet_connection_sock.c linux-3.2.34-vs2.3.2.15/net/ipv4/inet_connection_sock.c +--- linux-3.2.34/net/ipv4/inet_connection_sock.c 2011-07-22 11:18:13.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/inet_connection_sock.c 2012-02-07 03:13:38.000000000 +0100 +@@ -52,6 +52,37 @@ void inet_get_local_port_range(int *low, + } + EXPORT_SYMBOL(inet_get_local_port_range); + ++int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) ++{ ++ __be32 sk1_rcv_saddr = sk_rcv_saddr(sk1), ++ sk2_rcv_saddr = sk_rcv_saddr(sk2); ++ ++ if (inet_v6_ipv6only(sk2)) ++ return 0; ++ ++ if (sk1_rcv_saddr && ++ sk2_rcv_saddr && ++ sk1_rcv_saddr == sk2_rcv_saddr) ++ return 1; ++ ++ if (sk1_rcv_saddr && ++ !sk2_rcv_saddr && ++ v4_addr_in_nx_info(sk2->sk_nx_info, sk1_rcv_saddr, NXA_MASK_BIND)) ++ return 1; ++ ++ if (sk2_rcv_saddr && ++ !sk1_rcv_saddr && ++ v4_addr_in_nx_info(sk1->sk_nx_info, sk2_rcv_saddr, NXA_MASK_BIND)) ++ return 1; ++ ++ if (!sk1_rcv_saddr && ++ !sk2_rcv_saddr && ++ nx_v4_addr_conflict(sk1->sk_nx_info, sk2->sk_nx_info)) ++ return 1; ++ ++ return 0; ++} ++ + int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) + { +@@ -74,9 +105,7 @@ int inet_csk_bind_conflict(const struct + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + if (!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) { +- const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); +- if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || +- sk2_rcv_saddr == sk_rcv_saddr(sk)) ++ if (ipv4_rcv_saddr_equal(sk, sk2)) + break; + } + } +diff -NurpP --minimal linux-3.2.34/net/ipv4/inet_diag.c linux-3.2.34-vs2.3.2.15/net/ipv4/inet_diag.c +--- linux-3.2.34/net/ipv4/inet_diag.c 2012-01-09 16:15:03.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/inet_diag.c 2012-02-15 03:03:53.000000000 +0100 +@@ -33,6 +33,8 @@ + #include + + #include ++#include ++#include + + static const struct inet_diag_handler **inet_diag_table; + +@@ -119,8 +121,10 @@ static int inet_csk_diag_fill(struct soc + + r->id.idiag_sport = inet->inet_sport; + r->id.idiag_dport = inet->inet_dport; +- r->id.idiag_src[0] = inet->inet_rcv_saddr; +- r->id.idiag_dst[0] = inet->inet_daddr; ++ r->id.idiag_src[0] = nx_map_sock_lback(sk->sk_nx_info, ++ inet->inet_rcv_saddr); ++ r->id.idiag_dst[0] = nx_map_sock_lback(sk->sk_nx_info, ++ inet->inet_daddr); + + /* IPv6 dual-stack sockets use inet->tos for IPv4 connections, + * hence this needs to be included regardless of socket family. 
+@@ -214,8 +218,8 @@ static int inet_twsk_diag_fill(struct in + r->id.idiag_cookie[1] = (u32)(((unsigned long)tw >> 31) >> 1); + r->id.idiag_sport = tw->tw_sport; + r->id.idiag_dport = tw->tw_dport; +- r->id.idiag_src[0] = tw->tw_rcv_saddr; +- r->id.idiag_dst[0] = tw->tw_daddr; ++ r->id.idiag_src[0] = nx_map_sock_lback(tw->tw_nx_info, tw->tw_rcv_saddr); ++ r->id.idiag_dst[0] = nx_map_sock_lback(tw->tw_nx_info, tw->tw_daddr); + r->idiag_state = tw->tw_substate; + r->idiag_timer = 3; + r->idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ); +@@ -272,6 +276,7 @@ static int inet_diag_get_exact(struct sk + err = -EINVAL; + + if (req->idiag_family == AF_INET) { ++ /* TODO: lback */ + sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0], + req->id.idiag_dport, req->id.idiag_src[0], + req->id.idiag_sport, req->id.idiag_if); +@@ -514,6 +519,7 @@ static int inet_csk_diag_dump(struct soc + } else + #endif + { ++ /* TODO: lback */ + entry.saddr = &inet->inet_rcv_saddr; + entry.daddr = &inet->inet_daddr; + } +@@ -552,6 +558,7 @@ static int inet_twsk_diag_dump(struct in + } else + #endif + { ++ /* TODO: lback */ + entry.saddr = &tw->tw_rcv_saddr; + entry.daddr = &tw->tw_daddr; + } +@@ -598,8 +605,8 @@ static int inet_diag_fill_req(struct sk_ + + r->id.idiag_sport = inet->inet_sport; + r->id.idiag_dport = ireq->rmt_port; +- r->id.idiag_src[0] = ireq->loc_addr; +- r->id.idiag_dst[0] = ireq->rmt_addr; ++ r->id.idiag_src[0] = nx_map_sock_lback(sk->sk_nx_info, ireq->loc_addr); ++ r->id.idiag_dst[0] = nx_map_sock_lback(sk->sk_nx_info, ireq->rmt_addr); + r->idiag_expires = jiffies_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; +@@ -670,6 +677,7 @@ static int inet_diag_dump_reqs(struct sk + continue; + + if (bc) { ++ /* TODO: lback */ + entry.saddr = + #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + (entry.family == AF_INET6) ? +@@ -740,6 +748,8 @@ static int inet_diag_dump(struct sk_buff + sk_nulls_for_each(sk, node, &ilb->head) { + struct inet_sock *inet = inet_sk(sk); + ++ if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (num < s_num) { + num++; + continue; +@@ -806,6 +816,8 @@ skip_listen_ht: + sk_nulls_for_each(sk, node, &head->chain) { + struct inet_sock *inet = inet_sk(sk); + ++ if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (num < s_num) + goto next_normal; + if (!(r->idiag_states & (1 << sk->sk_state))) +@@ -830,6 +842,8 @@ next_normal: + inet_twsk_for_each(tw, node, + &head->twchain) { + ++ if (!nx_check(tw->tw_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (num < s_num) + goto next_dying; + if (r->id.idiag_sport != tw->tw_sport && +diff -NurpP --minimal linux-3.2.34/net/ipv4/inet_hashtables.c linux-3.2.34-vs2.3.2.15/net/ipv4/inet_hashtables.c +--- linux-3.2.34/net/ipv4/inet_hashtables.c 2011-10-24 18:45:34.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/inet_hashtables.c 2011-12-05 19:33:02.000000000 +0100 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -156,6 +157,11 @@ static inline int compute_score(struct s + if (rcv_saddr != daddr) + return -1; + score += 2; ++ } else { ++ /* block non nx_info ips */ ++ if (!v4_addr_in_nx_info(sk->sk_nx_info, ++ daddr, NXA_MASK_BIND)) ++ return -1; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) +@@ -173,7 +179,6 @@ static inline int compute_score(struct s + * wildcarded during the search since they can never be otherwise. 
+ */ + +- + struct sock *__inet_lookup_listener(struct net *net, + struct inet_hashinfo *hashinfo, + const __be32 daddr, const unsigned short hnum, +@@ -196,6 +201,7 @@ begin: + hiscore = score; + } + } ++ + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. +diff -NurpP --minimal linux-3.2.34/net/ipv4/netfilter/nf_nat_helper.c linux-3.2.34-vs2.3.2.15/net/ipv4/netfilter/nf_nat_helper.c +--- linux-3.2.34/net/ipv4/netfilter/nf_nat_helper.c 2011-07-22 11:18:13.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/netfilter/nf_nat_helper.c 2011-12-05 19:33:02.000000000 +0100 +@@ -20,6 +20,7 @@ + #include + + #include ++#include + #include + #include + #include +diff -NurpP --minimal linux-3.2.34/net/ipv4/netfilter.c linux-3.2.34-vs2.3.2.15/net/ipv4/netfilter.c +--- linux-3.2.34/net/ipv4/netfilter.c 2012-01-09 16:15:03.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/netfilter.c 2012-02-07 03:13:38.000000000 +0100 +@@ -6,7 +6,7 @@ + #include + #include + #include +-#include ++// #include + #include + #include + #include +diff -NurpP --minimal linux-3.2.34/net/ipv4/raw.c linux-3.2.34-vs2.3.2.15/net/ipv4/raw.c +--- linux-3.2.34/net/ipv4/raw.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/raw.c 2012-10-22 12:59:53.000000000 +0200 +@@ -118,7 +118,7 @@ static struct sock *__raw_v4_lookup(stru + + if (net_eq(sock_net(sk), net) && inet->inet_num == num && + !(inet->inet_daddr && inet->inet_daddr != raddr) && +- !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && ++ v4_sock_addr_match(sk->sk_nx_info, inet, laddr) && + !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + goto found; /* gotcha */ + } +@@ -388,6 +388,12 @@ static int raw_send_hdrinc(struct sock * + icmp_out_count(net, ((struct icmphdr *) + skb_transport_header(skb))->type); + ++ err = -EPERM; ++ if (!nx_check(0, VS_ADMIN) && !capable(CAP_NET_RAW) && ++ sk->sk_nx_info && ++ !v4_addr_in_nx_info(sk->sk_nx_info, iph->saddr, NXA_MASK_BIND)) ++ goto error_free; ++ + err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, + rt->dst.dev, dst_output); + if (err > 0) +@@ -575,6 +581,16 @@ static int raw_sendmsg(struct kiocb *ioc + goto done; + } + ++ if (sk->sk_nx_info) { ++ rt = ip_v4_find_src(sock_net(sk), sk->sk_nx_info, &fl4); ++ if (IS_ERR(rt)) { ++ err = PTR_ERR(rt); ++ rt = NULL; ++ goto done; ++ } ++ ip_rt_put(rt); ++ } ++ + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) { +@@ -651,17 +667,19 @@ static int raw_bind(struct sock *sk, str + { + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; ++ struct nx_v4_sock_addr nsa = { 0 }; + int ret = -EINVAL; + int chk_addr_ret; + + if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) + goto out; +- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); ++ v4_map_sock_addr(inet, addr, &nsa); ++ chk_addr_ret = inet_addr_type(sock_net(sk), nsa.saddr); + ret = -EADDRNOTAVAIL; +- if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && ++ if (nsa.saddr && chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) + goto out; +- inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; ++ v4_set_sock_addr(inet, &nsa); + if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) + inet->inet_saddr = 0; /* Use device */ + sk_dst_reset(sk); +@@ -713,7 +731,8 @@ static int raw_recvmsg(struct kiocb 
*ioc + /* Copy the address. */ + if (sin) { + sin->sin_family = AF_INET; +- sin->sin_addr.s_addr = ip_hdr(skb)->saddr; ++ sin->sin_addr.s_addr = ++ nx_map_sock_lback(sk->sk_nx_info, ip_hdr(skb)->saddr); + sin->sin_port = 0; + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + } +@@ -909,7 +928,8 @@ static struct sock *raw_get_first(struct + struct hlist_node *node; + + sk_for_each(sk, node, &state->h->ht[state->bucket]) +- if (sock_net(sk) == seq_file_net(seq)) ++ if ((sock_net(sk) == seq_file_net(seq)) && ++ nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) + goto found; + } + sk = NULL; +@@ -925,7 +945,8 @@ static struct sock *raw_get_next(struct + sk = sk_next(sk); + try_again: + ; +- } while (sk && sock_net(sk) != seq_file_net(seq)); ++ } while (sk && ((sock_net(sk) != seq_file_net(seq)) || ++ !nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT))); + + if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { + sk = sk_head(&state->h->ht[state->bucket]); +diff -NurpP --minimal linux-3.2.34/net/ipv4/route.c linux-3.2.34-vs2.3.2.15/net/ipv4/route.c +--- linux-3.2.34/net/ipv4/route.c 2012-01-09 16:15:04.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/route.c 2012-02-07 03:13:38.000000000 +0100 +@@ -2709,7 +2709,7 @@ static struct rtable *ip_route_output_sl + + + if (fl4->flowi4_oif) { +- dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); ++ dev_out = dev_get_by_index_real_rcu(net, fl4->flowi4_oif); + rth = ERR_PTR(-ENODEV); + if (dev_out == NULL) + goto out; +diff -NurpP --minimal linux-3.2.34/net/ipv4/tcp.c linux-3.2.34-vs2.3.2.15/net/ipv4/tcp.c +--- linux-3.2.34/net/ipv4/tcp.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/tcp.c 2012-11-18 21:11:16.000000000 +0100 +@@ -266,6 +266,7 @@ + #include + #include + #include ++#include + + #include + #include +diff -NurpP --minimal linux-3.2.34/net/ipv4/tcp_ipv4.c linux-3.2.34-vs2.3.2.15/net/ipv4/tcp_ipv4.c +--- linux-3.2.34/net/ipv4/tcp_ipv4.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/tcp_ipv4.c 2012-11-06 18:08:24.000000000 +0100 +@@ -2033,6 +2033,12 @@ static void *listening_get_next(struct s + req = req->dl_next; + while (1) { + while (req) { ++ vxdprintk(VXD_CBIT(net, 6), ++ "sk,req: %p [#%d] (from %d)", req->sk, ++ (req->sk)?req->sk->sk_nid:0, nx_current_nid()); ++ if (req->sk && ++ !nx_check(req->sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (req->rsk_ops->family == st->family) { + cur = req; + goto out; +@@ -2057,6 +2063,10 @@ get_req: + } + get_sk: + sk_nulls_for_each_from(sk, node) { ++ vxdprintk(VXD_CBIT(net, 6), "sk: %p [#%d] (from %d)", ++ sk, sk->sk_nid, nx_current_nid()); ++ if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (!net_eq(sock_net(sk), net)) + continue; + if (sk->sk_family == st->family) { +@@ -2133,6 +2143,11 @@ static void *established_get_first(struc + + spin_lock_bh(lock); + sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { ++ vxdprintk(VXD_CBIT(net, 6), ++ "sk,egf: %p [#%d] (from %d)", ++ sk, sk->sk_nid, nx_current_nid()); ++ if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (sk->sk_family != st->family || + !net_eq(sock_net(sk), net)) { + continue; +@@ -2143,6 +2158,11 @@ static void *established_get_first(struc + st->state = TCP_SEQ_STATE_TIME_WAIT; + inet_twsk_for_each(tw, node, + &tcp_hashinfo.ehash[st->bucket].twchain) { ++ vxdprintk(VXD_CBIT(net, 6), ++ "tw: %p [#%d] (from %d)", ++ tw, tw->tw_nid, nx_current_nid()); ++ if (!nx_check(tw->tw_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (tw->tw_family 
!= st->family || + !net_eq(twsk_net(tw), net)) { + continue; +@@ -2172,7 +2192,9 @@ static void *established_get_next(struct + tw = cur; + tw = tw_next(tw); + get_tw: +- while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) { ++ while (tw && (tw->tw_family != st->family || ++ !net_eq(twsk_net(tw), net) || ++ !nx_check(tw->tw_nid, VS_WATCH_P | VS_IDENT))) { + tw = tw_next(tw); + } + if (tw) { +@@ -2196,6 +2218,11 @@ get_tw: + sk = sk_nulls_next(sk); + + sk_nulls_for_each_from(sk, node) { ++ vxdprintk(VXD_CBIT(net, 6), ++ "sk,egn: %p [#%d] (from %d)", ++ sk, sk->sk_nid, nx_current_nid()); ++ if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) + goto found; + } +@@ -2401,9 +2428,9 @@ static void get_openreq4(const struct so + seq_printf(f, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", + i, +- ireq->loc_addr, ++ nx_map_sock_lback(current_nx_info(), ireq->loc_addr), + ntohs(inet_sk(sk)->inet_sport), +- ireq->rmt_addr, ++ nx_map_sock_lback(current_nx_info(), ireq->rmt_addr), + ntohs(ireq->rmt_port), + TCP_SYN_RECV, + 0, 0, /* could print option size, but that is af dependent. */ +@@ -2425,8 +2452,8 @@ static void get_tcp4_sock(struct sock *s + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_sock *inet = inet_sk(sk); +- __be32 dest = inet->inet_daddr; +- __be32 src = inet->inet_rcv_saddr; ++ __be32 dest = nx_map_sock_lback(current_nx_info(), inet->inet_daddr); ++ __be32 src = nx_map_sock_lback(current_nx_info(), inet->inet_rcv_saddr); + __u16 destp = ntohs(inet->inet_dport); + __u16 srcp = ntohs(inet->inet_sport); + int rx_queue; +@@ -2483,8 +2510,8 @@ static void get_timewait4_sock(const str + if (ttd < 0) + ttd = 0; + +- dest = tw->tw_daddr; +- src = tw->tw_rcv_saddr; ++ dest = nx_map_sock_lback(current_nx_info(), tw->tw_daddr); ++ src = nx_map_sock_lback(current_nx_info(), tw->tw_rcv_saddr); + destp = ntohs(tw->tw_dport); + srcp = ntohs(tw->tw_sport); + +diff -NurpP --minimal linux-3.2.34/net/ipv4/tcp_minisocks.c linux-3.2.34-vs2.3.2.15/net/ipv4/tcp_minisocks.c +--- linux-3.2.34/net/ipv4/tcp_minisocks.c 2012-01-09 16:15:04.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/tcp_minisocks.c 2011-12-05 19:33:02.000000000 +0100 +@@ -23,6 +23,9 @@ + #include + #include + #include ++#include ++#include ++#include + #include + #include + #include +@@ -336,6 +339,11 @@ void tcp_time_wait(struct sock *sk, int + tcptw->tw_ts_recent = tp->rx_opt.ts_recent; + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + ++ tw->tw_xid = sk->sk_xid; ++ tw->tw_vx_info = NULL; ++ tw->tw_nid = sk->sk_nid; ++ tw->tw_nx_info = NULL; ++ + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (tw->tw_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); +diff -NurpP --minimal linux-3.2.34/net/ipv4/udp.c linux-3.2.34-vs2.3.2.15/net/ipv4/udp.c +--- linux-3.2.34/net/ipv4/udp.c 2012-01-09 16:15:04.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv4/udp.c 2012-06-26 19:07:00.000000000 +0200 +@@ -297,14 +297,7 @@ fail: + } + EXPORT_SYMBOL(udp_lib_get_port); + +-static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) +-{ +- struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); +- +- return (!ipv6_only_sock(sk2) && +- (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr || +- inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); +-} ++extern int ipv4_rcv_saddr_equal(const struct 
sock *, const struct sock *); + + static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, + unsigned int port) +@@ -339,6 +332,11 @@ static inline int compute_score(struct s + if (inet->inet_rcv_saddr != daddr) + return -1; + score += 2; ++ } else { ++ /* block non nx_info ips */ ++ if (!v4_addr_in_nx_info(sk->sk_nx_info, ++ daddr, NXA_MASK_BIND)) ++ return -1; + } + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) +@@ -442,6 +440,7 @@ exact_match: + return result; + } + ++ + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try + * harder than this. -DaveM + */ +@@ -487,6 +486,11 @@ begin: + sk_nulls_for_each_rcu(sk, node, &hslot->head) { + score = compute_score(sk, net, saddr, hnum, sport, + daddr, dport, dif); ++ /* FIXME: disabled? ++ if (score == 9) { ++ result = sk; ++ break; ++ } else */ + if (score > badness) { + result = sk; + badness = score; +@@ -500,6 +504,7 @@ begin: + if (get_nulls_value(node) != slot) + goto begin; + ++ + if (result) { + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; +@@ -509,6 +514,7 @@ begin: + goto begin; + } + } ++ + rcu_read_unlock(); + return result; + } +@@ -551,8 +557,7 @@ static inline struct sock *udp_v4_mcast_ + udp_sk(s)->udp_port_hash != hnum || + (inet->inet_daddr && inet->inet_daddr != rmt_addr) || + (inet->inet_dport != rmt_port && inet->inet_dport) || +- (inet->inet_rcv_saddr && +- inet->inet_rcv_saddr != loc_addr) || ++ !v4_sock_addr_match(sk->sk_nx_info, inet, loc_addr) || + ipv6_only_sock(s) || + (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) + continue; +@@ -930,6 +935,16 @@ int udp_sendmsg(struct kiocb *iocb, stru + inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP, + faddr, saddr, dport, inet->inet_sport); + ++ if (sk->sk_nx_info) { ++ rt = ip_v4_find_src(net, sk->sk_nx_info, fl4); ++ if (IS_ERR(rt)) { ++ err = PTR_ERR(rt); ++ rt = NULL; ++ goto out; ++ } ++ ip_rt_put(rt); ++ } ++ + security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(net, fl4, sk); + if (IS_ERR(rt)) { +@@ -1228,7 +1243,8 @@ try_again: + if (sin) { + sin->sin_family = AF_INET; + sin->sin_port = udp_hdr(skb)->source; +- sin->sin_addr.s_addr = ip_hdr(skb)->saddr; ++ sin->sin_addr.s_addr = nx_map_sock_lback( ++ skb->sk->sk_nx_info, ip_hdr(skb)->saddr); + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + if (inet->cmsg_flags) +@@ -1974,6 +1990,8 @@ static struct sock *udp_get_first(struct + sk_nulls_for_each(sk, node, &hslot->head) { + if (!net_eq(sock_net(sk), net)) + continue; ++ if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (sk->sk_family == state->family) + goto found; + } +@@ -1991,7 +2009,9 @@ static struct sock *udp_get_next(struct + + do { + sk = sk_nulls_next(sk); +- } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); ++ } while (sk && (!net_eq(sock_net(sk), net) || ++ sk->sk_family != state->family || ++ !nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT))); + + if (!sk) { + if (state->bucket <= state->udp_table->mask) +@@ -2087,8 +2107,8 @@ static void udp4_format_sock(struct sock + int bucket, int *len) + { + struct inet_sock *inet = inet_sk(sp); +- __be32 dest = inet->inet_daddr; +- __be32 src = inet->inet_rcv_saddr; ++ __be32 dest = nx_map_sock_lback(current_nx_info(), inet->inet_daddr); ++ __be32 src = nx_map_sock_lback(current_nx_info(), inet->inet_rcv_saddr); + __u16 destp = ntohs(inet->inet_dport); + __u16 srcp = ntohs(inet->inet_sport); + +diff -NurpP --minimal linux-3.2.34/net/ipv6/Kconfig 
linux-3.2.34-vs2.3.2.15/net/ipv6/Kconfig +--- linux-3.2.34/net/ipv6/Kconfig 2010-08-02 16:52:59.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/net/ipv6/Kconfig 2011-12-05 19:33:02.000000000 +0100 +@@ -4,8 +4,8 @@ + + # IPv6 as module will cause a CRASH if you try to unload it + menuconfig IPV6 +- tristate "The IPv6 protocol" +- default m ++ bool "The IPv6 protocol" ++ default n + ---help--- + This is complemental support for the IP version 6. + You will still be able to do traditional IPv4 networking as well. +diff -NurpP --minimal linux-3.2.34/net/ipv6/addrconf.c linux-3.2.34-vs2.3.2.15/net/ipv6/addrconf.c +--- linux-3.2.34/net/ipv6/addrconf.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv6/addrconf.c 2012-10-22 12:59:53.000000000 +0200 +@@ -88,6 +88,8 @@ + #include + #include + #include ++#include ++#include + + /* Set to 3 to get tracing... */ + #define ACONF_DEBUG 2 +@@ -1111,7 +1113,7 @@ out: + + int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev, + const struct in6_addr *daddr, unsigned int prefs, +- struct in6_addr *saddr) ++ struct in6_addr *saddr, struct nx_info *nxi) + { + struct ipv6_saddr_score scores[2], + *score = &scores[0], *hiscore = &scores[1]; +@@ -1183,6 +1185,8 @@ int ipv6_dev_get_saddr(struct net *net, + dev->name); + continue; + } ++ if (!v6_addr_in_nx_info(nxi, &score->ifa->addr, -1)) ++ continue; + + score->rule = -1; + bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX); +@@ -3155,7 +3159,10 @@ static void if6_seq_stop(struct seq_file + static int if6_seq_show(struct seq_file *seq, void *v) + { + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v; +- seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n", ++ ++ if (nx_check(0, VS_ADMIN|VS_WATCH) || ++ v6_addr_in_nx_info(current_nx_info(), &ifp->addr, -1)) ++ seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n", + &ifp->addr, + ifp->idev->dev->ifindex, + ifp->prefix_len, +@@ -3661,6 +3668,11 @@ static int in6_dump_addrs(struct inet6_d + struct ifacaddr6 *ifaca; + int err = 1; + int ip_idx = *p_ip_idx; ++ struct nx_info *nxi = skb->sk ? skb->sk->sk_nx_info : NULL; ++ ++ /* disable ipv6 on non v6 guests */ ++ if (nxi && !nx_info_has_v6(nxi)) ++ return skb->len; + + read_lock_bh(&idev->lock); + switch (type) { +@@ -3671,6 +3683,8 @@ static int in6_dump_addrs(struct inet6_d + list_for_each_entry(ifa, &idev->addr_list, if_list) { + if (++ip_idx < s_ip_idx) + continue; ++ if (!v6_addr_in_nx_info(nxi, &ifa->addr, -1)) ++ continue; + err = inet6_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, +@@ -3687,6 +3701,8 @@ static int in6_dump_addrs(struct inet6_d + ifmca = ifmca->next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; ++ if (!v6_addr_in_nx_info(nxi, &ifmca->mca_addr, -1)) ++ continue; + err = inet6_fill_ifmcaddr(skb, ifmca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, +@@ -3702,6 +3718,8 @@ static int in6_dump_addrs(struct inet6_d + ifaca = ifaca->aca_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; ++ if (!v6_addr_in_nx_info(nxi, &ifaca->aca_addr, -1)) ++ continue; + err = inet6_fill_ifacaddr(skb, ifaca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, +@@ -4087,6 +4105,11 @@ static int inet6_dump_ifinfo(struct sk_b + struct inet6_dev *idev; + struct hlist_head *head; + struct hlist_node *node; ++ struct nx_info *nxi = skb->sk ? skb->sk->sk_nx_info : NULL; ++ ++ /* FIXME: maybe disable ipv6 on non v6 guests? 
++ if (skb->sk && skb->sk->sk_vx_info) ++ return skb->len; */ + + s_h = cb->args[0]; + s_idx = cb->args[1]; +@@ -4098,6 +4121,8 @@ static int inet6_dump_ifinfo(struct sk_b + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; ++ if (!v6_dev_in_nx_info(dev, nxi)) ++ goto cont; + idev = __in6_dev_get(dev); + if (!idev) + goto cont; +diff -NurpP --minimal linux-3.2.34/net/ipv6/af_inet6.c linux-3.2.34-vs2.3.2.15/net/ipv6/af_inet6.c +--- linux-3.2.34/net/ipv6/af_inet6.c 2011-10-24 18:45:34.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/net/ipv6/af_inet6.c 2012-09-01 11:40:43.000000000 +0200 +@@ -42,6 +42,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -160,9 +162,12 @@ lookup_protocol: + } + + err = -EPERM; ++ if ((protocol == IPPROTO_ICMPV6) && ++ nx_capable(CAP_NET_RAW, NXC_RAW_ICMP)) ++ goto override; + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + goto out_rcu_unlock; +- ++override: + sock->ops = answer->ops; + answer_prot = answer->prot; + answer_no_check = answer->no_check; +@@ -261,6 +266,7 @@ int inet6_bind(struct socket *sock, stru + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); ++ struct nx_v6_sock_addr nsa; + __be32 v4addr = 0; + unsigned short snum; + int addr_type = 0; +@@ -276,6 +282,10 @@ int inet6_bind(struct socket *sock, stru + if (addr->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + ++ err = v6_map_sock_addr(inet, addr, &nsa); ++ if (err) ++ return err; ++ + addr_type = ipv6_addr_type(&addr->sin6_addr); + if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) + return -EINVAL; +@@ -307,6 +317,7 @@ int inet6_bind(struct socket *sock, stru + /* Reproduce AF_INET checks to make the bindings consistent */ + v4addr = addr->sin6_addr.s6_addr32[3]; + chk_addr_ret = inet_addr_type(net, v4addr); ++ + if (!sysctl_ip_nonlocal_bind && + !(inet->freebind || inet->transparent) && + v4addr != htonl(INADDR_ANY) && +@@ -316,6 +327,10 @@ int inet6_bind(struct socket *sock, stru + err = -EADDRNOTAVAIL; + goto out; + } ++ if (!v4_addr_in_nx_info(sk->sk_nx_info, v4addr, NXA_MASK_BIND)) { ++ err = -EADDRNOTAVAIL; ++ goto out; ++ } + } else { + if (addr_type != IPV6_ADDR_ANY) { + struct net_device *dev = NULL; +@@ -342,6 +357,11 @@ int inet6_bind(struct socket *sock, stru + } + } + ++ if (!v6_addr_in_nx_info(sk->sk_nx_info, &addr->sin6_addr, -1)) { ++ err = -EADDRNOTAVAIL; ++ goto out_unlock; ++ } ++ + /* ipv4 addr of the socket is invalid. Only the + * unspecified and mapped address have a v4 equivalent. + */ +@@ -358,6 +378,9 @@ int inet6_bind(struct socket *sock, stru + } + } + ++ /* what's that for? */ ++ v6_set_sock_addr(inet, &nsa); ++ + inet->inet_rcv_saddr = v4addr; + inet->inet_saddr = v4addr; + +@@ -459,9 +482,11 @@ int inet6_getname(struct socket *sock, s + return -ENOTCONN; + sin->sin6_port = inet->inet_dport; + ipv6_addr_copy(&sin->sin6_addr, &np->daddr); ++ /* FIXME: remap lback? */ + if (np->sndflow) + sin->sin6_flowinfo = np->flow_label; + } else { ++ /* FIXME: remap lback? 
*/ + if (ipv6_addr_any(&np->rcv_saddr)) + ipv6_addr_copy(&sin->sin6_addr, &np->saddr); + else +diff -NurpP --minimal linux-3.2.34/net/ipv6/datagram.c linux-3.2.34-vs2.3.2.15/net/ipv6/datagram.c +--- linux-3.2.34/net/ipv6/datagram.c 2012-01-09 16:15:04.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv6/datagram.c 2011-12-05 19:33:02.000000000 +0100 +@@ -642,7 +642,7 @@ int datagram_send_ctl(struct net *net, s + + rcu_read_lock(); + if (fl6->flowi6_oif) { +- dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); ++ dev = dev_get_by_index_real_rcu(net, fl6->flowi6_oif); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; +diff -NurpP --minimal linux-3.2.34/net/ipv6/fib6_rules.c linux-3.2.34-vs2.3.2.15/net/ipv6/fib6_rules.c +--- linux-3.2.34/net/ipv6/fib6_rules.c 2012-01-09 16:15:04.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv6/fib6_rules.c 2011-12-05 19:33:02.000000000 +0100 +@@ -91,7 +91,7 @@ static int fib6_rule_action(struct fib_r + ip6_dst_idev(&rt->dst)->dev, + &flp6->daddr, + rt6_flags2srcprefs(flags), +- &saddr)) ++ &saddr, NULL)) + goto again; + if (!ipv6_prefix_equal(&saddr, &r->src.addr, + r->src.plen)) +diff -NurpP --minimal linux-3.2.34/net/ipv6/inet6_hashtables.c linux-3.2.34-vs2.3.2.15/net/ipv6/inet6_hashtables.c +--- linux-3.2.34/net/ipv6/inet6_hashtables.c 2011-10-24 18:45:34.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/net/ipv6/inet6_hashtables.c 2011-12-05 19:33:02.000000000 +0100 +@@ -16,6 +16,7 @@ + + #include + #include ++#include + + #include + #include +@@ -83,7 +84,6 @@ struct sock *__inet6_lookup_established( + unsigned int slot = hash & hashinfo->ehash_mask; + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + +- + rcu_read_lock(); + begin: + sk_nulls_for_each_rcu(sk, node, &head->chain) { +@@ -95,7 +95,7 @@ begin: + sock_put(sk); + goto begin; + } +- goto out; ++ goto out; + } + } + if (get_nulls_value(node) != slot) +@@ -141,6 +141,9 @@ static inline int compute_score(struct s + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + return -1; + score++; ++ } else { ++ if (!v6_addr_in_nx_info(sk->sk_nx_info, daddr, -1)) ++ return -1; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) +diff -NurpP --minimal linux-3.2.34/net/ipv6/ip6_output.c linux-3.2.34-vs2.3.2.15/net/ipv6/ip6_output.c +--- linux-3.2.34/net/ipv6/ip6_output.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv6/ip6_output.c 2012-06-14 20:45:24.000000000 +0200 +@@ -963,7 +963,8 @@ static int ip6_dst_lookup_tail(struct so + struct rt6_info *rt = (struct rt6_info *) *dst; + err = ip6_route_get_saddr(net, rt, &fl6->daddr, + sk ? inet6_sk(sk)->srcprefs : 0, +- &fl6->saddr); ++ &fl6->saddr, ++ sk ? 
sk->sk_nx_info : NULL); + if (err) + goto out_err_release; + } +diff -NurpP --minimal linux-3.2.34/net/ipv6/ndisc.c linux-3.2.34-vs2.3.2.15/net/ipv6/ndisc.c +--- linux-3.2.34/net/ipv6/ndisc.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv6/ndisc.c 2012-11-18 21:11:16.000000000 +0100 +@@ -588,7 +588,7 @@ static void ndisc_send_na(struct net_dev + } else { + if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, + inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs, +- &tmpaddr)) ++ &tmpaddr, NULL)) + return; + src_addr = &tmpaddr; + } +diff -NurpP --minimal linux-3.2.34/net/ipv6/raw.c linux-3.2.34-vs2.3.2.15/net/ipv6/raw.c +--- linux-3.2.34/net/ipv6/raw.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv6/raw.c 2012-10-22 12:59:53.000000000 +0200 +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -284,6 +285,13 @@ static int rawv6_bind(struct sock *sk, s + goto out_unlock; + } + ++ if (!v6_addr_in_nx_info(sk->sk_nx_info, &addr->sin6_addr, -1)) { ++ err = -EADDRNOTAVAIL; ++ if (dev) ++ dev_put(dev); ++ goto out; ++ } ++ + /* ipv4 addr of the socket is invalid. Only the + * unspecified and mapped address have a v4 equivalent. + */ +diff -NurpP --minimal linux-3.2.34/net/ipv6/route.c linux-3.2.34-vs2.3.2.15/net/ipv6/route.c +--- linux-3.2.34/net/ipv6/route.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv6/route.c 2012-11-18 21:11:16.000000000 +0100 +@@ -55,6 +55,7 @@ + #include + #include + #include ++#include + + #include + +@@ -2094,15 +2095,17 @@ int ip6_route_get_saddr(struct net *net, + struct rt6_info *rt, + const struct in6_addr *daddr, + unsigned int prefs, +- struct in6_addr *saddr) ++ struct in6_addr *saddr, ++ struct nx_info *nxi) + { + struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt); + int err = 0; +- if (rt->rt6i_prefsrc.plen) ++ if (rt->rt6i_prefsrc.plen && (!nxi || ++ v6_addr_in_nx_info(nxi, &rt->rt6i_prefsrc.addr, NXA_TYPE_ADDR))) + ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr); + else + err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, +- daddr, prefs, saddr); ++ daddr, prefs, saddr, nxi); + return err; + } + +@@ -2432,7 +2435,8 @@ static int rt6_fill_node(struct net *net + NLA_PUT_U32(skb, RTA_IIF, iif); + } else if (dst) { + struct in6_addr saddr_buf; +- if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0) ++ if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf, ++ (skb->sk ? skb->sk->sk_nx_info : NULL)) == 0) + NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); + } + +@@ -2639,6 +2643,7 @@ static int rt6_info_route(struct rt6_inf + struct seq_file *m = p_arg; + struct neighbour *n; + ++ /* FIXME: check for network context? */ + seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); + + #ifdef CONFIG_IPV6_SUBTREES +diff -NurpP --minimal linux-3.2.34/net/ipv6/tcp_ipv6.c linux-3.2.34-vs2.3.2.15/net/ipv6/tcp_ipv6.c +--- linux-3.2.34/net/ipv6/tcp_ipv6.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv6/tcp_ipv6.c 2012-11-06 18:08:24.000000000 +0100 +@@ -70,6 +70,7 @@ + + #include + #include ++#include + + static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); + static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +@@ -162,8 +163,15 @@ static int tcp_v6_connect(struct sock *s + * connect() to INADDR_ANY means loopback (BSD'ism). 
+ */ + +- if(ipv6_addr_any(&usin->sin6_addr)) +- usin->sin6_addr.s6_addr[15] = 0x1; ++ if(ipv6_addr_any(&usin->sin6_addr)) { ++ struct nx_info *nxi = sk->sk_nx_info; ++ ++ if (nxi && nx_info_has_v6(nxi)) ++ /* FIXME: remap lback? */ ++ usin->sin6_addr = nxi->v6.ip; ++ else ++ usin->sin6_addr.s6_addr[15] = 0x1; ++ } + + addr_type = ipv6_addr_type(&usin->sin6_addr); + +diff -NurpP --minimal linux-3.2.34/net/ipv6/udp.c linux-3.2.34-vs2.3.2.15/net/ipv6/udp.c +--- linux-3.2.34/net/ipv6/udp.c 2012-01-09 16:15:04.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/ipv6/udp.c 2011-12-15 01:11:37.000000000 +0100 +@@ -45,41 +45,67 @@ + #include + #include + #include ++#include + + #include + #include + #include "udp_impl.h" + +-int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) ++int ipv6_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) + { +- const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; ++ const struct in6_addr *sk1_rcv_saddr6 = &inet6_sk(sk1)->rcv_saddr; + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); +- __be32 sk1_rcv_saddr = sk_rcv_saddr(sk); ++ __be32 sk1_rcv_saddr = sk_rcv_saddr(sk1); + __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); +- int sk_ipv6only = ipv6_only_sock(sk); ++ int sk1_ipv6only = ipv6_only_sock(sk1); + int sk2_ipv6only = inet_v6_ipv6only(sk2); +- int addr_type = ipv6_addr_type(sk_rcv_saddr6); ++ int addr_type = ipv6_addr_type(sk1_rcv_saddr6); + int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ +- if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) +- return (!sk2_ipv6only && ++ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { ++ if (!sk2_ipv6only && + (!sk1_rcv_saddr || !sk2_rcv_saddr || +- sk1_rcv_saddr == sk2_rcv_saddr)); ++ sk1_rcv_saddr == sk2_rcv_saddr)) ++ goto vs_v4; ++ else ++ return 0; ++ } + + if (addr_type2 == IPV6_ADDR_ANY && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) +- return 1; ++ goto vs; + + if (addr_type == IPV6_ADDR_ANY && +- !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) +- return 1; ++ !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) ++ goto vs; + + if (sk2_rcv_saddr6 && +- ipv6_addr_equal(sk_rcv_saddr6, sk2_rcv_saddr6)) +- return 1; ++ ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) ++ goto vs; + + return 0; ++ ++vs_v4: ++ if (!sk1_rcv_saddr && !sk2_rcv_saddr) ++ return nx_v4_addr_conflict(sk1->sk_nx_info, sk2->sk_nx_info); ++ if (!sk2_rcv_saddr) ++ return v4_addr_in_nx_info(sk1->sk_nx_info, sk2_rcv_saddr, -1); ++ if (!sk1_rcv_saddr) ++ return v4_addr_in_nx_info(sk2->sk_nx_info, sk1_rcv_saddr, -1); ++ return 1; ++vs: ++ if (addr_type2 == IPV6_ADDR_ANY && addr_type == IPV6_ADDR_ANY) ++ return nx_v6_addr_conflict(sk1->sk_nx_info, sk2->sk_nx_info); ++ else if (addr_type2 == IPV6_ADDR_ANY) ++ return v6_addr_in_nx_info(sk2->sk_nx_info, sk1_rcv_saddr6, -1); ++ else if (addr_type == IPV6_ADDR_ANY) { ++ if (addr_type2 == IPV6_ADDR_MAPPED) ++ return nx_v4_addr_conflict(sk1->sk_nx_info, sk2->sk_nx_info); ++ else ++ return v6_addr_in_nx_info(sk1->sk_nx_info, sk2_rcv_saddr6, -1); ++ } ++ return 1; + } + + static unsigned int udp6_portaddr_hash(struct net *net, +@@ -143,6 +169,10 @@ static inline int compute_score(struct s + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + return -1; + score++; ++ } else { ++ /* block non nx_info ips */ ++ if (!v6_addr_in_nx_info(sk->sk_nx_info, daddr, -1)) ++ return -1; + } + if (!ipv6_addr_any(&np->daddr)) { + if (!ipv6_addr_equal(&np->daddr, 
saddr)) +diff -NurpP --minimal linux-3.2.34/net/ipv6/xfrm6_policy.c linux-3.2.34-vs2.3.2.15/net/ipv6/xfrm6_policy.c +--- linux-3.2.34/net/ipv6/xfrm6_policy.c 2011-07-22 11:18:13.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/net/ipv6/xfrm6_policy.c 2011-12-05 19:33:02.000000000 +0100 +@@ -63,7 +63,7 @@ static int xfrm6_get_saddr(struct net *n + dev = ip6_dst_idev(dst)->dev; + ipv6_dev_get_saddr(dev_net(dev), dev, + (struct in6_addr *)&daddr->a6, 0, +- (struct in6_addr *)&saddr->a6); ++ (struct in6_addr *)&saddr->a6, NULL); + dst_release(dst); + return 0; + } +diff -NurpP --minimal linux-3.2.34/net/netfilter/ipvs/ip_vs_xmit.c linux-3.2.34-vs2.3.2.15/net/netfilter/ipvs/ip_vs_xmit.c +--- linux-3.2.34/net/netfilter/ipvs/ip_vs_xmit.c 2012-01-09 16:15:04.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/netfilter/ipvs/ip_vs_xmit.c 2012-02-07 03:13:38.000000000 +0100 +@@ -226,7 +226,7 @@ __ip_vs_route_output_v6(struct net *net, + return dst; + if (ipv6_addr_any(&fl6.saddr) && + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, +- &fl6.daddr, 0, &fl6.saddr) < 0) ++ &fl6.daddr, 0, &fl6.saddr, NULL) < 0) + goto out_err; + if (do_xfrm) { + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); +diff -NurpP --minimal linux-3.2.34/net/netlink/af_netlink.c linux-3.2.34-vs2.3.2.15/net/netlink/af_netlink.c +--- linux-3.2.34/net/netlink/af_netlink.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/netlink/af_netlink.c 2012-11-18 21:11:16.000000000 +0100 +@@ -55,6 +55,9 @@ + #include + #include + #include ++#include ++#include ++#include + + #include + #include +@@ -1926,6 +1929,8 @@ static struct sock *netlink_seq_socket_i + sk_for_each(s, node, &hash->table[j]) { + if (sock_net(s) != seq_file_net(seq)) + continue; ++ if (!nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (off == pos) { + iter->link = i; + iter->hash_idx = j; +@@ -1960,7 +1965,8 @@ static void *netlink_seq_next(struct seq + s = v; + do { + s = sk_next(s); +- } while (s && sock_net(s) != seq_file_net(seq)); ++ } while (s && (sock_net(s) != seq_file_net(seq) || ++ !nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT))); + if (s) + return s; + +@@ -1972,7 +1978,8 @@ static void *netlink_seq_next(struct seq + + for (; j <= hash->mask; j++) { + s = sk_head(&hash->table[j]); +- while (s && sock_net(s) != seq_file_net(seq)) ++ while (s && (sock_net(s) != seq_file_net(seq) || ++ !nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT))) + s = sk_next(s); + if (s) { + iter->link = i; +diff -NurpP --minimal linux-3.2.34/net/socket.c linux-3.2.34-vs2.3.2.15/net/socket.c +--- linux-3.2.34/net/socket.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/socket.c 2012-10-22 12:59:53.000000000 +0200 +@@ -98,6 +98,10 @@ + + #include + #include ++#include ++#include ++#include ++#include + + #include + #include +@@ -546,6 +550,7 @@ static inline int __sock_sendmsg_nosec(s + struct msghdr *msg, size_t size) + { + struct sock_iocb *si = kiocb_to_siocb(iocb); ++ size_t len; + + sock_update_classid(sock->sk); + +@@ -554,7 +559,22 @@ static inline int __sock_sendmsg_nosec(s + si->msg = msg; + si->size = size; + +- return sock->ops->sendmsg(iocb, sock, msg, size); ++ len = sock->ops->sendmsg(iocb, sock, msg, size); ++ if (sock->sk) { ++ if (len == size) ++ vx_sock_send(sock->sk, size); ++ else ++ vx_sock_fail(sock->sk, size); ++ } ++ vxdprintk(VXD_CBIT(net, 7), ++ "__sock_sendmsg: %p[%p,%p,%p;%d/%d]:%d/%zu", ++ sock, sock->sk, ++ (sock->sk)?sock->sk->sk_nx_info:0, ++ (sock->sk)?sock->sk->sk_vx_info:0, ++ (sock->sk)?sock->sk->sk_xid:0, 
++ (sock->sk)?sock->sk->sk_nid:0, ++ (unsigned int)size, len); ++ return len; + } + + static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, +@@ -694,6 +714,7 @@ static inline int __sock_recvmsg_nosec(s + struct msghdr *msg, size_t size, int flags) + { + struct sock_iocb *si = kiocb_to_siocb(iocb); ++ int len; + + sock_update_classid(sock->sk); + +@@ -703,7 +724,18 @@ static inline int __sock_recvmsg_nosec(s + si->size = size; + si->flags = flags; + +- return sock->ops->recvmsg(iocb, sock, msg, size, flags); ++ len = sock->ops->recvmsg(iocb, sock, msg, size, flags); ++ if ((len >= 0) && sock->sk) ++ vx_sock_recv(sock->sk, len); ++ vxdprintk(VXD_CBIT(net, 7), ++ "__sock_recvmsg: %p[%p,%p,%p;%d/%d]:%d/%d", ++ sock, sock->sk, ++ (sock->sk)?sock->sk->sk_nx_info:0, ++ (sock->sk)?sock->sk->sk_vx_info:0, ++ (sock->sk)?sock->sk->sk_xid:0, ++ (sock->sk)?sock->sk->sk_nid:0, ++ (unsigned int)size, len); ++ return len; + } + + static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, +@@ -1188,6 +1220,13 @@ int __sock_create(struct net *net, int f + if (type < 0 || type >= SOCK_MAX) + return -EINVAL; + ++ if (!nx_check(0, VS_ADMIN)) { ++ if (family == PF_INET && !current_nx_info_has_v4()) ++ return -EAFNOSUPPORT; ++ if (family == PF_INET6 && !current_nx_info_has_v6()) ++ return -EAFNOSUPPORT; ++ } ++ + /* Compatibility. + + This uglymoron is moved from INET layer to here to avoid +@@ -1323,6 +1362,7 @@ SYSCALL_DEFINE3(socket, int, family, int + if (retval < 0) + goto out; + ++ set_bit(SOCK_USER_SOCKET, &sock->flags); + retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); + if (retval < 0) + goto out_release; +@@ -1364,10 +1404,12 @@ SYSCALL_DEFINE4(socketpair, int, family, + err = sock_create(family, type, protocol, &sock1); + if (err < 0) + goto out; ++ set_bit(SOCK_USER_SOCKET, &sock1->flags); + + err = sock_create(family, type, protocol, &sock2); + if (err < 0) + goto out_release_1; ++ set_bit(SOCK_USER_SOCKET, &sock2->flags); + + err = sock1->ops->socketpair(sock1, sock2); + if (err < 0) +diff -NurpP --minimal linux-3.2.34/net/sunrpc/auth.c linux-3.2.34-vs2.3.2.15/net/sunrpc/auth.c +--- linux-3.2.34/net/sunrpc/auth.c 2011-10-24 18:45:34.000000000 +0200 ++++ linux-3.2.34-vs2.3.2.15/net/sunrpc/auth.c 2011-12-05 19:33:02.000000000 +0100 +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + + #ifdef RPC_DEBUG + # define RPCDBG_FACILITY RPCDBG_AUTH +@@ -427,6 +428,7 @@ rpcauth_lookupcred(struct rpc_auth *auth + memset(&acred, 0, sizeof(acred)); + acred.uid = cred->fsuid; + acred.gid = cred->fsgid; ++ acred.tag = dx_current_tag(); + acred.group_info = get_group_info(((struct cred *)cred)->group_info); + + ret = auth->au_ops->lookup_cred(auth, &acred, flags); +@@ -467,6 +469,7 @@ rpcauth_bind_root_cred(struct rpc_task * + struct auth_cred acred = { + .uid = 0, + .gid = 0, ++ .tag = dx_current_tag(), + }; + + dprintk("RPC: %5u looking up %s cred\n", +diff -NurpP --minimal linux-3.2.34/net/sunrpc/auth_unix.c linux-3.2.34-vs2.3.2.15/net/sunrpc/auth_unix.c +--- linux-3.2.34/net/sunrpc/auth_unix.c 2012-01-09 16:15:04.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/sunrpc/auth_unix.c 2011-12-05 19:33:02.000000000 +0100 +@@ -12,12 +12,14 @@ + #include + #include + #include ++#include + + #define NFS_NGROUPS 16 + + struct unx_cred { + struct rpc_cred uc_base; + gid_t uc_gid; ++ tag_t uc_tag; + gid_t uc_gids[NFS_NGROUPS]; + }; + #define uc_uid uc_base.cr_uid +@@ -78,6 +80,7 @@ unx_create_cred(struct rpc_auth *auth, s + groups = NFS_NGROUPS; + + cred->uc_gid 
= acred->gid; ++ cred->uc_tag = acred->tag; + for (i = 0; i < groups; i++) + cred->uc_gids[i] = GROUP_AT(acred->group_info, i); + if (i < NFS_NGROUPS) +@@ -119,7 +122,9 @@ unx_match(struct auth_cred *acred, struc + unsigned int i; + + +- if (cred->uc_uid != acred->uid || cred->uc_gid != acred->gid) ++ if (cred->uc_uid != acred->uid || ++ cred->uc_gid != acred->gid || ++ cred->uc_tag != acred->tag) + return 0; + + if (acred->group_info != NULL) +@@ -145,7 +150,7 @@ unx_marshal(struct rpc_task *task, __be3 + struct rpc_clnt *clnt = task->tk_client; + struct unx_cred *cred = container_of(task->tk_rqstp->rq_cred, struct unx_cred, uc_base); + __be32 *base, *hold; +- int i; ++ int i, tag; + + *p++ = htonl(RPC_AUTH_UNIX); + base = p++; +@@ -155,9 +160,12 @@ unx_marshal(struct rpc_task *task, __be3 + * Copy the UTS nodename captured when the client was created. + */ + p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); ++ tag = task->tk_client->cl_tag; + +- *p++ = htonl((u32) cred->uc_uid); +- *p++ = htonl((u32) cred->uc_gid); ++ *p++ = htonl((u32) TAGINO_UID(tag, ++ cred->uc_uid, cred->uc_tag)); ++ *p++ = htonl((u32) TAGINO_GID(tag, ++ cred->uc_gid, cred->uc_tag)); + hold = p++; + for (i = 0; i < 16 && cred->uc_gids[i] != (gid_t) NOGROUP; i++) + *p++ = htonl((u32) cred->uc_gids[i]); +diff -NurpP --minimal linux-3.2.34/net/sunrpc/clnt.c linux-3.2.34-vs2.3.2.15/net/sunrpc/clnt.c +--- linux-3.2.34/net/sunrpc/clnt.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/sunrpc/clnt.c 2012-06-14 20:45:24.000000000 +0200 +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -361,6 +362,9 @@ struct rpc_clnt *rpc_create(struct rpc_c + if (!(args->flags & RPC_CLNT_CREATE_QUIET)) + clnt->cl_chatty = 1; + ++ /* TODO: handle RPC_CLNT_CREATE_TAGGED ++ if (args->flags & RPC_CLNT_CREATE_TAGGED) ++ clnt->cl_tag = 1; */ + return clnt; + } + EXPORT_SYMBOL_GPL(rpc_create); +diff -NurpP --minimal linux-3.2.34/net/unix/af_unix.c linux-3.2.34-vs2.3.2.15/net/unix/af_unix.c +--- linux-3.2.34/net/unix/af_unix.c 2012-11-18 18:42:25.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/net/unix/af_unix.c 2012-10-22 12:59:54.000000000 +0200 +@@ -114,6 +114,8 @@ + #include + #include + #include ++#include ++#include + + static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; + static DEFINE_SPINLOCK(unix_table_lock); +@@ -258,6 +260,8 @@ static struct sock *__unix_find_socket_b + if (!net_eq(sock_net(s), net)) + continue; + ++ if (!nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (u->addr->len == len && + !memcmp(u->addr->name, sunname, len)) + goto found; +@@ -2219,6 +2223,8 @@ static struct sock *unix_seq_idx(struct + for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) { + if (sock_net(s) != seq_file_net(seq)) + continue; ++ if (!nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (off == pos) + return s; + ++off; +@@ -2243,7 +2249,8 @@ static void *unix_seq_next(struct seq_fi + sk = first_unix_socket(&iter->i); + else + sk = next_unix_socket(&iter->i, sk); +- while (sk && (sock_net(sk) != seq_file_net(seq))) ++ while (sk && (sock_net(sk) != seq_file_net(seq) || ++ !nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT))) + sk = next_unix_socket(&iter->i, sk); + return sk; + } +diff -NurpP --minimal linux-3.2.34/scripts/checksyscalls.sh linux-3.2.34-vs2.3.2.15/scripts/checksyscalls.sh +--- linux-3.2.34/scripts/checksyscalls.sh 2011-03-15 18:07:46.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/scripts/checksyscalls.sh 2011-12-05 
19:33:02.000000000 +0100 +@@ -193,7 +193,6 @@ cat << EOF + #define __IGNORE_afs_syscall + #define __IGNORE_getpmsg + #define __IGNORE_putpmsg +-#define __IGNORE_vserver + EOF + } + +diff -NurpP --minimal linux-3.2.34/security/commoncap.c linux-3.2.34-vs2.3.2.15/security/commoncap.c +--- linux-3.2.34/security/commoncap.c 2012-11-18 18:42:26.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/security/commoncap.c 2012-04-24 16:50:48.000000000 +0200 +@@ -63,6 +63,7 @@ int cap_netlink_recv(struct sk_buff *skb + return -EPERM; + return 0; + } ++ + EXPORT_SYMBOL(cap_netlink_recv); + + /** +@@ -84,14 +85,20 @@ EXPORT_SYMBOL(cap_netlink_recv); + int cap_capable(struct task_struct *tsk, const struct cred *cred, + struct user_namespace *targ_ns, int cap, int audit) + { ++ struct vx_info *vxi = tsk->vx_info; ++ + for (;;) { + /* The creator of the user namespace has all caps. */ + if (targ_ns != &init_user_ns && targ_ns->creator == cred->user) + return 0; + + /* Do we have the necessary capabilities? */ +- if (targ_ns == cred->user->user_ns) +- return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; ++ if (targ_ns == cred->user->user_ns) { ++ if (vx_info_flags(vxi, VXF_STATE_SETUP, 0) && ++ cap_raised(cred->cap_effective, cap)) ++ return 0; ++ return vx_cap_raised(vxi, cred->cap_effective, cap) ? 0 : -EPERM; ++ } + + /* Have we tried all of the parent namespaces? */ + if (targ_ns == &init_user_ns) +@@ -621,7 +628,7 @@ int cap_inode_setxattr(struct dentry *de + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1) && +- !capable(CAP_SYS_ADMIN)) ++ !vx_capable(CAP_SYS_ADMIN, VXC_FS_SECURITY)) + return -EPERM; + return 0; + } +@@ -647,7 +654,7 @@ int cap_inode_removexattr(struct dentry + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1) && +- !capable(CAP_SYS_ADMIN)) ++ !vx_capable(CAP_SYS_ADMIN, VXC_FS_SECURITY)) + return -EPERM; + return 0; + } +diff -NurpP --minimal linux-3.2.34/security/selinux/hooks.c linux-3.2.34-vs2.3.2.15/security/selinux/hooks.c +--- linux-3.2.34/security/selinux/hooks.c 2012-01-09 16:15:05.000000000 +0100 ++++ linux-3.2.34-vs2.3.2.15/security/selinux/hooks.c 2011-12-05 19:33:02.000000000 +0100 +@@ -67,7 +67,6 @@ + #include + #include + #include /* for Unix socket types */ +-#include /* for Unix socket types */ + #include + #include + #include diff --git a/3.2.34/wrapfs-v3.2.2-45-ga5296eb.patch b/3.2.34/wrapfs-v3.2.2-45-ga5296eb.patch new file mode 100644 index 0000000..ebe5788 --- /dev/null +++ b/3.2.34/wrapfs-v3.2.2-45-ga5296eb.patch @@ -0,0 +1,2084 @@ +diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX +index 8c624a1..b8822ed 100644 +--- a/Documentation/filesystems/00-INDEX ++++ b/Documentation/filesystems/00-INDEX +@@ -114,6 +114,8 @@ vfat.txt + - info on using the VFAT filesystem used in Windows NT and Windows 95 + vfs.txt + - overview of the Virtual File System ++wrapfs.txt ++ - info and mount options for the stackable wrapper file system + xfs.txt + - info and mount options for the XFS filesystem. 
+ xip.txt +diff --git a/Documentation/filesystems/wrapfs.txt b/Documentation/filesystems/wrapfs.txt +new file mode 100644 +index 0000000..f61879a +--- /dev/null ++++ b/Documentation/filesystems/wrapfs.txt +@@ -0,0 +1,165 @@ ++Wrapfs: a null-layer (aka wrapper) stackable file system ++ ++Maintainer: Erez Zadok ++Web Site: ++ ++------------------------------------------------------------------------------ ++MOTIVATION: ++ ++Wrapfs is a small null-layer stackable file system, similar to BSD's Nullfs. ++Wrapfs is small, under 1800 lines of code. Compare that to, say, eCryptfs ++and Unionfs, each of which are over 10,000 LoC. As such, Wrapfs is simple ++and easy to read and understand. Wrapfs is useful for several reasons: ++ ++1. Many people like to experiment with in-kernel file system ideas as a ++ prototype; Wrapfs is an ideal small template from which one could modify ++ the code to create new file system functionality incrementally. ++ ++2. As a platform to test and debug generic stacking problems in other Linux ++ stackable file systems (e.g., ecryptfs). ++ ++3. As a way to test VFS enhancements to better support stacking in Linux. ++ ++4. Wrapfs is a very useful instructional tool, often used as a starting ++ point for course assignments, for people who want a small example of who ++ the Linux VFS works, or for those who want to learn to write new Linux ++ file systems. ++ ++Various versions of Wrapfs appeared as part of the "fistgen" package since ++1994, and have been used by numerous users world-wide. For a more detailed ++history of Wrapfs, and list of most of its known users, see the section ++marked "HISTORY" below. ++ ++------------------------------------------------------------------------------ ++OPERATION: ++ ++This is a brief description of how Wrapfs operates. For more information, ++see the full paper published in Linux Expo 1999, titled "A Stackable File ++System Interface For Linux": ++ ++ ++ ++The basic function of a stackable file system is to pass an operation and ++its arguments to the lower-level file system. For every VFS object (inode, ++dentry, file, superblock, etc.), Wrapfs keeps a one-to-one mapping of a ++Wrapfs-level object to the lower one. We call the Wrapfs object the "upper" ++one, and the one below we call the "lower" one. Wrapfs stores these ++mappings as simple pointers inside the private field of the existing VFS ++objects (e.g., dentry->d_fsdata, sb->s_fs_info, and a container for inodes). ++ ++There are two kinds of stackable operations: those that create new VFS ++objects and those that don't. ++ ++The following distilled code snippet shows a method which doesn't create a ++new object. The method just has to pass it to the lower layer and propagate ++any errors back up the VFS: ++ ++int wrapfs_unlink(struct inode *dir, struct dentry *dentry) ++{ ++ int err; ++ struct inode *lower_dir; ++ struct dentry *lower_dentry; ++ lower_dir = get_lower_inode(dir); ++ lower_dentry = get_lower_dentry(dentry); ++ err = lower_dir->i_op->unlink(lower_dir, lower_dentry); ++ return err; ++} ++ ++The following code snippet shows a method which creates a new object. 
After ++a lower object gets created, Wrapfs has to also create its own object, and ++make the pointer connections between the upper and lower objects (the latter ++is done via a helper routine called "interpose"): ++ ++int wrapfs_create(struct inode *dir, struct dentry *dentry, int mode) ++{ ++ int err; ++ struct dentry *lower_dentry; ++ struct inode *lower_dir; ++ lower_dir = wrapfs_lower_inode(dir); ++ lower_dentry = wrapfs_lower_dentry(dentry); ++ err = vfs_create(lower_dir, lower_dentry, mode); ++ if (!err) ++ err = wrapfs_interpose(dentry, dir->i_sb); ++ return err; ++} ++ ++The wrapfs_unlink code snippet above can be easily modified to change the ++behavior of unlink(2). For example, if an ->unlink operation is changed to ++->rename, this could become the basis for an "undo" file system; or if the ++lower_dentry's name gets encrypted before calling the lower ->unlink, this ++could be part of an encryption file system. ++ ++------------------------------------------------------------------------------ ++USAGE: ++ ++First, you have to have some pre-existing directory already mounted from any ++other file system, say /some/lower/path. Then, to mount wrapfs in ++/mnt/wrapfs, on that lower directory, issue this command: ++ ++# mount -t wrapfs /some/lower/path /mnt/wrapfs ++ ++To access the files via Wrapfs, use the mount point /mnt/wrapfs. ++ ++------------------------------------------------------------------------------ ++CAVEATS: ++ ++Stacking on NFS. Wrapfs has been tested with LTP, racer, fsx, parallel ++compile, and more. It's been tested on top of ext2, ext3, xfs, reiserfs, ++and tmpfs -- and passed all tests. However, on top of nfs3, wrapfs has to ++treat silly-deleted files as if they don't exist: in ->unlink, if we try to ++vfs_unlink an NFS silly-deleted file, NFS returns EBUSY; so we simply ignore ++it and return 0 (success) to the VFS. NFS will delete this file later on ++anyway. As the VFS also has special handling for silly-deleted files, this ++isn't unusual. A cleaner way to handle this in the future is if the VFS ++were to handle silly-deleted (aka "delayed-delete") files entirely at the ++VFS. ++ ++------------------------------------------------------------------------------ ++HISTORY: ++ ++Wrapfs was developed initially in 1994 for Linux 2.1, as part of Erez ++Zadok's graduate work at Columbia University. It was designed to be a ++flexible null-layer, pass-through, stackable file system, from which other ++file systems would be developed and even instantiated automatically using a ++high-level language. One of the first file systems developed from Wrapfs ++was a simple encryption file system called Cryptfs (eCryptfs is based on ++Cryptfs). Other examples include Gzipfs, a stackable compression file ++system, and Unionfs, a stackable unification file system. Wrapfs was ++integrated into a larger package called fistgen (see www.filesystems.org), ++and ported to FreeBSD and Solaris. Wrapfs and fistgen continued to be ++maintained for newer versions of kernels, but remained largely standalone ++until recently: this release of Wrapfs for Linux represents a clean version ++written from scratch. ++ ++Over the past 15+ years, versions of Wrapfs had been used by many users and ++companies. At one point or another, the following groups have used stacking ++code based on Wrapfs. ++ ++1. PROJECTS: eCryptfs, Unionfs, mini_fo, Aufs, FindFS, StoreCompress, ++ TestFS, ToPAS, and MFS. ++ ++2. 
COMPANIES AND RESEARCH LABS: Bell Labs's Plan 9 group, EMC, ++ Hewlett-Packard, IBM Research Almaden, IBM Research Austin, Red Hat, ++ SuSE, Sun Microsystems, Veritas, Booyaka, CalSoft (India), Computer Farm, ++ Deutsche Bank (Germany), DreamWorks LLC, Eli Lilly and Company, FAME ++ Information Services, GMX AG (Germany), IBM global services (India), IDA ++ Center for Communications Research, Indra Networks, Inc., Kavi ++ Corporation, Mendepie, Mitsubishi Electric (Japan), Mobile-Mind, Monster ++ Labs, Morning Network (Russia), NeST Technologies, Packet General ++ Networks, Inc., Outstep Technologies, Reflective Systems Group, River ++ Styx Internet, SARAI Net, Saint-Petersburg Official Web Site (Russia), ++ Shadow Island Games, TISCover (Germany), Trymedia Systems, Uber Admin, ++ Videsh Sanchar Nigam Limited (India), Wanadoo (France), and iNsu ++ Innovations. ++ ++3. UNIVERSITIES: Georgia Institute of Technology, Stanford University, UC ++ Berkeley, UCLA, University of Maryland, College Park, University of ++ Michigan, Ben Gurion University (Israel), Clarkson University, Clemson ++ University, Deutsches Elektronen Synchrotron (Germany), Electronics and ++ Telecommunications Research Institute (South Korea), Indian Institute of ++ Technology (India), National Taiwan University, Pune University (India), ++ The College of William \& Mary, Trinity College (Ireland), Universitaet ++ Frankfurt am Main (Germany), University Hospital Olomouc (Czech ++ Republic), and University of Salermo (Italy). ++ ++------------------------------------------------------------------------------ +diff --git a/MAINTAINERS b/MAINTAINERS +index f986e7d..1aecfdb 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -7374,6 +7374,16 @@ F: include/linux/workqueue.h + F: kernel/workqueue.c + F: Documentation/workqueue.txt + ++WRAPFS ++P: Erez Zadok ++M: ezk@cs.sunysb.edu ++L: wrapfs@filesystems.org ++W: http://wrapfs.filesystems.org/ ++T: git git.kernel.org/pub/scm/linux/kernel/git/ezk/wrapfs.git ++S: Maintained ++F: Documentation/filesystems/wrapfs.txt ++F: fs/wrapfs/ ++ + X.25 NETWORK LAYER + M: Andrew Hendry + L: linux-x25@vger.kernel.org +diff --git a/fs/Kconfig b/fs/Kconfig +index 6ad58a5..73699ed 100644 +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -194,6 +194,7 @@ if MISC_FILESYSTEMS + source "fs/adfs/Kconfig" + source "fs/affs/Kconfig" + source "fs/ecryptfs/Kconfig" ++source "fs/wrapfs/Kconfig" + source "fs/hfs/Kconfig" + source "fs/hfsplus/Kconfig" + source "fs/befs/Kconfig" +diff --git a/fs/Makefile b/fs/Makefile +index d2c3353..36daa5e 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -82,6 +82,7 @@ obj-$(CONFIG_ISO9660_FS) += isofs/ + obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+ + obj-$(CONFIG_HFS_FS) += hfs/ + obj-$(CONFIG_ECRYPT_FS) += ecryptfs/ ++obj-$(CONFIG_WRAP_FS) += wrapfs/ + obj-$(CONFIG_VXFS_FS) += freevxfs/ + obj-$(CONFIG_NFS_FS) += nfs/ + obj-$(CONFIG_EXPORTFS) += exportfs/ +diff --git a/fs/wrapfs/Kconfig b/fs/wrapfs/Kconfig +new file mode 100644 +index 0000000..d790ccd +--- /dev/null ++++ b/fs/wrapfs/Kconfig +@@ -0,0 +1,9 @@ ++config WRAP_FS ++ tristate "Wrapfs stackable file system (EXPERIMENTAL)" ++ depends on EXPERIMENTAL ++ help ++ Wrapfs is a stackable file system which simply passes its ++ operations to the lower layer. It is designed as a useful ++ template for developing or debugging other stackable file systems, ++ and more (see Documentation/filesystems/wrapfs.txt). See ++ for details. 
+diff --git a/fs/wrapfs/Makefile b/fs/wrapfs/Makefile +new file mode 100644 +index 0000000..f318d11 +--- /dev/null ++++ b/fs/wrapfs/Makefile +@@ -0,0 +1,7 @@ ++WRAPFS_VERSION="0.1" ++ ++EXTRA_CFLAGS += -DWRAPFS_VERSION=\"$(WRAPFS_VERSION)\" ++ ++obj-$(CONFIG_WRAP_FS) += wrapfs.o ++ ++wrapfs-y := dentry.o file.o inode.o main.o super.o lookup.o mmap.o +diff --git a/fs/wrapfs/dentry.c b/fs/wrapfs/dentry.c +new file mode 100644 +index 0000000..b173153 +--- /dev/null ++++ b/fs/wrapfs/dentry.c +@@ -0,0 +1,52 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include "wrapfs.h" ++ ++/* ++ * returns: -ERRNO if error (returned to user) ++ * 0: tell VFS to invalidate dentry ++ * 1: dentry is valid ++ */ ++static int wrapfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) ++{ ++ struct path lower_path, saved_path; ++ struct dentry *lower_dentry; ++ int err = 1; ++ ++ if (nd && nd->flags & LOOKUP_RCU) ++ return -ECHILD; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) ++ goto out; ++ pathcpy(&saved_path, &nd->path); ++ pathcpy(&nd->path, &lower_path); ++ err = lower_dentry->d_op->d_revalidate(lower_dentry, nd); ++ pathcpy(&nd->path, &saved_path); ++out: ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static void wrapfs_d_release(struct dentry *dentry) ++{ ++ /* release and reset the lower paths */ ++ wrapfs_put_reset_lower_path(dentry); ++ free_dentry_private_data(dentry); ++ return; ++} ++ ++const struct dentry_operations wrapfs_dops = { ++ .d_revalidate = wrapfs_d_revalidate, ++ .d_release = wrapfs_d_release, ++}; +diff --git a/fs/wrapfs/file.c b/fs/wrapfs/file.c +new file mode 100644 +index 0000000..7a7fe1e +--- /dev/null ++++ b/fs/wrapfs/file.c +@@ -0,0 +1,298 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. 
++ */ ++ ++#include "wrapfs.h" ++ ++static ssize_t wrapfs_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ int err; ++ struct file *lower_file; ++ struct dentry *dentry = file->f_path.dentry; ++ ++ lower_file = wrapfs_lower_file(file); ++ err = vfs_read(lower_file, buf, count, ppos); ++ /* update our inode atime upon a successful lower read */ ++ if (err >= 0) ++ fsstack_copy_attr_atime(dentry->d_inode, ++ lower_file->f_path.dentry->d_inode); ++ ++ return err; ++} ++ ++static ssize_t wrapfs_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ int err = 0; ++ struct file *lower_file; ++ struct dentry *dentry = file->f_path.dentry; ++ ++ lower_file = wrapfs_lower_file(file); ++ err = vfs_write(lower_file, buf, count, ppos); ++ /* update our inode times+sizes upon a successful lower write */ ++ if (err >= 0) { ++ fsstack_copy_inode_size(dentry->d_inode, ++ lower_file->f_path.dentry->d_inode); ++ fsstack_copy_attr_times(dentry->d_inode, ++ lower_file->f_path.dentry->d_inode); ++ } ++ ++ return err; ++} ++ ++static int wrapfs_readdir(struct file *file, void *dirent, filldir_t filldir) ++{ ++ int err = 0; ++ struct file *lower_file = NULL; ++ struct dentry *dentry = file->f_path.dentry; ++ ++ lower_file = wrapfs_lower_file(file); ++ err = vfs_readdir(lower_file, filldir, dirent); ++ file->f_pos = lower_file->f_pos; ++ if (err >= 0) /* copy the atime */ ++ fsstack_copy_attr_atime(dentry->d_inode, ++ lower_file->f_path.dentry->d_inode); ++ return err; ++} ++ ++static long wrapfs_unlocked_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ long err = -ENOTTY; ++ struct file *lower_file; ++ ++ lower_file = wrapfs_lower_file(file); ++ ++ /* XXX: use vfs_ioctl if/when VFS exports it */ ++ if (!lower_file || !lower_file->f_op) ++ goto out; ++ if (lower_file->f_op->unlocked_ioctl) ++ err = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); ++ ++out: ++ return err; ++} ++ ++#ifdef CONFIG_COMPAT ++static long wrapfs_compat_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ long err = -ENOTTY; ++ struct file *lower_file; ++ ++ lower_file = wrapfs_lower_file(file); ++ ++ /* XXX: use vfs_ioctl if/when VFS exports it */ ++ if (!lower_file || !lower_file->f_op) ++ goto out; ++ if (lower_file->f_op->compat_ioctl) ++ err = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); ++ ++out: ++ return err; ++} ++#endif ++ ++static int wrapfs_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ int err = 0; ++ bool willwrite; ++ struct file *lower_file; ++ const struct vm_operations_struct *saved_vm_ops = NULL; ++ ++ /* this might be deferred to mmap's writepage */ ++ willwrite = ((vma->vm_flags | VM_SHARED | VM_WRITE) == vma->vm_flags); ++ ++ /* ++ * File systems which do not implement ->writepage may use ++ * generic_file_readonly_mmap as their ->mmap op. If you call ++ * generic_file_readonly_mmap with VM_WRITE, you'd get an -EINVAL. ++ * But we cannot call the lower ->mmap op, so we can't tell that ++ * writeable mappings won't work. Therefore, our only choice is to ++ * check if the lower file system supports the ->writepage, and if ++ * not, return EINVAL (the same error that ++ * generic_file_readonly_mmap returns in that case). 
++ */ ++ lower_file = wrapfs_lower_file(file); ++ if (willwrite && !lower_file->f_mapping->a_ops->writepage) { ++ err = -EINVAL; ++ printk(KERN_ERR "wrapfs: lower file system does not " ++ "support writeable mmap\n"); ++ goto out; ++ } ++ ++ /* ++ * find and save lower vm_ops. ++ * ++ * XXX: the VFS should have a cleaner way of finding the lower vm_ops ++ */ ++ if (!WRAPFS_F(file)->lower_vm_ops) { ++ err = lower_file->f_op->mmap(lower_file, vma); ++ if (err) { ++ printk(KERN_ERR "wrapfs: lower mmap failed %d\n", err); ++ goto out; ++ } ++ saved_vm_ops = vma->vm_ops; /* save: came from lower ->mmap */ ++ err = do_munmap(current->mm, vma->vm_start, ++ vma->vm_end - vma->vm_start); ++ if (err) { ++ printk(KERN_ERR "wrapfs: do_munmap failed %d\n", err); ++ goto out; ++ } ++ } ++ ++ /* ++ * Next 3 lines are all I need from generic_file_mmap. I definitely ++ * don't want its test for ->readpage which returns -ENOEXEC. ++ */ ++ file_accessed(file); ++ vma->vm_ops = &wrapfs_vm_ops; ++ vma->vm_flags |= VM_CAN_NONLINEAR; ++ ++ file->f_mapping->a_ops = &wrapfs_aops; /* set our aops */ ++ if (!WRAPFS_F(file)->lower_vm_ops) /* save for our ->fault */ ++ WRAPFS_F(file)->lower_vm_ops = saved_vm_ops; ++ ++out: ++ return err; ++} ++ ++static int wrapfs_open(struct inode *inode, struct file *file) ++{ ++ int err = 0; ++ struct file *lower_file = NULL; ++ struct path lower_path; ++ ++ /* don't open unhashed/deleted files */ ++ if (d_unhashed(file->f_path.dentry)) { ++ err = -ENOENT; ++ goto out_err; ++ } ++ ++ file->private_data = ++ kzalloc(sizeof(struct wrapfs_file_info), GFP_KERNEL); ++ if (!WRAPFS_F(file)) { ++ err = -ENOMEM; ++ goto out_err; ++ } ++ ++ /* open lower object and link wrapfs's file struct to lower's */ ++ wrapfs_get_lower_path(file->f_path.dentry, &lower_path); ++ lower_file = dentry_open(lower_path.dentry, lower_path.mnt, ++ file->f_flags, current_cred()); ++ if (IS_ERR(lower_file)) { ++ err = PTR_ERR(lower_file); ++ lower_file = wrapfs_lower_file(file); ++ if (lower_file) { ++ wrapfs_set_lower_file(file, NULL); ++ fput(lower_file); /* fput calls dput for lower_dentry */ ++ } ++ } else { ++ wrapfs_set_lower_file(file, lower_file); ++ } ++ ++ if (err) ++ kfree(WRAPFS_F(file)); ++ else ++ fsstack_copy_attr_all(inode, wrapfs_lower_inode(inode)); ++out_err: ++ return err; ++} ++ ++static int wrapfs_flush(struct file *file, fl_owner_t id) ++{ ++ int err = 0; ++ struct file *lower_file = NULL; ++ ++ lower_file = wrapfs_lower_file(file); ++ if (lower_file && lower_file->f_op && lower_file->f_op->flush) ++ err = lower_file->f_op->flush(lower_file, id); ++ ++ return err; ++} ++ ++/* release all lower object references & free the file info structure */ ++static int wrapfs_file_release(struct inode *inode, struct file *file) ++{ ++ struct file *lower_file; ++ ++ lower_file = wrapfs_lower_file(file); ++ if (lower_file) { ++ wrapfs_set_lower_file(file, NULL); ++ fput(lower_file); ++ } ++ ++ kfree(WRAPFS_F(file)); ++ return 0; ++} ++ ++static int wrapfs_fsync(struct file *file, loff_t start, loff_t end, ++ int datasync) ++{ ++ int err; ++ struct file *lower_file; ++ struct path lower_path; ++ struct dentry *dentry = file->f_path.dentry; ++ ++ err = generic_file_fsync(file, start, end, datasync); ++ if (err) ++ goto out; ++ lower_file = wrapfs_lower_file(file); ++ wrapfs_get_lower_path(dentry, &lower_path); ++ err = vfs_fsync_range(lower_file, start, end, datasync); ++ wrapfs_put_lower_path(dentry, &lower_path); ++out: ++ return err; ++} ++ ++static int wrapfs_fasync(int fd, struct file *file, int flag) 
++{ ++ int err = 0; ++ struct file *lower_file = NULL; ++ ++ lower_file = wrapfs_lower_file(file); ++ if (lower_file->f_op && lower_file->f_op->fasync) ++ err = lower_file->f_op->fasync(fd, lower_file, flag); ++ ++ return err; ++} ++ ++const struct file_operations wrapfs_main_fops = { ++ .llseek = generic_file_llseek, ++ .read = wrapfs_read, ++ .write = wrapfs_write, ++ .unlocked_ioctl = wrapfs_unlocked_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = wrapfs_compat_ioctl, ++#endif ++ .mmap = wrapfs_mmap, ++ .open = wrapfs_open, ++ .flush = wrapfs_flush, ++ .release = wrapfs_file_release, ++ .fsync = wrapfs_fsync, ++ .fasync = wrapfs_fasync, ++}; ++ ++/* trimmed directory options */ ++const struct file_operations wrapfs_dir_fops = { ++ .llseek = generic_file_llseek, ++ .read = generic_read_dir, ++ .readdir = wrapfs_readdir, ++ .unlocked_ioctl = wrapfs_unlocked_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = wrapfs_compat_ioctl, ++#endif ++ .open = wrapfs_open, ++ .release = wrapfs_file_release, ++ .flush = wrapfs_flush, ++ .fsync = wrapfs_fsync, ++ .fasync = wrapfs_fasync, ++}; +diff --git a/fs/wrapfs/inode.c b/fs/wrapfs/inode.c +new file mode 100644 +index 0000000..1dc3645 +--- /dev/null ++++ b/fs/wrapfs/inode.c +@@ -0,0 +1,514 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include "wrapfs.h" ++ ++static int wrapfs_create(struct inode *dir, struct dentry *dentry, ++ int mode, struct nameidata *nd) ++{ ++ int err = 0; ++ struct dentry *lower_dentry; ++ struct dentry *lower_parent_dentry = NULL; ++ struct path lower_path, saved_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ lower_parent_dentry = lock_parent(lower_dentry); ++ ++ err = mnt_want_write(lower_path.mnt); ++ if (err) ++ goto out_unlock; ++ ++ pathcpy(&saved_path, &nd->path); ++ pathcpy(&nd->path, &lower_path); ++ err = vfs_create(lower_parent_dentry->d_inode, lower_dentry, mode, nd); ++ pathcpy(&nd->path, &saved_path); ++ if (err) ++ goto out; ++ ++ err = wrapfs_interpose(dentry, dir->i_sb, &lower_path); ++ if (err) ++ goto out; ++ fsstack_copy_attr_times(dir, wrapfs_lower_inode(dir)); ++ fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); ++ ++out: ++ mnt_drop_write(lower_path.mnt); ++out_unlock: ++ unlock_dir(lower_parent_dentry); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static int wrapfs_link(struct dentry *old_dentry, struct inode *dir, ++ struct dentry *new_dentry) ++{ ++ struct dentry *lower_old_dentry; ++ struct dentry *lower_new_dentry; ++ struct dentry *lower_dir_dentry; ++ u64 file_size_save; ++ int err; ++ struct path lower_old_path, lower_new_path; ++ ++ file_size_save = i_size_read(old_dentry->d_inode); ++ wrapfs_get_lower_path(old_dentry, &lower_old_path); ++ wrapfs_get_lower_path(new_dentry, &lower_new_path); ++ lower_old_dentry = lower_old_path.dentry; ++ lower_new_dentry = lower_new_path.dentry; ++ lower_dir_dentry = lock_parent(lower_new_dentry); ++ ++ err = mnt_want_write(lower_new_path.mnt); ++ if (err) ++ goto out_unlock; ++ ++ err = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode, ++ lower_new_dentry); ++ if (err || !lower_new_dentry->d_inode) ++ goto out; ++ ++ 
err = wrapfs_interpose(new_dentry, dir->i_sb, &lower_new_path); ++ if (err) ++ goto out; ++ fsstack_copy_attr_times(dir, lower_new_dentry->d_inode); ++ fsstack_copy_inode_size(dir, lower_new_dentry->d_inode); ++ set_nlink(old_dentry->d_inode, ++ wrapfs_lower_inode(old_dentry->d_inode)->i_nlink); ++ i_size_write(new_dentry->d_inode, file_size_save); ++out: ++ mnt_drop_write(lower_new_path.mnt); ++out_unlock: ++ unlock_dir(lower_dir_dentry); ++ wrapfs_put_lower_path(old_dentry, &lower_old_path); ++ wrapfs_put_lower_path(new_dentry, &lower_new_path); ++ return err; ++} ++ ++static int wrapfs_unlink(struct inode *dir, struct dentry *dentry) ++{ ++ int err; ++ struct dentry *lower_dentry; ++ struct inode *lower_dir_inode = wrapfs_lower_inode(dir); ++ struct dentry *lower_dir_dentry; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ dget(lower_dentry); ++ lower_dir_dentry = lock_parent(lower_dentry); ++ ++ err = mnt_want_write(lower_path.mnt); ++ if (err) ++ goto out_unlock; ++ err = vfs_unlink(lower_dir_inode, lower_dentry); ++ ++ /* ++ * Note: unlinking on top of NFS can cause silly-renamed files. ++ * Trying to delete such files results in EBUSY from NFS ++ * below. Silly-renamed files will get deleted by NFS later on, so ++ * we just need to detect them here and treat such EBUSY errors as ++ * if the upper file was successfully deleted. ++ */ ++ if (err == -EBUSY && lower_dentry->d_flags & DCACHE_NFSFS_RENAMED) ++ err = 0; ++ if (err) ++ goto out; ++ fsstack_copy_attr_times(dir, lower_dir_inode); ++ fsstack_copy_inode_size(dir, lower_dir_inode); ++ set_nlink(dentry->d_inode, ++ wrapfs_lower_inode(dentry->d_inode)->i_nlink); ++ dentry->d_inode->i_ctime = dir->i_ctime; ++ d_drop(dentry); /* this is needed, else LTP fails (VFS won't do it) */ ++out: ++ mnt_drop_write(lower_path.mnt); ++out_unlock: ++ unlock_dir(lower_dir_dentry); ++ dput(lower_dentry); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static int wrapfs_symlink(struct inode *dir, struct dentry *dentry, ++ const char *symname) ++{ ++ int err = 0; ++ struct dentry *lower_dentry; ++ struct dentry *lower_parent_dentry = NULL; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ lower_parent_dentry = lock_parent(lower_dentry); ++ ++ err = mnt_want_write(lower_path.mnt); ++ if (err) ++ goto out_unlock; ++ err = vfs_symlink(lower_parent_dentry->d_inode, lower_dentry, symname); ++ if (err) ++ goto out; ++ err = wrapfs_interpose(dentry, dir->i_sb, &lower_path); ++ if (err) ++ goto out; ++ fsstack_copy_attr_times(dir, wrapfs_lower_inode(dir)); ++ fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); ++ ++out: ++ mnt_drop_write(lower_path.mnt); ++out_unlock: ++ unlock_dir(lower_parent_dentry); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static int wrapfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) ++{ ++ int err = 0; ++ struct dentry *lower_dentry; ++ struct dentry *lower_parent_dentry = NULL; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ lower_parent_dentry = lock_parent(lower_dentry); ++ ++ err = mnt_want_write(lower_path.mnt); ++ if (err) ++ goto out_unlock; ++ err = vfs_mkdir(lower_parent_dentry->d_inode, lower_dentry, mode); ++ if (err) ++ goto out; ++ ++ err = wrapfs_interpose(dentry, dir->i_sb, &lower_path); ++ if (err) ++ goto out; ++ ++ fsstack_copy_attr_times(dir, 
wrapfs_lower_inode(dir)); ++ fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); ++ /* update number of links on parent directory */ ++ set_nlink(dir, wrapfs_lower_inode(dir)->i_nlink); ++ ++out: ++ mnt_drop_write(lower_path.mnt); ++out_unlock: ++ unlock_dir(lower_parent_dentry); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static int wrapfs_rmdir(struct inode *dir, struct dentry *dentry) ++{ ++ struct dentry *lower_dentry; ++ struct dentry *lower_dir_dentry; ++ int err; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ lower_dir_dentry = lock_parent(lower_dentry); ++ ++ err = mnt_want_write(lower_path.mnt); ++ if (err) ++ goto out_unlock; ++ err = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry); ++ if (err) ++ goto out; ++ ++ d_drop(dentry); /* drop our dentry on success (why not VFS's job?) */ ++ if (dentry->d_inode) ++ clear_nlink(dentry->d_inode); ++ fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); ++ fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); ++ set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); ++ ++out: ++ mnt_drop_write(lower_path.mnt); ++out_unlock: ++ unlock_dir(lower_dir_dentry); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static int wrapfs_mknod(struct inode *dir, struct dentry *dentry, int mode, ++ dev_t dev) ++{ ++ int err = 0; ++ struct dentry *lower_dentry; ++ struct dentry *lower_parent_dentry = NULL; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ lower_parent_dentry = lock_parent(lower_dentry); ++ ++ err = mnt_want_write(lower_path.mnt); ++ if (err) ++ goto out_unlock; ++ err = vfs_mknod(lower_parent_dentry->d_inode, lower_dentry, mode, dev); ++ if (err) ++ goto out; ++ ++ err = wrapfs_interpose(dentry, dir->i_sb, &lower_path); ++ if (err) ++ goto out; ++ fsstack_copy_attr_times(dir, wrapfs_lower_inode(dir)); ++ fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); ++ ++out: ++ mnt_drop_write(lower_path.mnt); ++out_unlock: ++ unlock_dir(lower_parent_dentry); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++/* ++ * The locking rules in wrapfs_rename are complex. We could use a simpler ++ * superblock-level name-space lock for renames and copy-ups. 
++ */ ++static int wrapfs_rename(struct inode *old_dir, struct dentry *old_dentry, ++ struct inode *new_dir, struct dentry *new_dentry) ++{ ++ int err = 0; ++ struct dentry *lower_old_dentry = NULL; ++ struct dentry *lower_new_dentry = NULL; ++ struct dentry *lower_old_dir_dentry = NULL; ++ struct dentry *lower_new_dir_dentry = NULL; ++ struct dentry *trap = NULL; ++ struct path lower_old_path, lower_new_path; ++ ++ wrapfs_get_lower_path(old_dentry, &lower_old_path); ++ wrapfs_get_lower_path(new_dentry, &lower_new_path); ++ lower_old_dentry = lower_old_path.dentry; ++ lower_new_dentry = lower_new_path.dentry; ++ lower_old_dir_dentry = dget_parent(lower_old_dentry); ++ lower_new_dir_dentry = dget_parent(lower_new_dentry); ++ ++ trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); ++ /* source should not be ancestor of target */ ++ if (trap == lower_old_dentry) { ++ err = -EINVAL; ++ goto out; ++ } ++ /* target should not be ancestor of source */ ++ if (trap == lower_new_dentry) { ++ err = -ENOTEMPTY; ++ goto out; ++ } ++ ++ err = mnt_want_write(lower_old_path.mnt); ++ if (err) ++ goto out; ++ err = mnt_want_write(lower_new_path.mnt); ++ if (err) ++ goto out_drop_old_write; ++ ++ err = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, ++ lower_new_dir_dentry->d_inode, lower_new_dentry); ++ if (err) ++ goto out_err; ++ ++ fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); ++ fsstack_copy_inode_size(new_dir, lower_new_dir_dentry->d_inode); ++ if (new_dir != old_dir) { ++ fsstack_copy_attr_all(old_dir, ++ lower_old_dir_dentry->d_inode); ++ fsstack_copy_inode_size(old_dir, ++ lower_old_dir_dentry->d_inode); ++ } ++ ++out_err: ++ mnt_drop_write(lower_new_path.mnt); ++out_drop_old_write: ++ mnt_drop_write(lower_old_path.mnt); ++out: ++ unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); ++ dput(lower_old_dir_dentry); ++ dput(lower_new_dir_dentry); ++ wrapfs_put_lower_path(old_dentry, &lower_old_path); ++ wrapfs_put_lower_path(new_dentry, &lower_new_path); ++ return err; ++} ++ ++static int wrapfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) ++{ ++ int err; ++ struct dentry *lower_dentry; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ if (!lower_dentry->d_inode->i_op || ++ !lower_dentry->d_inode->i_op->readlink) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ err = lower_dentry->d_inode->i_op->readlink(lower_dentry, ++ buf, bufsiz); ++ if (err < 0) ++ goto out; ++ fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode); ++ ++out: ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static void *wrapfs_follow_link(struct dentry *dentry, struct nameidata *nd) ++{ ++ char *buf; ++ int len = PAGE_SIZE, err; ++ mm_segment_t old_fs; ++ ++ /* This is freed by the put_link method assuming a successful call. 
*/ ++ buf = kmalloc(len, GFP_KERNEL); ++ if (!buf) { ++ buf = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ /* read the symlink, and then we will follow it */ ++ old_fs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = wrapfs_readlink(dentry, buf, len); ++ set_fs(old_fs); ++ if (err < 0) { ++ kfree(buf); ++ buf = ERR_PTR(err); ++ } else { ++ buf[err] = '\0'; ++ } ++out: ++ nd_set_link(nd, buf); ++ return NULL; ++} ++ ++/* this @nd *IS* still used */ ++static void wrapfs_put_link(struct dentry *dentry, struct nameidata *nd, ++ void *cookie) ++{ ++ char *buf = nd_get_link(nd); ++ if (!IS_ERR(buf)) /* free the char* */ ++ kfree(buf); ++} ++ ++static int wrapfs_permission(struct inode *inode, int mask) ++{ ++ struct inode *lower_inode; ++ int err; ++ ++ lower_inode = wrapfs_lower_inode(inode); ++ err = inode_permission(lower_inode, mask); ++ return err; ++} ++ ++static int wrapfs_setattr(struct dentry *dentry, struct iattr *ia) ++{ ++ int err = 0; ++ struct dentry *lower_dentry; ++ struct inode *inode; ++ struct inode *lower_inode; ++ struct path lower_path; ++ struct iattr lower_ia; ++ ++ inode = dentry->d_inode; ++ ++ /* ++ * Check if user has permission to change inode. We don't check if ++ * this user can change the lower inode: that should happen when ++ * calling notify_change on the lower inode. ++ */ ++ err = inode_change_ok(inode, ia); ++ if (err) ++ goto out_err; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ lower_inode = wrapfs_lower_inode(inode); ++ ++ /* prepare our own lower struct iattr (with the lower file) */ ++ memcpy(&lower_ia, ia, sizeof(lower_ia)); ++ if (ia->ia_valid & ATTR_FILE) ++ lower_ia.ia_file = wrapfs_lower_file(ia->ia_file); ++ ++ /* ++ * If shrinking, first truncate upper level to cancel writing dirty ++ * pages beyond the new eof; and also if its' maxbytes is more ++ * limiting (fail with -EFBIG before making any change to the lower ++ * level). There is no need to vmtruncate the upper level ++ * afterwards in the other cases: we fsstack_copy_inode_size from ++ * the lower level. ++ */ ++ if (ia->ia_valid & ATTR_SIZE) { ++ err = inode_newsize_ok(inode, ia->ia_size); ++ if (err) ++ goto out; ++ truncate_setsize(inode, ia->ia_size); ++ } ++ ++ /* ++ * mode change is for clearing setuid/setgid bits. Allow lower fs ++ * to interpret this in its own way. ++ */ ++ if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) ++ lower_ia.ia_valid &= ~ATTR_MODE; ++ ++ /* notify the (possibly copied-up) lower inode */ ++ /* ++ * Note: we use lower_dentry->d_inode, because lower_inode may be ++ * unlinked (no inode->i_sb and i_ino==0. This happens if someone ++ * tries to open(), unlink(), then ftruncate() a file. ++ */ ++ mutex_lock(&lower_dentry->d_inode->i_mutex); ++ err = notify_change(lower_dentry, &lower_ia); /* note: lower_ia */ ++ mutex_unlock(&lower_dentry->d_inode->i_mutex); ++ if (err) ++ goto out; ++ ++ /* get attributes from the lower inode */ ++ fsstack_copy_attr_all(inode, lower_inode); ++ /* ++ * Not running fsstack_copy_inode_size(inode, lower_inode), because ++ * VFS should update our inode size, and notify_change on ++ * lower_inode should update its size. 
++ */ ++ ++out: ++ wrapfs_put_lower_path(dentry, &lower_path); ++out_err: ++ return err; ++} ++ ++const struct inode_operations wrapfs_symlink_iops = { ++ .readlink = wrapfs_readlink, ++ .permission = wrapfs_permission, ++ .follow_link = wrapfs_follow_link, ++ .setattr = wrapfs_setattr, ++ .put_link = wrapfs_put_link, ++}; ++ ++const struct inode_operations wrapfs_dir_iops = { ++ .create = wrapfs_create, ++ .lookup = wrapfs_lookup, ++ .link = wrapfs_link, ++ .unlink = wrapfs_unlink, ++ .symlink = wrapfs_symlink, ++ .mkdir = wrapfs_mkdir, ++ .rmdir = wrapfs_rmdir, ++ .mknod = wrapfs_mknod, ++ .rename = wrapfs_rename, ++ .permission = wrapfs_permission, ++ .setattr = wrapfs_setattr, ++}; ++ ++const struct inode_operations wrapfs_main_iops = { ++ .permission = wrapfs_permission, ++ .setattr = wrapfs_setattr, ++}; +diff --git a/fs/wrapfs/lookup.c b/fs/wrapfs/lookup.c +new file mode 100644 +index 0000000..325b2ba +--- /dev/null ++++ b/fs/wrapfs/lookup.c +@@ -0,0 +1,304 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include "wrapfs.h" ++ ++/* The dentry cache is just so we have properly sized dentries */ ++static struct kmem_cache *wrapfs_dentry_cachep; ++ ++int wrapfs_init_dentry_cache(void) ++{ ++ wrapfs_dentry_cachep = ++ kmem_cache_create("wrapfs_dentry", ++ sizeof(struct wrapfs_dentry_info), ++ 0, SLAB_RECLAIM_ACCOUNT, NULL); ++ ++ return wrapfs_dentry_cachep ? 0 : -ENOMEM; ++} ++ ++void wrapfs_destroy_dentry_cache(void) ++{ ++ if (wrapfs_dentry_cachep) ++ kmem_cache_destroy(wrapfs_dentry_cachep); ++} ++ ++void free_dentry_private_data(struct dentry *dentry) ++{ ++ if (!dentry || !dentry->d_fsdata) ++ return; ++ kmem_cache_free(wrapfs_dentry_cachep, dentry->d_fsdata); ++ dentry->d_fsdata = NULL; ++} ++ ++/* allocate new dentry private data */ ++int new_dentry_private_data(struct dentry *dentry) ++{ ++ struct wrapfs_dentry_info *info = WRAPFS_D(dentry); ++ ++ /* use zalloc to init dentry_info.lower_path */ ++ info = kmem_cache_zalloc(wrapfs_dentry_cachep, GFP_ATOMIC); ++ if (!info) ++ return -ENOMEM; ++ ++ spin_lock_init(&info->lock); ++ dentry->d_fsdata = info; ++ ++ return 0; ++} ++ ++static int wrapfs_inode_test(struct inode *inode, void *candidate_lower_inode) ++{ ++ struct inode *current_lower_inode = wrapfs_lower_inode(inode); ++ if (current_lower_inode == (struct inode *)candidate_lower_inode) ++ return 1; /* found a match */ ++ else ++ return 0; /* no match */ ++} ++ ++static int wrapfs_inode_set(struct inode *inode, void *lower_inode) ++{ ++ /* we do actual inode initialization in wrapfs_iget */ ++ return 0; ++} ++ ++struct inode *wrapfs_iget(struct super_block *sb, struct inode *lower_inode) ++{ ++ struct wrapfs_inode_info *info; ++ struct inode *inode; /* the new inode to return */ ++ int err; ++ ++ inode = iget5_locked(sb, /* our superblock */ ++ /* ++ * hashval: we use inode number, but we can ++ * also use "(unsigned long)lower_inode" ++ * instead. 
++ */ ++ lower_inode->i_ino, /* hashval */ ++ wrapfs_inode_test, /* inode comparison function */ ++ wrapfs_inode_set, /* inode init function */ ++ lower_inode); /* data passed to test+set fxns */ ++ if (!inode) { ++ err = -EACCES; ++ iput(lower_inode); ++ return ERR_PTR(err); ++ } ++ /* if found a cached inode, then just return it */ ++ if (!(inode->i_state & I_NEW)) ++ return inode; ++ ++ /* initialize new inode */ ++ info = WRAPFS_I(inode); ++ ++ inode->i_ino = lower_inode->i_ino; ++ if (!igrab(lower_inode)) { ++ err = -ESTALE; ++ return ERR_PTR(err); ++ } ++ wrapfs_set_lower_inode(inode, lower_inode); ++ ++ inode->i_version++; ++ ++ /* use different set of inode ops for symlinks & directories */ ++ if (S_ISDIR(lower_inode->i_mode)) ++ inode->i_op = &wrapfs_dir_iops; ++ else if (S_ISLNK(lower_inode->i_mode)) ++ inode->i_op = &wrapfs_symlink_iops; ++ else ++ inode->i_op = &wrapfs_main_iops; ++ ++ /* use different set of file ops for directories */ ++ if (S_ISDIR(lower_inode->i_mode)) ++ inode->i_fop = &wrapfs_dir_fops; ++ else ++ inode->i_fop = &wrapfs_main_fops; ++ ++ inode->i_mapping->a_ops = &wrapfs_aops; ++ ++ inode->i_atime.tv_sec = 0; ++ inode->i_atime.tv_nsec = 0; ++ inode->i_mtime.tv_sec = 0; ++ inode->i_mtime.tv_nsec = 0; ++ inode->i_ctime.tv_sec = 0; ++ inode->i_ctime.tv_nsec = 0; ++ ++ /* properly initialize special inodes */ ++ if (S_ISBLK(lower_inode->i_mode) || S_ISCHR(lower_inode->i_mode) || ++ S_ISFIFO(lower_inode->i_mode) || S_ISSOCK(lower_inode->i_mode)) ++ init_special_inode(inode, lower_inode->i_mode, ++ lower_inode->i_rdev); ++ ++ /* all well, copy inode attributes */ ++ fsstack_copy_attr_all(inode, lower_inode); ++ fsstack_copy_inode_size(inode, lower_inode); ++ ++ unlock_new_inode(inode); ++ return inode; ++} ++ ++/* ++ * Connect a wrapfs inode dentry/inode with several lower ones. This is ++ * the classic stackable file system "vnode interposition" action. ++ * ++ * @dentry: wrapfs's dentry which interposes on lower one ++ * @sb: wrapfs's super_block ++ * @lower_path: the lower path (caller does path_get/put) ++ */ ++int wrapfs_interpose(struct dentry *dentry, struct super_block *sb, ++ struct path *lower_path) ++{ ++ int err = 0; ++ struct inode *inode; ++ struct inode *lower_inode; ++ struct super_block *lower_sb; ++ ++ lower_inode = lower_path->dentry->d_inode; ++ lower_sb = wrapfs_lower_super(sb); ++ ++ /* check that the lower file system didn't cross a mount point */ ++ if (lower_inode->i_sb != lower_sb) { ++ err = -EXDEV; ++ goto out; ++ } ++ ++ /* ++ * We allocate our new inode below by calling wrapfs_iget, ++ * which will initialize some of the new inode's fields ++ */ ++ ++ /* inherit lower inode number for wrapfs's inode */ ++ inode = wrapfs_iget(sb, lower_inode); ++ if (IS_ERR(inode)) { ++ err = PTR_ERR(inode); ++ goto out; ++ } ++ ++ d_add(dentry, inode); ++ ++out: ++ return err; ++} ++ ++/* ++ * Main driver function for wrapfs's lookup. ++ * ++ * Returns: NULL (ok), ERR_PTR if an error occurred. ++ * Fills in lower_parent_path with on success. 
++ */ ++static struct dentry *__wrapfs_lookup(struct dentry *dentry, int flags, ++ struct path *lower_parent_path) ++{ ++ int err = 0; ++ struct vfsmount *lower_dir_mnt; ++ struct dentry *lower_dir_dentry = NULL; ++ struct dentry *lower_dentry; ++ const char *name; ++ struct path lower_path; ++ struct qstr this; ++ ++ /* must initialize dentry operations */ ++ d_set_d_op(dentry, &wrapfs_dops); ++ ++ if (IS_ROOT(dentry)) ++ goto out; ++ ++ name = dentry->d_name.name; ++ ++ /* now start the actual lookup procedure */ ++ lower_dir_dentry = lower_parent_path->dentry; ++ lower_dir_mnt = lower_parent_path->mnt; ++ ++ /* Use vfs_path_lookup to check if the dentry exists or not */ ++ err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, 0, ++ &lower_path); ++ ++ /* no error: handle positive dentries */ ++ if (!err) { ++ wrapfs_set_lower_path(dentry, &lower_path); ++ err = wrapfs_interpose(dentry, dentry->d_sb, &lower_path); ++ if (err) /* path_put underlying path on error */ ++ wrapfs_put_reset_lower_path(dentry); ++ goto out; ++ } ++ ++ /* ++ * We don't consider ENOENT an error, and we want to return a ++ * negative dentry. ++ */ ++ if (err && err != -ENOENT) ++ goto out; ++ ++ /* instatiate a new negative dentry */ ++ this.name = name; ++ this.len = strlen(name); ++ this.hash = full_name_hash(this.name, this.len); ++ lower_dentry = d_lookup(lower_dir_dentry, &this); ++ if (lower_dentry) ++ goto setup_lower; ++ ++ lower_dentry = d_alloc(lower_dir_dentry, &this); ++ if (!lower_dentry) { ++ err = -ENOMEM; ++ goto out; ++ } ++ d_add(lower_dentry, NULL); /* instantiate and hash */ ++ ++setup_lower: ++ lower_path.dentry = lower_dentry; ++ lower_path.mnt = mntget(lower_dir_mnt); ++ wrapfs_set_lower_path(dentry, &lower_path); ++ ++ /* ++ * If the intent is to create a file, then don't return an error, so ++ * the VFS will continue the process of making this negative dentry ++ * into a positive one. ++ */ ++ if (flags & (LOOKUP_CREATE|LOOKUP_RENAME_TARGET)) ++ err = 0; ++ ++out: ++ return ERR_PTR(err); ++} ++ ++struct dentry *wrapfs_lookup(struct inode *dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct dentry *ret, *parent; ++ struct path lower_parent_path; ++ int err = 0; ++ ++ BUG_ON(!nd); ++ parent = dget_parent(dentry); ++ ++ wrapfs_get_lower_path(parent, &lower_parent_path); ++ ++ /* allocate dentry private data. We free it in ->d_release */ ++ err = new_dentry_private_data(dentry); ++ if (err) { ++ ret = ERR_PTR(err); ++ goto out; ++ } ++ ret = __wrapfs_lookup(dentry, nd->flags, &lower_parent_path); ++ if (IS_ERR(ret)) ++ goto out; ++ if (ret) ++ dentry = ret; ++ if (dentry->d_inode) ++ fsstack_copy_attr_times(dentry->d_inode, ++ wrapfs_lower_inode(dentry->d_inode)); ++ /* update parent directory's atime */ ++ fsstack_copy_attr_atime(parent->d_inode, ++ wrapfs_lower_inode(parent->d_inode)); ++ ++out: ++ wrapfs_put_lower_path(parent, &lower_parent_path); ++ dput(parent); ++ return ret; ++} +diff --git a/fs/wrapfs/main.c b/fs/wrapfs/main.c +new file mode 100644 +index 0000000..130aca6 +--- /dev/null ++++ b/fs/wrapfs/main.c +@@ -0,0 +1,173 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. 
++ */ ++ ++#include "wrapfs.h" ++#include ++ ++/* ++ * There is no need to lock the wrapfs_super_info's rwsem as there is no ++ * way anyone can have a reference to the superblock at this point in time. ++ */ ++static int wrapfs_read_super(struct super_block *sb, void *raw_data, int silent) ++{ ++ int err = 0; ++ struct super_block *lower_sb; ++ struct path lower_path; ++ char *dev_name = (char *) raw_data; ++ struct inode *inode; ++ ++ if (!dev_name) { ++ printk(KERN_ERR ++ "wrapfs: read_super: missing dev_name argument\n"); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ /* parse lower path */ ++ err = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, ++ &lower_path); ++ if (err) { ++ printk(KERN_ERR "wrapfs: error accessing " ++ "lower directory '%s'\n", dev_name); ++ goto out; ++ } ++ ++ /* allocate superblock private data */ ++ sb->s_fs_info = kzalloc(sizeof(struct wrapfs_sb_info), GFP_KERNEL); ++ if (!WRAPFS_SB(sb)) { ++ printk(KERN_CRIT "wrapfs: read_super: out of memory\n"); ++ err = -ENOMEM; ++ goto out_free; ++ } ++ ++ /* set the lower superblock field of upper superblock */ ++ lower_sb = lower_path.dentry->d_sb; ++ atomic_inc(&lower_sb->s_active); ++ wrapfs_set_lower_super(sb, lower_sb); ++ ++ /* inherit maxbytes from lower file system */ ++ sb->s_maxbytes = lower_sb->s_maxbytes; ++ ++ /* ++ * Our c/m/atime granularity is 1 ns because we may stack on file ++ * systems whose granularity is as good. ++ */ ++ sb->s_time_gran = 1; ++ ++ sb->s_op = &wrapfs_sops; ++ ++ /* get a new inode and allocate our root dentry */ ++ inode = wrapfs_iget(sb, lower_path.dentry->d_inode); ++ if (IS_ERR(inode)) { ++ err = PTR_ERR(inode); ++ goto out_sput; ++ } ++ sb->s_root = d_alloc_root(inode); ++ if (!sb->s_root) { ++ err = -ENOMEM; ++ goto out_iput; ++ } ++ d_set_d_op(sb->s_root, &wrapfs_dops); ++ ++ /* link the upper and lower dentries */ ++ sb->s_root->d_fsdata = NULL; ++ err = new_dentry_private_data(sb->s_root); ++ if (err) ++ goto out_freeroot; ++ ++ /* if get here: cannot have error */ ++ ++ /* set the lower dentries for s_root */ ++ wrapfs_set_lower_path(sb->s_root, &lower_path); ++ ++ /* ++ * No need to call interpose because we already have a positive ++ * dentry, which was instantiated by d_alloc_root. Just need to ++ * d_rehash it. 
++ */ ++ d_rehash(sb->s_root); ++ if (!silent) ++ printk(KERN_INFO ++ "wrapfs: mounted on top of %s type %s\n", ++ dev_name, lower_sb->s_type->name); ++ goto out; /* all is well */ ++ ++ /* no longer needed: free_dentry_private_data(sb->s_root); */ ++out_freeroot: ++ dput(sb->s_root); ++out_iput: ++ iput(inode); ++out_sput: ++ /* drop refs we took earlier */ ++ atomic_dec(&lower_sb->s_active); ++ kfree(WRAPFS_SB(sb)); ++ sb->s_fs_info = NULL; ++out_free: ++ path_put(&lower_path); ++ ++out: ++ return err; ++} ++ ++struct dentry *wrapfs_mount(struct file_system_type *fs_type, int flags, ++ const char *dev_name, void *raw_data) ++{ ++ void *lower_path_name = (void *) dev_name; ++ ++ return mount_nodev(fs_type, flags, lower_path_name, ++ wrapfs_read_super); ++} ++ ++static struct file_system_type wrapfs_fs_type = { ++ .owner = THIS_MODULE, ++ .name = WRAPFS_NAME, ++ .mount = wrapfs_mount, ++ .kill_sb = generic_shutdown_super, ++ .fs_flags = FS_REVAL_DOT, ++}; ++ ++static int __init init_wrapfs_fs(void) ++{ ++ int err; ++ ++ pr_info("Registering wrapfs " WRAPFS_VERSION "\n"); ++ ++ err = wrapfs_init_inode_cache(); ++ if (err) ++ goto out; ++ err = wrapfs_init_dentry_cache(); ++ if (err) ++ goto out; ++ err = register_filesystem(&wrapfs_fs_type); ++out: ++ if (err) { ++ wrapfs_destroy_inode_cache(); ++ wrapfs_destroy_dentry_cache(); ++ } ++ return err; ++} ++ ++static void __exit exit_wrapfs_fs(void) ++{ ++ wrapfs_destroy_inode_cache(); ++ wrapfs_destroy_dentry_cache(); ++ unregister_filesystem(&wrapfs_fs_type); ++ pr_info("Completed wrapfs module unload\n"); ++} ++ ++MODULE_AUTHOR("Erez Zadok, Filesystems and Storage Lab, Stony Brook University" ++ " (http://www.fsl.cs.sunysb.edu/)"); ++MODULE_DESCRIPTION("Wrapfs " WRAPFS_VERSION ++ " (http://wrapfs.filesystems.org/)"); ++MODULE_LICENSE("GPL"); ++ ++module_init(init_wrapfs_fs); ++module_exit(exit_wrapfs_fs); +diff --git a/fs/wrapfs/mmap.c b/fs/wrapfs/mmap.c +new file mode 100644 +index 0000000..c224fc3 +--- /dev/null ++++ b/fs/wrapfs/mmap.c +@@ -0,0 +1,53 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include "wrapfs.h" ++ ++static int wrapfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ int err; ++ struct file *file, *lower_file; ++ const struct vm_operations_struct *lower_vm_ops; ++ struct vm_area_struct lower_vma; ++ ++ memcpy(&lower_vma, vma, sizeof(struct vm_area_struct)); ++ file = lower_vma.vm_file; ++ lower_vm_ops = WRAPFS_F(file)->lower_vm_ops; ++ BUG_ON(!lower_vm_ops); ++ ++ lower_file = wrapfs_lower_file(file); ++ /* ++ * XXX: vm_ops->fault may be called in parallel. Because we have to ++ * resort to temporarily changing the vma->vm_file to point to the ++ * lower file, a concurrent invocation of wrapfs_fault could see a ++ * different value. In this workaround, we keep a different copy of ++ * the vma structure in our stack, so we never expose a different ++ * value of the vma->vm_file called to us, even temporarily. A ++ * better fix would be to change the calling semantics of ->fault to ++ * take an explicit file pointer. 
++ */ ++ lower_vma.vm_file = lower_file; ++ err = lower_vm_ops->fault(&lower_vma, vmf); ++ return err; ++} ++ ++/* ++ * XXX: the default address_space_ops for wrapfs is empty. We cannot set ++ * our inode->i_mapping->a_ops to NULL because too many code paths expect ++ * the a_ops vector to be non-NULL. ++ */ ++const struct address_space_operations wrapfs_aops = { ++ /* empty on purpose */ ++}; ++ ++const struct vm_operations_struct wrapfs_vm_ops = { ++ .fault = wrapfs_fault, ++}; +diff --git a/fs/wrapfs/super.c b/fs/wrapfs/super.c +new file mode 100644 +index 0000000..89d277d +--- /dev/null ++++ b/fs/wrapfs/super.c +@@ -0,0 +1,168 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include "wrapfs.h" ++ ++/* ++ * The inode cache is used with alloc_inode for both our inode info and the ++ * vfs inode. ++ */ ++static struct kmem_cache *wrapfs_inode_cachep; ++ ++/* final actions when unmounting a file system */ ++static void wrapfs_put_super(struct super_block *sb) ++{ ++ struct wrapfs_sb_info *spd; ++ struct super_block *s; ++ ++ spd = WRAPFS_SB(sb); ++ if (!spd) ++ return; ++ ++ /* decrement lower super references */ ++ s = wrapfs_lower_super(sb); ++ wrapfs_set_lower_super(sb, NULL); ++ atomic_dec(&s->s_active); ++ ++ kfree(spd); ++ sb->s_fs_info = NULL; ++} ++ ++static int wrapfs_statfs(struct dentry *dentry, struct kstatfs *buf) ++{ ++ int err; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ err = vfs_statfs(&lower_path, buf); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ ++ /* set return buf to our f/s to avoid confusing user-level utils */ ++ buf->f_type = WRAPFS_SUPER_MAGIC; ++ ++ return err; ++} ++ ++/* ++ * @flags: numeric mount options ++ * @options: mount options string ++ */ ++static int wrapfs_remount_fs(struct super_block *sb, int *flags, char *options) ++{ ++ int err = 0; ++ ++ /* ++ * The VFS will take care of "ro" and "rw" flags among others. We ++ * can safely accept a few flags (RDONLY, MANDLOCK), and honor ++ * SILENT, but anything else left over is an error. ++ */ ++ if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT)) != 0) { ++ printk(KERN_ERR ++ "wrapfs: remount flags 0x%x unsupported\n", *flags); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++/* ++ * Called by iput() when the inode reference count reached zero ++ * and the inode is not hashed anywhere. Used to clear anything ++ * that needs to be, before the inode is completely destroyed and put ++ * on the inode free list. ++ */ ++static void wrapfs_evict_inode(struct inode *inode) ++{ ++ struct inode *lower_inode; ++ ++ truncate_inode_pages(&inode->i_data, 0); ++ end_writeback(inode); ++ /* ++ * Decrement a reference to a lower_inode, which was incremented ++ * by our read_inode when it was created initially. 
++ */ ++ lower_inode = wrapfs_lower_inode(inode); ++ wrapfs_set_lower_inode(inode, NULL); ++ iput(lower_inode); ++} ++ ++static struct inode *wrapfs_alloc_inode(struct super_block *sb) ++{ ++ struct wrapfs_inode_info *i; ++ ++ i = kmem_cache_alloc(wrapfs_inode_cachep, GFP_KERNEL); ++ if (!i) ++ return NULL; ++ ++ /* memset everything up to the inode to 0 */ ++ memset(i, 0, offsetof(struct wrapfs_inode_info, vfs_inode)); ++ ++ i->vfs_inode.i_version = 1; ++ return &i->vfs_inode; ++} ++ ++static void wrapfs_destroy_inode(struct inode *inode) ++{ ++ kmem_cache_free(wrapfs_inode_cachep, WRAPFS_I(inode)); ++} ++ ++/* wrapfs inode cache constructor */ ++static void init_once(void *obj) ++{ ++ struct wrapfs_inode_info *i = obj; ++ ++ inode_init_once(&i->vfs_inode); ++} ++ ++int wrapfs_init_inode_cache(void) ++{ ++ int err = 0; ++ ++ wrapfs_inode_cachep = ++ kmem_cache_create("wrapfs_inode_cache", ++ sizeof(struct wrapfs_inode_info), 0, ++ SLAB_RECLAIM_ACCOUNT, init_once); ++ if (!wrapfs_inode_cachep) ++ err = -ENOMEM; ++ return err; ++} ++ ++/* wrapfs inode cache destructor */ ++void wrapfs_destroy_inode_cache(void) ++{ ++ if (wrapfs_inode_cachep) ++ kmem_cache_destroy(wrapfs_inode_cachep); ++} ++ ++/* ++ * Used only in nfs, to kill any pending RPC tasks, so that subsequent ++ * code can actually succeed and won't leave tasks that need handling. ++ */ ++static void wrapfs_umount_begin(struct super_block *sb) ++{ ++ struct super_block *lower_sb; ++ ++ lower_sb = wrapfs_lower_super(sb); ++ if (lower_sb && lower_sb->s_op && lower_sb->s_op->umount_begin) ++ lower_sb->s_op->umount_begin(lower_sb); ++} ++ ++const struct super_operations wrapfs_sops = { ++ .put_super = wrapfs_put_super, ++ .statfs = wrapfs_statfs, ++ .remount_fs = wrapfs_remount_fs, ++ .evict_inode = wrapfs_evict_inode, ++ .umount_begin = wrapfs_umount_begin, ++ .show_options = generic_show_options, ++ .alloc_inode = wrapfs_alloc_inode, ++ .destroy_inode = wrapfs_destroy_inode, ++ .drop_inode = generic_delete_inode, ++}; +diff --git a/fs/wrapfs/wrapfs.h b/fs/wrapfs/wrapfs.h +new file mode 100644 +index 0000000..25b5795 +--- /dev/null ++++ b/fs/wrapfs/wrapfs.h +@@ -0,0 +1,204 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. 
++ */ ++ ++#ifndef _WRAPFS_H_ ++#define _WRAPFS_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* the file system name */ ++#define WRAPFS_NAME "wrapfs" ++ ++/* wrapfs root inode number */ ++#define WRAPFS_ROOT_INO 1 ++ ++/* useful for tracking code reachability */ ++#define UDBG printk(KERN_DEFAULT "DBG:%s:%s:%d\n", __FILE__, __func__, __LINE__) ++ ++/* operations vectors defined in specific files */ ++extern const struct file_operations wrapfs_main_fops; ++extern const struct file_operations wrapfs_dir_fops; ++extern const struct inode_operations wrapfs_main_iops; ++extern const struct inode_operations wrapfs_dir_iops; ++extern const struct inode_operations wrapfs_symlink_iops; ++extern const struct super_operations wrapfs_sops; ++extern const struct dentry_operations wrapfs_dops; ++extern const struct address_space_operations wrapfs_aops, wrapfs_dummy_aops; ++extern const struct vm_operations_struct wrapfs_vm_ops; ++ ++extern int wrapfs_init_inode_cache(void); ++extern void wrapfs_destroy_inode_cache(void); ++extern int wrapfs_init_dentry_cache(void); ++extern void wrapfs_destroy_dentry_cache(void); ++extern int new_dentry_private_data(struct dentry *dentry); ++extern void free_dentry_private_data(struct dentry *dentry); ++extern struct dentry *wrapfs_lookup(struct inode *dir, struct dentry *dentry, ++ struct nameidata *nd); ++extern struct inode *wrapfs_iget(struct super_block *sb, ++ struct inode *lower_inode); ++extern int wrapfs_interpose(struct dentry *dentry, struct super_block *sb, ++ struct path *lower_path); ++ ++/* file private data */ ++struct wrapfs_file_info { ++ struct file *lower_file; ++ const struct vm_operations_struct *lower_vm_ops; ++}; ++ ++/* wrapfs inode data in memory */ ++struct wrapfs_inode_info { ++ struct inode *lower_inode; ++ struct inode vfs_inode; ++}; ++ ++/* wrapfs dentry data in memory */ ++struct wrapfs_dentry_info { ++ spinlock_t lock; /* protects lower_path */ ++ struct path lower_path; ++}; ++ ++/* wrapfs super-block data in memory */ ++struct wrapfs_sb_info { ++ struct super_block *lower_sb; ++}; ++ ++/* ++ * inode to private data ++ * ++ * Since we use containers and the struct inode is _inside_ the ++ * wrapfs_inode_info structure, WRAPFS_I will always (given a non-NULL ++ * inode pointer), return a valid non-NULL pointer. ++ */ ++static inline struct wrapfs_inode_info *WRAPFS_I(const struct inode *inode) ++{ ++ return container_of(inode, struct wrapfs_inode_info, vfs_inode); ++} ++ ++/* dentry to private data */ ++#define WRAPFS_D(dent) ((struct wrapfs_dentry_info *)(dent)->d_fsdata) ++ ++/* superblock to private data */ ++#define WRAPFS_SB(super) ((struct wrapfs_sb_info *)(super)->s_fs_info) ++ ++/* file to private Data */ ++#define WRAPFS_F(file) ((struct wrapfs_file_info *)((file)->private_data)) ++ ++/* file to lower file */ ++static inline struct file *wrapfs_lower_file(const struct file *f) ++{ ++ return WRAPFS_F(f)->lower_file; ++} ++ ++static inline void wrapfs_set_lower_file(struct file *f, struct file *val) ++{ ++ WRAPFS_F(f)->lower_file = val; ++} ++ ++/* inode to lower inode. 
*/ ++static inline struct inode *wrapfs_lower_inode(const struct inode *i) ++{ ++ return WRAPFS_I(i)->lower_inode; ++} ++ ++static inline void wrapfs_set_lower_inode(struct inode *i, struct inode *val) ++{ ++ WRAPFS_I(i)->lower_inode = val; ++} ++ ++/* superblock to lower superblock */ ++static inline struct super_block *wrapfs_lower_super( ++ const struct super_block *sb) ++{ ++ return WRAPFS_SB(sb)->lower_sb; ++} ++ ++static inline void wrapfs_set_lower_super(struct super_block *sb, ++ struct super_block *val) ++{ ++ WRAPFS_SB(sb)->lower_sb = val; ++} ++ ++/* path based (dentry/mnt) macros */ ++static inline void pathcpy(struct path *dst, const struct path *src) ++{ ++ dst->dentry = src->dentry; ++ dst->mnt = src->mnt; ++} ++/* Returns struct path. Caller must path_put it. */ ++static inline void wrapfs_get_lower_path(const struct dentry *dent, ++ struct path *lower_path) ++{ ++ spin_lock(&WRAPFS_D(dent)->lock); ++ pathcpy(lower_path, &WRAPFS_D(dent)->lower_path); ++ path_get(lower_path); ++ spin_unlock(&WRAPFS_D(dent)->lock); ++ return; ++} ++static inline void wrapfs_put_lower_path(const struct dentry *dent, ++ struct path *lower_path) ++{ ++ path_put(lower_path); ++ return; ++} ++static inline void wrapfs_set_lower_path(const struct dentry *dent, ++ struct path *lower_path) ++{ ++ spin_lock(&WRAPFS_D(dent)->lock); ++ pathcpy(&WRAPFS_D(dent)->lower_path, lower_path); ++ spin_unlock(&WRAPFS_D(dent)->lock); ++ return; ++} ++static inline void wrapfs_reset_lower_path(const struct dentry *dent) ++{ ++ spin_lock(&WRAPFS_D(dent)->lock); ++ WRAPFS_D(dent)->lower_path.dentry = NULL; ++ WRAPFS_D(dent)->lower_path.mnt = NULL; ++ spin_unlock(&WRAPFS_D(dent)->lock); ++ return; ++} ++static inline void wrapfs_put_reset_lower_path(const struct dentry *dent) ++{ ++ struct path lower_path; ++ spin_lock(&WRAPFS_D(dent)->lock); ++ pathcpy(&lower_path, &WRAPFS_D(dent)->lower_path); ++ WRAPFS_D(dent)->lower_path.dentry = NULL; ++ WRAPFS_D(dent)->lower_path.mnt = NULL; ++ spin_unlock(&WRAPFS_D(dent)->lock); ++ path_put(&lower_path); ++ return; ++} ++ ++/* locking helpers */ ++static inline struct dentry *lock_parent(struct dentry *dentry) ++{ ++ struct dentry *dir = dget_parent(dentry); ++ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); ++ return dir; ++} ++ ++static inline void unlock_dir(struct dentry *dir) ++{ ++ mutex_unlock(&dir->d_inode->i_mutex); ++ dput(dir); ++} ++#endif /* not _WRAPFS_H_ */ +diff --git a/include/linux/magic.h b/include/linux/magic.h +index 2d4beab..8ef0170 100644 +--- a/include/linux/magic.h ++++ b/include/linux/magic.h +@@ -50,6 +50,8 @@ + #define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs" + #define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs" + ++#define WRAPFS_SUPER_MAGIC 0xb550ca10 ++ + #define SMB_SUPER_MAGIC 0x517B + #define USBDEVICE_SUPER_MAGIC 0x9fa2 + #define CGROUP_SUPER_MAGIC 0x27e0eb diff --git a/3.3.8/0001-AppArmor-compatibility-patch-for-v5-network-controll.patch b/3.3.8/0001-AppArmor-compatibility-patch-for-v5-network-controll.patch new file mode 100644 index 0000000..00c8712 --- /dev/null +++ b/3.3.8/0001-AppArmor-compatibility-patch-for-v5-network-controll.patch @@ -0,0 +1,553 @@ +From dc13dec93dbd04bfa7a9ba67df1b8ed3431d8d48 Mon Sep 17 00:00:00 2001 +From: John Johansen +Date: Wed, 10 Aug 2011 22:02:39 -0700 +Subject: [PATCH 1/3] AppArmor: compatibility patch for v5 network controll + +Add compatibility for v5 network rules. 
+ +Signed-off-by: John Johansen +--- + include/linux/lsm_audit.h | 4 + + security/apparmor/Makefile | 19 ++++- + security/apparmor/include/net.h | 40 +++++++++ + security/apparmor/include/policy.h | 3 + + security/apparmor/lsm.c | 112 +++++++++++++++++++++++ + security/apparmor/net.c | 170 ++++++++++++++++++++++++++++++++++++ + security/apparmor/policy.c | 1 + + security/apparmor/policy_unpack.c | 48 ++++++++++- + 8 files changed, 394 insertions(+), 3 deletions(-) + create mode 100644 security/apparmor/include/net.h + create mode 100644 security/apparmor/net.c + +diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h +index 88e78de..c63979a 100644 +--- a/include/linux/lsm_audit.h ++++ b/include/linux/lsm_audit.h +@@ -124,6 +124,10 @@ struct common_audit_data { + u32 denied; + uid_t ouid; + } fs; ++ struct { ++ int type, protocol; ++ struct sock *sk; ++ } net; + }; + } apparmor_audit_data; + #endif +diff --git a/security/apparmor/Makefile b/security/apparmor/Makefile +index 2dafe50..7cefef9 100644 +--- a/security/apparmor/Makefile ++++ b/security/apparmor/Makefile +@@ -4,9 +4,9 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor.o + + apparmor-y := apparmorfs.o audit.o capability.o context.o ipc.o lib.o match.o \ + path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \ +- resource.o sid.o file.o ++ resource.o sid.o file.o net.o + +-clean-files := capability_names.h rlim_names.h ++clean-files := capability_names.h rlim_names.h af_names.h + + + # Build a lower case string table of capability names +@@ -44,9 +44,24 @@ cmd_make-rlim = echo "static const char *rlim_names[] = {" > $@ ;\ + sed -r -n "s/^\# ?define[ \t]+(RLIMIT_[A-Z0-9_]+).*/\1,/p" $< >> $@ ;\ + echo "};" >> $@ + ++# Build a lower case string table of address family names. ++# Transform lines from ++# #define AF_INET 2 /* Internet IP Protocol */ ++# to ++# [2] = "inet", ++quiet_cmd_make-af = GEN $@ ++cmd_make-af = echo "static const char *address_family_names[] = {" > $@ ;\ ++ sed $< >> $@ -r -n -e "/AF_MAX/d" -e "/AF_LOCAL/d" -e \ ++ 's/^\#define[ \t]+AF_([A-Z0-9_]+)[ \t]+([0-9]+).*/[\2] = "\L\1",/p';\ ++ echo "};" >> $@ ++ ++ + $(obj)/capability.o : $(obj)/capability_names.h + $(obj)/resource.o : $(obj)/rlim_names.h ++$(obj)/net.o : $(obj)/af_names.h + $(obj)/capability_names.h : $(srctree)/include/linux/capability.h + $(call cmd,make-caps) + $(obj)/rlim_names.h : $(srctree)/include/asm-generic/resource.h + $(call cmd,make-rlim) ++$(obj)/af_names.h : $(srctree)/include/linux/socket.h ++ $(call cmd,make-af) +\ No newline at end of file +diff --git a/security/apparmor/include/net.h b/security/apparmor/include/net.h +new file mode 100644 +index 0000000..3c7d599 +--- /dev/null ++++ b/security/apparmor/include/net.h +@@ -0,0 +1,40 @@ ++/* ++ * AppArmor security module ++ * ++ * This file contains AppArmor network mediation definitions. ++ * ++ * Copyright (C) 1998-2008 Novell/SUSE ++ * Copyright 2009-2010 Canonical Ltd. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License as ++ * published by the Free Software Foundation, version 2 of the ++ * License. 
++ */ ++ ++#ifndef __AA_NET_H ++#define __AA_NET_H ++ ++#include ++ ++/* struct aa_net - network confinement data ++ * @allowed: basic network families permissions ++ * @audit_network: which network permissions to force audit ++ * @quiet_network: which network permissions to quiet rejects ++ */ ++struct aa_net { ++ u16 allow[AF_MAX]; ++ u16 audit[AF_MAX]; ++ u16 quiet[AF_MAX]; ++}; ++ ++extern int aa_net_perm(int op, struct aa_profile *profile, u16 family, ++ int type, int protocol, struct sock *sk); ++extern int aa_revalidate_sk(int op, struct sock *sk); ++ ++static inline void aa_free_net_rules(struct aa_net *new) ++{ ++ /* NOP */ ++} ++ ++#endif /* __AA_NET_H */ +diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h +index aeda5cf..6776929 100644 +--- a/security/apparmor/include/policy.h ++++ b/security/apparmor/include/policy.h +@@ -27,6 +27,7 @@ + #include "capability.h" + #include "domain.h" + #include "file.h" ++#include "net.h" + #include "resource.h" + + extern const char *profile_mode_names[]; +@@ -145,6 +146,7 @@ struct aa_namespace { + * @size: the memory consumed by this profiles rules + * @file: The set of rules governing basic file access and domain transitions + * @caps: capabilities for the profile ++ * @net: network controls for the profile + * @rlimits: rlimits for the profile + * + * The AppArmor profile contains the basic confinement data. Each profile +@@ -181,6 +183,7 @@ struct aa_profile { + + struct aa_file_rules file; + struct aa_caps caps; ++ struct aa_net net; + struct aa_rlimit rlimits; + }; + +diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c +index 3d2fd14..aa293ae 100644 +--- a/security/apparmor/lsm.c ++++ b/security/apparmor/lsm.c +@@ -32,6 +32,7 @@ + #include "include/context.h" + #include "include/file.h" + #include "include/ipc.h" ++#include "include/net.h" + #include "include/path.h" + #include "include/policy.h" + #include "include/procattr.h" +@@ -621,6 +622,104 @@ static int apparmor_task_setrlimit(struct task_struct *task, + return error; + } + ++static int apparmor_socket_create(int family, int type, int protocol, int kern) ++{ ++ struct aa_profile *profile; ++ int error = 0; ++ ++ if (kern) ++ return 0; ++ ++ profile = __aa_current_profile(); ++ if (!unconfined(profile)) ++ error = aa_net_perm(OP_CREATE, profile, family, type, protocol, ++ NULL); ++ return error; ++} ++ ++static int apparmor_socket_bind(struct socket *sock, ++ struct sockaddr *address, int addrlen) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_BIND, sk); ++} ++ ++static int apparmor_socket_connect(struct socket *sock, ++ struct sockaddr *address, int addrlen) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_CONNECT, sk); ++} ++ ++static int apparmor_socket_listen(struct socket *sock, int backlog) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_LISTEN, sk); ++} ++ ++static int apparmor_socket_accept(struct socket *sock, struct socket *newsock) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_ACCEPT, sk); ++} ++ ++static int apparmor_socket_sendmsg(struct socket *sock, ++ struct msghdr *msg, int size) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_SENDMSG, sk); ++} ++ ++static int apparmor_socket_recvmsg(struct socket *sock, ++ struct msghdr *msg, int size, int flags) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_RECVMSG, sk); ++} ++ ++static int apparmor_socket_getsockname(struct socket *sock) ++{ ++ struct sock *sk = 
sock->sk; ++ ++ return aa_revalidate_sk(OP_GETSOCKNAME, sk); ++} ++ ++static int apparmor_socket_getpeername(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_GETPEERNAME, sk); ++} ++ ++static int apparmor_socket_getsockopt(struct socket *sock, int level, ++ int optname) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_GETSOCKOPT, sk); ++} ++ ++static int apparmor_socket_setsockopt(struct socket *sock, int level, ++ int optname) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_SETSOCKOPT, sk); ++} ++ ++static int apparmor_socket_shutdown(struct socket *sock, int how) ++{ ++ struct sock *sk = sock->sk; ++ ++ return aa_revalidate_sk(OP_SOCK_SHUTDOWN, sk); ++} ++ + static struct security_operations apparmor_ops = { + .name = "apparmor", + +@@ -652,6 +751,19 @@ static struct security_operations apparmor_ops = { + .getprocattr = apparmor_getprocattr, + .setprocattr = apparmor_setprocattr, + ++ .socket_create = apparmor_socket_create, ++ .socket_bind = apparmor_socket_bind, ++ .socket_connect = apparmor_socket_connect, ++ .socket_listen = apparmor_socket_listen, ++ .socket_accept = apparmor_socket_accept, ++ .socket_sendmsg = apparmor_socket_sendmsg, ++ .socket_recvmsg = apparmor_socket_recvmsg, ++ .socket_getsockname = apparmor_socket_getsockname, ++ .socket_getpeername = apparmor_socket_getpeername, ++ .socket_getsockopt = apparmor_socket_getsockopt, ++ .socket_setsockopt = apparmor_socket_setsockopt, ++ .socket_shutdown = apparmor_socket_shutdown, ++ + .cred_alloc_blank = apparmor_cred_alloc_blank, + .cred_free = apparmor_cred_free, + .cred_prepare = apparmor_cred_prepare, +diff --git a/security/apparmor/net.c b/security/apparmor/net.c +new file mode 100644 +index 0000000..1765901 +--- /dev/null ++++ b/security/apparmor/net.c +@@ -0,0 +1,170 @@ ++/* ++ * AppArmor security module ++ * ++ * This file contains AppArmor network mediation ++ * ++ * Copyright (C) 1998-2008 Novell/SUSE ++ * Copyright 2009-2010 Canonical Ltd. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License as ++ * published by the Free Software Foundation, version 2 of the ++ * License. 
++ */ ++ ++#include "include/apparmor.h" ++#include "include/audit.h" ++#include "include/context.h" ++#include "include/net.h" ++#include "include/policy.h" ++ ++#include "af_names.h" ++ ++static const char *sock_type_names[] = { ++ "unknown(0)", ++ "stream", ++ "dgram", ++ "raw", ++ "rdm", ++ "seqpacket", ++ "dccp", ++ "unknown(7)", ++ "unknown(8)", ++ "unknown(9)", ++ "packet", ++}; ++ ++/* audit callback for net specific fields */ ++static void audit_cb(struct audit_buffer *ab, void *va) ++{ ++ struct common_audit_data *sa = va; ++ ++ audit_log_format(ab, " family="); ++ if (address_family_names[sa->u.net.family]) { ++ audit_log_string(ab, address_family_names[sa->u.net.family]); ++ } else { ++ audit_log_format(ab, " \"unknown(%d)\"", sa->u.net.family); ++ } ++ ++ audit_log_format(ab, " sock_type="); ++ if (sock_type_names[sa->aad.net.type]) { ++ audit_log_string(ab, sock_type_names[sa->aad.net.type]); ++ } else { ++ audit_log_format(ab, "\"unknown(%d)\"", sa->aad.net.type); ++ } ++ ++ audit_log_format(ab, " protocol=%d", sa->aad.net.protocol); ++} ++ ++/** ++ * audit_net - audit network access ++ * @profile: profile being enforced (NOT NULL) ++ * @op: operation being checked ++ * @family: network family ++ * @type: network type ++ * @protocol: network protocol ++ * @sk: socket auditing is being applied to ++ * @error: error code for failure else 0 ++ * ++ * Returns: %0 or sa->error else other errorcode on failure ++ */ ++static int audit_net(struct aa_profile *profile, int op, u16 family, int type, ++ int protocol, struct sock *sk, int error) ++{ ++ int audit_type = AUDIT_APPARMOR_AUTO; ++ struct common_audit_data sa; ++ if (sk) { ++ COMMON_AUDIT_DATA_INIT(&sa, NET); ++ } else { ++ COMMON_AUDIT_DATA_INIT(&sa, NONE); ++ } ++ /* todo fill in socket addr info */ ++ ++ sa.aad.op = op, ++ sa.u.net.family = family; ++ sa.u.net.sk = sk; ++ sa.aad.net.type = type; ++ sa.aad.net.protocol = protocol; ++ sa.aad.error = error; ++ ++ if (likely(!sa.aad.error)) { ++ u16 audit_mask = profile->net.audit[sa.u.net.family]; ++ if (likely((AUDIT_MODE(profile) != AUDIT_ALL) && ++ !(1 << sa.aad.net.type & audit_mask))) ++ return 0; ++ audit_type = AUDIT_APPARMOR_AUDIT; ++ } else { ++ u16 quiet_mask = profile->net.quiet[sa.u.net.family]; ++ u16 kill_mask = 0; ++ u16 denied = (1 << sa.aad.net.type) & ~quiet_mask; ++ ++ if (denied & kill_mask) ++ audit_type = AUDIT_APPARMOR_KILL; ++ ++ if ((denied & quiet_mask) && ++ AUDIT_MODE(profile) != AUDIT_NOQUIET && ++ AUDIT_MODE(profile) != AUDIT_ALL) ++ return COMPLAIN_MODE(profile) ? 0 : sa.aad.error; ++ } ++ ++ return aa_audit(audit_type, profile, GFP_KERNEL, &sa, audit_cb); ++} ++ ++/** ++ * aa_net_perm - very course network access check ++ * @op: operation being checked ++ * @profile: profile being enforced (NOT NULL) ++ * @family: network family ++ * @type: network type ++ * @protocol: network protocol ++ * ++ * Returns: %0 else error if permission denied ++ */ ++int aa_net_perm(int op, struct aa_profile *profile, u16 family, int type, ++ int protocol, struct sock *sk) ++{ ++ u16 family_mask; ++ int error; ++ ++ if ((family < 0) || (family >= AF_MAX)) ++ return -EINVAL; ++ ++ if ((type < 0) || (type >= SOCK_MAX)) ++ return -EINVAL; ++ ++ /* unix domain and netlink sockets are handled by ipc */ ++ if (family == AF_UNIX || family == AF_NETLINK) ++ return 0; ++ ++ family_mask = profile->net.allow[family]; ++ ++ error = (family_mask & (1 << type)) ? 
0 : -EACCES; ++ ++ return audit_net(profile, op, family, type, protocol, sk, error); ++} ++ ++/** ++ * aa_revalidate_sk - Revalidate access to a sock ++ * @op: operation being checked ++ * @sk: sock being revalidated (NOT NULL) ++ * ++ * Returns: %0 else error if permission denied ++ */ ++int aa_revalidate_sk(int op, struct sock *sk) ++{ ++ struct aa_profile *profile; ++ int error = 0; ++ ++ /* aa_revalidate_sk should not be called from interrupt context ++ * don't mediate these calls as they are not task related ++ */ ++ if (in_interrupt()) ++ return 0; ++ ++ profile = __aa_current_profile(); ++ if (!unconfined(profile)) ++ error = aa_net_perm(op, profile, sk->sk_family, sk->sk_type, ++ sk->sk_protocol, sk); ++ ++ return error; ++} +diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c +index 4f0eade..4d5ce13 100644 +--- a/security/apparmor/policy.c ++++ b/security/apparmor/policy.c +@@ -745,6 +745,7 @@ static void free_profile(struct aa_profile *profile) + + aa_free_file_rules(&profile->file); + aa_free_cap_rules(&profile->caps); ++ aa_free_net_rules(&profile->net); + aa_free_rlimit_rules(&profile->rlimits); + + aa_free_sid(profile->sid); +diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c +index d6d9a57..f4874c4 100644 +--- a/security/apparmor/policy_unpack.c ++++ b/security/apparmor/policy_unpack.c +@@ -190,6 +190,19 @@ fail: + return 0; + } + ++static bool unpack_u16(struct aa_ext *e, u16 *data, const char *name) ++{ ++ if (unpack_nameX(e, AA_U16, name)) { ++ if (!inbounds(e, sizeof(u16))) ++ return 0; ++ if (data) ++ *data = le16_to_cpu(get_unaligned((u16 *) e->pos)); ++ e->pos += sizeof(u16); ++ return 1; ++ } ++ return 0; ++} ++ + static bool unpack_u32(struct aa_ext *e, u32 *data, const char *name) + { + if (unpack_nameX(e, AA_U32, name)) { +@@ -468,7 +481,8 @@ static struct aa_profile *unpack_profile(struct aa_ext *e) + { + struct aa_profile *profile = NULL; + const char *name = NULL; +- int error = -EPROTO; ++ size_t size = 0; ++ int i, error = -EPROTO; + kernel_cap_t tmpcap; + u32 tmp; + +@@ -559,6 +573,38 @@ static struct aa_profile *unpack_profile(struct aa_ext *e) + if (!unpack_rlimits(e, profile)) + goto fail; + ++ size = unpack_array(e, "net_allowed_af"); ++ if (size) { ++ ++ for (i = 0; i < size; i++) { ++ /* discard extraneous rules that this kernel will ++ * never request ++ */ ++ if (i >= AF_MAX) { ++ u16 tmp; ++ if (!unpack_u16(e, &tmp, NULL) || ++ !unpack_u16(e, &tmp, NULL) || ++ !unpack_u16(e, &tmp, NULL)) ++ goto fail; ++ continue; ++ } ++ if (!unpack_u16(e, &profile->net.allow[i], NULL)) ++ goto fail; ++ if (!unpack_u16(e, &profile->net.audit[i], NULL)) ++ goto fail; ++ if (!unpack_u16(e, &profile->net.quiet[i], NULL)) ++ goto fail; ++ } ++ if (!unpack_nameX(e, AA_ARRAYEND, NULL)) ++ goto fail; ++ /* ++ * allow unix domain and netlink sockets they are handled ++ * by IPC ++ */ ++ } ++ profile->net.allow[AF_UNIX] = 0xffff; ++ profile->net.allow[AF_NETLINK] = 0xffff; ++ + /* get file rules */ + profile->file.dfa = unpack_dfa(e); + if (IS_ERR(profile->file.dfa)) { +-- +1.7.5.4 + diff --git a/3.3.8/0001-block-cgroups-kconfig-build-bits-for-BFQ-v5-3.3.patch b/3.3.8/0001-block-cgroups-kconfig-build-bits-for-BFQ-v5-3.3.patch new file mode 100644 index 0000000..0ea5a23 --- /dev/null +++ b/3.3.8/0001-block-cgroups-kconfig-build-bits-for-BFQ-v5-3.3.patch @@ -0,0 +1,99 @@ +From 2ddeb856c43f60139fc5c8e2ed9bc350b9bed590 Mon Sep 17 00:00:00 2001 +From: Arianna Avanzini +Date: Sat, 4 Feb 2012 10:55:51 +0100 +Subject: [PATCH 1/2] 
block: cgroups, kconfig, build bits for BFQ-v5-3.3 + +Update Kconfig.iosched to include kernel configuration options +for BFQ. Add a Kconfig option and do the related Makefile changes +to compile the scheduler. Also let the cgroups subsystem know about +the BFQ I/O controller. + +Signed-off-by: Fabio Checconi +Signed-off-by: Paolo Valente +Signed-off-by: Arianna Avanzini +--- + block/Kconfig.iosched | 26 ++++++++++++++++++++++++++ + block/Makefile | 1 + + include/linux/cgroup_subsys.h | 6 ++++++ + 3 files changed, 33 insertions(+) + +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index 3199b76..5905452 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -43,6 +43,28 @@ config CFQ_GROUP_IOSCHED + ---help--- + Enable group IO scheduling in CFQ. + ++config IOSCHED_BFQ ++ tristate "BFQ I/O scheduler" ++ depends on EXPERIMENTAL ++ default n ++ ---help--- ++ The BFQ I/O scheduler tries to distribute bandwidth among ++ all processes according to their weights. ++ It aims at distributing the bandwidth as desired, independently of ++ the disk parameters and with any workload. It also tries to ++ guarantee low latency to interactive and soft real-time ++ applications. If compiled built-in (saying Y here), BFQ can ++ be configured to support hierarchical scheduling. ++ ++config CGROUP_BFQIO ++ bool "BFQ hierarchical scheduling support" ++ depends on CGROUPS && IOSCHED_BFQ=y ++ default n ++ ---help--- ++ Enable hierarchical scheduling in BFQ, using the cgroups ++ filesystem interface. The name of the subsystem will be ++ bfqio. ++ + choice + prompt "Default I/O scheduler" + default DEFAULT_CFQ +@@ -56,6 +78,9 @@ choice + config DEFAULT_CFQ + bool "CFQ" if IOSCHED_CFQ=y + ++ config DEFAULT_BFQ ++ bool "BFQ" if IOSCHED_BFQ=y ++ + config DEFAULT_NOOP + bool "No-op" + +@@ -65,6 +90,7 @@ config DEFAULT_IOSCHED + string + default "deadline" if DEFAULT_DEADLINE + default "cfq" if DEFAULT_CFQ ++ default "bfq" if DEFAULT_BFQ + default "noop" if DEFAULT_NOOP + + endmenu +diff --git a/block/Makefile b/block/Makefile +index 39b76ba..c0d20fa 100644 +--- a/block/Makefile ++++ b/block/Makefile +@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o + + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h +index 0bd390c..cbf22b1 100644 +--- a/include/linux/cgroup_subsys.h ++++ b/include/linux/cgroup_subsys.h +@@ -72,3 +72,9 @@ SUBSYS(net_prio) + #endif + + /* */ ++ ++#ifdef CONFIG_CGROUP_BFQIO ++SUBSYS(bfqio) ++#endif ++ ++/* */ +-- +1.7.10.4 + diff --git a/3.3.8/0002-AppArmor-compatibility-patch-for-v5-interface.patch b/3.3.8/0002-AppArmor-compatibility-patch-for-v5-interface.patch new file mode 100644 index 0000000..10d4640 --- /dev/null +++ b/3.3.8/0002-AppArmor-compatibility-patch-for-v5-interface.patch @@ -0,0 +1,391 @@ +From a2515f25ad5a7833ddc5a032d34eee6a5ddee3a2 Mon Sep 17 00:00:00 2001 +From: John Johansen +Date: Wed, 10 Aug 2011 22:02:40 -0700 +Subject: [PATCH 2/3] AppArmor: compatibility patch for v5 interface + +Signed-off-by: John Johansen +--- + security/apparmor/Kconfig | 9 + + security/apparmor/Makefile | 1 + + security/apparmor/apparmorfs-24.c | 287 ++++++++++++++++++++++++++++++++ + security/apparmor/apparmorfs.c | 18 ++- + 
security/apparmor/include/apparmorfs.h | 6 + + 5 files changed, 319 insertions(+), 2 deletions(-) + create mode 100644 security/apparmor/apparmorfs-24.c + +diff --git a/security/apparmor/Kconfig b/security/apparmor/Kconfig +index 9b9013b..51ebf96 100644 +--- a/security/apparmor/Kconfig ++++ b/security/apparmor/Kconfig +@@ -29,3 +29,12 @@ config SECURITY_APPARMOR_BOOTPARAM_VALUE + boot. + + If you are unsure how to answer this question, answer 1. ++ ++config SECURITY_APPARMOR_COMPAT_24 ++ bool "Enable AppArmor 2.4 compatability" ++ depends on SECURITY_APPARMOR ++ default y ++ help ++ This option enables compatability with AppArmor 2.4. It is ++ recommended if compatability with older versions of AppArmor ++ is desired. +diff --git a/security/apparmor/Makefile b/security/apparmor/Makefile +index 7cefef9..0bb604b 100644 +--- a/security/apparmor/Makefile ++++ b/security/apparmor/Makefile +@@ -5,6 +5,7 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor.o + apparmor-y := apparmorfs.o audit.o capability.o context.o ipc.o lib.o match.o \ + path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \ + resource.o sid.o file.o net.o ++apparmor-$(CONFIG_SECURITY_APPARMOR_COMPAT_24) += apparmorfs-24.o + + clean-files := capability_names.h rlim_names.h af_names.h + +diff --git a/security/apparmor/apparmorfs-24.c b/security/apparmor/apparmorfs-24.c +new file mode 100644 +index 0000000..dc8c744 +--- /dev/null ++++ b/security/apparmor/apparmorfs-24.c +@@ -0,0 +1,287 @@ ++/* ++ * AppArmor security module ++ * ++ * This file contains AppArmor /sys/kernel/secrutiy/apparmor interface functions ++ * ++ * Copyright (C) 1998-2008 Novell/SUSE ++ * Copyright 2009-2010 Canonical Ltd. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License as ++ * published by the Free Software Foundation, version 2 of the ++ * License. ++ * ++ * ++ * This file contain functions providing an interface for <= AppArmor 2.4 ++ * compatibility. It is dependent on CONFIG_SECURITY_APPARMOR_COMPAT_24 ++ * being set (see Makefile). ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "include/apparmor.h" ++#include "include/audit.h" ++#include "include/context.h" ++#include "include/policy.h" ++ ++ ++/* apparmor/matching */ ++static ssize_t aa_matching_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ const char matching[] = "pattern=aadfa audit perms=crwxamlk/ " ++ "user::other"; ++ ++ return simple_read_from_buffer(buf, size, ppos, matching, ++ sizeof(matching) - 1); ++} ++ ++const struct file_operations aa_fs_matching_fops = { ++ .read = aa_matching_read, ++}; ++ ++/* apparmor/features */ ++static ssize_t aa_features_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ const char features[] = "file=3.1 capability=2.0 network=1.0 " ++ "change_hat=1.5 change_profile=1.1 " "aanamespaces=1.1 rlimit=1.1"; ++ ++ return simple_read_from_buffer(buf, size, ppos, features, ++ sizeof(features) - 1); ++} ++ ++const struct file_operations aa_fs_features_fops = { ++ .read = aa_features_read, ++}; ++ ++/** ++ * __next_namespace - find the next namespace to list ++ * @root: root namespace to stop search at (NOT NULL) ++ * @ns: current ns position (NOT NULL) ++ * ++ * Find the next namespace from @ns under @root and handle all locking needed ++ * while switching current namespace. 
++ * ++ * Returns: next namespace or NULL if at last namespace under @root ++ * NOTE: will not unlock root->lock ++ */ ++static struct aa_namespace *__next_namespace(struct aa_namespace *root, ++ struct aa_namespace *ns) ++{ ++ struct aa_namespace *parent; ++ ++ /* is next namespace a child */ ++ if (!list_empty(&ns->sub_ns)) { ++ struct aa_namespace *next; ++ next = list_first_entry(&ns->sub_ns, typeof(*ns), base.list); ++ read_lock(&next->lock); ++ return next; ++ } ++ ++ /* check if the next ns is a sibling, parent, gp, .. */ ++ parent = ns->parent; ++ while (parent) { ++ read_unlock(&ns->lock); ++ list_for_each_entry_continue(ns, &parent->sub_ns, base.list) { ++ read_lock(&ns->lock); ++ return ns; ++ } ++ if (parent == root) ++ return NULL; ++ ns = parent; ++ parent = parent->parent; ++ } ++ ++ return NULL; ++} ++ ++/** ++ * __first_profile - find the first profile in a namespace ++ * @root: namespace that is root of profiles being displayed (NOT NULL) ++ * @ns: namespace to start in (NOT NULL) ++ * ++ * Returns: unrefcounted profile or NULL if no profile ++ */ ++static struct aa_profile *__first_profile(struct aa_namespace *root, ++ struct aa_namespace *ns) ++{ ++ for ( ; ns; ns = __next_namespace(root, ns)) { ++ if (!list_empty(&ns->base.profiles)) ++ return list_first_entry(&ns->base.profiles, ++ struct aa_profile, base.list); ++ } ++ return NULL; ++} ++ ++/** ++ * __next_profile - step to the next profile in a profile tree ++ * @profile: current profile in tree (NOT NULL) ++ * ++ * Perform a depth first taversal on the profile tree in a namespace ++ * ++ * Returns: next profile or NULL if done ++ * Requires: profile->ns.lock to be held ++ */ ++static struct aa_profile *__next_profile(struct aa_profile *p) ++{ ++ struct aa_profile *parent; ++ struct aa_namespace *ns = p->ns; ++ ++ /* is next profile a child */ ++ if (!list_empty(&p->base.profiles)) ++ return list_first_entry(&p->base.profiles, typeof(*p), ++ base.list); ++ ++ /* is next profile a sibling, parent sibling, gp, subling, .. 
*/ ++ parent = p->parent; ++ while (parent) { ++ list_for_each_entry_continue(p, &parent->base.profiles, ++ base.list) ++ return p; ++ p = parent; ++ parent = parent->parent; ++ } ++ ++ /* is next another profile in the namespace */ ++ list_for_each_entry_continue(p, &ns->base.profiles, base.list) ++ return p; ++ ++ return NULL; ++} ++ ++/** ++ * next_profile - step to the next profile in where ever it may be ++ * @root: root namespace (NOT NULL) ++ * @profile: current profile (NOT NULL) ++ * ++ * Returns: next profile or NULL if there isn't one ++ */ ++static struct aa_profile *next_profile(struct aa_namespace *root, ++ struct aa_profile *profile) ++{ ++ struct aa_profile *next = __next_profile(profile); ++ if (next) ++ return next; ++ ++ /* finished all profiles in namespace move to next namespace */ ++ return __first_profile(root, __next_namespace(root, profile->ns)); ++} ++ ++/** ++ * p_start - start a depth first traversal of profile tree ++ * @f: seq_file to fill ++ * @pos: current position ++ * ++ * Returns: first profile under current namespace or NULL if none found ++ * ++ * acquires first ns->lock ++ */ ++static void *p_start(struct seq_file *f, loff_t *pos) ++ __acquires(root->lock) ++{ ++ struct aa_profile *profile = NULL; ++ struct aa_namespace *root = aa_current_profile()->ns; ++ loff_t l = *pos; ++ f->private = aa_get_namespace(root); ++ ++ ++ /* find the first profile */ ++ read_lock(&root->lock); ++ profile = __first_profile(root, root); ++ ++ /* skip to position */ ++ for (; profile && l > 0; l--) ++ profile = next_profile(root, profile); ++ ++ return profile; ++} ++ ++/** ++ * p_next - read the next profile entry ++ * @f: seq_file to fill ++ * @p: profile previously returned ++ * @pos: current position ++ * ++ * Returns: next profile after @p or NULL if none ++ * ++ * may acquire/release locks in namespace tree as necessary ++ */ ++static void *p_next(struct seq_file *f, void *p, loff_t *pos) ++{ ++ struct aa_profile *profile = p; ++ struct aa_namespace *root = f->private; ++ (*pos)++; ++ ++ return next_profile(root, profile); ++} ++ ++/** ++ * p_stop - stop depth first traversal ++ * @f: seq_file we are filling ++ * @p: the last profile writen ++ * ++ * Release all locking done by p_start/p_next on namespace tree ++ */ ++static void p_stop(struct seq_file *f, void *p) ++ __releases(root->lock) ++{ ++ struct aa_profile *profile = p; ++ struct aa_namespace *root = f->private, *ns; ++ ++ if (profile) { ++ for (ns = profile->ns; ns && ns != root; ns = ns->parent) ++ read_unlock(&ns->lock); ++ } ++ read_unlock(&root->lock); ++ aa_put_namespace(root); ++} ++ ++/** ++ * seq_show_profile - show a profile entry ++ * @f: seq_file to file ++ * @p: current position (profile) (NOT NULL) ++ * ++ * Returns: error on failure ++ */ ++static int seq_show_profile(struct seq_file *f, void *p) ++{ ++ struct aa_profile *profile = (struct aa_profile *)p; ++ struct aa_namespace *root = f->private; ++ ++ if (profile->ns != root) ++ seq_printf(f, ":%s://", aa_ns_name(root, profile->ns)); ++ seq_printf(f, "%s (%s)\n", profile->base.hname, ++ COMPLAIN_MODE(profile) ? 
"complain" : "enforce"); ++ ++ return 0; ++} ++ ++static const struct seq_operations aa_fs_profiles_op = { ++ .start = p_start, ++ .next = p_next, ++ .stop = p_stop, ++ .show = seq_show_profile, ++}; ++ ++static int profiles_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &aa_fs_profiles_op); ++} ++ ++static int profiles_release(struct inode *inode, struct file *file) ++{ ++ return seq_release(inode, file); ++} ++ ++const struct file_operations aa_fs_profiles_fops = { ++ .open = profiles_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = profiles_release, ++}; +diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c +index 0848292..28c52ac 100644 +--- a/security/apparmor/apparmorfs.c ++++ b/security/apparmor/apparmorfs.c +@@ -187,7 +187,11 @@ void __init aa_destroy_aafs(void) + aafs_remove(".remove"); + aafs_remove(".replace"); + aafs_remove(".load"); +- ++#ifdef CONFIG_SECURITY_APPARMOR_COMPAT_24 ++ aafs_remove("profiles"); ++ aafs_remove("matching"); ++ aafs_remove("features"); ++#endif + securityfs_remove(aa_fs_dentry); + aa_fs_dentry = NULL; + } +@@ -218,7 +222,17 @@ int __init aa_create_aafs(void) + aa_fs_dentry = NULL; + goto error; + } +- ++#ifdef CONFIG_SECURITY_APPARMOR_COMPAT_24 ++ error = aafs_create("matching", 0444, &aa_fs_matching_fops); ++ if (error) ++ goto error; ++ error = aafs_create("features", 0444, &aa_fs_features_fops); ++ if (error) ++ goto error; ++#endif ++ error = aafs_create("profiles", 0440, &aa_fs_profiles_fops); ++ if (error) ++ goto error; + error = aafs_create(".load", 0640, &aa_fs_profile_load); + if (error) + goto error; +diff --git a/security/apparmor/include/apparmorfs.h b/security/apparmor/include/apparmorfs.h +index cb1e93a..14f955c 100644 +--- a/security/apparmor/include/apparmorfs.h ++++ b/security/apparmor/include/apparmorfs.h +@@ -17,4 +17,10 @@ + + extern void __init aa_destroy_aafs(void); + ++#ifdef CONFIG_SECURITY_APPARMOR_COMPAT_24 ++extern const struct file_operations aa_fs_matching_fops; ++extern const struct file_operations aa_fs_features_fops; ++extern const struct file_operations aa_fs_profiles_fops; ++#endif ++ + #endif /* __AA_APPARMORFS_H */ +-- +1.7.5.4 + diff --git a/3.3.8/0002-block-introduce-the-BFQ-v5-I-O-sched-for-3.3.patch b/3.3.8/0002-block-introduce-the-BFQ-v5-I-O-sched-for-3.3.patch new file mode 100644 index 0000000..0f6b038 --- /dev/null +++ b/3.3.8/0002-block-introduce-the-BFQ-v5-I-O-sched-for-3.3.patch @@ -0,0 +1,5624 @@ +From bb6938c4f08c3c5a537175c887d5cc4e14e804a2 Mon Sep 17 00:00:00 2001 +From: Arianna Avanzini +Date: Sun, 5 Feb 2012 01:04:27 +0100 +Subject: [PATCH 2/2] block: introduce the BFQ-v5 I/O sched for 3.3 + +Add the BFQ-v5 I/O scheduler to 3.3. +The general structure is borrowed from CFQ, as much code. A (bfq_)queue is +associated to each task doing I/O on a device, and each time a scheduling +decision has to be taken a queue is selected and it is served until it expires. + + - Slices are given in the service domain: tasks are assigned budgets, + measured in number of sectors. Once got the disk, a task must + however consume its assigned budget within a configurable maximum time + (by default, the maximum possible value of the budgets is automatically + computed to comply with this timeout). This allows the desired latency + vs "throughput boosting" tradeoff to be set. 
+ + - Budgets are scheduled according to a variant of WF2Q+, implemented + using an augmented rb-tree to take eligibility into account while + preserving an O(log N) overall complexity. + + - A low-latency tunable is provided; if enabled, both interactive and soft + real-time applications are guaranteed very low latency. + + - Latency guarantees are preserved also in presence of NCQ. + + - High throughput with flash-based devices, while still preserving + latency guarantees. + + - Useful features borrowed from CFQ: cooperating-queues merging (with + some additional optimizations with respect to the original CFQ version), + static fallback queue for OOM. + + - BFQ supports full hierarchical scheduling, exporting a cgroups + interface. Each node has a full scheduler, so each group can + be assigned its own ioprio and an ioprio_class. + + - If the cgroups interface is used, weights can be explictly assigned, + otherwise ioprio values are mapped to weights using the relation + weight = IOPRIO_BE_NR - ioprio. + + - ioprio classes are served in strict priority order, i.e., lower + priority queues are not served as long as there are higher priority + queues. Among queues in the same class the bandwidth is distributed + in proportion to the weights of each queue. A very thin extra bandwidth + is however guaranteed to the Idle class, to prevent it from starving. + +Signed-off-by: Paolo Valente +Signed-off-by: Arianna Avanzini +--- + block/bfq-cgroup.c | 841 +++++++++++++++ + block/bfq-ioc.c | 34 + + block/bfq-iosched.c | 2985 +++++++++++++++++++++++++++++++++++++++++++++++++++ + block/bfq-sched.c | 1070 ++++++++++++++++++ + block/bfq.h | 599 +++++++++++ + 5 files changed, 5529 insertions(+) + create mode 100644 block/bfq-cgroup.c + create mode 100644 block/bfq-ioc.c + create mode 100644 block/bfq-iosched.c + create mode 100644 block/bfq-sched.c + create mode 100644 block/bfq.h + +diff --git a/block/bfq-cgroup.c block/bfq-cgroup.c +new file mode 100644 +index 0000000..6bb907d +--- /dev/null ++++ block/bfq-cgroup.c +@@ -0,0 +1,841 @@ ++/* ++ * BFQ: CGROUPS support. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. ++ */ ++ ++#ifdef CONFIG_CGROUP_BFQIO ++static struct bfqio_cgroup bfqio_root_cgroup = { ++ .weight = BFQ_DEFAULT_GRP_WEIGHT, ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO, ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS, ++}; ++ ++static inline void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = entity->new_ioprio; ++ entity->ioprio_class = entity->new_ioprio_class; ++ entity->parent = bfqg->my_entity; ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup) ++{ ++ return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id), ++ struct bfqio_cgroup, css); ++} ++ ++/* ++ * Search the bfq_group for bfqd into the hash table (by now only a list) ++ * of bgrp. Must be called under rcu_read_lock(). 
++ */ ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, ++ struct bfq_data *bfqd) ++{ ++ struct bfq_group *bfqg; ++ struct hlist_node *n; ++ void *key; ++ ++ hlist_for_each_entry_rcu(bfqg, n, &bgrp->group_data, group_node) { ++ key = rcu_dereference(bfqg->bfqd); ++ if (key == bfqd) ++ return bfqg; ++ } ++ ++ return NULL; ++} ++ ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_entity *entity = &bfqg->entity; ++ ++ entity->weight = entity->new_weight = bgrp->weight; ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = entity->new_ioprio = bgrp->ioprio; ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; ++ entity->ioprio_changed = 1; ++ entity->my_sched_data = &bfqg->sched_data; ++} ++ ++static inline void bfq_group_set_parent(struct bfq_group *bfqg, ++ struct bfq_group *parent) ++{ ++ struct bfq_entity *entity; ++ ++ BUG_ON(parent == NULL); ++ BUG_ON(bfqg == NULL); ++ ++ entity = &bfqg->entity; ++ entity->parent = parent->my_entity; ++ entity->sched_data = &parent->sched_data; ++} ++ ++/** ++ * bfq_group_chain_alloc - allocate a chain of groups. ++ * @bfqd: queue descriptor. ++ * @cgroup: the leaf cgroup this chain starts from. ++ * ++ * Allocate a chain of groups starting from the one belonging to ++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain ++ * to the root has already an allocated group on @bfqd. ++ */ ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, ++ struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp; ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; ++ ++ for (; cgroup != NULL; cgroup = cgroup->parent) { ++ bgrp = cgroup_to_bfqio(cgroup); ++ ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ if (bfqg != NULL) { ++ /* ++ * All the cgroups in the path from there to the ++ * root must have a bfq_group for bfqd, so we don't ++ * need any more allocations. ++ */ ++ break; ++ } ++ ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); ++ if (bfqg == NULL) ++ goto cleanup; ++ ++ bfq_group_init_entity(bgrp, bfqg); ++ bfqg->my_entity = &bfqg->entity; ++ ++ if (leaf == NULL) { ++ leaf = bfqg; ++ prev = leaf; ++ } else { ++ bfq_group_set_parent(prev, bfqg); ++ /* ++ * Build a list of allocated nodes using the bfqd ++ * filed, that is still unused and will be initialized ++ * only after the node will be connected. ++ */ ++ prev->bfqd = bfqg; ++ prev = bfqg; ++ } ++ } ++ ++ return leaf; ++ ++cleanup: ++ while (leaf != NULL) { ++ prev = leaf; ++ leaf = leaf->bfqd; ++ kfree(prev); ++ } ++ ++ return NULL; ++} ++ ++/** ++ * bfq_group_chain_link - link an allocatd group chain to a cgroup hierarchy. ++ * @bfqd: the queue descriptor. ++ * @cgroup: the leaf cgroup to start from. ++ * @leaf: the leaf group (to be associated to @cgroup). ++ * ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the ++ * hierarchy that already as a group associated to @bfqd all the nodes ++ * in the path to the root cgroup have one too. ++ * ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy ++ * per device) while the bfqio_cgroup lock protects the list of groups ++ * belonging to the same cgroup. 
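++ *
++ * Each node is published in its bfqio_cgroup's group_data list and in
++ * bfqd->group_list while holding bgrp->lock; the walk then moves up to
++ * the parent cgroup, following the temporary chain that
++ * bfq_group_chain_alloc() threaded through the not-yet-valid bfqd
++ * pointers of the new nodes.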
++ */ ++static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup, ++ struct bfq_group *leaf) ++{ ++ struct bfqio_cgroup *bgrp; ++ struct bfq_group *bfqg, *next, *prev = NULL; ++ unsigned long flags; ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ ++ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) { ++ bgrp = cgroup_to_bfqio(cgroup); ++ next = leaf->bfqd; ++ ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ BUG_ON(bfqg != NULL); ++ ++ spin_lock_irqsave(&bgrp->lock, flags); ++ ++ rcu_assign_pointer(leaf->bfqd, bfqd); ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); ++ ++ spin_unlock_irqrestore(&bgrp->lock, flags); ++ ++ prev = leaf; ++ leaf = next; ++ } ++ ++ BUG_ON(cgroup == NULL && leaf != NULL); ++ if (cgroup != NULL && prev != NULL) { ++ bgrp = cgroup_to_bfqio(cgroup); ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ bfq_group_set_parent(prev, bfqg); ++ } ++} ++ ++/** ++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. ++ * @bfqd: queue descriptor. ++ * @cgroup: cgroup being searched for. ++ * ++ * Return a group associated to @bfqd in @cgroup, allocating one if ++ * necessary. When a group is returned all the cgroups in the path ++ * to the root have a group associated to @bfqd. ++ * ++ * If the allocation fails, return the root group: this breaks guarantees ++ * but is a safe fallbak. If this loss becames a problem it can be ++ * mitigated using the equivalent weight (given by the product of the ++ * weights of the groups in the path from @group to the root) in the ++ * root scheduler. ++ * ++ * We allocate all the missing nodes in the path from the leaf cgroup ++ * to the root and we connect the nodes only after all the allocations ++ * have been successful. ++ */ ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, ++ struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); ++ struct bfq_group *bfqg; ++ ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ if (bfqg != NULL) ++ return bfqg; ++ ++ bfqg = bfq_group_chain_alloc(bfqd, cgroup); ++ if (bfqg != NULL) ++ bfq_group_chain_link(bfqd, cgroup, bfqg); ++ else ++ bfqg = bfqd->root_group; ++ ++ return bfqg; ++} ++ ++/** ++ * bfq_bfqq_move - migrate @bfqq to @bfqg. ++ * @bfqd: queue descriptor. ++ * @bfqq: the queue to move. ++ * @entity: @bfqq's entity. ++ * @bfqg: the group to move to. ++ * ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating ++ * it on the new one. Avoid putting the entity on the old group idle tree. ++ * ++ * Must be called under the queue lock; the cgroup owning @bfqg must ++ * not disappear (by now this just means that we are called under ++ * rcu_read_lock()). ++ */ ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_entity *entity, struct bfq_group *bfqg) ++{ ++ int busy, resume; ++ ++ busy = bfq_bfqq_busy(bfqq); ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list); ++ ++ BUG_ON(resume && !entity->on_st); ++ BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue); ++ ++ if (busy) { ++ BUG_ON(atomic_read(&bfqq->ref) < 2); ++ ++ if (!resume) ++ bfq_del_bfqq_busy(bfqd, bfqq, 0); ++ else ++ bfq_deactivate_bfqq(bfqd, bfqq, 0); ++ } else if (entity->on_st) ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); ++ ++ /* ++ * Here we use a reference to bfqg. 
We don't need a refcounter ++ * as the cgroup reference will not be dropped, so that its ++ * destroy() callback will not be invoked. ++ */ ++ entity->parent = bfqg->my_entity; ++ entity->sched_data = &bfqg->sched_data; ++ ++ if (busy && resume) ++ bfq_activate_bfqq(bfqd, bfqq); ++} ++ ++/** ++ * __bfq_bic_change_cgroup - move @bic to @cgroup. ++ * @bfqd: the queue descriptor. ++ * @bic: the bic to move. ++ * @cgroup: the cgroup to move to. ++ * ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller ++ * has to make sure that the reference to cgroup is valid across the call. ++ * ++ * NOTE: an alternative approach might have been to store the current ++ * cgroup in bfqq and getting a reference to it, reducing the lookup ++ * time here, at the price of slightly more complex code. ++ */ ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, ++ struct cgroup *cgroup) ++{ ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); ++ struct bfq_entity *entity; ++ struct bfq_group *bfqg; ++ struct bfqio_cgroup *bgrp; ++ ++ bgrp = cgroup_to_bfqio(cgroup); ++ ++ bfqg = bfq_find_alloc_group(bfqd, cgroup); ++ if (async_bfqq != NULL) { ++ entity = &async_bfqq->entity; ++ ++ if (entity->sched_data != &bfqg->sched_data) { ++ bic_set_bfqq(bic, NULL, 0); ++ bfq_log_bfqq(bfqd, async_bfqq, ++ "bic_change_group: %p %d", ++ async_bfqq, atomic_read(&async_bfqq->ref)); ++ bfq_put_queue(async_bfqq); ++ } ++ } ++ ++ if (sync_bfqq != NULL) { ++ entity = &sync_bfqq->entity; ++ if (entity->sched_data != &bfqg->sched_data) ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); ++ } ++ ++ return bfqg; ++} ++ ++/** ++ * bfq_bic_change_cgroup - move @bic to @cgroup. ++ * @bic: the bic being migrated. ++ * @cgroup: the destination cgroup. ++ * ++ * When the task owning @bic is moved to @cgroup, @bic is immediately ++ * moved into its new parent group. ++ */ ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic, ++ struct cgroup *cgroup) ++{ ++ struct bfq_data *bfqd; ++ unsigned long uninitialized_var(flags); ++ ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags); ++ if (bfqd != NULL) { ++ __bfq_bic_change_cgroup(bfqd, bic, cgroup); ++ bfq_put_bfqd_unlock(bfqd, &flags); ++ } ++} ++ ++/** ++ * bfq_bic_update_cgroup - update the cgroup of @bic. ++ * @bic: the @bic to update. ++ * ++ * Make sure that @bic is enqueued in the cgroup of the current task. ++ * We need this in addition to moving bics during the cgroup attach ++ * phase because the task owning @bic could be at its first disk ++ * access or we may end up in the root cgroup as the result of a ++ * memory allocation failure and here we try to move to the right ++ * group. ++ * ++ * Must be called under the queue lock. It is safe to use the returned ++ * value even after the rcu_read_unlock() as the migration/destruction ++ * paths act under the queue lock too. IOW it is impossible to race with ++ * group migration/destruction and end up with an invalid group as: ++ * a) here cgroup has not yet been destroyed, nor its destroy callback ++ * has started execution, as current holds a reference to it, ++ * b) if it is destroyed after rcu_read_unlock() [after current is ++ * migrated to a different cgroup] its attach() callback will have ++ * taken care of remove all the references to the old cgroup data. 
++ */ ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_group *bfqg; ++ struct cgroup *cgroup; ++ ++ BUG_ON(bfqd == NULL); ++ ++ rcu_read_lock(); ++ cgroup = task_cgroup(current, bfqio_subsys_id); ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup); ++ rcu_read_unlock(); ++ ++ return bfqg; ++} ++ ++/** ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. ++ * @st: the service tree being flushed. ++ */ ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entity = st->first_idle; ++ ++ for (; entity != NULL; entity = st->first_idle) ++ __bfq_deactivate_entity(entity, 0); ++} ++ ++/** ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group. ++ * @bfqd: the device data structure with the root group. ++ * @entity: the entity to move. ++ */ ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(bfqq == NULL); ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); ++ return; ++} ++ ++/** ++ * bfq_reparent_active_entities - move to the root group all active entities. ++ * @bfqd: the device data structure with the root group. ++ * @bfqg: the group to move from. ++ * @st: the service tree with the entities. ++ * ++ * Needs queue_lock to be taken and reference to be valid over the call. ++ */ ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ struct bfq_service_tree *st) ++{ ++ struct rb_root *active = &st->active; ++ struct bfq_entity *entity = NULL; ++ ++ if (!RB_EMPTY_ROOT(&st->active)) ++ entity = bfq_entity_of(rb_first(active)); ++ ++ for (; entity != NULL ; entity = bfq_entity_of(rb_first(active))) ++ bfq_reparent_leaf_entity(bfqd, entity); ++ ++ if (bfqg->sched_data.active_entity != NULL) ++ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity); ++ ++ return; ++} ++ ++/** ++ * bfq_destroy_group - destroy @bfqg. ++ * @bgrp: the bfqio_cgroup containing @bfqg. ++ * @bfqg: the group being destroyed. ++ * ++ * Destroy @bfqg, making sure that it is not referenced from its parent. ++ */ ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) ++{ ++ struct bfq_data *bfqd; ++ struct bfq_service_tree *st; ++ struct bfq_entity *entity = bfqg->my_entity; ++ unsigned long uninitialized_var(flags); ++ int i; ++ ++ hlist_del(&bfqg->group_node); ++ ++ /* ++ * Empty all service_trees belonging to this group before deactivating ++ * the group itself. ++ */ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { ++ st = bfqg->sched_data.service_tree + i; ++ ++ /* ++ * The idle tree may still contain bfq_queues belonging ++ * to exited task because they never migrated to a different ++ * cgroup from the one being destroyed now. Noone else ++ * can access them so it's safe to act without any lock. ++ */ ++ bfq_flush_idle_tree(st); ++ ++ /* ++ * It may happen that some queues are still active ++ * (busy) upon group destruction (if the corresponding ++ * processes have been forced to terminate). We move ++ * all the leaf entities corresponding to these queues ++ * to the root_group. ++ * Also, it may happen that the group has an entity ++ * under service, which is disconnected from the active ++ * tree: it must be moved, too. ++ * There is no need to put the sync queues, as the ++ * scheduler has taken no reference. 
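++ * The bfqd pointer is re-read through bfq_get_bfqd_locked() because
++ * the device may be going away concurrently (see the race comment
++ * further below); if it is already gone the reparenting is skipped.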
++ */ ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); ++ if (bfqd != NULL) { ++ bfq_reparent_active_entities(bfqd, bfqg, st); ++ bfq_put_bfqd_unlock(bfqd, &flags); ++ } ++ BUG_ON(!RB_EMPTY_ROOT(&st->active)); ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); ++ } ++ BUG_ON(bfqg->sched_data.next_active != NULL); ++ BUG_ON(bfqg->sched_data.active_entity != NULL); ++ ++ /* ++ * We may race with device destruction, take extra care when ++ * dereferencing bfqg->bfqd. ++ */ ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); ++ if (bfqd != NULL) { ++ hlist_del(&bfqg->bfqd_node); ++ __bfq_deactivate_entity(entity, 0); ++ bfq_put_async_queues(bfqd, bfqg); ++ bfq_put_bfqd_unlock(bfqd, &flags); ++ } ++ BUG_ON(entity->tree != NULL); ++ ++ /* ++ * No need to defer the kfree() to the end of the RCU grace ++ * period: we are called from the destroy() callback of our ++ * cgroup, so we can be sure that noone is a) still using ++ * this cgroup or b) doing lookups in it. ++ */ ++ kfree(bfqg); ++} ++ ++/** ++ * bfq_disconnect_groups - diconnect @bfqd from all its groups. ++ * @bfqd: the device descriptor being exited. ++ * ++ * When the device exits we just make sure that no lookup can return ++ * the now unused group structures. They will be deallocated on cgroup ++ * destruction. ++ */ ++static void bfq_disconnect_groups(struct bfq_data *bfqd) ++{ ++ struct hlist_node *pos, *n; ++ struct bfq_group *bfqg; ++ ++ bfq_log(bfqd, "disconnect_groups beginning") ; ++ hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node) { ++ hlist_del(&bfqg->bfqd_node); ++ ++ __bfq_deactivate_entity(bfqg->my_entity, 0); ++ ++ /* ++ * Don't remove from the group hash, just set an ++ * invalid key. No lookups can race with the ++ * assignment as bfqd is being destroyed; this ++ * implies also that new elements cannot be added ++ * to the list. ++ */ ++ rcu_assign_pointer(bfqg->bfqd, NULL); ++ ++ bfq_log(bfqd, "disconnect_groups: put async for group %p", ++ bfqg) ; ++ bfq_put_async_queues(bfqd, bfqg); ++ } ++} ++ ++static inline void bfq_free_root_group(struct bfq_data *bfqd) ++{ ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; ++ struct bfq_group *bfqg = bfqd->root_group; ++ ++ bfq_put_async_queues(bfqd, bfqg); ++ ++ spin_lock_irq(&bgrp->lock); ++ hlist_del_rcu(&bfqg->group_node); ++ spin_unlock_irq(&bgrp->lock); ++ ++ /* ++ * No need to synchronize_rcu() here: since the device is gone ++ * there cannot be any read-side access to its root_group. 
++ */ ++ kfree(bfqg); ++} ++ ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) ++{ ++ struct bfq_group *bfqg; ++ struct bfqio_cgroup *bgrp; ++ int i; ++ ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); ++ if (bfqg == NULL) ++ return NULL; ++ ++ bfqg->entity.parent = NULL; ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ ++ bgrp = &bfqio_root_cgroup; ++ spin_lock_irq(&bgrp->lock); ++ rcu_assign_pointer(bfqg->bfqd, bfqd); ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); ++ spin_unlock_irq(&bgrp->lock); ++ ++ return bfqg; ++} ++ ++#define SHOW_FUNCTION(__VAR) \ ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \ ++ struct cftype *cftype) \ ++{ \ ++ struct bfqio_cgroup *bgrp; \ ++ u64 ret; \ ++ \ ++ if (!cgroup_lock_live_group(cgroup)) \ ++ return -ENODEV; \ ++ \ ++ bgrp = cgroup_to_bfqio(cgroup); \ ++ spin_lock_irq(&bgrp->lock); \ ++ ret = bgrp->__VAR; \ ++ spin_unlock_irq(&bgrp->lock); \ ++ \ ++ cgroup_unlock(); \ ++ \ ++ return ret; \ ++} ++ ++SHOW_FUNCTION(weight); ++SHOW_FUNCTION(ioprio); ++SHOW_FUNCTION(ioprio_class); ++#undef SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ ++static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \ ++ struct cftype *cftype, \ ++ u64 val) \ ++{ \ ++ struct bfqio_cgroup *bgrp; \ ++ struct bfq_group *bfqg; \ ++ struct hlist_node *n; \ ++ \ ++ if (val < (__MIN) || val > (__MAX)) \ ++ return -EINVAL; \ ++ \ ++ if (!cgroup_lock_live_group(cgroup)) \ ++ return -ENODEV; \ ++ \ ++ bgrp = cgroup_to_bfqio(cgroup); \ ++ \ ++ spin_lock_irq(&bgrp->lock); \ ++ bgrp->__VAR = (unsigned short)val; \ ++ hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) { \ ++ bfqg->entity.new_##__VAR = (unsigned short)val; \ ++ smp_wmb(); \ ++ bfqg->entity.ioprio_changed = 1; \ ++ } \ ++ spin_unlock_irq(&bgrp->lock); \ ++ \ ++ cgroup_unlock(); \ ++ \ ++ return 0; \ ++} ++ ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); ++#undef STORE_FUNCTION ++ ++static struct cftype bfqio_files[] = { ++ { ++ .name = "weight", ++ .read_u64 = bfqio_cgroup_weight_read, ++ .write_u64 = bfqio_cgroup_weight_write, ++ }, ++ { ++ .name = "ioprio", ++ .read_u64 = bfqio_cgroup_ioprio_read, ++ .write_u64 = bfqio_cgroup_ioprio_write, ++ }, ++ { ++ .name = "ioprio_class", ++ .read_u64 = bfqio_cgroup_ioprio_class_read, ++ .write_u64 = bfqio_cgroup_ioprio_class_write, ++ }, ++}; ++ ++static int bfqio_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) ++{ ++ return cgroup_add_files(cgroup, subsys, bfqio_files, ++ ARRAY_SIZE(bfqio_files)); ++} ++ ++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys *subsys, ++ struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp; ++ ++ if (cgroup->parent != NULL) { ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); ++ if (bgrp == NULL) ++ return ERR_PTR(-ENOMEM); ++ } else ++ bgrp = &bfqio_root_cgroup; ++ ++ spin_lock_init(&bgrp->lock); ++ INIT_HLIST_HEAD(&bgrp->group_data); ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; ++ ++ return &bgrp->css; ++} ++ ++/* ++ * We cannot support shared io contexts, as we have no mean to support ++ * two tasks with the same ioc in two different groups without major rework ++ * of the main bic/bfqq data structures. 
By now we allow a task to change ++ * its cgroup only if it's the only owner of its ioc; the drawback of this ++ * behavior is that a group containing a task that forked using CLONE_IO ++ * will not be destroyed until the tasks sharing the ioc die. ++ */ ++static int bfqio_can_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, ++ struct cgroup_taskset *tset) ++{ ++ struct task_struct *task; ++ struct io_context *ioc; ++ int ret = 0; ++ ++ cgroup_taskset_for_each(task, cgroup, tset) { ++ /* task_lock() is needed to avoid races with exit_io_context() */ ++ task_lock(task); ++ ioc = task->io_context; ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) ++ /* ++ * ioc == NULL means that the task is either too young or ++ * exiting: if it has still no ioc the ioc can't be shared, ++ * if the task is exiting the attach will fail anyway, no ++ * matter what we return here. ++ */ ++ ret = -EINVAL; ++ task_unlock(task); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++static void bfqio_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, ++ struct cgroup_taskset *tset) ++{ ++ struct task_struct *task; ++ struct io_context *ioc; ++ struct io_cq *icq; ++ struct hlist_node *n; ++ ++ /* ++ * IMPORTANT NOTE: The move of more than one process at a time to a ++ * new group has not yet been tested. ++ */ ++ cgroup_taskset_for_each(task, cgroup, tset) { ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); ++ if (ioc) { ++ /* ++ * Handle cgroup change here. ++ */ ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(icq, n, &ioc->icq_list, ioc_node) ++ if (!strncmp(icq->q->elevator->type->elevator_name, ++ "bfq", ELV_NAME_MAX)) ++ bfq_bic_change_cgroup(icq_to_bic(icq), ++ cgroup); ++ rcu_read_unlock(); ++ put_io_context(ioc); ++ } ++ } ++} ++ ++static void bfqio_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); ++ struct hlist_node *n, *tmp; ++ struct bfq_group *bfqg; ++ ++ /* ++ * Since we are destroying the cgroup, there are no more tasks ++ * referencing it, and all the RCU grace periods that may have ++ * referenced it are ended (as the destruction of the parent ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by ++ * anything else and we don't need any synchronization. 
++ */ ++ hlist_for_each_entry_safe(bfqg, n, tmp, &bgrp->group_data, group_node) ++ bfq_destroy_group(bgrp, bfqg); ++ ++ BUG_ON(!hlist_empty(&bgrp->group_data)); ++ ++ kfree(bgrp); ++} ++ ++struct cgroup_subsys bfqio_subsys = { ++ .name = "bfqio", ++ .create = bfqio_create, ++ .can_attach = bfqio_can_attach, ++ .attach = bfqio_attach, ++ .destroy = bfqio_destroy, ++ .populate = bfqio_populate, ++ .subsys_id = bfqio_subsys_id, ++}; ++#else ++static inline void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = entity->new_ioprio; ++ entity->ioprio_class = entity->new_ioprio_class; ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static inline struct bfq_group * ++bfq_bic_update_cgroup(struct bfq_io_cq *bic) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ return bfqd->root_group; ++} ++ ++static inline void bfq_bfqq_move(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++} ++ ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd) ++{ ++ bfq_put_async_queues(bfqd, bfqd->root_group); ++} ++ ++static inline void bfq_free_root_group(struct bfq_data *bfqd) ++{ ++ kfree(bfqd->root_group); ++} ++ ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) ++{ ++ struct bfq_group *bfqg; ++ int i; ++ ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); ++ if (bfqg == NULL) ++ return NULL; ++ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ ++ return bfqg; ++} ++#endif +diff --git a/block/bfq-ioc.c block/bfq-ioc.c +new file mode 100644 +index 0000000..af791d2 +--- /dev/null ++++ block/bfq-ioc.c +@@ -0,0 +1,34 @@ ++/* ++ * BFQ: I/O context handling. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ */ ++ ++/** ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. ++ * @icq: the iocontext queue. ++ */ ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq) ++{ ++ /* bic->icq is the first member, %NULL will convert to %NULL */ ++ return container_of(icq, struct bfq_io_cq, icq); ++} ++ ++/** ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. ++ * @bfqd: the lookup key. ++ * @ioc: the io_context of the process doing I/O. ++ * ++ * Queue lock must be held. ++ */ ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, ++ struct io_context *ioc) ++{ ++ if(ioc) ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); ++ return NULL; ++} +diff --git a/block/bfq-iosched.c block/bfq-iosched.c +new file mode 100644 +index 0000000..4122afd +--- /dev/null ++++ block/bfq-iosched.c +@@ -0,0 +1,2985 @@ ++/* ++ * BFQ, or Budget Fair Queueing, disk scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. ++ * ++ * BFQ is a proportional share disk scheduling algorithm based on the ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, ++ * measured in number of sectors, to tasks instead of time slices. ++ * The disk is not granted to the active task for a given time slice, ++ * but until it has exahusted its assigned budget. 
This change from ++ * the time to the service domain allows BFQ to distribute the disk ++ * bandwidth among tasks as desired, without any distortion due to ++ * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc ++ * internal scheduler, called B-WF2Q+, to schedule tasks according to ++ * their budgets. Thanks to this accurate scheduler, BFQ can afford ++ * to assign high budgets to disk-bound non-seeky tasks (to boost the ++ * throughput), and yet guarantee low latencies to interactive and ++ * soft real-time applications. ++ * ++ * BFQ has been introduced in [1], where the interested reader can ++ * find an accurate description of the algorithm, the bandwidth ++ * distribution and latency guarantees it provides, plus formal proofs ++ * of all the properties. With respect to the algorithm presented in ++ * the paper, this implementation adds several little heuristics, and ++ * a hierarchical extension, based on H-WF2Q+. ++ * ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) ++ * complexity derives from the one introduced with EEVDF in [3]. ++ * ++ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling ++ * with Deterministic Guarantees on Bandwidth Distribution,'', ++ * IEEE Transactions on Computer, May 2010. ++ * ++ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf ++ * ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, ++ * Oct 1997. ++ * ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz ++ * ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline ++ * First: A Flexible and Accurate Mechanism for Proportional Share ++ * Resource Allocation,'' technical report. ++ * ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "bfq.h" ++#include "blk.h" ++ ++/* Max number of dispatches in one round of service. */ ++static const int bfq_quantum = 4; ++ ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */ ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; ++ ++/* Maximum backwards seek, in KiB. */ ++static const int bfq_back_max = 16 * 1024; ++ ++/* Penalty of a backwards seek, in number of sectors. */ ++static const int bfq_back_penalty = 2; ++ ++/* Idling period duration, in jiffies. */ ++static int bfq_slice_idle = HZ / 125; ++ ++/* Default maximum budget values, in sectors and number of requests. */ ++static const int bfq_default_max_budget = 16 * 1024; ++static const int bfq_max_budget_async_rq = 4; ++ ++/* ++ * Async to sync throughput distribution is controlled as follows: ++ * when an async request is served, the entity is charged the number ++ * of sectors of the request, multipled by the factor below ++ */ ++static const int bfq_async_charge_factor = 10; ++ ++/* Default timeout values, in jiffies, approximating CFQ defaults. */ ++static const int bfq_timeout_sync = HZ / 8; ++static int bfq_timeout_async = HZ / 25; ++ ++struct kmem_cache *bfq_pool; ++ ++/* Below this threshold (in ms), we consider thinktime immediate. */ ++#define BFQ_MIN_TT 2 ++ ++/* hw_tag detection: parallel requests threshold and min samples needed. 
*/ ++#define BFQ_HW_QUEUE_THRESHOLD 4 ++#define BFQ_HW_QUEUE_SAMPLES 32 ++ ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024) ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) ++ ++/* Min samples used for peak rate estimation (for autotuning). */ ++#define BFQ_PEAK_RATE_SAMPLES 32 ++ ++/* Shift used for peak rate fixed precision calculations. */ ++#define BFQ_RATE_SHIFT 16 ++ ++/* ++ * The duration of the weight raising for interactive applications is ++ * computed automatically (as default behaviour), using the following ++ * formula: duration = (R / r) * T, where r is the peak rate of the ++ * disk, and R and T are two reference parameters. In particular, R is ++ * the peak rate of a reference disk, and T is about the maximum time ++ * for starting popular large applications on that disk, under BFQ and ++ * while reading two files in parallel. Finally, BFQ uses two ++ * different pairs (R, T) depending on whether the disk is rotational ++ * or non-rotational. ++ */ ++#define T_rot (msecs_to_jiffies(5500)) ++#define T_nonrot (msecs_to_jiffies(2000)) ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */ ++#define R_rot 17415 ++#define R_nonrot 34791 ++ ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) ++ ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) ++ ++#include "bfq-ioc.c" ++#include "bfq-sched.c" ++#include "bfq-cgroup.c" ++ ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ ++ IOPRIO_CLASS_IDLE) ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ ++ IOPRIO_CLASS_RT) ++ ++#define bfq_sample_valid(samples) ((samples) > 80) ++ ++/* ++ * We regard a request as SYNC, if either it's a read or has the SYNC bit ++ * set (in which case it could also be a direct WRITE). ++ */ ++static inline int bfq_bio_sync(struct bio *bio) ++{ ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * Scheduler run of queue, if there are requests pending and no one in the ++ * driver that will restart queueing. ++ */ ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) ++{ ++ if (bfqd->queued != 0) { ++ bfq_log(bfqd, "schedule dispatch"); ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); ++ } ++} ++ ++/* ++ * Lifted from AS - choose which of rq1 and rq2 that is best served now. ++ * We choose the request that is closesr to the head right now. Distance ++ * behind the head is penalized and only allowed to a certain extent. ++ */ ++static struct request *bfq_choose_req(struct bfq_data *bfqd, ++ struct request *rq1, ++ struct request *rq2, ++ sector_t last) ++{ ++ sector_t s1, s2, d1 = 0, d2 = 0; ++ unsigned long back_max; ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */ ++ ++ if (rq1 == NULL || rq1 == rq2) ++ return rq2; ++ if (rq2 == NULL) ++ return rq1; ++ ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) ++ return rq1; ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) ++ return rq2; ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) ++ return rq1; ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) ++ return rq2; ++ ++ s1 = blk_rq_pos(rq1); ++ s2 = blk_rq_pos(rq2); ++ ++ /* ++ * By definition, 1KiB is 2 sectors. 
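++ * The bfq_back_max tunable is expressed in KiB, so the multiplication
++ * by two below yields the same limit in 512-byte sectors.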
++ */ ++ back_max = bfqd->bfq_back_max * 2; ++ ++ /* ++ * Strict one way elevator _except_ in the case where we allow ++ * short backward seeks which are biased as twice the cost of a ++ * similar forward seek. ++ */ ++ if (s1 >= last) ++ d1 = s1 - last; ++ else if (s1 + back_max >= last) ++ d1 = (last - s1) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ1_WRAP; ++ ++ if (s2 >= last) ++ d2 = s2 - last; ++ else if (s2 + back_max >= last) ++ d2 = (last - s2) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ2_WRAP; ++ ++ /* Found required data */ ++ ++ /* ++ * By doing switch() on the bit mask "wrap" we avoid having to ++ * check two variables for all permutations: --> faster! ++ */ ++ switch (wrap) { ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ ++ if (d1 < d2) ++ return rq1; ++ else if (d2 < d1) ++ return rq2; ++ else { ++ if (s1 >= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++ ++ case BFQ_RQ2_WRAP: ++ return rq1; ++ case BFQ_RQ1_WRAP: ++ return rq2; ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ ++ default: ++ /* ++ * Since both rqs are wrapped, ++ * start with the one that's further behind head ++ * (--> only *one* back seek required), ++ * since back seek takes more time than forward. ++ */ ++ if (s1 <= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++} ++ ++static struct bfq_queue * ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, ++ sector_t sector, struct rb_node **ret_parent, ++ struct rb_node ***rb_link) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *bfqq = NULL; ++ ++ parent = NULL; ++ p = &root->rb_node; ++ while (*p) { ++ struct rb_node **n; ++ ++ parent = *p; ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ ++ /* ++ * Sort strictly based on sector. Smallest to the left, ++ * largest to the right. ++ */ ++ if (sector > blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_right; ++ else if (sector < blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_left; ++ else ++ break; ++ p = n; ++ bfqq = NULL; ++ } ++ ++ *ret_parent = parent; ++ if (rb_link) ++ *rb_link = p; ++ ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", ++ (long long unsigned)sector, ++ bfqq != NULL ? 
bfqq->pid : 0); ++ ++ return bfqq; ++} ++ ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *__bfqq; ++ ++ if (bfqq->pos_root != NULL) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ ++ if (bfq_class_idle(bfqq)) ++ return; ++ if (!bfqq->next_rq) ++ return; ++ ++ bfqq->pos_root = &bfqd->rq_pos_tree; ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, ++ blk_rq_pos(bfqq->next_rq), &parent, &p); ++ if (__bfqq == NULL) { ++ rb_link_node(&bfqq->pos_node, parent, p); ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); ++ } else ++ bfqq->pos_root = NULL; ++} ++ ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct rb_node *rbnext = rb_next(&last->rb_node); ++ struct rb_node *rbprev = rb_prev(&last->rb_node); ++ struct request *next = NULL, *prev = NULL; ++ ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); ++ ++ if (rbprev != NULL) ++ prev = rb_entry_rq(rbprev); ++ ++ if (rbnext != NULL) ++ next = rb_entry_rq(rbnext); ++ else { ++ rbnext = rb_first(&bfqq->sort_list); ++ if (rbnext && rbnext != &last->rb_node) ++ next = rb_entry_rq(rbnext); ++ } ++ ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); ++} ++ ++static void bfq_del_rq_rb(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ BUG_ON(bfqq->queued[sync] == 0); ++ bfqq->queued[sync]--; ++ bfqd->queued--; ++ ++ elv_rb_del(&bfqq->sort_list, rq); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue) ++ bfq_del_bfqq_busy(bfqd, bfqq, 1); ++ /* ++ * Remove queue from request-position tree as it is empty. ++ */ ++ if (bfqq->pos_root != NULL) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ } ++} ++ ++/* see the definition of bfq_async_charge_factor for details */ ++static inline unsigned long bfq_serv_to_charge(struct request *rq, ++ struct bfq_queue *bfqq) ++{ ++ return blk_rq_sectors(rq) * ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) * ++ bfq_async_charge_factor)); ++} ++ ++/** ++ * bfq_updated_next_req - update the queue after a new next_rq selection. ++ * @bfqd: the device data the queue belongs to. ++ * @bfqq: the queue to update. ++ * ++ * If the first request of a queue changes we make sure that the queue ++ * has enough budget to serve at least its first request (if the ++ * request has grown). We do this because if the queue has not enough ++ * budget for its first request, it has to go through two dispatch ++ * rounds to actually get it dispatched. ++ */ ++static void bfq_updated_next_req(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ struct request *next_rq = bfqq->next_rq; ++ unsigned long new_budget; ++ ++ if (next_rq == NULL) ++ return; ++ ++ if (bfqq == bfqd->active_queue) ++ /* ++ * In order not to break guarantees, budgets cannot be ++ * changed after an entity has been selected. 
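++ * The in-service queue therefore keeps the budget it was selected
++ * with, and we return without updating anything.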
++ */ ++ return; ++ ++ BUG_ON(entity->tree != &st->active); ++ BUG_ON(entity == entity->sched_data->active_entity); ++ ++ new_budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ entity->budget = new_budget; ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); ++ bfq_activate_bfqq(bfqd, bfqq); ++} ++ ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) ++{ ++ u64 dur; ++ ++ if (bfqd->bfq_raising_max_time > 0) ++ return bfqd->bfq_raising_max_time; ++ ++ dur = bfqd->RT_prod; ++ do_div(dur, bfqd->peak_rate); ++ ++ return dur; ++} ++ ++static void bfq_add_rq_rb(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *next_rq, *prev; ++ unsigned long old_raising_coeff = bfqq->raising_coeff; ++ int idle_for_long_time = bfqq->budget_timeout + ++ bfqd->bfq_raising_min_idle_time < jiffies; ++ ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq)); ++ bfqq->queued[rq_is_sync(rq)]++; ++ bfqd->queued++; ++ ++ elv_rb_add(&bfqq->sort_list, rq); ++ ++ /* ++ * Check if this request is a better next-serve candidate. ++ */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); ++ BUG_ON(next_rq == NULL); ++ bfqq->next_rq = next_rq; ++ ++ /* ++ * Adjust priority tree position, if next_rq changes. ++ */ ++ if (prev != bfqq->next_rq) ++ bfq_rq_pos_tree_add(bfqd, bfqq); ++ ++ if (!bfq_bfqq_busy(bfqq)) { ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && ++ bfqq->soft_rt_next_start < jiffies; ++ entity->budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ ++ if (! bfqd->low_latency) ++ goto add_bfqq_busy; ++ ++ /* ++ * If the queue is not being boosted and has been idle ++ * for enough time, start a weight-raising period ++ */ ++ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) { ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff; ++ if (idle_for_long_time) ++ bfqq->raising_cur_max_time = ++ bfq_wrais_duration(bfqd); ++ else ++ bfqq->raising_cur_max_time = ++ bfqd->bfq_raising_rt_max_time; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais starting at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ } else if (old_raising_coeff > 1) { ++ if (idle_for_long_time) ++ bfqq->raising_cur_max_time = ++ bfq_wrais_duration(bfqd); ++ else if (bfqq->raising_cur_max_time == ++ bfqd->bfq_raising_rt_max_time && ++ !soft_rt) { ++ bfqq->raising_coeff = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ } ++ } ++ if (old_raising_coeff != bfqq->raising_coeff) ++ entity->ioprio_changed = 1; ++add_bfqq_busy: ++ bfq_add_bfqq_busy(bfqd, bfqq); ++ } else { ++ if(bfqd->low_latency && old_raising_coeff == 1 && ++ !rq_is_sync(rq) && ++ bfqq->last_rais_start_finish + ++ bfqd->bfq_raising_min_inter_arr_async < jiffies) { ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff; ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd); ++ ++ entity->ioprio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "non-idle wrais starting at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ } ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ if(bfqd->low_latency && ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 
1 || ++ idle_for_long_time)) ++ bfqq->last_rais_start_finish = jiffies; ++} ++ ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq) ++{ ++ elv_rb_del(&bfqq->sort_list, rq); ++ bfqq->queued[rq_is_sync(rq)]--; ++ bfqq->bfqd->queued--; ++ bfq_add_rq_rb(rq); ++} ++ ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, ++ struct bio *bio) ++{ ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (bic == NULL) ++ return NULL; ++ ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); ++ if (bfqq != NULL) { ++ sector_t sector = bio->bi_sector + bio_sectors(bio); ++ ++ return elv_rb_find(&bfqq->sort_list, sector); ++ } ++ ++ return NULL; ++} ++ ++static void bfq_activate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ bfqd->rq_in_driver++; ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", ++ (long long unsigned)bfqd->last_position); ++} ++ ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ WARN_ON(bfqd->rq_in_driver == 0); ++ bfqd->rq_in_driver--; ++} ++ ++static void bfq_remove_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ if (bfqq->next_rq == rq) { ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ list_del_init(&rq->queuelist); ++ bfq_del_rq_rb(rq); ++ ++ if (rq->cmd_flags & REQ_META) { ++ WARN_ON(bfqq->meta_pending == 0); ++ bfqq->meta_pending--; ++ } ++} ++ ++static int bfq_merge(struct request_queue *q, struct request **req, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *__rq; ++ ++ __rq = bfq_find_rq_fmerge(bfqd, bio); ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { ++ *req = __rq; ++ return ELEVATOR_FRONT_MERGE; ++ } ++ ++ return ELEVATOR_NO_MERGE; ++} ++ ++static void bfq_merged_request(struct request_queue *q, struct request *req, ++ int type) ++{ ++ if (type == ELEVATOR_FRONT_MERGE) { ++ struct bfq_queue *bfqq = RQ_BFQQ(req); ++ ++ bfq_reposition_rq_rb(bfqq, req); ++ } ++} ++ ++static void bfq_merged_requests(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ /* ++ * Reposition in fifo if next is older than rq. ++ */ ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) { ++ list_move(&rq->queuelist, &next->queuelist); ++ rq_set_fifo_time(rq, rq_fifo_time(next)); ++ } ++ ++ if (bfqq->next_rq == next) ++ bfqq->next_rq = rq; ++ ++ bfq_remove_request(next); ++} ++ ++static int bfq_allow_merge(struct request_queue *q, struct request *rq, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ /* ++ * Disallow merge of a sync bio into an async request. ++ */ ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) ++ return 0; ++ ++ /* ++ * Lookup the bfqq that this bio will be queued with. Allow ++ * merge only if rq is queued there. ++ * Queue lock is held here. 
++ */ ++ bic = bfq_bic_lookup(bfqd, current->io_context); ++ if (bic == NULL) ++ return 0; ++ ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); ++ return bfqq == RQ_BFQQ(rq); ++} ++ ++static void __bfq_set_active_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ if (bfqq != NULL) { ++ bfq_mark_bfqq_must_alloc(bfqq); ++ bfq_mark_bfqq_budget_new(bfqq); ++ bfq_clear_bfqq_fifo_expire(bfqq); ++ ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; ++ ++ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu", ++ bfqq->entity.budget); ++ } ++ ++ bfqd->active_queue = bfqq; ++} ++ ++/* ++ * Get and set a new active queue for service. ++ */ ++static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ if (!bfqq) ++ bfqq = bfq_get_next_queue(bfqd); ++ else ++ bfq_get_next_queue_forced(bfqd, bfqq); ++ ++ __bfq_set_active_queue(bfqd, bfqq); ++ return bfqq; ++} ++ ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd, ++ struct request *rq) ++{ ++ if (blk_rq_pos(rq) >= bfqd->last_position) ++ return blk_rq_pos(rq) - bfqd->last_position; ++ else ++ return bfqd->last_position - blk_rq_pos(rq); ++} ++ ++/* ++ * Return true if bfqq has no request pending and rq is close enough to ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than ++ * bfqq->next_rq ++ */ ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq) ++{ ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR; ++} ++ ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) ++{ ++ struct rb_root *root = &bfqd->rq_pos_tree; ++ struct rb_node *parent, *node; ++ struct bfq_queue *__bfqq; ++ sector_t sector = bfqd->last_position; ++ ++ if (RB_EMPTY_ROOT(root)) ++ return NULL; ++ ++ /* ++ * First, if we find a request starting at the end of the last ++ * request, choose it. ++ */ ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); ++ if (__bfqq != NULL) ++ return __bfqq; ++ ++ /* ++ * If the exact sector wasn't found, the parent of the NULL leaf ++ * will contain the closest sector (rq_pos_tree sorted by next_request ++ * position). ++ */ ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ if (bfq_rq_close(bfqd, __bfqq->next_rq)) ++ return __bfqq; ++ ++ if (blk_rq_pos(__bfqq->next_rq) < sector) ++ node = rb_next(&__bfqq->pos_node); ++ else ++ node = rb_prev(&__bfqq->pos_node); ++ if (node == NULL) ++ return NULL; ++ ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); ++ if (bfq_rq_close(bfqd, __bfqq->next_rq)) ++ return __bfqq; ++ ++ return NULL; ++} ++ ++/* ++ * bfqd - obvious ++ * cur_bfqq - passed in so that we don't decide that the current queue ++ * is closely cooperating with itself. ++ * ++ * We are assuming that cur_bfqq has dispatched at least one request, ++ * and that bfqd->last_position reflects a position on the disk associated ++ * with the I/O issued by cur_bfqq. ++ */ ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, ++ struct bfq_queue *cur_bfqq) ++{ ++ struct bfq_queue *bfqq; ++ ++ if (bfq_class_idle(cur_bfqq)) ++ return NULL; ++ if (!bfq_bfqq_sync(cur_bfqq)) ++ return NULL; ++ if (BFQQ_SEEKY(cur_bfqq)) ++ return NULL; ++ ++ /* If device has only one backlogged bfq_queue, don't search. */ ++ if (bfqd->busy_queues == 1) ++ return NULL; ++ ++ /* ++ * We should notice if some of the queues are cooperating, e.g. ++ * working closely on the same area of the disk. In that case, ++ * we can group them together and don't waste time idling. 
++ */ ++ bfqq = bfqq_close(bfqd); ++ if (bfqq == NULL || bfqq == cur_bfqq) ++ return NULL; ++ ++ /* ++ * Do not merge queues from different bfq_groups. ++ */ ++ if (bfqq->entity.parent != cur_bfqq->entity.parent) ++ return NULL; ++ ++ /* ++ * It only makes sense to merge sync queues. ++ */ ++ if (!bfq_bfqq_sync(bfqq)) ++ return NULL; ++ if (BFQQ_SEEKY(bfqq)) ++ return NULL; ++ ++ /* ++ * Do not merge queues of different priority classes. ++ */ ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) ++ return NULL; ++ ++ return bfqq; ++} ++ ++/* ++ * If enough samples have been computed, return the current max budget ++ * stored in bfqd, which is dynamically updated according to the ++ * estimated disk peak rate; otherwise return the default max budget ++ */ ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < 194) ++ return bfq_default_max_budget; ++ else ++ return bfqd->bfq_max_budget; ++} ++ ++/* ++ * Return min budget, which is a fraction of the current or default ++ * max budget (trying with 1/32) ++ */ ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < 194) ++ return bfq_default_max_budget; ++ else ++ return bfqd->bfq_max_budget / 32; ++} ++ ++/* ++ * Decides whether idling should be done for given device and ++ * given active queue. ++ */ ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd, ++ struct bfq_queue *active_bfqq) ++{ ++ if (active_bfqq == NULL) ++ return false; ++ /* ++ * If device is SSD it has no seek penalty, disable idling; but ++ * do so only if: ++ * - device does not support queuing, otherwise we still have ++ * a problem with sync vs async workloads; ++ * - the queue is not weight-raised, to preserve guarantees. ++ */ ++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag && ++ active_bfqq->raising_coeff == 1); ++} ++ ++static void bfq_arm_slice_timer(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->active_queue; ++ struct bfq_io_cq *bic; ++ unsigned long sl; ++ ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (bfq_queue_nonrot_noidle(bfqd, bfqq)) ++ return; ++ ++ /* Idling is disabled, either manually or by past process history. */ ++ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq)) ++ return; ++ ++ /* Tasks have exited, don't wait. */ ++ bic = bfqd->active_bic; ++ if (bic == NULL || atomic_read(&bic->icq.ioc->nr_tasks) == 0) ++ return; ++ ++ bfq_mark_bfqq_wait_request(bfqq); ++ ++ /* ++ * We don't want to idle for seeks, but we do want to allow ++ * fair distribution of slice time for a process doing back-to-back ++ * seeks. So allow a little bit of time for him to submit a new rq. ++ * ++ * To prevent processes with (partly) seeky workloads from ++ * being too ill-treated, grant them a small fraction of the ++ * assigned budget before reducing the waiting time to ++ * BFQ_MIN_TT. This happened to help reduce latency. ++ */ ++ sl = bfqd->bfq_slice_idle; ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) && ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 && ++ bfqq->raising_coeff == 1) ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); ++ else if (bfqq->raising_coeff > 1) ++ sl = sl * 3; ++ bfqd->last_idling_start = ktime_get(); ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl); ++ bfq_log(bfqd, "arm idle: %u/%u ms", ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); ++} ++ ++/* ++ * Set the maximum time for the active queue to consume its ++ * budget. 
This prevents seeky processes from lowering the disk ++ * throughput (always guaranteed with a time slice scheme as in CFQ). ++ */ ++static void bfq_set_budget_timeout(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->active_queue; ++ unsigned int timeout_coeff; ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time) ++ timeout_coeff = 1; ++ else ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; ++ ++ bfqd->last_budget_start = ktime_get(); ++ ++ bfq_clear_bfqq_budget_new(bfqq); ++ bfqq->budget_timeout = jiffies + ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; ++ ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * ++ timeout_coeff)); ++} ++ ++/* ++ * Move request from internal lists to the request queue dispatch list. ++ */ ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ bfq_remove_request(rq); ++ bfqq->dispatched++; ++ elv_dispatch_sort(q, rq); ++ ++ if (bfq_bfqq_sync(bfqq)) ++ bfqd->sync_flight++; ++} ++ ++/* ++ * Return expired entry, or NULL to just start from scratch in rbtree. ++ */ ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq) ++{ ++ struct request *rq = NULL; ++ ++ if (bfq_bfqq_fifo_expire(bfqq)) ++ return NULL; ++ ++ bfq_mark_bfqq_fifo_expire(bfqq); ++ ++ if (list_empty(&bfqq->fifo)) ++ return NULL; ++ ++ rq = rq_entry_fifo(bfqq->fifo.next); ++ ++ if (time_before(jiffies, rq_fifo_time(rq))) ++ return NULL; ++ ++ return rq; ++} ++ ++/* ++ * Must be called with the queue_lock held. ++ */ ++static int bfqq_process_refs(struct bfq_queue *bfqq) ++{ ++ int process_refs, io_refs; ++ ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; ++ BUG_ON(process_refs < 0); ++ return process_refs; ++} ++ ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ int process_refs, new_process_refs; ++ struct bfq_queue *__bfqq; ++ ++ /* ++ * If there are no process references on the new_bfqq, then it is ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain ++ * may have dropped their last reference (not just their last process ++ * reference). ++ */ ++ if (!bfqq_process_refs(new_bfqq)) ++ return; ++ ++ /* Avoid a circular list and skip interim queue merges. */ ++ while ((__bfqq = new_bfqq->new_bfqq)) { ++ if (__bfqq == bfqq) ++ return; ++ new_bfqq = __bfqq; ++ } ++ ++ process_refs = bfqq_process_refs(bfqq); ++ new_process_refs = bfqq_process_refs(new_bfqq); ++ /* ++ * If the process for the bfqq has gone away, there is no ++ * sense in merging the queues. ++ */ ++ if (process_refs == 0 || new_process_refs == 0) ++ return; ++ ++ /* ++ * Merge in the direction of the lesser amount of work. 
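++ * That is, below the queue with fewer process references is chained, ++ * via its new_bfqq pointer, to the queue with more references, which ++ * also gains the corresponding reference count.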
++ */ ++ if (new_process_refs >= process_refs) { ++ bfqq->new_bfqq = new_bfqq; ++ atomic_add(process_refs, &new_bfqq->ref); ++ } else { ++ new_bfqq->new_bfqq = bfqq; ++ atomic_add(new_process_refs, &bfqq->ref); ++ } ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", ++ new_bfqq->pid); ++} ++ ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ return entity->budget - entity->service; ++} ++ ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfqq != bfqd->active_queue); ++ ++ __bfq_bfqd_reset_active(bfqd); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ bfq_del_bfqq_busy(bfqd, bfqq, 1); ++ /* ++ * overloading budget_timeout field to store when ++ * the queue remains with no backlog, used by ++ * the weight-raising mechanism ++ */ ++ bfqq->budget_timeout = jiffies; ++ } else { ++ bfq_activate_bfqq(bfqd, bfqq); ++ /* ++ * Resort priority tree of potential close cooperators. ++ */ ++ bfq_rq_pos_tree_add(bfqd, bfqq); ++ } ++ ++ /* ++ * If this bfqq is shared between multiple processes, check ++ * to make sure that those processes are still issuing I/Os ++ * within the mean seek distance. If not, it may be time to ++ * break the queues apart again. ++ */ ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) ++ bfq_mark_bfqq_split_coop(bfqq); ++} ++ ++/** ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. ++ * @bfqd: device data. ++ * @bfqq: queue to update. ++ * @reason: reason for expiration. ++ * ++ * Handle the feedback on @bfqq budget. See the body for detailed ++ * comments. ++ */ ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ enum bfqq_expiration reason) ++{ ++ struct request *next_rq; ++ unsigned long budget, min_budget; ++ ++ budget = bfqq->max_budget; ++ min_budget = bfq_min_budget(bfqd); ++ ++ BUG_ON(bfqq != bfqd->active_queue); ++ ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", ++ budget, bfq_min_budget(bfqd)); ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue)); ++ ++ if (bfq_bfqq_sync(bfqq)) { ++ switch (reason) { ++ /* ++ * Caveat: in all the following cases we trade latency ++ * for throughput. ++ */ ++ case BFQ_BFQQ_TOO_IDLE: ++ /* ++ * This is the only case where we may reduce ++ * the budget: if there are no requests of the ++ * process still waiting for completion, then ++ * we assume (tentatively) that the timer has ++ * expired because the batch of requests of ++ * the process could have been served with a ++ * smaller budget. Hence, betting that the ++ * process will behave in the same way when it ++ * becomes backlogged again, we reduce its ++ * next budget. As long as we guess right, ++ * this budget cut reduces the latency ++ * experienced by the process. ++ * ++ * However, if there are still outstanding ++ * requests, then the process may have not yet ++ * issued its next request just because it is ++ * still waiting for the completion of some of ++ * the still outstanding ones. So in this ++ * subcase we do not reduce its budget, on the ++ * contrary we increase it to possibly boost ++ * the throughput, as discussed in the ++ * comments to the BUDGET_TIMEOUT case. 
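++ * ++ * Concretely, in the code below the budget is doubled (capped at ++ * bfqd->bfq_max_budget) when requests are still outstanding, and is ++ * otherwise reduced by four times the minimum budget, but never ++ * below the minimum budget itself.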
++ */ ++ if (bfqq->dispatched > 0) /* still outstanding reqs */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ else { ++ if (budget > 5 * min_budget) ++ budget -= 4 * min_budget; ++ else ++ budget = min_budget; ++ } ++ break; ++ case BFQ_BFQQ_BUDGET_TIMEOUT: ++ /* ++ * We double the budget here because: 1) it ++ * gives the chance to boost the throughput if ++ * this is not a seeky process (which may have ++ * bumped into this timeout because of, e.g., ++ * ZBR), 2) together with charge_full_budget ++ * it helps give seeky processes higher ++ * timestamps, and hence be served less ++ * frequently. ++ */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_BUDGET_EXHAUSTED: ++ /* ++ * The process still has backlog, and did not ++ * let either the budget timeout or the disk ++ * idling timeout expire. Hence it is not ++ * seeky, has a short thinktime and may be ++ * happy with a higher budget too. So ++ * definitely increase the budget of this good ++ * candidate to boost the disk throughput. ++ */ ++ budget = min(budget * 4, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_NO_MORE_REQUESTS: ++ /* ++ * Leave the budget unchanged. ++ */ ++ default: ++ return; ++ } ++ } else /* async queue */ ++ /* async queues always get the maximum possible budget ++ * (their ability to dispatch is limited by ++ * @bfqd->bfq_max_budget_async_rq). ++ */ ++ budget = bfqd->bfq_max_budget; ++ ++ bfqq->max_budget = budget; ++ ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && ++ bfqq->max_budget > bfqd->bfq_max_budget) ++ bfqq->max_budget = bfqd->bfq_max_budget; ++ ++ /* ++ * Make sure that we have enough budget for the next request. ++ * Since the finish time of the bfqq must be kept in sync with ++ * the budget, be sure to call __bfq_bfqq_expire() after the ++ * update. ++ */ ++ next_rq = bfqq->next_rq; ++ if (next_rq != NULL) ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ else ++ bfqq->entity.budget = bfqq->max_budget; ++ ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0, ++ bfqq->entity.budget); ++} ++ ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) ++{ ++ unsigned long max_budget; ++ ++ /* ++ * The max_budget calculated when autotuning is equal to the ++ * number of sectors transferred in timeout_sync at the ++ * estimated peak rate. ++ */ ++ max_budget = (unsigned long)(peak_rate * 1000 * ++ timeout >> BFQ_RATE_SHIFT); ++ ++ return max_budget; ++} ++ ++/* ++ * In addition to updating the peak rate, checks whether the process ++ * is "slow", and returns 1 if so. This slow flag is used, in addition ++ * to the budget timeout, to reduce the amount of service provided to ++ * seeky processes, and hence reduce their chances to lower the ++ * throughput. See the code for more details. ++ */ ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int compensate, enum bfqq_expiration reason) ++{ ++ u64 bw, usecs, expected, timeout; ++ ktime_t delta; ++ int update = 0; ++ ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) ++ return 0; ++ ++ if (compensate) ++ delta = bfqd->last_idling_start; ++ else ++ delta = ktime_get(); ++ delta = ktime_sub(delta, bfqd->last_budget_start); ++ usecs = ktime_to_us(delta); ++ ++ /* Don't trust short/unrealistic values. */ ++ if (usecs < 100 || usecs >= LONG_MAX) ++ return 0; ++ ++ /* ++ * Calculate the bandwidth for the last slice. 
We use a 64 bit ++ * value to store the peak rate, in sectors per usec in fixed ++ * point math. We do so to have enough precision in the estimate ++ * and to avoid overflows. ++ */ ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; ++ do_div(bw, (unsigned long)usecs); ++ ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); ++ ++ /* ++ * Use only long (> 20ms) intervals to filter out spikes for ++ * the peak rate estimation. ++ */ ++ if (usecs > 20000) { ++ if (bw > bfqd->peak_rate || ++ (!BFQQ_SEEKY(bfqq) && ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { ++ bfq_log(bfqd, "measured bw =%llu", bw); ++ /* ++ * To smooth oscillations use a low-pass filter with ++ * alpha=7/8, i.e., ++ * new_rate = (7/8) * old_rate + (1/8) * bw ++ */ ++ do_div(bw, 8); ++ bfqd->peak_rate *= 7; ++ do_div(bfqd->peak_rate, 8); ++ bfqd->peak_rate += bw; ++ update = 1; ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); ++ } ++ ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; ++ ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) ++ bfqd->peak_rate_samples++; ++ ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && ++ update && bfqd->bfq_user_max_budget == 0) { ++ bfqd->bfq_max_budget = ++ bfq_calc_max_budget(bfqd->peak_rate, timeout); ++ bfq_log(bfqd, "new max_budget=%lu", ++ bfqd->bfq_max_budget); ++ } ++ } ++ ++ /* ++ * If the process has been served for a too short time ++ * interval to let its possible sequential accesses prevail on ++ * the initial seek time needed to move the disk head on the ++ * first sector it requested, then give the process a chance ++ * and for the moment return false. ++ */ ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) ++ return 0; ++ ++ /* ++ * A process is considered ``slow'' (i.e., seeky, so that we ++ * cannot treat it fairly in the service domain, as it would ++ * slow down too much the other processes) if, when a slice ++ * ends for whatever reason, it has received service at a ++ * rate that would not be high enough to complete the budget ++ * before the budget timeout expiration. ++ */ ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; ++ ++ /* ++ * Caveat: processes doing IO in the slower disk zones will ++ * tend to be slow(er) even if not seeky. And the estimated ++ * peak rate will actually be an average over the disk ++ * surface. Hence, to not be too harsh with unlucky processes, ++ * we keep a budget/3 margin of safety before declaring a ++ * process slow. ++ */ ++ return expected > (4 * bfqq->entity.budget) / 3; ++} ++ ++/** ++ * bfq_bfqq_expire - expire a queue. ++ * @bfqd: device owning the queue. ++ * @bfqq: the queue to expire. ++ * @compensate: if true, compensate for the time spent idling. ++ * @reason: the reason causing the expiration. ++ * ++ * ++ * If the process associated to the queue is slow (i.e., seeky), or in ++ * case of budget timeout, or, finally, if it is async, we ++ * artificially charge it an entire budget (independently of the ++ * actual service it received). As a consequence, the queue will get ++ * higher timestamps than the correct ones upon reactivation, and ++ * hence it will be rescheduled as if it had received more service ++ * than what it actually received. In the end, this class of processes ++ * will receive less service in proportion to how slowly they consume ++ * their budgets (and hence how seriously they tend to lower the ++ * throughput). 
++ * ++ * In contrast, when a queue expires because it has been idling for ++ * too much or because it exhausted its budget, we do not touch the ++ * amount of service it has received. Hence when the queue will be ++ * reactivated and its timestamps updated, the latter will be in sync ++ * with the actual service received by the queue until expiration. ++ * ++ * Charging a full budget to the first type of queues and the exact ++ * service to the others has the effect of using the WF2Q+ policy to ++ * schedule the former on a timeslice basis, without violating the ++ * service domain guarantees of the latter. ++ */ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ int compensate, ++ enum bfqq_expiration reason) ++{ ++ int slow; ++ BUG_ON(bfqq != bfqd->active_queue); ++ ++ /* Update disk peak rate for autotuning and check whether the ++ * process is slow (see bfq_update_peak_rate). ++ */ ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); ++ ++ /* ++ * As above explained, 'punish' slow (i.e., seeky), timed-out ++ * and async queues, to favor sequential sync workloads. ++ * ++ * Processes doing IO in the slower disk zones will tend to be ++ * slow(er) even if not seeky. Hence, since the estimated peak ++ * rate is actually an average over the disk surface, these ++ * processes may timeout just for bad luck. To avoid punishing ++ * them we do not charge a full budget to a process that ++ * succeeded in consuming at least 2/3 of its budget. ++ */ ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) ++ bfq_bfqq_charge_full_budget(bfqq); ++ ++ if (bfqd->low_latency && bfqq->raising_coeff == 1) ++ bfqq->last_rais_start_finish = jiffies; ++ ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) { ++ if(reason != BFQ_BFQQ_BUDGET_TIMEOUT) ++ bfqq->soft_rt_next_start = ++ jiffies + ++ HZ * bfqq->entity.service / ++ bfqd->bfq_raising_max_softrt_rate; ++ else ++ bfqq->soft_rt_next_start = -1; /* infinity */ ++ } ++ bfq_log_bfqq(bfqd, bfqq, ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); ++ ++ /* Increase, decrease or leave budget unchanged according to reason */ ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); ++ __bfq_bfqq_expire(bfqd, bfqq); ++} ++ ++/* ++ * Budget timeout is not implemented through a dedicated timer, but ++ * just checked on request arrivals and completions, as well as on ++ * idle timer expirations. ++ */ ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_budget_new(bfqq)) ++ return 0; ++ ++ if (time_before(jiffies, bfqq->budget_timeout)) ++ return 0; ++ ++ return 1; ++} ++ ++/* ++ * If we expire a queue that is waiting for the arrival of a new ++ * request, we may prevent the fictitious timestamp backshifting that ++ * allows the guarantees of the queue to be preserved (see [1] for ++ * this tricky aspect). Hence we return true only if this condition ++ * does not hold, or if the queue is slow enough to deserve only to be ++ * kicked off for preserving a high throughput. 
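++ * ++ * In code terms: expire for budget timeout only once the budget ++ * timeout has actually expired, and only if either the queue is not ++ * waiting for a new request or it still has at least a third of its ++ * budget left (i.e., it has been consuming its budget slowly).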
++*/ ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "may_budget_timeout: wr %d left %d timeout %d", ++ bfq_bfqq_wait_request(bfqq), ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, ++ bfq_bfqq_budget_timeout(bfqq)); ++ ++ return (!bfq_bfqq_wait_request(bfqq) || ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) ++ && ++ bfq_bfqq_budget_timeout(bfqq); ++} ++ ++/* ++ * Select a queue for service. If we have a current active queue, ++ * check whether to continue servicing it, or retrieve and set a new one. ++ */ ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq, *new_bfqq = NULL; ++ struct request *next_rq; ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ ++ bfqq = bfqd->active_queue; ++ if (bfqq == NULL) ++ goto new_queue; ++ ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue"); ++ ++ /* ++ * If another queue has a request waiting within our mean seek ++ * distance, let it run. The expire code will check for close ++ * cooperators and put the close queue at the front of the ++ * service tree. If possible, merge the expiring queue with the ++ * new bfqq. ++ */ ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq); ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL) ++ bfq_setup_merge(bfqq, new_bfqq); ++ ++ if (bfq_may_expire_for_budg_timeout(bfqq)) ++ goto expire; ++ ++ next_rq = bfqq->next_rq; ++ /* ++ * If bfqq has requests queued and it has enough budget left to ++ * serve them, keep the queue, otherwise expire it. ++ */ ++ if (next_rq != NULL) { ++ if (bfq_serv_to_charge(next_rq, bfqq) > ++ bfq_bfqq_budget_left(bfqq)) { ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; ++ goto expire; ++ } else { ++ /* ++ * The idle timer may be pending because we may not ++ * disable disk idling even when a new request arrives ++ */ ++ if (timer_pending(&bfqd->idle_slice_timer)) { ++ /* ++ * If we get here: 1) at least a new request ++ * has arrived but we have not disabled the ++ * timer because the request was too small, ++ * 2) then the block layer has unplugged the ++ * device, causing the dispatch to be invoked. ++ * ++ * Since the device is unplugged, now the ++ * requests are probably large enough to ++ * provide a reasonable throughput. ++ * So we disable idling. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ del_timer(&bfqd->idle_slice_timer); ++ } ++ if (new_bfqq == NULL) ++ goto keep_queue; ++ else ++ goto expire; ++ } ++ } ++ ++ /* ++ * No requests pending. If there is no cooperator, and the active ++ * queue still has requests in flight or is idling for a new request, ++ * then keep it. ++ */ ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) || ++ (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq) && ++ !bfq_queue_nonrot_noidle(bfqd, bfqq)))) { ++ bfqq = NULL; ++ goto keep_queue; ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) { ++ /* ++ * Expiring the queue because there is a close cooperator, ++ * cancel timer. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ del_timer(&bfqd->idle_slice_timer); ++ } ++ ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS; ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason); ++new_queue: ++ bfqq = bfq_set_active_queue(bfqd, new_bfqq); ++ bfq_log(bfqd, "select_queue: new queue %d returned", ++ bfqq != NULL ? 
bfqq->pid : 0); ++keep_queue: ++ return bfqq; ++} ++ ++static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, " ++ "old raising coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - ++ bfqq->last_rais_start_finish), ++ jiffies_to_msecs(bfqq->raising_cur_max_time), ++ bfqq->raising_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ BUG_ON(bfqq != bfqd->active_queue && entity->weight != ++ entity->orig_weight * bfqq->raising_coeff); ++ if(entity->ioprio_changed) ++ bfq_log_bfqq(bfqd, bfqq, ++ "WARN: pending prio change"); ++ /* ++ * If too much time has elapsed from the beginning ++ * of this weight-raising period and process is not soft ++ * real-time, stop it ++ */ ++ if (jiffies - bfqq->last_rais_start_finish > ++ bfqq->raising_cur_max_time) { ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && ++ bfqq->soft_rt_next_start < jiffies; ++ ++ bfqq->last_rais_start_finish = jiffies; ++ if (soft_rt) ++ bfqq->raising_cur_max_time = ++ bfqd->bfq_raising_rt_max_time; ++ else { ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ bfqq->raising_coeff = 1; ++ entity->ioprio_changed = 1; ++ __bfq_entity_update_weight_prio( ++ bfq_entity_service_tree(entity), ++ entity); ++ } ++ } ++ } ++} ++ ++ ++/* ++ * Dispatch one request from bfqq, moving it to the request queue ++ * dispatch list. ++ */ ++static int bfq_dispatch_request(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ struct request *rq; ++ unsigned long service_to_charge; ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ /* Follow expired path, else get first next available. */ ++ rq = bfq_check_fifo(bfqq); ++ if (rq == NULL) ++ rq = bfqq->next_rq; ++ service_to_charge = bfq_serv_to_charge(rq, bfqq); ++ ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { ++ /* ++ * This may happen if the next rq is chosen ++ * in fifo order instead of sector order. ++ * The budget is properly dimensioned ++ * to be always sufficient to serve the next request ++ * only if it is chosen in sector order. The reason is ++ * that it would be quite inefficient and little useful ++ * to always make sure that the budget is large enough ++ * to serve even the possible next rq in fifo order. ++ * In fact, requests are seldom served in fifo order. ++ * ++ * Expire the queue for budget exhaustion, and ++ * make sure that the next act_budget is enough ++ * to serve the next request, even if it comes ++ * from the fifo expired path. ++ */ ++ bfqq->next_rq = rq; ++ /* ++ * Since this dispatch is failed, make sure that ++ * a new one will be performed ++ */ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++ goto expire; ++ } ++ ++ /* Finally, insert request into driver dispatch list. 
*/ ++ bfq_bfqq_served(bfqq, service_to_charge); ++ bfq_dispatch_insert(bfqd->queue, rq); ++ ++ update_raising_data(bfqd, bfqq); ++ ++ bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), " ++ "budg left %lu", ++ blk_rq_sectors(rq), ++ (long long unsigned)blk_rq_pos(rq), ++ bfq_bfqq_budget_left(bfqq)); ++ ++ dispatched++; ++ ++ if (bfqd->active_bic == NULL) { ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); ++ bfqd->active_bic = RQ_BIC(rq); ++ } ++ ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && ++ dispatched >= bfqd->bfq_max_budget_async_rq) || ++ bfq_class_idle(bfqq))) ++ goto expire; ++ ++ return dispatched; ++ ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); ++ return dispatched; ++} ++ ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ ++ while (bfqq->next_rq != NULL) { ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); ++ dispatched++; ++ } ++ ++ BUG_ON(!list_empty(&bfqq->fifo)); ++ return dispatched; ++} ++ ++/* ++ * Drain our current requests. Used for barriers and when switching ++ * io schedulers on-the-fly. ++ */ ++static int bfq_forced_dispatch(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq, *n; ++ struct bfq_service_tree *st; ++ int dispatched = 0; ++ ++ bfqq = bfqd->active_queue; ++ if (bfqq != NULL) ++ __bfq_bfqq_expire(bfqd, bfqq); ++ ++ /* ++ * Loop through classes, and be careful to leave the scheduler ++ * in a consistent state, as feedback mechanisms and vtime ++ * updates cannot be disabled during the process. ++ */ ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { ++ st = bfq_entity_service_tree(&bfqq->entity); ++ ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq); ++ bfqq->max_budget = bfq_max_budget(bfqd); ++ ++ bfq_forget_idle(st); ++ } ++ ++ BUG_ON(bfqd->busy_queues != 0); ++ ++ return dispatched; ++} ++ ++static int bfq_dispatch_requests(struct request_queue *q, int force) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq; ++ int max_dispatch; ++ ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); ++ if (bfqd->busy_queues == 0) ++ return 0; ++ ++ if (unlikely(force)) ++ return bfq_forced_dispatch(bfqd); ++ ++ if((bfqq = bfq_select_queue(bfqd)) == NULL) ++ return 0; ++ ++ max_dispatch = bfqd->bfq_quantum; ++ if (bfq_class_idle(bfqq)) ++ max_dispatch = 1; ++ ++ if (!bfq_bfqq_sync(bfqq)) ++ max_dispatch = bfqd->bfq_max_budget_async_rq; ++ ++ if (bfqq->dispatched >= max_dispatch) { ++ if (bfqd->busy_queues > 1) ++ return 0; ++ if (bfqq->dispatched >= 4 * max_dispatch) ++ return 0; ++ } ++ ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) ++ return 0; ++ ++ bfq_clear_bfqq_wait_request(bfqq); ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); ++ ++ if (! bfq_dispatch_request(bfqd, bfqq)) ++ return 0; ++ ++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d" ++ "(max_disp %d)", bfqq->pid, max_dispatch); ++ ++ return 1; ++} ++ ++/* ++ * Task holds one reference to the queue, dropped when task exits. Each rq ++ * in-flight on this queue also holds a reference, dropped when rq is freed. ++ * ++ * Queue lock must be held here. 
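++ * Dropping the last reference removes the queue and returns it to the ++ * bfq_pool slab cache.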
++ */ ++static void bfq_put_queue(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ BUG_ON(atomic_read(&bfqq->ref) <= 0); ++ ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ if (!atomic_dec_and_test(&bfqq->ref)) ++ return; ++ ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL); ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); ++ BUG_ON(bfqq->entity.tree != NULL); ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ BUG_ON(bfqd->active_queue == bfqq); ++ ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); ++ ++ kmem_cache_free(bfq_pool, bfqq); ++} ++ ++static void bfq_put_cooperator(struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *__bfqq, *next; ++ ++ /* ++ * If this queue was scheduled to merge with another queue, be ++ * sure to drop the reference taken on that queue (and others in ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. ++ */ ++ __bfqq = bfqq->new_bfqq; ++ while (__bfqq) { ++ if (__bfqq == bfqq) { ++ WARN(1, "bfqq->new_bfqq loop detected.\n"); ++ break; ++ } ++ next = __bfqq->new_bfqq; ++ bfq_put_queue(__bfqq); ++ __bfqq = next; ++ } ++} ++ ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ if (bfqq == bfqd->active_queue) { ++ __bfq_bfqq_expire(bfqd, bfqq); ++ bfq_schedule_dispatch(bfqd); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++} ++ ++static void bfq_init_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ ++ bic->ttime.last_end_request = jiffies; ++} ++ ++static void bfq_exit_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ ++ if (bic->bfqq[BLK_RW_ASYNC]) { ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); ++ bic->bfqq[BLK_RW_ASYNC] = NULL; ++ } ++ ++ if (bic->bfqq[BLK_RW_SYNC]) { ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); ++ bic->bfqq[BLK_RW_SYNC] = NULL; ++ } ++} ++ ++/* ++ * Update the entity prio values; note that the new values will not ++ * be used until the next (re)activation. ++ */ ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct io_context *ioc) ++{ ++ struct task_struct *tsk = current; ++ int ioprio_class; ++ ++ if (!bfq_bfqq_prio_changed(bfqq)) ++ return; ++ ++ ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); ++ switch (ioprio_class) { ++ default: ++ printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class); ++ case IOPRIO_CLASS_NONE: ++ /* ++ * No prio set, inherit CPU scheduling settings. ++ */ ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk); ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); ++ break; ++ case IOPRIO_CLASS_RT: ++ bfqq->entity.new_ioprio = task_ioprio(ioc); ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; ++ break; ++ case IOPRIO_CLASS_BE: ++ bfqq->entity.new_ioprio = task_ioprio(ioc); ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; ++ break; ++ case IOPRIO_CLASS_IDLE: ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; ++ bfqq->entity.new_ioprio = 7; ++ bfq_clear_bfqq_idle_window(bfqq); ++ break; ++ } ++ ++ bfqq->entity.ioprio_changed = 1; ++ ++ /* ++ * Keep track of original prio settings in case we have to temporarily ++ * elevate the priority of this queue. 
++ */ ++ bfqq->org_ioprio = bfqq->entity.new_ioprio; ++ bfq_clear_bfqq_prio_changed(bfqq); ++} ++ ++static void bfq_changed_ioprio(struct io_context *ioc, ++ struct bfq_io_cq *bic) ++{ ++ struct bfq_data *bfqd; ++ struct bfq_queue *bfqq, *new_bfqq; ++ struct bfq_group *bfqg; ++ unsigned long uninitialized_var(flags); ++ ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags); ++ if (unlikely(bfqd == NULL)) ++ return; ++ ++ bfqq = bic->bfqq[BLK_RW_ASYNC]; ++ if (bfqq != NULL) { ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, ++ sched_data); ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic->icq.ioc, ++ GFP_ATOMIC); ++ if (new_bfqq != NULL) { ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, ++ "changed_ioprio: bfqq %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++ } ++ ++ bfqq = bic->bfqq[BLK_RW_SYNC]; ++ if (bfqq != NULL) ++ bfq_mark_bfqq_prio_changed(bfqq); ++ ++ bfq_put_bfqd_unlock(bfqd, &flags); ++} ++ ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ pid_t pid, int is_sync) ++{ ++ RB_CLEAR_NODE(&bfqq->entity.rb_node); ++ INIT_LIST_HEAD(&bfqq->fifo); ++ ++ atomic_set(&bfqq->ref, 0); ++ bfqq->bfqd = bfqd; ++ ++ bfq_mark_bfqq_prio_changed(bfqq); ++ ++ if (is_sync) { ++ if (!bfq_class_idle(bfqq)) ++ bfq_mark_bfqq_idle_window(bfqq); ++ bfq_mark_bfqq_sync(bfqq); ++ } ++ ++ /* Tentative initial value to trade off between thr and lat */ ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; ++ bfqq->pid = pid; ++ ++ bfqq->raising_coeff = 1; ++ bfqq->last_rais_start_finish = 0; ++ bfqq->soft_rt_next_start = -1; ++} ++ ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int is_sync, ++ struct io_context *ioc, ++ gfp_t gfp_mask) ++{ ++ struct bfq_queue *bfqq, *new_bfqq = NULL; ++ struct bfq_io_cq *bic; ++ ++retry: ++ bic = bfq_bic_lookup(bfqd, ioc); ++ /* bic always exists here */ ++ bfqq = bic_to_bfqq(bic, is_sync); ++ ++ /* ++ * Always try a new alloc if we fall back to the OOM bfqq ++ * originally, since it should just be a temporary situation. 
++ */ ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { ++ bfqq = NULL; ++ if (new_bfqq != NULL) { ++ bfqq = new_bfqq; ++ new_bfqq = NULL; ++ } else if (gfp_mask & __GFP_WAIT) { ++ spin_unlock_irq(bfqd->queue->queue_lock); ++ new_bfqq = kmem_cache_alloc_node(bfq_pool, ++ gfp_mask | __GFP_ZERO, ++ bfqd->queue->node); ++ spin_lock_irq(bfqd->queue->queue_lock); ++ if (new_bfqq != NULL) ++ goto retry; ++ } else { ++ bfqq = kmem_cache_alloc_node(bfq_pool, ++ gfp_mask | __GFP_ZERO, ++ bfqd->queue->node); ++ } ++ ++ if (bfqq != NULL) { ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync); ++ bfq_log_bfqq(bfqd, bfqq, "allocated"); ++ } else { ++ bfqq = &bfqd->oom_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); ++ } ++ ++ bfq_init_prio_data(bfqq, ioc); ++ bfq_init_entity(&bfqq->entity, bfqg); ++ } ++ ++ if (new_bfqq != NULL) ++ kmem_cache_free(bfq_pool, new_bfqq); ++ ++ return bfqq; ++} ++ ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int ioprio_class, int ioprio) ++{ ++ switch (ioprio_class) { ++ case IOPRIO_CLASS_RT: ++ return &bfqg->async_bfqq[0][ioprio]; ++ case IOPRIO_CLASS_BE: ++ return &bfqg->async_bfqq[1][ioprio]; ++ case IOPRIO_CLASS_IDLE: ++ return &bfqg->async_idle_bfqq; ++ default: ++ BUG(); ++ } ++} ++ ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, int is_sync, ++ struct io_context *ioc, gfp_t gfp_mask) ++{ ++ const int ioprio = task_ioprio(ioc); ++ const int ioprio_class = task_ioprio_class(ioc); ++ struct bfq_queue **async_bfqq = NULL; ++ struct bfq_queue *bfqq = NULL; ++ ++ if (!is_sync) { ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ++ ioprio); ++ bfqq = *async_bfqq; ++ } ++ ++ if (bfqq == NULL) ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask); ++ ++ /* ++ * Pin the queue now that it's allocated, scheduler exit will prune it. ++ */ ++ if (!is_sync && *async_bfqq == NULL) { ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ *async_bfqq = bfqq; ++ } ++ ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ return bfqq; ++} ++ ++static void bfq_update_io_thinktime(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic) ++{ ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request; ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); ++ ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples; ++} ++ ++static void bfq_update_io_seektime(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ sector_t sdist; ++ u64 total; ++ ++ if (bfqq->last_request_pos < blk_rq_pos(rq)) ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos; ++ else ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq); ++ ++ /* ++ * Don't allow the seek distance to get too large from the ++ * odd fragment, pagein, etc. 
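++ * The (7/8, 1/8) exponential averages computed below then make ++ * seek_mean track the recent seek pattern while further damping ++ * isolated outliers.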
++ */ ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */ ++ sdist = 0; ++ else if (bfqq->seek_samples <= 60) /* second & third seek */ ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); ++ else ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); ++ ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; ++ total = bfqq->seek_total + (bfqq->seek_samples/2); ++ do_div(total, bfqq->seek_samples); ++ if (bfq_bfqq_coop(bfqq)) { ++ /* ++ * If the mean seektime increases for a (non-seeky) shared ++ * queue, some cooperator is likely to be idling too much. ++ * On the contrary, if it decreases, some cooperator has ++ * probably woken up. ++ * ++ */ ++ if ((sector_t)total < bfqq->seek_mean) ++ bfq_mark_bfqq_some_coop_idle(bfqq); ++ else if ((sector_t)total > bfqq->seek_mean) ++ bfq_clear_bfqq_some_coop_idle(bfqq); ++ } ++ bfqq->seek_mean = (sector_t)total; ++ ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, ++ (u64)bfqq->seek_mean); ++} ++ ++/* ++ * Disable idle window if the process thinks too long or seeks so much that ++ * it doesn't matter. ++ */ ++static void bfq_update_idle_window(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ int enable_idle; ++ ++ /* Don't idle for async or idle io prio class. */ ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) ++ return; ++ ++ enable_idle = bfq_bfqq_idle_window(bfqq); ++ ++ if (atomic_read(&bic->icq.ioc->nr_tasks) == 0 || ++ bfqd->bfq_slice_idle == 0 || ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && ++ bfqq->raising_coeff == 1)) ++ enable_idle = 0; ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && ++ bfqq->raising_coeff == 1) ++ enable_idle = 0; ++ else ++ enable_idle = 1; ++ } ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", ++ enable_idle); ++ ++ if (enable_idle) ++ bfq_mark_bfqq_idle_window(bfqq); ++ else ++ bfq_clear_bfqq_idle_window(bfqq); ++} ++ ++/* ++ * Called when a new fs request (rq) is added to bfqq. Check if there's ++ * something we should do about it. ++ */ ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ struct bfq_io_cq *bic = RQ_BIC(rq); ++ ++ if (rq->cmd_flags & REQ_META) ++ bfqq->meta_pending++; ++ ++ bfq_update_io_thinktime(bfqd, bic); ++ bfq_update_io_seektime(bfqd, bfqq, rq); ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || ++ !BFQQ_SEEKY(bfqq)) ++ bfq_update_idle_window(bfqd, bfqq, bic); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), ++ (long long unsigned)bfqq->seek_mean); ++ ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ ++ if (bfqq == bfqd->active_queue) { ++ /* ++ * If there is just this request queued and the request ++ * is small, just exit. ++ * In this way, if the disk is being idled to wait for a new ++ * request from the active queue, we avoid unplugging the ++ * device now. ++ * ++ * By doing so, we avoid committing the disk ++ * to serving just a small request. On the contrary, we wait for ++ * the block layer to decide when to unplug the device: ++ * hopefully, new requests will be merged to this ++ * one quickly, then the device will be unplugged ++ * and larger requests will be dispatched. 
++ */ ++ if (bfqq->queued[rq_is_sync(rq)] == 1 && ++ blk_rq_sectors(rq) < 32) { ++ return; ++ } ++ if (bfq_bfqq_wait_request(bfqq)) { ++ /* ++ * If we are waiting for a request for this queue, let ++ * it rip immediately and flag that we must not expire ++ * this queue just now. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ del_timer(&bfqd->idle_slice_timer); ++ /* ++ * Here we can safely expire the queue, in ++ * case of budget timeout, without wasting ++ * guarantees ++ */ ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, 0, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ __blk_run_queue(bfqd->queue); ++ } ++ } ++} ++ ++static void bfq_insert_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ bfq_init_prio_data(bfqq, RQ_BIC(rq)->icq.ioc); ++ ++ bfq_add_rq_rb(rq); ++ ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); ++ list_add_tail(&rq->queuelist, &bfqq->fifo); ++ ++ bfq_rq_enqueued(bfqd, bfqq, rq); ++} ++ ++static void bfq_update_hw_tag(struct bfq_data *bfqd) ++{ ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, ++ bfqd->rq_in_driver); ++ ++ if (bfqd->hw_tag == 1) ++ return; ++ ++ /* ++ * This sample is valid if the number of outstanding requests ++ * is large enough to allow a queueing behavior. Note that the ++ * sum is not exact, as it's not taking into account deactivated ++ * requests. ++ */ ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) ++ return; ++ ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) ++ return; ++ ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; ++ bfqd->max_rq_in_driver = 0; ++ bfqd->hw_tag_samples = 0; ++} ++ ++static void bfq_completed_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)", ++ blk_rq_sectors(rq), sync); ++ ++ bfq_update_hw_tag(bfqd); ++ ++ WARN_ON(!bfqd->rq_in_driver); ++ WARN_ON(!bfqq->dispatched); ++ bfqd->rq_in_driver--; ++ bfqq->dispatched--; ++ ++ if (bfq_bfqq_sync(bfqq)) ++ bfqd->sync_flight--; ++ ++ if (sync) ++ RQ_BIC(rq)->ttime.last_end_request = jiffies; ++ ++ /* ++ * If this is the active queue, check if it needs to be expired, ++ * or if we want to idle in case it has no pending requests. 
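++ * (In the code below, idling is armed only when the completed request ++ * was sync, the queue's sort_list is empty, there is no close ++ * cooperator, and either no request is left in the driver or the queue ++ * is being weight-raised.)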
++ */ ++ if (bfqd->active_queue == bfqq) { ++ if (bfq_bfqq_budget_new(bfqq)) ++ bfq_set_budget_timeout(bfqd); ++ ++ /* Idling is disabled also for cooperation issues: ++ * 1) there is a close cooperator for the queue, or ++ * 2) the queue is shared and some cooperator is likely ++ * to be idle (in this case, by not arming the idle timer, ++ * we try to slow down the queue, to prevent the zones ++ * of the disk accessed by the active cooperators to become ++ * too distant from the zone that will be accessed by the ++ * currently idle cooperators) ++ */ ++ if (bfq_may_expire_for_budg_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); ++ else if (sync && ++ (bfqd->rq_in_driver == 0 || ++ bfqq->raising_coeff > 1) ++ && RB_EMPTY_ROOT(&bfqq->sort_list) ++ && !bfq_close_cooperator(bfqd, bfqq) ++ && (!bfq_bfqq_coop(bfqq) || ++ !bfq_bfqq_some_coop_idle(bfqq))) ++ bfq_arm_slice_timer(bfqd); ++ } ++ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++} ++ ++static inline int __bfq_may_queue(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { ++ bfq_clear_bfqq_must_alloc(bfqq); ++ return ELV_MQUEUE_MUST; ++ } ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++static int bfq_may_queue(struct request_queue *q, int rw) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ /* ++ * Don't force setup of a queue from here, as a call to may_queue ++ * does not necessarily imply that a request actually will be queued. ++ * So just lookup a possibly existing queue, or return 'may queue' ++ * if that fails. ++ */ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (bic == NULL) ++ return ELV_MQUEUE_MAY; ++ ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); ++ if (bfqq != NULL) { ++ bfq_init_prio_data(bfqq, bic->icq.ioc); ++ ++ return __bfq_may_queue(bfqq); ++ } ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++/* ++ * Queue lock held here. ++ */ ++static void bfq_put_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ if (bfqq != NULL) { ++ const int rw = rq_data_dir(rq); ++ ++ BUG_ON(!bfqq->allocated[rw]); ++ bfqq->allocated[rw]--; ++ ++ rq->elv.priv[0] = NULL; ++ rq->elv.priv[1] = NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++static struct bfq_queue * ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", ++ (long unsigned)bfqq->new_bfqq->pid); ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1); ++ bfq_mark_bfqq_coop(bfqq->new_bfqq); ++ bfq_put_queue(bfqq); ++ return bic_to_bfqq(bic, 1); ++} ++ ++/* ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this ++ * was the last process referring to said bfqq. ++ */ ++static struct bfq_queue * ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); ++ if (bfqq_process_refs(bfqq) == 1) { ++ bfqq->pid = current->pid; ++ bfq_clear_bfqq_some_coop_idle(bfqq); ++ bfq_clear_bfqq_coop(bfqq); ++ bfq_clear_bfqq_split_coop(bfqq); ++ return bfqq; ++ } ++ ++ bic_set_bfqq(bic, NULL, 1); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++ return NULL; ++} ++ ++/* ++ * Allocate bfq data structures associated with this request. 
++ */ ++static int bfq_set_request(struct request_queue *q, struct request *rq, ++ gfp_t gfp_mask) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); ++ const int rw = rq_data_dir(rq); ++ const int is_sync = rq_is_sync(rq); ++ struct bfq_queue *bfqq; ++ struct bfq_group *bfqg; ++ unsigned long flags; ++ ++ /* handle changed prio notifications; cgroup change is handled separately */ ++ if (unlikely(icq_get_changed(&bic->icq) & ICQ_IOPRIO_CHANGED)) ++ bfq_changed_ioprio(bic->icq.ioc, bic); ++ ++ might_sleep_if(gfp_mask & __GFP_WAIT); ++ ++ spin_lock_irqsave(q->queue_lock, flags); ++ ++ if (bic == NULL) ++ goto queue_fail; ++ ++ bfqg = bfq_bic_update_cgroup(bic); ++ ++new_queue: ++ bfqq = bic_to_bfqq(bic, is_sync); ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic->icq.ioc, gfp_mask); ++ bic_set_bfqq(bic, bfqq, is_sync); ++ } else { ++ /* ++ * If the queue was seeky for too long, break it apart. ++ */ ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); ++ bfqq = bfq_split_bfqq(bic, bfqq); ++ if (!bfqq) ++ goto new_queue; ++ } ++ ++ /* ++ * Check to see if this queue is scheduled to merge with ++ * another closely cooperating queue. The merging of queues ++ * happens here as it must be done in process context. ++ * The reference on new_bfqq was taken in merge_bfqqs. ++ */ ++ if (bfqq->new_bfqq != NULL) ++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq); ++ } ++ ++ bfqq->allocated[rw]++; ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ ++ rq->elv.priv[0] = bic; ++ rq->elv.priv[1] = bfqq; ++ ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 0; ++ ++queue_fail: ++ bfq_schedule_dispatch(bfqd); ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 1; ++} ++ ++static void bfq_kick_queue(struct work_struct *work) ++{ ++ struct bfq_data *bfqd = ++ container_of(work, struct bfq_data, unplug_work); ++ struct request_queue *q = bfqd->queue; ++ ++ spin_lock_irq(q->queue_lock); ++ __blk_run_queue(q); ++ spin_unlock_irq(q->queue_lock); ++} ++ ++/* ++ * Handler of the expiration of the timer running if the active_queue ++ * is idling inside its time slice. ++ */ ++static void bfq_idle_slice_timer(unsigned long data) ++{ ++ struct bfq_data *bfqd = (struct bfq_data *)data; ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ enum bfqq_expiration reason; ++ ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags); ++ ++ bfqq = bfqd->active_queue; ++ /* ++ * Theoretical race here: active_queue can be NULL or different ++ * from the queue that was idling if the timer handler spins on ++ * the queue_lock and a new request arrives for the current ++ * queue and there is a full dispatch cycle that changes the ++ * active_queue. This can hardly happen, but in the worst case ++ * we just expire a queue too early. 
++ */ ++ if (bfqq != NULL) { ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ /* ++ * Also here the queue can be safely expired ++ * for budget timeout without wasting ++ * guarantees ++ */ ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) ++ /* ++ * The queue may not be empty upon timer expiration, ++ * because we may not disable the timer when the first ++ * request of the active queue arrives during ++ * disk idling ++ */ ++ reason = BFQ_BFQQ_TOO_IDLE; ++ else ++ goto schedule_dispatch; ++ ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason); ++ } ++ ++schedule_dispatch: ++ bfq_schedule_dispatch(bfqd); ++ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); ++} ++ ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) ++{ ++ del_timer_sync(&bfqd->idle_slice_timer); ++ cancel_work_sync(&bfqd->unplug_work); ++} ++ ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, ++ struct bfq_queue **bfqq_ptr) ++{ ++ struct bfq_group *root_group = bfqd->root_group; ++ struct bfq_queue *bfqq = *bfqq_ptr; ++ ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); ++ if (bfqq != NULL) { ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ *bfqq_ptr = NULL; ++ } ++} ++ ++/* ++ * Release all the bfqg references to its async queues. If we are ++ * deallocating the group these queues may still contain requests, so ++ * we reparent them to the root cgroup (i.e., the only one that will ++ * exist for sure untill all the requests on a device are gone). ++ */ ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); ++ ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); ++} ++ ++static void bfq_exit_queue(struct elevator_queue *e) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ struct request_queue *q = bfqd->queue; ++ struct bfq_queue *bfqq, *n; ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ spin_lock_irq(q->queue_lock); ++ ++ BUG_ON(bfqd->active_queue != NULL); ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) ++ bfq_deactivate_bfqq(bfqd, bfqq, 0); ++ ++ bfq_disconnect_groups(bfqd); ++ spin_unlock_irq(q->queue_lock); ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ synchronize_rcu(); ++ ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); ++ ++ bfq_free_root_group(bfqd); ++ kfree(bfqd); ++} ++ ++static void *bfq_init_queue(struct request_queue *q) ++{ ++ struct bfq_group *bfqg; ++ struct bfq_data *bfqd; ++ ++ bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node); ++ if (bfqd == NULL) ++ return NULL; ++ ++ /* ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. ++ * Grab a permanent reference to it, so that the normal code flow ++ * will not attempt to free it. 
++ */ ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0); ++ atomic_inc(&bfqd->oom_bfqq.ref); ++ ++ bfqd->queue = q; ++ ++ bfqg = bfq_alloc_root_group(bfqd, q->node); ++ if (bfqg == NULL) { ++ kfree(bfqd); ++ return NULL; ++ } ++ ++ bfqd->root_group = bfqg; ++ ++ init_timer(&bfqd->idle_slice_timer); ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd; ++ ++ bfqd->rq_pos_tree = RB_ROOT; ++ ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); ++ ++ INIT_LIST_HEAD(&bfqd->active_list); ++ INIT_LIST_HEAD(&bfqd->idle_list); ++ ++ bfqd->hw_tag = -1; ++ ++ bfqd->bfq_max_budget = bfq_default_max_budget; ++ ++ bfqd->bfq_quantum = bfq_quantum; ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; ++ bfqd->bfq_back_max = bfq_back_max; ++ bfqd->bfq_back_penalty = bfq_back_penalty; ++ bfqd->bfq_slice_idle = bfq_slice_idle; ++ bfqd->bfq_class_idle_last_service = 0; ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; ++ ++ bfqd->low_latency = true; ++ ++ bfqd->bfq_raising_coeff = 20; ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300); ++ bfqd->bfq_raising_max_time = 0; ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000); ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500); ++ bfqd->bfq_raising_max_softrt_rate = 7000; ++ ++ /* Initially estimate the device's peak rate as the reference rate */ ++ if (blk_queue_nonrot(bfqd->queue)) { ++ bfqd->RT_prod = R_nonrot * T_nonrot; ++ bfqd->peak_rate = R_nonrot; ++ } else { ++ bfqd->RT_prod = R_rot * T_rot; ++ bfqd->peak_rate = R_rot; ++ } ++ ++ return bfqd; ++} ++ ++static void bfq_slab_kill(void) ++{ ++ if (bfq_pool != NULL) ++ kmem_cache_destroy(bfq_pool); ++} ++ ++static int __init bfq_slab_setup(void) ++{ ++ bfq_pool = KMEM_CACHE(bfq_queue, 0); ++ if (bfq_pool == NULL) ++ return -ENOMEM; ++ return 0; ++} ++ ++static ssize_t bfq_var_show(unsigned int var, char *page) ++{ ++ return sprintf(page, "%d\n", var); ++} ++ ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count) ++{ ++ unsigned long new_val; ++ int ret = strict_strtoul(page, 10, &new_val); ++ ++ if (ret == 0) ++ *var = new_val; ++ ++ return count; ++} ++ ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ? 
++ bfqd->bfq_raising_max_time : ++ bfq_wrais_duration(bfqd)); ++} ++ ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_queue *bfqq; ++ struct bfq_data *bfqd = e->elevator_data; ++ ssize_t num_char = 0; ++ ++ num_char += sprintf(page + num_char, "Active:\n"); ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ jiffies_to_msecs(jiffies - ++ bfqq->last_rais_start_finish), ++ jiffies_to_msecs(bfqq->raising_cur_max_time)); ++ } ++ num_char += sprintf(page + num_char, "Idle:\n"); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ jiffies_to_msecs(jiffies - ++ bfqq->last_rais_start_finish), ++ jiffies_to_msecs(bfqq->raising_cur_max_time)); ++ } ++ return num_char; ++} ++ ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned int __data = __VAR; \ ++ if (__CONV) \ ++ __data = jiffies_to_msecs(__data); \ ++ return bfq_var_show(__data, (page)); \ ++} ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0); ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0); ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0); ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1); ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time, ++ 1); ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show, ++ bfqd->bfq_raising_min_inter_arr_async, ++ 1); ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show, ++ bfqd->bfq_raising_max_softrt_rate, 0); ++#undef SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ ++static ssize_t \ ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long __data; \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ if (__CONV) \ ++ *(__PTR) = msecs_to_jiffies(__data); \ ++ else \ ++ *(__PTR) = __data; \ ++ return ret; \ ++} ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_max_budget_async_rq_store, 
&bfqd->bfq_max_budget_async_rq, ++ 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_min_idle_time_store, ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store, ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store, ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0); ++#undef STORE_FUNCTION ++ ++/* do nothing for the moment */ ++static ssize_t bfq_weights_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ return count; ++} ++ ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) ++{ ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); ++ ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout); ++ else ++ return bfq_default_max_budget; ++} ++ ++static ssize_t bfq_max_budget_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long __data; ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data == 0) ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); ++ else { ++ if (__data > INT_MAX) ++ __data = INT_MAX; ++ bfqd->bfq_max_budget = __data; ++ } ++ ++ bfqd->bfq_user_max_budget = __data; ++ ++ return ret; ++} ++ ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long __data; ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data < 1) ++ __data = 1; ++ else if (__data > INT_MAX) ++ __data = INT_MAX; ++ ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); ++ if (bfqd->bfq_user_max_budget == 0) ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); ++ ++ return ret; ++} ++ ++static ssize_t bfq_low_latency_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long __data; ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ bfqd->low_latency = __data; ++ ++ return ret; ++} ++ ++#define BFQ_ATTR(name) \ ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) ++ ++static struct elv_fs_entry bfq_attrs[] = { ++ BFQ_ATTR(quantum), ++ BFQ_ATTR(fifo_expire_sync), ++ BFQ_ATTR(fifo_expire_async), ++ BFQ_ATTR(back_seek_max), ++ BFQ_ATTR(back_seek_penalty), ++ BFQ_ATTR(slice_idle), ++ BFQ_ATTR(max_budget), ++ BFQ_ATTR(max_budget_async_rq), ++ BFQ_ATTR(timeout_sync), ++ BFQ_ATTR(timeout_async), ++ BFQ_ATTR(low_latency), ++ BFQ_ATTR(raising_coeff), ++ BFQ_ATTR(raising_max_time), ++ BFQ_ATTR(raising_rt_max_time), ++ BFQ_ATTR(raising_min_idle_time), ++ BFQ_ATTR(raising_min_inter_arr_async), ++ BFQ_ATTR(raising_max_softrt_rate), ++ BFQ_ATTR(weights), ++ __ATTR_NULL ++}; ++ ++static struct elevator_type iosched_bfq = { ++ .ops = { ++ .elevator_merge_fn = bfq_merge, ++ .elevator_merged_fn = bfq_merged_request, ++ .elevator_merge_req_fn = bfq_merged_requests, ++ .elevator_allow_merge_fn = bfq_allow_merge, ++ .elevator_dispatch_fn = bfq_dispatch_requests, ++ 
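/*
 * Editor's aside (not part of the patch): a user-space sketch of the policy
 * implemented by bfq_max_budget_store() and bfq_estimated_max_budget() above.
 * Writing 0 re-enables auto-tuning, where the budget is derived from the
 * observed peak rate and the sync timeout (approximated here as rate * timeout;
 * the in-kernel bfq_calc_max_budget() applies its own scaling); any other value
 * is clamped to INT_MAX. All names and constants below are hypothetical.
 */
#include <stdio.h>
#include <limits.h>

#define PEAK_RATE_SAMPLES_NEEDED 32
#define DEFAULT_MAX_BUDGET 16384UL

struct sched_tunables {
	unsigned long peak_rate;	/* estimated peak transfer rate */
	int peak_rate_samples;		/* samples behind the estimate */
	unsigned long timeout_sync;	/* budget timeout for sync queues */
	unsigned long max_budget;	/* effective value used by the scheduler */
	unsigned long user_max_budget;	/* raw value written by the user, 0 = auto */
};

static unsigned long estimated_max_budget(const struct sched_tunables *t)
{
	/* Simplified stand-in for bfq_calc_max_budget(): rate * timeout. */
	if (t->peak_rate_samples >= PEAK_RATE_SAMPLES_NEEDED)
		return t->peak_rate * t->timeout_sync;
	return DEFAULT_MAX_BUDGET;
}

static void store_max_budget(struct sched_tunables *t, unsigned long val)
{
	if (val == 0)
		t->max_budget = estimated_max_budget(t);		/* auto-tune */
	else
		t->max_budget = val > INT_MAX ? INT_MAX : val;		/* clamp */
	t->user_max_budget = val;
}

int main(void)
{
	struct sched_tunables t = { .peak_rate = 200, .peak_rate_samples = 40,
				    .timeout_sync = 125 };

	store_max_budget(&t, 0);
	printf("auto:  %lu\n", t.max_budget);	/* 200 * 125 = 25000 */
	store_max_budget(&t, 5000);
	printf("fixed: %lu\n", t.max_budget);	/* 5000 */
	return 0;
}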
.elevator_add_req_fn = bfq_insert_request, ++ .elevator_activate_req_fn = bfq_activate_request, ++ .elevator_deactivate_req_fn = bfq_deactivate_request, ++ .elevator_completed_req_fn = bfq_completed_request, ++ .elevator_former_req_fn = elv_rb_former_request, ++ .elevator_latter_req_fn = elv_rb_latter_request, ++ .elevator_init_icq_fn = bfq_init_icq, ++ .elevator_exit_icq_fn = bfq_exit_icq, ++ .elevator_set_req_fn = bfq_set_request, ++ .elevator_put_req_fn = bfq_put_request, ++ .elevator_may_queue_fn = bfq_may_queue, ++ .elevator_init_fn = bfq_init_queue, ++ .elevator_exit_fn = bfq_exit_queue, ++ }, ++ .icq_size = sizeof(struct bfq_io_cq), ++ .icq_align = __alignof__(struct bfq_io_cq), ++ .elevator_attrs = bfq_attrs, ++ .elevator_name = "bfq", ++ .elevator_owner = THIS_MODULE, ++}; ++ ++static int __init bfq_init(void) ++{ ++ /* ++ * Can be 0 on HZ < 1000 setups. ++ */ ++ if (bfq_slice_idle == 0) ++ bfq_slice_idle = 1; ++ ++ if (bfq_timeout_async == 0) ++ bfq_timeout_async = 1; ++ ++ if (bfq_slab_setup()) ++ return -ENOMEM; ++ ++ elv_register(&iosched_bfq); ++ ++ return 0; ++} ++ ++static void __exit bfq_exit(void) ++{ ++ elv_unregister(&iosched_bfq); ++ bfq_slab_kill(); ++} ++ ++module_init(bfq_init); ++module_exit(bfq_exit); ++ ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler"); +diff --git a/block/bfq-sched.c block/bfq-sched.c +new file mode 100644 +index 0000000..87bea97 +--- /dev/null ++++ block/bfq-sched.c +@@ -0,0 +1,1070 @@ ++/* ++ * BFQ: Hierarchical B-WF2Q+ scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ */ ++ ++#ifdef CONFIG_CGROUP_BFQIO ++#define for_each_entity(entity) \ ++ for (; entity != NULL; entity = entity->parent) ++ ++#define for_each_entity_safe(entity, parent) \ ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) ++ ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, ++ int extract, ++ struct bfq_data *bfqd); ++ ++static inline void bfq_update_budget(struct bfq_entity *next_active) ++{ ++ struct bfq_entity *bfqg_entity; ++ struct bfq_group *bfqg; ++ struct bfq_sched_data *group_sd; ++ ++ BUG_ON(next_active == NULL); ++ ++ group_sd = next_active->sched_data; ++ ++ bfqg = container_of(group_sd, struct bfq_group, sched_data); ++ /* ++ * bfq_group's my_entity field is not NULL only if the group ++ * is not the root group. We must not touch the root entity ++ * as it must never become an active entity. ++ */ ++ bfqg_entity = bfqg->my_entity; ++ if (bfqg_entity != NULL) ++ bfqg_entity->budget = next_active->budget; ++} ++ ++static int bfq_update_next_active(struct bfq_sched_data *sd) ++{ ++ struct bfq_entity *next_active; ++ ++ if (sd->active_entity != NULL) ++ /* will update/requeue at the end of service */ ++ return 0; ++ ++ /* ++ * NOTE: this can be improved in many ways, such as returning ++ * 1 (and thus propagating upwards the update) only when the ++ * budget changes, or caching the bfqq that will be scheduled ++ * next from this subtree. By now we worry more about ++ * correctness than about performance... 
++ */ ++ next_active = bfq_lookup_next_entity(sd, 0, NULL); ++ sd->next_active = next_active; ++ ++ if (next_active != NULL) ++ bfq_update_budget(next_active); ++ ++ return 1; ++} ++ ++static inline void bfq_check_next_active(struct bfq_sched_data *sd, ++ struct bfq_entity *entity) ++{ ++ BUG_ON(sd->next_active != entity); ++} ++#else ++#define for_each_entity(entity) \ ++ for (; entity != NULL; entity = NULL) ++ ++#define for_each_entity_safe(entity, parent) \ ++ for (parent = NULL; entity != NULL; entity = parent) ++ ++static inline int bfq_update_next_active(struct bfq_sched_data *sd) ++{ ++ return 0; ++} ++ ++static inline void bfq_check_next_active(struct bfq_sched_data *sd, ++ struct bfq_entity *entity) ++{ ++} ++ ++static inline void bfq_update_budget(struct bfq_entity *next_active) ++{ ++} ++#endif ++ ++/* ++ * Shift for timestamp calculations. This actually limits the maximum ++ * service allowed in one timestamp delta (small shift values increase it), ++ * the maximum total weight that can be used for the queues in the system ++ * (big shift values increase it), and the period of virtual time wraparounds. ++ */ ++#define WFQ_SERVICE_SHIFT 22 ++ ++/** ++ * bfq_gt - compare two timestamps. ++ * @a: first ts. ++ * @b: second ts. ++ * ++ * Return @a > @b, dealing with wrapping correctly. ++ */ ++static inline int bfq_gt(u64 a, u64 b) ++{ ++ return (s64)(a - b) > 0; ++} ++ ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = NULL; ++ ++ BUG_ON(entity == NULL); ++ ++ if (entity->my_sched_data == NULL) ++ bfqq = container_of(entity, struct bfq_queue, entity); ++ ++ return bfqq; ++} ++ ++ ++/** ++ * bfq_delta - map service into the virtual time domain. ++ * @service: amount of service. ++ * @weight: scale factor (weight of an entity or weight sum). ++ */ ++static inline u64 bfq_delta(unsigned long service, ++ unsigned long weight) ++{ ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT; ++ ++ do_div(d, weight); ++ return d; ++} ++ ++/** ++ * bfq_calc_finish - assign the finish time to an entity. ++ * @entity: the entity to act upon. ++ * @service: the service to be charged to the entity. ++ */ ++static inline void bfq_calc_finish(struct bfq_entity *entity, ++ unsigned long service) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(entity->weight == 0); ++ ++ entity->finish = entity->start + ++ bfq_delta(service, entity->weight); ++ ++ if (bfqq != NULL) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "calc_finish: serv %lu, w %d", ++ service, entity->weight); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "calc_finish: start %llu, finish %llu, delta %llu", ++ entity->start, entity->finish, ++ bfq_delta(service, entity->weight)); ++ } ++} ++ ++/** ++ * bfq_entity_of - get an entity from a node. ++ * @node: the node field of the entity. ++ * ++ * Convert a node pointer to the relative entity. This is used only ++ * to simplify the logic of some functions and not as the generic ++ * conversion mechanism because, e.g., in the tree walking functions, ++ * the check for a %NULL value would be redundant. ++ */ ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) ++{ ++ struct bfq_entity *entity = NULL; ++ ++ if (node != NULL) ++ entity = rb_entry(node, struct bfq_entity, rb_node); ++ ++ return entity; ++} ++ ++/** ++ * bfq_extract - remove an entity from a tree. ++ * @root: the tree root. ++ * @entity: the entity to remove. 
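/*
 * Editor's aside (not part of the patch): bfq_gt() and bfq_delta() above rely
 * on two small tricks that a standalone program can demonstrate. Timestamps
 * are compared through a signed difference, so the ordering survives 64-bit
 * wraparound, and service is mapped into the virtual time domain by a
 * fixed-point division by the weight, delta = (service << WFQ_SERVICE_SHIFT)
 * / weight, which is also how bfq_calc_finish() obtains F_i = S_i +
 * budget/weight.
 */
#include <stdio.h>
#include <stdint.h>

#define WFQ_SERVICE_SHIFT 22

static int ts_gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;	/* true iff a is "after" b, modulo 2^64 */
}

static uint64_t vt_delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	uint64_t just_before_wrap = UINT64_MAX - 5, just_after_wrap = 10;
	uint64_t start = 1000;

	/* 10 is "later" than UINT64_MAX - 5 once wraparound is accounted for. */
	printf("gt=%d\n", ts_gt(just_after_wrap, just_before_wrap));

	/* The same amount of service costs less virtual time to a heavier entity. */
	printf("finish(w=1)=%llu\n",
	       (unsigned long long)(start + vt_delta(4096, 1)));
	printf("finish(w=8)=%llu\n",
	       (unsigned long long)(start + vt_delta(4096, 8)));
	return 0;
}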
++ */ ++static inline void bfq_extract(struct rb_root *root, ++ struct bfq_entity *entity) ++{ ++ BUG_ON(entity->tree != root); ++ ++ entity->tree = NULL; ++ rb_erase(&entity->rb_node, root); ++} ++ ++/** ++ * bfq_idle_extract - extract an entity from the idle tree. ++ * @st: the service tree of the owning @entity. ++ * @entity: the entity being removed. ++ */ ++static void bfq_idle_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *next; ++ ++ BUG_ON(entity->tree != &st->idle); ++ ++ if (entity == st->first_idle) { ++ next = rb_next(&entity->rb_node); ++ st->first_idle = bfq_entity_of(next); ++ } ++ ++ if (entity == st->last_idle) { ++ next = rb_prev(&entity->rb_node); ++ st->last_idle = bfq_entity_of(next); ++ } ++ ++ bfq_extract(&st->idle, entity); ++ ++ if (bfqq != NULL) ++ list_del(&bfqq->bfqq_list); ++} ++ ++/** ++ * bfq_insert - generic tree insertion. ++ * @root: tree root. ++ * @entity: entity to insert. ++ * ++ * This is used for the idle and the active tree, since they are both ++ * ordered by finish time. ++ */ ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) ++{ ++ struct bfq_entity *entry; ++ struct rb_node **node = &root->rb_node; ++ struct rb_node *parent = NULL; ++ ++ BUG_ON(entity->tree != NULL); ++ ++ while (*node != NULL) { ++ parent = *node; ++ entry = rb_entry(parent, struct bfq_entity, rb_node); ++ ++ if (bfq_gt(entry->finish, entity->finish)) ++ node = &parent->rb_left; ++ else ++ node = &parent->rb_right; ++ } ++ ++ rb_link_node(&entity->rb_node, parent, node); ++ rb_insert_color(&entity->rb_node, root); ++ ++ entity->tree = root; ++} ++ ++/** ++ * bfq_update_min - update the min_start field of a entity. ++ * @entity: the entity to update. ++ * @node: one of its children. ++ * ++ * This function is called when @entity may store an invalid value for ++ * min_start due to updates to the active tree. The function assumes ++ * that the subtree rooted at @node (which may be its left or its right ++ * child) has a valid min_start value. ++ */ ++static inline void bfq_update_min(struct bfq_entity *entity, ++ struct rb_node *node) ++{ ++ struct bfq_entity *child; ++ ++ if (node != NULL) { ++ child = rb_entry(node, struct bfq_entity, rb_node); ++ if (bfq_gt(entity->min_start, child->min_start)) ++ entity->min_start = child->min_start; ++ } ++} ++ ++/** ++ * bfq_update_active_node - recalculate min_start. ++ * @node: the node to update. ++ * ++ * @node may have changed position or one of its children may have moved, ++ * this function updates its min_start value. The left and right subtrees ++ * are assumed to hold a correct min_start value. ++ */ ++static inline void bfq_update_active_node(struct rb_node *node) ++{ ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); ++ ++ entity->min_start = entity->start; ++ bfq_update_min(entity, node->rb_right); ++ bfq_update_min(entity, node->rb_left); ++} ++ ++/** ++ * bfq_update_active_tree - update min_start for the whole active tree. ++ * @node: the starting node. ++ * ++ * @node must be the deepest modified node after an update. This function ++ * updates its min_start using the values held by its children, assuming ++ * that they did not change, and then updates all the nodes that may have ++ * changed in the path to the root. The only nodes that may have changed ++ * are the ones in the path or their siblings. 
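/*
 * Editor's aside (not part of the patch): the active tree is ordered by finish
 * time, and bfq_update_min()/bfq_update_active_node() above additionally cache
 * in each node the minimum start time of its subtree. That cached value is
 * what later lets the scheduler skip whole subtrees containing no eligible
 * entity (min_start > vtime). A toy, array-based binary tree is enough to show
 * the bottom-up propagation; everything below is illustrative.
 */
#include <stdio.h>

#define NODES 7

struct toy_node {
	unsigned long long start;	/* S_i of this entity */
	unsigned long long min_start;	/* min start over the subtree rooted here */
};

/* Implicit binary tree: the children of node i are 2*i+1 and 2*i+2. */
static void update_min_start(struct toy_node *n, int i)
{
	int l = 2 * i + 1, r = 2 * i + 2;

	n[i].min_start = n[i].start;
	if (l < NODES && n[l].min_start < n[i].min_start)
		n[i].min_start = n[l].min_start;
	if (r < NODES && n[r].min_start < n[i].min_start)
		n[i].min_start = n[r].min_start;
}

int main(void)
{
	struct toy_node n[NODES] = {
		{ 50 }, { 20 }, { 70 }, { 90 }, { 10 }, { 60 }, { 80 },
	};
	int i;

	/* Bottom-up pass, mirroring bfq_update_active_tree() walking to the root. */
	for (i = NODES - 1; i >= 0; i--)
		update_min_start(n, i);

	for (i = 0; i < NODES; i++)
		printf("node %d: start=%llu min_start=%llu\n",
		       i, n[i].start, n[i].min_start);
	return 0;
}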
++ */ ++static void bfq_update_active_tree(struct rb_node *node) ++{ ++ struct rb_node *parent; ++ ++up: ++ bfq_update_active_node(node); ++ ++ parent = rb_parent(node); ++ if (parent == NULL) ++ return; ++ ++ if (node == parent->rb_left && parent->rb_right != NULL) ++ bfq_update_active_node(parent->rb_right); ++ else if (parent->rb_left != NULL) ++ bfq_update_active_node(parent->rb_left); ++ ++ node = parent; ++ goto up; ++} ++ ++/** ++ * bfq_active_insert - insert an entity in the active tree of its group/device. ++ * @st: the service tree of the entity. ++ * @entity: the entity being inserted. ++ * ++ * The active tree is ordered by finish time, but an extra key is kept ++ * per each node, containing the minimum value for the start times of ++ * its children (and the node itself), so it's possible to search for ++ * the eligible node with the lowest finish time in logarithmic time. ++ */ ++static void bfq_active_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node = &entity->rb_node; ++ ++ bfq_insert(&st->active, entity); ++ ++ if (node->rb_left != NULL) ++ node = node->rb_left; ++ else if (node->rb_right != NULL) ++ node = node->rb_right; ++ ++ bfq_update_active_tree(node); ++ ++ if (bfqq != NULL) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); ++} ++ ++/** ++ * bfq_ioprio_to_weight - calc a weight from an ioprio. ++ * @ioprio: the ioprio value to convert. ++ */ ++static unsigned short bfq_ioprio_to_weight(int ioprio) ++{ ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); ++ return IOPRIO_BE_NR - ioprio; ++} ++ ++/** ++ * bfq_weight_to_ioprio - calc an ioprio from a weight. ++ * @weight: the weight value to convert. ++ * ++ * To preserve as mush as possible the old only-ioprio user interface, ++ * 0 is used as an escape ioprio value for weights (numerically) equal or ++ * larger than IOPRIO_BE_NR ++ */ ++static unsigned short bfq_weight_to_ioprio(int weight) ++{ ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; ++} ++ ++static inline void bfq_get_entity(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_sched_data *sd; ++ ++ if (bfqq != NULL) { ++ sd = entity->sched_data; ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ } ++} ++ ++/** ++ * bfq_find_deepest - find the deepest node that an extraction can modify. ++ * @node: the node being removed. ++ * ++ * Do the first step of an extraction in an rb tree, looking for the ++ * node that will replace @node, and returning the deepest node that ++ * the following modifications to the tree can touch. If @node is the ++ * last node in the tree return %NULL. ++ */ ++static struct rb_node *bfq_find_deepest(struct rb_node *node) ++{ ++ struct rb_node *deepest; ++ ++ if (node->rb_right == NULL && node->rb_left == NULL) ++ deepest = rb_parent(node); ++ else if (node->rb_right == NULL) ++ deepest = node->rb_left; ++ else if (node->rb_left == NULL) ++ deepest = node->rb_right; ++ else { ++ deepest = rb_next(node); ++ if (deepest->rb_right != NULL) ++ deepest = deepest->rb_right; ++ else if (rb_parent(deepest) != node) ++ deepest = rb_parent(deepest); ++ } ++ ++ return deepest; ++} ++ ++/** ++ * bfq_active_extract - remove an entity from the active tree. ++ * @st: the service_tree containing the tree. ++ * @entity: the entity being removed. 
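/*
 * Editor's aside (not part of the patch): the ioprio/weight conversion above is
 * a simple affine mapping around IOPRIO_BE_NR, with ioprio 0 acting as an
 * escape value for weights that have no ioprio equivalent;
 * __bfq_entity_update_weight_prio() later multiplies the resulting original
 * weight by raising_coeff while a queue is weight-raised. The snippet below
 * just replays that arithmetic in user space.
 */
#include <stdio.h>

#define IOPRIO_BE_NR 8

static unsigned short ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;		/* ioprio 0..7 -> weight 8..1 */
}

static unsigned short weight_to_ioprio(int weight)
{
	/* Weights >= IOPRIO_BE_NR cannot be represented: escape to ioprio 0. */
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	unsigned int raising_coeff = 20;	/* the patch's default boost factor */
	int ioprio;

	for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++) {
		unsigned int w = ioprio_to_weight(ioprio);

		printf("ioprio %d -> weight %u (raised: %u) -> ioprio %u\n",
		       ioprio, w, w * raising_coeff,
		       (unsigned int)weight_to_ioprio(w));
	}
	printf("weight 100 -> ioprio %u (escape value)\n",
	       (unsigned int)weight_to_ioprio(100));
	return 0;
}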
++ */ ++static void bfq_active_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node; ++ ++ node = bfq_find_deepest(&entity->rb_node); ++ bfq_extract(&st->active, entity); ++ ++ if (node != NULL) ++ bfq_update_active_tree(node); ++ ++ if (bfqq != NULL) ++ list_del(&bfqq->bfqq_list); ++} ++ ++/** ++ * bfq_idle_insert - insert an entity into the idle tree. ++ * @st: the service tree containing the tree. ++ * @entity: the entity to insert. ++ */ ++static void bfq_idle_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) ++ st->first_idle = entity; ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) ++ st->last_idle = entity; ++ ++ bfq_insert(&st->idle, entity); ++ ++ if (bfqq != NULL) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); ++} ++ ++/** ++ * bfq_forget_entity - remove an entity from the wfq trees. ++ * @st: the service tree. ++ * @entity: the entity being removed. ++ * ++ * Update the device status and forget everything about @entity, putting ++ * the device reference to it, if it is a queue. Entities belonging to ++ * groups are not refcounted. ++ */ ++static void bfq_forget_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_sched_data *sd; ++ ++ BUG_ON(!entity->on_st); ++ ++ entity->on_st = 0; ++ st->wsum -= entity->weight; ++ if (bfqq != NULL) { ++ sd = entity->sched_data; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/** ++ * bfq_put_idle_entity - release the idle tree ref of an entity. ++ * @st: service tree for the entity. ++ * @entity: the entity being released. ++ */ ++static void bfq_put_idle_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ bfq_idle_extract(st, entity); ++ bfq_forget_entity(st, entity); ++} ++ ++/** ++ * bfq_forget_idle - update the idle tree if necessary. ++ * @st: the service tree to act upon. ++ * ++ * To preserve the global O(log N) complexity we only remove one entry here; ++ * as the idle tree will not grow indefinitely this can be done safely. ++ */ ++static void bfq_forget_idle(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && ++ !bfq_gt(last_idle->finish, st->vtime)) { ++ /* ++ * Forget the whole idle tree, increasing the vtime past ++ * the last finish time of idle entities. 
++ */ ++ st->vtime = last_idle->finish; ++ } ++ ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) ++ bfq_put_idle_entity(st, first_idle); ++} ++ ++static struct bfq_service_tree * ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_service_tree *new_st = old_st; ++ ++ if (entity->ioprio_changed) { ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(old_st->wsum < entity->weight); ++ old_st->wsum -= entity->weight; ++ ++ if (entity->new_weight != entity->orig_weight) { ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = ++ bfq_weight_to_ioprio(entity->orig_weight); ++ } else if (entity->new_ioprio != entity->ioprio) { ++ entity->ioprio = entity->new_ioprio; ++ entity->orig_weight = ++ bfq_ioprio_to_weight(entity->ioprio); ++ } else ++ entity->new_weight = entity->orig_weight = ++ bfq_ioprio_to_weight(entity->ioprio); ++ ++ entity->ioprio_class = entity->new_ioprio_class; ++ entity->ioprio_changed = 0; ++ ++ /* ++ * NOTE: here we may be changing the weight too early, ++ * this will cause unfairness. The correct approach ++ * would have required additional complexity to defer ++ * weight changes to the proper time instants (i.e., ++ * when entity->finish <= old_st->vtime). ++ */ ++ new_st = bfq_entity_service_tree(entity); ++ entity->weight = entity->orig_weight * ++ (bfqq != NULL ? bfqq->raising_coeff : 1); ++ new_st->wsum += entity->weight; ++ ++ if (new_st != old_st) ++ entity->start = new_st->vtime; ++ } ++ ++ return new_st; ++} ++ ++/** ++ * bfq_bfqq_served - update the scheduler status after selection for service. ++ * @bfqq: the queue being served. ++ * @served: bytes to transfer. ++ * ++ * NOTE: this can be optimized, as the timestamps of upper level entities ++ * are synchronized every time a new bfqq is selected for service. By now, ++ * we keep it to better check consistency. ++ */ ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st; ++ ++ for_each_entity(entity) { ++ st = bfq_entity_service_tree(entity); ++ ++ entity->service += served; ++ BUG_ON(entity->service > entity->budget); ++ BUG_ON(st->wsum == 0); ++ ++ st->vtime += bfq_delta(served, st->wsum); ++ bfq_forget_idle(st); ++ } ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); ++} ++ ++/** ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget. ++ * @bfqq: the queue that needs a service update. ++ * ++ * When it's not possible to be fair in the service domain, because ++ * a queue is not consuming its budget fast enough (the meaning of ++ * fast depends on the timeout parameter), we charge it a full ++ * budget. In this way we should obtain a sort of time-domain ++ * fairness among all the seeky/slow queues. ++ */ ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); ++ ++ bfq_bfqq_served(bfqq, entity->budget - entity->service); ++} ++ ++/** ++ * __bfq_activate_entity - activate an entity. ++ * @entity: the entity being activated. ++ * ++ * Called whenever an entity is activated, i.e., it is not active and one ++ * of its children receives a new request, or has to be reactivated due to ++ * budget exhaustion. It uses the current budget of the entity (and the ++ * service received if @entity is active) of the queue to calculate its ++ * timestamps. 
++ */ ++static void __bfq_activate_entity(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ ++ if (entity == sd->active_entity) { ++ BUG_ON(entity->tree != NULL); ++ /* ++ * If we are requeueing the current entity we have ++ * to take care of not charging to it service it has ++ * not received. ++ */ ++ bfq_calc_finish(entity, entity->service); ++ entity->start = entity->finish; ++ sd->active_entity = NULL; ++ } else if (entity->tree == &st->active) { ++ /* ++ * Requeueing an entity due to a change of some ++ * next_active entity below it. We reuse the old ++ * start time. ++ */ ++ bfq_active_extract(st, entity); ++ } else if (entity->tree == &st->idle) { ++ /* ++ * Must be on the idle tree, bfq_idle_extract() will ++ * check for that. ++ */ ++ bfq_idle_extract(st, entity); ++ entity->start = bfq_gt(st->vtime, entity->finish) ? ++ st->vtime : entity->finish; ++ } else { ++ /* ++ * The finish time of the entity may be invalid, and ++ * it is in the past for sure, otherwise the queue ++ * would have been on the idle tree. ++ */ ++ entity->start = st->vtime; ++ st->wsum += entity->weight; ++ bfq_get_entity(entity); ++ ++ BUG_ON(entity->on_st); ++ entity->on_st = 1; ++ } ++ ++ st = __bfq_entity_update_weight_prio(st, entity); ++ bfq_calc_finish(entity, entity->budget); ++ bfq_active_insert(st, entity); ++} ++ ++/** ++ * bfq_activate_entity - activate an entity and its ancestors if necessary. ++ * @entity: the entity to activate. ++ * ++ * Activate @entity and all the entities on the path from it to the root. ++ */ ++static void bfq_activate_entity(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sd; ++ ++ for_each_entity(entity) { ++ __bfq_activate_entity(entity); ++ ++ sd = entity->sched_data; ++ if (!bfq_update_next_active(sd)) ++ /* ++ * No need to propagate the activation to the ++ * upper entities, as they will be updated when ++ * the active entity is rescheduled. ++ */ ++ break; ++ } ++} ++ ++/** ++ * __bfq_deactivate_entity - deactivate an entity from its service tree. ++ * @entity: the entity to deactivate. ++ * @requeue: if false, the entity will not be put into the idle tree. ++ * ++ * Deactivate an entity, independently from its previous state. If the ++ * entity was not on a service tree just return, otherwise if it is on ++ * any scheduler tree, extract it from that tree, and if necessary ++ * and if the caller did not specify @requeue, put it on the idle tree. ++ * ++ * Return %1 if the caller should update the entity hierarchy, i.e., ++ * if the entity was under service or if it was the next_active for ++ * its sched_data; return %0 otherwise. 
++ */ ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ int was_active = entity == sd->active_entity; ++ int ret = 0; ++ ++ if (!entity->on_st) ++ return 0; ++ ++ BUG_ON(was_active && entity->tree != NULL); ++ ++ if (was_active) { ++ bfq_calc_finish(entity, entity->service); ++ sd->active_entity = NULL; ++ } else if (entity->tree == &st->active) ++ bfq_active_extract(st, entity); ++ else if (entity->tree == &st->idle) ++ bfq_idle_extract(st, entity); ++ else if (entity->tree != NULL) ++ BUG(); ++ ++ if (was_active || sd->next_active == entity) ++ ret = bfq_update_next_active(sd); ++ ++ if (!requeue || !bfq_gt(entity->finish, st->vtime)) ++ bfq_forget_entity(st, entity); ++ else ++ bfq_idle_insert(st, entity); ++ ++ BUG_ON(sd->active_entity == entity); ++ BUG_ON(sd->next_active == entity); ++ ++ return ret; ++} ++ ++/** ++ * bfq_deactivate_entity - deactivate an entity. ++ * @entity: the entity to deactivate. ++ * @requeue: true if the entity can be put on the idle tree ++ */ ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) ++{ ++ struct bfq_sched_data *sd; ++ struct bfq_entity *parent; ++ ++ for_each_entity_safe(entity, parent) { ++ sd = entity->sched_data; ++ ++ if (!__bfq_deactivate_entity(entity, requeue)) ++ /* ++ * The parent entity is still backlogged, and ++ * we don't need to update it as it is still ++ * under service. ++ */ ++ break; ++ ++ if (sd->next_active != NULL) ++ /* ++ * The parent entity is still backlogged and ++ * the budgets on the path towards the root ++ * need to be updated. ++ */ ++ goto update; ++ ++ /* ++ * If we reach there the parent is no more backlogged and ++ * we want to propagate the dequeue upwards. ++ */ ++ requeue = 1; ++ } ++ ++ return; ++ ++update: ++ entity = parent; ++ for_each_entity(entity) { ++ __bfq_activate_entity(entity); ++ ++ sd = entity->sched_data; ++ if (!bfq_update_next_active(sd)) ++ break; ++ } ++} ++ ++/** ++ * bfq_update_vtime - update vtime if necessary. ++ * @st: the service tree to act upon. ++ * ++ * If necessary update the service tree vtime to have at least one ++ * eligible entity, skipping to its start time. Assumes that the ++ * active tree of the device is not empty. ++ * ++ * NOTE: this hierarchical implementation updates vtimes quite often, ++ * we may end up with reactivated tasks getting timestamps after a ++ * vtime skip done because we needed a ->first_active entity on some ++ * intermediate node. ++ */ ++static void bfq_update_vtime(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entry; ++ struct rb_node *node = st->active.rb_node; ++ ++ entry = rb_entry(node, struct bfq_entity, rb_node); ++ if (bfq_gt(entry->min_start, st->vtime)) { ++ st->vtime = entry->min_start; ++ bfq_forget_idle(st); ++ } ++} ++ ++/** ++ * bfq_first_active - find the eligible entity with the smallest finish time ++ * @st: the service tree to select from. ++ * ++ * This function searches the first schedulable entity, starting from the ++ * root of the tree and going on the left every time on this side there is ++ * a subtree with at least one eligible (start >= vtime) entity. The path ++ * on the right is followed only if a) the left subtree contains no eligible ++ * entities and b) no eligible entity has been found yet. 
++ */ ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entry, *first = NULL; ++ struct rb_node *node = st->active.rb_node; ++ ++ while (node != NULL) { ++ entry = rb_entry(node, struct bfq_entity, rb_node); ++left: ++ if (!bfq_gt(entry->start, st->vtime)) ++ first = entry; ++ ++ BUG_ON(bfq_gt(entry->min_start, st->vtime)); ++ ++ if (node->rb_left != NULL) { ++ entry = rb_entry(node->rb_left, ++ struct bfq_entity, rb_node); ++ if (!bfq_gt(entry->min_start, st->vtime)) { ++ node = node->rb_left; ++ goto left; ++ } ++ } ++ if (first != NULL) ++ break; ++ node = node->rb_right; ++ } ++ ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); ++ return first; ++} ++ ++/** ++ * __bfq_lookup_next_entity - return the first eligible entity in @st. ++ * @st: the service tree. ++ * ++ * Update the virtual time in @st and return the first eligible entity ++ * it contains. ++ */ ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, ++ bool force) ++{ ++ struct bfq_entity *entity, *new_next_active = NULL; ++ ++ if (RB_EMPTY_ROOT(&st->active)) ++ return NULL; ++ ++ bfq_update_vtime(st); ++ entity = bfq_first_active_entity(st); ++ BUG_ON(bfq_gt(entity->start, st->vtime)); ++ ++ /* ++ * If the chosen entity does not match with the sched_data's ++ * next_active and we are forcedly serving the IDLE priority ++ * class tree, bubble up budget update. ++ */ ++ if (unlikely(force && entity != entity->sched_data->next_active)) { ++ new_next_active = entity; ++ for_each_entity(new_next_active) ++ bfq_update_budget(new_next_active); ++ } ++ ++ return entity; ++} ++ ++/** ++ * bfq_lookup_next_entity - return the first eligible entity in @sd. ++ * @sd: the sched_data. ++ * @extract: if true the returned entity will be also extracted from @sd. ++ * ++ * NOTE: since we cache the next_active entity at each level of the ++ * hierarchy, the complexity of the lookup can be decreased with ++ * absolutely no effort just returning the cached next_active value; ++ * we prefer to do full lookups to test the consistency of * the data ++ * structures. ++ */ ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, ++ int extract, ++ struct bfq_data *bfqd) ++{ ++ struct bfq_service_tree *st = sd->service_tree; ++ struct bfq_entity *entity; ++ int i=0; ++ ++ BUG_ON(sd->active_entity != NULL); ++ ++ if (bfqd != NULL && ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); ++ if (entity != NULL) { ++ i = BFQ_IOPRIO_CLASSES - 1; ++ bfqd->bfq_class_idle_last_service = jiffies; ++ sd->next_active = entity; ++ } ++ } ++ for (; i < BFQ_IOPRIO_CLASSES; i++) { ++ entity = __bfq_lookup_next_entity(st + i, false); ++ if (entity != NULL) { ++ if (extract) { ++ bfq_check_next_active(sd, entity); ++ bfq_active_extract(st + i, entity); ++ sd->active_entity = entity; ++ sd->next_active = NULL; ++ } ++ break; ++ } ++ } ++ ++ return entity; ++} ++ ++/* ++ * Get next queue for service. 
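/*
 * Editor's aside (not part of the patch): the policy implemented by
 * bfq_first_active_entity() above is "among the eligible entities, i.e. those
 * with start <= vtime, pick the one with the smallest finish time". The
 * augmented rb-tree makes that O(log N); a brute-force scan over an array
 * expresses the same selection rule and can be used to sanity-check the idea.
 */
#include <stdio.h>

struct toy_entity {
	unsigned long long start, finish;
};

/* Return the index of the eligible entity with minimum finish, or -1. */
static int pick_next(const struct toy_entity *e, int n, unsigned long long vtime)
{
	int i, best = -1;

	for (i = 0; i < n; i++) {
		if (e[i].start > vtime)			/* not eligible yet */
			continue;
		if (best < 0 || e[i].finish < e[best].finish)
			best = i;
	}
	return best;
}

int main(void)
{
	struct toy_entity e[] = {
		{ .start = 0,  .finish = 90 },
		{ .start = 10, .finish = 40 },
		{ .start = 60, .finish = 20 },	/* smallest finish, but not yet eligible */
	};
	int n = sizeof(e) / sizeof(e[0]);

	printf("vtime=15 -> entity %d\n", pick_next(e, n, 15));	/* 1 */
	printf("vtime=70 -> entity %d\n", pick_next(e, n, 70));	/* 2 */
	return 0;
}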
++ */ ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_entity *entity = NULL; ++ struct bfq_sched_data *sd; ++ struct bfq_queue *bfqq; ++ ++ BUG_ON(bfqd->active_queue != NULL); ++ ++ if (bfqd->busy_queues == 0) ++ return NULL; ++ ++ sd = &bfqd->root_group->sched_data; ++ for (; sd != NULL; sd = entity->my_sched_data) { ++ entity = bfq_lookup_next_entity(sd, 1, bfqd); ++ BUG_ON(entity == NULL); ++ entity->service = 0; ++ } ++ ++ bfqq = bfq_entity_to_bfqq(entity); ++ BUG_ON(bfqq == NULL); ++ ++ return bfqq; ++} ++ ++/* ++ * Forced extraction of the given queue. ++ */ ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity; ++ struct bfq_sched_data *sd; ++ ++ BUG_ON(bfqd->active_queue != NULL); ++ ++ entity = &bfqq->entity; ++ /* ++ * Bubble up extraction/update from the leaf to the root. ++ */ ++ for_each_entity(entity) { ++ sd = entity->sched_data; ++ bfq_update_budget(entity); ++ bfq_update_vtime(bfq_entity_service_tree(entity)); ++ bfq_active_extract(bfq_entity_service_tree(entity), entity); ++ sd->active_entity = entity; ++ sd->next_active = NULL; ++ entity->service = 0; ++ } ++ ++ return; ++} ++ ++static void __bfq_bfqd_reset_active(struct bfq_data *bfqd) ++{ ++ if (bfqd->active_bic != NULL) { ++ put_io_context(bfqd->active_bic->icq.ioc); ++ bfqd->active_bic = NULL; ++ } ++ ++ bfqd->active_queue = NULL; ++ del_timer(&bfqd->idle_slice_timer); ++} ++ ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int requeue) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (bfqq == bfqd->active_queue) ++ __bfq_bfqd_reset_active(bfqd); ++ ++ bfq_deactivate_entity(entity, requeue); ++} ++ ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_activate_entity(entity); ++} ++ ++/* ++ * Called when the bfqq no longer has requests pending, remove it from ++ * the service tree. ++ */ ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int requeue) ++{ ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ bfq_log_bfqq(bfqd, bfqq, "del from busy"); ++ ++ bfq_clear_bfqq_busy(bfqq); ++ ++ BUG_ON(bfqd->busy_queues == 0); ++ bfqd->busy_queues--; ++ ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue); ++} ++ ++/* ++ * Called when an inactive queue receives a new request. ++ */ ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ BUG_ON(bfqq == bfqd->active_queue); ++ ++ bfq_log_bfqq(bfqd, bfqq, "add to busy"); ++ ++ bfq_activate_bfqq(bfqd, bfqq); ++ ++ bfq_mark_bfqq_busy(bfqq); ++ bfqd->busy_queues++; ++} +diff --git a/block/bfq.h block/bfq.h +new file mode 100644 +index 0000000..f487c32 +--- /dev/null ++++ block/bfq.h +@@ -0,0 +1,599 @@ ++/* ++ * BFQ-v5 for 3.3.0: data structures and common functions prototypes. 
++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ */ ++ ++#ifndef _BFQ_H ++#define _BFQ_H ++ ++#include ++#include ++#include ++#include ++ ++#define BFQ_IOPRIO_CLASSES 3 ++#define BFQ_CL_IDLE_TIMEOUT HZ/5 ++ ++#define BFQ_MIN_WEIGHT 1 ++#define BFQ_MAX_WEIGHT 1000 ++ ++#define BFQ_DEFAULT_GRP_WEIGHT 10 ++#define BFQ_DEFAULT_GRP_IOPRIO 0 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE ++ ++struct bfq_entity; ++ ++/** ++ * struct bfq_service_tree - per ioprio_class service tree. ++ * @active: tree for active entities (i.e., those backlogged). ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). ++ * @first_idle: idle entity with minimum F_i. ++ * @last_idle: idle entity with maximum F_i. ++ * @vtime: scheduler virtual time. ++ * @wsum: scheduler weight sum; active and idle entities contribute to it. ++ * ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each ++ * ioprio_class has its own independent scheduler, and so its own ++ * bfq_service_tree. All the fields are protected by the queue lock ++ * of the containing bfqd. ++ */ ++struct bfq_service_tree { ++ struct rb_root active; ++ struct rb_root idle; ++ ++ struct bfq_entity *first_idle; ++ struct bfq_entity *last_idle; ++ ++ u64 vtime; ++ unsigned long wsum; ++}; ++ ++/** ++ * struct bfq_sched_data - multi-class scheduler. ++ * @active_entity: entity under service. ++ * @next_active: head-of-the-line entity in the scheduler. ++ * @service_tree: array of service trees, one per ioprio_class. ++ * ++ * bfq_sched_data is the basic scheduler queue. It supports three ++ * ioprio_classes, and can be used either as a toplevel queue or as ++ * an intermediate queue on a hierarchical setup. ++ * @next_active points to the active entity of the sched_data service ++ * trees that will be scheduled next. ++ * ++ * The supported ioprio_classes are the same as in CFQ, in descending ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. ++ * Requests from higher priority queues are served before all the ++ * requests from lower priority queues; among requests of the same ++ * queue requests are served according to B-WF2Q+. ++ * All the fields are protected by the queue lock of the containing bfqd. ++ */ ++struct bfq_sched_data { ++ struct bfq_entity *active_entity; ++ struct bfq_entity *next_active; ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; ++}; ++ ++/** ++ * struct bfq_entity - schedulable entity. ++ * @rb_node: service_tree member. ++ * @on_st: flag, true if the entity is on a tree (either the active or ++ * the idle one of its service_tree). ++ * @finish: B-WF2Q+ finish timestamp (aka F_i). ++ * @start: B-WF2Q+ start timestamp (aka S_i). ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree. ++ * @min_start: minimum start time of the (active) subtree rooted at ++ * this entity; used for O(log N) lookups into active trees. ++ * @service: service received during the last round of service. ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. ++ * @weight: weight of the queue ++ * @parent: parent entity, for hierarchical scheduling. ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the ++ * associated scheduler queue, %NULL on leaf nodes. ++ * @sched_data: the scheduler queue this entity belongs to. ++ * @ioprio: the ioprio in use. ++ * @new_weight: when a weight change is requested, the new weight value. 
++ * @orig_weight: original weight, used to implement weight boosting ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value. ++ * @ioprio_class: the ioprio_class in use. ++ * @new_ioprio_class: when an ioprio_class change is requested, the new ++ * ioprio_class value. ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or ++ * ioprio_class change. ++ * ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each ++ * entity belongs to the sched_data of the parent group in the cgroup ++ * hierarchy. Non-leaf entities have also their own sched_data, stored ++ * in @my_sched_data. ++ * ++ * Each entity stores independently its priority values; this would ++ * allow different weights on different devices, but this ++ * functionality is not exported to userspace by now. Priorities and ++ * weights are updated lazily, first storing the new values into the ++ * new_* fields, then setting the @ioprio_changed flag. As soon as ++ * there is a transition in the entity state that allows the priority ++ * update to take place the effective and the requested priority ++ * values are synchronized. ++ * ++ * Unless cgroups are used, the weight value is calculated from the ++ * ioprio to export the same interface as CFQ. When dealing with ++ * ``well-behaved'' queues (i.e., queues that do not spend too much ++ * time to consume their budget and have true sequential behavior, and ++ * when there are no external factors breaking anticipation) the ++ * relative weights at each level of the cgroups hierarchy should be ++ * guaranteed. All the fields are protected by the queue lock of the ++ * containing bfqd. ++ */ ++struct bfq_entity { ++ struct rb_node rb_node; ++ ++ int on_st; ++ ++ u64 finish; ++ u64 start; ++ ++ struct rb_root *tree; ++ ++ u64 min_start; ++ ++ unsigned long service, budget; ++ unsigned short weight, new_weight; ++ unsigned short orig_weight; ++ ++ struct bfq_entity *parent; ++ ++ struct bfq_sched_data *my_sched_data; ++ struct bfq_sched_data *sched_data; ++ ++ unsigned short ioprio, new_ioprio; ++ unsigned short ioprio_class, new_ioprio_class; ++ ++ int ioprio_changed; ++}; ++ ++struct bfq_group; ++ ++/** ++ * struct bfq_queue - leaf schedulable entity. ++ * @ref: reference counter. ++ * @bfqd: parent bfq_data. ++ * @new_bfqq: shared bfq_queue if queue is cooperating with ++ * one or more other queues. ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). ++ * @sort_list: sorted list of pending requests. ++ * @next_rq: if fifo isn't expired, next request to serve. ++ * @queued: nr of requests queued in @sort_list. ++ * @allocated: currently allocated requests. ++ * @meta_pending: pending metadata requests. ++ * @fifo: fifo list of requests in sort_list. ++ * @entity: entity representing this queue in the scheduler. ++ * @max_budget: maximum budget allowed from the feedback mechanism. ++ * @budget_timeout: budget expiration (in jiffies). ++ * @dispatched: number of requests on the dispatch list or inside driver. ++ * @org_ioprio: saved ioprio during boosted periods. ++ * @flags: status flags. ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd. 
++ * @seek_samples: number of seeks sampled ++ * @seek_total: sum of the distances of the seeks sampled ++ * @seek_mean: mean seek distance ++ * @last_request_pos: position of the last request enqueued ++ * @pid: pid of the process owning the queue, used for logging purposes. ++ * @last_rais_start_time: last (idle -> weight-raised) transition attempt ++ * @raising_cur_max_time: current max raising time for this queue ++ * ++ * A bfq_queue is a leaf request queue; it can be associated to an io_context ++ * or more (if it is an async one). @cgroup holds a reference to the ++ * cgroup, to be sure that it does not disappear while a bfqq still ++ * references it (mostly to avoid races between request issuing and task ++ * migration followed by cgroup distruction). ++ * All the fields are protected by the queue lock of the containing bfqd. ++ */ ++struct bfq_queue { ++ atomic_t ref; ++ struct bfq_data *bfqd; ++ ++ /* fields for cooperating queues handling */ ++ struct bfq_queue *new_bfqq; ++ struct rb_node pos_node; ++ struct rb_root *pos_root; ++ ++ struct rb_root sort_list; ++ struct request *next_rq; ++ int queued[2]; ++ int allocated[2]; ++ int meta_pending; ++ struct list_head fifo; ++ ++ struct bfq_entity entity; ++ ++ unsigned long max_budget; ++ unsigned long budget_timeout; ++ ++ int dispatched; ++ ++ unsigned short org_ioprio; ++ ++ unsigned int flags; ++ ++ struct list_head bfqq_list; ++ ++ unsigned int seek_samples; ++ u64 seek_total; ++ sector_t seek_mean; ++ sector_t last_request_pos; ++ ++ pid_t pid; ++ ++ /* weight-raising fields */ ++ unsigned int raising_cur_max_time; ++ u64 last_rais_start_finish, soft_rt_next_start; ++ unsigned int raising_coeff; ++}; ++ ++/** ++ * struct bfq_ttime - per process thinktime stats. ++ * @ttime_total: total process thinktime ++ * @ttime_samples: number of thinktime samples ++ * @ttime_mean: average process thinktime ++ */ ++struct bfq_ttime { ++ unsigned long last_end_request; ++ ++ unsigned long ttime_total; ++ unsigned long ttime_samples; ++ unsigned long ttime_mean; ++}; ++ ++/** ++ * struct bfq_io_cq - per (request_queue, io_context) structure. ++ * @icq: associated io_cq structure ++ * @bfqq: array of two process queues, the sync and the async ++ * @ttime: associated @bfq_ttime struct ++ */ ++struct bfq_io_cq { ++ struct io_cq icq; /* must be the first member */ ++ struct bfq_queue *bfqq[2]; ++ struct bfq_ttime ttime; ++}; ++ ++/** ++ * struct bfq_data - per device data structure. ++ * @queue: request queue for the managed device. ++ * @root_group: root bfq_group for the device. ++ * @rq_pos_tree: rbtree sorted by next_request position, ++ * used when determining if two or more queues ++ * have interleaving requests (see bfq_close_cooperator). ++ * @busy_queues: number of bfq_queues containing requests (including the ++ * queue under service, even if it is idling). ++ * @queued: number of queued requests. ++ * @rq_in_driver: number of requests dispatched and waiting for completion. ++ * @sync_flight: number of sync requests in the driver. ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples ++ * completed requests . ++ * @hw_tag_samples: nr of samples used to calculate hw_tag. ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior. ++ * @budgets_assigned: number of budgets assigned. ++ * @idle_slice_timer: timer set when idling for the next sequential request ++ * from the queue under service. ++ * @unplug_work: delayed work to restart dispatching on the request queue. 
++ * @active_queue: bfq_queue under service.
++ * @active_bic: bfq_io_cq (bic) associated with the @active_queue.
++ * @last_position: on-disk position of the last served request.
++ * @last_budget_start: beginning of the last budget.
++ * @last_idling_start: beginning of the last idle slice.
++ * @peak_rate: peak transfer rate observed for a budget.
++ * @peak_rate_samples: number of samples used to calculate @peak_rate.
++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
++ * @group_list: list of all the bfq_groups active on the device.
++ * @active_list: list of all the bfq_queues active on the device.
++ * @idle_list: list of all the bfq_queues idle on the device.
++ * @bfq_quantum: max number of requests dispatched per dispatch round.
++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
++ * requests are served in fifo order.
++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
++ * @bfq_back_max: maximum allowed backward seek.
++ * @bfq_slice_idle: maximum idling time.
++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
++ * async queues.
++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
++ * prevent seeky queues from imposing long latencies on well
++ * behaved ones (this also implies that seeky queues cannot
++ * receive guarantees in the service domain; after a timeout
++ * they are charged for the whole allocated budget, to try
++ * to preserve a behavior reasonably fair among them, but
++ * without service-domain guarantees).
++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
++ * queue is multiplied
++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
++ * may be reactivated for a queue (in jiffies)
++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
++ * after which weight-raising may be
++ * reactivated for an already busy queue
++ * (in jiffies)
++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
++ * sectors per second
++ * @RT_prod: cached value of the product R*T used for computing the maximum
++ * duration of the weight raising automatically
++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
++ *
++ * All the fields are protected by the @queue lock.
++ */ ++struct bfq_data { ++ struct request_queue *queue; ++ ++ struct bfq_group *root_group; ++ ++ struct rb_root rq_pos_tree; ++ ++ int busy_queues; ++ int queued; ++ int rq_in_driver; ++ int sync_flight; ++ ++ int max_rq_in_driver; ++ int hw_tag_samples; ++ int hw_tag; ++ ++ int budgets_assigned; ++ ++ struct timer_list idle_slice_timer; ++ struct work_struct unplug_work; ++ ++ struct bfq_queue *active_queue; ++ struct bfq_io_cq *active_bic; ++ ++ sector_t last_position; ++ ++ ktime_t last_budget_start; ++ ktime_t last_idling_start; ++ int peak_rate_samples; ++ u64 peak_rate; ++ unsigned long bfq_max_budget; ++ ++ struct hlist_head group_list; ++ struct list_head active_list; ++ struct list_head idle_list; ++ ++ unsigned int bfq_quantum; ++ unsigned int bfq_fifo_expire[2]; ++ unsigned int bfq_back_penalty; ++ unsigned int bfq_back_max; ++ unsigned int bfq_slice_idle; ++ u64 bfq_class_idle_last_service; ++ ++ unsigned int bfq_user_max_budget; ++ unsigned int bfq_max_budget_async_rq; ++ unsigned int bfq_timeout[2]; ++ ++ bool low_latency; ++ ++ /* parameters of the low_latency heuristics */ ++ unsigned int bfq_raising_coeff; ++ unsigned int bfq_raising_max_time; ++ unsigned int bfq_raising_rt_max_time; ++ unsigned int bfq_raising_min_idle_time; ++ unsigned int bfq_raising_min_inter_arr_async; ++ unsigned int bfq_raising_max_softrt_rate; ++ u64 RT_prod; ++ ++ struct bfq_queue oom_bfqq; ++}; ++ ++enum bfqq_state_flags { ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */ ++ BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */ ++}; ++ ++#define BFQ_BFQQ_FNS(name) \ ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ ++{ \ ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ ++} ++ ++BFQ_BFQQ_FNS(busy); ++BFQ_BFQQ_FNS(wait_request); ++BFQ_BFQQ_FNS(must_alloc); ++BFQ_BFQQ_FNS(fifo_expire); ++BFQ_BFQQ_FNS(idle_window); ++BFQ_BFQQ_FNS(prio_changed); ++BFQ_BFQQ_FNS(sync); ++BFQ_BFQQ_FNS(budget_new); ++BFQ_BFQQ_FNS(coop); ++BFQ_BFQQ_FNS(split_coop); ++BFQ_BFQQ_FNS(some_coop_idle); ++#undef BFQ_BFQQ_FNS ++ ++/* Logging facilities. */ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) ++ ++/* Expiration reasons. */ ++enum bfqq_expiration { ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */ ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ ++}; ++ ++#ifdef CONFIG_CGROUP_BFQIO ++/** ++ * struct bfq_group - per (device, cgroup) data structure. 
++ * @entity: schedulable entity to insert into the parent group sched_data. ++ * @sched_data: own sched_data, to contain child entities (they may be ++ * both bfq_queues and bfq_groups). ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data ++ * list of the containing cgroup's bfqio_cgroup. ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list ++ * of the groups active on the same device; used for cleanup. ++ * @bfqd: the bfq_data for the device this group acts upon. ++ * @async_bfqq: array of async queues for all the tasks belonging to ++ * the group, one queue per ioprio value per ioprio_class, ++ * except for the idle class that has only one queue. ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used ++ * to avoid too many special cases during group creation/migration. ++ * ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup ++ * there is a set of bfq_groups, each one collecting the lower-level ++ * entities belonging to the group that are acting on the same device. ++ * ++ * Locking works as follows: ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed ++ * via RCU from its readers. ++ * o @bfqd is protected by the queue lock, RCU is used to access it ++ * from the readers. ++ * o All the other fields are protected by the @bfqd queue lock. ++ */ ++struct bfq_group { ++ struct bfq_entity entity; ++ struct bfq_sched_data sched_data; ++ ++ struct hlist_node group_node; ++ struct hlist_node bfqd_node; ++ ++ void *bfqd; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct bfq_entity *my_entity; ++}; ++ ++/** ++ * struct bfqio_cgroup - bfq cgroup data structure. ++ * @css: subsystem state for bfq in the containing cgroup. ++ * @weight: cgroup weight. ++ * @ioprio: cgroup ioprio. ++ * @ioprio_class: cgroup ioprio_class. ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. ++ * @group_data: list containing the bfq_group belonging to this cgroup. ++ * ++ * @group_data is accessed using RCU, with @lock protecting the updates, ++ * @ioprio and @ioprio_class are protected by @lock. ++ */ ++struct bfqio_cgroup { ++ struct cgroup_subsys_state css; ++ ++ unsigned short weight, ioprio, ioprio_class; ++ ++ spinlock_t lock; ++ struct hlist_head group_data; ++}; ++#else ++struct bfq_group { ++ struct bfq_sched_data sched_data; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++}; ++#endif ++ ++static inline struct bfq_service_tree * ++bfq_entity_service_tree(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sched_data = entity->sched_data; ++ unsigned int idx = entity->ioprio_class - 1; ++ ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); ++ BUG_ON(sched_data == NULL); ++ ++ return sched_data->service_tree + idx; ++} ++ ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, ++ int is_sync) ++{ ++ return bic->bfqq[!!is_sync]; ++} ++ ++static inline void bic_set_bfqq(struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, int is_sync) ++{ ++ bic->bfqq[!!is_sync] = bfqq; ++} ++ ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) ++{ ++ return bic->icq.q->elevator->elevator_data; ++} ++ ++/** ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. ++ * @ptr: a pointer to a bfqd. ++ * @flags: storage for the flags to be saved. 
++ * ++ * This function allows bfqg->bfqd to be protected by the ++ * queue lock of the bfqd they reference; the pointer is dereferenced ++ * under RCU, so the storage for bfqd is assured to be safe as long ++ * as the RCU read side critical section does not end. After the ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be ++ * sure that no other writer accessed it. If we raced with a writer, ++ * the function returns NULL, with the queue unlocked, otherwise it ++ * returns the dereferenced pointer, with the queue locked. ++ */ ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, ++ unsigned long *flags) ++{ ++ struct bfq_data *bfqd; ++ ++ rcu_read_lock(); ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr); ++ ++ if (bfqd != NULL) { ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags); ++ if (*ptr == bfqd) ++ goto out; ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); ++ } ++ ++ bfqd = NULL; ++out: ++ rcu_read_unlock(); ++ return bfqd; ++} ++ ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, ++ unsigned long *flags) ++{ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); ++} ++ ++static void bfq_changed_ioprio(struct io_context *ioc, ++ struct bfq_io_cq *bic); ++static void bfq_put_queue(struct bfq_queue *bfqq); ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, int is_sync, ++ struct io_context *ioc, gfp_t gfp_mask); ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); ++#endif +-- +1.7.10.4 + diff --git a/3.3.8/0003-AppArmor-Allow-dfa-backward-compatibility-with-broke.patch b/3.3.8/0003-AppArmor-Allow-dfa-backward-compatibility-with-broke.patch new file mode 100644 index 0000000..be32585 --- /dev/null +++ b/3.3.8/0003-AppArmor-Allow-dfa-backward-compatibility-with-broke.patch @@ -0,0 +1,69 @@ +From 7a10d093f9779f42cb8d6affcb6a4436d3ebd6d3 Mon Sep 17 00:00:00 2001 +From: John Johansen +Date: Wed, 10 Aug 2011 22:02:41 -0700 +Subject: [PATCH 3/3] AppArmor: Allow dfa backward compatibility with broken + userspace + +The apparmor_parser when compiling policy could generate invalid dfas +that did not have sufficient padding to avoid invalid references, when +used by the kernel. The kernels check to verify the next/check table +size was broken meaning invalid dfas were being created by userspace +and not caught. + +To remain compatible with old tools that are not fixed, pad the loaded +dfas next/check table. The dfa's themselves are valid except for the +high padding for potentially invalid transitions (high bounds error), +which have a maximimum is 256 entries. So just allocate an extra null filled +256 entries for the next/check tables. This will guarentee all bounds +are good and invalid transitions go to the null (0) state. 
+ +Signed-off-by: John Johansen +--- + security/apparmor/match.c | 17 +++++++++++++++++ + 1 files changed, 17 insertions(+), 0 deletions(-) + +diff --git a/security/apparmor/match.c b/security/apparmor/match.c +index 94de6b4..081491e 100644 +--- a/security/apparmor/match.c ++++ b/security/apparmor/match.c +@@ -57,8 +57,17 @@ static struct table_header *unpack_table(char *blob, size_t bsize) + if (bsize < tsize) + goto out; + ++ /* Pad table allocation for next/check by 256 entries to remain ++ * backwards compatible with old (buggy) tools and remain safe without ++ * run time checks ++ */ ++ if (th.td_id == YYTD_ID_NXT || th.td_id == YYTD_ID_CHK) ++ tsize += 256 * th.td_flags; ++ + table = kvmalloc(tsize); + if (table) { ++ /* ensure the pad is clear, else there will be errors */ ++ memset(table, 0, tsize); + *table = th; + if (th.td_flags == YYTD_DATA8) + UNPACK_ARRAY(table->td_data, blob, th.td_lolen, +@@ -134,11 +143,19 @@ static int verify_dfa(struct aa_dfa *dfa, int flags) + goto out; + + if (flags & DFA_FLAG_VERIFY_STATES) { ++ int warning = 0; + for (i = 0; i < state_count; i++) { + if (DEFAULT_TABLE(dfa)[i] >= state_count) + goto out; + /* TODO: do check that DEF state recursion terminates */ + if (BASE_TABLE(dfa)[i] + 255 >= trans_count) { ++ if (warning) ++ continue; ++ printk(KERN_WARNING "AppArmor DFA next/check " ++ "upper bounds error fixed, upgrade " ++ "user space tools \n"); ++ warning = 1; ++ } else if (BASE_TABLE(dfa)[i] >= trans_count) { + printk(KERN_ERR "AppArmor DFA next/check upper " + "bounds error\n"); + goto out; +-- +1.7.5.4 + diff --git a/3.3.8/01patch-2.6.33_atopcnt.patch b/3.3.8/01patch-2.6.33_atopcnt.patch new file mode 100644 index 0000000..28bf733 --- /dev/null +++ b/3.3.8/01patch-2.6.33_atopcnt.patch @@ -0,0 +1,174 @@ +diff --git a/block/blk-core.c b/block/blk-core.c +index d1a9a0a..8b54acb 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -73,6 +73,17 @@ static void drive_stat_acct(struct request *rq, int new_io) + part_inc_in_flight(part, rw); + } + ++ switch (rw) { /* ATOP */ ++ case READ: /* ATOP */ ++ current->group_leader->stat.dsk_rio += new_io; /* ATOP */ ++ current->group_leader->stat.dsk_rsz += blk_rq_sectors(rq); /* ATOP */ ++ break; /* ATOP */ ++ case WRITE: /* ATOP */ ++ current->group_leader->stat.dsk_wio += new_io; /* ATOP */ ++ current->group_leader->stat.dsk_wsz += blk_rq_sectors(rq); /* ATOP */ ++ break; /* ATOP */ ++ } /* ATOP */ ++ + part_stat_unlock(); + } + +diff --git a/fs/proc/array.c b/fs/proc/array.c +index 13b5d07..cac522e 100644 +--- a/fs/proc/array.c ++++ b/fs/proc/array.c +@@ -515,6 +515,25 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, + (unsigned long long)delayacct_blkio_ticks(task), + cputime_to_clock_t(gtime), + cputime_to_clock_t(cgtime)); ++ ++ seq_printf(m, /* ATOP */ ++ "%lu %llu %lu %llu %lu %llu %lu " /* ATOP */ ++ "%llu %lu %llu %lu %llu %lu %lu\n", /* ATOP */ ++ task->stat.dsk_rio, /* ATOP */ ++ task->stat.dsk_rsz, /* ATOP */ ++ task->stat.dsk_wio, /* ATOP */ ++ task->stat.dsk_wsz, /* ATOP */ ++ task->stat.tcp_snd, /* ATOP */ ++ task->stat.tcp_ssz, /* ATOP */ ++ task->stat.tcp_rcv, /* ATOP */ ++ task->stat.tcp_rsz, /* ATOP */ ++ task->stat.udp_snd, /* ATOP */ ++ task->stat.udp_ssz, /* ATOP */ ++ task->stat.udp_rcv, /* ATOP */ ++ task->stat.udp_rsz, /* ATOP */ ++ task->stat.raw_snd, /* ATOP */ ++ task->stat.raw_rcv); /* ATOP */ ++ + if (mm) + mmput(mm); + return 0; +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 78efe7c..22391bf 100644 +--- 
a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1512,6 +1512,17 @@ struct task_struct { + #endif + atomic_t fs_excl; /* holding fs exclusive resources */ + struct rcu_head rcu; ++ ++ struct { /* ATOP */ ++ unsigned long dsk_rio, dsk_wio; /* ATOP */ ++ unsigned long long dsk_rsz, dsk_wsz; /* ATOP */ ++ unsigned long tcp_snd, tcp_rcv; /* ATOP */ ++ unsigned long long tcp_ssz, tcp_rsz; /* ATOP */ ++ unsigned long udp_snd, udp_rcv; /* ATOP */ ++ unsigned long long udp_ssz, udp_rsz; /* ATOP */ ++ unsigned long raw_snd, raw_rcv; /* ATOP */ ++ } stat; /* ATOP */ ++ + + /* + * cache last used pipe for splice +diff --git a/kernel/acct.c b/kernel/acct.c +index a6605ca..d5df53a 100644 +--- a/kernel/acct.c ++++ b/kernel/acct.c +@@ -565,7 +565,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, + ac.ac_exitcode = pacct->ac_exitcode; + spin_unlock_irq(¤t->sighand->siglock); + ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ +- ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ++ ac.ac_rw = encode_comp_t(current->stat.dsk_rio + current->stat.dsk_wio); /* ATOP */ + ac.ac_swaps = encode_comp_t(0); + + /* +diff --git a/kernel/fork.c b/kernel/fork.c +index f88bd98..bab2085 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -683,6 +683,14 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) + + tsk->min_flt = tsk->maj_flt = 0; + tsk->nvcsw = tsk->nivcsw = 0; ++ tsk->stat.dsk_rio = tsk->stat.dsk_wio = 0; /* ATOP */ ++ tsk->stat.dsk_rsz = tsk->stat.dsk_wsz = 0; /* ATOP */ ++ tsk->stat.tcp_snd = tsk->stat.tcp_rcv = 0; /* ATOP */ ++ tsk->stat.tcp_ssz = tsk->stat.tcp_rsz = 0; /* ATOP */ ++ tsk->stat.udp_snd = tsk->stat.udp_rcv = 0; /* ATOP */ ++ tsk->stat.udp_ssz = tsk->stat.udp_rsz = 0; /* ATOP */ ++ tsk->stat.raw_snd = tsk->stat.raw_rcv = 0; /* ATOP */ ++ + #ifdef CONFIG_DETECT_HUNG_TASK + tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; + #endif +diff --git a/net/socket.c b/net/socket.c +index 769c386..3ba19f6 100644 +--- a/net/socket.c ++++ b/net/socket.c +@@ -547,10 +547,28 @@ static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, + si->size = size; + + err = security_socket_sendmsg(sock, msg, size); +- if (err) +- return err; +- +- return sock->ops->sendmsg(iocb, sock, msg, size); ++ if (!err) ++ err = sock->ops->sendmsg(iocb, sock, msg, size); ++ ++ if (err >= 0 && sock->sk) { /* ATOP */ ++ switch (sock->sk->sk_family) { /* ATOP */ ++ case PF_INET: /* ATOP */ ++ case PF_INET6: /* ATOP */ ++ switch (sock->sk->sk_type) { /* ATOP */ ++ case SOCK_STREAM: /* ATOP */ ++ current->group_leader->stat.tcp_snd++; /* ATOP */ ++ current->group_leader->stat.tcp_ssz+=size;/* ATOP */ ++ break; /* ATOP */ ++ case SOCK_DGRAM: /* ATOP */ ++ current->group_leader->stat.udp_snd++; /* ATOP */ ++ current->group_leader->stat.udp_ssz+=size;/* ATOP */ ++ break; /* ATOP */ ++ case SOCK_RAW: /* ATOP */ ++ current->group_leader->stat.raw_snd++; /* ATOP */ ++ } /* ATOP */ ++ } /* ATOP */ ++ } /* ATOP */ ++ return err; + } + + int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +@@ -682,7 +700,29 @@ static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, + { + int err = security_socket_recvmsg(sock, msg, size, flags); + +- return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags); ++ if (!err) ++ err = __sock_recvmsg_nosec(iocb, sock, msg, size, flags); ++ ++ if (err >= 0 && sock->sk) { /* ATOP */ ++ switch (sock->sk->sk_family) { /* ATOP */ ++ case PF_INET: /* ATOP */ ++ case PF_INET6: /* ATOP */ ++ switch 
(sock->sk->sk_type) { /* ATOP */ ++ case SOCK_STREAM: /* ATOP */ ++ current->group_leader->stat.tcp_rcv++; /* ATOP */ ++ current->group_leader->stat.tcp_rsz+=err; /* ATOP */ ++ break; /* ATOP */ ++ case SOCK_DGRAM: /* ATOP */ ++ current->group_leader->stat.udp_rcv++; /* ATOP */ ++ current->group_leader->stat.udp_rsz+=err; /* ATOP */ ++ break; /* ATOP */ ++ case SOCK_RAW: /* ATOP */ ++ current->group_leader->stat.raw_rcv++; /* ATOP */ ++ break; /* ATOP */ ++ } /* ATOP */ ++ } /* ATOP */ ++ } /* ATOP */ ++ return err; + } + + int sock_recvmsg(struct socket *sock, struct msghdr *msg, diff --git a/3.3.8/02patch-2.6.33_atopacct.patch b/3.3.8/02patch-2.6.33_atopacct.patch new file mode 100644 index 0000000..74e6a1c --- /dev/null +++ b/3.3.8/02patch-2.6.33_atopacct.patch @@ -0,0 +1,125 @@ +Index: linux-2.6.28/include/linux/acct.h +=================================================================== +--- linux-2.6.28.orig/include/linux/acct.h 2009-01-14 13:02:24.000000000 +0100 ++++ linux-2.6.28/include/linux/acct.h 2009-01-14 13:03:33.000000000 +0100 +@@ -97,6 +97,54 @@ + char ac_comm[ACCT_COMM]; /* Command Name */ + }; + ++struct acct_atop ++{ ++ char ac_flag; /* Flags */ ++ char ac_version; /* Always set to ACCT_VERSION */ ++ __u32 ac_pid; /* Process ID */ ++ __u32 ac_ppid; /* Parent Process ID */ ++ __u16 ac_uid16; /* LSB of Real User ID */ ++ __u16 ac_gid16; /* LSB of Real Group ID */ ++ __u16 ac_tty; /* Control Terminal */ ++ __u32 ac_btime; /* Process Creation Time */ ++ comp_t ac_utime; /* User Time */ ++ comp_t ac_stime; /* System Time */ ++ comp_t ac_etime; /* Elapsed Time */ ++ comp_t ac_mem; /* Virtual Memory */ ++ comp_t ac_rss; /* Resident Memory */ ++ comp_t ac_io; /* Chars Transferred */ ++ comp_t ac_rw; /* Blocks Read or Written */ ++ comp_t ac_bread; /* Blocks Read */ ++ comp_t ac_bwrite; /* Blocks Written */ ++ comp2_t ac_dskrsz; /* Cum. blocks read */ ++ comp2_t ac_dskwsz; /* Cum. blocks written */ ++ comp_t ac_tcpsnd; /* TCP send requests */ ++ comp_t ac_tcprcv; /* TCP recv requests */ ++ comp2_t ac_tcpssz; /* TCP cum. length */ ++ comp2_t ac_tcprsz; /* TCP cum. length */ ++ comp_t ac_udpsnd; /* UDP send requests */ ++ comp_t ac_udprcv; /* UDP recv requests */ ++ comp2_t ac_udpssz; /* UDP cum. length */ ++ comp2_t ac_udprsz; /* UDP cum. length */ ++ comp_t ac_rawsnd; /* RAW send requests */ ++ comp_t ac_rawrcv; /* RAW recv requests */ ++ comp_t ac_minflt; /* Minor Pagefaults */ ++ comp_t ac_majflt; /* Major Pagefaults */ ++ comp_t ac_swaps; /* Number of Swaps */ ++/* m68k had no padding here. 
*/ ++#if !defined(CONFIG_M68K) || !defined(__KERNEL__) ++ __u16 ac_ahz; /* AHZ */ ++#endif ++ __u32 ac_exitcode; /* Exitcode */ ++ char ac_comm[ACCT_COMM + 1]; /* Command Name */ ++ __u8 ac_etime_hi; /* Elapsed Time MSB */ ++ __u16 ac_etime_lo; /* Elapsed Time LSB */ ++ __u32 ac_uid; /* Real User ID */ ++ __u32 ac_gid; /* Real Group ID */ ++}; ++ ++ ++ + /* + * accounting flags + */ +@@ -146,7 +194,13 @@ + * 5: new binary incompatible format (128 bytes, second half) + * + */ ++#define CONFIG_PROCESS_ACCT_ATOP + ++#ifdef CONFIG_PROCESS_ACCT_ATOP ++#define ACCT_VERSION 6 ++#define AHZ (USER_HZ) ++typedef struct acct_atop acct_t; ++#else + #ifdef CONFIG_BSD_PROCESS_ACCT_V3 + #define ACCT_VERSION 3 + #define AHZ 100 +@@ -160,6 +214,7 @@ + #define AHZ (USER_HZ) + typedef struct acct acct_t; + #endif ++#endif + + #else + #define ACCT_VERSION 2 +Index: linux-2.6.28/kernel/acct.c +=================================================================== +--- linux-2.6.28.orig/kernel/acct.c 2009-01-14 13:03:31.000000000 +0100 ++++ linux-2.6.28/kernel/acct.c 2009-01-14 13:03:33.000000000 +0100 +@@ -405,7 +405,7 @@ + return exp; + } + +-#if ACCT_VERSION==1 || ACCT_VERSION==2 ++#if ACCT_VERSION==1 || ACCT_VERSION==2 || ACCT_VERSION==6 + /* + * encode an u64 into a comp2_t (24 bits) + * +@@ -552,6 +552,30 @@ + ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); + rcu_read_unlock(); + #endif ++#if ACCT_VERSION==6 /* ATOP */ ++ ac.ac_pid = current->pid; ++ ac.ac_ppid = current->parent->pid; ++ ac.ac_uid16 = ac.ac_uid; ++ ac.ac_gid16 = ac.ac_gid; ++ ac.ac_ahz = AHZ; ++ ac.ac_bread = encode_comp_t(current->stat.dsk_rio); ++ ac.ac_bwrite = encode_comp_t(current->stat.dsk_wio); ++ ac.ac_dskrsz = encode_comp2_t(current->stat.dsk_rsz); ++ ac.ac_dskwsz = encode_comp2_t(current->stat.dsk_wsz); ++ ac.ac_tcpsnd = encode_comp_t(current->stat.tcp_snd); ++ ac.ac_tcprcv = encode_comp_t(current->stat.tcp_rcv); ++ ac.ac_tcpssz = encode_comp2_t(current->stat.tcp_ssz); ++ ac.ac_tcprsz = encode_comp2_t(current->stat.tcp_rsz); ++ ac.ac_udpsnd = encode_comp_t(current->stat.udp_snd); ++ ac.ac_udprcv = encode_comp_t(current->stat.udp_rcv); ++ ac.ac_udpssz = encode_comp2_t(current->stat.udp_ssz); ++ ac.ac_udprsz = encode_comp2_t(current->stat.udp_rsz); ++ ac.ac_rawsnd = encode_comp_t(current->stat.raw_snd); ++ ac.ac_rawrcv = encode_comp_t(current->stat.raw_rcv); ++ ac.ac_rss = current->mm ? ++ encode_comp_t(get_mm_rss(current->mm)<<(PAGE_SHIFT-10)) : ++ encode_comp_t(0); ++#endif + + spin_lock_irq(¤t->sighand->siglock); + tty = current->signal->tty; /* Safe as we hold the siglock */ diff --git a/3.3.8/3.3-ck1.patch b/3.3.8/3.3-ck1.patch new file mode 100644 index 0000000..9c58ff0 --- /dev/null +++ b/3.3.8/3.3-ck1.patch @@ -0,0 +1,8782 @@ +Index: linux-3.3-ck1/arch/powerpc/platforms/cell/spufs/sched.c +=================================================================== +--- linux-3.3-ck1.orig/arch/powerpc/platforms/cell/spufs/sched.c 2012-03-24 19:30:00.013420381 +1100 ++++ linux-3.3-ck1/arch/powerpc/platforms/cell/spufs/sched.c 2012-03-24 19:30:29.038925740 +1100 +@@ -63,11 +63,6 @@ static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + + /* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- +-/* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. 
+ */ +Index: linux-3.3-ck1/Documentation/scheduler/sched-BFS.txt +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-3.3-ck1/Documentation/scheduler/sched-BFS.txt 2012-03-24 19:30:29.038925740 +1100 +@@ -0,0 +1,347 @@ ++BFS - The Brain Fuck Scheduler by Con Kolivas. ++ ++Goals. ++ ++The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to ++completely do away with the complex designs of the past for the cpu process ++scheduler and instead implement one that is very simple in basic design. ++The main focus of BFS is to achieve excellent desktop interactivity and ++responsiveness without heuristics and tuning knobs that are difficult to ++understand, impossible to model and predict the effect of, and when tuned to ++one workload cause massive detriment to another. ++ ++ ++Design summary. ++ ++BFS is best described as a single runqueue, O(n) lookup, earliest effective ++virtual deadline first design, loosely based on EEVDF (earliest eligible virtual ++deadline first) and my previous Staircase Deadline scheduler. Each component ++shall be described in order to understand the significance of, and reasoning for ++it. The codebase when the first stable version was released was approximately ++9000 lines less code than the existing mainline linux kernel scheduler (in ++2.6.31). This does not even take into account the removal of documentation and ++the cgroups code that is not used. ++ ++Design reasoning. ++ ++The single runqueue refers to the queued but not running processes for the ++entire system, regardless of the number of CPUs. The reason for going back to ++a single runqueue design is that once multiple runqueues are introduced, ++per-CPU or otherwise, there will be complex interactions as each runqueue will ++be responsible for the scheduling latency and fairness of the tasks only on its ++own runqueue, and to achieve fairness and low latency across multiple CPUs, any ++advantage in throughput of having CPU local tasks causes other disadvantages. ++This is due to requiring a very complex balancing system to at best achieve some ++semblance of fairness across CPUs and can only maintain relatively low latency ++for tasks bound to the same CPUs, not across them. To increase said fairness ++and latency across CPUs, the advantage of local runqueue locking, which makes ++for better scalability, is lost due to having to grab multiple locks. ++ ++A significant feature of BFS is that all accounting is done purely based on CPU ++used and nowhere is sleep time used in any way to determine entitlement or ++interactivity. Interactivity "estimators" that use some kind of sleep/run ++algorithm are doomed to fail to detect all interactive tasks, and to falsely tag ++tasks that aren't interactive as being so. The reason for this is that it is ++close to impossible to determine that when a task is sleeping, whether it is ++doing it voluntarily, as in a userspace application waiting for input in the ++form of a mouse click or otherwise, or involuntarily, because it is waiting for ++another thread, process, I/O, kernel activity or whatever. Thus, such an ++estimator will introduce corner cases, and more heuristics will be required to ++cope with those corner cases, introducing more corner cases and failed ++interactivity detection and so on. 
Interactivity in BFS is built into the design ++by virtue of the fact that tasks that are waking up have not used up their quota ++of CPU time, and have earlier effective deadlines, thereby making it very likely ++they will preempt any CPU bound task of equivalent nice level. See below for ++more information on the virtual deadline mechanism. Even if they do not preempt ++a running task, because the rr interval is guaranteed to have a bound upper ++limit on how long a task will wait for, it will be scheduled within a timeframe ++that will not cause visible interface jitter. ++ ++ ++Design details. ++ ++Task insertion. ++ ++BFS inserts tasks into each relevant queue as an O(1) insertion into a double ++linked list. On insertion, *every* running queue is checked to see if the newly ++queued task can run on any idle queue, or preempt the lowest running task on the ++system. This is how the cross-CPU scheduling of BFS achieves significantly lower ++latency per extra CPU the system has. In this case the lookup is, in the worst ++case scenario, O(n) where n is the number of CPUs on the system. ++ ++Data protection. ++ ++BFS has one single lock protecting the process local data of every task in the ++global queue. Thus every insertion, removal and modification of task data in the ++global runqueue needs to grab the global lock. However, once a task is taken by ++a CPU, the CPU has its own local data copy of the running process' accounting ++information which only that CPU accesses and modifies (such as during a ++timer tick) thus allowing the accounting data to be updated lockless. Once a ++CPU has taken a task to run, it removes it from the global queue. Thus the ++global queue only ever has, at most, ++ ++ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 ++ ++tasks in the global queue. This value is relevant for the time taken to look up ++tasks during scheduling. This will increase if many tasks with CPU affinity set ++in their policy to limit which CPUs they're allowed to run on if they outnumber ++the number of CPUs. The +1 is because when rescheduling a task, the CPU's ++currently running task is put back on the queue. Lookup will be described after ++the virtual deadline mechanism is explained. ++ ++Virtual deadline. ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in BFS is entirely in the virtual deadline mechanism. The one ++tunable in BFS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in jiffies by this equation: ++ ++ jiffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. 
Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. Once a task is descheduled, it is put back on the queue, and an ++O(n) lookup of all queued-but-not-running tasks is done to determine which has ++the earliest deadline and that task is chosen to receive CPU next. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (jiffies) is ++constantly moving. ++ ++Task lookup. ++ ++BFS has 103 priority queues. 100 of these are dedicated to the static priority ++of realtime tasks, and the remaining 3 are, in order of best to worst priority, ++SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority ++scheduling). When a task of these priorities is queued, a bitmap of running ++priorities is set showing which of these priorities has tasks waiting for CPU ++time. When a CPU is made to reschedule, the lookup for the next task to get ++CPU time is performed in the following way: ++ ++First the bitmap is checked to see what static priority tasks are queued. If ++any realtime priorities are found, the corresponding queue is checked and the ++first task listed there is taken (provided CPU affinity is suitable) and lookup ++is complete. If the priority corresponds to a SCHED_ISO task, they are also ++taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds ++to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this ++stage, every task in the runlist that corresponds to that priority is checked ++to see which has the earliest set deadline, and (provided it has suitable CPU ++affinity) it is taken off the runqueue and given the CPU. If a task has an ++expired deadline, it is taken and the rest of the lookup aborted (as they are ++chosen in FIFO order). ++ ++Thus, the lookup is O(n) in the worst case only, where n is as described ++earlier, as tasks may be chosen before the whole task list is looked over. ++ ++ ++Scalability. ++ ++The major limitations of BFS will be that of scalability, as the separate ++runqueue designs will have less lock contention as the number of CPUs rises. ++However they do not scale linearly even with separate runqueues as multiple ++runqueues will need to be locked concurrently on such designs to be able to ++achieve fair CPU balancing, to try and achieve some sort of nice-level fairness ++across CPUs, and to achieve low enough latency for tasks on a busy CPU when ++other CPUs would be more suited. BFS has the advantage that it requires no ++balancing algorithm whatsoever, as balancing occurs by proxy simply because ++all CPUs draw off the global runqueue, in priority and deadline order. 
Despite
++the fact that scalability is _not_ the prime concern of BFS, it both shows very
++good scalability to smaller numbers of CPUs and is likely a more scalable design
++at these numbers of CPUs.
++
++It also has some very low overhead scalability features built into the design
++when it has been deemed their overhead is so marginal that they're worth adding.
++The first is the local copy of the running process' data to the CPU it's running
++on to allow that data to be updated lockless where possible. Then there is
++deference paid to the last CPU a task was running on, by trying that CPU first
++when looking for an idle CPU to use the next time it's scheduled. Finally there
++is the notion of "sticky" tasks that are flagged when they are involuntarily
++descheduled, meaning they still want further CPU time. This sticky flag is
++used to bias heavily against those tasks being scheduled on a different CPU
++unless that CPU would be otherwise idle. When a cpu frequency governor is used
++that scales with CPU load, such as ondemand, sticky tasks are not scheduled
++on a different CPU at all, preferring instead to go idle. This means the CPU
++they were bound to is more likely to increase its speed while the other CPU
++will go idle, thus speeding up total task execution time and likely decreasing
++power usage. This is the only scenario where BFS will allow a CPU to go idle
++in preference to scheduling a task on the earliest available spare CPU.
++
++The real cost of migrating a task from one CPU to another is entirely dependent
++on the cache footprint of the task, how cache intensive the task is, how long
++it's been running on that CPU to take up the bulk of its cache, how big the CPU
++cache is, how fast and how layered the CPU cache is, how fast a context switch
++is... and so on. In other words, it's close to random in the real world where we
++do more than just one sole workload. The only thing we can be sure of is that
++it's not free. So BFS uses the principle that an idle CPU is a wasted CPU and
++utilising idle CPUs is more important than cache locality, and cache locality
++only plays a part after that.
++
++When choosing an idle CPU for a waking task, the cache locality is determined
++according to where the task last ran and then idle CPUs are ranked from best
++to worst to choose the most suitable idle CPU based on cache locality, NUMA
++node locality and hyperthread sibling busyness. They are chosen in the
++following preference (if idle):
++
++* Same core, idle or busy cache, idle threads
++* Other core, same cache, idle or busy cache, idle threads.
++* Same node, other CPU, idle cache, idle threads.
++* Same node, other CPU, busy cache, idle threads.
++* Same core, busy threads.
++* Other core, same cache, busy threads.
++* Same node, other CPU, busy threads.
++* Other node, other CPU, idle cache, idle threads.
++* Other node, other CPU, busy cache, idle threads.
++* Other node, other CPU, busy threads.
++
++This shows the SMT or "hyperthread" awareness in the design as well which will
++choose a real idle core first before a logical SMT sibling which already has
++tasks on the physical CPU.
++
++Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark.
++However this benchmarking was performed on an earlier design that was far less
++scalable than the current one so it's hard to know how scalable it is in terms
++of both CPUs (due to the global runqueue) and heavily loaded machines (due to
++O(n) lookup) at this stage. 
Note that in terms of scalability, the number of ++_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) ++quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark ++results are very promising indeed, without needing to tweak any knobs, features ++or options. Benchmark contributions are most welcome. ++ ++ ++Features ++ ++As the initial prime target audience for BFS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval ++and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition ++to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is ++support for CGROUPS. The average user should neither need to know what these ++are, nor should they need to be using them to have good desktop behaviour. ++ ++rr_interval ++ ++There is only one "scheduler" tunable, the round robin interval. This can be ++accessed in ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6ms. Valid values ++are from 1 to 1000. Decreasing the value will decrease latencies at the cost of ++decreasing throughput, while increasing it will improve throughput, but at the ++cost of worsening latencies. The accuracy of the rr interval is limited by HZ ++resolution of the kernel configuration. Thus, the worst case latencies are ++usually slightly higher than this actual value. BFS uses "dithering" to try and ++minimise the effect the Hz limitation has. The default value of 6 is not an ++arbitrary one. It is based on the fact that humans can detect jitter at ++approximately 7ms, so aiming for much lower latencies is pointless under most ++circumstances. It is worth noting this fact when comparing the latency ++performance of BFS to other schedulers. Worst case latencies being higher than ++7ms are far worse than average latencies not being in the microsecond range. ++Experimentation has shown that rr intervals being increased up to 300 can ++improve throughput but beyond that, scheduling noise from elsewhere prevents ++further demonstrable throughput. ++ ++Isochronous scheduling. ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of _total CPU_ available across the machine, configurable ++as a percentage in the following "resource handling" tunable (as opposed to a ++scheduler tunable): ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. 
It is calculated over a rolling 5 second average.
++Because it is the total CPU available, it means that on a multi CPU machine, it
++is possible to have an ISO task running as realtime scheduling indefinitely on
++just one CPU, as the other CPUs will be available. Setting this to 100 is the
++equivalent of giving all users SCHED_RR access and setting it to 0 removes the
++ability to run any pseudo-realtime tasks.
++
++A feature of BFS is that it detects when an application tries to obtain a
++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
++appropriate privileges to use those policies. When it detects this, it will
++give the task SCHED_ISO policy instead. Thus it is transparent to the user.
++Because some applications constantly set their policy as well as their nice
++level, there is potential for them to undo the override specified by the user
++on the command line of setting the policy to SCHED_ISO. To counter this, once
++a task has been set to SCHED_ISO policy, it needs superuser privileges to set
++it back to SCHED_NORMAL. This will ensure the task remains ISO and all child
++processes and threads will also inherit the ISO policy.
++
++Idleprio scheduling.
++
++Idleprio scheduling is a scheduling policy designed to give out CPU to a task
++_only_ when the CPU would be otherwise idle. The idea behind this is to allow
++ultra low priority tasks to be run in the background that have virtually no
++effect on the foreground tasks. This is ideally suited to distributed computing
++clients (like setiathome, folding, mprime etc) but can also be used to start
++a video encode or so on without any slowdown of other tasks. To avoid this
++policy from grabbing shared resources and holding them indefinitely, if it
++detects a state where the task is waiting on I/O, the machine is about to
++suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As
++per the Isochronous task management, once a task has been scheduled as IDLEPRIO,
++it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can
++be set to start as SCHED_IDLEPRIO with the schedtool command like so:
++
++ schedtool -D -e ./mprime
++
++Subtick accounting.
++
++It is surprisingly difficult to get accurate CPU accounting, and in many cases,
++the accounting is done by simply determining what is happening at the precise
++moment a timer tick fires off. This becomes increasingly inaccurate as the
++timer tick frequency (HZ) is lowered. It is possible to create an application
++which uses almost 100% CPU, yet by being descheduled at the right time, records
++zero CPU usage. While the main problem with this is that there are possible
++security implications, it is also difficult to determine how much CPU a task
++really does use. BFS tries to use the sub-tick accounting from the TSC clock,
++where possible, to determine real CPU usage. This is not entirely reliable, but
++is far more likely to produce accurate CPU usage data than the existing designs
++and will not show tasks as consuming no CPU usage when they actually are. Thus,
++the amount of CPU reported as being used by BFS will more accurately represent
++how much CPU the task itself is using (as is shown for example by the 'time'
++application), so the reported values may be quite different to other schedulers.
++Values reported as the 'load' are more prone to problems with this design, but
++per process values are closer to real usage. 
When comparing throughput of BFS ++to other designs, it is important to compare the actual completed work in terms ++of total wall clock time taken and total work done, rather than the reported ++"cpu usage". ++ ++ ++Con Kolivas Tue, 5 Apr 2011 +Index: linux-3.3-ck1/Documentation/sysctl/kernel.txt +=================================================================== +--- linux-3.3-ck1.orig/Documentation/sysctl/kernel.txt 2012-03-24 19:30:00.012420362 +1100 ++++ linux-3.3-ck1/Documentation/sysctl/kernel.txt 2012-03-24 19:30:29.039925758 +1100 +@@ -33,6 +33,7 @@ show up in /proc/sys/kernel: + - domainname + - hostname + - hotplug ++- iso_cpu + - kptr_restrict + - kstack_depth_to_print [ X86 only ] + - l2cr [ PPC only ] +@@ -59,6 +60,7 @@ show up in /proc/sys/kernel: + - randomize_va_space + - real-root-dev ==> Documentation/initrd.txt + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sem +@@ -301,6 +303,16 @@ kernel stack. + + ============================================================== + ++iso_cpu: (BFS CPU scheduler only). ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling five ++seconds over the -whole- system, meaning all cpus. ++ ++Set to 70 (percent) by default. ++ ++============================================================== ++ + l2cr: (PPC only) + + This flag controls the L2 cache of G3 processor boards. If +@@ -517,6 +529,20 @@ rebooting. ??? + + ============================================================== + ++rr_interval: (BFS CPU scheduler only) ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. Conversely decreasing it will decrease average and maximum ++latencies but at the expense of throughput. This value is in ++milliseconds and the default value chosen depends on the number of ++cpus available at scheduler initialisation with a minimum of 6. ++ ++Valid values are from 1-1000. ++ ++============================================================== ++ + rtsig-max & rtsig-nr: + + The file rtsig-max can be used to tune the maximum number +Index: linux-3.3-ck1/fs/proc/base.c +=================================================================== +--- linux-3.3-ck1.orig/fs/proc/base.c 2012-03-24 19:30:00.013420381 +1100 ++++ linux-3.3-ck1/fs/proc/base.c 2012-03-24 19:30:29.039925758 +1100 +@@ -342,7 +342,7 @@ static int proc_pid_stack(struct seq_fil + static int proc_pid_schedstat(struct task_struct *task, char *buffer) + { + return sprintf(buffer, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + } +Index: linux-3.3-ck1/include/linux/init_task.h +=================================================================== +--- linux-3.3-ck1.orig/include/linux/init_task.h 2012-03-24 19:30:00.013420381 +1100 ++++ linux-3.3-ck1/include/linux/init_task.h 2012-03-24 19:30:29.039925758 +1100 +@@ -125,12 +125,70 @@ extern struct cred init_cred; + # define INIT_PERF_EVENTS(tsk) + #endif + +-#define INIT_TASK_COMM "swapper" +- + /* + * INIT_TASK is used to set up the first task table, touch at + * your own risk!. 
Base=0, limit=0x1fffff (=2MB) + */ ++#ifdef CONFIG_SCHED_BFS ++#define INIT_TASK_COMM "BFS" ++#define INIT_TASK(tsk) \ ++{ \ ++ .state = 0, \ ++ .stack = &init_thread_info, \ ++ .usage = ATOMIC_INIT(2), \ ++ .flags = PF_KTHREAD, \ ++ .prio = NORMAL_PRIO, \ ++ .static_prio = MAX_PRIO-20, \ ++ .normal_prio = NORMAL_PRIO, \ ++ .deadline = 0, \ ++ .policy = SCHED_NORMAL, \ ++ .cpus_allowed = CPU_MASK_ALL, \ ++ .mm = NULL, \ ++ .active_mm = &init_mm, \ ++ .run_list = LIST_HEAD_INIT(tsk.run_list), \ ++ .time_slice = HZ, \ ++ .tasks = LIST_HEAD_INIT(tsk.tasks), \ ++ INIT_PUSHABLE_TASKS(tsk) \ ++ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ ++ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ ++ .real_parent = &tsk, \ ++ .parent = &tsk, \ ++ .children = LIST_HEAD_INIT(tsk.children), \ ++ .sibling = LIST_HEAD_INIT(tsk.sibling), \ ++ .group_leader = &tsk, \ ++ RCU_INIT_POINTER(.real_cred, &init_cred), \ ++ RCU_INIT_POINTER(.cred, &init_cred), \ ++ .comm = INIT_TASK_COMM, \ ++ .thread = INIT_THREAD, \ ++ .fs = &init_fs, \ ++ .files = &init_files, \ ++ .signal = &init_signals, \ ++ .sighand = &init_sighand, \ ++ .nsproxy = &init_nsproxy, \ ++ .pending = { \ ++ .list = LIST_HEAD_INIT(tsk.pending.list), \ ++ .signal = {{0}}}, \ ++ .blocked = {{0}}, \ ++ .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ ++ .journal_info = NULL, \ ++ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ ++ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ ++ .timer_slack_ns = 50000, /* 50 usec default slack */ \ ++ .pids = { \ ++ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ ++ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ ++ [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ ++ }, \ ++ INIT_IDS \ ++ INIT_PERF_EVENTS(tsk) \ ++ INIT_TRACE_IRQFLAGS \ ++ INIT_LOCKDEP \ ++ INIT_FTRACE_GRAPH \ ++ INIT_TRACE_RECURSION \ ++ INIT_TASK_RCU_PREEMPT(tsk) \ ++} ++#else /* CONFIG_SCHED_BFS */ ++#define INIT_TASK_COMM "swapper" + #define INIT_TASK(tsk) \ + { \ + .state = 0, \ +@@ -193,7 +251,7 @@ extern struct cred init_cred; + INIT_TRACE_RECURSION \ + INIT_TASK_RCU_PREEMPT(tsk) \ + } +- ++#endif /* CONFIG_SCHED_BFS */ + + #define INIT_CPU_TIMERS(cpu_timers) \ + { \ +Index: linux-3.3-ck1/include/linux/ioprio.h +=================================================================== +--- linux-3.3-ck1.orig/include/linux/ioprio.h 2012-03-24 19:30:00.013420381 +1100 ++++ linux-3.3-ck1/include/linux/ioprio.h 2012-03-24 19:30:29.039925758 +1100 +@@ -64,6 +64,8 @@ static inline int task_ioprio_class(stru + + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (iso_task(task)) ++ return 0; + return (task_nice(task) + 20) / 5; + } + +Index: linux-3.3-ck1/include/linux/sched.h +=================================================================== +--- linux-3.3-ck1.orig/include/linux/sched.h 2012-03-24 19:30:00.013420381 +1100 ++++ linux-3.3-ck1/include/linux/sched.h 2012-03-24 19:34:53.640769520 +1100 +@@ -37,8 +37,15 @@ + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented on BFS only */ + #define SCHED_IDLE 5 ++#define SCHED_IDLEPRIO SCHED_IDLE ++#ifdef CONFIG_SCHED_BFS ++#define SCHED_ISO 4 ++#define SCHED_MAX (SCHED_IDLEPRIO) ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++#endif ++ + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ + #define SCHED_RESET_ON_FORK 0x40000000 + +@@ -269,8 +276,6 @@ extern asmlinkage void schedule_tail(str + extern void init_idle(struct task_struct *idle, int 
cpu); + extern void init_idle_bootup_task(struct task_struct *idle); + +-extern int runqueue_is_locked(int cpu); +- + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) + extern void select_nohz_load_balancer(int stop_tick); + extern void set_cpu_sd_state_idle(void); +@@ -1243,15 +1248,31 @@ struct task_struct { + + #ifdef CONFIG_SMP + struct llist_node wake_entry; +- int on_cpu; + #endif +- int on_rq; ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_BFS) ++ bool on_cpu; ++#endif ++#ifndef CONFIG_SCHED_BFS ++ bool on_rq; ++#endif + + int prio, static_prio, normal_prio; + unsigned int rt_priority; ++#ifdef CONFIG_SCHED_BFS ++ int time_slice; ++ u64 deadline; ++ struct list_head run_list; ++ u64 last_ran; ++ u64 sched_time; /* sched_clock time spent running */ ++#ifdef CONFIG_SMP ++ bool sticky; /* Soft affined flag */ ++#endif ++ unsigned long rt_timeout; ++#else /* CONFIG_SCHED_BFS */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#endif + + #ifdef CONFIG_PREEMPT_NOTIFIERS + /* list of struct preempt_notifier: */ +@@ -1358,6 +1379,9 @@ struct task_struct { + int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ + + cputime_t utime, stime, utimescaled, stimescaled; ++#ifdef CONFIG_SCHED_BFS ++ unsigned long utime_pc, stime_pc; ++#endif + cputime_t gtime; + #ifndef CONFIG_VIRT_CPU_ACCOUNTING + cputime_t prev_utime, prev_stime; +@@ -1592,6 +1616,64 @@ struct task_struct { + #endif + }; + ++#ifdef CONFIG_SCHED_BFS ++bool grunqueue_is_locked(void); ++void grq_unlock_wait(void); ++void cpu_scaling(int cpu); ++void cpu_nonscaling(int cpu); ++bool above_background_load(void); ++#define tsk_seruntime(t) ((t)->sched_time) ++#define tsk_rttimeout(t) ((t)->rt_timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++} ++ ++static inline int runqueue_is_locked(int cpu) ++{ ++ return grunqueue_is_locked(); ++} ++ ++void print_scheduler_version(void); ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return (p->policy == SCHED_ISO); ++} ++#else /* CFS */ ++extern int runqueue_is_locked(int cpu); ++static inline void cpu_scaling(int cpu) ++{ ++} ++ ++static inline void cpu_nonscaling(int cpu) ++{ ++} ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++ p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO"CFS CPU scheduler.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return false; ++} ++ ++/* Anyone feel like implementing this? */ ++static inline bool above_background_load(void) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_BFS */ ++ + /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ + #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) + +@@ -1609,10 +1691,20 @@ struct task_struct { + */ + + #define MAX_USER_RT_PRIO 100 +-#define MAX_RT_PRIO MAX_USER_RT_PRIO ++#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) ++#define DEFAULT_PRIO (MAX_RT_PRIO + 20) + ++#ifdef CONFIG_SCHED_BFS ++#define PRIO_RANGE (40) ++#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) ++#define ISO_PRIO (MAX_RT_PRIO) ++#define NORMAL_PRIO (MAX_RT_PRIO + 1) ++#define IDLE_PRIO (MAX_RT_PRIO + 2) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* CONFIG_SCHED_BFS */ + #define MAX_PRIO (MAX_RT_PRIO + 40) +-#define DEFAULT_PRIO (MAX_RT_PRIO + 20) ++#define NORMAL_PRIO DEFAULT_PRIO ++#endif /* CONFIG_SCHED_BFS */ + + static inline int rt_prio(int prio) + { +@@ -1976,7 +2068,7 @@ extern unsigned long long + task_sched_runtime(struct task_struct *task); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BFS) + extern void sched_exec(void); + #else + #define sched_exec() {} +@@ -2668,7 +2760,7 @@ static inline unsigned int task_cpu(cons + return 0; + } + +-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) ++static inline void set_task_cpu(struct task_struct *p, int cpu) + { + } + +Index: linux-3.3-ck1/init/Kconfig +=================================================================== +--- linux-3.3-ck1.orig/init/Kconfig 2012-03-24 19:30:00.013420381 +1100 ++++ linux-3.3-ck1/init/Kconfig 2012-03-24 19:30:29.040925775 +1100 +@@ -29,6 +29,19 @@ config IRQ_WORK + + menu "General setup" + ++config SCHED_BFS ++ bool "BFS cpu scheduler" ++ ---help--- ++ The Brain Fuck CPU Scheduler for excellent interactivity and ++ responsiveness on the desktop and solid scalability on normal ++ hardware. Not recommended for 4096 CPUs. ++ ++ Currently incompatible with the Group CPU scheduler, and RCU TORTURE ++ TEST so these options are disabled. ++ ++ Say Y here. ++ default y ++ + config EXPERIMENTAL + bool "Prompt for development and/or incomplete code/drivers" + ---help--- +@@ -640,6 +653,7 @@ config PROC_PID_CPUSET + + config CGROUP_CPUACCT + bool "Simple CPU accounting cgroup subsystem" ++ depends on !SCHED_BFS + help + Provides a simple Resource Controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
+@@ -727,6 +741,7 @@ config CGROUP_PERF + + menuconfig CGROUP_SCHED + bool "Group CPU scheduler" ++ depends on !SCHED_BFS + default n + help + This feature lets CPU scheduler recognize task groups and control CPU +@@ -863,6 +878,7 @@ endif # NAMESPACES + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_BFS + select EVENTFD + select CGROUPS + select CGROUP_SCHED +Index: linux-3.3-ck1/init/main.c +=================================================================== +--- linux-3.3-ck1.orig/init/main.c 2012-03-24 19:30:00.013420381 +1100 ++++ linux-3.3-ck1/init/main.c 2012-03-24 19:30:29.041925792 +1100 +@@ -757,6 +757,7 @@ static noinline int init_post(void) + system_state = SYSTEM_RUNNING; + numa_default_policy(); + ++ print_scheduler_version(); + + current->signal->flags |= SIGNAL_UNKILLABLE; + +Index: linux-3.3-ck1/kernel/delayacct.c +=================================================================== +--- linux-3.3-ck1.orig/kernel/delayacct.c 2012-03-24 19:30:00.014420399 +1100 ++++ linux-3.3-ck1/kernel/delayacct.c 2012-03-24 19:30:29.041925792 +1100 +@@ -130,7 +130,7 @@ int __delayacct_add_tsk(struct taskstats + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +Index: linux-3.3-ck1/kernel/exit.c +=================================================================== +--- linux-3.3-ck1.orig/kernel/exit.c 2012-03-24 19:30:00.014420399 +1100 ++++ linux-3.3-ck1/kernel/exit.c 2012-03-24 19:30:29.041925792 +1100 +@@ -132,7 +132,7 @@ static void __exit_signal(struct task_st + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + } + + sig->nr_threads--; +Index: linux-3.3-ck1/kernel/posix-cpu-timers.c +=================================================================== +--- linux-3.3-ck1.orig/kernel/posix-cpu-timers.c 2012-03-24 19:30:00.014420399 +1100 ++++ linux-3.3-ck1/kernel/posix-cpu-timers.c 2012-03-24 19:30:29.042925809 +1100 +@@ -495,7 +495,7 @@ static void cleanup_timers(struct list_h + void posix_cpu_timers_exit(struct task_struct *tsk) + { + cleanup_timers(tsk->cpu_timers, +- tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); ++ tsk->utime, tsk->stime, tsk_seruntime(tsk)); + + } + void posix_cpu_timers_exit_group(struct task_struct *tsk) +@@ -504,7 +504,7 @@ void posix_cpu_timers_exit_group(struct + + cleanup_timers(tsk->signal->cpu_timers, + tsk->utime + sig->utime, tsk->stime + sig->stime, +- tsk->se.sum_exec_runtime + sig->sum_sched_runtime); ++ tsk_seruntime(tsk) + sig->sum_sched_runtime); + } + + static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) +@@ -934,7 +934,7 @@ static void check_thread_timers(struct t + struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list, + entry); +- if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { ++ if (!--maxfire || tsk_seruntime(tsk) < t->expires.sched) { + tsk->cputime_expires.sched_exp = t->expires.sched; + break; + } +@@ -951,7 +951,7 @@ static void check_thread_timers(struct t + ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); + + if (hard != RLIM_INFINITY && +- tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { ++ tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + /* + * At the hard limit, we just die. 
+ * No need to calculate anything else now. +@@ -959,7 +959,7 @@ static void check_thread_timers(struct t + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } +- if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { ++ if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + /* + * At the soft limit, send a SIGXCPU every second. + */ +@@ -1252,7 +1252,7 @@ static inline int fastpath_timer_check(s + struct task_cputime task_sample = { + .utime = tsk->utime, + .stime = tsk->stime, +- .sum_exec_runtime = tsk->se.sum_exec_runtime ++ .sum_exec_runtime = tsk_seruntime(tsk) + }; + + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) +Index: linux-3.3-ck1/kernel/sysctl.c +=================================================================== +--- linux-3.3-ck1.orig/kernel/sysctl.c 2012-03-24 19:30:00.013420381 +1100 ++++ linux-3.3-ck1/kernel/sysctl.c 2012-03-24 19:30:29.042925809 +1100 +@@ -121,7 +121,12 @@ static int __maybe_unused one = 1; + static int __maybe_unused two = 2; + static int __maybe_unused three = 3; + static unsigned long one_ul = 1; +-static int one_hundred = 100; ++static int __maybe_unused one_hundred = 100; ++#ifdef CONFIG_SCHED_BFS ++extern int rr_interval; ++extern int sched_iso_cpu; ++static int __read_mostly one_thousand = 1000; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -251,7 +256,7 @@ static struct ctl_table root_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -266,6 +271,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_BFS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -383,6 +389,7 @@ static struct ctl_table kern_table[] = { + .extra1 = &one, + }, + #endif ++#endif /* !CONFIG_SCHED_BFS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -850,6 +857,26 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_BFS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +Index: linux-3.3-ck1/lib/Kconfig.debug +=================================================================== +--- linux-3.3-ck1.orig/lib/Kconfig.debug 2012-03-24 19:30:00.012420362 +1100 ++++ linux-3.3-ck1/lib/Kconfig.debug 2012-03-24 19:30:29.042925809 +1100 +@@ -875,7 +875,7 @@ config BOOT_PRINTK_DELAY + + config RCU_TORTURE_TEST + tristate "torture tests for RCU" +- depends on DEBUG_KERNEL ++ depends on DEBUG_KERNEL && !SCHED_BFS + default n + help + This option provides a kernel module that runs torture tests +Index: linux-3.3-ck1/include/linux/jiffies.h +=================================================================== +--- linux-3.3-ck1.orig/include/linux/jiffies.h 2012-03-24 19:30:00.012420362 +1100 ++++ linux-3.3-ck1/include/linux/jiffies.h 2012-03-24 
19:30:29.043925827 +1100 +@@ -164,7 +164,7 @@ static inline u64 get_jiffies_64(void) + * Have the 32 bit jiffies value wrap 5 minutes after boot + * so jiffies wrap bugs show up earlier. + */ +-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) ++#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) + + /* + * Change timeval to jiffies, trying to avoid the +Index: linux-3.3-ck1/drivers/cpufreq/cpufreq.c +=================================================================== +--- linux-3.3-ck1.orig/drivers/cpufreq/cpufreq.c 2012-03-24 19:30:00.012420362 +1100 ++++ linux-3.3-ck1/drivers/cpufreq/cpufreq.c 2012-03-24 19:30:29.043925827 +1100 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -1445,6 +1446,12 @@ int __cpufreq_driver_target(struct cpufr + target_freq, relation); + if (cpu_online(policy->cpu) && cpufreq_driver->target) + retval = cpufreq_driver->target(policy, target_freq, relation); ++ if (likely(retval != -EINVAL)) { ++ if (target_freq == policy->max) ++ cpu_nonscaling(policy->cpu); ++ else ++ cpu_scaling(policy->cpu); ++ } + + return retval; + } +Index: linux-3.3-ck1/drivers/cpufreq/cpufreq_ondemand.c +=================================================================== +--- linux-3.3-ck1.orig/drivers/cpufreq/cpufreq_ondemand.c 2012-03-24 19:30:00.012420362 +1100 ++++ linux-3.3-ck1/drivers/cpufreq/cpufreq_ondemand.c 2012-03-24 19:30:29.043925827 +1100 +@@ -28,8 +28,8 @@ + * It helps to keep variable names smaller, simpler + */ + +-#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (10) +-#define DEF_FREQUENCY_UP_THRESHOLD (80) ++#define DEF_FREQUENCY_DOWN_DIFFERENTIAL (26) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (100000) + #define MICRO_FREQUENCY_DOWN_DIFFERENTIAL (3) +@@ -416,10 +416,10 @@ static void dbs_check_cpu(struct cpu_dbs + + /* + * Every sampling_rate, we check, if current idle time is less +- * than 20% (default), then we try to increase frequency ++ * than 37% (default), then we try to increase frequency + * Every sampling_rate, we look for a the lowest + * frequency which can sustain the load while keeping idle time over +- * 30%. If such a frequency exist, we try to decrease to this frequency. ++ * 63%. If such a frequency exist, we try to decrease to this frequency. + * + * Any frequency increase takes it to the maximum frequency. + * Frequency reduction happens at minimum steps of +Index: linux-3.3-ck1/drivers/cpufreq/cpufreq_conservative.c +=================================================================== +--- linux-3.3-ck1.orig/drivers/cpufreq/cpufreq_conservative.c 2012-03-24 19:30:00.012420362 +1100 ++++ linux-3.3-ck1/drivers/cpufreq/cpufreq_conservative.c 2012-03-24 19:30:29.043925827 +1100 +@@ -29,8 +29,8 @@ + * It helps to keep variable names smaller, simpler + */ + +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + + /* + * The polling frequency of this governor depends on the capability of +Index: linux-3.3-ck1/arch/x86/Kconfig +=================================================================== +--- linux-3.3-ck1.orig/arch/x86/Kconfig 2012-03-24 19:30:00.013420381 +1100 ++++ linux-3.3-ck1/arch/x86/Kconfig 2012-03-24 19:34:53.659769871 +1100 +@@ -806,15 +806,7 @@ config SCHED_MC + increased overhead in some places. If unsure say N here. 
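The cpufreq_ondemand and cpufreq_conservative hunks a little further up retune the default thresholds (up threshold 80 -> 63, down differential/threshold to 26), which is what the rewritten comments mean by raising the frequency when idle falls below 37% and only scaling down when idle stays above 63%. A tiny standalone sketch (plain C, not governor code) showing how those two numbers fall out of the new defaults:

#include <stdio.h>

int main(void)
{
    int up_threshold = 63;       /* DEF_FREQUENCY_UP_THRESHOLD in the patch */
    int down_differential = 26;  /* DEF_FREQUENCY_DOWN_DIFFERENTIAL */

    /* Frequency is raised once busy time exceeds up_threshold percent,
     * i.e. once idle time drops below this: */
    printf("raise frequency when idle < %d%%\n", 100 - up_threshold);

    /* When scaling down, the governor targets a load of
     * (up_threshold - down_differential) percent, i.e. idle above: */
    printf("scale down only while idle > %d%%\n",
           100 - (up_threshold - down_differential));
    return 0;
}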
+ + config IRQ_TIME_ACCOUNTING +- bool "Fine granularity task level IRQ time accounting" +- default n +- ---help--- +- Select this option to enable fine granularity task irq time +- accounting. This is done by reading a timestamp on each +- transitions between softirq and hardirq state, so there can be a +- small performance impact. +- +- If in doubt, say N here. ++ def_bool y + + source "kernel/Kconfig.preempt" + +@@ -1112,7 +1104,7 @@ endchoice + + choice + depends on EXPERIMENTAL +- prompt "Memory split" if EXPERT ++ prompt "Memory split" + default VMSPLIT_3G + depends on X86_32 + ---help--- +@@ -1132,17 +1124,17 @@ choice + option alone! + + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !X86_PAE +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !X86_PAE +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +Index: linux-3.3-ck1/kernel/sched/bfs.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-3.3-ck1/kernel/sched/bfs.c 2012-03-24 19:30:29.047925897 +1100 +@@ -0,0 +1,7251 @@ ++/* ++ * kernel/sched/bfs.c, was kernel/sched.c ++ * ++ * Kernel scheduler and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and ++ * make semaphores SMP safe ++ * 1998-11-19 Implemented schedule_timeout() and related stuff ++ * by Andrea Arcangeli ++ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: ++ * hybrid priority-list and round-robin design with ++ * an array-switch method of distributing timeslices ++ * and per-CPU runqueues. Cleanups and useful suggestions ++ * by Davide Libenzi, preemptible kernel bits by Robert Love. ++ * 2003-09-03 Interactivity tuning by Con Kolivas. ++ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-04-15 Work begun on replacing all interactivity tuning with a ++ * fair scheduling design by Con Kolivas. ++ * 2007-05-05 Load balancing (smp-nice) and other improvements ++ * by Peter Williams ++ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith ++ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri ++ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, ++ * Thomas Gleixner, Mike Kravetz ++ * now Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#ifdef CONFIG_PARAVIRT ++#include ++#endif ++ ++#include "cpupri.h" ++#include "../workqueue_sched.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_queue(rq) rt_prio((rq)->rq_prio) ++#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) ++#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) ++#define iso_task(p) unlikely((p)->policy == SCHED_ISO) ++#define iso_queue(rq) unlikely((rq)->rq_policy == SCHED_ISO) ++#define rq_running_iso(rq) ((rq)->rq_prio == ISO_PRIO) ++ ++#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) ++ ++/* ++ * Convert user-nice values [ -20 ... 0 ... 19 ] ++ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], ++ * and back. ++ */ ++#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) ++#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) ++#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) ++ ++/* ++ * 'User priority' is the nice value converted to something we ++ * can work with better when scaling various scheduler parameters, ++ * it's a [ 0 ... 39 ] range. ++ */ ++#define USER_PRIO(p) ((p) - MAX_RT_PRIO) ++#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) ++#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) ++#define SCHED_PRIO(p) ((p) + MAX_RT_PRIO) ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) ++#define JIFFY_NS (1000000000 / HZ) ++#define HALF_JIFFY_NS (1000000000 / HZ / 2) ++#define HALF_JIFFY_US (1000000 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "BFS CPU scheduler v0.420 by Con Kolivas.\n"); ++} ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run five seconds as real time tasks. This is the total over ++ * all online cpus. ++ */ ++int sched_iso_cpu __read_mostly = 70; ++ ++/* ++ * The relative length of deadline for each priority(nice) level. ++ */ ++static int prio_ratios[PRIO_RANGE] __read_mostly; ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++/* ++ * The global runqueue data that all CPUs work off. 
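The scale helpers just above deliberately use shifts of 10 and 20 in place of multiplying or dividing by 1000 and 1000000, so a "millisecond" becomes 1024 microseconds and the error stays within a few percent; with the default rr_interval of 6 ms, timeslice() therefore hands out 6144 us per refill. A standalone sketch (userspace, illustrative only) of that arithmetic:

#include <stdio.h>

#define MS_TO_US(t)  ((t) << 10)   /* ~ * 1000, actually * 1024 */
#define MS_TO_NS(t)  ((t) << 20)   /* ~ * 1000000 */
#define NS_TO_US(t)  ((t) >> 10)

int main(void)
{
    int rr_interval = 6;  /* default rr_interval from the patch, in ms */

    printf("default timeslice : %d us (nominally %d us)\n",
           MS_TO_US(rr_interval), rr_interval * 1000);      /* 6144 vs 6000 */
    printf("6 ms as 'ns'      : %d (nominally 6000000)\n", MS_TO_NS(6));
    printf("back to 'us'      : %d\n", NS_TO_US(MS_TO_NS(6)));
    return 0;
}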
Data is protected either ++ * by the global grq lock, or the discrete lock that precedes the data in this ++ * struct. ++ */ ++struct global_rq { ++ raw_spinlock_t lock; ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ unsigned long long nr_switches; ++ struct list_head queue[PRIO_LIMIT]; ++ DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1); ++#ifdef CONFIG_SMP ++ unsigned long qnr; /* queued not running */ ++ cpumask_t cpu_idle_map; ++ bool idle_cpus; ++#endif ++ int noc; /* num_online_cpus stored and updated when it changes */ ++ u64 niffies; /* Nanosecond jiffies */ ++ unsigned long last_jiffy; /* Last jiffy we updated niffies */ ++ ++ raw_spinlock_t iso_lock; ++ int iso_ticks; ++ bool iso_refractory; ++}; ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * We add the notion of a root-domain which will be used to define per-domain ++ * variables. Each exclusive cpuset essentially defines an island domain by ++ * fully partitioning the member cpus from any other cpuset. Whenever a new ++ * exclusive cpuset is created, we also create and attach a new root-domain ++ * object. ++ * ++ */ ++struct root_domain { ++ atomic_t refcount; ++ atomic_t rto_count; ++ struct rcu_head rcu; ++ cpumask_var_t span; ++ cpumask_var_t online; ++ ++ /* ++ * The "RT overload" flag: it gets set if a CPU has more than ++ * one runnable RT task. ++ */ ++ cpumask_var_t rto_mask; ++ struct cpupri cpupri; ++}; ++ ++/* ++ * By default the system creates a single root-domain with all cpus as ++ * members (mimicking the global state we have today). ++ */ ++static struct root_domain def_root_domain; ++ ++#endif /* CONFIG_SMP */ ++ ++/* There can be only one */ ++static struct global_rq grq; ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ ++ u64 nohz_stamp; ++ unsigned char in_nohz_recently; ++#endif ++#endif ++ ++ struct task_struct *curr, *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ /* Stored data about rq->curr to work outside grq lock */ ++ u64 rq_deadline; ++ unsigned int rq_policy; ++ int rq_time_slice; ++ u64 rq_last_ran; ++ int rq_prio; ++ bool rq_running; /* There is a task running */ ++ ++ /* Accurate timekeeping data */ ++ u64 timekeep_clock; ++ unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc, ++ iowait_pc, idle_pc; ++ long account_pc; ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ bool scaling; /* This CPU is managed by a scaling CPU freq governor */ ++ struct task_struct *sticky_task; ++ ++ struct root_domain *rd; ++ struct sched_domain *sd; ++ int *cpu_locality; /* CPU relative cache distance */ ++#ifdef CONFIG_SCHED_SMT ++ bool (*siblings_idle)(int cpu); ++ /* See if all smt siblings are idle */ ++ cpumask_t smt_siblings; ++#endif ++#ifdef CONFIG_SCHED_MC ++ bool (*cache_idle)(int cpu); ++ /* See if all cache siblings are idle */ ++ cpumask_t cache_siblings; ++#endif ++ u64 last_niffy; /* Last time this RQ updated grq.niffies */ ++#endif ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif ++ ++ u64 clock, old_clock, last_tick; ++ u64 clock_task; ++ bool dither; ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif ++}; ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++#ifdef CONFIG_SMP ++/* ++ * sched_domains_mutex serialises calls to init_sched_domains, ++ * detach_destroy_domains and partition_sched_domains. ++ */ ++static DEFINE_MUTEX(sched_domains_mutex); ++ ++/* ++ * By default the system creates a single root-domain with all cpus as ++ * members (mimicking the global state we have today). ++ */ ++static struct root_domain def_root_domain; ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#endif ++ ++#define rcu_dereference_check_sched_domain(p) \ ++ rcu_dereference_check((p), \ ++ lockdep_is_held(&sched_domains_mutex)) ++ ++/* ++ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. ++ * See detach_destroy_domains: synchronize_sched for details. ++ * ++ * The domain tree of any CPU may only be accessed from within ++ * preempt-disabled sections. ++ */ ++#define for_each_domain(cpu, __sd) \ ++ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) ++ ++static inline void update_rq_clock(struct rq *rq); ++ ++/* ++ * Sanity check should sched_clock return bogus values. We make sure it does ++ * not appear to go backwards, and use jiffies to determine the maximum and ++ * minimum it could possibly have increased, and round down to the nearest ++ * jiffy when it falls outside this. ++ */ ++static inline void niffy_diff(s64 *niff_diff, int jiff_diff) ++{ ++ unsigned long min_diff, max_diff; ++ ++ if (jiff_diff > 1) ++ min_diff = JIFFIES_TO_NS(jiff_diff - 1); ++ else ++ min_diff = 1; ++ /* Round up to the nearest tick for maximum */ ++ max_diff = JIFFIES_TO_NS(jiff_diff + 1); ++ ++ if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff)) ++ *niff_diff = min_diff; ++} ++ ++#ifdef CONFIG_SMP ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() (&__get_cpu_var(runqueues)) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++ ++/* ++ * Niffies are a globally increasing nanosecond counter. Whenever a runqueue ++ * clock is updated with the grq.lock held, it is an opportunity to update the ++ * niffies value. Any CPU can update it by adding how much its clock has ++ * increased since it last updated niffies, minus any added niffies by other ++ * CPUs. 
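niffy_diff() above is the guard that keeps the niffies counter sane when sched_clock() misbehaves: the measured nanosecond delta must lie within one jiffy either side of what the elapsed jiffies allow, otherwise it is thrown away and replaced by the minimum credible value. A standalone restatement (userspace, with HZ assumed to be 1000 purely for the example):

#include <stdio.h>
#include <stdint.h>

#define HZ               1000
#define JIFFIES_TO_NS(t) ((uint64_t)(t) * (1000000000 / HZ))

static void niffy_diff(int64_t *niff_diff, int jiff_diff)
{
    uint64_t min_diff, max_diff;

    if (jiff_diff > 1)
        min_diff = JIFFIES_TO_NS(jiff_diff - 1);
    else
        min_diff = 1;
    max_diff = JIFFIES_TO_NS(jiff_diff + 1);  /* round up one tick */

    if (*niff_diff < (int64_t)min_diff || *niff_diff > (int64_t)max_diff)
        *niff_diff = min_diff;
}

int main(void)
{
    int64_t sane = 3500000, bogus = -42;  /* ns deltas over ~3 jiffies */

    niffy_diff(&sane, 3);
    niffy_diff(&bogus, 3);
    printf("plausible delta kept : %lld ns\n", (long long)sane);   /* 3500000 */
    printf("bogus delta clamped  : %lld ns\n", (long long)bogus);  /* 2000000 */
    return 0;
}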
++ */ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff; ++ long jdiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ /* old_clock is only updated when we are updating niffies */ ++ rq->old_clock = rq->clock; ++ ndiff -= grq.niffies - rq->last_niffy; ++ jdiff = jiffies - grq.last_jiffy; ++ niffy_diff(&ndiff, jdiff); ++ grq.last_jiffy += jdiff; ++ grq.niffies += ndiff; ++ rq->last_niffy = grq.niffies; ++} ++#else /* CONFIG_SMP */ ++static struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff; ++ long jdiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ rq->old_clock = rq->clock; ++ jdiff = jiffies - grq.last_jiffy; ++ niffy_diff(&ndiff, jdiff); ++ grq.last_jiffy += jdiff; ++ grq.niffies += ndiff; ++} ++#endif ++#define raw_rq() (&__raw_get_cpu_var(runqueues)) ++ ++#include "stats.h" ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_switch ++# define finish_arch_switch(prev) do { } while (0) ++#endif ++ ++/* ++ * All common locking functions performed on grq.lock. rq->clock is local to ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under grq.lock to be safe. ++ */ ++static void update_rq_clock_task(struct rq *rq, s64 delta); ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++static inline void grq_lock(void) ++ __acquires(grq.lock) ++{ ++ raw_spin_lock(&grq.lock); ++} ++ ++static inline void grq_unlock(void) ++ __releases(grq.lock) ++{ ++ raw_spin_unlock(&grq.lock); ++} ++ ++static inline void grq_lock_irq(void) ++ __acquires(grq.lock) ++{ ++ raw_spin_lock_irq(&grq.lock); ++} ++ ++static inline void time_lock_grq(struct rq *rq) ++ __acquires(grq.lock) ++{ ++ grq_lock(); ++ update_clocks(rq); ++} ++ ++static inline void grq_unlock_irq(void) ++ __releases(grq.lock) ++{ ++ raw_spin_unlock_irq(&grq.lock); ++} ++ ++static inline void grq_lock_irqsave(unsigned long *flags) ++ __acquires(grq.lock) ++{ ++ raw_spin_lock_irqsave(&grq.lock, *flags); ++} ++ ++static inline void grq_unlock_irqrestore(unsigned long *flags) ++ __releases(grq.lock) ++{ ++ raw_spin_unlock_irqrestore(&grq.lock, *flags); ++} ++ ++static inline struct rq ++*task_grq_lock(struct task_struct *p, unsigned long *flags) ++ __acquires(grq.lock) ++{ ++ grq_lock_irqsave(flags); ++ return task_rq(p); ++} ++ ++static inline struct rq ++*time_task_grq_lock(struct task_struct *p, unsigned long *flags) ++ __acquires(grq.lock) ++{ ++ struct rq *rq = task_grq_lock(p, flags); ++ update_clocks(rq); ++ return rq; ++} ++ ++static inline struct rq *task_grq_lock_irq(struct task_struct *p) ++ __acquires(grq.lock) ++{ ++ grq_lock_irq(); ++ return task_rq(p); ++} ++ ++static inline void time_task_grq_lock_irq(struct task_struct *p) ++ __acquires(grq.lock) ++{ ++ struct rq *rq = task_grq_lock_irq(p); ++ update_clocks(rq); ++} ++ ++static inline void task_grq_unlock_irq(void) ++ __releases(grq.lock) ++{ ++ grq_unlock_irq(); ++} ++ ++static inline void task_grq_unlock(unsigned long 
*flags) ++ __releases(grq.lock) ++{ ++ grq_unlock_irqrestore(flags); ++} ++ ++/** ++ * grunqueue_is_locked ++ * ++ * Returns true if the global runqueue is locked. ++ * This interface allows printk to be called with the runqueue lock ++ * held and know whether or not it is OK to wake up the klogd. ++ */ ++bool grunqueue_is_locked(void) ++{ ++ return raw_spin_is_locked(&grq.lock); ++} ++ ++void grq_unlock_wait(void) ++ __releases(grq.lock) ++{ ++ smp_mb(); /* spin-unlock-wait is not a full memory barrier */ ++ raw_spin_unlock_wait(&grq.lock); ++} ++ ++static inline void time_grq_lock(struct rq *rq, unsigned long *flags) ++ __acquires(grq.lock) ++{ ++ local_irq_save(*flags); ++ time_lock_grq(rq); ++} ++ ++static inline struct rq *__task_grq_lock(struct task_struct *p) ++ __acquires(grq.lock) ++{ ++ grq_lock(); ++ return task_rq(p); ++} ++ ++static inline void __task_grq_unlock(void) ++ __releases(grq.lock) ++{ ++ grq_unlock(); ++} ++ ++/* ++ * Look for any tasks *anywhere* that are running nice 0 or better. We do ++ * this lockless for overhead reasons since the occasional wrong result ++ * is harmless. ++ */ ++bool above_background_load(void) ++{ ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ struct task_struct *cpu_curr = cpu_rq(cpu)->curr; ++ ++ if (unlikely(!cpu_curr)) ++ continue; ++ if (PRIO_TO_NICE(cpu_curr->static_prio) < 1) { ++ return true; ++ } ++ } ++ return false; ++} ++ ++#ifndef __ARCH_WANT_UNLOCKED_CTXSW ++static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ grq.lock.owner = current; ++#endif ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_); ++ ++ grq_unlock_irq(); ++} ++ ++#else /* __ARCH_WANT_UNLOCKED_CTXSW */ ++ ++static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW ++ grq_unlock_irq(); ++#else ++ grq_unlock(); ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++ smp_wmb(); ++#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW ++ local_irq_enable(); ++#endif ++} ++#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ ++ ++static inline bool deadline_before(u64 deadline, u64 time) ++{ ++ return (deadline < time); ++} ++ ++static inline bool deadline_after(u64 deadline, u64 time) ++{ ++ return (deadline > time); ++} ++ ++/* ++ * A task that is queued but not running will be on the grq run list. ++ * A task that is not running or queued will not be on the grq run list. ++ * A task that is currently running will have ->on_cpu set but not on the ++ * grq run list. ++ */ ++static inline bool task_queued(struct task_struct *p) ++{ ++ return (!list_empty(&p->run_list)); ++} ++ ++/* ++ * Removing from the global runqueue. Enter with grq locked. ++ */ ++static void dequeue_task(struct task_struct *p) ++{ ++ list_del_init(&p->run_list); ++ if (list_empty(grq.queue + p->prio)) ++ __clear_bit(p->prio, grq.prio_bitmap); ++} ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as ++ * an idle task, we ensure none of the following conditions are met. 
++ */ ++static bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!freezing(p) && !signal_pending(p) && ++ !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); ++} ++ ++/* ++ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check ++ * that the iso_refractory flag is not set. ++ */ ++static bool isoprio_suitable(void) ++{ ++ return !grq.iso_refractory; ++} ++ ++/* ++ * Adding to the global runqueue. Enter with grq locked. ++ */ ++static void enqueue_task(struct task_struct *p) ++{ ++ if (!rt_task(p)) { ++ /* Check it hasn't gotten rt from PI */ ++ if ((idleprio_task(p) && idleprio_suitable(p)) || ++ (iso_task(p) && isoprio_suitable())) ++ p->prio = p->normal_prio; ++ else ++ p->prio = NORMAL_PRIO; ++ } ++ __set_bit(p->prio, grq.prio_bitmap); ++ list_add_tail(&p->run_list, grq.queue + p->prio); ++ sched_info_queued(p); ++} ++ ++/* Only idle task does this as a real time task*/ ++static inline void enqueue_task_head(struct task_struct *p) ++{ ++ __set_bit(p->prio, grq.prio_bitmap); ++ list_add(&p->run_list, grq.queue + p->prio); ++ sched_info_queued(p); ++} ++ ++static inline void requeue_task(struct task_struct *p) ++{ ++ sched_info_queued(p); ++} ++ ++/* ++ * Returns the relative length of deadline all compared to the shortest ++ * deadline which is that of nice -20. ++ */ ++static inline int task_prio_ratio(struct task_struct *p) ++{ ++ return prio_ratios[TASK_USER_PRIO(p)]; ++} ++ ++/* ++ * task_timeslice - all tasks of all priorities get the exact same timeslice ++ * length. CPU distribution is handled by giving different deadlines to ++ * tasks of different priorities. Use 128 as the base value for fast shifts. ++ */ ++static inline int task_timeslice(struct task_struct *p) ++{ ++ return (rr_interval * task_prio_ratio(p) / 128); ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * qnr is the "queued but not running" count which is the total number of ++ * tasks on the global runqueue list waiting for cpu time but not actually ++ * currently running on a cpu. ++ */ ++static inline void inc_qnr(void) ++{ ++ grq.qnr++; ++} ++ ++static inline void dec_qnr(void) ++{ ++ grq.qnr--; ++} ++ ++static inline int queued_notrunning(void) ++{ ++ return grq.qnr; ++} ++ ++/* ++ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to ++ * allow easy lookup of whether any suitable idle CPUs are available. ++ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the ++ * idle_cpus variable than to do a full bitmask check when we are busy. ++ */ ++static inline void set_cpuidle_map(int cpu) ++{ ++ if (likely(cpu_online(cpu))) { ++ cpu_set(cpu, grq.cpu_idle_map); ++ grq.idle_cpus = true; ++ } ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++ cpu_clear(cpu, grq.cpu_idle_map); ++ if (cpus_empty(grq.cpu_idle_map)) ++ grq.idle_cpus = false; ++} ++ ++static bool suitable_idle_cpus(struct task_struct *p) ++{ ++ if (!grq.idle_cpus) ++ return false; ++ return (cpus_intersects(p->cpus_allowed, grq.cpu_idle_map)); ++} ++ ++#define CPUIDLE_DIFF_THREAD (1) ++#define CPUIDLE_DIFF_CORE (2) ++#define CPUIDLE_CACHE_BUSY (4) ++#define CPUIDLE_DIFF_CPU (8) ++#define CPUIDLE_THREAD_BUSY (16) ++#define CPUIDLE_DIFF_NODE (32) ++ ++static void resched_task(struct task_struct *p); ++ ++/* ++ * The best idle CPU is chosen according to the CPUIDLE ranking above where the ++ * lowest value would give the most suitable CPU to schedule p onto next. 
The ++ * order works out to be the following: ++ * ++ * Same core, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. ++ */ ++static void ++resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask) ++{ ++ unsigned int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | ++ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | ++ CPUIDLE_DIFF_THREAD; ++ int cpu_tmp; ++ ++ if (cpu_isset(best_cpu, *tmpmask)) ++ goto out; ++ ++ for_each_cpu_mask(cpu_tmp, *tmpmask) { ++ unsigned int ranking; ++ struct rq *tmp_rq; ++ ++ ranking = 0; ++ tmp_rq = cpu_rq(cpu_tmp); ++ ++#ifdef CONFIG_NUMA ++ if (rq->cpu_locality[cpu_tmp] > 3) ++ ranking |= CPUIDLE_DIFF_NODE; ++ else ++#endif ++ if (rq->cpu_locality[cpu_tmp] > 2) ++ ranking |= CPUIDLE_DIFF_CPU; ++#ifdef CONFIG_SCHED_MC ++ if (rq->cpu_locality[cpu_tmp] == 2) ++ ranking |= CPUIDLE_DIFF_CORE; ++ if (!(tmp_rq->cache_idle(cpu_tmp))) ++ ranking |= CPUIDLE_CACHE_BUSY; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (rq->cpu_locality[cpu_tmp] == 1) ++ ranking |= CPUIDLE_DIFF_THREAD; ++ if (!(tmp_rq->siblings_idle(cpu_tmp))) ++ ranking |= CPUIDLE_THREAD_BUSY; ++#endif ++ if (ranking < best_ranking) { ++ best_cpu = cpu_tmp; ++ best_ranking = ranking; ++ } ++ } ++out: ++ resched_task(cpu_rq(best_cpu)->curr); ++} ++ ++static void resched_best_idle(struct task_struct *p) ++{ ++ cpumask_t tmpmask; ++ ++ cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); ++ resched_best_mask(task_cpu(p), task_rq(p), &tmpmask); ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p); ++} ++/* ++ * Flags to tell us whether this CPU is running a CPU frequency governor that ++ * has slowed its speed or not. No locking required as the very rare wrongly ++ * read value would be harmless. ++ */ ++void cpu_scaling(int cpu) ++{ ++ cpu_rq(cpu)->scaling = true; ++} ++ ++void cpu_nonscaling(int cpu) ++{ ++ cpu_rq(cpu)->scaling = false; ++} ++ ++static inline bool scaling_rq(struct rq *rq) ++{ ++ return rq->scaling; ++} ++ ++static inline int locality_diff(struct task_struct *p, struct rq *rq) ++{ ++ return rq->cpu_locality[task_cpu(p)]; ++} ++#else /* CONFIG_SMP */ ++static inline void inc_qnr(void) ++{ ++} ++ ++static inline void dec_qnr(void) ++{ ++} ++ ++static inline int queued_notrunning(void) ++{ ++ return grq.nr_running; ++} ++ ++static inline void set_cpuidle_map(int cpu) ++{ ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++} ++ ++static inline bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return uprq->curr == uprq->idle; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++} ++ ++void cpu_scaling(int __unused) ++{ ++} ++ ++void cpu_nonscaling(int __unused) ++{ ++} ++ ++/* ++ * Although CPUs can scale in UP, there is nowhere else for tasks to go so this ++ * always returns 0. 
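resched_best_mask() above scores each idle candidate CPU by OR-ing penalty bits together and the lowest score wins, which is how the preference order listed in its comment falls out of the CPUIDLE_* values (1 for a sibling thread up to 32 for another node). A small standalone sketch; the three sample rankings are invented for the example:

#include <stdio.h>

#define CPUIDLE_DIFF_THREAD  (1)
#define CPUIDLE_DIFF_CORE    (2)
#define CPUIDLE_CACHE_BUSY   (4)
#define CPUIDLE_DIFF_CPU     (8)
#define CPUIDLE_THREAD_BUSY  (16)
#define CPUIDLE_DIFF_NODE    (32)

int main(void)
{
    /* Hypothetical penalties for three idle CPUs relative to the waker: */
    unsigned int ranking[3] = {
        CPUIDLE_DIFF_THREAD,                       /* SMT sibling, all idle */
        CPUIDLE_DIFF_CORE | CPUIDLE_CACHE_BUSY,    /* other core, busy cache */
        CPUIDLE_DIFF_NODE | CPUIDLE_DIFF_CPU,      /* other node entirely */
    };
    unsigned int best = ~0u;
    int cpu, best_cpu = -1;

    for (cpu = 0; cpu < 3; cpu++) {
        if (ranking[cpu] < best) {
            best = ranking[cpu];
            best_cpu = cpu;
        }
    }
    printf("best idle CPU: %d (ranking %u)\n", best_cpu, best);  /* 0 (1) */
    return 0;
}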
++ */ ++static inline bool scaling_rq(struct rq *rq) ++{ ++ return false; ++} ++ ++static inline int locality_diff(struct task_struct *p, struct rq *rq) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++EXPORT_SYMBOL_GPL(cpu_scaling); ++EXPORT_SYMBOL_GPL(cpu_nonscaling); ++ ++/* ++ * activate_idle_task - move idle task to the _front_ of runqueue. ++ */ ++static inline void activate_idle_task(struct task_struct *p) ++{ ++ enqueue_task_head(p); ++ grq.nr_running++; ++ inc_qnr(); ++} ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ if (idleprio_task(p)) ++ return IDLE_PRIO; ++ if (iso_task(p)) ++ return ISO_PRIO; ++ return NORMAL_PRIO; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. Enter with grq locked. ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ update_clocks(rq); ++ ++ /* ++ * Sleep time is in units of nanosecs, so shift by 20 to get a ++ * milliseconds-range estimation of the amount of time that the task ++ * spent sleeping: ++ */ ++ if (unlikely(prof_on == SLEEP_PROFILING)) { ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), ++ (rq->clock - p->last_ran) >> 20); ++ } ++ ++ p->prio = effective_prio(p); ++ if (task_contributes_to_load(p)) ++ grq.nr_uninterruptible--; ++ enqueue_task(p); ++ grq.nr_running++; ++ inc_qnr(); ++} ++ ++static inline void clear_sticky(struct task_struct *p); ++ ++/* ++ * deactivate_task - If it's running, it's not on the grq and we can just ++ * decrement the nr_running. Enter with grq locked. ++ */ ++static inline void deactivate_task(struct task_struct *p) ++{ ++ if (task_contributes_to_load(p)) ++ grq.nr_uninterruptible++; ++ grq.nr_running--; ++ clear_sticky(p); ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold grq lock. ++ */ ++ WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock)); ++#endif ++ trace_sched_migrate_task(p, cpu); ++ if (task_cpu(p) != cpu) ++ perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); ++ ++ /* ++ * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ task_thread_info(p)->cpu = cpu; ++} ++ ++static inline void clear_sticky(struct task_struct *p) ++{ ++ p->sticky = false; ++} ++ ++static inline bool task_sticky(struct task_struct *p) ++{ ++ return p->sticky; ++} ++ ++/* Reschedule the best idle CPU that is not this one. 
*/ ++static void ++resched_closest_idle(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ cpumask_t tmpmask; ++ ++ cpus_and(tmpmask, p->cpus_allowed, grq.cpu_idle_map); ++ cpu_clear(cpu, tmpmask); ++ if (cpus_empty(tmpmask)) ++ return; ++ resched_best_mask(cpu, rq, &tmpmask); ++} ++ ++/* ++ * We set the sticky flag on a task that is descheduled involuntarily meaning ++ * it is awaiting further CPU time. If the last sticky task is still sticky ++ * but unlucky enough to not be the next task scheduled, we unstick it and try ++ * to find it an idle CPU. Realtime tasks do not stick to minimise their ++ * latency at all times. ++ */ ++static inline void ++swap_sticky(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ if (rq->sticky_task) { ++ if (rq->sticky_task == p) { ++ p->sticky = true; ++ return; ++ } ++ if (task_sticky(rq->sticky_task)) { ++ clear_sticky(rq->sticky_task); ++ resched_closest_idle(rq, cpu, rq->sticky_task); ++ } ++ } ++ if (!rt_task(p)) { ++ p->sticky = true; ++ rq->sticky_task = p; ++ } else { ++ resched_closest_idle(rq, cpu, p); ++ rq->sticky_task = NULL; ++ } ++} ++ ++static inline void unstick_task(struct rq *rq, struct task_struct *p) ++{ ++ rq->sticky_task = NULL; ++ clear_sticky(p); ++} ++#else ++static inline void clear_sticky(struct task_struct *p) ++{ ++} ++ ++static inline bool task_sticky(struct task_struct *p) ++{ ++ return false; ++} ++ ++static inline void ++swap_sticky(struct rq *rq, int cpu, struct task_struct *p) ++{ ++} ++ ++static inline void unstick_task(struct rq *rq, struct task_struct *p) ++{ ++} ++#endif ++ ++/* ++ * Move a task off the global queue and take it to a cpu for it will ++ * become the running task. ++ */ ++static inline void take_task(int cpu, struct task_struct *p) ++{ ++ set_task_cpu(p, cpu); ++ dequeue_task(p); ++ clear_sticky(p); ++ dec_qnr(); ++} ++ ++/* ++ * Returns a descheduling task to the grq runqueue unless it is being ++ * deactivated. ++ */ ++static inline void return_task(struct task_struct *p, bool deactivate) ++{ ++ if (deactivate) ++ deactivate_task(p); ++ else { ++ inc_qnr(); ++ enqueue_task(p); ++ } ++} ++ ++/* ++ * resched_task - mark a task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++#ifdef CONFIG_SMP ++ ++#ifndef tsk_is_polling ++#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) ++#endif ++ ++static void resched_task(struct task_struct *p) ++{ ++ int cpu; ++ ++ assert_raw_spin_locked(&grq.lock); ++ ++ if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) ++ return; ++ ++ set_tsk_thread_flag(p, TIF_NEED_RESCHED); ++ ++ cpu = task_cpu(p); ++ if (cpu == smp_processor_id()) ++ return; ++ ++ /* NEED_RESCHED must be visible before we test polling */ ++ smp_mb(); ++ if (!tsk_is_polling(p)) ++ smp_send_reschedule(cpu); ++} ++ ++#else ++static inline void resched_task(struct task_struct *p) ++{ ++ assert_raw_spin_locked(&grq.lock); ++ set_tsk_need_resched(p); ++} ++#endif ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++struct migration_req { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. 
If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ ++ for (;;) { ++ /* ++ * We do the initial early heuristics without holding ++ * any task-queue locks at all. We'll only try to get ++ * the runqueue lock when things look like they will ++ * work out! In the unlikely event rq is dereferenced ++ * since we're lockless, grab it again. ++ */ ++#ifdef CONFIG_SMP ++retry_rq: ++ rq = task_rq(p); ++ if (unlikely(!rq)) ++ goto retry_rq; ++#else /* CONFIG_SMP */ ++ rq = task_rq(p); ++#endif ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the grq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ rq = task_grq_lock(p, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = task_queued(p); ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_grq_unlock(&flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ); ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) 
++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++#endif ++ ++#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) ++ ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. ++ */ ++static inline bool ++can_preempt(struct task_struct *p, int prio, u64 deadline) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return true; ++ if (p->prio > prio) ++ return false; ++ /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return false; ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Check to see if there is a task that is affined only to offline CPUs but ++ * still wants runtime. This happens to kernel threads during suspend/halt and ++ * disabling of CPUs. ++ */ ++static inline bool online_cpus(struct task_struct *p) ++{ ++ return (likely(cpus_intersects(cpu_online_map, p->cpus_allowed))); ++} ++#else /* CONFIG_HOTPLUG_CPU */ ++/* All available CPUs are always online without hotplug. */ ++static inline bool online_cpus(struct task_struct *p) ++{ ++ return true; ++} ++#endif ++ ++/* ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. ++ */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) ++ return true; ++ return false; ++} ++ ++/* ++ * When all else is equal, still prefer this_rq. ++ */ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ struct rq *highest_prio_rq = NULL; ++ int cpu, highest_prio; ++ u64 latest_deadline; ++ cpumask_t tmp; ++ ++ /* ++ * We clear the sticky flag here because for a task to have called ++ * try_preempt with the sticky flag enabled means some complicated ++ * re-scheduling has occurred and we should ignore the sticky flag. 
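can_preempt() above is the whole preemption policy in two steps: a strictly better (numerically lower) priority always wins, and between equal priorities the task with the earlier virtual deadline wins. A standalone sketch of that rule; types are simplified, the prio value 102 is BFS's NORMAL_PRIO from the earlier sched.h hunk, and the deadlines are made up:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct fake_task { int prio; uint64_t deadline; };

static bool can_preempt(const struct fake_task *p, int prio, uint64_t deadline)
{
    if (p->prio < prio)            /* better (lower) priority always wins */
        return true;
    if (p->prio > prio)
        return false;
    return p->deadline < deadline; /* same priority: earlier deadline wins */
}

int main(void)
{
    struct fake_task waker = { .prio = 102, .deadline = 5000 };

    printf("vs prio 103           : %d\n", can_preempt(&waker, 103, 9000)); /* 1 */
    printf("vs prio 102, dl 4000  : %d\n", can_preempt(&waker, 102, 4000)); /* 0 */
    printf("vs prio 102, dl 6000  : %d\n", can_preempt(&waker, 102, 6000)); /* 1 */
    return 0;
}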
++ */ ++ clear_sticky(p); ++ ++ if (suitable_idle_cpus(p)) { ++ resched_best_idle(p); ++ return; ++ } ++ ++ /* IDLEPRIO tasks never preempt anything but idle */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ ++ if (likely(online_cpus(p))) ++ cpus_and(tmp, cpu_online_map, p->cpus_allowed); ++ else ++ return; ++ ++ highest_prio = latest_deadline = 0; ++ ++ for_each_cpu_mask(cpu, tmp) { ++ struct rq *rq; ++ int rq_prio; ++ ++ rq = cpu_rq(cpu); ++ rq_prio = rq->rq_prio; ++ if (rq_prio < highest_prio) ++ continue; ++ ++ if (rq_prio > highest_prio || ++ deadline_after(rq->rq_deadline, latest_deadline)) { ++ latest_deadline = rq->rq_deadline; ++ highest_prio = rq_prio; ++ highest_prio_rq = rq; ++ } ++ } ++ ++ if (likely(highest_prio_rq)) { ++ if (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline)) ++ resched_task(highest_prio_rq->curr); ++ } ++} ++#else /* CONFIG_SMP */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return false; ++} ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) ++ resched_task(uprq->curr); ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++#ifdef CONFIG_SCHEDSTATS ++ struct rq *rq = this_rq(); ++ ++#ifdef CONFIG_SMP ++ int this_cpu = smp_processor_id(); ++ ++ if (cpu == this_cpu) ++ schedstat_inc(rq, ttwu_local); ++ else { ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(this_cpu, sd) { ++ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ schedstat_inc(sd, ttwu_wake_remote); ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ schedstat_inc(rq, ttwu_count); ++#endif /* CONFIG_SCHEDSTATS */ ++} ++ ++static inline void ttwu_activate(struct task_struct *p, struct rq *rq, ++ bool is_sync) ++{ ++ activate_task(p, rq); ++ ++ /* ++ * Sync wakeups (i.e. those types of wakeups where the waker ++ * has indicated that it will leave the CPU in short order) ++ * don't trigger a preemption if there are no idle cpus, ++ * instead waiting for current to deschedule. ++ */ ++ if (!is_sync || suitable_idle_cpus(p)) ++ try_preempt(p, rq); ++} ++ ++static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, ++ bool success) ++{ ++ trace_sched_wakeup(p, success); ++ p->state = TASK_RUNNING; ++ ++ /* ++ * if a worker is waking up, notify workqueue. Note that on BFS, we ++ * don't really know what cpu it will be, so we fake it for ++ * wq_worker_waking_up :/ ++ */ ++ if ((p->flags & PF_WQ_WORKER) && success) ++ wq_worker_waking_up(p, cpu_of(rq)); ++} ++ ++#ifdef CONFIG_SMP ++void scheduler_ipi(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Returns %true if @p was woken up, %false if it was already running ++ * or @state didn't match @p's state. 
++ */ ++static bool try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ bool success = false; ++ unsigned long flags; ++ struct rq *rq; ++ int cpu; ++ ++ get_cpu(); ++ ++ /* This barrier is undocumented, probably for p->state? */ ++ smp_wmb(); ++ ++ /* ++ * No need to do time_lock_grq as we only need to update the rq clock ++ * if we activate the task ++ */ ++ rq = task_grq_lock(p, &flags); ++ cpu = task_cpu(p); ++ ++ /* state is a volatile long, no idea why */ ++ if (!((unsigned int)p->state & state)) ++ goto out_unlock; ++ ++ if (task_queued(p) || task_running(p)) ++ goto out_running; ++ ++ ttwu_activate(p, rq, wake_flags & WF_SYNC); ++ success = true; ++ ++out_running: ++ ttwu_post_activation(p, rq, success); ++out_unlock: ++ task_grq_unlock(&flags); ++ ++ ttwu_stat(p, cpu, wake_flags); ++ ++ put_cpu(); ++ ++ return success; ++} ++ ++/** ++ * try_to_wake_up_local - try to wake up a local task with grq lock held ++ * @p: the thread to be awakened ++ * ++ * Put @p on the run-queue if it's not already there. The caller must ++ * ensure that grq is locked and @p is not the current task. ++ * grq stays locked over invocation. ++ */ ++static void try_to_wake_up_local(struct task_struct *p) ++{ ++ struct rq *rq = task_rq(p); ++ bool success = false; ++ ++ lockdep_assert_held(&grq.lock); ++ ++ if (!(p->state & TASK_NORMAL)) ++ return; ++ ++ if (!task_queued(p)) { ++ if (likely(!task_running(p))) { ++ schedstat_inc(rq, ttwu_count); ++ schedstat_inc(rq, ttwu_local); ++ } ++ ttwu_activate(p, rq, false); ++ ttwu_stat(p, smp_processor_id(), 0); ++ success = true; ++ } ++ ttwu_post_activation(p, rq, success); ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. Returns 1 if the process was woken up, 0 if it was already ++ * running. ++ * ++ * It may be assumed that this function implies a write memory barrier before ++ * changing the task state if and only if any tasks are woken up. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_ALL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++static void time_slice_expired(struct task_struct *p); ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++void sched_fork(struct task_struct *p) ++{ ++ struct task_struct *curr; ++ int cpu = get_cpu(); ++ struct rq *rq; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ /* ++ * We mark the process as running here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_RUNNING; ++ set_task_cpu(p, cpu); ++ ++ /* Should be reset in fork.c but done here for ease of bfs patching */ ++ p->sched_time = p->stime_pc = p->utime_pc = 0; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { ++ p->policy = SCHED_NORMAL; ++ p->normal_prio = normal_prio(p); ++ } ++ ++ if (PRIO_TO_NICE(p->static_prio) < 0) { ++ p->static_prio = NICE_TO_PRIO(0); ++ p->normal_prio = p->static_prio; ++ } ++ ++ /* ++ * We don't need the reset flag anymore after the fork.
It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ curr = current; ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = curr->normal_prio; ++ ++ INIT_LIST_HEAD(&p->run_list); ++#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ ++ p->on_cpu = false; ++ clear_sticky(p); ++ ++#ifdef CONFIG_PREEMPT_COUNT ++ /* Want to start with kernel preemption disabled. */ ++ task_thread_info(p)->preempt_count = 1; ++#endif ++ if (unlikely(p->policy == SCHED_FIFO)) ++ goto out; ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. If it's negative, it won't ++ * matter since that's the same as being 0. current's time_slice is ++ * actually in rq_time_slice when it's running, as is its last_ran ++ * value. rq->rq_deadline is only modified within schedule() so it ++ * is always equal to current->deadline. ++ */ ++ rq = task_grq_lock_irq(curr); ++ if (likely(rq->rq_time_slice >= RESCHED_US * 2)) { ++ rq->rq_time_slice /= 2; ++ p->time_slice = rq->rq_time_slice; ++ } else { ++ /* ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. ++ */ ++ rq->rq_time_slice = 0; ++ set_tsk_need_resched(curr); ++ time_slice_expired(p); ++ } ++ p->last_ran = rq->rq_last_ran; ++ task_grq_unlock_irq(); ++out: ++ put_cpu(); ++} ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ struct task_struct *parent; ++ unsigned long flags; ++ struct rq *rq; ++ ++ rq = task_grq_lock(p, &flags); ++ p->state = TASK_RUNNING; ++ parent = p->parent; ++ /* Unnecessary but small chance that the parent changed CPU */ ++ set_task_cpu(p, task_cpu(parent)); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p, 1); ++ if (rq->curr == parent && !suitable_idle_cpus(p)) { ++ /* ++ * The VM isn't cloned, so we're in a good position to ++ * do child-runs-first in anticipation of an exec. This ++ * usually avoids a lot of COW overhead. ++ */ ++ resched_task(parent); ++ } else ++ try_preempt(p, rq); ++ task_grq_unlock(&flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is safe to call from within a preemption notifier. 
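sched_fork() above splits the parent's remaining timeslice in half with the child, so forking never creates CPU time out of thin air; only when the parent has less than 2 * RESCHED_US left is it rescheduled and the child started with a fresh slice and deadline. A standalone sketch of that split (simplified: the refill value is approximated here by the default 6 ms slice):

#include <stdio.h>

#define RESCHED_US   100          /* from the patch */
#define DEF_SLICE_US (6 << 10)    /* timeslice() with the default rr_interval = 6 */

int main(void)
{
    int parent_slice = DEF_SLICE_US;
    int child_slice;

    if (parent_slice >= RESCHED_US * 2) {
        parent_slice /= 2;           /* rq->rq_time_slice /= 2 */
        child_slice = parent_slice;  /* p->time_slice = rq->rq_time_slice */
    } else {
        parent_slice = 0;            /* parent is marked for reschedule */
        child_slice = DEF_SLICE_US;  /* child refilled, new deadline */
    }
    printf("parent keeps %d us, child gets %d us\n", parent_slice, child_slice);
    return 0;
}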
++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(&notifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ struct hlist_node *node; ++ ++ hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ struct hlist_node *node; ++ ++ hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ sched_info_switch(prev, next); ++ perf_event_task_sched_out(prev, next); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_lock_switch(rq, next); ++ prepare_arch_switch(next); ++ trace_sched_switch(prev, next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ */ ++static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) ++ __releases(grq.lock) ++{ ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * The test for TASK_DEAD must occur while the runqueue locks are ++ * still held, otherwise prev could be scheduled on another cpu, die ++ * there before we look at prev->state, and then the reference would ++ * be dropped twice.
++ * Manfred Spraul ++ */ ++ prev_state = prev->state; ++ finish_arch_switch(prev); ++#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW ++ local_irq_disable(); ++#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ ++ perf_event_task_sched_in(prev, current); ++#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW ++ local_irq_enable(); ++#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ ++ finish_lock_switch(rq, prev); ++ ++ fire_sched_in_preempt_notifiers(current); ++ if (mm) ++ mmdrop(mm); ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ put_task_struct(prev); ++ } ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage void schedule_tail(struct task_struct *prev) ++ __releases(grq.lock) ++{ ++ struct rq *rq = this_rq(); ++ ++ finish_task_switch(rq, prev); ++#ifdef __ARCH_WANT_UNLOCKED_CTXSW ++ /* In this case, finish_task_switch does not reenable preemption */ ++ preempt_enable(); ++#endif ++ if (current->set_child_tid) ++ put_user(current->pid, current->set_child_tid); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new ++ * thread's register state. ++ */ ++static inline void ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ struct mm_struct *mm, *oldmm; ++ ++ prepare_task_switch(rq, prev, next); ++ ++ mm = next->mm; ++ oldmm = prev->active_mm; ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ if (!mm) { ++ next->active_mm = oldmm; ++ atomic_inc(&oldmm->mm_count); ++ enter_lazy_tlb(oldmm, next); ++ } else ++ switch_mm(oldmm, mm, next); ++ ++ if (!prev->mm) { ++ prev->active_mm = NULL; ++ rq->prev_mm = oldmm; ++ } ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++#ifndef __ARCH_WANT_UNLOCKED_CTXSW ++ spin_release(&grq.lock.dep_map, 1, _THIS_IP_); ++#endif ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ ++ barrier(); ++ /* ++ * this_rq must be evaluated again because prev may have moved ++ * CPUs since it called schedule(), thus the 'rq' on its stack ++ * frame will be invalid. ++ */ ++ finish_task_switch(this_rq(), prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, current number of uninterruptible-sleeping threads, total ++ * number of context switches performed since bootup. All are measured ++ * without grabbing the grq lock but the occasional inaccurate result ++ * doesn't matter so long as it's positive. 
++ */ ++unsigned long nr_running(void) ++{ ++ long nr = grq.nr_running; ++ ++ if (unlikely(nr < 0)) ++ nr = 0; ++ return (unsigned long)nr; ++} ++ ++unsigned long nr_uninterruptible(void) ++{ ++ long nu = grq.nr_uninterruptible; ++ ++ if (unlikely(nu < 0)) ++ nu = 0; ++ return nu; ++} ++ ++unsigned long long nr_context_switches(void) ++{ ++ long long ns = grq.nr_switches; ++ ++ /* This is of course impossible */ ++ if (unlikely(ns < 0)) ++ ns = 1; ++ return (unsigned long long)ns; ++} ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += atomic_read(&cpu_rq(i)->nr_iowait); ++ ++ return sum; ++} ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ struct rq *this = cpu_rq(cpu); ++ return atomic_read(&this->nr_iowait); ++} ++ ++unsigned long nr_active(void) ++{ ++ return nr_running() + nr_uninterruptible(); ++} ++ ++/* Beyond a task running on this CPU, load is equal everywhere on BFS */ ++unsigned long this_cpu_load(void) ++{ ++ return this_rq()->rq_running + ++ ((queued_notrunning() + nr_uninterruptible()) / grq.noc); ++} ++ ++/* Variables and functions for calc_load */ ++static unsigned long calc_load_update; ++unsigned long avenrun[3]; ++EXPORT_SYMBOL(avenrun); ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. ++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} ++ ++static unsigned long ++calc_load(unsigned long load, unsigned long exp, unsigned long active) ++{ ++ load *= exp; ++ load += active * (FIXED_1 - exp); ++ return load >> FSHIFT; ++} ++ ++/* ++ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. ++ */ ++void calc_global_load(unsigned long ticks) ++{ ++ long active; ++ ++ if (time_before(jiffies, calc_load_update)) ++ return; ++ active = nr_active() * FIXED_1; ++ ++ avenrun[0] = calc_load(avenrun[0], EXP_1, active); ++ avenrun[1] = calc_load(avenrun[1], EXP_5, active); ++ avenrun[2] = calc_load(avenrun[2], EXP_15, active); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ ++/* ++ * There are no locks covering percpu hardirq/softirq time. ++ * They are only modified in account_system_vtime, on corresponding CPU ++ * with interrupts disabled. So, writes are safe. ++ * They are read and saved off onto struct rq in update_rq_clock(). ++ * This may result in other CPU reading this CPU's irq time and can ++ * race with irq/account_system_vtime on this CPU. We would either get old ++ * or new value with a side effect of accounting a slice of irq time to wrong ++ * task when irq is in progress while we read rq->clock. That is a worthy ++ * compromise in place of having locks on each irq in account_system_time. 
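The avenrun update above is a fixed-point exponentially-weighted moving average. As an editorial illustration (not part of the patch), the following userspace sketch reproduces the calc_load() arithmetic with the mainline constants FSHIFT = 11, FIXED_1 = 1 << FSHIFT and EXP_1 = 1884 (the 1-minute decay factor), showing how a constant number of active tasks pulls the 1-minute average toward that value over successive LOAD_FREQ intervals.

/* Illustrative userspace model of the calc_load() arithmetic above.
 * FSHIFT/FIXED_1/EXP_1 match the mainline definitions; the "3 runnable
 * tasks" scenario is made up for the demonstration. */
#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)   /* 1.0 in fixed point */
#define EXP_1   1884              /* 1-minute decay factor in fixed point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun0 = 0;              /* 1-minute average */
        unsigned long active = 3 * FIXED_1;      /* pretend 3 runnable tasks */
        int i;

        /* One iteration per LOAD_FREQ (~5s); the average converges on 3.00 */
        for (i = 1; i <= 24; i++) {
                avenrun0 = calc_load(avenrun0, EXP_1, active);
                printf("after %2d updates: %lu.%02lu\n", i,
                       avenrun0 >> FSHIFT,
                       (avenrun0 & (FIXED_1 - 1)) * 100 / FIXED_1);
        }
        return 0;
}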
++ */ ++static DEFINE_PER_CPU(u64, cpu_hardirq_time); ++static DEFINE_PER_CPU(u64, cpu_softirq_time); ++ ++static DEFINE_PER_CPU(u64, irq_start_time); ++static int sched_clock_irqtime; ++ ++void enable_sched_clock_irqtime(void) ++{ ++ sched_clock_irqtime = 1; ++} ++ ++void disable_sched_clock_irqtime(void) ++{ ++ sched_clock_irqtime = 0; ++} ++ ++#ifndef CONFIG_64BIT ++static DEFINE_PER_CPU(seqcount_t, irq_time_seq); ++ ++static inline void irq_time_write_begin(void) ++{ ++ __this_cpu_inc(irq_time_seq.sequence); ++ smp_wmb(); ++} ++ ++static inline void irq_time_write_end(void) ++{ ++ smp_wmb(); ++ __this_cpu_inc(irq_time_seq.sequence); ++} ++ ++static inline u64 irq_time_read(int cpu) ++{ ++ u64 irq_time; ++ unsigned seq; ++ ++ do { ++ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); ++ irq_time = per_cpu(cpu_softirq_time, cpu) + ++ per_cpu(cpu_hardirq_time, cpu); ++ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); ++ ++ return irq_time; ++} ++#else /* CONFIG_64BIT */ ++static inline void irq_time_write_begin(void) ++{ ++} ++ ++static inline void irq_time_write_end(void) ++{ ++} ++ ++static inline u64 irq_time_read(int cpu) ++{ ++ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); ++} ++#endif /* CONFIG_64BIT */ ++ ++/* ++ * Called before incrementing preempt_count on {soft,}irq_enter ++ * and before decrementing preempt_count on {soft,}irq_exit. ++ */ ++void account_system_vtime(struct task_struct *curr) ++{ ++ unsigned long flags; ++ s64 delta; ++ int cpu; ++ ++ if (!sched_clock_irqtime) ++ return; ++ ++ local_irq_save(flags); ++ ++ cpu = smp_processor_id(); ++ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); ++ __this_cpu_add(irq_start_time, delta); ++ ++ irq_time_write_begin(); ++ /* ++ * We do not account for softirq time from ksoftirqd here. ++ * We want to continue accounting softirq time to ksoftirqd thread ++ * in that case, so as not to confuse scheduler with a special task ++ * that do not consume any time, but still wants to run. ++ */ ++ if (hardirq_count()) ++ __this_cpu_add(cpu_hardirq_time, delta); ++ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) ++ __this_cpu_add(cpu_softirq_time, delta); ++ ++ irq_time_write_end(); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL_GPL(account_system_vtime); ++ ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_PARAVIRT ++static inline u64 steal_ticks(u64 steal) ++{ ++ if (unlikely(steal > NSEC_PER_SEC)) ++ return div_u64(steal, TICK_NSEC); ++ ++ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); ++} ++#endif ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_branch((&paravirt_steal_rq_enabled))) { ++ u64 st, steal = paravirt_steal_clock(cpu_of(rq)); ++ ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ st = steal_ticks(steal); ++ steal = st * TICK_NSEC; ++ ++ rq->prev_steal_time_rq += steal; ++ ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++} ++ ++#ifndef nsecs_to_cputime ++# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) ++#endif ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++static void irqtime_account_hi_si(void) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ u64 latest_ns; ++ ++ latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)); ++ if (latest_ns > cpustat[CPUTIME_IRQ]) ++ cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy; ++ ++ latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)); ++ if (latest_ns > cpustat[CPUTIME_SOFTIRQ]) ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy; ++} ++#else /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#define sched_clock_irqtime (0) ++ ++static inline void irqtime_account_hi_si(void) ++{ ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++static __always_inline bool steal_account_process_tick(void) ++{ ++#ifdef CONFIG_PARAVIRT ++ if (static_branch(&paravirt_steal_enabled)) { ++ u64 steal, st = 0; ++ ++ steal = paravirt_steal_clock(smp_processor_id()); ++ steal -= this_rq()->prev_steal_time; ++ ++ st = steal_ticks(steal); ++ this_rq()->prev_steal_time += st * TICK_NSEC; ++ ++ account_steal_time(st); ++ return st; ++ } ++#endif ++ return false; ++} ++ ++/* ++ * On each tick, see what percentage of that tick was attributed to each ++ * component and add the percentage to the _pc values. Once a _pc value has ++ * accumulated one tick's worth, account for that. This means the total ++ * percentage of load components will always be 128 (pseudo 100) per tick.
++ */ ++static void pc_idle_time(struct rq *rq, unsigned long pc) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ ++ if (atomic_read(&rq->nr_iowait) > 0) { ++ rq->iowait_pc += pc; ++ if (rq->iowait_pc >= 128) { ++ rq->iowait_pc %= 128; ++ cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy; ++ } ++ } else { ++ rq->idle_pc += pc; ++ if (rq->idle_pc >= 128) { ++ rq->idle_pc %= 128; ++ cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy; ++ } ++ } ++} ++ ++static void ++pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset, ++ unsigned long pc, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); ++ ++ p->stime_pc += pc; ++ if (p->stime_pc >= 128) { ++ p->stime_pc %= 128; ++ p->stime += (__force u64)cputime_one_jiffy; ++ p->stimescaled += one_jiffy_scaled; ++ account_group_system_time(p, cputime_one_jiffy); ++ acct_update_integrals(p); ++ } ++ p->sched_time += ns; ++ ++ if (hardirq_count() - hardirq_offset) { ++ rq->irq_pc += pc; ++ if (rq->irq_pc >= 128) { ++ rq->irq_pc %= 128; ++ cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy; ++ } ++ } else if (in_serving_softirq()) { ++ rq->softirq_pc += pc; ++ if (rq->softirq_pc >= 128) { ++ rq->softirq_pc %= 128; ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy; ++ } ++ } else { ++ rq->system_pc += pc; ++ if (rq->system_pc >= 128) { ++ rq->system_pc %= 128; ++ cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy; ++ } ++ } ++} ++ ++static void pc_user_time(struct rq *rq, struct task_struct *p, ++ unsigned long pc, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); ++ ++ p->utime_pc += pc; ++ if (p->utime_pc >= 128) { ++ p->utime_pc %= 128; ++ p->utime += (__force u64)cputime_one_jiffy; ++ p->utimescaled += one_jiffy_scaled; ++ account_group_user_time(p, cputime_one_jiffy); ++ acct_update_integrals(p); ++ } ++ p->sched_time += ns; ++ ++ if (this_cpu_ksoftirqd() == p) { ++ /* ++ * ksoftirqd time do not get accounted in cpu_softirq_time. ++ * So, we have to handle it separately here. ++ */ ++ rq->softirq_pc += pc; ++ if (rq->softirq_pc >= 128) { ++ rq->softirq_pc %= 128; ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy; ++ } ++ } ++ ++ if (TASK_NICE(p) > 0 || idleprio_task(p)) { ++ rq->nice_pc += pc; ++ if (rq->nice_pc >= 128) { ++ rq->nice_pc %= 128; ++ cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy; ++ } ++ } else { ++ rq->user_pc += pc; ++ if (rq->user_pc >= 128) { ++ rq->user_pc %= 128; ++ cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy; ++ } ++ } ++} ++ ++/* ++ * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast ++ * shifts instead of 100 ++ */ ++#define NS_TO_PC(NS) (NS * 128 / JIFFY_NS) ++ ++/* ++ * This is called on clock ticks and on context switches. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. 
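The accounting above works in 1/128ths of a tick rather than true percent so the conversions are shifts instead of divisions by 100. Purely as an illustration (HZ = 1000, i.e. JIFFY_NS = 1,000,000, is an assumption here), the sketch below mirrors NS_TO_PC() and the "accumulate until a full tick, then bank one jiffy" pattern used by pc_user_time(), pc_system_time() and pc_idle_time().

/* Userspace illustration of the pseudo-percentage accounting: elapsed
 * nanoseconds are converted to 1/128ths of a tick and a whole jiffy is
 * banked each time 128 of them accumulate.  HZ=1000 is assumed. */
#include <stdio.h>

#define JIFFY_NS        1000000UL               /* one tick at the assumed HZ=1000 */
#define NS_TO_PC(ns)    ((ns) * 128 / JIFFY_NS)

int main(void)
{
        unsigned long utime_pc = 0;      /* fractional user time, in 1/128 tick */
        unsigned long utime_jiffies = 0; /* whole jiffies banked so far */
        unsigned long slices[] = { 300000, 450000, 700000, 250000, 900000 };
        unsigned int i;

        for (i = 0; i < sizeof(slices) / sizeof(slices[0]); i++) {
                utime_pc += NS_TO_PC(slices[i]);
                if (utime_pc >= 128) {          /* a full tick's worth accumulated */
                        utime_pc %= 128;
                        utime_jiffies++;
                }
                printf("ran %6lu ns -> %3lu/128 pending, %lu jiffies banked\n",
                       slices[i], utime_pc, utime_jiffies);
        }
        return 0;
}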
++ */ ++static void ++update_cpu_clock(struct rq *rq, struct task_struct *p, bool tick) ++{ ++ long account_ns = rq->clock - rq->timekeep_clock; ++ struct task_struct *idle = rq->idle; ++ unsigned long account_pc; ++ ++ if (unlikely(account_ns < 0)) ++ account_ns = 0; ++ ++ account_pc = NS_TO_PC(account_ns); ++ ++ if (tick) { ++ int user_tick; ++ ++ /* Accurate tick timekeeping */ ++ rq->account_pc += account_pc - 128; ++ if (rq->account_pc < 0) { ++ /* ++ * Small errors in micro accounting may not make the ++ * accounting add up to 128 each tick so we keep track ++ * of the percentage and round it up when less than 128 ++ */ ++ account_pc += -rq->account_pc; ++ rq->account_pc = 0; ++ } ++ if (steal_account_process_tick()) ++ goto ts_account; ++ ++ user_tick = user_mode(get_irq_regs()); ++ ++ if (user_tick) ++ pc_user_time(rq, p, account_pc, account_ns); ++ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) ++ pc_system_time(rq, p, HARDIRQ_OFFSET, ++ account_pc, account_ns); ++ else ++ pc_idle_time(rq, account_pc); ++ ++ if (sched_clock_irqtime) ++ irqtime_account_hi_si(); ++ } else { ++ /* Accurate subtick timekeeping */ ++ rq->account_pc += account_pc; ++ if (p == idle) ++ pc_idle_time(rq, account_pc); ++ else ++ pc_user_time(rq, p, account_pc, account_ns); ++ } ++ ++ts_account: ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (rq->rq_policy != SCHED_FIFO && p != idle) { ++ s64 time_diff = rq->clock - rq->rq_last_ran; ++ ++ niffy_diff(&time_diff, 1); ++ rq->rq_time_slice -= NS_TO_US(time_diff); ++ } ++ rq->rq_last_ran = rq->timekeep_clock = rq->clock; ++} ++ ++/* ++ * Return any ns on the sched_clock that have not yet been accounted in ++ * @p in case that task is currently running. ++ * ++ * Called with task_grq_lock() held. ++ */ ++static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) ++{ ++ u64 ns = 0; ++ ++ if (p == rq->curr) { ++ update_clocks(rq); ++ ns = rq->clock_task - rq->rq_last_ran; ++ if (unlikely((s64)ns < 0)) ++ ns = 0; ++ } ++ ++ return ns; ++} ++ ++unsigned long long task_delta_exec(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ u64 ns; ++ ++ rq = task_grq_lock(p, &flags); ++ ns = do_task_delta_exec(p, rq); ++ task_grq_unlock(&flags); ++ ++ return ns; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * In case the task is currently running, return the runtime plus current's ++ * pending runtime that have not been accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ u64 ns; ++ ++ rq = task_grq_lock(p, &flags); ++ ns = p->sched_time + do_task_delta_exec(p, rq); ++ task_grq_unlock(&flags); ++ ++ return ns; ++} ++ ++/* Compatibility crap for removal */ ++void account_user_time(struct task_struct *p, cputime_t cputime, ++ cputime_t cputime_scaled) ++{ ++} ++ ++void account_idle_time(cputime_t cputime) ++{ ++} ++ ++/* ++ * Account guest cpu time to a process. ++ * @p: the process that the cpu time gets accounted to ++ * @cputime: the cpu time spent in virtual machine since the last update ++ * @cputime_scaled: cputime scaled by cpu frequency ++ */ ++static void account_guest_time(struct task_struct *p, cputime_t cputime, ++ cputime_t cputime_scaled) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ ++ /* Add guest time to process. 
*/ ++ p->utime += (__force u64)cputime; ++ p->utimescaled += (__force u64)cputime_scaled; ++ account_group_user_time(p, cputime); ++ p->gtime += (__force u64)cputime; ++ ++ /* Add guest time to cpustat. */ ++ if (TASK_NICE(p) > 0) { ++ cpustat[CPUTIME_NICE] += (__force u64)cputime; ++ cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime; ++ } else { ++ cpustat[CPUTIME_USER] += (__force u64)cputime; ++ cpustat[CPUTIME_GUEST] += (__force u64)cputime; ++ } ++} ++ ++/* ++ * Account system cpu time to a process and desired cpustat field ++ * @p: the process that the cpu time gets accounted to ++ * @cputime: the cpu time spent in kernel space since the last update ++ * @cputime_scaled: cputime scaled by cpu frequency ++ * @target_cputime64: pointer to cpustat field that has to be updated ++ */ ++static inline ++void __account_system_time(struct task_struct *p, cputime_t cputime, ++ cputime_t cputime_scaled, cputime64_t *target_cputime64) ++{ ++ /* Add system time to process. */ ++ p->stime += (__force u64)cputime; ++ p->stimescaled += (__force u64)cputime_scaled; ++ account_group_system_time(p, cputime); ++ ++ /* Add system time to cpustat. */ ++ *target_cputime64 += (__force u64)cputime; ++ ++ /* Account for system time used */ ++ acct_update_integrals(p); ++} ++ ++/* ++ * Account system cpu time to a process. ++ * @p: the process that the cpu time gets accounted to ++ * @hardirq_offset: the offset to subtract from hardirq_count() ++ * @cputime: the cpu time spent in kernel space since the last update ++ * @cputime_scaled: cputime scaled by cpu frequency ++ * This is for guest only now. ++ */ ++void account_system_time(struct task_struct *p, int hardirq_offset, ++ cputime_t cputime, cputime_t cputime_scaled) ++{ ++ ++ if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) ++ account_guest_time(p, cputime, cputime_scaled); ++} ++ ++/* ++ * Account for involuntary wait time. ++ * @steal: the cpu time spent in involuntary wait ++ */ ++void account_steal_time(cputime_t cputime) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ ++ cpustat[CPUTIME_STEAL] += (__force u64)cputime; ++} ++ ++/* ++ * Account for idle time. ++ * @cputime: the cpu time spent in idle wait ++ */ ++static void account_idle_times(cputime_t cputime) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ struct rq *rq = this_rq(); ++ ++ if (atomic_read(&rq->nr_iowait) > 0) ++ cpustat[CPUTIME_IOWAIT] += (__force u64)cputime; ++ else ++ cpustat[CPUTIME_IDLE] += (__force u64)cputime; ++} ++ ++#ifndef CONFIG_VIRT_CPU_ACCOUNTING ++ ++void account_process_tick(struct task_struct *p, int user_tick) ++{ ++} ++ ++/* ++ * Account multiple ticks of steal time. ++ * @p: the process from which the cpu time has been stolen ++ * @ticks: number of stolen ticks ++ */ ++void account_steal_ticks(unsigned long ticks) ++{ ++ account_steal_time(jiffies_to_cputime(ticks)); ++} ++ ++/* ++ * Account multiple ticks of idle time. ++ * @ticks: number of stolen ticks ++ */ ++void account_idle_ticks(unsigned long ticks) ++{ ++ account_idle_times(jiffies_to_cputime(ticks)); ++} ++#endif ++ ++static inline void grq_iso_lock(void) ++ __acquires(grq.iso_lock) ++{ ++ raw_spin_lock(&grq.iso_lock); ++} ++ ++static inline void grq_iso_unlock(void) ++ __releases(grq.iso_lock) ++{ ++ raw_spin_unlock(&grq.iso_lock); ++} ++ ++/* ++ * Functions to test for when SCHED_ISO tasks have used their allocated ++ * quota as real time scheduling and convert them back to SCHED_NORMAL. 
++ * Where possible, the data is tested lockless, to avoid grabbing iso_lock ++ * because the occasional inaccurate result won't matter. However the ++ * tick data is only ever modified under lock. iso_refractory is only simply ++ * set to 0 or 1 so it's not worth grabbing the lock yet again for that. ++ */ ++static bool set_iso_refractory(void) ++{ ++ grq.iso_refractory = true; ++ return grq.iso_refractory; ++} ++ ++static bool clear_iso_refractory(void) ++{ ++ grq.iso_refractory = false; ++ return grq.iso_refractory; ++} ++ ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a ++ * slow division. ++ */ ++static bool test_ret_isorefractory(struct rq *rq) ++{ ++ if (likely(!grq.iso_refractory)) { ++ if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu) ++ return set_iso_refractory(); ++ } else { ++ if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) ++ return clear_iso_refractory(); ++ } ++ return grq.iso_refractory; ++} ++ ++static void iso_tick(void) ++{ ++ grq_iso_lock(); ++ grq.iso_ticks += 100; ++ grq_iso_unlock(); ++} ++ ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(void) ++{ ++ if (grq.iso_ticks) { ++ grq_iso_lock(); ++ grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1; ++ if (unlikely(grq.iso_refractory && grq.iso_ticks < ++ ISO_PERIOD * (sched_iso_cpu * 115 / 128))) ++ clear_iso_refractory(); ++ grq_iso_unlock(); ++ } ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq) ++{ ++ struct task_struct *p; ++ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) { ++ if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128) ++ iso_tick(); ++ } else ++ no_iso_tick(); ++ ++ if (iso_queue(rq)) { ++ if (unlikely(test_ret_isorefractory(rq))) { ++ if (rq_running_iso(rq)) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Force it to reschedule as ++ * SCHED_NORMAL by zeroing its time_slice ++ */ ++ rq->rq_time_slice = 0; ++ } ++ } ++ } ++ ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (rq->rq_policy == SCHED_FIFO) ++ return; ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ */ ++ if (rq->dither) { ++ if (rq->rq_time_slice > HALF_JIFFY_US) ++ return; ++ else ++ rq->rq_time_slice = 0; ++ } else if (rq->rq_time_slice >= RESCHED_US) ++ return; ++ ++ /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */ ++ p = rq->curr; ++ grq_lock(); ++ requeue_task(p); ++ set_tsk_need_resched(p); ++ grq_unlock(); ++} ++ ++void wake_up_idle_cpu(int cpu); ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. The data modified is all ++ * local to struct rq so we don't need to grab grq lock. 
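The refractory test above compares the decaying iso_ticks count against a budget of ISO_PERIOD * sched_iso_cpu and only clears the flag again below roughly 90% of that budget (115/128). The figures in the sketch below (HZ = 1000, ISO_PERIOD = 5 * HZ + 1, and sched_iso_cpu = 70) are assumed values used only to make the two thresholds and the hysteresis gap between them concrete.

/* Illustration of the SCHED_ISO refractory thresholds.  HZ, ISO_PERIOD and
 * sched_iso_cpu below are assumed values for the demo; in the scheduler,
 * iso_tick() adds 100 per ISO tick and no_iso_tick() decays the count. */
#include <stdio.h>

#define HZ          1000
#define ISO_PERIOD  ((5 * HZ) + 1)

int main(void)
{
        int sched_iso_cpu = 70;                          /* assumed %cpu budget */
        long set_thresh   = (long)ISO_PERIOD * sched_iso_cpu;
        long clear_thresh = (long)ISO_PERIOD * (sched_iso_cpu * 115 / 128);

        printf("refractory set when iso_ticks > %ld\n", set_thresh);
        printf("refractory cleared when iso_ticks < %ld (%d%% cpu, ~90%% of the %d%% budget)\n",
               clear_thresh, sched_iso_cpu * 115 / 128, sched_iso_cpu);
        printf("hysteresis gap: %ld\n", set_thresh - clear_thresh);
        return 0;
}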
++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ /* grq lock not grabbed, so only update rq clock */ ++ update_rq_clock(rq); ++ update_cpu_clock(rq, rq->curr, true); ++ if (!rq_idle(rq)) ++ task_running_tick(rq); ++ else ++ no_iso_tick(); ++ rq->last_tick = rq->clock; ++ perf_event_task_tick(); ++} ++ ++notrace unsigned long get_parent_ip(unsigned long addr) ++{ ++ if (in_lock_functions(addr)) { ++ addr = CALLER_ADDR2; ++ if (in_lock_functions(addr)) ++ addr = CALLER_ADDR3; ++ } ++ return addr; ++} ++ ++#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++void __kprobes add_preempt_count(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ preempt_count() += val; ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ if (preempt_count() == val) ++ trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); ++} ++EXPORT_SYMBOL(add_preempt_count); ++ ++void __kprobes sub_preempt_count(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); ++ preempt_count() -= val; ++} ++EXPORT_SYMBOL(sub_preempt_count); ++#endif ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes cpu fairly amongst tasks of the ++ * same nice value, it proportions cpu according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 prio_deadline_diff(int user_prio) ++{ ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); ++} ++ ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ return prio_deadline_diff(TASK_USER_PRIO(p)); ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return prio_deadline_diff(USER_PRIO(static_prio)); ++} ++ ++static inline int longest_deadline_diff(void) ++{ ++ return prio_deadline_diff(39); ++} ++ ++static inline int ms_longest_deadline_diff(void) ++{ ++ return NS_TO_MS(longest_deadline_diff()); ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline. ++ */ ++static void time_slice_expired(struct task_struct *p) ++{ ++ p->time_slice = timeslice(); ++ p->deadline = grq.niffies + task_deadline_diff(p); ++} ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. 
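prio_deadline_diff() above scales rr_interval by a per-nice ratio, so each nice level pushes a task's virtual deadline further into the future. The sketch below is illustrative only: it assumes rr_interval = 6 ms and a prio_ratios table built in the usual BFS fashion of growing each level by 10% (ratio[0] = 128, ratio[n] = ratio[n-1] * 11 / 10), and prints how much later a nice +19 task's deadline lands compared with nice 0.

/* Rough illustration of how the virtual deadline offset grows with nice
 * level.  rr_interval = 6ms and the 10%-per-level ratio table are assumed;
 * the kernel works in niffies (nanoseconds). */
#include <stdio.h>

#define PRIO_RANGE   40          /* nice -20 .. +19 maps to user prio 0..39 */
#define MS_TO_NS(x)  ((x) * 1000000ULL)

int main(void)
{
        unsigned long long prio_ratios[PRIO_RANGE];
        unsigned long long rr_interval = 6;      /* ms, assumed default */
        int i;

        prio_ratios[0] = 128;
        for (i = 1; i < PRIO_RANGE; i++)
                prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

        for (i = 0; i < PRIO_RANGE; i += 13) {
                /* same shape as prio_deadline_diff(): ratio * rr_interval * 1ms/128 */
                unsigned long long diff_ns =
                        prio_ratios[i] * rr_interval * (MS_TO_NS(1) / 128);
                printf("user prio %2d (nice %+3d): deadline offset %llu ms\n",
                       i, i - 20, diff_ns / MS_TO_NS(1));
        }
        return 0;
}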
++ ++ */ ++static inline void check_deadline(struct task_struct *p) ++{ ++ if (p->time_slice < RESCHED_US || batch_task(p)) ++ time_slice_expired(p); ++} ++ ++#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) ++ ++/* ++ * Scheduler queue bitmap specific find next bit. ++ */ ++static inline unsigned long ++next_sched_bit(const unsigned long *addr, unsigned long offset) ++{ ++ const unsigned long *p; ++ unsigned long result; ++ unsigned long size; ++ unsigned long tmp; ++ ++ size = PRIO_LIMIT; ++ if (offset >= size) ++ return size; ++ ++ p = addr + BITOP_WORD(offset); ++ result = offset & ~(BITS_PER_LONG-1); ++ size -= result; ++ offset %= BITS_PER_LONG; ++ if (offset) { ++ tmp = *(p++); ++ tmp &= (~0UL << offset); ++ if (size < BITS_PER_LONG) ++ goto found_first; ++ if (tmp) ++ goto found_middle; ++ size -= BITS_PER_LONG; ++ result += BITS_PER_LONG; ++ } ++ while (size & ~(BITS_PER_LONG-1)) { ++ if ((tmp = *(p++))) ++ goto found_middle; ++ result += BITS_PER_LONG; ++ size -= BITS_PER_LONG; ++ } ++ if (!size) ++ return result; ++ tmp = *p; ++ ++found_first: ++ tmp &= (~0UL >> (BITS_PER_LONG - size)); ++ if (tmp == 0UL) /* Are any bits set? */ ++ return result + size; /* Nope. */ ++found_middle: ++ return result + __ffs(tmp); ++} ++ ++/* ++ * O(n) lookup of all tasks in the global runqueue. The real brainfuck ++ * of lock contention and O(n). It's not really O(n) as only the queued, ++ * but not running tasks are scanned, and is O(n) queued in the worst case ++ * scenario only because the right task can be found before scanning all of ++ * them. ++ * Tasks are selected in this order: ++ * Real time tasks are selected purely by their static priority and in the ++ * order they were queued, so the lowest value idx, and the first queued task ++ * of that priority value is chosen. ++ * If no real time tasks are found, the SCHED_ISO priority is checked, and ++ * all SCHED_ISO tasks have the same priority value, so they're selected by ++ * the earliest deadline value. ++ * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the ++ * earliest deadline. ++ * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are ++ * selected by the earliest deadline. ++ */ ++static inline struct ++task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct task_struct *edt = NULL; ++ unsigned long idx = -1; ++ ++ do { ++ struct list_head *queue; ++ struct task_struct *p; ++ u64 earliest_deadline; ++ ++ idx = next_sched_bit(grq.prio_bitmap, ++idx); ++ if (idx >= PRIO_LIMIT) ++ return idle; ++ queue = grq.queue + idx; ++ ++ if (idx < MAX_RT_PRIO) { ++ /* We found an rt task */ ++ list_for_each_entry(p, queue, run_list) { ++ /* Make sure cpu affinity is ok */ ++ if (needs_other_cpu(p, cpu)) ++ continue; ++ edt = p; ++ goto out_take; ++ } ++ /* ++ * None of the RT tasks at this priority can run on ++ * this cpu ++ */ ++ continue; ++ } ++ ++ /* ++ * No rt tasks. Find the earliest deadline task. Now we're in ++ * O(n) territory. ++ */ ++ earliest_deadline = ~0ULL; ++ list_for_each_entry(p, queue, run_list) { ++ u64 dl; ++ ++ /* Make sure cpu affinity is ok */ ++ if (needs_other_cpu(p, cpu)) ++ continue; ++ ++ /* ++ * Soft affinity happens here by not scheduling a task ++ * with its sticky flag set that ran on a different CPU ++ * last when the CPU is scaling, or by greatly biasing ++ * against its deadline when not, based on cpu cache ++ * locality. 
++ */ ++ if (task_sticky(p) && task_rq(p) != rq) { ++ if (scaling_rq(rq)) ++ continue; ++ dl = p->deadline << locality_diff(p, rq); ++ } else ++ dl = p->deadline; ++ ++ if (deadline_before(dl, earliest_deadline)) { ++ earliest_deadline = dl; ++ edt = p; ++ } ++ } ++ } while (!edt); ++ ++out_take: ++ take_task(cpu, edt); ++ return edt; ++} ++ ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ struct pt_regs *regs = get_irq_regs(); ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ ++ if (regs) ++ show_regs(regs); ++ else ++ dump_stack(); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev) ++{ ++ /* ++ * Test if we are atomic. Since do_exit() needs to call into ++ * schedule() atomically, we ignore that path for now. ++ * Otherwise, whine if we are scheduling when we should not be. ++ */ ++ if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) ++ __schedule_bug(prev); ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq(), sched_count); ++} ++ ++/* ++ * The currently running task's information is all stored in rq local data ++ * which is only modified by the local CPU, thereby allowing the data to be ++ * changed without grabbing the grq lock. ++ */ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ rq->rq_time_slice = p->time_slice; ++ rq->rq_deadline = p->deadline; ++ rq->rq_last_ran = p->last_ran = rq->clock; ++ rq->rq_policy = p->policy; ++ rq->rq_prio = p->prio; ++ if (p != rq->idle) ++ rq->rq_running = true; ++ else ++ rq->rq_running = false; ++} ++ ++static void reset_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ rq->rq_policy = p->policy; ++ rq->rq_prio = p->prio; ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ */ ++asmlinkage void __sched schedule(void) ++{ ++ struct task_struct *prev, *next, *idle; ++ unsigned long *switch_count; ++ bool deactivate; ++ struct rq *rq; ++ int cpu; ++ ++need_resched: ++ preempt_disable(); ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ rcu_note_context_switch(cpu); ++ prev = rq->curr; ++ ++ deactivate = false; ++ schedule_debug(prev); ++ ++ grq_lock_irq(); ++ ++ switch_count = &prev->nivcsw; ++ if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { ++ if (unlikely(signal_pending_state(prev->state, prev))) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate = true; ++ /* ++ * If a worker is going to sleep, notify and ++ * ask workqueue whether it wants to wake up a ++ * task to maintain concurrency. If so, wake ++ * up the task. ++ */ ++ if (prev->flags & PF_WQ_WORKER) { ++ struct task_struct *to_wakeup; ++ ++ to_wakeup = wq_worker_sleeping(prev, cpu); ++ if (to_wakeup) { ++ /* This shouldn't happen, but does */ ++ if (unlikely(to_wakeup == prev)) ++ deactivate = false; ++ else ++ try_to_wake_up_local(to_wakeup); ++ } ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, make ++ * sure to submit it to avoid deadlocks. 
++ */ ++ if (unlikely(deactivate && blk_needs_flush_plug(prev))) { ++ grq_unlock_irq(); ++ preempt_enable_no_resched(); ++ blk_schedule_flush_plug(prev); ++ goto need_resched; ++ } ++ ++ update_clocks(rq); ++ update_cpu_clock(rq, prev, false); ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = false; ++ else ++ rq->dither = true; ++ ++ clear_tsk_need_resched(prev); ++ ++ idle = rq->idle; ++ if (idle != prev) { ++ /* Update all the information stored on struct rq */ ++ prev->time_slice = rq->rq_time_slice; ++ prev->deadline = rq->rq_deadline; ++ check_deadline(prev); ++ prev->last_ran = rq->clock; ++ ++ /* Task changed affinity off this CPU */ ++ if (needs_other_cpu(prev, cpu)) ++ resched_suitable_idle(prev); ++ else if (!deactivate) { ++ if (!queued_notrunning()) { ++ /* ++ * We now know prev is the only thing that is ++ * awaiting CPU so we can bypass rechecking for ++ * the earliest deadline task and just run it ++ * again. ++ */ ++ set_rq_task(rq, prev); ++ grq_unlock_irq(); ++ goto rerun_prev_unlocked; ++ } else ++ swap_sticky(rq, cpu, prev); ++ } ++ return_task(prev, deactivate); ++ } ++ ++ if (unlikely(!queued_notrunning())) { ++ /* ++ * This CPU is now truly idle as opposed to when idle is ++ * scheduled as a high priority task in its own right. ++ */ ++ next = idle; ++ schedstat_inc(rq, sched_goidle); ++ set_cpuidle_map(cpu); ++ } else { ++ next = earliest_deadline_task(rq, cpu, idle); ++ if (likely(next->prio != PRIO_LIMIT)) ++ clear_cpuidle_map(cpu); ++ else ++ set_cpuidle_map(cpu); ++ } ++ ++ if (likely(prev != next)) { ++ /* ++ * Don't stick tasks when a real time task is going to run as ++ * they may literally get stuck. ++ */ ++ if (rt_task(next)) ++ unstick_task(rq, prev); ++ set_rq_task(rq, next); ++ grq.nr_switches++; ++ prev->on_cpu = false; ++ next->on_cpu = true; ++ rq->curr = next; ++ ++*switch_count; ++ ++ context_switch(rq, prev, next); /* unlocks the grq */ ++ /* ++ * The context switch have flipped the stack from under us ++ * and restored the local variables which were saved when ++ * this task called schedule() in the past. prev == current ++ * is still correct, but it can be moved to another cpu/rq. ++ */ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ idle = rq->idle; ++ } else ++ grq_unlock_irq(); ++ ++rerun_prev_unlocked: ++ preempt_enable_no_resched(); ++ if (unlikely(need_resched())) ++ goto need_resched; ++} ++EXPORT_SYMBOL(schedule); ++ ++#ifdef CONFIG_MUTEX_SPIN_ON_OWNER ++ ++static inline bool owner_running(struct mutex *lock, struct task_struct *owner) ++{ ++ if (lock->owner != owner) ++ return false; ++ ++ /* ++ * Ensure we emit the owner->on_cpu, dereference _after_ checking ++ * lock->owner still matches owner, if that fails, owner might ++ * point to free()d memory, if it still matches, the rcu_read_lock() ++ * ensures the memory stays valid. ++ */ ++ barrier(); ++ ++ return owner->on_cpu; ++} ++ ++/* ++ * Look out! "owner" is an entirely speculative pointer ++ * access and not reliable. ++ */ ++int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) ++{ ++ rcu_read_lock(); ++ while (owner_running(lock, owner)) { ++ if (need_resched()) ++ break; ++ ++ arch_mutex_cpu_relax(); ++ } ++ rcu_read_unlock(); ++ ++ /* ++ * We break out the loop above on need_resched() and when the ++ * owner changed, which is a sign for heavy contention. Return ++ * success only when lock->owner is NULL. 
++ */ ++ return lock->owner == NULL; ++} ++#endif ++ ++#ifdef CONFIG_PREEMPT ++/* ++ * this is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. Kernel preemptions off return from interrupt ++ * occur there and call schedule directly. ++ */ ++asmlinkage void __sched notrace preempt_schedule(void) ++{ ++ struct thread_info *ti = current_thread_info(); ++ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(ti->preempt_count || irqs_disabled())) ++ return; ++ ++ do { ++ add_preempt_count_notrace(PREEMPT_ACTIVE); ++ schedule(); ++ sub_preempt_count_notrace(PREEMPT_ACTIVE); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ barrier(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL(preempt_schedule); ++ ++/* ++ * this is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage void __sched preempt_schedule_irq(void) ++{ ++ struct thread_info *ti = current_thread_info(); ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(ti->preempt_count || !irqs_disabled()); ++ ++ do { ++ add_preempt_count(PREEMPT_ACTIVE); ++ local_irq_enable(); ++ schedule(); ++ local_irq_disable(); ++ sub_preempt_count(PREEMPT_ACTIVE); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ barrier(); ++ } while (need_resched()); ++} ++ ++#endif /* CONFIG_PREEMPT */ ++ ++int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++/* ++ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just ++ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve ++ * number) then we wake all the non-exclusive tasks and one exclusive task. ++ * ++ * There are circumstances in which we can try to wake a task which has already ++ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns ++ * zero in this (rare) case, and we handle it by continuing to scan the queue. ++ */ ++static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, ++ int nr_exclusive, int wake_flags, void *key) ++{ ++ struct list_head *tmp, *next; ++ ++ list_for_each_safe(tmp, next, &q->task_list) { ++ wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); ++ unsigned int flags = curr->flags; ++ ++ if (curr->func(curr, mode, wake_flags, key) && ++ (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) ++ break; ++ } ++} ++ ++/** ++ * __wake_up - wake up threads blocked on a waitqueue. ++ * @q: the waitqueue ++ * @mode: which threads ++ * @nr_exclusive: how many wake-one or wake-many threads to wake up ++ * @key: is directly passed to the wakeup function ++ * ++ * It may be assumed that this function implies a write memory barrier before ++ * changing the task state if and only if any tasks are woken up. 
++ */ ++void __wake_up(wait_queue_head_t *q, unsigned int mode, ++ int nr_exclusive, void *key) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&q->lock, flags); ++ __wake_up_common(q, mode, nr_exclusive, 0, key); ++ spin_unlock_irqrestore(&q->lock, flags); ++} ++EXPORT_SYMBOL(__wake_up); ++ ++/* ++ * Same as __wake_up but called with the spinlock in wait_queue_head_t held. ++ */ ++void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) ++{ ++ __wake_up_common(q, mode, 1, 0, NULL); ++} ++EXPORT_SYMBOL_GPL(__wake_up_locked); ++ ++void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) ++{ ++ __wake_up_common(q, mode, 1, 0, key); ++} ++EXPORT_SYMBOL_GPL(__wake_up_locked_key); ++ ++/** ++ * __wake_up_sync_key - wake up threads blocked on a waitqueue. ++ * @q: the waitqueue ++ * @mode: which threads ++ * @nr_exclusive: how many wake-one or wake-many threads to wake up ++ * @key: opaque value to be passed to wakeup targets ++ * ++ * The sync wakeup differs that the waker knows that it will schedule ++ * away soon, so while the target thread will be woken up, it will not ++ * be migrated to another CPU - ie. the two threads are 'synchronised' ++ * with each other. This can prevent needless bouncing between CPUs. ++ * ++ * On UP it can prevent extra preemption. ++ * ++ * It may be assumed that this function implies a write memory barrier before ++ * changing the task state if and only if any tasks are woken up. ++ */ ++void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, ++ int nr_exclusive, void *key) ++{ ++ unsigned long flags; ++ int wake_flags = WF_SYNC; ++ ++ if (unlikely(!q)) ++ return; ++ ++ if (unlikely(!nr_exclusive)) ++ wake_flags = 0; ++ ++ spin_lock_irqsave(&q->lock, flags); ++ __wake_up_common(q, mode, nr_exclusive, wake_flags, key); ++ spin_unlock_irqrestore(&q->lock, flags); ++} ++EXPORT_SYMBOL_GPL(__wake_up_sync_key); ++ ++/** ++ * __wake_up_sync - wake up threads blocked on a waitqueue. ++ * @q: the waitqueue ++ * @mode: which threads ++ * @nr_exclusive: how many wake-one or wake-many threads to wake up ++ * ++ * The sync wakeup differs that the waker knows that it will schedule ++ * away soon, so while the target thread will be woken up, it will not ++ * be migrated to another CPU - ie. the two threads are 'synchronised' ++ * with each other. This can prevent needless bouncing between CPUs. ++ * ++ * On UP it can prevent extra preemption. ++ */ ++void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) ++{ ++ unsigned long flags; ++ int sync = 1; ++ ++ if (unlikely(!q)) ++ return; ++ ++ if (unlikely(!nr_exclusive)) ++ sync = 0; ++ ++ spin_lock_irqsave(&q->lock, flags); ++ __wake_up_common(q, mode, nr_exclusive, sync, NULL); ++ spin_unlock_irqrestore(&q->lock, flags); ++} ++EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ ++ ++/** ++ * complete: - signals a single thread waiting on this completion ++ * @x: holds the state of this particular completion ++ * ++ * This will wake up a single thread waiting on this completion. Threads will be ++ * awakened in the same order in which they were queued. ++ * ++ * See also complete_all(), wait_for_completion() and related routines. ++ * ++ * It may be assumed that this function implies a write memory barrier before ++ * changing the task state if and only if any tasks are woken up. 
++ */ ++void complete(struct completion *x) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&x->wait.lock, flags); ++ x->done++; ++ __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); ++ spin_unlock_irqrestore(&x->wait.lock, flags); ++} ++EXPORT_SYMBOL(complete); ++ ++/** ++ * complete_all: - signals all threads waiting on this completion ++ * @x: holds the state of this particular completion ++ * ++ * This will wake up all threads waiting on this particular completion event. ++ * ++ * It may be assumed that this function implies a write memory barrier before ++ * changing the task state if and only if any tasks are woken up. ++ */ ++void complete_all(struct completion *x) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&x->wait.lock, flags); ++ x->done += UINT_MAX/2; ++ __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); ++ spin_unlock_irqrestore(&x->wait.lock, flags); ++} ++EXPORT_SYMBOL(complete_all); ++ ++static inline long __sched ++do_wait_for_common(struct completion *x, long timeout, int state) ++{ ++ if (!x->done) { ++ DECLARE_WAITQUEUE(wait, current); ++ ++ __add_wait_queue_tail_exclusive(&x->wait, &wait); ++ do { ++ if (signal_pending_state(state, current)) { ++ timeout = -ERESTARTSYS; ++ break; ++ } ++ __set_current_state(state); ++ spin_unlock_irq(&x->wait.lock); ++ timeout = schedule_timeout(timeout); ++ spin_lock_irq(&x->wait.lock); ++ } while (!x->done && timeout); ++ __remove_wait_queue(&x->wait, &wait); ++ if (!x->done) ++ return timeout; ++ } ++ x->done--; ++ return timeout ?: 1; ++} ++ ++static long __sched ++wait_for_common(struct completion *x, long timeout, int state) ++{ ++ might_sleep(); ++ ++ spin_lock_irq(&x->wait.lock); ++ timeout = do_wait_for_common(x, timeout, state); ++ spin_unlock_irq(&x->wait.lock); ++ return timeout; ++} ++ ++/** ++ * wait_for_completion: - waits for completion of a task ++ * @x: holds the state of this particular completion ++ * ++ * This waits to be signaled for completion of a specific task. It is NOT ++ * interruptible and there is no timeout. ++ * ++ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout ++ * and interrupt capability. Also see complete(). ++ */ ++void __sched wait_for_completion(struct completion *x) ++{ ++ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); ++} ++EXPORT_SYMBOL(wait_for_completion); ++ ++/** ++ * wait_for_completion_timeout: - waits for completion of a task (w/timeout) ++ * @x: holds the state of this particular completion ++ * @timeout: timeout value in jiffies ++ * ++ * This waits for either a completion of a specific task to be signaled or for a ++ * specified timeout to expire. The timeout is in jiffies. It is not ++ * interruptible. ++ * ++ * The return value is 0 if timed out, and positive (at least 1, or number of ++ * jiffies left till timeout) if completed. ++ */ ++unsigned long __sched ++wait_for_completion_timeout(struct completion *x, unsigned long timeout) ++{ ++ return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); ++} ++EXPORT_SYMBOL(wait_for_completion_timeout); ++ ++/** ++ * wait_for_completion_interruptible: - waits for completion of a task (w/intr) ++ * @x: holds the state of this particular completion ++ * ++ * This waits for completion of a specific task to be signaled. It is ++ * interruptible. ++ * ++ * The return value is -ERESTARTSYS if interrupted, 0 if completed. 
++ */ ++int __sched wait_for_completion_interruptible(struct completion *x) ++{ ++ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); ++ if (t == -ERESTARTSYS) ++ return t; ++ return 0; ++} ++EXPORT_SYMBOL(wait_for_completion_interruptible); ++ ++/** ++ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) ++ * @x: holds the state of this particular completion ++ * @timeout: timeout value in jiffies ++ * ++ * This waits for either a completion of a specific task to be signaled or for a ++ * specified timeout to expire. It is interruptible. The timeout is in jiffies. ++ * ++ * The return value is -ERESTARTSYS if interrupted, 0 if timed out, ++ * positive (at least 1, or number of jiffies left till timeout) if completed. ++ */ ++long __sched ++wait_for_completion_interruptible_timeout(struct completion *x, ++ unsigned long timeout) ++{ ++ return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); ++} ++EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); ++ ++/** ++ * wait_for_completion_killable: - waits for completion of a task (killable) ++ * @x: holds the state of this particular completion ++ * ++ * This waits to be signaled for completion of a specific task. It can be ++ * interrupted by a kill signal. ++ * ++ * The return value is -ERESTARTSYS if interrupted, 0 if timed out, ++ * positive (at least 1, or number of jiffies left till timeout) if completed. ++ */ ++int __sched wait_for_completion_killable(struct completion *x) ++{ ++ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); ++ if (t == -ERESTARTSYS) ++ return t; ++ return 0; ++} ++EXPORT_SYMBOL(wait_for_completion_killable); ++ ++/** ++ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) ++ * @x: holds the state of this particular completion ++ * @timeout: timeout value in jiffies ++ * ++ * This waits for either a completion of a specific task to be ++ * signaled or for a specified timeout to expire. It can be ++ * interrupted by a kill signal. The timeout is in jiffies. ++ */ ++long __sched ++wait_for_completion_killable_timeout(struct completion *x, ++ unsigned long timeout) ++{ ++ return wait_for_common(x, timeout, TASK_KILLABLE); ++} ++EXPORT_SYMBOL(wait_for_completion_killable_timeout); ++ ++/** ++ * try_wait_for_completion - try to decrement a completion without blocking ++ * @x: completion structure ++ * ++ * Returns: 0 if a decrement cannot be done without blocking ++ * 1 if a decrement succeeded. ++ * ++ * If a completion is being used as a counting completion, ++ * attempt to decrement the counter without blocking. This ++ * enables us to avoid waiting if the resource the completion ++ * is protecting is not available. ++ */ ++bool try_wait_for_completion(struct completion *x) ++{ ++ unsigned long flags; ++ int ret = 1; ++ ++ spin_lock_irqsave(&x->wait.lock, flags); ++ if (!x->done) ++ ret = 0; ++ else ++ x->done--; ++ spin_unlock_irqrestore(&x->wait.lock, flags); ++ return ret; ++} ++EXPORT_SYMBOL(try_wait_for_completion); ++ ++/** ++ * completion_done - Test to see if a completion has any waiters ++ * @x: completion structure ++ * ++ * Returns: 0 if there are waiters (wait_for_completion() in progress) ++ * 1 if there are no waiters. 
++ * ++ */ ++bool completion_done(struct completion *x) ++{ ++ unsigned long flags; ++ int ret = 1; ++ ++ spin_lock_irqsave(&x->wait.lock, flags); ++ if (!x->done) ++ ret = 0; ++ spin_unlock_irqrestore(&x->wait.lock, flags); ++ return ret; ++} ++EXPORT_SYMBOL(completion_done); ++ ++static long __sched ++sleep_on_common(wait_queue_head_t *q, int state, long timeout) ++{ ++ unsigned long flags; ++ wait_queue_t wait; ++ ++ init_waitqueue_entry(&wait, current); ++ ++ __set_current_state(state); ++ ++ spin_lock_irqsave(&q->lock, flags); ++ __add_wait_queue(q, &wait); ++ spin_unlock(&q->lock); ++ timeout = schedule_timeout(timeout); ++ spin_lock_irq(&q->lock); ++ __remove_wait_queue(q, &wait); ++ spin_unlock_irqrestore(&q->lock, flags); ++ ++ return timeout; ++} ++ ++void __sched interruptible_sleep_on(wait_queue_head_t *q) ++{ ++ sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); ++} ++EXPORT_SYMBOL(interruptible_sleep_on); ++ ++long __sched ++interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) ++{ ++ return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); ++} ++EXPORT_SYMBOL(interruptible_sleep_on_timeout); ++ ++void __sched sleep_on(wait_queue_head_t *q) ++{ ++ sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); ++} ++EXPORT_SYMBOL(sleep_on); ++ ++long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) ++{ ++ return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); ++} ++EXPORT_SYMBOL(sleep_on_timeout); ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task ++ * @prio: prio value (kernel-internal form) ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance logic. ++ */ ++void rt_mutex_setprio(struct task_struct *p, int prio) ++{ ++ unsigned long flags; ++ int queued, oldprio; ++ struct rq *rq; ++ ++ BUG_ON(prio < 0 || prio > MAX_PRIO); ++ ++ rq = task_grq_lock(p, &flags); ++ ++ trace_sched_pi_setprio(p, prio); ++ oldprio = p->prio; ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p); ++ p->prio = prio; ++ if (task_running(p) && prio > oldprio) ++ resched_task(p); ++ if (queued) { ++ enqueue_task(p); ++ try_preempt(p, rq); ++ } ++ ++ task_grq_unlock(&flags); ++} ++ ++#endif ++ ++/* ++ * Adjust the deadline for when the priority is to change, before it's ++ * changed. ++ */ ++static inline void adjust_deadline(struct task_struct *p, int new_prio) ++{ ++ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); ++} ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int queued, new_static, old_static; ++ unsigned long flags; ++ struct rq *rq; ++ ++ if (TASK_NICE(p) == nice || nice < -20 || nice > 19) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. 
++ */ ++ rq = time_task_grq_lock(p, &flags); ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (has_rt_policy(p)) { ++ p->static_prio = new_static; ++ goto out_unlock; ++ } ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p); ++ ++ adjust_deadline(p, new_static); ++ old_static = p->static_prio; ++ p->static_prio = new_static; ++ p->prio = effective_prio(p); ++ ++ if (queued) { ++ enqueue_task(p); ++ if (new_static < old_static) ++ try_preempt(p, rq); ++ } else if (task_running(p)) { ++ reset_rq_task(rq, p); ++ if (old_static < new_static) ++ resched_task(p); ++ } ++out_unlock: ++ task_grq_unlock(&flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = 20 - nice; ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. ++ */ ++ if (increment < -40) ++ increment = -40; ++ if (increment > 40) ++ increment = 40; ++ ++ nice = TASK_NICE(current) + increment; ++ if (nice < -20) ++ nice = -20; ++ if (nice > 19) ++ nice = 19; ++ ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * This is the priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int delta, prio = p->prio - MAX_RT_PRIO; ++ ++ /* rt tasks and iso tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - grq.niffies); ++ delta = delta * 40 / ms_longest_deadline_diff(); ++ if (delta > 0 && delta <= 80) ++ prio += delta; ++ if (idleprio_task(p)) ++ prio += 40; ++out: ++ return prio; ++} ++ ++/** ++ * task_nice - return the nice value of a given task. ++ * @p: the task in question. ++ */ ++int task_nice(const struct task_struct *p) ++{ ++ return TASK_NICE(p); ++} ++EXPORT_SYMBOL_GPL(task_nice); ++ ++/** ++ * idle_cpu - is a given cpu idle currently? ++ * @cpu: the processor in question. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * idle_task - return the idle task for a given cpu. ++ * @cpu: the processor in question. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. 
++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* Actually do priority change: must hold grq lock. */ ++static void ++__setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio) ++{ ++ int oldrtprio, oldprio; ++ ++ p->policy = policy; ++ oldrtprio = p->rt_priority; ++ p->rt_priority = prio; ++ p->normal_prio = normal_prio(p); ++ oldprio = p->prio; ++ /* we are holding p->pi_lock already */ ++ p->prio = rt_mutex_getprio(p); ++ if (task_running(p)) { ++ reset_rq_task(rq, p); ++ /* Resched only if we might now be preempted */ ++ if (p->prio > oldprio || p->rt_priority > oldrtprio) ++ resched_task(p); ++ } ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ if (cred->user->user_ns == pcred->user->user_ns) ++ match = (cred->euid == pcred->euid || ++ cred->euid == pcred->uid); ++ else ++ match = false; ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool user) ++{ ++ struct sched_param zero_param = { .sched_priority = 0 }; ++ int queued, retval, oldpolicy = -1; ++ unsigned long flags, rlim_rtprio = 0; ++ int reset_on_fork; ++ struct rq *rq; ++ ++ /* may grab non-irq protected spin_locks */ ++ BUG_ON(in_interrupt()); ++ ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ param = &zero_param; ++ } ++recheck: ++ /* double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); ++ policy &= ~SCHED_RESET_ON_FORK; ++ ++ if (!SCHED_RANGE(policy)) ++ return -EINVAL; ++ } ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH is 0. 
++ */ ++ if (param->sched_priority < 0 || ++ (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && param->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if (is_rt_policy(policy) != (param->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (is_rt_policy(policy)) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* can't increase priority */ ++ if (param->sched_priority > p->rt_priority && ++ param->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy == SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } ++ } ++ ++ /* can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ /* ++ * make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * To be able to change p->policy safely, the grunqueue lock must be ++ * held. ++ */ ++ rq = __task_grq_lock(p); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ __task_grq_unlock(); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ return -EINVAL; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || ++ param->sched_priority == p->rt_priority))) { ++ ++ __task_grq_unlock(); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ return 0; ++ } ++ ++ /* recheck policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_grq_unlock(); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ goto recheck; ++ } ++ update_clocks(rq); ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p); ++ __setscheduler(p, rq, policy, param->sched_priority); ++ if (queued) { ++ enqueue_task(p); ++ try_preempt(p, rq); ++ } ++ __task_grq_unlock(); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ rt_mutex_adjust_pi(p); ++out: ++ return 0; ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return __sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. 
++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return __sched_setscheduler(p, policy, param, false); ++} ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setscheduler(p, policy, &lparam); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ */ ++asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, ++ struct sched_param __user *param) ++{ ++ /* negative values for policy are not valid */ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, -1, param); ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? 
-EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) ++ goto out_unlock; ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = set_cpus_allowed_ptr(p, new_mask); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ cpumask_t *new_mask) ++{ ++ if (len < sizeof(cpumask_t)) { ++ memset(new_mask, 0, sizeof(cpumask_t)); ++ } else if (len > sizeof(cpumask_t)) { ++ len = sizeof(cpumask_t); ++ } ++ return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; ++} ++ ++ ++/** ++ * sys_sched_setaffinity - set the cpu affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new cpu mask ++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ unsigned long flags; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ grq_lock_irqsave(&flags); ++ cpumask_and(mask, tsk_cpus_allowed(p), cpu_online_mask); ++ grq_unlock_irqrestore(&flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ put_online_cpus(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the cpu affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current cpu mask ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ size_t retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ */ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ struct task_struct *p; ++ ++ p = current; ++ grq_lock_irq(); ++ schedstat_inc(task_rq(p), yld_count); ++ requeue_task(p); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ __release(grq.lock); ++ spin_release(&grq.lock.dep_map, 1, _THIS_IP_); ++ do_raw_spin_unlock(&grq.lock); ++ preempt_enable_no_resched(); ++ ++ schedule(); ++ ++ return 0; ++} ++ ++static inline bool should_resched(void) ++{ ++ return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); ++} ++ ++static void __cond_resched(void) ++{ ++ /* NOT a real fix but will make voluntary preempt work. 馬鹿ãªäº‹ */ ++ if (unlikely(system_state != SYSTEM_RUNNING)) ++ return; ++ ++ add_preempt_count(PREEMPT_ACTIVE); ++ schedule(); ++ sub_preempt_count(PREEMPT_ACTIVE); ++} ++ ++int __sched _cond_resched(void) ++{ ++ if (should_resched()) { ++ __cond_resched(); ++ return 1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. 
++ * ++ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ __cond_resched(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++int __sched __cond_resched_softirq(void) ++{ ++ BUG_ON(!in_softirq()); ++ ++ if (should_resched()) { ++ local_bh_enable(); ++ __cond_resched(); ++ local_bh_disable(); ++ return 1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(__cond_resched_softirq); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * This is a shortcut for kernel-space yielding - it marks the ++ * thread runnable and calls sys_sched_yield(). ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ sys_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * Returns true if we indeed boosted the target task. ++ */ ++bool __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ unsigned long flags; ++ bool yielded = 0; ++ struct rq *rq; ++ ++ rq = this_rq(); ++ grq_lock_irqsave(&flags); ++ if (task_running(p) || p->state) ++ goto out_unlock; ++ yielded = 1; ++ if (p->deadline > rq->rq_deadline) ++ p->deadline = rq->rq_deadline; ++ p->time_slice += rq->rq_time_slice; ++ rq->rq_time_slice = 0; ++ if (p->time_slice > timeslice()) ++ p->time_slice = timeslice(); ++ set_tsk_need_resched(rq->curr); ++out_unlock: ++ grq_unlock_irqrestore(&flags); ++ ++ if (yielded) ++ schedule(); ++ return yielded; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++void __sched io_schedule(void) ++{ ++ struct rq *rq = raw_rq(); ++ ++ delayacct_blkio_start(); ++ atomic_inc(&rq->nr_iowait); ++ blk_flush_plug(current); ++ current->in_iowait = 1; ++ schedule(); ++ current->in_iowait = 0; ++ atomic_dec(&rq->nr_iowait); ++ delayacct_blkio_end(); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ struct rq *rq = raw_rq(); ++ long ret; ++ ++ delayacct_blkio_start(); ++ atomic_inc(&rq->nr_iowait); ++ blk_flush_plug(current); ++ current->in_iowait = 1; ++ ret = schedule_timeout(timeout); ++ current->in_iowait = 0; ++ atomic_dec(&rq->nr_iowait); ++ delayacct_blkio_end(); ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * this syscall returns the maximum rt_priority that can be used ++ * by a given scheduling class. 
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * this syscall returns the minimum rt_priority that can be used ++ * by a given scheduling class. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * this syscall writes the default timeslice value of a given process ++ * into the user-space timespec buffer. A value of '0' means infinity. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct timespec __user *, interval) ++{ ++ struct task_struct *p; ++ unsigned int time_slice; ++ unsigned long flags; ++ int retval; ++ struct timespec t; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ grq_lock_irqsave(&flags); ++ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); ++ grq_unlock_irqrestore(&flags); ++ ++ rcu_read_unlock(); ++ t = ns_to_timespec(time_slice); ++ retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ unsigned state; ++ ++ state = p->state ? __ffs(p->state) + 1 : 0; ++ printk(KERN_INFO "%-15.15s %c", p->comm, ++ state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); ++#if BITS_PER_LONG == 32 ++ if (state == TASK_RUNNING) ++ printk(KERN_CONT " running "); ++ else ++ printk(KERN_CONT " %08lx ", thread_saved_pc(p)); ++#else ++ if (state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++ else ++ printk(KERN_CONT " %016lx ", thread_saved_pc(p)); ++#endif ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), task_pid_nr(p->real_parent), ++ (unsigned long)task_thread_info(p)->flags); ++ ++ show_stack(p, NULL); ++} ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ do_each_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ */ ++ touch_nmi_watchdog(); ++ if (!state_filter || (p->state & state_filter)) ++ sched_show_task(p); ++ } while_each_thread(g, p); ++ ++ touch_all_softlockup_watchdogs(); ++ ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++#ifdef CONFIG_SMP ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(tsk_cpus_allowed(p), new_mask); ++} ++#endif ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ time_grq_lock(rq, &flags); ++ idle->last_ran = rq->clock; ++ idle->state = TASK_RUNNING; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ set_rq_task(rq, idle); ++ do_set_cpus_allowed(idle, &cpumask_of_cpu(cpu)); ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ rq->curr = rq->idle = idle; ++ idle->on_cpu = 1; ++ grq_unlock_irqrestore(&flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ task_thread_info(idle)->preempt_count = 0; ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++#if defined(CONFIG_SMP) ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) ++/** ++ * lowest_flag_domain - Return lowest sched_domain containing flag. ++ * @cpu: The cpu whose lowest level of sched domain is to ++ * be returned. ++ * @flag: The flag to check for the lowest sched_domain ++ * for the given cpu. ++ * ++ * Returns the lowest sched_domain of a cpu which contains the given flag. ++ */ ++static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) ++ if (sd && (sd->flags & flag)) ++ break; ++ ++ return sd; ++} ++ ++/** ++ * for_each_flag_domain - Iterates over sched_domains containing the flag. ++ * @cpu: The cpu whose domains we're iterating over. ++ * @sd: variable holding the value of the power_savings_sd ++ * for cpu. ++ * @flag: The flag to filter the sched_domains to be iterated. 
++ * ++ * Iterates over all the scheduler domains for a given cpu that has the 'flag' ++ * set, starting from the lowest sched_domain to the highest. ++ */ ++#define for_each_flag_domain(cpu, sd, flag) \ ++ for (sd = lowest_flag_domain(cpu, flag); \ ++ (sd && (sd->flags & flag)); sd = sd->parent) ++ ++#endif /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ ++ ++static inline void resched_cpu(int cpu) ++{ ++ unsigned long flags; ++ ++ grq_lock_irqsave(&flags); ++ resched_task(cpu_curr(cpu)); ++ grq_unlock_irqrestore(&flags); ++} ++ ++/* ++ * In the semi idle case, use the nearest busy cpu for migrating timers ++ * from an idle cpu. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle cpu will add more delays to the timers than intended ++ * (as that cpu's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int cpu = smp_processor_id(); ++ int i; ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(cpu, sd) { ++ for_each_cpu(i, sched_domain_span(sd)) { ++ if (!idle_cpu(i)) ++ cpu = i; ++ goto unlock; ++ } ++ } ++unlock: ++ rcu_read_unlock(); ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ struct task_struct *idle; ++ struct rq *rq; ++ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ rq = cpu_rq(cpu); ++ idle = rq->idle; ++ ++ /* ++ * This is safe, as this function is called with the timer ++ * wheel base lock of (cpu) held. When the CPU is on the way ++ * to idle and has not yet set rq->curr to idle then it will ++ * be serialised on the timer wheel base lock and take the new ++ * timer into account automatically. ++ */ ++ if (unlikely(rq->curr != idle)) ++ return; ++ ++ /* ++ * We can set TIF_RESCHED on the idle task of the other CPU ++ * lockless. The worst case is that the other CPU runs the ++ * idle task through an additional NOOP schedule() ++ */ ++ set_tsk_need_resched(idle); ++ ++ /* NEED_RESCHED must be visible before we test polling */ ++ smp_mb(); ++ if (!tsk_is_polling(idle)) ++ smp_send_reschedule(cpu); ++} ++ ++#endif /* CONFIG_NO_HZ */ ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. 
++ */ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ bool running_wrong = false; ++ bool queued = false; ++ unsigned long flags; ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = task_grq_lock(p, &flags); ++ ++ if (cpumask_equal(tsk_cpus_allowed(p), new_mask)) ++ goto out; ++ ++ if (!cpumask_intersects(new_mask, cpu_active_mask)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ queued = task_queued(p); ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p)) { ++ /* Task is running on the wrong cpu now, reschedule it. */ ++ if (rq == this_rq()) { ++ set_tsk_need_resched(p); ++ running_wrong = true; ++ } else ++ resched_task(p); ++ } else ++ set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask)); ++ ++out: ++ if (queued) ++ try_preempt(p, rq); ++ task_grq_unlock(&flags); ++ ++ if (running_wrong) ++ _cond_resched(); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* Run through task list and find tasks affined to just the dead cpu, then ++ * allocate a new affinity */ ++static void break_sole_affinity(int src_cpu, struct task_struct *idle) ++{ ++ struct task_struct *p, *t; ++ ++ do_each_thread(t, p) { ++ if (p != idle && !online_cpus(p)) { ++ cpumask_copy(tsk_cpus_allowed(p), cpu_possible_mask); ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. ++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk(KERN_INFO "process %d (%s) no " ++ "longer affine to cpu %d\n", ++ task_pid_nr(p), p->comm, src_cpu); ++ } ++ } ++ clear_sticky(p); ++ } while_each_thread(t, p); ++} ++ ++/* ++ * Schedules idle task to be the next runnable task on current CPU. ++ * It does so by boosting its priority to highest possible. ++ * Used by CPU offline code. ++ */ ++void sched_idle_next(struct rq *rq, int this_cpu, struct task_struct *idle) ++{ ++ /* cpu has to be offline */ ++ BUG_ON(cpu_online(this_cpu)); ++ ++ __setscheduler(idle, rq, SCHED_FIFO, STOP_PRIO); ++ ++ activate_idle_task(idle); ++ set_tsk_need_resched(rq->curr); ++} ++ ++/* ++ * Ensures that the idle task is using init_mm right before its cpu goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) ++ switch_mm(mm, &init_mm, current); ++ mmdrop(mm); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = MAX_USER_RT_PRIO - 1 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal rt scheduling prio so that ++ * it can die in pieces. 
++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_FIFO, &start_param); ++ } ++} ++ ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++ ++static struct ctl_table sd_ctl_dir[] = { ++ { ++ .procname = "sched_domain", ++ .mode = 0555, ++ }, ++ {} ++}; ++ ++static struct ctl_table sd_ctl_root[] = { ++ { ++ .procname = "kernel", ++ .mode = 0555, ++ .child = sd_ctl_dir, ++ }, ++ {} ++}; ++ ++static struct ctl_table *sd_alloc_ctl_entry(int n) ++{ ++ struct ctl_table *entry = ++ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); ++ ++ return entry; ++} ++ ++static void sd_free_ctl_entry(struct ctl_table **tablep) ++{ ++ struct ctl_table *entry; ++ ++ /* ++ * In the intermediate directories, both the child directory and ++ * procname are dynamically allocated and could fail but the mode ++ * will always be set. In the lowest directory the names are ++ * static strings and all have proc handlers. ++ */ ++ for (entry = *tablep; entry->mode; entry++) { ++ if (entry->child) ++ sd_free_ctl_entry(&entry->child); ++ if (entry->proc_handler == NULL) ++ kfree(entry->procname); ++ } ++ ++ kfree(*tablep); ++ *tablep = NULL; ++} ++ ++static void ++set_table_entry(struct ctl_table *entry, ++ const char *procname, void *data, int maxlen, ++ mode_t mode, proc_handler *proc_handler) ++{ ++ entry->procname = procname; ++ entry->data = data; ++ entry->maxlen = maxlen; ++ entry->mode = mode; ++ entry->proc_handler = proc_handler; ++} ++ ++static struct ctl_table * ++sd_alloc_ctl_domain_table(struct sched_domain *sd) ++{ ++ struct ctl_table *table = sd_alloc_ctl_entry(13); ++ ++ if (table == NULL) ++ return NULL; ++ ++ set_table_entry(&table[0], "min_interval", &sd->min_interval, ++ sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[1], "max_interval", &sd->max_interval, ++ sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[2], "busy_idx", &sd->busy_idx, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[3], "idle_idx", &sd->idle_idx, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[5], "wake_idx", &sd->wake_idx, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[7], "busy_factor", &sd->busy_factor, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[9], "cache_nice_tries", ++ &sd->cache_nice_tries, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[10], "flags", &sd->flags, ++ sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[11], "name", sd->name, ++ CORENAME_MAX_SIZE, 0444, proc_dostring); ++ /* &table[12] is terminator */ ++ ++ return table; ++} ++ ++static ctl_table *sd_alloc_ctl_cpu_table(int cpu) ++{ ++ struct ctl_table *entry, *table; ++ struct sched_domain *sd; ++ int domain_num = 0, i; ++ char buf[32]; ++ ++ for_each_domain(cpu, sd) ++ domain_num++; ++ entry = table = sd_alloc_ctl_entry(domain_num + 1); ++ if (table == NULL) ++ return NULL; ++ ++ i = 0; ++ for_each_domain(cpu, sd) { ++ snprintf(buf, 32, "domain%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_domain_table(sd); ++ entry++; ++ i++; ++ } ++ return table; ++} ++ ++static struct 
ctl_table_header *sd_sysctl_header; ++static void register_sched_domain_sysctl(void) ++{ ++ int i, cpu_num = num_possible_cpus(); ++ struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); ++ char buf[32]; ++ ++ WARN_ON(sd_ctl_dir[0].child); ++ sd_ctl_dir[0].child = entry; ++ ++ if (entry == NULL) ++ return; ++ ++ for_each_possible_cpu(i) { ++ snprintf(buf, 32, "cpu%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_cpu_table(i); ++ entry++; ++ } ++ ++ WARN_ON(sd_sysctl_header); ++ sd_sysctl_header = register_sysctl_table(sd_ctl_root); ++} ++ ++/* may be called multiple times per register */ ++static void unregister_sched_domain_sysctl(void) ++{ ++ if (sd_sysctl_header) ++ unregister_sysctl_table(sd_sysctl_header); ++ sd_sysctl_header = NULL; ++ if (sd_ctl_dir[0].child) ++ sd_free_ctl_entry(&sd_ctl_dir[0].child); ++} ++#else ++static void register_sched_domain_sysctl(void) ++{ ++} ++static void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) { ++ cpumask_set_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = true; ++ } ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) { ++ cpumask_clear_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = false; ++ } ++} ++ ++/* ++ * migration_call - callback that gets triggered when a CPU is added. ++ */ ++static int __cpuinit ++migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) ++{ ++ int cpu = (long)hcpu; ++ unsigned long flags; ++ struct rq *rq = cpu_rq(cpu); ++#ifdef CONFIG_HOTPLUG_CPU ++ struct task_struct *idle = rq->idle; ++#endif ++ ++ switch (action & ~CPU_TASKS_FROZEN) { ++ ++ case CPU_UP_PREPARE: ++ break; ++ ++ case CPU_ONLINE: ++ /* Update our root-domain */ ++ grq_lock_irqsave(&flags); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ ++ set_rq_online(rq); ++ } ++ grq.noc = num_online_cpus(); ++ grq_unlock_irqrestore(&flags); ++ break; ++ ++#ifdef CONFIG_HOTPLUG_CPU ++ case CPU_DEAD: ++ /* Idle task back to normal (off runqueue, low prio) */ ++ grq_lock_irq(); ++ return_task(idle, true); ++ idle->static_prio = MAX_PRIO; ++ __setscheduler(idle, rq, SCHED_NORMAL, 0); ++ idle->prio = PRIO_LIMIT; ++ set_rq_task(rq, idle); ++ update_clocks(rq); ++ grq_unlock_irq(); ++ break; ++ ++ case CPU_DYING: ++ /* Update our root-domain */ ++ grq_lock_irqsave(&flags); ++ sched_idle_next(rq, cpu, idle); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ break_sole_affinity(cpu, idle); ++ grq.noc = num_online_cpus(); ++ grq_unlock_irqrestore(&flags); ++ break; ++#endif ++ } ++ return NOTIFY_OK; ++} ++ ++/* ++ * Register at high priority so that task migration (migrate_all_tasks) ++ * happens before everything else. This has to be lower priority than ++ * the notifier in the perf_counter subsystem, though. 
++ */ ++static struct notifier_block __cpuinitdata migration_notifier = { ++ .notifier_call = migration_call, ++ .priority = CPU_PRI_MIGRATION, ++}; ++ ++static int __cpuinit sched_cpu_active(struct notifier_block *nfb, ++ unsigned long action, void *hcpu) ++{ ++ switch (action & ~CPU_TASKS_FROZEN) { ++ case CPU_ONLINE: ++ case CPU_DOWN_FAILED: ++ set_cpu_active((long)hcpu, true); ++ return NOTIFY_OK; ++ default: ++ return NOTIFY_DONE; ++ } ++} ++ ++static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, ++ unsigned long action, void *hcpu) ++{ ++ switch (action & ~CPU_TASKS_FROZEN) { ++ case CPU_DOWN_PREPARE: ++ set_cpu_active((long)hcpu, false); ++ return NOTIFY_OK; ++ default: ++ return NOTIFY_DONE; ++ } ++} ++ ++int __init migration_init(void) ++{ ++ void *cpu = (void *)(long)smp_processor_id(); ++ int err; ++ ++ /* Initialise migration for the boot CPU */ ++ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); ++ BUG_ON(err == NOTIFY_BAD); ++ migration_call(&migration_notifier, CPU_ONLINE, cpu); ++ register_cpu_notifier(&migration_notifier); ++ ++ /* Register cpu active notifiers */ ++ cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); ++ cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); ++ ++ return 0; ++} ++early_initcall(migration_init); ++#endif ++ ++#ifdef CONFIG_SMP ++ ++static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++ ++static __read_mostly int sched_domain_debug_enabled; ++ ++static int __init sched_domain_debug_setup(char *str) ++{ ++ sched_domain_debug_enabled = 1; ++ ++ return 0; ++} ++early_param("sched_debug", sched_domain_debug_setup); ++ ++static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, ++ struct cpumask *groupmask) ++{ ++ struct sched_group *group = sd->groups; ++ char str[256]; ++ ++ cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); ++ cpumask_clear(groupmask); ++ ++ printk(KERN_DEBUG "%*s domain %d: ", level, "", level); ++ ++ if (!(sd->flags & SD_LOAD_BALANCE)) { ++ printk("does not load-balance\n"); ++ if (sd->parent) ++ printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" ++ " has parent"); ++ return -1; ++ } ++ ++ printk(KERN_CONT "span %s level %s\n", str, sd->name); ++ ++ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ printk(KERN_ERR "ERROR: domain->span does not contain " ++ "CPU%d\n", cpu); ++ } ++ if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { ++ printk(KERN_ERR "ERROR: domain->groups does not contain" ++ " CPU%d\n", cpu); ++ } ++ ++ printk(KERN_DEBUG "%*s groups:", level + 1, ""); ++ do { ++ if (!group) { ++ printk("\n"); ++ printk(KERN_ERR "ERROR: group is NULL\n"); ++ break; ++ } ++ ++ if (!group->sgp->power) { ++ printk(KERN_CONT "\n"); ++ printk(KERN_ERR "ERROR: domain->cpu_power not " ++ "set\n"); ++ break; ++ } ++ ++ if (!cpumask_weight(sched_group_cpus(group))) { ++ printk(KERN_CONT "\n"); ++ printk(KERN_ERR "ERROR: empty group\n"); ++ break; ++ } ++ ++ if (cpumask_intersects(groupmask, sched_group_cpus(group))) { ++ printk(KERN_CONT "\n"); ++ printk(KERN_ERR "ERROR: repeated CPUs\n"); ++ break; ++ } ++ ++ cpumask_or(groupmask, groupmask, sched_group_cpus(group)); ++ ++ cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); ++ ++ printk(KERN_CONT " %s", str); ++ if (group->sgp->power != SCHED_POWER_SCALE) { ++ printk(KERN_CONT " (cpu_power = %d)", ++ group->sgp->power); ++ } ++ ++ group = group->next; ++ } while (group != sd->groups); ++ printk(KERN_CONT "\n"); ++ ++ if 
(!cpumask_equal(sched_domain_span(sd), groupmask)) ++ printk(KERN_ERR "ERROR: groups don't span domain->span\n"); ++ ++ if (sd->parent && ++ !cpumask_subset(groupmask, sched_domain_span(sd->parent))) ++ printk(KERN_ERR "ERROR: parent span is not a superset " ++ "of domain->span\n"); ++ return 0; ++} ++ ++static void sched_domain_debug(struct sched_domain *sd, int cpu) ++{ ++ int level = 0; ++ ++ if (!sched_domain_debug_enabled) ++ return; ++ ++ if (!sd) { ++ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); ++ return; ++ } ++ ++ printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); ++ ++ for (;;) { ++ if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) ++ break; ++ level++; ++ sd = sd->parent; ++ if (!sd) ++ break; ++ } ++} ++#else /* !CONFIG_SCHED_DEBUG */ ++# define sched_domain_debug(sd, cpu) do { } while (0) ++#endif /* CONFIG_SCHED_DEBUG */ ++ ++static int sd_degenerate(struct sched_domain *sd) ++{ ++ if (cpumask_weight(sched_domain_span(sd)) == 1) ++ return 1; ++ ++ /* Following flags need at least 2 groups */ ++ if (sd->flags & (SD_LOAD_BALANCE | ++ SD_BALANCE_NEWIDLE | ++ SD_BALANCE_FORK | ++ SD_BALANCE_EXEC | ++ SD_SHARE_CPUPOWER | ++ SD_SHARE_PKG_RESOURCES)) { ++ if (sd->groups != sd->groups->next) ++ return 0; ++ } ++ ++ /* Following flags don't use groups */ ++ if (sd->flags & (SD_WAKE_AFFINE)) ++ return 0; ++ ++ return 1; ++} ++ ++static int ++sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) ++{ ++ unsigned long cflags = sd->flags, pflags = parent->flags; ++ ++ if (sd_degenerate(parent)) ++ return 1; ++ ++ if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) ++ return 0; ++ ++ /* Flags needing groups don't count if only 1 group in parent */ ++ if (parent->groups == parent->groups->next) { ++ pflags &= ~(SD_LOAD_BALANCE | ++ SD_BALANCE_NEWIDLE | ++ SD_BALANCE_FORK | ++ SD_BALANCE_EXEC | ++ SD_SHARE_CPUPOWER | ++ SD_SHARE_PKG_RESOURCES); ++ if (nr_node_ids == 1) ++ pflags &= ~SD_SERIALIZE; ++ } ++ if (~cflags & pflags) ++ return 0; ++ ++ return 1; ++} ++ ++static void free_rootdomain(struct rcu_head *rcu) ++{ ++ struct root_domain *rd = container_of(rcu, struct root_domain, rcu); ++ ++ cpupri_cleanup(&rd->cpupri); ++ free_cpumask_var(rd->rto_mask); ++ free_cpumask_var(rd->online); ++ free_cpumask_var(rd->span); ++ kfree(rd); ++} ++ ++static void rq_attach_root(struct rq *rq, struct root_domain *rd) ++{ ++ struct root_domain *old_rd = NULL; ++ unsigned long flags; ++ ++ grq_lock_irqsave(&flags); ++ ++ if (rq->rd) { ++ old_rd = rq->rd; ++ ++ if (cpumask_test_cpu(rq->cpu, old_rd->online)) ++ set_rq_offline(rq); ++ ++ cpumask_clear_cpu(rq->cpu, old_rd->span); ++ ++ /* ++ * If we dont want to free the old_rt yet then ++ * set old_rd to NULL to skip the freeing later ++ * in this function: ++ */ ++ if (!atomic_dec_and_test(&old_rd->refcount)) ++ old_rd = NULL; ++ } ++ ++ atomic_inc(&rd->refcount); ++ rq->rd = rd; ++ ++ cpumask_set_cpu(rq->cpu, rd->span); ++ if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) ++ set_rq_online(rq); ++ ++ grq_unlock_irqrestore(&flags); ++ ++ if (old_rd) ++ call_rcu_sched(&old_rd->rcu, free_rootdomain); ++} ++ ++static int init_rootdomain(struct root_domain *rd) ++{ ++ memset(rd, 0, sizeof(*rd)); ++ ++ if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) ++ goto out; ++ if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) ++ goto free_span; ++ if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) ++ goto free_online; ++ ++ if (cpupri_init(&rd->cpupri) != 0) ++ goto free_rto_mask; ++ return 0; ++ 
++free_rto_mask: ++ free_cpumask_var(rd->rto_mask); ++free_online: ++ free_cpumask_var(rd->online); ++free_span: ++ free_cpumask_var(rd->span); ++out: ++ return -ENOMEM; ++} ++ ++static void init_defrootdomain(void) ++{ ++ init_rootdomain(&def_root_domain); ++ ++ atomic_set(&def_root_domain.refcount, 1); ++} ++ ++static struct root_domain *alloc_rootdomain(void) ++{ ++ struct root_domain *rd; ++ ++ rd = kmalloc(sizeof(*rd), GFP_KERNEL); ++ if (!rd) ++ return NULL; ++ ++ if (init_rootdomain(rd) != 0) { ++ kfree(rd); ++ return NULL; ++ } ++ ++ return rd; ++} ++ ++static void free_sched_groups(struct sched_group *sg, int free_sgp) ++{ ++ struct sched_group *tmp, *first; ++ ++ if (!sg) ++ return; ++ ++ first = sg; ++ do { ++ tmp = sg->next; ++ ++ if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) ++ kfree(sg->sgp); ++ ++ kfree(sg); ++ sg = tmp; ++ } while (sg != first); ++} ++ ++static void free_sched_domain(struct rcu_head *rcu) ++{ ++ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); ++ ++ /* ++ * If its an overlapping domain it has private groups, iterate and ++ * nuke them all. ++ */ ++ if (sd->flags & SD_OVERLAP) { ++ free_sched_groups(sd->groups, 1); ++ } else if (atomic_dec_and_test(&sd->groups->ref)) { ++ kfree(sd->groups->sgp); ++ kfree(sd->groups); ++ } ++ kfree(sd); ++} ++ ++static void destroy_sched_domain(struct sched_domain *sd, int cpu) ++{ ++ call_rcu(&sd->rcu, free_sched_domain); ++} ++ ++static void destroy_sched_domains(struct sched_domain *sd, int cpu) ++{ ++ for (; sd; sd = sd->parent) ++ destroy_sched_domain(sd, cpu); ++} ++ ++/* ++ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must ++ * hold the hotplug lock. ++ */ ++static void ++cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct sched_domain *tmp; ++ ++ /* Remove the sched domains which do not contribute to scheduling. */ ++ for (tmp = sd; tmp; ) { ++ struct sched_domain *parent = tmp->parent; ++ if (!parent) ++ break; ++ ++ if (sd_parent_degenerate(tmp, parent)) { ++ tmp->parent = parent->parent; ++ if (parent->parent) ++ parent->parent->child = tmp; ++ destroy_sched_domain(parent, cpu); ++ } else ++ tmp = tmp->parent; ++ } ++ ++ if (sd && sd_degenerate(sd)) { ++ tmp = sd; ++ sd = sd->parent; ++ destroy_sched_domain(tmp, cpu); ++ if (sd) ++ sd->child = NULL; ++ } ++ ++ sched_domain_debug(sd, cpu); ++ ++ rq_attach_root(rq, rd); ++ tmp = rq->sd; ++ rcu_assign_pointer(rq->sd, sd); ++ destroy_sched_domains(tmp, cpu); ++} ++ ++/* cpus with isolated domains */ ++static cpumask_var_t cpu_isolated_map; ++ ++/* Setup the mask of cpus configured for isolated domains */ ++static int __init isolated_cpu_setup(char *str) ++{ ++ alloc_bootmem_cpumask_var(&cpu_isolated_map); ++ cpulist_parse(str, cpu_isolated_map); ++ return 1; ++} ++ ++__setup("isolcpus=", isolated_cpu_setup); ++ ++#define SD_NODES_PER_DOMAIN 16 ++ ++#ifdef CONFIG_NUMA ++ ++/** ++ * find_next_best_node - find the next node to include in a sched_domain ++ * @node: node whose sched_domain we're building ++ * @used_nodes: nodes already in the sched_domain ++ * ++ * Find the next node to include in a given scheduling domain. Simply ++ * finds the closest node not already in the @used_nodes map. ++ * ++ * Should use nodemask_t. 
++ */ ++static int find_next_best_node(int node, nodemask_t *used_nodes) ++{ ++ int i, n, val, min_val, best_node = -1; ++ ++ min_val = INT_MAX; ++ ++ for (i = 0; i < nr_node_ids; i++) { ++ /* Start at @node */ ++ n = (node + i) % nr_node_ids; ++ ++ if (!nr_cpus_node(n)) ++ continue; ++ ++ /* Skip already used nodes */ ++ if (node_isset(n, *used_nodes)) ++ continue; ++ ++ /* Simple min distance search */ ++ val = node_distance(node, n); ++ ++ if (val < min_val) { ++ min_val = val; ++ best_node = n; ++ } ++ } ++ ++ if (best_node != -1) ++ node_set(best_node, *used_nodes); ++ return best_node; ++} ++ ++/** ++ * sched_domain_node_span - get a cpumask for a node's sched_domain ++ * @node: node whose cpumask we're constructing ++ * @span: resulting cpumask ++ * ++ * Given a node, construct a good cpumask for its sched_domain to span. It ++ * should be one that prevents unnecessary balancing, but also spreads tasks ++ * out optimally. ++ */ ++static void sched_domain_node_span(int node, struct cpumask *span) ++{ ++ nodemask_t used_nodes; ++ int i; ++ ++ cpumask_clear(span); ++ nodes_clear(used_nodes); ++ ++ cpumask_or(span, span, cpumask_of_node(node)); ++ node_set(node, used_nodes); ++ ++ for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { ++ int next_node = find_next_best_node(node, &used_nodes); ++ if (next_node < 0) ++ break; ++ cpumask_or(span, span, cpumask_of_node(next_node)); ++ } ++} ++ ++static const struct cpumask *cpu_node_mask(int cpu) ++{ ++ lockdep_assert_held(&sched_domains_mutex); ++ ++ sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); ++ ++ return sched_domains_tmpmask; ++} ++ ++static const struct cpumask *cpu_allnodes_mask(int cpu) ++{ ++ return cpu_possible_mask; ++} ++#endif /* CONFIG_NUMA */ ++ ++static const struct cpumask *cpu_cpu_mask(int cpu) ++{ ++ return cpumask_of_node(cpu_to_node(cpu)); ++} ++ ++int sched_smt_power_savings = 0, sched_mc_power_savings = 0; ++ ++struct sd_data { ++ struct sched_domain **__percpu sd; ++ struct sched_group **__percpu sg; ++ struct sched_group_power **__percpu sgp; ++}; ++ ++struct s_data { ++ struct sched_domain ** __percpu sd; ++ struct root_domain *rd; ++}; ++ ++enum s_alloc { ++ sa_rootdomain, ++ sa_sd, ++ sa_sd_storage, ++ sa_none, ++}; ++ ++struct sched_domain_topology_level; ++ ++typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); ++typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); ++ ++#define SDTL_OVERLAP 0x01 ++ ++struct sched_domain_topology_level { ++ sched_domain_init_f init; ++ sched_domain_mask_f mask; ++ int flags; ++ struct sd_data data; ++}; ++ ++static int ++build_overlap_sched_groups(struct sched_domain *sd, int cpu) ++{ ++ struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; ++ const struct cpumask *span = sched_domain_span(sd); ++ struct cpumask *covered = sched_domains_tmpmask; ++ struct sd_data *sdd = sd->private; ++ struct sched_domain *child; ++ int i; ++ ++ cpumask_clear(covered); ++ ++ for_each_cpu(i, span) { ++ struct cpumask *sg_span; ++ ++ if (cpumask_test_cpu(i, covered)) ++ continue; ++ ++ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), ++ GFP_KERNEL, cpu_to_node(i)); ++ ++ if (!sg) ++ goto fail; ++ ++ sg_span = sched_group_cpus(sg); ++ ++ child = *per_cpu_ptr(sdd->sd, i); ++ if (child->child) { ++ child = child->child; ++ cpumask_copy(sg_span, sched_domain_span(child)); ++ } else ++ cpumask_set_cpu(i, sg_span); ++ ++ cpumask_or(covered, covered, sg_span); ++ ++ sg->sgp = *per_cpu_ptr(sdd->sgp, 
cpumask_first(sg_span)); ++ atomic_inc(&sg->sgp->ref); ++ ++ if (cpumask_test_cpu(cpu, sg_span)) ++ groups = sg; ++ ++ if (!first) ++ first = sg; ++ if (last) ++ last->next = sg; ++ last = sg; ++ last->next = first; ++ } ++ sd->groups = groups; ++ ++ return 0; ++ ++fail: ++ free_sched_groups(first, 0); ++ ++ return -ENOMEM; ++} ++ ++static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) ++{ ++ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); ++ struct sched_domain *child = sd->child; ++ ++ if (child) ++ cpu = cpumask_first(sched_domain_span(child)); ++ ++ if (sg) { ++ *sg = *per_cpu_ptr(sdd->sg, cpu); ++ (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); ++ atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ ++ } ++ ++ return cpu; ++} ++ ++/* ++ * build_sched_groups will build a circular linked list of the groups ++ * covered by the given span, and will set each group's ->cpumask correctly, ++ * and ->cpu_power to 0. ++ * ++ * Assumes the sched_domain tree is fully constructed ++ */ ++static int ++build_sched_groups(struct sched_domain *sd, int cpu) ++{ ++ struct sched_group *first = NULL, *last = NULL; ++ struct sd_data *sdd = sd->private; ++ const struct cpumask *span = sched_domain_span(sd); ++ struct cpumask *covered; ++ int i; ++ ++ get_group(cpu, sdd, &sd->groups); ++ atomic_inc(&sd->groups->ref); ++ ++ if (cpu != cpumask_first(sched_domain_span(sd))) ++ return 0; ++ ++ lockdep_assert_held(&sched_domains_mutex); ++ covered = sched_domains_tmpmask; ++ ++ cpumask_clear(covered); ++ ++ for_each_cpu(i, span) { ++ struct sched_group *sg; ++ int group = get_group(i, sdd, &sg); ++ int j; ++ ++ if (cpumask_test_cpu(i, covered)) ++ continue; ++ ++ cpumask_clear(sched_group_cpus(sg)); ++ sg->sgp->power = 0; ++ ++ for_each_cpu(j, span) { ++ if (get_group(j, sdd, NULL) != group) ++ continue; ++ ++ cpumask_set_cpu(j, covered); ++ cpumask_set_cpu(j, sched_group_cpus(sg)); ++ } ++ ++ if (!first) ++ first = sg; ++ if (last) ++ last->next = sg; ++ last = sg; ++ } ++ last->next = first; ++ ++ return 0; ++} ++ ++/* ++ * Initializers for schedule domains ++ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() ++ */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SD_INIT_NAME(sd, type) sd->name = #type ++#else ++# define SD_INIT_NAME(sd, type) do { } while (0) ++#endif ++ ++#define SD_INIT_FUNC(type) \ ++static noinline struct sched_domain * \ ++sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ ++{ \ ++ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ ++ *sd = SD_##type##_INIT; \ ++ SD_INIT_NAME(sd, type); \ ++ sd->private = &tl->data; \ ++ return sd; \ ++} ++ ++SD_INIT_FUNC(CPU) ++#ifdef CONFIG_NUMA ++ SD_INIT_FUNC(ALLNODES) ++ SD_INIT_FUNC(NODE) ++#endif ++#ifdef CONFIG_SCHED_SMT ++ SD_INIT_FUNC(SIBLING) ++#endif ++#ifdef CONFIG_SCHED_MC ++ SD_INIT_FUNC(MC) ++#endif ++#ifdef CONFIG_SCHED_BOOK ++ SD_INIT_FUNC(BOOK) ++#endif ++ ++static int default_relax_domain_level = -1; ++int sched_domain_level_max; ++ ++static int __init setup_relax_domain_level(char *str) ++{ ++ unsigned long val; ++ ++ val = simple_strtoul(str, NULL, 0); ++ if (val < sched_domain_level_max) ++ default_relax_domain_level = val; ++ ++ return 1; ++} ++__setup("relax_domain_level=", setup_relax_domain_level); ++ ++static void set_domain_attribute(struct sched_domain *sd, ++ struct sched_domain_attr *attr) ++{ ++ int request; ++ ++ if (!attr || attr->relax_domain_level < 0) { ++ if (default_relax_domain_level < 0) ++ return; ++ else ++ request = 
default_relax_domain_level; ++ } else ++ request = attr->relax_domain_level; ++ if (request < sd->level) { ++ /* turn off idle balance on this domain */ ++ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); ++ } else { ++ /* turn on idle balance on this domain */ ++ sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); ++ } ++} ++ ++static void __sdt_free(const struct cpumask *cpu_map); ++static int __sdt_alloc(const struct cpumask *cpu_map); ++ ++static void __free_domain_allocs(struct s_data *d, enum s_alloc what, ++ const struct cpumask *cpu_map) ++{ ++ switch (what) { ++ case sa_rootdomain: ++ if (!atomic_read(&d->rd->refcount)) ++ free_rootdomain(&d->rd->rcu); /* fall through */ ++ case sa_sd: ++ free_percpu(d->sd); /* fall through */ ++ case sa_sd_storage: ++ __sdt_free(cpu_map); /* fall through */ ++ case sa_none: ++ break; ++ } ++} ++ ++static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, ++ const struct cpumask *cpu_map) ++{ ++ memset(d, 0, sizeof(*d)); ++ ++ if (__sdt_alloc(cpu_map)) ++ return sa_sd_storage; ++ d->sd = alloc_percpu(struct sched_domain *); ++ if (!d->sd) ++ return sa_sd_storage; ++ d->rd = alloc_rootdomain(); ++ if (!d->rd) ++ return sa_sd; ++ return sa_rootdomain; ++} ++ ++/* ++ * NULL the sd_data elements we've used to build the sched_domain and ++ * sched_group structure so that the subsequent __free_domain_allocs() ++ * will not free the data we're using. ++ */ ++static void claim_allocations(int cpu, struct sched_domain *sd) ++{ ++ struct sd_data *sdd = sd->private; ++ ++ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); ++ *per_cpu_ptr(sdd->sd, cpu) = NULL; ++ ++ if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) ++ *per_cpu_ptr(sdd->sg, cpu) = NULL; ++ ++ if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) ++ *per_cpu_ptr(sdd->sgp, cpu) = NULL; ++} ++ ++#ifdef CONFIG_SCHED_SMT ++static const struct cpumask *cpu_smt_mask(int cpu) ++{ ++ return topology_thread_cpumask(cpu); ++} ++#endif ++ ++/* ++ * Topology list, bottom-up. 
++ */ ++static struct sched_domain_topology_level default_topology[] = { ++#ifdef CONFIG_SCHED_SMT ++ { sd_init_SIBLING, cpu_smt_mask, }, ++#endif ++#ifdef CONFIG_SCHED_MC ++ { sd_init_MC, cpu_coregroup_mask, }, ++#endif ++#ifdef CONFIG_SCHED_BOOK ++ { sd_init_BOOK, cpu_book_mask, }, ++#endif ++ { sd_init_CPU, cpu_cpu_mask, }, ++#ifdef CONFIG_NUMA ++ { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, ++ { sd_init_ALLNODES, cpu_allnodes_mask, }, ++#endif ++ { NULL, }, ++}; ++ ++static struct sched_domain_topology_level *sched_domain_topology = default_topology; ++ ++static int __sdt_alloc(const struct cpumask *cpu_map) ++{ ++ struct sched_domain_topology_level *tl; ++ int j; ++ ++ for (tl = sched_domain_topology; tl->init; tl++) { ++ struct sd_data *sdd = &tl->data; ++ ++ sdd->sd = alloc_percpu(struct sched_domain *); ++ if (!sdd->sd) ++ return -ENOMEM; ++ ++ sdd->sg = alloc_percpu(struct sched_group *); ++ if (!sdd->sg) ++ return -ENOMEM; ++ ++ sdd->sgp = alloc_percpu(struct sched_group_power *); ++ if (!sdd->sgp) ++ return -ENOMEM; ++ ++ for_each_cpu(j, cpu_map) { ++ struct sched_domain *sd; ++ struct sched_group *sg; ++ struct sched_group_power *sgp; ++ ++ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), ++ GFP_KERNEL, cpu_to_node(j)); ++ if (!sd) ++ return -ENOMEM; ++ ++ *per_cpu_ptr(sdd->sd, j) = sd; ++ ++ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), ++ GFP_KERNEL, cpu_to_node(j)); ++ if (!sg) ++ return -ENOMEM; ++ ++ *per_cpu_ptr(sdd->sg, j) = sg; ++ ++ sgp = kzalloc_node(sizeof(struct sched_group_power), ++ GFP_KERNEL, cpu_to_node(j)); ++ if (!sgp) ++ return -ENOMEM; ++ ++ *per_cpu_ptr(sdd->sgp, j) = sgp; ++ } ++ } ++ ++ return 0; ++} ++ ++static void __sdt_free(const struct cpumask *cpu_map) ++{ ++ struct sched_domain_topology_level *tl; ++ int j; ++ ++ for (tl = sched_domain_topology; tl->init; tl++) { ++ struct sd_data *sdd = &tl->data; ++ ++ for_each_cpu(j, cpu_map) { ++ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); ++ if (sd && (sd->flags & SD_OVERLAP)) ++ free_sched_groups(sd->groups, 0); ++ kfree(*per_cpu_ptr(sdd->sd, j)); ++ kfree(*per_cpu_ptr(sdd->sg, j)); ++ kfree(*per_cpu_ptr(sdd->sgp, j)); ++ } ++ free_percpu(sdd->sd); ++ free_percpu(sdd->sg); ++ free_percpu(sdd->sgp); ++ } ++} ++ ++struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, ++ struct s_data *d, const struct cpumask *cpu_map, ++ struct sched_domain_attr *attr, struct sched_domain *child, ++ int cpu) ++{ ++ struct sched_domain *sd = tl->init(tl, cpu); ++ if (!sd) ++ return child; ++ ++ set_domain_attribute(sd, attr); ++ cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); ++ if (child) { ++ sd->level = child->level + 1; ++ sched_domain_level_max = max(sched_domain_level_max, sd->level); ++ child->parent = sd; ++ } ++ sd->child = child; ++ ++ return sd; ++} ++ ++/* ++ * Build sched domains for a given set of cpus and attach the sched domains ++ * to the individual cpus ++ */ ++static int build_sched_domains(const struct cpumask *cpu_map, ++ struct sched_domain_attr *attr) ++{ ++ enum s_alloc alloc_state = sa_none; ++ struct sched_domain *sd; ++ struct s_data d; ++ int i, ret = -ENOMEM; ++ ++ alloc_state = __visit_domain_allocation_hell(&d, cpu_map); ++ if (alloc_state != sa_rootdomain) ++ goto error; ++ ++ /* Set up domains for cpus specified by the cpu_map. 
*/ ++ for_each_cpu(i, cpu_map) { ++ struct sched_domain_topology_level *tl; ++ ++ sd = NULL; ++ for (tl = sched_domain_topology; tl->init; tl++) { ++ sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); ++ if (tl->flags & SDTL_OVERLAP) ++ sd->flags |= SD_OVERLAP; ++ if (cpumask_equal(cpu_map, sched_domain_span(sd))) ++ break; ++ } ++ ++ while (sd->child) ++ sd = sd->child; ++ ++ *per_cpu_ptr(d.sd, i) = sd; ++ } ++ ++ /* Build the groups for the domains */ ++ for_each_cpu(i, cpu_map) { ++ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { ++ sd->span_weight = cpumask_weight(sched_domain_span(sd)); ++ if (sd->flags & SD_OVERLAP) { ++ if (build_overlap_sched_groups(sd, i)) ++ goto error; ++ } else { ++ if (build_sched_groups(sd, i)) ++ goto error; ++ } ++ } ++ } ++ ++ /* Calculate CPU power for physical packages and nodes */ ++ for (i = nr_cpumask_bits-1; i >= 0; i--) { ++ if (!cpumask_test_cpu(i, cpu_map)) ++ continue; ++ ++ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { ++ claim_allocations(i, sd); ++ } ++ } ++ ++ /* Attach the domains */ ++ rcu_read_lock(); ++ for_each_cpu(i, cpu_map) { ++ sd = *per_cpu_ptr(d.sd, i); ++ cpu_attach_domain(sd, d.rd, i); ++ } ++ rcu_read_unlock(); ++ ++ ret = 0; ++error: ++ __free_domain_allocs(&d, alloc_state, cpu_map); ++ return ret; ++} ++ ++static cpumask_var_t *doms_cur; /* current sched domains */ ++static int ndoms_cur; /* number of sched domains in 'doms_cur' */ ++static struct sched_domain_attr *dattr_cur; ++ /* attribues of custom domains in 'doms_cur' */ ++ ++/* ++ * Special case: If a kmalloc of a doms_cur partition (array of ++ * cpumask) fails, then fallback to a single sched domain, ++ * as determined by the single cpumask fallback_doms. ++ */ ++static cpumask_var_t fallback_doms; ++ ++/* ++ * arch_update_cpu_topology lets virtualized architectures update the ++ * cpu core maps. It is supposed to return 1 if the topology changed ++ * or 0 if it stayed the same. ++ */ ++int __attribute__((weak)) arch_update_cpu_topology(void) ++{ ++ return 0; ++} ++ ++cpumask_var_t *alloc_sched_domains(unsigned int ndoms) ++{ ++ int i; ++ cpumask_var_t *doms; ++ ++ doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); ++ if (!doms) ++ return NULL; ++ for (i = 0; i < ndoms; i++) { ++ if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { ++ free_sched_domains(doms, i); ++ return NULL; ++ } ++ } ++ return doms; ++} ++ ++void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) ++{ ++ unsigned int i; ++ for (i = 0; i < ndoms; i++) ++ free_cpumask_var(doms[i]); ++ kfree(doms); ++} ++ ++/* ++ * Set up scheduler domains and groups. Callers must hold the hotplug lock. ++ * For now this just excludes isolated cpus, but could be used to ++ * exclude other special cases in the future. 
++ */ ++static int init_sched_domains(const struct cpumask *cpu_map) ++{ ++ int err; ++ ++ arch_update_cpu_topology(); ++ ndoms_cur = 1; ++ doms_cur = alloc_sched_domains(ndoms_cur); ++ if (!doms_cur) ++ doms_cur = &fallback_doms; ++ cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); ++ dattr_cur = NULL; ++ err = build_sched_domains(doms_cur[0], NULL); ++ register_sched_domain_sysctl(); ++ ++ return err; ++} ++ ++/* ++ * Detach sched domains from a group of cpus specified in cpu_map ++ * These cpus will now be attached to the NULL domain ++ */ ++static void detach_destroy_domains(const struct cpumask *cpu_map) ++{ ++ int i; ++ ++ rcu_read_lock(); ++ for_each_cpu(i, cpu_map) ++ cpu_attach_domain(NULL, &def_root_domain, i); ++ rcu_read_unlock(); ++} ++ ++/* handle null as "default" */ ++static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, ++ struct sched_domain_attr *new, int idx_new) ++{ ++ struct sched_domain_attr tmp; ++ ++ /* fast path */ ++ if (!new && !cur) ++ return 1; ++ ++ tmp = SD_ATTR_INIT; ++ return !memcmp(cur ? (cur + idx_cur) : &tmp, ++ new ? (new + idx_new) : &tmp, ++ sizeof(struct sched_domain_attr)); ++} ++ ++/* ++ * Partition sched domains as specified by the 'ndoms_new' ++ * cpumasks in the array doms_new[] of cpumasks. This compares ++ * doms_new[] to the current sched domain partitioning, doms_cur[]. ++ * It destroys each deleted domain and builds each new domain. ++ * ++ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. ++ * The masks don't intersect (don't overlap.) We should setup one ++ * sched domain for each mask. CPUs not in any of the cpumasks will ++ * not be load balanced. If the same cpumask appears both in the ++ * current 'doms_cur' domains and in the new 'doms_new', we can leave ++ * it as it is. ++ * ++ * The passed in 'doms_new' should be allocated using ++ * alloc_sched_domains. This routine takes ownership of it and will ++ * free_sched_domains it when done with it. If the caller failed the ++ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, ++ * and partition_sched_domains() will fallback to the single partition ++ * 'fallback_doms', it also forces the domains to be rebuilt. ++ * ++ * If doms_new == NULL it will be replaced with cpu_online_mask. ++ * ndoms_new == 0 is a special case for destroying existing domains, ++ * and it will not create the default domain. ++ * ++ * Call with hotplug lock held ++ */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{ ++ int i, j, n; ++ int new_topology; ++ ++ mutex_lock(&sched_domains_mutex); ++ ++ /* always unregister in case we don't destroy any domains */ ++ unregister_sched_domain_sysctl(); ++ ++ /* Let architecture update cpu core mappings. */ ++ new_topology = arch_update_cpu_topology(); ++ ++ n = doms_new ? 
ndoms_new : 0; ++ ++ /* Destroy deleted domains */ ++ for (i = 0; i < ndoms_cur; i++) { ++ for (j = 0; j < n && !new_topology; j++) { ++ if (cpumask_equal(doms_cur[i], doms_new[j]) ++ && dattrs_equal(dattr_cur, i, dattr_new, j)) ++ goto match1; ++ } ++ /* no match - a current sched domain not in new doms_new[] */ ++ detach_destroy_domains(doms_cur[i]); ++match1: ++ ; ++ } ++ ++ if (doms_new == NULL) { ++ ndoms_cur = 0; ++ doms_new = &fallback_doms; ++ cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); ++ WARN_ON_ONCE(dattr_new); ++ } ++ ++ /* Build new domains */ ++ for (i = 0; i < ndoms_new; i++) { ++ for (j = 0; j < ndoms_cur && !new_topology; j++) { ++ if (cpumask_equal(doms_new[i], doms_cur[j]) ++ && dattrs_equal(dattr_new, i, dattr_cur, j)) ++ goto match2; ++ } ++ /* no match - add a new doms_new */ ++ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); ++match2: ++ ; ++ } ++ ++ /* Remember the new sched domains */ ++ if (doms_cur != &fallback_doms) ++ free_sched_domains(doms_cur, ndoms_cur); ++ kfree(dattr_cur); /* kfree(NULL) is safe */ ++ doms_cur = doms_new; ++ dattr_cur = dattr_new; ++ ndoms_cur = ndoms_new; ++ ++ register_sched_domain_sysctl(); ++ ++ mutex_unlock(&sched_domains_mutex); ++} ++ ++#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) ++static void reinit_sched_domains(void) ++{ ++ get_online_cpus(); ++ ++ /* Destroy domains first to force the rebuild */ ++ partition_sched_domains(0, NULL, NULL); ++ ++ rebuild_sched_domains(); ++ put_online_cpus(); ++} ++ ++static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) ++{ ++ unsigned int level = 0; ++ ++ if (sscanf(buf, "%u", &level) != 1) ++ return -EINVAL; ++ ++ /* ++ * level is always be positive so don't check for ++ * level < POWERSAVINGS_BALANCE_NONE which is 0 ++ * What happens on 0 or 1 byte write, ++ * need to check for count as well? 
++ */ ++ ++ if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) ++ return -EINVAL; ++ ++ if (smt) ++ sched_smt_power_savings = level; ++ else ++ sched_mc_power_savings = level; ++ ++ reinit_sched_domains(); ++ ++ return count; ++} ++ ++#ifdef CONFIG_SCHED_MC ++static ssize_t sched_mc_power_savings_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "%u\n", sched_mc_power_savings); ++} ++static ssize_t sched_mc_power_savings_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return sched_power_savings_store(buf, count, 0); ++} ++static DEVICE_ATTR(sched_mc_power_savings, 0644, ++ sched_mc_power_savings_show, ++ sched_mc_power_savings_store); ++#endif ++ ++#ifdef CONFIG_SCHED_SMT ++static ssize_t sched_smt_power_savings_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "%u\n", sched_smt_power_savings); ++} ++static ssize_t sched_smt_power_savings_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return sched_power_savings_store(buf, count, 1); ++} ++static DEVICE_ATTR(sched_smt_power_savings, 0644, ++ sched_smt_power_savings_show, ++ sched_smt_power_savings_store); ++#endif ++ ++int __init sched_create_sysfs_power_savings_entries(struct device *dev) ++{ ++ int err = 0; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (smt_capable()) ++ err = device_create_file(dev, &dev_attr_sched_smt_power_savings); ++#endif ++#ifdef CONFIG_SCHED_MC ++ if (!err && mc_capable()) ++ err = device_create_file(dev, &dev_attr_sched_mc_power_savings); ++#endif ++ return err; ++} ++#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ */ ++static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, ++ void *hcpu) ++{ ++ switch (action & ~CPU_TASKS_FROZEN) { ++ case CPU_ONLINE: ++ case CPU_DOWN_FAILED: ++ cpuset_update_active_cpus(); ++ return NOTIFY_OK; ++ default: ++ return NOTIFY_DONE; ++ } ++} ++ ++static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, ++ void *hcpu) ++{ ++ switch (action & ~CPU_TASKS_FROZEN) { ++ case CPU_DOWN_PREPARE: ++ cpuset_update_active_cpus(); ++ return NOTIFY_OK; ++ default: ++ return NOTIFY_DONE; ++ } ++} ++ ++#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) ++/* ++ * Cheaper version of the below functions in case support for SMT and MC is ++ * compiled in but CPUs have no siblings. 
++ */ ++static bool sole_cpu_idle(int cpu) ++{ ++ return rq_idle(cpu_rq(cpu)); ++} ++#endif ++#ifdef CONFIG_SCHED_SMT ++/* All this CPU's SMT siblings are idle */ ++static bool siblings_cpu_idle(int cpu) ++{ ++ return cpumask_subset(&(cpu_rq(cpu)->smt_siblings), ++ &grq.cpu_idle_map); ++} ++#endif ++#ifdef CONFIG_SCHED_MC ++/* All this CPU's shared cache siblings are idle */ ++static bool cache_cpu_idle(int cpu) ++{ ++ return cpumask_subset(&(cpu_rq(cpu)->cache_siblings), ++ &grq.cpu_idle_map); ++} ++#endif ++ ++enum sched_domain_level { ++ SD_LV_NONE = 0, ++ SD_LV_SIBLING, ++ SD_LV_MC, ++ SD_LV_BOOK, ++ SD_LV_CPU, ++ SD_LV_NODE, ++ SD_LV_ALLNODES, ++ SD_LV_MAX ++}; ++ ++void __init sched_init_smp(void) ++{ ++ struct sched_domain *sd; ++ int cpu; ++ ++ cpumask_var_t non_isolated_cpus; ++ ++ alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); ++ alloc_cpumask_var(&fallback_doms, GFP_KERNEL); ++ ++ get_online_cpus(); ++ mutex_lock(&sched_domains_mutex); ++ init_sched_domains(cpu_active_mask); ++ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); ++ if (cpumask_empty(non_isolated_cpus)) ++ cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); ++ mutex_unlock(&sched_domains_mutex); ++ put_online_cpus(); ++ ++ hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); ++ hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); ++ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) ++ BUG(); ++ free_cpumask_var(non_isolated_cpus); ++ ++ grq_lock_irq(); ++ /* ++ * Set up the relative cache distance of each online cpu from each ++ * other in a simple array for quick lookup. Locality is determined ++ * by the closest sched_domain that CPUs are separated by. CPUs with ++ * shared cache in SMT and MC are treated as local. Separate CPUs ++ * (within the same package or physically) within the same node are ++ * treated as not local. CPUs not even in the same domain (different ++ * nodes) are treated as very distant. ++ */ ++ for_each_online_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ for_each_domain(cpu, sd) { ++ int locality, other_cpu; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sd->level == SD_LV_SIBLING) { ++ for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) ++ cpumask_set_cpu(other_cpu, &rq->smt_siblings); ++ } ++#endif ++#ifdef CONFIG_SCHED_MC ++ if (sd->level == SD_LV_MC) { ++ for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) ++ cpumask_set_cpu(other_cpu, &rq->cache_siblings); ++ } ++#endif ++ if (sd->level <= SD_LV_SIBLING) ++ locality = 1; ++ else if (sd->level <= SD_LV_MC) ++ locality = 2; ++ else if (sd->level <= SD_LV_NODE) ++ locality = 3; ++ else ++ continue; ++ ++ for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) { ++ if (locality < rq->cpu_locality[other_cpu]) ++ rq->cpu_locality[other_cpu] = locality; ++ } ++ } ++ ++/* ++ * Each runqueue has its own function in case it doesn't have ++ * siblings of its own allowing mixed topologies. 
++ */ ++#ifdef CONFIG_SCHED_SMT ++ if (cpus_weight(rq->smt_siblings) > 1) ++ rq->siblings_idle = siblings_cpu_idle; ++#endif ++#ifdef CONFIG_SCHED_MC ++ if (cpus_weight(rq->cache_siblings) > 1) ++ rq->cache_idle = cache_cpu_idle; ++#endif ++ } ++ grq_unlock_irq(); ++} ++#else ++void __init sched_init_smp(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++unsigned int sysctl_timer_migration = 1; ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ prio_ratios[0] = 128; ++ for (i = 1 ; i < PRIO_RANGE ; i++) ++ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; ++ ++ raw_spin_lock_init(&grq.lock); ++ grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; ++ grq.niffies = 0; ++ grq.last_jiffy = jiffies; ++ raw_spin_lock_init(&grq.iso_lock); ++ grq.iso_ticks = 0; ++ grq.iso_refractory = false; ++ grq.noc = 1; ++#ifdef CONFIG_SMP ++ init_defrootdomain(); ++ grq.qnr = grq.idle_cpus = 0; ++ cpumask_clear(&grq.cpu_idle_map); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = ++ rq->iowait_pc = rq->idle_pc = 0; ++ rq->dither = false; ++#ifdef CONFIG_SMP ++ rq->sticky_task = NULL; ++ rq->last_niffy = 0; ++ rq->sd = NULL; ++ rq->rd = NULL; ++ rq->online = false; ++ rq->cpu = i; ++ rq_attach_root(rq, &def_root_domain); ++#endif ++ atomic_set(&rq->nr_iowait, 0); ++ } ++ ++#ifdef CONFIG_SMP ++ nr_cpu_ids = i; ++ /* ++ * Set the base locality for cpu cache distance calculation to ++ * "distant" (3). Make sure the distance from a CPU to itself is 0. ++ */ ++ for_each_possible_cpu(i) { ++ int j; ++ ++ rq = cpu_rq(i); ++#ifdef CONFIG_SCHED_SMT ++ cpumask_clear(&rq->smt_siblings); ++ cpumask_set_cpu(i, &rq->smt_siblings); ++ rq->siblings_idle = sole_cpu_idle; ++ cpumask_set_cpu(i, &rq->smt_siblings); ++#endif ++#ifdef CONFIG_SCHED_MC ++ cpumask_clear(&rq->cache_siblings); ++ cpumask_set_cpu(i, &rq->cache_siblings); ++ rq->cache_idle = sole_cpu_idle; ++ cpumask_set_cpu(i, &rq->cache_siblings); ++#endif ++ rq->cpu_locality = kmalloc(nr_cpu_ids * sizeof(int *), GFP_ATOMIC); ++ for_each_possible_cpu(j) { ++ if (i == j) ++ rq->cpu_locality[j] = 0; ++ else ++ rq->cpu_locality[j] = 4; ++ } ++ } ++#endif ++ ++ for (i = 0; i < PRIO_LIMIT; i++) ++ INIT_LIST_HEAD(grq.queue + i); ++ /* delimiter for bitsearch */ ++ __set_bit(PRIO_LIMIT, grq.prio_bitmap); ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&init_task.preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_RT_MUTEXES ++ plist_head_init(&init_task.pi_waiters); ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ atomic_inc(&init_mm.mm_count); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". 
++ */ ++ init_idle(current, smp_processor_id()); ++ ++#ifdef CONFIG_SMP ++ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); ++ /* May be allocated at isolcpus cmdline parse time */ ++ if (cpu_isolated_map == NULL) ++ zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); ++#endif /* SMP */ ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; /* ratelimiting */ ++ ++ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || ++ system_state != SYSTEM_RUNNING || oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ dump_stack(); ++} ++EXPORT_SYMBOL(__might_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ unsigned long flags; ++ struct rq *rq; ++ int queued; ++ ++ read_lock_irq(&tasklist_lock); ++ ++ do_each_thread(g, p) { ++ if (!rt_task(p) && !iso_task(p)) ++ continue; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_grq_lock(p); ++ ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p); ++ __setscheduler(p, rq, SCHED_NORMAL, 0); ++ if (queued) { ++ enqueue_task(p); ++ try_preempt(p, rq); ++ } ++ ++ __task_grq_unlock(); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ } while_each_thread(g, p); ++ ++ read_unlock_irq(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given cpu. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * set_curr_task - set the current task for a given cpu. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a cpu in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
++ */ ++void set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++/* ++ * Use precise platform statistics if available: ++ */ ++#ifdef CONFIG_VIRT_CPU_ACCOUNTING ++void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) ++{ ++ *ut = p->utime; ++ *st = p->stime; ++} ++ ++void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) ++{ ++ struct task_cputime cputime; ++ ++ thread_group_cputime(p, &cputime); ++ ++ *ut = cputime.utime; ++ *st = cputime.stime; ++} ++#else ++ ++void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) ++{ ++ cputime_t rtime, utime = p->utime, total = utime + p->stime; ++ ++ rtime = nsecs_to_cputime(p->sched_time); ++ ++ if (total) { ++ u64 temp; ++ ++ temp = (u64)(rtime * utime); ++ do_div(temp, total); ++ utime = (cputime_t)temp; ++ } else ++ utime = rtime; ++ ++ /* ++ * Compare with previous values, to keep monotonicity: ++ */ ++ p->prev_utime = max(p->prev_utime, utime); ++ p->prev_stime = max(p->prev_stime, (rtime - p->prev_utime)); ++ ++ *ut = p->prev_utime; ++ *st = p->prev_stime; ++} ++ ++/* ++ * Must be called with siglock held. ++ */ ++void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) ++{ ++ struct signal_struct *sig = p->signal; ++ struct task_cputime cputime; ++ cputime_t rtime, utime, total; ++ ++ thread_group_cputime(p, &cputime); ++ ++ total = cputime.utime + cputime.stime; ++ rtime = nsecs_to_cputime(cputime.sum_exec_runtime); ++ ++ if (total) { ++ u64 temp; ++ ++ temp = (u64)(rtime * cputime.utime); ++ do_div(temp, total); ++ utime = (cputime_t)temp; ++ } else ++ utime = rtime; ++ ++ sig->prev_utime = max(sig->prev_utime, utime); ++ sig->prev_stime = max(sig->prev_stime, (rtime - sig->prev_utime)); ++ ++ *ut = sig->prev_utime; ++ *st = sig->prev_stime; ++} ++#endif ++ ++inline cputime_t task_gtime(struct task_struct *p) ++{ ++ return p->gtime; ++} ++ ++void __cpuinit init_idle_bootup_task(struct task_struct *idle) ++{} ++ ++#ifdef CONFIG_SCHED_DEBUG ++void proc_sched_show_task(struct task_struct *p, struct seq_file *m) ++{} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_SMP ++unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) ++{ ++ return SCHED_LOAD_SCALE; ++} ++ ++unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) ++{ ++ unsigned long weight = cpumask_weight(sched_domain_span(sd)); ++ unsigned long smt_gain = sd->smt_gain; ++ ++ smt_gain /= weight; ++ ++ return smt_gain; ++} ++#endif +Index: linux-3.3-ck1/kernel/sched/Makefile +=================================================================== +--- linux-3.3-ck1.orig/kernel/sched/Makefile 2012-03-24 19:30:00.014420399 +1100 ++++ linux-3.3-ck1/kernel/sched/Makefile 2012-03-24 19:30:29.047925897 +1100 +@@ -11,10 +11,14 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + ++ifdef CONFIG_SCHED_BFS ++obj-y += bfs.o clock.o ++else + obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o +-obj-$(CONFIG_SMP) += cpupri.o + obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o ++endif ++obj-$(CONFIG_SMP) += cpupri.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + + +Index: linux-3.3-ck1/mm/vmscan.c +=================================================================== +--- linux-3.3-ck1.orig/mm/vmscan.c 2012-03-24 19:29:59.987419928 +1100 ++++ linux-3.3-ck1/mm/vmscan.c 2012-03-24 19:34:53.640769520 
+1100 +@@ -153,7 +153,7 @@ struct mem_cgroup_zone { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 10; + long vm_total_pages; /* The total number of pages which the VM controls */ + + static LIST_HEAD(shrinker_list); +@@ -999,7 +999,7 @@ cull_mlocked: + + activate_locked: + /* Not a candidate for swapping, so reclaim swap space. */ +- if (PageSwapCache(page) && vm_swap_full()) ++ if (PageSwapCache(page)) + try_to_free_swap(page); + VM_BUG_ON(PageActive(page)); + SetPageActive(page); +@@ -2206,6 +2206,35 @@ static inline bool compaction_ready(stru + } + + /* ++ * Helper functions to adjust nice level of kswapd, based on the priority of ++ * the task (p) that called it. If it is already higher priority we do not ++ * demote its nice level since it is still working on behalf of a higher ++ * priority task. With kernel threads we leave it at nice 0. ++ * ++ * We don't ever run kswapd real time, so if a real time task calls kswapd we ++ * set it to highest SCHED_NORMAL priority. ++ */ ++static inline int effective_sc_prio(struct task_struct *p) ++{ ++ if (likely(p->mm)) { ++ if (rt_task(p)) ++ return -20; ++ if (p->policy == SCHED_IDLEPRIO) ++ return 19; ++ return task_nice(p); ++ } ++ return 0; ++} ++ ++static void set_kswapd_nice(struct task_struct *kswapd, int active) ++{ ++ long nice = effective_sc_prio(current); ++ ++ if (task_nice(kswapd) > nice || !active) ++ set_user_nice(kswapd, nice); ++} ++ ++/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. +@@ -3090,6 +3119,7 @@ static int kswapd(void *p) + void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) + { + pg_data_t *pgdat; ++ int active; + + if (!populated_zone(zone)) + return; +@@ -3101,7 +3131,9 @@ void wakeup_kswapd(struct zone *zone, in + pgdat->kswapd_max_order = order; + pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); + } +- if (!waitqueue_active(&pgdat->kswapd_wait)) ++ active = waitqueue_active(&pgdat->kswapd_wait); ++ set_kswapd_nice(pgdat->kswapd, active); ++ if (!active) + return; + if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + return; +Index: linux-3.3-ck1/include/linux/swap.h +=================================================================== +--- linux-3.3-ck1.orig/include/linux/swap.h 2012-03-24 19:29:59.953419335 +1100 ++++ linux-3.3-ck1/include/linux/swap.h 2012-03-24 19:30:29.779938643 +1100 +@@ -201,7 +201,7 @@ struct swap_list_t { + int next; /* swapfile to be used next */ + }; + +-/* Swap 50% full? Release swapcache more aggressively.. */ ++/* Swap 50% full? 
*/ + #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) + + /* linux/mm/page_alloc.c */ +@@ -351,9 +351,10 @@ extern void grab_swap_token(struct mm_st + extern void __put_swap_token(struct mm_struct *); + extern void disable_swap_token(struct mem_cgroup *memcg); + ++/* Only allow swap token to have effect if swap is full */ + static inline int has_swap_token(struct mm_struct *mm) + { +- return (mm == swap_token_mm); ++ return (mm == swap_token_mm && vm_swap_full()); + } + + static inline void put_swap_token(struct mm_struct *mm) +Index: linux-3.3-ck1/mm/memory.c +=================================================================== +--- linux-3.3-ck1.orig/mm/memory.c 2012-03-24 19:29:59.916418690 +1100 ++++ linux-3.3-ck1/mm/memory.c 2012-03-24 19:30:29.780938660 +1100 +@@ -3003,7 +3003,7 @@ static int do_swap_page(struct mm_struct + mem_cgroup_commit_charge_swapin(page, ptr); + + swap_free(entry); +- if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) ++ if ((vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + try_to_free_swap(page); + unlock_page(page); + if (swapcache) { +Index: linux-3.3-ck1/mm/swapfile.c +=================================================================== +--- linux-3.3-ck1.orig/mm/swapfile.c 2012-03-24 19:29:59.915418673 +1100 ++++ linux-3.3-ck1/mm/swapfile.c 2012-03-24 19:30:29.780938660 +1100 +@@ -288,7 +288,7 @@ checks: + scan_base = offset = si->lowest_bit; + + /* reuse swap entry of cache-only swap if not busy. */ +- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { ++ if (si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; + spin_unlock(&swap_lock); + swap_was_freed = __try_to_reclaim_swap(si, offset); +@@ -377,7 +377,7 @@ scan: + spin_lock(&swap_lock); + goto checks; + } +- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { ++ if (si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&swap_lock); + goto checks; + } +@@ -392,7 +392,7 @@ scan: + spin_lock(&swap_lock); + goto checks; + } +- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { ++ if (si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&swap_lock); + goto checks; + } +@@ -706,8 +706,7 @@ int free_swap_and_cache(swp_entry_t entr + * Not mapped elsewhere, or swap space full? Free it! + * Also recheck PageSwapCache now page is locked (above). 
+ */ +- if (PageSwapCache(page) && !PageWriteback(page) && +- (!page_mapped(page) || vm_swap_full())) { ++ if (PageSwapCache(page) && !PageWriteback(page)) { + delete_from_swap_cache(page); + SetPageDirty(page); + } +Index: linux-3.3-ck1/mm/page-writeback.c +=================================================================== +--- linux-3.3-ck1.orig/mm/page-writeback.c 2012-03-24 19:31:32.335037467 +1100 ++++ linux-3.3-ck1/mm/page-writeback.c 2012-03-24 19:34:53.649769687 +1100 +@@ -65,7 +65,7 @@ static long ratelimit_pages = 32; + /* + * Start background writeback (via writeback threads) at this percentage + */ +-int dirty_background_ratio = 10; ++int dirty_background_ratio = 1; + + /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of +@@ -82,7 +82,7 @@ int vm_highmem_is_dirtyable; + /* + * The generator of dirty data starts writeback at this percentage + */ +-int vm_dirty_ratio = 20; ++int vm_dirty_ratio = 1; + + /* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of +Index: linux-3.3-ck1/kernel/Kconfig.hz +=================================================================== +--- linux-3.3-ck1.orig/kernel/Kconfig.hz 2012-03-24 19:31:32.301036837 +1100 ++++ linux-3.3-ck1/kernel/Kconfig.hz 2012-03-24 19:34:53.685770353 +1100 +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_1000 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -23,13 +23,14 @@ choice + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEFAULT + bool "250 HZ" + help +- 250 Hz is a good compromise choice allowing server performance +- while also showing good interactive responsiveness even +- on SMP and NUMA systems. If you are going to be using NTSC video +- or multimedia, selected 300Hz instead. ++ 250 HZ is a lousy compromise choice allowing server interactivity ++ while also showing desktop throughput and no extra power saving on ++ laptops. No good for anything. ++ ++ Recommend 100 or 1000 instead. + + config HZ_300 + bool "300 HZ" +@@ -43,16 +44,82 @@ choice + bool "1000 HZ" + help + 1000 Hz is the preferred choice for desktop systems and other +- systems requiring fast interactive responses to events. ++ systems requiring fast interactive responses to events. Laptops ++ can also benefit from this choice without sacrificing battery life ++ if dynticks is also enabled. ++ ++ config HZ_1500 ++ bool "1500 HZ" ++ help ++ 1500 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_2000 ++ bool "2000 HZ" ++ help ++ 2000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_3000 ++ bool "3000 HZ" ++ help ++ 3000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_4000 ++ bool "4000 HZ" ++ help ++ 4000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_5000 ++ bool "5000 HZ" ++ help ++ 5000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. 
++ ++ config HZ_7500 ++ bool "7500 HZ" ++ help ++ 7500 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_10000 ++ bool "10000 HZ" ++ help ++ 10000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ + + endchoice + + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 ++ default 250 if HZ_250_NODEFAULT + default 300 if HZ_300 + default 1000 if HZ_1000 ++ default 1500 if HZ_1500 ++ default 2000 if HZ_2000 ++ default 3000 if HZ_3000 ++ default 4000 if HZ_4000 ++ default 5000 if HZ_5000 ++ default 7500 if HZ_7500 ++ default 10000 if HZ_10000 + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) +Index: linux-3.3-ck1/arch/x86/kernel/cpu/proc.c +=================================================================== +--- linux-3.3-ck1.orig/arch/x86/kernel/cpu/proc.c 2012-03-24 19:31:32.230035518 +1100 ++++ linux-3.3-ck1/arch/x86/kernel/cpu/proc.c 2012-03-24 19:34:53.684770335 +1100 +@@ -109,7 +109,7 @@ static int show_cpuinfo(struct seq_file + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10 /(50000/HZ)) % 100); + + #ifdef CONFIG_X86_64 + if (c->x86_tlbsize > 0) +Index: linux-3.3-ck1/arch/x86/kernel/smpboot.c +=================================================================== +--- linux-3.3-ck1.orig/arch/x86/kernel/smpboot.c 2012-03-24 19:31:32.230035518 +1100 ++++ linux-3.3-ck1/arch/x86/kernel/smpboot.c 2012-03-24 19:34:53.685770353 +1100 +@@ -436,7 +436,7 @@ static void impress_friends(void) + "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + num_online_cpus(), + bogosum/(500000/HZ), +- (bogosum/(5000/HZ))%100); ++ (bogosum * 10/(50000/HZ))%100); + + pr_debug("Before bogocount - setting activated=1.\n"); + } +Index: linux-3.3-ck1/include/linux/nfsd/stats.h +=================================================================== +--- linux-3.3-ck1.orig/include/linux/nfsd/stats.h 2012-03-24 19:31:32.229035499 +1100 ++++ linux-3.3-ck1/include/linux/nfsd/stats.h 2012-03-24 19:34:53.685770353 +1100 +@@ -11,8 +11,8 @@ + + #include + +-/* thread usage wraps very million seconds (approx one fortnight) */ +-#define NFSD_USAGE_WRAP (HZ*1000000) ++/* thread usage wraps every one hundred thousand seconds (approx one day) */ ++#define NFSD_USAGE_WRAP (HZ*100000) + + #ifdef __KERNEL__ + +Index: linux-3.3-ck1/include/net/inet_timewait_sock.h +=================================================================== +--- linux-3.3-ck1.orig/include/net/inet_timewait_sock.h 2012-03-24 19:31:32.229035499 +1100 ++++ linux-3.3-ck1/include/net/inet_timewait_sock.h 2012-03-24 19:34:53.685770353 +1100 +@@ -38,8 +38,8 @@ struct inet_hashinfo; + * If time > 4sec, it is "slow" path, no recycling is required, + * so that we select tick to get range about 4 seconds. 
+ */ +-#if HZ <= 16 || HZ > 4096 +-# error Unsupported: HZ <= 16 or HZ > 4096 ++#if HZ <= 16 || HZ > 16384 ++# error Unsupported: HZ <= 16 or HZ > 16384 + #elif HZ <= 32 + # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 64 +@@ -54,8 +54,12 @@ struct inet_hashinfo; + # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 2048 + # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +-#else ++#elif HZ <= 4096 + # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#elif HZ <= 8192 ++# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#else ++# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #endif + + /* TIME_WAIT reaping mechanism. */ +Index: linux-3.3-ck1/init/calibrate.c +=================================================================== +--- linux-3.3-ck1.orig/init/calibrate.c 2012-03-24 19:31:32.230035518 +1100 ++++ linux-3.3-ck1/init/calibrate.c 2012-03-24 19:34:53.685770353 +1100 +@@ -293,7 +293,7 @@ void __cpuinit calibrate_delay(void) + if (!printed) + pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n", + lpj/(500000/HZ), +- (lpj/(5000/HZ)) % 100, lpj); ++ (lpj * 10 /(50000 / HZ)) % 100, lpj); + + loops_per_jiffy = lpj; + printed = true; +Index: linux-3.3-ck1/kernel/Kconfig.preempt +=================================================================== +--- linux-3.3-ck1.orig/kernel/Kconfig.preempt 2012-03-24 19:31:32.213035203 +1100 ++++ linux-3.3-ck1/kernel/Kconfig.preempt 2012-03-24 19:34:53.694770519 +1100 +@@ -1,7 +1,7 @@ + + choice + prompt "Preemption Model" +- default PREEMPT_NONE ++ default PREEMPT + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" +@@ -17,7 +17,7 @@ config PREEMPT_NONE + latencies. + + config PREEMPT_VOLUNTARY +- bool "Voluntary Kernel Preemption (Desktop)" ++ bool "Voluntary Kernel Preemption (Nothing)" + help + This option reduces the latency of the kernel by adding more + "explicit preemption points" to the kernel code. These new +@@ -31,7 +31,8 @@ config PREEMPT_VOLUNTARY + applications to run more 'smoothly' even when the system is + under load. + +- Select this if you are building a kernel for a desktop system. ++ Select this for no system in particular (choose Preemptible ++ instead on a desktop if you know what's good for you). + + config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" +Index: linux-3.3-ck1/Makefile +=================================================================== +--- linux-3.3-ck1.orig/Makefile 2012-03-24 19:31:32.197034905 +1100 ++++ linux-3.3-ck1/Makefile 2012-03-24 19:34:53.703770687 +1100 +@@ -10,6 +10,10 @@ NAME = Saber-toothed Squirrel + # Comments in this file are targeted only to the developer, do not + # expect to learn how to build the kernel reading this file. 
+ ++CKVERSION = -ck1 ++CKNAME = BFS Powered ++EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) ++ + # Do not: + # o use make's built-in rules and variables + # (this increases performance and avoids hard-to-debug behaviour); diff --git a/3.3.8/3rd-3rdparty-1.0-tree.patch b/3.3.8/3rd-3rdparty-1.0-tree.patch new file mode 100644 index 0000000..2a6ed72 --- /dev/null +++ b/3.3.8/3rd-3rdparty-1.0-tree.patch @@ -0,0 +1,181 @@ + + 3rdparty/mkbuild.pl | 92 +++++++++++++++++++++++++++++++++++++++++++++ + Documentation/3rdparty.txt | 76 +++++++++++++++++++++++++++++++++++++ + 2 files changed, 168 insertions(+) + +diff -Nurp linux-2.6.37/3rdparty/mkbuild.pl 3rdparty/mkbuild.pl +--- linux-2.6.37/3rdparty/mkbuild.pl 1970-01-01 02:00:00.000000000 +0200 ++++ 3rdparty/mkbuild.pl 2004-04-23 14:59:03.000000000 +0300 +@@ -0,0 +1,92 @@ ++#!/usr/bin/perl -w ++# ++# Version 1.0 ++# ++# Copyright 2001 Jeff Garzik ++# Copyright 2002 Juan Quintela ++# Copyright 2003 Nicolas Planel ++# ++# This software may be used and distributed according to the terms ++# of the GNU General Public License, incorporated herein by reference. ++# ++# ++# Run "mkbuild.pl" ++# ++# This program generates the following files ++# Makefile ++# Makefile.drivers ++# Config.in ++# using the information in the subdirs of this directory. ++# ++# subdirs need to have: ++# a Config.in file ++# a Makefile with a O_TARGET/L_TARGET targets ++# The config.in should set a CONFIG_ to m/y. ++ ++use strict; ++ ++opendir(THISDIR, "."); ++# get dirs without . and .. garbage ++my (@modules) = grep(!/\.\.?$/,grep(-d, readdir(THISDIR))); ++closedir(THISDIR); ++ ++generate_kconfig(@modules); ++generate_makefile(@modules); ++exit(0); ++ ++########################################################################## ++ ++sub generate_makefile { ++ my (@modules) = @_; ++ ++ local *F; ++ open F, "> Makefile" or die "Cannot create new Makefile: $!\n"; ++ print F <<'EOM'; ++# ++# THIS IS AN AUTOMATICALLY GENERATED FILE. DO NOT EDIT. ++# ++ ++EOM ++ printf F "obj- := 3rdparty.o # Dummy rule to force built-in.o to be made\n"; ++ printf F "obj-\$(%s) += %s\n", to_CONFIG($_), $_ . '/' foreach @modules; ++} ++ ++sub generate_kconfig { ++ my (@modules) = @_; ++ ++ local *F; ++ open F, "> Kconfig" or die "Cannot create Kconfig: $!\n"; ++ print F <<"EOM"; ++# ++# THIS IS AN AUTOMATICALLY GENERATED FILE. DO NOT EDIT. ++# ++ ++menu "Unofficial 3rd party kernel additions" ++ ++EOM ++ ++ foreach (@modules) { ++ die "No Kconfig in $_.\n" if ! -r "$_/Kconfig"; ++ print F "source 3rdparty/$_/Kconfig\n"; ++ } ++ print F "\n\nendmenu\n"; ++} ++ ++sub to_CONFIG { ++ local $_ = $_[0]; ++ tr/a-z/A-Z/; ++ s/[\-\. ]/_/g; ++ "CONFIG_$_"; ++} ++ ++sub find_target { ++ my ($module_dir) = @_; ++ ++ local *F; ++ open(F, "$module_dir/Makefile") or die "$module_dir/Makefile: $!\n"; ++ while () { ++ chomp; ++ return $1 if (/[LO]_TARGET.*:=\s+(\S+)/); ++ } ++} ++ +diff -Nurp linux-2.6.37/Documentation/3rdparty.txt Documentation/3rdparty.txt +--- linux-2.6.37/Documentation/3rdparty.txt 1970-01-01 02:00:00.000000000 +0200 ++++ Documentation/3rdparty.txt 2003-11-22 01:07:26.000000000 +0200 +@@ -0,0 +1,76 @@ ++ ++Third-Party Kernel Source Module Support, or ++an easy way to add modules to your kernel build. ++ ++ ++ ++Vendors quite often add additional drivers and features to the kernel ++which require nothing more than modifying Kconfig, Makefile, and ++adding one or more files to a sub-directory. As a single discrete task, ++this is not a problem. 
However, using patches to add modules to the ++kernel very often results in patch conflicts, resulting in needless time ++wastage as developers regenerate an otherwise working kernel patch. ++ ++This is designed as a solution to these problems. It is NOT designed as ++a replacement for the kernel build system, but merely as a tool for ++vendors and system administrators to ease the pain of patch management. ++ ++The key feature of this system is the distinct lack of patches. Drivers ++are installed via unpacking a tarball. ++ ++ ++ ++Adding a directory to the build (usually from a tarball) ++-------------------------------------------------------- ++If a directory exists inside the 3rdparty sub-directory that contains a ++proper Makefile, it can be added to the build. It also needs a ++Kconfig file. ++ ++ cd /usr/src/linux-2.4.3/3rdparty ++ bzcat /tmp/my-driver2.tar.bz2 | tar xf - # creates "my2" dir ++ ++ ++Limitations ++----------- ++There are some limitations to this system. This system is only ++designed to support a very common case. If you find yourself running ++into limitations (kernel build experts can spot them right off), ++then you should probably be patching the kernel instead of using ++mkbuild.pl for that particular module. ++ ++FIXME: actually list the limitations ++ ++ ++ ++Other notes ++----------- ++Link order is controlled by the order of mkbuild.pl executions. ++ ++"make mrproper" will erase Makefile.meta, and empty Kconfig, Makefile, ++and Makefile.drivers. ++ ++IMPORTANT NOTE: Because this feature modifies the kernel's makefiles and ++configuration system, you MUST complete all mkbuild.pl runs before ++running any "make" command. ++ ++Building in the 3rdparty dir ++---------------------------- ++ ++If you use modules that: ++ - are contained in one subdir with the name of the module ++ - has a Makefile ++ - has a Kconfig file ++ ++The system calls the ./mkbuild.pl script. It will search for ++subdirectories, and will try to build each of them as a module. ++Things to note: ++ ++ The dependencies will be done in a module called: ++ ++ 3rdparty// ++ ++depending of CONFIG_. ++ ++ is the value of O_TARGET/L_TARGET. 
++ ++ diff --git a/3.3.8/3rd-3rdparty-button_hotplug-0.4.1.patch b/3.3.8/3rd-3rdparty-button_hotplug-0.4.1.patch new file mode 100644 index 0000000..a7b4a15 --- /dev/null +++ b/3.3.8/3rd-3rdparty-button_hotplug-0.4.1.patch @@ -0,0 +1,372 @@ +Submitted By: Mario Fetka (mario dot fetka at gmail dot com) +Date: 2012-11-18 +Initial Package Version: 3.2.33 +Origin: openwtr.org packages/system/button-hotplug +Upstream Status: unknown +Description: create uevents from button usage + +diff -Naur linux-3.2.33-go.orig/3rdparty/button_hotplug/Kconfig 3rdparty/button_hotplug/Kconfig +--- linux-3.2.33-go.orig/3rdparty/button_hotplug/Kconfig 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/button_hotplug/Kconfig 2012-11-18 14:45:26.000000000 +0000 +@@ -0,0 +1,2 @@ ++config BUTTON_HOTPLUG ++ tristate "Button Hotplug driver" +diff -Naur linux-3.2.33-go.orig/3rdparty/button_hotplug/Makefile 3rdparty/button_hotplug/Makefile +--- linux-3.2.33-go.orig/3rdparty/button_hotplug/Makefile 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/button_hotplug/Makefile 2012-11-18 14:45:26.000000000 +0000 +@@ -0,0 +1 @@ ++obj-${CONFIG_BUTTON_HOTPLUG} += button-hotplug.o +\ No newline at end of file +diff -Naur linux-3.2.33-go.orig/3rdparty/button_hotplug/button-hotplug.c 3rdparty/button_hotplug/button-hotplug.c +--- linux-3.2.33-go.orig/3rdparty/button_hotplug/button-hotplug.c 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/button_hotplug/button-hotplug.c 2012-11-18 14:45:26.000000000 +0000 +@@ -0,0 +1,349 @@ ++/* ++ * Button Hotplug driver ++ * ++ * Copyright (C) 2008-2010 Gabor Juhos ++ * ++ * Based on the diag.c - GPIO interface driver for Broadcom boards ++ * Copyright (C) 2006 Mike Baker , ++ * Copyright (C) 2006-2007 Felix Fietkau ++ * Copyright (C) 2008 Andy Boyett ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published ++ * by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#define DRV_NAME "button-hotplug" ++#define DRV_VERSION "0.4.1" ++#define DRV_DESC "Button Hotplug driver" ++ ++#define BH_SKB_SIZE 2048 ++ ++#define PFX DRV_NAME ": " ++ ++#undef BH_DEBUG ++ ++#ifdef BH_DEBUG ++#define BH_DBG(fmt, args...) printk(KERN_DEBUG "%s: " fmt, DRV_NAME, ##args ) ++#else ++#define BH_DBG(fmt, args...) do {} while (0) ++#endif ++ ++#define BH_ERR(fmt, args...) 
printk(KERN_ERR "%s: " fmt, DRV_NAME, ##args ) ++ ++#ifndef BIT_MASK ++#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) ++#endif ++ ++struct bh_priv { ++ unsigned long *seen; ++ struct input_handle handle; ++}; ++ ++struct bh_event { ++ const char *name; ++ char *action; ++ unsigned long seen; ++ ++ struct sk_buff *skb; ++ struct work_struct work; ++}; ++ ++struct bh_map { ++ unsigned int code; ++ const char *name; ++}; ++ ++extern u64 uevent_next_seqnum(void); ++ ++#define BH_MAP(_code, _name) \ ++ { \ ++ .code = (_code), \ ++ .name = (_name), \ ++ } ++ ++static struct bh_map button_map[] = { ++ BH_MAP(BTN_0, "BTN_0"), ++ BH_MAP(BTN_1, "BTN_1"), ++ BH_MAP(BTN_2, "BTN_2"), ++ BH_MAP(BTN_3, "BTN_3"), ++ BH_MAP(BTN_4, "BTN_4"), ++ BH_MAP(BTN_5, "BTN_5"), ++ BH_MAP(BTN_6, "BTN_6"), ++ BH_MAP(BTN_7, "BTN_7"), ++ BH_MAP(BTN_8, "BTN_8"), ++ BH_MAP(BTN_9, "BTN_9"), ++ BH_MAP(KEY_RESTART, "reset"), ++#ifdef KEY_WPS_BUTTON ++ BH_MAP(KEY_WPS_BUTTON, "wps"), ++#endif /* KEY_WPS_BUTTON */ ++}; ++ ++/* -------------------------------------------------------------------------*/ ++ ++static int bh_event_add_var(struct bh_event *event, int argv, ++ const char *format, ...) ++{ ++ static char buf[128]; ++ char *s; ++ va_list args; ++ int len; ++ ++ if (argv) ++ return 0; ++ ++ va_start(args, format); ++ len = vsnprintf(buf, sizeof(buf), format, args); ++ va_end(args); ++ ++ if (len >= sizeof(buf)) { ++ BH_ERR("buffer size too small\n"); ++ WARN_ON(1); ++ return -ENOMEM; ++ } ++ ++ s = skb_put(event->skb, len + 1); ++ strcpy(s, buf); ++ ++ BH_DBG("added variable '%s'\n", s); ++ ++ return 0; ++} ++ ++static int button_hotplug_fill_event(struct bh_event *event) ++{ ++ int ret; ++ ++ ret = bh_event_add_var(event, 0, "HOME=%s", "/"); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "PATH=%s", ++ "/sbin:/bin:/usr/sbin:/usr/bin"); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "SUBSYSTEM=%s", "button"); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "ACTION=%s", event->action); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "BUTTON=%s", event->name); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "SEEN=%ld", event->seen); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "SEQNUM=%llu", uevent_next_seqnum()); ++ ++ return ret; ++} ++ ++static void button_hotplug_work(struct work_struct *work) ++{ ++ struct bh_event *event = container_of(work, struct bh_event, work); ++ int ret = 0; ++ ++ event->skb = alloc_skb(BH_SKB_SIZE, GFP_KERNEL); ++ if (!event->skb) ++ goto out_free_event; ++ ++ ret = bh_event_add_var(event, 0, "%s@", event->action); ++ if (ret) ++ goto out_free_skb; ++ ++ ret = button_hotplug_fill_event(event); ++ if (ret) ++ goto out_free_skb; ++ ++ NETLINK_CB(event->skb).dst_group = 1; ++ broadcast_uevent(event->skb, 0, 1, GFP_KERNEL); ++ ++ out_free_skb: ++ if (ret) { ++ BH_ERR("work error %d\n", ret); ++ kfree_skb(event->skb); ++ } ++ out_free_event: ++ kfree(event); ++} ++ ++static int button_hotplug_create_event(const char *name, unsigned long seen, ++ int pressed) ++{ ++ struct bh_event *event; ++ ++ BH_DBG("create event, name=%s, seen=%lu, pressed=%d\n", ++ name, seen, pressed); ++ ++ event = kzalloc(sizeof(*event), GFP_KERNEL); ++ if (!event) ++ return -ENOMEM; ++ ++ event->name = name; ++ event->seen = seen; ++ event->action = pressed ? 
"pressed" : "released"; ++ ++ INIT_WORK(&event->work, (void *)(void *)button_hotplug_work); ++ schedule_work(&event->work); ++ ++ return 0; ++} ++ ++/* -------------------------------------------------------------------------*/ ++ ++#ifdef CONFIG_HOTPLUG ++static int button_get_index(unsigned int code) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(button_map); i++) ++ if (button_map[i].code == code) ++ return i; ++ ++ return -1; ++} ++static void button_hotplug_event(struct input_handle *handle, ++ unsigned int type, unsigned int code, int value) ++{ ++ struct bh_priv *priv = handle->private; ++ unsigned long seen = jiffies; ++ int btn; ++ ++ BH_DBG("event type=%u, code=%u, value=%d\n", type, code, value); ++ ++ if (type != EV_KEY) ++ return; ++ ++ btn = button_get_index(code); ++ if (btn < 0) ++ return; ++ ++ button_hotplug_create_event(button_map[btn].name, ++ (seen - priv->seen[btn]) / HZ, value); ++ priv->seen[btn] = seen; ++} ++#else ++static void button_hotplug_event(struct input_handle *handle, ++ unsigned int type, unsigned int code, int value) ++{ ++} ++#endif /* CONFIG_HOTPLUG */ ++ ++static int button_hotplug_connect(struct input_handler *handler, ++ struct input_dev *dev, const struct input_device_id *id) ++{ ++ struct bh_priv *priv; ++ int ret; ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(button_map); i++) ++ if (test_bit(button_map[i].code, dev->keybit)) ++ break; ++ ++ if (i == ARRAY_SIZE(button_map)) ++ return -ENODEV; ++ ++ priv = kzalloc(sizeof(*priv) + ++ (sizeof(unsigned long) * ARRAY_SIZE(button_map)), ++ GFP_KERNEL); ++ if (!priv) ++ return -ENOMEM; ++ ++ priv->seen = (unsigned long *) &priv[1]; ++ priv->handle.private = priv; ++ priv->handle.dev = dev; ++ priv->handle.handler = handler; ++ priv->handle.name = DRV_NAME; ++ ++ ret = input_register_handle(&priv->handle); ++ if (ret) ++ goto err_free_priv; ++ ++ ret = input_open_device(&priv->handle); ++ if (ret) ++ goto err_unregister_handle; ++ ++ BH_DBG("connected to %s\n", dev->name); ++ ++ return 0; ++ ++ err_unregister_handle: ++ input_unregister_handle(&priv->handle); ++ ++ err_free_priv: ++ kfree(priv); ++ return ret; ++} ++ ++static void button_hotplug_disconnect(struct input_handle *handle) ++{ ++ struct bh_priv *priv = handle->private; ++ ++ input_close_device(handle); ++ input_unregister_handle(handle); ++ ++ kfree(priv); ++} ++ ++static const struct input_device_id button_hotplug_ids[] = { ++ { ++ .flags = INPUT_DEVICE_ID_MATCH_EVBIT, ++ .evbit = { BIT_MASK(EV_KEY) }, ++ }, ++ { ++ /* Terminating entry */ ++ }, ++}; ++ ++MODULE_DEVICE_TABLE(input, button_hotplug_ids); ++ ++static struct input_handler button_hotplug_handler = { ++ .event = button_hotplug_event, ++ .connect = button_hotplug_connect, ++ .disconnect = button_hotplug_disconnect, ++ .name = DRV_NAME, ++ .id_table = button_hotplug_ids, ++}; ++ ++/* -------------------------------------------------------------------------*/ ++ ++static int __init button_hotplug_init(void) ++{ ++ int ret; ++ ++ printk(KERN_INFO DRV_DESC " version " DRV_VERSION "\n"); ++ ret = input_register_handler(&button_hotplug_handler); ++ if (ret) ++ BH_ERR("unable to register input handler\n"); ++ ++ return ret; ++} ++module_init(button_hotplug_init); ++ ++static void __exit button_hotplug_exit(void) ++{ ++ input_unregister_handler(&button_hotplug_handler); ++} ++module_exit(button_hotplug_exit); ++ ++MODULE_DESCRIPTION(DRV_DESC); ++MODULE_VERSION(DRV_VERSION); ++MODULE_AUTHOR("Gabor Juhos "); ++MODULE_LICENSE("GPL v2"); ++ diff --git 
a/3.3.8/3rd-3rdparty-gpio_button_hotplug-0.1.patch b/3.3.8/3rd-3rdparty-gpio_button_hotplug-0.1.patch new file mode 100644 index 0000000..6b2e78e --- /dev/null +++ b/3.3.8/3rd-3rdparty-gpio_button_hotplug-0.1.patch @@ -0,0 +1,472 @@ +Submitted By: Mario Fetka (mario dot fetka at gmail dot com) +Date: 2012-11-18 +Initial Package Version: 3.2.33 +Origin: openwrt.org packages/system/gpio-button-hotplug +Upstream Status: unknown +Description: gpio button uevent + +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_button_hotplug/Kconfig 3rdparty/gpio_button_hotplug/Kconfig +--- linux-3.2.33-go.orig/3rdparty/gpio_button_hotplug/Kconfig 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_button_hotplug/Kconfig 2012-11-18 18:41:43.048939468 +0000 +@@ -0,0 +1,2 @@ ++config GPIO_BUTTON_HOTPLUG ++ tristate "GPIO Button Hotplug driver" +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_button_hotplug/Makefile 3rdparty/gpio_button_hotplug/Makefile +--- linux-3.2.33-go.orig/3rdparty/gpio_button_hotplug/Makefile 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_button_hotplug/Makefile 2012-11-18 14:45:26.000000000 +0000 +@@ -0,0 +1 @@ ++obj-${CONFIG_GPIO_BUTTON_HOTPLUG} += gpio-button-hotplug.o +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_button_hotplug/gpio-button-hotplug.c 3rdparty/gpio_button_hotplug/gpio-button-hotplug.c +--- linux-3.2.33-go.orig/3rdparty/gpio_button_hotplug/gpio-button-hotplug.c 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_button_hotplug/gpio-button-hotplug.c 2012-11-18 14:45:26.000000000 +0000 +@@ -0,0 +1,450 @@ ++/* ++ * GPIO Button Hotplug driver ++ * ++ * Copyright (C) 2012 Felix Fietkau ++ * Copyright (C) 2008-2010 Gabor Juhos ++ * ++ * Based on the diag.c - GPIO interface driver for Broadcom boards ++ * Copyright (C) 2006 Mike Baker , ++ * Copyright (C) 2006-2007 Felix Fietkau ++ * Copyright (C) 2008 Andy Boyett ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published ++ * by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define DRV_NAME "gpio-keys-polled" ++ ++#define BH_SKB_SIZE 2048 ++ ++#define PFX DRV_NAME ": " ++ ++#undef BH_DEBUG ++ ++#ifdef BH_DEBUG ++#define BH_DBG(fmt, args...) printk(KERN_DEBUG "%s: " fmt, DRV_NAME, ##args ) ++#else ++#define BH_DBG(fmt, args...) do {} while (0) ++#endif ++ ++#define BH_ERR(fmt, args...) 
printk(KERN_ERR "%s: " fmt, DRV_NAME, ##args ) ++ ++struct bh_priv { ++ unsigned long seen; ++}; ++ ++struct bh_event { ++ const char *name; ++ char *action; ++ unsigned long seen; ++ ++ struct sk_buff *skb; ++ struct work_struct work; ++}; ++ ++struct bh_map { ++ unsigned int code; ++ const char *name; ++}; ++ ++struct gpio_keys_button_data { ++ struct delayed_work work; ++ struct bh_priv bh; ++ int last_state; ++ int count; ++ int threshold; ++ int can_sleep; ++}; ++ ++extern u64 uevent_next_seqnum(void); ++ ++#define BH_MAP(_code, _name) \ ++ { \ ++ .code = (_code), \ ++ .name = (_name), \ ++ } ++ ++static struct bh_map button_map[] = { ++ BH_MAP(BTN_0, "BTN_0"), ++ BH_MAP(BTN_1, "BTN_1"), ++ BH_MAP(BTN_2, "BTN_2"), ++ BH_MAP(BTN_3, "BTN_3"), ++ BH_MAP(BTN_4, "BTN_4"), ++ BH_MAP(BTN_5, "BTN_5"), ++ BH_MAP(BTN_6, "BTN_6"), ++ BH_MAP(BTN_7, "BTN_7"), ++ BH_MAP(BTN_8, "BTN_8"), ++ BH_MAP(BTN_9, "BTN_9"), ++ BH_MAP(KEY_RESTART, "reset"), ++#ifdef KEY_WPS_BUTTON ++ BH_MAP(KEY_WPS_BUTTON, "wps"), ++#endif /* KEY_WPS_BUTTON */ ++}; ++ ++/* -------------------------------------------------------------------------*/ ++ ++static int bh_event_add_var(struct bh_event *event, int argv, ++ const char *format, ...) ++{ ++ static char buf[128]; ++ char *s; ++ va_list args; ++ int len; ++ ++ if (argv) ++ return 0; ++ ++ va_start(args, format); ++ len = vsnprintf(buf, sizeof(buf), format, args); ++ va_end(args); ++ ++ if (len >= sizeof(buf)) { ++ BH_ERR("buffer size too small\n"); ++ WARN_ON(1); ++ return -ENOMEM; ++ } ++ ++ s = skb_put(event->skb, len + 1); ++ strcpy(s, buf); ++ ++ BH_DBG("added variable '%s'\n", s); ++ ++ return 0; ++} ++ ++static int button_hotplug_fill_event(struct bh_event *event) ++{ ++ int ret; ++ ++ ret = bh_event_add_var(event, 0, "HOME=%s", "/"); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "PATH=%s", ++ "/sbin:/bin:/usr/sbin:/usr/bin"); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "SUBSYSTEM=%s", "button"); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "ACTION=%s", event->action); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "BUTTON=%s", event->name); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "SEEN=%ld", event->seen); ++ if (ret) ++ return ret; ++ ++ ret = bh_event_add_var(event, 0, "SEQNUM=%llu", uevent_next_seqnum()); ++ ++ return ret; ++} ++ ++static void button_hotplug_work(struct work_struct *work) ++{ ++ struct bh_event *event = container_of(work, struct bh_event, work); ++ int ret = 0; ++ ++ event->skb = alloc_skb(BH_SKB_SIZE, GFP_KERNEL); ++ if (!event->skb) ++ goto out_free_event; ++ ++ ret = bh_event_add_var(event, 0, "%s@", event->action); ++ if (ret) ++ goto out_free_skb; ++ ++ ret = button_hotplug_fill_event(event); ++ if (ret) ++ goto out_free_skb; ++ ++ NETLINK_CB(event->skb).dst_group = 1; ++ broadcast_uevent(event->skb, 0, 1, GFP_KERNEL); ++ ++ out_free_skb: ++ if (ret) { ++ BH_ERR("work error %d\n", ret); ++ kfree_skb(event->skb); ++ } ++ out_free_event: ++ kfree(event); ++} ++ ++static int button_hotplug_create_event(const char *name, unsigned long seen, ++ int pressed) ++{ ++ struct bh_event *event; ++ ++ BH_DBG("create event, name=%s, seen=%lu, pressed=%d\n", ++ name, seen, pressed); ++ ++ event = kzalloc(sizeof(*event), GFP_KERNEL); ++ if (!event) ++ return -ENOMEM; ++ ++ event->name = name; ++ event->seen = seen; ++ event->action = pressed ? 
"pressed" : "released"; ++ ++ INIT_WORK(&event->work, (void *)(void *)button_hotplug_work); ++ schedule_work(&event->work); ++ ++ return 0; ++} ++ ++/* -------------------------------------------------------------------------*/ ++ ++#ifdef CONFIG_HOTPLUG ++static int button_get_index(unsigned int code) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(button_map); i++) ++ if (button_map[i].code == code) ++ return i; ++ ++ return -1; ++} ++static void button_hotplug_event(struct gpio_keys_button_data *data, ++ unsigned int type, unsigned int code, int value) ++{ ++ struct bh_priv *priv = &data->bh; ++ unsigned long seen = jiffies; ++ int btn; ++ ++ BH_DBG("event type=%u, code=%u, value=%d\n", type, code, value); ++ ++ if (type != EV_KEY) ++ return; ++ ++ btn = button_get_index(code); ++ if (btn < 0) ++ return; ++ ++ button_hotplug_create_event(button_map[btn].name, ++ (seen - priv->seen) / HZ, value); ++ priv->seen = seen; ++} ++#else ++static void button_hotplug_event(struct gpio_keys_button_data *data, ++ unsigned int type, unsigned int code, int value) ++{ ++} ++#endif /* CONFIG_HOTPLUG */ ++ ++struct gpio_keys_polled_dev { ++ struct delayed_work work; ++ ++ struct device *dev; ++ struct gpio_keys_platform_data *pdata; ++ struct gpio_keys_button_data data[0]; ++}; ++ ++static void gpio_keys_polled_check_state(struct gpio_keys_button *button, ++ struct gpio_keys_button_data *bdata) ++{ ++ int state; ++ ++ if (bdata->can_sleep) ++ state = !!gpio_get_value_cansleep(button->gpio); ++ else ++ state = !!gpio_get_value(button->gpio); ++ ++ state = !!(state ^ button->active_low); ++ if (state != bdata->last_state) { ++ unsigned int type = button->type ?: EV_KEY; ++ ++ button_hotplug_event(bdata, type, button->code, state); ++ bdata->count = 0; ++ bdata->last_state = state; ++ } ++} ++ ++static void gpio_keys_polled_queue_work(struct gpio_keys_polled_dev *bdev) ++{ ++ struct gpio_keys_platform_data *pdata = bdev->pdata; ++ unsigned long delay = msecs_to_jiffies(pdata->poll_interval); ++ ++ if (delay >= HZ) ++ delay = round_jiffies_relative(delay); ++ schedule_delayed_work(&bdev->work, delay); ++} ++ ++static void gpio_keys_polled_poll(struct work_struct *work) ++{ ++ struct gpio_keys_polled_dev *bdev = ++ container_of(work, struct gpio_keys_polled_dev, work.work); ++ struct gpio_keys_platform_data *pdata = bdev->pdata; ++ int i; ++ ++ for (i = 0; i < bdev->pdata->nbuttons; i++) { ++ struct gpio_keys_button_data *bdata = &bdev->data[i]; ++ ++ if (bdata->count < bdata->threshold) ++ bdata->count++; ++ else ++ gpio_keys_polled_check_state(&pdata->buttons[i], bdata); ++ } ++ gpio_keys_polled_queue_work(bdev); ++} ++ ++static void __devinit gpio_keys_polled_open(struct gpio_keys_polled_dev *bdev) ++{ ++ struct gpio_keys_platform_data *pdata = bdev->pdata; ++ int i; ++ ++ if (pdata->enable) ++ pdata->enable(bdev->dev); ++ ++ /* report initial state of the buttons */ ++ for (i = 0; i < pdata->nbuttons; i++) ++ gpio_keys_polled_check_state(&pdata->buttons[i], &bdev->data[i]); ++ ++ gpio_keys_polled_queue_work(bdev); ++} ++ ++static void __devexit gpio_keys_polled_close(struct gpio_keys_polled_dev *bdev) ++{ ++ struct gpio_keys_platform_data *pdata = bdev->pdata; ++ ++ cancel_delayed_work_sync(&bdev->work); ++ ++ if (pdata->disable) ++ pdata->disable(bdev->dev); ++} ++ ++static int __devinit gpio_keys_polled_probe(struct platform_device *pdev) ++{ ++ struct gpio_keys_platform_data *pdata = pdev->dev.platform_data; ++ struct device *dev = &pdev->dev; ++ struct gpio_keys_polled_dev *bdev; ++ int error; ++ int 
i; ++ ++ if (!pdata || !pdata->poll_interval) ++ return -EINVAL; ++ ++ bdev = kzalloc(sizeof(struct gpio_keys_polled_dev) + ++ pdata->nbuttons * sizeof(struct gpio_keys_button_data), ++ GFP_KERNEL); ++ if (!bdev) { ++ dev_err(dev, "no memory for private data\n"); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < pdata->nbuttons; i++) { ++ struct gpio_keys_button *button = &pdata->buttons[i]; ++ struct gpio_keys_button_data *bdata = &bdev->data[i]; ++ unsigned int gpio = button->gpio; ++ ++ if (button->wakeup) { ++ dev_err(dev, DRV_NAME " does not support wakeup\n"); ++ error = -EINVAL; ++ goto err_free_gpio; ++ } ++ ++ error = gpio_request(gpio, ++ button->desc ? button->desc : DRV_NAME); ++ if (error) { ++ dev_err(dev, "unable to claim gpio %u, err=%d\n", ++ gpio, error); ++ goto err_free_gpio; ++ } ++ ++ error = gpio_direction_input(gpio); ++ if (error) { ++ dev_err(dev, ++ "unable to set direction on gpio %u, err=%d\n", ++ gpio, error); ++ goto err_free_gpio; ++ } ++ ++ bdata->can_sleep = gpio_cansleep(gpio); ++ bdata->last_state = 0; ++ bdata->threshold = DIV_ROUND_UP(button->debounce_interval, ++ pdata->poll_interval); ++ } ++ ++ bdev->dev = &pdev->dev; ++ bdev->pdata = pdata; ++ platform_set_drvdata(pdev, bdev); ++ ++ INIT_DELAYED_WORK(&bdev->work, gpio_keys_polled_poll); ++ ++ gpio_keys_polled_open(bdev); ++ ++ return 0; ++ ++err_free_gpio: ++ while (--i >= 0) ++ gpio_free(pdata->buttons[i].gpio); ++ ++ kfree(bdev); ++ platform_set_drvdata(pdev, NULL); ++ ++ return error; ++} ++ ++static int __devexit gpio_keys_polled_remove(struct platform_device *pdev) ++{ ++ struct gpio_keys_polled_dev *bdev = platform_get_drvdata(pdev); ++ struct gpio_keys_platform_data *pdata = bdev->pdata; ++ int i = pdata->nbuttons; ++ ++ gpio_keys_polled_close(bdev); ++ ++ while (--i >= 0) ++ gpio_free(pdata->buttons[i].gpio); ++ ++ kfree(bdev); ++ platform_set_drvdata(pdev, NULL); ++ ++ return 0; ++} ++ ++static struct platform_driver gpio_keys_polled_driver = { ++ .probe = gpio_keys_polled_probe, ++ .remove = __devexit_p(gpio_keys_polled_remove), ++ .driver = { ++ .name = DRV_NAME, ++ .owner = THIS_MODULE, ++ }, ++}; ++ ++static int __init gpio_keys_polled_init(void) ++{ ++ return platform_driver_register(&gpio_keys_polled_driver); ++} ++ ++static void __exit gpio_keys_polled_exit(void) ++{ ++ platform_driver_unregister(&gpio_keys_polled_driver); ++} ++ ++module_init(gpio_keys_polled_init); ++module_exit(gpio_keys_polled_exit); ++ ++MODULE_AUTHOR("Gabor Juhos "); ++MODULE_AUTHOR("Felix Fietkau "); ++MODULE_DESCRIPTION("Polled GPIO Buttons hotplug driver"); ++MODULE_LICENSE("GPL v2"); ++MODULE_ALIAS("platform:" DRV_NAME); diff --git a/3.3.8/3rd-3rdparty-gpio_event_drv-0.1.patch b/3.3.8/3rd-3rdparty-gpio_event_drv-0.1.patch new file mode 100644 index 0000000..3a75e28 --- /dev/null +++ b/3.3.8/3rd-3rdparty-gpio_event_drv-0.1.patch @@ -0,0 +1,1354 @@ +Submitted By: Mario Fetka (mario dot fetka at gmail dot com) +Date: 2012-11-18 +Initial Package Version: 3.2.33 +Origin: http://wiki.gumstix.org/index.php?title=GPIO_Event_Driver +Upstream Status: unknown +Description: The gpio-event driver consists of a loadable kernel module, +which registers an interrupt handler, along with an example user-mode program, +which allows the settings to be manipulated and changes to be reported. 
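Note: the description above mentions an example user-mode program, but only the kernel module and its header are carried in this patch. The following is a minimal, hypothetical sketch (not part of the patch) of how such a program could use the ioctl interface declared in gpio-event-drv.h further below; the device path /dev/gpio-event, the GPIO number 17 and the 20 ms debounce value are assumptions made for illustration.

/* Hypothetical user-mode sketch for the gpio-event driver (not part of this patch). */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "gpio-event-drv.h"   /* GPIO_EVENT_IOCTL_MONITOR_GPIO, GPIO_EventMonitor_t */

int main(void)
{
    GPIO_EventMonitor_t mon;
    char line[64];
    ssize_t n;
    int fd = open("/dev/gpio-event", O_RDONLY);   /* device node created via udev (assumed path) */

    if (fd < 0) {
        perror("open /dev/gpio-event");
        return 1;
    }

    memset(&mon, 0, sizeof(mon));
    mon.gpio = 17;                      /* example GPIO number (assumption) */
    mon.onOff = 1;                      /* 1 = start monitoring */
    mon.edgeType = GPIO_EventBothEdges;
    mon.debounceMilliSec = 20;          /* example debounce time (assumption) */

    if (ioctl(fd, GPIO_EVENT_IOCTL_MONITOR_GPIO, &mon) < 0) {
        perror("GPIO_EVENT_IOCTL_MONITOR_GPIO");
        return 1;
    }

    /* the default read mode is ASCII: "nn E ssssssss.uuuuuu", one line per edge */
    while ((n = read(fd, line, sizeof(line) - 1)) > 0) {
        line[n] = '\0';
        fputs(line, stdout);
    }

    close(fd);
    return 0;
}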
+ +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_event_drv/Kconfig 3rdparty/gpio_event_drv/Kconfig +--- linux-3.2.33-go.orig/3rdparty/gpio_event_drv/Kconfig 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_event_drv/Kconfig 2012-11-18 19:03:08.020733547 +0000 +@@ -0,0 +1,2 @@ ++config GPIO_EVENT_DRV ++ tristate "GPIO Event Driver (requires userspace app)" +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_event_drv/Makefile 3rdparty/gpio_event_drv/Makefile +--- linux-3.2.33-go.orig/3rdparty/gpio_event_drv/Makefile 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_event_drv/Makefile 2012-11-18 19:02:20.409297191 +0000 +@@ -0,0 +1 @@ ++obj-${CONFIG_GPIO_EVENT_DRV} += gpio-event-drv.o +\ No newline at end of file +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_event_drv/gpio-event-drv.c 3rdparty/gpio_event_drv/gpio-event-drv.c +--- linux-3.2.33-go.orig/3rdparty/gpio_event_drv/gpio-event-drv.c 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_event_drv/gpio-event-drv.c 2012-11-18 10:24:14.000000000 +0000 +@@ -0,0 +1,1210 @@ ++/**************************************************************************** ++* ++* Copyright (c) 2006 Dave Hylands ++* ++* This program is free software; you can redistribute it and/or modify ++* it under the terms of the GNU General Public License version 2 as ++* published by the Free Software Foundation. ++* ++* Alternatively, this software may be distributed under the terms of BSD ++* license. ++* ++* See README and COPYING for more details. ++* ++**************************************************************************** ++* ++* This driver allows multiple GPIO pins to be monitored and allows a user ++* mode program to be notified when the pin changes. ++* ++****************************************************************************/ ++ ++/* ---- Include Files ---------------------------------------------------- */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++ ++#include "gpio-event-drv.h" ++ ++/* ---- Public Variables ------------------------------------------------- */ ++/* ---- Private Constants and Types -------------------------------------- */ ++ ++#define GPIO_EVENT_DEV_NAME "gpio-event" ++ ++#define DEBUG_ENABLED 1 ++ ++#if DEBUG_ENABLED ++# define DEBUG( flag, fmt, args... ) do { if ( gDebug ## flag ) printk( "%s: " fmt, __FUNCTION__ , ## args ); } while (0) ++#else ++# define DEBUG( flag, fmt, args... 
) ++#endif ++ ++/* ---- Private Variables ------------------------------------------------ */ ++ ++static char gBanner[] __initdata = KERN_INFO "GPIO Event Monitor 0.1 Compiled: " __DATE__ " at " __TIME__ "\n"; ++ ++static int gDebugTrace = 0; ++static int gDebugIoctl = 0; ++static int gDebugError = 1; ++static int gLostEvents = 0; ++ ++static struct ctl_table_header *gSysCtlHeader; ++ ++#if ( LINUX_VERSION_CODE >= KERNEL_VERSION( 2, 6, 33 )) ++#define CTL_NAME(x) ++#else ++#define CTL_NAME(x) .ctl_name = x, ++#endif ++ ++static struct ctl_table gSysCtlSample[] = ++{ ++ { ++ CTL_NAME(1) ++ .procname = "lost-events", ++ .data = &gLostEvents, ++ .maxlen = sizeof( int ), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { ++ CTL_NAME(101) ++ .procname = "debug-trace", ++ .data = &gDebugTrace, ++ .maxlen = sizeof( int ), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { ++ CTL_NAME(102) ++ .procname = "debug-ioctl", ++ .data = &gDebugIoctl, ++ .maxlen = sizeof( int ), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { ++ CTL_NAME(103) ++ .procname = "debug-error", ++ .data = &gDebugError, ++ .maxlen = sizeof( int ), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { 0 } ++}; ++ ++static struct ctl_table gSysCtl[] = ++{ ++ { ++ CTL_NAME(CTL_GPIO_EVENT) ++ .procname = "gpio-event", ++ .mode = 0555, ++ .child = gSysCtlSample ++ }, ++ { 0 } ++}; ++ ++/* ++ * An instance of GPIO_FileData_t is maintained for file open ++ */ ++ ++#define GPIO_EVENT_QUEUE_LEN 20 ++ ++// GPIO_EVENT_BUFFER_SIZE needs to be big enough to hold the ASCII version ++// of the GPIO_Event_t as well as the binary version of the GPIO_Event_t ++ ++#define GPIO_EVENT_BUFFER_SIZE 32 ++ ++typedef struct ++{ ++ struct list_head list; ++ wait_queue_head_t waitQueue; ++ ++ spinlock_t queueLock; ++ GPIO_Event_t queueData[ GPIO_EVENT_QUEUE_LEN ]; ++ volatile int getIndex; ++ volatile int putIndex; ++ volatile int numEvents; ++ ++ GPIO_EventReadMode_t readMode; ++ ++ char buffer[ GPIO_EVENT_BUFFER_SIZE ]; ++ int bufBytes; ++ ++} GPIO_FileData_t; ++ ++/* ++ * An instance of GPIO_PinData_t is maintained for each GPIO line which is ++ * monitored, ++ */ ++ ++typedef enum ++{ ++ PIN_LOW = 0, // Matches level of GPIO line ++ PIN_HIGH = 1, ++ PIN_BOUNCING_LOW, ++ PIN_BOUNCING_HIGH, ++} PinState_t; ++ ++typedef struct ++{ ++ struct list_head list; // list of all pins ++ ++ int gpio; // The gpio line being monitored ++ ++ // We maintain two lists, a global list of pins, and a list associated with each open ++ ++ ++ struct timer_list debounceTimer; // Timer to wake u up after an edge ++ uint8_t debounceMilliSec; // debounce time in milliseconds ++ char devName[ 16 ]; // gpio xx event ++ ++ GPIO_EventEdgeType_t edgeType; // Type of edge(s) we're looking for. ++ ++ PinState_t pinState; // Was the GPIO line low or high? 
++ ++} GPIO_PinData_t; ++ ++static volatile int gReportLostEvents = 1; ++ ++static struct class *gGpioEventClass = NULL; ++static struct cdev gGpioEventCDev; ++static dev_t gGpioEventDevNum = 0; ++ ++static DEFINE_SPINLOCK( gFileListLock ); ++static DEFINE_SPINLOCK( gPinListLock ); ++ ++static LIST_HEAD( gFileList ); ++static LIST_HEAD( gPinList ); ++ ++static struct proc_dir_entry *gProcGpioEvent; ++static struct proc_dir_entry *gProcPins; ++ ++ ++/* ---- Private Function Prototypes -------------------------------------- */ ++/* ---- Functions -------------------------------------------------------- */ ++ ++typedef struct ++{ ++ unsigned long flags; ++ struct list_head *list; ++ ++} pin_seq_t; ++ ++/**************************************************************************** ++* ++* pin_seq_start ++* ++* seq_file iterator which goes through the pins being monitored ++* ++****************************************************************************/ ++ ++static void *pin_seq_start( struct seq_file *s, loff_t *pos ) ++{ ++ pin_seq_t *ps; ++ loff_t i; ++ ++ s->private = NULL; ++ ++ if (( ps = kcalloc( 1, sizeof( pin_seq_t ), GFP_KERNEL )) == NULL ) ++ { ++ return ERR_PTR( -ENOMEM ); ++ } ++ s->private = ps; ++ ++ spin_lock_irqsave( &gPinListLock, ps->flags ); ++ ++ if ( list_empty( &gPinList )) ++ { ++ DEBUG( Trace, "list_empty\n" ); ++ return NULL; ++ } ++ ps->list = gPinList.next; ++ ++ for ( i = 0; i < *pos; i++ ) ++ { ++ if ( list_is_last( ps->list, &gPinList )) ++ { ++ DEBUG( Trace, "No item @ %llu\n", i + 1 ); ++ return NULL; ++ } ++ ps->list = ps->list->next; ++ } ++ ++ ++ DEBUG( Trace, "ps->list = 0x%08lx, *pos = %llu\n", (long)ps->list, *pos ); ++ ++ return ps->list; ++ ++} // pin_seq_start ++ ++/**************************************************************************** ++* ++* pin_seq_show ++* ++* seq_file iterator which goes through the pins being monitored ++* ++****************************************************************************/ ++ ++static int pin_seq_show( struct seq_file *s, void *v ) ++{ ++ GPIO_PinData_t *pin = list_entry( v, GPIO_PinData_t, list ); ++ char *edgeTypeStr; ++ ++ DEBUG( Trace, "v = 0x%08lx\n", (long)v ); ++ ++ switch ( pin->edgeType ) ++ { ++ case GPIO_EventRisingEdge: edgeTypeStr = "Rising "; break; ++ case GPIO_EventFallingEdge: edgeTypeStr = "Falling"; break; ++ case GPIO_EventBothEdges: edgeTypeStr = "Both "; break; ++ default: edgeTypeStr = "Unknown"; break; ++ } ++ ++ seq_printf( s, "GPIO: %3d Edge: %s Debounce: %d msec\n", pin->gpio, edgeTypeStr, pin->debounceMilliSec ); ++ ++ return 0; ++ ++} // pin_seq_show ++ ++/**************************************************************************** ++* ++* pin_seq_next ++* ++* seq_file iterator which goes through the pins being monitored ++* ++****************************************************************************/ ++ ++static void *pin_seq_next( struct seq_file *s, void *v, loff_t *pos ) ++{ ++ pin_seq_t *ps = s->private; ++ ++ DEBUG( Trace, "v = 0x%08lx *pos = %llu\n", (long)v, *pos ); ++ ++ if ( list_is_last( ps->list, &gPinList )) ++ { ++ DEBUG( Trace, "ps->list = 0x%08lx (end of list)\n", (long)ps->list ); ++ ++ return NULL; ++ } ++ (*pos)++; ++ ps->list = ps->list->next; ++ ++ DEBUG( Trace, "ps->list = 0x%08lx\n", (long)ps->list ); ++ ++ return ps->list; ++ ++} // pin_seq_next ++ ++/**************************************************************************** ++* ++* pin_seq_stop ++* ++* seq_file iterator which goes through the pins being monitored ++* 
++****************************************************************************/ ++ ++static void pin_seq_stop( struct seq_file *s, void *v ) ++{ ++ pin_seq_t *ps = s->private; ++ ++ DEBUG( Trace, "v = 0x%08lx\n", (long)v ); ++ ++ if ( ps != NULL ) ++ { ++ spin_unlock_irqrestore( &gPinListLock, ps->flags ); ++ kfree( ps ); ++ } ++ ++} // pin_seq_stop ++ ++/**************************************************************************** ++* ++* pin_seq_ops ++* ++* Ties all of the pin_seq_xxx routines together. ++* ++****************************************************************************/ ++ ++static struct seq_operations pin_seq_ops = ++{ ++ .start = pin_seq_start, ++ .next = pin_seq_next, ++ .stop = pin_seq_stop, ++ .show = pin_seq_show ++}; ++ ++/**************************************************************************** ++* ++* pins_proc_open ++* ++* Open method for /proc/gpio-event/pin ++* ++****************************************************************************/ ++ ++static int pins_proc_open( struct inode *inode, struct file *file ) ++{ ++ DEBUG( Trace, "called\n" ); ++ ++ return seq_open( file, &pin_seq_ops ); ++} ++ ++/**************************************************************************** ++* ++* pin_proc_ops ++* ++* File operations for our /proc/gpio-event/pins file ++* ++****************************************************************************/ ++ ++static struct file_operations pins_proc_ops = ++{ ++ .owner = THIS_MODULE, ++ .open = pins_proc_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release ++}; ++ ++ ++ ++/**************************************************************************** ++* ++* find_pin ++* ++* Searches the list to see if 'gpio' is currently being monitored. ++* ++****************************************************************************/ ++ ++static GPIO_PinData_t *find_pin( int gpio ) ++{ ++ struct list_head *pin; ++ ++ assert_spin_locked( &gPinListLock ); ++ ++ list_for_each( pin, &gPinList ) ++ { ++ GPIO_PinData_t *pinData = list_entry( pin, GPIO_PinData_t, list ); ++ ++ if ( pinData->gpio == gpio ) ++ { ++ return pinData; ++ } ++ } ++ ++ return NULL; ++ ++} // find_pin ++ ++/**************************************************************************** ++* ++* gpio_event_queue_event ++* ++* Queues an sample event from the bottom half to the top half. This ++* function queues up the event on every file that's open. ++* ++****************************************************************************/ ++ ++static void gpio_event_queue_event( const GPIO_Event_t *gpioEvent ) ++{ ++ unsigned long flags; ++ struct list_head *file; ++ ++ DEBUG( Trace, "gpio %d:%c@%ld.%06ld\n", ++ gpioEvent->gpio, ++ gpioEvent->edgeType == GPIO_EventRisingEdge ? 'R' : 'F', ++ gpioEvent->time.tv_sec, ++ gpioEvent->time.tv_usec ); ++ ++ // Queue up the event on all of the open files ++ // ++ // This function is only called from the ISR, with interrupts already ++ // disabled. 
++ ++ spin_lock_irqsave( &gFileListLock, flags ); ++ ++ list_for_each( file, &gFileList ) ++ { ++ GPIO_FileData_t *fileData = list_entry( file, GPIO_FileData_t, list ); ++ ++ spin_lock( &fileData->queueLock ); ++ { ++ if ( fileData->numEvents >= GPIO_EVENT_QUEUE_LEN ) ++ { ++ // Queue is full - Only report first event lost ++ ++ if ( gReportLostEvents ) ++ { ++ printk( KERN_ERR "GPIO Event: event lost due to queue full\n" ); ++ gReportLostEvents = 0; ++ } ++ gLostEvents++; ++ } ++ else ++ { ++ fileData->queueData[ fileData->putIndex++ ] = *gpioEvent; ++ if ( fileData->putIndex >= GPIO_EVENT_QUEUE_LEN ) ++ { ++ fileData->putIndex = 0; ++ } ++ fileData->numEvents++; ++ } ++ } ++ spin_unlock( &fileData->queueLock ); ++ ++ wake_up_interruptible( &fileData->waitQueue ); ++ } ++ spin_unlock_irqrestore( &gFileListLock, flags ); ++ ++} // gpio_event_queue_event ++ ++/**************************************************************************** ++* ++* gpio_event_dequeue_event ++* ++* Removes an event from the queue ++* ++****************************************************************************/ ++ ++static int gpio_event_dequeue_event( GPIO_FileData_t *fileData, GPIO_Event_t *gpioEvent ) ++{ ++ unsigned long flags; ++ int eventAvailable = 0; ++ ++ spin_lock_irqsave( &fileData->queueLock, flags ); ++ { ++ if ( fileData->numEvents > 0 ) ++ { ++ *gpioEvent = fileData->queueData[ fileData->getIndex++ ]; ++ if ( fileData->getIndex >= GPIO_EVENT_QUEUE_LEN ) ++ { ++ fileData->getIndex = 0; ++ } ++ fileData->numEvents--; ++ ++ eventAvailable = 1; ++ ++ if ( fileData->numEvents == 0 ) ++ { ++ // Since somebody is reading the queue now, indicate that we ++ // can report lost events again ++ ++ gReportLostEvents = 1; ++ } ++ } ++ } ++ spin_unlock_irqrestore( &fileData->queueLock, flags ); ++ ++ DEBUG( Trace, "gpio %d:%c@%ld.%06ld\n", ++ gpioEvent->gpio, ++ gpioEvent->edgeType == GPIO_EventRisingEdge ? 'R' : 'F', ++ gpioEvent->time.tv_sec, ++ gpioEvent->time.tv_usec ); ++ ++ return eventAvailable; ++ ++} // gpio_event_dequeue_event ++ ++/**************************************************************************** ++* ++* gpio_event_irq ++* ++****************************************************************************/ ++ ++static irqreturn_t gpio_event_irq( int irq, void *dev_id ) ++{ ++ GPIO_PinData_t *pinData = (GPIO_PinData_t *)dev_id; ++ GPIO_Event_t gpioEvent; ++ int currLevel = gpio_get_value( pinData->gpio ); ++ ++ // We're called with interrupts disabled. ++ ++ (void)irq; ++ ++ do_gettimeofday( &gpioEvent.time ); ++ gpioEvent.gpio = pinData->gpio; ++ ++ if ( pinData->debounceMilliSec == 0 ) ++ { ++ // We assume that this is a clean signal ++ ++ pinData->pinState = (PinState_t)currLevel; ++ ++ if ( pinData->edgeType == GPIO_EventBothEdges ) ++ { ++ // There's no register to tell which edge just occurred. So we ++ // assume that it just changed into its current level. ++ ++ if ( currLevel ) ++ { ++ // Pin is currently high, so this must be a rising edge ++ ++ gpioEvent.edgeType = GPIO_EventRisingEdge; ++ } ++ else ++ { ++ // Pin is currently low, so this must be a falling edge ++ ++ gpioEvent.edgeType = GPIO_EventFallingEdge; ++ } ++ } ++ else ++ { ++ // If we're only monitoring one type of edge, then that's the one ++ // that happened. ++ ++ gpioEvent.edgeType = pinData->edgeType; ++ } ++ gpio_event_queue_event( &gpioEvent ); ++ } ++ else ++ { ++ gpioEvent.edgeType = 0; ++ ++ // If we need to debounce, then we need to monitor both edges, and ++ // use the debounce timer to figure out the real state. 
So we don't ++ // actually know which edge we just got. We use a state machine ++ // to track things. ++ ++ switch ( pinData->pinState ) ++ { ++ case PIN_LOW: ++ { ++ pinData->pinState = PIN_BOUNCING_HIGH; ++ gpioEvent.edgeType = GPIO_EventRisingEdge; ++ break; ++ } ++ ++ case PIN_HIGH: ++ { ++ pinData->pinState = PIN_BOUNCING_LOW; ++ gpioEvent.edgeType = GPIO_EventFallingEdge; ++ break; ++ } ++ ++ default: ++ { ++ break; ++ } ++ } ++ ++ if (( pinData->edgeType & gpioEvent.edgeType ) != 0 ) ++ { ++ // This is an edge that the user is interested in - send it along. ++ ++ gpio_event_queue_event( &gpioEvent ); ++ } ++ ++ // Disable interrupts for our gpio to allow debounce to occur. The ++ // timer will re-enable the interrupt. ++ ++ disable_irq_nosync( irq ); ++ ++ // Since we have no idea when in the current jiffy that the edge ++ // occurred, we add 1 to the calculation to guarantee at least one ++ // whole jiffy. ++ ++ mod_timer( &pinData->debounceTimer, jiffies + msecs_to_jiffies( pinData->debounceMilliSec ) + 1 ); ++ } ++ ++ return IRQ_HANDLED; ++ ++} // gpio_event_irq ++ ++/**************************************************************************** ++* ++* gpio_event_timer ++* ++****************************************************************************/ ++ ++void gpio_event_timer( unsigned long data ) ++{ ++ GPIO_PinData_t *pinData = (GPIO_PinData_t *)data; ++ ++ // This function is called when the debounce timer for a gpio expires. ++ // We record the state of the pin so that we can figure out what the ++ // next edge will be. ++ ++ pinData->pinState = ( gpio_get_value( pinData->gpio ) != 0 ); ++ ++ // Turn interrupts back on so we can catch the next edge ++ ++ enable_irq( gpio_to_irq( pinData->gpio )); ++ ++} // gpio_event_timer ++ ++/**************************************************************************** ++* ++* gpio_event_monitor ++* ++****************************************************************************/ ++ ++static int gpio_event_monitor( GPIO_EventMonitor_t *monitor ) ++{ ++ int rc = 0; ++ unsigned long flags; ++ GPIO_PinData_t *pinData; ++ unsigned long irqFlags; ++ ++ spin_lock_irqsave( &gPinListLock, flags ); ++ ++ if ( monitor->onOff ) ++ { ++ // Check to make sure we aren't already monitoring the gpio ++ ++ if (( pinData = find_pin( monitor->gpio )) != NULL ) ++ { ++ // We are already monitoring the pin. Unmonitor the pin and then ++ // proceed. ++ ++ monitor->onOff = 0; ++ ++ spin_unlock_irqrestore( &gPinListLock, flags ); ++ gpio_event_monitor( monitor ); ++ spin_lock_irqsave( &gPinListLock, flags ); ++ } ++ ++ if (( pinData = kcalloc( 1, sizeof( *pinData ), GFP_KERNEL )) == NULL ) ++ { ++ DEBUG( Error, "GPIO %d: Out of memory\n", monitor->gpio ); ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ INIT_LIST_HEAD( &pinData->list ); ++ ++ snprintf( pinData->devName, sizeof( pinData->devName ), "gpio %d event", monitor->gpio ); ++ ++ // Note: ++ // Calling request_irq will automatically set the pin to be an input. ++ ++ irqFlags = 0; ++ ++ if ( monitor->debounceMilliSec == 0 ) ++ { ++ // A clean signal is being presented, so we can just look for ++ // a particular edge ++ ++ if (( monitor->edgeType & GPIO_EventRisingEdge ) != 0 ) ++ { ++ irqFlags |= IRQF_TRIGGER_RISING; ++ } ++ if (( monitor->edgeType & GPIO_EventFallingEdge ) != 0 ) ++ { ++ irqFlags |= IRQF_TRIGGER_FALLING; ++ } ++ } ++ else ++ { ++ // Since we need to debounce, we need to look for both types of ++ // edges, since we get both types of edges whenever a bounce ++ // happens. 
++ ++ irqFlags |= IRQF_TRIGGER_RISING; ++ irqFlags |= IRQF_TRIGGER_FALLING; ++ } ++ ++ if (( rc = request_irq( gpio_to_irq( monitor->gpio ), gpio_event_irq, irqFlags, pinData->devName, pinData )) != 0 ) ++ { ++ DEBUG( Error, "Unable to register irq for GPIO %d\n", monitor->gpio ); ++ kfree( pinData ); ++ goto out; ++ } ++ ++ pinData->gpio = monitor->gpio; ++ pinData->edgeType = monitor->edgeType; ++ pinData->debounceMilliSec = monitor->debounceMilliSec; ++ ++ init_timer( &pinData->debounceTimer ); ++ ++ pinData->debounceTimer.data = (unsigned long)pinData; ++ pinData->debounceTimer.function = gpio_event_timer; ++ ++ list_add_tail( &pinData->list, &gPinList ); ++ ++ if ( gpio_get_value( pinData->gpio ) == 0 ) ++ { ++ pinData->pinState = PIN_LOW; ++ } ++ else ++ { ++ pinData->pinState = PIN_HIGH; ++ } ++ } ++ else ++ { ++ if (( pinData = find_pin( monitor->gpio )) == NULL ) ++ { ++ DEBUG( Error, "GPIO %d isn't being monitored\n", monitor->gpio ); ++ rc = -ENXIO; ++ goto out; ++ } ++ ++ // We've found the gpio being monitored - turn things off. ++ ++ free_irq( gpio_to_irq( pinData->gpio ), pinData ); ++ ++ del_timer_sync( &pinData->debounceTimer ); ++ list_del( &pinData->list ); ++ ++ kfree( pinData ); ++ } ++ ++out: ++ ++ spin_unlock_irqrestore( &gPinListLock, flags ); ++ ++ return rc; ++ ++} // gpio_event_monitor ++ ++/**************************************************************************** ++* ++* gpio_event_ioctl ++* ++* Called to process ioctl requests ++* ++*****************************************************************************/ ++ ++long gpio_event_ioctl( struct file *file, unsigned int cmd, unsigned long arg ) ++{ ++ GPIO_FileData_t *fileData; ++ ++ DEBUG( Trace, "type: '%c' cmd: 0x%x\n", _IOC_TYPE( cmd ), _IOC_NR( cmd )); ++ ++ fileData = file->private_data; ++ ++ switch ( cmd ) ++ { ++ case GPIO_EVENT_IOCTL_MONITOR_GPIO: ++ { ++ GPIO_EventMonitor_t monitor; ++ ++ if ( copy_from_user( &monitor, (void *)arg, sizeof( monitor )) != 0 ) ++ { ++ return -EFAULT; ++ } ++ return gpio_event_monitor( &monitor ); ++ } ++ ++ case GPIO_EVENT_IOCTL_SET_READ_MODE: ++ { ++ fileData->readMode = (GPIO_EventReadMode_t)arg; ++ break; ++ } ++ ++ case TCGETS: ++ { ++ // When cat opens this device, we get this ioctl ++ return -ENOTTY; ++ } ++ ++ default: ++ { ++ DEBUG( Error, "Unrecognized ioctl: '0x%x'\n", cmd ); ++ return -ENOTTY; ++ } ++ } ++ ++ return 0; ++ ++} // gpio_event_ioctl ++ ++/**************************************************************************** ++* ++* gpio_event_open ++* ++****************************************************************************/ ++ ++static int gpio_event_open( struct inode *inode, struct file *file ) ++{ ++ unsigned long flags; ++ GPIO_FileData_t *fileData; ++ ++ DEBUG( Trace, "gpio_event_open called, major = %d, minor = %d\n", MAJOR( inode->i_rdev ), MINOR( inode->i_rdev )); ++ ++ // Allocate a per-open data structure ++ ++ if (( fileData = kcalloc( 1, sizeof( *fileData ), GFP_KERNEL )) == NULL ) ++ { ++ return -ENOMEM; ++ } ++ ++ INIT_LIST_HEAD( &fileData->list ); ++ ++ init_waitqueue_head( &fileData->waitQueue ); ++ ++ spin_lock_init( &fileData->queueLock ); ++ ++ fileData->getIndex = 0; ++ fileData->putIndex = 0; ++ fileData->numEvents = 0; ++ fileData->bufBytes = 0; ++ ++ fileData->readMode = GPIO_EventReadModeAscii; ++ ++ file->private_data = fileData; ++ ++ spin_lock_irqsave( &gFileListLock, flags ); ++ { ++ list_add_tail( &fileData->list, &gFileList ); ++ } ++ spin_unlock_irqrestore( &gFileListLock, flags ); ++ ++ return 0; ++ ++} // 
gpio_event_open ++ ++/**************************************************************************** ++* ++* gpio_event_read ++* ++****************************************************************************/ ++ ++static ssize_t gpio_event_read( struct file *file, char *buffer, size_t spaceRemaining, loff_t *ppos ) ++{ ++ int rc; ++ ssize_t bytesCopied = 0; ++ ssize_t bytesToCopy; ++ GPIO_FileData_t *fileData = file->private_data; ++ ++ DEBUG( Trace, "gpio_event_read called, major = %d, minor = %d\n", MAJOR( file->f_dentry->d_inode->i_rdev ), MINOR( file->f_dentry->d_inode->i_rdev )); ++ ++ if ( spaceRemaining == 0 ) ++ { ++ return 0; ++ } ++ ++ // First of all, return any unread data from the previous call ++ ++ if ( fileData->bufBytes > 0 ) ++ { ++ if ( spaceRemaining < fileData->bufBytes ) ++ { ++ bytesCopied = spaceRemaining; ++ } ++ else ++ { ++ bytesCopied = fileData->bufBytes; ++ } ++ ++ if ( copy_to_user( &buffer[0], &fileData->buffer[0], bytesCopied ) != 0 ) ++ { ++ return -EFAULT; ++ } ++ if ( fileData->bufBytes > bytesCopied ) ++ { ++ memmove( &fileData->buffer[ 0 ], &fileData->buffer[ bytesCopied ], fileData->bufBytes - bytesCopied ); ++ } ++ fileData->bufBytes -= bytesCopied; ++ ++ if ( fileData->bufBytes > 0 ) ++ { ++ // We copied some data, but not all of it. Return early. ++ ++ return bytesCopied; ++ } ++ } ++ ++ do ++ { ++ if ((( file->f_flags & O_NONBLOCK ) != 0 ) && ( fileData->numEvents == 0 )) ++ { ++ // File was opened non-blocking and no more data is available ++ // We don't want to wait for an event, so exit from the loop ++ ++ break; ++ } ++ ++ rc = wait_event_interruptible( fileData->waitQueue, ( fileData->numEvents > 0 )); ++ if ( rc != 0 ) ++ { ++ return rc; ++ } ++ ++ if ( fileData->readMode == GPIO_EventReadModeBinary ) ++ { ++ gpio_event_dequeue_event( fileData, (GPIO_Event_t *)&fileData->buffer[0] ); ++ ++ fileData->bufBytes = sizeof( GPIO_Event_t ); ++ ++ } ++ else ++ { ++ GPIO_Event_t gpioEvent; ++ ++ gpio_event_dequeue_event( fileData, &gpioEvent ); ++ ++ // ASCII Mode output: ++ // ++ // nn E tttttttt.tttttt ++ // ++ // Where nn is the base-10 GPIO number ++ // E is R or F (for rising or falling edge) ++ // tttttttt.tttttt is the timestamp with microsecond resolution ++ ++ fileData->bufBytes = snprintf( fileData->buffer, sizeof( fileData->buffer ), ++ "%2d %c %ld.%06ld\n", ++ gpioEvent.gpio, ++ (( gpioEvent.edgeType == GPIO_EventRisingEdge ) ? 'R' : 'F' ), ++ gpioEvent.time.tv_sec, ++ gpioEvent.time.tv_usec ); ++ } ++ ++ if ( spaceRemaining >= fileData->bufBytes ) ++ { ++ bytesToCopy = fileData->bufBytes; ++ } ++ else ++ { ++ bytesToCopy = spaceRemaining; ++ } ++ ++ if ( copy_to_user( &buffer[ bytesCopied ], &fileData->buffer[0], bytesToCopy ) != 0 ) ++ { ++ return -EFAULT; ++ } ++ spaceRemaining -= bytesToCopy; ++ bytesCopied += bytesToCopy; ++ fileData->bufBytes -= bytesToCopy; ++ ++ if ( fileData->bufBytes > 0 ) ++ { ++ // We couldn't copy all of the data out of the buffer. Move the ++ // remaining data to the beginning of the buffer and exit. ++ ++ memmove( &fileData->buffer[ 0 ], &fileData->buffer[ bytesToCopy ], fileData->bufBytes ); ++ return bytesCopied; ++ } ++ } while (( fileData->numEvents > 0 ) && ( spaceRemaining > 0 )); ++ ++ if ((( file->f_flags & O_NONBLOCK ) != 0 ) && ( bytesCopied == 0 )) ++ { ++ // File was opened non-blocking and we didn't copy any data. 
++ ++ return -EAGAIN; ++ } ++ ++ return bytesCopied; ++ ++} // gpio_event_read ++ ++/**************************************************************************** ++* ++* gpio_event_poll - used by select & poll ++* ++****************************************************************************/ ++ ++static unsigned int gpio_event_poll(struct file *file, poll_table *wait) ++{ ++ unsigned long flags; ++ GPIO_FileData_t *fileData = file->private_data; ++ unsigned int mask = 0; ++ ++ poll_wait( file, &fileData->waitQueue, wait ); ++ ++ spin_lock_irqsave( &fileData->queueLock, flags ); ++ { ++ if (( fileData->bufBytes > 0 ) || ( fileData->numEvents > 0 )) ++ { ++ mask |= POLLIN | POLLRDNORM; // readable ++ } ++ } ++ spin_unlock_irqrestore( &fileData->queueLock, flags ); ++ ++ return mask; ++ ++} // gpio_event_poll ++ ++/**************************************************************************** ++* ++* gpio_event_release ++* ++****************************************************************************/ ++ ++static int gpio_event_release( struct inode *inode, struct file *file ) ++{ ++ unsigned long flags; ++ GPIO_FileData_t *fileData = file->private_data; ++ ++ DEBUG( Trace, "gpio_event_release called\n" ); ++ ++ spin_lock_irqsave( &gFileListLock, flags ); ++ { ++ list_del( &fileData->list ); ++ } ++ spin_unlock_irqrestore( &gFileListLock, flags ); ++ ++ kfree( fileData ); ++ ++ return 0; ++ ++} // gpio_event_release ++ ++/**************************************************************************** ++* ++* File Operations (these are the device driver entry points) ++* ++****************************************************************************/ ++ ++struct file_operations gpio_event_fops = ++{ ++ owner: THIS_MODULE, ++ unlocked_ioctl: gpio_event_ioctl, ++ open: gpio_event_open, ++ poll: gpio_event_poll, ++ release: gpio_event_release, ++ read: gpio_event_read, ++}; ++ ++/**************************************************************************** ++* ++* gpio_event_init ++* ++* Called to perform module initialization when the module is loaded ++* ++****************************************************************************/ ++ ++static int __init gpio_event_init( void ) ++{ ++ int rc; ++ ++ DEBUG( Trace, "called\n" ); ++ ++ printk( gBanner ); ++ ++ // Get a major number ++ ++ if (( rc = alloc_chrdev_region( &gGpioEventDevNum, 0, 1, GPIO_EVENT_DEV_NAME )) < 0 ) ++ { ++ printk( KERN_WARNING "sample: Unable to allocate major, err: %d\n", rc ); ++ return rc; ++ } ++ DEBUG( Trace, "allocated major:%d minor:%d\n", MAJOR( gGpioEventDevNum ), MINOR( gGpioEventDevNum )); ++ ++ // Register our proc entries. ++ ++ gProcGpioEvent = create_proc_entry( "gpio-event", S_IFDIR | S_IRUGO | S_IXUGO, NULL ); ++ if ( gProcGpioEvent == NULL ) ++ { ++ return -ENOMEM; ++ } ++ gProcPins = create_proc_entry( "pins", 0444, gProcGpioEvent ); ++ if ( gProcPins != NULL ) ++ { ++ gProcPins->proc_fops = &pins_proc_ops; ++ } ++ ++#if ( LINUX_VERSION_CODE <= KERNEL_VERSION( 2, 6, 20 )) ++ gSysCtlHeader = register_sysctl_table( gSysCtl, 0 ); ++ if ( gSysCtlHeader != NULL ) ++ { ++ gSysCtlHeader->ctl_table->child->de->owner = THIS_MODULE; ++ } ++#else ++ gSysCtlHeader = register_sysctl_table( gSysCtl ); ++#endif ++ ++ // Register our device. The device becomes "active" as soon as cdev_add ++ // is called. 
++ ++ cdev_init( &gGpioEventCDev, &gpio_event_fops ); ++ gGpioEventCDev.owner = THIS_MODULE; ++ ++ if (( rc = cdev_add( &gGpioEventCDev, gGpioEventDevNum, 1 )) != 0 ) ++ { ++ printk( KERN_WARNING "sample: cdev_add failed: %d\n", rc ); ++ return rc; ++ } ++ ++ // Create a class, so that udev will make the /dev entry ++ ++ gGpioEventClass = class_create( THIS_MODULE, GPIO_EVENT_DEV_NAME ); ++ if ( IS_ERR( gGpioEventClass )) ++ { ++ printk( KERN_WARNING "sample: Unable to create class\n" ); ++ return -1; ++ } ++ ++ device_create( gGpioEventClass, NULL, gGpioEventDevNum, NULL, GPIO_EVENT_DEV_NAME ); ++ ++ return 0; ++ ++} // gpio_event_init ++ ++/**************************************************************************** ++* ++* gpio_event_exit ++* ++* Called to perform module cleanup when the module is unloaded. ++* ++****************************************************************************/ ++ ++static void __exit gpio_event_exit( void ) ++{ ++ struct list_head *next; ++ struct list_head *pin; ++ GPIO_EventMonitor_t monitor; ++ ++ DEBUG( Trace, "called\n" ); ++ ++ // If there are any pins which are currently being monitored, then we ++ // need to unmonitor them. ++ ++ memset( &monitor, 0, sizeof( monitor )); ++ ++ list_for_each_safe( pin, next, &gPinList ) ++ { ++ GPIO_PinData_t *pinData = list_entry( pin, GPIO_PinData_t, list ); ++ ++ monitor.gpio = pinData->gpio; ++ ++ gpio_event_monitor( &monitor ); ++ } ++ ++ // Deregister our driver ++ ++ device_destroy( gGpioEventClass, gGpioEventDevNum ); ++ class_destroy( gGpioEventClass ); ++ ++ cdev_del( &gGpioEventCDev ); ++ ++ if ( gSysCtlHeader != NULL ) ++ { ++ unregister_sysctl_table( gSysCtlHeader ); ++ } ++ remove_proc_entry( "pins", gProcGpioEvent ); ++ remove_proc_entry( "gpio-event", NULL ); ++ ++ unregister_chrdev_region( gGpioEventDevNum, 1 ); ++ ++} // gpio_event_exit ++ ++/****************************************************************************/ ++ ++module_init(gpio_event_init); ++module_exit(gpio_event_exit); ++ ++MODULE_AUTHOR("Dave Hylands"); ++MODULE_DESCRIPTION("GPIO Event Driver"); ++MODULE_LICENSE("Dual BSD/GPL"); ++ +diff -Naur linux-3.2.33-go.orig/3rdparty/gpio_event_drv/gpio-event-drv.h 3rdparty/gpio_event_drv/gpio-event-drv.h +--- linux-3.2.33-go.orig/3rdparty/gpio_event_drv/gpio-event-drv.h 1970-01-01 00:00:00.000000000 +0000 ++++ 3rdparty/gpio_event_drv/gpio-event-drv.h 2012-11-18 10:24:14.000000000 +0000 +@@ -0,0 +1,115 @@ ++/**************************************************************************** ++* ++* Copyright (c) 2006 Dave Hylands ++* ++* This program is free software; you can redistribute it and/or modify ++* it under the terms of the GNU General Public License version 2 as ++* published by the Free Software Foundation. ++* ++* Alternatively, this software may be distributed under the terms of BSD ++* license. ++* ++* See README and COPYING for more details. ++* ++**************************************************************************** ++* ++* This driver allows multiple GPIO pins to be monitored and allows a user ++* mode program to be notified when the pin changes. 
++* ++****************************************************************************/ ++ ++#if !defined( GPIO_EVENT_DRV_H ) ++#define GPIO_EVENT_DRV_H ++ ++/* ---- Include Files ----------------------------------------------------- */ ++ ++#if defined( __KERNEL__ ) ++# include ++# include ++# include ++#else ++# include ++# include ++# include ++#endif ++ ++ ++/* ---- Constants and Types ----------------------------------------------- */ ++ ++// The ioctl "magic" is just some character value which is used to help ++// detect when incorrect ioctl values are sent down to a driver. ++ ++#define GPIO_EVENT_IOCTL_MAGIC 'G' ++ ++/** ++ * Deefines for each of the ioctl commands. Note that since we want to reduce ++ * the possibility that a user mode program gets out of sync with a given ++ * driver, we explicitly assign a value to each enumeration. This makes ++ * it more difficult to stick new ioctl's in the middle of the list. ++ */ ++ ++typedef enum ++{ ++ GPIO_EVENT_CMD_FIRST = 0x80, ++ ++ GPIO_EVENT_CMD_MONITOR_GPIO = 0x80, ++ GPIO_EVENT_CMD_SET_READ_MODE = 0x81, ++ ++ /* Insert new ioctls here */ ++ ++ GPIO_EVENT_CMD_LAST, ++ ++} GPIO_EVENT_CMD; ++ ++typedef enum ++{ ++ GPIO_EventRisingEdge = 0x01, ++ GPIO_EventFallingEdge = 0x02, ++ GPIO_EventBothEdges = GPIO_EventRisingEdge | GPIO_EventFallingEdge, ++ ++} GPIO_EventEdgeType_t; ++ ++typedef struct ++{ ++ uint8_t gpio; // gpio to monitor ++ uint8_t onOff; // 0 = stop monitoring, 1 = start monitoring ++ GPIO_EventEdgeType_t edgeType; // Monitor rising/falling/both edges? ++ uint8_t debounceMilliSec; // debounce time in milliseconds ++ ++} GPIO_EventMonitor_t; ++ ++typedef enum ++{ ++ GPIO_EventReadModeAscii = 0x00, // Reads return ASCII data (default) ++ GPIO_EventReadModeBinary = 0x01, // Reads return Binary data ++ ++} GPIO_EventReadMode_t; ++ ++/* ++ * Definitions for the actual ioctl commands ++ */ ++ ++#define GPIO_EVENT_IOCTL_MONITOR_GPIO _IOW( GPIO_EVENT_IOCTL_MAGIC, GPIO_EVENT_CMD_MONITOR_GPIO, GPIO_EventMonitor_t ) // arg is GPIO_EventMonitor * ++#define GPIO_EVENT_IOCTL_SET_READ_MODE _IO( GPIO_EVENT_IOCTL_MAGIC, GPIO_EVENT_CMD_SET_READ_MODE ) // arg is int ++ ++/* ++ * Definitions for sysctl. The top level define has to be unique system wide. 
++ * The kernel defines values 1 thru about 10 (see include/linunx/sysctl.h) ++ */ ++ ++#define CTL_GPIO_EVENT 0x47504576 // 'GPEv' in hex form ++ ++/* ++ * Reads return GPIO_Event_t structures ++ */ ++ ++typedef struct ++{ ++ uint8_t gpio; // GPIO that this event is for ++ GPIO_EventEdgeType_t edgeType; // Type of edge detected ++ struct timeval time; // Time the event occurred ++ ++} GPIO_Event_t; ++ ++#endif // GPIO_EVENT_DRV_H ++ diff --git a/3.3.8/3rd-3rdparty-merge.patch b/3.3.8/3rd-3rdparty-merge.patch new file mode 100644 index 0000000..dff4679 --- /dev/null +++ b/3.3.8/3rd-3rdparty-merge.patch @@ -0,0 +1,156 @@ +diff -uNr linux-3.2.33-go.orig/arch/alpha/Kconfig linux-3.2.33-go/arch/alpha/Kconfig +--- linux-3.2.33-go.orig/arch/alpha/Kconfig 2012-11-15 22:08:02.768806792 +0100 ++++ linux-3.2.33-go/arch/alpha/Kconfig 2012-11-15 22:08:29.937483632 +0100 +@@ -673,3 +673,4 @@ + + source "lib/Kconfig" + ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/arch/arm/Kconfig linux-3.2.33-go/arch/arm/Kconfig +--- linux-3.2.33-go.orig/arch/arm/Kconfig 2012-11-15 22:07:59.952839378 +0100 ++++ linux-3.2.33-go/arch/arm/Kconfig 2012-11-15 22:14:01.950566716 +0100 +@@ -2259,3 +2259,5 @@ + source "crypto/Kconfig" + + source "lib/Kconfig" ++ ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/arch/ia64/Kconfig linux-3.2.33-go/arch/ia64/Kconfig +--- linux-3.2.33-go.orig/arch/ia64/Kconfig 2012-11-15 22:08:00.893828523 +0100 ++++ linux-3.2.33-go/arch/ia64/Kconfig 2012-11-15 22:08:29.938483621 +0100 +@@ -669,3 +669,5 @@ + + config IOMMU_HELPER + def_bool (IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB || IA64_GENERIC || SWIOTLB) ++ ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/arch/mips/Kconfig linux-3.2.33-go/arch/mips/Kconfig +--- linux-3.2.33-go.orig/arch/mips/Kconfig 2012-11-15 22:08:02.698807597 +0100 ++++ linux-3.2.33-go/arch/mips/Kconfig 2012-11-15 22:08:29.939483610 +0100 +@@ -2485,3 +2485,5 @@ + source "crypto/Kconfig" + + source "lib/Kconfig" ++ ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/arch/powerpc/Kconfig linux-3.2.33-go/arch/powerpc/Kconfig +--- linux-3.2.33-go.orig/arch/powerpc/Kconfig 2012-11-15 22:08:01.893816938 +0100 ++++ linux-3.2.33-go/arch/powerpc/Kconfig 2012-11-15 22:08:29.940483598 +0100 +@@ -980,3 +980,5 @@ + bool + + source "arch/powerpc/kvm/Kconfig" ++ ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/arch/sparc/Kconfig linux-3.2.33-go/arch/sparc/Kconfig +--- linux-3.2.33-go.orig/arch/sparc/Kconfig 2012-11-15 22:08:00.130837331 +0100 ++++ linux-3.2.33-go/arch/sparc/Kconfig 2012-11-15 22:08:29.941483586 +0100 +@@ -605,3 +605,5 @@ + source "crypto/Kconfig" + + source "lib/Kconfig" ++ ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/arch/x86/Kconfig linux-3.2.33-go/arch/x86/Kconfig +--- linux-3.2.33-go.orig/arch/x86/Kconfig 2012-11-15 22:08:00.435833823 +0100 ++++ linux-3.2.33-go/arch/x86/Kconfig 2012-11-15 22:08:29.945483540 +0100 +@@ -2179,3 +2179,5 @@ + source "arch/x86/kvm/Kconfig" + + source "lib/Kconfig" ++ ++source "3rdparty/Kconfig" +diff -uNr linux-3.2.33-go.orig/Makefile linux-3.2.33-go/Makefile +--- linux-3.2.33-go.orig/Makefile 2012-11-15 22:08:03.435799123 +0100 ++++ linux-3.2.33-go/Makefile 2012-11-15 22:08:29.946483529 +0100 +@@ -507,7 +507,7 @@ + + # Objects we will link into vmlinux / subdirs we need to visit + init-y := init/ +-drivers-y := drivers/ sound/ firmware/ ++drivers-y := drivers/ sound/ firmware/ 3rdparty/ + net-y := net/ + libs-y := lib/ + core-y := usr/ +diff -uNr 
linux-3.2.33-go.orig/scripts/kconfig/Makefile linux-3.2.33-go/scripts/kconfig/Makefile +--- linux-3.2.33-go.orig/scripts/kconfig/Makefile 2012-11-15 22:07:58.064861094 +0100 ++++ linux-3.2.33-go/scripts/kconfig/Makefile 2012-11-15 22:08:55.603180188 +0100 +@@ -11,29 +11,29 @@ + Kconfig := Kconfig + endif + +-xconfig: $(obj)/qconf ++xconfig: $(obj)/qconf 3rdparty/Makefile + $< $(Kconfig) + +-gconfig: $(obj)/gconf ++gconfig: $(obj)/gconf 3rdparty/Makefile + $< $(Kconfig) + +-menuconfig: $(obj)/mconf ++menuconfig: $(obj)/mconf 3rdparty/Makefile + $< $(Kconfig) + +-config: $(obj)/conf ++config: $(obj)/conf 3rdparty/Makefile + $< --oldaskconfig $(Kconfig) + +-nconfig: $(obj)/nconf ++nconfig: $(obj)/nconf 3rdparty/Makefile + $< $(Kconfig) + +-oldconfig: $(obj)/conf ++oldconfig: $(obj)/conf 3rdparty/Makefile + $< --$@ $(Kconfig) + +-silentoldconfig: $(obj)/conf ++silentoldconfig: $(obj)/conf 3rdparty/Makefile + $(Q)mkdir -p include/generated + $< --$@ $(Kconfig) + +-localyesconfig localmodconfig: $(obj)/streamline_config.pl $(obj)/conf ++localyesconfig localmodconfig: $(obj)/streamline_config.pl $(obj)/conf 3rdparty/Makefile + $(Q)mkdir -p include/generated + $(Q)perl $< --$@ $(srctree) $(Kconfig) > .tmp.config + $(Q)if [ -f .config ]; then \ +@@ -90,18 +90,18 @@ + *) cat $(CLONECONFIG) > .config.running ;; \ + esac && \ + echo -e "Cloning configuration file $(CLONECONFIG)\n" +- $(Q)$< --defconfig=.config.running arch/$(SRCARCH)/Kconfig ++ $(Q)$< --defconfig=.config.running arch/$(SRCARCH)/Kconfig 3rdparty/Makefile + + + PHONY += listnewconfig oldnoconfig savedefconfig defconfig + +-listnewconfig oldnoconfig: $(obj)/conf ++listnewconfig oldnoconfig: $(obj)/conf 3rdparty/Makefile + $< --$@ $(Kconfig) + +-savedefconfig: $(obj)/conf ++savedefconfig: $(obj)/conf 3rdparty/Makefile + $< --$@=defconfig $(Kconfig) + +-defconfig: $(obj)/conf ++defconfig: $(obj)/conf 3rdparty/Makefile + ifeq ($(KBUILD_DEFCONFIG),) + $< --defconfig $(Kconfig) + else +@@ -109,7 +109,7 @@ + $(Q)$< --defconfig=arch/$(SRCARCH)/configs/$(KBUILD_DEFCONFIG) $(Kconfig) + endif + +-%_defconfig: $(obj)/conf ++%_defconfig: $(obj)/conf 3rdparty/Makefile + $(Q)$< --defconfig=arch/$(SRCARCH)/configs/$@ $(Kconfig) + + # Help text used by make help +@@ -186,6 +186,8 @@ + gconf-target := 1 + endif + ++3rdparty/Makefile: ++ pushd $(srctree)/3rdparty ; $(PERL) ./mkbuild.pl ; popd + + ifeq ($(qconf-target),1) + hostprogs-y += qconf diff --git a/3.3.8/3rd-3rdparty-netatop-0.1.1.patch b/3.3.8/3rd-3rdparty-netatop-0.1.1.patch new file mode 100644 index 0000000..a06a77d --- /dev/null +++ b/3.3.8/3rd-3rdparty-netatop-0.1.1.patch @@ -0,0 +1,1769 @@ +diff -uNr linux-3.2.33-go.orig/3rdparty/netatop/Kconfig 3rdparty/netatop/Kconfig +--- linux-3.2.33-go.orig/3rdparty/netatop/Kconfig 1970-01-01 01:00:00.000000000 +0100 ++++ 3rdparty/netatop/Kconfig 2012-11-15 22:48:00.753390796 +0100 +@@ -0,0 +1,8 @@ ++config NETATOP ++ tristate "Netatop kernel module" ++ help ++ The optional kernel module netatop can be loaded to gather statistics ++ about the TCP and UDP packets that have been transmitted/received ++ per process and per thread ++ ++ If unsure, see you again in six months. +diff -uNr linux-3.2.33-go.orig/3rdparty/netatop/Makefile 3rdparty/netatop/Makefile +--- linux-3.2.33-go.orig/3rdparty/netatop/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ 3rdparty/netatop/Makefile 2012-11-15 22:50:01.332957868 +0100 +@@ -0,0 +1,5 @@ ++# ++# THIS IS AN AUTOMATICALLY GENERATED FILE. DO NOT EDIT. 
++# ++ ++obj-$(CONFIG_NETATOP) += netatop.o +diff -uNr linux-3.2.33-go.orig/3rdparty/netatop/netatop.c 3rdparty/netatop/netatop.c +--- linux-3.2.33-go.orig/3rdparty/netatop/netatop.c 1970-01-01 01:00:00.000000000 +0100 ++++ 3rdparty/netatop/netatop.c 2012-11-15 22:57:52.989419565 +0100 +@@ -0,0 +1,1687 @@ ++/* ++** This module uses the netfilter interface to maintain statistics ++** about the network traffic per task, on level of thread group ++** and individual thread. ++** ++** General setup ++** ------------- ++** Once the module is active, it is called for every packet that is ++** transmitted by a local process and every packet that is received ++** from an interface. Not only the packets that contain the user data ++** are passed but also the TCP related protocol packets (SYN, ACK, ...). ++** ++** When the module discovers a packet for a connection (TCP) or local ++** port (UDP) that is new, it creates a sockinfo structure. As soon as ++** possible the sockinfo struct will be connected to a taskinfo struct ++** that represents the proces or thread that is related to the socket. ++** However, the task can only be determined when a packet is transmitted, ++** i.e. the module is called during system call handling in the context ++** of the transmitting process. At that moment the tgid (process) and ++** pid (thread) can be obtained from the process administration to ++** be stored in the module's own taskinfo structs (one for the process, ++** one for the thread). ++** For the time that the sockinfo struct can not be related to a taskinfo ++** struct (e.g. when only packets are received), counters are maintained ++** temporarily in the sockinfo struct. As soon as a related taskinfo struct ++** is discovered when the task transmits, counters will be maintained in ++** the taskinfo struct itself. ++** When packets are only received for a socket (e.g. another machine is ++** sending UDP packets to the local machine) while the local task ++** never responds, no match to a process can be made and the packets ++** remain unidentified by the netatop module. At least one packet should ++** have been sent by a local process to be able to match packets for such ++** socket. ++** In the file /proc/netatop counters can be found that show the total ++** number of packets sent/received and how many of these packets were ++** unidentified (i.e. not accounted to a process/thread). ++** ++** Garbage collection ++** ------------------ ++** The module uses a garbage collector to cleanup the unused sockinfo ++** structs if connections do not exist any more (TCP) or have not been ++** used for some time (TCP/UDP). ++** Furthermore, the garbage collector checks if the taskinfo structs ++** still represent existing processes or threads. If not, the taskinfo struct ++** is destroyed (in case of a thread) or it is moved to a separate list of ++** finished processes (in case of a process). Analysis programs can read ++** the taskinfo of such finished process. When the taskinfo of a finished ++** process is not read within 15 seconds, the taskinfo will be destroyed. ++** ++** A garbage collector cycle can be triggered by issueing a getsockopt ++** call from an analysis program (e.g. atop). Apart from that, a time-based ++** garbage collector cycle is issued anyhow every 15 seconds. ++** ++** Interface with user mode ++** ------------------------ ++** Programs can open an IP socket and use the getsockopt() system call ++** to issue commands to this module. 
With the command ATOP_GETCNT_TGID ++** the current counters can be obtained on process level (thread group) ++** and with the command ATOP_GETCNT_PID the counters on thread level. ++** For both commands, the tgid/pid has to be passed of the required thread ++** (group). When the required thread (group) does not exist, an errno ESRCH ++** is given. ++** ++** The command ATOP_GETCNT_EXIT can be issued to obtain the counters of ++** an exited process. As stated above, such command has to be issued ++** within 15 seconds after a process has been declared 'finished' by ++** the garbage collector. Whenever this command is issued and no exited ++** process is in the exitlist, the requesting process is blocked until ++** an exited process is available. ++** ++** The command NETATOP_FORCE_GC activates the garbage collector of the ++** netatop module to determine if sockinfo's of old connections/ports ++** can be destroyed and if taskinfo's of exited processes can be ++** The command NETATOP_EMPTY_EXIT can be issued to wait until the exitlist ++** with the taskinfo's of exited processes is empty. ++** ---------------------------------------------------------------------- ++** Copyright (C) 2012 Gerlof Langeveld (gerlof.langeveld@atoptool.nl) ++** ++** This program is free software; you can redistribute it and/or modify ++** it under the terms of the GNU General Public License version 2 as ++** published by the Free Software Foundation. ++*/ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "netatop.h" ++#include "netatopversion.h" ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Gerlof Langeveld "); ++MODULE_DESCRIPTION("Per-task network statistics"); ++MODULE_VERSION(NETATOPVERSION); ++ ++#define GCINTERVAL (HZ*15) // interval garbage collector (jiffies) ++#define GCMAXUDP (HZ*16) // max inactivity for UDP (jiffies) ++#define GCMAXTCP (HZ*1800) // max inactivity for TCP (jiffies) ++#define GCMAXUNREF (HZ*60) // max time without taskref (jiffies) ++ ++#define SILIMIT (2048*1024) // maximum memory for sockinfo structs ++#define TILIMIT (1024*1024) // maximum memory for taskinfo structs ++ ++#define NF_IP_PRE_ROUTING 0 ++#define NF_IP_LOCAL_IN 1 ++#define NF_IP_FORWARD 2 ++#define NF_IP_LOCAL_OUT 3 ++#define NF_IP_POST_ROUTING 4 ++ ++/* ++** struct that maintains statistics about the network ++** traffic caused per thread or thread group ++*/ ++struct chainer { ++ void *next; ++ void *prev; ++}; ++ ++struct taskinfobucket; ++ ++struct taskinfo { ++ struct chainer ch; ++ ++ pid_t id; // tgid or pid ++ char type; // 'g' (thread group) or ++ // 't' (thread) ++ unsigned char state; // see below ++ char command[COMLEN]; ++ unsigned long btime; // start time of process ++ unsigned long long exittime; // time inserted in exitlist ++ ++ struct taskcount tc; ++}; ++ ++// state values above ++#define CHECKED 1 // verified that task still exists ++#define INDELETE 2 // task exited but still in hash list ++#define FINISHED 3 // task on exit list ++ ++/* ++** hash tables to find a particular thread group or thread ++*/ ++#define TBUCKS 1024 // must be multiple of 2! 
++#define THASH(x, t) (((x)+t)&(TBUCKS-1)) ++ ++struct taskinfobucket { ++ struct chainer ch; ++ spinlock_t lock; ++} thash[TBUCKS]; ++ ++static unsigned long nrt; // current number of taskinfo allocated ++static unsigned long nrt_ovf; // no taskinfo allocated due to overflow ++static DEFINE_SPINLOCK(nrtlock); ++ ++ ++static struct taskinfo *exithead; // linked list of exited processes ++static struct taskinfo *exittail; ++static DEFINE_SPINLOCK(exitlock); ++ ++static DECLARE_WAIT_QUEUE_HEAD(exitlist_filled); ++static DECLARE_WAIT_QUEUE_HEAD(exitlist_empty); ++ ++static unsigned long nre; // current number of taskinfo on exitlist ++ ++/* ++** structs that uniquely identify a TCP connection (host endian format) ++*/ ++struct tcpv4_ident { ++ uint32_t laddr; /* local IP address */ ++ uint32_t raddr; /* remote IP address */ ++ uint16_t lport; /* local port number */ ++ uint16_t rport; /* remote port number */ ++}; ++ ++struct tcpv6_ident { ++ struct in6_addr laddr; /* local IP address */ ++ struct in6_addr raddr; /* remote IP address */ ++ uint16_t lport; /* local port number */ ++ uint16_t rport; /* remote port number */ ++}; ++ ++/* ++** struct to maintain the reference from a socket ++** to a thread and thread-group ++*/ ++struct sockinfo { ++ struct chainer ch; ++ ++ unsigned char last_state; // last known state of socket ++ uint8_t proto; // protocol ++ ++ union keydef { ++ uint16_t udp; // UDP ident (only portnumber) ++ struct tcpv4_ident tcp4; // TCP connection ident IPv4 ++ struct tcpv6_ident tcp6; // TCP connection ident IPv6 ++ } key; ++ ++ struct taskinfo *tgp; // ref to thread group ++ struct taskinfo *thp; // ref to thread (or NULL) ++ ++ short tgh; // hash number of thread group ++ short thh; // hash number of thread ++ ++ unsigned long sndpacks; // temporary counters in case ++ unsigned long sndbytes; // no relation to process is ++ unsigned long rcvpacks; // known yet ++ unsigned long rcvbytes; ++ ++ unsigned long long lastact; // last updated (jiffies) ++}; ++ ++/* ++** hash table to find a socket reference ++*/ ++#define SBUCKS 1024 // must be multiple of 2! 
++#define SHASHTCP4(x) (((x).raddr+(x).lport+(x).rport)&(SBUCKS-1)) ++#define SHASHUDP(x) ((x)&(SBUCKS-1)) ++ ++struct { ++ struct chainer ch; ++ spinlock_t lock; ++} shash[SBUCKS]; ++ ++static unsigned long nrs; // current number sockinfo allocated ++static unsigned long nrs_ovf; // no sockinfo allocated due to overflow ++static DEFINE_SPINLOCK(nrslock); ++ ++/* ++** various static counters ++*/ ++static unsigned long icmpsndbytes; ++static unsigned long icmpsndpacks; ++static unsigned long icmprcvbytes; ++static unsigned long icmprcvpacks; ++ ++static unsigned long tcpsndpacks; ++static unsigned long tcprcvpacks; ++static unsigned long udpsndpacks; ++static unsigned long udprcvpacks; ++static unsigned long unidentudpsndpacks; ++static unsigned long unidentudprcvpacks; ++static unsigned long unidenttcpsndpacks; ++static unsigned long unidenttcprcvpacks; ++ ++static unsigned long unknownproto; ++ ++static struct timer_list timer; ++static DEFINE_SPINLOCK(gclock); ++static unsigned long long gclast; // last garbage collection (jiffies) ++ ++static struct timespec boottime; ++ ++/* ++** function prototypes ++*/ ++static void analyze_tcpv4_packet(struct sk_buff *, ++ const struct net_device *, int, char, ++ struct iphdr *, void *); ++ ++static void analyze_udp_packet(struct sk_buff *, ++ const struct net_device *, int, char, ++ struct iphdr *, void *); ++ ++static int sock2task(char, struct sockinfo *, ++ struct taskinfo **, short *, ++ struct sk_buff *, const struct net_device *, ++ int, char); ++ ++static void update_taskcounters(struct sk_buff *, ++ const struct net_device *, ++ struct taskinfo *, char); ++ ++static void update_sockcounters(struct sk_buff *, ++ const struct net_device *, ++ struct sockinfo *, char); ++ ++static void sock2task_sync(struct sk_buff *, ++ struct sockinfo *, struct taskinfo *); ++ ++static void register_unident(struct sockinfo *); ++ ++static int calc_reallen(struct sk_buff *, ++ const struct net_device *); ++ ++static void get_tcpv4_ident(struct iphdr *, void *, ++ char, union keydef *); ++ ++static struct sockinfo *find_sockinfo(int, union keydef *, int, int); ++static struct sockinfo *make_sockinfo(int, union keydef *, int, int); ++ ++static void wipesockinfo(void); ++static void wipetaskinfo(void); ++static void wipetaskexit(void); ++ ++static void garbage_collector(void); ++static void gcperiodic(unsigned long unused); ++static void gctaskexit(void); ++static void gcsockinfo(void); ++static void gctaskinfo(void); ++ ++static void move_taskinfo(struct taskinfo *); ++static void delete_taskinfo(struct taskinfo *); ++static void delete_sockinfo(struct sockinfo *); ++ ++static struct taskinfo *get_taskinfo(pid_t, char); ++ ++static int getsockopt(struct sock *, int, void *, int *); ++ ++/* ++** hook definitions ++*/ ++static struct nf_hook_ops hookin_ipv4; ++static struct nf_hook_ops hookout_ipv4; ++ ++/* ++** getsockopt definitions for communication with user space ++*/ ++static struct nf_sockopt_ops sockopts = { ++ .pf = PF_INET, ++ .get_optmin = NETATOP_BASE_CTL, ++ .get_optmax = NETATOP_BASE_CTL+6, ++ .get = getsockopt, ++ .owner = THIS_MODULE, ++}; ++ ++/* ++** hook function to be called for every incoming local packet ++*/ ++static unsigned int ++ipv4_hookin(unsigned int hooknum, ++ struct sk_buff *skb, ++ const struct net_device *in, ++ const struct net_device *out, ++ int (*okfn)(struct sk_buff *)) ++{ ++ struct iphdr *iph; ++ void *trh; ++ ++ if (skb == NULL) // useless socket buffer? 
++ return NF_ACCEPT; ++ ++ /* ++ ** get pointer to IP header and transport header ++ */ ++ iph = (struct iphdr *)skb_network_header(skb); ++ trh = ((char *)iph + (iph->ihl * 4)); ++ ++ /* ++ ** react on protocol number ++ */ ++ switch (iph->protocol) { ++ case IPPROTO_TCP: ++ tcprcvpacks++; ++ analyze_tcpv4_packet(skb, in, 0, 'i', iph, trh); ++ break; ++ ++ case IPPROTO_UDP: ++ udprcvpacks++; ++ analyze_udp_packet(skb, in, 0, 'i', iph, trh); ++ break; ++ ++ case IPPROTO_ICMP: ++ icmprcvpacks++; ++ icmprcvbytes += skb->len + in->hard_header_len + 4; ++ break; ++ ++ default: ++ unknownproto++; ++ } ++ ++ // accept every packet after stats gathering ++ return NF_ACCEPT; ++} ++ ++/* ++** hook function to be called for every outgoing local packet ++*/ ++static unsigned int ++ipv4_hookout(unsigned int hooknum, ++ struct sk_buff *skb, ++ const struct net_device *in, ++ const struct net_device *out, ++ int (*okfn)(struct sk_buff *)) ++{ ++ int in_syscall = !in_interrupt(); ++ struct iphdr *iph; ++ void *trh; ++ ++ if (skb == NULL) // useless socket buffer? ++ return NF_ACCEPT; ++ ++ /* ++ ** get pointer to IP header and transport header ++ */ ++ iph = (struct iphdr *)skb_network_header(skb); ++ trh = skb_transport_header(skb); ++ ++ /* ++ ** react on protocol number ++ */ ++ switch (iph->protocol) { ++ case IPPROTO_TCP: ++ tcpsndpacks++; ++ analyze_tcpv4_packet(skb, out, in_syscall, 'o', iph, trh); ++ break; ++ ++ case IPPROTO_UDP: ++ udpsndpacks++; ++ analyze_udp_packet(skb, out, in_syscall, 'o', iph, trh); ++ break; ++ ++ case IPPROTO_ICMP: ++ icmpsndpacks++; ++ icmpsndbytes += skb->len + out->hard_header_len + 4; ++ break; ++ ++ default: ++ unknownproto++; ++ } ++ ++ // accept every packet after stats gathering ++ return NF_ACCEPT; ++} ++ ++/* ++** generic function (for input and output) to analyze the current packet ++*/ ++static void ++analyze_tcpv4_packet(struct sk_buff *skb, ++ const struct net_device *ndev, // interface description ++ int in_syscall, // called during system call? 
++ char direction, // incoming ('i') or outgoing ('o') ++ struct iphdr *iph, void *trh) ++{ ++ union keydef key; ++ struct sockinfo *sip; ++ int bs; // hash bucket for sockinfo ++ unsigned long sflags; ++ ++ /* ++ ** determine tcpv4_ident that identifies this TCP packet ++ ** and calculate hash bucket in sockinfo hash ++ */ ++ get_tcpv4_ident(iph, trh, direction, &key); ++ ++ /* ++ ** check if we have seen this tcpv4_ident before with a ++ ** corresponding thread and thread group ++ */ ++ bs = SHASHTCP4(key.tcp4); ++ ++ spin_lock_irqsave(&shash[bs].lock, sflags); ++ ++ if ( (sip = find_sockinfo(IPPROTO_TCP, &key, sizeof key.tcp4, bs)) ++ == NULL) { ++ // no sockinfo yet: create one ++ if ( (sip = make_sockinfo(IPPROTO_TCP, &key, ++ sizeof key.tcp4, bs)) == NULL) { ++ if (direction == 'i') ++ unidenttcprcvpacks++; ++ else ++ unidenttcpsndpacks++; ++ goto unlocks; ++ } ++ } ++ ++ if (skb->sk) ++ sip->last_state = skb->sk->sk_state; ++ ++ /* ++ ** if needed (re)connect the sockinfo to a taskinfo and update ++ ** the counters ++ */ ++ ++ // connect to thread group and update ++ if (sock2task('g', sip, &sip->tgp, &sip->tgh, ++ skb, ndev, in_syscall, direction)) { ++ // connect to thread and update ++ (void) sock2task('t', sip, &sip->thp, &sip->thh, ++ skb, ndev, in_syscall, direction); ++ } ++ ++unlocks: ++ spin_unlock_irqrestore(&shash[bs].lock, sflags); ++} ++ ++ ++/* ++** generic function (for input and output) to analyze the current packet ++*/ ++static void ++analyze_udp_packet(struct sk_buff *skb, ++ const struct net_device *ndev, // interface description ++ int in_syscall, // called during system call? ++ char direction, // incoming ('i') or outgoing ('o') ++ struct iphdr *iph, void *trh) ++{ ++ struct udphdr *udph = (struct udphdr *)trh; ++ uint16_t udplocal = (direction == 'i' ? 
++ ntohs(udph->dest) : ntohs(udph->source)); ++ int bs; // hash bucket for sockinfo ++ ++ union keydef key; ++ struct sockinfo *sip; ++ unsigned long sflags; ++ ++ /* ++ ** check if we have seen this local UDP port before with a ++ ** corresponding thread and thread group ++ */ ++ key.udp = udplocal; ++ bs = SHASHUDP(udplocal); ++ ++ spin_lock_irqsave(&shash[bs].lock, sflags); ++ ++ if ( (sip = find_sockinfo(IPPROTO_UDP, &key, sizeof key.udp, bs)) ++ == NULL) { ++ // no sockinfo yet: create one ++ if ( (sip = make_sockinfo(IPPROTO_UDP, &key, ++ sizeof key.udp, bs)) == NULL) { ++ if (direction == 'i') ++ unidentudprcvpacks++; ++ else ++ unidentudpsndpacks++; ++ goto unlocks; ++ } ++ } ++ ++ /* ++ ** if needed (re)connect the sockinfo to a taskinfo and update ++ ** the counters ++ */ ++ ++ // connect to thread group and update ++ if (sock2task('g', sip, &sip->tgp, &sip->tgh, ++ skb, ndev, in_syscall, direction)) { ++ // connect to thread and update ++ (void) sock2task('t', sip, &sip->thp, &sip->thh, ++ skb, ndev, in_syscall, direction); ++ } ++ ++unlocks: ++ spin_unlock_irqrestore(&shash[bs].lock, sflags); ++} ++ ++/* ++** connect the sockinfo to the correct taskinfo and update the counters ++*/ ++static int ++sock2task(char idtype, struct sockinfo *sip, struct taskinfo **tipp, ++ short *hash, struct sk_buff *skb, const struct net_device *ndev, ++ int in_syscall, char direction) ++{ ++ pid_t curid; ++ unsigned long tflags; ++ ++ if (*tipp == NULL) { ++ /* ++ ** no taskinfo connected yet for this reference from ++ ** sockinfo; to connect to a taskinfo, we must ++ ** be in system call handling now --> verify ++ */ ++ if (!in_syscall) { ++ if (idtype == 'g') ++ update_sockcounters(skb, ndev, sip, direction); ++ ++ return 0; // failed ++ } ++ ++ /* ++ ** try to find existing taskinfo or create new taskinfo ++ */ ++ curid = (idtype == 'g' ? current->tgid : current->pid); ++ ++ *hash = THASH(curid, idtype); // calc hashQ ++ ++ spin_lock_irqsave(&thash[*hash].lock, tflags); ++ ++ if ( (*tipp = get_taskinfo(curid, idtype)) == NULL) { ++ /* ++ ** not possible to connect ++ */ ++ spin_unlock_irqrestore(&thash[*hash].lock, tflags); ++ ++ if (idtype == 'g') ++ update_sockcounters(skb, ndev, sip, direction); ++ ++ return 0; // failed ++ } ++ ++ /* ++ ** new connection made: ++ ** update task counters with sock counters ++ */ ++ sock2task_sync(skb, sip, *tipp); ++ } else { ++ /* ++ ** already related to thread group or thread ++ ** lock existing task ++ */ ++ spin_lock_irqsave(&thash[*hash].lock, tflags); ++ ++ /* ++ ** check if socket has been passed to another process in the ++ ** meantime, like programs as xinetd use to do ++ ** if so, connect sockinfo to the new task ++ */ ++ if (in_syscall) { ++ curid = (idtype == 'g' ? 
current->tgid : current->pid); ++ ++ if ((*tipp)->id != curid) { ++ spin_unlock_irqrestore(&thash[*hash].lock, ++ tflags); ++ *hash = THASH(curid, idtype); ++ ++ spin_lock_irqsave(&thash[*hash].lock, tflags); ++ ++ if ( (*tipp = get_taskinfo(curid, idtype)) ++ == NULL) { ++ spin_unlock_irqrestore( ++ &thash[*hash].lock, tflags); ++ return 0; ++ } ++ } ++ } ++ } ++ ++ update_taskcounters(skb, ndev, *tipp, direction); ++ ++ spin_unlock_irqrestore(&thash[*hash].lock, tflags); ++ ++ return 1; ++} ++ ++/* ++** update the statistics of a particular thread group or thread ++*/ ++static void ++update_taskcounters(struct sk_buff *skb, const struct net_device *ndev, ++ struct taskinfo *tip, char direction) ++{ ++ struct iphdr *iph = (struct iphdr *)skb_network_header(skb); ++ int reallen = calc_reallen(skb, ndev); ++ ++ switch (iph->protocol) { ++ case IPPROTO_TCP: ++ if (direction == 'i') { ++ tip->tc.tcprcvpacks++; ++ tip->tc.tcprcvbytes += reallen; ++ } else { ++ tip->tc.tcpsndpacks++; ++ tip->tc.tcpsndbytes += reallen; ++ } ++ break; ++ ++ case IPPROTO_UDP: ++ if (direction == 'i') { ++ tip->tc.udprcvpacks++; ++ tip->tc.udprcvbytes += reallen; ++ } else { ++ tip->tc.udpsndpacks++; ++ tip->tc.udpsndbytes += reallen; ++ } ++ } ++} ++ ++/* ++** update the statistics of a sockinfo without a connected task ++*/ ++static void ++update_sockcounters(struct sk_buff *skb, const struct net_device *ndev, ++ struct sockinfo *sip, char direction) ++{ ++ int reallen = calc_reallen(skb, ndev); ++ ++ if (direction == 'i') { ++ sip->rcvpacks++; ++ sip->rcvbytes += reallen; ++ } else { ++ sip->sndpacks++; ++ sip->sndbytes += reallen; ++ } ++} ++ ++/* ++** add the temporary counters in the sockinfo to the new connected task ++*/ ++static void ++sock2task_sync(struct sk_buff *skb, struct sockinfo *sip, struct taskinfo *tip) ++{ ++ struct iphdr *iph = (struct iphdr *)skb_network_header(skb); ++ ++ switch (iph->protocol) { ++ case IPPROTO_TCP: ++ tip->tc.tcprcvpacks += sip->rcvpacks; ++ tip->tc.tcprcvbytes += sip->rcvbytes; ++ tip->tc.tcpsndpacks += sip->sndpacks; ++ tip->tc.tcpsndbytes += sip->sndbytes; ++ break; ++ ++ case IPPROTO_UDP: ++ tip->tc.udprcvpacks += sip->rcvpacks; ++ tip->tc.udprcvbytes += sip->rcvbytes; ++ tip->tc.udpsndpacks += sip->sndpacks; ++ tip->tc.udpsndbytes += sip->sndbytes; ++ } ++} ++ ++static void ++register_unident(struct sockinfo *sip) ++{ ++ switch (sip->proto) { ++ case IPPROTO_TCP: ++ unidenttcprcvpacks += sip->rcvpacks; ++ unidenttcpsndpacks += sip->sndpacks; ++ break; ++ ++ case IPPROTO_UDP: ++ unidentudprcvpacks += sip->rcvpacks; ++ unidentudpsndpacks += sip->sndpacks; ++ } ++} ++ ++/* ++** calculate the number of bytes that are really sent or received ++*/ ++static int ++calc_reallen(struct sk_buff *skb, const struct net_device *ndev) ++{ ++ /* ++ ** calculate the real load of this packet on the network: ++ ** ++ ** - length of IP header, TCP/UDP header and data (skb->len) ++ ** ++ ** since packet assembly/disassembly is done by the IP layer ++ ** (we get an input packet that has been assembled already and ++ ** an output packet that still has to be assembled), additional ++ ** IP headers/interface headers and interface headers have ++ ** to be calculated for packets that are larger than the mtu ++ ** ++ ** - interface header length + 4 bytes crc ++ */ ++ int reallen = skb->len; ++ ++ if (reallen > ndev->mtu) ++ reallen += (reallen / ndev->mtu) * ++ (sizeof(struct iphdr) + ndev->hard_header_len + 4); ++ ++ reallen += ndev->hard_header_len + 4; ++ ++ return reallen; ++} ++ ++/* ++** 
find the tcpv4_ident for the current packet, represented by ++** the skb_buff ++*/ ++static void ++get_tcpv4_ident(struct iphdr *iph, void *trh, char direction, union keydef *key) ++{ ++ struct tcphdr *tcph = (struct tcphdr *)trh; ++ ++ memset(key, 0, sizeof *key); // important for memcmp later on ++ ++ /* ++ ** determine local/remote IP address and ++ ** determine local/remote port number ++ */ ++ switch (direction) { ++ case 'i': // incoming packet ++ key->tcp4.laddr = ntohl(iph->daddr); ++ key->tcp4.raddr = ntohl(iph->saddr); ++ key->tcp4.lport = ntohs(tcph->dest); ++ key->tcp4.rport = ntohs(tcph->source); ++ break; ++ ++ case 'o': // outgoing packet ++ key->tcp4.laddr = ntohl(iph->saddr); ++ key->tcp4.raddr = ntohl(iph->daddr); ++ key->tcp4.lport = ntohs(tcph->source); ++ key->tcp4.rport = ntohs(tcph->dest); ++ } ++} ++ ++/* ++** search for the sockinfo holding the given address info ++** the appropriate hash bucket must have been locked before calling ++*/ ++static struct sockinfo * ++find_sockinfo(int proto, union keydef *identp, int identsz, int hash) ++{ ++ struct sockinfo *sip = shash[hash].ch.next; ++ ++ /* ++ ** search for appropriate struct ++ */ ++ while (sip != (void *)&shash[hash].ch) { ++ if ( memcmp(&sip->key, identp, identsz) == 0 && ++ sip->proto == proto) { ++ sip->lastact = jiffies_64; ++ return sip; ++ } ++ ++ sip = sip->ch.next; ++ } ++ ++ return NULL; // not existing ++} ++ ++/* ++** create a new sockinfo and fill ++** the appropriate hash bucket must have been locked before calling ++*/ ++static struct sockinfo * ++make_sockinfo(int proto, union keydef *identp, int identsz, int hash) ++{ ++ struct sockinfo *sip; ++ unsigned long flags; ++ ++ /* ++ ** check if the threshold of memory used for sockinfo structs ++ ** is reached to avoid that a fork bomb of processes opening ++ ** a socket leads to memory overload ++ */ ++ if ( (nrs+1) * sizeof(struct sockinfo) > SILIMIT) { ++ spin_lock_irqsave(&nrslock, flags); ++ nrs_ovf++; ++ spin_unlock_irqrestore(&nrslock, flags); ++ return NULL; ++ } ++ ++ if ( (sip = kzalloc(sizeof *sip, GFP_ATOMIC)) == NULL) ++ return NULL; ++ ++ spin_lock_irqsave(&nrslock, flags); ++ nrs++; ++ spin_unlock_irqrestore(&nrslock, flags); ++ ++ /* ++ ** insert new struct in doubly linked list ++ */ ++ sip->ch.next = &shash[hash].ch; ++ sip->ch.prev = shash[hash].ch.prev; ++ ((struct sockinfo *)shash[hash].ch.prev)->ch.next = sip; ++ shash[hash].ch.prev = sip; ++ ++ sip->proto = proto; ++ sip->lastact = jiffies_64; ++ sip->key = *identp; ++ ++ return sip; ++} ++ ++/* ++** search the taskinfo structure holding the info about the given id/type ++** if such taskinfo is not yet present, create a new one ++*/ ++static struct taskinfo * ++get_taskinfo(pid_t id, char type) ++{ ++ int bt = THASH(id, type); ++ struct taskinfo *tip = thash[bt].ch.next; ++ unsigned long tflags; ++ ++ /* ++ ** search if id exists already ++ */ ++ while (tip != (void *)&thash[bt].ch) { ++ if (tip->id == id && tip->type == type) ++ return tip; ++ ++ tip = tip->ch.next; ++ } ++ ++ /* ++ ** check if the threshold of memory used for taskinfo structs ++ ** is reached to avoid that a fork bomb of processes opening ++ ** a socket lead to memory overload ++ */ ++ if ( (nre+nrt+1) * sizeof(struct taskinfo) > TILIMIT) { ++ spin_lock_irqsave(&nrtlock, tflags); ++ nrt_ovf++; ++ spin_unlock_irqrestore(&nrtlock, tflags); ++ return NULL; ++ } ++ ++ /* ++ ** id not known yet ++ ** add new entry to hash list ++ */ ++ if ( (tip = kzalloc(sizeof *tip, GFP_ATOMIC)) == NULL) ++ return NULL; ++ ++ 
spin_lock_irqsave(&nrtlock, tflags); ++ nrt++; ++ spin_unlock_irqrestore(&nrtlock, tflags); ++ ++ /* ++ ** insert new struct in doubly linked list ++ ** and fill values ++ */ ++ tip->ch.next = &thash[bt].ch; ++ tip->ch.prev = thash[bt].ch.prev; ++ ((struct taskinfo *)thash[bt].ch.prev)->ch.next = tip; ++ thash[bt].ch.prev = tip; ++ ++ tip->id = id; ++ tip->type = type; ++ ++ tip->btime = current->real_start_time.tv_sec + boottime.tv_sec; ++ ++ if (current->real_start_time.tv_nsec + boottime.tv_nsec > NSEC_PER_SEC) ++ tip->btime++; ++ ++ strncpy(tip->command, current->comm, COMLEN); ++ ++ return tip; ++} ++ ++/* ++** function that runs every second to see if a ++** time-based garbage collection cycle has to be ++** forced (i.e. if no process forces it) ++*/ ++static void ++gcperiodic(unsigned long unused) ++{ ++ if (jiffies_64 >= gclast + GCINTERVAL) ++ garbage_collector(); ++ ++ /* ++ ** set timer for next second ++ */ ++ timer.expires = jiffies_64 + HZ; ++ timer.function = gcperiodic; ++ add_timer(&timer); ++} ++ ++/* ++** garbage collector that removes: ++** - exited tasks that are not by user mode programs ++** - sockinfo's that are not used any more ++** - taskinfo's that do not exist any more ++** ++** a lock avoids that the garbage collector runs several times in parallel ++*/ ++static void ++garbage_collector(void) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&gclock, flags); ++ ++ if (jiffies_64 < gclast + (HZ/2)) { // maximum 2 GC cycles per second ++ spin_unlock_irqrestore(&gclock, flags); ++ return; ++ } ++ ++ gctaskexit(); // remove remaining taskinfo structs from exit list ++ ++ gcsockinfo(); // clean up sockinfo structs in shash list ++ ++ gctaskinfo(); // clean up taskinfo structs in thash list ++ ++ gclast = jiffies_64; ++ ++ spin_unlock_irqrestore(&gclock, flags); ++} ++ ++/* ++** tasks in the exitlist can be read by a user mode process for a limited ++** amount of time; this function removes all taskinfo structures that have ++** not been read within that period of time ++** notice that exited processes are chained to the tail, so the oldest ++** can be found at the head ++*/ ++static void ++gctaskexit() ++{ ++ unsigned long flags; ++ struct taskinfo *tip; ++ ++ spin_lock_irqsave(&exitlock, flags); ++ ++ for (tip=exithead; tip;) { ++ if (jiffies_64 < tip->exittime + GCINTERVAL) ++ break; ++ ++ // remove taskinfo from exitlist ++ exithead = tip->ch.next; ++ kfree(tip); ++ nre--; ++ tip = exithead; ++ } ++ ++ /* ++ ** if list empty now, then exithead and exittail both NULL ++ ** wakeup waiters for emptylist ++ */ ++ if (nre == 0) { ++ exittail = NULL; ++ wake_up_interruptible(&exitlist_empty); ++ } ++ ++ spin_unlock_irqrestore(&exitlock, flags); ++} ++ ++/* ++** cleanup sockinfo structures that are connected to finished processes ++*/ ++static void ++gcsockinfo() ++{ ++ int i; ++ struct sockinfo *sip, *sipsave; ++ unsigned long sflags, tflags; ++ ++ /* ++ ** go through all sockinfo hash buckets ++ */ ++ for (i=0; i < SBUCKS; i++) { ++ if (shash[i].ch.next == (void *)&shash[i].ch) ++ continue; // quick return without lock ++ ++ spin_lock_irqsave(&shash[i].lock, sflags); ++ ++ sip = shash[i].ch.next; ++ ++ /* ++ ** search all sockinfo structs chained in one bucket ++ */ ++ while (sip != (void *)&shash[i].ch) { ++ /* ++ ** TCP connections that were not in ++ ** state ESTABLISHED or LISTEN can be ++ ** eliminated ++ */ ++ if (sip->proto == IPPROTO_TCP) { ++ switch (sip->last_state) { ++ case TCP_ESTABLISHED: ++ case TCP_LISTEN: ++ break; ++ ++ default: ++ sipsave = 
sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ continue; ++ } ++ } ++ ++ /* ++ ** check if this sockinfo has no relation ++ ** for a while with a thread group ++ ** if so, delete the sockinfo ++ */ ++ if (sip->tgp == NULL) { ++ if (sip->lastact + GCMAXUNREF < jiffies_64) { ++ register_unident(sip); ++ sipsave = sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ } else { ++ sip = sip->ch.next; ++ } ++ continue; ++ } ++ ++ /* ++ ** check if referred thread group is ++ ** already marked as 'indelete' during this ++ ** sockinfo search ++ ** if so, delete this sockinfo ++ */ ++ spin_lock_irqsave(&thash[sip->tgh].lock, tflags); ++ ++ if (sip->tgp->state == INDELETE) { ++ spin_unlock_irqrestore(&thash[sip->tgh].lock, ++ tflags); ++ sipsave = sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ continue; ++ } ++ ++ /* ++ ** check if referred thread group still exists; ++ ** this step will be skipped if we already verified ++ ** the existance of the thread group earlier during ++ ** this garbage collection cycle ++ */ ++ if (sip->tgp->state != CHECKED) { ++ /* ++ ** connected thread group not yet verified ++ ** during this cycle, so check if it still ++ ** exists ++ ** if not, mark the thread group as 'indelete' ++ ** (it can not be deleted right now because ++ ** we might find other sockinfo's referring ++ ** to this thread group during the current ++ ** cycle) and delete this sockinfo ++ ** if the thread group exists, just mark ++ ** it as 'checked' for this cycle ++ */ ++ if (find_vpid(sip->tgp->id) == NULL) { ++ sip->tgp->state = INDELETE; ++ spin_unlock_irqrestore( ++ &thash[sip->tgh].lock, tflags); ++ ++ sipsave = sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ continue; ++ } else { ++ sip->tgp->state = CHECKED; ++ } ++ } ++ ++ spin_unlock_irqrestore(&thash[sip->tgh].lock, tflags); ++ ++ /* ++ ** check if this sockinfo has a relation with a thread ++ ** if not, skip further handling of this sockinfo ++ */ ++ if (sip->thp == NULL) { ++ sip = sip->ch.next; ++ continue; ++ } ++ ++ /* ++ ** check if referred thread is already marked ++ ** as 'indelete' during this sockinfo search ++ ** if so, break connection ++ */ ++ spin_lock_irqsave(&thash[sip->thh].lock, tflags); ++ ++ if (sip->thp->state == INDELETE) { ++ spin_unlock_irqrestore(&thash[sip->thh].lock, ++ tflags); ++ sip->thp = NULL; ++ sip = sip->ch.next; ++ continue; ++ } ++ ++ /* ++ ** check if referred thread is already checked ++ ** during this sockinfo search ++ */ ++ if (sip->thp->state == CHECKED) { ++ spin_unlock_irqrestore(&thash[sip->thh].lock, ++ tflags); ++ sip = sip->ch.next; ++ continue; ++ } ++ ++ /* ++ ** connected thread not yet verified ++ ** check if it still exists ++ ** if not, mark it as 'indelete' and break connection ++ ** if thread exists, mark it 'checked' ++ */ ++ if (find_vpid(sip->thp->id) == NULL) { ++ sip->thp->state = INDELETE; ++ sip->thp = NULL; ++ } else { ++ sip->thp->state = CHECKED; ++ } ++ ++ spin_unlock_irqrestore(&thash[sip->thh].lock, tflags); ++ ++ /* ++ ** check if a TCP port has not been used ++ ** for some time --> destroy even if the thread ++ ** (group) is still there ++ */ ++ if (sip->proto == IPPROTO_TCP && ++ sip->lastact + GCMAXTCP < jiffies_64) { ++ sipsave = sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ continue; ++ } ++ ++ /* ++ ** check if a UDP port has not been used ++ ** for some time --> destroy even if the thread ++ ** (group) is still there ++ ** e.g. 
outgoing DNS requests (to remote port 53) are ++ ** issued every time with another source port being ++ ** a new object that should not be kept too long; ++ ** local well-known ports are useful to keep ++ */ ++ if (sip->proto == IPPROTO_UDP && ++ sip->lastact + GCMAXUDP < jiffies_64 && ++ sip->key.udp > 1024) { ++ sipsave = sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ continue; ++ } ++ ++ sip = sip->ch.next; ++ } ++ ++ spin_unlock_irqrestore(&shash[i].lock, sflags); ++ } ++} ++ ++/* ++** remove taskinfo structures of finished tasks from hash list ++*/ ++static void ++gctaskinfo() ++{ ++ int i; ++ struct taskinfo *tip, *tipsave; ++ unsigned long tflags; ++ ++ /* ++ ** go through all taskinfo hash buckets ++ */ ++ for (i=0; i < TBUCKS; i++) { ++ if (thash[i].ch.next == (void *)&thash[i].ch) ++ continue; // quick return without lock ++ ++ spin_lock_irqsave(&thash[i].lock, tflags); ++ ++ tip = thash[i].ch.next; ++ ++ /* ++ ** check all taskinfo structs chained to this bucket ++ */ ++ while (tip != (void *)&thash[i].ch) { ++ switch (tip->state) { ++ /* ++ ** remove INDELETE tasks from the hash buckets ++ ** -- move thread group to exitlist ++ ** -- destroy thread right away ++ */ ++ case INDELETE: ++ tipsave = tip->ch.next; ++ ++ if (tip->type == 'g') ++ move_taskinfo(tip); // thread group ++ else ++ delete_taskinfo(tip); // thread ++ ++ tip = tipsave; ++ break; ++ ++ case CHECKED: ++ tip->state = 0; ++ tip = tip->ch.next; ++ break; ++ ++ default: // not checked yet ++ if (find_vpid(tip->id) == NULL) { ++ tipsave = tip->ch.next; ++ ++ if (tip->type == 'g') ++ move_taskinfo(tip); ++ else ++ delete_taskinfo(tip); ++ ++ tip = tipsave; ++ } else { ++ tip = tip->ch.next; ++ } ++ } ++ } ++ ++ spin_unlock_irqrestore(&thash[i].lock, tflags); ++ } ++} ++ ++ ++/* ++** remove all sockinfo structs ++*/ ++static void ++wipesockinfo() ++{ ++ struct sockinfo *sip, *sipsave; ++ int i; ++ unsigned long sflags; ++ ++ for (i=0; i < SBUCKS; i++) { ++ spin_lock_irqsave(&shash[i].lock, sflags); ++ ++ sip = shash[i].ch.next; ++ ++ /* ++ ** free all structs chained in one bucket ++ */ ++ while (sip != (void *)&shash[i].ch) { ++ sipsave = sip->ch.next; ++ delete_sockinfo(sip); ++ sip = sipsave; ++ } ++ ++ spin_unlock_irqrestore(&shash[i].lock, sflags); ++ } ++} ++ ++/* ++** remove all taskinfo structs from hash list ++*/ ++static void ++wipetaskinfo() ++{ ++ struct taskinfo *tip, *tipsave; ++ int i; ++ unsigned long tflags; ++ ++ for (i=0; i < TBUCKS; i++) { ++ spin_lock_irqsave(&thash[i].lock, tflags); ++ ++ tip = thash[i].ch.next; ++ ++ /* ++ ** free all structs chained in one bucket ++ */ ++ while (tip != (void *)&thash[i].ch) { ++ tipsave = tip->ch.next; ++ delete_taskinfo(tip); ++ tip = tipsave; ++ } ++ ++ spin_unlock_irqrestore(&thash[i].lock, tflags); ++ } ++} ++ ++/* ++** remove all taskinfo structs from exit list ++*/ ++static void ++wipetaskexit() ++{ ++ gctaskexit(); ++} ++ ++/* ++** move one taskinfo struct from hash bucket to exitlist ++*/ ++static void ++move_taskinfo(struct taskinfo *tip) ++{ ++ unsigned long flags; ++ ++ /* ++ ** remove from hash list ++ */ ++ ((struct taskinfo *)tip->ch.next)->ch.prev = tip->ch.prev; ++ ((struct taskinfo *)tip->ch.prev)->ch.next = tip->ch.next; ++ ++ spin_lock_irqsave(&nrtlock, flags); ++ nrt--; ++ spin_unlock_irqrestore(&nrtlock, flags); ++ ++ /* ++ ** add to exitlist ++ */ ++ tip->ch.next = NULL; ++ tip->state = FINISHED; ++ tip->exittime = jiffies_64; ++ ++ spin_lock_irqsave(&exitlock, flags); ++ ++ if (exittail) { // list filled? 
++ exittail->ch.next = tip; ++ exittail = tip; ++ } else { // list empty ++ exithead = exittail = tip; ++ } ++ ++ nre++; ++ ++ wake_up_interruptible(&exitlist_filled); ++ ++ spin_unlock_irqrestore(&exitlock, flags); ++} ++ ++/* ++** remove one taskinfo struct for the hash bucket chain ++*/ ++static void ++delete_taskinfo(struct taskinfo *tip) ++{ ++ unsigned long flags; ++ ++ ((struct taskinfo *)tip->ch.next)->ch.prev = tip->ch.prev; ++ ((struct taskinfo *)tip->ch.prev)->ch.next = tip->ch.next; ++ ++ kfree(tip); ++ ++ spin_lock_irqsave(&nrtlock, flags); ++ nrt--; ++ spin_unlock_irqrestore(&nrtlock, flags); ++} ++ ++/* ++** remove one sockinfo struct for the hash bucket chain ++*/ ++static void ++delete_sockinfo(struct sockinfo *sip) ++{ ++ unsigned long flags; ++ ++ ((struct sockinfo *)sip->ch.next)->ch.prev = sip->ch.prev; ++ ((struct sockinfo *)sip->ch.prev)->ch.next = sip->ch.next; ++ ++ kfree(sip); ++ ++ spin_lock_irqsave(&nrslock, flags); ++ nrs--; ++ spin_unlock_irqrestore(&nrslock, flags); ++} ++ ++/* ++** read function for /proc/netatop ++*/ ++static int ++netatop_read_proc(char *buf, char **start, off_t offset, ++ int count, int *eof, void *data) ++{ ++ return sprintf(buf, "tcpsndpacks: %9lu (unident: %9lu)\n" ++ "tcprcvpacks: %9lu (unident: %9lu)\n" ++ "udpsndpacks: %9lu (unident: %9lu)\n" ++ "udprcvpacks: %9lu (unident: %9lu)\n\n" ++ "icmpsndpacks: %9lu\n" ++ "icmprcvpacks: %9lu\n\n" ++ "#sockinfo: %9lu (overflow: %8lu)\n" ++ "#taskinfo: %9lu (overflow: %8lu)\n" ++ "#taskexit: %9lu\n", ++ tcpsndpacks, unidenttcpsndpacks, ++ tcprcvpacks, unidenttcprcvpacks, ++ udpsndpacks, unidentudpsndpacks, ++ udprcvpacks, unidentudprcvpacks, ++ icmpsndpacks, icmprcvpacks, ++ nrs, nrs_ovf, ++ nrt, nrt_ovf, ++ nre); ++} ++ ++/* ++** called when user spce issues system call getsockopt() ++*/ ++static int ++getsockopt(struct sock *sk, int cmd, void __user *user, int *len) ++{ ++ int bt; ++ struct taskinfo *tip; ++ char tasktype = 't'; ++ struct netpertask npt; ++ unsigned long tflags; ++ ++ /* ++ ** verify the proper privileges ++ */ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ /* ++ ** react on command ++ */ ++ switch (cmd) { ++ case NETATOP_PROBE: ++ break; ++ ++ case NETATOP_FORCE_GC: ++ garbage_collector(); ++ break; ++ ++ case NETATOP_EMPTY_EXIT: ++ while (nre > 0) { ++ if (wait_event_interruptible(exitlist_empty, nre == 0)) ++ return -ERESTARTSYS; ++ } ++ break; ++ ++ case NETATOP_GETCNT_EXIT: ++ if (nre == 0) ++ wake_up_interruptible(&exitlist_empty); ++ ++ if (*len < sizeof(pid_t)) ++ return -EINVAL; ++ ++ if (*len > sizeof npt) ++ *len = sizeof npt; ++ ++ spin_lock_irqsave(&exitlock, tflags); ++ ++ /* ++ ** check if an exited process is present ++ ** if not, wait for it... 
++ */ ++ while (nre == 0) { ++ spin_unlock_irqrestore(&exitlock, tflags); ++ ++ if ( wait_event_interruptible(exitlist_filled, nre > 0)) ++ return -ERESTARTSYS; ++ ++ spin_lock_irqsave(&exitlock, tflags); ++ } ++ ++ /* ++ ** get first eprocess from exitlist and remove it from there ++ */ ++ tip = exithead; ++ ++ if ( (exithead = tip->ch.next) == NULL) ++ exittail = NULL; ++ ++ nre--; ++ ++ spin_unlock_irqrestore(&exitlock, tflags); ++ ++ /* ++ ** pass relevant info to user mode ++ ** and free taskinfo struct ++ */ ++ npt.id = tip->id; ++ npt.tc = tip->tc; ++ npt.btime = tip->btime; ++ memcpy(npt.command, tip->command, COMLEN); ++ ++ if (copy_to_user(user, &npt, *len) != 0) ++ return -EFAULT; ++ ++ kfree(tip); ++ ++ return 0; ++ ++ case NETATOP_GETCNT_TGID: ++ tasktype = 'g'; ++ ++ case NETATOP_GETCNT_PID: ++ if (*len < sizeof(pid_t)) ++ return -EINVAL; ++ ++ if (*len > sizeof npt) ++ *len = sizeof npt; ++ ++ if (copy_from_user(&npt, user, *len) != 0) ++ return -EFAULT; ++ ++ /* ++ ** search requested id in taskinfo hash ++ */ ++ bt = THASH(npt.id, tasktype); // calculate hash ++ ++ if (thash[bt].ch.next == (void *)&thash[bt].ch) ++ return -ESRCH; // quick return without lock ++ ++ spin_lock_irqsave(&thash[bt].lock, tflags); ++ ++ tip = thash[bt].ch.next; ++ ++ while (tip != (void *)&thash[bt].ch) { ++ // is this the one? ++ if (tip->id == npt.id && tip->type == tasktype) { ++ /* ++ ** found: copy results to user space ++ */ ++ memcpy(npt.command, tip->command, COMLEN); ++ npt.tc = tip->tc; ++ npt.btime = tip->btime; ++ ++ spin_unlock_irqrestore(&thash[bt].lock, tflags); ++ ++ if (copy_to_user(user, &npt, *len) != 0) ++ return -EFAULT; ++ else ++ return 0; ++ } ++ ++ tip = tip->ch.next; ++ } ++ ++ spin_unlock_irqrestore(&thash[bt].lock, tflags); ++ return -ESRCH; ++ ++ default: ++ printk(KERN_INFO "unknown getsockopt command %d\n", cmd); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++/* ++** called when module loaded ++*/ ++int ++init_module() ++{ ++ int i; ++ ++ /* ++ ** initialize various admi ++ */ ++ for (i=0; i < TBUCKS; i++) { ++ thash[i].ch.next = &thash[i].ch; ++ thash[i].ch.prev = &thash[i].ch; ++ spin_lock_init(&thash[i].lock); ++ } ++ ++ for (i=0; i < SBUCKS; i++) { ++ shash[i].ch.next = &shash[i].ch; ++ shash[i].ch.prev = &shash[i].ch; ++ spin_lock_init(&shash[i].lock); ++ } ++ ++ getboottime(&boottime); ++ ++ /* ++ ** register getsockopt for user space communication ++ */ ++ if (nf_register_sockopt(&sockopts) < 0) ++ return -1; ++ ++ /* ++ ** prepare hooks and register ++ */ ++ hookin_ipv4.hooknum = NF_IP_LOCAL_IN; // input packs ++ hookin_ipv4.hook = ipv4_hookin; // func to call ++ hookin_ipv4.pf = PF_INET; // IPV4 packets ++ hookin_ipv4.priority = NF_IP_PRI_FIRST; // highest prio ++ ++ hookout_ipv4.hooknum = NF_IP_LOCAL_OUT; // output packs ++ hookout_ipv4.hook = ipv4_hookout; // func to call ++ hookout_ipv4.pf = PF_INET; // IPV4 packets ++ hookout_ipv4.priority = NF_IP_PRI_FIRST; // highest prio ++ ++ nf_register_hook(&hookin_ipv4); // register hook ++ nf_register_hook(&hookout_ipv4); // register hook ++ ++ /* ++ ** create a /proc-entry to produce status-info on request ++ */ ++ create_proc_read_entry("netatop", 0444, NULL, netatop_read_proc, NULL); ++ ++ /* ++ ** activate timer for periodic call of garbage collector ++ */ ++ init_timer(&timer); ++ ++ timer.expires = jiffies_64 + HZ; ++ timer.function = gcperiodic; ++ add_timer(&timer); ++ ++ return 0; // return success ++} ++ ++/* ++** called when module unloaded ++*/ ++void ++cleanup_module() ++{ ++ 
nf_unregister_hook(&hookin_ipv4); ++ nf_unregister_hook(&hookout_ipv4); ++ ++ remove_proc_entry("netatop", NULL); ++ ++ del_timer(&timer); ++ ++ nf_unregister_sockopt(&sockopts); ++ ++ /* ++ ** destroy allocated stats ++ */ ++ wipesockinfo(); ++ wipetaskinfo(); ++ wipetaskexit(); ++} +diff -uNr linux-3.2.33-go.orig/3rdparty/netatop/netatop.h 3rdparty/netatop/netatop.h +--- linux-3.2.33-go.orig/3rdparty/netatop/netatop.h 1970-01-01 01:00:00.000000000 +0100 ++++ 3rdparty/netatop/netatop.h 2012-11-12 18:08:29.000000000 +0100 +@@ -0,0 +1,47 @@ ++#define COMLEN 16 ++ ++struct taskcount { ++ unsigned long long tcpsndpacks; ++ unsigned long long tcpsndbytes; ++ unsigned long long tcprcvpacks; ++ unsigned long long tcprcvbytes; ++ ++ unsigned long long udpsndpacks; ++ unsigned long long udpsndbytes; ++ unsigned long long udprcvpacks; ++ unsigned long long udprcvbytes; ++ ++ /* space for future extensions */ ++}; ++ ++struct netpertask { ++ pid_t id; // tgid or tid (depending on command) ++ unsigned long btime; ++ char command[COMLEN]; ++ ++ struct taskcount tc; ++}; ++ ++ ++/* ++** getsocktop commands ++*/ ++#define NETATOP_BASE_CTL 15661 ++ ++// just probe if the netatop module is active ++#define NETATOP_PROBE (NETATOP_BASE_CTL) ++ ++// force garbage collection to make finished processes available ++#define NETATOP_FORCE_GC (NETATOP_BASE_CTL+1) ++ ++// wait until all finished processes are read (blocks until done) ++#define NETATOP_EMPTY_EXIT (NETATOP_BASE_CTL+2) ++ ++// get info for finished process (blocks until available) ++#define NETATOP_GETCNT_EXIT (NETATOP_BASE_CTL+3) ++ ++// get counters for thread group (i.e. process): input is 'id' (pid) ++#define NETATOP_GETCNT_TGID (NETATOP_BASE_CTL+4) ++ ++// get counters for thread: input is 'id' (tid) ++#define NETATOP_GETCNT_PID (NETATOP_BASE_CTL+5) +diff -uNr linux-3.2.33-go.orig/3rdparty/netatop/netatopversion.h 3rdparty/netatop/netatopversion.h +--- linux-3.2.33-go.orig/3rdparty/netatop/netatopversion.h 1970-01-01 01:00:00.000000000 +0100 ++++ 3rdparty/netatop/netatopversion.h 2012-11-12 18:08:29.000000000 +0100 +@@ -0,0 +1,2 @@ ++#define NETATOPVERSION "0.1.1" ++#define NETATOPDATE "2012/11/12 18:08:23" diff --git a/3.3.8/600-netfilter_layer7_2.22.patch b/3.3.8/600-netfilter_layer7_2.22.patch new file mode 100644 index 0000000..f305559 --- /dev/null +++ b/3.3.8/600-netfilter_layer7_2.22.patch @@ -0,0 +1,2142 @@ +--- a/net/netfilter/Kconfig ++++ b/net/netfilter/Kconfig +@@ -1053,6 +1053,27 @@ config NETFILTER_XT_MATCH_STATE + + To compile it as a module, choose M here. If unsure, say N. + ++config NETFILTER_XT_MATCH_LAYER7 ++ tristate '"layer7" match support' ++ depends on NETFILTER_XTABLES ++ depends on EXPERIMENTAL && (IP_NF_CONNTRACK || NF_CONNTRACK) ++ depends on NETFILTER_ADVANCED ++ help ++ Say Y if you want to be able to classify connections (and their ++ packets) based on regular expression matching of their application ++ layer data. This is one way to classify applications such as ++ peer-to-peer filesharing systems that do not always use the same ++ port. ++ ++ To compile it as a module, choose M here. If unsure, say N. ++ ++config NETFILTER_XT_MATCH_LAYER7_DEBUG ++ bool 'Layer 7 debugging output' ++ depends on NETFILTER_XT_MATCH_LAYER7 ++ help ++ Say Y to get lots of debugging output. 
++ ++ + config NETFILTER_XT_MATCH_STATISTIC + tristate '"statistic" match support' + depends on NETFILTER_ADVANCED +--- a/net/netfilter/Makefile ++++ b/net/netfilter/Makefile +@@ -105,6 +105,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) + obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o + obj-$(CONFIG_NETFILTER_XT_MATCH_SOCKET) += xt_socket.o + obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o ++obj-$(CONFIG_NETFILTER_XT_MATCH_LAYER7) += xt_layer7.o + obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o + obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o + obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o +--- /dev/null ++++ b/net/netfilter/xt_layer7.c +@@ -0,0 +1,666 @@ ++/* ++ Kernel module to match application layer (OSI layer 7) data in connections. ++ ++ http://l7-filter.sf.net ++ ++ (C) 2003-2009 Matthew Strait and Ethan Sommer. ++ ++ This program is free software; you can redistribute it and/or ++ modify it under the terms of the GNU General Public License ++ as published by the Free Software Foundation; either version ++ 2 of the License, or (at your option) any later version. ++ http://www.gnu.org/licenses/gpl.txt ++ ++ Based on ipt_string.c (C) 2000 Emmanuel Roger , ++ xt_helper.c (C) 2002 Harald Welte and cls_layer7.c (C) 2003 Matthew Strait, ++ Ethan Sommer, Justin Levandoski. ++*/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 27) ++#include ++#include ++#endif ++#include ++#include ++#include ++#include ++ ++#include "regexp/regexp.c" ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Matthew Strait , Ethan Sommer "); ++MODULE_DESCRIPTION("iptables application layer match module"); ++MODULE_ALIAS("ipt_layer7"); ++MODULE_VERSION("2.21"); ++ ++static int maxdatalen = 2048; // this is the default ++module_param(maxdatalen, int, 0444); ++MODULE_PARM_DESC(maxdatalen, "maximum bytes of data looked at by l7-filter"); ++#ifdef CONFIG_NETFILTER_XT_MATCH_LAYER7_DEBUG ++ #define DPRINTK(format,args...) printk(format,##args) ++#else ++ #define DPRINTK(format,args...) ++#endif ++ ++/* Number of packets whose data we look at. ++This can be modified through /proc/net/layer7_numpackets */ ++static int num_packets = 10; ++ ++static struct pattern_cache { ++ char * regex_string; ++ regexp * pattern; ++ struct pattern_cache * next; ++} * first_pattern_cache = NULL; ++ ++DEFINE_SPINLOCK(l7_lock); ++ ++static int total_acct_packets(struct nf_conn *ct) ++{ ++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 26) ++ BUG_ON(ct == NULL); ++ return (ct->counters[IP_CT_DIR_ORIGINAL].packets + ct->counters[IP_CT_DIR_REPLY].packets); ++#else ++ struct nf_conn_counter *acct; ++ ++ BUG_ON(ct == NULL); ++ acct = nf_conn_acct_find(ct); ++ if (!acct) ++ return 0; ++ return (atomic64_read(&acct[IP_CT_DIR_ORIGINAL].packets) + atomic64_read(&acct[IP_CT_DIR_REPLY].packets)); ++#endif ++} ++ ++#ifdef CONFIG_IP_NF_MATCH_LAYER7_DEBUG ++/* Converts an unfriendly string into a friendly one by ++replacing unprintables with periods and all whitespace with " ". 
*/ ++static char * friendly_print(unsigned char * s) ++{ ++ char * f = kmalloc(strlen(s) + 1, GFP_ATOMIC); ++ int i; ++ ++ if(!f) { ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in " ++ "friendly_print, bailing.\n"); ++ return NULL; ++ } ++ ++ for(i = 0; i < strlen(s); i++){ ++ if(isprint(s[i]) && s[i] < 128) f[i] = s[i]; ++ else if(isspace(s[i])) f[i] = ' '; ++ else f[i] = '.'; ++ } ++ f[i] = '\0'; ++ return f; ++} ++ ++static char dec2hex(int i) ++{ ++ switch (i) { ++ case 0 ... 9: ++ return (i + '0'); ++ break; ++ case 10 ... 15: ++ return (i - 10 + 'a'); ++ break; ++ default: ++ if (net_ratelimit()) ++ printk("layer7: Problem in dec2hex\n"); ++ return '\0'; ++ } ++} ++ ++static char * hex_print(unsigned char * s) ++{ ++ char * g = kmalloc(strlen(s)*3 + 1, GFP_ATOMIC); ++ int i; ++ ++ if(!g) { ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in hex_print, " ++ "bailing.\n"); ++ return NULL; ++ } ++ ++ for(i = 0; i < strlen(s); i++) { ++ g[i*3 ] = dec2hex(s[i]/16); ++ g[i*3 + 1] = dec2hex(s[i]%16); ++ g[i*3 + 2] = ' '; ++ } ++ g[i*3] = '\0'; ++ ++ return g; ++} ++#endif // DEBUG ++ ++/* Use instead of regcomp. As we expect to be seeing the same regexps over and ++over again, it make sense to cache the results. */ ++static regexp * compile_and_cache(const char * regex_string, ++ const char * protocol) ++{ ++ struct pattern_cache * node = first_pattern_cache; ++ struct pattern_cache * last_pattern_cache = first_pattern_cache; ++ struct pattern_cache * tmp; ++ unsigned int len; ++ ++ while (node != NULL) { ++ if (!strcmp(node->regex_string, regex_string)) ++ return node->pattern; ++ ++ last_pattern_cache = node;/* points at the last non-NULL node */ ++ node = node->next; ++ } ++ ++ /* If we reach the end of the list, then we have not yet cached ++ the pattern for this regex. Let's do that now. ++ Be paranoid about running out of memory to avoid list corruption. */ ++ tmp = kmalloc(sizeof(struct pattern_cache), GFP_ATOMIC); ++ ++ if(!tmp) { ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in " ++ "compile_and_cache, bailing.\n"); ++ return NULL; ++ } ++ ++ tmp->regex_string = kmalloc(strlen(regex_string) + 1, GFP_ATOMIC); ++ tmp->pattern = kmalloc(sizeof(struct regexp), GFP_ATOMIC); ++ tmp->next = NULL; ++ ++ if(!tmp->regex_string || !tmp->pattern) { ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in " ++ "compile_and_cache, bailing.\n"); ++ kfree(tmp->regex_string); ++ kfree(tmp->pattern); ++ kfree(tmp); ++ return NULL; ++ } ++ ++ /* Ok. The new node is all ready now. */ ++ node = tmp; ++ ++ if(first_pattern_cache == NULL) /* list is empty */ ++ first_pattern_cache = node; /* make node the beginning */ ++ else ++ last_pattern_cache->next = node; /* attach node to the end */ ++ ++ /* copy the string and compile the regex */ ++ len = strlen(regex_string); ++ DPRINTK("About to compile this: \"%s\"\n", regex_string); ++ node->pattern = regcomp((char *)regex_string, &len); ++ if ( !node->pattern ) { ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: Error compiling regexp " ++ "\"%s\" (%s)\n", ++ regex_string, protocol); ++ /* pattern is now cached as NULL, so we won't try again. 
*/ ++ } ++ ++ strcpy(node->regex_string, regex_string); ++ return node->pattern; ++} ++ ++static int can_handle(const struct sk_buff *skb) ++{ ++ if(!ip_hdr(skb)) /* not IP */ ++ return 0; ++ if(ip_hdr(skb)->protocol != IPPROTO_TCP && ++ ip_hdr(skb)->protocol != IPPROTO_UDP && ++ ip_hdr(skb)->protocol != IPPROTO_ICMP) ++ return 0; ++ return 1; ++} ++ ++/* Returns offset the into the skb->data that the application data starts */ ++static int app_data_offset(const struct sk_buff *skb) ++{ ++ /* In case we are ported somewhere (ebtables?) where ip_hdr(skb) ++ isn't set, this can be gotten from 4*(skb->data[0] & 0x0f) as well. */ ++ int ip_hl = 4*ip_hdr(skb)->ihl; ++ ++ if( ip_hdr(skb)->protocol == IPPROTO_TCP ) { ++ /* 12 == offset into TCP header for the header length field. ++ Can't get this with skb->h.th->doff because the tcphdr ++ struct doesn't get set when routing (this is confirmed to be ++ true in Netfilter as well as QoS.) */ ++ int tcp_hl = 4*(skb->data[ip_hl + 12] >> 4); ++ ++ return ip_hl + tcp_hl; ++ } else if( ip_hdr(skb)->protocol == IPPROTO_UDP ) { ++ return ip_hl + 8; /* UDP header is always 8 bytes */ ++ } else if( ip_hdr(skb)->protocol == IPPROTO_ICMP ) { ++ return ip_hl + 8; /* ICMP header is 8 bytes */ ++ } else { ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: tried to handle unknown " ++ "protocol!\n"); ++ return ip_hl + 8; /* something reasonable */ ++ } ++} ++ ++/* handles whether there's a match when we aren't appending data anymore */ ++static int match_no_append(struct nf_conn * conntrack, ++ struct nf_conn * master_conntrack, ++ enum ip_conntrack_info ctinfo, ++ enum ip_conntrack_info master_ctinfo, ++ const struct xt_layer7_info * info) ++{ ++ /* If we're in here, throw the app data away */ ++ if(master_conntrack->layer7.app_data != NULL) { ++ ++ #ifdef CONFIG_IP_NF_MATCH_LAYER7_DEBUG ++ if(!master_conntrack->layer7.app_proto) { ++ char * f = ++ friendly_print(master_conntrack->layer7.app_data); ++ char * g = ++ hex_print(master_conntrack->layer7.app_data); ++ DPRINTK("\nl7-filter gave up after %d bytes " ++ "(%d packets):\n%s\n", ++ strlen(f), total_acct_packets(master_conntrack), f); ++ kfree(f); ++ DPRINTK("In hex: %s\n", g); ++ kfree(g); ++ } ++ #endif ++ ++ kfree(master_conntrack->layer7.app_data); ++ master_conntrack->layer7.app_data = NULL; /* don't free again */ ++ } ++ ++ if(master_conntrack->layer7.app_proto){ ++ /* Here child connections set their .app_proto (for /proc) */ ++ if(!conntrack->layer7.app_proto) { ++ conntrack->layer7.app_proto = ++ kmalloc(strlen(master_conntrack->layer7.app_proto)+1, ++ GFP_ATOMIC); ++ if(!conntrack->layer7.app_proto){ ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory " ++ "in match_no_append, " ++ "bailing.\n"); ++ return 1; ++ } ++ strcpy(conntrack->layer7.app_proto, ++ master_conntrack->layer7.app_proto); ++ } ++ ++ return (!strcmp(master_conntrack->layer7.app_proto, ++ info->protocol)); ++ } ++ else { ++ /* If not classified, set to "unknown" to distinguish from ++ connections that are still being tested. */ ++ master_conntrack->layer7.app_proto = ++ kmalloc(strlen("unknown")+1, GFP_ATOMIC); ++ if(!master_conntrack->layer7.app_proto){ ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in " ++ "match_no_append, bailing.\n"); ++ return 1; ++ } ++ strcpy(master_conntrack->layer7.app_proto, "unknown"); ++ return 0; ++ } ++} ++ ++/* add the new app data to the conntrack. Return number of bytes added. 
*/ ++static int add_data(struct nf_conn * master_conntrack, ++ char * app_data, int appdatalen) ++{ ++ int length = 0, i; ++ int oldlength = master_conntrack->layer7.app_data_len; ++ ++ /* This is a fix for a race condition by Deti Fliegl. However, I'm not ++ clear on whether the race condition exists or whether this really ++ fixes it. I might just be being dense... Anyway, if it's not really ++ a fix, all it does is waste a very small amount of time. */ ++ if(!master_conntrack->layer7.app_data) return 0; ++ ++ /* Strip nulls. Make everything lower case (our regex lib doesn't ++ do case insensitivity). Add it to the end of the current data. */ ++ for(i = 0; i < maxdatalen-oldlength-1 && ++ i < appdatalen; i++) { ++ if(app_data[i] != '\0') { ++ /* the kernel version of tolower mungs 'upper ascii' */ ++ master_conntrack->layer7.app_data[length+oldlength] = ++ isascii(app_data[i])? ++ tolower(app_data[i]) : app_data[i]; ++ length++; ++ } ++ } ++ ++ master_conntrack->layer7.app_data[length+oldlength] = '\0'; ++ master_conntrack->layer7.app_data_len = length + oldlength; ++ ++ return length; ++} ++ ++/* taken from drivers/video/modedb.c */ ++static int my_atoi(const char *s) ++{ ++ int val = 0; ++ ++ for (;; s++) { ++ switch (*s) { ++ case '0'...'9': ++ val = 10*val+(*s-'0'); ++ break; ++ default: ++ return val; ++ } ++ } ++} ++ ++/* write out num_packets to userland. */ ++static int layer7_read_proc(char* page, char ** start, off_t off, int count, ++ int* eof, void * data) ++{ ++ if(num_packets > 99 && net_ratelimit()) ++ printk(KERN_ERR "layer7: NOT REACHED. num_packets too big\n"); ++ ++ page[0] = num_packets/10 + '0'; ++ page[1] = num_packets%10 + '0'; ++ page[2] = '\n'; ++ page[3] = '\0'; ++ ++ *eof=1; ++ ++ return 3; ++} ++ ++/* Read in num_packets from userland */ ++static int layer7_write_proc(struct file* file, const char* buffer, ++ unsigned long count, void *data) ++{ ++ char * foo = kmalloc(count, GFP_ATOMIC); ++ ++ if(!foo){ ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory, bailing. " ++ "num_packets unchanged.\n"); ++ return count; ++ } ++ ++ if(copy_from_user(foo, buffer, count)) { ++ return -EFAULT; ++ } ++ ++ ++ num_packets = my_atoi(foo); ++ kfree (foo); ++ ++ /* This has an arbitrary limit to make the math easier. I'm lazy. ++ But anyway, 99 is a LOT! If you want more, you're doing it wrong! */ ++ if(num_packets > 99) { ++ printk(KERN_WARNING "layer7: num_packets can't be > 99.\n"); ++ num_packets = 99; ++ } else if(num_packets < 1) { ++ printk(KERN_WARNING "layer7: num_packets can't be < 1.\n"); ++ num_packets = 1; ++ } ++ ++ return count; ++} ++ ++static bool ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28) ++match(const struct sk_buff *skbin, const struct xt_match_param *par) ++#else ++match(const struct sk_buff *skbin, ++ const struct net_device *in, ++ const struct net_device *out, ++ const struct xt_match *match, ++ const void *matchinfo, ++ int offset, ++ unsigned int protoff, ++ bool *hotdrop) ++#endif ++{ ++ /* sidestep const without getting a compiler warning... */ ++ struct sk_buff * skb = (struct sk_buff *)skbin; ++ ++ const struct xt_layer7_info * info = ++ #if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28) ++ par->matchinfo; ++ #else ++ matchinfo; ++ #endif ++ ++ enum ip_conntrack_info master_ctinfo, ctinfo; ++ struct nf_conn *master_conntrack, *conntrack; ++ unsigned char * app_data; ++ unsigned int pattern_result, appdatalen; ++ regexp * comppattern; ++ ++ /* Be paranoid/incompetent - lock the entire match function. 
*/ ++ spin_lock_bh(&l7_lock); ++ ++ if(!can_handle(skb)){ ++ DPRINTK("layer7: This is some protocol I can't handle.\n"); ++ spin_unlock_bh(&l7_lock); ++ return info->invert; ++ } ++ ++ /* Treat parent & all its children together as one connection, except ++ for the purpose of setting conntrack->layer7.app_proto in the actual ++ connection. This makes /proc/net/ip_conntrack more satisfying. */ ++ if(!(conntrack = nf_ct_get(skb, &ctinfo)) || ++ !(master_conntrack=nf_ct_get(skb,&master_ctinfo))){ ++ DPRINTK("layer7: couldn't get conntrack.\n"); ++ spin_unlock_bh(&l7_lock); ++ return info->invert; ++ } ++ ++ /* Try to get a master conntrack (and its master etc) for FTP, etc. */ ++ while (master_ct(master_conntrack) != NULL) ++ master_conntrack = master_ct(master_conntrack); ++ ++ /* if we've classified it or seen too many packets */ ++ if(total_acct_packets(master_conntrack) > num_packets || ++ master_conntrack->layer7.app_proto) { ++ ++ pattern_result = match_no_append(conntrack, master_conntrack, ++ ctinfo, master_ctinfo, info); ++ ++ /* skb->cb[0] == seen. Don't do things twice if there are ++ multiple l7 rules. I'm not sure that using cb for this purpose ++ is correct, even though it says "put your private variables ++ there". But it doesn't look like it is being used for anything ++ else in the skbs that make it here. */ ++ skb->cb[0] = 1; /* marking it seen here's probably irrelevant */ ++ ++ spin_unlock_bh(&l7_lock); ++ return (pattern_result ^ info->invert); ++ } ++ ++ if(skb_is_nonlinear(skb)){ ++ if(skb_linearize(skb) != 0){ ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: failed to linearize " ++ "packet, bailing.\n"); ++ spin_unlock_bh(&l7_lock); ++ return info->invert; ++ } ++ } ++ ++ /* now that the skb is linearized, it's safe to set these. */ ++ app_data = skb->data + app_data_offset(skb); ++ appdatalen = skb_tail_pointer(skb) - app_data; ++ ++ /* the return value gets checked later, when we're ready to use it */ ++ comppattern = compile_and_cache(info->pattern, info->protocol); ++ ++ /* On the first packet of a connection, allocate space for app data */ ++ if(total_acct_packets(master_conntrack) == 1 && !skb->cb[0] && ++ !master_conntrack->layer7.app_data){ ++ master_conntrack->layer7.app_data = ++ kmalloc(maxdatalen, GFP_ATOMIC); ++ if(!master_conntrack->layer7.app_data){ ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in " ++ "match, bailing.\n"); ++ spin_unlock_bh(&l7_lock); ++ return info->invert; ++ } ++ ++ master_conntrack->layer7.app_data[0] = '\0'; ++ } ++ ++ /* Can be here, but unallocated, if numpackets is increased near ++ the beginning of a connection */ ++ if(master_conntrack->layer7.app_data == NULL){ ++ spin_unlock_bh(&l7_lock); ++ return info->invert; /* unmatched */ ++ } ++ ++ if(!skb->cb[0]){ ++ int newbytes; ++ newbytes = add_data(master_conntrack, app_data, appdatalen); ++ ++ if(newbytes == 0) { /* didn't add any data */ ++ skb->cb[0] = 1; ++ /* Didn't match before, not going to match now */ ++ spin_unlock_bh(&l7_lock); ++ return info->invert; ++ } ++ } ++ ++ /* If looking for "unknown", then never match. "Unknown" means that ++ we've given up; we're still trying with these packets. */ ++ if(!strcmp(info->protocol, "unknown")) { ++ pattern_result = 0; ++ /* If looking for "unset", then always match. "Unset" means that we ++ haven't yet classified the connection. 
*/ ++ } else if(!strcmp(info->protocol, "unset")) { ++ pattern_result = 2; ++ DPRINTK("layer7: matched unset: not yet classified " ++ "(%d/%d packets)\n", ++ total_acct_packets(master_conntrack), num_packets); ++ /* If the regexp failed to compile, don't bother running it */ ++ } else if(comppattern && ++ regexec(comppattern, master_conntrack->layer7.app_data)){ ++ DPRINTK("layer7: matched %s\n", info->protocol); ++ pattern_result = 1; ++ } else pattern_result = 0; ++ ++ if(pattern_result == 1) { ++ master_conntrack->layer7.app_proto = ++ kmalloc(strlen(info->protocol)+1, GFP_ATOMIC); ++ if(!master_conntrack->layer7.app_proto){ ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in " ++ "match, bailing.\n"); ++ spin_unlock_bh(&l7_lock); ++ return (pattern_result ^ info->invert); ++ } ++ strcpy(master_conntrack->layer7.app_proto, info->protocol); ++ } else if(pattern_result > 1) { /* cleanup from "unset" */ ++ pattern_result = 1; ++ } ++ ++ /* mark the packet seen */ ++ skb->cb[0] = 1; ++ ++ spin_unlock_bh(&l7_lock); ++ return (pattern_result ^ info->invert); ++} ++ ++// load nf_conntrack_ipv4 ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28) ++static bool check(const struct xt_mtchk_param *par) ++{ ++ if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { ++ printk(KERN_WARNING "can't load conntrack support for " ++ "proto=%d\n", par->match->family); ++#else ++static bool check(const char *tablename, const void *inf, ++ const struct xt_match *match, void *matchinfo, ++ unsigned int hook_mask) ++{ ++ if (nf_ct_l3proto_try_module_get(match->family) < 0) { ++ printk(KERN_WARNING "can't load conntrack support for " ++ "proto=%d\n", match->family); ++#endif ++ return 0; ++ } ++ return 1; ++} ++ ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28) ++ static void destroy(const struct xt_mtdtor_param *par) ++ { ++ nf_ct_l3proto_module_put(par->match->family); ++ } ++#else ++ static void destroy(const struct xt_match *match, void *matchinfo) ++ { ++ nf_ct_l3proto_module_put(match->family); ++ } ++#endif ++ ++static struct xt_match xt_layer7_match[] __read_mostly = { ++{ ++ .name = "layer7", ++ .family = AF_INET, ++ .checkentry = check, ++ .match = match, ++ .destroy = destroy, ++ .matchsize = sizeof(struct xt_layer7_info), ++ .me = THIS_MODULE ++} ++}; ++ ++static void layer7_cleanup_proc(void) ++{ ++ remove_proc_entry("layer7_numpackets", init_net.proc_net); ++} ++ ++/* register the proc file */ ++static void layer7_init_proc(void) ++{ ++ struct proc_dir_entry* entry; ++ entry = create_proc_entry("layer7_numpackets", 0644, init_net.proc_net); ++ entry->read_proc = layer7_read_proc; ++ entry->write_proc = layer7_write_proc; ++} ++ ++static int __init xt_layer7_init(void) ++{ ++ need_conntrack(); ++ ++ layer7_init_proc(); ++ if(maxdatalen < 1) { ++ printk(KERN_WARNING "layer7: maxdatalen can't be < 1, " ++ "using 1\n"); ++ maxdatalen = 1; ++ } ++ /* This is not a hard limit. It's just here to prevent people from ++ bringing their slow machines to a grinding halt. 
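
The branch above boils down to a small decision table: a rule for "unknown" never matches once the classifier has data, a rule for "unset" matches while the connection is still unclassified (the value 2 is folded back to 1 before returning), and any other protocol name matches only if the cached regexp fires; the result is finally XOR-ed with the rule's invert flag. A stand-alone sketch of just that decision (helper names are made up):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* 0 = no match, 1 = match, 2 = "unset" rule on an unclassified connection. */
    static int layer7_pattern_result(const char *rule_proto, bool regexp_hit)
    {
            if (!strcmp(rule_proto, "unknown"))
                    return 0;               /* classifier has given up: never match */
            if (!strcmp(rule_proto, "unset"))
                    return 2;               /* not classified yet: always match     */
            return regexp_hit ? 1 : 0;      /* ordinary protocol: ask the regexp    */
    }

    static bool layer7_verdict(const char *rule_proto, bool regexp_hit, bool invert)
    {
            int r = layer7_pattern_result(rule_proto, regexp_hit);

            if (r > 1)
                    r = 1;                  /* fold the "unset" case back to 1 */
            return r ^ invert;
    }

    int main(void)
    {
            printf("proto \"http\", regexp hit:      %d\n",
                   layer7_verdict("http", true, false));
            printf("proto \"unset\", fresh conn:     %d\n",
                   layer7_verdict("unset", false, false));
            printf("proto \"unknown\", inverted:     %d\n",
                   layer7_verdict("unknown", false, true));
            return 0;
    }
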
*/ ++ else if(maxdatalen > 65536) { ++ printk(KERN_WARNING "layer7: maxdatalen can't be > 65536, " ++ "using 65536\n"); ++ maxdatalen = 65536; ++ } ++ return xt_register_matches(xt_layer7_match, ++ ARRAY_SIZE(xt_layer7_match)); ++} ++ ++static void __exit xt_layer7_fini(void) ++{ ++ layer7_cleanup_proc(); ++ xt_unregister_matches(xt_layer7_match, ARRAY_SIZE(xt_layer7_match)); ++} ++ ++module_init(xt_layer7_init); ++module_exit(xt_layer7_fini); +--- /dev/null ++++ b/net/netfilter/regexp/regexp.c +@@ -0,0 +1,1197 @@ ++/* ++ * regcomp and regexec -- regsub and regerror are elsewhere ++ * @(#)regexp.c 1.3 of 18 April 87 ++ * ++ * Copyright (c) 1986 by University of Toronto. ++ * Written by Henry Spencer. Not derived from licensed software. ++ * ++ * Permission is granted to anyone to use this software for any ++ * purpose on any computer system, and to redistribute it freely, ++ * subject to the following restrictions: ++ * ++ * 1. The author is not responsible for the consequences of use of ++ * this software, no matter how awful, even if they arise ++ * from defects in it. ++ * ++ * 2. The origin of this software must not be misrepresented, either ++ * by explicit claim or by omission. ++ * ++ * 3. Altered versions must be plainly marked as such, and must not ++ * be misrepresented as being the original software. ++ * ++ * Beware that some of this code is subtly aware of the way operator ++ * precedence is structured in regular expressions. Serious changes in ++ * regular-expression syntax might require a total rethink. ++ * ++ * This code was modified by Ethan Sommer to work within the kernel ++ * (it now uses kmalloc etc..) ++ * ++ * Modified slightly by Matthew Strait to use more modern C. ++ */ ++ ++#include "regexp.h" ++#include "regmagic.h" ++ ++/* added by ethan and matt. Lets it work in both kernel and user space. ++(So iptables can use it, for instance.) Yea, it goes both ways... */ ++#if __KERNEL__ ++ #define malloc(foo) kmalloc(foo,GFP_ATOMIC) ++#else ++ #define printk(format,args...) printf(format,##args) ++#endif ++ ++void regerror(char * s) ++{ ++ printk("<3>Regexp: %s\n", s); ++ /* NOTREACHED */ ++} ++ ++/* ++ * The "internal use only" fields in regexp.h are present to pass info from ++ * compile to execute that permits the execute phase to run lots faster on ++ * simple cases. They are: ++ * ++ * regstart char that must begin a match; '\0' if none obvious ++ * reganch is the match anchored (at beginning-of-line only)? ++ * regmust string (pointer into program) that match must include, or NULL ++ * regmlen length of regmust string ++ * ++ * Regstart and reganch permit very fast decisions on suitable starting points ++ * for a match, cutting down the work a lot. Regmust permits fast rejection ++ * of lines that cannot possibly match. The regmust tests are costly enough ++ * that regcomp() supplies a regmust only if the r.e. contains something ++ * potentially expensive (at present, the only such thing detected is * or + ++ * at the start of the r.e., which can involve a lot of backup). Regmlen is ++ * supplied because the test in regexec() needs it and regcomp() is computing ++ * it anyway. ++ */ ++ ++/* ++ * Structure for regexp "program". This is essentially a linear encoding ++ * of a nondeterministic finite-state machine (aka syntax charts or ++ * "railroad normal form" in parsing technology). Each node is an opcode ++ * plus a "next" pointer, possibly plus an operand. 
"Next" pointers of ++ * all nodes except BRANCH implement concatenation; a "next" pointer with ++ * a BRANCH on both ends of it is connecting two alternatives. (Here we ++ * have one of the subtle syntax dependencies: an individual BRANCH (as ++ * opposed to a collection of them) is never concatenated with anything ++ * because of operator precedence.) The operand of some types of node is ++ * a literal string; for others, it is a node leading into a sub-FSM. In ++ * particular, the operand of a BRANCH node is the first node of the branch. ++ * (NB this is *not* a tree structure: the tail of the branch connects ++ * to the thing following the set of BRANCHes.) The opcodes are: ++ */ ++ ++/* definition number opnd? meaning */ ++#define END 0 /* no End of program. */ ++#define BOL 1 /* no Match "" at beginning of line. */ ++#define EOL 2 /* no Match "" at end of line. */ ++#define ANY 3 /* no Match any one character. */ ++#define ANYOF 4 /* str Match any character in this string. */ ++#define ANYBUT 5 /* str Match any character not in this string. */ ++#define BRANCH 6 /* node Match this alternative, or the next... */ ++#define BACK 7 /* no Match "", "next" ptr points backward. */ ++#define EXACTLY 8 /* str Match this string. */ ++#define NOTHING 9 /* no Match empty string. */ ++#define STAR 10 /* node Match this (simple) thing 0 or more times. */ ++#define PLUS 11 /* node Match this (simple) thing 1 or more times. */ ++#define OPEN 20 /* no Mark this point in input as start of #n. */ ++ /* OPEN+1 is number 1, etc. */ ++#define CLOSE 30 /* no Analogous to OPEN. */ ++ ++/* ++ * Opcode notes: ++ * ++ * BRANCH The set of branches constituting a single choice are hooked ++ * together with their "next" pointers, since precedence prevents ++ * anything being concatenated to any individual branch. The ++ * "next" pointer of the last BRANCH in a choice points to the ++ * thing following the whole choice. This is also where the ++ * final "next" pointer of each individual branch points; each ++ * branch starts with the operand node of a BRANCH node. ++ * ++ * BACK Normal "next" pointers all implicitly point forward; BACK ++ * exists to make loop structures possible. ++ * ++ * STAR,PLUS '?', and complex '*' and '+', are implemented as circular ++ * BRANCH structures using BACK. Simple cases (one character ++ * per match) are implemented with STAR and PLUS for speed ++ * and to minimize recursive plunges. ++ * ++ * OPEN,CLOSE ...are numbered at compile time. ++ */ ++ ++/* ++ * A node is one char of opcode followed by two chars of "next" pointer. ++ * "Next" pointers are stored as two 8-bit pieces, high order first. The ++ * value is a positive offset from the opcode of the node containing it. ++ * An operand, if any, simply follows the node. (Note that much of the ++ * code generation knows about this implicit relationship.) ++ * ++ * Using two bytes for the "next" pointer is vast overkill for most things, ++ * but allows patterns to get big without disasters. ++ */ ++#define OP(p) (*(p)) ++#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) ++#define OPERAND(p) ((p) + 3) ++ ++/* ++ * See regmagic.h for one further detail of program structure. ++ */ ++ ++ ++/* ++ * Utility definitions. 
++ */ ++#ifndef CHARBITS ++#define UCHARAT(p) ((int)*(unsigned char *)(p)) ++#else ++#define UCHARAT(p) ((int)*(p)&CHARBITS) ++#endif ++ ++#define FAIL(m) { regerror(m); return(NULL); } ++#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') ++#define META "^$.[()|?+*\\" ++ ++/* ++ * Flags to be passed up and down. ++ */ ++#define HASWIDTH 01 /* Known never to match null string. */ ++#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ ++#define SPSTART 04 /* Starts with * or +. */ ++#define WORST 0 /* Worst case. */ ++ ++/* ++ * Global work variables for regcomp(). ++ */ ++struct match_globals { ++char *reginput; /* String-input pointer. */ ++char *regbol; /* Beginning of input, for ^ check. */ ++char **regstartp; /* Pointer to startp array. */ ++char **regendp; /* Ditto for endp. */ ++char *regparse; /* Input-scan pointer. */ ++int regnpar; /* () count. */ ++char regdummy; ++char *regcode; /* Code-emit pointer; ®dummy = don't. */ ++long regsize; /* Code size. */ ++}; ++ ++/* ++ * Forward declarations for regcomp()'s friends. ++ */ ++#ifndef STATIC ++#define STATIC static ++#endif ++STATIC char *reg(struct match_globals *g, int paren,int *flagp); ++STATIC char *regbranch(struct match_globals *g, int *flagp); ++STATIC char *regpiece(struct match_globals *g, int *flagp); ++STATIC char *regatom(struct match_globals *g, int *flagp); ++STATIC char *regnode(struct match_globals *g, char op); ++STATIC char *regnext(struct match_globals *g, char *p); ++STATIC void regc(struct match_globals *g, char b); ++STATIC void reginsert(struct match_globals *g, char op, char *opnd); ++STATIC void regtail(struct match_globals *g, char *p, char *val); ++STATIC void regoptail(struct match_globals *g, char *p, char *val); ++ ++ ++__kernel_size_t my_strcspn(const char *s1,const char *s2) ++{ ++ char *scan1; ++ char *scan2; ++ int count; ++ ++ count = 0; ++ for (scan1 = (char *)s1; *scan1 != '\0'; scan1++) { ++ for (scan2 = (char *)s2; *scan2 != '\0';) /* ++ moved down. */ ++ if (*scan1 == *scan2++) ++ return(count); ++ count++; ++ } ++ return(count); ++} ++ ++/* ++ - regcomp - compile a regular expression into internal code ++ * ++ * We can't allocate space until we know how big the compiled form will be, ++ * but we can't compile it (and thus know how big it is) until we've got a ++ * place to put the code. So we cheat: we compile it twice, once with code ++ * generation turned off and size counting turned on, and once "for real". ++ * This also means that we don't allocate space until we are sure that the ++ * thing really will compile successfully, and we never have to move the ++ * code and thus invalidate pointers into it. (Note that it has to be in ++ * one piece because free() must be able to free it all.) ++ * ++ * Beware that the optimization-preparation code in here knows about some ++ * of the structure of the compiled regexp. ++ */ ++regexp * ++regcomp(char *exp,int *patternsize) ++{ ++ register regexp *r; ++ register char *scan; ++ register char *longest; ++ register int len; ++ int flags; ++ struct match_globals g; ++ ++ /* commented out by ethan ++ extern char *malloc(); ++ */ ++ ++ if (exp == NULL) ++ FAIL("NULL argument"); ++ ++ /* First pass: determine size, legality. */ ++ g.regparse = exp; ++ g.regnpar = 1; ++ g.regsize = 0L; ++ g.regcode = &g.regdummy; ++ regc(&g, MAGIC); ++ if (reg(&g, 0, &flags) == NULL) ++ return(NULL); ++ ++ /* Small enough for pointer-storage convention? */ ++ if (g.regsize >= 32767L) /* Probably could be 65535L. 
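
The two-pass trick regcomp() uses below can be seen in isolation: during the first pass regcode points at a one-byte dummy, so every emit merely bumps regsize; the buffer is then allocated once and the second pass emits for real, which is why a compiled program never has to be moved. A minimal user-space sketch, with made-up emitter and demo-program names:

    #include <stdio.h>
    #include <stdlib.h>

    struct emitter {
            char  regdummy;         /* sink for the counting pass          */
            char *regcode;          /* &regdummy means: don't emit, count  */
            long  regsize;
    };

    static void regc_like(struct emitter *e, char b)
    {
            if (e->regcode != &e->regdummy)
                    *e->regcode++ = b;
            else
                    e->regsize++;
    }

    static void emit_program(struct emitter *e)
    {
            const char demo[] = { 6, 0, 9, 8, 'f', 'o', 'o', 0, 0, 0, 0 };

            for (size_t i = 0; i < sizeof(demo); i++)
                    regc_like(e, demo[i]);
    }

    int main(void)
    {
            struct emitter e = { .regsize = 0 };
            char *buf;

            e.regcode = &e.regdummy;        /* pass 1: count only          */
            emit_program(&e);

            buf = malloc(e.regsize);        /* exactly one allocation      */
            e.regcode = buf;                /* pass 2: emit for real       */
            emit_program(&e);

            printf("compiled into %ld bytes\n", e.regsize);
            free(buf);
            return 0;
    }
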
*/ ++ FAIL("regexp too big"); ++ ++ /* Allocate space. */ ++ *patternsize=sizeof(regexp) + (unsigned)g.regsize; ++ r = (regexp *)malloc(sizeof(regexp) + (unsigned)g.regsize); ++ if (r == NULL) ++ FAIL("out of space"); ++ ++ /* Second pass: emit code. */ ++ g.regparse = exp; ++ g.regnpar = 1; ++ g.regcode = r->program; ++ regc(&g, MAGIC); ++ if (reg(&g, 0, &flags) == NULL) ++ return(NULL); ++ ++ /* Dig out information for optimizations. */ ++ r->regstart = '\0'; /* Worst-case defaults. */ ++ r->reganch = 0; ++ r->regmust = NULL; ++ r->regmlen = 0; ++ scan = r->program+1; /* First BRANCH. */ ++ if (OP(regnext(&g, scan)) == END) { /* Only one top-level choice. */ ++ scan = OPERAND(scan); ++ ++ /* Starting-point info. */ ++ if (OP(scan) == EXACTLY) ++ r->regstart = *OPERAND(scan); ++ else if (OP(scan) == BOL) ++ r->reganch++; ++ ++ /* ++ * If there's something expensive in the r.e., find the ++ * longest literal string that must appear and make it the ++ * regmust. Resolve ties in favor of later strings, since ++ * the regstart check works with the beginning of the r.e. ++ * and avoiding duplication strengthens checking. Not a ++ * strong reason, but sufficient in the absence of others. ++ */ ++ if (flags&SPSTART) { ++ longest = NULL; ++ len = 0; ++ for (; scan != NULL; scan = regnext(&g, scan)) ++ if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { ++ longest = OPERAND(scan); ++ len = strlen(OPERAND(scan)); ++ } ++ r->regmust = longest; ++ r->regmlen = len; ++ } ++ } ++ ++ return(r); ++} ++ ++/* ++ - reg - regular expression, i.e. main body or parenthesized thing ++ * ++ * Caller must absorb opening parenthesis. ++ * ++ * Combining parenthesis handling with the base level of regular expression ++ * is a trifle forced, but the need to tie the tails of the branches to what ++ * follows makes it hard to avoid. ++ */ ++static char * ++reg(struct match_globals *g, int paren, int *flagp /* Parenthesized? */ ) ++{ ++ register char *ret; ++ register char *br; ++ register char *ender; ++ register int parno = 0; /* 0 makes gcc happy */ ++ int flags; ++ ++ *flagp = HASWIDTH; /* Tentatively. */ ++ ++ /* Make an OPEN node, if parenthesized. */ ++ if (paren) { ++ if (g->regnpar >= NSUBEXP) ++ FAIL("too many ()"); ++ parno = g->regnpar; ++ g->regnpar++; ++ ret = regnode(g, OPEN+parno); ++ } else ++ ret = NULL; ++ ++ /* Pick up the branches, linking them together. */ ++ br = regbranch(g, &flags); ++ if (br == NULL) ++ return(NULL); ++ if (ret != NULL) ++ regtail(g, ret, br); /* OPEN -> first. */ ++ else ++ ret = br; ++ if (!(flags&HASWIDTH)) ++ *flagp &= ~HASWIDTH; ++ *flagp |= flags&SPSTART; ++ while (*g->regparse == '|') { ++ g->regparse++; ++ br = regbranch(g, &flags); ++ if (br == NULL) ++ return(NULL); ++ regtail(g, ret, br); /* BRANCH -> BRANCH. */ ++ if (!(flags&HASWIDTH)) ++ *flagp &= ~HASWIDTH; ++ *flagp |= flags&SPSTART; ++ } ++ ++ /* Make a closing node, and hook it on the end. */ ++ ender = regnode(g, (paren) ? CLOSE+parno : END); ++ regtail(g, ret, ender); ++ ++ /* Hook the tails of the branches to the closing node. */ ++ for (br = ret; br != NULL; br = regnext(g, br)) ++ regoptail(g, br, ender); ++ ++ /* Check for proper termination. */ ++ if (paren && *g->regparse++ != ')') { ++ FAIL("unmatched ()"); ++ } else if (!paren && *g->regparse != '\0') { ++ if (*g->regparse == ')') { ++ FAIL("unmatched ()"); ++ } else ++ FAIL("junk on end"); /* "Can't happen". 
*/ ++ /* NOTREACHED */ ++ } ++ ++ return(ret); ++} ++ ++/* ++ - regbranch - one alternative of an | operator ++ * ++ * Implements the concatenation operator. ++ */ ++static char * ++regbranch(struct match_globals *g, int *flagp) ++{ ++ register char *ret; ++ register char *chain; ++ register char *latest; ++ int flags; ++ ++ *flagp = WORST; /* Tentatively. */ ++ ++ ret = regnode(g, BRANCH); ++ chain = NULL; ++ while (*g->regparse != '\0' && *g->regparse != '|' && *g->regparse != ')') { ++ latest = regpiece(g, &flags); ++ if (latest == NULL) ++ return(NULL); ++ *flagp |= flags&HASWIDTH; ++ if (chain == NULL) /* First piece. */ ++ *flagp |= flags&SPSTART; ++ else ++ regtail(g, chain, latest); ++ chain = latest; ++ } ++ if (chain == NULL) /* Loop ran zero times. */ ++ (void) regnode(g, NOTHING); ++ ++ return(ret); ++} ++ ++/* ++ - regpiece - something followed by possible [*+?] ++ * ++ * Note that the branching code sequences used for ? and the general cases ++ * of * and + are somewhat optimized: they use the same NOTHING node as ++ * both the endmarker for their branch list and the body of the last branch. ++ * It might seem that this node could be dispensed with entirely, but the ++ * endmarker role is not redundant. ++ */ ++static char * ++regpiece(struct match_globals *g, int *flagp) ++{ ++ register char *ret; ++ register char op; ++ register char *next; ++ int flags; ++ ++ ret = regatom(g, &flags); ++ if (ret == NULL) ++ return(NULL); ++ ++ op = *g->regparse; ++ if (!ISMULT(op)) { ++ *flagp = flags; ++ return(ret); ++ } ++ ++ if (!(flags&HASWIDTH) && op != '?') ++ FAIL("*+ operand could be empty"); ++ *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); ++ ++ if (op == '*' && (flags&SIMPLE)) ++ reginsert(g, STAR, ret); ++ else if (op == '*') { ++ /* Emit x* as (x&|), where & means "self". */ ++ reginsert(g, BRANCH, ret); /* Either x */ ++ regoptail(g, ret, regnode(g, BACK)); /* and loop */ ++ regoptail(g, ret, ret); /* back */ ++ regtail(g, ret, regnode(g, BRANCH)); /* or */ ++ regtail(g, ret, regnode(g, NOTHING)); /* null. */ ++ } else if (op == '+' && (flags&SIMPLE)) ++ reginsert(g, PLUS, ret); ++ else if (op == '+') { ++ /* Emit x+ as x(&|), where & means "self". */ ++ next = regnode(g, BRANCH); /* Either */ ++ regtail(g, ret, next); ++ regtail(g, regnode(g, BACK), ret); /* loop back */ ++ regtail(g, next, regnode(g, BRANCH)); /* or */ ++ regtail(g, ret, regnode(g, NOTHING)); /* null. */ ++ } else if (op == '?') { ++ /* Emit x? as (x|) */ ++ reginsert(g, BRANCH, ret); /* Either x */ ++ regtail(g, ret, regnode(g, BRANCH)); /* or */ ++ next = regnode(g, NOTHING); /* null. */ ++ regtail(g, ret, next); ++ regoptail(g, ret, next); ++ } ++ g->regparse++; ++ if (ISMULT(*g->regparse)) ++ FAIL("nested *?+"); ++ ++ return(ret); ++} ++ ++/* ++ - regatom - the lowest level ++ * ++ * Optimization: gobbles an entire sequence of ordinary characters so that ++ * it can turn them into a single node, which is smaller to store and ++ * faster to run. Backslashed characters are exceptions, each becoming a ++ * separate node; the code is simpler that way and it's not worth fixing. ++ */ ++static char * ++regatom(struct match_globals *g, int *flagp) ++{ ++ register char *ret; ++ int flags; ++ ++ *flagp = WORST; /* Tentatively. 
*/ ++ ++ switch (*g->regparse++) { ++ case '^': ++ ret = regnode(g, BOL); ++ break; ++ case '$': ++ ret = regnode(g, EOL); ++ break; ++ case '.': ++ ret = regnode(g, ANY); ++ *flagp |= HASWIDTH|SIMPLE; ++ break; ++ case '[': { ++ register int class; ++ register int classend; ++ ++ if (*g->regparse == '^') { /* Complement of range. */ ++ ret = regnode(g, ANYBUT); ++ g->regparse++; ++ } else ++ ret = regnode(g, ANYOF); ++ if (*g->regparse == ']' || *g->regparse == '-') ++ regc(g, *g->regparse++); ++ while (*g->regparse != '\0' && *g->regparse != ']') { ++ if (*g->regparse == '-') { ++ g->regparse++; ++ if (*g->regparse == ']' || *g->regparse == '\0') ++ regc(g, '-'); ++ else { ++ class = UCHARAT(g->regparse-2)+1; ++ classend = UCHARAT(g->regparse); ++ if (class > classend+1) ++ FAIL("invalid [] range"); ++ for (; class <= classend; class++) ++ regc(g, class); ++ g->regparse++; ++ } ++ } else ++ regc(g, *g->regparse++); ++ } ++ regc(g, '\0'); ++ if (*g->regparse != ']') ++ FAIL("unmatched []"); ++ g->regparse++; ++ *flagp |= HASWIDTH|SIMPLE; ++ } ++ break; ++ case '(': ++ ret = reg(g, 1, &flags); ++ if (ret == NULL) ++ return(NULL); ++ *flagp |= flags&(HASWIDTH|SPSTART); ++ break; ++ case '\0': ++ case '|': ++ case ')': ++ FAIL("internal urp"); /* Supposed to be caught earlier. */ ++ break; ++ case '?': ++ case '+': ++ case '*': ++ FAIL("?+* follows nothing"); ++ break; ++ case '\\': ++ if (*g->regparse == '\0') ++ FAIL("trailing \\"); ++ ret = regnode(g, EXACTLY); ++ regc(g, *g->regparse++); ++ regc(g, '\0'); ++ *flagp |= HASWIDTH|SIMPLE; ++ break; ++ default: { ++ register int len; ++ register char ender; ++ ++ g->regparse--; ++ len = my_strcspn((const char *)g->regparse, (const char *)META); ++ if (len <= 0) ++ FAIL("internal disaster"); ++ ender = *(g->regparse+len); ++ if (len > 1 && ISMULT(ender)) ++ len--; /* Back off clear of ?+* operand. */ ++ *flagp |= HASWIDTH; ++ if (len == 1) ++ *flagp |= SIMPLE; ++ ret = regnode(g, EXACTLY); ++ while (len > 0) { ++ regc(g, *g->regparse++); ++ len--; ++ } ++ regc(g, '\0'); ++ } ++ break; ++ } ++ ++ return(ret); ++} ++ ++/* ++ - regnode - emit a node ++ */ ++static char * /* Location. */ ++regnode(struct match_globals *g, char op) ++{ ++ register char *ret; ++ register char *ptr; ++ ++ ret = g->regcode; ++ if (ret == &g->regdummy) { ++ g->regsize += 3; ++ return(ret); ++ } ++ ++ ptr = ret; ++ *ptr++ = op; ++ *ptr++ = '\0'; /* Null "next" pointer. */ ++ *ptr++ = '\0'; ++ g->regcode = ptr; ++ ++ return(ret); ++} ++ ++/* ++ - regc - emit (if appropriate) a byte of code ++ */ ++static void ++regc(struct match_globals *g, char b) ++{ ++ if (g->regcode != &g->regdummy) ++ *g->regcode++ = b; ++ else ++ g->regsize++; ++} ++ ++/* ++ - reginsert - insert an operator in front of already-emitted operand ++ * ++ * Means relocating the operand. ++ */ ++static void ++reginsert(struct match_globals *g, char op, char* opnd) ++{ ++ register char *src; ++ register char *dst; ++ register char *place; ++ ++ if (g->regcode == &g->regdummy) { ++ g->regsize += 3; ++ return; ++ } ++ ++ src = g->regcode; ++ g->regcode += 3; ++ dst = g->regcode; ++ while (src > opnd) ++ *--dst = *--src; ++ ++ place = opnd; /* Op node, where operand used to be. 
*/ ++ *place++ = op; ++ *place++ = '\0'; ++ *place++ = '\0'; ++} ++ ++/* ++ - regtail - set the next-pointer at the end of a node chain ++ */ ++static void ++regtail(struct match_globals *g, char *p, char *val) ++{ ++ register char *scan; ++ register char *temp; ++ register int offset; ++ ++ if (p == &g->regdummy) ++ return; ++ ++ /* Find last node. */ ++ scan = p; ++ for (;;) { ++ temp = regnext(g, scan); ++ if (temp == NULL) ++ break; ++ scan = temp; ++ } ++ ++ if (OP(scan) == BACK) ++ offset = scan - val; ++ else ++ offset = val - scan; ++ *(scan+1) = (offset>>8)&0377; ++ *(scan+2) = offset&0377; ++} ++ ++/* ++ - regoptail - regtail on operand of first argument; nop if operandless ++ */ ++static void ++regoptail(struct match_globals *g, char *p, char *val) ++{ ++ /* "Operandless" and "op != BRANCH" are synonymous in practice. */ ++ if (p == NULL || p == &g->regdummy || OP(p) != BRANCH) ++ return; ++ regtail(g, OPERAND(p), val); ++} ++ ++/* ++ * regexec and friends ++ */ ++ ++ ++/* ++ * Forwards. ++ */ ++STATIC int regtry(struct match_globals *g, regexp *prog, char *string); ++STATIC int regmatch(struct match_globals *g, char *prog); ++STATIC int regrepeat(struct match_globals *g, char *p); ++ ++#ifdef DEBUG ++int regnarrate = 0; ++void regdump(); ++STATIC char *regprop(char *op); ++#endif ++ ++/* ++ - regexec - match a regexp against a string ++ */ ++int ++regexec(regexp *prog, char *string) ++{ ++ register char *s; ++ struct match_globals g; ++ ++ /* Be paranoid... */ ++ if (prog == NULL || string == NULL) { ++ printk("<3>Regexp: NULL parameter\n"); ++ return(0); ++ } ++ ++ /* Check validity of program. */ ++ if (UCHARAT(prog->program) != MAGIC) { ++ printk("<3>Regexp: corrupted program\n"); ++ return(0); ++ } ++ ++ /* If there is a "must appear" string, look for it. */ ++ if (prog->regmust != NULL) { ++ s = string; ++ while ((s = strchr(s, prog->regmust[0])) != NULL) { ++ if (strncmp(s, prog->regmust, prog->regmlen) == 0) ++ break; /* Found it. */ ++ s++; ++ } ++ if (s == NULL) /* Not present. */ ++ return(0); ++ } ++ ++ /* Mark beginning of line for ^ . */ ++ g.regbol = string; ++ ++ /* Simplest case: anchored match need be tried only once. */ ++ if (prog->reganch) ++ return(regtry(&g, prog, string)); ++ ++ /* Messy cases: unanchored match. */ ++ s = string; ++ if (prog->regstart != '\0') ++ /* We know what char it must start with. */ ++ while ((s = strchr(s, prog->regstart)) != NULL) { ++ if (regtry(&g, prog, s)) ++ return(1); ++ s++; ++ } ++ else ++ /* We don't -- general case. */ ++ do { ++ if (regtry(&g, prog, s)) ++ return(1); ++ } while (*s++ != '\0'); ++ ++ /* Failure. */ ++ return(0); ++} ++ ++/* ++ - regtry - try match at specific point ++ */ ++static int /* 0 failure, 1 success */ ++regtry(struct match_globals *g, regexp *prog, char *string) ++{ ++ register int i; ++ register char **sp; ++ register char **ep; ++ ++ g->reginput = string; ++ g->regstartp = prog->startp; ++ g->regendp = prog->endp; ++ ++ sp = prog->startp; ++ ep = prog->endp; ++ for (i = NSUBEXP; i > 0; i--) { ++ *sp++ = NULL; ++ *ep++ = NULL; ++ } ++ if (regmatch(g, prog->program + 1)) { ++ prog->startp[0] = string; ++ prog->endp[0] = g->reginput; ++ return(1); ++ } else ++ return(0); ++} ++ ++/* ++ - regmatch - main matching routine ++ * ++ * Conceptually the strategy is simple: check to see whether the current ++ * node matches, call self recursively to see whether the rest matches, ++ * and then act accordingly. 
In practice we make some effort to avoid ++ * recursion, in particular by going through "ordinary" nodes (that don't ++ * need to know whether the rest of the match failed) by a loop instead of ++ * by recursion. ++ */ ++static int /* 0 failure, 1 success */ ++regmatch(struct match_globals *g, char *prog) ++{ ++ register char *scan = prog; /* Current node. */ ++ char *next; /* Next node. */ ++ ++#ifdef DEBUG ++ if (scan != NULL && regnarrate) ++ fprintf(stderr, "%s(\n", regprop(scan)); ++#endif ++ while (scan != NULL) { ++#ifdef DEBUG ++ if (regnarrate) ++ fprintf(stderr, "%s...\n", regprop(scan)); ++#endif ++ next = regnext(g, scan); ++ ++ switch (OP(scan)) { ++ case BOL: ++ if (g->reginput != g->regbol) ++ return(0); ++ break; ++ case EOL: ++ if (*g->reginput != '\0') ++ return(0); ++ break; ++ case ANY: ++ if (*g->reginput == '\0') ++ return(0); ++ g->reginput++; ++ break; ++ case EXACTLY: { ++ register int len; ++ register char *opnd; ++ ++ opnd = OPERAND(scan); ++ /* Inline the first character, for speed. */ ++ if (*opnd != *g->reginput) ++ return(0); ++ len = strlen(opnd); ++ if (len > 1 && strncmp(opnd, g->reginput, len) != 0) ++ return(0); ++ g->reginput += len; ++ } ++ break; ++ case ANYOF: ++ if (*g->reginput == '\0' || strchr(OPERAND(scan), *g->reginput) == NULL) ++ return(0); ++ g->reginput++; ++ break; ++ case ANYBUT: ++ if (*g->reginput == '\0' || strchr(OPERAND(scan), *g->reginput) != NULL) ++ return(0); ++ g->reginput++; ++ break; ++ case NOTHING: ++ case BACK: ++ break; ++ case OPEN+1: ++ case OPEN+2: ++ case OPEN+3: ++ case OPEN+4: ++ case OPEN+5: ++ case OPEN+6: ++ case OPEN+7: ++ case OPEN+8: ++ case OPEN+9: { ++ register int no; ++ register char *save; ++ ++ no = OP(scan) - OPEN; ++ save = g->reginput; ++ ++ if (regmatch(g, next)) { ++ /* ++ * Don't set startp if some later ++ * invocation of the same parentheses ++ * already has. ++ */ ++ if (g->regstartp[no] == NULL) ++ g->regstartp[no] = save; ++ return(1); ++ } else ++ return(0); ++ } ++ break; ++ case CLOSE+1: ++ case CLOSE+2: ++ case CLOSE+3: ++ case CLOSE+4: ++ case CLOSE+5: ++ case CLOSE+6: ++ case CLOSE+7: ++ case CLOSE+8: ++ case CLOSE+9: ++ { ++ register int no; ++ register char *save; ++ ++ no = OP(scan) - CLOSE; ++ save = g->reginput; ++ ++ if (regmatch(g, next)) { ++ /* ++ * Don't set endp if some later ++ * invocation of the same parentheses ++ * already has. ++ */ ++ if (g->regendp[no] == NULL) ++ g->regendp[no] = save; ++ return(1); ++ } else ++ return(0); ++ } ++ break; ++ case BRANCH: { ++ register char *save; ++ ++ if (OP(next) != BRANCH) /* No choice. */ ++ next = OPERAND(scan); /* Avoid recursion. */ ++ else { ++ do { ++ save = g->reginput; ++ if (regmatch(g, OPERAND(scan))) ++ return(1); ++ g->reginput = save; ++ scan = regnext(g, scan); ++ } while (scan != NULL && OP(scan) == BRANCH); ++ return(0); ++ /* NOTREACHED */ ++ } ++ } ++ break; ++ case STAR: ++ case PLUS: { ++ register char nextch; ++ register int no; ++ register char *save; ++ register int min; ++ ++ /* ++ * Lookahead to avoid useless match attempts ++ * when we know what character comes next. ++ */ ++ nextch = '\0'; ++ if (OP(next) == EXACTLY) ++ nextch = *OPERAND(next); ++ min = (OP(scan) == STAR) ? 0 : 1; ++ save = g->reginput; ++ no = regrepeat(g, OPERAND(scan)); ++ while (no >= min) { ++ /* If it could work, try it. */ ++ if (nextch == '\0' || *g->reginput == nextch) ++ if (regmatch(g, next)) ++ return(1); ++ /* Couldn't or didn't -- back up. 
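
The STAR/PLUS case above is the usual greedy strategy: regrepeat() grabs as many repetitions as possible and the loop then gives them back one at a time until the rest of the pattern matches. A stand-alone sketch for the simplest shape, a repeated character followed by a literal tail (the helper name is made up):

    #include <stdio.h>
    #include <string.h>

    /* Match c repeated zero or more times, then the literal tail, at the
     * start of s.  Greedy first, then back off, as in the STAR case. */
    static int match_star_then(const char *s, char c, const char *tail)
    {
            int no = 0, min = 0;

            while (s[no] == c)              /* regrepeat(): count repetitions */
                    no++;

            while (no >= min) {             /* back off until the tail fits   */
                    if (strncmp(s + no, tail, strlen(tail)) == 0)
                            return 1;
                    no--;
            }
            return 0;
    }

    int main(void)
    {
            printf("%d\n", match_star_then("aaab", 'a', "ab"));  /* 1 */
            printf("%d\n", match_star_then("aaab", 'a', "ba"));  /* 0 */
            return 0;
    }
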
*/ ++ no--; ++ g->reginput = save + no; ++ } ++ return(0); ++ } ++ break; ++ case END: ++ return(1); /* Success! */ ++ break; ++ default: ++ printk("<3>Regexp: memory corruption\n"); ++ return(0); ++ break; ++ } ++ ++ scan = next; ++ } ++ ++ /* ++ * We get here only if there's trouble -- normally "case END" is ++ * the terminating point. ++ */ ++ printk("<3>Regexp: corrupted pointers\n"); ++ return(0); ++} ++ ++/* ++ - regrepeat - repeatedly match something simple, report how many ++ */ ++static int ++regrepeat(struct match_globals *g, char *p) ++{ ++ register int count = 0; ++ register char *scan; ++ register char *opnd; ++ ++ scan = g->reginput; ++ opnd = OPERAND(p); ++ switch (OP(p)) { ++ case ANY: ++ count = strlen(scan); ++ scan += count; ++ break; ++ case EXACTLY: ++ while (*opnd == *scan) { ++ count++; ++ scan++; ++ } ++ break; ++ case ANYOF: ++ while (*scan != '\0' && strchr(opnd, *scan) != NULL) { ++ count++; ++ scan++; ++ } ++ break; ++ case ANYBUT: ++ while (*scan != '\0' && strchr(opnd, *scan) == NULL) { ++ count++; ++ scan++; ++ } ++ break; ++ default: /* Oh dear. Called inappropriately. */ ++ printk("<3>Regexp: internal foulup\n"); ++ count = 0; /* Best compromise. */ ++ break; ++ } ++ g->reginput = scan; ++ ++ return(count); ++} ++ ++/* ++ - regnext - dig the "next" pointer out of a node ++ */ ++static char* ++regnext(struct match_globals *g, char *p) ++{ ++ register int offset; ++ ++ if (p == &g->regdummy) ++ return(NULL); ++ ++ offset = NEXT(p); ++ if (offset == 0) ++ return(NULL); ++ ++ if (OP(p) == BACK) ++ return(p-offset); ++ else ++ return(p+offset); ++} ++ ++#ifdef DEBUG ++ ++STATIC char *regprop(); ++ ++/* ++ - regdump - dump a regexp onto stdout in vaguely comprehensible form ++ */ ++void ++regdump(regexp *r) ++{ ++ register char *s; ++ register char op = EXACTLY; /* Arbitrary non-END op. */ ++ register char *next; ++ /* extern char *strchr(); */ ++ ++ ++ s = r->program + 1; ++ while (op != END) { /* While that wasn't END last time... */ ++ op = OP(s); ++ printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ ++ next = regnext(s); ++ if (next == NULL) /* Next ptr. */ ++ printf("(0)"); ++ else ++ printf("(%d)", (s-r->program)+(next-s)); ++ s += 3; ++ if (op == ANYOF || op == ANYBUT || op == EXACTLY) { ++ /* Literal string, where present. */ ++ while (*s != '\0') { ++ putchar(*s); ++ s++; ++ } ++ s++; ++ } ++ putchar('\n'); ++ } ++ ++ /* Header fields of interest. 
*/ ++ if (r->regstart != '\0') ++ printf("start `%c' ", r->regstart); ++ if (r->reganch) ++ printf("anchored "); ++ if (r->regmust != NULL) ++ printf("must have \"%s\"", r->regmust); ++ printf("\n"); ++} ++ ++/* ++ - regprop - printable representation of opcode ++ */ ++static char * ++regprop(char *op) ++{ ++#define BUFLEN 50 ++ register char *p; ++ static char buf[BUFLEN]; ++ ++ strcpy(buf, ":"); ++ ++ switch (OP(op)) { ++ case BOL: ++ p = "BOL"; ++ break; ++ case EOL: ++ p = "EOL"; ++ break; ++ case ANY: ++ p = "ANY"; ++ break; ++ case ANYOF: ++ p = "ANYOF"; ++ break; ++ case ANYBUT: ++ p = "ANYBUT"; ++ break; ++ case BRANCH: ++ p = "BRANCH"; ++ break; ++ case EXACTLY: ++ p = "EXACTLY"; ++ break; ++ case NOTHING: ++ p = "NOTHING"; ++ break; ++ case BACK: ++ p = "BACK"; ++ break; ++ case END: ++ p = "END"; ++ break; ++ case OPEN+1: ++ case OPEN+2: ++ case OPEN+3: ++ case OPEN+4: ++ case OPEN+5: ++ case OPEN+6: ++ case OPEN+7: ++ case OPEN+8: ++ case OPEN+9: ++ snprintf(buf+strlen(buf),BUFLEN-strlen(buf), "OPEN%d", OP(op)-OPEN); ++ p = NULL; ++ break; ++ case CLOSE+1: ++ case CLOSE+2: ++ case CLOSE+3: ++ case CLOSE+4: ++ case CLOSE+5: ++ case CLOSE+6: ++ case CLOSE+7: ++ case CLOSE+8: ++ case CLOSE+9: ++ snprintf(buf+strlen(buf),BUFLEN-strlen(buf), "CLOSE%d", OP(op)-CLOSE); ++ p = NULL; ++ break; ++ case STAR: ++ p = "STAR"; ++ break; ++ case PLUS: ++ p = "PLUS"; ++ break; ++ default: ++ printk("<3>Regexp: corrupted opcode\n"); ++ break; ++ } ++ if (p != NULL) ++ strncat(buf, p, BUFLEN-strlen(buf)); ++ return(buf); ++} ++#endif ++ ++ +--- /dev/null ++++ b/net/netfilter/regexp/regexp.h +@@ -0,0 +1,41 @@ ++/* ++ * Definitions etc. for regexp(3) routines. ++ * ++ * Caveat: this is V8 regexp(3) [actually, a reimplementation thereof], ++ * not the System V one. ++ */ ++ ++#ifndef REGEXP_H ++#define REGEXP_H ++ ++ ++/* ++http://www.opensource.apple.com/darwinsource/10.3/expect-1/expect/expect.h , ++which contains a version of this library, says: ++ ++ * ++ * NSUBEXP must be at least 10, and no greater than 117 or the parser ++ * will not work properly. ++ * ++ ++However, it looks rather like this library is limited to 10. If you think ++otherwise, let us know. ++*/ ++ ++#define NSUBEXP 10 ++typedef struct regexp { ++ char *startp[NSUBEXP]; ++ char *endp[NSUBEXP]; ++ char regstart; /* Internal use only. */ ++ char reganch; /* Internal use only. */ ++ char *regmust; /* Internal use only. */ ++ int regmlen; /* Internal use only. */ ++ char program[1]; /* Unwarranted chumminess with compiler. */ ++} regexp; ++ ++regexp * regcomp(char *exp, int *patternsize); ++int regexec(regexp *prog, char *string); ++void regsub(regexp *prog, char *source, char *dest); ++void regerror(char *s); ++ ++#endif +--- /dev/null ++++ b/net/netfilter/regexp/regmagic.h +@@ -0,0 +1,5 @@ ++/* ++ * The first byte of the regexp internal "program" is actually this magic ++ * number; the start node begins in the second byte. ++ */ ++#define MAGIC 0234 +--- /dev/null ++++ b/net/netfilter/regexp/regsub.c +@@ -0,0 +1,95 @@ ++/* ++ * regsub ++ * @(#)regsub.c 1.3 of 2 April 86 ++ * ++ * Copyright (c) 1986 by University of Toronto. ++ * Written by Henry Spencer. Not derived from licensed software. ++ * ++ * Permission is granted to anyone to use this software for any ++ * purpose on any computer system, and to redistribute it freely, ++ * subject to the following restrictions: ++ * ++ * 1. The author is not responsible for the consequences of use of ++ * this software, no matter how awful, even if they arise ++ * from defects in it. 
++ * ++ * 2. The origin of this software must not be misrepresented, either ++ * by explicit claim or by omission. ++ * ++ * 3. Altered versions must be plainly marked as such, and must not ++ * be misrepresented as being the original software. ++ * ++ * ++ * This code was modified by Ethan Sommer to work within the kernel ++ * (it now uses kmalloc etc..) ++ * ++ */ ++#include "regexp.h" ++#include "regmagic.h" ++#include ++ ++ ++#ifndef CHARBITS ++#define UCHARAT(p) ((int)*(unsigned char *)(p)) ++#else ++#define UCHARAT(p) ((int)*(p)&CHARBITS) ++#endif ++ ++#if 0 ++//void regerror(char * s) ++//{ ++// printk("regexp(3): %s", s); ++// /* NOTREACHED */ ++//} ++#endif ++ ++/* ++ - regsub - perform substitutions after a regexp match ++ */ ++void ++regsub(regexp * prog, char * source, char * dest) ++{ ++ register char *src; ++ register char *dst; ++ register char c; ++ register int no; ++ register int len; ++ ++ /* Not necessary and gcc doesn't like it -MLS */ ++ /*extern char *strncpy();*/ ++ ++ if (prog == NULL || source == NULL || dest == NULL) { ++ regerror("NULL parm to regsub"); ++ return; ++ } ++ if (UCHARAT(prog->program) != MAGIC) { ++ regerror("damaged regexp fed to regsub"); ++ return; ++ } ++ ++ src = source; ++ dst = dest; ++ while ((c = *src++) != '\0') { ++ if (c == '&') ++ no = 0; ++ else if (c == '\\' && '0' <= *src && *src <= '9') ++ no = *src++ - '0'; ++ else ++ no = -1; ++ ++ if (no < 0) { /* Ordinary character. */ ++ if (c == '\\' && (*src == '\\' || *src == '&')) ++ c = *src++; ++ *dst++ = c; ++ } else if (prog->startp[no] != NULL && prog->endp[no] != NULL) { ++ len = prog->endp[no] - prog->startp[no]; ++ (void) strncpy(dst, prog->startp[no], len); ++ dst += len; ++ if (len != 0 && *(dst-1) == '\0') { /* strncpy hit NUL. */ ++ regerror("damaged match string"); ++ return; ++ } ++ } ++ } ++ *dst++ = '\0'; ++} +--- a/net/netfilter/nf_conntrack_core.c ++++ b/net/netfilter/nf_conntrack_core.c +@@ -214,6 +214,14 @@ destroy_conntrack(struct nf_conntrack *n + * too. */ + nf_ct_remove_expectations(ct); + ++ #if defined(CONFIG_NETFILTER_XT_MATCH_LAYER7) || defined(CONFIG_NETFILTER_XT_MATCH_LAYER7_MODULE) ++ if(ct->layer7.app_proto) ++ kfree(ct->layer7.app_proto); ++ if(ct->layer7.app_data) ++ kfree(ct->layer7.app_data); ++ #endif ++ ++ + /* We overload first tuple to link into unconfirmed list. */ + if (!nf_ct_is_confirmed(ct)) { + BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); +--- a/net/netfilter/nf_conntrack_standalone.c ++++ b/net/netfilter/nf_conntrack_standalone.c +@@ -239,6 +239,12 @@ static int ct_seq_show(struct seq_file * + if (ct_show_delta_time(s, ct)) + goto release; + ++#if defined(CONFIG_NETFILTER_XT_MATCH_LAYER7) || defined(CONFIG_NETFILTER_XT_MATCH_LAYER7_MODULE) ++ if(ct->layer7.app_proto && ++ seq_printf(s, "l7proto=%s ", ct->layer7.app_proto)) ++ return -ENOSPC; ++#endif ++ + if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) + goto release; + +--- a/include/net/netfilter/nf_conntrack.h ++++ b/include/net/netfilter/nf_conntrack.h +@@ -134,6 +134,22 @@ struct nf_conn { + struct net *ct_net; + #endif + ++#if defined(CONFIG_NETFILTER_XT_MATCH_LAYER7) || \ ++ defined(CONFIG_NETFILTER_XT_MATCH_LAYER7_MODULE) ++ struct { ++ /* ++ * e.g. "http". NULL before decision. "unknown" after decision ++ * if no match. ++ */ ++ char *app_proto; ++ /* ++ * application layer data so far. NULL after match decision. 
++ */ ++ char *app_data; ++ unsigned int app_data_len; ++ } layer7; ++#endif ++ + /* Storage reserved for other modules, must be the last member */ + union nf_conntrack_proto proto; + }; +--- /dev/null ++++ b/include/linux/netfilter/xt_layer7.h +@@ -0,0 +1,13 @@ ++#ifndef _XT_LAYER7_H ++#define _XT_LAYER7_H ++ ++#define MAX_PATTERN_LEN 8192 ++#define MAX_PROTOCOL_LEN 256 ++ ++struct xt_layer7_info { ++ char protocol[MAX_PROTOCOL_LEN]; ++ char pattern[MAX_PATTERN_LEN]; ++ u_int8_t invert; ++}; ++ ++#endif /* _XT_LAYER7_H */ +--- a/include/linux/netfilter/Kbuild ++++ b/include/linux/netfilter/Kbuild +@@ -49,6 +49,7 @@ header-y += xt_hashlimit.h + header-y += xt_helper.h + header-y += xt_iprange.h + header-y += xt_ipvs.h ++header-y += xt_layer7.h + header-y += xt_length.h + header-y += xt_limit.h + header-y += xt_mac.h diff --git a/3.3.8/601-netfilter_layer7_pktmatch.patch b/3.3.8/601-netfilter_layer7_pktmatch.patch new file mode 100644 index 0000000..f65e301 --- /dev/null +++ b/3.3.8/601-netfilter_layer7_pktmatch.patch @@ -0,0 +1,108 @@ +--- a/include/linux/netfilter/xt_layer7.h ++++ b/include/linux/netfilter/xt_layer7.h +@@ -8,6 +8,7 @@ struct xt_layer7_info { + char protocol[MAX_PROTOCOL_LEN]; + char pattern[MAX_PATTERN_LEN]; + u_int8_t invert; ++ u_int8_t pkt; + }; + + #endif /* _XT_LAYER7_H */ +--- a/net/netfilter/xt_layer7.c ++++ b/net/netfilter/xt_layer7.c +@@ -314,33 +314,35 @@ static int match_no_append(struct nf_con + } + + /* add the new app data to the conntrack. Return number of bytes added. */ +-static int add_data(struct nf_conn * master_conntrack, +- char * app_data, int appdatalen) ++static int add_datastr(char *target, int offset, char *app_data, int len) + { + int length = 0, i; +- int oldlength = master_conntrack->layer7.app_data_len; +- +- /* This is a fix for a race condition by Deti Fliegl. However, I'm not +- clear on whether the race condition exists or whether this really +- fixes it. I might just be being dense... Anyway, if it's not really +- a fix, all it does is waste a very small amount of time. */ +- if(!master_conntrack->layer7.app_data) return 0; ++ if (!target) return 0; + + /* Strip nulls. Make everything lower case (our regex lib doesn't + do case insensitivity). Add it to the end of the current data. */ +- for(i = 0; i < maxdatalen-oldlength-1 && +- i < appdatalen; i++) { ++ for(i = 0; i < maxdatalen-offset-1 && i < len; i++) { + if(app_data[i] != '\0') { + /* the kernel version of tolower mungs 'upper ascii' */ +- master_conntrack->layer7.app_data[length+oldlength] = ++ target[length+offset] = + isascii(app_data[i])? + tolower(app_data[i]) : app_data[i]; + length++; + } + } ++ target[length+offset] = '\0'; ++ ++ return length; ++} ++ ++/* add the new app data to the conntrack. Return number of bytes added. 
*/ ++static int add_data(struct nf_conn * master_conntrack, ++ char * app_data, int appdatalen) ++{ ++ int length; + +- master_conntrack->layer7.app_data[length+oldlength] = '\0'; +- master_conntrack->layer7.app_data_len = length + oldlength; ++ length = add_datastr(master_conntrack->layer7.app_data, master_conntrack->layer7.app_data_len, app_data, appdatalen); ++ master_conntrack->layer7.app_data_len += length; + + return length; + } +@@ -438,7 +440,7 @@ match(const struct sk_buff *skbin, + + enum ip_conntrack_info master_ctinfo, ctinfo; + struct nf_conn *master_conntrack, *conntrack; +- unsigned char * app_data; ++ unsigned char *app_data, *tmp_data; + unsigned int pattern_result, appdatalen; + regexp * comppattern; + +@@ -466,8 +468,8 @@ match(const struct sk_buff *skbin, + master_conntrack = master_ct(master_conntrack); + + /* if we've classified it or seen too many packets */ +- if(total_acct_packets(master_conntrack) > num_packets || +- master_conntrack->layer7.app_proto) { ++ if(!info->pkt && (total_acct_packets(master_conntrack) > num_packets || ++ master_conntrack->layer7.app_proto)) { + + pattern_result = match_no_append(conntrack, master_conntrack, + ctinfo, master_ctinfo, info); +@@ -500,6 +502,25 @@ match(const struct sk_buff *skbin, + /* the return value gets checked later, when we're ready to use it */ + comppattern = compile_and_cache(info->pattern, info->protocol); + ++ if (info->pkt) { ++ tmp_data = kmalloc(maxdatalen, GFP_ATOMIC); ++ if(!tmp_data){ ++ if (net_ratelimit()) ++ printk(KERN_ERR "layer7: out of memory in match, bailing.\n"); ++ return info->invert; ++ } ++ ++ tmp_data[0] = '\0'; ++ add_datastr(tmp_data, 0, app_data, appdatalen); ++ pattern_result = ((comppattern && regexec(comppattern, tmp_data)) ? 1 : 0); ++ ++ kfree(tmp_data); ++ tmp_data = NULL; ++ spin_unlock_bh(&l7_lock); ++ ++ return (pattern_result ^ info->invert); ++ } ++ + /* On the first packet of a connection, allocate space for app data */ + if(total_acct_packets(master_conntrack) == 1 && !skb->cb[0] && + !master_conntrack->layer7.app_data){ diff --git a/3.3.8/602-netfilter_layer7_match.patch b/3.3.8/602-netfilter_layer7_match.patch new file mode 100644 index 0000000..b2e48c8 --- /dev/null +++ b/3.3.8/602-netfilter_layer7_match.patch @@ -0,0 +1,51 @@ +--- a/net/netfilter/xt_layer7.c ++++ b/net/netfilter/xt_layer7.c +@@ -415,7 +415,9 @@ static int layer7_write_proc(struct file + } + + static bool +-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 35) ++match(const struct sk_buff *skbin, struct xt_action_param *par) ++#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28) + match(const struct sk_buff *skbin, const struct xt_match_param *par) + #else + match(const struct sk_buff *skbin, +@@ -597,14 +599,19 @@ match(const struct sk_buff *skbin, + } + + // load nf_conntrack_ipv4 ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 35) ++static int ++#else ++static bool ++#endif + #if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28) +-static bool check(const struct xt_mtchk_param *par) ++check(const struct xt_mtchk_param *par) + { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { + printk(KERN_WARNING "can't load conntrack support for " + "proto=%d\n", par->match->family); + #else +-static bool check(const char *tablename, const void *inf, ++check(const char *tablename, const void *inf, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) + { +@@ -612,9 +619,15 @@ static bool check(const char *tablename, + printk(KERN_WARNING 
"can't load conntrack support for " + "proto=%d\n", match->family); + #endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 35) ++ return -EINVAL; ++ } ++ return 0; ++#else + return 0; + } + return 1; ++#endif + } + + diff --git a/3.3.8/603-netfilter_layer7_2.6.36_fix.patch b/3.3.8/603-netfilter_layer7_2.6.36_fix.patch new file mode 100644 index 0000000..92a7200 --- /dev/null +++ b/3.3.8/603-netfilter_layer7_2.6.36_fix.patch @@ -0,0 +1,61 @@ +--- a/net/netfilter/Kconfig ++++ b/net/netfilter/Kconfig +@@ -857,6 +857,27 @@ config NETFILTER_XT_MATCH_IPVS + + If unsure, say N. + ++config NETFILTER_XT_MATCH_LAYER7 ++ tristate '"layer7" match support' ++ depends on EXPERIMENTAL ++ depends on NETFILTER_XTABLES ++ depends on NETFILTER_ADVANCED ++ depends on NF_CONNTRACK ++ help ++ Say Y if you want to be able to classify connections (and their ++ packets) based on regular expression matching of their application ++ layer data. This is one way to classify applications such as ++ peer-to-peer filesharing systems that do not always use the same ++ port. ++ ++ To compile it as a module, choose M here. If unsure, say N. ++ ++config NETFILTER_XT_MATCH_LAYER7_DEBUG ++ bool 'Layer 7 debugging output' ++ depends on NETFILTER_XT_MATCH_LAYER7 ++ help ++ Say Y to get lots of debugging output. ++ + config NETFILTER_XT_MATCH_LENGTH + tristate '"length" match support' + depends on NETFILTER_ADVANCED +@@ -1053,26 +1074,11 @@ config NETFILTER_XT_MATCH_STATE + + To compile it as a module, choose M here. If unsure, say N. + +-config NETFILTER_XT_MATCH_LAYER7 +- tristate '"layer7" match support' +- depends on NETFILTER_XTABLES +- depends on EXPERIMENTAL && (IP_NF_CONNTRACK || NF_CONNTRACK) +- depends on NETFILTER_ADVANCED +- help +- Say Y if you want to be able to classify connections (and their +- packets) based on regular expression matching of their application +- layer data. This is one way to classify applications such as +- peer-to-peer filesharing systems that do not always use the same +- port. +- +- To compile it as a module, choose M here. If unsure, say N. +- + config NETFILTER_XT_MATCH_LAYER7_DEBUG +- bool 'Layer 7 debugging output' +- depends on NETFILTER_XT_MATCH_LAYER7 +- help +- Say Y to get lots of debugging output. +- ++ bool 'Layer 7 debugging output' ++ depends on NETFILTER_XT_MATCH_LAYER7 ++ help ++ Say Y to get lots of debugging output. 
+ + config NETFILTER_XT_MATCH_STATISTIC + tristate '"statistic" match support' diff --git a/3.3.8/604-netfilter_cisco_794x_iphone.patch b/3.3.8/604-netfilter_cisco_794x_iphone.patch new file mode 100644 index 0000000..662a499 --- /dev/null +++ b/3.3.8/604-netfilter_cisco_794x_iphone.patch @@ -0,0 +1,118 @@ +--- a/include/linux/netfilter/nf_conntrack_sip.h ++++ b/include/linux/netfilter/nf_conntrack_sip.h +@@ -2,12 +2,15 @@ + #define __NF_CONNTRACK_SIP_H__ + #ifdef __KERNEL__ + ++#include ++ + #define SIP_PORT 5060 + #define SIP_TIMEOUT 3600 + + struct nf_ct_sip_master { + unsigned int register_cseq; + unsigned int invite_cseq; ++ __be16 forced_dport; + }; + + enum sip_expectation_classes { +--- a/net/ipv4/netfilter/nf_nat_sip.c ++++ b/net/ipv4/netfilter/nf_nat_sip.c +@@ -73,6 +73,7 @@ static int map_addr(struct sk_buff *skb, + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); ++ struct nf_conn_help *help = nfct_help(ct); + char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; + unsigned int buflen; + __be32 newaddr; +@@ -85,7 +86,8 @@ static int map_addr(struct sk_buff *skb, + } else if (ct->tuplehash[dir].tuple.dst.u3.ip == addr->ip && + ct->tuplehash[dir].tuple.dst.u.udp.port == port) { + newaddr = ct->tuplehash[!dir].tuple.src.u3.ip; +- newport = ct->tuplehash[!dir].tuple.src.u.udp.port; ++ newport = help->help.ct_sip_info.forced_dport ? : ++ ct->tuplehash[!dir].tuple.src.u.udp.port; + } else + return 1; + +@@ -121,6 +123,7 @@ static unsigned int ip_nat_sip(struct sk + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); ++ struct nf_conn_help *help = nfct_help(ct); + unsigned int coff, matchoff, matchlen; + enum sip_header_types hdr; + union nf_inet_addr addr; +@@ -229,6 +232,20 @@ next: + !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO)) + return NF_DROP; + ++ /* Mangle destination port for Cisco phones, then fix up checksums */ ++ if (dir == IP_CT_DIR_REPLY && help->help.ct_sip_info.forced_dport) { ++ struct udphdr *uh; ++ ++ if (!skb_make_writable(skb, skb->len)) ++ return NF_DROP; ++ ++ uh = (struct udphdr *)(skb->data + ip_hdrlen(skb)); ++ uh->dest = help->help.ct_sip_info.forced_dport; ++ ++ if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, 0, 0, NULL, 0)) ++ return NF_DROP; ++ } ++ + return NF_ACCEPT; + } + +@@ -280,8 +297,10 @@ static unsigned int ip_nat_sip_expect(st + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); ++ struct nf_conn_help *help = nfct_help(ct); + __be32 newip; + u_int16_t port; ++ __be16 srcport; + char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; + unsigned buflen; + +@@ -294,8 +313,9 @@ static unsigned int ip_nat_sip_expect(st + /* If the signalling port matches the connection's source port in the + * original direction, try to use the destination port in the opposite + * direction. */ +- if (exp->tuple.dst.u.udp.port == +- ct->tuplehash[dir].tuple.src.u.udp.port) ++ srcport = help->help.ct_sip_info.forced_dport ? 
: ++ ct->tuplehash[dir].tuple.src.u.udp.port; ++ if (exp->tuple.dst.u.udp.port == srcport) + port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port); + else + port = ntohs(exp->tuple.dst.u.udp.port); +--- a/net/netfilter/nf_conntrack_sip.c ++++ b/net/netfilter/nf_conntrack_sip.c +@@ -1363,8 +1363,25 @@ static int process_sip_request(struct sk + { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); ++ struct nf_conn_help *help = nfct_help(ct); ++ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned int matchoff, matchlen; + unsigned int cseq, i; ++ union nf_inet_addr addr; ++ __be16 port; ++ ++ /* Many Cisco IP phones use a high source port for SIP requests, but ++ * listen for the response on port 5060. If we are the local ++ * router for one of these phones, save the port number from the ++ * Via: header so that nf_nat_sip can redirect the responses to ++ * the correct port. ++ */ ++ if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, ++ SIP_HDR_VIA_UDP, NULL, &matchoff, ++ &matchlen, &addr, &port) > 0 && ++ port != ct->tuplehash[dir].tuple.src.u.udp.port && ++ nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3)) ++ help->help.ct_sip_info.forced_dport = port; + + for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { + const struct sip_handler *handler; diff --git a/3.3.8/610-netfilter_match_bypass_default_checks.patch b/3.3.8/610-netfilter_match_bypass_default_checks.patch new file mode 100644 index 0000000..51c9e09 --- /dev/null +++ b/3.3.8/610-netfilter_match_bypass_default_checks.patch @@ -0,0 +1,93 @@ +--- a/include/linux/netfilter_ipv4/ip_tables.h ++++ b/include/linux/netfilter_ipv4/ip_tables.h +@@ -93,6 +93,7 @@ struct ipt_ip { + #define IPT_F_FRAG 0x01 /* Set if rule is a fragment rule */ + #define IPT_F_GOTO 0x02 /* Set if jump is a goto */ + #define IPT_F_MASK 0x03 /* All possible flag bits mask. */ ++#define IPT_F_NO_DEF_MATCH 0x80 /* Internal: no default match rules present */ + + /* Values for "inv" field in struct ipt_ip. */ + #define IPT_INV_VIA_IN 0x01 /* Invert the sense of IN IFACE. 
*/ +--- a/net/ipv4/netfilter/ip_tables.c ++++ b/net/ipv4/netfilter/ip_tables.c +@@ -81,6 +81,9 @@ ip_packet_match(const struct iphdr *ip, + + #define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg))) + ++ if (ipinfo->flags & IPT_F_NO_DEF_MATCH) ++ return true; ++ + if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, + IPT_INV_SRCIP) || + FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, +@@ -134,6 +137,29 @@ ip_packet_match(const struct iphdr *ip, + return true; + } + ++static void ++ip_checkdefault(struct ipt_ip *ip) ++{ ++ static const char iface_mask[IFNAMSIZ] = {}; ++ ++ if (ip->invflags || ip->flags & IPT_F_FRAG) ++ return; ++ ++ if (memcmp(ip->iniface_mask, iface_mask, IFNAMSIZ) != 0) ++ return; ++ ++ if (memcmp(ip->outiface_mask, iface_mask, IFNAMSIZ) != 0) ++ return; ++ ++ if (ip->smsk.s_addr || ip->dmsk.s_addr) ++ return; ++ ++ if (ip->proto) ++ return; ++ ++ ip->flags |= IPT_F_NO_DEF_MATCH; ++} ++ + static bool + ip_checkentry(const struct ipt_ip *ip) + { +@@ -561,7 +587,7 @@ static void cleanup_match(struct xt_entr + } + + static int +-check_entry(const struct ipt_entry *e, const char *name) ++check_entry(struct ipt_entry *e, const char *name) + { + const struct xt_entry_target *t; + +@@ -570,6 +596,8 @@ check_entry(const struct ipt_entry *e, c + return -EINVAL; + } + ++ ip_checkdefault(&e->ip); ++ + if (e->target_offset + sizeof(struct xt_entry_target) > + e->next_offset) + return -EINVAL; +@@ -931,6 +959,7 @@ copy_entries_to_user(unsigned int total_ + const struct xt_table_info *private = table->private; + int ret = 0; + const void *loc_cpu_entry; ++ u8 flags; + + counters = alloc_counters(table); + if (IS_ERR(counters)) +@@ -961,6 +990,14 @@ copy_entries_to_user(unsigned int total_ + ret = -EFAULT; + goto free_counters; + } ++ ++ flags = e->ip.flags & IPT_F_MASK; ++ if (copy_to_user(userptr + off ++ + offsetof(struct ipt_entry, ip.flags), ++ &flags, sizeof(flags)) != 0) { ++ ret = -EFAULT; ++ goto free_counters; ++ } + + for (i = sizeof(struct ipt_entry); + i < e->target_offset; diff --git a/3.3.8/611-netfilter_match_bypass_default_table.patch b/3.3.8/611-netfilter_match_bypass_default_table.patch new file mode 100644 index 0000000..3cf0e5a --- /dev/null +++ b/3.3.8/611-netfilter_match_bypass_default_table.patch @@ -0,0 +1,81 @@ +--- a/net/ipv4/netfilter/ip_tables.c ++++ b/net/ipv4/netfilter/ip_tables.c +@@ -310,6 +310,33 @@ struct ipt_entry *ipt_next_entry(const s + return (void *)entry + entry->next_offset; + } + ++static bool ++ipt_handle_default_rule(struct ipt_entry *e, unsigned int *verdict) ++{ ++ struct xt_entry_target *t; ++ struct xt_standard_target *st; ++ ++ if (e->target_offset != sizeof(struct ipt_entry)) ++ return false; ++ ++ if (!(e->ip.flags & IPT_F_NO_DEF_MATCH)) ++ return false; ++ ++ t = ipt_get_target(e); ++ if (t->u.kernel.target->target) ++ return false; ++ ++ st = (struct xt_standard_target *) t; ++ if (st->verdict == XT_RETURN) ++ return false; ++ ++ if (st->verdict >= 0) ++ return false; ++ ++ *verdict = (unsigned)(-st->verdict) - 1; ++ return true; ++} ++ + /* Returns one of the generic firewall policies, like NF_ACCEPT. */ + unsigned int + ipt_do_table(struct sk_buff *skb, +@@ -334,6 +361,25 @@ ipt_do_table(struct sk_buff *skb, + ip = ip_hdr(skb); + indev = in ? in->name : nulldevname; + outdev = out ? 
out->name : nulldevname; ++ ++ IP_NF_ASSERT(table->valid_hooks & (1 << hook)); ++ local_bh_disable(); ++ addend = xt_write_recseq_begin(); ++ private = table->private; ++ cpu = smp_processor_id(); ++ table_base = private->entries[cpu]; ++ jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; ++ stackptr = per_cpu_ptr(private->stackptr, cpu); ++ origptr = *stackptr; ++ ++ e = get_entry(table_base, private->hook_entry[hook]); ++ if (ipt_handle_default_rule(e, &verdict)) { ++ ADD_COUNTER(e->counters, skb->len, 1); ++ xt_write_recseq_end(addend); ++ local_bh_enable(); ++ return verdict; ++ } ++ + /* We handle fragments by dealing with the first fragment as + * if it was a normal packet. All other fragments are treated + * normally, except that they will NEVER match rules that ask +@@ -348,18 +394,6 @@ ipt_do_table(struct sk_buff *skb, + acpar.family = NFPROTO_IPV4; + acpar.hooknum = hook; + +- IP_NF_ASSERT(table->valid_hooks & (1 << hook)); +- local_bh_disable(); +- addend = xt_write_recseq_begin(); +- private = table->private; +- cpu = smp_processor_id(); +- table_base = private->entries[cpu]; +- jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; +- stackptr = per_cpu_ptr(private->stackptr, cpu); +- origptr = *stackptr; +- +- e = get_entry(table_base, private->hook_entry[hook]); +- + pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n", + table->name, hook, origptr, + get_entry(table_base, private->underflow[hook])); diff --git a/3.3.8/612-netfilter_match_reduce_memory_access.patch b/3.3.8/612-netfilter_match_reduce_memory_access.patch new file mode 100644 index 0000000..f506165 --- /dev/null +++ b/3.3.8/612-netfilter_match_reduce_memory_access.patch @@ -0,0 +1,16 @@ +--- a/net/ipv4/netfilter/ip_tables.c ++++ b/net/ipv4/netfilter/ip_tables.c +@@ -84,9 +84,11 @@ ip_packet_match(const struct iphdr *ip, + if (ipinfo->flags & IPT_F_NO_DEF_MATCH) + return true; + +- if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, ++ if (FWINV(ipinfo->smsk.s_addr && ++ (ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, + IPT_INV_SRCIP) || +- FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, ++ FWINV(ipinfo->dmsk.s_addr && ++ (ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, + IPT_INV_DSTIP)) { + dprintf("Source or dest mismatch.\n"); + diff --git a/3.3.8/613-netfilter_optional_tcp_window_check.patch b/3.3.8/613-netfilter_optional_tcp_window_check.patch new file mode 100644 index 0000000..1c259d4 --- /dev/null +++ b/3.3.8/613-netfilter_optional_tcp_window_check.patch @@ -0,0 +1,36 @@ +--- a/net/netfilter/nf_conntrack_proto_tcp.c ++++ b/net/netfilter/nf_conntrack_proto_tcp.c +@@ -29,6 +29,9 @@ + #include + #include + ++/* Do not check the TCP window for incoming packets */ ++static int nf_ct_tcp_no_window_check __read_mostly = 1; ++ + /* "Be conservative in what you do, + be liberal in what you accept from others." + If it's non-zero, we mark only out of window RST segments as INVALID. */ +@@ -524,6 +527,9 @@ static bool tcp_in_window(const struct n + s16 receiver_offset; + bool res; + ++ if (nf_ct_tcp_no_window_check) ++ return true; ++ + /* + * Get the required data from the packet. 
+ */ +@@ -1321,6 +1327,13 @@ static struct ctl_table tcp_sysctl_table + .proc_handler = proc_dointvec, + }, + { ++ .procname = "nf_conntrack_tcp_no_window_check", ++ .data = &nf_ct_tcp_no_window_check, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ { + .procname = "nf_conntrack_tcp_be_liberal", + .data = &nf_ct_tcp_be_liberal, + .maxlen = sizeof(unsigned int), diff --git a/3.3.8/620-sched_esfq.patch b/3.3.8/620-sched_esfq.patch new file mode 100644 index 0000000..1fdf09d --- /dev/null +++ b/3.3.8/620-sched_esfq.patch @@ -0,0 +1,791 @@ +--- a/include/linux/pkt_sched.h ++++ b/include/linux/pkt_sched.h +@@ -193,6 +193,33 @@ struct tc_sfq_xstats { + __s32 allot; + }; + ++/* ESFQ section */ ++ ++enum ++{ ++ /* traditional */ ++ TCA_SFQ_HASH_CLASSIC, ++ TCA_SFQ_HASH_DST, ++ TCA_SFQ_HASH_SRC, ++ TCA_SFQ_HASH_FWMARK, ++ /* conntrack */ ++ TCA_SFQ_HASH_CTORIGDST, ++ TCA_SFQ_HASH_CTORIGSRC, ++ TCA_SFQ_HASH_CTREPLDST, ++ TCA_SFQ_HASH_CTREPLSRC, ++ TCA_SFQ_HASH_CTNATCHG, ++}; ++ ++struct tc_esfq_qopt ++{ ++ unsigned quantum; /* Bytes per round allocated to flow */ ++ int perturb_period; /* Period of hash perturbation */ ++ __u32 limit; /* Maximal packets in queue */ ++ unsigned divisor; /* Hash divisor */ ++ unsigned flows; /* Maximal number of flows */ ++ unsigned hash_kind; /* Hash function to use for flow identification */ ++}; ++ + /* RED section */ + + enum { +--- a/net/sched/Kconfig ++++ b/net/sched/Kconfig +@@ -148,6 +148,37 @@ config NET_SCH_SFQ + To compile this code as a module, choose M here: the + module will be called sch_sfq. + ++config NET_SCH_ESFQ ++ tristate "Enhanced Stochastic Fairness Queueing (ESFQ)" ++ ---help--- ++ Say Y here if you want to use the Enhanced Stochastic Fairness ++ Queueing (ESFQ) packet scheduling algorithm for some of your network ++ devices or as a leaf discipline for a classful qdisc such as HTB or ++ CBQ (see the top of for details and ++ references to the SFQ algorithm). ++ ++ This is an enchanced SFQ version which allows you to control some ++ hardcoded values in the SFQ scheduler. ++ ++ ESFQ also adds control of the hash function used to identify packet ++ flows. The original SFQ discipline hashes by connection; ESFQ add ++ several other hashing methods, such as by src IP or by dst IP, which ++ can be more fair to users in some networking situations. ++ ++ To compile this code as a module, choose M here: the ++ module will be called sch_esfq. ++ ++config NET_SCH_ESFQ_NFCT ++ bool "Connection Tracking Hash Types" ++ depends on NET_SCH_ESFQ && NF_CONNTRACK ++ ---help--- ++ Say Y here to enable support for hashing based on netfilter connection ++ tracking information. This is useful for a router that is also using ++ NAT to connect privately-addressed hosts to the Internet. If you want ++ to provide fair distribution of upstream bandwidth, ESFQ must use ++ connection tracking information, since all outgoing packets will share ++ the same source address. 
++ + config NET_SCH_TEQL + tristate "True Link Equalizer (TEQL)" + ---help--- +--- a/net/sched/Makefile ++++ b/net/sched/Makefile +@@ -26,6 +26,7 @@ obj-$(CONFIG_NET_SCH_INGRESS) += sch_ing + obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o + obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o + obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o ++obj-$(CONFIG_NET_SCH_ESFQ) += sch_esfq.o + obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o + obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o + obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o +--- /dev/null ++++ b/net/sched/sch_esfq.c +@@ -0,0 +1,702 @@ ++/* ++ * net/sched/sch_esfq.c Extended Stochastic Fairness Queueing discipline. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ * ++ * Authors: Alexey Kuznetsov, ++ * ++ * Changes: Alexander Atanasov, ++ * Added dynamic depth,limit,divisor,hash_kind options. ++ * Added dst and src hashes. ++ * ++ * Alexander Clouter, ++ * Ported ESFQ to Linux 2.6. ++ * ++ * Corey Hickey, ++ * Maintenance of the Linux 2.6 port. ++ * Added fwmark hash (thanks to Robert Kurjata). ++ * Added usage of jhash. ++ * Added conntrack support. ++ * Added ctnatchg hash (thanks to Ben Pfountz). ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_NET_SCH_ESFQ_NFCT ++#include ++#endif ++ ++/* Stochastic Fairness Queuing algorithm. ++ For more comments look at sch_sfq.c. ++ The difference is that you can change limit, depth, ++ hash table size and choose alternate hash types. ++ ++ classic: same as in sch_sfq.c ++ dst: destination IP address ++ src: source IP address ++ fwmark: netfilter mark value ++ ctorigdst: original destination IP address ++ ctorigsrc: original source IP address ++ ctrepldst: reply destination IP address ++ ctreplsrc: reply source IP ++ ++*/ ++ ++#define ESFQ_HEAD 0 ++#define ESFQ_TAIL 1 ++ ++/* This type should contain at least SFQ_DEPTH*2 values */ ++typedef unsigned int esfq_index; ++ ++struct esfq_head ++{ ++ esfq_index next; ++ esfq_index prev; ++}; ++ ++struct esfq_sched_data ++{ ++/* Parameters */ ++ int perturb_period; ++ unsigned quantum; /* Allotment per round: MUST BE >= MTU */ ++ int limit; ++ unsigned depth; ++ unsigned hash_divisor; ++ unsigned hash_kind; ++/* Variables */ ++ struct timer_list perturb_timer; ++ int perturbation; ++ esfq_index tail; /* Index of current slot in round */ ++ esfq_index max_depth; /* Maximal depth */ ++ ++ esfq_index *ht; /* Hash table */ ++ esfq_index *next; /* Active slots link */ ++ short *allot; /* Current allotment per slot */ ++ unsigned short *hash; /* Hash value indexed by slots */ ++ struct sk_buff_head *qs; /* Slot queue */ ++ struct esfq_head *dep; /* Linked list of slots, indexed by depth */ ++}; ++ ++/* This contains the info we will hash. 
*/ ++struct esfq_packet_info ++{ ++ u32 proto; /* protocol or port */ ++ u32 src; /* source from packet header */ ++ u32 dst; /* destination from packet header */ ++ u32 ctorigsrc; /* original source from conntrack */ ++ u32 ctorigdst; /* original destination from conntrack */ ++ u32 ctreplsrc; /* reply source from conntrack */ ++ u32 ctrepldst; /* reply destination from conntrack */ ++ u32 mark; /* netfilter mark (fwmark) */ ++}; ++ ++static __inline__ unsigned esfq_jhash_1word(struct esfq_sched_data *q,u32 a) ++{ ++ return jhash_1word(a, q->perturbation) & (q->hash_divisor-1); ++} ++ ++static __inline__ unsigned esfq_jhash_2words(struct esfq_sched_data *q, u32 a, u32 b) ++{ ++ return jhash_2words(a, b, q->perturbation) & (q->hash_divisor-1); ++} ++ ++static __inline__ unsigned esfq_jhash_3words(struct esfq_sched_data *q, u32 a, u32 b, u32 c) ++{ ++ return jhash_3words(a, b, c, q->perturbation) & (q->hash_divisor-1); ++} ++ ++static unsigned esfq_hash(struct esfq_sched_data *q, struct sk_buff *skb) ++{ ++ struct esfq_packet_info info; ++#ifdef CONFIG_NET_SCH_ESFQ_NFCT ++ enum ip_conntrack_info ctinfo; ++ struct nf_conn *ct = nf_ct_get(skb, &ctinfo); ++#endif ++ ++ switch (skb->protocol) { ++ case __constant_htons(ETH_P_IP): ++ { ++ struct iphdr *iph = ip_hdr(skb); ++ info.dst = iph->daddr; ++ info.src = iph->saddr; ++ if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && ++ (iph->protocol == IPPROTO_TCP || ++ iph->protocol == IPPROTO_UDP || ++ iph->protocol == IPPROTO_SCTP || ++ iph->protocol == IPPROTO_DCCP || ++ iph->protocol == IPPROTO_ESP)) ++ info.proto = *(((u32*)iph) + iph->ihl); ++ else ++ info.proto = iph->protocol; ++ break; ++ } ++ case __constant_htons(ETH_P_IPV6): ++ { ++ struct ipv6hdr *iph = ipv6_hdr(skb); ++ /* Hash ipv6 addresses into a u32. This isn't ideal, ++ * but the code is simple. */ ++ info.dst = jhash2(iph->daddr.s6_addr32, 4, q->perturbation); ++ info.src = jhash2(iph->saddr.s6_addr32, 4, q->perturbation); ++ if (iph->nexthdr == IPPROTO_TCP || ++ iph->nexthdr == IPPROTO_UDP || ++ iph->nexthdr == IPPROTO_SCTP || ++ iph->nexthdr == IPPROTO_DCCP || ++ iph->nexthdr == IPPROTO_ESP) ++ info.proto = *(u32*)&iph[1]; ++ else ++ info.proto = iph->nexthdr; ++ break; ++ } ++ default: ++ info.dst = (u32)(unsigned long)skb_dst(skb); ++ info.src = (u32)(unsigned long)skb->sk; ++ info.proto = skb->protocol; ++ } ++ ++ info.mark = skb->mark; ++ ++#ifdef CONFIG_NET_SCH_ESFQ_NFCT ++ /* defaults if there is no conntrack info */ ++ info.ctorigsrc = info.src; ++ info.ctorigdst = info.dst; ++ info.ctreplsrc = info.dst; ++ info.ctrepldst = info.src; ++ /* collect conntrack info */ ++ if (ct && ct != &nf_conntrack_untracked) { ++ if (skb->protocol == __constant_htons(ETH_P_IP)) { ++ info.ctorigsrc = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; ++ info.ctorigdst = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip; ++ info.ctreplsrc = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip; ++ info.ctrepldst = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip; ++ } ++ else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { ++ /* Again, hash ipv6 addresses into a single u32. 
*/ ++ info.ctorigsrc = jhash2(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip6, 4, q->perturbation); ++ info.ctorigdst = jhash2(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip6, 4, q->perturbation); ++ info.ctreplsrc = jhash2(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip6, 4, q->perturbation); ++ info.ctrepldst = jhash2(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip6, 4, q->perturbation); ++ } ++ ++ } ++#endif ++ ++ switch(q->hash_kind) { ++ case TCA_SFQ_HASH_CLASSIC: ++ return esfq_jhash_3words(q, info.dst, info.src, info.proto); ++ case TCA_SFQ_HASH_DST: ++ return esfq_jhash_1word(q, info.dst); ++ case TCA_SFQ_HASH_SRC: ++ return esfq_jhash_1word(q, info.src); ++ case TCA_SFQ_HASH_FWMARK: ++ return esfq_jhash_1word(q, info.mark); ++#ifdef CONFIG_NET_SCH_ESFQ_NFCT ++ case TCA_SFQ_HASH_CTORIGDST: ++ return esfq_jhash_1word(q, info.ctorigdst); ++ case TCA_SFQ_HASH_CTORIGSRC: ++ return esfq_jhash_1word(q, info.ctorigsrc); ++ case TCA_SFQ_HASH_CTREPLDST: ++ return esfq_jhash_1word(q, info.ctrepldst); ++ case TCA_SFQ_HASH_CTREPLSRC: ++ return esfq_jhash_1word(q, info.ctreplsrc); ++ case TCA_SFQ_HASH_CTNATCHG: ++ { ++ if (info.ctorigdst == info.ctreplsrc) ++ return esfq_jhash_1word(q, info.ctorigsrc); ++ return esfq_jhash_1word(q, info.ctreplsrc); ++ } ++#endif ++ default: ++ if (net_ratelimit()) ++ printk(KERN_WARNING "ESFQ: Unknown hash method. Falling back to classic.\n"); ++ } ++ return esfq_jhash_3words(q, info.dst, info.src, info.proto); ++} ++ ++static inline void esfq_link(struct esfq_sched_data *q, esfq_index x) ++{ ++ esfq_index p, n; ++ int d = q->qs[x].qlen + q->depth; ++ ++ p = d; ++ n = q->dep[d].next; ++ q->dep[x].next = n; ++ q->dep[x].prev = p; ++ q->dep[p].next = q->dep[n].prev = x; ++} ++ ++static inline void esfq_dec(struct esfq_sched_data *q, esfq_index x) ++{ ++ esfq_index p, n; ++ ++ n = q->dep[x].next; ++ p = q->dep[x].prev; ++ q->dep[p].next = n; ++ q->dep[n].prev = p; ++ ++ if (n == p && q->max_depth == q->qs[x].qlen + 1) ++ q->max_depth--; ++ ++ esfq_link(q, x); ++} ++ ++static inline void esfq_inc(struct esfq_sched_data *q, esfq_index x) ++{ ++ esfq_index p, n; ++ int d; ++ ++ n = q->dep[x].next; ++ p = q->dep[x].prev; ++ q->dep[p].next = n; ++ q->dep[n].prev = p; ++ d = q->qs[x].qlen; ++ if (q->max_depth < d) ++ q->max_depth = d; ++ ++ esfq_link(q, x); ++} ++ ++static unsigned int esfq_drop(struct Qdisc *sch) ++{ ++ struct esfq_sched_data *q = qdisc_priv(sch); ++ esfq_index d = q->max_depth; ++ struct sk_buff *skb; ++ unsigned int len; ++ ++ /* Queue is full! Find the longest slot and ++ drop a packet from it */ ++ ++ if (d > 1) { ++ esfq_index x = q->dep[d+q->depth].next; ++ skb = q->qs[x].prev; ++ len = skb->len; ++ __skb_unlink(skb, &q->qs[x]); ++ kfree_skb(skb); ++ esfq_dec(q, x); ++ sch->q.qlen--; ++ sch->qstats.drops++; ++ sch->qstats.backlog -= len; ++ return len; ++ } ++ ++ if (d == 1) { ++ /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. 
*/ ++ d = q->next[q->tail]; ++ q->next[q->tail] = q->next[d]; ++ q->allot[q->next[d]] += q->quantum; ++ skb = q->qs[d].prev; ++ len = skb->len; ++ __skb_unlink(skb, &q->qs[d]); ++ kfree_skb(skb); ++ esfq_dec(q, d); ++ sch->q.qlen--; ++ q->ht[q->hash[d]] = q->depth; ++ sch->qstats.drops++; ++ sch->qstats.backlog -= len; ++ return len; ++ } ++ ++ return 0; ++} ++ ++static void esfq_q_enqueue(struct sk_buff *skb, struct esfq_sched_data *q, unsigned int end) ++{ ++ unsigned hash = esfq_hash(q, skb); ++ unsigned depth = q->depth; ++ esfq_index x; ++ ++ x = q->ht[hash]; ++ if (x == depth) { ++ q->ht[hash] = x = q->dep[depth].next; ++ q->hash[x] = hash; ++ } ++ ++ if (end == ESFQ_TAIL) ++ __skb_queue_tail(&q->qs[x], skb); ++ else ++ __skb_queue_head(&q->qs[x], skb); ++ ++ esfq_inc(q, x); ++ if (q->qs[x].qlen == 1) { /* The flow is new */ ++ if (q->tail == depth) { /* It is the first flow */ ++ q->tail = x; ++ q->next[x] = x; ++ q->allot[x] = q->quantum; ++ } else { ++ q->next[x] = q->next[q->tail]; ++ q->next[q->tail] = x; ++ q->tail = x; ++ } ++ } ++} ++ ++static int esfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) ++{ ++ struct esfq_sched_data *q = qdisc_priv(sch); ++ esfq_q_enqueue(skb, q, ESFQ_TAIL); ++ sch->qstats.backlog += skb->len; ++ if (++sch->q.qlen < q->limit-1) { ++ sch->bstats.bytes += skb->len; ++ sch->bstats.packets++; ++ return 0; ++ } ++ ++ sch->qstats.drops++; ++ esfq_drop(sch); ++ return NET_XMIT_CN; ++} ++ ++static struct sk_buff *esfq_peek(struct Qdisc* sch) ++{ ++ struct esfq_sched_data *q = qdisc_priv(sch); ++ esfq_index a; ++ ++ /* No active slots */ ++ if (q->tail == q->depth) ++ return NULL; ++ ++ a = q->next[q->tail]; ++ return skb_peek(&q->qs[a]); ++} ++ ++static struct sk_buff *esfq_q_dequeue(struct esfq_sched_data *q) ++{ ++ struct sk_buff *skb; ++ unsigned depth = q->depth; ++ esfq_index a, old_a; ++ ++ /* No active slots */ ++ if (q->tail == depth) ++ return NULL; ++ ++ a = old_a = q->next[q->tail]; ++ ++ /* Grab packet */ ++ skb = __skb_dequeue(&q->qs[a]); ++ esfq_dec(q, a); ++ ++ /* Is the slot empty? 
*/ ++ if (q->qs[a].qlen == 0) { ++ q->ht[q->hash[a]] = depth; ++ a = q->next[a]; ++ if (a == old_a) { ++ q->tail = depth; ++ return skb; ++ } ++ q->next[q->tail] = a; ++ q->allot[a] += q->quantum; ++ } else if ((q->allot[a] -= skb->len) <= 0) { ++ q->tail = a; ++ a = q->next[a]; ++ q->allot[a] += q->quantum; ++ } ++ ++ return skb; ++} ++ ++static struct sk_buff *esfq_dequeue(struct Qdisc* sch) ++{ ++ struct esfq_sched_data *q = qdisc_priv(sch); ++ struct sk_buff *skb; ++ ++ skb = esfq_q_dequeue(q); ++ if (skb == NULL) ++ return NULL; ++ sch->q.qlen--; ++ sch->qstats.backlog -= skb->len; ++ return skb; ++} ++ ++static void esfq_q_destroy(struct esfq_sched_data *q) ++{ ++ del_timer(&q->perturb_timer); ++ if(q->ht) ++ kfree(q->ht); ++ if(q->dep) ++ kfree(q->dep); ++ if(q->next) ++ kfree(q->next); ++ if(q->allot) ++ kfree(q->allot); ++ if(q->hash) ++ kfree(q->hash); ++ if(q->qs) ++ kfree(q->qs); ++} ++ ++static void esfq_destroy(struct Qdisc *sch) ++{ ++ struct esfq_sched_data *q = qdisc_priv(sch); ++ esfq_q_destroy(q); ++} ++ ++ ++static void esfq_reset(struct Qdisc* sch) ++{ ++ struct sk_buff *skb; ++ ++ while ((skb = esfq_dequeue(sch)) != NULL) ++ kfree_skb(skb); ++} ++ ++static void esfq_perturbation(unsigned long arg) ++{ ++ struct Qdisc *sch = (struct Qdisc*)arg; ++ struct esfq_sched_data *q = qdisc_priv(sch); ++ ++ q->perturbation = net_random()&0x1F; ++ ++ if (q->perturb_period) { ++ q->perturb_timer.expires = jiffies + q->perturb_period; ++ add_timer(&q->perturb_timer); ++ } ++} ++ ++static unsigned int esfq_check_hash(unsigned int kind) ++{ ++ switch (kind) { ++ case TCA_SFQ_HASH_CTORIGDST: ++ case TCA_SFQ_HASH_CTORIGSRC: ++ case TCA_SFQ_HASH_CTREPLDST: ++ case TCA_SFQ_HASH_CTREPLSRC: ++ case TCA_SFQ_HASH_CTNATCHG: ++#ifndef CONFIG_NET_SCH_ESFQ_NFCT ++ { ++ if (net_ratelimit()) ++ printk(KERN_WARNING "ESFQ: Conntrack hash types disabled in kernel config. Falling back to classic.\n"); ++ return TCA_SFQ_HASH_CLASSIC; ++ } ++#endif ++ case TCA_SFQ_HASH_CLASSIC: ++ case TCA_SFQ_HASH_DST: ++ case TCA_SFQ_HASH_SRC: ++ case TCA_SFQ_HASH_FWMARK: ++ return kind; ++ default: ++ { ++ if (net_ratelimit()) ++ printk(KERN_WARNING "ESFQ: Unknown hash type. Falling back to classic.\n"); ++ return TCA_SFQ_HASH_CLASSIC; ++ } ++ } ++} ++ ++static int esfq_q_init(struct esfq_sched_data *q, struct nlattr *opt) ++{ ++ struct tc_esfq_qopt *ctl = nla_data(opt); ++ esfq_index p = ~0U/2; ++ int i; ++ ++ if (opt && opt->nla_len < nla_attr_size(sizeof(*ctl))) ++ return -EINVAL; ++ ++ q->perturbation = 0; ++ q->hash_kind = TCA_SFQ_HASH_CLASSIC; ++ q->max_depth = 0; ++ if (opt == NULL) { ++ q->perturb_period = 0; ++ q->hash_divisor = 1024; ++ q->tail = q->limit = q->depth = 128; ++ ++ } else { ++ struct tc_esfq_qopt *ctl = nla_data(opt); ++ if (ctl->quantum) ++ q->quantum = ctl->quantum; ++ q->perturb_period = ctl->perturb_period*HZ; ++ q->hash_divisor = ctl->divisor ? : 1024; ++ q->tail = q->limit = q->depth = ctl->flows ? 
: 128; ++ ++ if ( q->depth > p - 1 ) ++ return -EINVAL; ++ ++ if (ctl->limit) ++ q->limit = min_t(u32, ctl->limit, q->depth); ++ ++ if (ctl->hash_kind) { ++ q->hash_kind = esfq_check_hash(ctl->hash_kind); ++ } ++ } ++ ++ q->ht = kmalloc(q->hash_divisor*sizeof(esfq_index), GFP_KERNEL); ++ if (!q->ht) ++ goto err_case; ++ q->dep = kmalloc((1+q->depth*2)*sizeof(struct esfq_head), GFP_KERNEL); ++ if (!q->dep) ++ goto err_case; ++ q->next = kmalloc(q->depth*sizeof(esfq_index), GFP_KERNEL); ++ if (!q->next) ++ goto err_case; ++ q->allot = kmalloc(q->depth*sizeof(short), GFP_KERNEL); ++ if (!q->allot) ++ goto err_case; ++ q->hash = kmalloc(q->depth*sizeof(unsigned short), GFP_KERNEL); ++ if (!q->hash) ++ goto err_case; ++ q->qs = kmalloc(q->depth*sizeof(struct sk_buff_head), GFP_KERNEL); ++ if (!q->qs) ++ goto err_case; ++ ++ for (i=0; i< q->hash_divisor; i++) ++ q->ht[i] = q->depth; ++ for (i=0; idepth; i++) { ++ skb_queue_head_init(&q->qs[i]); ++ q->dep[i+q->depth].next = i+q->depth; ++ q->dep[i+q->depth].prev = i+q->depth; ++ } ++ ++ for (i=0; idepth; i++) ++ esfq_link(q, i); ++ return 0; ++err_case: ++ esfq_q_destroy(q); ++ return -ENOBUFS; ++} ++ ++static int esfq_init(struct Qdisc *sch, struct nlattr *opt) ++{ ++ struct esfq_sched_data *q = qdisc_priv(sch); ++ int err; ++ ++ q->quantum = psched_mtu(qdisc_dev(sch)); /* default */ ++ if ((err = esfq_q_init(q, opt))) ++ return err; ++ ++ init_timer(&q->perturb_timer); ++ q->perturb_timer.data = (unsigned long)sch; ++ q->perturb_timer.function = esfq_perturbation; ++ if (q->perturb_period) { ++ q->perturb_timer.expires = jiffies + q->perturb_period; ++ add_timer(&q->perturb_timer); ++ } ++ ++ return 0; ++} ++ ++static int esfq_change(struct Qdisc *sch, struct nlattr *opt) ++{ ++ struct esfq_sched_data *q = qdisc_priv(sch); ++ struct esfq_sched_data new; ++ struct sk_buff *skb; ++ int err; ++ ++ /* set up new queue */ ++ memset(&new, 0, sizeof(struct esfq_sched_data)); ++ new.quantum = psched_mtu(qdisc_dev(sch)); /* default */ ++ if ((err = esfq_q_init(&new, opt))) ++ return err; ++ ++ /* copy all packets from the old queue to the new queue */ ++ sch_tree_lock(sch); ++ while ((skb = esfq_q_dequeue(q)) != NULL) ++ esfq_q_enqueue(skb, &new, ESFQ_TAIL); ++ ++ /* clean up the old queue */ ++ esfq_q_destroy(q); ++ ++ /* copy elements of the new queue into the old queue */ ++ q->perturb_period = new.perturb_period; ++ q->quantum = new.quantum; ++ q->limit = new.limit; ++ q->depth = new.depth; ++ q->hash_divisor = new.hash_divisor; ++ q->hash_kind = new.hash_kind; ++ q->tail = new.tail; ++ q->max_depth = new.max_depth; ++ q->ht = new.ht; ++ q->dep = new.dep; ++ q->next = new.next; ++ q->allot = new.allot; ++ q->hash = new.hash; ++ q->qs = new.qs; ++ ++ /* finish up */ ++ if (q->perturb_period) { ++ q->perturb_timer.expires = jiffies + q->perturb_period; ++ add_timer(&q->perturb_timer); ++ } else { ++ q->perturbation = 0; ++ } ++ sch_tree_unlock(sch); ++ return 0; ++} ++ ++static int esfq_dump(struct Qdisc *sch, struct sk_buff *skb) ++{ ++ struct esfq_sched_data *q = qdisc_priv(sch); ++ unsigned char *b = skb_tail_pointer(skb); ++ struct tc_esfq_qopt opt; ++ ++ opt.quantum = q->quantum; ++ opt.perturb_period = q->perturb_period/HZ; ++ ++ opt.limit = q->limit; ++ opt.divisor = q->hash_divisor; ++ opt.flows = q->depth; ++ opt.hash_kind = q->hash_kind; ++ ++ NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); ++ ++ return skb->len; ++ ++nla_put_failure: ++ nlmsg_trim(skb, b); ++ return -1; ++} ++ ++static struct Qdisc_ops esfq_qdisc_ops = ++{ ++ .next = NULL, ++ 
.cl_ops = NULL, ++ .id = "esfq", ++ .priv_size = sizeof(struct esfq_sched_data), ++ .enqueue = esfq_enqueue, ++ .dequeue = esfq_dequeue, ++ .peek = esfq_peek, ++ .drop = esfq_drop, ++ .init = esfq_init, ++ .reset = esfq_reset, ++ .destroy = esfq_destroy, ++ .change = esfq_change, ++ .dump = esfq_dump, ++ .owner = THIS_MODULE, ++}; ++ ++static int __init esfq_module_init(void) ++{ ++ return register_qdisc(&esfq_qdisc_ops); ++} ++static void __exit esfq_module_exit(void) ++{ ++ unregister_qdisc(&esfq_qdisc_ops); ++} ++module_init(esfq_module_init) ++module_exit(esfq_module_exit) ++MODULE_LICENSE("GPL"); diff --git a/3.3.8/621-sched_act_connmark.patch b/3.3.8/621-sched_act_connmark.patch new file mode 100644 index 0000000..b6adce1 --- /dev/null +++ b/3.3.8/621-sched_act_connmark.patch @@ -0,0 +1,172 @@ +--- /dev/null ++++ b/net/sched/act_connmark.c +@@ -0,0 +1,137 @@ ++/* ++ * Copyright (c) 2011 Felix Fietkau ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms and conditions of the GNU General Public License, ++ * version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. ++ * ++ * You should have received a copy of the GNU General Public License along with ++ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++ * Place - Suite 330, Boston, MA 02111-1307 USA. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#define TCA_ACT_CONNMARK 20 ++ ++#define CONNMARK_TAB_MASK 3 ++static struct tcf_common *tcf_connmark_ht[CONNMARK_TAB_MASK + 1]; ++static u32 connmark_idx_gen; ++static DEFINE_RWLOCK(connmark_lock); ++ ++static struct tcf_hashinfo connmark_hash_info = { ++ .htab = tcf_connmark_ht, ++ .hmask = CONNMARK_TAB_MASK, ++ .lock = &connmark_lock, ++}; ++ ++static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, ++ struct tcf_result *res) ++{ ++ struct nf_conn *c; ++ enum ip_conntrack_info ctinfo; ++ int proto; ++ int r; ++ ++ if (skb->protocol == htons(ETH_P_IP)) { ++ if (skb->len < sizeof(struct iphdr)) ++ goto out; ++ proto = PF_INET; ++ } else if (skb->protocol == htons(ETH_P_IPV6)) { ++ if (skb->len < sizeof(struct ipv6hdr)) ++ goto out; ++ proto = PF_INET6; ++ } else ++ goto out; ++ ++ r = nf_conntrack_in(dev_net(skb->dev), proto, NF_INET_PRE_ROUTING, skb); ++ if (r != NF_ACCEPT) ++ goto out; ++ ++ c = nf_ct_get(skb, &ctinfo); ++ if (!c) ++ goto out; ++ ++ skb->mark = c->mark; ++ nf_conntrack_put(skb->nfct); ++ skb->nfct = NULL; ++ ++out: ++ return TC_ACT_PIPE; ++} ++ ++static int tcf_connmark_init(struct nlattr *nla, struct nlattr *est, ++ struct tc_action *a, int ovr, int bind) ++{ ++ struct tcf_common *pc; ++ ++ pc = tcf_hash_create(0, est, a, sizeof(*pc), bind, ++ &connmark_idx_gen, &connmark_hash_info); ++ if (IS_ERR(pc)) ++ return PTR_ERR(pc); ++ ++ tcf_hash_insert(pc, &connmark_hash_info); ++ ++ return ACT_P_CREATED; ++} ++ ++static inline int tcf_connmark_cleanup(struct tc_action *a, int bind) ++{ ++ if (a->priv) ++ return tcf_hash_release(a->priv, bind, &connmark_hash_info); ++ return 0; ++} ++ ++static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, ++ int bind, int ref) ++{ ++ return skb->len; ++} ++ ++static 
struct tc_action_ops act_connmark_ops = { ++ .kind = "connmark", ++ .hinfo = &connmark_hash_info, ++ .type = TCA_ACT_CONNMARK, ++ .capab = TCA_CAP_NONE, ++ .owner = THIS_MODULE, ++ .act = tcf_connmark, ++ .dump = tcf_connmark_dump, ++ .cleanup = tcf_connmark_cleanup, ++ .init = tcf_connmark_init, ++ .walk = tcf_generic_walker, ++}; ++ ++MODULE_AUTHOR("Felix Fietkau "); ++MODULE_DESCRIPTION("Connection tracking mark restoring"); ++MODULE_LICENSE("GPL"); ++ ++static int __init connmark_init_module(void) ++{ ++ return tcf_register_action(&act_connmark_ops); ++} ++ ++static void __exit connmark_cleanup_module(void) ++{ ++ tcf_unregister_action(&act_connmark_ops); ++} ++ ++module_init(connmark_init_module); ++module_exit(connmark_cleanup_module); +--- a/net/sched/Kconfig ++++ b/net/sched/Kconfig +@@ -624,6 +624,19 @@ config NET_ACT_CSUM + To compile this code as a module, choose M here: the + module will be called act_csum. + ++config NET_ACT_CONNMARK ++ tristate "Connection Tracking Marking" ++ depends on NET_CLS_ACT ++ depends on NF_CONNTRACK ++ depends on NF_CONNTRACK_MARK ++ ---help--- ++ Say Y here to restore the connmark from a scheduler action ++ ++ If unsure, say N. ++ ++ To compile this code as a module, choose M here: the ++ module will be called act_connmark. ++ + config NET_CLS_IND + bool "Incoming device classification" + depends on NET_CLS_U32 || NET_CLS_FW +--- a/net/sched/Makefile ++++ b/net/sched/Makefile +@@ -16,6 +16,7 @@ obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit + obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o + obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o + obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o ++obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o + obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o + obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o + obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/3.3.8/910-kobject_uevent.patch b/3.3.8/910-kobject_uevent.patch new file mode 100644 index 0000000..aa9a40f --- /dev/null +++ b/3.3.8/910-kobject_uevent.patch @@ -0,0 +1,21 @@ +--- a/lib/kobject_uevent.c ++++ b/lib/kobject_uevent.c +@@ -50,6 +50,18 @@ static const char *kobject_actions[] = { + [KOBJ_OFFLINE] = "offline", + }; + ++u64 uevent_next_seqnum(void) ++{ ++ u64 seq; ++ ++ mutex_lock(&uevent_sock_mutex); ++ seq = ++uevent_seqnum; ++ mutex_unlock(&uevent_sock_mutex); ++ ++ return seq; ++} ++EXPORT_SYMBOL_GPL(uevent_next_seqnum); ++ + /** + * kobject_action_type - translate action string to numeric type + * diff --git a/3.3.8/911-kobject_add_broadcast_uevent.patch b/3.3.8/911-kobject_add_broadcast_uevent.patch new file mode 100644 index 0000000..104df13 --- /dev/null +++ b/3.3.8/911-kobject_add_broadcast_uevent.patch @@ -0,0 +1,85 @@ +--- a/include/linux/kobject.h ++++ b/include/linux/kobject.h +@@ -31,6 +31,8 @@ + #define UEVENT_NUM_ENVP 32 /* number of env pointers */ + #define UEVENT_BUFFER_SIZE 2048 /* buffer for the variables */ + ++struct sk_buff; ++ + /* path to the userspace helper executed on an event */ + extern char uevent_helper[]; + +@@ -213,6 +215,10 @@ int add_uevent_var(struct kobj_uevent_en + + int kobject_action_type(const char *buf, size_t count, + enum kobject_action *type); ++ ++int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group, ++ gfp_t allocation); ++ + #else + static inline int kobject_uevent(struct kobject *kobj, + enum kobject_action action) +@@ -229,6 +235,16 @@ int add_uevent_var(struct kobj_uevent_en + static inline int kobject_action_type(const char *buf, size_t count, + enum kobject_action *type) + { return -EINVAL; } ++ ++void kfree_skb(struct 
sk_buff *); ++ ++static inline int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group, ++ gfp_t allocation) ++{ ++ kfree_skb(skb); ++ return 0; ++} ++ + #endif + + #endif /* _KOBJECT_H_ */ +--- a/lib/kobject_uevent.c ++++ b/lib/kobject_uevent.c +@@ -381,6 +381,43 @@ int add_uevent_var(struct kobj_uevent_en + EXPORT_SYMBOL_GPL(add_uevent_var); + + #if defined(CONFIG_NET) ++int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group, ++ gfp_t allocation) ++{ ++ struct uevent_sock *ue_sk; ++ int err = 0; ++ ++ /* send netlink message */ ++ mutex_lock(&uevent_sock_mutex); ++ list_for_each_entry(ue_sk, &uevent_sock_list, list) { ++ struct sock *uevent_sock = ue_sk->sk; ++ struct sk_buff *skb2; ++ ++ skb2 = skb_clone(skb, allocation); ++ if (!skb2) ++ break; ++ ++ err = netlink_broadcast(uevent_sock, skb2, pid, group, ++ allocation); ++ if (err) ++ break; ++ } ++ mutex_unlock(&uevent_sock_mutex); ++ ++ kfree_skb(skb); ++ return err; ++} ++#else ++int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group, ++ gfp_t allocation) ++{ ++ kfree_skb(skb); ++ return 0; ++} ++#endif ++EXPORT_SYMBOL_GPL(broadcast_uevent); ++ ++#if defined(CONFIG_NET) + static int uevent_net_init(struct net *net) + { + struct uevent_sock *ue_sk; diff --git a/3.3.8/Add_CONFIG_VFAT_FS_DUALNAMES_option.patch b/3.3.8/Add_CONFIG_VFAT_FS_DUALNAMES_option.patch new file mode 100644 index 0000000..5e3cfe1 --- /dev/null +++ b/3.3.8/Add_CONFIG_VFAT_FS_DUALNAMES_option.patch @@ -0,0 +1,145 @@ +diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig +index 182f9ff..907a5de 100644 +--- a/fs/fat/Kconfig ++++ b/fs/fat/Kconfig +@@ -74,6 +74,26 @@ config VFAT_FS + To compile this as a module, choose M here: the module will be called + vfat. + ++config VFAT_FS_DUALNAMES ++ bool "VFAT dual names support" ++ depends on VFAT_FS ++ help ++ This option provides support for dual filenames on VFAT filesystems. ++ If this option is disabled then file creation will either put ++ a short (8.3) name or a long name on the file, but never both. ++ The field where a shortname would normally go is filled with ++ invalid characters such that it cannot be considered a valid ++ short filename. ++ ++ That means that long filenames created with this option ++ disabled will not be accessible at all to operating systems ++ that do not understand the VFAT extensions. ++ ++ Users considering enabling this option should consider the implications ++ of any patents that may exist on dual filenames in VFAT. 
++ ++ If unsure, say N ++ + config FAT_DEFAULT_CODEPAGE + int "Default codepage for FAT" + depends on MSDOS_FS || VFAT_FS +diff --git a/fs/fat/dir.c b/fs/fat/dir.c +index 38ff75a..cd5d3ec 100644 +--- a/fs/fat/dir.c ++++ b/fs/fat/dir.c +@@ -415,14 +415,13 @@ + } + i += chl; + } +- if (!last_u) +- continue; +- +- /* Compare shortname */ +- bufuname[last_u] = 0x0000; +- len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); +- if (fat_name_match(sbi, name, name_len, bufname, len)) +- goto found; ++ if (last_u) { ++ /* Compare shortname */ ++ bufuname[last_u] = 0x0000; ++ len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); ++ if (fat_name_match(sbi, name, name_len, bufname, len)) ++ goto found; ++ } + + if (nr_slots) { + void *longname = unicode + FAT_MAX_UNI_CHARS; +diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c +index 73471b7..894f44d 100644 +--- a/fs/fat/namei_vfat.c ++++ b/fs/fat/namei_vfat.c +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include "fat.h" + + /* +@@ -586,6 +587,59 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, + return 0; + } + ++#ifndef CONFIG_VFAT_FS_DUALNAMES ++/* ++ * build a 11 byte 8.3 buffer which is not a short filename. We want 11 ++ * bytes which: ++ * - will be seen as a constant string to all APIs on Linux and Windows ++ * - cannot be matched with wildcard patterns ++ * - cannot be used to access the file ++ * - has a low probability of collision within a directory ++ * - has an invalid 3 byte extension ++ * - contains at least one non-space and non-nul byte ++ */ ++static void vfat_build_dummy_83_buffer(struct inode *dir, char *msdos_name) ++{ ++ u32 rand_num = random32() & 0x3FFFFFFF; ++ int i; ++ ++ /* a value of zero would leave us with only nul and spaces, ++ * which would not work with older linux systems ++ */ ++ if (rand_num == 0) ++ rand_num = 1; ++ ++ /* we start with a space followed by nul as spaces at the ++ * start of an entry are trimmed in FAT, which means that ++ * starting the 11 bytes with 0x20 0x00 gives us a value which ++ * cannot be used to access the file. It also means that the ++ * value as seen from all Windows and Linux APIs is a constant ++ */ ++ msdos_name[0] = ' '; ++ msdos_name[1] = 0; ++ ++ /* we use / and 2 nul bytes for the extension. 
These are ++ * invalid in FAT and mean that utilities that show the ++ * directory show no extension, but still work via the long ++ * name for old Linux kernels ++ */ ++ msdos_name[8] = '/'; ++ msdos_name[9] = 0; ++ msdos_name[10] = 0; ++ ++ /* ++ * fill the remaining 6 bytes with random invalid values ++ * This gives us a low collision rate, which means a low ++ * chance of problems with chkdsk.exe and WindowsXP ++ */ ++ for (i = 2; i < 8; i++) { ++ msdos_name[i] = rand_num & 0x1F; ++ rand_num >>= 5; ++ } ++} ++#endif ++ ++ + static int vfat_build_slots(struct inode *dir, const unsigned char *name, + int len, int is_dir, int cluster, + struct timespec *ts, +@@ -628,6 +682,11 @@ static int vfat_build_slots(struct inode *dir, const unsigned char *name, + goto shortname; + } + ++#ifndef CONFIG_VFAT_FS_DUALNAMES ++ vfat_build_dummy_83_buffer(dir, msdos_name); ++ lcase = 0; ++#endif ++ + /* build the entry of long file name */ + cksum = fat_checksum(msdos_name); + +-- +1.6.0.4 + + diff --git a/3.3.8/accessfs-3.2-0.26.patch b/3.3.8/accessfs-3.2-0.26.patch new file mode 100644 index 0000000..f36e634 --- /dev/null +++ b/3.3.8/accessfs-3.2-0.26.patch @@ -0,0 +1,1036 @@ +diff --git a/Documentation/filesystems/accessfs.txt b/Documentation/filesystems/accessfs.txt +new file mode 100644 +index 0000000..bf135b5 +--- /dev/null ++++ b/Documentation/filesystems/accessfs.txt +@@ -0,0 +1,41 @@ ++Accessfs is a permission managing filesystem. It allows to control access to ++system resources, based on file permissions. The recommended mount point for ++this file-system is /proc/access, which will appear automatically in the ++/proc filesystem. ++ ++Currently there are two modules using accessfs, userports and usercaps. ++ ++With userports, you will be able to control access to IP ports based ++on user-/groupid. ++ ++There's no need anymore to run internet daemons as root. You can ++individually configure which user/program can bind to protected ports ++(by default, below 1024). ++ ++For example, you can say, user www is allowed to bind to port 80 or ++user mail is allowed to bind to port 25. Then, you can run apache as ++user www and sendmail as user mail. Now, you don't have to rely on ++apache or sendmail giving up superuser rights to enhance security. ++ ++To use this option, you need to mount the access file system ++and do a chown on the appropriate ports: ++ ++# mount -t accessfs none /proc/access ++# chown www /proc/access/net/ip/bind/80 ++# chown mail /proc/access/net/ip/bind/25 ++ ++You can grant access to a group for individual ports as well. Just say: ++ ++# chgrp lp /proc/access/net/ip/bind/515 ++# chown g+x /proc/access/net/ip/bind/515 ++ ++With usercaps, you will be able to grant capabilities based on ++user-/groupid (root by default). 
++ ++For example you can create a group raw and change the capability ++net_raw to this group: ++ ++# chgrp raw /proc/access/capabilities/net_raw ++# chmod ug+x /proc/access/capabilities/net_raw ++# chgrp raw /sbin/ping ++# chmod u-s /sbin/ping; chmod g+s /sbin/ping +diff --git a/fs/Kconfig b/fs/Kconfig +index 5f4c45d..24f7348 100644 +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -210,6 +210,7 @@ + # UBIFS File system configuration + source "fs/ubifs/Kconfig" + source "fs/logfs/Kconfig" ++source "fs/accessfs/Kconfig" + source "fs/cramfs/Kconfig" + source "fs/squashfs/Kconfig" + source "fs/freevxfs/Kconfig" +diff --git a/fs/Makefile b/fs/Makefile +index d2c3353..fea1cfc 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -121,5 +121,6 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ + obj-$(CONFIG_BTRFS_FS) += btrfs/ + obj-$(CONFIG_GFS2_FS) += gfs2/ + obj-y += exofs/ # Multiple modules ++obj-$(CONFIG_ACCESS_FS) += accessfs/ + obj-$(CONFIG_CEPH_FS) += ceph/ + obj-$(CONFIG_PSTORE) += pstore/ +diff --git a/fs/accessfs/Kconfig b/fs/accessfs/Kconfig +new file mode 100644 +index 0000000..539d6e9 +--- /dev/null ++++ b/fs/accessfs/Kconfig +@@ -0,0 +1,61 @@ ++config ACCESS_FS ++ tristate "Accessfs support (Experimental)" ++ depends on EXPERIMENTAL ++ default n ++ help ++ This is a new file system to manage permissions. It is not very ++ useful on its own. You need to enable other options below. ++ ++ If you're unsure, say N. ++ ++config ACCESSFS_USER_PORTS ++ tristate "User permission based IP ports" ++ depends on ACCESS_FS && INET ++ select NET_HOOKS ++ default n ++ help ++ If you say Y here, you will be able to control access to IP ports ++ based on user-/groupid. ++ ++ If you're unsure, say N. ++ ++config ACCESSFS_PROT_SOCK ++ int "Range of protected ports (1024-65536)" ++ depends on ACCESSFS_USER_PORTS ++ default 1024 ++ help ++ Here you can extend the range of protected ports. This is ++ from 1-1023 inclusive on normal unix systems. One use for this ++ could be to reserve ports for X11 (port 6000) or database ++ servers (port 3306 for mysql), so nobody else could grab this port. ++ The default permission for extended ports is --x--x--x. ++ ++ If you build this as a module, you can specify the range of ++ protected ports at module load time (max_prot_sock). ++ ++ If you're unsure, say 1024. ++ ++config ACCESSFS_IGNORE_NET_BIND_SERVICE ++ bool "Ignore CAP_NET_BIND_SERVICE capability" ++ depends on ACCESSFS_USER_PORTS ++ default n ++ help ++ This option lets you decide, wether a user with ++ CAP_NET_BIND_SERVICE capability is able to override ++ your userport configuration. ++ ++ If you build this as a module, you can specify this ++ option at module load time (ignore_net_bind_service). ++ ++ If you're unsure, say n. ++ ++config ACCESSFS_USER_CAPABILITIES ++ bool "User permission based capabilities" ++ depends on ACCESS_FS = y ++ select SECURITY ++ default n ++ help ++ If you say Y here, you will be able to grant capabilities based on ++ user-/groupid (root by default). ++ ++ If you're unsure, say N. +diff --git a/fs/accessfs/Makefile b/fs/accessfs/Makefile +new file mode 100644 +index 0000000..63a5647 +--- /dev/null ++++ b/fs/accessfs/Makefile +@@ -0,0 +1,11 @@ ++# ++# Makefile for the linux accessfs routines. 
++# ++ ++obj-$(CONFIG_ACCESS_FS) += accessfs.o ++obj-$(CONFIG_ACCESSFS_USER_CAPABILITIES) += usercaps.o ++obj-$(CONFIG_ACCESSFS_USER_PORTS) += userports.o ++ ++accessfs-objs := inode.o ++usercaps-objs := capabilities.o ++userports-objs := ip.o +diff --git a/fs/accessfs/capabilities.c b/fs/accessfs/capabilities.c +new file mode 100644 +index 0000000..1c43f36 +--- /dev/null ++++ b/fs/accessfs/capabilities.c +@@ -0,0 +1,109 @@ ++/* Copyright (c) 2002-2006 Olaf Dietsche ++ * ++ * User based capabilities for Linux. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++/* perl -n -e 'print "\"", lc($1), "\",\n" if (m/^#define\s+CAP_(.+?)\s+\d+$/);' include/linux/capability.h */ ++static const char *names[] = { ++ "chown", ++ "dac_override", ++ "dac_read_search", ++ "fowner", ++ "fsetid", ++ "kill", ++ "setgid", ++ "setuid", ++ "setpcap", ++ "linux_immutable", ++ "net_bind_service", ++ "net_broadcast", ++ "net_admin", ++ "net_raw", ++ "ipc_lock", ++ "ipc_owner", ++ "sys_module", ++ "sys_rawio", ++ "sys_chroot", ++ "sys_ptrace", ++ "sys_pacct", ++ "sys_admin", ++ "sys_boot", ++ "sys_nice", ++ "sys_resource", ++ "sys_time", ++ "sys_tty_config", ++ "mknod", ++ "lease", ++ "audit_write", ++ "audit_control", ++ "setfcap", ++ "mac_override", ++ "mac_admin", ++ "syslog", ++ "wake_alarm", ++}; ++ ++static struct access_attr caps[ARRAY_SIZE(names)]; ++ ++static int accessfs_capable(struct task_struct *tsk, const struct cred *cred, struct user_namespace *ns, int cap, int audit) ++{ ++ if (accessfs_permitted(&caps[cap], MAY_EXEC)) { ++ /* capability granted */ ++ return 0; ++ } ++ ++ /* capability denied */ ++ return -EPERM; ++} ++ ++static struct security_operations accessfs_security_ops = { ++ .name = "usercaps", ++ .capable = accessfs_capable, ++}; ++ ++static void unregister_capabilities(struct accessfs_direntry *dir, int n) ++{ ++ int i; ++ for (i = 0; i < n; ++i) ++ accessfs_unregister(dir, names[i]); ++} ++ ++static int __init init_capabilities(void) ++{ ++ struct accessfs_direntry *dir; ++ int i, err; ++ dir = accessfs_make_dirpath("capabilities"); ++ if (dir == 0) ++ return -ENOTDIR; ++ ++ for (i = 0; i < ARRAY_SIZE(caps); ++i) { ++ caps[i].uid = 0; ++ caps[i].gid = 0; ++ caps[i].mode = S_IXUSR; ++ err = accessfs_register(dir, names[i], &caps[i]); ++ if (err) { ++ unregister_capabilities(dir, i); ++ return err; ++ } ++ } ++ ++ if (!security_module_enable(&accessfs_security_ops)) ++ return -EAGAIN; ++ ++ err = register_security(&accessfs_security_ops); ++ if (err != 0) ++ unregister_capabilities(dir, ARRAY_SIZE(names)); ++ ++ return err; ++} ++ ++security_initcall(init_capabilities); ++ ++MODULE_AUTHOR("Olaf Dietsche"); ++MODULE_DESCRIPTION("User based capabilities"); ++MODULE_LICENSE("GPL v2"); +diff --git a/fs/accessfs/inode.c b/fs/accessfs/inode.c +new file mode 100644 +index 0000000..a2247e2 +--- /dev/null ++++ b/fs/accessfs/inode.c +@@ -0,0 +1,431 @@ ++/* Copyright (c) 2001-2006 Olaf Dietsche ++ * ++ * Access permission filesystem for Linux. ++ * ++ * 2002 Ben Clifford, create mount point at /proc/access ++ * 2002 Ben Clifford, trying to make it work under 2.5.5-dj2 ++ * (see comments: BENC255 for reminders and todos) ++ * ++ * ++ * BENC255: the kernel doesn't lock BKL for us when entering methods ++ * (see Documentation/fs/porting.txt) ++ * Need to look at code here and see if we need either the BKL ++ * or our own lock - I think probably not. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define ACCESSFS_MAGIC 0x3c1d36e7 ++ ++static struct proc_dir_entry *mountdir = NULL; ++ ++static DEFINE_MUTEX(accessfs_sem); ++ ++static struct inode_operations accessfs_inode_operations; ++static struct file_operations accessfs_dir_file_operations; ++static struct inode_operations accessfs_dir_inode_operations; ++ ++static inline void accessfs_readdir_aux(struct file *filp, ++ struct accessfs_direntry *dir, ++ int start, void *dirent, ++ filldir_t filldir) ++{ ++ struct list_head *list; ++ int i = 2; ++ list_for_each(list, &dir->children) { ++ struct accessfs_entry *de; ++ if (i++ < start) ++ continue; ++ ++ de = list_entry(list, struct accessfs_entry, siblings); ++ if (filldir(dirent, de->name, strlen(de->name), filp->f_pos, ++ de->ino, DT_UNKNOWN) < 0) ++ break; ++ ++ ++filp->f_pos; ++ } ++} ++ ++static int accessfs_readdir(struct file *filp, void *dirent, filldir_t filldir) ++{ ++ int i; ++ struct dentry *dentry = filp->f_dentry; ++ struct accessfs_direntry *dir; ++ ++ i = filp->f_pos; ++ switch (i) { ++ case 0: ++ if (filldir(dirent, ".", 1, i, dentry->d_inode->i_ino, ++ DT_DIR) < 0) ++ break; ++ ++ ++i; ++ ++filp->f_pos; ++ /* NO break; */ ++ case 1: ++ if (filldir(dirent, "..", 2, i, ++ dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) ++ break; ++ ++ ++i; ++ ++filp->f_pos; ++ /* NO break; */ ++ default: ++ mutex_lock(&accessfs_sem); ++ dir = dentry->d_inode->i_private; ++ accessfs_readdir_aux(filp, dir, i, dirent, filldir); ++ mutex_unlock(&accessfs_sem); ++ break; ++ } ++ ++ return 0; ++} ++ ++static struct accessfs_entry *accessfs_lookup_entry(struct accessfs_entry *pe, ++ const char *name, int len) ++{ ++ struct list_head *list; ++ struct accessfs_direntry *dir; ++ if (!S_ISDIR(pe->attr->mode)) ++ return NULL; ++ ++ dir = (struct accessfs_direntry *) pe; ++ list_for_each(list, &dir->children) { ++ struct accessfs_entry *de = list_entry(list, struct accessfs_entry, siblings); ++ if (strncmp(de->name, name, len) == 0 && de->name[len] == 0) ++ return de; ++ } ++ ++ return NULL; ++} ++ ++static struct accessfs_direntry accessfs_rootdir = { ++ { "/", ++ LIST_HEAD_INIT(accessfs_rootdir.node.hash), ++ LIST_HEAD_INIT(accessfs_rootdir.node.siblings), ++ 1, &accessfs_rootdir.attr }, ++ NULL, LIST_HEAD_INIT(accessfs_rootdir.children), ++ { 0, 0, S_IFDIR | 0755 } ++}; ++ ++static void accessfs_init_inode(struct inode *inode, struct accessfs_entry *pe) ++{ ++ static const struct timespec epoch = {0, 0}; ++ inode->i_private = pe; ++ inode->i_uid = pe->attr->uid; ++ inode->i_gid = pe->attr->gid; ++ inode->i_mode = pe->attr->mode; ++/* ++ inode->i_blksize = PAGE_CACHE_SIZE; ++ inode->i_blocks = 0; ++ inode->i_rdev = NODEV; ++*/ ++ inode->i_atime = inode->i_mtime = inode->i_ctime = epoch; ++ switch (inode->i_mode & S_IFMT) { ++ case S_IFREG: ++ inode->i_op = &accessfs_inode_operations; ++ break; ++ case S_IFDIR: ++ inode->i_op = &accessfs_dir_inode_operations; ++ inode->i_fop = &accessfs_dir_file_operations; ++ break; ++ default: ++ BUG(); ++ break; ++ } ++} ++ ++static struct inode *accessfs_get_root_inode(struct super_block *sb) ++{ ++ struct inode *inode = new_inode(sb); ++ if (inode) { ++ mutex_lock(&accessfs_sem); ++/* inode->i_ino = accessfs_rootdir.node.ino; */ ++ accessfs_init_inode(inode, &accessfs_rootdir.node); ++ accessfs_rootdir.node.ino = inode->i_ino; ++ mutex_unlock(&accessfs_sem); ++ } ++ ++ return inode; ++} ++ ++static LIST_HEAD(hash); ++ 
++static int accessfs_node_init(struct accessfs_direntry *parent, ++ struct accessfs_entry *de, const char *name, ++ size_t len, struct access_attr *attr, mode_t mode) ++{ ++ static unsigned long ino = 1; ++ de->name = kmalloc(len + 1, GFP_KERNEL); ++ if (de->name == NULL) ++ return -ENOMEM; ++ ++ strncpy(de->name, name, len); ++ de->name[len] = 0; ++ de->ino = ++ino; ++ de->attr = attr; ++ de->attr->uid = 0; ++ de->attr->gid = 0; ++ de->attr->mode = mode; ++ ++ list_add_tail(&de->hash, &hash); ++ list_add_tail(&de->siblings, &parent->children); ++ return 0; ++} ++ ++static int accessfs_mknod(struct accessfs_direntry *dir, const char *name, ++ struct access_attr *attr) ++{ ++ struct accessfs_entry *pe; ++ pe = kmalloc(sizeof(struct accessfs_entry), GFP_KERNEL); ++ if (pe == NULL) ++ return -ENOMEM; ++ ++ accessfs_node_init(dir, pe, name, strlen(name), attr, ++ S_IFREG | attr->mode); ++ return 0; ++} ++ ++static struct accessfs_direntry *accessfs_mkdir(struct accessfs_direntry *parent, ++ const char *name, size_t len) ++{ ++ int err; ++ struct accessfs_direntry *dir; ++ dir = kmalloc(sizeof(struct accessfs_direntry), GFP_KERNEL); ++ if (dir == NULL) ++ return NULL; ++ ++ dir->parent = parent; ++ INIT_LIST_HEAD(&dir->children); ++ err = accessfs_node_init(parent, &dir->node, name, len, &dir->attr, ++ S_IFDIR | 0755); ++ if (err) { ++ kfree(dir); ++ dir = 0; ++ } ++ ++ return dir; ++} ++ ++struct accessfs_direntry *accessfs_make_dirpath(const char *name) ++{ ++ struct accessfs_direntry *dir = &accessfs_rootdir; ++ const char *slash; ++ mutex_lock(&accessfs_sem); ++ do { ++ struct accessfs_entry *de; ++ size_t len; ++ while (*name == '/') ++ ++name; ++ ++ slash = strchr(name, '/'); ++ len = slash ? slash - name : strlen(name); ++ de = accessfs_lookup_entry(&dir->node, name, len); ++ if (de == NULL) { ++ dir = accessfs_mkdir(dir, name, len); ++ } else if (S_ISDIR(de->attr->mode)) { ++ dir = (struct accessfs_direntry *) de; ++ } else { ++ dir = NULL; ++ } ++ ++ if (dir == NULL) ++ break; ++ ++ name = slash + 1; ++ } while (slash != NULL); ++ ++ mutex_unlock(&accessfs_sem); ++ return dir; ++} ++ ++static void accessfs_unlink(struct accessfs_entry *pe) ++{ ++ list_del_init(&pe->hash); ++ list_del_init(&pe->siblings); ++ kfree(pe->name); ++ kfree(pe); ++} ++ ++static int accessfs_notify_change(struct dentry *dentry, struct iattr *iattr) ++{ ++ struct accessfs_entry *pe; ++ struct inode *i = dentry->d_inode; ++ int err; ++ err = inode_change_ok(i, iattr); ++ if (err) ++ return err; ++ ++ setattr_copy(i, iattr); ++ ++ pe = (struct accessfs_entry *) i->i_private; ++ pe->attr->uid = i->i_uid; ++ pe->attr->gid = i->i_gid; ++ pe->attr->mode = i->i_mode; ++ return 0; ++} ++ ++static struct inode *accessfs_iget(struct super_block *sb, unsigned long ino) ++{ ++ struct list_head *list; ++ struct inode *inode = iget_locked(sb, ino); ++ if (!inode) ++ return ERR_PTR(-ENOMEM); ++ ++ if (!(inode->i_state & I_NEW)) ++ return inode; ++ ++ mutex_lock(&accessfs_sem); ++ list_for_each(list, &hash) { ++ struct accessfs_entry *pe; ++ pe = list_entry(list, struct accessfs_entry, hash); ++ if (pe->ino == ino) { ++ accessfs_init_inode(inode, pe); ++ break; ++ } ++ } ++ ++ mutex_unlock(&accessfs_sem); ++ return inode; ++} ++ ++static struct dentry *accessfs_lookup(struct inode *dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode = NULL; ++ struct accessfs_entry *pe; ++ mutex_lock(&accessfs_sem); ++ pe = accessfs_lookup_entry(dir->i_private, dentry->d_name.name, ++ dentry->d_name.len); ++ 
mutex_unlock(&accessfs_sem); ++ if (pe) ++ inode = accessfs_iget(dir->i_sb, pe->ino); ++ ++ d_add(dentry, inode); ++ return NULL; ++} ++ ++static struct inode_operations accessfs_inode_operations = { ++ .setattr = accessfs_notify_change, ++}; ++ ++static struct inode_operations accessfs_dir_inode_operations = { ++ .lookup = accessfs_lookup, ++ .setattr = accessfs_notify_change, ++}; ++ ++static struct file_operations accessfs_dir_file_operations = { ++ .readdir = accessfs_readdir, ++}; ++ ++static struct super_operations accessfs_ops = { ++ .statfs = simple_statfs, ++}; ++ ++static int accessfs_fill_super(struct super_block *sb, void *data, int silent) ++{ ++ struct inode *inode; ++ struct dentry *root; ++ ++ sb->s_blocksize = PAGE_CACHE_SIZE; ++ sb->s_blocksize_bits = PAGE_CACHE_SHIFT; ++ sb->s_magic = ACCESSFS_MAGIC; ++ sb->s_op = &accessfs_ops; ++ inode = accessfs_get_root_inode(sb); ++ if (!inode) ++ return -ENOMEM; ++ ++ root = d_alloc_root(inode); ++ if (!root) { ++ iput(inode); ++ return -ENOMEM; ++ } ++ ++ sb->s_root = root; ++ return 0; ++} ++ ++static struct dentry *accessfs_mount(struct file_system_type *fs_type, ++ int flags, const char *dev_name, void *data) ++{ ++ return mount_single(fs_type, flags, data, accessfs_fill_super); ++} ++ ++int accessfs_permitted(struct access_attr *p, int mask) ++{ ++ mode_t mode = p->mode; ++ if (current_fsuid() == p->uid) ++ mode >>= 6; ++ else if (in_group_p(p->gid)) ++ mode >>= 3; ++ ++ return (mode & mask) == mask; ++} ++ ++int accessfs_register(struct accessfs_direntry *dir, const char *name, ++ struct access_attr *attr) ++{ ++ int err; ++ if (dir == 0) ++ return -EINVAL; ++ ++ mutex_lock(&accessfs_sem); ++ err = accessfs_mknod(dir, name, attr); ++ mutex_unlock(&accessfs_sem); ++ return err; ++} ++ ++void accessfs_unregister(struct accessfs_direntry *dir, const char *name) ++{ ++ struct accessfs_entry *pe; ++ mutex_lock(&accessfs_sem); ++ pe = accessfs_lookup_entry(&dir->node, name, strlen(name)); ++ if (pe) ++ accessfs_unlink(pe); ++ ++ mutex_unlock(&accessfs_sem); ++} ++ ++static struct file_system_type accessfs_fs_type = { ++ .owner = THIS_MODULE, ++ .name = "accessfs", ++ .mount = accessfs_mount, ++ .kill_sb = kill_anon_super, ++}; ++ ++static int __init init_accessfs_fs(void) ++{ ++ ++ /* create mount point for accessfs */ ++ mountdir = proc_mkdir("access", NULL); ++ return register_filesystem(&accessfs_fs_type); ++} ++ ++static void __exit exit_accessfs_fs(void) ++{ ++ unregister_filesystem(&accessfs_fs_type); ++ remove_proc_entry("access", NULL); ++} ++ ++module_init(init_accessfs_fs) ++module_exit(exit_accessfs_fs) ++ ++MODULE_AUTHOR("Olaf Dietsche"); ++MODULE_DESCRIPTION("Access Filesystem"); ++MODULE_LICENSE("GPL v2"); ++ ++EXPORT_SYMBOL(accessfs_permitted); ++EXPORT_SYMBOL(accessfs_make_dirpath); ++EXPORT_SYMBOL(accessfs_register); ++EXPORT_SYMBOL(accessfs_unregister); +diff --git a/fs/accessfs/ip.c b/fs/accessfs/ip.c +new file mode 100644 +index 0000000..bddd2f0 +--- /dev/null ++++ b/fs/accessfs/ip.c +@@ -0,0 +1,101 @@ ++/* Copyright (c) 2002-2006 Olaf Dietsche ++ * ++ * User permission based port access for Linux. 
++ */ ++ ++#include ++#include ++#include ++#include ++ ++static int max_prot_sock = CONFIG_ACCESSFS_PROT_SOCK; ++#ifndef CONFIG_ACCESSFS_IGNORE_NET_BIND_SERVICE ++#define CONFIG_ACCESSFS_IGNORE_NET_BIND_SERVICE 0 ++#endif ++static int ignore_net_bind_service = CONFIG_ACCESSFS_IGNORE_NET_BIND_SERVICE; ++static struct access_attr *bind_to_port; ++ ++static int accessfs_ip_prot_sock(struct socket *sock, ++ struct sockaddr *uaddr, int addr_len) ++{ ++ struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; ++ unsigned short snum = ntohs(addr->sin_port); ++ if (snum && snum < max_prot_sock ++ && !accessfs_permitted(&bind_to_port[snum], MAY_EXEC) ++ && (ignore_net_bind_service || !capable(CAP_NET_BIND_SERVICE))) ++ return -EACCES; ++ ++ return 0; ++} ++ ++static int accessfs_ip6_prot_sock(struct socket *sock, ++ struct sockaddr *uaddr, int addr_len) ++{ ++ struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; ++ unsigned short snum = ntohs(addr->sin6_port); ++ if (snum && snum < max_prot_sock ++ && !accessfs_permitted(&bind_to_port[snum], MAY_EXEC) ++ && !capable(CAP_NET_BIND_SERVICE)) ++ return -EACCES; ++ ++ return 0; ++} ++ ++static struct net_hook_operations ip_net_ops = { ++ .ip_prot_sock = accessfs_ip_prot_sock, ++ .ip6_prot_sock = accessfs_ip6_prot_sock, ++}; ++ ++static int __init init_ip(void) ++{ ++ struct accessfs_direntry *dir = accessfs_make_dirpath("net/ip/bind"); ++ int i; ++ ++ if (max_prot_sock < PROT_SOCK) ++ max_prot_sock = PROT_SOCK; ++ else if (max_prot_sock > 65536) ++ max_prot_sock = 65536; ++ ++ bind_to_port = kmalloc(max_prot_sock * sizeof(*bind_to_port), ++ GFP_KERNEL); ++ if (bind_to_port == 0) ++ return -ENOMEM; ++ ++ for (i = 1; i < max_prot_sock; ++i) { ++ char buf[sizeof("65536")]; ++ bind_to_port[i].uid = 0; ++ bind_to_port[i].gid = 0; ++ bind_to_port[i].mode = i < PROT_SOCK ? S_IXUSR : S_IXUGO; ++ sprintf(buf, "%d", i); ++ accessfs_register(dir, buf, &bind_to_port[i]); ++ } ++ ++ net_hooks_register(&ip_net_ops); ++ return 0; ++} ++ ++static void __exit exit_ip(void) ++{ ++ struct accessfs_direntry *dir = accessfs_make_dirpath("net/ip/bind"); ++ int i; ++ net_hooks_unregister(&ip_net_ops); ++ for (i = 1; i < max_prot_sock; ++i) { ++ char buf[sizeof("65536")]; ++ sprintf(buf, "%d", i); ++ accessfs_unregister(dir, buf); ++ } ++ ++ if (bind_to_port != NULL) ++ kfree(bind_to_port); ++} ++ ++module_init(init_ip) ++module_exit(exit_ip) ++ ++MODULE_AUTHOR("Olaf Dietsche"); ++MODULE_DESCRIPTION("User based IP ports permission"); ++MODULE_LICENSE("GPL v2"); ++module_param(max_prot_sock, int, 0444); ++MODULE_PARM_DESC(max_prot_sock, "Number of protected ports"); ++module_param(ignore_net_bind_service, bool, 0644); ++MODULE_PARM_DESC(ignore_net_bind_service, "Ignore CAP_NET_BIND_SERVICE capability"); +diff --git a/include/linux/accessfs_fs.h b/include/linux/accessfs_fs.h +new file mode 100644 +index 0000000..ecd914e +--- /dev/null ++++ b/include/linux/accessfs_fs.h +@@ -0,0 +1,42 @@ ++/* -*- mode: c -*- */ ++#ifndef __accessfs_fs_h_included__ ++#define __accessfs_fs_h_included__ 1 ++ ++/* Copyright (c) 2001 Olaf Dietsche ++ * ++ * Access permission filesystem for Linux. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++struct access_attr { ++ uid_t uid; ++ gid_t gid; ++ mode_t mode; ++}; ++ ++struct accessfs_entry { ++ char *name; ++ struct list_head hash; ++ struct list_head siblings; ++ ino_t ino; ++ struct access_attr *attr; ++}; ++ ++struct accessfs_direntry { ++ struct accessfs_entry node; ++ struct accessfs_direntry *parent; ++ struct list_head children; ++ struct access_attr attr; ++}; ++ ++extern int accessfs_permitted(struct access_attr *p, int mask); ++extern struct accessfs_direntry *accessfs_make_dirpath(const char *name); ++extern int accessfs_register(struct accessfs_direntry *dir, const char *name, struct access_attr *attr); ++extern void accessfs_unregister(struct accessfs_direntry *dir, const char *name); ++ ++#endif +diff --git a/include/net/sock.h b/include/net/sock.h +index 32e3937..5fa9348 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -1860,4 +1860,47 @@ extern int sysctl_optmem_max; + extern __u32 sysctl_wmem_default; + extern __u32 sysctl_rmem_default; + ++/* Networking hooks */ ++extern int default_ip_prot_sock(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len); ++extern int default_ip6_prot_sock(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len); ++#ifdef CONFIG_NET_HOOKS ++struct net_hook_operations { ++ int (*ip_prot_sock)(struct socket *sock, ++ struct sockaddr *uaddr, int addr_len); ++ int (*ip6_prot_sock)(struct socket *sock, ++ struct sockaddr *uaddr, int addr_len); ++}; ++ ++extern struct net_hook_operations *net_ops; ++ ++extern void net_hooks_register(struct net_hook_operations *ops); ++extern void net_hooks_unregister(struct net_hook_operations *ops); ++ ++static inline int ip_prot_sock(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len) ++{ ++ return net_ops->ip_prot_sock(sock, uaddr, addr_len); ++} ++ ++static inline int ip6_prot_sock(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len) ++{ ++ return net_ops->ip6_prot_sock(sock, uaddr, addr_len); ++} ++#else ++static inline int ip_prot_sock(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len) ++{ ++ return default_ip_prot_sock(sock, uaddr, addr_len); ++} ++ ++static inline int ip6_prot_sock(struct socket *sock, struct sockaddr *uaddr, ++ int addr_len) ++{ ++ return default_ip6_prot_sock(sock, uaddr, addr_len); ++} ++#endif ++ + #endif /* _SOCK_H */ +diff --git a/net/Kconfig b/net/Kconfig +index a073148..bb5fb42 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -75,6 +75,18 @@ config INET + if INET + source "net/ipv4/Kconfig" + source "net/ipv6/Kconfig" ++ ++config NET_HOOKS ++ bool "IP: Networking hooks (Experimental)" ++ depends on INET && EXPERIMENTAL ++ default n ++ help ++ This option enables other kernel parts or modules to hook into the ++ networking area and provide fine grained control over the access to ++ IP ports. ++ ++ If you're unsure, say N. ++ + source "net/netlabel/Kconfig" + + endif # if INET +diff --git a/net/Makefile b/net/Makefile +index acdde49..4e5dc79 100644 +--- a/net/Makefile ++++ b/net/Makefile +@@ -61,6 +61,7 @@ + obj-$(CONFIG_IEEE802154) += ieee802154/ + obj-$(CONFIG_MAC802154) += mac802154/ + ++obj-$(CONFIG_NET) += hooks.o + ifeq ($(CONFIG_NET),y) + obj-$(CONFIG_SYSCTL) += sysctl_net.o + endif +diff --git a/net/hooks.c b/net/hooks.c +new file mode 100644 +index 0000000..33100e6 +--- /dev/null ++++ b/net/hooks.c +@@ -0,0 +1,55 @@ ++/* Copyright (c) 2002 Olaf Dietsche ++ * ++ * Networking hooks. Currently for IPv4 and IPv6 only. 
++ */ ++ ++#include ++#include ++#include ++#include ++ ++int default_ip_prot_sock(struct socket *sock, struct sockaddr *uaddr, int addr_len) ++{ ++ struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; ++ unsigned short snum = ntohs(addr->sin_port); ++ if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) ++ return -EACCES; ++ ++ return 0; ++} ++ ++int default_ip6_prot_sock(struct socket *sock, struct sockaddr *uaddr, int addr_len) ++{ ++ struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; ++ unsigned short snum = ntohs(addr->sin6_port); ++ if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) ++ return -EACCES; ++ ++ return 0; ++} ++ ++EXPORT_SYMBOL(default_ip_prot_sock); ++EXPORT_SYMBOL(default_ip6_prot_sock); ++ ++#ifdef CONFIG_NET_HOOKS ++static struct net_hook_operations default_net_ops = { ++ .ip_prot_sock = default_ip_prot_sock, ++ .ip6_prot_sock = default_ip6_prot_sock, ++}; ++ ++struct net_hook_operations *net_ops = &default_net_ops; ++ ++void net_hooks_register(struct net_hook_operations *ops) ++{ ++ net_ops = ops; ++} ++ ++void net_hooks_unregister(struct net_hook_operations *ops) ++{ ++ net_ops = &default_net_ops; ++} ++ ++EXPORT_SYMBOL(net_ops); ++EXPORT_SYMBOL(net_hooks_register); ++EXPORT_SYMBOL(net_hooks_unregister); ++#endif +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c +index 1b5096a..9460a3c 100644 +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -495,7 +495,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) + + snum = ntohs(addr->sin_port); + err = -EACCES; +- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) ++ if (ip_prot_sock(sock, uaddr, addr_len)) + goto out; + + /* We keep a pair of addresses. rcv_saddr is the one +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c +index d27c797..154b1ec 100644 +--- a/net/ipv6/af_inet6.c ++++ b/net/ipv6/af_inet6.c +@@ -281,7 +281,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) + return -EINVAL; + + snum = ntohs(addr->sin6_port); +- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) ++ if (ip6_prot_sock(sock, uaddr, addr_len)) + return -EACCES; + + lock_sock(sk); + diff --git a/3.3.8/ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch b/3.3.8/ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch new file mode 100644 index 0000000..7af90e4 --- /dev/null +++ b/3.3.8/ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch @@ -0,0 +1,36 @@ +>From 9f04e51293b130474504216a477bb2a73cbf59e1 Mon Sep 17 00:00:00 2001 +From: Anssi Hannula +Date: Thu, 22 Mar 2012 22:29:11 +0200 +Subject: [PATCH] ata: prefer ata drivers over ide drivers when both are built + +Currently the old IDE drivers are preferred over ATA drivers when both +are built, since ide/ is listed first in drivers/Makefile and therefore +the IDE drivers end up before ATA drivers in modules.order which is used +by depmod/modprobe for module ordering. + +Change it so that ATA drivers are preferred over IDE driver by moving +the ide/ entry under ata/ in drivers/Makefile. 
+ +Signed-off-by: Anssi Hannula +--- + drivers/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/Makefile b/drivers/Makefile +index 932e8bf..e8df3d0 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -47,9 +47,9 @@ obj-$(CONFIG_PARPORT) += parport/ + obj-y += base/ block/ misc/ mfd/ nfc/ + obj-$(CONFIG_NUBUS) += nubus/ + obj-y += macintosh/ +-obj-$(CONFIG_IDE) += ide/ + obj-$(CONFIG_SCSI) += scsi/ + obj-$(CONFIG_ATA) += ata/ ++obj-$(CONFIG_IDE) += ide/ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ +-- +1.7.9.3 + diff --git a/3.3.8/aufs-3.x-rcN.patch b/3.3.8/aufs-3.x-rcN.patch new file mode 100644 index 0000000..176df5a --- /dev/null +++ b/3.3.8/aufs-3.x-rcN.patch @@ -0,0 +1,29364 @@ +aufs3.x-rcN kbuild patch + +diff --git a/fs/Kconfig b/fs/Kconfig +index d621f02..9b9694c 100644 +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -215,6 +215,7 @@ source "fs/pstore/Kconfig" + source "fs/sysv/Kconfig" + source "fs/ufs/Kconfig" + source "fs/exofs/Kconfig" ++source "fs/aufs/Kconfig" + + endif # MISC_FILESYSTEMS + +diff --git a/fs/Makefile b/fs/Makefile +index 93804d4..cf3dcb9 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -124,3 +124,4 @@ obj-$(CONFIG_GFS2_FS) += gfs2/ + obj-y += exofs/ # Multiple modules + obj-$(CONFIG_CEPH_FS) += ceph/ + obj-$(CONFIG_PSTORE) += pstore/ ++obj-$(CONFIG_AUFS_FS) += aufs/ +diff --git a/include/linux/Kbuild b/include/linux/Kbuild +index c94e717..fccb9df 100644 +--- a/include/linux/Kbuild ++++ b/include/linux/Kbuild +@@ -65,6 +65,7 @@ header-y += atmppp.h + header-y += atmsap.h + header-y += atmsvc.h + header-y += audit.h ++header-y += aufs_type.h + header-y += auto_fs.h + header-y += auto_fs4.h + header-y += auxvec.h +aufs3.x-rcN base patch + +diff --git a/fs/namei.c b/fs/namei.c +index e2ba628..fde8ea2 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -1781,7 +1781,7 @@ static struct dentry *__lookup_hash(struct qstr *name, + * needs parent already locked. Doesn't follow mounts. + * SMP-safe. + */ +-static struct dentry *lookup_hash(struct nameidata *nd) ++struct dentry *lookup_hash(struct nameidata *nd) + { + return __lookup_hash(&nd->last, nd->path.dentry, nd); + } +diff --git a/fs/splice.c b/fs/splice.c +index 1ec0493..c599f73 100644 +--- a/fs/splice.c ++++ b/fs/splice.c +@@ -1084,8 +1084,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); + /* + * Attempt to initiate a splice from pipe to file. + */ +-static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, +- loff_t *ppos, size_t len, unsigned int flags) ++long do_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags) + { + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); +@@ -1112,9 +1112,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + /* + * Attempt to initiate a splice from a file to a pipe. 
+ */ +-static long do_splice_to(struct file *in, loff_t *ppos, +- struct pipe_inode_info *pipe, size_t len, +- unsigned int flags) ++long do_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) + { + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); +diff --git a/include/linux/namei.h b/include/linux/namei.h +index ffc0213..ef35a31 100644 +--- a/include/linux/namei.h ++++ b/include/linux/namei.h +@@ -85,6 +85,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, + extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, + int (*open)(struct inode *, struct file *)); + ++extern struct dentry *lookup_hash(struct nameidata *nd); + extern struct dentry *lookup_one_len(const char *, struct dentry *, int); + + extern int follow_down_one(struct path *); +diff --git a/include/linux/splice.h b/include/linux/splice.h +index 26e5b61..3ffef2f 100644 +--- a/include/linux/splice.h ++++ b/include/linux/splice.h +@@ -91,4 +91,10 @@ extern void splice_shrink_spd(struct pipe_inode_info *, + extern void spd_release_page(struct splice_pipe_desc *, unsigned int); + + extern const struct pipe_buf_operations page_cache_pipe_buf_ops; ++ ++extern long do_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags); ++extern long do_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags); + #endif +aufs3.x-rcN standalone patch + +diff --git a/fs/file_table.c b/fs/file_table.c +index 20002e3..6d792ad 100644 +--- a/fs/file_table.c ++++ b/fs/file_table.c +@@ -443,6 +443,8 @@ void file_sb_list_del(struct file *file) + } + } + ++EXPORT_SYMBOL(file_sb_list_del); ++ + #ifdef CONFIG_SMP + + /* +diff --git a/fs/inode.c b/fs/inode.c +index d3ebdbe..6db6251 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -66,6 +66,7 @@ static struct hlist_head *inode_hashtable __read_mostly; + static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); + + __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); ++EXPORT_SYMBOL(inode_sb_list_lock); + + /* + * Empty aops. Can be used for the cases where the user does not +diff --git a/fs/namei.c b/fs/namei.c +index fde8ea2..62f2302 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -1785,6 +1785,7 @@ struct dentry *lookup_hash(struct nameidata *nd) + { + return __lookup_hash(&nd->last, nd->path.dentry, nd); + } ++EXPORT_SYMBOL(lookup_hash); + + /** + * lookup_one_len - filesystem helper to lookup single pathname component +diff --git a/fs/namespace.c b/fs/namespace.c +index e608199..38fcc2e 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -1339,6 +1339,7 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, + } + return 0; + } ++EXPORT_SYMBOL(iterate_mounts); + + static void cleanup_group_ids(struct mount *mnt, struct mount *end) + { +diff --git a/fs/notify/group.c b/fs/notify/group.c +index 63fc294..6f4adca 100644 +--- a/fs/notify/group.c ++++ b/fs/notify/group.c +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + + #include + #include "fsnotify.h" +@@ -70,6 +71,7 @@ void fsnotify_put_group(struct fsnotify_group *group) + if (atomic_dec_and_test(&group->refcnt)) + fsnotify_destroy_group(group); + } ++EXPORT_SYMBOL(fsnotify_put_group); + + /* + * Create a new fsnotify_group and hold a reference for the group returned. 
+@@ -102,3 +104,4 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) + + return group; + } ++EXPORT_SYMBOL(fsnotify_alloc_group); +diff --git a/fs/notify/mark.c b/fs/notify/mark.c +index f104d56..54f36db 100644 +--- a/fs/notify/mark.c ++++ b/fs/notify/mark.c +@@ -112,6 +112,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) + if (atomic_dec_and_test(&mark->refcnt)) + mark->free_mark(mark); + } ++EXPORT_SYMBOL(fsnotify_put_mark); + + /* + * Any time a mark is getting freed we end up here. +@@ -191,6 +192,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) + if (unlikely(atomic_dec_and_test(&group->num_marks))) + fsnotify_final_destroy_group(group); + } ++EXPORT_SYMBOL(fsnotify_destroy_mark); + + void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) + { +@@ -278,6 +280,7 @@ err: + + return ret; + } ++EXPORT_SYMBOL(fsnotify_add_mark); + + /* + * clear any marks in a group in which mark->flags & flags is true +@@ -333,6 +336,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, + atomic_set(&mark->refcnt, 1); + mark->free_mark = free_mark; + } ++EXPORT_SYMBOL(fsnotify_init_mark); + + static int fsnotify_mark_destroy(void *ignored) + { +diff --git a/fs/open.c b/fs/open.c +index 77becc0..f634f02 100644 +--- a/fs/open.c ++++ b/fs/open.c +@@ -60,6 +60,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + mutex_unlock(&dentry->d_inode->i_mutex); + return ret; + } ++EXPORT_SYMBOL(do_truncate); + + static long do_sys_truncate(const char __user *pathname, loff_t length) + { +diff --git a/fs/splice.c b/fs/splice.c +index c599f73..00303ba 100644 +--- a/fs/splice.c ++++ b/fs/splice.c +@@ -1108,6 +1108,7 @@ long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + + return splice_write(pipe, out, ppos, len, flags); + } ++EXPORT_SYMBOL(do_splice_from); + + /* + * Attempt to initiate a splice from a file to a pipe. 
+@@ -1134,6 +1135,7 @@ long do_splice_to(struct file *in, loff_t *ppos, + + return splice_read(in, ppos, pipe, len, flags); + } ++EXPORT_SYMBOL(do_splice_to); + + /** + * splice_direct_to_actor - splices data directly between two non-pipes +diff --git a/security/commoncap.c b/security/commoncap.c +index 7ce191e..19a95be 100644 +--- a/security/commoncap.c ++++ b/security/commoncap.c +@@ -965,3 +965,4 @@ int cap_file_mmap(struct file *file, unsigned long reqprot, + } + return ret; + } ++EXPORT_SYMBOL(cap_file_mmap); +diff --git a/security/device_cgroup.c b/security/device_cgroup.c +index 8b5b5d8..911850c 100644 +--- a/security/device_cgroup.c ++++ b/security/device_cgroup.c +@@ -7,6 +7,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -501,6 +502,7 @@ found: + + return -EPERM; + } ++EXPORT_SYMBOL(__devcgroup_inode_permission); + + int devcgroup_inode_mknod(int mode, dev_t dev) + { +diff --git a/security/security.c b/security/security.c +index d754249..1aa6154 100644 +--- a/security/security.c ++++ b/security/security.c +@@ -392,6 +392,7 @@ int security_path_rmdir(struct path *dir, struct dentry *dentry) + return 0; + return security_ops->path_rmdir(dir, dentry); + } ++EXPORT_SYMBOL(security_path_rmdir); + + int security_path_unlink(struct path *dir, struct dentry *dentry) + { +@@ -408,6 +409,7 @@ int security_path_symlink(struct path *dir, struct dentry *dentry, + return 0; + return security_ops->path_symlink(dir, dentry, old_name); + } ++EXPORT_SYMBOL(security_path_symlink); + + int security_path_link(struct dentry *old_dentry, struct path *new_dir, + struct dentry *new_dentry) +@@ -416,6 +418,7 @@ int security_path_link(struct dentry *old_dentry, struct path *new_dir, + return 0; + return security_ops->path_link(old_dentry, new_dir, new_dentry); + } ++EXPORT_SYMBOL(security_path_link); + + int security_path_rename(struct path *old_dir, struct dentry *old_dentry, + struct path *new_dir, struct dentry *new_dentry) +@@ -434,6 +437,7 @@ int security_path_truncate(struct path *path) + return 0; + return security_ops->path_truncate(path); + } ++EXPORT_SYMBOL(security_path_truncate); + + int security_path_chmod(struct path *path, umode_t mode) + { +@@ -441,6 +445,7 @@ int security_path_chmod(struct path *path, umode_t mode) + return 0; + return security_ops->path_chmod(path, mode); + } ++EXPORT_SYMBOL(security_path_chmod); + + int security_path_chown(struct path *path, uid_t uid, gid_t gid) + { +@@ -448,6 +453,7 @@ int security_path_chown(struct path *path, uid_t uid, gid_t gid) + return 0; + return security_ops->path_chown(path, uid, gid); + } ++EXPORT_SYMBOL(security_path_chown); + + int security_path_chroot(struct path *path) + { +@@ -524,6 +530,7 @@ int security_inode_readlink(struct dentry *dentry) + return 0; + return security_ops->inode_readlink(dentry); + } ++EXPORT_SYMBOL(security_inode_readlink); + + int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd) + { +@@ -538,6 +545,7 @@ int security_inode_permission(struct inode *inode, int mask) + return 0; + return security_ops->inode_permission(inode, mask); + } ++EXPORT_SYMBOL(security_inode_permission); + + int security_inode_setattr(struct dentry *dentry, struct iattr *attr) + { +@@ -653,6 +661,7 @@ int security_file_permission(struct file *file, int mask) + + return fsnotify_perm(file, mask); + } ++EXPORT_SYMBOL(security_file_permission); + + int security_file_alloc(struct file *file) + { +@@ -680,6 +689,7 @@ int security_file_mmap(struct file *file, unsigned long reqprot, + return 
ret; + return ima_file_mmap(file, prot); + } ++EXPORT_SYMBOL(security_file_mmap); + + int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, + unsigned long prot) +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/Documentation/ABI/testing/debugfs-aufs 2011-10-25 09:52:26.000000000 +0200 +@@ -0,0 +1,37 @@ ++What: /debug/aufs/si_/ ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ Under /debug/aufs, a directory named si_ is created ++ per aufs mount, where is a unique id generated ++ internally. ++ ++What: /debug/aufs/si_/xib ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the consumed blocks by xib (External Inode Number ++ Bitmap), its block size and file size. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see the aufs manual. ++ ++What: /debug/aufs/si_/xino0, xino1 ... xinoN ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the consumed blocks by xino (External Inode Number ++ Translation Table), its link count, block size and file ++ size. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see the aufs manual. ++ ++What: /debug/aufs/si_/xigen ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the consumed blocks by xigen (External Inode ++ Generation Table), its block size and file size. ++ If CONFIG_AUFS_EXPORT is disabled, this entry will not ++ be created. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see the aufs manual. +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/Documentation/ABI/testing/sysfs-aufs 2011-10-25 09:52:26.000000000 +0200 +@@ -0,0 +1,24 @@ ++What: /sys/fs/aufs/si_/ ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ Under /sys/fs/aufs, a directory named si_ is created ++ per aufs mount, where is a unique id generated ++ internally. ++ ++What: /sys/fs/aufs/si_/br0, br1 ... brN ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the abolute path of a member directory (which ++ is called branch) in aufs, and its permission. ++ ++What: /sys/fs/aufs/si_/xi_path ++Date: March 2009 ++Contact: J. R. Okajima ++Description: ++ It shows the abolute path of XINO (External Inode Number ++ Bitmap, Translation Table and Generation Table) file ++ even if it is the default path. ++ When the aufs mount option 'noxino' is specified, it ++ will be empty. About XINO files, see the aufs manual. +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/aufs.h 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,60 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * all header files ++ */ ++ ++#ifndef __AUFS_H__ ++#define __AUFS_H__ ++ ++#ifdef __KERNEL__ ++ ++#define AuStub(type, name, body, ...) \ ++ static inline type name(__VA_ARGS__) { body; } ++ ++#define AuStubVoid(name, ...) \ ++ AuStub(void, name, , __VA_ARGS__) ++#define AuStubInt0(name, ...) \ ++ AuStub(int, name, return 0, __VA_ARGS__) ++ ++#include "debug.h" ++ ++#include "branch.h" ++#include "cpup.h" ++#include "dcsub.h" ++#include "dbgaufs.h" ++#include "dentry.h" ++#include "dir.h" ++#include "dynop.h" ++#include "file.h" ++#include "fstype.h" ++#include "inode.h" ++#include "loop.h" ++#include "module.h" ++#include "opts.h" ++#include "rwsem.h" ++#include "spl.h" ++#include "super.h" ++#include "sysaufs.h" ++#include "vfsub.h" ++#include "whout.h" ++#include "wkq.h" ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/branch.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,1169 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * branch management ++ */ ++ ++#include ++#include ++#include "aufs.h" ++ ++/* ++ * free a single branch ++ */ ++static void au_br_do_free(struct au_branch *br) ++{ ++ int i; ++ struct au_wbr *wbr; ++ struct au_dykey **key; ++ ++ au_hnotify_fin_br(br); ++ ++ if (br->br_xino.xi_file) ++ fput(br->br_xino.xi_file); ++ mutex_destroy(&br->br_xino.xi_nondir_mtx); ++ ++ AuDebugOn(atomic_read(&br->br_count)); ++ ++ wbr = br->br_wbr; ++ if (wbr) { ++ for (i = 0; i < AuBrWh_Last; i++) ++ dput(wbr->wbr_wh[i]); ++ AuDebugOn(atomic_read(&wbr->wbr_wh_running)); ++ AuRwDestroy(&wbr->wbr_wh_rwsem); ++ } ++ ++ key = br->br_dykey; ++ for (i = 0; i < AuBrDynOp; i++, key++) ++ if (*key) ++ au_dy_put(*key); ++ else ++ break; ++ ++ mntput(br->br_mnt); ++ kfree(wbr); ++ kfree(br); ++} ++ ++/* ++ * frees all branches ++ */ ++void au_br_free(struct au_sbinfo *sbinfo) ++{ ++ aufs_bindex_t bmax; ++ struct au_branch **br; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ bmax = sbinfo->si_bend + 1; ++ br = sbinfo->si_branch; ++ while (bmax--) ++ au_br_do_free(*br++); ++} ++ ++/* ++ * find the index of a branch which is specified by @br_id. 
++ */ ++int au_br_index(struct super_block *sb, aufs_bindex_t br_id) ++{ ++ aufs_bindex_t bindex, bend; ++ ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) ++ if (au_sbr_id(sb, bindex) == br_id) ++ return bindex; ++ return -1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * add a branch ++ */ ++ ++static int test_overlap(struct super_block *sb, struct dentry *h_adding, ++ struct dentry *h_root) ++{ ++ if (unlikely(h_adding == h_root ++ || au_test_loopback_overlap(sb, h_adding))) ++ return 1; ++ if (h_adding->d_sb != h_root->d_sb) ++ return 0; ++ return au_test_subdir(h_adding, h_root) ++ || au_test_subdir(h_root, h_adding); ++} ++ ++/* ++ * returns a newly allocated branch. @new_nbranch is a number of branches ++ * after adding a branch. ++ */ ++static struct au_branch *au_br_alloc(struct super_block *sb, int new_nbranch, ++ int perm) ++{ ++ struct au_branch *add_branch; ++ struct dentry *root; ++ int err; ++ ++ err = -ENOMEM; ++ root = sb->s_root; ++ add_branch = kmalloc(sizeof(*add_branch), GFP_NOFS); ++ if (unlikely(!add_branch)) ++ goto out; ++ ++ err = au_hnotify_init_br(add_branch, perm); ++ if (unlikely(err)) ++ goto out_br; ++ ++ add_branch->br_wbr = NULL; ++ if (au_br_writable(perm)) { ++ /* may be freed separately at changing the branch permission */ ++ add_branch->br_wbr = kmalloc(sizeof(*add_branch->br_wbr), ++ GFP_NOFS); ++ if (unlikely(!add_branch->br_wbr)) ++ goto out_hnotify; ++ } ++ ++ err = au_sbr_realloc(au_sbi(sb), new_nbranch); ++ if (!err) ++ err = au_di_realloc(au_di(root), new_nbranch); ++ if (!err) ++ err = au_ii_realloc(au_ii(root->d_inode), new_nbranch); ++ if (!err) ++ return add_branch; /* success */ ++ ++ kfree(add_branch->br_wbr); ++ ++out_hnotify: ++ au_hnotify_fin_br(add_branch); ++out_br: ++ kfree(add_branch); ++out: ++ return ERR_PTR(err); ++} ++ ++/* ++ * test if the branch permission is legal or not. 
++ */ ++static int test_br(struct inode *inode, int brperm, char *path) ++{ ++ int err; ++ ++ err = (au_br_writable(brperm) && IS_RDONLY(inode)); ++ if (!err) ++ goto out; ++ ++ err = -EINVAL; ++ pr_err("write permission for readonly mount or inode, %s\n", path); ++ ++out: ++ return err; ++} ++ ++/* ++ * returns: ++ * 0: success, the caller will add it ++ * plus: success, it is already unified, the caller should ignore it ++ * minus: error ++ */ ++static int test_add(struct super_block *sb, struct au_opt_add *add, int remount) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct dentry *root; ++ struct inode *inode, *h_inode; ++ ++ root = sb->s_root; ++ bend = au_sbend(sb); ++ if (unlikely(bend >= 0 ++ && au_find_dbindex(root, add->path.dentry) >= 0)) { ++ err = 1; ++ if (!remount) { ++ err = -EINVAL; ++ pr_err("%s duplicated\n", add->pathname); ++ } ++ goto out; ++ } ++ ++ err = -ENOSPC; /* -E2BIG; */ ++ if (unlikely(AUFS_BRANCH_MAX <= add->bindex ++ || AUFS_BRANCH_MAX - 1 <= bend)) { ++ pr_err("number of branches exceeded %s\n", add->pathname); ++ goto out; ++ } ++ ++ err = -EDOM; ++ if (unlikely(add->bindex < 0 || bend + 1 < add->bindex)) { ++ pr_err("bad index %d\n", add->bindex); ++ goto out; ++ } ++ ++ inode = add->path.dentry->d_inode; ++ err = -ENOENT; ++ if (unlikely(!inode->i_nlink)) { ++ pr_err("no existence %s\n", add->pathname); ++ goto out; ++ } ++ ++ err = -EINVAL; ++ if (unlikely(inode->i_sb == sb)) { ++ pr_err("%s must be outside\n", add->pathname); ++ goto out; ++ } ++ ++ if (unlikely(au_test_fs_unsuppoted(inode->i_sb))) { ++ pr_err("unsupported filesystem, %s (%s)\n", ++ add->pathname, au_sbtype(inode->i_sb)); ++ goto out; ++ } ++ ++ err = test_br(add->path.dentry->d_inode, add->perm, add->pathname); ++ if (unlikely(err)) ++ goto out; ++ ++ if (bend < 0) ++ return 0; /* success */ ++ ++ err = -EINVAL; ++ for (bindex = 0; bindex <= bend; bindex++) ++ if (unlikely(test_overlap(sb, add->path.dentry, ++ au_h_dptr(root, bindex)))) { ++ pr_err("%s is overlapped\n", add->pathname); ++ goto out; ++ } ++ ++ err = 0; ++ if (au_opt_test(au_mntflags(sb), WARN_PERM)) { ++ h_inode = au_h_dptr(root, 0)->d_inode; ++ if ((h_inode->i_mode & S_IALLUGO) != (inode->i_mode & S_IALLUGO) ++ || h_inode->i_uid != inode->i_uid ++ || h_inode->i_gid != inode->i_gid) ++ pr_warning("uid/gid/perm %s %u/%u/0%o, %u/%u/0%o\n", ++ add->pathname, ++ inode->i_uid, inode->i_gid, ++ (inode->i_mode & S_IALLUGO), ++ h_inode->i_uid, h_inode->i_gid, ++ (h_inode->i_mode & S_IALLUGO)); ++ } ++ ++out: ++ return err; ++} ++ ++/* ++ * initialize or clean the whiteouts for an adding branch ++ */ ++static int au_br_init_wh(struct super_block *sb, struct au_branch *br, ++ int new_perm, struct dentry *h_root) ++{ ++ int err, old_perm; ++ aufs_bindex_t bindex; ++ struct mutex *h_mtx; ++ struct au_wbr *wbr; ++ struct au_hinode *hdir; ++ ++ wbr = br->br_wbr; ++ old_perm = br->br_perm; ++ br->br_perm = new_perm; ++ hdir = NULL; ++ h_mtx = NULL; ++ bindex = au_br_index(sb, br->br_id); ++ if (0 <= bindex) { ++ hdir = au_hi(sb->s_root->d_inode, bindex); ++ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ } else { ++ h_mtx = &h_root->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_PARENT); ++ } ++ if (!wbr) ++ err = au_wh_init(h_root, br, sb); ++ else { ++ wbr_wh_write_lock(wbr); ++ err = au_wh_init(h_root, br, sb); ++ wbr_wh_write_unlock(wbr); ++ } ++ if (hdir) ++ au_hn_imtx_unlock(hdir); ++ else ++ mutex_unlock(h_mtx); ++ br->br_perm = old_perm; ++ ++ if (!err && wbr && !au_br_writable(new_perm)) { ++ kfree(wbr); ++ 
br->br_wbr = NULL; ++ } ++ ++ return err; ++} ++ ++static int au_wbr_init(struct au_branch *br, struct super_block *sb, ++ int perm, struct path *path) ++{ ++ int err; ++ struct kstatfs kst; ++ struct au_wbr *wbr; ++ struct dentry *h_dentry; ++ ++ wbr = br->br_wbr; ++ au_rw_init(&wbr->wbr_wh_rwsem); ++ memset(wbr->wbr_wh, 0, sizeof(wbr->wbr_wh)); ++ atomic_set(&wbr->wbr_wh_running, 0); ++ wbr->wbr_bytes = 0; ++ ++ /* ++ * a limit for rmdir/rename a dir ++ * cf. AUFS_MAX_NAMELEN in include/linux/aufs_type.h ++ */ ++ err = vfs_statfs(path, &kst); ++ if (unlikely(err)) ++ goto out; ++ err = -EINVAL; ++ h_dentry = path->dentry; ++ if (kst.f_namelen >= NAME_MAX) ++ err = au_br_init_wh(sb, br, perm, h_dentry); ++ else ++ pr_err("%.*s(%s), unsupported namelen %ld\n", ++ AuDLNPair(h_dentry), au_sbtype(h_dentry->d_sb), ++ kst.f_namelen); ++ ++out: ++ return err; ++} ++ ++/* intialize a new branch */ ++static int au_br_init(struct au_branch *br, struct super_block *sb, ++ struct au_opt_add *add) ++{ ++ int err; ++ ++ err = 0; ++ memset(&br->br_xino, 0, sizeof(br->br_xino)); ++ mutex_init(&br->br_xino.xi_nondir_mtx); ++ br->br_perm = add->perm; ++ br->br_mnt = add->path.mnt; /* set first, mntget() later */ ++ spin_lock_init(&br->br_dykey_lock); ++ memset(br->br_dykey, 0, sizeof(br->br_dykey)); ++ atomic_set(&br->br_count, 0); ++ br->br_xino_upper = AUFS_XINO_TRUNC_INIT; ++ atomic_set(&br->br_xino_running, 0); ++ br->br_id = au_new_br_id(sb); ++ AuDebugOn(br->br_id < 0); ++ ++ if (au_br_writable(add->perm)) { ++ err = au_wbr_init(br, sb, add->perm, &add->path); ++ if (unlikely(err)) ++ goto out_err; ++ } ++ ++ if (au_opt_test(au_mntflags(sb), XINO)) { ++ err = au_xino_br(sb, br, add->path.dentry->d_inode->i_ino, ++ au_sbr(sb, 0)->br_xino.xi_file, /*do_test*/1); ++ if (unlikely(err)) { ++ AuDebugOn(br->br_xino.xi_file); ++ goto out_err; ++ } ++ } ++ ++ sysaufs_br_init(br); ++ mntget(add->path.mnt); ++ goto out; /* success */ ++ ++out_err: ++ br->br_mnt = NULL; ++out: ++ return err; ++} ++ ++static void au_br_do_add_brp(struct au_sbinfo *sbinfo, aufs_bindex_t bindex, ++ struct au_branch *br, aufs_bindex_t bend, ++ aufs_bindex_t amount) ++{ ++ struct au_branch **brp; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ brp = sbinfo->si_branch + bindex; ++ memmove(brp + 1, brp, sizeof(*brp) * amount); ++ *brp = br; ++ sbinfo->si_bend++; ++ if (unlikely(bend < 0)) ++ sbinfo->si_bend = 0; ++} ++ ++static void au_br_do_add_hdp(struct au_dinfo *dinfo, aufs_bindex_t bindex, ++ aufs_bindex_t bend, aufs_bindex_t amount) ++{ ++ struct au_hdentry *hdp; ++ ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ hdp = dinfo->di_hdentry + bindex; ++ memmove(hdp + 1, hdp, sizeof(*hdp) * amount); ++ au_h_dentry_init(hdp); ++ dinfo->di_bend++; ++ if (unlikely(bend < 0)) ++ dinfo->di_bstart = 0; ++} ++ ++static void au_br_do_add_hip(struct au_iinfo *iinfo, aufs_bindex_t bindex, ++ aufs_bindex_t bend, aufs_bindex_t amount) ++{ ++ struct au_hinode *hip; ++ ++ AuRwMustWriteLock(&iinfo->ii_rwsem); ++ ++ hip = iinfo->ii_hinode + bindex; ++ memmove(hip + 1, hip, sizeof(*hip) * amount); ++ hip->hi_inode = NULL; ++ au_hn_init(hip); ++ iinfo->ii_bend++; ++ if (unlikely(bend < 0)) ++ iinfo->ii_bstart = 0; ++} ++ ++static void au_br_do_add(struct super_block *sb, struct dentry *h_dentry, ++ struct au_branch *br, aufs_bindex_t bindex) ++{ ++ struct dentry *root; ++ struct inode *root_inode; ++ aufs_bindex_t bend, amount; ++ ++ root = sb->s_root; ++ root_inode = root->d_inode; ++ bend = au_sbend(sb); ++ amount = bend + 1 - bindex; ++ 
au_sbilist_lock(); ++ au_br_do_add_brp(au_sbi(sb), bindex, br, bend, amount); ++ au_br_do_add_hdp(au_di(root), bindex, bend, amount); ++ au_br_do_add_hip(au_ii(root_inode), bindex, bend, amount); ++ au_set_h_dptr(root, bindex, dget(h_dentry)); ++ au_set_h_iptr(root_inode, bindex, au_igrab(h_dentry->d_inode), ++ /*flags*/0); ++ au_sbilist_unlock(); ++} ++ ++int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount) ++{ ++ int err; ++ aufs_bindex_t bend, add_bindex; ++ struct dentry *root, *h_dentry; ++ struct inode *root_inode; ++ struct au_branch *add_branch; ++ ++ root = sb->s_root; ++ root_inode = root->d_inode; ++ IMustLock(root_inode); ++ err = test_add(sb, add, remount); ++ if (unlikely(err < 0)) ++ goto out; ++ if (err) { ++ err = 0; ++ goto out; /* success */ ++ } ++ ++ bend = au_sbend(sb); ++ add_branch = au_br_alloc(sb, bend + 2, add->perm); ++ err = PTR_ERR(add_branch); ++ if (IS_ERR(add_branch)) ++ goto out; ++ ++ err = au_br_init(add_branch, sb, add); ++ if (unlikely(err)) { ++ au_br_do_free(add_branch); ++ goto out; ++ } ++ ++ add_bindex = add->bindex; ++ h_dentry = add->path.dentry; ++ if (!remount) ++ au_br_do_add(sb, h_dentry, add_branch, add_bindex); ++ else { ++ sysaufs_brs_del(sb, add_bindex); ++ au_br_do_add(sb, h_dentry, add_branch, add_bindex); ++ sysaufs_brs_add(sb, add_bindex); ++ } ++ ++ if (!add_bindex) { ++ au_cpup_attr_all(root_inode, /*force*/1); ++ sb->s_maxbytes = h_dentry->d_sb->s_maxbytes; ++ } else ++ au_add_nlink(root_inode, h_dentry->d_inode); ++ ++ /* ++ * this test/set prevents aufs from handling unnecesary notify events ++ * of xino files, in case of re-adding a writable branch which was ++ * once detached from aufs. ++ */ ++ if (au_xino_brid(sb) < 0 ++ && au_br_writable(add_branch->br_perm) ++ && !au_test_fs_bad_xino(h_dentry->d_sb) ++ && add_branch->br_xino.xi_file ++ && add_branch->br_xino.xi_file->f_dentry->d_parent == h_dentry) ++ au_xino_brid_set(sb, add_branch->br_id); ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * delete a branch ++ */ ++ ++/* to show the line number, do not make it inlined function */ ++#define AuVerbose(do_info, fmt, ...) do { \ ++ if (do_info) \ ++ pr_info(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++static int au_test_ibusy(struct inode *inode, aufs_bindex_t bstart, ++ aufs_bindex_t bend) ++{ ++ return (inode && !S_ISDIR(inode->i_mode)) || bstart == bend; ++} ++ ++static int au_test_dbusy(struct dentry *dentry, aufs_bindex_t bstart, ++ aufs_bindex_t bend) ++{ ++ return au_test_ibusy(dentry->d_inode, bstart, bend); ++} ++ ++/* ++ * test if the branch is deletable or not. 
++ */ ++static int test_dentry_busy(struct dentry *root, aufs_bindex_t bindex, ++ unsigned int sigen, const unsigned int verbose) ++{ ++ int err, i, j, ndentry; ++ aufs_bindex_t bstart, bend; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry *d; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, root, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ for (i = 0; !err && i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ ndentry = dpage->ndentry; ++ for (j = 0; !err && j < ndentry; j++) { ++ d = dpage->dentries[j]; ++ AuDebugOn(!d->d_count); ++ if (!au_digen_test(d, sigen)) { ++ di_read_lock_child(d, AuLock_IR); ++ if (unlikely(au_dbrange_test(d))) { ++ di_read_unlock(d, AuLock_IR); ++ continue; ++ } ++ } else { ++ di_write_lock_child(d); ++ if (unlikely(au_dbrange_test(d))) { ++ di_write_unlock(d); ++ continue; ++ } ++ err = au_reval_dpath(d, sigen); ++ if (!err) ++ di_downgrade_lock(d, AuLock_IR); ++ else { ++ di_write_unlock(d); ++ break; ++ } ++ } ++ ++ /* AuDbgDentry(d); */ ++ bstart = au_dbstart(d); ++ bend = au_dbend(d); ++ if (bstart <= bindex ++ && bindex <= bend ++ && au_h_dptr(d, bindex) ++ && au_test_dbusy(d, bstart, bend)) { ++ err = -EBUSY; ++ AuVerbose(verbose, "busy %.*s\n", AuDLNPair(d)); ++ AuDbgDentry(d); ++ } ++ di_read_unlock(d, AuLock_IR); ++ } ++ } ++ ++out_dpages: ++ au_dpages_free(&dpages); ++out: ++ return err; ++} ++ ++static int test_inode_busy(struct super_block *sb, aufs_bindex_t bindex, ++ unsigned int sigen, const unsigned int verbose) ++{ ++ int err; ++ unsigned long long max, ull; ++ struct inode *i, **array; ++ aufs_bindex_t bstart, bend; ++ ++ array = au_iarray_alloc(sb, &max); ++ err = PTR_ERR(array); ++ if (IS_ERR(array)) ++ goto out; ++ ++ err = 0; ++ AuDbg("b%d\n", bindex); ++ for (ull = 0; !err && ull < max; ull++) { ++ i = array[ull]; ++ if (i->i_ino == AUFS_ROOT_INO) ++ continue; ++ ++ /* AuDbgInode(i); */ ++ if (au_iigen(i) == sigen) ++ ii_read_lock_child(i); ++ else { ++ ii_write_lock_child(i); ++ err = au_refresh_hinode_self(i); ++ au_iigen_dec(i); ++ if (!err) ++ ii_downgrade_lock(i); ++ else { ++ ii_write_unlock(i); ++ break; ++ } ++ } ++ ++ bstart = au_ibstart(i); ++ bend = au_ibend(i); ++ if (bstart <= bindex ++ && bindex <= bend ++ && au_h_iptr(i, bindex) ++ && au_test_ibusy(i, bstart, bend)) { ++ err = -EBUSY; ++ AuVerbose(verbose, "busy i%lu\n", i->i_ino); ++ AuDbgInode(i); ++ } ++ ii_read_unlock(i); ++ } ++ au_iarray_free(array, max); ++ ++out: ++ return err; ++} ++ ++static int test_children_busy(struct dentry *root, aufs_bindex_t bindex, ++ const unsigned int verbose) ++{ ++ int err; ++ unsigned int sigen; ++ ++ sigen = au_sigen(root->d_sb); ++ DiMustNoWaiters(root); ++ IiMustNoWaiters(root->d_inode); ++ di_write_unlock(root); ++ err = test_dentry_busy(root, bindex, sigen, verbose); ++ if (!err) ++ err = test_inode_busy(root->d_sb, bindex, sigen, verbose); ++ di_write_lock_child(root); /* aufs_write_lock() calls ..._child() */ ++ ++ return err; ++} ++ ++static void au_br_do_del_brp(struct au_sbinfo *sbinfo, ++ const aufs_bindex_t bindex, ++ const aufs_bindex_t bend) ++{ ++ struct au_branch **brp, **p; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ brp = sbinfo->si_branch + bindex; ++ if (bindex < bend) ++ memmove(brp, brp + 1, sizeof(*brp) * (bend - bindex)); ++ sbinfo->si_branch[0 + bend] = NULL; ++ sbinfo->si_bend--; ++ ++ p = krealloc(sbinfo->si_branch, sizeof(*p) * bend, AuGFP_SBILIST); ++ if (p) ++ sbinfo->si_branch = 
p; ++ /* harmless error */ ++} ++ ++static void au_br_do_del_hdp(struct au_dinfo *dinfo, const aufs_bindex_t bindex, ++ const aufs_bindex_t bend) ++{ ++ struct au_hdentry *hdp, *p; ++ ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ hdp = dinfo->di_hdentry; ++ if (bindex < bend) ++ memmove(hdp + bindex, hdp + bindex + 1, ++ sizeof(*hdp) * (bend - bindex)); ++ hdp[0 + bend].hd_dentry = NULL; ++ dinfo->di_bend--; ++ ++ p = krealloc(hdp, sizeof(*p) * bend, AuGFP_SBILIST); ++ if (p) ++ dinfo->di_hdentry = p; ++ /* harmless error */ ++} ++ ++static void au_br_do_del_hip(struct au_iinfo *iinfo, const aufs_bindex_t bindex, ++ const aufs_bindex_t bend) ++{ ++ struct au_hinode *hip, *p; ++ ++ AuRwMustWriteLock(&iinfo->ii_rwsem); ++ ++ hip = iinfo->ii_hinode + bindex; ++ if (bindex < bend) ++ memmove(hip, hip + 1, sizeof(*hip) * (bend - bindex)); ++ iinfo->ii_hinode[0 + bend].hi_inode = NULL; ++ au_hn_init(iinfo->ii_hinode + bend); ++ iinfo->ii_bend--; ++ ++ p = krealloc(iinfo->ii_hinode, sizeof(*p) * bend, AuGFP_SBILIST); ++ if (p) ++ iinfo->ii_hinode = p; ++ /* harmless error */ ++} ++ ++static void au_br_do_del(struct super_block *sb, aufs_bindex_t bindex, ++ struct au_branch *br) ++{ ++ aufs_bindex_t bend; ++ struct au_sbinfo *sbinfo; ++ struct dentry *root, *h_root; ++ struct inode *inode, *h_inode; ++ struct au_hinode *hinode; ++ ++ SiMustWriteLock(sb); ++ ++ root = sb->s_root; ++ inode = root->d_inode; ++ sbinfo = au_sbi(sb); ++ bend = sbinfo->si_bend; ++ ++ h_root = au_h_dptr(root, bindex); ++ hinode = au_hi(inode, bindex); ++ h_inode = au_igrab(hinode->hi_inode); ++ au_hiput(hinode); ++ ++ au_sbilist_lock(); ++ au_br_do_del_brp(sbinfo, bindex, bend); ++ au_br_do_del_hdp(au_di(root), bindex, bend); ++ au_br_do_del_hip(au_ii(inode), bindex, bend); ++ au_sbilist_unlock(); ++ ++ dput(h_root); ++ iput(h_inode); ++ au_br_do_free(br); ++} ++ ++int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount) ++{ ++ int err, rerr, i; ++ unsigned int mnt_flags; ++ aufs_bindex_t bindex, bend, br_id; ++ unsigned char do_wh, verbose; ++ struct au_branch *br; ++ struct au_wbr *wbr; ++ ++ err = 0; ++ bindex = au_find_dbindex(sb->s_root, del->h_path.dentry); ++ if (bindex < 0) { ++ if (remount) ++ goto out; /* success */ ++ err = -ENOENT; ++ pr_err("%s no such branch\n", del->pathname); ++ goto out; ++ } ++ AuDbg("bindex b%d\n", bindex); ++ ++ err = -EBUSY; ++ mnt_flags = au_mntflags(sb); ++ verbose = !!au_opt_test(mnt_flags, VERBOSE); ++ bend = au_sbend(sb); ++ if (unlikely(!bend)) { ++ AuVerbose(verbose, "no more branches left\n"); ++ goto out; ++ } ++ br = au_sbr(sb, bindex); ++ i = atomic_read(&br->br_count); ++ if (unlikely(i)) { ++ AuVerbose(verbose, "%d file(s) opened\n", i); ++ goto out; ++ } ++ ++ wbr = br->br_wbr; ++ do_wh = wbr && (wbr->wbr_whbase || wbr->wbr_plink || wbr->wbr_orph); ++ if (do_wh) { ++ /* instead of WbrWhMustWriteLock(wbr) */ ++ SiMustWriteLock(sb); ++ for (i = 0; i < AuBrWh_Last; i++) { ++ dput(wbr->wbr_wh[i]); ++ wbr->wbr_wh[i] = NULL; ++ } ++ } ++ ++ err = test_children_busy(sb->s_root, bindex, verbose); ++ if (unlikely(err)) { ++ if (do_wh) ++ goto out_wh; ++ goto out; ++ } ++ ++ err = 0; ++ br_id = br->br_id; ++ if (!remount) ++ au_br_do_del(sb, bindex, br); ++ else { ++ sysaufs_brs_del(sb, bindex); ++ au_br_do_del(sb, bindex, br); ++ sysaufs_brs_add(sb, bindex); ++ } ++ ++ if (!bindex) { ++ au_cpup_attr_all(sb->s_root->d_inode, /*force*/1); ++ sb->s_maxbytes = au_sbr_sb(sb, 0)->s_maxbytes; ++ } else ++ au_sub_nlink(sb->s_root->d_inode, del->h_path.dentry->d_inode); 
++ if (au_opt_test(mnt_flags, PLINK)) ++ au_plink_half_refresh(sb, br_id); ++ ++ if (au_xino_brid(sb) == br_id) ++ au_xino_brid_set(sb, -1); ++ goto out; /* success */ ++ ++out_wh: ++ /* revert */ ++ rerr = au_br_init_wh(sb, br, br->br_perm, del->h_path.dentry); ++ if (rerr) ++ pr_warning("failed re-creating base whiteout, %s. (%d)\n", ++ del->pathname, rerr); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_ibusy(struct super_block *sb, struct aufs_ibusy __user *arg) ++{ ++ int err; ++ aufs_bindex_t bstart, bend; ++ struct aufs_ibusy ibusy; ++ struct inode *inode, *h_inode; ++ ++ err = -EPERM; ++ if (unlikely(!capable(CAP_SYS_ADMIN))) ++ goto out; ++ ++ err = copy_from_user(&ibusy, arg, sizeof(ibusy)); ++ if (!err) ++ err = !access_ok(VERIFY_WRITE, &arg->h_ino, sizeof(arg->h_ino)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ ++ err = -EINVAL; ++ si_read_lock(sb, AuLock_FLUSH); ++ if (unlikely(ibusy.bindex < 0 || ibusy.bindex > au_sbend(sb))) ++ goto out_unlock; ++ ++ err = 0; ++ ibusy.h_ino = 0; /* invalid */ ++ inode = ilookup(sb, ibusy.ino); ++ if (!inode ++ || inode->i_ino == AUFS_ROOT_INO ++ || is_bad_inode(inode)) ++ goto out_unlock; ++ ++ ii_read_lock_child(inode); ++ bstart = au_ibstart(inode); ++ bend = au_ibend(inode); ++ if (bstart <= ibusy.bindex && ibusy.bindex <= bend) { ++ h_inode = au_h_iptr(inode, ibusy.bindex); ++ if (h_inode && au_test_ibusy(inode, bstart, bend)) ++ ibusy.h_ino = h_inode->i_ino; ++ } ++ ii_read_unlock(inode); ++ iput(inode); ++ ++out_unlock: ++ si_read_unlock(sb); ++ if (!err) { ++ err = __put_user(ibusy.h_ino, &arg->h_ino); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ } ++ } ++out: ++ return err; ++} ++ ++long au_ibusy_ioctl(struct file *file, unsigned long arg) ++{ ++ return au_ibusy(file->f_dentry->d_sb, (void __user *)arg); ++} ++ ++#ifdef CONFIG_COMPAT ++long au_ibusy_compat_ioctl(struct file *file, unsigned long arg) ++{ ++ return au_ibusy(file->f_dentry->d_sb, compat_ptr(arg)); ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * change a branch permission ++ */ ++ ++static void au_warn_ima(void) ++{ ++#ifdef CONFIG_IMA ++ /* since it doesn't support mark_files_ro() */ ++ AuWarn1("RW -> RO makes IMA to produce wrong message\n"); ++#endif ++} ++ ++static int do_need_sigen_inc(int a, int b) ++{ ++ return au_br_whable(a) && !au_br_whable(b); ++} ++ ++static int need_sigen_inc(int old, int new) ++{ ++ return do_need_sigen_inc(old, new) ++ || do_need_sigen_inc(new, old); ++} ++ ++static unsigned long long au_farray_cb(void *a, ++ unsigned long long max __maybe_unused, ++ void *arg) ++{ ++ unsigned long long n; ++ struct file **p, *f; ++ struct super_block *sb = arg; ++ ++ n = 0; ++ p = a; ++ lg_global_lock(files_lglock); ++ do_file_list_for_each_entry(sb, f) { ++ if (au_fi(f) ++ && file_count(f) ++ && !special_file(f->f_dentry->d_inode->i_mode)) { ++ get_file(f); ++ *p++ = f; ++ n++; ++ AuDebugOn(n > max); ++ } ++ } while_file_list_for_each_entry; ++ lg_global_unlock(files_lglock); ++ ++ return n; ++} ++ ++static struct file **au_farray_alloc(struct super_block *sb, ++ unsigned long long *max) ++{ ++ *max = atomic_long_read(&au_sbi(sb)->si_nfiles); ++ return au_array_alloc(max, au_farray_cb, sb); ++} ++ ++static void au_farray_free(struct file **a, unsigned long long max) ++{ ++ unsigned long long ull; ++ ++ for (ull = 0; ull < max; ull++) ++ if (a[ull]) ++ 
fput(a[ull]); ++ au_array_free(a); ++} ++ ++static int au_br_mod_files_ro(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ int err, do_warn; ++ unsigned int mnt_flags; ++ unsigned long long ull, max; ++ aufs_bindex_t br_id; ++ unsigned char verbose; ++ struct file *file, *hf, **array; ++ struct inode *inode; ++ struct au_hfile *hfile; ++ ++ mnt_flags = au_mntflags(sb); ++ verbose = !!au_opt_test(mnt_flags, VERBOSE); ++ ++ array = au_farray_alloc(sb, &max); ++ err = PTR_ERR(array); ++ if (IS_ERR(array)) ++ goto out; ++ ++ do_warn = 0; ++ br_id = au_sbr_id(sb, bindex); ++ for (ull = 0; ull < max; ull++) { ++ file = array[ull]; ++ ++ /* AuDbg("%.*s\n", AuDLNPair(file->f_dentry)); */ ++ fi_read_lock(file); ++ if (unlikely(au_test_mmapped(file))) { ++ err = -EBUSY; ++ AuVerbose(verbose, "mmapped %.*s\n", ++ AuDLNPair(file->f_dentry)); ++ AuDbgFile(file); ++ FiMustNoWaiters(file); ++ fi_read_unlock(file); ++ goto out_array; ++ } ++ ++ inode = file->f_dentry->d_inode; ++ hfile = &au_fi(file)->fi_htop; ++ hf = hfile->hf_file; ++ if (!S_ISREG(inode->i_mode) ++ || !(file->f_mode & FMODE_WRITE) ++ || hfile->hf_br->br_id != br_id ++ || !(hf->f_mode & FMODE_WRITE)) ++ array[ull] = NULL; ++ else { ++ do_warn = 1; ++ get_file(file); ++ } ++ ++ FiMustNoWaiters(file); ++ fi_read_unlock(file); ++ fput(file); ++ } ++ ++ err = 0; ++ if (do_warn) ++ au_warn_ima(); ++ ++ for (ull = 0; ull < max; ull++) { ++ file = array[ull]; ++ if (!file) ++ continue; ++ ++ /* todo: already flushed? */ ++ /* cf. fs/super.c:mark_files_ro() */ ++ /* fi_read_lock(file); */ ++ hfile = &au_fi(file)->fi_htop; ++ hf = hfile->hf_file; ++ /* fi_read_unlock(file); */ ++ spin_lock(&hf->f_lock); ++ hf->f_mode &= ~FMODE_WRITE; ++ spin_unlock(&hf->f_lock); ++ if (!file_check_writeable(hf)) { ++ file_release_write(hf); ++ mnt_drop_write(hf->f_vfsmnt); ++ } ++ } ++ ++out_array: ++ au_farray_free(array, max); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, ++ int *do_refresh) ++{ ++ int err, rerr; ++ aufs_bindex_t bindex; ++ struct path path; ++ struct dentry *root; ++ struct au_branch *br; ++ ++ root = sb->s_root; ++ bindex = au_find_dbindex(root, mod->h_root); ++ if (bindex < 0) { ++ if (remount) ++ return 0; /* success */ ++ err = -ENOENT; ++ pr_err("%s no such branch\n", mod->path); ++ goto out; ++ } ++ AuDbg("bindex b%d\n", bindex); ++ ++ err = test_br(mod->h_root->d_inode, mod->perm, mod->path); ++ if (unlikely(err)) ++ goto out; ++ ++ br = au_sbr(sb, bindex); ++ if (br->br_perm == mod->perm) ++ return 0; /* success */ ++ ++ if (au_br_writable(br->br_perm)) { ++ /* remove whiteout base */ ++ err = au_br_init_wh(sb, br, mod->perm, mod->h_root); ++ if (unlikely(err)) ++ goto out; ++ ++ if (!au_br_writable(mod->perm)) { ++ /* rw --> ro, file might be mmapped */ ++ DiMustNoWaiters(root); ++ IiMustNoWaiters(root->d_inode); ++ di_write_unlock(root); ++ err = au_br_mod_files_ro(sb, bindex); ++ /* aufs_write_lock() calls ..._child() */ ++ di_write_lock_child(root); ++ ++ if (unlikely(err)) { ++ rerr = -ENOMEM; ++ br->br_wbr = kmalloc(sizeof(*br->br_wbr), ++ GFP_NOFS); ++ if (br->br_wbr) { ++ path.mnt = br->br_mnt; ++ path.dentry = mod->h_root; ++ rerr = au_wbr_init(br, sb, br->br_perm, ++ &path); ++ } ++ if (unlikely(rerr)) { ++ AuIOErr("nested error %d (%d)\n", ++ rerr, err); ++ br->br_perm = mod->perm; ++ } ++ } ++ } ++ } else if (au_br_writable(mod->perm)) { ++ /* ro --> rw */ ++ err = -ENOMEM; ++ br->br_wbr = kmalloc(sizeof(*br->br_wbr), GFP_NOFS); ++ if 
(br->br_wbr) { ++ path.mnt = br->br_mnt; ++ path.dentry = mod->h_root; ++ err = au_wbr_init(br, sb, mod->perm, &path); ++ if (unlikely(err)) { ++ kfree(br->br_wbr); ++ br->br_wbr = NULL; ++ } ++ } ++ } ++ ++ if (!err) { ++ *do_refresh |= need_sigen_inc(br->br_perm, mod->perm); ++ br->br_perm = mod->perm; ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/branch.h 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,230 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * branch filesystems and xino for them ++ */ ++ ++#ifndef __AUFS_BRANCH_H__ ++#define __AUFS_BRANCH_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include "dynop.h" ++#include "rwsem.h" ++#include "super.h" ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* a xino file */ ++struct au_xino_file { ++ struct file *xi_file; ++ struct mutex xi_nondir_mtx; ++ ++ /* todo: make xino files an array to support huge inode number */ ++ ++#ifdef CONFIG_DEBUG_FS ++ struct dentry *xi_dbgaufs; ++#endif ++}; ++ ++/* members for writable branch only */ ++enum {AuBrWh_BASE, AuBrWh_PLINK, AuBrWh_ORPH, AuBrWh_Last}; ++struct au_wbr { ++ struct au_rwsem wbr_wh_rwsem; ++ struct dentry *wbr_wh[AuBrWh_Last]; ++ atomic_t wbr_wh_running; ++#define wbr_whbase wbr_wh[AuBrWh_BASE] /* whiteout base */ ++#define wbr_plink wbr_wh[AuBrWh_PLINK] /* pseudo-link dir */ ++#define wbr_orph wbr_wh[AuBrWh_ORPH] /* dir for orphans */ ++ ++ /* mfs mode */ ++ unsigned long long wbr_bytes; ++}; ++ ++/* ext2 has 3 types of operations at least, ext3 has 4 */ ++#define AuBrDynOp (AuDyLast * 4) ++ ++/* protected by superblock rwsem */ ++struct au_branch { ++ struct au_xino_file br_xino; ++ ++ aufs_bindex_t br_id; ++ ++ int br_perm; ++ struct vfsmount *br_mnt; ++ spinlock_t br_dykey_lock; ++ struct au_dykey *br_dykey[AuBrDynOp]; ++ atomic_t br_count; ++ ++ struct au_wbr *br_wbr; ++ ++ /* xino truncation */ ++ blkcnt_t br_xino_upper; /* watermark in blocks */ ++ atomic_t br_xino_running; ++ ++#ifdef CONFIG_AUFS_HFSNOTIFY ++ struct fsnotify_group *br_hfsn_group; ++ struct fsnotify_ops br_hfsn_ops; ++#endif ++ ++#ifdef CONFIG_SYSFS ++ /* an entry under sysfs per mount-point */ ++ char br_name[8]; ++ struct attribute br_attr; ++#endif ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* branch permissions and attributes */ ++#define AuBrPerm_RW 1 /* writable, hardlinkable wh */ ++#define AuBrPerm_RO (1 << 1) /* readonly */ ++#define AuBrPerm_RR (1 << 2) /* natively readonly */ ++#define AuBrPerm_Mask (AuBrPerm_RW | AuBrPerm_RO | AuBrPerm_RR) ++ ++#define AuBrRAttr_WH (1 << 3) /* whiteout-able */ ++ ++#define AuBrWAttr_NoLinkWH (1 << 4) /* un-hardlinkable whiteouts */ ++ ++static 
inline int au_br_writable(int brperm) ++{ ++ return brperm & AuBrPerm_RW; ++} ++ ++static inline int au_br_whable(int brperm) ++{ ++ return brperm & (AuBrPerm_RW | AuBrRAttr_WH); ++} ++ ++static inline int au_br_wh_linkable(int brperm) ++{ ++ return !(brperm & AuBrWAttr_NoLinkWH); ++} ++ ++static inline int au_br_rdonly(struct au_branch *br) ++{ ++ return ((br->br_mnt->mnt_sb->s_flags & MS_RDONLY) ++ || !au_br_writable(br->br_perm)) ++ ? -EROFS : 0; ++} ++ ++static inline int au_br_hnotifyable(int brperm __maybe_unused) ++{ ++#ifdef CONFIG_AUFS_HNOTIFY ++ return !(brperm & AuBrPerm_RR); ++#else ++ return 0; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* branch.c */ ++struct au_sbinfo; ++void au_br_free(struct au_sbinfo *sinfo); ++int au_br_index(struct super_block *sb, aufs_bindex_t br_id); ++struct au_opt_add; ++int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount); ++struct au_opt_del; ++int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount); ++long au_ibusy_ioctl(struct file *file, unsigned long arg); ++#ifdef CONFIG_COMPAT ++long au_ibusy_compat_ioctl(struct file *file, unsigned long arg); ++#endif ++struct au_opt_mod; ++int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, ++ int *do_refresh); ++ ++/* xino.c */ ++static const loff_t au_loff_max = LLONG_MAX; ++ ++int au_xib_trunc(struct super_block *sb); ++ssize_t xino_fread(au_readf_t func, struct file *file, void *buf, size_t size, ++ loff_t *pos); ++ssize_t xino_fwrite(au_writef_t func, struct file *file, void *buf, size_t size, ++ loff_t *pos); ++struct file *au_xino_create2(struct file *base_file, struct file *copy_src); ++struct file *au_xino_create(struct super_block *sb, char *fname, int silent); ++ino_t au_xino_new_ino(struct super_block *sb); ++void au_xino_delete_inode(struct inode *inode, const int unlinked); ++int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t ino); ++int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t *ino); ++int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t hino, ++ struct file *base_file, int do_test); ++int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex); ++ ++struct au_opt_xino; ++int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount); ++void au_xino_clr(struct super_block *sb); ++struct file *au_xino_def(struct super_block *sb); ++int au_xino_path(struct seq_file *seq, struct file *file); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* Superblock to branch */ ++static inline ++aufs_bindex_t au_sbr_id(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr(sb, bindex)->br_id; ++} ++ ++static inline ++struct vfsmount *au_sbr_mnt(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr(sb, bindex)->br_mnt; ++} ++ ++static inline ++struct super_block *au_sbr_sb(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr_mnt(sb, bindex)->mnt_sb; ++} ++ ++static inline void au_sbr_put(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ atomic_dec(&au_sbr(sb, bindex)->br_count); ++} ++ ++static inline int au_sbr_perm(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_sbr(sb, bindex)->br_perm; ++} ++ ++static inline int au_sbr_whable(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ return au_br_whable(au_sbr_perm(sb, bindex)); ++} ++ ++/* 
---------------------------------------------------------------------- */ ++ ++/* ++ * wbr_wh_read_lock, wbr_wh_write_lock ++ * wbr_wh_read_unlock, wbr_wh_write_unlock, wbr_wh_downgrade_lock ++ */ ++AuSimpleRwsemFuncs(wbr_wh, struct au_wbr *wbr, &wbr->wbr_wh_rwsem); ++ ++#define WbrWhMustNoWaiters(wbr) AuRwMustNoWaiters(&wbr->wbr_wh_rwsem) ++#define WbrWhMustAnyLock(wbr) AuRwMustAnyLock(&wbr->wbr_wh_rwsem) ++#define WbrWhMustWriteLock(wbr) AuRwMustWriteLock(&wbr->wbr_wh_rwsem) ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_BRANCH_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/conf.mk 2011-10-25 09:52:26.000000000 +0200 +@@ -0,0 +1,38 @@ ++ ++AuConfStr = CONFIG_AUFS_FS=${CONFIG_AUFS_FS} ++ ++define AuConf ++ifdef ${1} ++AuConfStr += ${1}=${${1}} ++endif ++endef ++ ++AuConfAll = BRANCH_MAX_127 BRANCH_MAX_511 BRANCH_MAX_1023 BRANCH_MAX_32767 \ ++ SBILIST \ ++ HNOTIFY HFSNOTIFY \ ++ EXPORT INO_T_64 \ ++ RDU \ ++ PROC_MAP \ ++ SP_IATTR \ ++ SHWH \ ++ BR_RAMFS \ ++ BR_FUSE POLL \ ++ BR_HFSPLUS \ ++ BDEV_LOOP \ ++ DEBUG MAGIC_SYSRQ ++$(foreach i, ${AuConfAll}, \ ++ $(eval $(call AuConf,CONFIG_AUFS_${i}))) ++ ++AuConfName = ${obj}/conf.str ++${AuConfName}.tmp: FORCE ++ @echo ${AuConfStr} | tr ' ' '\n' | sed -e 's/^/"/' -e 's/$$/\\n"/' > $@ ++${AuConfName}: ${AuConfName}.tmp ++ @diff -q $< $@ > /dev/null 2>&1 || { \ ++ echo ' GEN ' $@; \ ++ cp -p $< $@; \ ++ } ++FORCE: ++clean-files += ${AuConfName} ${AuConfName}.tmp ++${obj}/sysfs.o: ${AuConfName} ++ ++-include ${srctree}/${src}/conf_priv.mk +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/cpup.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,1084 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * copy-up functions, see wbr_policy.c for copy-down ++ */ ++ ++#include ++#include ++#include "aufs.h" ++ ++void au_cpup_attr_flags(struct inode *dst, struct inode *src) ++{ ++ const unsigned int mask = S_DEAD | S_SWAPFILE | S_PRIVATE ++ | S_NOATIME | S_NOCMTIME; ++ ++ dst->i_flags |= src->i_flags & ~mask; ++ if (au_test_fs_notime(dst->i_sb)) ++ dst->i_flags |= S_NOATIME | S_NOCMTIME; ++} ++ ++void au_cpup_attr_timesizes(struct inode *inode) ++{ ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ fsstack_copy_attr_times(inode, h_inode); ++ fsstack_copy_inode_size(inode, h_inode); ++} ++ ++void au_cpup_attr_nlink(struct inode *inode, int force) ++{ ++ struct inode *h_inode; ++ struct super_block *sb; ++ aufs_bindex_t bindex, bend; ++ ++ sb = inode->i_sb; ++ bindex = au_ibstart(inode); ++ h_inode = au_h_iptr(inode, bindex); ++ if (!force ++ && !S_ISDIR(h_inode->i_mode) ++ && au_opt_test(au_mntflags(sb), PLINK) ++ && au_plink_test(inode)) ++ return; ++ ++ /* ++ * 0 can happen in revalidating. 
++ * h_inode->i_mutex is not held, but it is harmless since once i_nlink ++ * reaches 0, it will never become positive. ++ */ ++ vfsub_set_nlink(inode, h_inode->i_nlink); ++ ++ /* ++ * fewer nlink makes find(1) noisy, but larger nlink doesn't. ++ * it may includes whplink directory. ++ */ ++ if (S_ISDIR(h_inode->i_mode)) { ++ bend = au_ibend(inode); ++ for (bindex++; bindex <= bend; bindex++) { ++ h_inode = au_h_iptr(inode, bindex); ++ if (h_inode) ++ au_add_nlink(inode, h_inode); ++ } ++ } ++} ++ ++void au_cpup_attr_changeable(struct inode *inode) ++{ ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ inode->i_mode = h_inode->i_mode; ++ inode->i_uid = h_inode->i_uid; ++ inode->i_gid = h_inode->i_gid; ++ au_cpup_attr_timesizes(inode); ++ au_cpup_attr_flags(inode, h_inode); ++} ++ ++void au_cpup_igen(struct inode *inode, struct inode *h_inode) ++{ ++ struct au_iinfo *iinfo = au_ii(inode); ++ ++ IiMustWriteLock(inode); ++ ++ iinfo->ii_higen = h_inode->i_generation; ++ iinfo->ii_hsb1 = h_inode->i_sb; ++} ++ ++void au_cpup_attr_all(struct inode *inode, int force) ++{ ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ au_cpup_attr_changeable(inode); ++ if (inode->i_nlink > 0) ++ au_cpup_attr_nlink(inode, force); ++ inode->i_rdev = h_inode->i_rdev; ++ inode->i_blkbits = h_inode->i_blkbits; ++ au_cpup_igen(inode, h_inode); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* Note: dt_dentry and dt_h_dentry are not dget/dput-ed */ ++ ++/* keep the timestamps of the parent dir when cpup */ ++void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, ++ struct path *h_path) ++{ ++ struct inode *h_inode; ++ ++ dt->dt_dentry = dentry; ++ dt->dt_h_path = *h_path; ++ h_inode = h_path->dentry->d_inode; ++ dt->dt_atime = h_inode->i_atime; ++ dt->dt_mtime = h_inode->i_mtime; ++ /* smp_mb(); */ ++} ++ ++void au_dtime_revert(struct au_dtime *dt) ++{ ++ struct iattr attr; ++ int err; ++ ++ attr.ia_atime = dt->dt_atime; ++ attr.ia_mtime = dt->dt_mtime; ++ attr.ia_valid = ATTR_FORCE | ATTR_MTIME | ATTR_MTIME_SET ++ | ATTR_ATIME | ATTR_ATIME_SET; ++ ++ err = vfsub_notify_change(&dt->dt_h_path, &attr); ++ if (unlikely(err)) ++ pr_warning("restoring timestamps failed(%d). ignored\n", err); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static noinline_for_stack ++int cpup_iattr(struct dentry *dst, aufs_bindex_t bindex, struct dentry *h_src) ++{ ++ int err, sbits; ++ struct iattr ia; ++ struct path h_path; ++ struct inode *h_isrc, *h_idst; ++ ++ h_path.dentry = au_h_dptr(dst, bindex); ++ h_idst = h_path.dentry->d_inode; ++ h_path.mnt = au_sbr_mnt(dst->d_sb, bindex); ++ h_isrc = h_src->d_inode; ++ ia.ia_valid = ATTR_FORCE | ATTR_UID | ATTR_GID ++ | ATTR_ATIME | ATTR_MTIME ++ | ATTR_ATIME_SET | ATTR_MTIME_SET; ++ ia.ia_uid = h_isrc->i_uid; ++ ia.ia_gid = h_isrc->i_gid; ++ ia.ia_atime = h_isrc->i_atime; ++ ia.ia_mtime = h_isrc->i_mtime; ++ if (h_idst->i_mode != h_isrc->i_mode ++ && !S_ISLNK(h_idst->i_mode)) { ++ ia.ia_valid |= ATTR_MODE; ++ ia.ia_mode = h_isrc->i_mode; ++ } ++ sbits = !!(h_isrc->i_mode & (S_ISUID | S_ISGID)); ++ au_cpup_attr_flags(h_idst, h_isrc); ++ err = vfsub_notify_change(&h_path, &ia); ++ ++ /* is this nfs only? 
*/ ++ if (!err && sbits && au_test_nfs(h_path.dentry->d_sb)) { ++ ia.ia_valid = ATTR_FORCE | ATTR_MODE; ++ ia.ia_mode = h_isrc->i_mode; ++ err = vfsub_notify_change(&h_path, &ia); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_copy_file(struct file *dst, struct file *src, loff_t len, ++ char *buf, unsigned long blksize) ++{ ++ int err; ++ size_t sz, rbytes, wbytes; ++ unsigned char all_zero; ++ char *p, *zp; ++ struct mutex *h_mtx; ++ /* reduce stack usage */ ++ struct iattr *ia; ++ ++ zp = page_address(ZERO_PAGE(0)); ++ if (unlikely(!zp)) ++ return -ENOMEM; /* possible? */ ++ ++ err = 0; ++ all_zero = 0; ++ while (len) { ++ AuDbg("len %lld\n", len); ++ sz = blksize; ++ if (len < blksize) ++ sz = len; ++ ++ rbytes = 0; ++ /* todo: signal_pending? */ ++ while (!rbytes || err == -EAGAIN || err == -EINTR) { ++ rbytes = vfsub_read_k(src, buf, sz, &src->f_pos); ++ err = rbytes; ++ } ++ if (unlikely(err < 0)) ++ break; ++ ++ all_zero = 0; ++ if (len >= rbytes && rbytes == blksize) ++ all_zero = !memcmp(buf, zp, rbytes); ++ if (!all_zero) { ++ wbytes = rbytes; ++ p = buf; ++ while (wbytes) { ++ size_t b; ++ ++ b = vfsub_write_k(dst, p, wbytes, &dst->f_pos); ++ err = b; ++ /* todo: signal_pending? */ ++ if (unlikely(err == -EAGAIN || err == -EINTR)) ++ continue; ++ if (unlikely(err < 0)) ++ break; ++ wbytes -= b; ++ p += b; ++ } ++ } else { ++ loff_t res; ++ ++ AuLabel(hole); ++ res = vfsub_llseek(dst, rbytes, SEEK_CUR); ++ err = res; ++ if (unlikely(res < 0)) ++ break; ++ } ++ len -= rbytes; ++ err = 0; ++ } ++ ++ /* the last block may be a hole */ ++ if (!err && all_zero) { ++ AuLabel(last hole); ++ ++ err = 1; ++ if (au_test_nfs(dst->f_dentry->d_sb)) { ++ /* nfs requires this step to make last hole */ ++ /* is this only nfs? */ ++ do { ++ /* todo: signal_pending? */ ++ err = vfsub_write_k(dst, "\0", 1, &dst->f_pos); ++ } while (err == -EAGAIN || err == -EINTR); ++ if (err == 1) ++ dst->f_pos--; ++ } ++ ++ if (err == 1) { ++ ia = (void *)buf; ++ ia->ia_size = dst->f_pos; ++ ia->ia_valid = ATTR_SIZE | ATTR_FILE; ++ ia->ia_file = dst; ++ h_mtx = &dst->f_dentry->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD2); ++ err = vfsub_notify_change(&dst->f_path, ia); ++ mutex_unlock(h_mtx); ++ } ++ } ++ ++ return err; ++} ++ ++int au_copy_file(struct file *dst, struct file *src, loff_t len) ++{ ++ int err; ++ unsigned long blksize; ++ unsigned char do_kfree; ++ char *buf; ++ ++ err = -ENOMEM; ++ blksize = dst->f_dentry->d_sb->s_blocksize; ++ if (!blksize || PAGE_SIZE < blksize) ++ blksize = PAGE_SIZE; ++ AuDbg("blksize %lu\n", blksize); ++ do_kfree = (blksize != PAGE_SIZE && blksize >= sizeof(struct iattr *)); ++ if (do_kfree) ++ buf = kmalloc(blksize, GFP_NOFS); ++ else ++ buf = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!buf)) ++ goto out; ++ ++ if (len > (1 << 22)) ++ AuDbg("copying a large file %lld\n", (long long)len); ++ ++ src->f_pos = 0; ++ dst->f_pos = 0; ++ err = au_do_copy_file(dst, src, len, buf, blksize); ++ if (do_kfree) ++ kfree(buf); ++ else ++ free_page((unsigned long)buf); ++ ++out: ++ return err; ++} ++ ++/* ++ * to support a sparse file which is opened with O_APPEND, ++ * we need to close the file. 
++ */ ++static int au_cp_regular(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len) ++{ ++ int err, i; ++ enum { SRC, DST }; ++ struct { ++ aufs_bindex_t bindex; ++ unsigned int flags; ++ struct dentry *dentry; ++ struct file *file; ++ void *label, *label_file; ++ } *f, file[] = { ++ { ++ .bindex = bsrc, ++ .flags = O_RDONLY | O_NOATIME | O_LARGEFILE, ++ .file = NULL, ++ .label = &&out, ++ .label_file = &&out_src ++ }, ++ { ++ .bindex = bdst, ++ .flags = O_WRONLY | O_NOATIME | O_LARGEFILE, ++ .file = NULL, ++ .label = &&out_src, ++ .label_file = &&out_dst ++ } ++ }; ++ struct super_block *sb; ++ ++ /* bsrc branch can be ro/rw. */ ++ sb = dentry->d_sb; ++ f = file; ++ for (i = 0; i < 2; i++, f++) { ++ f->dentry = au_h_dptr(dentry, f->bindex); ++ f->file = au_h_open(dentry, f->bindex, f->flags, /*file*/NULL); ++ err = PTR_ERR(f->file); ++ if (IS_ERR(f->file)) ++ goto *f->label; ++ err = -EINVAL; ++ if (unlikely(!f->file->f_op)) ++ goto *f->label_file; ++ } ++ ++ /* try stopping to update while we copyup */ ++ IMustLock(file[SRC].dentry->d_inode); ++ err = au_copy_file(file[DST].file, file[SRC].file, len); ++ ++out_dst: ++ fput(file[DST].file); ++ au_sbr_put(sb, file[DST].bindex); ++out_src: ++ fput(file[SRC].file); ++ au_sbr_put(sb, file[SRC].bindex); ++out: ++ return err; ++} ++ ++static int au_do_cpup_regular(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, ++ struct inode *h_dir, struct path *h_path) ++{ ++ int err, rerr; ++ loff_t l; ++ ++ err = 0; ++ l = i_size_read(au_h_iptr(dentry->d_inode, bsrc)); ++ if (len == -1 || l < len) ++ len = l; ++ if (len) ++ err = au_cp_regular(dentry, bdst, bsrc, len); ++ if (!err) ++ goto out; /* success */ ++ ++ rerr = vfsub_unlink(h_dir, h_path, /*force*/0); ++ if (rerr) { ++ AuIOErr("failed unlinking cpup-ed %.*s(%d, %d)\n", ++ AuDLNPair(h_path->dentry), err, rerr); ++ err = -EIO; ++ } ++ ++out: ++ return err; ++} ++ ++static int au_do_cpup_symlink(struct path *h_path, struct dentry *h_src, ++ struct inode *h_dir) ++{ ++ int err, symlen; ++ mm_segment_t old_fs; ++ union { ++ char *k; ++ char __user *u; ++ } sym; ++ ++ err = -ENOSYS; ++ if (unlikely(!h_src->d_inode->i_op->readlink)) ++ goto out; ++ ++ err = -ENOMEM; ++ sym.k = __getname_gfp(GFP_NOFS); ++ if (unlikely(!sym.k)) ++ goto out; ++ ++ /* unnecessary to support mmap_sem since symlink is not mmap-able */ ++ old_fs = get_fs(); ++ set_fs(KERNEL_DS); ++ symlen = h_src->d_inode->i_op->readlink(h_src, sym.u, PATH_MAX); ++ err = symlen; ++ set_fs(old_fs); ++ ++ if (symlen > 0) { ++ sym.k[symlen] = 0; ++ err = vfsub_symlink(h_dir, h_path, sym.k); ++ } ++ __putname(sym.k); ++ ++out: ++ return err; ++} ++ ++/* return with the lower dst inode is locked */ ++static noinline_for_stack ++int cpup_entry(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, unsigned int flags, ++ struct dentry *dst_parent) ++{ ++ int err; ++ umode_t mode; ++ unsigned int mnt_flags; ++ unsigned char isdir; ++ const unsigned char do_dt = !!au_ftest_cpup(flags, DTIME); ++ struct au_dtime dt; ++ struct path h_path; ++ struct dentry *h_src, *h_dst, *h_parent; ++ struct inode *h_inode, *h_dir; ++ struct super_block *sb; ++ ++ /* bsrc branch can be ro/rw. 
*/ ++ h_src = au_h_dptr(dentry, bsrc); ++ h_inode = h_src->d_inode; ++ AuDebugOn(h_inode != au_h_iptr(dentry->d_inode, bsrc)); ++ ++ /* try stopping to be referenced while we are creating */ ++ h_dst = au_h_dptr(dentry, bdst); ++ h_parent = h_dst->d_parent; /* dir inode is locked */ ++ h_dir = h_parent->d_inode; ++ IMustLock(h_dir); ++ AuDebugOn(h_parent != h_dst->d_parent); ++ ++ sb = dentry->d_sb; ++ h_path.mnt = au_sbr_mnt(sb, bdst); ++ if (do_dt) { ++ h_path.dentry = h_parent; ++ au_dtime_store(&dt, dst_parent, &h_path); ++ } ++ h_path.dentry = h_dst; ++ ++ isdir = 0; ++ mode = h_inode->i_mode; ++ switch (mode & S_IFMT) { ++ case S_IFREG: ++ /* try stopping to update while we are referencing */ ++ IMustLock(h_inode); ++ err = vfsub_create(h_dir, &h_path, mode | S_IWUSR); ++ if (!err) ++ err = au_do_cpup_regular ++ (dentry, bdst, bsrc, len, ++ au_h_iptr(dst_parent->d_inode, bdst), &h_path); ++ break; ++ case S_IFDIR: ++ isdir = 1; ++ err = vfsub_mkdir(h_dir, &h_path, mode); ++ if (!err) { ++ /* ++ * strange behaviour from the users view, ++ * particularry setattr case ++ */ ++ if (au_ibstart(dst_parent->d_inode) == bdst) ++ au_cpup_attr_nlink(dst_parent->d_inode, ++ /*force*/1); ++ au_cpup_attr_nlink(dentry->d_inode, /*force*/1); ++ } ++ break; ++ case S_IFLNK: ++ err = au_do_cpup_symlink(&h_path, h_src, h_dir); ++ break; ++ case S_IFCHR: ++ case S_IFBLK: ++ AuDebugOn(!capable(CAP_MKNOD)); ++ /*FALLTHROUGH*/ ++ case S_IFIFO: ++ case S_IFSOCK: ++ err = vfsub_mknod(h_dir, &h_path, mode, h_inode->i_rdev); ++ break; ++ default: ++ AuIOErr("Unknown inode type 0%o\n", mode); ++ err = -EIO; ++ } ++ ++ mnt_flags = au_mntflags(sb); ++ if (!au_opt_test(mnt_flags, UDBA_NONE) ++ && !isdir ++ && au_opt_test(mnt_flags, XINO) ++ && h_inode->i_nlink == 1 ++ /* todo: unnecessary? */ ++ /* && dentry->d_inode->i_nlink == 1 */ ++ && bdst < bsrc ++ && !au_ftest_cpup(flags, KEEPLINO)) ++ au_xino_write(sb, bsrc, h_inode->i_ino, /*ino*/0); ++ /* ignore this error */ ++ ++ if (do_dt) ++ au_dtime_revert(&dt); ++ return err; ++} ++ ++/* ++ * copyup the @dentry from @bsrc to @bdst. ++ * the caller must set the both of lower dentries. ++ * @len is for truncating when it is -1 copyup the entire file. ++ * in link/rename cases, @dst_parent may be different from the real one. 
++ */ ++static int au_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, unsigned int flags, ++ struct dentry *dst_parent) ++{ ++ int err, rerr; ++ aufs_bindex_t old_ibstart; ++ unsigned char isdir, plink; ++ struct au_dtime dt; ++ struct path h_path; ++ struct dentry *h_src, *h_dst, *h_parent; ++ struct inode *dst_inode, *h_dir, *inode; ++ struct super_block *sb; ++ ++ AuDebugOn(bsrc <= bdst); ++ ++ sb = dentry->d_sb; ++ h_path.mnt = au_sbr_mnt(sb, bdst); ++ h_dst = au_h_dptr(dentry, bdst); ++ h_parent = h_dst->d_parent; /* dir inode is locked */ ++ h_dir = h_parent->d_inode; ++ IMustLock(h_dir); ++ ++ h_src = au_h_dptr(dentry, bsrc); ++ inode = dentry->d_inode; ++ ++ if (!dst_parent) ++ dst_parent = dget_parent(dentry); ++ else ++ dget(dst_parent); ++ ++ plink = !!au_opt_test(au_mntflags(sb), PLINK); ++ dst_inode = au_h_iptr(inode, bdst); ++ if (dst_inode) { ++ if (unlikely(!plink)) { ++ err = -EIO; ++ AuIOErr("hi%lu(i%lu) exists on b%d " ++ "but plink is disabled\n", ++ dst_inode->i_ino, inode->i_ino, bdst); ++ goto out; ++ } ++ ++ if (dst_inode->i_nlink) { ++ const int do_dt = au_ftest_cpup(flags, DTIME); ++ ++ h_src = au_plink_lkup(inode, bdst); ++ err = PTR_ERR(h_src); ++ if (IS_ERR(h_src)) ++ goto out; ++ if (unlikely(!h_src->d_inode)) { ++ err = -EIO; ++ AuIOErr("i%lu exists on a upper branch " ++ "but not pseudo-linked\n", ++ inode->i_ino); ++ dput(h_src); ++ goto out; ++ } ++ ++ if (do_dt) { ++ h_path.dentry = h_parent; ++ au_dtime_store(&dt, dst_parent, &h_path); ++ } ++ h_path.dentry = h_dst; ++ err = vfsub_link(h_src, h_dir, &h_path); ++ if (do_dt) ++ au_dtime_revert(&dt); ++ dput(h_src); ++ goto out; ++ } else ++ /* todo: cpup_wh_file? */ ++ /* udba work */ ++ au_update_ibrange(inode, /*do_put_zero*/1); ++ } ++ ++ old_ibstart = au_ibstart(inode); ++ err = cpup_entry(dentry, bdst, bsrc, len, flags, dst_parent); ++ if (unlikely(err)) ++ goto out; ++ dst_inode = h_dst->d_inode; ++ mutex_lock_nested(&dst_inode->i_mutex, AuLsc_I_CHILD2); ++ ++ err = cpup_iattr(dentry, bdst, h_src); ++ isdir = S_ISDIR(dst_inode->i_mode); ++ if (!err) { ++ if (bdst < old_ibstart) { ++ if (S_ISREG(inode->i_mode)) { ++ err = au_dy_iaop(inode, bdst, dst_inode); ++ if (unlikely(err)) ++ goto out_rev; ++ } ++ au_set_ibstart(inode, bdst); ++ } ++ au_set_h_iptr(inode, bdst, au_igrab(dst_inode), ++ au_hi_flags(inode, isdir)); ++ mutex_unlock(&dst_inode->i_mutex); ++ if (!isdir ++ && h_src->d_inode->i_nlink > 1 ++ && plink) ++ au_plink_append(inode, bdst, h_dst); ++ goto out; /* success */ ++ } ++ ++ /* revert */ ++out_rev: ++ h_path.dentry = h_parent; ++ mutex_unlock(&dst_inode->i_mutex); ++ au_dtime_store(&dt, dst_parent, &h_path); ++ h_path.dentry = h_dst; ++ if (!isdir) ++ rerr = vfsub_unlink(h_dir, &h_path, /*force*/0); ++ else ++ rerr = vfsub_rmdir(h_dir, &h_path); ++ au_dtime_revert(&dt); ++ if (rerr) { ++ AuIOErr("failed removing broken entry(%d, %d)\n", err, rerr); ++ err = -EIO; ++ } ++ ++out: ++ dput(dst_parent); ++ return err; ++} ++ ++struct au_cpup_single_args { ++ int *errp; ++ struct dentry *dentry; ++ aufs_bindex_t bdst, bsrc; ++ loff_t len; ++ unsigned int flags; ++ struct dentry *dst_parent; ++}; ++ ++static void au_call_cpup_single(void *args) ++{ ++ struct au_cpup_single_args *a = args; ++ *a->errp = au_cpup_single(a->dentry, a->bdst, a->bsrc, a->len, ++ a->flags, a->dst_parent); ++} ++ ++/* ++ * prevent SIGXFSZ in copy-up. ++ * testing CAP_MKNOD is for generic fs, ++ * but CAP_FSETID is for xfs only, currently. 
++ */ ++static int au_cpup_sio_test(struct super_block *sb, umode_t mode) ++{ ++ int do_sio; ++ ++ do_sio = 0; ++ if (!au_wkq_test() ++ && (!au_sbi(sb)->si_plink_maint_pid ++ || au_plink_maint(sb, AuLock_NOPLM))) { ++ switch (mode & S_IFMT) { ++ case S_IFREG: ++ /* no condition about RLIMIT_FSIZE and the file size */ ++ do_sio = 1; ++ break; ++ case S_IFCHR: ++ case S_IFBLK: ++ do_sio = !capable(CAP_MKNOD); ++ break; ++ } ++ if (!do_sio) ++ do_sio = ((mode & (S_ISUID | S_ISGID)) ++ && !capable(CAP_FSETID)); ++ } ++ ++ return do_sio; ++} ++ ++int au_sio_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, unsigned int flags, ++ struct dentry *dst_parent) ++{ ++ int err, wkq_err; ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bsrc); ++ if (!au_cpup_sio_test(dentry->d_sb, h_dentry->d_inode->i_mode)) ++ err = au_cpup_single(dentry, bdst, bsrc, len, flags, ++ dst_parent); ++ else { ++ struct au_cpup_single_args args = { ++ .errp = &err, ++ .dentry = dentry, ++ .bdst = bdst, ++ .bsrc = bsrc, ++ .len = len, ++ .flags = flags, ++ .dst_parent = dst_parent ++ }; ++ wkq_err = au_wkq_wait(au_call_cpup_single, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++ ++/* ++ * copyup the @dentry from the first active lower branch to @bdst, ++ * using au_cpup_single(). ++ */ ++static int au_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ unsigned int flags) ++{ ++ int err; ++ aufs_bindex_t bsrc, bend; ++ ++ bend = au_dbend(dentry); ++ for (bsrc = bdst + 1; bsrc <= bend; bsrc++) ++ if (au_h_dptr(dentry, bsrc)) ++ break; ++ ++ err = au_lkup_neg(dentry, bdst); ++ if (!err) { ++ err = au_cpup_single(dentry, bdst, bsrc, len, flags, NULL); ++ if (!err) ++ return 0; /* success */ ++ ++ /* revert */ ++ au_set_h_dptr(dentry, bdst, NULL); ++ au_set_dbstart(dentry, bsrc); ++ } ++ ++ return err; ++} ++ ++struct au_cpup_simple_args { ++ int *errp; ++ struct dentry *dentry; ++ aufs_bindex_t bdst; ++ loff_t len; ++ unsigned int flags; ++}; ++ ++static void au_call_cpup_simple(void *args) ++{ ++ struct au_cpup_simple_args *a = args; ++ *a->errp = au_cpup_simple(a->dentry, a->bdst, a->len, a->flags); ++} ++ ++int au_sio_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ unsigned int flags) ++{ ++ int err, wkq_err; ++ struct dentry *parent; ++ struct inode *h_dir; ++ ++ parent = dget_parent(dentry); ++ h_dir = au_h_iptr(parent->d_inode, bdst); ++ if (!au_test_h_perm_sio(h_dir, MAY_EXEC | MAY_WRITE) ++ && !au_cpup_sio_test(dentry->d_sb, dentry->d_inode->i_mode)) ++ err = au_cpup_simple(dentry, bdst, len, flags); ++ else { ++ struct au_cpup_simple_args args = { ++ .errp = &err, ++ .dentry = dentry, ++ .bdst = bdst, ++ .len = len, ++ .flags = flags ++ }; ++ wkq_err = au_wkq_wait(au_call_cpup_simple, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * copyup the deleted file for writing. 
++ */ ++static int au_do_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *wh_dentry, struct file *file, ++ loff_t len) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ struct au_dinfo *dinfo; ++ struct dentry *h_d_dst, *h_d_start; ++ struct au_hdentry *hdp; ++ ++ dinfo = au_di(dentry); ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ bstart = dinfo->di_bstart; ++ hdp = dinfo->di_hdentry; ++ h_d_dst = hdp[0 + bdst].hd_dentry; ++ dinfo->di_bstart = bdst; ++ hdp[0 + bdst].hd_dentry = wh_dentry; ++ if (file) { ++ h_d_start = hdp[0 + bstart].hd_dentry; ++ hdp[0 + bstart].hd_dentry = au_hf_top(file)->f_dentry; ++ } ++ err = au_cpup_single(dentry, bdst, bstart, len, !AuCpup_DTIME, ++ /*h_parent*/NULL); ++ if (file) { ++ if (!err) ++ err = au_reopen_nondir(file); ++ hdp[0 + bstart].hd_dentry = h_d_start; ++ } ++ hdp[0 + bdst].hd_dentry = h_d_dst; ++ dinfo->di_bstart = bstart; ++ ++ return err; ++} ++ ++static int au_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ struct file *file) ++{ ++ int err; ++ struct au_dtime dt; ++ struct dentry *parent, *h_parent, *wh_dentry; ++ struct au_branch *br; ++ struct path h_path; ++ ++ br = au_sbr(dentry->d_sb, bdst); ++ parent = dget_parent(dentry); ++ h_parent = au_h_dptr(parent, bdst); ++ wh_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out; ++ ++ h_path.dentry = h_parent; ++ h_path.mnt = br->br_mnt; ++ au_dtime_store(&dt, parent, &h_path); ++ err = au_do_cpup_wh(dentry, bdst, wh_dentry, file, len); ++ if (unlikely(err)) ++ goto out_wh; ++ ++ dget(wh_dentry); ++ h_path.dentry = wh_dentry; ++ if (!S_ISDIR(wh_dentry->d_inode->i_mode)) ++ err = vfsub_unlink(h_parent->d_inode, &h_path, /*force*/0); ++ else ++ err = vfsub_rmdir(h_parent->d_inode, &h_path); ++ if (unlikely(err)) { ++ AuIOErr("failed remove copied-up tmp file %.*s(%d)\n", ++ AuDLNPair(wh_dentry), err); ++ err = -EIO; ++ } ++ au_dtime_revert(&dt); ++ au_set_hi_wh(dentry->d_inode, bdst, wh_dentry); ++ ++out_wh: ++ dput(wh_dentry); ++out: ++ dput(parent); ++ return err; ++} ++ ++struct au_cpup_wh_args { ++ int *errp; ++ struct dentry *dentry; ++ aufs_bindex_t bdst; ++ loff_t len; ++ struct file *file; ++}; ++ ++static void au_call_cpup_wh(void *args) ++{ ++ struct au_cpup_wh_args *a = args; ++ *a->errp = au_cpup_wh(a->dentry, a->bdst, a->len, a->file); ++} ++ ++int au_sio_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ struct file *file) ++{ ++ int err, wkq_err; ++ struct dentry *parent, *h_orph, *h_parent, *h_dentry; ++ struct inode *dir, *h_dir, *h_tmpdir, *h_inode; ++ struct au_wbr *wbr; ++ ++ parent = dget_parent(dentry); ++ dir = parent->d_inode; ++ h_orph = NULL; ++ h_parent = NULL; ++ h_dir = au_igrab(au_h_iptr(dir, bdst)); ++ h_tmpdir = h_dir; ++ if (!h_dir->i_nlink) { ++ wbr = au_sbr(dentry->d_sb, bdst)->br_wbr; ++ h_orph = wbr->wbr_orph; ++ ++ h_parent = dget(au_h_dptr(parent, bdst)); ++ au_set_h_dptr(parent, bdst, dget(h_orph)); ++ h_tmpdir = h_orph->d_inode; ++ au_set_h_iptr(dir, bdst, au_igrab(h_tmpdir), /*flags*/0); ++ ++ /* this temporary unlock is safe */ ++ if (file) ++ h_dentry = au_hf_top(file)->f_dentry; ++ else ++ h_dentry = au_h_dptr(dentry, au_dbstart(dentry)); ++ h_inode = h_dentry->d_inode; ++ IMustLock(h_inode); ++ mutex_unlock(&h_inode->i_mutex); ++ mutex_lock_nested(&h_tmpdir->i_mutex, AuLsc_I_PARENT3); ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ /* todo: au_h_open_pre()? 
*/ ++ } ++ ++ if (!au_test_h_perm_sio(h_tmpdir, MAY_EXEC | MAY_WRITE) ++ && !au_cpup_sio_test(dentry->d_sb, dentry->d_inode->i_mode)) ++ err = au_cpup_wh(dentry, bdst, len, file); ++ else { ++ struct au_cpup_wh_args args = { ++ .errp = &err, ++ .dentry = dentry, ++ .bdst = bdst, ++ .len = len, ++ .file = file ++ }; ++ wkq_err = au_wkq_wait(au_call_cpup_wh, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ if (h_orph) { ++ mutex_unlock(&h_tmpdir->i_mutex); ++ /* todo: au_h_open_post()? */ ++ au_set_h_iptr(dir, bdst, au_igrab(h_dir), /*flags*/0); ++ au_set_h_dptr(parent, bdst, h_parent); ++ } ++ iput(h_dir); ++ dput(parent); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * generic routine for both of copy-up and copy-down. ++ */ ++/* cf. revalidate function in file.c */ ++int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, ++ int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *h_parent, void *arg), ++ void *arg) ++{ ++ int err; ++ struct au_pin pin; ++ struct dentry *d, *parent, *h_parent, *real_parent; ++ ++ err = 0; ++ parent = dget_parent(dentry); ++ if (IS_ROOT(parent)) ++ goto out; ++ ++ au_pin_init(&pin, dentry, bdst, AuLsc_DI_PARENT2, AuLsc_I_PARENT2, ++ au_opt_udba(dentry->d_sb), AuPin_MNT_WRITE); ++ ++ /* do not use au_dpage */ ++ real_parent = parent; ++ while (1) { ++ dput(parent); ++ parent = dget_parent(dentry); ++ h_parent = au_h_dptr(parent, bdst); ++ if (h_parent) ++ goto out; /* success */ ++ ++ /* find top dir which is necessary to cpup */ ++ do { ++ d = parent; ++ dput(parent); ++ parent = dget_parent(d); ++ di_read_lock_parent3(parent, !AuLock_IR); ++ h_parent = au_h_dptr(parent, bdst); ++ di_read_unlock(parent, !AuLock_IR); ++ } while (!h_parent); ++ ++ if (d != real_parent) ++ di_write_lock_child3(d); ++ ++ /* somebody else might create while we were sleeping */ ++ if (!au_h_dptr(d, bdst) || !au_h_dptr(d, bdst)->d_inode) { ++ if (au_h_dptr(d, bdst)) ++ au_update_dbstart(d); ++ ++ au_pin_set_dentry(&pin, d); ++ err = au_do_pin(&pin); ++ if (!err) { ++ err = cp(d, bdst, h_parent, arg); ++ au_unpin(&pin); ++ } ++ } ++ ++ if (d != real_parent) ++ di_write_unlock(d); ++ if (unlikely(err)) ++ break; ++ } ++ ++out: ++ dput(parent); ++ return err; ++} ++ ++static int au_cpup_dir(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *h_parent __maybe_unused , ++ void *arg __maybe_unused) ++{ ++ return au_sio_cpup_simple(dentry, bdst, -1, AuCpup_DTIME); ++} ++ ++int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) ++{ ++ return au_cp_dirs(dentry, bdst, au_cpup_dir, NULL); ++} ++ ++int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) ++{ ++ int err; ++ struct dentry *parent; ++ struct inode *dir; ++ ++ parent = dget_parent(dentry); ++ dir = parent->d_inode; ++ err = 0; ++ if (au_h_iptr(dir, bdst)) ++ goto out; ++ ++ di_read_unlock(parent, AuLock_IR); ++ di_write_lock_parent(parent); ++ /* someone else might change our inode while we were sleeping */ ++ if (!au_h_iptr(dir, bdst)) ++ err = au_cpup_dirs(dentry, bdst); ++ di_downgrade_lock(parent, AuLock_IR); ++ ++out: ++ dput(parent); ++ return err; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/cpup.h 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,81 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * copy-up/down functions ++ */ ++ ++#ifndef __AUFS_CPUP_H__ ++#define __AUFS_CPUP_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++struct inode; ++struct file; ++ ++void au_cpup_attr_flags(struct inode *dst, struct inode *src); ++void au_cpup_attr_timesizes(struct inode *inode); ++void au_cpup_attr_nlink(struct inode *inode, int force); ++void au_cpup_attr_changeable(struct inode *inode); ++void au_cpup_igen(struct inode *inode, struct inode *h_inode); ++void au_cpup_attr_all(struct inode *inode, int force); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* cpup flags */ ++#define AuCpup_DTIME 1 /* do dtime_store/revert */ ++#define AuCpup_KEEPLINO (1 << 1) /* do not clear the lower xino, ++ for link(2) */ ++#define au_ftest_cpup(flags, name) ((flags) & AuCpup_##name) ++#define au_fset_cpup(flags, name) \ ++ do { (flags) |= AuCpup_##name; } while (0) ++#define au_fclr_cpup(flags, name) \ ++ do { (flags) &= ~AuCpup_##name; } while (0) ++ ++int au_copy_file(struct file *dst, struct file *src, loff_t len); ++int au_sio_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, ++ aufs_bindex_t bsrc, loff_t len, unsigned int flags, ++ struct dentry *dst_parent); ++int au_sio_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ unsigned int flags); ++int au_sio_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, ++ struct file *file); ++ ++int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, ++ int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *h_parent, void *arg), ++ void *arg); ++int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); ++int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* keep timestamps when copyup */ ++struct au_dtime { ++ struct dentry *dt_dentry; ++ struct path dt_h_path; ++ struct timespec dt_atime, dt_mtime; ++}; ++void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, ++ struct path *h_path); ++void au_dtime_revert(struct au_dtime *dt); ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_CPUP_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/dbgaufs.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,334 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * debugfs interface ++ */ ++ ++#include ++#include "aufs.h" ++ ++#ifndef CONFIG_SYSFS ++#error DEBUG_FS depends upon SYSFS ++#endif ++ ++static struct dentry *dbgaufs; ++static const mode_t dbgaufs_mode = S_IRUSR | S_IRGRP | S_IROTH; ++ ++/* 20 is max digits length of ulong 64 */ ++struct dbgaufs_arg { ++ int n; ++ char a[20 * 4]; ++}; ++ ++/* ++ * common function for all XINO files ++ */ ++static int dbgaufs_xi_release(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static int dbgaufs_xi_open(struct file *xf, struct file *file, int do_fcnt) ++{ ++ int err; ++ struct kstat st; ++ struct dbgaufs_arg *p; ++ ++ err = -ENOMEM; ++ p = kmalloc(sizeof(*p), GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ ++ err = 0; ++ p->n = 0; ++ file->private_data = p; ++ if (!xf) ++ goto out; ++ ++ err = vfs_getattr(xf->f_vfsmnt, xf->f_dentry, &st); ++ if (!err) { ++ if (do_fcnt) ++ p->n = snprintf ++ (p->a, sizeof(p->a), "%ld, %llux%lu %lld\n", ++ (long)file_count(xf), st.blocks, st.blksize, ++ (long long)st.size); ++ else ++ p->n = snprintf(p->a, sizeof(p->a), "%llux%lu %lld\n", ++ st.blocks, st.blksize, ++ (long long)st.size); ++ AuDebugOn(p->n >= sizeof(p->a)); ++ } else { ++ p->n = snprintf(p->a, sizeof(p->a), "err %d\n", err); ++ err = 0; ++ } ++ ++out: ++ return err; ++ ++} ++ ++static ssize_t dbgaufs_xi_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct dbgaufs_arg *p; ++ ++ p = file->private_data; ++ return simple_read_from_buffer(buf, count, ppos, p->a, p->n); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int dbgaufs_xib_open(struct inode *inode, struct file *file) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ ++ sbinfo = inode->i_private; ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ err = dbgaufs_xi_open(sbinfo->si_xib, file, /*do_fcnt*/0); ++ si_read_unlock(sb); ++ return err; ++} ++ ++static const struct file_operations dbgaufs_xib_fop = { ++ .owner = THIS_MODULE, ++ .open = dbgaufs_xib_open, ++ .release = dbgaufs_xi_release, ++ .read = dbgaufs_xi_read ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define DbgaufsXi_PREFIX "xi" ++ ++static int dbgaufs_xino_open(struct inode *inode, struct file *file) ++{ ++ int err; ++ long l; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ struct file *xf; ++ struct qstr *name; ++ ++ err = -ENOENT; ++ xf = NULL; ++ name = &file->f_dentry->d_name; ++ if (unlikely(name->len < sizeof(DbgaufsXi_PREFIX) ++ || memcmp(name->name, DbgaufsXi_PREFIX, ++ sizeof(DbgaufsXi_PREFIX) - 1))) ++ goto out; ++ err = kstrtol(name->name + sizeof(DbgaufsXi_PREFIX) - 1, 10, &l); ++ if (unlikely(err)) ++ goto out; ++ ++ sbinfo = inode->i_private; ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ if (l <= au_sbend(sb)) { ++ xf = au_sbr(sb, (aufs_bindex_t)l)->br_xino.xi_file; ++ err = dbgaufs_xi_open(xf, file, /*do_fcnt*/1); ++ } else ++ err = -ENOENT; ++ 
si_read_unlock(sb); ++ ++out: ++ return err; ++} ++ ++static const struct file_operations dbgaufs_xino_fop = { ++ .owner = THIS_MODULE, ++ .open = dbgaufs_xino_open, ++ .release = dbgaufs_xi_release, ++ .read = dbgaufs_xi_read ++}; ++ ++void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ aufs_bindex_t bend; ++ struct au_branch *br; ++ struct au_xino_file *xi; ++ ++ if (!au_sbi(sb)->si_dbgaufs) ++ return; ++ ++ bend = au_sbend(sb); ++ for (; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ xi = &br->br_xino; ++ if (xi->xi_dbgaufs) { ++ debugfs_remove(xi->xi_dbgaufs); ++ xi->xi_dbgaufs = NULL; ++ } ++ } ++} ++ ++void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ struct au_sbinfo *sbinfo; ++ struct dentry *parent; ++ struct au_branch *br; ++ struct au_xino_file *xi; ++ aufs_bindex_t bend; ++ char name[sizeof(DbgaufsXi_PREFIX) + 5]; /* "xi" bindex NULL */ ++ ++ sbinfo = au_sbi(sb); ++ parent = sbinfo->si_dbgaufs; ++ if (!parent) ++ return; ++ ++ bend = au_sbend(sb); ++ for (; bindex <= bend; bindex++) { ++ snprintf(name, sizeof(name), DbgaufsXi_PREFIX "%d", bindex); ++ br = au_sbr(sb, bindex); ++ xi = &br->br_xino; ++ AuDebugOn(xi->xi_dbgaufs); ++ xi->xi_dbgaufs = debugfs_create_file(name, dbgaufs_mode, parent, ++ sbinfo, &dbgaufs_xino_fop); ++ /* ignore an error */ ++ if (unlikely(!xi->xi_dbgaufs)) ++ AuWarn1("failed %s under debugfs\n", name); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_EXPORT ++static int dbgaufs_xigen_open(struct inode *inode, struct file *file) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ ++ sbinfo = inode->i_private; ++ sb = sbinfo->si_sb; ++ si_noflush_read_lock(sb); ++ err = dbgaufs_xi_open(sbinfo->si_xigen, file, /*do_fcnt*/0); ++ si_read_unlock(sb); ++ return err; ++} ++ ++static const struct file_operations dbgaufs_xigen_fop = { ++ .owner = THIS_MODULE, ++ .open = dbgaufs_xigen_open, ++ .release = dbgaufs_xi_release, ++ .read = dbgaufs_xi_read ++}; ++ ++static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) ++{ ++ int err; ++ ++ /* ++ * This function is a dynamic '__init' fucntion actually, ++ * so the tiny check for si_rwsem is unnecessary. ++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++ ++ err = -EIO; ++ sbinfo->si_dbgaufs_xigen = debugfs_create_file ++ ("xigen", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, ++ &dbgaufs_xigen_fop); ++ if (sbinfo->si_dbgaufs_xigen) ++ err = 0; ++ ++ return err; ++} ++#else ++static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) ++{ ++ return 0; ++} ++#endif /* CONFIG_AUFS_EXPORT */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++void dbgaufs_si_fin(struct au_sbinfo *sbinfo) ++{ ++ /* ++ * This function is a dynamic '__init' fucntion actually, ++ * so the tiny check for si_rwsem is unnecessary. ++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++ ++ debugfs_remove_recursive(sbinfo->si_dbgaufs); ++ sbinfo->si_dbgaufs = NULL; ++ kobject_put(&sbinfo->si_kobj); ++} ++ ++int dbgaufs_si_init(struct au_sbinfo *sbinfo) ++{ ++ int err; ++ char name[SysaufsSiNameLen]; ++ ++ /* ++ * This function is a dynamic '__init' fucntion actually, ++ * so the tiny check for si_rwsem is unnecessary. 
++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++ ++ err = -ENOENT; ++ if (!dbgaufs) { ++ AuErr1("/debug/aufs is uninitialized\n"); ++ goto out; ++ } ++ ++ err = -EIO; ++ sysaufs_name(sbinfo, name); ++ sbinfo->si_dbgaufs = debugfs_create_dir(name, dbgaufs); ++ if (unlikely(!sbinfo->si_dbgaufs)) ++ goto out; ++ kobject_get(&sbinfo->si_kobj); ++ ++ sbinfo->si_dbgaufs_xib = debugfs_create_file ++ ("xib", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, ++ &dbgaufs_xib_fop); ++ if (unlikely(!sbinfo->si_dbgaufs_xib)) ++ goto out_dir; ++ ++ err = dbgaufs_xigen_init(sbinfo); ++ if (!err) ++ goto out; /* success */ ++ ++out_dir: ++ dbgaufs_si_fin(sbinfo); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void dbgaufs_fin(void) ++{ ++ debugfs_remove(dbgaufs); ++} ++ ++int __init dbgaufs_init(void) ++{ ++ int err; ++ ++ err = -EIO; ++ dbgaufs = debugfs_create_dir(AUFS_NAME, NULL); ++ if (dbgaufs) ++ err = 0; ++ return err; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/dbgaufs.h 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,49 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * debugfs interface ++ */ ++ ++#ifndef __DBGAUFS_H__ ++#define __DBGAUFS_H__ ++ ++#ifdef __KERNEL__ ++ ++struct super_block; ++struct au_sbinfo; ++ ++#ifdef CONFIG_DEBUG_FS ++/* dbgaufs.c */ ++void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); ++void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); ++void dbgaufs_si_fin(struct au_sbinfo *sbinfo); ++int dbgaufs_si_init(struct au_sbinfo *sbinfo); ++void dbgaufs_fin(void); ++int __init dbgaufs_init(void); ++#else ++AuStubVoid(dbgaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex) ++AuStubVoid(dbgaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex) ++AuStubVoid(dbgaufs_si_fin, struct au_sbinfo *sbinfo) ++AuStubInt0(dbgaufs_si_init, struct au_sbinfo *sbinfo) ++AuStubVoid(dbgaufs_fin, void) ++AuStubInt0(__init dbgaufs_init, void) ++#endif /* CONFIG_DEBUG_FS */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __DBGAUFS_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/dcsub.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,243 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sub-routines for dentry cache ++ */ ++ ++#include "aufs.h" ++ ++static void au_dpage_free(struct au_dpage *dpage) ++{ ++ int i; ++ struct dentry **p; ++ ++ p = dpage->dentries; ++ for (i = 0; i < dpage->ndentry; i++) ++ dput(*p++); ++ free_page((unsigned long)dpage->dentries); ++} ++ ++int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp) ++{ ++ int err; ++ void *p; ++ ++ err = -ENOMEM; ++ dpages->dpages = kmalloc(sizeof(*dpages->dpages), gfp); ++ if (unlikely(!dpages->dpages)) ++ goto out; ++ ++ p = (void *)__get_free_page(gfp); ++ if (unlikely(!p)) ++ goto out_dpages; ++ ++ dpages->dpages[0].ndentry = 0; ++ dpages->dpages[0].dentries = p; ++ dpages->ndpage = 1; ++ return 0; /* success */ ++ ++out_dpages: ++ kfree(dpages->dpages); ++out: ++ return err; ++} ++ ++void au_dpages_free(struct au_dcsub_pages *dpages) ++{ ++ int i; ++ struct au_dpage *p; ++ ++ p = dpages->dpages; ++ for (i = 0; i < dpages->ndpage; i++) ++ au_dpage_free(p++); ++ kfree(dpages->dpages); ++} ++ ++static int au_dpages_append(struct au_dcsub_pages *dpages, ++ struct dentry *dentry, gfp_t gfp) ++{ ++ int err, sz; ++ struct au_dpage *dpage; ++ void *p; ++ ++ dpage = dpages->dpages + dpages->ndpage - 1; ++ sz = PAGE_SIZE / sizeof(dentry); ++ if (unlikely(dpage->ndentry >= sz)) { ++ AuLabel(new dpage); ++ err = -ENOMEM; ++ sz = dpages->ndpage * sizeof(*dpages->dpages); ++ p = au_kzrealloc(dpages->dpages, sz, ++ sz + sizeof(*dpages->dpages), gfp); ++ if (unlikely(!p)) ++ goto out; ++ ++ dpages->dpages = p; ++ dpage = dpages->dpages + dpages->ndpage; ++ p = (void *)__get_free_page(gfp); ++ if (unlikely(!p)) ++ goto out; ++ ++ dpage->ndentry = 0; ++ dpage->dentries = p; ++ dpages->ndpage++; ++ } ++ ++ AuDebugOn(!dentry->d_count); ++ dpage->dentries[dpage->ndentry++] = dget_dlock(dentry); ++ return 0; /* success */ ++ ++out: ++ return err; ++} ++ ++int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, ++ au_dpages_test test, void *arg) ++{ ++ int err; ++ struct dentry *this_parent; ++ struct list_head *next; ++ struct super_block *sb = root->d_sb; ++ ++ err = 0; ++ write_seqlock(&rename_lock); ++ this_parent = root; ++ spin_lock(&this_parent->d_lock); ++repeat: ++ next = this_parent->d_subdirs.next; ++resume: ++ if (this_parent->d_sb == sb ++ && !IS_ROOT(this_parent) ++ && au_di(this_parent) ++ && this_parent->d_count ++ && (!test || test(this_parent, arg))) { ++ err = au_dpages_append(dpages, this_parent, GFP_ATOMIC); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ while (next != &this_parent->d_subdirs) { ++ struct list_head *tmp = next; ++ struct dentry *dentry = list_entry(tmp, struct dentry, ++ d_u.d_child); ++ ++ next = tmp->next; ++ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ++ if (dentry->d_count) { ++ if (!list_empty(&dentry->d_subdirs)) { ++ spin_unlock(&this_parent->d_lock); ++ spin_release(&dentry->d_lock.dep_map, 1, ++ _RET_IP_); ++ this_parent = dentry; ++ spin_acquire(&this_parent->d_lock.dep_map, 0, 1, ++ _RET_IP_); ++ goto repeat; ++ } ++ if (dentry->d_sb == sb ++ && au_di(dentry) ++ && (!test || test(dentry, arg))) ++ err = au_dpages_append(dpages, dentry, ++ GFP_ATOMIC); ++ } ++ spin_unlock(&dentry->d_lock); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ if (this_parent != root) { 
++ struct dentry *tmp; ++ struct dentry *child; ++ ++ tmp = this_parent->d_parent; ++ rcu_read_lock(); ++ spin_unlock(&this_parent->d_lock); ++ child = this_parent; ++ this_parent = tmp; ++ spin_lock(&this_parent->d_lock); ++ rcu_read_unlock(); ++ next = child->d_u.d_child.next; ++ goto resume; ++ } ++ ++out: ++ spin_unlock(&this_parent->d_lock); ++ write_sequnlock(&rename_lock); ++ return err; ++} ++ ++int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, ++ int do_include, au_dpages_test test, void *arg) ++{ ++ int err; ++ ++ err = 0; ++ write_seqlock(&rename_lock); ++ spin_lock(&dentry->d_lock); ++ if (do_include ++ && dentry->d_count ++ && (!test || test(dentry, arg))) ++ err = au_dpages_append(dpages, dentry, GFP_ATOMIC); ++ spin_unlock(&dentry->d_lock); ++ if (unlikely(err)) ++ goto out; ++ ++ /* ++ * vfsmount_lock is unnecessary since this is a traverse in a single ++ * mount ++ */ ++ while (!IS_ROOT(dentry)) { ++ dentry = dentry->d_parent; /* rename_lock is locked */ ++ spin_lock(&dentry->d_lock); ++ if (dentry->d_count ++ && (!test || test(dentry, arg))) ++ err = au_dpages_append(dpages, dentry, GFP_ATOMIC); ++ spin_unlock(&dentry->d_lock); ++ if (unlikely(err)) ++ break; ++ } ++ ++out: ++ write_sequnlock(&rename_lock); ++ return err; ++} ++ ++static inline int au_dcsub_dpages_aufs(struct dentry *dentry, void *arg) ++{ ++ return au_di(dentry) && dentry->d_sb == arg; ++} ++ ++int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages, ++ struct dentry *dentry, int do_include) ++{ ++ return au_dcsub_pages_rev(dpages, dentry, do_include, ++ au_dcsub_dpages_aufs, dentry->d_sb); ++} ++ ++int au_test_subdir(struct dentry *d1, struct dentry *d2) ++{ ++ struct path path[2] = { ++ { ++ .dentry = d1 ++ }, ++ { ++ .dentry = d2 ++ } ++ }; ++ ++ return path_is_under(path + 0, path + 1); ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/dcsub.h 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,94 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sub-routines for dentry cache ++ */ ++ ++#ifndef __AUFS_DCSUB_H__ ++#define __AUFS_DCSUB_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++ ++struct dentry; ++ ++struct au_dpage { ++ int ndentry; ++ struct dentry **dentries; ++}; ++ ++struct au_dcsub_pages { ++ int ndpage; ++ struct au_dpage *dpages; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dcsub.c */ ++int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp); ++void au_dpages_free(struct au_dcsub_pages *dpages); ++typedef int (*au_dpages_test)(struct dentry *dentry, void *arg); ++int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, ++ au_dpages_test test, void *arg); ++int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, ++ int do_include, au_dpages_test test, void *arg); ++int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages, ++ struct dentry *dentry, int do_include); ++int au_test_subdir(struct dentry *d1, struct dentry *d2); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline int au_d_hashed_positive(struct dentry *d) ++{ ++ int err; ++ struct inode *inode = d->d_inode; ++ err = 0; ++ if (unlikely(d_unhashed(d) || !inode || !inode->i_nlink)) ++ err = -ENOENT; ++ return err; ++} ++ ++static inline int au_d_alive(struct dentry *d) ++{ ++ int err; ++ struct inode *inode; ++ err = 0; ++ if (!IS_ROOT(d)) ++ err = au_d_hashed_positive(d); ++ else { ++ inode = d->d_inode; ++ if (unlikely(d_unlinked(d) || !inode || !inode->i_nlink)) ++ err = -ENOENT; ++ } ++ return err; ++} ++ ++static inline int au_alive_dir(struct dentry *d) ++{ ++ int err; ++ err = au_d_alive(d); ++ if (unlikely(err || IS_DEADDIR(d->d_inode))) ++ err = -ENOENT; ++ return err; ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DCSUB_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/debug.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,489 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * debug print functions ++ */ ++ ++#include ++#include "aufs.h" ++ ++int aufs_debug; ++MODULE_PARM_DESC(debug, "debug print"); ++module_param_named(debug, aufs_debug, int, S_IRUGO | S_IWUSR | S_IWGRP); ++ ++char *au_plevel = KERN_DEBUG; ++#define dpri(fmt, ...) 
do { \ ++ if ((au_plevel \ ++ && strcmp(au_plevel, KERN_DEBUG)) \ ++ || au_debug_test()) \ ++ printk("%s" fmt, au_plevel, ##__VA_ARGS__); \ ++} while (0) ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_dpri_whlist(struct au_nhash *whlist) ++{ ++ unsigned long ul, n; ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos; ++ ++ n = whlist->nh_num; ++ head = whlist->nh_head; ++ for (ul = 0; ul < n; ul++) { ++ hlist_for_each_entry(tpos, pos, head, wh_hash) ++ dpri("b%d, %.*s, %d\n", ++ tpos->wh_bindex, ++ tpos->wh_str.len, tpos->wh_str.name, ++ tpos->wh_str.len); ++ head++; ++ } ++} ++ ++void au_dpri_vdir(struct au_vdir *vdir) ++{ ++ unsigned long ul; ++ union au_vdir_deblk_p p; ++ unsigned char *o; ++ ++ if (!vdir || IS_ERR(vdir)) { ++ dpri("err %ld\n", PTR_ERR(vdir)); ++ return; ++ } ++ ++ dpri("deblk %u, nblk %lu, deblk %p, last{%lu, %p}, ver %lu\n", ++ vdir->vd_deblk_sz, vdir->vd_nblk, vdir->vd_deblk, ++ vdir->vd_last.ul, vdir->vd_last.p.deblk, vdir->vd_version); ++ for (ul = 0; ul < vdir->vd_nblk; ul++) { ++ p.deblk = vdir->vd_deblk[ul]; ++ o = p.deblk; ++ dpri("[%lu]: %p\n", ul, o); ++ } ++} ++ ++static int do_pri_inode(aufs_bindex_t bindex, struct inode *inode, int hn, ++ struct dentry *wh) ++{ ++ char *n = NULL; ++ int l = 0; ++ ++ if (!inode || IS_ERR(inode)) { ++ dpri("i%d: err %ld\n", bindex, PTR_ERR(inode)); ++ return -1; ++ } ++ ++ /* the type of i_blocks depends upon CONFIG_LSF */ ++ BUILD_BUG_ON(sizeof(inode->i_blocks) != sizeof(unsigned long) ++ && sizeof(inode->i_blocks) != sizeof(u64)); ++ if (wh) { ++ n = (void *)wh->d_name.name; ++ l = wh->d_name.len; ++ } ++ ++ dpri("i%d: %p, i%lu, %s, cnt %d, nl %u, 0%o, sz %llu, blk %llu," ++ " hn %d, ct %lld, np %lu, st 0x%lx, f 0x%x, v %llu, g %x%s%.*s\n", ++ bindex, inode, ++ inode->i_ino, inode->i_sb ? au_sbtype(inode->i_sb) : "??", ++ atomic_read(&inode->i_count), inode->i_nlink, inode->i_mode, ++ i_size_read(inode), (unsigned long long)inode->i_blocks, ++ hn, (long long)timespec_to_ns(&inode->i_ctime) & 0x0ffff, ++ inode->i_mapping ? inode->i_mapping->nrpages : 0, ++ inode->i_state, inode->i_flags, inode->i_version, ++ inode->i_generation, ++ l ? 
", wh " : "", l, n); ++ return 0; ++} ++ ++void au_dpri_inode(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ aufs_bindex_t bindex; ++ int err, hn; ++ ++ err = do_pri_inode(-1, inode, -1, NULL); ++ if (err || !au_test_aufs(inode->i_sb)) ++ return; ++ ++ iinfo = au_ii(inode); ++ if (!iinfo) ++ return; ++ dpri("i-1: bstart %d, bend %d, gen %d\n", ++ iinfo->ii_bstart, iinfo->ii_bend, au_iigen(inode)); ++ if (iinfo->ii_bstart < 0) ++ return; ++ hn = 0; ++ for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; bindex++) { ++ hn = !!au_hn(iinfo->ii_hinode + bindex); ++ do_pri_inode(bindex, iinfo->ii_hinode[0 + bindex].hi_inode, hn, ++ iinfo->ii_hinode[0 + bindex].hi_whdentry); ++ } ++} ++ ++void au_dpri_dalias(struct inode *inode) ++{ ++ struct dentry *d; ++ ++ spin_lock(&inode->i_lock); ++ list_for_each_entry(d, &inode->i_dentry, d_alias) ++ au_dpri_dentry(d); ++ spin_unlock(&inode->i_lock); ++} ++ ++static int do_pri_dentry(aufs_bindex_t bindex, struct dentry *dentry) ++{ ++ struct dentry *wh = NULL; ++ int hn; ++ ++ if (!dentry || IS_ERR(dentry)) { ++ dpri("d%d: err %ld\n", bindex, PTR_ERR(dentry)); ++ return -1; ++ } ++ /* do not call dget_parent() here */ ++ /* note: access d_xxx without d_lock */ ++ dpri("d%d: %.*s?/%.*s, %s, cnt %d, flags 0x%x\n", ++ bindex, ++ AuDLNPair(dentry->d_parent), AuDLNPair(dentry), ++ dentry->d_sb ? au_sbtype(dentry->d_sb) : "??", ++ dentry->d_count, dentry->d_flags); ++ hn = -1; ++ if (bindex >= 0 && dentry->d_inode && au_test_aufs(dentry->d_sb)) { ++ struct au_iinfo *iinfo = au_ii(dentry->d_inode); ++ if (iinfo) { ++ hn = !!au_hn(iinfo->ii_hinode + bindex); ++ wh = iinfo->ii_hinode[0 + bindex].hi_whdentry; ++ } ++ } ++ do_pri_inode(bindex, dentry->d_inode, hn, wh); ++ return 0; ++} ++ ++void au_dpri_dentry(struct dentry *dentry) ++{ ++ struct au_dinfo *dinfo; ++ aufs_bindex_t bindex; ++ int err; ++ struct au_hdentry *hdp; ++ ++ err = do_pri_dentry(-1, dentry); ++ if (err || !au_test_aufs(dentry->d_sb)) ++ return; ++ ++ dinfo = au_di(dentry); ++ if (!dinfo) ++ return; ++ dpri("d-1: bstart %d, bend %d, bwh %d, bdiropq %d, gen %d\n", ++ dinfo->di_bstart, dinfo->di_bend, ++ dinfo->di_bwh, dinfo->di_bdiropq, au_digen(dentry)); ++ if (dinfo->di_bstart < 0) ++ return; ++ hdp = dinfo->di_hdentry; ++ for (bindex = dinfo->di_bstart; bindex <= dinfo->di_bend; bindex++) ++ do_pri_dentry(bindex, hdp[0 + bindex].hd_dentry); ++} ++ ++static int do_pri_file(aufs_bindex_t bindex, struct file *file) ++{ ++ char a[32]; ++ ++ if (!file || IS_ERR(file)) { ++ dpri("f%d: err %ld\n", bindex, PTR_ERR(file)); ++ return -1; ++ } ++ a[0] = 0; ++ if (bindex < 0 ++ && file->f_dentry ++ && au_test_aufs(file->f_dentry->d_sb) ++ && au_fi(file)) ++ snprintf(a, sizeof(a), ", gen %d, mmapped %d", ++ au_figen(file), atomic_read(&au_fi(file)->fi_mmapped)); ++ dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, v %llu, pos %llu%s\n", ++ bindex, file->f_mode, file->f_flags, (long)file_count(file), ++ file->f_version, file->f_pos, a); ++ if (file->f_dentry) ++ do_pri_dentry(bindex, file->f_dentry); ++ return 0; ++} ++ ++void au_dpri_file(struct file *file) ++{ ++ struct au_finfo *finfo; ++ struct au_fidir *fidir; ++ struct au_hfile *hfile; ++ aufs_bindex_t bindex; ++ int err; ++ ++ err = do_pri_file(-1, file); ++ if (err || !file->f_dentry || !au_test_aufs(file->f_dentry->d_sb)) ++ return; ++ ++ finfo = au_fi(file); ++ if (!finfo) ++ return; ++ if (finfo->fi_btop < 0) ++ return; ++ fidir = finfo->fi_hdir; ++ if (!fidir) ++ do_pri_file(finfo->fi_btop, finfo->fi_htop.hf_file); ++ else ++ for (bindex = 
finfo->fi_btop; ++ bindex >= 0 && bindex <= fidir->fd_bbot; ++ bindex++) { ++ hfile = fidir->fd_hfile + bindex; ++ do_pri_file(bindex, hfile ? hfile->hf_file : NULL); ++ } ++} ++ ++static int do_pri_br(aufs_bindex_t bindex, struct au_branch *br) ++{ ++ struct vfsmount *mnt; ++ struct super_block *sb; ++ ++ if (!br || IS_ERR(br)) ++ goto out; ++ mnt = br->br_mnt; ++ if (!mnt || IS_ERR(mnt)) ++ goto out; ++ sb = mnt->mnt_sb; ++ if (!sb || IS_ERR(sb)) ++ goto out; ++ ++ dpri("s%d: {perm 0x%x, id %d, cnt %d, wbr %p}, " ++ "%s, dev 0x%02x%02x, flags 0x%lx, cnt %d, active %d, " ++ "xino %d\n", ++ bindex, br->br_perm, br->br_id, atomic_read(&br->br_count), ++ br->br_wbr, au_sbtype(sb), MAJOR(sb->s_dev), MINOR(sb->s_dev), ++ sb->s_flags, sb->s_count, ++ atomic_read(&sb->s_active), !!br->br_xino.xi_file); ++ return 0; ++ ++out: ++ dpri("s%d: err %ld\n", bindex, PTR_ERR(br)); ++ return -1; ++} ++ ++void au_dpri_sb(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ aufs_bindex_t bindex; ++ int err; ++ /* to reuduce stack size */ ++ struct { ++ struct vfsmount mnt; ++ struct au_branch fake; ++ } *a; ++ ++ /* this function can be called from magic sysrq */ ++ a = kzalloc(sizeof(*a), GFP_ATOMIC); ++ if (unlikely(!a)) { ++ dpri("no memory\n"); ++ return; ++ } ++ ++ a->mnt.mnt_sb = sb; ++ a->fake.br_perm = 0; ++ a->fake.br_mnt = &a->mnt; ++ a->fake.br_xino.xi_file = NULL; ++ atomic_set(&a->fake.br_count, 0); ++ smp_mb(); /* atomic_set */ ++ err = do_pri_br(-1, &a->fake); ++ kfree(a); ++ dpri("dev 0x%x\n", sb->s_dev); ++ if (err || !au_test_aufs(sb)) ++ return; ++ ++ sbinfo = au_sbi(sb); ++ if (!sbinfo) ++ return; ++ dpri("nw %d, gen %u, kobj %d\n", ++ atomic_read(&sbinfo->si_nowait.nw_len), sbinfo->si_generation, ++ atomic_read(&sbinfo->si_kobj.kref.refcount)); ++ for (bindex = 0; bindex <= sbinfo->si_bend; bindex++) ++ do_pri_br(bindex, sbinfo->si_branch[0 + bindex]); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_dbg_sleep_jiffy(int jiffy) ++{ ++ while (jiffy) ++ jiffy = schedule_timeout_uninterruptible(jiffy); ++} ++ ++void au_dbg_iattr(struct iattr *ia) ++{ ++#define AuBit(name) if (ia->ia_valid & ATTR_ ## name) \ ++ dpri(#name "\n") ++ AuBit(MODE); ++ AuBit(UID); ++ AuBit(GID); ++ AuBit(SIZE); ++ AuBit(ATIME); ++ AuBit(MTIME); ++ AuBit(CTIME); ++ AuBit(ATIME_SET); ++ AuBit(MTIME_SET); ++ AuBit(FORCE); ++ AuBit(ATTR_FLAG); ++ AuBit(KILL_SUID); ++ AuBit(KILL_SGID); ++ AuBit(FILE); ++ AuBit(KILL_PRIV); ++ AuBit(OPEN); ++ AuBit(TIMES_SET); ++#undef AuBit ++ dpri("ia_file %p\n", ia->ia_file); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line) ++{ ++ struct inode *h_inode, *inode = dentry->d_inode; ++ struct dentry *h_dentry; ++ aufs_bindex_t bindex, bend, bi; ++ ++ if (!inode /* || au_di(dentry)->di_lsc == AuLsc_DI_TMP */) ++ return; ++ ++ bend = au_dbend(dentry); ++ bi = au_ibend(inode); ++ if (bi < bend) ++ bend = bi; ++ bindex = au_dbstart(dentry); ++ bi = au_ibstart(inode); ++ if (bi > bindex) ++ bindex = bi; ++ ++ for (; bindex <= bend; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ h_inode = au_h_iptr(inode, bindex); ++ if (unlikely(h_inode != h_dentry->d_inode)) { ++ int old = au_debug_test(); ++ if (!old) ++ au_debug(1); ++ AuDbg("b%d, %s:%d\n", bindex, func, line); ++ AuDbgDentry(dentry); ++ AuDbgInode(inode); ++ if (!old) ++ au_debug(0); ++ BUG(); ++ } ++ } ++} ++ ++void 
au_dbg_verify_dir_parent(struct dentry *dentry, unsigned int sigen) ++{ ++ struct dentry *parent; ++ ++ parent = dget_parent(dentry); ++ AuDebugOn(!S_ISDIR(dentry->d_inode->i_mode)); ++ AuDebugOn(IS_ROOT(dentry)); ++ AuDebugOn(au_digen_test(parent, sigen)); ++ dput(parent); ++} ++ ++void au_dbg_verify_nondir_parent(struct dentry *dentry, unsigned int sigen) ++{ ++ struct dentry *parent; ++ struct inode *inode; ++ ++ parent = dget_parent(dentry); ++ inode = dentry->d_inode; ++ AuDebugOn(inode && S_ISDIR(dentry->d_inode->i_mode)); ++ AuDebugOn(au_digen_test(parent, sigen)); ++ dput(parent); ++} ++ ++void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen) ++{ ++ int err, i, j; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ AuDebugOn(err); ++ err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/1); ++ AuDebugOn(err); ++ for (i = dpages.ndpage - 1; !err && i >= 0; i--) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ for (j = dpage->ndentry - 1; !err && j >= 0; j--) ++ AuDebugOn(au_digen_test(dentries[j], sigen)); ++ } ++ au_dpages_free(&dpages); ++} ++ ++void au_dbg_verify_kthread(void) ++{ ++ if (au_wkq_test()) { ++ au_dbg_blocked(); ++ /* ++ * It may be recursive, but udba=notify between two aufs mounts, ++ * where a single ro branch is shared, is not a problem. ++ */ ++ /* WARN_ON(1); */ ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_debug_sbinfo_init(struct au_sbinfo *sbinfo __maybe_unused) ++{ ++#ifdef AuForceNoPlink ++ au_opt_clr(sbinfo->si_mntflags, PLINK); ++#endif ++#ifdef AuForceNoXino ++ au_opt_clr(sbinfo->si_mntflags, XINO); ++#endif ++#ifdef AuForceNoRefrof ++ au_opt_clr(sbinfo->si_mntflags, REFROF); ++#endif ++#ifdef AuForceHnotify ++ au_opt_set_udba(sbinfo->si_mntflags, UDBA_HNOTIFY); ++#endif ++#ifdef AuForceRd0 ++ sbinfo->si_rdblk = 0; ++ sbinfo->si_rdhash = 0; ++#endif ++} ++ ++int __init au_debug_init(void) ++{ ++ aufs_bindex_t bindex; ++ struct au_vdir_destr destr; ++ ++ bindex = -1; ++ AuDebugOn(bindex >= 0); ++ ++ destr.len = -1; ++ AuDebugOn(destr.len < NAME_MAX); ++ ++#ifdef CONFIG_4KSTACKS ++ pr_warning("CONFIG_4KSTACKS is defined.\n"); ++#endif ++ ++#ifdef AuForceNoBrs ++ sysaufs_brs = 0; ++#endif ++ ++ return 0; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/debug.h 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,243 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * debug print functions ++ */ ++ ++#ifndef __AUFS_DEBUG_H__ ++#define __AUFS_DEBUG_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_AUFS_DEBUG ++#define AuDebugOn(a) BUG_ON(a) ++ ++/* module parameter */ ++extern int aufs_debug; ++static inline void au_debug(int n) ++{ ++ aufs_debug = n; ++ smp_mb(); ++} ++ ++static inline int au_debug_test(void) ++{ ++ return aufs_debug; ++} ++#else ++#define AuDebugOn(a) do {} while (0) ++AuStubVoid(au_debug, int n) ++AuStubInt0(au_debug_test, void) ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* debug print */ ++ ++#define AuDbg(fmt, ...) do { \ ++ if (au_debug_test()) \ ++ pr_debug("DEBUG: " fmt, ##__VA_ARGS__); \ ++} while (0) ++#define AuLabel(l) AuDbg(#l "\n") ++#define AuIOErr(fmt, ...) pr_err("I/O Error, " fmt, ##__VA_ARGS__) ++#define AuWarn1(fmt, ...) do { \ ++ static unsigned char _c; \ ++ if (!_c++) \ ++ pr_warning(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define AuErr1(fmt, ...) do { \ ++ static unsigned char _c; \ ++ if (!_c++) \ ++ pr_err(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define AuIOErr1(fmt, ...) do { \ ++ static unsigned char _c; \ ++ if (!_c++) \ ++ AuIOErr(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define AuUnsupportMsg "This operation is not supported." \ ++ " Please report this application to aufs-users ML." ++#define AuUnsupport(fmt, ...) do { \ ++ pr_err(AuUnsupportMsg "\n" fmt, ##__VA_ARGS__); \ ++ dump_stack(); \ ++} while (0) ++ ++#define AuTraceErr(e) do { \ ++ if (unlikely((e) < 0)) \ ++ AuDbg("err %d\n", (int)(e)); \ ++} while (0) ++ ++#define AuTraceErrPtr(p) do { \ ++ if (IS_ERR(p)) \ ++ AuDbg("err %ld\n", PTR_ERR(p)); \ ++} while (0) ++ ++/* dirty macros for debug print, use with "%.*s" and caution */ ++#define AuLNPair(qstr) (qstr)->len, (qstr)->name ++#define AuDLNPair(d) AuLNPair(&(d)->d_name) ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_sbinfo; ++struct au_finfo; ++struct dentry; ++#ifdef CONFIG_AUFS_DEBUG ++extern char *au_plevel; ++struct au_nhash; ++void au_dpri_whlist(struct au_nhash *whlist); ++struct au_vdir; ++void au_dpri_vdir(struct au_vdir *vdir); ++struct inode; ++void au_dpri_inode(struct inode *inode); ++void au_dpri_dalias(struct inode *inode); ++void au_dpri_dentry(struct dentry *dentry); ++struct file; ++void au_dpri_file(struct file *filp); ++struct super_block; ++void au_dpri_sb(struct super_block *sb); ++ ++void au_dbg_sleep_jiffy(int jiffy); ++struct iattr; ++void au_dbg_iattr(struct iattr *ia); ++ ++#define au_dbg_verify_dinode(d) __au_dbg_verify_dinode(d, __func__, __LINE__) ++void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line); ++void au_dbg_verify_dir_parent(struct dentry *dentry, unsigned int sigen); ++void au_dbg_verify_nondir_parent(struct dentry *dentry, unsigned int sigen); ++void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen); ++void au_dbg_verify_kthread(void); ++ ++int __init au_debug_init(void); ++void au_debug_sbinfo_init(struct au_sbinfo *sbinfo); ++#define AuDbgWhlist(w) do { \ ++ AuDbg(#w "\n"); \ ++ au_dpri_whlist(w); \ ++} while (0) ++ ++#define AuDbgVdir(v) do { \ ++ AuDbg(#v "\n"); \ ++ au_dpri_vdir(v); \ ++} while (0) ++ 
++#define AuDbgInode(i) do { \ ++ AuDbg(#i "\n"); \ ++ au_dpri_inode(i); \ ++} while (0) ++ ++#define AuDbgDAlias(i) do { \ ++ AuDbg(#i "\n"); \ ++ au_dpri_dalias(i); \ ++} while (0) ++ ++#define AuDbgDentry(d) do { \ ++ AuDbg(#d "\n"); \ ++ au_dpri_dentry(d); \ ++} while (0) ++ ++#define AuDbgFile(f) do { \ ++ AuDbg(#f "\n"); \ ++ au_dpri_file(f); \ ++} while (0) ++ ++#define AuDbgSb(sb) do { \ ++ AuDbg(#sb "\n"); \ ++ au_dpri_sb(sb); \ ++} while (0) ++ ++#define AuDbgSleep(sec) do { \ ++ AuDbg("sleep %d sec\n", sec); \ ++ ssleep(sec); \ ++} while (0) ++ ++#define AuDbgSleepJiffy(jiffy) do { \ ++ AuDbg("sleep %d jiffies\n", jiffy); \ ++ au_dbg_sleep_jiffy(jiffy); \ ++} while (0) ++ ++#define AuDbgIAttr(ia) do { \ ++ AuDbg("ia_valid 0x%x\n", (ia)->ia_valid); \ ++ au_dbg_iattr(ia); \ ++} while (0) ++ ++#define AuDbgSym(addr) do { \ ++ char sym[KSYM_SYMBOL_LEN]; \ ++ sprint_symbol(sym, (unsigned long)addr); \ ++ AuDbg("%s\n", sym); \ ++} while (0) ++ ++#define AuInfoSym(addr) do { \ ++ char sym[KSYM_SYMBOL_LEN]; \ ++ sprint_symbol(sym, (unsigned long)addr); \ ++ AuInfo("%s\n", sym); \ ++} while (0) ++#else ++AuStubVoid(au_dbg_verify_dinode, struct dentry *dentry) ++AuStubVoid(au_dbg_verify_dir_parent, struct dentry *dentry, unsigned int sigen) ++AuStubVoid(au_dbg_verify_nondir_parent, struct dentry *dentry, ++ unsigned int sigen) ++AuStubVoid(au_dbg_verify_gen, struct dentry *parent, unsigned int sigen) ++AuStubVoid(au_dbg_verify_kthread, void) ++AuStubInt0(__init au_debug_init, void) ++AuStubVoid(au_debug_sbinfo_init, struct au_sbinfo *sbinfo) ++ ++#define AuDbgWhlist(w) do {} while (0) ++#define AuDbgVdir(v) do {} while (0) ++#define AuDbgInode(i) do {} while (0) ++#define AuDbgDAlias(i) do {} while (0) ++#define AuDbgDentry(d) do {} while (0) ++#define AuDbgFile(f) do {} while (0) ++#define AuDbgSb(sb) do {} while (0) ++#define AuDbgSleep(sec) do {} while (0) ++#define AuDbgSleepJiffy(jiffy) do {} while (0) ++#define AuDbgIAttr(ia) do {} while (0) ++#define AuDbgSym(addr) do {} while (0) ++#define AuInfoSym(addr) do {} while (0) ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_MAGIC_SYSRQ ++int __init au_sysrq_init(void); ++void au_sysrq_fin(void); ++ ++#ifdef CONFIG_HW_CONSOLE ++#define au_dbg_blocked() do { \ ++ WARN_ON(1); \ ++ handle_sysrq('w'); \ ++} while (0) ++#else ++AuStubVoid(au_dbg_blocked, void) ++#endif ++ ++#else ++AuStubInt0(__init au_sysrq_init, void) ++AuStubVoid(au_sysrq_fin, void) ++AuStubVoid(au_dbg_blocked, void) ++#endif /* CONFIG_AUFS_MAGIC_SYSRQ */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DEBUG_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/dentry.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,1140 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * lookup and dentry operations ++ */ ++ ++#include ++#include "aufs.h" ++ ++static void au_h_nd(struct nameidata *h_nd, struct nameidata *nd) ++{ ++ if (nd) { ++ *h_nd = *nd; ++ ++ /* ++ * gave up supporting LOOKUP_CREATE/OPEN for lower fs, ++ * due to whiteout and branch permission. ++ */ ++ h_nd->flags &= ~(/*LOOKUP_PARENT |*/ LOOKUP_OPEN | LOOKUP_CREATE ++ | LOOKUP_FOLLOW | LOOKUP_EXCL); ++ /* unnecessary? */ ++ h_nd->intent.open.file = NULL; ++ } else ++ memset(h_nd, 0, sizeof(*h_nd)); ++} ++ ++struct au_lkup_one_args { ++ struct dentry **errp; ++ struct qstr *name; ++ struct dentry *h_parent; ++ struct au_branch *br; ++ struct nameidata *nd; ++}; ++ ++struct dentry *au_lkup_one(struct qstr *name, struct dentry *h_parent, ++ struct au_branch *br, struct nameidata *nd) ++{ ++ struct dentry *h_dentry; ++ int err; ++ struct nameidata h_nd; ++ ++ if (au_test_fs_null_nd(h_parent->d_sb)) ++ return vfsub_lookup_one_len(name->name, h_parent, name->len); ++ ++ au_h_nd(&h_nd, nd); ++ h_nd.path.dentry = h_parent; ++ h_nd.path.mnt = br->br_mnt; ++ ++ err = vfsub_name_hash(name->name, &h_nd.last, name->len); ++ h_dentry = ERR_PTR(err); ++ if (!err) { ++ path_get(&h_nd.path); ++ h_dentry = vfsub_lookup_hash(&h_nd); ++ path_put(&h_nd.path); ++ } ++ ++ AuTraceErrPtr(h_dentry); ++ return h_dentry; ++} ++ ++static void au_call_lkup_one(void *args) ++{ ++ struct au_lkup_one_args *a = args; ++ *a->errp = au_lkup_one(a->name, a->h_parent, a->br, a->nd); ++} ++ ++#define AuLkup_ALLOW_NEG 1 ++#define au_ftest_lkup(flags, name) ((flags) & AuLkup_##name) ++#define au_fset_lkup(flags, name) \ ++ do { (flags) |= AuLkup_##name; } while (0) ++#define au_fclr_lkup(flags, name) \ ++ do { (flags) &= ~AuLkup_##name; } while (0) ++ ++struct au_do_lookup_args { ++ unsigned int flags; ++ mode_t type; ++ struct nameidata *nd; ++}; ++ ++/* ++ * returns positive/negative dentry, NULL or an error. ++ * NULL means whiteout-ed or not-found. 
++ */ ++static struct dentry* ++au_do_lookup(struct dentry *h_parent, struct dentry *dentry, ++ aufs_bindex_t bindex, struct qstr *wh_name, ++ struct au_do_lookup_args *args) ++{ ++ struct dentry *h_dentry; ++ struct inode *h_inode, *inode; ++ struct au_branch *br; ++ int wh_found, opq; ++ unsigned char wh_able; ++ const unsigned char allow_neg = !!au_ftest_lkup(args->flags, ALLOW_NEG); ++ ++ wh_found = 0; ++ br = au_sbr(dentry->d_sb, bindex); ++ wh_able = !!au_br_whable(br->br_perm); ++ if (wh_able) ++ wh_found = au_wh_test(h_parent, wh_name, br, /*try_sio*/0); ++ h_dentry = ERR_PTR(wh_found); ++ if (!wh_found) ++ goto real_lookup; ++ if (unlikely(wh_found < 0)) ++ goto out; ++ ++ /* We found a whiteout */ ++ /* au_set_dbend(dentry, bindex); */ ++ au_set_dbwh(dentry, bindex); ++ if (!allow_neg) ++ return NULL; /* success */ ++ ++real_lookup: ++ h_dentry = au_lkup_one(&dentry->d_name, h_parent, br, args->nd); ++ if (IS_ERR(h_dentry)) ++ goto out; ++ ++ h_inode = h_dentry->d_inode; ++ if (!h_inode) { ++ if (!allow_neg) ++ goto out_neg; ++ } else if (wh_found ++ || (args->type && args->type != (h_inode->i_mode & S_IFMT))) ++ goto out_neg; ++ ++ if (au_dbend(dentry) <= bindex) ++ au_set_dbend(dentry, bindex); ++ if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry)) ++ au_set_dbstart(dentry, bindex); ++ au_set_h_dptr(dentry, bindex, h_dentry); ++ ++ inode = dentry->d_inode; ++ if (!h_inode || !S_ISDIR(h_inode->i_mode) || !wh_able ++ || (inode && !S_ISDIR(inode->i_mode))) ++ goto out; /* success */ ++ ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ opq = au_diropq_test(h_dentry, br); ++ mutex_unlock(&h_inode->i_mutex); ++ if (opq > 0) ++ au_set_dbdiropq(dentry, bindex); ++ else if (unlikely(opq < 0)) { ++ au_set_h_dptr(dentry, bindex, NULL); ++ h_dentry = ERR_PTR(opq); ++ } ++ goto out; ++ ++out_neg: ++ dput(h_dentry); ++ h_dentry = NULL; ++out: ++ return h_dentry; ++} ++ ++static int au_test_shwh(struct super_block *sb, const struct qstr *name) ++{ ++ if (unlikely(!au_opt_test(au_mntflags(sb), SHWH) ++ && !strncmp(name->name, AUFS_WH_PFX, AUFS_WH_PFX_LEN))) ++ return -EPERM; ++ return 0; ++} ++ ++/* ++ * returns the number of lower positive dentries, ++ * otherwise an error. ++ * can be called at unlinking with @type is zero. 
++ */ ++int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type, ++ struct nameidata *nd) ++{ ++ int npositive, err; ++ aufs_bindex_t bindex, btail, bdiropq; ++ unsigned char isdir; ++ struct qstr whname; ++ struct au_do_lookup_args args = { ++ .flags = 0, ++ .type = type, ++ .nd = nd ++ }; ++ const struct qstr *name = &dentry->d_name; ++ struct dentry *parent; ++ struct inode *inode; ++ ++ err = au_test_shwh(dentry->d_sb, name); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_wh_name_alloc(&whname, name); ++ if (unlikely(err)) ++ goto out; ++ ++ inode = dentry->d_inode; ++ isdir = !!(inode && S_ISDIR(inode->i_mode)); ++ if (!type) ++ au_fset_lkup(args.flags, ALLOW_NEG); ++ ++ npositive = 0; ++ parent = dget_parent(dentry); ++ btail = au_dbtaildir(parent); ++ for (bindex = bstart; bindex <= btail; bindex++) { ++ struct dentry *h_parent, *h_dentry; ++ struct inode *h_inode, *h_dir; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry) { ++ if (h_dentry->d_inode) ++ npositive++; ++ if (type != S_IFDIR) ++ break; ++ continue; ++ } ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent) ++ continue; ++ h_dir = h_parent->d_inode; ++ if (!h_dir || !S_ISDIR(h_dir->i_mode)) ++ continue; ++ ++ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); ++ h_dentry = au_do_lookup(h_parent, dentry, bindex, &whname, ++ &args); ++ mutex_unlock(&h_dir->i_mutex); ++ err = PTR_ERR(h_dentry); ++ if (IS_ERR(h_dentry)) ++ goto out_parent; ++ au_fclr_lkup(args.flags, ALLOW_NEG); ++ ++ if (au_dbwh(dentry) >= 0) ++ break; ++ if (!h_dentry) ++ continue; ++ h_inode = h_dentry->d_inode; ++ if (!h_inode) ++ continue; ++ npositive++; ++ if (!args.type) ++ args.type = h_inode->i_mode & S_IFMT; ++ if (args.type != S_IFDIR) ++ break; ++ else if (isdir) { ++ /* the type of lower may be different */ ++ bdiropq = au_dbdiropq(dentry); ++ if (bdiropq >= 0 && bdiropq <= bindex) ++ break; ++ } ++ } ++ ++ if (npositive) { ++ AuLabel(positive); ++ au_update_dbstart(dentry); ++ } ++ err = npositive; ++ if (unlikely(!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE) ++ && au_dbstart(dentry) < 0)) { ++ err = -EIO; ++ AuIOErr("both of real entry and whiteout found, %.*s, err %d\n", ++ AuDLNPair(dentry), err); ++ } ++ ++out_parent: ++ dput(parent); ++ kfree(whname.name); ++out: ++ return err; ++} ++ ++struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent, ++ struct au_branch *br) ++{ ++ struct dentry *dentry; ++ int wkq_err; ++ ++ if (!au_test_h_perm_sio(parent->d_inode, MAY_EXEC)) ++ dentry = au_lkup_one(name, parent, br, /*nd*/NULL); ++ else { ++ struct au_lkup_one_args args = { ++ .errp = &dentry, ++ .name = name, ++ .h_parent = parent, ++ .br = br, ++ .nd = NULL ++ }; ++ ++ wkq_err = au_wkq_wait(au_call_lkup_one, &args); ++ if (unlikely(wkq_err)) ++ dentry = ERR_PTR(wkq_err); ++ } ++ ++ return dentry; ++} ++ ++/* ++ * lookup @dentry on @bindex which should be negative. 
++ */ ++int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ int err; ++ struct dentry *parent, *h_parent, *h_dentry; ++ ++ parent = dget_parent(dentry); ++ h_parent = au_h_dptr(parent, bindex); ++ h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent, ++ au_sbr(dentry->d_sb, bindex)); ++ err = PTR_ERR(h_dentry); ++ if (IS_ERR(h_dentry)) ++ goto out; ++ if (unlikely(h_dentry->d_inode)) { ++ err = -EIO; ++ AuIOErr("%.*s should be negative on b%d.\n", ++ AuDLNPair(h_dentry), bindex); ++ dput(h_dentry); ++ goto out; ++ } ++ ++ err = 0; ++ if (bindex < au_dbstart(dentry)) ++ au_set_dbstart(dentry, bindex); ++ if (au_dbend(dentry) < bindex) ++ au_set_dbend(dentry, bindex); ++ au_set_h_dptr(dentry, bindex, h_dentry); ++ ++out: ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* subset of struct inode */ ++struct au_iattr { ++ unsigned long i_ino; ++ /* unsigned int i_nlink; */ ++ uid_t i_uid; ++ gid_t i_gid; ++ u64 i_version; ++/* ++ loff_t i_size; ++ blkcnt_t i_blocks; ++*/ ++ umode_t i_mode; ++}; ++ ++static void au_iattr_save(struct au_iattr *ia, struct inode *h_inode) ++{ ++ ia->i_ino = h_inode->i_ino; ++ /* ia->i_nlink = h_inode->i_nlink; */ ++ ia->i_uid = h_inode->i_uid; ++ ia->i_gid = h_inode->i_gid; ++ ia->i_version = h_inode->i_version; ++/* ++ ia->i_size = h_inode->i_size; ++ ia->i_blocks = h_inode->i_blocks; ++*/ ++ ia->i_mode = (h_inode->i_mode & S_IFMT); ++} ++ ++static int au_iattr_test(struct au_iattr *ia, struct inode *h_inode) ++{ ++ return ia->i_ino != h_inode->i_ino ++ /* || ia->i_nlink != h_inode->i_nlink */ ++ || ia->i_uid != h_inode->i_uid ++ || ia->i_gid != h_inode->i_gid ++ || ia->i_version != h_inode->i_version ++/* ++ || ia->i_size != h_inode->i_size ++ || ia->i_blocks != h_inode->i_blocks ++*/ ++ || ia->i_mode != (h_inode->i_mode & S_IFMT); ++} ++ ++static int au_h_verify_dentry(struct dentry *h_dentry, struct dentry *h_parent, ++ struct au_branch *br) ++{ ++ int err; ++ struct au_iattr ia; ++ struct inode *h_inode; ++ struct dentry *h_d; ++ struct super_block *h_sb; ++ ++ err = 0; ++ memset(&ia, -1, sizeof(ia)); ++ h_sb = h_dentry->d_sb; ++ h_inode = h_dentry->d_inode; ++ if (h_inode) ++ au_iattr_save(&ia, h_inode); ++ else if (au_test_nfs(h_sb) || au_test_fuse(h_sb)) ++ /* nfs d_revalidate may return 0 for negative dentry */ ++ /* fuse d_revalidate always return 0 for negative dentry */ ++ goto out; ++ ++ /* main purpose is namei.c:cached_lookup() and d_revalidate */ ++ h_d = au_lkup_one(&h_dentry->d_name, h_parent, br, /*nd*/NULL); ++ err = PTR_ERR(h_d); ++ if (IS_ERR(h_d)) ++ goto out; ++ ++ err = 0; ++ if (unlikely(h_d != h_dentry ++ || h_d->d_inode != h_inode ++ || (h_inode && au_iattr_test(&ia, h_inode)))) ++ err = au_busy_or_stale(); ++ dput(h_d); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, ++ struct dentry *h_parent, struct au_branch *br) ++{ ++ int err; ++ ++ err = 0; ++ if (udba == AuOpt_UDBA_REVAL ++ && !au_test_fs_remote(h_dentry->d_sb)) { ++ IMustLock(h_dir); ++ err = (h_dentry->d_parent->d_inode != h_dir); ++ } else if (udba != AuOpt_UDBA_NONE) ++ err = au_h_verify_dentry(h_dentry, h_parent, br); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_refresh_hdentry(struct dentry *dentry, struct dentry *parent) ++{ ++ int err; ++ aufs_bindex_t new_bindex, bindex, bend, bwh, bdiropq; ++ struct au_hdentry tmp, 
*p, *q; ++ struct au_dinfo *dinfo; ++ struct super_block *sb; ++ ++ DiMustWriteLock(dentry); ++ ++ sb = dentry->d_sb; ++ dinfo = au_di(dentry); ++ bend = dinfo->di_bend; ++ bwh = dinfo->di_bwh; ++ bdiropq = dinfo->di_bdiropq; ++ p = dinfo->di_hdentry + dinfo->di_bstart; ++ for (bindex = dinfo->di_bstart; bindex <= bend; bindex++, p++) { ++ if (!p->hd_dentry) ++ continue; ++ ++ new_bindex = au_br_index(sb, p->hd_id); ++ if (new_bindex == bindex) ++ continue; ++ ++ if (dinfo->di_bwh == bindex) ++ bwh = new_bindex; ++ if (dinfo->di_bdiropq == bindex) ++ bdiropq = new_bindex; ++ if (new_bindex < 0) { ++ au_hdput(p); ++ p->hd_dentry = NULL; ++ continue; ++ } ++ ++ /* swap two lower dentries, and loop again */ ++ q = dinfo->di_hdentry + new_bindex; ++ tmp = *q; ++ *q = *p; ++ *p = tmp; ++ if (tmp.hd_dentry) { ++ bindex--; ++ p--; ++ } ++ } ++ ++ dinfo->di_bwh = -1; ++ if (bwh >= 0 && bwh <= au_sbend(sb) && au_sbr_whable(sb, bwh)) ++ dinfo->di_bwh = bwh; ++ ++ dinfo->di_bdiropq = -1; ++ if (bdiropq >= 0 ++ && bdiropq <= au_sbend(sb) ++ && au_sbr_whable(sb, bdiropq)) ++ dinfo->di_bdiropq = bdiropq; ++ ++ err = -EIO; ++ dinfo->di_bstart = -1; ++ dinfo->di_bend = -1; ++ bend = au_dbend(parent); ++ p = dinfo->di_hdentry; ++ for (bindex = 0; bindex <= bend; bindex++, p++) ++ if (p->hd_dentry) { ++ dinfo->di_bstart = bindex; ++ break; ++ } ++ ++ if (dinfo->di_bstart >= 0) { ++ p = dinfo->di_hdentry + bend; ++ for (bindex = bend; bindex >= 0; bindex--, p--) ++ if (p->hd_dentry) { ++ dinfo->di_bend = bindex; ++ err = 0; ++ break; ++ } ++ } ++ ++ return err; ++} ++ ++static void au_do_hide(struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ inode = dentry->d_inode; ++ if (inode) { ++ if (!S_ISDIR(inode->i_mode)) { ++ if (inode->i_nlink && !d_unhashed(dentry)) ++ drop_nlink(inode); ++ } else { ++ clear_nlink(inode); ++ /* stop next lookup */ ++ inode->i_flags |= S_DEAD; ++ } ++ smp_mb(); /* necessary? */ ++ } ++ d_drop(dentry); ++} ++ ++static int au_hide_children(struct dentry *parent) ++{ ++ int err, i, j, ndentry; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry *dentry; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, parent, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ /* in reverse order */ ++ for (i = dpages.ndpage - 1; i >= 0; i--) { ++ dpage = dpages.dpages + i; ++ ndentry = dpage->ndentry; ++ for (j = ndentry - 1; j >= 0; j--) { ++ dentry = dpage->dentries[j]; ++ if (dentry != parent) ++ au_do_hide(dentry); ++ } ++ } ++ ++out_dpages: ++ au_dpages_free(&dpages); ++out: ++ return err; ++} ++ ++static void au_hide(struct dentry *dentry) ++{ ++ int err; ++ struct inode *inode; ++ ++ AuDbgDentry(dentry); ++ inode = dentry->d_inode; ++ if (inode && S_ISDIR(inode->i_mode)) { ++ /* shrink_dcache_parent(dentry); */ ++ err = au_hide_children(dentry); ++ if (unlikely(err)) ++ AuIOErr("%.*s, failed hiding children, ignored %d\n", ++ AuDLNPair(dentry), err); ++ } ++ au_do_hide(dentry); ++} ++ ++/* ++ * By adding a dirty branch, a cached dentry may be affected in various ways. 
++ * ++ * a dirty branch is added ++ * - on the top of layers ++ * - in the middle of layers ++ * - to the bottom of layers ++ * ++ * on the added branch there exists ++ * - a whiteout ++ * - a diropq ++ * - a same named entry ++ * + exist ++ * * negative --> positive ++ * * positive --> positive ++ * - type is unchanged ++ * - type is changed ++ * + doesn't exist ++ * * negative --> negative ++ * * positive --> negative (rejected by au_br_del() for non-dir case) ++ * - none ++ */ ++static int au_refresh_by_dinfo(struct dentry *dentry, struct au_dinfo *dinfo, ++ struct au_dinfo *tmp) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ struct { ++ struct dentry *dentry; ++ struct inode *inode; ++ mode_t mode; ++ } orig_h, tmp_h; ++ struct au_hdentry *hd; ++ struct inode *inode, *h_inode; ++ struct dentry *h_dentry; ++ ++ err = 0; ++ AuDebugOn(dinfo->di_bstart < 0); ++ orig_h.dentry = dinfo->di_hdentry[dinfo->di_bstart].hd_dentry; ++ orig_h.inode = orig_h.dentry->d_inode; ++ orig_h.mode = 0; ++ if (orig_h.inode) ++ orig_h.mode = orig_h.inode->i_mode & S_IFMT; ++ memset(&tmp_h, 0, sizeof(tmp_h)); ++ if (tmp->di_bstart >= 0) { ++ tmp_h.dentry = tmp->di_hdentry[tmp->di_bstart].hd_dentry; ++ tmp_h.inode = tmp_h.dentry->d_inode; ++ if (tmp_h.inode) ++ tmp_h.mode = tmp_h.inode->i_mode & S_IFMT; ++ } ++ ++ inode = dentry->d_inode; ++ if (!orig_h.inode) { ++ AuDbg("nagative originally\n"); ++ if (inode) { ++ au_hide(dentry); ++ goto out; ++ } ++ AuDebugOn(inode); ++ AuDebugOn(dinfo->di_bstart != dinfo->di_bend); ++ AuDebugOn(dinfo->di_bdiropq != -1); ++ ++ if (!tmp_h.inode) { ++ AuDbg("negative --> negative\n"); ++ /* should have only one negative lower */ ++ if (tmp->di_bstart >= 0 ++ && tmp->di_bstart < dinfo->di_bstart) { ++ AuDebugOn(tmp->di_bstart != tmp->di_bend); ++ AuDebugOn(dinfo->di_bstart != dinfo->di_bend); ++ au_set_h_dptr(dentry, dinfo->di_bstart, NULL); ++ au_di_cp(dinfo, tmp); ++ hd = tmp->di_hdentry + tmp->di_bstart; ++ au_set_h_dptr(dentry, tmp->di_bstart, ++ dget(hd->hd_dentry)); ++ } ++ au_dbg_verify_dinode(dentry); ++ } else { ++ AuDbg("negative --> positive\n"); ++ /* ++ * similar to the behaviour of creating with bypassing ++ * aufs. ++ * unhash it in order to force an error in the ++ * succeeding create operation. ++ * we should not set S_DEAD here. ++ */ ++ d_drop(dentry); ++ /* au_di_swap(tmp, dinfo); */ ++ au_dbg_verify_dinode(dentry); ++ } ++ } else { ++ AuDbg("positive originally\n"); ++ /* inode may be NULL */ ++ AuDebugOn(inode && (inode->i_mode & S_IFMT) != orig_h.mode); ++ if (!tmp_h.inode) { ++ AuDbg("positive --> negative\n"); ++ /* or bypassing aufs */ ++ au_hide(dentry); ++ if (tmp->di_bwh >= 0 && tmp->di_bwh <= dinfo->di_bstart) ++ dinfo->di_bwh = tmp->di_bwh; ++ if (inode) ++ err = au_refresh_hinode_self(inode); ++ au_dbg_verify_dinode(dentry); ++ } else if (orig_h.mode == tmp_h.mode) { ++ AuDbg("positive --> positive, same type\n"); ++ if (!S_ISDIR(orig_h.mode) ++ && dinfo->di_bstart > tmp->di_bstart) { ++ /* ++ * similar to the behaviour of removing and ++ * creating. 
++ */ ++ au_hide(dentry); ++ if (inode) ++ err = au_refresh_hinode_self(inode); ++ au_dbg_verify_dinode(dentry); ++ } else { ++ /* fill empty slots */ ++ if (dinfo->di_bstart > tmp->di_bstart) ++ dinfo->di_bstart = tmp->di_bstart; ++ if (dinfo->di_bend < tmp->di_bend) ++ dinfo->di_bend = tmp->di_bend; ++ dinfo->di_bwh = tmp->di_bwh; ++ dinfo->di_bdiropq = tmp->di_bdiropq; ++ hd = tmp->di_hdentry; ++ bend = dinfo->di_bend; ++ for (bindex = tmp->di_bstart; bindex <= bend; ++ bindex++) { ++ if (au_h_dptr(dentry, bindex)) ++ continue; ++ h_dentry = hd[bindex].hd_dentry; ++ if (!h_dentry) ++ continue; ++ h_inode = h_dentry->d_inode; ++ AuDebugOn(!h_inode); ++ AuDebugOn(orig_h.mode ++ != (h_inode->i_mode ++ & S_IFMT)); ++ au_set_h_dptr(dentry, bindex, ++ dget(h_dentry)); ++ } ++ err = au_refresh_hinode(inode, dentry); ++ au_dbg_verify_dinode(dentry); ++ } ++ } else { ++ AuDbg("positive --> positive, different type\n"); ++ /* similar to the behaviour of removing and creating */ ++ au_hide(dentry); ++ if (inode) ++ err = au_refresh_hinode_self(inode); ++ au_dbg_verify_dinode(dentry); ++ } ++ } ++ ++out: ++ return err; ++} ++ ++int au_refresh_dentry(struct dentry *dentry, struct dentry *parent) ++{ ++ int err, ebrange; ++ unsigned int sigen; ++ struct au_dinfo *dinfo, *tmp; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ DiMustWriteLock(dentry); ++ AuDebugOn(IS_ROOT(dentry)); ++ AuDebugOn(!parent->d_inode); ++ ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ sigen = au_sigen(sb); ++ err = au_digen_test(parent, sigen); ++ if (unlikely(err)) ++ goto out; ++ ++ dinfo = au_di(dentry); ++ err = au_di_realloc(dinfo, au_sbend(sb) + 1); ++ if (unlikely(err)) ++ goto out; ++ ebrange = au_dbrange_test(dentry); ++ if (!ebrange) ++ ebrange = au_do_refresh_hdentry(dentry, parent); ++ ++ if (d_unhashed(dentry) || ebrange) { ++ AuDebugOn(au_dbstart(dentry) < 0 && au_dbend(dentry) >= 0); ++ if (inode) ++ err = au_refresh_hinode_self(inode); ++ au_dbg_verify_dinode(dentry); ++ if (!err) ++ goto out_dgen; /* success */ ++ goto out; ++ } ++ ++ /* temporary dinfo */ ++ AuDbgDentry(dentry); ++ err = -ENOMEM; ++ tmp = au_di_alloc(sb, AuLsc_DI_TMP); ++ if (unlikely(!tmp)) ++ goto out; ++ au_di_swap(tmp, dinfo); ++ /* returns the number of positive dentries */ ++ /* ++ * if current working dir is removed, it returns an error. ++ * but the dentry is legal. 
++ */ ++ err = au_lkup_dentry(dentry, /*bstart*/0, /*type*/0, /*nd*/NULL); ++ AuDbgDentry(dentry); ++ au_di_swap(tmp, dinfo); ++ if (err == -ENOENT) ++ err = 0; ++ if (err >= 0) { ++ /* compare/refresh by dinfo */ ++ AuDbgDentry(dentry); ++ err = au_refresh_by_dinfo(dentry, dinfo, tmp); ++ au_dbg_verify_dinode(dentry); ++ AuTraceErr(err); ++ } ++ au_rw_write_unlock(&tmp->di_rwsem); ++ au_di_free(tmp); ++ if (unlikely(err)) ++ goto out; ++ ++out_dgen: ++ au_update_digen(dentry); ++out: ++ if (unlikely(err && !(dentry->d_flags & DCACHE_NFSFS_RENAMED))) { ++ AuIOErr("failed refreshing %.*s, %d\n", ++ AuDLNPair(dentry), err); ++ AuDbgDentry(dentry); ++ } ++ AuTraceErr(err); ++ return err; ++} ++ ++static noinline_for_stack ++int au_do_h_d_reval(struct dentry *h_dentry, struct nameidata *nd, ++ struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ int err, valid; ++ int (*reval)(struct dentry *, struct nameidata *); ++ ++ err = 0; ++ if (!(h_dentry->d_flags & DCACHE_OP_REVALIDATE)) ++ goto out; ++ reval = h_dentry->d_op->d_revalidate; ++ ++ AuDbg("b%d\n", bindex); ++ if (au_test_fs_null_nd(h_dentry->d_sb)) ++ /* it may return tri-state */ ++ valid = reval(h_dentry, NULL); ++ else { ++ struct nameidata h_nd; ++ int locked; ++ struct dentry *parent; ++ ++ au_h_nd(&h_nd, nd); ++ parent = nd->path.dentry; ++ locked = (nd && nd->path.dentry != dentry); ++ if (locked) ++ di_read_lock_parent(parent, AuLock_IR); ++ BUG_ON(bindex > au_dbend(parent)); ++ h_nd.path.dentry = au_h_dptr(parent, bindex); ++ BUG_ON(!h_nd.path.dentry); ++ h_nd.path.mnt = au_sbr(parent->d_sb, bindex)->br_mnt; ++ path_get(&h_nd.path); ++ valid = reval(h_dentry, &h_nd); ++ path_put(&h_nd.path); ++ if (locked) ++ di_read_unlock(parent, AuLock_IR); ++ } ++ ++ if (unlikely(valid < 0)) ++ err = valid; ++ else if (!valid) ++ err = -EINVAL; ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* todo: remove this */ ++static int h_d_revalidate(struct dentry *dentry, struct inode *inode, ++ struct nameidata *nd, int do_udba) ++{ ++ int err; ++ umode_t mode, h_mode; ++ aufs_bindex_t bindex, btail, bstart, ibs, ibe; ++ unsigned char plus, unhashed, is_root, h_plus; ++ struct inode *h_inode, *h_cached_inode; ++ struct dentry *h_dentry; ++ struct qstr *name, *h_name; ++ ++ err = 0; ++ plus = 0; ++ mode = 0; ++ ibs = -1; ++ ibe = -1; ++ unhashed = !!d_unhashed(dentry); ++ is_root = !!IS_ROOT(dentry); ++ name = &dentry->d_name; ++ ++ /* ++ * Theoretically, REVAL test should be unnecessary in case of ++ * {FS,I}NOTIFY. ++ * But {fs,i}notify doesn't fire some necessary events, ++ * IN_ATTRIB for atime/nlink/pageio ++ * IN_DELETE for NFS dentry ++ * Let's do REVAL test too. 
++ */ ++ if (do_udba && inode) { ++ mode = (inode->i_mode & S_IFMT); ++ plus = (inode->i_nlink > 0); ++ ibs = au_ibstart(inode); ++ ibe = au_ibend(inode); ++ } ++ ++ bstart = au_dbstart(dentry); ++ btail = bstart; ++ if (inode && S_ISDIR(inode->i_mode)) ++ btail = au_dbtaildir(dentry); ++ for (bindex = bstart; bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ ++ AuDbg("b%d, %.*s\n", bindex, AuDLNPair(h_dentry)); ++ spin_lock(&h_dentry->d_lock); ++ h_name = &h_dentry->d_name; ++ if (unlikely(do_udba ++ && !is_root ++ && (unhashed != !!d_unhashed(h_dentry) ++ || name->len != h_name->len ++ || memcmp(name->name, h_name->name, name->len)) ++ )) { ++ AuDbg("unhash 0x%x 0x%x, %.*s %.*s\n", ++ unhashed, d_unhashed(h_dentry), ++ AuDLNPair(dentry), AuDLNPair(h_dentry)); ++ spin_unlock(&h_dentry->d_lock); ++ goto err; ++ } ++ spin_unlock(&h_dentry->d_lock); ++ ++ err = au_do_h_d_reval(h_dentry, nd, dentry, bindex); ++ if (unlikely(err)) ++ /* do not goto err, to keep the errno */ ++ break; ++ ++ /* todo: plink too? */ ++ if (!do_udba) ++ continue; ++ ++ /* UDBA tests */ ++ h_inode = h_dentry->d_inode; ++ if (unlikely(!!inode != !!h_inode)) ++ goto err; ++ ++ h_plus = plus; ++ h_mode = mode; ++ h_cached_inode = h_inode; ++ if (h_inode) { ++ h_mode = (h_inode->i_mode & S_IFMT); ++ h_plus = (h_inode->i_nlink > 0); ++ } ++ if (inode && ibs <= bindex && bindex <= ibe) ++ h_cached_inode = au_h_iptr(inode, bindex); ++ ++ if (unlikely(plus != h_plus ++ || mode != h_mode ++ || h_cached_inode != h_inode)) ++ goto err; ++ continue; ++ ++ err: ++ err = -EINVAL; ++ break; ++ } ++ ++ return err; ++} ++ ++/* todo: consolidate with do_refresh() and au_reval_for_attr() */ ++static int simple_reval_dpath(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ struct dentry *parent; ++ ++ if (!au_digen_test(dentry, sigen)) ++ return 0; ++ ++ parent = dget_parent(dentry); ++ di_read_lock_parent(parent, AuLock_IR); ++ AuDebugOn(au_digen_test(parent, sigen)); ++ au_dbg_verify_gen(parent, sigen); ++ err = au_refresh_dentry(dentry, parent); ++ di_read_unlock(parent, AuLock_IR); ++ dput(parent); ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_reval_dpath(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ struct dentry *d, *parent; ++ struct inode *inode; ++ ++ if (!au_ftest_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR)) ++ return simple_reval_dpath(dentry, sigen); ++ ++ /* slow loop, keep it simple and stupid */ ++ /* cf: au_cpup_dirs() */ ++ err = 0; ++ parent = NULL; ++ while (au_digen_test(dentry, sigen)) { ++ d = dentry; ++ while (1) { ++ dput(parent); ++ parent = dget_parent(d); ++ if (!au_digen_test(parent, sigen)) ++ break; ++ d = parent; ++ } ++ ++ inode = d->d_inode; ++ if (d != dentry) ++ di_write_lock_child2(d); ++ ++ /* someone might update our dentry while we were sleeping */ ++ if (au_digen_test(d, sigen)) { ++ /* ++ * todo: consolidate with simple_reval_dpath(), ++ * do_refresh() and au_reval_for_attr(). ++ */ ++ di_read_lock_parent(parent, AuLock_IR); ++ err = au_refresh_dentry(d, parent); ++ di_read_unlock(parent, AuLock_IR); ++ } ++ ++ if (d != dentry) ++ di_write_unlock(d); ++ dput(parent); ++ if (unlikely(err)) ++ break; ++ } ++ ++ return err; ++} ++ ++/* ++ * if valid returns 1, otherwise 0. ++ */ ++static int aufs_d_revalidate(struct dentry *dentry, struct nameidata *nd) ++{ ++ int valid, err; ++ unsigned int sigen; ++ unsigned char do_udba; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ /* todo: support rcu-walk? 
*/ ++ if (nd && (nd->flags & LOOKUP_RCU)) ++ return -ECHILD; ++ ++ valid = 0; ++ if (unlikely(!au_di(dentry))) ++ goto out; ++ ++ inode = dentry->d_inode; ++ if (inode && is_bad_inode(inode)) ++ goto out; ++ ++ valid = 1; ++ sb = dentry->d_sb; ++ /* ++ * todo: very ugly ++ * i_mutex of parent dir may be held, ++ * but we should not return 'invalid' due to busy. ++ */ ++ err = aufs_read_lock(dentry, AuLock_FLUSH | AuLock_DW | AuLock_NOPLM); ++ if (unlikely(err)) { ++ valid = err; ++ AuTraceErr(err); ++ goto out; ++ } ++ if (unlikely(au_dbrange_test(dentry))) { ++ err = -EINVAL; ++ AuTraceErr(err); ++ goto out_dgrade; ++ } ++ ++ sigen = au_sigen(sb); ++ if (au_digen_test(dentry, sigen)) { ++ AuDebugOn(IS_ROOT(dentry)); ++ err = au_reval_dpath(dentry, sigen); ++ if (unlikely(err)) { ++ AuTraceErr(err); ++ goto out_dgrade; ++ } ++ } ++ di_downgrade_lock(dentry, AuLock_IR); ++ ++ err = -EINVAL; ++ if (inode && (IS_DEADDIR(inode) || !inode->i_nlink)) ++ goto out_inval; ++ ++ do_udba = !au_opt_test(au_mntflags(sb), UDBA_NONE); ++ if (do_udba && inode) { ++ aufs_bindex_t bstart = au_ibstart(inode); ++ struct inode *h_inode; ++ ++ if (bstart >= 0) { ++ h_inode = au_h_iptr(inode, bstart); ++ if (h_inode && au_test_higen(inode, h_inode)) ++ goto out_inval; ++ } ++ } ++ ++ err = h_d_revalidate(dentry, inode, nd, do_udba); ++ if (unlikely(!err && do_udba && au_dbstart(dentry) < 0)) { ++ err = -EIO; ++ AuDbg("both of real entry and whiteout found, %.*s, err %d\n", ++ AuDLNPair(dentry), err); ++ } ++ goto out_inval; ++ ++out_dgrade: ++ di_downgrade_lock(dentry, AuLock_IR); ++out_inval: ++ aufs_read_unlock(dentry, AuLock_IR); ++ AuTraceErr(err); ++ valid = !err; ++out: ++ if (!valid) { ++ AuDbg("%.*s invalid, %d\n", AuDLNPair(dentry), valid); ++ d_drop(dentry); ++ } ++ return valid; ++} ++ ++static void aufs_d_release(struct dentry *dentry) ++{ ++ if (au_di(dentry)) { ++ au_di_fin(dentry); ++ au_hn_di_reinit(dentry); ++ } ++} ++ ++const struct dentry_operations aufs_dop = { ++ .d_revalidate = aufs_d_revalidate, ++ .d_release = aufs_d_release ++}; +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/dentry.h 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,237 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * lookup and dentry operations ++ */ ++ ++#ifndef __AUFS_DENTRY_H__ ++#define __AUFS_DENTRY_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include "rwsem.h" ++ ++struct au_hdentry { ++ struct dentry *hd_dentry; ++ aufs_bindex_t hd_id; ++}; ++ ++struct au_dinfo { ++ atomic_t di_generation; ++ ++ struct au_rwsem di_rwsem; ++ aufs_bindex_t di_bstart, di_bend, di_bwh, di_bdiropq; ++ struct au_hdentry *di_hdentry; ++} ____cacheline_aligned_in_smp; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dentry.c */ ++extern const struct dentry_operations aufs_dop; ++struct au_branch; ++struct dentry *au_lkup_one(struct qstr *name, struct dentry *h_parent, ++ struct au_branch *br, struct nameidata *nd); ++struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent, ++ struct au_branch *br); ++int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, ++ struct dentry *h_parent, struct au_branch *br); ++ ++int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type, ++ struct nameidata *nd); ++int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex); ++int au_refresh_dentry(struct dentry *dentry, struct dentry *parent); ++int au_reval_dpath(struct dentry *dentry, unsigned int sigen); ++ ++/* dinfo.c */ ++void au_di_init_once(void *_di); ++struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc); ++void au_di_free(struct au_dinfo *dinfo); ++void au_di_swap(struct au_dinfo *a, struct au_dinfo *b); ++void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src); ++int au_di_init(struct dentry *dentry); ++void au_di_fin(struct dentry *dentry); ++int au_di_realloc(struct au_dinfo *dinfo, int nbr); ++ ++void di_read_lock(struct dentry *d, int flags, unsigned int lsc); ++void di_read_unlock(struct dentry *d, int flags); ++void di_downgrade_lock(struct dentry *d, int flags); ++void di_write_lock(struct dentry *d, unsigned int lsc); ++void di_write_unlock(struct dentry *d); ++void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir); ++void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir); ++void di_write_unlock2(struct dentry *d1, struct dentry *d2); ++ ++struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex); ++struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex); ++aufs_bindex_t au_dbtail(struct dentry *dentry); ++aufs_bindex_t au_dbtaildir(struct dentry *dentry); ++ ++void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_dentry); ++int au_digen_test(struct dentry *dentry, unsigned int sigen); ++int au_dbrange_test(struct dentry *dentry); ++void au_update_digen(struct dentry *dentry); ++void au_update_dbrange(struct dentry *dentry, int do_put_zero); ++void au_update_dbstart(struct dentry *dentry); ++void au_update_dbend(struct dentry *dentry); ++int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_dinfo *au_di(struct dentry *dentry) ++{ ++ return dentry->d_fsdata; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock subclass for dinfo */ ++enum { ++ AuLsc_DI_CHILD, /* child first */ ++ AuLsc_DI_CHILD2, /* 
rename(2), link(2), and cpup at hnotify */ ++ AuLsc_DI_CHILD3, /* copyup dirs */ ++ AuLsc_DI_PARENT, ++ AuLsc_DI_PARENT2, ++ AuLsc_DI_PARENT3, ++ AuLsc_DI_TMP /* temp for replacing dinfo */ ++}; ++ ++/* ++ * di_read_lock_child, di_write_lock_child, ++ * di_read_lock_child2, di_write_lock_child2, ++ * di_read_lock_child3, di_write_lock_child3, ++ * di_read_lock_parent, di_write_lock_parent, ++ * di_read_lock_parent2, di_write_lock_parent2, ++ * di_read_lock_parent3, di_write_lock_parent3, ++ */ ++#define AuReadLockFunc(name, lsc) \ ++static inline void di_read_lock_##name(struct dentry *d, int flags) \ ++{ di_read_lock(d, flags, AuLsc_DI_##lsc); } ++ ++#define AuWriteLockFunc(name, lsc) \ ++static inline void di_write_lock_##name(struct dentry *d) \ ++{ di_write_lock(d, AuLsc_DI_##lsc); } ++ ++#define AuRWLockFuncs(name, lsc) \ ++ AuReadLockFunc(name, lsc) \ ++ AuWriteLockFunc(name, lsc) ++ ++AuRWLockFuncs(child, CHILD); ++AuRWLockFuncs(child2, CHILD2); ++AuRWLockFuncs(child3, CHILD3); ++AuRWLockFuncs(parent, PARENT); ++AuRWLockFuncs(parent2, PARENT2); ++AuRWLockFuncs(parent3, PARENT3); ++ ++#undef AuReadLockFunc ++#undef AuWriteLockFunc ++#undef AuRWLockFuncs ++ ++#define DiMustNoWaiters(d) AuRwMustNoWaiters(&au_di(d)->di_rwsem) ++#define DiMustAnyLock(d) AuRwMustAnyLock(&au_di(d)->di_rwsem) ++#define DiMustWriteLock(d) AuRwMustWriteLock(&au_di(d)->di_rwsem) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: memory barrier? */ ++static inline unsigned int au_digen(struct dentry *d) ++{ ++ return atomic_read(&au_di(d)->di_generation); ++} ++ ++static inline void au_h_dentry_init(struct au_hdentry *hdentry) ++{ ++ hdentry->hd_dentry = NULL; ++} ++ ++static inline void au_hdput(struct au_hdentry *hd) ++{ ++ if (hd) ++ dput(hd->hd_dentry); ++} ++ ++static inline aufs_bindex_t au_dbstart(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bstart; ++} ++ ++static inline aufs_bindex_t au_dbend(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bend; ++} ++ ++static inline aufs_bindex_t au_dbwh(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bwh; ++} ++ ++static inline aufs_bindex_t au_dbdiropq(struct dentry *dentry) ++{ ++ DiMustAnyLock(dentry); ++ return au_di(dentry)->di_bdiropq; ++} ++ ++/* todo: hard/soft set? 
*/ ++static inline void au_set_dbstart(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ au_di(dentry)->di_bstart = bindex; ++} ++ ++static inline void au_set_dbend(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ au_di(dentry)->di_bend = bindex; ++} ++ ++static inline void au_set_dbwh(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ /* dbwh can be outside of bstart - bend range */ ++ au_di(dentry)->di_bwh = bindex; ++} ++ ++static inline void au_set_dbdiropq(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ DiMustWriteLock(dentry); ++ au_di(dentry)->di_bdiropq = bindex; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_HNOTIFY ++static inline void au_digen_dec(struct dentry *d) ++{ ++ atomic_dec(&au_di(d)->di_generation); ++} ++ ++static inline void au_hn_di_reinit(struct dentry *dentry) ++{ ++ dentry->d_fsdata = NULL; ++} ++#else ++AuStubVoid(au_hn_di_reinit, struct dentry *dentry __maybe_unused) ++#endif /* CONFIG_AUFS_HNOTIFY */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DENTRY_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/dinfo.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,543 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * dentry private data ++ */ ++ ++#include "aufs.h" ++ ++void au_di_init_once(void *_dinfo) ++{ ++ struct au_dinfo *dinfo = _dinfo; ++ static struct lock_class_key aufs_di; ++ ++ au_rw_init(&dinfo->di_rwsem); ++ au_rw_class(&dinfo->di_rwsem, &aufs_di); ++} ++ ++struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc) ++{ ++ struct au_dinfo *dinfo; ++ int nbr, i; ++ ++ dinfo = au_cache_alloc_dinfo(); ++ if (unlikely(!dinfo)) ++ goto out; ++ ++ nbr = au_sbend(sb) + 1; ++ if (nbr <= 0) ++ nbr = 1; ++ dinfo->di_hdentry = kcalloc(nbr, sizeof(*dinfo->di_hdentry), GFP_NOFS); ++ if (dinfo->di_hdentry) { ++ au_rw_write_lock_nested(&dinfo->di_rwsem, lsc); ++ dinfo->di_bstart = -1; ++ dinfo->di_bend = -1; ++ dinfo->di_bwh = -1; ++ dinfo->di_bdiropq = -1; ++ for (i = 0; i < nbr; i++) ++ dinfo->di_hdentry[i].hd_id = -1; ++ goto out; ++ } ++ ++ au_cache_free_dinfo(dinfo); ++ dinfo = NULL; ++ ++out: ++ return dinfo; ++} ++ ++void au_di_free(struct au_dinfo *dinfo) ++{ ++ struct au_hdentry *p; ++ aufs_bindex_t bend, bindex; ++ ++ /* dentry may not be revalidated */ ++ bindex = dinfo->di_bstart; ++ if (bindex >= 0) { ++ bend = dinfo->di_bend; ++ p = dinfo->di_hdentry + bindex; ++ while (bindex++ <= bend) ++ au_hdput(p++); ++ } ++ kfree(dinfo->di_hdentry); ++ au_cache_free_dinfo(dinfo); ++} ++ ++void au_di_swap(struct au_dinfo *a, struct au_dinfo *b) ++{ ++ struct au_hdentry *p; ++ aufs_bindex_t bi; ++ ++ AuRwMustWriteLock(&a->di_rwsem); ++ AuRwMustWriteLock(&b->di_rwsem); ++ ++#define DiSwap(v, name) \ ++ do { \ ++ v = a->di_##name; \ ++ a->di_##name = b->di_##name; \ ++ b->di_##name = v; \ ++ } while (0) ++ ++ DiSwap(p, hdentry); ++ DiSwap(bi, bstart); ++ DiSwap(bi, bend); ++ DiSwap(bi, bwh); ++ DiSwap(bi, bdiropq); ++ /* smp_mb(); */ ++ ++#undef DiSwap ++} ++ ++void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src) ++{ ++ AuRwMustWriteLock(&dst->di_rwsem); ++ AuRwMustWriteLock(&src->di_rwsem); ++ ++ dst->di_bstart = src->di_bstart; ++ dst->di_bend = src->di_bend; ++ dst->di_bwh = src->di_bwh; ++ dst->di_bdiropq = src->di_bdiropq; ++ /* smp_mb(); */ ++} ++ ++int au_di_init(struct dentry *dentry) ++{ ++ int err; ++ struct super_block *sb; ++ struct au_dinfo *dinfo; ++ ++ err = 0; ++ sb = dentry->d_sb; ++ dinfo = au_di_alloc(sb, AuLsc_DI_CHILD); ++ if (dinfo) { ++ atomic_set(&dinfo->di_generation, au_sigen(sb)); ++ /* smp_mb(); */ /* atomic_set */ ++ dentry->d_fsdata = dinfo; ++ } else ++ err = -ENOMEM; ++ ++ return err; ++} ++ ++void au_di_fin(struct dentry *dentry) ++{ ++ struct au_dinfo *dinfo; ++ ++ dinfo = au_di(dentry); ++ AuRwDestroy(&dinfo->di_rwsem); ++ au_di_free(dinfo); ++} ++ ++int au_di_realloc(struct au_dinfo *dinfo, int nbr) ++{ ++ int err, sz; ++ struct au_hdentry *hdp; ++ ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ err = -ENOMEM; ++ sz = sizeof(*hdp) * (dinfo->di_bend + 1); ++ if (!sz) ++ sz = sizeof(*hdp); ++ hdp = au_kzrealloc(dinfo->di_hdentry, sz, sizeof(*hdp) * nbr, GFP_NOFS); ++ if (hdp) { ++ dinfo->di_hdentry = hdp; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void do_ii_write_lock(struct inode *inode, unsigned int lsc) ++{ ++ switch (lsc) { ++ case AuLsc_DI_CHILD: ++ ii_write_lock_child(inode); ++ break; ++ case AuLsc_DI_CHILD2: ++ 
ii_write_lock_child2(inode); ++ break; ++ case AuLsc_DI_CHILD3: ++ ii_write_lock_child3(inode); ++ break; ++ case AuLsc_DI_PARENT: ++ ii_write_lock_parent(inode); ++ break; ++ case AuLsc_DI_PARENT2: ++ ii_write_lock_parent2(inode); ++ break; ++ case AuLsc_DI_PARENT3: ++ ii_write_lock_parent3(inode); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static void do_ii_read_lock(struct inode *inode, unsigned int lsc) ++{ ++ switch (lsc) { ++ case AuLsc_DI_CHILD: ++ ii_read_lock_child(inode); ++ break; ++ case AuLsc_DI_CHILD2: ++ ii_read_lock_child2(inode); ++ break; ++ case AuLsc_DI_CHILD3: ++ ii_read_lock_child3(inode); ++ break; ++ case AuLsc_DI_PARENT: ++ ii_read_lock_parent(inode); ++ break; ++ case AuLsc_DI_PARENT2: ++ ii_read_lock_parent2(inode); ++ break; ++ case AuLsc_DI_PARENT3: ++ ii_read_lock_parent3(inode); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++void di_read_lock(struct dentry *d, int flags, unsigned int lsc) ++{ ++ au_rw_read_lock_nested(&au_di(d)->di_rwsem, lsc); ++ if (d->d_inode) { ++ if (au_ftest_lock(flags, IW)) ++ do_ii_write_lock(d->d_inode, lsc); ++ else if (au_ftest_lock(flags, IR)) ++ do_ii_read_lock(d->d_inode, lsc); ++ } ++} ++ ++void di_read_unlock(struct dentry *d, int flags) ++{ ++ if (d->d_inode) { ++ if (au_ftest_lock(flags, IW)) { ++ au_dbg_verify_dinode(d); ++ ii_write_unlock(d->d_inode); ++ } else if (au_ftest_lock(flags, IR)) { ++ au_dbg_verify_dinode(d); ++ ii_read_unlock(d->d_inode); ++ } ++ } ++ au_rw_read_unlock(&au_di(d)->di_rwsem); ++} ++ ++void di_downgrade_lock(struct dentry *d, int flags) ++{ ++ if (d->d_inode && au_ftest_lock(flags, IR)) ++ ii_downgrade_lock(d->d_inode); ++ au_rw_dgrade_lock(&au_di(d)->di_rwsem); ++} ++ ++void di_write_lock(struct dentry *d, unsigned int lsc) ++{ ++ au_rw_write_lock_nested(&au_di(d)->di_rwsem, lsc); ++ if (d->d_inode) ++ do_ii_write_lock(d->d_inode, lsc); ++} ++ ++void di_write_unlock(struct dentry *d) ++{ ++ au_dbg_verify_dinode(d); ++ if (d->d_inode) ++ ii_write_unlock(d->d_inode); ++ au_rw_write_unlock(&au_di(d)->di_rwsem); ++} ++ ++void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir) ++{ ++ AuDebugOn(d1 == d2 ++ || d1->d_inode == d2->d_inode ++ || d1->d_sb != d2->d_sb); ++ ++ if (isdir && au_test_subdir(d1, d2)) { ++ di_write_lock_child(d1); ++ di_write_lock_child2(d2); ++ } else { ++ /* there should be no races */ ++ di_write_lock_child(d2); ++ di_write_lock_child2(d1); ++ } ++} ++ ++void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir) ++{ ++ AuDebugOn(d1 == d2 ++ || d1->d_inode == d2->d_inode ++ || d1->d_sb != d2->d_sb); ++ ++ if (isdir && au_test_subdir(d1, d2)) { ++ di_write_lock_parent(d1); ++ di_write_lock_parent2(d2); ++ } else { ++ /* there should be no races */ ++ di_write_lock_parent(d2); ++ di_write_lock_parent2(d1); ++ } ++} ++ ++void di_write_unlock2(struct dentry *d1, struct dentry *d2) ++{ ++ di_write_unlock(d1); ++ if (d1->d_inode == d2->d_inode) ++ au_rw_write_unlock(&au_di(d2)->di_rwsem); ++ else ++ di_write_unlock(d2); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ struct dentry *d; ++ ++ DiMustAnyLock(dentry); ++ ++ if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry)) ++ return NULL; ++ AuDebugOn(bindex < 0); ++ d = au_di(dentry)->di_hdentry[0 + bindex].hd_dentry; ++ AuDebugOn(d && d->d_count <= 0); ++ return d; ++} ++ ++/* ++ * extended version of au_h_dptr(). 
++ * returns a hashed and positive h_dentry in bindex, NULL, or error. ++ */ ++struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ struct dentry *h_dentry; ++ struct inode *inode, *h_inode; ++ ++ inode = dentry->d_inode; ++ AuDebugOn(!inode); ++ ++ h_dentry = NULL; ++ if (au_dbstart(dentry) <= bindex ++ && bindex <= au_dbend(dentry)) ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && !au_d_hashed_positive(h_dentry)) { ++ dget(h_dentry); ++ goto out; /* success */ ++ } ++ ++ AuDebugOn(bindex < au_ibstart(inode)); ++ AuDebugOn(au_ibend(inode) < bindex); ++ h_inode = au_h_iptr(inode, bindex); ++ h_dentry = d_find_alias(h_inode); ++ if (h_dentry) { ++ if (!IS_ERR(h_dentry)) { ++ if (!au_d_hashed_positive(h_dentry)) ++ goto out; /* success */ ++ dput(h_dentry); ++ } else ++ goto out; ++ } ++ ++ if (au_opt_test(au_mntflags(dentry->d_sb), PLINK)) { ++ h_dentry = au_plink_lkup(inode, bindex); ++ AuDebugOn(!h_dentry); ++ if (!IS_ERR(h_dentry)) { ++ if (!au_d_hashed_positive(h_dentry)) ++ goto out; /* success */ ++ dput(h_dentry); ++ h_dentry = NULL; ++ } ++ } ++ ++out: ++ AuDbgDentry(h_dentry); ++ return h_dentry; ++} ++ ++aufs_bindex_t au_dbtail(struct dentry *dentry) ++{ ++ aufs_bindex_t bend, bwh; ++ ++ bend = au_dbend(dentry); ++ if (0 <= bend) { ++ bwh = au_dbwh(dentry); ++ if (!bwh) ++ return bwh; ++ if (0 < bwh && bwh < bend) ++ return bwh - 1; ++ } ++ return bend; ++} ++ ++aufs_bindex_t au_dbtaildir(struct dentry *dentry) ++{ ++ aufs_bindex_t bend, bopq; ++ ++ bend = au_dbtail(dentry); ++ if (0 <= bend) { ++ bopq = au_dbdiropq(dentry); ++ if (0 <= bopq && bopq < bend) ++ bend = bopq; ++ } ++ return bend; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_dentry) ++{ ++ struct au_hdentry *hd = au_di(dentry)->di_hdentry + bindex; ++ struct au_branch *br; ++ ++ DiMustWriteLock(dentry); ++ ++ au_hdput(hd); ++ hd->hd_dentry = h_dentry; ++ if (h_dentry) { ++ br = au_sbr(dentry->d_sb, bindex); ++ hd->hd_id = br->br_id; ++ } ++} ++ ++int au_dbrange_test(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bstart, bend; ++ ++ err = 0; ++ bstart = au_dbstart(dentry); ++ bend = au_dbend(dentry); ++ if (bstart >= 0) ++ AuDebugOn(bend < 0 && bstart > bend); ++ else { ++ err = -EIO; ++ AuDebugOn(bend >= 0); ++ } ++ ++ return err; ++} ++ ++int au_digen_test(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ ++ err = 0; ++ if (unlikely(au_digen(dentry) != sigen ++ || au_iigen_test(dentry->d_inode, sigen))) ++ err = -EIO; ++ ++ return err; ++} ++ ++void au_update_digen(struct dentry *dentry) ++{ ++ atomic_set(&au_di(dentry)->di_generation, au_sigen(dentry->d_sb)); ++ /* smp_mb(); */ /* atomic_set */ ++} ++ ++void au_update_dbrange(struct dentry *dentry, int do_put_zero) ++{ ++ struct au_dinfo *dinfo; ++ struct dentry *h_d; ++ struct au_hdentry *hdp; ++ ++ DiMustWriteLock(dentry); ++ ++ dinfo = au_di(dentry); ++ if (!dinfo || dinfo->di_bstart < 0) ++ return; ++ ++ hdp = dinfo->di_hdentry; ++ if (do_put_zero) { ++ aufs_bindex_t bindex, bend; ++ ++ bend = dinfo->di_bend; ++ for (bindex = dinfo->di_bstart; bindex <= bend; bindex++) { ++ h_d = hdp[0 + bindex].hd_dentry; ++ if (h_d && !h_d->d_inode) ++ au_set_h_dptr(dentry, bindex, NULL); ++ } ++ } ++ ++ dinfo->di_bstart = -1; ++ while (++dinfo->di_bstart <= dinfo->di_bend) ++ if (hdp[0 + dinfo->di_bstart].hd_dentry) ++ break; ++ if (dinfo->di_bstart > dinfo->di_bend) { ++ dinfo->di_bstart = 
-1; ++ dinfo->di_bend = -1; ++ return; ++ } ++ ++ dinfo->di_bend++; ++ while (0 <= --dinfo->di_bend) ++ if (hdp[0 + dinfo->di_bend].hd_dentry) ++ break; ++ AuDebugOn(dinfo->di_bstart > dinfo->di_bend || dinfo->di_bend < 0); ++} ++ ++void au_update_dbstart(struct dentry *dentry) ++{ ++ aufs_bindex_t bindex, bend; ++ struct dentry *h_dentry; ++ ++ bend = au_dbend(dentry); ++ for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ if (h_dentry->d_inode) { ++ au_set_dbstart(dentry, bindex); ++ return; ++ } ++ au_set_h_dptr(dentry, bindex, NULL); ++ } ++} ++ ++void au_update_dbend(struct dentry *dentry) ++{ ++ aufs_bindex_t bindex, bstart; ++ struct dentry *h_dentry; ++ ++ bstart = au_dbstart(dentry); ++ for (bindex = au_dbend(dentry); bindex >= bstart; bindex--) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ if (h_dentry->d_inode) { ++ au_set_dbend(dentry, bindex); ++ return; ++ } ++ au_set_h_dptr(dentry, bindex, NULL); ++ } ++} ++ ++int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry) ++{ ++ aufs_bindex_t bindex, bend; ++ ++ bend = au_dbend(dentry); ++ for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) ++ if (au_h_dptr(dentry, bindex) == h_dentry) ++ return bindex; ++ return -1; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/dir.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,636 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * directory operations ++ */ ++ ++#include ++#include "aufs.h" ++ ++void au_add_nlink(struct inode *dir, struct inode *h_dir) ++{ ++ unsigned int nlink; ++ ++ AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); ++ ++ nlink = dir->i_nlink; ++ nlink += h_dir->i_nlink - 2; ++ if (h_dir->i_nlink < 2) ++ nlink += 2; ++ /* 0 can happen in revaliding */ ++ vfsub_set_nlink(dir, nlink); ++} ++ ++void au_sub_nlink(struct inode *dir, struct inode *h_dir) ++{ ++ unsigned int nlink; ++ ++ AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); ++ ++ nlink = dir->i_nlink; ++ nlink -= h_dir->i_nlink - 2; ++ if (h_dir->i_nlink < 2) ++ nlink -= 2; ++ /* no vfsub version. 
nlink == 0 means the branch-fs is broken */ ++ set_nlink(dir, nlink); ++} ++ ++loff_t au_dir_size(struct file *file, struct dentry *dentry) ++{ ++ loff_t sz; ++ aufs_bindex_t bindex, bend; ++ struct file *h_file; ++ struct dentry *h_dentry; ++ ++ sz = 0; ++ if (file) { ++ AuDebugOn(!file->f_dentry); ++ AuDebugOn(!file->f_dentry->d_inode); ++ AuDebugOn(!S_ISDIR(file->f_dentry->d_inode->i_mode)); ++ ++ bend = au_fbend_dir(file); ++ for (bindex = au_fbstart(file); ++ bindex <= bend && sz < KMALLOC_MAX_SIZE; ++ bindex++) { ++ h_file = au_hf_dir(file, bindex); ++ if (h_file ++ && h_file->f_dentry ++ && h_file->f_dentry->d_inode) ++ sz += i_size_read(h_file->f_dentry->d_inode); ++ } ++ } else { ++ AuDebugOn(!dentry); ++ AuDebugOn(!dentry->d_inode); ++ AuDebugOn(!S_ISDIR(dentry->d_inode->i_mode)); ++ ++ bend = au_dbtaildir(dentry); ++ for (bindex = au_dbstart(dentry); ++ bindex <= bend && sz < KMALLOC_MAX_SIZE; ++ bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && h_dentry->d_inode) ++ sz += i_size_read(h_dentry->d_inode); ++ } ++ } ++ if (sz < KMALLOC_MAX_SIZE) ++ sz = roundup_pow_of_two(sz); ++ if (sz > KMALLOC_MAX_SIZE) ++ sz = KMALLOC_MAX_SIZE; ++ else if (sz < NAME_MAX) { ++ BUILD_BUG_ON(AUFS_RDBLK_DEF < NAME_MAX); ++ sz = AUFS_RDBLK_DEF; ++ } ++ return sz; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int reopen_dir(struct file *file) ++{ ++ int err; ++ unsigned int flags; ++ aufs_bindex_t bindex, btail, bstart; ++ struct dentry *dentry, *h_dentry; ++ struct file *h_file; ++ ++ /* open all lower dirs */ ++ dentry = file->f_dentry; ++ bstart = au_dbstart(dentry); ++ for (bindex = au_fbstart(file); bindex < bstart; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ au_set_fbstart(file, bstart); ++ ++ btail = au_dbtaildir(dentry); ++ for (bindex = au_fbend_dir(file); btail < bindex; bindex--) ++ au_set_h_fptr(file, bindex, NULL); ++ au_set_fbend_dir(file, btail); ++ ++ flags = vfsub_file_flags(file); ++ for (bindex = bstart; bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ h_file = au_hf_dir(file, bindex); ++ if (h_file) ++ continue; ++ ++ h_file = au_h_open(dentry, bindex, flags, file); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; /* close all? */ ++ au_set_h_fptr(file, bindex, h_file); ++ } ++ au_update_figen(file); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ err = 0; ++ ++out: ++ return err; ++} ++ ++static int do_open_dir(struct file *file, int flags) ++{ ++ int err; ++ aufs_bindex_t bindex, btail; ++ struct dentry *dentry, *h_dentry; ++ struct file *h_file; ++ ++ FiMustWriteLock(file); ++ ++ dentry = file->f_dentry; ++ err = au_alive_dir(dentry); ++ if (unlikely(err)) ++ goto out; ++ ++ file->f_version = dentry->d_inode->i_version; ++ bindex = au_dbstart(dentry); ++ au_set_fbstart(file, bindex); ++ btail = au_dbtaildir(dentry); ++ au_set_fbend_dir(file, btail); ++ for (; !err && bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!h_dentry) ++ continue; ++ ++ h_file = au_h_open(dentry, bindex, flags, file); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ break; ++ } ++ au_set_h_fptr(file, bindex, h_file); ++ } ++ au_update_figen(file); ++ /* todo: necessary? 
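au_dir_size() above sums the i_size of every lower directory, rounds the total up to a power of two and clamps it between a small default read block (AUFS_RDBLK_DEF, itself at least NAME_MAX) and KMALLOC_MAX_SIZE, so the later vdir buffer fits in a single kmalloc. A minimal userspace sketch of just that rounding/clamping step follows; the two SKETCH_* limits are illustrative stand-ins for the kernel constants, not their real values.

/* Build: cc -o rdblk rdblk.c */
#include <limits.h>    /* NAME_MAX */
#include <stdio.h>

/* Illustrative stand-ins for KMALLOC_MAX_SIZE and AUFS_RDBLK_DEF. */
#define SKETCH_KMALLOC_MAX (4UL << 20)
#define SKETCH_RDBLK_DEF   512UL

/* Mirror the tail of au_dir_size(): round the summed size up to a power of
 * two, cap it at the kmalloc limit, and never go below the default block. */
static unsigned long rdblk_size(unsigned long sz)
{
    unsigned long r = 1;

    if (sz < SKETCH_KMALLOC_MAX)
        while (r < sz)
            r <<= 1;    /* roundup_pow_of_two() */
    else
        r = sz;
    if (r > SKETCH_KMALLOC_MAX)
        r = SKETCH_KMALLOC_MAX;
    else if (r < NAME_MAX)
        r = SKETCH_RDBLK_DEF;
    return r;
}

int main(void)
{
    unsigned long samples[] = { 0, 100, 4096, 70000, 10UL << 20 };
    unsigned int i;

    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        printf("%8lu -> %lu\n", samples[i], rdblk_size(samples[i]));
    return 0;
}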
*/ ++ /* file->f_ra = h_file->f_ra; */ ++ if (!err) ++ return 0; /* success */ ++ ++ /* close all */ ++ for (bindex = au_fbstart(file); bindex <= btail; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ au_set_fbstart(file, -1); ++ au_set_fbend_dir(file, -1); ++ ++out: ++ return err; ++} ++ ++static int aufs_open_dir(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ int err; ++ struct super_block *sb; ++ struct au_fidir *fidir; ++ ++ err = -ENOMEM; ++ sb = file->f_dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ fidir = au_fidir_alloc(sb); ++ if (fidir) { ++ err = au_do_open(file, do_open_dir, fidir); ++ if (unlikely(err)) ++ kfree(fidir); ++ } ++ si_read_unlock(sb); ++ return err; ++} ++ ++static int aufs_release_dir(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ struct au_vdir *vdir_cache; ++ struct au_finfo *finfo; ++ struct au_fidir *fidir; ++ aufs_bindex_t bindex, bend; ++ ++ finfo = au_fi(file); ++ fidir = finfo->fi_hdir; ++ if (fidir) { ++ /* remove me from sb->s_files */ ++ file_sb_list_del(file); ++ ++ vdir_cache = fidir->fd_vdir_cache; /* lock-free */ ++ if (vdir_cache) ++ au_vdir_free(vdir_cache); ++ ++ bindex = finfo->fi_btop; ++ if (bindex >= 0) { ++ /* ++ * calls fput() instead of filp_close(), ++ * since no dnotify or lock for the lower file. ++ */ ++ bend = fidir->fd_bbot; ++ for (; bindex <= bend; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ } ++ kfree(fidir); ++ finfo->fi_hdir = NULL; ++ } ++ au_finfo_fin(file); ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_flush_dir(struct file *file, fl_owner_t id) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ struct file *h_file; ++ ++ err = 0; ++ bend = au_fbend_dir(file); ++ for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { ++ h_file = au_hf_dir(file, bindex); ++ if (h_file) ++ err = vfsub_flush(h_file, id); ++ } ++ return err; ++} ++ ++static int aufs_flush_dir(struct file *file, fl_owner_t id) ++{ ++ return au_do_flush(file, id, au_do_flush_dir); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_fsync_dir_no_file(struct dentry *dentry, int datasync) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct inode *inode; ++ struct super_block *sb; ++ ++ err = 0; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ IMustLock(inode); ++ bend = au_dbend(dentry); ++ for (bindex = au_dbstart(dentry); !err && bindex <= bend; bindex++) { ++ struct path h_path; ++ ++ if (au_test_ro(sb, bindex, inode)) ++ continue; ++ h_path.dentry = au_h_dptr(dentry, bindex); ++ if (!h_path.dentry) ++ continue; ++ ++ h_path.mnt = au_sbr_mnt(sb, bindex); ++ err = vfsub_fsync(NULL, &h_path, datasync); ++ } ++ ++ return err; ++} ++ ++static int au_do_fsync_dir(struct file *file, int datasync) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct file *h_file; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ sb = file->f_dentry->d_sb; ++ inode = file->f_dentry->d_inode; ++ bend = au_fbend_dir(file); ++ for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { ++ h_file = au_hf_dir(file, bindex); ++ if (!h_file || au_test_ro(sb, bindex, inode)) ++ continue; ++ ++ err = vfsub_fsync(h_file, &h_file->f_path, datasync); ++ } ++ ++out: ++ return err; ++} ++ ++/* ++ * @file may be NULL ++ */ ++static int aufs_fsync_dir(struct file *file, loff_t start, 
loff_t end, ++ int datasync) ++{ ++ int err; ++ struct dentry *dentry; ++ struct super_block *sb; ++ struct mutex *mtx; ++ ++ err = 0; ++ dentry = file->f_dentry; ++ mtx = &dentry->d_inode->i_mutex; ++ mutex_lock(mtx); ++ sb = dentry->d_sb; ++ si_noflush_read_lock(sb); ++ if (file) ++ err = au_do_fsync_dir(file, datasync); ++ else { ++ di_write_lock_child(dentry); ++ err = au_do_fsync_dir_no_file(dentry, datasync); ++ } ++ au_cpup_attr_timesizes(dentry->d_inode); ++ di_write_unlock(dentry); ++ if (file) ++ fi_write_unlock(file); ++ ++ si_read_unlock(sb); ++ mutex_unlock(mtx); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_readdir(struct file *file, void *dirent, filldir_t filldir) ++{ ++ int err; ++ struct dentry *dentry; ++ struct inode *inode, *h_inode; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++ IMustLock(inode); ++ ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ err = au_alive_dir(dentry); ++ if (!err) ++ err = au_vdir_init(file); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ if (!au_test_nfsd()) { ++ err = au_vdir_fill_de(file, dirent, filldir); ++ fsstack_copy_attr_atime(inode, h_inode); ++ } else { ++ /* ++ * nfsd filldir may call lookup_one_len(), vfs_getattr(), ++ * encode_fh() and others. ++ */ ++ atomic_inc(&h_inode->i_count); ++ di_read_unlock(dentry, AuLock_IR); ++ si_read_unlock(sb); ++ err = au_vdir_fill_de(file, dirent, filldir); ++ fsstack_copy_attr_atime(inode, h_inode); ++ fi_write_unlock(file); ++ iput(h_inode); ++ ++ AuTraceErr(err); ++ return err; ++ } ++ ++out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AuTestEmpty_WHONLY 1 ++#define AuTestEmpty_CALLED (1 << 1) ++#define AuTestEmpty_SHWH (1 << 2) ++#define au_ftest_testempty(flags, name) ((flags) & AuTestEmpty_##name) ++#define au_fset_testempty(flags, name) \ ++ do { (flags) |= AuTestEmpty_##name; } while (0) ++#define au_fclr_testempty(flags, name) \ ++ do { (flags) &= ~AuTestEmpty_##name; } while (0) ++ ++#ifndef CONFIG_AUFS_SHWH ++#undef AuTestEmpty_SHWH ++#define AuTestEmpty_SHWH 0 ++#endif ++ ++struct test_empty_arg { ++ struct au_nhash *whlist; ++ unsigned int flags; ++ int err; ++ aufs_bindex_t bindex; ++}; ++ ++static int test_empty_cb(void *__arg, const char *__name, int namelen, ++ loff_t offset __maybe_unused, u64 ino, ++ unsigned int d_type) ++{ ++ struct test_empty_arg *arg = __arg; ++ char *name = (void *)__name; ++ ++ arg->err = 0; ++ au_fset_testempty(arg->flags, CALLED); ++ /* smp_mb(); */ ++ if (name[0] == '.' ++ && (namelen == 1 || (name[1] == '.' 
&& namelen == 2))) ++ goto out; /* success */ ++ ++ if (namelen <= AUFS_WH_PFX_LEN ++ || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { ++ if (au_ftest_testempty(arg->flags, WHONLY) ++ && !au_nhash_test_known_wh(arg->whlist, name, namelen)) ++ arg->err = -ENOTEMPTY; ++ goto out; ++ } ++ ++ name += AUFS_WH_PFX_LEN; ++ namelen -= AUFS_WH_PFX_LEN; ++ if (!au_nhash_test_known_wh(arg->whlist, name, namelen)) ++ arg->err = au_nhash_append_wh ++ (arg->whlist, name, namelen, ino, d_type, arg->bindex, ++ au_ftest_testempty(arg->flags, SHWH)); ++ ++out: ++ /* smp_mb(); */ ++ AuTraceErr(arg->err); ++ return arg->err; ++} ++ ++static int do_test_empty(struct dentry *dentry, struct test_empty_arg *arg) ++{ ++ int err; ++ struct file *h_file; ++ ++ h_file = au_h_open(dentry, arg->bindex, ++ O_RDONLY | O_NONBLOCK | O_DIRECTORY | O_LARGEFILE, ++ /*file*/NULL); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; ++ ++ err = 0; ++ if (!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE) ++ && !h_file->f_dentry->d_inode->i_nlink) ++ goto out_put; ++ ++ do { ++ arg->err = 0; ++ au_fclr_testempty(arg->flags, CALLED); ++ /* smp_mb(); */ ++ err = vfsub_readdir(h_file, test_empty_cb, arg); ++ if (err >= 0) ++ err = arg->err; ++ } while (!err && au_ftest_testempty(arg->flags, CALLED)); ++ ++out_put: ++ fput(h_file); ++ au_sbr_put(dentry->d_sb, arg->bindex); ++out: ++ return err; ++} ++ ++struct do_test_empty_args { ++ int *errp; ++ struct dentry *dentry; ++ struct test_empty_arg *arg; ++}; ++ ++static void call_do_test_empty(void *args) ++{ ++ struct do_test_empty_args *a = args; ++ *a->errp = do_test_empty(a->dentry, a->arg); ++} ++ ++static int sio_test_empty(struct dentry *dentry, struct test_empty_arg *arg) ++{ ++ int err, wkq_err; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ ++ h_dentry = au_h_dptr(dentry, arg->bindex); ++ h_inode = h_dentry->d_inode; ++ /* todo: i_mode changes anytime? 
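test_empty_cb() above decides, for each name the lower readdir hands back, whether it is "." or "..", an ordinary entry (which makes the directory non-empty in whiteout-only mode), or a whiteout whose prefix must be stripped before the hidden name is recorded in the nhash. The standalone sketch below reproduces only that classification; the ".wh." string is assumed here as the usual value behind AUFS_WH_PFX.

/* Build: cc -o whclass whclass.c */
#include <stdio.h>
#include <string.h>

/* Assumed whiteout prefix (AUFS_WH_PFX in the kernel source). */
#define WH_PFX     ".wh."
#define WH_PFX_LEN (sizeof(WH_PFX) - 1)

/* Classify one directory entry the way test_empty_cb() does:
 * 0 for "." and "..", 1 for an ordinary name, 2 for a whiteout
 * (reporting the hidden name it refers to). */
static int classify(const char *name, size_t len, const char **hidden)
{
    *hidden = NULL;
    if (name[0] == '.' && (len == 1 || (name[1] == '.' && len == 2)))
        return 0;
    if (len <= WH_PFX_LEN || memcmp(name, WH_PFX, WH_PFX_LEN))
        return 1;
    *hidden = name + WH_PFX_LEN;
    return 2;
}

int main(void)
{
    const char *samples[] = { ".", "..", "data.txt", ".wh.data.txt", ".wh." };
    const char *hidden;
    unsigned int i;

    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        int t = classify(samples[i], strlen(samples[i]), &hidden);
        printf("%-14s -> %d%s%s\n", samples[i], t,
               hidden ? ", hides " : "", hidden ? hidden : "");
    }
    return 0;
}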
*/ ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ err = au_test_h_perm_sio(h_inode, MAY_EXEC | MAY_READ); ++ mutex_unlock(&h_inode->i_mutex); ++ if (!err) ++ err = do_test_empty(dentry, arg); ++ else { ++ struct do_test_empty_args args = { ++ .errp = &err, ++ .dentry = dentry, ++ .arg = arg ++ }; ++ unsigned int flags = arg->flags; ++ ++ wkq_err = au_wkq_wait(call_do_test_empty, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ arg->flags = flags; ++ } ++ ++ return err; ++} ++ ++int au_test_empty_lower(struct dentry *dentry) ++{ ++ int err; ++ unsigned int rdhash; ++ aufs_bindex_t bindex, bstart, btail; ++ struct au_nhash whlist; ++ struct test_empty_arg arg; ++ ++ SiMustAnyLock(dentry->d_sb); ++ ++ rdhash = au_sbi(dentry->d_sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, dentry)); ++ err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ ++ arg.flags = 0; ++ arg.whlist = &whlist; ++ bstart = au_dbstart(dentry); ++ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) ++ au_fset_testempty(arg.flags, SHWH); ++ arg.bindex = bstart; ++ err = do_test_empty(dentry, &arg); ++ if (unlikely(err)) ++ goto out_whlist; ++ ++ au_fset_testempty(arg.flags, WHONLY); ++ btail = au_dbtaildir(dentry); ++ for (bindex = bstart + 1; !err && bindex <= btail; bindex++) { ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && h_dentry->d_inode) { ++ arg.bindex = bindex; ++ err = do_test_empty(dentry, &arg); ++ } ++ } ++ ++out_whlist: ++ au_nhash_wh_free(&whlist); ++out: ++ return err; ++} ++ ++int au_test_empty(struct dentry *dentry, struct au_nhash *whlist) ++{ ++ int err; ++ struct test_empty_arg arg; ++ aufs_bindex_t bindex, btail; ++ ++ err = 0; ++ arg.whlist = whlist; ++ arg.flags = AuTestEmpty_WHONLY; ++ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) ++ au_fset_testempty(arg.flags, SHWH); ++ btail = au_dbtaildir(dentry); ++ for (bindex = au_dbstart(dentry); !err && bindex <= btail; bindex++) { ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry && h_dentry->d_inode) { ++ arg.bindex = bindex; ++ err = sio_test_empty(dentry, &arg); ++ } ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++const struct file_operations aufs_dir_fop = { ++ .owner = THIS_MODULE, ++ .llseek = default_llseek, ++ .read = generic_read_dir, ++ .readdir = aufs_readdir, ++ .unlocked_ioctl = aufs_ioctl_dir, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = aufs_compat_ioctl_dir, ++#endif ++ .open = aufs_open_dir, ++ .release = aufs_release_dir, ++ .flush = aufs_flush_dir, ++ .fsync = aufs_fsync_dir ++}; +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/dir.h 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,137 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * directory operations ++ */ ++ ++#ifndef __AUFS_DIR_H__ ++#define __AUFS_DIR_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* need to be faster and smaller */ ++ ++struct au_nhash { ++ unsigned int nh_num; ++ struct hlist_head *nh_head; ++}; ++ ++struct au_vdir_destr { ++ unsigned char len; ++ unsigned char name[0]; ++} __packed; ++ ++struct au_vdir_dehstr { ++ struct hlist_node hash; ++ struct au_vdir_destr *str; ++} ____cacheline_aligned_in_smp; ++ ++struct au_vdir_de { ++ ino_t de_ino; ++ unsigned char de_type; ++ /* caution: packed */ ++ struct au_vdir_destr de_str; ++} __packed; ++ ++struct au_vdir_wh { ++ struct hlist_node wh_hash; ++#ifdef CONFIG_AUFS_SHWH ++ ino_t wh_ino; ++ aufs_bindex_t wh_bindex; ++ unsigned char wh_type; ++#else ++ aufs_bindex_t wh_bindex; ++#endif ++ /* caution: packed */ ++ struct au_vdir_destr wh_str; ++} __packed; ++ ++union au_vdir_deblk_p { ++ unsigned char *deblk; ++ struct au_vdir_de *de; ++}; ++ ++struct au_vdir { ++ unsigned char **vd_deblk; ++ unsigned long vd_nblk; ++ struct { ++ unsigned long ul; ++ union au_vdir_deblk_p p; ++ } vd_last; ++ ++ unsigned long vd_version; ++ unsigned int vd_deblk_sz; ++ unsigned long vd_jiffy; ++} ____cacheline_aligned_in_smp; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dir.c */ ++extern const struct file_operations aufs_dir_fop; ++void au_add_nlink(struct inode *dir, struct inode *h_dir); ++void au_sub_nlink(struct inode *dir, struct inode *h_dir); ++loff_t au_dir_size(struct file *file, struct dentry *dentry); ++int au_test_empty_lower(struct dentry *dentry); ++int au_test_empty(struct dentry *dentry, struct au_nhash *whlist); ++ ++/* vdir.c */ ++unsigned int au_rdhash_est(loff_t sz); ++int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp); ++void au_nhash_wh_free(struct au_nhash *whlist); ++int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, ++ int limit); ++int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen); ++int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, ++ unsigned int d_type, aufs_bindex_t bindex, ++ unsigned char shwh); ++void au_vdir_free(struct au_vdir *vdir); ++int au_vdir_init(struct file *file); ++int au_vdir_fill_de(struct file *file, void *dirent, filldir_t filldir); ++ ++/* ioctl.c */ ++long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg); ++ ++#ifdef CONFIG_AUFS_RDU ++/* rdu.c */ ++long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg); ++#ifdef CONFIG_COMPAT ++long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg); ++#endif ++#else ++static inline long au_rdu_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ return -EINVAL; ++} ++#ifdef CONFIG_COMPAT ++static inline long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ return -EINVAL; ++} ++#endif ++#endif ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DIR_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/dynop.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,377 @@ ++/* ++ * Copyright (C) 2010-2012 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * dynamically customizable operations for regular files ++ */ ++ ++#include "aufs.h" ++ ++#define DyPrSym(key) AuDbgSym(key->dk_op.dy_hop) ++ ++/* ++ * How large will these lists be? ++ * Usually just a few elements, 20-30 at most for each, I guess. ++ */ ++static struct au_splhead dynop[AuDyLast]; ++ ++static struct au_dykey *dy_gfind_get(struct au_splhead *spl, const void *h_op) ++{ ++ struct au_dykey *key, *tmp; ++ struct list_head *head; ++ ++ key = NULL; ++ head = &spl->head; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(tmp, head, dk_list) ++ if (tmp->dk_op.dy_hop == h_op) { ++ key = tmp; ++ kref_get(&key->dk_kref); ++ break; ++ } ++ rcu_read_unlock(); ++ ++ return key; ++} ++ ++static struct au_dykey *dy_bradd(struct au_branch *br, struct au_dykey *key) ++{ ++ struct au_dykey **k, *found; ++ const void *h_op = key->dk_op.dy_hop; ++ int i; ++ ++ found = NULL; ++ k = br->br_dykey; ++ for (i = 0; i < AuBrDynOp; i++) ++ if (k[i]) { ++ if (k[i]->dk_op.dy_hop == h_op) { ++ found = k[i]; ++ break; ++ } ++ } else ++ break; ++ if (!found) { ++ spin_lock(&br->br_dykey_lock); ++ for (; i < AuBrDynOp; i++) ++ if (k[i]) { ++ if (k[i]->dk_op.dy_hop == h_op) { ++ found = k[i]; ++ break; ++ } ++ } else { ++ k[i] = key; ++ break; ++ } ++ spin_unlock(&br->br_dykey_lock); ++ BUG_ON(i == AuBrDynOp); /* expand the array */ ++ } ++ ++ return found; ++} ++ ++/* kref_get() if @key is already added */ ++static struct au_dykey *dy_gadd(struct au_splhead *spl, struct au_dykey *key) ++{ ++ struct au_dykey *tmp, *found; ++ struct list_head *head; ++ const void *h_op = key->dk_op.dy_hop; ++ ++ found = NULL; ++ head = &spl->head; ++ spin_lock(&spl->spin); ++ list_for_each_entry(tmp, head, dk_list) ++ if (tmp->dk_op.dy_hop == h_op) { ++ kref_get(&tmp->dk_kref); ++ found = tmp; ++ break; ++ } ++ if (!found) ++ list_add_rcu(&key->dk_list, head); ++ spin_unlock(&spl->spin); ++ ++ if (!found) ++ DyPrSym(key); ++ return found; ++} ++ ++static void dy_free_rcu(struct rcu_head *rcu) ++{ ++ struct au_dykey *key; ++ ++ key = container_of(rcu, struct au_dykey, dk_rcu); ++ DyPrSym(key); ++ kfree(key); ++} ++ ++static void dy_free(struct kref *kref) ++{ ++ struct au_dykey *key; ++ struct au_splhead *spl; ++ ++ key = container_of(kref, struct au_dykey, dk_kref); ++ spl = dynop + key->dk_op.dy_type; ++ au_spl_del_rcu(&key->dk_list, spl); ++ call_rcu(&key->dk_rcu, dy_free_rcu); ++} ++ ++void au_dy_put(struct au_dykey *key) ++{ ++ kref_put(&key->dk_kref, dy_free); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define DyDbgSize(cnt, op) AuDebugOn(cnt != sizeof(op)/sizeof(void *)) ++ ++#ifdef CONFIG_AUFS_DEBUG ++#define DyDbgDeclare(cnt) unsigned int cnt = 0 ++#define DyDbgInc(cnt) do { cnt++; } while (0) ++#else ++#define 
DyDbgDeclare(cnt) do {} while (0) ++#define DyDbgInc(cnt) do {} while (0) ++#endif ++ ++#define DySet(func, dst, src, h_op, h_sb) do { \ ++ DyDbgInc(cnt); \ ++ if (h_op->func) { \ ++ if (src.func) \ ++ dst.func = src.func; \ ++ else \ ++ AuDbg("%s %s\n", au_sbtype(h_sb), #func); \ ++ } \ ++} while (0) ++ ++#define DySetForce(func, dst, src) do { \ ++ AuDebugOn(!src.func); \ ++ DyDbgInc(cnt); \ ++ dst.func = src.func; \ ++} while (0) ++ ++#define DySetAop(func) \ ++ DySet(func, dyaop->da_op, aufs_aop, h_aop, h_sb) ++#define DySetAopForce(func) \ ++ DySetForce(func, dyaop->da_op, aufs_aop) ++ ++static void dy_aop(struct au_dykey *key, const void *h_op, ++ struct super_block *h_sb __maybe_unused) ++{ ++ struct au_dyaop *dyaop = (void *)key; ++ const struct address_space_operations *h_aop = h_op; ++ DyDbgDeclare(cnt); ++ ++ AuDbg("%s\n", au_sbtype(h_sb)); ++ ++ DySetAop(writepage); ++ DySetAopForce(readpage); /* force */ ++ DySetAop(writepages); ++ DySetAop(set_page_dirty); ++ DySetAop(readpages); ++ DySetAop(write_begin); ++ DySetAop(write_end); ++ DySetAop(bmap); ++ DySetAop(invalidatepage); ++ DySetAop(releasepage); ++ DySetAop(freepage); ++ /* these two will be changed according to an aufs mount option */ ++ DySetAop(direct_IO); ++ DySetAop(get_xip_mem); ++ DySetAop(migratepage); ++ DySetAop(launder_page); ++ DySetAop(is_partially_uptodate); ++ DySetAop(error_remove_page); ++ ++ DyDbgSize(cnt, *h_aop); ++ dyaop->da_get_xip_mem = h_aop->get_xip_mem; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void dy_bug(struct kref *kref) ++{ ++ BUG(); ++} ++ ++static struct au_dykey *dy_get(struct au_dynop *op, struct au_branch *br) ++{ ++ struct au_dykey *key, *old; ++ struct au_splhead *spl; ++ struct op { ++ unsigned int sz; ++ void (*set)(struct au_dykey *key, const void *h_op, ++ struct super_block *h_sb __maybe_unused); ++ }; ++ static const struct op a[] = { ++ [AuDy_AOP] = { ++ .sz = sizeof(struct au_dyaop), ++ .set = dy_aop ++ } ++ }; ++ const struct op *p; ++ ++ spl = dynop + op->dy_type; ++ key = dy_gfind_get(spl, op->dy_hop); ++ if (key) ++ goto out_add; /* success */ ++ ++ p = a + op->dy_type; ++ key = kzalloc(p->sz, GFP_NOFS); ++ if (unlikely(!key)) { ++ key = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ key->dk_op.dy_hop = op->dy_hop; ++ kref_init(&key->dk_kref); ++ p->set(key, op->dy_hop, br->br_mnt->mnt_sb); ++ old = dy_gadd(spl, key); ++ if (old) { ++ kfree(key); ++ key = old; ++ } ++ ++out_add: ++ old = dy_bradd(br, key); ++ if (old) ++ /* its ref-count should never be zero here */ ++ kref_put(&key->dk_kref, dy_bug); ++out: ++ return key; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * Aufs prohibits O_DIRECT by defaut even if the branch supports it. ++ * This behaviour is neccessary to return an error from open(O_DIRECT) instead ++ * of the succeeding I/O. The dio mount option enables O_DIRECT and makes ++ * open(O_DIRECT) always succeed, but the succeeding I/O may return an error. ++ * See the aufs manual in detail. ++ * ++ * To keep this behaviour, aufs has to set NULL to ->get_xip_mem too, and the ++ * performance of fadvise() and madvise() may be affected. 
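dy_get() above resolves the race between two first users of the same lower address_space_operations in a lock-light way: look the key up lock-free under RCU, allocate a candidate on a miss, then re-check under the spinlock in dy_gadd() and, if somebody else won, free the candidate and take a reference on the winner. The userspace sketch below shows the same insert-or-reuse shape with a plain mutex and an integer refcount standing in for the RCU list and kref; names such as key_get() are invented for the illustration.

/* Build: cc -pthread -o dynkey dynkey.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct key {
    const void *hop;    /* host operation table this key wraps */
    int refcnt;
    struct key *next;
};

static struct key *keys;
static pthread_mutex_t keys_lock = PTHREAD_MUTEX_INITIALIZER;

/* Allocate a candidate first, then insert it under the lock or reuse and
 * reference an already-registered key for the same host op. */
static struct key *key_get(const void *hop)
{
    struct key *k, *cand;

    cand = calloc(1, sizeof(*cand));
    if (!cand)
        return NULL;
    cand->hop = hop;
    cand->refcnt = 1;

    pthread_mutex_lock(&keys_lock);
    for (k = keys; k; k = k->next)
        if (k->hop == hop)
            break;
    if (k)
        k->refcnt++;        /* reuse the winner */
    else {
        cand->next = keys;  /* first registration wins */
        keys = cand;
        k = cand;
    }
    pthread_mutex_unlock(&keys_lock);

    if (k != cand)
        free(cand);         /* lost the race, drop the candidate */
    return k;
}

int main(void)
{
    static int host_op;     /* stands in for a lower fs a_ops pointer */
    struct key *a = key_get(&host_op), *b = key_get(&host_op);

    if (!a || !b)
        return 1;
    printf("same key: %s, refcnt %d\n", a == b ? "yes" : "no", a->refcnt);
    return 0;
}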
++ */ ++static void dy_adx(struct au_dyaop *dyaop, int do_dx) ++{ ++ if (!do_dx) { ++ dyaop->da_op.direct_IO = NULL; ++ dyaop->da_op.get_xip_mem = NULL; ++ } else { ++ dyaop->da_op.direct_IO = aufs_aop.direct_IO; ++ dyaop->da_op.get_xip_mem = aufs_aop.get_xip_mem; ++ if (!dyaop->da_get_xip_mem) ++ dyaop->da_op.get_xip_mem = NULL; ++ } ++} ++ ++static struct au_dyaop *dy_aget(struct au_branch *br, ++ const struct address_space_operations *h_aop, ++ int do_dx) ++{ ++ struct au_dyaop *dyaop; ++ struct au_dynop op; ++ ++ op.dy_type = AuDy_AOP; ++ op.dy_haop = h_aop; ++ dyaop = (void *)dy_get(&op, br); ++ if (IS_ERR(dyaop)) ++ goto out; ++ dy_adx(dyaop, do_dx); ++ ++out: ++ return dyaop; ++} ++ ++int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex, ++ struct inode *h_inode) ++{ ++ int err, do_dx; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct au_dyaop *dyaop; ++ ++ AuDebugOn(!S_ISREG(h_inode->i_mode)); ++ IiMustWriteLock(inode); ++ ++ sb = inode->i_sb; ++ br = au_sbr(sb, bindex); ++ do_dx = !!au_opt_test(au_mntflags(sb), DIO); ++ dyaop = dy_aget(br, h_inode->i_mapping->a_ops, do_dx); ++ err = PTR_ERR(dyaop); ++ if (IS_ERR(dyaop)) ++ /* unnecessary to call dy_fput() */ ++ goto out; ++ ++ err = 0; ++ inode->i_mapping->a_ops = &dyaop->da_op; ++ ++out: ++ return err; ++} ++ ++/* ++ * Is it safe to replace a_ops during the inode/file is in operation? ++ * Yes, I hope so. ++ */ ++int au_dy_irefresh(struct inode *inode) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ struct inode *h_inode; ++ ++ err = 0; ++ if (S_ISREG(inode->i_mode)) { ++ bstart = au_ibstart(inode); ++ h_inode = au_h_iptr(inode, bstart); ++ err = au_dy_iaop(inode, bstart, h_inode); ++ } ++ return err; ++} ++ ++void au_dy_arefresh(int do_dx) ++{ ++ struct au_splhead *spl; ++ struct list_head *head; ++ struct au_dykey *key; ++ ++ spl = dynop + AuDy_AOP; ++ head = &spl->head; ++ spin_lock(&spl->spin); ++ list_for_each_entry(key, head, dk_list) ++ dy_adx((void *)key, do_dx); ++ spin_unlock(&spl->spin); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void __init au_dy_init(void) ++{ ++ int i; ++ ++ /* make sure that 'struct au_dykey *' can be any type */ ++ BUILD_BUG_ON(offsetof(struct au_dyaop, da_key)); ++ ++ for (i = 0; i < AuDyLast; i++) ++ au_spl_init(dynop + i); ++} ++ ++void au_dy_fin(void) ++{ ++ int i; ++ ++ for (i = 0; i < AuDyLast; i++) ++ WARN_ON(!list_empty(&dynop[i].head)); ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/dynop.h 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,76 @@ ++/* ++ * Copyright (C) 2010-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * dynamically customizable operations (for regular files only) ++ */ ++ ++#ifndef __AUFS_DYNOP_H__ ++#define __AUFS_DYNOP_H__ ++ ++#ifdef __KERNEL__ ++ ++#include "inode.h" ++ ++enum {AuDy_AOP, AuDyLast}; ++ ++struct au_dynop { ++ int dy_type; ++ union { ++ const void *dy_hop; ++ const struct address_space_operations *dy_haop; ++ }; ++}; ++ ++struct au_dykey { ++ union { ++ struct list_head dk_list; ++ struct rcu_head dk_rcu; ++ }; ++ struct au_dynop dk_op; ++ ++ /* ++ * during I am in the branch local array, kref is gotten. when the ++ * branch is removed, kref is put. ++ */ ++ struct kref dk_kref; ++}; ++ ++/* stop unioning since their sizes are very different from each other */ ++struct au_dyaop { ++ struct au_dykey da_key; ++ struct address_space_operations da_op; /* not const */ ++ int (*da_get_xip_mem)(struct address_space *, pgoff_t, int, ++ void **, unsigned long *); ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dynop.c */ ++struct au_branch; ++void au_dy_put(struct au_dykey *key); ++int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex, ++ struct inode *h_inode); ++int au_dy_irefresh(struct inode *inode); ++void au_dy_arefresh(int do_dio); ++ ++void __init au_dy_init(void); ++void au_dy_fin(void); ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_DYNOP_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/export.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,803 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * export via nfs ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "../fs/mount.h" ++#include "aufs.h" ++ ++union conv { ++#ifdef CONFIG_AUFS_INO_T_64 ++ __u32 a[2]; ++#else ++ __u32 a[1]; ++#endif ++ ino_t ino; ++}; ++ ++static ino_t decode_ino(__u32 *a) ++{ ++ union conv u; ++ ++ BUILD_BUG_ON(sizeof(u.ino) != sizeof(u.a)); ++ u.a[0] = a[0]; ++#ifdef CONFIG_AUFS_INO_T_64 ++ u.a[1] = a[1]; ++#endif ++ return u.ino; ++} ++ ++static void encode_ino(__u32 *a, ino_t ino) ++{ ++ union conv u; ++ ++ u.ino = ino; ++ a[0] = u.a[0]; ++#ifdef CONFIG_AUFS_INO_T_64 ++ a[1] = u.a[1]; ++#endif ++} ++ ++/* NFS file handle */ ++enum { ++ Fh_br_id, ++ Fh_sigen, ++#ifdef CONFIG_AUFS_INO_T_64 ++ /* support 64bit inode number */ ++ Fh_ino1, ++ Fh_ino2, ++ Fh_dir_ino1, ++ Fh_dir_ino2, ++#else ++ Fh_ino1, ++ Fh_dir_ino1, ++#endif ++ Fh_igen, ++ Fh_h_type, ++ Fh_tail, ++ ++ Fh_ino = Fh_ino1, ++ Fh_dir_ino = Fh_dir_ino1 ++}; ++ ++static int au_test_anon(struct dentry *dentry) ++{ ++ /* note: read d_flags without d_lock */ ++ return !!(dentry->d_flags & DCACHE_DISCONNECTED); ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* inode generation external table */ ++ ++void au_xigen_inc(struct inode *inode) ++{ ++ loff_t pos; ++ ssize_t sz; ++ __u32 igen; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++ sb = inode->i_sb; ++ AuDebugOn(!au_opt_test(au_mntflags(sb), XINO)); ++ ++ sbinfo = au_sbi(sb); ++ pos = inode->i_ino; ++ pos *= sizeof(igen); ++ igen = inode->i_generation + 1; ++ sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xigen, &igen, ++ sizeof(igen), &pos); ++ if (sz == sizeof(igen)) ++ return; /* success */ ++ ++ if (unlikely(sz >= 0)) ++ AuIOErr("xigen error (%zd)\n", sz); ++} ++ ++int au_xigen_new(struct inode *inode) ++{ ++ int err; ++ loff_t pos; ++ ssize_t sz; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ err = 0; ++ /* todo: dirty, at mount time */ ++ if (inode->i_ino == AUFS_ROOT_INO) ++ goto out; ++ sb = inode->i_sb; ++ SiMustAnyLock(sb); ++ if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) ++ goto out; ++ ++ err = -EFBIG; ++ pos = inode->i_ino; ++ if (unlikely(au_loff_max / sizeof(inode->i_generation) - 1 < pos)) { ++ AuIOErr1("too large i%lld\n", pos); ++ goto out; ++ } ++ pos *= sizeof(inode->i_generation); ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ file = sbinfo->si_xigen; ++ BUG_ON(!file); ++ ++ if (i_size_read(file->f_dentry->d_inode) ++ < pos + sizeof(inode->i_generation)) { ++ inode->i_generation = atomic_inc_return(&sbinfo->si_xigen_next); ++ sz = xino_fwrite(sbinfo->si_xwrite, file, &inode->i_generation, ++ sizeof(inode->i_generation), &pos); ++ } else ++ sz = xino_fread(sbinfo->si_xread, file, &inode->i_generation, ++ sizeof(inode->i_generation), &pos); ++ if (sz == sizeof(inode->i_generation)) ++ goto out; /* success */ ++ ++ err = sz; ++ if (unlikely(sz >= 0)) { ++ err = -EIO; ++ AuIOErr("xigen error (%zd)\n", sz); ++ } ++ ++out: ++ return err; ++} ++ ++int au_xigen_set(struct super_block *sb, struct file *base) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ file = au_xino_create2(base, sbinfo->si_xigen); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto 
out; ++ err = 0; ++ if (sbinfo->si_xigen) ++ fput(sbinfo->si_xigen); ++ sbinfo->si_xigen = file; ++ ++out: ++ return err; ++} ++ ++void au_xigen_clr(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ if (sbinfo->si_xigen) { ++ fput(sbinfo->si_xigen); ++ sbinfo->si_xigen = NULL; ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *decode_by_ino(struct super_block *sb, ino_t ino, ++ ino_t dir_ino) ++{ ++ struct dentry *dentry, *d; ++ struct inode *inode; ++ unsigned int sigen; ++ ++ dentry = NULL; ++ inode = ilookup(sb, ino); ++ if (!inode) ++ goto out; ++ ++ dentry = ERR_PTR(-ESTALE); ++ sigen = au_sigen(sb); ++ if (unlikely(is_bad_inode(inode) ++ || IS_DEADDIR(inode) ++ || sigen != au_iigen(inode))) ++ goto out_iput; ++ ++ dentry = NULL; ++ if (!dir_ino || S_ISDIR(inode->i_mode)) ++ dentry = d_find_alias(inode); ++ else { ++ spin_lock(&inode->i_lock); ++ list_for_each_entry(d, &inode->i_dentry, d_alias) { ++ spin_lock(&d->d_lock); ++ if (!au_test_anon(d) ++ && d->d_parent->d_inode->i_ino == dir_ino) { ++ dentry = dget_dlock(d); ++ spin_unlock(&d->d_lock); ++ break; ++ } ++ spin_unlock(&d->d_lock); ++ } ++ spin_unlock(&inode->i_lock); ++ } ++ if (unlikely(dentry && au_digen_test(dentry, sigen))) { ++ /* need to refresh */ ++ dput(dentry); ++ dentry = NULL; ++ } ++ ++out_iput: ++ iput(inode); ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: dirty? */ ++/* if exportfs_decode_fh() passed vfsmount*, we could be happy */ ++ ++struct au_compare_mnt_args { ++ /* input */ ++ struct super_block *sb; ++ ++ /* output */ ++ struct vfsmount *mnt; ++}; ++ ++static int au_compare_mnt(struct vfsmount *mnt, void *arg) ++{ ++ struct au_compare_mnt_args *a = arg; ++ ++ if (mnt->mnt_sb != a->sb) ++ return 0; ++ a->mnt = mntget(mnt); ++ return 1; ++} ++ ++static struct vfsmount *au_mnt_get(struct super_block *sb) ++{ ++ int err; ++ struct path root; ++ struct au_compare_mnt_args args = { ++ .sb = sb ++ }; ++ ++ get_fs_root(current->fs, &root); ++ br_read_lock(vfsmount_lock); ++ err = iterate_mounts(au_compare_mnt, &args, root.mnt); ++ br_read_unlock(vfsmount_lock); ++ path_put(&root); ++ AuDebugOn(!err); ++ AuDebugOn(!args.mnt); ++ return args.mnt; ++} ++ ++struct au_nfsd_si_lock { ++ unsigned int sigen; ++ aufs_bindex_t bindex, br_id; ++ unsigned char force_lock; ++}; ++ ++static int si_nfsd_read_lock(struct super_block *sb, ++ struct au_nfsd_si_lock *nsi_lock) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ ++ si_read_lock(sb, AuLock_FLUSH); ++ ++ /* branch id may be wrapped around */ ++ err = 0; ++ bindex = au_br_index(sb, nsi_lock->br_id); ++ if (bindex >= 0 && nsi_lock->sigen + AUFS_BRANCH_MAX > au_sigen(sb)) ++ goto out; /* success */ ++ ++ err = -ESTALE; ++ bindex = -1; ++ if (!nsi_lock->force_lock) ++ si_read_unlock(sb); ++ ++out: ++ nsi_lock->bindex = bindex; ++ return err; ++} ++ ++struct find_name_by_ino { ++ int called, found; ++ ino_t ino; ++ char *name; ++ int namelen; ++}; ++ ++static int ++find_name_by_ino(void *arg, const char *name, int namelen, loff_t offset, ++ u64 ino, unsigned int d_type) ++{ ++ struct find_name_by_ino *a = arg; ++ ++ a->called++; ++ if (a->ino != ino) ++ return 0; ++ ++ memcpy(a->name, name, namelen); ++ a->namelen = namelen; ++ a->found = 1; ++ return 1; ++} ++ ++static struct dentry *au_lkup_by_ino(struct path *path, ino_t ino, ++ struct 
au_nfsd_si_lock *nsi_lock) ++{ ++ struct dentry *dentry, *parent; ++ struct file *file; ++ struct inode *dir; ++ struct find_name_by_ino arg; ++ int err; ++ ++ parent = path->dentry; ++ if (nsi_lock) ++ si_read_unlock(parent->d_sb); ++ file = vfsub_dentry_open(path, au_dir_roflags); ++ dentry = (void *)file; ++ if (IS_ERR(file)) ++ goto out; ++ ++ dentry = ERR_PTR(-ENOMEM); ++ arg.name = __getname_gfp(GFP_NOFS); ++ if (unlikely(!arg.name)) ++ goto out_file; ++ arg.ino = ino; ++ arg.found = 0; ++ do { ++ arg.called = 0; ++ /* smp_mb(); */ ++ err = vfsub_readdir(file, find_name_by_ino, &arg); ++ } while (!err && !arg.found && arg.called); ++ dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_name; ++ dentry = ERR_PTR(-ENOENT); ++ if (!arg.found) ++ goto out_name; ++ ++ /* do not call au_lkup_one() */ ++ dir = parent->d_inode; ++ mutex_lock(&dir->i_mutex); ++ dentry = vfsub_lookup_one_len(arg.name, parent, arg.namelen); ++ mutex_unlock(&dir->i_mutex); ++ AuTraceErrPtr(dentry); ++ if (IS_ERR(dentry)) ++ goto out_name; ++ AuDebugOn(au_test_anon(dentry)); ++ if (unlikely(!dentry->d_inode)) { ++ dput(dentry); ++ dentry = ERR_PTR(-ENOENT); ++ } ++ ++out_name: ++ __putname(arg.name); ++out_file: ++ fput(file); ++out: ++ if (unlikely(nsi_lock ++ && si_nfsd_read_lock(parent->d_sb, nsi_lock) < 0)) ++ if (!IS_ERR(dentry)) { ++ dput(dentry); ++ dentry = ERR_PTR(-ESTALE); ++ } ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++static struct dentry *decode_by_dir_ino(struct super_block *sb, ino_t ino, ++ ino_t dir_ino, ++ struct au_nfsd_si_lock *nsi_lock) ++{ ++ struct dentry *dentry; ++ struct path path; ++ ++ if (dir_ino != AUFS_ROOT_INO) { ++ path.dentry = decode_by_ino(sb, dir_ino, 0); ++ dentry = path.dentry; ++ if (!path.dentry || IS_ERR(path.dentry)) ++ goto out; ++ AuDebugOn(au_test_anon(path.dentry)); ++ } else ++ path.dentry = dget(sb->s_root); ++ ++ path.mnt = au_mnt_get(sb); ++ dentry = au_lkup_by_ino(&path, ino, nsi_lock); ++ path_put(&path); ++ ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int h_acceptable(void *expv, struct dentry *dentry) ++{ ++ return 1; ++} ++ ++static char *au_build_path(struct dentry *h_parent, struct path *h_rootpath, ++ char *buf, int len, struct super_block *sb) ++{ ++ char *p; ++ int n; ++ struct path path; ++ ++ p = d_path(h_rootpath, buf, len); ++ if (IS_ERR(p)) ++ goto out; ++ n = strlen(p); ++ ++ path.mnt = h_rootpath->mnt; ++ path.dentry = h_parent; ++ p = d_path(&path, buf, len); ++ if (IS_ERR(p)) ++ goto out; ++ if (n != 1) ++ p += n; ++ ++ path.mnt = au_mnt_get(sb); ++ path.dentry = sb->s_root; ++ p = d_path(&path, buf, len - strlen(p)); ++ mntput(path.mnt); ++ if (IS_ERR(p)) ++ goto out; ++ if (n != 1) ++ p[strlen(p)] = '/'; ++ ++out: ++ AuTraceErrPtr(p); ++ return p; ++} ++ ++static ++struct dentry *decode_by_path(struct super_block *sb, ino_t ino, __u32 *fh, ++ int fh_len, struct au_nfsd_si_lock *nsi_lock) ++{ ++ struct dentry *dentry, *h_parent, *root; ++ struct super_block *h_sb; ++ char *pathname, *p; ++ struct vfsmount *h_mnt; ++ struct au_branch *br; ++ int err; ++ struct path path; ++ ++ br = au_sbr(sb, nsi_lock->bindex); ++ h_mnt = br->br_mnt; ++ h_sb = h_mnt->mnt_sb; ++ /* todo: call lower fh_to_dentry()? fh_to_parent()? 
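When only an inode number survives in the file handle, au_lkup_by_ino() above recovers the name by opening the parent directory and scanning it with the find_name_by_ino() callback, then doing an ordinary lookup on the recovered name. The short userspace program below performs the equivalent scan with opendir()/readdir(); it illustrates the idea only, not the in-kernel interfaces involved.

/* Build: cc -o byino byino.c ; run: ./byino <dir> <ino> */
#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    DIR *dir;
    struct dirent *de;
    unsigned long ino;

    if (argc != 3) {
        fprintf(stderr, "usage: %s <dir> <ino>\n", argv[0]);
        return 1;
    }
    ino = strtoul(argv[2], NULL, 0);
    dir = opendir(argv[1]);
    if (!dir) {
        perror("opendir");
        return 1;
    }
    /* Walk the directory stream until an entry's inode number matches,
     * just as find_name_by_ino() does in the kernel readdir callback. */
    while ((de = readdir(dir)) != NULL)
        if ((unsigned long)de->d_ino == ino) {
            printf("ino %lu is named \"%s\"\n", ino, de->d_name);
            break;
        }
    if (!de)
        printf("ino %lu not found in %s\n", ino, argv[1]);
    closedir(dir);
    return 0;
}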
*/ ++ h_parent = exportfs_decode_fh(h_mnt, (void *)(fh + Fh_tail), ++ fh_len - Fh_tail, fh[Fh_h_type], ++ h_acceptable, /*context*/NULL); ++ dentry = h_parent; ++ if (unlikely(!h_parent || IS_ERR(h_parent))) { ++ AuWarn1("%s decode_fh failed, %ld\n", ++ au_sbtype(h_sb), PTR_ERR(h_parent)); ++ goto out; ++ } ++ dentry = NULL; ++ if (unlikely(au_test_anon(h_parent))) { ++ AuWarn1("%s decode_fh returned a disconnected dentry\n", ++ au_sbtype(h_sb)); ++ goto out_h_parent; ++ } ++ ++ dentry = ERR_PTR(-ENOMEM); ++ pathname = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!pathname)) ++ goto out_h_parent; ++ ++ root = sb->s_root; ++ path.mnt = h_mnt; ++ di_read_lock_parent(root, !AuLock_IR); ++ path.dentry = au_h_dptr(root, nsi_lock->bindex); ++ di_read_unlock(root, !AuLock_IR); ++ p = au_build_path(h_parent, &path, pathname, PAGE_SIZE, sb); ++ dentry = (void *)p; ++ if (IS_ERR(p)) ++ goto out_pathname; ++ ++ si_read_unlock(sb); ++ err = vfsub_kern_path(p, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); ++ dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_relock; ++ ++ dentry = ERR_PTR(-ENOENT); ++ AuDebugOn(au_test_anon(path.dentry)); ++ if (unlikely(!path.dentry->d_inode)) ++ goto out_path; ++ ++ if (ino != path.dentry->d_inode->i_ino) ++ dentry = au_lkup_by_ino(&path, ino, /*nsi_lock*/NULL); ++ else ++ dentry = dget(path.dentry); ++ ++out_path: ++ path_put(&path); ++out_relock: ++ if (unlikely(si_nfsd_read_lock(sb, nsi_lock) < 0)) ++ if (!IS_ERR(dentry)) { ++ dput(dentry); ++ dentry = ERR_PTR(-ESTALE); ++ } ++out_pathname: ++ free_page((unsigned long)pathname); ++out_h_parent: ++ dput(h_parent); ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry * ++aufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, ++ int fh_type) ++{ ++ struct dentry *dentry; ++ __u32 *fh = fid->raw; ++ struct au_branch *br; ++ ino_t ino, dir_ino; ++ struct au_nfsd_si_lock nsi_lock = { ++ .force_lock = 0 ++ }; ++ ++ dentry = ERR_PTR(-ESTALE); ++ /* it should never happen, but the file handle is unreliable */ ++ if (unlikely(fh_len < Fh_tail)) ++ goto out; ++ nsi_lock.sigen = fh[Fh_sigen]; ++ nsi_lock.br_id = fh[Fh_br_id]; ++ ++ /* branch id may be wrapped around */ ++ br = NULL; ++ if (unlikely(si_nfsd_read_lock(sb, &nsi_lock))) ++ goto out; ++ nsi_lock.force_lock = 1; ++ ++ /* is this inode still cached? */ ++ ino = decode_ino(fh + Fh_ino); ++ /* it should never happen */ ++ if (unlikely(ino == AUFS_ROOT_INO)) ++ goto out; ++ ++ dir_ino = decode_ino(fh + Fh_dir_ino); ++ dentry = decode_by_ino(sb, ino, dir_ino); ++ if (IS_ERR(dentry)) ++ goto out_unlock; ++ if (dentry) ++ goto accept; ++ ++ /* is the parent dir cached? 
*/ ++ br = au_sbr(sb, nsi_lock.bindex); ++ atomic_inc(&br->br_count); ++ dentry = decode_by_dir_ino(sb, ino, dir_ino, &nsi_lock); ++ if (IS_ERR(dentry)) ++ goto out_unlock; ++ if (dentry) ++ goto accept; ++ ++ /* lookup path */ ++ dentry = decode_by_path(sb, ino, fh, fh_len, &nsi_lock); ++ if (IS_ERR(dentry)) ++ goto out_unlock; ++ if (unlikely(!dentry)) ++ /* todo?: make it ESTALE */ ++ goto out_unlock; ++ ++accept: ++ if (!au_digen_test(dentry, au_sigen(sb)) ++ && dentry->d_inode->i_generation == fh[Fh_igen]) ++ goto out_unlock; /* success */ ++ ++ dput(dentry); ++ dentry = ERR_PTR(-ESTALE); ++out_unlock: ++ if (br) ++ atomic_dec(&br->br_count); ++ si_read_unlock(sb); ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++#if 0 /* reserved for future use */ ++/* support subtreecheck option */ ++static struct dentry *aufs_fh_to_parent(struct super_block *sb, struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ struct dentry *parent; ++ __u32 *fh = fid->raw; ++ ino_t dir_ino; ++ ++ dir_ino = decode_ino(fh + Fh_dir_ino); ++ parent = decode_by_ino(sb, dir_ino, 0); ++ if (IS_ERR(parent)) ++ goto out; ++ if (!parent) ++ parent = decode_by_path(sb, au_br_index(sb, fh[Fh_br_id]), ++ dir_ino, fh, fh_len); ++ ++out: ++ AuTraceErrPtr(parent); ++ return parent; ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, ++ int connectable) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ struct super_block *sb, *h_sb; ++ struct inode *inode; ++ struct dentry *parent, *h_parent; ++ struct au_branch *br; ++ ++ AuDebugOn(au_test_anon(dentry)); ++ ++ parent = NULL; ++ err = -ENOSPC; ++ if (unlikely(*max_len <= Fh_tail)) { ++ AuWarn1("NFSv2 client (max_len %d)?\n", *max_len); ++ goto out; ++ } ++ ++ err = FILEID_ROOT; ++ if (IS_ROOT(dentry)) { ++ AuDebugOn(dentry->d_inode->i_ino != AUFS_ROOT_INO); ++ goto out; ++ } ++ ++ h_parent = NULL; ++ err = aufs_read_lock(dentry, AuLock_FLUSH | AuLock_IR | AuLock_GEN); ++ if (unlikely(err)) ++ goto out; ++ ++ inode = dentry->d_inode; ++ AuDebugOn(!inode); ++ sb = dentry->d_sb; ++#ifdef CONFIG_AUFS_DEBUG ++ if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) ++ AuWarn1("NFS-exporting requires xino\n"); ++#endif ++ err = -EIO; ++ parent = dget_parent(dentry); ++ di_read_lock_parent(parent, !AuLock_IR); ++ bend = au_dbtaildir(parent); ++ for (bindex = au_dbstart(parent); bindex <= bend; bindex++) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (h_parent) { ++ dget(h_parent); ++ break; ++ } ++ } ++ if (unlikely(!h_parent)) ++ goto out_unlock; ++ ++ err = -EPERM; ++ br = au_sbr(sb, bindex); ++ h_sb = br->br_mnt->mnt_sb; ++ if (unlikely(!h_sb->s_export_op)) { ++ AuErr1("%s branch is not exportable\n", au_sbtype(h_sb)); ++ goto out_dput; ++ } ++ ++ fh[Fh_br_id] = br->br_id; ++ fh[Fh_sigen] = au_sigen(sb); ++ encode_ino(fh + Fh_ino, inode->i_ino); ++ encode_ino(fh + Fh_dir_ino, parent->d_inode->i_ino); ++ fh[Fh_igen] = inode->i_generation; ++ ++ *max_len -= Fh_tail; ++ fh[Fh_h_type] = exportfs_encode_fh(h_parent, (void *)(fh + Fh_tail), ++ max_len, ++ /*connectable or subtreecheck*/0); ++ err = fh[Fh_h_type]; ++ *max_len += Fh_tail; ++ /* todo: macros? 
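aufs_encode_fh() above lays the handle out as the branch id, the superblock generation, the inode and parent-directory inode numbers, the inode generation and the lower handle type, followed by whatever the lower filesystem's encode_fh() appends at Fh_tail. With CONFIG_AUFS_INO_T_64 an inode number spans two __u32 slots; the sketch below is a userspace round-trip of the encode_ino()/decode_ino() helpers for exactly that case.

/* Build: cc -o fhino fhino.c */
#include <stdint.h>
#include <stdio.h>

/* Same union trick as the patch: a 64-bit inode number viewed as two
 * 32-bit file-handle words (Fh_ino1/Fh_ino2, Fh_dir_ino1/Fh_dir_ino2). */
union conv {
    uint32_t a[2];
    uint64_t ino;
};

static void encode_ino(uint32_t *a, uint64_t ino)
{
    union conv u;

    u.ino = ino;
    a[0] = u.a[0];
    a[1] = u.a[1];
}

static uint64_t decode_ino(const uint32_t *a)
{
    union conv u;

    u.a[0] = a[0];
    u.a[1] = a[1];
    return u.ino;
}

int main(void)
{
    uint32_t fh[2];
    uint64_t ino = 0x123456789abcdef0ULL;

    encode_ino(fh, ino);
    printf("round trip %s (0x%llx)\n",
           decode_ino(fh) == ino ? "ok" : "broken",
           (unsigned long long)decode_ino(fh));
    return 0;
}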
*/ ++ if (err != 255) ++ err = 99; ++ else ++ AuWarn1("%s encode_fh failed\n", au_sbtype(h_sb)); ++ ++out_dput: ++ dput(h_parent); ++out_unlock: ++ di_read_unlock(parent, !AuLock_IR); ++ dput(parent); ++ aufs_read_unlock(dentry, AuLock_IR); ++out: ++ if (unlikely(err < 0)) ++ err = 255; ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_commit_metadata(struct inode *inode) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct super_block *sb; ++ struct inode *h_inode; ++ int (*f)(struct inode *inode); ++ ++ sb = inode->i_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ ii_write_lock_child(inode); ++ bindex = au_ibstart(inode); ++ AuDebugOn(bindex < 0); ++ h_inode = au_h_iptr(inode, bindex); ++ ++ f = h_inode->i_sb->s_export_op->commit_metadata; ++ if (f) ++ err = f(h_inode); ++ else { ++ struct writeback_control wbc = { ++ .sync_mode = WB_SYNC_ALL, ++ .nr_to_write = 0 /* metadata only */ ++ }; ++ ++ err = sync_inode(h_inode, &wbc); ++ } ++ ++ au_cpup_attr_timesizes(inode); ++ ii_write_unlock(inode); ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct export_operations aufs_export_op = { ++ .fh_to_dentry = aufs_fh_to_dentry, ++ /* .fh_to_parent = aufs_fh_to_parent, */ ++ .encode_fh = aufs_encode_fh, ++ .commit_metadata = aufs_commit_metadata ++}; ++ ++void au_export_init(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ __u32 u; ++ ++ sb->s_export_op = &aufs_export_op; ++ sbinfo = au_sbi(sb); ++ sbinfo->si_xigen = NULL; ++ get_random_bytes(&u, sizeof(u)); ++ BUILD_BUG_ON(sizeof(u) != sizeof(int)); ++ atomic_set(&sbinfo->si_xigen_next, u); ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/file.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,676 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * handling file/dir, and address_space operation ++ */ ++ ++#ifdef CONFIG_AUFS_DEBUG ++#include ++#endif ++#include ++#include "aufs.h" ++ ++/* drop flags for writing */ ++unsigned int au_file_roflags(unsigned int flags) ++{ ++ flags &= ~(O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC); ++ flags |= O_RDONLY | O_NOATIME; ++ return flags; ++} ++ ++/* common functions to regular file and dir */ ++struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, ++ struct file *file) ++{ ++ struct file *h_file; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct path h_path; ++ int err, exec_flag; ++ ++ /* a race condition can happen between open and unlink/rmdir */ ++ h_file = ERR_PTR(-ENOENT); ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (au_test_nfsd() && !h_dentry) ++ goto out; ++ h_inode = h_dentry->d_inode; ++ if (au_test_nfsd() && !h_inode) ++ goto out; ++ spin_lock(&h_dentry->d_lock); ++ err = (!d_unhashed(dentry) && d_unlinked(h_dentry)) ++ || !h_inode ++ /* || !dentry->d_inode->i_nlink */ ++ ; ++ spin_unlock(&h_dentry->d_lock); ++ if (unlikely(err)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ br = au_sbr(sb, bindex); ++ h_file = ERR_PTR(-EACCES); ++ exec_flag = flags & __FMODE_EXEC; ++ if (exec_flag && (br->br_mnt->mnt_flags & MNT_NOEXEC)) ++ goto out; ++ ++ /* drop flags for writing */ ++ if (au_test_ro(sb, bindex, dentry->d_inode)) ++ flags = au_file_roflags(flags); ++ flags &= ~O_CREAT; ++ atomic_inc(&br->br_count); ++ h_path.dentry = h_dentry; ++ h_path.mnt = br->br_mnt; ++ if (!au_special_file(h_inode->i_mode)) ++ h_file = vfsub_dentry_open(&h_path, flags); ++ else { ++ /* this block depends upon the configuration */ ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ si_read_unlock(sb); ++ h_file = vfsub_dentry_open(&h_path, flags); ++ si_noflush_read_lock(sb); ++ fi_write_lock(file); ++ di_read_lock_child(dentry, AuLock_IR); ++ } ++ if (IS_ERR(h_file)) ++ goto out_br; ++ ++ if (exec_flag) { ++ err = deny_write_access(h_file); ++ if (unlikely(err)) { ++ fput(h_file); ++ h_file = ERR_PTR(err); ++ goto out_br; ++ } ++ } ++ fsnotify_open(h_file); ++ goto out; /* success */ ++ ++out_br: ++ atomic_dec(&br->br_count); ++out: ++ return h_file; ++} ++ ++int au_do_open(struct file *file, int (*open)(struct file *file, int flags), ++ struct au_fidir *fidir) ++{ ++ int err; ++ struct dentry *dentry; ++ ++ err = au_finfo_init(file, fidir); ++ if (unlikely(err)) ++ goto out; ++ ++ dentry = file->f_dentry; ++ di_read_lock_child(dentry, AuLock_IR); ++ err = open(file, vfsub_file_flags(file)); ++ di_read_unlock(dentry, AuLock_IR); ++ ++ fi_write_unlock(file); ++ if (unlikely(err)) { ++ au_fi(file)->fi_hdir = NULL; ++ au_finfo_fin(file); ++ } ++ ++out: ++ return err; ++} ++ ++int au_reopen_nondir(struct file *file) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ struct dentry *dentry; ++ struct file *h_file, *h_file_tmp; ++ ++ dentry = file->f_dentry; ++ AuDebugOn(au_special_file(dentry->d_inode->i_mode)); ++ bstart = au_dbstart(dentry); ++ h_file_tmp = NULL; ++ if (au_fbstart(file) == bstart) { ++ h_file = au_hf_top(file); ++ if (file->f_mode == h_file->f_mode) ++ return 0; /* success */ ++ h_file_tmp = h_file; ++ get_file(h_file_tmp); ++ au_set_h_fptr(file, bstart, NULL); 
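au_h_open() above falls back to au_file_roflags() whenever the chosen branch is read-only: every flag that implies writing is masked off and the lower file is opened read-only and without atime updates. The tiny userspace check below applies the same mask; _GNU_SOURCE is needed here only so <fcntl.h> exposes O_NOATIME.

/* Build: cc -o roflags roflags.c */
#define _GNU_SOURCE     /* for O_NOATIME */
#include <fcntl.h>
#include <stdio.h>

/* Same transformation as au_file_roflags(): drop the write-implying flags
 * and force a read-only, no-atime open of the lower file. */
static unsigned int file_roflags(unsigned int flags)
{
    flags &= ~(O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC);
    flags |= O_RDONLY | O_NOATIME;
    return flags;
}

int main(void)
{
    unsigned int in = O_RDWR | O_CREAT | O_APPEND | O_NONBLOCK;
    unsigned int out = file_roflags(in);

    printf("0x%x -> 0x%x (writable bits %s)\n", in, out,
           out & (O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC)
           ? "still set" : "cleared");
    return 0;
}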
++ } ++ AuDebugOn(au_fi(file)->fi_hdir); ++ AuDebugOn(au_fbstart(file) < bstart); ++ ++ h_file = au_h_open(dentry, bstart, vfsub_file_flags(file) & ~O_TRUNC, ++ file); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out; /* todo: close all? */ ++ ++ err = 0; ++ au_set_fbstart(file, bstart); ++ au_set_h_fptr(file, bstart, h_file); ++ au_update_figen(file); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ ++out: ++ if (h_file_tmp) ++ fput(h_file_tmp); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_reopen_wh(struct file *file, aufs_bindex_t btgt, ++ struct dentry *hi_wh) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ struct au_dinfo *dinfo; ++ struct dentry *h_dentry; ++ struct au_hdentry *hdp; ++ ++ dinfo = au_di(file->f_dentry); ++ AuRwMustWriteLock(&dinfo->di_rwsem); ++ ++ bstart = dinfo->di_bstart; ++ dinfo->di_bstart = btgt; ++ hdp = dinfo->di_hdentry; ++ h_dentry = hdp[0 + btgt].hd_dentry; ++ hdp[0 + btgt].hd_dentry = hi_wh; ++ err = au_reopen_nondir(file); ++ hdp[0 + btgt].hd_dentry = h_dentry; ++ dinfo->di_bstart = bstart; ++ ++ return err; ++} ++ ++static int au_ready_to_write_wh(struct file *file, loff_t len, ++ aufs_bindex_t bcpup) ++{ ++ int err; ++ struct inode *inode, *h_inode; ++ struct dentry *dentry, *h_dentry, *hi_wh; ++ ++ dentry = file->f_dentry; ++ au_update_dbstart(dentry); ++ inode = dentry->d_inode; ++ h_inode = NULL; ++ if (au_dbstart(dentry) <= bcpup && au_dbend(dentry) >= bcpup) { ++ h_dentry = au_h_dptr(dentry, bcpup); ++ if (h_dentry) ++ h_inode = h_dentry->d_inode; ++ } ++ hi_wh = au_hi_wh(inode, bcpup); ++ if (!hi_wh && !h_inode) ++ err = au_sio_cpup_wh(dentry, bcpup, len, file); ++ else ++ /* already copied-up after unlink */ ++ err = au_reopen_wh(file, bcpup, hi_wh); ++ ++ if (!err ++ && inode->i_nlink > 1 ++ && au_opt_test(au_mntflags(dentry->d_sb), PLINK)) ++ au_plink_append(inode, bcpup, au_h_dptr(dentry, bcpup)); ++ ++ return err; ++} ++ ++/* ++ * prepare the @file for writing. 
++ */ ++int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin) ++{ ++ int err; ++ aufs_bindex_t bstart, bcpup, dbstart; ++ struct dentry *dentry, *parent, *h_dentry; ++ struct inode *h_inode, *inode; ++ struct super_block *sb; ++ struct file *h_file; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ AuDebugOn(au_special_file(inode->i_mode)); ++ bstart = au_fbstart(file); ++ err = au_test_ro(sb, bstart, inode); ++ if (!err && (au_hf_top(file)->f_mode & FMODE_WRITE)) { ++ err = au_pin(pin, dentry, bstart, AuOpt_UDBA_NONE, /*flags*/0); ++ goto out; ++ } ++ ++ /* need to cpup or reopen */ ++ parent = dget_parent(dentry); ++ di_write_lock_parent(parent); ++ err = AuWbrCopyup(au_sbi(sb), dentry); ++ bcpup = err; ++ if (unlikely(err < 0)) ++ goto out_dgrade; ++ err = 0; ++ ++ if (!d_unhashed(dentry) && !au_h_dptr(parent, bcpup)) { ++ err = au_cpup_dirs(dentry, bcpup); ++ if (unlikely(err)) ++ goto out_dgrade; ++ } ++ ++ err = au_pin(pin, dentry, bcpup, AuOpt_UDBA_NONE, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out_dgrade; ++ ++ h_dentry = au_hf_top(file)->f_dentry; ++ h_inode = h_dentry->d_inode; ++ dbstart = au_dbstart(dentry); ++ if (dbstart <= bcpup) { ++ h_dentry = au_h_dptr(dentry, bcpup); ++ AuDebugOn(!h_dentry); ++ h_inode = h_dentry->d_inode; ++ AuDebugOn(!h_inode); ++ bstart = bcpup; ++ } ++ ++ if (dbstart <= bcpup /* just reopen */ ++ || !d_unhashed(dentry) /* copyup and reopen */ ++ ) { ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ h_file = au_h_open_pre(dentry, bstart); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ h_file = NULL; ++ } else { ++ di_downgrade_lock(parent, AuLock_IR); ++ if (dbstart > bcpup) ++ err = au_sio_cpup_simple(dentry, bcpup, len, ++ AuCpup_DTIME); ++ if (!err) ++ err = au_reopen_nondir(file); ++ } ++ mutex_unlock(&h_inode->i_mutex); ++ au_h_open_post(dentry, bstart, h_file); ++ } else { /* copyup as wh and reopen */ ++ /* ++ * since writable hfsplus branch is not supported, ++ * h_open_pre/post() are unnecessary. 
++ */ ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ err = au_ready_to_write_wh(file, len, bcpup); ++ di_downgrade_lock(parent, AuLock_IR); ++ mutex_unlock(&h_inode->i_mutex); ++ } ++ ++ if (!err) { ++ au_pin_set_parent_lflag(pin, /*lflag*/0); ++ goto out_dput; /* success */ ++ } ++ au_unpin(pin); ++ goto out_unlock; ++ ++out_dgrade: ++ di_downgrade_lock(parent, AuLock_IR); ++out_unlock: ++ di_read_unlock(parent, AuLock_IR); ++out_dput: ++ dput(parent); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_do_flush(struct file *file, fl_owner_t id, ++ int (*flush)(struct file *file, fl_owner_t id)) ++{ ++ int err; ++ struct dentry *dentry; ++ struct super_block *sb; ++ struct inode *inode; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ si_noflush_read_lock(sb); ++ fi_read_lock(file); ++ ii_read_lock_child(inode); ++ ++ err = flush(file, id); ++ au_cpup_attr_timesizes(inode); ++ ++ ii_read_unlock(inode); ++ fi_read_unlock(file); ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_file_refresh_by_inode(struct file *file, int *need_reopen) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ struct au_pin pin; ++ struct au_finfo *finfo; ++ struct dentry *dentry, *parent, *hi_wh; ++ struct inode *inode; ++ struct super_block *sb; ++ ++ FiMustWriteLock(file); ++ ++ err = 0; ++ finfo = au_fi(file); ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ bstart = au_ibstart(inode); ++ if (bstart == finfo->fi_btop || IS_ROOT(dentry)) ++ goto out; ++ ++ parent = dget_parent(dentry); ++ if (au_test_ro(sb, bstart, inode)) { ++ di_read_lock_parent(parent, !AuLock_IR); ++ err = AuWbrCopyup(au_sbi(sb), dentry); ++ bstart = err; ++ di_read_unlock(parent, !AuLock_IR); ++ if (unlikely(err < 0)) ++ goto out_parent; ++ err = 0; ++ } ++ ++ di_read_lock_parent(parent, AuLock_IR); ++ hi_wh = au_hi_wh(inode, bstart); ++ if (!S_ISDIR(inode->i_mode) ++ && au_opt_test(au_mntflags(sb), PLINK) ++ && au_plink_test(inode) ++ && !d_unhashed(dentry)) { ++ err = au_test_and_cpup_dirs(dentry, bstart); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ /* always superio. 
*/ ++ err = au_pin(&pin, dentry, bstart, AuOpt_UDBA_NONE, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (!err) ++ err = au_sio_cpup_simple(dentry, bstart, -1, ++ AuCpup_DTIME); ++ au_unpin(&pin); ++ } else if (hi_wh) { ++ /* already copied-up after unlink */ ++ err = au_reopen_wh(file, bstart, hi_wh); ++ *need_reopen = 0; ++ } ++ ++out_unlock: ++ di_read_unlock(parent, AuLock_IR); ++out_parent: ++ dput(parent); ++out: ++ return err; ++} ++ ++static void au_do_refresh_dir(struct file *file) ++{ ++ aufs_bindex_t bindex, bend, new_bindex, brid; ++ struct au_hfile *p, tmp, *q; ++ struct au_finfo *finfo; ++ struct super_block *sb; ++ struct au_fidir *fidir; ++ ++ FiMustWriteLock(file); ++ ++ sb = file->f_dentry->d_sb; ++ finfo = au_fi(file); ++ fidir = finfo->fi_hdir; ++ AuDebugOn(!fidir); ++ p = fidir->fd_hfile + finfo->fi_btop; ++ brid = p->hf_br->br_id; ++ bend = fidir->fd_bbot; ++ for (bindex = finfo->fi_btop; bindex <= bend; bindex++, p++) { ++ if (!p->hf_file) ++ continue; ++ ++ new_bindex = au_br_index(sb, p->hf_br->br_id); ++ if (new_bindex == bindex) ++ continue; ++ if (new_bindex < 0) { ++ au_set_h_fptr(file, bindex, NULL); ++ continue; ++ } ++ ++ /* swap two lower inode, and loop again */ ++ q = fidir->fd_hfile + new_bindex; ++ tmp = *q; ++ *q = *p; ++ *p = tmp; ++ if (tmp.hf_file) { ++ bindex--; ++ p--; ++ } ++ } ++ ++ p = fidir->fd_hfile; ++ if (!au_test_mmapped(file) && !d_unlinked(file->f_dentry)) { ++ bend = au_sbend(sb); ++ for (finfo->fi_btop = 0; finfo->fi_btop <= bend; ++ finfo->fi_btop++, p++) ++ if (p->hf_file) { ++ if (p->hf_file->f_dentry ++ && p->hf_file->f_dentry->d_inode) ++ break; ++ else ++ au_hfput(p, file); ++ } ++ } else { ++ bend = au_br_index(sb, brid); ++ for (finfo->fi_btop = 0; finfo->fi_btop < bend; ++ finfo->fi_btop++, p++) ++ if (p->hf_file) ++ au_hfput(p, file); ++ bend = au_sbend(sb); ++ } ++ ++ p = fidir->fd_hfile + bend; ++ for (fidir->fd_bbot = bend; fidir->fd_bbot >= finfo->fi_btop; ++ fidir->fd_bbot--, p--) ++ if (p->hf_file) { ++ if (p->hf_file->f_dentry ++ && p->hf_file->f_dentry->d_inode) ++ break; ++ else ++ au_hfput(p, file); ++ } ++ AuDebugOn(fidir->fd_bbot < finfo->fi_btop); ++} ++ ++/* ++ * after branch manipulating, refresh the file. 
++ */ ++static int refresh_file(struct file *file, int (*reopen)(struct file *file)) ++{ ++ int err, need_reopen; ++ aufs_bindex_t bend, bindex; ++ struct dentry *dentry; ++ struct au_finfo *finfo; ++ struct au_hfile *hfile; ++ ++ dentry = file->f_dentry; ++ finfo = au_fi(file); ++ if (!finfo->fi_hdir) { ++ hfile = &finfo->fi_htop; ++ AuDebugOn(!hfile->hf_file); ++ bindex = au_br_index(dentry->d_sb, hfile->hf_br->br_id); ++ AuDebugOn(bindex < 0); ++ if (bindex != finfo->fi_btop) ++ au_set_fbstart(file, bindex); ++ } else { ++ err = au_fidir_realloc(finfo, au_sbend(dentry->d_sb) + 1); ++ if (unlikely(err)) ++ goto out; ++ au_do_refresh_dir(file); ++ } ++ ++ err = 0; ++ need_reopen = 1; ++ if (!au_test_mmapped(file)) ++ err = au_file_refresh_by_inode(file, &need_reopen); ++ if (!err && need_reopen && !d_unlinked(dentry)) ++ err = reopen(file); ++ if (!err) { ++ au_update_figen(file); ++ goto out; /* success */ ++ } ++ ++ /* error, close all lower files */ ++ if (finfo->fi_hdir) { ++ bend = au_fbend_dir(file); ++ for (bindex = au_fbstart(file); bindex <= bend; bindex++) ++ au_set_h_fptr(file, bindex, NULL); ++ } ++ ++out: ++ return err; ++} ++ ++/* common function to regular file and dir */ ++int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), ++ int wlock) ++{ ++ int err; ++ unsigned int sigen, figen; ++ aufs_bindex_t bstart; ++ unsigned char pseudo_link; ++ struct dentry *dentry; ++ struct inode *inode; ++ ++ err = 0; ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++ AuDebugOn(au_special_file(inode->i_mode)); ++ sigen = au_sigen(dentry->d_sb); ++ fi_write_lock(file); ++ figen = au_figen(file); ++ di_write_lock_child(dentry); ++ bstart = au_dbstart(dentry); ++ pseudo_link = (bstart != au_ibstart(inode)); ++ if (sigen == figen && !pseudo_link && au_fbstart(file) == bstart) { ++ if (!wlock) { ++ di_downgrade_lock(dentry, AuLock_IR); ++ fi_downgrade_lock(file); ++ } ++ goto out; /* success */ ++ } ++ ++ AuDbg("sigen %d, figen %d\n", sigen, figen); ++ if (au_digen_test(dentry, sigen)) { ++ err = au_reval_dpath(dentry, sigen); ++ AuDebugOn(!err && au_digen_test(dentry, sigen)); ++ } ++ ++ if (!err) ++ err = refresh_file(file, reopen); ++ if (!err) { ++ if (!wlock) { ++ di_downgrade_lock(dentry, AuLock_IR); ++ fi_downgrade_lock(file); ++ } ++ } else { ++ di_write_unlock(dentry); ++ fi_write_unlock(file); ++ } ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* cf. aufs_nopage() */ ++/* for madvise(2) */ ++static int aufs_readpage(struct file *file __maybe_unused, struct page *page) ++{ ++ unlock_page(page); ++ return 0; ++} ++ ++/* it will never be called, but necessary to support O_DIRECT */ ++static ssize_t aufs_direct_IO(int rw, struct kiocb *iocb, ++ const struct iovec *iov, loff_t offset, ++ unsigned long nr_segs) ++{ BUG(); return 0; } ++ ++/* ++ * it will never be called, but madvise and fadvise behaves differently ++ * when get_xip_mem is defined ++ */ ++static int aufs_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, ++ int create, void **kmem, unsigned long *pfn) ++{ BUG(); return 0; } ++ ++/* they will never be called. 
*/ ++#ifdef CONFIG_AUFS_DEBUG ++static int aufs_write_begin(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned flags, ++ struct page **pagep, void **fsdata) ++{ AuUnsupport(); return 0; } ++static int aufs_write_end(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned copied, ++ struct page *page, void *fsdata) ++{ AuUnsupport(); return 0; } ++static int aufs_writepage(struct page *page, struct writeback_control *wbc) ++{ AuUnsupport(); return 0; } ++ ++static int aufs_set_page_dirty(struct page *page) ++{ AuUnsupport(); return 0; } ++static void aufs_invalidatepage(struct page *page, unsigned long offset) ++{ AuUnsupport(); } ++static int aufs_releasepage(struct page *page, gfp_t gfp) ++{ AuUnsupport(); return 0; } ++static int aufs_migratepage(struct address_space *mapping, struct page *newpage, ++ struct page *page, enum migrate_mode mode) ++{ AuUnsupport(); return 0; } ++static int aufs_launder_page(struct page *page) ++{ AuUnsupport(); return 0; } ++static int aufs_is_partially_uptodate(struct page *page, ++ read_descriptor_t *desc, ++ unsigned long from) ++{ AuUnsupport(); return 0; } ++static int aufs_error_remove_page(struct address_space *mapping, ++ struct page *page) ++{ AuUnsupport(); return 0; } ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++const struct address_space_operations aufs_aop = { ++ .readpage = aufs_readpage, ++ .direct_IO = aufs_direct_IO, ++ .get_xip_mem = aufs_get_xip_mem, ++#ifdef CONFIG_AUFS_DEBUG ++ .writepage = aufs_writepage, ++ /* no writepages, because of writepage */ ++ .set_page_dirty = aufs_set_page_dirty, ++ /* no readpages, because of readpage */ ++ .write_begin = aufs_write_begin, ++ .write_end = aufs_write_end, ++ /* no bmap, no block device */ ++ .invalidatepage = aufs_invalidatepage, ++ .releasepage = aufs_releasepage, ++ .migratepage = aufs_migratepage, ++ .launder_page = aufs_launder_page, ++ .is_partially_uptodate = aufs_is_partially_uptodate, ++ .error_remove_page = aufs_error_remove_page ++#endif /* CONFIG_AUFS_DEBUG */ ++}; +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/file.h 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,298 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * file operations ++ */ ++ ++#ifndef __AUFS_FILE_H__ ++#define __AUFS_FILE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++#include "rwsem.h" ++ ++struct au_branch; ++struct au_hfile { ++ struct file *hf_file; ++ struct au_branch *hf_br; ++}; ++ ++struct au_vdir; ++struct au_fidir { ++ aufs_bindex_t fd_bbot; ++ aufs_bindex_t fd_nent; ++ struct au_vdir *fd_vdir_cache; ++ struct au_hfile fd_hfile[]; ++}; ++ ++static inline int au_fidir_sz(int nent) ++{ ++ AuDebugOn(nent < 0); ++ return sizeof(struct au_fidir) + sizeof(struct au_hfile) * nent; ++} ++ ++struct au_finfo { ++ atomic_t fi_generation; ++ ++ struct au_rwsem fi_rwsem; ++ aufs_bindex_t fi_btop; ++ ++ /* do not union them */ ++ struct { /* for non-dir */ ++ struct au_hfile fi_htop; ++ atomic_t fi_mmapped; ++ }; ++ struct au_fidir *fi_hdir; /* for dir only */ ++} ____cacheline_aligned_in_smp; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* file.c */ ++extern const struct address_space_operations aufs_aop; ++unsigned int au_file_roflags(unsigned int flags); ++struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, ++ struct file *file); ++int au_do_open(struct file *file, int (*open)(struct file *file, int flags), ++ struct au_fidir *fidir); ++int au_reopen_nondir(struct file *file); ++struct au_pin; ++int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin); ++int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), ++ int wlock); ++int au_do_flush(struct file *file, fl_owner_t id, ++ int (*flush)(struct file *file, fl_owner_t id)); ++ ++/* poll.c */ ++#ifdef CONFIG_AUFS_POLL ++unsigned int aufs_poll(struct file *file, poll_table *wait); ++#endif ++ ++#ifdef CONFIG_AUFS_BR_HFSPLUS ++/* hfsplus.c */ ++struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex); ++void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex, ++ struct file *h_file); ++#else ++static inline ++struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ return NULL; ++} ++ ++AuStubVoid(au_h_open_post, struct dentry *dentry, aufs_bindex_t bindex, ++ struct file *h_file); ++#endif ++ ++/* f_op.c */ ++extern const struct file_operations aufs_file_fop; ++int au_do_open_nondir(struct file *file, int flags); ++int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file); ++ ++#ifdef CONFIG_AUFS_SP_IATTR ++/* f_op_sp.c */ ++int au_special_file(umode_t mode); ++void au_init_special_fop(struct inode *inode, umode_t mode, dev_t rdev); ++#else ++AuStubInt0(au_special_file, umode_t mode) ++static inline void au_init_special_fop(struct inode *inode, umode_t mode, ++ dev_t rdev) ++{ ++ init_special_inode(inode, mode, rdev); ++} ++#endif ++ ++/* finfo.c */ ++void au_hfput(struct au_hfile *hf, struct file *file); ++void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, ++ struct file *h_file); ++ ++void au_update_figen(struct file *file); ++struct au_fidir *au_fidir_alloc(struct super_block *sb); ++int au_fidir_realloc(struct au_finfo *finfo, int nbr); ++ ++void au_fi_init_once(void *_fi); ++void au_finfo_fin(struct file *file); ++int au_finfo_init(struct file *file, struct au_fidir *fidir); ++ ++/* ioctl.c */ ++long aufs_ioctl_nondir(struct file *file, unsigned int 
cmd, unsigned long arg); ++#ifdef CONFIG_COMPAT ++long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd, ++ unsigned long arg); ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_finfo *au_fi(struct file *file) ++{ ++ return file->private_data; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * fi_read_lock, fi_write_lock, ++ * fi_read_unlock, fi_write_unlock, fi_downgrade_lock ++ */ ++AuSimpleRwsemFuncs(fi, struct file *f, &au_fi(f)->fi_rwsem); ++ ++#define FiMustNoWaiters(f) AuRwMustNoWaiters(&au_fi(f)->fi_rwsem) ++#define FiMustAnyLock(f) AuRwMustAnyLock(&au_fi(f)->fi_rwsem) ++#define FiMustWriteLock(f) AuRwMustWriteLock(&au_fi(f)->fi_rwsem) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: hard/soft set? */ ++static inline aufs_bindex_t au_fbstart(struct file *file) ++{ ++ FiMustAnyLock(file); ++ return au_fi(file)->fi_btop; ++} ++ ++static inline aufs_bindex_t au_fbend_dir(struct file *file) ++{ ++ FiMustAnyLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ return au_fi(file)->fi_hdir->fd_bbot; ++} ++ ++static inline struct au_vdir *au_fvdir_cache(struct file *file) ++{ ++ FiMustAnyLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ return au_fi(file)->fi_hdir->fd_vdir_cache; ++} ++ ++static inline void au_set_fbstart(struct file *file, aufs_bindex_t bindex) ++{ ++ FiMustWriteLock(file); ++ au_fi(file)->fi_btop = bindex; ++} ++ ++static inline void au_set_fbend_dir(struct file *file, aufs_bindex_t bindex) ++{ ++ FiMustWriteLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ au_fi(file)->fi_hdir->fd_bbot = bindex; ++} ++ ++static inline void au_set_fvdir_cache(struct file *file, ++ struct au_vdir *vdir_cache) ++{ ++ FiMustWriteLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ au_fi(file)->fi_hdir->fd_vdir_cache = vdir_cache; ++} ++ ++static inline struct file *au_hf_top(struct file *file) ++{ ++ FiMustAnyLock(file); ++ AuDebugOn(au_fi(file)->fi_hdir); ++ return au_fi(file)->fi_htop.hf_file; ++} ++ ++static inline struct file *au_hf_dir(struct file *file, aufs_bindex_t bindex) ++{ ++ FiMustAnyLock(file); ++ AuDebugOn(!au_fi(file)->fi_hdir); ++ return au_fi(file)->fi_hdir->fd_hfile[0 + bindex].hf_file; ++} ++ ++/* todo: memory barrier? 
*/ ++static inline unsigned int au_figen(struct file *f) ++{ ++ return atomic_read(&au_fi(f)->fi_generation); ++} ++ ++static inline void au_set_mmapped(struct file *f) ++{ ++ if (atomic_inc_return(&au_fi(f)->fi_mmapped)) ++ return; ++ pr_warning("fi_mmapped wrapped around\n"); ++ while (!atomic_inc_return(&au_fi(f)->fi_mmapped)) ++ ; ++} ++ ++static inline void au_unset_mmapped(struct file *f) ++{ ++ atomic_dec(&au_fi(f)->fi_mmapped); ++} ++ ++static inline int au_test_mmapped(struct file *f) ++{ ++ return atomic_read(&au_fi(f)->fi_mmapped); ++} ++ ++/* customize vma->vm_file */ ++ ++static inline void au_do_vm_file_reset(struct vm_area_struct *vma, ++ struct file *file) ++{ ++ struct file *f; ++ ++ f = vma->vm_file; ++ get_file(file); ++ vma->vm_file = file; ++ fput(f); ++} ++ ++#ifdef CONFIG_MMU ++#define AuDbgVmRegion(file, vma) do {} while (0) ++ ++static inline void au_vm_file_reset(struct vm_area_struct *vma, ++ struct file *file) ++{ ++ au_do_vm_file_reset(vma, file); ++} ++#else ++#define AuDbgVmRegion(file, vma) \ ++ AuDebugOn((vma)->vm_region && (vma)->vm_region->vm_file != (file)) ++ ++static inline void au_vm_file_reset(struct vm_area_struct *vma, ++ struct file *file) ++{ ++ struct file *f; ++ ++ au_do_vm_file_reset(vma, file); ++ f = vma->vm_region->vm_file; ++ get_file(file); ++ vma->vm_region->vm_file = file; ++ fput(f); ++} ++#endif /* CONFIG_MMU */ ++ ++/* handle vma->vm_prfile */ ++static inline void au_vm_prfile_set(struct vm_area_struct *vma, ++ struct file *file) ++{ ++#ifdef CONFIG_AUFS_PROC_MAP ++ get_file(file); ++ vma->vm_prfile = file; ++#ifndef CONFIG_MMU ++ get_file(file); ++ vma->vm_region->vm_prfile = file; ++#endif ++#endif ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_FILE_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/finfo.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,156 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * file private data ++ */ ++ ++#include "aufs.h" ++ ++void au_hfput(struct au_hfile *hf, struct file *file) ++{ ++ /* todo: direct access f_flags */ ++ if (vfsub_file_flags(file) & __FMODE_EXEC) ++ allow_write_access(hf->hf_file); ++ fput(hf->hf_file); ++ hf->hf_file = NULL; ++ atomic_dec(&hf->hf_br->br_count); ++ hf->hf_br = NULL; ++} ++ ++void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, struct file *val) ++{ ++ struct au_finfo *finfo = au_fi(file); ++ struct au_hfile *hf; ++ struct au_fidir *fidir; ++ ++ fidir = finfo->fi_hdir; ++ if (!fidir) { ++ AuDebugOn(finfo->fi_btop != bindex); ++ hf = &finfo->fi_htop; ++ } else ++ hf = fidir->fd_hfile + bindex; ++ ++ if (hf && hf->hf_file) ++ au_hfput(hf, file); ++ if (val) { ++ FiMustWriteLock(file); ++ hf->hf_file = val; ++ hf->hf_br = au_sbr(file->f_dentry->d_sb, bindex); ++ } ++} ++ ++void au_update_figen(struct file *file) ++{ ++ atomic_set(&au_fi(file)->fi_generation, au_digen(file->f_dentry)); ++ /* smp_mb(); */ /* atomic_set */ ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_fidir *au_fidir_alloc(struct super_block *sb) ++{ ++ struct au_fidir *fidir; ++ int nbr; ++ ++ nbr = au_sbend(sb) + 1; ++ if (nbr < 2) ++ nbr = 2; /* initial allocate for 2 branches */ ++ fidir = kzalloc(au_fidir_sz(nbr), GFP_NOFS); ++ if (fidir) { ++ fidir->fd_bbot = -1; ++ fidir->fd_nent = nbr; ++ fidir->fd_vdir_cache = NULL; ++ } ++ ++ return fidir; ++} ++ ++int au_fidir_realloc(struct au_finfo *finfo, int nbr) ++{ ++ int err; ++ struct au_fidir *fidir, *p; ++ ++ AuRwMustWriteLock(&finfo->fi_rwsem); ++ fidir = finfo->fi_hdir; ++ AuDebugOn(!fidir); ++ ++ err = -ENOMEM; ++ p = au_kzrealloc(fidir, au_fidir_sz(fidir->fd_nent), au_fidir_sz(nbr), ++ GFP_NOFS); ++ if (p) { ++ p->fd_nent = nbr; ++ finfo->fi_hdir = p; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_finfo_fin(struct file *file) ++{ ++ struct au_finfo *finfo; ++ ++ au_nfiles_dec(file->f_dentry->d_sb); ++ ++ finfo = au_fi(file); ++ AuDebugOn(finfo->fi_hdir); ++ AuRwDestroy(&finfo->fi_rwsem); ++ au_cache_free_finfo(finfo); ++} ++ ++void au_fi_init_once(void *_finfo) ++{ ++ struct au_finfo *finfo = _finfo; ++ static struct lock_class_key aufs_fi; ++ ++ au_rw_init(&finfo->fi_rwsem); ++ au_rw_class(&finfo->fi_rwsem, &aufs_fi); ++} ++ ++int au_finfo_init(struct file *file, struct au_fidir *fidir) ++{ ++ int err, lc_idx; ++ struct au_finfo *finfo; ++ struct dentry *dentry; ++ ++ err = -ENOMEM; ++ dentry = file->f_dentry; ++ finfo = au_cache_alloc_finfo(); ++ if (unlikely(!finfo)) ++ goto out; ++ ++ err = 0; ++ au_nfiles_inc(dentry->d_sb); ++ lc_idx = AuLcNonDir_FIINFO; ++ if (fidir) ++ lc_idx = AuLcDir_FIINFO; ++ au_rw_class(&finfo->fi_rwsem, au_lc_key + lc_idx); ++ au_rw_write_lock(&finfo->fi_rwsem); ++ finfo->fi_btop = -1; ++ finfo->fi_hdir = fidir; ++ atomic_set(&finfo->fi_generation, au_digen(dentry)); ++ /* smp_mb(); */ /* atomic_set */ ++ ++ file->private_data = finfo; ++ ++out: ++ return err; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/f_op.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,729 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. 
Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * file and vm operations ++ */ ++ ++#include ++#include ++#include ++#include "aufs.h" ++ ++int au_do_open_nondir(struct file *file, int flags) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct file *h_file; ++ struct dentry *dentry; ++ struct au_finfo *finfo; ++ ++ FiMustWriteLock(file); ++ ++ dentry = file->f_dentry; ++ err = au_d_alive(dentry); ++ if (unlikely(err)) ++ goto out; ++ ++ finfo = au_fi(file); ++ memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop)); ++ atomic_set(&finfo->fi_mmapped, 0); ++ bindex = au_dbstart(dentry); ++ h_file = au_h_open(dentry, bindex, flags, file); ++ if (IS_ERR(h_file)) ++ err = PTR_ERR(h_file); ++ else { ++ au_set_fbstart(file, bindex); ++ au_set_h_fptr(file, bindex, h_file); ++ au_update_figen(file); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ } ++ ++out: ++ return err; ++} ++ ++static int aufs_open_nondir(struct inode *inode __maybe_unused, ++ struct file *file) ++{ ++ int err; ++ struct super_block *sb; ++ ++ AuDbg("%.*s, f_flags 0x%x, f_mode 0x%x\n", ++ AuDLNPair(file->f_dentry), vfsub_file_flags(file), ++ file->f_mode); ++ ++ sb = file->f_dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_do_open(file, au_do_open_nondir, /*fidir*/NULL); ++ si_read_unlock(sb); ++ return err; ++} ++ ++int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file) ++{ ++ struct au_finfo *finfo; ++ aufs_bindex_t bindex; ++ ++ finfo = au_fi(file); ++ bindex = finfo->fi_btop; ++ if (bindex >= 0) { ++ /* remove me from sb->s_files */ ++ file_sb_list_del(file); ++ au_set_h_fptr(file, bindex, NULL); ++ } ++ ++ au_finfo_fin(file); ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_do_flush_nondir(struct file *file, fl_owner_t id) ++{ ++ int err; ++ struct file *h_file; ++ ++ err = 0; ++ h_file = au_hf_top(file); ++ if (h_file) ++ err = vfsub_flush(h_file, id); ++ return err; ++} ++ ++static int aufs_flush_nondir(struct file *file, fl_owner_t id) ++{ ++ return au_do_flush(file, id, au_do_flush_nondir); ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * read and write functions acquire [fdi]_rwsem once, but release before ++ * mmap_sem. This is because to stop a race condition between mmap(2). ++ * Releasing these aufs-rwsem should be safe, no branch-mamagement (by keeping ++ * si_rwsem), no harmful copy-up should happen. Actually copy-up may happen in ++ * read functions after [fdi]_rwsem are released, but it should be harmless. 
++ */ ++ ++static ssize_t aufs_read(struct file *file, char __user *buf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ struct dentry *dentry; ++ struct file *h_file; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ ++ /* filedata may be obsoleted by concurrent copyup, but no problem */ ++ err = vfsub_read_u(h_file, buf, count, ppos); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ /* update without lock, I don't think it a problem */ ++ fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); ++ fput(h_file); ++ ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ++ * todo: very ugly ++ * it locks both of i_mutex and si_rwsem for read in safe. ++ * if the plink maintenance mode continues forever (that is the problem), ++ * may loop forever. ++ */ ++static void au_mtx_and_read_lock(struct inode *inode) ++{ ++ int err; ++ struct super_block *sb = inode->i_sb; ++ ++ while (1) { ++ mutex_lock(&inode->i_mutex); ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (!err) ++ break; ++ mutex_unlock(&inode->i_mutex); ++ si_read_lock(sb, AuLock_NOPLMW); ++ si_read_unlock(sb); ++ } ++} ++ ++static ssize_t aufs_write(struct file *file, const char __user *ubuf, ++ size_t count, loff_t *ppos) ++{ ++ ssize_t err; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct super_block *sb; ++ struct inode *inode; ++ struct file *h_file; ++ char __user *buf = (char __user *)ubuf; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ au_mtx_and_read_lock(inode); ++ ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) { ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ goto out; ++ } ++ ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ au_unpin(&pin); ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ ++ err = vfsub_write_u(h_file, buf, count, ppos); ++ ii_write_lock_child(inode); ++ au_cpup_attr_timesizes(inode); ++ inode->i_mode = h_file->f_dentry->d_inode->i_mode; ++ ii_write_unlock(inode); ++ fput(h_file); ++ ++out: ++ si_read_unlock(sb); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++ ++static ssize_t au_do_aio(struct file *h_file, int rw, struct kiocb *kio, ++ const struct iovec *iov, unsigned long nv, loff_t pos) ++{ ++ ssize_t err; ++ struct file *file; ++ ssize_t (*func)(struct kiocb *, const struct iovec *, unsigned long, ++ loff_t); ++ ++ err = security_file_permission(h_file, rw); ++ if (unlikely(err)) ++ goto out; ++ ++ err = -ENOSYS; ++ func = NULL; ++ if (rw == MAY_READ) ++ func = h_file->f_op->aio_read; ++ else if (rw == MAY_WRITE) ++ func = h_file->f_op->aio_write; ++ if (func) { ++ file = kio->ki_filp; ++ kio->ki_filp = h_file; ++ lockdep_off(); ++ err = func(kio, iov, nv, pos); ++ lockdep_on(); ++ kio->ki_filp = file; ++ } else ++ /* currently there is no such fs */ ++ WARN_ON_ONCE(1); ++ ++out: ++ return err; ++} ++ ++static ssize_t aufs_aio_read(struct kiocb *kio, const struct iovec *iov, ++ unsigned long nv, loff_t pos) ++{ ++ ssize_t err; ++ struct file *file, *h_file; ++ struct dentry *dentry; ++ struct 
super_block *sb; ++ ++ file = kio->ki_filp; ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ ++ err = au_do_aio(h_file, MAY_READ, kio, iov, nv, pos); ++ /* todo: necessary? */ ++ /* file->f_ra = h_file->f_ra; */ ++ /* update without lock, I don't think it a problem */ ++ fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); ++ fput(h_file); ++ ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++static ssize_t aufs_aio_write(struct kiocb *kio, const struct iovec *iov, ++ unsigned long nv, loff_t pos) ++{ ++ ssize_t err; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct file *file, *h_file; ++ struct super_block *sb; ++ ++ file = kio->ki_filp; ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ au_mtx_and_read_lock(inode); ++ ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) { ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ goto out; ++ } ++ ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ au_unpin(&pin); ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ ++ err = au_do_aio(h_file, MAY_WRITE, kio, iov, nv, pos); ++ ii_write_lock_child(inode); ++ au_cpup_attr_timesizes(inode); ++ inode->i_mode = h_file->f_dentry->d_inode->i_mode; ++ ii_write_unlock(inode); ++ fput(h_file); ++ ++out: ++ si_read_unlock(sb); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++ ++static ssize_t aufs_splice_read(struct file *file, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) ++{ ++ ssize_t err; ++ struct file *h_file; ++ struct dentry *dentry; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ err = -EINVAL; ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ if (au_test_loopback_kthread()) { ++ au_warn_loopback(h_file->f_dentry->d_sb); ++ if (file->f_mapping != h_file->f_mapping) { ++ file->f_mapping = h_file->f_mapping; ++ smp_mb(); /* unnecessary? */ ++ } ++ } ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ ++ err = vfsub_splice_to(h_file, ppos, pipe, len, flags); ++ /* todo: necessasry? 
*/ ++ /* file->f_ra = h_file->f_ra; */ ++ /* update without lock, I don't think it a problem */ ++ fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); ++ fput(h_file); ++ ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++static ssize_t ++aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos, ++ size_t len, unsigned int flags) ++{ ++ ssize_t err; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct file *h_file; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ au_mtx_and_read_lock(inode); ++ ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) { ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ goto out; ++ } ++ ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ au_unpin(&pin); ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ ++ err = vfsub_splice_from(pipe, h_file, ppos, len, flags); ++ ii_write_lock_child(inode); ++ au_cpup_attr_timesizes(inode); ++ inode->i_mode = h_file->f_dentry->d_inode->i_mode; ++ ii_write_unlock(inode); ++ fput(h_file); ++ ++out: ++ si_read_unlock(sb); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * The locking order around current->mmap_sem. ++ * - in most and regular cases ++ * file I/O syscall -- aufs_read() or something ++ * -- si_rwsem for read -- mmap_sem ++ * (Note that [fdi]i_rwsem are released before mmap_sem). ++ * - in mmap case ++ * mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem ++ * This AB-BA order is definitly bad, but is not a problem since "si_rwsem for ++ * read" allows muliple processes to acquire it and [fdi]i_rwsem are not held in ++ * file I/O. Aufs needs to stop lockdep in aufs_mmap() though. ++ * It means that when aufs acquires si_rwsem for write, the process should never ++ * acquire mmap_sem. ++ * ++ * Actually aufs_readdir() holds [fdi]i_rwsem before mmap_sem, but this is not a ++ * problem either since any directory is not able to be mmap-ed. ++ * The similar scenario is applied to aufs_readlink() too. ++ */ ++ ++/* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */ ++#define AuConv_VM_PROT(f, b) _calc_vm_trans(f, VM_##b, PROT_##b) ++ ++static unsigned long au_arch_prot_conv(unsigned long flags) ++{ ++ /* currently ppc64 only */ ++#ifdef CONFIG_PPC64 ++ /* cf. linux/arch/powerpc/include/asm/mman.h */ ++ AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO); ++ return AuConv_VM_PROT(flags, SAO); ++#else ++ AuDebugOn(arch_calc_vm_prot_bits(-1)); ++ return 0; ++#endif ++} ++ ++static unsigned long au_prot_conv(unsigned long flags) ++{ ++ return AuConv_VM_PROT(flags, READ) ++ | AuConv_VM_PROT(flags, WRITE) ++ | AuConv_VM_PROT(flags, EXEC) ++ | au_arch_prot_conv(flags); ++} ++ ++/* cf. 
linux/include/linux/mman.h: calc_vm_flag_bits() */ ++#define AuConv_VM_MAP(f, b) _calc_vm_trans(f, VM_##b, MAP_##b) ++ ++static unsigned long au_flag_conv(unsigned long flags) ++{ ++ return AuConv_VM_MAP(flags, GROWSDOWN) ++ | AuConv_VM_MAP(flags, DENYWRITE) ++ | AuConv_VM_MAP(flags, EXECUTABLE) ++ | AuConv_VM_MAP(flags, LOCKED); ++} ++ ++static int aufs_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ int err; ++ unsigned long prot; ++ aufs_bindex_t bstart; ++ const unsigned char wlock ++ = (file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED); ++ struct dentry *dentry; ++ struct super_block *sb; ++ struct file *h_file; ++ struct au_branch *br; ++ struct au_pin pin; ++ ++ AuDbgVmRegion(file, vma); ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ lockdep_off(); ++ si_read_lock(sb, AuLock_NOPLMW); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ if (wlock) { ++ err = au_ready_to_write(file, -1, &pin); ++ di_write_unlock(dentry); ++ if (unlikely(err)) { ++ fi_write_unlock(file); ++ goto out; ++ } ++ au_unpin(&pin); ++ } else ++ di_write_unlock(dentry); ++ ++ bstart = au_fbstart(file); ++ br = au_sbr(sb, bstart); ++ h_file = au_hf_top(file); ++ get_file(h_file); ++ au_set_mmapped(file); ++ fi_write_unlock(file); ++ lockdep_on(); ++ ++ au_vm_file_reset(vma, h_file); ++ prot = au_prot_conv(vma->vm_flags); ++ err = security_file_mmap(h_file, /*reqprot*/prot, prot, ++ au_flag_conv(vma->vm_flags), vma->vm_start, 0); ++ if (!err) ++ err = h_file->f_op->mmap(h_file, vma); ++ if (unlikely(err)) ++ goto out_reset; ++ ++ au_vm_prfile_set(vma, file); ++ /* update without lock, I don't think it a problem */ ++ fsstack_copy_attr_atime(file->f_dentry->d_inode, ++ h_file->f_dentry->d_inode); ++ goto out_fput; /* success */ ++ ++out_reset: ++ au_unset_mmapped(file); ++ au_vm_file_reset(vma, file); ++out_fput: ++ fput(h_file); ++ lockdep_off(); ++out: ++ si_read_unlock(sb); ++ lockdep_on(); ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_fsync_nondir(struct file *file, loff_t start, loff_t end, ++ int datasync) ++{ ++ int err; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct file *h_file; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++ sb = dentry->d_sb; ++ mutex_lock(&inode->i_mutex); ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out; ++ ++ err = 0; /* -EBADF; */ /* posix? 
*/ ++ if (unlikely(!(file->f_mode & FMODE_WRITE))) ++ goto out_si; ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out_si; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ au_unpin(&pin); ++ ++ err = -EINVAL; ++ h_file = au_hf_top(file); ++ err = vfsub_fsync(h_file, &h_file->f_path, datasync); ++ au_cpup_attr_timesizes(inode); ++ ++out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++out_si: ++ si_read_unlock(sb); ++out: ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++ ++/* no one supports this operation, currently */ ++#if 0 ++static int aufs_aio_fsync_nondir(struct kiocb *kio, int datasync) ++{ ++ int err; ++ struct au_pin pin; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct file *file, *h_file; ++ ++ file = kio->ki_filp; ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++ au_mtx_and_read_lock(inode); ++ ++ err = 0; /* -EBADF; */ /* posix? */ ++ if (unlikely(!(file->f_mode & FMODE_WRITE))) ++ goto out; ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_ready_to_write(file, -1, &pin); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ au_unpin(&pin); ++ ++ err = -ENOSYS; ++ h_file = au_hf_top(file); ++ if (h_file->f_op && h_file->f_op->aio_fsync) { ++ struct dentry *h_d; ++ struct mutex *h_mtx; ++ ++ h_d = h_file->f_dentry; ++ h_mtx = &h_d->d_inode->i_mutex; ++ if (!is_sync_kiocb(kio)) { ++ get_file(h_file); ++ fput(file); ++ } ++ kio->ki_filp = h_file; ++ err = h_file->f_op->aio_fsync(kio, datasync); ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ if (!err) ++ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); ++ /*ignore*/ ++ au_cpup_attr_timesizes(inode); ++ mutex_unlock(h_mtx); ++ } ++ ++out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++out: ++ si_read_unlock(inode->sb); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++#endif ++ ++static int aufs_fasync(int fd, struct file *file, int flag) ++{ ++ int err; ++ struct file *h_file; ++ struct dentry *dentry; ++ struct super_block *sb; ++ ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ h_file = au_hf_top(file); ++ if (h_file->f_op && h_file->f_op->fasync) ++ err = h_file->f_op->fasync(fd, h_file, flag); ++ ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ ++out: ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* no one supports this operation, currently */ ++#if 0 ++static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset, ++ size_t len, loff_t *pos , int more) ++{ ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++const struct file_operations aufs_file_fop = { ++ .owner = THIS_MODULE, ++ ++ .llseek = default_llseek, ++ ++ .read = aufs_read, ++ .write = aufs_write, ++ .aio_read = aufs_aio_read, ++ .aio_write = aufs_aio_write, ++#ifdef CONFIG_AUFS_POLL ++ .poll = aufs_poll, ++#endif ++ .unlocked_ioctl = aufs_ioctl_nondir, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = aufs_ioctl_nondir, /* same */ ++#endif ++ .mmap = aufs_mmap, ++ .open = aufs_open_nondir, ++ .flush = aufs_flush_nondir, ++ .release = 
aufs_release_nondir, ++ .fsync = aufs_fsync_nondir, ++ /* .aio_fsync = aufs_aio_fsync_nondir, */ ++ .fasync = aufs_fasync, ++ /* .sendpage = aufs_sendpage, */ ++ .splice_write = aufs_splice_write, ++ .splice_read = aufs_splice_read, ++#if 0 ++ .aio_splice_write = aufs_aio_splice_write, ++ .aio_splice_read = aufs_aio_splice_read ++#endif ++}; +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/f_op_sp.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,298 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * file operations for special files. ++ * while they exist in aufs virtually, ++ * their file I/O is handled out of aufs. ++ */ ++ ++#include "aufs.h" ++ ++static ssize_t aufs_aio_read_sp(struct kiocb *kio, const struct iovec *iov, ++ unsigned long nv, loff_t pos) ++{ ++ ssize_t err; ++ aufs_bindex_t bstart; ++ unsigned char wbr; ++ struct file *file, *h_file; ++ struct super_block *sb; ++ ++ file = kio->ki_filp; ++ sb = file->f_dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ fi_read_lock(file); ++ bstart = au_fbstart(file); ++ h_file = au_hf_top(file); ++ fi_read_unlock(file); ++ wbr = !!au_br_writable(au_sbr(sb, bstart)->br_perm); ++ si_read_unlock(sb); ++ ++ /* do not change the file in kio */ ++ AuDebugOn(!h_file->f_op || !h_file->f_op->aio_read); ++ err = h_file->f_op->aio_read(kio, iov, nv, pos); ++ if (err > 0 && wbr) ++ file_accessed(h_file); ++ ++ return err; ++} ++ ++static ssize_t aufs_aio_write_sp(struct kiocb *kio, const struct iovec *iov, ++ unsigned long nv, loff_t pos) ++{ ++ ssize_t err; ++ aufs_bindex_t bstart; ++ unsigned char wbr; ++ struct super_block *sb; ++ struct file *file, *h_file; ++ ++ file = kio->ki_filp; ++ sb = file->f_dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ fi_read_lock(file); ++ bstart = au_fbstart(file); ++ h_file = au_hf_top(file); ++ fi_read_unlock(file); ++ wbr = !!au_br_writable(au_sbr(sb, bstart)->br_perm); ++ si_read_unlock(sb); ++ ++ /* do not change the file in kio */ ++ AuDebugOn(!h_file->f_op || !h_file->f_op->aio_write); ++ err = h_file->f_op->aio_write(kio, iov, nv, pos); ++ if (err > 0 && wbr) ++ file_update_time(h_file); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int aufs_release_sp(struct inode *inode, struct file *file) ++{ ++ int err; ++ struct file *h_file; ++ ++ fi_read_lock(file); ++ h_file = au_hf_top(file); ++ fi_read_unlock(file); ++ /* close this fifo in aufs */ ++ err = h_file->f_op->release(inode, file); /* ignore */ ++ aufs_release_nondir(inode, file); /* ignore */ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* currently, support only FIFO */ ++enum { ++ AuSp_FIFO, AuSp_FIFO_R, AuSp_FIFO_W, AuSp_FIFO_RW, ++ /* 
AuSp_SOCK, AuSp_CHR, AuSp_BLK, */ ++ AuSp_Last ++}; ++static int aufs_open_sp(struct inode *inode, struct file *file); ++static struct au_sp_fop { ++ int done; ++ struct file_operations fop; /* not 'const' */ ++ spinlock_t spin; ++} au_sp_fop[AuSp_Last] = { ++ [AuSp_FIFO] = { ++ .fop = { ++ .owner = THIS_MODULE, ++ .open = aufs_open_sp ++ } ++ } ++}; ++ ++static void au_init_fop_sp(struct file *file) ++{ ++ struct au_sp_fop *p; ++ int i; ++ struct file *h_file; ++ ++ p = au_sp_fop; ++ if (unlikely(!p->done)) { ++ /* initialize first time only */ ++ static DEFINE_SPINLOCK(spin); ++ ++ spin_lock(&spin); ++ if (!p->done) { ++ BUILD_BUG_ON(sizeof(au_sp_fop)/sizeof(*au_sp_fop) ++ != AuSp_Last); ++ for (i = 0; i < AuSp_Last; i++) ++ spin_lock_init(&p[i].spin); ++ p->done = 1; ++ } ++ spin_unlock(&spin); ++ } ++ ++ switch (file->f_mode & (FMODE_READ | FMODE_WRITE)) { ++ case FMODE_READ: ++ i = AuSp_FIFO_R; ++ break; ++ case FMODE_WRITE: ++ i = AuSp_FIFO_W; ++ break; ++ case FMODE_READ | FMODE_WRITE: ++ i = AuSp_FIFO_RW; ++ break; ++ default: ++ BUG(); ++ } ++ ++ p += i; ++ if (unlikely(!p->done)) { ++ /* initialize first time only */ ++ h_file = au_hf_top(file); ++ spin_lock(&p->spin); ++ if (!p->done) { ++ p->fop = *h_file->f_op; ++ p->fop.owner = THIS_MODULE; ++ if (p->fop.aio_read) ++ p->fop.aio_read = aufs_aio_read_sp; ++ if (p->fop.aio_write) ++ p->fop.aio_write = aufs_aio_write_sp; ++ p->fop.release = aufs_release_sp; ++ p->done = 1; ++ } ++ spin_unlock(&p->spin); ++ } ++ file->f_op = &p->fop; ++} ++ ++static int au_cpup_sp(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bcpup; ++ struct au_pin pin; ++ struct au_wr_dir_args wr_dir_args = { ++ .force_btgt = -1, ++ .flags = 0 ++ }; ++ ++ AuDbg("%.*s\n", AuDLNPair(dentry)); ++ ++ di_read_unlock(dentry, AuLock_IR); ++ di_write_lock_child(dentry); ++ err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); ++ if (unlikely(err < 0)) ++ goto out; ++ bcpup = err; ++ err = 0; ++ if (bcpup == au_dbstart(dentry)) ++ goto out; /* success */ ++ ++ err = au_pin(&pin, dentry, bcpup, au_opt_udba(dentry->d_sb), ++ AuPin_MNT_WRITE); ++ if (!err) { ++ err = au_sio_cpup_simple(dentry, bcpup, -1, AuCpup_DTIME); ++ au_unpin(&pin); ++ } ++ ++out: ++ di_downgrade_lock(dentry, AuLock_IR); ++ return err; ++} ++ ++static int au_do_open_sp(struct file *file, int flags) ++{ ++ int err; ++ struct dentry *dentry; ++ struct super_block *sb; ++ struct file *h_file; ++ struct inode *h_inode; ++ ++ dentry = file->f_dentry; ++ AuDbg("%.*s\n", AuDLNPair(dentry)); ++ ++ /* ++ * try copying-up. ++ * operate on the ro branch is not an error. 
++ */ ++ au_cpup_sp(dentry); /* ignore */ ++ ++ /* prepare h_file */ ++ err = au_do_open_nondir(file, vfsub_file_flags(file)); ++ if (unlikely(err)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ h_file = au_hf_top(file); ++ h_inode = h_file->f_dentry->d_inode; ++ di_read_unlock(dentry, AuLock_IR); ++ fi_write_unlock(file); ++ si_read_unlock(sb); ++ /* open this fifo in aufs */ ++ err = h_inode->i_fop->open(file->f_dentry->d_inode, file); ++ si_noflush_read_lock(sb); ++ fi_write_lock(file); ++ di_read_lock_child(dentry, AuLock_IR); ++ if (!err) ++ au_init_fop_sp(file); ++ ++out: ++ return err; ++} ++ ++static int aufs_open_sp(struct inode *inode, struct file *file) ++{ ++ int err; ++ struct super_block *sb; ++ ++ sb = file->f_dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ err = au_do_open(file, au_do_open_sp, /*fidir*/NULL); ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_init_special_fop(struct inode *inode, umode_t mode, dev_t rdev) ++{ ++ init_special_inode(inode, mode, rdev); ++ ++ switch (mode & S_IFMT) { ++ case S_IFIFO: ++ inode->i_fop = &au_sp_fop[AuSp_FIFO].fop; ++ /*FALLTHROUGH*/ ++ case S_IFCHR: ++ case S_IFBLK: ++ case S_IFSOCK: ++ break; ++ default: ++ AuDebugOn(1); ++ } ++} ++ ++int au_special_file(umode_t mode) ++{ ++ int ret; ++ ++ ret = 0; ++ switch (mode & S_IFMT) { ++ case S_IFIFO: ++#if 0 ++ case S_IFCHR: ++ case S_IFBLK: ++ case S_IFSOCK: ++#endif ++ ret = 1; ++ } ++ ++ return ret; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/fstype.h 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,496 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/*
++ * judging filesystem type
++ */
++
++#ifndef __AUFS_FSTYPE_H__
++#define __AUFS_FSTYPE_H__
++
++#ifdef __KERNEL__
++
++#include
++#include
++#include
++
++static inline int au_test_aufs(struct super_block *sb)
++{
++ return sb->s_magic == AUFS_SUPER_MAGIC;
++}
++
++static inline const char *au_sbtype(struct super_block *sb)
++{
++ return sb->s_type->name;
++}
++
++static inline int au_test_iso9660(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_ISO9660_FS) || defined(CONFIG_ISO9660_FS_MODULE)
++ return sb->s_magic == ISOFS_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_romfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_ROMFS_FS) || defined(CONFIG_ROMFS_FS_MODULE)
++ return sb->s_magic == ROMFS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_cramfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_CRAMFS) || defined(CONFIG_CRAMFS_MODULE)
++ return sb->s_magic == CRAMFS_MAGIC;
++#endif
++ return 0;
++}
++
++static inline int au_test_nfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE)
++ return sb->s_magic == NFS_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_fuse(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE)
++ return sb->s_magic == FUSE_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_xfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_XFS_FS) || defined(CONFIG_XFS_FS_MODULE)
++ return sb->s_magic == XFS_SB_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_tmpfs(struct super_block *sb __maybe_unused)
++{
++#ifdef CONFIG_TMPFS
++ return sb->s_magic == TMPFS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_ecryptfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_ECRYPT_FS) || defined(CONFIG_ECRYPT_FS_MODULE)
++ return !strcmp(au_sbtype(sb), "ecryptfs");
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_smbfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_SMB_FS) || defined(CONFIG_SMB_FS_MODULE)
++ return sb->s_magic == SMB_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_ocfs2(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_OCFS2_FS) || defined(CONFIG_OCFS2_FS_MODULE)
++ return sb->s_magic == OCFS2_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_ocfs2_dlmfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_OCFS2_FS_O2CB) || defined(CONFIG_OCFS2_FS_O2CB_MODULE)
++ return sb->s_magic == DLMFS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_coda(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_CODA_FS) || defined(CONFIG_CODA_FS_MODULE)
++ return sb->s_magic == CODA_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_v9fs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_9P_FS) || defined(CONFIG_9P_FS_MODULE)
++ return sb->s_magic == V9FS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_ext4(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_EXT4DEV_FS) || defined(CONFIG_EXT4DEV_FS_MODULE)
++ return sb->s_magic == EXT4_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_sysv(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_SYSV_FS) || defined(CONFIG_SYSV_FS_MODULE)
++ return !strcmp(au_sbtype(sb), "sysv");
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_ramfs(struct super_block *sb)
++{
++ return sb->s_magic == RAMFS_MAGIC;
++}
++
++static inline int au_test_ubifs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_UBIFS_FS) || defined(CONFIG_UBIFS_FS_MODULE)
++ return sb->s_magic == UBIFS_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_procfs(struct super_block *sb __maybe_unused)
++{
++#ifdef CONFIG_PROC_FS
++ return sb->s_magic == PROC_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_sysfs(struct super_block *sb __maybe_unused)
++{
++#ifdef CONFIG_SYSFS
++ return sb->s_magic == SYSFS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_configfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_CONFIGFS_FS) || defined(CONFIG_CONFIGFS_FS_MODULE)
++ return sb->s_magic == CONFIGFS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_minix(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_MINIX_FS) || defined(CONFIG_MINIX_FS_MODULE)
++ return sb->s_magic == MINIX3_SUPER_MAGIC
++ || sb->s_magic == MINIX2_SUPER_MAGIC
++ || sb->s_magic == MINIX2_SUPER_MAGIC2
++ || sb->s_magic == MINIX_SUPER_MAGIC
++ || sb->s_magic == MINIX_SUPER_MAGIC2;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_cifs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_CIFS_FS) || defined(CONFIG_CIFS_FS_MODULE)
++ return sb->s_magic == CIFS_MAGIC_NUMBER;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_fat(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_FAT_FS) || defined(CONFIG_FAT_FS_MODULE)
++ return sb->s_magic == MSDOS_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_msdos(struct super_block *sb)
++{
++ return au_test_fat(sb);
++}
++
++static inline int au_test_vfat(struct super_block *sb)
++{
++ return au_test_fat(sb);
++}
++
++static inline int au_test_securityfs(struct super_block *sb __maybe_unused)
++{
++#ifdef CONFIG_SECURITYFS
++ return sb->s_magic == SECURITYFS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_squashfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_SQUASHFS) || defined(CONFIG_SQUASHFS_MODULE)
++ return sb->s_magic == SQUASHFS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_btrfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_BTRFS_FS) || defined(CONFIG_BTRFS_FS_MODULE)
++ return sb->s_magic == BTRFS_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_xenfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_XENFS) || defined(CONFIG_XENFS_MODULE)
++ return sb->s_magic == XENFS_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_debugfs(struct super_block *sb __maybe_unused)
++{
++#ifdef CONFIG_DEBUG_FS
++ return sb->s_magic == DEBUGFS_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int au_test_nilfs(struct super_block *sb __maybe_unused)
++{
++#if defined(CONFIG_NILFS) || defined(CONFIG_NILFS_MODULE)
++ return sb->s_magic == NILFS_SUPER_MAGIC;
++#else
++ return 0;
++#endif
++}
++
++static inline int
au_test_hfsplus(struct super_block *sb __maybe_unused) ++{ ++#if defined(CONFIG_HFSPLUS_FS) || defined(CONFIG_HFSPLUS_FS_MODULE) ++ return sb->s_magic == HFSPLUS_SUPER_MAGIC; ++#else ++ return 0; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * they can't be an aufs branch. ++ */ ++static inline int au_test_fs_unsuppoted(struct super_block *sb) ++{ ++ return ++#ifndef CONFIG_AUFS_BR_RAMFS ++ au_test_ramfs(sb) || ++#endif ++ au_test_procfs(sb) ++ || au_test_sysfs(sb) ++ || au_test_configfs(sb) ++ || au_test_debugfs(sb) ++ || au_test_securityfs(sb) ++ || au_test_xenfs(sb) ++ || au_test_ecryptfs(sb) ++ /* || !strcmp(au_sbtype(sb), "unionfs") */ ++ || au_test_aufs(sb); /* will be supported in next version */ ++} ++ ++/* ++ * If the filesystem supports NFS-export, then it has to support NULL as ++ * a nameidata parameter for ->create(), ->lookup() and ->d_revalidate(). ++ * We can apply this principle when we handle a lower filesystem. ++ */ ++static inline int au_test_fs_null_nd(struct super_block *sb) ++{ ++ return !!sb->s_export_op; ++} ++ ++static inline int au_test_fs_remote(struct super_block *sb) ++{ ++ return !au_test_tmpfs(sb) ++#ifdef CONFIG_AUFS_BR_RAMFS ++ && !au_test_ramfs(sb) ++#endif ++ && !(sb->s_type->fs_flags & FS_REQUIRES_DEV); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * Note: these functions (below) are created after reading ->getattr() in all ++ * filesystems under linux/fs. it means we have to do so in every update... ++ */ ++ ++/* ++ * some filesystems require getattr to refresh the inode attributes before ++ * referencing. ++ * in most cases, we can rely on the inode attribute in NFS (or every remote fs) ++ * and leave the work for d_revalidate() ++ */ ++static inline int au_test_fs_refresh_iattr(struct super_block *sb) ++{ ++ return au_test_nfs(sb) ++ || au_test_fuse(sb) ++ /* || au_test_smbfs(sb) */ /* untested */ ++ /* || au_test_ocfs2(sb) */ /* untested */ ++ /* || au_test_btrfs(sb) */ /* untested */ ++ /* || au_test_coda(sb) */ /* untested */ ++ /* || au_test_v9fs(sb) */ /* untested */ ++ ; ++} ++ ++/* ++ * filesystems which don't maintain i_size or i_blocks. ++ */ ++static inline int au_test_fs_bad_iattr_size(struct super_block *sb) ++{ ++ return au_test_xfs(sb) ++ || au_test_btrfs(sb) ++ || au_test_ubifs(sb) ++ || au_test_hfsplus(sb) /* maintained, but incorrect */ ++ /* || au_test_ext4(sb) */ /* untested */ ++ /* || au_test_ocfs2(sb) */ /* untested */ ++ /* || au_test_ocfs2_dlmfs(sb) */ /* untested */ ++ /* || au_test_sysv(sb) */ /* untested */ ++ /* || au_test_minix(sb) */ /* untested */ ++ ; ++} ++ ++/* ++ * filesystems which don't store the correct value in some of their inode ++ * attributes. ++ */ ++static inline int au_test_fs_bad_iattr(struct super_block *sb) ++{ ++ return au_test_fs_bad_iattr_size(sb) ++ /* || au_test_cifs(sb) */ /* untested */ ++ || au_test_fat(sb) ++ || au_test_msdos(sb) ++ || au_test_vfat(sb); ++} ++ ++/* they don't check i_nlink in link(2) */ ++static inline int au_test_fs_no_limit_nlink(struct super_block *sb) ++{ ++ return au_test_tmpfs(sb) ++#ifdef CONFIG_AUFS_BR_RAMFS ++ || au_test_ramfs(sb) ++#endif ++ || au_test_ubifs(sb) ++ || au_test_btrfs(sb) ++ || au_test_hfsplus(sb); ++} ++ ++/* ++ * filesystems which sets S_NOATIME and S_NOCMTIME. 
++ */ ++static inline int au_test_fs_notime(struct super_block *sb) ++{ ++ return au_test_nfs(sb) ++ || au_test_fuse(sb) ++ || au_test_ubifs(sb) ++ /* || au_test_cifs(sb) */ /* untested */ ++ ; ++} ++ ++/* ++ * filesystems which requires replacing i_mapping. ++ */ ++static inline int au_test_fs_bad_mapping(struct super_block *sb) ++{ ++ return au_test_fuse(sb) ++ || au_test_ubifs(sb); ++} ++ ++/* temporary support for i#1 in cramfs */ ++static inline int au_test_fs_unique_ino(struct inode *inode) ++{ ++ if (au_test_cramfs(inode->i_sb)) ++ return inode->i_ino != 1; ++ return 1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * the filesystem where the xino files placed must support i/o after unlink and ++ * maintain i_size and i_blocks. ++ */ ++static inline int au_test_fs_bad_xino(struct super_block *sb) ++{ ++ return au_test_fs_remote(sb) ++ || au_test_fs_bad_iattr_size(sb) ++#ifdef CONFIG_AUFS_BR_RAMFS ++ || !(au_test_ramfs(sb) || au_test_fs_null_nd(sb)) ++#else ++ || !au_test_fs_null_nd(sb) /* to keep xino code simple */ ++#endif ++ /* don't want unnecessary work for xino */ ++ || au_test_aufs(sb) ++ || au_test_ecryptfs(sb) ++ || au_test_nilfs(sb); ++} ++ ++static inline int au_test_fs_trunc_xino(struct super_block *sb) ++{ ++ return au_test_tmpfs(sb) ++ || au_test_ramfs(sb); ++} ++ ++/* ++ * test if the @sb is real-readonly. ++ */ ++static inline int au_test_fs_rr(struct super_block *sb) ++{ ++ return au_test_squashfs(sb) ++ || au_test_iso9660(sb) ++ || au_test_cramfs(sb) ++ || au_test_romfs(sb); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_FSTYPE_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/hfsnotify.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,260 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * fsnotify for the lower directories ++ */ ++ ++#include "aufs.h" ++ ++/* FS_IN_IGNORED is unnecessary */ ++static const __u32 AuHfsnMask = (FS_MOVED_TO | FS_MOVED_FROM | FS_DELETE ++ | FS_CREATE | FS_EVENT_ON_CHILD); ++static DECLARE_WAIT_QUEUE_HEAD(au_hfsn_wq); ++static __cacheline_aligned_in_smp atomic64_t au_hfsn_ifree = ATOMIC64_INIT(0); ++ ++static void au_hfsn_free_mark(struct fsnotify_mark *mark) ++{ ++ struct au_hnotify *hn = container_of(mark, struct au_hnotify, ++ hn_mark); ++ AuDbg("here\n"); ++ au_cache_free_hnotify(hn); ++ smp_mb__before_atomic_dec(); ++ atomic64_dec(&au_hfsn_ifree); ++ wake_up(&au_hfsn_wq); ++} ++ ++static int au_hfsn_alloc(struct au_hinode *hinode) ++{ ++ struct au_hnotify *hn; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct fsnotify_mark *mark; ++ aufs_bindex_t bindex; ++ ++ hn = hinode->hi_notify; ++ sb = hn->hn_aufs_inode->i_sb; ++ bindex = au_br_index(sb, hinode->hi_id); ++ br = au_sbr(sb, bindex); ++ mark = &hn->hn_mark; ++ fsnotify_init_mark(mark, au_hfsn_free_mark); ++ mark->mask = AuHfsnMask; ++ /* ++ * by udba rename or rmdir, aufs assign a new inode to the known ++ * h_inode, so specify 1 to allow dups. ++ */ ++ return fsnotify_add_mark(mark, br->br_hfsn_group, hinode->hi_inode, ++ /*mnt*/NULL, /*allow_dups*/1); ++} ++ ++static int au_hfsn_free(struct au_hinode *hinode, struct au_hnotify *hn) ++{ ++ struct fsnotify_mark *mark; ++ unsigned long long ull; ++ ++ ull = atomic64_inc_return(&au_hfsn_ifree); ++ BUG_ON(!ull); ++ ++ mark = &hn->hn_mark; ++ fsnotify_destroy_mark(mark); ++ fsnotify_put_mark(mark); ++ ++ /* free hn by myself */ ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_hfsn_ctl(struct au_hinode *hinode, int do_set) ++{ ++ struct fsnotify_mark *mark; ++ ++ mark = &hinode->hi_notify->hn_mark; ++ spin_lock(&mark->lock); ++ if (do_set) { ++ AuDebugOn(mark->mask & AuHfsnMask); ++ mark->mask |= AuHfsnMask; ++ } else { ++ AuDebugOn(!(mark->mask & AuHfsnMask)); ++ mark->mask &= ~AuHfsnMask; ++ } ++ spin_unlock(&mark->lock); ++ /* fsnotify_recalc_inode_mask(hinode->hi_inode); */ ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* #define AuDbgHnotify */ ++#ifdef AuDbgHnotify ++static char *au_hfsn_name(u32 mask) ++{ ++#ifdef CONFIG_AUFS_DEBUG ++#define test_ret(flag) if (mask & flag) \ ++ return #flag; ++ test_ret(FS_ACCESS); ++ test_ret(FS_MODIFY); ++ test_ret(FS_ATTRIB); ++ test_ret(FS_CLOSE_WRITE); ++ test_ret(FS_CLOSE_NOWRITE); ++ test_ret(FS_OPEN); ++ test_ret(FS_MOVED_FROM); ++ test_ret(FS_MOVED_TO); ++ test_ret(FS_CREATE); ++ test_ret(FS_DELETE); ++ test_ret(FS_DELETE_SELF); ++ test_ret(FS_MOVE_SELF); ++ test_ret(FS_UNMOUNT); ++ test_ret(FS_Q_OVERFLOW); ++ test_ret(FS_IN_IGNORED); ++ test_ret(FS_IN_ISDIR); ++ test_ret(FS_IN_ONESHOT); ++ test_ret(FS_EVENT_ON_CHILD); ++ return ""; ++#undef test_ret ++#else ++ return "??"; ++#endif ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_hfsn_handle_event(struct fsnotify_group *group, ++ struct fsnotify_mark *inode_mark, ++ struct fsnotify_mark *vfsmount_mark, ++ struct fsnotify_event *event) ++{ ++ int err; ++ struct au_hnotify *hnotify; ++ struct inode *h_dir, *h_inode; ++ 
__u32 mask; ++ struct qstr h_child_qstr = { ++ .name = event->file_name, ++ .len = event->name_len ++ }; ++ ++ AuDebugOn(event->data_type != FSNOTIFY_EVENT_INODE); ++ ++ err = 0; ++ /* if FS_UNMOUNT happens, there must be another bug */ ++ mask = event->mask; ++ AuDebugOn(mask & FS_UNMOUNT); ++ if (mask & (FS_IN_IGNORED | FS_UNMOUNT)) ++ goto out; ++ ++ h_dir = event->to_tell; ++ h_inode = event->inode; ++#ifdef AuDbgHnotify ++ au_debug(1); ++ if (1 || h_child_qstr.len != sizeof(AUFS_XINO_FNAME) - 1 ++ || strncmp(h_child_qstr.name, AUFS_XINO_FNAME, h_child_qstr.len)) { ++ AuDbg("i%lu, mask 0x%x %s, hcname %.*s, hi%lu\n", ++ h_dir->i_ino, mask, au_hfsn_name(mask), ++ AuLNPair(&h_child_qstr), h_inode ? h_inode->i_ino : 0); ++ /* WARN_ON(1); */ ++ } ++ au_debug(0); ++#endif ++ ++ AuDebugOn(!inode_mark); ++ hnotify = container_of(inode_mark, struct au_hnotify, hn_mark); ++ err = au_hnotify(h_dir, hnotify, mask, &h_child_qstr, h_inode); ++ ++out: ++ return err; ++} ++ ++/* isn't it waste to ask every registered 'group'? */ ++/* copied from linux/fs/notify/inotify/inotify_fsnotiry.c */ ++/* it should be exported to modules */ ++static bool au_hfsn_should_send_event(struct fsnotify_group *group, ++ struct inode *h_inode, ++ struct fsnotify_mark *inode_mark, ++ struct fsnotify_mark *vfsmount_mark, ++ __u32 mask, void *data, int data_type) ++{ ++ mask = (mask & ~FS_EVENT_ON_CHILD); ++ return inode_mark->mask & mask; ++} ++ ++static struct fsnotify_ops au_hfsn_ops = { ++ .should_send_event = au_hfsn_should_send_event, ++ .handle_event = au_hfsn_handle_event ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_hfsn_fin_br(struct au_branch *br) ++{ ++ if (br->br_hfsn_group) ++ fsnotify_put_group(br->br_hfsn_group); ++} ++ ++static int au_hfsn_init_br(struct au_branch *br, int perm) ++{ ++ br->br_hfsn_group = NULL; ++ br->br_hfsn_ops = au_hfsn_ops; ++ return 0; ++} ++ ++static int au_hfsn_reset_br(unsigned int udba, struct au_branch *br, int perm) ++{ ++ int err; ++ ++ err = 0; ++ if (udba != AuOpt_UDBA_HNOTIFY ++ || !au_br_hnotifyable(perm)) { ++ au_hfsn_fin_br(br); ++ br->br_hfsn_group = NULL; ++ goto out; ++ } ++ ++ if (br->br_hfsn_group) ++ goto out; ++ ++ br->br_hfsn_group = fsnotify_alloc_group(&br->br_hfsn_ops); ++ if (IS_ERR(br->br_hfsn_group)) { ++ err = PTR_ERR(br->br_hfsn_group); ++ pr_err("fsnotify_alloc_group() failed, %d\n", err); ++ br->br_hfsn_group = NULL; ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_hfsn_fin(void) ++{ ++ AuDbg("au_hfsn_ifree %lld\n", (long long)atomic64_read(&au_hfsn_ifree)); ++ wait_event(au_hfsn_wq, !atomic64_read(&au_hfsn_ifree)); ++} ++ ++const struct au_hnotify_op au_hnotify_op = { ++ .ctl = au_hfsn_ctl, ++ .alloc = au_hfsn_alloc, ++ .free = au_hfsn_free, ++ ++ .fin = au_hfsn_fin, ++ ++ .reset_br = au_hfsn_reset_br, ++ .fin_br = au_hfsn_fin_br, ++ .init_br = au_hfsn_init_br ++}; +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/hfsplus.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,57 @@ ++/* ++ * Copyright (C) 2010-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * special support for filesystems which aqucires an inode mutex ++ * at final closing a file, eg, hfsplus. ++ * ++ * This trick is very simple and stupid, just to open the file before really ++ * neceeary open to tell hfsplus that this is not the final closing. ++ * The caller should call au_h_open_pre() after acquiring the inode mutex, ++ * and au_h_open_post() after releasing it. ++ */ ++ ++#include "aufs.h" ++ ++struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ struct file *h_file; ++ struct dentry *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ AuDebugOn(!h_dentry); ++ AuDebugOn(!h_dentry->d_inode); ++ IMustLock(h_dentry->d_inode); ++ ++ h_file = NULL; ++ if (au_test_hfsplus(h_dentry->d_sb) ++ && S_ISREG(h_dentry->d_inode->i_mode)) ++ h_file = au_h_open(dentry, bindex, ++ O_RDONLY | O_NOATIME | O_LARGEFILE, ++ /*file*/NULL); ++ return h_file; ++} ++ ++void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex, ++ struct file *h_file) ++{ ++ if (h_file) { ++ fput(h_file); ++ au_sbr_put(dentry->d_sb, bindex); ++ } ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/hnotify.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,712 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * abstraction to notify the direct changes on lower directories ++ */ ++ ++#include "aufs.h" ++ ++int au_hn_alloc(struct au_hinode *hinode, struct inode *inode) ++{ ++ int err; ++ struct au_hnotify *hn; ++ ++ err = -ENOMEM; ++ hn = au_cache_alloc_hnotify(); ++ if (hn) { ++ hn->hn_aufs_inode = inode; ++ hinode->hi_notify = hn; ++ err = au_hnotify_op.alloc(hinode); ++ AuTraceErr(err); ++ if (unlikely(err)) { ++ hinode->hi_notify = NULL; ++ au_cache_free_hnotify(hn); ++ /* ++ * The upper dir was removed by udba, but the same named ++ * dir left. In this case, aufs assignes a new inode ++ * number and set the monitor again. ++ * For the lower dir, the old monitnor is still left. 
++ */ ++ if (err == -EEXIST) ++ err = 0; ++ } ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++void au_hn_free(struct au_hinode *hinode) ++{ ++ struct au_hnotify *hn; ++ ++ hn = hinode->hi_notify; ++ if (hn) { ++ hinode->hi_notify = NULL; ++ if (au_hnotify_op.free(hinode, hn)) ++ au_cache_free_hnotify(hn); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_hn_ctl(struct au_hinode *hinode, int do_set) ++{ ++ if (hinode->hi_notify) ++ au_hnotify_op.ctl(hinode, do_set); ++} ++ ++void au_hn_reset(struct inode *inode, unsigned int flags) ++{ ++ aufs_bindex_t bindex, bend; ++ struct inode *hi; ++ struct dentry *iwhdentry; ++ ++ bend = au_ibend(inode); ++ for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { ++ hi = au_h_iptr(inode, bindex); ++ if (!hi) ++ continue; ++ ++ /* mutex_lock_nested(&hi->i_mutex, AuLsc_I_CHILD); */ ++ iwhdentry = au_hi_wh(inode, bindex); ++ if (iwhdentry) ++ dget(iwhdentry); ++ au_igrab(hi); ++ au_set_h_iptr(inode, bindex, NULL, 0); ++ au_set_h_iptr(inode, bindex, au_igrab(hi), ++ flags & ~AuHi_XINO); ++ iput(hi); ++ dput(iwhdentry); ++ /* mutex_unlock(&hi->i_mutex); */ ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int hn_xino(struct inode *inode, struct inode *h_inode) ++{ ++ int err; ++ aufs_bindex_t bindex, bend, bfound, bstart; ++ struct inode *h_i; ++ ++ err = 0; ++ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { ++ pr_warning("branch root dir was changed\n"); ++ goto out; ++ } ++ ++ bfound = -1; ++ bend = au_ibend(inode); ++ bstart = au_ibstart(inode); ++#if 0 /* reserved for future use */ ++ if (bindex == bend) { ++ /* keep this ino in rename case */ ++ goto out; ++ } ++#endif ++ for (bindex = bstart; bindex <= bend; bindex++) ++ if (au_h_iptr(inode, bindex) == h_inode) { ++ bfound = bindex; ++ break; ++ } ++ if (bfound < 0) ++ goto out; ++ ++ for (bindex = bstart; bindex <= bend; bindex++) { ++ h_i = au_h_iptr(inode, bindex); ++ if (!h_i) ++ continue; ++ ++ err = au_xino_write(inode->i_sb, bindex, h_i->i_ino, /*ino*/0); ++ /* ignore this error */ ++ /* bad action? */ ++ } ++ ++ /* children inode number will be broken */ ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int hn_gen_tree(struct dentry *dentry) ++{ ++ int err, i, j, ndentry; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, dentry, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) { ++ struct dentry *d; ++ ++ d = dentries[j]; ++ if (IS_ROOT(d)) ++ continue; ++ ++ au_digen_dec(d); ++ if (d->d_inode) ++ /* todo: reset children xino? ++ cached children only? */ ++ au_iigen_dec(d->d_inode); ++ } ++ } ++ ++out_dpages: ++ au_dpages_free(&dpages); ++ ++#if 0 ++ /* discard children */ ++ dentry_unhash(dentry); ++ dput(dentry); ++#endif ++out: ++ return err; ++} ++ ++/* ++ * return 0 if processed. 
++ */ ++static int hn_gen_by_inode(char *name, unsigned int nlen, struct inode *inode, ++ const unsigned int isdir) ++{ ++ int err; ++ struct dentry *d; ++ struct qstr *dname; ++ ++ err = 1; ++ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { ++ pr_warning("branch root dir was changed\n"); ++ err = 0; ++ goto out; ++ } ++ ++ if (!isdir) { ++ AuDebugOn(!name); ++ au_iigen_dec(inode); ++ spin_lock(&inode->i_lock); ++ list_for_each_entry(d, &inode->i_dentry, d_alias) { ++ spin_lock(&d->d_lock); ++ dname = &d->d_name; ++ if (dname->len != nlen ++ && memcmp(dname->name, name, nlen)) { ++ spin_unlock(&d->d_lock); ++ continue; ++ } ++ err = 0; ++ au_digen_dec(d); ++ spin_unlock(&d->d_lock); ++ break; ++ } ++ spin_unlock(&inode->i_lock); ++ } else { ++ au_fset_si(au_sbi(inode->i_sb), FAILED_REFRESH_DIR); ++ d = d_find_alias(inode); ++ if (!d) { ++ au_iigen_dec(inode); ++ goto out; ++ } ++ ++ spin_lock(&d->d_lock); ++ dname = &d->d_name; ++ if (dname->len == nlen && !memcmp(dname->name, name, nlen)) { ++ spin_unlock(&d->d_lock); ++ err = hn_gen_tree(d); ++ spin_lock(&d->d_lock); ++ } ++ spin_unlock(&d->d_lock); ++ dput(d); ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int hn_gen_by_name(struct dentry *dentry, const unsigned int isdir) ++{ ++ int err; ++ struct inode *inode; ++ ++ inode = dentry->d_inode; ++ if (IS_ROOT(dentry) ++ /* || (inode && inode->i_ino == AUFS_ROOT_INO) */ ++ ) { ++ pr_warning("branch root dir was changed\n"); ++ return 0; ++ } ++ ++ err = 0; ++ if (!isdir) { ++ au_digen_dec(dentry); ++ if (inode) ++ au_iigen_dec(inode); ++ } else { ++ au_fset_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR); ++ if (inode) ++ err = hn_gen_tree(dentry); ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* hnotify job flags */ ++#define AuHnJob_XINO0 1 ++#define AuHnJob_GEN (1 << 1) ++#define AuHnJob_DIRENT (1 << 2) ++#define AuHnJob_ISDIR (1 << 3) ++#define AuHnJob_TRYXINO0 (1 << 4) ++#define AuHnJob_MNTPNT (1 << 5) ++#define au_ftest_hnjob(flags, name) ((flags) & AuHnJob_##name) ++#define au_fset_hnjob(flags, name) \ ++ do { (flags) |= AuHnJob_##name; } while (0) ++#define au_fclr_hnjob(flags, name) \ ++ do { (flags) &= ~AuHnJob_##name; } while (0) ++ ++enum { ++ AuHn_CHILD, ++ AuHn_PARENT, ++ AuHnLast ++}; ++ ++struct au_hnotify_args { ++ struct inode *h_dir, *dir, *h_child_inode; ++ u32 mask; ++ unsigned int flags[AuHnLast]; ++ unsigned int h_child_nlen; ++ char h_child_name[]; ++}; ++ ++struct hn_job_args { ++ unsigned int flags; ++ struct inode *inode, *h_inode, *dir, *h_dir; ++ struct dentry *dentry; ++ char *h_name; ++ int h_nlen; ++}; ++ ++static int hn_job(struct hn_job_args *a) ++{ ++ const unsigned int isdir = au_ftest_hnjob(a->flags, ISDIR); ++ ++ /* reset xino */ ++ if (au_ftest_hnjob(a->flags, XINO0) && a->inode) ++ hn_xino(a->inode, a->h_inode); /* ignore this error */ ++ ++ if (au_ftest_hnjob(a->flags, TRYXINO0) ++ && a->inode ++ && a->h_inode) { ++ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); ++ if (!a->h_inode->i_nlink) ++ hn_xino(a->inode, a->h_inode); /* ignore this error */ ++ mutex_unlock(&a->h_inode->i_mutex); ++ } ++ ++ /* make the generation obsolete */ ++ if (au_ftest_hnjob(a->flags, GEN)) { ++ int err = -1; ++ if (a->inode) ++ err = hn_gen_by_inode(a->h_name, a->h_nlen, a->inode, ++ isdir); ++ if (err && a->dentry) ++ hn_gen_by_name(a->dentry, isdir); ++ /* ignore this error */ ++ } ++ ++ /* make dir entries obsolete */ ++ if (au_ftest_hnjob(a->flags, DIRENT) 
&& a->inode) { ++ struct au_vdir *vdir; ++ ++ vdir = au_ivdir(a->inode); ++ if (vdir) ++ vdir->vd_jiffy = 0; ++ /* IMustLock(a->inode); */ ++ /* a->inode->i_version++; */ ++ } ++ ++ /* can do nothing but warn */ ++ if (au_ftest_hnjob(a->flags, MNTPNT) ++ && a->dentry ++ && d_mountpoint(a->dentry)) ++ pr_warning("mount-point %.*s is removed or renamed\n", ++ AuDLNPair(a->dentry)); ++ ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *lookup_wlock_by_name(char *name, unsigned int nlen, ++ struct inode *dir) ++{ ++ struct dentry *dentry, *d, *parent; ++ struct qstr *dname; ++ ++ parent = d_find_alias(dir); ++ if (!parent) ++ return NULL; ++ ++ dentry = NULL; ++ spin_lock(&parent->d_lock); ++ list_for_each_entry(d, &parent->d_subdirs, d_u.d_child) { ++ /* AuDbg("%.*s\n", AuDLNPair(d)); */ ++ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); ++ dname = &d->d_name; ++ if (dname->len != nlen || memcmp(dname->name, name, nlen)) ++ goto cont_unlock; ++ if (au_di(d)) ++ au_digen_dec(d); ++ else ++ goto cont_unlock; ++ if (d->d_count) { ++ dentry = dget_dlock(d); ++ spin_unlock(&d->d_lock); ++ break; ++ } ++ ++ cont_unlock: ++ spin_unlock(&d->d_lock); ++ } ++ spin_unlock(&parent->d_lock); ++ dput(parent); ++ ++ if (dentry) ++ di_write_lock_child(dentry); ++ ++ return dentry; ++} ++ ++static struct inode *lookup_wlock_by_ino(struct super_block *sb, ++ aufs_bindex_t bindex, ino_t h_ino) ++{ ++ struct inode *inode; ++ ino_t ino; ++ int err; ++ ++ inode = NULL; ++ err = au_xino_read(sb, bindex, h_ino, &ino); ++ if (!err && ino) ++ inode = ilookup(sb, ino); ++ if (!inode) ++ goto out; ++ ++ if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { ++ pr_warning("wrong root branch\n"); ++ iput(inode); ++ inode = NULL; ++ goto out; ++ } ++ ++ ii_write_lock_child(inode); ++ ++out: ++ return inode; ++} ++ ++static void au_hn_bh(void *_args) ++{ ++ struct au_hnotify_args *a = _args; ++ struct super_block *sb; ++ aufs_bindex_t bindex, bend, bfound; ++ unsigned char xino, try_iput; ++ int err; ++ struct inode *inode; ++ ino_t h_ino; ++ struct hn_job_args args; ++ struct dentry *dentry; ++ struct au_sbinfo *sbinfo; ++ ++ AuDebugOn(!_args); ++ AuDebugOn(!a->h_dir); ++ AuDebugOn(!a->dir); ++ AuDebugOn(!a->mask); ++ AuDbg("mask 0x%x, i%lu, hi%lu, hci%lu\n", ++ a->mask, a->dir->i_ino, a->h_dir->i_ino, ++ a->h_child_inode ? a->h_child_inode->i_ino : 0); ++ ++ inode = NULL; ++ dentry = NULL; ++ /* ++ * do not lock a->dir->i_mutex here ++ * because of d_revalidate() may cause a deadlock. 
++ */ ++ sb = a->dir->i_sb; ++ AuDebugOn(!sb); ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!sbinfo); ++ si_write_lock(sb, AuLock_NOPLMW); ++ ++ ii_read_lock_parent(a->dir); ++ bfound = -1; ++ bend = au_ibend(a->dir); ++ for (bindex = au_ibstart(a->dir); bindex <= bend; bindex++) ++ if (au_h_iptr(a->dir, bindex) == a->h_dir) { ++ bfound = bindex; ++ break; ++ } ++ ii_read_unlock(a->dir); ++ if (unlikely(bfound < 0)) ++ goto out; ++ ++ xino = !!au_opt_test(au_mntflags(sb), XINO); ++ h_ino = 0; ++ if (a->h_child_inode) ++ h_ino = a->h_child_inode->i_ino; ++ ++ if (a->h_child_nlen ++ && (au_ftest_hnjob(a->flags[AuHn_CHILD], GEN) ++ || au_ftest_hnjob(a->flags[AuHn_CHILD], MNTPNT))) ++ dentry = lookup_wlock_by_name(a->h_child_name, a->h_child_nlen, ++ a->dir); ++ try_iput = 0; ++ if (dentry) ++ inode = dentry->d_inode; ++ if (xino && !inode && h_ino ++ && (au_ftest_hnjob(a->flags[AuHn_CHILD], XINO0) ++ || au_ftest_hnjob(a->flags[AuHn_CHILD], TRYXINO0) ++ || au_ftest_hnjob(a->flags[AuHn_CHILD], GEN))) { ++ inode = lookup_wlock_by_ino(sb, bfound, h_ino); ++ try_iput = 1; ++ } ++ ++ args.flags = a->flags[AuHn_CHILD]; ++ args.dentry = dentry; ++ args.inode = inode; ++ args.h_inode = a->h_child_inode; ++ args.dir = a->dir; ++ args.h_dir = a->h_dir; ++ args.h_name = a->h_child_name; ++ args.h_nlen = a->h_child_nlen; ++ err = hn_job(&args); ++ if (dentry) { ++ if (au_di(dentry)) ++ di_write_unlock(dentry); ++ dput(dentry); ++ } ++ if (inode && try_iput) { ++ ii_write_unlock(inode); ++ iput(inode); ++ } ++ ++ ii_write_lock_parent(a->dir); ++ args.flags = a->flags[AuHn_PARENT]; ++ args.dentry = NULL; ++ args.inode = a->dir; ++ args.h_inode = a->h_dir; ++ args.dir = NULL; ++ args.h_dir = NULL; ++ args.h_name = NULL; ++ args.h_nlen = 0; ++ err = hn_job(&args); ++ ii_write_unlock(a->dir); ++ ++out: ++ iput(a->h_child_inode); ++ iput(a->h_dir); ++ iput(a->dir); ++ si_write_unlock(sb); ++ au_nwt_done(&sbinfo->si_nowait); ++ kfree(a); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask, ++ struct qstr *h_child_qstr, struct inode *h_child_inode) ++{ ++ int err, len; ++ unsigned int flags[AuHnLast], f; ++ unsigned char isdir, isroot, wh; ++ struct inode *dir; ++ struct au_hnotify_args *args; ++ char *p, *h_child_name; ++ ++ err = 0; ++ AuDebugOn(!hnotify || !hnotify->hn_aufs_inode); ++ dir = igrab(hnotify->hn_aufs_inode); ++ if (!dir) ++ goto out; ++ ++ isroot = (dir->i_ino == AUFS_ROOT_INO); ++ wh = 0; ++ h_child_name = (void *)h_child_qstr->name; ++ len = h_child_qstr->len; ++ if (h_child_name) { ++ if (len > AUFS_WH_PFX_LEN ++ && !memcmp(h_child_name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { ++ h_child_name += AUFS_WH_PFX_LEN; ++ len -= AUFS_WH_PFX_LEN; ++ wh = 1; ++ } ++ } ++ ++ isdir = 0; ++ if (h_child_inode) ++ isdir = !!S_ISDIR(h_child_inode->i_mode); ++ flags[AuHn_PARENT] = AuHnJob_ISDIR; ++ flags[AuHn_CHILD] = 0; ++ if (isdir) ++ flags[AuHn_CHILD] = AuHnJob_ISDIR; ++ au_fset_hnjob(flags[AuHn_PARENT], DIRENT); ++ au_fset_hnjob(flags[AuHn_CHILD], GEN); ++ switch (mask & FS_EVENTS_POSS_ON_CHILD) { ++ case FS_MOVED_FROM: ++ case FS_MOVED_TO: ++ au_fset_hnjob(flags[AuHn_CHILD], XINO0); ++ au_fset_hnjob(flags[AuHn_CHILD], MNTPNT); ++ /*FALLTHROUGH*/ ++ case FS_CREATE: ++ AuDebugOn(!h_child_name || !h_child_inode); ++ break; ++ ++ case FS_DELETE: ++ /* ++ * aufs never be able to get this child inode. ++ * revalidation should be in d_revalidate() ++ * by checking i_nlink, i_generation or d_unhashed(). 
++ */ ++ AuDebugOn(!h_child_name); ++ au_fset_hnjob(flags[AuHn_CHILD], TRYXINO0); ++ au_fset_hnjob(flags[AuHn_CHILD], MNTPNT); ++ break; ++ ++ default: ++ AuDebugOn(1); ++ } ++ ++ if (wh) ++ h_child_inode = NULL; ++ ++ err = -ENOMEM; ++ /* iput() and kfree() will be called in au_hnotify() */ ++ args = kmalloc(sizeof(*args) + len + 1, GFP_NOFS); ++ if (unlikely(!args)) { ++ AuErr1("no memory\n"); ++ iput(dir); ++ goto out; ++ } ++ args->flags[AuHn_PARENT] = flags[AuHn_PARENT]; ++ args->flags[AuHn_CHILD] = flags[AuHn_CHILD]; ++ args->mask = mask; ++ args->dir = dir; ++ args->h_dir = igrab(h_dir); ++ if (h_child_inode) ++ h_child_inode = igrab(h_child_inode); /* can be NULL */ ++ args->h_child_inode = h_child_inode; ++ args->h_child_nlen = len; ++ if (len) { ++ p = (void *)args; ++ p += sizeof(*args); ++ memcpy(p, h_child_name, len); ++ p[len] = 0; ++ } ++ ++ f = 0; ++ if (!dir->i_nlink) ++ f = AuWkq_NEST; ++ err = au_wkq_nowait(au_hn_bh, args, dir->i_sb, f); ++ if (unlikely(err)) { ++ pr_err("wkq %d\n", err); ++ iput(args->h_child_inode); ++ iput(args->h_dir); ++ iput(args->dir); ++ kfree(args); ++ } ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm) ++{ ++ int err; ++ ++ AuDebugOn(!(udba & AuOptMask_UDBA)); ++ ++ err = 0; ++ if (au_hnotify_op.reset_br) ++ err = au_hnotify_op.reset_br(udba, br, perm); ++ ++ return err; ++} ++ ++int au_hnotify_init_br(struct au_branch *br, int perm) ++{ ++ int err; ++ ++ err = 0; ++ if (au_hnotify_op.init_br) ++ err = au_hnotify_op.init_br(br, perm); ++ ++ return err; ++} ++ ++void au_hnotify_fin_br(struct au_branch *br) ++{ ++ if (au_hnotify_op.fin_br) ++ au_hnotify_op.fin_br(br); ++} ++ ++static void au_hn_destroy_cache(void) ++{ ++ kmem_cache_destroy(au_cachep[AuCache_HNOTIFY]); ++ au_cachep[AuCache_HNOTIFY] = NULL; ++} ++ ++int __init au_hnotify_init(void) ++{ ++ int err; ++ ++ err = -ENOMEM; ++ au_cachep[AuCache_HNOTIFY] = AuCache(au_hnotify); ++ if (au_cachep[AuCache_HNOTIFY]) { ++ err = 0; ++ if (au_hnotify_op.init) ++ err = au_hnotify_op.init(); ++ if (unlikely(err)) ++ au_hn_destroy_cache(); ++ } ++ AuTraceErr(err); ++ return err; ++} ++ ++void au_hnotify_fin(void) ++{ ++ if (au_hnotify_op.fin) ++ au_hnotify_op.fin(); ++ /* cf. au_cache_fin() */ ++ if (au_cachep[AuCache_HNOTIFY]) ++ au_hn_destroy_cache(); ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/iinfo.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,264 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode private data ++ */ ++ ++#include "aufs.h" ++ ++struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex) ++{ ++ struct inode *h_inode; ++ ++ IiMustAnyLock(inode); ++ ++ h_inode = au_ii(inode)->ii_hinode[0 + bindex].hi_inode; ++ AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); ++ return h_inode; ++} ++ ++/* todo: hard/soft set? */ ++void au_hiput(struct au_hinode *hinode) ++{ ++ au_hn_free(hinode); ++ dput(hinode->hi_whdentry); ++ iput(hinode->hi_inode); ++} ++ ++unsigned int au_hi_flags(struct inode *inode, int isdir) ++{ ++ unsigned int flags; ++ const unsigned int mnt_flags = au_mntflags(inode->i_sb); ++ ++ flags = 0; ++ if (au_opt_test(mnt_flags, XINO)) ++ au_fset_hi(flags, XINO); ++ if (isdir && au_opt_test(mnt_flags, UDBA_HNOTIFY)) ++ au_fset_hi(flags, HNOTIFY); ++ return flags; ++} ++ ++void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, ++ struct inode *h_inode, unsigned int flags) ++{ ++ struct au_hinode *hinode; ++ struct inode *hi; ++ struct au_iinfo *iinfo = au_ii(inode); ++ ++ IiMustWriteLock(inode); ++ ++ hinode = iinfo->ii_hinode + bindex; ++ hi = hinode->hi_inode; ++ AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); ++ ++ if (hi) ++ au_hiput(hinode); ++ hinode->hi_inode = h_inode; ++ if (h_inode) { ++ int err; ++ struct super_block *sb = inode->i_sb; ++ struct au_branch *br; ++ ++ AuDebugOn(inode->i_mode ++ && (h_inode->i_mode & S_IFMT) ++ != (inode->i_mode & S_IFMT)); ++ if (bindex == iinfo->ii_bstart) ++ au_cpup_igen(inode, h_inode); ++ br = au_sbr(sb, bindex); ++ hinode->hi_id = br->br_id; ++ if (au_ftest_hi(flags, XINO)) { ++ err = au_xino_write(sb, bindex, h_inode->i_ino, ++ inode->i_ino); ++ if (unlikely(err)) ++ AuIOErr1("failed au_xino_write() %d\n", err); ++ } ++ ++ if (au_ftest_hi(flags, HNOTIFY) ++ && au_br_hnotifyable(br->br_perm)) { ++ err = au_hn_alloc(hinode, inode); ++ if (unlikely(err)) ++ AuIOErr1("au_hn_alloc() %d\n", err); ++ } ++ } ++} ++ ++void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_wh) ++{ ++ struct au_hinode *hinode; ++ ++ IiMustWriteLock(inode); ++ ++ hinode = au_ii(inode)->ii_hinode + bindex; ++ AuDebugOn(hinode->hi_whdentry); ++ hinode->hi_whdentry = h_wh; ++} ++ ++void au_update_iigen(struct inode *inode) ++{ ++ atomic_set(&au_ii(inode)->ii_generation, au_sigen(inode->i_sb)); ++ /* smp_mb(); */ /* atomic_set */ ++} ++ ++/* it may be called at remount time, too */ ++void au_update_ibrange(struct inode *inode, int do_put_zero) ++{ ++ struct au_iinfo *iinfo; ++ aufs_bindex_t bindex, bend; ++ ++ iinfo = au_ii(inode); ++ if (!iinfo) ++ return; ++ ++ IiMustWriteLock(inode); ++ ++ if (do_put_zero && iinfo->ii_bstart >= 0) { ++ for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; ++ bindex++) { ++ struct inode *h_i; ++ ++ h_i = iinfo->ii_hinode[0 + bindex].hi_inode; ++ if (h_i && !h_i->i_nlink) ++ au_set_h_iptr(inode, bindex, NULL, 0); ++ } ++ } ++ ++ iinfo->ii_bstart = -1; ++ iinfo->ii_bend = -1; ++ bend = au_sbend(inode->i_sb); ++ for (bindex = 0; bindex <= bend; bindex++) ++ if (iinfo->ii_hinode[0 + bindex].hi_inode) { ++ iinfo->ii_bstart = bindex; ++ break; ++ } ++ if (iinfo->ii_bstart >= 0) ++ for (bindex = bend; bindex >= iinfo->ii_bstart; bindex--) ++ if (iinfo->ii_hinode[0 + bindex].hi_inode) { ++ iinfo->ii_bend = bindex; ++ 
break; ++ } ++ AuDebugOn(iinfo->ii_bstart > iinfo->ii_bend); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_icntnr_init_once(void *_c) ++{ ++ struct au_icntnr *c = _c; ++ struct au_iinfo *iinfo = &c->iinfo; ++ static struct lock_class_key aufs_ii; ++ ++ au_rw_init(&iinfo->ii_rwsem); ++ au_rw_class(&iinfo->ii_rwsem, &aufs_ii); ++ inode_init_once(&c->vfs_inode); ++} ++ ++int au_iinfo_init(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ struct super_block *sb; ++ int nbr, i; ++ ++ sb = inode->i_sb; ++ iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); ++ nbr = au_sbend(sb) + 1; ++ if (unlikely(nbr <= 0)) ++ nbr = 1; ++ iinfo->ii_hinode = kcalloc(nbr, sizeof(*iinfo->ii_hinode), GFP_NOFS); ++ if (iinfo->ii_hinode) { ++ au_ninodes_inc(sb); ++ for (i = 0; i < nbr; i++) ++ iinfo->ii_hinode[i].hi_id = -1; ++ ++ atomic_set(&iinfo->ii_generation, au_sigen(sb)); ++ /* smp_mb(); */ /* atomic_set */ ++ iinfo->ii_bstart = -1; ++ iinfo->ii_bend = -1; ++ iinfo->ii_vdir = NULL; ++ return 0; ++ } ++ return -ENOMEM; ++} ++ ++int au_ii_realloc(struct au_iinfo *iinfo, int nbr) ++{ ++ int err, sz; ++ struct au_hinode *hip; ++ ++ AuRwMustWriteLock(&iinfo->ii_rwsem); ++ ++ err = -ENOMEM; ++ sz = sizeof(*hip) * (iinfo->ii_bend + 1); ++ if (!sz) ++ sz = sizeof(*hip); ++ hip = au_kzrealloc(iinfo->ii_hinode, sz, sizeof(*hip) * nbr, GFP_NOFS); ++ if (hip) { ++ iinfo->ii_hinode = hip; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++void au_iinfo_fin(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ struct au_hinode *hi; ++ struct super_block *sb; ++ aufs_bindex_t bindex, bend; ++ const unsigned char unlinked = !inode->i_nlink; ++ ++ iinfo = au_ii(inode); ++ /* bad_inode case */ ++ if (!iinfo) ++ return; ++ ++ sb = inode->i_sb; ++ au_ninodes_dec(sb); ++ if (si_pid_test(sb)) ++ au_xino_delete_inode(inode, unlinked); ++ else { ++ /* ++ * it is safe to hide the dependency between sbinfo and ++ * sb->s_umount. ++ */ ++ lockdep_off(); ++ si_noflush_read_lock(sb); ++ au_xino_delete_inode(inode, unlinked); ++ si_read_unlock(sb); ++ lockdep_on(); ++ } ++ ++ if (iinfo->ii_vdir) ++ au_vdir_free(iinfo->ii_vdir); ++ ++ bindex = iinfo->ii_bstart; ++ if (bindex >= 0) { ++ hi = iinfo->ii_hinode + bindex; ++ bend = iinfo->ii_bend; ++ while (bindex++ <= bend) { ++ if (hi->hi_inode) ++ au_hiput(hi); ++ hi++; ++ } ++ } ++ kfree(iinfo->ii_hinode); ++ iinfo->ii_hinode = NULL; ++ AuRwDestroy(&iinfo->ii_rwsem); ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/inode.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,471 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode functions ++ */ ++ ++#include "aufs.h" ++ ++struct inode *au_igrab(struct inode *inode) ++{ ++ if (inode) { ++ AuDebugOn(!atomic_read(&inode->i_count)); ++ ihold(inode); ++ } ++ return inode; ++} ++ ++static void au_refresh_hinode_attr(struct inode *inode, int do_version) ++{ ++ au_cpup_attr_all(inode, /*force*/0); ++ au_update_iigen(inode); ++ if (do_version) ++ inode->i_version++; ++} ++ ++static int au_ii_refresh(struct inode *inode, int *update) ++{ ++ int err, e; ++ umode_t type; ++ aufs_bindex_t bindex, new_bindex; ++ struct super_block *sb; ++ struct au_iinfo *iinfo; ++ struct au_hinode *p, *q, tmp; ++ ++ IiMustWriteLock(inode); ++ ++ *update = 0; ++ sb = inode->i_sb; ++ type = inode->i_mode & S_IFMT; ++ iinfo = au_ii(inode); ++ err = au_ii_realloc(iinfo, au_sbend(sb) + 1); ++ if (unlikely(err)) ++ goto out; ++ ++ AuDebugOn(iinfo->ii_bstart < 0); ++ p = iinfo->ii_hinode + iinfo->ii_bstart; ++ for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; ++ bindex++, p++) { ++ if (!p->hi_inode) ++ continue; ++ ++ AuDebugOn(type != (p->hi_inode->i_mode & S_IFMT)); ++ new_bindex = au_br_index(sb, p->hi_id); ++ if (new_bindex == bindex) ++ continue; ++ ++ if (new_bindex < 0) { ++ *update = 1; ++ au_hiput(p); ++ p->hi_inode = NULL; ++ continue; ++ } ++ ++ if (new_bindex < iinfo->ii_bstart) ++ iinfo->ii_bstart = new_bindex; ++ if (iinfo->ii_bend < new_bindex) ++ iinfo->ii_bend = new_bindex; ++ /* swap two lower inode, and loop again */ ++ q = iinfo->ii_hinode + new_bindex; ++ tmp = *q; ++ *q = *p; ++ *p = tmp; ++ if (tmp.hi_inode) { ++ bindex--; ++ p--; ++ } ++ } ++ au_update_ibrange(inode, /*do_put_zero*/0); ++ e = au_dy_irefresh(inode); ++ if (unlikely(e && !err)) ++ err = e; ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_refresh_hinode_self(struct inode *inode) ++{ ++ int err, update; ++ ++ err = au_ii_refresh(inode, &update); ++ if (!err) ++ au_refresh_hinode_attr(inode, update && S_ISDIR(inode->i_mode)); ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++int au_refresh_hinode(struct inode *inode, struct dentry *dentry) ++{ ++ int err, e, update; ++ unsigned int flags; ++ umode_t mode; ++ aufs_bindex_t bindex, bend; ++ unsigned char isdir; ++ struct au_hinode *p; ++ struct au_iinfo *iinfo; ++ ++ err = au_ii_refresh(inode, &update); ++ if (unlikely(err)) ++ goto out; ++ ++ update = 0; ++ iinfo = au_ii(inode); ++ p = iinfo->ii_hinode + iinfo->ii_bstart; ++ mode = (inode->i_mode & S_IFMT); ++ isdir = S_ISDIR(mode); ++ flags = au_hi_flags(inode, isdir); ++ bend = au_dbend(dentry); ++ for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) { ++ struct inode *h_i; ++ struct dentry *h_d; ++ ++ h_d = au_h_dptr(dentry, bindex); ++ if (!h_d || !h_d->d_inode) ++ continue; ++ ++ AuDebugOn(mode != (h_d->d_inode->i_mode & S_IFMT)); ++ if (iinfo->ii_bstart <= bindex && bindex <= iinfo->ii_bend) { ++ h_i = au_h_iptr(inode, bindex); ++ if (h_i) { ++ if (h_i == h_d->d_inode) ++ continue; ++ err = -EIO; ++ break; ++ } ++ } ++ if (bindex < iinfo->ii_bstart) ++ iinfo->ii_bstart = bindex; ++ if (iinfo->ii_bend < bindex) ++ iinfo->ii_bend = bindex; ++ au_set_h_iptr(inode, bindex, au_igrab(h_d->d_inode), flags); ++ update = 1; ++ } ++ au_update_ibrange(inode, /*do_put_zero*/0); ++ e = au_dy_irefresh(inode); ++ if (unlikely(e && !err)) ++ err = e; ++ if (!err) 
++ au_refresh_hinode_attr(inode, update && isdir); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int set_inode(struct inode *inode, struct dentry *dentry) ++{ ++ int err; ++ unsigned int flags; ++ umode_t mode; ++ aufs_bindex_t bindex, bstart, btail; ++ unsigned char isdir; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ struct au_iinfo *iinfo; ++ ++ IiMustWriteLock(inode); ++ ++ err = 0; ++ isdir = 0; ++ bstart = au_dbstart(dentry); ++ h_inode = au_h_dptr(dentry, bstart)->d_inode; ++ mode = h_inode->i_mode; ++ switch (mode & S_IFMT) { ++ case S_IFREG: ++ btail = au_dbtail(dentry); ++ inode->i_op = &aufs_iop; ++ inode->i_fop = &aufs_file_fop; ++ err = au_dy_iaop(inode, bstart, h_inode); ++ if (unlikely(err)) ++ goto out; ++ break; ++ case S_IFDIR: ++ isdir = 1; ++ btail = au_dbtaildir(dentry); ++ inode->i_op = &aufs_dir_iop; ++ inode->i_fop = &aufs_dir_fop; ++ break; ++ case S_IFLNK: ++ btail = au_dbtail(dentry); ++ inode->i_op = &aufs_symlink_iop; ++ break; ++ case S_IFBLK: ++ case S_IFCHR: ++ case S_IFIFO: ++ case S_IFSOCK: ++ btail = au_dbtail(dentry); ++ inode->i_op = &aufs_iop; ++ au_init_special_fop(inode, mode, h_inode->i_rdev); ++ break; ++ default: ++ AuIOErr("Unknown file type 0%o\n", mode); ++ err = -EIO; ++ goto out; ++ } ++ ++ /* do not set hnotify for whiteouted dirs (SHWH mode) */ ++ flags = au_hi_flags(inode, isdir); ++ if (au_opt_test(au_mntflags(dentry->d_sb), SHWH) ++ && au_ftest_hi(flags, HNOTIFY) ++ && dentry->d_name.len > AUFS_WH_PFX_LEN ++ && !memcmp(dentry->d_name.name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) ++ au_fclr_hi(flags, HNOTIFY); ++ iinfo = au_ii(inode); ++ iinfo->ii_bstart = bstart; ++ iinfo->ii_bend = btail; ++ for (bindex = bstart; bindex <= btail; bindex++) { ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (h_dentry) ++ au_set_h_iptr(inode, bindex, ++ au_igrab(h_dentry->d_inode), flags); ++ } ++ au_cpup_attr_all(inode, /*force*/1); ++ ++out: ++ return err; ++} ++ ++/* ++ * successful returns with iinfo write_locked ++ * minus: errno ++ * zero: success, matched ++ * plus: no error, but unmatched ++ */ ++static int reval_inode(struct inode *inode, struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ struct inode *h_inode, *h_dinode; ++ ++ /* ++ * before this function, if aufs got any iinfo lock, it must be only ++ * one, the parent dir. ++ * it can happen by UDBA and the obsoleted inode number. 
++ */ ++ err = -EIO; ++ if (unlikely(inode->i_ino == parent_ino(dentry))) ++ goto out; ++ ++ err = 1; ++ ii_write_lock_new_child(inode); ++ h_dinode = au_h_dptr(dentry, au_dbstart(dentry))->d_inode; ++ bend = au_ibend(inode); ++ for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { ++ h_inode = au_h_iptr(inode, bindex); ++ if (h_inode && h_inode == h_dinode) { ++ err = 0; ++ if (au_iigen_test(inode, au_digen(dentry))) ++ err = au_refresh_hinode(inode, dentry); ++ break; ++ } ++ } ++ ++ if (unlikely(err)) ++ ii_write_unlock(inode); ++out: ++ return err; ++} ++ ++int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ unsigned int d_type, ino_t *ino) ++{ ++ int err; ++ struct mutex *mtx; ++ ++ /* prevent hardlinked inode number from race condition */ ++ mtx = NULL; ++ if (d_type != DT_DIR) { ++ mtx = &au_sbr(sb, bindex)->br_xino.xi_nondir_mtx; ++ mutex_lock(mtx); ++ } ++ err = au_xino_read(sb, bindex, h_ino, ino); ++ if (unlikely(err)) ++ goto out; ++ ++ if (!*ino) { ++ err = -EIO; ++ *ino = au_xino_new_ino(sb); ++ if (unlikely(!*ino)) ++ goto out; ++ err = au_xino_write(sb, bindex, h_ino, *ino); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++out: ++ if (mtx) ++ mutex_unlock(mtx); ++ return err; ++} ++ ++/* successful returns with iinfo write_locked */ ++/* todo: return with unlocked? */ ++struct inode *au_new_inode(struct dentry *dentry, int must_new) ++{ ++ struct inode *inode, *h_inode; ++ struct dentry *h_dentry; ++ struct super_block *sb; ++ struct mutex *mtx; ++ ino_t h_ino, ino; ++ int err; ++ aufs_bindex_t bstart; ++ ++ sb = dentry->d_sb; ++ bstart = au_dbstart(dentry); ++ h_dentry = au_h_dptr(dentry, bstart); ++ h_inode = h_dentry->d_inode; ++ h_ino = h_inode->i_ino; ++ ++ /* ++ * stop 'race'-ing between hardlinks under different ++ * parents. ++ */ ++ mtx = NULL; ++ if (!S_ISDIR(h_inode->i_mode)) ++ mtx = &au_sbr(sb, bstart)->br_xino.xi_nondir_mtx; ++ ++new_ino: ++ if (mtx) ++ mutex_lock(mtx); ++ err = au_xino_read(sb, bstart, h_ino, &ino); ++ inode = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ if (!ino) { ++ ino = au_xino_new_ino(sb); ++ if (unlikely(!ino)) { ++ inode = ERR_PTR(-EIO); ++ goto out; ++ } ++ } ++ ++ AuDbg("i%lu\n", (unsigned long)ino); ++ inode = au_iget_locked(sb, ino); ++ err = PTR_ERR(inode); ++ if (IS_ERR(inode)) ++ goto out; ++ ++ AuDbg("%lx, new %d\n", inode->i_state, !!(inode->i_state & I_NEW)); ++ if (inode->i_state & I_NEW) { ++ ii_write_lock_new_child(inode); ++ err = set_inode(inode, dentry); ++ if (!err) { ++ unlock_new_inode(inode); ++ goto out; /* success */ ++ } ++ ++ /* ++ * iget_failed() calls iput(), but we need to call ++ * ii_write_unlock() after iget_failed(). so dirty hack for ++ * i_count. ++ */ ++ atomic_inc(&inode->i_count); ++ iget_failed(inode); ++ ii_write_unlock(inode); ++ au_xino_write(sb, bstart, h_ino, /*ino*/0); ++ /* ignore this error */ ++ goto out_iput; ++ } else if (!must_new && !IS_DEADDIR(inode) && inode->i_nlink) { ++ /* ++ * horrible race condition between lookup, readdir and copyup ++ * (or something). 
++ */ ++ if (mtx) ++ mutex_unlock(mtx); ++ err = reval_inode(inode, dentry); ++ if (unlikely(err < 0)) { ++ mtx = NULL; ++ goto out_iput; ++ } ++ ++ if (!err) { ++ mtx = NULL; ++ goto out; /* success */ ++ } else if (mtx) ++ mutex_lock(mtx); ++ } ++ ++ if (unlikely(au_test_fs_unique_ino(h_dentry->d_inode))) ++ AuWarn1("Warning: Un-notified UDBA or repeatedly renamed dir," ++ " b%d, %s, %.*s, hi%lu, i%lu.\n", ++ bstart, au_sbtype(h_dentry->d_sb), AuDLNPair(dentry), ++ (unsigned long)h_ino, (unsigned long)ino); ++ ino = 0; ++ err = au_xino_write(sb, bstart, h_ino, /*ino*/0); ++ if (!err) { ++ iput(inode); ++ if (mtx) ++ mutex_unlock(mtx); ++ goto new_ino; ++ } ++ ++out_iput: ++ iput(inode); ++ inode = ERR_PTR(err); ++out: ++ if (mtx) ++ mutex_unlock(mtx); ++ return inode; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, ++ struct inode *inode) ++{ ++ int err; ++ ++ err = au_br_rdonly(au_sbr(sb, bindex)); ++ ++ /* pseudo-link after flushed may happen out of bounds */ ++ if (!err ++ && inode ++ && au_ibstart(inode) <= bindex ++ && bindex <= au_ibend(inode)) { ++ /* ++ * permission check is unnecessary since vfsub routine ++ * will be called later ++ */ ++ struct inode *hi = au_h_iptr(inode, bindex); ++ if (hi) ++ err = IS_IMMUTABLE(hi) ? -EROFS : 0; ++ } ++ ++ return err; ++} ++ ++int au_test_h_perm(struct inode *h_inode, int mask) ++{ ++ if (!current_fsuid()) ++ return 0; ++ return inode_permission(h_inode, mask); ++} ++ ++int au_test_h_perm_sio(struct inode *h_inode, int mask) ++{ ++ if (au_test_nfs(h_inode->i_sb) ++ && (mask & MAY_WRITE) ++ && S_ISDIR(h_inode->i_mode)) ++ mask |= MAY_READ; /* force permission check */ ++ return au_test_h_perm(h_inode, mask); ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/inode.h 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,560 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operations ++ */ ++ ++#ifndef __AUFS_INODE_H__ ++#define __AUFS_INODE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include "rwsem.h" ++ ++struct vfsmount; ++ ++struct au_hnotify { ++#ifdef CONFIG_AUFS_HNOTIFY ++#ifdef CONFIG_AUFS_HFSNOTIFY ++ /* never use fsnotify_add_vfsmount_mark() */ ++ struct fsnotify_mark hn_mark; ++#endif ++ struct inode *hn_aufs_inode; /* no get/put */ ++#endif ++} ____cacheline_aligned_in_smp; ++ ++struct au_hinode { ++ struct inode *hi_inode; ++ aufs_bindex_t hi_id; ++#ifdef CONFIG_AUFS_HNOTIFY ++ struct au_hnotify *hi_notify; ++#endif ++ ++ /* reference to the copied-up whiteout with get/put */ ++ struct dentry *hi_whdentry; ++}; ++ ++struct au_vdir; ++struct au_iinfo { ++ atomic_t ii_generation; ++ struct super_block *ii_hsb1; /* no get/put */ ++ ++ struct au_rwsem ii_rwsem; ++ aufs_bindex_t ii_bstart, ii_bend; ++ __u32 ii_higen; ++ struct au_hinode *ii_hinode; ++ struct au_vdir *ii_vdir; ++}; ++ ++struct au_icntnr { ++ struct au_iinfo iinfo; ++ struct inode vfs_inode; ++} ____cacheline_aligned_in_smp; ++ ++/* au_pin flags */ ++#define AuPin_DI_LOCKED 1 ++#define AuPin_MNT_WRITE (1 << 1) ++#define au_ftest_pin(flags, name) ((flags) & AuPin_##name) ++#define au_fset_pin(flags, name) \ ++ do { (flags) |= AuPin_##name; } while (0) ++#define au_fclr_pin(flags, name) \ ++ do { (flags) &= ~AuPin_##name; } while (0) ++ ++struct au_pin { ++ /* input */ ++ struct dentry *dentry; ++ unsigned int udba; ++ unsigned char lsc_di, lsc_hi, flags; ++ aufs_bindex_t bindex; ++ ++ /* output */ ++ struct dentry *parent; ++ struct au_hinode *hdir; ++ struct vfsmount *h_mnt; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_iinfo *au_ii(struct inode *inode) ++{ ++ struct au_iinfo *iinfo; ++ ++ iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); ++ if (iinfo->ii_hinode) ++ return iinfo; ++ return NULL; /* debugging bad_inode case */ ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* inode.c */ ++struct inode *au_igrab(struct inode *inode); ++int au_refresh_hinode_self(struct inode *inode); ++int au_refresh_hinode(struct inode *inode, struct dentry *dentry); ++int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ unsigned int d_type, ino_t *ino); ++struct inode *au_new_inode(struct dentry *dentry, int must_new); ++int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, ++ struct inode *inode); ++int au_test_h_perm(struct inode *h_inode, int mask); ++int au_test_h_perm_sio(struct inode *h_inode, int mask); ++ ++static inline int au_wh_ino(struct super_block *sb, aufs_bindex_t bindex, ++ ino_t h_ino, unsigned int d_type, ino_t *ino) ++{ ++#ifdef CONFIG_AUFS_SHWH ++ return au_ino(sb, bindex, h_ino, d_type, ino); ++#else ++ return 0; ++#endif ++} ++ ++/* i_op.c */ ++extern struct inode_operations aufs_iop, aufs_symlink_iop, aufs_dir_iop; ++ ++/* au_wr_dir flags */ ++#define AuWrDir_ADD_ENTRY 1 ++#define AuWrDir_ISDIR (1 << 1) ++#define au_ftest_wrdir(flags, name) ((flags) & AuWrDir_##name) ++#define au_fset_wrdir(flags, name) \ ++ do { (flags) |= AuWrDir_##name; } while (0) ++#define au_fclr_wrdir(flags, name) \ ++ do { (flags) &= ~AuWrDir_##name; } while (0) ++ ++struct au_wr_dir_args { ++ aufs_bindex_t 
force_btgt; ++ unsigned char flags; ++}; ++int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, ++ struct au_wr_dir_args *args); ++ ++struct dentry *au_pinned_h_parent(struct au_pin *pin); ++void au_pin_init(struct au_pin *pin, struct dentry *dentry, ++ aufs_bindex_t bindex, int lsc_di, int lsc_hi, ++ unsigned int udba, unsigned char flags); ++int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int udba, unsigned char flags) __must_check; ++int au_do_pin(struct au_pin *pin) __must_check; ++void au_unpin(struct au_pin *pin); ++ ++/* i_op_add.c */ ++int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir); ++int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, ++ dev_t dev); ++int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); ++int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode, ++ struct nameidata *nd); ++int aufs_link(struct dentry *src_dentry, struct inode *dir, ++ struct dentry *dentry); ++int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); ++ ++/* i_op_del.c */ ++int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup); ++int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir); ++int aufs_unlink(struct inode *dir, struct dentry *dentry); ++int aufs_rmdir(struct inode *dir, struct dentry *dentry); ++ ++/* i_op_ren.c */ ++int au_wbr(struct dentry *dentry, aufs_bindex_t btgt); ++int aufs_rename(struct inode *src_dir, struct dentry *src_dentry, ++ struct inode *dir, struct dentry *dentry); ++ ++/* iinfo.c */ ++struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex); ++void au_hiput(struct au_hinode *hinode); ++void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_wh); ++unsigned int au_hi_flags(struct inode *inode, int isdir); ++ ++/* hinode flags */ ++#define AuHi_XINO 1 ++#define AuHi_HNOTIFY (1 << 1) ++#define au_ftest_hi(flags, name) ((flags) & AuHi_##name) ++#define au_fset_hi(flags, name) \ ++ do { (flags) |= AuHi_##name; } while (0) ++#define au_fclr_hi(flags, name) \ ++ do { (flags) &= ~AuHi_##name; } while (0) ++ ++#ifndef CONFIG_AUFS_HNOTIFY ++#undef AuHi_HNOTIFY ++#define AuHi_HNOTIFY 0 ++#endif ++ ++void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, ++ struct inode *h_inode, unsigned int flags); ++ ++void au_update_iigen(struct inode *inode); ++void au_update_ibrange(struct inode *inode, int do_put_zero); ++ ++void au_icntnr_init_once(void *_c); ++int au_iinfo_init(struct inode *inode); ++void au_iinfo_fin(struct inode *inode); ++int au_ii_realloc(struct au_iinfo *iinfo, int nbr); ++ ++#ifdef CONFIG_PROC_FS ++/* plink.c */ ++int au_plink_maint(struct super_block *sb, int flags); ++void au_plink_maint_leave(struct au_sbinfo *sbinfo); ++int au_plink_maint_enter(struct super_block *sb); ++#ifdef CONFIG_AUFS_DEBUG ++void au_plink_list(struct super_block *sb); ++#else ++AuStubVoid(au_plink_list, struct super_block *sb) ++#endif ++int au_plink_test(struct inode *inode); ++struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex); ++void au_plink_append(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_dentry); ++void au_plink_put(struct super_block *sb, int verbose); ++void au_plink_clean(struct super_block *sb, int verbose); ++void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id); ++#else ++AuStubInt0(au_plink_maint, struct super_block *sb, int flags); 
++AuStubVoid(au_plink_maint_leave, struct au_sbinfo *sbinfo); ++AuStubInt0(au_plink_maint_enter, struct super_block *sb); ++AuStubVoid(au_plink_list, struct super_block *sb); ++AuStubInt0(au_plink_test, struct inode *inode); ++AuStub(struct dentry *, au_plink_lkup, return NULL, ++ struct inode *inode, aufs_bindex_t bindex); ++AuStubVoid(au_plink_append, struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_dentry); ++AuStubVoid(au_plink_put, struct super_block *sb, int verbose); ++AuStubVoid(au_plink_clean, struct super_block *sb, int verbose); ++AuStubVoid(au_plink_half_refresh, struct super_block *sb, aufs_bindex_t br_id); ++#endif /* CONFIG_PROC_FS */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock subclass for iinfo */ ++enum { ++ AuLsc_II_CHILD, /* child first */ ++ AuLsc_II_CHILD2, /* rename(2), link(2), and cpup at hnotify */ ++ AuLsc_II_CHILD3, /* copyup dirs */ ++ AuLsc_II_PARENT, /* see AuLsc_I_PARENT in vfsub.h */ ++ AuLsc_II_PARENT2, ++ AuLsc_II_PARENT3, /* copyup dirs */ ++ AuLsc_II_NEW_CHILD ++}; ++ ++/* ++ * ii_read_lock_child, ii_write_lock_child, ++ * ii_read_lock_child2, ii_write_lock_child2, ++ * ii_read_lock_child3, ii_write_lock_child3, ++ * ii_read_lock_parent, ii_write_lock_parent, ++ * ii_read_lock_parent2, ii_write_lock_parent2, ++ * ii_read_lock_parent3, ii_write_lock_parent3, ++ * ii_read_lock_new_child, ii_write_lock_new_child, ++ */ ++#define AuReadLockFunc(name, lsc) \ ++static inline void ii_read_lock_##name(struct inode *i) \ ++{ \ ++ au_rw_read_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ ++} ++ ++#define AuWriteLockFunc(name, lsc) \ ++static inline void ii_write_lock_##name(struct inode *i) \ ++{ \ ++ au_rw_write_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ ++} ++ ++#define AuRWLockFuncs(name, lsc) \ ++ AuReadLockFunc(name, lsc) \ ++ AuWriteLockFunc(name, lsc) ++ ++AuRWLockFuncs(child, CHILD); ++AuRWLockFuncs(child2, CHILD2); ++AuRWLockFuncs(child3, CHILD3); ++AuRWLockFuncs(parent, PARENT); ++AuRWLockFuncs(parent2, PARENT2); ++AuRWLockFuncs(parent3, PARENT3); ++AuRWLockFuncs(new_child, NEW_CHILD); ++ ++#undef AuReadLockFunc ++#undef AuWriteLockFunc ++#undef AuRWLockFuncs ++ ++/* ++ * ii_read_unlock, ii_write_unlock, ii_downgrade_lock ++ */ ++AuSimpleUnlockRwsemFuncs(ii, struct inode *i, &au_ii(i)->ii_rwsem); ++ ++#define IiMustNoWaiters(i) AuRwMustNoWaiters(&au_ii(i)->ii_rwsem) ++#define IiMustAnyLock(i) AuRwMustAnyLock(&au_ii(i)->ii_rwsem) ++#define IiMustWriteLock(i) AuRwMustWriteLock(&au_ii(i)->ii_rwsem) ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline void au_icntnr_init(struct au_icntnr *c) ++{ ++#ifdef CONFIG_AUFS_DEBUG ++ c->vfs_inode.i_mode = 0; ++#endif ++} ++ ++static inline unsigned int au_iigen(struct inode *inode) ++{ ++ return atomic_read(&au_ii(inode)->ii_generation); ++} ++ ++/* tiny test for inode number */ ++/* tmpfs generation is too rough */ ++static inline int au_test_higen(struct inode *inode, struct inode *h_inode) ++{ ++ struct au_iinfo *iinfo; ++ ++ iinfo = au_ii(inode); ++ AuRwMustAnyLock(&iinfo->ii_rwsem); ++ return !(iinfo->ii_hsb1 == h_inode->i_sb ++ && iinfo->ii_higen == h_inode->i_generation); ++} ++ ++static inline void au_iigen_dec(struct inode *inode) ++{ ++ atomic_dec(&au_ii(inode)->ii_generation); ++} ++ ++static inline int au_iigen_test(struct inode *inode, unsigned int sigen) ++{ ++ int err; ++ ++ err = 0; ++ if (unlikely(inode && au_iigen(inode) != sigen)) ++ err = -EIO; ++ ++ return err; ++} ++ 
++/* ---------------------------------------------------------------------- */ ++ ++static inline aufs_bindex_t au_ii_br_id(struct inode *inode, ++ aufs_bindex_t bindex) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_hinode[0 + bindex].hi_id; ++} ++ ++static inline aufs_bindex_t au_ibstart(struct inode *inode) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_bstart; ++} ++ ++static inline aufs_bindex_t au_ibend(struct inode *inode) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_bend; ++} ++ ++static inline struct au_vdir *au_ivdir(struct inode *inode) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_vdir; ++} ++ ++static inline struct dentry *au_hi_wh(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_hinode[0 + bindex].hi_whdentry; ++} ++ ++static inline void au_set_ibstart(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustWriteLock(inode); ++ au_ii(inode)->ii_bstart = bindex; ++} ++ ++static inline void au_set_ibend(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustWriteLock(inode); ++ au_ii(inode)->ii_bend = bindex; ++} ++ ++static inline void au_set_ivdir(struct inode *inode, struct au_vdir *vdir) ++{ ++ IiMustWriteLock(inode); ++ au_ii(inode)->ii_vdir = vdir; ++} ++ ++static inline struct au_hinode *au_hi(struct inode *inode, aufs_bindex_t bindex) ++{ ++ IiMustAnyLock(inode); ++ return au_ii(inode)->ii_hinode + bindex; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct dentry *au_pinned_parent(struct au_pin *pin) ++{ ++ if (pin) ++ return pin->parent; ++ return NULL; ++} ++ ++static inline struct inode *au_pinned_h_dir(struct au_pin *pin) ++{ ++ if (pin && pin->hdir) ++ return pin->hdir->hi_inode; ++ return NULL; ++} ++ ++static inline struct au_hinode *au_pinned_hdir(struct au_pin *pin) ++{ ++ if (pin) ++ return pin->hdir; ++ return NULL; ++} ++ ++static inline void au_pin_set_dentry(struct au_pin *pin, struct dentry *dentry) ++{ ++ if (pin) ++ pin->dentry = dentry; ++} ++ ++static inline void au_pin_set_parent_lflag(struct au_pin *pin, ++ unsigned char lflag) ++{ ++ if (pin) { ++ if (lflag) ++ au_fset_pin(pin->flags, DI_LOCKED); ++ else ++ au_fclr_pin(pin->flags, DI_LOCKED); ++ } ++} ++ ++static inline void au_pin_set_parent(struct au_pin *pin, struct dentry *parent) ++{ ++ if (pin) { ++ dput(pin->parent); ++ pin->parent = dget(parent); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_branch; ++#ifdef CONFIG_AUFS_HNOTIFY ++struct au_hnotify_op { ++ void (*ctl)(struct au_hinode *hinode, int do_set); ++ int (*alloc)(struct au_hinode *hinode); ++ ++ /* ++ * if it returns true, the the caller should free hinode->hi_notify, ++ * otherwise ->free() frees it. 
++ */ ++ int (*free)(struct au_hinode *hinode, ++ struct au_hnotify *hn) __must_check; ++ ++ void (*fin)(void); ++ int (*init)(void); ++ ++ int (*reset_br)(unsigned int udba, struct au_branch *br, int perm); ++ void (*fin_br)(struct au_branch *br); ++ int (*init_br)(struct au_branch *br, int perm); ++}; ++ ++/* hnotify.c */ ++int au_hn_alloc(struct au_hinode *hinode, struct inode *inode); ++void au_hn_free(struct au_hinode *hinode); ++void au_hn_ctl(struct au_hinode *hinode, int do_set); ++void au_hn_reset(struct inode *inode, unsigned int flags); ++int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask, ++ struct qstr *h_child_qstr, struct inode *h_child_inode); ++int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm); ++int au_hnotify_init_br(struct au_branch *br, int perm); ++void au_hnotify_fin_br(struct au_branch *br); ++int __init au_hnotify_init(void); ++void au_hnotify_fin(void); ++ ++/* hfsnotify.c */ ++extern const struct au_hnotify_op au_hnotify_op; ++ ++static inline ++void au_hn_init(struct au_hinode *hinode) ++{ ++ hinode->hi_notify = NULL; ++} ++ ++static inline struct au_hnotify *au_hn(struct au_hinode *hinode) ++{ ++ return hinode->hi_notify; ++} ++ ++#else ++static inline ++int au_hn_alloc(struct au_hinode *hinode __maybe_unused, ++ struct inode *inode __maybe_unused) ++{ ++ return -EOPNOTSUPP; ++} ++ ++static inline struct au_hnotify *au_hn(struct au_hinode *hinode) ++{ ++ return NULL; ++} ++ ++AuStubVoid(au_hn_free, struct au_hinode *hinode __maybe_unused) ++AuStubVoid(au_hn_ctl, struct au_hinode *hinode __maybe_unused, ++ int do_set __maybe_unused) ++AuStubVoid(au_hn_reset, struct inode *inode __maybe_unused, ++ unsigned int flags __maybe_unused) ++AuStubInt0(au_hnotify_reset_br, unsigned int udba __maybe_unused, ++ struct au_branch *br __maybe_unused, ++ int perm __maybe_unused) ++AuStubInt0(au_hnotify_init_br, struct au_branch *br __maybe_unused, ++ int perm __maybe_unused) ++AuStubVoid(au_hnotify_fin_br, struct au_branch *br __maybe_unused) ++AuStubInt0(__init au_hnotify_init, void) ++AuStubVoid(au_hnotify_fin, void) ++AuStubVoid(au_hn_init, struct au_hinode *hinode __maybe_unused) ++#endif /* CONFIG_AUFS_HNOTIFY */ ++ ++static inline void au_hn_suspend(struct au_hinode *hdir) ++{ ++ au_hn_ctl(hdir, /*do_set*/0); ++} ++ ++static inline void au_hn_resume(struct au_hinode *hdir) ++{ ++ au_hn_ctl(hdir, /*do_set*/1); ++} ++ ++static inline void au_hn_imtx_lock(struct au_hinode *hdir) ++{ ++ mutex_lock(&hdir->hi_inode->i_mutex); ++ au_hn_suspend(hdir); ++} ++ ++static inline void au_hn_imtx_lock_nested(struct au_hinode *hdir, ++ unsigned int sc __maybe_unused) ++{ ++ mutex_lock_nested(&hdir->hi_inode->i_mutex, sc); ++ au_hn_suspend(hdir); ++} ++ ++static inline void au_hn_imtx_unlock(struct au_hinode *hdir) ++{ ++ au_hn_resume(hdir); ++ mutex_unlock(&hdir->hi_inode->i_mutex); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_INODE_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/ioctl.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,196 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * ioctl ++ * plink-management and readdir in userspace. ++ * assist the pathconf(3) wrapper library. ++ */ ++ ++#include "aufs.h" ++ ++static int au_wbr_fd(struct path *path, struct aufs_wbr_fd __user *arg) ++{ ++ int err, fd; ++ aufs_bindex_t wbi, bindex, bend; ++ struct file *h_file; ++ struct super_block *sb; ++ struct dentry *root; ++ struct au_branch *br; ++ struct aufs_wbr_fd wbrfd = { ++ .oflags = au_dir_roflags, ++ .brid = -1 ++ }; ++ const int valid = O_RDONLY | O_NONBLOCK | O_LARGEFILE | O_DIRECTORY ++ | O_NOATIME | O_CLOEXEC; ++ ++ AuDebugOn(wbrfd.oflags & ~valid); ++ ++ if (arg) { ++ err = copy_from_user(&wbrfd, arg, sizeof(wbrfd)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ goto out; ++ } ++ ++ err = -EINVAL; ++ AuDbg("wbrfd{0%o, %d}\n", wbrfd.oflags, wbrfd.brid); ++ wbrfd.oflags |= au_dir_roflags; ++ AuDbg("0%o\n", wbrfd.oflags); ++ if (unlikely(wbrfd.oflags & ~valid)) ++ goto out; ++ } ++ ++ fd = get_unused_fd(); ++ err = fd; ++ if (unlikely(fd < 0)) ++ goto out; ++ ++ h_file = ERR_PTR(-EINVAL); ++ wbi = 0; ++ br = NULL; ++ sb = path->dentry->d_sb; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_IR); ++ bend = au_sbend(sb); ++ if (wbrfd.brid >= 0) { ++ wbi = au_br_index(sb, wbrfd.brid); ++ if (unlikely(wbi < 0 || wbi > bend)) ++ goto out_unlock; ++ } ++ ++ h_file = ERR_PTR(-ENOENT); ++ br = au_sbr(sb, wbi); ++ if (!au_br_writable(br->br_perm)) { ++ if (arg) ++ goto out_unlock; ++ ++ bindex = wbi + 1; ++ wbi = -1; ++ for (; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (au_br_writable(br->br_perm)) { ++ wbi = bindex; ++ br = au_sbr(sb, wbi); ++ break; ++ } ++ } ++ } ++ AuDbg("wbi %d\n", wbi); ++ if (wbi >= 0) ++ h_file = au_h_open(root, wbi, wbrfd.oflags, NULL); ++ ++out_unlock: ++ aufs_read_unlock(root, AuLock_IR); ++ err = PTR_ERR(h_file); ++ if (IS_ERR(h_file)) ++ goto out_fd; ++ ++ atomic_dec(&br->br_count); /* cf. 
au_h_open() */ ++ fd_install(fd, h_file); ++ err = fd; ++ goto out; /* success */ ++ ++out_fd: ++ put_unused_fd(fd); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err; ++ ++ switch (cmd) { ++ case AUFS_CTL_RDU: ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_ioctl(file, cmd, arg); ++ break; ++ ++ case AUFS_CTL_WBR_FD: ++ err = au_wbr_fd(&file->f_path, (void __user *)arg); ++ break; ++ ++ case AUFS_CTL_IBUSY: ++ err = au_ibusy_ioctl(file, arg); ++ break; ++ ++ default: ++ /* do not call the lower */ ++ AuDbg("0x%x\n", cmd); ++ err = -ENOTTY; ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err; ++ ++ switch (cmd) { ++ case AUFS_CTL_WBR_FD: ++ err = au_wbr_fd(&file->f_path, (void __user *)arg); ++ break; ++ ++ default: ++ /* do not call the lower */ ++ AuDbg("0x%x\n", cmd); ++ err = -ENOTTY; ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++#ifdef CONFIG_COMPAT ++long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ long err; ++ ++ switch (cmd) { ++ case AUFS_CTL_RDU: ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_compat_ioctl(file, cmd, arg); ++ break; ++ ++ case AUFS_CTL_IBUSY: ++ err = au_ibusy_compat_ioctl(file, arg); ++ break; ++ ++ default: ++ err = aufs_ioctl_dir(file, cmd, arg); ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++#if 0 /* unused yet */ ++long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ return aufs_ioctl_nondir(file, cmd, (unsigned long)compat_ptr(arg)); ++} ++#endif ++#endif +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/i_op_add.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,712 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operations (add entry) ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * final procedure of adding a new entry, except link(2). ++ * remove whiteout, instantiate, copyup the parent dir's times and size ++ * and update version. ++ * if it failed, re-create the removed whiteout. 
++ */ ++static int epilog(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct dentry *dentry) ++{ ++ int err, rerr; ++ aufs_bindex_t bwh; ++ struct path h_path; ++ struct inode *inode, *h_dir; ++ struct dentry *wh; ++ ++ bwh = -1; ++ if (wh_dentry) { ++ h_dir = wh_dentry->d_parent->d_inode; /* dir inode is locked */ ++ IMustLock(h_dir); ++ AuDebugOn(au_h_iptr(dir, bindex) != h_dir); ++ bwh = au_dbwh(dentry); ++ h_path.dentry = wh_dentry; ++ h_path.mnt = au_sbr_mnt(dir->i_sb, bindex); ++ err = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, ++ dentry); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ inode = au_new_inode(dentry, /*must_new*/1); ++ if (!IS_ERR(inode)) { ++ d_instantiate(dentry, inode); ++ dir = dentry->d_parent->d_inode; /* dir inode is locked */ ++ IMustLock(dir); ++ if (au_ibstart(dir) == au_dbstart(dentry)) ++ au_cpup_attr_timesizes(dir); ++ dir->i_version++; ++ return 0; /* success */ ++ } ++ ++ err = PTR_ERR(inode); ++ if (!wh_dentry) ++ goto out; ++ ++ /* revert */ ++ /* dir inode is locked */ ++ wh = au_wh_create(dentry, bwh, wh_dentry->d_parent); ++ rerr = PTR_ERR(wh); ++ if (IS_ERR(wh)) { ++ AuIOErr("%.*s reverting whiteout failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } else ++ dput(wh); ++ ++out: ++ return err; ++} ++ ++static int au_d_may_add(struct dentry *dentry) ++{ ++ int err; ++ ++ err = 0; ++ if (unlikely(d_unhashed(dentry))) ++ err = -ENOENT; ++ if (unlikely(dentry->d_inode)) ++ err = -EEXIST; ++ return err; ++} ++ ++/* ++ * simple tests for the adding inode operations. ++ * following the checks in vfs, plus the parent-child relationship. ++ */ ++int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir) ++{ ++ int err; ++ umode_t h_mode; ++ struct dentry *h_dentry; ++ struct inode *h_inode; ++ ++ err = -ENAMETOOLONG; ++ if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) ++ goto out; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ h_inode = h_dentry->d_inode; ++ if (!dentry->d_inode) { ++ err = -EEXIST; ++ if (unlikely(h_inode)) ++ goto out; ++ } else { ++ /* rename(2) case */ ++ err = -EIO; ++ if (unlikely(!h_inode || !h_inode->i_nlink)) ++ goto out; ++ ++ h_mode = h_inode->i_mode; ++ if (!isdir) { ++ err = -EISDIR; ++ if (unlikely(S_ISDIR(h_mode))) ++ goto out; ++ } else if (unlikely(!S_ISDIR(h_mode))) { ++ err = -ENOTDIR; ++ goto out; ++ } ++ } ++ ++ err = 0; ++ /* expected parent dir is locked */ ++ if (unlikely(h_parent != h_dentry->d_parent)) ++ err = -EIO; ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ++ * initial procedure of adding a new entry. ++ * prepare writable branch and the parent dir, lock it, ++ * and lookup whiteout for the new entry. 
++ */ ++static struct dentry* ++lock_hdir_lkup_wh(struct dentry *dentry, struct au_dtime *dt, ++ struct dentry *src_dentry, struct au_pin *pin, ++ struct au_wr_dir_args *wr_dir_args) ++{ ++ struct dentry *wh_dentry, *h_parent; ++ struct super_block *sb; ++ struct au_branch *br; ++ int err; ++ unsigned int udba; ++ aufs_bindex_t bcpup; ++ ++ AuDbg("%.*s\n", AuDLNPair(dentry)); ++ ++ err = au_wr_dir(dentry, src_dentry, wr_dir_args); ++ bcpup = err; ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err < 0)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ udba = au_opt_udba(sb); ++ err = au_pin(pin, dentry, bcpup, udba, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ h_parent = au_pinned_h_parent(pin); ++ if (udba != AuOpt_UDBA_NONE ++ && au_dbstart(dentry) == bcpup) ++ err = au_may_add(dentry, bcpup, h_parent, ++ au_ftest_wrdir(wr_dir_args->flags, ISDIR)); ++ else if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) ++ err = -ENAMETOOLONG; ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_unpin; ++ ++ br = au_sbr(sb, bcpup); ++ if (dt) { ++ struct path tmp = { ++ .dentry = h_parent, ++ .mnt = br->br_mnt ++ }; ++ au_dtime_store(dt, au_pinned_parent(pin), &tmp); ++ } ++ ++ wh_dentry = NULL; ++ if (bcpup != au_dbwh(dentry)) ++ goto out; /* success */ ++ ++ wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, br); ++ ++out_unpin: ++ if (IS_ERR(wh_dentry)) ++ au_unpin(pin); ++out: ++ return wh_dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++enum { Mknod, Symlink, Creat }; ++struct simple_arg { ++ int type; ++ union { ++ struct { ++ umode_t mode; ++ struct nameidata *nd; ++ } c; ++ struct { ++ const char *symname; ++ } s; ++ struct { ++ umode_t mode; ++ dev_t dev; ++ } m; ++ } u; ++}; ++ ++static int add_simple(struct inode *dir, struct dentry *dentry, ++ struct simple_arg *arg) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ unsigned char created; ++ struct au_dtime dt; ++ struct au_pin pin; ++ struct path h_path; ++ struct dentry *wh_dentry, *parent; ++ struct inode *h_dir; ++ struct au_wr_dir_args wr_dir_args = { ++ .force_btgt = -1, ++ .flags = AuWrDir_ADD_ENTRY ++ }; ++ ++ AuDbg("%.*s\n", AuDLNPair(dentry)); ++ IMustLock(dir); ++ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); ++ if (unlikely(err)) ++ goto out; ++ err = au_d_may_add(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ di_write_lock_parent(parent); ++ wh_dentry = lock_hdir_lkup_wh(dentry, &dt, /*src_dentry*/NULL, &pin, ++ &wr_dir_args); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ bstart = au_dbstart(dentry); ++ h_path.dentry = au_h_dptr(dentry, bstart); ++ h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart); ++ h_dir = au_pinned_h_dir(&pin); ++ switch (arg->type) { ++ case Creat: ++ err = vfsub_create(h_dir, &h_path, arg->u.c.mode); ++ break; ++ case Symlink: ++ err = vfsub_symlink(h_dir, &h_path, arg->u.s.symname); ++ break; ++ case Mknod: ++ err = vfsub_mknod(h_dir, &h_path, arg->u.m.mode, arg->u.m.dev); ++ break; ++ default: ++ BUG(); ++ } ++ created = !err; ++ if (!err) ++ err = epilog(dir, bstart, wh_dentry, dentry); ++ ++ /* revert */ ++ if (unlikely(created && err && h_path.dentry->d_inode)) { ++ int rerr; ++ rerr = vfsub_unlink(h_dir, &h_path, /*force*/0); ++ if (rerr) { ++ AuIOErr("%.*s revert failure(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } ++ au_dtime_revert(&dt); ++ } ++ ++ au_unpin(&pin); 
++ dput(wh_dentry); ++ ++out_parent: ++ di_write_unlock(parent); ++out_unlock: ++ if (unlikely(err)) { ++ au_update_dbstart(dentry); ++ d_drop(dentry); ++ } ++ aufs_read_unlock(dentry, AuLock_DW); ++out: ++ return err; ++} ++ ++int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, ++ dev_t dev) ++{ ++ struct simple_arg arg = { ++ .type = Mknod, ++ .u.m = { ++ .mode = mode, ++ .dev = dev ++ } ++ }; ++ return add_simple(dir, dentry, &arg); ++} ++ ++int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) ++{ ++ struct simple_arg arg = { ++ .type = Symlink, ++ .u.s.symname = symname ++ }; ++ return add_simple(dir, dentry, &arg); ++} ++ ++int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode, ++ struct nameidata *nd) ++{ ++ struct simple_arg arg = { ++ .type = Creat, ++ .u.c = { ++ .mode = mode, ++ .nd = nd ++ } ++ }; ++ return add_simple(dir, dentry, &arg); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_link_args { ++ aufs_bindex_t bdst, bsrc; ++ struct au_pin pin; ++ struct path h_path; ++ struct dentry *src_parent, *parent; ++}; ++ ++static int au_cpup_before_link(struct dentry *src_dentry, ++ struct au_link_args *a) ++{ ++ int err; ++ struct dentry *h_src_dentry; ++ struct mutex *h_mtx; ++ struct file *h_file; ++ ++ di_read_lock_parent(a->src_parent, AuLock_IR); ++ err = au_test_and_cpup_dirs(src_dentry, a->bdst); ++ if (unlikely(err)) ++ goto out; ++ ++ h_src_dentry = au_h_dptr(src_dentry, a->bsrc); ++ h_mtx = &h_src_dentry->d_inode->i_mutex; ++ err = au_pin(&a->pin, src_dentry, a->bdst, ++ au_opt_udba(src_dentry->d_sb), ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ h_file = au_h_open_pre(src_dentry, a->bsrc); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ h_file = NULL; ++ } else ++ err = au_sio_cpup_simple(src_dentry, a->bdst, -1, ++ AuCpup_DTIME /* | AuCpup_KEEPLINO */); ++ mutex_unlock(h_mtx); ++ au_h_open_post(src_dentry, a->bsrc, h_file); ++ au_unpin(&a->pin); ++ ++out: ++ di_read_unlock(a->src_parent, AuLock_IR); ++ return err; ++} ++ ++static int au_cpup_or_link(struct dentry *src_dentry, struct au_link_args *a) ++{ ++ int err; ++ unsigned char plink; ++ struct inode *h_inode, *inode; ++ struct dentry *h_src_dentry; ++ struct super_block *sb; ++ struct file *h_file; ++ ++ plink = 0; ++ h_inode = NULL; ++ sb = src_dentry->d_sb; ++ inode = src_dentry->d_inode; ++ if (au_ibstart(inode) <= a->bdst) ++ h_inode = au_h_iptr(inode, a->bdst); ++ if (!h_inode || !h_inode->i_nlink) { ++ /* copyup src_dentry as the name of dentry. 
*/ ++ au_set_dbstart(src_dentry, a->bdst); ++ au_set_h_dptr(src_dentry, a->bdst, dget(a->h_path.dentry)); ++ h_inode = au_h_dptr(src_dentry, a->bsrc)->d_inode; ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ h_file = au_h_open_pre(src_dentry, a->bsrc); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ h_file = NULL; ++ } else ++ err = au_sio_cpup_single(src_dentry, a->bdst, a->bsrc, ++ -1, AuCpup_KEEPLINO, ++ a->parent); ++ mutex_unlock(&h_inode->i_mutex); ++ au_h_open_post(src_dentry, a->bsrc, h_file); ++ au_set_h_dptr(src_dentry, a->bdst, NULL); ++ au_set_dbstart(src_dentry, a->bsrc); ++ } else { ++ /* the inode of src_dentry already exists on a.bdst branch */ ++ h_src_dentry = d_find_alias(h_inode); ++ if (!h_src_dentry && au_plink_test(inode)) { ++ plink = 1; ++ h_src_dentry = au_plink_lkup(inode, a->bdst); ++ err = PTR_ERR(h_src_dentry); ++ if (IS_ERR(h_src_dentry)) ++ goto out; ++ ++ if (unlikely(!h_src_dentry->d_inode)) { ++ dput(h_src_dentry); ++ h_src_dentry = NULL; ++ } ++ ++ } ++ if (h_src_dentry) { ++ err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), ++ &a->h_path); ++ dput(h_src_dentry); ++ } else { ++ AuIOErr("no dentry found for hi%lu on b%d\n", ++ h_inode->i_ino, a->bdst); ++ err = -EIO; ++ } ++ } ++ ++ if (!err && !plink) ++ au_plink_append(inode, a->bdst, a->h_path.dentry); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++int aufs_link(struct dentry *src_dentry, struct inode *dir, ++ struct dentry *dentry) ++{ ++ int err, rerr; ++ struct au_dtime dt; ++ struct au_link_args *a; ++ struct dentry *wh_dentry, *h_src_dentry; ++ struct inode *inode; ++ struct super_block *sb; ++ struct au_wr_dir_args wr_dir_args = { ++ /* .force_btgt = -1, */ ++ .flags = AuWrDir_ADD_ENTRY ++ }; ++ ++ IMustLock(dir); ++ inode = src_dentry->d_inode; ++ IMustLock(inode); ++ ++ err = -ENOMEM; ++ a = kzalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ a->parent = dentry->d_parent; /* dir inode is locked */ ++ err = aufs_read_and_write_lock2(dentry, src_dentry, ++ AuLock_NOPLM | AuLock_GEN); ++ if (unlikely(err)) ++ goto out_kfree; ++ err = au_d_hashed_positive(src_dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ err = au_d_may_add(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ a->src_parent = dget_parent(src_dentry); ++ wr_dir_args.force_btgt = au_ibstart(inode); ++ ++ di_write_lock_parent(a->parent); ++ wr_dir_args.force_btgt = au_wbr(dentry, wr_dir_args.force_btgt); ++ wh_dentry = lock_hdir_lkup_wh(dentry, &dt, src_dentry, &a->pin, ++ &wr_dir_args); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ err = 0; ++ sb = dentry->d_sb; ++ a->bdst = au_dbstart(dentry); ++ a->h_path.dentry = au_h_dptr(dentry, a->bdst); ++ a->h_path.mnt = au_sbr_mnt(sb, a->bdst); ++ a->bsrc = au_ibstart(inode); ++ h_src_dentry = au_h_d_alias(src_dentry, a->bsrc); ++ if (!h_src_dentry) { ++ a->bsrc = au_dbstart(src_dentry); ++ h_src_dentry = au_h_d_alias(src_dentry, a->bsrc); ++ AuDebugOn(!h_src_dentry); ++ } else if (IS_ERR(h_src_dentry)) ++ goto out_parent; ++ ++ if (au_opt_test(au_mntflags(sb), PLINK)) { ++ if (a->bdst < a->bsrc ++ /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) ++ err = au_cpup_or_link(src_dentry, a); ++ else ++ err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), ++ &a->h_path); ++ dput(h_src_dentry); ++ } else { ++ /* ++ * copyup src_dentry to the branch we process, ++ * and then link(2) to it. 
++ */ ++ dput(h_src_dentry); ++ if (a->bdst < a->bsrc ++ /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) { ++ au_unpin(&a->pin); ++ di_write_unlock(a->parent); ++ err = au_cpup_before_link(src_dentry, a); ++ di_write_lock_parent(a->parent); ++ if (!err) ++ err = au_pin(&a->pin, dentry, a->bdst, ++ au_opt_udba(sb), ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ if (unlikely(err)) ++ goto out_wh; ++ } ++ if (!err) { ++ h_src_dentry = au_h_dptr(src_dentry, a->bdst); ++ err = -ENOENT; ++ if (h_src_dentry && h_src_dentry->d_inode) ++ err = vfsub_link(h_src_dentry, ++ au_pinned_h_dir(&a->pin), ++ &a->h_path); ++ } ++ } ++ if (unlikely(err)) ++ goto out_unpin; ++ ++ if (wh_dentry) { ++ a->h_path.dentry = wh_dentry; ++ err = au_wh_unlink_dentry(au_pinned_h_dir(&a->pin), &a->h_path, ++ dentry); ++ if (unlikely(err)) ++ goto out_revert; ++ } ++ ++ dir->i_version++; ++ if (au_ibstart(dir) == au_dbstart(dentry)) ++ au_cpup_attr_timesizes(dir); ++ inc_nlink(inode); ++ inode->i_ctime = dir->i_ctime; ++ d_instantiate(dentry, au_igrab(inode)); ++ if (d_unhashed(a->h_path.dentry)) ++ /* some filesystem calls d_drop() */ ++ d_drop(dentry); ++ goto out_unpin; /* success */ ++ ++out_revert: ++ rerr = vfsub_unlink(au_pinned_h_dir(&a->pin), &a->h_path, /*force*/0); ++ if (unlikely(rerr)) { ++ AuIOErr("%.*s reverting failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } ++ au_dtime_revert(&dt); ++out_unpin: ++ au_unpin(&a->pin); ++out_wh: ++ dput(wh_dentry); ++out_parent: ++ di_write_unlock(a->parent); ++ dput(a->src_parent); ++out_unlock: ++ if (unlikely(err)) { ++ au_update_dbstart(dentry); ++ d_drop(dentry); ++ } ++ aufs_read_and_write_unlock2(dentry, src_dentry); ++out_kfree: ++ kfree(a); ++out: ++ return err; ++} ++ ++int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ++{ ++ int err, rerr; ++ aufs_bindex_t bindex; ++ unsigned char diropq; ++ struct path h_path; ++ struct dentry *wh_dentry, *parent, *opq_dentry; ++ struct mutex *h_mtx; ++ struct super_block *sb; ++ struct { ++ struct au_pin pin; ++ struct au_dtime dt; ++ } *a; /* reduce the stack usage */ ++ struct au_wr_dir_args wr_dir_args = { ++ .force_btgt = -1, ++ .flags = AuWrDir_ADD_ENTRY | AuWrDir_ISDIR ++ }; ++ ++ IMustLock(dir); ++ ++ err = -ENOMEM; ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); ++ if (unlikely(err)) ++ goto out_free; ++ err = au_d_may_add(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_write_lock_parent(parent); ++ wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL, ++ &a->pin, &wr_dir_args); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ sb = dentry->d_sb; ++ bindex = au_dbstart(dentry); ++ h_path.dentry = au_h_dptr(dentry, bindex); ++ h_path.mnt = au_sbr_mnt(sb, bindex); ++ err = vfsub_mkdir(au_pinned_h_dir(&a->pin), &h_path, mode); ++ if (unlikely(err)) ++ goto out_unpin; ++ ++ /* make the dir opaque */ ++ diropq = 0; ++ h_mtx = &h_path.dentry->d_inode->i_mutex; ++ if (wh_dentry ++ || au_opt_test(au_mntflags(sb), ALWAYS_DIROPQ)) { ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ opq_dentry = au_diropq_create(dentry, bindex); ++ mutex_unlock(h_mtx); ++ err = PTR_ERR(opq_dentry); ++ if (IS_ERR(opq_dentry)) ++ goto out_dir; ++ dput(opq_dentry); ++ diropq = 1; ++ } ++ ++ err = epilog(dir, bindex, wh_dentry, dentry); ++ if (!err) { ++ inc_nlink(dir); ++ goto out_unpin; /* success */ ++ } ++ ++ /* 
revert */ ++ if (diropq) { ++ AuLabel(revert opq); ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ rerr = au_diropq_remove(dentry, bindex); ++ mutex_unlock(h_mtx); ++ if (rerr) { ++ AuIOErr("%.*s reverting diropq failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } ++ } ++ ++out_dir: ++ AuLabel(revert dir); ++ rerr = vfsub_rmdir(au_pinned_h_dir(&a->pin), &h_path); ++ if (rerr) { ++ AuIOErr("%.*s reverting dir failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ err = -EIO; ++ } ++ au_dtime_revert(&a->dt); ++out_unpin: ++ au_unpin(&a->pin); ++ dput(wh_dentry); ++out_parent: ++ di_write_unlock(parent); ++out_unlock: ++ if (unlikely(err)) { ++ au_update_dbstart(dentry); ++ d_drop(dentry); ++ } ++ aufs_read_unlock(dentry, AuLock_DW); ++out_free: ++ kfree(a); ++out: ++ return err; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/i_op.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,992 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operations (except add/del/rename) ++ */ ++ ++#include ++#include ++#include ++#include ++#include "aufs.h" ++ ++static int h_permission(struct inode *h_inode, int mask, ++ struct vfsmount *h_mnt, int brperm) ++{ ++ int err; ++ const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); ++ ++ err = -EACCES; ++ if ((write_mask && IS_IMMUTABLE(h_inode)) ++ || ((mask & MAY_EXEC) ++ && S_ISREG(h_inode->i_mode) ++ && ((h_mnt->mnt_flags & MNT_NOEXEC) ++ || !(h_inode->i_mode & S_IXUGO)))) ++ goto out; ++ ++ /* ++ * - skip the lower fs test in the case of write to ro branch. ++ * - nfs dir permission write check is optimized, but a policy for ++ * link/rename requires a real check. ++ */ ++ if ((write_mask && !au_br_writable(brperm)) ++ || (au_test_nfs(h_inode->i_sb) && S_ISDIR(h_inode->i_mode) ++ && write_mask && !(mask & MAY_READ)) ++ || !h_inode->i_op->permission) { ++ /* AuLabel(generic_permission); */ ++ err = generic_permission(h_inode, mask); ++ } else { ++ /* AuLabel(h_inode->permission); */ ++ err = h_inode->i_op->permission(h_inode, mask); ++ AuTraceErr(err); ++ } ++ ++ if (!err) ++ err = devcgroup_inode_permission(h_inode, mask); ++ if (!err) ++ err = security_inode_permission(h_inode, mask); ++ ++#if 0 ++ if (!err) { ++ /* todo: do we need to call ima_path_check()? 
*/ ++ struct path h_path = { ++ .dentry = ++ .mnt = h_mnt ++ }; ++ err = ima_path_check(&h_path, ++ mask & (MAY_READ | MAY_WRITE | MAY_EXEC), ++ IMA_COUNT_LEAVE); ++ } ++#endif ++ ++out: ++ return err; ++} ++ ++static int aufs_permission(struct inode *inode, int mask) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ const unsigned char isdir = !!S_ISDIR(inode->i_mode), ++ write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); ++ struct inode *h_inode; ++ struct super_block *sb; ++ struct au_branch *br; ++ ++ /* todo: support rcu-walk? */ ++ if (mask & MAY_NOT_BLOCK) ++ return -ECHILD; ++ ++ sb = inode->i_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ ii_read_lock_child(inode); ++#if 0 ++ err = au_iigen_test(inode, au_sigen(sb)); ++ if (unlikely(err)) ++ goto out; ++#endif ++ ++ if (!isdir || write_mask) { ++ err = au_busy_or_stale(); ++ h_inode = au_h_iptr(inode, au_ibstart(inode)); ++ if (unlikely(!h_inode ++ || (h_inode->i_mode & S_IFMT) ++ != (inode->i_mode & S_IFMT))) ++ goto out; ++ ++ err = 0; ++ bindex = au_ibstart(inode); ++ br = au_sbr(sb, bindex); ++ err = h_permission(h_inode, mask, br->br_mnt, br->br_perm); ++ if (write_mask ++ && !err ++ && !special_file(h_inode->i_mode)) { ++ /* test whether the upper writable branch exists */ ++ err = -EROFS; ++ for (; bindex >= 0; bindex--) ++ if (!au_br_rdonly(au_sbr(sb, bindex))) { ++ err = 0; ++ break; ++ } ++ } ++ goto out; ++ } ++ ++ /* non-write to dir */ ++ err = 0; ++ bend = au_ibend(inode); ++ for (bindex = au_ibstart(inode); !err && bindex <= bend; bindex++) { ++ h_inode = au_h_iptr(inode, bindex); ++ if (h_inode) { ++ err = au_busy_or_stale(); ++ if (unlikely(!S_ISDIR(h_inode->i_mode))) ++ break; ++ ++ br = au_sbr(sb, bindex); ++ err = h_permission(h_inode, mask, br->br_mnt, ++ br->br_perm); ++ } ++ } ++ ++out: ++ ii_read_unlock(inode); ++ si_read_unlock(sb); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct dentry *ret, *parent; ++ struct inode *inode; ++ struct super_block *sb; ++ int err, npositive, lc_idx; ++ ++ IMustLock(dir); ++ ++ sb = dir->i_sb; ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ ret = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ ret = ERR_PTR(-ENAMETOOLONG); ++ if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) ++ goto out_si; ++ err = au_di_init(dentry); ++ ret = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_si; ++ ++ inode = NULL; ++ npositive = 0; /* suppress a warning */ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_read_lock_parent(parent, AuLock_IR); ++ err = au_alive_dir(parent); ++ if (!err) ++ err = au_digen_test(parent, au_sigen(sb)); ++ if (!err) { ++ npositive = au_lkup_dentry(dentry, au_dbstart(parent), ++ /*type*/0, nd); ++ err = npositive; ++ } ++ di_read_unlock(parent, AuLock_IR); ++ ret = ERR_PTR(err); ++ if (unlikely(err < 0)) ++ goto out_unlock; ++ ++ if (npositive) { ++ inode = au_new_inode(dentry, /*must_new*/0); ++ ret = (void *)inode; ++ } ++ if (IS_ERR(inode)) { ++ inode = NULL; ++ goto out_unlock; ++ } ++ ++ ret = d_splice_alias(inode, dentry); ++ if (unlikely(IS_ERR(ret) && inode)) { ++ ii_write_unlock(inode); ++ lc_idx = AuLcNonDir_IIINFO; ++ if (S_ISLNK(inode->i_mode)) ++ lc_idx = AuLcSymlink_IIINFO; ++ else if (S_ISDIR(inode->i_mode)) ++ lc_idx = AuLcDir_IIINFO; ++ au_rw_class(&au_ii(inode)->ii_rwsem, au_lc_key + lc_idx); ++ iput(inode); ++ } ++ ++out_unlock: ++ 
di_write_unlock(dentry); ++ if (unlikely(IS_ERR(ret) && inode)) { ++ lc_idx = AuLcNonDir_DIINFO; ++ if (S_ISLNK(inode->i_mode)) ++ lc_idx = AuLcSymlink_DIINFO; ++ else if (S_ISDIR(inode->i_mode)) ++ lc_idx = AuLcDir_DIINFO; ++ au_rw_class(&au_di(dentry)->di_rwsem, au_lc_key + lc_idx); ++ } ++out_si: ++ si_read_unlock(sb); ++out: ++ return ret; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_wr_dir_cpup(struct dentry *dentry, struct dentry *parent, ++ const unsigned char add_entry, aufs_bindex_t bcpup, ++ aufs_bindex_t bstart) ++{ ++ int err; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ ++ if (add_entry) ++ IMustLock(parent->d_inode); ++ else ++ di_write_lock_parent(parent); ++ ++ err = 0; ++ if (!au_h_dptr(parent, bcpup)) { ++ if (bstart < bcpup) ++ err = au_cpdown_dirs(dentry, bcpup); ++ else ++ err = au_cpup_dirs(dentry, bcpup); ++ } ++ if (!err && add_entry) { ++ h_parent = au_h_dptr(parent, bcpup); ++ h_dir = h_parent->d_inode; ++ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); ++ err = au_lkup_neg(dentry, bcpup); ++ /* todo: no unlock here */ ++ mutex_unlock(&h_dir->i_mutex); ++ ++ AuDbg("bcpup %d\n", bcpup); ++ if (!err) { ++ if (!dentry->d_inode) ++ au_set_h_dptr(dentry, bstart, NULL); ++ au_update_dbrange(dentry, /*do_put_zero*/0); ++ } ++ } ++ ++ if (!add_entry) ++ di_write_unlock(parent); ++ if (!err) ++ err = bcpup; /* success */ ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ++ * decide the branch and the parent dir where we will create a new entry. ++ * returns new bindex or an error. ++ * copyup the parent dir if needed. ++ */ ++int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, ++ struct au_wr_dir_args *args) ++{ ++ int err; ++ aufs_bindex_t bcpup, bstart, src_bstart; ++ const unsigned char add_entry = !!au_ftest_wrdir(args->flags, ++ ADD_ENTRY); ++ struct super_block *sb; ++ struct dentry *parent; ++ struct au_sbinfo *sbinfo; ++ ++ sb = dentry->d_sb; ++ sbinfo = au_sbi(sb); ++ parent = dget_parent(dentry); ++ bstart = au_dbstart(dentry); ++ bcpup = bstart; ++ if (args->force_btgt < 0) { ++ if (src_dentry) { ++ src_bstart = au_dbstart(src_dentry); ++ if (src_bstart < bstart) ++ bcpup = src_bstart; ++ } else if (add_entry) { ++ err = AuWbrCreate(sbinfo, dentry, ++ au_ftest_wrdir(args->flags, ISDIR)); ++ bcpup = err; ++ } ++ ++ if (bcpup < 0 || au_test_ro(sb, bcpup, dentry->d_inode)) { ++ if (add_entry) ++ err = AuWbrCopyup(sbinfo, dentry); ++ else { ++ if (!IS_ROOT(dentry)) { ++ di_read_lock_parent(parent, !AuLock_IR); ++ err = AuWbrCopyup(sbinfo, dentry); ++ di_read_unlock(parent, !AuLock_IR); ++ } else ++ err = AuWbrCopyup(sbinfo, dentry); ++ } ++ bcpup = err; ++ if (unlikely(err < 0)) ++ goto out; ++ } ++ } else { ++ bcpup = args->force_btgt; ++ AuDebugOn(au_test_ro(sb, bcpup, dentry->d_inode)); ++ } ++ ++ AuDbg("bstart %d, bcpup %d\n", bstart, bcpup); ++ err = bcpup; ++ if (bcpup == bstart) ++ goto out; /* success */ ++ ++ /* copyup the new parent into the branch we process */ ++ err = au_wr_dir_cpup(dentry, parent, add_entry, bcpup, bstart); ++ if (err >= 0) { ++ if (!dentry->d_inode) { ++ au_set_h_dptr(dentry, bstart, NULL); ++ au_set_dbstart(dentry, bcpup); ++ au_set_dbend(dentry, bcpup); ++ } ++ AuDebugOn(add_entry && !au_h_dptr(dentry, bcpup)); ++ } ++ ++out: ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct dentry *au_pinned_h_parent(struct au_pin *pin) ++{ ++ if (pin && pin->parent) ++ return 
au_h_dptr(pin->parent, pin->bindex); ++ return NULL; ++} ++ ++void au_unpin(struct au_pin *p) ++{ ++ if (p->h_mnt && au_ftest_pin(p->flags, MNT_WRITE)) ++ mnt_drop_write(p->h_mnt); ++ if (!p->hdir) ++ return; ++ ++ au_hn_imtx_unlock(p->hdir); ++ if (!au_ftest_pin(p->flags, DI_LOCKED)) ++ di_read_unlock(p->parent, AuLock_IR); ++ iput(p->hdir->hi_inode); ++ dput(p->parent); ++ p->parent = NULL; ++ p->hdir = NULL; ++ p->h_mnt = NULL; ++} ++ ++int au_do_pin(struct au_pin *p) ++{ ++ int err; ++ struct super_block *sb; ++ struct dentry *h_dentry, *h_parent; ++ struct au_branch *br; ++ struct inode *h_dir; ++ ++ err = 0; ++ sb = p->dentry->d_sb; ++ br = au_sbr(sb, p->bindex); ++ if (IS_ROOT(p->dentry)) { ++ if (au_ftest_pin(p->flags, MNT_WRITE)) { ++ p->h_mnt = br->br_mnt; ++ err = mnt_want_write(p->h_mnt); ++ if (unlikely(err)) { ++ au_fclr_pin(p->flags, MNT_WRITE); ++ goto out_err; ++ } ++ } ++ goto out; ++ } ++ ++ h_dentry = NULL; ++ if (p->bindex <= au_dbend(p->dentry)) ++ h_dentry = au_h_dptr(p->dentry, p->bindex); ++ ++ p->parent = dget_parent(p->dentry); ++ if (!au_ftest_pin(p->flags, DI_LOCKED)) ++ di_read_lock(p->parent, AuLock_IR, p->lsc_di); ++ ++ h_dir = NULL; ++ h_parent = au_h_dptr(p->parent, p->bindex); ++ p->hdir = au_hi(p->parent->d_inode, p->bindex); ++ if (p->hdir) ++ h_dir = p->hdir->hi_inode; ++ ++ /* ++ * udba case, or ++ * if DI_LOCKED is not set, then p->parent may be different ++ * and h_parent can be NULL. ++ */ ++ if (unlikely(!p->hdir || !h_dir || !h_parent)) { ++ err = -EBUSY; ++ if (!au_ftest_pin(p->flags, DI_LOCKED)) ++ di_read_unlock(p->parent, AuLock_IR); ++ dput(p->parent); ++ p->parent = NULL; ++ goto out_err; ++ } ++ ++ au_igrab(h_dir); ++ au_hn_imtx_lock_nested(p->hdir, p->lsc_hi); ++ ++ if (unlikely(p->hdir->hi_inode != h_parent->d_inode)) { ++ err = -EBUSY; ++ goto out_unpin; ++ } ++ if (h_dentry) { ++ err = au_h_verify(h_dentry, p->udba, h_dir, h_parent, br); ++ if (unlikely(err)) { ++ au_fclr_pin(p->flags, MNT_WRITE); ++ goto out_unpin; ++ } ++ } ++ ++ if (au_ftest_pin(p->flags, MNT_WRITE)) { ++ p->h_mnt = br->br_mnt; ++ err = mnt_want_write(p->h_mnt); ++ if (unlikely(err)) { ++ au_fclr_pin(p->flags, MNT_WRITE); ++ goto out_unpin; ++ } ++ } ++ goto out; /* success */ ++ ++out_unpin: ++ au_unpin(p); ++out_err: ++ pr_err("err %d\n", err); ++ err = au_busy_or_stale(); ++out: ++ return err; ++} ++ ++void au_pin_init(struct au_pin *p, struct dentry *dentry, ++ aufs_bindex_t bindex, int lsc_di, int lsc_hi, ++ unsigned int udba, unsigned char flags) ++{ ++ p->dentry = dentry; ++ p->udba = udba; ++ p->lsc_di = lsc_di; ++ p->lsc_hi = lsc_hi; ++ p->flags = flags; ++ p->bindex = bindex; ++ ++ p->parent = NULL; ++ p->hdir = NULL; ++ p->h_mnt = NULL; ++} ++ ++int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int udba, unsigned char flags) ++{ ++ au_pin_init(pin, dentry, bindex, AuLsc_DI_PARENT, AuLsc_I_PARENT2, ++ udba, flags); ++ return au_do_pin(pin); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * ->setattr() and ->getattr() are called in various cases. ++ * chmod, stat: dentry is revalidated. ++ * fchmod, fstat: file and dentry are not revalidated, additionally they may be ++ * unhashed. ++ * for ->setattr(), ia->ia_file is passed from ftruncate only. 
++ */ ++/* todo: consolidate with do_refresh() and simple_reval_dpath() */ ++static int au_reval_for_attr(struct dentry *dentry, unsigned int sigen) ++{ ++ int err; ++ struct inode *inode; ++ struct dentry *parent; ++ ++ err = 0; ++ inode = dentry->d_inode; ++ if (au_digen_test(dentry, sigen)) { ++ parent = dget_parent(dentry); ++ di_read_lock_parent(parent, AuLock_IR); ++ err = au_refresh_dentry(dentry, parent); ++ di_read_unlock(parent, AuLock_IR); ++ dput(parent); ++ } ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++#define AuIcpup_DID_CPUP 1 ++#define au_ftest_icpup(flags, name) ((flags) & AuIcpup_##name) ++#define au_fset_icpup(flags, name) \ ++ do { (flags) |= AuIcpup_##name; } while (0) ++#define au_fclr_icpup(flags, name) \ ++ do { (flags) &= ~AuIcpup_##name; } while (0) ++ ++struct au_icpup_args { ++ unsigned char flags; ++ unsigned char pin_flags; ++ aufs_bindex_t btgt; ++ unsigned int udba; ++ struct au_pin pin; ++ struct path h_path; ++ struct inode *h_inode; ++}; ++ ++static int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia, ++ struct au_icpup_args *a) ++{ ++ int err; ++ loff_t sz; ++ aufs_bindex_t bstart, ibstart; ++ struct dentry *hi_wh, *parent; ++ struct inode *inode; ++ struct file *h_file; ++ struct au_wr_dir_args wr_dir_args = { ++ .force_btgt = -1, ++ .flags = 0 ++ }; ++ ++ bstart = au_dbstart(dentry); ++ inode = dentry->d_inode; ++ if (S_ISDIR(inode->i_mode)) ++ au_fset_wrdir(wr_dir_args.flags, ISDIR); ++ /* plink or hi_wh() case */ ++ ibstart = au_ibstart(inode); ++ if (bstart != ibstart && !au_test_ro(inode->i_sb, ibstart, inode)) ++ wr_dir_args.force_btgt = ibstart; ++ err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); ++ if (unlikely(err < 0)) ++ goto out; ++ a->btgt = err; ++ if (err != bstart) ++ au_fset_icpup(a->flags, DID_CPUP); ++ ++ err = 0; ++ a->pin_flags = AuPin_MNT_WRITE; ++ parent = NULL; ++ if (!IS_ROOT(dentry)) { ++ au_fset_pin(a->pin_flags, DI_LOCKED); ++ parent = dget_parent(dentry); ++ di_write_lock_parent(parent); ++ } ++ ++ err = au_pin(&a->pin, dentry, a->btgt, a->udba, a->pin_flags); ++ if (unlikely(err)) ++ goto out_parent; ++ ++ a->h_path.dentry = au_h_dptr(dentry, bstart); ++ a->h_inode = a->h_path.dentry->d_inode; ++ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); ++ sz = -1; ++ if ((ia->ia_valid & ATTR_SIZE) && ia->ia_size < i_size_read(a->h_inode)) ++ sz = ia->ia_size; ++ ++ h_file = NULL; ++ hi_wh = NULL; ++ if (au_ftest_icpup(a->flags, DID_CPUP) && d_unlinked(dentry)) { ++ hi_wh = au_hi_wh(inode, a->btgt); ++ if (!hi_wh) { ++ err = au_sio_cpup_wh(dentry, a->btgt, sz, /*file*/NULL); ++ if (unlikely(err)) ++ goto out_unlock; ++ hi_wh = au_hi_wh(inode, a->btgt); ++ /* todo: revalidate hi_wh? 
*/ ++ } ++ } ++ ++ if (parent) { ++ au_pin_set_parent_lflag(&a->pin, /*lflag*/0); ++ di_downgrade_lock(parent, AuLock_IR); ++ dput(parent); ++ parent = NULL; ++ } ++ if (!au_ftest_icpup(a->flags, DID_CPUP)) ++ goto out; /* success */ ++ ++ if (!d_unhashed(dentry)) { ++ h_file = au_h_open_pre(dentry, bstart); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ h_file = NULL; ++ } else ++ err = au_sio_cpup_simple(dentry, a->btgt, sz, ++ AuCpup_DTIME); ++ if (!err) ++ a->h_path.dentry = au_h_dptr(dentry, a->btgt); ++ } else if (!hi_wh) ++ a->h_path.dentry = au_h_dptr(dentry, a->btgt); ++ else ++ a->h_path.dentry = hi_wh; /* do not dget here */ ++ ++out_unlock: ++ mutex_unlock(&a->h_inode->i_mutex); ++ au_h_open_post(dentry, bstart, h_file); ++ a->h_inode = a->h_path.dentry->d_inode; ++ if (!err) { ++ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); ++ goto out; /* success */ ++ } ++ ++ au_unpin(&a->pin); ++out_parent: ++ if (parent) { ++ di_write_unlock(parent); ++ dput(parent); ++ } ++out: ++ return err; ++} ++ ++static int aufs_setattr(struct dentry *dentry, struct iattr *ia) ++{ ++ int err; ++ struct inode *inode; ++ struct super_block *sb; ++ struct file *file; ++ struct au_icpup_args *a; ++ ++ inode = dentry->d_inode; ++ IMustLock(inode); ++ ++ err = -ENOMEM; ++ a = kzalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) ++ ia->ia_valid &= ~ATTR_MODE; ++ ++ file = NULL; ++ sb = dentry->d_sb; ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out_kfree; ++ ++ if (ia->ia_valid & ATTR_FILE) { ++ /* currently ftruncate(2) only */ ++ AuDebugOn(!S_ISREG(inode->i_mode)); ++ file = ia->ia_file; ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); ++ if (unlikely(err)) ++ goto out_si; ++ ia->ia_file = au_hf_top(file); ++ a->udba = AuOpt_UDBA_NONE; ++ } else { ++ /* fchmod() doesn't pass ia_file */ ++ a->udba = au_opt_udba(sb); ++ di_write_lock_child(dentry); ++ /* no d_unlinked(), to set UDBA_NONE for root */ ++ if (d_unhashed(dentry)) ++ a->udba = AuOpt_UDBA_NONE; ++ if (a->udba != AuOpt_UDBA_NONE) { ++ AuDebugOn(IS_ROOT(dentry)); ++ err = au_reval_for_attr(dentry, au_sigen(sb)); ++ if (unlikely(err)) ++ goto out_dentry; ++ } ++ } ++ ++ err = au_pin_and_icpup(dentry, ia, a); ++ if (unlikely(err < 0)) ++ goto out_dentry; ++ if (au_ftest_icpup(a->flags, DID_CPUP)) { ++ ia->ia_file = NULL; ++ ia->ia_valid &= ~ATTR_FILE; ++ } ++ ++ a->h_path.mnt = au_sbr_mnt(sb, a->btgt); ++ if ((ia->ia_valid & (ATTR_MODE | ATTR_CTIME)) ++ == (ATTR_MODE | ATTR_CTIME)) { ++ err = security_path_chmod(&a->h_path, ia->ia_mode); ++ if (unlikely(err)) ++ goto out_unlock; ++ } else if ((ia->ia_valid & (ATTR_UID | ATTR_GID)) ++ && (ia->ia_valid & ATTR_CTIME)) { ++ err = security_path_chown(&a->h_path, ia->ia_uid, ia->ia_gid); ++ if (unlikely(err)) ++ goto out_unlock; ++ } ++ ++ if (ia->ia_valid & ATTR_SIZE) { ++ struct file *f; ++ ++ if (ia->ia_size < i_size_read(inode)) ++ /* unmap only */ ++ truncate_setsize(inode, ia->ia_size); ++ ++ f = NULL; ++ if (ia->ia_valid & ATTR_FILE) ++ f = ia->ia_file; ++ mutex_unlock(&a->h_inode->i_mutex); ++ err = vfsub_trunc(&a->h_path, ia->ia_size, ia->ia_valid, f); ++ mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); ++ } else ++ err = vfsub_notify_change(&a->h_path, ia); ++ if (!err) ++ au_cpup_attr_changeable(inode); ++ ++out_unlock: ++ mutex_unlock(&a->h_inode->i_mutex); ++ au_unpin(&a->pin); ++ if (unlikely(err)) ++ au_update_dbstart(dentry); 
++out_dentry: ++ di_write_unlock(dentry); ++ if (file) { ++ fi_write_unlock(file); ++ ia->ia_file = file; ++ ia->ia_valid |= ATTR_FILE; ++ } ++out_si: ++ si_read_unlock(sb); ++out_kfree: ++ kfree(a); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static void au_refresh_iattr(struct inode *inode, struct kstat *st, ++ unsigned int nlink) ++{ ++ unsigned int n; ++ ++ inode->i_mode = st->mode; ++ inode->i_uid = st->uid; ++ inode->i_gid = st->gid; ++ inode->i_atime = st->atime; ++ inode->i_mtime = st->mtime; ++ inode->i_ctime = st->ctime; ++ ++ au_cpup_attr_nlink(inode, /*force*/0); ++ if (S_ISDIR(inode->i_mode)) { ++ n = inode->i_nlink; ++ n -= nlink; ++ n += st->nlink; ++ /* 0 can happen */ ++ vfsub_set_nlink(inode, n); ++ } ++ ++ spin_lock(&inode->i_lock); ++ inode->i_blocks = st->blocks; ++ i_size_write(inode, st->size); ++ spin_unlock(&inode->i_lock); ++} ++ ++static int aufs_getattr(struct vfsmount *mnt __maybe_unused, ++ struct dentry *dentry, struct kstat *st) ++{ ++ int err; ++ unsigned int mnt_flags; ++ aufs_bindex_t bindex; ++ unsigned char udba_none, positive; ++ struct super_block *sb, *h_sb; ++ struct inode *inode; ++ struct vfsmount *h_mnt; ++ struct dentry *h_dentry; ++ ++ sb = dentry->d_sb; ++ inode = dentry->d_inode; ++ err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out; ++ mnt_flags = au_mntflags(sb); ++ udba_none = !!au_opt_test(mnt_flags, UDBA_NONE); ++ ++ /* support fstat(2) */ ++ if (!d_unlinked(dentry) && !udba_none) { ++ unsigned int sigen = au_sigen(sb); ++ err = au_digen_test(dentry, sigen); ++ if (!err) { ++ di_read_lock_child(dentry, AuLock_IR); ++ err = au_dbrange_test(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ } else { ++ AuDebugOn(IS_ROOT(dentry)); ++ di_write_lock_child(dentry); ++ err = au_dbrange_test(dentry); ++ if (!err) ++ err = au_reval_for_attr(dentry, sigen); ++ di_downgrade_lock(dentry, AuLock_IR); ++ if (unlikely(err)) ++ goto out_unlock; ++ } ++ } else ++ di_read_lock_child(dentry, AuLock_IR); ++ ++ bindex = au_ibstart(inode); ++ h_mnt = au_sbr_mnt(sb, bindex); ++ h_sb = h_mnt->mnt_sb; ++ if (!au_test_fs_bad_iattr(h_sb) && udba_none) ++ goto out_fill; /* success */ ++ ++ h_dentry = NULL; ++ if (au_dbstart(dentry) == bindex) ++ h_dentry = dget(au_h_dptr(dentry, bindex)); ++ else if (au_opt_test(mnt_flags, PLINK) && au_plink_test(inode)) { ++ h_dentry = au_plink_lkup(inode, bindex); ++ if (IS_ERR(h_dentry)) ++ goto out_fill; /* pretending success */ ++ } ++ /* illegally overlapped or something */ ++ if (unlikely(!h_dentry)) ++ goto out_fill; /* pretending success */ ++ ++ positive = !!h_dentry->d_inode; ++ if (positive) ++ err = vfs_getattr(h_mnt, h_dentry, st); ++ dput(h_dentry); ++ if (!err) { ++ if (positive) ++ au_refresh_iattr(inode, st, h_dentry->d_inode->i_nlink); ++ goto out_fill; /* success */ ++ } ++ AuTraceErr(err); ++ goto out_unlock; ++ ++out_fill: ++ generic_fillattr(inode, st); ++out_unlock: ++ di_read_unlock(dentry, AuLock_IR); ++ si_read_unlock(sb); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int h_readlink(struct dentry *dentry, int bindex, char __user *buf, ++ int bufsiz) ++{ ++ int err; ++ struct super_block *sb; ++ struct dentry *h_dentry; ++ ++ err = -EINVAL; ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (unlikely(!h_dentry->d_inode->i_op->readlink)) ++ goto out; ++ ++ err = security_inode_readlink(h_dentry); ++ if (unlikely(err)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ if 
(!au_test_ro(sb, bindex, dentry->d_inode)) { ++ vfsub_touch_atime(au_sbr_mnt(sb, bindex), h_dentry); ++ fsstack_copy_attr_atime(dentry->d_inode, h_dentry->d_inode); ++ } ++ err = h_dentry->d_inode->i_op->readlink(h_dentry, buf, bufsiz); ++ ++out: ++ return err; ++} ++ ++static int aufs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) ++{ ++ int err; ++ ++ err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN); ++ if (unlikely(err)) ++ goto out; ++ err = au_d_hashed_positive(dentry); ++ if (!err) ++ err = h_readlink(dentry, au_dbstart(dentry), buf, bufsiz); ++ aufs_read_unlock(dentry, AuLock_IR); ++ ++out: ++ return err; ++} ++ ++static void *aufs_follow_link(struct dentry *dentry, struct nameidata *nd) ++{ ++ int err; ++ mm_segment_t old_fs; ++ union { ++ char *k; ++ char __user *u; ++ } buf; ++ ++ err = -ENOMEM; ++ buf.k = __getname_gfp(GFP_NOFS); ++ if (unlikely(!buf.k)) ++ goto out; ++ ++ err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN); ++ if (unlikely(err)) ++ goto out_name; ++ ++ err = au_d_hashed_positive(dentry); ++ if (!err) { ++ old_fs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = h_readlink(dentry, au_dbstart(dentry), buf.u, PATH_MAX); ++ set_fs(old_fs); ++ } ++ aufs_read_unlock(dentry, AuLock_IR); ++ ++ if (err >= 0) { ++ buf.k[err] = 0; ++ /* will be freed by put_link */ ++ nd_set_link(nd, buf.k); ++ return NULL; /* success */ ++ } ++ ++out_name: ++ __putname(buf.k); ++out: ++ path_put(&nd->path); ++ AuTraceErr(err); ++ return ERR_PTR(err); ++} ++ ++static void aufs_put_link(struct dentry *dentry __maybe_unused, ++ struct nameidata *nd, void *cookie __maybe_unused) ++{ ++ __putname(nd_get_link(nd)); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void aufs_truncate_range(struct inode *inode __maybe_unused, ++ loff_t start __maybe_unused, ++ loff_t end __maybe_unused) ++{ ++ AuUnsupport(); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct inode_operations aufs_symlink_iop = { ++ .permission = aufs_permission, ++ .setattr = aufs_setattr, ++ .getattr = aufs_getattr, ++ .readlink = aufs_readlink, ++ .follow_link = aufs_follow_link, ++ .put_link = aufs_put_link ++}; ++ ++struct inode_operations aufs_dir_iop = { ++ .create = aufs_create, ++ .lookup = aufs_lookup, ++ .link = aufs_link, ++ .unlink = aufs_unlink, ++ .symlink = aufs_symlink, ++ .mkdir = aufs_mkdir, ++ .rmdir = aufs_rmdir, ++ .mknod = aufs_mknod, ++ .rename = aufs_rename, ++ ++ .permission = aufs_permission, ++ .setattr = aufs_setattr, ++ .getattr = aufs_getattr ++}; ++ ++struct inode_operations aufs_iop = { ++ .permission = aufs_permission, ++ .setattr = aufs_setattr, ++ .getattr = aufs_getattr, ++ .truncate_range = aufs_truncate_range ++}; +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/i_op_del.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,478 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operations (del entry) ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * decide if a new whiteout for @dentry is necessary or not. ++ * when it is necessary, prepare the parent dir for the upper branch whose ++ * branch index is @bcpup for creation. the actual creation of the whiteout will ++ * be done by the caller. ++ * return value: ++ * 0: wh is unnecessary ++ * plus: wh is necessary ++ * minus: error ++ */ ++int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup) ++{ ++ int need_wh, err; ++ aufs_bindex_t bstart; ++ struct super_block *sb; ++ ++ sb = dentry->d_sb; ++ bstart = au_dbstart(dentry); ++ if (*bcpup < 0) { ++ *bcpup = bstart; ++ if (au_test_ro(sb, bstart, dentry->d_inode)) { ++ err = AuWbrCopyup(au_sbi(sb), dentry); ++ *bcpup = err; ++ if (unlikely(err < 0)) ++ goto out; ++ } ++ } else ++ AuDebugOn(bstart < *bcpup ++ || au_test_ro(sb, *bcpup, dentry->d_inode)); ++ AuDbg("bcpup %d, bstart %d\n", *bcpup, bstart); ++ ++ if (*bcpup != bstart) { ++ err = au_cpup_dirs(dentry, *bcpup); ++ if (unlikely(err)) ++ goto out; ++ need_wh = 1; ++ } else { ++ struct au_dinfo *dinfo, *tmp; ++ ++ need_wh = -ENOMEM; ++ dinfo = au_di(dentry); ++ tmp = au_di_alloc(sb, AuLsc_DI_TMP); ++ if (tmp) { ++ au_di_cp(tmp, dinfo); ++ au_di_swap(tmp, dinfo); ++ /* returns the number of positive dentries */ ++ need_wh = au_lkup_dentry(dentry, bstart + 1, /*type*/0, ++ /*nd*/NULL); ++ au_di_swap(tmp, dinfo); ++ au_rw_write_unlock(&tmp->di_rwsem); ++ au_di_free(tmp); ++ } ++ } ++ AuDbg("need_wh %d\n", need_wh); ++ err = need_wh; ++ ++out: ++ return err; ++} ++ ++/* ++ * simple tests for the del-entry operations. ++ * following the checks in vfs, plus the parent-child relationship. ++ */ ++int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent, int isdir) ++{ ++ int err; ++ umode_t h_mode; ++ struct dentry *h_dentry, *h_latest; ++ struct inode *h_inode; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ h_inode = h_dentry->d_inode; ++ if (dentry->d_inode) { ++ err = -ENOENT; ++ if (unlikely(!h_inode || !h_inode->i_nlink)) ++ goto out; ++ ++ h_mode = h_inode->i_mode; ++ if (!isdir) { ++ err = -EISDIR; ++ if (unlikely(S_ISDIR(h_mode))) ++ goto out; ++ } else if (unlikely(!S_ISDIR(h_mode))) { ++ err = -ENOTDIR; ++ goto out; ++ } ++ } else { ++ /* rename(2) case */ ++ err = -EIO; ++ if (unlikely(h_inode)) ++ goto out; ++ } ++ ++ err = -ENOENT; ++ /* expected parent dir is locked */ ++ if (unlikely(h_parent != h_dentry->d_parent)) ++ goto out; ++ err = 0; ++ ++ /* ++ * rmdir a dir may break the consistency on some filesystems. ++ * let's try a heavy test. ++ */ ++ err = -EACCES; ++ if (unlikely(au_test_h_perm(h_parent->d_inode, MAY_EXEC | MAY_WRITE))) ++ goto out; ++ ++ h_latest = au_sio_lkup_one(&dentry->d_name, h_parent, ++ au_sbr(dentry->d_sb, bindex)); ++ err = -EIO; ++ if (IS_ERR(h_latest)) ++ goto out; ++ if (h_latest == h_dentry) ++ err = 0; ++ dput(h_latest); ++ ++out: ++ return err; ++} ++ ++/* ++ * decide the branch where we operate for @dentry. the branch index will be set ++ * to @rbcpup. after deciding it, 'pin' it and store the timestamps of the parent ++ * dir for reverting. ++ * when a new whiteout is necessary, create it.
++ */ ++static struct dentry* ++lock_hdir_create_wh(struct dentry *dentry, int isdir, aufs_bindex_t *rbcpup, ++ struct au_dtime *dt, struct au_pin *pin) ++{ ++ struct dentry *wh_dentry; ++ struct super_block *sb; ++ struct path h_path; ++ int err, need_wh; ++ unsigned int udba; ++ aufs_bindex_t bcpup; ++ ++ need_wh = au_wr_dir_need_wh(dentry, isdir, rbcpup); ++ wh_dentry = ERR_PTR(need_wh); ++ if (unlikely(need_wh < 0)) ++ goto out; ++ ++ sb = dentry->d_sb; ++ udba = au_opt_udba(sb); ++ bcpup = *rbcpup; ++ err = au_pin(pin, dentry, bcpup, udba, ++ AuPin_DI_LOCKED | AuPin_MNT_WRITE); ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out; ++ ++ h_path.dentry = au_pinned_h_parent(pin); ++ if (udba != AuOpt_UDBA_NONE ++ && au_dbstart(dentry) == bcpup) { ++ err = au_may_del(dentry, bcpup, h_path.dentry, isdir); ++ wh_dentry = ERR_PTR(err); ++ if (unlikely(err)) ++ goto out_unpin; ++ } ++ ++ h_path.mnt = au_sbr_mnt(sb, bcpup); ++ au_dtime_store(dt, au_pinned_parent(pin), &h_path); ++ wh_dentry = NULL; ++ if (!need_wh) ++ goto out; /* success, no need to create whiteout */ ++ ++ wh_dentry = au_wh_create(dentry, bcpup, h_path.dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_unpin; ++ ++ /* returns with the parent locked and wh_dentry dget-ed */ ++ goto out; /* success */ ++ ++out_unpin: ++ au_unpin(pin); ++out: ++ return wh_dentry; ++} ++ ++/* ++ * when removing a dir, rename it to a unique temporary whiteout-ed name first ++ * in order to be revertible and save time for removing many child whiteouts ++ * under the dir. ++ * returns 1 when there are too many child whiteouts and the caller should remove ++ * them asynchronously. returns 0 when the number of children is small enough to ++ * remove now or the branch fs is a remote fs. ++ * otherwise returns an error. ++ */ ++static int renwh_and_rmdir(struct dentry *dentry, aufs_bindex_t bindex, ++ struct au_nhash *whlist, struct inode *dir) ++{ ++ int rmdir_later, err, dirwh; ++ struct dentry *h_dentry; ++ struct super_block *sb; ++ ++ sb = dentry->d_sb; ++ SiMustAnyLock(sb); ++ h_dentry = au_h_dptr(dentry, bindex); ++ err = au_whtmp_ren(h_dentry, au_sbr(sb, bindex)); ++ if (unlikely(err)) ++ goto out; ++ ++ /* stop monitoring */ ++ au_hn_free(au_hi(dentry->d_inode, bindex)); ++ ++ if (!au_test_fs_remote(h_dentry->d_sb)) { ++ dirwh = au_sbi(sb)->si_dirwh; ++ rmdir_later = (dirwh <= 1); ++ if (!rmdir_later) ++ rmdir_later = au_nhash_test_longer_wh(whlist, bindex, ++ dirwh); ++ if (rmdir_later) ++ return rmdir_later; ++ } ++ ++ err = au_whtmp_rmdir(dir, bindex, h_dentry, whlist); ++ if (unlikely(err)) { ++ AuIOErr("rmdir %.*s, b%d failed, %d. ignored\n", ++ AuDLNPair(h_dentry), bindex, err); ++ err = 0; ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ++ * final procedure for deleting an entry. ++ * maintain dentry and iattr. ++ */ ++static void epilog(struct inode *dir, struct dentry *dentry, ++ aufs_bindex_t bindex) ++{ ++ struct inode *inode; ++ ++ inode = dentry->d_inode; ++ d_drop(dentry); ++ inode->i_ctime = dir->i_ctime; ++ ++ if (au_ibstart(dir) == bindex) ++ au_cpup_attr_timesizes(dir); ++ dir->i_version++; ++} ++ ++/* ++ * when an error happened, remove the created whiteout and revert everything.
++ */ ++static int do_revert(int err, struct inode *dir, aufs_bindex_t bindex, ++ aufs_bindex_t bwh, struct dentry *wh_dentry, ++ struct dentry *dentry, struct au_dtime *dt) ++{ ++ int rerr; ++ struct path h_path = { ++ .dentry = wh_dentry, ++ .mnt = au_sbr_mnt(dir->i_sb, bindex) ++ }; ++ ++ rerr = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, dentry); ++ if (!rerr) { ++ au_set_dbwh(dentry, bwh); ++ au_dtime_revert(dt); ++ return 0; ++ } ++ ++ AuIOErr("%.*s reverting whiteout failed(%d, %d)\n", ++ AuDLNPair(dentry), err, rerr); ++ return -EIO; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int aufs_unlink(struct inode *dir, struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bwh, bindex, bstart; ++ struct au_dtime dt; ++ struct au_pin pin; ++ struct path h_path; ++ struct inode *inode, *h_dir; ++ struct dentry *parent, *wh_dentry; ++ ++ IMustLock(dir); ++ ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); ++ if (unlikely(err)) ++ goto out; ++ err = au_d_hashed_positive(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ inode = dentry->d_inode; ++ IMustLock(inode); ++ err = -EISDIR; ++ if (unlikely(S_ISDIR(inode->i_mode))) ++ goto out_unlock; /* possible? */ ++ ++ bstart = au_dbstart(dentry); ++ bwh = au_dbwh(dentry); ++ bindex = -1; ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_write_lock_parent(parent); ++ wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/0, &bindex, &dt, &pin); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart); ++ h_path.dentry = au_h_dptr(dentry, bstart); ++ dget(h_path.dentry); ++ if (bindex == bstart) { ++ h_dir = au_pinned_h_dir(&pin); ++ err = vfsub_unlink(h_dir, &h_path, /*force*/0); ++ } else { ++ /* dir inode is locked */ ++ h_dir = wh_dentry->d_parent->d_inode; ++ IMustLock(h_dir); ++ err = 0; ++ } ++ ++ if (!err) { ++ vfsub_drop_nlink(inode); ++ epilog(dir, dentry, bindex); ++ ++ /* update target timestamps */ ++ if (bindex == bstart) { ++ vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ ++ inode->i_ctime = h_path.dentry->d_inode->i_ctime; ++ } else ++ /* todo: this timestamp may be reverted later */ ++ inode->i_ctime = h_dir->i_ctime; ++ goto out_unpin; /* success */ ++ } ++ ++ /* revert */ ++ if (wh_dentry) { ++ int rerr; ++ ++ rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry, &dt); ++ if (rerr) ++ err = rerr; ++ } ++ ++out_unpin: ++ au_unpin(&pin); ++ dput(wh_dentry); ++ dput(h_path.dentry); ++out_parent: ++ di_write_unlock(parent); ++out_unlock: ++ aufs_read_unlock(dentry, AuLock_DW); ++out: ++ return err; ++} ++ ++int aufs_rmdir(struct inode *dir, struct dentry *dentry) ++{ ++ int err, rmdir_later; ++ aufs_bindex_t bwh, bindex, bstart; ++ struct au_dtime dt; ++ struct au_pin pin; ++ struct inode *inode; ++ struct dentry *parent, *wh_dentry, *h_dentry; ++ struct au_whtmp_rmdir *args; ++ ++ IMustLock(dir); ++ ++ err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN); ++ if (unlikely(err)) ++ goto out; ++ err = au_alive_dir(dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ inode = dentry->d_inode; ++ IMustLock(inode); ++ err = -ENOTDIR; ++ if (unlikely(!S_ISDIR(inode->i_mode))) ++ goto out_unlock; /* possible? 
*/ ++ ++ err = -ENOMEM; ++ args = au_whtmp_rmdir_alloc(dir->i_sb, GFP_NOFS); ++ if (unlikely(!args)) ++ goto out_unlock; ++ ++ parent = dentry->d_parent; /* dir inode is locked */ ++ di_write_lock_parent(parent); ++ err = au_test_empty(dentry, &args->whlist); ++ if (unlikely(err)) ++ goto out_parent; ++ ++ bstart = au_dbstart(dentry); ++ bwh = au_dbwh(dentry); ++ bindex = -1; ++ wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/1, &bindex, &dt, &pin); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out_parent; ++ ++ h_dentry = au_h_dptr(dentry, bstart); ++ dget(h_dentry); ++ rmdir_later = 0; ++ if (bindex == bstart) { ++ err = renwh_and_rmdir(dentry, bstart, &args->whlist, dir); ++ if (err > 0) { ++ rmdir_later = err; ++ err = 0; ++ } ++ } else { ++ /* stop monitoring */ ++ au_hn_free(au_hi(inode, bstart)); ++ ++ /* dir inode is locked */ ++ IMustLock(wh_dentry->d_parent->d_inode); ++ err = 0; ++ } ++ ++ if (!err) { ++ vfsub_dead_dir(inode); ++ au_set_dbdiropq(dentry, -1); ++ epilog(dir, dentry, bindex); ++ ++ if (rmdir_later) { ++ au_whtmp_kick_rmdir(dir, bstart, h_dentry, args); ++ args = NULL; ++ } ++ ++ goto out_unpin; /* success */ ++ } ++ ++ /* revert */ ++ AuLabel(revert); ++ if (wh_dentry) { ++ int rerr; ++ ++ rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry, &dt); ++ if (rerr) ++ err = rerr; ++ } ++ ++out_unpin: ++ au_unpin(&pin); ++ dput(wh_dentry); ++ dput(h_dentry); ++out_parent: ++ di_write_unlock(parent); ++ if (args) ++ au_whtmp_rmdir_free(args); ++out_unlock: ++ aufs_read_unlock(dentry, AuLock_DW); ++out: ++ AuTraceErr(err); ++ return err; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/i_op_ren.c 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,1026 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * inode operation (rename entry) ++ * todo: this is a crazy monster ++ */ ++ ++#include "aufs.h" ++ ++enum { AuSRC, AuDST, AuSrcDst }; ++enum { AuPARENT, AuCHILD, AuParentChild }; ++ ++#define AuRen_ISDIR 1 ++#define AuRen_ISSAMEDIR (1 << 1) ++#define AuRen_WHSRC (1 << 2) ++#define AuRen_WHDST (1 << 3) ++#define AuRen_MNT_WRITE (1 << 4) ++#define AuRen_DT_DSTDIR (1 << 5) ++#define AuRen_DIROPQ (1 << 6) ++#define AuRen_CPUP (1 << 7) ++#define au_ftest_ren(flags, name) ((flags) & AuRen_##name) ++#define au_fset_ren(flags, name) \ ++ do { (flags) |= AuRen_##name; } while (0) ++#define au_fclr_ren(flags, name) \ ++ do { (flags) &= ~AuRen_##name; } while (0) ++ ++struct au_ren_args { ++ struct { ++ struct dentry *dentry, *h_dentry, *parent, *h_parent, ++ *wh_dentry; ++ struct inode *dir, *inode; ++ struct au_hinode *hdir; ++ struct au_dtime dt[AuParentChild]; ++ aufs_bindex_t bstart; ++ } sd[AuSrcDst]; ++ ++#define src_dentry sd[AuSRC].dentry ++#define src_dir sd[AuSRC].dir ++#define src_inode sd[AuSRC].inode ++#define src_h_dentry sd[AuSRC].h_dentry ++#define src_parent sd[AuSRC].parent ++#define src_h_parent sd[AuSRC].h_parent ++#define src_wh_dentry sd[AuSRC].wh_dentry ++#define src_hdir sd[AuSRC].hdir ++#define src_h_dir sd[AuSRC].hdir->hi_inode ++#define src_dt sd[AuSRC].dt ++#define src_bstart sd[AuSRC].bstart ++ ++#define dst_dentry sd[AuDST].dentry ++#define dst_dir sd[AuDST].dir ++#define dst_inode sd[AuDST].inode ++#define dst_h_dentry sd[AuDST].h_dentry ++#define dst_parent sd[AuDST].parent ++#define dst_h_parent sd[AuDST].h_parent ++#define dst_wh_dentry sd[AuDST].wh_dentry ++#define dst_hdir sd[AuDST].hdir ++#define dst_h_dir sd[AuDST].hdir->hi_inode ++#define dst_dt sd[AuDST].dt ++#define dst_bstart sd[AuDST].bstart ++ ++ struct dentry *h_trap; ++ struct au_branch *br; ++ struct au_hinode *src_hinode; ++ struct path h_path; ++ struct au_nhash whlist; ++ aufs_bindex_t btgt, src_bwh, src_bdiropq; ++ ++ unsigned int flags; ++ ++ struct au_whtmp_rmdir *thargs; ++ struct dentry *h_dst; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * functions for reverting. ++ * when an error happened in a single rename system call, we should revert ++ * everything as if nothing happened. ++ * we don't need to revert the copied-up/down parent dirs since they are ++ * harmless. ++ */ ++ ++#define RevertFailure(fmt, ...)
do { \ ++ AuIOErr("revert failure: " fmt " (%d, %d)\n", \ ++ ##__VA_ARGS__, err, rerr); \ ++ err = -EIO; \ ++} while (0) ++ ++static void au_ren_rev_diropq(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); ++ rerr = au_diropq_remove(a->src_dentry, a->btgt); ++ au_hn_imtx_unlock(a->src_hinode); ++ au_set_dbdiropq(a->src_dentry, a->src_bdiropq); ++ if (rerr) ++ RevertFailure("remove diropq %.*s", AuDLNPair(a->src_dentry)); ++} ++ ++static void au_ren_rev_rename(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ a->h_path.dentry = au_lkup_one(&a->src_dentry->d_name, a->src_h_parent, ++ a->br, /*nd*/NULL); ++ rerr = PTR_ERR(a->h_path.dentry); ++ if (IS_ERR(a->h_path.dentry)) { ++ RevertFailure("au_lkup_one %.*s", AuDLNPair(a->src_dentry)); ++ return; ++ } ++ ++ rerr = vfsub_rename(a->dst_h_dir, ++ au_h_dptr(a->src_dentry, a->btgt), ++ a->src_h_dir, &a->h_path); ++ d_drop(a->h_path.dentry); ++ dput(a->h_path.dentry); ++ /* au_set_h_dptr(a->src_dentry, a->btgt, NULL); */ ++ if (rerr) ++ RevertFailure("rename %.*s", AuDLNPair(a->src_dentry)); ++} ++ ++static void au_ren_rev_cpup(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ a->h_path.dentry = a->dst_h_dentry; ++ rerr = vfsub_unlink(a->dst_h_dir, &a->h_path, /*force*/0); ++ au_set_h_dptr(a->src_dentry, a->btgt, NULL); ++ au_set_dbstart(a->src_dentry, a->src_bstart); ++ if (rerr) ++ RevertFailure("unlink %.*s", AuDLNPair(a->dst_h_dentry)); ++} ++ ++static void au_ren_rev_whtmp(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ a->h_path.dentry = au_lkup_one(&a->dst_dentry->d_name, a->dst_h_parent, ++ a->br, /*nd*/NULL); ++ rerr = PTR_ERR(a->h_path.dentry); ++ if (IS_ERR(a->h_path.dentry)) { ++ RevertFailure("lookup %.*s", AuDLNPair(a->dst_dentry)); ++ return; ++ } ++ if (a->h_path.dentry->d_inode) { ++ d_drop(a->h_path.dentry); ++ dput(a->h_path.dentry); ++ return; ++ } ++ ++ rerr = vfsub_rename(a->dst_h_dir, a->h_dst, a->dst_h_dir, &a->h_path); ++ d_drop(a->h_path.dentry); ++ dput(a->h_path.dentry); ++ if (!rerr) ++ au_set_h_dptr(a->dst_dentry, a->btgt, dget(a->h_dst)); ++ else ++ RevertFailure("rename %.*s", AuDLNPair(a->h_dst)); ++} ++ ++static void au_ren_rev_whsrc(int err, struct au_ren_args *a) ++{ ++ int rerr; ++ ++ a->h_path.dentry = a->src_wh_dentry; ++ rerr = au_wh_unlink_dentry(a->src_h_dir, &a->h_path, a->src_dentry); ++ au_set_dbwh(a->src_dentry, a->src_bwh); ++ if (rerr) ++ RevertFailure("unlink %.*s", AuDLNPair(a->src_wh_dentry)); ++} ++#undef RevertFailure ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * when we have to copyup the renaming entry, do it with the rename-target name ++ * in order to minimize the cost (the later actual rename is unnecessary). ++ * otherwise rename it on the target branch. 
++ */ ++static int au_ren_or_cpup(struct au_ren_args *a) ++{ ++ int err; ++ struct dentry *d; ++ ++ d = a->src_dentry; ++ if (au_dbstart(d) == a->btgt) { ++ a->h_path.dentry = a->dst_h_dentry; ++ if (au_ftest_ren(a->flags, DIROPQ) ++ && au_dbdiropq(d) == a->btgt) ++ au_fclr_ren(a->flags, DIROPQ); ++ AuDebugOn(au_dbstart(d) != a->btgt); ++ err = vfsub_rename(a->src_h_dir, au_h_dptr(d, a->btgt), ++ a->dst_h_dir, &a->h_path); ++ } else { ++ struct mutex *h_mtx = &a->src_h_dentry->d_inode->i_mutex; ++ struct file *h_file; ++ ++ au_fset_ren(a->flags, CPUP); ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ au_set_dbstart(d, a->btgt); ++ au_set_h_dptr(d, a->btgt, dget(a->dst_h_dentry)); ++ h_file = au_h_open_pre(d, a->src_bstart); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ h_file = NULL; ++ } else ++ err = au_sio_cpup_single(d, a->btgt, a->src_bstart, -1, ++ !AuCpup_DTIME, a->dst_parent); ++ mutex_unlock(h_mtx); ++ au_h_open_post(d, a->src_bstart, h_file); ++ if (!err) { ++ d = a->dst_dentry; ++ au_set_h_dptr(d, a->btgt, NULL); ++ au_update_dbstart(d); ++ } else { ++ au_set_h_dptr(d, a->btgt, NULL); ++ au_set_dbstart(d, a->src_bstart); ++ } ++ } ++ if (!err && a->h_dst) ++ /* it will be set to dinfo later */ ++ dget(a->h_dst); ++ ++ return err; ++} ++ ++/* cf. aufs_rmdir() */ ++static int au_ren_del_whtmp(struct au_ren_args *a) ++{ ++ int err; ++ struct inode *dir; ++ ++ dir = a->dst_dir; ++ SiMustAnyLock(dir->i_sb); ++ if (!au_nhash_test_longer_wh(&a->whlist, a->btgt, ++ au_sbi(dir->i_sb)->si_dirwh) ++ || au_test_fs_remote(a->h_dst->d_sb)) { ++ err = au_whtmp_rmdir(dir, a->btgt, a->h_dst, &a->whlist); ++ if (unlikely(err)) ++ pr_warning("failed removing whtmp dir %.*s (%d), " ++ "ignored.\n", AuDLNPair(a->h_dst), err); ++ } else { ++ au_nhash_wh_free(&a->thargs->whlist); ++ a->thargs->whlist = a->whlist; ++ a->whlist.nh_num = 0; ++ au_whtmp_kick_rmdir(dir, a->btgt, a->h_dst, a->thargs); ++ dput(a->h_dst); ++ a->thargs = NULL; ++ } ++ ++ return 0; ++} ++ ++/* make it 'opaque' dir. 
*/ ++static int au_ren_diropq(struct au_ren_args *a) ++{ ++ int err; ++ struct dentry *diropq; ++ ++ err = 0; ++ a->src_bdiropq = au_dbdiropq(a->src_dentry); ++ a->src_hinode = au_hi(a->src_inode, a->btgt); ++ au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); ++ diropq = au_diropq_create(a->src_dentry, a->btgt); ++ au_hn_imtx_unlock(a->src_hinode); ++ if (IS_ERR(diropq)) ++ err = PTR_ERR(diropq); ++ dput(diropq); ++ ++ return err; ++} ++ ++static int do_rename(struct au_ren_args *a) ++{ ++ int err; ++ struct dentry *d, *h_d; ++ ++ /* prepare workqueue args for asynchronous rmdir */ ++ h_d = a->dst_h_dentry; ++ if (au_ftest_ren(a->flags, ISDIR) && h_d->d_inode) { ++ err = -ENOMEM; ++ a->thargs = au_whtmp_rmdir_alloc(a->src_dentry->d_sb, GFP_NOFS); ++ if (unlikely(!a->thargs)) ++ goto out; ++ a->h_dst = dget(h_d); ++ } ++ ++ /* create whiteout for src_dentry */ ++ if (au_ftest_ren(a->flags, WHSRC)) { ++ a->src_bwh = au_dbwh(a->src_dentry); ++ AuDebugOn(a->src_bwh >= 0); ++ a->src_wh_dentry ++ = au_wh_create(a->src_dentry, a->btgt, a->src_h_parent); ++ err = PTR_ERR(a->src_wh_dentry); ++ if (IS_ERR(a->src_wh_dentry)) ++ goto out_thargs; ++ } ++ ++ /* lookup whiteout for dentry */ ++ if (au_ftest_ren(a->flags, WHDST)) { ++ h_d = au_wh_lkup(a->dst_h_parent, &a->dst_dentry->d_name, ++ a->br); ++ err = PTR_ERR(h_d); ++ if (IS_ERR(h_d)) ++ goto out_whsrc; ++ if (!h_d->d_inode) ++ dput(h_d); ++ else ++ a->dst_wh_dentry = h_d; ++ } ++ ++ /* rename dentry to tmpwh */ ++ if (a->thargs) { ++ err = au_whtmp_ren(a->dst_h_dentry, a->br); ++ if (unlikely(err)) ++ goto out_whdst; ++ ++ d = a->dst_dentry; ++ au_set_h_dptr(d, a->btgt, NULL); ++ err = au_lkup_neg(d, a->btgt); ++ if (unlikely(err)) ++ goto out_whtmp; ++ a->dst_h_dentry = au_h_dptr(d, a->btgt); ++ } ++ ++ /* cpup src */ ++ if (a->dst_h_dentry->d_inode && a->src_bstart != a->btgt) { ++ struct mutex *h_mtx = &a->src_h_dentry->d_inode->i_mutex; ++ struct file *h_file; ++ ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ AuDebugOn(au_dbstart(a->src_dentry) != a->src_bstart); ++ h_file = au_h_open_pre(a->src_dentry, a->src_bstart); ++ if (IS_ERR(h_file)) { ++ err = PTR_ERR(h_file); ++ h_file = NULL; ++ } else ++ err = au_sio_cpup_simple(a->src_dentry, a->btgt, -1, ++ !AuCpup_DTIME); ++ mutex_unlock(h_mtx); ++ au_h_open_post(a->src_dentry, a->src_bstart, h_file); ++ if (unlikely(err)) ++ goto out_whtmp; ++ } ++ ++ /* rename by vfs_rename or cpup */ ++ d = a->dst_dentry; ++ if (au_ftest_ren(a->flags, ISDIR) ++ && (a->dst_wh_dentry ++ || au_dbdiropq(d) == a->btgt ++ /* hide the lower to keep xino */ ++ || a->btgt < au_dbend(d) ++ || au_opt_test(au_mntflags(d->d_sb), ALWAYS_DIROPQ))) ++ au_fset_ren(a->flags, DIROPQ); ++ err = au_ren_or_cpup(a); ++ if (unlikely(err)) ++ /* leave the copied-up one */ ++ goto out_whtmp; ++ ++ /* make dir opaque */ ++ if (au_ftest_ren(a->flags, DIROPQ)) { ++ err = au_ren_diropq(a); ++ if (unlikely(err)) ++ goto out_rename; ++ } ++ ++ /* update target timestamps */ ++ AuDebugOn(au_dbstart(a->src_dentry) != a->btgt); ++ a->h_path.dentry = au_h_dptr(a->src_dentry, a->btgt); ++ vfsub_update_h_iattr(&a->h_path, /*did*/NULL); /*ignore*/ ++ a->src_inode->i_ctime = a->h_path.dentry->d_inode->i_ctime; ++ ++ /* remove whiteout for dentry */ ++ if (a->dst_wh_dentry) { ++ a->h_path.dentry = a->dst_wh_dentry; ++ err = au_wh_unlink_dentry(a->dst_h_dir, &a->h_path, ++ a->dst_dentry); ++ if (unlikely(err)) ++ goto out_diropq; ++ } ++ ++ /* remove whtmp */ ++ if (a->thargs) ++ au_ren_del_whtmp(a); /* ignore this error */ ++ ++ err = 0; ++ 
goto out_success; ++ ++out_diropq: ++ if (au_ftest_ren(a->flags, DIROPQ)) ++ au_ren_rev_diropq(err, a); ++out_rename: ++ if (!au_ftest_ren(a->flags, CPUP)) ++ au_ren_rev_rename(err, a); ++ else ++ au_ren_rev_cpup(err, a); ++ dput(a->h_dst); ++out_whtmp: ++ if (a->thargs) ++ au_ren_rev_whtmp(err, a); ++out_whdst: ++ dput(a->dst_wh_dentry); ++ a->dst_wh_dentry = NULL; ++out_whsrc: ++ if (a->src_wh_dentry) ++ au_ren_rev_whsrc(err, a); ++out_success: ++ dput(a->src_wh_dentry); ++ dput(a->dst_wh_dentry); ++out_thargs: ++ if (a->thargs) { ++ dput(a->h_dst); ++ au_whtmp_rmdir_free(a->thargs); ++ a->thargs = NULL; ++ } ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * test if @dentry dir can be rename destination or not. ++ * success means, it is a logically empty dir. ++ */ ++static int may_rename_dstdir(struct dentry *dentry, struct au_nhash *whlist) ++{ ++ return au_test_empty(dentry, whlist); ++} ++ ++/* ++ * test if @dentry dir can be rename source or not. ++ * if it can, return 0 and @children is filled. ++ * success means, ++ * - it is a logically empty dir. ++ * - or, it exists on writable branch and has no children including whiteouts ++ * on the lower branch. ++ */ ++static int may_rename_srcdir(struct dentry *dentry, aufs_bindex_t btgt) ++{ ++ int err; ++ unsigned int rdhash; ++ aufs_bindex_t bstart; ++ ++ bstart = au_dbstart(dentry); ++ if (bstart != btgt) { ++ struct au_nhash whlist; ++ ++ SiMustAnyLock(dentry->d_sb); ++ rdhash = au_sbi(dentry->d_sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, ++ dentry)); ++ err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_test_empty(dentry, &whlist); ++ au_nhash_wh_free(&whlist); ++ goto out; ++ } ++ ++ if (bstart == au_dbtaildir(dentry)) ++ return 0; /* success */ ++ ++ err = au_test_empty_lower(dentry); ++ ++out: ++ if (err == -ENOTEMPTY) { ++ AuWarn1("renaming dir who has child(ren) on multiple branches," ++ " is not supported\n"); ++ err = -EXDEV; ++ } ++ return err; ++} ++ ++/* side effect: sets whlist and h_dentry */ ++static int au_ren_may_dir(struct au_ren_args *a) ++{ ++ int err; ++ unsigned int rdhash; ++ struct dentry *d; ++ ++ d = a->dst_dentry; ++ SiMustAnyLock(d->d_sb); ++ ++ err = 0; ++ if (au_ftest_ren(a->flags, ISDIR) && a->dst_inode) { ++ rdhash = au_sbi(d->d_sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, d)); ++ err = au_nhash_alloc(&a->whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ ++ au_set_dbstart(d, a->dst_bstart); ++ err = may_rename_dstdir(d, &a->whlist); ++ au_set_dbstart(d, a->btgt); ++ } ++ a->dst_h_dentry = au_h_dptr(d, au_dbstart(d)); ++ if (unlikely(err)) ++ goto out; ++ ++ d = a->src_dentry; ++ a->src_h_dentry = au_h_dptr(d, au_dbstart(d)); ++ if (au_ftest_ren(a->flags, ISDIR)) { ++ err = may_rename_srcdir(d, a->btgt); ++ if (unlikely(err)) { ++ au_nhash_wh_free(&a->whlist); ++ a->whlist.nh_num = 0; ++ } ++ } ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * simple tests for rename. ++ * following the checks in vfs, plus the parent-child relationship. 
++ */ ++static int au_may_ren(struct au_ren_args *a) ++{ ++ int err, isdir; ++ struct inode *h_inode; ++ ++ if (a->src_bstart == a->btgt) { ++ err = au_may_del(a->src_dentry, a->btgt, a->src_h_parent, ++ au_ftest_ren(a->flags, ISDIR)); ++ if (unlikely(err)) ++ goto out; ++ err = -EINVAL; ++ if (unlikely(a->src_h_dentry == a->h_trap)) ++ goto out; ++ } ++ ++ err = 0; ++ if (a->dst_bstart != a->btgt) ++ goto out; ++ ++ err = -ENOTEMPTY; ++ if (unlikely(a->dst_h_dentry == a->h_trap)) ++ goto out; ++ ++ err = -EIO; ++ h_inode = a->dst_h_dentry->d_inode; ++ isdir = !!au_ftest_ren(a->flags, ISDIR); ++ if (!a->dst_dentry->d_inode) { ++ if (unlikely(h_inode)) ++ goto out; ++ err = au_may_add(a->dst_dentry, a->btgt, a->dst_h_parent, ++ isdir); ++ } else { ++ if (unlikely(!h_inode || !h_inode->i_nlink)) ++ goto out; ++ err = au_may_del(a->dst_dentry, a->btgt, a->dst_h_parent, ++ isdir); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++out: ++ if (unlikely(err == -ENOENT || err == -EEXIST)) ++ err = -EIO; ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * locking order ++ * (VFS) ++ * - src_dir and dir by lock_rename() ++ * - inode if it exists ++ * (aufs) ++ * - lock all ++ * + src_dentry and dentry by aufs_read_and_write_lock2() which calls, ++ * + si_read_lock ++ * + di_write_lock2_child() ++ * + di_write_lock_child() ++ * + ii_write_lock_child() ++ * + di_write_lock_child2() ++ * + ii_write_lock_child2() ++ * + src_parent and parent ++ * + di_write_lock_parent() ++ * + ii_write_lock_parent() ++ * + di_write_lock_parent2() ++ * + ii_write_lock_parent2() ++ * + lower src_dir and dir by vfsub_lock_rename() ++ * + verify every relationship between child and parent. if any ++ * of them fails, unlock all and return -EBUSY.
++ */ ++static void au_ren_unlock(struct au_ren_args *a) ++{ ++ struct super_block *sb; ++ ++ sb = a->dst_dentry->d_sb; ++ if (au_ftest_ren(a->flags, MNT_WRITE)) ++ mnt_drop_write(a->br->br_mnt); ++ vfsub_unlock_rename(a->src_h_parent, a->src_hdir, ++ a->dst_h_parent, a->dst_hdir); ++} ++ ++static int au_ren_lock(struct au_ren_args *a) ++{ ++ int err; ++ unsigned int udba; ++ ++ err = 0; ++ a->src_h_parent = au_h_dptr(a->src_parent, a->btgt); ++ a->src_hdir = au_hi(a->src_dir, a->btgt); ++ a->dst_h_parent = au_h_dptr(a->dst_parent, a->btgt); ++ a->dst_hdir = au_hi(a->dst_dir, a->btgt); ++ a->h_trap = vfsub_lock_rename(a->src_h_parent, a->src_hdir, ++ a->dst_h_parent, a->dst_hdir); ++ udba = au_opt_udba(a->src_dentry->d_sb); ++ if (unlikely(a->src_hdir->hi_inode != a->src_h_parent->d_inode ++ || a->dst_hdir->hi_inode != a->dst_h_parent->d_inode)) ++ err = au_busy_or_stale(); ++ if (!err && au_dbstart(a->src_dentry) == a->btgt) ++ err = au_h_verify(a->src_h_dentry, udba, ++ a->src_h_parent->d_inode, a->src_h_parent, ++ a->br); ++ if (!err && au_dbstart(a->dst_dentry) == a->btgt) ++ err = au_h_verify(a->dst_h_dentry, udba, ++ a->dst_h_parent->d_inode, a->dst_h_parent, ++ a->br); ++ if (!err) { ++ err = mnt_want_write(a->br->br_mnt); ++ if (unlikely(err)) ++ goto out_unlock; ++ au_fset_ren(a->flags, MNT_WRITE); ++ goto out; /* success */ ++ } ++ ++ err = au_busy_or_stale(); ++ ++out_unlock: ++ au_ren_unlock(a); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_ren_refresh_dir(struct au_ren_args *a) ++{ ++ struct inode *dir; ++ ++ dir = a->dst_dir; ++ dir->i_version++; ++ if (au_ftest_ren(a->flags, ISDIR)) { ++ /* is this updating defined in POSIX? */ ++ au_cpup_attr_timesizes(a->src_inode); ++ au_cpup_attr_nlink(dir, /*force*/1); ++ } ++ ++ if (au_ibstart(dir) == a->btgt) ++ au_cpup_attr_timesizes(dir); ++ ++ if (au_ftest_ren(a->flags, ISSAMEDIR)) ++ return; ++ ++ dir = a->src_dir; ++ dir->i_version++; ++ if (au_ftest_ren(a->flags, ISDIR)) ++ au_cpup_attr_nlink(dir, /*force*/1); ++ if (au_ibstart(dir) == a->btgt) ++ au_cpup_attr_timesizes(dir); ++} ++ ++static void au_ren_refresh(struct au_ren_args *a) ++{ ++ aufs_bindex_t bend, bindex; ++ struct dentry *d, *h_d; ++ struct inode *i, *h_i; ++ struct super_block *sb; ++ ++ d = a->dst_dentry; ++ d_drop(d); ++ if (a->h_dst) ++ /* already dget-ed by au_ren_or_cpup() */ ++ au_set_h_dptr(d, a->btgt, a->h_dst); ++ ++ i = a->dst_inode; ++ if (i) { ++ if (!au_ftest_ren(a->flags, ISDIR)) ++ vfsub_drop_nlink(i); ++ else { ++ vfsub_dead_dir(i); ++ au_cpup_attr_timesizes(i); ++ } ++ au_update_dbrange(d, /*do_put_zero*/1); ++ } else { ++ bend = a->btgt; ++ for (bindex = au_dbstart(d); bindex < bend; bindex++) ++ au_set_h_dptr(d, bindex, NULL); ++ bend = au_dbend(d); ++ for (bindex = a->btgt + 1; bindex <= bend; bindex++) ++ au_set_h_dptr(d, bindex, NULL); ++ au_update_dbrange(d, /*do_put_zero*/0); ++ } ++ ++ d = a->src_dentry; ++ au_set_dbwh(d, -1); ++ bend = au_dbend(d); ++ for (bindex = a->btgt + 1; bindex <= bend; bindex++) { ++ h_d = au_h_dptr(d, bindex); ++ if (h_d) ++ au_set_h_dptr(d, bindex, NULL); ++ } ++ au_set_dbend(d, a->btgt); ++ ++ sb = d->d_sb; ++ i = a->src_inode; ++ if (au_opt_test(au_mntflags(sb), PLINK) && au_plink_test(i)) ++ return; /* success */ ++ ++ bend = au_ibend(i); ++ for (bindex = a->btgt + 1; bindex <= bend; bindex++) { ++ h_i = au_h_iptr(i, bindex); ++ if (h_i) { ++ au_xino_write(sb, bindex, h_i->i_ino, /*ino*/0); ++ /* ignore this error */ ++ 
au_set_h_iptr(i, bindex, NULL, 0); ++ } ++ } ++ au_set_ibend(i, a->btgt); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* mainly for link(2) and rename(2) */ ++int au_wbr(struct dentry *dentry, aufs_bindex_t btgt) ++{ ++ aufs_bindex_t bdiropq, bwh; ++ struct dentry *parent; ++ struct au_branch *br; ++ ++ parent = dentry->d_parent; ++ IMustLock(parent->d_inode); /* dir is locked */ ++ ++ bdiropq = au_dbdiropq(parent); ++ bwh = au_dbwh(dentry); ++ br = au_sbr(dentry->d_sb, btgt); ++ if (au_br_rdonly(br) ++ || (0 <= bdiropq && bdiropq < btgt) ++ || (0 <= bwh && bwh < btgt)) ++ btgt = -1; ++ ++ AuDbg("btgt %d\n", btgt); ++ return btgt; ++} ++ ++/* sets src_bstart, dst_bstart and btgt */ ++static int au_ren_wbr(struct au_ren_args *a) ++{ ++ int err; ++ struct au_wr_dir_args wr_dir_args = { ++ /* .force_btgt = -1, */ ++ .flags = AuWrDir_ADD_ENTRY ++ }; ++ ++ a->src_bstart = au_dbstart(a->src_dentry); ++ a->dst_bstart = au_dbstart(a->dst_dentry); ++ if (au_ftest_ren(a->flags, ISDIR)) ++ au_fset_wrdir(wr_dir_args.flags, ISDIR); ++ wr_dir_args.force_btgt = a->src_bstart; ++ if (a->dst_inode && a->dst_bstart < a->src_bstart) ++ wr_dir_args.force_btgt = a->dst_bstart; ++ wr_dir_args.force_btgt = au_wbr(a->dst_dentry, wr_dir_args.force_btgt); ++ err = au_wr_dir(a->dst_dentry, a->src_dentry, &wr_dir_args); ++ a->btgt = err; ++ ++ return err; ++} ++ ++static void au_ren_dt(struct au_ren_args *a) ++{ ++ a->h_path.dentry = a->src_h_parent; ++ au_dtime_store(a->src_dt + AuPARENT, a->src_parent, &a->h_path); ++ if (!au_ftest_ren(a->flags, ISSAMEDIR)) { ++ a->h_path.dentry = a->dst_h_parent; ++ au_dtime_store(a->dst_dt + AuPARENT, a->dst_parent, &a->h_path); ++ } ++ ++ au_fclr_ren(a->flags, DT_DSTDIR); ++ if (!au_ftest_ren(a->flags, ISDIR)) ++ return; ++ ++ a->h_path.dentry = a->src_h_dentry; ++ au_dtime_store(a->src_dt + AuCHILD, a->src_dentry, &a->h_path); ++ if (a->dst_h_dentry->d_inode) { ++ au_fset_ren(a->flags, DT_DSTDIR); ++ a->h_path.dentry = a->dst_h_dentry; ++ au_dtime_store(a->dst_dt + AuCHILD, a->dst_dentry, &a->h_path); ++ } ++} ++ ++static void au_ren_rev_dt(int err, struct au_ren_args *a) ++{ ++ struct dentry *h_d; ++ struct mutex *h_mtx; ++ ++ au_dtime_revert(a->src_dt + AuPARENT); ++ if (!au_ftest_ren(a->flags, ISSAMEDIR)) ++ au_dtime_revert(a->dst_dt + AuPARENT); ++ ++ if (au_ftest_ren(a->flags, ISDIR) && err != -EIO) { ++ h_d = a->src_dt[AuCHILD].dt_h_path.dentry; ++ h_mtx = &h_d->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ au_dtime_revert(a->src_dt + AuCHILD); ++ mutex_unlock(h_mtx); ++ ++ if (au_ftest_ren(a->flags, DT_DSTDIR)) { ++ h_d = a->dst_dt[AuCHILD].dt_h_path.dentry; ++ h_mtx = &h_d->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD); ++ au_dtime_revert(a->dst_dt + AuCHILD); ++ mutex_unlock(h_mtx); ++ } ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int aufs_rename(struct inode *_src_dir, struct dentry *_src_dentry, ++ struct inode *_dst_dir, struct dentry *_dst_dentry) ++{ ++ int err, flags; ++ /* reduce stack space */ ++ struct au_ren_args *a; ++ ++ AuDbg("%.*s, %.*s\n", AuDLNPair(_src_dentry), AuDLNPair(_dst_dentry)); ++ IMustLock(_src_dir); ++ IMustLock(_dst_dir); ++ ++ err = -ENOMEM; ++ BUILD_BUG_ON(sizeof(*a) > PAGE_SIZE); ++ a = kzalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ a->src_dir = _src_dir; ++ a->src_dentry = _src_dentry; ++ a->src_inode = a->src_dentry->d_inode; ++ a->src_parent = a->src_dentry->d_parent; /* dir inode 
is locked */ ++ a->dst_dir = _dst_dir; ++ a->dst_dentry = _dst_dentry; ++ a->dst_inode = a->dst_dentry->d_inode; ++ a->dst_parent = a->dst_dentry->d_parent; /* dir inode is locked */ ++ if (a->dst_inode) { ++ IMustLock(a->dst_inode); ++ au_igrab(a->dst_inode); ++ } ++ ++ err = -ENOTDIR; ++ flags = AuLock_FLUSH | AuLock_NOPLM | AuLock_GEN; ++ if (S_ISDIR(a->src_inode->i_mode)) { ++ au_fset_ren(a->flags, ISDIR); ++ if (unlikely(a->dst_inode && !S_ISDIR(a->dst_inode->i_mode))) ++ goto out_free; ++ err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, ++ AuLock_DIR | flags); ++ } else ++ err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, ++ flags); ++ if (unlikely(err)) ++ goto out_free; ++ ++ err = au_d_hashed_positive(a->src_dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ err = -ENOENT; ++ if (a->dst_inode) { ++ /* ++ * If it is a dir, the VFS unhashes dst_dentry before calling this ++ * function. It means we cannot rely upon d_unhashed(). ++ */ ++ if (unlikely(!a->dst_inode->i_nlink)) ++ goto out_unlock; ++ if (!S_ISDIR(a->dst_inode->i_mode)) { ++ err = au_d_hashed_positive(a->dst_dentry); ++ if (unlikely(err)) ++ goto out_unlock; ++ } else if (unlikely(IS_DEADDIR(a->dst_inode))) ++ goto out_unlock; ++ } else if (unlikely(d_unhashed(a->dst_dentry))) ++ goto out_unlock; ++ ++ /* ++ * is it possible? ++ * yes, it happened (in linux-3.3-rcN) but I don't know why. ++ * there may exist a problem somewhere else. ++ */ ++ err = -EINVAL; ++ if (unlikely(a->dst_parent->d_inode == a->src_dentry->d_inode)) ++ goto out_unlock; ++ ++ au_fset_ren(a->flags, ISSAMEDIR); /* temporary */ ++ di_write_lock_parent(a->dst_parent); ++ ++ /* which branch we process */ ++ err = au_ren_wbr(a); ++ if (unlikely(err < 0)) ++ goto out_parent; ++ a->br = au_sbr(a->dst_dentry->d_sb, a->btgt); ++ a->h_path.mnt = a->br->br_mnt; ++ ++ /* are they available to be renamed */ ++ err = au_ren_may_dir(a); ++ if (unlikely(err)) ++ goto out_children; ++ ++ /* prepare the writable parent dir on the same branch */ ++ if (a->dst_bstart == a->btgt) { ++ au_fset_ren(a->flags, WHDST); ++ } else { ++ err = au_cpup_dirs(a->dst_dentry, a->btgt); ++ if (unlikely(err)) ++ goto out_children; ++ } ++ ++ if (a->src_dir != a->dst_dir) { ++ /* ++ * this temporary unlock is safe, ++ * because both dir->i_mutex are locked.
++ */ ++ di_write_unlock(a->dst_parent); ++ di_write_lock_parent(a->src_parent); ++ err = au_wr_dir_need_wh(a->src_dentry, ++ au_ftest_ren(a->flags, ISDIR), ++ &a->btgt); ++ di_write_unlock(a->src_parent); ++ di_write_lock2_parent(a->src_parent, a->dst_parent, /*isdir*/1); ++ au_fclr_ren(a->flags, ISSAMEDIR); ++ } else ++ err = au_wr_dir_need_wh(a->src_dentry, ++ au_ftest_ren(a->flags, ISDIR), ++ &a->btgt); ++ if (unlikely(err < 0)) ++ goto out_children; ++ if (err) ++ au_fset_ren(a->flags, WHSRC); ++ ++ /* lock them all */ ++ err = au_ren_lock(a); ++ if (unlikely(err)) ++ goto out_children; ++ ++ if (!au_opt_test(au_mntflags(a->dst_dir->i_sb), UDBA_NONE)) ++ err = au_may_ren(a); ++ else if (unlikely(a->dst_dentry->d_name.len > AUFS_MAX_NAMELEN)) ++ err = -ENAMETOOLONG; ++ if (unlikely(err)) ++ goto out_hdir; ++ ++ /* store timestamps to be revertible */ ++ au_ren_dt(a); ++ ++ /* here we go */ ++ err = do_rename(a); ++ if (unlikely(err)) ++ goto out_dt; ++ ++ /* update dir attributes */ ++ au_ren_refresh_dir(a); ++ ++ /* dput/iput all lower dentries */ ++ au_ren_refresh(a); ++ ++ goto out_hdir; /* success */ ++ ++out_dt: ++ au_ren_rev_dt(err, a); ++out_hdir: ++ au_ren_unlock(a); ++out_children: ++ au_nhash_wh_free(&a->whlist); ++ if (err && a->dst_inode && a->dst_bstart != a->btgt) { ++ AuDbg("bstart %d, btgt %d\n", a->dst_bstart, a->btgt); ++ au_set_h_dptr(a->dst_dentry, a->btgt, NULL); ++ au_set_dbstart(a->dst_dentry, a->dst_bstart); ++ } ++out_parent: ++ if (!err) ++ d_move(a->src_dentry, a->dst_dentry); ++ else { ++ au_update_dbstart(a->dst_dentry); ++ if (!a->dst_inode) ++ d_drop(a->dst_dentry); ++ } ++ if (au_ftest_ren(a->flags, ISSAMEDIR)) ++ di_write_unlock(a->dst_parent); ++ else ++ di_write_unlock2(a->src_parent, a->dst_parent); ++out_unlock: ++ aufs_read_and_write_unlock2(a->dst_dentry, a->src_dentry); ++out_free: ++ iput(a->dst_inode); ++ if (a->thargs) ++ au_whtmp_rmdir_free(a->thargs); ++ kfree(a); ++out: ++ AuTraceErr(err); ++ return err; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/Kconfig 2011-10-25 09:52:26.000000000 +0200 +@@ -0,0 +1,203 @@ ++config AUFS_FS ++ tristate "Aufs (Advanced multi layered unification filesystem) support" ++ depends on EXPERIMENTAL ++ help ++ Aufs is a stackable unification filesystem such as Unionfs, ++ which unifies several directories and provides a merged single ++ directory. ++ In the early days, aufs was an entirely re-designed and ++ re-implemented Unionfs Version 1.x series. Introducing many ++ original ideas, approaches and improvements, it became totally ++ different from Unionfs while keeping the basic features. ++ ++if AUFS_FS ++choice ++ prompt "Maximum number of branches" ++ default AUFS_BRANCH_MAX_127 ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact on performance. ++config AUFS_BRANCH_MAX_127 ++ bool "127" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact on performance. ++config AUFS_BRANCH_MAX_511 ++ bool "511" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact on performance. ++config AUFS_BRANCH_MAX_1023 ++ bool "1023" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs.
The larger value consumes more system ++ resources and has a minor impact on performance. ++config AUFS_BRANCH_MAX_32767 ++ bool "32767" ++ help ++ Specifies the maximum number of branches (or member directories) ++ in a single aufs. The larger value consumes more system ++ resources and has a minor impact on performance. ++endchoice ++ ++config AUFS_SBILIST ++ bool ++ depends on AUFS_MAGIC_SYSRQ || PROC_FS ++ default y ++ help ++ Automatic configuration for internal use. ++ When aufs supports Magic SysRq or /proc, this is enabled automatically. ++ ++config AUFS_HNOTIFY ++ bool "Detect direct branch access (bypassing aufs)" ++ help ++ If you want to modify files on branches directly, e.g. bypassing aufs, ++ and want aufs to fully detect the changes made to them, then enable this ++ option and use the 'udba=notify' mount option. ++ Currently there is only one available configuration, "fsnotify". ++ It will have a negative impact on performance. ++ See details in aufs.5. ++ ++choice ++ prompt "method" if AUFS_HNOTIFY ++ default AUFS_HFSNOTIFY ++config AUFS_HFSNOTIFY ++ bool "fsnotify" ++ select FSNOTIFY ++endchoice ++ ++config AUFS_EXPORT ++ bool "NFS-exportable aufs" ++ depends on EXPORTFS ++ help ++ If you want to export your mounted aufs via NFS, then enable this ++ option. There are several requirements for this configuration. ++ See details in aufs.5. ++ ++config AUFS_INO_T_64 ++ bool ++ depends on AUFS_EXPORT ++ depends on 64BIT && !(ALPHA || S390) ++ default y ++ help ++ Automatic configuration for internal use. ++ /* typedef unsigned long/int __kernel_ino_t */ ++ /* alpha and s390x are int */ ++ ++config AUFS_RDU ++ bool "Readdir in userspace" ++ help ++ Aufs has two methods to provide a merged view for a directory, ++ by a user-space library and by kernel-space natively. The latter ++ is always enabled but sometimes large and slow. ++ If you enable this option, install the library in the aufs2-util ++ package, and set some environment variables for your readdir(3), ++ then the work will be handled in user-space, which generally ++ shows better performance in most cases. ++ See details in aufs.5. ++ ++config AUFS_PROC_MAP ++ bool "support for /proc/maps and lsof(1)" ++ depends on PROC_FS ++ help ++ When you issue mmap(2) in aufs, it is actually a direct mmap(2) ++ call to the file on the branch fs since the file in aufs is ++ purely virtual. And the file path printed in /proc/maps (and ++ others) will be the path on the branch fs. In most cases, it ++ does no harm. But some utilities like lsof(1) may get confused since ++ the utility or user may expect the file path in aufs to be ++ printed. ++ To address this issue, aufs provides a patch which introduces a ++ new member called vm_prfile into struct vm_area_struct. The patch ++ is meaningless without enabling this configuration since nobody ++ sets the new vm_prfile member. ++ If you don't apply the patch, then enabling this configuration ++ will cause a compile error. ++ This approach is fragile since, if someone else makes changes ++ around vm_file, vm_prfile may not work anymore. As a ++ workaround for such a case, aufs provides this configuration. If you ++ disable it, then lsof(1) may produce an incorrect result but the ++ problem will be gone even if the aufs patch is applied (I hope). ++ ++config AUFS_SP_IATTR ++ bool "Respect the attributes (mtime/ctime mainly) of special files" ++ help ++ When you write something to a special file, some attributes of it ++ (mtime/ctime mainly) may be updated.
Generally such updates are ++ less important (actually some device drivers and NFS ignore ++ them). But some applications (such as test programs) require ++ such updates. If you need these updates, then enable this ++ configuration, which introduces some overhead. ++ Currently this configuration handles FIFO only. ++ ++config AUFS_SHWH ++ bool "Show whiteouts" ++ help ++ If you want to make the whiteouts in aufs visible, then enable ++ this option and specify the 'shwh' mount option. Although it may ++ sound like philosophy or something, technically it ++ simply shows the name of a whiteout while keeping its behaviour. ++ ++config AUFS_BR_RAMFS ++ bool "Ramfs (initramfs/rootfs) as an aufs branch" ++ help ++ If you want to use ramfs as an aufs branch fs, then enable this ++ option. Generally tmpfs is recommended. ++ Aufs prohibits them from being a branch fs by default, because ++ initramfs generally becomes unusable after switch_root or ++ something similar. If you set initramfs as an aufs branch and boot your ++ system by switch_root, you will easily meet a problem since the ++ files in initramfs may be inaccessible. ++ Unless you are going to use ramfs as an aufs branch fs without ++ switch_root or something, leave it N. ++ ++config AUFS_BR_FUSE ++ bool "Fuse fs as an aufs branch" ++ depends on FUSE_FS ++ select AUFS_POLL ++ help ++ If you want to use a fuse-based userspace filesystem as an aufs ++ branch fs, then enable this option. ++ It implements the internal poll(2) operation which is ++ implemented by fuse only (currently). ++ ++config AUFS_POLL ++ bool ++ help ++ Automatic configuration for internal use. ++ ++config AUFS_BR_HFSPLUS ++ bool "Hfsplus as an aufs branch" ++ depends on HFSPLUS_FS ++ default y ++ help ++ If you want to use hfsplus fs as an aufs branch fs, then enable ++ this option. This option introduces a small overhead when ++ copying up a file on hfsplus. ++ ++config AUFS_BDEV_LOOP ++ bool ++ depends on BLK_DEV_LOOP ++ default y ++ help ++ Automatic configuration for internal use. ++ Convert =[ym] into =y. ++ ++config AUFS_DEBUG ++ bool "Debug aufs" ++ help ++ Enable this to compile aufs internal debug code. ++ It will have a negative impact on performance. ++ ++config AUFS_MAGIC_SYSRQ ++ bool ++ depends on AUFS_DEBUG && MAGIC_SYSRQ ++ default y ++ help ++ Automatic configuration for internal use. ++ When aufs supports Magic SysRq, this is enabled automatically. ++endif +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/loop.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,133 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * support for loopback block device as a branch ++ */ ++ ++#include <linux/loop.h> ++#include "aufs.h" ++ ++/* ++ * test if two lower dentries have overlapping branches.
++ */ ++int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding) ++{ ++ struct super_block *h_sb; ++ struct loop_device *l; ++ ++ h_sb = h_adding->d_sb; ++ if (MAJOR(h_sb->s_dev) != LOOP_MAJOR) ++ return 0; ++ ++ l = h_sb->s_bdev->bd_disk->private_data; ++ h_adding = l->lo_backing_file->f_dentry; ++ /* ++ * h_adding can be local NFS. ++ * in this case aufs cannot detect the loop. ++ */ ++ if (unlikely(h_adding->d_sb == sb)) ++ return 1; ++ return !!au_test_subdir(h_adding, sb->s_root); ++} ++ ++/* true if a kernel thread named 'loop[0-9].*' accesses a file */ ++int au_test_loopback_kthread(void) ++{ ++ int ret; ++ struct task_struct *tsk = current; ++ ++ ret = 0; ++ if (tsk->flags & PF_KTHREAD) { ++ const char c = tsk->comm[4]; ++ ret = ('0' <= c && c <= '9' ++ && !strncmp(tsk->comm, "loop", 4)); ++ } ++ ++ return ret; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define au_warn_loopback_step 16 ++static int au_warn_loopback_nelem = au_warn_loopback_step; ++static unsigned long *au_warn_loopback_array; ++ ++void au_warn_loopback(struct super_block *h_sb) ++{ ++ int i, new_nelem; ++ unsigned long *a, magic; ++ static DEFINE_SPINLOCK(spin); ++ ++ magic = h_sb->s_magic; ++ spin_lock(&spin); ++ a = au_warn_loopback_array; ++ for (i = 0; i < au_warn_loopback_nelem && *a; i++) ++ if (a[i] == magic) { ++ spin_unlock(&spin); ++ return; ++ } ++ ++ /* h_sb is new to us, print it */ ++ if (i < au_warn_loopback_nelem) { ++ a[i] = magic; ++ goto pr; ++ } ++ ++ /* expand the array */ ++ new_nelem = au_warn_loopback_nelem + au_warn_loopback_step; ++ a = au_kzrealloc(au_warn_loopback_array, ++ au_warn_loopback_nelem * sizeof(unsigned long), ++ new_nelem * sizeof(unsigned long), GFP_ATOMIC); ++ if (a) { ++ au_warn_loopback_nelem = new_nelem; ++ au_warn_loopback_array = a; ++ a[i] = magic; ++ goto pr; ++ } ++ ++ spin_unlock(&spin); ++ AuWarn1("realloc failed, ignored\n"); ++ return; ++ ++pr: ++ spin_unlock(&spin); ++ pr_warning("you may want to try another patch for loopback file " ++ "on %s(0x%lx) branch\n", au_sbtype(h_sb), magic); ++} ++ ++int au_loopback_init(void) ++{ ++ int err; ++ struct super_block *sb __maybe_unused; ++ ++ AuDebugOn(sizeof(sb->s_magic) != sizeof(unsigned long)); ++ ++ err = 0; ++ au_warn_loopback_array = kcalloc(au_warn_loopback_step, ++ sizeof(unsigned long), GFP_NOFS); ++ if (unlikely(!au_warn_loopback_array)) ++ err = -ENOMEM; ++ ++ return err; ++} ++ ++void au_loopback_fin(void) ++{ ++ kfree(au_warn_loopback_array); ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/loop.h 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,50 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * support for loopback mount as a branch ++ */ ++ ++#ifndef __AUFS_LOOP_H__ ++#define __AUFS_LOOP_H__ ++ ++#ifdef __KERNEL__ ++ ++struct dentry; ++struct super_block; ++ ++#ifdef CONFIG_AUFS_BDEV_LOOP ++/* loop.c */ ++int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding); ++int au_test_loopback_kthread(void); ++void au_warn_loopback(struct super_block *h_sb); ++ ++int au_loopback_init(void); ++void au_loopback_fin(void); ++#else ++AuStubInt0(au_test_loopback_overlap, struct super_block *sb, ++ struct dentry *h_adding) ++AuStubInt0(au_test_loopback_kthread, void) ++AuStubVoid(au_warn_loopback, struct super_block *h_sb) ++ ++AuStubInt0(au_loopback_init, void) ++AuStubVoid(au_loopback_fin, void) ++#endif /* BLK_DEV_LOOP */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_LOOP_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/magic.mk 2011-10-25 09:52:26.000000000 +0200 +@@ -0,0 +1,54 @@ ++ ++# defined in ${srctree}/fs/fuse/inode.c ++# tristate ++ifdef CONFIG_FUSE_FS ++ccflags-y += -DFUSE_SUPER_MAGIC=0x65735546 ++endif ++ ++# defined in ${srctree}/fs/ocfs2/ocfs2_fs.h ++# tristate ++ifdef CONFIG_OCFS2_FS ++ccflags-y += -DOCFS2_SUPER_MAGIC=0x7461636f ++endif ++ ++# defined in ${srctree}/fs/ocfs2/dlm/userdlm.h ++# tristate ++ifdef CONFIG_OCFS2_FS_O2CB ++ccflags-y += -DDLMFS_MAGIC=0x76a9f425 ++endif ++ ++# defined in ${srctree}/fs/cifs/cifsfs.c ++# tristate ++ifdef CONFIG_CIFS_FS ++ccflags-y += -DCIFS_MAGIC_NUMBER=0xFF534D42 ++endif ++ ++# defined in ${srctree}/fs/xfs/xfs_sb.h ++# tristate ++ifdef CONFIG_XFS_FS ++ccflags-y += -DXFS_SB_MAGIC=0x58465342 ++endif ++ ++# defined in ${srctree}/fs/configfs/mount.c ++# tristate ++ifdef CONFIG_CONFIGFS_FS ++ccflags-y += -DCONFIGFS_MAGIC=0x62656570 ++endif ++ ++# defined in ${srctree}/fs/9p/v9fs.h ++# tristate ++ifdef CONFIG_9P_FS ++ccflags-y += -DV9FS_MAGIC=0x01021997 ++endif ++ ++# defined in ${srctree}/fs/ubifs/ubifs.h ++# tristate ++ifdef CONFIG_UBIFS_FS ++ccflags-y += -DUBIFS_SUPER_MAGIC=0x24051905 ++endif ++ ++# defined in ${srctree}/fs/hfsplus/hfsplus_raw.h ++# tristate ++ifdef CONFIG_HFSPLUS_FS ++ccflags-y += -DHFSPLUS_SUPER_MAGIC=0x482b ++endif +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/Makefile 2012-03-20 17:31:17.000000000 +0100 +@@ -0,0 +1,42 @@ ++ ++include ${src}/magic.mk ++ifeq (${CONFIG_AUFS_FS},m) ++include ${src}/conf.mk ++endif ++-include ${src}/priv_def.mk ++ ++# cf. 
include/linux/kernel.h ++# enable pr_debug ++ccflags-y += -DDEBUG ++# sparse requires the full pathname ++ifdef M ++ccflags-y += -include ${M}/../../include/linux/aufs_type.h ++else ++ccflags-y += -include ${srctree}/include/linux/aufs_type.h ++endif ++ ++obj-$(CONFIG_AUFS_FS) += aufs.o ++aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \ ++ wkq.o vfsub.o dcsub.o \ ++ cpup.o whout.o wbr_policy.o \ ++ dinfo.o dentry.o \ ++ dynop.o \ ++ finfo.o file.o f_op.o \ ++ dir.o vdir.o \ ++ iinfo.o inode.o i_op.o i_op_add.o i_op_del.o i_op_ren.o \ ++ ioctl.o ++ ++# all are boolean ++aufs-$(CONFIG_PROC_FS) += procfs.o plink.o ++aufs-$(CONFIG_SYSFS) += sysfs.o ++aufs-$(CONFIG_DEBUG_FS) += dbgaufs.o ++aufs-$(CONFIG_AUFS_BDEV_LOOP) += loop.o ++aufs-$(CONFIG_AUFS_HNOTIFY) += hnotify.o ++aufs-$(CONFIG_AUFS_HFSNOTIFY) += hfsnotify.o ++aufs-$(CONFIG_AUFS_EXPORT) += export.o ++aufs-$(CONFIG_AUFS_POLL) += poll.o ++aufs-$(CONFIG_AUFS_RDU) += rdu.o ++aufs-$(CONFIG_AUFS_SP_IATTR) += f_op_sp.o ++aufs-$(CONFIG_AUFS_BR_HFSPLUS) += hfsplus.o ++aufs-$(CONFIG_AUFS_DEBUG) += debug.o ++aufs-$(CONFIG_AUFS_MAGIC_SYSRQ) += sysrq.o +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/module.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,196 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * module global variables and operations ++ */ ++ ++#include ++#include ++#include "aufs.h" ++ ++void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp) ++{ ++ if (new_sz <= nused) ++ return p; ++ ++ p = krealloc(p, new_sz, gfp); ++ if (p) ++ memset(p + nused, 0, new_sz - nused); ++ return p; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * aufs caches ++ */ ++struct kmem_cache *au_cachep[AuCache_Last]; ++static int __init au_cache_init(void) ++{ ++ au_cachep[AuCache_DINFO] = AuCacheCtor(au_dinfo, au_di_init_once); ++ if (au_cachep[AuCache_DINFO]) ++ /* SLAB_DESTROY_BY_RCU */ ++ au_cachep[AuCache_ICNTNR] = AuCacheCtor(au_icntnr, ++ au_icntnr_init_once); ++ if (au_cachep[AuCache_ICNTNR]) ++ au_cachep[AuCache_FINFO] = AuCacheCtor(au_finfo, ++ au_fi_init_once); ++ if (au_cachep[AuCache_FINFO]) ++ au_cachep[AuCache_VDIR] = AuCache(au_vdir); ++ if (au_cachep[AuCache_VDIR]) ++ au_cachep[AuCache_DEHSTR] = AuCache(au_vdir_dehstr); ++ if (au_cachep[AuCache_DEHSTR]) ++ return 0; ++ ++ return -ENOMEM; ++} ++ ++static void au_cache_fin(void) ++{ ++ int i; ++ ++ /* excluding AuCache_HNOTIFY */ ++ BUILD_BUG_ON(AuCache_HNOTIFY + 1 != AuCache_Last); ++ for (i = 0; i < AuCache_HNOTIFY; i++) ++ if (au_cachep[i]) { ++ kmem_cache_destroy(au_cachep[i]); ++ au_cachep[i] = NULL; ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_dir_roflags; ++ ++#ifdef CONFIG_AUFS_SBILIST ++/* ++ * iterate_supers_type() doesn't protect us from ++ * remounting (branch management) ++ */ ++struct au_splhead au_sbilist; ++#endif ++ ++struct lock_class_key au_lc_key[AuLcKey_Last]; ++ ++/* ++ * functions for module interface. ++ */ ++MODULE_LICENSE("GPL"); ++/* MODULE_LICENSE("GPL v2"); */ ++MODULE_AUTHOR("Junjiro R. 
Okajima "); ++MODULE_DESCRIPTION(AUFS_NAME ++ " -- Advanced multi layered unification filesystem"); ++MODULE_VERSION(AUFS_VERSION); ++ ++/* this module parameter has no meaning when SYSFS is disabled */ ++int sysaufs_brs = 1; ++MODULE_PARM_DESC(brs, "use /fs/aufs/si_*/brN"); ++module_param_named(brs, sysaufs_brs, int, S_IRUGO); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static char au_esc_chars[0x20 + 3]; /* 0x01-0x20, backslash, del, and NULL */ ++ ++int au_seq_path(struct seq_file *seq, struct path *path) ++{ ++ return seq_path(seq, path, au_esc_chars); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int __init aufs_init(void) ++{ ++ int err, i; ++ char *p; ++ ++ p = au_esc_chars; ++ for (i = 1; i <= ' '; i++) ++ *p++ = i; ++ *p++ = '\\'; ++ *p++ = '\x7f'; ++ *p = 0; ++ ++ au_dir_roflags = au_file_roflags(O_DIRECTORY | O_LARGEFILE); ++ ++ au_sbilist_init(); ++ sysaufs_brs_init(); ++ au_debug_init(); ++ au_dy_init(); ++ err = sysaufs_init(); ++ if (unlikely(err)) ++ goto out; ++ err = au_procfs_init(); ++ if (unlikely(err)) ++ goto out_sysaufs; ++ err = au_wkq_init(); ++ if (unlikely(err)) ++ goto out_procfs; ++ err = au_loopback_init(); ++ if (unlikely(err)) ++ goto out_wkq; ++ err = au_hnotify_init(); ++ if (unlikely(err)) ++ goto out_loopback; ++ err = au_sysrq_init(); ++ if (unlikely(err)) ++ goto out_hin; ++ err = au_cache_init(); ++ if (unlikely(err)) ++ goto out_sysrq; ++ err = register_filesystem(&aufs_fs_type); ++ if (unlikely(err)) ++ goto out_cache; ++ /* since we define pr_fmt, call printk directly */ ++ printk(KERN_INFO AUFS_NAME " " AUFS_VERSION "\n"); ++ goto out; /* success */ ++ ++out_cache: ++ au_cache_fin(); ++out_sysrq: ++ au_sysrq_fin(); ++out_hin: ++ au_hnotify_fin(); ++out_loopback: ++ au_loopback_fin(); ++out_wkq: ++ au_wkq_fin(); ++out_procfs: ++ au_procfs_fin(); ++out_sysaufs: ++ sysaufs_fin(); ++ au_dy_fin(); ++out: ++ return err; ++} ++ ++static void __exit aufs_exit(void) ++{ ++ unregister_filesystem(&aufs_fs_type); ++ au_cache_fin(); ++ au_sysrq_fin(); ++ au_hnotify_fin(); ++ au_loopback_fin(); ++ au_wkq_fin(); ++ au_procfs_fin(); ++ sysaufs_fin(); ++ au_dy_fin(); ++} ++ ++module_init(aufs_init); ++module_exit(aufs_exit); +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/module.h 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,105 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * module initialization and module-global ++ */ ++ ++#ifndef __AUFS_MODULE_H__ ++#define __AUFS_MODULE_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/slab.h> ++ ++struct path; ++struct seq_file; ++ ++/* module parameters */ ++extern int sysaufs_brs; ++ ++/* ---------------------------------------------------------------------- */ ++ ++extern int au_dir_roflags; ++ ++enum { ++ AuLcNonDir_FIINFO, ++ AuLcNonDir_DIINFO, ++ AuLcNonDir_IIINFO, ++ ++ AuLcDir_FIINFO, ++ AuLcDir_DIINFO, ++ AuLcDir_IIINFO, ++ ++ AuLcSymlink_DIINFO, ++ AuLcSymlink_IIINFO, ++ ++ AuLcKey_Last ++}; ++extern struct lock_class_key au_lc_key[AuLcKey_Last]; ++ ++void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp); ++int au_seq_path(struct seq_file *seq, struct path *path); ++ ++#ifdef CONFIG_PROC_FS ++/* procfs.c */ ++int __init au_procfs_init(void); ++void au_procfs_fin(void); ++#else ++AuStubInt0(au_procfs_init, void); ++AuStubVoid(au_procfs_fin, void); ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* kmem cache */ ++enum { ++ AuCache_DINFO, ++ AuCache_ICNTNR, ++ AuCache_FINFO, ++ AuCache_VDIR, ++ AuCache_DEHSTR, ++ AuCache_HNOTIFY, /* must be last */ ++ AuCache_Last ++}; ++ ++#define AuCacheFlags (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD) ++#define AuCache(type) KMEM_CACHE(type, AuCacheFlags) ++#define AuCacheCtor(type, ctor) \ ++ kmem_cache_create(#type, sizeof(struct type), \ ++ __alignof__(struct type), AuCacheFlags, ctor) ++ ++extern struct kmem_cache *au_cachep[]; ++ ++#define AuCacheFuncs(name, index) \ ++static inline struct au_##name *au_cache_alloc_##name(void) \ ++{ return kmem_cache_alloc(au_cachep[AuCache_##index], GFP_NOFS); } \ ++static inline void au_cache_free_##name(struct au_##name *p) \ ++{ kmem_cache_free(au_cachep[AuCache_##index], p); } ++ ++AuCacheFuncs(dinfo, DINFO); ++AuCacheFuncs(icntnr, ICNTNR); ++AuCacheFuncs(finfo, FINFO); ++AuCacheFuncs(vdir, VDIR); ++AuCacheFuncs(vdir_dehstr, DEHSTR); ++#ifdef CONFIG_AUFS_HNOTIFY ++AuCacheFuncs(hnotify, HNOTIFY); ++#endif ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_MODULE_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/opts.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,1677 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * mount options/flags ++ */ ++ ++#include <linux/namei.h> ++#include <linux/types.h> /* a distribution requires */ ++#include <linux/parser.h> ++#include "aufs.h" ++ ++/* ---------------------------------------------------------------------- */ ++ ++enum { ++ Opt_br, ++ Opt_add, Opt_del, Opt_mod, Opt_reorder, Opt_append, Opt_prepend, ++ Opt_idel, Opt_imod, Opt_ireorder, ++ Opt_dirwh, Opt_rdcache, Opt_rdblk, Opt_rdhash, Opt_rendir, ++ Opt_rdblk_def, Opt_rdhash_def, ++ Opt_xino, Opt_zxino, Opt_noxino, ++ Opt_trunc_xino, Opt_trunc_xino_v, Opt_notrunc_xino, ++ Opt_trunc_xino_path, Opt_itrunc_xino, ++ Opt_trunc_xib, Opt_notrunc_xib, ++ Opt_shwh, Opt_noshwh, ++ Opt_plink, Opt_noplink, Opt_list_plink, ++ Opt_udba, ++ Opt_dio, Opt_nodio, ++ /* Opt_lock, Opt_unlock, */ ++ Opt_cmd, Opt_cmd_args, ++ Opt_diropq_a, Opt_diropq_w, ++ Opt_warn_perm, Opt_nowarn_perm, ++ Opt_wbr_copyup, Opt_wbr_create, ++ Opt_refrof, Opt_norefrof, ++ Opt_verbose, Opt_noverbose, ++ Opt_sum, Opt_nosum, Opt_wsum, ++ Opt_tail, Opt_ignore, Opt_ignore_silent, Opt_err ++}; ++ ++static match_table_t options = { ++ {Opt_br, "br=%s"}, ++ {Opt_br, "br:%s"}, ++ ++ {Opt_add, "add=%d:%s"}, ++ {Opt_add, "add:%d:%s"}, ++ {Opt_add, "ins=%d:%s"}, ++ {Opt_add, "ins:%d:%s"}, ++ {Opt_append, "append=%s"}, ++ {Opt_append, "append:%s"}, ++ {Opt_prepend, "prepend=%s"}, ++ {Opt_prepend, "prepend:%s"}, ++ ++ {Opt_del, "del=%s"}, ++ {Opt_del, "del:%s"}, ++ /* {Opt_idel, "idel:%d"}, */ ++ {Opt_mod, "mod=%s"}, ++ {Opt_mod, "mod:%s"}, ++ /* {Opt_imod, "imod:%d:%s"}, */ ++ ++ {Opt_dirwh, "dirwh=%d"}, ++ ++ {Opt_xino, "xino=%s"}, ++ {Opt_noxino, "noxino"}, ++ {Opt_trunc_xino, "trunc_xino"}, ++ {Opt_trunc_xino_v, "trunc_xino_v=%d:%d"}, ++ {Opt_notrunc_xino, "notrunc_xino"}, ++ {Opt_trunc_xino_path, "trunc_xino=%s"}, ++ {Opt_itrunc_xino, "itrunc_xino=%d"}, ++ /* {Opt_zxino, "zxino=%s"}, */ ++ {Opt_trunc_xib, "trunc_xib"}, ++ {Opt_notrunc_xib, "notrunc_xib"}, ++ ++#ifdef CONFIG_PROC_FS ++ {Opt_plink, "plink"}, ++#else ++ {Opt_ignore_silent, "plink"}, ++#endif ++ ++ {Opt_noplink, "noplink"}, ++ ++#ifdef CONFIG_AUFS_DEBUG ++ {Opt_list_plink, "list_plink"}, ++#endif ++ ++ {Opt_udba, "udba=%s"}, ++ ++ {Opt_dio, "dio"}, ++ {Opt_nodio, "nodio"}, ++ ++ {Opt_diropq_a, "diropq=always"}, ++ {Opt_diropq_a, "diropq=a"}, ++ {Opt_diropq_w, "diropq=whiteouted"}, ++ {Opt_diropq_w, "diropq=w"}, ++ ++ {Opt_warn_perm, "warn_perm"}, ++ {Opt_nowarn_perm, "nowarn_perm"}, ++ ++ /* keep them temporary */ ++ {Opt_ignore_silent, "coo=%s"}, ++ {Opt_ignore_silent, "nodlgt"}, ++ {Opt_ignore_silent, "nodirperm1"}, ++ {Opt_ignore_silent, "clean_plink"}, ++ ++#ifdef CONFIG_AUFS_SHWH ++ {Opt_shwh, "shwh"}, ++#endif ++ {Opt_noshwh, "noshwh"}, ++ ++ {Opt_rendir, "rendir=%d"}, ++ ++ {Opt_refrof, "refrof"}, ++ {Opt_norefrof, "norefrof"}, ++ ++ {Opt_verbose, "verbose"}, ++ {Opt_verbose, "v"}, ++ {Opt_noverbose, "noverbose"}, ++ {Opt_noverbose, "quiet"}, ++ {Opt_noverbose, "q"}, ++ {Opt_noverbose, "silent"}, ++ ++ {Opt_sum, "sum"}, ++ {Opt_nosum, "nosum"}, ++ {Opt_wsum, "wsum"}, ++ ++ {Opt_rdcache, "rdcache=%d"}, ++ {Opt_rdblk, "rdblk=%d"}, ++ {Opt_rdblk_def, "rdblk=def"}, ++ {Opt_rdhash, "rdhash=%d"}, ++ {Opt_rdhash_def, "rdhash=def"}, ++ ++ {Opt_wbr_create, "create=%s"}, ++ {Opt_wbr_create, "create_policy=%s"}, ++ {Opt_wbr_copyup, "cpup=%s"}, ++ {Opt_wbr_copyup, "copyup=%s"}, ++ {Opt_wbr_copyup, 
"copyup_policy=%s"}, ++ ++ /* internal use for the scripts */ ++ {Opt_ignore_silent, "si=%s"}, ++ ++ {Opt_br, "dirs=%s"}, ++ {Opt_ignore, "debug=%d"}, ++ {Opt_ignore, "delete=whiteout"}, ++ {Opt_ignore, "delete=all"}, ++ {Opt_ignore, "imap=%s"}, ++ ++ /* temporary workaround, due to old mount(8)? */ ++ {Opt_ignore_silent, "relatime"}, ++ ++ {Opt_err, NULL} ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static const char *au_parser_pattern(int val, struct match_token *token) ++{ ++ while (token->pattern) { ++ if (token->token == val) ++ return token->pattern; ++ token++; ++ } ++ BUG(); ++ return "??"; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static match_table_t brperm = { ++ {AuBrPerm_RO, AUFS_BRPERM_RO}, ++ {AuBrPerm_RR, AUFS_BRPERM_RR}, ++ {AuBrPerm_RW, AUFS_BRPERM_RW}, ++ {0, NULL} ++}; ++ ++static match_table_t brrattr = { ++ {AuBrRAttr_WH, AUFS_BRRATTR_WH}, ++ {0, NULL} ++}; ++ ++static match_table_t brwattr = { ++ {AuBrWAttr_NoLinkWH, AUFS_BRWATTR_NLWH}, ++ {0, NULL} ++}; ++ ++#define AuBrStr_LONGEST AUFS_BRPERM_RW "+" AUFS_BRWATTR_NLWH ++ ++static int br_attr_val(char *str, match_table_t table, substring_t args[]) ++{ ++ int attr, v; ++ char *p; ++ ++ attr = 0; ++ do { ++ p = strchr(str, '+'); ++ if (p) ++ *p = 0; ++ v = match_token(str, table, args); ++ if (v) ++ attr |= v; ++ else { ++ if (p) ++ *p = '+'; ++ pr_warning("ignored branch attribute %s\n", str); ++ break; ++ } ++ if (p) ++ str = p + 1; ++ } while (p); ++ ++ return attr; ++} ++ ++static int noinline_for_stack br_perm_val(char *perm) ++{ ++ int val; ++ char *p; ++ substring_t args[MAX_OPT_ARGS]; ++ ++ p = strchr(perm, '+'); ++ if (p) ++ *p = 0; ++ val = match_token(perm, brperm, args); ++ if (!val) { ++ if (p) ++ *p = '+'; ++ pr_warning("ignored branch permission %s\n", perm); ++ val = AuBrPerm_RO; ++ goto out; ++ } ++ if (!p) ++ goto out; ++ ++ switch (val) { ++ case AuBrPerm_RO: ++ case AuBrPerm_RR: ++ val |= br_attr_val(p + 1, brrattr, args); ++ break; ++ case AuBrPerm_RW: ++ val |= br_attr_val(p + 1, brwattr, args); ++ break; ++ } ++ ++out: ++ return val; ++} ++ ++/* Caller should free the return value */ ++char *au_optstr_br_perm(int brperm) ++{ ++ char *p, a[sizeof(AuBrStr_LONGEST)]; ++ int sz; ++ ++#define SetPerm(str) do { \ ++ sz = sizeof(str); \ ++ memcpy(a, str, sz); \ ++ p = a + sz - 1; \ ++ } while (0) ++ ++#define AppendAttr(flag, str) do { \ ++ if (brperm & flag) { \ ++ sz = sizeof(str); \ ++ *p++ = '+'; \ ++ memcpy(p, str, sz); \ ++ p += sz - 1; \ ++ } \ ++ } while (0) ++ ++ switch (brperm & AuBrPerm_Mask) { ++ case AuBrPerm_RO: ++ SetPerm(AUFS_BRPERM_RO); ++ break; ++ case AuBrPerm_RR: ++ SetPerm(AUFS_BRPERM_RR); ++ break; ++ case AuBrPerm_RW: ++ SetPerm(AUFS_BRPERM_RW); ++ break; ++ default: ++ AuDebugOn(1); ++ } ++ ++ AppendAttr(AuBrRAttr_WH, AUFS_BRRATTR_WH); ++ AppendAttr(AuBrWAttr_NoLinkWH, AUFS_BRWATTR_NLWH); ++ ++ AuDebugOn(strlen(a) >= sizeof(a)); ++ return kstrdup(a, GFP_NOFS); ++#undef SetPerm ++#undef AppendAttr ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static match_table_t udbalevel = { ++ {AuOpt_UDBA_REVAL, "reval"}, ++ {AuOpt_UDBA_NONE, "none"}, ++#ifdef CONFIG_AUFS_HNOTIFY ++ {AuOpt_UDBA_HNOTIFY, "notify"}, /* abstraction */ ++#ifdef CONFIG_AUFS_HFSNOTIFY ++ {AuOpt_UDBA_HNOTIFY, "fsnotify"}, ++#endif ++#endif ++ {-1, NULL} ++}; ++ ++static int noinline_for_stack udba_val(char *str) ++{ ++ substring_t args[MAX_OPT_ARGS]; ++ ++ return 
match_token(str, udbalevel, args); ++} ++ ++const char *au_optstr_udba(int udba) ++{ ++ return au_parser_pattern(udba, (void *)udbalevel); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static match_table_t au_wbr_create_policy = { ++ {AuWbrCreate_TDP, "tdp"}, ++ {AuWbrCreate_TDP, "top-down-parent"}, ++ {AuWbrCreate_RR, "rr"}, ++ {AuWbrCreate_RR, "round-robin"}, ++ {AuWbrCreate_MFS, "mfs"}, ++ {AuWbrCreate_MFS, "most-free-space"}, ++ {AuWbrCreate_MFSV, "mfs:%d"}, ++ {AuWbrCreate_MFSV, "most-free-space:%d"}, ++ ++ {AuWbrCreate_MFSRR, "mfsrr:%d"}, ++ {AuWbrCreate_MFSRRV, "mfsrr:%d:%d"}, ++ {AuWbrCreate_PMFS, "pmfs"}, ++ {AuWbrCreate_PMFSV, "pmfs:%d"}, ++ ++ {-1, NULL} ++}; ++ ++/* ++ * cf. linux/lib/parser.c and cmdline.c ++ * gave up calling memparse() since it uses simple_strtoull() instead of ++ * kstrto...(). ++ */ ++static int noinline_for_stack ++au_match_ull(substring_t *s, unsigned long long *result) ++{ ++ int err; ++ unsigned int len; ++ char a[32]; ++ ++ err = -ERANGE; ++ len = s->to - s->from; ++ if (len + 1 <= sizeof(a)) { ++ memcpy(a, s->from, len); ++ a[len] = '\0'; ++ err = kstrtoull(a, 0, result); ++ } ++ return err; ++} ++ ++static int au_wbr_mfs_wmark(substring_t *arg, char *str, ++ struct au_opt_wbr_create *create) ++{ ++ int err; ++ unsigned long long ull; ++ ++ err = 0; ++ if (!au_match_ull(arg, &ull)) ++ create->mfsrr_watermark = ull; ++ else { ++ pr_err("bad integer in %s\n", str); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++static int au_wbr_mfs_sec(substring_t *arg, char *str, ++ struct au_opt_wbr_create *create) ++{ ++ int n, err; ++ ++ err = 0; ++ if (!match_int(arg, &n) && 0 <= n && n <= AUFS_MFS_MAX_SEC) ++ create->mfs_second = n; ++ else { ++ pr_err("bad integer in %s\n", str); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++static int noinline_for_stack ++au_wbr_create_val(char *str, struct au_opt_wbr_create *create) ++{ ++ int err, e; ++ substring_t args[MAX_OPT_ARGS]; ++ ++ err = match_token(str, au_wbr_create_policy, args); ++ create->wbr_create = err; ++ switch (err) { ++ case AuWbrCreate_MFSRRV: ++ e = au_wbr_mfs_wmark(&args[0], str, create); ++ if (!e) ++ e = au_wbr_mfs_sec(&args[1], str, create); ++ if (unlikely(e)) ++ err = e; ++ break; ++ case AuWbrCreate_MFSRR: ++ e = au_wbr_mfs_wmark(&args[0], str, create); ++ if (unlikely(e)) { ++ err = e; ++ break; ++ } ++ /*FALLTHROUGH*/ ++ case AuWbrCreate_MFS: ++ case AuWbrCreate_PMFS: ++ create->mfs_second = AUFS_MFS_DEF_SEC; ++ break; ++ case AuWbrCreate_MFSV: ++ case AuWbrCreate_PMFSV: ++ e = au_wbr_mfs_sec(&args[0], str, create); ++ if (unlikely(e)) ++ err = e; ++ break; ++ } ++ ++ return err; ++} ++ ++const char *au_optstr_wbr_create(int wbr_create) ++{ ++ return au_parser_pattern(wbr_create, (void *)au_wbr_create_policy); ++} ++ ++static match_table_t au_wbr_copyup_policy = { ++ {AuWbrCopyup_TDP, "tdp"}, ++ {AuWbrCopyup_TDP, "top-down-parent"}, ++ {AuWbrCopyup_BUP, "bup"}, ++ {AuWbrCopyup_BUP, "bottom-up-parent"}, ++ {AuWbrCopyup_BU, "bu"}, ++ {AuWbrCopyup_BU, "bottom-up"}, ++ {-1, NULL} ++}; ++ ++static int noinline_for_stack au_wbr_copyup_val(char *str) ++{ ++ substring_t args[MAX_OPT_ARGS]; ++ ++ return match_token(str, au_wbr_copyup_policy, args); ++} ++ ++const char *au_optstr_wbr_copyup(int wbr_copyup) ++{ ++ return au_parser_pattern(wbr_copyup, (void *)au_wbr_copyup_policy); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static const int lkup_dirflags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; ++ ++static void 
dump_opts(struct au_opts *opts) ++{ ++#ifdef CONFIG_AUFS_DEBUG ++ /* reduce stack space */ ++ union { ++ struct au_opt_add *add; ++ struct au_opt_del *del; ++ struct au_opt_mod *mod; ++ struct au_opt_xino *xino; ++ struct au_opt_xino_itrunc *xino_itrunc; ++ struct au_opt_wbr_create *create; ++ } u; ++ struct au_opt *opt; ++ ++ opt = opts->opt; ++ while (opt->type != Opt_tail) { ++ switch (opt->type) { ++ case Opt_add: ++ u.add = &opt->add; ++ AuDbg("add {b%d, %s, 0x%x, %p}\n", ++ u.add->bindex, u.add->pathname, u.add->perm, ++ u.add->path.dentry); ++ break; ++ case Opt_del: ++ case Opt_idel: ++ u.del = &opt->del; ++ AuDbg("del {%s, %p}\n", ++ u.del->pathname, u.del->h_path.dentry); ++ break; ++ case Opt_mod: ++ case Opt_imod: ++ u.mod = &opt->mod; ++ AuDbg("mod {%s, 0x%x, %p}\n", ++ u.mod->path, u.mod->perm, u.mod->h_root); ++ break; ++ case Opt_append: ++ u.add = &opt->add; ++ AuDbg("append {b%d, %s, 0x%x, %p}\n", ++ u.add->bindex, u.add->pathname, u.add->perm, ++ u.add->path.dentry); ++ break; ++ case Opt_prepend: ++ u.add = &opt->add; ++ AuDbg("prepend {b%d, %s, 0x%x, %p}\n", ++ u.add->bindex, u.add->pathname, u.add->perm, ++ u.add->path.dentry); ++ break; ++ case Opt_dirwh: ++ AuDbg("dirwh %d\n", opt->dirwh); ++ break; ++ case Opt_rdcache: ++ AuDbg("rdcache %d\n", opt->rdcache); ++ break; ++ case Opt_rdblk: ++ AuDbg("rdblk %u\n", opt->rdblk); ++ break; ++ case Opt_rdblk_def: ++ AuDbg("rdblk_def\n"); ++ break; ++ case Opt_rdhash: ++ AuDbg("rdhash %u\n", opt->rdhash); ++ break; ++ case Opt_rdhash_def: ++ AuDbg("rdhash_def\n"); ++ break; ++ case Opt_xino: ++ u.xino = &opt->xino; ++ AuDbg("xino {%s %.*s}\n", ++ u.xino->path, ++ AuDLNPair(u.xino->file->f_dentry)); ++ break; ++ case Opt_trunc_xino: ++ AuLabel(trunc_xino); ++ break; ++ case Opt_notrunc_xino: ++ AuLabel(notrunc_xino); ++ break; ++ case Opt_trunc_xino_path: ++ case Opt_itrunc_xino: ++ u.xino_itrunc = &opt->xino_itrunc; ++ AuDbg("trunc_xino %d\n", u.xino_itrunc->bindex); ++ break; ++ ++ case Opt_noxino: ++ AuLabel(noxino); ++ break; ++ case Opt_trunc_xib: ++ AuLabel(trunc_xib); ++ break; ++ case Opt_notrunc_xib: ++ AuLabel(notrunc_xib); ++ break; ++ case Opt_shwh: ++ AuLabel(shwh); ++ break; ++ case Opt_noshwh: ++ AuLabel(noshwh); ++ break; ++ case Opt_plink: ++ AuLabel(plink); ++ break; ++ case Opt_noplink: ++ AuLabel(noplink); ++ break; ++ case Opt_list_plink: ++ AuLabel(list_plink); ++ break; ++ case Opt_udba: ++ AuDbg("udba %d, %s\n", ++ opt->udba, au_optstr_udba(opt->udba)); ++ break; ++ case Opt_dio: ++ AuLabel(dio); ++ break; ++ case Opt_nodio: ++ AuLabel(nodio); ++ break; ++ case Opt_diropq_a: ++ AuLabel(diropq_a); ++ break; ++ case Opt_diropq_w: ++ AuLabel(diropq_w); ++ break; ++ case Opt_warn_perm: ++ AuLabel(warn_perm); ++ break; ++ case Opt_nowarn_perm: ++ AuLabel(nowarn_perm); ++ break; ++ case Opt_refrof: ++ AuLabel(refrof); ++ break; ++ case Opt_norefrof: ++ AuLabel(norefrof); ++ break; ++ case Opt_verbose: ++ AuLabel(verbose); ++ break; ++ case Opt_noverbose: ++ AuLabel(noverbose); ++ break; ++ case Opt_sum: ++ AuLabel(sum); ++ break; ++ case Opt_nosum: ++ AuLabel(nosum); ++ break; ++ case Opt_wsum: ++ AuLabel(wsum); ++ break; ++ case Opt_wbr_create: ++ u.create = &opt->wbr_create; ++ AuDbg("create %d, %s\n", u.create->wbr_create, ++ au_optstr_wbr_create(u.create->wbr_create)); ++ switch (u.create->wbr_create) { ++ case AuWbrCreate_MFSV: ++ case AuWbrCreate_PMFSV: ++ AuDbg("%d sec\n", u.create->mfs_second); ++ break; ++ case AuWbrCreate_MFSRR: ++ AuDbg("%llu watermark\n", ++ u.create->mfsrr_watermark); ++ break; 
++ case AuWbrCreate_MFSRRV: ++ AuDbg("%llu watermark, %d sec\n", ++ u.create->mfsrr_watermark, ++ u.create->mfs_second); ++ break; ++ } ++ break; ++ case Opt_wbr_copyup: ++ AuDbg("copyup %d, %s\n", opt->wbr_copyup, ++ au_optstr_wbr_copyup(opt->wbr_copyup)); ++ break; ++ default: ++ BUG(); ++ } ++ opt++; ++ } ++#endif ++} ++ ++void au_opts_free(struct au_opts *opts) ++{ ++ struct au_opt *opt; ++ ++ opt = opts->opt; ++ while (opt->type != Opt_tail) { ++ switch (opt->type) { ++ case Opt_add: ++ case Opt_append: ++ case Opt_prepend: ++ path_put(&opt->add.path); ++ break; ++ case Opt_del: ++ case Opt_idel: ++ path_put(&opt->del.h_path); ++ break; ++ case Opt_mod: ++ case Opt_imod: ++ dput(opt->mod.h_root); ++ break; ++ case Opt_xino: ++ fput(opt->xino.file); ++ break; ++ } ++ opt++; ++ } ++} ++ ++static int opt_add(struct au_opt *opt, char *opt_str, unsigned long sb_flags, ++ aufs_bindex_t bindex) ++{ ++ int err; ++ struct au_opt_add *add = &opt->add; ++ char *p; ++ ++ add->bindex = bindex; ++ add->perm = AuBrPerm_RO; ++ add->pathname = opt_str; ++ p = strchr(opt_str, '='); ++ if (p) { ++ *p++ = 0; ++ if (*p) ++ add->perm = br_perm_val(p); ++ } ++ ++ err = vfsub_kern_path(add->pathname, lkup_dirflags, &add->path); ++ if (!err) { ++ if (!p) { ++ add->perm = AuBrPerm_RO; ++ if (au_test_fs_rr(add->path.dentry->d_sb)) ++ add->perm = AuBrPerm_RR; ++ else if (!bindex && !(sb_flags & MS_RDONLY)) ++ add->perm = AuBrPerm_RW; ++ } ++ opt->type = Opt_add; ++ goto out; ++ } ++ pr_err("lookup failed %s (%d)\n", add->pathname, err); ++ err = -EINVAL; ++ ++out: ++ return err; ++} ++ ++static int au_opts_parse_del(struct au_opt_del *del, substring_t args[]) ++{ ++ int err; ++ ++ del->pathname = args[0].from; ++ AuDbg("del path %s\n", del->pathname); ++ ++ err = vfsub_kern_path(del->pathname, lkup_dirflags, &del->h_path); ++ if (unlikely(err)) ++ pr_err("lookup failed %s (%d)\n", del->pathname, err); ++ ++ return err; ++} ++ ++#if 0 /* reserved for future use */ ++static int au_opts_parse_idel(struct super_block *sb, aufs_bindex_t bindex, ++ struct au_opt_del *del, substring_t args[]) ++{ ++ int err; ++ struct dentry *root; ++ ++ err = -EINVAL; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_FLUSH); ++ if (bindex < 0 || au_sbend(sb) < bindex) { ++ pr_err("out of bounds, %d\n", bindex); ++ goto out; ++ } ++ ++ err = 0; ++ del->h_path.dentry = dget(au_h_dptr(root, bindex)); ++ del->h_path.mnt = mntget(au_sbr_mnt(sb, bindex)); ++ ++out: ++ aufs_read_unlock(root, !AuLock_IR); ++ return err; ++} ++#endif ++ ++static int noinline_for_stack ++au_opts_parse_mod(struct au_opt_mod *mod, substring_t args[]) ++{ ++ int err; ++ struct path path; ++ char *p; ++ ++ err = -EINVAL; ++ mod->path = args[0].from; ++ p = strchr(mod->path, '='); ++ if (unlikely(!p)) { ++ pr_err("no permssion %s\n", args[0].from); ++ goto out; ++ } ++ ++ *p++ = 0; ++ err = vfsub_kern_path(mod->path, lkup_dirflags, &path); ++ if (unlikely(err)) { ++ pr_err("lookup failed %s (%d)\n", mod->path, err); ++ goto out; ++ } ++ ++ mod->perm = br_perm_val(p); ++ AuDbg("mod path %s, perm 0x%x, %s\n", mod->path, mod->perm, p); ++ mod->h_root = dget(path.dentry); ++ path_put(&path); ++ ++out: ++ return err; ++} ++ ++#if 0 /* reserved for future use */ ++static int au_opts_parse_imod(struct super_block *sb, aufs_bindex_t bindex, ++ struct au_opt_mod *mod, substring_t args[]) ++{ ++ int err; ++ struct dentry *root; ++ ++ err = -EINVAL; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_FLUSH); ++ if (bindex < 0 || au_sbend(sb) < bindex) { ++ pr_err("out of 
bounds, %d\n", bindex); ++ goto out; ++ } ++ ++ err = 0; ++ mod->perm = br_perm_val(args[1].from); ++ AuDbg("mod path %s, perm 0x%x, %s\n", ++ mod->path, mod->perm, args[1].from); ++ mod->h_root = dget(au_h_dptr(root, bindex)); ++ ++out: ++ aufs_read_unlock(root, !AuLock_IR); ++ return err; ++} ++#endif ++ ++static int au_opts_parse_xino(struct super_block *sb, struct au_opt_xino *xino, ++ substring_t args[]) ++{ ++ int err; ++ struct file *file; ++ ++ file = au_xino_create(sb, args[0].from, /*silent*/0); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ ++ err = -EINVAL; ++ if (unlikely(file->f_dentry->d_sb == sb)) { ++ fput(file); ++ pr_err("%s must be outside\n", args[0].from); ++ goto out; ++ } ++ ++ err = 0; ++ xino->file = file; ++ xino->path = args[0].from; ++ ++out: ++ return err; ++} ++ ++static int noinline_for_stack ++au_opts_parse_xino_itrunc_path(struct super_block *sb, ++ struct au_opt_xino_itrunc *xino_itrunc, ++ substring_t args[]) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct path path; ++ struct dentry *root; ++ ++ err = vfsub_kern_path(args[0].from, lkup_dirflags, &path); ++ if (unlikely(err)) { ++ pr_err("lookup failed %s (%d)\n", args[0].from, err); ++ goto out; ++ } ++ ++ xino_itrunc->bindex = -1; ++ root = sb->s_root; ++ aufs_read_lock(root, AuLock_FLUSH); ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ if (au_h_dptr(root, bindex) == path.dentry) { ++ xino_itrunc->bindex = bindex; ++ break; ++ } ++ } ++ aufs_read_unlock(root, !AuLock_IR); ++ path_put(&path); ++ ++ if (unlikely(xino_itrunc->bindex < 0)) { ++ pr_err("no such branch %s\n", args[0].from); ++ err = -EINVAL; ++ } ++ ++out: ++ return err; ++} ++ ++/* called without aufs lock */ ++int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts) ++{ ++ int err, n, token; ++ aufs_bindex_t bindex; ++ unsigned char skipped; ++ struct dentry *root; ++ struct au_opt *opt, *opt_tail; ++ char *opt_str; ++ /* reduce the stack space */ ++ union { ++ struct au_opt_xino_itrunc *xino_itrunc; ++ struct au_opt_wbr_create *create; ++ } u; ++ struct { ++ substring_t args[MAX_OPT_ARGS]; ++ } *a; ++ ++ err = -ENOMEM; ++ a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (unlikely(!a)) ++ goto out; ++ ++ root = sb->s_root; ++ err = 0; ++ bindex = 0; ++ opt = opts->opt; ++ opt_tail = opt + opts->max_opt - 1; ++ opt->type = Opt_tail; ++ while (!err && (opt_str = strsep(&str, ",")) && *opt_str) { ++ err = -EINVAL; ++ skipped = 0; ++ token = match_token(opt_str, options, a->args); ++ switch (token) { ++ case Opt_br: ++ err = 0; ++ while (!err && (opt_str = strsep(&a->args[0].from, ":")) ++ && *opt_str) { ++ err = opt_add(opt, opt_str, opts->sb_flags, ++ bindex++); ++ if (unlikely(!err && ++opt > opt_tail)) { ++ err = -E2BIG; ++ break; ++ } ++ opt->type = Opt_tail; ++ skipped = 1; ++ } ++ break; ++ case Opt_add: ++ if (unlikely(match_int(&a->args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ bindex = n; ++ err = opt_add(opt, a->args[1].from, opts->sb_flags, ++ bindex); ++ if (!err) ++ opt->type = token; ++ break; ++ case Opt_append: ++ err = opt_add(opt, a->args[0].from, opts->sb_flags, ++ /*dummy bindex*/1); ++ if (!err) ++ opt->type = token; ++ break; ++ case Opt_prepend: ++ err = opt_add(opt, a->args[0].from, opts->sb_flags, ++ /*bindex*/0); ++ if (!err) ++ opt->type = token; ++ break; ++ case Opt_del: ++ err = au_opts_parse_del(&opt->del, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#if 0 /* reserved for future use */ ++ case Opt_idel: ++ 
del->pathname = "(indexed)"; ++ if (unlikely(match_int(&args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ err = au_opts_parse_idel(sb, n, &opt->del, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#endif ++ case Opt_mod: ++ err = au_opts_parse_mod(&opt->mod, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#ifdef IMOD /* reserved for future use */ ++ case Opt_imod: ++ u.mod->path = "(indexed)"; ++ if (unlikely(match_int(&a->args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ err = au_opts_parse_imod(sb, n, &opt->mod, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++#endif ++ case Opt_xino: ++ err = au_opts_parse_xino(sb, &opt->xino, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++ ++ case Opt_trunc_xino_path: ++ err = au_opts_parse_xino_itrunc_path ++ (sb, &opt->xino_itrunc, a->args); ++ if (!err) ++ opt->type = token; ++ break; ++ ++ case Opt_itrunc_xino: ++ u.xino_itrunc = &opt->xino_itrunc; ++ if (unlikely(match_int(&a->args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ u.xino_itrunc->bindex = n; ++ aufs_read_lock(root, AuLock_FLUSH); ++ if (n < 0 || au_sbend(sb) < n) { ++ pr_err("out of bounds, %d\n", n); ++ aufs_read_unlock(root, !AuLock_IR); ++ break; ++ } ++ aufs_read_unlock(root, !AuLock_IR); ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_dirwh: ++ if (unlikely(match_int(&a->args[0], &opt->dirwh))) ++ break; ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_rdcache: ++ if (unlikely(match_int(&a->args[0], &n))) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ if (unlikely(n > AUFS_RDCACHE_MAX)) { ++ pr_err("rdcache must be smaller than %d\n", ++ AUFS_RDCACHE_MAX); ++ break; ++ } ++ opt->rdcache = n; ++ err = 0; ++ opt->type = token; ++ break; ++ case Opt_rdblk: ++ if (unlikely(match_int(&a->args[0], &n) ++ || n < 0 ++ || n > KMALLOC_MAX_SIZE)) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ if (unlikely(n && n < NAME_MAX)) { ++ pr_err("rdblk must be larger than %d\n", ++ NAME_MAX); ++ break; ++ } ++ opt->rdblk = n; ++ err = 0; ++ opt->type = token; ++ break; ++ case Opt_rdhash: ++ if (unlikely(match_int(&a->args[0], &n) ++ || n < 0 ++ || n * sizeof(struct hlist_head) ++ > KMALLOC_MAX_SIZE)) { ++ pr_err("bad integer in %s\n", opt_str); ++ break; ++ } ++ opt->rdhash = n; ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_trunc_xino: ++ case Opt_notrunc_xino: ++ case Opt_noxino: ++ case Opt_trunc_xib: ++ case Opt_notrunc_xib: ++ case Opt_shwh: ++ case Opt_noshwh: ++ case Opt_plink: ++ case Opt_noplink: ++ case Opt_list_plink: ++ case Opt_dio: ++ case Opt_nodio: ++ case Opt_diropq_a: ++ case Opt_diropq_w: ++ case Opt_warn_perm: ++ case Opt_nowarn_perm: ++ case Opt_refrof: ++ case Opt_norefrof: ++ case Opt_verbose: ++ case Opt_noverbose: ++ case Opt_sum: ++ case Opt_nosum: ++ case Opt_wsum: ++ case Opt_rdblk_def: ++ case Opt_rdhash_def: ++ err = 0; ++ opt->type = token; ++ break; ++ ++ case Opt_udba: ++ opt->udba = udba_val(a->args[0].from); ++ if (opt->udba >= 0) { ++ err = 0; ++ opt->type = token; ++ } else ++ pr_err("wrong value, %s\n", opt_str); ++ break; ++ ++ case Opt_wbr_create: ++ u.create = &opt->wbr_create; ++ u.create->wbr_create ++ = au_wbr_create_val(a->args[0].from, u.create); ++ if (u.create->wbr_create >= 0) { ++ err = 0; ++ opt->type = token; ++ } else ++ pr_err("wrong value, %s\n", opt_str); ++ break; ++ case Opt_wbr_copyup: ++ opt->wbr_copyup = au_wbr_copyup_val(a->args[0].from); ++ if 
(opt->wbr_copyup >= 0) { ++ err = 0; ++ opt->type = token; ++ } else ++ pr_err("wrong value, %s\n", opt_str); ++ break; ++ ++ case Opt_ignore: ++ pr_warning("ignored %s\n", opt_str); ++ /*FALLTHROUGH*/ ++ case Opt_ignore_silent: ++ skipped = 1; ++ err = 0; ++ break; ++ case Opt_err: ++ pr_err("unknown option %s\n", opt_str); ++ break; ++ } ++ ++ if (!err && !skipped) { ++ if (unlikely(++opt > opt_tail)) { ++ err = -E2BIG; ++ opt--; ++ opt->type = Opt_tail; ++ break; ++ } ++ opt->type = Opt_tail; ++ } ++ } ++ ++ kfree(a); ++ dump_opts(opts); ++ if (unlikely(err)) ++ au_opts_free(opts); ++ ++out: ++ return err; ++} ++ ++static int au_opt_wbr_create(struct super_block *sb, ++ struct au_opt_wbr_create *create) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 1; /* handled */ ++ sbinfo = au_sbi(sb); ++ if (sbinfo->si_wbr_create_ops->fin) { ++ err = sbinfo->si_wbr_create_ops->fin(sb); ++ if (!err) ++ err = 1; ++ } ++ ++ sbinfo->si_wbr_create = create->wbr_create; ++ sbinfo->si_wbr_create_ops = au_wbr_create_ops + create->wbr_create; ++ switch (create->wbr_create) { ++ case AuWbrCreate_MFSRRV: ++ case AuWbrCreate_MFSRR: ++ sbinfo->si_wbr_mfs.mfsrr_watermark = create->mfsrr_watermark; ++ /*FALLTHROUGH*/ ++ case AuWbrCreate_MFS: ++ case AuWbrCreate_MFSV: ++ case AuWbrCreate_PMFS: ++ case AuWbrCreate_PMFSV: ++ sbinfo->si_wbr_mfs.mfs_expire ++ = msecs_to_jiffies(create->mfs_second * MSEC_PER_SEC); ++ break; ++ } ++ ++ if (sbinfo->si_wbr_create_ops->init) ++ sbinfo->si_wbr_create_ops->init(sb); /* ignore */ ++ ++ return err; ++} ++ ++/* ++ * returns, ++ * plus: processed without an error ++ * zero: unprocessed ++ */ ++static int au_opt_simple(struct super_block *sb, struct au_opt *opt, ++ struct au_opts *opts) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 1; /* handled */ ++ sbinfo = au_sbi(sb); ++ switch (opt->type) { ++ case Opt_udba: ++ sbinfo->si_mntflags &= ~AuOptMask_UDBA; ++ sbinfo->si_mntflags |= opt->udba; ++ opts->given_udba |= opt->udba; ++ break; ++ ++ case Opt_plink: ++ au_opt_set(sbinfo->si_mntflags, PLINK); ++ break; ++ case Opt_noplink: ++ if (au_opt_test(sbinfo->si_mntflags, PLINK)) ++ au_plink_put(sb, /*verbose*/1); ++ au_opt_clr(sbinfo->si_mntflags, PLINK); ++ break; ++ case Opt_list_plink: ++ if (au_opt_test(sbinfo->si_mntflags, PLINK)) ++ au_plink_list(sb); ++ break; ++ ++ case Opt_dio: ++ au_opt_set(sbinfo->si_mntflags, DIO); ++ au_fset_opts(opts->flags, REFRESH_DYAOP); ++ break; ++ case Opt_nodio: ++ au_opt_clr(sbinfo->si_mntflags, DIO); ++ au_fset_opts(opts->flags, REFRESH_DYAOP); ++ break; ++ ++ case Opt_diropq_a: ++ au_opt_set(sbinfo->si_mntflags, ALWAYS_DIROPQ); ++ break; ++ case Opt_diropq_w: ++ au_opt_clr(sbinfo->si_mntflags, ALWAYS_DIROPQ); ++ break; ++ ++ case Opt_warn_perm: ++ au_opt_set(sbinfo->si_mntflags, WARN_PERM); ++ break; ++ case Opt_nowarn_perm: ++ au_opt_clr(sbinfo->si_mntflags, WARN_PERM); ++ break; ++ ++ case Opt_refrof: ++ au_opt_set(sbinfo->si_mntflags, REFROF); ++ break; ++ case Opt_norefrof: ++ au_opt_clr(sbinfo->si_mntflags, REFROF); ++ break; ++ ++ case Opt_verbose: ++ au_opt_set(sbinfo->si_mntflags, VERBOSE); ++ break; ++ case Opt_noverbose: ++ au_opt_clr(sbinfo->si_mntflags, VERBOSE); ++ break; ++ ++ case Opt_sum: ++ au_opt_set(sbinfo->si_mntflags, SUM); ++ break; ++ case Opt_wsum: ++ au_opt_clr(sbinfo->si_mntflags, SUM); ++ au_opt_set(sbinfo->si_mntflags, SUM_W); ++ case Opt_nosum: ++ au_opt_clr(sbinfo->si_mntflags, SUM); ++ au_opt_clr(sbinfo->si_mntflags, SUM_W); ++ break; ++ 
++ case Opt_wbr_create: ++ err = au_opt_wbr_create(sb, &opt->wbr_create); ++ break; ++ case Opt_wbr_copyup: ++ sbinfo->si_wbr_copyup = opt->wbr_copyup; ++ sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + opt->wbr_copyup; ++ break; ++ ++ case Opt_dirwh: ++ sbinfo->si_dirwh = opt->dirwh; ++ break; ++ ++ case Opt_rdcache: ++ sbinfo->si_rdcache ++ = msecs_to_jiffies(opt->rdcache * MSEC_PER_SEC); ++ break; ++ case Opt_rdblk: ++ sbinfo->si_rdblk = opt->rdblk; ++ break; ++ case Opt_rdblk_def: ++ sbinfo->si_rdblk = AUFS_RDBLK_DEF; ++ break; ++ case Opt_rdhash: ++ sbinfo->si_rdhash = opt->rdhash; ++ break; ++ case Opt_rdhash_def: ++ sbinfo->si_rdhash = AUFS_RDHASH_DEF; ++ break; ++ ++ case Opt_shwh: ++ au_opt_set(sbinfo->si_mntflags, SHWH); ++ break; ++ case Opt_noshwh: ++ au_opt_clr(sbinfo->si_mntflags, SHWH); ++ break; ++ ++ case Opt_trunc_xino: ++ au_opt_set(sbinfo->si_mntflags, TRUNC_XINO); ++ break; ++ case Opt_notrunc_xino: ++ au_opt_clr(sbinfo->si_mntflags, TRUNC_XINO); ++ break; ++ ++ case Opt_trunc_xino_path: ++ case Opt_itrunc_xino: ++ err = au_xino_trunc(sb, opt->xino_itrunc.bindex); ++ if (!err) ++ err = 1; ++ break; ++ ++ case Opt_trunc_xib: ++ au_fset_opts(opts->flags, TRUNC_XIB); ++ break; ++ case Opt_notrunc_xib: ++ au_fclr_opts(opts->flags, TRUNC_XIB); ++ break; ++ ++ default: ++ err = 0; ++ break; ++ } ++ ++ return err; ++} ++ ++/* ++ * returns tri-state. ++ * plus: processed without an error ++ * zero: unprocessed ++ * minus: error ++ */ ++static int au_opt_br(struct super_block *sb, struct au_opt *opt, ++ struct au_opts *opts) ++{ ++ int err, do_refresh; ++ ++ err = 0; ++ switch (opt->type) { ++ case Opt_append: ++ opt->add.bindex = au_sbend(sb) + 1; ++ if (opt->add.bindex < 0) ++ opt->add.bindex = 0; ++ goto add; ++ case Opt_prepend: ++ opt->add.bindex = 0; ++ add: ++ case Opt_add: ++ err = au_br_add(sb, &opt->add, ++ au_ftest_opts(opts->flags, REMOUNT)); ++ if (!err) { ++ err = 1; ++ au_fset_opts(opts->flags, REFRESH); ++ } ++ break; ++ ++ case Opt_del: ++ case Opt_idel: ++ err = au_br_del(sb, &opt->del, ++ au_ftest_opts(opts->flags, REMOUNT)); ++ if (!err) { ++ err = 1; ++ au_fset_opts(opts->flags, TRUNC_XIB); ++ au_fset_opts(opts->flags, REFRESH); ++ } ++ break; ++ ++ case Opt_mod: ++ case Opt_imod: ++ err = au_br_mod(sb, &opt->mod, ++ au_ftest_opts(opts->flags, REMOUNT), ++ &do_refresh); ++ if (!err) { ++ err = 1; ++ if (do_refresh) ++ au_fset_opts(opts->flags, REFRESH); ++ } ++ break; ++ } ++ ++ return err; ++} ++ ++static int au_opt_xino(struct super_block *sb, struct au_opt *opt, ++ struct au_opt_xino **opt_xino, ++ struct au_opts *opts) ++{ ++ int err; ++ aufs_bindex_t bend, bindex; ++ struct dentry *root, *parent, *h_root; ++ ++ err = 0; ++ switch (opt->type) { ++ case Opt_xino: ++ err = au_xino_set(sb, &opt->xino, ++ !!au_ftest_opts(opts->flags, REMOUNT)); ++ if (unlikely(err)) ++ break; ++ ++ *opt_xino = &opt->xino; ++ au_xino_brid_set(sb, -1); ++ ++ /* safe d_parent access */ ++ parent = opt->xino.file->f_dentry->d_parent; ++ root = sb->s_root; ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ h_root = au_h_dptr(root, bindex); ++ if (h_root == parent) { ++ au_xino_brid_set(sb, au_sbr_id(sb, bindex)); ++ break; ++ } ++ } ++ break; ++ ++ case Opt_noxino: ++ au_xino_clr(sb); ++ au_xino_brid_set(sb, -1); ++ *opt_xino = (void *)-1; ++ break; ++ } ++ ++ return err; ++} ++ ++int au_opts_verify(struct super_block *sb, unsigned long sb_flags, ++ unsigned int pending) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ unsigned char do_plink, skip, 
do_free; ++ struct au_branch *br; ++ struct au_wbr *wbr; ++ struct dentry *root; ++ struct inode *dir, *h_dir; ++ struct au_sbinfo *sbinfo; ++ struct au_hinode *hdir; ++ ++ SiMustAnyLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!(sbinfo->si_mntflags & AuOptMask_UDBA)); ++ ++ if (!(sb_flags & MS_RDONLY)) { ++ if (unlikely(!au_br_writable(au_sbr_perm(sb, 0)))) ++ pr_warning("first branch should be rw\n"); ++ if (unlikely(au_opt_test(sbinfo->si_mntflags, SHWH))) ++ pr_warning("shwh should be used with ro\n"); ++ } ++ ++ if (au_opt_test((sbinfo->si_mntflags | pending), UDBA_HNOTIFY) ++ && !au_opt_test(sbinfo->si_mntflags, XINO)) ++ pr_warning("udba=*notify requires xino\n"); ++ ++ err = 0; ++ root = sb->s_root; ++ dir = root->d_inode; ++ do_plink = !!au_opt_test(sbinfo->si_mntflags, PLINK); ++ bend = au_sbend(sb); ++ for (bindex = 0; !err && bindex <= bend; bindex++) { ++ skip = 0; ++ h_dir = au_h_iptr(dir, bindex); ++ br = au_sbr(sb, bindex); ++ do_free = 0; ++ ++ wbr = br->br_wbr; ++ if (wbr) ++ wbr_wh_read_lock(wbr); ++ ++ if (!au_br_writable(br->br_perm)) { ++ do_free = !!wbr; ++ skip = (!wbr ++ || (!wbr->wbr_whbase ++ && !wbr->wbr_plink ++ && !wbr->wbr_orph)); ++ } else if (!au_br_wh_linkable(br->br_perm)) { ++ /* skip = (!br->br_whbase && !br->br_orph); */ ++ skip = (!wbr || !wbr->wbr_whbase); ++ if (skip && wbr) { ++ if (do_plink) ++ skip = !!wbr->wbr_plink; ++ else ++ skip = !wbr->wbr_plink; ++ } ++ } else { ++ /* skip = (br->br_whbase && br->br_ohph); */ ++ skip = (wbr && wbr->wbr_whbase); ++ if (skip) { ++ if (do_plink) ++ skip = !!wbr->wbr_plink; ++ else ++ skip = !wbr->wbr_plink; ++ } ++ } ++ if (wbr) ++ wbr_wh_read_unlock(wbr); ++ ++ if (skip) ++ continue; ++ ++ hdir = au_hi(dir, bindex); ++ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ if (wbr) ++ wbr_wh_write_lock(wbr); ++ err = au_wh_init(au_h_dptr(root, bindex), br, sb); ++ if (wbr) ++ wbr_wh_write_unlock(wbr); ++ au_hn_imtx_unlock(hdir); ++ ++ if (!err && do_free) { ++ kfree(wbr); ++ br->br_wbr = NULL; ++ } ++ } ++ ++ return err; ++} ++ ++int au_opts_mount(struct super_block *sb, struct au_opts *opts) ++{ ++ int err; ++ unsigned int tmp; ++ aufs_bindex_t bindex, bend; ++ struct au_opt *opt; ++ struct au_opt_xino *opt_xino, xino; ++ struct au_sbinfo *sbinfo; ++ struct au_branch *br; ++ ++ SiMustWriteLock(sb); ++ ++ err = 0; ++ opt_xino = NULL; ++ opt = opts->opt; ++ while (err >= 0 && opt->type != Opt_tail) ++ err = au_opt_simple(sb, opt++, opts); ++ if (err > 0) ++ err = 0; ++ else if (unlikely(err < 0)) ++ goto out; ++ ++ /* disable xino and udba temporary */ ++ sbinfo = au_sbi(sb); ++ tmp = sbinfo->si_mntflags; ++ au_opt_clr(sbinfo->si_mntflags, XINO); ++ au_opt_set_udba(sbinfo->si_mntflags, UDBA_REVAL); ++ ++ opt = opts->opt; ++ while (err >= 0 && opt->type != Opt_tail) ++ err = au_opt_br(sb, opt++, opts); ++ if (err > 0) ++ err = 0; ++ else if (unlikely(err < 0)) ++ goto out; ++ ++ bend = au_sbend(sb); ++ if (unlikely(bend < 0)) { ++ err = -EINVAL; ++ pr_err("no branches\n"); ++ goto out; ++ } ++ ++ if (au_opt_test(tmp, XINO)) ++ au_opt_set(sbinfo->si_mntflags, XINO); ++ opt = opts->opt; ++ while (!err && opt->type != Opt_tail) ++ err = au_opt_xino(sb, opt++, &opt_xino, opts); ++ if (unlikely(err)) ++ goto out; ++ ++ err = au_opts_verify(sb, sb->s_flags, tmp); ++ if (unlikely(err)) ++ goto out; ++ ++ /* restore xino */ ++ if (au_opt_test(tmp, XINO) && !opt_xino) { ++ xino.file = au_xino_def(sb); ++ err = PTR_ERR(xino.file); ++ if (IS_ERR(xino.file)) ++ goto out; ++ ++ err = au_xino_set(sb, &xino, /*remount*/0); ++ 
fput(xino.file); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ /* restore udba */ ++ tmp &= AuOptMask_UDBA; ++ sbinfo->si_mntflags &= ~AuOptMask_UDBA; ++ sbinfo->si_mntflags |= tmp; ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ err = au_hnotify_reset_br(tmp, br, br->br_perm); ++ if (unlikely(err)) ++ AuIOErr("hnotify failed on br %d, %d, ignored\n", ++ bindex, err); ++ /* go on even if err */ ++ } ++ if (au_opt_test(tmp, UDBA_HNOTIFY)) { ++ struct inode *dir = sb->s_root->d_inode; ++ au_hn_reset(dir, au_hi_flags(dir, /*isdir*/1) & ~AuHi_XINO); ++ } ++ ++out: ++ return err; ++} ++ ++int au_opts_remount(struct super_block *sb, struct au_opts *opts) ++{ ++ int err, rerr; ++ struct inode *dir; ++ struct au_opt_xino *opt_xino; ++ struct au_opt *opt; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ dir = sb->s_root->d_inode; ++ sbinfo = au_sbi(sb); ++ err = 0; ++ opt_xino = NULL; ++ opt = opts->opt; ++ while (err >= 0 && opt->type != Opt_tail) { ++ err = au_opt_simple(sb, opt, opts); ++ if (!err) ++ err = au_opt_br(sb, opt, opts); ++ if (!err) ++ err = au_opt_xino(sb, opt, &opt_xino, opts); ++ opt++; ++ } ++ if (err > 0) ++ err = 0; ++ AuTraceErr(err); ++ /* go on even err */ ++ ++ rerr = au_opts_verify(sb, opts->sb_flags, /*pending*/0); ++ if (unlikely(rerr && !err)) ++ err = rerr; ++ ++ if (au_ftest_opts(opts->flags, TRUNC_XIB)) { ++ rerr = au_xib_trunc(sb); ++ if (unlikely(rerr && !err)) ++ err = rerr; ++ } ++ ++ /* will be handled by the caller */ ++ if (!au_ftest_opts(opts->flags, REFRESH) ++ && (opts->given_udba || au_opt_test(sbinfo->si_mntflags, XINO))) ++ au_fset_opts(opts->flags, REFRESH); ++ ++ AuDbg("status 0x%x\n", opts->flags); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++unsigned int au_opt_udba(struct super_block *sb) ++{ ++ return au_mntflags(sb) & AuOptMask_UDBA; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/opts.h 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,209 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * mount options/flags ++ */ ++ ++#ifndef __AUFS_OPTS_H__ ++#define __AUFS_OPTS_H__ ++ ++#ifdef __KERNEL__ ++ ++#include <linux/path.h> ++ ++struct file; ++struct super_block; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* mount flags */ ++#define AuOpt_XINO 1 /* external inode number bitmap ++ and translation table */ ++#define AuOpt_TRUNC_XINO (1 << 1) /* truncate xino files */ ++#define AuOpt_UDBA_NONE (1 << 2) /* users direct branch access */ ++#define AuOpt_UDBA_REVAL (1 << 3) ++#define AuOpt_UDBA_HNOTIFY (1 << 4) ++#define AuOpt_SHWH (1 << 5) /* show whiteout */ ++#define AuOpt_PLINK (1 << 6) /* pseudo-link */ ++#define AuOpt_DIRPERM1 (1 << 7) /* unimplemented */ ++#define AuOpt_REFROF (1 << 8) /* unimplemented */ ++#define AuOpt_ALWAYS_DIROPQ (1 << 9) /* policy to creating diropq */ ++#define AuOpt_SUM (1 << 10) /* summation for statfs(2) */ ++#define AuOpt_SUM_W (1 << 11) /* unimplemented */ ++#define AuOpt_WARN_PERM (1 << 12) /* warn when add-branch */ ++#define AuOpt_VERBOSE (1 << 13) /* busy inode when del-branch */ ++#define AuOpt_DIO (1 << 14) /* direct io */ ++ ++#ifndef CONFIG_AUFS_HNOTIFY ++#undef AuOpt_UDBA_HNOTIFY ++#define AuOpt_UDBA_HNOTIFY 0 ++#endif ++#ifndef CONFIG_AUFS_SHWH ++#undef AuOpt_SHWH ++#define AuOpt_SHWH 0 ++#endif ++ ++#define AuOpt_Def (AuOpt_XINO \ ++ | AuOpt_UDBA_REVAL \ ++ | AuOpt_PLINK \ ++ /* | AuOpt_DIRPERM1 */ \ ++ | AuOpt_WARN_PERM) ++#define AuOptMask_UDBA (AuOpt_UDBA_NONE \ ++ | AuOpt_UDBA_REVAL \ ++ | AuOpt_UDBA_HNOTIFY) ++ ++#define au_opt_test(flags, name) (flags & AuOpt_##name) ++#define au_opt_set(flags, name) do { \ ++ BUILD_BUG_ON(AuOpt_##name & AuOptMask_UDBA); \ ++ ((flags) |= AuOpt_##name); \ ++} while (0) ++#define au_opt_set_udba(flags, name) do { \ ++ (flags) &= ~AuOptMask_UDBA; \ ++ ((flags) |= AuOpt_##name); \ ++} while (0) ++#define au_opt_clr(flags, name) do { \ ++ ((flags) &= ~AuOpt_##name); \ ++} while (0) ++ ++static inline unsigned int au_opts_plink(unsigned int mntflags) ++{ ++#ifdef CONFIG_PROC_FS ++ return mntflags; ++#else ++ return mntflags & ~AuOpt_PLINK; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policies to select one among multiple writable branches */ ++enum { ++ AuWbrCreate_TDP, /* top down parent */ ++ AuWbrCreate_RR, /* round robin */ ++ AuWbrCreate_MFS, /* most free space */ ++ AuWbrCreate_MFSV, /* mfs with seconds */ ++ AuWbrCreate_MFSRR, /* mfs then rr */ ++ AuWbrCreate_MFSRRV, /* mfs then rr with seconds */ ++ AuWbrCreate_PMFS, /* parent and mfs */ ++ AuWbrCreate_PMFSV, /* parent and mfs with seconds */ ++ ++ AuWbrCreate_Def = AuWbrCreate_TDP ++}; ++ ++enum { ++ AuWbrCopyup_TDP, /* top down parent */ ++ AuWbrCopyup_BUP, /* bottom up parent */ ++ AuWbrCopyup_BU, /* bottom up */ ++ ++ AuWbrCopyup_Def = AuWbrCopyup_TDP ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_opt_add { ++ aufs_bindex_t bindex; ++ char *pathname; ++ int perm; ++ struct path path; ++}; ++ ++struct au_opt_del { ++ char *pathname; ++ struct path h_path; ++}; ++ ++struct au_opt_mod { ++ char *path; ++ int perm; ++ struct dentry *h_root; ++}; ++ ++struct au_opt_xino { ++ char *path; ++ struct file *file; ++}; ++ ++struct au_opt_xino_itrunc { ++ aufs_bindex_t bindex; ++}; 
++ ++struct au_opt_wbr_create { ++ int wbr_create; ++ int mfs_second; ++ unsigned long long mfsrr_watermark; ++}; ++ ++struct au_opt { ++ int type; ++ union { ++ struct au_opt_xino xino; ++ struct au_opt_xino_itrunc xino_itrunc; ++ struct au_opt_add add; ++ struct au_opt_del del; ++ struct au_opt_mod mod; ++ int dirwh; ++ int rdcache; ++ unsigned int rdblk; ++ unsigned int rdhash; ++ int udba; ++ struct au_opt_wbr_create wbr_create; ++ int wbr_copyup; ++ }; ++}; ++ ++/* opts flags */ ++#define AuOpts_REMOUNT 1 ++#define AuOpts_REFRESH (1 << 1) ++#define AuOpts_TRUNC_XIB (1 << 2) ++#define AuOpts_REFRESH_DYAOP (1 << 3) ++#define au_ftest_opts(flags, name) ((flags) & AuOpts_##name) ++#define au_fset_opts(flags, name) \ ++ do { (flags) |= AuOpts_##name; } while (0) ++#define au_fclr_opts(flags, name) \ ++ do { (flags) &= ~AuOpts_##name; } while (0) ++ ++struct au_opts { ++ struct au_opt *opt; ++ int max_opt; ++ ++ unsigned int given_udba; ++ unsigned int flags; ++ unsigned long sb_flags; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++char *au_optstr_br_perm(int brperm); ++const char *au_optstr_udba(int udba); ++const char *au_optstr_wbr_copyup(int wbr_copyup); ++const char *au_optstr_wbr_create(int wbr_create); ++ ++void au_opts_free(struct au_opts *opts); ++int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts); ++int au_opts_verify(struct super_block *sb, unsigned long sb_flags, ++ unsigned int pending); ++int au_opts_mount(struct super_block *sb, struct au_opts *opts); ++int au_opts_remount(struct super_block *sb, struct au_opts *opts); ++ ++unsigned int au_opt_udba(struct super_block *sb); ++ ++/* ---------------------------------------------------------------------- */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_OPTS_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/plink.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,515 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * pseudo-link ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * the pseudo-link maintenance mode. ++ * during a user process maintains the pseudo-links, ++ * prohibit adding a new plink and branch manipulation. ++ * ++ * Flags ++ * NOPLM: ++ * For entry functions which will handle plink, and i_mutex is already held ++ * in VFS. ++ * They cannot wait and should return an error at once. ++ * Callers has to check the error. ++ * NOPLMW: ++ * For entry functions which will handle plink, but i_mutex is not held ++ * in VFS. ++ * They can wait the plink maintenance mode to finish. ++ * ++ * They behave like F_SETLK and F_SETLKW. ++ * If the caller never handle plink, then both flags are unnecessary. 
++ */ ++ ++int au_plink_maint(struct super_block *sb, int flags) ++{ ++ int err; ++ pid_t pid, ppid; ++ struct au_sbinfo *sbi; ++ ++ SiMustAnyLock(sb); ++ ++ err = 0; ++ if (!au_opt_test(au_mntflags(sb), PLINK)) ++ goto out; ++ ++ sbi = au_sbi(sb); ++ pid = sbi->si_plink_maint_pid; ++ if (!pid || pid == current->pid) ++ goto out; ++ ++ /* todo: it highly depends upon /sbin/mount.aufs */ ++ rcu_read_lock(); ++ ppid = task_pid_vnr(rcu_dereference(current->real_parent)); ++ rcu_read_unlock(); ++ if (pid == ppid) ++ goto out; ++ ++ if (au_ftest_lock(flags, NOPLMW)) { ++ /* if there is no i_mutex lock in VFS, we don't need to wait */ ++ /* AuDebugOn(!lockdep_depth(current)); */ ++ while (sbi->si_plink_maint_pid) { ++ si_read_unlock(sb); ++ /* gave up wake_up_bit() */ ++ wait_event(sbi->si_plink_wq, !sbi->si_plink_maint_pid); ++ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&sbi->si_nowait); ++ si_noflush_read_lock(sb); ++ } ++ } else if (au_ftest_lock(flags, NOPLM)) { ++ AuDbg("ppid %d, pid %d\n", ppid, pid); ++ err = -EAGAIN; ++ } ++ ++out: ++ return err; ++} ++ ++void au_plink_maint_leave(struct au_sbinfo *sbinfo) ++{ ++ spin_lock(&sbinfo->si_plink_maint_lock); ++ sbinfo->si_plink_maint_pid = 0; ++ spin_unlock(&sbinfo->si_plink_maint_lock); ++ wake_up_all(&sbinfo->si_plink_wq); ++} ++ ++int au_plink_maint_enter(struct super_block *sb) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ /* make sure i am the only one in this fs */ ++ si_write_lock(sb, AuLock_FLUSH); ++ if (au_opt_test(au_mntflags(sb), PLINK)) { ++ spin_lock(&sbinfo->si_plink_maint_lock); ++ if (!sbinfo->si_plink_maint_pid) ++ sbinfo->si_plink_maint_pid = current->pid; ++ else ++ err = -EBUSY; ++ spin_unlock(&sbinfo->si_plink_maint_lock); ++ } ++ si_write_unlock(sb); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct pseudo_link { ++ union { ++ struct list_head list; ++ struct rcu_head rcu; ++ }; ++ struct inode *inode; ++}; ++ ++#ifdef CONFIG_AUFS_DEBUG ++void au_plink_list(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink; ++ ++ SiMustAnyLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); ++ ++ plink_list = &sbinfo->si_plink.head; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(plink, plink_list, list) ++ AuDbg("%lu\n", plink->inode->i_ino); ++ rcu_read_unlock(); ++} ++#endif ++ ++/* is the inode pseudo-linked? */ ++int au_plink_test(struct inode *inode) ++{ ++ int found; ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink; ++ ++ sbinfo = au_sbi(inode->i_sb); ++ AuRwMustAnyLock(&sbinfo->si_rwsem); ++ AuDebugOn(!au_opt_test(au_mntflags(inode->i_sb), PLINK)); ++ AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM)); ++ ++ found = 0; ++ plink_list = &sbinfo->si_plink.head; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(plink, plink_list, list) ++ if (plink->inode == inode) { ++ found = 1; ++ break; ++ } ++ rcu_read_unlock(); ++ return found; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * generate a name for plink. ++ * the file will be stored under AUFS_WH_PLINKDIR. 
++ */ ++/* 20 is max digits length of ulong 64 */ ++#define PLINK_NAME_LEN ((20 + 1) * 2) ++ ++static int plink_name(char *name, int len, struct inode *inode, ++ aufs_bindex_t bindex) ++{ ++ int rlen; ++ struct inode *h_inode; ++ ++ h_inode = au_h_iptr(inode, bindex); ++ rlen = snprintf(name, len, "%lu.%lu", inode->i_ino, h_inode->i_ino); ++ return rlen; ++} ++ ++struct au_do_plink_lkup_args { ++ struct dentry **errp; ++ struct qstr *tgtname; ++ struct dentry *h_parent; ++ struct au_branch *br; ++}; ++ ++static struct dentry *au_do_plink_lkup(struct qstr *tgtname, ++ struct dentry *h_parent, ++ struct au_branch *br) ++{ ++ struct dentry *h_dentry; ++ struct mutex *h_mtx; ++ ++ h_mtx = &h_parent->d_inode->i_mutex; ++ mutex_lock_nested(h_mtx, AuLsc_I_CHILD2); ++ h_dentry = au_lkup_one(tgtname, h_parent, br, /*nd*/NULL); ++ mutex_unlock(h_mtx); ++ return h_dentry; ++} ++ ++static void au_call_do_plink_lkup(void *args) ++{ ++ struct au_do_plink_lkup_args *a = args; ++ *a->errp = au_do_plink_lkup(a->tgtname, a->h_parent, a->br); ++} ++ ++/* lookup the plink-ed @inode under the branch at @bindex */ ++struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex) ++{ ++ struct dentry *h_dentry, *h_parent; ++ struct au_branch *br; ++ struct inode *h_dir; ++ int wkq_err; ++ char a[PLINK_NAME_LEN]; ++ struct qstr tgtname = { ++ .name = a ++ }; ++ ++ AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM)); ++ ++ br = au_sbr(inode->i_sb, bindex); ++ h_parent = br->br_wbr->wbr_plink; ++ h_dir = h_parent->d_inode; ++ tgtname.len = plink_name(a, sizeof(a), inode, bindex); ++ ++ if (current_fsuid()) { ++ struct au_do_plink_lkup_args args = { ++ .errp = &h_dentry, ++ .tgtname = &tgtname, ++ .h_parent = h_parent, ++ .br = br ++ }; ++ ++ wkq_err = au_wkq_wait(au_call_do_plink_lkup, &args); ++ if (unlikely(wkq_err)) ++ h_dentry = ERR_PTR(wkq_err); ++ } else ++ h_dentry = au_do_plink_lkup(&tgtname, h_parent, br); ++ ++ return h_dentry; ++} ++ ++/* create a pseudo-link */ ++static int do_whplink(struct qstr *tgt, struct dentry *h_parent, ++ struct dentry *h_dentry, struct au_branch *br) ++{ ++ int err; ++ struct path h_path = { ++ .mnt = br->br_mnt ++ }; ++ struct inode *h_dir; ++ ++ h_dir = h_parent->d_inode; ++ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_CHILD2); ++again: ++ h_path.dentry = au_lkup_one(tgt, h_parent, br, /*nd*/NULL); ++ err = PTR_ERR(h_path.dentry); ++ if (IS_ERR(h_path.dentry)) ++ goto out; ++ ++ err = 0; ++ /* wh.plink dir is not monitored */ ++ /* todo: is it really safe? 
*/ ++ if (h_path.dentry->d_inode ++ && h_path.dentry->d_inode != h_dentry->d_inode) { ++ err = vfsub_unlink(h_dir, &h_path, /*force*/0); ++ dput(h_path.dentry); ++ h_path.dentry = NULL; ++ if (!err) ++ goto again; ++ } ++ if (!err && !h_path.dentry->d_inode) ++ err = vfsub_link(h_dentry, h_dir, &h_path); ++ dput(h_path.dentry); ++ ++out: ++ mutex_unlock(&h_dir->i_mutex); ++ return err; ++} ++ ++struct do_whplink_args { ++ int *errp; ++ struct qstr *tgt; ++ struct dentry *h_parent; ++ struct dentry *h_dentry; ++ struct au_branch *br; ++}; ++ ++static void call_do_whplink(void *args) ++{ ++ struct do_whplink_args *a = args; ++ *a->errp = do_whplink(a->tgt, a->h_parent, a->h_dentry, a->br); ++} ++ ++static int whplink(struct dentry *h_dentry, struct inode *inode, ++ aufs_bindex_t bindex, struct au_branch *br) ++{ ++ int err, wkq_err; ++ struct au_wbr *wbr; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ char a[PLINK_NAME_LEN]; ++ struct qstr tgtname = { ++ .name = a ++ }; ++ ++ wbr = au_sbr(inode->i_sb, bindex)->br_wbr; ++ h_parent = wbr->wbr_plink; ++ h_dir = h_parent->d_inode; ++ tgtname.len = plink_name(a, sizeof(a), inode, bindex); ++ ++ /* always superio. */ ++ if (current_fsuid()) { ++ struct do_whplink_args args = { ++ .errp = &err, ++ .tgt = &tgtname, ++ .h_parent = h_parent, ++ .h_dentry = h_dentry, ++ .br = br ++ }; ++ wkq_err = au_wkq_wait(call_do_whplink, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } else ++ err = do_whplink(&tgtname, h_parent, h_dentry, br); ++ ++ return err; ++} ++ ++/* free a single plink */ ++static void do_put_plink(struct pseudo_link *plink, int do_del) ++{ ++ if (do_del) ++ list_del(&plink->list); ++ iput(plink->inode); ++ kfree(plink); ++} ++ ++static void do_put_plink_rcu(struct rcu_head *rcu) ++{ ++ struct pseudo_link *plink; ++ ++ plink = container_of(rcu, struct pseudo_link, rcu); ++ iput(plink->inode); ++ kfree(plink); ++} ++ ++/* ++ * create a new pseudo-link for @h_dentry on @bindex. ++ * the linked inode is held in aufs @inode. 
++ */ ++void au_plink_append(struct inode *inode, aufs_bindex_t bindex, ++ struct dentry *h_dentry) ++{ ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink, *tmp; ++ int found, err, cnt; ++ ++ sb = inode->i_sb; ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); ++ ++ cnt = 0; ++ found = 0; ++ plink_list = &sbinfo->si_plink.head; ++ rcu_read_lock(); ++ list_for_each_entry_rcu(plink, plink_list, list) { ++ cnt++; ++ if (plink->inode == inode) { ++ found = 1; ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ if (found) ++ return; ++ ++ tmp = kmalloc(sizeof(*plink), GFP_NOFS); ++ if (tmp) ++ tmp->inode = au_igrab(inode); ++ else { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ spin_lock(&sbinfo->si_plink.spin); ++ list_for_each_entry(plink, plink_list, list) { ++ if (plink->inode == inode) { ++ found = 1; ++ break; ++ } ++ } ++ if (!found) ++ list_add_rcu(&tmp->list, plink_list); ++ spin_unlock(&sbinfo->si_plink.spin); ++ if (!found) { ++ cnt++; ++ WARN_ONCE(cnt > AUFS_PLINK_WARN, ++ "unexpectedly many pseudo links, %d\n", cnt); ++ err = whplink(h_dentry, inode, bindex, au_sbr(sb, bindex)); ++ } else { ++ do_put_plink(tmp, 0); ++ return; ++ } ++ ++out: ++ if (unlikely(err)) { ++ pr_warning("err %d, damaged pseudo link.\n", err); ++ if (tmp) { ++ au_spl_del_rcu(&tmp->list, &sbinfo->si_plink); ++ call_rcu(&tmp->rcu, do_put_plink_rcu); ++ } ++ } ++} ++ ++/* free all plinks */ ++void au_plink_put(struct super_block *sb, int verbose) ++{ ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink, *tmp; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); ++ ++ plink_list = &sbinfo->si_plink.head; ++ /* no spin_lock since sbinfo is write-locked */ ++ WARN(verbose && !list_empty(plink_list), "pseudo-link is not flushed"); ++ list_for_each_entry_safe(plink, tmp, plink_list, list) ++ do_put_plink(plink, 0); ++ INIT_LIST_HEAD(plink_list); ++} ++ ++void au_plink_clean(struct super_block *sb, int verbose) ++{ ++ struct dentry *root; ++ ++ root = sb->s_root; ++ aufs_write_lock(root); ++ if (au_opt_test(au_mntflags(sb), PLINK)) ++ au_plink_put(sb, verbose); ++ aufs_write_unlock(root); ++} ++ ++/* free the plinks on a branch specified by @br_id */ ++void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id) ++{ ++ struct au_sbinfo *sbinfo; ++ struct list_head *plink_list; ++ struct pseudo_link *plink, *tmp; ++ struct inode *inode; ++ aufs_bindex_t bstart, bend, bindex; ++ unsigned char do_put; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); ++ AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); ++ ++ plink_list = &sbinfo->si_plink.head; ++ /* no spin_lock since sbinfo is write-locked */ ++ list_for_each_entry_safe(plink, tmp, plink_list, list) { ++ do_put = 0; ++ inode = au_igrab(plink->inode); ++ ii_write_lock_child(inode); ++ bstart = au_ibstart(inode); ++ bend = au_ibend(inode); ++ if (bstart >= 0) { ++ for (bindex = bstart; bindex <= bend; bindex++) { ++ if (!au_h_iptr(inode, bindex) ++ || au_ii_br_id(inode, bindex) != br_id) ++ continue; ++ au_set_h_iptr(inode, bindex, NULL, 0); ++ do_put = 1; ++ break; ++ } ++ } else ++ do_put_plink(plink, 1); ++ ++ if (do_put) { ++ for (bindex = bstart; bindex <= bend; bindex++) ++ if (au_h_iptr(inode, bindex)) { ++ do_put = 0; ++ break; ++ } ++ if 
(do_put) ++ do_put_plink(plink, 1); ++ } ++ ii_write_unlock(inode); ++ iput(inode); ++ } ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/poll.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,56 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * poll operation ++ * There is only one filesystem which implements ->poll operation, currently. ++ */ ++ ++#include "aufs.h" ++ ++unsigned int aufs_poll(struct file *file, poll_table *wait) ++{ ++ unsigned int mask; ++ int err; ++ struct file *h_file; ++ struct dentry *dentry; ++ struct super_block *sb; ++ ++ /* We should pretend an error happened. */ ++ mask = POLLERR /* | POLLIN | POLLOUT */; ++ dentry = file->f_dentry; ++ sb = dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); ++ err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); ++ if (unlikely(err)) ++ goto out; ++ ++ /* it is not an error if h_file has no operation */ ++ mask = DEFAULT_POLLMASK; ++ h_file = au_hf_top(file); ++ if (h_file->f_op && h_file->f_op->poll) ++ mask = h_file->f_op->poll(h_file, wait); ++ ++ di_read_unlock(dentry, AuLock_IR); ++ fi_read_unlock(file); ++ ++out: ++ si_read_unlock(sb); ++ AuTraceErr((int)mask); ++ return mask; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/procfs.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,170 @@ ++/* ++ * Copyright (C) 2010-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/*
++ * procfs interfaces
++ */
++
++#include <linux/proc_fs.h>
++#include "aufs.h"
++
++static int au_procfs_plm_release(struct inode *inode, struct file *file)
++{
++ struct au_sbinfo *sbinfo;
++
++ sbinfo = file->private_data;
++ if (sbinfo) {
++ au_plink_maint_leave(sbinfo);
++ kobject_put(&sbinfo->si_kobj);
++ }
++
++ return 0;
++}
++
++static void au_procfs_plm_write_clean(struct file *file)
++{
++ struct au_sbinfo *sbinfo;
++
++ sbinfo = file->private_data;
++ if (sbinfo)
++ au_plink_clean(sbinfo->si_sb, /*verbose*/0);
++}
++
++static int au_procfs_plm_write_si(struct file *file, unsigned long id)
++{
++ int err;
++ struct super_block *sb;
++ struct au_sbinfo *sbinfo;
++
++ err = -EBUSY;
++ if (unlikely(file->private_data))
++ goto out;
++
++ sb = NULL;
++ /* don't use au_sbilist_lock() here */
++ spin_lock(&au_sbilist.spin);
++ list_for_each_entry(sbinfo, &au_sbilist.head, si_list)
++ if (id == sysaufs_si_id(sbinfo)) {
++ kobject_get(&sbinfo->si_kobj);
++ sb = sbinfo->si_sb;
++ break;
++ }
++ spin_unlock(&au_sbilist.spin);
++
++ err = -EINVAL;
++ if (unlikely(!sb))
++ goto out;
++
++ err = au_plink_maint_enter(sb);
++ if (!err)
++ /* keep kobject_get() */
++ file->private_data = sbinfo;
++ else
++ kobject_put(&sbinfo->si_kobj);
++out:
++ return err;
++}
++
++/*
++ * Accept a valid "si=xxxx" only.
++ * Once it is accepted successfully, accept "clean" too.
++ */
++static ssize_t au_procfs_plm_write(struct file *file, const char __user *ubuf,
++ size_t count, loff_t *ppos)
++{
++ ssize_t err;
++ unsigned long id;
++ /* last newline is allowed */
++ char buf[3 + sizeof(unsigned long) * 2 + 1];
++
++ err = -EACCES;
++ if (unlikely(!capable(CAP_SYS_ADMIN)))
++ goto out;
++
++ err = -EINVAL;
++ if (unlikely(count > sizeof(buf)))
++ goto out;
++
++ err = copy_from_user(buf, ubuf, count);
++ if (unlikely(err)) {
++ err = -EFAULT;
++ goto out;
++ }
++ buf[count] = 0;
++
++ err = -EINVAL;
++ if (!strcmp("clean", buf)) {
++ au_procfs_plm_write_clean(file);
++ goto out_success;
++ } else if (unlikely(strncmp("si=", buf, 3)))
++ goto out;
++
++ err = kstrtoul(buf + 3, 16, &id);
++ if (unlikely(err))
++ goto out;
++
++ err = au_procfs_plm_write_si(file, id);
++ if (unlikely(err))
++ goto out;
++
++out_success:
++ err = count; /* success */
++out:
++ return err;
++}
++
++static const struct file_operations au_procfs_plm_fop = {
++ .write = au_procfs_plm_write,
++ .release = au_procfs_plm_release,
++ .owner = THIS_MODULE
++};
++
++/* ---------------------------------------------------------------------- */
++
++static struct proc_dir_entry *au_procfs_dir;
++
++void au_procfs_fin(void)
++{
++ remove_proc_entry(AUFS_PLINK_MAINT_NAME, au_procfs_dir);
++ remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL);
++}
++
++int __init au_procfs_init(void)
++{
++ int err;
++ struct proc_dir_entry *entry;
++
++ err = -ENOMEM;
++ au_procfs_dir = proc_mkdir(AUFS_PLINK_MAINT_DIR, NULL);
++ if (unlikely(!au_procfs_dir))
++ goto out;
++
++ entry = proc_create(AUFS_PLINK_MAINT_NAME, S_IFREG | S_IWUSR,
++ au_procfs_dir, &au_procfs_plm_fop);
++ if (unlikely(!entry))
++ goto out_dir;
++
++ err = 0;
++ goto out; /* success */
++
++
++out_dir:
++ remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL);
++out:
++ return err;
++}
+--- /dev/null 2012-03-14 12:35:58.848999748 +0100
++++ b/fs/aufs/rdu.c 2012-03-20 17:31:18.000000000 +0100
+@@ -0,0 +1,383 @@
++/*
++ * Copyright (C) 2005-2012 Junjiro R. Okajima
++ *
++ * This program, aufs is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/*
++ * readdir in userspace.
++ */
++
++#include <linux/compat.h>
++#include <linux/fs_stack.h>
++#include <linux/security.h>
++#include "aufs.h"
++
++/* bits for struct aufs_rdu.flags */
++#define AuRdu_CALLED 1
++#define AuRdu_CONT (1 << 1)
++#define AuRdu_FULL (1 << 2)
++#define au_ftest_rdu(flags, name) ((flags) & AuRdu_##name)
++#define au_fset_rdu(flags, name) \
++ do { (flags) |= AuRdu_##name; } while (0)
++#define au_fclr_rdu(flags, name) \
++ do { (flags) &= ~AuRdu_##name; } while (0)
++
++struct au_rdu_arg {
++ struct aufs_rdu *rdu;
++ union au_rdu_ent_ul ent;
++ unsigned long end;
++
++ struct super_block *sb;
++ int err;
++};
++
++static int au_rdu_fill(void *__arg, const char *name, int nlen,
++ loff_t offset, u64 h_ino, unsigned int d_type)
++{
++ int err, len;
++ struct au_rdu_arg *arg = __arg;
++ struct aufs_rdu *rdu = arg->rdu;
++ struct au_rdu_ent ent;
++
++ err = 0;
++ arg->err = 0;
++ au_fset_rdu(rdu->cookie.flags, CALLED);
++ len = au_rdu_len(nlen);
++ if (arg->ent.ul + len < arg->end) {
++ ent.ino = h_ino;
++ ent.bindex = rdu->cookie.bindex;
++ ent.type = d_type;
++ ent.nlen = nlen;
++ if (unlikely(nlen > AUFS_MAX_NAMELEN))
++ ent.type = DT_UNKNOWN;
++
++ /* unnecessary to support mmap_sem since this is a dir */
++ err = -EFAULT;
++ if (copy_to_user(arg->ent.e, &ent, sizeof(ent)))
++ goto out;
++ if (copy_to_user(arg->ent.e->name, name, nlen))
++ goto out;
++ /* the terminating NULL */
++ if (__put_user(0, arg->ent.e->name + nlen))
++ goto out;
++ err = 0;
++ /* AuDbg("%p, %.*s\n", arg->ent.p, nlen, name); */
++ arg->ent.ul += len;
++ rdu->rent++;
++ } else {
++ err = -EFAULT;
++ au_fset_rdu(rdu->cookie.flags, FULL);
++ rdu->full = 1;
++ rdu->tail = arg->ent;
++ }
++
++out:
++ /* AuTraceErr(err); */
++ return err;
++}
++
++static int au_rdu_do(struct file *h_file, struct au_rdu_arg *arg)
++{
++ int err;
++ loff_t offset;
++ struct au_rdu_cookie *cookie = &arg->rdu->cookie;
++
++ offset = vfsub_llseek(h_file, cookie->h_pos, SEEK_SET);
++ err = offset;
++ if (unlikely(offset != cookie->h_pos))
++ goto out;
++
++ err = 0;
++ do {
++ arg->err = 0;
++ au_fclr_rdu(cookie->flags, CALLED);
++ /* smp_mb(); */
++ err = vfsub_readdir(h_file, au_rdu_fill, arg);
++ if (err >= 0)
++ err = arg->err;
++ } while (!err
++ && au_ftest_rdu(cookie->flags, CALLED)
++ && !au_ftest_rdu(cookie->flags, FULL));
++ cookie->h_pos = h_file->f_pos;
++
++out:
++ AuTraceErr(err);
++ return err;
++}
++
++static int au_rdu(struct file *file, struct aufs_rdu *rdu)
++{
++ int err;
++ aufs_bindex_t bend;
++ struct au_rdu_arg arg;
++ struct dentry *dentry;
++ struct inode *inode;
++ struct file *h_file;
++ struct au_rdu_cookie *cookie = &rdu->cookie;
++
++ err = !access_ok(VERIFY_WRITE, rdu->ent.e, rdu->sz);
++ if (unlikely(err)) {
++ err
= -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ rdu->rent = 0; ++ rdu->tail = rdu->ent; ++ rdu->full = 0; ++ arg.rdu = rdu; ++ arg.ent = rdu->ent; ++ arg.end = arg.ent.ul; ++ arg.end += rdu->sz; ++ ++ err = -ENOTDIR; ++ if (unlikely(!file->f_op || !file->f_op->readdir)) ++ goto out; ++ ++ err = security_file_permission(file, MAY_READ); ++ AuTraceErr(err); ++ if (unlikely(err)) ++ goto out; ++ ++ dentry = file->f_dentry; ++ inode = dentry->d_inode; ++#if 1 ++ mutex_lock(&inode->i_mutex); ++#else ++ err = mutex_lock_killable(&inode->i_mutex); ++ AuTraceErr(err); ++ if (unlikely(err)) ++ goto out; ++#endif ++ ++ arg.sb = inode->i_sb; ++ err = si_read_lock(arg.sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out_mtx; ++ err = au_alive_dir(dentry); ++ if (unlikely(err)) ++ goto out_si; ++ /* todo: reval? */ ++ fi_read_lock(file); ++ ++ err = -EAGAIN; ++ if (unlikely(au_ftest_rdu(cookie->flags, CONT) ++ && cookie->generation != au_figen(file))) ++ goto out_unlock; ++ ++ err = 0; ++ if (!rdu->blk) { ++ rdu->blk = au_sbi(arg.sb)->si_rdblk; ++ if (!rdu->blk) ++ rdu->blk = au_dir_size(file, /*dentry*/NULL); ++ } ++ bend = au_fbstart(file); ++ if (cookie->bindex < bend) ++ cookie->bindex = bend; ++ bend = au_fbend_dir(file); ++ /* AuDbg("b%d, b%d\n", cookie->bindex, bend); */ ++ for (; !err && cookie->bindex <= bend; ++ cookie->bindex++, cookie->h_pos = 0) { ++ h_file = au_hf_dir(file, cookie->bindex); ++ if (!h_file) ++ continue; ++ ++ au_fclr_rdu(cookie->flags, FULL); ++ err = au_rdu_do(h_file, &arg); ++ AuTraceErr(err); ++ if (unlikely(au_ftest_rdu(cookie->flags, FULL) || err)) ++ break; ++ } ++ AuDbg("rent %llu\n", rdu->rent); ++ ++ if (!err && !au_ftest_rdu(cookie->flags, CONT)) { ++ rdu->shwh = !!au_opt_test(au_sbi(arg.sb)->si_mntflags, SHWH); ++ au_fset_rdu(cookie->flags, CONT); ++ cookie->generation = au_figen(file); ++ } ++ ++ ii_read_lock_child(inode); ++ fsstack_copy_attr_atime(inode, au_h_iptr(inode, au_ibstart(inode))); ++ ii_read_unlock(inode); ++ ++out_unlock: ++ fi_read_unlock(file); ++out_si: ++ si_read_unlock(arg.sb); ++out_mtx: ++ mutex_unlock(&inode->i_mutex); ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_rdu_ino(struct file *file, struct aufs_rdu *rdu) ++{ ++ int err; ++ ino_t ino; ++ unsigned long long nent; ++ union au_rdu_ent_ul *u; ++ struct au_rdu_ent ent; ++ struct super_block *sb; ++ ++ err = 0; ++ nent = rdu->nent; ++ u = &rdu->ent; ++ sb = file->f_dentry->d_sb; ++ si_read_lock(sb, AuLock_FLUSH); ++ while (nent-- > 0) { ++ /* unnecessary to support mmap_sem since this is a dir */ ++ err = copy_from_user(&ent, u->e, sizeof(ent)); ++ if (!err) ++ err = !access_ok(VERIFY_WRITE, &u->e->ino, sizeof(ino)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ break; ++ } ++ ++ /* AuDbg("b%d, i%llu\n", ent.bindex, ent.ino); */ ++ if (!ent.wh) ++ err = au_ino(sb, ent.bindex, ent.ino, ent.type, &ino); ++ else ++ err = au_wh_ino(sb, ent.bindex, ent.ino, ent.type, ++ &ino); ++ if (unlikely(err)) { ++ AuTraceErr(err); ++ break; ++ } ++ ++ err = __put_user(ino, &u->e->ino); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ break; ++ } ++ u->ul += au_rdu_len(ent.nlen); ++ } ++ si_read_unlock(sb); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_rdu_verify(struct aufs_rdu *rdu) ++{ ++ AuDbg("rdu{%llu, %p, %u | %u | %llu, %u, %u | " ++ "%llu, b%d, 0x%x, g%u}\n", ++ rdu->sz, rdu->ent.e, rdu->verify[AufsCtlRduV_SZ], ++ rdu->blk, ++ rdu->rent, rdu->shwh, 
rdu->full, ++ rdu->cookie.h_pos, rdu->cookie.bindex, rdu->cookie.flags, ++ rdu->cookie.generation); ++ ++ if (rdu->verify[AufsCtlRduV_SZ] == sizeof(*rdu)) ++ return 0; ++ ++ AuDbg("%u:%u\n", ++ rdu->verify[AufsCtlRduV_SZ], (unsigned int)sizeof(*rdu)); ++ return -EINVAL; ++} ++ ++long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err, e; ++ struct aufs_rdu rdu; ++ void __user *p = (void __user *)arg; ++ ++ err = copy_from_user(&rdu, p, sizeof(rdu)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ err = au_rdu_verify(&rdu); ++ if (unlikely(err)) ++ goto out; ++ ++ switch (cmd) { ++ case AUFS_CTL_RDU: ++ err = au_rdu(file, &rdu); ++ if (unlikely(err)) ++ break; ++ ++ e = copy_to_user(p, &rdu, sizeof(rdu)); ++ if (unlikely(e)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ } ++ break; ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_ino(file, &rdu); ++ break; ++ ++ default: ++ /* err = -ENOTTY; */ ++ err = -EINVAL; ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++#ifdef CONFIG_COMPAT ++long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ long err, e; ++ struct aufs_rdu rdu; ++ void __user *p = compat_ptr(arg); ++ ++ /* todo: get_user()? */ ++ err = copy_from_user(&rdu, p, sizeof(rdu)); ++ if (unlikely(err)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ goto out; ++ } ++ rdu.ent.e = compat_ptr(rdu.ent.ul); ++ err = au_rdu_verify(&rdu); ++ if (unlikely(err)) ++ goto out; ++ ++ switch (cmd) { ++ case AUFS_CTL_RDU: ++ err = au_rdu(file, &rdu); ++ if (unlikely(err)) ++ break; ++ ++ rdu.ent.ul = ptr_to_compat(rdu.ent.e); ++ rdu.tail.ul = ptr_to_compat(rdu.tail.e); ++ e = copy_to_user(p, &rdu, sizeof(rdu)); ++ if (unlikely(e)) { ++ err = -EFAULT; ++ AuTraceErr(err); ++ } ++ break; ++ case AUFS_CTL_RDU_INO: ++ err = au_rdu_ino(file, &rdu); ++ break; ++ ++ default: ++ /* err = -ENOTTY; */ ++ err = -EINVAL; ++ } ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++#endif +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/rwsem.h 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,188 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * simple read-write semaphore wrappers ++ */ ++ ++#ifndef __AUFS_RWSEM_H__ ++#define __AUFS_RWSEM_H__ ++ ++#ifdef __KERNEL__ ++ ++#include "debug.h" ++ ++struct au_rwsem { ++ struct rw_semaphore rwsem; ++#ifdef CONFIG_AUFS_DEBUG ++ /* just for debugging, not almighty counter */ ++ atomic_t rcnt, wcnt; ++#endif ++}; ++ ++#ifdef CONFIG_AUFS_DEBUG ++#define AuDbgCntInit(rw) do { \ ++ atomic_set(&(rw)->rcnt, 0); \ ++ atomic_set(&(rw)->wcnt, 0); \ ++ smp_mb(); /* atomic set */ \ ++} while (0) ++ ++#define AuDbgRcntInc(rw) atomic_inc(&(rw)->rcnt) ++#define AuDbgRcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->rcnt) < 0) ++#define AuDbgWcntInc(rw) atomic_inc(&(rw)->wcnt) ++#define AuDbgWcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->wcnt) < 0) ++#else ++#define AuDbgCntInit(rw) do {} while (0) ++#define AuDbgRcntInc(rw) do {} while (0) ++#define AuDbgRcntDec(rw) do {} while (0) ++#define AuDbgWcntInc(rw) do {} while (0) ++#define AuDbgWcntDec(rw) do {} while (0) ++#endif /* CONFIG_AUFS_DEBUG */ ++ ++/* to debug easier, do not make them inlined functions */ ++#define AuRwMustNoWaiters(rw) AuDebugOn(!list_empty(&(rw)->rwsem.wait_list)) ++/* rwsem_is_locked() is unusable */ ++#define AuRwMustReadLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0) ++#define AuRwMustWriteLock(rw) AuDebugOn(atomic_read(&(rw)->wcnt) <= 0) ++#define AuRwMustAnyLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0 \ ++ && atomic_read(&(rw)->wcnt) <= 0) ++#define AuRwDestroy(rw) AuDebugOn(atomic_read(&(rw)->rcnt) \ ++ || atomic_read(&(rw)->wcnt)) ++ ++#define au_rw_class(rw, key) lockdep_set_class(&(rw)->rwsem, key) ++ ++static inline void au_rw_init(struct au_rwsem *rw) ++{ ++ AuDbgCntInit(rw); ++ init_rwsem(&rw->rwsem); ++} ++ ++static inline void au_rw_init_wlock(struct au_rwsem *rw) ++{ ++ au_rw_init(rw); ++ down_write(&rw->rwsem); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_init_wlock_nested(struct au_rwsem *rw, ++ unsigned int lsc) ++{ ++ au_rw_init(rw); ++ down_write_nested(&rw->rwsem, lsc); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_read_lock(struct au_rwsem *rw) ++{ ++ down_read(&rw->rwsem); ++ AuDbgRcntInc(rw); ++} ++ ++static inline void au_rw_read_lock_nested(struct au_rwsem *rw, unsigned int lsc) ++{ ++ down_read_nested(&rw->rwsem, lsc); ++ AuDbgRcntInc(rw); ++} ++ ++static inline void au_rw_read_unlock(struct au_rwsem *rw) ++{ ++ AuRwMustReadLock(rw); ++ AuDbgRcntDec(rw); ++ up_read(&rw->rwsem); ++} ++ ++static inline void au_rw_dgrade_lock(struct au_rwsem *rw) ++{ ++ AuRwMustWriteLock(rw); ++ AuDbgRcntInc(rw); ++ AuDbgWcntDec(rw); ++ downgrade_write(&rw->rwsem); ++} ++ ++static inline void au_rw_write_lock(struct au_rwsem *rw) ++{ ++ down_write(&rw->rwsem); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_write_lock_nested(struct au_rwsem *rw, ++ unsigned int lsc) ++{ ++ down_write_nested(&rw->rwsem, lsc); ++ AuDbgWcntInc(rw); ++} ++ ++static inline void au_rw_write_unlock(struct au_rwsem *rw) ++{ ++ AuRwMustWriteLock(rw); ++ AuDbgWcntDec(rw); ++ up_write(&rw->rwsem); ++} ++ ++/* why is not _nested version defined */ ++static inline int au_rw_read_trylock(struct au_rwsem *rw) ++{ ++ int ret = down_read_trylock(&rw->rwsem); ++ if (ret) ++ AuDbgRcntInc(rw); ++ return ret; ++} ++ ++static inline int au_rw_write_trylock(struct au_rwsem *rw) ++{ ++ int ret = 
down_write_trylock(&rw->rwsem); ++ if (ret) ++ AuDbgWcntInc(rw); ++ return ret; ++} ++ ++#undef AuDbgCntInit ++#undef AuDbgRcntInc ++#undef AuDbgRcntDec ++#undef AuDbgWcntInc ++#undef AuDbgWcntDec ++ ++#define AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ ++static inline void prefix##_read_lock(param) \ ++{ au_rw_read_lock(rwsem); } \ ++static inline void prefix##_write_lock(param) \ ++{ au_rw_write_lock(rwsem); } \ ++static inline int prefix##_read_trylock(param) \ ++{ return au_rw_read_trylock(rwsem); } \ ++static inline int prefix##_write_trylock(param) \ ++{ return au_rw_write_trylock(rwsem); } ++/* why is not _nested version defined */ ++/* static inline void prefix##_read_trylock_nested(param, lsc) ++{ au_rw_read_trylock_nested(rwsem, lsc)); } ++static inline void prefix##_write_trylock_nestd(param, lsc) ++{ au_rw_write_trylock_nested(rwsem, lsc); } */ ++ ++#define AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) \ ++static inline void prefix##_read_unlock(param) \ ++{ au_rw_read_unlock(rwsem); } \ ++static inline void prefix##_write_unlock(param) \ ++{ au_rw_write_unlock(rwsem); } \ ++static inline void prefix##_downgrade_lock(param) \ ++{ au_rw_dgrade_lock(rwsem); } ++ ++#define AuSimpleRwsemFuncs(prefix, param, rwsem) \ ++ AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ ++ AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_RWSEM_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/sbinfo.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,343 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * superblock private data ++ */ ++ ++#include "aufs.h" ++ ++/* ++ * they are necessary regardless sysfs is disabled. 
++ */ ++void au_si_free(struct kobject *kobj) ++{ ++ struct au_sbinfo *sbinfo; ++ char *locked __maybe_unused; /* debug only */ ++ ++ sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); ++ AuDebugOn(!list_empty(&sbinfo->si_plink.head)); ++ AuDebugOn(atomic_read(&sbinfo->si_nowait.nw_len)); ++ ++ au_rw_write_lock(&sbinfo->si_rwsem); ++ au_br_free(sbinfo); ++ au_rw_write_unlock(&sbinfo->si_rwsem); ++ ++ AuDebugOn(radix_tree_gang_lookup ++ (&sbinfo->au_si_pid.tree, (void **)&locked, ++ /*first_index*/PID_MAX_DEFAULT - 1, ++ /*max_items*/sizeof(locked)/sizeof(*locked))); ++ ++ kfree(sbinfo->si_branch); ++ kfree(sbinfo->au_si_pid.bitmap); ++ mutex_destroy(&sbinfo->si_xib_mtx); ++ AuRwDestroy(&sbinfo->si_rwsem); ++ ++ kfree(sbinfo); ++} ++ ++int au_si_alloc(struct super_block *sb) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ static struct lock_class_key aufs_si; ++ ++ err = -ENOMEM; ++ sbinfo = kzalloc(sizeof(*sbinfo), GFP_NOFS); ++ if (unlikely(!sbinfo)) ++ goto out; ++ ++ BUILD_BUG_ON(sizeof(unsigned long) != ++ sizeof(*sbinfo->au_si_pid.bitmap)); ++ sbinfo->au_si_pid.bitmap = kcalloc(BITS_TO_LONGS(PID_MAX_DEFAULT), ++ sizeof(*sbinfo->au_si_pid.bitmap), ++ GFP_NOFS); ++ if (unlikely(!sbinfo->au_si_pid.bitmap)) ++ goto out_sbinfo; ++ ++ /* will be reallocated separately */ ++ sbinfo->si_branch = kzalloc(sizeof(*sbinfo->si_branch), GFP_NOFS); ++ if (unlikely(!sbinfo->si_branch)) ++ goto out_pidmap; ++ ++ err = sysaufs_si_init(sbinfo); ++ if (unlikely(err)) ++ goto out_br; ++ ++ au_nwt_init(&sbinfo->si_nowait); ++ au_rw_init_wlock(&sbinfo->si_rwsem); ++ au_rw_class(&sbinfo->si_rwsem, &aufs_si); ++ spin_lock_init(&sbinfo->au_si_pid.tree_lock); ++ INIT_RADIX_TREE(&sbinfo->au_si_pid.tree, GFP_ATOMIC | __GFP_NOFAIL); ++ ++ atomic_long_set(&sbinfo->si_ninodes, 0); ++ atomic_long_set(&sbinfo->si_nfiles, 0); ++ ++ sbinfo->si_bend = -1; ++ ++ sbinfo->si_wbr_copyup = AuWbrCopyup_Def; ++ sbinfo->si_wbr_create = AuWbrCreate_Def; ++ sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + sbinfo->si_wbr_copyup; ++ sbinfo->si_wbr_create_ops = au_wbr_create_ops + sbinfo->si_wbr_create; ++ ++ sbinfo->si_mntflags = au_opts_plink(AuOpt_Def); ++ ++ mutex_init(&sbinfo->si_xib_mtx); ++ sbinfo->si_xino_brid = -1; ++ /* leave si_xib_last_pindex and si_xib_next_bit */ ++ ++ sbinfo->si_rdcache = msecs_to_jiffies(AUFS_RDCACHE_DEF * MSEC_PER_SEC); ++ sbinfo->si_rdblk = AUFS_RDBLK_DEF; ++ sbinfo->si_rdhash = AUFS_RDHASH_DEF; ++ sbinfo->si_dirwh = AUFS_DIRWH_DEF; ++ ++ au_spl_init(&sbinfo->si_plink); ++ init_waitqueue_head(&sbinfo->si_plink_wq); ++ spin_lock_init(&sbinfo->si_plink_maint_lock); ++ ++ /* leave other members for sysaufs and si_mnt. 
*/ ++ sbinfo->si_sb = sb; ++ sb->s_fs_info = sbinfo; ++ si_pid_set(sb); ++ au_debug_sbinfo_init(sbinfo); ++ return 0; /* success */ ++ ++out_br: ++ kfree(sbinfo->si_branch); ++out_pidmap: ++ kfree(sbinfo->au_si_pid.bitmap); ++out_sbinfo: ++ kfree(sbinfo); ++out: ++ return err; ++} ++ ++int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr) ++{ ++ int err, sz; ++ struct au_branch **brp; ++ ++ AuRwMustWriteLock(&sbinfo->si_rwsem); ++ ++ err = -ENOMEM; ++ sz = sizeof(*brp) * (sbinfo->si_bend + 1); ++ if (unlikely(!sz)) ++ sz = sizeof(*brp); ++ brp = au_kzrealloc(sbinfo->si_branch, sz, sizeof(*brp) * nbr, GFP_NOFS); ++ if (brp) { ++ sbinfo->si_branch = brp; ++ err = 0; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++unsigned int au_sigen_inc(struct super_block *sb) ++{ ++ unsigned int gen; ++ ++ SiMustWriteLock(sb); ++ ++ gen = ++au_sbi(sb)->si_generation; ++ au_update_digen(sb->s_root); ++ au_update_iigen(sb->s_root->d_inode); ++ sb->s_root->d_inode->i_version++; ++ return gen; ++} ++ ++aufs_bindex_t au_new_br_id(struct super_block *sb) ++{ ++ aufs_bindex_t br_id; ++ int i; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ for (i = 0; i <= AUFS_BRANCH_MAX; i++) { ++ br_id = ++sbinfo->si_last_br_id; ++ AuDebugOn(br_id < 0); ++ if (br_id && au_br_index(sb, br_id) < 0) ++ return br_id; ++ } ++ ++ return -1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* it is ok that new 'nwt' tasks are appended while we are sleeping */ ++int si_read_lock(struct super_block *sb, int flags) ++{ ++ int err; ++ ++ err = 0; ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ ++ si_noflush_read_lock(sb); ++ err = au_plink_maint(sb, flags); ++ if (unlikely(err)) ++ si_read_unlock(sb); ++ ++ return err; ++} ++ ++int si_write_lock(struct super_block *sb, int flags) ++{ ++ int err; ++ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ ++ si_noflush_write_lock(sb); ++ err = au_plink_maint(sb, flags); ++ if (unlikely(err)) ++ si_write_unlock(sb); ++ ++ return err; ++} ++ ++/* dentry and super_block lock. 
call at entry point */ ++int aufs_read_lock(struct dentry *dentry, int flags) ++{ ++ int err; ++ struct super_block *sb; ++ ++ sb = dentry->d_sb; ++ err = si_read_lock(sb, flags); ++ if (unlikely(err)) ++ goto out; ++ ++ if (au_ftest_lock(flags, DW)) ++ di_write_lock_child(dentry); ++ else ++ di_read_lock_child(dentry, flags); ++ ++ if (au_ftest_lock(flags, GEN)) { ++ err = au_digen_test(dentry, au_sigen(sb)); ++ AuDebugOn(!err && au_dbrange_test(dentry)); ++ if (unlikely(err)) ++ aufs_read_unlock(dentry, flags); ++ } ++ ++out: ++ return err; ++} ++ ++void aufs_read_unlock(struct dentry *dentry, int flags) ++{ ++ if (au_ftest_lock(flags, DW)) ++ di_write_unlock(dentry); ++ else ++ di_read_unlock(dentry, flags); ++ si_read_unlock(dentry->d_sb); ++} ++ ++void aufs_write_lock(struct dentry *dentry) ++{ ++ si_write_lock(dentry->d_sb, AuLock_FLUSH | AuLock_NOPLMW); ++ di_write_lock_child(dentry); ++} ++ ++void aufs_write_unlock(struct dentry *dentry) ++{ ++ di_write_unlock(dentry); ++ si_write_unlock(dentry->d_sb); ++} ++ ++int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags) ++{ ++ int err; ++ unsigned int sigen; ++ struct super_block *sb; ++ ++ sb = d1->d_sb; ++ err = si_read_lock(sb, flags); ++ if (unlikely(err)) ++ goto out; ++ ++ di_write_lock2_child(d1, d2, au_ftest_lock(flags, DIR)); ++ ++ if (au_ftest_lock(flags, GEN)) { ++ sigen = au_sigen(sb); ++ err = au_digen_test(d1, sigen); ++ AuDebugOn(!err && au_dbrange_test(d1)); ++ if (!err) { ++ err = au_digen_test(d2, sigen); ++ AuDebugOn(!err && au_dbrange_test(d2)); ++ } ++ if (unlikely(err)) ++ aufs_read_and_write_unlock2(d1, d2); ++ } ++ ++out: ++ return err; ++} ++ ++void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2) ++{ ++ di_write_unlock2(d1, d2); ++ si_read_unlock(d1->d_sb); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int si_pid_test_slow(struct super_block *sb) ++{ ++ void *p; ++ ++ rcu_read_lock(); ++ p = radix_tree_lookup(&au_sbi(sb)->au_si_pid.tree, current->pid); ++ rcu_read_unlock(); ++ ++ return (long)!!p; ++} ++ ++void si_pid_set_slow(struct super_block *sb) ++{ ++ int err; ++ struct au_sbinfo *sbinfo; ++ ++ AuDebugOn(si_pid_test_slow(sb)); ++ ++ sbinfo = au_sbi(sb); ++ err = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); ++ AuDebugOn(err); ++ spin_lock(&sbinfo->au_si_pid.tree_lock); ++ err = radix_tree_insert(&sbinfo->au_si_pid.tree, current->pid, ++ /*any valid ptr*/sb); ++ spin_unlock(&sbinfo->au_si_pid.tree_lock); ++ AuDebugOn(err); ++ radix_tree_preload_end(); ++} ++ ++void si_pid_clr_slow(struct super_block *sb) ++{ ++ void *p; ++ struct au_sbinfo *sbinfo; ++ ++ AuDebugOn(!si_pid_test_slow(sb)); ++ ++ sbinfo = au_sbi(sb); ++ spin_lock(&sbinfo->au_si_pid.tree_lock); ++ p = radix_tree_delete(&sbinfo->au_si_pid.tree, current->pid); ++ spin_unlock(&sbinfo->au_si_pid.tree_lock); ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/spl.h 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,62 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * simple list protected by a spinlock ++ */ ++ ++#ifndef __AUFS_SPL_H__ ++#define __AUFS_SPL_H__ ++ ++#ifdef __KERNEL__ ++ ++struct au_splhead { ++ spinlock_t spin; ++ struct list_head head; ++}; ++ ++static inline void au_spl_init(struct au_splhead *spl) ++{ ++ spin_lock_init(&spl->spin); ++ INIT_LIST_HEAD(&spl->head); ++} ++ ++static inline void au_spl_add(struct list_head *list, struct au_splhead *spl) ++{ ++ spin_lock(&spl->spin); ++ list_add(list, &spl->head); ++ spin_unlock(&spl->spin); ++} ++ ++static inline void au_spl_del(struct list_head *list, struct au_splhead *spl) ++{ ++ spin_lock(&spl->spin); ++ list_del(list); ++ spin_unlock(&spl->spin); ++} ++ ++static inline void au_spl_del_rcu(struct list_head *list, ++ struct au_splhead *spl) ++{ ++ spin_lock(&spl->spin); ++ list_del_rcu(list); ++ spin_unlock(&spl->spin); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_SPL_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/super.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,936 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/*
++ * mount and super_block operations
++ */
++
++#include <linux/mm.h>
++#include <linux/module.h>
++#include <linux/seq_file.h>
++#include <linux/statfs.h>
++#include <linux/vmalloc.h>
++#include <linux/writeback.h>
++#include "aufs.h"
++
++/*
++ * super_operations
++ */
++static struct inode *aufs_alloc_inode(struct super_block *sb __maybe_unused)
++{
++ struct au_icntnr *c;
++
++ c = au_cache_alloc_icntnr();
++ if (c) {
++ au_icntnr_init(c);
++ c->vfs_inode.i_version = 1; /* sigen(sb); */
++ c->iinfo.ii_hinode = NULL;
++ return &c->vfs_inode;
++ }
++ return NULL;
++}
++
++static void aufs_destroy_inode_cb(struct rcu_head *head)
++{
++ struct inode *inode = container_of(head, struct inode, i_rcu);
++
++ INIT_LIST_HEAD(&inode->i_dentry);
++ au_cache_free_icntnr(container_of(inode, struct au_icntnr, vfs_inode));
++}
++
++static void aufs_destroy_inode(struct inode *inode)
++{
++ au_iinfo_fin(inode);
++ call_rcu(&inode->i_rcu, aufs_destroy_inode_cb);
++}
++
++struct inode *au_iget_locked(struct super_block *sb, ino_t ino)
++{
++ struct inode *inode;
++ int err;
++
++ inode = iget_locked(sb, ino);
++ if (unlikely(!inode)) {
++ inode = ERR_PTR(-ENOMEM);
++ goto out;
++ }
++ if (!(inode->i_state & I_NEW))
++ goto out;
++
++ err = au_xigen_new(inode);
++ if (!err)
++ err = au_iinfo_init(inode);
++ if (!err)
++ inode->i_version++;
++ else {
++ iget_failed(inode);
++ inode = ERR_PTR(err);
++ }
++
++out:
++ /* never return NULL */
++ AuDebugOn(!inode);
++ AuTraceErrPtr(inode);
++ return inode;
++}
++
++/* lock free root dinfo */
++static int au_show_brs(struct seq_file *seq, struct super_block *sb)
++{
++ int err;
++ aufs_bindex_t bindex, bend;
++ struct path path;
++ struct au_hdentry *hdp;
++ struct au_branch *br;
++ char *perm;
++
++ err = 0;
++ bend = au_sbend(sb);
++ hdp = au_di(sb->s_root)->di_hdentry;
++ for (bindex = 0; !err && bindex <= bend; bindex++) {
++ br = au_sbr(sb, bindex);
++ path.mnt = br->br_mnt;
++ path.dentry = hdp[bindex].hd_dentry;
++ err = au_seq_path(seq, &path);
++ if (err > 0) {
++ perm = au_optstr_br_perm(br->br_perm);
++ if (perm) {
++ err = seq_printf(seq, "=%s", perm);
++ kfree(perm);
++ if (err == -1)
++ err = -E2BIG;
++ } else
++ err = -ENOMEM;
++ }
++ if (!err && bindex != bend)
++ err = seq_putc(seq, ':');
++ }
++
++ return err;
++}
++
++static void au_show_wbr_create(struct seq_file *m, int v,
++ struct au_sbinfo *sbinfo)
++{
++ const char *pat;
++
++ AuRwMustAnyLock(&sbinfo->si_rwsem);
++
++ seq_printf(m, ",create=");
++ pat = au_optstr_wbr_create(v);
++ switch (v) {
++ case AuWbrCreate_TDP:
++ case AuWbrCreate_RR:
++ case AuWbrCreate_MFS:
++ case AuWbrCreate_PMFS:
++ seq_printf(m, pat);
++ break;
++ case AuWbrCreate_MFSV:
++ seq_printf(m, /*pat*/"mfs:%lu",
++ jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
++ / MSEC_PER_SEC);
++ break;
++ case AuWbrCreate_PMFSV:
++ seq_printf(m, /*pat*/"pmfs:%lu",
++ jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
++ / MSEC_PER_SEC);
++ break;
++ case AuWbrCreate_MFSRR:
++ seq_printf(m, /*pat*/"mfsrr:%llu",
++ sbinfo->si_wbr_mfs.mfsrr_watermark);
++ break;
++ case AuWbrCreate_MFSRRV:
++ seq_printf(m, /*pat*/"mfsrr:%llu:%lu",
++ sbinfo->si_wbr_mfs.mfsrr_watermark,
++ jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
++ / MSEC_PER_SEC);
++ break;
++ }
++}
++
++static int au_show_xino(struct seq_file *seq, struct super_block *sb)
++{
++#ifdef CONFIG_SYSFS
++ return 0;
++#else
++ int err;
++ const int len
= sizeof(AUFS_XINO_FNAME) - 1; ++ aufs_bindex_t bindex, brid; ++ struct qstr *name; ++ struct file *f; ++ struct dentry *d, *h_root; ++ struct au_hdentry *hdp; ++ ++ AuRwMustAnyLock(&sbinfo->si_rwsem); ++ ++ err = 0; ++ f = au_sbi(sb)->si_xib; ++ if (!f) ++ goto out; ++ ++ /* stop printing the default xino path on the first writable branch */ ++ h_root = NULL; ++ brid = au_xino_brid(sb); ++ if (brid >= 0) { ++ bindex = au_br_index(sb, brid); ++ hdp = au_di(sb->s_root)->di_hdentry; ++ h_root = hdp[0 + bindex].hd_dentry; ++ } ++ d = f->f_dentry; ++ name = &d->d_name; ++ /* safe ->d_parent because the file is unlinked */ ++ if (d->d_parent == h_root ++ && name->len == len ++ && !memcmp(name->name, AUFS_XINO_FNAME, len)) ++ goto out; ++ ++ seq_puts(seq, ",xino="); ++ err = au_xino_path(seq, f); ++ ++out: ++ return err; ++#endif ++} ++ ++/* seq_file will re-call me in case of too long string */ ++static int aufs_show_options(struct seq_file *m, struct dentry *dentry) ++{ ++ int err; ++ unsigned int mnt_flags, v; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++#define AuBool(name, str) do { \ ++ v = au_opt_test(mnt_flags, name); \ ++ if (v != au_opt_test(AuOpt_Def, name)) \ ++ seq_printf(m, ",%s" #str, v ? "" : "no"); \ ++} while (0) ++ ++#define AuStr(name, str) do { \ ++ v = mnt_flags & AuOptMask_##name; \ ++ if (v != (AuOpt_Def & AuOptMask_##name)) \ ++ seq_printf(m, "," #str "=%s", au_optstr_##str(v)); \ ++} while (0) ++ ++#define AuUInt(name, str, val) do { \ ++ if (val != AUFS_##name##_DEF) \ ++ seq_printf(m, "," #str "=%u", val); \ ++} while (0) ++ ++ /* lock free root dinfo */ ++ sb = dentry->d_sb; ++ si_noflush_read_lock(sb); ++ sbinfo = au_sbi(sb); ++ seq_printf(m, ",si=%lx", sysaufs_si_id(sbinfo)); ++ ++ mnt_flags = au_mntflags(sb); ++ if (au_opt_test(mnt_flags, XINO)) { ++ err = au_show_xino(m, sb); ++ if (unlikely(err)) ++ goto out; ++ } else ++ seq_puts(m, ",noxino"); ++ ++ AuBool(TRUNC_XINO, trunc_xino); ++ AuStr(UDBA, udba); ++ AuBool(SHWH, shwh); ++ AuBool(PLINK, plink); ++ AuBool(DIO, dio); ++ /* AuBool(DIRPERM1, dirperm1); */ ++ /* AuBool(REFROF, refrof); */ ++ ++ v = sbinfo->si_wbr_create; ++ if (v != AuWbrCreate_Def) ++ au_show_wbr_create(m, v, sbinfo); ++ ++ v = sbinfo->si_wbr_copyup; ++ if (v != AuWbrCopyup_Def) ++ seq_printf(m, ",cpup=%s", au_optstr_wbr_copyup(v)); ++ ++ v = au_opt_test(mnt_flags, ALWAYS_DIROPQ); ++ if (v != au_opt_test(AuOpt_Def, ALWAYS_DIROPQ)) ++ seq_printf(m, ",diropq=%c", v ? 
'a' : 'w'); ++ ++ AuUInt(DIRWH, dirwh, sbinfo->si_dirwh); ++ ++ v = jiffies_to_msecs(sbinfo->si_rdcache) / MSEC_PER_SEC; ++ AuUInt(RDCACHE, rdcache, v); ++ ++ AuUInt(RDBLK, rdblk, sbinfo->si_rdblk); ++ AuUInt(RDHASH, rdhash, sbinfo->si_rdhash); ++ ++ AuBool(SUM, sum); ++ /* AuBool(SUM_W, wsum); */ ++ AuBool(WARN_PERM, warn_perm); ++ AuBool(VERBOSE, verbose); ++ ++out: ++ /* be sure to print "br:" last */ ++ if (!sysaufs_brs) { ++ seq_puts(m, ",br:"); ++ au_show_brs(m, sb); ++ } ++ si_read_unlock(sb); ++ return 0; ++ ++#undef AuBool ++#undef AuStr ++#undef AuUInt ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* sum mode which returns the summation for statfs(2) */ ++ ++static u64 au_add_till_max(u64 a, u64 b) ++{ ++ u64 old; ++ ++ old = a; ++ a += b; ++ if (old < a) ++ return a; ++ return ULLONG_MAX; ++} ++ ++static int au_statfs_sum(struct super_block *sb, struct kstatfs *buf) ++{ ++ int err; ++ u64 blocks, bfree, bavail, files, ffree; ++ aufs_bindex_t bend, bindex, i; ++ unsigned char shared; ++ struct path h_path; ++ struct super_block *h_sb; ++ ++ blocks = 0; ++ bfree = 0; ++ bavail = 0; ++ files = 0; ++ ffree = 0; ++ ++ err = 0; ++ bend = au_sbend(sb); ++ for (bindex = bend; bindex >= 0; bindex--) { ++ h_path.mnt = au_sbr_mnt(sb, bindex); ++ h_sb = h_path.mnt->mnt_sb; ++ shared = 0; ++ for (i = bindex + 1; !shared && i <= bend; i++) ++ shared = (au_sbr_sb(sb, i) == h_sb); ++ if (shared) ++ continue; ++ ++ /* sb->s_root for NFS is unreliable */ ++ h_path.dentry = h_path.mnt->mnt_root; ++ err = vfs_statfs(&h_path, buf); ++ if (unlikely(err)) ++ goto out; ++ ++ blocks = au_add_till_max(blocks, buf->f_blocks); ++ bfree = au_add_till_max(bfree, buf->f_bfree); ++ bavail = au_add_till_max(bavail, buf->f_bavail); ++ files = au_add_till_max(files, buf->f_files); ++ ffree = au_add_till_max(ffree, buf->f_ffree); ++ } ++ ++ buf->f_blocks = blocks; ++ buf->f_bfree = bfree; ++ buf->f_bavail = bavail; ++ buf->f_files = files; ++ buf->f_ffree = ffree; ++ ++out: ++ return err; ++} ++ ++static int aufs_statfs(struct dentry *dentry, struct kstatfs *buf) ++{ ++ int err; ++ struct path h_path; ++ struct super_block *sb; ++ ++ /* lock free root dinfo */ ++ sb = dentry->d_sb; ++ si_noflush_read_lock(sb); ++ if (!au_opt_test(au_mntflags(sb), SUM)) { ++ /* sb->s_root for NFS is unreliable */ ++ h_path.mnt = au_sbr_mnt(sb, 0); ++ h_path.dentry = h_path.mnt->mnt_root; ++ err = vfs_statfs(&h_path, buf); ++ } else ++ err = au_statfs_sum(sb, buf); ++ si_read_unlock(sb); ++ ++ if (!err) { ++ buf->f_type = AUFS_SUPER_MAGIC; ++ buf->f_namelen = AUFS_MAX_NAMELEN; ++ memset(&buf->f_fsid, 0, sizeof(buf->f_fsid)); ++ } ++ /* buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; */ ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* final actions when unmounting a file system */ ++static void aufs_put_super(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ sbinfo = au_sbi(sb); ++ if (!sbinfo) ++ return; ++ ++ dbgaufs_si_fin(sbinfo); ++ kobject_put(&sbinfo->si_kobj); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_array_free(void *array) ++{ ++ if (array) { ++ if (!is_vmalloc_addr(array)) ++ kfree(array); ++ else ++ vfree(array); ++ } ++} ++ ++void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, void *arg) ++{ ++ void *array; ++ unsigned long long n; ++ ++ array = NULL; ++ n = 0; ++ if (!*hint) ++ goto out; ++ ++ if (*hint > ULLONG_MAX 
/ sizeof(array)) { ++ array = ERR_PTR(-EMFILE); ++ pr_err("hint %llu\n", *hint); ++ goto out; ++ } ++ ++ array = kmalloc(sizeof(array) * *hint, GFP_NOFS); ++ if (unlikely(!array)) ++ array = vmalloc(sizeof(array) * *hint); ++ if (unlikely(!array)) { ++ array = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ n = cb(array, *hint, arg); ++ AuDebugOn(n > *hint); ++ ++out: ++ *hint = n; ++ return array; ++} ++ ++static unsigned long long au_iarray_cb(void *a, ++ unsigned long long max __maybe_unused, ++ void *arg) ++{ ++ unsigned long long n; ++ struct inode **p, *inode; ++ struct list_head *head; ++ ++ n = 0; ++ p = a; ++ head = arg; ++ spin_lock(&inode_sb_list_lock); ++ list_for_each_entry(inode, head, i_sb_list) { ++ if (!is_bad_inode(inode) ++ && au_ii(inode)->ii_bstart >= 0) { ++ spin_lock(&inode->i_lock); ++ if (atomic_read(&inode->i_count)) { ++ au_igrab(inode); ++ *p++ = inode; ++ n++; ++ AuDebugOn(n > max); ++ } ++ spin_unlock(&inode->i_lock); ++ } ++ } ++ spin_unlock(&inode_sb_list_lock); ++ ++ return n; ++} ++ ++struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max) ++{ ++ *max = atomic_long_read(&au_sbi(sb)->si_ninodes); ++ return au_array_alloc(max, au_iarray_cb, &sb->s_inodes); ++} ++ ++void au_iarray_free(struct inode **a, unsigned long long max) ++{ ++ unsigned long long ull; ++ ++ for (ull = 0; ull < max; ull++) ++ iput(a[ull]); ++ au_array_free(a); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * refresh dentry and inode at remount time. ++ */ ++/* todo: consolidate with simple_reval_dpath() and au_reval_for_attr() */ ++static int au_do_refresh(struct dentry *dentry, unsigned int dir_flags, ++ struct dentry *parent) ++{ ++ int err; ++ ++ di_write_lock_child(dentry); ++ di_read_lock_parent(parent, AuLock_IR); ++ err = au_refresh_dentry(dentry, parent); ++ if (!err && dir_flags) ++ au_hn_reset(dentry->d_inode, dir_flags); ++ di_read_unlock(parent, AuLock_IR); ++ di_write_unlock(dentry); ++ ++ return err; ++} ++ ++static int au_do_refresh_d(struct dentry *dentry, unsigned int sigen, ++ struct au_sbinfo *sbinfo, ++ const unsigned int dir_flags) ++{ ++ int err; ++ struct dentry *parent; ++ struct inode *inode; ++ ++ err = 0; ++ parent = dget_parent(dentry); ++ if (!au_digen_test(parent, sigen) && au_digen_test(dentry, sigen)) { ++ inode = dentry->d_inode; ++ if (inode) { ++ if (!S_ISDIR(inode->i_mode)) ++ err = au_do_refresh(dentry, /*dir_flags*/0, ++ parent); ++ else { ++ err = au_do_refresh(dentry, dir_flags, parent); ++ if (unlikely(err)) ++ au_fset_si(sbinfo, FAILED_REFRESH_DIR); ++ } ++ } else ++ err = au_do_refresh(dentry, /*dir_flags*/0, parent); ++ AuDbgDentry(dentry); ++ } ++ dput(parent); ++ ++ AuTraceErr(err); ++ return err; ++} ++ ++static int au_refresh_d(struct super_block *sb) ++{ ++ int err, i, j, ndentry, e; ++ unsigned int sigen; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries, *d; ++ struct au_sbinfo *sbinfo; ++ struct dentry *root = sb->s_root; ++ const unsigned int dir_flags = au_hi_flags(root->d_inode, /*isdir*/1); ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_dcsub_pages(&dpages, root, NULL, NULL); ++ if (unlikely(err)) ++ goto out_dpages; ++ ++ sigen = au_sigen(sb); ++ sbinfo = au_sbi(sb); ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) { ++ d = dentries[j]; ++ e = au_do_refresh_d(d, sigen, 
sbinfo, dir_flags); ++ if (unlikely(e && !err)) ++ err = e; ++ /* go on even err */ ++ } ++ } ++ ++out_dpages: ++ au_dpages_free(&dpages); ++out: ++ return err; ++} ++ ++static int au_refresh_i(struct super_block *sb) ++{ ++ int err, e; ++ unsigned int sigen; ++ unsigned long long max, ull; ++ struct inode *inode, **array; ++ ++ array = au_iarray_alloc(sb, &max); ++ err = PTR_ERR(array); ++ if (IS_ERR(array)) ++ goto out; ++ ++ err = 0; ++ sigen = au_sigen(sb); ++ for (ull = 0; ull < max; ull++) { ++ inode = array[ull]; ++ if (au_iigen(inode) != sigen) { ++ ii_write_lock_child(inode); ++ e = au_refresh_hinode_self(inode); ++ ii_write_unlock(inode); ++ if (unlikely(e)) { ++ pr_err("error %d, i%lu\n", e, inode->i_ino); ++ if (!err) ++ err = e; ++ /* go on even if err */ ++ } ++ } ++ } ++ ++ au_iarray_free(array, max); ++ ++out: ++ return err; ++} ++ ++static void au_remount_refresh(struct super_block *sb) ++{ ++ int err, e; ++ unsigned int udba; ++ aufs_bindex_t bindex, bend; ++ struct dentry *root; ++ struct inode *inode; ++ struct au_branch *br; ++ ++ au_sigen_inc(sb); ++ au_fclr_si(au_sbi(sb), FAILED_REFRESH_DIR); ++ ++ root = sb->s_root; ++ DiMustNoWaiters(root); ++ inode = root->d_inode; ++ IiMustNoWaiters(inode); ++ ++ udba = au_opt_udba(sb); ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ err = au_hnotify_reset_br(udba, br, br->br_perm); ++ if (unlikely(err)) ++ AuIOErr("hnotify failed on br %d, %d, ignored\n", ++ bindex, err); ++ /* go on even if err */ ++ } ++ au_hn_reset(inode, au_hi_flags(inode, /*isdir*/1)); ++ ++ di_write_unlock(root); ++ err = au_refresh_d(sb); ++ e = au_refresh_i(sb); ++ if (unlikely(e && !err)) ++ err = e; ++ /* aufs_write_lock() calls ..._child() */ ++ di_write_lock_child(root); ++ ++ au_cpup_attr_all(inode, /*force*/1); ++ ++ if (unlikely(err)) ++ AuIOErr("refresh failed, ignored, %d\n", err); ++} ++ ++/* stop extra interpretation of errno in mount(8), and strange error messages */ ++static int cvt_err(int err) ++{ ++ AuTraceErr(err); ++ ++ switch (err) { ++ case -ENOENT: ++ case -ENOTDIR: ++ case -EEXIST: ++ case -EIO: ++ err = -EINVAL; ++ } ++ return err; ++} ++ ++static int aufs_remount_fs(struct super_block *sb, int *flags, char *data) ++{ ++ int err, do_dx; ++ unsigned int mntflags; ++ struct au_opts opts; ++ struct dentry *root; ++ struct inode *inode; ++ struct au_sbinfo *sbinfo; ++ ++ err = 0; ++ root = sb->s_root; ++ if (!data || !*data) { ++ err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (!err) { ++ di_write_lock_child(root); ++ err = au_opts_verify(sb, *flags, /*pending*/0); ++ aufs_write_unlock(root); ++ } ++ goto out; ++ } ++ ++ err = -ENOMEM; ++ memset(&opts, 0, sizeof(opts)); ++ opts.opt = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!opts.opt)) ++ goto out; ++ opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); ++ opts.flags = AuOpts_REMOUNT; ++ opts.sb_flags = *flags; ++ ++ /* parse it before aufs lock */ ++ err = au_opts_parse(sb, data, &opts); ++ if (unlikely(err)) ++ goto out_opts; ++ ++ sbinfo = au_sbi(sb); ++ inode = root->d_inode; ++ mutex_lock(&inode->i_mutex); ++ err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM); ++ if (unlikely(err)) ++ goto out_mtx; ++ di_write_lock_child(root); ++ ++ /* au_opts_remount() may return an error */ ++ err = au_opts_remount(sb, &opts); ++ au_opts_free(&opts); ++ ++ if (au_ftest_opts(opts.flags, REFRESH)) ++ au_remount_refresh(sb); ++ ++ if (au_ftest_opts(opts.flags, REFRESH_DYAOP)) { ++ mntflags = au_mntflags(sb); ++ do_dx = 
!!au_opt_test(mntflags, DIO); ++ au_dy_arefresh(do_dx); ++ } ++ ++ aufs_write_unlock(root); ++ ++out_mtx: ++ mutex_unlock(&inode->i_mutex); ++out_opts: ++ free_page((unsigned long)opts.opt); ++out: ++ err = cvt_err(err); ++ AuTraceErr(err); ++ return err; ++} ++ ++static const struct super_operations aufs_sop = { ++ .alloc_inode = aufs_alloc_inode, ++ .destroy_inode = aufs_destroy_inode, ++ /* always deleting, no clearing */ ++ .drop_inode = generic_delete_inode, ++ .show_options = aufs_show_options, ++ .statfs = aufs_statfs, ++ .put_super = aufs_put_super, ++ .remount_fs = aufs_remount_fs ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int alloc_root(struct super_block *sb) ++{ ++ int err; ++ struct inode *inode; ++ struct dentry *root; ++ ++ err = -ENOMEM; ++ inode = au_iget_locked(sb, AUFS_ROOT_INO); ++ err = PTR_ERR(inode); ++ if (IS_ERR(inode)) ++ goto out; ++ ++ inode->i_op = &aufs_dir_iop; ++ inode->i_fop = &aufs_dir_fop; ++ inode->i_mode = S_IFDIR; ++ set_nlink(inode, 2); ++ unlock_new_inode(inode); ++ ++ root = d_alloc_root(inode); ++ if (unlikely(!root)) ++ goto out_iput; ++ err = PTR_ERR(root); ++ if (IS_ERR(root)) ++ goto out_iput; ++ ++ err = au_di_init(root); ++ if (!err) { ++ sb->s_root = root; ++ return 0; /* success */ ++ } ++ dput(root); ++ goto out; /* do not iput */ ++ ++out_iput: ++ iget_failed(inode); ++out: ++ return err; ++ ++} ++ ++static int aufs_fill_super(struct super_block *sb, void *raw_data, ++ int silent __maybe_unused) ++{ ++ int err; ++ struct au_opts opts; ++ struct dentry *root; ++ struct inode *inode; ++ char *arg = raw_data; ++ ++ if (unlikely(!arg || !*arg)) { ++ err = -EINVAL; ++ pr_err("no arg\n"); ++ goto out; ++ } ++ ++ err = -ENOMEM; ++ memset(&opts, 0, sizeof(opts)); ++ opts.opt = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!opts.opt)) ++ goto out; ++ opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); ++ opts.sb_flags = sb->s_flags; ++ ++ err = au_si_alloc(sb); ++ if (unlikely(err)) ++ goto out_opts; ++ ++ /* all timestamps always follow the ones on the branch */ ++ sb->s_flags |= MS_NOATIME | MS_NODIRATIME; ++ sb->s_op = &aufs_sop; ++ sb->s_d_op = &aufs_dop; ++ sb->s_magic = AUFS_SUPER_MAGIC; ++ sb->s_maxbytes = 0; ++ au_export_init(sb); ++ ++ err = alloc_root(sb); ++ if (unlikely(err)) { ++ si_write_unlock(sb); ++ goto out_info; ++ } ++ root = sb->s_root; ++ inode = root->d_inode; ++ ++ /* ++ * actually we can parse options regardless aufs lock here. ++ * but at remount time, parsing must be done before aufs lock. ++ * so we follow the same rule. ++ */ ++ ii_write_lock_parent(inode); ++ aufs_write_unlock(root); ++ err = au_opts_parse(sb, arg, &opts); ++ if (unlikely(err)) ++ goto out_root; ++ ++ /* lock vfs_inode first, then aufs. 
*/ ++ mutex_lock(&inode->i_mutex); ++ aufs_write_lock(root); ++ err = au_opts_mount(sb, &opts); ++ au_opts_free(&opts); ++ aufs_write_unlock(root); ++ mutex_unlock(&inode->i_mutex); ++ if (!err) ++ goto out_opts; /* success */ ++ ++out_root: ++ dput(root); ++ sb->s_root = NULL; ++out_info: ++ dbgaufs_si_fin(au_sbi(sb)); ++ kobject_put(&au_sbi(sb)->si_kobj); ++ sb->s_fs_info = NULL; ++out_opts: ++ free_page((unsigned long)opts.opt); ++out: ++ AuTraceErr(err); ++ err = cvt_err(err); ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct dentry *aufs_mount(struct file_system_type *fs_type, int flags, ++ const char *dev_name __maybe_unused, ++ void *raw_data) ++{ ++ struct dentry *root; ++ struct super_block *sb; ++ ++ /* all timestamps always follow the ones on the branch */ ++ /* mnt->mnt_flags |= MNT_NOATIME | MNT_NODIRATIME; */ ++ root = mount_nodev(fs_type, flags, raw_data, aufs_fill_super); ++ if (IS_ERR(root)) ++ goto out; ++ ++ sb = root->d_sb; ++ si_write_lock(sb, !AuLock_FLUSH); ++ sysaufs_brs_add(sb, 0); ++ si_write_unlock(sb); ++ au_sbilist_add(sb); ++ ++out: ++ return root; ++} ++ ++static void aufs_kill_sb(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ sbinfo = au_sbi(sb); ++ if (sbinfo) { ++ au_sbilist_del(sb); ++ aufs_write_lock(sb->s_root); ++ if (sbinfo->si_wbr_create_ops->fin) ++ sbinfo->si_wbr_create_ops->fin(sb); ++ if (au_opt_test(sbinfo->si_mntflags, UDBA_HNOTIFY)) { ++ au_opt_set_udba(sbinfo->si_mntflags, UDBA_NONE); ++ au_remount_refresh(sb); ++ } ++ if (au_opt_test(sbinfo->si_mntflags, PLINK)) ++ au_plink_put(sb, /*verbose*/1); ++ au_xino_clr(sb); ++ sbinfo->si_sb = NULL; ++ aufs_write_unlock(sb->s_root); ++ au_nwt_flush(&sbinfo->si_nowait); ++ } ++ generic_shutdown_super(sb); ++} ++ ++struct file_system_type aufs_fs_type = { ++ .name = AUFS_FSTYPE, ++ .fs_flags = ++ FS_RENAME_DOES_D_MOVE /* a race between rename and others */ ++ | FS_REVAL_DOT, /* for NFS branch and udba */ ++ .mount = aufs_mount, ++ .kill_sb = aufs_kill_sb, ++ /* no need to __module_get() and module_put(). */ ++ .owner = THIS_MODULE, ++}; +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/super.h 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,546 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * super_block operations ++ */ ++ ++#ifndef __AUFS_SUPER_H__ ++#define __AUFS_SUPER_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include "rwsem.h" ++#include "spl.h" ++#include "wkq.h" ++ ++typedef ssize_t (*au_readf_t)(struct file *, char __user *, size_t, loff_t *); ++typedef ssize_t (*au_writef_t)(struct file *, const char __user *, size_t, ++ loff_t *); ++ ++/* policies to select one among multiple writable branches */ ++struct au_wbr_copyup_operations { ++ int (*copyup)(struct dentry *dentry); ++}; ++ ++struct au_wbr_create_operations { ++ int (*create)(struct dentry *dentry, int isdir); ++ int (*init)(struct super_block *sb); ++ int (*fin)(struct super_block *sb); ++}; ++ ++struct au_wbr_mfs { ++ struct mutex mfs_lock; /* protect this structure */ ++ unsigned long mfs_jiffy; ++ unsigned long mfs_expire; ++ aufs_bindex_t mfs_bindex; ++ ++ unsigned long long mfsrr_bytes; ++ unsigned long long mfsrr_watermark; ++}; ++ ++struct au_branch; ++struct au_sbinfo { ++ /* nowait tasks in the system-wide workqueue */ ++ struct au_nowait_tasks si_nowait; ++ ++ /* ++ * tried sb->s_umount, but failed due to the dependecy between i_mutex. ++ * rwsem for au_sbinfo is necessary. ++ */ ++ struct au_rwsem si_rwsem; ++ ++ /* prevent recursive locking in deleting inode */ ++ struct { ++ unsigned long *bitmap; ++ spinlock_t tree_lock; ++ struct radix_tree_root tree; ++ } au_si_pid; ++ ++ /* ++ * dirty approach to protect sb->sb_inodes and ->s_files from remount. ++ */ ++ atomic_long_t si_ninodes, si_nfiles; ++ ++ /* branch management */ ++ unsigned int si_generation; ++ ++ /* see above flags */ ++ unsigned char au_si_status; ++ ++ aufs_bindex_t si_bend; ++ ++ /* dirty trick to keep br_id plus */ ++ unsigned int si_last_br_id : ++ sizeof(aufs_bindex_t) * BITS_PER_BYTE - 1; ++ struct au_branch **si_branch; ++ ++ /* policy to select a writable branch */ ++ unsigned char si_wbr_copyup; ++ unsigned char si_wbr_create; ++ struct au_wbr_copyup_operations *si_wbr_copyup_ops; ++ struct au_wbr_create_operations *si_wbr_create_ops; ++ ++ /* round robin */ ++ atomic_t si_wbr_rr_next; ++ ++ /* most free space */ ++ struct au_wbr_mfs si_wbr_mfs; ++ ++ /* mount flags */ ++ /* include/asm-ia64/siginfo.h defines a macro named si_flags */ ++ unsigned int si_mntflags; ++ ++ /* external inode number (bitmap and translation table) */ ++ au_readf_t si_xread; ++ au_writef_t si_xwrite; ++ struct file *si_xib; ++ struct mutex si_xib_mtx; /* protect xib members */ ++ unsigned long *si_xib_buf; ++ unsigned long si_xib_last_pindex; ++ int si_xib_next_bit; ++ aufs_bindex_t si_xino_brid; ++ /* reserved for future use */ ++ /* unsigned long long si_xib_limit; */ /* Max xib file size */ ++ ++#ifdef CONFIG_AUFS_EXPORT ++ /* i_generation */ ++ struct file *si_xigen; ++ atomic_t si_xigen_next; ++#endif ++ ++ /* vdir parameters */ ++ unsigned long si_rdcache; /* max cache time in jiffies */ ++ unsigned int si_rdblk; /* deblk size */ ++ unsigned int si_rdhash; /* hash size */ ++ ++ /* ++ * If the number of whiteouts are larger than si_dirwh, leave all of ++ * them after au_whtmp_ren to reduce the cost of rmdir(2). ++ * future fsck.aufs or kernel thread will remove them later. ++ * Otherwise, remove all whiteouts and the dir in rmdir(2). 
++ */ ++ unsigned int si_dirwh; ++ ++ /* ++ * rename(2) a directory with all children. ++ */ ++ /* reserved for future use */ ++ /* int si_rendir; */ ++ ++ /* pseudo_link list */ ++ struct au_splhead si_plink; ++ wait_queue_head_t si_plink_wq; ++ spinlock_t si_plink_maint_lock; ++ pid_t si_plink_maint_pid; ++ ++ /* ++ * sysfs and lifetime management. ++ * this is not a small structure and it may be a waste of memory in case ++ * of sysfs is disabled, particulary when many aufs-es are mounted. ++ * but using sysfs is majority. ++ */ ++ struct kobject si_kobj; ++#ifdef CONFIG_DEBUG_FS ++ struct dentry *si_dbgaufs, *si_dbgaufs_xib; ++#ifdef CONFIG_AUFS_EXPORT ++ struct dentry *si_dbgaufs_xigen; ++#endif ++#endif ++ ++#ifdef CONFIG_AUFS_SBILIST ++ struct list_head si_list; ++#endif ++ ++ /* dirty, necessary for unmounting, sysfs and sysrq */ ++ struct super_block *si_sb; ++}; ++ ++/* sbinfo status flags */ ++/* ++ * set true when refresh_dirs() failed at remount time. ++ * then try refreshing dirs at access time again. ++ * if it is false, refreshing dirs at access time is unnecesary ++ */ ++#define AuSi_FAILED_REFRESH_DIR 1 ++static inline unsigned char au_do_ftest_si(struct au_sbinfo *sbi, ++ unsigned int flag) ++{ ++ AuRwMustAnyLock(&sbi->si_rwsem); ++ return sbi->au_si_status & flag; ++} ++#define au_ftest_si(sbinfo, name) au_do_ftest_si(sbinfo, AuSi_##name) ++#define au_fset_si(sbinfo, name) do { \ ++ AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ ++ (sbinfo)->au_si_status |= AuSi_##name; \ ++} while (0) ++#define au_fclr_si(sbinfo, name) do { \ ++ AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ ++ (sbinfo)->au_si_status &= ~AuSi_##name; \ ++} while (0) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policy to select one among writable branches */ ++#define AuWbrCopyup(sbinfo, ...) \ ++ ((sbinfo)->si_wbr_copyup_ops->copyup(__VA_ARGS__)) ++#define AuWbrCreate(sbinfo, ...) 
\ ++ ((sbinfo)->si_wbr_create_ops->create(__VA_ARGS__)) ++ ++/* flags for si_read_lock()/aufs_read_lock()/di_read_lock() */ ++#define AuLock_DW 1 /* write-lock dentry */ ++#define AuLock_IR (1 << 1) /* read-lock inode */ ++#define AuLock_IW (1 << 2) /* write-lock inode */ ++#define AuLock_FLUSH (1 << 3) /* wait for 'nowait' tasks */ ++#define AuLock_DIR (1 << 4) /* target is a dir */ ++#define AuLock_NOPLM (1 << 5) /* return err in plm mode */ ++#define AuLock_NOPLMW (1 << 6) /* wait for plm mode ends */ ++#define AuLock_GEN (1 << 7) /* test digen/iigen */ ++#define au_ftest_lock(flags, name) ((flags) & AuLock_##name) ++#define au_fset_lock(flags, name) \ ++ do { (flags) |= AuLock_##name; } while (0) ++#define au_fclr_lock(flags, name) \ ++ do { (flags) &= ~AuLock_##name; } while (0) ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* super.c */ ++extern struct file_system_type aufs_fs_type; ++struct inode *au_iget_locked(struct super_block *sb, ino_t ino); ++typedef unsigned long long (*au_arraycb_t)(void *array, unsigned long long max, ++ void *arg); ++void au_array_free(void *array); ++void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, void *arg); ++struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max); ++void au_iarray_free(struct inode **a, unsigned long long max); ++ ++/* sbinfo.c */ ++void au_si_free(struct kobject *kobj); ++int au_si_alloc(struct super_block *sb); ++int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr); ++ ++unsigned int au_sigen_inc(struct super_block *sb); ++aufs_bindex_t au_new_br_id(struct super_block *sb); ++ ++int si_read_lock(struct super_block *sb, int flags); ++int si_write_lock(struct super_block *sb, int flags); ++int aufs_read_lock(struct dentry *dentry, int flags); ++void aufs_read_unlock(struct dentry *dentry, int flags); ++void aufs_write_lock(struct dentry *dentry); ++void aufs_write_unlock(struct dentry *dentry); ++int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags); ++void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2); ++ ++int si_pid_test_slow(struct super_block *sb); ++void si_pid_set_slow(struct super_block *sb); ++void si_pid_clr_slow(struct super_block *sb); ++ ++/* wbr_policy.c */ ++extern struct au_wbr_copyup_operations au_wbr_copyup_ops[]; ++extern struct au_wbr_create_operations au_wbr_create_ops[]; ++int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct au_sbinfo *au_sbi(struct super_block *sb) ++{ ++ return sb->s_fs_info; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_EXPORT ++void au_export_init(struct super_block *sb); ++ ++static inline int au_test_nfsd(void) ++{ ++ struct task_struct *tsk = current; ++ ++ return (tsk->flags & PF_KTHREAD) ++ && !strcmp(tsk->comm, "nfsd"); ++} ++ ++void au_xigen_inc(struct inode *inode); ++int au_xigen_new(struct inode *inode); ++int au_xigen_set(struct super_block *sb, struct file *base); ++void au_xigen_clr(struct super_block *sb); ++ ++static inline int au_busy_or_stale(void) ++{ ++ if (!au_test_nfsd()) ++ return -EBUSY; ++ return -ESTALE; ++} ++#else ++AuStubVoid(au_export_init, struct super_block *sb) ++AuStubInt0(au_test_nfsd, void) ++AuStubVoid(au_xigen_inc, struct inode *inode) ++AuStubInt0(au_xigen_new, struct inode *inode) ++AuStubInt0(au_xigen_set, struct super_block *sb, struct file *base) 
++AuStubVoid(au_xigen_clr, struct super_block *sb) ++static inline int au_busy_or_stale(void) ++{ ++ return -EBUSY; ++} ++#endif /* CONFIG_AUFS_EXPORT */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_SBILIST ++/* module.c */ ++extern struct au_splhead au_sbilist; ++ ++static inline void au_sbilist_init(void) ++{ ++ au_spl_init(&au_sbilist); ++} ++ ++static inline void au_sbilist_add(struct super_block *sb) ++{ ++ au_spl_add(&au_sbi(sb)->si_list, &au_sbilist); ++} ++ ++static inline void au_sbilist_del(struct super_block *sb) ++{ ++ au_spl_del(&au_sbi(sb)->si_list, &au_sbilist); ++} ++ ++#ifdef CONFIG_AUFS_MAGIC_SYSRQ ++static inline void au_sbilist_lock(void) ++{ ++ spin_lock(&au_sbilist.spin); ++} ++ ++static inline void au_sbilist_unlock(void) ++{ ++ spin_unlock(&au_sbilist.spin); ++} ++#define AuGFP_SBILIST GFP_ATOMIC ++#else ++AuStubVoid(au_sbilist_lock, void) ++AuStubVoid(au_sbilist_unlock, void) ++#define AuGFP_SBILIST GFP_NOFS ++#endif /* CONFIG_AUFS_MAGIC_SYSRQ */ ++#else ++AuStubVoid(au_sbilist_init, void) ++AuStubVoid(au_sbilist_add, struct super_block*) ++AuStubVoid(au_sbilist_del, struct super_block*) ++AuStubVoid(au_sbilist_lock, void) ++AuStubVoid(au_sbilist_unlock, void) ++#define AuGFP_SBILIST GFP_NOFS ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline void dbgaufs_si_null(struct au_sbinfo *sbinfo) ++{ ++ /* ++ * This function is a dynamic '__init' fucntion actually, ++ * so the tiny check for si_rwsem is unnecessary. ++ */ ++ /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ ++#ifdef CONFIG_DEBUG_FS ++ sbinfo->si_dbgaufs = NULL; ++ sbinfo->si_dbgaufs_xib = NULL; ++#ifdef CONFIG_AUFS_EXPORT ++ sbinfo->si_dbgaufs_xigen = NULL; ++#endif ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline pid_t si_pid_bit(void) ++{ ++ /* the origin of pid is 1, but the bitmap's is 0 */ ++ return current->pid - 1; ++} ++ ++static inline int si_pid_test(struct super_block *sb) ++{ ++ pid_t bit = si_pid_bit(); ++ if (bit < PID_MAX_DEFAULT) ++ return test_bit(bit, au_sbi(sb)->au_si_pid.bitmap); ++ else ++ return si_pid_test_slow(sb); ++} ++ ++static inline void si_pid_set(struct super_block *sb) ++{ ++ pid_t bit = si_pid_bit(); ++ if (bit < PID_MAX_DEFAULT) { ++ AuDebugOn(test_bit(bit, au_sbi(sb)->au_si_pid.bitmap)); ++ set_bit(bit, au_sbi(sb)->au_si_pid.bitmap); ++ /* smp_mb(); */ ++ } else ++ si_pid_set_slow(sb); ++} ++ ++static inline void si_pid_clr(struct super_block *sb) ++{ ++ pid_t bit = si_pid_bit(); ++ if (bit < PID_MAX_DEFAULT) { ++ AuDebugOn(!test_bit(bit, au_sbi(sb)->au_si_pid.bitmap)); ++ clear_bit(bit, au_sbi(sb)->au_si_pid.bitmap); ++ /* smp_mb(); */ ++ } else ++ si_pid_clr_slow(sb); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock superblock. 
mainly for entry point functions */ ++/* ++ * __si_read_lock, __si_write_lock, ++ * __si_read_unlock, __si_write_unlock, __si_downgrade_lock ++ */ ++AuSimpleRwsemFuncs(__si, struct super_block *sb, &au_sbi(sb)->si_rwsem); ++ ++#define SiMustNoWaiters(sb) AuRwMustNoWaiters(&au_sbi(sb)->si_rwsem) ++#define SiMustAnyLock(sb) AuRwMustAnyLock(&au_sbi(sb)->si_rwsem) ++#define SiMustWriteLock(sb) AuRwMustWriteLock(&au_sbi(sb)->si_rwsem) ++ ++static inline void si_noflush_read_lock(struct super_block *sb) ++{ ++ __si_read_lock(sb); ++ si_pid_set(sb); ++} ++ ++static inline int si_noflush_read_trylock(struct super_block *sb) ++{ ++ int locked = __si_read_trylock(sb); ++ if (locked) ++ si_pid_set(sb); ++ return locked; ++} ++ ++static inline void si_noflush_write_lock(struct super_block *sb) ++{ ++ __si_write_lock(sb); ++ si_pid_set(sb); ++} ++ ++static inline int si_noflush_write_trylock(struct super_block *sb) ++{ ++ int locked = __si_write_trylock(sb); ++ if (locked) ++ si_pid_set(sb); ++ return locked; ++} ++ ++#if 0 /* unused */ ++static inline int si_read_trylock(struct super_block *sb, int flags) ++{ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ return si_noflush_read_trylock(sb); ++} ++#endif ++ ++static inline void si_read_unlock(struct super_block *sb) ++{ ++ si_pid_clr(sb); ++ __si_read_unlock(sb); ++} ++ ++#if 0 /* unused */ ++static inline int si_write_trylock(struct super_block *sb, int flags) ++{ ++ if (au_ftest_lock(flags, FLUSH)) ++ au_nwt_flush(&au_sbi(sb)->si_nowait); ++ return si_noflush_write_trylock(sb); ++} ++#endif ++ ++static inline void si_write_unlock(struct super_block *sb) ++{ ++ si_pid_clr(sb); ++ __si_write_unlock(sb); ++} ++ ++#if 0 /* unused */ ++static inline void si_downgrade_lock(struct super_block *sb) ++{ ++ __si_downgrade_lock(sb); ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline aufs_bindex_t au_sbend(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_bend; ++} ++ ++static inline unsigned int au_mntflags(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_mntflags; ++} ++ ++static inline unsigned int au_sigen(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_generation; ++} ++ ++static inline void au_ninodes_inc(struct super_block *sb) ++{ ++ atomic_long_inc(&au_sbi(sb)->si_ninodes); ++} ++ ++static inline void au_ninodes_dec(struct super_block *sb) ++{ ++ AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_ninodes)); ++ atomic_long_dec(&au_sbi(sb)->si_ninodes); ++} ++ ++static inline void au_nfiles_inc(struct super_block *sb) ++{ ++ atomic_long_inc(&au_sbi(sb)->si_nfiles); ++} ++ ++static inline void au_nfiles_dec(struct super_block *sb) ++{ ++ AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_nfiles)); ++ atomic_long_dec(&au_sbi(sb)->si_nfiles); ++} ++ ++static inline struct au_branch *au_sbr(struct super_block *sb, ++ aufs_bindex_t bindex) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_branch[0 + bindex]; ++} ++ ++static inline void au_xino_brid_set(struct super_block *sb, aufs_bindex_t brid) ++{ ++ SiMustWriteLock(sb); ++ au_sbi(sb)->si_xino_brid = brid; ++} ++ ++static inline aufs_bindex_t au_xino_brid(struct super_block *sb) ++{ ++ SiMustAnyLock(sb); ++ return au_sbi(sb)->si_xino_brid; ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_SUPER_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/sysaufs.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,105 @@ ++/* ++ * 
Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sysfs interface and lifetime management ++ * they are necessary regardless sysfs is disabled. ++ */ ++ ++#include ++#include "aufs.h" ++ ++unsigned long sysaufs_si_mask; ++struct kset *sysaufs_kset; ++ ++#define AuSiAttr(_name) { \ ++ .attr = { .name = __stringify(_name), .mode = 0444 }, \ ++ .show = sysaufs_si_##_name, \ ++} ++ ++static struct sysaufs_si_attr sysaufs_si_attr_xi_path = AuSiAttr(xi_path); ++struct attribute *sysaufs_si_attrs[] = { ++ &sysaufs_si_attr_xi_path.attr, ++ NULL, ++}; ++ ++static const struct sysfs_ops au_sbi_ops = { ++ .show = sysaufs_si_show ++}; ++ ++static struct kobj_type au_sbi_ktype = { ++ .release = au_si_free, ++ .sysfs_ops = &au_sbi_ops, ++ .default_attrs = sysaufs_si_attrs ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++int sysaufs_si_init(struct au_sbinfo *sbinfo) ++{ ++ int err; ++ ++ sbinfo->si_kobj.kset = sysaufs_kset; ++ /* cf. sysaufs_name() */ ++ err = kobject_init_and_add ++ (&sbinfo->si_kobj, &au_sbi_ktype, /*&sysaufs_kset->kobj*/NULL, ++ SysaufsSiNamePrefix "%lx", sysaufs_si_id(sbinfo)); ++ ++ dbgaufs_si_null(sbinfo); ++ if (!err) { ++ err = dbgaufs_si_init(sbinfo); ++ if (unlikely(err)) ++ kobject_put(&sbinfo->si_kobj); ++ } ++ return err; ++} ++ ++void sysaufs_fin(void) ++{ ++ dbgaufs_fin(); ++ sysfs_remove_group(&sysaufs_kset->kobj, sysaufs_attr_group); ++ kset_unregister(sysaufs_kset); ++} ++ ++int __init sysaufs_init(void) ++{ ++ int err; ++ ++ do { ++ get_random_bytes(&sysaufs_si_mask, sizeof(sysaufs_si_mask)); ++ } while (!sysaufs_si_mask); ++ ++ err = -EINVAL; ++ sysaufs_kset = kset_create_and_add(AUFS_NAME, NULL, fs_kobj); ++ if (unlikely(!sysaufs_kset)) ++ goto out; ++ err = PTR_ERR(sysaufs_kset); ++ if (IS_ERR(sysaufs_kset)) ++ goto out; ++ err = sysfs_create_group(&sysaufs_kset->kobj, sysaufs_attr_group); ++ if (unlikely(err)) { ++ kset_unregister(sysaufs_kset); ++ goto out; ++ } ++ ++ err = dbgaufs_init(); ++ if (unlikely(err)) ++ sysaufs_fin(); ++out: ++ return err; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/sysaufs.h 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,104 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sysfs interface and mount lifetime management ++ */ ++ ++#ifndef __SYSAUFS_H__ ++#define __SYSAUFS_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include "module.h" ++ ++struct super_block; ++struct au_sbinfo; ++ ++struct sysaufs_si_attr { ++ struct attribute attr; ++ int (*show)(struct seq_file *seq, struct super_block *sb); ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* sysaufs.c */ ++extern unsigned long sysaufs_si_mask; ++extern struct kset *sysaufs_kset; ++extern struct attribute *sysaufs_si_attrs[]; ++int sysaufs_si_init(struct au_sbinfo *sbinfo); ++int __init sysaufs_init(void); ++void sysaufs_fin(void); ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* some people doesn't like to show a pointer in kernel */ ++static inline unsigned long sysaufs_si_id(struct au_sbinfo *sbinfo) ++{ ++ return sysaufs_si_mask ^ (unsigned long)sbinfo; ++} ++ ++#define SysaufsSiNamePrefix "si_" ++#define SysaufsSiNameLen (sizeof(SysaufsSiNamePrefix) + 16) ++static inline void sysaufs_name(struct au_sbinfo *sbinfo, char *name) ++{ ++ snprintf(name, SysaufsSiNameLen, SysaufsSiNamePrefix "%lx", ++ sysaufs_si_id(sbinfo)); ++} ++ ++struct au_branch; ++#ifdef CONFIG_SYSFS ++/* sysfs.c */ ++extern struct attribute_group *sysaufs_attr_group; ++ ++int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb); ++ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, ++ char *buf); ++ ++void sysaufs_br_init(struct au_branch *br); ++void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); ++void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); ++ ++#define sysaufs_brs_init() do {} while (0) ++ ++#else ++#define sysaufs_attr_group NULL ++ ++AuStubInt0(sysaufs_si_xi_path, struct seq_file *seq, struct super_block *sb) ++ ++static inline ++ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, ++ char *buf) ++{ ++ return 0; ++} ++ ++AuStubVoid(sysaufs_br_init, struct au_branch *br) ++AuStubVoid(sysaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex) ++AuStubVoid(sysaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex) ++ ++static inline void sysaufs_brs_init(void) ++{ ++ sysaufs_brs = 0; ++} ++ ++#endif /* CONFIG_SYSFS */ ++ ++#endif /* __KERNEL__ */ ++#endif /* __SYSAUFS_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/sysfs.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,257 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sysfs interface ++ */ ++ ++#include ++#include "aufs.h" ++ ++#ifdef CONFIG_AUFS_FS_MODULE ++/* this entry violates the "one line per file" policy of sysfs */ ++static ssize_t config_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ ssize_t err; ++ static char *conf = ++/* this file is generated at compiling */ ++#include "conf.str" ++ ; ++ ++ err = snprintf(buf, PAGE_SIZE, conf); ++ if (unlikely(err >= PAGE_SIZE)) ++ err = -EFBIG; ++ return err; ++} ++ ++static struct kobj_attribute au_config_attr = __ATTR_RO(config); ++#endif ++ ++static struct attribute *au_attr[] = { ++#ifdef CONFIG_AUFS_FS_MODULE ++ &au_config_attr.attr, ++#endif ++ NULL, /* need to NULL terminate the list of attributes */ ++}; ++ ++static struct attribute_group sysaufs_attr_group_body = { ++ .attrs = au_attr ++}; ++ ++struct attribute_group *sysaufs_attr_group = &sysaufs_attr_group_body; ++ ++/* ---------------------------------------------------------------------- */ ++ ++int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb) ++{ ++ int err; ++ ++ SiMustAnyLock(sb); ++ ++ err = 0; ++ if (au_opt_test(au_mntflags(sb), XINO)) { ++ err = au_xino_path(seq, au_sbi(sb)->si_xib); ++ seq_putc(seq, '\n'); ++ } ++ return err; ++} ++ ++/* ++ * the lifetime of branch is independent from the entry under sysfs. ++ * sysfs handles the lifetime of the entry, and never call ->show() after it is ++ * unlinked. ++ */ ++static int sysaufs_si_br(struct seq_file *seq, struct super_block *sb, ++ aufs_bindex_t bindex) ++{ ++ int err; ++ struct path path; ++ struct dentry *root; ++ struct au_branch *br; ++ char *perm; ++ ++ AuDbg("b%d\n", bindex); ++ ++ err = 0; ++ root = sb->s_root; ++ di_read_lock_parent(root, !AuLock_IR); ++ br = au_sbr(sb, bindex); ++ path.mnt = br->br_mnt; ++ path.dentry = au_h_dptr(root, bindex); ++ au_seq_path(seq, &path); ++ di_read_unlock(root, !AuLock_IR); ++ perm = au_optstr_br_perm(br->br_perm); ++ if (perm) { ++ err = seq_printf(seq, "=%s\n", perm); ++ kfree(perm); ++ if (err == -1) ++ err = -E2BIG; ++ } else ++ err = -ENOMEM; ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static struct seq_file *au_seq(char *p, ssize_t len) ++{ ++ struct seq_file *seq; ++ ++ seq = kzalloc(sizeof(*seq), GFP_NOFS); ++ if (seq) { ++ /* mutex_init(&seq.lock); */ ++ seq->buf = p; ++ seq->size = len; ++ return seq; /* success */ ++ } ++ ++ seq = ERR_PTR(-ENOMEM); ++ return seq; ++} ++ ++#define SysaufsBr_PREFIX "br" ++ ++/* todo: file size may exceed PAGE_SIZE */ ++ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, ++ char *buf) ++{ ++ ssize_t err; ++ long l; ++ aufs_bindex_t bend; ++ struct au_sbinfo *sbinfo; ++ struct super_block *sb; ++ struct seq_file *seq; ++ char *name; ++ struct attribute **cattr; ++ ++ sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); ++ sb = sbinfo->si_sb; ++ ++ /* ++ * prevent a race condition between sysfs and aufs. ++ * for instance, sysfs_file_read() calls sysfs_get_active_two() which ++ * prohibits maintaining the sysfs entries. ++ * hew we acquire read lock after sysfs_get_active_two(). ++ * on the other hand, the remount process may maintain the sysfs/aufs ++ * entries after acquiring write lock. ++ * it can cause a deadlock. 
++ * simply we gave up processing read here. ++ */ ++ err = -EBUSY; ++ if (unlikely(!si_noflush_read_trylock(sb))) ++ goto out; ++ ++ seq = au_seq(buf, PAGE_SIZE); ++ err = PTR_ERR(seq); ++ if (IS_ERR(seq)) ++ goto out_unlock; ++ ++ name = (void *)attr->name; ++ cattr = sysaufs_si_attrs; ++ while (*cattr) { ++ if (!strcmp(name, (*cattr)->name)) { ++ err = container_of(*cattr, struct sysaufs_si_attr, attr) ++ ->show(seq, sb); ++ goto out_seq; ++ } ++ cattr++; ++ } ++ ++ bend = au_sbend(sb); ++ if (!strncmp(name, SysaufsBr_PREFIX, sizeof(SysaufsBr_PREFIX) - 1)) { ++ name += sizeof(SysaufsBr_PREFIX) - 1; ++ err = kstrtol(name, 10, &l); ++ if (!err) { ++ if (l <= bend) ++ err = sysaufs_si_br(seq, sb, (aufs_bindex_t)l); ++ else ++ err = -ENOENT; ++ } ++ goto out_seq; ++ } ++ BUG(); ++ ++out_seq: ++ if (!err) { ++ err = seq->count; ++ /* sysfs limit */ ++ if (unlikely(err == PAGE_SIZE)) ++ err = -EFBIG; ++ } ++ kfree(seq); ++out_unlock: ++ si_read_unlock(sb); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void sysaufs_br_init(struct au_branch *br) ++{ ++ struct attribute *attr = &br->br_attr; ++ ++ sysfs_attr_init(attr); ++ attr->name = br->br_name; ++ attr->mode = S_IRUGO; ++} ++ ++void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ struct au_branch *br; ++ struct kobject *kobj; ++ aufs_bindex_t bend; ++ ++ dbgaufs_brs_del(sb, bindex); ++ ++ if (!sysaufs_brs) ++ return; ++ ++ kobj = &au_sbi(sb)->si_kobj; ++ bend = au_sbend(sb); ++ for (; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ sysfs_remove_file(kobj, &br->br_attr); ++ } ++} ++ ++void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ int err; ++ aufs_bindex_t bend; ++ struct kobject *kobj; ++ struct au_branch *br; ++ ++ dbgaufs_brs_add(sb, bindex); ++ ++ if (!sysaufs_brs) ++ return; ++ ++ kobj = &au_sbi(sb)->si_kobj; ++ bend = au_sbend(sb); ++ for (; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ snprintf(br->br_name, sizeof(br->br_name), SysaufsBr_PREFIX ++ "%d", bindex); ++ err = sysfs_create_file(kobj, &br->br_attr); ++ if (unlikely(err)) ++ pr_warning("failed %s under sysfs(%d)\n", ++ br->br_name, err); ++ } ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/sysrq.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,148 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * magic sysrq hanlder ++ */ ++ ++/* #include */ ++#include ++#include "aufs.h" ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void sysrq_sb(struct super_block *sb) ++{ ++ char *plevel; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ plevel = au_plevel; ++ au_plevel = KERN_WARNING; ++ ++ sbinfo = au_sbi(sb); ++ /* since we define pr_fmt, call printk directly */ ++ printk(KERN_WARNING "si=%lx\n", sysaufs_si_id(sbinfo)); ++ printk(KERN_WARNING AUFS_NAME ": superblock\n"); ++ au_dpri_sb(sb); ++ ++#if 0 ++ printk(KERN_WARNING AUFS_NAME ": root dentry\n"); ++ au_dpri_dentry(sb->s_root); ++ printk(KERN_WARNING AUFS_NAME ": root inode\n"); ++ au_dpri_inode(sb->s_root->d_inode); ++#endif ++ ++#if 0 ++ do { ++ int err, i, j, ndentry; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ ++ err = au_dpages_init(&dpages, GFP_ATOMIC); ++ if (unlikely(err)) ++ break; ++ err = au_dcsub_pages(&dpages, sb->s_root, NULL, NULL); ++ if (!err) ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) ++ au_dpri_dentry(dpage->dentries[j]); ++ } ++ au_dpages_free(&dpages); ++ } while (0); ++#endif ++ ++#if 1 ++ { ++ struct inode *i; ++ printk(KERN_WARNING AUFS_NAME ": isolated inode\n"); ++ spin_lock(&inode_sb_list_lock); ++ list_for_each_entry(i, &sb->s_inodes, i_sb_list) { ++ spin_lock(&i->i_lock); ++ if (1 || list_empty(&i->i_dentry)) ++ au_dpri_inode(i); ++ spin_unlock(&i->i_lock); ++ } ++ spin_unlock(&inode_sb_list_lock); ++ } ++#endif ++ printk(KERN_WARNING AUFS_NAME ": files\n"); ++ lg_global_lock(files_lglock); ++ do_file_list_for_each_entry(sb, file) { ++ umode_t mode; ++ mode = file->f_dentry->d_inode->i_mode; ++ if (!special_file(mode) || au_special_file(mode)) ++ au_dpri_file(file); ++ } while_file_list_for_each_entry; ++ lg_global_unlock(files_lglock); ++ printk(KERN_WARNING AUFS_NAME ": done\n"); ++ ++ au_plevel = plevel; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* module parameter */ ++static char *aufs_sysrq_key = "a"; ++module_param_named(sysrq, aufs_sysrq_key, charp, S_IRUGO); ++MODULE_PARM_DESC(sysrq, "MagicSysRq key for " AUFS_NAME); ++ ++static void au_sysrq(int key __maybe_unused) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ lockdep_off(); ++ au_sbilist_lock(); ++ list_for_each_entry(sbinfo, &au_sbilist.head, si_list) ++ sysrq_sb(sbinfo->si_sb); ++ au_sbilist_unlock(); ++ lockdep_on(); ++} ++ ++static struct sysrq_key_op au_sysrq_op = { ++ .handler = au_sysrq, ++ .help_msg = "Aufs", ++ .action_msg = "Aufs", ++ .enable_mask = SYSRQ_ENABLE_DUMP ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++int __init au_sysrq_init(void) ++{ ++ int err; ++ char key; ++ ++ err = -1; ++ key = *aufs_sysrq_key; ++ if ('a' <= key && key <= 'z') ++ err = register_sysrq_key(key, &au_sysrq_op); ++ if (unlikely(err)) ++ pr_err("err %d, sysrq=%c\n", err, key); ++ return err; ++} ++ ++void au_sysrq_fin(void) ++{ ++ int err; ++ err = unregister_sysrq_key(*aufs_sysrq_key, &au_sysrq_op); ++ if (unlikely(err)) ++ pr_err("err %d (ignored)\n", err); ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/vdir.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 
+1,885 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * virtual or vertical directory ++ */ ++ ++#include "aufs.h" ++ ++static unsigned int calc_size(int nlen) ++{ ++ return ALIGN(sizeof(struct au_vdir_de) + nlen, sizeof(ino_t)); ++} ++ ++static int set_deblk_end(union au_vdir_deblk_p *p, ++ union au_vdir_deblk_p *deblk_end) ++{ ++ if (calc_size(0) <= deblk_end->deblk - p->deblk) { ++ p->de->de_str.len = 0; ++ /* smp_mb(); */ ++ return 0; ++ } ++ return -1; /* error */ ++} ++ ++/* returns true or false */ ++static int is_deblk_end(union au_vdir_deblk_p *p, ++ union au_vdir_deblk_p *deblk_end) ++{ ++ if (calc_size(0) <= deblk_end->deblk - p->deblk) ++ return !p->de->de_str.len; ++ return 1; ++} ++ ++static unsigned char *last_deblk(struct au_vdir *vdir) ++{ ++ return vdir->vd_deblk[vdir->vd_nblk - 1]; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* estimate the apropriate size for name hash table */ ++unsigned int au_rdhash_est(loff_t sz) ++{ ++ unsigned int n; ++ ++ n = UINT_MAX; ++ sz >>= 10; ++ if (sz < n) ++ n = sz; ++ if (sz < AUFS_RDHASH_DEF) ++ n = AUFS_RDHASH_DEF; ++ /* pr_info("n %u\n", n); */ ++ return n; ++} ++ ++/* ++ * the allocated memory has to be freed by ++ * au_nhash_wh_free() or au_nhash_de_free(). 
++ */ ++int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp) ++{ ++ struct hlist_head *head; ++ unsigned int u; ++ ++ head = kmalloc(sizeof(*nhash->nh_head) * num_hash, gfp); ++ if (head) { ++ nhash->nh_num = num_hash; ++ nhash->nh_head = head; ++ for (u = 0; u < num_hash; u++) ++ INIT_HLIST_HEAD(head++); ++ return 0; /* success */ ++ } ++ ++ return -ENOMEM; ++} ++ ++static void nhash_count(struct hlist_head *head) ++{ ++#if 0 ++ unsigned long n; ++ struct hlist_node *pos; ++ ++ n = 0; ++ hlist_for_each(pos, head) ++ n++; ++ pr_info("%lu\n", n); ++#endif ++} ++ ++static void au_nhash_wh_do_free(struct hlist_head *head) ++{ ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos, *node; ++ ++ hlist_for_each_entry_safe(tpos, pos, node, head, wh_hash) { ++ /* hlist_del(pos); */ ++ kfree(tpos); ++ } ++} ++ ++static void au_nhash_de_do_free(struct hlist_head *head) ++{ ++ struct au_vdir_dehstr *tpos; ++ struct hlist_node *pos, *node; ++ ++ hlist_for_each_entry_safe(tpos, pos, node, head, hash) { ++ /* hlist_del(pos); */ ++ au_cache_free_vdir_dehstr(tpos); ++ } ++} ++ ++static void au_nhash_do_free(struct au_nhash *nhash, ++ void (*free)(struct hlist_head *head)) ++{ ++ unsigned int n; ++ struct hlist_head *head; ++ ++ n = nhash->nh_num; ++ if (!n) ++ return; ++ ++ head = nhash->nh_head; ++ while (n-- > 0) { ++ nhash_count(head); ++ free(head++); ++ } ++ kfree(nhash->nh_head); ++} ++ ++void au_nhash_wh_free(struct au_nhash *whlist) ++{ ++ au_nhash_do_free(whlist, au_nhash_wh_do_free); ++} ++ ++static void au_nhash_de_free(struct au_nhash *delist) ++{ ++ au_nhash_do_free(delist, au_nhash_de_do_free); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, ++ int limit) ++{ ++ int num; ++ unsigned int u, n; ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos; ++ ++ num = 0; ++ n = whlist->nh_num; ++ head = whlist->nh_head; ++ for (u = 0; u < n; u++, head++) ++ hlist_for_each_entry(tpos, pos, head, wh_hash) ++ if (tpos->wh_bindex == btgt && ++num > limit) ++ return 1; ++ return 0; ++} ++ ++static struct hlist_head *au_name_hash(struct au_nhash *nhash, ++ unsigned char *name, ++ unsigned int len) ++{ ++ unsigned int v; ++ /* const unsigned int magic_bit = 12; */ ++ ++ AuDebugOn(!nhash->nh_num || !nhash->nh_head); ++ ++ v = 0; ++ while (len--) ++ v += *name++; ++ /* v = hash_long(v, magic_bit); */ ++ v %= nhash->nh_num; ++ return nhash->nh_head + v; ++} ++ ++static int au_nhash_test_name(struct au_vdir_destr *str, const char *name, ++ int nlen) ++{ ++ return str->len == nlen && !memcmp(str->name, name, nlen); ++} ++ ++/* returns found or not */ ++int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen) ++{ ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos; ++ struct au_vdir_destr *str; ++ ++ head = au_name_hash(whlist, name, nlen); ++ hlist_for_each_entry(tpos, pos, head, wh_hash) { ++ str = &tpos->wh_str; ++ AuDbg("%.*s\n", str->len, str->name); ++ if (au_nhash_test_name(str, name, nlen)) ++ return 1; ++ } ++ return 0; ++} ++ ++/* returns found(true) or not */ ++static int test_known(struct au_nhash *delist, char *name, int nlen) ++{ ++ struct hlist_head *head; ++ struct au_vdir_dehstr *tpos; ++ struct hlist_node *pos; ++ struct au_vdir_destr *str; ++ ++ head = au_name_hash(delist, name, nlen); ++ hlist_for_each_entry(tpos, pos, head, hash) { ++ str = tpos->str; ++ AuDbg("%.*s\n", str->len, 
str->name); ++ if (au_nhash_test_name(str, name, nlen)) ++ return 1; ++ } ++ return 0; ++} ++ ++static void au_shwh_init_wh(struct au_vdir_wh *wh, ino_t ino, ++ unsigned char d_type) ++{ ++#ifdef CONFIG_AUFS_SHWH ++ wh->wh_ino = ino; ++ wh->wh_type = d_type; ++#endif ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, ++ unsigned int d_type, aufs_bindex_t bindex, ++ unsigned char shwh) ++{ ++ int err; ++ struct au_vdir_destr *str; ++ struct au_vdir_wh *wh; ++ ++ AuDbg("%.*s\n", nlen, name); ++ AuDebugOn(!whlist->nh_num || !whlist->nh_head); ++ ++ err = -ENOMEM; ++ wh = kmalloc(sizeof(*wh) + nlen, GFP_NOFS); ++ if (unlikely(!wh)) ++ goto out; ++ ++ err = 0; ++ wh->wh_bindex = bindex; ++ if (shwh) ++ au_shwh_init_wh(wh, ino, d_type); ++ str = &wh->wh_str; ++ str->len = nlen; ++ memcpy(str->name, name, nlen); ++ hlist_add_head(&wh->wh_hash, au_name_hash(whlist, name, nlen)); ++ /* smp_mb(); */ ++ ++out: ++ return err; ++} ++ ++static int append_deblk(struct au_vdir *vdir) ++{ ++ int err; ++ unsigned long ul; ++ const unsigned int deblk_sz = vdir->vd_deblk_sz; ++ union au_vdir_deblk_p p, deblk_end; ++ unsigned char **o; ++ ++ err = -ENOMEM; ++ o = krealloc(vdir->vd_deblk, sizeof(*o) * (vdir->vd_nblk + 1), ++ GFP_NOFS); ++ if (unlikely(!o)) ++ goto out; ++ ++ vdir->vd_deblk = o; ++ p.deblk = kmalloc(deblk_sz, GFP_NOFS); ++ if (p.deblk) { ++ ul = vdir->vd_nblk++; ++ vdir->vd_deblk[ul] = p.deblk; ++ vdir->vd_last.ul = ul; ++ vdir->vd_last.p.deblk = p.deblk; ++ deblk_end.deblk = p.deblk + deblk_sz; ++ err = set_deblk_end(&p, &deblk_end); ++ } ++ ++out: ++ return err; ++} ++ ++static int append_de(struct au_vdir *vdir, char *name, int nlen, ino_t ino, ++ unsigned int d_type, struct au_nhash *delist) ++{ ++ int err; ++ unsigned int sz; ++ const unsigned int deblk_sz = vdir->vd_deblk_sz; ++ union au_vdir_deblk_p p, *room, deblk_end; ++ struct au_vdir_dehstr *dehstr; ++ ++ p.deblk = last_deblk(vdir); ++ deblk_end.deblk = p.deblk + deblk_sz; ++ room = &vdir->vd_last.p; ++ AuDebugOn(room->deblk < p.deblk || deblk_end.deblk <= room->deblk ++ || !is_deblk_end(room, &deblk_end)); ++ ++ sz = calc_size(nlen); ++ if (unlikely(sz > deblk_end.deblk - room->deblk)) { ++ err = append_deblk(vdir); ++ if (unlikely(err)) ++ goto out; ++ ++ p.deblk = last_deblk(vdir); ++ deblk_end.deblk = p.deblk + deblk_sz; ++ /* smp_mb(); */ ++ AuDebugOn(room->deblk != p.deblk); ++ } ++ ++ err = -ENOMEM; ++ dehstr = au_cache_alloc_vdir_dehstr(); ++ if (unlikely(!dehstr)) ++ goto out; ++ ++ dehstr->str = &room->de->de_str; ++ hlist_add_head(&dehstr->hash, au_name_hash(delist, name, nlen)); ++ room->de->de_ino = ino; ++ room->de->de_type = d_type; ++ room->de->de_str.len = nlen; ++ memcpy(room->de->de_str.name, name, nlen); ++ ++ err = 0; ++ room->deblk += sz; ++ if (unlikely(set_deblk_end(room, &deblk_end))) ++ err = append_deblk(vdir); ++ /* smp_mb(); */ ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_vdir_free(struct au_vdir *vdir) ++{ ++ unsigned char **deblk; ++ ++ deblk = vdir->vd_deblk; ++ while (vdir->vd_nblk--) ++ kfree(*deblk++); ++ kfree(vdir->vd_deblk); ++ au_cache_free_vdir(vdir); ++} ++ ++static struct au_vdir *alloc_vdir(struct file *file) ++{ ++ struct au_vdir *vdir; ++ struct super_block *sb; ++ int err; ++ ++ sb = file->f_dentry->d_sb; ++ SiMustAnyLock(sb); ++ ++ err = -ENOMEM; ++ vdir = au_cache_alloc_vdir(); ++ if 
(unlikely(!vdir)) ++ goto out; ++ ++ vdir->vd_deblk = kzalloc(sizeof(*vdir->vd_deblk), GFP_NOFS); ++ if (unlikely(!vdir->vd_deblk)) ++ goto out_free; ++ ++ vdir->vd_deblk_sz = au_sbi(sb)->si_rdblk; ++ if (!vdir->vd_deblk_sz) { ++ /* estimate the apropriate size for deblk */ ++ vdir->vd_deblk_sz = au_dir_size(file, /*dentry*/NULL); ++ /* pr_info("vd_deblk_sz %u\n", vdir->vd_deblk_sz); */ ++ } ++ vdir->vd_nblk = 0; ++ vdir->vd_version = 0; ++ vdir->vd_jiffy = 0; ++ err = append_deblk(vdir); ++ if (!err) ++ return vdir; /* success */ ++ ++ kfree(vdir->vd_deblk); ++ ++out_free: ++ au_cache_free_vdir(vdir); ++out: ++ vdir = ERR_PTR(err); ++ return vdir; ++} ++ ++static int reinit_vdir(struct au_vdir *vdir) ++{ ++ int err; ++ union au_vdir_deblk_p p, deblk_end; ++ ++ while (vdir->vd_nblk > 1) { ++ kfree(vdir->vd_deblk[vdir->vd_nblk - 1]); ++ /* vdir->vd_deblk[vdir->vd_nblk - 1] = NULL; */ ++ vdir->vd_nblk--; ++ } ++ p.deblk = vdir->vd_deblk[0]; ++ deblk_end.deblk = p.deblk + vdir->vd_deblk_sz; ++ err = set_deblk_end(&p, &deblk_end); ++ /* keep vd_dblk_sz */ ++ vdir->vd_last.ul = 0; ++ vdir->vd_last.p.deblk = vdir->vd_deblk[0]; ++ vdir->vd_version = 0; ++ vdir->vd_jiffy = 0; ++ /* smp_mb(); */ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AuFillVdir_CALLED 1 ++#define AuFillVdir_WHABLE (1 << 1) ++#define AuFillVdir_SHWH (1 << 2) ++#define au_ftest_fillvdir(flags, name) ((flags) & AuFillVdir_##name) ++#define au_fset_fillvdir(flags, name) \ ++ do { (flags) |= AuFillVdir_##name; } while (0) ++#define au_fclr_fillvdir(flags, name) \ ++ do { (flags) &= ~AuFillVdir_##name; } while (0) ++ ++#ifndef CONFIG_AUFS_SHWH ++#undef AuFillVdir_SHWH ++#define AuFillVdir_SHWH 0 ++#endif ++ ++struct fillvdir_arg { ++ struct file *file; ++ struct au_vdir *vdir; ++ struct au_nhash delist; ++ struct au_nhash whlist; ++ aufs_bindex_t bindex; ++ unsigned int flags; ++ int err; ++}; ++ ++static int fillvdir(void *__arg, const char *__name, int nlen, ++ loff_t offset __maybe_unused, u64 h_ino, ++ unsigned int d_type) ++{ ++ struct fillvdir_arg *arg = __arg; ++ char *name = (void *)__name; ++ struct super_block *sb; ++ ino_t ino; ++ const unsigned char shwh = !!au_ftest_fillvdir(arg->flags, SHWH); ++ ++ arg->err = 0; ++ sb = arg->file->f_dentry->d_sb; ++ au_fset_fillvdir(arg->flags, CALLED); ++ /* smp_mb(); */ ++ if (nlen <= AUFS_WH_PFX_LEN ++ || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { ++ if (test_known(&arg->delist, name, nlen) ++ || au_nhash_test_known_wh(&arg->whlist, name, nlen)) ++ goto out; /* already exists or whiteouted */ ++ ++ sb = arg->file->f_dentry->d_sb; ++ arg->err = au_ino(sb, arg->bindex, h_ino, d_type, &ino); ++ if (!arg->err) { ++ if (unlikely(nlen > AUFS_MAX_NAMELEN)) ++ d_type = DT_UNKNOWN; ++ arg->err = append_de(arg->vdir, name, nlen, ino, ++ d_type, &arg->delist); ++ } ++ } else if (au_ftest_fillvdir(arg->flags, WHABLE)) { ++ name += AUFS_WH_PFX_LEN; ++ nlen -= AUFS_WH_PFX_LEN; ++ if (au_nhash_test_known_wh(&arg->whlist, name, nlen)) ++ goto out; /* already whiteouted */ ++ ++ if (shwh) ++ arg->err = au_wh_ino(sb, arg->bindex, h_ino, d_type, ++ &ino); ++ if (!arg->err) { ++ if (nlen <= AUFS_MAX_NAMELEN + AUFS_WH_PFX_LEN) ++ d_type = DT_UNKNOWN; ++ arg->err = au_nhash_append_wh ++ (&arg->whlist, name, nlen, ino, d_type, ++ arg->bindex, shwh); ++ } ++ } ++ ++out: ++ if (!arg->err) ++ arg->vdir->vd_jiffy = jiffies; ++ /* smp_mb(); */ ++ AuTraceErr(arg->err); ++ return arg->err; ++} ++ ++static int au_handle_shwh(struct 
super_block *sb, struct au_vdir *vdir, ++ struct au_nhash *whlist, struct au_nhash *delist) ++{ ++#ifdef CONFIG_AUFS_SHWH ++ int err; ++ unsigned int nh, u; ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos, *n; ++ char *p, *o; ++ struct au_vdir_destr *destr; ++ ++ AuDebugOn(!au_opt_test(au_mntflags(sb), SHWH)); ++ ++ err = -ENOMEM; ++ o = p = __getname_gfp(GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ ++ err = 0; ++ nh = whlist->nh_num; ++ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); ++ p += AUFS_WH_PFX_LEN; ++ for (u = 0; u < nh; u++) { ++ head = whlist->nh_head + u; ++ hlist_for_each_entry_safe(tpos, pos, n, head, wh_hash) { ++ destr = &tpos->wh_str; ++ memcpy(p, destr->name, destr->len); ++ err = append_de(vdir, o, destr->len + AUFS_WH_PFX_LEN, ++ tpos->wh_ino, tpos->wh_type, delist); ++ if (unlikely(err)) ++ break; ++ } ++ } ++ ++ __putname(o); ++ ++out: ++ AuTraceErr(err); ++ return err; ++#else ++ return 0; ++#endif ++} ++ ++static int au_do_read_vdir(struct fillvdir_arg *arg) ++{ ++ int err; ++ unsigned int rdhash; ++ loff_t offset; ++ aufs_bindex_t bend, bindex, bstart; ++ unsigned char shwh; ++ struct file *hf, *file; ++ struct super_block *sb; ++ ++ file = arg->file; ++ sb = file->f_dentry->d_sb; ++ SiMustAnyLock(sb); ++ ++ rdhash = au_sbi(sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = au_rdhash_est(au_dir_size(file, /*dentry*/NULL)); ++ err = au_nhash_alloc(&arg->delist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ err = au_nhash_alloc(&arg->whlist, rdhash, GFP_NOFS); ++ if (unlikely(err)) ++ goto out_delist; ++ ++ err = 0; ++ arg->flags = 0; ++ shwh = 0; ++ if (au_opt_test(au_mntflags(sb), SHWH)) { ++ shwh = 1; ++ au_fset_fillvdir(arg->flags, SHWH); ++ } ++ bstart = au_fbstart(file); ++ bend = au_fbend_dir(file); ++ for (bindex = bstart; !err && bindex <= bend; bindex++) { ++ hf = au_hf_dir(file, bindex); ++ if (!hf) ++ continue; ++ ++ offset = vfsub_llseek(hf, 0, SEEK_SET); ++ err = offset; ++ if (unlikely(offset)) ++ break; ++ ++ arg->bindex = bindex; ++ au_fclr_fillvdir(arg->flags, WHABLE); ++ if (shwh ++ || (bindex != bend ++ && au_br_whable(au_sbr_perm(sb, bindex)))) ++ au_fset_fillvdir(arg->flags, WHABLE); ++ do { ++ arg->err = 0; ++ au_fclr_fillvdir(arg->flags, CALLED); ++ /* smp_mb(); */ ++ err = vfsub_readdir(hf, fillvdir, arg); ++ if (err >= 0) ++ err = arg->err; ++ } while (!err && au_ftest_fillvdir(arg->flags, CALLED)); ++ } ++ ++ if (!err && shwh) ++ err = au_handle_shwh(sb, arg->vdir, &arg->whlist, &arg->delist); ++ ++ au_nhash_wh_free(&arg->whlist); ++ ++out_delist: ++ au_nhash_de_free(&arg->delist); ++out: ++ return err; ++} ++ ++static int read_vdir(struct file *file, int may_read) ++{ ++ int err; ++ unsigned long expire; ++ unsigned char do_read; ++ struct fillvdir_arg arg; ++ struct inode *inode; ++ struct au_vdir *vdir, *allocated; ++ ++ err = 0; ++ inode = file->f_dentry->d_inode; ++ IMustLock(inode); ++ SiMustAnyLock(inode->i_sb); ++ ++ allocated = NULL; ++ do_read = 0; ++ expire = au_sbi(inode->i_sb)->si_rdcache; ++ vdir = au_ivdir(inode); ++ if (!vdir) { ++ do_read = 1; ++ vdir = alloc_vdir(file); ++ err = PTR_ERR(vdir); ++ if (IS_ERR(vdir)) ++ goto out; ++ err = 0; ++ allocated = vdir; ++ } else if (may_read ++ && (inode->i_version != vdir->vd_version ++ || time_after(jiffies, vdir->vd_jiffy + expire))) { ++ do_read = 1; ++ err = reinit_vdir(vdir); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ if (!do_read) ++ return 0; /* success */ ++ ++ arg.file = file; ++ arg.vdir = vdir; ++ err = au_do_read_vdir(&arg); ++ if 
(!err) { ++ /* file->f_pos = 0; */ ++ vdir->vd_version = inode->i_version; ++ vdir->vd_last.ul = 0; ++ vdir->vd_last.p.deblk = vdir->vd_deblk[0]; ++ if (allocated) ++ au_set_ivdir(inode, allocated); ++ } else if (allocated) ++ au_vdir_free(allocated); ++ ++out: ++ return err; ++} ++ ++static int copy_vdir(struct au_vdir *tgt, struct au_vdir *src) ++{ ++ int err, rerr; ++ unsigned long ul, n; ++ const unsigned int deblk_sz = src->vd_deblk_sz; ++ ++ AuDebugOn(tgt->vd_nblk != 1); ++ ++ err = -ENOMEM; ++ if (tgt->vd_nblk < src->vd_nblk) { ++ unsigned char **p; ++ ++ p = krealloc(tgt->vd_deblk, sizeof(*p) * src->vd_nblk, ++ GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ tgt->vd_deblk = p; ++ } ++ ++ if (tgt->vd_deblk_sz != deblk_sz) { ++ unsigned char *p; ++ ++ tgt->vd_deblk_sz = deblk_sz; ++ p = krealloc(tgt->vd_deblk[0], deblk_sz, GFP_NOFS); ++ if (unlikely(!p)) ++ goto out; ++ tgt->vd_deblk[0] = p; ++ } ++ memcpy(tgt->vd_deblk[0], src->vd_deblk[0], deblk_sz); ++ tgt->vd_version = src->vd_version; ++ tgt->vd_jiffy = src->vd_jiffy; ++ ++ n = src->vd_nblk; ++ for (ul = 1; ul < n; ul++) { ++ tgt->vd_deblk[ul] = kmemdup(src->vd_deblk[ul], deblk_sz, ++ GFP_NOFS); ++ if (unlikely(!tgt->vd_deblk[ul])) ++ goto out; ++ tgt->vd_nblk++; ++ } ++ tgt->vd_nblk = n; ++ tgt->vd_last.ul = tgt->vd_last.ul; ++ tgt->vd_last.p.deblk = tgt->vd_deblk[tgt->vd_last.ul]; ++ tgt->vd_last.p.deblk += src->vd_last.p.deblk ++ - src->vd_deblk[src->vd_last.ul]; ++ /* smp_mb(); */ ++ return 0; /* success */ ++ ++out: ++ rerr = reinit_vdir(tgt); ++ BUG_ON(rerr); ++ return err; ++} ++ ++int au_vdir_init(struct file *file) ++{ ++ int err; ++ struct inode *inode; ++ struct au_vdir *vdir_cache, *allocated; ++ ++ err = read_vdir(file, !file->f_pos); ++ if (unlikely(err)) ++ goto out; ++ ++ allocated = NULL; ++ vdir_cache = au_fvdir_cache(file); ++ if (!vdir_cache) { ++ vdir_cache = alloc_vdir(file); ++ err = PTR_ERR(vdir_cache); ++ if (IS_ERR(vdir_cache)) ++ goto out; ++ allocated = vdir_cache; ++ } else if (!file->f_pos && vdir_cache->vd_version != file->f_version) { ++ err = reinit_vdir(vdir_cache); ++ if (unlikely(err)) ++ goto out; ++ } else ++ return 0; /* success */ ++ ++ inode = file->f_dentry->d_inode; ++ err = copy_vdir(vdir_cache, au_ivdir(inode)); ++ if (!err) { ++ file->f_version = inode->i_version; ++ if (allocated) ++ au_set_fvdir_cache(file, allocated); ++ } else if (allocated) ++ au_vdir_free(allocated); ++ ++out: ++ return err; ++} ++ ++static loff_t calc_offset(struct au_vdir *vdir) ++{ ++ loff_t offset; ++ union au_vdir_deblk_p p; ++ ++ p.deblk = vdir->vd_deblk[vdir->vd_last.ul]; ++ offset = vdir->vd_last.p.deblk - p.deblk; ++ offset += vdir->vd_deblk_sz * vdir->vd_last.ul; ++ return offset; ++} ++ ++/* returns true or false */ ++static int seek_vdir(struct file *file) ++{ ++ int valid; ++ unsigned int deblk_sz; ++ unsigned long ul, n; ++ loff_t offset; ++ union au_vdir_deblk_p p, deblk_end; ++ struct au_vdir *vdir_cache; ++ ++ valid = 1; ++ vdir_cache = au_fvdir_cache(file); ++ offset = calc_offset(vdir_cache); ++ AuDbg("offset %lld\n", offset); ++ if (file->f_pos == offset) ++ goto out; ++ ++ vdir_cache->vd_last.ul = 0; ++ vdir_cache->vd_last.p.deblk = vdir_cache->vd_deblk[0]; ++ if (!file->f_pos) ++ goto out; ++ ++ valid = 0; ++ deblk_sz = vdir_cache->vd_deblk_sz; ++ ul = div64_u64(file->f_pos, deblk_sz); ++ AuDbg("ul %lu\n", ul); ++ if (ul >= vdir_cache->vd_nblk) ++ goto out; ++ ++ n = vdir_cache->vd_nblk; ++ for (; ul < n; ul++) { ++ p.deblk = vdir_cache->vd_deblk[ul]; ++ deblk_end.deblk = p.deblk + 
deblk_sz; ++ offset = ul; ++ offset *= deblk_sz; ++ while (!is_deblk_end(&p, &deblk_end) && offset < file->f_pos) { ++ unsigned int l; ++ ++ l = calc_size(p.de->de_str.len); ++ offset += l; ++ p.deblk += l; ++ } ++ if (!is_deblk_end(&p, &deblk_end)) { ++ valid = 1; ++ vdir_cache->vd_last.ul = ul; ++ vdir_cache->vd_last.p = p; ++ break; ++ } ++ } ++ ++out: ++ /* smp_mb(); */ ++ AuTraceErr(!valid); ++ return valid; ++} ++ ++int au_vdir_fill_de(struct file *file, void *dirent, filldir_t filldir) ++{ ++ int err; ++ unsigned int l, deblk_sz; ++ union au_vdir_deblk_p deblk_end; ++ struct au_vdir *vdir_cache; ++ struct au_vdir_de *de; ++ ++ vdir_cache = au_fvdir_cache(file); ++ if (!seek_vdir(file)) ++ return 0; ++ ++ deblk_sz = vdir_cache->vd_deblk_sz; ++ while (1) { ++ deblk_end.deblk = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; ++ deblk_end.deblk += deblk_sz; ++ while (!is_deblk_end(&vdir_cache->vd_last.p, &deblk_end)) { ++ de = vdir_cache->vd_last.p.de; ++ AuDbg("%.*s, off%lld, i%lu, dt%d\n", ++ de->de_str.len, de->de_str.name, file->f_pos, ++ (unsigned long)de->de_ino, de->de_type); ++ err = filldir(dirent, de->de_str.name, de->de_str.len, ++ file->f_pos, de->de_ino, de->de_type); ++ if (unlikely(err)) { ++ AuTraceErr(err); ++ /* todo: ignore the error caused by udba? */ ++ /* return err; */ ++ return 0; ++ } ++ ++ l = calc_size(de->de_str.len); ++ vdir_cache->vd_last.p.deblk += l; ++ file->f_pos += l; ++ } ++ if (vdir_cache->vd_last.ul < vdir_cache->vd_nblk - 1) { ++ vdir_cache->vd_last.ul++; ++ vdir_cache->vd_last.p.deblk ++ = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; ++ file->f_pos = deblk_sz * vdir_cache->vd_last.ul; ++ continue; ++ } ++ break; ++ } ++ ++ /* smp_mb(); */ ++ return 0; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/vfsub.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,832 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sub-routines for VFS ++ */ ++ ++#include ++#include ++#include ++#include ++#include "aufs.h" ++ ++int vfsub_update_h_iattr(struct path *h_path, int *did) ++{ ++ int err; ++ struct kstat st; ++ struct super_block *h_sb; ++ ++ /* for remote fs, leave work for its getattr or d_revalidate */ ++ /* for bad i_attr fs, handle them in aufs_getattr() */ ++ /* still some fs may acquire i_mutex. 
we need to skip them */ ++ err = 0; ++ if (!did) ++ did = &err; ++ h_sb = h_path->dentry->d_sb; ++ *did = (!au_test_fs_remote(h_sb) && au_test_fs_refresh_iattr(h_sb)); ++ if (*did) ++ err = vfs_getattr(h_path->mnt, h_path->dentry, &st); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct file *vfsub_dentry_open(struct path *path, int flags) ++{ ++ struct file *file; ++ ++ path_get(path); ++ file = dentry_open(path->dentry, path->mnt, ++ flags /* | __FMODE_NONOTIFY */, ++ current_cred()); ++ if (!IS_ERR_OR_NULL(file) ++ && (file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) ++ i_readcount_inc(path->dentry->d_inode); ++ ++ return file; ++} ++ ++struct file *vfsub_filp_open(const char *path, int oflags, int mode) ++{ ++ struct file *file; ++ ++ lockdep_off(); ++ file = filp_open(path, ++ oflags /* | __FMODE_NONOTIFY */, ++ mode); ++ lockdep_on(); ++ if (IS_ERR(file)) ++ goto out; ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ ++out: ++ return file; ++} ++ ++int vfsub_kern_path(const char *name, unsigned int flags, struct path *path) ++{ ++ int err; ++ ++ err = kern_path(name, flags, path); ++ if (!err && path->dentry->d_inode) ++ vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, ++ int len) ++{ ++ struct path path = { ++ .mnt = NULL ++ }; ++ ++ /* VFS checks it too, but by WARN_ON_ONCE() */ ++ IMustLock(parent->d_inode); ++ ++ path.dentry = lookup_one_len(name, parent, len); ++ if (IS_ERR(path.dentry)) ++ goto out; ++ if (path.dentry->d_inode) ++ vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ ++ ++out: ++ AuTraceErrPtr(path.dentry); ++ return path.dentry; ++} ++ ++struct dentry *vfsub_lookup_hash(struct nameidata *nd) ++{ ++ struct path path = { ++ .mnt = nd->path.mnt ++ }; ++ ++ IMustLock(nd->path.dentry->d_inode); ++ ++ path.dentry = lookup_hash(nd); ++ if (IS_ERR(path.dentry)) ++ goto out; ++ if (path.dentry->d_inode) ++ vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ ++ ++out: ++ AuTraceErrPtr(path.dentry); ++ return path.dentry; ++} ++ ++/* ++ * this is "VFS:__lookup_one_len()" which was removed and merged into ++ * VFS:lookup_one_len() by the commit. ++ * 6a96ba5 2011-03-14 kill __lookup_one_len() ++ * this function should always be equivalent to the corresponding part in ++ * VFS:lookup_one_len(). 
++ */ ++int vfsub_name_hash(const char *name, struct qstr *this, int len) ++{ ++ unsigned int c; ++ ++ this->name = name; ++ this->len = len; ++ this->hash = full_name_hash(name, len); ++ if (!len) ++ return -EACCES; ++ ++ while (len--) { ++ c = *(const unsigned char *)name++; ++ if (c == '/' || c == '\0') ++ return -EACCES; ++ } ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2) ++{ ++ struct dentry *d; ++ ++ lockdep_off(); ++ d = lock_rename(d1, d2); ++ lockdep_on(); ++ au_hn_suspend(hdir1); ++ if (hdir1 != hdir2) ++ au_hn_suspend(hdir2); ++ ++ return d; ++} ++ ++void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2) ++{ ++ au_hn_resume(hdir1); ++ if (hdir1 != hdir2) ++ au_hn_resume(hdir2); ++ lockdep_off(); ++ unlock_rename(d1, d2); ++ lockdep_on(); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int vfsub_create(struct inode *dir, struct path *path, int mode) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_mknod(path, d, mode, 0); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ if (au_test_fs_null_nd(dir->i_sb)) ++ err = vfs_create(dir, path->dentry, mode, NULL); ++ else { ++ struct nameidata h_nd; ++ ++ memset(&h_nd, 0, sizeof(h_nd)); ++ h_nd.flags = LOOKUP_CREATE; ++ h_nd.intent.open.flags = O_CREAT ++ | vfsub_fmode_to_uint(FMODE_READ); ++ h_nd.intent.open.create_mode = mode; ++ h_nd.path.dentry = path->dentry->d_parent; ++ h_nd.path.mnt = path->mnt; ++ path_get(&h_nd.path); ++ err = vfs_create(dir, path->dentry, mode, &h_nd); ++ path_put(&h_nd.path); ++ } ++ ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_symlink(struct inode *dir, struct path *path, const char *symname) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_symlink(path, d, symname); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ err = vfs_symlink(dir, path->dentry, symname); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_mknod(path, d, mode, new_encode_dev(dev)); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ err = vfs_mknod(dir, path->dentry, mode, dev); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++static int au_test_nlink(struct inode *inode) ++{ ++ const unsigned int link_max = UINT_MAX >> 1; /* rough margin */ ++ ++ if (!au_test_fs_no_limit_nlink(inode->i_sb) ++ || inode->i_nlink < link_max) ++ return 0; 
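[Illustrative aside, not part of the patch] vfsub_name_hash() above re-implements the check that the old VFS __lookup_one_len() performed: an empty component, or any '/' or NUL byte inside it, is rejected with -EACCES before the name hash is computed. A minimal stand-alone sketch of that validation rule, in user-space C with a hypothetical helper name (the kernel version additionally fills a struct qstr and its hash):

/* Sketch of the vfsub_name_hash() validation rule in plain user-space C. */
#include <stddef.h>
#include <stdio.h>
#include <errno.h>

static int check_component(const char *name, size_t len)
{
	size_t i;

	if (!len)
		return -EACCES;	/* an empty component is never allowed */
	for (i = 0; i < len; i++)
		if (name[i] == '/' || name[i] == '\0')
			return -EACCES;	/* no separators or NULs inside a component */
	return 0;
}

int main(void)
{
	printf("%d\n", check_component("file", 4));	/* 0 */
	printf("%d\n", check_component("a/b", 3));	/* -EACCES */
	printf("%d\n", check_component("", 0));		/* -EACCES */
	return 0;
}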
++ return -EMLINK; ++} ++ ++int vfsub_link(struct dentry *src_dentry, struct inode *dir, struct path *path) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ err = au_test_nlink(src_dentry->d_inode); ++ if (unlikely(err)) ++ return err; ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_link(src_dentry, path, d); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_link(src_dentry, dir, path->dentry); ++ lockdep_on(); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ /* fuse has different memory inode for the same inumber */ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ tmp.dentry = src_dentry; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_rename(struct inode *src_dir, struct dentry *src_dentry, ++ struct inode *dir, struct path *path) ++{ ++ int err; ++ struct path tmp = { ++ .mnt = path->mnt ++ }; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ IMustLock(src_dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ tmp.dentry = src_dentry->d_parent; ++ err = security_path_rename(&tmp, src_dentry, path, d); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_rename(src_dir, src_dentry, dir, path->dentry); ++ lockdep_on(); ++ if (!err) { ++ int did; ++ ++ tmp.dentry = d->d_parent; ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = src_dentry; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ tmp.dentry = src_dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_mkdir(struct inode *dir, struct path *path, int mode) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_mkdir(path, d, mode); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ err = vfs_mkdir(dir, path->dentry, mode); ++ if (!err) { ++ struct path tmp = *path; ++ int did; ++ ++ vfsub_update_h_iattr(&tmp, &did); ++ if (did) { ++ tmp.dentry = path->dentry->d_parent; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); ++ } ++ /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++int vfsub_rmdir(struct inode *dir, struct path *path) ++{ ++ int err; ++ struct dentry *d; ++ ++ IMustLock(dir); ++ ++ d = path->dentry; ++ path->dentry = d->d_parent; ++ err = security_path_rmdir(path, d); ++ path->dentry = d; ++ if (unlikely(err)) ++ goto out; ++ ++ lockdep_off(); ++ err = vfs_rmdir(dir, path->dentry); ++ lockdep_on(); ++ if (!err) { ++ struct path tmp = { ++ .dentry = path->dentry->d_parent, ++ .mnt = path->mnt ++ }; ++ ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ ++ } ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* todo: support mmap_sem? */ ++ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ ++ lockdep_off(); ++ err = vfs_read(file, ubuf, count, ppos); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++/* todo: kernel_read()? 
*/ ++ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ union { ++ void *k; ++ char __user *u; ++ } buf; ++ ++ buf.k = kbuf; ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = vfsub_read_u(file, buf.u, count, ppos); ++ set_fs(oldfs); ++ return err; ++} ++ ++ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, ++ loff_t *ppos) ++{ ++ ssize_t err; ++ ++ lockdep_off(); ++ err = vfs_write(file, ubuf, count, ppos); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, loff_t *ppos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ union { ++ void *k; ++ const char __user *u; ++ } buf; ++ ++ buf.k = kbuf; ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = vfsub_write_u(file, buf.u, count, ppos); ++ set_fs(oldfs); ++ return err; ++} ++ ++int vfsub_flush(struct file *file, fl_owner_t id) ++{ ++ int err; ++ ++ err = 0; ++ if (file->f_op && file->f_op->flush) { ++ if (!au_test_nfs(file->f_dentry->d_sb)) ++ err = file->f_op->flush(file, id); ++ else { ++ lockdep_off(); ++ err = file->f_op->flush(file, id); ++ lockdep_on(); ++ } ++ if (!err) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); ++ /*ignore*/ ++ } ++ return err; ++} ++ ++int vfsub_readdir(struct file *file, filldir_t filldir, void *arg) ++{ ++ int err; ++ ++ lockdep_off(); ++ err = vfs_readdir(file, filldir, arg); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++long vfsub_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) ++{ ++ long err; ++ ++ lockdep_off(); ++ err = do_splice_to(in, ppos, pipe, len, flags); ++ lockdep_on(); ++ file_accessed(in); ++ if (err >= 0) ++ vfsub_update_h_iattr(&in->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags) ++{ ++ long err; ++ ++ lockdep_off(); ++ err = do_splice_from(pipe, out, ppos, len, flags); ++ lockdep_on(); ++ if (err >= 0) ++ vfsub_update_h_iattr(&out->f_path, /*did*/NULL); /*ignore*/ ++ return err; ++} ++ ++int vfsub_fsync(struct file *file, struct path *path, int datasync) ++{ ++ int err; ++ ++ /* file can be NULL */ ++ lockdep_off(); ++ err = vfs_fsync(file, datasync); ++ lockdep_on(); ++ if (!err) { ++ if (!path) { ++ AuDebugOn(!file); ++ path = &file->f_path; ++ } ++ vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ ++ } ++ return err; ++} ++ ++/* cf. 
open.c:do_sys_truncate() and do_sys_ftruncate() */ ++int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, ++ struct file *h_file) ++{ ++ int err; ++ struct inode *h_inode; ++ ++ h_inode = h_path->dentry->d_inode; ++ if (!h_file) { ++ err = mnt_want_write(h_path->mnt); ++ if (err) ++ goto out; ++ err = inode_permission(h_inode, MAY_WRITE); ++ if (err) ++ goto out_mnt; ++ err = get_write_access(h_inode); ++ if (err) ++ goto out_mnt; ++ err = break_lease(h_inode, O_WRONLY); ++ if (err) ++ goto out_inode; ++ } ++ ++ err = locks_verify_truncate(h_inode, h_file, length); ++ if (!err) ++ err = security_path_truncate(h_path); ++ if (!err) { ++ lockdep_off(); ++ err = do_truncate(h_path->dentry, length, attr, h_file); ++ lockdep_on(); ++ } ++ ++out_inode: ++ if (!h_file) ++ put_write_access(h_inode); ++out_mnt: ++ if (!h_file) ++ mnt_drop_write(h_path->mnt); ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_vfsub_mkdir_args { ++ int *errp; ++ struct inode *dir; ++ struct path *path; ++ int mode; ++}; ++ ++static void au_call_vfsub_mkdir(void *args) ++{ ++ struct au_vfsub_mkdir_args *a = args; ++ *a->errp = vfsub_mkdir(a->dir, a->path, a->mode); ++} ++ ++int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode) ++{ ++ int err, do_sio, wkq_err; ++ ++ do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); ++ if (!do_sio) ++ err = vfsub_mkdir(dir, path, mode); ++ else { ++ struct au_vfsub_mkdir_args args = { ++ .errp = &err, ++ .dir = dir, ++ .path = path, ++ .mode = mode ++ }; ++ wkq_err = au_wkq_wait(au_call_vfsub_mkdir, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++ ++struct au_vfsub_rmdir_args { ++ int *errp; ++ struct inode *dir; ++ struct path *path; ++}; ++ ++static void au_call_vfsub_rmdir(void *args) ++{ ++ struct au_vfsub_rmdir_args *a = args; ++ *a->errp = vfsub_rmdir(a->dir, a->path); ++} ++ ++int vfsub_sio_rmdir(struct inode *dir, struct path *path) ++{ ++ int err, do_sio, wkq_err; ++ ++ do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); ++ if (!do_sio) ++ err = vfsub_rmdir(dir, path); ++ else { ++ struct au_vfsub_rmdir_args args = { ++ .errp = &err, ++ .dir = dir, ++ .path = path ++ }; ++ wkq_err = au_wkq_wait(au_call_vfsub_rmdir, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct notify_change_args { ++ int *errp; ++ struct path *path; ++ struct iattr *ia; ++}; ++ ++static void call_notify_change(void *args) ++{ ++ struct notify_change_args *a = args; ++ struct inode *h_inode; ++ ++ h_inode = a->path->dentry->d_inode; ++ IMustLock(h_inode); ++ ++ *a->errp = -EPERM; ++ if (!IS_IMMUTABLE(h_inode) && !IS_APPEND(h_inode)) { ++ *a->errp = notify_change(a->path->dentry, a->ia); ++ if (!*a->errp) ++ vfsub_update_h_iattr(a->path, /*did*/NULL); /*ignore*/ ++ } ++ AuTraceErr(*a->errp); ++} ++ ++int vfsub_notify_change(struct path *path, struct iattr *ia) ++{ ++ int err; ++ struct notify_change_args args = { ++ .errp = &err, ++ .path = path, ++ .ia = ia ++ }; ++ ++ call_notify_change(&args); ++ ++ return err; ++} ++ ++int vfsub_sio_notify_change(struct path *path, struct iattr *ia) ++{ ++ int err, wkq_err; ++ struct notify_change_args args = { ++ .errp = &err, ++ .path = path, ++ .ia = ia ++ }; ++ ++ wkq_err = au_wkq_wait(call_notify_change, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ ++ return err; ++} ++ ++/* 
---------------------------------------------------------------------- */ ++ ++struct unlink_args { ++ int *errp; ++ struct inode *dir; ++ struct path *path; ++}; ++ ++static void call_unlink(void *args) ++{ ++ struct unlink_args *a = args; ++ struct dentry *d = a->path->dentry; ++ struct inode *h_inode; ++ const int stop_sillyrename = (au_test_nfs(d->d_sb) ++ && d->d_count == 1); ++ ++ IMustLock(a->dir); ++ ++ a->path->dentry = d->d_parent; ++ *a->errp = security_path_unlink(a->path, d); ++ a->path->dentry = d; ++ if (unlikely(*a->errp)) ++ return; ++ ++ if (!stop_sillyrename) ++ dget(d); ++ h_inode = d->d_inode; ++ if (h_inode) ++ ihold(h_inode); ++ ++ lockdep_off(); ++ *a->errp = vfs_unlink(a->dir, d); ++ lockdep_on(); ++ if (!*a->errp) { ++ struct path tmp = { ++ .dentry = d->d_parent, ++ .mnt = a->path->mnt ++ }; ++ vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ ++ } ++ ++ if (!stop_sillyrename) ++ dput(d); ++ if (h_inode) ++ iput(h_inode); ++ ++ AuTraceErr(*a->errp); ++} ++ ++/* ++ * @dir: must be locked. ++ * @dentry: target dentry. ++ */ ++int vfsub_unlink(struct inode *dir, struct path *path, int force) ++{ ++ int err; ++ struct unlink_args args = { ++ .errp = &err, ++ .dir = dir, ++ .path = path ++ }; ++ ++ if (!force) ++ call_unlink(&args); ++ else { ++ int wkq_err; ++ ++ wkq_err = au_wkq_wait(call_unlink, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/vfsub.h 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,240 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * sub-routines for VFS ++ */ ++ ++#ifndef __AUFS_VFSUB_H__ ++#define __AUFS_VFSUB_H__ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include "debug.h" ++ ++/* copied from linux/fs/internal.h */ ++/* todo: BAD approach!! */ ++DECLARE_BRLOCK(vfsmount_lock); ++extern void file_sb_list_del(struct file *f); ++extern spinlock_t inode_sb_list_lock; ++ ++/* copied from linux/fs/file_table.c */ ++DECLARE_LGLOCK(files_lglock); ++#ifdef CONFIG_SMP ++/* ++ * These macros iterate all files on all CPUs for a given superblock. ++ * files_lglock must be held globally. 
++ */ ++#define do_file_list_for_each_entry(__sb, __file) \ ++{ \ ++ int i; \ ++ for_each_possible_cpu(i) { \ ++ struct list_head *list; \ ++ list = per_cpu_ptr((__sb)->s_files, i); \ ++ list_for_each_entry((__file), list, f_u.fu_list) ++ ++#define while_file_list_for_each_entry \ ++ } \ ++} ++ ++#else ++ ++#define do_file_list_for_each_entry(__sb, __file) \ ++{ \ ++ struct list_head *list; \ ++ list = &(sb)->s_files; \ ++ list_for_each_entry((__file), list, f_u.fu_list) ++ ++#define while_file_list_for_each_entry \ ++} ++#endif ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* lock subclass for lower inode */ ++/* default MAX_LOCKDEP_SUBCLASSES(8) is not enough */ ++/* reduce? gave up. */ ++enum { ++ AuLsc_I_Begin = I_MUTEX_QUOTA, /* 4 */ ++ AuLsc_I_PARENT, /* lower inode, parent first */ ++ AuLsc_I_PARENT2, /* copyup dirs */ ++ AuLsc_I_PARENT3, /* copyup wh */ ++ AuLsc_I_CHILD, ++ AuLsc_I_CHILD2, ++ AuLsc_I_End ++}; ++ ++/* to debug easier, do not make them inlined functions */ ++#define MtxMustLock(mtx) AuDebugOn(!mutex_is_locked(mtx)) ++#define IMustLock(i) MtxMustLock(&(i)->i_mutex) ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline void vfsub_drop_nlink(struct inode *inode) ++{ ++ AuDebugOn(!inode->i_nlink); ++ drop_nlink(inode); ++} ++ ++static inline void vfsub_dead_dir(struct inode *inode) ++{ ++ AuDebugOn(!S_ISDIR(inode->i_mode)); ++ inode->i_flags |= S_DEAD; ++ clear_nlink(inode); ++} ++ ++static inline void vfsub_set_nlink(struct inode *inode, unsigned int nlink) ++{ ++ if (nlink) ++ set_nlink(inode, nlink); ++ else ++ clear_nlink(inode); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int vfsub_update_h_iattr(struct path *h_path, int *did); ++struct file *vfsub_dentry_open(struct path *path, int flags); ++struct file *vfsub_filp_open(const char *path, int oflags, int mode); ++int vfsub_kern_path(const char *name, unsigned int flags, struct path *path); ++struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, ++ int len); ++struct dentry *vfsub_lookup_hash(struct nameidata *nd); ++int vfsub_name_hash(const char *name, struct qstr *this, int len); ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_hinode; ++struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2); ++void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, ++ struct dentry *d2, struct au_hinode *hdir2); ++ ++int vfsub_create(struct inode *dir, struct path *path, int mode); ++int vfsub_symlink(struct inode *dir, struct path *path, ++ const char *symname); ++int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev); ++int vfsub_link(struct dentry *src_dentry, struct inode *dir, ++ struct path *path); ++int vfsub_rename(struct inode *src_hdir, struct dentry *src_dentry, ++ struct inode *hdir, struct path *path); ++int vfsub_mkdir(struct inode *dir, struct path *path, int mode); ++int vfsub_rmdir(struct inode *dir, struct path *path); ++ ++/* ---------------------------------------------------------------------- */ ++ ++ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, ++ loff_t *ppos); ++ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, ++ loff_t *ppos); ++ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, ++ loff_t *ppos); ++ssize_t vfsub_write_k(struct 
file *file, void *kbuf, size_t count, ++ loff_t *ppos); ++int vfsub_flush(struct file *file, fl_owner_t id); ++int vfsub_readdir(struct file *file, filldir_t filldir, void *arg); ++ ++static inline unsigned int vfsub_file_flags(struct file *file) ++{ ++ unsigned int flags; ++ ++ spin_lock(&file->f_lock); ++ flags = file->f_flags; ++ spin_unlock(&file->f_lock); ++ ++ return flags; ++} ++ ++static inline void vfsub_file_accessed(struct file *h_file) ++{ ++ file_accessed(h_file); ++ vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); /*ignore*/ ++} ++ ++static inline void vfsub_touch_atime(struct vfsmount *h_mnt, ++ struct dentry *h_dentry) ++{ ++ struct path h_path = { ++ .dentry = h_dentry, ++ .mnt = h_mnt ++ }; ++ touch_atime(h_mnt, h_dentry); ++ vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ ++} ++ ++long vfsub_splice_to(struct file *in, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags); ++long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, ++ loff_t *ppos, size_t len, unsigned int flags); ++int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, ++ struct file *h_file); ++int vfsub_fsync(struct file *file, struct path *path, int datasync); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline loff_t vfsub_llseek(struct file *file, loff_t offset, int origin) ++{ ++ loff_t err; ++ ++ lockdep_off(); ++ err = vfs_llseek(file, offset, origin); ++ lockdep_on(); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* dirty workaround for strict type of fmode_t */ ++union vfsub_fmu { ++ fmode_t fm; ++ unsigned int ui; ++}; ++ ++static inline unsigned int vfsub_fmode_to_uint(fmode_t fm) ++{ ++ union vfsub_fmu u = { ++ .fm = fm ++ }; ++ ++ BUILD_BUG_ON(sizeof(u.fm) != sizeof(u.ui)); ++ ++ return u.ui; ++} ++ ++static inline fmode_t vfsub_uint_to_fmode(unsigned int ui) ++{ ++ union vfsub_fmu u = { ++ .ui = ui ++ }; ++ ++ return u.fm; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode); ++int vfsub_sio_rmdir(struct inode *dir, struct path *path); ++int vfsub_sio_notify_change(struct path *path, struct iattr *ia); ++int vfsub_notify_change(struct path *path, struct iattr *ia); ++int vfsub_unlink(struct inode *dir, struct path *path, int force); ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_VFSUB_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/wbr_policy.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,700 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * policies for selecting one among multiple writable branches ++ */ ++ ++#include ++#include "aufs.h" ++ ++/* subset of cpup_attr() */ ++static noinline_for_stack ++int au_cpdown_attr(struct path *h_path, struct dentry *h_src) ++{ ++ int err, sbits; ++ struct iattr ia; ++ struct inode *h_isrc; ++ ++ h_isrc = h_src->d_inode; ++ ia.ia_valid = ATTR_FORCE | ATTR_MODE | ATTR_UID | ATTR_GID; ++ ia.ia_mode = h_isrc->i_mode; ++ ia.ia_uid = h_isrc->i_uid; ++ ia.ia_gid = h_isrc->i_gid; ++ sbits = !!(ia.ia_mode & (S_ISUID | S_ISGID)); ++ au_cpup_attr_flags(h_path->dentry->d_inode, h_isrc); ++ err = vfsub_sio_notify_change(h_path, &ia); ++ ++ /* is this nfs only? */ ++ if (!err && sbits && au_test_nfs(h_path->dentry->d_sb)) { ++ ia.ia_valid = ATTR_FORCE | ATTR_MODE; ++ ia.ia_mode = h_isrc->i_mode; ++ err = vfsub_sio_notify_change(h_path, &ia); ++ } ++ ++ return err; ++} ++ ++#define AuCpdown_PARENT_OPQ 1 ++#define AuCpdown_WHED (1 << 1) ++#define AuCpdown_MADE_DIR (1 << 2) ++#define AuCpdown_DIROPQ (1 << 3) ++#define au_ftest_cpdown(flags, name) ((flags) & AuCpdown_##name) ++#define au_fset_cpdown(flags, name) \ ++ do { (flags) |= AuCpdown_##name; } while (0) ++#define au_fclr_cpdown(flags, name) \ ++ do { (flags) &= ~AuCpdown_##name; } while (0) ++ ++struct au_cpdown_dir_args { ++ struct dentry *parent; ++ unsigned int flags; ++}; ++ ++static int au_cpdown_dir_opq(struct dentry *dentry, aufs_bindex_t bdst, ++ struct au_cpdown_dir_args *a) ++{ ++ int err; ++ struct dentry *opq_dentry; ++ ++ opq_dentry = au_diropq_create(dentry, bdst); ++ err = PTR_ERR(opq_dentry); ++ if (IS_ERR(opq_dentry)) ++ goto out; ++ dput(opq_dentry); ++ au_fset_cpdown(a->flags, DIROPQ); ++ ++out: ++ return err; ++} ++ ++static int au_cpdown_dir_wh(struct dentry *dentry, struct dentry *h_parent, ++ struct inode *dir, aufs_bindex_t bdst) ++{ ++ int err; ++ struct path h_path; ++ struct au_branch *br; ++ ++ br = au_sbr(dentry->d_sb, bdst); ++ h_path.dentry = au_wh_lkup(h_parent, &dentry->d_name, br); ++ err = PTR_ERR(h_path.dentry); ++ if (IS_ERR(h_path.dentry)) ++ goto out; ++ ++ err = 0; ++ if (h_path.dentry->d_inode) { ++ h_path.mnt = br->br_mnt; ++ err = au_wh_unlink_dentry(au_h_iptr(dir, bdst), &h_path, ++ dentry); ++ } ++ dput(h_path.dentry); ++ ++out: ++ return err; ++} ++ ++static int au_cpdown_dir(struct dentry *dentry, aufs_bindex_t bdst, ++ struct dentry *h_parent, void *arg) ++{ ++ int err, rerr; ++ aufs_bindex_t bopq, bstart; ++ struct path h_path; ++ struct dentry *parent; ++ struct inode *h_dir, *h_inode, *inode, *dir; ++ struct au_cpdown_dir_args *args = arg; ++ ++ bstart = au_dbstart(dentry); ++ /* dentry is di-locked */ ++ parent = dget_parent(dentry); ++ dir = parent->d_inode; ++ h_dir = h_parent->d_inode; ++ AuDebugOn(h_dir != au_h_iptr(dir, bdst)); ++ IMustLock(h_dir); ++ ++ err = au_lkup_neg(dentry, bdst); ++ if (unlikely(err < 0)) ++ goto out; ++ h_path.dentry = au_h_dptr(dentry, bdst); ++ h_path.mnt = au_sbr_mnt(dentry->d_sb, bdst); ++ err = vfsub_sio_mkdir(au_h_iptr(dir, bdst), &h_path, ++ S_IRWXU | S_IRUGO | S_IXUGO); ++ if (unlikely(err)) ++ goto out_put; ++ au_fset_cpdown(args->flags, MADE_DIR); ++ ++ bopq = au_dbdiropq(dentry); ++ au_fclr_cpdown(args->flags, WHED); ++ au_fclr_cpdown(args->flags, DIROPQ); ++ if (au_dbwh(dentry) == bdst) ++ au_fset_cpdown(args->flags, 
WHED); ++ if (!au_ftest_cpdown(args->flags, PARENT_OPQ) && bopq <= bdst) ++ au_fset_cpdown(args->flags, PARENT_OPQ); ++ h_inode = h_path.dentry->d_inode; ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ if (au_ftest_cpdown(args->flags, WHED)) { ++ err = au_cpdown_dir_opq(dentry, bdst, args); ++ if (unlikely(err)) { ++ mutex_unlock(&h_inode->i_mutex); ++ goto out_dir; ++ } ++ } ++ ++ err = au_cpdown_attr(&h_path, au_h_dptr(dentry, bstart)); ++ mutex_unlock(&h_inode->i_mutex); ++ if (unlikely(err)) ++ goto out_opq; ++ ++ if (au_ftest_cpdown(args->flags, WHED)) { ++ err = au_cpdown_dir_wh(dentry, h_parent, dir, bdst); ++ if (unlikely(err)) ++ goto out_opq; ++ } ++ ++ inode = dentry->d_inode; ++ if (au_ibend(inode) < bdst) ++ au_set_ibend(inode, bdst); ++ au_set_h_iptr(inode, bdst, au_igrab(h_inode), ++ au_hi_flags(inode, /*isdir*/1)); ++ goto out; /* success */ ++ ++ /* revert */ ++out_opq: ++ if (au_ftest_cpdown(args->flags, DIROPQ)) { ++ mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); ++ rerr = au_diropq_remove(dentry, bdst); ++ mutex_unlock(&h_inode->i_mutex); ++ if (unlikely(rerr)) { ++ AuIOErr("failed removing diropq for %.*s b%d (%d)\n", ++ AuDLNPair(dentry), bdst, rerr); ++ err = -EIO; ++ goto out; ++ } ++ } ++out_dir: ++ if (au_ftest_cpdown(args->flags, MADE_DIR)) { ++ rerr = vfsub_sio_rmdir(au_h_iptr(dir, bdst), &h_path); ++ if (unlikely(rerr)) { ++ AuIOErr("failed removing %.*s b%d (%d)\n", ++ AuDLNPair(dentry), bdst, rerr); ++ err = -EIO; ++ } ++ } ++out_put: ++ au_set_h_dptr(dentry, bdst, NULL); ++ if (au_dbend(dentry) == bdst) ++ au_update_dbend(dentry); ++out: ++ dput(parent); ++ return err; ++} ++ ++int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst) ++{ ++ int err; ++ struct au_cpdown_dir_args args = { ++ .parent = dget_parent(dentry), ++ .flags = 0 ++ }; ++ ++ err = au_cp_dirs(dentry, bdst, au_cpdown_dir, &args); ++ dput(args.parent); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policies for create */ ++ ++static int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ int err, i, j, ndentry; ++ aufs_bindex_t bopq; ++ struct au_dcsub_pages dpages; ++ struct au_dpage *dpage; ++ struct dentry **dentries, *parent, *d; ++ ++ err = au_dpages_init(&dpages, GFP_NOFS); ++ if (unlikely(err)) ++ goto out; ++ parent = dget_parent(dentry); ++ err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/0); ++ if (unlikely(err)) ++ goto out_free; ++ ++ err = bindex; ++ for (i = 0; i < dpages.ndpage; i++) { ++ dpage = dpages.dpages + i; ++ dentries = dpage->dentries; ++ ndentry = dpage->ndentry; ++ for (j = 0; j < ndentry; j++) { ++ d = dentries[j]; ++ di_read_lock_parent2(d, !AuLock_IR); ++ bopq = au_dbdiropq(d); ++ di_read_unlock(d, !AuLock_IR); ++ if (bopq >= 0 && bopq < err) ++ err = bopq; ++ } ++ } ++ ++out_free: ++ dput(parent); ++ au_dpages_free(&dpages); ++out: ++ return err; ++} ++ ++static int au_wbr_bu(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ for (; bindex >= 0; bindex--) ++ if (!au_br_rdonly(au_sbr(sb, bindex))) ++ return bindex; ++ return -EROFS; ++} ++ ++/* top down parent */ ++static int au_wbr_create_tdp(struct dentry *dentry, int isdir __maybe_unused) ++{ ++ int err; ++ aufs_bindex_t bstart, bindex; ++ struct super_block *sb; ++ struct dentry *parent, *h_parent; ++ ++ sb = dentry->d_sb; ++ bstart = au_dbstart(dentry); ++ err = bstart; ++ if (!au_br_rdonly(au_sbr(sb, bstart))) ++ goto out; ++ ++ err = -EROFS; ++ parent = dget_parent(dentry); ++ for (bindex = 
au_dbstart(parent); bindex < bstart; bindex++) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || !h_parent->d_inode) ++ continue; ++ ++ if (!au_br_rdonly(au_sbr(sb, bindex))) { ++ err = bindex; ++ break; ++ } ++ } ++ dput(parent); ++ ++ /* bottom up here */ ++ if (unlikely(err < 0)) { ++ err = au_wbr_bu(sb, bstart - 1); ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ } ++ ++out: ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* an exception for the policy other than tdp */ ++static int au_wbr_create_exp(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bwh, bdiropq; ++ struct dentry *parent; ++ ++ err = -1; ++ bwh = au_dbwh(dentry); ++ parent = dget_parent(dentry); ++ bdiropq = au_dbdiropq(parent); ++ if (bwh >= 0) { ++ if (bdiropq >= 0) ++ err = min(bdiropq, bwh); ++ else ++ err = bwh; ++ AuDbg("%d\n", err); ++ } else if (bdiropq >= 0) { ++ err = bdiropq; ++ AuDbg("%d\n", err); ++ } ++ dput(parent); ++ ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ ++ if (err >= 0 && au_br_rdonly(au_sbr(dentry->d_sb, err))) ++ err = -1; ++ ++ AuDbg("%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* round robin */ ++static int au_wbr_create_init_rr(struct super_block *sb) ++{ ++ int err; ++ ++ err = au_wbr_bu(sb, au_sbend(sb)); ++ atomic_set(&au_sbi(sb)->si_wbr_rr_next, -err); /* less important */ ++ /* smp_mb(); */ ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++static int au_wbr_create_rr(struct dentry *dentry, int isdir) ++{ ++ int err, nbr; ++ unsigned int u; ++ aufs_bindex_t bindex, bend; ++ struct super_block *sb; ++ atomic_t *next; ++ ++ err = au_wbr_create_exp(dentry); ++ if (err >= 0) ++ goto out; ++ ++ sb = dentry->d_sb; ++ next = &au_sbi(sb)->si_wbr_rr_next; ++ bend = au_sbend(sb); ++ nbr = bend + 1; ++ for (bindex = 0; bindex <= bend; bindex++) { ++ if (!isdir) { ++ err = atomic_dec_return(next) + 1; ++ /* modulo for 0 is meaningless */ ++ if (unlikely(!err)) ++ err = atomic_dec_return(next) + 1; ++ } else ++ err = atomic_read(next); ++ AuDbg("%d\n", err); ++ u = err; ++ err = u % nbr; ++ AuDbg("%d\n", err); ++ if (!au_br_rdonly(au_sbr(sb, err))) ++ break; ++ err = -EROFS; ++ } ++ ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ ++out: ++ AuDbg("%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* most free space */ ++static void au_mfs(struct dentry *dentry) ++{ ++ struct super_block *sb; ++ struct au_branch *br; ++ struct au_wbr_mfs *mfs; ++ aufs_bindex_t bindex, bend; ++ int err; ++ unsigned long long b, bavail; ++ struct path h_path; ++ /* reduce the stack usage */ ++ struct kstatfs *st; ++ ++ st = kmalloc(sizeof(*st), GFP_NOFS); ++ if (unlikely(!st)) { ++ AuWarn1("failed updating mfs(%d), ignored\n", -ENOMEM); ++ return; ++ } ++ ++ bavail = 0; ++ sb = dentry->d_sb; ++ mfs = &au_sbi(sb)->si_wbr_mfs; ++ MtxMustLock(&mfs->mfs_lock); ++ mfs->mfs_bindex = -EROFS; ++ mfs->mfsrr_bytes = 0; ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (au_br_rdonly(br)) ++ continue; ++ ++ /* sb->s_root for NFS is unreliable */ ++ h_path.mnt = br->br_mnt; ++ h_path.dentry = h_path.mnt->mnt_root; ++ err = vfs_statfs(&h_path, st); ++ if (unlikely(err)) { ++ AuWarn1("failed statfs, b%d, %d\n", bindex, err); ++ continue; ++ } ++ ++ /* when the available size is equal, select the lower one */ 
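[Illustrative aside, not part of the patch] au_mfs() above walks the writable branches and, as the loop continues below, scores each one by f_bavail * f_bsize reported by vfs_statfs() on the branch root, keeping the branch with the most available bytes (on a tie, the later, i.e. lower, branch wins). A rough user-space equivalent using statfs(2) on a hypothetical list of branch mount points:

/* Pick the path with the most available bytes, the way au_mfs() scores branches. */
#include <stdio.h>
#include <sys/vfs.h>	/* statfs(2) */

static int pick_most_free(const char *const paths[], int n)
{
	struct statfs st;
	unsigned long long best = 0, avail;
	int i, best_idx = -1;

	for (i = 0; i < n; i++) {
		if (statfs(paths[i], &st) < 0)
			continue;	/* skip branches we cannot stat */
		avail = (unsigned long long)st.f_bavail * st.f_bsize;
		if (avail >= best) {	/* >= so that, on a tie, the later entry wins */
			best = avail;
			best_idx = i;
		}
	}
	return best_idx;	/* -1: no usable branch, cf. -EROFS in aufs */
}

int main(void)
{
	const char *branches[] = { "/", "/tmp" };
	printf("most free: %d\n", pick_most_free(branches, 2));
	return 0;
}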
++ BUILD_BUG_ON(sizeof(b) < sizeof(st->f_bavail) ++ || sizeof(b) < sizeof(st->f_bsize)); ++ b = st->f_bavail * st->f_bsize; ++ br->br_wbr->wbr_bytes = b; ++ if (b >= bavail) { ++ bavail = b; ++ mfs->mfs_bindex = bindex; ++ mfs->mfs_jiffy = jiffies; ++ } ++ } ++ ++ mfs->mfsrr_bytes = bavail; ++ AuDbg("b%d\n", mfs->mfs_bindex); ++ kfree(st); ++} ++ ++static int au_wbr_create_mfs(struct dentry *dentry, int isdir __maybe_unused) ++{ ++ int err; ++ struct super_block *sb; ++ struct au_wbr_mfs *mfs; ++ ++ err = au_wbr_create_exp(dentry); ++ if (err >= 0) ++ goto out; ++ ++ sb = dentry->d_sb; ++ mfs = &au_sbi(sb)->si_wbr_mfs; ++ mutex_lock(&mfs->mfs_lock); ++ if (time_after(jiffies, mfs->mfs_jiffy + mfs->mfs_expire) ++ || mfs->mfs_bindex < 0 ++ || au_br_rdonly(au_sbr(sb, mfs->mfs_bindex))) ++ au_mfs(dentry); ++ mutex_unlock(&mfs->mfs_lock); ++ err = mfs->mfs_bindex; ++ ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ ++out: ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++static int au_wbr_create_init_mfs(struct super_block *sb) ++{ ++ struct au_wbr_mfs *mfs; ++ ++ mfs = &au_sbi(sb)->si_wbr_mfs; ++ mutex_init(&mfs->mfs_lock); ++ mfs->mfs_jiffy = 0; ++ mfs->mfs_bindex = -EROFS; ++ ++ return 0; ++} ++ ++static int au_wbr_create_fin_mfs(struct super_block *sb __maybe_unused) ++{ ++ mutex_destroy(&au_sbi(sb)->si_wbr_mfs.mfs_lock); ++ return 0; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* most free space and then round robin */ ++static int au_wbr_create_mfsrr(struct dentry *dentry, int isdir) ++{ ++ int err; ++ struct au_wbr_mfs *mfs; ++ ++ err = au_wbr_create_mfs(dentry, isdir); ++ if (err >= 0) { ++ mfs = &au_sbi(dentry->d_sb)->si_wbr_mfs; ++ mutex_lock(&mfs->mfs_lock); ++ if (mfs->mfsrr_bytes < mfs->mfsrr_watermark) ++ err = au_wbr_create_rr(dentry, isdir); ++ mutex_unlock(&mfs->mfs_lock); ++ } ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++static int au_wbr_create_init_mfsrr(struct super_block *sb) ++{ ++ int err; ++ ++ au_wbr_create_init_mfs(sb); /* ignore */ ++ err = au_wbr_create_init_rr(sb); ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* top down parent and most free space */ ++static int au_wbr_create_pmfs(struct dentry *dentry, int isdir) ++{ ++ int err, e2; ++ unsigned long long b; ++ aufs_bindex_t bindex, bstart, bend; ++ struct super_block *sb; ++ struct dentry *parent, *h_parent; ++ struct au_branch *br; ++ ++ err = au_wbr_create_tdp(dentry, isdir); ++ if (unlikely(err < 0)) ++ goto out; ++ parent = dget_parent(dentry); ++ bstart = au_dbstart(parent); ++ bend = au_dbtaildir(parent); ++ if (bstart == bend) ++ goto out_parent; /* success */ ++ ++ e2 = au_wbr_create_mfs(dentry, isdir); ++ if (e2 < 0) ++ goto out_parent; /* success */ ++ ++ /* when the available size is equal, select upper one */ ++ sb = dentry->d_sb; ++ br = au_sbr(sb, err); ++ b = br->br_wbr->wbr_bytes; ++ AuDbg("b%d, %llu\n", err, b); ++ ++ for (bindex = bstart; bindex <= bend; bindex++) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || !h_parent->d_inode) ++ continue; ++ ++ br = au_sbr(sb, bindex); ++ if (!au_br_rdonly(br) && br->br_wbr->wbr_bytes > b) { ++ b = br->br_wbr->wbr_bytes; ++ err = bindex; ++ AuDbg("b%d, %llu\n", err, b); ++ } ++ } ++ ++ if (err >= 0) ++ err = au_wbr_nonopq(dentry, err); ++ ++out_parent: ++ dput(parent); ++out: ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* policies for 
copyup */ ++ ++/* top down parent */ ++static int au_wbr_copyup_tdp(struct dentry *dentry) ++{ ++ return au_wbr_create_tdp(dentry, /*isdir, anything is ok*/0); ++} ++ ++/* bottom up parent */ ++static int au_wbr_copyup_bup(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bindex, bstart; ++ struct dentry *parent, *h_parent; ++ struct super_block *sb; ++ ++ err = -EROFS; ++ sb = dentry->d_sb; ++ parent = dget_parent(dentry); ++ bstart = au_dbstart(parent); ++ for (bindex = au_dbstart(dentry); bindex >= bstart; bindex--) { ++ h_parent = au_h_dptr(parent, bindex); ++ if (!h_parent || !h_parent->d_inode) ++ continue; ++ ++ if (!au_br_rdonly(au_sbr(sb, bindex))) { ++ err = bindex; ++ break; ++ } ++ } ++ dput(parent); ++ ++ /* bottom up here */ ++ if (unlikely(err < 0)) ++ err = au_wbr_bu(sb, bstart - 1); ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* bottom up */ ++static int au_wbr_copyup_bu(struct dentry *dentry) ++{ ++ int err; ++ aufs_bindex_t bstart; ++ ++ bstart = au_dbstart(dentry); ++ err = au_wbr_bu(dentry->d_sb, bstart); ++ AuDbg("b%d\n", err); ++ if (err > bstart) ++ err = au_wbr_nonopq(dentry, err); ++ ++ AuDbg("b%d\n", err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_wbr_copyup_operations au_wbr_copyup_ops[] = { ++ [AuWbrCopyup_TDP] = { ++ .copyup = au_wbr_copyup_tdp ++ }, ++ [AuWbrCopyup_BUP] = { ++ .copyup = au_wbr_copyup_bup ++ }, ++ [AuWbrCopyup_BU] = { ++ .copyup = au_wbr_copyup_bu ++ } ++}; ++ ++struct au_wbr_create_operations au_wbr_create_ops[] = { ++ [AuWbrCreate_TDP] = { ++ .create = au_wbr_create_tdp ++ }, ++ [AuWbrCreate_RR] = { ++ .create = au_wbr_create_rr, ++ .init = au_wbr_create_init_rr ++ }, ++ [AuWbrCreate_MFS] = { ++ .create = au_wbr_create_mfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_MFSV] = { ++ .create = au_wbr_create_mfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_MFSRR] = { ++ .create = au_wbr_create_mfsrr, ++ .init = au_wbr_create_init_mfsrr, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_MFSRRV] = { ++ .create = au_wbr_create_mfsrr, ++ .init = au_wbr_create_init_mfsrr, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_PMFS] = { ++ .create = au_wbr_create_pmfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ }, ++ [AuWbrCreate_PMFSV] = { ++ .create = au_wbr_create_pmfs, ++ .init = au_wbr_create_init_mfs, ++ .fin = au_wbr_create_fin_mfs ++ } ++}; +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/whout.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,1049 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * whiteout for logical deletion and opaque directory ++ */ ++ ++#include "aufs.h" ++ ++#define WH_MASK S_IRUGO ++ ++/* ++ * If a directory contains this file, then it is opaque. We start with the ++ * .wh. flag so that it is blocked by lookup. ++ */ ++static struct qstr diropq_name = { ++ .name = AUFS_WH_DIROPQ, ++ .len = sizeof(AUFS_WH_DIROPQ) - 1 ++}; ++ ++/* ++ * generate whiteout name, which is NOT terminated by NULL. ++ * @name: original d_name.name ++ * @len: original d_name.len ++ * @wh: whiteout qstr ++ * returns zero when succeeds, otherwise error. ++ * succeeded value as wh->name should be freed by kfree(). ++ */ ++int au_wh_name_alloc(struct qstr *wh, const struct qstr *name) ++{ ++ char *p; ++ ++ if (unlikely(name->len > PATH_MAX - AUFS_WH_PFX_LEN)) ++ return -ENAMETOOLONG; ++ ++ wh->len = name->len + AUFS_WH_PFX_LEN; ++ p = kmalloc(wh->len, GFP_NOFS); ++ wh->name = p; ++ if (p) { ++ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); ++ memcpy(p + AUFS_WH_PFX_LEN, name->name, name->len); ++ /* smp_mb(); */ ++ return 0; ++ } ++ return -ENOMEM; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * test if the @wh_name exists under @h_parent. ++ * @try_sio specifies the necessary of super-io. ++ */ ++int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, ++ struct au_branch *br, int try_sio) ++{ ++ int err; ++ struct dentry *wh_dentry; ++ ++ if (!try_sio) ++ wh_dentry = au_lkup_one(wh_name, h_parent, br, /*nd*/NULL); ++ else ++ wh_dentry = au_sio_lkup_one(wh_name, h_parent, br); ++ err = PTR_ERR(wh_dentry); ++ if (IS_ERR(wh_dentry)) ++ goto out; ++ ++ err = 0; ++ if (!wh_dentry->d_inode) ++ goto out_wh; /* success */ ++ ++ err = 1; ++ if (S_ISREG(wh_dentry->d_inode->i_mode)) ++ goto out_wh; /* success */ ++ ++ err = -EIO; ++ AuIOErr("%.*s Invalid whiteout entry type 0%o.\n", ++ AuDLNPair(wh_dentry), wh_dentry->d_inode->i_mode); ++ ++out_wh: ++ dput(wh_dentry); ++out: ++ return err; ++} ++ ++/* ++ * test if the @h_dentry sets opaque or not. ++ */ ++int au_diropq_test(struct dentry *h_dentry, struct au_branch *br) ++{ ++ int err; ++ struct inode *h_dir; ++ ++ h_dir = h_dentry->d_inode; ++ err = au_wh_test(h_dentry, &diropq_name, br, ++ au_test_h_perm_sio(h_dir, MAY_EXEC)); ++ return err; ++} ++ ++/* ++ * returns a negative dentry whose name is unique and temporary. 
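[Illustrative aside, not part of the patch] au_wh_name_alloc() above builds a whiteout name by prefixing the original name with AUFS_WH_PFX (".wh."), and au_whtmp_lkup() below doubles that prefix and appends a hex counter to obtain a unique temporary name. A small user-space sketch of the plain whiteout-name construction, with hypothetical buffer handling (the kernel qstr is deliberately not NUL-terminated):

/* Build an aufs-style whiteout name: ".wh." + original name. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define WH_PFX		".wh."
#define WH_PFX_LEN	(sizeof(WH_PFX) - 1)

static char *wh_name_alloc(const char *name, size_t len, size_t *wh_len)
{
	char *p = malloc(WH_PFX_LEN + len + 1);

	if (!p)
		return NULL;
	memcpy(p, WH_PFX, WH_PFX_LEN);
	memcpy(p + WH_PFX_LEN, name, len);
	p[WH_PFX_LEN + len] = '\0';	/* for printing only */
	*wh_len = WH_PFX_LEN + len;
	return p;
}

int main(void)
{
	size_t n;
	char *wh = wh_name_alloc("deleted-file", strlen("deleted-file"), &n);

	if (wh) {
		printf("%s (%zu bytes)\n", wh, n);	/* ".wh.deleted-file" */
		free(wh);
	}
	return 0;
}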
++ */ ++struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, ++ struct qstr *prefix) ++{ ++ struct dentry *dentry; ++ int i; ++ char defname[NAME_MAX - AUFS_MAX_NAMELEN + DNAME_INLINE_LEN + 1], ++ *name, *p; ++ /* strict atomic_t is unnecessary here */ ++ static unsigned short cnt; ++ struct qstr qs; ++ ++ BUILD_BUG_ON(sizeof(cnt) * 2 > AUFS_WH_TMP_LEN); ++ ++ name = defname; ++ qs.len = sizeof(defname) - DNAME_INLINE_LEN + prefix->len - 1; ++ if (unlikely(prefix->len > DNAME_INLINE_LEN)) { ++ dentry = ERR_PTR(-ENAMETOOLONG); ++ if (unlikely(qs.len > NAME_MAX)) ++ goto out; ++ dentry = ERR_PTR(-ENOMEM); ++ name = kmalloc(qs.len + 1, GFP_NOFS); ++ if (unlikely(!name)) ++ goto out; ++ } ++ ++ /* doubly whiteout-ed */ ++ memcpy(name, AUFS_WH_PFX AUFS_WH_PFX, AUFS_WH_PFX_LEN * 2); ++ p = name + AUFS_WH_PFX_LEN * 2; ++ memcpy(p, prefix->name, prefix->len); ++ p += prefix->len; ++ *p++ = '.'; ++ AuDebugOn(name + qs.len + 1 - p <= AUFS_WH_TMP_LEN); ++ ++ qs.name = name; ++ for (i = 0; i < 3; i++) { ++ sprintf(p, "%.*x", AUFS_WH_TMP_LEN, cnt++); ++ dentry = au_sio_lkup_one(&qs, h_parent, br); ++ if (IS_ERR(dentry) || !dentry->d_inode) ++ goto out_name; ++ dput(dentry); ++ } ++ /* pr_warning("could not get random name\n"); */ ++ dentry = ERR_PTR(-EEXIST); ++ AuDbg("%.*s\n", AuLNPair(&qs)); ++ BUG(); ++ ++out_name: ++ if (name != defname) ++ kfree(name); ++out: ++ AuTraceErrPtr(dentry); ++ return dentry; ++} ++ ++/* ++ * rename the @h_dentry on @br to the whiteouted temporary name. ++ */ ++int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br) ++{ ++ int err; ++ struct path h_path = { ++ .mnt = br->br_mnt ++ }; ++ struct inode *h_dir; ++ struct dentry *h_parent; ++ ++ h_parent = h_dentry->d_parent; /* dir inode is locked */ ++ h_dir = h_parent->d_inode; ++ IMustLock(h_dir); ++ ++ h_path.dentry = au_whtmp_lkup(h_parent, br, &h_dentry->d_name); ++ err = PTR_ERR(h_path.dentry); ++ if (IS_ERR(h_path.dentry)) ++ goto out; ++ ++ /* under the same dir, no need to lock_rename() */ ++ err = vfsub_rename(h_dir, h_dentry, h_dir, &h_path); ++ AuTraceErr(err); ++ dput(h_path.dentry); ++ ++out: ++ AuTraceErr(err); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * functions for removing a whiteout ++ */ ++ ++static int do_unlink_wh(struct inode *h_dir, struct path *h_path) ++{ ++ int force; ++ ++ /* ++ * forces superio when the dir has a sticky bit. ++ * this may be a violation of unix fs semantics. 
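[Illustrative aside, not part of the patch] The condition described in the comment above, a sticky-bit parent directory whose whiteout entry belongs to another user, is what makes do_unlink_wh() force the unlink through the privileged super-io path. A rough user-space check of the same condition, with hypothetical paths and getuid() standing in for current_fsuid():

/* Decide whether a whiteout unlink would need to be forced, mirroring do_unlink_wh(). */
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

static int need_force(const char *dir, const char *entry)
{
	struct stat d, e;

	if (stat(dir, &d) < 0 || stat(entry, &e) < 0)
		return -1;	/* cannot tell */
	/* sticky directory and the entry is not ours: a plain unlink would be
	 * denied, so aufs performs it as a privileged (super-io) operation */
	return (d.st_mode & S_ISVTX) && e.st_uid != getuid();
}

int main(void)
{
	printf("force: %d\n", need_force("/tmp", "/tmp/somefile"));
	return 0;
}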
++ */ ++ force = (h_dir->i_mode & S_ISVTX) ++ && h_path->dentry->d_inode->i_uid != current_fsuid(); ++ return vfsub_unlink(h_dir, h_path, force); ++} ++ ++int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, ++ struct dentry *dentry) ++{ ++ int err; ++ ++ err = do_unlink_wh(h_dir, h_path); ++ if (!err && dentry) ++ au_set_dbwh(dentry, -1); ++ ++ return err; ++} ++ ++static int unlink_wh_name(struct dentry *h_parent, struct qstr *wh, ++ struct au_branch *br) ++{ ++ int err; ++ struct path h_path = { ++ .mnt = br->br_mnt ++ }; ++ ++ err = 0; ++ h_path.dentry = au_lkup_one(wh, h_parent, br, /*nd*/NULL); ++ if (IS_ERR(h_path.dentry)) ++ err = PTR_ERR(h_path.dentry); ++ else { ++ if (h_path.dentry->d_inode ++ && S_ISREG(h_path.dentry->d_inode->i_mode)) ++ err = do_unlink_wh(h_parent->d_inode, &h_path); ++ dput(h_path.dentry); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * initialize/clean whiteout for a branch ++ */ ++ ++static void au_wh_clean(struct inode *h_dir, struct path *whpath, ++ const int isdir) ++{ ++ int err; ++ ++ if (!whpath->dentry->d_inode) ++ return; ++ ++ err = mnt_want_write(whpath->mnt); ++ if (!err) { ++ if (isdir) ++ err = vfsub_rmdir(h_dir, whpath); ++ else ++ err = vfsub_unlink(h_dir, whpath, /*force*/0); ++ mnt_drop_write(whpath->mnt); ++ } ++ if (unlikely(err)) ++ pr_warning("failed removing %.*s (%d), ignored.\n", ++ AuDLNPair(whpath->dentry), err); ++} ++ ++static int test_linkable(struct dentry *h_root) ++{ ++ struct inode *h_dir = h_root->d_inode; ++ ++ if (h_dir->i_op->link) ++ return 0; ++ ++ pr_err("%.*s (%s) doesn't support link(2), use noplink and rw+nolwh\n", ++ AuDLNPair(h_root), au_sbtype(h_root->d_sb)); ++ return -ENOSYS; ++} ++ ++/* todo: should this mkdir be done in /sbin/mount.aufs helper? 
*/ ++static int au_whdir(struct inode *h_dir, struct path *path) ++{ ++ int err; ++ ++ err = -EEXIST; ++ if (!path->dentry->d_inode) { ++ int mode = S_IRWXU; ++ ++ if (au_test_nfs(path->dentry->d_sb)) ++ mode |= S_IXUGO; ++ err = mnt_want_write(path->mnt); ++ if (!err) { ++ err = vfsub_mkdir(h_dir, path, mode); ++ mnt_drop_write(path->mnt); ++ } ++ } else if (S_ISDIR(path->dentry->d_inode->i_mode)) ++ err = 0; ++ else ++ pr_err("unknown %.*s exists\n", AuDLNPair(path->dentry)); ++ ++ return err; ++} ++ ++struct au_wh_base { ++ const struct qstr *name; ++ struct dentry *dentry; ++}; ++ ++static void au_wh_init_ro(struct inode *h_dir, struct au_wh_base base[], ++ struct path *h_path) ++{ ++ h_path->dentry = base[AuBrWh_BASE].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/0); ++ h_path->dentry = base[AuBrWh_PLINK].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++ h_path->dentry = base[AuBrWh_ORPH].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++} ++ ++/* ++ * returns tri-state, ++ * minus: error, caller should print the mesage ++ * zero: succuess ++ * plus: error, caller should NOT print the mesage ++ */ ++static int au_wh_init_rw_nolink(struct dentry *h_root, struct au_wbr *wbr, ++ int do_plink, struct au_wh_base base[], ++ struct path *h_path) ++{ ++ int err; ++ struct inode *h_dir; ++ ++ h_dir = h_root->d_inode; ++ h_path->dentry = base[AuBrWh_BASE].dentry; ++ au_wh_clean(h_dir, h_path, /*isdir*/0); ++ h_path->dentry = base[AuBrWh_PLINK].dentry; ++ if (do_plink) { ++ err = test_linkable(h_root); ++ if (unlikely(err)) { ++ err = 1; ++ goto out; ++ } ++ ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); ++ } else ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++ h_path->dentry = base[AuBrWh_ORPH].dentry; ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); ++ ++out: ++ return err; ++} ++ ++/* ++ * for the moment, aufs supports the branch filesystem which does not support ++ * link(2). testing on FAT which does not support i_op->setattr() fully either, ++ * copyup failed. finally, such filesystem will not be used as the writable ++ * branch. ++ * ++ * returns tri-state, see above. ++ */ ++static int au_wh_init_rw(struct dentry *h_root, struct au_wbr *wbr, ++ int do_plink, struct au_wh_base base[], ++ struct path *h_path) ++{ ++ int err; ++ struct inode *h_dir; ++ ++ WbrWhMustWriteLock(wbr); ++ ++ err = test_linkable(h_root); ++ if (unlikely(err)) { ++ err = 1; ++ goto out; ++ } ++ ++ /* ++ * todo: should this create be done in /sbin/mount.aufs helper? 
++ */ ++ err = -EEXIST; ++ h_dir = h_root->d_inode; ++ if (!base[AuBrWh_BASE].dentry->d_inode) { ++ err = mnt_want_write(h_path->mnt); ++ if (!err) { ++ h_path->dentry = base[AuBrWh_BASE].dentry; ++ err = vfsub_create(h_dir, h_path, WH_MASK); ++ mnt_drop_write(h_path->mnt); ++ } ++ } else if (S_ISREG(base[AuBrWh_BASE].dentry->d_inode->i_mode)) ++ err = 0; ++ else ++ pr_err("unknown %.*s/%.*s exists\n", ++ AuDLNPair(h_root), AuDLNPair(base[AuBrWh_BASE].dentry)); ++ if (unlikely(err)) ++ goto out; ++ ++ h_path->dentry = base[AuBrWh_PLINK].dentry; ++ if (do_plink) { ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); ++ } else ++ au_wh_clean(h_dir, h_path, /*isdir*/1); ++ wbr->wbr_whbase = dget(base[AuBrWh_BASE].dentry); ++ ++ h_path->dentry = base[AuBrWh_ORPH].dentry; ++ err = au_whdir(h_dir, h_path); ++ if (unlikely(err)) ++ goto out; ++ wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); ++ ++out: ++ return err; ++} ++ ++/* ++ * initialize the whiteout base file/dir for @br. ++ */ ++int au_wh_init(struct dentry *h_root, struct au_branch *br, ++ struct super_block *sb) ++{ ++ int err, i; ++ const unsigned char do_plink ++ = !!au_opt_test(au_mntflags(sb), PLINK); ++ struct path path = { ++ .mnt = br->br_mnt ++ }; ++ struct inode *h_dir; ++ struct au_wbr *wbr = br->br_wbr; ++ static const struct qstr base_name[] = { ++ [AuBrWh_BASE] = { ++ .name = AUFS_BASE_NAME, ++ .len = sizeof(AUFS_BASE_NAME) - 1 ++ }, ++ [AuBrWh_PLINK] = { ++ .name = AUFS_PLINKDIR_NAME, ++ .len = sizeof(AUFS_PLINKDIR_NAME) - 1 ++ }, ++ [AuBrWh_ORPH] = { ++ .name = AUFS_ORPHDIR_NAME, ++ .len = sizeof(AUFS_ORPHDIR_NAME) - 1 ++ } ++ }; ++ struct au_wh_base base[] = { ++ [AuBrWh_BASE] = { ++ .name = base_name + AuBrWh_BASE, ++ .dentry = NULL ++ }, ++ [AuBrWh_PLINK] = { ++ .name = base_name + AuBrWh_PLINK, ++ .dentry = NULL ++ }, ++ [AuBrWh_ORPH] = { ++ .name = base_name + AuBrWh_ORPH, ++ .dentry = NULL ++ } ++ }; ++ ++ if (wbr) ++ WbrWhMustWriteLock(wbr); ++ ++ for (i = 0; i < AuBrWh_Last; i++) { ++ /* doubly whiteouted */ ++ struct dentry *d; ++ ++ d = au_wh_lkup(h_root, (void *)base[i].name, br); ++ err = PTR_ERR(d); ++ if (IS_ERR(d)) ++ goto out; ++ ++ base[i].dentry = d; ++ AuDebugOn(wbr ++ && wbr->wbr_wh[i] ++ && wbr->wbr_wh[i] != base[i].dentry); ++ } ++ ++ if (wbr) ++ for (i = 0; i < AuBrWh_Last; i++) { ++ dput(wbr->wbr_wh[i]); ++ wbr->wbr_wh[i] = NULL; ++ } ++ ++ err = 0; ++ if (!au_br_writable(br->br_perm)) { ++ h_dir = h_root->d_inode; ++ au_wh_init_ro(h_dir, base, &path); ++ } else if (!au_br_wh_linkable(br->br_perm)) { ++ err = au_wh_init_rw_nolink(h_root, wbr, do_plink, base, &path); ++ if (err > 0) ++ goto out; ++ else if (err) ++ goto out_err; ++ } else { ++ err = au_wh_init_rw(h_root, wbr, do_plink, base, &path); ++ if (err > 0) ++ goto out; ++ else if (err) ++ goto out_err; ++ } ++ goto out; /* success */ ++ ++out_err: ++ pr_err("an error(%d) on the writable branch %.*s(%s)\n", ++ err, AuDLNPair(h_root), au_sbtype(h_root->d_sb)); ++out: ++ for (i = 0; i < AuBrWh_Last; i++) ++ dput(base[i].dentry); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++/* ++ * whiteouts are all hard-linked usually. ++ * when its link count reaches a ceiling, we create a new whiteout base ++ * asynchronously. 
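[Illustrative aside, not part of the patch] As the comment above explains, a whiteout is normally just another hard link to a per-branch base file; only when the link count hits its ceiling (vfs_link() returning -EMLINK) does aufs create a fresh whiteout and re-initialize the base asynchronously, as link_or_create_wh() further below does. A user-space sketch of that link-then-create fallback, with hypothetical path names:

/* Try to hard-link a whiteout to the shared base; fall back to creating it. */
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static int make_whiteout(const char *base, const char *wh_path)
{
	int fd;

	if (link(base, wh_path) == 0)
		return 0;		/* cheap case: one more hard link */
	if (errno != EMLINK)
		return -errno;		/* real failure */

	/* link count ceiling reached: create a new empty whiteout instead
	 * (aufs additionally kicks off re-initialization of the base file) */
	fd = open(wh_path, O_CREAT | O_EXCL | O_WRONLY, 0444);
	if (fd < 0)
		return -errno;
	close(fd);
	return 0;
}

int main(void)
{
	int err = make_whiteout("/tmp/.wh-base", "/tmp/.wh.example");

	printf("make_whiteout: %d\n", err);
	return 0;
}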
++ */ ++ ++struct reinit_br_wh { ++ struct super_block *sb; ++ struct au_branch *br; ++}; ++ ++static void reinit_br_wh(void *arg) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct path h_path; ++ struct reinit_br_wh *a = arg; ++ struct au_wbr *wbr; ++ struct inode *dir; ++ struct dentry *h_root; ++ struct au_hinode *hdir; ++ ++ err = 0; ++ wbr = a->br->br_wbr; ++ /* big aufs lock */ ++ si_noflush_write_lock(a->sb); ++ if (!au_br_writable(a->br->br_perm)) ++ goto out; ++ bindex = au_br_index(a->sb, a->br->br_id); ++ if (unlikely(bindex < 0)) ++ goto out; ++ ++ di_read_lock_parent(a->sb->s_root, AuLock_IR); ++ dir = a->sb->s_root->d_inode; ++ hdir = au_hi(dir, bindex); ++ h_root = au_h_dptr(a->sb->s_root, bindex); ++ ++ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ wbr_wh_write_lock(wbr); ++ err = au_h_verify(wbr->wbr_whbase, au_opt_udba(a->sb), hdir->hi_inode, ++ h_root, a->br); ++ if (!err) { ++ err = mnt_want_write(a->br->br_mnt); ++ if (!err) { ++ h_path.dentry = wbr->wbr_whbase; ++ h_path.mnt = a->br->br_mnt; ++ err = vfsub_unlink(hdir->hi_inode, &h_path, /*force*/0); ++ mnt_drop_write(a->br->br_mnt); ++ } ++ } else { ++ pr_warning("%.*s is moved, ignored\n", ++ AuDLNPair(wbr->wbr_whbase)); ++ err = 0; ++ } ++ dput(wbr->wbr_whbase); ++ wbr->wbr_whbase = NULL; ++ if (!err) ++ err = au_wh_init(h_root, a->br, a->sb); ++ wbr_wh_write_unlock(wbr); ++ au_hn_imtx_unlock(hdir); ++ di_read_unlock(a->sb->s_root, AuLock_IR); ++ ++out: ++ if (wbr) ++ atomic_dec(&wbr->wbr_wh_running); ++ atomic_dec(&a->br->br_count); ++ si_write_unlock(a->sb); ++ au_nwt_done(&au_sbi(a->sb)->si_nowait); ++ kfree(arg); ++ if (unlikely(err)) ++ AuIOErr("err %d\n", err); ++} ++ ++static void kick_reinit_br_wh(struct super_block *sb, struct au_branch *br) ++{ ++ int do_dec, wkq_err; ++ struct reinit_br_wh *arg; ++ ++ do_dec = 1; ++ if (atomic_inc_return(&br->br_wbr->wbr_wh_running) != 1) ++ goto out; ++ ++ /* ignore ENOMEM */ ++ arg = kmalloc(sizeof(*arg), GFP_NOFS); ++ if (arg) { ++ /* ++ * dec(wh_running), kfree(arg) and dec(br_count) ++ * in reinit function ++ */ ++ arg->sb = sb; ++ arg->br = br; ++ atomic_inc(&br->br_count); ++ wkq_err = au_wkq_nowait(reinit_br_wh, arg, sb, /*flags*/0); ++ if (unlikely(wkq_err)) { ++ atomic_dec(&br->br_wbr->wbr_wh_running); ++ atomic_dec(&br->br_count); ++ kfree(arg); ++ } ++ do_dec = 0; ++ } ++ ++out: ++ if (do_dec) ++ atomic_dec(&br->br_wbr->wbr_wh_running); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create the whiteout @wh. ++ */ ++static int link_or_create_wh(struct super_block *sb, aufs_bindex_t bindex, ++ struct dentry *wh) ++{ ++ int err; ++ struct path h_path = { ++ .dentry = wh ++ }; ++ struct au_branch *br; ++ struct au_wbr *wbr; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ ++ h_parent = wh->d_parent; /* dir inode is locked */ ++ h_dir = h_parent->d_inode; ++ IMustLock(h_dir); ++ ++ br = au_sbr(sb, bindex); ++ h_path.mnt = br->br_mnt; ++ wbr = br->br_wbr; ++ wbr_wh_read_lock(wbr); ++ if (wbr->wbr_whbase) { ++ err = vfsub_link(wbr->wbr_whbase, h_dir, &h_path); ++ if (!err || err != -EMLINK) ++ goto out; ++ ++ /* link count full. re-initialize br_whbase. */ ++ kick_reinit_br_wh(sb, br); ++ } ++ ++ /* return this error in this context */ ++ err = vfsub_create(h_dir, &h_path, WH_MASK); ++ ++out: ++ wbr_wh_read_unlock(wbr); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create or remove the diropq. 
++ */ ++static struct dentry *do_diropq(struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int flags) ++{ ++ struct dentry *opq_dentry, *h_dentry; ++ struct super_block *sb; ++ struct au_branch *br; ++ int err; ++ ++ sb = dentry->d_sb; ++ br = au_sbr(sb, bindex); ++ h_dentry = au_h_dptr(dentry, bindex); ++ opq_dentry = au_lkup_one(&diropq_name, h_dentry, br, /*nd*/NULL); ++ if (IS_ERR(opq_dentry)) ++ goto out; ++ ++ if (au_ftest_diropq(flags, CREATE)) { ++ err = link_or_create_wh(sb, bindex, opq_dentry); ++ if (!err) { ++ au_set_dbdiropq(dentry, bindex); ++ goto out; /* success */ ++ } ++ } else { ++ struct path tmp = { ++ .dentry = opq_dentry, ++ .mnt = br->br_mnt ++ }; ++ err = do_unlink_wh(au_h_iptr(dentry->d_inode, bindex), &tmp); ++ if (!err) ++ au_set_dbdiropq(dentry, -1); ++ } ++ dput(opq_dentry); ++ opq_dentry = ERR_PTR(err); ++ ++out: ++ return opq_dentry; ++} ++ ++struct do_diropq_args { ++ struct dentry **errp; ++ struct dentry *dentry; ++ aufs_bindex_t bindex; ++ unsigned int flags; ++}; ++ ++static void call_do_diropq(void *args) ++{ ++ struct do_diropq_args *a = args; ++ *a->errp = do_diropq(a->dentry, a->bindex, a->flags); ++} ++ ++struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int flags) ++{ ++ struct dentry *diropq, *h_dentry; ++ ++ h_dentry = au_h_dptr(dentry, bindex); ++ if (!au_test_h_perm_sio(h_dentry->d_inode, MAY_EXEC | MAY_WRITE)) ++ diropq = do_diropq(dentry, bindex, flags); ++ else { ++ int wkq_err; ++ struct do_diropq_args args = { ++ .errp = &diropq, ++ .dentry = dentry, ++ .bindex = bindex, ++ .flags = flags ++ }; ++ ++ wkq_err = au_wkq_wait(call_do_diropq, &args); ++ if (unlikely(wkq_err)) ++ diropq = ERR_PTR(wkq_err); ++ } ++ ++ return diropq; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * lookup whiteout dentry. ++ * @h_parent: lower parent dentry which must exist and be locked ++ * @base_name: name of dentry which will be whiteouted ++ * returns dentry for whiteout. ++ */ ++struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, ++ struct au_branch *br) ++{ ++ int err; ++ struct qstr wh_name; ++ struct dentry *wh_dentry; ++ ++ err = au_wh_name_alloc(&wh_name, base_name); ++ wh_dentry = ERR_PTR(err); ++ if (!err) { ++ wh_dentry = au_lkup_one(&wh_name, h_parent, br, /*nd*/NULL); ++ kfree(wh_name.name); ++ } ++ return wh_dentry; ++} ++ ++/* ++ * link/create a whiteout for @dentry on @bindex. ++ */ ++struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent) ++{ ++ struct dentry *wh_dentry; ++ struct super_block *sb; ++ int err; ++ ++ sb = dentry->d_sb; ++ wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, au_sbr(sb, bindex)); ++ if (!IS_ERR(wh_dentry) && !wh_dentry->d_inode) { ++ err = link_or_create_wh(sb, bindex, wh_dentry); ++ if (!err) ++ au_set_dbwh(dentry, bindex); ++ else { ++ dput(wh_dentry); ++ wh_dentry = ERR_PTR(err); ++ } ++ } ++ ++ return wh_dentry; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* Delete all whiteouts in this directory on branch bindex. 
*/ ++static int del_wh_children(struct dentry *h_dentry, struct au_nhash *whlist, ++ aufs_bindex_t bindex, struct au_branch *br) ++{ ++ int err; ++ unsigned long ul, n; ++ struct qstr wh_name; ++ char *p; ++ struct hlist_head *head; ++ struct au_vdir_wh *tpos; ++ struct hlist_node *pos; ++ struct au_vdir_destr *str; ++ ++ err = -ENOMEM; ++ p = __getname_gfp(GFP_NOFS); ++ wh_name.name = p; ++ if (unlikely(!wh_name.name)) ++ goto out; ++ ++ err = 0; ++ memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); ++ p += AUFS_WH_PFX_LEN; ++ n = whlist->nh_num; ++ head = whlist->nh_head; ++ for (ul = 0; !err && ul < n; ul++, head++) { ++ hlist_for_each_entry(tpos, pos, head, wh_hash) { ++ if (tpos->wh_bindex != bindex) ++ continue; ++ ++ str = &tpos->wh_str; ++ if (str->len + AUFS_WH_PFX_LEN <= PATH_MAX) { ++ memcpy(p, str->name, str->len); ++ wh_name.len = AUFS_WH_PFX_LEN + str->len; ++ err = unlink_wh_name(h_dentry, &wh_name, br); ++ if (!err) ++ continue; ++ break; ++ } ++ AuIOErr("whiteout name too long %.*s\n", ++ str->len, str->name); ++ err = -EIO; ++ break; ++ } ++ } ++ __putname(wh_name.name); ++ ++out: ++ return err; ++} ++ ++struct del_wh_children_args { ++ int *errp; ++ struct dentry *h_dentry; ++ struct au_nhash *whlist; ++ aufs_bindex_t bindex; ++ struct au_branch *br; ++}; ++ ++static void call_del_wh_children(void *args) ++{ ++ struct del_wh_children_args *a = args; ++ *a->errp = del_wh_children(a->h_dentry, a->whlist, a->bindex, a->br); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp) ++{ ++ struct au_whtmp_rmdir *whtmp; ++ int err; ++ unsigned int rdhash; ++ ++ SiMustAnyLock(sb); ++ ++ whtmp = kmalloc(sizeof(*whtmp), gfp); ++ if (unlikely(!whtmp)) { ++ whtmp = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ whtmp->dir = NULL; ++ whtmp->br = NULL; ++ whtmp->wh_dentry = NULL; ++ /* no estimation for dir size */ ++ rdhash = au_sbi(sb)->si_rdhash; ++ if (!rdhash) ++ rdhash = AUFS_RDHASH_DEF; ++ err = au_nhash_alloc(&whtmp->whlist, rdhash, gfp); ++ if (unlikely(err)) { ++ kfree(whtmp); ++ whtmp = ERR_PTR(err); ++ } ++ ++out: ++ return whtmp; ++} ++ ++void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp) ++{ ++ if (whtmp->br) ++ atomic_dec(&whtmp->br->br_count); ++ dput(whtmp->wh_dentry); ++ iput(whtmp->dir); ++ au_nhash_wh_free(&whtmp->whlist); ++ kfree(whtmp); ++} ++ ++/* ++ * rmdir the whiteouted temporary named dir @h_dentry. ++ * @whlist: whiteouted children. ++ */ ++int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_nhash *whlist) ++{ ++ int err; ++ struct path h_tmp; ++ struct inode *wh_inode, *h_dir; ++ struct au_branch *br; ++ ++ h_dir = wh_dentry->d_parent->d_inode; /* dir inode is locked */ ++ IMustLock(h_dir); ++ ++ br = au_sbr(dir->i_sb, bindex); ++ wh_inode = wh_dentry->d_inode; ++ mutex_lock_nested(&wh_inode->i_mutex, AuLsc_I_CHILD); ++ ++ /* ++ * someone else might change some whiteouts while we were sleeping. ++ * it means this whlist may have an obsoleted entry. 
++ */ ++ if (!au_test_h_perm_sio(wh_inode, MAY_EXEC | MAY_WRITE)) ++ err = del_wh_children(wh_dentry, whlist, bindex, br); ++ else { ++ int wkq_err; ++ struct del_wh_children_args args = { ++ .errp = &err, ++ .h_dentry = wh_dentry, ++ .whlist = whlist, ++ .bindex = bindex, ++ .br = br ++ }; ++ ++ wkq_err = au_wkq_wait(call_del_wh_children, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ mutex_unlock(&wh_inode->i_mutex); ++ ++ if (!err) { ++ h_tmp.dentry = wh_dentry; ++ h_tmp.mnt = br->br_mnt; ++ err = vfsub_rmdir(h_dir, &h_tmp); ++ } ++ ++ if (!err) { ++ if (au_ibstart(dir) == bindex) { ++ /* todo: dir->i_mutex is necessary */ ++ au_cpup_attr_timesizes(dir); ++ vfsub_drop_nlink(dir); ++ } ++ return 0; /* success */ ++ } ++ ++ pr_warning("failed removing %.*s(%d), ignored\n", ++ AuDLNPair(wh_dentry), err); ++ return err; ++} ++ ++static void call_rmdir_whtmp(void *args) ++{ ++ int err; ++ aufs_bindex_t bindex; ++ struct au_whtmp_rmdir *a = args; ++ struct super_block *sb; ++ struct dentry *h_parent; ++ struct inode *h_dir; ++ struct au_hinode *hdir; ++ ++ /* rmdir by nfsd may cause deadlock with this i_mutex */ ++ /* mutex_lock(&a->dir->i_mutex); */ ++ err = -EROFS; ++ sb = a->dir->i_sb; ++ si_read_lock(sb, !AuLock_FLUSH); ++ if (!au_br_writable(a->br->br_perm)) ++ goto out; ++ bindex = au_br_index(sb, a->br->br_id); ++ if (unlikely(bindex < 0)) ++ goto out; ++ ++ err = -EIO; ++ ii_write_lock_parent(a->dir); ++ h_parent = dget_parent(a->wh_dentry); ++ h_dir = h_parent->d_inode; ++ hdir = au_hi(a->dir, bindex); ++ au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); ++ err = au_h_verify(a->wh_dentry, au_opt_udba(sb), h_dir, h_parent, ++ a->br); ++ if (!err) { ++ err = mnt_want_write(a->br->br_mnt); ++ if (!err) { ++ err = au_whtmp_rmdir(a->dir, bindex, a->wh_dentry, ++ &a->whlist); ++ mnt_drop_write(a->br->br_mnt); ++ } ++ } ++ au_hn_imtx_unlock(hdir); ++ dput(h_parent); ++ ii_write_unlock(a->dir); ++ ++out: ++ /* mutex_unlock(&a->dir->i_mutex); */ ++ au_whtmp_rmdir_free(a); ++ si_read_unlock(sb); ++ au_nwt_done(&au_sbi(sb)->si_nowait); ++ if (unlikely(err)) ++ AuIOErr("err %d\n", err); ++} ++ ++void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_whtmp_rmdir *args) ++{ ++ int wkq_err; ++ struct super_block *sb; ++ ++ IMustLock(dir); ++ ++ /* all post-process will be done in do_rmdir_whtmp(). */ ++ sb = dir->i_sb; ++ args->dir = au_igrab(dir); ++ args->br = au_sbr(sb, bindex); ++ atomic_inc(&args->br->br_count); ++ args->wh_dentry = dget(wh_dentry); ++ wkq_err = au_wkq_nowait(call_rmdir_whtmp, args, sb, /*flags*/0); ++ if (unlikely(wkq_err)) { ++ pr_warning("rmdir error %.*s (%d), ignored\n", ++ AuDLNPair(wh_dentry), wkq_err); ++ au_whtmp_rmdir_free(args); ++ } ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/whout.h 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,88 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * whiteout for logical deletion and opaque directory ++ */ ++ ++#ifndef __AUFS_WHOUT_H__ ++#define __AUFS_WHOUT_H__ ++ ++#ifdef __KERNEL__ ++ ++#include "dir.h" ++ ++/* whout.c */ ++int au_wh_name_alloc(struct qstr *wh, const struct qstr *name); ++struct au_branch; ++int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, ++ struct au_branch *br, int try_sio); ++int au_diropq_test(struct dentry *h_dentry, struct au_branch *br); ++struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, ++ struct qstr *prefix); ++int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br); ++int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, ++ struct dentry *dentry); ++int au_wh_init(struct dentry *h_parent, struct au_branch *br, ++ struct super_block *sb); ++ ++/* diropq flags */ ++#define AuDiropq_CREATE 1 ++#define au_ftest_diropq(flags, name) ((flags) & AuDiropq_##name) ++#define au_fset_diropq(flags, name) \ ++ do { (flags) |= AuDiropq_##name; } while (0) ++#define au_fclr_diropq(flags, name) \ ++ do { (flags) &= ~AuDiropq_##name; } while (0) ++ ++struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, ++ unsigned int flags); ++struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, ++ struct au_branch *br); ++struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, ++ struct dentry *h_parent); ++ ++/* real rmdir for the whiteout-ed dir */ ++struct au_whtmp_rmdir { ++ struct inode *dir; ++ struct au_branch *br; ++ struct dentry *wh_dentry; ++ struct au_nhash whlist; ++}; ++ ++struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp); ++void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp); ++int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_nhash *whlist); ++void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, ++ struct dentry *wh_dentry, struct au_whtmp_rmdir *args); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline struct dentry *au_diropq_create(struct dentry *dentry, ++ aufs_bindex_t bindex) ++{ ++ return au_diropq_sio(dentry, bindex, AuDiropq_CREATE); ++} ++ ++static inline int au_diropq_remove(struct dentry *dentry, aufs_bindex_t bindex) ++{ ++ return PTR_ERR(au_diropq_sio(dentry, bindex, !AuDiropq_CREATE)); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_WHOUT_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/wkq.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,214 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * workqueue for asynchronous/super-io operations ++ * todo: try new dredential scheme ++ */ ++ ++#include ++#include "aufs.h" ++ ++/* internal workqueue named AUFS_WKQ_NAME */ ++ ++static struct workqueue_struct *au_wkq; ++ ++struct au_wkinfo { ++ struct work_struct wk; ++ struct kobject *kobj; ++ ++ unsigned int flags; /* see wkq.h */ ++ ++ au_wkq_func_t func; ++ void *args; ++ ++ struct completion *comp; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void wkq_func(struct work_struct *wk) ++{ ++ struct au_wkinfo *wkinfo = container_of(wk, struct au_wkinfo, wk); ++ ++ AuDebugOn(current_fsuid()); ++ AuDebugOn(rlimit(RLIMIT_FSIZE) != RLIM_INFINITY); ++ ++ wkinfo->func(wkinfo->args); ++ if (au_ftest_wkq(wkinfo->flags, WAIT)) ++ complete(wkinfo->comp); ++ else { ++ kobject_put(wkinfo->kobj); ++ module_put(THIS_MODULE); /* todo: ?? */ ++ kfree(wkinfo); ++ } ++} ++ ++/* ++ * Since struct completion is large, try allocating it dynamically. ++ */ ++#if defined(CONFIG_4KSTACKS) || defined(AuTest4KSTACKS) ++#define AuWkqCompDeclare(name) struct completion *comp = NULL ++ ++static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) ++{ ++ *comp = kmalloc(sizeof(**comp), GFP_NOFS); ++ if (*comp) { ++ init_completion(*comp); ++ wkinfo->comp = *comp; ++ return 0; ++ } ++ return -ENOMEM; ++} ++ ++static void au_wkq_comp_free(struct completion *comp) ++{ ++ kfree(comp); ++} ++ ++#else ++ ++/* no braces */ ++#define AuWkqCompDeclare(name) \ ++ DECLARE_COMPLETION_ONSTACK(_ ## name); \ ++ struct completion *comp = &_ ## name ++ ++static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) ++{ ++ wkinfo->comp = *comp; ++ return 0; ++} ++ ++static void au_wkq_comp_free(struct completion *comp __maybe_unused) ++{ ++ /* empty */ ++} ++#endif /* 4KSTACKS */ ++ ++static void au_wkq_run(struct au_wkinfo *wkinfo) ++{ ++ if (au_ftest_wkq(wkinfo->flags, NEST)) { ++ if (au_wkq_test()) { ++ AuWarn1("wkq from wkq, due to a dead dir by UDBA?\n"); ++ AuDebugOn(au_ftest_wkq(wkinfo->flags, WAIT)); ++ } ++ } else ++ au_dbg_verify_kthread(); ++ ++ if (au_ftest_wkq(wkinfo->flags, WAIT)) { ++ INIT_WORK_ONSTACK(&wkinfo->wk, wkq_func); ++ queue_work(au_wkq, &wkinfo->wk); ++ } else { ++ INIT_WORK(&wkinfo->wk, wkq_func); ++ schedule_work(&wkinfo->wk); ++ } ++} ++ ++/* ++ * Be careful. It is easy to make deadlock happen. ++ * processA: lock, wkq and wait ++ * processB: wkq and wait, lock in wkq ++ * --> deadlock ++ */ ++int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args) ++{ ++ int err; ++ AuWkqCompDeclare(comp); ++ struct au_wkinfo wkinfo = { ++ .flags = flags, ++ .func = func, ++ .args = args ++ }; ++ ++ err = au_wkq_comp_alloc(&wkinfo, &comp); ++ if (!err) { ++ au_wkq_run(&wkinfo); ++ /* no timeout, no interrupt */ ++ wait_for_completion(wkinfo.comp); ++ au_wkq_comp_free(comp); ++ destroy_work_on_stack(&wkinfo.wk); ++ } ++ ++ return err; ++ ++} ++ ++/* ++ * Note: dget/dput() in func for aufs dentries are not supported. It will be a ++ * problem in a concurrent umounting. 
++ */ ++int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb, ++ unsigned int flags) ++{ ++ int err; ++ struct au_wkinfo *wkinfo; ++ ++ atomic_inc(&au_sbi(sb)->si_nowait.nw_len); ++ ++ /* ++ * wkq_func() must free this wkinfo. ++ * it highly depends upon the implementation of workqueue. ++ */ ++ err = 0; ++ wkinfo = kmalloc(sizeof(*wkinfo), GFP_NOFS); ++ if (wkinfo) { ++ wkinfo->kobj = &au_sbi(sb)->si_kobj; ++ wkinfo->flags = flags & ~AuWkq_WAIT; ++ wkinfo->func = func; ++ wkinfo->args = args; ++ wkinfo->comp = NULL; ++ kobject_get(wkinfo->kobj); ++ __module_get(THIS_MODULE); /* todo: ?? */ ++ ++ au_wkq_run(wkinfo); ++ } else { ++ err = -ENOMEM; ++ au_nwt_done(&au_sbi(sb)->si_nowait); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++void au_nwt_init(struct au_nowait_tasks *nwt) ++{ ++ atomic_set(&nwt->nw_len, 0); ++ /* smp_mb(); */ /* atomic_set */ ++ init_waitqueue_head(&nwt->nw_wq); ++} ++ ++void au_wkq_fin(void) ++{ ++ destroy_workqueue(au_wkq); ++} ++ ++int __init au_wkq_init(void) ++{ ++ int err; ++ ++ err = 0; ++ BUILD_BUG_ON(!WQ_RESCUER); ++ au_wkq = alloc_workqueue(AUFS_WKQ_NAME, !WQ_RESCUER, WQ_DFL_ACTIVE); ++ if (IS_ERR(au_wkq)) ++ err = PTR_ERR(au_wkq); ++ else if (!au_wkq) ++ err = -ENOMEM; ++ ++ return err; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/wkq.h 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,92 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * workqueue for asynchronous/super-io operations ++ * todo: try new credentials management scheme ++ */ ++ ++#ifndef __AUFS_WKQ_H__ ++#define __AUFS_WKQ_H__ ++ ++#ifdef __KERNEL__ ++ ++struct super_block; ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * in the next operation, wait for the 'nowait' tasks in system-wide workqueue ++ */ ++struct au_nowait_tasks { ++ atomic_t nw_len; ++ wait_queue_head_t nw_wq; ++}; ++ ++/* ---------------------------------------------------------------------- */ ++ ++typedef void (*au_wkq_func_t)(void *args); ++ ++/* wkq flags */ ++#define AuWkq_WAIT 1 ++#define AuWkq_NEST (1 << 1) ++#define au_ftest_wkq(flags, name) ((flags) & AuWkq_##name) ++#define au_fset_wkq(flags, name) \ ++ do { (flags) |= AuWkq_##name; } while (0) ++#define au_fclr_wkq(flags, name) \ ++ do { (flags) &= ~AuWkq_##name; } while (0) ++ ++#ifndef CONFIG_AUFS_HNOTIFY ++#undef AuWkq_NEST ++#define AuWkq_NEST 0 ++#endif ++ ++/* wkq.c */ ++int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args); ++int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb, ++ unsigned int flags); ++void au_nwt_init(struct au_nowait_tasks *nwt); ++int __init au_wkq_init(void); ++void au_wkq_fin(void); ++ ++/* ---------------------------------------------------------------------- */ ++ ++static inline int au_wkq_test(void) ++{ ++ return current->flags & PF_WQ_WORKER; ++} ++ ++static inline int au_wkq_wait(au_wkq_func_t func, void *args) ++{ ++ return au_wkq_do_wait(AuWkq_WAIT, func, args); ++} ++ ++static inline void au_nwt_done(struct au_nowait_tasks *nwt) ++{ ++ if (atomic_dec_and_test(&nwt->nw_len)) ++ wake_up_all(&nwt->nw_wq); ++} ++ ++static inline int au_nwt_flush(struct au_nowait_tasks *nwt) ++{ ++ wait_event(nwt->nw_wq, !atomic_read(&nwt->nw_len)); ++ return 0; ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* __AUFS_WKQ_H__ */ +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/fs/aufs/xino.c 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,1264 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/* ++ * external inode number translation table and bitmap ++ */ ++ ++#include ++#include "aufs.h" ++ ++/* todo: unnecessary to support mmap_sem since kernel-space? */ ++ssize_t xino_fread(au_readf_t func, struct file *file, void *kbuf, size_t size, ++ loff_t *pos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ union { ++ void *k; ++ char __user *u; ++ } buf; ++ ++ buf.k = kbuf; ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ do { ++ /* todo: signal_pending? 
*/ ++ err = func(file, buf.u, size, pos); ++ } while (err == -EAGAIN || err == -EINTR); ++ set_fs(oldfs); ++ ++#if 0 /* reserved for future use */ ++ if (err > 0) ++ fsnotify_access(file->f_dentry); ++#endif ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static ssize_t do_xino_fwrite(au_writef_t func, struct file *file, void *kbuf, ++ size_t size, loff_t *pos) ++{ ++ ssize_t err; ++ mm_segment_t oldfs; ++ union { ++ void *k; ++ const char __user *u; ++ } buf; ++ ++ buf.k = kbuf; ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ do { ++ /* todo: signal_pending? */ ++ err = func(file, buf.u, size, pos); ++ } while (err == -EAGAIN || err == -EINTR); ++ set_fs(oldfs); ++ ++#if 0 /* reserved for future use */ ++ if (err > 0) ++ fsnotify_modify(file->f_dentry); ++#endif ++ ++ return err; ++} ++ ++struct do_xino_fwrite_args { ++ ssize_t *errp; ++ au_writef_t func; ++ struct file *file; ++ void *buf; ++ size_t size; ++ loff_t *pos; ++}; ++ ++static void call_do_xino_fwrite(void *args) ++{ ++ struct do_xino_fwrite_args *a = args; ++ *a->errp = do_xino_fwrite(a->func, a->file, a->buf, a->size, a->pos); ++} ++ ++ssize_t xino_fwrite(au_writef_t func, struct file *file, void *buf, size_t size, ++ loff_t *pos) ++{ ++ ssize_t err; ++ ++ /* todo: signal block and no wkq? */ ++ if (rlimit(RLIMIT_FSIZE) == RLIM_INFINITY) { ++ lockdep_off(); ++ err = do_xino_fwrite(func, file, buf, size, pos); ++ lockdep_on(); ++ } else { ++ /* ++ * it breaks RLIMIT_FSIZE and normal user's limit, ++ * users should care about quota and real 'filesystem full.' ++ */ ++ int wkq_err; ++ struct do_xino_fwrite_args args = { ++ .errp = &err, ++ .func = func, ++ .file = file, ++ .buf = buf, ++ .size = size, ++ .pos = pos ++ }; ++ ++ wkq_err = au_wkq_wait(call_do_xino_fwrite, &args); ++ if (unlikely(wkq_err)) ++ err = wkq_err; ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create a new xinofile at the same place/path as @base_file. 
++ */ ++struct file *au_xino_create2(struct file *base_file, struct file *copy_src) ++{ ++ struct file *file; ++ struct dentry *base, *parent; ++ struct inode *dir; ++ struct qstr *name; ++ struct path path; ++ int err; ++ ++ base = base_file->f_dentry; ++ parent = base->d_parent; /* dir inode is locked */ ++ dir = parent->d_inode; ++ IMustLock(dir); ++ ++ file = ERR_PTR(-EINVAL); ++ name = &base->d_name; ++ path.dentry = vfsub_lookup_one_len(name->name, parent, name->len); ++ if (IS_ERR(path.dentry)) { ++ file = (void *)path.dentry; ++ pr_err("%.*s lookup err %ld\n", ++ AuLNPair(name), PTR_ERR(path.dentry)); ++ goto out; ++ } ++ ++ /* no need to mnt_want_write() since we call dentry_open() later */ ++ err = vfs_create(dir, path.dentry, S_IRUGO | S_IWUGO, NULL); ++ if (unlikely(err)) { ++ file = ERR_PTR(err); ++ pr_err("%.*s create err %d\n", AuLNPair(name), err); ++ goto out_dput; ++ } ++ ++ path.mnt = base_file->f_vfsmnt; ++ file = vfsub_dentry_open(&path, ++ O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE ++ /* | __FMODE_NONOTIFY */); ++ if (IS_ERR(file)) { ++ pr_err("%.*s open err %ld\n", AuLNPair(name), PTR_ERR(file)); ++ goto out_dput; ++ } ++ ++ err = vfsub_unlink(dir, &file->f_path, /*force*/0); ++ if (unlikely(err)) { ++ pr_err("%.*s unlink err %d\n", AuLNPair(name), err); ++ goto out_fput; ++ } ++ ++ if (copy_src) { ++ /* no one can touch copy_src xino */ ++ err = au_copy_file(file, copy_src, ++ i_size_read(copy_src->f_dentry->d_inode)); ++ if (unlikely(err)) { ++ pr_err("%.*s copy err %d\n", AuLNPair(name), err); ++ goto out_fput; ++ } ++ } ++ goto out_dput; /* success */ ++ ++out_fput: ++ fput(file); ++ file = ERR_PTR(err); ++out_dput: ++ dput(path.dentry); ++out: ++ return file; ++} ++ ++struct au_xino_lock_dir { ++ struct au_hinode *hdir; ++ struct dentry *parent; ++ struct mutex *mtx; ++}; ++ ++static void au_xino_lock_dir(struct super_block *sb, struct file *xino, ++ struct au_xino_lock_dir *ldir) ++{ ++ aufs_bindex_t brid, bindex; ++ ++ ldir->hdir = NULL; ++ bindex = -1; ++ brid = au_xino_brid(sb); ++ if (brid >= 0) ++ bindex = au_br_index(sb, brid); ++ if (bindex >= 0) { ++ ldir->hdir = au_hi(sb->s_root->d_inode, bindex); ++ au_hn_imtx_lock_nested(ldir->hdir, AuLsc_I_PARENT); ++ } else { ++ ldir->parent = dget_parent(xino->f_dentry); ++ ldir->mtx = &ldir->parent->d_inode->i_mutex; ++ mutex_lock_nested(ldir->mtx, AuLsc_I_PARENT); ++ } ++} ++ ++static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir) ++{ ++ if (ldir->hdir) ++ au_hn_imtx_unlock(ldir->hdir); ++ else { ++ mutex_unlock(ldir->mtx); ++ dput(ldir->parent); ++ } ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* trucate xino files asynchronously */ ++ ++int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex) ++{ ++ int err; ++ aufs_bindex_t bi, bend; ++ struct au_branch *br; ++ struct file *new_xino, *file; ++ struct super_block *h_sb; ++ struct au_xino_lock_dir ldir; ++ ++ err = -EINVAL; ++ bend = au_sbend(sb); ++ if (unlikely(bindex < 0 || bend < bindex)) ++ goto out; ++ br = au_sbr(sb, bindex); ++ file = br->br_xino.xi_file; ++ if (!file) ++ goto out; ++ ++ au_xino_lock_dir(sb, file, &ldir); ++ /* mnt_want_write() is unnecessary here */ ++ new_xino = au_xino_create2(file, file); ++ au_xino_unlock_dir(&ldir); ++ err = PTR_ERR(new_xino); ++ if (IS_ERR(new_xino)) ++ goto out; ++ err = 0; ++ fput(file); ++ br->br_xino.xi_file = new_xino; ++ ++ h_sb = br->br_mnt->mnt_sb; ++ for (bi = 0; bi <= bend; bi++) { ++ if (unlikely(bi == bindex)) ++ continue; ++ br = au_sbr(sb, 
bi); ++ if (br->br_mnt->mnt_sb != h_sb) ++ continue; ++ ++ fput(br->br_xino.xi_file); ++ br->br_xino.xi_file = new_xino; ++ get_file(new_xino); ++ } ++ ++out: ++ return err; ++} ++ ++struct xino_do_trunc_args { ++ struct super_block *sb; ++ struct au_branch *br; ++}; ++ ++static void xino_do_trunc(void *_args) ++{ ++ struct xino_do_trunc_args *args = _args; ++ struct super_block *sb; ++ struct au_branch *br; ++ struct inode *dir; ++ int err; ++ aufs_bindex_t bindex; ++ ++ err = 0; ++ sb = args->sb; ++ dir = sb->s_root->d_inode; ++ br = args->br; ++ ++ si_noflush_write_lock(sb); ++ ii_read_lock_parent(dir); ++ bindex = au_br_index(sb, br->br_id); ++ err = au_xino_trunc(sb, bindex); ++ if (!err ++ && br->br_xino.xi_file->f_dentry->d_inode->i_blocks ++ >= br->br_xino_upper) ++ br->br_xino_upper += AUFS_XINO_TRUNC_STEP; ++ ++ ii_read_unlock(dir); ++ if (unlikely(err)) ++ pr_warning("err b%d, (%d)\n", bindex, err); ++ atomic_dec(&br->br_xino_running); ++ atomic_dec(&br->br_count); ++ si_write_unlock(sb); ++ au_nwt_done(&au_sbi(sb)->si_nowait); ++ kfree(args); ++} ++ ++static void xino_try_trunc(struct super_block *sb, struct au_branch *br) ++{ ++ struct xino_do_trunc_args *args; ++ int wkq_err; ++ ++ if (br->br_xino.xi_file->f_dentry->d_inode->i_blocks ++ < br->br_xino_upper) ++ return; ++ ++ if (atomic_inc_return(&br->br_xino_running) > 1) ++ goto out; ++ ++ /* lock and kfree() will be called in trunc_xino() */ ++ args = kmalloc(sizeof(*args), GFP_NOFS); ++ if (unlikely(!args)) { ++ AuErr1("no memory\n"); ++ goto out_args; ++ } ++ ++ atomic_inc(&br->br_count); ++ args->sb = sb; ++ args->br = br; ++ wkq_err = au_wkq_nowait(xino_do_trunc, args, sb, /*flags*/0); ++ if (!wkq_err) ++ return; /* success */ ++ ++ pr_err("wkq %d\n", wkq_err); ++ atomic_dec(&br->br_count); ++ ++out_args: ++ kfree(args); ++out: ++ atomic_dec(&br->br_xino_running); ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static int au_xino_do_write(au_writef_t write, struct file *file, ++ ino_t h_ino, ino_t ino) ++{ ++ loff_t pos; ++ ssize_t sz; ++ ++ pos = h_ino; ++ if (unlikely(au_loff_max / sizeof(ino) - 1 < pos)) { ++ AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); ++ return -EFBIG; ++ } ++ pos *= sizeof(ino); ++ sz = xino_fwrite(write, file, &ino, sizeof(ino), &pos); ++ if (sz == sizeof(ino)) ++ return 0; /* success */ ++ ++ AuIOErr("write failed (%zd)\n", sz); ++ return -EIO; ++} ++ ++/* ++ * write @ino to the xinofile for the specified branch{@sb, @bindex} ++ * at the position of @h_ino. ++ * even if @ino is zero, it is written to the xinofile and means no entry. ++ * if the size of the xino file on a specific filesystem exceeds the watermark, ++ * try truncating it. 
++ */ ++int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t ino) ++{ ++ int err; ++ unsigned int mnt_flags; ++ struct au_branch *br; ++ ++ BUILD_BUG_ON(sizeof(long long) != sizeof(au_loff_max) ++ || ((loff_t)-1) > 0); ++ SiMustAnyLock(sb); ++ ++ mnt_flags = au_mntflags(sb); ++ if (!au_opt_test(mnt_flags, XINO)) ++ return 0; ++ ++ br = au_sbr(sb, bindex); ++ err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, ++ h_ino, ino); ++ if (!err) { ++ if (au_opt_test(mnt_flags, TRUNC_XINO) ++ && au_test_fs_trunc_xino(br->br_mnt->mnt_sb)) ++ xino_try_trunc(sb, br); ++ return 0; /* success */ ++ } ++ ++ AuIOErr("write failed (%d)\n", err); ++ return -EIO; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* aufs inode number bitmap */ ++ ++static const int page_bits = (int)PAGE_SIZE * BITS_PER_BYTE; ++static ino_t xib_calc_ino(unsigned long pindex, int bit) ++{ ++ ino_t ino; ++ ++ AuDebugOn(bit < 0 || page_bits <= bit); ++ ino = AUFS_FIRST_INO + pindex * page_bits + bit; ++ return ino; ++} ++ ++static void xib_calc_bit(ino_t ino, unsigned long *pindex, int *bit) ++{ ++ AuDebugOn(ino < AUFS_FIRST_INO); ++ ino -= AUFS_FIRST_INO; ++ *pindex = ino / page_bits; ++ *bit = ino % page_bits; ++} ++ ++static int xib_pindex(struct super_block *sb, unsigned long pindex) ++{ ++ int err; ++ loff_t pos; ++ ssize_t sz; ++ struct au_sbinfo *sbinfo; ++ struct file *xib; ++ unsigned long *p; ++ ++ sbinfo = au_sbi(sb); ++ MtxMustLock(&sbinfo->si_xib_mtx); ++ AuDebugOn(pindex > ULONG_MAX / PAGE_SIZE ++ || !au_opt_test(sbinfo->si_mntflags, XINO)); ++ ++ if (pindex == sbinfo->si_xib_last_pindex) ++ return 0; ++ ++ xib = sbinfo->si_xib; ++ p = sbinfo->si_xib_buf; ++ pos = sbinfo->si_xib_last_pindex; ++ pos *= PAGE_SIZE; ++ sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); ++ if (unlikely(sz != PAGE_SIZE)) ++ goto out; ++ ++ pos = pindex; ++ pos *= PAGE_SIZE; ++ if (i_size_read(xib->f_dentry->d_inode) >= pos + PAGE_SIZE) ++ sz = xino_fread(sbinfo->si_xread, xib, p, PAGE_SIZE, &pos); ++ else { ++ memset(p, 0, PAGE_SIZE); ++ sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); ++ } ++ if (sz == PAGE_SIZE) { ++ sbinfo->si_xib_last_pindex = pindex; ++ return 0; /* success */ ++ } ++ ++out: ++ AuIOErr1("write failed (%zd)\n", sz); ++ err = sz; ++ if (sz >= 0) ++ err = -EIO; ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++static void au_xib_clear_bit(struct inode *inode) ++{ ++ int err, bit; ++ unsigned long pindex; ++ struct super_block *sb; ++ struct au_sbinfo *sbinfo; ++ ++ AuDebugOn(inode->i_nlink); ++ ++ sb = inode->i_sb; ++ xib_calc_bit(inode->i_ino, &pindex, &bit); ++ AuDebugOn(page_bits <= bit); ++ sbinfo = au_sbi(sb); ++ mutex_lock(&sbinfo->si_xib_mtx); ++ err = xib_pindex(sb, pindex); ++ if (!err) { ++ clear_bit(bit, sbinfo->si_xib_buf); ++ sbinfo->si_xib_next_bit = bit; ++ } ++ mutex_unlock(&sbinfo->si_xib_mtx); ++} ++ ++/* for s_op->delete_inode() */ ++void au_xino_delete_inode(struct inode *inode, const int unlinked) ++{ ++ int err; ++ unsigned int mnt_flags; ++ aufs_bindex_t bindex, bend, bi; ++ unsigned char try_trunc; ++ struct au_iinfo *iinfo; ++ struct super_block *sb; ++ struct au_hinode *hi; ++ struct inode *h_inode; ++ struct au_branch *br; ++ au_writef_t xwrite; ++ ++ sb = inode->i_sb; ++ mnt_flags = au_mntflags(sb); ++ if (!au_opt_test(mnt_flags, XINO) ++ || inode->i_ino == AUFS_ROOT_INO) ++ return; ++ ++ if (unlinked) { ++ 
au_xigen_inc(inode); ++ au_xib_clear_bit(inode); ++ } ++ ++ iinfo = au_ii(inode); ++ if (!iinfo) ++ return; ++ ++ bindex = iinfo->ii_bstart; ++ if (bindex < 0) ++ return; ++ ++ xwrite = au_sbi(sb)->si_xwrite; ++ try_trunc = !!au_opt_test(mnt_flags, TRUNC_XINO); ++ hi = iinfo->ii_hinode + bindex; ++ bend = iinfo->ii_bend; ++ for (; bindex <= bend; bindex++, hi++) { ++ h_inode = hi->hi_inode; ++ if (!h_inode ++ || (!unlinked && h_inode->i_nlink)) ++ continue; ++ ++ /* inode may not be revalidated */ ++ bi = au_br_index(sb, hi->hi_id); ++ if (bi < 0) ++ continue; ++ ++ br = au_sbr(sb, bi); ++ err = au_xino_do_write(xwrite, br->br_xino.xi_file, ++ h_inode->i_ino, /*ino*/0); ++ if (!err && try_trunc ++ && au_test_fs_trunc_xino(br->br_mnt->mnt_sb)) ++ xino_try_trunc(sb, br); ++ } ++} ++ ++/* get an unused inode number from bitmap */ ++ino_t au_xino_new_ino(struct super_block *sb) ++{ ++ ino_t ino; ++ unsigned long *p, pindex, ul, pend; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ int free_bit, err; ++ ++ if (!au_opt_test(au_mntflags(sb), XINO)) ++ return iunique(sb, AUFS_FIRST_INO); ++ ++ sbinfo = au_sbi(sb); ++ mutex_lock(&sbinfo->si_xib_mtx); ++ p = sbinfo->si_xib_buf; ++ free_bit = sbinfo->si_xib_next_bit; ++ if (free_bit < page_bits && !test_bit(free_bit, p)) ++ goto out; /* success */ ++ free_bit = find_first_zero_bit(p, page_bits); ++ if (free_bit < page_bits) ++ goto out; /* success */ ++ ++ pindex = sbinfo->si_xib_last_pindex; ++ for (ul = pindex - 1; ul < ULONG_MAX; ul--) { ++ err = xib_pindex(sb, ul); ++ if (unlikely(err)) ++ goto out_err; ++ free_bit = find_first_zero_bit(p, page_bits); ++ if (free_bit < page_bits) ++ goto out; /* success */ ++ } ++ ++ file = sbinfo->si_xib; ++ pend = i_size_read(file->f_dentry->d_inode) / PAGE_SIZE; ++ for (ul = pindex + 1; ul <= pend; ul++) { ++ err = xib_pindex(sb, ul); ++ if (unlikely(err)) ++ goto out_err; ++ free_bit = find_first_zero_bit(p, page_bits); ++ if (free_bit < page_bits) ++ goto out; /* success */ ++ } ++ BUG(); ++ ++out: ++ set_bit(free_bit, p); ++ sbinfo->si_xib_next_bit = free_bit + 1; ++ pindex = sbinfo->si_xib_last_pindex; ++ mutex_unlock(&sbinfo->si_xib_mtx); ++ ino = xib_calc_ino(pindex, free_bit); ++ AuDbg("i%lu\n", (unsigned long)ino); ++ return ino; ++out_err: ++ mutex_unlock(&sbinfo->si_xib_mtx); ++ AuDbg("i0\n"); ++ return 0; ++} ++ ++/* ++ * read @ino from xinofile for the specified branch{@sb, @bindex} ++ * at the position of @h_ino. ++ * if @ino does not exist and @do_new is true, get new one. 
++ */ ++int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, ++ ino_t *ino) ++{ ++ int err; ++ ssize_t sz; ++ loff_t pos; ++ struct file *file; ++ struct au_sbinfo *sbinfo; ++ ++ *ino = 0; ++ if (!au_opt_test(au_mntflags(sb), XINO)) ++ return 0; /* no xino */ ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ pos = h_ino; ++ if (unlikely(au_loff_max / sizeof(*ino) - 1 < pos)) { ++ AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); ++ return -EFBIG; ++ } ++ pos *= sizeof(*ino); ++ ++ file = au_sbr(sb, bindex)->br_xino.xi_file; ++ if (i_size_read(file->f_dentry->d_inode) < pos + sizeof(*ino)) ++ return 0; /* no ino */ ++ ++ sz = xino_fread(sbinfo->si_xread, file, ino, sizeof(*ino), &pos); ++ if (sz == sizeof(*ino)) ++ return 0; /* success */ ++ ++ err = sz; ++ if (unlikely(sz >= 0)) { ++ err = -EIO; ++ AuIOErr("xino read error (%zd)\n", sz); ++ } ++ ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* create and set a new xino file */ ++ ++struct file *au_xino_create(struct super_block *sb, char *fname, int silent) ++{ ++ struct file *file; ++ struct dentry *h_parent, *d; ++ struct inode *h_dir; ++ int err; ++ ++ /* ++ * at mount-time, and the xino file is the default path, ++ * hnotify is disabled so we have no notify events to ignore. ++ * when a user specified the xino, we cannot get au_hdir to be ignored. ++ */ ++ file = vfsub_filp_open(fname, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE ++ /* | __FMODE_NONOTIFY */, ++ S_IRUGO | S_IWUGO); ++ if (IS_ERR(file)) { ++ if (!silent) ++ pr_err("open %s(%ld)\n", fname, PTR_ERR(file)); ++ return file; ++ } ++ ++ /* keep file count */ ++ h_parent = dget_parent(file->f_dentry); ++ h_dir = h_parent->d_inode; ++ mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); ++ /* mnt_want_write() is unnecessary here */ ++ err = vfsub_unlink(h_dir, &file->f_path, /*force*/0); ++ mutex_unlock(&h_dir->i_mutex); ++ dput(h_parent); ++ if (unlikely(err)) { ++ if (!silent) ++ pr_err("unlink %s(%d)\n", fname, err); ++ goto out; ++ } ++ ++ err = -EINVAL; ++ d = file->f_dentry; ++ if (unlikely(sb == d->d_sb)) { ++ if (!silent) ++ pr_err("%s must be outside\n", fname); ++ goto out; ++ } ++ if (unlikely(au_test_fs_bad_xino(d->d_sb))) { ++ if (!silent) ++ pr_err("xino doesn't support %s(%s)\n", ++ fname, au_sbtype(d->d_sb)); ++ goto out; ++ } ++ return file; /* success */ ++ ++out: ++ fput(file); ++ file = ERR_PTR(err); ++ return file; ++} ++ ++/* ++ * find another branch who is on the same filesystem of the specified ++ * branch{@btgt}. search until @bend. ++ */ ++static int is_sb_shared(struct super_block *sb, aufs_bindex_t btgt, ++ aufs_bindex_t bend) ++{ ++ aufs_bindex_t bindex; ++ struct super_block *tgt_sb = au_sbr_sb(sb, btgt); ++ ++ for (bindex = 0; bindex < btgt; bindex++) ++ if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) ++ return bindex; ++ for (bindex++; bindex <= bend; bindex++) ++ if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) ++ return bindex; ++ return -1; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * initialize the xinofile for the specified branch @br ++ * at the place/path where @base_file indicates. ++ * test whether another branch is on the same filesystem or not, ++ * if @do_test is true. 
++ */ ++int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t h_ino, ++ struct file *base_file, int do_test) ++{ ++ int err; ++ ino_t ino; ++ aufs_bindex_t bend, bindex; ++ struct au_branch *shared_br, *b; ++ struct file *file; ++ struct super_block *tgt_sb; ++ ++ shared_br = NULL; ++ bend = au_sbend(sb); ++ if (do_test) { ++ tgt_sb = br->br_mnt->mnt_sb; ++ for (bindex = 0; bindex <= bend; bindex++) { ++ b = au_sbr(sb, bindex); ++ if (tgt_sb == b->br_mnt->mnt_sb) { ++ shared_br = b; ++ break; ++ } ++ } ++ } ++ ++ if (!shared_br || !shared_br->br_xino.xi_file) { ++ struct au_xino_lock_dir ldir; ++ ++ au_xino_lock_dir(sb, base_file, &ldir); ++ /* mnt_want_write() is unnecessary here */ ++ file = au_xino_create2(base_file, NULL); ++ au_xino_unlock_dir(&ldir); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ br->br_xino.xi_file = file; ++ } else { ++ br->br_xino.xi_file = shared_br->br_xino.xi_file; ++ get_file(br->br_xino.xi_file); ++ } ++ ++ ino = AUFS_ROOT_INO; ++ err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, ++ h_ino, ino); ++ if (unlikely(err)) { ++ fput(br->br_xino.xi_file); ++ br->br_xino.xi_file = NULL; ++ } ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* trucate a xino bitmap file */ ++ ++/* todo: slow */ ++static int do_xib_restore(struct super_block *sb, struct file *file, void *page) ++{ ++ int err, bit; ++ ssize_t sz; ++ unsigned long pindex; ++ loff_t pos, pend; ++ struct au_sbinfo *sbinfo; ++ au_readf_t func; ++ ino_t *ino; ++ unsigned long *p; ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ MtxMustLock(&sbinfo->si_xib_mtx); ++ p = sbinfo->si_xib_buf; ++ func = sbinfo->si_xread; ++ pend = i_size_read(file->f_dentry->d_inode); ++ pos = 0; ++ while (pos < pend) { ++ sz = xino_fread(func, file, page, PAGE_SIZE, &pos); ++ err = sz; ++ if (unlikely(sz <= 0)) ++ goto out; ++ ++ err = 0; ++ for (ino = page; sz > 0; ino++, sz -= sizeof(ino)) { ++ if (unlikely(*ino < AUFS_FIRST_INO)) ++ continue; ++ ++ xib_calc_bit(*ino, &pindex, &bit); ++ AuDebugOn(page_bits <= bit); ++ err = xib_pindex(sb, pindex); ++ if (!err) ++ set_bit(bit, p); ++ else ++ goto out; ++ } ++ } ++ ++out: ++ return err; ++} ++ ++static int xib_restore(struct super_block *sb) ++{ ++ int err; ++ aufs_bindex_t bindex, bend; ++ void *page; ++ ++ err = -ENOMEM; ++ page = (void *)__get_free_page(GFP_NOFS); ++ if (unlikely(!page)) ++ goto out; ++ ++ err = 0; ++ bend = au_sbend(sb); ++ for (bindex = 0; !err && bindex <= bend; bindex++) ++ if (!bindex || is_sb_shared(sb, bindex, bindex - 1) < 0) ++ err = do_xib_restore ++ (sb, au_sbr(sb, bindex)->br_xino.xi_file, page); ++ else ++ AuDbg("b%d\n", bindex); ++ free_page((unsigned long)page); ++ ++out: ++ return err; ++} ++ ++int au_xib_trunc(struct super_block *sb) ++{ ++ int err; ++ ssize_t sz; ++ loff_t pos; ++ struct au_xino_lock_dir ldir; ++ struct au_sbinfo *sbinfo; ++ unsigned long *p; ++ struct file *file; ++ ++ SiMustWriteLock(sb); ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ if (!au_opt_test(sbinfo->si_mntflags, XINO)) ++ goto out; ++ ++ file = sbinfo->si_xib; ++ if (i_size_read(file->f_dentry->d_inode) <= PAGE_SIZE) ++ goto out; ++ ++ au_xino_lock_dir(sb, file, &ldir); ++ /* mnt_want_write() is unnecessary here */ ++ file = au_xino_create2(sbinfo->si_xib, NULL); ++ au_xino_unlock_dir(&ldir); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = file; ++ ++ p = sbinfo->si_xib_buf; ++ memset(p, 0, PAGE_SIZE); ++ pos = 
0; ++ sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xib, p, PAGE_SIZE, &pos); ++ if (unlikely(sz != PAGE_SIZE)) { ++ err = sz; ++ AuIOErr("err %d\n", err); ++ if (sz >= 0) ++ err = -EIO; ++ goto out; ++ } ++ ++ mutex_lock(&sbinfo->si_xib_mtx); ++ /* mnt_want_write() is unnecessary here */ ++ err = xib_restore(sb); ++ mutex_unlock(&sbinfo->si_xib_mtx); ++ ++out: ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * xino mount option handlers ++ */ ++static au_readf_t find_readf(struct file *h_file) ++{ ++ const struct file_operations *fop = h_file->f_op; ++ ++ if (fop) { ++ if (fop->read) ++ return fop->read; ++ if (fop->aio_read) ++ return do_sync_read; ++ } ++ return ERR_PTR(-ENOSYS); ++} ++ ++static au_writef_t find_writef(struct file *h_file) ++{ ++ const struct file_operations *fop = h_file->f_op; ++ ++ if (fop) { ++ if (fop->write) ++ return fop->write; ++ if (fop->aio_write) ++ return do_sync_write; ++ } ++ return ERR_PTR(-ENOSYS); ++} ++ ++/* xino bitmap */ ++static void xino_clear_xib(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ sbinfo->si_xread = NULL; ++ sbinfo->si_xwrite = NULL; ++ if (sbinfo->si_xib) ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = NULL; ++ free_page((unsigned long)sbinfo->si_xib_buf); ++ sbinfo->si_xib_buf = NULL; ++} ++ ++static int au_xino_set_xib(struct super_block *sb, struct file *base) ++{ ++ int err; ++ loff_t pos; ++ struct au_sbinfo *sbinfo; ++ struct file *file; ++ ++ SiMustWriteLock(sb); ++ ++ sbinfo = au_sbi(sb); ++ file = au_xino_create2(base, sbinfo->si_xib); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto out; ++ if (sbinfo->si_xib) ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = file; ++ sbinfo->si_xread = find_readf(file); ++ sbinfo->si_xwrite = find_writef(file); ++ ++ err = -ENOMEM; ++ if (!sbinfo->si_xib_buf) ++ sbinfo->si_xib_buf = (void *)get_zeroed_page(GFP_NOFS); ++ if (unlikely(!sbinfo->si_xib_buf)) ++ goto out_unset; ++ ++ sbinfo->si_xib_last_pindex = 0; ++ sbinfo->si_xib_next_bit = 0; ++ if (i_size_read(file->f_dentry->d_inode) < PAGE_SIZE) { ++ pos = 0; ++ err = xino_fwrite(sbinfo->si_xwrite, file, sbinfo->si_xib_buf, ++ PAGE_SIZE, &pos); ++ if (unlikely(err != PAGE_SIZE)) ++ goto out_free; ++ } ++ err = 0; ++ goto out; /* success */ ++ ++out_free: ++ free_page((unsigned long)sbinfo->si_xib_buf); ++ sbinfo->si_xib_buf = NULL; ++ if (err >= 0) ++ err = -EIO; ++out_unset: ++ fput(sbinfo->si_xib); ++ sbinfo->si_xib = NULL; ++ sbinfo->si_xread = NULL; ++ sbinfo->si_xwrite = NULL; ++out: ++ return err; ++} ++ ++/* xino for each branch */ ++static void xino_clear_br(struct super_block *sb) ++{ ++ aufs_bindex_t bindex, bend; ++ struct au_branch *br; ++ ++ bend = au_sbend(sb); ++ for (bindex = 0; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (!br || !br->br_xino.xi_file) ++ continue; ++ ++ fput(br->br_xino.xi_file); ++ br->br_xino.xi_file = NULL; ++ } ++} ++ ++static int au_xino_set_br(struct super_block *sb, struct file *base) ++{ ++ int err; ++ ino_t ino; ++ aufs_bindex_t bindex, bend, bshared; ++ struct { ++ struct file *old, *new; ++ } *fpair, *p; ++ struct au_branch *br; ++ struct inode *inode; ++ au_writef_t writef; ++ ++ SiMustWriteLock(sb); ++ ++ err = -ENOMEM; ++ bend = au_sbend(sb); ++ fpair = kcalloc(bend + 1, sizeof(*fpair), GFP_NOFS); ++ if (unlikely(!fpair)) ++ goto out; ++ ++ inode = sb->s_root->d_inode; ++ ino = AUFS_ROOT_INO; ++ writef = au_sbi(sb)->si_xwrite; ++ for (bindex = 0, p 
= fpair; bindex <= bend; bindex++, p++) { ++ br = au_sbr(sb, bindex); ++ bshared = is_sb_shared(sb, bindex, bindex - 1); ++ if (bshared >= 0) { ++ /* shared xino */ ++ *p = fpair[bshared]; ++ get_file(p->new); ++ } ++ ++ if (!p->new) { ++ /* new xino */ ++ p->old = br->br_xino.xi_file; ++ p->new = au_xino_create2(base, br->br_xino.xi_file); ++ err = PTR_ERR(p->new); ++ if (IS_ERR(p->new)) { ++ p->new = NULL; ++ goto out_pair; ++ } ++ } ++ ++ err = au_xino_do_write(writef, p->new, ++ au_h_iptr(inode, bindex)->i_ino, ino); ++ if (unlikely(err)) ++ goto out_pair; ++ } ++ ++ for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) { ++ br = au_sbr(sb, bindex); ++ if (br->br_xino.xi_file) ++ fput(br->br_xino.xi_file); ++ get_file(p->new); ++ br->br_xino.xi_file = p->new; ++ } ++ ++out_pair: ++ for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) ++ if (p->new) ++ fput(p->new); ++ else ++ break; ++ kfree(fpair); ++out: ++ return err; ++} ++ ++void au_xino_clr(struct super_block *sb) ++{ ++ struct au_sbinfo *sbinfo; ++ ++ au_xigen_clr(sb); ++ xino_clear_xib(sb); ++ xino_clear_br(sb); ++ sbinfo = au_sbi(sb); ++ /* lvalue, do not call au_mntflags() */ ++ au_opt_clr(sbinfo->si_mntflags, XINO); ++} ++ ++int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount) ++{ ++ int err, skip; ++ struct dentry *parent, *cur_parent; ++ struct qstr *dname, *cur_name; ++ struct file *cur_xino; ++ struct inode *dir; ++ struct au_sbinfo *sbinfo; ++ ++ SiMustWriteLock(sb); ++ ++ err = 0; ++ sbinfo = au_sbi(sb); ++ parent = dget_parent(xino->file->f_dentry); ++ if (remount) { ++ skip = 0; ++ dname = &xino->file->f_dentry->d_name; ++ cur_xino = sbinfo->si_xib; ++ if (cur_xino) { ++ cur_parent = dget_parent(cur_xino->f_dentry); ++ cur_name = &cur_xino->f_dentry->d_name; ++ skip = (cur_parent == parent ++ && dname->len == cur_name->len ++ && !memcmp(dname->name, cur_name->name, ++ dname->len)); ++ dput(cur_parent); ++ } ++ if (skip) ++ goto out; ++ } ++ ++ au_opt_set(sbinfo->si_mntflags, XINO); ++ dir = parent->d_inode; ++ mutex_lock_nested(&dir->i_mutex, AuLsc_I_PARENT); ++ /* mnt_want_write() is unnecessary here */ ++ err = au_xino_set_xib(sb, xino->file); ++ if (!err) ++ err = au_xigen_set(sb, xino->file); ++ if (!err) ++ err = au_xino_set_br(sb, xino->file); ++ mutex_unlock(&dir->i_mutex); ++ if (!err) ++ goto out; /* success */ ++ ++ /* reset all */ ++ AuIOErr("failed creating xino(%d).\n", err); ++ ++out: ++ dput(parent); ++ return err; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ++ * create a xinofile at the default place/path. 
++ */ ++struct file *au_xino_def(struct super_block *sb) ++{ ++ struct file *file; ++ char *page, *p; ++ struct au_branch *br; ++ struct super_block *h_sb; ++ struct path path; ++ aufs_bindex_t bend, bindex, bwr; ++ ++ br = NULL; ++ bend = au_sbend(sb); ++ bwr = -1; ++ for (bindex = 0; bindex <= bend; bindex++) { ++ br = au_sbr(sb, bindex); ++ if (au_br_writable(br->br_perm) ++ && !au_test_fs_bad_xino(br->br_mnt->mnt_sb)) { ++ bwr = bindex; ++ break; ++ } ++ } ++ ++ if (bwr >= 0) { ++ file = ERR_PTR(-ENOMEM); ++ page = __getname_gfp(GFP_NOFS); ++ if (unlikely(!page)) ++ goto out; ++ path.mnt = br->br_mnt; ++ path.dentry = au_h_dptr(sb->s_root, bwr); ++ p = d_path(&path, page, PATH_MAX - sizeof(AUFS_XINO_FNAME)); ++ file = (void *)p; ++ if (!IS_ERR(p)) { ++ strcat(p, "/" AUFS_XINO_FNAME); ++ AuDbg("%s\n", p); ++ file = au_xino_create(sb, p, /*silent*/0); ++ if (!IS_ERR(file)) ++ au_xino_brid_set(sb, br->br_id); ++ } ++ __putname(page); ++ } else { ++ file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0); ++ if (IS_ERR(file)) ++ goto out; ++ h_sb = file->f_dentry->d_sb; ++ if (unlikely(au_test_fs_bad_xino(h_sb))) { ++ pr_err("xino doesn't support %s(%s)\n", ++ AUFS_XINO_DEFPATH, au_sbtype(h_sb)); ++ fput(file); ++ file = ERR_PTR(-EINVAL); ++ } ++ if (!IS_ERR(file)) ++ au_xino_brid_set(sb, -1); ++ } ++ ++out: ++ return file; ++} ++ ++/* ---------------------------------------------------------------------- */ ++ ++int au_xino_path(struct seq_file *seq, struct file *file) ++{ ++ int err; ++ ++ err = au_seq_path(seq, &file->f_path); ++ if (unlikely(err < 0)) ++ goto out; ++ ++ err = 0; ++#define Deleted "\\040(deleted)" ++ seq->count -= sizeof(Deleted) - 1; ++ AuDebugOn(memcmp(seq->buf + seq->count, Deleted, ++ sizeof(Deleted) - 1)); ++#undef Deleted ++ ++out: ++ return err; ++} +--- /dev/null 2012-03-14 12:35:58.848999748 +0100 ++++ b/include/linux/aufs_type.h 2012-03-20 17:31:18.000000000 +0100 +@@ -0,0 +1,233 @@ ++/* ++ * Copyright (C) 2005-2012 Junjiro R. Okajima ++ * ++ * This program, aufs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef __AUFS_TYPE_H__ ++#define __AUFS_TYPE_H__ ++ ++#define AUFS_NAME "aufs" ++ ++#ifdef __KERNEL__ ++/* ++ * define it before including all other headers. ++ * sched.h may use pr_* macros before defining "current", so define the ++ * no-current version first, and re-define later. ++ */ ++#define pr_fmt(fmt) AUFS_NAME " %s:%d: " fmt, __func__, __LINE__ ++#include ++#undef pr_fmt ++#define pr_fmt(fmt) AUFS_NAME " %s:%d:%s[%d]: " fmt, \ ++ __func__, __LINE__, current->comm, current->pid ++#else ++#include ++#include ++#endif /* __KERNEL__ */ ++ ++#include ++ ++#define AUFS_VERSION "3.x-rcN-20120312" ++ ++/* todo? 
move this to linux-2.6.19/include/magic.h */ ++#define AUFS_SUPER_MAGIC ('a' << 24 | 'u' << 16 | 'f' << 8 | 's') ++ ++/* ---------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_AUFS_BRANCH_MAX_127 ++typedef int8_t aufs_bindex_t; ++#define AUFS_BRANCH_MAX 127 ++#else ++typedef int16_t aufs_bindex_t; ++#ifdef CONFIG_AUFS_BRANCH_MAX_511 ++#define AUFS_BRANCH_MAX 511 ++#elif defined(CONFIG_AUFS_BRANCH_MAX_1023) ++#define AUFS_BRANCH_MAX 1023 ++#elif defined(CONFIG_AUFS_BRANCH_MAX_32767) ++#define AUFS_BRANCH_MAX 32767 ++#endif ++#endif ++ ++#ifdef __KERNEL__ ++#ifndef AUFS_BRANCH_MAX ++#error unknown CONFIG_AUFS_BRANCH_MAX value ++#endif ++#endif /* __KERNEL__ */ ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AUFS_FSTYPE AUFS_NAME ++ ++#define AUFS_ROOT_INO 2 ++#define AUFS_FIRST_INO 11 ++ ++#define AUFS_WH_PFX ".wh." ++#define AUFS_WH_PFX_LEN ((int)sizeof(AUFS_WH_PFX) - 1) ++#define AUFS_WH_TMP_LEN 4 ++/* a limit for rmdir/rename a dir */ ++#define AUFS_MAX_NAMELEN (NAME_MAX \ ++ - AUFS_WH_PFX_LEN * 2 /* doubly whiteouted */\ ++ - 1 /* dot */\ ++ - AUFS_WH_TMP_LEN) /* hex */ ++#define AUFS_XINO_FNAME "." AUFS_NAME ".xino" ++#define AUFS_XINO_DEFPATH "/tmp/" AUFS_XINO_FNAME ++#define AUFS_XINO_TRUNC_INIT 64 /* blocks */ ++#define AUFS_XINO_TRUNC_STEP 4 /* blocks */ ++#define AUFS_DIRWH_DEF 3 ++#define AUFS_RDCACHE_DEF 10 /* seconds */ ++#define AUFS_RDCACHE_MAX 3600 /* seconds */ ++#define AUFS_RDBLK_DEF 512 /* bytes */ ++#define AUFS_RDHASH_DEF 32 ++#define AUFS_WKQ_NAME AUFS_NAME "d" ++#define AUFS_MFS_DEF_SEC 30 /* seconds */ ++#define AUFS_MFS_MAX_SEC 3600 /* seconds */ ++#define AUFS_PLINK_WARN 100 /* number of plinks */ ++ ++/* pseudo-link maintenace under /proc */ ++#define AUFS_PLINK_MAINT_NAME "plink_maint" ++#define AUFS_PLINK_MAINT_DIR "fs/" AUFS_NAME ++#define AUFS_PLINK_MAINT_PATH AUFS_PLINK_MAINT_DIR "/" AUFS_PLINK_MAINT_NAME ++ ++#define AUFS_DIROPQ_NAME AUFS_WH_PFX ".opq" /* whiteouted doubly */ ++#define AUFS_WH_DIROPQ AUFS_WH_PFX AUFS_DIROPQ_NAME ++ ++#define AUFS_BASE_NAME AUFS_WH_PFX AUFS_NAME ++#define AUFS_PLINKDIR_NAME AUFS_WH_PFX "plnk" ++#define AUFS_ORPHDIR_NAME AUFS_WH_PFX "orph" ++ ++/* doubly whiteouted */ ++#define AUFS_WH_BASE AUFS_WH_PFX AUFS_BASE_NAME ++#define AUFS_WH_PLINKDIR AUFS_WH_PFX AUFS_PLINKDIR_NAME ++#define AUFS_WH_ORPHDIR AUFS_WH_PFX AUFS_ORPHDIR_NAME ++ ++/* branch permissions and attributes */ ++#define AUFS_BRPERM_RW "rw" ++#define AUFS_BRPERM_RO "ro" ++#define AUFS_BRPERM_RR "rr" ++#define AUFS_BRRATTR_WH "wh" ++#define AUFS_BRWATTR_NLWH "nolwh" ++ ++/* ---------------------------------------------------------------------- */ ++ ++/* ioctl */ ++enum { ++ /* readdir in userspace */ ++ AuCtl_RDU, ++ AuCtl_RDU_INO, ++ ++ /* pathconf wrapper */ ++ AuCtl_WBR_FD, ++ ++ /* busy inode */ ++ AuCtl_IBUSY ++}; ++ ++/* borrowed from linux/include/linux/kernel.h */ ++#ifndef ALIGN ++#define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) ++#define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) ++#endif ++ ++/* borrowed from linux/include/linux/compiler-gcc3.h */ ++#ifndef __aligned ++#define __aligned(x) __attribute__((aligned(x))) ++#endif ++ ++#ifdef __KERNEL__ ++#ifndef __packed ++#define __packed __attribute__((packed)) ++#endif ++#endif ++ ++struct au_rdu_cookie { ++ uint64_t h_pos; ++ int16_t bindex; ++ uint8_t flags; ++ uint8_t pad; ++ uint32_t generation; ++} __aligned(8); ++ ++struct au_rdu_ent { ++ uint64_t ino; ++ int16_t bindex; ++ uint8_t type; ++ uint8_t nlen; ++ 
uint8_t wh; ++ char name[0]; ++} __aligned(8); ++ ++static inline int au_rdu_len(int nlen) ++{ ++ /* include the terminating NULL */ ++ return ALIGN(sizeof(struct au_rdu_ent) + nlen + 1, ++ sizeof(uint64_t)); ++} ++ ++union au_rdu_ent_ul { ++ struct au_rdu_ent __user *e; ++ uint64_t ul; ++}; ++ ++enum { ++ AufsCtlRduV_SZ, ++ AufsCtlRduV_End ++}; ++ ++struct aufs_rdu { ++ /* input */ ++ union { ++ uint64_t sz; /* AuCtl_RDU */ ++ uint64_t nent; /* AuCtl_RDU_INO */ ++ }; ++ union au_rdu_ent_ul ent; ++ uint16_t verify[AufsCtlRduV_End]; ++ ++ /* input/output */ ++ uint32_t blk; ++ ++ /* output */ ++ union au_rdu_ent_ul tail; ++ /* number of entries which were added in a single call */ ++ uint64_t rent; ++ uint8_t full; ++ uint8_t shwh; ++ ++ struct au_rdu_cookie cookie; ++} __aligned(8); ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct aufs_wbr_fd { ++ uint32_t oflags; ++ int16_t brid; ++} __aligned(8); ++ ++/* ---------------------------------------------------------------------- */ ++ ++struct aufs_ibusy { ++ uint64_t ino, h_ino; ++ int16_t bindex; ++} __aligned(8); ++ ++/* ---------------------------------------------------------------------- */ ++ ++#define AuCtlType 'A' ++#define AUFS_CTL_RDU _IOWR(AuCtlType, AuCtl_RDU, struct aufs_rdu) ++#define AUFS_CTL_RDU_INO _IOWR(AuCtlType, AuCtl_RDU_INO, struct aufs_rdu) ++#define AUFS_CTL_WBR_FD _IOW(AuCtlType, AuCtl_WBR_FD, \ ++ struct aufs_wbr_fd) ++#define AUFS_CTL_IBUSY _IOWR(AuCtlType, AuCtl_IBUSY, struct aufs_ibusy) ++ ++#endif /* __AUFS_TYPE_H__ */ +aufs3.x-rcN proc_map patch + +diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c +index b1822dd..8b29ab7 100644 +--- a/fs/proc/nommu.c ++++ b/fs/proc/nommu.c +@@ -46,6 +46,10 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) + + if (file) { + struct inode *inode = region->vm_file->f_path.dentry->d_inode; ++ if (region->vm_prfile) { ++ file = region->vm_prfile; ++ inode = file->f_path.dentry->d_inode; ++ } + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + } +diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c +index 7dcd2a2..05a146b 100644 +--- a/fs/proc/task_mmu.c ++++ b/fs/proc/task_mmu.c +@@ -222,6 +222,10 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; ++ if (vma->vm_prfile) { ++ file = vma->vm_prfile; ++ inode = file->f_path.dentry->d_inode; ++ } + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; +@@ -1036,6 +1040,8 @@ static int show_numa_map(struct seq_file *m, void *v) + + if (file) { + seq_printf(m, " file="); ++ if (vma->vm_prfile) ++ file = vma->vm_prfile; + seq_path(m, &file->f_path, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); +diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c +index 980de54..4ee031f 100644 +--- a/fs/proc/task_nommu.c ++++ b/fs/proc/task_nommu.c +@@ -148,6 +148,10 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; ++ if (vma->vm_prfile) { ++ file = vma->vm_prfile; ++ inode = file->f_path.dentry->d_inode; ++ } + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 3cc3062..9742239 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -185,6 
+185,7 @@ struct vm_region { + unsigned long vm_top; /* region allocated to here */ + unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ + struct file *vm_file; /* the backing file or NULL */ ++ struct file *vm_prfile; /* the virtual backing file or NULL */ + + int vm_usage; /* region usage count (access under nommu_region_sem) */ + bool vm_icache_flushed : 1; /* true if the icache has been flushed for +@@ -244,6 +245,7 @@ struct vm_area_struct { + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units, *not* PAGE_CACHE_SIZE */ + struct file * vm_file; /* File we map to (can be NULL). */ ++ struct file *vm_prfile; /* shadow of vm_file */ + void * vm_private_data; /* was vm_pte (shared mem) */ + + #ifndef CONFIG_MMU +diff --git a/kernel/fork.c b/kernel/fork.c +index e2cd3e2..2c322f7 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -380,6 +380,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) + struct address_space *mapping = file->f_mapping; + + get_file(file); ++ if (tmp->vm_prfile) ++ get_file(tmp->vm_prfile); + if (tmp->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + mutex_lock(&mapping->i_mmap_mutex); +diff --git a/mm/memory.c b/mm/memory.c +index fa2f04e..03ff1bc 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -2641,6 +2641,8 @@ reuse: + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); ++ if (vma->vm_prfile) ++ file_update_time(vma->vm_prfile); + + return ret; + } +@@ -3326,6 +3328,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); ++ if (vma->vm_prfile) ++ file_update_time(vma->vm_prfile); + } else { + unlock_page(vmf.page); + if (anon) +diff --git a/mm/mmap.c b/mm/mmap.c +index 3f758c7..5518dd3 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -232,6 +232,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) + vma->vm_ops->close(vma); + if (vma->vm_file) { + fput(vma->vm_file); ++ if (vma->vm_prfile) ++ fput(vma->vm_prfile); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(vma->vm_mm); + } +@@ -619,6 +621,8 @@ again: remove_next = 1 + (end > next->vm_end); + if (remove_next) { + if (file) { + fput(file); ++ if (vma->vm_prfile) ++ fput(vma->vm_prfile); + if (next->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } +@@ -1945,6 +1949,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + + if (new->vm_file) { + get_file(new->vm_file); ++ if (new->vm_prfile) ++ get_file(new->vm_prfile); + if (vma->vm_flags & VM_EXECUTABLE) + added_exe_file_vma(mm); + } +@@ -1969,6 +1975,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + fput(new->vm_file); ++ if (new->vm_prfile) ++ fput(new->vm_prfile); + } + unlink_anon_vmas(new); + out_free_mpol: +@@ -2354,6 +2362,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + new_vma->vm_pgoff = pgoff; + if (new_vma->vm_file) { + get_file(new_vma->vm_file); ++ if (new_vma->vm_prfile) ++ get_file(new_vma->vm_prfile); + if (vma->vm_flags & VM_EXECUTABLE) + added_exe_file_vma(mm); + } +diff --git a/mm/nommu.c b/mm/nommu.c +index f59e170..c24bbf3 100644 +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -633,6 +633,8 @@ static void __put_nommu_region(struct vm_region *region) + + if (region->vm_file) + fput(region->vm_file); ++ if (region->vm_prfile) ++ 
fput(region->vm_prfile); + + /* IO memory and memory shared directly out of the pagecache + * from ramfs/tmpfs mustn't be released here */ +@@ -791,6 +793,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) + vma->vm_ops->close(vma); + if (vma->vm_file) { + fput(vma->vm_file); ++ if (vma->vm_prfile) ++ fput(vma->vm_prfile); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } +@@ -1364,6 +1368,8 @@ unsigned long do_mmap_pgoff(struct file *file, + } + } + fput(region->vm_file); ++ if (region->vm_prfile) ++ fput(region->vm_prfile); + kmem_cache_free(vm_region_jar, region); + region = pregion; + result = start; +@@ -1440,9 +1446,13 @@ error_just_free: + error: + if (region->vm_file) + fput(region->vm_file); ++ if (region->vm_prfile) ++ fput(region->vm_prfile); + kmem_cache_free(vm_region_jar, region); + if (vma->vm_file) + fput(vma->vm_file); ++ if (vma->vm_prfile) ++ fput(vma->vm_prfile); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(vma->vm_mm); + kmem_cache_free(vm_area_cachep, vma); diff --git a/3.3.8/cloneconfig.patch b/3.3.8/cloneconfig.patch new file mode 100644 index 0000000..4bfb615 --- /dev/null +++ b/3.3.8/cloneconfig.patch @@ -0,0 +1,41 @@ +From: Andreas Gruenbacher +Subject: Add ``cloneconfig'' target +Patch-mainline: Submitted 24 Feb 2011 + +Cloneconfig takes the first configuration it finds which appears +to belong to the running kernel, and configures the kernel sources +to match this configuration as closely as possible. + +Signed-off-by: Andreas Gruenbacher +Signed-off-by: Jeff Mahoney +--- + + scripts/kconfig/Makefile | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +--- a/scripts/kconfig/Makefile ++++ b/scripts/kconfig/Makefile +@@ -99,6 +99,23 @@ PHONY += allnoconfig allyesconfig allmod + + allnoconfig allyesconfig allmodconfig alldefconfig randconfig: $(obj)/conf + $< --$@ $(Kconfig) ++ ++UNAME_RELEASE := $(shell uname -r) ++CLONECONFIG := $(firstword $(wildcard /proc/config.gz \ ++ /lib/modules/$(UNAME_RELEASE)/.config \ ++ /etc/kernel-config \ ++ /boot/config-$(UNAME_RELEASE))) ++cloneconfig: $(obj)/conf ++ $(Q)case "$(CLONECONFIG)" in \ ++ '') echo -e "The configuration of the running" \ ++ "kernel could not be determined\n"; \ ++ false ;; \ ++ *.gz) gzip -cd $(CLONECONFIG) > .config.running ;; \ ++ *) cat $(CLONECONFIG) > .config.running ;; \ ++ esac && \ ++ echo -e "Cloning configuration file $(CLONECONFIG)\n" ++ $(Q)$< --defconfig=.config.running arch/$(SRCARCH)/Kconfig ++ + + PHONY += listnewconfig oldnoconfig savedefconfig defconfig + diff --git a/3.3.8/colored-printk-3.3.8.patch b/3.3.8/colored-printk-3.3.8.patch new file mode 100644 index 0000000..b9ab83d --- /dev/null +++ b/3.3.8/colored-printk-3.3.8.patch @@ -0,0 +1,337 @@ +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/arch/x86/kernel/early_printk.c linux-2.6.29.3-cprintk/arch/x86/kernel/early_printk.c +--- a/arch/x86/kernel/early_printk.c 2009-03-24 00:12:14.000000000 +0100 ++++ b/arch/x86/kernel/early_printk.c 2009-05-09 16:10:36.000000000 +0200 +@@ -23,7 +23,8 @@ + static int max_ypos = 25, max_xpos = 80; + static int current_ypos = 25, current_xpos; + +-static void early_vga_write(struct console *con, const char *str, unsigned n) ++static void early_vga_write(struct console *con, const char *str, unsigned n, ++ unsigned int loglevel) + { + char c; + int i, k, j; +@@ -93,7 +94,8 @@ static int early_serial_putc(unsigned ch + return timeout ? 
0 : -1; + } + +-static void early_serial_write(struct console *con, const char *s, unsigned n) ++static void early_serial_write(struct console *con, const char *s, unsigned n, ++ unsigned int loglevel) + { + while (*s && n-- > 0) { + if (*s == '\n') +@@ -887,7 +889,7 @@ asmlinkage void early_printk(const char + + va_start(ap, fmt); + n = vscnprintf(buf, sizeof(buf), fmt, ap); +- early_console->write(early_console, buf, n); ++ early_console->write(early_console, buf, n, 0); + va_end(ap); + } + +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/drivers/char/Kconfig linux-2.6.29.3-cprintk/drivers/tty/Kconfig +--- a/drivers/char/Kconfig 2009-03-24 00:12:14.000000000 +0100 ++++ b/drivers/tty/Kconfig 2009-05-09 14:43:48.000000000 +0200 +@@ -66,6 +66,111 @@ config VT_CONSOLE + + If unsure, say Y. + ++menuconfig VT_CKO ++ bool "Colored kernel message output" ++ depends on VT_CONSOLE ++ ---help--- ++ This option enables kernel messages to be emitted in ++ colors other than the default. ++ ++ The color value you need to enter is composed (OR-ed) ++ of a foreground and a background color. ++ ++ Foreground: ++ 0x00 = black, 0x08 = dark gray, ++ 0x01 = red, 0x09 = light red, ++ 0x02 = green, 0x0A = light green, ++ 0x03 = brown, 0x0B = yellow, ++ 0x04 = blue, 0x0C = light blue, ++ 0x05 = magenta, 0x0D = light magenta, ++ 0x06 = cyan, 0x0E = light cyan, ++ 0x07 = gray, 0x0F = white, ++ ++ (Foreground colors 0x08 to 0x0F do not work when a VGA ++ console font with 512 glyphs is used.) ++ ++ Background: ++ 0x00 = black, 0x40 = blue, ++ 0x10 = red, 0x50 = magenta, ++ 0x20 = green, 0x60 = cyan, ++ 0x30 = brown, 0x70 = gray, ++ ++ For example, 0x1F would yield white on red. ++ ++ If unsure, say N. ++ ++config VT_PRINTK_EMERG_COLOR ++ hex "Emergency messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel emergency messages will ++ be printed to the console. ++ ++config VT_PRINTK_ALERT_COLOR ++ hex "Alert messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel alert messages will ++ be printed to the console. ++ ++config VT_PRINTK_CRIT_COLOR ++ hex "Critical messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel critical messages will ++ be printed to the console. ++ ++config VT_PRINTK_ERR_COLOR ++ hex "Error messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel error messages will ++ be printed to the console. ++ ++config VT_PRINTK_WARNING_COLOR ++ hex "Warning messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel warning messages will ++ be printed to the console. ++ ++config VT_PRINTK_NOTICE_COLOR ++ hex "Notice messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel notice messages will ++ be printed to the console. ++ ++config VT_PRINTK_INFO_COLOR ++ hex "Information messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel information messages will ++ be printed to the console. 
++ ++config VT_PRINTK_DEBUG_COLOR ++ hex "Debug messages color" ++ range 0x00 0xFF ++ depends on VT_CKO ++ default 0x07 ++ ---help--- ++ This option defines with which color kernel debug messages will ++ be printed to the console. ++ + config HW_CONSOLE + bool + depends on VT && !S390 && !UML +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/drivers/char/vt.c linux-2.6.29.3-cprintk/drivers/tty/vt/vt.c +--- a/drivers/char/vt.c 2009-05-09 10:46:57.000000000 +0200 ++++ b/drivers/tty/vt/vt.c 2009-05-09 14:43:48.000000000 +0200 +@@ -73,6 +73,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -2431,17 +2432,45 @@ struct tty_driver *console_driver; + + #ifdef CONFIG_VT_CONSOLE + ++#ifdef CONFIG_VT_CKO ++static unsigned int printk_color[8] __read_mostly = { ++ CONFIG_VT_PRINTK_EMERG_COLOR, /* KERN_EMERG */ ++ CONFIG_VT_PRINTK_ALERT_COLOR, /* KERN_ALERT */ ++ CONFIG_VT_PRINTK_CRIT_COLOR, /* KERN_CRIT */ ++ CONFIG_VT_PRINTK_ERR_COLOR, /* KERN_ERR */ ++ CONFIG_VT_PRINTK_WARNING_COLOR, /* KERN_WARNING */ ++ CONFIG_VT_PRINTK_NOTICE_COLOR, /* KERN_NOTICE */ ++ CONFIG_VT_PRINTK_INFO_COLOR, /* KERN_INFO */ ++ CONFIG_VT_PRINTK_DEBUG_COLOR, /* KERN_DEBUG */ ++}; ++module_param_array(printk_color, uint, NULL, S_IRUGO | S_IWUSR); ++ ++static inline void vc_set_color(struct vc_data *vc, unsigned char color) ++{ ++ vc->vc_color = color_table[color & 0xF] | ++ (color_table[(color >> 4) & 0x7] << 4) | ++ (color & 0x80); ++ update_attr(vc); ++} ++#else ++static unsigned int printk_color[8]; ++static inline void vc_set_color(const struct vc_data *vc, unsigned char c) ++{ ++} ++#endif ++ + /* + * Console on virtual terminal + * + * The console must be locked when we get here. + */ + +-static void vt_console_print(struct console *co, const char *b, unsigned count) ++static void vt_console_print(struct console *co, const char *b, unsigned count, ++ unsigned int loglevel) + { + struct vc_data *vc = vc_cons[fg_console].d; +- unsigned char c; + static DEFINE_SPINLOCK(printing_lock); ++ unsigned char current_color, c; + const ushort *start; + ushort cnt = 0; + ushort myx; +@@ -2474,11 +2503,19 @@ static void vt_console_print(struct cons + + start = (ushort *)vc->vc_pos; + ++ /* ++ * We always get a valid loglevel - <8> and "no level" is transformed ++ * to <4> in the typical kernel. 
++ */ ++ current_color = printk_color[loglevel]; ++ vc_set_color(vc, current_color); ++ + /* Contrived structure to try to emulate original need_wrap behaviour + * Problems caused when we have need_wrap set on '\n' character */ + while (count--) { + c = *b++; + if (c == 10 || c == 13 || c == 8 || vc->vc_need_wrap) { ++ vc_set_color(vc, vc->vc_def_color); + if (cnt > 0) { + if (CON_IS_VISIBLE(vc)) + vc->vc_sw->con_putcs(vc, start, cnt, vc->vc_y, vc->vc_x); +@@ -2491,6 +2528,7 @@ static void vt_console_print(struct cons + bs(vc); + start = (ushort *)vc->vc_pos; + myx = vc->vc_x; ++ vc_set_color(vc, current_color); + continue; + } + if (c != 13) +@@ -2498,6 +2536,7 @@ static void vt_console_print(struct cons + cr(vc); + start = (ushort *)vc->vc_pos; + myx = vc->vc_x; ++ vc_set_color(vc, current_color); + if (c == 10 || c == 13) + continue; + } +@@ -2520,6 +2559,7 @@ static void vt_console_print(struct cons + vc->vc_need_wrap = 1; + } + } ++ vc_set_color(vc, vc->vc_def_color); + set_cursor(vc); + notify_update(vc); + +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/drivers/net/netconsole.c linux-2.6.29.3-cprintk/drivers/net/netconsole.c +--- a/drivers/net/netconsole.c 2009-03-24 00:12:14.000000000 +0100 ++++ b/drivers/net/netconsole.c 2009-05-09 14:43:48.000000000 +0200 +@@ -691,7 +691,8 @@ static struct notifier_block netconsole_ + .notifier_call = netconsole_netdev_event, + }; + +-static void write_msg(struct console *con, const char *msg, unsigned int len) ++static void write_msg(struct console *con, const char *msg, unsigned int len, ++ unsigned int loglevel) + { + int frag, left; + unsigned long flags; +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/drivers/serial/8250.c linux-2.6.29.3-cprintk/drivers/tty/serial/8250/8250.c +--- a/drivers/serial/8250.c 2009-03-24 00:12:14.000000000 +0100 ++++ b/drivers/tty/serial/8250/8250.c 2009-05-09 14:43:48.000000000 +0200 +@@ -2698,7 +2698,8 @@ static void serial8250_console_putchar(s + * The console_lock must be held when we get here. 
+ */ + static void +-serial8250_console_write(struct console *co, const char *s, unsigned int count) ++serial8250_console_write(struct console *co, const char *s, unsigned int count, ++ unsigned int loglevel) + { + struct uart_8250_port *up = &serial8250_ports[co->index]; + unsigned long flags; +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/drivers/serial/8250_early.c linux-2.6.29.3-cprintk/drivers/tty/serial/8250/8250_early.c +--- a/drivers/serial/8250_early.c 2009-03-24 00:12:14.000000000 +0100 ++++ b/drivers/tty/serial/8250/8250_early.c 2009-05-09 14:43:48.000000000 +0200 +@@ -83,7 +83,7 @@ static void __init serial_putc(struct ua + } + + static void __init early_serial8250_write(struct console *console, +- const char *s, unsigned int count) ++ const char *s, unsigned int count, unsigned int loglevel) + { + struct uart_port *port = &early_device.port; + unsigned int ier; +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/include/linux/console.h linux-2.6.29.3-cprintk/include/linux/console.h +--- a/include/linux/console.h 2009-03-24 00:12:14.000000000 +0100 ++++ b/include/linux/console.h 2009-05-09 14:43:48.000000000 +0200 +@@ -95,7 +95,7 @@ void give_up_console(const struct consw + + struct console { + char name[16]; +- void (*write)(struct console *, const char *, unsigned); ++ void (*write)(struct console *, const char *, unsigned, unsigned int); + int (*read)(struct console *, char *, unsigned); + struct tty_driver *(*device)(struct console *, int *); + void (*unblank)(void); +diff -pruN -X linux/Documentation/dontdiff linux-2.6.29.3/kernel/printk.c linux-2.6.29.3-cprintk/kernel/printk.c +--- a/kernel/printk.c 2009-03-24 00:12:14.000000000 +0100 ++++ b/kernel/printk.c 2009-05-09 14:43:48.000000000 +0200 +@@ -389,7 +389,8 @@ SYSCALL_DEFINE3(syslog, int, type, char + /* + * Call the console drivers on a range of log_buf + */ +-static void __call_console_drivers(unsigned start, unsigned end) ++static void __call_console_drivers(unsigned start, unsigned end, ++ unsigned int loglevel) + { + struct console *con; + +@@ -397,7 +398,7 @@ static void __call_console_drivers(unsig + if ((con->flags & CON_ENABLED) && con->write && + (cpu_online(smp_processor_id()) || + (con->flags & CON_ANYTIME))) +- con->write(con, &LOG_BUF(start), end - start); ++ con->write(con, &LOG_BUF(start), end - start, loglevel); + } + } + +@@ -424,10 +425,11 @@ static void _call_console_drivers(unsign + if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { + /* wrapped write */ + __call_console_drivers(start & LOG_BUF_MASK, +- log_buf_len); +- __call_console_drivers(0, end & LOG_BUF_MASK); ++ log_buf_len, msg_log_level); ++ __call_console_drivers(0, end & LOG_BUF_MASK, ++ msg_log_level); + } else { +- __call_console_drivers(start, end); ++ __call_console_drivers(start, end, msg_log_level); + } + } + } diff --git a/3.3.8/fs-btrfs-run-delayed-directory-updates-during-log-replay.patch b/3.3.8/fs-btrfs-run-delayed-directory-updates-during-log-replay.patch new file mode 100644 index 0000000..f902a9c --- /dev/null +++ b/3.3.8/fs-btrfs-run-delayed-directory-updates-during-log-replay.patch @@ -0,0 +1,54 @@ +From b6305567e7d31b0bec1b8cb9ec0cadd7f7086f5f Mon Sep 17 00:00:00 2001 +From: Chris Mason +Date: Mon, 2 Jul 2012 15:29:53 -0400 +Subject: Btrfs: run delayed directory updates during log replay + +From: Chris Mason + +commit b6305567e7d31b0bec1b8cb9ec0cadd7f7086f5f upstream. 
+ +While we are resolving directory modifications in the +tree log, we are triggering delayed metadata updates to +the filesystem btrees. + +This commit forces the delayed updates to run so the +replay code can find any modifications done. It stops +us from crashing because the directory deleltion replay +expects items to be removed immediately from the tree. + +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-log.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -690,6 +690,8 @@ static noinline int drop_one_dir_item(st + kfree(name); + + iput(inode); ++ ++ btrfs_run_delayed_items(trans, root); + return ret; + } + +@@ -895,6 +897,7 @@ again: + ret = btrfs_unlink_inode(trans, root, dir, + inode, victim_name, + victim_name_len); ++ btrfs_run_delayed_items(trans, root); + } + kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; +@@ -1475,6 +1478,9 @@ again: + ret = btrfs_unlink_inode(trans, root, dir, inode, + name, name_len); + BUG_ON(ret); ++ ++ btrfs_run_delayed_items(trans, root); ++ + kfree(name); + iput(inode); + diff --git a/3.3.8/fs-ecryptfs-fix-lockdep-warning-in-miscdev-operations.patch b/3.3.8/fs-ecryptfs-fix-lockdep-warning-in-miscdev-operations.patch new file mode 100644 index 0000000..d557282 --- /dev/null +++ b/3.3.8/fs-ecryptfs-fix-lockdep-warning-in-miscdev-operations.patch @@ -0,0 +1,103 @@ +From 60d65f1f07a7d81d3eb3b91fc13fca80f2fdbb12 Mon Sep 17 00:00:00 2001 +From: Tyler Hicks +Date: Mon, 11 Jun 2012 10:21:34 -0700 +Subject: eCryptfs: Fix lockdep warning in miscdev operations + +From: Tyler Hicks + +commit 60d65f1f07a7d81d3eb3b91fc13fca80f2fdbb12 upstream. + +Don't grab the daemon mutex while holding the message context mutex. +Addresses this lockdep warning: + + ecryptfsd/2141 is trying to acquire lock: + (&ecryptfs_msg_ctx_arr[i].mux){+.+.+.}, at: [] ecryptfs_miscdev_read+0x143/0x470 [ecryptfs] + + but task is already holding lock: + (&(*daemon)->mux){+.+...}, at: [] ecryptfs_miscdev_read+0x21c/0x470 [ecryptfs] + + which lock already depends on the new lock. 
+ + the existing dependency chain (in reverse order) is: + + -> #1 (&(*daemon)->mux){+.+...}: + [] lock_acquire+0x9d/0x220 + [] __mutex_lock_common+0x5a/0x4b0 + [] mutex_lock_nested+0x44/0x50 + [] ecryptfs_send_miscdev+0x97/0x120 [ecryptfs] + [] ecryptfs_send_message+0x134/0x1e0 [ecryptfs] + [] ecryptfs_generate_key_packet_set+0x2fe/0xa80 [ecryptfs] + [] ecryptfs_write_metadata+0x108/0x250 [ecryptfs] + [] ecryptfs_create+0x130/0x250 [ecryptfs] + [] vfs_create+0xb4/0x120 + [] do_last+0x8c5/0xa10 + [] path_openat+0xd9/0x460 + [] do_filp_open+0x42/0xa0 + [] do_sys_open+0xf8/0x1d0 + [] sys_open+0x21/0x30 + [] system_call_fastpath+0x16/0x1b + + -> #0 (&ecryptfs_msg_ctx_arr[i].mux){+.+.+.}: + [] __lock_acquire+0x1bf8/0x1c50 + [] lock_acquire+0x9d/0x220 + [] __mutex_lock_common+0x5a/0x4b0 + [] mutex_lock_nested+0x44/0x50 + [] ecryptfs_miscdev_read+0x143/0x470 [ecryptfs] + [] vfs_read+0xb3/0x180 + [] sys_read+0x4d/0x90 + [] system_call_fastpath+0x16/0x1b + +Signed-off-by: Tyler Hicks +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ecryptfs/miscdev.c | 25 +++++++++++++------------ + 1 file changed, 13 insertions(+), 12 deletions(-) + +--- a/fs/ecryptfs/miscdev.c ++++ b/fs/ecryptfs/miscdev.c +@@ -195,31 +195,32 @@ int ecryptfs_send_miscdev(char *data, si + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct ecryptfs_daemon *daemon) + { +- int rc = 0; ++ struct ecryptfs_message *msg; + +- mutex_lock(&msg_ctx->mux); +- msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), +- GFP_KERNEL); +- if (!msg_ctx->msg) { +- rc = -ENOMEM; ++ msg = kmalloc((sizeof(*msg) + data_size), GFP_KERNEL); ++ if (!msg) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc(%zd, GFP_KERNEL)\n", __func__, +- (sizeof(*msg_ctx->msg) + data_size)); +- goto out_unlock; ++ (sizeof(*msg) + data_size)); ++ return -ENOMEM; + } ++ ++ mutex_lock(&msg_ctx->mux); ++ msg_ctx->msg = msg; + msg_ctx->msg->index = msg_ctx->index; + msg_ctx->msg->data_len = data_size; + msg_ctx->type = msg_type; + memcpy(msg_ctx->msg->data, data, data_size); + msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size); +- mutex_lock(&daemon->mux); + list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue); ++ mutex_unlock(&msg_ctx->mux); ++ ++ mutex_lock(&daemon->mux); + daemon->num_queued_msg_ctx++; + wake_up_interruptible(&daemon->wait); + mutex_unlock(&daemon->mux); +-out_unlock: +- mutex_unlock(&msg_ctx->mux); +- return rc; ++ ++ return 0; + } + + /* diff --git a/3.3.8/fs-ecryptfs-gracefully-refuse-miscdev-file-ops-on-inherited-passed-files.patch b/3.3.8/fs-ecryptfs-gracefully-refuse-miscdev-file-ops-on-inherited-passed-files.patch new file mode 100644 index 0000000..f60a64c --- /dev/null +++ b/3.3.8/fs-ecryptfs-gracefully-refuse-miscdev-file-ops-on-inherited-passed-files.patch @@ -0,0 +1,95 @@ +From 8dc6780587c99286c0d3de747a2946a76989414a Mon Sep 17 00:00:00 2001 +From: Tyler Hicks +Date: Mon, 11 Jun 2012 09:24:11 -0700 +Subject: eCryptfs: Gracefully refuse miscdev file ops on inherited/passed files + +From: Tyler Hicks + +commit 8dc6780587c99286c0d3de747a2946a76989414a upstream. + +File operations on /dev/ecryptfs would BUG() when the operations were +performed by processes other than the process that originally opened the +file. This could happen with open files inherited after fork() or file +descriptors passed through IPC mechanisms. Rather than calling BUG(), an +error code can be safely returned in most situations. 
+ +In ecryptfs_miscdev_release(), eCryptfs still needs to handle the +release even if the last file reference is being held by a process that +didn't originally open the file. ecryptfs_find_daemon_by_euid() will not +be successful, so a pointer to the daemon is stored in the file's +private_data. The private_data pointer is initialized when the miscdev +file is opened and only used when the file is released. + +https://launchpad.net/bugs/994247 + +Signed-off-by: Tyler Hicks +Reported-by: Sasha Levin +Tested-by: Sasha Levin +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ecryptfs/miscdev.c | 23 ++++++++++++++++------- + 1 file changed, 16 insertions(+), 7 deletions(-) + +--- a/fs/ecryptfs/miscdev.c ++++ b/fs/ecryptfs/miscdev.c +@@ -49,7 +49,10 @@ ecryptfs_miscdev_poll(struct file *file, + mutex_lock(&ecryptfs_daemon_hash_mux); + /* TODO: Just use file->private_data? */ + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); +- BUG_ON(rc || !daemon); ++ if (rc || !daemon) { ++ mutex_unlock(&ecryptfs_daemon_hash_mux); ++ return -EINVAL; ++ } + mutex_lock(&daemon->mux); + mutex_unlock(&ecryptfs_daemon_hash_mux); + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { +@@ -122,6 +125,7 @@ ecryptfs_miscdev_open(struct inode *inod + goto out_unlock_daemon; + } + daemon->flags |= ECRYPTFS_DAEMON_MISCDEV_OPEN; ++ file->private_data = daemon; + atomic_inc(&ecryptfs_num_miscdev_opens); + out_unlock_daemon: + mutex_unlock(&daemon->mux); +@@ -152,9 +156,9 @@ ecryptfs_miscdev_release(struct inode *i + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); +- BUG_ON(rc || !daemon); ++ if (rc || !daemon) ++ daemon = file->private_data; + mutex_lock(&daemon->mux); +- BUG_ON(daemon->pid != task_pid(current)); + BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN)); + daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN; + atomic_dec(&ecryptfs_num_miscdev_opens); +@@ -269,8 +273,16 @@ ecryptfs_miscdev_read(struct file *file, + mutex_lock(&ecryptfs_daemon_hash_mux); + /* TODO: Just use file->private_data? */ + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns()); +- BUG_ON(rc || !daemon); ++ if (rc || !daemon) { ++ mutex_unlock(&ecryptfs_daemon_hash_mux); ++ return -EINVAL; ++ } + mutex_lock(&daemon->mux); ++ if (task_pid(current) != daemon->pid) { ++ mutex_unlock(&daemon->mux); ++ mutex_unlock(&ecryptfs_daemon_hash_mux); ++ return -EPERM; ++ } + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + rc = 0; + mutex_unlock(&ecryptfs_daemon_hash_mux); +@@ -307,9 +319,6 @@ check_list: + * message from the queue; try again */ + goto check_list; + } +- BUG_ON(euid != daemon->euid); +- BUG_ON(current_user_ns() != daemon->user_ns); +- BUG_ON(task_pid(current) != daemon->pid); + msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue, + struct ecryptfs_msg_ctx, daemon_out_list); + BUG_ON(!msg_ctx); diff --git a/3.3.8/fs-ecryptfs-properly-check-for-o_rdonly-flag-before-doing-privileged-open.patch b/3.3.8/fs-ecryptfs-properly-check-for-o_rdonly-flag-before-doing-privileged-open.patch new file mode 100644 index 0000000..c2a913b --- /dev/null +++ b/3.3.8/fs-ecryptfs-properly-check-for-o_rdonly-flag-before-doing-privileged-open.patch @@ -0,0 +1,42 @@ +From 9fe79d7600497ed8a95c3981cbe5b73ab98222f0 Mon Sep 17 00:00:00 2001 +From: Tyler Hicks +Date: Tue, 12 Jun 2012 11:17:01 -0700 +Subject: eCryptfs: Properly check for O_RDONLY flag before doing privileged open + +From: Tyler Hicks + +commit 9fe79d7600497ed8a95c3981cbe5b73ab98222f0 upstream. 
+ +If the first attempt at opening the lower file read/write fails, +eCryptfs will retry using a privileged kthread. However, the privileged +retry should not happen if the lower file's inode is read-only because a +read/write open will still be unsuccessful. + +The check for determining if the open should be retried was intended to +be based on the access mode of the lower file's open flags being +O_RDONLY, but the check was incorrectly performed. This would cause the +open to be retried by the privileged kthread, resulting in a second +failed open of the lower file. This patch corrects the check to +determine if the open request should be handled by the privileged +kthread. + +Signed-off-by: Tyler Hicks +Reported-by: Dan Carpenter +Acked-by: Dan Carpenter +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ecryptfs/kthread.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ecryptfs/kthread.c ++++ b/fs/ecryptfs/kthread.c +@@ -149,7 +149,7 @@ int ecryptfs_privileged_open(struct file + (*lower_file) = dentry_open(lower_dentry, lower_mnt, flags, cred); + if (!IS_ERR(*lower_file)) + goto out; +- if (flags & O_RDONLY) { ++ if ((flags & O_ACCMODE) == O_RDONLY) { + rc = PTR_ERR((*lower_file)); + goto out; + } diff --git a/3.3.8/fs-epoll-clear-the-tfile_check_list-on-ELOOP_CVE-2012-3375.patch b/3.3.8/fs-epoll-clear-the-tfile_check_list-on-ELOOP_CVE-2012-3375.patch new file mode 100644 index 0000000..6979427 --- /dev/null +++ b/3.3.8/fs-epoll-clear-the-tfile_check_list-on-ELOOP_CVE-2012-3375.patch @@ -0,0 +1,35 @@ +commit 13d518074a952d33d47c428419693f63389547e9 +Author: Jason Baron +Date: Wed Apr 25 16:01:47 2012 -0700 + + epoll: clear the tfile_check_list on -ELOOP + + An epoll_ctl(,EPOLL_CTL_ADD,,) operation can return '-ELOOP' to prevent + circular epoll dependencies from being created. However, in that case we + do not properly clear the 'tfile_check_list'. Thus, add a call to + clear_tfile_check_list() for the -ELOOP case. + + Signed-off-by: Jason Baron + Reported-by: Yurij M. Plotnikov + Cc: Nelson Elhage + Cc: Davide Libenzi + Tested-by: Alexandra N. Kossovsky + Signed-off-by: Andrew Morton + Signed-off-by: Linus Torvalds + +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index 739b098..c0b3c70 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -1663,8 +1663,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + if (op == EPOLL_CTL_ADD) { + if (is_file_epoll(tfile)) { + error = -ELOOP; +- if (ep_loop_check(ep, tfile) != 0) ++ if (ep_loop_check(ep, tfile) != 0) { ++ clear_tfile_check_list(); + goto error_tgt_fput; ++ } + } else + list_add(&tfile->f_tfile_llink, &tfile_check_list); + } diff --git a/3.3.8/fs-ext4-fix-duplicated-mnt_drop_write-call-in-ext4_ioc_move_ext.patch b/3.3.8/fs-ext4-fix-duplicated-mnt_drop_write-call-in-ext4_ioc_move_ext.patch new file mode 100644 index 0000000..b2b2714 --- /dev/null +++ b/3.3.8/fs-ext4-fix-duplicated-mnt_drop_write-call-in-ext4_ioc_move_ext.patch @@ -0,0 +1,31 @@ +From 331ae4962b975246944ea039697a8f1cadce42bb Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Wed, 18 Jul 2012 09:31:36 +0100 +Subject: ext4: fix duplicated mnt_drop_write call in EXT4_IOC_MOVE_EXT + +From: Al Viro + +commit 331ae4962b975246944ea039697a8f1cadce42bb upstream. 
+ +Caused, AFAICS, by mismerge in commit ff9cb1c4eead ("Merge branch +'for_linus' into for_linus_merged") + +Signed-off-by: Al Viro +Cc: Theodore Ts'o +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/ioctl.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -261,7 +261,6 @@ group_extend_out: + err = ext4_move_extents(filp, donor_filp, me.orig_start, + me.donor_start, me.len, &me.moved_len); + mnt_drop_write_file(filp); +- mnt_drop_write(filp->f_path.mnt); + + if (copy_to_user((struct move_extent __user *)arg, + &me, sizeof(me))) diff --git a/3.3.8/fs-ext4-fix-the-free-blocks-calculation-for-ext3-file-systems-w-uninit_bg.patch b/3.3.8/fs-ext4-fix-the-free-blocks-calculation-for-ext3-file-systems-w-uninit_bg.patch new file mode 100644 index 0000000..5f7ab7c --- /dev/null +++ b/3.3.8/fs-ext4-fix-the-free-blocks-calculation-for-ext3-file-systems-w-uninit_bg.patch @@ -0,0 +1,73 @@ +From b0dd6b70f0fda17ae9762fbb72d98e40a4f66556 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Thu, 7 Jun 2012 18:56:06 -0400 +Subject: ext4: fix the free blocks calculation for ext3 file systems w/ uninit_bg + +From: Theodore Ts'o + +commit b0dd6b70f0fda17ae9762fbb72d98e40a4f66556 upstream. + +Ext3 filesystems that are converted to use as many ext4 file system +features as possible will enable uninit_bg to speed up e2fsck times. +These file systems will have a native ext3 layout of inode tables and +block allocation bitmaps (as opposed to ext4's flex_bg layout). +Unfortunately, in these cases, when first allocating a block in an +uninitialized block group, ext4 would incorrectly calculate the number +of free blocks in that block group, and then errorneously report that +the file system was corrupt: + +EXT4-fs error (device vdd): ext4_mb_generate_buddy:741: group 30, 32254 clusters in bitmap, 32258 in gd + +This problem can be reproduced via: + + mke2fs -q -t ext4 -O ^flex_bg /dev/vdd 5g + mount -t ext4 /dev/vdd /mnt + fallocate -l 4600m /mnt/test + +The problem was caused by a bone headed mistake in the check to see if a +particular metadata block was part of the block group. + +Many thanks to Kees Cook for finding and bisecting the buggy commit +which introduced this bug (commit fd034a84e1, present since v3.2). + +Reported-by: Sander Eikelenboom +Reported-by: Kees Cook +Signed-off-by: "Theodore Ts'o" +Tested-by: Kees Cook +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/balloc.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -90,8 +90,8 @@ unsigned ext4_num_overhead_clusters(stru + * unusual file system layouts. 
+ */ + if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { +- block_cluster = EXT4_B2C(sbi, (start - +- ext4_block_bitmap(sb, gdp))); ++ block_cluster = EXT4_B2C(sbi, ++ ext4_block_bitmap(sb, gdp) - start); + if (block_cluster < num_clusters) + block_cluster = -1; + else if (block_cluster == num_clusters) { +@@ -102,7 +102,7 @@ unsigned ext4_num_overhead_clusters(stru + + if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { + inode_cluster = EXT4_B2C(sbi, +- start - ext4_inode_bitmap(sb, gdp)); ++ ext4_inode_bitmap(sb, gdp) - start); + if (inode_cluster < num_clusters) + inode_cluster = -1; + else if (inode_cluster == num_clusters) { +@@ -114,7 +114,7 @@ unsigned ext4_num_overhead_clusters(stru + itbl_blk = ext4_inode_table(sb, gdp); + for (i = 0; i < sbi->s_itb_per_group; i++) { + if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { +- c = EXT4_B2C(sbi, start - itbl_blk + i); ++ c = EXT4_B2C(sbi, itbl_blk + i - start); + if ((c < num_clusters) || (c == inode_cluster) || + (c == block_cluster) || (c == itbl_cluster)) + continue; diff --git a/3.3.8/fs-remove-easily-user-triggerable-bug-from-generic_setlease.patch b/3.3.8/fs-remove-easily-user-triggerable-bug-from-generic_setlease.patch new file mode 100644 index 0000000..26a323e --- /dev/null +++ b/3.3.8/fs-remove-easily-user-triggerable-bug-from-generic_setlease.patch @@ -0,0 +1,39 @@ +From 8d657eb3b43861064d36241e88d9d61c709f33f0 Mon Sep 17 00:00:00 2001 +From: Dave Jones +Date: Fri, 13 Jul 2012 13:35:36 -0400 +Subject: Remove easily user-triggerable BUG from generic_setlease + +From: Dave Jones + +commit 8d657eb3b43861064d36241e88d9d61c709f33f0 upstream. + +This can be trivially triggered from userspace by passing in something unexpected. + + kernel BUG at fs/locks.c:1468! + invalid opcode: 0000 [#1] SMP + RIP: 0010:generic_setlease+0xc2/0x100 + Call Trace: + __vfs_setlease+0x35/0x40 + fcntl_setlease+0x76/0x150 + sys_fcntl+0x1c6/0x810 + system_call_fastpath+0x1a/0x1f + +Signed-off-by: Dave Jones +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/locks.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/locks.c ++++ b/fs/locks.c +@@ -1465,7 +1465,7 @@ int generic_setlease(struct file *filp, + case F_WRLCK: + return generic_add_lease(filp, arg, flp); + default: +- BUG(); ++ return -EINVAL; + } + } + EXPORT_SYMBOL(generic_setlease); diff --git a/3.3.8/fs-udf-avoid-run-away-loop-when-partition-table-length-is-corrupted_CVE-2012-3400.patch b/3.3.8/fs-udf-avoid-run-away-loop-when-partition-table-length-is-corrupted_CVE-2012-3400.patch new file mode 100644 index 0000000..9cff549 --- /dev/null +++ b/3.3.8/fs-udf-avoid-run-away-loop-when-partition-table-length-is-corrupted_CVE-2012-3400.patch @@ -0,0 +1,51 @@ +From adee11b2085bee90bd8f4f52123ffb07882d6256 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Wed, 27 Jun 2012 20:20:22 +0200 +Subject: udf: Avoid run away loop when partition table length is corrupted + +From: Jan Kara + +commit adee11b2085bee90bd8f4f52123ffb07882d6256 upstream. + +Check provided length of partition table so that (possibly maliciously) +corrupted partition table cannot cause accessing data beyond current buffer. 
+ +Signed-off-by: Jan Kara +Signed-off-by: Greg Kroah-Hartman + +--- + fs/udf/super.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/fs/udf/super.c ++++ b/fs/udf/super.c +@@ -1225,6 +1225,7 @@ static int udf_load_logicalvol(struct su + struct genericPartitionMap *gpm; + uint16_t ident; + struct buffer_head *bh; ++ unsigned int table_len; + int ret = 0; + + bh = udf_read_tagged(sb, block, block, &ident); +@@ -1232,13 +1233,20 @@ static int udf_load_logicalvol(struct su + return 1; + BUG_ON(ident != TAG_IDENT_LVD); + lvd = (struct logicalVolDesc *)bh->b_data; ++ table_len = le32_to_cpu(lvd->mapTableLength); ++ if (sizeof(*lvd) + table_len > sb->s_blocksize) { ++ udf_err(sb, "error loading logical volume descriptor: " ++ "Partition table too long (%u > %lu)\n", table_len, ++ sb->s_blocksize - sizeof(*lvd)); ++ goto out_bh; ++ } + + ret = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps)); + if (ret) + goto out_bh; + + for (i = 0, offset = 0; +- i < sbi->s_partitions && offset < le32_to_cpu(lvd->mapTableLength); ++ i < sbi->s_partitions && offset < table_len; + i++, offset += gpm->partitionMapLength) { + struct udf_part_map *map = &sbi->s_partmaps[i]; + gpm = (struct genericPartitionMap *) diff --git a/3.3.8/fs-udf-fortify-loading-of-sparing-table_CVE-2012-3400.patch b/3.3.8/fs-udf-fortify-loading-of-sparing-table_CVE-2012-3400.patch new file mode 100644 index 0000000..109f245 --- /dev/null +++ b/3.3.8/fs-udf-fortify-loading-of-sparing-table_CVE-2012-3400.patch @@ -0,0 +1,132 @@ +From 1df2ae31c724e57be9d7ac00d78db8a5dabdd050 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Wed, 27 Jun 2012 21:23:07 +0200 +Subject: udf: Fortify loading of sparing table + +From: Jan Kara + +commit 1df2ae31c724e57be9d7ac00d78db8a5dabdd050 upstream. + +Add sanity checks when loading sparing table from disk to avoid accessing +unallocated memory or writing to it. 
+ +Signed-off-by: Jan Kara +Signed-off-by: Greg Kroah-Hartman + +--- + fs/udf/super.c | 86 +++++++++++++++++++++++++++++++++++---------------------- + 1 file changed, 53 insertions(+), 33 deletions(-) + +--- a/fs/udf/super.c ++++ b/fs/udf/super.c +@@ -56,6 +56,7 @@ + #include + #include + #include ++#include + #include + + #include "udf_sb.h" +@@ -1215,11 +1216,59 @@ out_bh: + return ret; + } + ++static int udf_load_sparable_map(struct super_block *sb, ++ struct udf_part_map *map, ++ struct sparablePartitionMap *spm) ++{ ++ uint32_t loc; ++ uint16_t ident; ++ struct sparingTable *st; ++ struct udf_sparing_data *sdata = &map->s_type_specific.s_sparing; ++ int i; ++ struct buffer_head *bh; ++ ++ map->s_partition_type = UDF_SPARABLE_MAP15; ++ sdata->s_packet_len = le16_to_cpu(spm->packetLength); ++ if (!is_power_of_2(sdata->s_packet_len)) { ++ udf_err(sb, "error loading logical volume descriptor: " ++ "Invalid packet length %u\n", ++ (unsigned)sdata->s_packet_len); ++ return -EIO; ++ } ++ if (spm->numSparingTables > 4) { ++ udf_err(sb, "error loading logical volume descriptor: " ++ "Too many sparing tables (%d)\n", ++ (int)spm->numSparingTables); ++ return -EIO; ++ } ++ ++ for (i = 0; i < spm->numSparingTables; i++) { ++ loc = le32_to_cpu(spm->locSparingTable[i]); ++ bh = udf_read_tagged(sb, loc, loc, &ident); ++ if (!bh) ++ continue; ++ ++ st = (struct sparingTable *)bh->b_data; ++ if (ident != 0 || ++ strncmp(st->sparingIdent.ident, UDF_ID_SPARING, ++ strlen(UDF_ID_SPARING)) || ++ sizeof(*st) + le16_to_cpu(st->reallocationTableLen) > ++ sb->s_blocksize) { ++ brelse(bh); ++ continue; ++ } ++ ++ sdata->s_spar_map[i] = bh; ++ } ++ map->s_partition_func = udf_get_pblock_spar15; ++ return 0; ++} ++ + static int udf_load_logicalvol(struct super_block *sb, sector_t block, + struct kernel_lb_addr *fileset) + { + struct logicalVolDesc *lvd; +- int i, j, offset; ++ int i, offset; + uint8_t type; + struct udf_sb_info *sbi = UDF_SB(sb); + struct genericPartitionMap *gpm; +@@ -1281,38 +1330,9 @@ static int udf_load_logicalvol(struct su + } else if (!strncmp(upm2->partIdent.ident, + UDF_ID_SPARABLE, + strlen(UDF_ID_SPARABLE))) { +- uint32_t loc; +- struct sparingTable *st; +- struct sparablePartitionMap *spm = +- (struct sparablePartitionMap *)gpm; +- +- map->s_partition_type = UDF_SPARABLE_MAP15; +- map->s_type_specific.s_sparing.s_packet_len = +- le16_to_cpu(spm->packetLength); +- for (j = 0; j < spm->numSparingTables; j++) { +- struct buffer_head *bh2; +- +- loc = le32_to_cpu( +- spm->locSparingTable[j]); +- bh2 = udf_read_tagged(sb, loc, loc, +- &ident); +- map->s_type_specific.s_sparing. +- s_spar_map[j] = bh2; +- +- if (bh2 == NULL) +- continue; +- +- st = (struct sparingTable *)bh2->b_data; +- if (ident != 0 || strncmp( +- st->sparingIdent.ident, +- UDF_ID_SPARING, +- strlen(UDF_ID_SPARING))) { +- brelse(bh2); +- map->s_type_specific.s_sparing. 
+- s_spar_map[j] = NULL; +- } +- } +- map->s_partition_func = udf_get_pblock_spar15; ++ if (udf_load_sparable_map(sb, map, ++ (struct sparablePartitionMap *)gpm) < 0) ++ goto out_bh; + } else if (!strncmp(upm2->partIdent.ident, + UDF_ID_METADATA, + strlen(UDF_ID_METADATA))) { diff --git a/3.3.8/fs-udf-use-ret-instead-of-abusing-i-in-udf_load_logicalvol.patch b/3.3.8/fs-udf-use-ret-instead-of-abusing-i-in-udf_load_logicalvol.patch new file mode 100644 index 0000000..44e75ac --- /dev/null +++ b/3.3.8/fs-udf-use-ret-instead-of-abusing-i-in-udf_load_logicalvol.patch @@ -0,0 +1,32 @@ +From cb14d340ef1737c24125dd663eff77734a482d47 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Wed, 27 Jun 2012 20:08:44 +0200 +Subject: udf: Use 'ret' instead of abusing 'i' in udf_load_logicalvol() + +From: Jan Kara + +commit cb14d340ef1737c24125dd663eff77734a482d47 upstream. + +Signed-off-by: Jan Kara +Signed-off-by: Greg Kroah-Hartman + +--- + fs/udf/super.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +--- a/fs/udf/super.c ++++ b/fs/udf/super.c +@@ -1233,11 +1233,9 @@ static int udf_load_logicalvol(struct su + BUG_ON(ident != TAG_IDENT_LVD); + lvd = (struct logicalVolDesc *)bh->b_data; + +- i = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps)); +- if (i != 0) { +- ret = i; ++ ret = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps)); ++ if (ret) + goto out_bh; +- } + + for (i = 0, offset = 0; + i < sbi->s_partitions && offset < le32_to_cpu(lvd->mapTableLength); diff --git a/3.3.8/hz-432-kconfig-option.patch b/3.3.8/hz-432-kconfig-option.patch new file mode 100644 index 0000000..2fe9a4f --- /dev/null +++ b/3.3.8/hz-432-kconfig-option.patch @@ -0,0 +1,25 @@ +diff -urN oldtree/kernel/Kconfig.hz newtree/kernel/Kconfig.hz +--- oldtree/kernel/Kconfig.hz 2007-03-06 15:00:55.000000000 -0500 ++++ newtree/kernel/Kconfig.hz 2007-03-06 17:52:36.000000000 -0500 +@@ -39,6 +39,14 @@ + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_432 ++ bool "432 HZ" ++ help ++ 432 HZ is the best value for desktop systems. Most responsive ++ out of all the options. This is for Dual Core/Processor systems only. ++ as timer frequencies * number of processors = actual frequency. ++ Try this if you have a dual-core/dual processor system. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,5 +60,6 @@ + default 100 if HZ_100 + default 250 if HZ_250_NODEFAULT + default 300 if HZ_300 ++ default 432 if HZ_432 + default 1000 if HZ_1000 + diff --git a/3.3.8/hz-864-kconfig-option.patch b/3.3.8/hz-864-kconfig-option.patch new file mode 100644 index 0000000..6bdca04 --- /dev/null +++ b/3.3.8/hz-864-kconfig-option.patch @@ -0,0 +1,25 @@ +diff -urN oldtree/kernel/Kconfig.hz newtree/kernel/Kconfig.hz +--- oldtree/kernel/Kconfig.hz 2007-03-06 15:00:55.000000000 -0500 ++++ newtree/kernel/Kconfig.hz 2007-03-06 17:52:36.000000000 -0500 +@@ -39,6 +39,14 @@ + as timer frequencies * number of processors = actual frequency. + Try this if you have a dual-core/dual processor system. + ++ config HZ_864 ++ bool "864 HZ" ++ help ++ 864 HZ is the best value for desktop systems. Most responsive ++ out of all the options. The only reason it is not default is ++ because it may break few drivers. Give it a try if you have ++ a desktop :). 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,5 +60,6 @@ + default 250 if HZ_250_NODEFAULT + default 300 if HZ_300 + default 432 if HZ_432 ++ default 864 if HZ_864 + default 1000 if HZ_1000 + diff --git a/3.3.8/imqmq-3.3.patch b/3.3.8/imqmq-3.3.patch new file mode 100644 index 0000000..a8f4c58 --- /dev/null +++ b/3.3.8/imqmq-3.3.patch @@ -0,0 +1,1613 @@ +diff -uNr linux-3.3/drivers/net/imq.c linux-3.3-imqmq/drivers/net/imq.c +--- linux-3.3/drivers/net/imq.c 1970-01-01 02:00:00.000000000 +0200 ++++ linux-3.3-imqmq/drivers/net/imq.c 2012-03-19 16:53:04.127494306 +0200 +@@ -0,0 +1,857 @@ ++/* ++ * Pseudo-driver for the intermediate queue device. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ * ++ * Authors: Patrick McHardy, ++ * ++ * The first version was written by Martin Devera, ++ * ++ * Credits: Jan Rafaj ++ * - Update patch to 2.4.21 ++ * Sebastian Strollo ++ * - Fix "Dead-loop on netdevice imq"-issue ++ * Marcel Sebek ++ * - Update to 2.6.2-rc1 ++ * ++ * After some time of inactivity there is a group taking care ++ * of IMQ again: http://www.linuximq.net ++ * ++ * ++ * 2004/06/30 - New version of IMQ patch to kernels <=2.6.7 ++ * including the following changes: ++ * ++ * - Correction of ipv6 support "+"s issue (Hasso Tepper) ++ * - Correction of imq_init_devs() issue that resulted in ++ * kernel OOPS unloading IMQ as module (Norbert Buchmuller) ++ * - Addition of functionality to choose number of IMQ devices ++ * during kernel config (Andre Correa) ++ * - Addition of functionality to choose how IMQ hooks on ++ * PRE and POSTROUTING (after or before NAT) (Andre Correa) ++ * - Cosmetic corrections (Norbert Buchmuller) (Andre Correa) ++ * ++ * ++ * 2005/12/16 - IMQ versions between 2.6.7 and 2.6.13 were ++ * released with almost no problems. 2.6.14-x was released ++ * with some important changes: nfcache was removed; After ++ * some weeks of trouble we figured out that some IMQ fields ++ * in skb were missing in skbuff.c - skb_clone and copy_skb_header. ++ * These functions are correctly patched by this new patch version. ++ * ++ * Thanks for all who helped to figure out all the problems with ++ * 2.6.14.x: Patrick McHardy, Rune Kock, VeNoMouS, Max CtRiX, ++ * Kevin Shanahan, Richard Lucassen, Valery Dachev (hopefully ++ * I didn't forget anybody). I apologize again for my lack of time. ++ * ++ * ++ * 2008/06/17 - 2.6.25 - Changed imq.c to use qdisc_run() instead ++ * of qdisc_restart() and moved qdisc_run() to tasklet to avoid ++ * recursive locking. New initialization routines to fix 'rmmod' not ++ * working anymore. Used code from ifb.c. (Jussi Kivilinna) ++ * ++ * 2008/08/06 - 2.6.26 - (JK) ++ * - Replaced tasklet with 'netif_schedule()'. ++ * - Cleaned up and added comments for imq_nf_queue(). ++ * ++ * 2009/04/12 ++ * - Add skb_save_cb/skb_restore_cb helper functions for backuping ++ * control buffer. This is needed because qdisc-layer on kernels ++ * 2.6.27 and newer overwrite control buffer. (Jussi Kivilinna) ++ * - Add better locking for IMQ device. Hopefully this will solve ++ * SMP issues. 
(Jussi Kivilinna) ++ * - Port to 2.6.27 ++ * - Port to 2.6.28 ++ * - Port to 2.6.29 + fix rmmod not working ++ * ++ * 2009/04/20 - (Jussi Kivilinna) ++ * - Use netdevice feature flags to avoid extra packet handling ++ * by core networking layer and possibly increase performance. ++ * ++ * 2009/09/26 - (Jussi Kivilinna) ++ * - Add imq_nf_reinject_lockless to fix deadlock with ++ * imq_nf_queue/imq_nf_reinject. ++ * ++ * 2009/12/08 - (Jussi Kivilinna) ++ * - Port to 2.6.32 ++ * - Add check for skb->nf_queue_entry==NULL in imq_dev_xmit() ++ * - Also add better error checking for skb->nf_queue_entry usage ++ * ++ * 2010/02/25 - (Jussi Kivilinna) ++ * - Port to 2.6.33 ++ * ++ * 2010/08/15 - (Jussi Kivilinna) ++ * - Port to 2.6.35 ++ * - Simplify hook registration by using nf_register_hooks. ++ * - nf_reinject doesn't need spinlock around it, therefore remove ++ * imq_nf_reinject function. Other nf_reinject users protect ++ * their own data with spinlock. With IMQ however all data is ++ * needed is stored per skbuff, so no locking is needed. ++ * - Changed IMQ to use 'separate' NF_IMQ_QUEUE instead of ++ * NF_QUEUE, this allows working coexistance of IMQ and other ++ * NF_QUEUE users. ++ * - Make IMQ multi-queue. Number of IMQ device queues can be ++ * increased with 'numqueues' module parameters. Default number ++ * of queues is 1, in other words by default IMQ works as ++ * single-queue device. Multi-queue selection is based on ++ * IFB multi-queue patch by Changli Gao . ++ * ++ * 2011/03/18 - (Jussi Kivilinna) ++ * - Port to 2.6.38 ++ * ++ * 2011/07/12 - (syoder89@gmail.com) ++ * - Crash fix that happens when the receiving interface has more ++ * than one queue (add missing skb_set_queue_mapping in ++ * imq_select_queue). ++ * ++ * 2011/07/26 - (Jussi Kivilinna) ++ * - Add queue mapping checks for packets exiting IMQ. ++ * - Port to 3.0 ++ * ++ * 2011/08/16 - (Jussi Kivilinna) ++ * - Clear IFF_TX_SKB_SHARING flag that was added for linux 3.0.2 ++ * ++ * 2011/11/03 - Germano Michel ++ * - Fix IMQ for net namespaces ++ * ++ * 2011/11/04 - Jussi Kivilinna ++ * - Port to 3.1 ++ * - Clean-up, move 'get imq device pointer by imqX name' to ++ * separate function from imq_nf_queue(). ++ * ++ * 2012/01/05 - Jussi Kivilinna ++ * - Port to 3.2 ++ * ++ * 2012/03/19 - Jussi Kivilinna ++ * - Port to 3.3 ++ * ++ * Also, many thanks to pablo Sebastian Greco for making the initial ++ * patch and to those who helped the testing. 
++ * ++ * More info at: http://www.linuximq.net/ (Andre Correa) ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ #include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num); ++ ++static nf_hookfn imq_nf_hook; ++ ++static struct nf_hook_ops imq_ops[] = { ++ { ++ /* imq_ingress_ipv4 */ ++ .hook = imq_nf_hook, ++ .owner = THIS_MODULE, ++ .pf = PF_INET, ++ .hooknum = NF_INET_PRE_ROUTING, ++#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) ++ .priority = NF_IP_PRI_MANGLE + 1, ++#else ++ .priority = NF_IP_PRI_NAT_DST + 1, ++#endif ++ }, ++ { ++ /* imq_egress_ipv4 */ ++ .hook = imq_nf_hook, ++ .owner = THIS_MODULE, ++ .pf = PF_INET, ++ .hooknum = NF_INET_POST_ROUTING, ++#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA) ++ .priority = NF_IP_PRI_LAST, ++#else ++ .priority = NF_IP_PRI_NAT_SRC - 1, ++#endif ++ }, ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ { ++ /* imq_ingress_ipv6 */ ++ .hook = imq_nf_hook, ++ .owner = THIS_MODULE, ++ .pf = PF_INET6, ++ .hooknum = NF_INET_PRE_ROUTING, ++#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) ++ .priority = NF_IP6_PRI_MANGLE + 1, ++#else ++ .priority = NF_IP6_PRI_NAT_DST + 1, ++#endif ++ }, ++ { ++ /* imq_egress_ipv6 */ ++ .hook = imq_nf_hook, ++ .owner = THIS_MODULE, ++ .pf = PF_INET6, ++ .hooknum = NF_INET_POST_ROUTING, ++#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA) ++ .priority = NF_IP6_PRI_LAST, ++#else ++ .priority = NF_IP6_PRI_NAT_SRC - 1, ++#endif ++ }, ++#endif ++}; ++ ++#if defined(CONFIG_IMQ_NUM_DEVS) ++static int numdevs = CONFIG_IMQ_NUM_DEVS; ++#else ++static int numdevs = IMQ_MAX_DEVS; ++#endif ++ ++static struct net_device *imq_devs_cache[IMQ_MAX_DEVS]; ++ ++#define IMQ_MAX_QUEUES 32 ++static int numqueues = 1; ++static u32 imq_hashrnd; ++ ++static inline __be16 pppoe_proto(const struct sk_buff *skb) ++{ ++ return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + ++ sizeof(struct pppoe_hdr))); ++} ++ ++static u16 imq_hash(struct net_device *dev, struct sk_buff *skb) ++{ ++ unsigned int pull_len; ++ u16 protocol = skb->protocol; ++ u32 addr1, addr2; ++ u32 hash, ihl = 0; ++ union { ++ u16 in16[2]; ++ u32 in32; ++ } ports; ++ u8 ip_proto; ++ ++ pull_len = 0; ++ ++recheck: ++ switch (protocol) { ++ case htons(ETH_P_8021Q): { ++ if (unlikely(skb_pull(skb, VLAN_HLEN) == NULL)) ++ goto other; ++ ++ pull_len += VLAN_HLEN; ++ skb->network_header += VLAN_HLEN; ++ ++ protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; ++ goto recheck; ++ } ++ ++ case htons(ETH_P_PPP_SES): { ++ if (unlikely(skb_pull(skb, PPPOE_SES_HLEN) == NULL)) ++ goto other; ++ ++ pull_len += PPPOE_SES_HLEN; ++ skb->network_header += PPPOE_SES_HLEN; ++ ++ protocol = pppoe_proto(skb); ++ goto recheck; ++ } ++ ++ case htons(ETH_P_IP): { ++ const struct iphdr *iph = ip_hdr(skb); ++ ++ if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr)))) ++ goto other; ++ ++ addr1 = iph->daddr; ++ addr2 = iph->saddr; ++ ++ ip_proto = !(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) ? 
++ iph->protocol : 0; ++ ihl = ip_hdrlen(skb); ++ ++ break; ++ } ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ case htons(ETH_P_IPV6): { ++ const struct ipv6hdr *iph = ipv6_hdr(skb); ++ __be16 fo = 0; ++ ++ if (unlikely(!pskb_may_pull(skb, sizeof(struct ipv6hdr)))) ++ goto other; ++ ++ addr1 = iph->daddr.s6_addr32[3]; ++ addr2 = iph->saddr.s6_addr32[3]; ++ ihl = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &ip_proto, &fo); ++ if (unlikely(ihl < 0)) ++ goto other; ++ ++ break; ++ } ++#endif ++ default: ++other: ++ if (pull_len != 0) { ++ skb_push(skb, pull_len); ++ skb->network_header -= pull_len; ++ } ++ ++ return (u16)(ntohs(protocol) % dev->real_num_tx_queues); ++ } ++ ++ if (addr1 > addr2) ++ swap(addr1, addr2); ++ ++ switch (ip_proto) { ++ case IPPROTO_TCP: ++ case IPPROTO_UDP: ++ case IPPROTO_DCCP: ++ case IPPROTO_ESP: ++ case IPPROTO_AH: ++ case IPPROTO_SCTP: ++ case IPPROTO_UDPLITE: { ++ if (likely(skb_copy_bits(skb, ihl, &ports.in32, 4) >= 0)) { ++ if (ports.in16[0] > ports.in16[1]) ++ swap(ports.in16[0], ports.in16[1]); ++ break; ++ } ++ /* fall-through */ ++ } ++ default: ++ ports.in32 = 0; ++ break; ++ } ++ ++ if (pull_len != 0) { ++ skb_push(skb, pull_len); ++ skb->network_header -= pull_len; ++ } ++ ++ hash = jhash_3words(addr1, addr2, ports.in32, imq_hashrnd ^ ip_proto); ++ ++ return (u16)(((u64)hash * dev->real_num_tx_queues) >> 32); ++} ++ ++static inline bool sk_tx_queue_recorded(struct sock *sk) ++{ ++ return (sk_tx_queue_get(sk) >= 0); ++} ++ ++static struct netdev_queue *imq_select_queue(struct net_device *dev, ++ struct sk_buff *skb) ++{ ++ u16 queue_index = 0; ++ u32 hash; ++ ++ if (likely(dev->real_num_tx_queues == 1)) ++ goto out; ++ ++ /* IMQ can be receiving ingress or engress packets. */ ++ ++ /* Check first for if rx_queue is set */ ++ if (skb_rx_queue_recorded(skb)) { ++ queue_index = skb_get_rx_queue(skb); ++ goto out; ++ } ++ ++ /* Check if socket has tx_queue set */ ++ if (sk_tx_queue_recorded(skb->sk)) { ++ queue_index = sk_tx_queue_get(skb->sk); ++ goto out; ++ } ++ ++ /* Try use socket hash */ ++ if (skb->sk && skb->sk->sk_hash) { ++ hash = skb->sk->sk_hash; ++ queue_index = ++ (u16)(((u64)hash * dev->real_num_tx_queues) >> 32); ++ goto out; ++ } ++ ++ /* Generate hash from packet data */ ++ queue_index = imq_hash(dev, skb); ++ ++out: ++ if (unlikely(queue_index >= dev->real_num_tx_queues)) ++ queue_index = (u16)((u32)queue_index % dev->real_num_tx_queues); ++ ++ skb_set_queue_mapping(skb, queue_index); ++ return netdev_get_tx_queue(dev, queue_index); ++} ++ ++static struct net_device_stats *imq_get_stats(struct net_device *dev) ++{ ++ return &dev->stats; ++} ++ ++/* called for packets kfree'd in qdiscs at places other than enqueue */ ++static void imq_skb_destructor(struct sk_buff *skb) ++{ ++ struct nf_queue_entry *entry = skb->nf_queue_entry; ++ ++ skb->nf_queue_entry = NULL; ++ ++ if (entry) { ++ nf_queue_entry_release_refs(entry); ++ kfree(entry); ++ } ++ ++ skb_restore_cb(skb); /* kfree backup */ ++} ++ ++static void imq_done_check_queue_mapping(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ unsigned int queue_index; ++ ++ /* Don't let queue_mapping be left too large after exiting IMQ */ ++ if (likely(skb->dev != dev && skb->dev != NULL)) { ++ queue_index = skb_get_queue_mapping(skb); ++ if (unlikely(queue_index >= skb->dev->real_num_tx_queues)) { ++ queue_index = (u16)((u32)queue_index % ++ skb->dev->real_num_tx_queues); ++ skb_set_queue_mapping(skb, queue_index); ++ } ++ } else { ++ /* skb->dev was IMQ device itself or 
NULL, be on safe side and ++ * just clear queue mapping. ++ */ ++ skb_set_queue_mapping(skb, 0); ++ } ++} ++ ++static netdev_tx_t imq_dev_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct nf_queue_entry *entry = skb->nf_queue_entry; ++ ++ skb->nf_queue_entry = NULL; ++ dev->trans_start = jiffies; ++ ++ dev->stats.tx_bytes += skb->len; ++ dev->stats.tx_packets++; ++ ++ if (unlikely(entry == NULL)) { ++ /* We don't know what is going on here.. packet is queued for ++ * imq device, but (probably) not by us. ++ * ++ * If this packet was not send here by imq_nf_queue(), then ++ * skb_save_cb() was not used and skb_free() should not show: ++ * WARNING: IMQ: kfree_skb: skb->cb_next:.. ++ * and/or ++ * WARNING: IMQ: kfree_skb: skb->nf_queue_entry... ++ * ++ * However if this message is shown, then IMQ is somehow broken ++ * and you should report this to linuximq.net. ++ */ ++ ++ /* imq_dev_xmit is black hole that eats all packets, report that ++ * we eat this packet happily and increase dropped counters. ++ */ ++ ++ dev->stats.tx_dropped++; ++ dev_kfree_skb(skb); ++ ++ return NETDEV_TX_OK; ++ } ++ ++ skb_restore_cb(skb); /* restore skb->cb */ ++ ++ skb->imq_flags = 0; ++ skb->destructor = NULL; ++ ++ imq_done_check_queue_mapping(skb, dev); ++ ++ nf_reinject(entry, NF_ACCEPT); ++ ++ return NETDEV_TX_OK; ++} ++ ++static struct net_device *get_imq_device_by_index(int index) ++{ ++ struct net_device *dev = NULL; ++ struct net *net; ++ char buf[8]; ++ ++ /* get device by name and cache result */ ++ snprintf(buf, sizeof(buf), "imq%d", index); ++ ++ /* Search device from all namespaces. */ ++ for_each_net(net) { ++ dev = dev_get_by_name(net, buf); ++ if (dev) ++ break; ++ } ++ ++ if (WARN_ON_ONCE(dev == NULL)) { ++ /* IMQ device not found. Exotic config? */ ++ return ERR_PTR(-ENODEV); ++ } ++ ++ imq_devs_cache[index] = dev; ++ dev_put(dev); ++ ++ return dev; ++} ++ ++static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num) ++{ ++ struct net_device *dev; ++ struct sk_buff *skb_orig, *skb, *skb_shared; ++ struct Qdisc *q; ++ struct netdev_queue *txq; ++ spinlock_t *root_lock; ++ int users, index; ++ int retval = -EINVAL; ++ unsigned int orig_queue_index; ++ ++ index = entry->skb->imq_flags & IMQ_F_IFMASK; ++ if (unlikely(index > numdevs - 1)) { ++ if (net_ratelimit()) ++ printk(KERN_WARNING ++ "IMQ: invalid device specified, highest is %u\n", ++ numdevs - 1); ++ retval = -EINVAL; ++ goto out; ++ } ++ ++ /* check for imq device by index from cache */ ++ dev = imq_devs_cache[index]; ++ if (unlikely(!dev)) { ++ dev = get_imq_device_by_index(index); ++ if (IS_ERR(dev)) { ++ retval = PTR_ERR(dev); ++ goto out; ++ } ++ } ++ ++ if (unlikely(!(dev->flags & IFF_UP))) { ++ entry->skb->imq_flags = 0; ++ nf_reinject(entry, NF_ACCEPT); ++ retval = 0; ++ goto out; ++ } ++ dev->last_rx = jiffies; ++ ++ skb = entry->skb; ++ skb_orig = NULL; ++ ++ /* skb has owner? => make clone */ ++ if (unlikely(skb->destructor)) { ++ skb_orig = skb; ++ skb = skb_clone(skb, GFP_ATOMIC); ++ if (unlikely(!skb)) { ++ retval = -ENOMEM; ++ goto out; ++ } ++ entry->skb = skb; ++ } ++ ++ skb->nf_queue_entry = entry; ++ ++ dev->stats.rx_bytes += skb->len; ++ dev->stats.rx_packets++; ++ ++ if (!skb->dev) { ++ /* skb->dev == NULL causes problems, try the find cause. 
*/ ++ if (net_ratelimit()) { ++ dev_warn(&dev->dev, ++ "received packet with skb->dev == NULL\n"); ++ dump_stack(); ++ } ++ ++ skb->dev = dev; ++ } ++ ++ /* Disables softirqs for lock below */ ++ rcu_read_lock_bh(); ++ ++ /* Multi-queue selection */ ++ orig_queue_index = skb_get_queue_mapping(skb); ++ txq = imq_select_queue(dev, skb); ++ ++ q = rcu_dereference(txq->qdisc); ++ if (unlikely(!q->enqueue)) ++ goto packet_not_eaten_by_imq_dev; ++ ++ root_lock = qdisc_lock(q); ++ spin_lock(root_lock); ++ ++ users = atomic_read(&skb->users); ++ ++ skb_shared = skb_get(skb); /* increase reference count by one */ ++ skb_save_cb(skb_shared); /* backup skb->cb, as qdisc layer will ++ overwrite it */ ++ qdisc_enqueue_root(skb_shared, q); /* might kfree_skb */ ++ ++ if (likely(atomic_read(&skb_shared->users) == users + 1)) { ++ kfree_skb(skb_shared); /* decrease reference count by one */ ++ ++ skb->destructor = &imq_skb_destructor; ++ ++ /* cloned? */ ++ if (unlikely(skb_orig)) ++ kfree_skb(skb_orig); /* free original */ ++ ++ spin_unlock(root_lock); ++ rcu_read_unlock_bh(); ++ ++ /* schedule qdisc dequeue */ ++ __netif_schedule(q); ++ ++ retval = 0; ++ goto out; ++ } else { ++ skb_restore_cb(skb_shared); /* restore skb->cb */ ++ skb->nf_queue_entry = NULL; ++ /* qdisc dropped packet and decreased skb reference count of ++ * skb, so we don't really want to and try refree as that would ++ * actually destroy the skb. */ ++ spin_unlock(root_lock); ++ goto packet_not_eaten_by_imq_dev; ++ } ++ ++packet_not_eaten_by_imq_dev: ++ skb_set_queue_mapping(skb, orig_queue_index); ++ rcu_read_unlock_bh(); ++ ++ /* cloned? restore original */ ++ if (unlikely(skb_orig)) { ++ kfree_skb(skb); ++ entry->skb = skb_orig; ++ } ++ retval = -1; ++out: ++ return retval; ++} ++ ++static unsigned int imq_nf_hook(unsigned int hook, struct sk_buff *pskb, ++ const struct net_device *indev, ++ const struct net_device *outdev, ++ int (*okfn)(struct sk_buff *)) ++{ ++ return (pskb->imq_flags & IMQ_F_ENQUEUE) ? NF_IMQ_QUEUE : NF_ACCEPT; ++} ++ ++static int imq_close(struct net_device *dev) ++{ ++ netif_stop_queue(dev); ++ return 0; ++} ++ ++static int imq_open(struct net_device *dev) ++{ ++ netif_start_queue(dev); ++ return 0; ++} ++ ++static const struct net_device_ops imq_netdev_ops = { ++ .ndo_open = imq_open, ++ .ndo_stop = imq_close, ++ .ndo_start_xmit = imq_dev_xmit, ++ .ndo_get_stats = imq_get_stats, ++}; ++ ++static void imq_setup(struct net_device *dev) ++{ ++ dev->netdev_ops = &imq_netdev_ops; ++ dev->type = ARPHRD_VOID; ++ dev->mtu = 16000; /* too small? */ ++ dev->tx_queue_len = 11000; /* too big? 
*/ ++ dev->flags = IFF_NOARP; ++ dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | ++ NETIF_F_GSO | NETIF_F_HW_CSUM | ++ NETIF_F_HIGHDMA; ++ dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | ++ IFF_TX_SKB_SHARING); ++} ++ ++static int imq_validate(struct nlattr *tb[], struct nlattr *data[]) ++{ ++ int ret = 0; ++ ++ if (tb[IFLA_ADDRESS]) { ++ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { ++ ret = -EINVAL; ++ goto end; ++ } ++ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { ++ ret = -EADDRNOTAVAIL; ++ goto end; ++ } ++ } ++ return 0; ++end: ++ printk(KERN_WARNING "IMQ: imq_validate failed (%d)\n", ret); ++ return ret; ++} ++ ++static struct rtnl_link_ops imq_link_ops __read_mostly = { ++ .kind = "imq", ++ .priv_size = 0, ++ .setup = imq_setup, ++ .validate = imq_validate, ++}; ++ ++static const struct nf_queue_handler imq_nfqh = { ++ .name = "imq", ++ .outfn = imq_nf_queue, ++}; ++ ++static int __init imq_init_hooks(void) ++{ ++ int ret; ++ ++ nf_register_queue_imq_handler(&imq_nfqh); ++ ++ ret = nf_register_hooks(imq_ops, ARRAY_SIZE(imq_ops)); ++ if (ret < 0) ++ nf_unregister_queue_imq_handler(); ++ ++ return ret; ++} ++ ++static int __init imq_init_one(int index) ++{ ++ struct net_device *dev; ++ int ret; ++ ++ dev = alloc_netdev_mq(0, "imq%d", imq_setup, numqueues); ++ if (!dev) ++ return -ENOMEM; ++ ++ ret = dev_alloc_name(dev, dev->name); ++ if (ret < 0) ++ goto fail; ++ ++ dev->rtnl_link_ops = &imq_link_ops; ++ ret = register_netdevice(dev); ++ if (ret < 0) ++ goto fail; ++ ++ return 0; ++fail: ++ free_netdev(dev); ++ return ret; ++} ++ ++static int __init imq_init_devs(void) ++{ ++ int err, i; ++ ++ if (numdevs < 1 || numdevs > IMQ_MAX_DEVS) { ++ printk(KERN_ERR "IMQ: numdevs has to be betweed 1 and %u\n", ++ IMQ_MAX_DEVS); ++ return -EINVAL; ++ } ++ ++ if (numqueues < 1 || numqueues > IMQ_MAX_QUEUES) { ++ printk(KERN_ERR "IMQ: numqueues has to be betweed 1 and %u\n", ++ IMQ_MAX_QUEUES); ++ return -EINVAL; ++ } ++ ++ get_random_bytes(&imq_hashrnd, sizeof(imq_hashrnd)); ++ ++ rtnl_lock(); ++ err = __rtnl_link_register(&imq_link_ops); ++ ++ for (i = 0; i < numdevs && !err; i++) ++ err = imq_init_one(i); ++ ++ if (err) { ++ __rtnl_link_unregister(&imq_link_ops); ++ memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); ++ } ++ rtnl_unlock(); ++ ++ return err; ++} ++ ++static int __init imq_init_module(void) ++{ ++ int err; ++ ++#if defined(CONFIG_IMQ_NUM_DEVS) ++ BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS > 16); ++ BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS < 2); ++ BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS - 1 > IMQ_F_IFMASK); ++#endif ++ ++ err = imq_init_devs(); ++ if (err) { ++ printk(KERN_ERR "IMQ: Error trying imq_init_devs(net)\n"); ++ return err; ++ } ++ ++ err = imq_init_hooks(); ++ if (err) { ++ printk(KERN_ERR "IMQ: Error trying imq_init_hooks()\n"); ++ rtnl_link_unregister(&imq_link_ops); ++ memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); ++ return err; ++ } ++ ++ printk(KERN_INFO "IMQ driver loaded successfully. 
" ++ "(numdevs = %d, numqueues = %d)\n", numdevs, numqueues); ++ ++#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) ++ printk(KERN_INFO "\tHooking IMQ before NAT on PREROUTING.\n"); ++#else ++ printk(KERN_INFO "\tHooking IMQ after NAT on PREROUTING.\n"); ++#endif ++#if defined(CONFIG_IMQ_BEHAVIOR_AB) || defined(CONFIG_IMQ_BEHAVIOR_BB) ++ printk(KERN_INFO "\tHooking IMQ before NAT on POSTROUTING.\n"); ++#else ++ printk(KERN_INFO "\tHooking IMQ after NAT on POSTROUTING.\n"); ++#endif ++ ++ return 0; ++} ++ ++static void __exit imq_unhook(void) ++{ ++ nf_unregister_hooks(imq_ops, ARRAY_SIZE(imq_ops)); ++ nf_unregister_queue_imq_handler(); ++} ++ ++static void __exit imq_cleanup_devs(void) ++{ ++ rtnl_link_unregister(&imq_link_ops); ++ memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); ++} ++ ++static void __exit imq_exit_module(void) ++{ ++ imq_unhook(); ++ imq_cleanup_devs(); ++ printk(KERN_INFO "IMQ driver unloaded successfully.\n"); ++} ++ ++module_init(imq_init_module); ++module_exit(imq_exit_module); ++ ++module_param(numdevs, int, 0); ++module_param(numqueues, int, 0); ++MODULE_PARM_DESC(numdevs, "number of IMQ devices (how many imq* devices will " ++ "be created)"); ++MODULE_PARM_DESC(numqueues, "number of queues per IMQ device"); ++MODULE_AUTHOR("http://www.linuximq.net"); ++MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. See " ++ "http://www.linuximq.net/ for more information."); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS_RTNL_LINK("imq"); ++ +diff -uNr linux-3.3/drivers/net/Kconfig linux-3.3-imqmq/drivers/net/Kconfig +--- linux-3.3/drivers/net/Kconfig 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3-imqmq/drivers/net/Kconfig 2012-03-19 09:46:57.656134747 +0200 +@@ -195,6 +195,125 @@ + depends on RIONET + default "128" + ++config IMQ ++ tristate "IMQ (intermediate queueing device) support" ++ depends on NETDEVICES && NETFILTER ++ ---help--- ++ The IMQ device(s) is used as placeholder for QoS queueing ++ disciplines. Every packet entering/leaving the IP stack can be ++ directed through the IMQ device where it's enqueued/dequeued to the ++ attached qdisc. This allows you to treat network devices as classes ++ and distribute bandwidth among them. Iptables is used to specify ++ through which IMQ device, if any, packets travel. ++ ++ More information at: http://www.linuximq.net/ ++ ++ To compile this driver as a module, choose M here: the module ++ will be called imq. If unsure, say N. ++ ++choice ++ prompt "IMQ behavior (PRE/POSTROUTING)" ++ depends on IMQ ++ default IMQ_BEHAVIOR_AB ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. ++ ++ IMQ can work in any of the following ways: ++ ++ PREROUTING | POSTROUTING ++ -----------------|------------------- ++ #1 After NAT | After NAT ++ #2 After NAT | Before NAT ++ #3 Before NAT | After NAT ++ #4 Before NAT | Before NAT ++ ++ The default behavior is to hook before NAT on PREROUTING ++ and after NAT on POSTROUTING (#3). ++ ++ This settings are specially usefull when trying to use IMQ ++ to shape NATed clients. ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++config IMQ_BEHAVIOR_AA ++ bool "IMQ AA" ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. 
++ ++ Choosing this option will make IMQ hook like this: ++ ++ PREROUTING: After NAT ++ POSTROUTING: After NAT ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++config IMQ_BEHAVIOR_AB ++ bool "IMQ AB" ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. ++ ++ Choosing this option will make IMQ hook like this: ++ ++ PREROUTING: After NAT ++ POSTROUTING: Before NAT ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++config IMQ_BEHAVIOR_BA ++ bool "IMQ BA" ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. ++ ++ Choosing this option will make IMQ hook like this: ++ ++ PREROUTING: Before NAT ++ POSTROUTING: After NAT ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++config IMQ_BEHAVIOR_BB ++ bool "IMQ BB" ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. ++ ++ Choosing this option will make IMQ hook like this: ++ ++ PREROUTING: Before NAT ++ POSTROUTING: Before NAT ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++endchoice ++ ++config IMQ_NUM_DEVS ++ int "Number of IMQ devices" ++ range 2 16 ++ depends on IMQ ++ default "16" ++ help ++ This setting defines how many IMQ devices will be created. ++ ++ The default value is 16. ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ + config TUN + tristate "Universal TUN/TAP device driver support" + select CRC32 +diff -uNr linux-3.3/drivers/net/Makefile linux-3.3-imqmq/drivers/net/Makefile +--- linux-3.3/drivers/net/Makefile 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3-imqmq/drivers/net/Makefile 2012-03-19 09:46:57.656134747 +0200 +@@ -9,6 +9,7 @@ + obj-$(CONFIG_DUMMY) += dummy.o + obj-$(CONFIG_EQUALIZER) += eql.o + obj-$(CONFIG_IFB) += ifb.o ++obj-$(CONFIG_IMQ) += imq.o + obj-$(CONFIG_MACVLAN) += macvlan.o + obj-$(CONFIG_MACVTAP) += macvtap.o + obj-$(CONFIG_MII) += mii.o +diff -uNr linux-3.3/include/linux/imq.h linux-3.3-imqmq/include/linux/imq.h +--- linux-3.3/include/linux/imq.h 1970-01-01 02:00:00.000000000 +0200 ++++ linux-3.3-imqmq/include/linux/imq.h 2012-03-19 09:46:57.656134747 +0200 +@@ -0,0 +1,13 @@ ++#ifndef _IMQ_H ++#define _IMQ_H ++ ++/* IFMASK (16 device indexes, 0 to 15) and flag(s) fit in 5 bits */ ++#define IMQ_F_BITS 5 ++ ++#define IMQ_F_IFMASK 0x0f ++#define IMQ_F_ENQUEUE 0x10 ++ ++#define IMQ_MAX_DEVS (IMQ_F_IFMASK + 1) ++ ++#endif /* _IMQ_H */ ++ +diff -uNr linux-3.3/include/linux/netfilter/xt_IMQ.h linux-3.3-imqmq/include/linux/netfilter/xt_IMQ.h +--- linux-3.3/include/linux/netfilter/xt_IMQ.h 1970-01-01 02:00:00.000000000 +0200 ++++ linux-3.3-imqmq/include/linux/netfilter/xt_IMQ.h 2012-03-19 09:46:57.656134747 +0200 +@@ -0,0 +1,9 @@ ++#ifndef _XT_IMQ_H ++#define _XT_IMQ_H ++ ++struct xt_imq_info { ++ unsigned int todev; /* target imq device */ ++}; ++ ++#endif /* _XT_IMQ_H */ ++ +diff -uNr linux-3.3/include/linux/netfilter.h linux-3.3-imqmq/include/linux/netfilter.h +--- linux-3.3/include/linux/netfilter.h 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3-imqmq/include/linux/netfilter.h 2012-03-19 09:46:57.656134747 +0200 +@@ -22,7 +22,8 @@ + #define NF_QUEUE 3 + #define NF_REPEAT 4 + #define NF_STOP 5 +-#define NF_MAX_VERDICT NF_STOP ++#define 
NF_IMQ_QUEUE 6 ++#define NF_MAX_VERDICT NF_IMQ_QUEUE + + /* we overload the higher bits for encoding auxiliary data such as the queue + * number or errno values. Not nice, but better than additional function +diff -uNr linux-3.3/include/linux/netfilter_ipv4/ipt_IMQ.h linux-3.3-imqmq/include/linux/netfilter_ipv4/ipt_IMQ.h +--- linux-3.3/include/linux/netfilter_ipv4/ipt_IMQ.h 1970-01-01 02:00:00.000000000 +0200 ++++ linux-3.3-imqmq/include/linux/netfilter_ipv4/ipt_IMQ.h 2012-03-19 09:46:57.656134747 +0200 +@@ -0,0 +1,10 @@ ++#ifndef _IPT_IMQ_H ++#define _IPT_IMQ_H ++ ++/* Backwards compatibility for old userspace */ ++#include ++ ++#define ipt_imq_info xt_imq_info ++ ++#endif /* _IPT_IMQ_H */ ++ +diff -uNr linux-3.3/include/linux/netfilter_ipv6/ip6t_IMQ.h linux-3.3-imqmq/include/linux/netfilter_ipv6/ip6t_IMQ.h +--- linux-3.3/include/linux/netfilter_ipv6/ip6t_IMQ.h 1970-01-01 02:00:00.000000000 +0200 ++++ linux-3.3-imqmq/include/linux/netfilter_ipv6/ip6t_IMQ.h 2012-03-19 09:46:57.656134747 +0200 +@@ -0,0 +1,10 @@ ++#ifndef _IP6T_IMQ_H ++#define _IP6T_IMQ_H ++ ++/* Backwards compatibility for old userspace */ ++#include ++ ++#define ip6t_imq_info xt_imq_info ++ ++#endif /* _IP6T_IMQ_H */ ++ +diff -uNr linux-3.3/include/linux/skbuff.h linux-3.3-imqmq/include/linux/skbuff.h +--- linux-3.3/include/linux/skbuff.h 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3-imqmq/include/linux/skbuff.h 2012-03-19 09:49:11.892204930 +0200 +@@ -31,6 +31,9 @@ + #include + #include + #include ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++#include ++#endif + + /* Don't change this without changing skb_csum_unnecessary! */ + #define CHECKSUM_NONE 0 +@@ -395,6 +398,9 @@ + * first. This is owned by whoever has the skb queued ATM. + */ + char cb[48] __aligned(8); ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ void *cb_next; ++#endif + + unsigned long _skb_refdst; + #ifdef CONFIG_XFRM +@@ -433,6 +439,9 @@ + #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED + struct sk_buff *nfct_reasm; + #endif ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ struct nf_queue_entry *nf_queue_entry; ++#endif + #ifdef CONFIG_BRIDGE_NETFILTER + struct nf_bridge_info *nf_bridge; + #endif +@@ -459,6 +468,10 @@ + /* 10/12 bit hole (depending on ndisc_nodetype presence) */ + kmemcheck_bitfield_end(flags2); + ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ __u8 imq_flags:IMQ_F_BITS; ++#endif ++ + #ifdef CONFIG_NET_DMA + dma_cookie_t dma_cookie; + #endif +@@ -545,6 +558,12 @@ + return (struct rtable *)skb_dst(skb); + } + ++ ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++extern int skb_save_cb(struct sk_buff *skb); ++extern int skb_restore_cb(struct sk_buff *skb); ++#endif ++ + extern void kfree_skb(struct sk_buff *skb); + extern void consume_skb(struct sk_buff *skb); + extern void __kfree_skb(struct sk_buff *skb); +@@ -2364,6 +2383,10 @@ + dst->nfct_reasm = src->nfct_reasm; + nf_conntrack_get_reasm(src->nfct_reasm); + #endif ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ dst->imq_flags = src->imq_flags; ++ dst->nf_queue_entry = src->nf_queue_entry; ++#endif + #ifdef CONFIG_BRIDGE_NETFILTER + dst->nf_bridge = src->nf_bridge; + nf_bridge_get(src->nf_bridge); +diff -uNr linux-3.3/include/net/netfilter/nf_queue.h linux-3.3-imqmq/include/net/netfilter/nf_queue.h +--- linux-3.3/include/net/netfilter/nf_queue.h 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3-imqmq/include/net/netfilter/nf_queue.h 2012-03-19 09:46:57.662801551 +0200 +@@ -30,5 +30,11 @@ + const struct nf_queue_handler *qh); + 
extern void nf_unregister_queue_handlers(const struct nf_queue_handler *qh); + extern void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); ++extern void nf_queue_entry_release_refs(struct nf_queue_entry *entry); ++ ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++extern void nf_register_queue_imq_handler(const struct nf_queue_handler *qh); ++extern void nf_unregister_queue_imq_handler(void); ++#endif + + #endif /* _NF_QUEUE_H */ +diff -uNr linux-3.3/net/core/dev.c linux-3.3-imqmq/net/core/dev.c +--- linux-3.3/net/core/dev.c 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3-imqmq/net/core/dev.c 2012-03-19 09:46:57.669468353 +0200 +@@ -98,6 +98,9 @@ + #include + #include + #include ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++#include ++#endif + #include + #include + #include +@@ -2207,7 +2210,12 @@ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ if (!list_empty(&ptype_all) && ++ !(skb->imq_flags & IMQ_F_ENQUEUE)) ++#else + if (!list_empty(&ptype_all)) ++#endif + dev_queue_xmit_nit(skb, dev); + + skb_orphan_try(skb); +diff -uNr linux-3.3/net/core/skbuff.c linux-3.3-imqmq/net/core/skbuff.c +--- linux-3.3/net/core/skbuff.c 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3-imqmq/net/core/skbuff.c 2012-03-19 09:52:12.300707734 +0200 +@@ -73,6 +73,9 @@ + + static struct kmem_cache *skbuff_head_cache __read_mostly; + static struct kmem_cache *skbuff_fclone_cache __read_mostly; ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++static struct kmem_cache *skbuff_cb_store_cache __read_mostly; ++#endif + + static void sock_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +@@ -92,6 +95,82 @@ + return 1; + } + ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++/* Control buffer save/restore for IMQ devices */ ++struct skb_cb_table { ++ char cb[48] __aligned(8); ++ void *cb_next; ++ atomic_t refcnt; ++}; ++ ++static DEFINE_SPINLOCK(skb_cb_store_lock); ++ ++int skb_save_cb(struct sk_buff *skb) ++{ ++ struct skb_cb_table *next; ++ ++ next = kmem_cache_alloc(skbuff_cb_store_cache, GFP_ATOMIC); ++ if (!next) ++ return -ENOMEM; ++ ++ BUILD_BUG_ON(sizeof(skb->cb) != sizeof(next->cb)); ++ ++ memcpy(next->cb, skb->cb, sizeof(skb->cb)); ++ next->cb_next = skb->cb_next; ++ ++ atomic_set(&next->refcnt, 1); ++ ++ skb->cb_next = next; ++ return 0; ++} ++EXPORT_SYMBOL(skb_save_cb); ++ ++int skb_restore_cb(struct sk_buff *skb) ++{ ++ struct skb_cb_table *next; ++ ++ if (!skb->cb_next) ++ return 0; ++ ++ next = skb->cb_next; ++ ++ BUILD_BUG_ON(sizeof(skb->cb) != sizeof(next->cb)); ++ ++ memcpy(skb->cb, next->cb, sizeof(skb->cb)); ++ skb->cb_next = next->cb_next; ++ ++ spin_lock(&skb_cb_store_lock); ++ ++ if (atomic_dec_and_test(&next->refcnt)) ++ kmem_cache_free(skbuff_cb_store_cache, next); ++ ++ spin_unlock(&skb_cb_store_lock); ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_restore_cb); ++ ++static void skb_copy_stored_cb(struct sk_buff *new, const struct sk_buff *__old) ++{ ++ struct skb_cb_table *next; ++ struct sk_buff *old; ++ ++ if (!__old->cb_next) { ++ new->cb_next = NULL; ++ return; ++ } ++ ++ spin_lock(&skb_cb_store_lock); ++ ++ old = (struct sk_buff *)__old; ++ ++ next = old->cb_next; ++ atomic_inc(&next->refcnt); ++ new->cb_next = next; ++ ++ spin_unlock(&skb_cb_store_lock); ++} ++#endif + + /* Pipe buffer operations for a socket. 
*/ + static const struct pipe_buf_operations sock_pipe_buf_ops = { +@@ -452,6 +531,29 @@ + WARN_ON(in_irq()); + skb->destructor(skb); + } ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ /* ++ * This should not happen. When it does, avoid memleak by restoring ++ * the chain of cb-backups. ++ */ ++ while (skb->cb_next != NULL) { ++ if (net_ratelimit()) ++ printk(KERN_WARNING "IMQ: kfree_skb: skb->cb_next: " ++ "%08x\n", (unsigned int)skb->cb_next); ++ ++ skb_restore_cb(skb); ++ } ++ /* ++ * This should not happen either, nf_queue_entry is nullified in ++ * imq_dev_xmit(). If we have non-NULL nf_queue_entry then we are ++ * leaking entry pointers, maybe memory. We don't know if this is ++ * pointer to already freed memory, or should this be freed. ++ * If this happens we need to add refcounting, etc for nf_queue_entry. ++ */ ++ if (skb->nf_queue_entry && net_ratelimit()) ++ printk(KERN_WARNING ++ "IMQ: kfree_skb: skb->nf_queue_entry != NULL"); ++#endif + #if IS_ENABLED(CONFIG_NF_CONNTRACK) + nf_conntrack_put(skb->nfct); + #endif +@@ -596,6 +698,9 @@ + new->sp = secpath_get(old->sp); + #endif + memcpy(new->cb, old->cb, sizeof(old->cb)); ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ skb_copy_stored_cb(new, old); ++#endif + new->csum = old->csum; + new->local_df = old->local_df; + new->pkt_type = old->pkt_type; +@@ -2956,6 +3061,13 @@ + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ skbuff_cb_store_cache = kmem_cache_create("skbuff_cb_store_cache", ++ sizeof(struct skb_cb_table), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++#endif + } + + /** +diff -uNr linux-3.3/net/ipv6/ip6_output.c linux-3.3-imqmq/net/ipv6/ip6_output.c +--- linux-3.3/net/ipv6/ip6_output.c 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3-imqmq/net/ipv6/ip6_output.c 2012-03-19 09:46:57.672801754 +0200 +@@ -102,9 +102,6 @@ + struct net_device *dev = dst->dev; + struct neighbour *neigh; + +- skb->protocol = htons(ETH_P_IPV6); +- skb->dev = dev; +- + if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + +@@ -170,6 +167,11 @@ + return 0; + } + ++ /* IMQ-patch: moved setting skb->dev and skb->protocol from ++ * ip6_finish_output2 to fix crashing at netif_skb_features(). 
*/ ++ skb->protocol = htons(ETH_P_IPV6); ++ skb->dev = dev; ++ + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev, + ip6_finish_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); +diff -uNr linux-3.3/net/netfilter/core.c linux-3.3-imqmq/net/netfilter/core.c +--- linux-3.3/net/netfilter/core.c 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3-imqmq/net/netfilter/core.c 2012-03-19 09:46:57.676135156 +0200 +@@ -190,9 +190,11 @@ + ret = NF_DROP_GETERR(verdict); + if (ret == 0) + ret = -EPERM; +- } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { ++ } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE || ++ (verdict & NF_VERDICT_MASK) == NF_IMQ_QUEUE) { + int err = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, +- verdict >> NF_VERDICT_QBITS); ++ verdict >> NF_VERDICT_QBITS, ++ verdict & NF_VERDICT_MASK); + if (err < 0) { + if (err == -ECANCELED) + goto next_hook; +diff -uNr linux-3.3/net/netfilter/Kconfig linux-3.3-imqmq/net/netfilter/Kconfig +--- linux-3.3/net/netfilter/Kconfig 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3-imqmq/net/netfilter/Kconfig 2012-03-19 09:46:57.676135156 +0200 +@@ -524,6 +524,18 @@ + For more information on the LEDs available on your system, see + Documentation/leds/leds-class.txt + ++config NETFILTER_XT_TARGET_IMQ ++ tristate '"IMQ" target support' ++ depends on NETFILTER_XTABLES ++ depends on IP_NF_MANGLE || IP6_NF_MANGLE ++ select IMQ ++ default m if NETFILTER_ADVANCED=n ++ help ++ This option adds a `IMQ' target which is used to specify if and ++ to which imq device packets should get enqueued/dequeued. ++ ++ To compile it as a module, choose M here. If unsure, say N. ++ + config NETFILTER_XT_TARGET_MARK + tristate '"MARK" target support' + depends on NETFILTER_ADVANCED +diff -uNr linux-3.3/net/netfilter/Makefile linux-3.3-imqmq/net/netfilter/Makefile +--- linux-3.3/net/netfilter/Makefile 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3-imqmq/net/netfilter/Makefile 2012-03-19 09:46:57.676135156 +0200 +@@ -57,6 +57,7 @@ + obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o + obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o + obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o ++obj-$(CONFIG_NETFILTER_XT_TARGET_IMQ) += xt_IMQ.o + obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o + obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o + obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o +diff -uNr linux-3.3/net/netfilter/nf_internals.h linux-3.3-imqmq/net/netfilter/nf_internals.h +--- linux-3.3/net/netfilter/nf_internals.h 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3-imqmq/net/netfilter/nf_internals.h 2012-03-19 09:46:57.676135156 +0200 +@@ -29,7 +29,7 @@ + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), +- unsigned int queuenum); ++ unsigned int queuenum, unsigned int queuetype); + extern int __init netfilter_queue_init(void); + + /* nf_log.c */ +diff -uNr linux-3.3/net/netfilter/nf_queue.c linux-3.3-imqmq/net/netfilter/nf_queue.c +--- linux-3.3/net/netfilter/nf_queue.c 2012-03-19 01:15:34.000000000 +0200 ++++ linux-3.3-imqmq/net/netfilter/nf_queue.c 2012-03-19 09:48:44.658316350 +0200 +@@ -22,6 +22,26 @@ + + static DEFINE_MUTEX(queue_handler_mutex); + ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++static const struct nf_queue_handler *queue_imq_handler; ++ ++void nf_register_queue_imq_handler(const struct nf_queue_handler *qh) ++{ ++ mutex_lock(&queue_handler_mutex); ++ rcu_assign_pointer(queue_imq_handler, qh); ++ mutex_unlock(&queue_handler_mutex); ++} 
++EXPORT_SYMBOL_GPL(nf_register_queue_imq_handler); ++ ++void nf_unregister_queue_imq_handler(void) ++{ ++ mutex_lock(&queue_handler_mutex); ++ rcu_assign_pointer(queue_imq_handler, NULL); ++ mutex_unlock(&queue_handler_mutex); ++} ++EXPORT_SYMBOL_GPL(nf_unregister_queue_imq_handler); ++#endif ++ + /* return EBUSY when somebody else is registered, return EEXIST if the + * same handler is registered, return 0 in case of success. */ + int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) +@@ -92,7 +112,7 @@ + } + EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); + +-static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) ++void nf_queue_entry_release_refs(struct nf_queue_entry *entry) + { + /* Release those devices we held, or Alexey will kill me. */ + if (entry->indev) +@@ -112,6 +132,7 @@ + /* Drop reference to owner of hook which queued us. */ + module_put(entry->elem->owner); + } ++EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); + + /* + * Any packet that leaves via this function must come back +@@ -123,7 +144,8 @@ + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), +- unsigned int queuenum) ++ unsigned int queuenum, ++ unsigned int queuetype) + { + int status = -ENOENT; + struct nf_queue_entry *entry = NULL; +@@ -137,7 +159,17 @@ + /* QUEUE == DROP if no one is waiting, to be safe. */ + rcu_read_lock(); + +- qh = rcu_dereference(queue_handler[pf]); ++ if (queuetype == NF_IMQ_QUEUE) { ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ qh = rcu_dereference(queue_imq_handler); ++#else ++ BUG(); ++ goto err_unlock; ++#endif ++ } else { ++ qh = rcu_dereference(queue_handler[pf]); ++ } ++ + if (!qh) { + status = -ESRCH; + goto err_unlock; +@@ -230,7 +262,8 @@ + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), +- unsigned int queuenum) ++ unsigned int queuenum, ++ unsigned int queuetype) + { + struct sk_buff *segs; + int err = -EINVAL; +@@ -238,7 +271,7 @@ + + if (!skb_is_gso(skb)) + return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn, +- queuenum); ++ queuenum, queuetype); + + switch (pf) { + case NFPROTO_IPV4: +@@ -266,7 +299,7 @@ + if (err == 0) { + nf_bridge_adjust_segmented_data(segs); + err = __nf_queue(segs, elem, pf, hook, indev, +- outdev, okfn, queuenum); ++ outdev, okfn, queuenum, queuetype); + } + if (err == 0) + queued++; +@@ -323,9 +356,11 @@ + local_bh_enable(); + break; + case NF_QUEUE: ++ case NF_IMQ_QUEUE: + err = __nf_queue(skb, elem, entry->pf, entry->hook, + entry->indev, entry->outdev, entry->okfn, +- verdict >> NF_VERDICT_QBITS); ++ verdict >> NF_VERDICT_QBITS, ++ verdict & NF_VERDICT_MASK); + if (err < 0) { + if (err == -ECANCELED) + goto next_hook; +diff -uNr linux-3.3/net/netfilter/xt_IMQ.c linux-3.3-imqmq/net/netfilter/xt_IMQ.c +--- linux-3.3/net/netfilter/xt_IMQ.c 1970-01-01 02:00:00.000000000 +0200 ++++ linux-3.3-imqmq/net/netfilter/xt_IMQ.c 2012-03-19 09:46:57.679468557 +0200 +@@ -0,0 +1,74 @@ ++/* ++ * This target marks packets to be enqueued to an imq device ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++static unsigned int imq_target(struct sk_buff *pskb, ++ const struct xt_action_param *par) ++{ ++ const struct xt_imq_info *mr = par->targinfo; ++ ++ pskb->imq_flags = (mr->todev & IMQ_F_IFMASK) | IMQ_F_ENQUEUE; ++ ++ return XT_CONTINUE; ++} ++ ++static int imq_checkentry(const struct xt_tgchk_param *par) ++{ ++ struct xt_imq_info *mr = par->targinfo; ++ ++ if (mr->todev > IMQ_MAX_DEVS - 1) { ++ printk(KERN_WARNING 
++ "IMQ: invalid device specified, highest is %u\n", ++ IMQ_MAX_DEVS - 1); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static struct xt_target xt_imq_reg[] __read_mostly = { ++ { ++ .name = "IMQ", ++ .family = AF_INET, ++ .checkentry = imq_checkentry, ++ .target = imq_target, ++ .targetsize = sizeof(struct xt_imq_info), ++ .table = "mangle", ++ .me = THIS_MODULE ++ }, ++ { ++ .name = "IMQ", ++ .family = AF_INET6, ++ .checkentry = imq_checkentry, ++ .target = imq_target, ++ .targetsize = sizeof(struct xt_imq_info), ++ .table = "mangle", ++ .me = THIS_MODULE ++ }, ++}; ++ ++static int __init imq_init(void) ++{ ++ return xt_register_targets(xt_imq_reg, ARRAY_SIZE(xt_imq_reg)); ++} ++ ++static void __exit imq_fini(void) ++{ ++ xt_unregister_targets(xt_imq_reg, ARRAY_SIZE(xt_imq_reg)); ++} ++ ++module_init(imq_init); ++module_exit(imq_fini); ++ ++MODULE_AUTHOR("http://www.linuximq.net"); ++MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. " ++ "See http://www.linuximq.net/ for more information."); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS("ipt_IMQ"); ++MODULE_ALIAS("ip6t_IMQ"); ++ diff --git a/3.3.8/kbuild-compress-kernel-modules-on-installation.patch b/3.3.8/kbuild-compress-kernel-modules-on-installation.patch new file mode 100644 index 0000000..cb3cc7f --- /dev/null +++ b/3.3.8/kbuild-compress-kernel-modules-on-installation.patch @@ -0,0 +1,137 @@ +================================ +Signed-off-by: Steve Brokenshire +[Rediffed for 2.6.31.3, defaulted to y and compress with -9 /Thomas] +Signed-off-by: Thomas Backlund + +diff -Nurp linux-2.6.31/Documentation/kbuild/modules.txt linux-2.6.31.compress/Documentation/kbuild/modules.txt +--- linux-2.6.31/Documentation/kbuild/modules.txt 2009-09-10 01:13:59.000000000 +0300 ++++ linux-2.6.31.compress/Documentation/kbuild/modules.txt 2009-10-09 14:17:49.335619817 +0300 +@@ -123,6 +123,13 @@ executed to make module versioning work. + Install the external module(s). The default location is + /lib/modules//extra/, but a prefix may + be added with INSTALL_MOD_PATH (discussed in section 5). ++ If MODULES_COMPRESS is set when the modules_install target is ++ run then the module is compressed after it has been ++ copied to /lib/modules/. Compressed modules ++ using the default gzip compression format will require ++ module-init-tools installed with --zlib-enabled. ++ Any options set in MODULE_COMPRESS_OPTIONS will be ++ passed to the selected compression format. + + clean + Remove all generated files in the module directory only. +diff -Nurp linux-2.6.31/init/Kconfig linux-2.6.31.compress/init/Kconfig +--- linux-2.6.31/init/Kconfig 2009-09-10 01:13:59.000000000 +0300 ++++ linux-2.6.31.compress/init/Kconfig 2009-10-09 14:19:01.812591181 +0300 +@@ -1161,6 +1161,64 @@ config MODULE_FORCE_UNLOAD + rmmod). This is mainly for kernel developers and desperate users. + If unsure, say N. + ++config MODULE_COMPRESS ++ bool "Compress kernel modules on installation" ++ depends on MODULES ++ default y ++ help ++ This option compresses the kernel modules when 'make ++ modules_install' is run. ++ ++ The modules will be compressed into the selected compression ++ format with gzip being the default compression format. ++ ++ When a kernel module is installed from outside of the main kernel ++ source and uses the Kbuild system for installing modules then that ++ kernel module will also be compressed when it is installed. ++ ++ When running mkinitrd you will find that an error message ++ appears saying that it cannot find a certain kernel module. 
++ As a workaround, unset CONFIG_MODULE_COMPRESS, build the modules ++ and install them, run mkinitrd and create the initrd image, place ++ the initrd image in the correct place for booting, set ++ CONFIG_MODULE_COMPRESS and then install the modules again. ++ ++ This option requires the module-init-tools package to be ++ configured with --enable-zlib (if using gzip which is the ++ default compression format). ++ ++ If unsure, say Y. ++ ++config MODULE_COMPRESS_OPTIONS ++ string "Compression format command line options" ++ depends on MODULE_COMPRESS ++ default "-9" ++ help ++ This option specifies the command line options to be used for ++ the selected compression format. ++ ++ Please refer to the selected compression format's documentation ++ on which options should be used. ++ ++ If unsure, leave this option blank. ++ ++choice ++ prompt "Kernel module compression format" ++ depends on MODULE_COMPRESS ++ default MODULE_COMPRESS_GZIP ++ ++config MODULE_COMPRESS_GZIP ++ bool "gzip compression" ++ help ++ Compresses the kernel modules using the gzip (GNU zip) ++ compression format. ++ ++ This option requires gzip to be installed. ++ ++ If unsure, leave this option selected. ++ ++endchoice ++ + config MODVERSIONS + bool "Module versioning support" + help +diff -Nurp linux-2.6.31/scripts/Makefile.modinst linux-2.6.31.compress/scripts/Makefile.modinst +--- linux-2.6.31/scripts/Makefile.modinst 2009-09-10 01:13:59.000000000 +0300 ++++ linux-2.6.31.compress/scripts/Makefile.modinst 2009-10-09 14:17:49.337619404 +0300 +@@ -5,6 +5,7 @@ + PHONY := __modinst + __modinst: + ++include include/config/auto.conf + include scripts/Kbuild.include + + # +@@ -16,8 +17,21 @@ PHONY += $(modules) + __modinst: $(modules) + @: + +-quiet_cmd_modules_install = INSTALL $@ +- cmd_modules_install = mkdir -p $(2); cp $@ $(2) ; $(mod_strip_cmd) $(2)/$(notdir $@) ++ifeq ($(CONFIG_MODULE_COMPRESS_OPTIONS), "") ++else ++ MODCOMPOPT = $(shell echo -n $(CONFIG_MODULE_COMPRESS_OPTIONS)) ++endif ++ ++quiet_cmd_modules_install = INSTALL $@ ++ cmd_modules_install = mkdir -p $(2); \ ++ cp $@ $(2) ; \ ++ $(mod_strip_cmd) $(2)/$(notdir $@) ++ ++quiet_cmd_modules_compress_gzip = COMPRESS $@ ++ cmd_modules_compress_gzip = gzip $(MODCOMPOPT) -c \ ++ $(2)/$(@F) \ ++ > $(2)/$(@F).gz; \ ++ rm $(2)/$(@F) + + # Modules built outside the kernel source tree go into extra by default + INSTALL_MOD_DIR ?= extra +@@ -26,8 +40,11 @@ ext-mod-dir = $(INSTALL_MOD_DIR)$(subst + modinst_dir = $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D)) + + $(modules): ++ + $(call cmd,modules_install,$(MODLIB)/$(modinst_dir)) + ++ $(if $(CONFIG_MODULE_COMPRESS_GZIP), \ ++ $(call cmd,modules_compress_gzip,$(MODLIB)/$(modinst_dir))) + + # Declare the contents of the .PHONY variable as phony. We keep that + # information in a variable se we can use it in if_changed and friends. diff --git a/3.3.8/kirkwood-jumbo-frame.patch b/3.3.8/kirkwood-jumbo-frame.patch new file mode 100644 index 0000000..fdbc5b1 --- /dev/null +++ b/3.3.8/kirkwood-jumbo-frame.patch @@ -0,0 +1,135 @@ +kirkwood and dove have a smaller FIFO than other "orion" SoCs. This +needs to be taken into account otherwise people using things like jumbo frames +will get into some troubles. + +As a side note, this patch is an updated version of a patch sent some years +ago: http://lists.infradead.org/pipermail/linux-arm-kernel/2010-June/017320.html +which seems to have been lost. 
+ +Signed-off-by: Arnaud Patard + +Index: alunn/arch/arm/mach-dove/common.c +=================================================================== +--- alunn.orig/arch/arm/mach-dove/common.c 2012-07-20 09:14:45.000000000 +0200 ++++ alunn/arch/arm/mach-dove/common.c 2012-07-20 17:51:38.872925518 +0200 +@@ -102,7 +102,7 @@ void __init dove_ehci1_init(void) + void __init dove_ge00_init(struct mv643xx_eth_platform_data *eth_data) + { + orion_ge00_init(eth_data, DOVE_GE00_PHYS_BASE, +- IRQ_DOVE_GE00_SUM, IRQ_DOVE_GE00_ERR); ++ IRQ_DOVE_GE00_SUM, IRQ_DOVE_GE00_ERR, 0); + } + + /***************************************************************************** +Index: alunn/arch/arm/mach-kirkwood/common.c +=================================================================== +--- alunn.orig/arch/arm/mach-kirkwood/common.c 2012-07-20 09:14:46.000000000 +0200 ++++ alunn/arch/arm/mach-kirkwood/common.c 2012-07-20 17:51:03.104927094 +0200 +@@ -301,7 +301,7 @@ void __init kirkwood_ge00_init(struct mv + { + orion_ge00_init(eth_data, + GE00_PHYS_BASE, IRQ_KIRKWOOD_GE00_SUM, +- IRQ_KIRKWOOD_GE00_ERR); ++ IRQ_KIRKWOOD_GE00_ERR, 1600); + /* The interface forgets the MAC address assigned by u-boot if + the clock is turned off, so claim the clk now. */ + clk_prepare_enable(ge0); +@@ -315,7 +315,7 @@ void __init kirkwood_ge01_init(struct mv + { + orion_ge01_init(eth_data, + GE01_PHYS_BASE, IRQ_KIRKWOOD_GE01_SUM, +- IRQ_KIRKWOOD_GE01_ERR); ++ IRQ_KIRKWOOD_GE01_ERR, 1600); + clk_prepare_enable(ge1); + } + +Index: alunn/arch/arm/mach-mv78xx0/common.c +=================================================================== +--- alunn.orig/arch/arm/mach-mv78xx0/common.c 2012-07-20 09:14:46.000000000 +0200 ++++ alunn/arch/arm/mach-mv78xx0/common.c 2012-07-20 17:50:26.712928695 +0200 +@@ -213,7 +213,7 @@ void __init mv78xx0_ge00_init(struct mv6 + { + orion_ge00_init(eth_data, + GE00_PHYS_BASE, IRQ_MV78XX0_GE00_SUM, +- IRQ_MV78XX0_GE_ERR); ++ IRQ_MV78XX0_GE_ERR, 0); + } + + +@@ -224,7 +224,7 @@ void __init mv78xx0_ge01_init(struct mv6 + { + orion_ge01_init(eth_data, + GE01_PHYS_BASE, IRQ_MV78XX0_GE01_SUM, +- NO_IRQ); ++ NO_IRQ, 0); + } + + +Index: alunn/arch/arm/mach-orion5x/common.c +=================================================================== +--- alunn.orig/arch/arm/mach-orion5x/common.c 2012-07-20 09:14:46.000000000 +0200 ++++ alunn/arch/arm/mach-orion5x/common.c 2012-07-20 17:50:26.744928692 +0200 +@@ -109,7 +109,7 @@ void __init orion5x_eth_init(struct mv64 + { + orion_ge00_init(eth_data, + ORION5X_ETH_PHYS_BASE, IRQ_ORION5X_ETH_SUM, +- IRQ_ORION5X_ETH_ERR); ++ IRQ_ORION5X_ETH_ERR, 0); + } + + +Index: alunn/arch/arm/plat-orion/common.c +=================================================================== +--- alunn.orig/arch/arm/plat-orion/common.c 2012-07-20 09:14:46.000000000 +0200 ++++ alunn/arch/arm/plat-orion/common.c 2012-07-20 17:50:26.756928690 +0200 +@@ -291,10 +291,12 @@ static struct platform_device orion_ge00 + void __init orion_ge00_init(struct mv643xx_eth_platform_data *eth_data, + unsigned long mapbase, + unsigned long irq, +- unsigned long irq_err) ++ unsigned long irq_err, ++ unsigned int tx_csum_limit) + { + fill_resources(&orion_ge00_shared, orion_ge00_shared_resources, + mapbase + 0x2000, SZ_16K - 1, irq_err); ++ orion_ge00_shared_data.tx_csum_limit = tx_csum_limit; + ge_complete(&orion_ge00_shared_data, + orion_ge00_resources, irq, &orion_ge00_shared, + eth_data, &orion_ge00); +@@ -343,10 +345,12 @@ static struct platform_device orion_ge01 + void __init orion_ge01_init(struct 
mv643xx_eth_platform_data *eth_data, + unsigned long mapbase, + unsigned long irq, +- unsigned long irq_err) ++ unsigned long irq_err, ++ unsigned int tx_csum_limit) + { + fill_resources(&orion_ge01_shared, orion_ge01_shared_resources, + mapbase + 0x2000, SZ_16K - 1, irq_err); ++ orion_ge01_shared_data.tx_csum_limit = tx_csum_limit; + ge_complete(&orion_ge01_shared_data, + orion_ge01_resources, irq, &orion_ge01_shared, + eth_data, &orion_ge01); +Index: alunn/arch/arm/plat-orion/include/plat/common.h +=================================================================== +--- alunn.orig/arch/arm/plat-orion/include/plat/common.h 2012-07-20 09:14:46.000000000 +0200 ++++ alunn/arch/arm/plat-orion/include/plat/common.h 2012-07-20 17:50:26.772928691 +0200 +@@ -39,12 +39,14 @@ void __init orion_rtc_init(unsigned long + void __init orion_ge00_init(struct mv643xx_eth_platform_data *eth_data, + unsigned long mapbase, + unsigned long irq, +- unsigned long irq_err); ++ unsigned long irq_err, ++ unsigned int tx_csum_limit); + + void __init orion_ge01_init(struct mv643xx_eth_platform_data *eth_data, + unsigned long mapbase, + unsigned long irq, +- unsigned long irq_err); ++ unsigned long irq_err, ++ unsigned int tx_csum_limit); + + void __init orion_ge10_init(struct mv643xx_eth_platform_data *eth_data, + unsigned long mapbase, + + diff --git a/3.3.8/linux-2.6-defaults-fat-utf8.patch b/3.3.8/linux-2.6-defaults-fat-utf8.patch new file mode 100644 index 0000000..0d40fd3 --- /dev/null +++ b/3.3.8/linux-2.6-defaults-fat-utf8.patch @@ -0,0 +1,15 @@ + +https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=181963 + +--- linux-2.6.15.noarch/fs/fat/inode.c~ 2006-02-20 23:20:12.000000000 -0500 ++++ linux-2.6.15.noarch/fs/fat/inode.c 2006-02-20 23:21:42.000000000 -0500 +@@ -952,7 +952,8 @@ static int parse_options(char *options, + opts->shortname = 0; + opts->name_check = 'n'; + opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0; +- opts->utf8 = opts->unicode_xlate = 0; ++ opts->utf8 = 1; ++ opts->unicode_xlate = 0; + opts->numtail = 1; + opts->nocase = 0; + *debug = 0; diff --git a/3.3.8/linux-2.6-x86-tune-generic.patch b/3.3.8/linux-2.6-x86-tune-generic.patch new file mode 100644 index 0000000..7a7c76e --- /dev/null +++ b/3.3.8/linux-2.6-x86-tune-generic.patch @@ -0,0 +1,13 @@ +* Optimise for today's CPUs. + +--- linux-2.6/arch/x86/Makefile_32.cpu 2006-01-09 11:39:04.000000000 -0500 ++++ linux-2.6/arch/x86/Makefile_32.cpu 2006-01-09 11:39:36.000000000 -0500 +@@ -15,7 +15,7 @@ cflags-$(CONFIG_M486) += -march=i486 + cflags-$(CONFIG_M586) += -march=i586 + cflags-$(CONFIG_M586TSC) += -march=i586 + cflags-$(CONFIG_M586MMX) += -march=pentium-mmx +-cflags-$(CONFIG_M686) += -march=i686 ++cflags-$(CONFIG_M686) += -march=i686 $(call tune,generic) + cflags-$(CONFIG_MPENTIUMII) += -march=i686 $(call tune,pentium2) + cflags-$(CONFIG_MPENTIUMIII) += -march=i686 $(call tune,pentium3) + cflags-$(CONFIG_MPENTIUMM) += -march=i686 $(call tune,pentium3) diff --git a/3.3.8/linux-3.4-e2c-0.4.59.patch b/3.3.8/linux-3.4-e2c-0.4.59.patch new file mode 100644 index 0000000..ac2ac0e --- /dev/null +++ b/3.3.8/linux-3.4-e2c-0.4.59.patch @@ -0,0 +1,7781 @@ +--- linux-3.4-rc5/fs/ext2/ChangeLog.e2compr-26port 1969-12-31 19:00:00.000000000 -0500 ++++ linux-3.4-rc5-e2c/fs/ext2/ChangeLog.e2compr-26port 2012-05-03 22:17:53.267994289 -0400 +@@ -0,0 +1,453 @@ ++ ++e2compr - Released under the GPL V 2 license. ++ ++ ++Installation: ++============= ++ ++1. gunzip: ++ > gunzip linux-3.1-rc3-e2c-0.4.59.patch.gz ++ ++2. 
change to you kernel directory ++ ++3. make clean: ++ > make clean ++ ++3. patch: ++ > patch -p1 < ../patch/to/patch/linux-3.4-e2c-0.4.59.patch ++ ++ see if any rejects occured: ++ > find | grep .rej ++ ++ WARNING: All rejects must be fixed manually! ++ ++4. config: ++ > make oldconfig ++ > make menuconfig ++ Now enable at least the ext2-compression feature: ++ Filesystems: ++ <*> Second extended fs support ++ [ ] Ext2 extended attributes ++ [ ] Ext2 execute in place support ++ [*] Ext2 file compression (DANGEROUS) ++ Ext2 file compression options ---> ++ ++5. make: ++ > make ++ ++ ++Building a patch: ++================= ++ ++files.txt: ++ ++fs/ext2/ChangeLog.e2compr-26port ++Documentation/filesystems/e2compress.txt ++fs/ext2/Readme.e2compr ++fs/Kconfig ++include/linux/ext2_fs_c.h ++fs/ext2/Makefile ++fs/ext2/compress.c ++fs/ext2/e2zlib.c ++fs/ext2/adler32.c ++fs/ext2/super.c ++fs/ext2/ialloc.c ++fs/ext2/balloc.c ++fs/ext2/inode.c ++fs/ext2/file.c ++fs/ext2/ioctl.c ++fs/ext2/ext2.h ++include/linux/ext2_fs.h ++fs/fcntl.c ++mm/truncate.c ++mm/swapfile.c ++mm/filemap.c ++mm/page_alloc.c ++ ++ ++cat files.txt | xargs -n1 -I '{}' diff -pruNbB linux-3.4/'{}' linux-3.4-e2c/'{}' > ./linux-3.1-e2c-0.4.59.patch ++ ++ ++Changelog: ++========== ++1 May 2012 ++ Matthias Winkler ++ * released version 0.4.59 for kernel 3.4 ++ * compress.c: ++ - ext2_get_cluster_pages() ++ Removed dead code for releasing cached pages using ++ page_cache_release() and pagevec_free(). ++ Releasing cached pages could not have worked since ++ porting from 2.6.22 to 2.6.25 (April 2008) ++ * mm/truncate.c: ++ - fixes broken files on non ext2 partitions ++ - moved conditional truncate from truncate_pagecache() ++ to vmtruncate() as suggested by Andreas Flick ++ * made patch even smaller ++ ++25 August 2011 ++ Matthias Winkler ++ * released version 0.4.58 for kernel 3.1 ++ * file.c: i_alloc_sem was removed. I am not sure if only holding i_mutex ++ will be enough. See http://patchwork.ozlabs.org/patch/101859/. ++ In ext2_file_write() I replaced: ++ ++ mutex_lock(&inode->i_mutex); ++ - down_read(&inode->i_alloc_sem); ++ + atomic_inc(&inode->i_dio_count); ++ ++ - up_read(&inode->i_alloc_sem); ++ + inode_dio_done(inode); ++ mutex_unlock(&inode->i_mutex); ++ ++ The main prupose of i_dio_count is blocking vmtruncate_range() ++ as long as the i_dio_count is greater than 0. In other words, ++ all direct io must be completed before truncating is allowed. ++ ++ * file.c: generic_osync_inode was removed from mm - added functionality to ++ file.c as ex_generic_osync_inode() ++ * file.c: changed: &inode_lock to &inode->i_lock ++ * ext2_warning() replaced by ext2_msg() ++ * compress.c: vfs_dq_init(inode) replaced by dquot_initialize(inode) ++ * compress.c: ext2_truncate(inode) replaced by ++ ext2_truncate_blocks(inode, inode->i_size) which looks like ++ exactly the same! ++ * inode.c: dentry->d_lock now seems to need ++ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED) held. ++ * compress.c, inode.c: added might_schedule() before wait_on_buffer() ++ statements to assure we are not atomic at this point. ++ * truncate.c: removed patch from memory.c and moved it to truncate.c ++ as surrounding kernel code also moved there. vmtruncate() was ++ split in truncate_setsize() and truncate_pagecache() with kernel 3.1 ++ ++ ++10 August 2009 ++ Matthias Winkler ++ * released version 0.4.58 ++ * merged assert.h and debug.h into ext2_fs_c.h ++ * merged NDEBUG into EXT2_COMPR_DEBUG ++ * disabled adler cheksums on "read" if not defined EXT2_COMPR_DEBUG. 
++ * merged none.c into compress.c ++ * inserted multiple defines "CONFIG_EXT2_COMPRESS" to allow disabling ++ of ext2compression with patched sources. ++ * re-inserted EXPORT_SYMBOL(__pagevec_free) to support ext2 as module ++ ++05 August 2009 ++ Matthias Winkler ++ * released version 0.4.57 ++ * ported to kernel 2.6.30: ++ inode.c: after fix of generic ext2 ext2_get_blocks() needed to remove bforget. ++ * integrated SMP from version 0.4.56 ++ * per CPU one separate read and one separate write working area ++ * removed all external compression codecs ++ * removed "verify compression" (never helped to find a bug anyway) ++ * Lindent'ed all source and header files ++ ++01 August 2008 ++ Matthias Winkler ++ * released version 0.4.55 ++ * complete code cleanup ++ * changed policy to ALWAYS_LOCKING pages in do_generic_mapping_read() ++ => completely removed PG_Compr-Flag now! ++ ++31 July 2008 ++ Matthias Winkler ++ * released version 0.4.54 ++ * fixes rare himem bug: only occures if page > cluster in inode.c/readpage() ++ * fixes rare readpage bug in mm/filemap.c/do_generic_mapping_read(): ++ PG_Compr flags dissallow reading a page while de/compressing. ++ Setting and unsetting it requires the page lock, with one exception ++ do_generic_mapping_read() in filemap.c. This is done because of performance ++ reasons. Anyway, a simultaneous call of do_generic_mapping_read() for the SAME ++ page might break the PG_Compr-Mimic. ++ ++ Solutions: Always lock any page before reading OR second(n-th) call of ++ do_generic_mapping_read() busy waits until first is done. ++ Default is busy wait now, ALWAYS_LOCK implemented as option via define. ++ ++25 June 2008 ++ Matthias Winkler ++ * released version 0.4.53 ++ * fixes himem bug: unmapped block in ext2_decompress_cluster() ++ * fixes bdev bug: ext2_get_block() must be called for every block ++ which cause ooops because of bdev == NULL. ext2_get_block() will ++ set the correct bdev and the correct blocknumber of the block. ++ ++ NEVER assign bdev manually, because the blocknumber might be random then: ++ "block->b_bdev = something" (DON'T!) ++ ++ ALWAYS use: ++ if (!buffer_mapped(block)) || (block->b_bdev == NULL) ++ ext2_get_block() ++ ++ Bdev bug is closely related to file holes (empty block in a file). ++ If compressed data will be written to a former hole, then ++ usually ext2_get_block() must be called with create. ++ ext2_get_block( , , , 1 /*create*/). ++ ++ * fixed missing include in xattr.h ++ * EXT2_COMPRBLK might be removed during compression if a cluster ++ doesn't compress. During compression we re-raise EXT2_COMPRBLK ++ flag after every cluster now. ++ * added missing export of __pagevec_free to (mm/page_alloc.c) ++ * deny O_DIRECT access mode after open of a file using fcntl() ++ (in fs/fcntl.c). ++ * file.c: ++ Replaced ext2_filew_write() to use kernels generic ++ do_sync_write(). Writing on compressed files calls ++ ext2_filew_write(): ++ - divide write range into clusters ++ - ext2_decompress_cluster (if needed) ++ - do_sync_write() ++ - ext2_compress_cluster (if needed) ++ * inode.c: ++ ext2_writepage()/ext2_writepages() usually writes back ++ dirty pages of an inode. They reside in the kernels page cache. ++ This pages might e.g. be written/dirtied by a mmap()-ped file. ++ Also generic_file_aio_write() uses ext2_writepage() finally. ++ I don't see how the ext2_writepage() would handle compressed ++ files, so I re-inserted and re-wrote this part of old 2.4 code. ++ Don't know if this code (USE_WRITEPAGE) is needed at all. 
++ So I leave it disabled by default. Enabled it might ++ leave compressed files with compression ratio of 100%. ++ Don't use yet! ++ ++17 April 2008 ++ Matthias Winkler ++ * first patch for kernel 2.6.25 released ++ ++20 March 2008 ++ Matthias Winkler ++ * version 0.4.52: EXT2_COMPRESS_WHEN_CLU didn't work. this ++ feature enables compression during file write. ++ ++15 Oct 2007 ++ Matthias Winkler ++ * First offical Sourceforge release as version 0.4.51 ++ * TODO: figure out what is necessary to enable swap ++ suppport for e2compr again (see mm/swapfile.c). ++ ++27 Sep 2007 ++ Matthias Winkler ++ * System stalled with a lot of I/O during de-compression of ++ USB-Sticks, too. I replaced mark_buffer_dirty ++ with set_buffer_dirty. This achieves that ONLY the buffers ++ and not the pages are marked. Then I write back the ++ buffers with ll_rw_block() at the end of ++ ext2_decompress_cluster() and ext2_decompress_pages(). ++ This should stop flooding the system with dirty pages. ++ Because now every routine waits for its newly dirtied buffers. ++ My system with 128MB of RAM is responding much more better during ++ compression/decompression now. Desompression also seems ++ to be a bit faster. ++ (this change is active with: #ifndef E2C_GENERIC_OSYNC) ++ ++25 Sep 2007 ++ Matthias Winkler ++ * System stalled with a lot of I/O during compression of ++ USB-Sticks. Seems generic_osync_inode() should not be ++ called in ext2_compress_cluster. Therefore I replaced ++ it with ll_rw_block() to write the modified blocks ++ directly back to disk. This gave also a ~100% better ++ performance for compression. ++ ++9 Sep 2007 ++ Matthias Winkler ++ * fixed bdev-bug. this bug appeared primarily when ++ files contained holes. A page with holes, which ++ was dirty caused ext2_get_cluster_blocks [ext2_get_block()] ++ to create ALL blocks of the page, even if there were holes! ++ These allocated hole-blocks weren't set to 0 anywhere and ++ therefore contained invalid data. I changed the ++ code to never allocate these holes. ++ ++ * ext2_truncate() added again to ext2_compress_cluster for ++ uncompressed clusters. Fixes filesize errors reported by ++ "e2fsck -f /dev/..." ++ ++24 Aug 2007 ++ Matthias Winkler ++ ++ Major changes: ++ * completly ported inode->i_mutex ++ ++ * clever CONFIG_GZ_HACK to reject "uncompressable" files ++ (according to their extension) early. The IOCTL in ioctl.c ++ which sets the compression on the file already rejects such ++ extensions now. ++ ++ * new create_empty_buffers_e2c() was necessary, because the ++ "extra"-pages should NOT have a valid i_mapping! Further the ++ buffers needed to be initalized right. ++ ++ * proper block initalization (bdev-bug) in: ++ - create_empty_buffers_e2c() ++ - ext2_get_cluster_blocks ++ ++ * in file.c copied: ++ ...with one single change at ext2_mapping_read in label page_ok: ++ A new Page-Flag (page-flags.h) the so called "PG_compr"-Flag is ++ checked to assure the corresponding page is not under ++ compression/decompression. This was necessary because ++ generic_mapping_read() doesn't lock() the page in ALL cases!!! ++ Otherwise the generic_mapping_read() would have to lock EVERY page ++ in the whole system before returning it.... ++ ++ * Fixed HiMem-Support: Balanced ALL kamp/kunmap calls. Unbalanced ++ functions cause the system to hang at "kmap_himem()" after some ++ time. Can be seen with magic-sysctrl "altgr + prtscr + W". ++ ++ * ext2_decompres_cluster() didn't mark uptodate pages for writeback. 
++ Don't know how this method could EVER have worked... ++ ++ * ext2_compress_cluster() caused an always increasing amount of dirty-pages ++ (cat /proc/vmstat) which couldn't be wrote back by sync/umount. ++ I think this was due the ClearPageDirty at the end of ext2_compress_cluster(). ++ ++ * introduced ext2_get_dcount() to savely determine if a file is really "open" ++ and to abort compression/decompression in such a case. ++ ++ * Removed gzip completely and not working assembler code. Replaced by the ++ kernels built-in zlib, which is pretty the same code... ++ ++ * New kernel configuration interface ++ ++ * Rollback of some unecessary "fixes"... ++ ++ TODO: ++ ++ * HiMem-Support: ++ One might try to use kmap_atomic instead of kamp in ext2_readpage. kmap_atomic ++ doesn't block and might speed up the regular page reading. might. ++ ++20 April 2007 ++ Andreas: ++ ++ * Replaced GZIP with zlib of the kernel because the assembly versions of existing ++ compression modules crashed. ++ ++ * Replaced gzip with the kernel zlib, which is built-in anyway ++ ++ * Initial HiMem-Support. ++ ++ ++06 Mar 2007 ++ ++ Terry Loveall ++ ++ * adapted linux-2.6.10-e2compr-0.4.45-alpha0126.diff to 2.6.18.5 kernel ++ ++ * replaced most instances of down/up(inode->i_sem) with ++ lock/unlock(inode->i_mutex). For exception see file.c, below. ++ ++ * made various printk regularizations to uniquely identify each printk ++ instance. Inserted missing KERN_DEBUG and KERN_WARNING. ++ ++ * compress.c: ++ bug fix: ext2_count_blocks: init head_bh for each iteration. ++ bug fix: ext2_count_blocks: add set clen=ulen for uncompressable clusters. ++ bug fix: ext2_compress_cluster: replacement and inlining of an ++ invalidate_inode_buffers function to keep root filesystem changes ++ uptodate on disk (prevents umounting root file system to update). ++ warning fix: ext2_compress_cluster: various variables initialized. ++ ext2_compress_cluster: removed #ifdef NDEBUG ++ bug fix: ext2_compress_cluster: defined maxclus, calculate and set for: ++ bug fix: ext2_compress_cluster: set filesize for uncompressed clusters. ++ ext2_cleanup_compressed_inode: changed error message to indicate 'Z' ++ flag was caused by trying to un/compress already open file. ++ bug fix: cp to compr dir: Truncate uncompressed files to their ++ uncompressed length, i.e. force kernel to update inode and sb ++ ++ * file.c: ++ removed file->f_error code since f_error no longer in file struct. ++ ext2_file_write: changed down/up i_sem to down_read/up_read i_alloc_sem ++ ++ * inode.c: ++ bug fix: ext2_get_block: restored changed: loop to bforget ++ ++ * ioctl.c: ++ ext2_ioctl: scrubbed 'B' flag on file uncompress. ++ ++ * match[56]86.S: ++ made code dependent on #ifdef CONFIG_REGPARM to compile with either ++ register variable or stack variable parameter passing. ++ ++28 Feb 2005 ++ ++ Yabo Ding , ++ ++ * Corrected page unlocking in inode.c. ++ ++19 Feb 2005 ++ ++ Paul Whittaker ++ ++ * Added corrections le32_to_cpu in critical areas of compress.c ++ * Optimized function exit code in inode.c. ++ ++24 Aug 2004 ++Yabo Ding , ++ ++ compress.c ++* ext2_decompress_pages() ++ The old code cannot reread data from disk to a changed buffers data pointer in 2.6.x. ++ So, I copy memory data(decompressed) to a temporary buffer; ++ Then reread data(compressed) from disk, and copy to head; ++ Then copy back the memory data from temporary buffer. ++ It seems clumsy, but it works well. ++* ext2_compress_cluster() ++ Force write to disk. 
++
++ inode.c
++* ext2_writepage()
++ Delete old code. All directly call the block_write_full_page() function.
++
++* ../Kconfig
++ Change e2compr config as a submenu config
++
++04 Aug 2004
++
++Paul Whittaker
++
++* compress.c: replaced mark_buffer_dirty(x,y) with mark_buffer_dirty(x). I'm
++ still not at all sure that this is sufficient.
++
++03 Aug 2004
++
++Paul Whittaker
++
++* ../../include/linux/ext2_fs_c.h: added missing prototypes for ext2_iLZRW3A(),
++ ext2_wLZRW3A(), ext2_rLZRW3A().
++
++02 Aug 2004
++
++Paul Whittaker
++
++* ../../mm/page_alloc.c: added EXPORT_SYMBOL(__pagevec_free).
++
++* ../../include/linux/pagemap.h, ../../mm/filemap.c: removed inline from
++ __grab_cache_page() declarations, added EXPORT_SYMBOL(__grab_cache_page).
++
++* ../../include/linux/mm.h, ../../mm/filemap.c: removed inline from
++ page_waitqueue() declarations, added EXPORT_SYMBOL(page_waitqueue).
++
++* bzip2/{lib_bzip_d,lib_bzip_e}.c, {gzip,lzo,lzrw3a,lzv1}/e2compr*.c:
++ replaced MOD_INC_USE_COUNT and MOD_DEC_USE_COUNT with try_module_get()
++ and module_put() to avoid deprecation and safety warnings.
++
++* lzrw3a/lzrw3a.c: added (UBYTE *) casts to avoid compiler warnings.
++
++* compress.c, inode.c: incorporated Yabo's changes, correcting mistakes in
++ ext2_readpages() in inode.c.
++
++* removed printks for ext2_discard_prealloc from file.c and inode.c (not
++ needed now that this problem has been resolved).
++
++2.6.5 -> 2.6.7 updates:
++
++* ../../mm/filemap.c: rewrote CONFIG_EXT2_COMPRESS hunk for 2.6.7.
++
++* compress.c, file.c: use mapping_mapped(), since mapping->i_mmap has changed
++ and mapping->i_mmap_shared no longer exists.
++
++* inode.c: page->count becomes page->_count.
+--- linux-3.4-rc5/Documentation/filesystems/e2compress.txt 1969-12-31 19:00:00.000000000 -0500
++++ linux-3.4-rc5-e2c/Documentation/filesystems/e2compress.txt 2012-04-30 04:11:03.787143100 -0400
+@@ -0,0 +1,116 @@
++Transparent compression for ext2 filesystem
++===========================================
++
++What this document is.
++----------------------
++This document is intended to explain how e2compress has been implemented/ported
++to kernel 2.4. It also gives the status of the current work. You need to have e2compress
++knowledge (i.e. to know how e2compress works, from a general point of view).
++
++What this document is not.
++--------------------------
++This document is not a full explanation of how e2compress works. For this,
++there are other documents such as the fs/ext2/Readme.e2compr file for the technical
++point of view, and a user manual can be found at .
++That site is also a place where you will find much information about e2compress
++development for kernel 2.4, tools, manuals and so on.
++
++
++Introduction
++============
++
++This is a first adaptation of e2compress for kernel 2.4. The work has been done
++by Alcatel (Alcatel Business Systems - R&D) at Illkirch. It was started
++from the latest patch provided by Peter Moulder for kernel 2.2,
++i.e. e2compr-0.4.39-patch-2.2.18.
++It is fully compatible with the previous version.
++Below you will first find some explanations of the choices made for
++the development, and then the status of the current work from a functional point of
++view.
++
++
++Development
++===========
++
++As in previous patches, the most interesting things happen when reading in ext2_readpage
++and when writing in ext2_writepage and ext2_file_write.
++In fact, in the 2.2 kernel, compression occurs on clusters of blocks.
So when reading
++or writing part of a file, we first have to compute the cluster in which the I/O
++occurs, then we have to get every buffer of the cluster and uncompress the data if
++needed; then reading/writing happens "as for normal files".
++In 2.4 kernels, I/O goes through the page cache: i.e. when reading/writing
++part of the file, first the corresponding page is obtained, then we get the needed
++buffers, which point into the page; this means that, to keep the same scheme as in 2.2,
++we have to use the notion of a cluster of pages (a stand-alone sketch of this
++offset-to-cluster mapping is given at the end of this section). To get every buffer of
++a cluster, we first get every page of the cluster, then the buffers of every page...
++
++So, things happen as follows:
++
++ext2_readpage
++-------------
++If the data corresponding to the page are in a compressed cluster, this function performs
++more work: instead of reading one page, it reads the whole "cluster of pages".
++In any case we have to read all the compressed buffers. Once we have got all buffers
++of the cluster, uncompressed (at least part of) the data, and located the part of
++the uncompressed data which corresponds to the requested page, it is not much more
++work to also read (i.e. do some memcpy) the other pages belonging to this
++cluster.
++So, the first read of the first page of the cluster takes quite a bit longer, but then
++every page of the cluster is uptodate in the cache.
++
++ext2_writepage
++--------------
++An overhead has been added for pages belonging to a compressed cluster.
++If the cluster is still compressed on the disk, we can't directly write the
++page (which contains uncompressed data) into the middle of a compressed cluster.
++So, we first have to uncompress the whole cluster on the disk, then we can write the
++new data of the dirty page(s).
++
++ext2_file_write
++---------------
++This replaces `generic_file_write' when the e2compress option is activated.
++It is a copy of `generic_file_write'. The main difference is that instead of looping
++page by page as in `generic_file_write', we loop over clusters of pages.
++In each loop:
++ * we compute the cluster to which the beginning of the data (to be written) belongs.
++ * then, we get all pages of the cluster.
++ * If the cluster is a compressed one, we read all pages and uncompress them.
++ Otherwise, we perform a `prepare_write' (as in generic_file_write).
++ * We copy the data into each page from user space,
++ * Call `commit_write' on dirty pages.
++ * When reaching the end of a cluster, we compress it. (As in 2.2)
++
++Note: Another implementation could have been to keep generic_file_write and add an overhead
++to `ext2_prepare_write' and `ext2_commit_write'; on the first access to a page of a compressed
++cluster, the whole cluster would be uncompressed (i.e. all pages of the cluster would be read and
++uncompressed in `ext2_prepare_write'), and when committing the last page of the cluster,
++compression would occur...
++
++ext2_open_file
++--------------
++In the 2.4.16 kernel, this function has been added to handle the case of files opened for
++"direct IO". Direct IO is not supported on compressed files, so opening a file this way
++is forbidden.
++
++Other places in ext2
++--------------------
++Other changes occur as in 2.2 for managing the compression flags of files and the specific
++`COMPRESSED_BLK_ADDR' address for compressed blocks.
++So please refer to the existing documentation for 2.2 about this topic.
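A minimal stand-alone sketch of the offset-to-cluster mapping used above, mirroring the
ext2_offset_to_cluster() and ext2_block_to_cluster() macros that the patch adds in
include/linux/ext2_fs_c.h. The function and parameter names below are invented for
illustration; the 12-block first cluster follows the patch's ext2_first_cluster_nblocks()
rule for 32-block clusters on a filesystem with 1 KB blocks.

    #include <stdio.h>

    /* Map a byte offset within a file to its cluster index.  Cluster 0 is the
     * shorter "first cluster"; every later cluster holds 1 << log2_clu_nblocks
     * blocks.  All names here are illustrative only. */
    static unsigned offset_to_cluster(unsigned long long off,
                                      unsigned blocksize_bits,    /* 10 for 1 KB blocks */
                                      unsigned log2_clu_nblocks,  /* 5 for 32-block clusters */
                                      unsigned first_clu_nblocks) /* 12 in this configuration */
    {
        unsigned long long block = off >> blocksize_bits;

        if (block < first_clu_nblocks)
            return 0;
        return ((block - first_clu_nblocks) >> log2_clu_nblocks) + 1;
    }

    int main(void)
    {
        printf("%u\n", offset_to_cluster(3 * 1024, 10, 5, 12));   /* block 3   -> cluster 0 */
        printf("%u\n", offset_to_cluster(100 * 1024, 10, 5, 12)); /* block 100 -> cluster 3 */
        return 0;
    }

Reading or writing at a given offset then amounts to fetching all pages of the cluster the
offset falls in, as described for ext2_readpage and ext2_file_write above.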
++
++Status
++======
++Today (December 2001), e2compress on kernel 2.4.16 has been tested on the i386
++architecture and has been used successfully by tens of people in the department for some weeks.
++It is fully functional on ix86 and fully compatible with the 2.2 version of e2compress.
++It should work on other architectures, but this has NOT been tested.
++Please note the following:
++ * No performance tests have been done.
++ * I don't claim that the code is optimized (and it is probably not, but I hope that
++ "gurus" will not find it too bad).
++So, I think I can say that there is no known "big" bug or "blocking" bug.
++
++Some strange things have been observed in borderline cases, i.e. when memory is overloaded.
++
++
++As usual, this e2compress comes without warranty, use it at your own risk, etc...
+--- linux-3.4-rc5/fs/ext2/Readme.e2compr 1969-12-31 19:00:00.000000000 -0500
++++ linux-3.4-rc5-e2c/fs/ext2/Readme.e2compr 2012-04-30 04:11:03.788143096 -0400
+@@ -0,0 +1,511 @@
++
++ 0. Introduction
++ ~~~~~~~~~~~~~~~
++
++This file gives some technical information on e2compr and how it's
++implemented.
++
++More general information on e2compr can be found at
++http://e2compr.sourceforge.net/.
++
++The first couple of sections of this document are written for those
++who have no interest in the source code but just want to know enough
++to be able to predict and understand e2compr behaviour and its
++implications.
++
++Section 3 describes the e2compr-specific ext2 attributes for a file
++(i.e. chattr things).
++
++Section 4 describes the e2compr ioctls from the point of view of a
++user-mode C programmer.
++
++Section 5 gives more detail about the file format on disk.
++
++Section 6 gives details on what's written where, i.e. a map of e2compr
++code in the kernel.
++
++
++Authorship: section 2 is written mainly by Antoine; the remainder is
++written by Peter.
++
++Questions should be sent to the e2compr mailing list,
++e2compr-misc@lists.sourceforge.net, or to the current maintainers,
++bothie@users.sourceforge.net and whitpa@users.sourceforge.net.
++
++
++ 1. The idea
++ ~~~~~~~~~~~
++
++See section `E2compr implementation' in the main e2compr texinfo
++documentation for an introduction to how e2compr works. (Type
++`info "(e2compr)Implementation"' at the shell prompt.) It was
++originally written as part of the file you're now reading.
++
++
++ 2. More details
++ ~~~~~~~~~~~~~~~
++
++Every compressed file stores its cluster size in the inode structure
++(in the ext2 attribute flags field).
++This (the cluster size) is the most important information: once we
++know the cluster size, we can convert a block number into a cluster
++number, get the cluster the block belongs to, and then get the block.
++The inode's flags field also keeps the algorithm that is used to compress data
++written to the file.
++
++(The algorithm that was used to compress a given
++cluster is stored in the cluster head near the beginning of the
++compressed data. This may differ from the current algorithm
++identified in the inode, which is only used to determine which
++algorithm to use at the time clusters are written.)
++
++The algorithm id and the cluster size are stored in the i_flags field
++(thus reducing the number of possible flags). We also create some new
++flags: the COMPRBLK flag tells if there is at least one compressed
++cluster in the file, and the ECOMPR flag indicates that an error (related
++to compression) occurred while reading from or writing to this file.
++If it is set, the file becomes read-only.
(In previous releases, you ++were denied even read access to the file unless you set the NOCOMPR ++flag. There might be some benefit in returning to the old behaviour ++if decompressing erroneous data can cause an OOPS, but I think it ++would be better to correct the decompressors. Others may disagree, ++pointing out that it costs CPU time to check for incorrect data.) ++ ++Beside the information stored into the inode, each cluster holds some ++data. Here is the cluster_head structure for e2compr-0.4: ++ ++struct ext2_cluster_head { ++ __u16 magic; /* == EXT2_COMPRESS_MAGIC_04X. */ ++ __u8 method; /* compression method id. */ ++ __u8 holemap_nbytes; /* length of holemap[] array */ ++ __u32 checksum; /* adler32 checksum. Checksum covers all fields ++ below this one, and the compressed data. */ ++ __u32 ulen; /* size of uncompressed data */ ++ __u32 clen; /* size of compressed data (excluding cluster head) */ ++ __u8 holemap[0]; /* bitmap describing where to put holes. */ ++}; ++ ++The `magic' field is a magic number. It is used to detect filesystem ++corruption, and can also be used for data recovery purposes. (The ++e2compress program for e2compr-0.3 does this.) ++ ++The `checksum' field contains an Adler-32 checksum on the fields below ++it in the struct and the compressed data. Its purpose is to protect ++us from buffer overruns caused by corrupted data. ++ ++The `ulen' field says how many bytes are stored in the cluster, when ++uncompressed. ++ ++The `clen' field says how many bytes are held in the cluster, when ++compressed. ++ ++The `method' ++field identifies the algorithm that was used to compress the cluster ++(this id will be used to uncompress the cluster, not the one stored ++into the inode that will be used only to compress a new cluster). ++ ++The variable-length `holemap' array says where to put hole blocks when ++decompressing data. The `holemap_nbytes' field gives the length of ++this array. Iff holemap_nbytes is zero then there are no holes (other ++than at the end of the cluster, as determined by ulen versus cluster ++size). ++ ++The compressed data immediately follows the holemap array (with no ++padding before it). ++ ++ ++Compressing a cluster is done in the following way: We first get every ++block in the cluster and compute the bitmap. We then compress the ++non-hole data, and store back the compressed data into the existing ++blocks. Unused blocks are then freed. ++ ++Decompressing a cluster is done in the following way: We get the ++cluster head and retrieve the bitmap. Missing blocks are allocated and ++put where the bitmap says, and then compressed data is decompressed and ++stored back into the blocks. ++ ++ ++Reading from a compressed cluster is really easy: get the blocks, ++decompress them into a working area, and get the bytes we want from ++the working area. Writing to a compressed cluster is done by first ++decompressing the cluster, and then write to it, as if it were a ++normal file. The file is then marked so that the cluster will be ++recompressed later. [pjm: Do we decompress the cluster even if it's ++to be entirely written over?] ++ ++In the current version, compression really occurs only when the inode ++is put (which in turn only occurs when no processes have the file ++open). This may change. ++ ++ ++ 3. Ext2 file attributes ++ ~~~~~~~~~~~~~~~~~~~~~~~ ++ ++Attribute Lsattr Meaning ++~~~~~~~~~ ~~~~~~ ~~~~~~~ ++EXT2_SECRM_FL s Secure deletion (not yet implemented) ++EXT2_UNRM_FL u Undelete-able. (Not yet implemented.) 
++EXT2_COMPR_FL c Future writes to this file should be compressed. ++ (Clearing this flag decompresses the file if it ++ is a regular file and there is space to do so; ++ see the e2compr FAQ for details.) ++EXT2_SYNC_FL S Synchronous updates. (As far as I know, this is ++ not yet fully implemented.) ++EXT2_IMMUTABLE_FL i Immutable file. ++EXT2_APPEND_FL a Writes to file may only append. ++EXT2_NODUMP_FL d Not a candidate for backup with dump(8). ++EXT2_NOATIME_FL A No access time updates. ++EXT2_DIRTY_FL Z De/compression is yet to happen. Read the ++ source for exact meaning. ++EXT2_COMPRBLK_FL B File contains one or more compressed clusters. ++EXT2_NOCOMPR_FL X Access raw compressed data. This isn't really ++ supported at the moment; user-space access is ++ yet to be worked out for 0.4. ++EXT2_ECOMPR_FL E Compression error associated with this file ++EXT2_BTREE_FL I B-tree indexed directory (seemingly not yet implemented) ++EXT2_RESERVED_FL - (reserved for ext2 lib) ++ ++See the chattr(1) man page for more verbose descriptions of the ++non-e2compr flags. ++ ++ ++ 4. Ioctls available ++ ~~~~~~~~~~~~~~~~~~~ ++ ++ In brief ++ ~~~~~~~~ ++ ++Action Ioctl To kernel From kernel ++~~~~~~ ~~~~~ ~~~~~~~~~ ~~~~~~~~~~~ ++Get cluster bit EXT2_IOC_GETCLUSTERBIT Cluster num 1 or 0 (cmp,uncmp) ++Recognize compressed Cluster num - ++ EXT2_IOC_RECOGNIZE_COMPRESSED ++Get algorithm EXT2_IOC_GETCOMPRMETHOD - Id ++Set algorithm EXT2_IOC_SETCOMPRMETHOD Id - ++Get cluster size EXT2_IOC_GETCLUSTERSIZE - Cluster size ++Set cluster size EXT2_IOC_SETCLUSTERSIZE Cluster size - ++Get attributes EXT2_IOC_GETFLAGS - Flags ++Set attributes EXT2_IOC_SETFLAGS Flags - ++Get block size FIGETBSZ - Block size ++ ++#include to use any of these ioctls, except FIGETBSZ, ++which requires . ++ ++To find out what errors can be returned by these ioctls, read ++fs/ext2/ioctl.c (for all of the above ioctls except FIGETBSZ) or ++fs/ioctl.c (for FIGETBSZ). ++ ++ ++ Setting or testing a cluster bit ++ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++[Note: user-space access to compression details are yet to be worked out, ++so this section may not be accurate.] ++ ++EXT2_IOC_GETCLUSTERBIT sets *arg to 1 if the specified cluster (0 for first ++cluster, 1 for second, etc.) is stored in compressed form. ++ ++To make the kernel consider a certain cluster to be compressed (after ++you've done the compression yourself, in user space), use ++EXT2_IOC_RECOGNIZE_COMPRESSED. This ioctl checks the validity of the ++cluster's data, then marks it as compressed (if valid). This ioctl ++requires special priveleges, because if the compressed data is not ++valid then it may be possible to crash the system (due to buffer ++overruns). ++ ++ ++ Setting or getting the compression algorithm ++ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++EXT2_IOC_SETCOMPRMETHOD sets the default compression method (stored in ++the inode). This is the compression method that is used for future ++writes. In the current version of e2compr [accurate at 0.4.36], this ++does not cause a change to how ++existing clusters are stored, except when the compression method ++changes from `none' to something else, in which case the kernel ++attempts to compress ,all currently-uncompressed clusters` using the ++new algorithm. It is an error to use this ioctl on a file without the ++compressed attribute. ++ ++EXT2_IOC_GETCOMPRMETHOD sets *arg to the current compression method. 
++ ++In either case, Id is one of: EXT2_DEFER_METH, EXT2_LZV1_METH, ++EXT2_AUTO_METH, EXT2_NEVER_METH, EXT2_BZIP2_METH, EXT2_LZO1X_1_METH, ++EXT2_LZRW3A_METH (deprecated), EXT2_GZIP1_METH, EXT2_GZIP2_METH, ..., ++EXT2_GZIP9_METH. ++ ++ ++ Setting or getting the cluster size ++ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++EXT2_IOC_SETCLUSTERSIZE sets the cluster size to the value of *arg. ++This ioctl fails if there are already compressed clusters in the file ++(as determined by checking the EXT2_COMPRBLK_FL attribute). ++ ++EXT2_IOC_GETCLUSTERSIZE sets *arg to the current cluster size. ++Surprisingly, this ioctl succeeds even if the EXT2_COMPR_FL attribute ++is clear. (Maybe this will change in future, since the result is ++meaningless.) ++ ++In either case, the size is one of {4, 8, 16, 32}, and represents the ++number of blocks per cluster. To convert to or from a number of ++bytes, use the FIGETBSZ ioctl. ++ ++ ++ Setting or getting the ext2 file attributes ++ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++These ioctls (EXT2_IOC_GETFLAGS and EXT2_IOC_SETFLAGS) are not ++e2compr-specific, but some attributes are e2compr-specific. ++ ++*arg consists of the set of attributes for that file OR'ed together. ++E.g. a value of (EXT2_COMPR_FL | EXT2_COMPRBLK_FL | EXT2_NODUMP_FL) ++for a regular file means that the file contains one or more compressed ++clusters, and should not be backed up when using dump(8). ++ ++See section 3 for a description of the various attributes. ++ ++Note that although the compression method and cluster size are ++physically stored in the flags field on disk this information is ++masked out (i.e. set to zero) for GETFLAGS if the kernel has e2compr compiled in. ++If the kernel does not have e2compr compiled in, then this information ++is not masked out. See section 5 for how the cluster size and ++compression method is stored if you wish to work with ,kernels without ++e2compr`. ++ ++ ++ Getting the block size ++ ~~~~~~~~~~~~~~~~~~~~~~ ++ ++This ioctl (FIGETBSZ) is not e2compr-specific, but is useful in ++interpreting a cluster size (which is specified as a number of blocks ++rather than bytes or kilobytes). ++ ++*arg is set to the block size (in bytes) of the file. For ext2 files, ++this is one of {1024,2048,4096}. It is the same value for all files ++on the same filesystem. ++ ++You must #include to use this ioctl (unlike the rest of ++the ioctls listed here, which require ). ++ ++ ++ 5. File format ++ ~~~~~~~~~~~~~~ ++ ++A note on byte ordering. All current versions of the kernel and ++e2compr write to disk in little-endian format, so the 16-bit number ++`0x8EC7' would be written as a 0xC7 byte followed by a 0x8E byte. ++Unless you want to know the most general rule for byte ordering, you ++can skip to the `Inode' heading. ++ ++In kernel 2.0, the ext2 fs is written to disk in the native byte ++ordering. On x86 machines, this means little endian; most other ++architectures are big-endian (so the same 16-bit number would be ++written as an 0x8E byte followed by 0xC7). ++ ++On kernel 2.1 and later, the ext2 fs (including e2compr data) is ++written in little-endian order regardless of the host architecture. ++ ++ ++ 5.1. Inode ++ ~~~~~~~~~~ ++ ++fs/inode.c controls the reading and writing of inode information ++to/from disk; consult this file (functions ext2_read_inode(), ++ext2_update_inode() and/or ext2_write_inode()) for any detail omitted ++from this section. ++ ++The physical structure of an inode is struct ext2_inode (defined in ++include/linux/ext2_fs.h). 
++ ++ ++The i_flags member contains the ext2 file attributes, as well as ++cluster size and compression method. ++ ++The normal flags are stored in the low 23 bits. Only the low 12 bits ++are defined at present, including 4 flags introduced by the e2compr ++patch. See ext2_fs.h for the flag meanings (search for ++EXT2_SECRM_FL). ++ ++Bits 23 through 25 hold the cluster size, or more precisely the log2 of ++the number of filesystem blocks per cluster (excluding the first cluster; ++see ext2_first_cluster_nblocks in include/linux/ext2_fs_c.h). ++ ++Bits 26 through 30 store the compression method. See the definitions ++for EXT2_LZV1_METH etc. in ext2_fs_c.h for the interpretation. ++ ++Bit 31 is reserved for ext2 lib (which means that programs like e2fsck ++store things there during its operation but it isn't used by the ++kernel). ++ ++ ++ Data blocks ++ ~~~~~~~~~~~ ++ ++Uncompressed clusters are stored just as they would be without ++e2compr. So if there are no compressed clusters then the file ++is stored identically to any other file. ++ ++ ++If a cluster is compressed, then the first non-hole block starts with ++a `cluster head', as defined in struct ext2_cluster_head in ext2_fs.h. ++ ++The magic number (i.e. the value of the `magic' field) is 0x8ec7. ++`method' holds one of EXT2_LZV1_ID and the like. `reserved_0' ++contains zero. `ubitmap' describes where the uncompressed data goes. ++(Recall that when we compress a cluster, we only compress the data ++from non-hole blocks, so we need to know where the holes and non-holes ++go when we decompress the data.) A `0' bit means a hole and a `1' bit ++means a data block; bit 0 refers to the first block, b1 the second, ++and so on. ++ ++ ++The block positions within the file where the compressed data is held ++is a subset of where the uncompressed data would be held. Further, if the ++uncompressed data occupies u non-hole blocks and this compresses to c ++blocks, then the compressed data occupies the first c non-hole blocks ++of the file (and the remainder are freed). ++ ++[This paragraph is an expansion of the preceeding: if you understood ++the preceeding paragraph then skip this one.] Consider an array ++cblock[] where cblock[0] holds the block number on disk (or 0 to ++represent a hole) of the first block of a certain cluster of a file, ++cblock[1] the second, and so on. (If you are familiar with the bmap ++array or the format of first-level indirect blocks, then cblock[] is a ++section of that array.) Suppose that the cluster size of this file is ++16 blocks. Suppose too that, when uncompressed, blocks 0, 1, 5 and 6 ++of the cluster are holes but the other 12 blocks (2,3,4,7,8,...,15) ++contain data. (Thus the bitmap is 0x0000ff9c.) Now if we compress this ++cluster to just 5 blocks, then cblock[0], [1], [5] and [6] will continue ++to be holes, ,the positions of the compressed data blocks` are stored in ++cblock[2], cblock[3], [4], [7] and [8], the blocks referenced by ++cblock[9] through cblock[15] are freed, and cblock[9] through cblock[15] ++are set to zero. ++ ++ ++ 6. What's coded where ++ ~~~~~~~~~~~~~~~~~~~~~ ++ ++File names in this section are relative to linux/fs/ext2, except for ++ext2_fs.h which is in linux/include/linux. ++ ++Most of the action happens in compress.c; though note that a few ++small, commonly-used routines are written as inline functions in ++ext2_fs.h. ++ ++ext2_readpage() and ext2_mmap() are in file.c. ext2_file_write() is ++also there. ++ ++Routines to read/write the inode from/to disk are in inode.c. 
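To make the i_flags packing described in section 5.1 concrete, here is a minimal
stand-alone sketch of how the three pieces of e2compr information could be unpacked
from the on-disk i_flags word. The struct name, helper name and the example value are
invented for illustration; the method value 23 corresponds to EXT2_GZIP_8_METH and the
log2 value 5 to 32-block clusters, as defined in ext2_fs_c.h.

    #include <stdint.h>
    #include <stdio.h>

    struct e2c_flags {                 /* names invented for this sketch */
        uint32_t attrs;                /* ordinary attribute flags, bits 0-22  */
        unsigned log2_clu_nblocks;     /* log2(blocks per cluster), bits 23-25 */
        unsigned method;               /* compression method id,    bits 26-30 */
    };

    static struct e2c_flags unpack_i_flags(uint32_t i_flags)
    {
        struct e2c_flags f;

        f.attrs            =  i_flags & 0x007fffff;   /* low 23 bits                   */
        f.log2_clu_nblocks = (i_flags >> 23) & 0x7;   /* bits 23 through 25            */
        f.method           = (i_flags >> 26) & 0x1f;  /* bits 26 through 30            */
        return f;                                     /* bit 31 is left to ext2 lib    */
    }

    int main(void)
    {
        /* gzip level 8 (method 23), 32-block clusters (log2 = 5), no attribute bits */
        struct e2c_flags f = unpack_i_flags((23u << 26) | (5u << 23));

        printf("attrs=%#x log2=%u method=%u\n", f.attrs, f.log2_clu_nblocks, f.method);
        return 0;
    }

In the patch itself this unpacking happens when the inode is read and written, i.e. in the
inode.c routines mentioned above.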
++ ++super.c contains some e2compr initialisation code (such as allocating ++the e2compr work area). ++ ++All ioctl handling is in ioctl.c. ++ ++acl.c is where we deny open() access in a couple of situations (if the ++EXT2_NOCOMPR_FL is set and another process has the file open; and we ++deny write access to a file with EXT2_ECOMPR_FL set). ++ ++ialloc.c contains code in ext2_new_inode() for newly-created files to ++inherit compression attributes from the directory in which they're ++created. ++ ++truncate.c handles truncation, i.e. zeroing any part of the cluster ++bitmap that's been truncated, and decompressing the final cluster (but ++marking dirty so that we try to recompress it on file close) if the ++new size is part-way through a compressed cluster, so that zeroing ++over the truncated data works. ++ ++linux/include/linux/ext2_fs_i.h has the definition of the ++ext2-specific parts of the in-memory inode. (The on-disk inode is ++defined in ext2_fs.h.) ++ ++linux/mm/filemap.c is also interesting, though there's no ++e2compr-specific code there. Similarly linux/include/linux/mm.h and ++linux/include/linux/fs.h. ++ ++generic_readpage() is in linux/fs/buffer.c. Also all buffer handling. ++ ++ ++The cleanup scheme ++~~~~~~~~~~~~~~~~~~ ++ ++inode->u.ext2_i.i_compr_flags has only a single bit defined: ++EXT2_CLEANUP_FL. This bit gets set to 1 to indicate that ++ext2_cleanup_compressed_inode() needs to be called. ++ ++There is a related flag stored on disk as well as in memory: ++EXT2_DIRTY_FL of i_flags. If ext2_cleanup_compressed_inode() couldn't ++finish it's job (e.g. due to I/O error) then it clears EXT2_CLEANUP_FL ++of i_compr_flags, but leaves EXT2_DIRTY_FL high. ++ ++In ext2_read_inode(), if EXT2_DIRTY_FL is high then EXT2_CLEANUP_FL is ++raised, in the hope that ,whatever was preventing ++ext2_cleanup_compressed_inode() from finishing` is now past. ++ ++Except for ext2_read_inode() as noted above, everything that raises ++EXT2_CLEANUP_FL (i.e. ext2_write_file(), ext2_ioctl() and ++ext2_truncate()) also raises EXT2_DIRTY_FL. ++ ++Nothing lowers either EXT2_CLEANUP_FL or EXT2_DIRTY_FL except ++ext2_cleanup_compressed_inode() (and one or both of new_inode and ++delete_inode routines). ++ ++ ++One feels that at least one of these cleanup flags ought to ++disappear. The main use of the persistent EXT2_DIRTY_FL is where the ++user does `chattr -c' in order to decompress the file, but there isn't ++enough space on the device to do this. We can get rid of this problem ++by having ext2_ioctl() call ext2_cleanup_compressed_inode() ++try to ++ ++ ++Notes on a few variables ++~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++Don't confuse the inode->i_dirt flag with (inode->u.ext2_i.i_flags & ++EXT2_DIRTY_FL). See section `The cleanup scheme' above for a ++description of EXT2_DIRTY_FL. ++ ++ ++inode->u.ext2_i.i_clu_nblocks, ++inode->u.ext2_i.i_log2_clu_nblocks: ++ ++i_clu_nblocks is always equal to ,1 << i_clu_nblocks` (except during a ++couple of cycles while they're being changed; I haven't consciously ++tried to avoid problems for SMP machines in this respect). ++ ++i_clu_nblocks is the number of blocks per cluster for this inode. ++ ++Old information: these variables were previously called ++`i_cluster_bits' and `i_cluster_size'. They were in an array: ++ ++inode->u.ext2_i.i_cluster_bits[2], ++inode->u.ext2_i.i_cluster_size[2]: ++ ++I believe the reason these were declared as an array was for the case ++where someone changes the cluster size of a file that was already ++compressed. 
(Reason for this belief: All readers of these fields use ++[0]. On creation (ialloc), read_inode, and `chattr +c' (where ++previously uncompressed), both [0] and [1] are updated. On change ++(IOC_SET_CLUSTERSIZE), only [0] is updated.) Since ,changing cluster ++size of an already-compressed file` isn't implemented, I've renamed ++them and made them scalars rather than arrays. ++ ++ ++inode->u.ext2_i.i_flags: When the e2compr patch is applied, this ++variable only holds the low 24 bits of the on-disk i_flags field. ++(Without the e2compr patch applied, all 32 bits are available. An ++interesting side effect of this is that user programs can access the ++compression algorithm and cluster size on kernels without e2compr ++patch by using the EXT2_IOC_GETFLAGS, EXT2_IOC_SETFLAGS ioctls.) ++ ++ ++inode->u.ext2_i.i_compr_method: Holds the compression method ++identifier. Starting from e2compr-0.4.0, this is different from an ++algorithm identifier: an example of a method is gzip9; the ++corresponding algorithm is gzip. See compress.c for where ++ext2_method_table and ext2_algorithm_table are defined. ext2_fs.h has ++some enumerations for addressing these tables (search for ++`EXT2_NONE_METH' and `EXT2_NONE_ALG'). +--- linux-3.4-rc5/fs/Kconfig 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/fs/Kconfig 2012-04-30 04:11:03.788143096 -0400 +@@ -11,6 +11,126 @@ config DCACHE_WORD_ACCESS + if BLOCK + + source "fs/ext2/Kconfig" ++ ++config EXT2_COMPRESS ++ bool "Ext2 file compression (DANGEROUS)" ++ depends on EXT2_FS && EXPERIMENTAL ++ select CRYPTO ++ select CRYPTO_ALGAPI ++ select CRYPTO_DEFLATE ++ select ZLIB_INFLATE ++ select ZLIB_DEFLATE ++ help ++ Ext2 file compression allows transparent compression of files on an ++ ext2 filesystem. Transparent compression means that files are ++ stored on the disk in a compressed format but they are automatically ++ decompressed as they are read in and compressed when written out. ++ The user is in control of how and which files are compressed, using ++ the `chattr' utility (see chattr(1)). For the sake of safety, ++ administrative data (superblock, inodes, directories, etc.) are not ++ compressed. ++ ++ Compression is very useful if you're short on disk space, and ++ provides a better option than having lots of .gz files around. ++ For more information, see . ++ ++ You _need_ to have the special e2compr version of e2fsck to be able ++ to make use of this. ++ ++ If you say Y, you will be asked which compression algorithms you wish ++ to include. Gzip is a good all-round algorithm, as its 1..9 parameter ++ allows a good range of speed/compression trade-off. Other noteworthy ++ algorithms are LZV, which caters better to the faster/less compressing ++ end of the scale, and bzip, which caters slightly better to the more ++ compressing but slower end of the scale. ++ ++ Ext2 compression is still experimental, so unless you know you need ++ it, you'd better say N. ++ ++menu "Ext2 file compression options" ++ depends on EXT2_COMPRESS ++ ++choice ++ #depends on EXT2_DEFAULT_COMPR_METHOD_GZIP ++ prompt "Gzip parameter for default compression method" ++ default EXT2_DEFAULT_COMPR_METHOD_GZIP8 ++ help ++ You have selected `gzip' as your default compression algorithm, but ++ I need to know whether to use `gzip -1', `gzip -9', or somewhere ++ in between. gzip1 is the least compressing but fastest; gzip9 is the ++ most compressing and slowest; and the numbers in between have ++ characteristics in between (though not on a linear scale). ++ If unsure, say `8'. 
++ ++config EXT2_DEFAULT_COMPR_METHOD_GZIP1 ++ bool "1" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP2 ++ bool "2" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP3 ++ bool "3" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP4 ++ bool "4" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP5 ++ bool "5" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP6 ++ bool "6" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP7 ++ bool "7" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP8 ++ bool "8" ++config EXT2_DEFAULT_COMPR_METHOD_GZIP9 ++ bool "9" ++ ++endchoice ++ ++config GZ_HACK ++ bool "Exclude .gz files from automatic compression" ++ depends on EXT2_COMPRESS ++ default y ++ help ++ If you say Y here, then files created with names ending in `.gz' or ++ `.?gz' or `.bz2' don't inherit the `c' ("compress") attribute from ++ their parent directory. (However, you can still do `chattr +c FILE' ++ if you want to try to compress it anyway.) This means that you ++ don't waste CPU time trying to compress a file that probably can't ++ be compressed. See fs/ext2/namei.c if you want to add other rules. ++ If you have any aesthetic sensibilities then you will say N here ++ and try to implement something better. Most people will say Y here. ++ ++ ++choice ++ depends on EXT2_COMPRESS ++ prompt "Default cluster size (in blocks, usually 1KB each)" ++ default EXT2_DEFAULT_CLUSTER_BITS_5 ++ help ++ To make random access to compressed files reasonably fast the files ++ are compressed in clusters. By default, the clusters will be of the ++ size defined here but there is a modified version of the chattr ++ utility that can set the cluster size for each file independently. ++ Large clusters usually result in better compression at the cost of ++ being slower. ++ ++ Note that the answer to this question is specified in filesystem ++ blocks rather than in kilobytes, though most filesystems have 1KB ++ blocks anyway. (If you have a filesystem with large blocks then ++ you should know it, but if you want to check then "tune2fs -l ++ /dev/xxx | grep size".) The default is 32 blocks which is the ++ slowest setting but gives the best compression. ++ ++config EXT2_DEFAULT_CLUSTER_BITS_2 ++ bool "4" ++config EXT2_DEFAULT_CLUSTER_BITS_3 ++ bool "8" ++config EXT2_DEFAULT_CLUSTER_BITS_4 ++ bool "16" ++config EXT2_DEFAULT_CLUSTER_BITS_5 ++ bool "32" ++ ++endchoice ++ ++endmenu ++ ++ + source "fs/ext3/Kconfig" + source "fs/ext4/Kconfig" + +--- linux-3.4-rc5/include/linux/ext2_fs_c.h 1969-12-31 19:00:00.000000000 -0500 ++++ linux-3.4-rc5-e2c/include/linux/ext2_fs_c.h 2012-05-03 22:14:10.473000559 -0400 +@@ -0,0 +1,498 @@ ++/* ++ * Copyright (C) 2001 Alcatel Business Systems - R&D Illkirch ++ * (transparent compression code) ++ * Pierre Peiffer (pierre.peiffer@sxb.bsf.alcatel.fr) - Denis Richard (denis.richard@sxb.bsf.alcatel.fr) ++ * Adapted from patch e2compr-0.4.39-patch-2.2.18 . ++ */ ++ ++#ifndef EXT2_FS_C_H ++#define EXT2_FS_C_H ++ ++#include ++#include ++#include ++#include "../../fs/ext2/ext2.h" ++ ++/* EXT2_COMPR_DEBUG enables: ++ * - all assertions ++ * - adler checksum checking ++ */ ++//#undef EXT2_COMPR_DEBUG ++#define EXT2_COMPR_DEBUG ++ ++#ifdef EXT2_COMPR_DEBUG ++# define assert(expr) \ ++ if(unlikely(!(expr))) { \ ++ printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \ ++#expr, __FILE__, __func__, __LINE__); \ ++ } ++#else ++# define assert(expr) do {} while (0) ++#endif ++ ++ ++/* proof get_cpu and put_cpu correctness by calling might_sleep() or mabye schedule(). ++ this will check if we are atomic */ ++#ifdef EXT2_COMPR_DEBUG ++#define CHECK_NOT_ATOMIC assert(! 
in_atomic());//might_sleep(); ++#else ++#define CHECK_NOT_ATOMIC ++#endif ++ ++ ++#undef EXT2_COMPR_REPORT ++//#define EXT2_COMPR_REPORT ++//#define EXT2_COMPR_REPORT_VERBOSE ++//#define EXT2_COMPR_REPORT_PUT ++//# define EXT2_COMPR_REPORT_FILEOPEN ++//#define EXT2_COMPR_REPORT_MUTEX ++ ++#ifdef EXT2_COMPR_REPORT ++//# define EXT2_COMPR_REPORT_PUT ++//# define EXT2_COMPR_REPORT_WA ++//# define EXT2_COMPR_REPORT_MUTEX ++//# define EXT2_COMPR_REPORT_ALLOC /* disk allocation etc. */ ++//# define EXT2_COMPR_REPORT_ALGORITHMS /* Compression algorithms */ ++//# define EXT2_COMPR_REPORT_VERBOSE /* Various things I don't think ++// useful at the moment. */ ++//#define EXT2_COMPR_REPORT_VERBOSE_INODE ++#endif ++ ++ ++#ifdef EXT2_COMPR_DEBUG ++#define E2COMPR_VERSION "ext2-compression: e2c-0.4.59-smp-debug (1 May 2012) for kernel 3.4" ++#else ++#define E2COMPR_VERSION "ext2-compression: e2c-0.4.59-smp-release (1 May 2012) for kernel 3.4" ++#endif ++ ++#define EXT2_IOC_GETCLUSTERSIZE _IOR('c', 0, long) ++#define EXT2_IOC_SETCLUSTERSIZE _IOW('c', 0, long) ++#define EXT2_IOC_GETCOMPRMETHOD _IOR('c', 1, long) ++#define EXT2_IOC_SETCOMPRMETHOD _IOW('c', 1, long) ++#define EXT2_IOC_GETFIRSTCLUSTERSIZE _IOR('c', 2, long) ++#define EXT2_IOC_RECOGNIZE_COMPRESSED _IOW('c', 2, long) ++#define EXT2_IOC_GETCLUSTERBIT _IOR('c', 3, long) ++#define EXT2_IOC_GETCOMPRRATIO _IOR('c', 4, long) ++/* Don't use _IOW('c', {5,6}, long), as these are used by old ++ e2compress binaries as SETCLUSTERBIT and CLRCLUSTERBIT ++ respectively. */ ++ ++/* EXT2_xxxx_ALG is an index into ext2_algorithm_table[] defined in ++ fs/ext2/compress.c. */ ++/* N.B. Don't change these without also changing the table in ++ compress.c. Be careful not to break binary compatibility. ++ (EXT2_NONE_ALG and EXT2_UNDEF_ALG are safe from binary ++ compatibility problems, though, so they can safely be renumbered -- ++ and indeed probably should be if you do add another algorithm.) */ ++#define EXT2_LZV1_ALG 0 ++#define EXT2_LZRW3A_ALG 1 ++#define EXT2_GZIP_ALG 2 ++#define EXT2_BZIP2_ALG 3 ++#define EXT2_LZO_ALG 4 ++#define EXT2_NONE_ALG 5 ++#define EXT2_UNDEF_ALG 6 ++#define EXT2_N_ALGORITHMS 5 /* Count of "real" algorithms. Excludes ++ `none' and `undef'. */ ++ ++/* EXT2_xxxx_METH is an index into ext2_method_table[] defined in ++ fs/ext2/compress.c. */ ++/* N.B. Don't change these without also changing the table in ++ compress.c. */ ++#define EXT2_LZV1_METH 0 ++#define EXT2_AUTO_METH 1 ++#define EXT2_DEFER_METH 2 ++#define EXT2_NEVER_METH 3 ++#define EXT2_BZIP2_METH 4 ++#define EXT2_LZRW3A_METH 8 ++#define EXT2_LZO1X_1_METH 10 ++#define EXT2_GZIP_1_METH 16 ++#define EXT2_GZIP_2_METH 17 ++#define EXT2_GZIP_3_METH 18 ++#define EXT2_GZIP_4_METH 19 ++#define EXT2_GZIP_5_METH 20 ++#define EXT2_GZIP_6_METH 21 ++#define EXT2_GZIP_7_METH 22 ++#define EXT2_GZIP_8_METH 23 ++#define EXT2_GZIP_9_METH 24 ++ ++#define EXT2_N_METHODS 32 /* Don't change this unless you know what ++ you're doing. In particular, it's tied ++ to the width of the algorithm field ++ in i_flags.*/ ++ ++/* Note: EXT2_N_ALGORITHMS can't be increased beyond 16 without ++ changing the width of the s_algorithms_used field in the in-memory ++ superblock. The on-disk s_algorithms_used field is 32 bits long. ++ (This is in a state of flux. Currently (1998-02-05) there is no ++ distinction: we always use the s_es copy. 
*/
++
++
++#define EXT2_MAX_CLUSTER_BYTES (32*1024)
++#define EXT2_LOG2_MAX_CLUSTER_BYTES (5 + 10)
++
++#define EXT2_COMPRESS_MAGIC_04X 0x9ec7
++#define EXT2_MAX_CLUSTER_BLOCKS 32
++#define EXT2_MAX_CLUSTER_PAGES (EXT2_MAX_CLUSTER_BYTES >> PAGE_CACHE_SHIFT)
++#define EXT2_ECOMPR EIO
++/* A cluster is considered compressed iff the block number for the
++ last block of that cluster is EXT2_COMPRESSED_BLKADDR. If this
++ changes then check if there's anywhere that needs a cpu_to_le32()
++ conversion. */
++#define EXT2_COMPRESSED_BLKADDR 0xffffffff
++
++/* I like these names better. */
++#define EXT2_MAX_CLU_NBYTES EXT2_MAX_CLUSTER_BYTES
++#define EXT2_LOG2_MAX_CLU_NBYTES EXT2_LOG2_MAX_CLUSTER_BYTES
++#define EXT2_MAX_CLU_NBLOCKS EXT2_MAX_CLUSTER_BLOCKS
++
++
++#ifndef __KERNEL__
++
++/* Cluster head on disk, for e2compr versions before 0.4.0. I'm
++ leaving this here so that I may make e2compress able to read
++ old-style e2compr files. */
++struct ext2_cluster_head_03x {
++ __u16 magic; /* == EXT2_COMPRESS_MAGIC_03X */
++ __u16 len; /* size of uncompressed data */
++ __u16 compr_len; /* size of compressed data */
++ __u8 method; /* compress method */
++ __u8 reserved_0;
++ __u32 bitmap; /* block bitmap */
++ __u32 reserved_2; /* 0 or adler32 checksum of
++ _compressed_ data */
++};
++# define EXT2_COMPRESS_MAGIC_03X 0x8ec7 /* Head magic number
++ for e2compr versions
++ before 0.4.0. */
++#endif /* !__KERNEL__ */
++
++
++#ifdef __KERNEL__
++# ifdef CONFIG_EXT2_COMPRESS
++
++//mw
++#define CONFIG_EXT2_HAVE_GZIP
++
++/* If defined, compress each cluster as soon as we get to the end of a
++ whole cluster, when writing. (If undefined, we wait until
++ ext2_release_file() or the like.) */
++#define EXT2_COMPRESS_WHEN_CLU
++
++# ifdef CONFIG_EXT2_DEFAULT_COMPR_METHOD_DEFER
++# define EXT2_DEFAULT_COMPR_METHOD EXT2_DEFER_METH
++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_BZIP2)
++# define EXT2_DEFAULT_COMPR_METHOD EXT2_BZIP2_METH
++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_LZO)
++# define EXT2_DEFAULT_COMPR_METHOD EXT2_LZO1X_1_METH
++# ifndef CONFIG_EXT2_HAVE_LZO
++# error "Default algorithm (lzo) is not compiled in."
++# endif
++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_LZV1)
++# define EXT2_DEFAULT_COMPR_METHOD EXT2_LZV1_METH
++# ifndef CONFIG_EXT2_HAVE_LZV1
++# error "Default algorithm (lzv1) is not compiled in."
++# endif
++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_LZRW3A)
++# define EXT2_DEFAULT_COMPR_METHOD EXT2_LZRW3A_METH
++# ifndef CONFIG_EXT2_HAVE_LZRW3A
++# error "Default algorithm (lzrw3a) is not compiled in."
++# endif ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP1) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_1_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP2) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_2_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP3) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_3_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP4) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_4_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP5) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_5_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP6) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_6_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP7) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_7_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP8) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_8_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_GZIP9) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_GZIP_9_METH ++# elif defined (CONFIG_EXT2_DEFAULT_COMPR_METHOD_BZIP2) ++# define EXT2_DEFAULT_COMPR_METHOD EXT2_BZIP2_METH ++# ifndef CONFIG_EXT2_HAVE_BZIP2 ++# error "Default algorithm (bzip2) is not compiled in." ++# endif ++# else ++# error "No default compression algorithm." ++# endif ++# if EXT2_DEFAULT_COMPR_METHOD >= EXT2_GZIP_1_METH && EXT2_DEFAULT_COMPR_METHOD <= EXT2_GZIP_9_METH ++# ifndef CONFIG_EXT2_HAVE_GZIP ++# error "Default algorithm (gzip) is not compiled in." ++# endif ++# endif ++ ++# if defined (CONFIG_EXT2_DEFAULT_CLUSTER_BITS_2) ++# define EXT2_DEFAULT_LOG2_CLU_NBLOCKS 2 ++# elif defined (CONFIG_EXT2_DEFAULT_CLUSTER_BITS_3) ++# define EXT2_DEFAULT_LOG2_CLU_NBLOCKS 3 ++# elif defined (CONFIG_EXT2_DEFAULT_CLUSTER_BITS_4) ++# define EXT2_DEFAULT_LOG2_CLU_NBLOCKS 4 ++# elif defined (CONFIG_EXT2_DEFAULT_CLUSTER_BITS_5) ++# define EXT2_DEFAULT_LOG2_CLU_NBLOCKS 5 ++# else ++# error "No default cluster size." 
++# endif ++ ++# define EXT2_DEFAULT_CLU_NBLOCKS (1 << EXT2_DEFAULT_LOG2_CLU_NBLOCKS) ++ ++# if (EXT2_LZV1_ALG != 0) || (EXT2_BZIP2_ALG != 3) || (EXT2_LZO_ALG != 4) || (EXT2_N_ALGORITHMS != 5) ++# error "this code needs changing; but then, you shouldn't be messing with algorithm ids anyway unless you are very careful to protect disk format compatibility" ++# endif ++# ifdef CONFIG_EXT2_HAVE_LZV1 ++# define _ext2_lzv1_builtin (1 << EXT2_LZV1_ALG) ++# else ++# define _ext2_lzv1_builtin 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_LZRW3A ++# define _ext2_lzrw3a_builtin (1 << EXT2_LZRW3A_ALG) ++# else ++# define _ext2_lzrw3a_builtin 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_GZIP ++# define _ext2_gzip_builtin (1 << EXT2_GZIP_ALG) ++# else ++# define _ext2_gzip_builtin 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_BZIP2 ++# define _ext2_bzip2_builtin (1 << EXT2_BZIP2_ALG) ++# else ++# define _ext2_bzip2_builtin 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_LZO ++# define _ext2_lzo_builtin (1 << EXT2_LZO_ALG) ++# else ++# define _ext2_lzo_builtin 0 ++# endif ++ ++# ifdef CONFIG_EXT2_HAVE_LZV1_MODULE ++# define _ext2_lzv1_module (1 << EXT2_LZV1_ALG) ++# else ++# define _ext2_lzv1_module 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_LZRW3A_MODULE ++# define _ext2_lzrw3a_module (1 << EXT2_LZRW3A_ALG) ++# else ++# define _ext2_lzrw3a_module 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_GZIP_MODULE ++# define _ext2_gzip_module (1 << EXT2_GZIP_ALG) ++# else ++# define _ext2_gzip_module 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_BZIP2_MODULE ++# define _ext2_bzip2_module (1 << EXT2_BZIP2_ALG) ++# else ++# define _ext2_bzip2_module 0 ++# endif ++# ifdef CONFIG_EXT2_HAVE_LZO_MODULE ++# define _ext2_lzo_module (1 << EXT2_LZO_ALG) ++# else ++# define _ext2_lzo_module 0 ++# endif ++ ++# define EXT2_ALGORITHMS_MODULE (_ext2_lzv1_module | _ext2_lzrw3a_module | _ext2_gzip_module | _ext2_bzip2_module | _ext2_lzo_module) ++# define EXT2_ALGORITHMS_BUILTIN (_ext2_lzv1_builtin | _ext2_lzrw3a_builtin | _ext2_gzip_builtin | _ext2_bzip2_builtin | _ext2_lzo_builtin) ++ ++# if EXT2_ALGORITHMS_MODULE & EXT2_ALGORITHMS_BUILTIN ++# error "Arithmetic error? Some algorithm appears to be both built-in and a module." ++# endif ++ ++/* EXT2_ALGORITHMS_SUPP is what we test when mounting a filesystem. ++ See fs/ext2/super.c. */ ++# define EXT2_ALGORITHMS_SUPP (EXT2_ALGORITHMS_MODULE | EXT2_ALGORITHMS_BUILTIN) ++# if EXT2_ALGORITHMS_SUPP == 0 ++# error "You must select at least one compression algorithm." ++# endif ++ ++/* Cluster head on disk. Little-endian. */ ++struct ext2_cluster_head { ++ __u16 magic; /* == EXT2_COMPRESS_MAGIC_04X. */ ++ __u8 method; /* compression method id. */ ++ __u8 holemap_nbytes; /* length of holemap[] array */ ++ __u32 checksum; /* adler32 checksum. Checksum covers all fields ++ below this one, and the compressed data. */ ++ __u32 ulen; /* size of uncompressed data */ ++ __u32 clen; /* size of compressed data (excluding cluster head) */ ++ __u8 holemap[0]; /* bitmap describing where to put holes. */ ++}; ++ ++ ++struct ext2_wa_S { ++ __u8 u[EXT2_MAX_CLUSTER_BYTES]; /* Uncompressed data. */ ++ __u8 c[EXT2_MAX_CLUSTER_BYTES]; /* Compressed data. */ ++ __u8 heap[1]; /* Heap: working space for de/compression routines. */ ++}; ++ ++# define EXT2_CLEANUP_FL 0x40 /* See Readme.e2compr */ ++# define EXT2_OSYNC_INODE 0x20 /* sync of inode running */ ++# define ROUNDUP_DIV(_n, _d) ((_n) ? 1 + (((_n) - 1) / (_d)) : 0) ++# define ROUNDUP_RSHIFT(_n, _b) ((_n) ? 
1 + (((_n) - 1) >> (_b)) : 0) ++ ++# if defined(EXT2_NDIR_BLOCKS) && (EXT2_NDIR_BLOCKS != 12) ++# error "e2compr currently assumes that EXT2_NDIR_BLOCKS is 12." ++/* If EXT2_NDIR_BLOCKS changes then change the definitions of ++ ext2_first_cluster_nblocks() and friends, and search the patch for ++ anywhere where 12 is hard-coded. (At the time of writing, it's ++ only hard-coded in ext2_first_cluster_nblocks().) What we want to ++ achieve is for clusters not to straddle address blocks. Apart from ++ performance, some code in compress.c (search for `straddle') ++ assumes this. */ ++# endif ++ ++# include ++ ++# define EXT2_ALG_INIT_COMPRESS 1 ++# define EXT2_ALG_INIT_DECOMPRESS 2 ++ ++extern int ext2_get_cluster_pages (struct inode*, u32, struct page**, struct page *, int); ++extern int ext2_get_cluster_extra_pages (struct inode*, u32, struct page**, struct page**); ++extern int ext2_kmap_cluster_pages (struct page *, struct page**, struct page**); ++extern int ext2_kunmap_cluster_pages (struct page *, struct page**, struct page**); ++extern int ext2_get_cluster_blocks (struct inode*, u32, struct buffer_head**, struct page**, struct page**, int); ++extern int ext2_decompress_cluster (struct inode*, u32); ++extern int ext2_decompress_pages(struct inode*, u32, struct page**); ++extern int ext2_compress_cluster (struct inode*, u32); ++extern int ext2_decompress_inode (struct inode*); ++extern int ext2_cleanup_compressed_inode (struct inode*); ++extern void ext2_update_comprblk (struct inode *); ++extern int ext2_get_dcount(struct inode *inode); ++ ++extern size_t ext2_decompress_blocks (struct inode*, struct buffer_head**, int, size_t, u32 cluster); ++extern int ext2_count_blocks (struct inode*); ++extern int ext2_recognize_compressed (struct inode *, unsigned cluster); ++extern unsigned long ext2_adler32 (unsigned long, unsigned char*, int); ++ ++extern size_t ext2_iLZV1 (int); ++extern size_t ext2_iLZV2 (int); ++extern size_t ext2_iNONE (int); ++extern size_t ext2_iGZIP (int); ++extern size_t ext2_iBZIP2 (int); ++extern size_t ext2_iLZO (int); ++extern size_t ext2_iLZRW3A (int); ++extern size_t ext2_iZLIB (int); ++ ++extern size_t ext2_wLZV1 (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wLZV2 (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wNONE (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wGZIP (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wBZIP2 (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wLZO (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wLZRW3A (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_wZLIB (__u8*, __u8*, void*, size_t, size_t, int); ++ ++extern size_t ext2_rLZV1 (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rLZV2 (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rNONE (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rGZIP (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rBZIP2 (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rLZO (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rLZRW3A (__u8*, __u8*, void*, size_t, size_t, int); ++extern size_t ext2_rZLIB (__u8*, __u8*, void*, size_t, size_t, int); ++ ++struct ext2_algorithm { ++ char *name; ++ int avail; ++ size_t (*init) (int); ++ size_t (*compress) (__u8*, __u8*, void*, size_t, size_t, int); ++ size_t (*decompress) (__u8*, __u8*, void*, size_t, size_t, int); ++}; ++ ++struct ext2_method { ++ unsigned 
alg; ++ int xarg; ++}; ++ ++ ++# define ext2_first_cluster_nblocks(_i) ((EXT2_I(_i))->i_clu_nblocks > 4 && (_i)->i_sb->s_blocksize < 4096 ? 12 : 4) ++# define ext2_block_to_cluster(_i,_b) ((_b) < ext2_first_cluster_nblocks(_i) ? 0 : (((_b) - ext2_first_cluster_nblocks(_i)) >> (EXT2_I(_i))->i_log2_clu_nblocks) + 1) ++# define ext2_offset_to_cluster(_i,_o) ext2_block_to_cluster((_i), ((_o) >> (_i)->i_sb->s_blocksize_bits)) ++# define ext2_n_clusters(_i) ((_i)->i_size ? ext2_offset_to_cluster((_i), (_i)->i_size - 1) + 1 : 0) ++# define ext2_cluster_block0(_i,_c) ((_c) ? ext2_first_cluster_nblocks(_i) + (((_c) - 1) << (EXT2_I(_i))->i_log2_clu_nblocks) : 0) ++# define ext2_cluster_nblocks(_i,_c) ((_c) ? (EXT2_I(_i))->i_clu_nblocks : ext2_first_cluster_nblocks(_i)) ++# define ext2_cluster_offset(_i,_c) ((_c) ? ext2_cluster_block0((_i), (_c)) << (_i)->i_sb->s_blocksize_bits : 0) ++ ++# define ext2_first_cluster_npages(_i) ((EXT2_I(_i))->i_clu_nblocks > 4 && (_i)->i_sb->s_blocksize < 4096 ? 12 >> (PAGE_CACHE_SHIFT - (_i)->i_sb->s_blocksize_bits) : 4 >> (PAGE_CACHE_SHIFT - (_i)->i_sb->s_blocksize_bits)) ++# define ext2_page_to_cluster(_i,_p) ((_p) < ext2_first_cluster_npages(_i) ? 0 : (((_p) - ext2_first_cluster_npages(_i)) >> (((EXT2_I(_i))->i_log2_clu_nblocks)+(_i)->i_sb->s_blocksize_bits-PAGE_CACHE_SHIFT)) + 1) ++# define ext2_cluster_page0(_i,_c) ((_c) ? ext2_cluster_block0(_i, _c) >> (PAGE_CACHE_SHIFT - (_i)->i_sb->s_blocksize_bits) : 0) ++# define ext2_cluster_npages(_i,_c) ((_c) ? (EXT2_I(_i))->i_clu_nblocks >> (PAGE_CACHE_SHIFT - (_i)->i_sb->s_blocksize_bits) : ext2_first_cluster_npages(_i)) ++ ++static inline int ++ext2_offset_is_clu_boundary(struct inode *inode, u32 off) ++{ ++ if (off & (inode->i_sb->s_blocksize - 1)) ++ return 0; ++ if (off == 0) ++ return 1; ++ off >>= inode->i_sb->s_blocksize_bits; ++ if (off < ext2_first_cluster_nblocks(inode)) ++ return 0; ++ off -= ext2_first_cluster_nblocks(inode); ++ return !(off & (EXT2_I(inode)->i_clu_nblocks - 1)); ++} ++ ++struct ext2_wa_contents_S { ++ ino_t ino; ++ dev_t dev; ++ unsigned cluster; ++}; ++ ++DECLARE_PER_CPU(struct ext2_wa_S *, ext2_rd_wa); ++DECLARE_PER_CPU(struct ext2_wa_S *, ext2_wr_wa); ++ ++extern void ext2_alloc_rd_wa(void); ++extern void ext2_alloc_wr_wa(void); ++ ++extern struct ext2_algorithm ext2_algorithm_table[]; ++extern struct ext2_method ext2_method_table[]; /*mw: is static so far, no writes*/ ++ ++/* Both of these return -errno if error, 0 if not compressed, positive ++ if compressed. (You should use the macro unless you've already ++ tested COMPRBLK.) */ ++extern int ext2_cluster_is_compressed_fn (struct inode *inode, __u32 cluster); ++static inline int ext2_cluster_is_compressed (struct inode *inode, __u32 cluster) ++{ ++ if ((EXT2_I(inode)->i_flags & EXT2_COMPRBLK_FL) == 0) ++ return 0; ++ return ext2_cluster_is_compressed_fn (inode, cluster); ++} ++extern unsigned ext2_calc_free_ix (unsigned , u8 const *, unsigned ); ++extern int ext2_unpack_blkaddrs(struct inode *, struct buffer_head **, int, unsigned , u8 const *, unsigned , unsigned , unsigned , unsigned ); ++ ++# define HOLE_BLKADDR(_b) \ ++ (((_b) == 0) \ ++ || ((_b) == EXT2_COMPRESSED_BLKADDR)) ++# else /* !CONFIG_EXT2_COMPRESS */ ++# define HOLE_BLKADDR(_b) ((_b) == 0) ++# endif ++ ++/* For some reason or other, I see code like `if (le32_to_cpu(tmp) != ++ 0)' around in the kernel. So far I haven't checked whether or not ++ the compiler knows that the swab can be dropped. 
*/ ++# if defined(EXT2_COMPRESSED_BLKADDR) && EXT2_COMPRESSED_BLKADDR != 0xffffffff ++/* This may be a false positive; the "correct" test would be `if ++ defined(CONFIG_EXT2_COMPRESS)', but if this test does succeed, then ++ there is at least cause to have a look around. */ ++# error "Next bit of code is wrong." ++# endif ++ ++# define HOLE_BLKADDR_SWAB32(_b) HOLE_BLKADDR(_b) ++ ++#ifdef EXT2_COMPR_REPORT ++#define trace_e2c(format, args...) printk(KERN_DEBUG format, ## args) ++#else ++#define trace_e2c(format, args...) do {} while(0) ++#endif ++ ++#endif /* __KERNEL__ */ ++ ++ ++#endif /* EXT2_FS_C_H */ +--- linux-3.4-rc5/fs/ext2/Makefile 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/fs/ext2/Makefile 2012-04-30 04:11:03.790143095 -0400 +@@ -2,10 +2,17 @@ + # Makefile for the linux ext2-filesystem routines. + # + ++ifeq ($(CONFIG_EXT2_COMPRESS),y) ++ ++COMPRESS_STUFF := adler32.o compress.o e2zlib.o\ ++ $($(obj-y):%/=%/ext2-compr-%.o) ++endif ++ + obj-$(CONFIG_EXT2_FS) += ext2.o + + ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o ++ ioctl.o namei.o super.o symlink.o $(COMPRESS_STUFF) ++ + + ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o +--- linux-3.4-rc5/fs/ext2/compress.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-3.4-rc5-e2c/fs/ext2/compress.c 2012-05-03 22:15:16.951998711 -0400 +@@ -0,0 +1,3407 @@ ++/* ++ * linux/fs/ext2/compress.c ++ * ++ * Copyright (C) 1995 Antoine Dumesnil de Maricourt (dumesnil@etca.fr) ++ * (transparent compression code) ++ */ ++ ++/* ++ * Copyright (C) 2001 Alcatel Business Systems - R&D Illkirch FRANCE ++ * ++ * Transparent compression code for 2.4 kernel. ++ * ++ * Denis Richard (denis.richard@sxb.bsf.alcatel.fr) ++ * Pierre Peiffer (pierre.peiffer@sxb.bsf.alcatel.fr) ++ * ++ * Adapted from patch e2compr-0.4.39-patch-2.2.18 . ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define MIN(a,b) ((a) < (b) ? 
(a) : (b)) ++ ++#ifdef CONFIG_HIGHMEM ++#define restore_b_data_himem(bh) assert(page_address(bh->b_page)); bh->b_data = page_address(bh->b_page) + bh_offset(bh) ++ ++ ++ ++int ext2_kmap_cluster_pages(struct page *page, struct page *pg[], ++ struct page *epg[]) ++{ ++ int i = 0; ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (!pg[i]) ++ break; ++ if (epg && epg[i]) ++ kmap(epg[i]); ++ else ++ kmap(pg[i]); ++ } ++ ++ if (page) ++ kmap(page); ++ return 0; ++} ++ ++ ++int ext2_kunmap_cluster_pages(struct page *page, struct page *pg[], ++ struct page *epg[]) ++{ ++ int i = 0; ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (!pg[i]) ++ break; ++ if (epg && epg[i]) ++ kunmap(epg[i]); ++ else ++ kunmap(pg[i]); ++ } ++ ++ if (page) ++ kunmap(page); ++ return 0; ++} ++#else //no high-mem: ++#define restore_b_data_himem(bh) ; ++#endif ++ ++ ++/*none compression dummy functions*/ ++size_t ext2_iNONE (int action) { return 0; } ++size_t ext2_wNONE (__u8 *ibuf, __u8 *obuf, void *wa, size_t ilen, size_t olen, int xarg) { return 0; } ++size_t ext2_rNONE (__u8 *ibuf, __u8 *obuf, void *wa, size_t ilen, size_t olen, int xarg) { return 0; } ++ ++/* ++ * Algorithm and method tables ++ */ ++struct ext2_algorithm ext2_algorithm_table[] = { ++ /* Note: all algorithms must have the `name' field filled in. ++ This is used to autoload algorithm modules (ext2-compr-%s), and ++ in kernel printk. */ ++ /* N.B. Do not renumber these algorithms! (To do so is to change ++ the binary format.) It's OK for `none' and `undef' to be ++ renumbered, though. */ ++ ++ /* Fields: ++ name; available; routines for: ++ init, compress, decompress. */ ++ {"lzv1", 0, ext2_iNONE, ext2_wNONE, ext2_rNONE}, ++ {"lzrw3a", 0, ext2_iNONE, ext2_wNONE, ext2_rNONE}, ++ {"gzip", 1, ext2_iZLIB, ext2_wZLIB, ext2_rZLIB}, //Andreas: workaround ++ {"bzip2", 0, ext2_iNONE, ext2_wNONE, ext2_rNONE}, ++ {"lzo", 0, ext2_iNONE, ext2_wNONE, ext2_rNONE}, ++ {"none", 1, ext2_iNONE, ext2_wNONE, ext2_rNONE}, ++ ++ /* This "algorithm" is for unused entries in the method table. ++ It differs from EXT2_NONE_ALG in that it is considered ++ unavailable, whereas `none' is always available. */ ++ {"undef", 0, ext2_iNONE, ext2_wNONE, ext2_rNONE}, ++ ++}; ++ ++/* Note: EXT2_N_ALGORITHMS can't be increased beyond 16 without ++ changing the width of the s_algorithms_used field in the in-memory ++ superblock. The on-disk s_algorithms_used field is 32 bits long. ++ (This is in a state of flux. Currently (1998-02-05) there is no ++ distinction: we always use the s_es copy. */ ++ ++/* The size of this table must be 32 to prevent Oopsen from ++ invalid data. We index this from 5 bits of i_flags, so ++ the size is (1 << 5) == 32. */ ++struct ext2_method ext2_method_table[32] = { ++ /* Fields: algorithm id, algorithm argument. 
*/ ++ {EXT2_LZV1_ALG, 0}, ++ {EXT2_NONE_ALG, 0}, /* 1: auto */ ++ {EXT2_NONE_ALG, 0}, /* 2: defer */ ++ {EXT2_NONE_ALG, 0}, /* 3: never */ ++ {EXT2_BZIP2_ALG, 0}, /* 4: bzip2 */ ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_LZRW3A_ALG, 0}, /* 8: lzrw3a */ ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_LZO_ALG, 0}, /* 10: lzo1x_1 */ ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_GZIP_ALG, 1}, /* 16 */ ++ {EXT2_GZIP_ALG, 2}, ++ {EXT2_GZIP_ALG, 3}, ++ {EXT2_GZIP_ALG, 4}, ++ {EXT2_GZIP_ALG, 5}, ++ {EXT2_GZIP_ALG, 6}, ++ {EXT2_GZIP_ALG, 7}, ++ {EXT2_GZIP_ALG, 8}, ++ {EXT2_GZIP_ALG, 9}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0}, ++ {EXT2_UNDEF_ALG, 0} ++}; ++ ++ ++static void ext2_mark_algorithm_use(struct inode *inode, unsigned alg) ++{ ++ struct ext2_sb_info *sbi = EXT2_SB(inode->i_sb); ++ ++ /* Hopefully, lock_super() isn't needed here, as we don't ++ block in the critical region. True? */ ++ assert(alg < EXT2_N_ALGORITHMS); ++ if (sbi->s_es->s_feature_incompat ++ & cpu_to_le32(EXT2_FEATURE_INCOMPAT_COMPRESSION)) { ++ sbi->s_es->s_algorithm_usage_bitmap |= cpu_to_le32(1 << alg); ++ } else { ++ struct ext2_super_block *es = sbi->s_es; ++ ++ es->s_algorithm_usage_bitmap = cpu_to_le32(1 << alg); ++ es->s_feature_incompat ++ |= cpu_to_le32(EXT2_FEATURE_INCOMPAT_COMPRESSION); ++ if (es->s_rev_level < EXT2_DYNAMIC_REV) { ++ /* Raise the filesystem revision level to ++ EXT2_DYNAMIC_REV so that s_feature_incompat ++ is honoured (except in ancient kernels / ++ e2fsprogs). We must also initialize two ++ other dynamic-rev fields. The remaining ++ fields are assumed to be already correct ++ (e.g. still zeroed). */ ++ es->s_rev_level = cpu_to_le32(EXT2_DYNAMIC_REV); ++ es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO); ++ es->s_inode_size = cpu_to_le16(EXT2_GOOD_OLD_INODE_SIZE); ++ } ++ } ++ mark_buffer_dirty(sbi->s_sbh); ++} ++ ++ ++/* Displays an error message if algorithm ,alg` is not marked in use, ++ and then marks it in use. 
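/*
 * Illustrative sketch (stand-alone user space, assumed ids following the
 * order of ext2_algorithm_table[] above): ext2_mark_algorithm_use() records
 * each algorithm in a 32-bit usage bitmap in the superblock, one bit per
 * algorithm id, and the helper below complains if a cluster uses an
 * algorithm whose bit was never set.  A minimal model of that bookkeeping:
 */
#include <stdio.h>

enum { ALG_LZV1, ALG_LZRW3A, ALG_GZIP, ALG_BZIP2, ALG_LZO, ALG_NONE };

int main(void)
{
	unsigned usage_bitmap = 0;

	usage_bitmap |= 1u << ALG_GZIP;            /* mark gzip as used */
	if (!(usage_bitmap & (1u << ALG_LZO)))     /* ensure-style check */
		printf("lzo not marked used yet (bitmap=%#x)\n", usage_bitmap);
	return 0;
}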
*/ ++static void ext2_ensure_algorithm_use(struct inode *inode, unsigned alg) ++{ ++ assert(alg < EXT2_N_ALGORITHMS); ++ ++ if (!(EXT2_SB(inode->i_sb)->s_es->s_algorithm_usage_bitmap ++ & cpu_to_le32(1 << alg))) { ++ ext2_msg(inode->i_sb, "algorithm usage bitmap algorithm %s not marked used in inode %lu", ++ ext2_algorithm_table[alg].name, inode->i_ino); ++ ext2_mark_algorithm_use(inode, alg); ++ } ++} ++ ++ ++/*mw: out of cache bug fix 5-16-07 */ ++static void create_empty_buffers_e2c(struct page *page, ++ unsigned long blocksize, ++ unsigned long b_state, ++ struct inode *inode) ++{ ++ struct buffer_head *bh, *head, *tail; ++ ++ head = alloc_page_buffers(page, blocksize, 1); ++ bh = head; ++ do { ++ bh->b_state |= b_state; ++ tail = bh; ++ bh->b_bdev = NULL; //mw: make it like 2.4 ++ bh->b_blocknr = 0; //mw: make it like 2.4 ++ bh->b_end_io = NULL; //mw: make it like 2.4 ++ bh = bh->b_this_page; ++ } while (bh); ++ tail->b_this_page = head; ++ spin_lock(&inode->i_mapping->private_lock); ++ if (PageUptodate(page) || PageDirty(page)) { ++ bh = head; ++ do { ++ if (PageDirty(page)) ++ set_buffer_dirty(bh); ++ if (PageUptodate(page)) ++ set_buffer_uptodate(bh); ++ bh = bh->b_this_page; ++ } while (bh != head); ++ } ++ attach_page_buffers(page, head); ++ spin_unlock(&inode->i_mapping->private_lock); ++} ++ ++int ext2_get_cluster_pages(struct inode *inode, u32 cluster, ++ struct page *pg[], struct page *page, int compr) ++{ ++ int nbpg, npg, i; ++ u32 page0; /* = position within file (not position within fs). */ ++ u32 idx = 0; ++ ++ /*mw */ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) ++ pg[i] = NULL; ++ ++ page0 = ext2_cluster_page0(inode, cluster); ++ nbpg = ext2_cluster_npages(inode, cluster); ++ ++ if (compr && (((page0 + nbpg) << PAGE_CACHE_SHIFT) > inode->i_size)) ++ nbpg = ((inode->i_size - 1) >> PAGE_CACHE_SHIFT) - page0 + 1; ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c("ext2_get_cluster_pages: page0=%d, nbpg=%d page=%ld\n", ++ page0, nbpg, ((page != NULL) ? 
page->index : 0)); ++#endif ++ for (npg = 0; npg < nbpg; npg++) { ++ if ((page == NULL) || ((page0 + npg) != page->index)) { ++ //pg[npg] = __grab_cache_page(inode->i_mapping, page0+npg); /* &cached_page, &lru_pvec);*/ ++ pg[npg] = grab_cache_page_write_begin(inode->i_mapping, page0+npg, 0); ++ if (!pg[npg]) ++ goto error; ++ } else { ++ pg[npg] = page; ++ } ++ if (!page_has_buffers(pg[npg])) { ++ ClearPageUptodate(pg[npg]); ++ ClearPageDirty(pg[npg]); ++ create_empty_buffers_e2c(pg[npg], inode->i_sb->s_blocksize, 0, inode); ++ if (unlikely(!page_has_buffers(pg[npg]))) ++ trace_e2c("ext2_get_cluster_pages: NOMEM!\n"); ++ assert(!PageUptodate(pg[npg])); ++ assert(!PageDirty(pg[npg])); ++ } ++ } ++ //set remaining pages to NULL ++ for (idx = npg; idx < EXT2_MAX_CLUSTER_PAGES; idx++) ++ pg[idx] = NULL; ++ ++ return (npg); ++ ++ error: ++ while (--npg >= 0) { ++ if ((page == NULL) || ((page0 + npg) != page->index)) { ++ unlock_page(pg[npg]); ++ page_cache_release(pg[npg]); ++ } ++ pg[npg] = NULL; ++ } ++ trace_e2c("ext2_get_cluster_pages: error no page\n"); ++ return (-ENOMEM); ++} ++ ++ ++int ext2_get_cluster_extra_pages(struct inode *inode, u32 cluster, ++ struct page *pg[], struct page *epg[]) ++{ ++ struct page *page; ++ int nbpg, npg, i; ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) ++ epg[i] = NULL; ++ ++ nbpg = ext2_cluster_npages(inode, cluster); ++ for (npg = 0; npg < nbpg; npg++) { ++ if (pg[npg] == NULL) ++ break; ++ if (PageUptodate(pg[npg])) { ++ //page = page_cache_alloc(inode->i_mapping); ++ //mw: has gfp-mask of adress-space: gfp_t mapping_gfp_mask(struct address_space * mapping) ++ // don't trigger. shrink_dcache_memory which might call ext2_cleanup_compressed_inode with the SAME mutex. ++ page = __page_cache_alloc(GFP_NOFS); ++ ++ if (!page) { ++ goto error; ++ } ++ ClearPageError(page); ++ ClearPageReferenced(page); ++ ClearPageUptodate(page); ++ ClearPageDirty(page); ++ lock_page(page); ++ page->index = pg[npg]->index; ++ ++ if (!page_has_buffers(page)) { ++ create_empty_buffers_e2c(page, inode->i_sb->s_blocksize, 0, ++ inode); ++ /*mw : only the "extra_pages" for decompression need create_empty_buffers_unlocked, because ++ * they have no mapping-context and they must not have one. Otherwise they get need a page->index ++ * which belongs always to an address_space object (e.g.: inode). But I think this is not intented here. ++ * we just need thei buffers for a short time of decompression */ ++ if (unlikely(!page_has_buffers(page))) ++ return printk("Error: NOMEM!\n"); ++ } ++ ++ epg[npg] = page; ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c ++ ("ext2_get_cluster_extra_pages: allocated page idx=%ld\n", ++ pg[npg]->index); ++#endif ++ } else { ++ epg[npg] = NULL; ++ } ++ } ++ return (npg); ++ error: ++ while (--npg >= 0) ++ if (epg[npg]) { ++ ClearPageDirty(epg[npg]); ++ ClearPageUptodate(epg[npg]); ++ try_to_free_buffers(epg[npg]); ++ unlock_page(epg[npg]); ++ assert(page_count(epg[npg]) == 1); ++ page_cache_release(epg[npg]); ++ } ++ trace_e2c("ext2_get_cluster_extra_pages: error no page\n"); ++ return (-ENOMEM); ++ ++} ++ ++/* Read every block in the cluster. The blocks are stored in the bh ++ array, which must be big enough. ++ ++ Return the number of block contained in the cluster, or -errno if an ++ error occured. The buffers should be released by the caller ++ (unless an error occurred). ++ ++ The inode must be locked, otherwise it is possible that we return ++ some out of date blocks. 
++ ++ Called by : ++ ++ ext2_decompress_cluster() [i_sem] ++ ext2_compress_cluster() [i_sem] ++ ext2_readpage() [i_sem] */ ++ ++ ++int ext2_get_cluster_blocks(struct inode *inode, u32 cluster, ++ struct buffer_head *bh[], struct page *pg[], ++ struct page *epg[], int compr) ++{ ++ struct buffer_head *br[EXT2_MAX_CLUSTER_BLOCKS]; ++ int nreq, nbh = 0, npg, i; ++ u32 clu_nblocks; ++ int err; ++ const int blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits; ++ ++ /*mw */ ++ for (i = 0; i < EXT2_MAX_CLUSTER_BLOCKS; i++) ++ bh[i] = NULL; ++ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */ ++ ++ /* ++ * Request full cluster. ++ */ ++ { ++ u32 endblk; ++ u32 block; /* = position within file (not position within fs). */ ++ u32 nbpg; ++ u32 page0; /* = position within file (not position within fs). */ ++ u32 idx; ++ ++ block = ext2_cluster_block0(inode, cluster); ++ clu_nblocks = ext2_cluster_nblocks(inode, cluster); ++ /* impl: Don't shorten endblk for i_size. The ++ remaining blocks should be NULL anyway, except in ++ the case when called from ext2_decompress_cluster ++ from ext2_truncate, in which case i_size is short ++ and we _want_ to get all of the blocks. */ ++ endblk = block + clu_nblocks; ++ ++ page0 = ext2_cluster_page0(inode, cluster); ++ nbpg = ext2_cluster_npages(inode, cluster); ++ ++ if (compr ++ && (((page0 + nbpg) << PAGE_CACHE_SHIFT) > inode->i_size)) { ++ nbpg = ((inode->i_size - 1) >> PAGE_CACHE_SHIFT) - page0 + 1; ++ endblk = ++ block + ++ (nbpg << ++ (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)); ++ } ++ ++ idx = page0 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c("ext2_get_cluster_blocks: page0=%d, nbpg=%d\n", page0, ++ nbpg); ++#endif ++ for (npg = 0; npg < nbpg; npg++) { ++ struct buffer_head *buffer; ++ ++ if ((epg != NULL) && (epg[npg] != NULL)) ++ buffer = page_buffers(epg[npg]); ++ else ++ buffer = page_buffers(pg[npg]); ++ for (i = 0; i < blocks && (block + nbh) < endblk; ++ buffer = buffer->b_this_page, i++) { ++ if (idx == (block + nbh)) { ++ bh[nbh] = buffer; ++ nbh++; ++ } ++ idx++; ++ } ++ } ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c ++ ("ext2_get_cluster_blocks: get every pages and %d buffers\n", ++ nbh); ++#endif ++ ++ for (nbh = 0, nreq = 0; block < endblk; nbh++) { ++ assert(bh[nbh] != NULL); ++ bh[nbh]->b_blocknr = 0; ++ clear_bit(BH_Mapped, &bh[nbh]->b_state); ++ ++ //mw: does not work with 2.6 and holes!!! ++ //err=ext2_get_block(inode, block++, bh[nbh], (PageDirty(bh[nbh]->b_page) ? 1 : 0)); ++ err = ext2_get_block(inode, block++, bh[nbh], 0); ++ /* mw: 0: we dont' create non existing blocks here ++ * let's do it just before the writeback, when we know, which blocks we really need...*/ ++ //err=ext2_get_block(inode, block++, bh[nbh], (buffer_dirty(bh[nbh]) ? 1 : 0)); ++ ++ /* mw: bdev-bug-fix: for files which got compressed and now consume less buffers ++ * ext2_get_block returns 0, for a empty-block. As these buffer were used before ++ * the bh[nbh]->b_bdev might be != NULL or just invalid. So we set them explicitly ++ * to NULL. */ ++ //printk("Get Block cluster %i: (%#x):%i Blk-NR:%lu(%lu)[%lu-%lu] Bdev:%#x(%#x), PGDirty:%i, mapped:%i, PID: %lu\n", cluster, bh[nbh], nbh, block, ++ ++ //if we are not mapped, then the blocknr will be wrong ++ //we set a bdev here the we will write to some "random" block ++ if (!buffer_mapped(bh[nbh])) { ++ bh[nbh]->b_bdev = NULL; /* don't write wrongly mapped blocks !!! 
*/ ++ /* mw: you encounter null pointer oops you MUST ++ * map your buffer using ext2_get_block()*/ ++ } ++ ++ if (bh[nbh]->b_blocknr != 0) { ++ if (!buffer_uptodate(bh[nbh]) ++ /* TODO: Do we need this ++ `!buffer_locked' test? */ ++ && !buffer_locked(bh[nbh]) ++ && !PageDirty(bh[nbh]->b_page)) ++ br[nreq++] = bh[nbh]; ++ } else if ((err != 0) ++ && (err != -EFBIG)) ++ /* impl: for some unknown reason, ++ ext2_getblk() returns -EFBIG if ++ !create and there's a hole. ==> not right any more in 2.4 */ ++ goto error; ++ } ++ for (i = nbh; i < EXT2_MAX_CLUSTER_BLOCKS; i++) { ++ bh[i] = NULL; ++ } ++ } ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("ext2_get_cluster_blocks: nreq=%d for cluster=%d\n", nreq, ++ cluster); ++#endif ++ ++ //read all blocks, which are not null-blocks ++ if (nreq > 0) ++ ll_rw_block(READ, nreq, br); ++ ++ /* ++ * Adjust nbh if we have some null blocks at end of cluster. ++ */ ++ while ((nbh != 0) && (bh[nbh - 1]->b_blocknr == 0)) ++ nbh--; ++ ++ /* ++ * Wait for blocks. ++ */ ++ err = -EIO; ++ CHECK_NOT_ATOMIC ++ for (i = 0; i < nbh; i++) ++ if ((!PageDirty(bh[i]->b_page)) && (bh[i]->b_blocknr != 0)) { ++ wait_on_buffer(bh[i]); ++ if (!buffer_uptodate(bh[i])) { /* Read error ??? */ ++ trace_e2c ++ ("ext2_get_cluster_blocks: wait_on_buffer error (blocknr=%ld)\n", ++ bh[i]->b_blocknr); ++ goto error; ++ } ++ } ++ assert(nbh <= EXT2_MAX_CLU_NBLOCKS); ++ ++ return nbh; ++ ++ error: ++ printk("ERROR: ext2_get_cluster_blocks()\n"); ++ return err; ++} ++ ++ ++/* Iterations over block in the inode are done with a generic ++ iteration key mechanism. We need one method to convert a block ++ number into a new key, one method to iterate (i.e., increment the ++ key) and one method to free the key. The code could be shared with ++ truncate.c, as this mechanism is very general. ++ ++ This code assumes tht nobody else can read or write the file ++ between ext2_get_key() and ext2_free_key(), so callers need to have ++ i_sem (which they all do anyway). */ ++ ++/* TODO: Get all of the bkey routines to return -errno instead of ++ true/false. */ ++/* TODO: The bkey routines currently assume tht address blocks are ++ allocated even if all contained addresses are NULL, but this is not ++ true. Make sure tht we differentiate between NULL block and error, ++ and then fix up ext2_set_key_blkaddr() and anything else (including ++ the pack/unpack routines). */ ++struct ext2_bkey { ++ int level; ++ u32 block; ++ struct inode *inode; ++ int off[4]; ++ u32 *ptr[4]; ++ struct buffer_head *ibh[4]; ++}; ++ ++ ++/* ++ * Method to convert a block number into a key. ++ * ++ * Returns 1 on success, 0 on failure. You may safely, but need ++ * not, free the key even if ext2_get_key() fails. ++ */ ++static int ext2_get_key(struct ext2_bkey *key, struct inode *inode, ++ u32 block) ++{ ++ int x, level; ++ int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); ++ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); ++ ++ /* ++ * The first step can be viewed as translating the ++ * original block number in a special base (powers ++ * of addr_per_block). 
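/*
 * Worked sketch of that decomposition (stand-alone, user space): with the
 * standard ext2 constants and an assumed addr_per_block of 256 (1 KiB
 * blocks, 4-byte block addresses), file block 300 lands in the doubly
 * indirect tree.
 */
#include <stdio.h>

#define NDIR_BLOCKS    12    /* EXT2_NDIR_BLOCKS */
#define IND_BLOCK      12    /* EXT2_IND_BLOCK   */
#define DIND_BLOCK     13    /* EXT2_DIND_BLOCK  */
#define TIND_BLOCK     14    /* EXT2_TIND_BLOCK  */
#define ADDR_PER_BLOCK 256   /* assumed: 1 KiB blocks */

static int decompose(unsigned block, int off[4])
{
	off[0] = off[1] = off[2] = off[3] = 0;
	if (block < NDIR_BLOCKS) {                       /* direct block */
		off[0] = block;
		return 0;
	}
	block -= NDIR_BLOCKS;
	if (block < ADDR_PER_BLOCK) {                    /* singly indirect */
		off[0] = IND_BLOCK;
		off[1] = block;
		return 1;
	}
	block -= ADDR_PER_BLOCK;
	if (block < ADDR_PER_BLOCK * ADDR_PER_BLOCK) {   /* doubly indirect */
		off[0] = DIND_BLOCK;
		off[1] = block / ADDR_PER_BLOCK;
		off[2] = block % ADDR_PER_BLOCK;
		return 2;
	}
	block -= ADDR_PER_BLOCK * ADDR_PER_BLOCK;        /* triply indirect */
	off[0] = TIND_BLOCK;
	off[1] = block / (ADDR_PER_BLOCK * ADDR_PER_BLOCK);
	off[2] = (block % (ADDR_PER_BLOCK * ADDR_PER_BLOCK)) / ADDR_PER_BLOCK;
	off[3] = block % ADDR_PER_BLOCK;
	return 3;
}

int main(void)
{
	int off[4];
	int level = decompose(300, off);   /* 300 = 12 + 256 + 32 */

	printf("level=%d off=%d,%d,%d,%d\n", level, off[0], off[1], off[2], off[3]);
	return 0;
}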
++ */ ++ ++ key->block = block; ++ ++ key->off[0] = key->off[1] = key->off[2] = key->off[3] = 0; ++ key->ibh[0] = key->ibh[1] = key->ibh[2] = key->ibh[3] = NULL; ++ key->ptr[0] = key->ptr[1] = key->ptr[2] = key->ptr[3] = NULL; ++ ++ if (block >= EXT2_NDIR_BLOCKS) { ++ block -= EXT2_NDIR_BLOCKS; ++ ++ if (block >= addr_per_block) { ++ block -= addr_per_block; ++ ++ if (block >= addr_per_block * addr_per_block) { ++ block -= addr_per_block * addr_per_block; ++ ++ key->off[0] = EXT2_TIND_BLOCK; ++ key->off[1] = (block / (addr_per_block * addr_per_block)); ++ key->off[2] = ++ (block % (addr_per_block * addr_per_block)) / ++ addr_per_block; ++ key->off[3] = (block % addr_per_block); ++ level = 3; ++ } else { ++ key->off[0] = EXT2_DIND_BLOCK; ++ key->off[1] = block / addr_per_block; ++ key->off[2] = block % addr_per_block; ++ level = 2; ++ } ++ } else { ++ key->off[0] = EXT2_IND_BLOCK; ++ key->off[1] = block; ++ level = 1; ++ } ++ } else { ++ key->off[0] = block; ++ level = 0; ++ } ++ ++ /* ++ * In the second step, we load the needed buffers. ++ */ ++ ++ key->level = level; ++ key->inode = inode; ++ ++ key->ptr[0] = (u32 *) (&(EXT2_I(inode)->i_data)); ++ ++ for (x = 1; x <= level; x++) { ++ u32 *ptr; ++ ++ ptr = key->ptr[x - 1]; ++ if (ptr == NULL) ++ break; ++/* Paul Whittaker tweak 19 Feb 2005 */ ++ block = le32_to_cpu(ptr[key->off[x - 1]]); ++ if (block == 0) ++ continue; // TLL 05/01/07 ++ if (x - 1 != 0) ++ block = le32_to_cpu(block); ++ if ((key->ibh[x] = __bread(inode->i_sb->s_bdev, ++ block, inode->i_sb->s_blocksize)) ++ == NULL) ++ goto error; ++ key->ptr[x] = (u32 *) (key->ibh[x]->b_data); ++ } ++ ++ return 1; ++ error: ++ for (; x != 0; x--) ++ if (key->ibh[x] != NULL) ++ brelse(key->ibh[x]); ++ return 0; ++} ++ ++ ++/* ++ * Find the block for a given key. Return 0 if there ++ * is no block for this key. ++ */ ++static inline u32 ext2_get_key_blkaddr(struct ext2_bkey *key) ++{ ++ assert(key->inode); ++ assert(atomic_read(&(key->inode)->i_mutex.count) <= 0); ++ ++/* Paul Whittaker tweak 19 Feb 2005 */ ++ if (key->ptr[key->level] == NULL) ++ return 0; ++ return le32_to_cpu(key->ptr[key->level][key->off[key->level]]); ++} ++ ++ ++/* ++ * Change the block for a given key. Return 0 on success, ++ * -errno on failure. ++ */ ++static inline int ext2_set_key_blkaddr(struct ext2_bkey *key, u32 blkaddr) ++{ ++ char bdn[BDEVNAME_SIZE]; ++ assert(key->inode); ++ assert(atomic_read(&(key->inode)->i_mutex.count) <= 0); ++ ++ if (key->ptr[key->level] == NULL) { ++ /* The reason that this "can't happen" is that this ++ routine is only used to shuffle block numbers or by ++ free_cluster_blocks. Cluster sizes are such that ++ clusters can't straddle address blocks. So the ++ indirect block address can't be zero. AFAIK, ptr ++ can only be NULL on error or on null indirect block ++ address. Hmm, come to think of it, I think there ++ are still some callers that don't check for errors ++ from ext2_get_key(), so this still can happen until ++ those are fixed up. */ ++ printk(KERN_ERR ++ "ext2_set_key_blkaddr: can't happen: NULL parent. " ++ "dev=%s, ino=%lu, level=%u.\n", ++ bdevname(key->inode->i_sb->s_bdev, bdn), ++ key->inode->i_ino, key->level); ++ return -ENOSYS; ++ } ++ /* Paul Whittaker tweak 19 Feb 2005 */ ++ key->ptr[key->level][key->off[key->level]] = le32_to_cpu(blkaddr); ++ if (key->level > 0) ++ mark_buffer_dirty(key->ibh[key->level]); ++ return 0; ++} ++ ++ ++/* ++ * Increment the key. Returns 0 if we go beyond the limits, ++ * 1 otherwise. 
++ * ++ * Precondition: -key->off[level] <= incr < addr_per_block. ++ */ ++static int ext2_next_key(struct ext2_bkey *key, int incr) ++{ ++ int addr_per_block = EXT2_ADDR_PER_BLOCK(key->inode->i_sb); ++ int x, level = key->level; ++ u32 tmp; ++ ++ assert(key->inode); ++ assert(atomic_read(&(key->inode)->i_mutex.count) <= 0); ++ ++ ++ /* ++ * Increment the key. This is done in two step: first ++ * adjust the off array, then reload buffers that should ++ * be reloaded (we assume level > 0). ++ */ ++ ++ assert(key->off[level] >= -incr); ++ assert(incr < addr_per_block); ++ key->block += incr; ++ key->off[level] += incr; ++ ++ /* ++ * First step: should be thought as the propagation ++ * of a carry. ++ */ ++ ++ if (level == 0) { ++ if (key->off[0] >= EXT2_NDIR_BLOCKS) { ++ key->off[1] = key->off[0] - EXT2_NDIR_BLOCKS; ++ key->off[0] = EXT2_IND_BLOCK; ++ level = 1; ++ } ++ x = 0; ++ } else { ++ for (x = level; x > 0; x--) { ++ if (key->off[x] >= addr_per_block) { ++ key->off[x] -= addr_per_block; ++ key->off[x - 1]++; ++ ++ if (x == 1) { ++ if (++level < 4) { ++ key->off[level] = key->off[level - 1]; ++ key->off[level - 1] = 0; ++ } else ++ return 0; ++ } ++ } else ++ break; ++ } ++ } ++ ++ /* ++ * Second step: reload the buffers that have changed. ++ */ ++ ++ key->level = level; ++ ++ CHECK_NOT_ATOMIC ++ while (x++ < level) { ++ if (key->ibh[x] != NULL) { ++ if (IS_SYNC(key->inode) && buffer_dirty(key->ibh[x])) { ++ //mw: ++ assert(buffer_mapped(key->ibh[x]) ++ && (key->ibh[x]->b_bdev != NULL)); ++ ll_rw_block(WRITE, 1, &(key->ibh[x])); ++ wait_on_buffer(key->ibh[x]); ++ } ++ brelse(key->ibh[x]); ++ } ++/* Paul Whittaker tweak 19 Feb 2005 */ ++ if ((key->ptr[x - 1] != NULL) ++ && ((tmp = le32_to_cpu(key->ptr[x - 1][key->off[x - 1]])) != ++ 0)) { ++ if ((key->ibh[x] = ++ __bread(key->inode->i_sb->s_bdev, tmp, ++ key->inode->i_sb->s_blocksize)) ++ != NULL) ++ key->ptr[x] = (u32 *) (key->ibh[x]->b_data); ++ else ++ key->ptr[x] = NULL; ++ } else { ++ key->ibh[x] = NULL; ++ key->ptr[x] = NULL; ++ } ++ } ++ ++ return 1; ++} ++ ++ ++/* Method to free the key: just release buffers. ++ ++ Returns 0 on success, -errno on error. ++*/ ++ ++static int ext2_free_key(struct ext2_bkey *key) ++{ ++ int x, n; ++ struct buffer_head *bh[4]; ++ ++ assert(key->inode); ++ assert(atomic_read(&(key->inode)->i_mutex.count) <= 0); ++ ++ ++ for (x = 0, n = 0; x <= key->level; x++) { ++ if (key->ibh[x] != NULL) { ++ if (IS_SYNC(key->inode) && buffer_dirty(key->ibh[x])) ++ bh[n++] = key->ibh[x]; ++ else ++ brelse(key->ibh[x]); ++ } ++ } ++ ++ if (n > 0) { ++ int ncopy = n; ++ while (ncopy-- > 0) { ++ assert(buffer_mapped(bh[ncopy]) ++ && (bh[ncopy]->b_bdev != NULL)); ++ } ++ ++ ll_rw_block(WRITE, n, bh); ++ ++ CHECK_NOT_ATOMIC ++ ++ while (n-- > 0) { ++ wait_on_buffer(bh[n]); ++ /* TODO: Check for error. */ ++ brelse(bh[n]); ++ } ++ } ++ return 0; ++} ++ ++ ++/* Returns positive if specified cluster is compressed, ++ zero if not, ++ -errno if an error occurred. ++ ++ If you need the result to be accurate, then down i_sem before ++ calling this, and don't raise i_sem until after you've used the ++ result. */ ++int ext2_cluster_is_compressed_fn(struct inode *inode, unsigned cluster) ++{ ++ unsigned block = (ext2_cluster_block0(inode, cluster) ++ + ext2_cluster_nblocks(inode, cluster) ++ - 1); ++ struct ext2_bkey key; ++ int result; ++ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); ++ ++ /* impl: Not all callers of ext2_cluster_is_compressed_fn() have ++ i_sem down. 
Of course it is impossible to guarantee ++ up-to-date information for such callers (someone may ++ compress or decompress between when we check and when they ++ use the information), so hopefully it won't matter if the ++ information we return is slightly inaccurate (e.g. because ++ someone is de/compressing the cluster while we check). */ ++ if (!ext2_get_key(&key, inode, block)) ++ return -EIO; ++ ++ result = (ext2_get_key_blkaddr(&key) == EXT2_COMPRESSED_BLKADDR); ++ ext2_free_key(&key); ++ return result; ++} ++ ++ ++/* Support for the GETCOMPRRATIO ioctl() call. We calculate how many ++ blocks the file would hold if it weren't compressed. This requires ++ reading the cluster head for every compressed cluster. ++ ++ Returns either -EAGAIN or the number of blocks that the file would ++ take up if uncompressed. */ ++int ext2_count_blocks(struct inode *inode) ++{ ++ struct buffer_head *head_bh; ++ int count; ++ int cluster; ++ struct ext2_bkey key; ++ u32 end_blknr; ++ ++ if (!(EXT2_I(inode)->i_flags & EXT2_COMPRBLK_FL)) ++ return inode->i_blocks; ++ ++ mutex_lock(&inode->i_mutex); ++ end_blknr = ROUNDUP_RSHIFT(inode->i_size, ++ inode->i_sb->s_blocksize_bits); ++ ++ /* inode->i_blocks is stored in units of 512-byte blocks. It's ++ more convenient for us to work in units of s_blocksize. */ ++ { ++ u32 shift = inode->i_sb->s_blocksize_bits - 9; ++ ++ count = inode->i_blocks; ++ if (count & ((1 << shift) - 1)) ++ ext2_msg(inode->i_sb, ++ "ext2_count_blocks", ++ "i_blocks not multiple of blocksize"); ++ count >>= shift; ++ } ++ ++ cluster = 0; ++ if (!ext2_get_key(&key, inode, 0)) { ++ count = -EIO; ++ goto out; ++ } ++ while (key.block < end_blknr) { ++ u32 head_blkaddr = ext2_get_key_blkaddr(&key); ++ ++ /* bug fix: init head_bh for each iteration TLL 2/21/07 */ ++ head_bh = NULL; ++ if (head_blkaddr == EXT2_COMPRESSED_BLKADDR) { ++ count = -EXT2_ECOMPR; ++ break; ++ } ++ if (!ext2_next_key(&key, ext2_cluster_nblocks(inode, cluster) - 1)) ++ break; ++ if (ext2_get_key_blkaddr(&key) == EXT2_COMPRESSED_BLKADDR) { ++ struct ext2_cluster_head *head; ++ ++ if (head_blkaddr == 0) { ++ count = -EXT2_ECOMPR; ++ break; ++ } ++ head_bh = __getblk(inode->i_sb->s_bdev, ++ head_blkaddr, inode->i_sb->s_blocksize); ++ if (head_bh == NULL) { ++ /* Hmm, EAGAIN or EIO? 
*/ ++ count = -EAGAIN; ++ break; ++ } ++ if (!buffer_uptodate(head_bh)) ++ ll_rw_block(READ, 1, &head_bh); ++ ++ CHECK_NOT_ATOMIC ++ ++ wait_on_buffer(head_bh); ++ ++#ifdef CONFIG_HIGHMEM ++ if (!page_address(head_bh->b_page)) { ++ BUG(); ++ } ++#endif ++ ++ head = (struct ext2_cluster_head *) head_bh->b_data; ++ /* remove clen > ulen test TLL 2/21/07 */ ++ if ((head->magic != cpu_to_le16(EXT2_COMPRESS_MAGIC_04X)) ++ || (le32_to_cpu(head->ulen) > EXT2_MAX_CLUSTER_BYTES) ++ || (head->holemap_nbytes > 4)) { ++ count = -EXT2_ECOMPR; ++ break; ++ } ++ assert(sizeof(struct ext2_cluster_head) == 16); ++ count += (ROUNDUP_RSHIFT(le32_to_cpu(head->ulen), ++ inode->i_sb->s_blocksize_bits) ++ - ROUNDUP_RSHIFT((le32_to_cpu(head->clen) ++ + sizeof(struct ext2_cluster_head) ++ + head->holemap_nbytes), ++ inode->i_sb->s_blocksize_bits)); ++ brelse(head_bh); ++ head_bh = NULL; ++ } ++ ++ if (!ext2_next_key(&key, 1)) ++ break; ++ cluster++; ++ } ++ ext2_free_key(&key); ++ if (head_bh != NULL) ++ brelse(head_bh); ++ out: ++ mutex_unlock(&inode->i_mutex); ++ if (count == -EXT2_ECOMPR) { ++ ext2_msg(inode->i_sb, ++ "ext2_count_blocks", ++ "invalid compressed cluster %u of inode %lu", ++ cluster, inode->i_ino); ++ EXT2_I(inode)->i_flags |= EXT2_ECOMPR_FL; ++ } ++ ++ /* The count should be in units of 512 (i.e. 1 << 9) bytes. */ ++ if (count >= 0) ++ count <<= inode->i_sb->s_blocksize_bits - 9; ++ return count; ++} ++ ++ ++/* Decompress some blocks previously obtained from a cluster. ++ Decompressed data is stored in ext2_rd_wa.u. Buffer heads in the bh ++ array are packed together at the begining of the array. The ulen ++ argument is an indication of how many bytes the caller wants to ++ obtain, excluding holes. (This can be less than head->ulen, as in the ++ case of readpage.) No hole processing is done; we don't even look at ++ head->holemap. ++ ++ Note the semantic difference between this and ++ (): the latter decompresses a cluster _and ++ stores it as such_, whereas ext2_decompress_blocks() just ++ decompresses the contents of the blocks into ext2_rd_wa.u. ++ ++ The working area is supposed to be available and locked. ++ ++ Returns a negative value on failure, the number of bytes ++ decompressed otherwise. ++ ++ Called by : ++ ++ ext2_decompress_cluster () [sem down] ++ ext2_readpage () [sem down, but only ifndef EXT2_LOCK_BUFFERS] */ ++ ++/* TODO: ext2_decompress_blocks() scribbles in ext2_rd_wa.c. ++ Check callers to make sure this isn't a problem. */ ++ ++/* mw: caller must already have done: "get_cpu_var(ext2_rd_wa)" */ ++size_t ++ext2_decompress_blocks(struct inode * inode, ++ struct buffer_head ** bh, ++ int nblk, size_t ulen, u32 cluster) ++{ ++ struct ext2_cluster_head *head; ++ int count, src_ix, x; ++ unsigned char *dst; ++ unsigned meth, alg; ++ char bdn[BDEVNAME_SIZE]; ++ ++#ifdef EXT2_COMPR_DEBUG ++ //mw: 30.04.2012: seems to fail... ? assert(in_atomic()); ++ assert(atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */ ++#endif ++ ++ /* ++ We pack the buffer together before (and must take care ++ not to duplicate the buffer heads in the array). ++ ++ pjm 1998-01-09: Starting from e2compr-0.4.0, they should ++ already be packed together in the blkaddr array. TODO: ++ Insert appropriate assert() statements checking tht this is ++ the case. TODO: Check that callers have bh[] packed. 
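/*
 * Sketch of the ROUNDUP_GE() sanity test applied a little further below
 * (stand-alone user space; the grain size of 1024 is an assumed stand-in
 * for EXT2_GRAIN_SIZE): a cluster head is only accepted when the stored
 * size (clen + 16-byte header + holemap), rounded up to the grain, stays
 * below the uncompressed length ulen rounded up the same way.
 */
#include <stdio.h>

#define GRAIN 1024   /* assumed power-of-two stand-in for EXT2_GRAIN_SIZE */
#define ROUNDUP_GE(a, b, d) ((((a) - 1) | ((d) - 1)) >= (((b) - 1) | ((d) - 1)))

int main(void)
{
	unsigned head_bytes = 16, holemap_nbytes = 4;   /* limits per the checks above */
	unsigned clen = 9000, ulen = 32768;             /* example cluster */

	if (ROUNDUP_GE(clen + head_bytes + holemap_nbytes, ulen, GRAIN))
		printf("reject: compression saved less than one grain\n");
	else
		printf("accept: %u stored bytes for %u data bytes\n",
		       clen + head_bytes + holemap_nbytes, ulen);
	return 0;
}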
*/ ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c("ext2_decompress_blocks: nblk=%d\n", nblk); ++#endif ++ for (src_ix = 0, x = 0; src_ix < nblk; src_ix++) { ++ if (bh[src_ix] == NULL) ++ printk("no_bheader()\n"); ++ if ((bh[src_ix] != NULL) && (bh[src_ix]->b_blocknr != 0)) { ++ ++ if (x < src_ix) { ++ ext2_msg(inode->i_sb, "bad buffer table", ++ "inode = %lu", inode->i_ino); ++ goto error; ++ } ++ x++; ++ } ++ } ++ ++ nblk = x; ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("ext2_decompress_blocks (2): nblk=%d\n", nblk); ++#endif ++ if (nblk == 0) { ++ ext2_msg(inode->i_sb, "no block in cluster", "inode = %lu", ++ inode->i_ino); ++ goto error; ++ } ++ ++ restore_b_data_himem(bh[0]); ++ head = (struct ext2_cluster_head *) (bh[0]->b_data); ++ ++ /* ++ * Do some consistency checks. ++ */ ++ ++ if (head->magic != cpu_to_le16(EXT2_COMPRESS_MAGIC_04X)) { ++ ext2_msg(inode->i_sb, ++ "bad magic number", ++ "inode = %lu, magic = %#04x", ++ inode->i_ino, le16_to_cpu(head->magic)); ++ goto error; ++ } ++#if EXT2_GRAIN_SIZE & (EXT2_GRAIN_SIZE - 1) ++# error "This code assumes EXT2_GRAIN_SIZE to be a power of two." ++#endif ++ /* The macro also assumes that _a > 0, _b > 0. */ ++#define ROUNDUP_GE(_a, _b, _d) ( ( ((_a) - 1) \ ++ | ((_d) - 1)) \ ++ >= ( ((_b) - 1) \ ++ | ((_d) - 1))) ++ ++ //mw: following 3 just for debugging!!! ++ assert(!((le32_to_cpu(head->ulen) > EXT2_MAX_CLUSTER_BYTES))); ++ assert(!((head->clen == 0))); ++ assert(!(ROUNDUP_GE(le32_to_cpu(head->clen) ++ + head->holemap_nbytes + sizeof(struct ext2_cluster_head), ++ le32_to_cpu(head->ulen), EXT2_GRAIN_SIZE))); ++ ++ if ((le32_to_cpu(head->ulen) > EXT2_MAX_CLUSTER_BYTES) ++ || (head->clen == 0) ++ || ROUNDUP_GE(le32_to_cpu(head->clen) ++ + head->holemap_nbytes ++ + sizeof(struct ext2_cluster_head), ++ le32_to_cpu(head->ulen), EXT2_GRAIN_SIZE)) { ++ ext2_msg(inode->i_sb, ++ "invalid cluster len", ++ "inode = %lu, len = %u:%u", ++ inode->i_ino, ++ le32_to_cpu(head->clen), le32_to_cpu(head->ulen)); ++ goto error; ++ } ++#undef ROUNDUP_GE ++ ++ /* TODO: Test for `nblk != 1 + ...' instead of the current ++ one-sided test. However, first look at callers, and make ++ sure that they handle the situation properly (e.g. freeing ++ unneeded blocks) and tht they always pass a correct ++ value for nblk. */ ++ if (nblk <= ((le32_to_cpu(head->clen) ++ + head->holemap_nbytes + sizeof(struct ext2_cluster_head) ++ - 1) ++ / bh[0]->b_size)) { ++ int i; ++ ext2_msg(inode->i_sb, ++ "missing blocks", ++ "inode = %lu, blocks = %d/%u", ++ inode->i_ino, nblk, ((le32_to_cpu(head->clen) ++ + head->holemap_nbytes ++ + sizeof(struct ext2_cluster_head) ++ - 1) ++ / bh[0]->b_size) + 1); ++ printk("i_size=%d\n", (int) inode->i_size); ++ for (i = 0; i < 12; i++) ++ printk("i_data[%d]=%d\n", i, EXT2_I(inode)->i_data[i]); ++ printk("cluster_head (sizeof head=%u):\n\tmagic=0x%4x\n\tmethod=%d\n\t \ ++ holemap_nbytes=%d\n\tulen=%d\n\tclen=%d\n\tbh->b_size=%zu\n", ++ sizeof(struct ext2_cluster_head), head->magic, ++ (int) head->method, (int) head->holemap_nbytes, head->ulen, ++ head->clen, bh[0]->b_size); ++ goto error; ++ } ++ ++ /* I moved it here in case we need to load a module that ++ * needs more heap that is currently allocated. ++ * In such case "init_module" for that algorithm forces ++ * re-allocation of ext2_wa. It should be safe here b/c the ++ * first reference to ext2_wa comes just after and we have ++ * locked ext2_wa before. ++ * ++ * FIXME: Totally separate working areas for reading and writing. ++ * Jan R. 
++ */ ++ meth = head->method; /* only a byte, so no swabbing needed. */ ++ if (meth >= EXT2_N_METHODS) { ++ ext2_msg(inode->i_sb, ++ "Ass: illegal method id", ++ "inode = %lu, id = %u", inode->i_ino, meth); ++ dump_stack(); ++ goto error; ++ } ++ alg = ext2_method_table[meth].alg; ++ ++ /* ++ * Adjust the length if too many bytes are requested. ++ * ++ * TODO: Traiter les bitmaps ici, et non plus au niveau de ++ * l'appelant. Faire un petit cache en memorisant le ++ * numero du dernier noeud decompresse et du dernier ++ * cluster. Le pb, c'est qu'on ne peut pas savoir si ++ * les blocs ont ete liberes et realloue entre temps ++ * -> il faut etre prevenu pour invalider le buffer. ++ * ++ * pjm fixme tr: Take care of the bitmaps here, ++ * instead of by the caller as we currently do. Keep ++ * a small cache that holds the number of the ++ * previous to have been ++ * decompressed. The problem is that we have no way ++ * of knowing whether the blocks have been freed and ++ * reallocated in the meantime / since last time -> ++ * we must be informed so that we can invalidate the ++ * buffer. */ ++ if (ulen > le32_to_cpu(head->ulen)) { ++ memset(__get_cpu_var(ext2_rd_wa)->u + le32_to_cpu(head->ulen), 0, ulen - le32_to_cpu(head->ulen)); ++ ulen = le32_to_cpu(head->ulen); ++ ++ assert((bh[0]->b_size & (bh[nblk - 1]->b_size - 1)) == 0); ++ if (((le32_to_cpu(head->clen) ++ + head->holemap_nbytes + sizeof(struct ext2_cluster_head) ++ - 1) ++ | (bh[0]->b_size - 1)) ++ >= ((ulen - 1) | (bh[0]->b_size - 1))) { ++ printk(KERN_WARNING ++ "ext2_decompress_blocks: " ++ "ulen (=%zu) or clen (=%u) wrong " ++ "in dev %s, inode %lu.\n", ++ ulen, le32_to_cpu(head->clen), ++ bdevname(inode->i_sb->s_bdev, bdn), inode->i_ino); ++ goto error; ++ } ++ } ++ ++ /* ++ * Now, decompress data. ++ */ ++ /* TODO: Is this (ulen == 0) possible? */ ++ if (ulen == 0) ++ return 0; ++ ++ for (x = 0, dst = __get_cpu_var(ext2_rd_wa)->c; x < nblk; dst += bh[x++]->b_size) { ++ restore_b_data_himem(bh[x]); ++ memcpy(dst, bh[x]->b_data, bh[x]->b_size); ++ } ++ ++ ++ if (!ext2_algorithm_table[alg].avail) { ++ ext2_msg(inode->i_sb, ++ "ext2_decompress_blocks", ++ "algorithm `%s' not available for inode %lu", ++ ext2_algorithm_table[alg].name, inode->i_ino); ++ ext2_mark_algorithm_use(inode, alg); ++ goto error; ++ } ++ ++ ++#ifdef EXT2_COMPR_DEBUG ++ { ++ struct ext2_cluster_head *wa1head = (struct ext2_cluster_head *) __get_cpu_var(ext2_rd_wa)->c; ++ unsigned clen = le32_to_cpu(wa1head->clen); ++ if (wa1head->checksum != ++ cpu_to_le32(ext2_adler32 ++ (le32_to_cpu(*(u32 *) __get_cpu_var(ext2_rd_wa)->c), ++ __get_cpu_var(ext2_rd_wa)->c + 8, ++ (sizeof(struct ext2_cluster_head) - 8 + ++ head->holemap_nbytes + clen)))) ++ { ++ head->checksum = cpu_to_le32(0); ++ ext2_msg(inode->i_sb, "ext2_decompress_blocks: corrupted compressed data ", ++ "in inode %lu", inode->i_ino); ++ //goto error; ++ //mw: we try to go on. if data is corrupt we will get an compression error anyway. ++ } ++ } ++#endif ++ ++ count = ext2_algorithm_table[alg].decompress(__get_cpu_var(ext2_rd_wa)->c + ++ sizeof(struct ++ ext2_cluster_head) + ++ head->holemap_nbytes, ++ __get_cpu_var(ext2_rd_wa)->u, ++ __get_cpu_var(ext2_rd_wa)->heap, ++ le32_to_cpu(head->clen), ulen, ++ ext2_method_table[meth].xarg); ++ ++ /* If we got fewer than ulen bytes, there is a problem, since ++ we corrected the ulen value before decompressing. Note ++ that it's OK for count to exceed ulen, because ulen can be ++ less than head->ulen. 
*/ ++ if ((count < ulen) || (count != le32_to_cpu(head->ulen))) { ++ ext2_msg(inode->i_sb, ++ "ext2_decompress_blocks: corrupted compressed data ", "inode = %lu, count = %u of %zu (%u/%u)", ++ inode->i_ino, count, ulen, le32_to_cpu(head->clen), le32_to_cpu(head->ulen)); ++ goto error; ++ } ++ ext2_ensure_algorithm_use(inode, alg); ++ return count; ++ ++ error: ++ ++ /* Raise the ECOMPR flag for this file. What this means is ++ that the file cannot be written to, and can only be read if ++ the user raises the NOCOMPR flag. ++ ++ pjm 1997-01-16: I've changed it so that files with ECOMPR ++ still have read permission, so user can still read the rest ++ of the file but get an I/O error (errno = EXT2_ECOMPR) when ++ they try to access anything from this cluster. */ ++ ++ EXT2_I(inode)->i_flags |= EXT2_ECOMPR_FL; ++ ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty_sync(inode); ++ /* pjm 1998-02-21: We used to do `memset(ext2_rd_wa.u, 0, ulen)' ++ here because once upon a time the user could sometimes see ++ buf contents. I believe that this can never happen any ++ more. */ ++ return -EXT2_ECOMPR; ++} ++ ++ ++/* ext2_calc_free_ix: Calculates the position of the C_NBLK'th non-hole ++ block; equals C_NBLK plus the number of holes in the first CALC_FREE_IX() ++ block positions of the cluster. ++ ++ pre: 1 =< c_nblk < EXT2_MAX_CLUSTER_BLOCKS, ++ Number of 1 bits in ,ubitmap` > ,c_nblk`. ++ post: c_nblk =< calc_free_ix() < EXT2_MAX_CLUSTER_BLOCKS ++ ++ Called by: ++ ext2_decompress_cluster() ++ ext2_file_write() ++ ++ TODO: Have ext2_compress_cluster() call this. ++ */ ++unsigned ext2_calc_free_ix(unsigned holemap_nbytes, u8 const *holemap, ++ unsigned c_nblk) ++{ ++ unsigned i; ++ ++ assert(1 <= c_nblk); ++ assert(c_nblk < EXT2_MAX_CLUSTER_BLOCKS); ++ for (i = 0; (i < holemap_nbytes * 8) && (c_nblk > 0);) { ++ assert(i < EXT2_MAX_CLUSTER_BLOCKS - 1); ++ if ((holemap[i >> 3] & (1 << (i & 7))) == 0) ++ c_nblk--; ++ i++; ++ } ++ i += c_nblk; ++ assert(i < EXT2_MAX_CLUSTER_BLOCKS); ++ return i; ++} ++ ++ ++/* (): Prepare the blkaddr[] array for ++ decompression by moving non-hole blocks to their proper positions ++ (according to ubitmap) and zeroing any other blocks. ++ ++ Returns 0 on success, -errno on error. ++ ++ Note: We assume tht blkaddr[i] won't change under us forall ++ clu_block0 =< i < clu_block0 + clu_nblocks. Holding i_sem should ++ guarantee this. ++ ++ Called by: ++ ext2_decompress_cluster() ++ ext2_file_write() */ ++int ++ext2_unpack_blkaddrs(struct inode *inode, ++ struct buffer_head *bh[], ++ int mmcp, ++ unsigned holemap_nbytes, ++ u8 const *holemap, ++ unsigned c_nblk, ++ unsigned free_ix, ++ unsigned clu_block0, unsigned clu_nblocks) ++{ ++ struct ext2_bkey key; ++ u32 *blkaddr; ++ unsigned si, di; ++ ++ assert(clu_nblocks <= EXT2_MAX_CLUSTER_BLOCKS); ++ assert(1 <= c_nblk); ++ assert(c_nblk <= free_ix); ++ assert(free_ix < EXT2_MAX_CLUSTER_BLOCKS); ++ if (!ext2_get_key(&key, inode, clu_block0)) ++ return -EIO; ++ ++ if (key.ptr[key.level] == NULL) { ++ /* TODO: Call ext2_error(). */ ++ ext2_free_key(&key); ++ return -EIO; ++ } ++ ++ /* impl: Note tht we're relying on clusters not straddling ++ address block boundaries. 
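/*
 * Worked sketch of ext2_calc_free_ix() above (stand-alone, user space):
 * with a one-byte holemap of 0x05 (cluster blocks 0 and 2 are holes) and
 * c_nblk = 3 compressed blocks on disk, the three real blocks belong at
 * positions 1, 3 and 4, so free_ix comes out as 5 (c_nblk plus the two
 * holes in front of it); the unpacking below then moves the block
 * addresses out to those positions and zeroes the hole slots.
 */
#include <stdio.h>

static unsigned calc_free_ix(unsigned holemap_nbytes,
			     const unsigned char *holemap, unsigned c_nblk)
{
	unsigned i;

	for (i = 0; i < holemap_nbytes * 8 && c_nblk > 0; i++)
		if (!(holemap[i >> 3] & (1 << (i & 7))))
			c_nblk--;        /* a non-hole position consumes one block */
	return i + c_nblk;
}

int main(void)
{
	unsigned char holemap[1] = { 0x05 };   /* holes at cluster blocks 0 and 2 */

	printf("free_ix = %u\n", calc_free_ix(1, holemap, 3));   /* prints 5 */
	return 0;
}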
*/ ++ blkaddr = &key.ptr[key.level][key.off[key.level]]; ++ memset(blkaddr + free_ix, ++ 0, sizeof(*blkaddr) * (clu_nblocks - free_ix)); ++ si = c_nblk; ++ for (di = free_ix; di > si;) { ++ --di; ++ if (((di >> 3) < holemap_nbytes) ++ && (holemap[di >> 3] & (1 << (di & 7)))) { ++ blkaddr[di] = 0; ++ bh[di]->b_blocknr = 0; ++ clear_bit(BH_Mapped, &bh[di]->b_state); ++ } else { ++ if (si == 0) { ++ break; ++ } ++ blkaddr[di] = blkaddr[--si]; ++ assert(bh[di]->b_blocknr == 0); ++ assert(bh[si]->b_blocknr != 0); ++ assert(buffer_mapped(bh[si])); ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("unpack: di=%d sts=0x%x si=%d blk=%ld sts=0x%x\n", ++ di, (int) bh[di]->b_state, si, bh[si]->b_blocknr, ++ (int) bh[si]->b_state); ++#endif ++ bh[di]->b_blocknr = bh[si]->b_blocknr; ++ set_bit(BH_Mapped, &bh[di]->b_state); ++ bh[si]->b_blocknr = 0; ++ clear_bit(BH_Mapped, &bh[si]->b_state); ++ set_bit(BH_Uptodate, &bh[di]->b_state); ++ if (mmcp) { ++ restore_b_data_himem(bh[si]); ++ restore_b_data_himem(bh[di]); ++ memcpy(bh[di]->b_data, bh[si]->b_data, ++ inode->i_sb->s_blocksize); ++ } ++ } ++ } ++ if (key.level > 0) ++ mark_buffer_dirty(key.ibh[key.level]); ++ return ext2_free_key(&key); ++} ++ ++ ++/* ++ * Decompress one cluster. If already compressed, the cluster ++ * is decompressed in place, and the compress bitmap is updated. ++ * ++ * Returns the size of decompressed data on success, a negative ++ * value in case of failure, or 0 if the cluster was not compressed. ++ * ++ * The inode is supposed to be writable. ++ * ++ * Called by : ++ * ++ * ext2_decompress_inode() [sem down] ++ * ext2_file_write() [sem down] ++ * trunc_bitmap() [sem down] ++ */ ++int ext2_decompress_cluster(struct inode *inode, u32 cluster) ++{ ++ struct buffer_head *bh[EXT2_MAX_CLUSTER_BLOCKS]; ++ struct buffer_head *bhc[EXT2_MAX_CLUSTER_BLOCKS]; ++ struct page *pg[EXT2_MAX_CLUSTER_PAGES], *epg[EXT2_MAX_CLUSTER_PAGES]; ++ int result, nbh; ++ unsigned npg, c_nblk; ++ struct ext2_cluster_head *head; ++ int i = 0; ++ unsigned free_ix, clu_block0, clu_nblocks; ++ int d_npg = -1; /* number of decompressed page */ ++ unsigned long allpagesuptodate = 1; ++ struct buffer_head *bh_writeout[EXT2_MAX_CLUSTER_BLOCKS]; ++ int bhn_writeout; ++#ifdef CONFIG_HIGHMEM ++ int kmapped = 0; ++#endif ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_BLOCKS; i++) { ++ bh_writeout[i] = NULL; ++ bhn_writeout = 0; ++ } ++ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */ ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) ++ epg[i] = NULL; ++ ++ /* ++ Get blocks from cluster. ++ Assign to variables head, ubitmap, clu_block0, clu_nblocks. ++ Shuffle blkaddr[] array and write zero to holes. ++ Allocate new blocks. ++ Get the working area. ++ Decompress. ++ Copy to bh[]->b_data (marking buffers uptodate and dirty). ++ Release working area. ++ Release bh[]. 
++ */ ++ ++ nbh = 0; ++ npg = ext2_cluster_npages(inode, cluster); ++ result = ext2_get_cluster_pages(inode, cluster, pg, NULL, 0); ++ if (result <= 0) { ++ for (i = 0; i < npg; i++) ++ epg[i] = NULL; ++ goto out_err; ++ } ++ ++ for (i = 0; i < npg; i++) { ++ if ((pg[i]->index <= ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) && ++ !PageUptodate(pg[i])) { ++ allpagesuptodate = 0; ++ } ++ } ++ if (allpagesuptodate) { ++ //printk("DecompressPages: Ino:%lu\n", inode->i_ino); ++ result = ext2_decompress_pages(inode, cluster, pg); ++ if (result != 0) { ++ for (i = 0; i < npg; i++) ++ epg[i] = NULL; ++ if (result > 0) ++ goto cleanup; ++ else ++ goto out_err; ++ } ++ /*mw: if we continue here then in ext2_decompress_pages ++ * not all pages were up-to-date ++ */ ++ } ++ //printk("DecompressCluster: Ino:%lu\n", inode->i_ino); ++ result = ext2_get_cluster_extra_pages(inode, cluster, pg, epg); ++ if (result <= 0) { ++ goto out_err; ++ } ++#ifdef CONFIG_HIGHMEM ++ ext2_kmap_cluster_pages(NULL, pg, epg); ++ kmapped = 1; ++#endif ++ ++ result = ext2_get_cluster_blocks(inode, cluster, bh, pg, epg, 0); ++ if (result <= 0) { ++ goto out_err; ++ } ++ nbh = c_nblk = result; ++ ++ ++#ifdef EXT2_COMPR_REPORT ++ { ++ int j; ++ printk ++ (" > > > ext2_decompress_cluster %d: inode=%ld, size=%d nbh=%d\n", ++ cluster, inode->i_ino, (int) inode->i_size, nbh); ++#ifdef EXT2_COMPR_REPORT_VERBOSE ++ for (j = 0; j < nbh; j++) { ++ if (bh[j]) { ++ printk("0buffer_head[%d]: blocknr=%lu, addr=%p \n", j, ++ (unsigned long) bh[j]->b_blocknr, bh[j]); ++ if (bh[j]->b_page) ++ printk("0:[page->index=%ld]\n", bh[j]->b_page->index); ++ else ++ printk("[No page]\n"); ++ } else ++ printk("buffer_head[%d] is NULL\n", j); ++ } ++ while ((j < EXT2_MAX_CLUSTER_BLOCKS) && (bh[j] != NULL) && bh[j]->b_blocknr) { /*Add by Yabo Ding */ ++ printk ++ ("buffer_head[%d] is free but not NULL: blocknr=%lu, addr=%p\n", ++ j, (unsigned long) bh[j]->b_blocknr, bh[j]); ++ j++; ++ } ++#endif ++ } ++#endif ++ for (i = 0; i < nbh; i++) ++ assert(bh[i]->b_blocknr != 0); ++ ++ restore_b_data_himem(bh[0]); ++ ++ head = (struct ext2_cluster_head *) bh[0]->b_data; ++ if (head->magic != cpu_to_le16(EXT2_COMPRESS_MAGIC_04X)) { ++ ext2_msg(inode->i_sb, ++ "ext2_decompress_cluster: bad magic number", ++ "cluster %d: inode = %lu, magic = %#04x", ++ cluster, inode->i_ino, le16_to_cpu(head->magic)); ++ EXT2_I(inode)->i_flags |= EXT2_ECOMPR_FL; ++ result = -EXT2_ECOMPR; ++ goto out_err; ++ } ++ if (le32_to_cpu(head->ulen) - ++ (c_nblk << inode->i_sb->s_blocksize_bits) <= 0) { ++ ext2_error(inode->i_sb, "ext2_decompress_cluster", ++ "ulen too small for c_nblk. ulen=%u, c_nblk=%u, bs=%lu", ++ le32_to_cpu(head->ulen), c_nblk, ++ inode->i_sb->s_blocksize); ++ EXT2_I(inode)->i_flags |= EXT2_ECOMPR_FL; ++ result = -EXT2_ECOMPR; ++ goto out_err; ++ } ++ free_ix = ++ ext2_calc_free_ix(head->holemap_nbytes, (u8 const *) (&head[1]), ++ c_nblk); ++ clu_block0 = ext2_cluster_block0(inode, cluster); ++ clu_nblocks = ext2_cluster_nblocks(inode, cluster); ++ ext2_unpack_blkaddrs(inode, bh, 1, ++ head->holemap_nbytes, (u8 const *) (&head[1]), ++ c_nblk, free_ix, clu_block0, clu_nblocks); ++ ++ /* Allocate the extra blocks needed. 
*/ ++ { ++ int data_left = le32_to_cpu(head->ulen); ++ ++ data_left -= c_nblk << inode->i_sb->s_blocksize_bits; ++ assert(data_left > 0); ++ for (i = free_ix; i < clu_nblocks; i++) ++ if (((i >> 3) >= head->holemap_nbytes) ++ || !(head->holemap[i >> 3] & (1 << (i & 7)))) { ++ result = ext2_get_block(inode, ++ clu_block0 + i, ++ bh[i], 1 /* create */ ); ++ if (bh[i]->b_blocknr == 0) ++ goto out_err; ++ d_npg = ++ (i >> ++ (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) + ++ 1; ++ nbh++; ++ data_left -= inode->i_sb->s_blocksize; ++ if (data_left <= 0) ++ break; ++ } ++ } ++ ++ /* jmr 1998-10-28 Hope this is the last time I'm moving this code. ++ * Module loading must be done _before_ we lock wa, just think what ++ * can happen if we reallocate wa when somebody else uses it... ++ */ ++ { ++ unsigned meth; ++#ifdef CONFIG_KMOD ++ unsigned alg; ++#endif ++ ++ meth = head->method; /* only a byte, so no swabbing needed. */ ++ if (meth >= EXT2_N_METHODS) { ++ ext2_msg(inode->i_sb, ++ "Ass.: illegal method id", ++ "inode = %lu, id = %u", inode->i_ino, meth); ++ result = -EXT2_ECOMPR; ++ goto out_err; ++ } ++#ifdef CONFIG_KMOD ++ alg = ext2_method_table[meth].alg; ++ if (!ext2_algorithm_table[alg].avail) { ++ char str[32]; ++ ++ sprintf(str, "ext2-compr-%s", ext2_algorithm_table[alg].name); ++ request_module(str); ++ } ++#endif ++ } ++ ++ result = -EINTR; ++ ++ /* ++ * Then, decompress and copy back data. ++ */ ++ { ++ int ic; ++ ++ for (ic = 0, i = 0; i < clu_nblocks; i++) { ++ if (bh[i]->b_blocknr != 0) { ++ bhc[ic] = bh[i]; ++ ic++; ++ if (ic == c_nblk) { ++ break; ++ } ++ } ++ } ++ } ++ ++ ++#ifdef EXT2_COMPR_REPORT_WA ++ printk(KERN_DEBUG "pid %d locks wa\n", current->pid); ++#endif ++ if (get_cpu_var(ext2_rd_wa) == NULL) ++ { ++ ext2_alloc_rd_wa(); ++ } ++ assert(__get_cpu_var(ext2_rd_wa) != NULL); ++ ++ result = ext2_decompress_blocks(inode, bhc, c_nblk, ++ le32_to_cpu(head->ulen), cluster); ++ if (result != (int) le32_to_cpu(head->ulen)) { ++ if (result >= 0) { ++ /* I think this is impossible, as ++ ext2_decompress_blocks() checks against ++ head->ulen. */ ++ printk(KERN_WARNING "Unexpected return value %d " ++ "from ext2_decompress_blocks()\n", result); ++ result = -EXT2_ECOMPR; ++ } ++ ++#ifdef EXT2_COMPR_REPORT_WA ++ printk(KERN_DEBUG "pid %d unlocks wa\n", current->pid); ++#endif ++ put_cpu_var(ext2_rd_wa); ++ goto out_err; ++ } ++ ++#ifdef EXT2_COMPR_REPORT ++ printk(KERN_DEBUG "ext2: %04x:%lu: cluster %d+%d [%d] " ++ "decompressed into %d bytes\n", ++ inode->i_rdev, ++ inode->i_ino, clu_block0, clu_nblocks, c_nblk, result); ++#endif ++ ++ /* Copy back decompressed data. 
*/ ++ { ++ int count = result; ++ unsigned char const *src; ++ int c, p; ++ int cbh; ++ int n; /* block index in page */ ++ struct buffer_head *bp; ++ unsigned addr0, b_start, b_end; ++ ++ assert(count > 0); ++ if (d_npg == -1) { ++ d_npg = ((count - 1) >> PAGE_CACHE_SHIFT) + 1; ++ } ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c ++ ("ext2_decompress_cluster: cnt=%d free_ix=%d d_npg=%d nbh=%d\n", ++ count, free_ix, d_npg, nbh); ++#endif ++ result = -EXT2_ECOMPR; ++ src = __get_cpu_var(ext2_rd_wa)->u; ++ cbh = 0; ++ for (c = 0; c < clu_nblocks; c++) { ++ ++ if (bh[c]->b_blocknr == 0) { ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("\t clear buf %d sts=0x%x\n", c, ++ (int) bh[c]->b_state); ++#endif ++ restore_b_data_himem(bh[c]); ++ memset(bh[c]->b_data, 0, inode->i_sb->s_blocksize); ++ continue; ++ } ++ if (cbh >= (nbh - 1)) { ++ break; ++ } ++ if (count < inode->i_sb->s_blocksize) { ++ put_cpu_var(ext2_rd_wa); ++ goto out_err; ++ } ++ cbh++; ++ count -= inode->i_sb->s_blocksize; ++ p = c >> (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); ++ if (!PageUptodate(pg[p])) { ++ addr0 = (clu_block0 << inode->i_sb->s_blocksize_bits); ++ b_start = addr0 + (c << inode->i_sb->s_blocksize_bits); ++ b_end = b_start + inode->i_sb->s_blocksize; ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("\t[%d] sts=0x%x e=%d s=%d sz=%d pg:%lu(%#x)\n", ++ c, (int) bh[c]->b_state, b_end, b_start, ++ (int) inode->i_size, pg[p]->index, ++ (unsigned int) pg[p]); ++#endif ++ if (b_end <= inode->i_size) { ++ /* Block is before end of file, copy data */ ++ restore_b_data_himem(bh[c]); ++ memcpy(bh[c]->b_data, src, inode->i_sb->s_blocksize); ++ ++ } else if (b_start < inode->i_size) { ++ /* Block contains end of file, copy to end */ ++ restore_b_data_himem(bh[c]); ++ memcpy(bh[c]->b_data, src, inode->i_size - b_start); ++ ++ } ++ set_buffer_uptodate(bh[c]); ++ set_buffer_dirty(bh[c]); ++ bh_writeout[bhn_writeout] = bh[c]; //mw ++ bhn_writeout++; //mw ++ } else { ++ //mw: DEBUG. buffer is uptodate now. compress will not reread! an get the compressed data!!! ++ // clear flag in extra page!!! 
++ // clear_bit(BH_Uptodate, &bh[c]->b_state); ++ ++ n = c & ((PAGE_CACHE_SIZE - 1) >> inode->i_sb-> ++ s_blocksize_bits); ++ bp = page_buffers(pg[p]); ++ for (i = 0; i < n; i++) { ++ bp = bp->b_this_page; ++ } ++ result = ext2_get_block(inode, clu_block0 + c, bp, 0); ++ ++ //mw: needed to do a writeback of the non-epg-buffers ++ //no idea how it was done before ++ set_buffer_uptodate(bp); ++ set_buffer_dirty(bp); ++ bh_writeout[bhn_writeout] = bp; //mw ++ bhn_writeout++; //mw ++ ++ if (bp->b_blocknr == 0) { ++ put_cpu_var(ext2_rd_wa); ++ goto out_err; ++ } ++ assert(bp->b_blocknr == bh[c]->b_blocknr); ++ } ++ src += inode->i_sb->s_blocksize; ++ } ++ if (count > inode->i_sb->s_blocksize) { ++ put_cpu_var(ext2_rd_wa); ++ goto out_err; ++ } ++ p = c >> (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); ++ if (!PageUptodate(pg[p])) { ++ addr0 = (clu_block0 << inode->i_sb->s_blocksize_bits); ++ b_start = addr0 + (c << inode->i_sb->s_blocksize_bits); ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("\t[%d] sts=0x%x c=%d s=%d sz=%d pg:%lu(%#x)\n", c, ++ (int) bh[c]->b_state, count, b_start, ++ (int) inode->i_size, pg[p]->index, ++ (unsigned int) pg[p]); ++#endif ++ if (b_start >= inode->i_size) { ++ restore_b_data_himem(bh[c]); ++ memset(bh[c]->b_data, 0, inode->i_sb->s_blocksize); ++ ++ } else { ++ if ((inode->i_size - b_start) < count) { ++ restore_b_data_himem(bh[c]); ++ memcpy(bh[c]->b_data, src, inode->i_size - b_start); ++ memset(bh[c]->b_data + (inode->i_size - b_start), 0, ++ count - (inode->i_size - b_start)); ++ } else { ++ restore_b_data_himem(bh[c]); ++ memcpy(bh[c]->b_data, src, count); ++ } ++ } ++ set_buffer_uptodate(bh[c]); ++ set_buffer_dirty(bh[c]); ++ bh_writeout[bhn_writeout] = bh[c]; //mw ++ bhn_writeout++; //mw ++ } else { ++ assert(epg[p] != NULL); //mw ++ n = c & ((PAGE_CACHE_SIZE - 1) >> inode->i_sb-> ++ s_blocksize_bits); ++ bp = page_buffers(pg[p]); ++ for (i = 0; i < n; i++) { ++ bp = bp->b_this_page; ++ } ++ result = ext2_get_block(inode, clu_block0 + c, bp, 0); ++ ++ //mw: needed to do a writeback of the non-epg-buffers ++ //no idea how it was done before ++ set_buffer_uptodate(bp); ++ set_buffer_dirty(bp); ++ bh_writeout[bhn_writeout] = bp; //mw ++ bhn_writeout++; //mw ++ if (bp->b_blocknr == 0) { ++ put_cpu_var(ext2_rd_wa); ++ goto out_err; ++ } ++ assert(bp->b_blocknr == bh[c]->b_blocknr); ++ } ++ result = (nbh - 1) * inode->i_sb->s_blocksize + count; ++ } ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (pg[i] == NULL) ++ break; ++ if (i < d_npg) ++ SetPageUptodate(pg[i]); ++ } ++ ++#ifdef EXT2_COMPR_REPORT_WA ++ printk(KERN_DEBUG "pid %d unlocks wa\n", current->pid); ++#endif ++ put_cpu_var(ext2_rd_wa); ++ ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty_sync(inode); ++ /* If needed, EXT2_DIRTY_FL is raised by the caller. */ ++ ++#if 0 ++ /* TODO: SYNC */ ++ if (IS_SYNC(inode)) { ++ generic_osync_inode(inode, inode->i_mapping, ++ OSYNC_METADATA | OSYNC_DATA); ++ } ++#endif ++ assert(result >= 0); ++ ++ //Sync out changes: ++ assert(bhn_writeout <= EXT2_MAX_CLUSTER_BLOCKS); ++ assert(bhn_writeout >= 0); ++ ++ //mw: debug ++ for (i = 0; i < bhn_writeout; i++) { ++ if ((!buffer_mapped(bh_writeout[i])) ++ || (bh_writeout[i]->b_bdev == NULL)) { ++ u32 block = ext2_cluster_block0(inode, cluster); ++ ext2_get_block(inode, block + i, bh_writeout[i], 1); ++ //printk("ext2_get_block Block:%lu, Mapped:%i, Page:%lu, bdev: %#x\n", bh_writeout[i]->b_blocknr, (bh_writeout[i]->b_state & BH_Mapped), (bh_writeout[i]->b_page ? 
bh_writeout[i]->b_page->index : 0), bh_writeout[i]->b_bdev ); ++ } ++ assert(buffer_mapped(bh_writeout[i])); ++ assert(bh_writeout[i]->b_bdev != NULL); ++ assert(bh_writeout[i]->b_bdev == inode->i_sb->s_bdev); ++ /*if (bh_writeout[i]->b_bdev == NULL) ++ bh_writeout[i]->b_bdev = inode->i_sb->s_bdev; //fix bdev-bug */ ++ } ++ ++ ll_rw_block(WRITE, bhn_writeout, bh_writeout); ++ //mw: seems we have to wait here, otherwise: crash! ++ ++ CHECK_NOT_ATOMIC ++ for (i = 0; i < bhn_writeout; i++) { ++ if (bh_writeout[i]) ++ wait_on_buffer(bh_writeout[i]); ++ } ++ goto cleanup; ++ ++ out_err: ++ printk("Error in Decompressing cluster: Err=%i\n", result); ++ ++ cleanup: ++ ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(NULL, pg, epg); ++#endif ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (pg[i] == NULL) ++ break; ++ unlock_page(pg[i]); ++ page_cache_release(pg[i]); ++ } ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (epg[i] != NULL) { ++ ClearPageDirty(epg[i]); ++ ClearPageUptodate(epg[i]); ++ try_to_free_buffers(epg[i]); ++ unlock_page(epg[i]); ++ assert(page_count(epg[i]) == 1); ++ page_cache_release(epg[i]); ++ } ++ } ++ ++ /* ++ * Release buffers, don't forget to unlock the locked ones. ++ * pjm 1998-01-14: TO_DO: Locked ones? ++ */ ++ assert(nbh >= 0); ++ assert(nbh <= EXT2_MAX_CLUSTER_BLOCKS); ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c(" < < < ext2_decompress_cluster %d: inode=%ld, res=%i\n", ++ cluster, inode->i_ino, result); ++#endif ++ return result; ++} ++ ++ ++/* ++ * Function to decompress the pages of a cluster. ++ * ++ * Allocate buffers to pages what are not mapped on the device. ++ * ++ * Returns the size of decompressed data on success, a negative ++ * value in case of failure, or 0 if some pages are not uptodate. ++ * ++ * The inode is supposed to be writable. ++ * All the pages must be UPTODATE, ++ */ ++int ext2_decompress_pages(struct inode *inode, u32 cluster, ++ struct page *pg[]) ++{ ++ struct ext2_cluster_head *head; ++ struct buffer_head *bh0; ++ struct buffer_head *bh[EXT2_MAX_CLUSTER_BLOCKS]; ++ unsigned nbh, c_nblk; ++ unsigned free_ix, clu_block0, clu_nblocks; ++ int i, pagesPerCluster, data_left, size = 0; ++ long status = 0; ++ char *dp; ++ struct buffer_head *bh_writeout[EXT2_MAX_CLUSTER_BLOCKS]; ++ int bhn_writeout; ++#ifdef CONFIG_HIGHMEM ++ int kmapped = 0; ++ ++ ext2_kmap_cluster_pages(NULL, pg, NULL); ++ kmapped = 1; ++#endif ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_BLOCKS; i++) { ++ bh_writeout[i] = NULL; ++ bhn_writeout = 0; ++ } ++ ++ /* First, get cluster_head (For this, we need to re-read the first block of ++ the cluster, without overwriting the data of the page the buffer point to... */ ++ /* This suppose that cluster are aligned with PAGE_SIZE... To be improved */ ++ ++ /* Changed by Yabo Ding, ++ The old code cannot reread data from disk to a changed buffers data pointer in 2.6.x. ++ So, I copy memory data(decompressed) to a temporary buffer; ++ Then reread data(compressed) from disk, and copy to head; ++ Then copy back the memory data from temporary buffer. ++ It seems clumsy, but it works well. 
++ */ ++ ++ bh0 = page_buffers(pg[0]); ++ restore_b_data_himem(bh0); ++ ++ head = (struct ext2_cluster_head *) kmalloc(bh0->b_size, GFP_KERNEL); ++ if (head == NULL) { ++ ext2_msg(inode->i_sb, "no more memory", "inode = %lu", ++ inode->i_ino); ++ status = -EIO; ++ goto out_x; ++ } ++ dp = kmalloc(bh0->b_size, GFP_KERNEL); ++ if (dp == NULL) { ++ ext2_msg(inode->i_sb, "no more memory", "inode = %lu", ++ inode->i_ino); ++ kfree(head); ++ status = -EIO; ++ goto out_x; ++ } ++ memcpy(dp, bh0->b_data, bh0->b_size); ++ clear_bit(BH_Uptodate, &bh0->b_state); ++ if (!buffer_mapped(bh0)) { ++ status = ++ ext2_get_block(inode, ext2_cluster_block0(inode, cluster), bh0, ++ 0); ++ if (bh0->b_blocknr == 0) { ++ trace_e2c ++ ("ext2_decompress_pages: ext2_get_block error %ld (cluster = %u)\n", ++ status, cluster); ++ kfree(head); ++ memcpy(bh0->b_data, dp, bh0->b_size); ++ kfree(dp); ++ status = -EIO; ++ goto out; ++ } ++ } ++ ll_rw_block(READ, 1, &bh0); ++ ++ CHECK_NOT_ATOMIC ++ wait_on_buffer(bh0); ++ //printk("RE-Read: Buffer: blocknr:%lu(%#x) \n", bh0->b_blocknr, bh0); ++ if (!buffer_uptodate(bh0)) { /* Read error ??? */ ++ trace_e2c("ext2_decompress_pages: IO error (cluster = %u)\n", ++ cluster); ++ kfree(head); ++ memcpy(bh0->b_data, dp, bh0->b_size); ++ kfree(dp); ++ status = -EIO; ++ goto out; ++ } ++ /* This suppose that cluster are aligned with PAGE_SIZE... To be improved ++ bh0->b_data = page_address(pg[0]); */ ++ memcpy((char *) head, bh0->b_data, bh0->b_size); ++ memcpy(bh0->b_data, dp, bh0->b_size); ++ kfree(dp); ++ ++ if (head->magic != cpu_to_le16(EXT2_COMPRESS_MAGIC_04X)) { ++ ext2_msg(inode->i_sb, ++ "ext2_decompress_pages: bad magic number", ++ "inode = %lu, magic = %#04x", inode->i_ino, ++ le16_to_cpu(head->magic)); ++ kfree(head); ++ status = -EIO; ++ goto out; ++ } ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c("ext2_decompress_pages: clt=%d i=%ld head=0x%x\n", cluster, ++ inode->i_ino, (unsigned) head); ++#endif ++ ++ /* Now, try to do the same as in ext2_decompress_cluster for moving/allocating blocks */ ++ nbh = 0; ++ pagesPerCluster = ext2_cluster_npages(inode, cluster); ++ for (i = 0; i < pagesPerCluster && pg[i]; i++) { ++ assert(PageLocked(pg[i])); ++ //if (!(PageUptodate(pg[i]))) { ++ //mw: do it like ext2_decompress_cluster to handle end of a file correctly ++ if (!(PageUptodate(pg[i])) ++ && (pg[i]->index <= ((inode->i_size - 1) >> PAGE_CACHE_SHIFT))) { ++ kfree(head); ++ printk("should never happen: not all pages uptodate!\n"); //mw ++ status = 0; ++ goto out_x; ++ } ++ } ++ ++ for (i = 0; i < pagesPerCluster && pg[i]; i++) { ++ struct buffer_head *bhead, *bhx; ++ int idx = 0; ++ ++ /* assert(PageUptodate(pg[i])); with ftruncate() can be false */ ++ if (!page_has_buffers(pg[i])) { ++ ClearPageUptodate(pg[i]); /*mw */ ++ ClearPageDirty(pg[i]); /*mw */ ++ assert(0); ++ create_empty_buffers_e2c(pg[i], inode->i_sb->s_blocksize, 0, ++ inode); ++ if (unlikely(!page_has_buffers(pg[i]))) ++ printk("Error: NOMEM!\n"); ++ } ++ bhead = page_buffers(pg[i]); ++ for (bhx = bhead; bhx != bhead || !idx; bhx = bhx->b_this_page) { ++ idx++; ++ bh[nbh] = bhx; ++ nbh++; ++ } ++ } ++ ++ while ((nbh != 0) && (bh[nbh - 1]->b_blocknr == 0)) ++ --nbh; ++ ++ c_nblk = nbh; ++ ++ free_ix = ++ ext2_calc_free_ix(head->holemap_nbytes, (u8 const *) (&head[1]), ++ c_nblk); ++ clu_block0 = ext2_cluster_block0(inode, cluster); ++ clu_nblocks = ext2_cluster_nblocks(inode, cluster); ++ ext2_unpack_blkaddrs(inode, bh, 0, head->holemap_nbytes, ++ (u8 const *) (&head[1]), c_nblk, free_ix, ++ clu_block0, clu_nblocks); 
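/*
 * Sketch of the "extra blocks" arithmetic used in the allocation loop just
 * below (stand-alone user space, assumed 1 KiB blocks): a cluster that
 * decompresses to ulen bytes but currently occupies c_nblk compressed
 * blocks needs roughly ceil(ulen / blocksize) - c_nblk newly allocated
 * blocks, holes excepted.
 */
#include <stdio.h>

int main(void)
{
	unsigned blocksize = 1024;            /* assumed block size */
	unsigned ulen = 16384, c_nblk = 5;    /* example cluster */
	int data_left = (int)(ulen - c_nblk * blocksize);
	unsigned extra = 0;

	while (data_left > 0) {               /* one new block per blocksize of data */
		extra++;
		data_left -= (int)blocksize;
	}
	printf("extra blocks to allocate: %u\n", extra);   /* prints 11 */
	return 0;
}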
++ ++ /* Allocate the extra blocks needed. */ ++ data_left = size = le32_to_cpu(head->ulen); ++ ++ data_left -= c_nblk << inode->i_sb->s_blocksize_bits; ++ assert(data_left > 0); ++ for (i = 0; i < free_ix; i++) { ++ if (bh[i]->b_blocknr != 0) { ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("\t [%d] blk=%ld sts=0x%x\n", i, bh[i]->b_blocknr, ++ (int) bh[i]->b_state); ++#endif ++ set_buffer_dirty(bh[i]); ++ bh_writeout[bhn_writeout] = bh[i]; //mw ++ bhn_writeout++; //mw ++ } ++ } ++ ++ for (i = free_ix; i < clu_nblocks; i++) { ++ if (((i >> 3) >= head->holemap_nbytes) ++ || !(head->holemap[i >> 3] & (1 << (i & 7)))) { ++ status = ++ ext2_get_block(inode, clu_block0 + i, bh[i], ++ 1 /* create */ ); ++ if (status || bh[i]->b_blocknr == 0) { ++ status = -EIO; ++ goto out; ++ } ++#ifdef EXT2_COMPR_REPORT_CPR ++ trace_e2c("\t [%d] blk=%ld sts=0x%x\n", i, bh[i]->b_blocknr, ++ (int) bh[i]->b_state); ++#endif ++ set_bit(BH_Uptodate, &bh[i]->b_state); ++ set_buffer_dirty(bh[i]); ++ bh_writeout[bhn_writeout] = bh[i]; //mw ++ bhn_writeout++; //mw ++ nbh++; ++ data_left -= inode->i_sb->s_blocksize; ++ if (data_left <= 0) ++ break; ++ } ++ } ++ ++ out: ++ kfree(head); ++ ++ out_x: ++ ++ for (i = 0; i < bhn_writeout; i++) { ++ ++ if ((!buffer_mapped(bh_writeout[i])) ++ || (bh_writeout[i]->b_bdev == NULL)) { ++ u32 block = ext2_cluster_block0(inode, cluster); ++ ext2_get_block(inode, block + i, bh_writeout[i], 1); ++ //printk("ext2_get_block Block:%lu, Mapped:%i, Page:%lu, bdev: %#x\n", bh_writeout[i]->b_blocknr, (bh_writeout[i]->b_state & BH_Mapped), (bh_writeout[i]->b_page ? bh_writeout[i]->b_page->index : 0), bh_writeout[i]->b_bdev ); ++ } ++ assert(buffer_mapped(bh_writeout[i])); ++ assert(bh_writeout[i]->b_bdev != NULL); ++ assert(bh_writeout[i]->b_bdev == inode->i_sb->s_bdev); ++ /*if (bh_writeout[i]->b_bdev == NULL) ++ bh_writeout[i]->b_bdev = inode->i_sb->s_bdev; //fix bdev-bug */ ++ } ++ //Sync out changes: ++ ll_rw_block(WRITE, bhn_writeout, bh_writeout); ++ //mw: seems we have to wait here, otherwise: crash! ++ ++ CHECK_NOT_ATOMIC ++ for (i = 0; i < bhn_writeout; i++) { ++ if (bh_writeout[i]) ++ wait_on_buffer(bh_writeout[i]); ++ } ++ ++ ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(NULL, pg, NULL); ++#endif ++ ++ return (status ? status : size); ++} ++ ++ ++/* Decompress every cluster that is still compressed. ++ We stop and return -ENOSPC if we run out of space on device. ++ ++ The caller needs to check for EXT2_COMPRBLK_FL before calling. ++ ++ Returns 0 on success, -errno on failure. ++ ++ Called by ext2_ioctl(). */ ++int ext2_decompress_inode(struct inode *inode) ++{ ++ u32 cluster; ++ u32 n_clusters; ++ int err = 0; ++ struct ext2_inode_info *ei = EXT2_I(inode); ++ ++ assert(ei->i_flags & EXT2_COMPRBLK_FL); ++ ++ /* Quotas aren't otherwise kept if file is opened O_RDONLY. */ ++ dquot_initialize(inode); ++ ++ //mutex_lock(&inode->i_mutex); /* MW 5-16-07 */ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */ ++ err = 0; ++ /* This test can succeed because down() (and I think DQUOT_INIT) can block. */ ++ if (!(ei->i_flags & EXT2_COMPRBLK_FL)) ++ goto out; ++ ++ n_clusters = ext2_n_clusters(inode); ++ for (cluster = 0; cluster < n_clusters; cluster++) { ++ err = ext2_cluster_is_compressed_fn(inode, cluster); ++ if (err > 0) { ++ err = ext2_decompress_cluster(inode, cluster); ++ /* If we later get an error, we'll need to recompress. 
*/ ++ ei->i_flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ } ++ if (err < 0) ++ goto error; ++ } ++ assert(err >= 0); ++ err = 0; ++ ei->i_flags &= ~(EXT2_COMPRBLK_FL | EXT2_DIRTY_FL); ++ ei->i_compr_flags &= ~EXT2_CLEANUP_FL; ++ error: ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty_sync(inode); ++ out: ++// mutex_unlock(&inode->i_mutex); /* MW 5-16-07 */ ++ return err; ++} ++ ++ ++/* ++ TODO: SECRM_FL ++ ++ TODO: Avant de liberer les blocs, regarder si le compteur ++ est a 1, et marquer le noeud si ce n'est pas le cas ++ (pour preparer la recompression immediate). ++ ++ pjm fixme translation. ++ "Before freeing the blocks, check if the counter is 1, ++ and mark the inode if not (in order to prepare for ++ immediate recompression)." */ ++ ++/* This is called by ext2_compress_cluster to free the blocks now ++ available due to compression. We free ,nb` blocks beginning with ++ block ,block`. We set the address of each freed block to ++ EXT2_COMPRESSED_BLKADDR, thus marking the cluster as compressed. ++ N.B. It is up to the caller to adjust i_blocks. */ ++ ++/* TODO: ext2_truncate() is much more careful than this routine. ++ (E.g. it checks for bh->b_count > 1, and checks for things changing ++ underneath it. It also calls bforget instead of brelse if it's ++ going to free it.) Why? Maybe we should copy it. */ ++ ++/* effic: Reduce the number of calls to ext2_free_block() the way ++ ext2_trunc_direct() does. */ ++ ++/* fixme: I think tht we do indeed need to check if buffers are held by ++ somebody else before freeing them. */ ++static int ext2_free_cluster_blocks(struct inode *inode, u32 block, ++ unsigned nb) ++{ ++ u32 tmp; ++ struct ext2_bkey key; ++ int err; ++ ++/* ++ * whitpa 04 Oct 2004: although it may be true that using e2compr in ++ * conjunction with quotas is a Bad Idea, having quotas enabled for other ++ * filesystems doesn't necessarily mean that the quota feature will actually be ++ * used in this one, so many people find the following assertion very annoying. ++ * I have therefore disabled it. ++ */ ++/* assert (!inode->i_sb->dq_op || (inode->i_flags & S_QUOTA)); */ ++ if (!nb) ++ return 0; ++ if (nb > EXT2_MAX_CLU_NBLOCKS) { ++ assert((int) nb >= 0); ++ assert(nb <= EXT2_MAX_CLU_NBLOCKS); ++ return -EDOM; ++ } ++ assert(((block + nb) & 3) == 0); ++ if (!ext2_get_key(&key, inode, block)) ++ return -EIO; ++ ++ while (nb-- > 0) { ++ tmp = ext2_get_key_blkaddr(&key); ++ err = ext2_set_key_blkaddr(&key, EXT2_COMPRESSED_BLKADDR); ++ if (err) ++ goto out; ++ if (tmp != 0) { ++ assert(tmp != EXT2_COMPRESSED_BLKADDR); ++#ifdef EXT2_COMPR_REPORT_ALLOC ++ printk(KERN_DEBUG "ext2: free %d = (%d) %d:%d:%d:%d : %d\n", ++ key.block, ++ key.level, ++ key.off[0], key.off[1], key.off[2], key.off[3], tmp); ++#endif ++ ext2_free_blocks(inode, tmp, 1); ++ } ++ if (!ext2_next_key(&key, 1)) ++ break; ++ } ++ err = 0; ++ out: ++ ext2_free_key(&key); ++ return err; ++} ++ ++#ifdef EXT2_COMPR_DEBUG ++static unsigned count_bits(unsigned char *p, unsigned nb) ++{ ++ u32 x = le32_to_cpu(*(u32 *) p); ++ unsigned n = 0; ++ ++ assert(nb <= 4); ++ if (nb != 4) ++ x &= (1 << (nb * 8)) - 1; ++ while (x) { ++ x &= (x - 1); ++ n++; ++ } ++ return n; ++} ++#endif ++ ++/* ++ * __remove_compr_assoc_queue is used in invalidate_inode_buffers ++ * replacement code for ext2_compress_cluster(). TLL 02/21/07 ++ * Yeah, it is duplicate code, but using it does not require ++ * patching fs/buffer.c/__remove_assoc_queue to export it. 
++ * The buffer's backing address_space's private_lock must be held. ++ */ ++/*static inline void __remove_compr_assoc_queue(struct buffer_head *bh) ++{ ++ list_del_init(&bh->b_assoc_buffers); ++}*/ ++ ++/* Compress one cluster. If the cluster uses fewer blocks once ++ compressed, it is stored in place of the original data. Unused ++ blocks are freed, and the cluster is marked as compressed. ++ ++ Returns a negative value on error, ++ 0 if the cluster does not compress well, ++ positive if it is compressed (whether it was already compressed ++ or whether we compressed it). ++ ++ Assume inode is writable. ++ ++ Called by : ++ ++ ext2_cleanup_compressed_inode () [i_sem] ++ ++ If ever we acquire new callers, make sure that quotas are ++ initialised, and COMPRBLK is handled correctly (i.e. such ++ that ioctl() can't change the cluster size on us), and that caller ++ tests for ext2_wa==NULL. ++*/ ++ ++int ext2_compress_cluster(struct inode *inode, u32 cluster) ++{ ++ struct buffer_head *bh[EXT2_MAX_CLUSTER_BLOCKS + 1]; ++ struct page *pg[EXT2_MAX_CLUSTER_PAGES]; ++ int s_nblk; /* Equals clu_nblocks less any trailing hole blocks. */ ++ unsigned u_nblk = (~(unsigned) 0), c_nblk; /* Number of blocks occupied by ++ un/compressed data. */ ++ int result, n, x; ++ int ulen, maxlen = 0, clen = 0; ++ unsigned char *dst; ++ u8 *src; ++ unsigned meth, alg; ++ int nbh = 0, npg, i; ++ unsigned char holemap_nbytes = 0; ++ unsigned last_hole_pos; ++ struct ext2_cluster_head *head; ++ unsigned r_nblk; ++ struct ext2_inode_info *ei = EXT2_I(inode); ++ unsigned long saved_isize; ++ //int dotrunc = 1; //mw ++ ++#ifdef CONFIG_HIGHMEM ++ int kmapped = 0; ++#endif ++ ++ /* impl: Otherwise, ioctl() could change the cluster size ++ beneath us. */ ++ /* TLL say not compressed and return -1 6-15-07 */ ++ if (!(ei->i_flags & EXT2_COMPRBLK_FL)) ++ return -1; ++ ++ //mw ++ saved_isize = inode->i_size; ++ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */ ++ assert(!mapping_mapped(inode->i_mapping)); ++ ++ npg = ext2_cluster_npages(inode, cluster); ++ ++ result = ext2_get_cluster_pages(inode, cluster, pg, NULL, 1); ++ if (result <= 0) ++ goto done; ++ ++#ifdef CONFIG_HIGHMEM ++ ext2_kmap_cluster_pages(NULL, pg, NULL); ++ kmapped = 1; ++#endif ++ ++ /* effic: We ought to use the page cache. Using the page ++ cache always costs extra CPU time, but saves I/O if the ++ page is present. We still need to detect holes, which ++ unfortunately may still cause I/O. Testing for all-zero ++ could save us that I/O. */ ++ ++ nbh = ext2_get_cluster_blocks(inode, cluster, bh, pg, NULL, 1); ++ ++ s_nblk = nbh; ++ ++#ifdef EXT2_COMPR_REPORT ++ { ++ int i; ++ trace_e2c(" > > > ext2_compress_cluster %d: inode=%ld, size=%d\n", ++ cluster, inode->i_ino, (int) inode->i_size); ++#ifdef EXT2_COMPR_REPORT_CPR ++ for (i = 0; i < s_nblk; i++) { ++ if (bh[i]) { ++ printk(KERN_DEBUG ++ "bbuffer_head[%d]: blocknr=%lu, addr=0x%p ", i, ++ (unsigned long) bh[i]->b_blocknr, bh[i]); ++ if (bh[i]->b_page) ++ printk(KERN_DEBUG "bgn:[page->index=%ld]\n", ++ bh[i]->b_page->index); ++ else ++ printk(KERN_DEBUG "[No page]\n"); ++ } else ++ printk("bbuffer_head[%d] is NULL\n", i); ++ } ++#endif ++ } ++#endif ++ /* ++ * Did somebody else compress the cluster while we were waiting ? ++ * This should never arise ... 
++ */ ++ result = ext2_cluster_is_compressed_fn(inode, cluster); ++ if (result != 0) { ++ if (result > 0) { ++ ext2_msg(inode->i_sb, ++ "ext2_compress_cluster", ++ "compressing compressed cluster"); ++ } ++ goto done; ++ } ++ ++ /* I moved it here in case we need to load a module that ++ * needs more heap that is currently allocated. ++ * In such case "init_module" for that algorithm forces ++ * re-allocation of ext2_wa. It should be safe here b/c the ++ * first reference to ext2_wa comes just after and we have ++ * locked ext2_wa before. ++ * ++ * I know that we may not need the compression at all ++ * (compressing 0 or 1 block) but it's better to sacrifice ++ * a bit than do make a total mess of this code. ++ * ++ * FIXME: Totally separate working areas for reading and writing. ++ * Jan R. ++ */ ++ ++ meth = ei->i_compr_method; ++ assert(meth < EXT2_N_METHODS); ++ alg = ext2_method_table[meth].alg; ++#ifdef CONFIG_KMOD ++ if (!ext2_algorithm_table[alg].avail) { ++ char str[32]; ++ ++ sprintf(str, "ext2-compr-%s", ext2_algorithm_table[alg].name); ++ request_module(str); ++ } ++#endif ++ ++ result = -EINTR; ++ ++ /* ++ * Try to get the working area. ++ */ ++#ifdef EXT2_COMPR_REPORT_WA ++ printk(KERN_DEBUG "pid %d enters critical region\n", current->pid); ++#endif ++ if (get_cpu_var(ext2_wr_wa) == NULL) ++ { ++ ext2_alloc_wr_wa(); ++ } ++ assert(__get_cpu_var(ext2_wr_wa) != NULL); ++ ++ ++ /* ++ * Now, we try to compress the cluster. If the cluster does ++ * not compress well, we just give up. Otherwise, we reuse ++ * the old blocks to store the compressed data (except that ++ * compressed data is contiguous in the file even if the ++ * uncompressed data had holes). ++ */ ++ ++ /* ++ * Compute the block bitmap, how many bytes of data we have ++ * in the cluster, and the maximum interesting length after ++ * compression. The bitmap will be used to reallocate blocks ++ * when decompressing the cluster, so that we don't create blocks ++ * that were previously missing. We also pack the buffers ++ * together. ++ */ ++ ++ head = (struct ext2_cluster_head *) __get_cpu_var(ext2_wr_wa)->c; ++#if EXT2_MAX_CLUSTER_BLOCKS > 32 ++# error "We need to zero more bits than this." ++#endif ++ *(u32 *) (&head[1]) = 0; ++ last_hole_pos = (unsigned) (-1); ++ assert(head->holemap[0] == 0); ++ assert(head->holemap[1] == 0); ++ assert(head->holemap[2] == 0); ++ assert(head->holemap[3] == 0); ++ assert(*(u32 *) head->holemap == 0); ++ assert(count_bits(head->holemap, 4) == 0); ++ ++ /* TODO: Check that i_size can't change beneath us. ++ do_truncate() is safe because it uses i_sem around changing ++ i_size. For the moment, I do a runtime check. */ ++ ++ saved_isize = inode->i_size; ++ ++#ifdef EXT2_COMPR_REPORT_VERBOSE ++ printk ++ ("00 ext2_compress_cluster[%u]: i_size=%u, s_blocksize_bits=%u, s_nblk=%u\n", ++ __LINE__, (unsigned) inode->i_size, inode->i_sb->s_blocksize_bits, ++ s_nblk); ++#endif ++// assert (ROUNDUP_RSHIFT(inode->i_size, inode->i_sb->s_blocksize_bits) ++// >= s_nblk); ++ /* This initial guess at ulen doesn't take holes into account ++ unless they're at end of cluster. We ,compensate for other ++ holes` during the loop below. 
*/ ++ ulen = MIN(s_nblk << inode->i_sb->s_blocksize_bits, ++ inode->i_size - ext2_cluster_offset(inode, cluster)); ++ r_nblk = (((ulen - 1) >> inode->i_sb->s_blocksize_bits) + 1); ++ if (r_nblk <= 1) { ++ /* MW: required to remove Z flag, otherwise compress ++ * is tried on each access */ ++ result = 0; ++ goto no_compress; ++ } ++ /* Verify if more than 1 block to compress in the cluster */ ++ nbh = 0; ++ for (x = 0; x < s_nblk; x++) { ++ if ((bh[x] != NULL) && (bh[x]->b_blocknr != 0)) { ++ nbh++; ++ } else { ++ last_hole_pos = x; ++ head->holemap[x >> 3] |= 1 << (x & 7); ++ ulen -= inode->i_sb->s_blocksize; ++ /* impl: We know that it's a whole block because ++ ext2_get_cluster_blocks trims s_nblk for trailing ++ NULL blocks, and partial blocks only come at ++ the end, so there can't be partial NULL blocks. */ ++ } ++ } ++ /* We don't try to compress cluster that only have one block ++ or no block at all. (When fragments are implemented, this code ++ should be changed.) */ ++ if (nbh <= 1) { ++ /* MW: required to remove Z flag, otherwise compress ++ * is tried on each access */ ++ goto no_compress; ++ } ++ ++ u_nblk = nbh; ++ /* Copy the data in the compression area */ ++ dst = __get_cpu_var(ext2_wr_wa)->u; ++ for (x = 0; x < s_nblk; x++) { ++ if ((bh[x] != NULL) && (bh[x]->b_blocknr != 0)) { ++ restore_b_data_himem(bh[x]); ++ memcpy(dst, bh[x]->b_data, bh[x]->b_size); ++ dst += bh[x]->b_size; ++ } ++ } ++ ++ assert(count_bits(head->holemap, 4) == s_nblk - u_nblk); ++ ++#if EXT2_GRAIN_SIZE != EXT2_MIN_BLOCK_SIZE ++# error "this code ought to be changed" ++#endif ++ ++ /* ,maxlen` is the maximum length that the compressed data can ++ be while still taking up fewer blocks on disk. */ ++ holemap_nbytes = (last_hole_pos >> 3) + 1; ++ /* impl: Remember that ,last_hole_pos` starts off as being -1, ++ so the high 3 bits of ,last_hole_pos >> 3` can be wrong. ++ This doesn't matter if holemap_nbytes discards the high ++ bits. */ ++ ++ assert(sizeof(holemap_nbytes) < sizeof(unsigned)); ++ assert((last_hole_pos == (unsigned) -1) ++ == (holemap_nbytes == 0)); ++ maxlen = ++ ((((r_nblk < ++ u_nblk) ? r_nblk : u_nblk) - 1) * inode->i_sb->s_blocksize - ++ sizeof(struct ext2_cluster_head) ++ - holemap_nbytes); ++ clen = 0; ++ /* Handling of EXT2_AUTO_METH at the moment is just that we ++ use the kernel default algorithm. I hope that in future ++ this can be extended to the kernel deciding when to ++ compress and what algorithm to use, based on available disk ++ space, CPU time, algorithms currently used by the fs, ++ etc. */ ++ if ((meth == EXT2_AUTO_METH) ++ || !ext2_algorithm_table[alg].avail) { ++ meth = EXT2_DEFAULT_COMPR_METHOD; ++ alg = ext2_method_table[meth].alg; ++ assert(ext2_algorithm_table[alg].avail); ++ } ++ if (alg == EXT2_NONE_ALG) ++ goto no_compress; ++ ++ clen = ext2_algorithm_table[alg].compress(__get_cpu_var(ext2_wr_wa)->u, ++ __get_cpu_var(ext2_wr_wa)->c + sizeof(struct ext2_cluster_head) + holemap_nbytes, ++ __get_cpu_var(ext2_wr_wa)->heap, ulen, maxlen, ext2_method_table[meth].xarg); ++ ++#ifdef EXT2_COMPR_REPORT_ALGORITHMS ++ printk(KERN_DEBUG "03 ext2: %lu: cluster %d+%d [%d] compressed " ++ "into %d bytes (ulen = %d, maxlen = %d)\n", ++ inode->i_ino, ++ ext2_cluster_offset(inode, cluster), ++ ext2_cluster_nblocks(inode, cluster), ++ u_nblk, clen, ulen, maxlen); ++#endif ++ ++ if ((clen == 0) || (clen > maxlen)) { ++ no_compress: ++ ++ /* this chunk didn't compress. 
*/ ++ assert(inode->i_size == saved_isize); ++#ifdef EXT2_COMPR_REPORT_WA ++ printk(KERN_DEBUG ++ "pid %d leaves critical region, nbh=%d, u_nblk=%d, " ++ "inode->i_size=%lu, saved_isize=%lu, clen=%d, ulen=%d, maxlen=%d\n", ++ current->pid, nbh, u_nblk, ++ (long unsigned) inode->i_size, saved_isize, clen, ulen, ++ maxlen); ++#endif ++ ++ result = 0; ++ put_cpu_var(ext2_wr_wa); ++ goto done; ++ } ++ ++ ++#if EXT2_MAX_CLUSTER_BLOCKS > 32 ++# error "We need to zero more bits than this." ++#endif ++ assert(-1 <= (int) last_hole_pos); ++ assert((int) last_hole_pos < 32); ++ assert((le32_to_cpu(*(u32 *) head->holemap) ++ & (~0u << (1 + last_hole_pos)) ++ & (~(~0u << (8 * holemap_nbytes)))) ++ == 0); ++ /* Don't change "~0u << (1 + last_hole_pos)" to "~1u << last_hole_pos" ++ as I almost did, as last_hole_pos can be -1 and cannot be 32. */ ++ assert(count_bits(head->holemap, holemap_nbytes) == s_nblk - u_nblk); ++ ++ /* Compress the blocks at the beginning of the cluster */ ++ for (x = 0, nbh = 0; x < s_nblk; x++) { ++ if ((bh[x] != NULL) && (bh[x]->b_blocknr != 0)) { ++ if (nbh != x) { ++ restore_b_data_himem(bh[x]); ++ bh[nbh]->b_blocknr = bh[x]->b_blocknr; ++ set_bit(BH_Mapped, &bh[nbh]->b_state); ++ bh[x]->b_blocknr = 0; ++ assert(buffer_mapped(bh[x])); ++ clear_bit(BH_Mapped, &bh[x]->b_state); ++ } ++ nbh++; ++ } ++ } ++ assert(nbh == u_nblk); ++ assert(count_bits(head->holemap, holemap_nbytes) == s_nblk - u_nblk); ++ ++ /* ++ * Compression was successful, so add the header and copy to blocks. ++ */ ++ ++ /* Header. */ ++ { ++ head->magic = cpu_to_le16(EXT2_COMPRESS_MAGIC_04X); ++ head->method = meth; ++ head->holemap_nbytes = holemap_nbytes; ++ head->ulen = cpu_to_le32(ulen); ++ head->clen = cpu_to_le32(clen); ++ ++ barrier(); //mw: "barrier" tells compiler not to re-order resulting asm statments, somehow. ++ head->checksum = ++ cpu_to_le32(ext2_adler32 ++ (le32_to_cpu(*(u32 *) __get_cpu_var(ext2_wr_wa)->c), ++ __get_cpu_var(ext2_wr_wa)->c + 8, ++ (sizeof(struct ext2_cluster_head) - 8 + ++ head->holemap_nbytes + clen))); ++ } ++ ++ assert((le32_to_cpu(*(u32 *) head->holemap) ++ & (~0 << (1 + last_hole_pos)) ++ & ((1 << (8 * holemap_nbytes)) - 1)) == 0); ++ result = clen += sizeof(struct ext2_cluster_head) + holemap_nbytes; ++ c_nblk = ROUNDUP_RSHIFT(clen, inode->i_sb->s_blocksize_bits); ++ ++ /* Release unneeded buffer heads. (Freeing is done later, ++ after unlocking ext2_wr_wa.) */ ++ assert(nbh == u_nblk); ++ nbh = c_nblk; ++ ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c("ext2_compress_cluster: head->clen=%d, clen=%d\n", head->clen, clen); ++#endif ++ src = __get_cpu_var(ext2_wr_wa)->c; ++ ++ for (n = 0; (int) clen > 0; n++) { ++ restore_b_data_himem(bh[n]); ++ if (clen >= inode->i_sb->s_blocksize) { ++ memcpy(bh[n]->b_data, src, inode->i_sb->s_blocksize); ++ } else { ++ memcpy(bh[n]->b_data, src, clen); ++ } ++ ++ /* TO_DO: OSYNC. means: write opertions are blocking until the ++ * the pages are written from page cache to disk */ ++ ++ set_buffer_uptodate(bh[n]); ++ set_buffer_dirty(bh[n]); ++ src += inode->i_sb->s_blocksize; ++ clen -= inode->i_sb->s_blocksize; ++ } ++ ++ i = 0; ++ assert(n == c_nblk); ++ assert((le32_to_cpu(*(u32 *) head->holemap) ++ & (~0 << (1 + last_hole_pos)) ++ & ((1 << (8 * holemap_nbytes)) - 1)) == 0); ++ ++ /* Runtime check that no-one can change i_size while i_sem is down. ++ (See where saved_isize is set, above.) 
*/ ++ assert(inode->i_size == saved_isize); ++ assert(!mapping_mapped(inode->i_mapping)); ++ ++ /* Free the remaining blocks, and shuffle used blocks to start ++ of cluster in blkaddr array. */ ++ { ++ u32 free_ix, curr; ++ int err; ++ ++ /* Calculate free_ix. There should be ,c_nblk` ++ non-hole blocks among the first ,free_ix` ++ blocks. */ ++ { ++ assert((le32_to_cpu(*(u32 *) head->holemap) ++ & (~0 << (1 + last_hole_pos)) ++ & ((1 << (8 * holemap_nbytes)) - 1)) == 0); ++ assert(n == c_nblk); ++ for (free_ix = 0; ++ ((int) free_ix <= (int) last_hole_pos) && (n > 0); ++ free_ix++) ++ if (!(head->holemap[free_ix >> 3] ++ & (1 << (free_ix & 7)))) ++ n--; ++ free_ix += n; ++ ++ if ((free_ix < c_nblk) ++ || (free_ix + u_nblk > s_nblk + c_nblk) ++ || (free_ix >= ext2_cluster_nblocks(inode, cluster)) ++ || ((holemap_nbytes == 0) && (c_nblk != free_ix))) { ++ assert(free_ix >= c_nblk); ++ /*assert (free_ix - c_nblk <= s_nblk - u_nblk); */ ++ assert(free_ix + u_nblk <= s_nblk + c_nblk); ++ assert(free_ix < ext2_cluster_nblocks(inode, cluster)); ++ assert((holemap_nbytes != 0) || (c_nblk == free_ix)); ++ assert(1 <= c_nblk); ++ assert(c_nblk < u_nblk); ++ assert(u_nblk <= s_nblk); ++ assert(s_nblk <= ext2_cluster_nblocks(inode, cluster)); ++ assert(ext2_cluster_nblocks(inode, cluster) <= ++ EXT2_MAX_CLU_NBLOCKS); ++ ext2_error(inode->i_sb, "ext2_compress_cluster", ++ "re assertions: c=%d, u=%d, f=%d, s=%d, n=%d, " ++ "lhp=%d, hm=%x, hnb=%d, " "ino=%lu, clu=%u", ++ (int) c_nblk, (int) u_nblk, (int) free_ix, ++ (int) s_nblk, (int) ext2_cluster_nblocks(inode, ++ cluster), ++ (int) last_hole_pos, ++ (unsigned) le32_to_cpu(*(u32 *) head->holemap), ++ (int) holemap_nbytes, inode->i_ino, cluster); ++ } ++ } ++ ++ /*mw: put here: set all __get_cpu related pointers to NULL ++ as they become invalid with put_cpu */ ++ head = NULL; /* prevent any more stupid bugs */ ++ src = NULL; ++ dst = NULL; ++ put_cpu_var(ext2_wr_wa); ++ ++#ifdef EXT2_COMPR_DEBUG ++ /* TODO: remove this TEST */ ++ /* mw: ext2_free_cluster_blocks can sleep: check we are not atomic */ ++ schedule(); ++#endif ++ ++ /* Free unneeded blocks, and mark cluster as ++ compressed. */ ++ err = ext2_free_cluster_blocks ++ (inode, ++ ext2_cluster_block0(inode, cluster) + free_ix, ++ ext2_cluster_nblocks(inode, cluster) - free_ix); ++ /* pjm 1998-06-15: This should help reduce fragmentation. ++ Actually, we could set block to clu_block0 + clu_nbytes, ++ and goal to the last allocated blkaddr in the compressed ++ cluster. ++ It would be nice if we would transfer the freed blocks ++ to preallocation, while we're at it. */ ++// write_lock(&ei->i_meta_lock); ++ /* mw: i_next_alloc_goal and i_next_alloc_block were removed in 2.6.24.x ++ * so we dont need to set them to 0 (they are anyway, somehow). ++ */ ++ //ei->i_next_alloc_goal = ei->i_next_alloc_block = 0; ++// write_unlock(&ei->i_meta_lock); ++ if (err < 0) { ++ goto done; ++ } ++ /* Note that ext2_free_cluster_blocks() marks the ++ cluster as compressed. */ ++ ++ /* Shuffle used blocks to beginning of block-number array. */ ++ { ++ struct ext2_bkey key; ++ unsigned i; ++ ++ if (!ext2_get_key(&key, ++ inode, ++ ext2_cluster_block0(inode, cluster))) { ++ ei->i_flags |= EXT2_ECOMPR_FL; ++ result = -EIO; ++ free_ix = 0; ++ } ++ for (i = 0; i < free_ix; i++) { ++ curr = ext2_get_key_blkaddr(&key); ++ ++ if ((c_nblk == free_ix) ++ && (curr != bh[i]->b_blocknr)) { ++ /* "Can't happen", yet has ++ happened a couple of times. 
*/ ++ ext2_error(inode->i_sb, "ext2_compress_cluster", ++ "c_nblk=free_ix=%d, " ++ "curr=%u, b_blocknr=%lu, " ++ "lhp=%d , hm=, " ++ "ino=%lu, blk=%u", ++ c_nblk, curr, ++ (unsigned long) bh[i]->b_blocknr, ++ (int) last_hole_pos, ++ /*mw: became invalid due put_cpu: ++ (unsigned) le32_to_cpu(*(u32 *) head-> ++ holemap),*/ ++ inode->i_ino, ++ (unsigned) ++ ext2_cluster_block0(inode, cluster) + i); ++ } ++ err = ext2_set_key_blkaddr(&key, ++ (i < c_nblk ++ ? bh[i]->b_blocknr ++ : EXT2_COMPRESSED_BLKADDR)); ++ if (err) ++ break; ++ if (!ext2_next_key(&key, 1)) { ++ ei->i_flags |= EXT2_ECOMPR_FL; /* sorry... */ ++ result = -EIO; ++ break; ++ } ++ } ++ ext2_free_key(&key); ++ } ++ } ++ ++ /* ++ * Unlock the working area. ++ */ ++ ++#ifdef EXT2_COMPR_REPORT_WA ++ printk(KERN_DEBUG "pid %d leaves critical region\n", current->pid); ++#endif ++ ++ assert(c_nblk < u_nblk); ++ ext2_mark_algorithm_use(inode, alg); ++ ++ /* TLL update b_assoc_map per 2.6.20 6-07-07 */ ++ for (i = 0; i < c_nblk; i++) ++ if (bh[i] != NULL) { ++ bh[i]->b_assoc_map = inode->i_mapping; ++ bh[i]->b_page->mapping = inode->i_mapping; //Andreas 5-24-07 : necessary? WRONG? ++ } ++ //mw: we must force the writeback, otherwise ext2_readpage will get confused ++ // yaboo ding had similiar code above. but I think it makes more sense after ++ // the block shuffeling. ++ // Note: generic_oysnc_inode() made trouble with USB-Sticks and caused a lot ++ // of IO, stalled system ... therefore ll_rw_block() replace it. Anyway we already operate ++ // with this low-level function. ++ ++ /*mw: new "hole" fix. hole == bdev bug! */ ++ for (i = 0; i < c_nblk; i++) { ++ ++ /* this was a hole (uncompressed) ++ * at the beginning of the cluster. ++ * so NO block was yet associated with it. ++ * But now we need it, because a compressed ++ * cluster always starts at the cluster.*/ ++ if (!buffer_mapped(bh[i]) || bh[i]->b_bdev == NULL) { ++ u32 block = ext2_cluster_block0(inode, cluster); ++ ext2_get_block(inode, block + i, bh[i], 1); ++ //printk("ext2_get_block Block:%lu, Mapped:%i, Page:%lu, bdev: %#x\n", bh[i]->b_blocknr, (bh[i]->b_state & BH_Mapped), (bh[i]->b_page ? bh[i]->b_page->index : 0), bh[i]->b_bdev ); ++ } ++ assert(buffer_mapped(bh[i])); ++ assert(bh[i]->b_bdev != NULL); ++ assert(bh[i]->b_bdev == inode->i_sb->s_bdev); ++ } ++ ++ ll_rw_block(WRITE, c_nblk, bh); ++ ++ CHECK_NOT_ATOMIC ++ //mw: seems we have to wait here, otherwise: crash! ++ for (i = 0; i < c_nblk; i++) { ++ if (bh[i]) ++ wait_on_buffer(bh[i]); ++ //printk("written compressed block: Block:%lu, Mapped:%i, Page:%lu, bdev: %#x\n", bh[i]->b_blocknr, (bh[i]->b_state & BH_Mapped), (bh[i]->b_page ? bh[i]->b_page->index : 0), bh[i]->b_bdev ); ++ } ++ ++ ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(NULL, pg, NULL); ++#endif ++ ++ inode->i_ctime = CURRENT_TIME; //mw: these two come always together. So I also put it here. ++ mark_inode_dirty_sync(inode); ++ ++ //ext2_update_inode(inode, inode_needs_sync(inode)); //mw: might be able to fix pipe_write vs. readpage. mutex-rec-locking ++ ++ /* COMPRBLK is already high, so no need to raise it. 
*/ ++ { ++ for (i = c_nblk; (i < EXT2_MAX_CLUSTER_BLOCKS) && (bh[i] != NULL); ++ i++) { ++ clear_buffer_dirty(bh[i]); ++ bh[i]->b_blocknr = 0; ++ clear_bit(BH_Mapped, &bh[i]->b_state); ++ clear_bit(BH_Uptodate, &bh[i]->b_state); ++ } ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (pg[i] == NULL) { ++ break; ++ } ++ assert(PageLocked(pg[i])); ++ ClearPageUptodate(pg[i]); ++ unlock_page(pg[i]); ++ page_cache_release(pg[i]); ++ } ++ ++ /* invalidate_inode_buffers replacement code: TLL 02/21/07 ++ * e2compr on post 2.6.10 kernels do not have an uptodate ++ * mapping->assoc_mapping (other Vm(?) changes require it be ++ * made explicit, 2.4 kernels have it implicit). Therefore, when ++ * umount is called, a GPF ensues from a NULL ops pointer. ++ * e2c on a USB thumbdrive mounted as the root fs does not ++ * support repeated compress/uncompress cycles on a given file. ++ * Inlined the flush list code to explicityly force update to ++ * disk with a known valid bh list. ++ */ ++ ++ /* mw: I consider this code as ... not so good! */ ++ /* ++ if (inode_has_buffers(inode)) { ++ //struct address_space *mapping = &inode->i_data; ++ // struct address_space *buffer_mapping = mapping->assoc_mapping; ++ // requires: inode->i_data->mapping->assoc_mapping; to be set ++ invalidate_inode_buffers(inode); // TLL do it proper 5-25-07 ++ //if (dotrunc) ++ //ext2_truncate(inode); // TLL file size hack 6-19-07 ++ } ++ */ ++ ++ } ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c(" < < < ext2_compress_cluster %i: [done cpr] inode=%ld\n", cluster, inode->i_ino); ++#endif ++ return result; ++ ++ ++ done: ++ ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(NULL, pg, NULL); ++#endif ++ ++ { ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (pg[i] == NULL) { ++ break; ++ } ++ unlock_page(pg[i]); ++ page_cache_release(pg[i]); ++ } ++ /* TLL cp to compr dir bug fix 03-25-07 ++ Truncate uncompressed files to their uncompressed ++ length, i.e. force kernel to update inode and sb */ ++ //if(dotrunc) ++ //26.08.2011: ext2_truncate(inode) does not exist anymore ++ ext2_truncate_blocks(inode, inode->i_size); ++ ++ } ++#ifdef EXT2_COMPR_REPORT_VERBOSE ++ { ++ int i; ++ ++ printk(KERN_DEBUG "ext2_compress_cluster[end]: buffers kept for cluster=%d\n", cluster); ++ for (i = 0; i < nbh; i++) { ++ if (bh[i]) { ++ printk(KERN_DEBUG "2buffer_head[%d]: blocknr=%lu, addr=0x%p ", i, (unsigned long) bh[i]->b_blocknr, bh[i]); ++ if (bh[i]->b_page) ++ printk(KERN_DEBUG "2:[page->index=%ld]\n", bh[i]->b_page->index); ++ else ++ printk(KERN_DEBUG "[No page]\n"); ++ } else ++ printk(KERN_DEBUG "buffer_head[%d] is NULL\n", i); ++ } ++ } ++#endif ++ ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c(" < < < ext2_compress_cluster %i: [done NO cpr] inode=%ld\n", cluster, inode->i_ino); ++#endif ++ return result; ++} ++ ++ ++/* Go through all the clusters and compress them if not already ++ compressed. ++ ++ This is called by ext2_put_inode() and ext2_release_file(). Later, ++ we may have ext2_ioctl() call it (when EXT2_COMPR_FL rises). None ++ of the callers does any locking, so we do it here. ++ ++ Neither of the current callers uses the return code, but we get ready ++ for if we start using it. ++ ++ Returns 0 on "success" (whether or not we cleared EXT2_CLEANUP_FL ++ or EXT2_DIRTY_FL bits), -errno on error. 
*/ ++int ext2_cleanup_compressed_inode(struct inode *inode) ++{ ++ u32 cluster; ++ u32 n_clusters; ++ int dirty = 0; ++ int err = 0; ++ u32 comprblk_mask; ++ atomic_t start_i_count = inode->i_count; ++ int retry = 0; ++ int have_downed; ++ struct ext2_inode_info *ei = EXT2_I(inode); ++#ifdef EXT2_COMPR_REPORT ++ char bdn[BDEVNAME_SIZE]; ++#endif ++ ++ /* impl: Actually, this assertion could fail if the kernel ++ isn't locked. I haven't looked, but I suppose that the ++ kernel always is locked when this is called. */ ++ assert(ei->i_compr_flags & EXT2_CLEANUP_FL); ++ ++#ifdef EXT2_COMPR_REPORT_PUT ++ printk(KERN_DEBUG "ext2_cleanup_compressed_inode() called for pid %d; " ++ "dev=%s, ino=%lu, i_state=0x%lx, i_count=%u\n", ++ current->pid, bdevname(inode->i_sb->s_bdev, bdn), inode->i_ino, ++ inode->i_state, atomic_read(&inode->i_count)); ++#endif ++ ++ /* Do these tests twice: once before down() and once after. */ ++ for (have_downed = 0;; have_downed++) { ++ if ((ei->i_flags & (EXT2_COMPR_FL | EXT2_DIRTY_FL)) ++ != (EXT2_COMPR_FL | EXT2_DIRTY_FL)) { ++ if (have_downed) ++ goto out; ++ /* TLL 5-25-07 changed from a warning to trace */ ++ /*trace_e2c("ext2_cleanup_compressed_inode: trying to un/compress an " ++ "uncompressable file.\n" ++ "i_flags=%#x. (dev=%s, ino=%lu, down=%d)\n", ++ ei->i_flags, bdevname(inode->i_sb->s_bdev, bdn), ++ inode->i_ino, have_downed); */ ++ return 0; ++ } ++ ++ /* test if file is mapped by mmap */ ++ if (mapping_mapped(inode->i_mapping)) ++ { ++ //trace_e2c("ext2_cleanup_compressed_inode: (dev. %s): ino=%ld: file mapped, does not compress cluster\n", bdevname(inode->i_sb->s_bdev, bdn), inode->i_ino); ++ if (have_downed) ++ goto out; ++ else ++ return 0; ++ } ++ ++ if (IS_RDONLY(inode) ++ || (ei->i_flags & EXT2_ECOMPR_FL)) { ++ ei->i_compr_flags &= ~EXT2_CLEANUP_FL; ++ if (have_downed) ++ goto out; ++ else ++ return 0; ++ } ++ ++ //mw ++ if (ext2_get_dcount(inode) > 1) { ++ err = 0; ++ //printk("Compress: file busy (dcount: %i>1)\n", ext2_get_dcount(inode)); ++ if (have_downed) ++ goto out; ++ else ++ return 0; ++ } ++ ++ if (have_downed) ++ break; ++ ++ /* Quotas aren't otherwise kept if file is opened O_RDONLY. */ ++ dquot_initialize(inode); ++ ++ /* Check whether OSYNC of inode is acutally running */ ++ //if (ei->i_compr_flags & EXT2_OSYNC_INODE) ++ //printk(KERN_DEBUG "OSYNC!\n"); ++ ++ /* I think: ++ * checking these flags should prevent that one Process aquires the MUTEX again, ++ * e.g. in a recursive call ++ * BUT: what happens acutally: two processes are working on this inode: pdflush and the userprogramm ++ * SO: the check might be correct if: ei->i_compr_flags & EXT2_OSYNC_INOD AND the same process already posesses this lock!!! ++ */ ++ //if (!(ei->i_compr_flags & EXT2_OSYNC_INODE)) ++ //{ ++ mutex_lock(&inode->i_mutex); ++#ifdef EXT2_COMPR_REPORT_MUTEX ++ printk(KERN_DEBUG "CLEANUP_LOCK of PID %u @ inode:%lu\n", current->pid, inode->i_ino); ++#endif ++ //} ++ } ++ n_clusters = ext2_n_clusters(inode); ++ ++#ifdef EXT2_COMPR_REPORT_PUT ++ printk(KERN_DEBUG "ext2: inode:%lu: put compressed, clusters = %d, flags = %x, pid = %u\n", ++ inode->i_ino, n_clusters, ei->i_flags, current->pid); ++#endif ++ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */ ++ ++ /* Try to compress the clusters. We clear EXT2_DIRTY_FL only ++ if we looked at every cluster and if there was no error. */ ++ ++ /* impl: We raise EXT2_COMPRBLK_FL now so that ext2_ioctl() ++ doesn't try to change the cluster size beneath us. 
If need ++ be, we restore the bit to its original setting before ++ returning. Note that no-one else can _change_ ++ EXT2_COMPRBLK_FL while we work because i_sem is down. */ ++ /* impl: Note what's happening here with comprblk_mask. The ++ current state of COMPRBLK_FL (before we start) is that ++ (comprblk == 1) || (no compressed clusters). At the end of ++ the procedure, comprblk == one if (at least one compressed ++ cluster, or an error occurred preventing us from finding ++ out). */ ++ comprblk_mask = ~EXT2_COMPRBLK_FL | ei->i_flags; ++ ei->i_flags |= EXT2_COMPRBLK_FL; ++ ++ for (cluster = 0; cluster < n_clusters; cluster++) { ++ if (atomic_read(&inode->i_count) > atomic_read(&start_i_count)) { ++ /* This is a poor way of doing this (and doubly ++ poor now that the only users of i_count are ++ the dentries), but the idea is not to ++ compress things tht are likely to be ++ decompressed soon. I guess a better way of ++ doing this would be just to make sure tht ++ the stuff is in the page cache. */ ++ retry = 1; ++ break; ++ } ++ err = ext2_cluster_is_compressed_fn(inode, cluster); ++ if (err == 0) { ++ //mw: ext2_compress_cluster might clean EXT2_COMPRBLK_FL, therefore raise it for every new cluster ++ ei->i_flags |= EXT2_COMPRBLK_FL; ++ ++ err = ext2_compress_cluster(inode, cluster); ++ if (err < 0) ++ dirty = 1; ++ else if (err > 0) ++ comprblk_mask = ~0ul; ++ } else if (err < 0) ++ break; ++ else { ++ err = 0; ++ assert(comprblk_mask == ~0ul); /* i.e. that EXT2_COMPRBLK_FL was high. */ ++ } ++ } ++ ++ if ((cluster >= n_clusters) && !dirty) ++ ei->i_flags &= ~EXT2_DIRTY_FL; ++ if (!retry) { ++ ei->i_compr_flags &= ~EXT2_CLEANUP_FL; ++ ei->i_flags &= comprblk_mask; ++ } ++ ++ /* We clear EXT2_CLEANUP_FL because, otherwise, we'll get ++ called again almost immediately. */ ++ ++ /* ++ * The CLEANUP flag *MUST* be cleared, otherwise the iput routine ++ * calls ext2_put_inode() again (because i_dirt is set) and there ++ * is a loop. The control scheme (CLEANUP + DIRTY flags) could ++ * probably be improved. On the other hand, i_dirt MUST be set ++ * because we may have sleeped, and we must force the iput routine ++ * to look again at the i_count ... ++ */ ++ /* TODO: Have a look at this cleanup scheme. The above ++ comment sounds wrong. */ ++ ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty_sync(inode); ++ out: ++ ++#ifdef EXT2_COMPR_REPORT_MUTEX ++ printk(KERN_DEBUG "CLEANUP_UNLOCK of PID %u @ inode:%lu\n", current->pid, inode->i_ino); ++#endif ++ ++// if (!(ei->i_compr_flags & EXT2_OSYNC_INODE)) { /* MW 5-16-07 */ ++ mutex_unlock(&inode->i_mutex); ++// } /* MW 5-16-07 */ ++ return err; /* TODO: Check that ,err` is appropriate. */ ++} ++ ++ ++int ext2_recognize_compressed(struct inode *inode, unsigned cluster) ++{ ++ /* ext2_recognize_compressed(): Check tht the cluster is valid ++ in every way, and then do the EXT2_COMPRESSED_BLKADDR ++ thing. */ ++ /* nyi, fixme. All of the userspace stuff (EXT2_NOCOMPR_FL ++ etc.) needs work, so I might as well leave this. See ++ ioctl.c for a description of what it's supposed to do. */ ++ return -ENOSYS; ++} ++ ++ ++/* Look for compressed clusters. If none, then clear EXT2_COMPRBLK_FL. ++ ++ Called by: ++ ext2_truncate(). 
++ */ ++void ext2_update_comprblk(struct inode *inode) ++{ ++ unsigned block, last_block; ++ struct ext2_bkey key; ++ struct ext2_inode_info *ei = EXT2_I(inode); ++ ++ assert(ei->i_flags & EXT2_COMPRBLK_FL); ++ if (inode->i_size == 0) { ++ ei->i_flags &= ~EXT2_COMPRBLK_FL; ++ trace_e2c("ext2_update_comprblk 1: inode: %lu removed EXT2_COMPRBLK_FL!\n", inode->i_ino); ++ return; ++ } ++ last_block = ROUNDUP_RSHIFT(inode->i_size, ++ inode->i_sb->s_blocksize_bits) - 1; ++ block = ext2_first_cluster_nblocks(inode) - 1; ++ ++ assert(atomic_read(&inode->i_mutex.count) <= 0); ++ ++ if (!ext2_get_key(&key, inode, block)) ++ return; ++ for (;;) { ++ if (ext2_get_key_blkaddr(&key) == EXT2_COMPRESSED_BLKADDR) ++ goto out; ++ if (block >= last_block) ++ goto clear; ++ if (!ext2_next_key(&key, ei->i_clu_nblocks)) ++ goto out; ++ block += ei->i_clu_nblocks; ++ } ++ clear: ++ trace_e2c("ext2_update_comprblk 2: inode: %lu removed EXT2_COMPRBLK_FL!\n", inode->i_ino); ++ ei->i_flags &= ~EXT2_COMPRBLK_FL; ++ out: ++ ext2_free_key(&key); ++ assert(atomic_read(&inode->i_mutex.count) <= 0); ++ ++} ++ ++ ++/* ++ * allocate working areas ++ */ ++ ++DEFINE_PER_CPU(struct ext2_wa_S *, ext2_rd_wa) = NULL; ++DEFINE_PER_CPU(struct ext2_wa_S *, ext2_wr_wa) = NULL; ++ ++/* SMP, setup wa's. caller must hold wa already via get_cpu_var */ ++void ext2_alloc_rd_wa(){ ++ if ((__get_cpu_var(ext2_rd_wa) == NULL) ) { ++ size_t rsize = 2 * EXT2_MAX_CLUSTER_BYTES; //mw: just guessing ++ ++ __get_cpu_var(ext2_rd_wa) = vmalloc (rsize); ++ if (__get_cpu_var(ext2_rd_wa) == NULL) ++ printk ("EXT2-fs: can't allocate working area; compression turned off.\n"); ++ else { ++ printk ("ext2-compression: allocated read buffer for CPU%i at %p-%p (%zu bytes)\n", ++ get_cpu(), __get_cpu_var(ext2_rd_wa), (char *)__get_cpu_var(ext2_rd_wa) + rsize, rsize); ++# ifdef EXT2_COMPR_REPORT_WA ++ printk (KERN_INFO "EXT2-fs: rd_wa=%p--%p (%d)\n", ++ ext2_rd_wa, (char *)ext2_rd_wa + rsize, rsize); ++# endif ++ put_cpu(); ++ } ++ } ++} ++ ++void ext2_alloc_wr_wa(){ ++ ++ if ((__get_cpu_var(ext2_wr_wa) == NULL) ) { ++ size_t wsize = 2 * EXT2_MAX_CLUSTER_BYTES; //mw: just guessing ++ __get_cpu_var(ext2_wr_wa) = vmalloc (wsize); ++ ++ if (__get_cpu_var(ext2_wr_wa) == NULL) ++ printk ("EXT2-fs: can't allocate working area; " ++ "compression turned off.\n"); ++ else { ++ printk ("ext2-compression: allocated write buffer for CPU%i at %p-%p (%zu bytes)\n", ++ get_cpu(), __get_cpu_var(ext2_wr_wa), (char *)__get_cpu_var(ext2_wr_wa) + wsize, wsize); ++#ifdef EXT2_COMPR_REPORT_WA ++ printk (KERN_INFO "EXT2-fs: wr_wa=%p--%p (%d)\n", ++ ext2_wr_wa, (char *)ext2_wr_wa + wsize, wsize); ++#endif ++ put_cpu(); ++ } ++ } ++} ++ ++ +--- linux-3.4-rc5/fs/ext2/e2zlib.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-3.4-rc5-e2c/fs/ext2/e2zlib.c 2012-04-30 04:11:03.795143099 -0400 +@@ -0,0 +1,74 @@ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static DEFINE_PER_CPU(struct crypto_comp *, tfm) = NULL; ++ ++size_t ext2_iZLIB(int action) ++{ ++ /*mw: we init tfm when we need it...*/ ++ return 0; ++} ++ ++ ++size_t ext2_wZLIB(__u8 * ibuf, __u8 * obuf, void *heap, ++ size_t ilen, size_t olen, int level) ++{ ++ int ret, dlen; ++ ++ if (!try_module_get(THIS_MODULE)) ++ return 0; ++ ++ /*check if we already have a tfm*/ ++ get_cpu_var(tfm); ++ if (__get_cpu_var(tfm) == NULL){ ++ __get_cpu_var(tfm) = crypto_alloc_comp("deflate", 0, CRYPTO_ALG_ASYNC); ++ } ++ assert(__get_cpu_var(tfm) != NULL); ++ ++ dlen = olen; ++ ret = 
crypto_comp_compress(__get_cpu_var(tfm) , ibuf, ilen, obuf, &dlen); ++ ++ put_cpu_var(tfm); ++ ++ if (ret) { ++ //printk(KERN_DEBUG "ext2_wZLIB: crypto_comp_compress failed: %d, ilen: %d, olen: %d\n", ret, ilen, olen); ++ return 0; ++ } ++ return dlen; ++} ++ ++ ++size_t ext2_rZLIB(__u8 * ibuf, __u8 * obuf, void *heap, ++ size_t ilen, size_t olen, int ignored) ++{ ++ int ret, dlen; ++ ++ if (!try_module_get(THIS_MODULE)) ++ return 0; ++ ++ /*check if we already have a tfm*/ ++ get_cpu_var(tfm); ++ if (__get_cpu_var(tfm) == NULL){ ++ __get_cpu_var(tfm) = crypto_alloc_comp("deflate", 0, CRYPTO_ALG_ASYNC); ++ } ++ assert(__get_cpu_var(tfm) != NULL); ++ ++ dlen = olen; ++ ret = crypto_comp_decompress(__get_cpu_var(tfm), ibuf, ilen, obuf, &dlen); ++ ++ put_cpu_var(tfm); ++ ++ if (ret) { ++ //printk(KERN_DEBUG "ext2_wZLIB: crypto_comp_decompress failed: %d, ilen: %d, olen: %d\n", ret, ilen, olen); ++ return 0; ++ } ++ ++ return dlen; ++} +--- linux-3.4-rc5/fs/ext2/adler32.c 1969-12-31 19:00:00.000000000 -0500 ++++ linux-3.4-rc5-e2c/fs/ext2/adler32.c 2012-04-30 04:11:03.795143099 -0400 +@@ -0,0 +1,43 @@ ++/* adler32.c -- compute the Adler-32 checksum of a data stream ++ * Copyright (C) 1995-1998 Mark Adler ++ * For conditions of distribution and use, see copyright notice in zlib.h ++ */ ++ ++/* @(#) $Id: e2compr2.6.25.patch,v 1.1.2.1 2008/04/17 09:49:32 winkler Exp $ */ ++ ++#define BASE 65521L /* largest prime smaller than 65536 */ ++#define NMAX 5552 ++/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ ++ ++#define DO1(buf,i) {s1 += buf[i]; s2 += s1;} ++#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1); ++#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2); ++#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); ++#define DO16(buf) DO8(buf,0); DO8(buf,8); ++ ++/* ========================================================================= */ ++unsigned long ext2_adler32(unsigned long adler, const unsigned char *buf, unsigned int len) ++{ ++ unsigned long s1 = adler & 0xffff; ++ unsigned long s2 = (adler >> 16) & 0xffff; ++ int k; ++ ++ if (buf == 0) return 1L; ++ ++ while (len > 0) { ++ k = len < NMAX ? 
len : NMAX; ++ len -= k; ++ while (k >= 16) { ++ DO16(buf); ++ buf += 16; ++ k -= 16; ++ } ++ if (k != 0) do { ++ s1 += *buf++; ++ s2 += s1; ++ } while (--k); ++ s1 %= BASE; ++ s2 %= BASE; ++ } ++ return (s2 << 16) | s1; ++} +--- linux-3.4-rc5/fs/ext2/super.c 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/fs/ext2/super.c 2012-04-30 04:11:03.797143097 -0400 +@@ -32,7 +32,12 @@ + #include + #include + #include ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#include ++#else + #include "ext2.h" ++#endif + #include "xattr.h" + #include "acl.h" + #include "xip.h" +@@ -392,7 +397,11 @@ enum { + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, + Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, + Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, +- Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, ++ Opt_acl, Opt_noacl, ++#ifdef CONFIG_EXT2_COMPRESS ++ Opt_force_compat, ++#endif ++ Opt_xip, Opt_ignore, Opt_err, Opt_quota, + Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation + }; + +@@ -425,6 +434,9 @@ static const match_table_t tokens = { + {Opt_ignore, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, ++#ifdef CONFIG_EXT2_COMPRESS ++ {Opt_force_compat, "force-compat"}, ++#endif + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, + {Opt_err, NULL} +@@ -568,6 +580,11 @@ static int parse_options(char *options, + clear_opt(sbi->s_mount_opt, RESERVATION); + ext2_msg(sb, KERN_INFO, "reservations OFF"); + break; ++#ifdef CONFIG_EXT2_COMPRESS ++ case Opt_force_compat: ++ set_opt(sbi->s_mount_opt, FORCE_COMPAT); ++ break; ++#endif + case Opt_ignore: + break; + default: +@@ -584,6 +601,10 @@ static int ext2_setup_super (struct supe + int res = 0; + struct ext2_sb_info *sbi = EXT2_SB(sb); + ++#ifdef CONFIG_EXT2_COMPRESS ++ printk (KERN_INFO E2COMPR_VERSION "\n"); ++#endif ++ + if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) { + ext2_msg(sb, KERN_ERR, + "error: revision level too high, " +@@ -875,6 +896,65 @@ static int ext2_fill_super(struct super_ + le32_to_cpu(features)); + goto failed_mount; + } ++#ifdef CONFIG_EXT2_COMPRESS ++ /* Check that required algorithms are available. */ ++ /* todo: Provide a mount option to override this. */ ++ /* ++ * Philosophical bug: we assume that an algorithm's ++ * module is available if and only if this kernel was ++ * compiled with that algorithm as a module. This is ++ * untrue, but it is unclear what the right thing to ++ * do is. ++ */ ++ j = 0; /* error flag */ ++ if ((es->s_feature_incompat ++ & cpu_to_le32(EXT2_FEATURE_INCOMPAT_COMPRESSION)) ++ && (es->s_algorithm_usage_bitmap ++ & ~cpu_to_le32(EXT2_ALGORITHMS_SUPP))) { ++ /* ++ * The filesystem employs an algorithm not ++ * supported by this filesystem. Issue warning or ++ * error. ++ */ ++ for (i = 0; i < 32; i++) { ++ if (!(es->s_algorithm_usage_bitmap ++ & cpu_to_le32(1 << i)) ++ || ((EXT2_ALGORITHMS_SUPP ++ & (1 << i)))) ++ continue; ++ /* ++ * TODO: Can't this message be moved outside ++ * of the for loop? 
++ */ ++ if (!j) { ++ if (test_opt(sb, FORCE_COMPAT)) ++ printk(KERN_WARNING ++ "EXT2-fs: %s: " ++ "uses unsupported " ++ "compression algorithms", ++ sb->s_id); ++ else ++ printk("EXT2-fs: %s: couldn't mount " ++ "because of unsupported " ++ "compression algorithms", ++ sb->s_id); ++ j = 1; ++ } ++ if (i < EXT2_N_ALGORITHMS) ++ printk(" %s", ext2_algorithm_table[i].name); ++ else ++ printk(" %u", i); ++ } ++ } ++ if (j) { ++ if (test_opt(sb, FORCE_COMPAT)) ++ printk(" but ignoring as you request.\n"); ++ else { ++ printk(".\n"); ++ goto failed_mount; ++ } ++ } ++#endif /* CONFIG_EXT2_COMPRESS */ + if (!(sb->s_flags & MS_RDONLY) && + (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){ + ext2_msg(sb, KERN_ERR, "error: couldn't mount RDWR because of " +--- linux-3.4-rc5/fs/ext2/ialloc.c 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/fs/ext2/ialloc.c 2012-04-30 04:11:03.797143097 -0400 +@@ -470,6 +470,9 @@ struct inode *ext2_new_inode(struct inod + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, group); + if (!bitmap_bh) { ++#ifdef CONFIG_EXT2_COMPRESS ++ EXT2_I(inode)->i_flags &= ~EXT2_COMPR_FL; ++#endif + err = -EIO; + goto fail; + } +@@ -558,6 +561,17 @@ got: + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_flags = + ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); ++#ifdef CONFIG_EXT2_COMPRESS ++ /* ++ * The EXT2_COMPR flag is inherited from the parent ++ * directory as well as the cluster size and the compression ++ * algorithm. ++ */ ++ ei->i_log2_clu_nblocks = EXT2_I(dir)->i_log2_clu_nblocks; ++ ei->i_clu_nblocks = EXT2_I(dir)->i_clu_nblocks; ++ ei->i_compr_method = EXT2_I(dir)->i_compr_method; ++ ei->i_compr_flags = 0; ++#endif + ei->i_faddr = 0; + ei->i_frag_no = 0; + ei->i_frag_size = 0; +--- linux-3.4-rc5/fs/ext2/balloc.c 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/fs/ext2/balloc.c 2012-04-30 04:11:03.798143097 -0400 +@@ -11,8 +11,13 @@ + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#include ++#else + #include "ext2.h" + #include ++#endif + #include + #include + #include +@@ -499,6 +504,13 @@ void ext2_free_blocks (struct inode * in + struct ext2_super_block * es = sbi->s_es; + unsigned freed = 0, group_freed; + ++ ++#ifdef CONFIG_EXT2_COMPRESS ++ assert((block != EXT2_COMPRESSED_BLKADDR) ++ || !S_ISREG(inode->i_mode) ++ || !(EXT2_SB(sb)->s_es->s_feature_incompat ++ & cpu_to_le32(EXT2_FEATURE_INCOMPAT_COMPRESSION))); ++#endif + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > le32_to_cpu(es->s_blocks_count)) { +--- linux-3.4-rc5/fs/ext2/inode.c 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/fs/ext2/inode.c 2012-04-30 04:11:03.803143097 -0400 +@@ -31,10 +31,45 @@ + #include + #include + #include ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#include ++#include ++#include ++#else + #include "ext2.h" ++#endif + #include "acl.h" + #include "xip.h" + ++#ifdef CONFIG_EXT2_COMPRESS ++/* mw: this function counts all references ++ * to this inode. this is necessary to ++ * refuse un/compression if the file has ++ * more than one refernce, I guess. 
*/ ++int ext2_get_dcount(struct inode *inode) ++{ ++ struct dentry *dentry; ++ struct list_head *head, *next, *tmp; ++ int count; ++ ++ head = &inode->i_dentry; ++ next = inode->i_dentry.next; ++ count = 0; ++ while (next != head) { ++ dentry = list_entry(next, struct dentry, d_alias); ++ tmp = next; ++ next = tmp->next; ++ spin_lock(&dentry->d_lock); ++ count += dentry->d_count; ++ spin_unlock(&dentry->d_lock); ++ //mw: similar to fs/dcache.c ++ } ++ ++ return count; ++} ++#endif ++ + static int __ext2_write_inode(struct inode *inode, int do_sync); + + /* +@@ -49,7 +84,9 @@ static inline int ext2_inode_is_fast_sym + inode->i_blocks - ea_blocks == 0); + } + ++#ifndef CONFIG_EXT2_COMPRESS + static void ext2_truncate_blocks(struct inode *inode, loff_t offset); ++#endif + + static void ext2_write_failed(struct address_space *mapping, loff_t to) + { +@@ -235,7 +272,11 @@ static Indirect *ext2_get_branch(struct + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, EXT2_I(inode)->i_data + *offsets); ++#ifdef CONFIG_EXT2_COMPRESS ++ if (HOLE_BLKADDR(p->key)) ++#else + if (!p->key) ++#endif + goto no_block; + while (--depth) { + bh = sb_bread(sb, le32_to_cpu(p->key)); +@@ -246,7 +287,11 @@ static Indirect *ext2_get_branch(struct + goto changed; + add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + read_unlock(&EXT2_I(inode)->i_meta_lock); ++#ifdef CONFIG_EXT2_COMPRESS ++ if (HOLE_BLKADDR(p->key)) ++#else + if (!p->key) ++#endif + goto no_block; + } + return NULL; +@@ -292,7 +337,11 @@ static ext2_fsblk_t ext2_find_near(struc + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) ++#ifdef CONFIG_EXT2_COMPRESS ++ if (!HOLE_BLKADDR(*p)) ++#else + if (*p) ++#endif + return le32_to_cpu(*p); + + /* No such thing, so let's try location of indirect block */ +@@ -493,7 +542,13 @@ static int ext2_alloc_branch(struct inod + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + branch[n].bh = bh; ++#ifndef CONFIG_EXT2_COMPRESS + lock_buffer(bh); ++#else ++ CHECK_NOT_ATOMIC ++ if (!buffer_uptodate(bh)) ++ wait_on_buffer(bh); ++#endif + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); +@@ -509,7 +564,9 @@ static int ext2_alloc_branch(struct inod + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + set_buffer_uptodate(bh); ++#ifndef CONFIG_EXT2_COMPRESS + unlock_buffer(bh); ++#endif + mark_buffer_dirty_inode(bh, inode); + /* We used to sync bh here if IS_SYNC(inode). + * But we now rely upon generic_write_sync() +@@ -670,6 +727,7 @@ static int ext2_get_blocks(struct inode + if (err == -EAGAIN || !verify_chain(chain, partial)) { + while (partial > chain) { + brelse(partial->bh); ++// bforget(partial->bh); /*mw: e2c-pre-2.6.30.4 used bforget here*/ + partial--; + } + partial = ext2_get_branch(inode, depth, offsets, chain, &err); +@@ -761,21 +819,608 @@ int ext2_fiemap(struct inode *inode, str + ext2_get_block); + } + ++#ifdef CONFIG_EXT2_COMPRESS ++/* ++ * Readpage method that will take care of decompression. ++ */ ++/* effic: I (pjm) think tht at present, reading a 32KB cluster 4KB at ++ a time does `decompress 4KB' for the first 4KB, then `decompress ++ 8KB' for the second, and so on. See if we can provide the page ++ cache with all the pages in a cluster. The problem is, we don't ++ want to erase anything tht hasn't been written to disk, so we can't ++ just call update_vm_cache(). 
The plan at present is to remember ++ what the contents of ext2_rd_wa.u come from, and don't bother ++ decompressing anything if the working area already contains the ++ right data. However, this is only a win where adjacent calls to ++ ext2_decompress_blocks() request the same cluster. We could force ++ that by copying some code from generic_file_read() (but check for ++ deadlocks before doing anything like that), but instead I'm taking ++ the more passive approach of hoping for the best. */ ++static int ext2_readpage(struct file *file, struct page *page) ++{ ++ struct inode *inode = page->mapping->host; ++ struct page *pg[EXT2_MAX_CLUSTER_PAGES], *epg[EXT2_MAX_CLUSTER_PAGES]; ++ u32 cluster0, max_cluster; ++ int i, blockOfCluster, blocksToDo, npg; ++ const int inc = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; ++ struct ext2_inode_info *ei = EXT2_I(page->mapping->host); ++#ifdef CONFIG_HIGHMEM ++ int kmapped = 0; //mw ++#endif ++ ++ int iClusterCnt; ++ ++ /* For directories, fall out through default routine */ ++ if (S_ISDIR(inode->i_mode)) ++ { ++ int rc; ++ ++ rc = block_read_full_page(page,ext2_get_block); ++ assert(!rc); ++ return rc; ++ } ++ ++ /* The semaphore prevents us trying to compress and decompress ++ the cluster at the same time, or compress a cluster in the ++ middle of reading it (thinking it to be uncompressed). ++ ++ You may not like the fact that we hold the semaphore across ++ readpage (given that it isn't held without e2compr compiled ++ in), but it does guarantee that we won't compress the ++ cluster during readpage. (OTOH, it's unlikely, if not ++ impossible, for someone to ,compress a cluster and rewrite ++ the blocks` before the readpage completes.) */ ++ /* This procedure used to have `#ifndef EXT2_LOCK_BUFFERS' ++ around all the semaphore stuff, and unlocked each buffer ++ before brelsing them ifdef EXT2_LOCK_BUFFERS. I (pjm, ++ 1998-01-20) have removed that because (a) EXT2_LOCK_BUFFERS ++ isn't #defined anywhere, and doesn't appear outside of this ++ function, and (b) I haven't looked at what effect locking ++ the buffers has. You may like to reintroduce the idea of ++ buffer locking to this function if you're more familiar ++ with buffer locking than I, and believe that the full i_sem ++ isn't necessary to protect from races (people seeing raw ++ compressed data) between readpage and ext2_file_write(), ++ ext2_compress_cluster() and ext2_truncate(). */ ++ unlock_page(page); ++ mutex_lock(&inode->i_mutex); ++ ++ assert (atomic_read(&inode->i_mutex.count) <= 0); /* i.e. mutex_lock */ ++ ++ //mw: added EXT2_COMPR_FL, because EXT2_COMPRBLK_FL mit change without mutex !!! ++ if ( !(ei->i_flags & (EXT2_COMPRBLK_FL|EXT2_COMPR_FL)) ++ || (ei->i_flags & EXT2_NOCOMPR_FL) ) ++ { ++ goto readpage_uncompressed; ++ } ++ ++ { ++ register u32 blockOfFile ++ = (page->index << PAGE_CACHE_SHIFT) >> inode->i_sb->s_blocksize_bits; ++ ++ blocksToDo = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; ++ cluster0 = ext2_block_to_cluster(inode, blockOfFile); ++ max_cluster = ext2_block_to_cluster ++ (inode, blockOfFile + blocksToDo - 1); ++ blockOfCluster ++ = blockOfFile - ext2_cluster_block0(inode, cluster0); ++ } ++ ++ /* return -???, any idea which code. do_generic_file_read() cares, ext2_readpages() doesn't. ++ maybe I should look at the "generic" readpage() and see what it returns in this case */ ++ ++ /* Check if any part of the requested area contains part of a ++ compressed cluster. If not, we can use default ext2_readpage(). 
++ ++ (Note that we don't have to worry about a cluster becoming ++ compressed in the meantime, because we have the semaphore.) ++ ++ A page can cover up to 9 clusters. (The maximum can only ++ occur with 32KB pages, 4KB clusters, and a non-page-aligned ++ offset. Thanks go to Kurt Fitzner for reporting that ++ page offsets needn't be aligned; see generic_file_mmap().) */ ++ { ++ int isCmp[(PAGE_SIZE >> 12) + 1]; ++ u8 *dst; ++ unsigned clu_ix; ++ ++ assert (max_cluster - cluster0 < sizeof(isCmp)/sizeof(*isCmp)); ++ for (clu_ix = 0; cluster0 + clu_ix <= max_cluster; clu_ix++) { ++ isCmp[clu_ix] = ext2_cluster_is_compressed_fn (inode, cluster0 + clu_ix); ++ if (isCmp[clu_ix] < 0){ ++ printk("IO-ERROR: isCmp\n"); ++ goto io_error; ++ } ++ } ++ ++ for (clu_ix = 0; cluster0 + clu_ix <= max_cluster; clu_ix++) ++ if (isCmp[clu_ix] > 0) ++ goto readpage_compressed; ++ /* fall through */ ++ readpage_uncompressed: ++ { ++ int rc=0; ++ lock_page(page); ++ ++ /* Did somebody else fill it already? */ ++ if (PageUptodate(page) ){ //mw: necessary for DEBUG! anyway checked in do_generic_mapping_read ++ unlock_page(page); ++ } ++ else { ++ //try_to_free_buffers(page); ++ rc = block_read_full_page(page,ext2_get_block); ++ } ++ mutex_unlock(&inode->i_mutex); ++ assert(!rc); ++ return rc; ++ } ++ ++ readpage_compressed: ++ ++ /* Copied from block_read_full_page */ ++ /* if (!PageLocked(page)) */ ++ /* PAGE_BUG(page); */ ++ lock_page(page); ++ if (PageUptodate(page)) { ++ unlock_page(page); ++ mutex_unlock(&inode->i_mutex); ++ return(0); ++ } ++ get_page(page); ++ ++ ClearPageUptodate(page); ++ ClearPageError(page); ++ ++ dst = (u8 *) page_address(page); ++ for (clu_ix = 0; cluster0 + clu_ix <= max_cluster; clu_ix++) { ++ struct buffer_head *bh[EXT2_MAX_CLUSTER_BLOCKS]; ++ int nbh, blocksThisClu; ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ pg[i] = NULL; ++ epg[i] = NULL; ++ } ++ ++ /* clear_bit(PG_locked, &page->flags); */ ++ npg = ext2_cluster_npages(inode, cluster0 + clu_ix); ++ nbh = ext2_get_cluster_pages(inode, cluster0 + clu_ix, pg, page, 0); ++ ++ if (nbh <= 0) { ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) ++ printk("no pages\n"); ++ goto out; ++ } ++ iClusterCnt = ext2_cluster_npages(inode, cluster0); ++ ++ nbh = ext2_get_cluster_extra_pages(inode, cluster0 + clu_ix, pg, epg); ++ if (nbh <= 0) ++ { ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) ++ epg[i] = NULL; ++ printk("no extra pages\n"); ++ goto out; ++ } ++ assert (iClusterCnt = ext2_cluster_npages(inode, cluster0)); ++ ++#ifdef CONFIG_HIGHMEM ++ ext2_kmap_cluster_pages(page, pg, epg); ++ kmapped = 1; ++#endif ++ ++ nbh = ext2_get_cluster_blocks(inode, cluster0 + clu_ix, bh, pg, epg, 0); ++ if (nbh <= 0) ++ { ++ printk("no blocks\n"); ++ goto out; ++ } ++ ++ /* How many blocks (including holes) we need from this cluster. */ ++ { ++ blocksThisClu = (ext2_cluster_nblocks(inode, cluster0 + ++ clu_ix) - blockOfCluster); ++ if (blocksThisClu > blocksToDo) ++ blocksThisClu = blocksToDo; ++ } ++ ++ if (isCmp[clu_ix]) { ++ u8 const *src; ++ int n, nbytes_wanted; ++ struct ext2_cluster_head *head; ++ unsigned meth; ++# ifdef CONFIG_KMOD ++ unsigned alg; ++# endif ++ ++ bh[0]->b_data = page_address(bh[0]->b_page); ++ head = (struct ext2_cluster_head *) bh[0]->b_data; ++ ++ /* jmr 1998-10-28 Hope this is the last time I'm moving this code. ++ * Module loading must be done _before_ we lock wa, just think what ++ * can happen if we reallocate wa when somebody else uses it... 
++ */ ++ meth = head->method; /* only a byte, so no swabbing needed. */ ++ if (meth >= EXT2_N_METHODS) { ++ printk("illegal method id\n"); ++ ext2_msg(inode->i_sb, ++ "illegal method id", ++ "inode = %lu, id = %u", ++ inode->i_ino, meth); ++ goto out; ++ } ++# ifdef CONFIG_KMOD ++ alg = ext2_method_table[meth].alg; ++ if (!ext2_algorithm_table[alg].avail) { ++ char str[32]; ++ ++ sprintf(str, "ext2-compr-%s", ext2_algorithm_table[alg].name); ++ request_module(str); ++ } ++# endif /* CONFIG_KMOD */ ++ ++ /* Calculate nbytes_wanted. */ ++ { ++ unsigned nblk_wanted, i; ++ ++ /* We want to decompress the whole cluster */ ++ //nblk_wanted = ext2_cluster_nblocks(inode, cluster0 + clu_ix); ++ nblk_wanted = npg << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); /*mw: FIXED */ ++ ++ for (i = nblk_wanted; i != 0;) ++ if (((--i >> 3) < head->holemap_nbytes) ++ && (head->holemap[i >> 3] & (1 << (i & 7)))) ++ --nblk_wanted; ++ nbytes_wanted = (nblk_wanted ++ << inode->i_sb->s_blocksize_bits); ++ } ++ ++ /* Decompress. */ ++ get_cpu_var(ext2_rd_wa); ++ if (__get_cpu_var(ext2_rd_wa) == NULL) ++ { ++ ext2_alloc_rd_wa(); ++ } ++ assert(__get_cpu_var(ext2_rd_wa) != NULL); ++ ++ n = ext2_decompress_blocks(inode, bh, nbh, nbytes_wanted, cluster0 + clu_ix); ++ if (n < 0) { ++ assert(nbh >= 0); ++ printk("ext2_readpage: noblocks decompressed\n"); ++ put_cpu_var(ext2_rd_wa); ++ goto out; ++ } ++ ++# ifdef EXT2_COMPR_REPORT_VERBOSE_INODE ++ if (ei->i_flags & EXT2_COMPR_FL) ++ printk(KERN_DEBUG "ext2: mmap %04x:%lu: blocksToDo=%d, blockOfCluster=%d, blocksThisClu=%d, clu_nblocks=%d\n", ++ inode->i_rdev, ++ inode->i_ino, ++ blocksToDo, ++ blockOfCluster, ++ blocksThisClu, ++ ext2_cluster_nblocks(inode, cluster0 + clu_ix)); ++# endif ++ ++ /* */ ++ { ++ unsigned i; ++ int ipg; ++ ++ i = ext2_cluster_nblocks(inode, cluster0 + clu_ix) - 1; ++ //i = (npg << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) - 1; /*mw: FIXED!!! 
(here: shift = 2Bit) */ ++ //if(i+1 != ext2_cluster_nblocks(inode, cluster0 + clu_ix)) ++ //printk("npg=%i, nbh=%i, npgf=%i, nbhf =%i, cluster:%i, dec_blk:%i, b_wanted:%i, size:%i\n ", ext2_cluster_npages(inode, cluster0 + clu_ix), ext2_cluster_nblocks(inode, cluster0 + clu_ix), npgtest, i+1, cluster0 + clu_ix, n, nbytes_wanted, inode->i_size); ++ blockOfCluster = 0; ++ assert(n > 0); ++ src = __get_cpu_var(ext2_rd_wa)->u + nbytes_wanted - inode->i_sb->s_blocksize; ++#ifdef EXT2_COMPR_REPORT ++ trace_e2c("ext2_readpage: copy data inc=%d blocksThisClu=%d, n=%d\n", inc, blocksThisClu, n); ++#endif ++ for (ipg = npg - 1; ipg >= 0; ipg--) { ++ if (pg[ipg] == NULL) { ++ i -= inc; ++ src -= PAGE_SIZE; ++ continue; ++ } ++ if (((inode->i_size-1) >> PAGE_SHIFT) == pg[ipg]->index) { ++ n = ((inode->i_size-1) & (PAGE_SIZE -1)) >> inode->i_sb->s_blocksize_bits; ++ i -= ((blocksThisClu-1) - n); ++ src -= ((blocksThisClu-1) - n) << inode->i_sb->s_blocksize_bits; ++ } else { ++ n = blocksThisClu - 1; ++ } ++ if (PageUptodate(pg[ipg]) ) { ++ for (;n >= 0;n--, i--) { ++ if (((i >> 3) >= head->holemap_nbytes) ++ || !(head->holemap[i >> 3] & (1 << (i & 7)))) { ++ src -= inode->i_sb->s_blocksize; ++ } ++ } ++ } else { ++ ++ dst = (u8 *) page_address(pg[ipg]) + (n << inode->i_sb->s_blocksize_bits); ++ ++ for (; ++ n >= 0; ++ n--, i--, dst -= inode->i_sb->s_blocksize) { ++ assert(!buffer_dirty(bh[i])); ++ clear_buffer_dirty(bh[i]); //mw: had a refile_buffer in 2.4 ++ if (((i >> 3) >= head->holemap_nbytes) ++ || !(head->holemap[i >> 3] & (1 << (i & 7)))) { ++ assert(i >= 0); ++ memcpy(dst, src, inode->i_sb->s_blocksize); ++ src -= inode->i_sb->s_blocksize; ++ } else { ++ assert(i >= 0); ++ memset (dst, 0, inode->i_sb->s_blocksize); ++ } ++ //clear_bit(BH_Uptodate, &bh[i]->b_state); ++ } ++ SetPageUptodate(pg[ipg]); ++ } ++ } ++ } ++ put_cpu_var(ext2_rd_wa); ++ } else { ++ /* Uncompressed cluster. Just copy the data. 
*/ ++ int n; ++ ++# ifdef EXT2_COMPR_REPORT_VERBOSE_INODE ++ if (ei->i_flags & EXT2_COMPR_FL) ++ printk(KERN_DEBUG ++ "ext2: mmap %lu: blocksToDo = %d, " ++ "blockOfCluster = %d, clu_nblocks = %d\n", ++ inode->i_ino, blocksToDo, blockOfCluster, ++ ext2_cluster_nblocks(inode, cluster0 + ++ clu_ix)); ++# endif ++ ++ for (n = 0; ++ n < blocksThisClu; ++ n++, dst += inode->i_sb->s_blocksize) { ++ if ((blockOfCluster + n < nbh) ++ && (bh[blockOfCluster + n] != NULL)) ++ { ++ memcpy(dst, ++ bh[blockOfCluster + n]->b_data, ++ inode->i_sb->s_blocksize); ++ } ++ else ++ { ++ memset(dst, 0, inode->i_sb->s_blocksize); ++ } ++ } ++ blockOfCluster = 0; ++ } // end uncompressed Cluster ++ ++ blocksToDo -= blocksThisClu; ++ ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(page, pg, epg); ++#endif ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (epg[i] != NULL) { ++ ++ ClearPageDirty(epg[i]); ++ ClearPageUptodate(epg[i]); ++ try_to_free_buffers(epg[i]); ++ unlock_page(epg[i]); ++ assert(page_count(epg[i]) <= 1); ++ page_cache_release(epg[i]); ++ } ++ } ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (pg[i] == NULL) ++ break; ++ if (pg[i] == page) ++ continue; ++ unlock_page(pg[i]); ++ page_cache_release(pg[i]); ++ } ++ //mw ++ assert (isCmp[clu_ix] == ext2_cluster_is_compressed_fn (inode, cluster0 + clu_ix)); ++ } // end for-loop: Cluster ++ } ++ ++ SetPageUptodate(page); ++ unlock_page(page); ++ atomic_dec(&page->_count); ++ mutex_unlock(&inode->i_mutex); ++ return 0; ++ ++ out: ++ ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(page, pg, epg); ++#endif ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (epg[i] != NULL) { ++ ++ ClearPageDirty(epg[i]); ++ ClearPageUptodate(epg[i]); ++ try_to_free_buffers(epg[i]); ++ unlock_page(epg[i]); ++ assert(page_count(epg[i]) <= 1); ++ page_cache_release(epg[i]); ++ } ++ } ++ ++ for (i = 0; i < EXT2_MAX_CLUSTER_PAGES; i++) { ++ if (pg[i] == NULL) ++ break; ++ if (pg[i] == page) ++ continue; ++ unlock_page(pg[i]); ++ page_cache_release(pg[i]); ++ } ++ mutex_unlock(&inode->i_mutex); ++ return 0; ++ ++ io_error: ++#ifdef CONFIG_HIGHMEM ++ if (kmapped) ++ ext2_kunmap_cluster_pages(page, pg, epg); ++#endif ++ SetPageError(page); ++ unlock_page(page); ++ atomic_dec(&page->_count); ++ mutex_unlock(&inode->i_mutex); ++ printk("Readpage: IOERROR\n"); ++ return -EIO; /* it is tested in do_generic_file_read(), ... */ ++} ++#endif /* CONFIG_EXT2_COMPRESS */ ++ + static int ext2_writepage(struct page *page, struct writeback_control *wbc) + { ++/* mw (24/06/2008): ++ * WRITEPAGE: this code was also in e2compr 2.4 and once removed by yaboo ding. ++ * ext2_writepage() is also called for dirty pages. Usually we write using file_write() which ++ * wraps correctly to compressed files. BUT: a writeable memory map might ++ * produce dirty pages, which will be written back normally. this should/might fail. ++ * The following code should fix this bug, but this was not tested yet. ++ */ ++#ifdef CONFIG_EXT2_COMPRESS ++#undef USE_WRITEPAGE ++//#define USE_WRITEPAGE ++#ifdef USE_WRITEPAGE ++ ++ struct ext2_inode_info *ei = EXT2_I(page->mapping->host); ++ int retval; ++ ++ struct inode *inode = page->mapping->host; ++ u32 cluster0, max_cluster; ++ int blocksToDo; ++ ++ unlock_page(page); ++ //mw: do we need this ??? 
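Stepping back to the decompression path of ext2_readpage() above: the cluster head's holemap is a small bitmap in which a set bit marks a block that is a hole and therefore has no stored data. nbytes_wanted counts only the non-hole blocks, and the copy-out loop zero-fills hole blocks instead of consuming decompressed bytes for them. A stand-alone model of that counting — the structure and helper names are illustrative, not the kernel's:

    #include <stdio.h>

    /* Count the blocks actually stored in a compressed cluster:
     * a set holemap bit means "this block is a hole, nothing stored". */
    static unsigned stored_blocks(const unsigned char *holemap,
                                  unsigned holemap_nbytes, unsigned nblk)
    {
            unsigned i, n = nblk;

            for (i = 0; i < nblk; i++)
                    if ((i >> 3) < holemap_nbytes &&
                        (holemap[i >> 3] & (1u << (i & 7))))
                            n--;
            return n;
    }

    int main(void)
    {
            unsigned char holemap[1] = { 0x05 };  /* blocks 0 and 2 are holes */
            unsigned nblk = 8, blocksize = 1024;
            unsigned n = stored_blocks(holemap, sizeof holemap, nblk);

            /* 6 blocks, 6144 bytes wanted; the two holes are zero-filled later. */
            printf("stored blocks: %u, bytes wanted: %u\n", n, n * blocksize);
            return 0;
    }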
++ //if (!(ei->i_compr_flags & EXT2_OSYNC_INODE)) { ++ /* trace_e2c("ext2_writepage: inode"); */ ++ mutex_lock(&inode->i_mutex); ++ /* trace_e2c(" down\n"); */ ++ //} ++ if (!(ei->i_flags & EXT2_COMPRBLK_FL) ++ || (ei->i_flags & EXT2_NOCOMPR_FL) ) ++ { ++ //mw: do we need this ??? ++ //if (!(ei->i_compr_flags & EXT2_OSYNC_INODE)) { ++ /* trace_e2c("ext2_writepage: inode up 1\n"); */ ++ mutex_unlock(&inode->i_mutex); ++ //} ++ lock_page(page); ++ return block_write_full_page(page, ext2_get_block, wbc); ++ } ++ /* */ ++ { ++ register u32 blockOfFile ++ = (page->index << PAGE_CACHE_SHIFT) >> inode->i_sb->s_blocksize_bits; ++ ++ blocksToDo = PAGE_SIZE >> inode->i_sb->s_blocksize_bits; ++ cluster0 = ext2_block_to_cluster(inode, blockOfFile); ++ max_cluster = ext2_block_to_cluster(inode, blockOfFile + blocksToDo - 1); ++ } ++ ++ /* Check if any part of the requested area contains part of a ++ compressed cluster. If not, we can use default ext2_writepage(). ++ ++ (Note that we don't have to worry about a cluster becoming ++ compressed in the meantime, because we have the semaphore.) ++ ++ A page can cover up to 9 clusters. (The maximum can only ++ occur with 32KB pages, 4KB clusters, and a non-page-aligned ++ offset. Thanks go to Kurt Fitzner for reporting that ++ page offsets needn't be aligned; see generic_file_mmap().) */ ++ ++ { ++ int isCmp[(PAGE_SIZE >> 12) + 1]; ++ unsigned clu_ix; ++ ++ assert (max_cluster - cluster0 < sizeof(isCmp)/sizeof(*isCmp)); ++ for (clu_ix = 0; cluster0 + clu_ix <= max_cluster; clu_ix++) { ++ isCmp[clu_ix] = ext2_cluster_is_compressed_fn (inode, cluster0 + clu_ix); ++ if (isCmp[clu_ix] < 0) { ++ //mw: do we need this ???if (!(ei->i_compr_flags & EXT2_OSYNC_INODE)) { ++ /* trace_e2c("ext2_writepage: inode up 2\n"); */ ++ lock_page(page); ++ mutex_unlock(&inode->i_mutex); ++ //} ++ return -EIO; ++ } ++ } ++ ++ for (clu_ix = 0; cluster0 + clu_ix <= max_cluster; clu_ix++) ++ if (isCmp[clu_ix] > 0) ++ ext2_decompress_cluster(inode, cluster0 + clu_ix); ++ ++ //mw: do we need this ??? ++ //if (!(ei->i_compr_flags & EXT2_OSYNC_INODE)) { ++ /* trace_e2c("ext2_writepage: inode up 3\n"); */ ++ mutex_unlock(&inode->i_mutex); ++ //} ++ lock_page(page); ++ ++ /* fall through */ ++ } ++#endif /* CONFIG_EXT2_COMPRESS */ ++#endif + return block_write_full_page(page, ext2_get_block, wbc); + } + ++#ifndef CONFIG_EXT2_COMPRESS + static int ext2_readpage(struct file *file, struct page *page) + { + return mpage_readpage(page, ext2_get_block); + } ++#endif + + static int + ext2_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) + { ++#ifdef CONFIG_EXT2_COMPRESS ++/* ++ * For now, just read each page into cache and don't worry about emitting BIOs. ++ * (whitpa 02 Aug 2004). 
++ */ ++ ++ unsigned page_idx; ++ struct pagevec lru_pvec; ++ int iError; ++ ++ pagevec_init(&lru_pvec, 0); ++ ++ for (page_idx = 0; page_idx < nr_pages; page_idx++) { ++ struct page *page = list_entry(pages->prev, struct page, lru); ++ ++ prefetchw(&page->flags); ++ list_del(&page->lru); ++ ++ iError = add_to_page_cache(page, mapping, page->index, GFP_KERNEL); ++ if (!iError) { ++ if (!PageUptodate(page)) ++ { ++ (void) ext2_readpage(file, page); ++ } ++ else ++ { ++ unlock_page(page); ++ } ++ if (!pagevec_add(&lru_pvec, page)) ++ __pagevec_lru_add_file(&lru_pvec); ++ } else { ++ page_cache_release(page); ++ } ++ ++ } ++ pagevec_lru_add_file(&lru_pvec); ++ BUG_ON(!list_empty(pages)); ++ return 0; ++#else + return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); ++#endif + } + + static int +@@ -824,11 +1469,58 @@ static int ext2_nobh_writepage(struct pa + return nobh_writepage(page, ext2_get_block, wbc); + } + ++#ifdef CONFIG_EXT2_COMPRESS ++static sector_t ext2_do_bmap(struct address_space *mapping, sector_t block) ++#else + static sector_t ext2_bmap(struct address_space *mapping, sector_t block) ++#endif + { + return generic_block_bmap(mapping,block,ext2_get_block); + } + ++#ifdef CONFIG_EXT2_COMPRESS ++/* Return 0 instead of EXT2_COMPRESSED_BLKADDR if EXT2_NOCOMPR_FL ++ * high. This is necessary for us to be able to use ++ * generic_readpage() when EXT2_NOCOMPR_FL is high. ++ */ ++static sector_t ext2_bmap(struct address_space *mapping, sector_t block) ++{ ++ sector_t result; ++ struct inode *inode = mapping->host; ++ ++ if ((EXT2_I(inode)->i_flags & (EXT2_COMPRBLK_FL | EXT2_NOCOMPR_FL)) ++ == (EXT2_COMPRBLK_FL | 0)) { ++ int err; ++ ++ err = ext2_cluster_is_compressed_fn ++ (inode, ext2_block_to_cluster(inode, block)); ++ if (err > 0) ++ ext2_msg (inode->i_sb, "ext2_bmap", ++ "compressed cluster, inode %lu", ++ inode->i_ino); ++ if (err != 0) ++ return 0; ++ } ++ ++ result = ext2_do_bmap(mapping, block); ++ if (result != EXT2_COMPRESSED_BLKADDR) ++ return result; ++ ++ if (!(EXT2_SB(inode->i_sb)->s_es->s_feature_incompat ++ & cpu_to_le32(EXT2_FEATURE_INCOMPAT_COMPRESSION))) ++ ext2_error(inode->i_sb, "ext2_bmap", ++ "compressed_blkaddr (ino %lu, blk %lu) " ++ "on non-compressed fs", ++ inode->i_ino, (unsigned long) block); ++ if (!S_ISREG(inode->i_mode)) ++ ext2_error(inode->i_sb, "ext2_bmap", ++ "compressed_blkaddr for non-regular file " ++ "(ino %lu, blk %lu)", ++ inode->i_ino, (unsigned long) block); ++ return 0; ++} ++#endif /* CONFIG_EXT2_COMPRESS */ ++ + static ssize_t + ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +@@ -848,6 +1540,18 @@ ext2_direct_IO(int rw, struct kiocb *ioc + static int + ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) + { ++#ifdef CONFIG_EXT2_COMPRESS ++#ifdef USE_WRITEPAGE ++ struct ext2_inode_info *ei = EXT2_I(mapping->host); ++ if ( (ei->i_flags & EXT2_COMPRBLK_FL) ++ && !(ei->i_flags & EXT2_NOCOMPR_FL)) ++ { ++ //NULL will invoke ext2_writepage for writeback, hopefully. 
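One user-visible effect of the ext2_bmap() wrapper above: block-map consumers (FIBMAP callers such as lilo) are handed block 0 for anything inside a compressed cluster, rather than the on-disk EXT2_COMPRESSED_BLKADDR sentinel. A hedged userspace illustration, assuming an e2compr-mounted ext2 filesystem and the privileges FIBMAP requires:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>

    int main(int argc, char **argv)
    {
            int fd, blk = 0;   /* query logical block 0 of the file */

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);
            if (fd < 0) { perror("open"); return 1; }
            /* FIBMAP normally needs CAP_SYS_RAWIO (run as root). */
            if (ioctl(fd, FIBMAP, &blk) < 0) { perror("FIBMAP"); close(fd); return 1; }
            /* For a block inside a compressed cluster this prints 0. */
            printf("physical block: %d\n", blk);
            close(fd);
            return 0;
    }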
++ return mpage_writepages(mapping, wbc, NULL); ++ } ++ else ++#endif ++#endif + return mpage_writepages(mapping, wbc, ext2_get_block); + } + +@@ -996,6 +1700,12 @@ static inline void ext2_free_data(struct + + for ( ; p < q ; p++) { + nr = le32_to_cpu(*p); ++#ifdef CONFIG_EXT2_COMPRESS ++ if (nr == EXT2_COMPRESSED_BLKADDR) { ++ *p = 0; ++ continue; ++ } ++#endif + if (nr) { + *p = 0; + /* accumulate blocks to free if they're contiguous */ +@@ -1040,6 +1750,12 @@ static void ext2_free_branches(struct in + nr = le32_to_cpu(*p); + if (!nr) + continue; ++#ifdef CONFIG_EXT2_COMPRESS ++ if (nr == EXT2_COMPRESSED_BLKADDR) { ++ *p = 0; ++ continue; ++ } ++#endif + *p = 0; + bh = sb_bread(inode->i_sb, nr); + /* +@@ -1064,6 +1780,96 @@ static void ext2_free_branches(struct in + ext2_free_data(inode, p, q); + } + ++/* pjm 1998-01-14: As far as I can tell, "I don't do any locking" is ++ no longer correct, as i_sem is downed for all write() and ++ truncate() stuff except where it doesn't matter (e.g. new inode). */ ++ ++#ifdef CONFIG_EXT2_COMPRESS ++/* If the EXT2_ECOMPR_FL bit is high, then things can go rather badly. ++ This can only happen if access permission was obtained before the ++ flag was raised. Also, it shouldn't be too much of a problem ++ unless the end point of truncation is a compressed cluster with a ++ compression error. */ ++ ++ /* From what I (Antoine) understand, the complexity of the truncate ++ code is due to the fact that we don't want to free blocks that ++ are still referenced. It does not ensure that concurrent read ++ operation will terminate properly, i.e., the semantic of reading ++ while somebody truncates is undefined (you can either get the old ++ data if you got the blocks before, or get plenty of zeros ++ otherwise). */ ++ ++/* todo: Provide error trapping in readiness for when i_op->truncate ++ allows a return code. */ ++static void fix_compression (struct inode * inode) ++{ ++ struct ext2_inode_info *ei = EXT2_I(inode); ++ /*if (atomic_read(&inode->i_mutex.count) > 0) ++ { ++ printk("Assert Mutex failed for file: %s \n", inode_name(inode, 0)); ++ dump_stack(); ++ }*/ ++ ++ assert (ei->i_flags & EXT2_COMPRBLK_FL); /* one or more compressed clusters */ ++ assert ((atomic_read(&inode->i_mutex.count) < 1) ++ || ((inode->i_nlink == 0) ++ && (atomic_read(&inode->i_count) == 0))); ++ /* pjm 1998-01-14: I think the below comment can safely be removed, as ++ it's impossible for someone to be compressing during truncate(), because ++ i_sem is down. */ ++ /* Dans le cas ou les clusters peuvent etre compresses, cela pose ++ un probleme : il faudrait stopper aussi si le cluster est ++ comprime et ne contient pas plus de donnees que i_size ne ++ permet. Sinon, on peut passer son temps a decompresser un ++ cluster que quelqu'un d'autre compresse en meme ++ temps... (TODO). Cela ne peut arriver que si on reverifie apres ++ coup si le cluster est non compresse (ce qu'on fait a l'heure ++ actuelle) => faire autrement. ++ ++ pjm fixme tr ++ ++ If the clusters can be compressed, we'd have a problem: we'd ++ also need to stop if the cluster is compressed and doesn't ++ contain more data than i_size permits. Otherwise we can spend ++ time decompressing a cluster that someone else is compressing ++ at the same time. (TODO.) This can only happen if we reverify ++ "apres coup" ("after the event"? "after each time"?) "si" ("if" ++ or "that") the cluster is not compressed (as we are currently ++ doing) => do differently. */ ++ ++ /* todo: Handle errors from ext2_cluster_is_compressed(). 
++ (Except ext2_truncate() currently silently ignores errors ++ anyway.) */ ++ ++ if (!ext2_offset_is_clu_boundary(inode, inode->i_size) ++ && (! ( ei->i_flags & EXT2_NOCOMPR_FL)) ++ && (ext2_cluster_is_compressed_fn ++ (inode, ext2_offset_to_cluster (inode, inode->i_size)) ++ > 0)) { ++ trace_e2c("fix_compression: inode:%lu decompress_cluster!\n", inode->i_ino); ++ ext2_decompress_cluster(inode, ext2_offset_to_cluster(inode, inode->i_size)); ++ /* todo: Check the return code of ++ ext2_decompress_cluster(). (Then again, I don't ++ know how to report an error anyway. ++ ext2_truncate() silently ignores errors.) */ ++ ++ /* Organise for the cluster to be recompressed later. */ ++ assert (ei->i_flags & EXT2_COMPR_FL); ++ ++ ei->i_flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ mark_inode_dirty(inode); ++ } else ++ /* If there are no more compressed clusters, then ++ remove the EXT2_COMPRBLK_FL. Not essential from a ++ safety point of view, but friendlier. We only do ++ this in the `else' because the cleanup function ++ will handle it in the `if' case. */ ++ ext2_update_comprblk(inode); ++} ++#endif ++ ++ + static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) + { + __le32 *i_data = EXT2_I(inode)->i_data; +@@ -1076,6 +1882,27 @@ static void __ext2_truncate_blocks(struc + int n; + long iblock; + unsigned blocksize; ++ ++#ifdef CONFIG_EXT2_COMPRESS ++ /* If the new size is in the middle of a compressed cluster, ++ then we decompress it, and set things up to be recompressed ++ later. ++ ++ todo: It isn't very nice to get ENOSPC on truncate. We ++ can't completely remove the possibility (unless the ++ compression algorithms obey the rule `shorter input never ++ gives longer output') but we could greatly reduce the ++ possibility, e.g. by moving the fix_compression() function ++ to compress.c, and have it decompress and immediately ++ recompress the cluster, without allocating blocks for the ++ full decompressed data. */ ++ if (EXT2_I(inode)->i_flags & EXT2_COMPRBLK_FL) { ++ trace_e2c("ext2_truncate: ino=%ld sz=%d\n", inode->i_ino, (int)inode->i_size); ++ fix_compression(inode); ++ truncate_inode_pages(inode->i_mapping, inode->i_size); ++ } ++#endif ++ + blocksize = inode->i_sb->s_blocksize; + iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); + +@@ -1146,8 +1973,11 @@ do_indirects: + + mutex_unlock(&ei->truncate_mutex); + } +- ++#ifdef CONFIG_EXT2_COMPRESS ++void ext2_truncate_blocks(struct inode *inode, loff_t offset) ++#else + static void ext2_truncate_blocks(struct inode *inode, loff_t offset) ++#endif + { + /* + * XXX: it seems like a bug here that we don't allow +@@ -1335,7 +2165,73 @@ struct inode *ext2_iget (struct super_bl + goto bad_inode; + } + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); ++#ifdef CONFIG_EXT2_COMPRESS ++ ei->i_flags = 0x807fffff & le32_to_cpu(raw_inode->i_flags); ++ ei->i_compr_flags = 0; ++ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) { ++ ++ if (S_ISDIR(inode->i_mode)) ++ { ++ //mw: ++ //mutex_lock(&inode->i_mutex); ++ if (S_ISDIR(inode->i_mode)) ++ { ++ ei->i_flags &= ~(EXT2_COMPRBLK_FL | EXT2_DIRTY_FL); //modify!!! ++ } ++ //mutex_unlock(&inode->i_mutex); ++ } ++ ++ /* The above shouldn't be necessary unless someone's ++ * been playing with EXT2_IOC_SETFLAGS on a non-e2compr ++ * kernel, or the inode has been scribbled on. 
++ */ ++ if (ei->i_flags & (EXT2_COMPR_FL | EXT2_COMPRBLK_FL)) { ++ ei->i_compr_method ++ = (le32_to_cpu(raw_inode->i_flags) >> 26) & 0x1f; ++ ei->i_log2_clu_nblocks ++ = (le32_to_cpu(raw_inode->i_flags) >> 23) & 0x7; ++ if ((ei->i_log2_clu_nblocks < 2) ++ || (ei->i_log2_clu_nblocks > 5)) { ++ if ((ei->i_log2_clu_nblocks == 0) ++ && !(ei->i_flags & EXT2_COMPRBLK_FL)) { ++ /* The EXT2_COMPR_FL flag was ++ * raised under a kernel ++ * without e2compr support. ++ */ ++ if (S_ISREG(inode->i_mode)) ++ ei->i_flags |= EXT2_DIRTY_FL; ++ /* Todo: once we're sure the kernel can ++ * handle [log2_]clu_nblocks==0, get rid ++ * of the next statement. ++ */ ++ ei->i_log2_clu_nblocks ++ = EXT2_DEFAULT_LOG2_CLU_NBLOCKS; ++ } else { ++ ei->i_flags |= EXT2_ECOMPR_FL; ++ ext2_error(inode->i_sb, ++ "ext2_read_inode", ++ "inode %lu is corrupted: " ++ "log2_clu_nblocks=%u", ++ inode->i_ino, ++ ei->i_log2_clu_nblocks); ++ } ++ } ++ } else { ++ ei->i_compr_method = EXT2_DEFAULT_COMPR_METHOD; ++ ei->i_log2_clu_nblocks ++ = EXT2_DEFAULT_LOG2_CLU_NBLOCKS; ++ } ++ if (ei->i_log2_clu_nblocks > ++ (EXT2_LOG2_MAX_CLUSTER_BYTES - inode->i_sb->s_blocksize_bits)) ++ ei->i_log2_clu_nblocks = (EXT2_LOG2_MAX_CLUSTER_BYTES ++ - inode->i_sb->s_blocksize_bits); ++ ei->i_clu_nblocks = 1 << ei->i_log2_clu_nblocks; ++ if (ei->i_flags & EXT2_DIRTY_FL) ++ ei->i_compr_flags = EXT2_CLEANUP_FL; ++ } ++#else /* !CONFIG_EXT2_COMPRESS */ + ei->i_flags = le32_to_cpu(raw_inode->i_flags); ++#endif + ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); + ei->i_frag_no = raw_inode->i_frag; + ei->i_frag_size = raw_inode->i_fsize; +@@ -1458,7 +2354,35 @@ static int __ext2_write_inode(struct ino + + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); ++#ifdef CONFIG_EXT2_COMPRESS ++ if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) ++ && (ei->i_flags & (EXT2_COMPR_FL | EXT2_COMPRBLK_FL))) { ++ if ((ei->i_log2_clu_nblocks < 2) ++ || (ei->i_log2_clu_nblocks > 5)) { ++ ei->i_flags |= EXT2_ECOMPR_FL; ++ ext2_error (inode->i_sb, "ext2_write_inode", ++ "inode %lu is corrupted: log2_clu_nblocks=%u", ++ inode->i_ino, ei->i_log2_clu_nblocks); ++ } ++ assert (ei->i_clu_nblocks == (1 << ei->i_log2_clu_nblocks)); ++ assert (ei->i_compr_method < 0x20); ++ raw_inode->i_flags = cpu_to_le32 ++ ((ei->i_flags & 0x807fffff) ++ | (ei->i_compr_method << 26) ++ | (ei->i_log2_clu_nblocks << 23)); ++ } else ++ { ++ //mw: i_mutex was introduced and disabled again: deadlock with lilo ++ // mutex_lock(&inode->i_mutex); //mw ++ raw_inode->i_flags = cpu_to_le32 //modify !!! ++ (ei->i_flags ++ & 0x807fffff /* no compr meth/size */ ++ & ~(EXT2_COMPR_FL | EXT2_COMPRBLK_FL | EXT2_IMMUTABLE_FL | EXT2_ECOMPR_FL | EXT2_NOCOMPR_FL)); ++ // mutex_unlock(&inode->i_mutex); //mw ++ } ++#else + raw_inode->i_flags = cpu_to_le32(ei->i_flags); ++#endif + raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); + raw_inode->i_frag = ei->i_frag_no; + raw_inode->i_fsize = ei->i_frag_size; +--- linux-3.4-rc5/fs/ext2/file.c 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/fs/ext2/file.c 2012-04-30 04:11:03.803143097 -0400 +@@ -18,10 +18,25 @@ + * (jj@sunsite.ms.mff.cuni.cz) + */ + ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#else + #include + #include + #include + #include "ext2.h" ++#endif ++ ++ + #include "xattr.h" + #include "acl.h" + +@@ -30,8 +45,39 @@ + * for a single struct file are closed. 
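The ext2_iget()/ext2_write_inode() hunks above pack the e2compr parameters into the top of the on-disk i_flags word: bits 23-25 hold log2 of the cluster size in blocks, bits 26-30 hold the compression method, and the mask 0x807fffff strips that packed region when only the ordinary flags are wanted. A small round-trip sketch of the packing (userspace, hypothetical names):

    #include <stdio.h>
    #include <stdint.h>
    #include <assert.h>

    #define E2C_FLAGS_MASK  0x807fffffu   /* flag bits kept as-is */

    static uint32_t pack(uint32_t flags, unsigned method, unsigned log2_clu)
    {
            return (flags & E2C_FLAGS_MASK) | (method << 26) | (log2_clu << 23);
    }

    int main(void)
    {
            uint32_t raw = pack(0x00000084u, 4 /* method id */, 2 /* 4-block clusters */);
            unsigned method   = (raw >> 26) & 0x1f;   /* same decode as ext2_iget() */
            unsigned log2_clu = (raw >> 23) & 0x7;

            assert(method == 4 && log2_clu == 2);
            printf("raw=%#x method=%u cluster=%u blocks\n", raw, method, 1u << log2_clu);
            return 0;
    }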
Note that different open() calls + * for the same file yield different struct file structures. + */ ++ ++/* ++ * pjm 1998-01-09: I would note that this is different from `when no ++ * process has the inode open'. ++ */ + static int ext2_release_file (struct inode * inode, struct file * filp) + { ++#ifdef CONFIG_EXT2_COMPRESS ++ /* ++ * Now's as good a time as any to clean up wrt compression. ++ * Previously (before 2.1.4x) we waited until ++ * ext2_put_inode(), but now the dcache sometimes delays that ++ * call until umount time. ++ */ ++ //printk(KERN_DEBUG "ext2_release_file: pid=%d, i_ino=%lu, i_count=%d\n", current->pid, inode->i_ino, atomic_read(&inode->i_count)); ++ ++ if (S_ISREG (inode->i_mode) ++ && inode->i_nlink ++ && (EXT2_I(inode)->i_compr_flags & EXT2_CLEANUP_FL)) { ++#ifdef EXT2_COMPR_REPORT_PUT ++ printk(KERN_DEBUG "ext2_release_file: pid=%d, i_ino=%lu, i_count=%d\n", current->pid, inode->i_ino, atomic_read(&inode->i_count)); ++#endif ++ /* ++ * todo: See how the return code of ++ * ext2_release_file() is used, and decide whether it ++ * might be appropriate to pass any errors to ++ * caller. ++ */ ++ //dump_stack(); ++ (void) ext2_cleanup_compressed_inode (inode); ++ } ++ ++#endif + if (filp->f_mode & FMODE_WRITE) { + mutex_lock(&EXT2_I(inode)->truncate_mutex); + ext2_discard_reservation(inode); +@@ -56,6 +102,456 @@ int ext2_fsync(struct file *file, loff_t + return ret; + } + ++#ifdef CONFIG_EXT2_COMPRESS ++struct page_cluster { ++ struct page * page; ++ loff_t pos; ++ unsigned bytes; ++ unsigned long offset; ++ unsigned char in_range; ++ const char * buf; ++}; ++ ++#define PAGE_IN_RANGE 1 ++#define PAGE_KMAPPED 2 ++ ++ ++/** ++ * generic_osync_inode - flush all dirty data for a given inode to disk ++ * @inode: inode to write ++ * @mapping: the address_space that should be flushed ++ * @what: what to write and wait upon ++ * ++ * This can be called by file_write functions for files which have the ++ * O_SYNC flag set, to flush dirty writes to disk. ++ * ++ * @what is a bitmask, specifying which part of the inode's data should be ++ * written and waited upon. ++ * ++ * OSYNC_DATA: i_mapping's dirty data ++ * OSYNC_METADATA: the buffers at i_mapping->private_list ++ * OSYNC_INODE: the inode itself ++ */ ++ ++/* mw: see generic_osync_inode() in kernel<2.6.30 for orginal method. ++ basically we want all of it: OSYNC_DATA and OSYNC_METADATA and OSYNC_INODE */ ++int ex_generic_osync_inode(struct inode *inode, struct address_space *mapping) //, int what) ++{ ++ int err = 0; ++ int need_write_inode_now = 0; ++ int err2; ++ ++ err = filemap_fdatawrite(mapping); ++ ++ err2 = sync_mapping_buffers(mapping); ++ if (!err) ++ err = err2; ++ ++ err2 = filemap_fdatawait(mapping); ++ if (!err) ++ err = err2; ++ ++ /* check if data is dirty */ ++ spin_lock(&inode->i_lock); ++ if (inode->i_state & I_DIRTY) ++ need_write_inode_now = 1; ++ spin_unlock(&inode->i_lock); ++ ++ if (need_write_inode_now) { ++ err2 = write_inode_now(inode, 1); ++ if (!err) ++ err = err2; ++ } ++ else ++ inode_sync_wait(inode); ++ ++ return err; ++} ++ ++ ++/* ++ * Write to a file through the page cache. ++ * ++ * We currently put everything into the page cache prior to writing it. ++ * This is not a problem when writing full pages. With partial pages, ++ * however, we first have to read the data into the cache, then ++ * dirty the page, and finally schedule it for writing. 
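ext2_file_write() below walks the write range cluster by cluster: each iteration decompresses the target cluster if needed, writes through the page cache with do_sync_write(), and may recompress once a cluster boundary is reached. A rough model of how the per-iteration chunk size (nextClusterFirstByte - pos, capped by the remaining count) partitions a request, assuming 4 KB pages and four pages per cluster; all names here are illustrative:

    #include <stdio.h>

    int main(void)
    {
            const unsigned long page_size = 4096, pages_per_cluster = 4;
            const unsigned long cluster_bytes = page_size * pages_per_cluster;
            unsigned long pos = 10000, count = 50000;

            while (count) {
                    unsigned long cluster = pos / cluster_bytes;
                    unsigned long next  = (cluster + 1) * cluster_bytes; /* first byte of next cluster */
                    unsigned long bytes = next - pos;

                    if (bytes > count)
                            bytes = count;
                    printf("cluster %lu: write %lu bytes at offset %lu\n", cluster, bytes, pos);
                    pos += bytes;
                    count -= bytes;
            }
            return 0;
    }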
Alternatively, we ++ * could write-through just the portion of data that would go into that ++ * page, but that would kill performance for applications that write data ++ * line by line, and it's prone to race conditions. ++ * ++ * Note that this routine doesn't try to keep track of dirty pages. Each ++ * file system has to do this all by itself, unfortunately. ++ * okir@monad.swb.de ++ */ ++ssize_t ++ext2_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) ++{ ++ struct address_space *mapping = file->f_dentry->d_inode->i_mapping; ++ struct inode *inode = mapping->host; ++ unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur, written, last_index; /* last page index */ ++ loff_t pos; ++ long status; ++ int err; ++ unsigned bytes; ++ u32 comprblk_mask=0; ++ struct ext2_inode_info *ei = EXT2_I(inode); ++ ++ if (!(ei->i_flags & (EXT2_COMPR_FL|EXT2_COMPRBLK_FL)) ++#undef DUD //mw: I think this is a buggy bug-fix ++#ifdef DUD ++ || (count < inode->i_sb->s_blocksize) ++#endif ++ ) ++ { ++ return do_sync_write(file, buf, count, ppos); ++ } ++ ++ if ((ssize_t) count < 0) ++ return -EINVAL; ++ ++ if (!access_ok(VERIFY_READ, buf, count)) ++ return -EFAULT; ++ ++#ifdef EXT2_COMPR_REPORT_MUTEX ++ printk(KERN_DEBUG "EXT2_FILE_WRITE_LOCK of PID %u @ inode:%lu\n", current->pid, inode->i_ino ); ++#endif ++ mutex_lock(&inode->i_mutex); ++ /* mw: down_read(&inode->i_alloc_sem); // as used by ocsf2 TLL 02/21/07 ++ was removed with kernel 3.1 */ ++ atomic_inc(&inode->i_dio_count); ++ ++ pos = *ppos; ++ err = -EINVAL; ++ if (pos < 0) ++ goto out; ++ ++ written = 0; ++ ++ /* FIXME: this is for backwards compatibility with 2.4 */ ++ if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) ++ { ++ pos = inode->i_size; ++ } ++ ++ /* ++ * Check whether we've reached the file size limit. ++ */ ++ err = -EFBIG; ++ ++ if (limit != RLIM_INFINITY) { ++ if (pos >= limit) { ++ send_sig(SIGXFSZ, current, 0); ++ goto out; ++ } ++ if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) { ++ /* send_sig(SIGXFSZ, current, 0); */ ++ count = limit - (u32)pos; ++ } ++ } ++ ++ /* ++ * LFS rule ++ */ ++ if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { ++ if (pos >= MAX_NON_LFS) { ++ send_sig(SIGXFSZ, current, 0); ++ goto out; ++ } ++ if (count > MAX_NON_LFS - (u32)pos) { ++ /* send_sig(SIGXFSZ, current, 0); */ ++ count = MAX_NON_LFS - (u32)pos; ++ } ++ } ++ ++ /* ++ * Are we about to exceed the fs block limit ? ++ * ++ * If we have written data it becomes a short write ++ * If we have exceeded without writing data we send ++ * a signal and give them an EFBIG. ++ * ++ * Linus frestrict idea will clean these up nicely.. 
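The limit handling above (RLIMIT_FSIZE, the MAX_NON_LFS cap without O_LARGEFILE, and sb->s_maxbytes) follows one pattern: refuse with SIGXFSZ/-EFBIG when the write starts at or past the limit, otherwise shorten the request to fit. A condensed userspace model of that clamping, for illustration only:

    #include <stdio.h>

    /* Bytes allowed for the write, or 0 if it must fail outright (EFBIG/SIGXFSZ). */
    static unsigned long clamp_write(unsigned long long pos, unsigned long count,
                                     unsigned long long limit)
    {
            if (pos >= limit)
                    return 0;                 /* caller signals/returns the error */
            if (count > limit - pos)
                    count = limit - pos;      /* short write instead of failure */
            return count;
    }

    int main(void)
    {
            printf("%lu\n", clamp_write(100, 50, 120));  /* 20: shortened */
            printf("%lu\n", clamp_write(200, 50, 120));  /* 0: rejected  */
            return 0;
    }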
++ */ ++ if (!S_ISBLK(inode->i_mode)) { ++ if (pos >= inode->i_sb->s_maxbytes) { ++ if (count || pos > inode->i_sb->s_maxbytes) { ++ send_sig(SIGXFSZ, current, 0); ++ err = -EFBIG; ++ goto out; ++ } ++ /* zero-length writes at ->s_maxbytes are OK */ ++ } ++ ++ if (pos + count > inode->i_sb->s_maxbytes) ++ count = inode->i_sb->s_maxbytes - pos; ++ } else { ++ if (bdev_read_only(inode->i_sb->s_bdev)) { ++ err = -EPERM; ++ goto out; ++ } ++ if (pos >= inode->i_size) { ++ if (count || pos > inode->i_size) { ++ err = -ENOSPC; ++ goto out; ++ } ++ } ++ ++ if (pos + count > inode->i_size) ++ { ++ count = inode->i_size - pos; ++ } ++ } ++ ++ err = 0; ++ if (count == 0) ++ goto out; ++ ++ status = 0; ++ ++ if (file->f_flags & O_DIRECT) ++ { ++ err = -EINVAL; ++ goto out; ++ } ++ /* ++ * We must still check for EXT2_ECOMPR_FL, as it may have been ++ * set after we got the write permission to this file. ++ */ ++ if ((ei->i_flags & (EXT2_ECOMPR_FL | EXT2_NOCOMPR_FL)) == (EXT2_ECOMPR_FL | 0)) ++ { ++ err = -EXT2_ECOMPR; ++ goto out; ++ } ++ ++ should_remove_suid(file->f_dentry); ++ inode->i_ctime = inode->i_mtime = CURRENT_TIME; ++ mark_inode_dirty_sync(inode); ++ ++ if ((pos+count) > inode->i_size) ++ last_index = (pos+count-1) >> PAGE_CACHE_SHIFT; ++ else ++ last_index = (inode->i_size-1) >> PAGE_CACHE_SHIFT; ++ ++ comprblk_mask = ei->i_flags | ~EXT2_COMPRBLK_FL; ++ ++ //mw: now do it cluster-wise ++ do { ++ //unsigned long index, offset, clusters_page_index0, ++ unsigned long index, nextClusterFirstByte, cluster_compressed=0; ++ u32 cluster=0; ++ status = -ENOMEM; /* we'll assign it later anyway */ ++ ++#ifdef EXT2_COMPRESS_WHEN_CLU ++ ei->i_flags |= EXT2_COMPRBLK_FL; ++ assert( (file->f_flags & O_DIRECT) == 0); ++ assert(mapping_mapped(inode->i_mapping) == 0); ++#endif ++ ++ index = pos >> PAGE_CACHE_SHIFT; /*mw: pageindex (start)*/ ++ cluster = ext2_page_to_cluster(inode, index); ++ ++ /* ++ * We decompress the cluster if needed, and write ++ * the data as normal. The cluster will be ++ * compressed again when the inode is cleaned up. ++ */ ++ if ((comprblk_mask == ~(u32)0) ++ && !(ei->i_flags & EXT2_NOCOMPR_FL)) { ++ /* AUFFÄLLIG 2*/ ++ /* assert (block == pos >> inode->i_sb->s_blocksize_bits); */ ++ ++ cluster_compressed = ext2_cluster_is_compressed_fn(inode, cluster); ++ if (cluster_compressed < 0) { ++ if (! written) ++ written = cluster_compressed; ++ break; ++ } ++ } ++ ++ if (cluster_compressed > 0) { ++ /* Here, decompression take place */ ++ cluster_compressed = ext2_decompress_cluster(inode, cluster); ++ if (cluster_compressed < 0) { ++ if (! written) { ++ written = cluster_compressed; ++ } ++ break; ++ } ++ } ++ ++ nextClusterFirstByte = (ext2_cluster_page0(inode, cluster+1) * PAGE_CACHE_SIZE); ++ bytes = nextClusterFirstByte - pos; /*mw: bytes todo in this cluster*/ ++ if (bytes > count) { ++ bytes = count; /*mw: if end of data*/ ++ } ++ ++#ifdef EXT2_COMPR_DEBUG ++ //assert we stay inside the cluster! 
++ { ++ int endpos; ++ int endindex; ++ int endcluster; ++ unsigned long thisClusterFirstByte; ++ int relstart, relend, startblock, endblock; ++ ++ thisClusterFirstByte = (ext2_cluster_page0(inode, cluster) * PAGE_CACHE_SIZE); ++ ++ relstart = pos - thisClusterFirstByte; ++ relend = bytes + relstart; ++ ++ startblock = relstart >> 10; ++ endblock = relend >> 10; ++ ++ ++ endpos = pos + bytes; ++ //printk("do_sync_write cluster %d: inode:%lu, \t start:%i(%i), end:%i(%i), \t ccount:%d \t tcount:%d\n", cluster , inode->i_ino, relstart, startblock, relend , endblock, (int)bytes, count); ++ endindex = (endpos-1) >> PAGE_CACHE_SHIFT; /*mw: pageindex (start)*/ ++ endcluster = ext2_page_to_cluster(inode, endindex); ++ assert(cluster == endcluster); ++ } ++#endif ++ ++ //mw: must unlock here, do_sync_write() will aquire the mutex again ++ mutex_unlock(&inode->i_mutex); ++ ++ //mw: this is pretty clever: we use the generic method now :-) ++ //printk("do_sync_write cluster %d, mapped:%i\n", cluster, mapping_mapped(inode->i_mapping)); ++ //status = do_sync_write_nolock(file, buf, bytes, &pos); //without locking mutex ++ status = do_sync_write(file, buf, bytes, &pos); //with locking mutex ++ assert(status>=0); ++ ++ mutex_lock(&inode->i_mutex); ++ ++ written += status; ++ count -= status; ++ buf += status; ++ ++#ifdef EXT2_COMPRESS_WHEN_CLU ++ assert (ei->i_flags & EXT2_COMPRBLK_FL); ++ if ((ei->i_flags & EXT2_COMPR_FL) ++ && (ext2_offset_is_clu_boundary(inode, pos)) ) { ++ ++ if (mapping_mapped(inode->i_mapping) == 0 ) ++ /* ++ * Pierre Peiffer: For file mapped (via mmap, I mean), ++ * compression will occure when releasing the file. ++ * We must, in this case, avoid the pages (possibly ++ * mapped by a process) to be compressed under them. ++ */ ++ { ++ int error; ++ assert(mapping_mapped(inode->i_mapping) == 0); ++ error = ext2_compress_cluster(inode, cluster); ++ /*if (ext2_cluster_is_compressed_fn(inode, cluster)) ++ ext2_decompress_cluster(inode, cluster);*/ ++ assert(mapping_mapped(inode->i_mapping) == 0); ++ /* ++ * Actually, raising write_error may be a ++ * mistake. For example, ++ * ext2_cleanup_compressed_cluster() doesn't ++ * usually return any errors to user. todo: ++ * Have a look at ext2_compress_cluster, and ++ * check whether its errors are such that they ++ * should be returned to user. Some of the ++ * will be, of course, but it might be ++ * possible for it to return without ++ * change. ++ */ ++ if (error > 0) ++ comprblk_mask = ~(u32)0; ++ } else { ++#ifdef EXT2_COMPR_REPORT ++ char bdn[BDEVNAME_SIZE]; ++ bdevname(inode->i_sb->s_bdev, bdn); ++#endif ++ ++ trace_e2c("ext2_file_write: (dev. %s): " ++ "ino=%ld, cluster=%d: file mapped, does " ++ "not compress cluster\n", ++ bdn, inode->i_ino, cluster); ++ ei->i_flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ } ++ } ++#endif ++ ++ } while (count); ++ *ppos = pos; ++ ++ /* ++ * For now, when the user asks for O_SYNC, we'll actually ++ * provide O_DSYNC. ++ */ ++ if (status >= 0) { ++ if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { ++ /*if (ei->i_compr_flags & EXT2_OSYNC_INODE) { ++ osync_already = 1; ++ } else { ++ osync_already = 0; ++ ei->i_compr_flags |= EXT2_OSYNC_INODE; ++ }*/ ++ /* Should 2nd arg be inode->i_mapping? */ ++ status = ex_generic_osync_inode(inode, file->f_mapping ++ /*, OSYNC_METADATA|OSYNC_DATA*/); ++ /*if (osync_already == 0) { ++ ei->i_compr_flags &= ~EXT2_OSYNC_INODE; ++ }*/ ++ } ++ } ++ ++ err = written ? 
written : status; ++ ++# ifdef EXT2_COMPRESS_WHEN_CLU ++ //mw: ext2_compress_cluster() might remove EXT2_COMPRBLK_FL ++ //if the file does not compress at all. this is NO error: remove next line? ++ //assert (ei->i_flags & EXT2_COMPRBLK_FL); ++ ++ ei->i_flags &= comprblk_mask; ++ if ( (ei->i_flags & EXT2_COMPR_FL) ++ && (!ext2_offset_is_clu_boundary(inode, pos)) ) ++ { ++ ei->i_flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ } ++ ++# else ++ if (ei->i_flags & EXT2_COMPR_FL) { ++ ei->i_flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ } ++# endif ++out: ++ ++#ifdef EXT2_COMPR_REPORT_MUTEX ++ printk(KERN_DEBUG "EXT2_FILE_WRITE_UNLOCK of PID %u @ inode:%lu\n", current->pid, inode->i_ino); ++#endif ++ /* mw: up_read(&inode->i_alloc_sem); // as used by ocsf2 TLL 02/21/07 ++ was removed with kernel 3.1 */ ++ inode_dio_done(inode); ++ mutex_unlock(&inode->i_mutex); ++ return err; ++} ++ ++/* ++ * Called when an inode is about to be open. ++ * We use this to disallow opening RW large files on 32bit systems if ++ * the caller didn't specify O_LARGEFILE. On 64bit systems we force ++ * on this flag in sys_open. ++ * Prevent opening compressed file with O_DIRECT. ++ */ ++static int ext2_file_open(struct inode * inode, struct file * filp) ++{ ++ if ((filp->f_flags & O_DIRECT) && (EXT2_I(inode)->i_flags & ++ (EXT2_COMPR_FL|EXT2_COMPRBLK_FL))) ++ return -EINVAL; ++ if (!(filp->f_flags & O_LARGEFILE) && inode->i_size > MAX_NON_LFS) ++ return -EFBIG; ++ ++ return 0; ++ } ++#endif /* CONFIG_EXT2_COMPRESS*/ ++ + /* + * We have mostly NULL's here: the current defaults are ok for + * the ext2 filesystem. +@@ -63,7 +559,12 @@ int ext2_fsync(struct file *file, loff_t + const struct file_operations ext2_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, ++#ifdef CONFIG_EXT2_COMPRESS ++ .write = ext2_file_write, ++#else + .write = do_sync_write, ++#endif ++ + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = ext2_ioctl, +@@ -71,7 +572,11 @@ const struct file_operations ext2_file_o + .compat_ioctl = ext2_compat_ioctl, + #endif + .mmap = generic_file_mmap, ++#ifdef CONFIG_EXT2_COMPRESS ++ .open = ext2_file_open, ++#else + .open = dquot_file_open, ++#endif + .release = ext2_release_file, + .fsync = ext2_fsync, + .splice_read = generic_file_splice_read, +--- linux-3.4-rc5/fs/ext2/ioctl.c 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/fs/ext2/ioctl.c 2012-04-30 04:11:03.805143098 -0400 +@@ -7,7 +7,14 @@ + * Universite Pierre et Marie Curie (Paris VI) + */ + ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#include ++#include ++#include ++#else + #include "ext2.h" ++#endif + #include + #include + #include +@@ -17,6 +24,65 @@ + #include + + ++#ifdef CONFIG_EXT2_COMPRESS ++ ++#ifndef MIN ++# define MIN(a,b) ((a) < (b) ? 
(a) : (b)) ++#endif ++ ++#ifdef CONFIG_GZ_HACK ++static int check_name(struct inode *ino) ++{ ++ struct dentry *dentry = list_entry(ino->i_dentry.next, struct dentry, d_alias); ++ if (dentry) ++ if ( ++ ++ (dentry->d_name.len >= 4) && ++ (((dentry->d_name.name[dentry->d_name.len - 2] == 'g') ++ && (dentry->d_name.name[dentry->d_name.len - 1] == 'z') ++ && ((dentry->d_name.name[dentry->d_name.len - 3] == '.') ++ || (dentry->d_name.name[dentry->d_name.len - 4] == '.'))) ++ ++ || ((dentry->d_name.name[dentry->d_name.len - 3] == 't') ++ && (dentry->d_name.name[dentry->d_name.len - 2] == 'g') ++ && (dentry->d_name.name[dentry->d_name.len - 1] == 'z') ++ && (dentry->d_name.name[dentry->d_name.len - 4] == '.') ++ && (dentry->d_name.len >= 5)) ++ ++ || ((dentry->d_name.name[dentry->d_name.len - 3] == 'p') ++ && (dentry->d_name.name[dentry->d_name.len - 2] == 'n') ++ && (dentry->d_name.name[dentry->d_name.len - 1] == 'g') ++ && (dentry->d_name.name[dentry->d_name.len - 4] == '.') ++ && (dentry->d_name.len >= 5)) ++ ++ || ((dentry->d_name.name[dentry->d_name.len - 3] == 'j') ++ && (dentry->d_name.name[dentry->d_name.len - 2] == 'p') ++ && (dentry->d_name.name[dentry->d_name.len - 1] == 'g') ++ && (dentry->d_name.name[dentry->d_name.len - 4] == '.') ++ && (dentry->d_name.len >= 5)) ++ ++ || ((dentry->d_name.name[dentry->d_name.len - 3] == 'b') ++ && (dentry->d_name.name[dentry->d_name.len - 2] == 'z') ++ && (dentry->d_name.name[dentry->d_name.len - 1] == '2') ++ && (dentry->d_name.name[dentry->d_name.len - 4] == '.') ++ && (dentry->d_name.len >= 5)) ++ ++ || ((dentry->d_name.name[dentry->d_name.len - 3] == 'm') ++ && (dentry->d_name.name[dentry->d_name.len - 2] == 'n') ++ && (dentry->d_name.name[dentry->d_name.len - 1] == 'g') ++ && (dentry->d_name.name[dentry->d_name.len - 4] == '.') ++ && (dentry->d_name.len >= 5)) ++ ) ++ ) { ++ return 1; ++ } ++ return 0; ++} ++#endif ++#endif ++ ++ ++ + long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_dentry->d_inode; +@@ -24,6 +90,10 @@ long ext2_ioctl(struct file *filp, unsig + unsigned int flags; + unsigned short rsv_window_size; + int ret; ++#ifdef CONFIG_EXT2_COMPRESS ++ unsigned long datum; ++ int err; ++#endif + + ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg); + +@@ -75,7 +145,127 @@ long ext2_ioctl(struct file *filp, unsig + } + + flags = flags & EXT2_FL_USER_MODIFIABLE; ++#ifdef CONFIG_EXT2_COMPRESS ++ if (S_ISREG (inode->i_mode) || S_ISDIR (inode->i_mode)) { ++ ++ /* pjm 1998-01-14: In previous versions of ++ e2compr, the kernel forbade raising ++ EXT2_ECOMPR_FL from userspace. I can't ++ think of any purpose for forbidding this, ++ and I find it useful to raise ++ EXT2_ECOMPR_FL for testing purposes, so ++ I've removed the forbidding code. */ ++ if (S_ISREG (inode->i_mode) ++ && (EXT2_NOCOMPR_FL ++ & (flags ^ ei->i_flags))) { // mw hint: ^ is a (excluisive OR) ++ /* NOCOMPR_FL can only be changed if ++ nobody else has the file opened. */ ++ /* pjm 1998-02-16: inode->i_count is ++ useless to us because only dentries ++ use inodes now. Unfortunately, ++ there isn't an easy way of finding ++ the equivalent. We'd have to go ++ through all dentries using the ++ inode, and sum their d_count ++ values. Rather than do that, I'd ++ rather get rid of the exclusion ++ constraint. todo. 
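check_name() above spells out each already-compressed suffix (.gz, .tgz, .png, .jpg, .bz2, .mng) character by character against the dentry name. A simplified, table-driven equivalent working on a plain name/length pair is sketched below; it is an illustrative alternative, not what the patch ships:

    #include <stdio.h>
    #include <string.h>

    /* Return 1 if the name ends in a suffix that is already compressed. */
    static int already_compressed(const char *name, size_t len)
    {
            static const char *suffixes[] = { ".gz", ".tgz", ".png", ".jpg", ".bz2", ".mng" };
            size_t i;

            for (i = 0; i < sizeof(suffixes) / sizeof(suffixes[0]); i++) {
                    size_t slen = strlen(suffixes[i]);

                    if (len >= slen + 1 &&
                        memcmp(name + len - slen, suffixes[i], slen) == 0)
                            return 1;
            }
            return 0;
    }

    int main(void)
    {
            const char *names[] = { "photo.jpg", "notes.txt", "kernel.tar.gz" };
            size_t i;

            for (i = 0; i < 3; i++)
                    printf("%s -> %d\n", names[i],
                           already_compressed(names[i], strlen(names[i])));
            return 0;
    }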
*/ ++ //printk("i_count: %i\n", atomic_read(&inode->i_count)); ++ //if (atomic_read(&inode->i_count) > 1) ++ //if (0) ++ if (ext2_get_dcount(inode) > 1) ++ { ++ mutex_unlock(&inode->i_mutex); /*mw*/ ++ return -ETXTBSY; ++ } ++ else { ++ /* pjm 970429: Discarding ++ cached pages is not very ++ clean, but should work. */ ++ /* pjm 980114: Not quite. We ++ should also sync any ++ mappings to buffers first. ++ This isn't very important, ++ as none of the current ++ e2compr programs can ++ trigger this, but todo. */ ++ invalidate_remote_inode (inode); ++ } ++ } ++ ++ if (EXT2_COMPR_FL ++ & (flags ^ ei->i_flags)) { ++ if (flags & EXT2_COMPR_FL) { ++ if (ei->i_flags & EXT2_COMPRBLK_FL) { ++ /* There shouldn't actually be any ++ compressed blocks, AFAIK. However, ++ this is still possible because sometimes ++ COMPRBLK gets raised just to stop ++ us changing cluster size at the wrong ++ time. ++ ++ todo: Call a function that just ++ checks that there are not compressed ++ clusters, and print a warning if any are ++ found. */ ++ } else { ++ int bits = MIN(EXT2_DEFAULT_LOG2_CLU_NBLOCKS, ++ (EXT2_LOG2_MAX_CLUSTER_BYTES ++ - inode->i_sb->s_blocksize_bits)); ++ ++ ei->i_log2_clu_nblocks = bits; ++ ei->i_clu_nblocks = 1 << bits; ++ } ++ ei->i_compr_method = EXT2_DEFAULT_COMPR_METHOD; ++ if (S_ISREG (inode->i_mode)) { ++ //compress ++#ifdef CONFIG_GZ_HACK ++ /* mw: check for .gz-files and similar ++ * I think this is the most clever place for ++ * rejecting files. They remain regular, uncompressed ++ * files and though can be read bypassing all ++ * compression stuff (= fast) :-). And it seems to save ++ * space... somehow */ ++ if (check_name (inode)) ++ { ++ //printk("non-compressable file extension\n"); ++ mutex_unlock(&inode->i_mutex); ++ return 0; ++ } ++#endif ++ //set flags to trigger compression later on ++ flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ } ++ } else if (S_ISREG (inode->i_mode)) { ++ if (ei->i_flags & EXT2_COMPRBLK_FL) { ++ int err; ++ ++ if (ext2_get_dcount(inode) > 1){ ++ mutex_unlock(&inode->i_mutex); //mw ++ return -ETXTBSY; ++ } ++ err = ext2_decompress_inode(inode); ++ if (err) ++ { ++ mutex_unlock(&inode->i_mutex); //mw ++ return err; ++ } ++ } ++ ei->i_flags &= ~EXT2_DIRTY_FL; ++ ei->i_compr_flags &= ~EXT2_CLEANUP_FL; ++ } ++ } ++ } ++#endif + flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE; ++#ifdef CONFIG_EXT2_COMPRESS ++ /* bug fix: scrub 'B' flag from uncompressed files TLL 02/28/07 */ ++ if (!(flags & EXT2_COMPR_FL) && (flags & EXT2_COMPRBLK_FL) ) ++ { ++ flags &= ~EXT2_COMPRBLK_FL; ++ } ++#endif + ei->i_flags = flags; + + ext2_set_inode_flags(inode); +@@ -158,6 +348,184 @@ setversion_out: + mnt_drop_write_file(filp); + return 0; + } ++#ifdef CONFIG_EXT2_COMPRESS ++ case EXT2_IOC_GETCOMPRMETHOD: /* Result means nothing if COMPR_FL is not set */ ++ return put_user (ei->i_compr_method, (long *) arg); ++ case EXT2_IOC_SETCOMPRMETHOD: ++ if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) ++ return -EPERM; ++ if (IS_RDONLY (inode)) ++ return -EROFS; ++ if (get_user (datum, (long*) arg)) ++ return -EFAULT; ++ if (!S_ISREG (inode->i_mode) && !S_ISDIR (inode->i_mode)) ++ return -ENOSYS; ++ /* todo: Allow the below, but set initial value of ++ i_compr_meth at read_inode() time (using default if ++ !/) instead of +c time. Same for cluster ++ size. 
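In practice the flag transitions handled above are driven from userspace through the standard flags ioctl (chattr +c / -c, or e2compr's own tools). A minimal, hedged example of raising the compress flag on a file; it assumes an e2compr kernel, and on failure the errno (for example ETXTBSY while the file is open elsewhere) mirrors the checks above:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>

    int main(int argc, char **argv)
    {
            int fd, flags;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY);
            if (fd < 0) { perror("open"); return 1; }
            if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) { perror("GETFLAGS"); close(fd); return 1; }
            flags |= FS_COMPR_FL;                 /* the bit EXT2_COMPR_FL maps to */
            if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
                    perror("SETFLAGS");           /* e.g. ETXTBSY, EPERM */
            close(fd);
            return 0;
    }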
*/ ++ if ((unsigned) datum >= EXT2_N_METHODS) ++ return -EINVAL; ++ if (ei->i_compr_method != datum) { ++ if ((ei->i_compr_method == EXT2_NEVER_METH) ++ && (ei->i_flags & EXT2_COMPR_FL)) ++ return -EPERM; ++ /* If the previous method was `defer' then ++ take a look at all uncompressed clusters ++ and try to compress them. (pjm 1997-04-16) */ ++ if ((ei->i_compr_method == EXT2_DEFER_METH) ++ && S_ISREG (inode->i_mode)) { ++ ei->i_flags |= EXT2_DIRTY_FL; ++ ei->i_compr_flags |= EXT2_CLEANUP_FL; ++ } ++ if ((datum == EXT2_NEVER_METH) ++ && S_ISREG (inode->i_mode)) { ++ //printk("SETCOMPR\n"); ++ if ((ei->i_flags & EXT2_COMPRBLK_FL)) ++ { ++ /*mw*/ ++ mutex_lock(&inode->i_mutex); ++ if (ext2_get_dcount(inode) > 1){ ++ mutex_unlock(&inode->i_mutex); /*mw*/ ++ return -ETXTBSY; ++ } ++ err = ext2_decompress_inode(inode); ++ mutex_unlock(&inode->i_mutex); ++ if ( err < 0) ++ return err; ++ } ++ ei->i_flags &= ~EXT2_DIRTY_FL; ++ ei->i_compr_flags &= ~EXT2_CLEANUP_FL; ++ } ++ ei->i_compr_method = datum; ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty(inode); ++ } ++#ifdef CONFIG_KMOD ++ if (!ext2_algorithm_table[ext2_method_table[datum].alg].avail) { ++ char str[32]; ++ ++ sprintf(str, "ext2-compr-%s", ext2_algorithm_table[ext2_method_table[datum].alg].name); ++ request_module(str); ++ } ++#endif ++ datum = ((datum < EXT2_N_METHODS) ++ && (ext2_algorithm_table[ext2_method_table[datum].alg].avail)); ++ return put_user(datum, (long *)arg); ++ ++ case EXT2_IOC_GETCLUSTERBIT: ++ if (get_user (datum, (long*) arg)) ++ return -EFAULT; ++ if (!S_ISREG (inode->i_mode)) ++ return -ENOSYS; ++ /* We don't do `down(&inode->i_sem)' here because ++ there's no way for userspace to do the ++ corresponding up(). Userspace must rely on ++ EXT2_NOCOMPR_FL if it needs to lock. */ ++ err = ext2_cluster_is_compressed (inode, datum); ++ if (err < 0) ++ return err; ++ return put_user ((err ? 1 : 0), ++ (long *) arg); ++ ++ case EXT2_IOC_RECOGNIZE_COMPRESSED: ++ if (get_user (datum, (long*) arg)) ++ return -EFAULT; ++ if (!S_ISREG (inode->i_mode)) ++ return -ENOSYS; ++ if (IS_RDONLY (inode)) ++ return -EROFS; ++ return ext2_recognize_compressed (inode, datum); ++ ++ case EXT2_IOC_GETCLUSTERSIZE: ++ /* Result means nothing if COMPR_FL is not set (until ++ SETCLUSTERSIZE w/o COMPR_FL is implemented; ++ todo). */ ++ if (!S_ISREG (inode->i_mode) ++ && !S_ISDIR (inode->i_mode)) ++ return -ENOSYS; ++ return put_user (ei->i_clu_nblocks, (long *) arg); ++ ++ case EXT2_IOC_GETFIRSTCLUSTERSIZE: ++ /* Result means nothing if COMPR_FL is not set (until ++ SETCLUSTERSIZE w/o COMPR_FL is implemented; ++ todo). */ ++ if (!S_ISREG (inode->i_mode) ++ && !S_ISDIR (inode->i_mode)) ++ return -ENOSYS; ++ return put_user (ext2_first_cluster_nblocks(inode), (long *) arg); ++ ++ case EXT2_IOC_SETCLUSTERSIZE: ++ if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) ++ return -EPERM; ++ if (IS_RDONLY (inode)) ++ return -EROFS; ++ if (get_user (datum, (long *) arg)) ++ return -EFAULT; ++ if (!S_ISREG (inode->i_mode) ++ && !S_ISDIR (inode->i_mode)) ++ return -ENOSYS; ++ ++ /* These are the only possible cluster sizes. The ++ cluster size must be a power of two so that ++ clusters don't straddle address (aka indirect) ++ blocks. At the moment, the upper limit is constrained ++ by how much memory is allocated for de/compression. ++ Also, the gzip algorithms have some optimisations ++ that assume tht the input is no more than 32KB, ++ and in compress.c we would need to zero more bits ++ of head->holemap. 
(In previous releases, the file ++ format was limited to 32 blocks and under 64KB.) */ ++// #if EXT2_MAX_CLUSTER_BLOCKS > 32 || EXT2_MAX_CLUSTER_NBYTES > 32768 ++// # error "This code not updated for cluster size yet." ++// #endif ++ switch (datum) { ++ case (1 << 2): datum = 2; break; ++ case (1 << 3): datum = 3; break; ++ case (1 << 4): datum = 4; break; ++ case (1 << 5): datum = 5; break; ++ default: return -EINVAL; ++ } ++ ++ assert (ei->i_clu_nblocks == (1 << ei->i_log2_clu_nblocks)); ++ if (datum == ei->i_log2_clu_nblocks) ++ return 0; ++ ++ if (ei->i_flags & EXT2_ECOMPR_FL) ++ return -EPERM; ++ if (!(ei->i_flags & EXT2_COMPR_FL)) ++ return -ENOSYS; ++ ++ /* We currently lack a mechanism to change the cluster ++ size if there are already some compressed clusters. ++ The compression must be done in userspace ++ (e.g. with the e2compress program) instead. */ ++ if (ei->i_flags & EXT2_COMPRBLK_FL) ++ return -ENOSYS; ++ ++ if (datum + inode->i_sb->s_blocksize_bits ++ > EXT2_LOG2_MAX_CLUSTER_BYTES) ++ return -EINVAL; ++ ++ ei->i_log2_clu_nblocks = datum; ++ ei->i_clu_nblocks = 1 << datum; ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty(inode); ++ return 0; ++ ++ case EXT2_IOC_GETCOMPRRATIO: ++ if (!S_ISREG (inode->i_mode)) ++ return -ENOSYS; ++ if (ei->i_flags & EXT2_ECOMPR_FL) ++ return -EPERM; ++ if ((long) (datum = ext2_count_blocks (inode)) < 0) ++ return datum; ++ if ((err = put_user ((long) datum, (long*) arg))) ++ return err; ++ return put_user ((long) inode->i_blocks, (long*) arg + 1); ++ ++ ++#endif + default: + return -ENOTTY; + } +--- linux-3.4-rc5/fs/ext2/ext2.h 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/fs/ext2/ext2.h 2012-04-30 07:30:58.249092266 -0400 +@@ -378,6 +381,7 @@ struct ext2_inode { + #define EXT2_MOUNT_MINIX_DF 0x000080 /* Mimics the Minix statfs */ + #define EXT2_MOUNT_NOBH 0x000100 /* No buffer_heads */ + #define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ ++#define EXT2_MOUNT_FORCE_COMPAT 0x000400 /* e2compr: Mount despite incompatibilities */ + #define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ + #define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ + #define EXT2_MOUNT_XIP 0x010000 /* Execute in place */ +@@ -543,8 +547,25 @@ struct ext2_super_block { + #define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff + + #define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR ++ ++/* ++ * e2compr specific ++ */ ++ ++#define EXT2_GRAIN_SIZE 1024 ++#define EXT2_NOCOMPR_FL FS_NOCOMP_FL /* Access raw data */ ++ ++ ++#ifdef CONFIG_EXT2_COMPRESS ++#define EXT2_FEATURE_INCOMPAT_SUPP (EXT2_FEATURE_INCOMPAT_COMPRESSION| \ ++ EXT2_FEATURE_INCOMPAT_FILETYPE| \ ++ EXT2_FEATURE_INCOMPAT_META_BG) ++#else + #define EXT2_FEATURE_INCOMPAT_SUPP (EXT2_FEATURE_INCOMPAT_FILETYPE| \ + EXT2_FEATURE_INCOMPAT_META_BG) ++#endif ++ ++ + #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT2_FEATURE_RO_COMPAT_BTREE_DIR) +@@ -668,6 +690,12 @@ struct ext2_inode_info { + struct ext2_block_alloc_info *i_block_alloc_info; + + __u32 i_dir_start_lookup; ++#ifdef CONFIG_EXT2_COMPRESS ++ __u8 i_log2_clu_nblocks; ++ __u8 i_clu_nblocks; ++ __u8 i_compr_method; ++ __u8 i_compr_flags; ++#endif + #ifdef CONFIG_EXT2_FS_XATTR + /* + * Extended attributes can be read independently of the main file +@@ -757,6 +785,7 @@ extern void ext2_set_inode_flags(struct + extern void ext2_get_inode_flags(struct ext2_inode_info *); + extern int ext2_fiemap(struct inode *inode, struct 
fiemap_extent_info *fieinfo, + u64 start, u64 len); ++extern void ext2_truncate_blocks(struct inode *inode, loff_t offset); + + /* ioctl.c */ + extern long ext2_ioctl(struct file *, unsigned int, unsigned long); +--- linux-3.4-rc5/include/linux/ext2_fs.h 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/include/linux/ext2_fs.h 2012-04-30 04:11:03.818143098 -0400 +@@ -39,4 +39,16 @@ static inline u64 ext2_image_size(void * + le32_to_cpup((__le32 *)(p + EXT2_SB_BSIZE_OFFSET)); + } + ++#ifndef __KERNEL__ ++/* This simplifies things for user programs (notably e2fsprogs) that ++ must compile whether or not is present, but ++ would prefer to include it. Presumably the file is present if the ++ user has this version of ext2_fs.h. */ ++ ++# /* Do not remove this comment. */ include ++ ++/* The comment between `#' and `include' prevents mkdep from generating ++ a dependency on ext2_fs_c.h. */ ++#endif ++ + #endif /* _LINUX_EXT2_FS_H */ +--- linux-3.4-rc5/fs/fcntl.c 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/fs/fcntl.c 2012-04-30 04:11:03.820143098 -0400 +@@ -25,6 +25,12 @@ + #include + #include + ++#ifdef CONFIG_EXT2_COMPRESS ++//mw: deny O_DIRECT on file with compression ++#include ++#include "ext2/ext2.h" ++#endif ++ + void set_close_on_exec(unsigned int fd, int flag) + { + struct files_struct *files = current->files; +@@ -171,6 +177,16 @@ static int setfl(int fd, struct file * f + if (!filp->f_mapping || !filp->f_mapping->a_ops || + !filp->f_mapping->a_ops->direct_IO) + return -EINVAL; ++ ++#ifdef CONFIG_EXT2_COMPRESS ++ //mw: if we have a compressed ext2 file: deny! ++ // TODO: maybe check fs-type first! ++ //assert(!(EXT2_I(inode)->i_flags & (EXT2_COMPR_FL|EXT2_COMPRBLK_FL))); ++ if (EXT2_I(inode)->i_flags & (EXT2_COMPR_FL|EXT2_COMPRBLK_FL)) ++ { ++ return -EINVAL; ++ } ++#endif + } + + if (filp->f_op && filp->f_op->check_flags) +--- linux-3.4-rc5/mm/truncate.c 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/mm/truncate.c 2012-05-02 16:13:52.383974864 -0400 +@@ -22,6 +22,9 @@ + #include + #include "internal.h" + ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#endif + + /** + * do_invalidatepage - invalidate part or all of a page +@@ -595,6 +598,11 @@ int vmtruncate(struct inode *inode, loff + if (error) + return error; + ++#ifdef CONFIG_EXT2_COMPRESS ++ if ((inode->i_op && inode->i_op->truncate) && ++ ((strcmp(inode->i_sb->s_type->name, "ext2") != 0) || ++ (!(EXT2_I(inode)->i_flags & EXT2_COMPRBLK_FL)))) ++#endif + truncate_setsize(inode, newsize); + if (inode->i_op->truncate) + inode->i_op->truncate(inode); +--- linux-3.4-rc5/mm/swapfile.c 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/mm/swapfile.c 2012-04-30 04:11:03.822143098 -0400 +@@ -31,6 +31,10 @@ + #include + #include + #include ++#ifdef CONFIG_EXT2_COMPRESS ++#include ++#endif ++ + + #include + #include +@@ -2060,6 +2064,24 @@ SYSCALL_DEFINE2(swapon, const char __use + } + + inode = mapping->host; ++ ++#ifdef CONFIG_EXT2_COMPRESS ++ /* ++ * Swapping not supported for e2compressed files. ++ * (Actually, this code is pretty useless because we ++ * should get an error later anyway because of the ++ * holes.) Yes, this is pretty horrible code... I'll ++ * improve it later. 
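Taken together, ext2_file_open(), the setfl() hook in fs/fcntl.c and the swapon() check above enforce one rule: compressed files cannot be accessed with O_DIRECT and cannot back swap. A quick way to observe the open-time check, assuming a file that already carries the compress flag on an e2compr mount:

    #define _GNU_SOURCE           /* for O_DIRECT */
    #include <stdio.h>
    #include <errno.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            int fd;

            if (argc < 2)
                    return 1;
            fd = open(argv[1], O_RDONLY | O_DIRECT);
            if (fd < 0)
                    /* Expected for a compressed file: EINVAL from ext2_file_open(). */
                    printf("open(O_DIRECT) failed: %s\n", strerror(errno));
            else {
                    printf("open(O_DIRECT) succeeded (file not compressed?)\n");
                    close(fd);
            }
            return 0;
    }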
++ */ ++ if ((strcmp(inode->i_sb->s_type->name, "ext2") == 0) ++ && (EXT2_I(inode)->i_flags & EXT2_COMPRBLK_FL)) ++ { ++ printk("Assertion: Error NO swap SWAP implemented!\n"); ++ error = -EINVAL; ++ goto bad_swap; ++ } ++#endif ++ + /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ + error = claim_swapfile(p, inode); + if (unlikely(error)) +--- linux-3.4-rc5/mm/filemap.c 2012-04-29 18:19:10.000000000 -0400 ++++ linux-3.4-rc5-e2c/mm/filemap.c 2012-04-30 04:11:03.824143098 -0400 +@@ -43,6 +43,10 @@ + + #include + ++#ifdef CONFIG_EXT2_COMPRESS ++# include ++#endif ++ + /* + * Shared mappings implemented 30.11.1994. It's not fully working yet, + * though. +@@ -277,7 +281,19 @@ int filemap_fdatawait_range(struct addre + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + unsigned i; ++#ifdef CONFIG_EXT2_COMPRESS ++/* ++ * I'm not sure that this is right. It has been reworked considerably since ++ * 2.6.5. - whitpa ++ */ ++ struct inode *inode = mapping->host; ++ //printk("wait_on_page_writeback_range\n"); + ++ if ((strcmp(inode->i_sb->s_type->name, "ext2") != 0) ++ || (atomic_read(&inode->i_mutex.count) > 0) ++ || (EXT2_I(inode)->i_compr_flags & ++ EXT2_OSYNC_INODE)) ++#endif + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + +@@ -1162,6 +1178,15 @@ page_ok: + } + nr = nr - offset; + ++#ifdef CONFIG_EXT2_COMPRESS ++ lock_page(page); ++ //check again: after locking still uptodate? ++ if(!PageUptodate(page)){ ++ unlock_page(page); ++ goto page_not_up_to_date; ++ } ++#endif ++ + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. +@@ -1193,6 +1218,10 @@ page_ok: + offset &= ~PAGE_CACHE_MASK; + prev_offset = offset; + ++#ifdef CONFIG_EXT2_COMPRESS ++ unlock_page(page); ++#endif ++ + page_cache_release(page); + if (ret == nr && desc->count) + continue; +@@ -1202,7 +1231,12 @@ page_not_up_to_date: + /* Get exclusive access to the page ... */ + error = lock_page_killable(page); + if (unlikely(error)) ++ { ++ printk("Readpage Error: mw: page locking failed with code: %i\n", error); ++ printk("Readpage Error: mw: might happen as page was locked 'killable'\n"); ++ printk("Readpage Error: mw: was reading app killed?\n"); + goto readpage_error; ++ } + + page_not_up_to_date_locked: + /* Did it get truncated before we got the lock? */ +@@ -1233,13 +1267,17 @@ readpage: + page_cache_release(page); + goto find_page; + } ++ printk("Readpage Error: fs-specific readpage failed with code: %i\n", error); + goto readpage_error; + } + + if (!PageUptodate(page)) { + error = lock_page_killable(page); + if (unlikely(error)) ++ { ++ printk("Readpage Error: page was not uptodate after read. page locking failed with code: %i\n", error); + goto readpage_error; ++ } + if (!PageUptodate(page)) { + if (page->mapping == NULL) { + /* +@@ -1252,6 +1290,7 @@ readpage: + unlock_page(page); + shrink_readahead_size_eio(filp, ra); + error = -EIO; ++ printk("Readpage Error: page was not uptodate after read AND page locked. failed with code: %i\n", error); + goto readpage_error; + } + unlock_page(page); +@@ -1263,6 +1302,7 @@ readpage_error: + /* UHHUH! A synchronous read error occurred. 
Report it */ + desc->error = error; + page_cache_release(page); ++ printk("Readpage Error\n"); + goto out; + + no_cached_page: diff --git a/3.3.8/lschlv2.patch b/3.3.8/lschlv2.patch new file mode 100644 index 0000000..40ef6be --- /dev/null +++ b/3.3.8/lschlv2.patch @@ -0,0 +1,256 @@ +--- a/arch/arm/mach-kirkwood/include/mach/system.h ++++ b/arch/arm/mach-kirkwood/include/mach/system.h +@@ -9,6 +9,8 @@ + #ifndef __ASM_ARCH_SYSTEM_H + #define __ASM_ARCH_SYSTEM_H + ++#include ++#include + #include + + static inline void arch_idle(void) +--- a/arch/arm/mach-kirkwood/Kconfig ++++ b/arch/arm/mach-kirkwood/Kconfig +@@ -87,6 +87,12 @@ + Say 'Y' here if you want your kernel to support the + HP t5325 Thin Client. + ++config MACH_LINKSTATION_CHLV2 ++ bool "Buffalo LS-CHLv2 Series" ++ help ++ Say 'Y' here if you want your kernel to support the ++ Buffalo LS-CHLv2 Series. ++ + endmenu + + endif +--- a/arch/arm/mach-kirkwood/lschlv2-setup.c ++++ b/arch/arm/mach-kirkwood/lschlv2-setup.c +@@ -0,0 +1,210 @@ ++/* ++ * arch/arm/mach-kirkwood/lschlv2-setup.c ++ * ++ * Buffalo LS Kirkwood Series Setup ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * License version 2. This program is licensed "as is" without any ++ * warranty of any kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "include/mach/system.h" ++#include ++#include "common.h" ++#include "mpp.h" ++ ++/***************************************************************************** ++ * 512KB SPI Flash on BOOT Device ++ ****************************************************************************/ ++static struct mtd_partition lschlv2_partitions[] = { ++ { ++ .name = "u-boot", ++ .offset = 0x00000, ++ .size = 0x70000, ++ .mask_flags = MTD_WRITEABLE, ++ }, ++ { ++ .name = "u-boot env", ++ .offset = MTDPART_OFS_APPEND, ++ .size = 0x10000, ++ } ++}; ++ ++static struct flash_platform_data lschlv2_spi_slave_data = { ++ .type = "m25p40", ++ .parts = lschlv2_partitions, ++ .nr_parts = ARRAY_SIZE(lschlv2_partitions), ++}; ++ ++static struct spi_board_info __initdata lschlv2_spi_slave_info[] = { ++ { ++ .modalias = "m25p80", ++ .platform_data = &lschlv2_spi_slave_data, ++ .irq = -1, ++ .max_speed_hz = 20000000, ++ .bus_num = 0, ++ .chip_select = 0, ++ } ++}; ++ ++static struct mv643xx_eth_platform_data lschlv2_ge00_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(0), ++}; ++ ++static struct mv643xx_eth_platform_data lschlv2_ge01_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(8), ++}; ++ ++static unsigned int lschlv2_mpp_config[] __initdata = { ++ MPP10_GPO, /* HDD Power */ ++ MPP11_GPIO, /* USB Vbus Power */ ++ MPP18_GPO, /* FAN High on:0, off:1 */ ++ MPP19_GPO, /* FAN Low on:0, off:1 */ ++ MPP36_GPIO, /* FUNC LED */ ++ MPP37_GPIO, /* ALARM LED */ ++ MPP38_GPIO, /* INFO LED */ ++ MPP39_GPIO, /* POWER LED */ ++ MPP40_GPIO, /* FAN LOCK */ ++ MPP41_GPIO, /* FUNC SW */ ++ MPP42_GPIO, /* POWER SW */ ++ MPP43_GPIO, /* POWER AUTO SW */ ++ MPP48_GPIO, /* FUNC RED LED */ ++ MPP49_GPIO, /* UART EN */ ++ 0 ++}; ++ ++static struct mv_sata_platform_data lschlv2_sata_data = { ++ .n_ports = 1, ++}; ++ ++static struct gpio_led lschlv2_led_pins[] = { ++ { ++ .name = "func", ++ .gpio = 36, ++ .active_low = 1, ++ }, ++ { ++ .name = "alarm", ++ .gpio = 37, ++ .active_low = 1, ++ }, ++ { ++ .name = "info", ++ .gpio = 38, ++ .active_low = 1, ++ }, ++ { ++ .name = "power", ++ .gpio = 39, ++ .default_trigger 
= "default-on", ++ .active_low = 1, ++ }, ++ { ++ .name = "func2", ++ .gpio = 48, ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_led_platform_data lschlv2_led_data = { ++ .leds = lschlv2_led_pins, ++ .num_leds = ARRAY_SIZE(lschlv2_led_pins), ++}; ++ ++static struct platform_device lschlv2_leds = { ++ .name = "leds-gpio", ++ .id = -1, ++ .dev = { ++ .platform_data = &lschlv2_led_data, ++ } ++}; ++ ++#define LSCHLv2_GPIO_USB_VBUS_EN 11 ++#define LSCHLv2_GPIO_KEY_FUNC 41 ++ ++static struct gpio_keys_button lschlv2_buttons[] = { ++ { ++ .code = KEY_OPTION, ++ .gpio = LSCHLv2_GPIO_KEY_FUNC, ++ .desc = "Function Button", ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_keys_platform_data lschlv2_button_data = { ++ .buttons = lschlv2_buttons, ++ .nbuttons = ARRAY_SIZE(lschlv2_buttons), ++}; ++ ++static struct platform_device lschlv2_button_device = { ++ .name = "gpio-keys", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lschlv2_button_data, ++ }, ++}; ++ ++static void lschlv2_power_off(void) ++{ ++ arch_reset(0, NULL); ++} ++ ++static void __init lschlv2_init(void) ++{ ++ /* ++ * Basic setup. Needs to be called early. ++ */ ++ kirkwood_init(); ++ kirkwood_mpp_conf(lschlv2_mpp_config); ++ ++ kirkwood_uart0_init(); ++ ++ if (gpio_request(LSCHLv2_GPIO_USB_VBUS_EN, "USB Power Enable") != 0 || ++ gpio_direction_output(LSCHLv2_GPIO_USB_VBUS_EN, 1) != 0) ++ printk(KERN_ERR "can't set up USB Power Enable\n"); ++ kirkwood_ehci_init(); ++ ++ kirkwood_ge00_init(&lschlv2_ge00_data); ++ kirkwood_ge01_init(&lschlv2_ge01_data); ++ ++ kirkwood_sata_init(&lschlv2_sata_data); ++ ++ kirkwood_spi_init(); ++ ++ platform_device_register(&lschlv2_leds); ++ platform_device_register(&lschlv2_button_device); ++ ++ spi_register_board_info(lschlv2_spi_slave_info, ++ ARRAY_SIZE(lschlv2_spi_slave_info)); ++ ++ /* register power-off method */ ++ pm_power_off = lschlv2_power_off; ++ ++ pr_info("%s: finished\n", __func__); ++} ++ ++ ++ ++MACHINE_START(LINKSTATION_CHLV2, "Buffalo Linkstation LS-CHLv2") ++ .atag_offset = 0x100, ++ .init_machine = lschlv2_init, ++ .map_io = kirkwood_map_io, ++ .init_early = kirkwood_init_early, ++ .init_irq = kirkwood_init_irq, ++ .timer = &kirkwood_timer, ++MACHINE_END +--- a/arch/arm/mach-kirkwood/Makefile ++++ b/arch/arm/mach-kirkwood/Makefile +@@ -20,3 +20,4 @@ + obj-$(CONFIG_MACH_T5325) += t5325-setup.o ++obj-$(CONFIG_MACH_LINKSTATION_CHLV2) += lschlv2-setup.o + + obj-$(CONFIG_CPU_IDLE) += cpuidle.o +--- a/arch/arm/mach-kirkwood/common.c ++++ b/arch/arm/mach-kirkwood/common.c +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + #include "common.h" + + /***************************************************************************** diff --git a/3.3.8/net-netfilter-IFWLOG-2.6.35-buildfix.patch b/3.3.8/net-netfilter-IFWLOG-2.6.35-buildfix.patch new file mode 100644 index 0000000..99d4d06 --- /dev/null +++ b/3.3.8/net-netfilter-IFWLOG-2.6.35-buildfix.patch @@ -0,0 +1,32 @@ +--- linux-2.6.35-rc6-git-mnb0.1/net/ipv4/netfilter/ipt_IFWLOG.c.orig 2010-07-30 21:17:30.000000000 +0300 ++++ linux-2.6.35-rc6-git-mnb0.1/net/ipv4/netfilter/ipt_IFWLOG.c 2010-07-31 13:46:33.834611944 +0300 +@@ -135,7 +135,7 @@ static void ipt_IFWLOG_packet(const stru + } + + static unsigned int ipt_IFWLOG_target(struct sk_buff *skb, +- const struct xt_target_param *target_param) ++ const struct xt_action_param *target_param) + { + const struct ipt_IFWLOG_info *info = target_param->targinfo; + +@@ -144,17 +144,17 @@ static unsigned int ipt_IFWLOG_target(st + return IPT_CONTINUE; + } 
+ +-static bool ipt_IFWLOG_checkentry(const struct xt_tgchk_param *tgchk_param) ++static int ipt_IFWLOG_checkentry(const struct xt_tgchk_param *tgchk_param) + { + const struct ipt_IFWLOG_info *info = tgchk_param->targinfo; + + if (info->prefix[sizeof(info->prefix)-1] != '\0') { + DEBUGP("IFWLOG: prefix term %i\n", + info->prefix[sizeof(info->prefix)-1]); +- return false; ++ return -EINVAL; + } + +- return true; ++ return 0; + } + + static struct xt_target ipt_IFWLOG = { diff --git a/3.3.8/net-netfilter-IFWLOG-2.6.37-buildfix.patch b/3.3.8/net-netfilter-IFWLOG-2.6.37-buildfix.patch new file mode 100644 index 0000000..0ae95aa --- /dev/null +++ b/3.3.8/net-netfilter-IFWLOG-2.6.37-buildfix.patch @@ -0,0 +1,15 @@ + + net/ipv4/netfilter/ipt_IFWLOG.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- linux-2.6.37-rc3-git1-tmb0.3/net/ipv4/netfilter/ipt_IFWLOG.c.orig 2010-11-24 21:58:36.000000000 +0200 ++++ linux-2.6.37-rc3-git1-tmb0.3/net/ipv4/netfilter/ipt_IFWLOG.c 2010-11-25 13:08:55.719379646 +0200 +@@ -141,7 +141,7 @@ static unsigned int ipt_IFWLOG_target(st + + ipt_IFWLOG_packet(skb, target_param->in, target_param->out, info); + +- return IPT_CONTINUE; ++ return XT_CONTINUE; + } + + static int ipt_IFWLOG_checkentry(const struct xt_tgchk_param *tgchk_param) diff --git a/3.3.8/net-netfilter-IFWLOG-mdv.patch b/3.3.8/net-netfilter-IFWLOG-mdv.patch new file mode 100644 index 0000000..e5b9c92 --- /dev/null +++ b/3.3.8/net-netfilter-IFWLOG-mdv.patch @@ -0,0 +1,264 @@ +ipt_IFWLOG: Mandriva changes + +This patch holds all the Mandriva changes done in ipt_IFWLOG +netfilter module. + +This work is mostly done by Thomas Backlund, Herton R. Krzesinski +and Luiz Fernando N. Capitulino. + +Signed-off-by: Luiz Fernando N. Capitulino +Signed-off-by: Herton Ronaldo Krzesinski + +--- + include/linux/netfilter_ipv4/Kbuild | 1 + include/linux/netfilter_ipv4/ipt_IFWLOG.h | 23 +++++- + net/ipv4/netfilter/ipt_IFWLOG.c | 108 +++++++++++++++--------------- + 3 files changed, 77 insertions(+), 55 deletions(-) + +diff -p -up linux-2.6.28/include/linux/netfilter_ipv4/ipt_IFWLOG.h.orig linux-2.6.28/include/linux/netfilter_ipv4/ipt_IFWLOG.h +--- linux-2.6.28/include/linux/netfilter_ipv4/ipt_IFWLOG.h.orig 2008-12-12 10:55:07.000000000 -0500 ++++ linux-2.6.28/include/linux/netfilter_ipv4/ipt_IFWLOG.h 2008-12-12 10:56:30.000000000 -0500 +@@ -1,10 +1,25 @@ +-#ifndef _IPT_IFWLOG_H +-#define _IPT_IFWLOG_H ++#ifndef _LINUX_IPT_IFWLOG_H ++#define _LINUX_IPT_IFWLOG_H + + #ifndef NETLINK_IFWLOG +-#define NETLINK_IFWLOG 19 ++#define NETLINK_IFWLOG 20 + #endif + ++#ifndef __KERNEL__ ++/* Multicast groups - backwards compatiblility for userspace */ ++#define IFWLOG_NLGRP_NONE 0x00000000 ++#define IFWLOG_NLGRP_DEF 0x00000001 /* default message group */ ++#endif ++ ++enum { ++ IFWLOGNLGRP_NONE, ++#define IFWLOGNLGRP_NONE IFWLOGNLGRP_NONE ++ IFWLOGNLGRP_DEF, ++#define IFWLOGNLGRP_DEF IFWLOGNLGRP_DEF ++ __IFWLOGNLGRP_MAX ++}; ++#define IFWLOGNLGRP_MAX (__IFWLOGNLGRP_MAX - 1) ++ + #define PREFSIZ 32 + + struct nl_msg { /* Netlink message */ +@@ -23,4 +38,4 @@ struct ipt_IFWLOG_info { + char prefix[PREFSIZ]; + }; + +-#endif /* _IPT_IFWLOG_H */ ++#endif /* _LINUX_IPT_IFWLOG_H */ +diff -p -up linux-2.6.28/net/ipv4/netfilter/ipt_IFWLOG.c.orig linux-2.6.28/net/ipv4/netfilter/ipt_IFWLOG.c +--- linux-2.6.28/net/ipv4/netfilter/ipt_IFWLOG.c.orig 2008-12-12 10:55:07.000000000 -0500 ++++ linux-2.6.28/net/ipv4/netfilter/ipt_IFWLOG.c 2008-12-12 10:57:16.000000000 -0500 +@@ -4,6 +4,14 @@ + * This program is free software; you can redistribute 
it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. ++ * ++ * 2007-10-10 Thomas Backlund : build fixes for 2.6.22.9 ++ * 2007-11-11 Herton Krzesinski : build fixes for 2.6.24-rc ++ * 2007-12-03 Luiz Capitulino : v1.1 ++ * - Better multicast group usage ++ * - Coding style fixes ++ * - Do not return -EINVAL by default in ipt_ifwlog_init() ++ * - Minor refinements + */ + + #include +@@ -19,12 +27,10 @@ + #include + + #include ++#include + #include + #include + +-MODULE_LICENSE("GPL"); +-MODULE_AUTHOR("Samir Bellabes "); +-MODULE_DESCRIPTION("Interactive firewall logging and module"); + + #if 0 + #define DEBUGP PRINTR +@@ -36,44 +42,41 @@ MODULE_DESCRIPTION("Interactive firewall + + static struct sock *nl; + +-#define GROUP 10 +- + /* send struct to userspace */ +-static void send_packet(struct nl_msg msg) ++static void send_packet(const struct nl_msg *msg) + { + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; ++ unsigned int size; + +- skb = alloc_skb(NLMSG_SPACE(sizeof(struct nl_msg)), GFP_ATOMIC); ++ size = NLMSG_SPACE(sizeof(*msg)); ++ skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + PRINTR(KERN_WARNING "IFWLOG: OOM can't allocate skb\n"); +- return ; ++ return; + } + +- nlh = NLMSG_PUT(skb, 0, 0, 0, sizeof(struct nl_msg) - sizeof(*nlh)); ++ nlh = NLMSG_PUT(skb, 0, 0, 0, size - sizeof(*nlh)); + +- memcpy(NLMSG_DATA(nlh), (const void*)&msg, sizeof(struct nl_msg)); ++ memcpy(NLMSG_DATA(nlh), (const void *) msg, sizeof(*msg)); + + NETLINK_CB(skb).pid = 0; /* from kernel */ +- NETLINK_CB(skb).dst_pid = 0; /* multicast */ +- NETLINK_CB(skb).dst_group = 10; ++ NETLINK_CB(skb).dst_group = IFWLOGNLGRP_DEF; + + if (nl) { + DEBUGP(KERN_WARNING + "IFWLOG: nlmsg_len=%ld\nnlmsg_type=%d nlmsg_flags=%d\nnlmsg_seq=%ld nlmsg_pid = %ld\n", + (long)nlh->nlmsg_len, nlh->nlmsg_type, nlh->nlmsg_flags, + (long)nlh->nlmsg_seq, (long)nlh->nlmsg_pid); +- DEBUGP(KERN_WARNING "prefix : %s\n", msg.prefix); ++ DEBUGP(KERN_WARNING "prefix : %s\n", msg->prefix); + +- netlink_broadcast(nl, skb, 0, 10, GFP_ATOMIC); +- return ; ++ netlink_broadcast(nl, skb, 0, IFWLOGNLGRP_DEF, GFP_ATOMIC); ++ return; + } + +- nlmsg_failure: +- if (skb) +- kfree_skb(skb); +- PRINTR(KERN_WARNING "IFWLOG: Error sending netlink packet\n"); +- return ; ++nlmsg_failure: ++ kfree_skb(skb); ++ PRINTR(KERN_WARNING "IFWLOG: Error sending netlink packet\n"); + } + + /* fill struct for userspace */ +@@ -128,73 +131,76 @@ static void ipt_IFWLOG_packet(const stru + do_gettimeofday((struct timeval *)&tv); + msg.timestamp_sec = tv.tv_sec; + +- send_packet(msg); ++ send_packet(&msg); + } + +-static unsigned int ipt_IFWLOG_target(struct sk_buff **pskb, +- const struct net_device *in, +- const struct net_device *out, +- unsigned int hooknum, +- const void *targinfo, +- void *userinfo) ++static unsigned int ipt_IFWLOG_target(struct sk_buff *skb, ++ const struct xt_target_param *target_param) + { +- const struct ipt_IFWLOG_info *info = targinfo; ++ const struct ipt_IFWLOG_info *info = target_param->targinfo; + +- ipt_IFWLOG_packet(*pskb, in, out, info); ++ ipt_IFWLOG_packet(skb, target_param->in, target_param->out, info); + + return IPT_CONTINUE; + } + +-static int ipt_IFWLOG_checkentry(const char *tablename, +- const struct ipt_entry *e, +- void *targinfo, +- unsigned int targinfosize, +- unsigned int hook_mask) ++static bool ipt_IFWLOG_checkentry(const struct xt_tgchk_param *tgchk_param) + { +- const struct ipt_IFWLOG_info *info = targinfo; ++ const struct ipt_IFWLOG_info 
*info = tgchk_param->targinfo; + + if (info->prefix[sizeof(info->prefix)-1] != '\0') { + DEBUGP("IFWLOG: prefix term %i\n", + info->prefix[sizeof(info->prefix)-1]); +- return 0; ++ return false; + } + +- return 1; ++ return true; + } + +-static struct ipt_target ipt_IFWLOG = { ++static struct xt_target ipt_IFWLOG = { + .name = "IFWLOG", ++ .family = AF_INET, + .target = ipt_IFWLOG_target, + .targetsize = sizeof(struct ipt_IFWLOG_info), + .checkentry = ipt_IFWLOG_checkentry, + .me = THIS_MODULE, + }; + +-static int __init init(void) ++static int __init ipt_ifwlog_init(void) + { +- nl = (struct sock*) netlink_kernel_create(NETLINK_IFWLOG, GROUP, NULL, THIS_MODULE); +- if (!nl) { +- PRINTR(KERN_WARNING "IFWLOG: cannot create netlink socket\n"); +- return -EINVAL; +- } ++ int err; + +- if (ipt_register_target(&ipt_IFWLOG)) { ++ nl = netlink_kernel_create(&init_net, NETLINK_IFWLOG, IFWLOGNLGRP_MAX, ++ NULL, NULL, THIS_MODULE); ++ if (!nl) { ++ PRINTR(KERN_WARNING "IFWLOG: cannot create netlink socket\n"); ++ return -ENOMEM; ++ } ++ ++ err = xt_register_target(&ipt_IFWLOG); ++ if (err) { + if (nl && nl->sk_socket) + sock_release(nl->sk_socket); +- return -EINVAL; ++ return err; + } + + PRINTR(KERN_INFO "IFWLOG: register target\n"); + return 0; + } + +-static void __exit fini(void) ++static void __exit ipt_ifwlog_fini(void) + { + if (nl && nl->sk_socket) +- sock_release(nl->sk_socket); ++ sock_release(nl->sk_socket); + PRINTR(KERN_INFO "IFWLOG: unregister target\n"); +- ipt_unregister_target(&ipt_IFWLOG); ++ xt_unregister_target(&ipt_IFWLOG); + } + +-module_init(init); +-module_exit(fini); ++module_init(ipt_ifwlog_init); ++module_exit(ipt_ifwlog_fini); ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Samir Bellabes "); ++MODULE_AUTHOR("Luiz Capitulino "); ++MODULE_DESCRIPTION("Interactive firewall logging and module"); ++MODULE_VERSION("v1.1"); +--- linux/include/linux/netfilter_ipv4/Kbuild.net-netfilter-IFWLOG-mdv.orig 2012-05-21 01:29:13.000000000 +0300 ++++ linux/include/linux/netfilter_ipv4/Kbuild 2012-05-26 01:27:24.743139430 +0300 +@@ -2,6 +2,7 @@ header-y += ip_queue.h + header-y += ip_tables.h + header-y += ipt_CLUSTERIP.h + header-y += ipt_ECN.h ++header-y += ipt_IFWLOG.h + header-y += ipt_LOG.h + header-y += ipt_REJECT.h + header-y += ipt_TTL.h diff --git a/3.3.8/net-netfilter-IFWLOG.patch b/3.3.8/net-netfilter-IFWLOG.patch new file mode 100644 index 0000000..6efe89a --- /dev/null +++ b/3.3.8/net-netfilter-IFWLOG.patch @@ -0,0 +1,269 @@ +--- + include/linux/netfilter_ipv4/ipt_IFWLOG.h | 26 +++ + net/ipv4/netfilter/Kconfig | 11 + + net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_IFWLOG.c | 200 ++++++++++++++++++++++++++++++ + 4 files changed, 238 insertions(+) + +--- /dev/null ++++ b/net/ipv4/netfilter/ipt_IFWLOG.c +@@ -0,0 +1,200 @@ ++/* Interactive Firewall for Mandriva ++ * Samir Bellabes ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Samir Bellabes "); ++MODULE_DESCRIPTION("Interactive firewall logging and module"); ++ ++#if 0 ++#define DEBUGP PRINTR ++#else ++#define DEBUGP(format, args...) ++#endif ++ ++#define PRINTR(format, args...) 
do { if(net_ratelimit()) printk(format, ##args); } while(0) ++ ++static struct sock *nl; ++ ++#define GROUP 10 ++ ++/* send struct to userspace */ ++static void send_packet(struct nl_msg msg) ++{ ++ struct sk_buff *skb = NULL; ++ struct nlmsghdr *nlh; ++ ++ skb = alloc_skb(NLMSG_SPACE(sizeof(struct nl_msg)), GFP_ATOMIC); ++ if (!skb) { ++ PRINTR(KERN_WARNING "IFWLOG: OOM can't allocate skb\n"); ++ return ; ++ } ++ ++ nlh = NLMSG_PUT(skb, 0, 0, 0, sizeof(struct nl_msg) - sizeof(*nlh)); ++ ++ memcpy(NLMSG_DATA(nlh), (const void*)&msg, sizeof(struct nl_msg)); ++ ++ NETLINK_CB(skb).pid = 0; /* from kernel */ ++ NETLINK_CB(skb).dst_pid = 0; /* multicast */ ++ NETLINK_CB(skb).dst_group = 10; ++ ++ if (nl) { ++ DEBUGP(KERN_WARNING ++ "IFWLOG: nlmsg_len=%ld\nnlmsg_type=%d nlmsg_flags=%d\nnlmsg_seq=%ld nlmsg_pid = %ld\n", ++ (long)nlh->nlmsg_len, nlh->nlmsg_type, nlh->nlmsg_flags, ++ (long)nlh->nlmsg_seq, (long)nlh->nlmsg_pid); ++ DEBUGP(KERN_WARNING "prefix : %s\n", msg.prefix); ++ ++ netlink_broadcast(nl, skb, 0, 10, GFP_ATOMIC); ++ return ; ++ } ++ ++ nlmsg_failure: ++ if (skb) ++ kfree_skb(skb); ++ PRINTR(KERN_WARNING "IFWLOG: Error sending netlink packet\n"); ++ return ; ++} ++ ++/* fill struct for userspace */ ++static void ipt_IFWLOG_packet(const struct sk_buff *skb, ++ const struct net_device *in, ++ const struct net_device *out, ++ const struct ipt_IFWLOG_info *info) ++{ ++ struct iphdr iph; ++ struct tcphdr tcph; ++ struct udphdr udph; ++ struct nl_msg msg; ++ struct iphdr _iph, *ih; ++ struct timeval tv; ++ ++ memset(&msg, 0, sizeof(struct nl_msg)); ++ ++ ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); ++ if (ih == NULL) { ++ PRINTR(KERN_WARNING "IFWLOG: skb truncated"); ++ return; ++ } ++ ++ /* save interface name */ ++ if (in) ++ strcpy(msg.indev_name, in->name); ++ if (out) ++ strcpy(msg.outdev_name, out->name); ++ ++ /* save log-prefix */ ++ strcpy(msg.prefix, info->prefix); ++ ++ /* save ip header */ ++ skb_copy_bits(skb, 0, &iph, sizeof(iph)); ++ memcpy(&msg.ip, &iph, sizeof(struct iphdr)); ++ ++ /* save transport header */ ++ switch (iph.protocol){ ++ case IPPROTO_TCP: ++ skb_copy_bits(skb, iph.ihl*4 , &tcph, sizeof(tcph)); ++ memcpy(&msg.h.th, &tcph, sizeof(struct tcphdr)); ++ break; ++ case IPPROTO_UDP: ++ skb_copy_bits(skb, iph.ihl*4 , &udph, sizeof(udph)); ++ memcpy(&msg.h.uh, &udph, sizeof(struct udphdr)); ++ break; ++ default: ++ break; ++ } ++ ++ /* save timetamp */ ++ do_gettimeofday((struct timeval *)&tv); ++ msg.timestamp_sec = tv.tv_sec; ++ ++ send_packet(msg); ++} ++ ++static unsigned int ipt_IFWLOG_target(struct sk_buff **pskb, ++ const struct net_device *in, ++ const struct net_device *out, ++ unsigned int hooknum, ++ const void *targinfo, ++ void *userinfo) ++{ ++ const struct ipt_IFWLOG_info *info = targinfo; ++ ++ ipt_IFWLOG_packet(*pskb, in, out, info); ++ ++ return IPT_CONTINUE; ++} ++ ++static int ipt_IFWLOG_checkentry(const char *tablename, ++ const struct ipt_entry *e, ++ void *targinfo, ++ unsigned int targinfosize, ++ unsigned int hook_mask) ++{ ++ const struct ipt_IFWLOG_info *info = targinfo; ++ ++ if (info->prefix[sizeof(info->prefix)-1] != '\0') { ++ DEBUGP("IFWLOG: prefix term %i\n", ++ info->prefix[sizeof(info->prefix)-1]); ++ return 0; ++ } ++ ++ return 1; ++} ++ ++static struct ipt_target ipt_IFWLOG = { ++ .name = "IFWLOG", ++ .target = ipt_IFWLOG_target, ++ .targetsize = sizeof(struct ipt_IFWLOG_info), ++ .checkentry = ipt_IFWLOG_checkentry, ++ .me = THIS_MODULE, ++}; ++ ++static int __init init(void) ++{ ++ nl = (struct sock*) 
netlink_kernel_create(NETLINK_IFWLOG, GROUP, NULL, THIS_MODULE); ++ if (!nl) { ++ PRINTR(KERN_WARNING "IFWLOG: cannot create netlink socket\n"); ++ return -EINVAL; ++ } ++ ++ if (ipt_register_target(&ipt_IFWLOG)) { ++ if (nl && nl->sk_socket) ++ sock_release(nl->sk_socket); ++ return -EINVAL; ++ } ++ ++ PRINTR(KERN_INFO "IFWLOG: register target\n"); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ if (nl && nl->sk_socket) ++ sock_release(nl->sk_socket); ++ PRINTR(KERN_INFO "IFWLOG: unregister target\n"); ++ ipt_unregister_target(&ipt_IFWLOG); ++} ++ ++module_init(init); ++module_exit(fini); +--- a/net/ipv4/netfilter/Kconfig ++++ b/net/ipv4/netfilter/Kconfig +@@ -331,6 +331,17 @@ config IP_NF_TARGET_TTL + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_HL. + ++config IP_NF_TARGET_IFWLOG ++ tristate 'IFWLOG target support' ++ depends on IP_NF_IPTABLES ++ help ++ This option adds a `IFWLOG' target, which is used by ++ Interactive Firewall for sending informations to a userspace ++ daemon ++ ++ If you want to compile it as a module, say M here and read ++ Documentation/modules.txt. If unsure, say `N'. ++ + # raw + specific targets + config IP_NF_RAW + tristate 'raw table support (required for NOTRACK/TRACE)' +--- /dev/null ++++ b/include/linux/netfilter_ipv4/ipt_IFWLOG.h +@@ -0,0 +1,26 @@ ++#ifndef _IPT_IFWLOG_H ++#define _IPT_IFWLOG_H ++ ++#ifndef NETLINK_IFWLOG ++#define NETLINK_IFWLOG 19 ++#endif ++ ++#define PREFSIZ 32 ++ ++struct nl_msg { /* Netlink message */ ++ long timestamp_sec; /* time packet */ ++ char indev_name[IFNAMSIZ]; /* name of the ingoing interface */ ++ char outdev_name[IFNAMSIZ]; /* name of the outgoing interface */ ++ unsigned char prefix[PREFSIZ]; /* informations on the logging reason */ ++ struct iphdr ip; ++ union { ++ struct tcphdr th; ++ struct udphdr uh; ++ } h; ++}; ++ ++struct ipt_IFWLOG_info { ++ char prefix[PREFSIZ]; ++}; ++ ++#endif /* _IPT_IFWLOG_H */ +--- linux/net/ipv4/netfilter/Makefile.net-netfilter-IFWLOG.orig 2012-05-21 01:29:13.000000000 +0300 ++++ linux/net/ipv4/netfilter/Makefile 2012-05-26 01:23:57.511514194 +0300 +@@ -53,6 +53,7 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ip + + # targets + obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o ++obj-$(CONFIG_IP_NF_TARGET_IFWLOG) += ipt_IFWLOG.o + obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o + obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o + obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o diff --git a/3.3.8/net-netfilter-psd-2.6.35-buildfix.patch b/3.3.8/net-netfilter-psd-2.6.35-buildfix.patch new file mode 100644 index 0000000..218031c --- /dev/null +++ b/3.3.8/net-netfilter-psd-2.6.35-buildfix.patch @@ -0,0 +1,11 @@ +--- linux-2.6.35-rc6-git-mnb0.1/net/ipv4/netfilter/ipt_psd.c.orig 2010-07-30 21:17:30.000000000 +0300 ++++ linux-2.6.35-rc6-git-mnb0.1/net/ipv4/netfilter/ipt_psd.c 2010-07-31 13:29:00.623601957 +0300 +@@ -98,7 +98,7 @@ static inline int hashfunc(struct in_add + + static bool + ipt_psd_match(const struct sk_buff *pskb, +- const struct xt_match_param *match_param) ++ struct xt_action_param *match_param) + { + struct iphdr *ip_hdr; + struct tcphdr *tcp_hdr; diff --git a/3.3.8/net-netfilter-psd-mdv.patch b/3.3.8/net-netfilter-psd-mdv.patch new file mode 100644 index 0000000..68884aa --- /dev/null +++ b/3.3.8/net-netfilter-psd-mdv.patch @@ -0,0 +1,235 @@ +ipt_psd: Mandriva changes + +This patch holds all the Mandriva changes done in ipt_psd +netfilter module. + +Most of the time they're just upgrades to match with new +API in the kernel. 
+ +This work is mostly done by Thomas Backlund, Herton R. +Krzesinski and Luiz Fernando N. Capitulino. + +Signed-off-by: Luiz Fernando N. Capitulino +Signed-off-by: Herton Ronaldo Krzesinski + +--- + include/linux/netfilter_ipv4/Kbuild | 1 + net/ipv4/netfilter/Kconfig | 8 ++ + net/ipv4/netfilter/ipt_psd.c | 113 ++++++++++++++---------------------- + 3 files changed, 55 insertions(+), 67 deletions(-) + +diff -p -up linux-2.6.28/net/ipv4/netfilter/ipt_psd.c.orig linux-2.6.28/net/ipv4/netfilter/ipt_psd.c +--- linux-2.6.28/net/ipv4/netfilter/ipt_psd.c.orig 2008-12-12 11:03:05.000000000 -0500 ++++ linux-2.6.28/net/ipv4/netfilter/ipt_psd.c 2008-12-12 11:04:03.000000000 -0500 +@@ -1,21 +1,24 @@ + /* +- This is a module which is used for PSD (portscan detection) +- Derived from scanlogd v2.1 written by Solar Designer +- and LOG target module. +- +- Copyright (C) 2000,2001 astaro AG +- +- This file is distributed under the terms of the GNU General Public +- License (GPL). Copies of the GPL can be obtained from: +- ftp://prep.ai.mit.edu/pub/gnu/GPL +- +- 2000-05-04 Markus Hennig : initial +- 2000-08-18 Dennis Koslowski : first release +- 2000-12-01 Dennis Koslowski : UDP scans detection added +- 2001-01-02 Dennis Koslowski : output modified +- 2001-02-04 Jan Rekorajski : converted from target to match +- 2004-05-05 Martijn Lievaart : ported to 2.6 +-*/ ++ * This is a module which is used for PSD (portscan detection) ++ * Derived from scanlogd v2.1 written by Solar Designer ++ * and LOG target module. ++ * ++ * Copyright (C) 2000,2001 astaro AG ++ * ++ * This file is distributed under the terms of the GNU General Public ++ * License (GPL). Copies of the GPL can be obtained from: ++ * ftp://prep.ai.mit.edu/pub/gnu/GPL ++ * ++ * 2000-05-04 Markus Hennig : initial ++ * 2000-08-18 Dennis Koslowski : first release ++ * 2000-12-01 Dennis Koslowski : UDP scans detection added ++ * 2001-01-02 Dennis Koslowski : output modified ++ * 2001-02-04 Jan Rekorajski : converted from target to match ++ * 2004-05-05 Martijn Lievaart : ported to 2.6 ++ * 2007-10-10 Thomas Backlund : 2.6.22 update ++ * 2007-11-14 Luiz Capitulino : 2.6.22 API usage fixes ++ * 2007-11-26 Herton Ronaldo Krzesinski : switch xt_match->match to bool ++ */ + + #include + #include +@@ -54,7 +57,7 @@ struct port { + */ + struct host { + struct host *next; /* Next entry with the same hash */ +- clock_t timestamp; /* Last update time */ ++ unsigned long timestamp; /* Last update time */ + struct in_addr src_addr; /* Source address */ + struct in_addr dest_addr; /* Destination address */ + unsigned short src_port; /* Source port */ +@@ -93,33 +96,29 @@ static inline int hashfunc(struct in_add + return hash & (HASH_SIZE - 1); + } + +-static int ++static bool + ipt_psd_match(const struct sk_buff *pskb, +- const struct net_device *in, +- const struct net_device *out, +- const void *matchinfo, +- int offset, +- int *hotdrop) ++ const struct xt_match_param *match_param) + { + struct iphdr *ip_hdr; + struct tcphdr *tcp_hdr; + struct in_addr addr; + u_int16_t src_port,dest_port; + u_int8_t tcp_flags, proto; +- clock_t now; ++ unsigned long now; + struct host *curr, *last, **head; + int hash, index, count; + + /* Parameters from userspace */ +- const struct ipt_psd_info *psdinfo = matchinfo; ++ const struct ipt_psd_info *psdinfo = match_param->matchinfo; + + /* IP header */ +- ip_hdr = pskb->nh.iph; ++ ip_hdr = ipip_hdr(pskb); + + /* Sanity check */ + if (ntohs(ip_hdr->frag_off) & IP_OFFSET) { + DEBUGP("PSD: sanity check failed\n"); +- return 0; ++ return 
false; + } + + /* TCP or UDP ? */ +@@ -127,7 +126,7 @@ ipt_psd_match(const struct sk_buff *pskb + + if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { + DEBUGP("PSD: protocol not supported\n"); +- return 0; ++ return false; + } + + /* Get the source address, source & destination ports, and TCP flags */ +@@ -151,7 +150,7 @@ ipt_psd_match(const struct sk_buff *pskb + * them spoof us. [DHCP needs this feature - HW] */ + if (!addr.s_addr) { + DEBUGP("PSD: spoofed source address (0.0.0.0)\n"); +- return 0; ++ return false; + } + + /* Use jiffies here not to depend on someone setting the time while we're +@@ -298,46 +297,26 @@ ipt_psd_match(const struct sk_buff *pskb + + out_no_match: + spin_unlock(&state.lock); +- return 0; ++ return false; + + out_match: + spin_unlock(&state.lock); +- return 1; ++ DEBUGP("PSD: Dropping packets from "NIPQUAD_FMT" \n", ++ NIPQUAD(curr->src_addr.s_addr)); ++ return true; + } + +-static int ipt_psd_checkentry(const char *tablename, +- const struct ipt_ip *e, +- void *matchinfo, +- unsigned int matchsize, +- unsigned int hook_mask) +-{ +-/* const struct ipt_psd_info *psdinfo = targinfo;*/ +- +- /* we accept TCP only */ +-/* if (e->ip.proto != IPPROTO_TCP) { */ +-/* DEBUGP("PSD: specified protocol may be TCP only\n"); */ +-/* return 0; */ +-/* } */ +- +- if (matchsize != IPT_ALIGN(sizeof(struct ipt_psd_info))) { +- DEBUGP("PSD: matchsize %u != %u\n", +- matchsize, +- IPT_ALIGN(sizeof(struct ipt_psd_info))); +- return 0; +- } +- +- return 1; +-} +- +-static struct ipt_match ipt_psd_reg = { +- .name = "psd", +- .match = ipt_psd_match, +- .checkentry = ipt_psd_checkentry, +- .me = THIS_MODULE }; ++static struct xt_match ipt_psd_reg = { ++ .name = "psd", ++ .family = AF_INET, ++ .match = ipt_psd_match, ++ .matchsize = sizeof(struct ipt_psd_info), ++ .me = THIS_MODULE ++}; + +-static int __init init(void) ++static int __init ipt_psd_init(void) + { +- if (ipt_register_match(&ipt_psd_reg)) ++ if (xt_register_match(&ipt_psd_reg)) + return -EINVAL; + + memset(&state, 0, sizeof(state)); +@@ -348,11 +327,11 @@ static int __init init(void) + return 0; + } + +-static void __exit fini(void) ++static void __exit ipt_psd_fini(void) + { +- ipt_unregister_match(&ipt_psd_reg); ++ xt_unregister_match(&ipt_psd_reg); + printk("netfilter PSD unloaded - (c) astaro AG\n"); + } + +-module_init(init); +-module_exit(fini); ++module_init(ipt_psd_init); ++module_exit(ipt_psd_fini); +--- a/net/ipv4/netfilter/Kconfig ++++ b/net/ipv4/netfilter/Kconfig +@@ -322,6 +322,14 @@ + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_HL. + ++config IP_NF_MATCH_PSD ++ tristate 'Port scanner detection support' ++ depends on NETFILTER_ADVANCED ++ help ++ Module used for PSD (portscan detection). ++ ++ To compile it as a module, choose M here. If unsure, say N. 
++ + config IP_NF_TARGET_IFWLOG + tristate 'IFWLOG target support' + depends on IP_NF_IPTABLES +--- linux/include/linux/netfilter_ipv4/Kbuild.net-netfilter-psd-mdv.orig 2012-05-26 01:28:56.000000000 +0300 ++++ linux/include/linux/netfilter_ipv4/Kbuild 2012-05-26 01:30:21.493540796 +0300 +@@ -11,6 +11,7 @@ + header-y += ipt_addrtype.h + header-y += ipt_ah.h + header-y += ipt_ecn.h ++header-y += ipt_psd.h + header-y += ipt_realm.h + header-y += ipt_ttl.h + header-y += nf_nat.h diff --git a/3.3.8/net-netfilter-psd.patch b/3.3.8/net-netfilter-psd.patch new file mode 100644 index 0000000..8ec326f --- /dev/null +++ b/3.3.8/net-netfilter-psd.patch @@ -0,0 +1,420 @@ +--- + include/linux/netfilter_ipv4/ipt_psd.h | 40 +++ + net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_psd.c | 358 +++++++++++++++++++++++++++++++++ + 3 files changed, 399 insertions(+) + +--- /dev/null ++++ b/net/ipv4/netfilter/ipt_psd.c +@@ -0,0 +1,358 @@ ++/* ++ This is a module which is used for PSD (portscan detection) ++ Derived from scanlogd v2.1 written by Solar Designer ++ and LOG target module. ++ ++ Copyright (C) 2000,2001 astaro AG ++ ++ This file is distributed under the terms of the GNU General Public ++ License (GPL). Copies of the GPL can be obtained from: ++ ftp://prep.ai.mit.edu/pub/gnu/GPL ++ ++ 2000-05-04 Markus Hennig : initial ++ 2000-08-18 Dennis Koslowski : first release ++ 2000-12-01 Dennis Koslowski : UDP scans detection added ++ 2001-01-02 Dennis Koslowski : output modified ++ 2001-02-04 Jan Rekorajski : converted from target to match ++ 2004-05-05 Martijn Lievaart : ported to 2.6 ++*/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#if 0 ++#define DEBUGP printk ++#else ++#define DEBUGP(format, args...) ++#endif ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Dennis Koslowski "); ++ ++#define HF_DADDR_CHANGING 0x01 ++#define HF_SPORT_CHANGING 0x02 ++#define HF_TOS_CHANGING 0x04 ++#define HF_TTL_CHANGING 0x08 ++ ++/* ++ * Information we keep per each target port ++ */ ++struct port { ++ u_int16_t number; /* port number */ ++ u_int8_t proto; /* protocol number */ ++ u_int8_t and_flags; /* tcp ANDed flags */ ++ u_int8_t or_flags; /* tcp ORed flags */ ++}; ++ ++/* ++ * Information we keep per each source address. ++ */ ++struct host { ++ struct host *next; /* Next entry with the same hash */ ++ clock_t timestamp; /* Last update time */ ++ struct in_addr src_addr; /* Source address */ ++ struct in_addr dest_addr; /* Destination address */ ++ unsigned short src_port; /* Source port */ ++ int count; /* Number of ports in the list */ ++ int weight; /* Total weight of ports in the list */ ++ struct port ports[SCAN_MAX_COUNT - 1]; /* List of ports */ ++ unsigned char tos; /* TOS */ ++ unsigned char ttl; /* TTL */ ++ unsigned char flags; /* HF_ flags bitmask */ ++}; ++ ++/* ++ * State information. ++ */ ++static struct { ++ spinlock_t lock; ++ struct host list[LIST_SIZE]; /* List of source addresses */ ++ struct host *hash[HASH_SIZE]; /* Hash: pointers into the list */ ++ int index; /* Oldest entry to be replaced */ ++} state; ++ ++/* ++ * Convert an IP address into a hash table index. 
++ */ ++static inline int hashfunc(struct in_addr addr) ++{ ++ unsigned int value; ++ int hash; ++ ++ value = addr.s_addr; ++ hash = 0; ++ do { ++ hash ^= value; ++ } while ((value >>= HASH_LOG)); ++ ++ return hash & (HASH_SIZE - 1); ++} ++ ++static int ++ipt_psd_match(const struct sk_buff *pskb, ++ const struct net_device *in, ++ const struct net_device *out, ++ const void *matchinfo, ++ int offset, ++ int *hotdrop) ++{ ++ struct iphdr *ip_hdr; ++ struct tcphdr *tcp_hdr; ++ struct in_addr addr; ++ u_int16_t src_port,dest_port; ++ u_int8_t tcp_flags, proto; ++ clock_t now; ++ struct host *curr, *last, **head; ++ int hash, index, count; ++ ++ /* Parameters from userspace */ ++ const struct ipt_psd_info *psdinfo = matchinfo; ++ ++ /* IP header */ ++ ip_hdr = pskb->nh.iph; ++ ++ /* Sanity check */ ++ if (ntohs(ip_hdr->frag_off) & IP_OFFSET) { ++ DEBUGP("PSD: sanity check failed\n"); ++ return 0; ++ } ++ ++ /* TCP or UDP ? */ ++ proto = ip_hdr->protocol; ++ ++ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { ++ DEBUGP("PSD: protocol not supported\n"); ++ return 0; ++ } ++ ++ /* Get the source address, source & destination ports, and TCP flags */ ++ ++ addr.s_addr = ip_hdr->saddr; ++ ++ tcp_hdr = (struct tcphdr*)((u_int32_t *)ip_hdr + ip_hdr->ihl); ++ ++ /* Yep, it´s dirty */ ++ src_port = tcp_hdr->source; ++ dest_port = tcp_hdr->dest; ++ ++ if (proto == IPPROTO_TCP) { ++ tcp_flags = *((u_int8_t*)tcp_hdr + 13); ++ } ++ else { ++ tcp_flags = 0x00; ++ } ++ ++ /* We're using IP address 0.0.0.0 for a special purpose here, so don't let ++ * them spoof us. [DHCP needs this feature - HW] */ ++ if (!addr.s_addr) { ++ DEBUGP("PSD: spoofed source address (0.0.0.0)\n"); ++ return 0; ++ } ++ ++ /* Use jiffies here not to depend on someone setting the time while we're ++ * running; we need to be careful with possible return value overflows. */ ++ now = jiffies; ++ ++ spin_lock(&state.lock); ++ ++ /* Do we know this source address already? */ ++ count = 0; ++ last = NULL; ++ if ((curr = *(head = &state.hash[hash = hashfunc(addr)]))) ++ do { ++ if (curr->src_addr.s_addr == addr.s_addr) break; ++ count++; ++ if (curr->next) last = curr; ++ } while ((curr = curr->next)); ++ ++ if (curr) { ++ ++ /* We know this address, and the entry isn't too old. Update it. */ ++ if (now - curr->timestamp <= (psdinfo->delay_threshold*HZ)/100 && ++ time_after_eq(now, curr->timestamp)) { ++ ++ /* Just update the appropriate list entry if we've seen this port already */ ++ for (index = 0; index < curr->count; index++) { ++ if (curr->ports[index].number == dest_port) { ++ curr->ports[index].proto = proto; ++ curr->ports[index].and_flags &= tcp_flags; ++ curr->ports[index].or_flags |= tcp_flags; ++ goto out_no_match; ++ } ++ } ++ ++ /* TCP/ACK and/or TCP/RST to a new port? This could be an outgoing connection. */ ++ if (proto == IPPROTO_TCP && (tcp_hdr->ack || tcp_hdr->rst)) ++ goto out_no_match; ++ ++ /* Packet to a new port, and not TCP/ACK: update the timestamp */ ++ curr->timestamp = now; ++ ++ /* Logged this scan already? Then drop the packet. 
*/ ++ if (curr->weight >= psdinfo->weight_threshold) ++ goto out_match; ++ ++ /* Specify if destination address, source port, TOS or TTL are not fixed */ ++ if (curr->dest_addr.s_addr != ip_hdr->daddr) ++ curr->flags |= HF_DADDR_CHANGING; ++ if (curr->src_port != src_port) ++ curr->flags |= HF_SPORT_CHANGING; ++ if (curr->tos != ip_hdr->tos) ++ curr->flags |= HF_TOS_CHANGING; ++ if (curr->ttl != ip_hdr->ttl) ++ curr->flags |= HF_TTL_CHANGING; ++ ++ /* Update the total weight */ ++ curr->weight += (ntohs(dest_port) < 1024) ? ++ psdinfo->lo_ports_weight : psdinfo->hi_ports_weight; ++ ++ /* Got enough destination ports to decide that this is a scan? */ ++ /* Then log it and drop the packet. */ ++ if (curr->weight >= psdinfo->weight_threshold) ++ goto out_match; ++ ++ /* Remember the new port */ ++ if (curr->count < SCAN_MAX_COUNT) { ++ curr->ports[curr->count].number = dest_port; ++ curr->ports[curr->count].proto = proto; ++ curr->ports[curr->count].and_flags = tcp_flags; ++ curr->ports[curr->count].or_flags = tcp_flags; ++ curr->count++; ++ } ++ ++ goto out_no_match; ++ } ++ ++ /* We know this address, but the entry is outdated. Mark it unused, and ++ * remove from the hash table. We'll allocate a new entry instead since ++ * this one might get re-used too soon. */ ++ curr->src_addr.s_addr = 0; ++ if (last) ++ last->next = last->next->next; ++ else if (*head) ++ *head = (*head)->next; ++ last = NULL; ++ } ++ ++ /* We don't need an ACK from a new source address */ ++ if (proto == IPPROTO_TCP && tcp_hdr->ack) ++ goto out_no_match; ++ ++ /* Got too many source addresses with the same hash value? Then remove the ++ * oldest one from the hash table, so that they can't take too much of our ++ * CPU time even with carefully chosen spoofed IP addresses. */ ++ if (count >= HASH_MAX && last) last->next = NULL; ++ ++ /* We're going to re-use the oldest list entry, so remove it from the hash ++ * table first (if it is really already in use, and isn't removed from the ++ * hash table already because of the HASH_MAX check above). */ ++ ++ /* First, find it */ ++ if (state.list[state.index].src_addr.s_addr) ++ head = &state.hash[hashfunc(state.list[state.index].src_addr)]; ++ else ++ head = &last; ++ last = NULL; ++ if ((curr = *head)) ++ do { ++ if (curr == &state.list[state.index]) break; ++ last = curr; ++ } while ((curr = curr->next)); ++ ++ /* Then, remove it */ ++ if (curr) { ++ if (last) ++ last->next = last->next->next; ++ else if (*head) ++ *head = (*head)->next; ++ } ++ ++ /* Get our list entry */ ++ curr = &state.list[state.index++]; ++ if (state.index >= LIST_SIZE) state.index = 0; ++ ++ /* Link it into the hash table */ ++ head = &state.hash[hash]; ++ curr->next = *head; ++ *head = curr; ++ ++ /* And fill in the fields */ ++ curr->timestamp = now; ++ curr->src_addr = addr; ++ curr->dest_addr.s_addr = ip_hdr->daddr; ++ curr->src_port = src_port; ++ curr->count = 1; ++ curr->weight = (ntohs(dest_port) < 1024) ? 
++ psdinfo->lo_ports_weight : psdinfo->hi_ports_weight; ++ curr->ports[0].number = dest_port; ++ curr->ports[0].proto = proto; ++ curr->ports[0].and_flags = tcp_flags; ++ curr->ports[0].or_flags = tcp_flags; ++ curr->tos = ip_hdr->tos; ++ curr->ttl = ip_hdr->ttl; ++ ++out_no_match: ++ spin_unlock(&state.lock); ++ return 0; ++ ++out_match: ++ spin_unlock(&state.lock); ++ return 1; ++} ++ ++static int ipt_psd_checkentry(const char *tablename, ++ const struct ipt_ip *e, ++ void *matchinfo, ++ unsigned int matchsize, ++ unsigned int hook_mask) ++{ ++/* const struct ipt_psd_info *psdinfo = targinfo;*/ ++ ++ /* we accept TCP only */ ++/* if (e->ip.proto != IPPROTO_TCP) { */ ++/* DEBUGP("PSD: specified protocol may be TCP only\n"); */ ++/* return 0; */ ++/* } */ ++ ++ if (matchsize != IPT_ALIGN(sizeof(struct ipt_psd_info))) { ++ DEBUGP("PSD: matchsize %u != %u\n", ++ matchsize, ++ IPT_ALIGN(sizeof(struct ipt_psd_info))); ++ return 0; ++ } ++ ++ return 1; ++} ++ ++static struct ipt_match ipt_psd_reg = { ++ .name = "psd", ++ .match = ipt_psd_match, ++ .checkentry = ipt_psd_checkentry, ++ .me = THIS_MODULE }; ++ ++static int __init init(void) ++{ ++ if (ipt_register_match(&ipt_psd_reg)) ++ return -EINVAL; ++ ++ memset(&state, 0, sizeof(state)); ++ ++ spin_lock_init(&(state.lock)); ++ ++ printk("netfilter PSD loaded - (c) astaro AG\n"); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ ipt_unregister_match(&ipt_psd_reg); ++ printk("netfilter PSD unloaded - (c) astaro AG\n"); ++} ++ ++module_init(init); ++module_exit(fini); +--- /dev/null ++++ b/include/linux/netfilter_ipv4/ipt_psd.h +@@ -0,0 +1,40 @@ ++#ifndef _IPT_PSD_H ++#define _IPT_PSD_H ++ ++#include ++#include ++ ++/* ++ * High port numbers have a lower weight to reduce the frequency of false ++ * positives, such as from passive mode FTP transfers. ++ */ ++#define PORT_WEIGHT_PRIV 3 ++#define PORT_WEIGHT_HIGH 1 ++ ++/* ++ * Port scan detection thresholds: at least COUNT ports need to be scanned ++ * from the same source, with no longer than DELAY ticks between ports. ++ */ ++#define SCAN_MIN_COUNT 7 ++#define SCAN_MAX_COUNT (SCAN_MIN_COUNT * PORT_WEIGHT_PRIV) ++#define SCAN_WEIGHT_THRESHOLD SCAN_MAX_COUNT ++#define SCAN_DELAY_THRESHOLD (300) /* old usage of HZ here was erroneously and broke under uml */ ++ ++/* ++ * Keep track of up to LIST_SIZE source addresses, using a hash table of ++ * HASH_SIZE entries for faster lookups, but limiting hash collisions to ++ * HASH_MAX source addresses per the same hash value. 
++ */ ++#define LIST_SIZE 0x100 ++#define HASH_LOG 9 ++#define HASH_SIZE (1 << HASH_LOG) ++#define HASH_MAX 0x10 ++ ++struct ipt_psd_info { ++ unsigned int weight_threshold; ++ unsigned int delay_threshold; ++ unsigned short lo_ports_weight; ++ unsigned short hi_ports_weight; ++}; ++ ++#endif /*_IPT_PSD_H*/ +--- a/net/ipv4/netfilter/Makefile ++++ b/net/ipv4/netfilter/Makefile +@@ -49,6 +49,7 @@ + + # matches + obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o ++obj-$(CONFIG_IP_NF_MATCH_PSD) += ipt_psd.o + obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o + + # targets diff --git a/3.3.8/netfilter-implement-rfc-1123-for-ftp-conntrack.patch b/3.3.8/netfilter-implement-rfc-1123-for-ftp-conntrack.patch new file mode 100644 index 0000000..30cae8c --- /dev/null +++ b/3.3.8/netfilter-implement-rfc-1123-for-ftp-conntrack.patch @@ -0,0 +1,190 @@ +From: Jeff Mahoney +Subject: netfilter: Implement RFC 1123 for FTP conntrack +References: bnc#466279 bnc#681639 +Patch-mainline: Submitted via http://bugzilla.netfilter.org/show_bug.cgi?id=574 23 Jan 2011 + + The FTP conntrack code currently only accepts the following format for + the 227 response for PASV: + 227 Entering Passive Mode (148,100,81,40,31,161). + + It doesn't accept the following format from an obscure server: + 227 Data transfer will passively listen to 67,218,99,134,50,144 + + From RFC 1123: + The format of the 227 reply to a PASV command is not + well standardized. In particular, an FTP client cannot + assume that the parentheses shown on page 40 of RFC-959 + will be present (and in fact, Figure 3 on page 43 omits + them). Therefore, a User-FTP program that interprets + the PASV reply must scan the reply for the first digit + of the host and port numbers. + + This patch adds support for the RFC 1123 clarification by: + - Allowing a search filter to specify NUL as the terminator so that + try_number will return successfully if the array of numbers has been + filled when an unexpected character is encountered. + - Using space as the separator for the 227 reply and then scanning for + the first digit of the number sequence. The number sequence is parsed + out using the existing try_rfc959 but with a NUL terminator. 
+ + Tracked in: https://bugzilla.novell.com/show_bug.cgi?id=466279 + +Reported-by: Mark Post +Signed-off-by: Jeff Mahoney +--- + net/netfilter/nf_conntrack_ftp.c | 73 ++++++++++++++++++++++++++++----------- + 1 file changed, 54 insertions(+), 19 deletions(-) + +--- a/net/netfilter/nf_conntrack_ftp.c ++++ b/net/netfilter/nf_conntrack_ftp.c +@@ -53,10 +53,14 @@ unsigned int (*nf_nat_ftp_hook)(struct s + struct nf_conntrack_expect *exp); + EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); + +-static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char); +-static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char); ++static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, ++ char, unsigned int *); ++static int try_rfc1123(const char *, size_t, struct nf_conntrack_man *, ++ char, unsigned int *); ++static int try_eprt(const char *, size_t, struct nf_conntrack_man *, ++ char, unsigned int *); + static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, +- char); ++ char, unsigned int *); + + static struct ftp_search { + const char *pattern; +@@ -64,7 +68,7 @@ static struct ftp_search { + char skip; + char term; + enum nf_ct_ftp_type ftptype; +- int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); ++ int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *); + } search[IP_CT_DIR_MAX][2] = { + [IP_CT_DIR_ORIGINAL] = { + { +@@ -88,10 +92,8 @@ static struct ftp_search { + { + .pattern = "227 ", + .plen = sizeof("227 ") - 1, +- .skip = '(', +- .term = ')', + .ftptype = NF_CT_FTP_PASV, +- .getnum = try_rfc959, ++ .getnum = try_rfc1123, + }, + { + .pattern = "229 ", +@@ -130,8 +132,9 @@ static int try_number(const char *data, + i++; + else { + /* Unexpected character; true if it's the +- terminator and we're finished. */ +- if (*data == term && i == array_size - 1) ++ terminator (or we don't care about one) ++ and we're finished. */ ++ if ((*data == term || !term) && i == array_size - 1) + return len; + + pr_debug("Char %u (got %u nums) `%u' unexpected\n", +@@ -146,7 +149,8 @@ static int try_number(const char *data, + + /* Returns 0, or length of numbers: 192,168,1,1,5,6 */ + static int try_rfc959(const char *data, size_t dlen, +- struct nf_conntrack_man *cmd, char term) ++ struct nf_conntrack_man *cmd, char term, ++ unsigned int *offset) + { + int length; + u_int32_t array[6]; +@@ -161,6 +165,33 @@ static int try_rfc959(const char *data, + return length; + } + ++/* ++ * From RFC 1123: ++ * The format of the 227 reply to a PASV command is not ++ * well standardized. In particular, an FTP client cannot ++ * assume that the parentheses shown on page 40 of RFC-959 ++ * will be present (and in fact, Figure 3 on page 43 omits ++ * them). Therefore, a User-FTP program that interprets ++ * the PASV reply must scan the reply for the first digit ++ * of the host and port numbers. 
++ */ ++static int try_rfc1123(const char *data, size_t dlen, ++ struct nf_conntrack_man *cmd, char term, ++ unsigned int *offset) ++{ ++ int i; ++ for (i = 0; i < dlen; i++) ++ if (isdigit(data[i])) ++ break; ++ ++ if (i == dlen) ++ return 0; ++ ++ *offset += i; ++ ++ return try_rfc959(data + i, dlen - i, cmd, 0, offset); ++} ++ + /* Grab port: number up to delimiter */ + static int get_port(const char *data, int start, size_t dlen, char delim, + __be16 *port) +@@ -189,7 +220,7 @@ static int get_port(const char *data, in + + /* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */ + static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, +- char term) ++ char term, unsigned int *offset) + { + char delim; + int length; +@@ -237,7 +268,8 @@ static int try_eprt(const char *data, si + + /* Returns 0, or length of numbers: |||6446| */ + static int try_epsv_response(const char *data, size_t dlen, +- struct nf_conntrack_man *cmd, char term) ++ struct nf_conntrack_man *cmd, char term, ++ unsigned int *offset) + { + char delim; + +@@ -259,9 +291,10 @@ static int find_pattern(const char *data + unsigned int *numlen, + struct nf_conntrack_man *cmd, + int (*getnum)(const char *, size_t, +- struct nf_conntrack_man *, char)) ++ struct nf_conntrack_man *, char, ++ unsigned int *)) + { +- size_t i; ++ size_t i = plen; + + pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); + if (dlen == 0) +@@ -291,16 +324,18 @@ static int find_pattern(const char *data + pr_debug("Pattern matches!\n"); + /* Now we've found the constant string, try to skip + to the 'skip' character */ +- for (i = plen; data[i] != skip; i++) +- if (i == dlen - 1) return -1; ++ if (skip) { ++ for (i = plen; data[i] != skip; i++) ++ if (i == dlen - 1) return -1; + +- /* Skip over the last character */ +- i++; ++ /* Skip over the last character */ ++ i++; ++ } + + pr_debug("Skipped up to `%c'!\n", skip); + + *numoff = i; +- *numlen = getnum(data + i, dlen - i, cmd, term); ++ *numlen = getnum(data + i, dlen - i, cmd, term, numoff); + if (!*numlen) + return -1; + diff --git a/3.3.8/netfilter-ip_conntrack_slp.patch b/3.3.8/netfilter-ip_conntrack_slp.patch new file mode 100644 index 0000000..ff72d85 --- /dev/null +++ b/3.3.8/netfilter-ip_conntrack_slp.patch @@ -0,0 +1,185 @@ +From: Jiri Bohac +Subject: connection tracking helper for SLP +References: fate#301134 +Patch-mainline: Not yet + +A simple connection tracking helper for SLP. Marks replies to a +SLP broadcast query as ESTABLISHED to allow them to pass through the +firewall. + +Signed-off-by: Jiri Bohac + +--- + net/netfilter/Kconfig | 15 ++++ + net/netfilter/Makefile | 1 + net/netfilter/nf_conntrack_slp.c | 131 +++++++++++++++++++++++++++++++++++++++ + 3 files changed, 147 insertions(+) + +--- a/net/netfilter/Kconfig ++++ b/net/netfilter/Kconfig +@@ -290,6 +290,21 @@ config NF_CONNTRACK_TFTP + + To compile it as a module, choose M here. If unsure, say N. + ++config NF_CONNTRACK_SLP ++ tristate "SLP protocol support" ++ depends on NF_CONNTRACK ++ depends on NETFILTER_ADVANCED ++ help ++ SLP queries are sometimes sent as broadcast messages from an ++ unprivileged port and responded to with unicast messages to the ++ same port. This make them hard to firewall properly because connection ++ tracking doesn't deal with broadcasts. This helper tracks locally ++ originating broadcast SLP queries and the corresponding ++ responses. It relies on correct IP address configuration, specifically ++ netmask and broadcast address. 
++ ++ To compile it as a module, choose M here. If unsure, say N. ++ + config NF_CT_NETLINK + tristate 'Connection tracking netlink interface' + select NETFILTER_NETLINK +--- a/net/netfilter/Makefile ++++ b/net/netfilter/Makefile +@@ -36,6 +36,7 @@ obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_co + obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o + obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o + obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o ++obj-$(CONFIG_NF_CONNTRACK_SLP) += nf_conntrack_slp.o + + # transparent proxy support + obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o +--- /dev/null ++++ b/net/netfilter/nf_conntrack_slp.c +@@ -0,0 +1,131 @@ ++/* ++ * NetBIOS name service broadcast connection tracking helper ++ * ++ * (c) 2007 Jiri Bohac ++ * (c) 2005 Patrick McHardy ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++/* ++ * This helper tracks locally originating NetBIOS name service ++ * requests by issuing permanent expectations (valid until ++ * timing out) matching all reply connections from the ++ * destination network. The only NetBIOS specific thing is ++ * actually the port number. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#define SLP_PORT 427 ++ ++MODULE_AUTHOR("Jiri Bohac "); ++MODULE_DESCRIPTION("SLP broadcast connection tracking helper"); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS("ip_conntrack_slp"); ++ ++static unsigned int timeout __read_mostly = 3; ++module_param(timeout, uint, 0400); ++MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); ++ ++static int help(struct sk_buff *skb, unsigned int protoff, ++ struct nf_conn *ct, enum ip_conntrack_info ctinfo) ++{ ++ struct nf_conntrack_expect *exp; ++ struct rtable *rt = skb_rtable(skb); ++ struct in_device *in_dev; ++ __be32 mask = 0; ++ __be32 src = 0; ++ ++ /* we're only interested in locally generated packets */ ++ if (skb->sk == NULL) ++ goto out; ++ if (rt == NULL || !(rt->rt_flags & (RTCF_MULTICAST|RTCF_BROADCAST))) ++ goto out; ++ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) ++ goto out; ++ ++ rcu_read_lock(); ++ in_dev = __in_dev_get_rcu(rt->dst.dev); ++ if (in_dev != NULL) { ++ for_primary_ifa(in_dev) { ++ /* this is a hack as slp uses multicast we can't match ++ * the destination address to some broadcast address. So ++ * just take the first one. 
Better would be to install ++ * expectations for all addresses */ ++ mask = ifa->ifa_mask; ++ src = ifa->ifa_broadcast; ++ break; ++ } endfor_ifa(in_dev); ++ } ++ rcu_read_unlock(); ++ ++ if (mask == 0 || src == 0) ++ goto out; ++ ++ exp = nf_ct_expect_alloc(ct); ++ if (exp == NULL) ++ goto out; ++ ++ exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; ++ exp->tuple.src.u3.ip = src; ++ exp->tuple.src.u.udp.port = htons(SLP_PORT); ++ ++ exp->mask.src.u3.ip = mask; ++ exp->mask.src.u.udp.port = htons(0xFFFF); ++ ++ exp->expectfn = NULL; ++ exp->flags = NF_CT_EXPECT_PERMANENT; ++ exp->class = NF_CT_EXPECT_CLASS_DEFAULT; ++ exp->helper = NULL; ++ ++ nf_ct_expect_related(exp); ++ nf_ct_expect_put(exp); ++ ++ nf_ct_refresh(ct, skb, timeout * HZ); ++out: ++ return NF_ACCEPT; ++} ++ ++static struct nf_conntrack_expect_policy exp_policy = { ++ .max_expected = 1, ++}; ++ ++static struct nf_conntrack_helper helper __read_mostly = { ++ .name = "slp", ++ .tuple.src.l3num = AF_INET, ++ .tuple.src.u.udp.port = __constant_htons(SLP_PORT), ++ .tuple.dst.protonum = IPPROTO_UDP, ++ .me = THIS_MODULE, ++ .help = help, ++ .expect_policy = &exp_policy, ++}; ++ ++static int __init nf_conntrack_slp_init(void) ++{ ++ exp_policy.timeout = timeout; ++ return nf_conntrack_helper_register(&helper); ++} ++ ++static void __exit nf_conntrack_slp_fini(void) ++{ ++ nf_conntrack_helper_unregister(&helper); ++} ++ ++module_init(nf_conntrack_slp_init); ++module_exit(nf_conntrack_slp_fini); diff --git a/3.3.8/series b/3.3.8/series new file mode 100644 index 0000000..ec4f73f --- /dev/null +++ b/3.3.8/series @@ -0,0 +1,87 @@ +0001-block-cgroups-kconfig-build-bits-for-BFQ-v5-3.3.patch +0002-block-introduce-the-BFQ-v5-I-O-sched-for-3.3.patch + +3.3-ck1.patch + +620-sched_esfq.patch +621-sched_act_connmark.patch + +0001-AppArmor-compatibility-patch-for-v5-network-controll.patch +0002-AppArmor-compatibility-patch-for-v5-interface.patch +0003-AppArmor-Allow-dfa-backward-compatibility-with-broke.patch + +cloneconfig.patch +kbuild-compress-kernel-modules-on-installation.patch +ata-prefer-ata-drivers-over-ide-drivers-when-both-are-built.patch +910-kobject_uevent.patch +911-kobject_add_broadcast_uevent.patch +colored-printk-3.3.8.patch + +linux-2.6-x86-tune-generic.patch +hz-432-kconfig-option.patch +hz-864-kconfig-option.patch + +Add_CONFIG_VFAT_FS_DUALNAMES_option.patch +linux-2.6-defaults-fat-utf8.patch +aufs-3.x-rcN.patch +accessfs-3.2-0.26.patch +wrapfs-v3.3-rc1-429-g65388bc.patch + +imqmq-3.3.patch + +vserver-3.3.8-vs2.3.3.4.patch +uksm-0.1.2.1-for-v3.3.ge.8.patch + +600-netfilter_layer7_2.22.patch +601-netfilter_layer7_pktmatch.patch +602-netfilter_layer7_match.patch +603-netfilter_layer7_2.6.36_fix.patch +604-netfilter_cisco_794x_iphone.patch +610-netfilter_match_bypass_default_checks.patch +611-netfilter_match_bypass_default_table.patch +612-netfilter_match_reduce_memory_access.patch +613-netfilter_optional_tcp_window_check.patch +net-netfilter-IFWLOG.patch +net-netfilter-IFWLOG-mdv.patch +net-netfilter-IFWLOG-2.6.35-buildfix.patch +net-netfilter-IFWLOG-2.6.37-buildfix.patch +net-netfilter-psd.patch +net-netfilter-psd-mdv.patch +net-netfilter-psd-2.6.35-buildfix.patch +netfilter-implement-rfc-1123-for-ftp-conntrack.patch +netfilter-ip_conntrack_slp.patch + +v3.3-ARM-kirkwood-Add-support-for-Buffalo-LS-XHL.patch +v3.2-ARM-orion-Add-support-for-Buffalo-LS-PRODUO.patch +v3.3-ARM-orion-Add-support-for-Buffalo-LS-QL.patch +v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-VL.patch +v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-WVL.patch 
+v3.3-ARM-kirkwood-Add-support-for-Buffalo-LS-CHLv2.patch + + +3rd-3rdparty-1.0-tree.patch +3rd-3rdparty-merge.patch +3rd-3rdparty-netatop-0.1.1.patch +3rd-3rdparty-button_hotplug-0.4.1.patch +3rd-3rdparty-gpio_button_hotplug-0.1.patch +3rd-3rdparty-gpio_event_drv-0.1.patch + + +fs-ext4-fix-the-free-blocks-calculation-for-ext3-file-systems-w-uninit_bg.patch + +fs-udf-use-ret-instead-of-abusing-i-in-udf_load_logicalvol.patch +fs-udf-avoid-run-away-loop-when-partition-table-length-is-corrupted_CVE-2012-3400.patch +fs-udf-fortify-loading-of-sparing-table_CVE-2012-3400.patch + +fs-epoll-clear-the-tfile_check_list-on-ELOOP_CVE-2012-3375.patch + +fs-btrfs-run-delayed-directory-updates-during-log-replay.patch + +fs-ecryptfs-gracefully-refuse-miscdev-file-ops-on-inherited-passed-files.patch +fs-ecryptfs-fix-lockdep-warning-in-miscdev-operations.patch +fs-ecryptfs-properly-check-for-o_rdonly-flag-before-doing-privileged-open.patch + +fs-remove-easily-user-triggerable-bug-from-generic_setlease.patch + +fs-ext4-fix-duplicated-mnt_drop_write-call-in-ext4_ioc_move_ext.patch + diff --git a/3.3.8/uksm-0.1.2.1-for-v3.3.ge.8.patch b/3.3.8/uksm-0.1.2.1-for-v3.3.ge.8.patch new file mode 100644 index 0000000..700c72f --- /dev/null +++ b/3.3.8/uksm-0.1.2.1-for-v3.3.ge.8.patch @@ -0,0 +1,7023 @@ +diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX +index 5481c8b..7141876 100644 +--- a/Documentation/vm/00-INDEX ++++ b/Documentation/vm/00-INDEX +@@ -14,6 +14,8 @@ hwpoison.txt + - explains what hwpoison is + ksm.txt + - how to use the Kernel Samepage Merging feature. ++uksm.txt ++ - Introduction to Ultra KSM + locking + - info on how locking and synchronization is done in the Linux vm code. + map_hugetlb.c +diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt +new file mode 100644 +index 0000000..d4aaae8 +--- /dev/null ++++ b/Documentation/vm/uksm.txt +@@ -0,0 +1,56 @@ ++The Ultra Kernel Samepage Merging feature ++---------------------------------------------- ++/* ++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia ++ * ++ * This is an improvement upon KSM. Some basic data structures and routines ++ * are borrowed from ksm.c . ++ * ++ * Its new features: ++ * 1. Full system scan: ++ * It automatically scans all user processes' anonymous VMAs. Kernel-user ++ * interaction to submit a memory area to KSM is no longer needed. ++ * ++ * 2. Rich area detection: ++ * It automatically detects rich areas containing abundant duplicated ++ * pages based. Rich areas are given a full scan speed. Poor areas are ++ * sampled at a reasonable speed with very low CPU consumption. ++ * ++ * 3. Ultra Per-page scan speed improvement: ++ * A new hash algorithm is proposed. As a result, on a machine with ++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it ++ * can scan memory areas that does not contain duplicated pages at speed of ++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of ++ * 477MB/sec ~ 923MB/sec. ++ * ++ * 4. Thrashing area avoidance: ++ * Thrashing area(an VMA that has frequent Ksm page break-out) can be ++ * filtered out. My benchmark shows it's more efficient than KSM's per-page ++ * hash value based volatile page detection. ++ * ++ * ++ * 5. Misc changes upon KSM: ++ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page ++ * comparison. It's much faster than default C version on x86. 
++ * * rmap_item now has an struct *page member to loosely cache a ++ * address-->page mapping, which reduces too much time-costly ++ * follow_page(). ++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. ++ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ ++ * ksm is needed for this case. ++ * ++ * 6. Full Zero Page consideration(contributed by Figo Zhang) ++ * Now uksmd consider full zero pages as special pages and merge them to an ++ * special unswappable uksm zero page. ++ */ ++ ++ChangeLog: ++ ++2012-05-05 The creation of this Doc ++2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up. ++2012-05-28 UKSM 0.1.1.2 bug fix release ++2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2 ++2012-07-2 UKSM 0.1.2-beta2 ++2012-07-10 UKSM 0.1.2-beta3 ++2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization. ++2012-10-13 UKSM 0.1.2.1 Bug fixes. +diff --git a/fs/exec.c b/fs/exec.c +index ae42277..c1c65bc 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -19,7 +19,7 @@ + * current->executable is only used by the procfs. This allows a dispatch + * table to check for several different types of binary formats. We keep + * trying until we recognize the file or we run out of supported binary +- * formats. ++ * formats. + */ + + #include +@@ -55,6 +55,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -87,7 +88,7 @@ int __register_binfmt(struct linux_binfmt * fmt, int insert) + insert ? list_add(&fmt->lh, &formats) : + list_add_tail(&fmt->lh, &formats); + write_unlock(&binfmt_lock); +- return 0; ++ return 0; + } + + EXPORT_SYMBOL(__register_binfmt); +@@ -1174,7 +1175,7 @@ void setup_new_exec(struct linux_binprm * bprm) + group */ + + current->self_exec_id++; +- ++ + flush_signal_handlers(current, 0); + flush_old_files(current->files); + } +@@ -1269,8 +1270,8 @@ static int check_unsafe_exec(struct linux_binprm *bprm) + return res; + } + +-/* +- * Fill the binprm structure from the inode. ++/* ++ * Fill the binprm structure from the inode. + * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes + * + * This may be called multiple times for binary chains (scripts for example). 
+diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c +index 80e4645..33f9e9b 100644 +--- a/fs/proc/meminfo.c ++++ b/fs/proc/meminfo.c +@@ -87,6 +87,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) + "SUnreclaim: %8lu kB\n" + "KernelStack: %8lu kB\n" + "PageTables: %8lu kB\n" ++#ifdef CONFIG_UKSM ++ "KsmZeroPages: %8lu kB\n" ++#endif + #ifdef CONFIG_QUICKLIST + "Quicklists: %8lu kB\n" + #endif +@@ -146,6 +149,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) + K(global_page_state(NR_SLAB_UNRECLAIMABLE)), + global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, + K(global_page_state(NR_PAGETABLE)), ++#ifdef CONFIG_UKSM ++ K(global_page_state(NR_UKSM_ZERO_PAGES)), ++#endif + #ifdef CONFIG_QUICKLIST + K(quicklist_total_size()), + #endif +diff --git a/include/linux/ksm.h b/include/linux/ksm.h +index 3319a69..f4edf33 100644 +--- a/include/linux/ksm.h ++++ b/include/linux/ksm.h +@@ -22,21 +22,6 @@ struct page *ksm_does_need_to_copy(struct page *page, + #ifdef CONFIG_KSM + int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags); +-int __ksm_enter(struct mm_struct *mm); +-void __ksm_exit(struct mm_struct *mm); +- +-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +-{ +- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) +- return __ksm_enter(mm); +- return 0; +-} +- +-static inline void ksm_exit(struct mm_struct *mm) +-{ +- if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) +- __ksm_exit(mm); +-} + + /* + * A KSM page is one of those write-protected "shared pages" or "merged pages" +@@ -90,6 +75,33 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg); + void ksm_migrate_page(struct page *newpage, struct page *oldpage); + ++#ifdef CONFIG_KSM_LEGACY ++int __ksm_enter(struct mm_struct *mm); ++void __ksm_exit(struct mm_struct *mm); ++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++{ ++ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) ++ return __ksm_enter(mm); ++ return 0; ++} ++ ++static inline void ksm_exit(struct mm_struct *mm) ++{ ++ if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) ++ __ksm_exit(mm); ++} ++ ++#elif defined(CONFIG_UKSM) ++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++{ ++ return 0; ++} ++ ++static inline void ksm_exit(struct mm_struct *mm) ++{ ++} ++#endif /* !CONFIG_UKSM */ ++ + #else /* !CONFIG_KSM */ + + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +@@ -142,4 +154,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) + #endif /* CONFIG_MMU */ + #endif /* !CONFIG_KSM */ + ++#include ++ + #endif /* __LINUX_KSM_H */ +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 3cc3062..9d8642d7 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -252,6 +252,9 @@ struct vm_area_struct { + #ifdef CONFIG_NUMA + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ + #endif ++#ifdef CONFIG_UKSM ++ struct vma_slot *uksm_vma_slot; ++#endif + }; + + struct core_thread { +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 650ba2f..2d1475f 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -116,6 +116,9 @@ enum zone_stat_item { + NUMA_OTHER, /* allocation from other node */ + #endif + NR_ANON_TRANSPARENT_HUGEPAGES, ++#ifdef CONFIG_UKSM ++ NR_UKSM_ZERO_PAGES, ++#endif + NR_VM_ZONE_STAT_ITEMS }; + + /* +@@ -753,7 +756,7 @@ 
static inline int is_normal_idx(enum zone_type idx) + } + + /** +- * is_highmem - helper function to quickly check if a struct zone is a ++ * is_highmem - helper function to quickly check if a struct zone is a + * highmem zone or not. This is an attempt to keep references + * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. + * @zone - pointer to struct zone variable +diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h +new file mode 100644 +index 0000000..6780fdb +--- /dev/null ++++ b/include/linux/sradix-tree.h +@@ -0,0 +1,77 @@ ++#ifndef _LINUX_SRADIX_TREE_H ++#define _LINUX_SRADIX_TREE_H ++ ++ ++#define INIT_SRADIX_TREE(root, mask) \ ++do { \ ++ (root)->height = 0; \ ++ (root)->gfp_mask = (mask); \ ++ (root)->rnode = NULL; \ ++} while (0) ++ ++#define ULONG_BITS (sizeof(unsigned long) * 8) ++#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) ++//#define SRADIX_TREE_MAP_SHIFT 6 ++//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) ++//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) ++ ++struct sradix_tree_node { ++ unsigned int height; /* Height from the bottom */ ++ unsigned int count; ++ unsigned int fulls; /* Number of full sublevel trees */ ++ struct sradix_tree_node *parent; ++ void *stores[0]; ++}; ++ ++/* A simple radix tree implementation */ ++struct sradix_tree_root { ++ unsigned int height; ++ struct sradix_tree_node *rnode; ++ ++ /* Where found to have available empty stores in its sublevels */ ++ struct sradix_tree_node *enter_node; ++ unsigned int shift; ++ unsigned int stores_size; ++ unsigned int mask; ++ unsigned long min; /* The first hole index */ ++ unsigned long num; ++ //unsigned long *height_to_maxindex; ++ ++ /* How the node is allocated and freed. */ ++ struct sradix_tree_node *(*alloc)(void); ++ void (*free)(struct sradix_tree_node *node); ++ ++ /* When a new node is added and removed */ ++ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); ++ void (*assign)(struct sradix_tree_node *node, unsigned index, void *item); ++ void (*rm)(struct sradix_tree_node *node, unsigned offset); ++}; ++ ++struct sradix_tree_path { ++ struct sradix_tree_node *node; ++ int offset; ++}; ++ ++static inline ++void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) ++{ ++ root->height = 0; ++ root->rnode = NULL; ++ root->shift = shift; ++ root->stores_size = 1UL << shift; ++ root->mask = root->stores_size - 1; ++} ++ ++ ++extern void *sradix_tree_next(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index, ++ int (*iter)(void *, unsigned long)); ++ ++extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); ++ ++extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index); ++ ++extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); ++ ++#endif /* _LINUX_SRADIX_TREE_H */ +diff --git a/include/linux/uksm.h b/include/linux/uksm.h +new file mode 100644 +index 0000000..361eee2 +--- /dev/null ++++ b/include/linux/uksm.h +@@ -0,0 +1,145 @@ ++#ifndef __LINUX_UKSM_H ++#define __LINUX_UKSM_H ++/* ++ * Memory merging support. ++ * ++ * This code enables dynamic sharing of identical pages found in different ++ * memory areas, even if they are not shared by fork(). ++ */ ++ ++/* if !CONFIG_UKSM this file should not be compiled at all. 
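The sradix tree header above fixes the per-node fanout through shift/stores_size/mask, and every lookup or insert peels root->shift bits off the index per level (mm/uksm.c later instantiates it with a shift of 8 for its slot tree). A tiny self-contained sketch of that index decomposition, with made-up values; it only illustrates the offset arithmetic, not the kernel implementation:

/* Illustration only: splitting an index into per-level store offsets the
 * way sradix_tree_lookup() does with (index >> shift) & root->mask. */
#include <stdio.h>

int main(void)
{
	unsigned int shift = 8;				/* root->shift */
	unsigned long mask = (1UL << shift) - 1;	/* root->mask  */
	unsigned long index = 0x12345;			/* arbitrary slot index */
	int height = 3;					/* levels needed here */

	for (int level = height - 1; level >= 0; level--)
		printf("height %d: offset %lu\n", level + 1,
		       (index >> (level * shift)) & mask);
	return 0;
}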
*/ ++#ifdef CONFIG_UKSM ++ ++#include ++#include ++#include ++#include ++#include ++ ++extern unsigned long zero_pfn __read_mostly; ++extern unsigned long uksm_zero_pfn __read_mostly; ++extern struct page *empty_uksm_zero_page; ++ ++/* must be done before linked to mm */ ++extern void uksm_vma_add_new(struct vm_area_struct *vma); ++extern void uksm_remove_vma(struct vm_area_struct *vma); ++ ++#define UKSM_SLOT_NEED_SORT (1 << 0) ++#define UKSM_SLOT_NEED_RERAND (1 << 1) ++#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ ++#define UKSM_SLOT_FUL_SCANNED (1 << 3) ++#define UKSM_SLOT_IN_UKSM (1 << 4) ++ ++struct vma_slot { ++ struct sradix_tree_node *snode; ++ unsigned long sindex; ++ ++ struct list_head slot_list; ++ unsigned long fully_scanned_round; ++ unsigned long dedup_num; ++ unsigned long pages_scanned; ++ unsigned long last_scanned; ++ unsigned long pages_to_scan; ++ struct scan_rung *rung; ++ struct page **rmap_list_pool; ++ unsigned int *pool_counts; ++ unsigned long pool_size; ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ unsigned long ctime_j; ++ unsigned long pages; ++ unsigned long flags; ++ unsigned long pages_cowed; /* pages cowed this round */ ++ unsigned long pages_merged; /* pages merged this round */ ++ unsigned long pages_bemerged; ++ ++ /* when it has page merged in this eval round */ ++ struct list_head dedup_list; ++}; ++ ++static inline void uksm_unmap_zero_page(pte_t pte) ++{ ++ if (pte_pfn(pte) == uksm_zero_pfn) ++ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); ++} ++ ++static inline void uksm_map_zero_page(pte_t pte) ++{ ++ if (pte_pfn(pte) == uksm_zero_pfn) ++ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); ++} ++ ++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) ++{ ++ if (vma->uksm_vma_slot && PageKsm(page)) ++ vma->uksm_vma_slot->pages_cowed++; ++} ++ ++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) ++{ ++ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) ++ vma->uksm_vma_slot->pages_cowed++; ++} ++ ++static inline int uksm_flags_can_scan(unsigned long vm_flags) ++{ ++ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | ++ VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | ++ VM_NONLINEAR | VM_MIXEDMAP | VM_SAO | ++ VM_SHARED | VM_MAYSHARE | VM_GROWSUP ++ | VM_GROWSDOWN)); ++} ++ ++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) ++{ ++ if (uksm_flags_can_scan(*vm_flags_p)) ++ *vm_flags_p |= VM_MERGEABLE; ++} ++ ++/* ++ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will ++ * be removed when uksm zero page patch is stable enough. 
++ */ ++static inline void uksm_bugon_zeropage(pte_t pte) ++{ ++ BUG_ON(pte_pfn(pte) == uksm_zero_pfn); ++} ++#else ++static inline void uksm_vma_add_new(struct vm_area_struct *vma) ++{ ++} ++ ++static inline void uksm_remove_vma(struct vm_area_struct *vma) ++{ ++} ++ ++static inline void uksm_unmap_zero_page(pte_t pte) ++{ ++} ++ ++static inline void uksm_map_zero_page(pte_t pte) ++{ ++} ++ ++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) ++{ ++} ++ ++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) ++{ ++} ++ ++static inline int uksm_flags_can_scan(unsigned long vm_flags) ++{ ++ return 0; ++} ++ ++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) ++{ ++} ++ ++static inline void uksm_bugon_zeropage(pte_t pte) ++{ ++} ++#endif /* !CONFIG_UKSM */ ++#endif /* __LINUX_UKSM_H */ +diff --git a/kernel/fork.c b/kernel/fork.c +index 423d5a4..7281f89 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -360,7 +360,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) + goto fail_nomem; + charge = len; + } +- tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); ++ tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!tmp) + goto fail_nomem; + *tmp = *mpnt; +@@ -412,7 +412,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; +- ++ uksm_vma_add_new(tmp); + mm->map_count++; + retval = copy_page_range(mm, oldmm, mpnt); + +diff --git a/lib/Makefile b/lib/Makefile +index 18515f0..2df136b 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) + endif + + lib-y := ctype.o string.o vsprintf.o cmdline.o \ +- rbtree.o radix-tree.o dump_stack.o timerqueue.o\ ++ rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\ + idr.o int_sqrt.o extable.o prio_tree.o \ + sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ + proportions.o prio_heap.o ratelimit.o show_mem.o \ +diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c +new file mode 100644 +index 0000000..8d06329 +--- /dev/null ++++ b/lib/sradix-tree.c +@@ -0,0 +1,476 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) ++{ ++ return node->fulls == root->stores_size || ++ (node->height == 1 && node->count == root->stores_size); ++} ++ ++/* ++ * Extend a sradix tree so it can store key @index. ++ */ ++static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) ++{ ++ struct sradix_tree_node *node; ++ unsigned int height; ++ ++ if (unlikely(root->rnode == NULL)) { ++ if (!(node = root->alloc())) ++ return -ENOMEM; ++ ++ node->height = 1; ++ root->rnode = node; ++ root->height = 1; ++ } ++ ++ /* Figure out what the height should be. */ ++ height = root->height; ++ index >>= root->shift * height; ++ ++ while (index) { ++ index >>= root->shift; ++ height++; ++ } ++ ++ while (height > root->height) { ++ unsigned int newheight; ++ if (!(node = root->alloc())) ++ return -ENOMEM; ++ ++ /* Increase the height. 
*/ ++ node->stores[0] = root->rnode; ++ root->rnode->parent = node; ++ if (root->extend) ++ root->extend(node, root->rnode); ++ ++ newheight = root->height + 1; ++ node->height = newheight; ++ node->count = 1; ++ if (sradix_node_full(root, root->rnode)) ++ node->fulls = 1; ++ ++ root->rnode = node; ++ root->height = newheight; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Search the next item from the current node, that is not NULL ++ * and can satify root->iter(). ++ */ ++void *sradix_tree_next(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index, ++ int (*iter)(void *item, unsigned long height)) ++{ ++ unsigned long offset; ++ void *item; ++ ++ if (unlikely(node == NULL)) { ++ node = root->rnode; ++ for (offset = 0; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (unlikely(offset >= root->stores_size)) ++ return NULL; ++ ++ if (node->height == 1) ++ return item; ++ else ++ goto go_down; ++ } ++ ++ while (node) { ++ offset = (index & root->mask) + 1; ++ for (;offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (offset < root->stores_size) ++ break; ++ ++ node = node->parent; ++ index >>= root->shift; ++ } ++ ++ if (!node) ++ return NULL; ++ ++ while (node->height > 1) { ++go_down: ++ node = item; ++ for (offset = 0; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (unlikely(offset >= root->stores_size)) ++ return NULL; ++ } ++ ++ BUG_ON(offset > root->stores_size); ++ ++ return item; ++} ++ ++/* ++ * Blindly insert the item to the tree. Typically, we reuse the ++ * first empty store item. 
++ */ ++int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) ++{ ++ unsigned long index; ++ unsigned int height; ++ struct sradix_tree_node *node, *tmp = NULL; ++ int offset, offset_saved; ++ void **store = NULL; ++ int error, i, j, shift; ++ ++go_on: ++ index = root->min; ++ ++ if (root->enter_node && !sradix_node_full(root, root->enter_node)) { ++ node = root->enter_node; ++ BUG_ON((index >> (root->shift * root->height))); ++ } else { ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height)) ++ || sradix_node_full(root, node)) { ++ error = sradix_tree_extend(root, index); ++ if (error) ++ return error; ++ ++ node = root->rnode; ++ } ++ } ++ ++ ++ height = node->height; ++ shift = (height - 1) * root->shift; ++ offset = (index >> shift) & root->mask; ++ while (shift > 0) { ++ offset_saved = offset; ++ for (; offset < root->stores_size; offset++) { ++ store = &node->stores[offset]; ++ tmp = *store; ++ ++ if (!tmp || !sradix_node_full(root, tmp)) ++ break; ++ } ++ BUG_ON(offset >= root->stores_size); ++ ++ if (offset != offset_saved) { ++ index += (offset - offset_saved) << shift; ++ index &= ~((1UL << shift) - 1); ++ } ++ ++ if (!tmp) { ++ if (!(tmp = root->alloc())) ++ return -ENOMEM; ++ ++ tmp->height = shift / root->shift; ++ *store = tmp; ++ tmp->parent = node; ++ node->count++; ++// if (root->extend) ++// root->extend(node, tmp); ++ } ++ ++ node = tmp; ++ shift -= root->shift; ++ offset = (index >> shift) & root->mask; ++ } ++ ++ BUG_ON(node->height != 1); ++ ++ ++ store = &node->stores[offset]; ++ for (i = 0, j = 0; ++ j < root->stores_size - node->count && ++ i < root->stores_size - offset && j < num; i++) { ++ if (!store[i]) { ++ store[i] = item[j]; ++ if (root->assign) ++ root->assign(node, index + i, item[j]); ++ j++; ++ } ++ } ++ ++ node->count += j; ++ root->num += j; ++ num -= j; ++ ++ while (sradix_node_full(root, node)) { ++ node = node->parent; ++ if (!node) ++ break; ++ ++ node->fulls++; ++ } ++ ++ if (unlikely(!node)) { ++ /* All nodes are full */ ++ root->min = 1 << (root->height * root->shift); ++ root->enter_node = NULL; ++ } else { ++ root->min = index + i - 1; ++ root->min |= (1UL << (node->height - 1)) - 1; ++ root->min++; ++ root->enter_node = node; ++ } ++ ++ if (num) { ++ item += j; ++ goto go_on; ++ } ++ ++ return 0; ++} ++ ++ ++/** ++ * sradix_tree_shrink - shrink height of a sradix tree to minimal ++ * @root sradix tree root ++ * ++ */ ++static inline void sradix_tree_shrink(struct sradix_tree_root *root) ++{ ++ /* try to shrink tree height */ ++ while (root->height > 1) { ++ struct sradix_tree_node *to_free = root->rnode; ++ ++ /* ++ * The candidate node has more than one child, or its child ++ * is not at the leftmost store, we cannot shrink. 
++ */ ++ if (to_free->count != 1 || !to_free->stores[0]) ++ break; ++ ++ root->rnode = to_free->stores[0]; ++ root->rnode->parent = NULL; ++ root->height--; ++ if (unlikely(root->enter_node == to_free)) { ++ root->enter_node = NULL; ++ } ++ root->free(to_free); ++ } ++} ++ ++/* ++ * Del the item on the known leaf node and index ++ */ ++void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index) ++{ ++ unsigned int offset; ++ struct sradix_tree_node *start, *end; ++ ++ BUG_ON(node->height != 1); ++ ++ start = node; ++ while (node && !(--node->count)) ++ node = node->parent; ++ ++ end = node; ++ if (!node) { ++ root->rnode = NULL; ++ root->height = 0; ++ root->min = 0; ++ root->num = 0; ++ root->enter_node = NULL; ++ } else { ++ offset = (index >> (root->shift * (node->height - 1))) & root->mask; ++ if (root->rm) ++ root->rm(node, offset); ++ node->stores[offset] = NULL; ++ root->num--; ++ if (root->min > index) { ++ root->min = index; ++ root->enter_node = node; ++ } ++ } ++ ++ if (start != end) { ++ do { ++ node = start; ++ start = start->parent; ++ if (unlikely(root->enter_node == node)) ++ root->enter_node = end; ++ root->free(node); ++ } while (start != end); ++ ++ /* ++ * Note that shrink may free "end", so enter_node still need to ++ * be checked inside. ++ */ ++ sradix_tree_shrink(root); ++ } else if (node->count == root->stores_size - 1) { ++ /* It WAS a full leaf node. Update the ancestors */ ++ node = node->parent; ++ while (node) { ++ node->fulls--; ++ if (node->fulls != root->stores_size - 1) ++ break; ++ ++ node = node->parent; ++ } ++ } ++} ++ ++void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node; ++ int shift; ++ ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height))) ++ return NULL; ++ ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ node = node->stores[offset]; ++ if (!node) ++ return NULL; ++ ++ shift -= root->shift; ++ } while (shift >= 0); ++ ++ return node; ++} ++ ++/* ++ * Return the item if it exists, otherwise create it in place ++ * and return the created item. ++ */ ++void *sradix_tree_lookup_create(struct sradix_tree_root *root, ++ unsigned long index, void *(*item_alloc)(void)) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node, *tmp; ++ void *item; ++ int shift, error; ++ ++ if (root->rnode == NULL || (index >> (root->shift * root->height))) { ++ if (item_alloc) { ++ error = sradix_tree_extend(root, index); ++ if (error) ++ return NULL; ++ } else { ++ return NULL; ++ } ++ } ++ ++ node = root->rnode; ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ if (!node->stores[offset]) { ++ if (!(tmp = root->alloc())) ++ return NULL; ++ ++ tmp->height = shift / root->shift; ++ node->stores[offset] = tmp; ++ tmp->parent = node; ++ node->count++; ++ node = tmp; ++ } else { ++ node = node->stores[offset]; ++ } ++ ++ shift -= root->shift; ++ } while (shift > 0); ++ ++ BUG_ON(node->height != 1); ++ offset = index & root->mask; ++ if (node->stores[offset]) { ++ return node->stores[offset]; ++ } else if (item_alloc) { ++ if (!(item = item_alloc())) ++ return NULL; ++ ++ node->stores[offset] = item; ++ ++ /* ++ * NOTE: we do NOT call root->assign here, since this item is ++ * newly created by us having no meaning. 
Caller can call this ++ * if it's necessary to do so. ++ */ ++ ++ node->count++; ++ root->num++; ++ ++ while (sradix_node_full(root, node)) { ++ node = node->parent; ++ if (!node) ++ break; ++ ++ node->fulls++; ++ } ++ ++ if (unlikely(!node)) { ++ /* All nodes are full */ ++ root->min = 1 << (root->height * root->shift); ++ } else { ++ if (root->min == index) { ++ root->min |= (1UL << (node->height - 1)) - 1; ++ root->min++; ++ root->enter_node = node; ++ } ++ } ++ ++ return item; ++ } else { ++ return NULL; ++ } ++ ++} ++ ++int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node; ++ int shift; ++ ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height))) ++ return -ENOENT; ++ ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ node = node->stores[offset]; ++ if (!node) ++ return -ENOENT; ++ ++ shift -= root->shift; ++ } while (shift > 0); ++ ++ offset = index & root->mask; ++ if (!node->stores[offset]) ++ return -ENOENT; ++ ++ sradix_tree_delete_from_leaf(root, node, index); ++ ++ return 0; ++} +diff --git a/mm/Kconfig b/mm/Kconfig +index e338407..8df1b4f 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -245,6 +245,32 @@ config KSM + See Documentation/vm/ksm.txt for more information: KSM is inactive + until a program has madvised that an area is MADV_MERGEABLE, and + root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). ++choice ++ prompt "Choose UKSM/KSM strategy" ++ default UKSM ++ depends on KSM ++ help ++ This option allows to select a UKSM/KSM stragety. ++ ++config UKSM ++ bool "Ultra-KSM for page merging" ++ depends on KSM ++ help ++ UKSM is inspired by the Linux kernel project \u2014 KSM(Kernel Same ++ page Merging), but with a fundamentally rewritten core algorithm. With ++ an advanced algorithm, UKSM now can transparently scans all anonymously ++ mapped user space applications with an significantly improved scan speed ++ and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from ++ UKSM. Now UKSM has its first stable release and first real world enterprise user. ++ For more information, please goto its project page. ++ (www.kerneldedup.org) ++ ++config KSM_LEGACY ++ bool "Legacy KSM implementation" ++ depends on KSM ++ help ++ The legacy KSM implementation from Redhat. 
++endchoice + + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" +diff --git a/mm/Makefile b/mm/Makefile +index 50ec00e..c551bae 100644 +--- a/mm/Makefile ++++ b/mm/Makefile +@@ -34,7 +34,8 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o + obj-$(CONFIG_SLOB) += slob.o + obj-$(CONFIG_COMPACTION) += compaction.o + obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +-obj-$(CONFIG_KSM) += ksm.o ++obj-$(CONFIG_KSM_LEGACY) += ksm.o ++obj-$(CONFIG_UKSM) += uksm.o + obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o + obj-$(CONFIG_SLAB) += slab.o + obj-$(CONFIG_SLUB) += slub.o +diff --git a/mm/memory.c b/mm/memory.c +index 10b4dda..be73fff 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -112,6 +112,37 @@ __setup("norandmaps", disable_randmaps); + unsigned long zero_pfn __read_mostly; + unsigned long highest_memmap_pfn __read_mostly; + ++#ifdef CONFIG_UKSM ++unsigned long uksm_zero_pfn __read_mostly; ++struct page *empty_uksm_zero_page; ++ ++static int __init setup_uksm_zero_page(void) ++{ ++ unsigned long addr; ++ addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0); ++ if (!addr) ++ panic("Oh boy, that early out of memory?"); ++ ++ empty_uksm_zero_page = virt_to_page((void *) addr); ++ SetPageReserved(empty_uksm_zero_page); ++ ++ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); ++ ++ return 0; ++} ++core_initcall(setup_uksm_zero_page); ++ ++static inline int is_uksm_zero_pfn(unsigned long pfn) ++{ ++ return pfn == uksm_zero_pfn; ++} ++#else ++static inline int is_uksm_zero_pfn(unsigned long pfn) ++{ ++ return 0; ++} ++#endif ++ + /* + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() + */ +@@ -123,6 +154,7 @@ static int __init init_zero_pfn(void) + core_initcall(init_zero_pfn); + + ++ + #if defined(SPLIT_RSS_COUNTING) + + static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +@@ -739,8 +771,10 @@ static inline int is_cow_mapping(vm_flags_t flags) + #ifndef is_zero_pfn + static inline int is_zero_pfn(unsigned long pfn) + { +- return pfn == zero_pfn; ++ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); + } ++#else ++#define is_zero_pfn(pfn) (is_zero_pfn(pfn) || is_uksm_zero_pfn(pfn)) + #endif + + #ifndef my_zero_pfn +@@ -926,6 +960,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; ++ ++ /* Should return NULL in vm_normal_page() */ ++ uksm_bugon_zeropage(pte); ++ } else { ++ uksm_map_zero_page(pte); + } + + out_set_pte: +@@ -1161,8 +1200,10 @@ again: + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + tlb_remove_tlb_entry(tlb, pte, addr); +- if (unlikely(!page)) ++ if (unlikely(!page)) { ++ uksm_unmap_zero_page(ptent); + continue; ++ } + if (unlikely(details) && details->nonlinear_vma + && linear_page_index(details->nonlinear_vma, + addr) != page->index) +@@ -1661,7 +1702,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); + +- /* ++ /* + * Require read or write permissions. + * If FOLL_FORCE is set, we only require the "MAY" flags. 
+ */ +@@ -1708,7 +1749,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + page = vm_normal_page(vma, start, *pte); + if (!page) { + if (!(gup_flags & FOLL_DUMP) && +- is_zero_pfn(pte_pfn(*pte))) ++ (is_zero_pfn(pte_pfn(*pte)))) + page = pte_page(*pte); + else { + pte_unmap(pte); +@@ -2468,8 +2509,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo + clear_page(kaddr); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(dst); +- } else ++ } else { + copy_user_highpage(dst, src, va, vma); ++ uksm_cow_page(vma, src); ++ } + } + + /* +@@ -2667,6 +2710,7 @@ gotten: + new_page = alloc_zeroed_user_highpage_movable(vma, address); + if (!new_page) + goto oom; ++ uksm_cow_pte(vma, orig_pte); + } else { + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) +@@ -2688,8 +2732,11 @@ gotten: + dec_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); + } +- } else ++ uksm_bugon_zeropage(orig_pte); ++ } else { ++ uksm_unmap_zero_page(orig_pte); + inc_mm_counter_fast(mm, MM_ANONPAGES); ++ } + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); +diff --git a/mm/mmap.c b/mm/mmap.c +index da15a79..76cf74c 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -65,7 +66,7 @@ static void unmap_region(struct mm_struct *mm, + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes +- * ++ * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes +@@ -236,6 +237,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) + removed_exe_file_vma(vma->vm_mm); + } + mpol_put(vma_policy(vma)); ++ uksm_remove_vma(vma); + kmem_cache_free(vm_area_cachep, vma); + return next; + } +@@ -500,9 +502,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, + long adjust_next = 0; + int remove_next = 0; + ++/* ++ * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is ++ * acquired ++ */ ++ uksm_remove_vma(vma); ++ + if (next && !insert) { + struct vm_area_struct *exporter = NULL; + ++ uksm_remove_vma(next); + if (end >= next->vm_end) { + /* + * vma expands, overlapping all the next, and +@@ -578,10 +587,10 @@ again: remove_next = 1 + (end > next->vm_end); + if (adjust_next) + vma_prio_tree_remove(next, root); + } +- + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; ++ + if (adjust_next) { + next->vm_start += adjust_next << PAGE_SHIFT; + next->vm_pgoff += adjust_next; +@@ -634,10 +643,15 @@ again: remove_next = 1 + (end > next->vm_end); + */ + if (remove_next == 2) { + next = vma->vm_next; ++ uksm_remove_vma(next); + goto again; + } ++ } else { ++ if (next && !insert) ++ uksm_vma_add_new(next); + } + ++ uksm_vma_add_new(vma); + validate_mm(mm); + + return 0; +@@ -992,6 +1006,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + ++ /* If uksm is enabled, we add VM_MERGABLE to new VMAs. 
*/ ++ uksm_vm_flags_mod(&vm_flags); ++ + if (flags & MAP_LOCKED) + if (!can_do_mlock()) + return -EPERM; +@@ -1318,6 +1335,7 @@ munmap_back: + + vma_link(mm, vma, prev, rb_link, rb_parent); + file = vma->vm_file; ++ uksm_vma_add_new(vma); + + /* Once vma denies write, undo our temporary denial count */ + if (correct_wcount) +@@ -1344,6 +1362,7 @@ unmap_and_free_vma: + unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); + charged = 0; + free_vma: ++ uksm_remove_vma(vma); + kmem_cache_free(vm_area_cachep, vma); + unacct_error: + if (charged) +@@ -1419,7 +1438,7 @@ full_search: + addr = vma->vm_end; + } + } +-#endif ++#endif + + void arch_unmap_area(struct mm_struct *mm, unsigned long addr) + { +@@ -1969,6 +1988,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + else + err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + ++ uksm_vma_add_new(new); ++ + /* Success. */ + if (!err) + return 0; +@@ -2138,6 +2159,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) + return error; + + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; ++ uksm_vm_flags_mod(&flags); + + error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (error & ~PAGE_MASK) +@@ -2206,6 +2228,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) + vma->vm_flags = flags; + vma->vm_page_prot = vm_get_page_prot(flags); + vma_link(mm, vma, prev, rb_link, rb_parent); ++ uksm_vma_add_new(vma); + out: + perf_event_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; +@@ -2229,6 +2252,12 @@ void exit_mmap(struct mm_struct *mm) + /* mm's last user has gone, and its about to be pulled down */ + mmu_notifier_release(mm); + ++ /* ++ * Taking write lock on mmap_sem does not harm others, ++ * but it's crucial for uksm to avoid races. ++ */ ++ down_write(&mm->mmap_sem); ++ + if (mm->locked_vm) { + vma = mm->mmap; + while (vma) { +@@ -2262,6 +2291,11 @@ void exit_mmap(struct mm_struct *mm) + while (vma) + vma = remove_vma(vma); + ++ mm->mmap = NULL; ++ mm->mm_rb = RB_ROOT; ++ mm->mmap_cache = NULL; ++ up_write(&mm->mmap_sem); ++ + BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + } + +@@ -2371,6 +2405,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + vma_link(mm, new_vma, prev, rb_link, rb_parent); ++ uksm_vma_add_new(new_vma); + } + } + return new_vma; +@@ -2476,10 +2511,10 @@ int install_special_mapping(struct mm_struct *mm, + ret = insert_vm_struct(mm, vma); + if (ret) + goto out; +- + mm->total_vm += len >> PAGE_SHIFT; + + perf_event_mmap(vma); ++ uksm_vma_add_new(vma); + + return 0; + +diff --git a/mm/rmap.c b/mm/rmap.c +index c8454e0..90a50d2 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -1017,9 +1017,9 @@ void page_move_anon_rmap(struct page *page, + + /** + * __page_set_anon_rmap - set up new anonymous rmap +- * @page: Page to add to rmap ++ * @page: Page to add to rmap + * @vma: VM area to add page to. +- * @address: User virtual address of the mapping ++ * @address: User virtual address of the mapping + * @exclusive: the page is exclusively owned by the current process + */ + static void __page_set_anon_rmap(struct page *page, +diff --git a/mm/uksm.c b/mm/uksm.c +new file mode 100644 +index 0000000..967c755 +--- /dev/null ++++ b/mm/uksm.c +@@ -0,0 +1,5616 @@ ++/* ++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia ++ * ++ * This is an improvement upon KSM. Some basic data structures and routines ++ * are borrowed from ksm.c . 
++ * ++ * Its new features: ++ * 1. Full system scan: ++ * It automatically scans all user processes' anonymous VMAs. Kernel-user ++ * interaction to submit a memory area to KSM is no longer needed. ++ * ++ * 2. Rich area detection: ++ * It automatically detects rich areas containing abundant duplicated ++ * pages based. Rich areas are given a full scan speed. Poor areas are ++ * sampled at a reasonable speed with very low CPU consumption. ++ * ++ * 3. Ultra Per-page scan speed improvement: ++ * A new hash algorithm is proposed. As a result, on a machine with ++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it ++ * can scan memory areas that does not contain duplicated pages at speed of ++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of ++ * 477MB/sec ~ 923MB/sec. ++ * ++ * 4. Thrashing area avoidance: ++ * Thrashing area(an VMA that has frequent Ksm page break-out) can be ++ * filtered out. My benchmark shows it's more efficient than KSM's per-page ++ * hash value based volatile page detection. ++ * ++ * ++ * 5. Misc changes upon KSM: ++ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page ++ * comparison. It's much faster than default C version on x86. ++ * * rmap_item now has an struct *page member to loosely cache a ++ * address-->page mapping, which reduces too much time-costly ++ * follow_page(). ++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. ++ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ ++ * ksm is needed for this case. ++ * ++ * 6. Full Zero Page consideration(contributed by Figo Zhang) ++ * Now uksmd consider full zero pages as special pages and merge them to an ++ * special unswappable uksm zero page. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "internal.h" ++ ++#ifdef CONFIG_X86 ++#undef memcmp ++ ++#ifdef CONFIG_X86_32 ++#define memcmp memcmpx86_32 ++/* ++ * Compare 4-byte-aligned address s1 and s2, with length n ++ */ ++int memcmpx86_32(void *s1, void *s2, size_t n) ++{ ++ size_t num = n / 4; ++ register int res; ++ ++ __asm__ __volatile__ ++ ( ++ "testl %3,%3\n\t" ++ "repe; cmpsd\n\t" ++ "je 1f\n\t" ++ "sbbl %0,%0\n\t" ++ "orl $1,%0\n" ++ "1:" ++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) ++ : "0" (0) ++ : "cc"); ++ ++ return res; ++} ++ ++/* ++ * Check the page is all zero ? 
++ */ ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned char same; ++ ++ len /= 4; ++ ++ __asm__ __volatile__ ++ ("repe; scasl;" ++ "sete %0" ++ : "=qm" (same), "+D" (s1), "+c" (len) ++ : "a" (0) ++ : "cc"); ++ ++ return same; ++} ++ ++ ++#elif defined(CONFIG_X86_64) ++#define memcmp memcmpx86_64 ++/* ++ * Compare 8-byte-aligned address s1 and s2, with length n ++ */ ++int memcmpx86_64(void *s1, void *s2, size_t n) ++{ ++ size_t num = n / 8; ++ register int res; ++ ++ __asm__ __volatile__ ++ ( ++ "testq %q3,%q3\n\t" ++ "repe; cmpsq\n\t" ++ "je 1f\n\t" ++ "sbbq %q0,%q0\n\t" ++ "orq $1,%q0\n" ++ "1:" ++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) ++ : "0" (0) ++ : "cc"); ++ ++ return res; ++} ++ ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned char same; ++ ++ len /= 8; ++ ++ __asm__ __volatile__ ++ ("repe; scasq;" ++ "sete %0" ++ : "=qm" (same), "+D" (s1), "+c" (len) ++ : "a" (0) ++ : "cc"); ++ ++ return same; ++} ++ ++#endif ++#else ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned long *src = s1; ++ int i; ++ ++ len /= sizeof(*src); ++ ++ for (i = 0; i < len; i++) { ++ if (src[i]) ++ return 0; ++ } ++ ++ return 1; ++} ++#endif ++ ++#define U64_MAX (~((u64)0)) ++#define UKSM_RUNG_ROUND_FINISHED (1 << 0) ++#define TIME_RATIO_SCALE 10000 ++ ++#define SLOT_TREE_NODE_SHIFT 8 ++#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) ++struct slot_tree_node { ++ unsigned long size; ++ struct sradix_tree_node snode; ++ void *stores[SLOT_TREE_NODE_STORE_SIZE]; ++}; ++ ++static struct kmem_cache *slot_tree_node_cachep; ++ ++static struct sradix_tree_node *slot_tree_node_alloc(void) ++{ ++ struct slot_tree_node *p; ++ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL); ++ if (!p) ++ return NULL; ++ ++ return &p->snode; ++} ++ ++static void slot_tree_node_free(struct sradix_tree_node *node) ++{ ++ struct slot_tree_node *p; ++ ++ p = container_of(node, struct slot_tree_node, snode); ++ kmem_cache_free(slot_tree_node_cachep, p); ++} ++ ++static void slot_tree_node_extend(struct sradix_tree_node *parent, ++ struct sradix_tree_node *child) ++{ ++ struct slot_tree_node *p, *c; ++ ++ p = container_of(parent, struct slot_tree_node, snode); ++ c = container_of(child, struct slot_tree_node, snode); ++ ++ p->size += c->size; ++} ++ ++void slot_tree_node_assign(struct sradix_tree_node *node, ++ unsigned index, void *item) ++{ ++ struct vma_slot *slot = item; ++ struct slot_tree_node *cur; ++ ++ slot->snode = node; ++ slot->sindex = index; ++ ++ while (node) { ++ cur = container_of(node, struct slot_tree_node, snode); ++ cur->size += slot->pages; ++ node = node->parent; ++ } ++} ++ ++void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset) ++{ ++ struct vma_slot *slot; ++ struct slot_tree_node *cur; ++ unsigned long pages; ++ ++ if (node->height == 1) { ++ slot = node->stores[offset]; ++ pages = slot->pages; ++ } else { ++ cur = container_of(node->stores[offset], ++ struct slot_tree_node, snode); ++ pages = cur->size; ++ } ++ ++ while (node) { ++ cur = container_of(node, struct slot_tree_node, snode); ++ cur->size -= pages; ++ node = node->parent; ++ } ++} ++ ++unsigned long slot_iter_index; ++int slot_iter(void *item, unsigned long height) ++{ ++ struct slot_tree_node *node; ++ struct vma_slot *slot; ++ ++ if (height == 1) { ++ slot = item; ++ if (slot_iter_index < slot->pages) { ++ /*in this one*/ ++ return 1; ++ } else { ++ slot_iter_index -= slot->pages; ++ return 0; ++ } ++ ++ } else { ++ node = 
container_of(item, struct slot_tree_node, snode); ++ if (slot_iter_index < node->size) { ++ /*in this one*/ ++ return 1; ++ } else { ++ slot_iter_index -= node->size; ++ return 0; ++ } ++ } ++} ++ ++ ++static inline void slot_tree_init_root(struct sradix_tree_root *root) ++{ ++ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); ++ root->alloc = slot_tree_node_alloc; ++ root->free = slot_tree_node_free; ++ root->extend = slot_tree_node_extend; ++ root->assign = slot_tree_node_assign; ++ root->rm = slot_tree_node_rm; ++} ++ ++void slot_tree_init(void) ++{ ++ slot_tree_node_cachep = kmem_cache_create("slot_tree_node", ++ sizeof(struct slot_tree_node), 0, ++ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, ++ NULL); ++} ++ ++ ++/* Each rung of this ladder is a list of VMAs having a same scan ratio */ ++struct scan_rung { ++ //struct list_head scanned_list; ++ struct sradix_tree_root vma_root; ++ struct sradix_tree_root vma_root2; ++ ++ struct vma_slot *current_scan; ++ unsigned long current_offset; ++ ++ /* ++ * The initial value for current_offset, it should loop over ++ * [0~ step - 1] to let all slot have its chance to be scanned. ++ */ ++ unsigned long offset_init; ++ unsigned long step; /* dynamic step for current_offset */ ++ unsigned int flags; ++ unsigned long pages_to_scan; ++ //unsigned long fully_scanned_slots; ++ /* ++ * a little bit tricky - if cpu_time_ratio > 0, then the value is the ++ * the cpu time ratio it can spend in rung_i for every scan ++ * period. if < 0, then it is the cpu time ratio relative to the ++ * max cpu percentage user specified. Both in unit of ++ * 1/TIME_RATIO_SCALE ++ */ ++ int cpu_ratio; ++ ++ /* ++ * How long it will take for all slots in this rung to be fully ++ * scanned? If it's zero, we don't care about the cover time: ++ * it's fully scanned. ++ */ ++ unsigned int cover_msecs; ++ //unsigned long vma_num; ++ //unsigned long pages; /* Sum of all slot's pages in rung */ ++}; ++ ++/** ++ * node of either the stable or unstale rbtree ++ * ++ */ ++struct tree_node { ++ struct rb_node node; /* link in the main (un)stable rbtree */ ++ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ ++ u32 hash; ++ unsigned long count; /* TODO: merged with sub_root */ ++ struct list_head all_list; /* all tree nodes in stable/unstable tree */ ++}; ++ ++/** ++ * struct stable_node - node of the stable rbtree ++ * @node: rb node of this ksm page in the stable tree ++ * @hlist: hlist head of rmap_items using this ksm page ++ * @kpfn: page frame number of this ksm page ++ */ ++struct stable_node { ++ struct rb_node node; /* link in sub-rbtree */ ++ struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */ ++ struct hlist_head hlist; ++ unsigned long kpfn; ++ u32 hash_max; /* if ==0 then it's not been calculated yet */ ++ struct list_head all_list; /* in a list for all stable nodes */ ++}; ++ ++/** ++ * struct node_vma - group rmap_items linked in a same stable ++ * node together. 
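The slot_iter() walk above locates the vma_slot holding a given logical page position by subtracting sub-tree sizes as it descends: internal slot_tree_nodes carry the total pages beneath them, leaves carry slot->pages. The same subtract-as-you-go selection, flattened over an array of hypothetical slot sizes, looks like the sketch below; picking the starting index at random makes the choice proportional to a slot's page count:

/* Illustration only: select the slot containing logical page index `idx`,
 * mirroring the "in this one" / subtract-and-continue logic of slot_iter(),
 * but over a flat array instead of a tree. */
#include <stdio.h>

static int pick_slot(const unsigned long *pages, int n, unsigned long idx)
{
	for (int i = 0; i < n; i++) {
		if (idx < pages[i])
			return i;	/* "in this one" */
		idx -= pages[i];
	}
	return -1;			/* idx >= total pages */
}

int main(void)
{
	unsigned long pages[] = { 4, 100, 16 };	/* three hypothetical slots */

	printf("page 3   -> slot %d\n", pick_slot(pages, 3, 3));
	printf("page 4   -> slot %d\n", pick_slot(pages, 3, 4));
	printf("page 110 -> slot %d\n", pick_slot(pages, 3, 110));
	return 0;
}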
++ */ ++struct node_vma { ++ union { ++ struct vma_slot *slot; ++ unsigned long key; /* slot is used as key sorted on hlist */ ++ }; ++ struct hlist_node hlist; ++ struct hlist_head rmap_hlist; ++ struct stable_node *head; ++}; ++ ++/** ++ * struct rmap_item - reverse mapping item for virtual addresses ++ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list ++ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree ++ * @mm: the memory structure this rmap_item is pointing into ++ * @address: the virtual address this rmap_item tracks (+ flags in low bits) ++ * @node: rb node of this rmap_item in the unstable tree ++ * @head: pointer to stable_node heading this list in the stable tree ++ * @hlist: link into hlist of rmap_items hanging off that stable_node ++ */ ++struct rmap_item { ++ struct vma_slot *slot; ++ struct page *page; ++ unsigned long address; /* + low bits used for flags below */ ++ unsigned long hash_round; ++ unsigned long entry_index; ++ union { ++ struct {/* when in unstable tree */ ++ struct rb_node node; ++ struct tree_node *tree_node; ++ u32 hash_max; ++ }; ++ struct { /* when in stable tree */ ++ struct node_vma *head; ++ struct hlist_node hlist; ++ struct anon_vma *anon_vma; ++ }; ++ }; ++} __attribute__((aligned(4))); ++ ++struct rmap_list_entry { ++ union { ++ struct rmap_item *item; ++ unsigned long addr; ++ }; ++ /* lowest bit is used for is_addr tag */ ++} __attribute__((aligned(4))); /* 4 aligned to fit in to pages*/ ++ ++ ++/* Basic data structure definition ends */ ++ ++ ++/* ++ * Flags for rmap_item to judge if it's listed in the stable/unstable tree. ++ * The flags use the low bits of rmap_item.address ++ */ ++#define UNSTABLE_FLAG 0x1 ++#define STABLE_FLAG 0x2 ++#define get_rmap_addr(x) ((x)->address & PAGE_MASK) ++ ++/* ++ * rmap_list_entry helpers ++ */ ++#define IS_ADDR_FLAG 1 ++#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) ++#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) ++#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) ++ ++ ++/* ++ * High speed caches for frequently allocated and freed structs ++ */ ++static struct kmem_cache *rmap_item_cache; ++static struct kmem_cache *stable_node_cache; ++static struct kmem_cache *node_vma_cache; ++static struct kmem_cache *vma_slot_cache; ++static struct kmem_cache *tree_node_cache; ++#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ ++ sizeof(struct __struct), __alignof__(struct __struct),\ ++ (__flags), NULL) ++ ++/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ ++#define SCAN_LADDER_SIZE 4 ++static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; ++ ++/* The evaluation rounds uksmd has finished */ ++static unsigned long long uksm_eval_round = 1; ++ ++/* ++ * we add 1 to this var when we consider we should rebuild the whole ++ * unstable tree. ++ */ ++static unsigned long uksm_hash_round = 1; ++ ++/* ++ * How many times the whole memory is scanned. 
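rmap_list_entry above overlays an item pointer and a plain address in one word and borrows bit 0 (IS_ADDR_FLAG) to tell them apart; this works because rmap_items are at least 4-byte aligned, so a genuine pointer never has that bit set. A minimal stand-alone illustration of the tagging scheme used by is_addr(), set_is_addr() and get_clean_addr():

/* Illustration only: low-bit pointer tagging on an aligned structure. */
#include <stdio.h>
#include <stdint.h>

#define IS_ADDR_FLAG 1UL

struct item { int payload; } __attribute__((aligned(4)));

int main(void)
{
	struct item it = { 42 };
	uintptr_t entry;

	entry = (uintptr_t)&it;				/* store an item pointer */
	printf("is_addr: %lu\n", (unsigned long)(entry & IS_ADDR_FLAG));

	entry = (uintptr_t)0x7f001000 | IS_ADDR_FLAG;	/* store a plain address */
	printf("is_addr: %lu, clean addr: %#lx\n",
	       (unsigned long)(entry & IS_ADDR_FLAG),
	       (unsigned long)(entry & ~IS_ADDR_FLAG));
	return 0;
}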
++ */ ++static unsigned long long fully_scanned_round = 1; ++ ++/* The total number of virtual pages of all vma slots */ ++static u64 uksm_pages_total; ++ ++/* The number of pages has been scanned since the start up */ ++static u64 uksm_pages_scanned; ++ ++static u64 scanned_virtual_pages; ++ ++/* The number of pages has been scanned since last encode_benefit call */ ++static u64 uksm_pages_scanned_last; ++ ++/* If the scanned number is tooo large, we encode it here */ ++static u64 pages_scanned_stored; ++ ++static unsigned long pages_scanned_base; ++ ++/* The number of nodes in the stable tree */ ++static unsigned long uksm_pages_shared; ++ ++/* The number of page slots additionally sharing those nodes */ ++static unsigned long uksm_pages_sharing; ++ ++/* The number of nodes in the unstable tree */ ++static unsigned long uksm_pages_unshared; ++ ++/* ++ * Milliseconds ksmd should sleep between scans, ++ * >= 100ms to be consistent with ++ * scan_time_to_sleep_msec() ++ */ ++static unsigned int uksm_sleep_jiffies; ++ ++/* The real value for the uksmd next sleep */ ++static unsigned int uksm_sleep_real; ++ ++/* Saved value for user input uksm_sleep_jiffies when it's enlarged */ ++static unsigned int uksm_sleep_saved; ++ ++/* Max percentage of cpu utilization ksmd can take to scan in one batch */ ++static unsigned int uksm_max_cpu_percentage; ++ ++static int uksm_cpu_governor; ++ ++static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; ++ ++struct uksm_cpu_preset_s { ++ int cpu_ratio[SCAN_LADDER_SIZE]; ++ unsigned int cover_msecs[SCAN_LADDER_SIZE]; ++ unsigned int max_cpu; /* percentage */ ++}; ++ ++struct uksm_cpu_preset_s uksm_cpu_preset[4] = { ++ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, ++ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, ++ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, ++ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, ++}; ++ ++/* The default value for uksm_ema_page_time if it's not initialized */ ++#define UKSM_PAGE_TIME_DEFAULT 500 ++ ++/*cost to scan one page by expotional moving average in nsecs */ ++static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; ++ ++/* The expotional moving average alpha weight, in percentage. */ ++#define EMA_ALPHA 20 ++ ++/* ++ * The threshold used to filter out thrashing areas, ++ * If it == 0, filtering is disabled, otherwise it's the percentage up-bound ++ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio ++ * will be considered as having a zero duplication ratio. ++ */ ++static unsigned int uksm_thrash_threshold = 50; ++ ++/* How much dedup ratio is considered to be abundant*/ ++static unsigned int uksm_abundant_threshold = 10; ++ ++/* All slots having merged pages in this eval round. */ ++struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup); ++ ++/* How many times the ksmd has slept since startup */ ++static unsigned long long uksm_sleep_times; ++ ++#define UKSM_RUN_STOP 0 ++#define UKSM_RUN_MERGE 1 ++static unsigned int uksm_run = 1; ++ ++static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait); ++static DEFINE_MUTEX(uksm_thread_mutex); ++ ++/* ++ * List vma_slot_new is for newly created vma_slot waiting to be added by ++ * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to ++ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding ++ * VMA has been removed/freed. 
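Two conventions above are easy to misread. A scan_rung's cpu_ratio (see the struct comment earlier) is an absolute ratio in 1/TIME_RATIO_SCALE of a scan period when positive, and a fraction of the user's CPU cap when negative, which is how the uksm_cpu_preset rows should be read. Separately, uksm_ema_page_time tracks per-page scan cost as an exponential moving average with EMA_ALPHA = 20 percent. The sketch below decodes the "full" preset row and applies one EMA update; the exact update expression is an assumption made only to show the 20% weighting, not a quote of the kernel code:

/* Illustration only: decoding cpu_ratio entries and an assumed EMA update. */
#include <stdio.h>

#define TIME_RATIO_SCALE 10000
#define EMA_ALPHA        20	/* percent */

/* Effective CPU share of one rung, in 1/TIME_RATIO_SCALE units. */
static long rung_cpu_share(int cpu_ratio, unsigned int max_cpu_percent)
{
	if (cpu_ratio >= 0)
		return cpu_ratio;			/* absolute ratio */
	return (long)(-cpu_ratio) * max_cpu_percent / 100; /* share of the cap */
}

/* Assumed update rule: 20% weight on the newest per-page cost sample. */
static unsigned long ema(unsigned long cur_ns, unsigned long sample_ns)
{
	return (EMA_ALPHA * sample_ns + (100 - EMA_ALPHA) * cur_ns) / 100;
}

int main(void)
{
	int full_preset[4] = { 20, 40, -2500, -10000 };	/* "full" governor row */
	unsigned int max_cpu = 95;			/* percent */
	unsigned long page_time = 500;			/* UKSM_PAGE_TIME_DEFAULT */

	for (int i = 0; i < 4; i++)
		printf("rung %d: %.2f%% CPU\n", i,
		       rung_cpu_share(full_preset[i], max_cpu) * 100.0 / TIME_RATIO_SCALE);

	printf("ema after an 800 ns sample: %lu ns\n", ema(page_time, 800));
	return 0;
}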
++ */ ++struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); ++struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); ++struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); ++static DEFINE_SPINLOCK(vma_slot_list_lock); ++ ++/* The unstable tree heads */ ++static struct rb_root root_unstable_tree = RB_ROOT; ++ ++/* ++ * All tree_nodes are in a list to be freed at once when unstable tree is ++ * freed after each scan round. ++ */ ++static struct list_head unstable_tree_node_list = ++ LIST_HEAD_INIT(unstable_tree_node_list); ++ ++/* List contains all stable nodes */ ++static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); ++ ++/* ++ * When the hash strength is changed, the stable tree must be delta_hashed and ++ * re-structured. We use two set of below structs to speed up the ++ * re-structuring of stable tree. ++ */ ++static struct list_head ++stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), ++ LIST_HEAD_INIT(stable_tree_node_list[1])}; ++ ++static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; ++static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; ++static struct rb_root *root_stable_treep = &root_stable_tree[0]; ++static unsigned long stable_tree_index; ++ ++/* The hash strength needed to hash a full page */ ++#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) ++ ++/* The hash strength needed for loop-back hashing */ ++#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) ++ ++/* The random offsets in a page */ ++static u32 *random_nums; ++ ++/* The hash strength */ ++static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; ++ ++/* The delta value each time the hash strength increases or decreases */ ++static unsigned long hash_strength_delta; ++#define HASH_STRENGTH_DELTA_MAX 5 ++ ++/* The time we have saved due to random_sample_hash */ ++static u64 rshash_pos; ++ ++/* The time we have wasted due to hash collision */ ++static u64 rshash_neg; ++ ++struct uksm_benefit { ++ u64 pos; ++ u64 neg; ++ u64 scanned; ++ unsigned long base; ++} benefit; ++ ++/* ++ * The relative cost of memcmp, compared to 1 time unit of random sample ++ * hash, this value is tested when ksm module is initialized ++ */ ++static unsigned long memcmp_cost; ++ ++static unsigned long rshash_neg_cont_zero; ++static unsigned long rshash_cont_obscure; ++ ++/* The possible states of hash strength adjustment heuristic */ ++enum rshash_states { ++ RSHASH_STILL, ++ RSHASH_TRYUP, ++ RSHASH_TRYDOWN, ++ RSHASH_NEW, ++ RSHASH_PRE_STILL, ++}; ++ ++/* The possible direction we are about to adjust hash strength */ ++enum rshash_direct { ++ GO_UP, ++ GO_DOWN, ++ OBSCURE, ++ STILL, ++}; ++ ++/* random sampling hash state machine */ ++static struct { ++ enum rshash_states state; ++ enum rshash_direct pre_direct; ++ u8 below_count; ++ /* Keep a lookup window of size 5, iff above_count/below_count > 3 ++ * in this window we stop trying. 
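The counters above (rshash_pos, rshash_neg, hash_strength, random_nums) all revolve around the random-sample hash: instead of hashing every 32-bit word of a page (HASH_STRENGTH_FULL of them), only hash_strength words at pre-chosen random offsets are mixed in, and that strength is adapted by the rshash state machine defined nearby. The sketch below shows only the sampling idea; the mixing step is a deliberately simple stand-in, not the hash function the patch uses:

/* Illustration only: hash a page by sampling hash_strength of its 32-bit
 * words at fixed random offsets, rather than all of them. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define PAGE_WORDS 1024		/* 4 KiB page / sizeof(u32) */

static uint32_t sample_hash(const uint32_t *page, const unsigned *offsets,
			    unsigned long strength)
{
	uint32_t h = 0;

	for (unsigned long i = 0; i < strength; i++)
		h = h * 2654435761u + page[offsets[i]];	/* stand-in mix */
	return h;
}

int main(void)
{
	static uint32_t page[PAGE_WORDS];
	static unsigned offsets[PAGE_WORDS];

	for (unsigned i = 0; i < PAGE_WORDS; i++) {
		page[i] = i * 7;
		offsets[i] = i;
	}
	for (unsigned i = PAGE_WORDS - 1; i > 0; i--) {	/* shuffle offsets */
		unsigned j = rand() % (i + 1), t = offsets[i];
		offsets[i] = offsets[j];
		offsets[j] = t;
	}

	printf("strength %4d: %08x\n", 64, sample_hash(page, offsets, 64));
	printf("strength %4d: %08x\n", 1024, sample_hash(page, offsets, 1024));
	return 0;
}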
++ */ ++ u8 lookup_window_index; ++ u64 stable_benefit; ++ unsigned long turn_point_down; ++ unsigned long turn_benefit_down; ++ unsigned long turn_point_up; ++ unsigned long turn_benefit_up; ++ unsigned long stable_point; ++} rshash_state; ++ ++/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ ++static u32 *zero_hash_table; ++ ++static inline struct node_vma *alloc_node_vma(void) ++{ ++ struct node_vma *node_vma; ++ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL); ++ if (node_vma) { ++ INIT_HLIST_HEAD(&node_vma->rmap_hlist); ++ INIT_HLIST_NODE(&node_vma->hlist); ++ } ++ return node_vma; ++} ++ ++static inline void free_node_vma(struct node_vma *node_vma) ++{ ++ kmem_cache_free(node_vma_cache, node_vma); ++} ++ ++ ++static inline struct vma_slot *alloc_vma_slot(void) ++{ ++ struct vma_slot *slot; ++ ++ /* ++ * In case ksm is not initialized by now. ++ * Oops, we need to consider the call site of uksm_init() in the future. ++ */ ++ if (!vma_slot_cache) ++ return NULL; ++ ++ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL); ++ if (slot) { ++ INIT_LIST_HEAD(&slot->slot_list); ++ INIT_LIST_HEAD(&slot->dedup_list); ++ slot->flags |= UKSM_SLOT_NEED_RERAND; ++ } ++ return slot; ++} ++ ++static inline void free_vma_slot(struct vma_slot *vma_slot) ++{ ++ kmem_cache_free(vma_slot_cache, vma_slot); ++} ++ ++ ++ ++static inline struct rmap_item *alloc_rmap_item(void) ++{ ++ struct rmap_item *rmap_item; ++ ++ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); ++ if (rmap_item) { ++ /* bug on lowest bit is not clear for flag use */ ++ BUG_ON(is_addr(rmap_item)); ++ } ++ return rmap_item; ++} ++ ++static inline void free_rmap_item(struct rmap_item *rmap_item) ++{ ++ rmap_item->slot = NULL; /* debug safety */ ++ kmem_cache_free(rmap_item_cache, rmap_item); ++} ++ ++static inline struct stable_node *alloc_stable_node(void) ++{ ++ struct stable_node *node; ++ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC); ++ if (!node) ++ return NULL; ++ ++ INIT_HLIST_HEAD(&node->hlist); ++ list_add(&node->all_list, &stable_node_list); ++ return node; ++} ++ ++static inline void free_stable_node(struct stable_node *stable_node) ++{ ++ list_del(&stable_node->all_list); ++ kmem_cache_free(stable_node_cache, stable_node); ++} ++ ++static inline struct tree_node *alloc_tree_node(struct list_head *list) ++{ ++ struct tree_node *node; ++ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC); ++ if (!node) ++ return NULL; ++ ++ list_add(&node->all_list, list); ++ return node; ++} ++ ++static inline void free_tree_node(struct tree_node *node) ++{ ++ list_del(&node->all_list); ++ kmem_cache_free(tree_node_cache, node); ++} ++ ++static void uksm_drop_anon_vma(struct rmap_item *rmap_item) ++{ ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ ++ put_anon_vma(anon_vma); ++} ++ ++ ++/** ++ * Remove a stable node from stable_tree, may unlink from its tree_node and ++ * may remove its parent tree_node if no other stable node is pending. ++ * ++ * @stable_node The node need to be removed ++ * @unlink_rb Will this node be unlinked from the rbtree? ++ * @remove_tree_ node Will its tree_node be removed if empty? 
++ */ ++static void remove_node_from_stable_tree(struct stable_node *stable_node, ++ int unlink_rb, int remove_tree_node) ++{ ++ struct node_vma *node_vma; ++ struct rmap_item *rmap_item; ++ struct hlist_node *hlist, *rmap_hlist, *n; ++ ++ if (!hlist_empty(&stable_node->hlist)) { ++ hlist_for_each_entry_safe(node_vma, hlist, n, ++ &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, rmap_hlist, ++ &node_vma->rmap_hlist, hlist) { ++ uksm_pages_sharing--; ++ ++ uksm_drop_anon_vma(rmap_item); ++ rmap_item->address &= PAGE_MASK; ++ } ++ free_node_vma(node_vma); ++ cond_resched(); ++ } ++ ++ /* the last one is counted as shared */ ++ uksm_pages_shared--; ++ uksm_pages_sharing++; ++ } ++ ++ if (stable_node->tree_node && unlink_rb) { ++ rb_erase(&stable_node->node, ++ &stable_node->tree_node->sub_root); ++ ++ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && ++ remove_tree_node) { ++ rb_erase(&stable_node->tree_node->node, ++ root_stable_treep); ++ free_tree_node(stable_node->tree_node); ++ } else { ++ stable_node->tree_node->count--; ++ } ++ } ++ ++ free_stable_node(stable_node); ++} ++ ++ ++/* ++ * get_uksm_page: checks if the page indicated by the stable node ++ * is still its ksm page, despite having held no reference to it. ++ * In which case we can trust the content of the page, and it ++ * returns the gotten page; but if the page has now been zapped, ++ * remove the stale node from the stable tree and return NULL. ++ * ++ * You would expect the stable_node to hold a reference to the ksm page. ++ * But if it increments the page's count, swapping out has to wait for ++ * ksmd to come around again before it can free the page, which may take ++ * seconds or even minutes: much too unresponsive. So instead we use a ++ * "keyhole reference": access to the ksm page from the stable node peeps ++ * out through its keyhole to see if that page still holds the right key, ++ * pointing back to this stable node. This relies on freeing a PageAnon ++ * page to reset its page->mapping to NULL, and relies on no other use of ++ * a page to put something that might look like our key in page->mapping. ++ * ++ * include/linux/pagemap.h page_cache_get_speculative() is a good reference, ++ * but this is different - made simpler by uksm_thread_mutex being held, but ++ * interesting for assuming that no other use of the struct page could ever ++ * put our expected_mapping into page->mapping (or a field of the union which ++ * coincides with page->mapping). The RCU calls are not for KSM at all, but ++ * to keep the page_count protocol described with page_cache_get_speculative. ++ * ++ * Note: it is possible that get_uksm_page() will return NULL one moment, ++ * then page the next, if the page is in between page_freeze_refs() and ++ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page ++ * is on its way to being freed; but it is an anomaly to bear in mind. ++ * ++ * @unlink_rb: if the removal of this node will firstly unlink from ++ * its rbtree. stable_node_reinsert will prevent this when restructuring the ++ * node from its old tree. ++ * ++ * @remove_tree_node: if this is the last one of its tree_node, will the ++ * tree_node be freed ? If we are inserting stable node, this tree_node may ++ * be reused, so don't free it. 
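remove_node_from_stable_tree() above also shows the counter convention used throughout: for a stable page with N mappings, one rmap_item is accounted in uksm_pages_shared and the remaining N - 1 in uksm_pages_sharing; tear-down drops every mapping from the sharing count and then moves the last one back. A small user-space sketch of that bookkeeping (signed counters are used only so the transient dip is harmless; the names mirror the patch for readability):

#include <assert.h>
#include <stdio.h>

static long pages_shared, pages_sharing;

static void add_mapping(int first)
{
	if (first)
		pages_shared++;      /* first rmap_item of a stable page */
	else
		pages_sharing++;     /* every additional rmap_item       */
}

/* Mirrors remove_node_from_stable_tree(): drop every mapping as "sharing",
 * then move the last one back from "sharing" to "shared". */
static void remove_stable_page(long mappings)
{
	for (long i = 0; i < mappings; i++)
		pages_sharing--;
	pages_shared--;
	pages_sharing++;
}

int main(void)
{
	add_mapping(1);
	add_mapping(0);
	add_mapping(0);                       /* one page, three mappings */
	assert(pages_shared == 1 && pages_sharing == 2);

	remove_stable_page(3);
	assert(pages_shared == 0 && pages_sharing == 0);
	printf("shared/sharing accounting balances\n");
	return 0;
}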
++ */ ++static struct page *get_uksm_page(struct stable_node *stable_node, ++ int unlink_rb, int remove_tree_node) ++{ ++ struct page *page; ++ void *expected_mapping; ++ ++ page = pfn_to_page(stable_node->kpfn); ++ expected_mapping = (void *)stable_node + ++ (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); ++ rcu_read_lock(); ++ if (page->mapping != expected_mapping) ++ goto stale; ++ if (!get_page_unless_zero(page)) ++ goto stale; ++ if (page->mapping != expected_mapping) { ++ put_page(page); ++ goto stale; ++ } ++ rcu_read_unlock(); ++ return page; ++stale: ++ rcu_read_unlock(); ++ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); ++ ++ return NULL; ++} ++ ++/* ++ * Removing rmap_item from stable or unstable tree. ++ * This function will clean the information from the stable/unstable tree. ++ */ ++static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ++{ ++ if (rmap_item->address & STABLE_FLAG) { ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct page *page; ++ ++ node_vma = rmap_item->head; ++ stable_node = node_vma->head; ++ page = get_uksm_page(stable_node, 1, 1); ++ if (!page) ++ goto out; ++ ++ /* ++ * page lock is needed because it's racing with ++ * try_to_unmap_ksm(), etc. ++ */ ++ lock_page(page); ++ hlist_del(&rmap_item->hlist); ++ ++ if (hlist_empty(&node_vma->rmap_hlist)) { ++ hlist_del(&node_vma->hlist); ++ free_node_vma(node_vma); ++ } ++ unlock_page(page); ++ ++ put_page(page); ++ if (hlist_empty(&stable_node->hlist)) { ++ /* do NOT call remove_node_from_stable_tree() here, ++ * it's possible for a forked rmap_item not in ++ * stable tree while the in-tree rmap_items were ++ * deleted. ++ */ ++ uksm_pages_shared--; ++ } else ++ uksm_pages_sharing--; ++ ++ ++ uksm_drop_anon_vma(rmap_item); ++ } else if (rmap_item->address & UNSTABLE_FLAG) { ++ if (rmap_item->hash_round == uksm_hash_round) { ++ ++ rb_erase(&rmap_item->node, ++ &rmap_item->tree_node->sub_root); ++ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { ++ rb_erase(&rmap_item->tree_node->node, ++ &root_unstable_tree); ++ ++ free_tree_node(rmap_item->tree_node); ++ } else ++ rmap_item->tree_node->count--; ++ } ++ uksm_pages_unshared--; ++ } ++ ++ rmap_item->address &= PAGE_MASK; ++ rmap_item->hash_max = 0; ++ ++out: ++ cond_resched(); /* we're called from many long loops */ ++} ++ ++static inline int slot_in_uksm(struct vma_slot *slot) ++{ ++ return list_empty(&slot->slot_list); ++} ++ ++/* ++ * Test if the mm is exiting ++ */ ++static inline bool uksm_test_exit(struct mm_struct *mm) ++{ ++ return atomic_read(&mm->mm_users) == 0; ++} ++ ++/** ++ * Need to do two things: ++ * 1. check if slot was moved to del list ++ * 2. make sure the mmap_sem is manipulated under valid vma. ++ * ++ * My concern here is that in some cases, this may make ++ * vma_slot_list_lock() waiters to serialized further by some ++ * sem->wait_lock, can this really be expensive? ++ * ++ * ++ * @return ++ * 0: if successfully locked mmap_sem ++ * -ENOENT: this slot was moved to del list ++ * -EBUSY: vma lock failed ++ */ ++static int try_down_read_slot_mmap_sem(struct vma_slot *slot) ++{ ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ struct rw_semaphore *sem; ++ ++ spin_lock(&vma_slot_list_lock); ++ ++ /* the slot_list was removed and inited from new list, when it enters ++ * uksm_list. 
If now it's not empty, then it must be moved to del list ++ */ ++ if (!slot_in_uksm(slot)) { ++ spin_unlock(&vma_slot_list_lock); ++ return -ENOENT; ++ } ++ ++ BUG_ON(slot->pages != vma_pages(slot->vma)); ++ /* Ok, vma still valid */ ++ vma = slot->vma; ++ mm = vma->vm_mm; ++ sem = &mm->mmap_sem; ++ ++ if (uksm_test_exit(mm)) { ++ spin_unlock(&vma_slot_list_lock); ++ return -ENOENT; ++ } ++ ++ if (down_read_trylock(sem)) { ++ spin_unlock(&vma_slot_list_lock); ++ return 0; ++ } ++ ++ spin_unlock(&vma_slot_list_lock); ++ return -EBUSY; ++} ++ ++static inline unsigned long ++vma_page_address(struct page *page, struct vm_area_struct *vma) ++{ ++ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); ++ unsigned long address; ++ ++ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); ++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { ++ /* page should be within @vma mapping range */ ++ return -EFAULT; ++ } ++ return address; ++} ++ ++ ++/* return 0 on success with the item's mmap_sem locked */ ++static inline int get_mergeable_page_lock_mmap(struct rmap_item *item) ++{ ++ struct mm_struct *mm; ++ struct vma_slot *slot = item->slot; ++ int err = -EINVAL; ++ ++ struct page *page; ++ ++ /* ++ * try_down_read_slot_mmap_sem() returns non-zero if the slot ++ * has been removed by uksm_remove_vma(). ++ */ ++ if (try_down_read_slot_mmap_sem(slot)) ++ return -EBUSY; ++ ++ mm = slot->vma->vm_mm; ++ ++ if (uksm_test_exit(mm)) ++ goto failout_up; ++ ++ page = item->page; ++ rcu_read_lock(); ++ if (!get_page_unless_zero(page)) { ++ rcu_read_unlock(); ++ goto failout_up; ++ } ++ ++ /* No need to consider huge page here. */ ++ if (item->slot->vma->anon_vma != page_anon_vma(page) || ++ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) { ++ /* ++ * TODO: ++ * should we release this item becase of its stale page ++ * mapping? ++ */ ++ put_page(page); ++ rcu_read_unlock(); ++ goto failout_up; ++ } ++ rcu_read_unlock(); ++ return 0; ++ ++failout_up: ++ up_read(&mm->mmap_sem); ++ return err; ++} ++ ++/* ++ * What kind of VMA is considered ? ++ */ ++static inline int vma_can_enter(struct vm_area_struct *vma) ++{ ++ return uksm_flags_can_scan(vma->vm_flags); ++} ++ ++/* ++ * Called whenever a fresh new vma is created A new vma_slot. ++ * is created and inserted into a global list Must be called. ++ * after vma is inserted to its mm . ++ */ ++void uksm_vma_add_new(struct vm_area_struct *vma) ++{ ++ struct vma_slot *slot; ++ ++ if (!vma_can_enter(vma)) { ++ vma->uksm_vma_slot = NULL; ++ return; ++ } ++ ++ slot = alloc_vma_slot(); ++ if (!slot) { ++ vma->uksm_vma_slot = NULL; ++ return; ++ } ++ ++ vma->uksm_vma_slot = slot; ++ vma->vm_flags |= VM_MERGEABLE; ++ slot->vma = vma; ++ slot->mm = vma->vm_mm; ++ slot->ctime_j = jiffies; ++ slot->pages = vma_pages(vma); ++ spin_lock(&vma_slot_list_lock); ++ list_add_tail(&slot->slot_list, &vma_slot_new); ++ spin_unlock(&vma_slot_list_lock); ++} ++ ++/* ++ * Called after vma is unlinked from its mm ++ */ ++void uksm_remove_vma(struct vm_area_struct *vma) ++{ ++ struct vma_slot *slot; ++ ++ if (!vma->uksm_vma_slot) ++ return; ++ ++ slot = vma->uksm_vma_slot; ++ spin_lock(&vma_slot_list_lock); ++ if (slot_in_uksm(slot)) { ++ /** ++ * This slot has been added by ksmd, so move to the del list ++ * waiting ksmd to free it. ++ */ ++ list_add_tail(&slot->slot_list, &vma_slot_del); ++ } else { ++ /** ++ * It's still on new list. It's ok to free slot directly. 
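vma_page_address() above recovers the user address of a page from its index in the mapping as vm_start + ((page->index - vm_pgoff) << PAGE_SHIFT) and rejects anything outside [vm_start, vm_end). A user-space sketch of the same arithmetic with made-up numbers; it returns 0 instead of -EFAULT for the out-of-range case:

#include <stdio.h>

#define PAGE_SHIFT 12                    /* assumed 4 KiB pages */

struct fake_vma {
	unsigned long vm_start, vm_end, vm_pgoff;
};

/* Same formula as vma_page_address(); 0 stands in for -EFAULT. */
static unsigned long page_address_in(const struct fake_vma *vma,
				     unsigned long page_index)
{
	unsigned long addr = vma->vm_start +
			     ((page_index - vma->vm_pgoff) << PAGE_SHIFT);

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return 0;
	return addr;
}

int main(void)
{
	/* 16-page mapping starting at 0x700000000000 with pgoff 0x100 */
	struct fake_vma vma = {
		.vm_start = 0x700000000000UL,
		.vm_end   = 0x700000000000UL + (16UL << PAGE_SHIFT),
		.vm_pgoff = 0x100,
	};

	printf("page index 0x103 -> %#lx\n", page_address_in(&vma, 0x103));
	printf("page index 0x200 -> %#lx (out of range)\n",
	       page_address_in(&vma, 0x200));
	return 0;
}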
++ */ ++ list_del(&slot->slot_list); ++ free_vma_slot(slot); ++ } ++ spin_unlock(&vma_slot_list_lock); ++ vma->uksm_vma_slot = NULL; ++} ++ ++/* 32/3 < they < 32/2 */ ++#define shiftl 8 ++#define shiftr 12 ++ ++#define HASH_FROM_TO(from, to) \ ++for (index = from; index < to; index++) { \ ++ pos = random_nums[index]; \ ++ hash += key[pos]; \ ++ hash += (hash << shiftl); \ ++ hash ^= (hash >> shiftr); \ ++} ++ ++ ++#define HASH_FROM_DOWN_TO(from, to) \ ++for (index = from - 1; index >= to; index--) { \ ++ hash ^= (hash >> shiftr); \ ++ hash ^= (hash >> (shiftr*2)); \ ++ hash -= (hash << shiftl); \ ++ hash += (hash << (shiftl*2)); \ ++ pos = random_nums[index]; \ ++ hash -= key[pos]; \ ++} ++ ++/* ++ * The main random sample hash function. ++ */ ++static u32 random_sample_hash(void *addr, u32 hash_strength) ++{ ++ u32 hash = 0xdeadbeef; ++ int index, pos, loop = hash_strength; ++ u32 *key = (u32 *)addr; ++ ++ if (loop > HASH_STRENGTH_FULL) ++ loop = HASH_STRENGTH_FULL; ++ ++ HASH_FROM_TO(0, loop); ++ ++ if (hash_strength > HASH_STRENGTH_FULL) { ++ loop = hash_strength - HASH_STRENGTH_FULL; ++ HASH_FROM_TO(0, loop); ++ } ++ ++ return hash; ++} ++ ++ ++/** ++ * It's used when hash strength is adjusted ++ * ++ * @addr The page's virtual address ++ * @from The original hash strength ++ * @to The hash strength changed to ++ * @hash The hash value generated with "from" hash value ++ * ++ * return the hash value ++ */ ++static u32 delta_hash(void *addr, int from, int to, u32 hash) ++{ ++ u32 *key = (u32 *)addr; ++ int index, pos; /* make sure they are int type */ ++ ++ if (to > from) { ++ if (from >= HASH_STRENGTH_FULL) { ++ from -= HASH_STRENGTH_FULL; ++ to -= HASH_STRENGTH_FULL; ++ HASH_FROM_TO(from, to); ++ } else if (to <= HASH_STRENGTH_FULL) { ++ HASH_FROM_TO(from, to); ++ } else { ++ HASH_FROM_TO(from, HASH_STRENGTH_FULL); ++ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); ++ } ++ } else { ++ if (from <= HASH_STRENGTH_FULL) { ++ HASH_FROM_DOWN_TO(from, to); ++ } else if (to >= HASH_STRENGTH_FULL) { ++ from -= HASH_STRENGTH_FULL; ++ to -= HASH_STRENGTH_FULL; ++ HASH_FROM_DOWN_TO(from, to); ++ } else { ++ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); ++ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); ++ } ++ } ++ ++ return hash; ++} ++ ++ ++ ++ ++#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) ++ ++/** ++ * ++ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round ++ * has finished. ++ * ++ * return 0 if no page has been scanned since last call, 1 otherwise. 
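random_sample_hash() above mixes only hash_strength randomly chosen 32-bit words of a page (positions come from random_nums[]), using the add/shift/xor step of HASH_FROM_TO, so a weak hash costs far less than reading the whole page. A user-space sketch of that sampling hash, assuming 4 KiB pages and omitting the loop-back extension beyond HASH_STRENGTH_FULL; sample_pos[] stands in for random_nums[] and uses the identity order only for the demo:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_WORDS 1024          /* 4 KiB page / sizeof(u32), assumed     */
#define SHIFTL 8
#define SHIFTR 12

/* Positions to sample; normally a random permutation of [0, PAGE_WORDS). */
static uint32_t sample_pos[PAGE_WORDS];

/* Same mixing step as HASH_FROM_TO in the patch, over `strength' samples. */
static uint32_t sample_hash(const uint32_t *page_words, unsigned strength)
{
	uint32_t hash = 0xdeadbeef;

	for (unsigned i = 0; i < strength && i < PAGE_WORDS; i++) {
		hash += page_words[sample_pos[i]];
		hash += hash << SHIFTL;
		hash ^= hash >> SHIFTR;
	}
	return hash;
}

int main(void)
{
	uint32_t page[PAGE_WORDS];

	for (unsigned i = 0; i < PAGE_WORDS; i++) {
		sample_pos[i] = i;                 /* identity for the demo */
		page[i] = (uint32_t)rand();
	}

	/* Weak-but-cheap hash at 1/16 strength vs. the full-page hash. */
	printf("strength   64: %08x\n", sample_hash(page, 64));
	printf("strength 1024: %08x\n", sample_hash(page, PAGE_WORDS));
	return 0;
}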
++ */ ++static inline int encode_benefit(void) ++{ ++ u64 scanned_delta, pos_delta, neg_delta; ++ unsigned long base = benefit.base; ++ ++ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; ++ ++ if (!scanned_delta) ++ return 0; ++ ++ scanned_delta >>= base; ++ pos_delta = rshash_pos >> base; ++ neg_delta = rshash_neg >> base; ++ ++ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || ++ CAN_OVERFLOW_U64(benefit.neg, neg_delta) || ++ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { ++ benefit.scanned >>= 1; ++ benefit.neg >>= 1; ++ benefit.pos >>= 1; ++ benefit.base++; ++ scanned_delta >>= 1; ++ pos_delta >>= 1; ++ neg_delta >>= 1; ++ } ++ ++ benefit.pos += pos_delta; ++ benefit.neg += neg_delta; ++ benefit.scanned += scanned_delta; ++ ++ BUG_ON(!benefit.scanned); ++ ++ rshash_pos = rshash_neg = 0; ++ uksm_pages_scanned_last = uksm_pages_scanned; ++ ++ return 1; ++} ++ ++static inline void reset_benefit(void) ++{ ++ benefit.pos = 0; ++ benefit.neg = 0; ++ benefit.base = 0; ++ benefit.scanned = 0; ++} ++ ++static inline void inc_rshash_pos(unsigned long delta) ++{ ++ if (CAN_OVERFLOW_U64(rshash_pos, delta)) ++ encode_benefit(); ++ ++ rshash_pos += delta; ++} ++ ++static inline void inc_rshash_neg(unsigned long delta) ++{ ++ if (CAN_OVERFLOW_U64(rshash_neg, delta)) ++ encode_benefit(); ++ ++ rshash_neg += delta; ++} ++ ++ ++static inline u32 page_hash(struct page *page, unsigned long hash_strength, ++ int cost_accounting) ++{ ++ u32 val; ++ unsigned long delta; ++ ++ void *addr = kmap_atomic(page, KM_USER0); ++ ++ val = random_sample_hash(addr, hash_strength); ++ kunmap_atomic(addr, KM_USER0); ++ ++ if (cost_accounting) { ++ if (HASH_STRENGTH_FULL > hash_strength) ++ delta = HASH_STRENGTH_FULL - hash_strength; ++ else ++ delta = 0; ++ ++ inc_rshash_pos(delta); ++ } ++ ++ return val; ++} ++ ++static int memcmp_pages(struct page *page1, struct page *page2, ++ int cost_accounting) ++{ ++ char *addr1, *addr2; ++ int ret; ++ ++ addr1 = kmap_atomic(page1, KM_USER0); ++ addr2 = kmap_atomic(page2, KM_USER1); ++ ret = memcmp(addr1, addr2, PAGE_SIZE); ++ kunmap_atomic(addr2, KM_USER1); ++ kunmap_atomic(addr1, KM_USER0); ++ ++ if (cost_accounting) ++ inc_rshash_neg(memcmp_cost); ++ ++ return ret; ++} ++ ++static inline int pages_identical(struct page *page1, struct page *page2) ++{ ++ return !memcmp_pages(page1, page2, 0); ++} ++ ++static inline int is_page_full_zero(struct page *page) ++{ ++ char *addr; ++ int ret; ++ ++ addr = kmap_atomic(page, KM_USER0); ++ ret = is_full_zero(addr, PAGE_SIZE); ++ kunmap_atomic(addr, KM_USER0); ++ ++ return ret; ++} ++ ++static int write_protect_page(struct vm_area_struct *vma, struct page *page, ++ pte_t *orig_pte, pte_t *old_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ unsigned long addr; ++ pte_t *ptep; ++ spinlock_t *ptl; ++ int swapped; ++ int err = -EFAULT; ++ ++ addr = page_address_in_vma(page, vma); ++ if (addr == -EFAULT) ++ goto out; ++ ++ BUG_ON(PageTransCompound(page)); ++ ptep = page_check_address(page, mm, addr, &ptl, 0); ++ if (!ptep) ++ goto out; ++ ++ if (old_pte) ++ *old_pte = *ptep; ++ ++ if (pte_write(*ptep) || pte_dirty(*ptep)) { ++ pte_t entry; ++ ++ swapped = PageSwapCache(page); ++ flush_cache_page(vma, addr, page_to_pfn(page)); ++ /* ++ * Ok this is tricky, when get_user_pages_fast() run it doesnt ++ * take any lock, therefore the check that we are going to make ++ * with the pagecount against the mapcount is racey and ++ * O_DIRECT can happen right after the check. 
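encode_benefit() just above keeps its pos/neg/scanned accumulators meaningful over a very long uptime by storing them scaled down by 2^base: each delta is shifted right by base before being added, and when an addition would overflow a u64 everything is halved and base is incremented. A user-space sketch of that scheme; benefit_add() is an illustrative name, not a function in the patch:

#include <stdint.h>
#include <stdio.h>

#define CAN_OVERFLOW_U64(x, delta) (UINT64_MAX - (x) < (delta))

struct benefit {
	uint64_t pos, neg, scanned;
	unsigned base;                 /* everything is scaled by 2^base */
};

/* Fold new deltas in, rescaling on impending overflow, mirroring
 * encode_benefit() in the patch. */
static void benefit_add(struct benefit *b, uint64_t pos, uint64_t neg,
			uint64_t scanned)
{
	pos >>= b->base;
	neg >>= b->base;
	scanned >>= b->base;

	if (CAN_OVERFLOW_U64(b->pos, pos) ||
	    CAN_OVERFLOW_U64(b->neg, neg) ||
	    CAN_OVERFLOW_U64(b->scanned, scanned)) {
		b->pos >>= 1;
		b->neg >>= 1;
		b->scanned >>= 1;
		b->base++;
		pos >>= 1;
		neg >>= 1;
		scanned >>= 1;
	}
	b->pos += pos;
	b->neg += neg;
	b->scanned += scanned;
}

int main(void)
{
	struct benefit b = { 0 };

	benefit_add(&b, UINT64_MAX / 2, 1000, 1 << 20);
	benefit_add(&b, UINT64_MAX / 2 + 7, 2000, 1 << 20); /* forces rescale */
	printf("pos=%llu neg=%llu scanned=%llu base=%u\n",
	       (unsigned long long)b.pos, (unsigned long long)b.neg,
	       (unsigned long long)b.scanned, b.base);
	return 0;
}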
++ * So we clear the pte and flush the tlb before the check ++ * this assure us that no O_DIRECT can happen after the check ++ * or in the middle of the check. ++ */ ++ entry = ptep_clear_flush(vma, addr, ptep); ++ /* ++ * Check that no O_DIRECT or similar I/O is in progress on the ++ * page ++ */ ++ if (page_mapcount(page) + 1 + swapped != page_count(page)) { ++ set_pte_at(mm, addr, ptep, entry); ++ goto out_unlock; ++ } ++ if (pte_dirty(entry)) ++ set_page_dirty(page); ++ entry = pte_mkclean(pte_wrprotect(entry)); ++ set_pte_at_notify(mm, addr, ptep, entry); ++ } ++ *orig_pte = *ptep; ++ err = 0; ++ ++out_unlock: ++ pte_unmap_unlock(ptep, ptl); ++out: ++ return err; ++} ++ ++#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */ ++#define MERGE_ERR_COLLI 2 /* there is a collision */ ++#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ ++#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ ++ ++ ++/** ++ * replace_page - replace page in vma by new ksm page ++ * @vma: vma that holds the pte pointing to page ++ * @page: the page we are replacing by kpage ++ * @kpage: the ksm page we replace page by ++ * @orig_pte: the original value of the pte ++ * ++ * Returns 0 on success, MERGE_ERR_PGERR on failure. ++ */ ++static int replace_page(struct vm_area_struct *vma, struct page *page, ++ struct page *kpage, pte_t orig_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *ptep; ++ spinlock_t *ptl; ++ pte_t entry; ++ ++ unsigned long addr; ++ int err = MERGE_ERR_PGERR; ++ ++ addr = page_address_in_vma(page, vma); ++ if (addr == -EFAULT) ++ goto out; ++ ++ pgd = pgd_offset(mm, addr); ++ if (!pgd_present(*pgd)) ++ goto out; ++ ++ pud = pud_offset(pgd, addr); ++ if (!pud_present(*pud)) ++ goto out; ++ ++ pmd = pmd_offset(pud, addr); ++ BUG_ON(pmd_trans_huge(*pmd)); ++ if (!pmd_present(*pmd)) ++ goto out; ++ ++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ if (!pte_same(*ptep, orig_pte)) { ++ pte_unmap_unlock(ptep, ptl); ++ goto out; ++ } ++ ++ flush_cache_page(vma, addr, pte_pfn(*ptep)); ++ ptep_clear_flush(vma, addr, ptep); ++ entry = mk_pte(kpage, vma->vm_page_prot); ++ ++ /* special treatment is needed for zero_page */ ++ if ((page_to_pfn(kpage) == uksm_zero_pfn) || ++ (page_to_pfn(kpage) == zero_pfn)) ++ entry = pte_mkspecial(entry); ++ else { ++ get_page(kpage); ++ page_add_anon_rmap(kpage, vma, addr); ++ } ++ ++ set_pte_at_notify(mm, addr, ptep, entry); ++ ++ page_remove_rmap(page); ++ if (!page_mapped(page)) ++ try_to_free_swap(page); ++ put_page(page); ++ ++ pte_unmap_unlock(ptep, ptl); ++ err = 0; ++out: ++ return err; ++} ++ ++ ++/** ++ * Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. The ++ * zero hash value at HASH_STRENGTH_MAX is used to indicated that its ++ * hash_max member has not been calculated. 
++ * ++ * @page The page needs to be hashed ++ * @hash_old The hash value calculated with current hash strength ++ * ++ * return the new hash value calculated at HASH_STRENGTH_MAX ++ */ ++static inline u32 page_hash_max(struct page *page, u32 hash_old) ++{ ++ u32 hash_max = 0; ++ void *addr; ++ ++ addr = kmap_atomic(page, KM_USER0); ++ hash_max = delta_hash(addr, hash_strength, ++ HASH_STRENGTH_MAX, hash_old); ++ ++ kunmap_atomic(addr, KM_USER0); ++ ++ if (!hash_max) ++ hash_max = 1; ++ ++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); ++ return hash_max; ++} ++ ++/* ++ * We compare the hash again, to ensure that it is really a hash collision ++ * instead of being caused by page write. ++ */ ++static inline int check_collision(struct rmap_item *rmap_item, ++ u32 hash) ++{ ++ int err; ++ struct page *page = rmap_item->page; ++ ++ /* if this rmap_item has already been hash_maxed, then the collision ++ * must appears in the second-level rbtree search. In this case we check ++ * if its hash_max value has been changed. Otherwise, the collision ++ * happens in the first-level rbtree search, so we check against it's ++ * current hash value. ++ */ ++ if (rmap_item->hash_max) { ++ inc_rshash_neg(memcmp_cost); ++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); ++ ++ if (rmap_item->hash_max == page_hash_max(page, hash)) ++ err = MERGE_ERR_COLLI; ++ else ++ err = MERGE_ERR_CHANGED; ++ } else { ++ inc_rshash_neg(memcmp_cost + hash_strength); ++ ++ if (page_hash(page, hash_strength, 0) == hash) ++ err = MERGE_ERR_COLLI; ++ else ++ err = MERGE_ERR_CHANGED; ++ } ++ ++ return err; ++} ++ ++static struct page *page_trans_compound_anon(struct page *page) ++{ ++ if (PageTransCompound(page)) { ++ struct page *head = compound_trans_head(page); ++ /* ++ * head may actually be splitted and freed from under ++ * us but it's ok here. ++ */ ++ if (PageAnon(head)) ++ return head; ++ } ++ return NULL; ++} ++ ++static int page_trans_compound_anon_split(struct page *page) ++{ ++ int ret = 0; ++ struct page *transhuge_head = page_trans_compound_anon(page); ++ if (transhuge_head) { ++ /* Get the reference on the head to split it. */ ++ if (get_page_unless_zero(transhuge_head)) { ++ /* ++ * Recheck we got the reference while the head ++ * was still anonymous. ++ */ ++ if (PageAnon(transhuge_head)) ++ ret = split_huge_page(transhuge_head); ++ else ++ /* ++ * Retry later if split_huge_page run ++ * from under us. ++ */ ++ ret = 1; ++ put_page(transhuge_head); ++ } else ++ /* Retry later if split_huge_page run from under us. */ ++ ret = 1; ++ } ++ return ret; ++} ++ ++/** ++ * Try to merge a rmap_item.page with a kpage in stable node. kpage must ++ * already be a ksm page. ++ * ++ * @return 0 if the pages were merged, -EFAULT otherwise. ++ */ ++static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, ++ struct page *kpage, u32 hash) ++{ ++ struct vm_area_struct *vma = rmap_item->slot->vma; ++ struct mm_struct *mm = vma->vm_mm; ++ pte_t orig_pte = __pte(0); ++ int err = MERGE_ERR_PGERR; ++ struct page *page; ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ page = rmap_item->page; ++ ++ if (page == kpage) { /* ksm page forked */ ++ err = 0; ++ goto out; ++ } ++ ++ if (PageTransCompound(page) && page_trans_compound_anon_split(page)) ++ goto out; ++ BUG_ON(PageTransCompound(page)); ++ ++ if (!PageAnon(page) || !PageKsm(kpage)) ++ goto out; ++ ++ /* ++ * We need the page lock to read a stable PageSwapCache in ++ * write_protect_page(). 
We use trylock_page() instead of ++ * lock_page() because we don't want to wait here - we ++ * prefer to continue scanning and merging different pages, ++ * then come back to this page when it is unlocked. ++ */ ++ if (!trylock_page(page)) ++ goto out; ++ /* ++ * If this anonymous page is mapped only here, its pte may need ++ * to be write-protected. If it's mapped elsewhere, all of its ++ * ptes are necessarily already write-protected. But in either ++ * case, we need to lock and check page_count is not raised. ++ */ ++ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { ++ if (pages_identical(page, kpage)) ++ err = replace_page(vma, page, kpage, orig_pte); ++ else ++ err = check_collision(rmap_item, hash); ++ } ++ ++ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { ++ munlock_vma_page(page); ++ if (!PageMlocked(kpage)) { ++ unlock_page(page); ++ lock_page(kpage); ++ mlock_vma_page(kpage); ++ page = kpage; /* for final unlock */ ++ } ++ } ++ ++ unlock_page(page); ++out: ++ return err; ++} ++ ++ ++ ++/** ++ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance ++ * to restore a page mapping that has been changed in try_to_merge_two_pages. ++ * ++ * @return 0 on success. ++ */ ++static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, ++ pte_t orig_pte, pte_t wprt_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *ptep; ++ spinlock_t *ptl; ++ ++ int err = -EFAULT; ++ ++ pgd = pgd_offset(mm, addr); ++ if (!pgd_present(*pgd)) ++ goto out; ++ ++ pud = pud_offset(pgd, addr); ++ if (!pud_present(*pud)) ++ goto out; ++ ++ pmd = pmd_offset(pud, addr); ++ if (!pmd_present(*pmd)) ++ goto out; ++ ++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ if (!pte_same(*ptep, wprt_pte)) { ++ /* already copied, let it be */ ++ pte_unmap_unlock(ptep, ptl); ++ goto out; ++ } ++ ++ /* ++ * Good boy, still here. When we still get the ksm page, it does not ++ * return to the free page pool, there is no way that a pte was changed ++ * to other page and gets back to this page. And remind that ksm page ++ * do not reuse in do_wp_page(). So it's safe to restore the original ++ * pte. ++ */ ++ flush_cache_page(vma, addr, pte_pfn(*ptep)); ++ ptep_clear_flush(vma, addr, ptep); ++ set_pte_at_notify(mm, addr, ptep, orig_pte); ++ ++ pte_unmap_unlock(ptep, ptl); ++ err = 0; ++out: ++ return err; ++} ++ ++/** ++ * try_to_merge_two_pages() - take two identical pages and prepare ++ * them to be merged into one page(rmap_item->page) ++ * ++ * @return 0 if we successfully merged two identical pages into ++ * one ksm page. MERGE_ERR_COLLI if it's only a hash collision ++ * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been ++ * changed since it's hashed. MERGE_ERR_PGERR otherwise. 
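When a memcmp in the merge paths above fails even though the hashes matched, check_collision() decides between two very different outcomes: if recomputing the hash at the strength that produced the match still equals the stored value, it is a genuine collision (MERGE_ERR_COLLI, or MERGE_ERR_COLLI_MAX at full strength); if not, the page simply changed under us (MERGE_ERR_CHANGED). A compact sketch of that classification step, with classify() as an illustrative helper:

#include <stdint.h>
#include <stdio.h>

enum merge_err {
	MERGE_OK = 0,
	MERGE_ERR_PGERR = 1,     /* page invalid, cannot continue        */
	MERGE_ERR_COLLI = 2,     /* true hash collision                  */
	MERGE_ERR_COLLI_MAX = 3, /* collision at maximum hash strength   */
	MERGE_ERR_CHANGED = 4,   /* page content changed since hashing   */
};

/* memcmp said "different" although the hashes matched: decide why,
 * following the core of check_collision() in the patch. */
static enum merge_err classify(uint32_t stored_hash, uint32_t recomputed_hash)
{
	return stored_hash == recomputed_hash ? MERGE_ERR_COLLI
					      : MERGE_ERR_CHANGED;
}

int main(void)
{
	printf("%d (expect %d, collision)\n", classify(0x1234, 0x1234),
	       MERGE_ERR_COLLI);
	printf("%d (expect %d, page changed)\n", classify(0x1234, 0xbeef),
	       MERGE_ERR_CHANGED);
	return 0;
}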
++ * ++ */ ++static int try_to_merge_two_pages(struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ u32 hash) ++{ ++ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); ++ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); ++ struct vm_area_struct *vma1 = rmap_item->slot->vma; ++ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; ++ struct page *page = rmap_item->page; ++ struct page *tree_page = tree_rmap_item->page; ++ int err = MERGE_ERR_PGERR; ++ struct address_space *saved_mapping; ++ ++ ++ if (rmap_item->page == tree_rmap_item->page) ++ goto out; ++ ++ if (PageTransCompound(page) && page_trans_compound_anon_split(page)) ++ goto out; ++ BUG_ON(PageTransCompound(page)); ++ ++ if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page)) ++ goto out; ++ BUG_ON(PageTransCompound(tree_page)); ++ ++ if (!PageAnon(page) || !PageAnon(tree_page)) ++ goto out; ++ ++ if (!trylock_page(page)) ++ goto out; ++ ++ ++ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { ++ unlock_page(page); ++ goto out; ++ } ++ ++ /* ++ * While we hold page lock, upgrade page from ++ * PageAnon+anon_vma to PageKsm+NULL stable_node: ++ * stable_tree_insert() will update stable_node. ++ */ ++ saved_mapping = page->mapping; ++ set_page_stable_node(page, NULL); ++ mark_page_accessed(page); ++ unlock_page(page); ++ ++ if (!trylock_page(tree_page)) ++ goto restore_out; ++ ++ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if (pages_identical(page, tree_page)) { ++ err = replace_page(vma2, tree_page, page, wprt_pte2); ++ if (err) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if ((vma2->vm_flags & VM_LOCKED)) { ++ munlock_vma_page(tree_page); ++ if (!PageMlocked(page)) { ++ unlock_page(tree_page); ++ lock_page(page); ++ mlock_vma_page(page); ++ tree_page = page; /* for final unlock */ ++ } ++ } ++ ++ unlock_page(tree_page); ++ ++ goto out; /* success */ ++ ++ } else { ++ if (tree_rmap_item->hash_max && ++ tree_rmap_item->hash_max == rmap_item->hash_max) { ++ err = MERGE_ERR_COLLI_MAX; ++ } else if (page_hash(page, hash_strength, 0) == ++ page_hash(tree_page, hash_strength, 0)) { ++ inc_rshash_neg(memcmp_cost + hash_strength * 2); ++ err = MERGE_ERR_COLLI; ++ } else { ++ err = MERGE_ERR_CHANGED; ++ } ++ ++ unlock_page(tree_page); ++ } ++ ++restore_out: ++ lock_page(page); ++ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), ++ orig_pte1, wprt_pte1)) ++ page->mapping = saved_mapping; ++ ++ unlock_page(page); ++out: ++ return err; ++} ++ ++static inline int hash_cmp(u32 new_val, u32 node_val) ++{ ++ if (new_val > node_val) ++ return 1; ++ else if (new_val < node_val) ++ return -1; ++ else ++ return 0; ++} ++ ++static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) ++{ ++ u32 hash_max = item->hash_max; ++ ++ if (!hash_max) { ++ hash_max = page_hash_max(item->page, hash); ++ ++ item->hash_max = hash_max; ++ } ++ ++ return hash_max; ++} ++ ++ ++ ++/** ++ * stable_tree_search() - search the stable tree for a page ++ * ++ * @item: the rmap_item we are comparing with ++ * @hash: the hash value of this item->page already calculated ++ * ++ * @return the page we have found, NULL otherwise. The page returned has ++ * been gotten. 
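stable_tree_search() below walks a two-level structure: a first rb-tree of tree_nodes keyed by the cheap partial-strength hash, and, only when one tree_node holds more than one stable page, a sub-tree keyed by the full-strength hash_max. A user-space sketch of that lookup shape, with linear scans standing in for the kernel rb-trees and all names and values purely illustrative:

#include <stdint.h>
#include <stdio.h>

struct stable_page {
	uint32_t hash_max;           /* full-strength hash                 */
	const char *name;
};

struct first_level {
	uint32_t hash;               /* partial-strength hash              */
	struct stable_page pages[4]; /* sub-"tree", linear for the sketch  */
	unsigned count;
};

static const struct stable_page *
lookup(const struct first_level *lvl, unsigned n, uint32_t hash,
       uint32_t hash_max)
{
	for (unsigned i = 0; i < n; i++) {
		if (lvl[i].hash != hash)
			continue;
		if (lvl[i].count == 1)          /* no collision: done      */
			return &lvl[i].pages[0];
		for (unsigned j = 0; j < lvl[i].count; j++)
			if (lvl[i].pages[j].hash_max == hash_max)
				return &lvl[i].pages[j];
	}
	return NULL;
}

int main(void)
{
	struct first_level tree[] = {
		{ 0x11, { { 0xaaaa, "A" } }, 1 },
		{ 0x22, { { 0xbbbb, "B" }, { 0xcccc, "C" } }, 2 },
	};
	const struct stable_page *hit = lookup(tree, 2, 0x22, 0xcccc);

	printf("found: %s\n", hit ? hit->name : "(none)");
	return 0;
}

The count == 1 shortcut is why the expensive hash_max is computed lazily: most first-level nodes never collide, so the second level is rarely needed.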
++ */ ++static struct page *stable_tree_search(struct rmap_item *item, u32 hash) ++{ ++ struct rb_node *node = root_stable_treep->rb_node; ++ struct tree_node *tree_node; ++ unsigned long hash_max; ++ struct page *page = item->page; ++ struct stable_node *stable_node; ++ ++ stable_node = page_stable_node(page); ++ if (stable_node) { ++ /* ksm page forked, that is ++ * if (PageKsm(page) && !in_stable_tree(rmap_item)) ++ * it's actually gotten once outside. ++ */ ++ get_page(page); ++ return page; ++ } ++ ++ while (node) { ++ int cmp; ++ ++ tree_node = rb_entry(node, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) ++ node = node->rb_left; ++ else if (cmp > 0) ++ node = node->rb_right; ++ else ++ break; ++ } ++ ++ if (!node) ++ return NULL; ++ ++ if (tree_node->count == 1) { ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ BUG_ON(!stable_node); ++ ++ goto get_page_out; ++ } ++ ++ /* ++ * ok, we have to search the second ++ * level subtree, hash the page to a ++ * full strength. ++ */ ++ node = tree_node->sub_root.rb_node; ++ BUG_ON(!node); ++ hash_max = rmap_item_hash_max(item, hash); ++ ++ while (node) { ++ int cmp; ++ ++ stable_node = rb_entry(node, struct stable_node, node); ++ ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ if (cmp < 0) ++ node = node->rb_left; ++ else if (cmp > 0) ++ node = node->rb_right; ++ else ++ goto get_page_out; ++ } ++ ++ return NULL; ++ ++get_page_out: ++ page = get_uksm_page(stable_node, 1, 1); ++ return page; ++} ++ ++static int try_merge_rmap_item(struct rmap_item *item, ++ struct page *kpage, ++ struct page *tree_page) ++{ ++ spinlock_t *ptl; ++ pte_t *ptep; ++ unsigned long addr; ++ struct vm_area_struct *vma = item->slot->vma; ++ ++ addr = get_rmap_addr(item); ++ ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0); ++ if (!ptep) ++ return 0; ++ ++ if (pte_write(*ptep)) { ++ /* has changed, abort! */ ++ pte_unmap_unlock(ptep, ptl); ++ return 0; ++ } ++ ++ get_page(tree_page); ++ page_add_anon_rmap(tree_page, vma, addr); ++ ++ flush_cache_page(vma, addr, pte_pfn(*ptep)); ++ ptep_clear_flush(vma, addr, ptep); ++ set_pte_at_notify(vma->vm_mm, addr, ptep, ++ mk_pte(tree_page, vma->vm_page_prot)); ++ ++ page_remove_rmap(kpage); ++ put_page(kpage); ++ ++ pte_unmap_unlock(ptep, ptl); ++ ++ return 1; ++} ++ ++/** ++ * try_to_merge_with_stable_page() - when two rmap_items need to be inserted ++ * into stable tree, the page was found to be identical to a stable ksm page, ++ * this is the last chance we can merge them into one. ++ * ++ * @item1: the rmap_item holding the page which we wanted to insert ++ * into stable tree. 
++ * @item2: the other rmap_item we found when unstable tree search ++ * @oldpage: the page currently mapped by the two rmap_items ++ * @tree_page: the page we found identical in stable tree node ++ * @success1: return if item1 is successfully merged ++ * @success2: return if item2 is successfully merged ++ */ ++static void try_merge_with_stable(struct rmap_item *item1, ++ struct rmap_item *item2, ++ struct page **kpage, ++ struct page *tree_page, ++ int *success1, int *success2) ++{ ++ struct vm_area_struct *vma1 = item1->slot->vma; ++ struct vm_area_struct *vma2 = item2->slot->vma; ++ *success1 = 0; ++ *success2 = 0; ++ ++ if (unlikely(*kpage == tree_page)) { ++ /* I don't think this can really happen */ ++ printk(KERN_WARNING "UKSM: unexpected condition detected in " ++ "try_merge_with_stable() -- *kpage == tree_page !\n"); ++ *success1 = 1; ++ *success2 = 1; ++ return; ++ } ++ ++ if (!PageAnon(*kpage) || !PageKsm(*kpage)) ++ goto failed; ++ ++ if (!trylock_page(tree_page)) ++ goto failed; ++ ++ /* If the oldpage is still ksm and still pointed ++ * to in the right place, and still write protected, ++ * we are confident it's not changed, no need to ++ * memcmp anymore. ++ * be ware, we cannot take nested pte locks, ++ * deadlock risk. ++ */ ++ if (!try_merge_rmap_item(item1, *kpage, tree_page)) ++ goto unlock_failed; ++ ++ /* ok, then vma2, remind that pte1 already set */ ++ if (!try_merge_rmap_item(item2, *kpage, tree_page)) ++ goto success_1; ++ ++ *success2 = 1; ++success_1: ++ *success1 = 1; ++ ++ ++ if ((*success1 && vma1->vm_flags & VM_LOCKED) || ++ (*success2 && vma2->vm_flags & VM_LOCKED)) { ++ munlock_vma_page(*kpage); ++ if (!PageMlocked(tree_page)) ++ mlock_vma_page(tree_page); ++ } ++ ++ /* ++ * We do not need oldpage any more in the caller, so can break the lock ++ * now. ++ */ ++ unlock_page(*kpage); ++ *kpage = tree_page; /* Get unlocked outside. 
*/ ++ return; ++ ++unlock_failed: ++ unlock_page(tree_page); ++failed: ++ return; ++} ++ ++static inline void stable_node_hash_max(struct stable_node *node, ++ struct page *page, u32 hash) ++{ ++ u32 hash_max = node->hash_max; ++ ++ if (!hash_max) { ++ hash_max = page_hash_max(page, hash); ++ node->hash_max = hash_max; ++ } ++} ++ ++static inline ++struct stable_node *new_stable_node(struct tree_node *tree_node, ++ struct page *kpage, u32 hash_max) ++{ ++ struct stable_node *new_stable_node; ++ ++ new_stable_node = alloc_stable_node(); ++ if (!new_stable_node) ++ return NULL; ++ ++ new_stable_node->kpfn = page_to_pfn(kpage); ++ new_stable_node->hash_max = hash_max; ++ new_stable_node->tree_node = tree_node; ++ set_page_stable_node(kpage, new_stable_node); ++ ++ return new_stable_node; ++} ++ ++static inline ++struct stable_node *first_level_insert(struct tree_node *tree_node, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ struct page **kpage, u32 hash, ++ int *success1, int *success2) ++{ ++ int cmp; ++ struct page *tree_page; ++ u32 hash_max = 0; ++ struct stable_node *stable_node, *new_snode; ++ struct rb_node *parent = NULL, **new; ++ ++ /* this tree node contains no sub-tree yet */ ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ cmp = memcmp_pages(*kpage, tree_page, 1); ++ if (!cmp) { ++ try_merge_with_stable(rmap_item, tree_rmap_item, kpage, ++ tree_page, success1, success2); ++ put_page(tree_page); ++ if (!*success1 && !*success2) ++ goto failed; ++ ++ return stable_node; ++ ++ } else { ++ /* ++ * collision in first level try to create a subtree. ++ * A new node need to be created. ++ */ ++ put_page(tree_page); ++ ++ stable_node_hash_max(stable_node, tree_page, ++ tree_node->hash); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ parent = &stable_node->node; ++ if (cmp < 0) { ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ new = &parent->rb_right; ++ } else { ++ goto failed; ++ } ++ } ++ ++ } else { ++ /* the only stable_node deleted, we reuse its tree_node. 
++ */ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++ new_snode = new_stable_node(tree_node, *kpage, hash_max); ++ if (!new_snode) ++ goto failed; ++ ++ rb_link_node(&new_snode->node, parent, new); ++ rb_insert_color(&new_snode->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ ++ return new_snode; ++ ++failed: ++ return NULL; ++} ++ ++static inline ++struct stable_node *stable_subtree_insert(struct tree_node *tree_node, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ struct page **kpage, u32 hash, ++ int *success1, int *success2) ++{ ++ struct page *tree_page; ++ u32 hash_max; ++ struct stable_node *stable_node, *new_snode; ++ struct rb_node *parent, **new; ++ ++research: ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ BUG_ON(!*new); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ while (*new) { ++ int cmp; ++ ++ stable_node = rb_entry(*new, struct stable_node, node); ++ ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else { ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ cmp = memcmp_pages(*kpage, tree_page, 1); ++ if (!cmp) { ++ try_merge_with_stable(rmap_item, ++ tree_rmap_item, kpage, ++ tree_page, success1, success2); ++ ++ put_page(tree_page); ++ if (!*success1 && !*success2) ++ goto failed; ++ /* ++ * successfully merged with a stable ++ * node ++ */ ++ return stable_node; ++ } else { ++ put_page(tree_page); ++ goto failed; ++ } ++ } else { ++ /* ++ * stable node may be deleted, ++ * and subtree maybe ++ * restructed, cannot ++ * continue, research it. ++ */ ++ if (tree_node->count) { ++ goto research; ++ } else { ++ /* reuse the tree node*/ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ } ++ } ++ } ++ ++ new_snode = new_stable_node(tree_node, *kpage, hash_max); ++ if (!new_snode) ++ goto failed; ++ ++ rb_link_node(&new_snode->node, parent, new); ++ rb_insert_color(&new_snode->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ ++ return new_snode; ++ ++failed: ++ return NULL; ++} ++ ++ ++/** ++ * stable_tree_insert() - try to insert a merged page in unstable tree to ++ * the stable tree ++ * ++ * @kpage: the page need to be inserted ++ * @hash: the current hash of this page ++ * @rmap_item: the rmap_item being scanned ++ * @tree_rmap_item: the rmap_item found on unstable tree ++ * @success1: return if rmap_item is merged ++ * @success2: return if tree_rmap_item is merged ++ * ++ * @return the stable_node on stable tree if at least one ++ * rmap_item is inserted into stable tree, NULL ++ * otherwise. 
++ */ ++static struct stable_node * ++stable_tree_insert(struct page **kpage, u32 hash, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ int *success1, int *success2) ++{ ++ struct rb_node **new = &root_stable_treep->rb_node; ++ struct rb_node *parent = NULL; ++ struct stable_node *stable_node; ++ struct tree_node *tree_node; ++ u32 hash_max = 0; ++ ++ *success1 = *success2 = 0; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ if (tree_node->count == 1) { ++ stable_node = first_level_insert(tree_node, rmap_item, ++ tree_rmap_item, kpage, ++ hash, success1, success2); ++ } else { ++ stable_node = stable_subtree_insert(tree_node, ++ rmap_item, tree_rmap_item, kpage, ++ hash, success1, success2); ++ } ++ } else { ++ ++ /* no tree node found */ ++ tree_node = alloc_tree_node(stable_tree_node_listp); ++ if (!tree_node) { ++ stable_node = NULL; ++ goto out; ++ } ++ ++ stable_node = new_stable_node(tree_node, *kpage, hash_max); ++ if (!stable_node) { ++ free_tree_node(tree_node); ++ goto out; ++ } ++ ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, root_stable_treep); ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ ++ rb_link_node(&stable_node->node, parent, new); ++ rb_insert_color(&stable_node->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ } ++ ++out: ++ return stable_node; ++} ++ ++ ++/** ++ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem ++ * ++ * @return 0 on success, -EBUSY if unable to lock the mmap_sem, ++ * -EINVAL if the page mapping has been changed. ++ */ ++static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) ++{ ++ int err; ++ ++ err = get_mergeable_page_lock_mmap(tree_rmap_item); ++ ++ if (err == -EINVAL) { ++ /* its page map has been changed, remove it */ ++ remove_rmap_item_from_tree(tree_rmap_item); ++ } ++ ++ /* The page is gotten and mmap_sem is locked now. */ ++ return err; ++} ++ ++ ++/** ++ * unstable_tree_search_insert() - search an unstable tree rmap_item with the ++ * same hash value. 
Get its page and trylock the mmap_sem ++ */ ++static inline ++struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, ++ u32 hash) ++ ++{ ++ struct rb_node **new = &root_unstable_tree.rb_node; ++ struct rb_node *parent = NULL; ++ struct tree_node *tree_node; ++ u32 hash_max; ++ struct rmap_item *tree_rmap_item; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ /* got the tree_node */ ++ if (tree_node->count == 1) { ++ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, ++ struct rmap_item, node); ++ BUG_ON(!tree_rmap_item); ++ ++ goto get_page_out; ++ } ++ ++ /* well, search the collision subtree */ ++ new = &tree_node->sub_root.rb_node; ++ BUG_ON(!*new); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ ++ while (*new) { ++ int cmp; ++ ++ tree_rmap_item = rb_entry(*new, struct rmap_item, ++ node); ++ ++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); ++ parent = *new; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto get_page_out; ++ } ++ } else { ++ /* alloc a new tree_node */ ++ tree_node = alloc_tree_node(&unstable_tree_node_list); ++ if (!tree_node) ++ return NULL; ++ ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, &root_unstable_tree); ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++ /* did not found even in sub-tree */ ++ rmap_item->tree_node = tree_node; ++ rmap_item->address |= UNSTABLE_FLAG; ++ rmap_item->hash_round = uksm_hash_round; ++ rb_link_node(&rmap_item->node, parent, new); ++ rb_insert_color(&rmap_item->node, &tree_node->sub_root); ++ ++ uksm_pages_unshared++; ++ return NULL; ++ ++get_page_out: ++ if (tree_rmap_item->page == rmap_item->page) ++ return NULL; ++ ++ if (get_tree_rmap_item_page(tree_rmap_item)) ++ return NULL; ++ ++ return tree_rmap_item; ++} ++ ++static void hold_anon_vma(struct rmap_item *rmap_item, ++ struct anon_vma *anon_vma) ++{ ++ rmap_item->anon_vma = anon_vma; ++ get_anon_vma(anon_vma); ++} ++ ++ ++/** ++ * stable_tree_append() - append a rmap_item to a stable node. Deduplication ++ * ratio statistics is done in this function. 
++ * ++ */ ++static void stable_tree_append(struct rmap_item *rmap_item, ++ struct stable_node *stable_node, int logdedup) ++{ ++ struct node_vma *node_vma = NULL, *new_node_vma; ++ struct hlist_node *hlist = NULL, *cont_p = NULL; ++ unsigned long key = (unsigned long)rmap_item->slot; ++ unsigned long factor = rmap_item->slot->rung->step; ++ ++ BUG_ON(!stable_node); ++ rmap_item->address |= STABLE_FLAG; ++ ++ if (hlist_empty(&stable_node->hlist)) { ++ uksm_pages_shared++; ++ goto node_vma_new; ++ } else { ++ uksm_pages_sharing++; ++ } ++ ++ hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) { ++ if (node_vma->key >= key) ++ break; ++ ++ if (logdedup) { ++ node_vma->slot->pages_bemerged += factor; ++ if (list_empty(&node_vma->slot->dedup_list)) ++ list_add(&node_vma->slot->dedup_list, ++ &vma_slot_dedup); ++ } ++ } ++ ++ if (node_vma) { ++ if (node_vma->key == key) { ++ cont_p = hlist->next; ++ goto node_vma_ok; ++ } else if (node_vma->key > key) { ++ cont_p = hlist; ++ } ++ } ++ ++node_vma_new: ++ /* no same vma already in node, alloc a new node_vma */ ++ new_node_vma = alloc_node_vma(); ++ BUG_ON(!new_node_vma); ++ new_node_vma->head = stable_node; ++ new_node_vma->slot = rmap_item->slot; ++ ++ if (!node_vma) { ++ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); ++ } else if (node_vma->key != key) { ++ if (node_vma->key < key) ++ hlist_add_after(&node_vma->hlist, &new_node_vma->hlist); ++ else { ++ hlist_add_before(&new_node_vma->hlist, ++ &node_vma->hlist); ++ } ++ ++ } ++ node_vma = new_node_vma; ++ ++node_vma_ok: /* ok, ready to add to the list */ ++ rmap_item->head = node_vma; ++ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); ++ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); ++ if (logdedup) { ++ rmap_item->slot->pages_merged++; ++ if (cont_p) { ++ hlist_for_each_entry_continue(node_vma, ++ cont_p, hlist) { ++ node_vma->slot->pages_bemerged += factor; ++ if (list_empty(&node_vma->slot->dedup_list)) ++ list_add(&node_vma->slot->dedup_list, ++ &vma_slot_dedup); ++ } ++ } ++ } ++} ++ ++/* ++ * We use break_ksm to break COW on a ksm page: it's a stripped down ++ * ++ * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) ++ * put_page(page); ++ * ++ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, ++ * in case the application has unmapped and remapped mm,addr meanwhile. ++ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP ++ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. ++ */ ++static int break_ksm(struct vm_area_struct *vma, unsigned long addr) ++{ ++ struct page *page; ++ int ret = 0; ++ ++ do { ++ cond_resched(); ++ page = follow_page(vma, addr, FOLL_GET); ++ if (IS_ERR_OR_NULL(page)) ++ break; ++ if (PageKsm(page)) { ++ ret = handle_mm_fault(vma->vm_mm, vma, addr, ++ FAULT_FLAG_WRITE); ++ } else ++ ret = VM_FAULT_WRITE; ++ put_page(page); ++ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); ++ /* ++ * We must loop because handle_mm_fault() may back out if there's ++ * any difficulty e.g. if pte accessed bit gets updated concurrently. ++ * ++ * VM_FAULT_WRITE is what we have been hoping for: it indicates that ++ * COW has been broken, even if the vma does not permit VM_WRITE; ++ * but note that a concurrent fault might break PageKsm for us. ++ * ++ * VM_FAULT_SIGBUS could occur if we race with truncation of the ++ * backing file, which also invalidates anonymous pages: that's ++ * okay, that truncation will have unmapped the PageKsm for us. 
++ * ++ * VM_FAULT_OOM: at the time of writing (late July 2009), setting ++ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the ++ * current task has TIF_MEMDIE set, and will be OOM killed on return ++ * to user; and ksmd, having no mm, would never be chosen for that. ++ * ++ * But if the mm is in a limited mem_cgroup, then the fault may fail ++ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and ++ * even ksmd can fail in this way - though it's usually breaking ksm ++ * just to undo a merge it made a moment before, so unlikely to oom. ++ * ++ * That's a pity: we might therefore have more kernel pages allocated ++ * than we're counting as nodes in the stable tree; but uksm_do_scan ++ * will retry to break_cow on each pass, so should recover the page ++ * in due course. The important thing is to not let VM_MERGEABLE ++ * be cleared while any such pages might remain in the area. ++ */ ++ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; ++} ++ ++static void break_cow(struct rmap_item *rmap_item) ++{ ++ struct vm_area_struct *vma = rmap_item->slot->vma; ++ struct mm_struct *mm = vma->vm_mm; ++ unsigned long addr = get_rmap_addr(rmap_item); ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ break_ksm(vma, addr); ++out: ++ return; ++} ++ ++/* ++ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather ++ * than check every pte of a given vma, the locking doesn't quite work for ++ * that - an rmap_item is assigned to the stable tree after inserting ksm ++ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing ++ * rmap_items from parent to child at fork time (so as not to waste time ++ * if exit comes before the next scan reaches it). ++ * ++ * Similarly, although we'd like to remove rmap_items (so updating counts ++ * and freeing memory) when unmerging an area, it's easier to leave that ++ * to the next pass of ksmd - consider, for example, how ksmd might be ++ * in cmp_and_merge_page on one of the rmap_items we would be removing. 
++ */ ++inline int unmerge_uksm_pages(struct vm_area_struct *vma, ++ unsigned long start, unsigned long end) ++{ ++ unsigned long addr; ++ int err = 0; ++ ++ for (addr = start; addr < end && !err; addr += PAGE_SIZE) { ++ if (uksm_test_exit(vma->vm_mm)) ++ break; ++ if (signal_pending(current)) ++ err = -ERESTARTSYS; ++ else ++ err = break_ksm(vma, addr); ++ } ++ return err; ++} ++ ++static inline void inc_uksm_pages_scanned(void) ++{ ++ u64 delta; ++ ++ ++ if (uksm_pages_scanned == U64_MAX) { ++ encode_benefit(); ++ ++ delta = uksm_pages_scanned >> pages_scanned_base; ++ ++ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { ++ pages_scanned_stored >>= 1; ++ delta >>= 1; ++ pages_scanned_base++; ++ } ++ ++ pages_scanned_stored += delta; ++ ++ uksm_pages_scanned = uksm_pages_scanned_last = 0; ++ } ++ ++ uksm_pages_scanned++; ++} ++ ++static inline int find_zero_page_hash(int strength, u32 hash) ++{ ++ return (zero_hash_table[strength] == hash); ++} ++ ++static ++int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) ++{ ++ struct page *zero_page = empty_uksm_zero_page; ++ struct mm_struct *mm = vma->vm_mm; ++ pte_t orig_pte = __pte(0); ++ int err = -EFAULT; ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ if (PageTransCompound(page) && page_trans_compound_anon_split(page)) ++ goto out; ++ BUG_ON(PageTransCompound(page)); ++ ++ if (!PageAnon(page)) ++ goto out; ++ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (write_protect_page(vma, page, &orig_pte, 0) == 0) { ++ if (is_page_full_zero(page)) ++ err = replace_page(vma, page, zero_page, orig_pte); ++ } ++ ++ unlock_page(page); ++out: ++ return err; ++} ++ ++/* ++ * cmp_and_merge_page() - first see if page can be merged into the stable ++ * tree; if not, compare hash to previous and if it's the same, see if page ++ * can be inserted into the unstable tree, or merged with a page already there ++ * and both transferred to the stable tree. ++ * ++ * @page: the page that we are searching identical page to. ++ * @rmap_item: the reverse mapping into the virtual address of this page ++ */ ++static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) ++{ ++ struct rmap_item *tree_rmap_item; ++ struct page *page; ++ struct page *kpage = NULL; ++ u32 hash_max; ++ int err; ++ unsigned int success1, success2; ++ struct stable_node *snode; ++ int cmp; ++ struct rb_node *parent = NULL, **new; ++ ++ remove_rmap_item_from_tree(rmap_item); ++ page = rmap_item->page; ++ ++ /* We first start with searching the page inside the stable tree */ ++ kpage = stable_tree_search(rmap_item, hash); ++ if (kpage) { ++ err = try_to_merge_with_uksm_page(rmap_item, kpage, ++ hash); ++ if (!err) { ++ /* ++ * The page was successfully merged, add ++ * its rmap_item to the stable tree. ++ * page lock is needed because it's ++ * racing with try_to_unmap_ksm(), etc. ++ */ ++ lock_page(kpage); ++ snode = page_stable_node(kpage); ++ stable_tree_append(rmap_item, snode, 1); ++ unlock_page(kpage); ++ put_page(kpage); ++ return; /* success */ ++ } ++ put_page(kpage); ++ ++ /* ++ * if it's a collision and it has been search in sub-rbtree ++ * (hash_max != 0), we want to abort, because if it is ++ * successfully merged in unstable tree, the collision trends to ++ * happen again. 
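find_zero_page_hash() earlier in this hunk spots candidate zero pages by comparing a page's hash at the current strength with a precomputed table of the empty page's hash at every strength (zero_hash_table). A user-space sketch of how such a table can be built with the same mixing step; the real table covers strengths up to HASH_STRENGTH_MAX, simplified here to one full page:

#include <stdint.h>
#include <stdio.h>

#define PAGE_WORDS 1024          /* assumed 4 KiB page / sizeof(u32)      */
#define SHIFTL 8
#define SHIFTR 12

/* Same mixing step as the patch's sampling hash; the sampled word of an
 * all-zero page is always 0, so the sampling order does not matter here. */
static uint32_t zero_page_hash(unsigned strength)
{
	uint32_t hash = 0xdeadbeef;

	for (unsigned i = 0; i < strength; i++) {
		hash += 0;                     /* key[pos] of the zero page */
		hash += hash << SHIFTL;
		hash ^= hash >> SHIFTR;
	}
	return hash;
}

int main(void)
{
	static uint32_t zero_hash_table[PAGE_WORDS + 1];

	/* One entry per possible hash strength, as uksm precomputes. */
	for (unsigned s = 0; s <= PAGE_WORDS; s++)
		zero_hash_table[s] = zero_page_hash(s);

	unsigned strength = 64;
	uint32_t some_page_hash = zero_page_hash(strength);  /* pretend     */

	if (zero_hash_table[strength] == some_page_hash)
		printf("page is (probably) all zero at strength %u\n",
		       strength);
	return 0;
}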
++ */ ++ if (err == MERGE_ERR_COLLI && rmap_item->hash_max) ++ return; ++ } ++ ++ tree_rmap_item = ++ unstable_tree_search_insert(rmap_item, hash); ++ if (tree_rmap_item) { ++ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); ++ /* ++ * As soon as we merge this page, we want to remove the ++ * rmap_item of the page we have merged with from the unstable ++ * tree, and insert it instead as new node in the stable tree. ++ */ ++ if (!err) { ++ kpage = page; ++ remove_rmap_item_from_tree(tree_rmap_item); ++ lock_page(kpage); ++ snode = stable_tree_insert(&kpage, hash, ++ rmap_item, tree_rmap_item, ++ &success1, &success2); ++ ++ /* ++ * Do not log dedup for tree item, it's not counted as ++ * scanned in this round. ++ */ ++ if (success2) ++ stable_tree_append(tree_rmap_item, snode, 0); ++ ++ /* ++ * The order of these two stable append is important: ++ * we are scanning rmap_item. ++ */ ++ if (success1) ++ stable_tree_append(rmap_item, snode, 1); ++ ++ /* ++ * The original kpage may be unlocked inside ++ * stable_tree_insert() already. This page ++ * should be unlocked before doing ++ * break_cow(). ++ */ ++ unlock_page(kpage); ++ ++ if (!success1) ++ break_cow(rmap_item); ++ ++ if (!success2) ++ break_cow(tree_rmap_item); ++ ++ } else if (err == MERGE_ERR_COLLI) { ++ BUG_ON(tree_rmap_item->tree_node->count > 1); ++ ++ rmap_item_hash_max(tree_rmap_item, ++ tree_rmap_item->tree_node->hash); ++ ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); ++ parent = &tree_rmap_item->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto put_up_out; ++ ++ rmap_item->tree_node = tree_rmap_item->tree_node; ++ rmap_item->address |= UNSTABLE_FLAG; ++ rmap_item->hash_round = uksm_hash_round; ++ rb_link_node(&rmap_item->node, parent, new); ++ rb_insert_color(&rmap_item->node, ++ &tree_rmap_item->tree_node->sub_root); ++ rmap_item->tree_node->count++; ++ } else { ++ /* ++ * either one of the page has changed or they collide ++ * at the max hash, we consider them as ill items. 
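get_pool_index() and index_page_offset() above address the per-VMA rmap list, which is stored in separately allocated pool pages rather than one large array: an entry index is split into the pool page that holds it and the byte offset inside that page. A user-space sketch of that arithmetic, assuming pointer-sized entries as the stride in get_pool_index() suggests:

#include <stdio.h>

#define PAGE_SIZE  4096UL        /* assumed                               */
#define PAGE_SHIFT 12

/* Stand-in for the patch's entry, which holds either an address or an
 * item pointer and is therefore pointer sized. */
union rmap_list_entry {
	unsigned long addr;
	void *item;
};

/* Which pool page holds entry `index', and where inside that page. */
static unsigned long pool_index(unsigned long index)
{
	return (sizeof(union rmap_list_entry) * index) >> PAGE_SHIFT;
}

static unsigned long pool_offset(unsigned long index)
{
	return (sizeof(union rmap_list_entry) * index) & (PAGE_SIZE - 1);
}

int main(void)
{
	unsigned long per_page = PAGE_SIZE / sizeof(union rmap_list_entry);

	printf("%lu entries per pool page\n", per_page);
	printf("entry 1000 -> pool page %lu, offset %lu bytes\n",
	       pool_index(1000), pool_offset(1000));
	return 0;
}

Keeping the list in order-0 pool pages lets empty pools be freed individually (see try_free_last_pool() above) instead of holding one large allocation for the whole VMA.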
++ */ ++ remove_rmap_item_from_tree(tree_rmap_item); ++ } ++put_up_out: ++ put_page(tree_rmap_item->page); ++ up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); ++ } ++} ++ ++ ++ ++ ++static inline unsigned long get_pool_index(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; ++ if (pool_index >= slot->pool_size) ++ BUG(); ++ return pool_index; ++} ++ ++static inline unsigned long index_page_offset(unsigned long index) ++{ ++ return offset_in_page(sizeof(struct rmap_list_entry *) * index); ++} ++ ++static inline ++struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, ++ unsigned long index, int need_alloc) ++{ ++ unsigned long pool_index; ++ struct page *page; ++ void *addr; ++ ++ ++ pool_index = get_pool_index(slot, index); ++ if (!slot->rmap_list_pool[pool_index]) { ++ if (!need_alloc) ++ return NULL; ++ ++ page = alloc_page(GFP_KERNEL | __GFP_ZERO); ++ if (!page) ++ return NULL; ++ ++ slot->rmap_list_pool[pool_index] = page; ++ } ++ ++ addr = kmap(slot->rmap_list_pool[pool_index]); ++ addr += index_page_offset(index); ++ ++ return addr; ++} ++ ++static inline void put_rmap_list_entry(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ kunmap(slot->rmap_list_pool[pool_index]); ++} ++ ++static inline int entry_is_new(struct rmap_list_entry *entry) ++{ ++ return !entry->item; ++} ++ ++static inline unsigned long get_index_orig_addr(struct vma_slot *slot, ++ unsigned long index) ++{ ++ return slot->vma->vm_start + (index << PAGE_SHIFT); ++} ++ ++static inline unsigned long get_entry_address(struct rmap_list_entry *entry) ++{ ++ unsigned long addr; ++ ++ if (is_addr(entry->addr)) ++ addr = get_clean_addr(entry->addr); ++ else if (entry->item) ++ addr = get_rmap_addr(entry->item); ++ else ++ BUG(); ++ ++ return addr; ++} ++ ++static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) ++{ ++ if (is_addr(entry->addr)) ++ return NULL; ++ ++ return entry->item; ++} ++ ++static inline void inc_rmap_list_pool_count(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ slot->pool_counts[pool_index]++; ++} ++ ++static inline void dec_rmap_list_pool_count(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ BUG_ON(!slot->pool_counts[pool_index]); ++ slot->pool_counts[pool_index]--; ++} ++ ++static inline int entry_has_rmap(struct rmap_list_entry *entry) ++{ ++ return !is_addr(entry->addr) && entry->item; ++} ++ ++static inline void swap_entries(struct rmap_list_entry *entry1, ++ unsigned long index1, ++ struct rmap_list_entry *entry2, ++ unsigned long index2) ++{ ++ struct rmap_list_entry tmp; ++ ++ /* swapping two new entries is meaningless */ ++ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); ++ ++ tmp = *entry1; ++ *entry1 = *entry2; ++ *entry2 = tmp; ++ ++ if (entry_has_rmap(entry1)) ++ entry1->item->entry_index = index1; ++ ++ if (entry_has_rmap(entry2)) ++ entry2->item->entry_index = index2; ++ ++ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { ++ inc_rmap_list_pool_count(entry1->item->slot, index1); ++ dec_rmap_list_pool_count(entry1->item->slot, index2); ++ } else if 
(!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { ++ inc_rmap_list_pool_count(entry2->item->slot, index2); ++ dec_rmap_list_pool_count(entry2->item->slot, index1); ++ } ++} ++ ++static inline void free_entry_item(struct rmap_list_entry *entry) ++{ ++ unsigned long index; ++ struct rmap_item *item; ++ ++ if (!is_addr(entry->addr)) { ++ BUG_ON(!entry->item); ++ item = entry->item; ++ entry->addr = get_rmap_addr(item); ++ set_is_addr(entry->addr); ++ index = item->entry_index; ++ remove_rmap_item_from_tree(item); ++ dec_rmap_list_pool_count(item->slot, index); ++ free_rmap_item(item); ++ } ++} ++ ++static inline int pool_entry_boundary(unsigned long index) ++{ ++ unsigned long linear_addr; ++ ++ linear_addr = sizeof(struct rmap_list_entry *) * index; ++ return index && !offset_in_page(linear_addr); ++} ++ ++static inline void try_free_last_pool(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ if (slot->rmap_list_pool[pool_index] && ++ !slot->pool_counts[pool_index]) { ++ __free_page(slot->rmap_list_pool[pool_index]); ++ slot->rmap_list_pool[pool_index] = NULL; ++ slot->flags |= UKSM_SLOT_NEED_SORT; ++ } ++ ++} ++ ++static inline unsigned long vma_item_index(struct vm_area_struct *vma, ++ struct rmap_item *item) ++{ ++ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; ++} ++ ++static int within_same_pool(struct vma_slot *slot, ++ unsigned long i, unsigned long j) ++{ ++ unsigned long pool_i, pool_j; ++ ++ pool_i = get_pool_index(slot, i); ++ pool_j = get_pool_index(slot, j); ++ ++ return (pool_i == pool_j); ++} ++ ++static void sort_rmap_entry_list(struct vma_slot *slot) ++{ ++ unsigned long i, j; ++ struct rmap_list_entry *entry, *swap_entry; ++ ++ entry = get_rmap_list_entry(slot, 0, 0); ++ for (i = 0; i < slot->pages; ) { ++ ++ if (!entry) ++ goto skip_whole_pool; ++ ++ if (entry_is_new(entry)) ++ goto next_entry; ++ ++ if (is_addr(entry->addr)) { ++ entry->addr = 0; ++ goto next_entry; ++ } ++ ++ j = vma_item_index(slot->vma, entry->item); ++ if (j == i) ++ goto next_entry; ++ ++ if (within_same_pool(slot, i, j)) ++ swap_entry = entry + j - i; ++ else ++ swap_entry = get_rmap_list_entry(slot, j, 1); ++ ++ swap_entries(entry, i, swap_entry, j); ++ if (!within_same_pool(slot, i, j)) ++ put_rmap_list_entry(slot, j); ++ continue; ++ ++skip_whole_pool: ++ i += PAGE_SIZE / sizeof(*entry); ++ if (i < slot->pages) ++ entry = get_rmap_list_entry(slot, i, 0); ++ continue; ++ ++next_entry: ++ if (i >= slot->pages - 1 || ++ !within_same_pool(slot, i, i + 1)) { ++ put_rmap_list_entry(slot, i); ++ if (i + 1 < slot->pages) ++ entry = get_rmap_list_entry(slot, i + 1, 0); ++ } else ++ entry++; ++ i++; ++ continue; ++ } ++ ++ /* free empty pool entries which contain no rmap_item */ ++ /* CAN be simplied to based on only pool_counts when bug freed !!!!! 
*/ ++ for (i = 0; i < slot->pool_size; i++) { ++ unsigned char has_rmap; ++ void *addr; ++ ++ if (!slot->rmap_list_pool[i]) ++ continue; ++ ++ has_rmap = 0; ++ addr = kmap(slot->rmap_list_pool[i]); ++ BUG_ON(!addr); ++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { ++ entry = (struct rmap_list_entry *)addr + j; ++ if (is_addr(entry->addr)) ++ continue; ++ if (!entry->item) ++ continue; ++ has_rmap = 1; ++ } ++ kunmap(slot->rmap_list_pool[i]); ++ if (!has_rmap) { ++ BUG_ON(slot->pool_counts[i]); ++ __free_page(slot->rmap_list_pool[i]); ++ slot->rmap_list_pool[i] = NULL; ++ } ++ } ++ ++ slot->flags &= ~UKSM_SLOT_NEED_SORT; ++} ++ ++/* ++ * vma_fully_scanned() - if all the pages in this slot have been scanned. ++ */ ++static inline int vma_fully_scanned(struct vma_slot *slot) ++{ ++ return slot->pages_scanned == slot->pages; ++} ++ ++/** ++ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to ++ * its random permutation. This function is embedded with the random ++ * permutation index management code. ++ */ ++static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) ++{ ++ unsigned long rand_range, addr, swap_index, scan_index; ++ struct rmap_item *item = NULL; ++ struct rmap_list_entry *scan_entry, *swap_entry = NULL; ++ struct page *page; ++ ++ scan_index = swap_index = slot->pages_scanned % slot->pages; ++ ++ if (pool_entry_boundary(scan_index)) ++ try_free_last_pool(slot, scan_index - 1); ++ ++ if (vma_fully_scanned(slot)) { ++ if (slot->flags & UKSM_SLOT_NEED_SORT) ++ slot->flags |= UKSM_SLOT_NEED_RERAND; ++ else ++ slot->flags &= ~UKSM_SLOT_NEED_RERAND; ++ if (slot->flags & UKSM_SLOT_NEED_SORT) ++ sort_rmap_entry_list(slot); ++ } ++ ++ scan_entry = get_rmap_list_entry(slot, scan_index, 1); ++ if (!scan_entry) ++ return NULL; ++ ++ if (entry_is_new(scan_entry)) { ++ scan_entry->addr = get_index_orig_addr(slot, scan_index); ++ set_is_addr(scan_entry->addr); ++ } ++ ++ if (slot->flags & UKSM_SLOT_NEED_RERAND) { ++ rand_range = slot->pages - scan_index; ++ BUG_ON(!rand_range); ++ swap_index = scan_index + (random32() % rand_range); ++ } ++ ++ if (swap_index != scan_index) { ++ swap_entry = get_rmap_list_entry(slot, swap_index, 1); ++ if (entry_is_new(swap_entry)) { ++ swap_entry->addr = get_index_orig_addr(slot, ++ swap_index); ++ set_is_addr(swap_entry->addr); ++ } ++ swap_entries(scan_entry, scan_index, swap_entry, swap_index); ++ } ++ ++ addr = get_entry_address(scan_entry); ++ item = get_entry_item(scan_entry); ++ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); ++ ++ page = follow_page(slot->vma, addr, FOLL_GET); ++ if (IS_ERR_OR_NULL(page)) ++ goto nopage; ++ ++ if (!PageAnon(page) && !page_trans_compound_anon(page)) ++ goto putpage; ++ ++ /*check is zero_page pfn or uksm_zero_page*/ ++ if ((page_to_pfn(page) == zero_pfn) ++ || (page_to_pfn(page) == uksm_zero_pfn)) ++ goto putpage; ++ ++ flush_anon_page(slot->vma, page, addr); ++ flush_dcache_page(page); ++ ++ ++ *hash = page_hash(page, hash_strength, 1); ++ inc_uksm_pages_scanned(); ++ /*if the page content all zero, re-map to zero-page*/ ++ if (find_zero_page_hash(hash_strength, *hash)) { ++ if (!cmp_and_merge_zero_page(slot->vma, page)) { ++ slot->pages_merged++; ++ __inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); ++ dec_mm_counter(slot->mm, MM_ANONPAGES); ++ ++ /* For full-zero pages, no need to create rmap item */ ++ goto putpage; ++ } else { ++ inc_rshash_neg(memcmp_cost / 2); ++ } ++ } ++ ++ if (!item) { ++ item = alloc_rmap_item(); ++ if (item) { ++ /* It has already 
been zeroed */ ++ item->slot = slot; ++ item->address = addr; ++ item->entry_index = scan_index; ++ scan_entry->item = item; ++ inc_rmap_list_pool_count(slot, scan_index); ++ } else ++ goto putpage; ++ } ++ ++ BUG_ON(item->slot != slot); ++ /* the page may have changed */ ++ item->page = page; ++ put_rmap_list_entry(slot, scan_index); ++ if (swap_entry) ++ put_rmap_list_entry(slot, swap_index); ++ return item; ++ ++putpage: ++ put_page(page); ++ page = NULL; ++nopage: ++ /* no page, store addr back and free rmap_item if possible */ ++ free_entry_item(scan_entry); ++ put_rmap_list_entry(slot, scan_index); ++ if (swap_entry) ++ put_rmap_list_entry(slot, swap_index); ++ return NULL; ++} ++ ++static inline int in_stable_tree(struct rmap_item *rmap_item) ++{ ++ return rmap_item->address & STABLE_FLAG; ++} ++ ++/** ++ * scan_vma_one_page() - scan the next page in a vma_slot. Called with ++ * mmap_sem locked. ++ */ ++static noinline void scan_vma_one_page(struct vma_slot *slot) ++{ ++ u32 hash; ++ struct mm_struct *mm; ++ struct rmap_item *rmap_item = NULL; ++ struct vm_area_struct *vma = slot->vma; ++ ++ mm = vma->vm_mm; ++ BUG_ON(!mm); ++ BUG_ON(!slot); ++ ++ rmap_item = get_next_rmap_item(slot, &hash); ++ if (!rmap_item) ++ goto out1; ++ ++ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) ++ goto out2; ++ ++ cmp_and_merge_page(rmap_item, hash); ++out2: ++ put_page(rmap_item->page); ++out1: ++ slot->pages_scanned++; ++ if (slot->fully_scanned_round != fully_scanned_round) ++ scanned_virtual_pages++; ++ ++ if (vma_fully_scanned(slot)) ++ slot->fully_scanned_round = fully_scanned_round; ++} ++ ++static inline unsigned long rung_get_pages(struct scan_rung *rung) ++{ ++ struct slot_tree_node *node; ++ ++ if (!rung->vma_root.rnode) ++ return 0; ++ ++ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); ++ ++ return node->size; ++} ++ ++#define RUNG_SAMPLED_MIN 3 ++ ++static inline ++void uksm_calc_rung_step(struct scan_rung *rung, ++ unsigned long page_time, unsigned long ratio) ++{ ++ unsigned long sampled, pages; ++ ++ /* will be fully scanned ? */ ++ if (!rung->cover_msecs) { ++ rung->step = 1; ++ return; ++ } ++ ++ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) ++ * ratio / page_time; ++ ++ /* ++ * Before we finsish a scan round and expensive per-round jobs, ++ * we need to have a chance to estimate the per page time. So ++ * the sampled number can not be too small. 
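A standalone sketch of the sampling-step arithmetic used here, in plain userspace C; the SK_* names and constant values are illustrative stand-ins for NSEC_PER_MSEC, TIME_RATIO_SCALE and RUNG_SAMPLED_MIN rather than the patch's actual definitions:

#include <stdio.h>

#define SK_NSEC_PER_MSEC   1000000UL
#define SK_TIME_RATIO_SCALE 1000UL   /* assumed scale of cpu_ratio */
#define SK_RUNG_SAMPLED_MIN 3UL

static unsigned long sk_calc_step(unsigned long pages,
                                  unsigned long cover_msecs,
                                  unsigned long ratio,      /* in 1/SK_TIME_RATIO_SCALE */
                                  unsigned long page_time)  /* ns per scanned page */
{
        /* pages this rung can afford to sample during one cover period */
        unsigned long sampled = cover_msecs *
                        (SK_NSEC_PER_MSEC / SK_TIME_RATIO_SCALE) *
                        ratio / page_time;

        /* keep enough samples to keep the per-page time estimate meaningful */
        if (sampled < SK_RUNG_SAMPLED_MIN)
                sampled = SK_RUNG_SAMPLED_MIN;

        /* visit every step-th page so the samples spread over the whole rung */
        return pages > sampled ? pages / sampled : 1;
}

int main(void)
{
        /* 1,000,000 pages, 2000 ms cover time, 5% cpu, 1000 ns per page -> step 10 */
        printf("step = %lu\n", sk_calc_step(1000000, 2000, 50, 1000));
        return 0;
}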
++ */ ++ if (sampled < RUNG_SAMPLED_MIN) ++ sampled = RUNG_SAMPLED_MIN; ++ ++ pages = rung_get_pages(rung); ++ if (likely(pages > sampled)) ++ rung->step = pages / sampled; ++ else ++ rung->step = 1; ++} ++ ++static inline int step_need_recalc(struct scan_rung *rung) ++{ ++ unsigned long pages, stepmax; ++ ++ pages = rung_get_pages(rung); ++ stepmax = pages / RUNG_SAMPLED_MIN; ++ ++ return pages && (rung->step > pages || ++ (stepmax && rung->step > stepmax)); ++} ++ ++static inline ++void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) ++{ ++ struct vma_slot *slot; ++ ++ if (finished) ++ rung->flags |= UKSM_RUNG_ROUND_FINISHED; ++ ++ if (step_recalc || step_need_recalc(rung)) { ++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); ++ BUG_ON(step_need_recalc(rung)); ++ } ++ ++ slot_iter_index = random32() % rung->step; ++ BUG_ON(!rung->vma_root.rnode); ++ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); ++ BUG_ON(!slot); ++ ++ rung->current_scan = slot; ++ rung->current_offset = slot_iter_index; ++} ++ ++static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) ++{ ++ return &slot->rung->vma_root; ++} ++ ++/* ++ * return if resetted. ++ */ ++static int advance_current_scan(struct scan_rung *rung) ++{ ++ unsigned short n; ++ struct vma_slot *slot, *next = NULL; ++ ++ BUG_ON(!rung->vma_root.num); ++ ++ slot = rung->current_scan; ++ n = (slot->pages - rung->current_offset) % rung->step; ++ slot_iter_index = rung->step - n; ++ next = sradix_tree_next(&rung->vma_root, slot->snode, ++ slot->sindex, slot_iter); ++ ++ if (next) { ++ rung->current_offset = slot_iter_index; ++ rung->current_scan = next; ++ return 0; ++ } else { ++ reset_current_scan(rung, 1, 0); ++ return 1; ++ } ++} ++ ++static inline void rung_rm_slot(struct vma_slot *slot) ++{ ++ struct scan_rung *rung = slot->rung; ++ struct sradix_tree_root *root; ++ ++ if (rung->current_scan == slot) ++ advance_current_scan(rung); ++ ++ root = slot_get_root(slot); ++ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); ++ slot->snode = NULL; ++ if (step_need_recalc(rung)) { ++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); ++ BUG_ON(step_need_recalc(rung)); ++ } ++ ++ /* In case advance_current_scan loop back to this slot again */ ++ if (rung->vma_root.num && rung->current_scan == slot) ++ reset_current_scan(slot->rung, 1, 0); ++} ++ ++static inline void rung_add_new_slots(struct scan_rung *rung, ++ struct vma_slot **slots, unsigned long num) ++{ ++ int err; ++ struct vma_slot *slot; ++ unsigned long i; ++ struct sradix_tree_root *root = &rung->vma_root; ++ ++ err = sradix_tree_enter(root, (void **)slots, num); ++ BUG_ON(err); ++ ++ for (i = 0; i < num; i++) { ++ slot = slots[i]; ++ slot->rung = rung; ++ BUG_ON(vma_fully_scanned(slot)); ++ } ++ ++ if (rung->vma_root.num == num) ++ reset_current_scan(rung, 0, 1); ++} ++ ++static inline int rung_add_one_slot(struct scan_rung *rung, ++ struct vma_slot *slot) ++{ ++ int err; ++ ++ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); ++ if (err) ++ return err; ++ ++ slot->rung = rung; ++ if (rung->vma_root.num == 1) ++ reset_current_scan(rung, 0, 1); ++ ++ return 0; ++} ++ ++/* ++ * Return true if the slot is deleted from its rung. 
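The rung machinery that follows moves vma slots up and down a ladder of scan intensities. A minimal sketch of that promote/demote policy, assuming a 4-rung ladder and treating the dedup figure and abundant threshold as opaque numbers (their exact semantics follow cal_dedup_ratio() below):

#include <stdio.h>

#define SK_LADDER_SIZE 4   /* stands in for SCAN_LADDER_SIZE; 4 is illustrative */

/*
 * One evaluation round for a slot: profitable slots climb to a rung that
 * gets more cpu, everything else drifts down one rung (cf. judge_slot()
 * and round_update_ladder() later in this file).
 */
static int sk_judge_rung(int rung, unsigned long dedup,
                         unsigned long abundant_threshold)
{
        if (dedup && dedup >= abundant_threshold && rung < SK_LADDER_SIZE - 1)
                return rung + 1;   /* abundant dedup: scan this area harder */
        if (rung > 0)
                return rung - 1;   /* otherwise spend less cpu on it */
        return rung;
}

int main(void)
{
        int rung = 0;

        rung = sk_judge_rung(rung, 40, 10);  /* abundant: promoted to rung 1 */
        rung = sk_judge_rung(rung, 0, 10);   /* nothing merged: back to rung 0 */
        printf("final rung = %d\n", rung);
        return 0;
}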
++ */ ++static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) ++{ ++ struct scan_rung *old_rung = slot->rung; ++ int err; ++ ++ if (old_rung == rung) ++ return 0; ++ ++ rung_rm_slot(slot); ++ err = rung_add_one_slot(rung, slot); ++ if (err) { ++ err = rung_add_one_slot(old_rung, slot); ++ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ ++ } ++ ++ return 1; ++} ++ ++static inline int vma_rung_up(struct vma_slot *slot) ++{ ++ struct scan_rung *rung; ++ ++ rung = slot->rung; ++ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) ++ rung++; ++ ++ return vma_rung_enter(slot, rung); ++} ++ ++static inline int vma_rung_down(struct vma_slot *slot) ++{ ++ struct scan_rung *rung; ++ ++ rung = slot->rung; ++ if (slot->rung != &uksm_scan_ladder[0]) ++ rung--; ++ ++ return vma_rung_enter(slot, rung); ++} ++ ++/** ++ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. ++ */ ++static unsigned long cal_dedup_ratio(struct vma_slot *slot) ++{ ++ unsigned long ret; ++ ++ BUG_ON(slot->pages_scanned == slot->last_scanned); ++ ++ ret = slot->pages_merged; ++ ++ /* Thrashing area filtering */ ++ if (ret && uksm_thrash_threshold) { ++ if (slot->pages_cowed * 100 / slot->pages_merged ++ > uksm_thrash_threshold) { ++ ret = 0; ++ } else { ++ ret = slot->pages_merged - slot->pages_cowed; ++ } ++ } ++ ++ return ret; ++} ++ ++/** ++ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. ++ */ ++static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) ++{ ++ unsigned long ret; ++ unsigned long pages_scanned; ++ ++ pages_scanned = slot->pages_scanned; ++ if (!pages_scanned) { ++ if (uksm_thrash_threshold) ++ return 0; ++ else ++ pages_scanned = slot->pages_scanned; ++ } ++ ++ ret = slot->pages_bemerged * 100 / pages_scanned; ++ ++ /* Thrashing area filtering */ ++ if (ret && uksm_thrash_threshold) { ++ if (slot->pages_cowed * 100 / slot->pages_bemerged ++ > uksm_thrash_threshold) { ++ ret = 0; ++ } else { ++ ret = slot->pages_bemerged - slot->pages_cowed; ++ } ++ } ++ ++ return ret; ++} ++ ++/** ++ * stable_node_reinsert() - When the hash_strength has been adjusted, the ++ * stable tree need to be restructured, this is the function re-inserting the ++ * stable node. 
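stable_node_reinsert() repopulates a two-level structure: a first-level tree keyed by the sampled hash, and per-key collision sub-trees keyed by the full-strength hash_max. A simplified lookup over that shape, with linked lists standing in for the red-black trees and sk_* names that are illustrative only:

#include <stdint.h>
#include <stddef.h>

struct sk_stable_node {
        uint32_t hash_max;            /* full-strength hash, computed lazily */
        struct sk_stable_node *next;
};

struct sk_tree_node {
        uint32_t hash;                /* partial (sampled) hash */
        struct sk_stable_node *sub;   /* collision sub-tree (a list here) */
        struct sk_tree_node *next;
};

static struct sk_stable_node *
sk_stable_lookup(struct sk_tree_node *root, uint32_t hash, uint32_t hash_max)
{
        struct sk_tree_node *t;
        struct sk_stable_node *s;

        for (t = root; t; t = t->next) {
                if (t->hash != hash)
                        continue;
                /* same first-level hash: disambiguate with hash_max */
                for (s = t->sub; s; s = s->next)
                        if (s->hash_max == hash_max)
                                return s;
                return NULL;
        }
        return NULL;
}

int main(void)
{
        struct sk_stable_node s = { 0xbeef, NULL };
        struct sk_tree_node t = { 0xabcd, &s, NULL };

        return sk_stable_lookup(&t, 0xabcd, 0xbeef) == &s ? 0 : 1;
}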
++ */ ++static inline void stable_node_reinsert(struct stable_node *new_node, ++ struct page *page, ++ struct rb_root *root_treep, ++ struct list_head *tree_node_listp, ++ u32 hash) ++{ ++ struct rb_node **new = &root_treep->rb_node; ++ struct rb_node *parent = NULL; ++ struct stable_node *stable_node; ++ struct tree_node *tree_node; ++ struct page *tree_page; ++ int cmp; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ /* find a stable tree node with same first level hash value */ ++ stable_node_hash_max(new_node, page, hash); ++ if (tree_node->count == 1) { ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ stable_node_hash_max(stable_node, ++ tree_page, hash); ++ put_page(tree_page); ++ ++ /* prepare for stable node insertion */ ++ ++ cmp = hash_cmp(new_node->hash_max, ++ stable_node->hash_max); ++ parent = &stable_node->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto failed; ++ ++ goto add_node; ++ } else { ++ /* the only stable_node deleted, the tree node ++ * was not deleted. ++ */ ++ goto tree_node_reuse; ++ } ++ } ++ ++ /* well, search the collision subtree */ ++ new = &tree_node->sub_root.rb_node; ++ parent = NULL; ++ BUG_ON(!*new); ++ while (*new) { ++ int cmp; ++ ++ stable_node = rb_entry(*new, struct stable_node, node); ++ ++ cmp = hash_cmp(new_node->hash_max, ++ stable_node->hash_max); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else { ++ /* oh, no, still a collision */ ++ goto failed; ++ } ++ } ++ ++ goto add_node; ++ } ++ ++ /* no tree node found */ ++ tree_node = alloc_tree_node(tree_node_listp); ++ if (!tree_node) { ++ printk(KERN_ERR "UKSM: memory allocation error!\n"); ++ goto failed; ++ } else { ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, root_treep); ++ ++tree_node_reuse: ++ /* prepare for stable node insertion */ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++add_node: ++ rb_link_node(&new_node->node, parent, new); ++ rb_insert_color(&new_node->node, &tree_node->sub_root); ++ new_node->tree_node = tree_node; ++ tree_node->count++; ++ return; ++ ++failed: ++ /* This can only happen when two nodes have collided ++ * in two levels. ++ */ ++ new_node->tree_node = NULL; ++ return; ++} ++ ++static inline void free_all_tree_nodes(struct list_head *list) ++{ ++ struct tree_node *node, *tmp; ++ ++ list_for_each_entry_safe(node, tmp, list, all_list) { ++ free_tree_node(node); ++ } ++} ++ ++/** ++ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash ++ * strength to the current hash_strength. It re-structures the hole tree. 
++ */ ++static inline void stable_tree_delta_hash(u32 prev_hash_strength) ++{ ++ struct stable_node *node, *tmp; ++ struct rb_root *root_new_treep; ++ struct list_head *new_tree_node_listp; ++ ++ stable_tree_index = (stable_tree_index + 1) % 2; ++ root_new_treep = &root_stable_tree[stable_tree_index]; ++ new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; ++ *root_new_treep = RB_ROOT; ++ BUG_ON(!list_empty(new_tree_node_listp)); ++ ++ /* ++ * we need to be safe, the node could be removed by get_uksm_page() ++ */ ++ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { ++ void *addr; ++ struct page *node_page; ++ u32 hash; ++ ++ /* ++ * We are completely re-structuring the stable nodes to a new ++ * stable tree. We don't want to touch the old tree unlinks and ++ * old tree_nodes. The old tree_nodes will be freed at once. ++ */ ++ node_page = get_uksm_page(node, 0, 0); ++ if (!node_page) ++ continue; ++ ++ if (node->tree_node) { ++ hash = node->tree_node->hash; ++ ++ addr = kmap_atomic(node_page, KM_USER0); ++ ++ hash = delta_hash(addr, prev_hash_strength, ++ hash_strength, hash); ++ kunmap_atomic(addr, KM_USER0); ++ } else { ++ /* ++ *it was not inserted to rbtree due to collision in last ++ *round scan. ++ */ ++ hash = page_hash(node_page, hash_strength, 0); ++ } ++ ++ stable_node_reinsert(node, node_page, root_new_treep, ++ new_tree_node_listp, hash); ++ put_page(node_page); ++ } ++ ++ root_stable_treep = root_new_treep; ++ free_all_tree_nodes(stable_tree_node_listp); ++ BUG_ON(!list_empty(stable_tree_node_listp)); ++ stable_tree_node_listp = new_tree_node_listp; ++} ++ ++static inline void inc_hash_strength(unsigned long delta) ++{ ++ hash_strength += 1 << delta; ++ if (hash_strength > HASH_STRENGTH_MAX) ++ hash_strength = HASH_STRENGTH_MAX; ++} ++ ++static inline void dec_hash_strength(unsigned long delta) ++{ ++ unsigned long change = 1 << delta; ++ ++ if (hash_strength <= change + 1) ++ hash_strength = 1; ++ else ++ hash_strength -= change; ++} ++ ++static inline void inc_hash_strength_delta(void) ++{ ++ hash_strength_delta++; ++ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) ++ hash_strength_delta = HASH_STRENGTH_DELTA_MAX; ++} ++ ++/* ++static inline unsigned long get_current_neg_ratio(void) ++{ ++ if (!rshash_pos || rshash_neg > rshash_pos) ++ return 100; ++ ++ return div64_u64(100 * rshash_neg , rshash_pos); ++} ++*/ ++ ++static inline unsigned long get_current_neg_ratio(void) ++{ ++ u64 pos = benefit.pos; ++ u64 neg = benefit.neg; ++ ++ if (!neg) ++ return 0; ++ ++ if (!pos || neg > pos) ++ return 100; ++ ++ if (neg > div64_u64(U64_MAX, 100)) ++ pos = div64_u64(pos, 100); ++ else ++ neg *= 100; ++ ++ return div64_u64(neg, pos); ++} ++ ++static inline unsigned long get_current_benefit(void) ++{ ++ u64 pos = benefit.pos; ++ u64 neg = benefit.neg; ++ u64 scanned = benefit.scanned; ++ ++ if (neg > pos) ++ return 0; ++ ++ return div64_u64((pos - neg), scanned); ++} ++ ++static inline int judge_rshash_direction(void) ++{ ++ u64 current_neg_ratio, stable_benefit; ++ u64 current_benefit, delta = 0; ++ int ret = STILL; ++ ++ /* Try to probe a value after the boot, and in case the system ++ are still for a long time. 
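The direction decision below consumes the negative/positive benefit ratio computed a few lines above in get_current_neg_ratio(). Its overflow-safe percentage trick, restated as a small userspace function (a sketch, not the kernel code):

#include <stdint.h>
#include <stdio.h>

/* 100 * neg / pos without overflowing 64 bits, as in get_current_neg_ratio() */
static uint64_t sk_neg_ratio(uint64_t neg, uint64_t pos)
{
        if (!neg)
                return 0;
        if (!pos || neg > pos)
                return 100;

        if (neg > UINT64_MAX / 100)
                pos /= 100;          /* neg too big to scale up: scale pos down */
        else
                neg *= 100;

        return neg / pos;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)sk_neg_ratio(3, 12));   /* prints 25 */
        return 0;
}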
*/ ++ if ((fully_scanned_round & 0xFFULL) == 10) { ++ ret = OBSCURE; ++ goto out; ++ } ++ ++ current_neg_ratio = get_current_neg_ratio(); ++ ++ if (current_neg_ratio == 0) { ++ rshash_neg_cont_zero++; ++ if (rshash_neg_cont_zero > 2) ++ return GO_DOWN; ++ else ++ return STILL; ++ } ++ rshash_neg_cont_zero = 0; ++ ++ if (current_neg_ratio > 90) { ++ ret = GO_UP; ++ goto out; ++ } ++ ++ current_benefit = get_current_benefit(); ++ stable_benefit = rshash_state.stable_benefit; ++ ++ if (!stable_benefit) { ++ ret = OBSCURE; ++ goto out; ++ } ++ ++ if (current_benefit > stable_benefit) ++ delta = current_benefit - stable_benefit; ++ else if (current_benefit < stable_benefit) ++ delta = stable_benefit - current_benefit; ++ ++ delta = div64_u64(100 * delta , stable_benefit); ++ ++ if (delta > 50) { ++ rshash_cont_obscure++; ++ if (rshash_cont_obscure > 2) ++ return OBSCURE; ++ else ++ return STILL; ++ } ++ ++out: ++ rshash_cont_obscure = 0; ++ return ret; ++} ++ ++/** ++ * rshash_adjust() - The main function to control the random sampling state ++ * machine for hash strength adapting. ++ * ++ * return true if hash_strength has changed. ++ */ ++static inline int rshash_adjust(void) ++{ ++ unsigned long prev_hash_strength = hash_strength; ++ ++ if (!encode_benefit()) ++ return 0; ++ ++ switch (rshash_state.state) { ++ case RSHASH_STILL: ++ switch (judge_rshash_direction()) { ++ case GO_UP: ++ if (rshash_state.pre_direct == GO_DOWN) ++ hash_strength_delta = 0; ++ ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.pre_direct = GO_UP; ++ break; ++ ++ case GO_DOWN: ++ if (rshash_state.pre_direct == GO_UP) ++ hash_strength_delta = 0; ++ ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.pre_direct = GO_DOWN; ++ break; ++ ++ case OBSCURE: ++ rshash_state.stable_point = hash_strength; ++ rshash_state.turn_point_down = hash_strength; ++ rshash_state.turn_point_up = hash_strength; ++ rshash_state.turn_benefit_down = get_current_benefit(); ++ rshash_state.turn_benefit_up = get_current_benefit(); ++ rshash_state.lookup_window_index = 0; ++ rshash_state.state = RSHASH_TRYDOWN; ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ break; ++ ++ case STILL: ++ break; ++ default: ++ BUG(); ++ } ++ break; ++ ++ case RSHASH_TRYDOWN: ++ if (rshash_state.lookup_window_index++ % 5 == 0) ++ rshash_state.below_count = 0; ++ ++ if (get_current_benefit() < rshash_state.stable_benefit) ++ rshash_state.below_count++; ++ else if (get_current_benefit() > ++ rshash_state.turn_benefit_down) { ++ rshash_state.turn_point_down = hash_strength; ++ rshash_state.turn_benefit_down = get_current_benefit(); ++ } ++ ++ if (rshash_state.below_count >= 3 || ++ judge_rshash_direction() == GO_UP || ++ hash_strength == 1) { ++ hash_strength = rshash_state.stable_point; ++ hash_strength_delta = 0; ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.lookup_window_index = 0; ++ rshash_state.state = RSHASH_TRYUP; ++ hash_strength_delta = 0; ++ } else { ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ } ++ break; ++ ++ case RSHASH_TRYUP: ++ if (rshash_state.lookup_window_index++ % 5 == 0) ++ rshash_state.below_count = 0; ++ ++ if (get_current_benefit() < rshash_state.turn_benefit_down) ++ rshash_state.below_count++; ++ else if (get_current_benefit() > rshash_state.turn_benefit_up) 
{ ++ rshash_state.turn_point_up = hash_strength; ++ rshash_state.turn_benefit_up = get_current_benefit(); ++ } ++ ++ if (rshash_state.below_count >= 3 || ++ judge_rshash_direction() == GO_DOWN || ++ hash_strength == HASH_STRENGTH_MAX) { ++ hash_strength = rshash_state.turn_benefit_up > ++ rshash_state.turn_benefit_down ? ++ rshash_state.turn_point_up : ++ rshash_state.turn_point_down; ++ ++ rshash_state.state = RSHASH_PRE_STILL; ++ } else { ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ } ++ ++ break; ++ ++ case RSHASH_NEW: ++ case RSHASH_PRE_STILL: ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.state = RSHASH_STILL; ++ hash_strength_delta = 0; ++ break; ++ default: ++ BUG(); ++ } ++ ++ /* rshash_neg = rshash_pos = 0; */ ++ reset_benefit(); ++ ++ if (prev_hash_strength != hash_strength) ++ stable_tree_delta_hash(prev_hash_strength); ++ ++ return prev_hash_strength != hash_strength; ++} ++ ++/** ++ * round_update_ladder() - The main function to do update of all the ++ * adjustments whenever a scan round is finished. ++ */ ++static noinline void round_update_ladder(void) ++{ ++ int i; ++ unsigned long dedup; ++ struct vma_slot *slot, *tmp_slot; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; ++ } ++ ++ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { ++ ++ /* slot may be rung_rm_slot() when mm exits */ ++ if (slot->snode) { ++ dedup = cal_dedup_ratio_old(slot); ++ if (dedup && dedup >= uksm_abundant_threshold) ++ vma_rung_up(slot); ++ } ++ ++ slot->pages_bemerged = 0; ++ slot->pages_cowed = 0; ++ ++ list_del_init(&slot->dedup_list); ++ } ++} ++ ++static void uksm_del_vma_slot(struct vma_slot *slot) ++{ ++ int i, j; ++ struct rmap_list_entry *entry; ++ ++ if (slot->snode) { ++ /* ++ * In case it just failed when entering the rung, it's not ++ * necessary. 
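The teardown below walks the slot's rmap_list_pool pages entry by entry. As a reminder of how a linear entry index maps onto those lazily allocated pool pages (cf. get_pool_index() and index_page_offset() earlier), a userspace sketch; the 16-byte entry size and 4 KiB page are illustrative assumptions:

#include <stdint.h>
#include <stdio.h>

#define SK_PAGE_SHIFT 12
#define SK_PAGE_SIZE  (1UL << SK_PAGE_SHIFT)

/* Entry i lives in pool page (i * esize) >> PAGE_SHIFT at offset (i * esize) % PAGE_SIZE. */
static void sk_locate(unsigned long index, size_t esize,
                      unsigned long *pool_idx, unsigned long *offset)
{
        unsigned long linear = index * esize;

        *pool_idx = linear >> SK_PAGE_SHIFT;
        *offset = linear & (SK_PAGE_SIZE - 1);
}

int main(void)
{
        unsigned long p, off;

        sk_locate(700, 16, &p, &off);   /* 700 * 16 = 11200 -> page 2, offset 3008 */
        printf("page %lu offset %lu\n", p, off);
        return 0;
}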
++ */ ++ rung_rm_slot(slot); ++ } ++ ++ if (!list_empty(&slot->dedup_list)) ++ list_del(&slot->dedup_list); ++ ++ if (!slot->rmap_list_pool || !slot->pool_counts) { ++ /* In case it OOMed in uksm_vma_enter() */ ++ goto out; ++ } ++ ++ for (i = 0; i < slot->pool_size; i++) { ++ void *addr; ++ ++ if (!slot->rmap_list_pool[i]) ++ continue; ++ ++ addr = kmap(slot->rmap_list_pool[i]); ++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { ++ entry = (struct rmap_list_entry *)addr + j; ++ if (is_addr(entry->addr)) ++ continue; ++ if (!entry->item) ++ continue; ++ ++ remove_rmap_item_from_tree(entry->item); ++ free_rmap_item(entry->item); ++ slot->pool_counts[i]--; ++ } ++ BUG_ON(slot->pool_counts[i]); ++ kunmap(slot->rmap_list_pool[i]); ++ __free_page(slot->rmap_list_pool[i]); ++ } ++ kfree(slot->rmap_list_pool); ++ kfree(slot->pool_counts); ++ ++out: ++ slot->rung = NULL; ++ BUG_ON(uksm_pages_total < slot->pages); ++ if (slot->flags & UKSM_SLOT_IN_UKSM) ++ uksm_pages_total -= slot->pages; ++ ++ if (slot->fully_scanned_round == fully_scanned_round) ++ scanned_virtual_pages -= slot->pages; ++ else ++ scanned_virtual_pages -= slot->pages_scanned; ++ free_vma_slot(slot); ++} ++ ++ ++#define SPIN_LOCK_PERIOD 32 ++static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; ++static inline void cleanup_vma_slots(void) ++{ ++ struct vma_slot *slot; ++ int i; ++ ++ i = 0; ++ spin_lock(&vma_slot_list_lock); ++ while (!list_empty(&vma_slot_del)) { ++ slot = list_entry(vma_slot_del.next, ++ struct vma_slot, slot_list); ++ list_del(&slot->slot_list); ++ cleanup_slots[i++] = slot; ++ if (i == SPIN_LOCK_PERIOD) { ++ spin_unlock(&vma_slot_list_lock); ++ while (--i >= 0) ++ uksm_del_vma_slot(cleanup_slots[i]); ++ i = 0; ++ spin_lock(&vma_slot_list_lock); ++ } ++ } ++ spin_unlock(&vma_slot_list_lock); ++ ++ while (--i >= 0) ++ uksm_del_vma_slot(cleanup_slots[i]); ++} ++ ++/* ++*expotional moving average formula ++*/ ++static inline unsigned long ema(unsigned long curr, unsigned long last_ema) ++{ ++ /* ++ * For a very high burst, even the ema cannot work well, a false very ++ * high per-page time estimation can result in feedback in very high ++ * overhead of context swith and rung update -- this will then lead ++ * to higher per-paper time, this may not converge. ++ * ++ * Instead, we try to approach this value in a binary manner. ++ */ ++ if (curr > last_ema * 10) ++ return last_ema * 2; ++ ++ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100; ++} ++ ++/* ++ * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to ++ * nanoseconds based on current uksm_sleep_jiffies. ++ */ ++static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio) ++{ ++ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) / ++ (TIME_RATIO_SCALE - ratio) * ratio; ++} ++ ++ ++static inline unsigned long rung_real_ratio(int cpu_time_ratio) ++{ ++ unsigned long ret; ++ ++ BUG_ON(!cpu_time_ratio); ++ ++ if (cpu_time_ratio > 0) ++ ret = cpu_time_ratio; ++ else ++ ret = (unsigned long)(-cpu_time_ratio) * ++ uksm_max_cpu_percentage / 100UL; ++ ++ return ret ? 
ret : 1; ++} ++ ++static noinline void uksm_calc_scan_pages(void) ++{ ++ struct scan_rung *ladder = uksm_scan_ladder; ++ unsigned long sleep_usecs, nsecs; ++ unsigned long ratio; ++ int i; ++ unsigned long per_page; ++ ++ if (uksm_ema_page_time > 100000 || ++ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) ++ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; ++ ++ per_page = uksm_ema_page_time; ++ BUG_ON(!per_page); ++ ++ /* ++ * For every 8 eval round, we try to probe a uksm_sleep_jiffies value ++ * based on saved user input. ++ */ ++ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) ++ uksm_sleep_jiffies = uksm_sleep_saved; ++ ++ /* We require a rung scan at least 1 page in a period. */ ++ nsecs = per_page; ++ ratio = rung_real_ratio(ladder[0].cpu_ratio); ++ if (cpu_ratio_to_nsec(ratio) < nsecs) { ++ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio ++ / NSEC_PER_USEC; ++ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ ratio = rung_real_ratio(ladder[i].cpu_ratio); ++ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / ++ per_page; ++ BUG_ON(!ladder[i].pages_to_scan); ++ uksm_calc_rung_step(&ladder[i], per_page, ratio); ++ } ++} ++ ++/* ++ * From the scan time of this round (ns) to next expected min sleep time ++ * (ms), be careful of the possible overflows. ratio is taken from ++ * rung_real_ratio() ++ */ ++static inline ++unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) ++{ ++ scan_time >>= 20; /* to msec level now */ ++ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); ++ ++ return (unsigned int) ((unsigned long) scan_time * ++ (TIME_RATIO_SCALE - ratio) / ratio); ++} ++ ++#define __round_mask(x, y) ((__typeof__(x))((y)-1)) ++#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) ++ ++static inline unsigned long vma_pool_size(struct vma_slot *slot) ++{ ++ return round_up(sizeof(struct rmap_list_entry) * slot->pages, ++ PAGE_SIZE) >> PAGE_SHIFT; ++} ++ ++static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) ++{ ++ struct scan_rung *rung; ++ unsigned long pool_size, i; ++ struct vma_slot *slot; ++ int failed; ++ ++ rung = &uksm_scan_ladder[0]; ++ ++ failed = 0; ++ for (i = 0; i < num; i++) { ++ slot = slots[i]; ++ ++ pool_size = vma_pool_size(slot); ++ slot->rmap_list_pool = kzalloc(sizeof(struct page *) * ++ pool_size, GFP_KERNEL); ++ if (!slot->rmap_list_pool) ++ break; ++ ++ slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size, ++ GFP_KERNEL); ++ if (!slot->pool_counts) { ++ kfree(slot->rmap_list_pool); ++ break; ++ } ++ ++ slot->pool_size = pool_size; ++ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); ++ slot->flags |= UKSM_SLOT_IN_UKSM; ++ uksm_pages_total += slot->pages; ++ } ++ ++ if (i) ++ rung_add_new_slots(rung, slots, i); ++ ++ return; ++} ++ ++static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; ++ ++static void uksm_enter_all_slots(void) ++{ ++ struct vma_slot *slot; ++ unsigned long index; ++ struct list_head empty_vma_list; ++ int i; ++ ++ i = 0; ++ index = 0; ++ INIT_LIST_HEAD(&empty_vma_list); ++ ++ spin_lock(&vma_slot_list_lock); ++ while (!list_empty(&vma_slot_new)) { ++ slot = list_entry(vma_slot_new.next, ++ struct vma_slot, slot_list); ++ ++ if (!slot->vma->anon_vma) { ++ list_move(&slot->slot_list, &empty_vma_list); ++ } else if (vma_can_enter(slot->vma)) { ++ batch_slots[index++] = slot; ++ list_del_init(&slot->slot_list); ++ } else { ++ list_move(&slot->slot_list, &vma_slot_noadd); ++ } ++ ++ if 
(++i == SPIN_LOCK_PERIOD || ++ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { ++ spin_unlock(&vma_slot_list_lock); ++ ++ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { ++ uksm_vma_enter(batch_slots, index); ++ index = 0; ++ } ++ i = 0; ++ cond_resched(); ++ spin_lock(&vma_slot_list_lock); ++ } ++ } ++ ++ list_splice(&empty_vma_list, &vma_slot_new); ++ ++ spin_unlock(&vma_slot_list_lock); ++ ++ if (index) ++ uksm_vma_enter(batch_slots, index); ++ ++} ++ ++static inline int rung_round_finished(struct scan_rung *rung) ++{ ++ return rung->flags & UKSM_RUNG_ROUND_FINISHED; ++} ++ ++static inline void judge_slot(struct vma_slot *slot) ++{ ++ struct scan_rung *rung = slot->rung; ++ unsigned long dedup; ++ int deleted; ++ ++ dedup = cal_dedup_ratio(slot); ++ if (vma_fully_scanned(slot) && uksm_thrash_threshold) ++ deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]); ++ else if (dedup && dedup >= uksm_abundant_threshold) ++ deleted = vma_rung_up(slot); ++ else ++ deleted = vma_rung_down(slot); ++ ++ slot->pages_merged = 0; ++ slot->pages_cowed = 0; ++ ++ if (vma_fully_scanned(slot)) ++ slot->pages_scanned = 0; ++ ++ slot->last_scanned = slot->pages_scanned; ++ ++ /* If its deleted in above, then rung was already advanced. */ ++ if (!deleted) ++ advance_current_scan(rung); ++} ++ ++ ++static inline int hash_round_finished(void) ++{ ++ if (scanned_virtual_pages > (uksm_pages_total >> 2)) { ++ scanned_virtual_pages = 0; ++ if (uksm_pages_scanned) ++ fully_scanned_round++; ++ ++ return 1; ++ } else { ++ return 0; ++ } ++} ++ ++#define UKSM_MMSEM_BATCH 5 ++/** ++ * uksm_do_scan() - the main worker function. ++ */ ++static noinline void uksm_do_scan(void) ++{ ++ struct vma_slot *slot, *iter; ++ struct mm_struct *busy_mm; ++ unsigned char round_finished, all_rungs_emtpy; ++ int i, err, mmsem_batch; ++ unsigned long pcost; ++ long long delta_exec; ++ unsigned long vpages, max_cpu_ratio; ++ unsigned long long start_time, end_time, scan_time; ++ unsigned int expected_jiffies; ++ ++ might_sleep(); ++ ++ vpages = 0; ++ ++ start_time = task_sched_runtime(current); ++ max_cpu_ratio = 0; ++ mmsem_batch = 0; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE;) { ++ struct scan_rung *rung = &uksm_scan_ladder[i]; ++ unsigned long ratio; ++ ++ if (!rung->pages_to_scan) { ++ i++; ++ continue; ++ } ++ ++ if (!rung->vma_root.num) { ++ rung->pages_to_scan = 0; ++ i++; ++ continue; ++ } ++ ++ ratio = rung_real_ratio(rung->cpu_ratio); ++ if (ratio > max_cpu_ratio) ++ max_cpu_ratio = ratio; ++ ++ /* ++ * Do not consider rung_round_finished() here, just used up the ++ * rung->pages_to_scan quota. 
++ */ ++ while (rung->pages_to_scan && rung->vma_root.num && ++ likely(!freezing(current))) { ++ int reset = 0; ++ ++ slot = rung->current_scan; ++ ++ BUG_ON(vma_fully_scanned(slot)); ++ ++ if (mmsem_batch) { ++ err = 0; ++ } else { ++ err = try_down_read_slot_mmap_sem(slot); ++ } ++ ++ if (err == -ENOENT) { ++rm_slot: ++ rung_rm_slot(slot); ++ continue; ++ } ++ ++ busy_mm = slot->mm; ++ ++ if (err == -EBUSY) { ++ /* skip other vmas on the same mm */ ++ do { ++ reset = advance_current_scan(rung); ++ iter = rung->current_scan; ++ if (iter->vma->vm_mm != busy_mm) ++ break; ++ } while (!reset); ++ ++ if (iter->vma->vm_mm != busy_mm) { ++ continue; ++ } else { ++ /* scan round finsished */ ++ break; ++ } ++ } ++ ++ BUG_ON(!vma_can_enter(slot->vma)); ++ if (uksm_test_exit(slot->vma->vm_mm)) { ++ mmsem_batch = 0; ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ goto rm_slot; ++ } ++ ++ if (mmsem_batch) ++ mmsem_batch--; ++ else ++ mmsem_batch = UKSM_MMSEM_BATCH; ++ ++ /* Ok, we have take the mmap_sem, ready to scan */ ++ scan_vma_one_page(slot); ++ rung->pages_to_scan--; ++ vpages++; ++ ++ if (rung->current_offset + rung->step > slot->pages - 1 ++ || vma_fully_scanned(slot)) { ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ judge_slot(slot); ++ mmsem_batch = 0; ++ } else { ++ rung->current_offset += rung->step; ++ if (!mmsem_batch) ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ } ++ ++ cond_resched(); ++ } ++ ++ if (mmsem_batch) { ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ mmsem_batch = 0; ++ } ++ ++ if (freezing(current)) ++ break; ++ ++ cond_resched(); ++ } ++ end_time = task_sched_runtime(current); ++ delta_exec = end_time - start_time; ++ ++ if (freezing(current)) ++ return; ++ ++ cleanup_vma_slots(); ++ uksm_enter_all_slots(); ++ ++ round_finished = 1; ++ all_rungs_emtpy = 1; ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ struct scan_rung *rung = &uksm_scan_ladder[i]; ++ ++ if (rung->vma_root.num) { ++ all_rungs_emtpy = 0; ++ if (!rung_round_finished(rung)) ++ round_finished = 0; ++ } ++ } ++ ++ if (all_rungs_emtpy) ++ round_finished = 0; ++ ++ if (round_finished) { ++ round_update_ladder(); ++ uksm_eval_round++; ++ ++ if (hash_round_finished() && rshash_adjust()) { ++ /* Reset the unstable root iff hash strength changed */ ++ uksm_hash_round++; ++ root_unstable_tree = RB_ROOT; ++ free_all_tree_nodes(&unstable_tree_node_list); ++ } ++ ++ /* ++ * A number of pages can hang around indefinitely on per-cpu ++ * pagevecs, raised page count preventing write_protect_page ++ * from merging them. Though it doesn't really matter much, ++ * it is puzzling to see some stuck in pages_volatile until ++ * other activity jostles them out, and they also prevented ++ * LTP's KSM test from succeeding deterministically; so drain ++ * them here (here rather than on entry to uksm_do_scan(), ++ * so we don't IPI too often when pages_to_scan is set low). 
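Right after the drain, the measured per-page cost is folded into uksm_ema_page_time through ema() and the next sleep interval is bounded. The smoothing itself, with its burst clamp, condensed into a runnable sketch (SK_EMA_ALPHA is an illustrative weight; the patch defines its own EMA_ALPHA):

#include <stdio.h>

#define SK_EMA_ALPHA 20UL   /* illustrative weight, not the patch's value */

/* Exponential moving average of per-page scan cost with a burst clamp. */
static unsigned long sk_ema(unsigned long curr, unsigned long last)
{
        /* a wild spike would poison the average; approach it only gradually */
        if (curr > last * 10)
                return last * 2;

        return (SK_EMA_ALPHA * curr + (100 - SK_EMA_ALPHA) * last) / 100;
}

int main(void)
{
        unsigned long t = 500;     /* ns per page so far */

        t = sk_ema(700, t);        /* mild change: weighted blend -> 540 */
        t = sk_ema(100000, t);     /* burst: only doubles -> 1080 */
        printf("ema = %lu\n", t);
        return 0;
}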
++ */ ++ lru_add_drain_all(); ++ } ++ ++ ++ if (vpages && delta_exec > 0) { ++ pcost = (unsigned long) delta_exec / vpages; ++ if (likely(uksm_ema_page_time)) ++ uksm_ema_page_time = ema(pcost, uksm_ema_page_time); ++ else ++ uksm_ema_page_time = pcost; ++ } ++ ++ uksm_calc_scan_pages(); ++ uksm_sleep_real = uksm_sleep_jiffies; ++ /* in case of radical cpu bursts, apply the upper bound */ ++ end_time = task_sched_runtime(current); ++ if (max_cpu_ratio && end_time > start_time) { ++ scan_time = end_time - start_time; ++ expected_jiffies = msecs_to_jiffies( ++ scan_time_to_sleep(scan_time, max_cpu_ratio)); ++ ++ if (expected_jiffies > uksm_sleep_real) ++ uksm_sleep_real = expected_jiffies; ++ ++ /* We have a 1 second up bound for responsiveness. */ ++ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) ++ uksm_sleep_real = msecs_to_jiffies(1000); ++ } ++ ++ return; ++} ++ ++static int ksmd_should_run(void) ++{ ++ return uksm_run & UKSM_RUN_MERGE; ++} ++ ++static int uksm_scan_thread(void *nothing) ++{ ++ set_freezable(); ++ set_user_nice(current, 5); ++ ++ while (!kthread_should_stop()) { ++ mutex_lock(&uksm_thread_mutex); ++ if (ksmd_should_run()) { ++ uksm_do_scan(); ++ } ++ mutex_unlock(&uksm_thread_mutex); ++ ++ try_to_freeze(); ++ ++ if (ksmd_should_run()) { ++ schedule_timeout_interruptible(uksm_sleep_real); ++ uksm_sleep_times++; ++ } else { ++ wait_event_freezable(uksm_thread_wait, ++ ksmd_should_run() || kthread_should_stop()); ++ } ++ } ++ return 0; ++} ++ ++int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, ++ unsigned long *vm_flags) ++{ ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct rmap_item *rmap_item; ++ struct hlist_node *hlist, *rmap_hlist; ++ unsigned int mapcount = page_mapcount(page); ++ int referenced = 0; ++ int search_new_forks = 0; ++ unsigned long address; ++ ++ VM_BUG_ON(!PageKsm(page)); ++ VM_BUG_ON(!PageLocked(page)); ++ ++ stable_node = page_stable_node(page); ++ if (!stable_node) ++ return 0; ++ ++ ++again: ++ hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, rmap_hlist, ++ &node_vma->rmap_hlist, hlist) { ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ struct anon_vma_chain *vmac; ++ struct vm_area_struct *vma; ++ ++ anon_vma_lock(anon_vma); ++ list_for_each_entry(vmac, &anon_vma->head, ++ same_anon_vma) { ++ vma = vmac->vma; ++ address = get_rmap_addr(rmap_item); ++ ++ if (address < vma->vm_start || ++ address >= vma->vm_end) ++ continue; ++ /* ++ * Initially we examine only the vma which ++ * covers this rmap_item; but later, if there ++ * is still work to do, we examine covering ++ * vmas in other mms: in case they were forked ++ * from the original since ksmd passed. 
++ */ ++ if ((rmap_item->slot->vma == vma) == ++ search_new_forks) ++ continue; ++ ++ if (memcg && ++ !mm_match_cgroup(vma->vm_mm, memcg)) ++ continue; ++ ++ referenced += ++ page_referenced_one(page, vma, ++ address, &mapcount, vm_flags); ++ if (!search_new_forks || !mapcount) ++ break; ++ } ++ ++ anon_vma_unlock(anon_vma); ++ if (!mapcount) ++ goto out; ++ } ++ } ++ if (!search_new_forks++) ++ goto again; ++out: ++ return referenced; ++} ++ ++int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) ++{ ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct hlist_node *hlist, *rmap_hlist; ++ struct rmap_item *rmap_item; ++ int ret = SWAP_AGAIN; ++ int search_new_forks = 0; ++ unsigned long address; ++ ++ VM_BUG_ON(!PageKsm(page)); ++ VM_BUG_ON(!PageLocked(page)); ++ ++ stable_node = page_stable_node(page); ++ if (!stable_node) ++ return SWAP_FAIL; ++again: ++ hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, rmap_hlist, ++ &node_vma->rmap_hlist, hlist) { ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ struct anon_vma_chain *vmac; ++ struct vm_area_struct *vma; ++ ++ anon_vma_lock(anon_vma); ++ list_for_each_entry(vmac, &anon_vma->head, ++ same_anon_vma) { ++ vma = vmac->vma; ++ address = get_rmap_addr(rmap_item); ++ ++ if (address < vma->vm_start || ++ address >= vma->vm_end) ++ continue; ++ /* ++ * Initially we examine only the vma which ++ * covers this rmap_item; but later, if there ++ * is still work to do, we examine covering ++ * vmas in other mms: in case they were forked ++ * from the original since ksmd passed. ++ */ ++ if ((rmap_item->slot->vma == vma) == ++ search_new_forks) ++ continue; ++ ++ ret = try_to_unmap_one(page, vma, ++ address, flags); ++ if (ret != SWAP_AGAIN || !page_mapped(page)) { ++ anon_vma_unlock(anon_vma); ++ goto out; ++ } ++ } ++ anon_vma_unlock(anon_vma); ++ } ++ } ++ if (!search_new_forks++) ++ goto again; ++out: ++ return ret; ++} ++ ++#ifdef CONFIG_MIGRATION ++int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, ++ struct vm_area_struct *, unsigned long, void *), void *arg) ++{ ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct hlist_node *hlist, *rmap_hlist; ++ struct rmap_item *rmap_item; ++ int ret = SWAP_AGAIN; ++ int search_new_forks = 0; ++ unsigned long address; ++ ++ VM_BUG_ON(!PageKsm(page)); ++ VM_BUG_ON(!PageLocked(page)); ++ ++ stable_node = page_stable_node(page); ++ if (!stable_node) ++ return ret; ++again: ++ hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, rmap_hlist, ++ &node_vma->rmap_hlist, hlist) { ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ struct anon_vma_chain *vmac; ++ struct vm_area_struct *vma; ++ ++ anon_vma_lock(anon_vma); ++ list_for_each_entry(vmac, &anon_vma->head, ++ same_anon_vma) { ++ vma = vmac->vma; ++ address = get_rmap_addr(rmap_item); ++ ++ if (address < vma->vm_start || ++ address >= vma->vm_end) ++ continue; ++ ++ if ((rmap_item->slot->vma == vma) == ++ search_new_forks) ++ continue; ++ ++ ret = rmap_one(page, vma, address, arg); ++ if (ret != SWAP_AGAIN) { ++ anon_vma_unlock(anon_vma); ++ goto out; ++ } ++ } ++ anon_vma_unlock(anon_vma); ++ } ++ } ++ if (!search_new_forks++) ++ goto again; ++out: ++ return ret; ++} ++ ++/* Common ksm interface but may be specific to uksm */ ++void ksm_migrate_page(struct page *newpage, struct page *oldpage) ++{ ++ struct stable_node *stable_node; ++ ++ VM_BUG_ON(!PageLocked(oldpage)); ++ 
VM_BUG_ON(!PageLocked(newpage)); ++ VM_BUG_ON(newpage->mapping != oldpage->mapping); ++ ++ stable_node = page_stable_node(newpage); ++ if (stable_node) { ++ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); ++ stable_node->kpfn = page_to_pfn(newpage); ++ } ++} ++#endif /* CONFIG_MIGRATION */ ++ ++#ifdef CONFIG_MEMORY_HOTREMOVE ++static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, ++ unsigned long end_pfn) ++{ ++ struct rb_node *node; ++ ++ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { ++ struct stable_node *stable_node; ++ ++ stable_node = rb_entry(node, struct stable_node, node); ++ if (stable_node->kpfn >= start_pfn && ++ stable_node->kpfn < end_pfn) ++ return stable_node; ++ } ++ return NULL; ++} ++ ++static int uksm_memory_callback(struct notifier_block *self, ++ unsigned long action, void *arg) ++{ ++ struct memory_notify *mn = arg; ++ struct stable_node *stable_node; ++ ++ switch (action) { ++ case MEM_GOING_OFFLINE: ++ /* ++ * Keep it very simple for now: just lock out ksmd and ++ * MADV_UNMERGEABLE while any memory is going offline. ++ * mutex_lock_nested() is necessary because lockdep was alarmed ++ * that here we take uksm_thread_mutex inside notifier chain ++ * mutex, and later take notifier chain mutex inside ++ * uksm_thread_mutex to unlock it. But that's safe because both ++ * are inside mem_hotplug_mutex. ++ */ ++ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); ++ break; ++ ++ case MEM_OFFLINE: ++ /* ++ * Most of the work is done by page migration; but there might ++ * be a few stable_nodes left over, still pointing to struct ++ * pages which have been offlined: prune those from the tree. ++ */ ++ while ((stable_node = uksm_check_stable_tree(mn->start_pfn, ++ mn->start_pfn + mn->nr_pages)) != NULL) ++ remove_node_from_stable_tree(stable_node, 1, 1); ++ /* fallthrough */ ++ ++ case MEM_CANCEL_OFFLINE: ++ mutex_unlock(&uksm_thread_mutex); ++ break; ++ } ++ return NOTIFY_OK; ++} ++#endif /* CONFIG_MEMORY_HOTREMOVE */ ++ ++#ifdef CONFIG_SYSFS ++/* ++ * This all compiles without CONFIG_SYSFS, but is a waste of space. 
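The sysfs block that follows wires each tunable to a _show/_store pair through the UKSM_ATTR()/UKSM_ATTR_RO() helpers and groups them under an attribute group named "uksm". Assuming the group is registered under the mm kobject, as plain KSM does, the knobs would surface under /sys/kernel/mm/uksm/; a small userspace reader of the run flag under that assumption:

#include <stdio.h>

int main(void)
{
        /* Path assumes the "uksm" attribute group is registered under mm_kobj;
         * adjust if the patch registers it elsewhere. */
        FILE *f = fopen("/sys/kernel/mm/uksm/run", "r");
        unsigned int run;

        if (f && fscanf(f, "%u", &run) == 1)
                printf("uksm run = %u\n", run);
        if (f)
                fclose(f);
        return 0;
}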
++ */ ++ ++#define UKSM_ATTR_RO(_name) \ ++ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) ++#define UKSM_ATTR(_name) \ ++ static struct kobj_attribute _name##_attr = \ ++ __ATTR(_name, 0644, _name##_show, _name##_store) ++ ++static ssize_t max_cpu_percentage_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_max_cpu_percentage); ++} ++ ++static ssize_t max_cpu_percentage_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned long max_cpu_percentage; ++ int err; ++ ++ err = strict_strtoul(buf, 10, &max_cpu_percentage); ++ if (err || max_cpu_percentage > 100) ++ return -EINVAL; ++ ++ if (max_cpu_percentage == 100) ++ max_cpu_percentage = 99; ++ else if (max_cpu_percentage < 10) ++ max_cpu_percentage = 10; ++ ++ uksm_max_cpu_percentage = max_cpu_percentage; ++ ++ return count; ++} ++UKSM_ATTR(max_cpu_percentage); ++ ++static ssize_t sleep_millisecs_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); ++} ++ ++static ssize_t sleep_millisecs_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned long msecs; ++ int err; ++ ++ err = strict_strtoul(buf, 10, &msecs); ++ if (err || msecs > MSEC_PER_SEC) ++ return -EINVAL; ++ ++ uksm_sleep_jiffies = msecs_to_jiffies(msecs); ++ uksm_sleep_saved = uksm_sleep_jiffies; ++ ++ return count; ++} ++UKSM_ATTR(sleep_millisecs); ++ ++ ++static ssize_t cpu_governor_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); ++ int i; ++ ++ buf[0] = '\0'; ++ for (i = 0; i < n ; i++) { ++ if (uksm_cpu_governor == i) ++ strcat(buf, "["); ++ ++ strcat(buf, uksm_cpu_governor_str[i]); ++ ++ if (uksm_cpu_governor == i) ++ strcat(buf, "]"); ++ ++ strcat(buf, " "); ++ } ++ strcat(buf, "\n"); ++ ++ return strlen(buf); ++} ++ ++static inline void init_performance_values(void) ++{ ++ int i; ++ struct scan_rung *rung; ++ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; ++ ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = uksm_scan_ladder + i; ++ rung->cpu_ratio = preset->cpu_ratio[i]; ++ rung->cover_msecs = preset->cover_msecs[i]; ++ } ++ ++ uksm_max_cpu_percentage = preset->max_cpu; ++} ++ ++static ssize_t cpu_governor_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); ++ ++ for (n--; n >=0 ; n--) { ++ if (!strncmp(buf, uksm_cpu_governor_str[n], ++ strlen(uksm_cpu_governor_str[n]))) ++ break; ++ } ++ ++ if (n < 0) ++ return -EINVAL; ++ else ++ uksm_cpu_governor = n; ++ ++ init_performance_values(); ++ ++ return count; ++} ++UKSM_ATTR(cpu_governor); ++ ++static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_run); ++} ++ ++static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = strict_strtoul(buf, 10, &flags); ++ if (err || flags > UINT_MAX) ++ return -EINVAL; ++ if (flags > UKSM_RUN_MERGE) ++ return -EINVAL; ++ ++ mutex_lock(&uksm_thread_mutex); ++ if (uksm_run != flags) { ++ uksm_run = flags; ++ } ++ mutex_unlock(&uksm_thread_mutex); ++ ++ if (flags & UKSM_RUN_MERGE) ++ wake_up_interruptible(&uksm_thread_wait); ++ ++ 
return count; ++} ++UKSM_ATTR(run); ++ ++static ssize_t abundant_threshold_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_abundant_threshold); ++} ++ ++static ssize_t abundant_threshold_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = strict_strtoul(buf, 10, &flags); ++ if (err || flags > 99) ++ return -EINVAL; ++ ++ uksm_abundant_threshold = flags; ++ ++ return count; ++} ++UKSM_ATTR(abundant_threshold); ++ ++static ssize_t thrash_threshold_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_thrash_threshold); ++} ++ ++static ssize_t thrash_threshold_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = strict_strtoul(buf, 10, &flags); ++ if (err || flags > 99) ++ return -EINVAL; ++ ++ uksm_thrash_threshold = flags; ++ ++ return count; ++} ++UKSM_ATTR(thrash_threshold); ++ ++static ssize_t cpu_ratios_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int i, size; ++ struct scan_rung *rung; ++ char *p = buf; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ if (rung->cpu_ratio > 0) ++ size = sprintf(p, "%d ", rung->cpu_ratio); ++ else ++ size = sprintf(p, "MAX/%d ", ++ TIME_RATIO_SCALE / -rung->cpu_ratio); ++ ++ p += size; ++ } ++ ++ *p++ = '\n'; ++ *p = '\0'; ++ ++ return p - buf; ++} ++ ++static ssize_t cpu_ratios_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int i, cpuratios[SCAN_LADDER_SIZE], err; ++ unsigned long value; ++ struct scan_rung *rung; ++ char *p, *end = NULL; ++ ++ p = kzalloc(count, GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ memcpy(p, buf, count); ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ if (i != SCAN_LADDER_SIZE -1) { ++ end = strchr(p, ' '); ++ if (!end) ++ return -EINVAL; ++ ++ *end = '\0'; ++ } ++ ++ if (strstr(p, "MAX/")) { ++ p = strchr(p, '/') + 1; ++ err = strict_strtoul(p, 10, &value); ++ if (err || value > TIME_RATIO_SCALE || !value) ++ return -EINVAL; ++ ++ cpuratios[i] = - (int) (TIME_RATIO_SCALE / value); ++ } else { ++ err = strict_strtoul(p, 10, &value); ++ if (err || value > TIME_RATIO_SCALE || !value) ++ return -EINVAL; ++ ++ cpuratios[i] = value; ++ } ++ ++ p = end + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ rung->cpu_ratio = cpuratios[i]; ++ } ++ ++ return count; ++} ++UKSM_ATTR(cpu_ratios); ++ ++static ssize_t eval_intervals_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int i, size; ++ struct scan_rung *rung; ++ char *p = buf; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ size = sprintf(p, "%u ", rung->cover_msecs); ++ p += size; ++ } ++ ++ *p++ = '\n'; ++ *p = '\0'; ++ ++ return p - buf; ++} ++ ++static ssize_t eval_intervals_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int i, err; ++ unsigned long values[SCAN_LADDER_SIZE]; ++ struct scan_rung *rung; ++ char *p, *end = NULL; ++ ++ p = kzalloc(count, GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ memcpy(p, buf, count); ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ if (i != SCAN_LADDER_SIZE -1) { ++ end = strchr(p, ' '); ++ if (!end) ++ return -EINVAL; ++ ++ *end = '\0'; ++ } ++ ++ err = 
strict_strtoul(p, 10, &values[i]); ++ if (err) ++ return -EINVAL; ++ ++ p = end + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ rung->cover_msecs = values[i]; ++ } ++ ++ return count; ++} ++UKSM_ATTR(eval_intervals); ++ ++static ssize_t ema_per_page_time_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_ema_page_time); ++} ++UKSM_ATTR_RO(ema_per_page_time); ++ ++static ssize_t pages_shared_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_shared); ++} ++UKSM_ATTR_RO(pages_shared); ++ ++static ssize_t pages_sharing_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_sharing); ++} ++UKSM_ATTR_RO(pages_sharing); ++ ++static ssize_t pages_unshared_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_unshared); ++} ++UKSM_ATTR_RO(pages_unshared); ++ ++static ssize_t full_scans_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%llu\n", fully_scanned_round); ++} ++UKSM_ATTR_RO(full_scans); ++ ++static ssize_t pages_scanned_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ unsigned long base = 0; ++ u64 delta, ret; ++ ++ if (pages_scanned_stored) { ++ base = pages_scanned_base; ++ ret = pages_scanned_stored; ++ delta = uksm_pages_scanned >> base; ++ if (CAN_OVERFLOW_U64(ret, delta)) { ++ ret >>= 1; ++ delta >>= 1; ++ base++; ++ ret += delta; ++ } ++ } else { ++ ret = uksm_pages_scanned; ++ } ++ ++ while (ret > ULONG_MAX) { ++ ret >>= 1; ++ base++; ++ } ++ ++ if (base) ++ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); ++ else ++ return sprintf(buf, "%lu\n", (unsigned long)ret); ++} ++UKSM_ATTR_RO(pages_scanned); ++ ++static ssize_t hash_strength_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", hash_strength); ++} ++UKSM_ATTR_RO(hash_strength); ++ ++static ssize_t sleep_times_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%llu\n", uksm_sleep_times); ++} ++UKSM_ATTR_RO(sleep_times); ++ ++ ++static struct attribute *uksm_attrs[] = { ++ &max_cpu_percentage_attr.attr, ++ &sleep_millisecs_attr.attr, ++ &cpu_governor_attr.attr, ++ &run_attr.attr, ++ &ema_per_page_time_attr.attr, ++ &pages_shared_attr.attr, ++ &pages_sharing_attr.attr, ++ &pages_unshared_attr.attr, ++ &full_scans_attr.attr, ++ &pages_scanned_attr.attr, ++ &hash_strength_attr.attr, ++ &sleep_times_attr.attr, ++ &thrash_threshold_attr.attr, ++ &abundant_threshold_attr.attr, ++ &cpu_ratios_attr.attr, ++ &eval_intervals_attr.attr, ++ NULL, ++}; ++ ++static struct attribute_group uksm_attr_group = { ++ .attrs = uksm_attrs, ++ .name = "uksm", ++}; ++#endif /* CONFIG_SYSFS */ ++ ++static inline void init_scan_ladder(void) ++{ ++ int i; ++ struct scan_rung *rung; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = uksm_scan_ladder + i; ++ slot_tree_init_root(&rung->vma_root); ++ } ++ ++ init_performance_values(); ++ uksm_calc_scan_pages(); ++} ++ ++static inline int cal_positive_negative_costs(void) ++{ ++ struct page *p1, *p2; ++ unsigned char *addr1, *addr2; ++ unsigned long i, time_start, hash_cost; ++ unsigned long loopnum = 0; ++ ++ /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. 
*/ ++ volatile u32 hash; ++ volatile int ret; ++ ++ p1 = alloc_page(GFP_KERNEL); ++ if (!p1) ++ return -ENOMEM; ++ ++ p2 = alloc_page(GFP_KERNEL); ++ if (!p2) ++ return -ENOMEM; ++ ++ addr1 = kmap_atomic(p1, KM_USER0); ++ addr2 = kmap_atomic(p2, KM_USER1); ++ memset(addr1, random32(), PAGE_SIZE); ++ memcpy(addr2, addr1, PAGE_SIZE); ++ ++ /* make sure that the two pages differ in last byte */ ++ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; ++ kunmap_atomic(addr2, KM_USER1); ++ kunmap_atomic(addr1, KM_USER0); ++ ++ time_start = jiffies; ++ while (jiffies - time_start < 100) { ++ for (i = 0; i < 100; i++) ++ hash = page_hash(p1, HASH_STRENGTH_FULL, 0); ++ loopnum += 100; ++ } ++ hash_cost = (jiffies - time_start); ++ ++ time_start = jiffies; ++ for (i = 0; i < loopnum; i++) ++ ret = pages_identical(p1, p2); ++ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); ++ memcmp_cost /= hash_cost; ++ printk(KERN_INFO "UKSM: relative memcmp_cost = %lu " ++ "hash=%u cmp_ret=%d.\n", ++ memcmp_cost, hash, ret); ++ ++ __free_page(p1); ++ __free_page(p2); ++ return 0; ++} ++ ++static int init_zeropage_hash_table(void) ++{ ++ struct page *page; ++ char *addr; ++ int i; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ addr = kmap_atomic(page, KM_USER0); ++ memset(addr, 0, PAGE_SIZE); ++ kunmap_atomic(addr, KM_USER0); ++ ++ zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32), ++ GFP_KERNEL); ++ if (!zero_hash_table) ++ return -ENOMEM; ++ ++ for (i = 0; i < HASH_STRENGTH_MAX; i++) ++ zero_hash_table[i] = page_hash(page, i, 0); ++ ++ __free_page(page); ++ ++ return 0; ++} ++ ++static inline int init_random_sampling(void) ++{ ++ unsigned long i; ++ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); ++ if (!random_nums) ++ return -ENOMEM; ++ ++ for (i = 0; i < HASH_STRENGTH_FULL; i++) ++ random_nums[i] = i; ++ ++ for (i = 0; i < HASH_STRENGTH_FULL; i++) { ++ unsigned long rand_range, swap_index, tmp; ++ ++ rand_range = HASH_STRENGTH_FULL - i; ++ swap_index = i + random32() % rand_range; ++ tmp = random_nums[i]; ++ random_nums[i] = random_nums[swap_index]; ++ random_nums[swap_index] = tmp; ++ } ++ ++ rshash_state.state = RSHASH_NEW; ++ rshash_state.below_count = 0; ++ rshash_state.lookup_window_index = 0; ++ ++ return cal_positive_negative_costs(); ++} ++ ++static int __init uksm_slab_init(void) ++{ ++ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); ++ if (!rmap_item_cache) ++ goto out; ++ ++ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); ++ if (!stable_node_cache) ++ goto out_free1; ++ ++ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); ++ if (!node_vma_cache) ++ goto out_free2; ++ ++ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); ++ if (!vma_slot_cache) ++ goto out_free3; ++ ++ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); ++ if (!tree_node_cache) ++ goto out_free4; ++ ++ return 0; ++ ++out_free4: ++ kmem_cache_destroy(vma_slot_cache); ++out_free3: ++ kmem_cache_destroy(node_vma_cache); ++out_free2: ++ kmem_cache_destroy(stable_node_cache); ++out_free1: ++ kmem_cache_destroy(rmap_item_cache); ++out: ++ return -ENOMEM; ++} ++ ++static void __init uksm_slab_free(void) ++{ ++ kmem_cache_destroy(stable_node_cache); ++ kmem_cache_destroy(rmap_item_cache); ++ kmem_cache_destroy(node_vma_cache); ++ kmem_cache_destroy(vma_slot_cache); ++ kmem_cache_destroy(tree_node_cache); ++} ++ ++/* Common interface to ksm, different to it. 
*/ ++int ksm_madvise(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, int advice, unsigned long *vm_flags) ++{ ++ int err; ++ ++ switch (advice) { ++ case MADV_MERGEABLE: ++ return 0; /* just ignore the advice */ ++ ++ case MADV_UNMERGEABLE: ++ if (!(*vm_flags & VM_MERGEABLE)) ++ return 0; /* just ignore the advice */ ++ ++ if (vma->anon_vma) { ++ err = unmerge_uksm_pages(vma, start, end); ++ if (err) ++ return err; ++ } ++ ++ uksm_remove_vma(vma); ++ *vm_flags &= ~VM_MERGEABLE; ++ break; ++ } ++ ++ return 0; ++} ++ ++/* Common interface to ksm, actually the same. */ ++struct page *ksm_does_need_to_copy(struct page *page, ++ struct vm_area_struct *vma, unsigned long address) ++{ ++ struct page *new_page; ++ ++ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); ++ if (new_page) { ++ copy_user_highpage(new_page, page, address, vma); ++ ++ SetPageDirty(new_page); ++ __SetPageUptodate(new_page); ++ SetPageSwapBacked(new_page); ++ __set_page_locked(new_page); ++ ++ if (page_evictable(new_page, vma)) ++ lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); ++ else ++ add_page_to_unevictable_list(new_page); ++ } ++ ++ return new_page; ++} ++ ++static int __init uksm_init(void) ++{ ++ struct task_struct *uksm_thread; ++ int err; ++ ++ uksm_sleep_jiffies = msecs_to_jiffies(100); ++ uksm_sleep_saved = uksm_sleep_jiffies; ++ ++ slot_tree_init(); ++ init_scan_ladder(); ++ ++ ++ err = init_random_sampling(); ++ if (err) ++ goto out_free2; ++ ++ err = uksm_slab_init(); ++ if (err) ++ goto out_free1; ++ ++ err = init_zeropage_hash_table(); ++ if (err) ++ goto out_free0; ++ ++ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); ++ if (IS_ERR(uksm_thread)) { ++ printk(KERN_ERR "uksm: creating kthread failed\n"); ++ err = PTR_ERR(uksm_thread); ++ goto out_free; ++ } ++ ++#ifdef CONFIG_SYSFS ++ err = sysfs_create_group(mm_kobj, &uksm_attr_group); ++ if (err) { ++ printk(KERN_ERR "uksm: register sysfs failed\n"); ++ kthread_stop(uksm_thread); ++ goto out_free; ++ } ++#else ++ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ ++ ++#endif /* CONFIG_SYSFS */ ++ ++#ifdef CONFIG_MEMORY_HOTREMOVE ++ /* ++ * Choose a high priority since the callback takes uksm_thread_mutex: ++ * later callbacks could only be taking locks which nest within that. 
++ */ ++ hotplug_memory_notifier(uksm_memory_callback, 100); ++#endif ++ return 0; ++ ++out_free: ++ kfree(zero_hash_table); ++out_free0: ++ uksm_slab_free(); ++out_free1: ++ kfree(random_nums); ++out_free2: ++ kfree(uksm_scan_ladder); ++ return err; ++} ++ ++#ifdef MODULE ++module_init(uksm_init) ++#else ++late_initcall(uksm_init); ++#endif ++ +diff --git a/mm/vmstat.c b/mm/vmstat.c +index f600557..1e124c3 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -719,6 +719,9 @@ const char * const vmstat_text[] = { + "numa_other", + #endif + "nr_anon_transparent_hugepages", ++#ifdef CONFIG_UKSM ++ "nr_uksm_zero_pages", ++#endif + "nr_dirty_threshold", + "nr_dirty_background_threshold", + diff --git a/3.3.8/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-VL.patch b/3.3.8/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-VL.patch new file mode 100644 index 0000000..a26d2b1 --- /dev/null +++ b/3.3.8/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-VL.patch @@ -0,0 +1,381 @@ +diff -uNr linux-3.2.33-go.orig/arch/arm/mach-kirkwood/Kconfig linux-3.2.33-go/arch/arm/mach-kirkwood/Kconfig +--- linux-3.2.33-go.orig/arch/arm/mach-kirkwood/Kconfig 2012-11-14 21:20:22.326388580 +0100 ++++ linux-3.2.33-go/arch/arm/mach-kirkwood/Kconfig 2012-11-14 21:21:02.353908681 +0100 +@@ -136,6 +136,12 @@ + Say 'Y' here if you want your kernel to support the + Buffalo LS-XHL Series. + ++config MACH_LSVL ++ bool "Buffalo LS-VL Series" ++ help ++ Say 'Y' here if you want your kernel to support the ++ Buffalo LS-VL Series. ++ + endmenu + + endif +diff -uNr linux-3.2.33-go.orig/arch/arm/mach-kirkwood/lsvl-setup.c linux-3.2.33-go/arch/arm/mach-kirkwood/lsvl-setup.c +--- linux-3.2.33-go.orig/arch/arm/mach-kirkwood/lsvl-setup.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.33-go/arch/arm/mach-kirkwood/lsvl-setup.c 2012-11-14 21:22:54.158568343 +0100 +@@ -0,0 +1,340 @@ ++/* ++ * arch/arm/mach-kirkwood/lsvl-setup.c ++ * ++ * Buffalo LS-VL Series Setup ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * License version 2. This program is licensed "as is" without any ++ * warranty of any kind, whether express or implied. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "common.h" ++#include "mpp.h" ++ ++/***************************************************************************** ++ * 512KB SPI Flash on BOOT Device ++ ****************************************************************************/ ++static struct mtd_partition lsvl_partitions[] = { ++ { ++ .name = "u-boot", ++ .size = 0x80000, ++ .offset = 0x00000, ++ .mask_flags = MTD_WRITEABLE, /* force read-only */ ++ } ++}; ++ ++static struct flash_platform_data lsvl_spi_slave_data = { ++ .type = "m25p40-nonjedec", ++ .parts = lsvl_partitions, ++ .nr_parts = ARRAY_SIZE(lsvl_partitions), ++}; ++ ++static struct spi_board_info __initdata lsvl_spi_slave_info[] = { ++ { ++ .modalias = "m25p80", ++ .platform_data = &lsvl_spi_slave_data, ++ .irq = -1, ++ .max_speed_hz = 20000000, ++ .bus_num = 0, ++ .chip_select = 0, ++ } ++}; ++ ++/***************************************************************************** ++ * Ethernet ++ ****************************************************************************/ ++static struct mv643xx_eth_platform_data lsvl_ge00_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(0), ++}; ++ ++/***************************************************************************** ++ * SATA ++ ****************************************************************************/ ++static struct mv_sata_platform_data lsvl_sata_data = { ++ .n_ports = 1, ++}; ++ ++/***************************************************************************** ++ * LEDs attached to GPIO ++ ****************************************************************************/ ++#define LSVL_GPIO_LED_ALARM 36 ++#define LSVL_GPIO_LED_FUNC_RED 37 ++#define LSVL_GPIO_LED_INFO 38 ++#define LSVL_GPIO_LED_FUNC_BLUE 39 ++#define LSVL_GPIO_LED_PWR 40 ++ ++static struct gpio_led lsvl_led_pins[] = { ++ { ++ .name = "alarm:red", ++ .gpio = LSVL_GPIO_LED_ALARM, ++ }, ++ { ++ .name = "func:red:bottom", ++ .gpio = LSVL_GPIO_LED_FUNC_RED, ++ }, ++ { ++ .name = "info:amber", ++ .gpio = LSVL_GPIO_LED_INFO, ++ }, ++ { ++ .name = "func:blue:bottom", ++ .gpio = LSVL_GPIO_LED_FUNC_BLUE, ++ }, ++ ++ { ++ .name = "power:blue", ++ .default_trigger = "default-on", ++ .gpio = LSVL_GPIO_LED_PWR, ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_led_platform_data lsvl_led_data = { ++ .leds = lsvl_led_pins, ++ .num_leds = ARRAY_SIZE(lsvl_led_pins), ++}; ++ ++static struct platform_device lsvl_leds = { ++ .name = "leds-gpio", ++ .id = -1, ++ .dev = { ++ .platform_data = &lsvl_led_data, ++ } ++}; ++ ++/***************************************************************************** ++ * General Setup ++ ****************************************************************************/ ++#define LSVL_GPIO_HDD_POWER 8 ++#define LSVL_GPIO_USB_POWER 12 ++ ++/***************************************************************************** ++ * GPIO Attached Keys ++ ****************************************************************************/ ++/*#define LSVL_GPIO_KEY_FUNC 45 ++#define LSVL_GPIO_KEY_POWER 46 ++#define LSVL_GPIO_KEY_AUTOPOWER 47 ++#define LSVL_SW_POWER 0x00 ++#define LSVL_SW_AUTOPOWER 0x01 ++#define LSVL_SW_FUNC 0x02 ++ ++static struct gpio_keys_button lsvl_buttons[] = { ++ { ++ .type = EV_SW, ++ .code = LSVL_SW_POWER, ++ .gpio = LSVL_GPIO_KEY_POWER, ++ .desc = "Power-on Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSVL_SW_AUTOPOWER, 
++ .gpio = LSVL_GPIO_KEY_AUTOPOWER, ++ .desc = "Power-auto Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSVL_SW_FUNC, ++ .gpio = LSVL_GPIO_KEY_FUNC, ++ .desc = "Function Button", ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_keys_platform_data lsvl_button_data = { ++ .buttons = lsvl_buttons, ++ .nbuttons = ARRAY_SIZE(lsvl_buttons), ++}; ++ ++static struct platform_device lsvl_button_device = { ++ .name = "gpio-keys", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lsvl_button_data, ++ }, ++}; ++*/ ++ ++/***************************************************************************** ++ * GPIO Fan ++ ****************************************************************************/ ++#define LSVL_GPIO_FAN_HIGH 16 ++#define LSVL_GPIO_FAN_LOW 17 ++#define LSVL_GPIO_FAN_LOCK 43 ++ ++static struct gpio_fan_alarm lsvl_alarm = { ++ .gpio = LSVL_GPIO_FAN_LOCK, ++}; ++ ++static struct gpio_fan_speed lsvl_speeds[] = { ++ { ++ .rpm = 0, ++ .ctrl_val = 3, ++ }, { ++ .rpm = 1500, ++ .ctrl_val = 1, ++ }, { ++ .rpm = 3250, ++ .ctrl_val = 2, ++ }, { ++ .rpm = 5000, ++ .ctrl_val = 0, ++ } ++}; ++ ++static int lsvl_gpio_list[] = { ++ LSVL_GPIO_FAN_HIGH, LSVL_GPIO_FAN_LOW, ++}; ++ ++static struct gpio_fan_platform_data lsvl_fan_data = { ++ .num_ctrl = ARRAY_SIZE(lsvl_gpio_list), ++ .ctrl = lsvl_gpio_list, ++ .alarm = &lsvl_alarm, ++ .num_speed = ARRAY_SIZE(lsvl_speeds), ++ .speed = lsvl_speeds, ++}; ++ ++static struct platform_device lsvl_fan_device = { ++ .name = "gpio-fan", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lsvl_fan_data, ++ }, ++}; ++ ++/***************************************************************************** ++ * GPIO Data ++ ****************************************************************************/ ++ ++static unsigned int lsvl_mpp_config[] __initdata = { ++ MPP0_NF_IO2, ++ MPP1_NF_IO3, ++ MPP2_NF_IO4, ++ MPP3_NF_IO5, ++ MPP4_NF_IO6, ++ MPP5_NF_IO7, ++ MPP6_SYSRST_OUTn, ++ MPP7_SPI_SCn, ++ MPP8_GPIO, /* HDD Power */ ++ MPP9_GPIO, ++ MPP10_UART0_TXD, ++ MPP11_UART0_RXD, ++ MPP12_GPO, /* USB VBUS EN */ ++ MPP13_GPIO, ++ MPP14_GPIO, ++ MPP15_GPIO, ++ MPP16_GPIO, /* FAN HIGH: on:0, off:1 */ ++ MPP17_GPIO, /* FAN LOW: on:0, off:1 */ ++ MPP18_NF_IO0, ++ MPP19_NF_IO1, ++ MPP20_GPIO, ++ MPP21_GPIO, ++ MPP22_GPIO, ++ MPP23_GPIO, ++ MPP24_GPIO, ++ MPP25_GPIO, ++ MPP26_GPIO, ++ MPP27_GPIO, ++ MPP28_GPIO, ++ MPP29_GPIO, ++ MPP30_GPIO, ++ MPP31_GPIO, ++ MPP32_GPIO, ++ MPP33_GPO, ++ MPP34_GPIO, ++ MPP35_GPIO, ++ MPP36_GPIO, /* ALARM LED */ ++ MPP37_GPIO, /* FUNC RED LED */ ++ MPP38_GPIO, /* INFO LED */ ++ MPP39_GPIO, /* FUNC LED */ ++ MPP40_GPIO, /* POWER LED */ ++ MPP41_GPIO, ++ MPP42_GPIO, ++ MPP43_GPIO, /* FAN LOCK */ ++ MPP44_GPIO, ++ MPP45_GPIO, /* FUNC SW */ ++ MPP46_GPIO, /* POWER SW */ ++ MPP47_GPIO, /* POWER AUTO SW */ ++ MPP48_GPIO, /* UART EN */ ++ MPP49_GPIO, ++ 0 ++}; ++ ++/***************************************************************************** ++ * LS-VL specific power off method: reboot ++ ****************************************************************************/ ++/* ++ * On the LS-VL, the shutdown process is following: ++ * - Userland monitors key events until the power switch goes to off position ++ * - The board reboots ++ * - U-boot starts and goes into an idle mode waiting for the user ++ * to move the switch to ON position ++ * ++ */ ++ ++static void lsvl_power_off(void) ++{ ++ arm_machine_restart('h', NULL); ++} ++ ++static void __init lsvl_init(void) ++{ ++ /* ++ * Basic setup. 
Needs to be called early. ++ */ ++ kirkwood_init(); ++ kirkwood_mpp_conf(lsvl_mpp_config); ++ ++ /* ++ * Configure peripherals. ++ */ ++ kirkwood_uart0_init(); ++ kirkwood_ehci_init(); ++ kirkwood_ge00_init(&lsvl_ge00_data); ++ kirkwood_sata_init(&lsvl_sata_data); ++ kirkwood_spi_init(); ++ ++ platform_device_register(&lsvl_leds); ++// platform_device_register(&lsvl_button_device); ++ platform_device_register(&lsvl_fan_device); ++ ++ spi_register_board_info(lsvl_spi_slave_info, ++ ARRAY_SIZE(lsvl_spi_slave_info)); ++ ++ /* usb power on */ ++ gpio_set_value(LSVL_GPIO_USB_POWER, 1); ++ ++ /* register power-off method */ ++ pm_power_off = lsvl_power_off; ++ ++ pr_info("%s: finished\n", __func__); ++} ++ ++MACHINE_START(LSVL, "Buffalo LS-VL Series") ++ .atag_offset = 0x100, ++ .init_machine = lsvl_init, ++ .map_io = kirkwood_map_io, ++ .init_early = kirkwood_init_early, ++ .init_irq = kirkwood_init_irq, ++ .timer = &kirkwood_timer, ++MACHINE_END ++ +diff -uNr linux-3.2.33-go.orig/arch/arm/mach-kirkwood/Makefile linux-3.2.33-go/arch/arm/mach-kirkwood/Makefile +--- linux-3.2.33-go.orig/arch/arm/mach-kirkwood/Makefile 2012-11-14 21:20:22.326388580 +0100 ++++ linux-3.2.33-go/arch/arm/mach-kirkwood/Makefile 2012-11-14 21:22:20.882968794 +0100 +@@ -19,5 +19,6 @@ + obj-$(CONFIG_MACH_NET5BIG_V2) += netxbig_v2-setup.o lacie_v2-common.o + obj-$(CONFIG_MACH_T5325) += t5325-setup.o + obj-$(CONFIG_MACH_LSXHL) += lsxhl-setup.o ++obj-$(CONFIG_MACH_LSVL) += lsvl-setup.o + + obj-$(CONFIG_CPU_IDLE) += cpuidle.o +diff -uNr linux-3.2.33-go.orig/arch/arm/tools/mach-types linux-3.2.33-go/arch/arm/tools/mach-types +--- linux-3.2.33-go.orig/arch/arm/tools/mach-types 2012-11-14 21:20:22.348388327 +0100 ++++ linux-3.2.33-go/arch/arm/tools/mach-types 2012-11-14 21:21:02.356908648 +0100 +@@ -118,6 +118,7 @@ + omap_osk MACH_OMAP_OSK OMAP_OSK 515 + tosa MACH_TOSA TOSA 520 + avila MACH_AVILA AVILA 526 ++lsvl MACH_LSVL LSVL 5277 + edb9302 MACH_EDB9302 EDB9302 538 + husky MACH_HUSKY HUSKY 543 + shepherd MACH_SHEPHERD SHEPHERD 545 diff --git a/3.3.8/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-WVL.patch b/3.3.8/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-WVL.patch new file mode 100644 index 0000000..ba58859 --- /dev/null +++ b/3.3.8/v3.2-ARM-kirkwood-Add-support-for-Buffalo-LS-WVL.patch @@ -0,0 +1,538 @@ +diff -uNr linux-3.2.34-go.orig/arch/arm/mach-kirkwood/Kconfig linux-3.2.34-go/arch/arm/mach-kirkwood/Kconfig +--- linux-3.2.34-go.orig/arch/arm/mach-kirkwood/Kconfig 2012-11-19 21:03:42.654743005 +0100 ++++ linux-3.2.34-go/arch/arm/mach-kirkwood/Kconfig 2012-11-19 21:04:02.744505974 +0100 +@@ -148,6 +148,12 @@ + Say 'Y' here if you want your kernel to support the + Buffalo LS-CHLv2 Series. + ++config MACH_LSWVL ++ bool "Buffalo LS-WVL Series" ++ help ++ Say 'Y' here if you want your kernel to support the ++ Buffalo LS-WVL/E-AP NAS ++ + endmenu + + endif +diff -uNr linux-3.2.34-go.orig/arch/arm/mach-kirkwood/lswvl-setup.c linux-3.2.34-go/arch/arm/mach-kirkwood/lswvl-setup.c +--- linux-3.2.34-go.orig/arch/arm/mach-kirkwood/lswvl-setup.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.2.34-go/arch/arm/mach-kirkwood/lswvl-setup.c 2012-11-19 21:04:02.745505962 +0100 +@@ -0,0 +1,366 @@ ++/* ++ * arch/arm/mach-kirkwood/lswvl-setup.c ++ * ++ * Buffalo LS-WVL Series Setup ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * License version 2. This program is licensed "as is" without any ++ * warranty of any kind, whether express or implied. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "common.h" ++#include "mpp.h" ++ ++ ++/***************************************************************************** ++ * 512MB NAND Flash on Device bus CS0 ++ ****************************************************************************/ ++static struct mtd_partition lswvl_nand_parts[] = { ++ { ++ .name = "boot", ++ .offset = 0, ++ .size = 16 * 1024 * 1024, ++ }, { ++ .name = "rootfs", ++ .offset = MTDPART_OFS_NXTBLK, ++ .size = 488 * 1024 * 1024, ++ }, { ++ .name = "reserve", ++ .offset = MTDPART_OFS_NXTBLK, ++ .size = MTDPART_SIZ_FULL, ++ }, ++}; ++ ++/***************************************************************************** ++ * 512KB NOR Flash on BOOT Device ++ ****************************************************************************/ ++static struct mtd_partition lswvl_partitions[] = { ++ { ++ .name = "u-boot", ++ .size = 0x80000, ++ .offset = 0x00000, ++ .mask_flags = MTD_WRITEABLE, /* force read-only */ ++ }, ++}; ++ ++static struct flash_platform_data lswvl_spi_slave_data = { ++ .parts = lswvl_partitions, ++ .nr_parts = ARRAY_SIZE(lswvl_partitions), ++}; ++ ++static struct spi_board_info __initdata lswvl_spi_slave_info[] = { ++ { ++ .modalias = "m25p80", ++ .platform_data = &lswvl_spi_slave_data, ++ .irq = -1, ++ .max_speed_hz = 20000000, ++ .bus_num = 0, ++ .chip_select = 0, ++ }, ++}; ++ ++/***************************************************************************** ++ * Ethernet ++ ****************************************************************************/ ++static struct mv643xx_eth_platform_data lswvl_ge00_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(0), ++}; ++ ++/***************************************************************************** ++ * SATA ++ ****************************************************************************/ ++static struct mv_sata_platform_data lswvl_sata_data = { ++ .n_ports = 2, ++}; ++ ++/***************************************************************************** ++ * LEDs attached to GPIO ++ ****************************************************************************/ ++#define LSWVL_GPIO_LED_HDDERR0 34 ++#define LSWVL_GPIO_LED_HDDERR1 35 ++#define LSWVL_GPIO_LED_ALARM 36 ++#define LSWVL_GPIO_LED_FUNC_RED 37 ++#define LSWVL_GPIO_LED_INFO 38 ++#define LSWVL_GPIO_LED_FUNC_BLUE 39 ++#define LSWVL_GPIO_LED_PWR 40 ++ ++static struct gpio_led lswvl_led_pins[] = { ++ { ++ .name = "lswvl:hdderr:0", ++ .gpio = LSWVL_GPIO_LED_HDDERR0, ++ }, { ++ .name = "lswvl:hdderr:1", ++ .gpio = LSWVL_GPIO_LED_HDDERR1, ++ }, { ++ .name = "lswvl:alarm:red", ++ .gpio = LSWVL_GPIO_LED_ALARM, ++ }, { ++ .name = "lswvl:func:red", ++ .gpio = LSWVL_GPIO_LED_FUNC_RED, ++ }, { ++ .name = "lswvl:info:amber", ++ .gpio = LSWVL_GPIO_LED_INFO, ++ }, { ++ .name = "lswvl:func:blue", ++ .gpio = LSWVL_GPIO_LED_FUNC_BLUE, ++ }, { ++ .name = "lswvl:power:blue", ++ .default_trigger = "default-on", ++ .gpio = LSWVL_GPIO_LED_PWR, ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_led_platform_data lswvl_led_data = { ++ .leds = lswvl_led_pins, ++ .num_leds = ARRAY_SIZE(lswvl_led_pins), ++}; ++ ++static struct platform_device lswvl_leds = { ++ .name = "leds-gpio", ++ .id = -1, ++ .dev = { ++ .platform_data = &lswvl_led_data, ++ } ++}; ++ ++/***************************************************************************** ++ * General Setup ++ 
****************************************************************************/ ++#define LSWVL_GPIO_HDD0_POWER 8 ++#define LSWVL_GPIO_HDD1_POWER 9 ++#define LSWVL_GPIO_USB_POWER 12 ++ ++/***************************************************************************** ++ * GPIO Attached Keys ++ ****************************************************************************/ ++#define LSWVL_GPIO_KEY_FUNC 45 ++#define LSWVL_GPIO_KEY_POWER 46 ++#define LSWVL_GPIO_KEY_AUTOPOWER 47 ++#define LSWVL_SW_POWER 0x00 ++#define LSWVL_SW_AUTOPOWER 0x01 ++#define LSWVL_SW_FUNC 0x02 ++ ++static struct gpio_keys_button lswvl_buttons[] = { ++ { ++ .type = EV_KEY, ++ .code = BTN_1, ++ .gpio = LSWVL_GPIO_KEY_POWER, ++ .desc = "power-on", ++ .active_low = 1, ++ }, { ++ .type = EV_KEY, ++ .code = BTN_2, ++ .gpio = LSWVL_GPIO_KEY_AUTOPOWER, ++ .desc = "power-auto", ++ .active_low = 1, ++ }, { ++ .type = EV_KEY, ++ .code = BTN_0, ++ .gpio = LSWVL_GPIO_KEY_FUNC, ++ .desc = "function", ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_keys_platform_data lswvl_button_data = { ++ .buttons = lswvl_buttons, ++ .nbuttons = ARRAY_SIZE(lswvl_buttons), ++}; ++ ++static struct platform_device lswvl_button_device = { ++ .name = "gpio-keys", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lswvl_button_data, ++ }, ++}; ++ ++/***************************************************************************** ++ * GPIO Fan ++ ****************************************************************************/ ++#define LSWVL_GPIO_FAN_HIGH 16 ++#define LSWVL_GPIO_FAN_LOW 17 ++#define LSWVL_GPIO_FAN_LOCK 43 ++ ++static struct gpio_fan_alarm lswvl_alarm = { ++ .gpio = LSWVL_GPIO_FAN_LOCK, ++}; ++ ++static struct gpio_fan_speed lswvl_speeds[] = { ++ { ++ .rpm = 0, ++ .ctrl_val = 3, ++ }, { ++ .rpm = 1500, ++ .ctrl_val = 1, ++ }, { ++ .rpm = 3250, ++ .ctrl_val = 2, ++ }, { ++ .rpm = 5000, ++ .ctrl_val = 0, ++ } ++}; ++ ++static int lswvl_gpio_list[] = { ++ LSWVL_GPIO_FAN_HIGH, LSWVL_GPIO_FAN_LOW, ++}; ++ ++static struct gpio_fan_platform_data lswvl_fan_data = { ++ .num_ctrl = ARRAY_SIZE(lswvl_gpio_list), ++ .ctrl = lswvl_gpio_list, ++ .alarm = &lswvl_alarm, ++ .num_speed = ARRAY_SIZE(lswvl_speeds), ++ .speed = lswvl_speeds, ++}; ++ ++static struct platform_device lswvl_fan_device = { ++ .name = "gpio-fan", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lswvl_fan_data, ++ }, ++}; ++ ++/***************************************************************************** ++ * GPIO Data ++ ****************************************************************************/ ++ ++static unsigned int lswvl_mpp_config[] __initdata = { ++ MPP0_NF_IO2, ++ MPP1_NF_IO3, ++ MPP2_NF_IO4, ++ MPP3_NF_IO5, ++ MPP4_NF_IO6, ++ MPP5_NF_IO7, ++ MPP6_SYSRST_OUTn, ++ MPP7_SPI_SCn, ++ MPP8_GPIO, /* HDD Power */ ++ MPP9_GPIO, /* HDD Power */ ++ MPP10_UART0_TXD, ++ MPP11_UART0_RXD, ++ MPP12_GPO, /* USB VBUS EN */ ++ MPP13_GPIO, ++ MPP14_GPIO, ++ MPP15_GPIO, ++ MPP16_GPIO, /* FAN HIGH: on:0, off:1 */ ++ MPP17_GPIO, /* FAN LOW: on:0, off:1 */ ++ MPP18_NF_IO0, ++ MPP19_NF_IO1, ++ MPP20_GPIO, ++ MPP21_GPIO, ++ MPP22_GPIO, ++ MPP23_GPIO, ++ MPP24_GPIO, ++ MPP25_GPIO, ++ MPP26_GPIO, ++ MPP27_GPIO, ++ MPP28_GPIO, ++ MPP29_GPIO, ++ MPP30_GPIO, ++ MPP31_GPIO, ++ MPP32_GPIO, ++ MPP33_GPO, ++ MPP34_GPIO, /*HDD ERROR LED 0*/ ++ MPP35_GPIO, /*HDD ERROR LED 1*/ ++ MPP36_GPIO, /* ALARM LED */ ++ MPP37_GPIO, /* FUNC RED LED */ ++ MPP38_GPIO, /* INFO LED */ ++ MPP39_GPIO, /* FUNC LED */ ++ MPP40_GPIO, /* POWER LED */ ++ MPP41_GPIO, ++ MPP42_GPIO, ++ MPP43_GPIO, 
/* FAN LOCK */ ++ MPP44_GPIO, ++ MPP45_GPIO, /* FUNC SW */ ++ MPP46_GPIO, /* POWER SW */ ++ MPP47_GPIO, /* POWER AUTO SW */ ++ MPP48_GPIO, /* UART EN */ ++ MPP49_GPIO, ++ 0 ++}; ++ ++/***************************************************************************** ++ * LS-WVL specific power off method: reboot ++ ****************************************************************************/ ++/* ++ * On the LS-WVL, the shutdown process is following: ++ * - Userland monitors key events until the power switch goes to off position ++ * - The board reboots ++ * - U-boot starts and goes into an idle mode waiting for the user ++ * to move the switch to ON position ++ * ++ */ ++ ++static void lswvl_power_off(void) ++{ ++ kirkwood_restart('h', NULL); //arm_machine_restart('h', NULL); ++} ++ ++static void __init lswvl_init(void) ++{ ++ /* ++ * Basic setup. Needs to be called early. ++ */ ++ kirkwood_init(); ++ kirkwood_mpp_conf(lswvl_mpp_config); ++ ++ /* ++ * Configure peripherals. ++ */ ++ kirkwood_ge00_init(&lswvl_ge00_data); ++ kirkwood_uart0_init(); ++ kirkwood_uart1_init(); ++ kirkwood_ehci_init(); ++ kirkwood_sata_init(&lswvl_sata_data); ++ ++ spi_register_board_info(lswvl_spi_slave_info, ++ ARRAY_SIZE(lswvl_spi_slave_info)); ++ kirkwood_spi_init(); ++ kirkwood_nand_init(ARRAY_AND_SIZE(lswvl_nand_parts), 25); ++ ++ platform_device_register(&lswvl_leds); ++ platform_device_register(&lswvl_button_device); ++ platform_device_register(&lswvl_fan_device); ++ ++ /* usb power on */ ++ gpio_set_value(LSWVL_GPIO_USB_POWER, 1); ++ ++ /* register power-off method */ ++ pm_power_off = lswvl_power_off; ++ ++ pr_info("%s: finished\n", __func__); ++} ++ ++MACHINE_START(LSWVL, "Buffalo LS-WVL Series") ++ .atag_offset = 0x100, ++ .map_io = kirkwood_map_io, ++ .init_early = kirkwood_init_early, ++ .init_irq = kirkwood_init_irq, ++ .timer = &kirkwood_timer, ++ .init_machine = lswvl_init, ++ .restart = kirkwood_restart, ++MACHINE_END ++ +diff -uNr linux-3.2.34-go.orig/arch/arm/mach-kirkwood/Makefile linux-3.2.34-go/arch/arm/mach-kirkwood/Makefile +--- linux-3.2.34-go.orig/arch/arm/mach-kirkwood/Makefile 2012-11-19 21:03:42.653743017 +0100 ++++ linux-3.2.34-go/arch/arm/mach-kirkwood/Makefile 2012-11-19 21:04:42.686036907 +0100 +@@ -21,5 +21,6 @@ + obj-$(CONFIG_MACH_LINKSTATION_CHLV2) += lschlv2-setup.o + obj-$(CONFIG_MACH_LSXHL) += lsxhl-setup.o + obj-$(CONFIG_MACH_LSVL) += lsvl-setup.o ++obj-$(CONFIG_MACH_LSWVL) += lswvl-setup.o + + obj-$(CONFIG_CPU_IDLE) += cpuidle.o +diff -uNr linux-3.2.34-go.orig/arch/arm/plat-orion/mpp.c linux-3.2.34-go/arch/arm/plat-orion/mpp.c +--- linux-3.2.34-go.orig/arch/arm/plat-orion/mpp.c 2012-11-19 21:03:42.766741717 +0100 ++++ linux-3.2.34-go/arch/arm/plat-orion/mpp.c 2012-11-19 21:04:02.747505938 +0100 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + + /* Address of the ith MPP control register */ + static __init unsigned long mpp_ctrl_addr(unsigned int i, +@@ -75,3 +76,37 @@ + } + printk("\n"); + } ++ ++#ifdef CONFIG_MACH_LSWVL ++ ++static u32 boot_mpp_value = 0x21111111; ++/* ++ * change MPP[3:1] to SPI mode ++ */ ++void lswvl_setup_spi_mpp(void) ++{ ++ u32 spival = 0; ++ u32 bootval = 0; ++ ++ spival = 0x00002220; ++ boot_mpp_value = bootval = readl(mpp_ctrl_addr(0, DEV_BUS_VIRT_BASE)); ++ bootval &= 0xffff000f; ++ writel(spival | bootval, mpp_ctrl_addr(0, DEV_BUS_VIRT_BASE)); ++} ++ ++/* ++ * change back MPP[3:1] to default configuration ++ */ ++void lswvl_reset_mpp(void) ++{ ++ u32 spival = 0; ++ u32 bootval = 0; ++ ++ spival = readl(mpp_ctrl_addr(0, 
DEV_BUS_VIRT_BASE)); ++ spival &= 0xffff000f; ++ bootval = boot_mpp_value & ~0xffff000f; ++ writel(spival | bootval, mpp_ctrl_addr(0, DEV_BUS_VIRT_BASE)); ++} ++ ++#endif ++ +diff -uNr linux-3.2.34-go.orig/arch/arm/tools/mach-types linux-3.2.34-go/arch/arm/tools/mach-types +--- linux-3.2.34-go.orig/arch/arm/tools/mach-types 2012-11-19 21:03:42.675742765 +0100 ++++ linux-3.2.34-go/arch/arm/tools/mach-types 2012-11-19 21:22:29.653445807 +0100 +@@ -119,6 +119,7 @@ + tosa MACH_TOSA TOSA 520 + avila MACH_AVILA AVILA 526 + lsvl MACH_LSVL LSVL 5277 ++lswvl MACH_LSWVL LSWVL 5278 + edb9302 MACH_EDB9302 EDB9302 538 + husky MACH_HUSKY HUSKY 543 + shepherd MACH_SHEPHERD SHEPHERD 545 +diff -uNr linux-3.2.34-go.orig/drivers/spi/spi-orion.c linux-3.2.34-go/drivers/spi/spi-orion.c +--- linux-3.2.34-go.orig/drivers/spi/spi-orion.c 2012-11-19 21:03:41.809752734 +0100 ++++ linux-3.2.34-go/drivers/spi/spi-orion.c 2012-11-19 21:20:55.123558883 +0100 +@@ -19,6 +19,12 @@ + #include + #include + #include ++#include ++ ++#ifdef CONFIG_MACH_LSWVL ++void lswvl_setup_spi_mpp(void); ++void lswvl_reset_mpp(void); ++#endif + + #define DRIVER_NAME "orion_spi" + +@@ -141,6 +147,9 @@ + unsigned int bits_per_word = spi->bits_per_word; + int rc; + ++#ifdef CONFIG_MACH_LSWVL ++ lswvl_setup_spi_mpp(); ++#endif + orion_spi = spi_master_get_devdata(spi->master); + + if ((t != NULL) && t->speed_hz) +@@ -153,15 +162,37 @@ + if (rc) + return rc; + ++#ifdef CONFIG_MACH_LSWVL ++ rc = orion_spi_set_transfer_size(orion_spi, bits_per_word); ++ lswvl_reset_mpp(); ++ return rc; ++#else + return orion_spi_set_transfer_size(orion_spi, bits_per_word); ++#endif + } + + static void orion_spi_set_cs(struct orion_spi *orion_spi, int enable) + { + if (enable) ++#ifdef CONFIG_MACH_LSWVL ++ { ++ lswvl_setup_spi_mpp(); ++ udelay(1); ++ orion_spi_setbits(orion_spi, ORION_SPI_IF_CTRL_REG, 0x1); ++ } ++#else + orion_spi_setbits(orion_spi, ORION_SPI_IF_CTRL_REG, 0x1); ++#endif + else + orion_spi_clrbits(orion_spi, ORION_SPI_IF_CTRL_REG, 0x1); ++#ifdef CONFIG_MACH_LSWVL ++ { ++ orion_spi_clrbits(orion_spi, ORION_SPI_IF_CTRL_REG, 0x1); ++ lswvl_reset_mpp(); ++ } ++#else ++ orion_spi_clrbits(orion_spi, ORION_SPI_IF_CTRL_REG, 0x1); ++#endif + } + + static inline int orion_spi_wait_till_ready(struct orion_spi *orion_spi) +@@ -361,8 +392,17 @@ + + /* Fix ac timing if required. 
*/ + if (orion_spi->spi_info->enable_clock_fix) ++#ifdef CONFIG_MACH_LSWVL ++ { ++ lswvl_setup_spi_mpp(); ++ orion_spi_setbits(orion_spi, ORION_SPI_IF_CONFIG_REG, ++ (1 << 14)); ++ lswvl_reset_mpp(); ++ } ++#else + orion_spi_setbits(orion_spi, ORION_SPI_IF_CONFIG_REG, + (1 << 14)); ++#endif + + if ((spi->max_speed_hz == 0) + || (spi->max_speed_hz > orion_spi->max_speed)) diff --git a/3.3.8/v3.2-ARM-orion-Add-support-for-Buffalo-LS-PRODUO.patch b/3.3.8/v3.2-ARM-orion-Add-support-for-Buffalo-LS-PRODUO.patch new file mode 100644 index 0000000..bc8a882 --- /dev/null +++ b/3.3.8/v3.2-ARM-orion-Add-support-for-Buffalo-LS-PRODUO.patch @@ -0,0 +1,569 @@ +diff -uprN linux-3.4-rc7/arch/arm/configs/orion5x_defconfig linux-3.4-rc7-wtgl/arch/arm/configs/orion5x_defconfig +--- linux-3.4-rc7/arch/arm/configs/orion5x_defconfig 2012-05-12 19:37:47.000000000 -0600 ++++ linux-3.4-rc7-wtgl/arch/arm/configs/orion5x_defconfig 2012-08-16 23:41:47.118502384 -0600 +@@ -19,6 +19,7 @@ CONFIG_MACH_TS209=y + CONFIG_MACH_TERASTATION_PRO2=y + CONFIG_MACH_LINKSTATION_PRO=y + CONFIG_MACH_LINKSTATION_MINI=y ++CONFIG_MACH_LINKSTATION_PRODUO=y + CONFIG_MACH_LINKSTATION_LS_HGL=y + CONFIG_MACH_TS409=y + CONFIG_MACH_WRT350N_V2=y +diff -uprN linux-3.4-rc7/arch/arm/mach-orion5x/Kconfig linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/Kconfig +--- linux-3.4-rc7/arch/arm/mach-orion5x/Kconfig 2012-05-12 19:37:47.000000000 -0600 ++++ linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/Kconfig 2012-08-16 23:47:02.334496150 -0600 +@@ -65,13 +65,52 @@ config MACH_LINKSTATION_MINI + Say 'Y' here if you want your kernel to support the + Buffalo Linkstation Mini platform. + ++config MACH_LINKSTATION_PRODUO ++ bool "Buffalo Linkstation Pro Duo" ++ select I2C_BOARDINFO ++ help ++ Say 'Y' here if you want your kernel to support the ++ Buffalo Linkstation Pro Duo platform. ++ ++ LS-W1.0TGL/R1 is the general model number. There ++ is no /R3 models, as /R1 stands for RAID1. ++ There are two hardware revisions of the product. ++ ++ The first revision has version 1.xx firmware, 64 MB RAM, ++ a single USB port, a power BUTTON, an Auto/Manual ++ power MODE SWITCH, and a RESET button. ++ ++ The second revision has version 3.xx firmware, 128 MB RAM, ++ two USB ports, an Off/On/Auto power SWITCH, and a FUNCTION button. ++ ++ choice ++ prompt "HW model" ++ depends on MACH_LINKSTATION_PRODUO ++ default MACH_LINKSTATION_PRODUO_REV1 ++ default MACH_LINKSTATION_PRODUO_REV2 ++ ++ config MACH_LINKSTATION_PRODUO_REV1 ++ bool "Revision 1" ++ help ++ The first revision has version 1.xx firmware, 64 MB RAM, ++ a single USB port, a power BUTTON, an Auto/Manual ++ power MODE SWITCH, and a RESET button. ++ ++ config MACH_LINKSTATION_PRODUO_REV2 ++ bool "Revision 2" ++ help ++ The second revision has version 3.xx firmware, 128 MB RAM, ++ two USB ports, an Off/On/Auto power SWITCH, and a FUNCTION button. ++ endchoice ++ ++ + config MACH_LINKSTATION_LS_HGL + bool "Buffalo Linkstation LS-HGL" + select I2C_BOARDINFO + help + Say 'Y' here if you want your kernel to support the + Buffalo Linkstation LS-HGL platform. 
+- ++ + config MACH_TS409 + bool "QNAP TS-409" + help +diff -uprN linux-3.4-rc7/arch/arm/mach-orion5x/Makefile linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/Makefile +--- linux-3.4-rc7/arch/arm/mach-orion5x/Makefile 2012-05-12 19:37:47.000000000 -0600 ++++ linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/Makefile 2012-08-16 23:48:26.182494492 -0600 +@@ -5,6 +5,7 @@ obj-$(CONFIG_MACH_KUROBOX_PRO) += kurobo + obj-$(CONFIG_MACH_TERASTATION_PRO2) += terastation_pro2-setup.o + obj-$(CONFIG_MACH_LINKSTATION_PRO) += kurobox_pro-setup.o + obj-$(CONFIG_MACH_LINKSTATION_MINI) += lsmini-setup.o ++obj-$(CONFIG_MACH_LINKSTATION_PRODUO) += lsproduo-setup.o + obj-$(CONFIG_MACH_LINKSTATION_LS_HGL) += ls_hgl-setup.o + obj-$(CONFIG_MACH_DNS323) += dns323-setup.o + obj-$(CONFIG_MACH_TS209) += ts209-setup.o tsx09-common.o +diff -uprN linux-3.4-rc7/arch/arm/mach-orion5x/lsproduo-setup.c linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/lsproduo-setup.c +--- linux-3.4-rc7/arch/arm/mach-orion5x/lsproduo-setup.c 1969-12-31 17:00:00.000000000 -0700 ++++ linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/lsproduo-setup.c 2012-08-16 23:52:09.630490073 -0600 +@@ -0,0 +1,459 @@ ++/* ++ * arch/arm/mach-orion5x/lsproduo-setup.c ++ * ++ * Source taken from arch/arm/mach-orion5x/lsmini-setup.c - kernel 2.6.30 ++ * Maintainer: Matt Gomboc ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * License version 2. This program is licensed "as is" without any ++ * warranty of any kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "common.h" ++#include "mpp.h" ++#include ++#include ++#include ++ ++/***************************************************************************** ++ * Linkstation Pro Duo Info ++ ****************************************************************************/ ++ ++/* ++ * 256K NOR flash Device bus boot chip select ++ */ ++ ++#define LSPRODUO_NOR_BOOT_BASE 0xf4000000 ++#define LSPRODUO_NOR_BOOT_SIZE SZ_256K ++ ++/***************************************************************************** ++ * 256KB NOR Flash on BOOT Device ++ ****************************************************************************/ ++ ++static struct physmap_flash_data lsproduo_nor_flash_data = { ++ .width = 1, ++}; ++ ++static struct resource lsproduo_nor_flash_resource = { ++ .flags = IORESOURCE_MEM, ++ .start = LSPRODUO_NOR_BOOT_BASE, ++ .end = LSPRODUO_NOR_BOOT_BASE + LSPRODUO_NOR_BOOT_SIZE - 1, ++}; ++ ++static struct platform_device lsproduo_nor_flash = { ++ .name = "physmap-flash", ++ .id = 0, ++ .dev = { ++ .platform_data = &lsproduo_nor_flash_data, ++ }, ++ .num_resources = 1, ++ .resource = &lsproduo_nor_flash_resource, ++}; ++ ++/***************************************************************************** ++ * Ethernet ++ ****************************************************************************/ ++ ++static struct mv643xx_eth_platform_data lsproduo_eth_data = { ++ .phy_addr = 8, ++}; ++ ++/***************************************************************************** ++ * RTC 5C372a on I2C bus ++ ****************************************************************************/ ++ ++static struct i2c_board_info __initdata lsproduo_i2c_rtc = { ++ I2C_BOARD_INFO("rs5c372a", 0x32), ++}; ++ ++/***************************************************************************** ++ * LEDs attached to GPIO ++ ****************************************************************************/ ++ 
++#define LSPRODUO_GPIO_LED_ALARM 2 ++#define LSPRODUO_GPIO_LED_INFO 3 ++#define LSPRODUO_GPIO_LED_PWR 0 ++ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++ #define LSPRODUO_GPIO_LED_FUNC 18 ++#endif ++ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV1 ++static struct gpio_led lsproduo_led_pins[] = { ++ { ++ .name = "alarm:red", ++ .gpio = LSPRODUO_GPIO_LED_ALARM, ++ .active_low = 1, ++ }, { ++ .name = "info:amber", ++ .gpio = LSPRODUO_GPIO_LED_INFO, ++ .active_low = 1, ++ }, { ++ .name = "power:greem", ++ .gpio = LSPRODUO_GPIO_LED_PWR, ++ .active_low = 1, ++ }, ++}; ++#endif ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++static struct gpio_led lsproduo_led_pins[] = { ++ { ++ .name = "alarm:red", ++ .gpio = LSPRODUO_GPIO_LED_ALARM, ++ .active_low = 1, ++ }, { ++ .name = "info:amber", ++ .gpio = LSPRODUO_GPIO_LED_INFO, ++ .active_low = 1, ++ }, { ++ .name = "power:green", ++ .gpio = LSPRODUO_GPIO_LED_PWR, ++ .active_low = 1, ++ },{ ++ .name = "func:blue", ++ .gpio = LSPRODUO_GPIO_LED_FUNC, ++ .active_low = 1, ++ }, ++}; ++#endif ++ ++ ++ ++static struct gpio_led_platform_data lsproduo_led_data = { ++ .leds = lsproduo_led_pins, ++ .num_leds = ARRAY_SIZE(lsproduo_led_pins), ++}; ++ ++static struct platform_device lsproduo_leds = { ++ .name = "leds-gpio", ++ .id = -1, ++ .dev = { ++ .platform_data = &lsproduo_led_data, ++ }, ++}; ++ ++/**************************************************************************** ++ * GPIO Attached Keys ++ ****************************************************************************/ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV1 ++ #define LSPRODUO_GPIO_KEY_POWER 8 ++ #define LSPRODUO_GPIO_KEY_AUTOPOWER 10 ++ ++ #define LSPRODUO_SW_POWER 0x00 ++ #define LSPRODUO_SW_AUTOPOWER 0x01 ++ ++static struct gpio_keys_button lsproduo_buttons[] = { ++ { ++ .type = EV_SW, ++ .code = LSPRODUO_SW_POWER, ++ .gpio = LSPRODUO_GPIO_KEY_POWER, ++ .desc = "Power-on Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSPRODUO_SW_AUTOPOWER, ++ .gpio = LSPRODUO_GPIO_KEY_AUTOPOWER, ++ .desc = "Power-auto Switch", ++ .active_low = 1, ++ }, ++}; ++ ++#endif ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++ #define LSPRODUO_GPIO_KEY_POWER 10 ++ #define LSPRODUO_GPIO_KEY_AUTOPOWER 22 ++ #define LSPRODUO_GPIO_KEY_FUNC 8 ++ ++ #define LSPRODUO_SW_POWER 0x00 ++ #define LSPRODUO_SW_AUTOPOWER 0x01 ++ ++static struct gpio_keys_button lsproduo_buttons[] = { ++ { ++ .code = KEY_OPTION, ++ .gpio = LSPRODUO_GPIO_KEY_FUNC, ++ .desc = "Function Button", ++ .active_low = 1, ++ },{ ++ .type = EV_SW, ++ .code = LSPRODUO_SW_POWER, ++ .gpio = LSPRODUO_GPIO_KEY_POWER, ++ .desc = "Power-on Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSPRODUO_SW_AUTOPOWER, ++ .gpio = LSPRODUO_GPIO_KEY_AUTOPOWER, ++ .desc = "Power-auto Switch", ++ .active_low = 1, ++ }, ++}; ++ ++#endif ++ ++static struct gpio_keys_platform_data lsproduo_button_data = { ++ .buttons = lsproduo_buttons, ++ .nbuttons = ARRAY_SIZE(lsproduo_buttons), ++}; ++ ++static struct platform_device lsproduo_button_device = { ++ .name = "gpio-keys", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lsproduo_button_data, ++ }, ++}; ++ ++/**************************************************************************** ++ * GPIO Attached Fan ++ ****************************************************************************/ ++ ++/* Define max char len */ ++#define MAX_LEN 8 ++ ++#define LSPRODUO_GPIO_FAN_LOW 17 ++#define LSPRODUO_GPIO_FAN_HIGH 14 ++ ++static struct proc_dir_entry *lsproduo_proc_dir_root, 
*lsproduo_proc_dir_gpio, *lsproduo_fan_proc_file; ++static char lsproduo_fan_state[MAX_LEN]; ++ ++static int lsproduo_fan_get(char *buf, char **start, off_t offset, int count, int *eof, void *data) ++{ ++ int len; ++ ++ len = snprintf(buf, count, "state: %s\n", lsproduo_fan_state); ++ return len; ++} ++ ++static int lsproduo_fan_set( struct file *file, const char *buffer, unsigned long count, void *data ) ++{ ++ int len, ret; ++ char *ptr, tState[MAX_LEN]; ++ ++ if (count > MAX_LEN ) ++ len = MAX_LEN; ++ else ++ len = count; ++ ++ ret = copy_from_user(tState, buffer, len); ++ if(ret < 0) ++ { ++ printk(KERN_ERR "%s: Setting fan speed failed\n", "lsproduo"); ++ return -EFAULT; ++ } ++ ++ ptr = strrchr(tState, '\n'); ++ if(ptr) *ptr = '\0'; ++ ++ if (strcasecmp(tState, "off") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan off\n", "lsproduo"); ++ sprintf(lsproduo_fan_state, "off"); ++ gpio_set_value(LSPRODUO_GPIO_FAN_LOW, 1); ++ gpio_set_value(LSPRODUO_GPIO_FAN_HIGH, 1); ++ } else if (strcasecmp(tState, "slow") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan slow\n", "lsproduo"); ++ sprintf(lsproduo_fan_state, "slow"); ++ gpio_set_value(LSPRODUO_GPIO_FAN_LOW, 1); ++ gpio_set_value(LSPRODUO_GPIO_FAN_HIGH, 0); ++ } else if (strcasecmp(tState, "fast") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan fast\n", "lsproduo"); ++ sprintf(lsproduo_fan_state, "fast"); ++ gpio_set_value(LSPRODUO_GPIO_FAN_LOW, 0); ++ gpio_set_value(LSPRODUO_GPIO_FAN_HIGH, 1); ++ } else if (strcasecmp(tState, "full") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan full\n", "lsproduo"); ++ sprintf(lsproduo_fan_state, "full"); ++ gpio_set_value(LSPRODUO_GPIO_FAN_LOW, 0); ++ gpio_set_value(LSPRODUO_GPIO_FAN_HIGH, 0); ++ } else ++ { ++ printk(KERN_ERR "%s: unknown fan speed given\n", "lsproduo"); ++ } ++ ++ lsproduo_fan_state[len] = '\0'; ++ ++ return len; ++} ++ ++/***************************************************************************** ++ * SATA ++ ****************************************************************************/ ++static struct mv_sata_platform_data lsproduo_sata_data = { ++ .n_ports = 2, ++}; ++ ++ ++/***************************************************************************** ++ * Linkstation Pro Duo specific power off method: reboot ++ ****************************************************************************/ ++/* ++ * On the Linkstation Pro Duo, the shutdown process is following: ++ * - Userland monitors key events until the power switch goes to off position ++ * - The board reboots ++ * - U-boot starts and goes into an idle mode waiting for the user ++ * to move the switch to ON position ++ */ ++ ++static void lsproduo_power_off(void) ++{ ++ orion5x_restart('h', NULL); ++ /* arm_machine_restart(0, NULL); */ ++} ++ ++ ++/***************************************************************************** ++ * General Setup ++ ****************************************************************************/ ++#define LSPRODUO_GPIO_HDD_POWER0 1 ++#define LSPRODUO_GPIO_USB_POWER 9 ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV1 ++ #define LSPRODUO_GPIO_POWER 8 ++ #define LSPRODUO_GPIO_AUTO_POWER 10 ++#endif ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++ #define LSPRODUO_GPIO_POWER 10 ++ #define LSPRODUO_GPIO_USB_POWER2 19 ++ #define LSPRODUO_GPIO_AUTO_POWER 22 ++#endif ++ ++static unsigned int lsproduo_mpp_modes[] __initdata = { ++ MPP0_GPIO, /* LED_PWR */ ++ MPP1_GPIO, /* HDD_PWR */ ++ MPP2_GPIO, /* LED_ALARM */ ++ MPP3_GPIO, /* LED_INFO */ ++ MPP4_UNUSED, ++ MPP5_UNUSED, ++ MPP6_GPIO, /* FAN_LCK */ ++ MPP9_GPIO, /* USB_PWR 
*/ ++ MPP11_UNUSED, /* LED_ETH dummy */ ++ MPP12_UNUSED, ++ MPP13_UNUSED, ++ MPP14_GPIO, /* FAN_HIGH */ ++ MPP15_UNUSED, ++ MPP16_UNUSED, ++ MPP17_GPIO, /* FAN_LOW */ ++ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV1 ++ MPP7_GPIO, /* INIT */ ++ MPP8_GPIO, /* POWER */ ++ MPP10_GPIO, /* AUTO_POWER */ ++ MPP18_UNUSED, ++ MPP19_UNUSED, ++#endif ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++ MPP7_UNUSED, ++ MPP8_GPIO, /* FUNC */ ++ MPP10_GPIO, /* POWER */ ++ MPP18_GPIO, /* LED_FUNC*/ ++ MPP19_GPIO, /* USB_PWR2 */ ++ MPP22_GPIO, /* AUTO_POWER */ ++#endif ++ 0, ++}; ++ ++static void __init lsproduo_init(void) ++{ ++ /* ++ * Setup basic Orion functions. Need to be called early. ++ */ ++ orion5x_init(); ++ ++ orion5x_mpp_conf(lsproduo_mpp_modes); ++ ++ /* ++ * Configure peripherals. ++ */ ++ orion5x_ehci0_init(); ++ orion5x_ehci1_init(); ++ orion5x_eth_init(&lsproduo_eth_data); ++ orion5x_i2c_init(); ++ orion5x_sata_init(&lsproduo_sata_data); ++ orion5x_uart0_init(); ++ orion5x_xor_init(); ++ ++ orion5x_setup_dev_boot_win(LSPRODUO_NOR_BOOT_BASE, ++ LSPRODUO_NOR_BOOT_SIZE); ++ platform_device_register(&lsproduo_nor_flash); ++ ++ platform_device_register(&lsproduo_button_device); ++ ++ platform_device_register(&lsproduo_leds); ++ ++ i2c_register_board_info(0, &lsproduo_i2c_rtc, 1); ++ ++ /* enable USB power */ ++ gpio_set_value(LSPRODUO_GPIO_USB_POWER, 1); ++ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++ gpio_set_value(LSPRODUO_GPIO_USB_POWER2, 1); ++#endif ++ ++ printk(KERN_INFO "Buffalo Linkstation Pro Duo fan driver loaded\n"); ++ sprintf(lsproduo_fan_state, "fast"); ++ gpio_set_value(LSPRODUO_GPIO_FAN_LOW, 1); ++ gpio_set_value(LSPRODUO_GPIO_FAN_HIGH, 0); ++ ++ lsproduo_proc_dir_root = proc_mkdir( "linkstation", NULL ); ++ lsproduo_proc_dir_gpio = proc_mkdir( "gpio", lsproduo_proc_dir_root ); ++ lsproduo_fan_proc_file = create_proc_entry( "fan", S_IRUGO, lsproduo_proc_dir_gpio ); ++ if( lsproduo_fan_proc_file ) { ++ lsproduo_fan_proc_file->read_proc = lsproduo_fan_get; ++ lsproduo_fan_proc_file->write_proc = lsproduo_fan_set; ++ lsproduo_fan_proc_file->data = NULL; ++ } else ++ { ++ printk(KERN_INFO "Registration of fan device failed\n"); ++ } ++ ++ /* register power-off method */ ++ pm_power_off = lsproduo_power_off; ++ ++ pr_info("%s: finished\n", __func__); ++} ++ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV1 ++MACHINE_START(LINKSTATION_PRODUO, "Buffalo Linkstation Pro Duo - Revision 1") ++ .atag_offset = 0x00000100, ++ .init_machine = lsproduo_init, ++ .map_io = orion5x_map_io, ++ .init_early = orion5x_init_early, ++ .init_irq = orion5x_init_irq, ++ .timer = &orion5x_timer, ++ .fixup = tag_fixup_mem32, ++ .restart = orion5x_restart, ++MACHINE_END ++#endif ++ ++#ifdef CONFIG_MACH_LINKSTATION_PRODUO_REV2 ++MACHINE_START(LINKSTATION_PRODUO, "Buffalo Linkstation Pro Duo - Revision 2") ++ .atag_offset = 0x00000100, ++ .init_machine = lsproduo_init, ++ .map_io = orion5x_map_io, ++ .init_early = orion5x_init_early, ++ .init_irq = orion5x_init_irq, ++ .timer = &orion5x_timer, ++ .fixup = tag_fixup_mem32, ++ .restart = orion5x_restart, ++MACHINE_END ++#endif ++ ++ ++ +diff -uprN linux-3.4-rc7/arch/arm/mach-orion5x/mpp.h linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/mpp.h +--- linux-3.4-rc7/arch/arm/mach-orion5x/mpp.h 2012-05-12 19:37:47.000000000 -0600 ++++ linux-3.4-rc7-wtgl/arch/arm/mach-orion5x/mpp.h 2012-08-16 22:15:34.000000000 -0600 +@@ -122,7 +122,10 @@ + #define MPP19_GIGE MPP(19, 0x1, 0, 0, 1, 1, 1) + #define MPP19_UART MPP(19, 0x0, 0, 0, 0, 1, 1) + +-#define MPP_MAX 19 ++#define MPP22_GPIO 
MPP(22, 0x5, 1, 1, 0, 1, 0) ++ ++ ++#define MPP_MAX 22 + + void orion5x_mpp_conf(unsigned int *mpp_list); + +diff -uprN linux-3.4-rc7/arch/arm/tools/mach-types linux-3.4-rc7-wtgl/arch/arm/tools/mach-types +--- linux-3.4-rc7/arch/arm/tools/mach-types 2012-05-12 19:37:47.000000000 -0600 ++++ linux-3.4-rc7-wtgl/arch/arm/tools/mach-types 2012-08-16 23:43:59.830499760 -0600 +@@ -333,6 +333,8 @@ smdkc100 MACH_SMDKC100 SMDKC100 1826 + tavorevb MACH_TAVOREVB TAVOREVB 1827 + saar MACH_SAAR SAAR 1828 + at91sam9m10g45ek MACH_AT91SAM9M10G45EK AT91SAM9M10G45EK 1830 ++linkstation_produo MACH_LINKSTATION_PRODUO LINKSTATION_PRODUO 1831 ++##see header for btaining a new version, preferred to patching + usb_a9g20 MACH_USB_A9G20 USB_A9G20 1841 + mxlads MACH_MXLADS MXLADS 1851 + linkstation_mini MACH_LINKSTATION_MINI LINKSTATION_MINI 1858 diff --git a/3.3.8/v3.3-ARM-kirkwood-Add-support-for-Buffalo-LS-CHLv2.patch b/3.3.8/v3.3-ARM-kirkwood-Add-support-for-Buffalo-LS-CHLv2.patch new file mode 100644 index 0000000..452865b --- /dev/null +++ b/3.3.8/v3.3-ARM-kirkwood-Add-support-for-Buffalo-LS-CHLv2.patch @@ -0,0 +1,278 @@ +diff -uNr linux-3.3.8-go.orig/arch/arm/mach-kirkwood/common.c linux-3.3.8-go/arch/arm/mach-kirkwood/common.c +--- linux-3.3.8-go.orig/arch/arm/mach-kirkwood/common.c 2012-11-22 21:47:03.726499285 +0100 ++++ linux-3.3.8-go/arch/arm/mach-kirkwood/common.c 2012-11-22 21:50:05.917342298 +0100 +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + #include "common.h" + + /***************************************************************************** +diff -uNr linux-3.3.8-go.orig/arch/arm/mach-kirkwood/include/mach/system.h linux-3.3.8-go/arch/arm/mach-kirkwood/include/mach/system.h +--- linux-3.3.8-go.orig/arch/arm/mach-kirkwood/include/mach/system.h 2012-11-22 21:47:03.725499297 +0100 ++++ linux-3.3.8-go/arch/arm/mach-kirkwood/include/mach/system.h 2012-11-22 21:49:41.395632387 +0100 +@@ -9,6 +9,10 @@ + #ifndef __ASM_ARCH_SYSTEM_H + #define __ASM_ARCH_SYSTEM_H + ++#include ++#include ++#include ++ + static inline void arch_idle(void) + { + cpu_do_idle(); +diff -uNr linux-3.3.8-go.orig/arch/arm/mach-kirkwood/Kconfig linux-3.3.8-go/arch/arm/mach-kirkwood/Kconfig +--- linux-3.3.8-go.orig/arch/arm/mach-kirkwood/Kconfig 2012-11-22 21:47:03.726499285 +0100 ++++ linux-3.3.8-go/arch/arm/mach-kirkwood/Kconfig 2012-11-22 21:47:26.775227628 +0100 +@@ -148,6 +148,12 @@ + Say 'Y' here if you want your kernel to support the + Buffalo LS-WVL/E-AP NAS + ++config MACH_LINKSTATION_CHLV2 ++ bool "Buffalo LS-CHLv2 Series" ++ help ++ Say 'Y' here if you want your kernel to support the ++ Buffalo LS-CHLv2 Series. ++ + endmenu + + endif +diff -uNr linux-3.3.8-go.orig/arch/arm/mach-kirkwood/lschlv2-setup.c linux-3.3.8-go/arch/arm/mach-kirkwood/lschlv2-setup.c +--- linux-3.3.8-go.orig/arch/arm/mach-kirkwood/lschlv2-setup.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-go/arch/arm/mach-kirkwood/lschlv2-setup.c 2012-11-22 21:47:26.775227628 +0100 +@@ -0,0 +1,211 @@ ++/* ++ * arch/arm/mach-kirkwood/lschlv2-setup.c ++ * ++ * Buffalo LS Kirkwood Series Setup ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * License version 2. This program is licensed "as is" without any ++ * warranty of any kind, whether express or implied. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "include/mach/system.h" ++#include ++#include "common.h" ++#include "mpp.h" ++ ++/***************************************************************************** ++ * 512KB SPI Flash on BOOT Device ++ ****************************************************************************/ ++static struct mtd_partition lschlv2_partitions[] = { ++ { ++ .name = "u-boot", ++ .offset = 0x00000, ++ .size = 0x70000, ++ .mask_flags = MTD_WRITEABLE, ++ }, ++ { ++ .name = "u-boot env", ++ .offset = MTDPART_OFS_APPEND, ++ .size = 0x10000, ++ } ++}; ++ ++static struct flash_platform_data lschlv2_spi_slave_data = { ++ .type = "m25p40", ++ .parts = lschlv2_partitions, ++ .nr_parts = ARRAY_SIZE(lschlv2_partitions), ++}; ++ ++static struct spi_board_info __initdata lschlv2_spi_slave_info[] = { ++ { ++ .modalias = "m25p80", ++ .platform_data = &lschlv2_spi_slave_data, ++ .irq = -1, ++ .max_speed_hz = 20000000, ++ .bus_num = 0, ++ .chip_select = 0, ++ } ++}; ++ ++static struct mv643xx_eth_platform_data lschlv2_ge00_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(0), ++}; ++ ++static struct mv643xx_eth_platform_data lschlv2_ge01_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(8), ++}; ++ ++static unsigned int lschlv2_mpp_config[] __initdata = { ++ MPP10_GPO, /* HDD Power */ ++ MPP11_GPIO, /* USB Vbus Power */ ++ MPP18_GPO, /* FAN High on:0, off:1 */ ++ MPP19_GPO, /* FAN Low on:0, off:1 */ ++ MPP36_GPIO, /* FUNC LED */ ++ MPP37_GPIO, /* ALARM LED */ ++ MPP38_GPIO, /* INFO LED */ ++ MPP39_GPIO, /* POWER LED */ ++ MPP40_GPIO, /* FAN LOCK */ ++ MPP41_GPIO, /* FUNC SW */ ++ MPP42_GPIO, /* POWER SW */ ++ MPP43_GPIO, /* POWER AUTO SW */ ++ MPP48_GPIO, /* FUNC RED LED */ ++ MPP49_GPIO, /* UART EN */ ++ 0 ++}; ++ ++static struct mv_sata_platform_data lschlv2_sata_data = { ++ .n_ports = 1, ++}; ++ ++static struct gpio_led lschlv2_led_pins[] = { ++ { ++ .name = "func", ++ .gpio = 36, ++ .active_low = 1, ++ }, ++ { ++ .name = "alarm", ++ .gpio = 37, ++ .active_low = 1, ++ }, ++ { ++ .name = "info", ++ .gpio = 38, ++ .active_low = 1, ++ }, ++ { ++ .name = "power", ++ .gpio = 39, ++ .default_trigger = "default-on", ++ .active_low = 1, ++ }, ++ { ++ .name = "func2", ++ .gpio = 48, ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_led_platform_data lschlv2_led_data = { ++ .leds = lschlv2_led_pins, ++ .num_leds = ARRAY_SIZE(lschlv2_led_pins), ++}; ++ ++static struct platform_device lschlv2_leds = { ++ .name = "leds-gpio", ++ .id = -1, ++ .dev = { ++ .platform_data = &lschlv2_led_data, ++ } ++}; ++ ++#define LSCHLv2_GPIO_USB_VBUS_EN 11 ++#define LSCHLv2_GPIO_KEY_FUNC 41 ++ ++static struct gpio_keys_button lschlv2_buttons[] = { ++ { ++ .code = KEY_OPTION, ++ .gpio = LSCHLv2_GPIO_KEY_FUNC, ++ .desc = "Function Button", ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_keys_platform_data lschlv2_button_data = { ++ .buttons = lschlv2_buttons, ++ .nbuttons = ARRAY_SIZE(lschlv2_buttons), ++}; ++ ++static struct platform_device lschlv2_button_device = { ++ .name = "gpio-keys", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lschlv2_button_data, ++ }, ++}; ++ ++static void lschlv2_power_off(void) ++{ ++ kirkwood_restart('h', NULL); //arch_reset(0, NULL); ++} ++ ++static void __init lschlv2_init(void) ++{ ++ /* ++ * Basic setup. Needs to be called early. 
++ */ ++ kirkwood_init(); ++ kirkwood_mpp_conf(lschlv2_mpp_config); ++ ++ kirkwood_uart0_init(); ++ ++ if (gpio_request(LSCHLv2_GPIO_USB_VBUS_EN, "USB Power Enable") != 0 || ++ gpio_direction_output(LSCHLv2_GPIO_USB_VBUS_EN, 1) != 0) ++ printk(KERN_ERR "can't set up USB Power Enable\n"); ++ kirkwood_ehci_init(); ++ ++ kirkwood_ge00_init(&lschlv2_ge00_data); ++ kirkwood_ge01_init(&lschlv2_ge01_data); ++ ++ kirkwood_sata_init(&lschlv2_sata_data); ++ ++ kirkwood_spi_init(); ++ ++ platform_device_register(&lschlv2_leds); ++ platform_device_register(&lschlv2_button_device); ++ ++ spi_register_board_info(lschlv2_spi_slave_info, ++ ARRAY_SIZE(lschlv2_spi_slave_info)); ++ ++ /* register power-off method */ ++ pm_power_off = lschlv2_power_off; ++ ++ pr_info("%s: finished\n", __func__); ++} ++ ++ ++ ++MACHINE_START(LINKSTATION_CHLV2, "Buffalo Linkstation LS-CHLv2") ++ .atag_offset = 0x100, ++ .init_machine = lschlv2_init, ++ .map_io = kirkwood_map_io, ++ .init_early = kirkwood_init_early, ++ .init_irq = kirkwood_init_irq, ++ .timer = &kirkwood_timer, ++ .restart = kirkwood_restart, ++MACHINE_END +diff -uNr linux-3.3.8-go.orig/arch/arm/mach-kirkwood/Makefile linux-3.3.8-go/arch/arm/mach-kirkwood/Makefile +--- linux-3.3.8-go.orig/arch/arm/mach-kirkwood/Makefile 2012-11-22 21:47:03.726499285 +0100 ++++ linux-3.3.8-go/arch/arm/mach-kirkwood/Makefile 2012-11-22 21:47:26.775227628 +0100 +@@ -18,6 +18,7 @@ + obj-$(CONFIG_MACH_NET2BIG_V2) += netxbig_v2-setup.o lacie_v2-common.o + obj-$(CONFIG_MACH_NET5BIG_V2) += netxbig_v2-setup.o lacie_v2-common.o + obj-$(CONFIG_MACH_T5325) += t5325-setup.o ++obj-$(CONFIG_MACH_LINKSTATION_CHLV2) += lschlv2-setup.o + obj-$(CONFIG_MACH_LSXHL) += lsxhl-setup.o + obj-$(CONFIG_MACH_LSVL) += lsvl-setup.o + obj-$(CONFIG_MACH_LSWVL) += lswvl-setup.o +diff -uNr linux-3.3.8-go.orig/arch/arm/tools/mach-types linux-3.3.8-go/arch/arm/tools/mach-types +--- linux-3.3.8-go.orig/arch/arm/tools/mach-types 2012-11-24 05:06:18.763371700 +0100 ++++ linux-3.3.8-go/arch/arm/tools/mach-types 2012-11-24 05:06:59.092895630 +0100 +@@ -522,6 +522,7 @@ + dockstar MACH_DOCKSTAR DOCKSTAR 2998 + ti8148evm MACH_TI8148EVM TI8148EVM 3004 + seaboard MACH_SEABOARD SEABOARD 3005 ++linkstation_chlv2 MACH_LINKSTATION_CHLV2 LINKSTATION_CHLV2 3006 + mx53_ard MACH_MX53_ARD MX53_ARD 3010 + mx53_smd MACH_MX53_SMD MX53_SMD 3011 + msm8x60_rumi3 MACH_MSM8X60_RUMI3 MSM8X60_RUMI3 3016 diff --git a/3.3.8/v3.3-ARM-kirkwood-Add-support-for-Buffalo-LS-XHL.patch b/3.3.8/v3.3-ARM-kirkwood-Add-support-for-Buffalo-LS-XHL.patch new file mode 100644 index 0000000..71e23b9 --- /dev/null +++ b/3.3.8/v3.3-ARM-kirkwood-Add-support-for-Buffalo-LS-XHL.patch @@ -0,0 +1,388 @@ +Add support for the Buffalo Linkstation XHL. This NAS box is based on a +Marvell Kirkwood chip at 1.2 GHz and features 256 MB RAM, 512kb SPI boot +flash, gigabit ethernet and one SATA port. 
+ +Signed-off-by: Michael Walle +--- + arch/arm/configs/kirkwood_defconfig | 1 + + arch/arm/mach-kirkwood/Kconfig | 6 + + arch/arm/mach-kirkwood/Makefile | 1 + + arch/arm/mach-kirkwood/lsxhl-setup.c | 313 ++++++++++++++++++++++++++++++++++ + arch/arm/tools/mach-types | 1 + + 5 files changed, 322 insertions(+), 0 deletions(-) + create mode 100644 arch/arm/mach-kirkwood/lsxhl-setup.c + +diff --git a/arch/arm/configs/kirkwood_defconfig b/arch/arm/configs/kirkwood_defconfig +index aeb3af5..9f77811 100644 +--- a/arch/arm/configs/kirkwood_defconfig ++++ b/arch/arm/configs/kirkwood_defconfig +@@ -28,6 +28,7 @@ CONFIG_MACH_D2NET_V2=y + CONFIG_MACH_NET2BIG_V2=y + CONFIG_MACH_NET5BIG_V2=y + CONFIG_MACH_T5325=y ++CONFIG_MACH_LSXHL=y + # CONFIG_CPU_FEROCEON_OLD_ID is not set + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +diff --git a/arch/arm/mach-kirkwood/Kconfig b/arch/arm/mach-kirkwood/Kconfig +index 7fc603b..307cc99 100644 +--- a/arch/arm/mach-kirkwood/Kconfig ++++ b/arch/arm/mach-kirkwood/Kconfig +@@ -130,6 +130,12 @@ config MACH_T5325 + Say 'Y' here if you want your kernel to support the + HP t5325 Thin Client. + ++config MACH_LSXHL ++ bool "Buffalo LS-XHL Series" ++ help ++ Say 'Y' here if you want your kernel to support the ++ Buffalo LS-XHL Series. ++ + endmenu + + endif +diff --git a/arch/arm/mach-kirkwood/Makefile b/arch/arm/mach-kirkwood/Makefile +index 5dcaa81..221980b 100644 +--- a/arch/arm/mach-kirkwood/Makefile ++++ b/arch/arm/mach-kirkwood/Makefile +@@ -18,5 +18,6 @@ obj-$(CONFIG_MACH_D2NET_V2) += d2net_v2-setup.o lacie_v2-common.o + obj-$(CONFIG_MACH_NET2BIG_V2) += netxbig_v2-setup.o lacie_v2-common.o + obj-$(CONFIG_MACH_NET5BIG_V2) += netxbig_v2-setup.o lacie_v2-common.o + obj-$(CONFIG_MACH_T5325) += t5325-setup.o ++obj-$(CONFIG_MACH_LSXHL) += lsxhl-setup.o + + obj-$(CONFIG_CPU_IDLE) += cpuidle.o +diff --git a/arch/arm/mach-kirkwood/lsxhl-setup.c b/arch/arm/mach-kirkwood/lsxhl-setup.c +new file mode 100644 +index 0000000..783d257 +--- /dev/null ++++ b/arch/arm/mach-kirkwood/lsxhl-setup.c +@@ -0,0 +1,314 @@ ++/* ++ * arch/arm/mach-kirkwood/lsxhl-setup.c ++ * ++ * Buffalo LS-XHL Series Setup ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * License version 2. This program is licensed "as is" without any ++ * warranty of any kind, whether express or implied. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "common.h" ++#include "mpp.h" ++ ++/***************************************************************************** ++ * 512KB SPI Flash on BOOT Device ++ ****************************************************************************/ ++static struct mtd_partition lsxhl_partitions[] = { ++ { ++ .name = "u-boot", ++ .size = 0x70000, ++ .offset = 0x00000, ++ .mask_flags = MTD_WRITEABLE, ++ }, ++ { ++ .name = "u-boot env", ++ .size = 0x10000, ++ .offset = 0x70000, ++ } ++}; ++ ++static struct flash_platform_data lsxhl_spi_slave_data = { ++ .type = "m25p40", ++ .parts = lsxhl_partitions, ++ .nr_parts = ARRAY_SIZE(lsxhl_partitions), ++}; ++ ++static struct spi_board_info __initdata lsxhl_spi_slave_info[] = { ++ { ++ .modalias = "m25p80", ++ .platform_data = &lsxhl_spi_slave_data, ++ .irq = -1, ++ .max_speed_hz = 20000000, ++ .bus_num = 0, ++ .chip_select = 0, ++ } ++}; ++ ++/***************************************************************************** ++ * Ethernet ++ ****************************************************************************/ ++static struct mv643xx_eth_platform_data lsxhl_ge00_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(0), ++}; ++ ++static struct mv643xx_eth_platform_data lsxhl_ge01_data = { ++ .phy_addr = MV643XX_ETH_PHY_ADDR(8), ++}; ++ ++/***************************************************************************** ++ * SATA ++ ****************************************************************************/ ++static struct mv_sata_platform_data lsxhl_sata_data = { ++ .n_ports = 1, ++}; ++ ++/***************************************************************************** ++ * LEDs attached to GPIO ++ ****************************************************************************/ ++#define LSXHL_GPIO_LED_ALARM 37 ++#define LSXHL_GPIO_LED_INFO 38 ++#define LSXHL_GPIO_LED_PWR 39 ++#define LSXHL_GPIO_LED_FUNC_BLUE 36 ++#define LSXHL_GPIO_LED_FUNC_RED 48 ++ ++static struct gpio_led lsxhl_led_pins[] = { ++ { ++ .name = "alarm:red", ++ .gpio = LSXHL_GPIO_LED_ALARM, ++ .active_low = 1, ++ }, ++ { ++ .name = "info:amber", ++ .gpio = LSXHL_GPIO_LED_INFO, ++ .active_low = 1, ++ }, ++ { ++ .name = "power:blue", ++ .default_trigger = "default-on", ++ .gpio = LSXHL_GPIO_LED_PWR, ++ .active_low = 1, ++ }, ++ { ++ .name = "func:blue:bottom", ++ .gpio = LSXHL_GPIO_LED_FUNC_BLUE, ++ .active_low = 1, ++ }, ++ { ++ .name = "func:red:bottom", ++ .gpio = LSXHL_GPIO_LED_FUNC_RED, ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_led_platform_data lsxhl_led_data = { ++ .leds = lsxhl_led_pins, ++ .num_leds = ARRAY_SIZE(lsxhl_led_pins), ++}; ++ ++static struct platform_device lsxhl_leds = { ++ .name = "leds-gpio", ++ .id = -1, ++ .dev = { ++ .platform_data = &lsxhl_led_data, ++ } ++}; ++ ++/***************************************************************************** ++ * General Setup ++ ****************************************************************************/ ++#define LSXHL_GPIO_HDD_POWER 10 ++#define LSXHL_GPIO_USB_POWER 11 ++ ++/***************************************************************************** ++ * GPIO Attached Keys ++ ****************************************************************************/ ++#define LSXHL_GPIO_KEY_FUNC 41 ++#define LSXHL_GPIO_KEY_AUTOPOWER 42 ++#define LSXHL_GPIO_KEY_POWER 43 ++#define LSXHL_SW_POWER 0x00 ++#define LSXHL_SW_AUTOPOWER 0x01 ++#define 
LSXHL_SW_FUNC 0x02 ++ ++static struct gpio_keys_button lsxhl_buttons[] = { ++ { ++ .type = EV_SW, ++ .code = LSXHL_SW_POWER, ++ .gpio = LSXHL_GPIO_KEY_POWER, ++ .desc = "Power-on Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSXHL_SW_AUTOPOWER, ++ .gpio = LSXHL_GPIO_KEY_AUTOPOWER, ++ .desc = "Power-auto Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSXHL_SW_POWER, ++ .gpio = LSXHL_GPIO_KEY_FUNC, ++ .desc = "Function Button", ++ .active_low = 1, ++ }, ++}; ++ ++static struct gpio_keys_platform_data lsxhl_button_data = { ++ .buttons = lsxhl_buttons, ++ .nbuttons = ARRAY_SIZE(lsxhl_buttons), ++}; ++ ++static struct platform_device lsxhl_button_device = { ++ .name = "gpio-keys", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lsxhl_button_data, ++ }, ++}; ++ ++/***************************************************************************** ++ * GPIO Fan ++ ****************************************************************************/ ++#define LSXHL_GPIO_FAN_HIGH 18 ++#define LSXHL_GPIO_FAN_LOW 19 ++#define LSXHL_GPIO_FAN_LOCK 40 ++ ++static struct gpio_fan_alarm lsxhl_alarm = { ++ .gpio = LSXHL_GPIO_FAN_LOCK, ++}; ++ ++static struct gpio_fan_speed lsxhl_speeds[] = { ++ { ++ .rpm = 0, ++ .ctrl_val = 3, ++ }, { ++ .rpm = 1500, ++ .ctrl_val = 1, ++ }, { ++ .rpm = 3250, ++ .ctrl_val = 2, ++ }, { ++ .rpm = 5000, ++ .ctrl_val = 0, ++ } ++}; ++ ++static int lsxhl_gpio_list[] = { ++ LSXHL_GPIO_FAN_HIGH, LSXHL_GPIO_FAN_LOW, ++}; ++ ++static struct gpio_fan_platform_data lsxhl_fan_data = { ++ .num_ctrl = ARRAY_SIZE(lsxhl_gpio_list), ++ .ctrl = lsxhl_gpio_list, ++ .alarm = &lsxhl_alarm, ++ .num_speed = ARRAY_SIZE(lsxhl_speeds), ++ .speed = lsxhl_speeds, ++}; ++ ++static struct platform_device lsxhl_fan_device = { ++ .name = "gpio-fan", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lsxhl_fan_data, ++ }, ++}; ++ ++/***************************************************************************** ++ * GPIO Data ++ ****************************************************************************/ ++ ++static unsigned int lsxhl_mpp_config[] __initdata = { ++ MPP10_GPO, /* HDD Power Enable */ ++ MPP11_GPIO, /* USB Vbus Enable */ ++ MPP18_GPO, /* FAN High Enable# */ ++ MPP19_GPO, /* FAN Low Enable# */ ++ MPP36_GPIO, /* Function Blue LED */ ++ MPP37_GPIO, /* Alarm LED */ ++ MPP38_GPIO, /* Info LED */ ++ MPP39_GPIO, /* Power LED */ ++ MPP40_GPIO, /* Fan Lock */ ++ MPP41_GPIO, /* Function Button */ ++ MPP42_GPIO, /* Power Switch */ ++ MPP43_GPIO, /* Power Auto Switch */ ++ MPP48_GPIO, /* Function Red LED */ ++ 0 ++}; ++ ++/***************************************************************************** ++ * LS-XHL specific power off method: reboot ++ ****************************************************************************/ ++/* ++ * On the LS-XHL, the shutdown process is following: ++ * - Userland monitors key events until the power switch goes to off position ++ * - The board reboots ++ * - U-boot starts and goes into an idle mode waiting for the user ++ * to move the switch to ON position ++ * ++ */ ++ ++static void lsxhl_power_off(void) ++{ ++ kirkwood_restart('h', NULL); //arm_machine_restart('h', NULL); ++} ++ ++static void __init lsxhl_init(void) ++{ ++ /* ++ * Basic setup. Needs to be called early. ++ */ ++ kirkwood_init(); ++ kirkwood_mpp_conf(lsxhl_mpp_config); ++ ++ /* ++ * Configure peripherals. 
++ */ ++ kirkwood_uart0_init(); ++ kirkwood_ehci_init(); ++ kirkwood_ge00_init(&lsxhl_ge00_data); ++ kirkwood_ge01_init(&lsxhl_ge01_data); ++ kirkwood_sata_init(&lsxhl_sata_data); ++ kirkwood_spi_init(); ++ ++ platform_device_register(&lsxhl_leds); ++ platform_device_register(&lsxhl_button_device); ++ platform_device_register(&lsxhl_fan_device); ++ ++ spi_register_board_info(lsxhl_spi_slave_info, ++ ARRAY_SIZE(lsxhl_spi_slave_info)); ++ ++ /* usb power on */ ++ gpio_set_value(LSXHL_GPIO_USB_POWER, 1); ++ ++ /* register power-off method */ ++ pm_power_off = lsxhl_power_off; ++ ++ pr_info("%s: finished\n", __func__); ++} ++ ++MACHINE_START(LSXHL, "Buffalo Linkstation LS-XHL") ++ .atag_offset = 0x100, ++ .init_machine = lsxhl_init, ++ .map_io = kirkwood_map_io, ++ .init_early = kirkwood_init_early, ++ .init_irq = kirkwood_init_irq, ++ .timer = &kirkwood_timer, ++ .restart = kirkwood_restart, ++MACHINE_END +diff --git a/arch/arm/tools/mach-types b/arch/arm/tools/mach-types +index 3b3776d..8acc587 100644 +--- a/arch/arm/tools/mach-types ++++ b/arch/arm/tools/mach-types +@@ -448,6 +448,7 @@ mityomapl138 MACH_MITYOMAPL138 MITYOMAPL138 2650 + guruplug MACH_GURUPLUG GURUPLUG 2659 + spear310 MACH_SPEAR310 SPEAR310 2660 + spear320 MACH_SPEAR320 SPEAR320 2661 ++lsxhl MACH_LSXHL LSXHL 2663 + aquila MACH_AQUILA AQUILA 2676 + sheeva_esata MACH_ESATA_SHEEVAPLUG ESATA_SHEEVAPLUG 2678 + msm7x30_surf MACH_MSM7X30_SURF MSM7X30_SURF 2679 +-- +1.7.2.3 diff --git a/3.3.8/v3.3-ARM-orion-Add-support-for-Buffalo-LS-QL.patch b/3.3.8/v3.3-ARM-orion-Add-support-for-Buffalo-LS-QL.patch new file mode 100644 index 0000000..2ee8b00 --- /dev/null +++ b/3.3.8/v3.3-ARM-orion-Add-support-for-Buffalo-LS-QL.patch @@ -0,0 +1,439 @@ +diff -uNr linux-3.3.8-go.orig/arch/arm/configs/orion5x_defconfig linux-3.3.8-go/arch/arm/configs/orion5x_defconfig +--- linux-3.3.8-go.orig/arch/arm/configs/orion5x_defconfig 2012-11-22 21:40:48.443921973 +0100 ++++ linux-3.3.8-go/arch/arm/configs/orion5x_defconfig 2012-11-22 21:41:29.388436783 +0100 +@@ -21,6 +21,7 @@ + CONFIG_MACH_LINKSTATION_MINI=y + CONFIG_MACH_LINKSTATION_PRODUO=y + CONFIG_MACH_LINKSTATION_LS_HGL=y ++CONFIG_MACH_LINKSTATION_LSQL=y + CONFIG_MACH_TS409=y + CONFIG_MACH_WRT350N_V2=y + CONFIG_MACH_TS78XX=y +diff -uNr linux-3.3.8-go.orig/arch/arm/mach-orion5x/Kconfig linux-3.3.8-go/arch/arm/mach-orion5x/Kconfig +--- linux-3.3.8-go.orig/arch/arm/mach-orion5x/Kconfig 2012-11-22 21:40:48.648919605 +0100 ++++ linux-3.3.8-go/arch/arm/mach-orion5x/Kconfig 2012-11-22 21:41:29.389436772 +0100 +@@ -111,6 +111,13 @@ + Say 'Y' here if you want your kernel to support the + Buffalo Linkstation LS-HGL platform. + ++config MACH_LINKSTATION_LSQL ++ bool "Buffalo Linkstation LS-QL" ++ select I2C_BOARDINFO ++ help ++ Say 'Y' here if you want your kernel to support the ++ Buffalo Linkstation LS-QL platform. ++ + config MACH_TS409 + bool "QNAP TS-409" + help +diff -uNr linux-3.3.8-go.orig/arch/arm/mach-orion5x/lsql-setup.c linux-3.3.8-go/arch/arm/mach-orion5x/lsql-setup.c +--- linux-3.3.8-go.orig/arch/arm/mach-orion5x/lsql-setup.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-go/arch/arm/mach-orion5x/lsql-setup.c 2012-11-22 21:41:29.390436761 +0100 +@@ -0,0 +1,388 @@ ++/* ++ * arch/arm/mach-orion5x/lsql-setup.c ++ * ++ * Source based off arch/arm/mach-orion5x/lsproduo-setup.c, which was from lsmini-setup.c ++ * Maintainer: Matt Gomboc ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * License version 2. 
This program is licensed "as is" without any ++ * warranty of any kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "common.h" ++#include "mpp.h" ++#include ++#include ++#include ++ ++/***************************************************************************** ++ * Linkstation Quad LS-QL/R5 Info ++ ****************************************************************************/ ++ ++/* ++ * 256K NOR flash Device bus boot chip select ++ */ ++ ++#define LSQL_NOR_BOOT_BASE 0xf4000000 ++#define LSQL_NOR_BOOT_SIZE SZ_256K ++ ++/***************************************************************************** ++ * 256KB NOR Flash on BOOT Device ++ ****************************************************************************/ ++ ++static struct physmap_flash_data lsql_nor_flash_data = { ++ .width = 1, ++}; ++ ++static struct resource lsql_nor_flash_resource = { ++ .flags = IORESOURCE_MEM, ++ .start = LSQL_NOR_BOOT_BASE, ++ .end = LSQL_NOR_BOOT_BASE + LSQL_NOR_BOOT_SIZE - 1, ++}; ++ ++static struct platform_device lsql_nor_flash = { ++ .name = "physmap-flash", ++ .id = 0, ++ .dev = { ++ .platform_data = &lsql_nor_flash_data, ++ }, ++ .num_resources = 1, ++ .resource = &lsql_nor_flash_resource, ++}; ++ ++/***************************************************************************** ++ * Ethernet ++ ****************************************************************************/ ++ ++static struct mv643xx_eth_platform_data lsql_eth_data = { ++ .phy_addr = 8, ++}; ++ ++/***************************************************************************** ++ * RTC 5C372a on I2C bus ++ ****************************************************************************/ ++ ++static struct i2c_board_info __initdata lsql_i2c_rtc = { ++ I2C_BOARD_INFO("rs5c372a", 0x32), ++}; ++ ++/***************************************************************************** ++ * LEDs attached to GPIO ++ ****************************************************************************/ ++ ++#define LSQL_GPIO_LED_ALARM 2 /* looks like it should be 2 by the uboot sources, but doesnt successfully trigger the3 top LED*/ ++#define LSQL_GPIO_LED_INFO 3 ++#define LSQL_GPIO_LED_PWR 0 ++#define LSQL_GPIO_LED_FUNC 18 ++ ++ ++static struct gpio_led lsql_led_pins[] = { ++ { ++ .name = "alarm:red", ++ .gpio = LSQL_GPIO_LED_ALARM, ++ .active_low = 1, ++ }, { ++ .name = "info:amber", ++ .gpio = LSQL_GPIO_LED_INFO, ++ .active_low = 1, ++ }, { ++ .name = "power:blue", ++ .gpio = LSQL_GPIO_LED_PWR, ++ .active_low = 1, ++ },{ ++ .name = "func:blue", ++ .gpio = LSQL_GPIO_LED_FUNC, ++ .active_low = 1, ++ }, ++}; ++ ++ ++ ++static struct gpio_led_platform_data lsql_led_data = { ++ .leds = lsql_led_pins, ++ .num_leds = ARRAY_SIZE(lsql_led_pins), ++}; ++ ++ ++static struct platform_device lsql_leds = { ++ .name = "leds-gpio", ++ .id = -1, ++ .dev = { ++ .platform_data = &lsql_led_data, ++ }, ++}; ++ ++ ++/**************************************************************************** ++ * GPIO Attached Keys ++ ****************************************************************************/ ++ ++ #define LSQL_GPIO_KEY_POWER 10 ++ #define LSQL_GPIO_KEY_AUTOPOWER 22 ++ #define LSQL_GPIO_KEY_FUNC 7 ++ ++ #define LSQL_SW_POWER 0x00 ++ #define LSQL_SW_AUTOPOWER 0x01 ++ ++static struct gpio_keys_button lsql_buttons[] = { ++ { ++ .code = KEY_OPTION, ++ .gpio = LSQL_GPIO_KEY_FUNC, ++ .desc = "Function Button", ++ .active_low = 1, ++ 
},{ ++ .type = EV_SW, ++ .code = LSQL_SW_POWER, ++ .gpio = LSQL_GPIO_KEY_POWER, ++ .desc = "Power-on Switch", ++ .active_low = 1, ++ }, { ++ .type = EV_SW, ++ .code = LSQL_SW_AUTOPOWER, ++ .gpio = LSQL_GPIO_KEY_AUTOPOWER, ++ .desc = "Power-auto Switch", ++ .active_low = 1, ++ }, ++}; ++ ++ ++static struct gpio_keys_platform_data lsql_button_data = { ++ .buttons = lsql_buttons, ++ .nbuttons = ARRAY_SIZE(lsql_buttons), ++}; ++ ++static struct platform_device lsql_button_device = { ++ .name = "gpio-keys", ++ .id = -1, ++ .num_resources = 0, ++ .dev = { ++ .platform_data = &lsql_button_data, ++ }, ++}; ++ ++/**************************************************************************** ++ * GPIO Attached Fan ++ ****************************************************************************/ ++ ++/* Define max char len */ ++ ++#define MAX_LEN 8 ++ ++#define LSQL_GPIO_FAN_LOW 17 ++#define LSQL_GPIO_FAN_HIGH 14 ++ ++static struct proc_dir_entry *lsql_proc_dir_root, *lsql_proc_dir_gpio, *lsql_fan_proc_file; ++static char lsql_fan_state[MAX_LEN]; ++ ++static int lsql_fan_get(char *buf, char **start, off_t offset, int count, int *eof, void *data) ++{ ++ int len; ++ ++ len = snprintf(buf, count, "state: %s\n", lsql_fan_state); ++ return len; ++} ++ ++static int lsql_fan_set( struct file *file, const char *buffer, unsigned long count, void *data ) ++{ ++ int len, ret; ++ char *ptr, tState[MAX_LEN]; ++ ++ if (count > MAX_LEN ) ++ len = MAX_LEN; ++ else ++ len = count; ++ ++ ret = copy_from_user(tState, buffer, len); ++ if(ret < 0) ++ { ++ printk(KERN_ERR "%s: Setting fan speed failed\n", "lsql"); ++ return -EFAULT; ++ } ++ ++ ptr = strrchr(tState, '\n'); ++ if(ptr) *ptr = '\0'; ++ ++ if (strcasecmp(tState, "off") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan off\n", "lsql"); ++ sprintf(lsql_fan_state, "off"); ++ gpio_set_value(LSQL_GPIO_FAN_LOW, 1); ++ gpio_set_value(LSQL_GPIO_FAN_HIGH, 1); ++ } else if (strcasecmp(tState, "slow") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan slow\n", "lsql"); ++ sprintf(lsql_fan_state, "slow"); ++ gpio_set_value(LSQL_GPIO_FAN_LOW, 1); ++ gpio_set_value(LSQL_GPIO_FAN_HIGH, 0); ++ } else if (strcasecmp(tState, "fast") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan fast\n", "lsql"); ++ sprintf(lsql_fan_state, "fast"); ++ gpio_set_value(LSQL_GPIO_FAN_LOW, 0); ++ gpio_set_value(LSQL_GPIO_FAN_HIGH, 1); ++ } else if (strcasecmp(tState, "full") == 0) ++ { ++ printk(KERN_DEBUG "%s: set fan full\n", "lsql"); ++ sprintf(lsql_fan_state, "full"); ++ gpio_set_value(LSQL_GPIO_FAN_LOW, 0); ++ gpio_set_value(LSQL_GPIO_FAN_HIGH, 0); ++ } else ++ { ++ printk(KERN_ERR "%s: unknown fan speed given\n", "lsql"); ++ } ++ ++ lsql_fan_state[len] = '\0'; ++ ++ return len; ++} ++ ++/***************************************************************************** ++ * SATA ++ ****************************************************************************/ ++static struct mv_sata_platform_data lsql_sata_data = { ++ .n_ports = 2, /*maybe this should be 4, but works with 2 */ ++}; ++ ++ ++/***************************************************************************** ++ * Linkstation Quad specific power off method: reboot ++ ****************************************************************************/ ++/* ++ * On Linkstations in general, the shutdown process is following: ++ * - Userland monitors key events until the power switch goes to off position ++ * - The board reboots ++ * - U-boot starts and goes into an idle mode waiting for the user ++ * to move the switch to ON position ++ * ++ * on the Quad however, 
there is a power button on the upper, front, ++ * a function button on the lower front, ans a Auto/Manual power button on the back. ++ * After halting system, uboot waits the power button on the front panel to be pushed ++ * ++ * ++ */ ++ ++static void lsql_power_off(void) ++{ ++ orion5x_restart('h', NULL); /* arm_machine_restart(0, NULL); */ ++} ++ ++ ++/***************************************************************************** ++ * General Setup ++ ****************************************************************************/ ++#define LSQL_GPIO_USB_POWER 9 ++#define LSQL_GPIO_POWER 10 ++#define LSQL_GPIO_USB_POWER2 19 ++#define LSQL_GPIO_AUTO_POWER 22 ++ ++static unsigned int lsql_mpp_modes[] __initdata = { ++ MPP0_GPIO, /* LED_PWR */ ++ MPP1_GPIO, /* for debugging purposes, change to MPP1_UNUSED for final */ ++ MPP2_GPIO, /* LED_ALARM */ /* looks like it should be 2 by the uboot sources, but doesnt successfully trigger the3 top LED*/ ++ MPP3_GPIO, /* LED_INFO */ ++ MPP4_GPIO, ++ MPP5_GPIO, ++ MPP6_GPIO, /* FAN_LCK */ ++ MPP7_GPIO, /* FUNC */ ++ MPP8_GPIO, ++ MPP9_GPIO, /* USB_PWR */ ++ MPP10_GPIO, /* POWER */ ++ MPP11_GPIO, ++ MPP12_GPIO, ++ MPP13_GPIO, ++ MPP14_GPIO, /* FAN_HIGH */ ++ MPP15_GPIO, ++ MPP16_GPIO, ++ MPP17_GPIO, /* FAN_LOW */ ++ MPP18_GPIO, /* LED_FUNC*/ ++ MPP19_GPIO, /* USB_PWR2 */ ++ MPP22_GPIO, /* AUTO_POWER*/ ++ 0, ++}; ++ ++static void __init lsql_init(void) ++{ ++ /* ++ * Setup basic Orion functions. Need to be called early. ++ */ ++ orion5x_init(); ++ ++ orion5x_mpp_conf(lsql_mpp_modes); ++ ++ /* ++ * Configure peripherals. ++ */ ++ orion5x_ehci0_init(); ++ orion5x_ehci1_init(); ++ orion5x_eth_init(&lsql_eth_data); ++ orion5x_i2c_init(); ++ orion5x_sata_init(&lsql_sata_data); ++ orion5x_uart0_init(); ++ orion5x_xor_init(); ++ ++ orion5x_setup_dev_boot_win(LSQL_NOR_BOOT_BASE, ++ LSQL_NOR_BOOT_SIZE); ++ platform_device_register(&lsql_nor_flash); ++ ++ platform_device_register(&lsql_button_device); ++ ++ platform_device_register(&lsql_leds); ++ ++ i2c_register_board_info(0, &lsql_i2c_rtc, 1); ++ ++ /* enable USB power */ ++ gpio_set_value(LSQL_GPIO_USB_POWER, 1); ++ gpio_set_value(LSQL_GPIO_USB_POWER2, 1); ++ ++ ++ printk(KERN_INFO "Buffalo Linkstation fan driver loaded\n"); ++ sprintf(lsql_fan_state, "fast"); ++ gpio_set_value(LSQL_GPIO_FAN_LOW, 0); ++ gpio_set_value(LSQL_GPIO_FAN_HIGH, 1); ++ ++ lsql_proc_dir_root = proc_mkdir( "linkstation", NULL ); ++ lsql_proc_dir_gpio = proc_mkdir( "gpio", lsql_proc_dir_root ); ++ lsql_fan_proc_file = create_proc_entry( "fan", S_IRUGO, lsql_proc_dir_gpio ); ++ if( lsql_fan_proc_file ) { ++ lsql_fan_proc_file->read_proc = lsql_fan_get; ++ lsql_fan_proc_file->write_proc = lsql_fan_set; ++ lsql_fan_proc_file->data = NULL; ++ } else ++ { ++ printk(KERN_INFO "Registration of fan device failed\n"); ++ } ++ ++ /* register power-off method */ ++ pm_power_off = lsql_power_off; ++ ++ pr_info("%s: finished\n", __func__); ++} ++ ++#ifdef CONFIG_MACH_LINKSTATION_LSQL ++MACHINE_START(LINKSTATION_LSQL, "Buffalo Linkstation Quad QL/R5") ++ .atag_offset = 0x00000100, ++ .init_machine = lsql_init, ++ .map_io = orion5x_map_io, ++ .init_early = orion5x_init_early, ++ .init_irq = orion5x_init_irq, ++ .timer = &orion5x_timer, ++ .fixup = tag_fixup_mem32, ++ .restart = orion5x_restart, ++MACHINE_END ++#endif ++ ++ +diff -uNr linux-3.3.8-go.orig/arch/arm/mach-orion5x/Makefile linux-3.3.8-go/arch/arm/mach-orion5x/Makefile +--- linux-3.3.8-go.orig/arch/arm/mach-orion5x/Makefile 2012-11-22 21:40:48.647919616 +0100 ++++ 
linux-3.3.8-go/arch/arm/mach-orion5x/Makefile 2012-11-22 21:41:29.391436749 +0100 +@@ -7,6 +7,7 @@ + obj-$(CONFIG_MACH_LINKSTATION_MINI) += lsmini-setup.o + obj-$(CONFIG_MACH_LINKSTATION_PRODUO) += lsproduo-setup.o + obj-$(CONFIG_MACH_LINKSTATION_LS_HGL) += ls_hgl-setup.o ++obj-$(CONFIG_MACH_LINKSTATION_LSQL) += lsql-setup.o + obj-$(CONFIG_MACH_DNS323) += dns323-setup.o + obj-$(CONFIG_MACH_TS209) += ts209-setup.o tsx09-common.o + obj-$(CONFIG_MACH_TS409) += ts409-setup.o tsx09-common.o +diff -uNr linux-3.3.8-go.orig/arch/arm/tools/mach-types linux-3.3.8-go/arch/arm/tools/mach-types +--- linux-3.3.8-go.orig/arch/arm/tools/mach-types 2012-11-22 21:40:48.446921940 +0100 ++++ linux-3.3.8-go/arch/arm/tools/mach-types 2012-11-22 21:41:53.355153632 +0100 +@@ -1172,3 +1172,4 @@ + pov2 MACH_POV2 POV2 3889 + ipod_touch_2g MACH_IPOD_TOUCH_2G IPOD_TOUCH_2G 3890 + da850_pqab MACH_DA850_PQAB DA850_PQAB 3891 ++linkstation_lsql MACH_LINKSTATION_LSQL LINKSTATION_LSQL 4238 diff --git a/3.3.8/vserver-3.3.8-vs2.3.3.4.patch b/3.3.8/vserver-3.3.8-vs2.3.3.4.patch new file mode 100644 index 0000000..ac73224 --- /dev/null +++ b/3.3.8/vserver-3.3.8-vs2.3.3.4.patch @@ -0,0 +1,26065 @@ +diff -NurpP --minimal linux-3.3.8/Documentation/vserver/debug.txt linux-3.3.8-vs2.3.3.4/Documentation/vserver/debug.txt +--- linux-3.3.8/Documentation/vserver/debug.txt 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/Documentation/vserver/debug.txt 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,154 @@ ++ ++debug_cvirt: ++ ++ 2 4 "vx_map_tgid: %p/%llx: %d -> %d" ++ "vx_rmap_tgid: %p/%llx: %d -> %d" ++ ++debug_dlim: ++ ++ 0 1 "ALLOC (%p,#%d)%c inode (%d)" ++ "FREE (%p,#%d)%c inode" ++ 1 2 "ALLOC (%p,#%d)%c %lld bytes (%d)" ++ "FREE (%p,#%d)%c %lld bytes" ++ 2 4 "ADJUST: %lld,%lld on %ld,%ld [mult=%d]" ++ 3 8 "ext3_has_free_blocks(%p): %lu<%lu+1, %c, %u!=%u r=%d" ++ "ext3_has_free_blocks(%p): free=%lu, root=%lu" ++ "rcu_free_dl_info(%p)" ++ 4 10 "alloc_dl_info(%p,%d) = %p" ++ "dealloc_dl_info(%p)" ++ "get_dl_info(%p[#%d.%d])" ++ "put_dl_info(%p[#%d.%d])" ++ 5 20 "alloc_dl_info(%p,%d)*" ++ 6 40 "__hash_dl_info: %p[#%d]" ++ "__unhash_dl_info: %p[#%d]" ++ 7 80 "locate_dl_info(%p,#%d) = %p" ++ ++debug_misc: ++ ++ 0 1 "destroy_dqhash: %p [#0x%08x] c=%d" ++ "new_dqhash: %p [#0x%08x]" ++ "vroot[%d]_clr_dev: dev=%p[%lu,%d:%d]" ++ "vroot[%d]_get_real_bdev: dev=%p[%lu,%d:%d]" ++ "vroot[%d]_set_dev: dev=%p[%lu,%d:%d]" ++ "vroot_get_real_bdev not set" ++ 1 2 "cow_break_link(»%s«)" ++ "temp copy »%s«" ++ 2 4 "dentry_open(new): %p" ++ "dentry_open(old): %p" ++ "lookup_create(new): %p" ++ "old path »%s«" ++ "path_lookup(old): %d" ++ "vfs_create(new): %d" ++ "vfs_rename: %d" ++ "vfs_sendfile: %d" ++ 3 8 "fput(new_file=%p[#%d])" ++ "fput(old_file=%p[#%d])" ++ 4 10 "vx_info_kill(%p[#%d],%d,%d) = %d" ++ "vx_info_kill(%p[#%d],%d,%d)*" ++ 5 20 "vs_reboot(%p[#%d],%d)" ++ 6 40 "dropping task %p[#%u,%u] for %p[#%u,%u]" ++ ++debug_net: ++ ++ 2 4 "nx_addr_conflict(%p,%p) %d.%d,%d.%d" ++ 3 8 "inet_bind(%p) %d.%d.%d.%d, %d.%d.%d.%d, %d.%d.%d.%d" ++ "inet_bind(%p)* %p,%p;%lx %d.%d.%d.%d" ++ 4 10 "ip_route_connect(%p) %p,%p;%lx" ++ 5 20 "__addr_in_socket(%p,%d.%d.%d.%d) %p:%d.%d.%d.%d %p;%lx" ++ 6 40 "sk,egf: %p [#%d] (from %d)" ++ "sk,egn: %p [#%d] (from %d)" ++ "sk,req: %p [#%d] (from %d)" ++ "sk: %p [#%d] (from %d)" ++ "tw: %p [#%d] (from %d)" ++ 7 80 "__sock_recvmsg: %p[%p,%p,%p;%d]:%d/%d" ++ "__sock_sendmsg: %p[%p,%p,%p;%d]:%d/%d" ++ ++debug_nid: ++ ++ 0 1 "__lookup_nx_info(#%u): %p[#%u]" ++ "alloc_nx_info(%d) = %p" ++ "create_nx_info(%d) 
(dynamic rejected)" ++ "create_nx_info(%d) = %p (already there)" ++ "create_nx_info(%d) = %p (new)" ++ "dealloc_nx_info(%p)" ++ 1 2 "alloc_nx_info(%d)*" ++ "create_nx_info(%d)*" ++ 2 4 "get_nx_info(%p[#%d.%d])" ++ "put_nx_info(%p[#%d.%d])" ++ 3 8 "claim_nx_info(%p[#%d.%d.%d]) %p" ++ "clr_nx_info(%p[#%d.%d])" ++ "init_nx_info(%p[#%d.%d])" ++ "release_nx_info(%p[#%d.%d.%d]) %p" ++ "set_nx_info(%p[#%d.%d])" ++ 4 10 "__hash_nx_info: %p[#%d]" ++ "__nx_dynamic_id: [#%d]" ++ "__unhash_nx_info: %p[#%d.%d.%d]" ++ 5 20 "moved task %p into nxi:%p[#%d]" ++ "nx_migrate_task(%p,%p[#%d.%d.%d])" ++ "task_get_nx_info(%p)" ++ 6 40 "nx_clear_persistent(%p[#%d])" ++ ++debug_quota: ++ ++ 0 1 "quota_sync_dqh(%p,%d) discard inode %p" ++ 1 2 "quota_sync_dqh(%p,%d)" ++ "sync_dquots(%p,%d)" ++ "sync_dquots_dqh(%p,%d)" ++ 3 8 "do_quotactl(%p,%d,cmd=%d,id=%d,%p)" ++ ++debug_switch: ++ ++ 0 1 "vc: VCMD_%02d_%d[%d], %d,%p [%d,%d,%x,%x]" ++ 1 2 "vc: VCMD_%02d_%d[%d] = %08lx(%ld) [%d,%d]" ++ 4 10 "%s: (%s %s) returned %s with %d" ++ ++debug_tag: ++ ++ 7 80 "dx_parse_tag(»%s«): %d:#%d" ++ "dx_propagate_tag(%p[#%lu.%d]): %d,%d" ++ ++debug_xid: ++ ++ 0 1 "__lookup_vx_info(#%u): %p[#%u]" ++ "alloc_vx_info(%d) = %p" ++ "alloc_vx_info(%d)*" ++ "create_vx_info(%d) (dynamic rejected)" ++ "create_vx_info(%d) = %p (already there)" ++ "create_vx_info(%d) = %p (new)" ++ "dealloc_vx_info(%p)" ++ "loc_vx_info(%d) = %p (found)" ++ "loc_vx_info(%d) = %p (new)" ++ "loc_vx_info(%d) = %p (not available)" ++ 1 2 "create_vx_info(%d)*" ++ "loc_vx_info(%d)*" ++ 2 4 "get_vx_info(%p[#%d.%d])" ++ "put_vx_info(%p[#%d.%d])" ++ 3 8 "claim_vx_info(%p[#%d.%d.%d]) %p" ++ "clr_vx_info(%p[#%d.%d])" ++ "init_vx_info(%p[#%d.%d])" ++ "release_vx_info(%p[#%d.%d.%d]) %p" ++ "set_vx_info(%p[#%d.%d])" ++ 4 10 "__hash_vx_info: %p[#%d]" ++ "__unhash_vx_info: %p[#%d.%d.%d]" ++ "__vx_dynamic_id: [#%d]" ++ 5 20 "enter_vx_info(%p[#%d],%p) %p[#%d,%p]" ++ "leave_vx_info(%p[#%d,%p]) %p[#%d,%p]" ++ "moved task %p into vxi:%p[#%d]" ++ "task_get_vx_info(%p)" ++ "vx_migrate_task(%p,%p[#%d.%d])" ++ 6 40 "vx_clear_persistent(%p[#%d])" ++ "vx_exit_init(%p[#%d],%p[#%d,%d,%d])" ++ "vx_set_init(%p[#%d],%p[#%d,%d,%d])" ++ "vx_set_persistent(%p[#%d])" ++ "vx_set_reaper(%p[#%d],%p[#%d,%d])" ++ 7 80 "vx_child_reaper(%p[#%u,%u]) = %p[#%u,%u]" ++ ++ ++debug_limit: ++ ++ n 2^n "vx_acc_cres[%5d,%s,%2d]: %5d%s" ++ "vx_cres_avail[%5d,%s,%2d]: %5ld > %5d + %5d" ++ ++ m 2^m "vx_acc_page[%5d,%s,%2d]: %5d%s" ++ "vx_acc_pages[%5d,%s,%2d]: %5d += %5d" ++ "vx_pages_avail[%5d,%s,%2d]: %5ld > %5d + %5d" +diff -NurpP --minimal linux-3.3.8/arch/alpha/Kconfig linux-3.3.8-vs2.3.3.4/arch/alpha/Kconfig +--- linux-3.3.8/arch/alpha/Kconfig 2012-03-19 19:46:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/alpha/Kconfig 2012-02-24 03:55:06.000000000 +0100 +@@ -662,6 +662,8 @@ config DUMMY_CONSOLE + depends on VGA_HOSE + default y + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.3.8/arch/alpha/kernel/entry.S linux-3.3.8-vs2.3.3.4/arch/alpha/kernel/entry.S +--- linux-3.3.8/arch/alpha/kernel/entry.S 2010-10-21 13:06:45.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/arch/alpha/kernel/entry.S 2012-02-24 03:55:06.000000000 +0100 +@@ -860,24 +860,15 @@ sys_getxgid: + .globl sys_getxpid + .ent sys_getxpid + sys_getxpid: ++ lda $sp, -16($sp) ++ stq $26, 0($sp) + .prologue 0 +- ldq $2, TI_TASK($8) + +- /* See linux/kernel/timer.c sys_getppid for discussion +- about this loop. 
*/ +- ldq $3, TASK_GROUP_LEADER($2) +- ldq $4, TASK_REAL_PARENT($3) +- ldl $0, TASK_TGID($2) +-1: ldl $1, TASK_TGID($4) +-#ifdef CONFIG_SMP +- mov $4, $5 +- mb +- ldq $3, TASK_GROUP_LEADER($2) +- ldq $4, TASK_REAL_PARENT($3) +- cmpeq $4, $5, $5 +- beq $5, 1b +-#endif +- stq $1, 80($sp) ++ lda $16, 96($sp) ++ jsr $26, do_getxpid ++ ldq $26, 0($sp) ++ ++ lda $sp, 16($sp) + ret + .end sys_getxpid + +diff -NurpP --minimal linux-3.3.8/arch/alpha/kernel/ptrace.c linux-3.3.8-vs2.3.3.4/arch/alpha/kernel/ptrace.c +--- linux-3.3.8/arch/alpha/kernel/ptrace.c 2011-01-05 21:48:40.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/alpha/kernel/ptrace.c 2012-02-24 03:55:06.000000000 +0100 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + + #include + #include +diff -NurpP --minimal linux-3.3.8/arch/alpha/kernel/systbls.S linux-3.3.8-vs2.3.3.4/arch/alpha/kernel/systbls.S +--- linux-3.3.8/arch/alpha/kernel/systbls.S 2012-01-09 16:13:54.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/alpha/kernel/systbls.S 2012-02-24 03:55:06.000000000 +0100 +@@ -446,7 +446,7 @@ sys_call_table: + .quad sys_stat64 /* 425 */ + .quad sys_lstat64 + .quad sys_fstat64 +- .quad sys_ni_syscall /* sys_vserver */ ++ .quad sys_vserver /* sys_vserver */ + .quad sys_ni_syscall /* sys_mbind */ + .quad sys_ni_syscall /* sys_get_mempolicy */ + .quad sys_ni_syscall /* sys_set_mempolicy */ +diff -NurpP --minimal linux-3.3.8/arch/alpha/kernel/traps.c linux-3.3.8-vs2.3.3.4/arch/alpha/kernel/traps.c +--- linux-3.3.8/arch/alpha/kernel/traps.c 2010-10-21 13:06:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/arch/alpha/kernel/traps.c 2012-02-24 03:55:06.000000000 +0100 +@@ -183,7 +183,8 @@ die_if_kernel(char * str, struct pt_regs + #ifdef CONFIG_SMP + printk("CPU %d ", hard_smp_processor_id()); + #endif +- printk("%s(%d): %s %ld\n", current->comm, task_pid_nr(current), str, err); ++ printk("%s(%d[#%u]): %s %ld\n", current->comm, ++ task_pid_nr(current), current->xid, str, err); + dik_show_regs(regs, r9_15); + add_taint(TAINT_DIE); + dik_show_trace((unsigned long *)(regs+1)); +diff -NurpP --minimal linux-3.3.8/arch/arm/Kconfig linux-3.3.8-vs2.3.3.4/arch/arm/Kconfig +--- linux-3.3.8/arch/arm/Kconfig 2012-06-08 15:23:43.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/arch/arm/Kconfig 2012-05-09 04:08:07.000000000 +0200 +@@ -2275,6 +2275,8 @@ source "fs/Kconfig" + + source "arch/arm/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.3.8/arch/arm/kernel/calls.S linux-3.3.8-vs2.3.3.4/arch/arm/kernel/calls.S +--- linux-3.3.8/arch/arm/kernel/calls.S 2012-01-09 16:13:54.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/arm/kernel/calls.S 2012-02-24 03:55:06.000000000 +0100 +@@ -322,7 +322,7 @@ + /* 310 */ CALL(sys_request_key) + CALL(sys_keyctl) + CALL(ABI(sys_semtimedop, sys_oabi_semtimedop)) +-/* vserver */ CALL(sys_ni_syscall) ++ CALL(sys_vserver) + CALL(sys_ioprio_set) + /* 315 */ CALL(sys_ioprio_get) + CALL(sys_inotify_init) +diff -NurpP --minimal linux-3.3.8/arch/arm/kernel/process.c linux-3.3.8-vs2.3.3.4/arch/arm/kernel/process.c +--- linux-3.3.8/arch/arm/kernel/process.c 2012-03-19 19:46:28.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/arm/kernel/process.c 2012-02-24 03:55:06.000000000 +0100 +@@ -353,7 +353,8 @@ void __show_regs(struct pt_regs *regs) + void show_regs(struct pt_regs * regs) + { + printk("\n"); +- printk("Pid: %d, comm: %20s\n", task_pid_nr(current), current->comm); ++ printk("Pid: %d[#%u], comm: %20s\n", ++ task_pid_nr(current), 
current->xid, current->comm); + __show_regs(regs); + dump_stack(); + } +diff -NurpP --minimal linux-3.3.8/arch/arm/kernel/traps.c linux-3.3.8-vs2.3.3.4/arch/arm/kernel/traps.c +--- linux-3.3.8/arch/arm/kernel/traps.c 2012-06-08 15:23:43.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/arch/arm/kernel/traps.c 2012-06-08 15:27:44.000000000 +0200 +@@ -244,8 +244,8 @@ static int __die(const char *str, int er + + print_modules(); + __show_regs(regs); +- printk(KERN_EMERG "Process %.*s (pid: %d, stack limit = 0x%p)\n", +- TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1); ++ printk(KERN_EMERG "Process %.*s (pid: %d:#%u, stack limit = 0x%p)\n", ++ TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), tsk->xid, thread + 1); + + if (!user_mode(regs) || in_interrupt()) { + dump_mem(KERN_EMERG, "Stack: ", regs->ARM_sp, +diff -NurpP --minimal linux-3.3.8/arch/cris/Kconfig linux-3.3.8-vs2.3.3.4/arch/cris/Kconfig +--- linux-3.3.8/arch/cris/Kconfig 2012-03-19 19:46:39.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/cris/Kconfig 2012-02-24 03:55:06.000000000 +0100 +@@ -675,6 +675,8 @@ source "drivers/staging/Kconfig" + + source "arch/cris/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.3.8/arch/frv/kernel/kernel_thread.S linux-3.3.8-vs2.3.3.4/arch/frv/kernel/kernel_thread.S +--- linux-3.3.8/arch/frv/kernel/kernel_thread.S 2008-12-25 00:26:37.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/frv/kernel/kernel_thread.S 2012-02-24 03:55:06.000000000 +0100 +@@ -37,7 +37,7 @@ kernel_thread: + + # start by forking the current process, but with shared VM + setlos.p #__NR_clone,gr7 ; syscall number +- ori gr10,#CLONE_VM,gr8 ; first syscall arg [clone_flags] ++ ori gr10,#CLONE_KT,gr8 ; first syscall arg [clone_flags] + sethi.p #0xe4e4,gr9 ; second syscall arg [newsp] + setlo #0xe4e4,gr9 + setlos.p #0,gr10 ; third syscall arg [parent_tidptr] +diff -NurpP --minimal linux-3.3.8/arch/h8300/Kconfig linux-3.3.8-vs2.3.3.4/arch/h8300/Kconfig +--- linux-3.3.8/arch/h8300/Kconfig 2012-03-19 19:46:39.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/h8300/Kconfig 2012-02-24 03:55:06.000000000 +0100 +@@ -214,6 +214,8 @@ source "fs/Kconfig" + + source "arch/h8300/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.3.8/arch/ia64/Kconfig linux-3.3.8-vs2.3.3.4/arch/ia64/Kconfig +--- linux-3.3.8/arch/ia64/Kconfig 2012-03-19 19:46:39.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/ia64/Kconfig 2012-02-24 03:55:06.000000000 +0100 +@@ -654,6 +654,8 @@ source "fs/Kconfig" + + source "arch/ia64/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.3.8/arch/ia64/kernel/entry.S linux-3.3.8-vs2.3.3.4/arch/ia64/kernel/entry.S +--- linux-3.3.8/arch/ia64/kernel/entry.S 2012-03-19 19:46:40.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/ia64/kernel/entry.S 2012-02-24 03:55:06.000000000 +0100 +@@ -1714,7 +1714,7 @@ sys_call_table: + data8 sys_mq_notify + data8 sys_mq_getsetattr + data8 sys_kexec_load +- data8 sys_ni_syscall // reserved for vserver ++ data8 sys_vserver + data8 sys_waitid // 1270 + data8 sys_add_key + data8 sys_request_key +diff -NurpP --minimal linux-3.3.8/arch/ia64/kernel/process.c linux-3.3.8-vs2.3.3.4/arch/ia64/kernel/process.c +--- linux-3.3.8/arch/ia64/kernel/process.c 2011-03-15 18:06:39.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/ia64/kernel/process.c 2012-02-24 
03:55:06.000000000 +0100 +@@ -109,8 +109,8 @@ show_regs (struct pt_regs *regs) + unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; + + print_modules(); +- printk("\nPid: %d, CPU %d, comm: %20s\n", task_pid_nr(current), +- smp_processor_id(), current->comm); ++ printk("\nPid: %d[#%u], CPU %d, comm: %20s\n", task_pid_nr(current), ++ current->xid, smp_processor_id(), current->comm); + printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s (%s)\n", + regs->cr_ipsr, regs->cr_ifs, ip, print_tainted(), + init_utsname()->release); +diff -NurpP --minimal linux-3.3.8/arch/ia64/kernel/ptrace.c linux-3.3.8-vs2.3.3.4/arch/ia64/kernel/ptrace.c +--- linux-3.3.8/arch/ia64/kernel/ptrace.c 2012-03-19 19:46:40.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/ia64/kernel/ptrace.c 2012-02-24 03:55:06.000000000 +0100 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + #include + #include +diff -NurpP --minimal linux-3.3.8/arch/ia64/kernel/traps.c linux-3.3.8-vs2.3.3.4/arch/ia64/kernel/traps.c +--- linux-3.3.8/arch/ia64/kernel/traps.c 2010-07-07 18:31:01.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/arch/ia64/kernel/traps.c 2012-02-24 03:55:06.000000000 +0100 +@@ -59,8 +59,9 @@ die (const char *str, struct pt_regs *re + put_cpu(); + + if (++die.lock_owner_depth < 3) { +- printk("%s[%d]: %s %ld [%d]\n", +- current->comm, task_pid_nr(current), str, err, ++die_counter); ++ printk("%s[%d[#%u]]: %s %ld [%d]\n", ++ current->comm, task_pid_nr(current), current->xid, ++ str, err, ++die_counter); + if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) + != NOTIFY_STOP) + show_regs(regs); +@@ -323,8 +324,9 @@ handle_fpu_swa (int fp_fault, struct pt_ + if ((last.count & 15) < 5 && (ia64_fetchadd(1, &last.count, acq) & 15) < 5) { + last.time = current_jiffies + 5 * HZ; + printk(KERN_WARNING +- "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n", +- current->comm, task_pid_nr(current), regs->cr_iip + ia64_psr(regs)->ri, isr); ++ "%s(%d[#%u]): floating-point assist fault at ip %016lx, isr %016lx\n", ++ current->comm, task_pid_nr(current), current->xid, ++ regs->cr_iip + ia64_psr(regs)->ri, isr); + } + } + } +diff -NurpP --minimal linux-3.3.8/arch/m32r/kernel/traps.c linux-3.3.8-vs2.3.3.4/arch/m32r/kernel/traps.c +--- linux-3.3.8/arch/m32r/kernel/traps.c 2011-10-24 18:44:58.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/arch/m32r/kernel/traps.c 2012-02-24 03:55:06.000000000 +0100 +@@ -196,8 +196,9 @@ static void show_registers(struct pt_reg + } else { + printk("SPI: %08lx\n", sp); + } +- printk("Process %s (pid: %d, process nr: %d, stackpage=%08lx)", +- current->comm, task_pid_nr(current), 0xffff & i, 4096+(unsigned long)current); ++ printk("Process %s (pid: %d[#%u], process nr: %d, stackpage=%08lx)", ++ current->comm, task_pid_nr(current), current->xid, ++ 0xffff & i, 4096+(unsigned long)current); + + /* + * When in-kernel, we also print out the stack and code at the +diff -NurpP --minimal linux-3.3.8/arch/m68k/Kconfig linux-3.3.8-vs2.3.3.4/arch/m68k/Kconfig +--- linux-3.3.8/arch/m68k/Kconfig 2012-03-19 19:46:40.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/m68k/Kconfig 2012-02-24 03:55:06.000000000 +0100 +@@ -145,6 +145,8 @@ source "fs/Kconfig" + + source "arch/m68k/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.3.8/arch/mips/Kconfig linux-3.3.8-vs2.3.3.4/arch/mips/Kconfig +--- linux-3.3.8/arch/mips/Kconfig 2012-03-19 19:46:41.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/mips/Kconfig 2012-02-24 
03:55:06.000000000 +0100 +@@ -2514,6 +2514,8 @@ source "fs/Kconfig" + + source "arch/mips/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.3.8/arch/mips/kernel/ptrace.c linux-3.3.8-vs2.3.3.4/arch/mips/kernel/ptrace.c +--- linux-3.3.8/arch/mips/kernel/ptrace.c 2012-03-19 19:46:43.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/mips/kernel/ptrace.c 2012-02-24 03:55:06.000000000 +0100 +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -263,6 +264,9 @@ long arch_ptrace(struct task_struct *chi + void __user *datavp = (void __user *) data; + unsigned long __user *datalp = (void __user *) data; + ++ if (!vx_check(vx_task_xid(child), VS_WATCH_P | VS_IDENT)) ++ goto out; ++ + switch (request) { + /* when I and D space are separate, these will need to be fixed. */ + case PTRACE_PEEKTEXT: /* read word at location addr. */ +diff -NurpP --minimal linux-3.3.8/arch/mips/kernel/scall32-o32.S linux-3.3.8-vs2.3.3.4/arch/mips/kernel/scall32-o32.S +--- linux-3.3.8/arch/mips/kernel/scall32-o32.S 2012-01-09 16:14:05.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/mips/kernel/scall32-o32.S 2012-02-24 03:55:06.000000000 +0100 +@@ -523,7 +523,7 @@ einval: li v0, -ENOSYS + sys sys_mq_timedreceive 5 + sys sys_mq_notify 2 /* 4275 */ + sys sys_mq_getsetattr 3 +- sys sys_ni_syscall 0 /* sys_vserver */ ++ sys sys_vserver 3 + sys sys_waitid 5 + sys sys_ni_syscall 0 /* available, was setaltroot */ + sys sys_add_key 5 /* 4280 */ +diff -NurpP --minimal linux-3.3.8/arch/mips/kernel/scall64-64.S linux-3.3.8-vs2.3.3.4/arch/mips/kernel/scall64-64.S +--- linux-3.3.8/arch/mips/kernel/scall64-64.S 2012-01-09 16:14:05.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/mips/kernel/scall64-64.S 2012-02-24 03:55:06.000000000 +0100 +@@ -362,7 +362,7 @@ sys_call_table: + PTR sys_mq_timedreceive + PTR sys_mq_notify + PTR sys_mq_getsetattr /* 5235 */ +- PTR sys_ni_syscall /* sys_vserver */ ++ PTR sys_vserver + PTR sys_waitid + PTR sys_ni_syscall /* available, was setaltroot */ + PTR sys_add_key +diff -NurpP --minimal linux-3.3.8/arch/mips/kernel/scall64-n32.S linux-3.3.8-vs2.3.3.4/arch/mips/kernel/scall64-n32.S +--- linux-3.3.8/arch/mips/kernel/scall64-n32.S 2012-01-09 16:14:05.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/mips/kernel/scall64-n32.S 2012-02-24 03:55:06.000000000 +0100 +@@ -361,7 +361,7 @@ EXPORT(sysn32_call_table) + PTR compat_sys_mq_timedreceive + PTR compat_sys_mq_notify + PTR compat_sys_mq_getsetattr +- PTR sys_ni_syscall /* 6240, sys_vserver */ ++ PTR sys32_vserver /* 6240 */ + PTR compat_sys_waitid + PTR sys_ni_syscall /* available, was setaltroot */ + PTR sys_add_key +diff -NurpP --minimal linux-3.3.8/arch/mips/kernel/scall64-o32.S linux-3.3.8-vs2.3.3.4/arch/mips/kernel/scall64-o32.S +--- linux-3.3.8/arch/mips/kernel/scall64-o32.S 2012-01-09 16:14:05.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/mips/kernel/scall64-o32.S 2012-02-24 03:55:06.000000000 +0100 +@@ -480,7 +480,7 @@ sys_call_table: + PTR compat_sys_mq_timedreceive + PTR compat_sys_mq_notify /* 4275 */ + PTR compat_sys_mq_getsetattr +- PTR sys_ni_syscall /* sys_vserver */ ++ PTR sys32_vserver + PTR sys_32_waitid + PTR sys_ni_syscall /* available, was setaltroot */ + PTR sys_add_key /* 4280 */ +diff -NurpP --minimal linux-3.3.8/arch/mips/kernel/traps.c linux-3.3.8-vs2.3.3.4/arch/mips/kernel/traps.c +--- linux-3.3.8/arch/mips/kernel/traps.c 2012-03-19 19:46:43.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/mips/kernel/traps.c 
2012-03-19 20:52:09.000000000 +0100 +@@ -344,9 +344,10 @@ void show_registers(struct pt_regs *regs + + __show_regs(regs); + print_modules(); +- printk("Process %s (pid: %d, threadinfo=%p, task=%p, tls=%0*lx)\n", +- current->comm, current->pid, current_thread_info(), current, +- field, current_thread_info()->tp_value); ++ printk("Process %s (pid: %d:#%u, threadinfo=%p, task=%p, tls=%0*lx)\n", ++ current->comm, task_pid_nr(current), current->xid, ++ current_thread_info(), current, ++ field, current_thread_info()->tp_value); + if (cpu_has_userlocal) { + unsigned long tls; + +diff -NurpP --minimal linux-3.3.8/arch/parisc/Kconfig linux-3.3.8-vs2.3.3.4/arch/parisc/Kconfig +--- linux-3.3.8/arch/parisc/Kconfig 2012-03-19 19:46:44.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/parisc/Kconfig 2012-02-24 03:55:06.000000000 +0100 +@@ -279,6 +279,8 @@ source "fs/Kconfig" + + source "arch/parisc/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.3.8/arch/parisc/kernel/syscall_table.S linux-3.3.8-vs2.3.3.4/arch/parisc/kernel/syscall_table.S +--- linux-3.3.8/arch/parisc/kernel/syscall_table.S 2011-10-24 18:45:00.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/arch/parisc/kernel/syscall_table.S 2012-02-24 03:55:06.000000000 +0100 +@@ -361,7 +361,7 @@ + ENTRY_COMP(mbind) /* 260 */ + ENTRY_COMP(get_mempolicy) + ENTRY_COMP(set_mempolicy) +- ENTRY_SAME(ni_syscall) /* 263: reserved for vserver */ ++ ENTRY_DIFF(vserver) + ENTRY_SAME(add_key) + ENTRY_SAME(request_key) /* 265 */ + ENTRY_SAME(keyctl) +diff -NurpP --minimal linux-3.3.8/arch/parisc/kernel/traps.c linux-3.3.8-vs2.3.3.4/arch/parisc/kernel/traps.c +--- linux-3.3.8/arch/parisc/kernel/traps.c 2011-10-24 18:45:00.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/arch/parisc/kernel/traps.c 2012-02-24 03:55:06.000000000 +0100 +@@ -236,8 +236,9 @@ void die_if_kernel(char *str, struct pt_ + if (err == 0) + return; /* STFU */ + +- printk(KERN_CRIT "%s (pid %d): %s (code %ld) at " RFMT "\n", +- current->comm, task_pid_nr(current), str, err, regs->iaoq[0]); ++ printk(KERN_CRIT "%s (pid %d:#%u): %s (code %ld) at " RFMT "\n", ++ current->comm, task_pid_nr(current), current->xid, ++ str, err, regs->iaoq[0]); + #ifdef PRINT_USER_FAULTS + /* XXX for debugging only */ + show_regs(regs); +@@ -270,8 +271,8 @@ void die_if_kernel(char *str, struct pt_ + pdc_console_restart(); + + if (err) +- printk(KERN_CRIT "%s (pid %d): %s (code %ld)\n", +- current->comm, task_pid_nr(current), str, err); ++ printk(KERN_CRIT "%s (pid %d:#%u): %s (code %ld)\n", ++ current->comm, task_pid_nr(current), current->xid, str, err); + + /* Wot's wrong wif bein' racy? 
*/ + if (current->thread.flags & PARISC_KERNEL_DEATH) { +diff -NurpP --minimal linux-3.3.8/arch/parisc/mm/fault.c linux-3.3.8-vs2.3.3.4/arch/parisc/mm/fault.c +--- linux-3.3.8/arch/parisc/mm/fault.c 2010-08-02 16:52:06.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/arch/parisc/mm/fault.c 2012-02-24 03:55:06.000000000 +0100 +@@ -237,8 +237,9 @@ bad_area: + + #ifdef PRINT_USER_FAULTS + printk(KERN_DEBUG "\n"); +- printk(KERN_DEBUG "do_page_fault() pid=%d command='%s' type=%lu address=0x%08lx\n", +- task_pid_nr(tsk), tsk->comm, code, address); ++ printk(KERN_DEBUG "do_page_fault() pid=%d:#%u " ++ "command='%s' type=%lu address=0x%08lx\n", ++ task_pid_nr(tsk), tsk->xid, tsk->comm, code, address); + if (vma) { + printk(KERN_DEBUG "vm_start = 0x%08lx, vm_end = 0x%08lx\n", + vma->vm_start, vma->vm_end); +diff -NurpP --minimal linux-3.3.8/arch/powerpc/Kconfig linux-3.3.8-vs2.3.3.4/arch/powerpc/Kconfig +--- linux-3.3.8/arch/powerpc/Kconfig 2012-03-19 19:46:44.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/powerpc/Kconfig 2012-02-24 03:55:06.000000000 +0100 +@@ -997,6 +997,8 @@ source "lib/Kconfig" + + source "arch/powerpc/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + config KEYS_COMPAT +diff -NurpP --minimal linux-3.3.8/arch/powerpc/include/asm/unistd.h linux-3.3.8-vs2.3.3.4/arch/powerpc/include/asm/unistd.h +--- linux-3.3.8/arch/powerpc/include/asm/unistd.h 2012-01-09 16:14:05.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/powerpc/include/asm/unistd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -275,7 +275,7 @@ + #endif + #define __NR_rtas 255 + #define __NR_sys_debug_setcontext 256 +-/* Number 257 is reserved for vserver */ ++#define __NR_vserver 257 + #define __NR_migrate_pages 258 + #define __NR_mbind 259 + #define __NR_get_mempolicy 260 +diff -NurpP --minimal linux-3.3.8/arch/powerpc/kernel/process.c linux-3.3.8-vs2.3.3.4/arch/powerpc/kernel/process.c +--- linux-3.3.8/arch/powerpc/kernel/process.c 2012-03-19 19:46:45.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/powerpc/kernel/process.c 2012-02-24 03:55:06.000000000 +0100 +@@ -656,8 +656,9 @@ void show_regs(struct pt_regs * regs) + #else + printk("DAR: "REG", DSISR: %08lx\n", regs->dar, regs->dsisr); + #endif +- printk("TASK = %p[%d] '%s' THREAD: %p", +- current, task_pid_nr(current), current->comm, task_thread_info(current)); ++ printk("TASK = %p[%d,#%u] '%s' THREAD: %p", ++ current, task_pid_nr(current), current->xid, ++ current->comm, task_thread_info(current)); + + #ifdef CONFIG_SMP + printk(" CPU: %d", raw_smp_processor_id()); +diff -NurpP --minimal linux-3.3.8/arch/powerpc/kernel/traps.c linux-3.3.8-vs2.3.3.4/arch/powerpc/kernel/traps.c +--- linux-3.3.8/arch/powerpc/kernel/traps.c 2012-03-19 19:46:45.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/powerpc/kernel/traps.c 2012-02-24 03:55:06.000000000 +0100 +@@ -1105,8 +1105,9 @@ void nonrecoverable_exception(struct pt_ + + void trace_syscall(struct pt_regs *regs) + { +- printk("Task: %p(%d), PC: %08lX/%08lX, Syscall: %3ld, Result: %s%ld %s\n", +- current, task_pid_nr(current), regs->nip, regs->link, regs->gpr[0], ++ printk("Task: %p(%d[#%u]), PC: %08lX/%08lX, Syscall: %3ld, Result: %s%ld %s\n", ++ current, task_pid_nr(current), current->xid, ++ regs->nip, regs->link, regs->gpr[0], + regs->ccr&0x10000000?"Error=":"", regs->gpr[3], print_tainted()); + } + +diff -NurpP --minimal linux-3.3.8/arch/s390/Kconfig linux-3.3.8-vs2.3.3.4/arch/s390/Kconfig +--- linux-3.3.8/arch/s390/Kconfig 2012-06-08 15:23:43.000000000 +0200 ++++ 
linux-3.3.8-vs2.3.3.4/arch/s390/Kconfig 2012-04-23 23:45:14.000000000 +0200 +@@ -637,6 +637,8 @@ source "fs/Kconfig" + + source "arch/s390/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.3.8/arch/s390/include/asm/tlb.h linux-3.3.8-vs2.3.3.4/arch/s390/include/asm/tlb.h +--- linux-3.3.8/arch/s390/include/asm/tlb.h 2012-06-08 15:23:43.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/arch/s390/include/asm/tlb.h 2012-04-23 23:45:14.000000000 +0200 +@@ -24,6 +24,7 @@ + #include + #include + #include ++ + #include + #include + #include +diff -NurpP --minimal linux-3.3.8/arch/s390/include/asm/unistd.h linux-3.3.8-vs2.3.3.4/arch/s390/include/asm/unistd.h +--- linux-3.3.8/arch/s390/include/asm/unistd.h 2012-03-19 19:46:48.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/s390/include/asm/unistd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -202,7 +202,7 @@ + #define __NR_clock_gettime (__NR_timer_create+6) + #define __NR_clock_getres (__NR_timer_create+7) + #define __NR_clock_nanosleep (__NR_timer_create+8) +-/* Number 263 is reserved for vserver */ ++#define __NR_vserver 263 + #define __NR_statfs64 265 + #define __NR_fstatfs64 266 + #define __NR_remap_file_pages 267 +diff -NurpP --minimal linux-3.3.8/arch/s390/kernel/ptrace.c linux-3.3.8-vs2.3.3.4/arch/s390/kernel/ptrace.c +--- linux-3.3.8/arch/s390/kernel/ptrace.c 2012-03-19 19:46:48.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/s390/kernel/ptrace.c 2012-03-19 20:53:54.000000000 +0100 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + #include + #include + #include +diff -NurpP --minimal linux-3.3.8/arch/s390/kernel/syscalls.S linux-3.3.8-vs2.3.3.4/arch/s390/kernel/syscalls.S +--- linux-3.3.8/arch/s390/kernel/syscalls.S 2012-01-09 16:14:06.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/s390/kernel/syscalls.S 2012-02-24 03:55:06.000000000 +0100 +@@ -271,7 +271,7 @@ SYSCALL(sys_clock_settime,sys_clock_sett + SYSCALL(sys_clock_gettime,sys_clock_gettime,sys32_clock_gettime_wrapper) /* 260 */ + SYSCALL(sys_clock_getres,sys_clock_getres,sys32_clock_getres_wrapper) + SYSCALL(sys_clock_nanosleep,sys_clock_nanosleep,sys32_clock_nanosleep_wrapper) +-NI_SYSCALL /* reserved for vserver */ ++SYSCALL(sys_vserver,sys_vserver,sys32_vserver) + SYSCALL(sys_s390_fadvise64_64,sys_ni_syscall,sys32_fadvise64_64_wrapper) + SYSCALL(sys_statfs64,sys_statfs64,compat_sys_statfs64_wrapper) + SYSCALL(sys_fstatfs64,sys_fstatfs64,compat_sys_fstatfs64_wrapper) +diff -NurpP --minimal linux-3.3.8/arch/sh/Kconfig linux-3.3.8-vs2.3.3.4/arch/sh/Kconfig +--- linux-3.3.8/arch/sh/Kconfig 2012-03-19 19:46:49.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/sh/Kconfig 2012-02-24 03:55:06.000000000 +0100 +@@ -901,6 +901,8 @@ source "fs/Kconfig" + + source "arch/sh/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.3.8/arch/sh/kernel/irq.c linux-3.3.8-vs2.3.3.4/arch/sh/kernel/irq.c +--- linux-3.3.8/arch/sh/kernel/irq.c 2011-07-22 11:17:41.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/arch/sh/kernel/irq.c 2012-02-24 03:55:06.000000000 +0100 +@@ -14,6 +14,7 @@ + #include + #include + #include ++// #include + #include + #include + #include +diff -NurpP --minimal linux-3.3.8/arch/sparc/Kconfig linux-3.3.8-vs2.3.3.4/arch/sparc/Kconfig +--- linux-3.3.8/arch/sparc/Kconfig 2012-06-08 15:23:43.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/arch/sparc/Kconfig 2012-06-08 15:27:44.000000000 +0200 +@@ -597,6 +597,8 @@ source 
"fs/Kconfig" + + source "arch/sparc/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.3.8/arch/sparc/include/asm/unistd.h linux-3.3.8-vs2.3.3.4/arch/sparc/include/asm/unistd.h +--- linux-3.3.8/arch/sparc/include/asm/unistd.h 2012-01-09 16:14:07.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/sparc/include/asm/unistd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -335,7 +335,7 @@ + #define __NR_timer_getoverrun 264 + #define __NR_timer_delete 265 + #define __NR_timer_create 266 +-/* #define __NR_vserver 267 Reserved for VSERVER */ ++#define __NR_vserver 267 + #define __NR_io_setup 268 + #define __NR_io_destroy 269 + #define __NR_io_submit 270 +diff -NurpP --minimal linux-3.3.8/arch/sparc/kernel/systbls_32.S linux-3.3.8-vs2.3.3.4/arch/sparc/kernel/systbls_32.S +--- linux-3.3.8/arch/sparc/kernel/systbls_32.S 2012-01-09 16:14:09.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/sparc/kernel/systbls_32.S 2012-02-24 03:55:06.000000000 +0100 +@@ -70,7 +70,7 @@ sys_call_table: + /*250*/ .long sys_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_ni_syscall + /*255*/ .long sys_sync_file_range, sys_clock_settime, sys_clock_gettime, sys_clock_getres, sys_clock_nanosleep + /*260*/ .long sys_sched_getaffinity, sys_sched_setaffinity, sys_timer_settime, sys_timer_gettime, sys_timer_getoverrun +-/*265*/ .long sys_timer_delete, sys_timer_create, sys_nis_syscall, sys_io_setup, sys_io_destroy ++/*265*/ .long sys_timer_delete, sys_timer_create, sys_vserver, sys_io_setup, sys_io_destroy + /*270*/ .long sys_io_submit, sys_io_cancel, sys_io_getevents, sys_mq_open, sys_mq_unlink + /*275*/ .long sys_mq_timedsend, sys_mq_timedreceive, sys_mq_notify, sys_mq_getsetattr, sys_waitid + /*280*/ .long sys_tee, sys_add_key, sys_request_key, sys_keyctl, sys_openat +diff -NurpP --minimal linux-3.3.8/arch/sparc/kernel/systbls_64.S linux-3.3.8-vs2.3.3.4/arch/sparc/kernel/systbls_64.S +--- linux-3.3.8/arch/sparc/kernel/systbls_64.S 2012-06-08 15:23:43.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/arch/sparc/kernel/systbls_64.S 2012-06-08 15:27:44.000000000 +0200 +@@ -71,7 +71,7 @@ sys_call_table32: + /*250*/ .word sys_mremap, compat_sys_sysctl, sys32_getsid, sys_fdatasync, sys_nis_syscall + .word sys32_sync_file_range, compat_sys_clock_settime, compat_sys_clock_gettime, compat_sys_clock_getres, sys32_clock_nanosleep + /*260*/ .word compat_sys_sched_getaffinity, compat_sys_sched_setaffinity, sys32_timer_settime, compat_sys_timer_gettime, sys_timer_getoverrun +- .word sys_timer_delete, compat_sys_timer_create, sys_ni_syscall, compat_sys_io_setup, sys_io_destroy ++ .word sys_timer_delete, compat_sys_timer_create, sys32_vserver, compat_sys_io_setup, sys_io_destroy + /*270*/ .word sys32_io_submit, sys_io_cancel, compat_sys_io_getevents, sys32_mq_open, sys_mq_unlink + .word compat_sys_mq_timedsend, compat_sys_mq_timedreceive, compat_sys_mq_notify, compat_sys_mq_getsetattr, compat_sys_waitid + /*280*/ .word sys32_tee, sys_add_key, sys_request_key, compat_sys_keyctl, compat_sys_openat +@@ -148,7 +148,7 @@ sys_call_table: + /*250*/ .word sys_64_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nis_syscall + .word sys_sync_file_range, sys_clock_settime, sys_clock_gettime, sys_clock_getres, sys_clock_nanosleep + /*260*/ .word sys_sched_getaffinity, sys_sched_setaffinity, sys_timer_settime, sys_timer_gettime, sys_timer_getoverrun +- .word sys_timer_delete, sys_timer_create, sys_ni_syscall, sys_io_setup, sys_io_destroy ++ .word sys_timer_delete, 
sys_timer_create, sys_vserver, sys_io_setup, sys_io_destroy + /*270*/ .word sys_io_submit, sys_io_cancel, sys_io_getevents, sys_mq_open, sys_mq_unlink + .word sys_mq_timedsend, sys_mq_timedreceive, sys_mq_notify, sys_mq_getsetattr, sys_waitid + /*280*/ .word sys_tee, sys_add_key, sys_request_key, sys_keyctl, sys_openat +diff -NurpP --minimal linux-3.3.8/arch/um/Kconfig.rest linux-3.3.8-vs2.3.3.4/arch/um/Kconfig.rest +--- linux-3.3.8/arch/um/Kconfig.rest 2012-01-09 16:14:09.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/um/Kconfig.rest 2012-02-24 03:55:06.000000000 +0100 +@@ -12,6 +12,8 @@ source "arch/um/Kconfig.net" + + source "fs/Kconfig" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.3.8/arch/um/include/shared/kern_constants.h linux-3.3.8-vs2.3.3.4/arch/um/include/shared/kern_constants.h +--- linux-3.3.8/arch/um/include/shared/kern_constants.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/um/include/shared/kern_constants.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1 @@ ++#include "../../../../include/generated/asm-offsets.h" +diff -NurpP --minimal linux-3.3.8/arch/um/include/shared/user_constants.h linux-3.3.8-vs2.3.3.4/arch/um/include/shared/user_constants.h +--- linux-3.3.8/arch/um/include/shared/user_constants.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/um/include/shared/user_constants.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,40 @@ ++/* ++ * DO NOT MODIFY. ++ * ++ * This file was generated by arch/um/Makefile ++ * ++ */ ++ ++#define HOST_SC_CR2 176 /* offsetof(struct sigcontext, cr2) # */ ++#define HOST_SC_ERR 152 /* offsetof(struct sigcontext, err) # */ ++#define HOST_SC_TRAPNO 160 /* offsetof(struct sigcontext, trapno) # */ ++#define HOST_FP_SIZE 64 /* sizeof(struct _fpstate) / sizeof(unsigned long) # */ ++#define HOST_RBX 5 /* RBX # */ ++#define HOST_RCX 11 /* RCX # */ ++#define HOST_RDI 14 /* RDI # */ ++#define HOST_RSI 13 /* RSI # */ ++#define HOST_RDX 12 /* RDX # */ ++#define HOST_RBP 4 /* RBP # */ ++#define HOST_RAX 10 /* RAX # */ ++#define HOST_R8 9 /* R8 # */ ++#define HOST_R9 8 /* R9 # */ ++#define HOST_R10 7 /* R10 # */ ++#define HOST_R11 6 /* R11 # */ ++#define HOST_R12 3 /* R12 # */ ++#define HOST_R13 2 /* R13 # */ ++#define HOST_R14 1 /* R14 # */ ++#define HOST_R15 0 /* R15 # */ ++#define HOST_ORIG_RAX 15 /* ORIG_RAX # */ ++#define HOST_CS 17 /* CS # */ ++#define HOST_SS 20 /* SS # */ ++#define HOST_EFLAGS 18 /* EFLAGS # */ ++#define HOST_IP 16 /* RIP # */ ++#define HOST_SP 19 /* RSP # */ ++#define UM_FRAME_SIZE 216 /* sizeof(struct user_regs_struct) # */ ++#define UM_POLLIN 1 /* POLLIN # */ ++#define UM_POLLPRI 2 /* POLLPRI # */ ++#define UM_POLLOUT 4 /* POLLOUT # */ ++#define UM_PROT_READ 1 /* PROT_READ # */ ++#define UM_PROT_WRITE 2 /* PROT_WRITE # */ ++#define UM_PROT_EXEC 4 /* PROT_EXEC # */ ++ +diff -NurpP --minimal linux-3.3.8/arch/x86/Kconfig linux-3.3.8-vs2.3.3.4/arch/x86/Kconfig +--- linux-3.3.8/arch/x86/Kconfig 2012-03-19 19:46:49.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/x86/Kconfig 2012-02-24 03:55:06.000000000 +0100 +@@ -2213,6 +2213,8 @@ source "fs/Kconfig" + + source "arch/x86/Kconfig.debug" + ++source "kernel/vserver/Kconfig" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff -NurpP --minimal linux-3.3.8/arch/x86/syscalls/syscall_32.tbl linux-3.3.8-vs2.3.3.4/arch/x86/syscalls/syscall_32.tbl +--- linux-3.3.8/arch/x86/syscalls/syscall_32.tbl 2012-06-08 15:23:44.000000000 +0200 ++++ 
linux-3.3.8-vs2.3.3.4/arch/x86/syscalls/syscall_32.tbl 2012-04-03 03:02:12.000000000 +0200 +@@ -279,7 +279,7 @@ + 270 i386 tgkill sys_tgkill + 271 i386 utimes sys_utimes compat_sys_utimes + 272 i386 fadvise64_64 sys_fadvise64_64 sys32_fadvise64_64 +-273 i386 vserver ++273 i386 vserver sys_vserver sys32_vserver + 274 i386 mbind sys_mbind + 275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy + 276 i386 set_mempolicy sys_set_mempolicy +diff -NurpP --minimal linux-3.3.8/arch/x86/syscalls/syscall_64.tbl linux-3.3.8-vs2.3.3.4/arch/x86/syscalls/syscall_64.tbl +--- linux-3.3.8/arch/x86/syscalls/syscall_64.tbl 2012-03-19 19:46:51.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/arch/x86/syscalls/syscall_64.tbl 2012-02-24 16:29:25.000000000 +0100 +@@ -242,7 +242,7 @@ + 233 64 epoll_ctl sys_epoll_ctl + 234 64 tgkill sys_tgkill + 235 64 utimes sys_utimes +-236 64 vserver ++236 64 vserver sys_vserver + 237 64 mbind sys_mbind + 238 64 set_mempolicy sys_set_mempolicy + 239 64 get_mempolicy sys_get_mempolicy +diff -NurpP --minimal linux-3.3.8/drivers/block/Kconfig linux-3.3.8-vs2.3.3.4/drivers/block/Kconfig +--- linux-3.3.8/drivers/block/Kconfig 2012-03-19 19:46:52.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/drivers/block/Kconfig 2012-02-24 03:55:06.000000000 +0100 +@@ -290,6 +290,13 @@ config BLK_DEV_CRYPTOLOOP + + source "drivers/block/drbd/Kconfig" + ++config BLK_DEV_VROOT ++ tristate "Virtual Root device support" ++ depends on QUOTACTL ++ ---help--- ++ Saying Y here will allow you to use quota/fs ioctls on a shared ++ partition within a virtual server without compromising security. ++ + config BLK_DEV_NBD + tristate "Network block device support" + depends on NET +diff -NurpP --minimal linux-3.3.8/drivers/block/Makefile linux-3.3.8-vs2.3.3.4/drivers/block/Makefile +--- linux-3.3.8/drivers/block/Makefile 2012-03-19 19:46:52.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/drivers/block/Makefile 2012-02-24 03:55:06.000000000 +0100 +@@ -35,6 +35,7 @@ obj-$(CONFIG_VIODASD) += viodasd.o + obj-$(CONFIG_BLK_DEV_SX8) += sx8.o + obj-$(CONFIG_BLK_DEV_UB) += ub.o + obj-$(CONFIG_BLK_DEV_HD) += hd.o ++obj-$(CONFIG_BLK_DEV_VROOT) += vroot.o + + obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o + obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/ +diff -NurpP --minimal linux-3.3.8/drivers/block/loop.c linux-3.3.8-vs2.3.3.4/drivers/block/loop.c +--- linux-3.3.8/drivers/block/loop.c 2012-03-19 19:46:52.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/drivers/block/loop.c 2012-02-24 03:55:06.000000000 +0100 +@@ -76,6 +76,7 @@ + #include + #include + #include ++#include + + #include + +@@ -869,6 +870,7 @@ static int loop_set_fd(struct loop_devic + lo->lo_blocksize = lo_blocksize; + lo->lo_device = bdev; + lo->lo_flags = lo_flags; ++ lo->lo_xid = vx_current_xid(); + lo->lo_backing_file = file; + lo->transfer = transfer_none; + lo->ioctl = NULL; +@@ -1001,6 +1003,7 @@ static int loop_clr_fd(struct loop_devic + lo->lo_sizelimit = 0; + lo->lo_encrypt_key_size = 0; + lo->lo_thread = NULL; ++ lo->lo_xid = 0; + memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); + memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); + memset(lo->lo_file_name, 0, LO_NAME_SIZE); +@@ -1042,7 +1045,7 @@ loop_set_status(struct loop_device *lo, + + if (lo->lo_encrypt_key_size && + lo->lo_key_owner != uid && +- !capable(CAP_SYS_ADMIN)) ++ !vx_capable(CAP_SYS_ADMIN, VXC_ADMIN_CLOOP)) + return -EPERM; + if (lo->lo_state != Lo_bound) + return -ENXIO; +@@ -1132,7 +1135,8 @@ loop_get_status(struct loop_device *lo, + memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE); 
+ info->lo_encrypt_type = + lo->lo_encryption ? lo->lo_encryption->number : 0; +- if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) { ++ if (lo->lo_encrypt_key_size && ++ vx_capable(CAP_SYS_ADMIN, VXC_ADMIN_CLOOP)) { + info->lo_encrypt_key_size = lo->lo_encrypt_key_size; + memcpy(info->lo_encrypt_key, lo->lo_encrypt_key, + lo->lo_encrypt_key_size); +@@ -1492,6 +1496,11 @@ static int lo_open(struct block_device * + goto out; + } + ++ if (!vx_check(lo->lo_xid, VS_IDENT|VS_HOSTID|VS_ADMIN_P)) { ++ err = -EACCES; ++ goto out; ++ } ++ + mutex_lock(&lo->lo_ctl_mutex); + lo->lo_refcnt++; + mutex_unlock(&lo->lo_ctl_mutex); +diff -NurpP --minimal linux-3.3.8/drivers/block/vroot.c linux-3.3.8-vs2.3.3.4/drivers/block/vroot.c +--- linux-3.3.8/drivers/block/vroot.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/drivers/block/vroot.c 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,291 @@ ++/* ++ * linux/drivers/block/vroot.c ++ * ++ * written by Herbert Pötzl, 9/11/2002 ++ * ported to 2.6.10 by Herbert Pötzl, 30/12/2004 ++ * ++ * based on the loop.c code by Theodore Ts'o. ++ * ++ * Copyright (C) 2002-2007 by Herbert Pötzl. ++ * Redistribution of this file is permitted under the ++ * GNU General Public License. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++ ++static int max_vroot = 8; ++ ++static struct vroot_device *vroot_dev; ++static struct gendisk **disks; ++ ++ ++static int vroot_set_dev( ++ struct vroot_device *vr, ++ struct block_device *bdev, ++ unsigned int arg) ++{ ++ struct block_device *real_bdev; ++ struct file *file; ++ struct inode *inode; ++ int error; ++ ++ error = -EBUSY; ++ if (vr->vr_state != Vr_unbound) ++ goto out; ++ ++ error = -EBADF; ++ file = fget(arg); ++ if (!file) ++ goto out; ++ ++ error = -EINVAL; ++ inode = file->f_dentry->d_inode; ++ ++ ++ if (S_ISBLK(inode->i_mode)) { ++ real_bdev = inode->i_bdev; ++ vr->vr_device = real_bdev; ++ __iget(real_bdev->bd_inode); ++ } else ++ goto out_fput; ++ ++ vxdprintk(VXD_CBIT(misc, 0), ++ "vroot[%d]_set_dev: dev=" VXF_DEV, ++ vr->vr_number, VXD_DEV(real_bdev)); ++ ++ vr->vr_state = Vr_bound; ++ error = 0; ++ ++ out_fput: ++ fput(file); ++ out: ++ return error; ++} ++ ++static int vroot_clr_dev( ++ struct vroot_device *vr, ++ struct block_device *bdev) ++{ ++ struct block_device *real_bdev; ++ ++ if (vr->vr_state != Vr_bound) ++ return -ENXIO; ++ if (vr->vr_refcnt > 1) /* we needed one fd for the ioctl */ ++ return -EBUSY; ++ ++ real_bdev = vr->vr_device; ++ ++ vxdprintk(VXD_CBIT(misc, 0), ++ "vroot[%d]_clr_dev: dev=" VXF_DEV, ++ vr->vr_number, VXD_DEV(real_bdev)); ++ ++ bdput(real_bdev); ++ vr->vr_state = Vr_unbound; ++ vr->vr_device = NULL; ++ return 0; ++} ++ ++ ++static int vr_ioctl(struct block_device *bdev, fmode_t mode, ++ unsigned int cmd, unsigned long arg) ++{ ++ struct vroot_device *vr = bdev->bd_disk->private_data; ++ int err; ++ ++ down(&vr->vr_ctl_mutex); ++ switch (cmd) { ++ case VROOT_SET_DEV: ++ err = vroot_set_dev(vr, bdev, arg); ++ break; ++ case VROOT_CLR_DEV: ++ err = vroot_clr_dev(vr, bdev); ++ break; ++ default: ++ err = -EINVAL; ++ break; ++ } ++ up(&vr->vr_ctl_mutex); ++ return err; ++} ++ ++static int vr_open(struct block_device *bdev, fmode_t mode) ++{ ++ struct vroot_device *vr = bdev->bd_disk->private_data; ++ ++ down(&vr->vr_ctl_mutex); ++ vr->vr_refcnt++; ++ up(&vr->vr_ctl_mutex); ++ return 0; ++} ++ ++static int vr_release(struct gendisk *disk, fmode_t mode) ++{ ++ struct vroot_device *vr = disk->private_data; ++ ++ 
down(&vr->vr_ctl_mutex); ++ --vr->vr_refcnt; ++ up(&vr->vr_ctl_mutex); ++ return 0; ++} ++ ++static struct block_device_operations vr_fops = { ++ .owner = THIS_MODULE, ++ .open = vr_open, ++ .release = vr_release, ++ .ioctl = vr_ioctl, ++}; ++ ++static void vroot_make_request(struct request_queue *q, struct bio *bio) ++{ ++ printk("vroot_make_request %p, %p\n", q, bio); ++ bio_io_error(bio); ++} ++ ++struct block_device *__vroot_get_real_bdev(struct block_device *bdev) ++{ ++ struct inode *inode = bdev->bd_inode; ++ struct vroot_device *vr; ++ struct block_device *real_bdev; ++ int minor = iminor(inode); ++ ++ vr = &vroot_dev[minor]; ++ real_bdev = vr->vr_device; ++ ++ vxdprintk(VXD_CBIT(misc, 0), ++ "vroot[%d]_get_real_bdev: dev=" VXF_DEV, ++ vr->vr_number, VXD_DEV(real_bdev)); ++ ++ if (vr->vr_state != Vr_bound) ++ return ERR_PTR(-ENXIO); ++ ++ __iget(real_bdev->bd_inode); ++ return real_bdev; ++} ++ ++ ++ ++/* ++ * And now the modules code and kernel interface. ++ */ ++ ++module_param(max_vroot, int, 0); ++ ++MODULE_PARM_DESC(max_vroot, "Maximum number of vroot devices (1-256)"); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS_BLOCKDEV_MAJOR(VROOT_MAJOR); ++ ++MODULE_AUTHOR ("Herbert Pötzl"); ++MODULE_DESCRIPTION ("Virtual Root Device Mapper"); ++ ++ ++int __init vroot_init(void) ++{ ++ int err, i; ++ ++ if (max_vroot < 1 || max_vroot > 256) { ++ max_vroot = MAX_VROOT_DEFAULT; ++ printk(KERN_WARNING "vroot: invalid max_vroot " ++ "(must be between 1 and 256), " ++ "using default (%d)\n", max_vroot); ++ } ++ ++ if (register_blkdev(VROOT_MAJOR, "vroot")) ++ return -EIO; ++ ++ err = -ENOMEM; ++ vroot_dev = kmalloc(max_vroot * sizeof(struct vroot_device), GFP_KERNEL); ++ if (!vroot_dev) ++ goto out_mem1; ++ memset(vroot_dev, 0, max_vroot * sizeof(struct vroot_device)); ++ ++ disks = kmalloc(max_vroot * sizeof(struct gendisk *), GFP_KERNEL); ++ if (!disks) ++ goto out_mem2; ++ ++ for (i = 0; i < max_vroot; i++) { ++ disks[i] = alloc_disk(1); ++ if (!disks[i]) ++ goto out_mem3; ++ disks[i]->queue = blk_alloc_queue(GFP_KERNEL); ++ if (!disks[i]->queue) ++ goto out_mem3; ++ blk_queue_make_request(disks[i]->queue, vroot_make_request); ++ } ++ ++ for (i = 0; i < max_vroot; i++) { ++ struct vroot_device *vr = &vroot_dev[i]; ++ struct gendisk *disk = disks[i]; ++ ++ memset(vr, 0, sizeof(*vr)); ++ sema_init(&vr->vr_ctl_mutex, 1); ++ vr->vr_number = i; ++ disk->major = VROOT_MAJOR; ++ disk->first_minor = i; ++ disk->fops = &vr_fops; ++ sprintf(disk->disk_name, "vroot%d", i); ++ disk->private_data = vr; ++ } ++ ++ err = register_vroot_grb(&__vroot_get_real_bdev); ++ if (err) ++ goto out_mem3; ++ ++ for (i = 0; i < max_vroot; i++) ++ add_disk(disks[i]); ++ printk(KERN_INFO "vroot: loaded (max %d devices)\n", max_vroot); ++ return 0; ++ ++out_mem3: ++ while (i--) ++ put_disk(disks[i]); ++ kfree(disks); ++out_mem2: ++ kfree(vroot_dev); ++out_mem1: ++ unregister_blkdev(VROOT_MAJOR, "vroot"); ++ printk(KERN_ERR "vroot: ran out of memory\n"); ++ return err; ++} ++ ++void vroot_exit(void) ++{ ++ int i; ++ ++ if (unregister_vroot_grb(&__vroot_get_real_bdev)) ++ printk(KERN_WARNING "vroot: cannot unregister grb\n"); ++ ++ for (i = 0; i < max_vroot; i++) { ++ del_gendisk(disks[i]); ++ put_disk(disks[i]); ++ } ++ unregister_blkdev(VROOT_MAJOR, "vroot"); ++ ++ kfree(disks); ++ kfree(vroot_dev); ++} ++ ++module_init(vroot_init); ++module_exit(vroot_exit); ++ ++#ifndef MODULE ++ ++static int __init max_vroot_setup(char *str) ++{ ++ max_vroot = simple_strtol(str, NULL, 0); ++ return 1; ++} ++ ++__setup("max_vroot=", 
max_vroot_setup); ++ ++#endif ++ +diff -NurpP --minimal linux-3.3.8/drivers/infiniband/Kconfig linux-3.3.8-vs2.3.3.4/drivers/infiniband/Kconfig +--- linux-3.3.8/drivers/infiniband/Kconfig 2012-03-19 19:46:54.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/drivers/infiniband/Kconfig 2012-02-24 03:55:06.000000000 +0100 +@@ -39,7 +39,7 @@ config INFINIBAND_USER_MEM + config INFINIBAND_ADDR_TRANS + bool + depends on INET +- depends on !(INFINIBAND = y && IPV6 = m) ++ depends on !(INFINIBAND = y && IPV6 = y) + default y + + source "drivers/infiniband/hw/mthca/Kconfig" +diff -NurpP --minimal linux-3.3.8/drivers/infiniband/core/addr.c linux-3.3.8-vs2.3.3.4/drivers/infiniband/core/addr.c +--- linux-3.3.8/drivers/infiniband/core/addr.c 2012-03-19 19:46:54.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/drivers/infiniband/core/addr.c 2012-02-24 03:55:06.000000000 +0100 +@@ -259,7 +259,7 @@ static int addr6_resolve(struct sockaddr + + if (ipv6_addr_any(&fl6.saddr)) { + ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev, +- &fl6.daddr, 0, &fl6.saddr); ++ &fl6.daddr, 0, &fl6.saddr, NULL); + if (ret) + goto put; + +diff -NurpP --minimal linux-3.3.8/drivers/md/dm-ioctl.c linux-3.3.8-vs2.3.3.4/drivers/md/dm-ioctl.c +--- linux-3.3.8/drivers/md/dm-ioctl.c 2012-03-19 19:46:59.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/drivers/md/dm-ioctl.c 2012-03-19 20:52:10.000000000 +0100 +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + + #include + +@@ -106,7 +107,8 @@ static struct hash_cell *__get_name_cell + unsigned int h = hash_str(str); + + list_for_each_entry (hc, _name_buckets + h, name_list) +- if (!strcmp(hc->name, str)) { ++ if (vx_check(dm_get_xid(hc->md), VS_WATCH_P | VS_IDENT) && ++ !strcmp(hc->name, str)) { + dm_get(hc->md); + return hc; + } +@@ -120,7 +122,8 @@ static struct hash_cell *__get_uuid_cell + unsigned int h = hash_str(str); + + list_for_each_entry (hc, _uuid_buckets + h, uuid_list) +- if (!strcmp(hc->uuid, str)) { ++ if (vx_check(dm_get_xid(hc->md), VS_WATCH_P | VS_IDENT) && ++ !strcmp(hc->uuid, str)) { + dm_get(hc->md); + return hc; + } +@@ -131,13 +134,15 @@ static struct hash_cell *__get_uuid_cell + static struct hash_cell *__get_dev_cell(uint64_t dev) + { + struct mapped_device *md; +- struct hash_cell *hc; ++ struct hash_cell *hc = NULL; + + md = dm_get_md(huge_decode_dev(dev)); + if (!md) + return NULL; + +- hc = dm_get_mdptr(md); ++ if (vx_check(dm_get_xid(md), VS_WATCH_P | VS_IDENT)) ++ hc = dm_get_mdptr(md); ++ + if (!hc) { + dm_put(md); + return NULL; +@@ -445,6 +450,9 @@ typedef int (*ioctl_fn)(struct dm_ioctl + + static int remove_all(struct dm_ioctl *param, size_t param_size) + { ++ if (!vx_check(0, VS_ADMIN)) ++ return -EPERM; ++ + dm_hash_remove_all(1); + param->data_size = 0; + return 0; +@@ -492,6 +500,8 @@ static int list_devices(struct dm_ioctl + */ + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_entry (hc, _name_buckets + i, name_list) { ++ if (!vx_check(dm_get_xid(hc->md), VS_WATCH_P | VS_IDENT)) ++ continue; + needed += sizeof(struct dm_name_list); + needed += strlen(hc->name) + 1; + needed += ALIGN_MASK; +@@ -515,6 +525,8 @@ static int list_devices(struct dm_ioctl + */ + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_entry (hc, _name_buckets + i, name_list) { ++ if (!vx_check(dm_get_xid(hc->md), VS_WATCH_P | VS_IDENT)) ++ continue; + if (old_nl) + old_nl->next = (uint32_t) ((void *) nl - + (void *) old_nl); +@@ -1615,8 +1627,8 @@ static int ctl_ioctl(uint command, struc + ioctl_fn fn = NULL; + size_t input_param_size; + +- /* only root can play 
with this */ +- if (!capable(CAP_SYS_ADMIN)) ++ /* only root and certain contexts can play with this */ ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_ADMIN_MAPPER)) + return -EACCES; + + if (_IOC_TYPE(command) != DM_IOCTL) +diff -NurpP --minimal linux-3.3.8/drivers/md/dm.c linux-3.3.8-vs2.3.3.4/drivers/md/dm.c +--- linux-3.3.8/drivers/md/dm.c 2012-03-19 19:46:59.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/drivers/md/dm.c 2012-02-24 03:55:06.000000000 +0100 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include + +@@ -131,6 +132,7 @@ struct mapped_device { + rwlock_t map_lock; + atomic_t holders; + atomic_t open_count; ++ xid_t xid; + + unsigned long flags; + +@@ -343,6 +345,7 @@ int dm_deleting_md(struct mapped_device + static int dm_blk_open(struct block_device *bdev, fmode_t mode) + { + struct mapped_device *md; ++ int ret = -ENXIO; + + spin_lock(&_minor_lock); + +@@ -351,18 +354,19 @@ static int dm_blk_open(struct block_devi + goto out; + + if (test_bit(DMF_FREEING, &md->flags) || +- dm_deleting_md(md)) { +- md = NULL; ++ dm_deleting_md(md)) ++ goto out; ++ ++ ret = -EACCES; ++ if (!vx_check(md->xid, VS_IDENT|VS_HOSTID)) + goto out; +- } + + dm_get(md); + atomic_inc(&md->open_count); +- ++ ret = 0; + out: + spin_unlock(&_minor_lock); +- +- return md ? 0 : -ENXIO; ++ return ret; + } + + static int dm_blk_close(struct gendisk *disk, fmode_t mode) +@@ -583,6 +587,14 @@ int dm_set_geometry(struct mapped_device + return 0; + } + ++/* ++ * Get the xid associated with a dm device ++ */ ++xid_t dm_get_xid(struct mapped_device *md) ++{ ++ return md->xid; ++} ++ + /*----------------------------------------------------------------- + * CRUD START: + * A more elegant soln is in the works that uses the queue +@@ -1849,6 +1861,7 @@ static struct mapped_device *alloc_dev(i + INIT_LIST_HEAD(&md->uevent_list); + spin_lock_init(&md->uevent_lock); + ++ md->xid = vx_current_xid(); + md->queue = blk_alloc_queue(GFP_KERNEL); + if (!md->queue) + goto bad_queue; +diff -NurpP --minimal linux-3.3.8/drivers/md/dm.h linux-3.3.8-vs2.3.3.4/drivers/md/dm.h +--- linux-3.3.8/drivers/md/dm.h 2012-01-09 16:14:21.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/drivers/md/dm.h 2012-02-24 03:55:06.000000000 +0100 +@@ -41,6 +41,8 @@ struct dm_dev_internal { + struct dm_table; + struct dm_md_mempools; + ++xid_t dm_get_xid(struct mapped_device *md); ++ + /*----------------------------------------------------------------- + * Internal table functions. 
+ *---------------------------------------------------------------*/ +diff -NurpP --minimal linux-3.3.8/drivers/net/tun.c linux-3.3.8-vs2.3.3.4/drivers/net/tun.c +--- linux-3.3.8/drivers/net/tun.c 2012-03-19 19:47:08.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/drivers/net/tun.c 2012-03-19 20:52:10.000000000 +0100 +@@ -64,6 +64,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -121,6 +122,7 @@ struct tun_struct { + unsigned int flags; + uid_t owner; + gid_t group; ++ nid_t nid; + + struct net_device *dev; + netdev_features_t set_features; +@@ -910,6 +912,7 @@ static void tun_setup(struct net_device + + tun->owner = -1; + tun->group = -1; ++ tun->nid = current->nid; + + dev->ethtool_ops = &tun_ethtool_ops; + dev->destructor = tun_free_netdev; +@@ -1068,7 +1071,7 @@ static int tun_set_iff(struct net *net, + + if (((tun->owner != -1 && cred->euid != tun->owner) || + (tun->group != -1 && !in_egroup_p(tun->group))) && +- !capable(CAP_NET_ADMIN)) ++ !cap_raised(current_cap(), CAP_NET_ADMIN)) + return -EPERM; + err = security_tun_dev_attach(tun->socket.sk); + if (err < 0) +@@ -1082,7 +1085,7 @@ static int tun_set_iff(struct net *net, + char *name; + unsigned long flags = 0; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!nx_capable(CAP_NET_ADMIN, NXC_TUN_CREATE)) + return -EPERM; + err = security_tun_dev_create(); + if (err < 0) +@@ -1151,6 +1154,9 @@ static int tun_set_iff(struct net *net, + + sk->sk_destruct = tun_sock_destruct; + ++ if (!nx_check(tun->nid, VS_IDENT | VS_HOSTID | VS_ADMIN_P)) ++ return -EPERM; ++ + err = tun_attach(tun, file); + if (err < 0) + goto failed; +@@ -1332,6 +1338,16 @@ static long __tun_chr_ioctl(struct file + tun_debug(KERN_INFO, tun, "group set to %d\n", tun->group); + break; + ++ case TUNSETNID: ++ if (!capable(CAP_CONTEXT)) ++ return -EPERM; ++ ++ /* Set nid owner of the device */ ++ tun->nid = (nid_t) arg; ++ ++ tun_debug(KERN_INFO, tun, "nid owner set to %u\n", tun->nid); ++ break; ++ + case TUNSETLINK: + /* Only allow setting the type when the interface is down */ + if (tun->dev->flags & IFF_UP) { +diff -NurpP --minimal linux-3.3.8/drivers/tty/sysrq.c linux-3.3.8-vs2.3.3.4/drivers/tty/sysrq.c +--- linux-3.3.8/drivers/tty/sysrq.c 2012-03-19 19:47:19.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/drivers/tty/sysrq.c 2012-02-24 04:03:15.000000000 +0100 +@@ -41,6 +41,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -395,6 +396,21 @@ static struct sysrq_key_op sysrq_unrt_op + .enable_mask = SYSRQ_ENABLE_RTNICE, + }; + ++ ++#ifdef CONFIG_VSERVER_DEBUG ++static void sysrq_handle_vxinfo(int key) ++{ ++ dump_vx_info_inactive((key == 'x') ? 
0 : 1); ++} ++ ++static struct sysrq_key_op sysrq_showvxinfo_op = { ++ .handler = sysrq_handle_vxinfo, ++ .help_msg = "conteXt", ++ .action_msg = "Show Context Info", ++ .enable_mask = SYSRQ_ENABLE_DUMP, ++}; ++#endif ++ + /* Key Operations table and lock */ + static DEFINE_SPINLOCK(sysrq_key_table_lock); + +@@ -449,7 +465,11 @@ static struct sysrq_key_op *sysrq_key_ta + NULL, /* v */ + &sysrq_showstate_blocked_op, /* w */ + /* x: May be registered on ppc/powerpc for xmon */ ++#ifdef CONFIG_VSERVER_DEBUG ++ &sysrq_showvxinfo_op, /* x */ ++#else + NULL, /* x */ ++#endif + /* y: May be registered on sparc64 for global register dump */ + NULL, /* y */ + &sysrq_ftrace_dump_op, /* z */ +@@ -464,6 +484,8 @@ static int sysrq_key_table_key2index(int + retval = key - '0'; + else if ((key >= 'a') && (key <= 'z')) + retval = key + 10 - 'a'; ++ else if ((key >= 'A') && (key <= 'Z')) ++ retval = key + 10 - 'A'; + else + retval = -1; + return retval; +diff -NurpP --minimal linux-3.3.8/drivers/tty/tty_io.c linux-3.3.8-vs2.3.3.4/drivers/tty/tty_io.c +--- linux-3.3.8/drivers/tty/tty_io.c 2012-03-19 19:47:19.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/drivers/tty/tty_io.c 2012-02-24 03:55:06.000000000 +0100 +@@ -105,6 +105,7 @@ + + #include + #include ++#include + + #undef TTY_DEBUG_HANGUP + +@@ -2131,7 +2132,8 @@ static int tiocsti(struct tty_struct *tt + char ch, mbz = 0; + struct tty_ldisc *ld; + +- if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) ++ if (((current->signal->tty != tty) && ++ !vx_capable(CAP_SYS_ADMIN, VXC_TIOCSTI))) + return -EPERM; + if (get_user(ch, p)) + return -EFAULT; +@@ -2419,6 +2421,7 @@ static int tiocspgrp(struct tty_struct * + return -ENOTTY; + if (get_user(pgrp_nr, p)) + return -EFAULT; ++ pgrp_nr = vx_rmap_pid(pgrp_nr); + if (pgrp_nr < 0) + return -EINVAL; + rcu_read_lock(); +diff -NurpP --minimal linux-3.3.8/fs/attr.c linux-3.3.8-vs2.3.3.4/fs/attr.c +--- linux-3.3.8/fs/attr.c 2012-03-19 19:47:24.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/attr.c 2012-02-24 03:55:06.000000000 +0100 +@@ -14,6 +14,9 @@ + #include + #include + #include ++#include ++#include ++#include + + /** + * inode_change_ok - check if attribute changes to an inode are allowed +@@ -74,6 +77,10 @@ int inode_change_ok(const struct inode * + return -EPERM; + } + ++ /* check for inode tag permission */ ++ if (dx_permission(inode, MAY_WRITE)) ++ return -EACCES; ++ + return 0; + } + EXPORT_SYMBOL(inode_change_ok); +@@ -144,6 +151,8 @@ void setattr_copy(struct inode *inode, c + inode->i_uid = attr->ia_uid; + if (ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; ++ if ((ia_valid & ATTR_TAG) && IS_TAGGED(inode)) ++ inode->i_tag = attr->ia_tag; + if (ia_valid & ATTR_ATIME) + inode->i_atime = timespec_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); +@@ -171,7 +180,8 @@ int notify_change(struct dentry * dentry + struct timespec now; + unsigned int ia_valid = attr->ia_valid; + +- if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { ++ if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ++ ATTR_TAG | ATTR_TIMES_SET)) { + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } +diff -NurpP --minimal linux-3.3.8/fs/block_dev.c linux-3.3.8-vs2.3.3.4/fs/block_dev.c +--- linux-3.3.8/fs/block_dev.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/block_dev.c 2012-06-08 15:27:44.000000000 +0200 +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + #include + #include "internal.h" + +@@ -580,6 +581,7 @@ struct block_device *bdget(dev_t dev) + 
bdev->bd_invalidated = 0; + inode->i_mode = S_IFBLK; + inode->i_rdev = dev; ++ inode->i_mdev = dev; + inode->i_bdev = bdev; + inode->i_data.a_ops = &def_blk_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); +@@ -626,6 +628,11 @@ EXPORT_SYMBOL(bdput); + static struct block_device *bd_acquire(struct inode *inode) + { + struct block_device *bdev; ++ dev_t mdev; ++ ++ if (!vs_map_blkdev(inode->i_rdev, &mdev, DATTR_OPEN)) ++ return NULL; ++ inode->i_mdev = mdev; + + spin_lock(&bdev_lock); + bdev = inode->i_bdev; +@@ -636,7 +643,7 @@ static struct block_device *bd_acquire(s + } + spin_unlock(&bdev_lock); + +- bdev = bdget(inode->i_rdev); ++ bdev = bdget(mdev); + if (bdev) { + spin_lock(&bdev_lock); + if (!inode->i_bdev) { +diff -NurpP --minimal linux-3.3.8/fs/btrfs/ctree.h linux-3.3.8-vs2.3.3.4/fs/btrfs/ctree.h +--- linux-3.3.8/fs/btrfs/ctree.h 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/btrfs/ctree.h 2012-04-30 19:34:37.000000000 +0200 +@@ -646,11 +646,14 @@ struct btrfs_inode_item { + /* modification sequence number for NFS */ + __le64 sequence; + ++ __le16 tag; + /* + * a little future expansion, for more than this we can + * just grow the inode item and version it + */ +- __le64 reserved[4]; ++ __le16 reserved16; ++ __le32 reserved32; ++ __le64 reserved[3]; + struct btrfs_timespec atime; + struct btrfs_timespec ctime; + struct btrfs_timespec mtime; +@@ -1504,6 +1507,8 @@ struct btrfs_ioctl_defrag_range_args { + #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) + #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) + ++#define BTRFS_MOUNT_TAGGED (1 << 24) ++ + #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) + #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) + #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ +@@ -1711,6 +1716,7 @@ BTRFS_SETGET_FUNCS(inode_block_group, st + BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); + BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); + BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); ++BTRFS_SETGET_FUNCS(inode_tag, struct btrfs_inode_item, tag, 16); + BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); + BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); + BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); +@@ -1764,6 +1770,10 @@ BTRFS_SETGET_FUNCS(extent_flags, struct + + BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); + ++#define BTRFS_INODE_IXUNLINK (1 << 24) ++#define BTRFS_INODE_BARRIER (1 << 25) ++#define BTRFS_INODE_COW (1 << 26) ++ + + BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +@@ -2925,6 +2935,7 @@ extern const struct dentry_operations bt + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + void btrfs_update_iflags(struct inode *inode); + void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); ++int btrfs_sync_flags(struct inode *inode, int, int); + int btrfs_defrag_file(struct inode *inode, struct file *file, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_pages); +diff -NurpP --minimal linux-3.3.8/fs/btrfs/disk-io.c linux-3.3.8-vs2.3.3.4/fs/btrfs/disk-io.c +--- linux-3.3.8/fs/btrfs/disk-io.c 2012-03-19 19:47:24.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/btrfs/disk-io.c 2012-03-19 20:52:10.000000000 +0100 +@@ -2125,6 +2125,9 @@ int open_ctree(struct super_block *sb, + goto fail_alloc; + } + ++ if (btrfs_test_opt(tree_root, TAGGED)) ++ 
sb->s_flags |= MS_TAGGED; ++ + features = btrfs_super_incompat_flags(disk_super) & + ~BTRFS_FEATURE_INCOMPAT_SUPP; + if (features) { +diff -NurpP --minimal linux-3.3.8/fs/btrfs/inode.c linux-3.3.8-vs2.3.3.4/fs/btrfs/inode.c +--- linux-3.3.8/fs/btrfs/inode.c 2012-03-19 19:47:24.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/btrfs/inode.c 2012-03-19 20:52:10.000000000 +0100 +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + #include "compat.h" + #include "ctree.h" + #include "disk-io.h" +@@ -2350,6 +2351,8 @@ static void btrfs_read_locked_inode(stru + struct btrfs_key location; + int maybe_acls; + u32 rdev; ++ uid_t uid; ++ gid_t gid; + int ret; + bool filled = false; + +@@ -2377,8 +2380,13 @@ static void btrfs_read_locked_inode(stru + struct btrfs_inode_item); + inode->i_mode = btrfs_inode_mode(leaf, inode_item); + set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); +- inode->i_uid = btrfs_inode_uid(leaf, inode_item); +- inode->i_gid = btrfs_inode_gid(leaf, inode_item); ++ ++ uid = btrfs_inode_uid(leaf, inode_item); ++ gid = btrfs_inode_gid(leaf, inode_item); ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, ++ btrfs_inode_tag(leaf, inode_item)); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + + tspec = btrfs_inode_atime(inode_item); +@@ -2456,8 +2464,14 @@ static void fill_inode_item(struct btrfs + struct btrfs_inode_item *item, + struct inode *inode) + { +- btrfs_set_inode_uid(leaf, item, inode->i_uid); +- btrfs_set_inode_gid(leaf, item, inode->i_gid); ++ uid_t uid = TAGINO_UID(DX_TAG(inode), inode->i_uid, inode->i_tag); ++ gid_t gid = TAGINO_GID(DX_TAG(inode), inode->i_gid, inode->i_tag); ++ ++ btrfs_set_inode_uid(leaf, item, uid); ++ btrfs_set_inode_gid(leaf, item, gid); ++#ifdef CONFIG_TAGGING_INTERN ++ btrfs_set_inode_tag(leaf, item, inode->i_tag); ++#endif + btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); +@@ -7412,11 +7426,13 @@ static const struct inode_operations btr + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, ++ .sync_flags = btrfs_sync_flags, + .get_acl = btrfs_get_acl, + }; + static const struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, ++ .sync_flags = btrfs_sync_flags, + .get_acl = btrfs_get_acl, + }; + +diff -NurpP --minimal linux-3.3.8/fs/btrfs/ioctl.c linux-3.3.8-vs2.3.3.4/fs/btrfs/ioctl.c +--- linux-3.3.8/fs/btrfs/ioctl.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/btrfs/ioctl.c 2012-04-23 23:45:14.000000000 +0200 +@@ -71,10 +71,13 @@ static unsigned int btrfs_flags_to_ioctl + { + unsigned int iflags = 0; + +- if (flags & BTRFS_INODE_SYNC) +- iflags |= FS_SYNC_FL; + if (flags & BTRFS_INODE_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; ++ if (flags & BTRFS_INODE_IXUNLINK) ++ iflags |= FS_IXUNLINK_FL; ++ ++ if (flags & BTRFS_INODE_SYNC) ++ iflags |= FS_SYNC_FL; + if (flags & BTRFS_INODE_APPEND) + iflags |= FS_APPEND_FL; + if (flags & BTRFS_INODE_NODUMP) +@@ -91,28 +94,78 @@ static unsigned int btrfs_flags_to_ioctl + else if (flags & BTRFS_INODE_NOCOMPRESS) + iflags |= FS_NOCOMP_FL; + ++ if (flags & BTRFS_INODE_BARRIER) ++ iflags |= FS_BARRIER_FL; ++ if (flags & BTRFS_INODE_COW) ++ iflags |= FS_COW_FL; + return iflags; + } + + /* +- * Update inode->i_flags based on the 
btrfs internal flags. ++ * Update inode->i_(v)flags based on the btrfs internal flags. + */ + void btrfs_update_iflags(struct inode *inode) + { + struct btrfs_inode *ip = BTRFS_I(inode); + +- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | ++ S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); + +- if (ip->flags & BTRFS_INODE_SYNC) +- inode->i_flags |= S_SYNC; + if (ip->flags & BTRFS_INODE_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; ++ if (ip->flags & BTRFS_INODE_IXUNLINK) ++ inode->i_flags |= S_IXUNLINK; ++ ++ if (ip->flags & BTRFS_INODE_SYNC) ++ inode->i_flags |= S_SYNC; + if (ip->flags & BTRFS_INODE_APPEND) + inode->i_flags |= S_APPEND; + if (ip->flags & BTRFS_INODE_NOATIME) + inode->i_flags |= S_NOATIME; + if (ip->flags & BTRFS_INODE_DIRSYNC) + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (ip->flags & BTRFS_INODE_BARRIER) ++ inode->i_vflags |= V_BARRIER; ++ if (ip->flags & BTRFS_INODE_COW) ++ inode->i_vflags |= V_COW; ++} ++ ++/* ++ * Update btrfs internal flags from inode->i_(v)flags. ++ */ ++void btrfs_update_flags(struct inode *inode) ++{ ++ struct btrfs_inode *ip = BTRFS_I(inode); ++ ++ unsigned int flags = inode->i_flags; ++ unsigned int vflags = inode->i_vflags; ++ ++ ip->flags &= ~(BTRFS_INODE_SYNC | BTRFS_INODE_APPEND | ++ BTRFS_INODE_IMMUTABLE | BTRFS_INODE_IXUNLINK | ++ BTRFS_INODE_NOATIME | BTRFS_INODE_DIRSYNC | ++ BTRFS_INODE_BARRIER | BTRFS_INODE_COW); ++ ++ if (flags & S_IMMUTABLE) ++ ip->flags |= BTRFS_INODE_IMMUTABLE; ++ if (flags & S_IXUNLINK) ++ ip->flags |= BTRFS_INODE_IXUNLINK; ++ ++ if (flags & S_SYNC) ++ ip->flags |= BTRFS_INODE_SYNC; ++ if (flags & S_APPEND) ++ ip->flags |= BTRFS_INODE_APPEND; ++ if (flags & S_NOATIME) ++ ip->flags |= BTRFS_INODE_NOATIME; ++ if (flags & S_DIRSYNC) ++ ip->flags |= BTRFS_INODE_DIRSYNC; ++ ++ if (vflags & V_BARRIER) ++ ip->flags |= BTRFS_INODE_BARRIER; ++ if (vflags & V_COW) ++ ip->flags |= BTRFS_INODE_COW; + } + + /* +@@ -128,6 +181,7 @@ void btrfs_inherit_iflags(struct inode * + return; + + flags = BTRFS_I(dir)->flags; ++ flags &= ~BTRFS_INODE_BARRIER; + + if (flags & BTRFS_INODE_NOCOMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; +@@ -143,6 +197,30 @@ void btrfs_inherit_iflags(struct inode * + btrfs_update_iflags(inode); + } + ++int btrfs_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ struct btrfs_inode *ip = BTRFS_I(inode); ++ struct btrfs_root *root = ip->root; ++ struct btrfs_trans_handle *trans; ++ int ret; ++ ++ trans = btrfs_join_transaction(root); ++ BUG_ON(!trans); ++ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ btrfs_update_flags(inode); ++ ++ ret = btrfs_update_inode(trans, root, inode); ++ BUG_ON(ret); ++ ++ btrfs_update_iflags(inode); ++ inode->i_ctime = CURRENT_TIME; ++ btrfs_end_transaction(trans, root); ++ ++ return 0; ++} ++ + static int btrfs_ioctl_getflags(struct file *file, void __user *arg) + { + struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); +@@ -199,7 +277,8 @@ static int btrfs_ioctl_setflags(struct f + + flags = btrfs_mask_flags(inode->i_mode, flags); + oldflags = btrfs_flags_to_ioctl(ip->flags); +- if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { ++ if ((flags ^ oldflags) & (FS_APPEND_FL | ++ FS_IMMUTABLE_FL | FS_IXUNLINK_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + ret = -EPERM; + goto out_unlock; +@@ -210,14 +289,19 @@ static int btrfs_ioctl_setflags(struct f + if (ret) + goto out_unlock; + +- if (flags & FS_SYNC_FL) +- ip->flags 
|= BTRFS_INODE_SYNC; +- else +- ip->flags &= ~BTRFS_INODE_SYNC; + if (flags & FS_IMMUTABLE_FL) + ip->flags |= BTRFS_INODE_IMMUTABLE; + else + ip->flags &= ~BTRFS_INODE_IMMUTABLE; ++ if (flags & FS_IXUNLINK_FL) ++ ip->flags |= BTRFS_INODE_IXUNLINK; ++ else ++ ip->flags &= ~BTRFS_INODE_IXUNLINK; ++ ++ if (flags & FS_SYNC_FL) ++ ip->flags |= BTRFS_INODE_SYNC; ++ else ++ ip->flags &= ~BTRFS_INODE_SYNC; + if (flags & FS_APPEND_FL) + ip->flags |= BTRFS_INODE_APPEND; + else +diff -NurpP --minimal linux-3.3.8/fs/btrfs/super.c linux-3.3.8-vs2.3.3.4/fs/btrfs/super.c +--- linux-3.3.8/fs/btrfs/super.c 2012-03-19 19:47:24.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/btrfs/super.c 2012-02-24 04:42:16.000000000 +0100 +@@ -167,7 +167,7 @@ enum { + Opt_no_space_cache, Opt_recovery, Opt_skip_balance, + Opt_check_integrity, Opt_check_integrity_including_extent_data, + Opt_check_integrity_print_mask, +- Opt_err, ++ Opt_tag, Opt_notag, Opt_tagid, Opt_err, + }; + + static match_table_t tokens = { +@@ -206,6 +206,9 @@ static match_table_t tokens = { + {Opt_check_integrity, "check_int"}, + {Opt_check_integrity_including_extent_data, "check_int_data"}, + {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, ++ {Opt_tagid, "tagid=%u"}, + {Opt_err, NULL}, + }; + +@@ -438,6 +441,22 @@ int btrfs_parse_options(struct btrfs_roo + ret = -EINVAL; + goto out; + #endif ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ printk(KERN_INFO "btrfs: use tagging\n"); ++ btrfs_set_opt(info->mount_opt, TAGGED); ++ break; ++ case Opt_notag: ++ printk(KERN_INFO "btrfs: disabled tagging\n"); ++ btrfs_clear_opt(info->mount_opt, TAGGED); ++ break; ++#endif ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ btrfs_set_opt(info->mount_opt, TAGGED); ++ break; ++#endif + case Opt_err: + printk(KERN_INFO "btrfs: unrecognized mount option " + "'%s'\n", p); +@@ -1005,6 +1024,12 @@ static int btrfs_remount(struct super_bl + if (ret) + return -EINVAL; + ++ if (btrfs_test_opt(root, TAGGED) && !(sb->s_flags & MS_TAGGED)) { ++ printk("btrfs: %s: tagging not permitted on remount.\n", ++ sb->s_id); ++ return -EINVAL; ++ } ++ + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + +diff -NurpP --minimal linux-3.3.8/fs/char_dev.c linux-3.3.8-vs2.3.3.4/fs/char_dev.c +--- linux-3.3.8/fs/char_dev.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/char_dev.c 2012-02-24 03:55:06.000000000 +0100 +@@ -21,6 +21,8 @@ + #include + #include + #include ++#include ++#include + + #include "internal.h" + +@@ -371,14 +373,21 @@ static int chrdev_open(struct inode *ino + struct cdev *p; + struct cdev *new = NULL; + int ret = 0; ++ dev_t mdev; ++ ++ if (!vs_map_chrdev(inode->i_rdev, &mdev, DATTR_OPEN)) ++ return -EPERM; ++ inode->i_mdev = mdev; + + spin_lock(&cdev_lock); + p = inode->i_cdev; + if (!p) { + struct kobject *kobj; + int idx; ++ + spin_unlock(&cdev_lock); +- kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); ++ ++ kobj = kobj_lookup(cdev_map, mdev, &idx); + if (!kobj) + return -ENXIO; + new = container_of(kobj, struct cdev, kobj); +diff -NurpP --minimal linux-3.3.8/fs/dcache.c linux-3.3.8-vs2.3.3.4/fs/dcache.c +--- linux-3.3.8/fs/dcache.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/dcache.c 2012-04-03 03:02:12.000000000 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + #include "internal.h" + #include "mount.h" + +@@ -560,6 +561,8 @@ int d_invalidate(struct dentry * dentry) + spin_lock(&dentry->d_lock); + } + ++ 
vx_dentry_dec(dentry); ++ + /* + * Somebody else still using it? + * +@@ -589,6 +592,7 @@ EXPORT_SYMBOL(d_invalidate); + static inline void __dget_dlock(struct dentry *dentry) + { + dentry->d_count++; ++ vx_dentry_inc(dentry); + } + + static inline void __dget(struct dentry *dentry) +@@ -1213,6 +1217,9 @@ struct dentry *__d_alloc(struct super_bl + struct dentry *dentry; + char *dname; + ++ if (!vx_dentry_avail(1)) ++ return NULL; ++ + dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); + if (!dentry) + return NULL; +@@ -1235,6 +1242,7 @@ struct dentry *__d_alloc(struct super_bl + + dentry->d_count = 1; + dentry->d_flags = 0; ++ vx_dentry_inc(dentry); + spin_lock_init(&dentry->d_lock); + seqcount_init(&dentry->d_seq); + dentry->d_inode = NULL; +@@ -1920,6 +1928,7 @@ struct dentry *__d_lookup(struct dentry + } + + dentry->d_count++; ++ vx_dentry_inc(dentry); + found = dentry; + spin_unlock(&dentry->d_lock); + break; +diff -NurpP --minimal linux-3.3.8/fs/devpts/inode.c linux-3.3.8-vs2.3.3.4/fs/devpts/inode.c +--- linux-3.3.8/fs/devpts/inode.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/devpts/inode.c 2012-02-24 03:55:06.000000000 +0100 +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + #define DEVPTS_DEFAULT_MODE 0600 + /* +@@ -36,6 +37,20 @@ + #define DEVPTS_DEFAULT_PTMX_MODE 0000 + #define PTMX_MINOR 2 + ++static int devpts_permission(struct inode *inode, int mask) ++{ ++ int ret = -EACCES; ++ ++ /* devpts is xid tagged */ ++ if (vx_check((xid_t)inode->i_tag, VS_WATCH_P | VS_IDENT)) ++ ret = generic_permission(inode, mask); ++ return ret; ++} ++ ++static struct inode_operations devpts_file_inode_operations = { ++ .permission = devpts_permission, ++}; ++ + extern int pty_limit; /* Config limit on Unix98 ptys */ + static DEFINE_MUTEX(allocated_ptys_lock); + +@@ -263,6 +278,34 @@ static int devpts_show_options(struct se + return 0; + } + ++static int devpts_filter(struct dentry *de) ++{ ++ xid_t xid = 0; ++ ++ /* devpts is xid tagged */ ++ if (de && de->d_inode) ++ xid = (xid_t)de->d_inode->i_tag; ++#ifdef CONFIG_VSERVER_WARN_DEVPTS ++ else ++ vxwprintk_task(1, "devpts " VS_Q("%.*s") " without inode.", ++ de->d_name.len, de->d_name.name); ++#endif ++ return vx_check(xid, VS_WATCH_P | VS_IDENT); ++} ++ ++static int devpts_readdir(struct file * filp, void * dirent, filldir_t filldir) ++{ ++ return dcache_readdir_filter(filp, dirent, filldir, devpts_filter); ++} ++ ++static struct file_operations devpts_dir_operations = { ++ .open = dcache_dir_open, ++ .release = dcache_dir_close, ++ .llseek = dcache_dir_lseek, ++ .read = generic_read_dir, ++ .readdir = devpts_readdir, ++}; ++ + static const struct super_operations devpts_sops = { + .statfs = simple_statfs, + .remount_fs = devpts_remount, +@@ -306,8 +349,10 @@ devpts_fill_super(struct super_block *s, + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; + inode->i_op = &simple_dir_inode_operations; +- inode->i_fop = &simple_dir_operations; ++ inode->i_fop = &devpts_dir_operations; + set_nlink(inode, 2); ++ /* devpts is xid tagged */ ++ inode->i_tag = (tag_t)vx_current_xid(); + + s->s_root = d_alloc_root(inode); + if (s->s_root) +@@ -492,6 +537,9 @@ int devpts_pty_new(struct inode *ptmx_in + inode->i_gid = opts->setgid ? 
opts->gid : current_fsgid(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + init_special_inode(inode, S_IFCHR|opts->mode, device); ++ /* devpts is xid tagged */ ++ inode->i_tag = (tag_t)vx_current_xid(); ++ inode->i_op = &devpts_file_inode_operations; + inode->i_private = tty; + tty->driver_data = inode; + +diff -NurpP --minimal linux-3.3.8/fs/ext2/balloc.c linux-3.3.8-vs2.3.3.4/fs/ext2/balloc.c +--- linux-3.3.8/fs/ext2/balloc.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext2/balloc.c 2012-02-24 03:55:06.000000000 +0100 +@@ -701,7 +701,6 @@ ext2_try_to_allocate(struct super_block + start = 0; + end = EXT2_BLOCKS_PER_GROUP(sb); + } +- + BUG_ON(start > EXT2_BLOCKS_PER_GROUP(sb)); + + repeat: +diff -NurpP --minimal linux-3.3.8/fs/ext2/ext2.h linux-3.3.8-vs2.3.3.4/fs/ext2/ext2.h +--- linux-3.3.8/fs/ext2/ext2.h 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext2/ext2.h 2012-02-24 03:55:06.000000000 +0100 +@@ -126,6 +126,7 @@ extern void ext2_set_inode_flags(struct + extern void ext2_get_inode_flags(struct ext2_inode_info *); + extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); ++extern int ext2_sync_flags(struct inode *, int, int); + + /* ioctl.c */ + extern long ext2_ioctl(struct file *, unsigned int, unsigned long); +diff -NurpP --minimal linux-3.3.8/fs/ext2/file.c linux-3.3.8-vs2.3.3.4/fs/ext2/file.c +--- linux-3.3.8/fs/ext2/file.c 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/ext2/file.c 2012-02-24 03:55:06.000000000 +0100 +@@ -104,4 +104,5 @@ const struct inode_operations ext2_file_ + .setattr = ext2_setattr, + .get_acl = ext2_get_acl, + .fiemap = ext2_fiemap, ++ .sync_flags = ext2_sync_flags, + }; +diff -NurpP --minimal linux-3.3.8/fs/ext2/ialloc.c linux-3.3.8-vs2.3.3.4/fs/ext2/ialloc.c +--- linux-3.3.8/fs/ext2/ialloc.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext2/ialloc.c 2012-02-24 03:55:06.000000000 +0100 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include "ext2.h" + #include "xattr.h" + #include "acl.h" +@@ -549,6 +550,7 @@ got: + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; ++ inode->i_tag = dx_current_fstag(sb); + } else + inode_init_owner(inode, dir, mode); + +diff -NurpP --minimal linux-3.3.8/fs/ext2/inode.c linux-3.3.8-vs2.3.3.4/fs/ext2/inode.c +--- linux-3.3.8/fs/ext2/inode.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext2/inode.c 2012-02-24 03:55:06.000000000 +0100 +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + #include "ext2.h" + #include "acl.h" + #include "xip.h" +@@ -1162,7 +1163,7 @@ static void ext2_truncate_blocks(struct + return; + if (ext2_inode_is_fast_symlink(inode)) + return; +- if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) ++ if (IS_APPEND(inode) || IS_IXORUNLINK(inode)) + return; + __ext2_truncate_blocks(inode, offset); + } +@@ -1253,36 +1254,61 @@ void ext2_set_inode_flags(struct inode * + { + unsigned int flags = EXT2_I(inode)->i_flags; + +- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | ++ S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); ++ ++ ++ if (flags & EXT2_IMMUTABLE_FL) ++ inode->i_flags |= S_IMMUTABLE; ++ if (flags & EXT2_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; ++ + if (flags & EXT2_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT2_APPEND_FL) + inode->i_flags |= S_APPEND; +- if (flags & EXT2_IMMUTABLE_FL) +- 
inode->i_flags |= S_IMMUTABLE; + if (flags & EXT2_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT2_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & EXT2_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & EXT2_COW_FL) ++ inode->i_vflags |= V_COW; + } + + /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ + void ext2_get_inode_flags(struct ext2_inode_info *ei) + { + unsigned int flags = ei->vfs_inode.i_flags; ++ unsigned int vflags = ei->vfs_inode.i_vflags; ++ ++ ei->i_flags &= ~(EXT2_SYNC_FL | EXT2_APPEND_FL | ++ EXT2_IMMUTABLE_FL | EXT2_IXUNLINK_FL | ++ EXT2_NOATIME_FL | EXT2_DIRSYNC_FL | ++ EXT2_BARRIER_FL | EXT2_COW_FL); ++ ++ if (flags & S_IMMUTABLE) ++ ei->i_flags |= EXT2_IMMUTABLE_FL; ++ if (flags & S_IXUNLINK) ++ ei->i_flags |= EXT2_IXUNLINK_FL; + +- ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL| +- EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT2_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT2_APPEND_FL; +- if (flags & S_IMMUTABLE) +- ei->i_flags |= EXT2_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT2_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT2_DIRSYNC_FL; ++ ++ if (vflags & V_BARRIER) ++ ei->i_flags |= EXT2_BARRIER_FL; ++ if (vflags & V_COW) ++ ei->i_flags |= EXT2_COW_FL; + } + + struct inode *ext2_iget (struct super_block *sb, unsigned long ino) +@@ -1292,6 +1318,8 @@ struct inode *ext2_iget (struct super_bl + struct ext2_inode *raw_inode; + struct inode *inode; + long ret = -EIO; ++ uid_t uid; ++ gid_t gid; + int n; + + inode = iget_locked(sb, ino); +@@ -1310,12 +1338,16 @@ struct inode *ext2_iget (struct super_bl + } + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); +- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); +- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); ++ uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); ++ gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (!(test_opt (inode->i_sb, NO_UID32))) { +- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; +- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; ++ uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; ++ gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, ++ le16_to_cpu(raw_inode->i_raw_tag)); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); +@@ -1413,8 +1445,8 @@ static int __ext2_write_inode(struct ino + struct ext2_inode_info *ei = EXT2_I(inode); + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; +- uid_t uid = inode->i_uid; +- gid_t gid = inode->i_gid; ++ uid_t uid = TAGINO_UID(DX_TAG(inode), inode->i_uid, inode->i_tag); ++ gid_t gid = TAGINO_GID(DX_TAG(inode), inode->i_gid, inode->i_tag); + struct buffer_head * bh; + struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh); + int n; +@@ -1450,6 +1482,9 @@ static int __ext2_write_inode(struct ino + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } ++#ifdef CONFIG_TAGGING_INTERN ++ raw_inode->i_raw_tag = cpu_to_le16(inode->i_tag); ++#endif + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(inode->i_size); + raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); +@@ -1530,7 +1565,8 @@ int 
ext2_setattr(struct dentry *dentry, + if (is_quota_modification(inode, iattr)) + dquot_initialize(inode); + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || +- (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { ++ (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) || ++ (iattr->ia_valid & ATTR_TAG && iattr->ia_tag != inode->i_tag)) { + error = dquot_transfer(inode, iattr); + if (error) + return error; +diff -NurpP --minimal linux-3.3.8/fs/ext2/ioctl.c linux-3.3.8-vs2.3.3.4/fs/ext2/ioctl.c +--- linux-3.3.8/fs/ext2/ioctl.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext2/ioctl.c 2012-02-24 03:55:06.000000000 +0100 +@@ -17,6 +17,16 @@ + #include + + ++int ext2_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ ext2_get_inode_flags(EXT2_I(inode)); ++ inode->i_ctime = CURRENT_TIME_SEC; ++ mark_inode_dirty(inode); ++ return 0; ++} ++ + long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_dentry->d_inode; +@@ -51,6 +61,11 @@ long ext2_ioctl(struct file *filp, unsig + + flags = ext2_mask_flags(inode->i_mode, flags); + ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } ++ + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { +@@ -66,7 +81,9 @@ long ext2_ioctl(struct file *filp, unsig + * + * This test looks nicer. Thanks to Pauline Middelink + */ +- if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { ++ if ((oldflags & EXT2_IMMUTABLE_FL) || ++ ((flags ^ oldflags) & (EXT2_APPEND_FL | ++ EXT2_IMMUTABLE_FL | EXT2_IXUNLINK_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + ret = -EPERM; +@@ -74,7 +91,7 @@ long ext2_ioctl(struct file *filp, unsig + } + } + +- flags = flags & EXT2_FL_USER_MODIFIABLE; ++ flags &= EXT2_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE; + ei->i_flags = flags; + +diff -NurpP --minimal linux-3.3.8/fs/ext2/namei.c linux-3.3.8-vs2.3.3.4/fs/ext2/namei.c +--- linux-3.3.8/fs/ext2/namei.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext2/namei.c 2012-02-24 03:55:06.000000000 +0100 +@@ -32,6 +32,7 @@ + + #include + #include ++#include + #include "ext2.h" + #include "xattr.h" + #include "acl.h" +@@ -73,6 +74,7 @@ static struct dentry *ext2_lookup(struct + (unsigned long) ino); + return ERR_PTR(-EIO); + } ++ dx_propagate_tag(nd, inode); + } + return d_splice_alias(inode, dentry); + } +@@ -408,6 +410,7 @@ const struct inode_operations ext2_dir_i + .removexattr = generic_removexattr, + #endif + .setattr = ext2_setattr, ++ .sync_flags = ext2_sync_flags, + .get_acl = ext2_get_acl, + }; + +diff -NurpP --minimal linux-3.3.8/fs/ext2/super.c linux-3.3.8-vs2.3.3.4/fs/ext2/super.c +--- linux-3.3.8/fs/ext2/super.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext2/super.c 2012-02-24 03:55:06.000000000 +0100 +@@ -393,7 +393,8 @@ enum { + Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, + Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, + Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, +- Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation ++ Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation, ++ Opt_tag, Opt_notag, Opt_tagid + }; + + static const match_table_t tokens = { +@@ -421,6 +422,9 @@ static const match_table_t tokens = { + {Opt_acl, 
"acl"}, + {Opt_noacl, "noacl"}, + {Opt_xip, "xip"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, ++ {Opt_tagid, "tagid=%u"}, + {Opt_grpquota, "grpquota"}, + {Opt_ignore, "noquota"}, + {Opt_quota, "quota"}, +@@ -491,6 +495,20 @@ static int parse_options(char *options, + case Opt_nouid32: + set_opt (sbi->s_mount_opt, NO_UID32); + break; ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ set_opt (sbi->s_mount_opt, TAGGED); ++ break; ++ case Opt_notag: ++ clear_opt (sbi->s_mount_opt, TAGGED); ++ break; ++#endif ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ set_opt (sbi->s_mount_opt, TAGGED); ++ break; ++#endif + case Opt_nocheck: + clear_opt (sbi->s_mount_opt, CHECK); + break; +@@ -849,6 +867,8 @@ static int ext2_fill_super(struct super_ + if (!parse_options((char *) data, sb)) + goto failed_mount; + ++ if (EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_TAGGED) ++ sb->s_flags |= MS_TAGGED; + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? + MS_POSIXACL : 0); +@@ -1223,6 +1243,14 @@ static int ext2_remount (struct super_bl + goto restore_opts; + } + ++ if ((sbi->s_mount_opt & EXT2_MOUNT_TAGGED) && ++ !(sb->s_flags & MS_TAGGED)) { ++ printk("EXT2-fs: %s: tagging not permitted on remount.\n", ++ sb->s_id); ++ err = -EINVAL; ++ goto restore_opts; ++ } ++ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + +diff -NurpP --minimal linux-3.3.8/fs/ext3/file.c linux-3.3.8-vs2.3.3.4/fs/ext3/file.c +--- linux-3.3.8/fs/ext3/file.c 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/ext3/file.c 2012-02-24 03:55:06.000000000 +0100 +@@ -80,5 +80,6 @@ const struct inode_operations ext3_file_ + #endif + .get_acl = ext3_get_acl, + .fiemap = ext3_fiemap, ++ .sync_flags = ext3_sync_flags, + }; + +diff -NurpP --minimal linux-3.3.8/fs/ext3/ialloc.c linux-3.3.8-vs2.3.3.4/fs/ext3/ialloc.c +--- linux-3.3.8/fs/ext3/ialloc.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext3/ialloc.c 2012-02-24 03:55:06.000000000 +0100 +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -496,6 +497,7 @@ got: + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; ++ inode->i_tag = dx_current_fstag(sb); + } else + inode_init_owner(inode, dir, mode); + +diff -NurpP --minimal linux-3.3.8/fs/ext3/inode.c linux-3.3.8-vs2.3.3.4/fs/ext3/inode.c +--- linux-3.3.8/fs/ext3/inode.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext3/inode.c 2012-02-24 03:55:06.000000000 +0100 +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + #include + #include "xattr.h" + #include "acl.h" +@@ -2855,36 +2856,60 @@ void ext3_set_inode_flags(struct inode * + { + unsigned int flags = EXT3_I(inode)->i_flags; + +- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | ++ S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); ++ ++ if (flags & EXT3_IMMUTABLE_FL) ++ inode->i_flags |= S_IMMUTABLE; ++ if (flags & EXT3_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; ++ + if (flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT3_APPEND_FL) + inode->i_flags |= S_APPEND; +- if (flags & EXT3_IMMUTABLE_FL) +- inode->i_flags |= S_IMMUTABLE; + if (flags & EXT3_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT3_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & EXT3_BARRIER_FL) ++ 
inode->i_vflags |= V_BARRIER; ++ if (flags & EXT3_COW_FL) ++ inode->i_vflags |= V_COW; + } + + /* Propagate flags from i_flags to EXT3_I(inode)->i_flags */ + void ext3_get_inode_flags(struct ext3_inode_info *ei) + { + unsigned int flags = ei->vfs_inode.i_flags; ++ unsigned int vflags = ei->vfs_inode.i_vflags; ++ ++ ei->i_flags &= ~(EXT3_SYNC_FL | EXT3_APPEND_FL | ++ EXT3_IMMUTABLE_FL | EXT3_IXUNLINK_FL | ++ EXT3_NOATIME_FL | EXT3_DIRSYNC_FL | ++ EXT3_BARRIER_FL | EXT3_COW_FL); ++ ++ if (flags & S_IMMUTABLE) ++ ei->i_flags |= EXT3_IMMUTABLE_FL; ++ if (flags & S_IXUNLINK) ++ ei->i_flags |= EXT3_IXUNLINK_FL; + +- ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL| +- EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT3_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT3_APPEND_FL; +- if (flags & S_IMMUTABLE) +- ei->i_flags |= EXT3_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT3_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT3_DIRSYNC_FL; ++ ++ if (vflags & V_BARRIER) ++ ei->i_flags |= EXT3_BARRIER_FL; ++ if (vflags & V_COW) ++ ei->i_flags |= EXT3_COW_FL; + } + + struct inode *ext3_iget(struct super_block *sb, unsigned long ino) +@@ -2898,6 +2923,8 @@ struct inode *ext3_iget(struct super_blo + transaction_t *transaction; + long ret; + int block; ++ uid_t uid; ++ gid_t gid; + + inode = iget_locked(sb, ino); + if (!inode) +@@ -2914,12 +2941,16 @@ struct inode *ext3_iget(struct super_blo + bh = iloc.bh; + raw_inode = ext3_raw_inode(&iloc); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); +- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); +- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); ++ uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); ++ gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if(!(test_opt (inode->i_sb, NO_UID32))) { +- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; +- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; ++ uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; ++ gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, ++ le16_to_cpu(raw_inode->i_raw_tag)); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); +@@ -3074,6 +3105,8 @@ static int ext3_do_update_inode(handle_t + struct ext3_inode *raw_inode = ext3_raw_inode(iloc); + struct ext3_inode_info *ei = EXT3_I(inode); + struct buffer_head *bh = iloc->bh; ++ uid_t uid = TAGINO_UID(DX_TAG(inode), inode->i_uid, inode->i_tag); ++ gid_t gid = TAGINO_GID(DX_TAG(inode), inode->i_gid, inode->i_tag); + int err = 0, rc, block; + + again: +@@ -3088,29 +3121,32 @@ again: + ext3_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if(!(test_opt(inode->i_sb, NO_UID32))) { +- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); +- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); ++ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid)); ++ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid)); + /* + * Fix up interoperability with old kernels. 
Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if(!ei->i_dtime) { + raw_inode->i_uid_high = +- cpu_to_le16(high_16_bits(inode->i_uid)); ++ cpu_to_le16(high_16_bits(uid)); + raw_inode->i_gid_high = +- cpu_to_le16(high_16_bits(inode->i_gid)); ++ cpu_to_le16(high_16_bits(gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = +- cpu_to_le16(fs_high2lowuid(inode->i_uid)); ++ cpu_to_le16(fs_high2lowuid(uid)); + raw_inode->i_gid_low = +- cpu_to_le16(fs_high2lowgid(inode->i_gid)); ++ cpu_to_le16(fs_high2lowgid(gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } ++#ifdef CONFIG_TAGGING_INTERN ++ raw_inode->i_raw_tag = cpu_to_le16(inode->i_tag); ++#endif + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(ei->i_disksize); + raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); +@@ -3270,7 +3306,8 @@ int ext3_setattr(struct dentry *dentry, + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || +- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { ++ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid) || ++ (ia_valid & ATTR_TAG && attr->ia_tag != inode->i_tag)) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, +@@ -3292,6 +3329,8 @@ int ext3_setattr(struct dentry *dentry, + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; ++ if ((attr->ia_valid & ATTR_TAG) && IS_TAGGED(inode)) ++ inode->i_tag = attr->ia_tag; + error = ext3_mark_inode_dirty(handle, inode); + ext3_journal_stop(handle); + } +diff -NurpP --minimal linux-3.3.8/fs/ext3/ioctl.c linux-3.3.8-vs2.3.3.4/fs/ext3/ioctl.c +--- linux-3.3.8/fs/ext3/ioctl.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext3/ioctl.c 2012-02-24 03:55:06.000000000 +0100 +@@ -8,6 +8,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -17,6 +18,34 @@ + #include + #include + ++ ++int ext3_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ handle_t *handle = NULL; ++ struct ext3_iloc iloc; ++ int err; ++ ++ handle = ext3_journal_start(inode, 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ err = ext3_reserve_inode_write(handle, inode, &iloc); ++ if (err) ++ goto flags_err; ++ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ ext3_get_inode_flags(EXT3_I(inode)); ++ inode->i_ctime = CURRENT_TIME_SEC; ++ ++ err = ext3_mark_iloc_dirty(handle, inode, &iloc); ++flags_err: ++ ext3_journal_stop(handle); ++ return err; ++} ++ + long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_dentry->d_inode; +@@ -50,6 +79,11 @@ long ext3_ioctl(struct file *filp, unsig + + flags = ext3_mask_flags(inode->i_mode, flags); + ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } ++ + mutex_lock(&inode->i_mutex); + + /* Is it quota file? Do not allow user to mess with it */ +@@ -68,7 +102,9 @@ long ext3_ioctl(struct file *filp, unsig + * + * This test looks nicer. 
Thanks to Pauline Middelink + */ +- if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { ++ if ((oldflags & EXT3_IMMUTABLE_FL) || ++ ((flags ^ oldflags) & (EXT3_APPEND_FL | ++ EXT3_IMMUTABLE_FL | EXT3_IXUNLINK_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } +@@ -93,7 +129,7 @@ long ext3_ioctl(struct file *filp, unsig + if (err) + goto flags_err; + +- flags = flags & EXT3_FL_USER_MODIFIABLE; ++ flags &= EXT3_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; + ei->i_flags = flags; + +diff -NurpP --minimal linux-3.3.8/fs/ext3/namei.c linux-3.3.8-vs2.3.3.4/fs/ext3/namei.c +--- linux-3.3.8/fs/ext3/namei.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext3/namei.c 2012-02-24 03:55:06.000000000 +0100 +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + #include + + #include "namei.h" +@@ -927,6 +928,7 @@ restart: + submit_bh(READ | REQ_META | REQ_PRIO, + bh); + } ++ dx_propagate_tag(nd, inode); + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) +@@ -2538,6 +2540,7 @@ const struct inode_operations ext3_dir_i + .listxattr = ext3_listxattr, + .removexattr = generic_removexattr, + #endif ++ .sync_flags = ext3_sync_flags, + .get_acl = ext3_get_acl, + }; + +diff -NurpP --minimal linux-3.3.8/fs/ext3/super.c linux-3.3.8-vs2.3.3.4/fs/ext3/super.c +--- linux-3.3.8/fs/ext3/super.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext3/super.c 2012-02-24 03:55:06.000000000 +0100 +@@ -830,7 +830,8 @@ enum { + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, +- Opt_resize, Opt_usrquota, Opt_grpquota ++ Opt_resize, Opt_usrquota, Opt_grpquota, ++ Opt_tag, Opt_notag, Opt_tagid + }; + + static const match_table_t tokens = { +@@ -887,6 +888,9 @@ static const match_table_t tokens = { + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_resize, "resize"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, ++ {Opt_tagid, "tagid=%u"}, + {Opt_err, NULL}, + }; + +@@ -1039,6 +1043,20 @@ static int parse_options (char *options, + case Opt_nouid32: + set_opt (sbi->s_mount_opt, NO_UID32); + break; ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ set_opt (sbi->s_mount_opt, TAGGED); ++ break; ++ case Opt_notag: ++ clear_opt (sbi->s_mount_opt, TAGGED); ++ break; ++#endif ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ set_opt (sbi->s_mount_opt, TAGGED); ++ break; ++#endif + case Opt_nocheck: + clear_opt (sbi->s_mount_opt, CHECK); + break; +@@ -1737,6 +1755,9 @@ static int ext3_fill_super (struct super + NULL, 0)) + goto failed_mount; + ++ if (EXT3_SB(sb)->s_mount_opt & EXT3_MOUNT_TAGGED) ++ sb->s_flags |= MS_TAGGED; ++ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + +@@ -2619,6 +2640,14 @@ static int ext3_remount (struct super_bl + if (test_opt(sb, ABORT)) + ext3_abort(sb, __func__, "Abort forced by user"); + ++ if ((sbi->s_mount_opt & EXT3_MOUNT_TAGGED) && ++ !(sb->s_flags & MS_TAGGED)) { ++ printk("EXT3-fs: %s: tagging not permitted on remount.\n", ++ sb->s_id); ++ err = -EINVAL; ++ goto restore_opts; ++ } ++ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? 
MS_POSIXACL : 0); + +diff -NurpP --minimal linux-3.3.8/fs/ext4/ext4.h linux-3.3.8-vs2.3.3.4/fs/ext4/ext4.h +--- linux-3.3.8/fs/ext4/ext4.h 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/ext4/ext4.h 2012-04-23 23:45:14.000000000 +0200 +@@ -373,8 +373,12 @@ struct flex_groups { + #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ + #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ ++#define EXT4_IXUNLINK_FL 0x08000000 /* Immutable invert on unlink */ + #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + ++#define EXT4_BARRIER_FL 0x04000000 /* Barrier for chroot() */ ++#define EXT4_COW_FL 0x20000000 /* Copy on Write marker */ ++ + #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ + #define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ + +@@ -643,7 +647,8 @@ struct ext4_inode { + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ +- __u32 l_i_reserved2; ++ __le16 l_i_tag; /* Context Tag */ ++ __u16 l_i_reserved2; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ +@@ -761,6 +766,7 @@ do { \ + #define i_gid_low i_gid + #define i_uid_high osd2.linux2.l_i_uid_high + #define i_gid_high osd2.linux2.l_i_gid_high ++#define i_raw_tag osd2.linux2.l_i_tag + #define i_reserved2 osd2.linux2.l_i_reserved2 + + #elif defined(__GNU__) +@@ -937,6 +943,7 @@ struct ext4_inode_info { + #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ + #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ + #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ ++#define EXT4_MOUNT_TAGGED 0x40000 /* Enable Context Tags */ + #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ + #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ + #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +@@ -2274,6 +2281,7 @@ extern int ext4_map_blocks(handle_t *han + struct ext4_map_blocks *map, int flags); + extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); ++extern int ext4_sync_flags(struct inode *, int, int); + /* move_extent.c */ + extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, +diff -NurpP --minimal linux-3.3.8/fs/ext4/file.c linux-3.3.8-vs2.3.3.4/fs/ext4/file.c +--- linux-3.3.8/fs/ext4/file.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext4/file.c 2012-02-24 03:55:06.000000000 +0100 +@@ -258,5 +258,6 @@ const struct inode_operations ext4_file_ + #endif + .get_acl = ext4_get_acl, + .fiemap = ext4_fiemap, ++ .sync_flags = ext4_sync_flags, + }; + +diff -NurpP --minimal linux-3.3.8/fs/ext4/ialloc.c linux-3.3.8-vs2.3.3.4/fs/ext4/ialloc.c +--- linux-3.3.8/fs/ext4/ialloc.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext4/ialloc.c 2012-02-24 03:55:06.000000000 +0100 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + + #include "ext4.h" +@@ -860,6 +861,7 @@ got: + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; ++ inode->i_tag = dx_current_fstag(sb); + } else + inode_init_owner(inode, dir, mode); + +diff -NurpP --minimal linux-3.3.8/fs/ext4/inode.c linux-3.3.8-vs2.3.3.4/fs/ext4/inode.c +--- linux-3.3.8/fs/ext4/inode.c 2012-06-08 15:23:46.000000000 +0200 ++++ 
linux-3.3.8-vs2.3.3.4/fs/ext4/inode.c 2012-04-03 03:02:12.000000000 +0200 +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + #include "ext4_jbd2.h" + #include "xattr.h" +@@ -3557,41 +3558,64 @@ void ext4_set_inode_flags(struct inode * + { + unsigned int flags = EXT4_I(inode)->i_flags; + +- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | ++ S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); ++ ++ if (flags & EXT4_IMMUTABLE_FL) ++ inode->i_flags |= S_IMMUTABLE; ++ if (flags & EXT4_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; ++ + if (flags & EXT4_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT4_APPEND_FL) + inode->i_flags |= S_APPEND; +- if (flags & EXT4_IMMUTABLE_FL) +- inode->i_flags |= S_IMMUTABLE; + if (flags & EXT4_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT4_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & EXT4_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & EXT4_COW_FL) ++ inode->i_vflags |= V_COW; + } + + /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ + void ext4_get_inode_flags(struct ext4_inode_info *ei) + { +- unsigned int vfs_fl; ++ unsigned int vfs_fl, vfs_vf; + unsigned long old_fl, new_fl; + + do { + vfs_fl = ei->vfs_inode.i_flags; ++ vfs_vf = ei->vfs_inode.i_vflags; + old_fl = ei->i_flags; + new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| + EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| +- EXT4_DIRSYNC_FL); ++ EXT4_DIRSYNC_FL|EXT4_BARRIER_FL| ++ EXT4_COW_FL); ++ ++ if (vfs_fl & S_IMMUTABLE) ++ new_fl |= EXT4_IMMUTABLE_FL; ++ if (vfs_fl & S_IXUNLINK) ++ new_fl |= EXT4_IXUNLINK_FL; ++ + if (vfs_fl & S_SYNC) + new_fl |= EXT4_SYNC_FL; + if (vfs_fl & S_APPEND) + new_fl |= EXT4_APPEND_FL; +- if (vfs_fl & S_IMMUTABLE) +- new_fl |= EXT4_IMMUTABLE_FL; + if (vfs_fl & S_NOATIME) + new_fl |= EXT4_NOATIME_FL; + if (vfs_fl & S_DIRSYNC) + new_fl |= EXT4_DIRSYNC_FL; ++ ++ if (vfs_vf & V_BARRIER) ++ new_fl |= EXT4_BARRIER_FL; ++ if (vfs_vf & V_COW) ++ new_fl |= EXT4_COW_FL; + } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); + } + +@@ -3627,6 +3651,8 @@ struct inode *ext4_iget(struct super_blo + journal_t *journal = EXT4_SB(sb)->s_journal; + long ret; + int block; ++ uid_t uid; ++ gid_t gid; + + inode = iget_locked(sb, ino); + if (!inode) +@@ -3642,12 +3668,16 @@ struct inode *ext4_iget(struct super_blo + goto bad_inode; + raw_inode = ext4_raw_inode(&iloc); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); +- inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); +- inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); ++ uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); ++ gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (!(test_opt(inode->i_sb, NO_UID32))) { +- inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; +- inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; ++ uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; ++ gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, ++ le16_to_cpu(raw_inode->i_raw_tag)); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); + + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ +@@ -3866,6 +3896,8 @@ static int ext4_do_update_inode(handle_t + struct ext4_inode *raw_inode = ext4_raw_inode(iloc); + struct ext4_inode_info *ei = EXT4_I(inode); + struct buffer_head *bh = 
iloc->bh; ++ uid_t uid = TAGINO_UID(DX_TAG(inode), inode->i_uid, inode->i_tag); ++ gid_t gid = TAGINO_GID(DX_TAG(inode), inode->i_gid, inode->i_tag); + int err = 0, rc, block; + + /* For fields not not tracking in the in-memory inode, +@@ -3876,29 +3908,32 @@ static int ext4_do_update_inode(handle_t + ext4_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if (!(test_opt(inode->i_sb, NO_UID32))) { +- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); +- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); ++ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid)); ++ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid)); + /* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if (!ei->i_dtime) { + raw_inode->i_uid_high = +- cpu_to_le16(high_16_bits(inode->i_uid)); ++ cpu_to_le16(high_16_bits(uid)); + raw_inode->i_gid_high = +- cpu_to_le16(high_16_bits(inode->i_gid)); ++ cpu_to_le16(high_16_bits(gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = +- cpu_to_le16(fs_high2lowuid(inode->i_uid)); ++ cpu_to_le16(fs_high2lowuid(uid)); + raw_inode->i_gid_low = +- cpu_to_le16(fs_high2lowgid(inode->i_gid)); ++ cpu_to_le16(fs_high2lowgid(gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } ++#ifdef CONFIG_TAGGING_INTERN ++ raw_inode->i_raw_tag = cpu_to_le16(inode->i_tag); ++#endif + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + + EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); +@@ -4084,7 +4119,8 @@ int ext4_setattr(struct dentry *dentry, + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || +- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { ++ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid) || ++ (ia_valid & ATTR_TAG && attr->ia_tag != inode->i_tag)) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, +@@ -4106,6 +4142,8 @@ int ext4_setattr(struct dentry *dentry, + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; ++ if ((attr->ia_valid & ATTR_TAG) && IS_TAGGED(inode)) ++ inode->i_tag = attr->ia_tag; + error = ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + } +diff -NurpP --minimal linux-3.3.8/fs/ext4/ioctl.c linux-3.3.8-vs2.3.3.4/fs/ext4/ioctl.c +--- linux-3.3.8/fs/ext4/ioctl.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext4/ioctl.c 2012-02-24 04:29:04.000000000 +0100 +@@ -14,12 +14,40 @@ + #include + #include + #include ++#include + #include + #include "ext4_jbd2.h" + #include "ext4.h" + + #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) + ++int ext4_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ handle_t *handle = NULL; ++ struct ext4_iloc iloc; ++ int err; ++ ++ handle = ext4_journal_start(inode, 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_SYNC(inode)) ++ ext4_handle_sync(handle); ++ err = ext4_reserve_inode_write(handle, inode, &iloc); ++ if (err) ++ goto flags_err; ++ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ ext4_get_inode_flags(EXT4_I(inode)); ++ inode->i_ctime = ext4_current_time(inode); ++ ++ err = ext4_mark_iloc_dirty(handle, inode, &iloc); ++flags_err: ++ ext4_journal_stop(handle); ++ return err; ++} ++ + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = 
filp->f_dentry->d_inode; +@@ -53,6 +81,11 @@ long ext4_ioctl(struct file *filp, unsig + + flags = ext4_mask_flags(inode->i_mode, flags); + ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } ++ + err = -EPERM; + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ +@@ -70,7 +103,9 @@ long ext4_ioctl(struct file *filp, unsig + * + * This test looks nicer. Thanks to Pauline Middelink + */ +- if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { ++ if ((oldflags & EXT4_IMMUTABLE_FL) || ++ ((flags ^ oldflags) & (EXT4_APPEND_FL | ++ EXT4_IMMUTABLE_FL | EXT4_IXUNLINK_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } +diff -NurpP --minimal linux-3.3.8/fs/ext4/namei.c linux-3.3.8-vs2.3.3.4/fs/ext4/namei.c +--- linux-3.3.8/fs/ext4/namei.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ext4/namei.c 2012-02-24 03:55:06.000000000 +0100 +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include + #include "ext4.h" + #include "ext4_jbd2.h" + +@@ -925,6 +926,7 @@ restart: + ll_rw_block(READ | REQ_META | REQ_PRIO, + 1, &bh); + } ++ dx_propagate_tag(nd, inode); + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; +@@ -2587,6 +2589,7 @@ const struct inode_operations ext4_dir_i + #endif + .get_acl = ext4_get_acl, + .fiemap = ext4_fiemap, ++ .sync_flags = ext4_sync_flags, + }; + + const struct inode_operations ext4_special_inode_operations = { +diff -NurpP --minimal linux-3.3.8/fs/ext4/super.c linux-3.3.8-vs2.3.3.4/fs/ext4/super.c +--- linux-3.3.8/fs/ext4/super.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/ext4/super.c 2012-04-23 23:45:14.000000000 +0200 +@@ -1333,6 +1333,7 @@ enum { + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, ++ Opt_tag, Opt_notag, Opt_tagid + }; + + static const match_table_t tokens = { +@@ -1408,6 +1409,9 @@ static const match_table_t tokens = { + {Opt_init_itable, "init_itable=%u"}, + {Opt_init_itable, "init_itable"}, + {Opt_noinit_itable, "noinit_itable"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, ++ {Opt_tagid, "tagid=%u"}, + {Opt_err, NULL}, + }; + +@@ -1576,6 +1580,20 @@ static int parse_options(char *options, + case Opt_nouid32: + set_opt(sb, NO_UID32); + break; ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ set_opt(sb, TAGGED); ++ break; ++ case Opt_notag: ++ clear_opt(sb, TAGGED); ++ break; ++#endif ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ set_opt(sb, TAGGED); ++ break; ++#endif + case Opt_debug: + set_opt(sb, DEBUG); + break; +@@ -3260,6 +3278,9 @@ static int ext4_fill_super(struct super_ + } + } + ++ if (EXT4_SB(sb)->s_mount_opt & EXT4_MOUNT_TAGGED) ++ sb->s_flags |= MS_TAGGED; ++ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + +@@ -4412,6 +4433,14 @@ static int ext4_remount(struct super_blo + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) + ext4_abort(sb, "Abort forced by user"); + ++ if ((sbi->s_mount_opt & EXT4_MOUNT_TAGGED) && ++ !(sb->s_flags & MS_TAGGED)) { ++ printk("EXT4-fs: %s: tagging not permitted on remount.\n", ++ sb->s_id); ++ err = -EINVAL; ++ goto restore_opts; ++ } ++ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? 
MS_POSIXACL : 0); + +diff -NurpP --minimal linux-3.3.8/fs/fcntl.c linux-3.3.8-vs2.3.3.4/fs/fcntl.c +--- linux-3.3.8/fs/fcntl.c 2011-05-22 16:17:52.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/fcntl.c 2012-02-24 03:55:06.000000000 +0100 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -103,6 +104,8 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldf + + if (tofree) + filp_close(tofree, files); ++ else ++ vx_openfd_inc(newfd); /* fd was unused */ + + return newfd; + +@@ -447,6 +450,8 @@ SYSCALL_DEFINE3(fcntl, unsigned int, fd, + filp = fget_raw(fd); + if (!filp) + goto out; ++ if (!vx_files_avail(1)) ++ goto out; + + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { +diff -NurpP --minimal linux-3.3.8/fs/file.c linux-3.3.8-vs2.3.3.4/fs/file.c +--- linux-3.3.8/fs/file.c 2011-05-22 16:17:52.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/file.c 2012-02-24 03:55:06.000000000 +0100 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + struct fdtable_defer { + spinlock_t lock; +@@ -359,6 +360,8 @@ struct files_struct *dup_fd(struct files + struct file *f = *old_fds++; + if (f) { + get_file(f); ++ /* TODO: sum it first for check and performance */ ++ vx_openfd_inc(open_files - i); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet +@@ -466,6 +469,7 @@ repeat: + else + FD_CLR(fd, fdt->close_on_exec); + error = fd; ++ vx_openfd_inc(fd); + #if 1 + /* Sanity check */ + if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { +diff -NurpP --minimal linux-3.3.8/fs/file_table.c linux-3.3.8-vs2.3.3.4/fs/file_table.c +--- linux-3.3.8/fs/file_table.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/file_table.c 2012-02-24 03:55:06.000000000 +0100 +@@ -24,6 +24,8 @@ + #include + #include + #include ++#include ++#include + + #include + +@@ -135,6 +137,8 @@ struct file *get_empty_filp(void) + spin_lock_init(&f->f_lock); + eventpoll_init_file(f); + /* f->f_version: 0 */ ++ f->f_xid = vx_current_xid(); ++ vx_files_inc(f); + return f; + + over: +@@ -253,6 +257,8 @@ static void __fput(struct file *file) + } + fops_put(file->f_op); + put_pid(file->f_owner.pid); ++ vx_files_dec(file); ++ file->f_xid = 0; + file_sb_list_del(file); + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_dec(inode); +@@ -383,6 +389,8 @@ void put_filp(struct file *file) + { + if (atomic_long_dec_and_test(&file->f_count)) { + security_file_free(file); ++ vx_files_dec(file); ++ file->f_xid = 0; + file_sb_list_del(file); + file_free(file); + } +diff -NurpP --minimal linux-3.3.8/fs/fs_struct.c linux-3.3.8-vs2.3.3.4/fs/fs_struct.c +--- linux-3.3.8/fs/fs_struct.c 2011-03-15 18:07:31.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/fs_struct.c 2012-02-24 03:55:06.000000000 +0100 +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + #include "internal.h" + + static inline void path_get_longterm(struct path *path) +@@ -96,6 +97,7 @@ void free_fs_struct(struct fs_struct *fs + { + path_put_longterm(&fs->root); + path_put_longterm(&fs->pwd); ++ atomic_dec(&vs_global_fs); + kmem_cache_free(fs_cachep, fs); + } + +@@ -135,6 +137,7 @@ struct fs_struct *copy_fs_struct(struct + fs->pwd = old->pwd; + path_get_longterm(&fs->pwd); + spin_unlock(&old->lock); ++ atomic_inc(&vs_global_fs); + } + return fs; + } +diff -NurpP --minimal linux-3.3.8/fs/gfs2/file.c linux-3.3.8-vs2.3.3.4/fs/gfs2/file.c +--- linux-3.3.8/fs/gfs2/file.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/gfs2/file.c 2012-02-24 03:55:06.000000000 
+0100 +@@ -143,6 +143,9 @@ static const u32 fsflags_to_gfs2[32] = { + [7] = GFS2_DIF_NOATIME, + [12] = GFS2_DIF_EXHASH, + [14] = GFS2_DIF_INHERIT_JDATA, ++ [27] = GFS2_DIF_IXUNLINK, ++ [26] = GFS2_DIF_BARRIER, ++ [29] = GFS2_DIF_COW, + }; + + static const u32 gfs2_to_fsflags[32] = { +@@ -152,6 +155,9 @@ static const u32 gfs2_to_fsflags[32] = { + [gfs2fl_NoAtime] = FS_NOATIME_FL, + [gfs2fl_ExHash] = FS_INDEX_FL, + [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, ++ [gfs2fl_IXUnlink] = FS_IXUNLINK_FL, ++ [gfs2fl_Barrier] = FS_BARRIER_FL, ++ [gfs2fl_Cow] = FS_COW_FL, + }; + + static int gfs2_get_flags(struct file *filp, u32 __user *ptr) +@@ -182,12 +188,18 @@ void gfs2_set_inode_flags(struct inode * + { + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int flags = inode->i_flags; ++ unsigned int vflags = inode->i_vflags; ++ ++ flags &= ~(S_IMMUTABLE | S_IXUNLINK | ++ S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC | S_NOSEC); + +- flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC); + if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode)) + inode->i_flags |= S_NOSEC; + if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) + flags |= S_IMMUTABLE; ++ if (ip->i_diskflags & GFS2_DIF_IXUNLINK) ++ flags |= S_IXUNLINK; ++ + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) + flags |= S_APPEND; + if (ip->i_diskflags & GFS2_DIF_NOATIME) +@@ -195,6 +207,43 @@ void gfs2_set_inode_flags(struct inode * + if (ip->i_diskflags & GFS2_DIF_SYNC) + flags |= S_SYNC; + inode->i_flags = flags; ++ ++ vflags &= ~(V_BARRIER | V_COW); ++ ++ if (ip->i_diskflags & GFS2_DIF_BARRIER) ++ vflags |= V_BARRIER; ++ if (ip->i_diskflags & GFS2_DIF_COW) ++ vflags |= V_COW; ++ inode->i_vflags = vflags; ++} ++ ++void gfs2_get_inode_flags(struct inode *inode) ++{ ++ struct gfs2_inode *ip = GFS2_I(inode); ++ unsigned int flags = inode->i_flags; ++ unsigned int vflags = inode->i_vflags; ++ ++ ip->i_diskflags &= ~(GFS2_DIF_APPENDONLY | ++ GFS2_DIF_NOATIME | GFS2_DIF_SYNC | ++ GFS2_DIF_IMMUTABLE | GFS2_DIF_IXUNLINK | ++ GFS2_DIF_BARRIER | GFS2_DIF_COW); ++ ++ if (flags & S_IMMUTABLE) ++ ip->i_diskflags |= GFS2_DIF_IMMUTABLE; ++ if (flags & S_IXUNLINK) ++ ip->i_diskflags |= GFS2_DIF_IXUNLINK; ++ ++ if (flags & S_APPEND) ++ ip->i_diskflags |= GFS2_DIF_APPENDONLY; ++ if (flags & S_NOATIME) ++ ip->i_diskflags |= GFS2_DIF_NOATIME; ++ if (flags & S_SYNC) ++ ip->i_diskflags |= GFS2_DIF_SYNC; ++ ++ if (vflags & V_BARRIER) ++ ip->i_diskflags |= GFS2_DIF_BARRIER; ++ if (vflags & V_COW) ++ ip->i_diskflags |= GFS2_DIF_COW; + } + + /* Flags that can be set by user space */ +@@ -306,6 +355,37 @@ static int gfs2_set_flags(struct file *f + return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); + } + ++int gfs2_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ struct gfs2_inode *ip = GFS2_I(inode); ++ struct gfs2_sbd *sdp = GFS2_SB(inode); ++ struct buffer_head *bh; ++ struct gfs2_holder gh; ++ int error; ++ ++ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ++ if (error) ++ return error; ++ error = gfs2_trans_begin(sdp, RES_DINODE, 0); ++ if (error) ++ goto out; ++ error = gfs2_meta_inode_buffer(ip, &bh); ++ if (error) ++ goto out_trans_end; ++ gfs2_trans_add_bh(ip->i_gl, bh, 1); ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ gfs2_get_inode_flags(inode); ++ gfs2_dinode_out(ip, bh->b_data); ++ brelse(bh); ++ gfs2_set_aops(inode); ++out_trans_end: ++ gfs2_trans_end(sdp); ++out: ++ gfs2_glock_dq_uninit(&gh); ++ return error; ++} ++ + static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + switch(cmd) 
{ +diff -NurpP --minimal linux-3.3.8/fs/gfs2/inode.h linux-3.3.8-vs2.3.3.4/fs/gfs2/inode.h +--- linux-3.3.8/fs/gfs2/inode.h 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/gfs2/inode.h 2012-02-24 03:55:06.000000000 +0100 +@@ -120,6 +120,7 @@ extern const struct file_operations gfs2 + extern const struct file_operations gfs2_dir_fops_nolock; + + extern void gfs2_set_inode_flags(struct inode *inode); ++extern int gfs2_sync_flags(struct inode *inode, int flags, int vflags); + + #ifdef CONFIG_GFS2_FS_LOCKING_DLM + extern const struct file_operations gfs2_file_fops; +diff -NurpP --minimal linux-3.3.8/fs/inode.c linux-3.3.8-vs2.3.3.4/fs/inode.c +--- linux-3.3.8/fs/inode.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/inode.c 2012-03-19 20:52:10.000000000 +0100 +@@ -27,6 +27,7 @@ + #include + #include /* for inode_has_buffers */ + #include ++#include + #include "internal.h" + + /* +@@ -138,6 +139,9 @@ int inode_init_always(struct super_block + struct address_space *const mapping = &inode->i_data; + + inode->i_sb = sb; ++ ++ /* essential because of inode slab reuse */ ++ inode->i_tag = 0; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); +@@ -159,6 +163,7 @@ int inode_init_always(struct super_block + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_rdev = 0; ++ inode->i_mdev = 0; + inode->dirtied_when = 0; + + if (security_inode_alloc(inode)) +@@ -480,6 +485,8 @@ void __insert_inode_hash(struct inode *i + } + EXPORT_SYMBOL(__insert_inode_hash); + ++EXPORT_SYMBOL_GPL(__iget); ++ + /** + * __remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash +@@ -1709,9 +1716,11 @@ void init_special_inode(struct inode *in + if (S_ISCHR(mode)) { + inode->i_fop = &def_chr_fops; + inode->i_rdev = rdev; ++ inode->i_mdev = rdev; + } else if (S_ISBLK(mode)) { + inode->i_fop = &def_blk_fops; + inode->i_rdev = rdev; ++ inode->i_mdev = rdev; + } else if (S_ISFIFO(mode)) + inode->i_fop = &def_fifo_fops; + else if (S_ISSOCK(mode)) +@@ -1740,6 +1749,7 @@ void inode_init_owner(struct inode *inod + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; ++ inode->i_tag = dx_current_fstag(inode->i_sb); + } + EXPORT_SYMBOL(inode_init_owner); + +diff -NurpP --minimal linux-3.3.8/fs/ioctl.c linux-3.3.8-vs2.3.3.4/fs/ioctl.c +--- linux-3.3.8/fs/ioctl.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ioctl.c 2012-02-24 03:55:06.000000000 +0100 +@@ -15,6 +15,9 @@ + #include + #include + #include ++#include ++#include ++#include + + #include + +diff -NurpP --minimal linux-3.3.8/fs/ioprio.c linux-3.3.8-vs2.3.3.4/fs/ioprio.c +--- linux-3.3.8/fs/ioprio.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ioprio.c 2012-02-24 03:55:06.000000000 +0100 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + int set_task_ioprio(struct task_struct *task, int ioprio) + { +@@ -104,6 +105,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, + else + pgrp = find_vpid(who); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ++ if (!vx_check(p->xid, VS_ADMIN_P | VS_IDENT)) ++ continue; + ret = set_task_ioprio(p, ioprio); + if (ret) + break; +@@ -193,6 +196,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, + else + pgrp = find_vpid(who); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ++ if (!vx_check(p->xid, VS_ADMIN_P | VS_IDENT)) ++ continue; + tmpio = get_task_ioprio(p); + if (tmpio < 0) + continue; +diff -NurpP --minimal linux-3.3.8/fs/jfs/file.c linux-3.3.8-vs2.3.3.4/fs/jfs/file.c +--- 
linux-3.3.8/fs/jfs/file.c 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/jfs/file.c 2012-02-24 03:55:06.000000000 +0100 +@@ -109,7 +109,8 @@ int jfs_setattr(struct dentry *dentry, s + if (is_quota_modification(inode, iattr)) + dquot_initialize(inode); + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || +- (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { ++ (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) || ++ (iattr->ia_valid & ATTR_TAG && iattr->ia_tag != inode->i_tag)) { + rc = dquot_transfer(inode, iattr); + if (rc) + return rc; +@@ -142,6 +143,7 @@ const struct inode_operations jfs_file_i + #ifdef CONFIG_JFS_POSIX_ACL + .get_acl = jfs_get_acl, + #endif ++ .sync_flags = jfs_sync_flags, + }; + + const struct file_operations jfs_file_operations = { +diff -NurpP --minimal linux-3.3.8/fs/jfs/ioctl.c linux-3.3.8-vs2.3.3.4/fs/jfs/ioctl.c +--- linux-3.3.8/fs/jfs/ioctl.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/jfs/ioctl.c 2012-02-24 03:55:06.000000000 +0100 +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -52,6 +53,16 @@ static long jfs_map_ext2(unsigned long f + } + + ++int jfs_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ jfs_get_inode_flags(JFS_IP(inode)); ++ inode->i_ctime = CURRENT_TIME_SEC; ++ mark_inode_dirty(inode); ++ return 0; ++} ++ + long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_dentry->d_inode; +@@ -85,6 +96,11 @@ long jfs_ioctl(struct file *filp, unsign + if (!S_ISDIR(inode->i_mode)) + flags &= ~JFS_DIRSYNC_FL; + ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } ++ + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + err = -EPERM; +@@ -102,8 +118,8 @@ long jfs_ioctl(struct file *filp, unsign + * the relevant capability. 
+ */ + if ((oldflags & JFS_IMMUTABLE_FL) || +- ((flags ^ oldflags) & +- (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { ++ ((flags ^ oldflags) & (JFS_APPEND_FL | ++ JFS_IMMUTABLE_FL | JFS_IXUNLINK_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + err = -EPERM; +@@ -111,7 +127,7 @@ long jfs_ioctl(struct file *filp, unsign + } + } + +- flags = flags & JFS_FL_USER_MODIFIABLE; ++ flags &= JFS_FL_USER_MODIFIABLE; + flags |= oldflags & ~JFS_FL_USER_MODIFIABLE; + jfs_inode->mode2 = flags; + +diff -NurpP --minimal linux-3.3.8/fs/jfs/jfs_dinode.h linux-3.3.8-vs2.3.3.4/fs/jfs/jfs_dinode.h +--- linux-3.3.8/fs/jfs/jfs_dinode.h 2008-12-25 00:26:37.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/jfs/jfs_dinode.h 2012-02-24 03:55:06.000000000 +0100 +@@ -161,9 +161,13 @@ struct dinode { + + #define JFS_APPEND_FL 0x01000000 /* writes to file may only append */ + #define JFS_IMMUTABLE_FL 0x02000000 /* Immutable file */ ++#define JFS_IXUNLINK_FL 0x08000000 /* Immutable invert on unlink */ + +-#define JFS_FL_USER_VISIBLE 0x03F80000 +-#define JFS_FL_USER_MODIFIABLE 0x03F80000 ++#define JFS_BARRIER_FL 0x04000000 /* Barrier for chroot() */ ++#define JFS_COW_FL 0x20000000 /* Copy on Write marker */ ++ ++#define JFS_FL_USER_VISIBLE 0x07F80000 ++#define JFS_FL_USER_MODIFIABLE 0x07F80000 + #define JFS_FL_INHERIT 0x03C80000 + + /* These are identical to EXT[23]_IOC_GETFLAGS/SETFLAGS */ +diff -NurpP --minimal linux-3.3.8/fs/jfs/jfs_filsys.h linux-3.3.8-vs2.3.3.4/fs/jfs/jfs_filsys.h +--- linux-3.3.8/fs/jfs/jfs_filsys.h 2008-12-25 00:26:37.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/jfs/jfs_filsys.h 2012-02-24 03:55:06.000000000 +0100 +@@ -263,6 +263,7 @@ + #define JFS_NAME_MAX 255 + #define JFS_PATH_MAX BPSIZE + ++#define JFS_TAGGED 0x00800000 /* Context Tagging */ + + /* + * file system state (superblock state) +diff -NurpP --minimal linux-3.3.8/fs/jfs/jfs_imap.c linux-3.3.8-vs2.3.3.4/fs/jfs/jfs_imap.c +--- linux-3.3.8/fs/jfs/jfs_imap.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/jfs/jfs_imap.c 2012-02-24 03:55:06.000000000 +0100 +@@ -46,6 +46,7 @@ + #include + #include + #include ++#include + + #include "jfs_incore.h" + #include "jfs_inode.h" +@@ -3058,6 +3059,8 @@ static int copy_from_dinode(struct dinod + { + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); ++ uid_t uid; ++ gid_t gid; + + jfs_ip->fileset = le32_to_cpu(dip->di_fileset); + jfs_ip->mode2 = le32_to_cpu(dip->di_mode); +@@ -3078,14 +3081,18 @@ static int copy_from_dinode(struct dinod + } + set_nlink(ip, le32_to_cpu(dip->di_nlink)); + +- jfs_ip->saved_uid = le32_to_cpu(dip->di_uid); ++ uid = le32_to_cpu(dip->di_uid); ++ gid = le32_to_cpu(dip->di_gid); ++ ip->i_tag = INOTAG_TAG(DX_TAG(ip), uid, gid, 0); ++ ++ jfs_ip->saved_uid = INOTAG_UID(DX_TAG(ip), uid, gid); + if (sbi->uid == -1) + ip->i_uid = jfs_ip->saved_uid; + else { + ip->i_uid = sbi->uid; + } + +- jfs_ip->saved_gid = le32_to_cpu(dip->di_gid); ++ jfs_ip->saved_gid = INOTAG_GID(DX_TAG(ip), uid, gid); + if (sbi->gid == -1) + ip->i_gid = jfs_ip->saved_gid; + else { +@@ -3150,14 +3157,12 @@ static void copy_to_dinode(struct dinode + dip->di_size = cpu_to_le64(ip->i_size); + dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks)); + dip->di_nlink = cpu_to_le32(ip->i_nlink); +- if (sbi->uid == -1) +- dip->di_uid = cpu_to_le32(ip->i_uid); +- else +- dip->di_uid = cpu_to_le32(jfs_ip->saved_uid); +- if (sbi->gid == -1) +- dip->di_gid = cpu_to_le32(ip->i_gid); +- else +- dip->di_gid = 
cpu_to_le32(jfs_ip->saved_gid); ++ ++ dip->di_uid = cpu_to_le32(TAGINO_UID(DX_TAG(ip), ++ (sbi->uid == -1) ? ip->i_uid : jfs_ip->saved_uid, ip->i_tag)); ++ dip->di_gid = cpu_to_le32(TAGINO_GID(DX_TAG(ip), ++ (sbi->gid == -1) ? ip->i_gid : jfs_ip->saved_gid, ip->i_tag)); ++ + jfs_get_inode_flags(jfs_ip); + /* + * mode2 is only needed for storing the higher order bits. +diff -NurpP --minimal linux-3.3.8/fs/jfs/jfs_inode.c linux-3.3.8-vs2.3.3.4/fs/jfs/jfs_inode.c +--- linux-3.3.8/fs/jfs/jfs_inode.c 2012-01-09 16:14:54.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/jfs/jfs_inode.c 2012-02-24 03:55:06.000000000 +0100 +@@ -18,6 +18,7 @@ + + #include + #include ++#include + #include "jfs_incore.h" + #include "jfs_inode.h" + #include "jfs_filsys.h" +@@ -30,29 +31,46 @@ void jfs_set_inode_flags(struct inode *i + { + unsigned int flags = JFS_IP(inode)->mode2; + +- inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | +- S_NOATIME | S_DIRSYNC | S_SYNC); ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | ++ S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); + + if (flags & JFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; ++ if (flags & JFS_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; ++ ++ if (flags & JFS_SYNC_FL) ++ inode->i_flags |= S_SYNC; + if (flags & JFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & JFS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & JFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +- if (flags & JFS_SYNC_FL) +- inode->i_flags |= S_SYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & JFS_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & JFS_COW_FL) ++ inode->i_vflags |= V_COW; + } + + void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip) + { + unsigned int flags = jfs_ip->vfs_inode.i_flags; ++ unsigned int vflags = jfs_ip->vfs_inode.i_vflags; ++ ++ jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_IXUNLINK_FL | ++ JFS_APPEND_FL | JFS_NOATIME_FL | ++ JFS_DIRSYNC_FL | JFS_SYNC_FL | ++ JFS_BARRIER_FL | JFS_COW_FL); + +- jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL | +- JFS_DIRSYNC_FL | JFS_SYNC_FL); + if (flags & S_IMMUTABLE) + jfs_ip->mode2 |= JFS_IMMUTABLE_FL; ++ if (flags & S_IXUNLINK) ++ jfs_ip->mode2 |= JFS_IXUNLINK_FL; ++ + if (flags & S_APPEND) + jfs_ip->mode2 |= JFS_APPEND_FL; + if (flags & S_NOATIME) +@@ -61,6 +79,11 @@ void jfs_get_inode_flags(struct jfs_inod + jfs_ip->mode2 |= JFS_DIRSYNC_FL; + if (flags & S_SYNC) + jfs_ip->mode2 |= JFS_SYNC_FL; ++ ++ if (vflags & V_BARRIER) ++ jfs_ip->mode2 |= JFS_BARRIER_FL; ++ if (vflags & V_COW) ++ jfs_ip->mode2 |= JFS_COW_FL; + } + + /* +diff -NurpP --minimal linux-3.3.8/fs/jfs/jfs_inode.h linux-3.3.8-vs2.3.3.4/fs/jfs/jfs_inode.h +--- linux-3.3.8/fs/jfs/jfs_inode.h 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/jfs/jfs_inode.h 2012-02-24 03:55:06.000000000 +0100 +@@ -39,6 +39,7 @@ extern struct dentry *jfs_fh_to_dentry(s + extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); + extern void jfs_set_inode_flags(struct inode *); ++extern int jfs_sync_flags(struct inode *, int, int); + extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); + extern int jfs_setattr(struct dentry *, struct iattr *); + +diff -NurpP --minimal linux-3.3.8/fs/jfs/namei.c linux-3.3.8-vs2.3.3.4/fs/jfs/namei.c +--- linux-3.3.8/fs/jfs/namei.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/jfs/namei.c 2012-02-24 03:55:06.000000000 +0100 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + 
#include "jfs_incore.h" + #include "jfs_superblock.h" + #include "jfs_inode.h" +@@ -1474,6 +1475,7 @@ static struct dentry *jfs_lookup(struct + jfs_err("jfs_lookup: iget failed on inum %d", (uint)inum); + } + ++ dx_propagate_tag(nd, ip); + return d_splice_alias(ip, dentry); + } + +@@ -1538,6 +1540,7 @@ const struct inode_operations jfs_dir_in + #ifdef CONFIG_JFS_POSIX_ACL + .get_acl = jfs_get_acl, + #endif ++ .sync_flags = jfs_sync_flags, + }; + + const struct file_operations jfs_dir_operations = { +diff -NurpP --minimal linux-3.3.8/fs/jfs/super.c linux-3.3.8-vs2.3.3.4/fs/jfs/super.c +--- linux-3.3.8/fs/jfs/super.c 2012-03-19 19:47:25.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/jfs/super.c 2012-02-24 03:55:06.000000000 +0100 +@@ -197,7 +197,8 @@ static void jfs_put_super(struct super_b + enum { + Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize, + Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota, +- Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask ++ Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask, ++ Opt_tag, Opt_notag, Opt_tagid + }; + + static const match_table_t tokens = { +@@ -207,6 +208,10 @@ static const match_table_t tokens = { + {Opt_resize, "resize=%u"}, + {Opt_resize_nosize, "resize"}, + {Opt_errors, "errors=%s"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, ++ {Opt_tagid, "tagid=%u"}, ++ {Opt_tag, "tagxid"}, + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_usrquota, "usrquota"}, +@@ -341,6 +346,20 @@ static int parse_options(char *options, + } + break; + } ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ *flag |= JFS_TAGGED; ++ break; ++ case Opt_notag: ++ *flag &= JFS_TAGGED; ++ break; ++#endif ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ *flag |= JFS_TAGGED; ++ break; ++#endif + default: + printk("jfs: Unrecognized mount option \"%s\" " + " or missing value\n", p); +@@ -372,6 +391,12 @@ static int jfs_remount(struct super_bloc + return -EINVAL; + } + ++ if ((flag & JFS_TAGGED) && !(sb->s_flags & MS_TAGGED)) { ++ printk(KERN_ERR "JFS: %s: tagging not permitted on remount.\n", ++ sb->s_id); ++ return -EINVAL; ++ } ++ + if (newLVSize) { + if (sb->s_flags & MS_RDONLY) { + printk(KERN_ERR +@@ -454,6 +479,9 @@ static int jfs_fill_super(struct super_b + #ifdef CONFIG_JFS_POSIX_ACL + sb->s_flags |= MS_POSIXACL; + #endif ++ /* map mount option tagxid */ ++ if (sbi->flag & JFS_TAGGED) ++ sb->s_flags |= MS_TAGGED; + + if (newLVSize) { + printk(KERN_ERR "resize option for remount only\n"); +diff -NurpP --minimal linux-3.3.8/fs/libfs.c linux-3.3.8-vs2.3.3.4/fs/libfs.c +--- linux-3.3.8/fs/libfs.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/libfs.c 2012-02-24 03:55:06.000000000 +0100 +@@ -135,7 +135,8 @@ static inline unsigned char dt_type(stru + * both impossible due to the lock on directory. 
+ */ + +-int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) ++static inline int do_dcache_readdir_filter(struct file *filp, ++ void *dirent, filldir_t filldir, int (*filter)(struct dentry *dentry)) + { + struct dentry *dentry = filp->f_path.dentry; + struct dentry *cursor = filp->private_data; +@@ -166,6 +167,8 @@ int dcache_readdir(struct file * filp, v + for (p=q->next; p != &dentry->d_subdirs; p=p->next) { + struct dentry *next; + next = list_entry(p, struct dentry, d_u.d_child); ++ if (filter && !filter(next)) ++ continue; + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (!simple_positive(next)) { + spin_unlock(&next->d_lock); +@@ -192,6 +195,17 @@ int dcache_readdir(struct file * filp, v + return 0; + } + ++int dcache_readdir(struct file *filp, void *dirent, filldir_t filldir) ++{ ++ return do_dcache_readdir_filter(filp, dirent, filldir, NULL); ++} ++ ++int dcache_readdir_filter(struct file *filp, void *dirent, filldir_t filldir, ++ int (*filter)(struct dentry *)) ++{ ++ return do_dcache_readdir_filter(filp, dirent, filldir, filter); ++} ++ + ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) + { + return -EISDIR; +@@ -977,6 +991,7 @@ EXPORT_SYMBOL(dcache_dir_close); + EXPORT_SYMBOL(dcache_dir_lseek); + EXPORT_SYMBOL(dcache_dir_open); + EXPORT_SYMBOL(dcache_readdir); ++EXPORT_SYMBOL(dcache_readdir_filter); + EXPORT_SYMBOL(generic_read_dir); + EXPORT_SYMBOL(mount_pseudo); + EXPORT_SYMBOL(simple_write_begin); +diff -NurpP --minimal linux-3.3.8/fs/locks.c linux-3.3.8-vs2.3.3.4/fs/locks.c +--- linux-3.3.8/fs/locks.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/locks.c 2012-04-16 13:32:11.000000000 +0200 +@@ -126,6 +126,8 @@ + #include + #include + #include ++#include ++#include + + #include + +@@ -184,11 +186,17 @@ static void locks_init_lock_heads(struct + /* Allocate an empty lock structure. 
*/ + struct file_lock *locks_alloc_lock(void) + { +- struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL); ++ struct file_lock *fl; + +- if (fl) +- locks_init_lock_heads(fl); ++ if (!vx_locks_avail(1)) ++ return NULL; + ++ fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL); ++ ++ if (fl) { ++ locks_init_lock_heads(fl); ++ fl->fl_xid = -1; ++ } + return fl; + } + EXPORT_SYMBOL_GPL(locks_alloc_lock); +@@ -216,6 +224,7 @@ void locks_free_lock(struct file_lock *f + BUG_ON(!list_empty(&fl->fl_block)); + BUG_ON(!list_empty(&fl->fl_link)); + ++ vx_locks_dec(fl); + locks_release_private(fl); + kmem_cache_free(filelock_cache, fl); + } +@@ -225,6 +234,7 @@ void locks_init_lock(struct file_lock *f + { + memset(fl, 0, sizeof(struct file_lock)); + locks_init_lock_heads(fl); ++ fl->fl_xid = -1; + } + + EXPORT_SYMBOL(locks_init_lock); +@@ -265,6 +275,7 @@ void locks_copy_lock(struct file_lock *n + new->fl_file = fl->fl_file; + new->fl_ops = fl->fl_ops; + new->fl_lmops = fl->fl_lmops; ++ new->fl_xid = fl->fl_xid; + + locks_copy_private(new, fl); + } +@@ -303,6 +314,11 @@ static int flock_make_lock(struct file * + fl->fl_flags = FL_FLOCK; + fl->fl_type = type; + fl->fl_end = OFFSET_MAX; ++ ++ vxd_assert(filp->f_xid == vx_current_xid(), ++ "f_xid(%d) == current(%d)", filp->f_xid, vx_current_xid()); ++ fl->fl_xid = filp->f_xid; ++ vx_locks_inc(fl); + + *lock = fl; + return 0; +@@ -452,6 +468,7 @@ static int lease_init(struct file *filp, + + fl->fl_owner = current->files; + fl->fl_pid = current->tgid; ++ fl->fl_xid = vx_current_xid(); + + fl->fl_file = filp; + fl->fl_flags = FL_LEASE; +@@ -471,6 +488,11 @@ static struct file_lock *lease_alloc(str + if (fl == NULL) + return ERR_PTR(error); + ++ fl->fl_xid = vx_current_xid(); ++ if (filp) ++ vxd_assert(filp->f_xid == fl->fl_xid, ++ "f_xid(%d) == fl_xid(%d)", filp->f_xid, fl->fl_xid); ++ vx_locks_inc(fl); + error = lease_init(filp, type, fl); + if (error) { + locks_free_lock(fl); +@@ -773,6 +795,7 @@ static int flock_lock_file(struct file * + lock_flocks(); + } + ++ new_fl->fl_xid = -1; + find_conflict: + for_each_lock(inode, before) { + struct file_lock *fl = *before; +@@ -793,6 +816,7 @@ find_conflict: + goto out; + locks_copy_lock(new_fl, request); + locks_insert_lock(before, new_fl); ++ vx_locks_inc(new_fl); + new_fl = NULL; + error = 0; + +@@ -803,7 +827,8 @@ out: + return error; + } + +-static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) ++static int __posix_lock_file(struct inode *inode, struct file_lock *request, ++ struct file_lock *conflock, xid_t xid) + { + struct file_lock *fl; + struct file_lock *new_fl = NULL; +@@ -813,6 +838,8 @@ static int __posix_lock_file(struct inod + struct file_lock **before; + int error, added = 0; + ++ vxd_assert(xid == vx_current_xid(), ++ "xid(%d) == current(%d)", xid, vx_current_xid()); + /* + * We may need two file_lock structures for this operation, + * so we get them in advance to avoid races. 
+@@ -823,7 +850,11 @@ static int __posix_lock_file(struct inod + (request->fl_type != F_UNLCK || + request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { + new_fl = locks_alloc_lock(); ++ new_fl->fl_xid = xid; ++ vx_locks_inc(new_fl); + new_fl2 = locks_alloc_lock(); ++ new_fl2->fl_xid = xid; ++ vx_locks_inc(new_fl2); + } + + lock_flocks(); +@@ -1022,7 +1053,8 @@ static int __posix_lock_file(struct inod + int posix_lock_file(struct file *filp, struct file_lock *fl, + struct file_lock *conflock) + { +- return __posix_lock_file(filp->f_path.dentry->d_inode, fl, conflock); ++ return __posix_lock_file(filp->f_path.dentry->d_inode, ++ fl, conflock, filp->f_xid); + } + EXPORT_SYMBOL(posix_lock_file); + +@@ -1112,7 +1144,7 @@ int locks_mandatory_area(int read_write, + fl.fl_end = offset + count - 1; + + for (;;) { +- error = __posix_lock_file(inode, &fl, NULL); ++ error = __posix_lock_file(inode, &fl, NULL, filp->f_xid); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); +@@ -1407,6 +1439,7 @@ int generic_add_lease(struct file *filp, + goto out; + + locks_insert_lock(before, lease); ++ vx_locks_inc(lease); + return 0; + + out: +@@ -1847,6 +1880,11 @@ int fcntl_setlk(unsigned int fd, struct + if (file_lock == NULL) + return -ENOLCK; + ++ vxd_assert(filp->f_xid == vx_current_xid(), ++ "f_xid(%d) == current(%d)", filp->f_xid, vx_current_xid()); ++ file_lock->fl_xid = filp->f_xid; ++ vx_locks_inc(file_lock); ++ + /* + * This might block, so we do it before checking the inode. + */ +@@ -1965,6 +2003,11 @@ int fcntl_setlk64(unsigned int fd, struc + if (file_lock == NULL) + return -ENOLCK; + ++ vxd_assert(filp->f_xid == vx_current_xid(), ++ "f_xid(%d) == current(%d)", filp->f_xid, vx_current_xid()); ++ file_lock->fl_xid = filp->f_xid; ++ vx_locks_inc(file_lock); ++ + /* + * This might block, so we do it before checking the inode. 
+ */ +@@ -2230,8 +2273,11 @@ static int locks_show(struct seq_file *f + + lock_get_status(f, fl, *((loff_t *)f->private), ""); + +- list_for_each_entry(bfl, &fl->fl_block, fl_block) ++ list_for_each_entry(bfl, &fl->fl_block, fl_block) { ++ if (!vx_check(fl->fl_xid, VS_WATCH_P | VS_IDENT)) ++ continue; + lock_get_status(f, bfl, *((loff_t *)f->private), " ->"); ++ } + + return 0; + } +diff -NurpP --minimal linux-3.3.8/fs/mount.h linux-3.3.8-vs2.3.3.4/fs/mount.h +--- linux-3.3.8/fs/mount.h 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/mount.h 2012-02-24 17:29:48.000000000 +0100 +@@ -47,6 +47,7 @@ struct mount { + int mnt_expiry_mark; /* true if marked for expiry */ + int mnt_pinned; + int mnt_ghosts; ++ tag_t mnt_tag; /* tagging used for vfsmount */ + }; + + static inline struct mount *real_mount(struct vfsmount *mnt) +diff -NurpP --minimal linux-3.3.8/fs/namei.c linux-3.3.8-vs2.3.3.4/fs/namei.c +--- linux-3.3.8/fs/namei.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/namei.c 2012-04-23 22:50:50.000000000 +0200 +@@ -33,6 +33,14 @@ + #include + #include + #include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include + #include + + #include "internal.h" +@@ -221,6 +229,89 @@ static int check_acl(struct inode *inode + return -EAGAIN; + } + ++static inline int dx_barrier(const struct inode *inode) ++{ ++ if (IS_BARRIER(inode) && !vx_check(0, VS_ADMIN | VS_WATCH)) { ++ vxwprintk_task(1, "did hit the barrier."); ++ return 1; ++ } ++ return 0; ++} ++ ++static int __dx_permission(const struct inode *inode, int mask) ++{ ++ if (dx_barrier(inode)) ++ return -EACCES; ++ ++ if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) { ++ /* devpts is xid tagged */ ++ if (S_ISDIR(inode->i_mode) || ++ vx_check((xid_t)inode->i_tag, VS_IDENT | VS_WATCH_P)) ++ return 0; ++ ++ /* just pretend we didn't find anything */ ++ return -ENOENT; ++ } ++ else if (inode->i_sb->s_magic == PROC_SUPER_MAGIC) { ++ struct proc_dir_entry *de = PDE(inode); ++ ++ if (de && !vx_hide_check(0, de->vx_flags)) ++ goto out; ++ ++ if ((mask & (MAY_WRITE | MAY_APPEND))) { ++ struct pid *pid; ++ struct task_struct *tsk; ++ ++ if (vx_check(0, VS_ADMIN | VS_WATCH_P) || ++ vx_flags(VXF_STATE_SETUP, 0)) ++ return 0; ++ ++ pid = PROC_I(inode)->pid; ++ if (!pid) ++ goto out; ++ ++ rcu_read_lock(); ++ tsk = pid_task(pid, PIDTYPE_PID); ++ vxdprintk(VXD_CBIT(tag, 0), "accessing %p[#%u]", ++ tsk, (tsk ? vx_task_xid(tsk) : 0)); ++ if (tsk && ++ vx_check(vx_task_xid(tsk), VS_IDENT | VS_WATCH_P)) { ++ rcu_read_unlock(); ++ return 0; ++ } ++ rcu_read_unlock(); ++ } ++ else { ++ /* FIXME: Should we block some entries here? */ ++ return 0; ++ } ++ } ++ else { ++ if (dx_notagcheck(inode->i_sb) || ++ dx_check(inode->i_tag, DX_HOSTID | DX_ADMIN | DX_WATCH | ++ DX_IDENT)) ++ return 0; ++ } ++ ++out: ++ return -EACCES; ++} ++ ++int dx_permission(const struct inode *inode, int mask) ++{ ++ int ret = __dx_permission(inode, mask); ++ if (unlikely(ret)) { ++#ifndef CONFIG_VSERVER_WARN_DEVPTS ++ if (inode->i_sb->s_magic != DEVPTS_SUPER_MAGIC) ++#endif ++ vxwprintk_task(1, ++ "denied [0x%x] access to inode %s:%p[#%d,%lu]", ++ mask, inode->i_sb->s_id, inode, inode->i_tag, ++ inode->i_ino); ++ } ++ return ret; ++} ++ + /* + * This does the basic permission checking + */ +@@ -356,10 +447,14 @@ int inode_permission(struct inode *inode + /* + * Nobody gets write access to an immutable file. 
+ */ +- if (IS_IMMUTABLE(inode)) ++ if (IS_IMMUTABLE(inode) && !IS_COW(inode)) + return -EACCES; + } + ++ retval = dx_permission(inode, mask); ++ if (retval) ++ return retval; ++ + retval = do_inode_permission(inode, mask); + if (retval) + return retval; +@@ -1038,7 +1133,8 @@ static void follow_dotdot(struct nameida + + if (nd->path.dentry == nd->root.dentry && + nd->path.mnt == nd->root.mnt) { +- break; ++ /* for sane '/' avoid follow_mount() */ ++ return; + } + if (nd->path.dentry != nd->path.mnt->mnt_root) { + /* rare case of legitimate dget_parent()... */ +@@ -1149,6 +1245,9 @@ static int do_lookup(struct nameidata *n + } + if (unlikely(d_need_lookup(dentry))) + goto unlazy; ++ ++ /* FIXME: check dx permission */ ++ + path->mnt = mnt; + path->dentry = dentry; + if (unlikely(!__follow_mount_rcu(nd, path, inode))) +@@ -1210,6 +1309,8 @@ retry: + } + } + ++ /* FIXME: check dx permission */ ++ + path->mnt = mnt; + path->dentry = dentry; + err = follow_managed(path, nd->flags); +@@ -1926,7 +2027,7 @@ static int may_delete(struct inode *dir, + if (IS_APPEND(dir)) + return -EPERM; + if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)|| +- IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) ++ IS_IXORUNLINK(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + return -EPERM; + if (isdir) { + if (!S_ISDIR(victim->d_inode->i_mode)) +@@ -2006,19 +2107,25 @@ int vfs_create(struct inode *dir, struct + { + int error = may_create(dir, dentry); + +- if (error) ++ if (error) { ++ vxdprintk(VXD_CBIT(misc, 3), "may_create failed with %d", error); + return error; ++ } + + if (!dir->i_op->create) + return -EACCES; /* shouldn't it be ENOSYS? */ + mode &= S_IALLUGO; + mode |= S_IFREG; + error = security_inode_create(dir, dentry, mode); +- if (error) ++ if (error) { ++ vxdprintk(VXD_CBIT(misc, 3), "security_inode_create failed with %d", error); + return error; ++ } + error = dir->i_op->create(dir, dentry, mode, nd); + if (!error) + fsnotify_create(dir, dentry); ++ else ++ vxdprintk(VXD_CBIT(misc, 3), "i_op->create failed with %d", error); + return error; + } + +@@ -2053,6 +2160,15 @@ static int may_open(struct path *path, i + break; + } + ++#ifdef CONFIG_VSERVER_COWBL ++ if (IS_COW(inode) && ++ ((flag & O_ACCMODE) != O_RDONLY)) { ++ if (IS_COW_LINK(inode)) ++ return -EMLINK; ++ inode->i_flags &= ~(S_IXUNLINK|S_IMMUTABLE); ++ mark_inode_dirty(inode); ++ } ++#endif + error = inode_permission(inode, acc_mode); + if (error) + return error; +@@ -2277,6 +2393,16 @@ ok: + } + common: + error = may_open(&nd->path, acc_mode, open_flag); ++#ifdef CONFIG_VSERVER_COWBL ++ if (error == -EMLINK) { ++ struct dentry *dentry; ++ dentry = cow_break_link(pathname); ++ if (IS_ERR(dentry)) ++ error = PTR_ERR(dentry); ++ else ++ dput(dentry); ++ } ++#endif + if (error) + goto exit; + filp = nameidata_to_filp(nd); +@@ -2319,6 +2445,7 @@ static struct file *path_openat(int dfd, + struct path path; + int error; + ++restart: + filp = get_empty_filp(); + if (!filp) + return ERR_PTR(-ENFILE); +@@ -2356,6 +2483,17 @@ static struct file *path_openat(int dfd, + filp = do_last(nd, &path, op, pathname); + put_link(nd, &link, cookie); + } ++ ++#ifdef CONFIG_VSERVER_COWBL ++ if (filp == ERR_PTR(-EMLINK)) { ++ if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) ++ path_put(&nd->root); ++ if (base) ++ fput(base); ++ release_open_intent(nd); ++ goto restart; ++ } ++#endif + out: + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) + path_put(&nd->root); +@@ -2445,6 +2583,11 @@ struct dentry *kern_path_create(int dfd, + goto 
fail; + } + *path = nd.path; ++ vxdprintk(VXD_CBIT(misc, 3), "kern_path_create path.dentry = %p (%.*s), dentry = %p (%.*s), d_inode = %p", ++ path->dentry, path->dentry->d_name.len, ++ path->dentry->d_name.name, dentry, ++ dentry->d_name.len, dentry->d_name.name, ++ path->dentry->d_inode); + return dentry; + eexist: + dput(dentry); +@@ -2926,7 +3069,7 @@ int vfs_link(struct dentry *old_dentry, + /* + * A link to an append-only or immutable file cannot be created. + */ +- if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) ++ if (IS_APPEND(inode) || IS_IXORUNLINK(inode)) + return -EPERM; + if (!dir->i_op->link) + return -EPERM; +@@ -3307,6 +3450,253 @@ int vfs_follow_link(struct nameidata *nd + return __vfs_follow_link(nd, link); + } + ++ ++#ifdef CONFIG_VSERVER_COWBL ++ ++static inline ++long do_cow_splice(struct file *in, struct file *out, size_t len) ++{ ++ loff_t ppos = 0; ++ ++ return do_splice_direct(in, &ppos, out, len, 0); ++} ++ ++extern unsigned int mnt_get_count(struct mount *mnt); ++ ++struct dentry *cow_break_link(const char *pathname) ++{ ++ int ret, mode, pathlen, redo = 0; ++ struct nameidata old_nd, dir_nd; ++ struct path old_path, dir_path; ++ struct dentry *dir, *old_dentry, *new_dentry = NULL; ++ struct file *old_file; ++ struct file *new_file; ++ char *to, *path, pad='\251'; ++ loff_t size; ++ ++ vxdprintk(VXD_CBIT(misc, 1), ++ "cow_break_link(" VS_Q("%s") ")", pathname); ++ path = kmalloc(PATH_MAX, GFP_KERNEL); ++ ret = -ENOMEM; ++ if (!path) ++ goto out; ++ ++ /* old_nd will have refs to dentry and mnt */ ++ ret = do_path_lookup(AT_FDCWD, pathname, LOOKUP_FOLLOW, &old_nd); ++ vxdprintk(VXD_CBIT(misc, 2), ++ "do_path_lookup(old): %d [r=%d]", ++ ret, mnt_get_count(real_mount(old_nd.path.mnt))); ++ if (ret < 0) ++ goto out_free_path; ++ ++ old_path = old_nd.path; ++ old_dentry = old_path.dentry; ++ mode = old_dentry->d_inode->i_mode; ++ ++ to = d_path(&old_path, path, PATH_MAX-2); ++ pathlen = strlen(to); ++ vxdprintk(VXD_CBIT(misc, 2), ++ "old path " VS_Q("%s") " [%p:" VS_Q("%.*s") ":%d]", to, ++ old_dentry, ++ old_dentry->d_name.len, old_dentry->d_name.name, ++ old_dentry->d_name.len); ++ ++ to[pathlen + 1] = 0; ++retry: ++ new_dentry = NULL; ++ to[pathlen] = pad--; ++ ret = -ELOOP; ++ if (pad <= '\240') ++ goto out_rel_old; ++ ++ vxdprintk(VXD_CBIT(misc, 1), "temp copy " VS_Q("%s"), to); ++ /* dir_nd will have refs to dentry and mnt */ ++ ret = do_path_lookup(AT_FDCWD, to, ++ LOOKUP_PARENT | LOOKUP_OPEN | LOOKUP_CREATE, &dir_nd); ++ vxdprintk(VXD_CBIT(misc, 2), "do_path_lookup(new): %d", ret); ++ if (ret < 0) ++ goto retry; ++ ++ /* this puppy downs the dir inode mutex if successful */ ++ new_dentry = kern_path_create(AT_FDCWD, to, &dir_path, 0); ++ if (!new_dentry || IS_ERR(new_dentry)) { ++ path_put(&dir_nd.path); ++ vxdprintk(VXD_CBIT(misc, 2), ++ "kern_path_create(new) failed with %ld", ++ PTR_ERR(new_dentry)); ++ goto retry; ++ } ++ path_put(&dir_path); ++ vxdprintk(VXD_CBIT(misc, 2), ++ "kern_path_create(new): %p [" VS_Q("%.*s") ":%d]", ++ new_dentry, ++ new_dentry->d_name.len, new_dentry->d_name.name, ++ new_dentry->d_name.len); ++ ++ dir = dir_nd.path.dentry; ++ ++ ret = vfs_create(dir->d_inode, new_dentry, mode, &dir_nd); ++ vxdprintk(VXD_CBIT(misc, 2), ++ "vfs_create(new): %d", ret); ++ if (ret == -EEXIST) { ++ mutex_unlock(&dir->d_inode->i_mutex); ++ path_put(&dir_nd.path); ++ dput(new_dentry); ++ goto retry; ++ } ++ else if (ret < 0) ++ goto out_unlock_new; ++ ++ /* drop out early, ret passes ENOENT */ ++ ret = -ENOENT; ++ if ((redo = d_unhashed(old_dentry))) ++ 
goto out_unlock_new; ++ ++ path_get(&old_path); ++ /* this one cleans up the dentry/mnt in case of failure */ ++ old_file = dentry_open(old_dentry, old_path.mnt, ++ O_RDONLY, current_cred()); ++ vxdprintk(VXD_CBIT(misc, 2), ++ "dentry_open(old): %p", old_file); ++ if (IS_ERR(old_file)) { ++ ret = PTR_ERR(old_file); ++ goto out_unlock_new; ++ } ++ ++ dget(new_dentry); ++ mntget(old_path.mnt); ++ /* this one cleans up the dentry/mnt in case of failure */ ++ new_file = dentry_open(new_dentry, old_path.mnt, ++ O_WRONLY, current_cred()); ++ vxdprintk(VXD_CBIT(misc, 2), ++ "dentry_open(new): %p", new_file); ++ if (IS_ERR(new_file)) { ++ ret = PTR_ERR(new_file); ++ goto out_fput_old; ++ } ++ ++ size = i_size_read(old_file->f_dentry->d_inode); ++ ret = do_cow_splice(old_file, new_file, size); ++ vxdprintk(VXD_CBIT(misc, 2), "do_splice_direct: %d", ret); ++ if (ret < 0) { ++ goto out_fput_both; ++ } else if (ret < size) { ++ ret = -ENOSPC; ++ goto out_fput_both; ++ } else { ++ struct inode *old_inode = old_dentry->d_inode; ++ struct inode *new_inode = new_dentry->d_inode; ++ struct iattr attr = { ++ .ia_uid = old_inode->i_uid, ++ .ia_gid = old_inode->i_gid, ++ .ia_valid = ATTR_UID | ATTR_GID ++ }; ++ ++ setattr_copy(new_inode, &attr); ++ mark_inode_dirty(new_inode); ++ } ++ ++ mutex_lock(&old_dentry->d_inode->i_sb->s_vfs_rename_mutex); ++ ++ /* drop out late */ ++ ret = -ENOENT; ++ if ((redo = d_unhashed(old_dentry))) ++ goto out_unlock; ++ ++ vxdprintk(VXD_CBIT(misc, 2), ++ "vfs_rename: [" VS_Q("%*s") ":%d] -> [" VS_Q("%*s") ":%d]", ++ new_dentry->d_name.len, new_dentry->d_name.name, ++ new_dentry->d_name.len, ++ old_dentry->d_name.len, old_dentry->d_name.name, ++ old_dentry->d_name.len); ++ ret = vfs_rename(dir_nd.path.dentry->d_inode, new_dentry, ++ old_dentry->d_parent->d_inode, old_dentry); ++ vxdprintk(VXD_CBIT(misc, 2), "vfs_rename: %d", ret); ++ ++out_unlock: ++ mutex_unlock(&old_dentry->d_inode->i_sb->s_vfs_rename_mutex); ++ ++out_fput_both: ++ vxdprintk(VXD_CBIT(misc, 3), ++ "fput(new_file=%p[#%ld])", new_file, ++ atomic_long_read(&new_file->f_count)); ++ fput(new_file); ++ ++out_fput_old: ++ vxdprintk(VXD_CBIT(misc, 3), ++ "fput(old_file=%p[#%ld])", old_file, ++ atomic_long_read(&old_file->f_count)); ++ fput(old_file); ++ ++out_unlock_new: ++ mutex_unlock(&dir->d_inode->i_mutex); ++ if (!ret) ++ goto out_redo; ++ ++ /* error path cleanup */ ++ vfs_unlink(dir->d_inode, new_dentry); ++ ++out_redo: ++ if (!redo) ++ goto out_rel_both; ++ /* lookup dentry once again */ ++ /* old_nd.path is freed as old_path in out_rel_old */ ++ ret = do_path_lookup(AT_FDCWD, pathname, LOOKUP_FOLLOW, &old_nd); ++ if (ret) ++ goto out_rel_both; ++ ++ dput(new_dentry); ++ new_dentry = old_nd.path.dentry; ++ vxdprintk(VXD_CBIT(misc, 2), ++ "do_path_lookup(redo): %p [" VS_Q("%.*s") ":%d]", ++ new_dentry, ++ new_dentry->d_name.len, new_dentry->d_name.name, ++ new_dentry->d_name.len); ++ dget(new_dentry); ++ ++out_rel_both: ++ path_put(&dir_nd.path); ++out_rel_old: ++ path_put(&old_path); ++out_free_path: ++ kfree(path); ++out: ++ if (ret) { ++ dput(new_dentry); ++ new_dentry = ERR_PTR(ret); ++ } ++ vxdprintk(VXD_CBIT(misc, 3), ++ "cow_break_link returning with %p [r=%d]", ++ new_dentry, mnt_get_count(real_mount(old_nd.path.mnt))); ++ return new_dentry; ++} ++ ++#endif ++ ++int vx_info_mnt_namespace(struct mnt_namespace *ns, char *buffer) ++{ ++ struct path path; ++ struct vfsmount *vmnt; ++ char *pstr, *root; ++ int length = 0; ++ ++ pstr = kmalloc(PATH_MAX, GFP_KERNEL); ++ if (!pstr) ++ return 0; ++ ++ vmnt = 
&ns->root->mnt; ++ path.mnt = vmnt; ++ path.dentry = vmnt->mnt_root; ++ root = d_path(&path, pstr, PATH_MAX - 2); ++ length = sprintf(buffer + length, ++ "Namespace:\t%p [#%u]\n" ++ "RootPath:\t%s\n", ++ ns, atomic_read(&ns->count), ++ root); ++ kfree(pstr); ++ return length; ++} ++ + /* get the link contents into pagecache */ + static char *page_getlink(struct dentry * dentry, struct page **ppage) + { +@@ -3431,3 +3821,4 @@ EXPORT_SYMBOL(vfs_symlink); + EXPORT_SYMBOL(vfs_unlink); + EXPORT_SYMBOL(dentry_unhash); + EXPORT_SYMBOL(generic_readlink); ++EXPORT_SYMBOL(vx_info_mnt_namespace); +diff -NurpP --minimal linux-3.3.8/fs/namespace.c linux-3.3.8-vs2.3.3.4/fs/namespace.c +--- linux-3.3.8/fs/namespace.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/namespace.c 2012-02-24 17:38:42.000000000 +0100 +@@ -20,6 +20,11 @@ + #include /* get_fs_root et.al. */ + #include /* fsnotify_vfsmount_delete */ + #include ++#include ++#include ++#include ++#include ++#include + #include "pnode.h" + #include "internal.h" + +@@ -697,6 +702,10 @@ vfs_kern_mount(struct file_system_type * + if (!type) + return ERR_PTR(-ENODEV); + ++ if ((type->fs_flags & FS_BINARY_MOUNTDATA) && ++ !vx_capable(CAP_SYS_ADMIN, VXC_BINARY_MOUNT)) ++ return ERR_PTR(-EPERM); ++ + mnt = alloc_vfsmnt(name); + if (!mnt) + return ERR_PTR(-ENOMEM); +@@ -745,6 +754,7 @@ static struct mount *clone_mnt(struct mo + mnt->mnt.mnt_root = dget(root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; ++ mnt->mnt_tag = old->mnt_tag; + br_write_lock(vfsmount_lock); + list_add_tail(&mnt->mnt_instance, &sb->s_mounts); + br_write_unlock(vfsmount_lock); +@@ -1209,7 +1219,7 @@ SYSCALL_DEFINE2(umount, char __user *, n + goto dput_and_out; + + retval = -EPERM; +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_SECURE_MOUNT)) + goto dput_and_out; + + retval = do_umount(mnt, flags); +@@ -1235,7 +1245,7 @@ SYSCALL_DEFINE1(oldumount, char __user * + + static int mount_is_safe(struct path *path) + { +- if (capable(CAP_SYS_ADMIN)) ++ if (vx_capable(CAP_SYS_ADMIN, VXC_SECURE_MOUNT)) + return 0; + return -EPERM; + #ifdef notyet +@@ -1548,7 +1558,7 @@ static int do_change_type(struct path *p + int type; + int err = 0; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_NAMESPACE)) + return -EPERM; + + if (path->dentry != path->mnt->mnt_root) +@@ -1564,6 +1574,7 @@ static int do_change_type(struct path *p + if (err) + goto out_unlock; + } ++ // mnt->mnt_flags = mnt_flags; + + br_write_lock(vfsmount_lock); + for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) +@@ -1579,12 +1590,14 @@ static int do_change_type(struct path *p + * do loopback mount. + */ + static int do_loopback(struct path *path, char *old_name, +- int recurse) ++ tag_t tag, unsigned long flags, int mnt_flags) + { + LIST_HEAD(umount_list); + struct path old_path; + struct mount *mnt = NULL, *old; + int err = mount_is_safe(path); ++ int recurse = flags & MS_REC; ++ + if (err) + return err; + if (!old_name || !*old_name) +@@ -1652,13 +1665,13 @@ static int change_mount_flags(struct vfs + * on it - tough luck. 
+ */ + static int do_remount(struct path *path, int flags, int mnt_flags, +- void *data) ++ void *data, xid_t xid) + { + int err; + struct super_block *sb = path->mnt->mnt_sb; + struct mount *mnt = real_mount(path->mnt); + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_SECURE_REMOUNT)) + return -EPERM; + + if (!check_mnt(mnt)) +@@ -1707,7 +1720,7 @@ static int do_move_mount(struct path *pa + struct mount *p; + struct mount *old; + int err = 0; +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_SECURE_MOUNT)) + return -EPERM; + if (!old_name || !*old_name) + return -EINVAL; +@@ -1858,7 +1871,7 @@ static int do_new_mount(struct path *pat + return -EINVAL; + + /* we need capabilities... */ +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_SECURE_MOUNT)) + return -EPERM; + + mnt = do_kern_mount(type, flags, name, data); +@@ -2128,6 +2141,7 @@ long do_mount(char *dev_name, char *dir_ + struct path path; + int retval = 0; + int mnt_flags = 0; ++ tag_t tag = 0; + + /* Discard magic */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) +@@ -2155,6 +2169,12 @@ long do_mount(char *dev_name, char *dir_ + if (!(flags & MS_NOATIME)) + mnt_flags |= MNT_RELATIME; + ++ if (dx_parse_tag(data_page, &tag, 1, &mnt_flags, &flags)) { ++ /* FIXME: bind and re-mounts get the tag flag? */ ++ if (flags & (MS_BIND|MS_REMOUNT)) ++ flags |= MS_TAGID; ++ } ++ + /* Separate the per-mountpoint flags */ + if (flags & MS_NOSUID) + mnt_flags |= MNT_NOSUID; +@@ -2171,15 +2191,17 @@ long do_mount(char *dev_name, char *dir_ + if (flags & MS_RDONLY) + mnt_flags |= MNT_READONLY; + ++ if (!capable(CAP_SYS_ADMIN)) ++ mnt_flags |= MNT_NODEV; + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | + MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | + MS_STRICTATIME); + + if (flags & MS_REMOUNT) + retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, +- data_page); ++ data_page, tag); + else if (flags & MS_BIND) +- retval = do_loopback(&path, dev_name, flags & MS_REC); ++ retval = do_loopback(&path, dev_name, tag, flags, mnt_flags); + else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + retval = do_change_type(&path, flags); + else if (flags & MS_MOVE) +@@ -2282,6 +2304,7 @@ static struct mnt_namespace *dup_mnt_ns( + q = next_mnt(q, new); + } + up_write(&namespace_sem); ++ atomic_inc(&vs_global_mnt_ns); + + if (rootmnt) + mntput(rootmnt); +@@ -2478,9 +2501,10 @@ SYSCALL_DEFINE2(pivot_root, const char _ + error = -EINVAL; + new_mnt = real_mount(new.mnt); + root_mnt = real_mount(root.mnt); +- if (IS_MNT_SHARED(real_mount(old.mnt)) || ++ if ((IS_MNT_SHARED(real_mount(old.mnt)) || + IS_MNT_SHARED(new_mnt->mnt_parent) || +- IS_MNT_SHARED(root_mnt->mnt_parent)) ++ IS_MNT_SHARED(root_mnt->mnt_parent)) && ++ !vx_flags(VXF_STATE_SETUP, 0)) + goto out4; + if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) + goto out4; +@@ -2601,6 +2625,7 @@ void put_mnt_ns(struct mnt_namespace *ns + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + release_mounts(&umount_list); ++ atomic_dec(&vs_global_mnt_ns); + kfree(ns); + } + +diff -NurpP --minimal linux-3.3.8/fs/nfs/client.c linux-3.3.8-vs2.3.3.4/fs/nfs/client.c +--- linux-3.3.8/fs/nfs/client.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/nfs/client.c 2012-02-24 03:55:06.000000000 +0100 +@@ -784,6 +784,9 @@ static int nfs_init_server_rpcclient(str + if (server->flags & NFS_MOUNT_SOFT) + server->client->cl_softrtry = 1; + ++ server->client->cl_tag = 0; ++ if (server->flags & 
NFS_MOUNT_TAGGED) ++ server->client->cl_tag = 1; + return 0; + } + +@@ -958,6 +961,10 @@ static void nfs_server_set_fsinfo(struct + server->acdirmin = server->acdirmax = 0; + } + ++ /* FIXME: needs fsinfo ++ if (server->flags & NFS_MOUNT_TAGGED) ++ sb->s_flags |= MS_TAGGED; */ ++ + server->maxfilesize = fsinfo->maxfilesize; + + server->time_delta = fsinfo->time_delta; +diff -NurpP --minimal linux-3.3.8/fs/nfs/dir.c linux-3.3.8-vs2.3.3.4/fs/nfs/dir.c +--- linux-3.3.8/fs/nfs/dir.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/nfs/dir.c 2012-02-24 03:55:06.000000000 +0100 +@@ -35,6 +35,7 @@ + #include + #include + #include ++#include + + #include "delegation.h" + #include "iostat.h" +@@ -1311,6 +1312,7 @@ static struct dentry *nfs_lookup(struct + if (IS_ERR(res)) + goto out_unblock_sillyrename; + ++ dx_propagate_tag(nd, inode); + no_entry: + res = d_materialise_unique(dentry, inode); + if (res != NULL) { +diff -NurpP --minimal linux-3.3.8/fs/nfs/inode.c linux-3.3.8-vs2.3.3.4/fs/nfs/inode.c +--- linux-3.3.8/fs/nfs/inode.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/nfs/inode.c 2012-02-24 04:40:22.000000000 +0100 +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -274,6 +275,8 @@ nfs_fhget(struct super_block *sb, struct + if (inode->i_state & I_NEW) { + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long now = jiffies; ++ uid_t uid; ++ gid_t gid; + + /* We set i_ino for the few things that still rely on it, + * such as stat(2) */ +@@ -322,8 +325,8 @@ nfs_fhget(struct super_block *sb, struct + inode->i_version = 0; + inode->i_size = 0; + clear_nlink(inode); +- inode->i_uid = -2; +- inode->i_gid = -2; ++ uid = -2; ++ gid = -2; + inode->i_blocks = 0; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + +@@ -360,13 +363,13 @@ nfs_fhget(struct super_block *sb, struct + else if (nfs_server_capable(inode, NFS_CAP_NLINK)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + if (fattr->valid & NFS_ATTR_FATTR_OWNER) +- inode->i_uid = fattr->uid; ++ uid = fattr->uid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (fattr->valid & NFS_ATTR_FATTR_GROUP) +- inode->i_gid = fattr->gid; ++ gid = fattr->gid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS +@@ -379,6 +382,11 @@ nfs_fhget(struct super_block *sb, struct + */ + inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); + } ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, 0); ++ /* maybe fattr->xid someday */ ++ + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + nfsi->attrtimeo_timestamp = now; + nfsi->access_cache = RB_ROOT; +@@ -495,6 +503,8 @@ void nfs_setattr_update_inode(struct ino + inode->i_uid = attr->ia_uid; + if ((attr->ia_valid & ATTR_GID) != 0) + inode->i_gid = attr->ia_gid; ++ if ((attr->ia_valid & ATTR_TAG) && IS_TAGGED(inode)) ++ inode->i_tag = attr->ia_tag; + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + spin_unlock(&inode->i_lock); + } +@@ -944,6 +954,9 @@ static int nfs_check_inode_attributes(st + struct nfs_inode *nfsi = NFS_I(inode); + loff_t cur_size, new_isize; + unsigned long invalid = 0; ++ uid_t uid; ++ gid_t gid; ++ tag_t tag; + + + /* Has the inode gone and changed behind our back? 
*/ +@@ -967,13 +980,18 @@ static int nfs_check_inode_attributes(st + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } + ++ uid = INOTAG_UID(DX_TAG(inode), fattr->uid, fattr->gid); ++ gid = INOTAG_GID(DX_TAG(inode), fattr->uid, fattr->gid); ++ tag = INOTAG_TAG(DX_TAG(inode), fattr->uid, fattr->gid, 0); ++ + /* Have any file permissions changed? */ + if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; +- if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) ++ if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && uid != fattr->uid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; +- if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) ++ if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && gid != fattr->gid) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; ++ /* maybe check for tag too? */ + + /* Has the link count changed? */ + if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) +@@ -1210,6 +1228,9 @@ static int nfs_update_inode(struct inode + unsigned long invalid = 0; + unsigned long now = jiffies; + unsigned long save_cache_validity; ++ uid_t uid; ++ gid_t gid; ++ tag_t tag; + + dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", + __func__, inode->i_sb->s_id, inode->i_ino, +@@ -1317,6 +1338,9 @@ static int nfs_update_inode(struct inode + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + ++ uid = TAGINO_UID(DX_TAG(inode), inode->i_uid, inode->i_tag); ++ gid = TAGINO_GID(DX_TAG(inode), inode->i_gid, inode->i_tag); ++ tag = inode->i_tag; + + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); +@@ -1338,9 +1362,9 @@ static int nfs_update_inode(struct inode + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { +- if (inode->i_uid != fattr->uid) { ++ if (uid != fattr->uid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; +- inode->i_uid = fattr->uid; ++ uid = fattr->uid; + } + } else if (server->caps & NFS_CAP_OWNER) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +@@ -1349,9 +1373,9 @@ static int nfs_update_inode(struct inode + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { +- if (inode->i_gid != fattr->gid) { ++ if (gid != fattr->gid) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; +- inode->i_gid = fattr->gid; ++ gid = fattr->gid; + } + } else if (server->caps & NFS_CAP_OWNER_GROUP) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +@@ -1359,6 +1383,10 @@ static int nfs_update_inode(struct inode + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, tag); ++ + if (fattr->valid & NFS_ATTR_FATTR_NLINK) { + if (inode->i_nlink != fattr->nlink) { + invalid |= NFS_INO_INVALID_ATTR; +diff -NurpP --minimal linux-3.3.8/fs/nfs/nfs3xdr.c linux-3.3.8-vs2.3.3.4/fs/nfs/nfs3xdr.c +--- linux-3.3.8/fs/nfs/nfs3xdr.c 2011-03-15 18:07:32.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/nfs/nfs3xdr.c 2012-02-24 03:55:06.000000000 +0100 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + #include "internal.h" + + #define NFSDBG_FACILITY NFSDBG_XDR +@@ -562,7 +563,8 @@ static __be32 *xdr_decode_nfstime3(__be3 + 
* set_mtime mtime; + * }; + */ +-static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) ++static void encode_sattr3(struct xdr_stream *xdr, ++ const struct iattr *attr, int tag) + { + u32 nbytes; + __be32 *p; +@@ -594,15 +596,19 @@ static void encode_sattr3(struct xdr_str + } else + *p++ = xdr_zero; + +- if (attr->ia_valid & ATTR_UID) { ++ if (attr->ia_valid & ATTR_UID || ++ (tag && (attr->ia_valid & ATTR_TAG))) { + *p++ = xdr_one; +- *p++ = cpu_to_be32(attr->ia_uid); ++ *p++ = cpu_to_be32(TAGINO_UID(tag, ++ attr->ia_uid, attr->ia_tag)); + } else + *p++ = xdr_zero; + +- if (attr->ia_valid & ATTR_GID) { ++ if (attr->ia_valid & ATTR_GID || ++ (tag && (attr->ia_valid & ATTR_TAG))) { + *p++ = xdr_one; +- *p++ = cpu_to_be32(attr->ia_gid); ++ *p++ = cpu_to_be32(TAGINO_GID(tag, ++ attr->ia_gid, attr->ia_tag)); + } else + *p++ = xdr_zero; + +@@ -878,7 +884,7 @@ static void nfs3_xdr_enc_setattr3args(st + const struct nfs3_sattrargs *args) + { + encode_nfs_fh3(xdr, args->fh); +- encode_sattr3(xdr, args->sattr); ++ encode_sattr3(xdr, args->sattr, req->rq_task->tk_client->cl_tag); + encode_sattrguard3(xdr, args); + } + +@@ -1028,13 +1034,13 @@ static void nfs3_xdr_enc_write3args(stru + * }; + */ + static void encode_createhow3(struct xdr_stream *xdr, +- const struct nfs3_createargs *args) ++ const struct nfs3_createargs *args, int tag) + { + encode_uint32(xdr, args->createmode); + switch (args->createmode) { + case NFS3_CREATE_UNCHECKED: + case NFS3_CREATE_GUARDED: +- encode_sattr3(xdr, args->sattr); ++ encode_sattr3(xdr, args->sattr, tag); + break; + case NFS3_CREATE_EXCLUSIVE: + encode_createverf3(xdr, args->verifier); +@@ -1049,7 +1055,7 @@ static void nfs3_xdr_enc_create3args(str + const struct nfs3_createargs *args) + { + encode_diropargs3(xdr, args->fh, args->name, args->len); +- encode_createhow3(xdr, args); ++ encode_createhow3(xdr, args, req->rq_task->tk_client->cl_tag); + } + + /* +@@ -1065,7 +1071,7 @@ static void nfs3_xdr_enc_mkdir3args(stru + const struct nfs3_mkdirargs *args) + { + encode_diropargs3(xdr, args->fh, args->name, args->len); +- encode_sattr3(xdr, args->sattr); ++ encode_sattr3(xdr, args->sattr, req->rq_task->tk_client->cl_tag); + } + + /* +@@ -1082,9 +1088,9 @@ static void nfs3_xdr_enc_mkdir3args(stru + * }; + */ + static void encode_symlinkdata3(struct xdr_stream *xdr, +- const struct nfs3_symlinkargs *args) ++ const struct nfs3_symlinkargs *args, int tag) + { +- encode_sattr3(xdr, args->sattr); ++ encode_sattr3(xdr, args->sattr, tag); + encode_nfspath3(xdr, args->pages, args->pathlen); + } + +@@ -1093,7 +1099,7 @@ static void nfs3_xdr_enc_symlink3args(st + const struct nfs3_symlinkargs *args) + { + encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); +- encode_symlinkdata3(xdr, args); ++ encode_symlinkdata3(xdr, args, req->rq_task->tk_client->cl_tag); + } + + /* +@@ -1121,24 +1127,24 @@ static void nfs3_xdr_enc_symlink3args(st + * }; + */ + static void encode_devicedata3(struct xdr_stream *xdr, +- const struct nfs3_mknodargs *args) ++ const struct nfs3_mknodargs *args, int tag) + { +- encode_sattr3(xdr, args->sattr); ++ encode_sattr3(xdr, args->sattr, tag); + encode_specdata3(xdr, args->rdev); + } + + static void encode_mknoddata3(struct xdr_stream *xdr, +- const struct nfs3_mknodargs *args) ++ const struct nfs3_mknodargs *args, int tag) + { + encode_ftype3(xdr, args->type); + switch (args->type) { + case NF3CHR: + case NF3BLK: +- encode_devicedata3(xdr, args); ++ encode_devicedata3(xdr, args, tag); + break; + case NF3SOCK: + case 
NF3FIFO: +- encode_sattr3(xdr, args->sattr); ++ encode_sattr3(xdr, args->sattr, tag); + break; + case NF3REG: + case NF3DIR: +@@ -1153,7 +1159,7 @@ static void nfs3_xdr_enc_mknod3args(stru + const struct nfs3_mknodargs *args) + { + encode_diropargs3(xdr, args->fh, args->name, args->len); +- encode_mknoddata3(xdr, args); ++ encode_mknoddata3(xdr, args, req->rq_task->tk_client->cl_tag); + } + + /* +diff -NurpP --minimal linux-3.3.8/fs/nfs/super.c linux-3.3.8-vs2.3.3.4/fs/nfs/super.c +--- linux-3.3.8/fs/nfs/super.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/nfs/super.c 2012-05-09 04:08:08.000000000 +0200 +@@ -52,6 +52,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -86,6 +87,7 @@ enum { + Opt_sharecache, Opt_nosharecache, + Opt_resvport, Opt_noresvport, + Opt_fscache, Opt_nofscache, ++ Opt_tag, Opt_notag, + + /* Mount options that take integer arguments */ + Opt_port, +@@ -99,6 +101,7 @@ enum { + Opt_mountvers, + Opt_nfsvers, + Opt_minorversion, ++ Opt_tagid, + + /* Mount options that take string arguments */ + Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, +@@ -179,6 +182,10 @@ static const match_table_t nfs_mount_opt + { Opt_fscache_uniq, "fsc=%s" }, + { Opt_local_lock, "local_lock=%s" }, + ++ { Opt_tag, "tag" }, ++ { Opt_notag, "notag" }, ++ { Opt_tagid, "tagid=%u" }, ++ + { Opt_err, NULL } + }; + +@@ -649,6 +656,7 @@ static void nfs_show_mount_options(struc + { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, + { NFS_MOUNT_UNSHARED, ",nosharecache", "" }, + { NFS_MOUNT_NORESVPORT, ",noresvport", "" }, ++ { NFS_MOUNT_TAGGED, ",tag", "" }, + { 0, NULL, NULL } + }; + const struct proc_nfs_info *nfs_infop; +@@ -1216,6 +1224,14 @@ static int nfs_parse_mount_options(char + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ mnt->flags |= NFS_MOUNT_TAGGED; ++ break; ++ case Opt_notag: ++ mnt->flags &= ~NFS_MOUNT_TAGGED; ++ break; ++#endif + + /* + * options that take numeric values +@@ -1322,6 +1338,12 @@ static int nfs_parse_mount_options(char + goto out_invalid_value; + mnt->minorversion = option; + break; ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ nfs_data.flags |= NFS_MOUNT_TAGGED; ++ break; ++#endif + + /* + * options that take text values +diff -NurpP --minimal linux-3.3.8/fs/nfsd/auth.c linux-3.3.8-vs2.3.3.4/fs/nfsd/auth.c +--- linux-3.3.8/fs/nfsd/auth.c 2010-02-25 11:52:05.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/nfsd/auth.c 2012-02-24 03:55:06.000000000 +0100 +@@ -1,6 +1,7 @@ + /* Copyright (C) 1995, 1996 Olaf Kirch */ + + #include ++#include + #include "nfsd.h" + #include "auth.h" + +@@ -36,6 +37,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, + + new->fsuid = rqstp->rq_cred.cr_uid; + new->fsgid = rqstp->rq_cred.cr_gid; ++ /* FIXME: this desperately needs a tag :) ++ new->xid = (xid_t)INOTAG_TAG(DX_TAG_NFSD, cred.cr_uid, cred.cr_gid, 0); ++ */ + + rqgi = rqstp->rq_cred.cr_group_info; + +diff -NurpP --minimal linux-3.3.8/fs/nfsd/nfs3xdr.c linux-3.3.8-vs2.3.3.4/fs/nfsd/nfs3xdr.c +--- linux-3.3.8/fs/nfsd/nfs3xdr.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/nfsd/nfs3xdr.c 2012-04-30 19:34:37.000000000 +0200 +@@ -7,6 +7,7 @@ + */ + + #include ++#include + #include "xdr3.h" + #include "auth.h" + +@@ -95,6 +96,8 @@ static __be32 * + decode_sattr3(__be32 *p, struct iattr *iap) + { + u32 tmp; ++ uid_t uid = 0; ++ gid_t gid = 0; + + iap->ia_valid = 0; + +@@ -104,12 +107,15 @@ decode_sattr3(__be32 *p, struct iattr *i + } + if (*p++) { + 
iap->ia_valid |= ATTR_UID; +- iap->ia_uid = ntohl(*p++); ++ uid = ntohl(*p++); + } + if (*p++) { + iap->ia_valid |= ATTR_GID; +- iap->ia_gid = ntohl(*p++); ++ gid = ntohl(*p++); + } ++ iap->ia_uid = INOTAG_UID(DX_TAG_NFSD, uid, gid); ++ iap->ia_gid = INOTAG_GID(DX_TAG_NFSD, uid, gid); ++ iap->ia_tag = INOTAG_TAG(DX_TAG_NFSD, uid, gid, 0); + if (*p++) { + u64 newsize; + +@@ -165,8 +171,12 @@ encode_fattr3(struct svc_rqst *rqstp, __ + *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); + *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) stat->nlink); +- *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); +- *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); ++ *p++ = htonl((u32) nfsd_ruid(rqstp, ++ TAGINO_UID(0 /* FIXME: DX_TAG(dentry->d_inode) */, ++ stat->uid, stat->tag))); ++ *p++ = htonl((u32) nfsd_rgid(rqstp, ++ TAGINO_GID(0 /* FIXME: DX_TAG(dentry->d_inode) */, ++ stat->gid, stat->tag))); + if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { + p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); + } else { +diff -NurpP --minimal linux-3.3.8/fs/nfsd/nfs4xdr.c linux-3.3.8-vs2.3.3.4/fs/nfsd/nfs4xdr.c +--- linux-3.3.8/fs/nfsd/nfs4xdr.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/nfsd/nfs4xdr.c 2012-04-30 19:34:37.000000000 +0200 +@@ -46,6 +46,7 @@ + #include + #include + #include ++#include + + #include "idmap.h" + #include "acl.h" +@@ -2327,14 +2328,18 @@ out_acl: + WRITE32(stat.nlink); + } + if (bmval1 & FATTR4_WORD1_OWNER) { +- status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen); ++ status = nfsd4_encode_user(rqstp, ++ TAGINO_UID(DX_TAG(dentry->d_inode), ++ stat.uid, stat.tag), &p, &buflen); + if (status == nfserr_resource) + goto out_resource; + if (status) + goto out; + } + if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { +- status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen); ++ status = nfsd4_encode_group(rqstp, ++ TAGINO_GID(DX_TAG(dentry->d_inode), ++ stat.gid, stat.tag), &p, &buflen); + if (status == nfserr_resource) + goto out_resource; + if (status) +diff -NurpP --minimal linux-3.3.8/fs/nfsd/nfsxdr.c linux-3.3.8-vs2.3.3.4/fs/nfsd/nfsxdr.c +--- linux-3.3.8/fs/nfsd/nfsxdr.c 2011-05-22 16:17:53.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/nfsd/nfsxdr.c 2012-02-24 03:55:06.000000000 +0100 +@@ -6,6 +6,7 @@ + + #include "xdr.h" + #include "auth.h" ++#include + + #define NFSDDBG_FACILITY NFSDDBG_XDR + +@@ -88,6 +89,8 @@ static __be32 * + decode_sattr(__be32 *p, struct iattr *iap) + { + u32 tmp, tmp1; ++ uid_t uid = 0; ++ gid_t gid = 0; + + iap->ia_valid = 0; + +@@ -101,12 +104,15 @@ decode_sattr(__be32 *p, struct iattr *ia + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_UID; +- iap->ia_uid = tmp; ++ uid = tmp; + } + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_GID; +- iap->ia_gid = tmp; ++ gid = tmp; + } ++ iap->ia_uid = INOTAG_UID(DX_TAG_NFSD, uid, gid); ++ iap->ia_gid = INOTAG_GID(DX_TAG_NFSD, uid, gid); ++ iap->ia_tag = INOTAG_TAG(DX_TAG_NFSD, uid, gid, 0); + if ((tmp = ntohl(*p++)) != (u32)-1) { + iap->ia_valid |= ATTR_SIZE; + iap->ia_size = tmp; +@@ -151,8 +157,10 @@ encode_fattr(struct svc_rqst *rqstp, __b + *p++ = htonl(nfs_ftypes[type >> 12]); + *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) stat->nlink); +- *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); +- *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); ++ *p++ = htonl((u32) nfsd_ruid(rqstp, ++ TAGINO_UID(DX_TAG(dentry->d_inode), stat->uid, stat->tag))); ++ *p++ = htonl((u32) nfsd_rgid(rqstp, ++ TAGINO_GID(DX_TAG(dentry->d_inode), stat->gid, 
stat->tag))); + + if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { + *p++ = htonl(NFS_MAXPATHLEN); +diff -NurpP --minimal linux-3.3.8/fs/ocfs2/dlmglue.c linux-3.3.8-vs2.3.3.4/fs/ocfs2/dlmglue.c +--- linux-3.3.8/fs/ocfs2/dlmglue.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ocfs2/dlmglue.c 2012-02-24 03:55:06.000000000 +0100 +@@ -2047,6 +2047,7 @@ static void __ocfs2_stuff_meta_lvb(struc + lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); + lvb->lvb_iuid = cpu_to_be32(inode->i_uid); + lvb->lvb_igid = cpu_to_be32(inode->i_gid); ++ lvb->lvb_itag = cpu_to_be16(inode->i_tag); + lvb->lvb_imode = cpu_to_be16(inode->i_mode); + lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); + lvb->lvb_iatime_packed = +@@ -2097,6 +2098,7 @@ static void ocfs2_refresh_inode_from_lvb + + inode->i_uid = be32_to_cpu(lvb->lvb_iuid); + inode->i_gid = be32_to_cpu(lvb->lvb_igid); ++ inode->i_tag = be16_to_cpu(lvb->lvb_itag); + inode->i_mode = be16_to_cpu(lvb->lvb_imode); + set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); + ocfs2_unpack_timespec(&inode->i_atime, +diff -NurpP --minimal linux-3.3.8/fs/ocfs2/dlmglue.h linux-3.3.8-vs2.3.3.4/fs/ocfs2/dlmglue.h +--- linux-3.3.8/fs/ocfs2/dlmglue.h 2010-10-21 13:07:50.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/ocfs2/dlmglue.h 2012-02-24 03:55:06.000000000 +0100 +@@ -46,7 +46,8 @@ struct ocfs2_meta_lvb { + __be16 lvb_inlink; + __be32 lvb_iattr; + __be32 lvb_igeneration; +- __be32 lvb_reserved2; ++ __be16 lvb_itag; ++ __be16 lvb_reserved2; + }; + + #define OCFS2_QINFO_LVB_VERSION 1 +diff -NurpP --minimal linux-3.3.8/fs/ocfs2/file.c linux-3.3.8-vs2.3.3.4/fs/ocfs2/file.c +--- linux-3.3.8/fs/ocfs2/file.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ocfs2/file.c 2012-02-24 03:55:06.000000000 +0100 +@@ -1123,7 +1123,7 @@ int ocfs2_setattr(struct dentry *dentry, + attr->ia_valid &= ~ATTR_SIZE; + + #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ +- | ATTR_GID | ATTR_UID | ATTR_MODE) ++ | ATTR_GID | ATTR_UID | ATTR_TAG | ATTR_MODE) + if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) + return 0; + +diff -NurpP --minimal linux-3.3.8/fs/ocfs2/inode.c linux-3.3.8-vs2.3.3.4/fs/ocfs2/inode.c +--- linux-3.3.8/fs/ocfs2/inode.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ocfs2/inode.c 2012-02-24 03:55:06.000000000 +0100 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + #include + +@@ -78,11 +79,13 @@ void ocfs2_set_inode_flags(struct inode + { + unsigned int flags = OCFS2_I(inode)->ip_attr; + +- inode->i_flags &= ~(S_IMMUTABLE | ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | + S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); + + if (flags & OCFS2_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; ++ if (flags & OCFS2_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; + + if (flags & OCFS2_SYNC_FL) + inode->i_flags |= S_SYNC; +@@ -92,25 +95,44 @@ void ocfs2_set_inode_flags(struct inode + inode->i_flags |= S_NOATIME; + if (flags & OCFS2_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & OCFS2_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & OCFS2_COW_FL) ++ inode->i_vflags |= V_COW; + } + + /* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */ + void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi) + { + unsigned int flags = oi->vfs_inode.i_flags; ++ unsigned int vflags = oi->vfs_inode.i_vflags; ++ ++ oi->ip_attr &= ~(OCFS2_SYNC_FL | OCFS2_APPEND_FL | ++ OCFS2_IMMUTABLE_FL | OCFS2_IXUNLINK_FL | ++ OCFS2_NOATIME_FL | 
OCFS2_DIRSYNC_FL | ++ OCFS2_BARRIER_FL | OCFS2_COW_FL); ++ ++ if (flags & S_IMMUTABLE) ++ oi->ip_attr |= OCFS2_IMMUTABLE_FL; ++ if (flags & S_IXUNLINK) ++ oi->ip_attr |= OCFS2_IXUNLINK_FL; + +- oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL| +- OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL); + if (flags & S_SYNC) + oi->ip_attr |= OCFS2_SYNC_FL; + if (flags & S_APPEND) + oi->ip_attr |= OCFS2_APPEND_FL; +- if (flags & S_IMMUTABLE) +- oi->ip_attr |= OCFS2_IMMUTABLE_FL; + if (flags & S_NOATIME) + oi->ip_attr |= OCFS2_NOATIME_FL; + if (flags & S_DIRSYNC) + oi->ip_attr |= OCFS2_DIRSYNC_FL; ++ ++ if (vflags & V_BARRIER) ++ oi->ip_attr |= OCFS2_BARRIER_FL; ++ if (vflags & V_COW) ++ oi->ip_attr |= OCFS2_COW_FL; + } + + struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) +@@ -241,6 +263,8 @@ void ocfs2_populate_inode(struct inode * + struct super_block *sb; + struct ocfs2_super *osb; + int use_plocks = 1; ++ uid_t uid; ++ gid_t gid; + + sb = inode->i_sb; + osb = OCFS2_SB(sb); +@@ -269,8 +293,12 @@ void ocfs2_populate_inode(struct inode * + inode->i_generation = le32_to_cpu(fe->i_generation); + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + inode->i_mode = le16_to_cpu(fe->i_mode); +- inode->i_uid = le32_to_cpu(fe->i_uid); +- inode->i_gid = le32_to_cpu(fe->i_gid); ++ uid = le32_to_cpu(fe->i_uid); ++ gid = le32_to_cpu(fe->i_gid); ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, ++ /* le16_to_cpu(raw_inode->i_raw_tag)i */ 0); + + /* Fast symlinks will have i_size but no allocated clusters. */ + if (S_ISLNK(inode->i_mode) && !fe->i_clusters) +diff -NurpP --minimal linux-3.3.8/fs/ocfs2/inode.h linux-3.3.8-vs2.3.3.4/fs/ocfs2/inode.h +--- linux-3.3.8/fs/ocfs2/inode.h 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ocfs2/inode.h 2012-02-24 03:55:06.000000000 +0100 +@@ -154,6 +154,7 @@ struct buffer_head *ocfs2_bread(struct i + + void ocfs2_set_inode_flags(struct inode *inode); + void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); ++int ocfs2_sync_flags(struct inode *inode, int, int); + + static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) + { +diff -NurpP --minimal linux-3.3.8/fs/ocfs2/ioctl.c linux-3.3.8-vs2.3.3.4/fs/ocfs2/ioctl.c +--- linux-3.3.8/fs/ocfs2/ioctl.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ocfs2/ioctl.c 2012-02-24 03:55:06.000000000 +0100 +@@ -78,7 +78,41 @@ static int ocfs2_get_inode_attr(struct i + return status; + } + +-static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, ++int ocfs2_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); ++ struct buffer_head *bh = NULL; ++ handle_t *handle = NULL; ++ int status; ++ ++ status = ocfs2_inode_lock(inode, &bh, 1); ++ if (status < 0) { ++ mlog_errno(status); ++ return status; ++ } ++ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); ++ if (IS_ERR(handle)) { ++ status = PTR_ERR(handle); ++ mlog_errno(status); ++ goto bail_unlock; ++ } ++ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ ocfs2_get_inode_flags(OCFS2_I(inode)); ++ ++ status = ocfs2_mark_inode_dirty(handle, inode, bh); ++ if (status < 0) ++ mlog_errno(status); ++ ++ ocfs2_commit_trans(osb, handle); ++bail_unlock: ++ ocfs2_inode_unlock(inode, 1); ++ brelse(bh); ++ return status; ++} ++ ++int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, + unsigned mask) + { + struct 
ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode); +@@ -103,6 +137,11 @@ static int ocfs2_set_inode_attr(struct i + if (!S_ISDIR(inode->i_mode)) + flags &= ~OCFS2_DIRSYNC_FL; + ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ goto bail_unlock; ++ } ++ + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); +@@ -881,6 +920,7 @@ bail: + return status; + } + ++ + long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_path.dentry->d_inode; +diff -NurpP --minimal linux-3.3.8/fs/ocfs2/namei.c linux-3.3.8-vs2.3.3.4/fs/ocfs2/namei.c +--- linux-3.3.8/fs/ocfs2/namei.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ocfs2/namei.c 2012-03-19 20:52:10.000000000 +0100 +@@ -41,6 +41,7 @@ + #include + #include + #include ++#include + + #include + +@@ -475,6 +476,7 @@ static int __ocfs2_mknod_locked(struct i + struct ocfs2_dinode *fe = NULL; + struct ocfs2_extent_list *fel; + u16 feat; ++ tag_t tag; + + *new_fe_bh = NULL; + +@@ -512,8 +514,11 @@ static int __ocfs2_mknod_locked(struct i + fe->i_suballoc_loc = cpu_to_le64(suballoc_loc); + fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); + fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); +- fe->i_uid = cpu_to_le32(inode->i_uid); +- fe->i_gid = cpu_to_le32(inode->i_gid); ++ ++ tag = dx_current_fstag(osb->sb); ++ fe->i_uid = cpu_to_le32(TAGINO_UID(DX_TAG(inode), inode->i_uid, tag)); ++ fe->i_gid = cpu_to_le32(TAGINO_GID(DX_TAG(inode), inode->i_gid, tag)); ++ inode->i_tag = tag; + fe->i_mode = cpu_to_le16(inode->i_mode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); +diff -NurpP --minimal linux-3.3.8/fs/ocfs2/ocfs2.h linux-3.3.8-vs2.3.3.4/fs/ocfs2/ocfs2.h +--- linux-3.3.8/fs/ocfs2/ocfs2.h 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ocfs2/ocfs2.h 2012-02-24 03:55:06.000000000 +0100 +@@ -272,6 +272,7 @@ enum ocfs2_mount_options + writes */ + OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ + OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ ++ OCFS2_MOUNT_TAGGED = 1 << 15, /* use tagging */ + }; + + #define OCFS2_OSB_SOFT_RO 0x0001 +diff -NurpP --minimal linux-3.3.8/fs/ocfs2/ocfs2_fs.h linux-3.3.8-vs2.3.3.4/fs/ocfs2/ocfs2_fs.h +--- linux-3.3.8/fs/ocfs2/ocfs2_fs.h 2011-05-22 16:17:53.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/ocfs2/ocfs2_fs.h 2012-02-24 03:55:06.000000000 +0100 +@@ -266,6 +266,11 @@ + #define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ + #define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ + ++#define OCFS2_IXUNLINK_FL FS_IXUNLINK_FL /* Immutable invert on unlink */ ++ ++#define OCFS2_BARRIER_FL FS_BARRIER_FL /* Barrier for chroot() */ ++#define OCFS2_COW_FL FS_COW_FL /* Copy on Write marker */ ++ + #define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ + #define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ + +diff -NurpP --minimal linux-3.3.8/fs/ocfs2/super.c linux-3.3.8-vs2.3.3.4/fs/ocfs2/super.c +--- linux-3.3.8/fs/ocfs2/super.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/ocfs2/super.c 2012-02-24 03:55:06.000000000 +0100 +@@ -185,6 +185,7 @@ enum { + Opt_coherency_full, + Opt_resv_level, + Opt_dir_resv_level, ++ Opt_tag, Opt_notag, Opt_tagid, + Opt_err, + }; + +@@ -216,6 +217,9 @@ static const match_table_t tokens = { + {Opt_coherency_full, "coherency=full"}, + {Opt_resv_level, 
"resv_level=%u"}, + {Opt_dir_resv_level, "dir_resv_level=%u"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, ++ {Opt_tagid, "tagid=%u"}, + {Opt_err, NULL} + }; + +@@ -662,6 +666,13 @@ static int ocfs2_remount(struct super_bl + goto out; + } + ++ if ((osb->s_mount_opt & OCFS2_MOUNT_TAGGED) != ++ (parsed_options.mount_opt & OCFS2_MOUNT_TAGGED)) { ++ ret = -EINVAL; ++ mlog(ML_ERROR, "Cannot change tagging on remount\n"); ++ goto out; ++ } ++ + /* We're going to/from readonly mode. */ + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + /* Disable quota accounting before remounting RO */ +@@ -1177,6 +1188,9 @@ static int ocfs2_fill_super(struct super + + ocfs2_complete_mount_recovery(osb); + ++ if (osb->s_mount_opt & OCFS2_MOUNT_TAGGED) ++ sb->s_flags |= MS_TAGGED; ++ + if (ocfs2_mount_local(osb)) + snprintf(nodestr, sizeof(nodestr), "local"); + else +@@ -1506,6 +1520,20 @@ static int ocfs2_parse_options(struct su + option < OCFS2_MAX_RESV_LEVEL) + mopt->dir_resv_level = option; + break; ++#ifndef CONFIG_TAGGING_NONE ++ case Opt_tag: ++ mopt->mount_opt |= OCFS2_MOUNT_TAGGED; ++ break; ++ case Opt_notag: ++ mopt->mount_opt &= ~OCFS2_MOUNT_TAGGED; ++ break; ++#endif ++#ifdef CONFIG_PROPAGATE ++ case Opt_tagid: ++ /* use args[0] */ ++ mopt->mount_opt |= OCFS2_MOUNT_TAGGED; ++ break; ++#endif + default: + mlog(ML_ERROR, + "Unrecognized mount option \"%s\" " +diff -NurpP --minimal linux-3.3.8/fs/open.c linux-3.3.8-vs2.3.3.4/fs/open.c +--- linux-3.3.8/fs/open.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/open.c 2012-02-24 03:55:06.000000000 +0100 +@@ -30,6 +30,11 @@ + #include + #include + #include ++#include ++#include ++#include ++#include ++#include + + #include "internal.h" + +@@ -74,6 +79,12 @@ static long do_sys_truncate(const char _ + error = user_path(pathname, &path); + if (error) + goto out; ++ ++#ifdef CONFIG_VSERVER_COWBL ++ error = cow_check_and_break(&path); ++ if (error) ++ goto dput_and_out; ++#endif + inode = path.dentry->d_inode; + + /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ +@@ -489,6 +500,10 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, cons + + error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); + if (!error) { ++#ifdef CONFIG_VSERVER_COWBL ++ error = cow_check_and_break(&path); ++ if (!error) ++#endif + error = chmod_common(&path, mode); + path_put(&path); + } +@@ -509,11 +524,11 @@ static int chown_common(struct path *pat + newattrs.ia_valid = ATTR_CTIME; + if (user != (uid_t) -1) { + newattrs.ia_valid |= ATTR_UID; +- newattrs.ia_uid = user; ++ newattrs.ia_uid = dx_map_uid(user); + } + if (group != (gid_t) -1) { + newattrs.ia_valid |= ATTR_GID; +- newattrs.ia_gid = group; ++ newattrs.ia_gid = dx_map_gid(group); + } + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= +@@ -538,6 +553,10 @@ SYSCALL_DEFINE3(chown, const char __user + error = mnt_want_write(path.mnt); + if (error) + goto out_release; ++#ifdef CONFIG_VSERVER_COWBL ++ error = cow_check_and_break(&path); ++ if (!error) ++#endif + error = chown_common(&path, user, group); + mnt_drop_write(path.mnt); + out_release: +@@ -565,6 +584,10 @@ SYSCALL_DEFINE5(fchownat, int, dfd, cons + error = mnt_want_write(path.mnt); + if (error) + goto out_release; ++#ifdef CONFIG_VSERVER_COWBL ++ error = cow_check_and_break(&path); ++ if (!error) ++#endif + error = chown_common(&path, user, group); + mnt_drop_write(path.mnt); + out_release: +@@ -584,6 +607,10 @@ SYSCALL_DEFINE3(lchown, const char __use + error = mnt_want_write(path.mnt); + if (error) + goto out_release; ++#ifdef 
CONFIG_VSERVER_COWBL ++ error = cow_check_and_break(&path); ++ if (!error) ++#endif + error = chown_common(&path, user, group); + mnt_drop_write(path.mnt); + out_release: +@@ -839,6 +866,7 @@ static void __put_unused_fd(struct files + __FD_CLR(fd, fdt->open_fds); + if (fd < files->next_fd) + files->next_fd = fd; ++ vx_openfd_dec(fd); + } + + void put_unused_fd(unsigned int fd) +diff -NurpP --minimal linux-3.3.8/fs/proc/array.c linux-3.3.8-vs2.3.3.4/fs/proc/array.c +--- linux-3.3.8/fs/proc/array.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/proc/array.c 2012-02-24 03:55:06.000000000 +0100 +@@ -81,6 +81,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -170,6 +172,9 @@ static inline void task_state(struct seq + rcu_read_lock(); + ppid = pid_alive(p) ? + task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; ++ if (unlikely(vx_current_initpid(p->pid))) ++ ppid = 0; ++ + tpid = 0; + if (pid_alive(p)) { + struct task_struct *tracer = ptrace_parent(p); +@@ -287,7 +292,7 @@ static inline void task_sig(struct seq_f + } + + static void render_cap_t(struct seq_file *m, const char *header, +- kernel_cap_t *a) ++ struct vx_info *vxi, kernel_cap_t *a) + { + unsigned __capi; + +@@ -312,10 +317,11 @@ static inline void task_cap(struct seq_f + cap_bset = cred->cap_bset; + rcu_read_unlock(); + +- render_cap_t(m, "CapInh:\t", &cap_inheritable); +- render_cap_t(m, "CapPrm:\t", &cap_permitted); +- render_cap_t(m, "CapEff:\t", &cap_effective); +- render_cap_t(m, "CapBnd:\t", &cap_bset); ++ /* FIXME: maybe move the p->vx_info masking to __task_cred() ? */ ++ render_cap_t(m, "CapInh:\t", p->vx_info, &cap_inheritable); ++ render_cap_t(m, "CapPrm:\t", p->vx_info, &cap_permitted); ++ render_cap_t(m, "CapEff:\t", p->vx_info, &cap_effective); ++ render_cap_t(m, "CapBnd:\t", p->vx_info, &cap_bset); + } + + static inline void task_context_switch_counts(struct seq_file *m, +@@ -337,6 +343,42 @@ static void task_cpus_allowed(struct seq + seq_putc(m, '\n'); + } + ++int proc_pid_nsproxy(struct seq_file *m, struct pid_namespace *ns, ++ struct pid *pid, struct task_struct *task) ++{ ++ seq_printf(m, "Proxy:\t%p(%c)\n" ++ "Count:\t%u\n" ++ "uts:\t%p(%c)\n" ++ "ipc:\t%p(%c)\n" ++ "mnt:\t%p(%c)\n" ++ "pid:\t%p(%c)\n" ++ "net:\t%p(%c)\n", ++ task->nsproxy, ++ (task->nsproxy == init_task.nsproxy ? 'I' : '-'), ++ atomic_read(&task->nsproxy->count), ++ task->nsproxy->uts_ns, ++ (task->nsproxy->uts_ns == init_task.nsproxy->uts_ns ? 'I' : '-'), ++ task->nsproxy->ipc_ns, ++ (task->nsproxy->ipc_ns == init_task.nsproxy->ipc_ns ? 'I' : '-'), ++ task->nsproxy->mnt_ns, ++ (task->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns ? 'I' : '-'), ++ task->nsproxy->pid_ns, ++ (task->nsproxy->pid_ns == init_task.nsproxy->pid_ns ? 'I' : '-'), ++ task->nsproxy->net_ns, ++ (task->nsproxy->net_ns == init_task.nsproxy->net_ns ? 
'I' : '-')); ++ return 0; ++} ++ ++void task_vs_id(struct seq_file *m, struct task_struct *task) ++{ ++ if (task_vx_flags(task, VXF_HIDE_VINFO, 0)) ++ return; ++ ++ seq_printf(m, "VxID: %d\n", vx_task_xid(task)); ++ seq_printf(m, "NxID: %d\n", nx_task_nid(task)); ++} ++ ++ + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) + { +@@ -353,6 +395,7 @@ int proc_pid_status(struct seq_file *m, + task_cap(m, task); + task_cpus_allowed(m, task); + cpuset_task_status_allowed(m, task); ++ task_vs_id(m, task); + task_context_switch_counts(m, task); + return 0; + } +@@ -462,6 +505,17 @@ static int do_task_stat(struct seq_file + /* convert nsec -> ticks */ + start_time = nsec_to_clock_t(start_time); + ++ /* fixup start time for virt uptime */ ++ if (vx_flags(VXF_VIRT_UPTIME, 0)) { ++ unsigned long long bias = ++ current->vx_info->cvirt.bias_clock; ++ ++ if (start_time > bias) ++ start_time -= bias; ++ else ++ start_time = 0; ++ } ++ + seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ + %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ + %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", +diff -NurpP --minimal linux-3.3.8/fs/proc/base.c linux-3.3.8-vs2.3.3.4/fs/proc/base.c +--- linux-3.3.8/fs/proc/base.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/proc/base.c 2012-02-24 04:45:58.000000000 +0100 +@@ -84,6 +84,8 @@ + #include + #include + #include ++#include ++#include + #ifdef CONFIG_HARDWALL + #include + #endif +@@ -937,11 +939,16 @@ static ssize_t oom_adjust_write(struct f + goto err_task_lock; + } + +- if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { ++ if (oom_adjust < task->signal->oom_adj && ++ !vx_capable(CAP_SYS_RESOURCE, VXC_OOM_ADJUST)) { + err = -EACCES; + goto err_sighand; + } + ++ /* prevent guest processes from circumventing the oom killer */ ++ if (vx_current_xid() && (oom_adjust == OOM_DISABLE)) ++ oom_adjust = OOM_ADJUST_MIN; ++ + /* + * Warn that /proc/pid/oom_adj is deprecated, see + * Documentation/feature-removal-schedule.txt. +@@ -1542,6 +1549,8 @@ struct inode *proc_pid_make_inode(struct + inode->i_gid = cred->egid; + rcu_read_unlock(); + } ++ /* procfs is xid tagged */ ++ inode->i_tag = (tag_t)vx_task_xid(task); + security_task_to_inode(task, inode); + + out: +@@ -1587,6 +1596,8 @@ int pid_getattr(struct vfsmount *mnt, st + + /* dentry stuff */ + ++static unsigned name_to_int(struct dentry *dentry); ++ + /* + * Exceptional case: normally we are not allowed to unhash a busy + * directory. In this case, however, we can do it - no aliasing problems +@@ -1615,6 +1626,12 @@ int pid_revalidate(struct dentry *dentry + task = get_proc_task(inode); + + if (task) { ++ unsigned pid = name_to_int(dentry); ++ ++ if (pid != ~0U && pid != vx_map_pid(task->pid)) { ++ put_task_struct(task); ++ goto drop; ++ } + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + rcu_read_lock(); +@@ -1631,6 +1648,7 @@ int pid_revalidate(struct dentry *dentry + put_task_struct(task); + return 1; + } ++drop: + d_drop(dentry); + return 0; + } +@@ -2469,6 +2487,13 @@ static struct dentry *proc_pident_lookup + if (!task) + goto out_no_task; + ++ /* TODO: maybe we can come up with a generic approach? */ ++ if (task_vx_flags(task, VXF_HIDE_VINFO, 0) && ++ (dentry->d_name.len == 5) && ++ (!memcmp(dentry->d_name.name, "vinfo", 5) || ++ !memcmp(dentry->d_name.name, "ninfo", 5))) ++ goto out; ++ + /* + * Yes, it does not scale. And it should not. 
Don't add + * new entries into /proc// without very good reasons. +@@ -2854,7 +2879,7 @@ out_iput: + static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) + { + struct dentry *error; +- struct task_struct *task = get_proc_task(dir); ++ struct task_struct *task = get_proc_task_real(dir); + const struct pid_entry *p, *last; + + error = ERR_PTR(-ENOENT); +@@ -2961,6 +2986,9 @@ static int proc_pid_personality(struct s + static const struct file_operations proc_task_operations; + static const struct inode_operations proc_task_inode_operations; + ++extern int proc_pid_vx_info(struct task_struct *, char *); ++extern int proc_pid_nx_info(struct task_struct *, char *); ++ + static const struct pid_entry tgid_base_stuff[] = { + DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), +@@ -3027,6 +3055,8 @@ static const struct pid_entry tgid_base_ + #ifdef CONFIG_CGROUPS + REG("cgroup", S_IRUGO, proc_cgroup_operations), + #endif ++ INF("vinfo", S_IRUGO, proc_pid_vx_info), ++ INF("ninfo", S_IRUGO, proc_pid_nx_info), + INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +@@ -3046,6 +3076,7 @@ static const struct pid_entry tgid_base_ + #ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), + #endif ++ ONE("nsproxy", S_IRUGO, proc_pid_nsproxy), + }; + + static int proc_tgid_base_readdir(struct file * filp, +@@ -3239,7 +3270,7 @@ retry: + iter.task = NULL; + pid = find_ge_pid(iter.tgid, ns); + if (pid) { +- iter.tgid = pid_nr_ns(pid, ns); ++ iter.tgid = pid_unmapped_nr_ns(pid, ns); + iter.task = pid_task(pid, PIDTYPE_PID); + /* What we to know is if the pid we have find is the + * pid of a thread_group_leader. 
Testing for task +@@ -3269,7 +3300,7 @@ static int proc_pid_fill_cache(struct fi + struct tgid_iter iter) + { + char name[PROC_NUMBUF]; +- int len = snprintf(name, sizeof(name), "%d", iter.tgid); ++ int len = snprintf(name, sizeof(name), "%d", vx_map_tgid(iter.tgid)); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_pid_instantiate, iter.task, NULL); + } +@@ -3293,7 +3324,7 @@ int proc_pid_readdir(struct file * filp, + goto out_no_task; + nr = filp->f_pos - FIRST_PROCESS_ENTRY; + +- reaper = get_proc_task(filp->f_path.dentry->d_inode); ++ reaper = get_proc_task_real(filp->f_path.dentry->d_inode); + if (!reaper) + goto out_no_task; + +@@ -3315,6 +3346,8 @@ int proc_pid_readdir(struct file * filp, + __filldir = fake_filldir; + + filp->f_pos = iter.tgid + TGID_OFFSET; ++ if (!vx_proc_task_visible(iter.task)) ++ continue; + if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { + put_task_struct(iter.task); + goto out; +@@ -3468,6 +3501,8 @@ static struct dentry *proc_task_lookup(s + tid = name_to_int(dentry); + if (tid == ~0U) + goto out; ++ if (vx_current_initpid(tid)) ++ goto out; + + ns = dentry->d_sb->s_fs_info; + rcu_read_lock(); +diff -NurpP --minimal linux-3.3.8/fs/proc/generic.c linux-3.3.8-vs2.3.3.4/fs/proc/generic.c +--- linux-3.3.8/fs/proc/generic.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/proc/generic.c 2012-02-24 03:55:06.000000000 +0100 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + + #include "internal.h" +@@ -424,11 +425,15 @@ struct dentry *proc_lookup_de(struct pro + for (de = de->subdir; de ; de = de->next) { + if (de->namelen != dentry->d_name.len) + continue; ++ if (!vx_hide_check(0, de->vx_flags)) ++ continue; + if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { + pde_get(de); + spin_unlock(&proc_subdir_lock); + error = -EINVAL; + inode = proc_get_inode(dir->i_sb, de); ++ /* generic proc entries belong to the host */ ++ inode->i_tag = 0; + goto out_unlock; + } + } +@@ -506,6 +511,8 @@ int proc_readdir_de(struct proc_dir_entr + + /* filldir passes info to user space */ + pde_get(de); ++ if (!vx_hide_check(0, de->vx_flags)) ++ goto skip; + spin_unlock(&proc_subdir_lock); + if (filldir(dirent, de->name, de->namelen, filp->f_pos, + de->low_ino, de->mode >> 12) < 0) { +@@ -513,6 +520,7 @@ int proc_readdir_de(struct proc_dir_entr + goto out; + } + spin_lock(&proc_subdir_lock); ++ skip: + filp->f_pos++; + next = de->next; + pde_put(de); +@@ -626,6 +634,7 @@ static struct proc_dir_entry *__proc_cre + ent->nlink = nlink; + atomic_set(&ent->count, 1); + ent->pde_users = 0; ++ ent->vx_flags = IATTR_PROC_DEFAULT; + spin_lock_init(&ent->pde_unload_lock); + ent->pde_unload_completion = NULL; + INIT_LIST_HEAD(&ent->pde_openers); +@@ -649,7 +658,8 @@ struct proc_dir_entry *proc_symlink(cons + kfree(ent->data); + kfree(ent); + ent = NULL; +- } ++ } else ++ ent->vx_flags = IATTR_PROC_SYMLINK; + } else { + kfree(ent); + ent = NULL; +diff -NurpP --minimal linux-3.3.8/fs/proc/inode.c linux-3.3.8-vs2.3.3.4/fs/proc/inode.c +--- linux-3.3.8/fs/proc/inode.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/proc/inode.c 2012-02-24 03:55:06.000000000 +0100 +@@ -459,6 +459,8 @@ struct inode *proc_get_inode(struct supe + inode->i_uid = de->uid; + inode->i_gid = de->gid; + } ++ if (de->vx_flags) ++ PROC_I(inode)->vx_flags = de->vx_flags; + if (de->size) + inode->i_size = de->size; + if (de->nlink) +diff -NurpP --minimal linux-3.3.8/fs/proc/internal.h linux-3.3.8-vs2.3.3.4/fs/proc/internal.h +--- 
linux-3.3.8/fs/proc/internal.h 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/proc/internal.h 2012-02-24 03:55:06.000000000 +0100 +@@ -10,6 +10,7 @@ + */ + + #include ++#include + + extern struct proc_dir_entry proc_root; + #ifdef CONFIG_PROC_SYSCTL +@@ -51,6 +52,9 @@ extern int proc_pid_status(struct seq_fi + struct pid *pid, struct task_struct *task); + extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); ++extern int proc_pid_nsproxy(struct seq_file *m, struct pid_namespace *ns, ++ struct pid *pid, struct task_struct *task); ++ + extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); + + extern const struct file_operations proc_maps_operations; +@@ -76,11 +80,16 @@ static inline struct pid *proc_pid(struc + return PROC_I(inode)->pid; + } + +-static inline struct task_struct *get_proc_task(struct inode *inode) ++static inline struct task_struct *get_proc_task_real(struct inode *inode) + { + return get_pid_task(proc_pid(inode), PIDTYPE_PID); + } + ++static inline struct task_struct *get_proc_task(struct inode *inode) ++{ ++ return vx_get_proc_task(inode, proc_pid(inode)); ++} ++ + static inline int proc_fd(struct inode *inode) + { + return PROC_I(inode)->fd; +diff -NurpP --minimal linux-3.3.8/fs/proc/loadavg.c linux-3.3.8-vs2.3.3.4/fs/proc/loadavg.c +--- linux-3.3.8/fs/proc/loadavg.c 2009-09-10 15:26:23.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/proc/loadavg.c 2012-02-24 03:55:06.000000000 +0100 +@@ -12,15 +12,27 @@ + + static int loadavg_proc_show(struct seq_file *m, void *v) + { ++ unsigned long running; ++ unsigned int threads; + unsigned long avnrun[3]; + + get_avenrun(avnrun, FIXED_1/200, 0); + ++ if (vx_flags(VXF_VIRT_LOAD, 0)) { ++ struct vx_info *vxi = current_vx_info(); ++ ++ running = atomic_read(&vxi->cvirt.nr_running); ++ threads = atomic_read(&vxi->cvirt.nr_threads); ++ } else { ++ running = nr_running(); ++ threads = nr_threads; ++ } ++ + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), + LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), + LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), +- nr_running(), nr_threads, ++ running, threads, + task_active_pid_ns(current)->last_pid); + return 0; + } +diff -NurpP --minimal linux-3.3.8/fs/proc/meminfo.c linux-3.3.8-vs2.3.3.4/fs/proc/meminfo.c +--- linux-3.3.8/fs/proc/meminfo.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/proc/meminfo.c 2012-02-24 03:55:06.000000000 +0100 +@@ -39,7 +39,8 @@ static int meminfo_proc_show(struct seq_ + allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100) + total_swap_pages; + +- cached = global_page_state(NR_FILE_PAGES) - ++ cached = vx_flags(VXF_VIRT_MEM, 0) ? 
++ vx_vsi_cached(&i) : global_page_state(NR_FILE_PAGES) - + total_swapcache_pages - i.bufferram; + if (cached < 0) + cached = 0; +diff -NurpP --minimal linux-3.3.8/fs/proc/root.c linux-3.3.8-vs2.3.3.4/fs/proc/root.c +--- linux-3.3.8/fs/proc/root.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/proc/root.c 2012-02-24 04:46:50.000000000 +0100 +@@ -19,9 +19,14 @@ + #include + #include + #include ++#include + + #include "internal.h" + ++struct proc_dir_entry *proc_virtual; ++ ++extern void proc_vx_init(void); ++ + static int proc_test_super(struct super_block *sb, void *data) + { + return sb->s_fs_info == data; +@@ -189,6 +194,7 @@ void __init proc_root_init(void) + #endif + proc_mkdir("bus", NULL); + proc_sys_init(); ++ proc_vx_init(); + } + + static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat +@@ -256,6 +262,7 @@ struct proc_dir_entry proc_root = { + .proc_iops = &proc_root_inode_operations, + .proc_fops = &proc_root_operations, + .parent = &proc_root, ++ .vx_flags = IATTR_ADMIN | IATTR_WATCH, + .name = "/proc", + }; + +diff -NurpP --minimal linux-3.3.8/fs/proc/stat.c linux-3.3.8-vs2.3.3.4/fs/proc/stat.c +--- linux-3.3.8/fs/proc/stat.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/proc/stat.c 2012-04-24 03:32:00.000000000 +0200 +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -64,6 +65,10 @@ static int show_stat(struct seq_file *p, + irq = softirq = steal = 0; + guest = guest_nice = 0; + getboottime(&boottime); ++ ++ if (vx_flags(VXF_VIRT_UPTIME, 0)) ++ vx_vsi_boottime(&boottime); ++ + jif = boottime.tv_sec; + + for_each_possible_cpu(i) { +diff -NurpP --minimal linux-3.3.8/fs/proc/uptime.c linux-3.3.8-vs2.3.3.4/fs/proc/uptime.c +--- linux-3.3.8/fs/proc/uptime.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/proc/uptime.c 2012-02-24 03:55:06.000000000 +0100 +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + #include + + static int uptime_proc_show(struct seq_file *m, void *v) +@@ -25,6 +26,10 @@ static int uptime_proc_show(struct seq_f + nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; ++ ++ if (vx_flags(VXF_VIRT_UPTIME, 0)) ++ vx_vsi_uptime(&uptime, &idle); ++ + seq_printf(m, "%lu.%02lu %lu.%02lu\n", + (unsigned long) uptime.tv_sec, + (uptime.tv_nsec / (NSEC_PER_SEC / 100)), +diff -NurpP --minimal linux-3.3.8/fs/proc_namespace.c linux-3.3.8-vs2.3.3.4/fs/proc_namespace.c +--- linux-3.3.8/fs/proc_namespace.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/proc_namespace.c 2012-04-03 16:00:38.000000000 +0200 +@@ -44,6 +44,8 @@ static int show_sb_opts(struct seq_file + { MS_SYNCHRONOUS, ",sync" }, + { MS_DIRSYNC, ",dirsync" }, + { MS_MANDLOCK, ",mand" }, ++ { MS_TAGGED, ",tag" }, ++ { MS_NOTAGCHECK, ",notagcheck" }, + { 0, NULL } + }; + const struct proc_fs_info *fs_infop; +@@ -80,6 +82,34 @@ static inline void mangle(struct seq_fil + seq_escape(m, s, " \t\n\\"); + } + ++static int mnt_is_reachable(struct vfsmount *vfsmnt) ++{ ++ struct path root; ++ struct dentry *point; ++ struct mount *mnt = real_mount(vfsmnt); ++ struct mount *root_mnt; ++ int ret; ++ ++ if (mnt == mnt->mnt_ns->root) ++ return 1; ++ ++ br_read_lock(vfsmount_lock); ++ root = current->fs->root; ++ root_mnt = real_mount(root.mnt); ++ point = root.dentry; ++ ++ while ((mnt != mnt->mnt_parent) && (mnt != root_mnt)) { ++ point = mnt->mnt_mountpoint; ++ mnt = mnt->mnt_parent; ++ } ++ 
++ ret = (mnt == root_mnt) && is_subdir(point, root.dentry); ++ ++ br_read_unlock(vfsmount_lock); ++ ++ return ret; ++} ++ + static void show_type(struct seq_file *m, struct super_block *sb) + { + mangle(m, sb->s_type->name); +@@ -96,6 +126,17 @@ static int show_vfsmnt(struct seq_file * + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct super_block *sb = mnt_path.dentry->d_sb; + ++ if (vx_flags(VXF_HIDE_MOUNT, 0)) ++ return SEQ_SKIP; ++ if (!mnt_is_reachable(mnt) && !vx_check(0, VS_WATCH_P)) ++ return SEQ_SKIP; ++ ++ if (!vx_check(0, VS_ADMIN|VS_WATCH) && ++ mnt == current->fs->root.mnt) { ++ seq_puts(m, "/dev/root / "); ++ goto type; ++ } ++ + if (sb->s_op->show_devname) { + err = sb->s_op->show_devname(m, mnt_path.dentry); + if (err) +@@ -106,6 +147,7 @@ static int show_vfsmnt(struct seq_file * + seq_putc(m, ' '); + seq_path(m, &mnt_path, " \t\n\\"); + seq_putc(m, ' '); ++type: + show_type(m, sb); + seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); + err = show_sb_opts(m, sb); +@@ -128,6 +170,11 @@ static int show_mountinfo(struct seq_fil + struct path root = p->root; + int err = 0; + ++ if (vx_flags(VXF_HIDE_MOUNT, 0)) ++ return SEQ_SKIP; ++ if (!mnt_is_reachable(mnt) && !vx_check(0, VS_WATCH_P)) ++ return SEQ_SKIP; ++ + seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, + MAJOR(sb->s_dev), MINOR(sb->s_dev)); + if (sb->s_op->show_path) +@@ -187,6 +234,17 @@ static int show_vfsstat(struct seq_file + struct super_block *sb = mnt_path.dentry->d_sb; + int err = 0; + ++ if (vx_flags(VXF_HIDE_MOUNT, 0)) ++ return SEQ_SKIP; ++ if (!mnt_is_reachable(mnt) && !vx_check(0, VS_WATCH_P)) ++ return SEQ_SKIP; ++ ++ if (!vx_check(0, VS_ADMIN|VS_WATCH) && ++ mnt == current->fs->root.mnt) { ++ seq_puts(m, "device /dev/root mounted on / "); ++ goto type; ++ } ++ + /* device */ + if (sb->s_op->show_devname) { + seq_puts(m, "device "); +@@ -203,7 +261,7 @@ static int show_vfsstat(struct seq_file + seq_puts(m, " mounted on "); + seq_path(m, &mnt_path, " \t\n\\"); + seq_putc(m, ' '); +- ++type: + /* file system type */ + seq_puts(m, "with fstype "); + show_type(m, sb); +diff -NurpP --minimal linux-3.3.8/fs/quota/dquot.c linux-3.3.8-vs2.3.3.4/fs/quota/dquot.c +--- linux-3.3.8/fs/quota/dquot.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/quota/dquot.c 2012-02-24 03:55:06.000000000 +0100 +@@ -1547,6 +1547,9 @@ int __dquot_alloc_space(struct inode *in + int reserve = flags & DQUOT_SPACE_RESERVE; + int nofail = flags & DQUOT_SPACE_NOFAIL; + ++ if ((ret = dl_alloc_space(inode, number))) ++ return ret; ++ + /* + * First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex +@@ -1601,6 +1604,9 @@ int dquot_alloc_inode(const struct inode + int cnt, ret = 0; + char warntype[MAXQUOTAS]; + ++ if ((ret = dl_alloc_inode(inode))) ++ return ret; ++ + /* First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) +@@ -1671,6 +1677,8 @@ void __dquot_free_space(struct inode *in + char warntype[MAXQUOTAS]; + int reserve = flags & DQUOT_SPACE_RESERVE; + ++ dl_free_space(inode, number); ++ + /* First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) { +@@ -1709,6 +1717,8 @@ void dquot_free_inode(const struct inode + unsigned int cnt; + char warntype[MAXQUOTAS]; + ++ dl_free_inode(inode); ++ + /* First test before 
acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex */ + if (!dquot_active(inode)) +diff -NurpP --minimal linux-3.3.8/fs/quota/quota.c linux-3.3.8-vs2.3.3.4/fs/quota/quota.c +--- linux-3.3.8/fs/quota/quota.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/quota/quota.c 2012-03-19 20:54:39.000000000 +0100 +@@ -8,6 +8,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -37,7 +38,7 @@ static int check_quotactl_permission(str + break; + /*FALLTHROUGH*/ + default: +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_QUOTA_CTL)) + return -EPERM; + } + +@@ -292,6 +293,46 @@ static int do_quotactl(struct super_bloc + } + } + ++#if defined(CONFIG_BLK_DEV_VROOT) || defined(CONFIG_BLK_DEV_VROOT_MODULE) ++ ++#include ++#include ++#include ++#include ++#include ++ ++static vroot_grb_func *vroot_get_real_bdev = NULL; ++ ++static DEFINE_SPINLOCK(vroot_grb_lock); ++ ++int register_vroot_grb(vroot_grb_func *func) { ++ int ret = -EBUSY; ++ ++ spin_lock(&vroot_grb_lock); ++ if (!vroot_get_real_bdev) { ++ vroot_get_real_bdev = func; ++ ret = 0; ++ } ++ spin_unlock(&vroot_grb_lock); ++ return ret; ++} ++EXPORT_SYMBOL(register_vroot_grb); ++ ++int unregister_vroot_grb(vroot_grb_func *func) { ++ int ret = -EINVAL; ++ ++ spin_lock(&vroot_grb_lock); ++ if (vroot_get_real_bdev) { ++ vroot_get_real_bdev = NULL; ++ ret = 0; ++ } ++ spin_unlock(&vroot_grb_lock); ++ return ret; ++} ++EXPORT_SYMBOL(unregister_vroot_grb); ++ ++#endif ++ + /* Return 1 if 'cmd' will block on frozen filesystem */ + static int quotactl_cmd_write(int cmd) + { +@@ -324,6 +365,22 @@ static struct super_block *quotactl_bloc + putname(tmp); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); ++#if defined(CONFIG_BLK_DEV_VROOT) || defined(CONFIG_BLK_DEV_VROOT_MODULE) ++ if (bdev && bdev->bd_inode && ++ imajor(bdev->bd_inode) == VROOT_MAJOR) { ++ struct block_device *bdnew = (void *)-EINVAL; ++ ++ if (vroot_get_real_bdev) ++ bdnew = vroot_get_real_bdev(bdev); ++ else ++ vxdprintk(VXD_CBIT(misc, 0), ++ "vroot_get_real_bdev not set"); ++ bdput(bdev); ++ if (IS_ERR(bdnew)) ++ return ERR_PTR(PTR_ERR(bdnew)); ++ bdev = bdnew; ++ } ++#endif + if (quotactl_cmd_write(cmd)) + sb = get_super_thawed(bdev); + else +diff -NurpP --minimal linux-3.3.8/fs/reiserfs/file.c linux-3.3.8-vs2.3.3.4/fs/reiserfs/file.c +--- linux-3.3.8/fs/reiserfs/file.c 2011-10-24 18:45:27.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/reiserfs/file.c 2012-02-24 03:55:06.000000000 +0100 +@@ -319,5 +319,6 @@ const struct inode_operations reiserfs_f + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, ++ .sync_flags = reiserfs_sync_flags, + .get_acl = reiserfs_get_acl, + }; +diff -NurpP --minimal linux-3.3.8/fs/reiserfs/inode.c linux-3.3.8-vs2.3.3.4/fs/reiserfs/inode.c +--- linux-3.3.8/fs/reiserfs/inode.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/reiserfs/inode.c 2012-02-24 03:55:06.000000000 +0100 +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); +@@ -1131,6 +1132,8 @@ static void init_inode(struct inode *ino + struct buffer_head *bh; + struct item_head *ih; + __u32 rdev; ++ uid_t uid; ++ gid_t gid; + //int version = ITEM_VERSION_1; + + bh = PATH_PLAST_BUFFER(path); +@@ -1151,12 +1154,13 @@ static void init_inode(struct inode *ino + (struct stat_data_v1 *)B_I_PITEM(bh, ih); + unsigned long 
blocks; + ++ uid = sd_v1_uid(sd); ++ gid = sd_v1_gid(sd); ++ + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + set_inode_sd_version(inode, STAT_DATA_V1); + inode->i_mode = sd_v1_mode(sd); + set_nlink(inode, sd_v1_nlink(sd)); +- inode->i_uid = sd_v1_uid(sd); +- inode->i_gid = sd_v1_gid(sd); + inode->i_size = sd_v1_size(sd); + inode->i_atime.tv_sec = sd_v1_atime(sd); + inode->i_mtime.tv_sec = sd_v1_mtime(sd); +@@ -1198,11 +1202,12 @@ static void init_inode(struct inode *ino + // (directories and symlinks) + struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); + ++ uid = sd_v2_uid(sd); ++ gid = sd_v2_gid(sd); ++ + inode->i_mode = sd_v2_mode(sd); + set_nlink(inode, sd_v2_nlink(sd)); +- inode->i_uid = sd_v2_uid(sd); + inode->i_size = sd_v2_size(sd); +- inode->i_gid = sd_v2_gid(sd); + inode->i_mtime.tv_sec = sd_v2_mtime(sd); + inode->i_atime.tv_sec = sd_v2_atime(sd); + inode->i_ctime.tv_sec = sd_v2_ctime(sd); +@@ -1232,6 +1237,10 @@ static void init_inode(struct inode *ino + sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); + } + ++ inode->i_uid = INOTAG_UID(DX_TAG(inode), uid, gid); ++ inode->i_gid = INOTAG_GID(DX_TAG(inode), uid, gid); ++ inode->i_tag = INOTAG_TAG(DX_TAG(inode), uid, gid, 0); ++ + pathrelse(path); + if (S_ISREG(inode->i_mode)) { + inode->i_op = &reiserfs_file_inode_operations; +@@ -1254,13 +1263,15 @@ static void init_inode(struct inode *ino + static void inode2sd(void *sd, struct inode *inode, loff_t size) + { + struct stat_data *sd_v2 = (struct stat_data *)sd; ++ uid_t uid = TAGINO_UID(DX_TAG(inode), inode->i_uid, inode->i_tag); ++ gid_t gid = TAGINO_GID(DX_TAG(inode), inode->i_gid, inode->i_tag); + __u16 flags; + ++ set_sd_v2_uid(sd_v2, uid); ++ set_sd_v2_gid(sd_v2, gid); + set_sd_v2_mode(sd_v2, inode->i_mode); + set_sd_v2_nlink(sd_v2, inode->i_nlink); +- set_sd_v2_uid(sd_v2, inode->i_uid); + set_sd_v2_size(sd_v2, size); +- set_sd_v2_gid(sd_v2, inode->i_gid); + set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); + set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); + set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); +@@ -2868,14 +2879,19 @@ int reiserfs_commit_write(struct file *f + void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) + { + if (reiserfs_attrs(inode->i_sb)) { +- if (sd_attrs & REISERFS_SYNC_FL) +- inode->i_flags |= S_SYNC; +- else +- inode->i_flags &= ~S_SYNC; + if (sd_attrs & REISERFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; ++ if (sd_attrs & REISERFS_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; ++ else ++ inode->i_flags &= ~S_IXUNLINK; ++ ++ if (sd_attrs & REISERFS_SYNC_FL) ++ inode->i_flags |= S_SYNC; ++ else ++ inode->i_flags &= ~S_SYNC; + if (sd_attrs & REISERFS_APPEND_FL) + inode->i_flags |= S_APPEND; + else +@@ -2888,6 +2904,15 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, + REISERFS_I(inode)->i_flags |= i_nopack_mask; + else + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; ++ ++ if (sd_attrs & REISERFS_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ else ++ inode->i_vflags &= ~V_BARRIER; ++ if (sd_attrs & REISERFS_COW_FL) ++ inode->i_vflags |= V_COW; ++ else ++ inode->i_vflags &= ~V_COW; + } + } + +@@ -2898,6 +2923,11 @@ void i_attrs_to_sd_attrs(struct inode *i + *sd_attrs |= REISERFS_IMMUTABLE_FL; + else + *sd_attrs &= ~REISERFS_IMMUTABLE_FL; ++ if (inode->i_flags & S_IXUNLINK) ++ *sd_attrs |= REISERFS_IXUNLINK_FL; ++ else ++ *sd_attrs &= ~REISERFS_IXUNLINK_FL; ++ + if (inode->i_flags & S_SYNC) + *sd_attrs |= REISERFS_SYNC_FL; + else +@@ -2910,6 +2940,15 @@ void i_attrs_to_sd_attrs(struct inode *i + 
*sd_attrs |= REISERFS_NOTAIL_FL; + else + *sd_attrs &= ~REISERFS_NOTAIL_FL; ++ ++ if (inode->i_vflags & V_BARRIER) ++ *sd_attrs |= REISERFS_BARRIER_FL; ++ else ++ *sd_attrs &= ~REISERFS_BARRIER_FL; ++ if (inode->i_vflags & V_COW) ++ *sd_attrs |= REISERFS_COW_FL; ++ else ++ *sd_attrs &= ~REISERFS_COW_FL; + } + } + +@@ -3155,7 +3194,8 @@ int reiserfs_setattr(struct dentry *dent + } + + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || +- (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { ++ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid) || ++ (ia_valid & ATTR_TAG && attr->ia_tag != inode->i_tag)) { + struct reiserfs_transaction_handle th; + int jbegin_count = + 2 * +@@ -3184,6 +3224,9 @@ int reiserfs_setattr(struct dentry *dent + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; ++ if ((attr->ia_valid & ATTR_TAG) && ++ IS_TAGGED(inode)) ++ inode->i_tag = attr->ia_tag; + mark_inode_dirty(inode); + error = journal_end(&th, inode->i_sb, jbegin_count); + if (error) +diff -NurpP --minimal linux-3.3.8/fs/reiserfs/ioctl.c linux-3.3.8-vs2.3.3.4/fs/reiserfs/ioctl.c +--- linux-3.3.8/fs/reiserfs/ioctl.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/reiserfs/ioctl.c 2012-02-24 03:55:06.000000000 +0100 +@@ -11,6 +11,21 @@ + #include + #include + ++ ++int reiserfs_sync_flags(struct inode *inode, int flags, int vflags) ++{ ++ __u16 sd_attrs = 0; ++ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ ++ i_attrs_to_sd_attrs(inode, &sd_attrs); ++ REISERFS_I(inode)->i_attrs = sd_attrs; ++ inode->i_ctime = CURRENT_TIME_SEC; ++ mark_inode_dirty(inode); ++ return 0; ++} ++ + /* + * reiserfs_ioctl - handler for ioctl for inode + * supported commands: +@@ -22,7 +37,7 @@ + long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_path.dentry->d_inode; +- unsigned int flags; ++ unsigned int flags, oldflags; + int err = 0; + + reiserfs_write_lock(inode->i_sb); +@@ -47,6 +62,7 @@ long reiserfs_ioctl(struct file *filp, u + + flags = REISERFS_I(inode)->i_attrs; + i_attrs_to_sd_attrs(inode, (__u16 *) & flags); ++ flags &= REISERFS_FL_USER_VISIBLE; + err = put_user(flags, (int __user *)arg); + break; + case REISERFS_IOC_SETFLAGS:{ +@@ -67,6 +83,10 @@ long reiserfs_ioctl(struct file *filp, u + err = -EFAULT; + goto setflags_out; + } ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } + /* + * Is it quota file? 
Do not allow user to mess with it + */ +@@ -91,6 +111,10 @@ long reiserfs_ioctl(struct file *filp, u + goto setflags_out; + } + } ++ ++ oldflags = REISERFS_I(inode)->i_attrs; ++ flags &= REISERFS_FL_USER_MODIFIABLE; ++ flags |= oldflags & ~REISERFS_FL_USER_MODIFIABLE; + sd_attrs_to_i_attrs(flags, inode); + REISERFS_I(inode)->i_attrs = flags; + inode->i_ctime = CURRENT_TIME_SEC; +diff -NurpP --minimal linux-3.3.8/fs/reiserfs/namei.c linux-3.3.8-vs2.3.3.4/fs/reiserfs/namei.c +--- linux-3.3.8/fs/reiserfs/namei.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/reiserfs/namei.c 2012-02-24 03:55:06.000000000 +0100 +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + #define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); } + #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); +@@ -362,6 +363,7 @@ static struct dentry *reiserfs_lookup(st + if (retval == IO_ERROR) { + return ERR_PTR(-EIO); + } ++ dx_propagate_tag(nd, inode); + + return d_splice_alias(inode, dentry); + } +diff -NurpP --minimal linux-3.3.8/fs/reiserfs/super.c linux-3.3.8-vs2.3.3.4/fs/reiserfs/super.c +--- linux-3.3.8/fs/reiserfs/super.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/reiserfs/super.c 2012-02-24 03:55:06.000000000 +0100 +@@ -980,6 +980,14 @@ static int reiserfs_parse_options(struct + {"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, + {"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, + #endif ++#ifndef CONFIG_TAGGING_NONE ++ {"tagxid",.setmask = 1 << REISERFS_TAGGED}, ++ {"tag",.setmask = 1 << REISERFS_TAGGED}, ++ {"notag",.clrmask = 1 << REISERFS_TAGGED}, ++#endif ++#ifdef CONFIG_PROPAGATE ++ {"tag",.arg_required = 'T',.values = NULL}, ++#endif + #ifdef CONFIG_REISERFS_FS_POSIX_ACL + {"acl",.setmask = 1 << REISERFS_POSIXACL}, + {"noacl",.clrmask = 1 << REISERFS_POSIXACL}, +@@ -1298,6 +1306,14 @@ static int reiserfs_remount(struct super + handle_quota_files(s, qf_names, &qfmt); + #endif + ++ if ((mount_options & (1 << REISERFS_TAGGED)) && ++ !(s->s_flags & MS_TAGGED)) { ++ reiserfs_warning(s, "super-vs01", ++ "reiserfs: tagging not permitted on remount."); ++ err = -EINVAL; ++ goto out_err; ++ } ++ + handle_attrs(s); + + /* Add options that are safe here */ +@@ -1777,6 +1793,10 @@ static int reiserfs_fill_super(struct su + goto error_unlocked; + } + ++ /* map mount option tagxid */ ++ if (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_TAGGED)) ++ s->s_flags |= MS_TAGGED; ++ + rs = SB_DISK_SUPER_BLOCK(s); + /* Let's do basic sanity check to verify that underlying device is not + smaller than the filesystem. 
If the check fails then abort and scream, +diff -NurpP --minimal linux-3.3.8/fs/reiserfs/xattr.c linux-3.3.8-vs2.3.3.4/fs/reiserfs/xattr.c +--- linux-3.3.8/fs/reiserfs/xattr.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/reiserfs/xattr.c 2012-02-24 03:55:06.000000000 +0100 +@@ -40,6 +40,7 @@ + #include + #include + #include ++#include + #include + #include + #include +diff -NurpP --minimal linux-3.3.8/fs/stat.c linux-3.3.8-vs2.3.3.4/fs/stat.c +--- linux-3.3.8/fs/stat.c 2012-01-09 16:14:55.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/stat.c 2012-02-24 03:55:06.000000000 +0100 +@@ -26,6 +26,7 @@ void generic_fillattr(struct inode *inod + stat->nlink = inode->i_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; ++ stat->tag = inode->i_tag; + stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); + stat->atime = inode->i_atime; +diff -NurpP --minimal linux-3.3.8/fs/statfs.c linux-3.3.8-vs2.3.3.4/fs/statfs.c +--- linux-3.3.8/fs/statfs.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/statfs.c 2012-02-24 04:27:47.000000000 +0100 +@@ -7,6 +7,8 @@ + #include + #include + #include ++#include ++#include + #include "internal.h" + + static int flags_by_mnt(int mnt_flags) +@@ -60,6 +62,8 @@ static int statfs_by_dentry(struct dentr + retval = dentry->d_sb->s_op->statfs(dentry, buf); + if (retval == 0 && buf->f_frsize == 0) + buf->f_frsize = buf->f_bsize; ++ if (!vx_check(0, VS_ADMIN|VS_WATCH)) ++ vx_vsi_statfs(dentry->d_sb, buf); + return retval; + } + +diff -NurpP --minimal linux-3.3.8/fs/super.c linux-3.3.8-vs2.3.3.4/fs/super.c +--- linux-3.3.8/fs/super.c 2012-03-19 19:47:26.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/super.c 2012-03-19 20:52:10.000000000 +0100 +@@ -32,6 +32,9 @@ + #include + #include + #include ++#include ++#include ++#include + #include "internal.h" + + +@@ -1137,6 +1140,13 @@ mount_fs(struct file_system_type *type, + WARN_ON(sb->s_bdi == &default_backing_dev_info); + sb->s_flags |= MS_BORN; + ++ error = -EPERM; ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_BINARY_MOUNT) && ++ !sb->s_bdev && ++ (sb->s_magic != PROC_SUPER_MAGIC) && ++ (sb->s_magic != DEVPTS_SUPER_MAGIC)) ++ goto out_sb; ++ + error = security_sb_kern_mount(sb, flags, secdata); + if (error) + goto out_sb; +diff -NurpP --minimal linux-3.3.8/fs/sysfs/mount.c linux-3.3.8-vs2.3.3.4/fs/sysfs/mount.c +--- linux-3.3.8/fs/sysfs/mount.c 2011-07-22 11:18:06.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/sysfs/mount.c 2012-02-24 03:55:06.000000000 +0100 +@@ -47,7 +47,7 @@ static int sysfs_fill_super(struct super + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; +- sb->s_magic = SYSFS_MAGIC; ++ sb->s_magic = SYSFS_SUPER_MAGIC; + sb->s_op = &sysfs_ops; + sb->s_time_gran = 1; + +diff -NurpP --minimal linux-3.3.8/fs/utimes.c linux-3.3.8-vs2.3.3.4/fs/utimes.c +--- linux-3.3.8/fs/utimes.c 2011-05-22 16:17:54.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/utimes.c 2012-02-24 03:55:06.000000000 +0100 +@@ -8,6 +8,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + +@@ -52,12 +54,18 @@ static int utimes_common(struct path *pa + { + int error; + struct iattr newattrs; +- struct inode *inode = path->dentry->d_inode; ++ struct inode *inode; + + error = mnt_want_write(path->mnt); + if (error) + goto out; + ++ error = cow_check_and_break(path); ++ if (error) ++ goto mnt_drop_write_and_out; ++ ++ inode = path->dentry->d_inode; ++ + if (times && times[0].tv_nsec == UTIME_NOW && + times[1].tv_nsec == UTIME_NOW) + times = NULL; +diff 
-NurpP --minimal linux-3.3.8/fs/xattr.c linux-3.3.8-vs2.3.3.4/fs/xattr.c +--- linux-3.3.8/fs/xattr.c 2012-03-19 19:47:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/xattr.c 2012-02-24 03:55:06.000000000 +0100 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include + + +@@ -50,7 +51,7 @@ xattr_permission(struct inode *inode, co + * The trusted.* namespace can only be accessed by privileged users. + */ + if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_capable(CAP_SYS_ADMIN, VXC_FS_TRUSTED)) + return (mask & MAY_WRITE) ? -EPERM : -ENODATA; + return 0; + } +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_dinode.h linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_dinode.h +--- linux-3.3.8/fs/xfs/xfs_dinode.h 2011-10-24 18:45:31.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_dinode.h 2012-02-24 03:55:06.000000000 +0100 +@@ -51,7 +51,9 @@ typedef struct xfs_dinode { + __be32 di_nlink; /* number of links to file */ + __be16 di_projid_lo; /* lower part of owner's project id */ + __be16 di_projid_hi; /* higher part owner's project id */ +- __u8 di_pad[6]; /* unused, zeroed space */ ++ __u8 di_pad[2]; /* unused, zeroed space */ ++ __be16 di_tag; /* context tagging */ ++ __be16 di_vflags; /* vserver specific flags */ + __be16 di_flushiter; /* incremented on flush */ + xfs_timestamp_t di_atime; /* time last accessed */ + xfs_timestamp_t di_mtime; /* time last modified */ +@@ -184,6 +186,8 @@ static inline void xfs_dinode_put_rdev(s + #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ + #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ + #define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ ++#define XFS_DIFLAG_IXUNLINK_BIT 15 /* Immutable inver on unlink */ ++ + #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) + #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) + #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) +@@ -199,6 +203,7 @@ static inline void xfs_dinode_put_rdev(s + #define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) + #define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) + #define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) ++#define XFS_DIFLAG_IXUNLINK (1 << XFS_DIFLAG_IXUNLINK_BIT) + + #ifdef CONFIG_XFS_RT + #define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) +@@ -211,6 +216,10 @@ static inline void xfs_dinode_put_rdev(s + XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ + XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ + XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ +- XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) ++ XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM | \ ++ XFS_DIFLAG_IXUNLINK) ++ ++#define XFS_DIVFLAG_BARRIER 0x01 ++#define XFS_DIVFLAG_COW 0x02 + + #endif /* __XFS_DINODE_H__ */ +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_fs.h linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_fs.h +--- linux-3.3.8/fs/xfs/xfs_fs.h 2011-10-24 18:45:31.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_fs.h 2012-02-24 03:55:06.000000000 +0100 +@@ -67,6 +67,9 @@ struct fsxattr { + #define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ + #define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ + #define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ ++#define XFS_XFLAG_IXUNLINK 0x00008000 /* immutable invert on unlink */ ++#define XFS_XFLAG_BARRIER 0x10000000 /* 
chroot() barrier */ ++#define XFS_XFLAG_COW 0x20000000 /* copy on write mark */ + #define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ + + /* +@@ -302,7 +305,8 @@ typedef struct xfs_bstat { + #define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_forkoff; /* inode fork offset in bytes */ + __u16 bs_projid_hi; /* higher part of project id */ +- unsigned char bs_pad[10]; /* pad space, unused */ ++ unsigned char bs_pad[8]; /* pad space, unused */ ++ __u16 bs_tag; /* context tagging */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_ialloc.c linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_ialloc.c +--- linux-3.3.8/fs/xfs/xfs_ialloc.c 2012-03-19 19:47:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_ialloc.c 2012-02-24 03:55:06.000000000 +0100 +@@ -37,7 +37,6 @@ + #include "xfs_error.h" + #include "xfs_bmap.h" + +- + /* + * Allocation group level functions. + */ +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_inode.c linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_inode.c +--- linux-3.3.8/fs/xfs/xfs_inode.c 2012-03-19 19:47:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_inode.c 2012-02-24 03:55:06.000000000 +0100 +@@ -236,6 +236,7 @@ xfs_inotobp( + return 0; + } + ++#include + + /* + * This routine is called to map an inode to the buffer containing +@@ -631,15 +632,25 @@ xfs_iformat_btree( + STATIC void + xfs_dinode_from_disk( + xfs_icdinode_t *to, +- xfs_dinode_t *from) ++ xfs_dinode_t *from, ++ int tagged) + { ++ uint32_t uid, gid, tag; ++ + to->di_magic = be16_to_cpu(from->di_magic); + to->di_mode = be16_to_cpu(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = be16_to_cpu(from->di_onlink); +- to->di_uid = be32_to_cpu(from->di_uid); +- to->di_gid = be32_to_cpu(from->di_gid); ++ ++ uid = be32_to_cpu(from->di_uid); ++ gid = be32_to_cpu(from->di_gid); ++ tag = be16_to_cpu(from->di_tag); ++ ++ to->di_uid = INOTAG_UID(tagged, uid, gid); ++ to->di_gid = INOTAG_GID(tagged, uid, gid); ++ to->di_tag = INOTAG_TAG(tagged, uid, gid, tag); ++ + to->di_nlink = be32_to_cpu(from->di_nlink); + to->di_projid_lo = be16_to_cpu(from->di_projid_lo); + to->di_projid_hi = be16_to_cpu(from->di_projid_hi); +@@ -661,21 +672,26 @@ xfs_dinode_from_disk( + to->di_dmevmask = be32_to_cpu(from->di_dmevmask); + to->di_dmstate = be16_to_cpu(from->di_dmstate); + to->di_flags = be16_to_cpu(from->di_flags); ++ to->di_vflags = be16_to_cpu(from->di_vflags); + to->di_gen = be32_to_cpu(from->di_gen); + } + + void + xfs_dinode_to_disk( + xfs_dinode_t *to, +- xfs_icdinode_t *from) ++ xfs_icdinode_t *from, ++ int tagged) + { + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = cpu_to_be16(from->di_onlink); +- to->di_uid = cpu_to_be32(from->di_uid); +- to->di_gid = cpu_to_be32(from->di_gid); ++ ++ to->di_uid = cpu_to_be32(TAGINO_UID(tagged, from->di_uid, from->di_tag)); ++ to->di_gid = cpu_to_be32(TAGINO_GID(tagged, from->di_gid, from->di_tag)); ++ to->di_tag = cpu_to_be16(TAGINO_TAG(tagged, from->di_tag)); ++ + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); +@@ -697,12 +713,14 @@ xfs_dinode_to_disk( + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + 
to->di_flags = cpu_to_be16(from->di_flags); ++ to->di_vflags = cpu_to_be16(from->di_vflags); + to->di_gen = cpu_to_be32(from->di_gen); + } + + STATIC uint + _xfs_dic2xflags( +- __uint16_t di_flags) ++ __uint16_t di_flags, ++ __uint16_t di_vflags) + { + uint flags = 0; + +@@ -713,6 +731,8 @@ _xfs_dic2xflags( + flags |= XFS_XFLAG_PREALLOC; + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= XFS_XFLAG_IMMUTABLE; ++ if (di_flags & XFS_DIFLAG_IXUNLINK) ++ flags |= XFS_XFLAG_IXUNLINK; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= XFS_XFLAG_APPEND; + if (di_flags & XFS_DIFLAG_SYNC) +@@ -737,6 +757,10 @@ _xfs_dic2xflags( + flags |= XFS_XFLAG_FILESTREAM; + } + ++ if (di_vflags & XFS_DIVFLAG_BARRIER) ++ flags |= FS_BARRIER_FL; ++ if (di_vflags & XFS_DIVFLAG_COW) ++ flags |= FS_COW_FL; + return flags; + } + +@@ -746,7 +770,7 @@ xfs_ip2xflags( + { + xfs_icdinode_t *dic = &ip->i_d; + +- return _xfs_dic2xflags(dic->di_flags) | ++ return _xfs_dic2xflags(dic->di_flags, dic->di_vflags) | + (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); + } + +@@ -754,7 +778,8 @@ uint + xfs_dic2xflags( + xfs_dinode_t *dip) + { +- return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | ++ return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), ++ be16_to_cpu(dip->di_vflags)) | + (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); + } + +@@ -787,7 +812,6 @@ xfs_iread( + if (error) + return error; + dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); +- + /* + * If we got something that isn't an inode it means someone + * (nfs or dmi) has a stale handle. +@@ -810,7 +834,8 @@ xfs_iread( + * Otherwise, just get the truly permanent information. + */ + if (dip->di_mode) { +- xfs_dinode_from_disk(&ip->i_d, dip); ++ xfs_dinode_from_disk(&ip->i_d, dip, ++ mp->m_flags & XFS_MOUNT_TAGGED); + error = xfs_iformat(ip, dip); + if (error) { + #ifdef DEBUG +@@ -998,6 +1023,7 @@ xfs_ialloc( + ASSERT(ip->i_d.di_nlink == nlink); + ip->i_d.di_uid = current_fsuid(); + ip->i_d.di_gid = current_fsgid(); ++ ip->i_d.di_tag = current_fstag(&ip->i_vnode); + xfs_set_projid(ip, prid); + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + +@@ -1057,6 +1083,7 @@ xfs_ialloc( + ip->i_d.di_dmevmask = 0; + ip->i_d.di_dmstate = 0; + ip->i_d.di_flags = 0; ++ ip->i_d.di_vflags = 0; + flags = XFS_ILOG_CORE; + switch (mode & S_IFMT) { + case S_IFIFO: +@@ -1726,6 +1753,7 @@ xfs_ifree( + } + ip->i_d.di_mode = 0; /* mark incore inode as free */ + ip->i_d.di_flags = 0; ++ ip->i_d.di_vflags = 0; + ip->i_d.di_dmevmask = 0; + ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ + ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; +@@ -2620,7 +2648,8 @@ xfs_iflush_int( + * because if the inode is dirty at all the core must + * be. 
+ */ +- xfs_dinode_to_disk(dip, &ip->i_d); ++ xfs_dinode_to_disk(dip, &ip->i_d, ++ mp->m_flags & XFS_MOUNT_TAGGED); + + /* Wrap, we never let the log put out DI_MAX_FLUSH */ + if (ip->i_d.di_flushiter == DI_MAX_FLUSH) +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_inode.h linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_inode.h +--- linux-3.3.8/fs/xfs/xfs_inode.h 2012-03-19 19:47:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_inode.h 2012-02-24 03:55:06.000000000 +0100 +@@ -134,7 +134,9 @@ typedef struct xfs_icdinode { + __uint32_t di_nlink; /* number of links to file */ + __uint16_t di_projid_lo; /* lower part of owner's project id */ + __uint16_t di_projid_hi; /* higher part of owner's project id */ +- __uint8_t di_pad[6]; /* unused, zeroed space */ ++ __uint8_t di_pad[2]; /* unused, zeroed space */ ++ __uint16_t di_tag; /* context tagging */ ++ __uint16_t di_vflags; /* vserver specific flags */ + __uint16_t di_flushiter; /* incremented on flush */ + xfs_ictimestamp_t di_atime; /* time last accessed */ + xfs_ictimestamp_t di_mtime; /* time last modified */ +@@ -556,7 +558,7 @@ int xfs_itobp(struct xfs_mount *, struc + int xfs_iread(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, uint); + void xfs_dinode_to_disk(struct xfs_dinode *, +- struct xfs_icdinode *); ++ struct xfs_icdinode *, int); + void xfs_idestroy_fork(struct xfs_inode *, int); + void xfs_idata_realloc(struct xfs_inode *, int, int); + void xfs_iroot_realloc(struct xfs_inode *, int, int); +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_ioctl.c linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_ioctl.c +--- linux-3.3.8/fs/xfs/xfs_ioctl.c 2012-03-19 19:47:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_ioctl.c 2012-02-24 03:55:06.000000000 +0100 +@@ -28,7 +28,7 @@ + #include "xfs_bmap_btree.h" + #include "xfs_dinode.h" + #include "xfs_inode.h" +-#include "xfs_ioctl.h" ++// #include "xfs_ioctl.h" + #include "xfs_rtalloc.h" + #include "xfs_itable.h" + #include "xfs_error.h" +@@ -748,6 +748,10 @@ xfs_merge_ioc_xflags( + xflags |= XFS_XFLAG_IMMUTABLE; + else + xflags &= ~XFS_XFLAG_IMMUTABLE; ++ if (flags & FS_IXUNLINK_FL) ++ xflags |= XFS_XFLAG_IXUNLINK; ++ else ++ xflags &= ~XFS_XFLAG_IXUNLINK; + if (flags & FS_APPEND_FL) + xflags |= XFS_XFLAG_APPEND; + else +@@ -776,6 +780,8 @@ xfs_di2lxflags( + + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; ++ if (di_flags & XFS_DIFLAG_IXUNLINK) ++ flags |= FS_IXUNLINK_FL; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= FS_APPEND_FL; + if (di_flags & XFS_DIFLAG_SYNC) +@@ -836,6 +842,8 @@ xfs_set_diflags( + di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & XFS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; ++ if (xflags & XFS_XFLAG_IXUNLINK) ++ di_flags |= XFS_DIFLAG_IXUNLINK; + if (xflags & XFS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (xflags & XFS_XFLAG_SYNC) +@@ -878,6 +886,10 @@ xfs_diflags_to_linux( + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; ++ if (xflags & XFS_XFLAG_IXUNLINK) ++ inode->i_flags |= S_IXUNLINK; ++ else ++ inode->i_flags &= ~S_IXUNLINK; + if (xflags & XFS_XFLAG_APPEND) + inode->i_flags |= S_APPEND; + else +@@ -1370,10 +1382,18 @@ xfs_file_ioctl( + case XFS_IOC_FSGETXATTRA: + return xfs_ioc_fsgetxattr(ip, 1, arg); + case XFS_IOC_FSSETXATTR: ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -XFS_ERROR(EACCES); ++ } + return xfs_ioc_fssetxattr(ip, filp, arg); + case XFS_IOC_GETXFLAGS: + return xfs_ioc_getxflags(ip, arg); + case XFS_IOC_SETXFLAGS: ++ if 
(IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -XFS_ERROR(EACCES); ++ } + return xfs_ioc_setxflags(ip, filp, arg); + + case XFS_IOC_FSSETDM: { +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_ioctl.h linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_ioctl.h +--- linux-3.3.8/fs/xfs/xfs_ioctl.h 2011-10-24 18:45:31.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_ioctl.h 2012-02-24 03:55:06.000000000 +0100 +@@ -70,6 +70,12 @@ xfs_handle_to_dentry( + void __user *uhandle, + u32 hlen); + ++extern int ++xfs_sync_flags( ++ struct inode *inode, ++ int flags, ++ int vflags); ++ + extern long + xfs_file_ioctl( + struct file *filp, +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_iops.c linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_iops.c +--- linux-3.3.8/fs/xfs/xfs_iops.c 2012-03-19 19:47:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_iops.c 2012-02-24 03:55:06.000000000 +0100 +@@ -30,6 +30,7 @@ + #include "xfs_bmap_btree.h" + #include "xfs_dinode.h" + #include "xfs_inode.h" ++#include "xfs_ioctl.h" + #include "xfs_bmap.h" + #include "xfs_rtalloc.h" + #include "xfs_error.h" +@@ -49,6 +50,7 @@ + #include + #include + #include ++#include + + /* + * Bring the timestamps in the XFS inode uptodate. +@@ -474,6 +476,7 @@ xfs_vn_getattr( + stat->nlink = ip->i_d.di_nlink; + stat->uid = ip->i_d.di_uid; + stat->gid = ip->i_d.di_gid; ++ stat->tag = ip->i_d.di_tag; + stat->ino = ip->i_ino; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; +@@ -1051,6 +1054,7 @@ static const struct inode_operations xfs + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .fiemap = xfs_vn_fiemap, ++ .sync_flags = xfs_sync_flags, + }; + + static const struct inode_operations xfs_dir_inode_operations = { +@@ -1076,6 +1080,7 @@ static const struct inode_operations xfs + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, ++ .sync_flags = xfs_sync_flags, + }; + + static const struct inode_operations xfs_dir_ci_inode_operations = { +@@ -1125,6 +1130,10 @@ xfs_diflags_to_iflags( + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; ++ if (ip->i_d.di_flags & XFS_DIFLAG_IXUNLINK) ++ inode->i_flags |= S_IXUNLINK; ++ else ++ inode->i_flags &= ~S_IXUNLINK; + if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + inode->i_flags |= S_APPEND; + else +@@ -1137,6 +1146,15 @@ xfs_diflags_to_iflags( + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; ++ ++ if (ip->i_d.di_vflags & XFS_DIVFLAG_BARRIER) ++ inode->i_vflags |= V_BARRIER; ++ else ++ inode->i_vflags &= ~V_BARRIER; ++ if (ip->i_d.di_vflags & XFS_DIVFLAG_COW) ++ inode->i_vflags |= V_COW; ++ else ++ inode->i_vflags &= ~V_COW; + } + + /* +@@ -1168,6 +1186,7 @@ xfs_setup_inode( + set_nlink(inode, ip->i_d.di_nlink); + inode->i_uid = ip->i_d.di_uid; + inode->i_gid = ip->i_d.di_gid; ++ inode->i_tag = ip->i_d.di_tag; + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_itable.c linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_itable.c +--- linux-3.3.8/fs/xfs/xfs_itable.c 2011-05-22 16:17:54.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_itable.c 2012-02-24 03:55:06.000000000 +0100 +@@ -98,6 +98,7 @@ xfs_bulkstat_one_int( + buf->bs_mode = dic->di_mode; + buf->bs_uid = dic->di_uid; + buf->bs_gid = dic->di_gid; ++ buf->bs_tag = dic->di_tag; + buf->bs_size = dic->di_size; + + /* +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_linux.h linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_linux.h +--- linux-3.3.8/fs/xfs/xfs_linux.h 2011-10-24 18:45:31.000000000 
+0200 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_linux.h 2012-02-24 03:55:06.000000000 +0100 +@@ -121,6 +121,7 @@ + + #define current_cpu() (raw_smp_processor_id()) + #define current_pid() (current->pid) ++#define current_fstag(vp) (dx_current_fstag((vp)->i_sb)) + #define current_test_flags(f) (current->flags & (f)) + #define current_set_flags_nested(sp, f) \ + (*(sp) = current->flags, current->flags |= (f)) +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_log_recover.c linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_log_recover.c +--- linux-3.3.8/fs/xfs/xfs_log_recover.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_log_recover.c 2012-04-03 03:02:12.000000000 +0200 +@@ -2344,7 +2344,8 @@ xlog_recover_inode_pass2( + } + + /* The core is in in-core format */ +- xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); ++ xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr, ++ mp->m_flags & XFS_MOUNT_TAGGED); + + /* the rest is in on-disk format */ + if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_mount.h linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_mount.h +--- linux-3.3.8/fs/xfs/xfs_mount.h 2012-03-19 19:47:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_mount.h 2012-02-24 03:55:06.000000000 +0100 +@@ -248,6 +248,7 @@ typedef struct xfs_mount { + allocator */ + #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ + ++#define XFS_MOUNT_TAGGED (1ULL << 31) /* context tagging */ + + /* + * Default minimum read and write sizes. +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_super.c linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_super.c +--- linux-3.3.8/fs/xfs/xfs_super.c 2012-03-19 19:47:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_super.c 2012-02-24 03:55:06.000000000 +0100 +@@ -113,6 +113,9 @@ mempool_t *xfs_ioend_pool; + #define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ + #define MNTOPT_DISCARD "discard" /* Discard unused blocks */ + #define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ ++#define MNTOPT_TAGXID "tagxid" /* context tagging for inodes */ ++#define MNTOPT_TAGGED "tag" /* context tagging for inodes */ ++#define MNTOPT_NOTAGTAG "notag" /* do not use context tagging */ + + /* + * Table driven mount option parser. +@@ -121,10 +124,14 @@ mempool_t *xfs_ioend_pool; + * in the future, too. 
+ */ + enum { ++ Opt_tag, Opt_notag, + Opt_barrier, Opt_nobarrier, Opt_err + }; + + static const match_table_t tokens = { ++ {Opt_tag, "tagxid"}, ++ {Opt_tag, "tag"}, ++ {Opt_notag, "notag"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_err, NULL} +@@ -373,6 +380,19 @@ xfs_parseargs( + } else if (!strcmp(this_char, "irixsgid")) { + xfs_warn(mp, + "irixsgid is now a sysctl(2) variable, option is deprecated."); ++#ifndef CONFIG_TAGGING_NONE ++ } else if (!strcmp(this_char, MNTOPT_TAGGED)) { ++ mp->m_flags |= XFS_MOUNT_TAGGED; ++ } else if (!strcmp(this_char, MNTOPT_NOTAGTAG)) { ++ mp->m_flags &= ~XFS_MOUNT_TAGGED; ++ } else if (!strcmp(this_char, MNTOPT_TAGXID)) { ++ mp->m_flags |= XFS_MOUNT_TAGGED; ++#endif ++#ifdef CONFIG_PROPAGATE ++ } else if (!strcmp(this_char, MNTOPT_TAGGED)) { ++ /* use value */ ++ mp->m_flags |= XFS_MOUNT_TAGGED; ++#endif + } else { + xfs_warn(mp, "unknown mount option [%s].", this_char); + return EINVAL; +@@ -1114,6 +1134,16 @@ xfs_fs_remount( + case Opt_nobarrier: + mp->m_flags &= ~XFS_MOUNT_BARRIER; + break; ++ case Opt_tag: ++ if (!(sb->s_flags & MS_TAGGED)) { ++ printk(KERN_INFO ++ "XFS: %s: tagging not permitted on remount.\n", ++ sb->s_id); ++ return -EINVAL; ++ } ++ break; ++ case Opt_notag: ++ break; + default: + /* + * Logically we would return an error here to prevent +@@ -1329,6 +1359,9 @@ xfs_fs_fill_super( + if (error) + goto out_free_sb; + ++ if (mp->m_flags & XFS_MOUNT_TAGGED) ++ sb->s_flags |= MS_TAGGED; ++ + /* + * we must configure the block size in the superblock before we run the + * full mount process as the mount process can lookup and cache inodes. +diff -NurpP --minimal linux-3.3.8/fs/xfs/xfs_vnodeops.c linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_vnodeops.c +--- linux-3.3.8/fs/xfs/xfs_vnodeops.c 2012-03-19 19:47:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/fs/xfs/xfs_vnodeops.c 2012-04-01 18:12:15.000000000 +0200 +@@ -106,6 +106,77 @@ xfs_readlink_bmap( + return error; + } + ++ ++STATIC void ++xfs_get_inode_flags( ++ xfs_inode_t *ip) ++{ ++ struct inode *inode = VFS_I(ip); ++ unsigned int flags = inode->i_flags; ++ unsigned int vflags = inode->i_vflags; ++ ++ if (flags & S_IMMUTABLE) ++ ip->i_d.di_flags |= XFS_DIFLAG_IMMUTABLE; ++ else ++ ip->i_d.di_flags &= ~XFS_DIFLAG_IMMUTABLE; ++ if (flags & S_IXUNLINK) ++ ip->i_d.di_flags |= XFS_DIFLAG_IXUNLINK; ++ else ++ ip->i_d.di_flags &= ~XFS_DIFLAG_IXUNLINK; ++ ++ if (vflags & V_BARRIER) ++ ip->i_d.di_vflags |= XFS_DIVFLAG_BARRIER; ++ else ++ ip->i_d.di_vflags &= ~XFS_DIVFLAG_BARRIER; ++ if (vflags & V_COW) ++ ip->i_d.di_vflags |= XFS_DIVFLAG_COW; ++ else ++ ip->i_d.di_vflags &= ~XFS_DIVFLAG_COW; ++} ++ ++int ++xfs_sync_flags( ++ struct inode *inode, ++ int flags, ++ int vflags) ++{ ++ struct xfs_inode *ip = XFS_I(inode); ++ struct xfs_mount *mp = ip->i_mount; ++ struct xfs_trans *tp; ++ unsigned int lock_flags = 0; ++ int code; ++ ++ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); ++ code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); ++ if (code) ++ goto error_out; ++ ++ xfs_ilock(ip, XFS_ILOCK_EXCL); ++ xfs_trans_ijoin(tp, ip, 0); ++ ++ inode->i_flags = flags; ++ inode->i_vflags = vflags; ++ xfs_get_inode_flags(ip); ++ ++ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); ++ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); ++ ++ XFS_STATS_INC(xs_ig_attrchg); ++ ++ if (mp->m_flags & XFS_MOUNT_WSYNC) ++ xfs_trans_set_sync(tp); ++ code = xfs_trans_commit(tp, 0); ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ return code; ++ ++error_out: ++ xfs_trans_cancel(tp, 0); ++ if (lock_flags) 
++ xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ return code; ++} ++ ++ + int + xfs_readlink( + xfs_inode_t *ip, +diff -NurpP --minimal linux-3.3.8/include/linux/Kbuild linux-3.3.8-vs2.3.3.4/include/linux/Kbuild +--- linux-3.3.8/include/linux/Kbuild 2012-03-19 19:47:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/Kbuild 2012-02-24 03:55:06.000000000 +0100 +@@ -17,6 +17,7 @@ header-y += netfilter_bridge/ + header-y += netfilter_ipv4/ + header-y += netfilter_ipv6/ + header-y += usb/ ++header-y += vserver/ + header-y += wimax/ + + objhdr-y += version.h +diff -NurpP --minimal linux-3.3.8/include/linux/capability.h linux-3.3.8-vs2.3.3.4/include/linux/capability.h +--- linux-3.3.8/include/linux/capability.h 2012-03-19 19:47:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/capability.h 2012-02-24 03:55:06.000000000 +0100 +@@ -280,6 +280,7 @@ struct cpu_vfs_cap_data { + arbitrary SCSI commands */ + /* Allow setting encryption key on loopback filesystem */ + /* Allow setting zone reclaim policy */ ++/* Allow the selection of a security context */ + + #define CAP_SYS_ADMIN 21 + +@@ -363,7 +364,12 @@ struct cpu_vfs_cap_data { + + #define CAP_LAST_CAP CAP_WAKE_ALARM + +-#define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) ++/* Allow context manipulations */ ++/* Allow changing context info on files */ ++ ++#define CAP_CONTEXT 63 ++ ++#define cap_valid(x) ((x) >= 0 && ((x) <= CAP_LAST_CAP || (x) == CAP_CONTEXT)) + + /* + * Bit location of each capability (used by user-space library and kernel) +diff -NurpP --minimal linux-3.3.8/include/linux/cred.h linux-3.3.8-vs2.3.3.4/include/linux/cred.h +--- linux-3.3.8/include/linux/cred.h 2012-03-19 19:47:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/cred.h 2012-02-24 03:55:06.000000000 +0100 +@@ -156,6 +156,7 @@ extern void exit_creds(struct task_struc + extern int copy_creds(struct task_struct *, unsigned long); + extern const struct cred *get_task_cred(struct task_struct *); + extern struct cred *cred_alloc_blank(void); ++extern struct cred *__prepare_creds(const struct cred *); + extern struct cred *prepare_creds(void); + extern struct cred *prepare_exec_creds(void); + extern int commit_creds(struct cred *); +@@ -209,6 +210,31 @@ static inline void validate_process_cred + } + #endif + ++static inline void set_cred_subscribers(struct cred *cred, int n) ++{ ++#ifdef CONFIG_DEBUG_CREDENTIALS ++ atomic_set(&cred->subscribers, n); ++#endif ++} ++ ++static inline int read_cred_subscribers(const struct cred *cred) ++{ ++#ifdef CONFIG_DEBUG_CREDENTIALS ++ return atomic_read(&cred->subscribers); ++#else ++ return 0; ++#endif ++} ++ ++static inline void alter_cred_subscribers(const struct cred *_cred, int n) ++{ ++#ifdef CONFIG_DEBUG_CREDENTIALS ++ struct cred *cred = (struct cred *) _cred; ++ ++ atomic_add(n, &cred->subscribers); ++#endif ++} ++ + /** + * get_new_cred - Get a reference on a new set of credentials + * @cred: The new credentials to reference +diff -NurpP --minimal linux-3.3.8/include/linux/devpts_fs.h linux-3.3.8-vs2.3.3.4/include/linux/devpts_fs.h +--- linux-3.3.8/include/linux/devpts_fs.h 2008-12-25 00:26:37.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/devpts_fs.h 2012-02-24 03:55:06.000000000 +0100 +@@ -45,5 +45,4 @@ static inline void devpts_pty_kill(struc + + #endif + +- + #endif /* _LINUX_DEVPTS_FS_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/ext2_fs.h linux-3.3.8-vs2.3.3.4/include/linux/ext2_fs.h +--- linux-3.3.8/include/linux/ext2_fs.h 2012-01-09 16:14:56.000000000 +0100 ++++ 
linux-3.3.8-vs2.3.3.4/include/linux/ext2_fs.h 2012-02-24 03:55:06.000000000 +0100 +@@ -190,8 +190,12 @@ struct ext2_group_desc + #define EXT2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */ + #define EXT2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */ + #define EXT2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ ++#define EXT2_IXUNLINK_FL FS_IXUNLINK_FL /* Immutable invert on unlink */ + #define EXT2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ + ++#define EXT2_BARRIER_FL FS_BARRIER_FL /* Barrier for chroot() */ ++#define EXT2_COW_FL FS_COW_FL /* Copy on Write marker */ ++ + #define EXT2_FL_USER_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ + #define EXT2_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ + +@@ -275,7 +279,8 @@ struct ext2_inode { + __u16 i_pad1; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ +- __u32 l_i_reserved2; ++ __le16 l_i_tag; /* Context Tag */ ++ __u16 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ +@@ -304,6 +309,7 @@ struct ext2_inode { + #define i_gid_low i_gid + #define i_uid_high osd2.linux2.l_i_uid_high + #define i_gid_high osd2.linux2.l_i_gid_high ++#define i_raw_tag osd2.linux2.l_i_tag + #define i_reserved2 osd2.linux2.l_i_reserved2 + #endif + +@@ -348,6 +354,7 @@ struct ext2_inode { + #define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ + #define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ + #define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ ++#define EXT2_MOUNT_TAGGED (1<<24) /* Enable Context Tags */ + + + #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt +diff -NurpP --minimal linux-3.3.8/include/linux/ext3_fs.h linux-3.3.8-vs2.3.3.4/include/linux/ext3_fs.h +--- linux-3.3.8/include/linux/ext3_fs.h 2012-03-19 19:47:27.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/ext3_fs.h 2012-02-24 03:55:06.000000000 +0100 +@@ -173,10 +173,14 @@ struct ext3_group_desc + #define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ + #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ + #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ ++#define EXT3_IXUNLINK_FL 0x08000000 /* Immutable invert on unlink */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +-#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ ++#define EXT3_BARRIER_FL 0x04000000 /* Barrier for chroot() */ ++#define EXT3_COW_FL 0x20000000 /* Copy on Write marker */ ++ ++#define EXT3_FL_USER_VISIBLE 0x0103DFFF /* User visible flags */ ++#define EXT3_FL_USER_MODIFIABLE 0x010380FF /* User modifiable flags */ + + /* Flags that should be inherited by new inodes from their parent. 
*/ + #define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\ +@@ -312,7 +316,8 @@ struct ext3_inode { + __u16 i_pad1; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ +- __u32 l_i_reserved2; ++ __le16 l_i_tag; /* Context Tag */ ++ __u16 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ +@@ -343,6 +348,7 @@ struct ext3_inode { + #define i_gid_low i_gid + #define i_uid_high osd2.linux2.l_i_uid_high + #define i_gid_high osd2.linux2.l_i_gid_high ++#define i_raw_tag osd2.linux2.l_i_tag + #define i_reserved2 osd2.linux2.l_i_reserved2 + + #elif defined(__GNU__) +@@ -405,6 +411,7 @@ struct ext3_inode { + #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ + #define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write + * error in ordered mode */ ++#define EXT3_MOUNT_TAGGED (1<<24) /* Enable Context Tags */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -918,6 +925,7 @@ extern void ext3_get_inode_flags(struct + extern void ext3_set_aops(struct inode *inode); + extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); ++extern int ext3_sync_flags(struct inode *, int, int); + + /* ioctl.c */ + extern long ext3_ioctl(struct file *, unsigned int, unsigned long); +diff -NurpP --minimal linux-3.3.8/include/linux/fs.h linux-3.3.8-vs2.3.3.4/include/linux/fs.h +--- linux-3.3.8/include/linux/fs.h 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/include/linux/fs.h 2012-06-08 15:27:44.000000000 +0200 +@@ -210,6 +210,9 @@ struct inodes_stat_t { + #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ + #define MS_I_VERSION (1<<23) /* Update inode I_version field */ + #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ ++#define MS_TAGGED (1<<25) /* use generic inode tagging */ ++#define MS_TAGID (1<<26) /* use specific tag for this mount */ ++#define MS_NOTAGCHECK (1<<27) /* don't check tags */ + #define MS_NOSEC (1<<28) + #define MS_BORN (1<<29) + #define MS_ACTIVE (1<<30) +@@ -241,6 +244,14 @@ struct inodes_stat_t { + #define S_IMA 1024 /* Inode has an associated IMA struct */ + #define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ + #define S_NOSEC 4096 /* no suid or xattr security attributes */ ++#define S_IXUNLINK 8192 /* Immutable Invert on unlink */ ++ ++/* Linux-VServer related Inode flags */ ++ ++#define V_VALID 1 ++#define V_XATTR 2 ++#define V_BARRIER 4 /* Barrier for chroot() */ ++#define V_COW 8 /* Copy on Write */ + + /* + * Note that nosuid etc flags are inode-specific: setting some file-system +@@ -263,12 +274,15 @@ struct inodes_stat_t { + #define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ + ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) + #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) +-#define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) +-#define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION) ++#define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) ++#define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION) ++#define IS_TAGGED(inode) __IS_FLG(inode, MS_TAGGED) + + #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) + #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) + #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) ++#define IS_IXUNLINK(inode) ((inode)->i_flags & S_IXUNLINK) ++#define IS_IXORUNLINK(inode) ((IS_IXUNLINK(inode) ? 
S_IMMUTABLE : 0) ^ IS_IMMUTABLE(inode)) + #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) + + #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) +@@ -279,6 +293,16 @@ struct inodes_stat_t { + #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) + #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) + ++#define IS_BARRIER(inode) (S_ISDIR((inode)->i_mode) && ((inode)->i_vflags & V_BARRIER)) ++ ++#ifdef CONFIG_VSERVER_COWBL ++# define IS_COW(inode) (IS_IXUNLINK(inode) && IS_IMMUTABLE(inode)) ++# define IS_COW_LINK(inode) (S_ISREG((inode)->i_mode) && ((inode)->i_nlink > 1)) ++#else ++# define IS_COW(inode) (0) ++# define IS_COW_LINK(inode) (0) ++#endif ++ + /* the read-only stuff doesn't really belong here, but any other place is + probably as bad and I don't want to create yet another include file. */ + +@@ -365,11 +389,14 @@ struct inodes_stat_t { + #define FS_EXTENT_FL 0x00080000 /* Extents */ + #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ + #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ ++#define FS_IXUNLINK_FL 0x08000000 /* Immutable invert on unlink */ + #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ + +-#define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +-#define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ ++#define FS_BARRIER_FL 0x04000000 /* Barrier for chroot() */ ++#define FS_COW_FL 0x20000000 /* Copy on Write marker */ + ++#define FS_FL_USER_VISIBLE 0x0103DFFF /* User visible flags */ ++#define FS_FL_USER_MODIFIABLE 0x010380FF /* User modifiable flags */ + + #define SYNC_FILE_RANGE_WAIT_BEFORE 1 + #define SYNC_FILE_RANGE_WRITE 2 +@@ -451,6 +478,7 @@ typedef void (dio_iodone_t)(struct kiocb + #define ATTR_KILL_PRIV (1 << 14) + #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ + #define ATTR_TIMES_SET (1 << 16) ++#define ATTR_TAG (1 << 17) + + /* + * This is the Inode Attributes structure, used for notify_change(). It +@@ -466,6 +494,7 @@ struct iattr { + umode_t ia_mode; + uid_t ia_uid; + gid_t ia_gid; ++ tag_t ia_tag; + loff_t ia_size; + struct timespec ia_atime; + struct timespec ia_mtime; +@@ -479,6 +508,9 @@ struct iattr { + struct file *ia_file; + }; + ++#define ATTR_FLAG_BARRIER 512 /* Barrier for chroot() */ ++#define ATTR_FLAG_IXUNLINK 1024 /* Immutable invert on unlink */ ++ + /* + * Includes for diskquotas. 
+ */ +@@ -758,7 +790,9 @@ struct inode { + unsigned short i_opflags; + uid_t i_uid; + gid_t i_gid; +- unsigned int i_flags; ++ tag_t i_tag; ++ unsigned short i_flags; ++ unsigned short i_vflags; + + #ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *i_acl; +@@ -787,6 +821,7 @@ struct inode { + unsigned int __i_nlink; + }; + dev_t i_rdev; ++ dev_t i_mdev; + struct timespec i_atime; + struct timespec i_mtime; + struct timespec i_ctime; +@@ -924,12 +959,12 @@ static inline void i_size_write(struct i + + static inline unsigned iminor(const struct inode *inode) + { +- return MINOR(inode->i_rdev); ++ return MINOR(inode->i_mdev); + } + + static inline unsigned imajor(const struct inode *inode) + { +- return MAJOR(inode->i_rdev); ++ return MAJOR(inode->i_mdev); + } + + extern struct block_device *I_BDEV(struct inode *inode); +@@ -996,6 +1031,7 @@ struct file { + loff_t f_pos; + struct fown_struct f_owner; + const struct cred *f_cred; ++ xid_t f_xid; + struct file_ra_state f_ra; + + u64 f_version; +@@ -1143,6 +1179,7 @@ struct file_lock { + struct file *fl_file; + loff_t fl_start; + loff_t fl_end; ++ xid_t fl_xid; + + struct fasync_struct * fl_fasync; /* for lease break notifications */ + /* for lease breaks: */ +@@ -1655,6 +1692,7 @@ struct inode_operations { + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr) (struct dentry *, char *, size_t); + int (*removexattr) (struct dentry *, const char *); ++ int (*sync_flags) (struct inode *, int, int); + void (*truncate_range)(struct inode *, loff_t, loff_t); + int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, + u64 len); +@@ -1674,6 +1712,7 @@ extern ssize_t vfs_readv(struct file *, + unsigned long, loff_t *); + extern ssize_t vfs_writev(struct file *, const struct iovec __user *, + unsigned long, loff_t *); ++ssize_t vfs_sendfile(struct file *, struct file *, loff_t *, size_t, loff_t); + + struct super_operations { + struct inode *(*alloc_inode)(struct super_block *sb); +@@ -2513,6 +2552,7 @@ extern int dcache_dir_open(struct inode + extern int dcache_dir_close(struct inode *, struct file *); + extern loff_t dcache_dir_lseek(struct file *, loff_t, int); + extern int dcache_readdir(struct file *, void *, filldir_t); ++extern int dcache_readdir_filter(struct file *, void *, filldir_t, int (*)(struct dentry *)); + extern int simple_setattr(struct dentry *, struct iattr *); + extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); + extern int simple_statfs(struct dentry *, struct kstatfs *); +diff -NurpP --minimal linux-3.3.8/include/linux/gfs2_ondisk.h linux-3.3.8-vs2.3.3.4/include/linux/gfs2_ondisk.h +--- linux-3.3.8/include/linux/gfs2_ondisk.h 2012-03-19 19:47:28.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/gfs2_ondisk.h 2012-02-24 03:55:06.000000000 +0100 +@@ -213,6 +213,9 @@ enum { + gfs2fl_NoAtime = 7, + gfs2fl_Sync = 8, + gfs2fl_System = 9, ++ gfs2fl_IXUnlink = 16, ++ gfs2fl_Barrier = 17, ++ gfs2fl_Cow = 18, + gfs2fl_TruncInProg = 29, + gfs2fl_InheritDirectio = 30, + gfs2fl_InheritJdata = 31, +@@ -229,6 +232,9 @@ enum { + #define GFS2_DIF_NOATIME 0x00000080 + #define GFS2_DIF_SYNC 0x00000100 + #define GFS2_DIF_SYSTEM 0x00000200 /* New in gfs2 */ ++#define GFS2_DIF_IXUNLINK 0x00010000 ++#define GFS2_DIF_BARRIER 0x00020000 ++#define GFS2_DIF_COW 0x00040000 + #define GFS2_DIF_TRUNC_IN_PROG 0x20000000 /* New in gfs2 */ + #define GFS2_DIF_INHERIT_DIRECTIO 0x40000000 + #define GFS2_DIF_INHERIT_JDATA 0x80000000 +diff -NurpP --minimal 
linux-3.3.8/include/linux/if_tun.h linux-3.3.8-vs2.3.3.4/include/linux/if_tun.h +--- linux-3.3.8/include/linux/if_tun.h 2010-08-02 16:52:54.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/include/linux/if_tun.h 2012-02-24 03:55:06.000000000 +0100 +@@ -53,6 +53,7 @@ + #define TUNDETACHFILTER _IOW('T', 214, struct sock_fprog) + #define TUNGETVNETHDRSZ _IOR('T', 215, int) + #define TUNSETVNETHDRSZ _IOW('T', 216, int) ++#define TUNSETNID _IOW('T', 217, int) + + /* TUNSETIFF ifr flags */ + #define IFF_TUN 0x0001 +diff -NurpP --minimal linux-3.3.8/include/linux/init_task.h linux-3.3.8-vs2.3.3.4/include/linux/init_task.h +--- linux-3.3.8/include/linux/init_task.h 2012-03-19 19:47:28.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/init_task.h 2012-02-24 03:55:06.000000000 +0100 +@@ -192,6 +192,10 @@ extern struct cred init_cred; + INIT_FTRACE_GRAPH \ + INIT_TRACE_RECURSION \ + INIT_TASK_RCU_PREEMPT(tsk) \ ++ .xid = 0, \ ++ .vx_info = NULL, \ ++ .nid = 0, \ ++ .nx_info = NULL, \ + } + + +diff -NurpP --minimal linux-3.3.8/include/linux/ipc.h linux-3.3.8-vs2.3.3.4/include/linux/ipc.h +--- linux-3.3.8/include/linux/ipc.h 2012-03-19 19:47:28.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/ipc.h 2012-02-24 03:55:06.000000000 +0100 +@@ -91,6 +91,7 @@ struct kern_ipc_perm + key_t key; + uid_t uid; + gid_t gid; ++ xid_t xid; + uid_t cuid; + gid_t cgid; + umode_t mode; +diff -NurpP --minimal linux-3.3.8/include/linux/ipc_namespace.h linux-3.3.8-vs2.3.3.4/include/linux/ipc_namespace.h +--- linux-3.3.8/include/linux/ipc_namespace.h 2011-10-24 18:45:32.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/include/linux/ipc_namespace.h 2012-02-24 03:55:06.000000000 +0100 +@@ -101,7 +101,8 @@ static inline int mq_init_ns(struct ipc_ + + #if defined(CONFIG_IPC_NS) + extern struct ipc_namespace *copy_ipcs(unsigned long flags, +- struct task_struct *tsk); ++ struct ipc_namespace *old_ns, ++ struct user_namespace *user_ns); + static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) + { + if (ns) +@@ -112,12 +113,13 @@ static inline struct ipc_namespace *get_ + extern void put_ipc_ns(struct ipc_namespace *ns); + #else + static inline struct ipc_namespace *copy_ipcs(unsigned long flags, +- struct task_struct *tsk) ++ struct ipc_namespace *old_ns, ++ struct user_namespace *user_ns) + { + if (flags & CLONE_NEWIPC) + return ERR_PTR(-EINVAL); + +- return tsk->nsproxy->ipc_ns; ++ return old_ns; + } + + static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) +diff -NurpP --minimal linux-3.3.8/include/linux/loop.h linux-3.3.8-vs2.3.3.4/include/linux/loop.h +--- linux-3.3.8/include/linux/loop.h 2012-01-09 16:14:58.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/loop.h 2012-02-24 03:55:06.000000000 +0100 +@@ -45,6 +45,7 @@ struct loop_device { + struct loop_func_table *lo_encryption; + __u32 lo_init[2]; + uid_t lo_key_owner; /* Who set the key */ ++ xid_t lo_xid; + int (*ioctl)(struct loop_device *, int cmd, + unsigned long arg); + +diff -NurpP --minimal linux-3.3.8/include/linux/magic.h linux-3.3.8-vs2.3.3.4/include/linux/magic.h +--- linux-3.3.8/include/linux/magic.h 2012-01-09 16:14:58.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/magic.h 2012-02-24 03:55:06.000000000 +0100 +@@ -3,7 +3,7 @@ + + #define ADFS_SUPER_MAGIC 0xadf5 + #define AFFS_SUPER_MAGIC 0xadff +-#define AFS_SUPER_MAGIC 0x5346414F ++#define AFS_SUPER_MAGIC 0x5346414F + #define AUTOFS_SUPER_MAGIC 0x0187 + #define CODA_SUPER_MAGIC 0x73757245 + #define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ +@@ 
-41,6 +41,7 @@ + #define NFS_SUPER_MAGIC 0x6969 + #define OPENPROM_SUPER_MAGIC 0x9fa1 + #define PROC_SUPER_MAGIC 0x9fa0 ++#define DEVPTS_SUPER_MAGIC 0x1cd1 + #define QNX4_SUPER_MAGIC 0x002f /* qnx4 fs detection */ + + #define REISERFS_SUPER_MAGIC 0x52654973 /* used by gcc */ +diff -NurpP --minimal linux-3.3.8/include/linux/major.h linux-3.3.8-vs2.3.3.4/include/linux/major.h +--- linux-3.3.8/include/linux/major.h 2009-09-10 15:26:25.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/include/linux/major.h 2012-02-24 03:55:06.000000000 +0100 +@@ -15,6 +15,7 @@ + #define HD_MAJOR IDE0_MAJOR + #define PTY_SLAVE_MAJOR 3 + #define TTY_MAJOR 4 ++#define VROOT_MAJOR 4 + #define TTYAUX_MAJOR 5 + #define LP_MAJOR 6 + #define VCS_MAJOR 7 +diff -NurpP --minimal linux-3.3.8/include/linux/memcontrol.h linux-3.3.8-vs2.3.3.4/include/linux/memcontrol.h +--- linux-3.3.8/include/linux/memcontrol.h 2012-03-19 19:47:28.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/memcontrol.h 2012-03-19 20:52:10.000000000 +0100 +@@ -87,6 +87,13 @@ extern struct mem_cgroup *try_get_mem_cg + extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); + extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont); + ++extern u64 mem_cgroup_res_read_u64(struct mem_cgroup *mem, int member); ++extern u64 mem_cgroup_memsw_read_u64(struct mem_cgroup *mem, int member); ++ ++extern s64 mem_cgroup_stat_read_cache(struct mem_cgroup *mem); ++extern s64 mem_cgroup_stat_read_anon(struct mem_cgroup *mem); ++extern s64 mem_cgroup_stat_read_mapped(struct mem_cgroup *mem); ++ + static inline + int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) + { +diff -NurpP --minimal linux-3.3.8/include/linux/mm_types.h linux-3.3.8-vs2.3.3.4/include/linux/mm_types.h +--- linux-3.3.8/include/linux/mm_types.h 2012-03-19 19:47:28.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/mm_types.h 2012-02-24 03:55:06.000000000 +0100 +@@ -343,6 +343,7 @@ struct mm_struct { + + /* Architecture-specific MM context */ + mm_context_t context; ++ struct vx_info *mm_vx_info; + + /* Swap token stuff */ + /* +diff -NurpP --minimal linux-3.3.8/include/linux/mmzone.h linux-3.3.8-vs2.3.3.4/include/linux/mmzone.h +--- linux-3.3.8/include/linux/mmzone.h 2012-03-19 19:47:28.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/mmzone.h 2012-02-24 03:55:06.000000000 +0100 +@@ -683,6 +683,13 @@ typedef struct pglist_data { + __pgdat->node_start_pfn + __pgdat->node_spanned_pages;\ + }) + ++#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) ++ ++#define node_end_pfn(nid) ({\ ++ pg_data_t *__pgdat = NODE_DATA(nid);\ ++ __pgdat->node_start_pfn + __pgdat->node_spanned_pages;\ ++}) ++ + #include + + extern struct mutex zonelists_mutex; +diff -NurpP --minimal linux-3.3.8/include/linux/mount.h linux-3.3.8-vs2.3.3.4/include/linux/mount.h +--- linux-3.3.8/include/linux/mount.h 2012-03-19 19:47:28.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/mount.h 2012-02-24 17:29:28.000000000 +0100 +@@ -47,6 +47,9 @@ struct mnt_namespace; + + #define MNT_INTERNAL 0x4000 + ++#define MNT_TAGID 0x10000 ++#define MNT_NOTAG 0x20000 ++ + struct vfsmount { + struct dentry *mnt_root; /* root of the mounted tree */ + struct super_block *mnt_sb; /* pointer to superblock */ +diff -NurpP --minimal linux-3.3.8/include/linux/net.h linux-3.3.8-vs2.3.3.4/include/linux/net.h +--- linux-3.3.8/include/linux/net.h 2011-07-22 11:18:11.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/include/linux/net.h 2012-02-24 03:55:06.000000000 +0100 +@@ -72,6 +72,7 @@ 
struct net; + #define SOCK_NOSPACE 2 + #define SOCK_PASSCRED 3 + #define SOCK_PASSSEC 4 ++#define SOCK_USER_SOCKET 5 + + #ifndef ARCH_HAS_SOCKET_TYPES + /** +diff -NurpP --minimal linux-3.3.8/include/linux/netdevice.h linux-3.3.8-vs2.3.3.4/include/linux/netdevice.h +--- linux-3.3.8/include/linux/netdevice.h 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/include/linux/netdevice.h 2012-05-15 07:09:24.000000000 +0200 +@@ -1627,6 +1627,7 @@ extern void netdev_resync_ops(struct ne + + extern struct net_device *dev_get_by_index(struct net *net, int ifindex); + extern struct net_device *__dev_get_by_index(struct net *net, int ifindex); ++extern struct net_device *dev_get_by_index_real_rcu(struct net *net, int ifindex); + extern struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); + extern int dev_restart(struct net_device *dev); + #ifdef CONFIG_NETPOLL_TRAP +diff -NurpP --minimal linux-3.3.8/include/linux/nfs_mount.h linux-3.3.8-vs2.3.3.4/include/linux/nfs_mount.h +--- linux-3.3.8/include/linux/nfs_mount.h 2011-01-05 21:50:31.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/nfs_mount.h 2012-02-24 03:55:06.000000000 +0100 +@@ -63,7 +63,8 @@ struct nfs_mount_data { + #define NFS_MOUNT_SECFLAVOUR 0x2000 /* 5 */ + #define NFS_MOUNT_NORDIRPLUS 0x4000 /* 5 */ + #define NFS_MOUNT_UNSHARED 0x8000 /* 5 */ +-#define NFS_MOUNT_FLAGMASK 0xFFFF ++#define NFS_MOUNT_TAGGED 0x10000 /* context tagging */ ++#define NFS_MOUNT_FLAGMASK 0x1FFFF + + /* The following are for internal use only */ + #define NFS_MOUNT_LOOKUP_CACHE_NONEG 0x10000 +diff -NurpP --minimal linux-3.3.8/include/linux/nsproxy.h linux-3.3.8-vs2.3.3.4/include/linux/nsproxy.h +--- linux-3.3.8/include/linux/nsproxy.h 2011-10-24 18:45:32.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/include/linux/nsproxy.h 2012-02-24 03:55:06.000000000 +0100 +@@ -3,6 +3,7 @@ + + #include + #include ++#include + + struct mnt_namespace; + struct uts_namespace; +@@ -63,6 +64,7 @@ static inline struct nsproxy *task_nspro + } + + int copy_namespaces(unsigned long flags, struct task_struct *tsk); ++struct nsproxy *copy_nsproxy(struct nsproxy *orig); + void exit_task_namespaces(struct task_struct *tsk); + void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); + void free_nsproxy(struct nsproxy *ns); +@@ -70,16 +72,26 @@ int unshare_nsproxy_namespaces(unsigned + struct fs_struct *); + int __init nsproxy_cache_init(void); + +-static inline void put_nsproxy(struct nsproxy *ns) ++#define get_nsproxy(n) __get_nsproxy(n, __FILE__, __LINE__) ++ ++static inline void __get_nsproxy(struct nsproxy *ns, ++ const char *_file, int _line) + { +- if (atomic_dec_and_test(&ns->count)) { +- free_nsproxy(ns); +- } ++ vxlprintk(VXD_CBIT(space, 0), "get_nsproxy(%p[%u])", ++ ns, atomic_read(&ns->count), _file, _line); ++ atomic_inc(&ns->count); + } + +-static inline void get_nsproxy(struct nsproxy *ns) ++#define put_nsproxy(n) __put_nsproxy(n, __FILE__, __LINE__) ++ ++static inline void __put_nsproxy(struct nsproxy *ns, ++ const char *_file, int _line) + { +- atomic_inc(&ns->count); ++ vxlprintk(VXD_CBIT(space, 0), "put_nsproxy(%p[%u])", ++ ns, atomic_read(&ns->count), _file, _line); ++ if (atomic_dec_and_test(&ns->count)) { ++ free_nsproxy(ns); ++ } + } + + #endif +diff -NurpP --minimal linux-3.3.8/include/linux/pid.h linux-3.3.8-vs2.3.3.4/include/linux/pid.h +--- linux-3.3.8/include/linux/pid.h 2011-07-22 11:18:11.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/include/linux/pid.h 2012-02-24 03:55:06.000000000 +0100 +@@ -8,7 +8,8 @@ enum 
pid_type + PIDTYPE_PID, + PIDTYPE_PGID, + PIDTYPE_SID, +- PIDTYPE_MAX ++ PIDTYPE_MAX, ++ PIDTYPE_REALPID + }; + + /* +@@ -171,6 +172,7 @@ static inline pid_t pid_nr(struct pid *p + } + + pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns); ++pid_t pid_unmapped_nr_ns(struct pid *pid, struct pid_namespace *ns); + pid_t pid_vnr(struct pid *pid); + + #define do_each_pid_task(pid, type, task) \ +diff -NurpP --minimal linux-3.3.8/include/linux/proc_fs.h linux-3.3.8-vs2.3.3.4/include/linux/proc_fs.h +--- linux-3.3.8/include/linux/proc_fs.h 2012-03-19 19:47:28.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/proc_fs.h 2012-02-24 04:17:21.000000000 +0100 +@@ -54,6 +54,7 @@ struct proc_dir_entry { + nlink_t nlink; + uid_t uid; + gid_t gid; ++ int vx_flags; + loff_t size; + const struct inode_operations *proc_iops; + /* +@@ -252,12 +253,18 @@ extern const struct proc_ns_operations n + extern const struct proc_ns_operations utsns_operations; + extern const struct proc_ns_operations ipcns_operations; + ++struct vx_info; ++struct nx_info; ++ + union proc_op { + int (*proc_get_link)(struct dentry *, struct path *); + int (*proc_read)(struct task_struct *task, char *page); + int (*proc_show)(struct seq_file *m, + struct pid_namespace *ns, struct pid *pid, + struct task_struct *task); ++ int (*proc_vs_read)(char *page); ++ int (*proc_vxi_read)(struct vx_info *vxi, char *page); ++ int (*proc_nxi_read)(struct nx_info *nxi, char *page); + }; + + struct ctl_table_header; +@@ -265,6 +272,7 @@ struct ctl_table; + + struct proc_inode { + struct pid *pid; ++ int vx_flags; + int fd; + union proc_op op; + struct proc_dir_entry *pde; +diff -NurpP --minimal linux-3.3.8/include/linux/quotaops.h linux-3.3.8-vs2.3.3.4/include/linux/quotaops.h +--- linux-3.3.8/include/linux/quotaops.h 2012-01-09 16:14:58.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/quotaops.h 2012-02-24 03:55:06.000000000 +0100 +@@ -8,6 +8,7 @@ + #define _LINUX_QUOTAOPS_ + + #include ++#include + + #define DQUOT_SPACE_WARN 0x1 + #define DQUOT_SPACE_RESERVE 0x2 +@@ -204,11 +205,12 @@ static inline void dquot_drop(struct ino + + static inline int dquot_alloc_inode(const struct inode *inode) + { +- return 0; ++ return dl_alloc_inode(inode); + } + + static inline void dquot_free_inode(const struct inode *inode) + { ++ dl_free_inode(inode); + } + + static inline int dquot_transfer(struct inode *inode, struct iattr *iattr) +@@ -219,6 +221,10 @@ static inline int dquot_transfer(struct + static inline int __dquot_alloc_space(struct inode *inode, qsize_t number, + int flags) + { ++ int ret = 0; ++ ++ if ((ret = dl_alloc_space(inode, number))) ++ return ret; + if (!(flags & DQUOT_SPACE_RESERVE)) + inode_add_bytes(inode, number); + return 0; +@@ -229,6 +235,7 @@ static inline void __dquot_free_space(st + { + if (!(flags & DQUOT_SPACE_RESERVE)) + inode_sub_bytes(inode, number); ++ dl_free_space(inode, number); + } + + static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) +diff -NurpP --minimal linux-3.3.8/include/linux/reboot.h linux-3.3.8-vs2.3.3.4/include/linux/reboot.h +--- linux-3.3.8/include/linux/reboot.h 2011-10-24 18:45:32.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/include/linux/reboot.h 2012-02-24 03:55:06.000000000 +0100 +@@ -33,6 +33,7 @@ + #define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4 + #define LINUX_REBOOT_CMD_SW_SUSPEND 0xD000FCE2 + #define LINUX_REBOOT_CMD_KEXEC 0x45584543 ++#define LINUX_REBOOT_CMD_OOM 0xDEADBEEF + + + #ifdef __KERNEL__ +diff -NurpP --minimal linux-3.3.8/include/linux/reiserfs_fs.h 
linux-3.3.8-vs2.3.3.4/include/linux/reiserfs_fs.h +--- linux-3.3.8/include/linux/reiserfs_fs.h 2012-03-19 19:47:28.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/reiserfs_fs.h 2012-02-24 03:55:06.000000000 +0100 +@@ -976,6 +976,11 @@ struct stat_data_v1 { + #define REISERFS_COMPR_FL FS_COMPR_FL + #define REISERFS_NOTAIL_FL FS_NOTAIL_FL + ++/* unfortunately reiserfs sdattr is only 16 bit */ ++#define REISERFS_IXUNLINK_FL (FS_IXUNLINK_FL >> 16) ++#define REISERFS_BARRIER_FL (FS_BARRIER_FL >> 16) ++#define REISERFS_COW_FL (FS_COW_FL >> 16) ++ + /* persistent flags that file inherits from the parent directory */ + #define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL | \ + REISERFS_SYNC_FL | \ +@@ -985,6 +990,9 @@ struct stat_data_v1 { + REISERFS_COMPR_FL | \ + REISERFS_NOTAIL_FL ) + ++#define REISERFS_FL_USER_VISIBLE 0x80FF ++#define REISERFS_FL_USER_MODIFIABLE 0x80FF ++ + /* Stat Data on disk (reiserfs version of UFS disk inode minus the + address blocks) */ + struct stat_data { +@@ -2074,6 +2082,7 @@ static inline void reiserfs_update_sd(st + void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); + void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs); + int reiserfs_setattr(struct dentry *dentry, struct iattr *attr); ++int reiserfs_sync_flags(struct inode *inode, int, int); + + int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); + +diff -NurpP --minimal linux-3.3.8/include/linux/reiserfs_fs_sb.h linux-3.3.8-vs2.3.3.4/include/linux/reiserfs_fs_sb.h +--- linux-3.3.8/include/linux/reiserfs_fs_sb.h 2012-03-19 19:47:28.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/reiserfs_fs_sb.h 2012-02-24 03:55:06.000000000 +0100 +@@ -477,6 +477,7 @@ enum reiserfs_mount_options { + REISERFS_EXPOSE_PRIVROOT, + REISERFS_BARRIER_NONE, + REISERFS_BARRIER_FLUSH, ++ REISERFS_TAGGED, + + /* Actions on error */ + REISERFS_ERROR_PANIC, +diff -NurpP --minimal linux-3.3.8/include/linux/sched.h linux-3.3.8-vs2.3.3.4/include/linux/sched.h +--- linux-3.3.8/include/linux/sched.h 2012-03-19 19:47:28.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/sched.h 2012-03-19 20:52:10.000000000 +0100 +@@ -1420,6 +1420,14 @@ struct task_struct { + #endif + seccomp_t seccomp; + ++/* vserver context data */ ++ struct vx_info *vx_info; ++ struct nx_info *nx_info; ++ ++ xid_t xid; ++ nid_t nid; ++ tag_t tag; ++ + /* Thread group tracking */ + u32 parent_exec_id; + u32 self_exec_id; +@@ -1669,6 +1677,11 @@ struct pid_namespace; + pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + struct pid_namespace *ns); + ++#include ++#include ++#include ++#include ++ + static inline pid_t task_pid_nr(struct task_struct *tsk) + { + return tsk->pid; +@@ -1682,7 +1695,8 @@ static inline pid_t task_pid_nr_ns(struc + + static inline pid_t task_pid_vnr(struct task_struct *tsk) + { +- return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); ++ // return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); ++ return vx_map_pid(__task_pid_nr_ns(tsk, PIDTYPE_PID, NULL)); + } + + +@@ -1695,7 +1709,7 @@ pid_t task_tgid_nr_ns(struct task_struct + + static inline pid_t task_tgid_vnr(struct task_struct *tsk) + { +- return pid_vnr(task_tgid(tsk)); ++ return vx_map_tgid(pid_vnr(task_tgid(tsk))); + } + + +diff -NurpP --minimal linux-3.3.8/include/linux/shmem_fs.h linux-3.3.8-vs2.3.3.4/include/linux/shmem_fs.h +--- linux-3.3.8/include/linux/shmem_fs.h 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/shmem_fs.h 2012-02-24 03:55:06.000000000 +0100 +@@ -8,6 +8,9 @@ + + /* 
inode in-kernel data */ + ++#define TMPFS_SUPER_MAGIC 0x01021994 ++ ++ + struct shmem_inode_info { + spinlock_t lock; + unsigned long flags; +diff -NurpP --minimal linux-3.3.8/include/linux/stat.h linux-3.3.8-vs2.3.3.4/include/linux/stat.h +--- linux-3.3.8/include/linux/stat.h 2008-12-25 00:26:37.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/stat.h 2012-02-24 03:55:06.000000000 +0100 +@@ -66,6 +66,7 @@ struct kstat { + unsigned int nlink; + uid_t uid; + gid_t gid; ++ tag_t tag; + dev_t rdev; + loff_t size; + struct timespec atime; +diff -NurpP --minimal linux-3.3.8/include/linux/sunrpc/auth.h linux-3.3.8-vs2.3.3.4/include/linux/sunrpc/auth.h +--- linux-3.3.8/include/linux/sunrpc/auth.h 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/sunrpc/auth.h 2012-02-24 03:55:06.000000000 +0100 +@@ -25,6 +25,7 @@ + struct auth_cred { + uid_t uid; + gid_t gid; ++ tag_t tag; + struct group_info *group_info; + const char *principal; + unsigned char machine_cred : 1; +diff -NurpP --minimal linux-3.3.8/include/linux/sunrpc/clnt.h linux-3.3.8-vs2.3.3.4/include/linux/sunrpc/clnt.h +--- linux-3.3.8/include/linux/sunrpc/clnt.h 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/sunrpc/clnt.h 2012-02-24 03:55:06.000000000 +0100 +@@ -50,7 +50,8 @@ struct rpc_clnt { + unsigned int cl_softrtry : 1,/* soft timeouts */ + cl_discrtry : 1,/* disconnect before retry */ + cl_autobind : 1,/* use getport() */ +- cl_chatty : 1;/* be verbose */ ++ cl_chatty : 1,/* be verbose */ ++ cl_tag : 1;/* context tagging */ + + struct rpc_rtt * cl_rtt; /* RTO estimator data */ + const struct rpc_timeout *cl_timeout; /* Timeout strategy */ +diff -NurpP --minimal linux-3.3.8/include/linux/sysctl.h linux-3.3.8-vs2.3.3.4/include/linux/sysctl.h +--- linux-3.3.8/include/linux/sysctl.h 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/sysctl.h 2012-02-24 03:55:06.000000000 +0100 +@@ -60,6 +60,7 @@ enum + CTL_ABI=9, /* Binary emulation */ + CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ + CTL_ARLAN=254, /* arlan wireless driver */ ++ CTL_VSERVER=4242, /* Linux-VServer debug */ + CTL_S390DBF=5677, /* s390 debug */ + CTL_SUNRPC=7249, /* sunrpc debug */ + CTL_PM=9899, /* frv power management */ +@@ -94,6 +95,7 @@ enum + + KERN_PANIC=15, /* int: panic timeout */ + KERN_REALROOTDEV=16, /* real root device to mount after initrd */ ++ KERN_VSHELPER=17, /* string: path to vshelper policy agent */ + + KERN_SPARC_REBOOT=21, /* reboot command on Sparc */ + KERN_CTLALTDEL=22, /* int: allow ctl-alt-del to reboot */ +diff -NurpP --minimal linux-3.3.8/include/linux/sysfs.h linux-3.3.8-vs2.3.3.4/include/linux/sysfs.h +--- linux-3.3.8/include/linux/sysfs.h 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/sysfs.h 2012-02-24 03:55:06.000000000 +0100 +@@ -19,6 +19,8 @@ + #include + #include + ++#define SYSFS_SUPER_MAGIC 0x62656572 ++ + struct kobject; + struct module; + enum kobj_ns_type; +diff -NurpP --minimal linux-3.3.8/include/linux/time.h linux-3.3.8-vs2.3.3.4/include/linux/time.h +--- linux-3.3.8/include/linux/time.h 2011-07-22 11:18:11.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/include/linux/time.h 2012-02-24 03:55:06.000000000 +0100 +@@ -256,6 +256,9 @@ static __always_inline void timespec_add + a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); + a->tv_nsec = ns; + } ++ ++#include ++ + #endif /* __KERNEL__ */ + + #define NFDBITS __NFDBITS +diff -NurpP --minimal linux-3.3.8/include/linux/types.h 
linux-3.3.8-vs2.3.3.4/include/linux/types.h +--- linux-3.3.8/include/linux/types.h 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/types.h 2012-02-24 03:55:06.000000000 +0100 +@@ -41,6 +41,9 @@ typedef __kernel_uid32_t uid_t; + typedef __kernel_gid32_t gid_t; + typedef __kernel_uid16_t uid16_t; + typedef __kernel_gid16_t gid16_t; ++typedef unsigned int xid_t; ++typedef unsigned int nid_t; ++typedef unsigned int tag_t; + + typedef unsigned long uintptr_t; + +diff -NurpP --minimal linux-3.3.8/include/linux/utsname.h linux-3.3.8-vs2.3.3.4/include/linux/utsname.h +--- linux-3.3.8/include/linux/utsname.h 2012-01-09 16:14:59.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/utsname.h 2012-02-24 03:55:06.000000000 +0100 +@@ -62,7 +62,8 @@ static inline void get_uts_ns(struct uts + } + + extern struct uts_namespace *copy_utsname(unsigned long flags, +- struct task_struct *tsk); ++ struct uts_namespace *old_ns, ++ struct user_namespace *user_ns); + extern void free_uts_ns(struct kref *kref); + + static inline void put_uts_ns(struct uts_namespace *ns) +@@ -79,12 +80,13 @@ static inline void put_uts_ns(struct uts + } + + static inline struct uts_namespace *copy_utsname(unsigned long flags, +- struct task_struct *tsk) ++ struct uts_namespace *old_ns, ++ struct user_namespace *user_ns) + { + if (flags & CLONE_NEWUTS) + return ERR_PTR(-EINVAL); + +- return tsk->nsproxy->uts_ns; ++ return old_ns; + } + #endif + +diff -NurpP --minimal linux-3.3.8/include/linux/vroot.h linux-3.3.8-vs2.3.3.4/include/linux/vroot.h +--- linux-3.3.8/include/linux/vroot.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vroot.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,51 @@ ++ ++/* ++ * include/linux/vroot.h ++ * ++ * written by Herbert Pötzl, 9/11/2002 ++ * ported to 2.6 by Herbert Pötzl, 30/12/2004 ++ * ++ * Copyright (C) 2002-2007 by Herbert Pötzl. ++ * Redistribution of this file is permitted under the ++ * GNU General Public License. 
++ */ ++ ++#ifndef _LINUX_VROOT_H ++#define _LINUX_VROOT_H ++ ++ ++#ifdef __KERNEL__ ++ ++/* Possible states of device */ ++enum { ++ Vr_unbound, ++ Vr_bound, ++}; ++ ++struct vroot_device { ++ int vr_number; ++ int vr_refcnt; ++ ++ struct semaphore vr_ctl_mutex; ++ struct block_device *vr_device; ++ int vr_state; ++}; ++ ++ ++typedef struct block_device *(vroot_grb_func)(struct block_device *); ++ ++extern int register_vroot_grb(vroot_grb_func *); ++extern int unregister_vroot_grb(vroot_grb_func *); ++ ++#endif /* __KERNEL__ */ ++ ++#define MAX_VROOT_DEFAULT 8 ++ ++/* ++ * IOCTL commands --- we will commandeer 0x56 ('V') ++ */ ++ ++#define VROOT_SET_DEV 0x5600 ++#define VROOT_CLR_DEV 0x5601 ++ ++#endif /* _LINUX_VROOT_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vs_base.h linux-3.3.8-vs2.3.3.4/include/linux/vs_base.h +--- linux-3.3.8/include/linux/vs_base.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_base.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,10 @@ ++#ifndef _VS_BASE_H ++#define _VS_BASE_H ++ ++#include "vserver/base.h" ++#include "vserver/check.h" ++#include "vserver/debug.h" ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_context.h linux-3.3.8-vs2.3.3.4/include/linux/vs_context.h +--- linux-3.3.8/include/linux/vs_context.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_context.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,242 @@ ++#ifndef _VS_CONTEXT_H ++#define _VS_CONTEXT_H ++ ++#include "vserver/base.h" ++#include "vserver/check.h" ++#include "vserver/context.h" ++#include "vserver/history.h" ++#include "vserver/debug.h" ++ ++#include ++ ++ ++#define get_vx_info(i) __get_vx_info(i, __FILE__, __LINE__, __HERE__) ++ ++static inline struct vx_info *__get_vx_info(struct vx_info *vxi, ++ const char *_file, int _line, void *_here) ++{ ++ if (!vxi) ++ return NULL; ++ ++ vxlprintk(VXD_CBIT(xid, 2), "get_vx_info(%p[#%d.%d])", ++ vxi, vxi ? vxi->vx_id : 0, ++ vxi ? atomic_read(&vxi->vx_usecnt) : 0, ++ _file, _line); ++ __vxh_get_vx_info(vxi, _here); ++ ++ atomic_inc(&vxi->vx_usecnt); ++ return vxi; ++} ++ ++ ++extern void free_vx_info(struct vx_info *); ++ ++#define put_vx_info(i) __put_vx_info(i, __FILE__, __LINE__, __HERE__) ++ ++static inline void __put_vx_info(struct vx_info *vxi, ++ const char *_file, int _line, void *_here) ++{ ++ if (!vxi) ++ return; ++ ++ vxlprintk(VXD_CBIT(xid, 2), "put_vx_info(%p[#%d.%d])", ++ vxi, vxi ? vxi->vx_id : 0, ++ vxi ? atomic_read(&vxi->vx_usecnt) : 0, ++ _file, _line); ++ __vxh_put_vx_info(vxi, _here); ++ ++ if (atomic_dec_and_test(&vxi->vx_usecnt)) ++ free_vx_info(vxi); ++} ++ ++ ++#define init_vx_info(p, i) \ ++ __init_vx_info(p, i, __FILE__, __LINE__, __HERE__) ++ ++static inline void __init_vx_info(struct vx_info **vxp, struct vx_info *vxi, ++ const char *_file, int _line, void *_here) ++{ ++ if (vxi) { ++ vxlprintk(VXD_CBIT(xid, 3), ++ "init_vx_info(%p[#%d.%d])", ++ vxi, vxi ? vxi->vx_id : 0, ++ vxi ? atomic_read(&vxi->vx_usecnt) : 0, ++ _file, _line); ++ __vxh_init_vx_info(vxi, vxp, _here); ++ ++ atomic_inc(&vxi->vx_usecnt); ++ } ++ *vxp = vxi; ++} ++ ++ ++#define set_vx_info(p, i) \ ++ __set_vx_info(p, i, __FILE__, __LINE__, __HERE__) ++ ++static inline void __set_vx_info(struct vx_info **vxp, struct vx_info *vxi, ++ const char *_file, int _line, void *_here) ++{ ++ struct vx_info *vxo; ++ ++ if (!vxi) ++ return; ++ ++ vxlprintk(VXD_CBIT(xid, 3), "set_vx_info(%p[#%d.%d])", ++ vxi, vxi ? 
vxi->vx_id : 0, ++ vxi ? atomic_read(&vxi->vx_usecnt) : 0, ++ _file, _line); ++ __vxh_set_vx_info(vxi, vxp, _here); ++ ++ atomic_inc(&vxi->vx_usecnt); ++ vxo = xchg(vxp, vxi); ++ BUG_ON(vxo); ++} ++ ++ ++#define clr_vx_info(p) __clr_vx_info(p, __FILE__, __LINE__, __HERE__) ++ ++static inline void __clr_vx_info(struct vx_info **vxp, ++ const char *_file, int _line, void *_here) ++{ ++ struct vx_info *vxo; ++ ++ vxo = xchg(vxp, NULL); ++ if (!vxo) ++ return; ++ ++ vxlprintk(VXD_CBIT(xid, 3), "clr_vx_info(%p[#%d.%d])", ++ vxo, vxo ? vxo->vx_id : 0, ++ vxo ? atomic_read(&vxo->vx_usecnt) : 0, ++ _file, _line); ++ __vxh_clr_vx_info(vxo, vxp, _here); ++ ++ if (atomic_dec_and_test(&vxo->vx_usecnt)) ++ free_vx_info(vxo); ++} ++ ++ ++#define claim_vx_info(v, p) \ ++ __claim_vx_info(v, p, __FILE__, __LINE__, __HERE__) ++ ++static inline void __claim_vx_info(struct vx_info *vxi, ++ struct task_struct *task, ++ const char *_file, int _line, void *_here) ++{ ++ vxlprintk(VXD_CBIT(xid, 3), "claim_vx_info(%p[#%d.%d.%d]) %p", ++ vxi, vxi ? vxi->vx_id : 0, ++ vxi ? atomic_read(&vxi->vx_usecnt) : 0, ++ vxi ? atomic_read(&vxi->vx_tasks) : 0, ++ task, _file, _line); ++ __vxh_claim_vx_info(vxi, task, _here); ++ ++ atomic_inc(&vxi->vx_tasks); ++} ++ ++ ++extern void unhash_vx_info(struct vx_info *); ++ ++#define release_vx_info(v, p) \ ++ __release_vx_info(v, p, __FILE__, __LINE__, __HERE__) ++ ++static inline void __release_vx_info(struct vx_info *vxi, ++ struct task_struct *task, ++ const char *_file, int _line, void *_here) ++{ ++ vxlprintk(VXD_CBIT(xid, 3), "release_vx_info(%p[#%d.%d.%d]) %p", ++ vxi, vxi ? vxi->vx_id : 0, ++ vxi ? atomic_read(&vxi->vx_usecnt) : 0, ++ vxi ? atomic_read(&vxi->vx_tasks) : 0, ++ task, _file, _line); ++ __vxh_release_vx_info(vxi, task, _here); ++ ++ might_sleep(); ++ ++ if (atomic_dec_and_test(&vxi->vx_tasks)) ++ unhash_vx_info(vxi); ++} ++ ++ ++#define task_get_vx_info(p) \ ++ __task_get_vx_info(p, __FILE__, __LINE__, __HERE__) ++ ++static inline struct vx_info *__task_get_vx_info(struct task_struct *p, ++ const char *_file, int _line, void *_here) ++{ ++ struct vx_info *vxi; ++ ++ task_lock(p); ++ vxlprintk(VXD_CBIT(xid, 5), "task_get_vx_info(%p)", ++ p, _file, _line); ++ vxi = __get_vx_info(p->vx_info, _file, _line, _here); ++ task_unlock(p); ++ return vxi; ++} ++ ++ ++static inline void __wakeup_vx_info(struct vx_info *vxi) ++{ ++ if (waitqueue_active(&vxi->vx_wait)) ++ wake_up_interruptible(&vxi->vx_wait); ++} ++ ++ ++#define enter_vx_info(v, s) __enter_vx_info(v, s, __FILE__, __LINE__) ++ ++static inline void __enter_vx_info(struct vx_info *vxi, ++ struct vx_info_save *vxis, const char *_file, int _line) ++{ ++ vxlprintk(VXD_CBIT(xid, 5), "enter_vx_info(%p[#%d],%p) %p[#%d,%p]", ++ vxi, vxi ? vxi->vx_id : 0, vxis, current, ++ current->xid, current->vx_info, _file, _line); ++ vxis->vxi = xchg(¤t->vx_info, vxi); ++ vxis->xid = current->xid; ++ current->xid = vxi ? 
vxi->vx_id : 0; ++} ++ ++#define leave_vx_info(s) __leave_vx_info(s, __FILE__, __LINE__) ++ ++static inline void __leave_vx_info(struct vx_info_save *vxis, ++ const char *_file, int _line) ++{ ++ vxlprintk(VXD_CBIT(xid, 5), "leave_vx_info(%p[#%d,%p]) %p[#%d,%p]", ++ vxis, vxis->xid, vxis->vxi, current, ++ current->xid, current->vx_info, _file, _line); ++ (void)xchg(¤t->vx_info, vxis->vxi); ++ current->xid = vxis->xid; ++} ++ ++ ++static inline void __enter_vx_admin(struct vx_info_save *vxis) ++{ ++ vxis->vxi = xchg(¤t->vx_info, NULL); ++ vxis->xid = xchg(¤t->xid, (xid_t)0); ++} ++ ++static inline void __leave_vx_admin(struct vx_info_save *vxis) ++{ ++ (void)xchg(¤t->xid, vxis->xid); ++ (void)xchg(¤t->vx_info, vxis->vxi); ++} ++ ++#define task_is_init(p) \ ++ __task_is_init(p, __FILE__, __LINE__, __HERE__) ++ ++static inline int __task_is_init(struct task_struct *p, ++ const char *_file, int _line, void *_here) ++{ ++ int is_init = is_global_init(p); ++ ++ task_lock(p); ++ if (p->vx_info) ++ is_init = p->vx_info->vx_initpid == p->pid; ++ task_unlock(p); ++ return is_init; ++} ++ ++extern void exit_vx_info(struct task_struct *, int); ++extern void exit_vx_info_early(struct task_struct *, int); ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_cowbl.h linux-3.3.8-vs2.3.3.4/include/linux/vs_cowbl.h +--- linux-3.3.8/include/linux/vs_cowbl.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_cowbl.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,48 @@ ++#ifndef _VS_COWBL_H ++#define _VS_COWBL_H ++ ++#include ++#include ++#include ++#include ++ ++extern struct dentry *cow_break_link(const char *pathname); ++ ++static inline int cow_check_and_break(struct path *path) ++{ ++ struct inode *inode = path->dentry->d_inode; ++ int error = 0; ++ ++ /* do we need this check? 
*/ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ ++ if (IS_COW(inode)) { ++ if (IS_COW_LINK(inode)) { ++ struct dentry *new_dentry, *old_dentry = path->dentry; ++ char *pp, *buf; ++ ++ buf = kmalloc(PATH_MAX, GFP_KERNEL); ++ if (!buf) { ++ return -ENOMEM; ++ } ++ pp = d_path(path, buf, PATH_MAX); ++ new_dentry = cow_break_link(pp); ++ kfree(buf); ++ if (!IS_ERR(new_dentry)) { ++ path->dentry = new_dentry; ++ dput(old_dentry); ++ } else ++ error = PTR_ERR(new_dentry); ++ } else { ++ inode->i_flags &= ~(S_IXUNLINK | S_IMMUTABLE); ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty(inode); ++ } ++ } ++ return error; ++} ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_cvirt.h linux-3.3.8-vs2.3.3.4/include/linux/vs_cvirt.h +--- linux-3.3.8/include/linux/vs_cvirt.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_cvirt.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,50 @@ ++#ifndef _VS_CVIRT_H ++#define _VS_CVIRT_H ++ ++#include "vserver/cvirt.h" ++#include "vserver/context.h" ++#include "vserver/base.h" ++#include "vserver/check.h" ++#include "vserver/debug.h" ++ ++ ++static inline void vx_activate_task(struct task_struct *p) ++{ ++ struct vx_info *vxi; ++ ++ if ((vxi = p->vx_info)) { ++ vx_update_load(vxi); ++ atomic_inc(&vxi->cvirt.nr_running); ++ } ++} ++ ++static inline void vx_deactivate_task(struct task_struct *p) ++{ ++ struct vx_info *vxi; ++ ++ if ((vxi = p->vx_info)) { ++ vx_update_load(vxi); ++ atomic_dec(&vxi->cvirt.nr_running); ++ } ++} ++ ++static inline void vx_uninterruptible_inc(struct task_struct *p) ++{ ++ struct vx_info *vxi; ++ ++ if ((vxi = p->vx_info)) ++ atomic_inc(&vxi->cvirt.nr_uninterruptible); ++} ++ ++static inline void vx_uninterruptible_dec(struct task_struct *p) ++{ ++ struct vx_info *vxi; ++ ++ if ((vxi = p->vx_info)) ++ atomic_dec(&vxi->cvirt.nr_uninterruptible); ++} ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_device.h linux-3.3.8-vs2.3.3.4/include/linux/vs_device.h +--- linux-3.3.8/include/linux/vs_device.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_device.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,45 @@ ++#ifndef _VS_DEVICE_H ++#define _VS_DEVICE_H ++ ++#include "vserver/base.h" ++#include "vserver/device.h" ++#include "vserver/debug.h" ++ ++ ++#ifdef CONFIG_VSERVER_DEVICE ++ ++int vs_map_device(struct vx_info *, dev_t, dev_t *, umode_t); ++ ++#define vs_device_perm(v, d, m, p) \ ++ ((vs_map_device(current_vx_info(), d, NULL, m) & (p)) == (p)) ++ ++#else ++ ++static inline ++int vs_map_device(struct vx_info *vxi, ++ dev_t device, dev_t *target, umode_t mode) ++{ ++ if (target) ++ *target = device; ++ return ~0; ++} ++ ++#define vs_device_perm(v, d, m, p) ((p) == (p)) ++ ++#endif ++ ++ ++#define vs_map_chrdev(d, t, p) \ ++ ((vs_map_device(current_vx_info(), d, t, S_IFCHR) & (p)) == (p)) ++#define vs_map_blkdev(d, t, p) \ ++ ((vs_map_device(current_vx_info(), d, t, S_IFBLK) & (p)) == (p)) ++ ++#define vs_chrdev_perm(d, p) \ ++ vs_device_perm(current_vx_info(), d, S_IFCHR, p) ++#define vs_blkdev_perm(d, p) \ ++ vs_device_perm(current_vx_info(), d, S_IFBLK, p) ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_dlimit.h linux-3.3.8-vs2.3.3.4/include/linux/vs_dlimit.h +--- linux-3.3.8/include/linux/vs_dlimit.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_dlimit.h 2012-02-24 
03:55:06.000000000 +0100 +@@ -0,0 +1,215 @@ ++#ifndef _VS_DLIMIT_H ++#define _VS_DLIMIT_H ++ ++#include ++ ++#include "vserver/dlimit.h" ++#include "vserver/base.h" ++#include "vserver/debug.h" ++ ++ ++#define get_dl_info(i) __get_dl_info(i, __FILE__, __LINE__) ++ ++static inline struct dl_info *__get_dl_info(struct dl_info *dli, ++ const char *_file, int _line) ++{ ++ if (!dli) ++ return NULL; ++ vxlprintk(VXD_CBIT(dlim, 4), "get_dl_info(%p[#%d.%d])", ++ dli, dli ? dli->dl_tag : 0, ++ dli ? atomic_read(&dli->dl_usecnt) : 0, ++ _file, _line); ++ atomic_inc(&dli->dl_usecnt); ++ return dli; ++} ++ ++ ++#define free_dl_info(i) \ ++ call_rcu(&(i)->dl_rcu, rcu_free_dl_info) ++ ++#define put_dl_info(i) __put_dl_info(i, __FILE__, __LINE__) ++ ++static inline void __put_dl_info(struct dl_info *dli, ++ const char *_file, int _line) ++{ ++ if (!dli) ++ return; ++ vxlprintk(VXD_CBIT(dlim, 4), "put_dl_info(%p[#%d.%d])", ++ dli, dli ? dli->dl_tag : 0, ++ dli ? atomic_read(&dli->dl_usecnt) : 0, ++ _file, _line); ++ if (atomic_dec_and_test(&dli->dl_usecnt)) ++ free_dl_info(dli); ++} ++ ++ ++#define __dlimit_char(d) ((d) ? '*' : ' ') ++ ++static inline int __dl_alloc_space(struct super_block *sb, ++ tag_t tag, dlsize_t nr, const char *file, int line) ++{ ++ struct dl_info *dli = NULL; ++ int ret = 0; ++ ++ if (nr == 0) ++ goto out; ++ dli = locate_dl_info(sb, tag); ++ if (!dli) ++ goto out; ++ ++ spin_lock(&dli->dl_lock); ++ ret = (dli->dl_space_used + nr > dli->dl_space_total); ++ if (!ret) ++ dli->dl_space_used += nr; ++ spin_unlock(&dli->dl_lock); ++ put_dl_info(dli); ++out: ++ vxlprintk(VXD_CBIT(dlim, 1), ++ "ALLOC (%p,#%d)%c %lld bytes (%d)", ++ sb, tag, __dlimit_char(dli), (long long)nr, ++ ret, file, line); ++ return ret ? -ENOSPC : 0; ++} ++ ++static inline void __dl_free_space(struct super_block *sb, ++ tag_t tag, dlsize_t nr, const char *_file, int _line) ++{ ++ struct dl_info *dli = NULL; ++ ++ if (nr == 0) ++ goto out; ++ dli = locate_dl_info(sb, tag); ++ if (!dli) ++ goto out; ++ ++ spin_lock(&dli->dl_lock); ++ if (dli->dl_space_used > nr) ++ dli->dl_space_used -= nr; ++ else ++ dli->dl_space_used = 0; ++ spin_unlock(&dli->dl_lock); ++ put_dl_info(dli); ++out: ++ vxlprintk(VXD_CBIT(dlim, 1), ++ "FREE (%p,#%d)%c %lld bytes", ++ sb, tag, __dlimit_char(dli), (long long)nr, ++ _file, _line); ++} ++ ++static inline int __dl_alloc_inode(struct super_block *sb, ++ tag_t tag, const char *_file, int _line) ++{ ++ struct dl_info *dli; ++ int ret = 0; ++ ++ dli = locate_dl_info(sb, tag); ++ if (!dli) ++ goto out; ++ ++ spin_lock(&dli->dl_lock); ++ dli->dl_inodes_used++; ++ ret = (dli->dl_inodes_used > dli->dl_inodes_total); ++ spin_unlock(&dli->dl_lock); ++ put_dl_info(dli); ++out: ++ vxlprintk(VXD_CBIT(dlim, 0), ++ "ALLOC (%p,#%d)%c inode (%d)", ++ sb, tag, __dlimit_char(dli), ret, _file, _line); ++ return ret ? 
-ENOSPC : 0; ++} ++ ++static inline void __dl_free_inode(struct super_block *sb, ++ tag_t tag, const char *_file, int _line) ++{ ++ struct dl_info *dli; ++ ++ dli = locate_dl_info(sb, tag); ++ if (!dli) ++ goto out; ++ ++ spin_lock(&dli->dl_lock); ++ if (dli->dl_inodes_used > 1) ++ dli->dl_inodes_used--; ++ else ++ dli->dl_inodes_used = 0; ++ spin_unlock(&dli->dl_lock); ++ put_dl_info(dli); ++out: ++ vxlprintk(VXD_CBIT(dlim, 0), ++ "FREE (%p,#%d)%c inode", ++ sb, tag, __dlimit_char(dli), _file, _line); ++} ++ ++static inline void __dl_adjust_block(struct super_block *sb, tag_t tag, ++ unsigned long long *free_blocks, unsigned long long *root_blocks, ++ const char *_file, int _line) ++{ ++ struct dl_info *dli; ++ uint64_t broot, bfree; ++ ++ dli = locate_dl_info(sb, tag); ++ if (!dli) ++ return; ++ ++ spin_lock(&dli->dl_lock); ++ broot = (dli->dl_space_total - ++ (dli->dl_space_total >> 10) * dli->dl_nrlmult) ++ >> sb->s_blocksize_bits; ++ bfree = (dli->dl_space_total - dli->dl_space_used) ++ >> sb->s_blocksize_bits; ++ spin_unlock(&dli->dl_lock); ++ ++ vxlprintk(VXD_CBIT(dlim, 2), ++ "ADJUST: %lld,%lld on %lld,%lld [mult=%d]", ++ (long long)bfree, (long long)broot, ++ *free_blocks, *root_blocks, dli->dl_nrlmult, ++ _file, _line); ++ if (free_blocks) { ++ if (*free_blocks > bfree) ++ *free_blocks = bfree; ++ } ++ if (root_blocks) { ++ if (*root_blocks > broot) ++ *root_blocks = broot; ++ } ++ put_dl_info(dli); ++} ++ ++#define dl_prealloc_space(in, bytes) \ ++ __dl_alloc_space((in)->i_sb, (in)->i_tag, (dlsize_t)(bytes), \ ++ __FILE__, __LINE__ ) ++ ++#define dl_alloc_space(in, bytes) \ ++ __dl_alloc_space((in)->i_sb, (in)->i_tag, (dlsize_t)(bytes), \ ++ __FILE__, __LINE__ ) ++ ++#define dl_reserve_space(in, bytes) \ ++ __dl_alloc_space((in)->i_sb, (in)->i_tag, (dlsize_t)(bytes), \ ++ __FILE__, __LINE__ ) ++ ++#define dl_claim_space(in, bytes) (0) ++ ++#define dl_release_space(in, bytes) \ ++ __dl_free_space((in)->i_sb, (in)->i_tag, (dlsize_t)(bytes), \ ++ __FILE__, __LINE__ ) ++ ++#define dl_free_space(in, bytes) \ ++ __dl_free_space((in)->i_sb, (in)->i_tag, (dlsize_t)(bytes), \ ++ __FILE__, __LINE__ ) ++ ++ ++ ++#define dl_alloc_inode(in) \ ++ __dl_alloc_inode((in)->i_sb, (in)->i_tag, __FILE__, __LINE__ ) ++ ++#define dl_free_inode(in) \ ++ __dl_free_inode((in)->i_sb, (in)->i_tag, __FILE__, __LINE__ ) ++ ++ ++#define dl_adjust_block(sb, tag, fb, rb) \ ++ __dl_adjust_block(sb, tag, fb, rb, __FILE__, __LINE__ ) ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_inet.h linux-3.3.8-vs2.3.3.4/include/linux/vs_inet.h +--- linux-3.3.8/include/linux/vs_inet.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_inet.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,353 @@ ++#ifndef _VS_INET_H ++#define _VS_INET_H ++ ++#include "vserver/base.h" ++#include "vserver/network.h" ++#include "vserver/debug.h" ++ ++#define IPI_LOOPBACK htonl(INADDR_LOOPBACK) ++ ++#define NXAV4(a) NIPQUAD((a)->ip[0]), NIPQUAD((a)->ip[1]), \ ++ NIPQUAD((a)->mask), (a)->type ++#define NXAV4_FMT "[" NIPQUAD_FMT "-" NIPQUAD_FMT "/" NIPQUAD_FMT ":%04x]" ++ ++#define NIPQUAD(addr) \ ++ ((unsigned char *)&addr)[0], \ ++ ((unsigned char *)&addr)[1], \ ++ ((unsigned char *)&addr)[2], \ ++ ((unsigned char *)&addr)[3] ++ ++#define NIPQUAD_FMT "%u.%u.%u.%u" ++ ++ ++static inline ++int v4_addr_match(struct nx_addr_v4 *nxa, __be32 addr, uint16_t tmask) ++{ ++ __be32 ip = nxa->ip[0].s_addr; ++ __be32 mask = nxa->mask.s_addr; ++ __be32 bcast = ip | ~mask; ++ 
int ret = 0; ++ ++ switch (nxa->type & tmask) { ++ case NXA_TYPE_MASK: ++ ret = (ip == (addr & mask)); ++ break; ++ case NXA_TYPE_ADDR: ++ ret = 3; ++ if (addr == ip) ++ break; ++ /* fall through to broadcast */ ++ case NXA_MOD_BCAST: ++ ret = ((tmask & NXA_MOD_BCAST) && (addr == bcast)); ++ break; ++ case NXA_TYPE_RANGE: ++ ret = ((nxa->ip[0].s_addr <= addr) && ++ (nxa->ip[1].s_addr > addr)); ++ break; ++ case NXA_TYPE_ANY: ++ ret = 2; ++ break; ++ } ++ ++ vxdprintk(VXD_CBIT(net, 0), ++ "v4_addr_match(%p" NXAV4_FMT "," NIPQUAD_FMT ",%04x) = %d", ++ nxa, NXAV4(nxa), NIPQUAD(addr), tmask, ret); ++ return ret; ++} ++ ++static inline ++int v4_addr_in_nx_info(struct nx_info *nxi, __be32 addr, uint16_t tmask) ++{ ++ struct nx_addr_v4 *nxa; ++ int ret = 1; ++ ++ if (!nxi) ++ goto out; ++ ++ ret = 2; ++ /* allow 127.0.0.1 when remapping lback */ ++ if ((tmask & NXA_LOOPBACK) && ++ (addr == IPI_LOOPBACK) && ++ nx_info_flags(nxi, NXF_LBACK_REMAP, 0)) ++ goto out; ++ ret = 3; ++ /* check for lback address */ ++ if ((tmask & NXA_MOD_LBACK) && ++ (nxi->v4_lback.s_addr == addr)) ++ goto out; ++ ret = 4; ++ /* check for broadcast address */ ++ if ((tmask & NXA_MOD_BCAST) && ++ (nxi->v4_bcast.s_addr == addr)) ++ goto out; ++ ret = 5; ++ /* check for v4 addresses */ ++ for (nxa = &nxi->v4; nxa; nxa = nxa->next) ++ if (v4_addr_match(nxa, addr, tmask)) ++ goto out; ++ ret = 0; ++out: ++ vxdprintk(VXD_CBIT(net, 0), ++ "v4_addr_in_nx_info(%p[#%u]," NIPQUAD_FMT ",%04x) = %d", ++ nxi, nxi ? nxi->nx_id : 0, NIPQUAD(addr), tmask, ret); ++ return ret; ++} ++ ++static inline ++int v4_nx_addr_match(struct nx_addr_v4 *nxa, struct nx_addr_v4 *addr, uint16_t mask) ++{ ++ /* FIXME: needs full range checks */ ++ return v4_addr_match(nxa, addr->ip[0].s_addr, mask); ++} ++ ++static inline ++int v4_nx_addr_in_nx_info(struct nx_info *nxi, struct nx_addr_v4 *nxa, uint16_t mask) ++{ ++ struct nx_addr_v4 *ptr; ++ ++ for (ptr = &nxi->v4; ptr; ptr = ptr->next) ++ if (v4_nx_addr_match(ptr, nxa, mask)) ++ return 1; ++ return 0; ++} ++ ++#include ++ ++/* ++ * Check if a given address matches for a socket ++ * ++ * nxi: the socket's nx_info if any ++ * addr: to be verified address ++ */ ++static inline ++int v4_sock_addr_match ( ++ struct nx_info *nxi, ++ struct inet_sock *inet, ++ __be32 addr) ++{ ++ __be32 saddr = inet->inet_rcv_saddr; ++ __be32 bcast = nxi ? 
nxi->v4_bcast.s_addr : INADDR_BROADCAST; ++ ++ if (addr && (saddr == addr || bcast == addr)) ++ return 1; ++ if (!saddr) ++ return v4_addr_in_nx_info(nxi, addr, NXA_MASK_BIND); ++ return 0; ++} ++ ++ ++/* inet related checks and helpers */ ++ ++ ++struct in_ifaddr; ++struct net_device; ++struct sock; ++ ++#ifdef CONFIG_INET ++ ++#include ++#include ++#include ++#include ++ ++ ++int dev_in_nx_info(struct net_device *, struct nx_info *); ++int v4_dev_in_nx_info(struct net_device *, struct nx_info *); ++int nx_v4_addr_conflict(struct nx_info *, struct nx_info *); ++ ++ ++/* ++ * check if address is covered by socket ++ * ++ * sk: the socket to check against ++ * addr: the address in question (must be != 0) ++ */ ++ ++static inline ++int __v4_addr_match_socket(const struct sock *sk, struct nx_addr_v4 *nxa) ++{ ++ struct nx_info *nxi = sk->sk_nx_info; ++ __be32 saddr = sk_rcv_saddr(sk); ++ ++ vxdprintk(VXD_CBIT(net, 5), ++ "__v4_addr_in_socket(%p," NXAV4_FMT ") %p:" NIPQUAD_FMT " %p;%lx", ++ sk, NXAV4(nxa), nxi, NIPQUAD(saddr), sk->sk_socket, ++ (sk->sk_socket?sk->sk_socket->flags:0)); ++ ++ if (saddr) { /* direct address match */ ++ return v4_addr_match(nxa, saddr, -1); ++ } else if (nxi) { /* match against nx_info */ ++ return v4_nx_addr_in_nx_info(nxi, nxa, -1); ++ } else { /* unrestricted any socket */ ++ return 1; ++ } ++} ++ ++ ++ ++static inline ++int nx_dev_visible(struct nx_info *nxi, struct net_device *dev) ++{ ++ vxdprintk(VXD_CBIT(net, 1), ++ "nx_dev_visible(%p[#%u],%p " VS_Q("%s") ") %d", ++ nxi, nxi ? nxi->nx_id : 0, dev, dev->name, ++ nxi ? dev_in_nx_info(dev, nxi) : 0); ++ ++ if (!nx_info_flags(nxi, NXF_HIDE_NETIF, 0)) ++ return 1; ++ if (dev_in_nx_info(dev, nxi)) ++ return 1; ++ return 0; ++} ++ ++ ++static inline ++int v4_ifa_in_nx_info(struct in_ifaddr *ifa, struct nx_info *nxi) ++{ ++ if (!nxi) ++ return 1; ++ if (!ifa) ++ return 0; ++ return v4_addr_in_nx_info(nxi, ifa->ifa_local, NXA_MASK_SHOW); ++} ++ ++static inline ++int nx_v4_ifa_visible(struct nx_info *nxi, struct in_ifaddr *ifa) ++{ ++ vxdprintk(VXD_CBIT(net, 1), "nx_v4_ifa_visible(%p[#%u],%p) %d", ++ nxi, nxi ? nxi->nx_id : 0, ifa, ++ nxi ? v4_ifa_in_nx_info(ifa, nxi) : 0); ++ ++ if (!nx_info_flags(nxi, NXF_HIDE_NETIF, 0)) ++ return 1; ++ if (v4_ifa_in_nx_info(ifa, nxi)) ++ return 1; ++ return 0; ++} ++ ++ ++struct nx_v4_sock_addr { ++ __be32 saddr; /* Address used for validation */ ++ __be32 baddr; /* Address used for socket bind */ ++}; ++ ++static inline ++int v4_map_sock_addr(struct inet_sock *inet, struct sockaddr_in *addr, ++ struct nx_v4_sock_addr *nsa) ++{ ++ struct sock *sk = &inet->sk; ++ struct nx_info *nxi = sk->sk_nx_info; ++ __be32 saddr = addr->sin_addr.s_addr; ++ __be32 baddr = saddr; ++ ++ vxdprintk(VXD_CBIT(net, 3), ++ "inet_bind(%p)* %p,%p;%lx " NIPQUAD_FMT, ++ sk, sk->sk_nx_info, sk->sk_socket, ++ (sk->sk_socket ? 
sk->sk_socket->flags : 0), ++ NIPQUAD(saddr)); ++ ++ if (nxi) { ++ if (saddr == INADDR_ANY) { ++ if (nx_info_flags(nxi, NXF_SINGLE_IP, 0)) ++ baddr = nxi->v4.ip[0].s_addr; ++ } else if (saddr == IPI_LOOPBACK) { ++ if (nx_info_flags(nxi, NXF_LBACK_REMAP, 0)) ++ baddr = nxi->v4_lback.s_addr; ++ } else if (!ipv4_is_multicast(saddr) || ++ !nx_info_ncaps(nxi, NXC_MULTICAST)) { ++ /* normal address bind */ ++ if (!v4_addr_in_nx_info(nxi, saddr, NXA_MASK_BIND)) ++ return -EADDRNOTAVAIL; ++ } ++ } ++ ++ vxdprintk(VXD_CBIT(net, 3), ++ "inet_bind(%p) " NIPQUAD_FMT ", " NIPQUAD_FMT, ++ sk, NIPQUAD(saddr), NIPQUAD(baddr)); ++ ++ nsa->saddr = saddr; ++ nsa->baddr = baddr; ++ return 0; ++} ++ ++static inline ++void v4_set_sock_addr(struct inet_sock *inet, struct nx_v4_sock_addr *nsa) ++{ ++ inet->inet_saddr = nsa->baddr; ++ inet->inet_rcv_saddr = nsa->baddr; ++} ++ ++ ++/* ++ * helper to simplify inet_lookup_listener ++ * ++ * nxi: the socket's nx_info if any ++ * addr: to be verified address ++ * saddr: socket address ++ */ ++static inline int v4_inet_addr_match ( ++ struct nx_info *nxi, ++ __be32 addr, ++ __be32 saddr) ++{ ++ if (addr && (saddr == addr)) ++ return 1; ++ if (!saddr) ++ return nxi ? v4_addr_in_nx_info(nxi, addr, NXA_MASK_BIND) : 1; ++ return 0; ++} ++ ++static inline __be32 nx_map_sock_lback(struct nx_info *nxi, __be32 addr) ++{ ++ if (nx_info_flags(nxi, NXF_HIDE_LBACK, 0) && ++ (addr == nxi->v4_lback.s_addr)) ++ return IPI_LOOPBACK; ++ return addr; ++} ++ ++static inline ++int nx_info_has_v4(struct nx_info *nxi) ++{ ++ if (!nxi) ++ return 1; ++ if (NX_IPV4(nxi)) ++ return 1; ++ if (nx_info_flags(nxi, NXF_LBACK_REMAP, 0)) ++ return 1; ++ return 0; ++} ++ ++#else /* CONFIG_INET */ ++ ++static inline ++int nx_dev_visible(struct nx_info *n, struct net_device *d) ++{ ++ return 1; ++} ++ ++static inline ++int nx_v4_addr_conflict(struct nx_info *n, uint32_t a, const struct sock *s) ++{ ++ return 1; ++} ++ ++static inline ++int v4_ifa_in_nx_info(struct in_ifaddr *a, struct nx_info *n) ++{ ++ return 1; ++} ++ ++static inline ++int nx_info_has_v4(struct nx_info *nxi) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_INET */ ++ ++#define current_nx_info_has_v4() \ ++ nx_info_has_v4(current_nx_info()) ++ ++#else ++// #warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_inet6.h linux-3.3.8-vs2.3.3.4/include/linux/vs_inet6.h +--- linux-3.3.8/include/linux/vs_inet6.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_inet6.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,246 @@ ++#ifndef _VS_INET6_H ++#define _VS_INET6_H ++ ++#include "vserver/base.h" ++#include "vserver/network.h" ++#include "vserver/debug.h" ++ ++#include ++ ++#define NXAV6(a) &(a)->ip, &(a)->mask, (a)->prefix, (a)->type ++#define NXAV6_FMT "[%pI6/%pI6/%d:%04x]" ++ ++ ++#ifdef CONFIG_IPV6 ++ ++static inline ++int v6_addr_match(struct nx_addr_v6 *nxa, ++ const struct in6_addr *addr, uint16_t mask) ++{ ++ int ret = 0; ++ ++ switch (nxa->type & mask) { ++ case NXA_TYPE_MASK: ++ ret = ipv6_masked_addr_cmp(&nxa->ip, &nxa->mask, addr); ++ break; ++ case NXA_TYPE_ADDR: ++ ret = ipv6_addr_equal(&nxa->ip, addr); ++ break; ++ case NXA_TYPE_ANY: ++ ret = 1; ++ break; ++ } ++ vxdprintk(VXD_CBIT(net, 0), ++ "v6_addr_match(%p" NXAV6_FMT ",%pI6,%04x) = %d", ++ nxa, NXAV6(nxa), addr, mask, ret); ++ return ret; ++} ++ ++static inline ++int v6_addr_in_nx_info(struct nx_info *nxi, ++ const struct in6_addr *addr, uint16_t mask) ++{ ++ struct nx_addr_v6 *nxa; ++ int ret = 1; ++ ++ if (!nxi) ++ 
goto out; ++ for (nxa = &nxi->v6; nxa; nxa = nxa->next) ++ if (v6_addr_match(nxa, addr, mask)) ++ goto out; ++ ret = 0; ++out: ++ vxdprintk(VXD_CBIT(net, 0), ++ "v6_addr_in_nx_info(%p[#%u],%pI6,%04x) = %d", ++ nxi, nxi ? nxi->nx_id : 0, addr, mask, ret); ++ return ret; ++} ++ ++static inline ++int v6_nx_addr_match(struct nx_addr_v6 *nxa, struct nx_addr_v6 *addr, uint16_t mask) ++{ ++ /* FIXME: needs full range checks */ ++ return v6_addr_match(nxa, &addr->ip, mask); ++} ++ ++static inline ++int v6_nx_addr_in_nx_info(struct nx_info *nxi, struct nx_addr_v6 *nxa, uint16_t mask) ++{ ++ struct nx_addr_v6 *ptr; ++ ++ for (ptr = &nxi->v6; ptr; ptr = ptr->next) ++ if (v6_nx_addr_match(ptr, nxa, mask)) ++ return 1; ++ return 0; ++} ++ ++ ++/* ++ * Check if a given address matches for a socket ++ * ++ * nxi: the socket's nx_info if any ++ * addr: to be verified address ++ */ ++static inline ++int v6_sock_addr_match ( ++ struct nx_info *nxi, ++ struct inet_sock *inet, ++ struct in6_addr *addr) ++{ ++ struct sock *sk = &inet->sk; ++ struct in6_addr *saddr = inet6_rcv_saddr(sk); ++ ++ if (!ipv6_addr_any(addr) && ++ ipv6_addr_equal(saddr, addr)) ++ return 1; ++ if (ipv6_addr_any(saddr)) ++ return v6_addr_in_nx_info(nxi, addr, -1); ++ return 0; ++} ++ ++/* ++ * check if address is covered by socket ++ * ++ * sk: the socket to check against ++ * addr: the address in question (must be != 0) ++ */ ++ ++static inline ++int __v6_addr_match_socket(const struct sock *sk, struct nx_addr_v6 *nxa) ++{ ++ struct nx_info *nxi = sk->sk_nx_info; ++ struct in6_addr *saddr = inet6_rcv_saddr(sk); ++ ++ vxdprintk(VXD_CBIT(net, 5), ++ "__v6_addr_in_socket(%p," NXAV6_FMT ") %p:%pI6 %p;%lx", ++ sk, NXAV6(nxa), nxi, saddr, sk->sk_socket, ++ (sk->sk_socket?sk->sk_socket->flags:0)); ++ ++ if (!ipv6_addr_any(saddr)) { /* direct address match */ ++ return v6_addr_match(nxa, saddr, -1); ++ } else if (nxi) { /* match against nx_info */ ++ return v6_nx_addr_in_nx_info(nxi, nxa, -1); ++ } else { /* unrestricted any socket */ ++ return 1; ++ } ++} ++ ++ ++/* inet related checks and helpers */ ++ ++ ++struct in_ifaddr; ++struct net_device; ++struct sock; ++ ++ ++#include ++#include ++#include ++ ++ ++int dev_in_nx_info(struct net_device *, struct nx_info *); ++int v6_dev_in_nx_info(struct net_device *, struct nx_info *); ++int nx_v6_addr_conflict(struct nx_info *, struct nx_info *); ++ ++ ++ ++static inline ++int v6_ifa_in_nx_info(struct inet6_ifaddr *ifa, struct nx_info *nxi) ++{ ++ if (!nxi) ++ return 1; ++ if (!ifa) ++ return 0; ++ return v6_addr_in_nx_info(nxi, &ifa->addr, -1); ++} ++ ++static inline ++int nx_v6_ifa_visible(struct nx_info *nxi, struct inet6_ifaddr *ifa) ++{ ++ vxdprintk(VXD_CBIT(net, 1), "nx_v6_ifa_visible(%p[#%u],%p) %d", ++ nxi, nxi ? nxi->nx_id : 0, ifa, ++ nxi ? 
v6_ifa_in_nx_info(ifa, nxi) : 0); ++ ++ if (!nx_info_flags(nxi, NXF_HIDE_NETIF, 0)) ++ return 1; ++ if (v6_ifa_in_nx_info(ifa, nxi)) ++ return 1; ++ return 0; ++} ++ ++ ++struct nx_v6_sock_addr { ++ struct in6_addr saddr; /* Address used for validation */ ++ struct in6_addr baddr; /* Address used for socket bind */ ++}; ++ ++static inline ++int v6_map_sock_addr(struct inet_sock *inet, struct sockaddr_in6 *addr, ++ struct nx_v6_sock_addr *nsa) ++{ ++ // struct sock *sk = &inet->sk; ++ // struct nx_info *nxi = sk->sk_nx_info; ++ struct in6_addr saddr = addr->sin6_addr; ++ struct in6_addr baddr = saddr; ++ ++ nsa->saddr = saddr; ++ nsa->baddr = baddr; ++ return 0; ++} ++ ++static inline ++void v6_set_sock_addr(struct inet_sock *inet, struct nx_v6_sock_addr *nsa) ++{ ++ // struct sock *sk = &inet->sk; ++ // struct in6_addr *saddr = inet6_rcv_saddr(sk); ++ ++ // *saddr = nsa->baddr; ++ // inet->inet_saddr = nsa->baddr; ++} ++ ++static inline ++int nx_info_has_v6(struct nx_info *nxi) ++{ ++ if (!nxi) ++ return 1; ++ if (NX_IPV6(nxi)) ++ return 1; ++ return 0; ++} ++ ++#else /* CONFIG_IPV6 */ ++ ++static inline ++int nx_v6_dev_visible(struct nx_info *n, struct net_device *d) ++{ ++ return 1; ++} ++ ++ ++static inline ++int nx_v6_addr_conflict(struct nx_info *n, uint32_t a, const struct sock *s) ++{ ++ return 1; ++} ++ ++static inline ++int v6_ifa_in_nx_info(struct in_ifaddr *a, struct nx_info *n) ++{ ++ return 1; ++} ++ ++static inline ++int nx_info_has_v6(struct nx_info *nxi) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_IPV6 */ ++ ++#define current_nx_info_has_v6() \ ++ nx_info_has_v6(current_nx_info()) ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_limit.h linux-3.3.8-vs2.3.3.4/include/linux/vs_limit.h +--- linux-3.3.8/include/linux/vs_limit.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_limit.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,140 @@ ++#ifndef _VS_LIMIT_H ++#define _VS_LIMIT_H ++ ++#include "vserver/limit.h" ++#include "vserver/base.h" ++#include "vserver/context.h" ++#include "vserver/debug.h" ++#include "vserver/context.h" ++#include "vserver/limit_int.h" ++ ++ ++#define vx_acc_cres(v, d, p, r) \ ++ __vx_acc_cres(v, r, d, p, __FILE__, __LINE__) ++ ++#define vx_acc_cres_cond(x, d, p, r) \ ++ __vx_acc_cres(((x) == vx_current_xid()) ? current_vx_info() : 0, \ ++ r, d, p, __FILE__, __LINE__) ++ ++ ++#define vx_add_cres(v, a, p, r) \ ++ __vx_add_cres(v, r, a, p, __FILE__, __LINE__) ++#define vx_sub_cres(v, a, p, r) vx_add_cres(v, -(a), p, r) ++ ++#define vx_add_cres_cond(x, a, p, r) \ ++ __vx_add_cres(((x) == vx_current_xid()) ? 
current_vx_info() : 0, \ ++ r, a, p, __FILE__, __LINE__) ++#define vx_sub_cres_cond(x, a, p, r) vx_add_cres_cond(x, -(a), p, r) ++ ++ ++/* process and file limits */ ++ ++#define vx_nproc_inc(p) \ ++ vx_acc_cres((p)->vx_info, 1, p, RLIMIT_NPROC) ++ ++#define vx_nproc_dec(p) \ ++ vx_acc_cres((p)->vx_info,-1, p, RLIMIT_NPROC) ++ ++#define vx_files_inc(f) \ ++ vx_acc_cres_cond((f)->f_xid, 1, f, RLIMIT_NOFILE) ++ ++#define vx_files_dec(f) \ ++ vx_acc_cres_cond((f)->f_xid,-1, f, RLIMIT_NOFILE) ++ ++#define vx_locks_inc(l) \ ++ vx_acc_cres_cond((l)->fl_xid, 1, l, RLIMIT_LOCKS) ++ ++#define vx_locks_dec(l) \ ++ vx_acc_cres_cond((l)->fl_xid,-1, l, RLIMIT_LOCKS) ++ ++#define vx_openfd_inc(f) \ ++ vx_acc_cres(current_vx_info(), 1, (void *)(long)(f), VLIMIT_OPENFD) ++ ++#define vx_openfd_dec(f) \ ++ vx_acc_cres(current_vx_info(),-1, (void *)(long)(f), VLIMIT_OPENFD) ++ ++ ++#define vx_cres_avail(v, n, r) \ ++ __vx_cres_avail(v, r, n, __FILE__, __LINE__) ++ ++ ++#define vx_nproc_avail(n) \ ++ vx_cres_avail(current_vx_info(), n, RLIMIT_NPROC) ++ ++#define vx_files_avail(n) \ ++ vx_cres_avail(current_vx_info(), n, RLIMIT_NOFILE) ++ ++#define vx_locks_avail(n) \ ++ vx_cres_avail(current_vx_info(), n, RLIMIT_LOCKS) ++ ++#define vx_openfd_avail(n) \ ++ vx_cres_avail(current_vx_info(), n, VLIMIT_OPENFD) ++ ++ ++/* dentry limits */ ++ ++#define vx_dentry_inc(d) do { \ ++ if ((d)->d_count == 1) \ ++ vx_acc_cres(current_vx_info(), 1, d, VLIMIT_DENTRY); \ ++ } while (0) ++ ++#define vx_dentry_dec(d) do { \ ++ if ((d)->d_count == 0) \ ++ vx_acc_cres(current_vx_info(),-1, d, VLIMIT_DENTRY); \ ++ } while (0) ++ ++#define vx_dentry_avail(n) \ ++ vx_cres_avail(current_vx_info(), n, VLIMIT_DENTRY) ++ ++ ++/* socket limits */ ++ ++#define vx_sock_inc(s) \ ++ vx_acc_cres((s)->sk_vx_info, 1, s, VLIMIT_NSOCK) ++ ++#define vx_sock_dec(s) \ ++ vx_acc_cres((s)->sk_vx_info,-1, s, VLIMIT_NSOCK) ++ ++#define vx_sock_avail(n) \ ++ vx_cres_avail(current_vx_info(), n, VLIMIT_NSOCK) ++ ++ ++/* ipc resource limits */ ++ ++#define vx_ipcmsg_add(v, u, a) \ ++ vx_add_cres(v, a, u, RLIMIT_MSGQUEUE) ++ ++#define vx_ipcmsg_sub(v, u, a) \ ++ vx_sub_cres(v, a, u, RLIMIT_MSGQUEUE) ++ ++#define vx_ipcmsg_avail(v, a) \ ++ vx_cres_avail(v, a, RLIMIT_MSGQUEUE) ++ ++ ++#define vx_ipcshm_add(v, k, a) \ ++ vx_add_cres(v, a, (void *)(long)(k), VLIMIT_SHMEM) ++ ++#define vx_ipcshm_sub(v, k, a) \ ++ vx_sub_cres(v, a, (void *)(long)(k), VLIMIT_SHMEM) ++ ++#define vx_ipcshm_avail(v, a) \ ++ vx_cres_avail(v, a, VLIMIT_SHMEM) ++ ++ ++#define vx_semary_inc(a) \ ++ vx_acc_cres(current_vx_info(), 1, a, VLIMIT_SEMARY) ++ ++#define vx_semary_dec(a) \ ++ vx_acc_cres(current_vx_info(), -1, a, VLIMIT_SEMARY) ++ ++ ++#define vx_nsems_add(a,n) \ ++ vx_add_cres(current_vx_info(), n, a, VLIMIT_NSEMS) ++ ++#define vx_nsems_sub(a,n) \ ++ vx_sub_cres(current_vx_info(), n, a, VLIMIT_NSEMS) ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_network.h linux-3.3.8-vs2.3.3.4/include/linux/vs_network.h +--- linux-3.3.8/include/linux/vs_network.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_network.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,169 @@ ++#ifndef _NX_VS_NETWORK_H ++#define _NX_VS_NETWORK_H ++ ++#include "vserver/context.h" ++#include "vserver/network.h" ++#include "vserver/base.h" ++#include "vserver/check.h" ++#include "vserver/debug.h" ++ ++#include ++ ++ ++#define get_nx_info(i) __get_nx_info(i, __FILE__, __LINE__) ++ ++static inline struct nx_info 
*__get_nx_info(struct nx_info *nxi, ++ const char *_file, int _line) ++{ ++ if (!nxi) ++ return NULL; ++ ++ vxlprintk(VXD_CBIT(nid, 2), "get_nx_info(%p[#%d.%d])", ++ nxi, nxi ? nxi->nx_id : 0, ++ nxi ? atomic_read(&nxi->nx_usecnt) : 0, ++ _file, _line); ++ ++ atomic_inc(&nxi->nx_usecnt); ++ return nxi; ++} ++ ++ ++extern void free_nx_info(struct nx_info *); ++ ++#define put_nx_info(i) __put_nx_info(i, __FILE__, __LINE__) ++ ++static inline void __put_nx_info(struct nx_info *nxi, const char *_file, int _line) ++{ ++ if (!nxi) ++ return; ++ ++ vxlprintk(VXD_CBIT(nid, 2), "put_nx_info(%p[#%d.%d])", ++ nxi, nxi ? nxi->nx_id : 0, ++ nxi ? atomic_read(&nxi->nx_usecnt) : 0, ++ _file, _line); ++ ++ if (atomic_dec_and_test(&nxi->nx_usecnt)) ++ free_nx_info(nxi); ++} ++ ++ ++#define init_nx_info(p, i) __init_nx_info(p, i, __FILE__, __LINE__) ++ ++static inline void __init_nx_info(struct nx_info **nxp, struct nx_info *nxi, ++ const char *_file, int _line) ++{ ++ if (nxi) { ++ vxlprintk(VXD_CBIT(nid, 3), ++ "init_nx_info(%p[#%d.%d])", ++ nxi, nxi ? nxi->nx_id : 0, ++ nxi ? atomic_read(&nxi->nx_usecnt) : 0, ++ _file, _line); ++ ++ atomic_inc(&nxi->nx_usecnt); ++ } ++ *nxp = nxi; ++} ++ ++ ++#define set_nx_info(p, i) __set_nx_info(p, i, __FILE__, __LINE__) ++ ++static inline void __set_nx_info(struct nx_info **nxp, struct nx_info *nxi, ++ const char *_file, int _line) ++{ ++ struct nx_info *nxo; ++ ++ if (!nxi) ++ return; ++ ++ vxlprintk(VXD_CBIT(nid, 3), "set_nx_info(%p[#%d.%d])", ++ nxi, nxi ? nxi->nx_id : 0, ++ nxi ? atomic_read(&nxi->nx_usecnt) : 0, ++ _file, _line); ++ ++ atomic_inc(&nxi->nx_usecnt); ++ nxo = xchg(nxp, nxi); ++ BUG_ON(nxo); ++} ++ ++#define clr_nx_info(p) __clr_nx_info(p, __FILE__, __LINE__) ++ ++static inline void __clr_nx_info(struct nx_info **nxp, ++ const char *_file, int _line) ++{ ++ struct nx_info *nxo; ++ ++ nxo = xchg(nxp, NULL); ++ if (!nxo) ++ return; ++ ++ vxlprintk(VXD_CBIT(nid, 3), "clr_nx_info(%p[#%d.%d])", ++ nxo, nxo ? nxo->nx_id : 0, ++ nxo ? atomic_read(&nxo->nx_usecnt) : 0, ++ _file, _line); ++ ++ if (atomic_dec_and_test(&nxo->nx_usecnt)) ++ free_nx_info(nxo); ++} ++ ++ ++#define claim_nx_info(v, p) __claim_nx_info(v, p, __FILE__, __LINE__) ++ ++static inline void __claim_nx_info(struct nx_info *nxi, ++ struct task_struct *task, const char *_file, int _line) ++{ ++ vxlprintk(VXD_CBIT(nid, 3), "claim_nx_info(%p[#%d.%d.%d]) %p", ++ nxi, nxi ? nxi->nx_id : 0, ++ nxi?atomic_read(&nxi->nx_usecnt):0, ++ nxi?atomic_read(&nxi->nx_tasks):0, ++ task, _file, _line); ++ ++ atomic_inc(&nxi->nx_tasks); ++} ++ ++ ++extern void unhash_nx_info(struct nx_info *); ++ ++#define release_nx_info(v, p) __release_nx_info(v, p, __FILE__, __LINE__) ++ ++static inline void __release_nx_info(struct nx_info *nxi, ++ struct task_struct *task, const char *_file, int _line) ++{ ++ vxlprintk(VXD_CBIT(nid, 3), "release_nx_info(%p[#%d.%d.%d]) %p", ++ nxi, nxi ? nxi->nx_id : 0, ++ nxi ? atomic_read(&nxi->nx_usecnt) : 0, ++ nxi ? 
atomic_read(&nxi->nx_tasks) : 0, ++ task, _file, _line); ++ ++ might_sleep(); ++ ++ if (atomic_dec_and_test(&nxi->nx_tasks)) ++ unhash_nx_info(nxi); ++} ++ ++ ++#define task_get_nx_info(i) __task_get_nx_info(i, __FILE__, __LINE__) ++ ++static __inline__ struct nx_info *__task_get_nx_info(struct task_struct *p, ++ const char *_file, int _line) ++{ ++ struct nx_info *nxi; ++ ++ task_lock(p); ++ vxlprintk(VXD_CBIT(nid, 5), "task_get_nx_info(%p)", ++ p, _file, _line); ++ nxi = __get_nx_info(p->nx_info, _file, _line); ++ task_unlock(p); ++ return nxi; ++} ++ ++ ++static inline void exit_nx_info(struct task_struct *p) ++{ ++ if (p->nx_info) ++ release_nx_info(p->nx_info, p); ++} ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_pid.h linux-3.3.8-vs2.3.3.4/include/linux/vs_pid.h +--- linux-3.3.8/include/linux/vs_pid.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_pid.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,50 @@ ++#ifndef _VS_PID_H ++#define _VS_PID_H ++ ++#include "vserver/base.h" ++#include "vserver/check.h" ++#include "vserver/context.h" ++#include "vserver/debug.h" ++#include "vserver/pid.h" ++#include ++ ++ ++#define VXF_FAKE_INIT (VXF_INFO_INIT | VXF_STATE_INIT) ++ ++static inline ++int vx_proc_task_visible(struct task_struct *task) ++{ ++ if ((task->pid == 1) && ++ !vx_flags(VXF_FAKE_INIT, VXF_FAKE_INIT)) ++ /* show a blend through init */ ++ goto visible; ++ if (vx_check(vx_task_xid(task), VS_WATCH | VS_IDENT)) ++ goto visible; ++ return 0; ++visible: ++ return 1; ++} ++ ++#define find_task_by_real_pid(pid) find_task_by_pid_ns(pid, &init_pid_ns) ++ ++ ++static inline ++struct task_struct *vx_get_proc_task(struct inode *inode, struct pid *pid) ++{ ++ struct task_struct *task = get_pid_task(pid, PIDTYPE_PID); ++ ++ if (task && !vx_proc_task_visible(task)) { ++ vxdprintk(VXD_CBIT(misc, 6), ++ "dropping task (get) %p[#%u,%u] for %p[#%u,%u]", ++ task, task->xid, task->pid, ++ current, current->xid, current->pid); ++ put_task_struct(task); ++ task = NULL; ++ } ++ return task; ++} ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_sched.h linux-3.3.8-vs2.3.3.4/include/linux/vs_sched.h +--- linux-3.3.8/include/linux/vs_sched.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_sched.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,40 @@ ++#ifndef _VS_SCHED_H ++#define _VS_SCHED_H ++ ++#include "vserver/base.h" ++#include "vserver/context.h" ++#include "vserver/sched.h" ++ ++ ++#define MAX_PRIO_BIAS 20 ++#define MIN_PRIO_BIAS -20 ++ ++static inline ++int vx_adjust_prio(struct task_struct *p, int prio, int max_user) ++{ ++ struct vx_info *vxi = p->vx_info; ++ ++ if (vxi) ++ prio += vx_cpu(vxi, sched_pc).prio_bias; ++ return prio; ++} ++ ++static inline void vx_account_user(struct vx_info *vxi, ++ cputime_t cputime, int nice) ++{ ++ if (!vxi) ++ return; ++ vx_cpu(vxi, sched_pc).user_ticks += cputime; ++} ++ ++static inline void vx_account_system(struct vx_info *vxi, ++ cputime_t cputime, int idle) ++{ ++ if (!vxi) ++ return; ++ vx_cpu(vxi, sched_pc).sys_ticks += cputime; ++} ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_socket.h linux-3.3.8-vs2.3.3.4/include/linux/vs_socket.h +--- linux-3.3.8/include/linux/vs_socket.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_socket.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,67 @@ 
++#ifndef _VS_SOCKET_H ++#define _VS_SOCKET_H ++ ++#include "vserver/debug.h" ++#include "vserver/base.h" ++#include "vserver/cacct.h" ++#include "vserver/context.h" ++#include "vserver/tag.h" ++ ++ ++/* socket accounting */ ++ ++#include ++ ++static inline int vx_sock_type(int family) ++{ ++ switch (family) { ++ case PF_UNSPEC: ++ return VXA_SOCK_UNSPEC; ++ case PF_UNIX: ++ return VXA_SOCK_UNIX; ++ case PF_INET: ++ return VXA_SOCK_INET; ++ case PF_INET6: ++ return VXA_SOCK_INET6; ++ case PF_PACKET: ++ return VXA_SOCK_PACKET; ++ default: ++ return VXA_SOCK_OTHER; ++ } ++} ++ ++#define vx_acc_sock(v, f, p, s) \ ++ __vx_acc_sock(v, f, p, s, __FILE__, __LINE__) ++ ++static inline void __vx_acc_sock(struct vx_info *vxi, ++ int family, int pos, int size, char *file, int line) ++{ ++ if (vxi) { ++ int type = vx_sock_type(family); ++ ++ atomic_long_inc(&vxi->cacct.sock[type][pos].count); ++ atomic_long_add(size, &vxi->cacct.sock[type][pos].total); ++ } ++} ++ ++#define vx_sock_recv(sk, s) \ ++ vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 0, s) ++#define vx_sock_send(sk, s) \ ++ vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 1, s) ++#define vx_sock_fail(sk, s) \ ++ vx_acc_sock((sk)->sk_vx_info, (sk)->sk_family, 2, s) ++ ++ ++#define sock_vx_init(s) do { \ ++ (s)->sk_xid = 0; \ ++ (s)->sk_vx_info = NULL; \ ++ } while (0) ++ ++#define sock_nx_init(s) do { \ ++ (s)->sk_nid = 0; \ ++ (s)->sk_nx_info = NULL; \ ++ } while (0) ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_tag.h linux-3.3.8-vs2.3.3.4/include/linux/vs_tag.h +--- linux-3.3.8/include/linux/vs_tag.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_tag.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,47 @@ ++#ifndef _VS_TAG_H ++#define _VS_TAG_H ++ ++#include ++ ++/* check conditions */ ++ ++#define DX_ADMIN 0x0001 ++#define DX_WATCH 0x0002 ++#define DX_HOSTID 0x0008 ++ ++#define DX_IDENT 0x0010 ++ ++#define DX_ARG_MASK 0x0010 ++ ++ ++#define dx_task_tag(t) ((t)->tag) ++ ++#define dx_current_tag() dx_task_tag(current) ++ ++#define dx_check(c, m) __dx_check(dx_current_tag(), c, m) ++ ++#define dx_weak_check(c, m) ((m) ? 
dx_check(c, m) : 1) ++ ++ ++/* ++ * check current context for ADMIN/WATCH and ++ * optionally against supplied argument ++ */ ++static inline int __dx_check(tag_t cid, tag_t id, unsigned int mode) ++{ ++ if (mode & DX_ARG_MASK) { ++ if ((mode & DX_IDENT) && (id == cid)) ++ return 1; ++ } ++ return (((mode & DX_ADMIN) && (cid == 0)) || ++ ((mode & DX_WATCH) && (cid == 1)) || ++ ((mode & DX_HOSTID) && (id == 0))); ++} ++ ++struct inode; ++int dx_permission(const struct inode *inode, int mask); ++ ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vs_time.h linux-3.3.8-vs2.3.3.4/include/linux/vs_time.h +--- linux-3.3.8/include/linux/vs_time.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vs_time.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,19 @@ ++#ifndef _VS_TIME_H ++#define _VS_TIME_H ++ ++ ++/* time faking stuff */ ++ ++#ifdef CONFIG_VSERVER_VTIME ++ ++extern void vx_adjust_timespec(struct timespec *ts); ++extern int vx_settimeofday(const struct timespec *ts); ++ ++#else ++#define vx_adjust_timespec(t) do { } while (0) ++#define vx_settimeofday(t) do_settimeofday(t) ++#endif ++ ++#else ++#warning duplicate inclusion ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/Kbuild linux-3.3.8-vs2.3.3.4/include/linux/vserver/Kbuild +--- linux-3.3.8/include/linux/vserver/Kbuild 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/Kbuild 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,8 @@ ++ ++header-y += context_cmd.h network_cmd.h space_cmd.h \ ++ cacct_cmd.h cvirt_cmd.h limit_cmd.h dlimit_cmd.h \ ++ inode_cmd.h tag_cmd.h sched_cmd.h signal_cmd.h \ ++ debug_cmd.h device_cmd.h ++ ++header-y += switch.h network.h monitor.h inode.h device.h ++ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/base.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/base.h +--- linux-3.3.8/include/linux/vserver/base.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/base.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,178 @@ ++#ifndef _VX_BASE_H ++#define _VX_BASE_H ++ ++ ++/* context state changes */ ++ ++enum { ++ VSC_STARTUP = 1, ++ VSC_SHUTDOWN, ++ ++ VSC_NETUP, ++ VSC_NETDOWN, ++}; ++ ++ ++ ++#define vx_task_xid(t) ((t)->xid) ++ ++#define vx_current_xid() vx_task_xid(current) ++ ++#define current_vx_info() (current->vx_info) ++ ++ ++#define nx_task_nid(t) ((t)->nid) ++ ++#define nx_current_nid() nx_task_nid(current) ++ ++#define current_nx_info() (current->nx_info) ++ ++ ++/* generic flag merging */ ++ ++#define vs_check_flags(v, m, f) (((v) & (m)) ^ (f)) ++ ++#define vs_mask_flags(v, f, m) (((v) & ~(m)) | ((f) & (m))) ++ ++#define vs_mask_mask(v, f, m) (((v) & ~(m)) | ((v) & (f) & (m))) ++ ++#define vs_check_bit(v, n) ((v) & (1LL << (n))) ++ ++ ++/* context flags */ ++ ++#define __vx_flags(v) ((v) ? (v)->vx_flags : 0) ++ ++#define vx_current_flags() __vx_flags(current_vx_info()) ++ ++#define vx_info_flags(v, m, f) \ ++ vs_check_flags(__vx_flags(v), m, f) ++ ++#define task_vx_flags(t, m, f) \ ++ ((t) && vx_info_flags((t)->vx_info, m, f)) ++ ++#define vx_flags(m, f) vx_info_flags(current_vx_info(), m, f) ++ ++ ++/* context caps */ ++ ++#define __vx_ccaps(v) ((v) ? (v)->vx_ccaps : 0) ++ ++#define vx_current_ccaps() __vx_ccaps(current_vx_info()) ++ ++#define vx_info_ccaps(v, c) (__vx_ccaps(v) & (c)) ++ ++#define vx_ccaps(c) vx_info_ccaps(current_vx_info(), (c)) ++ ++ ++ ++/* network flags */ ++ ++#define __nx_flags(n) ((n) ? 
(n)->nx_flags : 0) ++ ++#define nx_current_flags() __nx_flags(current_nx_info()) ++ ++#define nx_info_flags(n, m, f) \ ++ vs_check_flags(__nx_flags(n), m, f) ++ ++#define task_nx_flags(t, m, f) \ ++ ((t) && nx_info_flags((t)->nx_info, m, f)) ++ ++#define nx_flags(m, f) nx_info_flags(current_nx_info(), m, f) ++ ++ ++/* network caps */ ++ ++#define __nx_ncaps(n) ((n) ? (n)->nx_ncaps : 0) ++ ++#define nx_current_ncaps() __nx_ncaps(current_nx_info()) ++ ++#define nx_info_ncaps(n, c) (__nx_ncaps(n) & (c)) ++ ++#define nx_ncaps(c) nx_info_ncaps(current_nx_info(), c) ++ ++ ++/* context mask capabilities */ ++ ++#define __vx_mcaps(v) ((v) ? (v)->vx_ccaps >> 32UL : ~0 ) ++ ++#define vx_info_mcaps(v, c) (__vx_mcaps(v) & (c)) ++ ++#define vx_mcaps(c) vx_info_mcaps(current_vx_info(), c) ++ ++ ++/* context bcap mask */ ++ ++#define __vx_bcaps(v) ((v)->vx_bcaps) ++ ++#define vx_current_bcaps() __vx_bcaps(current_vx_info()) ++ ++ ++/* mask given bcaps */ ++ ++#define vx_info_mbcaps(v, c) ((v) ? cap_intersect(__vx_bcaps(v), c) : c) ++ ++#define vx_mbcaps(c) vx_info_mbcaps(current_vx_info(), c) ++ ++ ++/* masked cap_bset */ ++ ++#define vx_info_cap_bset(v) vx_info_mbcaps(v, current->cap_bset) ++ ++#define vx_current_cap_bset() vx_info_cap_bset(current_vx_info()) ++ ++#if 0 ++#define vx_info_mbcap(v, b) \ ++ (!vx_info_flags(v, VXF_STATE_SETUP, 0) ? \ ++ vx_info_bcaps(v, b) : (b)) ++ ++#define task_vx_mbcap(t, b) \ ++ vx_info_mbcap((t)->vx_info, (t)->b) ++ ++#define vx_mbcap(b) task_vx_mbcap(current, b) ++#endif ++ ++#define vx_cap_raised(v, c, f) cap_raised(vx_info_mbcaps(v, c), f) ++ ++#define vx_capable(b, c) (capable(b) || \ ++ (cap_raised(current_cap(), b) && vx_ccaps(c))) ++ ++#define vx_ns_capable(n, b, c) (ns_capable(n, b) || \ ++ (cap_raised(current_cap(), b) && vx_ccaps(c))) ++ ++#define nx_capable(b, c) (capable(b) || \ ++ (cap_raised(current_cap(), b) && nx_ncaps(c))) ++ ++#define vx_task_initpid(t, n) \ ++ ((t)->vx_info && \ ++ ((t)->vx_info->vx_initpid == (n))) ++ ++#define vx_current_initpid(n) vx_task_initpid(current, n) ++ ++ ++/* context unshare mask */ ++ ++#define __vx_umask(v) ((v)->vx_umask) ++ ++#define vx_current_umask() __vx_umask(current_vx_info()) ++ ++#define vx_can_unshare(b, f) (capable(b) || \ ++ (cap_raised(current_cap(), b) && \ ++ !((f) & ~vx_current_umask()))) ++ ++ ++#define __vx_wmask(v) ((v)->vx_wmask) ++ ++#define vx_current_wmask() __vx_wmask(current_vx_info()) ++ ++ ++#define __vx_state(v) ((v) ? ((v)->vx_state) : 0) ++ ++#define vx_info_state(v, m) (__vx_state(v) & (m)) ++ ++ ++#define __nx_state(n) ((n) ? 
((n)->nx_state) : 0) ++ ++#define nx_info_state(n, m) (__nx_state(n) & (m)) ++ ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/cacct.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/cacct.h +--- linux-3.3.8/include/linux/vserver/cacct.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/cacct.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,15 @@ ++#ifndef _VX_CACCT_H ++#define _VX_CACCT_H ++ ++ ++enum sock_acc_field { ++ VXA_SOCK_UNSPEC = 0, ++ VXA_SOCK_UNIX, ++ VXA_SOCK_INET, ++ VXA_SOCK_INET6, ++ VXA_SOCK_PACKET, ++ VXA_SOCK_OTHER, ++ VXA_SOCK_SIZE /* array size */ ++}; ++ ++#endif /* _VX_CACCT_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/cacct_cmd.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/cacct_cmd.h +--- linux-3.3.8/include/linux/vserver/cacct_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/cacct_cmd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,23 @@ ++#ifndef _VX_CACCT_CMD_H ++#define _VX_CACCT_CMD_H ++ ++ ++/* virtual host info name commands */ ++ ++#define VCMD_sock_stat VC_CMD(VSTAT, 5, 0) ++ ++struct vcmd_sock_stat_v0 { ++ uint32_t field; ++ uint32_t count[3]; ++ uint64_t total[3]; ++}; ++ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++extern int vc_sock_stat(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CACCT_CMD_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/cacct_def.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/cacct_def.h +--- linux-3.3.8/include/linux/vserver/cacct_def.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/cacct_def.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,43 @@ ++#ifndef _VX_CACCT_DEF_H ++#define _VX_CACCT_DEF_H ++ ++#include ++#include ++ ++ ++struct _vx_sock_acc { ++ atomic_long_t count; ++ atomic_long_t total; ++}; ++ ++/* context sub struct */ ++ ++struct _vx_cacct { ++ struct _vx_sock_acc sock[VXA_SOCK_SIZE][3]; ++ atomic_t slab[8]; ++ atomic_t page[6][8]; ++}; ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ ++static inline void __dump_vx_cacct(struct _vx_cacct *cacct) ++{ ++ int i, j; ++ ++ printk("\t_vx_cacct:"); ++ for (i = 0; i < 6; i++) { ++ struct _vx_sock_acc *ptr = cacct->sock[i]; ++ ++ printk("\t [%d] =", i); ++ for (j = 0; j < 3; j++) { ++ printk(" [%d] = %8lu, %8lu", j, ++ atomic_long_read(&ptr[j].count), ++ atomic_long_read(&ptr[j].total)); ++ } ++ printk("\n"); ++ } ++} ++ ++#endif ++ ++#endif /* _VX_CACCT_DEF_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/cacct_int.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/cacct_int.h +--- linux-3.3.8/include/linux/vserver/cacct_int.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/cacct_int.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,21 @@ ++#ifndef _VX_CACCT_INT_H ++#define _VX_CACCT_INT_H ++ ++ ++#ifdef __KERNEL__ ++ ++static inline ++unsigned long vx_sock_count(struct _vx_cacct *cacct, int type, int pos) ++{ ++ return atomic_long_read(&cacct->sock[type][pos].count); ++} ++ ++ ++static inline ++unsigned long vx_sock_total(struct _vx_cacct *cacct, int type, int pos) ++{ ++ return atomic_long_read(&cacct->sock[type][pos].total); ++} ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CACCT_INT_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/check.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/check.h +--- linux-3.3.8/include/linux/vserver/check.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/check.h 2012-02-24 
03:55:06.000000000 +0100 +@@ -0,0 +1,89 @@ ++#ifndef _VS_CHECK_H ++#define _VS_CHECK_H ++ ++ ++#define MAX_S_CONTEXT 65535 /* Arbitrary limit */ ++ ++#ifdef CONFIG_VSERVER_DYNAMIC_IDS ++#define MIN_D_CONTEXT 49152 /* dynamic contexts start here */ ++#else ++#define MIN_D_CONTEXT 65536 ++#endif ++ ++/* check conditions */ ++ ++#define VS_ADMIN 0x0001 ++#define VS_WATCH 0x0002 ++#define VS_HIDE 0x0004 ++#define VS_HOSTID 0x0008 ++ ++#define VS_IDENT 0x0010 ++#define VS_EQUIV 0x0020 ++#define VS_PARENT 0x0040 ++#define VS_CHILD 0x0080 ++ ++#define VS_ARG_MASK 0x00F0 ++ ++#define VS_DYNAMIC 0x0100 ++#define VS_STATIC 0x0200 ++ ++#define VS_ATR_MASK 0x0F00 ++ ++#ifdef CONFIG_VSERVER_PRIVACY ++#define VS_ADMIN_P (0) ++#define VS_WATCH_P (0) ++#else ++#define VS_ADMIN_P VS_ADMIN ++#define VS_WATCH_P VS_WATCH ++#endif ++ ++#define VS_HARDIRQ 0x1000 ++#define VS_SOFTIRQ 0x2000 ++#define VS_IRQ 0x4000 ++ ++#define VS_IRQ_MASK 0xF000 ++ ++#include ++ ++/* ++ * check current context for ADMIN/WATCH and ++ * optionally against supplied argument ++ */ ++static inline int __vs_check(int cid, int id, unsigned int mode) ++{ ++ if (mode & VS_ARG_MASK) { ++ if ((mode & VS_IDENT) && (id == cid)) ++ return 1; ++ } ++ if (mode & VS_ATR_MASK) { ++ if ((mode & VS_DYNAMIC) && ++ (id >= MIN_D_CONTEXT) && ++ (id <= MAX_S_CONTEXT)) ++ return 1; ++ if ((mode & VS_STATIC) && ++ (id > 1) && (id < MIN_D_CONTEXT)) ++ return 1; ++ } ++ if (mode & VS_IRQ_MASK) { ++ if ((mode & VS_IRQ) && unlikely(in_interrupt())) ++ return 1; ++ if ((mode & VS_HARDIRQ) && unlikely(in_irq())) ++ return 1; ++ if ((mode & VS_SOFTIRQ) && unlikely(in_softirq())) ++ return 1; ++ } ++ return (((mode & VS_ADMIN) && (cid == 0)) || ++ ((mode & VS_WATCH) && (cid == 1)) || ++ ((mode & VS_HOSTID) && (id == 0))); ++} ++ ++#define vx_check(c, m) __vs_check(vx_current_xid(), c, (m) | VS_IRQ) ++ ++#define vx_weak_check(c, m) ((m) ? vx_check(c, m) : 1) ++ ++ ++#define nx_check(c, m) __vs_check(nx_current_nid(), c, m) ++ ++#define nx_weak_check(c, m) ((m) ? 
nx_check(c, m) : 1) ++ ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/context.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/context.h +--- linux-3.3.8/include/linux/vserver/context.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/context.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,188 @@ ++#ifndef _VX_CONTEXT_H ++#define _VX_CONTEXT_H ++ ++#include ++#include ++ ++ ++/* context flags */ ++ ++#define VXF_INFO_SCHED 0x00000002 ++#define VXF_INFO_NPROC 0x00000004 ++#define VXF_INFO_PRIVATE 0x00000008 ++ ++#define VXF_INFO_INIT 0x00000010 ++#define VXF_INFO_HIDE 0x00000020 ++#define VXF_INFO_ULIMIT 0x00000040 ++#define VXF_INFO_NSPACE 0x00000080 ++ ++#define VXF_SCHED_HARD 0x00000100 ++#define VXF_SCHED_PRIO 0x00000200 ++#define VXF_SCHED_PAUSE 0x00000400 ++ ++#define VXF_VIRT_MEM 0x00010000 ++#define VXF_VIRT_UPTIME 0x00020000 ++#define VXF_VIRT_CPU 0x00040000 ++#define VXF_VIRT_LOAD 0x00080000 ++#define VXF_VIRT_TIME 0x00100000 ++ ++#define VXF_HIDE_MOUNT 0x01000000 ++/* was VXF_HIDE_NETIF 0x02000000 */ ++#define VXF_HIDE_VINFO 0x04000000 ++ ++#define VXF_STATE_SETUP (1ULL << 32) ++#define VXF_STATE_INIT (1ULL << 33) ++#define VXF_STATE_ADMIN (1ULL << 34) ++ ++#define VXF_SC_HELPER (1ULL << 36) ++#define VXF_REBOOT_KILL (1ULL << 37) ++#define VXF_PERSISTENT (1ULL << 38) ++ ++#define VXF_FORK_RSS (1ULL << 48) ++#define VXF_PROLIFIC (1ULL << 49) ++ ++#define VXF_IGNEG_NICE (1ULL << 52) ++ ++#define VXF_ONE_TIME (0x0007ULL << 32) ++ ++#define VXF_INIT_SET (VXF_STATE_SETUP | VXF_STATE_INIT | VXF_STATE_ADMIN) ++ ++ ++/* context migration */ ++ ++#define VXM_SET_INIT 0x00000001 ++#define VXM_SET_REAPER 0x00000002 ++ ++/* context caps */ ++ ++#define VXC_SET_UTSNAME 0x00000001 ++#define VXC_SET_RLIMIT 0x00000002 ++#define VXC_FS_SECURITY 0x00000004 ++#define VXC_FS_TRUSTED 0x00000008 ++#define VXC_TIOCSTI 0x00000010 ++ ++/* was VXC_RAW_ICMP 0x00000100 */ ++#define VXC_SYSLOG 0x00001000 ++#define VXC_OOM_ADJUST 0x00002000 ++#define VXC_AUDIT_CONTROL 0x00004000 ++ ++#define VXC_SECURE_MOUNT 0x00010000 ++#define VXC_SECURE_REMOUNT 0x00020000 ++#define VXC_BINARY_MOUNT 0x00040000 ++ ++#define VXC_QUOTA_CTL 0x00100000 ++#define VXC_ADMIN_MAPPER 0x00200000 ++#define VXC_ADMIN_CLOOP 0x00400000 ++ ++#define VXC_KTHREAD 0x01000000 ++#define VXC_NAMESPACE 0x02000000 ++ ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++ ++#include "limit_def.h" ++#include "sched_def.h" ++#include "cvirt_def.h" ++#include "cacct_def.h" ++#include "device_def.h" ++ ++#define VX_SPACES 2 ++ ++struct _vx_info_pc { ++ struct _vx_sched_pc sched_pc; ++ struct _vx_cvirt_pc cvirt_pc; ++}; ++ ++struct _vx_space { ++ unsigned long vx_nsmask; /* assignment mask */ ++ struct nsproxy *vx_nsproxy; /* private namespaces */ ++ struct fs_struct *vx_fs; /* private namespace fs */ ++ const struct cred *vx_cred; /* task credentials */ ++}; ++ ++struct vx_info { ++ struct hlist_node vx_hlist; /* linked list of contexts */ ++ xid_t vx_id; /* context id */ ++ atomic_t vx_usecnt; /* usage count */ ++ atomic_t vx_tasks; /* tasks count */ ++ struct vx_info *vx_parent; /* parent context */ ++ int vx_state; /* context state */ ++ ++ struct _vx_space space[VX_SPACES]; /* namespace store */ ++ ++ uint64_t vx_flags; /* context flags */ ++ uint64_t vx_ccaps; /* context caps (vserver) */ ++ uint64_t vx_umask; /* unshare mask (guest) */ ++ uint64_t vx_wmask; /* warn mask (guest) */ ++ kernel_cap_t vx_bcaps; /* bounding caps (system) */ ++ ++ struct task_struct *vx_reaper; /* guest reaper 
process */ ++ pid_t vx_initpid; /* PID of guest init */ ++ int64_t vx_badness_bias; /* OOM points bias */ ++ ++ struct _vx_limit limit; /* vserver limits */ ++ struct _vx_sched sched; /* vserver scheduler */ ++ struct _vx_cvirt cvirt; /* virtual/bias stuff */ ++ struct _vx_cacct cacct; /* context accounting */ ++ ++ struct _vx_device dmap; /* default device map targets */ ++ ++#ifndef CONFIG_SMP ++ struct _vx_info_pc info_pc; /* per cpu data */ ++#else ++ struct _vx_info_pc *ptr_pc; /* per cpu array */ ++#endif ++ ++ wait_queue_head_t vx_wait; /* context exit waitqueue */ ++ int reboot_cmd; /* last sys_reboot() cmd */ ++ int exit_code; /* last process exit code */ ++ ++ char vx_name[65]; /* vserver name */ ++}; ++ ++#ifndef CONFIG_SMP ++#define vx_ptr_pc(vxi) (&(vxi)->info_pc) ++#define vx_per_cpu(vxi, v, id) vx_ptr_pc(vxi)->v ++#else ++#define vx_ptr_pc(vxi) ((vxi)->ptr_pc) ++#define vx_per_cpu(vxi, v, id) per_cpu_ptr(vx_ptr_pc(vxi), id)->v ++#endif ++ ++#define vx_cpu(vxi, v) vx_per_cpu(vxi, v, smp_processor_id()) ++ ++ ++struct vx_info_save { ++ struct vx_info *vxi; ++ xid_t xid; ++}; ++ ++ ++/* status flags */ ++ ++#define VXS_HASHED 0x0001 ++#define VXS_PAUSED 0x0010 ++#define VXS_SHUTDOWN 0x0100 ++#define VXS_HELPER 0x1000 ++#define VXS_RELEASED 0x8000 ++ ++ ++extern void claim_vx_info(struct vx_info *, struct task_struct *); ++extern void release_vx_info(struct vx_info *, struct task_struct *); ++ ++extern struct vx_info *lookup_vx_info(int); ++extern struct vx_info *lookup_or_create_vx_info(int); ++ ++extern int get_xid_list(int, unsigned int *, int); ++extern int xid_is_hashed(xid_t); ++ ++extern int vx_migrate_task(struct task_struct *, struct vx_info *, int); ++ ++extern long vs_state_change(struct vx_info *, unsigned int); ++ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CONTEXT_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/context_cmd.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/context_cmd.h +--- linux-3.3.8/include/linux/vserver/context_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/context_cmd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,162 @@ ++#ifndef _VX_CONTEXT_CMD_H ++#define _VX_CONTEXT_CMD_H ++ ++ ++/* vinfo commands */ ++ ++#define VCMD_task_xid VC_CMD(VINFO, 1, 0) ++ ++#ifdef __KERNEL__ ++extern int vc_task_xid(uint32_t); ++ ++#endif /* __KERNEL__ */ ++ ++#define VCMD_vx_info VC_CMD(VINFO, 5, 0) ++ ++struct vcmd_vx_info_v0 { ++ uint32_t xid; ++ uint32_t initpid; ++ /* more to come */ ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_vx_info(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++#define VCMD_ctx_stat VC_CMD(VSTAT, 0, 0) ++ ++struct vcmd_ctx_stat_v0 { ++ uint32_t usecnt; ++ uint32_t tasks; ++ /* more to come */ ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_ctx_stat(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++/* context commands */ ++ ++#define VCMD_ctx_create_v0 VC_CMD(VPROC, 1, 0) ++#define VCMD_ctx_create VC_CMD(VPROC, 1, 1) ++ ++struct vcmd_ctx_create { ++ uint64_t flagword; ++}; ++ ++#define VCMD_ctx_migrate_v0 VC_CMD(PROCMIG, 1, 0) ++#define VCMD_ctx_migrate VC_CMD(PROCMIG, 1, 1) ++ ++struct vcmd_ctx_migrate { ++ uint64_t flagword; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_ctx_create(uint32_t, void __user *); ++extern int vc_ctx_migrate(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* flag commands */ ++ ++#define VCMD_get_cflags VC_CMD(FLAGS, 1, 0) ++#define VCMD_set_cflags VC_CMD(FLAGS, 2, 0) ++ ++struct vcmd_ctx_flags_v0 { 
++ uint64_t flagword; ++ uint64_t mask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_cflags(struct vx_info *, void __user *); ++extern int vc_set_cflags(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* context caps commands */ ++ ++#define VCMD_get_ccaps VC_CMD(FLAGS, 3, 1) ++#define VCMD_set_ccaps VC_CMD(FLAGS, 4, 1) ++ ++struct vcmd_ctx_caps_v1 { ++ uint64_t ccaps; ++ uint64_t cmask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_ccaps(struct vx_info *, void __user *); ++extern int vc_set_ccaps(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* bcaps commands */ ++ ++#define VCMD_get_bcaps VC_CMD(FLAGS, 9, 0) ++#define VCMD_set_bcaps VC_CMD(FLAGS, 10, 0) ++ ++struct vcmd_bcaps { ++ uint64_t bcaps; ++ uint64_t bmask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_bcaps(struct vx_info *, void __user *); ++extern int vc_set_bcaps(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* umask commands */ ++ ++#define VCMD_get_umask VC_CMD(FLAGS, 13, 0) ++#define VCMD_set_umask VC_CMD(FLAGS, 14, 0) ++ ++struct vcmd_umask { ++ uint64_t umask; ++ uint64_t mask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_umask(struct vx_info *, void __user *); ++extern int vc_set_umask(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* wmask commands */ ++ ++#define VCMD_get_wmask VC_CMD(FLAGS, 15, 0) ++#define VCMD_set_wmask VC_CMD(FLAGS, 16, 0) ++ ++struct vcmd_wmask { ++ uint64_t wmask; ++ uint64_t mask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_wmask(struct vx_info *, void __user *); ++extern int vc_set_wmask(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* OOM badness */ ++ ++#define VCMD_get_badness VC_CMD(MEMCTRL, 5, 0) ++#define VCMD_set_badness VC_CMD(MEMCTRL, 6, 0) ++ ++struct vcmd_badness_v0 { ++ int64_t bias; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_badness(struct vx_info *, void __user *); ++extern int vc_set_badness(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CONTEXT_CMD_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/cvirt.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/cvirt.h +--- linux-3.3.8/include/linux/vserver/cvirt.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/cvirt.h 2012-04-24 03:32:01.000000000 +0200 +@@ -0,0 +1,22 @@ ++#ifndef _VX_CVIRT_H ++#define _VX_CVIRT_H ++ ++ ++#ifdef __KERNEL__ ++ ++struct timespec; ++ ++void vx_vsi_boottime(struct timespec *); ++ ++void vx_vsi_uptime(struct timespec *, struct timespec *); ++ ++ ++struct vx_info; ++ ++void vx_update_load(struct vx_info *); ++ ++ ++int vx_do_syslog(int, char __user *, int); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CVIRT_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/cvirt_cmd.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/cvirt_cmd.h +--- linux-3.3.8/include/linux/vserver/cvirt_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/cvirt_cmd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,53 @@ ++#ifndef _VX_CVIRT_CMD_H ++#define _VX_CVIRT_CMD_H ++ ++ ++/* virtual host info name commands */ ++ ++#define VCMD_set_vhi_name VC_CMD(VHOST, 1, 0) ++#define VCMD_get_vhi_name VC_CMD(VHOST, 2, 0) ++ ++struct vcmd_vhi_name_v0 { ++ uint32_t field; ++ char name[65]; ++}; ++ ++ ++enum vhi_name_field { ++ VHIN_CONTEXT = 0, ++ VHIN_SYSNAME, ++ VHIN_NODENAME, ++ VHIN_RELEASE, ++ VHIN_VERSION, ++ VHIN_MACHINE, ++ VHIN_DOMAINNAME, ++}; ++ ++ ++#ifdef __KERNEL__ ++ 
++#include ++ ++extern int vc_set_vhi_name(struct vx_info *, void __user *); ++extern int vc_get_vhi_name(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++#define VCMD_virt_stat VC_CMD(VSTAT, 3, 0) ++ ++struct vcmd_virt_stat_v0 { ++ uint64_t offset; ++ uint64_t uptime; ++ uint32_t nr_threads; ++ uint32_t nr_running; ++ uint32_t nr_uninterruptible; ++ uint32_t nr_onhold; ++ uint32_t nr_forks; ++ uint32_t load[3]; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_virt_stat(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CVIRT_CMD_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/cvirt_def.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/cvirt_def.h +--- linux-3.3.8/include/linux/vserver/cvirt_def.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/cvirt_def.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,80 @@ ++#ifndef _VX_CVIRT_DEF_H ++#define _VX_CVIRT_DEF_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++ ++struct _vx_usage_stat { ++ uint64_t user; ++ uint64_t nice; ++ uint64_t system; ++ uint64_t softirq; ++ uint64_t irq; ++ uint64_t idle; ++ uint64_t iowait; ++}; ++ ++struct _vx_syslog { ++ wait_queue_head_t log_wait; ++ spinlock_t logbuf_lock; /* lock for the log buffer */ ++ ++ unsigned long log_start; /* next char to be read by syslog() */ ++ unsigned long con_start; /* next char to be sent to consoles */ ++ unsigned long log_end; /* most-recently-written-char + 1 */ ++ unsigned long logged_chars; /* #chars since last read+clear operation */ ++ ++ char log_buf[1024]; ++}; ++ ++ ++/* context sub struct */ ++ ++struct _vx_cvirt { ++ atomic_t nr_threads; /* number of current threads */ ++ atomic_t nr_running; /* number of running threads */ ++ atomic_t nr_uninterruptible; /* number of uninterruptible threads */ ++ ++ atomic_t nr_onhold; /* processes on hold */ ++ uint32_t onhold_last; /* jiffies when put on hold */ ++ ++ struct timespec bias_ts; /* time offset to the host */ ++ struct timespec bias_idle; ++ struct timespec bias_uptime; /* context creation point */ ++ uint64_t bias_clock; /* offset in clock_t */ ++ ++ spinlock_t load_lock; /* lock for the load averages */ ++ atomic_t load_updates; /* nr of load updates done so far */ ++ uint32_t load_last; /* last time load was calculated */ ++ uint32_t load[3]; /* load averages 1,5,15 */ ++ ++ atomic_t total_forks; /* number of forks so far */ ++ ++ struct _vx_syslog syslog; ++}; ++ ++struct _vx_cvirt_pc { ++ struct _vx_usage_stat cpustat; ++}; ++ ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ ++static inline void __dump_vx_cvirt(struct _vx_cvirt *cvirt) ++{ ++ printk("\t_vx_cvirt:\n"); ++ printk("\t threads: %4d, %4d, %4d, %4d\n", ++ atomic_read(&cvirt->nr_threads), ++ atomic_read(&cvirt->nr_running), ++ atomic_read(&cvirt->nr_uninterruptible), ++ atomic_read(&cvirt->nr_onhold)); ++ /* add rest here */ ++ printk("\t total_forks = %d\n", atomic_read(&cvirt->total_forks)); ++} ++ ++#endif ++ ++#endif /* _VX_CVIRT_DEF_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/debug.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/debug.h +--- linux-3.3.8/include/linux/vserver/debug.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/debug.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,145 @@ ++#ifndef _VX_DEBUG_H ++#define _VX_DEBUG_H ++ ++ ++#define VXD_CBIT(n, m) (vs_debug_ ## n & (1 << (m))) ++#define VXD_CMIN(n, m) (vs_debug_ ## n > (m)) ++#define VXD_MASK(n, m) (vs_debug_ ## n & (m)) ++ ++#define 
VXD_DEV(d) (d), (d)->bd_inode->i_ino, \ ++ imajor((d)->bd_inode), iminor((d)->bd_inode) ++#define VXF_DEV "%p[%lu,%d:%d]" ++ ++#if defined(CONFIG_QUOTES_UTF8) ++#define VS_Q_LQM "\xc2\xbb" ++#define VS_Q_RQM "\xc2\xab" ++#elif defined(CONFIG_QUOTES_ASCII) ++#define VS_Q_LQM "\x27" ++#define VS_Q_RQM "\x27" ++#else ++#define VS_Q_LQM "\xbb" ++#define VS_Q_RQM "\xab" ++#endif ++ ++#define VS_Q(f) VS_Q_LQM f VS_Q_RQM ++ ++ ++#define vxd_path(p) \ ++ ({ static char _buffer[PATH_MAX]; \ ++ d_path(p, _buffer, sizeof(_buffer)); }) ++ ++#define vxd_cond_path(n) \ ++ ((n) ? vxd_path(&(n)->path) : "" ) ++ ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ ++extern unsigned int vs_debug_switch; ++extern unsigned int vs_debug_xid; ++extern unsigned int vs_debug_nid; ++extern unsigned int vs_debug_tag; ++extern unsigned int vs_debug_net; ++extern unsigned int vs_debug_limit; ++extern unsigned int vs_debug_cres; ++extern unsigned int vs_debug_dlim; ++extern unsigned int vs_debug_quota; ++extern unsigned int vs_debug_cvirt; ++extern unsigned int vs_debug_space; ++extern unsigned int vs_debug_perm; ++extern unsigned int vs_debug_misc; ++ ++ ++#define VX_LOGLEVEL "vxD: " ++#define VX_PROC_FMT "%p: " ++#define VX_PROCESS current ++ ++#define vxdprintk(c, f, x...) \ ++ do { \ ++ if (c) \ ++ printk(VX_LOGLEVEL VX_PROC_FMT f "\n", \ ++ VX_PROCESS , ##x); \ ++ } while (0) ++ ++#define vxlprintk(c, f, x...) \ ++ do { \ ++ if (c) \ ++ printk(VX_LOGLEVEL f " @%s:%d\n", x); \ ++ } while (0) ++ ++#define vxfprintk(c, f, x...) \ ++ do { \ ++ if (c) \ ++ printk(VX_LOGLEVEL f " %s@%s:%d\n", x); \ ++ } while (0) ++ ++ ++struct vx_info; ++ ++void dump_vx_info(struct vx_info *, int); ++void dump_vx_info_inactive(int); ++ ++#else /* CONFIG_VSERVER_DEBUG */ ++ ++#define vs_debug_switch 0 ++#define vs_debug_xid 0 ++#define vs_debug_nid 0 ++#define vs_debug_tag 0 ++#define vs_debug_net 0 ++#define vs_debug_limit 0 ++#define vs_debug_cres 0 ++#define vs_debug_dlim 0 ++#define vs_debug_quota 0 ++#define vs_debug_cvirt 0 ++#define vs_debug_space 0 ++#define vs_debug_perm 0 ++#define vs_debug_misc 0 ++ ++#define vxdprintk(x...) do { } while (0) ++#define vxlprintk(x...) do { } while (0) ++#define vxfprintk(x...) do { } while (0) ++ ++#endif /* CONFIG_VSERVER_DEBUG */ ++ ++ ++#ifdef CONFIG_VSERVER_WARN ++ ++#define VX_WARNLEVEL KERN_WARNING "vxW: " ++#define VX_WARN_TASK "[" VS_Q("%s") ",%u:#%u|%u|%u] " ++#define VX_WARN_XID "[xid #%u] " ++#define VX_WARN_NID "[nid #%u] " ++#define VX_WARN_TAG "[tag #%u] " ++ ++#define vxwprintk(c, f, x...) \ ++ do { \ ++ if (c) \ ++ printk(VX_WARNLEVEL f "\n", ##x); \ ++ } while (0) ++ ++#else /* CONFIG_VSERVER_WARN */ ++ ++#define vxwprintk(x...) do { } while (0) ++ ++#endif /* CONFIG_VSERVER_WARN */ ++ ++#define vxwprintk_task(c, f, x...) \ ++ vxwprintk(c, VX_WARN_TASK f, \ ++ current->comm, current->pid, \ ++ current->xid, current->nid, current->tag, ##x) ++#define vxwprintk_xid(c, f, x...) \ ++ vxwprintk(c, VX_WARN_XID f, current->xid, x) ++#define vxwprintk_nid(c, f, x...) \ ++ vxwprintk(c, VX_WARN_NID f, current->nid, x) ++#define vxwprintk_tag(c, f, x...) \ ++ vxwprintk(c, VX_WARN_TAG f, current->tag, x) ++ ++#ifdef CONFIG_VSERVER_DEBUG ++#define vxd_assert_lock(l) assert_spin_locked(l) ++#define vxd_assert(c, f, x...) vxlprintk(!(c), \ ++ "assertion [" f "] failed.", ##x, __FILE__, __LINE__) ++#else ++#define vxd_assert_lock(l) do { } while (0) ++#define vxd_assert(c, f, x...) 
do { } while (0) ++#endif ++ ++ ++#endif /* _VX_DEBUG_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/debug_cmd.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/debug_cmd.h +--- linux-3.3.8/include/linux/vserver/debug_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/debug_cmd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,58 @@ ++#ifndef _VX_DEBUG_CMD_H ++#define _VX_DEBUG_CMD_H ++ ++ ++/* debug commands */ ++ ++#define VCMD_dump_history VC_CMD(DEBUG, 1, 0) ++ ++#define VCMD_read_history VC_CMD(DEBUG, 5, 0) ++#define VCMD_read_monitor VC_CMD(DEBUG, 6, 0) ++ ++struct vcmd_read_history_v0 { ++ uint32_t index; ++ uint32_t count; ++ char __user *data; ++}; ++ ++struct vcmd_read_monitor_v0 { ++ uint32_t index; ++ uint32_t count; ++ char __user *data; ++}; ++ ++ ++#ifdef __KERNEL__ ++ ++#ifdef CONFIG_COMPAT ++ ++#include ++ ++struct vcmd_read_history_v0_x32 { ++ uint32_t index; ++ uint32_t count; ++ compat_uptr_t data_ptr; ++}; ++ ++struct vcmd_read_monitor_v0_x32 { ++ uint32_t index; ++ uint32_t count; ++ compat_uptr_t data_ptr; ++}; ++ ++#endif /* CONFIG_COMPAT */ ++ ++extern int vc_dump_history(uint32_t); ++ ++extern int vc_read_history(uint32_t, void __user *); ++extern int vc_read_monitor(uint32_t, void __user *); ++ ++#ifdef CONFIG_COMPAT ++ ++extern int vc_read_history_x32(uint32_t, void __user *); ++extern int vc_read_monitor_x32(uint32_t, void __user *); ++ ++#endif /* CONFIG_COMPAT */ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_DEBUG_CMD_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/device.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/device.h +--- linux-3.3.8/include/linux/vserver/device.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/device.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,15 @@ ++#ifndef _VX_DEVICE_H ++#define _VX_DEVICE_H ++ ++ ++#define DATTR_CREATE 0x00000001 ++#define DATTR_OPEN 0x00000002 ++ ++#define DATTR_REMAP 0x00000010 ++ ++#define DATTR_MASK 0x00000013 ++ ++ ++#else /* _VX_DEVICE_H */ ++#warning duplicate inclusion ++#endif /* _VX_DEVICE_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/device_cmd.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/device_cmd.h +--- linux-3.3.8/include/linux/vserver/device_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/device_cmd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,44 @@ ++#ifndef _VX_DEVICE_CMD_H ++#define _VX_DEVICE_CMD_H ++ ++ ++/* device vserver commands */ ++ ++#define VCMD_set_mapping VC_CMD(DEVICE, 1, 0) ++#define VCMD_unset_mapping VC_CMD(DEVICE, 2, 0) ++ ++struct vcmd_set_mapping_v0 { ++ const char __user *device; ++ const char __user *target; ++ uint32_t flags; ++}; ++ ++ ++#ifdef __KERNEL__ ++ ++#ifdef CONFIG_COMPAT ++ ++#include ++ ++struct vcmd_set_mapping_v0_x32 { ++ compat_uptr_t device_ptr; ++ compat_uptr_t target_ptr; ++ uint32_t flags; ++}; ++ ++#endif /* CONFIG_COMPAT */ ++ ++#include ++ ++extern int vc_set_mapping(struct vx_info *, void __user *); ++extern int vc_unset_mapping(struct vx_info *, void __user *); ++ ++#ifdef CONFIG_COMPAT ++ ++extern int vc_set_mapping_x32(struct vx_info *, void __user *); ++extern int vc_unset_mapping_x32(struct vx_info *, void __user *); ++ ++#endif /* CONFIG_COMPAT */ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_DEVICE_CMD_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/device_def.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/device_def.h +--- 
linux-3.3.8/include/linux/vserver/device_def.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/device_def.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,17 @@ ++#ifndef _VX_DEVICE_DEF_H ++#define _VX_DEVICE_DEF_H ++ ++#include ++ ++struct vx_dmap_target { ++ dev_t target; ++ uint32_t flags; ++}; ++ ++struct _vx_device { ++#ifdef CONFIG_VSERVER_DEVICE ++ struct vx_dmap_target targets[2]; ++#endif ++}; ++ ++#endif /* _VX_DEVICE_DEF_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/dlimit.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/dlimit.h +--- linux-3.3.8/include/linux/vserver/dlimit.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/dlimit.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,54 @@ ++#ifndef _VX_DLIMIT_H ++#define _VX_DLIMIT_H ++ ++#include "switch.h" ++ ++ ++#ifdef __KERNEL__ ++ ++/* keep in sync with CDLIM_INFINITY */ ++ ++#define DLIM_INFINITY (~0ULL) ++ ++#include ++#include ++ ++struct super_block; ++ ++struct dl_info { ++ struct hlist_node dl_hlist; /* linked list of contexts */ ++ struct rcu_head dl_rcu; /* the rcu head */ ++ tag_t dl_tag; /* context tag */ ++ atomic_t dl_usecnt; /* usage count */ ++ atomic_t dl_refcnt; /* reference count */ ++ ++ struct super_block *dl_sb; /* associated superblock */ ++ ++ spinlock_t dl_lock; /* protect the values */ ++ ++ unsigned long long dl_space_used; /* used space in bytes */ ++ unsigned long long dl_space_total; /* maximum space in bytes */ ++ unsigned long dl_inodes_used; /* used inodes */ ++ unsigned long dl_inodes_total; /* maximum inodes */ ++ ++ unsigned int dl_nrlmult; /* non root limit mult */ ++}; ++ ++struct rcu_head; ++ ++extern void rcu_free_dl_info(struct rcu_head *); ++extern void unhash_dl_info(struct dl_info *); ++ ++extern struct dl_info *locate_dl_info(struct super_block *, tag_t); ++ ++ ++struct kstatfs; ++ ++extern void vx_vsi_statfs(struct super_block *, struct kstatfs *); ++ ++typedef uint64_t dlsize_t; ++ ++#endif /* __KERNEL__ */ ++#else /* _VX_DLIMIT_H */ ++#warning duplicate inclusion ++#endif /* _VX_DLIMIT_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/dlimit_cmd.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/dlimit_cmd.h +--- linux-3.3.8/include/linux/vserver/dlimit_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/dlimit_cmd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,109 @@ ++#ifndef _VX_DLIMIT_CMD_H ++#define _VX_DLIMIT_CMD_H ++ ++ ++/* dlimit vserver commands */ ++ ++#define VCMD_add_dlimit VC_CMD(DLIMIT, 1, 0) ++#define VCMD_rem_dlimit VC_CMD(DLIMIT, 2, 0) ++ ++#define VCMD_set_dlimit VC_CMD(DLIMIT, 5, 0) ++#define VCMD_get_dlimit VC_CMD(DLIMIT, 6, 0) ++ ++struct vcmd_ctx_dlimit_base_v0 { ++ const char __user *name; ++ uint32_t flags; ++}; ++ ++struct vcmd_ctx_dlimit_v0 { ++ const char __user *name; ++ uint32_t space_used; /* used space in kbytes */ ++ uint32_t space_total; /* maximum space in kbytes */ ++ uint32_t inodes_used; /* used inodes */ ++ uint32_t inodes_total; /* maximum inodes */ ++ uint32_t reserved; /* reserved for root in % */ ++ uint32_t flags; ++}; ++ ++#define CDLIM_UNSET ((uint32_t)0UL) ++#define CDLIM_INFINITY ((uint32_t)~0UL) ++#define CDLIM_KEEP ((uint32_t)~1UL) ++ ++#define DLIME_UNIT 0 ++#define DLIME_KILO 1 ++#define DLIME_MEGA 2 ++#define DLIME_GIGA 3 ++ ++#define DLIMF_SHIFT 0x10 ++ ++#define DLIMS_USED 0 ++#define DLIMS_TOTAL 2 ++ ++static inline ++uint64_t dlimit_space_32to64(uint32_t val, uint32_t flags, int shift) ++{ 
++ int exp = (flags & DLIMF_SHIFT) ? ++ (flags >> shift) & DLIME_GIGA : DLIME_KILO; ++ return ((uint64_t)val) << (10 * exp); ++} ++ ++static inline ++uint32_t dlimit_space_64to32(uint64_t val, uint32_t *flags, int shift) ++{ ++ int exp = 0; ++ ++ if (*flags & DLIMF_SHIFT) { ++ while (val > (1LL << 32) && (exp < 3)) { ++ val >>= 10; ++ exp++; ++ } ++ *flags &= ~(DLIME_GIGA << shift); ++ *flags |= exp << shift; ++ } else ++ val >>= 10; ++ return val; ++} ++ ++#ifdef __KERNEL__ ++ ++#ifdef CONFIG_COMPAT ++ ++#include ++ ++struct vcmd_ctx_dlimit_base_v0_x32 { ++ compat_uptr_t name_ptr; ++ uint32_t flags; ++}; ++ ++struct vcmd_ctx_dlimit_v0_x32 { ++ compat_uptr_t name_ptr; ++ uint32_t space_used; /* used space in kbytes */ ++ uint32_t space_total; /* maximum space in kbytes */ ++ uint32_t inodes_used; /* used inodes */ ++ uint32_t inodes_total; /* maximum inodes */ ++ uint32_t reserved; /* reserved for root in % */ ++ uint32_t flags; ++}; ++ ++#endif /* CONFIG_COMPAT */ ++ ++#include ++ ++extern int vc_add_dlimit(uint32_t, void __user *); ++extern int vc_rem_dlimit(uint32_t, void __user *); ++ ++extern int vc_set_dlimit(uint32_t, void __user *); ++extern int vc_get_dlimit(uint32_t, void __user *); ++ ++#ifdef CONFIG_COMPAT ++ ++extern int vc_add_dlimit_x32(uint32_t, void __user *); ++extern int vc_rem_dlimit_x32(uint32_t, void __user *); ++ ++extern int vc_set_dlimit_x32(uint32_t, void __user *); ++extern int vc_get_dlimit_x32(uint32_t, void __user *); ++ ++#endif /* CONFIG_COMPAT */ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_DLIMIT_CMD_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/global.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/global.h +--- linux-3.3.8/include/linux/vserver/global.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/global.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,19 @@ ++#ifndef _VX_GLOBAL_H ++#define _VX_GLOBAL_H ++ ++ ++extern atomic_t vx_global_ctotal; ++extern atomic_t vx_global_cactive; ++ ++extern atomic_t nx_global_ctotal; ++extern atomic_t nx_global_cactive; ++ ++extern atomic_t vs_global_nsproxy; ++extern atomic_t vs_global_fs; ++extern atomic_t vs_global_mnt_ns; ++extern atomic_t vs_global_uts_ns; ++extern atomic_t vs_global_user_ns; ++extern atomic_t vs_global_pid_ns; ++ ++ ++#endif /* _VX_GLOBAL_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/history.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/history.h +--- linux-3.3.8/include/linux/vserver/history.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/history.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,197 @@ ++#ifndef _VX_HISTORY_H ++#define _VX_HISTORY_H ++ ++ ++enum { ++ VXH_UNUSED = 0, ++ VXH_THROW_OOPS = 1, ++ ++ VXH_GET_VX_INFO, ++ VXH_PUT_VX_INFO, ++ VXH_INIT_VX_INFO, ++ VXH_SET_VX_INFO, ++ VXH_CLR_VX_INFO, ++ VXH_CLAIM_VX_INFO, ++ VXH_RELEASE_VX_INFO, ++ VXH_ALLOC_VX_INFO, ++ VXH_DEALLOC_VX_INFO, ++ VXH_HASH_VX_INFO, ++ VXH_UNHASH_VX_INFO, ++ VXH_LOC_VX_INFO, ++ VXH_LOOKUP_VX_INFO, ++ VXH_CREATE_VX_INFO, ++}; ++ ++struct _vxhe_vxi { ++ struct vx_info *ptr; ++ unsigned xid; ++ unsigned usecnt; ++ unsigned tasks; ++}; ++ ++struct _vxhe_set_clr { ++ void *data; ++}; ++ ++struct _vxhe_loc_lookup { ++ unsigned arg; ++}; ++ ++struct _vx_hist_entry { ++ void *loc; ++ unsigned short seq; ++ unsigned short type; ++ struct _vxhe_vxi vxi; ++ union { ++ struct _vxhe_set_clr sc; ++ struct _vxhe_loc_lookup ll; ++ }; ++}; ++ ++#ifdef CONFIG_VSERVER_HISTORY ++ ++extern unsigned volatile int vxh_active; 
++ ++struct _vx_hist_entry *vxh_advance(void *loc); ++ ++ ++static inline ++void __vxh_copy_vxi(struct _vx_hist_entry *entry, struct vx_info *vxi) ++{ ++ entry->vxi.ptr = vxi; ++ if (vxi) { ++ entry->vxi.usecnt = atomic_read(&vxi->vx_usecnt); ++ entry->vxi.tasks = atomic_read(&vxi->vx_tasks); ++ entry->vxi.xid = vxi->vx_id; ++ } ++} ++ ++ ++#define __HERE__ current_text_addr() ++ ++#define __VXH_BODY(__type, __data, __here) \ ++ struct _vx_hist_entry *entry; \ ++ \ ++ preempt_disable(); \ ++ entry = vxh_advance(__here); \ ++ __data; \ ++ entry->type = __type; \ ++ preempt_enable(); ++ ++ ++ /* pass vxi only */ ++ ++#define __VXH_SMPL \ ++ __vxh_copy_vxi(entry, vxi) ++ ++static inline ++void __vxh_smpl(struct vx_info *vxi, int __type, void *__here) ++{ ++ __VXH_BODY(__type, __VXH_SMPL, __here) ++} ++ ++ /* pass vxi and data (void *) */ ++ ++#define __VXH_DATA \ ++ __vxh_copy_vxi(entry, vxi); \ ++ entry->sc.data = data ++ ++static inline ++void __vxh_data(struct vx_info *vxi, void *data, ++ int __type, void *__here) ++{ ++ __VXH_BODY(__type, __VXH_DATA, __here) ++} ++ ++ /* pass vxi and arg (long) */ ++ ++#define __VXH_LONG \ ++ __vxh_copy_vxi(entry, vxi); \ ++ entry->ll.arg = arg ++ ++static inline ++void __vxh_long(struct vx_info *vxi, long arg, ++ int __type, void *__here) ++{ ++ __VXH_BODY(__type, __VXH_LONG, __here) ++} ++ ++ ++static inline ++void __vxh_throw_oops(void *__here) ++{ ++ __VXH_BODY(VXH_THROW_OOPS, {}, __here); ++ /* prevent further acquisition */ ++ vxh_active = 0; ++} ++ ++ ++#define vxh_throw_oops() __vxh_throw_oops(__HERE__); ++ ++#define __vxh_get_vx_info(v, h) __vxh_smpl(v, VXH_GET_VX_INFO, h); ++#define __vxh_put_vx_info(v, h) __vxh_smpl(v, VXH_PUT_VX_INFO, h); ++ ++#define __vxh_init_vx_info(v, d, h) \ ++ __vxh_data(v, d, VXH_INIT_VX_INFO, h); ++#define __vxh_set_vx_info(v, d, h) \ ++ __vxh_data(v, d, VXH_SET_VX_INFO, h); ++#define __vxh_clr_vx_info(v, d, h) \ ++ __vxh_data(v, d, VXH_CLR_VX_INFO, h); ++ ++#define __vxh_claim_vx_info(v, d, h) \ ++ __vxh_data(v, d, VXH_CLAIM_VX_INFO, h); ++#define __vxh_release_vx_info(v, d, h) \ ++ __vxh_data(v, d, VXH_RELEASE_VX_INFO, h); ++ ++#define vxh_alloc_vx_info(v) \ ++ __vxh_smpl(v, VXH_ALLOC_VX_INFO, __HERE__); ++#define vxh_dealloc_vx_info(v) \ ++ __vxh_smpl(v, VXH_DEALLOC_VX_INFO, __HERE__); ++ ++#define vxh_hash_vx_info(v) \ ++ __vxh_smpl(v, VXH_HASH_VX_INFO, __HERE__); ++#define vxh_unhash_vx_info(v) \ ++ __vxh_smpl(v, VXH_UNHASH_VX_INFO, __HERE__); ++ ++#define vxh_loc_vx_info(v, l) \ ++ __vxh_long(v, l, VXH_LOC_VX_INFO, __HERE__); ++#define vxh_lookup_vx_info(v, l) \ ++ __vxh_long(v, l, VXH_LOOKUP_VX_INFO, __HERE__); ++#define vxh_create_vx_info(v, l) \ ++ __vxh_long(v, l, VXH_CREATE_VX_INFO, __HERE__); ++ ++extern void vxh_dump_history(void); ++ ++ ++#else /* CONFIG_VSERVER_HISTORY */ ++ ++#define __HERE__ 0 ++ ++#define vxh_throw_oops() do { } while (0) ++ ++#define __vxh_get_vx_info(v, h) do { } while (0) ++#define __vxh_put_vx_info(v, h) do { } while (0) ++ ++#define __vxh_init_vx_info(v, d, h) do { } while (0) ++#define __vxh_set_vx_info(v, d, h) do { } while (0) ++#define __vxh_clr_vx_info(v, d, h) do { } while (0) ++ ++#define __vxh_claim_vx_info(v, d, h) do { } while (0) ++#define __vxh_release_vx_info(v, d, h) do { } while (0) ++ ++#define vxh_alloc_vx_info(v) do { } while (0) ++#define vxh_dealloc_vx_info(v) do { } while (0) ++ ++#define vxh_hash_vx_info(v) do { } while (0) ++#define vxh_unhash_vx_info(v) do { } while (0) ++ ++#define vxh_loc_vx_info(v, l) do { } while (0) ++#define vxh_lookup_vx_info(v, l) 
do { } while (0) ++#define vxh_create_vx_info(v, l) do { } while (0) ++ ++#define vxh_dump_history() do { } while (0) ++ ++ ++#endif /* CONFIG_VSERVER_HISTORY */ ++ ++#endif /* _VX_HISTORY_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/inode.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/inode.h +--- linux-3.3.8/include/linux/vserver/inode.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/inode.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,39 @@ ++#ifndef _VX_INODE_H ++#define _VX_INODE_H ++ ++ ++#define IATTR_TAG 0x01000000 ++ ++#define IATTR_ADMIN 0x00000001 ++#define IATTR_WATCH 0x00000002 ++#define IATTR_HIDE 0x00000004 ++#define IATTR_FLAGS 0x00000007 ++ ++#define IATTR_BARRIER 0x00010000 ++#define IATTR_IXUNLINK 0x00020000 ++#define IATTR_IMMUTABLE 0x00040000 ++#define IATTR_COW 0x00080000 ++ ++#ifdef __KERNEL__ ++ ++ ++#ifdef CONFIG_VSERVER_PROC_SECURE ++#define IATTR_PROC_DEFAULT ( IATTR_ADMIN | IATTR_HIDE ) ++#define IATTR_PROC_SYMLINK ( IATTR_ADMIN ) ++#else ++#define IATTR_PROC_DEFAULT ( IATTR_ADMIN ) ++#define IATTR_PROC_SYMLINK ( IATTR_ADMIN ) ++#endif ++ ++#define vx_hide_check(c, m) (((m) & IATTR_HIDE) ? vx_check(c, m) : 1) ++ ++#endif /* __KERNEL__ */ ++ ++/* inode ioctls */ ++ ++#define FIOC_GETXFLG _IOR('x', 5, long) ++#define FIOC_SETXFLG _IOW('x', 6, long) ++ ++#else /* _VX_INODE_H */ ++#warning duplicate inclusion ++#endif /* _VX_INODE_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/inode_cmd.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/inode_cmd.h +--- linux-3.3.8/include/linux/vserver/inode_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/inode_cmd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,59 @@ ++#ifndef _VX_INODE_CMD_H ++#define _VX_INODE_CMD_H ++ ++ ++/* inode vserver commands */ ++ ++#define VCMD_get_iattr VC_CMD(INODE, 1, 1) ++#define VCMD_set_iattr VC_CMD(INODE, 2, 1) ++ ++#define VCMD_fget_iattr VC_CMD(INODE, 3, 0) ++#define VCMD_fset_iattr VC_CMD(INODE, 4, 0) ++ ++struct vcmd_ctx_iattr_v1 { ++ const char __user *name; ++ uint32_t tag; ++ uint32_t flags; ++ uint32_t mask; ++}; ++ ++struct vcmd_ctx_fiattr_v0 { ++ uint32_t tag; ++ uint32_t flags; ++ uint32_t mask; ++}; ++ ++ ++#ifdef __KERNEL__ ++ ++ ++#ifdef CONFIG_COMPAT ++ ++#include ++ ++struct vcmd_ctx_iattr_v1_x32 { ++ compat_uptr_t name_ptr; ++ uint32_t tag; ++ uint32_t flags; ++ uint32_t mask; ++}; ++ ++#endif /* CONFIG_COMPAT */ ++ ++#include ++ ++extern int vc_get_iattr(void __user *); ++extern int vc_set_iattr(void __user *); ++ ++extern int vc_fget_iattr(uint32_t, void __user *); ++extern int vc_fset_iattr(uint32_t, void __user *); ++ ++#ifdef CONFIG_COMPAT ++ ++extern int vc_get_iattr_x32(void __user *); ++extern int vc_set_iattr_x32(void __user *); ++ ++#endif /* CONFIG_COMPAT */ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_INODE_CMD_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/limit.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/limit.h +--- linux-3.3.8/include/linux/vserver/limit.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/limit.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,71 @@ ++#ifndef _VX_LIMIT_H ++#define _VX_LIMIT_H ++ ++#define VLIMIT_NSOCK 16 ++#define VLIMIT_OPENFD 17 ++#define VLIMIT_ANON 18 ++#define VLIMIT_SHMEM 19 ++#define VLIMIT_SEMARY 20 ++#define VLIMIT_NSEMS 21 ++#define VLIMIT_DENTRY 22 ++#define VLIMIT_MAPPED 23 ++ ++ ++#ifdef __KERNEL__ ++ ++#define VLIM_NOCHECK ((1L << VLIMIT_DENTRY) | (1L << 
RLIMIT_RSS)) ++ ++/* keep in sync with CRLIM_INFINITY */ ++ ++#define VLIM_INFINITY (~0ULL) ++ ++#include ++#include ++ ++#ifndef RLIM_INFINITY ++#warning RLIM_INFINITY is undefined ++#endif ++ ++#define __rlim_val(l, r, v) ((l)->res[r].v) ++ ++#define __rlim_soft(l, r) __rlim_val(l, r, soft) ++#define __rlim_hard(l, r) __rlim_val(l, r, hard) ++ ++#define __rlim_rcur(l, r) __rlim_val(l, r, rcur) ++#define __rlim_rmin(l, r) __rlim_val(l, r, rmin) ++#define __rlim_rmax(l, r) __rlim_val(l, r, rmax) ++ ++#define __rlim_lhit(l, r) __rlim_val(l, r, lhit) ++#define __rlim_hit(l, r) atomic_inc(&__rlim_lhit(l, r)) ++ ++typedef atomic_long_t rlim_atomic_t; ++typedef unsigned long rlim_t; ++ ++#define __rlim_get(l, r) atomic_long_read(&__rlim_rcur(l, r)) ++#define __rlim_set(l, r, v) atomic_long_set(&__rlim_rcur(l, r), v) ++#define __rlim_inc(l, r) atomic_long_inc(&__rlim_rcur(l, r)) ++#define __rlim_dec(l, r) atomic_long_dec(&__rlim_rcur(l, r)) ++#define __rlim_add(l, r, v) atomic_long_add(v, &__rlim_rcur(l, r)) ++#define __rlim_sub(l, r, v) atomic_long_sub(v, &__rlim_rcur(l, r)) ++ ++ ++#if (RLIM_INFINITY == VLIM_INFINITY) ++#define VX_VLIM(r) ((long long)(long)(r)) ++#define VX_RLIM(v) ((rlim_t)(v)) ++#else ++#define VX_VLIM(r) (((r) == RLIM_INFINITY) \ ++ ? VLIM_INFINITY : (long long)(r)) ++#define VX_RLIM(v) (((v) == VLIM_INFINITY) \ ++ ? RLIM_INFINITY : (rlim_t)(v)) ++#endif ++ ++struct sysinfo; ++ ++void vx_vsi_meminfo(struct sysinfo *); ++void vx_vsi_swapinfo(struct sysinfo *); ++long vx_vsi_cached(struct sysinfo *); ++ ++#define NUM_LIMITS 24 ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_LIMIT_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/limit_cmd.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/limit_cmd.h +--- linux-3.3.8/include/linux/vserver/limit_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/limit_cmd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,71 @@ ++#ifndef _VX_LIMIT_CMD_H ++#define _VX_LIMIT_CMD_H ++ ++ ++/* rlimit vserver commands */ ++ ++#define VCMD_get_rlimit VC_CMD(RLIMIT, 1, 0) ++#define VCMD_set_rlimit VC_CMD(RLIMIT, 2, 0) ++#define VCMD_get_rlimit_mask VC_CMD(RLIMIT, 3, 0) ++#define VCMD_reset_hits VC_CMD(RLIMIT, 7, 0) ++#define VCMD_reset_minmax VC_CMD(RLIMIT, 9, 0) ++ ++struct vcmd_ctx_rlimit_v0 { ++ uint32_t id; ++ uint64_t minimum; ++ uint64_t softlimit; ++ uint64_t maximum; ++}; ++ ++struct vcmd_ctx_rlimit_mask_v0 { ++ uint32_t minimum; ++ uint32_t softlimit; ++ uint32_t maximum; ++}; ++ ++#define VCMD_rlimit_stat VC_CMD(VSTAT, 1, 0) ++ ++struct vcmd_rlimit_stat_v0 { ++ uint32_t id; ++ uint32_t hits; ++ uint64_t value; ++ uint64_t minimum; ++ uint64_t maximum; ++}; ++ ++#define CRLIM_UNSET (0ULL) ++#define CRLIM_INFINITY (~0ULL) ++#define CRLIM_KEEP (~1ULL) ++ ++#ifdef __KERNEL__ ++ ++#ifdef CONFIG_IA32_EMULATION ++ ++struct vcmd_ctx_rlimit_v0_x32 { ++ uint32_t id; ++ uint64_t minimum; ++ uint64_t softlimit; ++ uint64_t maximum; ++} __attribute__ ((packed)); ++ ++#endif /* CONFIG_IA32_EMULATION */ ++ ++#include ++ ++extern int vc_get_rlimit_mask(uint32_t, void __user *); ++extern int vc_get_rlimit(struct vx_info *, void __user *); ++extern int vc_set_rlimit(struct vx_info *, void __user *); ++extern int vc_reset_hits(struct vx_info *, void __user *); ++extern int vc_reset_minmax(struct vx_info *, void __user *); ++ ++extern int vc_rlimit_stat(struct vx_info *, void __user *); ++ ++#ifdef CONFIG_IA32_EMULATION ++ ++extern int vc_get_rlimit_x32(struct vx_info *, void __user *); ++extern int 
vc_set_rlimit_x32(struct vx_info *, void __user *); ++ ++#endif /* CONFIG_IA32_EMULATION */ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_LIMIT_CMD_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/limit_def.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/limit_def.h +--- linux-3.3.8/include/linux/vserver/limit_def.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/limit_def.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,47 @@ ++#ifndef _VX_LIMIT_DEF_H ++#define _VX_LIMIT_DEF_H ++ ++#include ++#include ++ ++#include "limit.h" ++ ++ ++struct _vx_res_limit { ++ rlim_t soft; /* Context soft limit */ ++ rlim_t hard; /* Context hard limit */ ++ ++ rlim_atomic_t rcur; /* Current value */ ++ rlim_t rmin; /* Context minimum */ ++ rlim_t rmax; /* Context maximum */ ++ ++ atomic_t lhit; /* Limit hits */ ++}; ++ ++/* context sub struct */ ++ ++struct _vx_limit { ++ struct _vx_res_limit res[NUM_LIMITS]; ++}; ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ ++static inline void __dump_vx_limit(struct _vx_limit *limit) ++{ ++ int i; ++ ++ printk("\t_vx_limit:"); ++ for (i = 0; i < NUM_LIMITS; i++) { ++ printk("\t [%2d] = %8lu %8lu/%8lu, %8ld/%8ld, %8d\n", ++ i, (unsigned long)__rlim_get(limit, i), ++ (unsigned long)__rlim_rmin(limit, i), ++ (unsigned long)__rlim_rmax(limit, i), ++ (long)__rlim_soft(limit, i), ++ (long)__rlim_hard(limit, i), ++ atomic_read(&__rlim_lhit(limit, i))); ++ } ++} ++ ++#endif ++ ++#endif /* _VX_LIMIT_DEF_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/limit_int.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/limit_int.h +--- linux-3.3.8/include/linux/vserver/limit_int.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/limit_int.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,198 @@ ++#ifndef _VX_LIMIT_INT_H ++#define _VX_LIMIT_INT_H ++ ++#include "context.h" ++ ++#ifdef __KERNEL__ ++ ++#define VXD_RCRES_COND(r) VXD_CBIT(cres, r) ++#define VXD_RLIMIT_COND(r) VXD_CBIT(limit, r) ++ ++extern const char *vlimit_name[NUM_LIMITS]; ++ ++static inline void __vx_acc_cres(struct vx_info *vxi, ++ int res, int dir, void *_data, char *_file, int _line) ++{ ++ if (VXD_RCRES_COND(res)) ++ vxlprintk(1, "vx_acc_cres[%5d,%s,%2d]: %5ld%s (%p)", ++ (vxi ? vxi->vx_id : -1), vlimit_name[res], res, ++ (vxi ? (long)__rlim_get(&vxi->limit, res) : 0), ++ (dir > 0) ? "++" : "--", _data, _file, _line); ++ if (!vxi) ++ return; ++ ++ if (dir > 0) ++ __rlim_inc(&vxi->limit, res); ++ else ++ __rlim_dec(&vxi->limit, res); ++} ++ ++static inline void __vx_add_cres(struct vx_info *vxi, ++ int res, int amount, void *_data, char *_file, int _line) ++{ ++ if (VXD_RCRES_COND(res)) ++ vxlprintk(1, "vx_add_cres[%5d,%s,%2d]: %5ld += %5d (%p)", ++ (vxi ? vxi->vx_id : -1), vlimit_name[res], res, ++ (vxi ? 
(long)__rlim_get(&vxi->limit, res) : 0), ++ amount, _data, _file, _line); ++ if (amount == 0) ++ return; ++ if (!vxi) ++ return; ++ __rlim_add(&vxi->limit, res, amount); ++} ++ ++static inline ++int __vx_cres_adjust_max(struct _vx_limit *limit, int res, rlim_t value) ++{ ++ int cond = (value > __rlim_rmax(limit, res)); ++ ++ if (cond) ++ __rlim_rmax(limit, res) = value; ++ return cond; ++} ++ ++static inline ++int __vx_cres_adjust_min(struct _vx_limit *limit, int res, rlim_t value) ++{ ++ int cond = (value < __rlim_rmin(limit, res)); ++ ++ if (cond) ++ __rlim_rmin(limit, res) = value; ++ return cond; ++} ++ ++static inline ++void __vx_cres_fixup(struct _vx_limit *limit, int res, rlim_t value) ++{ ++ if (!__vx_cres_adjust_max(limit, res, value)) ++ __vx_cres_adjust_min(limit, res, value); ++} ++ ++ ++/* return values: ++ +1 ... no limit hit ++ -1 ... over soft limit ++ 0 ... over hard limit */ ++ ++static inline int __vx_cres_avail(struct vx_info *vxi, ++ int res, int num, char *_file, int _line) ++{ ++ struct _vx_limit *limit; ++ rlim_t value; ++ ++ if (VXD_RLIMIT_COND(res)) ++ vxlprintk(1, "vx_cres_avail[%5d,%s,%2d]: %5ld/%5ld > %5ld + %5d", ++ (vxi ? vxi->vx_id : -1), vlimit_name[res], res, ++ (vxi ? (long)__rlim_soft(&vxi->limit, res) : -1), ++ (vxi ? (long)__rlim_hard(&vxi->limit, res) : -1), ++ (vxi ? (long)__rlim_get(&vxi->limit, res) : 0), ++ num, _file, _line); ++ if (!vxi) ++ return 1; ++ ++ limit = &vxi->limit; ++ value = __rlim_get(limit, res); ++ ++ if (!__vx_cres_adjust_max(limit, res, value)) ++ __vx_cres_adjust_min(limit, res, value); ++ ++ if (num == 0) ++ return 1; ++ ++ if (__rlim_soft(limit, res) == RLIM_INFINITY) ++ return -1; ++ if (value + num <= __rlim_soft(limit, res)) ++ return -1; ++ ++ if (__rlim_hard(limit, res) == RLIM_INFINITY) ++ return 1; ++ if (value + num <= __rlim_hard(limit, res)) ++ return 1; ++ ++ __rlim_hit(limit, res); ++ return 0; ++} ++ ++ ++static const int VLA_RSS[] = { RLIMIT_RSS, VLIMIT_ANON, VLIMIT_MAPPED, 0 }; ++ ++static inline ++rlim_t __vx_cres_array_sum(struct _vx_limit *limit, const int *array) ++{ ++ rlim_t value, sum = 0; ++ int res; ++ ++ while ((res = *array++)) { ++ value = __rlim_get(limit, res); ++ __vx_cres_fixup(limit, res, value); ++ sum += value; ++ } ++ return sum; ++} ++ ++static inline ++rlim_t __vx_cres_array_fixup(struct _vx_limit *limit, const int *array) ++{ ++ rlim_t value = __vx_cres_array_sum(limit, array + 1); ++ int res = *array; ++ ++ if (value == __rlim_get(limit, res)) ++ return value; ++ ++ __rlim_set(limit, res, value); ++ /* now adjust min/max */ ++ if (!__vx_cres_adjust_max(limit, res, value)) ++ __vx_cres_adjust_min(limit, res, value); ++ ++ return value; ++} ++ ++static inline int __vx_cres_array_avail(struct vx_info *vxi, ++ const int *array, int num, char *_file, int _line) ++{ ++ struct _vx_limit *limit; ++ rlim_t value = 0; ++ int res; ++ ++ if (num == 0) ++ return 1; ++ if (!vxi) ++ return 1; ++ ++ limit = &vxi->limit; ++ res = *array; ++ value = __vx_cres_array_sum(limit, array + 1); ++ ++ __rlim_set(limit, res, value); ++ __vx_cres_fixup(limit, res, value); ++ ++ return __vx_cres_avail(vxi, res, num, _file, _line); ++} ++ ++ ++static inline void vx_limit_fixup(struct _vx_limit *limit, int id) ++{ ++ rlim_t value; ++ int res; ++ ++ /* complex resources first */ ++ if ((id < 0) || (id == RLIMIT_RSS)) ++ __vx_cres_array_fixup(limit, VLA_RSS); ++ ++ for (res = 0; res < NUM_LIMITS; res++) { ++ if ((id > 0) && (res != id)) ++ continue; ++ ++ value = __rlim_get(limit, res); ++ __vx_cres_fixup(limit, res, 
value); ++ ++ /* not supposed to happen, maybe warn? */ ++ if (__rlim_rmax(limit, res) > __rlim_hard(limit, res)) ++ __rlim_rmax(limit, res) = __rlim_hard(limit, res); ++ } ++} ++ ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_LIMIT_INT_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/monitor.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/monitor.h +--- linux-3.3.8/include/linux/vserver/monitor.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/monitor.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,96 @@ ++#ifndef _VX_MONITOR_H ++#define _VX_MONITOR_H ++ ++#include ++ ++enum { ++ VXM_UNUSED = 0, ++ ++ VXM_SYNC = 0x10, ++ ++ VXM_UPDATE = 0x20, ++ VXM_UPDATE_1, ++ VXM_UPDATE_2, ++ ++ VXM_RQINFO_1 = 0x24, ++ VXM_RQINFO_2, ++ ++ VXM_ACTIVATE = 0x40, ++ VXM_DEACTIVATE, ++ VXM_IDLE, ++ ++ VXM_HOLD = 0x44, ++ VXM_UNHOLD, ++ ++ VXM_MIGRATE = 0x48, ++ VXM_RESCHED, ++ ++ /* all other bits are flags */ ++ VXM_SCHED = 0x80, ++}; ++ ++struct _vxm_update_1 { ++ uint32_t tokens_max; ++ uint32_t fill_rate; ++ uint32_t interval; ++}; ++ ++struct _vxm_update_2 { ++ uint32_t tokens_min; ++ uint32_t fill_rate; ++ uint32_t interval; ++}; ++ ++struct _vxm_rqinfo_1 { ++ uint16_t running; ++ uint16_t onhold; ++ uint16_t iowait; ++ uint16_t uintr; ++ uint32_t idle_tokens; ++}; ++ ++struct _vxm_rqinfo_2 { ++ uint32_t norm_time; ++ uint32_t idle_time; ++ uint32_t idle_skip; ++}; ++ ++struct _vxm_sched { ++ uint32_t tokens; ++ uint32_t norm_time; ++ uint32_t idle_time; ++}; ++ ++struct _vxm_task { ++ uint16_t pid; ++ uint16_t state; ++}; ++ ++struct _vxm_event { ++ uint32_t jif; ++ union { ++ uint32_t seq; ++ uint32_t sec; ++ }; ++ union { ++ uint32_t tokens; ++ uint32_t nsec; ++ struct _vxm_task tsk; ++ }; ++}; ++ ++struct _vx_mon_entry { ++ uint16_t type; ++ uint16_t xid; ++ union { ++ struct _vxm_event ev; ++ struct _vxm_sched sd; ++ struct _vxm_update_1 u1; ++ struct _vxm_update_2 u2; ++ struct _vxm_rqinfo_1 q1; ++ struct _vxm_rqinfo_2 q2; ++ }; ++}; ++ ++ ++#endif /* _VX_MONITOR_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/network.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/network.h +--- linux-3.3.8/include/linux/vserver/network.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/network.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,148 @@ ++#ifndef _VX_NETWORK_H ++#define _VX_NETWORK_H ++ ++#include ++ ++ ++#define MAX_N_CONTEXT 65535 /* Arbitrary limit */ ++ ++ ++/* network flags */ ++ ++#define NXF_INFO_PRIVATE 0x00000008 ++ ++#define NXF_SINGLE_IP 0x00000100 ++#define NXF_LBACK_REMAP 0x00000200 ++#define NXF_LBACK_ALLOW 0x00000400 ++ ++#define NXF_HIDE_NETIF 0x02000000 ++#define NXF_HIDE_LBACK 0x04000000 ++ ++#define NXF_STATE_SETUP (1ULL << 32) ++#define NXF_STATE_ADMIN (1ULL << 34) ++ ++#define NXF_SC_HELPER (1ULL << 36) ++#define NXF_PERSISTENT (1ULL << 38) ++ ++#define NXF_ONE_TIME (0x0005ULL << 32) ++ ++ ++#define NXF_INIT_SET (__nxf_init_set()) ++ ++static inline uint64_t __nxf_init_set(void) { ++ return NXF_STATE_ADMIN ++#ifdef CONFIG_VSERVER_AUTO_LBACK ++ | NXF_LBACK_REMAP ++ | NXF_HIDE_LBACK ++#endif ++#ifdef CONFIG_VSERVER_AUTO_SINGLE ++ | NXF_SINGLE_IP ++#endif ++ | NXF_HIDE_NETIF; ++} ++ ++ ++/* network caps */ ++ ++#define NXC_TUN_CREATE 0x00000001 ++ ++#define NXC_RAW_ICMP 0x00000100 ++ ++#define NXC_MULTICAST 0x00001000 ++ ++ ++/* address types */ ++ ++#define NXA_TYPE_IPV4 0x0001 ++#define NXA_TYPE_IPV6 0x0002 ++ ++#define NXA_TYPE_NONE 0x0000 ++#define NXA_TYPE_ANY 0x00FF ++ ++#define 
NXA_TYPE_ADDR 0x0010 ++#define NXA_TYPE_MASK 0x0020 ++#define NXA_TYPE_RANGE 0x0040 ++ ++#define NXA_MASK_ALL (NXA_TYPE_ADDR | NXA_TYPE_MASK | NXA_TYPE_RANGE) ++ ++#define NXA_MOD_BCAST 0x0100 ++#define NXA_MOD_LBACK 0x0200 ++ ++#define NXA_LOOPBACK 0x1000 ++ ++#define NXA_MASK_BIND (NXA_MASK_ALL | NXA_MOD_BCAST | NXA_MOD_LBACK) ++#define NXA_MASK_SHOW (NXA_MASK_ALL | NXA_LOOPBACK) ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct nx_addr_v4 { ++ struct nx_addr_v4 *next; ++ struct in_addr ip[2]; ++ struct in_addr mask; ++ uint16_t type; ++ uint16_t flags; ++}; ++ ++struct nx_addr_v6 { ++ struct nx_addr_v6 *next; ++ struct in6_addr ip; ++ struct in6_addr mask; ++ uint32_t prefix; ++ uint16_t type; ++ uint16_t flags; ++}; ++ ++struct nx_info { ++ struct hlist_node nx_hlist; /* linked list of nxinfos */ ++ nid_t nx_id; /* vnet id */ ++ atomic_t nx_usecnt; /* usage count */ ++ atomic_t nx_tasks; /* tasks count */ ++ int nx_state; /* context state */ ++ ++ uint64_t nx_flags; /* network flag word */ ++ uint64_t nx_ncaps; /* network capabilities */ ++ ++ struct in_addr v4_lback; /* Loopback address */ ++ struct in_addr v4_bcast; /* Broadcast address */ ++ struct nx_addr_v4 v4; /* First/Single ipv4 address */ ++#ifdef CONFIG_IPV6 ++ struct nx_addr_v6 v6; /* First/Single ipv6 address */ ++#endif ++ char nx_name[65]; /* network context name */ ++}; ++ ++ ++/* status flags */ ++ ++#define NXS_HASHED 0x0001 ++#define NXS_SHUTDOWN 0x0100 ++#define NXS_RELEASED 0x8000 ++ ++extern struct nx_info *lookup_nx_info(int); ++ ++extern int get_nid_list(int, unsigned int *, int); ++extern int nid_is_hashed(nid_t); ++ ++extern int nx_migrate_task(struct task_struct *, struct nx_info *); ++ ++extern long vs_net_change(struct nx_info *, unsigned int); ++ ++struct sock; ++ ++ ++#define NX_IPV4(n) ((n)->v4.type != NXA_TYPE_NONE) ++#ifdef CONFIG_IPV6 ++#define NX_IPV6(n) ((n)->v6.type != NXA_TYPE_NONE) ++#else ++#define NX_IPV6(n) (0) ++#endif ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_NETWORK_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/network_cmd.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/network_cmd.h +--- linux-3.3.8/include/linux/vserver/network_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/network_cmd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,164 @@ ++#ifndef _VX_NETWORK_CMD_H ++#define _VX_NETWORK_CMD_H ++ ++ ++/* vinfo commands */ ++ ++#define VCMD_task_nid VC_CMD(VINFO, 2, 0) ++ ++#ifdef __KERNEL__ ++extern int vc_task_nid(uint32_t); ++ ++#endif /* __KERNEL__ */ ++ ++#define VCMD_nx_info VC_CMD(VINFO, 6, 0) ++ ++struct vcmd_nx_info_v0 { ++ uint32_t nid; ++ /* more to come */ ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_nx_info(struct nx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++#include ++#include ++ ++#define VCMD_net_create_v0 VC_CMD(VNET, 1, 0) ++#define VCMD_net_create VC_CMD(VNET, 1, 1) ++ ++struct vcmd_net_create { ++ uint64_t flagword; ++}; ++ ++#define VCMD_net_migrate VC_CMD(NETMIG, 1, 0) ++ ++#define VCMD_net_add VC_CMD(NETALT, 1, 0) ++#define VCMD_net_remove VC_CMD(NETALT, 2, 0) ++ ++struct vcmd_net_addr_v0 { ++ uint16_t type; ++ uint16_t count; ++ struct in_addr ip[4]; ++ struct in_addr mask[4]; ++}; ++ ++#define VCMD_net_add_ipv4_v1 VC_CMD(NETALT, 1, 1) ++#define VCMD_net_rem_ipv4_v1 VC_CMD(NETALT, 2, 1) ++ ++struct vcmd_net_addr_ipv4_v1 { ++ uint16_t type; ++ uint16_t flags; ++ struct in_addr ip; ++ struct in_addr mask; ++}; ++ ++#define 
VCMD_net_add_ipv4 VC_CMD(NETALT, 1, 2) ++#define VCMD_net_rem_ipv4 VC_CMD(NETALT, 2, 2) ++ ++struct vcmd_net_addr_ipv4_v2 { ++ uint16_t type; ++ uint16_t flags; ++ struct in_addr ip; ++ struct in_addr ip2; ++ struct in_addr mask; ++}; ++ ++#define VCMD_net_add_ipv6 VC_CMD(NETALT, 3, 1) ++#define VCMD_net_remove_ipv6 VC_CMD(NETALT, 4, 1) ++ ++struct vcmd_net_addr_ipv6_v1 { ++ uint16_t type; ++ uint16_t flags; ++ uint32_t prefix; ++ struct in6_addr ip; ++ struct in6_addr mask; ++}; ++ ++#define VCMD_add_match_ipv4 VC_CMD(NETALT, 5, 0) ++#define VCMD_get_match_ipv4 VC_CMD(NETALT, 6, 0) ++ ++struct vcmd_match_ipv4_v0 { ++ uint16_t type; ++ uint16_t flags; ++ uint16_t parent; ++ uint16_t prefix; ++ struct in_addr ip; ++ struct in_addr ip2; ++ struct in_addr mask; ++}; ++ ++#define VCMD_add_match_ipv6 VC_CMD(NETALT, 7, 0) ++#define VCMD_get_match_ipv6 VC_CMD(NETALT, 8, 0) ++ ++struct vcmd_match_ipv6_v0 { ++ uint16_t type; ++ uint16_t flags; ++ uint16_t parent; ++ uint16_t prefix; ++ struct in6_addr ip; ++ struct in6_addr ip2; ++ struct in6_addr mask; ++}; ++ ++ ++#ifdef __KERNEL__ ++extern int vc_net_create(uint32_t, void __user *); ++extern int vc_net_migrate(struct nx_info *, void __user *); ++ ++extern int vc_net_add(struct nx_info *, void __user *); ++extern int vc_net_remove(struct nx_info *, void __user *); ++ ++extern int vc_net_add_ipv4_v1(struct nx_info *, void __user *); ++extern int vc_net_add_ipv4(struct nx_info *, void __user *); ++ ++extern int vc_net_rem_ipv4_v1(struct nx_info *, void __user *); ++extern int vc_net_rem_ipv4(struct nx_info *, void __user *); ++ ++extern int vc_net_add_ipv6(struct nx_info *, void __user *); ++extern int vc_net_remove_ipv6(struct nx_info *, void __user *); ++ ++extern int vc_add_match_ipv4(struct nx_info *, void __user *); ++extern int vc_get_match_ipv4(struct nx_info *, void __user *); ++ ++extern int vc_add_match_ipv6(struct nx_info *, void __user *); ++extern int vc_get_match_ipv6(struct nx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* flag commands */ ++ ++#define VCMD_get_nflags VC_CMD(FLAGS, 5, 0) ++#define VCMD_set_nflags VC_CMD(FLAGS, 6, 0) ++ ++struct vcmd_net_flags_v0 { ++ uint64_t flagword; ++ uint64_t mask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_nflags(struct nx_info *, void __user *); ++extern int vc_set_nflags(struct nx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++ ++/* network caps commands */ ++ ++#define VCMD_get_ncaps VC_CMD(FLAGS, 7, 0) ++#define VCMD_set_ncaps VC_CMD(FLAGS, 8, 0) ++ ++struct vcmd_net_caps_v0 { ++ uint64_t ncaps; ++ uint64_t cmask; ++}; ++ ++#ifdef __KERNEL__ ++extern int vc_get_ncaps(struct nx_info *, void __user *); ++extern int vc_set_ncaps(struct nx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_CONTEXT_CMD_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/percpu.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/percpu.h +--- linux-3.3.8/include/linux/vserver/percpu.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/percpu.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,14 @@ ++#ifndef _VX_PERCPU_H ++#define _VX_PERCPU_H ++ ++#include "cvirt_def.h" ++#include "sched_def.h" ++ ++struct _vx_percpu { ++ struct _vx_cvirt_pc cvirt; ++ struct _vx_sched_pc sched; ++}; ++ ++#define PERCPU_PERCTX (sizeof(struct _vx_percpu)) ++ ++#endif /* _VX_PERCPU_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/pid.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/pid.h +--- linux-3.3.8/include/linux/vserver/pid.h 
1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/pid.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,51 @@ ++#ifndef _VSERVER_PID_H ++#define _VSERVER_PID_H ++ ++/* pid faking stuff */ ++ ++#define vx_info_map_pid(v, p) \ ++ __vx_info_map_pid((v), (p), __func__, __FILE__, __LINE__) ++#define vx_info_map_tgid(v,p) vx_info_map_pid(v,p) ++#define vx_map_pid(p) vx_info_map_pid(current_vx_info(), p) ++#define vx_map_tgid(p) vx_map_pid(p) ++ ++static inline int __vx_info_map_pid(struct vx_info *vxi, int pid, ++ const char *func, const char *file, int line) ++{ ++ if (vx_info_flags(vxi, VXF_INFO_INIT, 0)) { ++ vxfprintk(VXD_CBIT(cvirt, 2), ++ "vx_map_tgid: %p/%llx: %d -> %d", ++ vxi, (long long)vxi->vx_flags, pid, ++ (pid && pid == vxi->vx_initpid) ? 1 : pid, ++ func, file, line); ++ if (pid == 0) ++ return 0; ++ if (pid == vxi->vx_initpid) ++ return 1; ++ } ++ return pid; ++} ++ ++#define vx_info_rmap_pid(v, p) \ ++ __vx_info_rmap_pid((v), (p), __func__, __FILE__, __LINE__) ++#define vx_rmap_pid(p) vx_info_rmap_pid(current_vx_info(), p) ++#define vx_rmap_tgid(p) vx_rmap_pid(p) ++ ++static inline int __vx_info_rmap_pid(struct vx_info *vxi, int pid, ++ const char *func, const char *file, int line) ++{ ++ if (vx_info_flags(vxi, VXF_INFO_INIT, 0)) { ++ vxfprintk(VXD_CBIT(cvirt, 2), ++ "vx_rmap_tgid: %p/%llx: %d -> %d", ++ vxi, (long long)vxi->vx_flags, pid, ++ (pid == 1) ? vxi->vx_initpid : pid, ++ func, file, line); ++ if ((pid == 1) && vxi->vx_initpid) ++ return vxi->vx_initpid; ++ if (pid == vxi->vx_initpid) ++ return ~0U; ++ } ++ return pid; ++} ++ ++#endif +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/sched.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/sched.h +--- linux-3.3.8/include/linux/vserver/sched.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/sched.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,23 @@ ++#ifndef _VX_SCHED_H ++#define _VX_SCHED_H ++ ++ ++#ifdef __KERNEL__ ++ ++struct timespec; ++ ++void vx_vsi_uptime(struct timespec *, struct timespec *); ++ ++ ++struct vx_info; ++ ++void vx_update_load(struct vx_info *); ++ ++ ++void vx_update_sched_param(struct _vx_sched *sched, ++ struct _vx_sched_pc *sched_pc); ++ ++#endif /* __KERNEL__ */ ++#else /* _VX_SCHED_H */ ++#warning duplicate inclusion ++#endif /* _VX_SCHED_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/sched_cmd.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/sched_cmd.h +--- linux-3.3.8/include/linux/vserver/sched_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/sched_cmd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,21 @@ ++#ifndef _VX_SCHED_CMD_H ++#define _VX_SCHED_CMD_H ++ ++ ++struct vcmd_prio_bias { ++ int32_t cpu_id; ++ int32_t prio_bias; ++}; ++ ++#define VCMD_set_prio_bias VC_CMD(SCHED, 4, 0) ++#define VCMD_get_prio_bias VC_CMD(SCHED, 5, 0) ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++extern int vc_set_prio_bias(struct vx_info *, void __user *); ++extern int vc_get_prio_bias(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_SCHED_CMD_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/sched_def.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/sched_def.h +--- linux-3.3.8/include/linux/vserver/sched_def.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/sched_def.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,38 @@ ++#ifndef _VX_SCHED_DEF_H ++#define _VX_SCHED_DEF_H ++ ++#include ++#include 
++#include ++#include ++#include ++ ++ ++/* context sub struct */ ++ ++struct _vx_sched { ++ int prio_bias; /* bias offset for priority */ ++ ++ cpumask_t update; /* CPUs which should update */ ++}; ++ ++struct _vx_sched_pc { ++ int prio_bias; /* bias offset for priority */ ++ ++ uint64_t user_ticks; /* token tick events */ ++ uint64_t sys_ticks; /* token tick events */ ++ uint64_t hold_ticks; /* token ticks paused */ ++}; ++ ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ ++static inline void __dump_vx_sched(struct _vx_sched *sched) ++{ ++ printk("\t_vx_sched:\n"); ++ printk("\t priority = %4d\n", sched->prio_bias); ++} ++ ++#endif ++ ++#endif /* _VX_SCHED_DEF_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/signal.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/signal.h +--- linux-3.3.8/include/linux/vserver/signal.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/signal.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,14 @@ ++#ifndef _VX_SIGNAL_H ++#define _VX_SIGNAL_H ++ ++ ++#ifdef __KERNEL__ ++ ++struct vx_info; ++ ++int vx_info_kill(struct vx_info *, int, int); ++ ++#endif /* __KERNEL__ */ ++#else /* _VX_SIGNAL_H */ ++#warning duplicate inclusion ++#endif /* _VX_SIGNAL_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/signal_cmd.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/signal_cmd.h +--- linux-3.3.8/include/linux/vserver/signal_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/signal_cmd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,43 @@ ++#ifndef _VX_SIGNAL_CMD_H ++#define _VX_SIGNAL_CMD_H ++ ++ ++/* signalling vserver commands */ ++ ++#define VCMD_ctx_kill VC_CMD(PROCTRL, 1, 0) ++#define VCMD_wait_exit VC_CMD(EVENT, 99, 0) ++ ++struct vcmd_ctx_kill_v0 { ++ int32_t pid; ++ int32_t sig; ++}; ++ ++struct vcmd_wait_exit_v0 { ++ int32_t reboot_cmd; ++ int32_t exit_code; ++}; ++ ++#ifdef __KERNEL__ ++ ++extern int vc_ctx_kill(struct vx_info *, void __user *); ++extern int vc_wait_exit(struct vx_info *, void __user *); ++ ++#endif /* __KERNEL__ */ ++ ++/* process alteration commands */ ++ ++#define VCMD_get_pflags VC_CMD(PROCALT, 5, 0) ++#define VCMD_set_pflags VC_CMD(PROCALT, 6, 0) ++ ++struct vcmd_pflags_v0 { ++ uint32_t flagword; ++ uint32_t mask; ++}; ++ ++#ifdef __KERNEL__ ++ ++extern int vc_get_pflags(uint32_t pid, void __user *); ++extern int vc_set_pflags(uint32_t pid, void __user *); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_SIGNAL_CMD_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/space.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/space.h +--- linux-3.3.8/include/linux/vserver/space.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/space.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,12 @@ ++#ifndef _VX_SPACE_H ++#define _VX_SPACE_H ++ ++#include ++ ++struct vx_info; ++ ++int vx_set_space(struct vx_info *vxi, unsigned long mask, unsigned index); ++ ++#else /* _VX_SPACE_H */ ++#warning duplicate inclusion ++#endif /* _VX_SPACE_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/space_cmd.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/space_cmd.h +--- linux-3.3.8/include/linux/vserver/space_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/space_cmd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,38 @@ ++#ifndef _VX_SPACE_CMD_H ++#define _VX_SPACE_CMD_H ++ ++ ++#define VCMD_enter_space_v0 VC_CMD(PROCALT, 1, 0) ++#define VCMD_enter_space_v1 VC_CMD(PROCALT, 1, 1) ++#define 
VCMD_enter_space VC_CMD(PROCALT, 1, 2) ++ ++#define VCMD_set_space_v0 VC_CMD(PROCALT, 3, 0) ++#define VCMD_set_space_v1 VC_CMD(PROCALT, 3, 1) ++#define VCMD_set_space VC_CMD(PROCALT, 3, 2) ++ ++#define VCMD_get_space_mask_v0 VC_CMD(PROCALT, 4, 0) ++ ++#define VCMD_get_space_mask VC_CMD(VSPACE, 0, 1) ++#define VCMD_get_space_default VC_CMD(VSPACE, 1, 0) ++ ++ ++struct vcmd_space_mask_v1 { ++ uint64_t mask; ++}; ++ ++struct vcmd_space_mask_v2 { ++ uint64_t mask; ++ uint32_t index; ++}; ++ ++ ++#ifdef __KERNEL__ ++ ++extern int vc_enter_space_v1(struct vx_info *, void __user *); ++extern int vc_set_space_v1(struct vx_info *, void __user *); ++extern int vc_enter_space(struct vx_info *, void __user *); ++extern int vc_set_space(struct vx_info *, void __user *); ++extern int vc_get_space_mask(void __user *, int); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_SPACE_CMD_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/switch.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/switch.h +--- linux-3.3.8/include/linux/vserver/switch.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/switch.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,98 @@ ++#ifndef _VX_SWITCH_H ++#define _VX_SWITCH_H ++ ++#include ++ ++ ++#define VC_CATEGORY(c) (((c) >> 24) & 0x3F) ++#define VC_COMMAND(c) (((c) >> 16) & 0xFF) ++#define VC_VERSION(c) ((c) & 0xFFF) ++ ++#define VC_CMD(c, i, v) ((((VC_CAT_ ## c) & 0x3F) << 24) \ ++ | (((i) & 0xFF) << 16) | ((v) & 0xFFF)) ++ ++/* ++ ++ Syscall Matrix V2.8 ++ ++ |VERSION|CREATE |MODIFY |MIGRATE|CONTROL|EXPERIM| |SPECIAL|SPECIAL| ++ |STATS |DESTROY|ALTER |CHANGE |LIMIT |TEST | | | | ++ |INFO |SETUP | |MOVE | | | | | | ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ SYSTEM |VERSION|VSETUP |VHOST | | | | |DEVICE | | ++ HOST | 00| 01| 02| 03| 04| 05| | 06| 07| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ CPU | |VPROC |PROCALT|PROCMIG|PROCTRL| | |SCHED. 
| | ++ PROCESS| 08| 09| 10| 11| 12| 13| | 14| 15| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ MEMORY | | | | |MEMCTRL| | |SWAP | | ++ | 16| 17| 18| 19| 20| 21| | 22| 23| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ NETWORK| |VNET |NETALT |NETMIG |NETCTL | | |SERIAL | | ++ | 24| 25| 26| 27| 28| 29| | 30| 31| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ DISK | | | |TAGMIG |DLIMIT | | |INODE | | ++ VFS | 32| 33| 34| 35| 36| 37| | 38| 39| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ OTHER |VSTAT | | | | | | |VINFO | | ++ | 40| 41| 42| 43| 44| 45| | 46| 47| ++ =======+=======+=======+=======+=======+=======+=======+ +=======+=======+ ++ SPECIAL|EVENT | | | |FLAGS | | |VSPACE | | ++ | 48| 49| 50| 51| 52| 53| | 54| 55| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ SPECIAL|DEBUG | | | |RLIMIT |SYSCALL| | |COMPAT | ++ | 56| 57| 58| 59| 60|TEST 61| | 62| 63| ++ -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ ++ ++*/ ++ ++#define VC_CAT_VERSION 0 ++ ++#define VC_CAT_VSETUP 1 ++#define VC_CAT_VHOST 2 ++ ++#define VC_CAT_DEVICE 6 ++ ++#define VC_CAT_VPROC 9 ++#define VC_CAT_PROCALT 10 ++#define VC_CAT_PROCMIG 11 ++#define VC_CAT_PROCTRL 12 ++ ++#define VC_CAT_SCHED 14 ++#define VC_CAT_MEMCTRL 20 ++ ++#define VC_CAT_VNET 25 ++#define VC_CAT_NETALT 26 ++#define VC_CAT_NETMIG 27 ++#define VC_CAT_NETCTRL 28 ++ ++#define VC_CAT_TAGMIG 35 ++#define VC_CAT_DLIMIT 36 ++#define VC_CAT_INODE 38 ++ ++#define VC_CAT_VSTAT 40 ++#define VC_CAT_VINFO 46 ++#define VC_CAT_EVENT 48 ++ ++#define VC_CAT_FLAGS 52 ++#define VC_CAT_VSPACE 54 ++#define VC_CAT_DEBUG 56 ++#define VC_CAT_RLIMIT 60 ++ ++#define VC_CAT_SYSTEST 61 ++#define VC_CAT_COMPAT 63 ++ ++/* query version */ ++ ++#define VCMD_get_version VC_CMD(VERSION, 0, 0) ++#define VCMD_get_vci VC_CMD(VERSION, 1, 0) ++ ++ ++#ifdef __KERNEL__ ++ ++#include ++ ++#endif /* __KERNEL__ */ ++ ++#endif /* _VX_SWITCH_H */ ++ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/tag.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/tag.h +--- linux-3.3.8/include/linux/vserver/tag.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/tag.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,143 @@ ++#ifndef _DX_TAG_H ++#define _DX_TAG_H ++ ++#include ++ ++ ++#define DX_TAG(in) (IS_TAGGED(in)) ++ ++ ++#ifdef CONFIG_TAG_NFSD ++#define DX_TAG_NFSD 1 ++#else ++#define DX_TAG_NFSD 0 ++#endif ++ ++ ++#ifdef CONFIG_TAGGING_NONE ++ ++#define MAX_UID 0xFFFFFFFF ++#define MAX_GID 0xFFFFFFFF ++ ++#define INOTAG_TAG(cond, uid, gid, tag) (0) ++ ++#define TAGINO_UID(cond, uid, tag) (uid) ++#define TAGINO_GID(cond, gid, tag) (gid) ++ ++#endif ++ ++ ++#ifdef CONFIG_TAGGING_GID16 ++ ++#define MAX_UID 0xFFFFFFFF ++#define MAX_GID 0x0000FFFF ++ ++#define INOTAG_TAG(cond, uid, gid, tag) \ ++ ((cond) ? (((gid) >> 16) & 0xFFFF) : 0) ++ ++#define TAGINO_UID(cond, uid, tag) (uid) ++#define TAGINO_GID(cond, gid, tag) \ ++ ((cond) ? (((gid) & 0xFFFF) | ((tag) << 16)) : (gid)) ++ ++#endif ++ ++ ++#ifdef CONFIG_TAGGING_ID24 ++ ++#define MAX_UID 0x00FFFFFF ++#define MAX_GID 0x00FFFFFF ++ ++#define INOTAG_TAG(cond, uid, gid, tag) \ ++ ((cond) ? ((((uid) >> 16) & 0xFF00) | (((gid) >> 24) & 0xFF)) : 0) ++ ++#define TAGINO_UID(cond, uid, tag) \ ++ ((cond) ? (((uid) & 0xFFFFFF) | (((tag) & 0xFF00) << 16)) : (uid)) ++#define TAGINO_GID(cond, gid, tag) \ ++ ((cond) ? 
(((gid) & 0xFFFFFF) | (((tag) & 0x00FF) << 24)) : (gid)) ++ ++#endif ++ ++ ++#ifdef CONFIG_TAGGING_UID16 ++ ++#define MAX_UID 0x0000FFFF ++#define MAX_GID 0xFFFFFFFF ++ ++#define INOTAG_TAG(cond, uid, gid, tag) \ ++ ((cond) ? (((uid) >> 16) & 0xFFFF) : 0) ++ ++#define TAGINO_UID(cond, uid, tag) \ ++ ((cond) ? (((uid) & 0xFFFF) | ((tag) << 16)) : (uid)) ++#define TAGINO_GID(cond, gid, tag) (gid) ++ ++#endif ++ ++ ++#ifdef CONFIG_TAGGING_INTERN ++ ++#define MAX_UID 0xFFFFFFFF ++#define MAX_GID 0xFFFFFFFF ++ ++#define INOTAG_TAG(cond, uid, gid, tag) \ ++ ((cond) ? (tag) : 0) ++ ++#define TAGINO_UID(cond, uid, tag) (uid) ++#define TAGINO_GID(cond, gid, tag) (gid) ++ ++#endif ++ ++ ++#ifndef CONFIG_TAGGING_NONE ++#define dx_current_fstag(sb) \ ++ ((sb)->s_flags & MS_TAGGED ? dx_current_tag() : 0) ++#else ++#define dx_current_fstag(sb) (0) ++#endif ++ ++#ifndef CONFIG_TAGGING_INTERN ++#define TAGINO_TAG(cond, tag) (0) ++#else ++#define TAGINO_TAG(cond, tag) ((cond) ? (tag) : 0) ++#endif ++ ++#define INOTAG_UID(cond, uid, gid) \ ++ ((cond) ? ((uid) & MAX_UID) : (uid)) ++#define INOTAG_GID(cond, uid, gid) \ ++ ((cond) ? ((gid) & MAX_GID) : (gid)) ++ ++ ++static inline uid_t dx_map_uid(uid_t uid) ++{ ++ if ((uid > MAX_UID) && (uid != -1)) ++ uid = -2; ++ return (uid & MAX_UID); ++} ++ ++static inline gid_t dx_map_gid(gid_t gid) ++{ ++ if ((gid > MAX_GID) && (gid != -1)) ++ gid = -2; ++ return (gid & MAX_GID); ++} ++ ++struct peer_tag { ++ int32_t xid; ++ int32_t nid; ++}; ++ ++#define dx_notagcheck(sb) ((sb) && ((sb)->s_flags & MS_NOTAGCHECK)) ++ ++int dx_parse_tag(char *string, tag_t *tag, int remove, int *mnt_flags, ++ unsigned long *flags); ++ ++#ifdef CONFIG_PROPAGATE ++ ++void __dx_propagate_tag(struct nameidata *nd, struct inode *inode); ++ ++#define dx_propagate_tag(n, i) __dx_propagate_tag(n, i) ++ ++#else ++#define dx_propagate_tag(n, i) do { } while (0) ++#endif ++ ++#endif /* _DX_TAG_H */ +diff -NurpP --minimal linux-3.3.8/include/linux/vserver/tag_cmd.h linux-3.3.8-vs2.3.3.4/include/linux/vserver/tag_cmd.h +--- linux-3.3.8/include/linux/vserver/tag_cmd.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/linux/vserver/tag_cmd.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,22 @@ ++#ifndef _VX_TAG_CMD_H ++#define _VX_TAG_CMD_H ++ ++ ++/* vinfo commands */ ++ ++#define VCMD_task_tag VC_CMD(VINFO, 3, 0) ++ ++#ifdef __KERNEL__ ++extern int vc_task_tag(uint32_t); ++ ++#endif /* __KERNEL__ */ ++ ++/* context commands */ ++ ++#define VCMD_tag_migrate VC_CMD(TAGMIG, 1, 0) ++ ++#ifdef __KERNEL__ ++extern int vc_tag_migrate(uint32_t); ++ ++#endif /* __KERNEL__ */ ++#endif /* _VX_TAG_CMD_H */ +diff -NurpP --minimal linux-3.3.8/include/net/addrconf.h linux-3.3.8-vs2.3.3.4/include/net/addrconf.h +--- linux-3.3.8/include/net/addrconf.h 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/net/addrconf.h 2012-02-24 03:55:06.000000000 +0100 +@@ -80,7 +80,8 @@ extern int ipv6_dev_get_saddr(struct n + struct net_device *dev, + const struct in6_addr *daddr, + unsigned int srcprefs, +- struct in6_addr *saddr); ++ struct in6_addr *saddr, ++ struct nx_info *nxi); + extern int ipv6_get_lladdr(struct net_device *dev, + struct in6_addr *addr, + unsigned char banned_flags); +diff -NurpP --minimal linux-3.3.8/include/net/af_unix.h linux-3.3.8-vs2.3.3.4/include/net/af_unix.h +--- linux-3.3.8/include/net/af_unix.h 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/net/af_unix.h 2012-02-24 03:55:06.000000000 +0100 +@@ -4,6 +4,7 @@ + #include + #include + 
#include ++#include + #include + + extern void unix_inflight(struct file *fp); +diff -NurpP --minimal linux-3.3.8/include/net/inet_timewait_sock.h linux-3.3.8-vs2.3.3.4/include/net/inet_timewait_sock.h +--- linux-3.3.8/include/net/inet_timewait_sock.h 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/net/inet_timewait_sock.h 2012-02-24 03:55:06.000000000 +0100 +@@ -112,6 +112,10 @@ struct inet_timewait_sock { + #define tw_net __tw_common.skc_net + #define tw_daddr __tw_common.skc_daddr + #define tw_rcv_saddr __tw_common.skc_rcv_saddr ++#define tw_xid __tw_common.skc_xid ++#define tw_vx_info __tw_common.skc_vx_info ++#define tw_nid __tw_common.skc_nid ++#define tw_nx_info __tw_common.skc_nx_info + int tw_timeout; + volatile unsigned char tw_substate; + unsigned char tw_rcv_wscale; +diff -NurpP --minimal linux-3.3.8/include/net/ip6_route.h linux-3.3.8-vs2.3.3.4/include/net/ip6_route.h +--- linux-3.3.8/include/net/ip6_route.h 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/net/ip6_route.h 2012-02-24 03:55:06.000000000 +0100 +@@ -88,7 +88,8 @@ extern int ip6_route_get_saddr(struct + struct rt6_info *rt, + const struct in6_addr *daddr, + unsigned int prefs, +- struct in6_addr *saddr); ++ struct in6_addr *saddr, ++ struct nx_info *nxi); + + extern struct rt6_info *rt6_lookup(struct net *net, + const struct in6_addr *daddr, +diff -NurpP --minimal linux-3.3.8/include/net/route.h linux-3.3.8-vs2.3.3.4/include/net/route.h +--- linux-3.3.8/include/net/route.h 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/net/route.h 2012-02-24 03:55:06.000000000 +0100 +@@ -202,6 +202,9 @@ static inline void ip_rt_put(struct rtab + dst_release(&rt->dst); + } + ++#include ++#include ++ + #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) + + extern const __u8 ip_tos2prio[16]; +@@ -253,6 +256,9 @@ static inline void ip_route_connect_init + protocol, flow_flags, dst, src, dport, sport); + } + ++extern struct rtable *ip_v4_find_src(struct net *net, struct nx_info *, ++ struct flowi4 *); ++ + static inline struct rtable *ip_route_connect(struct flowi4 *fl4, + __be32 dst, __be32 src, u32 tos, + int oif, u8 protocol, +@@ -261,11 +267,25 @@ static inline struct rtable *ip_route_co + { + struct net *net = sock_net(sk); + struct rtable *rt; ++ struct nx_info *nx_info = current_nx_info(); + + ip_route_connect_init(fl4, dst, src, tos, oif, protocol, + sport, dport, sk, can_sleep); + +- if (!dst || !src) { ++ if (sk) ++ nx_info = sk->sk_nx_info; ++ ++ vxdprintk(VXD_CBIT(net, 4), ++ "ip_route_connect(%p) %p,%p;%lx", ++ sk, nx_info, sk->sk_socket, ++ (sk->sk_socket?sk->sk_socket->flags:0)); ++ ++ rt = ip_v4_find_src(net, nx_info, fl4); ++ if (IS_ERR(rt)) ++ return rt; ++ ip_rt_put(rt); ++ ++ if (!fl4->daddr || !fl4->saddr) { + rt = __ip_route_output_key(net, fl4); + if (IS_ERR(rt)) + return rt; +diff -NurpP --minimal linux-3.3.8/include/net/sock.h linux-3.3.8-vs2.3.3.4/include/net/sock.h +--- linux-3.3.8/include/net/sock.h 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/include/net/sock.h 2012-02-24 03:55:06.000000000 +0100 +@@ -168,6 +168,10 @@ struct sock_common { + #ifdef CONFIG_NET_NS + struct net *skc_net; + #endif ++ xid_t skc_xid; ++ struct vx_info *skc_vx_info; ++ nid_t skc_nid; ++ struct nx_info *skc_nx_info; + /* + * fields between dontcopy_begin/dontcopy_end + * are not copied in sock_copy() +@@ -278,6 +282,10 @@ struct sock { + #define sk_bind_node __sk_common.skc_bind_node + #define sk_prot __sk_common.skc_prot + #define sk_net 
__sk_common.skc_net ++#define sk_xid __sk_common.skc_xid ++#define sk_vx_info __sk_common.skc_vx_info ++#define sk_nid __sk_common.skc_nid ++#define sk_nx_info __sk_common.skc_nx_info + socket_lock_t sk_lock; + struct sk_buff_head sk_receive_queue; + /* +diff -NurpP --minimal linux-3.3.8/init/Kconfig linux-3.3.8-vs2.3.3.4/init/Kconfig +--- linux-3.3.8/init/Kconfig 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/init/Kconfig 2012-02-24 03:55:06.000000000 +0100 +@@ -588,6 +588,7 @@ config HAVE_UNSTABLE_SCHED_CLOCK + menuconfig CGROUPS + boolean "Control Group support" + depends on EVENTFD ++ default y + help + This option adds support for grouping sets of processes together, for + use with process control subsystems such as Cpusets, CFS, memory +@@ -837,6 +838,7 @@ config IPC_NS + config USER_NS + bool "User namespace (EXPERIMENTAL)" + depends on EXPERIMENTAL ++ depends on VSERVER_DISABLED + default y + help + This allows containers, i.e. vservers, to use user namespaces +diff -NurpP --minimal linux-3.3.8/init/main.c linux-3.3.8-vs2.3.3.4/init/main.c +--- linux-3.3.8/init/main.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/init/main.c 2012-06-08 15:27:44.000000000 +0200 +@@ -68,6 +68,7 @@ + #include + #include + #include ++#include + + #include + #include +diff -NurpP --minimal linux-3.3.8/ipc/mqueue.c linux-3.3.8-vs2.3.3.4/ipc/mqueue.c +--- linux-3.3.8/ipc/mqueue.c 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/ipc/mqueue.c 2012-02-24 04:07:13.000000000 +0100 +@@ -34,6 +34,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include "util.h" +@@ -67,6 +69,7 @@ struct mqueue_inode_info { + struct sigevent notify; + struct pid* notify_owner; + struct user_struct *user; /* user who created, for accounting */ ++ struct vx_info *vxi; + struct sock *notify_sock; + struct sk_buff *notify_cookie; + +@@ -129,6 +132,7 @@ static struct inode *mqueue_get_inode(st + if (S_ISREG(mode)) { + struct mqueue_inode_info *info; + unsigned long mq_bytes, mq_msg_tblsz; ++ struct vx_info *vxi = current_vx_info(); + + inode->i_fop = &mqueue_file_operations; + inode->i_size = FILENT_SIZE; +@@ -141,6 +145,7 @@ static struct inode *mqueue_get_inode(st + info->notify_owner = NULL; + info->qsize = 0; + info->user = NULL; /* set when all is ok */ ++ info->vxi = NULL; + memset(&info->attr, 0, sizeof(info->attr)); + info->attr.mq_maxmsg = ipc_ns->mq_msg_max; + info->attr.mq_msgsize = ipc_ns->mq_msgsize_max; +@@ -158,17 +163,20 @@ static struct inode *mqueue_get_inode(st + + spin_lock(&mq_lock); + if (u->mq_bytes + mq_bytes < u->mq_bytes || +- u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) { ++ u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE) || ++ !vx_ipcmsg_avail(vxi, mq_bytes)) { + spin_unlock(&mq_lock); + /* mqueue_evict_inode() releases info->messages */ + ret = -EMFILE; + goto out_inode; + } + u->mq_bytes += mq_bytes; ++ vx_ipcmsg_add(vxi, u, mq_bytes); + spin_unlock(&mq_lock); + + /* all is ok */ + info->user = get_uid(u); ++ info->vxi = get_vx_info(vxi); + } else if (S_ISDIR(mode)) { + inc_nlink(inode); + /* Some things misbehave if size == 0 on a directory */ +@@ -277,8 +285,11 @@ static void mqueue_evict_inode(struct in + + info->attr.mq_msgsize); + user = info->user; + if (user) { ++ struct vx_info *vxi = info->vxi; ++ + spin_lock(&mq_lock); + user->mq_bytes -= mq_bytes; ++ vx_ipcmsg_sub(vxi, user, mq_bytes); + /* + * get_ns_from_inode() ensures that the + * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns +@@ -288,6 +299,7 @@ static void 
mqueue_evict_inode(struct in + if (ipc_ns) + ipc_ns->mq_queues_count--; + spin_unlock(&mq_lock); ++ put_vx_info(vxi); + free_uid(user); + } + if (ipc_ns) +diff -NurpP --minimal linux-3.3.8/ipc/msg.c linux-3.3.8-vs2.3.3.4/ipc/msg.c +--- linux-3.3.8/ipc/msg.c 2011-05-22 16:17:59.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/ipc/msg.c 2012-02-24 03:55:06.000000000 +0100 +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -190,6 +191,7 @@ static int newque(struct ipc_namespace * + + msq->q_perm.mode = msgflg & S_IRWXUGO; + msq->q_perm.key = key; ++ msq->q_perm.xid = vx_current_xid(); + + msq->q_perm.security = NULL; + retval = security_msg_queue_alloc(msq); +diff -NurpP --minimal linux-3.3.8/ipc/namespace.c linux-3.3.8-vs2.3.3.4/ipc/namespace.c +--- linux-3.3.8/ipc/namespace.c 2011-07-22 11:18:12.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/ipc/namespace.c 2012-02-24 03:55:06.000000000 +0100 +@@ -13,11 +13,12 @@ + #include + #include + #include ++#include ++#include + + #include "util.h" + +-static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, +- struct ipc_namespace *old_ns) ++static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns) + { + struct ipc_namespace *ns; + int err; +@@ -46,19 +47,18 @@ static struct ipc_namespace *create_ipc_ + ipcns_notify(IPCNS_CREATED); + register_ipcns_notifier(ns); + +- ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); ++ ns->user_ns = get_user_ns(user_ns); + + return ns; + } + + struct ipc_namespace *copy_ipcs(unsigned long flags, +- struct task_struct *tsk) ++ struct ipc_namespace *old_ns, ++ struct user_namespace *user_ns) + { +- struct ipc_namespace *ns = tsk->nsproxy->ipc_ns; +- + if (!(flags & CLONE_NEWIPC)) +- return get_ipc_ns(ns); +- return create_ipc_ns(tsk, ns); ++ return get_ipc_ns(old_ns); ++ return create_ipc_ns(user_ns); + } + + /* +diff -NurpP --minimal linux-3.3.8/ipc/sem.c linux-3.3.8-vs2.3.3.4/ipc/sem.c +--- linux-3.3.8/ipc/sem.c 2012-01-09 16:14:59.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/ipc/sem.c 2012-02-24 03:55:06.000000000 +0100 +@@ -86,6 +86,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include "util.h" +@@ -306,6 +308,7 @@ static int newary(struct ipc_namespace * + + sma->sem_perm.mode = (semflg & S_IRWXUGO); + sma->sem_perm.key = key; ++ sma->sem_perm.xid = vx_current_xid(); + + sma->sem_perm.security = NULL; + retval = security_sem_alloc(sma); +@@ -321,6 +324,9 @@ static int newary(struct ipc_namespace * + return id; + } + ns->used_sems += nsems; ++ /* FIXME: obsoleted? */ ++ vx_semary_inc(sma); ++ vx_nsems_add(sma, nsems); + + sma->sem_base = (struct sem *) &sma[1]; + +@@ -770,6 +776,9 @@ static void freeary(struct ipc_namespace + + wake_up_sem_queue_do(&tasks); + ns->used_sems -= sma->sem_nsems; ++ /* FIXME: obsoleted? 
*/ ++ vx_nsems_sub(sma, sma->sem_nsems); ++ vx_semary_dec(sma); + security_sem_free(sma); + ipc_rcu_putref(sma); + } +diff -NurpP --minimal linux-3.3.8/ipc/shm.c linux-3.3.8-vs2.3.3.4/ipc/shm.c +--- linux-3.3.8/ipc/shm.c 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/ipc/shm.c 2012-02-24 03:55:06.000000000 +0100 +@@ -39,6 +39,8 @@ + #include + #include + #include ++#include ++#include + + #include + +@@ -187,7 +189,12 @@ static void shm_open(struct vm_area_stru + */ + static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) + { +- ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ struct vx_info *vxi = lookup_vx_info(shp->shm_perm.xid); ++ int numpages = (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ ++ vx_ipcshm_sub(vxi, shp, numpages); ++ ns->shm_tot -= numpages; ++ + shm_rmid(ns, shp); + shm_unlock(shp); + if (!is_file_hugepages(shp->shm_file)) +@@ -197,6 +204,7 @@ static void shm_destroy(struct ipc_names + shp->mlock_user); + fput (shp->shm_file); + security_shm_free(shp); ++ put_vx_info(vxi); + ipc_rcu_putref(shp); + } + +@@ -462,11 +470,15 @@ static int newseg(struct ipc_namespace * + if (ns->shm_tot + numpages > ns->shm_ctlall) + return -ENOSPC; + ++ if (!vx_ipcshm_avail(current_vx_info(), numpages)) ++ return -ENOSPC; ++ + shp = ipc_rcu_alloc(sizeof(*shp)); + if (!shp) + return -ENOMEM; + + shp->shm_perm.key = key; ++ shp->shm_perm.xid = vx_current_xid(); + shp->shm_perm.mode = (shmflg & S_IRWXUGO); + shp->mlock_user = NULL; + +@@ -521,6 +533,7 @@ static int newseg(struct ipc_namespace * + ns->shm_tot += numpages; + error = shp->shm_perm.id; + shm_unlock(shp); ++ vx_ipcshm_add(current_vx_info(), key, numpages); + return error; + + no_id: +diff -NurpP --minimal linux-3.3.8/kernel/Makefile linux-3.3.8-vs2.3.3.4/kernel/Makefile +--- linux-3.3.8/kernel/Makefile 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/Makefile 2012-02-24 03:55:06.000000000 +0100 +@@ -25,6 +25,7 @@ endif + obj-y += sched/ + obj-y += power/ + ++obj-y += vserver/ + obj-$(CONFIG_FREEZER) += freezer.o + obj-$(CONFIG_PROFILING) += profile.o + obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o +diff -NurpP --minimal linux-3.3.8/kernel/auditsc.c linux-3.3.8-vs2.3.3.4/kernel/auditsc.c +--- linux-3.3.8/kernel/auditsc.c 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/auditsc.c 2012-02-24 16:47:49.000000000 +0100 +@@ -2308,7 +2308,7 @@ int audit_set_loginuid(uid_t loginuid) + if (task->loginuid != -1) + return -EPERM; + #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ +- if (!capable(CAP_AUDIT_CONTROL)) ++ if (!vx_capable(CAP_AUDIT_CONTROL, VXC_AUDIT_CONTROL)) + return -EPERM; + #endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + +diff -NurpP --minimal linux-3.3.8/kernel/capability.c linux-3.3.8-vs2.3.3.4/kernel/capability.c +--- linux-3.3.8/kernel/capability.c 2012-03-19 19:47:29.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/capability.c 2012-02-24 03:55:06.000000000 +0100 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -116,6 +117,7 @@ static int cap_validate_magic(cap_user_h + return 0; + } + ++ + /* + * The only thing that can change the capabilities of the current + * process is the current process. 
As such, we can't be in this code +@@ -349,6 +351,8 @@ bool has_ns_capability_noaudit(struct ta + return (ret == 0); + } + ++#include ++ + /** + * has_capability_noaudit - Does a task have a capability (unaudited) in the + * initial user ns +diff -NurpP --minimal linux-3.3.8/kernel/compat.c linux-3.3.8-vs2.3.3.4/kernel/compat.c +--- linux-3.3.8/kernel/compat.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/kernel/compat.c 2012-05-22 09:49:13.000000000 +0200 +@@ -1002,7 +1002,7 @@ asmlinkage long compat_sys_stime(compat_ + if (err) + return err; + +- do_settimeofday(&tv); ++ vx_settimeofday(&tv); + return 0; + } + +diff -NurpP --minimal linux-3.3.8/kernel/cred.c linux-3.3.8-vs2.3.3.4/kernel/cred.c +--- linux-3.3.8/kernel/cred.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/kernel/cred.c 2012-04-16 13:32:11.000000000 +0200 +@@ -61,31 +61,6 @@ struct cred init_cred = { + #endif + }; + +-static inline void set_cred_subscribers(struct cred *cred, int n) +-{ +-#ifdef CONFIG_DEBUG_CREDENTIALS +- atomic_set(&cred->subscribers, n); +-#endif +-} +- +-static inline int read_cred_subscribers(const struct cred *cred) +-{ +-#ifdef CONFIG_DEBUG_CREDENTIALS +- return atomic_read(&cred->subscribers); +-#else +- return 0; +-#endif +-} +- +-static inline void alter_cred_subscribers(const struct cred *_cred, int n) +-{ +-#ifdef CONFIG_DEBUG_CREDENTIALS +- struct cred *cred = (struct cred *) _cred; +- +- atomic_add(n, &cred->subscribers); +-#endif +-} +- + /* + * Dispose of the shared task group credentials + */ +@@ -281,21 +256,16 @@ error: + * + * Call commit_creds() or abort_creds() to clean up. + */ +-struct cred *prepare_creds(void) ++struct cred *__prepare_creds(const struct cred *old) + { +- struct task_struct *task = current; +- const struct cred *old; + struct cred *new; + +- validate_process_creds(); +- + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + + kdebug("prepare_creds() alloc %p", new); + +- old = task->cred; + memcpy(new, old, sizeof(struct cred)); + + atomic_set(&new->usage, 1); +@@ -322,6 +292,13 @@ error: + abort_creds(new); + return NULL; + } ++ ++struct cred *prepare_creds(void) ++{ ++ validate_process_creds(); ++ ++ return __prepare_creds(current->cred); ++} + EXPORT_SYMBOL(prepare_creds); + + /* +diff -NurpP --minimal linux-3.3.8/kernel/exit.c linux-3.3.8-vs2.3.3.4/kernel/exit.c +--- linux-3.3.8/kernel/exit.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/kernel/exit.c 2012-05-09 04:08:08.000000000 +0200 +@@ -48,6 +48,10 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + #include + #include + #include +@@ -481,9 +485,11 @@ static void close_files(struct files_str + filp_close(file, files); + cond_resched(); + } ++ vx_openfd_dec(i); + } + i++; + set >>= 1; ++ cond_resched(); + } + } + } +@@ -1035,10 +1041,15 @@ void do_exit(long code) + smp_mb(); + raw_spin_unlock_wait(&tsk->pi_lock); + ++ /* needs to stay after exit_notify() */ ++ exit_vx_info(tsk, code); ++ exit_nx_info(tsk); ++ + /* causes final put_task_struct in finish_task_switch(). */ + tsk->state = TASK_DEAD; + tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ + schedule(); ++ printk("bad task: %p [%lx]\n", current, current->state); + BUG(); + /* Avoid "noreturn function does return". 
*/ + for (;;) +diff -NurpP --minimal linux-3.3.8/kernel/fork.c linux-3.3.8-vs2.3.3.4/kernel/fork.c +--- linux-3.3.8/kernel/fork.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/kernel/fork.c 2012-05-22 09:49:13.000000000 +0200 +@@ -68,6 +68,9 @@ + #include + #include + #include ++#include ++#include ++#include + + #include + #include +@@ -170,6 +173,8 @@ void free_task(struct task_struct *tsk) + account_kernel_stack(tsk->stack, -1); + free_thread_info(tsk->stack); + rt_mutex_debug_task_free(tsk); ++ clr_vx_info(&tsk->vx_info); ++ clr_nx_info(&tsk->nx_info); + ftrace_graph_exit_task(tsk); + free_task_struct(tsk); + } +@@ -505,6 +510,7 @@ static struct mm_struct *mm_init(struct + if (likely(!mm_alloc_pgd(mm))) { + mm->def_flags = 0; + mmu_notifier_mm_init(mm); ++ set_vx_info(&mm->mm_vx_info, p->vx_info); + return mm; + } + +@@ -542,6 +548,7 @@ void __mmdrop(struct mm_struct *mm) + #ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); + #endif ++ clr_vx_info(&mm->mm_vx_info); + free_mm(mm); + } + EXPORT_SYMBOL_GPL(__mmdrop); +@@ -777,6 +784,7 @@ struct mm_struct *dup_mm(struct task_str + goto fail_nomem; + + memcpy(mm, oldmm, sizeof(*mm)); ++ mm->mm_vx_info = NULL; + mm_init_cpumask(mm); + + /* Initializing for Swap token stuff */ +@@ -820,6 +828,7 @@ fail_nocontext: + * If init_new_context() failed, we cannot use mmput() to free the mm + * because it calls destroy_context() + */ ++ clr_vx_info(&mm->mm_vx_info); + mm_free_pgd(mm); + free_mm(mm); + return NULL; +@@ -1105,6 +1114,8 @@ static struct task_struct *copy_process( + int retval; + struct task_struct *p; + int cgroup_callbacks_done = 0; ++ struct vx_info *vxi; ++ struct nx_info *nxi; + + if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) + return ERR_PTR(-EINVAL); +@@ -1151,7 +1162,12 @@ static struct task_struct *copy_process( + DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); + DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); + #endif ++ init_vx_info(&p->vx_info, current_vx_info()); ++ init_nx_info(&p->nx_info, current_nx_info()); ++ + retval = -EAGAIN; ++ if (!vx_nproc_avail(1)) ++ goto bad_fork_free; + if (atomic_read(&p->real_cred->user->processes) >= + task_rlimit(p, RLIMIT_NPROC)) { + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && +@@ -1421,6 +1437,18 @@ static struct task_struct *copy_process( + + total_forks++; + spin_unlock(¤t->sighand->siglock); ++ ++ /* p is copy of current */ ++ vxi = p->vx_info; ++ if (vxi) { ++ claim_vx_info(vxi, p); ++ atomic_inc(&vxi->cvirt.nr_threads); ++ atomic_inc(&vxi->cvirt.total_forks); ++ vx_nproc_inc(p); ++ } ++ nxi = p->nx_info; ++ if (nxi) ++ claim_nx_info(nxi, p); + write_unlock_irq(&tasklist_lock); + proc_fork_connector(p); + cgroup_post_fork(p); +diff -NurpP --minimal linux-3.3.8/kernel/kthread.c linux-3.3.8-vs2.3.3.4/kernel/kthread.c +--- linux-3.3.8/kernel/kthread.c 2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/kthread.c 2012-02-24 03:55:06.000000000 +0100 +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + + static DEFINE_SPINLOCK(kthread_create_lock); +diff -NurpP --minimal linux-3.3.8/kernel/nsproxy.c linux-3.3.8-vs2.3.3.4/kernel/nsproxy.c +--- linux-3.3.8/kernel/nsproxy.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/nsproxy.c 2012-02-24 16:59:37.000000000 +0100 +@@ -20,11 +20,14 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include + #include + #include ++#include "../fs/mount.h" + + static struct kmem_cache *nsproxy_cachep; + 
+@@ -46,8 +49,11 @@ static inline struct nsproxy *create_nsp + struct nsproxy *nsproxy; + + nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); +- if (nsproxy) ++ if (nsproxy) { + atomic_set(&nsproxy->count, 1); ++ atomic_inc(&vs_global_nsproxy); ++ } ++ vxdprintk(VXD_CBIT(space, 2), "create_nsproxy = %p[1]", nsproxy); + return nsproxy; + } + +@@ -56,8 +62,11 @@ static inline struct nsproxy *create_nsp + * Return the newly created nsproxy. Do not attach this to the task, + * leave it to the caller to do proper locking and attach it to task. + */ +-static struct nsproxy *create_new_namespaces(unsigned long flags, +- struct task_struct *tsk, struct fs_struct *new_fs) ++static struct nsproxy *unshare_namespaces(unsigned long flags, ++ struct nsproxy *orig, ++ struct fs_struct *new_fs, ++ struct user_namespace *new_user, ++ struct pid_namespace *new_pid) + { + struct nsproxy *new_nsp; + int err; +@@ -66,31 +75,31 @@ static struct nsproxy *create_new_namesp + if (!new_nsp) + return ERR_PTR(-ENOMEM); + +- new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); ++ new_nsp->mnt_ns = copy_mnt_ns(flags, orig->mnt_ns, new_fs); + if (IS_ERR(new_nsp->mnt_ns)) { + err = PTR_ERR(new_nsp->mnt_ns); + goto out_ns; + } + +- new_nsp->uts_ns = copy_utsname(flags, tsk); ++ new_nsp->uts_ns = copy_utsname(flags, orig->uts_ns, new_user); + if (IS_ERR(new_nsp->uts_ns)) { + err = PTR_ERR(new_nsp->uts_ns); + goto out_uts; + } + +- new_nsp->ipc_ns = copy_ipcs(flags, tsk); ++ new_nsp->ipc_ns = copy_ipcs(flags, orig->ipc_ns, new_user); + if (IS_ERR(new_nsp->ipc_ns)) { + err = PTR_ERR(new_nsp->ipc_ns); + goto out_ipc; + } + +- new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); ++ new_nsp->pid_ns = copy_pid_ns(flags, new_pid); + if (IS_ERR(new_nsp->pid_ns)) { + err = PTR_ERR(new_nsp->pid_ns); + goto out_pid; + } + +- new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); ++ new_nsp->net_ns = copy_net_ns(flags, orig->net_ns); + if (IS_ERR(new_nsp->net_ns)) { + err = PTR_ERR(new_nsp->net_ns); + goto out_net; +@@ -115,6 +124,40 @@ out_ns: + return ERR_PTR(err); + } + ++static struct nsproxy *create_new_namespaces(unsigned long flags, ++ struct task_struct *tsk, struct fs_struct *new_fs) ++{ ++ return unshare_namespaces(flags, tsk->nsproxy, ++ new_fs, task_cred_xxx(tsk, user)->user_ns, ++ task_active_pid_ns(tsk)); ++} ++ ++/* ++ * copies the nsproxy, setting refcount to 1, and grabbing a ++ * reference to all contained namespaces. ++ */ ++struct nsproxy *copy_nsproxy(struct nsproxy *orig) ++{ ++ struct nsproxy *ns = create_nsproxy(); ++ ++ if (ns) { ++ memcpy(ns, orig, sizeof(struct nsproxy)); ++ atomic_set(&ns->count, 1); ++ ++ if (ns->mnt_ns) ++ get_mnt_ns(ns->mnt_ns); ++ if (ns->uts_ns) ++ get_uts_ns(ns->uts_ns); ++ if (ns->ipc_ns) ++ get_ipc_ns(ns->ipc_ns); ++ if (ns->pid_ns) ++ get_pid_ns(ns->pid_ns); ++ if (ns->net_ns) ++ get_net(ns->net_ns); ++ } ++ return ns; ++} ++ + /* + * called from clone. This now handles copy for nsproxy and all + * namespaces therein. 
+@@ -122,9 +165,12 @@ out_ns: + int copy_namespaces(unsigned long flags, struct task_struct *tsk) + { + struct nsproxy *old_ns = tsk->nsproxy; +- struct nsproxy *new_ns; ++ struct nsproxy *new_ns = NULL; + int err = 0; + ++ vxdprintk(VXD_CBIT(space, 7), "copy_namespaces(0x%08lx,%p[%p])", ++ flags, tsk, old_ns); ++ + if (!old_ns) + return 0; + +@@ -134,7 +180,7 @@ int copy_namespaces(unsigned long flags, + CLONE_NEWPID | CLONE_NEWNET))) + return 0; + +- if (!capable(CAP_SYS_ADMIN)) { ++ if (!vx_can_unshare(CAP_SYS_ADMIN, flags)) { + err = -EPERM; + goto out; + } +@@ -161,6 +207,9 @@ int copy_namespaces(unsigned long flags, + + out: + put_nsproxy(old_ns); ++ vxdprintk(VXD_CBIT(space, 3), ++ "copy_namespaces(0x%08lx,%p[%p]) = %d [%p]", ++ flags, tsk, old_ns, err, new_ns); + return err; + } + +@@ -174,7 +223,9 @@ void free_nsproxy(struct nsproxy *ns) + put_ipc_ns(ns->ipc_ns); + if (ns->pid_ns) + put_pid_ns(ns->pid_ns); +- put_net(ns->net_ns); ++ if (ns->net_ns) ++ put_net(ns->net_ns); ++ atomic_dec(&vs_global_nsproxy); + kmem_cache_free(nsproxy_cachep, ns); + } + +@@ -187,11 +238,15 @@ int unshare_nsproxy_namespaces(unsigned + { + int err = 0; + ++ vxdprintk(VXD_CBIT(space, 4), ++ "unshare_nsproxy_namespaces(0x%08lx,[%p])", ++ unshare_flags, current->nsproxy); ++ + if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWNET))) + return 0; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!vx_can_unshare(CAP_SYS_ADMIN, unshare_flags)) + return -EPERM; + + *new_nsp = create_new_namespaces(unshare_flags, current, +diff -NurpP --minimal linux-3.3.8/kernel/pid.c linux-3.3.8-vs2.3.3.4/kernel/pid.c +--- linux-3.3.8/kernel/pid.c 2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/pid.c 2012-03-19 20:52:10.000000000 +0100 +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + + #define pid_hashfn(nr, ns) \ + hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) +@@ -344,7 +345,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); + + struct pid *find_vpid(int nr) + { +- return find_pid_ns(nr, current->nsproxy->pid_ns); ++ return find_pid_ns(vx_rmap_pid(nr), current->nsproxy->pid_ns); + } + EXPORT_SYMBOL_GPL(find_vpid); + +@@ -404,6 +405,9 @@ void transfer_pid(struct task_struct *ol + struct task_struct *pid_task(struct pid *pid, enum pid_type type) + { + struct task_struct *result = NULL; ++ ++ if (type == PIDTYPE_REALPID) ++ type = PIDTYPE_PID; + if (pid) { + struct hlist_node *first; + first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), +@@ -423,7 +427,7 @@ struct task_struct *find_task_by_pid_ns( + rcu_lockdep_assert(rcu_read_lock_held(), + "find_task_by_pid_ns() needs rcu_read_lock()" + " protection"); +- return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); ++ return pid_task(find_pid_ns(vx_rmap_pid(nr), ns), PIDTYPE_PID); + } + + struct task_struct *find_task_by_vpid(pid_t vnr) +@@ -467,7 +471,7 @@ struct pid *find_get_pid(pid_t nr) + } + EXPORT_SYMBOL_GPL(find_get_pid); + +-pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) ++pid_t pid_unmapped_nr_ns(struct pid *pid, struct pid_namespace *ns) + { + struct upid *upid; + pid_t nr = 0; +@@ -480,6 +484,11 @@ pid_t pid_nr_ns(struct pid *pid, struct + return nr; + } + ++pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) ++{ ++ return vx_map_pid(pid_unmapped_nr_ns(pid, ns)); ++} ++ + pid_t pid_vnr(struct pid *pid) + { + return pid_nr_ns(pid, current->nsproxy->pid_ns); +diff -NurpP --minimal linux-3.3.8/kernel/pid_namespace.c linux-3.3.8-vs2.3.3.4/kernel/pid_namespace.c +--- 
linux-3.3.8/kernel/pid_namespace.c 2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/pid_namespace.c 2012-02-24 03:55:06.000000000 +0100 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + + #define BITS_PER_PAGE (PAGE_SIZE*8) + +@@ -88,6 +89,7 @@ static struct pid_namespace *create_pid_ + goto out_free_map; + + kref_init(&ns->kref); ++ atomic_inc(&vs_global_pid_ns); + ns->level = level; + ns->parent = get_pid_ns(parent_pid_ns); + +@@ -119,6 +121,7 @@ static void destroy_pid_namespace(struct + + for (i = 0; i < PIDMAP_ENTRIES; i++) + kfree(ns->pidmap[i].page); ++ atomic_dec(&vs_global_pid_ns); + kmem_cache_free(pid_ns_cachep, ns); + } + +diff -NurpP --minimal linux-3.3.8/kernel/posix-timers.c linux-3.3.8-vs2.3.3.4/kernel/posix-timers.c +--- linux-3.3.8/kernel/posix-timers.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/posix-timers.c 2012-02-24 03:55:06.000000000 +0100 +@@ -47,6 +47,7 @@ + #include + #include + #include ++#include + + /* + * Management arrays for POSIX timers. Timers are kept in slab memory +@@ -340,6 +341,7 @@ int posix_timer_event(struct k_itimer *t + { + struct task_struct *task; + int shared, ret = -1; ++ + /* + * FIXME: if ->sigq is queued we can race with + * dequeue_signal()->do_schedule_next_timer(). +@@ -356,10 +358,18 @@ int posix_timer_event(struct k_itimer *t + rcu_read_lock(); + task = pid_task(timr->it_pid, PIDTYPE_PID); + if (task) { ++ struct vx_info_save vxis; ++ struct vx_info *vxi; ++ ++ vxi = get_vx_info(task->vx_info); ++ enter_vx_info(vxi, &vxis); + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); + ret = send_sigqueue(timr->sigq, task, shared); ++ leave_vx_info(&vxis); ++ put_vx_info(vxi); + } + rcu_read_unlock(); ++ + /* If we failed to send the signal the timer stops. 
*/ + return ret > 0; + } +diff -NurpP --minimal linux-3.3.8/kernel/printk.c linux-3.3.8-vs2.3.3.4/kernel/printk.c +--- linux-3.3.8/kernel/printk.c 2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/printk.c 2012-03-19 20:52:10.000000000 +0100 +@@ -41,6 +41,7 @@ + #include + #include + #include ++#include + + #include + +@@ -314,7 +315,7 @@ static int check_syslog_permissions(int + return 0; + + if (syslog_action_restricted(type)) { +- if (capable(CAP_SYSLOG)) ++ if (vx_capable(CAP_SYSLOG, VXC_SYSLOG)) + return 0; + /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ + if (capable(CAP_SYS_ADMIN)) { +@@ -344,12 +345,9 @@ int do_syslog(int type, char __user *buf + if (error) + return error; + +- switch (type) { +- case SYSLOG_ACTION_CLOSE: /* Close log */ +- break; +- case SYSLOG_ACTION_OPEN: /* Open log */ +- break; +- case SYSLOG_ACTION_READ: /* Read from log */ ++ if ((type == SYSLOG_ACTION_READ) || ++ (type == SYSLOG_ACTION_READ_ALL) || ++ (type == SYSLOG_ACTION_READ_CLEAR)) { + error = -EINVAL; + if (!buf || len < 0) + goto out; +@@ -360,6 +358,16 @@ int do_syslog(int type, char __user *buf + error = -EFAULT; + goto out; + } ++ } ++ if (!vx_check(0, VS_ADMIN|VS_WATCH)) ++ return vx_do_syslog(type, buf, len); ++ ++ switch (type) { ++ case SYSLOG_ACTION_CLOSE: /* Close log */ ++ break; ++ case SYSLOG_ACTION_OPEN: /* Open log */ ++ break; ++ case SYSLOG_ACTION_READ: /* Read from log */ + error = wait_event_interruptible(log_wait, + (log_start - log_end)); + if (error) +@@ -386,16 +394,6 @@ int do_syslog(int type, char __user *buf + /* FALL THRU */ + /* Read last kernel messages */ + case SYSLOG_ACTION_READ_ALL: +- error = -EINVAL; +- if (!buf || len < 0) +- goto out; +- error = 0; +- if (!len) +- goto out; +- if (!access_ok(VERIFY_WRITE, buf, len)) { +- error = -EFAULT; +- goto out; +- } + count = len; + if (count > log_buf_len) + count = log_buf_len; +diff -NurpP --minimal linux-3.3.8/kernel/ptrace.c linux-3.3.8-vs2.3.3.4/kernel/ptrace.c +--- linux-3.3.8/kernel/ptrace.c 2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/ptrace.c 2012-02-24 03:55:06.000000000 +0100 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -217,6 +218,11 @@ ok: + dumpable = get_dumpable(task->mm); + if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) + return -EPERM; ++ if (!vx_check(task->xid, VS_ADMIN_P|VS_WATCH_P|VS_IDENT)) ++ return -EPERM; ++ if (!vx_check(task->xid, VS_IDENT) && ++ !task_vx_flags(task, VXF_STATE_ADMIN, 0)) ++ return -EACCES; + + return security_ptrace_access_check(task, mode); + } +diff -NurpP --minimal linux-3.3.8/kernel/sched/core.c linux-3.3.8-vs2.3.3.4/kernel/sched/core.c +--- linux-3.3.8/kernel/sched/core.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/kernel/sched/core.c 2012-05-09 04:08:08.000000000 +0200 +@@ -71,6 +71,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -2326,9 +2328,17 @@ static void calc_global_nohz(void) + */ + void get_avenrun(unsigned long *loads, unsigned long offset, int shift) + { +- loads[0] = (avenrun[0] + offset) << shift; +- loads[1] = (avenrun[1] + offset) << shift; +- loads[2] = (avenrun[2] + offset) << shift; ++ if (vx_flags(VXF_VIRT_LOAD, 0)) { ++ struct vx_info *vxi = current_vx_info(); ++ ++ loads[0] = (vxi->cvirt.load[0] + offset) << shift; ++ loads[1] = (vxi->cvirt.load[1] + offset) << shift; ++ loads[2] = (vxi->cvirt.load[2] + offset) << shift; ++ } else { ++ loads[0] = (avenrun[0] + offset) 
<< shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++ } + } + + /* +@@ -2632,14 +2642,17 @@ static inline void task_group_account_fi + void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) + { ++ struct vx_info *vxi = p->vx_info; /* p is _always_ current */ ++ int nice = (TASK_NICE(p) > 0); + int index; + + /* Add user time to process. */ + p->utime += cputime; + p->utimescaled += cputime_scaled; ++ vx_account_user(vxi, cputime, nice); + account_group_user_time(p, cputime); + +- index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++ index = (nice) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. */ + task_group_account_field(p, index, (__force u64) cputime); +@@ -2686,9 +2699,12 @@ static inline + void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, int index) + { ++ struct vx_info *vxi = p->vx_info; /* p is _always_ current */ ++ + /* Add system time to process. */ + p->stime += cputime; + p->stimescaled += cputime_scaled; ++ vx_account_system(vxi, cputime, 0 /* do we have idle time? */); + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ +@@ -3885,7 +3901,7 @@ SYSCALL_DEFINE1(nice, int, increment) + nice = 19; + + if (increment < 0 && !can_nice(current, nice)) +- return -EPERM; ++ return vx_flags(VXF_IGNEG_NICE, 0) ? 0 : -EPERM; + + retval = security_task_setnice(current, nice); + if (retval) +diff -NurpP --minimal linux-3.3.8/kernel/sched/fair.c linux-3.3.8-vs2.3.3.4/kernel/sched/fair.c +--- linux-3.3.8/kernel/sched/fair.c 2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/sched/fair.c 2012-03-19 20:52:10.000000000 +0100 +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + + #include + +@@ -1126,6 +1127,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st + __enqueue_entity(cfs_rq, se); + se->on_rq = 1; + ++ if (entity_is_task(se)) ++ vx_activate_task(task_of(se)); + if (cfs_rq->nr_running == 1) { + list_add_leaf_cfs_rq(cfs_rq); + check_enqueue_throttle(cfs_rq); +@@ -1206,6 +1209,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, st + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->on_rq = 0; ++ if (entity_is_task(se)) ++ vx_deactivate_task(task_of(se)); + update_cfs_load(cfs_rq, 0); + account_entity_dequeue(cfs_rq, se); + +diff -NurpP --minimal linux-3.3.8/kernel/signal.c linux-3.3.8-vs2.3.3.4/kernel/signal.c +--- linux-3.3.8/kernel/signal.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/kernel/signal.c 2012-05-09 04:08:08.000000000 +0200 +@@ -29,6 +29,8 @@ + #include + #include + #include ++#include ++#include + #define CREATE_TRACE_POINTS + #include + +@@ -790,9 +792,18 @@ static int check_kill_permission(int sig + struct pid *sid; + int error; + ++ vxdprintk(VXD_CBIT(misc, 7), ++ "check_kill_permission(%d,%p,%p[#%u,%u])", ++ sig, info, t, vx_task_xid(t), t->pid); ++ + if (!valid_signal(sig)) + return -EINVAL; + ++/* FIXME: needed? if so, why? ++ if ((info != SEND_SIG_NOINFO) && ++ (is_si_special(info) || !si_fromuser(info))) ++ goto skip; */ ++ + if (!si_fromuser(info)) + return 0; + +@@ -816,6 +827,20 @@ static int check_kill_permission(int sig + } + } + ++ error = -EPERM; ++ if (t->pid == 1 && current->xid) ++ return error; ++ ++ error = -ESRCH; ++ /* FIXME: we shouldn't return ESRCH ever, to avoid ++ loops, maybe ENOENT or EACCES? 
*/ ++ if (!vx_check(vx_task_xid(t), VS_WATCH_P | VS_IDENT)) { ++ vxdprintk(current->xid || VXD_CBIT(misc, 7), ++ "signal %d[%p] xid mismatch %p[#%u,%u] xid=#%u", ++ sig, info, t, vx_task_xid(t), t->pid, current->xid); ++ return error; ++ } ++/* skip: */ + return security_task_kill(t, info, sig, 0); + } + +@@ -1351,7 +1376,7 @@ int kill_pid_info(int sig, struct siginf + rcu_read_lock(); + retry: + p = pid_task(pid, PIDTYPE_PID); +- if (p) { ++ if (p && vx_check(vx_task_xid(p), VS_IDENT)) { + error = group_send_sig_info(sig, info, p); + if (unlikely(error == -ESRCH)) + /* +@@ -1401,7 +1426,7 @@ int kill_pid_info_as_cred(int sig, struc + + rcu_read_lock(); + p = pid_task(pid, PIDTYPE_PID); +- if (!p) { ++ if (!p || !vx_check(vx_task_xid(p), VS_IDENT)) { + ret = -ESRCH; + goto out_unlock; + } +@@ -1453,8 +1478,10 @@ static int kill_something_info(int sig, + struct task_struct * p; + + for_each_process(p) { +- if (task_pid_vnr(p) > 1 && +- !same_thread_group(p, current)) { ++ if (vx_check(vx_task_xid(p), VS_ADMIN|VS_IDENT) && ++ task_pid_vnr(p) > 1 && ++ !same_thread_group(p, current) && ++ !vx_current_initpid(p->pid)) { + int err = group_send_sig_info(sig, info, p); + ++count; + if (err != -EPERM) +@@ -2299,6 +2326,11 @@ relock: + !sig_kernel_only(signr)) + continue; + ++ /* virtual init is protected against user signals */ ++ if ((info->si_code == SI_USER) && ++ vx_current_initpid(current->pid)) ++ continue; ++ + if (sig_kernel_stop(signr)) { + /* + * The default action is to stop all threads in +diff -NurpP --minimal linux-3.3.8/kernel/softirq.c linux-3.3.8-vs2.3.3.4/kernel/softirq.c +--- linux-3.3.8/kernel/softirq.c 2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/softirq.c 2012-02-24 03:55:06.000000000 +0100 +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + + #define CREATE_TRACE_POINTS + #include +diff -NurpP --minimal linux-3.3.8/kernel/sys.c linux-3.3.8-vs2.3.3.4/kernel/sys.c +--- linux-3.3.8/kernel/sys.c 2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/sys.c 2012-03-19 20:52:10.000000000 +0100 +@@ -45,6 +45,7 @@ + #include + #include + #include ++#include + + #include + /* Move somewhere else to avoid recompiling? 
*/ +@@ -155,7 +156,10 @@ static int set_one_prio(struct task_stru + goto out; + } + if (niceval < task_nice(p) && !can_nice(p, niceval)) { +- error = -EACCES; ++ if (vx_flags(VXF_IGNEG_NICE, 0)) ++ error = 0; ++ else ++ error = -EACCES; + goto out; + } + no_nice = security_task_setnice(p, niceval); +@@ -205,6 +209,8 @@ SYSCALL_DEFINE3(setpriority, int, which, + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ++ if (!vx_check(p->xid, VS_ADMIN_P | VS_IDENT)) ++ continue; + error = set_one_prio(p, niceval, error); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; +@@ -268,6 +274,8 @@ SYSCALL_DEFINE2(getpriority, int, which, + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ++ if (!vx_check(p->xid, VS_ADMIN_P | VS_IDENT)) ++ continue; + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; +@@ -418,6 +426,8 @@ EXPORT_SYMBOL_GPL(kernel_power_off); + + static DEFINE_MUTEX(reboot_mutex); + ++long vs_reboot(unsigned int, void __user *); ++ + /* + * Reboot system call: for obvious reasons only root may call it, + * and even root needs to set up some magic numbers in the registers +@@ -450,6 +460,9 @@ SYSCALL_DEFINE4(reboot, int, magic1, int + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + ++ if (!vx_check(0, VS_ADMIN|VS_WATCH)) ++ return vs_reboot(cmd, arg); ++ + mutex_lock(&reboot_mutex); + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: +@@ -1273,7 +1286,8 @@ SYSCALL_DEFINE2(sethostname, char __user + int errno; + char tmp[__NEW_UTS_LEN]; + +- if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) ++ if (!vx_ns_capable(current->nsproxy->uts_ns->user_ns, ++ CAP_SYS_ADMIN, VXC_SET_UTSNAME)) + return -EPERM; + + if (len < 0 || len > __NEW_UTS_LEN) +@@ -1324,7 +1338,8 @@ SYSCALL_DEFINE2(setdomainname, char __us + int errno; + char tmp[__NEW_UTS_LEN]; + +- if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) ++ if (!vx_ns_capable(current->nsproxy->uts_ns->user_ns, ++ CAP_SYS_ADMIN, VXC_SET_UTSNAME)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; +@@ -1443,7 +1458,7 @@ int do_prlimit(struct task_struct *tsk, + /* Keep the capable check against init_user_ns until + cgroups can contain all limits */ + if (new_rlim->rlim_max > rlim->rlim_max && +- !capable(CAP_SYS_RESOURCE)) ++ !vx_capable(CAP_SYS_RESOURCE, VXC_SET_RLIMIT)) + retval = -EPERM; + if (!retval) + retval = security_task_setrlimit(tsk->group_leader, +@@ -1497,7 +1512,8 @@ static int check_prlimit_permission(stru + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + return 0; +- if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) ++ if (vx_ns_capable(tcred->user->user_ns, ++ CAP_SYS_RESOURCE, VXC_SET_RLIMIT)) + return 0; + + return -EPERM; +diff -NurpP --minimal linux-3.3.8/kernel/sysctl.c linux-3.3.8-vs2.3.3.4/kernel/sysctl.c +--- linux-3.3.8/kernel/sysctl.c 2012-06-08 15:23:46.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/kernel/sysctl.c 2012-04-16 13:32:11.000000000 +0200 +@@ -76,6 +76,7 @@ + #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) + #include + #endif ++extern char vshelper_path[]; + #ifdef CONFIG_CHR_DEV_SG + #include + #endif +@@ -572,6 +573,13 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dostring, + }, + #endif ++ { ++ .procname = "vshelper", ++ .data = &vshelper_path, ++ .maxlen = 256, ++ .mode = 0644, ++ .proc_handler = &proc_dostring, ++ }, + #ifdef CONFIG_CHR_DEV_SG + { + .procname = "sg-big-buff", 
+diff -NurpP --minimal linux-3.3.8/kernel/sysctl_binary.c linux-3.3.8-vs2.3.3.4/kernel/sysctl_binary.c +--- linux-3.3.8/kernel/sysctl_binary.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/sysctl_binary.c 2012-02-24 03:55:06.000000000 +0100 +@@ -73,6 +73,7 @@ static const struct bin_table bin_kern_t + + { CTL_INT, KERN_PANIC, "panic" }, + { CTL_INT, KERN_REALROOTDEV, "real-root-dev" }, ++ { CTL_STR, KERN_VSHELPER, "vshelper" }, + + { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" }, + { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" }, +diff -NurpP --minimal linux-3.3.8/kernel/time/timekeeping.c linux-3.3.8-vs2.3.3.4/kernel/time/timekeeping.c +--- linux-3.3.8/kernel/time/timekeeping.c 2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/time/timekeeping.c 2012-02-24 03:55:06.000000000 +0100 +@@ -233,6 +233,7 @@ void getnstimeofday(struct timespec *ts) + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); ++ vx_adjust_timespec(ts); + } + + EXPORT_SYMBOL(getnstimeofday); +diff -NurpP --minimal linux-3.3.8/kernel/time.c linux-3.3.8-vs2.3.3.4/kernel/time.c +--- linux-3.3.8/kernel/time.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/time.c 2012-02-24 03:55:06.000000000 +0100 +@@ -92,7 +92,7 @@ SYSCALL_DEFINE1(stime, time_t __user *, + if (err) + return err; + +- do_settimeofday(&tv); ++ vx_settimeofday(&tv); + return 0; + } + +@@ -177,7 +177,7 @@ int do_sys_settimeofday(const struct tim + /* SMP safe, again the code in arch/foo/time.c should + * globally block out interrupts when it runs. + */ +- return do_settimeofday(tv); ++ return vx_settimeofday(tv); + } + return 0; + } +diff -NurpP --minimal linux-3.3.8/kernel/timer.c linux-3.3.8-vs2.3.3.4/kernel/timer.c +--- linux-3.3.8/kernel/timer.c 2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/timer.c 2012-02-24 03:55:06.000000000 +0100 +@@ -40,6 +40,10 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + #include + #include +@@ -1386,12 +1390,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, sec + + #endif + +-#ifndef __alpha__ +- +-/* +- * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this +- * should be moved into arch/i386 instead? +- */ + + /** + * sys_getpid - return the thread group id of the current process +@@ -1420,10 +1418,23 @@ SYSCALL_DEFINE0(getppid) + rcu_read_lock(); + pid = task_tgid_vnr(rcu_dereference(current->real_parent)); + rcu_read_unlock(); ++ return vx_map_pid(pid); ++} + +- return pid; ++#ifdef __alpha__ ++ ++/* ++ * The Alpha uses getxpid, getxuid, and getxgid instead. 
++ */ ++ ++asmlinkage long do_getxpid(long *ppid) ++{ ++ *ppid = sys_getppid(); ++ return sys_getpid(); + } + ++#else /* _alpha_ */ ++ + SYSCALL_DEFINE0(getuid) + { + /* Only we change this so SMP safe */ +diff -NurpP --minimal linux-3.3.8/kernel/user_namespace.c linux-3.3.8-vs2.3.3.4/kernel/user_namespace.c +--- linux-3.3.8/kernel/user_namespace.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/user_namespace.c 2012-02-24 03:55:06.000000000 +0100 +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + static struct kmem_cache *user_ns_cachep __read_mostly; + +@@ -33,6 +34,7 @@ int create_user_ns(struct cred *new) + return -ENOMEM; + + kref_init(&ns->kref); ++ atomic_inc(&vs_global_user_ns); + + for (n = 0; n < UIDHASH_SZ; ++n) + INIT_HLIST_HEAD(ns->uidhash_table + n); +@@ -81,6 +83,8 @@ void free_user_ns(struct kref *kref) + struct user_namespace *ns = + container_of(kref, struct user_namespace, kref); + ++ /* FIXME: maybe move into destroyer? */ ++ atomic_dec(&vs_global_user_ns); + INIT_WORK(&ns->destroyer, free_user_ns_work); + schedule_work(&ns->destroyer); + } +diff -NurpP --minimal linux-3.3.8/kernel/utsname.c linux-3.3.8-vs2.3.3.4/kernel/utsname.c +--- linux-3.3.8/kernel/utsname.c 2012-01-09 16:15:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/utsname.c 2012-02-24 03:55:06.000000000 +0100 +@@ -16,14 +16,17 @@ + #include + #include + #include ++#include + + static struct uts_namespace *create_uts_ns(void) + { + struct uts_namespace *uts_ns; + + uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); +- if (uts_ns) ++ if (uts_ns) { + kref_init(&uts_ns->kref); ++ atomic_inc(&vs_global_uts_ns); ++ } + return uts_ns; + } + +@@ -32,8 +35,8 @@ static struct uts_namespace *create_uts_ + * @old_ns: namespace to clone + * Return NULL on error (failure to kmalloc), new ns otherwise + */ +-static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, +- struct uts_namespace *old_ns) ++static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns, ++ struct user_namespace *old_user) + { + struct uts_namespace *ns; + +@@ -43,7 +46,7 @@ static struct uts_namespace *clone_uts_n + + down_read(&uts_sem); + memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); +- ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); ++ ns->user_ns = get_user_ns(old_user); + up_read(&uts_sem); + return ns; + } +@@ -55,9 +58,9 @@ static struct uts_namespace *clone_uts_n + * versa. 
+ */ + struct uts_namespace *copy_utsname(unsigned long flags, +- struct task_struct *tsk) ++ struct uts_namespace *old_ns, ++ struct user_namespace *user_ns) + { +- struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; + struct uts_namespace *new_ns; + + BUG_ON(!old_ns); +@@ -66,7 +69,7 @@ struct uts_namespace *copy_utsname(unsig + if (!(flags & CLONE_NEWUTS)) + return old_ns; + +- new_ns = clone_uts_ns(tsk, old_ns); ++ new_ns = clone_uts_ns(old_ns, user_ns); + + put_uts_ns(old_ns); + return new_ns; +@@ -78,6 +81,7 @@ void free_uts_ns(struct kref *kref) + + ns = container_of(kref, struct uts_namespace, kref); + put_user_ns(ns->user_ns); ++ atomic_dec(&vs_global_uts_ns); + kfree(ns); + } + +diff -NurpP --minimal linux-3.3.8/kernel/vserver/Kconfig linux-3.3.8-vs2.3.3.4/kernel/vserver/Kconfig +--- linux-3.3.8/kernel/vserver/Kconfig 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/Kconfig 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,224 @@ ++# ++# Linux VServer configuration ++# ++ ++menu "Linux VServer" ++ ++config VSERVER_AUTO_LBACK ++ bool "Automatically Assign Loopback IP" ++ default y ++ help ++ Automatically assign a guest specific loopback ++ IP and add it to the kernel network stack on ++ startup. ++ ++config VSERVER_AUTO_SINGLE ++ bool "Automatic Single IP Special Casing" ++ depends on EXPERIMENTAL ++ default y ++ help ++ This allows network contexts with a single IP to ++ automatically remap 0.0.0.0 bindings to that IP, ++ avoiding further network checks and improving ++ performance. ++ ++ (note: such guests do not allow to change the ip ++ on the fly and do not show loopback addresses) ++ ++config VSERVER_COWBL ++ bool "Enable COW Immutable Link Breaking" ++ default y ++ help ++ This enables the COW (Copy-On-Write) link break code. ++ It allows you to treat unified files like normal files ++ when writing to them (which will implicitely break the ++ link and create a copy of the unified file) ++ ++config VSERVER_VTIME ++ bool "Enable Virtualized Guest Time" ++ depends on EXPERIMENTAL ++ default n ++ help ++ This enables per guest time offsets to allow for ++ adjusting the system clock individually per guest. ++ this adds some overhead to the time functions and ++ therefore should not be enabled without good reason. ++ ++config VSERVER_DEVICE ++ bool "Enable Guest Device Mapping" ++ depends on EXPERIMENTAL ++ default n ++ help ++ This enables generic device remapping. ++ ++config VSERVER_PROC_SECURE ++ bool "Enable Proc Security" ++ depends on PROC_FS ++ default y ++ help ++ This configures ProcFS security to initially hide ++ non-process entries for all contexts except the main and ++ spectator context (i.e. for all guests), which is a secure ++ default. ++ ++ (note: on 1.2x the entries were visible by default) ++ ++choice ++ prompt "Persistent Inode Tagging" ++ default TAGGING_ID24 ++ help ++ This adds persistent context information to filesystems ++ mounted with the tagxid option. Tagging is a requirement ++ for per-context disk limits and per-context quota. ++ ++ ++config TAGGING_NONE ++ bool "Disabled" ++ help ++ do not store per-context information in inodes. ++ ++config TAGGING_UID16 ++ bool "UID16/GID32" ++ help ++ reduces UID to 16 bit, but leaves GID at 32 bit. ++ ++config TAGGING_GID16 ++ bool "UID32/GID16" ++ help ++ reduces GID to 16 bit, but leaves UID at 32 bit. 
++ ++config TAGGING_ID24 ++ bool "UID24/GID24" ++ help ++ uses the upper 8bit from UID and GID for XID tagging ++ which leaves 24bit for UID/GID each, which should be ++ more than sufficient for normal use. ++ ++config TAGGING_INTERN ++ bool "UID32/GID32" ++ help ++ this uses otherwise reserved inode fields in the on ++ disk representation, which limits the use to a few ++ filesystems (currently ext2 and ext3) ++ ++endchoice ++ ++config TAG_NFSD ++ bool "Tag NFSD User Auth and Files" ++ default n ++ help ++ Enable this if you do want the in-kernel NFS ++ Server to use the tagging specified above. ++ (will require patched clients too) ++ ++config VSERVER_PRIVACY ++ bool "Honor Privacy Aspects of Guests" ++ default n ++ help ++ When enabled, most context checks will disallow ++ access to structures assigned to a specific context, ++ like ptys or loop devices. ++ ++config VSERVER_CONTEXTS ++ int "Maximum number of Contexts (1-65533)" if EMBEDDED ++ range 1 65533 ++ default "768" if 64BIT ++ default "256" ++ help ++ This setting will optimize certain data structures ++ and memory allocations according to the expected ++ maximum. ++ ++ note: this is not a strict upper limit. ++ ++config VSERVER_WARN ++ bool "VServer Warnings" ++ default y ++ help ++ This enables various runtime warnings, which will ++ notify about potential manipulation attempts or ++ resource shortage. It is generally considered to ++ be a good idea to have that enabled. ++ ++config VSERVER_WARN_DEVPTS ++ bool "VServer DevPTS Warnings" ++ depends on VSERVER_WARN ++ default y ++ help ++ This enables DevPTS related warnings, issued when a ++ process inside a context tries to lookup or access ++ a dynamic pts from the host or a different context. ++ ++config VSERVER_DEBUG ++ bool "VServer Debugging Code" ++ default n ++ help ++ Set this to yes if you want to be able to activate ++ debugging output at runtime. It adds a very small ++ overhead to all vserver related functions and ++ increases the kernel size by about 20k. ++ ++config VSERVER_HISTORY ++ bool "VServer History Tracing" ++ depends on VSERVER_DEBUG ++ default n ++ help ++ Set this to yes if you want to record the history of ++ linux-vserver activities, so they can be replayed in ++ the event of a kernel panic or oops. ++ ++config VSERVER_HISTORY_SIZE ++ int "Per-CPU History Size (32-65536)" ++ depends on VSERVER_HISTORY ++ range 32 65536 ++ default 64 ++ help ++ This allows you to specify the number of entries in ++ the per-CPU history buffer. ++ ++choice ++ prompt "Quotes used in debug and warn messages" ++ default QUOTES_ISO8859 ++ ++config QUOTES_ISO8859 ++ bool "Extended ASCII (ISO 8859) angle quotes" ++ help ++ This uses the extended ASCII characters \xbb ++ and \xab for quoting file and process names. ++ ++config QUOTES_UTF8 ++ bool "UTF-8 angle quotes" ++ help ++ This uses the the UTF-8 sequences for angle ++ quotes to quote file and process names. ++ ++config QUOTES_ASCII ++ bool "ASCII single quotes" ++ help ++ This uses the ASCII single quote character ++ (\x27) to quote file and process names. 
++ ++endchoice ++ ++endmenu ++ ++ ++config VSERVER ++ bool ++ default y ++ select NAMESPACES ++ select UTS_NS ++ select IPC_NS ++# select USER_NS ++ select SYSVIPC ++ ++config VSERVER_SECURITY ++ bool ++ depends on SECURITY ++ default y ++ select SECURITY_CAPABILITIES ++ ++config VSERVER_DISABLED ++ bool ++ default n ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/Makefile linux-3.3.8-vs2.3.3.4/kernel/vserver/Makefile +--- linux-3.3.8/kernel/vserver/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/Makefile 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,18 @@ ++# ++# Makefile for the Linux vserver routines. ++# ++ ++ ++obj-y += vserver.o ++ ++vserver-y := switch.o context.o space.o sched.o network.o inode.o \ ++ limit.o cvirt.o cacct.o signal.o helper.o init.o \ ++ dlimit.o tag.o ++ ++vserver-$(CONFIG_INET) += inet.o ++vserver-$(CONFIG_PROC_FS) += proc.o ++vserver-$(CONFIG_VSERVER_DEBUG) += sysctl.o debug.o ++vserver-$(CONFIG_VSERVER_HISTORY) += history.o ++vserver-$(CONFIG_VSERVER_MONITOR) += monitor.o ++vserver-$(CONFIG_VSERVER_DEVICE) += device.o ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/cacct.c linux-3.3.8-vs2.3.3.4/kernel/vserver/cacct.c +--- linux-3.3.8/kernel/vserver/cacct.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/cacct.c 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,42 @@ ++/* ++ * linux/kernel/vserver/cacct.c ++ * ++ * Virtual Server: Context Accounting ++ * ++ * Copyright (C) 2006-2007 Herbert Pötzl ++ * ++ * V0.01 added accounting stats ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++ ++int vc_sock_stat(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_sock_stat_v0 vc_data; ++ int j, field; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ field = vc_data.field; ++ if ((field < 0) || (field >= VXA_SOCK_SIZE)) ++ return -EINVAL; ++ ++ for (j = 0; j < 3; j++) { ++ vc_data.count[j] = vx_sock_count(&vxi->cacct, field, j); ++ vc_data.total[j] = vx_sock_total(&vxi->cacct, field, j); ++ } ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/cacct_init.h linux-3.3.8-vs2.3.3.4/kernel/vserver/cacct_init.h +--- linux-3.3.8/kernel/vserver/cacct_init.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/cacct_init.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,25 @@ ++ ++ ++static inline void vx_info_init_cacct(struct _vx_cacct *cacct) ++{ ++ int i, j; ++ ++ ++ for (i = 0; i < VXA_SOCK_SIZE; i++) { ++ for (j = 0; j < 3; j++) { ++ atomic_long_set(&cacct->sock[i][j].count, 0); ++ atomic_long_set(&cacct->sock[i][j].total, 0); ++ } ++ } ++ for (i = 0; i < 8; i++) ++ atomic_set(&cacct->slab[i], 0); ++ for (i = 0; i < 5; i++) ++ for (j = 0; j < 4; j++) ++ atomic_set(&cacct->page[i][j], 0); ++} ++ ++static inline void vx_info_exit_cacct(struct _vx_cacct *cacct) ++{ ++ return; ++} ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/cacct_proc.h linux-3.3.8-vs2.3.3.4/kernel/vserver/cacct_proc.h +--- linux-3.3.8/kernel/vserver/cacct_proc.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/cacct_proc.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,53 @@ ++#ifndef _VX_CACCT_PROC_H ++#define _VX_CACCT_PROC_H ++ ++#include ++ ++ ++#define VX_SOCKA_TOP \ ++ "Type\t recv #/bytes\t\t send #/bytes\t\t fail #/bytes\n" ++ ++static inline int vx_info_proc_cacct(struct _vx_cacct *cacct, 
char *buffer) ++{ ++ int i, j, length = 0; ++ static char *type[VXA_SOCK_SIZE] = { ++ "UNSPEC", "UNIX", "INET", "INET6", "PACKET", "OTHER" ++ }; ++ ++ length += sprintf(buffer + length, VX_SOCKA_TOP); ++ for (i = 0; i < VXA_SOCK_SIZE; i++) { ++ length += sprintf(buffer + length, "%s:", type[i]); ++ for (j = 0; j < 3; j++) { ++ length += sprintf(buffer + length, ++ "\t%10lu/%-10lu", ++ vx_sock_count(cacct, i, j), ++ vx_sock_total(cacct, i, j)); ++ } ++ buffer[length++] = '\n'; ++ } ++ ++ length += sprintf(buffer + length, "\n"); ++ length += sprintf(buffer + length, ++ "slab:\t %8u %8u %8u %8u\n", ++ atomic_read(&cacct->slab[1]), ++ atomic_read(&cacct->slab[4]), ++ atomic_read(&cacct->slab[0]), ++ atomic_read(&cacct->slab[2])); ++ ++ length += sprintf(buffer + length, "\n"); ++ for (i = 0; i < 5; i++) { ++ length += sprintf(buffer + length, ++ "page[%d]: %8u %8u %8u %8u\t %8u %8u %8u %8u\n", i, ++ atomic_read(&cacct->page[i][0]), ++ atomic_read(&cacct->page[i][1]), ++ atomic_read(&cacct->page[i][2]), ++ atomic_read(&cacct->page[i][3]), ++ atomic_read(&cacct->page[i][4]), ++ atomic_read(&cacct->page[i][5]), ++ atomic_read(&cacct->page[i][6]), ++ atomic_read(&cacct->page[i][7])); ++ } ++ return length; ++} ++ ++#endif /* _VX_CACCT_PROC_H */ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/context.c linux-3.3.8-vs2.3.3.4/kernel/vserver/context.c +--- linux-3.3.8/kernel/vserver/context.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/context.c 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,1107 @@ ++/* ++ * linux/kernel/vserver/context.c ++ * ++ * Virtual Server: Context Support ++ * ++ * Copyright (C) 2003-2011 Herbert Pötzl ++ * ++ * V0.01 context helper ++ * V0.02 vx_ctx_kill syscall command ++ * V0.03 replaced context_info calls ++ * V0.04 redesign of struct (de)alloc ++ * V0.05 rlimit basic implementation ++ * V0.06 task_xid and info commands ++ * V0.07 context flags and caps ++ * V0.08 switch to RCU based hash ++ * V0.09 revert to non RCU for now ++ * V0.10 and back to working RCU hash ++ * V0.11 and back to locking again ++ * V0.12 referenced context store ++ * V0.13 separate per cpu data ++ * V0.14 changed vcmds to vxi arg ++ * V0.15 added context stat ++ * V0.16 have __create claim() the vxi ++ * V0.17 removed older and legacy stuff ++ * V0.18 added user credentials ++ * V0.19 added warn mask ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#include "cvirt_init.h" ++#include "cacct_init.h" ++#include "limit_init.h" ++#include "sched_init.h" ++ ++ ++atomic_t vx_global_ctotal = ATOMIC_INIT(0); ++atomic_t vx_global_cactive = ATOMIC_INIT(0); ++ ++ ++/* now inactive context structures */ ++ ++static struct hlist_head vx_info_inactive = HLIST_HEAD_INIT; ++ ++static DEFINE_SPINLOCK(vx_info_inactive_lock); ++ ++ ++/* __alloc_vx_info() ++ ++ * allocate an initialized vx_info struct ++ * doesn't make it visible (hash) */ ++ ++static struct vx_info *__alloc_vx_info(xid_t xid) ++{ ++ struct vx_info *new = NULL; ++ int cpu, index; ++ ++ vxdprintk(VXD_CBIT(xid, 0), "alloc_vx_info(%d)*", xid); ++ ++ /* would this benefit from a slab cache? 
*/ ++ new = kmalloc(sizeof(struct vx_info), GFP_KERNEL); ++ if (!new) ++ return 0; ++ ++ memset(new, 0, sizeof(struct vx_info)); ++#ifdef CONFIG_SMP ++ new->ptr_pc = alloc_percpu(struct _vx_info_pc); ++ if (!new->ptr_pc) ++ goto error; ++#endif ++ new->vx_id = xid; ++ INIT_HLIST_NODE(&new->vx_hlist); ++ atomic_set(&new->vx_usecnt, 0); ++ atomic_set(&new->vx_tasks, 0); ++ new->vx_parent = NULL; ++ new->vx_state = 0; ++ init_waitqueue_head(&new->vx_wait); ++ ++ /* prepare reaper */ ++ get_task_struct(init_pid_ns.child_reaper); ++ new->vx_reaper = init_pid_ns.child_reaper; ++ new->vx_badness_bias = 0; ++ ++ /* rest of init goes here */ ++ vx_info_init_limit(&new->limit); ++ vx_info_init_sched(&new->sched); ++ vx_info_init_cvirt(&new->cvirt); ++ vx_info_init_cacct(&new->cacct); ++ ++ /* per cpu data structures */ ++ for_each_possible_cpu(cpu) { ++ vx_info_init_sched_pc( ++ &vx_per_cpu(new, sched_pc, cpu), cpu); ++ vx_info_init_cvirt_pc( ++ &vx_per_cpu(new, cvirt_pc, cpu), cpu); ++ } ++ ++ new->vx_flags = VXF_INIT_SET; ++ new->vx_bcaps = CAP_FULL_SET; // maybe ~CAP_SETPCAP ++ new->vx_ccaps = 0; ++ new->vx_umask = 0; ++ new->vx_wmask = 0; ++ ++ new->reboot_cmd = 0; ++ new->exit_code = 0; ++ ++ // preconfig spaces ++ for (index = 0; index < VX_SPACES; index++) { ++ struct _vx_space *space = &new->space[index]; ++ ++ // filesystem ++ spin_lock(&init_fs.lock); ++ init_fs.users++; ++ spin_unlock(&init_fs.lock); ++ space->vx_fs = &init_fs; ++ ++ /* FIXME: do we want defaults? */ ++ // space->vx_real_cred = 0; ++ // space->vx_cred = 0; ++ } ++ ++ ++ vxdprintk(VXD_CBIT(xid, 0), ++ "alloc_vx_info(%d) = %p", xid, new); ++ vxh_alloc_vx_info(new); ++ atomic_inc(&vx_global_ctotal); ++ return new; ++#ifdef CONFIG_SMP ++error: ++ kfree(new); ++ return 0; ++#endif ++} ++ ++/* __dealloc_vx_info() ++ ++ * final disposal of vx_info */ ++ ++static void __dealloc_vx_info(struct vx_info *vxi) ++{ ++#ifdef CONFIG_VSERVER_WARN ++ struct vx_info_save vxis; ++ int cpu; ++#endif ++ vxdprintk(VXD_CBIT(xid, 0), ++ "dealloc_vx_info(%p)", vxi); ++ vxh_dealloc_vx_info(vxi); ++ ++#ifdef CONFIG_VSERVER_WARN ++ enter_vx_info(vxi, &vxis); ++ vx_info_exit_limit(&vxi->limit); ++ vx_info_exit_sched(&vxi->sched); ++ vx_info_exit_cvirt(&vxi->cvirt); ++ vx_info_exit_cacct(&vxi->cacct); ++ ++ for_each_possible_cpu(cpu) { ++ vx_info_exit_sched_pc( ++ &vx_per_cpu(vxi, sched_pc, cpu), cpu); ++ vx_info_exit_cvirt_pc( ++ &vx_per_cpu(vxi, cvirt_pc, cpu), cpu); ++ } ++ leave_vx_info(&vxis); ++#endif ++ ++ vxi->vx_id = -1; ++ vxi->vx_state |= VXS_RELEASED; ++ ++#ifdef CONFIG_SMP ++ free_percpu(vxi->ptr_pc); ++#endif ++ kfree(vxi); ++ atomic_dec(&vx_global_ctotal); ++} ++ ++static void __shutdown_vx_info(struct vx_info *vxi) ++{ ++ struct nsproxy *nsproxy; ++ struct fs_struct *fs; ++ struct cred *cred; ++ int index, kill; ++ ++ might_sleep(); ++ ++ vxi->vx_state |= VXS_SHUTDOWN; ++ vs_state_change(vxi, VSC_SHUTDOWN); ++ ++ for (index = 0; index < VX_SPACES; index++) { ++ struct _vx_space *space = &vxi->space[index]; ++ ++ nsproxy = xchg(&space->vx_nsproxy, NULL); ++ if (nsproxy) ++ put_nsproxy(nsproxy); ++ ++ fs = xchg(&space->vx_fs, NULL); ++ spin_lock(&fs->lock); ++ kill = !--fs->users; ++ spin_unlock(&fs->lock); ++ if (kill) ++ free_fs_struct(fs); ++ ++ cred = (struct cred *)xchg(&space->vx_cred, NULL); ++ if (cred) ++ abort_creds(cred); ++ } ++} ++ ++/* exported stuff */ ++ ++void free_vx_info(struct vx_info *vxi) ++{ ++ unsigned long flags; ++ unsigned index; ++ ++ /* check for reference counts first */ ++ 
BUG_ON(atomic_read(&vxi->vx_usecnt)); ++ BUG_ON(atomic_read(&vxi->vx_tasks)); ++ ++ /* context must not be hashed */ ++ BUG_ON(vx_info_state(vxi, VXS_HASHED)); ++ ++ /* context shutdown is mandatory */ ++ BUG_ON(!vx_info_state(vxi, VXS_SHUTDOWN)); ++ ++ /* spaces check */ ++ for (index = 0; index < VX_SPACES; index++) { ++ struct _vx_space *space = &vxi->space[index]; ++ ++ BUG_ON(space->vx_nsproxy); ++ BUG_ON(space->vx_fs); ++ // BUG_ON(space->vx_real_cred); ++ // BUG_ON(space->vx_cred); ++ } ++ ++ spin_lock_irqsave(&vx_info_inactive_lock, flags); ++ hlist_del(&vxi->vx_hlist); ++ spin_unlock_irqrestore(&vx_info_inactive_lock, flags); ++ ++ __dealloc_vx_info(vxi); ++} ++ ++ ++/* hash table for vx_info hash */ ++ ++#define VX_HASH_SIZE 13 ++ ++static struct hlist_head vx_info_hash[VX_HASH_SIZE] = ++ { [0 ... VX_HASH_SIZE-1] = HLIST_HEAD_INIT }; ++ ++static DEFINE_SPINLOCK(vx_info_hash_lock); ++ ++ ++static inline unsigned int __hashval(xid_t xid) ++{ ++ return (xid % VX_HASH_SIZE); ++} ++ ++ ++ ++/* __hash_vx_info() ++ ++ * add the vxi to the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline void __hash_vx_info(struct vx_info *vxi) ++{ ++ struct hlist_head *head; ++ ++ vxd_assert_lock(&vx_info_hash_lock); ++ vxdprintk(VXD_CBIT(xid, 4), ++ "__hash_vx_info: %p[#%d]", vxi, vxi->vx_id); ++ vxh_hash_vx_info(vxi); ++ ++ /* context must not be hashed */ ++ BUG_ON(vx_info_state(vxi, VXS_HASHED)); ++ ++ vxi->vx_state |= VXS_HASHED; ++ head = &vx_info_hash[__hashval(vxi->vx_id)]; ++ hlist_add_head(&vxi->vx_hlist, head); ++ atomic_inc(&vx_global_cactive); ++} ++ ++/* __unhash_vx_info() ++ ++ * remove the vxi from the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline void __unhash_vx_info(struct vx_info *vxi) ++{ ++ unsigned long flags; ++ ++ vxd_assert_lock(&vx_info_hash_lock); ++ vxdprintk(VXD_CBIT(xid, 4), ++ "__unhash_vx_info: %p[#%d.%d.%d]", vxi, vxi->vx_id, ++ atomic_read(&vxi->vx_usecnt), atomic_read(&vxi->vx_tasks)); ++ vxh_unhash_vx_info(vxi); ++ ++ /* context must be hashed */ ++ BUG_ON(!vx_info_state(vxi, VXS_HASHED)); ++ /* but without tasks */ ++ BUG_ON(atomic_read(&vxi->vx_tasks)); ++ ++ vxi->vx_state &= ~VXS_HASHED; ++ hlist_del_init(&vxi->vx_hlist); ++ spin_lock_irqsave(&vx_info_inactive_lock, flags); ++ hlist_add_head(&vxi->vx_hlist, &vx_info_inactive); ++ spin_unlock_irqrestore(&vx_info_inactive_lock, flags); ++ atomic_dec(&vx_global_cactive); ++} ++ ++ ++/* __lookup_vx_info() ++ ++ * requires the hash_lock to be held ++ * doesn't increment the vx_refcnt */ ++ ++static inline struct vx_info *__lookup_vx_info(xid_t xid) ++{ ++ struct hlist_head *head = &vx_info_hash[__hashval(xid)]; ++ struct hlist_node *pos; ++ struct vx_info *vxi; ++ ++ vxd_assert_lock(&vx_info_hash_lock); ++ hlist_for_each(pos, head) { ++ vxi = hlist_entry(pos, struct vx_info, vx_hlist); ++ ++ if (vxi->vx_id == xid) ++ goto found; ++ } ++ vxi = NULL; ++found: ++ vxdprintk(VXD_CBIT(xid, 0), ++ "__lookup_vx_info(#%u): %p[#%u]", ++ xid, vxi, vxi ? 
vxi->vx_id : 0); ++ vxh_lookup_vx_info(vxi, xid); ++ return vxi; ++} ++ ++ ++/* __create_vx_info() ++ ++ * create the requested context ++ * get(), claim() and hash it */ ++ ++static struct vx_info *__create_vx_info(int id) ++{ ++ struct vx_info *new, *vxi = NULL; ++ ++ vxdprintk(VXD_CBIT(xid, 1), "create_vx_info(%d)*", id); ++ ++ if (!(new = __alloc_vx_info(id))) ++ return ERR_PTR(-ENOMEM); ++ ++ /* required to make dynamic xids unique */ ++ spin_lock(&vx_info_hash_lock); ++ ++ /* static context requested */ ++ if ((vxi = __lookup_vx_info(id))) { ++ vxdprintk(VXD_CBIT(xid, 0), ++ "create_vx_info(%d) = %p (already there)", id, vxi); ++ if (vx_info_flags(vxi, VXF_STATE_SETUP, 0)) ++ vxi = ERR_PTR(-EBUSY); ++ else ++ vxi = ERR_PTR(-EEXIST); ++ goto out_unlock; ++ } ++ /* new context */ ++ vxdprintk(VXD_CBIT(xid, 0), ++ "create_vx_info(%d) = %p (new)", id, new); ++ claim_vx_info(new, NULL); ++ __hash_vx_info(get_vx_info(new)); ++ vxi = new, new = NULL; ++ ++out_unlock: ++ spin_unlock(&vx_info_hash_lock); ++ vxh_create_vx_info(IS_ERR(vxi) ? NULL : vxi, id); ++ if (new) ++ __dealloc_vx_info(new); ++ return vxi; ++} ++ ++ ++/* exported stuff */ ++ ++ ++void unhash_vx_info(struct vx_info *vxi) ++{ ++ spin_lock(&vx_info_hash_lock); ++ __unhash_vx_info(vxi); ++ spin_unlock(&vx_info_hash_lock); ++ __shutdown_vx_info(vxi); ++ __wakeup_vx_info(vxi); ++} ++ ++ ++/* lookup_vx_info() ++ ++ * search for a vx_info and get() it ++ * negative id means current */ ++ ++struct vx_info *lookup_vx_info(int id) ++{ ++ struct vx_info *vxi = NULL; ++ ++ if (id < 0) { ++ vxi = get_vx_info(current_vx_info()); ++ } else if (id > 1) { ++ spin_lock(&vx_info_hash_lock); ++ vxi = get_vx_info(__lookup_vx_info(id)); ++ spin_unlock(&vx_info_hash_lock); ++ } ++ return vxi; ++} ++ ++/* xid_is_hashed() ++ ++ * verify that xid is still hashed */ ++ ++int xid_is_hashed(xid_t xid) ++{ ++ int hashed; ++ ++ spin_lock(&vx_info_hash_lock); ++ hashed = (__lookup_vx_info(xid) != NULL); ++ spin_unlock(&vx_info_hash_lock); ++ return hashed; ++} ++ ++#ifdef CONFIG_PROC_FS ++ ++/* get_xid_list() ++ ++ * get a subset of hashed xids for proc ++ * assumes size is at least one */ ++ ++int get_xid_list(int index, unsigned int *xids, int size) ++{ ++ int hindex, nr_xids = 0; ++ ++ /* only show current and children */ ++ if (!vx_check(0, VS_ADMIN | VS_WATCH)) { ++ if (index > 0) ++ return 0; ++ xids[nr_xids] = vx_current_xid(); ++ return 1; ++ } ++ ++ for (hindex = 0; hindex < VX_HASH_SIZE; hindex++) { ++ struct hlist_head *head = &vx_info_hash[hindex]; ++ struct hlist_node *pos; ++ ++ spin_lock(&vx_info_hash_lock); ++ hlist_for_each(pos, head) { ++ struct vx_info *vxi; ++ ++ if (--index > 0) ++ continue; ++ ++ vxi = hlist_entry(pos, struct vx_info, vx_hlist); ++ xids[nr_xids] = vxi->vx_id; ++ if (++nr_xids >= size) { ++ spin_unlock(&vx_info_hash_lock); ++ goto out; ++ } ++ } ++ /* keep the lock time short */ ++ spin_unlock(&vx_info_hash_lock); ++ } ++out: ++ return nr_xids; ++} ++#endif ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ ++void dump_vx_info_inactive(int level) ++{ ++ struct hlist_node *entry, *next; ++ ++ hlist_for_each_safe(entry, next, &vx_info_inactive) { ++ struct vx_info *vxi = ++ list_entry(entry, struct vx_info, vx_hlist); ++ ++ dump_vx_info(vxi, level); ++ } ++} ++ ++#endif ++ ++#if 0 ++int vx_migrate_user(struct task_struct *p, struct vx_info *vxi) ++{ ++ struct user_struct *new_user, *old_user; ++ ++ if (!p || !vxi) ++ BUG(); ++ ++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0)) ++ return -EACCES; ++ ++ new_user = alloc_uid(vxi->vx_id, 
p->uid); ++ if (!new_user) ++ return -ENOMEM; ++ ++ old_user = p->user; ++ if (new_user != old_user) { ++ atomic_inc(&new_user->processes); ++ atomic_dec(&old_user->processes); ++ p->user = new_user; ++ } ++ free_uid(old_user); ++ return 0; ++} ++#endif ++ ++#if 0 ++void vx_mask_cap_bset(struct vx_info *vxi, struct task_struct *p) ++{ ++ // p->cap_effective &= vxi->vx_cap_bset; ++ p->cap_effective = ++ cap_intersect(p->cap_effective, vxi->cap_bset); ++ // p->cap_inheritable &= vxi->vx_cap_bset; ++ p->cap_inheritable = ++ cap_intersect(p->cap_inheritable, vxi->cap_bset); ++ // p->cap_permitted &= vxi->vx_cap_bset; ++ p->cap_permitted = ++ cap_intersect(p->cap_permitted, vxi->cap_bset); ++} ++#endif ++ ++ ++#include ++#include ++ ++static int vx_openfd_task(struct task_struct *tsk) ++{ ++ struct files_struct *files = tsk->files; ++ struct fdtable *fdt; ++ const unsigned long *bptr; ++ int count, total; ++ ++ /* no rcu_read_lock() because of spin_lock() */ ++ spin_lock(&files->file_lock); ++ fdt = files_fdtable(files); ++ bptr = fdt->open_fds->fds_bits; ++ count = fdt->max_fds / (sizeof(unsigned long) * 8); ++ for (total = 0; count > 0; count--) { ++ if (*bptr) ++ total += hweight_long(*bptr); ++ bptr++; ++ } ++ spin_unlock(&files->file_lock); ++ return total; ++} ++ ++ ++/* for *space compatibility */ ++ ++asmlinkage long sys_unshare(unsigned long); ++ ++/* ++ * migrate task to new context ++ * gets vxi, puts old_vxi on change ++ * optionally unshares namespaces (hack) ++ */ ++ ++int vx_migrate_task(struct task_struct *p, struct vx_info *vxi, int unshare) ++{ ++ struct vx_info *old_vxi; ++ int ret = 0; ++ ++ if (!p || !vxi) ++ BUG(); ++ ++ vxdprintk(VXD_CBIT(xid, 5), ++ "vx_migrate_task(%p,%p[#%d.%d])", p, vxi, ++ vxi->vx_id, atomic_read(&vxi->vx_usecnt)); ++ ++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0) && ++ !vx_info_flags(vxi, VXF_STATE_SETUP, 0)) ++ return -EACCES; ++ ++ if (vx_info_state(vxi, VXS_SHUTDOWN)) ++ return -EFAULT; ++ ++ old_vxi = task_get_vx_info(p); ++ if (old_vxi == vxi) ++ goto out; ++ ++// if (!(ret = vx_migrate_user(p, vxi))) { ++ { ++ int openfd; ++ ++ task_lock(p); ++ openfd = vx_openfd_task(p); ++ ++ if (old_vxi) { ++ atomic_dec(&old_vxi->cvirt.nr_threads); ++ atomic_dec(&old_vxi->cvirt.nr_running); ++ __rlim_dec(&old_vxi->limit, RLIMIT_NPROC); ++ /* FIXME: what about the struct files here? */ ++ __rlim_sub(&old_vxi->limit, VLIMIT_OPENFD, openfd); ++ /* account for the executable */ ++ __rlim_dec(&old_vxi->limit, VLIMIT_DENTRY); ++ } ++ atomic_inc(&vxi->cvirt.nr_threads); ++ atomic_inc(&vxi->cvirt.nr_running); ++ __rlim_inc(&vxi->limit, RLIMIT_NPROC); ++ /* FIXME: what about the struct files here? 
*/ ++ __rlim_add(&vxi->limit, VLIMIT_OPENFD, openfd); ++ /* account for the executable */ ++ __rlim_inc(&vxi->limit, VLIMIT_DENTRY); ++ ++ if (old_vxi) { ++ release_vx_info(old_vxi, p); ++ clr_vx_info(&p->vx_info); ++ } ++ claim_vx_info(vxi, p); ++ set_vx_info(&p->vx_info, vxi); ++ p->xid = vxi->vx_id; ++ ++ vxdprintk(VXD_CBIT(xid, 5), ++ "moved task %p into vxi:%p[#%d]", ++ p, vxi, vxi->vx_id); ++ ++ // vx_mask_cap_bset(vxi, p); ++ task_unlock(p); ++ ++ /* hack for *spaces to provide compatibility */ ++ if (unshare) { ++ struct nsproxy *old_nsp, *new_nsp; ++ ++ ret = unshare_nsproxy_namespaces( ++ CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER, ++ &new_nsp, NULL); ++ if (ret) ++ goto out; ++ ++ old_nsp = xchg(&p->nsproxy, new_nsp); ++ vx_set_space(vxi, ++ CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER, 0); ++ put_nsproxy(old_nsp); ++ } ++ } ++out: ++ put_vx_info(old_vxi); ++ return ret; ++} ++ ++int vx_set_reaper(struct vx_info *vxi, struct task_struct *p) ++{ ++ struct task_struct *old_reaper; ++ ++ if (!vxi) ++ return -EINVAL; ++ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_set_reaper(%p[#%d],%p[#%d,%d])", ++ vxi, vxi->vx_id, p, p->xid, p->pid); ++ ++ old_reaper = vxi->vx_reaper; ++ if (old_reaper == p) ++ return 0; ++ ++ /* set new child reaper */ ++ get_task_struct(p); ++ vxi->vx_reaper = p; ++ put_task_struct(old_reaper); ++ return 0; ++} ++ ++int vx_set_init(struct vx_info *vxi, struct task_struct *p) ++{ ++ if (!vxi) ++ return -EINVAL; ++ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_set_init(%p[#%d],%p[#%d,%d,%d])", ++ vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid); ++ ++ vxi->vx_flags &= ~VXF_STATE_INIT; ++ // vxi->vx_initpid = p->tgid; ++ vxi->vx_initpid = p->pid; ++ return 0; ++} ++ ++void vx_exit_init(struct vx_info *vxi, struct task_struct *p, int code) ++{ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_exit_init(%p[#%d],%p[#%d,%d,%d])", ++ vxi, vxi->vx_id, p, p->xid, p->pid, p->tgid); ++ ++ vxi->exit_code = code; ++ vxi->vx_initpid = 0; ++} ++ ++ ++void vx_set_persistent(struct vx_info *vxi) ++{ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_set_persistent(%p[#%d])", vxi, vxi->vx_id); ++ ++ get_vx_info(vxi); ++ claim_vx_info(vxi, NULL); ++} ++ ++void vx_clear_persistent(struct vx_info *vxi) ++{ ++ vxdprintk(VXD_CBIT(xid, 6), ++ "vx_clear_persistent(%p[#%d])", vxi, vxi->vx_id); ++ ++ release_vx_info(vxi, NULL); ++ put_vx_info(vxi); ++} ++ ++void vx_update_persistent(struct vx_info *vxi) ++{ ++ if (vx_info_flags(vxi, VXF_PERSISTENT, 0)) ++ vx_set_persistent(vxi); ++ else ++ vx_clear_persistent(vxi); ++} ++ ++ ++/* task must be current or locked */ ++ ++void exit_vx_info(struct task_struct *p, int code) ++{ ++ struct vx_info *vxi = p->vx_info; ++ ++ if (vxi) { ++ atomic_dec(&vxi->cvirt.nr_threads); ++ vx_nproc_dec(p); ++ ++ vxi->exit_code = code; ++ release_vx_info(vxi, p); ++ } ++} ++ ++void exit_vx_info_early(struct task_struct *p, int code) ++{ ++ struct vx_info *vxi = p->vx_info; ++ ++ if (vxi) { ++ if (vxi->vx_initpid == p->pid) ++ vx_exit_init(vxi, p, code); ++ if (vxi->vx_reaper == p) ++ vx_set_reaper(vxi, init_pid_ns.child_reaper); ++ } ++} ++ ++ ++/* vserver syscall commands below here */ ++ ++/* taks xid and vx_info functions */ ++ ++#include ++ ++ ++int vc_task_xid(uint32_t id) ++{ ++ xid_t xid; ++ ++ if (id) { ++ struct task_struct *tsk; ++ ++ rcu_read_lock(); ++ tsk = find_task_by_real_pid(id); ++ xid = (tsk) ? 
tsk->xid : -ESRCH; ++ rcu_read_unlock(); ++ } else ++ xid = vx_current_xid(); ++ return xid; ++} ++ ++ ++int vc_vx_info(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_vx_info_v0 vc_data; ++ ++ vc_data.xid = vxi->vx_id; ++ vc_data.initpid = vxi->vx_initpid; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++int vc_ctx_stat(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_stat_v0 vc_data; ++ ++ vc_data.usecnt = atomic_read(&vxi->vx_usecnt); ++ vc_data.tasks = atomic_read(&vxi->vx_tasks); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++/* context functions */ ++ ++int vc_ctx_create(uint32_t xid, void __user *data) ++{ ++ struct vcmd_ctx_create vc_data = { .flagword = VXF_INIT_SET }; ++ struct vx_info *new_vxi; ++ int ret; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ if ((xid > MAX_S_CONTEXT) || (xid < 2)) ++ return -EINVAL; ++ ++ new_vxi = __create_vx_info(xid); ++ if (IS_ERR(new_vxi)) ++ return PTR_ERR(new_vxi); ++ ++ /* initial flags */ ++ new_vxi->vx_flags = vc_data.flagword; ++ ++ ret = -ENOEXEC; ++ if (vs_state_change(new_vxi, VSC_STARTUP)) ++ goto out; ++ ++ ret = vx_migrate_task(current, new_vxi, (!data)); ++ if (ret) ++ goto out; ++ ++ /* return context id on success */ ++ ret = new_vxi->vx_id; ++ ++ /* get a reference for persistent contexts */ ++ if ((vc_data.flagword & VXF_PERSISTENT)) ++ vx_set_persistent(new_vxi); ++out: ++ release_vx_info(new_vxi, NULL); ++ put_vx_info(new_vxi); ++ return ret; ++} ++ ++ ++int vc_ctx_migrate(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_migrate vc_data = { .flagword = 0 }; ++ int ret; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = vx_migrate_task(current, vxi, 0); ++ if (ret) ++ return ret; ++ if (vc_data.flagword & VXM_SET_INIT) ++ ret = vx_set_init(vxi, current); ++ if (ret) ++ return ret; ++ if (vc_data.flagword & VXM_SET_REAPER) ++ ret = vx_set_reaper(vxi, current); ++ return ret; ++} ++ ++ ++int vc_get_cflags(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_flags_v0 vc_data; ++ ++ vc_data.flagword = vxi->vx_flags; ++ ++ /* special STATE flag handling */ ++ vc_data.mask = vs_mask_flags(~0ULL, vxi->vx_flags, VXF_ONE_TIME); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_cflags(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_flags_v0 vc_data; ++ uint64_t mask, trigger; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ /* special STATE flag handling */ ++ mask = vs_mask_mask(vc_data.mask, vxi->vx_flags, VXF_ONE_TIME); ++ trigger = (mask & vxi->vx_flags) ^ (mask & vc_data.flagword); ++ ++ if (vxi == current_vx_info()) { ++ /* if (trigger & VXF_STATE_SETUP) ++ vx_mask_cap_bset(vxi, current); */ ++ if (trigger & VXF_STATE_INIT) { ++ int ret; ++ ++ ret = vx_set_init(vxi, current); ++ if (ret) ++ return ret; ++ ret = vx_set_reaper(vxi, current); ++ if (ret) ++ return ret; ++ } ++ } ++ ++ vxi->vx_flags = vs_mask_flags(vxi->vx_flags, ++ vc_data.flagword, mask); ++ if (trigger & VXF_PERSISTENT) ++ vx_update_persistent(vxi); ++ ++ return 0; ++} ++ ++ ++static inline uint64_t caps_from_cap_t(kernel_cap_t c) ++{ ++ uint64_t v = c.cap[0] | ((uint64_t)c.cap[1] << 32); ++ ++ // printk("caps_from_cap_t(%08x:%08x) = %016llx\n", c.cap[1], c.cap[0], v); ++ return v; ++} ++ ++static inline kernel_cap_t 
cap_t_from_caps(uint64_t v) ++{ ++ kernel_cap_t c = __cap_empty_set; ++ ++ c.cap[0] = v & 0xFFFFFFFF; ++ c.cap[1] = (v >> 32) & 0xFFFFFFFF; ++ ++ // printk("cap_t_from_caps(%016llx) = %08x:%08x\n", v, c.cap[1], c.cap[0]); ++ return c; ++} ++ ++ ++static int do_get_caps(struct vx_info *vxi, uint64_t *bcaps, uint64_t *ccaps) ++{ ++ if (bcaps) ++ *bcaps = caps_from_cap_t(vxi->vx_bcaps); ++ if (ccaps) ++ *ccaps = vxi->vx_ccaps; ++ ++ return 0; ++} ++ ++int vc_get_ccaps(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_caps_v1 vc_data; ++ int ret; ++ ++ ret = do_get_caps(vxi, NULL, &vc_data.ccaps); ++ if (ret) ++ return ret; ++ vc_data.cmask = ~0ULL; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int do_set_caps(struct vx_info *vxi, ++ uint64_t bcaps, uint64_t bmask, uint64_t ccaps, uint64_t cmask) ++{ ++ uint64_t bcold = caps_from_cap_t(vxi->vx_bcaps); ++ ++#if 0 ++ printk("do_set_caps(%16llx, %16llx, %16llx, %16llx)\n", ++ bcaps, bmask, ccaps, cmask); ++#endif ++ vxi->vx_bcaps = cap_t_from_caps( ++ vs_mask_flags(bcold, bcaps, bmask)); ++ vxi->vx_ccaps = vs_mask_flags(vxi->vx_ccaps, ccaps, cmask); ++ ++ return 0; ++} ++ ++int vc_set_ccaps(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_caps_v1 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_caps(vxi, 0, 0, vc_data.ccaps, vc_data.cmask); ++} ++ ++int vc_get_bcaps(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_bcaps vc_data; ++ int ret; ++ ++ ret = do_get_caps(vxi, &vc_data.bcaps, NULL); ++ if (ret) ++ return ret; ++ vc_data.bmask = ~0ULL; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_bcaps(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_bcaps vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_caps(vxi, vc_data.bcaps, vc_data.bmask, 0, 0); ++} ++ ++ ++int vc_get_umask(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_umask vc_data; ++ ++ vc_data.umask = vxi->vx_umask; ++ vc_data.mask = ~0ULL; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_umask(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_umask vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ vxi->vx_umask = vs_mask_flags(vxi->vx_umask, ++ vc_data.umask, vc_data.mask); ++ return 0; ++} ++ ++ ++int vc_get_wmask(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_wmask vc_data; ++ ++ vc_data.wmask = vxi->vx_wmask; ++ vc_data.mask = ~0ULL; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_wmask(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_wmask vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ vxi->vx_wmask = vs_mask_flags(vxi->vx_wmask, ++ vc_data.wmask, vc_data.mask); ++ return 0; ++} ++ ++ ++int vc_get_badness(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_badness_v0 vc_data; ++ ++ vc_data.bias = vxi->vx_badness_bias; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_badness(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_badness_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ vxi->vx_badness_bias = vc_data.bias; ++ return 0; ++} ++ 
++#include ++ ++EXPORT_SYMBOL_GPL(free_vx_info); ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/cvirt.c linux-3.3.8-vs2.3.3.4/kernel/vserver/cvirt.c +--- linux-3.3.8/kernel/vserver/cvirt.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/cvirt.c 2012-04-24 03:32:01.000000000 +0200 +@@ -0,0 +1,313 @@ ++/* ++ * linux/kernel/vserver/cvirt.c ++ * ++ * Virtual Server: Context Virtualization ++ * ++ * Copyright (C) 2004-2007 Herbert Pötzl ++ * ++ * V0.01 broken out from limit.c ++ * V0.02 added utsname stuff ++ * V0.03 changed vcmds to vxi arg ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++ ++void vx_vsi_boottime(struct timespec *boottime) ++{ ++ struct vx_info *vxi = current_vx_info(); ++ ++ set_normalized_timespec(boottime, ++ boottime->tv_sec + vxi->cvirt.bias_uptime.tv_sec, ++ boottime->tv_nsec + vxi->cvirt.bias_uptime.tv_nsec); ++ return; ++} ++ ++void vx_vsi_uptime(struct timespec *uptime, struct timespec *idle) ++{ ++ struct vx_info *vxi = current_vx_info(); ++ ++ set_normalized_timespec(uptime, ++ uptime->tv_sec - vxi->cvirt.bias_uptime.tv_sec, ++ uptime->tv_nsec - vxi->cvirt.bias_uptime.tv_nsec); ++ if (!idle) ++ return; ++ set_normalized_timespec(idle, ++ idle->tv_sec - vxi->cvirt.bias_idle.tv_sec, ++ idle->tv_nsec - vxi->cvirt.bias_idle.tv_nsec); ++ return; ++} ++ ++uint64_t vx_idle_jiffies(void) ++{ ++ return init_task.utime + init_task.stime; ++} ++ ++ ++ ++static inline uint32_t __update_loadavg(uint32_t load, ++ int wsize, int delta, int n) ++{ ++ unsigned long long calc, prev; ++ ++ /* just set it to n */ ++ if (unlikely(delta >= wsize)) ++ return (n << FSHIFT); ++ ++ calc = delta * n; ++ calc <<= FSHIFT; ++ prev = (wsize - delta); ++ prev *= load; ++ calc += prev; ++ do_div(calc, wsize); ++ return calc; ++} ++ ++ ++void vx_update_load(struct vx_info *vxi) ++{ ++ uint32_t now, last, delta; ++ unsigned int nr_running, nr_uninterruptible; ++ unsigned int total; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&vxi->cvirt.load_lock, flags); ++ ++ now = jiffies; ++ last = vxi->cvirt.load_last; ++ delta = now - last; ++ ++ if (delta < 5*HZ) ++ goto out; ++ ++ nr_running = atomic_read(&vxi->cvirt.nr_running); ++ nr_uninterruptible = atomic_read(&vxi->cvirt.nr_uninterruptible); ++ total = nr_running + nr_uninterruptible; ++ ++ vxi->cvirt.load[0] = __update_loadavg(vxi->cvirt.load[0], ++ 60*HZ, delta, total); ++ vxi->cvirt.load[1] = __update_loadavg(vxi->cvirt.load[1], ++ 5*60*HZ, delta, total); ++ vxi->cvirt.load[2] = __update_loadavg(vxi->cvirt.load[2], ++ 15*60*HZ, delta, total); ++ ++ vxi->cvirt.load_last = now; ++out: ++ atomic_inc(&vxi->cvirt.load_updates); ++ spin_unlock_irqrestore(&vxi->cvirt.load_lock, flags); ++} ++ ++ ++/* ++ * Commands to do_syslog: ++ * ++ * 0 -- Close the log. Currently a NOP. ++ * 1 -- Open the log. Currently a NOP. ++ * 2 -- Read from the log. ++ * 3 -- Read all messages remaining in the ring buffer. ++ * 4 -- Read and clear all messages remaining in the ring buffer ++ * 5 -- Clear ring buffer. 
++ * 6 -- Disable printk's to console ++ * 7 -- Enable printk's to console ++ * 8 -- Set level of messages printed to console ++ * 9 -- Return number of unread characters in the log buffer ++ * 10 -- Return size of the log buffer ++ */ ++int vx_do_syslog(int type, char __user *buf, int len) ++{ ++ int error = 0; ++ int do_clear = 0; ++ struct vx_info *vxi = current_vx_info(); ++ struct _vx_syslog *log; ++ ++ if (!vxi) ++ return -EINVAL; ++ log = &vxi->cvirt.syslog; ++ ++ switch (type) { ++ case 0: /* Close log */ ++ case 1: /* Open log */ ++ break; ++ case 2: /* Read from log */ ++ error = wait_event_interruptible(log->log_wait, ++ (log->log_start - log->log_end)); ++ if (error) ++ break; ++ spin_lock_irq(&log->logbuf_lock); ++ spin_unlock_irq(&log->logbuf_lock); ++ break; ++ case 4: /* Read/clear last kernel messages */ ++ do_clear = 1; ++ /* fall through */ ++ case 3: /* Read last kernel messages */ ++ return 0; ++ ++ case 5: /* Clear ring buffer */ ++ return 0; ++ ++ case 6: /* Disable logging to console */ ++ case 7: /* Enable logging to console */ ++ case 8: /* Set level of messages printed to console */ ++ break; ++ ++ case 9: /* Number of chars in the log buffer */ ++ return 0; ++ case 10: /* Size of the log buffer */ ++ return 0; ++ default: ++ error = -EINVAL; ++ break; ++ } ++ return error; ++} ++ ++ ++/* virtual host info names */ ++ ++static char *vx_vhi_name(struct vx_info *vxi, int id) ++{ ++ struct nsproxy *nsproxy; ++ struct uts_namespace *uts; ++ ++ if (id == VHIN_CONTEXT) ++ return vxi->vx_name; ++ ++ nsproxy = vxi->space[0].vx_nsproxy; ++ if (!nsproxy) ++ return NULL; ++ ++ uts = nsproxy->uts_ns; ++ if (!uts) ++ return NULL; ++ ++ switch (id) { ++ case VHIN_SYSNAME: ++ return uts->name.sysname; ++ case VHIN_NODENAME: ++ return uts->name.nodename; ++ case VHIN_RELEASE: ++ return uts->name.release; ++ case VHIN_VERSION: ++ return uts->name.version; ++ case VHIN_MACHINE: ++ return uts->name.machine; ++ case VHIN_DOMAINNAME: ++ return uts->name.domainname; ++ default: ++ return NULL; ++ } ++ return NULL; ++} ++ ++int vc_set_vhi_name(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_vhi_name_v0 vc_data; ++ char *name; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ name = vx_vhi_name(vxi, vc_data.field); ++ if (!name) ++ return -EINVAL; ++ ++ memcpy(name, vc_data.name, 65); ++ return 0; ++} ++ ++int vc_get_vhi_name(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_vhi_name_v0 vc_data; ++ char *name; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ name = vx_vhi_name(vxi, vc_data.field); ++ if (!name) ++ return -EINVAL; ++ ++ memcpy(vc_data.name, name, 65); ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++int vc_virt_stat(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_virt_stat_v0 vc_data; ++ struct _vx_cvirt *cvirt = &vxi->cvirt; ++ struct timespec uptime; ++ ++ do_posix_clock_monotonic_gettime(&uptime); ++ set_normalized_timespec(&uptime, ++ uptime.tv_sec - cvirt->bias_uptime.tv_sec, ++ uptime.tv_nsec - cvirt->bias_uptime.tv_nsec); ++ ++ vc_data.offset = timespec_to_ns(&cvirt->bias_ts); ++ vc_data.uptime = timespec_to_ns(&uptime); ++ vc_data.nr_threads = atomic_read(&cvirt->nr_threads); ++ vc_data.nr_running = atomic_read(&cvirt->nr_running); ++ vc_data.nr_uninterruptible = atomic_read(&cvirt->nr_uninterruptible); ++ vc_data.nr_onhold = atomic_read(&cvirt->nr_onhold); ++ vc_data.nr_forks = atomic_read(&cvirt->total_forks); ++ 
vc_data.load[0] = cvirt->load[0]; ++ vc_data.load[1] = cvirt->load[1]; ++ vc_data.load[2] = cvirt->load[2]; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++#ifdef CONFIG_VSERVER_VTIME ++ ++/* virtualized time base */ ++ ++void vx_adjust_timespec(struct timespec *ts) ++{ ++ struct vx_info *vxi; ++ ++ if (!vx_flags(VXF_VIRT_TIME, 0)) ++ return; ++ ++ vxi = current_vx_info(); ++ ts->tv_sec += vxi->cvirt.bias_ts.tv_sec; ++ ts->tv_nsec += vxi->cvirt.bias_ts.tv_nsec; ++ ++ if (ts->tv_nsec >= NSEC_PER_SEC) { ++ ts->tv_sec++; ++ ts->tv_nsec -= NSEC_PER_SEC; ++ } else if (ts->tv_nsec < 0) { ++ ts->tv_sec--; ++ ts->tv_nsec += NSEC_PER_SEC; ++ } ++} ++ ++int vx_settimeofday(const struct timespec *ts) ++{ ++ struct timespec ats, delta; ++ struct vx_info *vxi; ++ ++ if (!vx_flags(VXF_VIRT_TIME, 0)) ++ return do_settimeofday(ts); ++ ++ getnstimeofday(&ats); ++ delta = timespec_sub(*ts, ats); ++ ++ vxi = current_vx_info(); ++ vxi->cvirt.bias_ts = timespec_add(vxi->cvirt.bias_ts, delta); ++ return 0; ++} ++ ++#endif ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/cvirt_init.h linux-3.3.8-vs2.3.3.4/kernel/vserver/cvirt_init.h +--- linux-3.3.8/kernel/vserver/cvirt_init.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/cvirt_init.h 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,70 @@ ++ ++ ++extern uint64_t vx_idle_jiffies(void); ++ ++static inline void vx_info_init_cvirt(struct _vx_cvirt *cvirt) ++{ ++ uint64_t idle_jiffies = vx_idle_jiffies(); ++ uint64_t nsuptime; ++ ++ do_posix_clock_monotonic_gettime(&cvirt->bias_uptime); ++ nsuptime = (unsigned long long)cvirt->bias_uptime.tv_sec ++ * NSEC_PER_SEC + cvirt->bias_uptime.tv_nsec; ++ cvirt->bias_clock = nsec_to_clock_t(nsuptime); ++ cvirt->bias_ts.tv_sec = 0; ++ cvirt->bias_ts.tv_nsec = 0; ++ ++ jiffies_to_timespec(idle_jiffies, &cvirt->bias_idle); ++ atomic_set(&cvirt->nr_threads, 0); ++ atomic_set(&cvirt->nr_running, 0); ++ atomic_set(&cvirt->nr_uninterruptible, 0); ++ atomic_set(&cvirt->nr_onhold, 0); ++ ++ spin_lock_init(&cvirt->load_lock); ++ cvirt->load_last = jiffies; ++ atomic_set(&cvirt->load_updates, 0); ++ cvirt->load[0] = 0; ++ cvirt->load[1] = 0; ++ cvirt->load[2] = 0; ++ atomic_set(&cvirt->total_forks, 0); ++ ++ spin_lock_init(&cvirt->syslog.logbuf_lock); ++ init_waitqueue_head(&cvirt->syslog.log_wait); ++ cvirt->syslog.log_start = 0; ++ cvirt->syslog.log_end = 0; ++ cvirt->syslog.con_start = 0; ++ cvirt->syslog.logged_chars = 0; ++} ++ ++static inline ++void vx_info_init_cvirt_pc(struct _vx_cvirt_pc *cvirt_pc, int cpu) ++{ ++ // cvirt_pc->cpustat = { 0 }; ++} ++ ++static inline void vx_info_exit_cvirt(struct _vx_cvirt *cvirt) ++{ ++#ifdef CONFIG_VSERVER_WARN ++ int value; ++#endif ++ vxwprintk_xid((value = atomic_read(&cvirt->nr_threads)), ++ "!!! cvirt: %p[nr_threads] = %d on exit.", ++ cvirt, value); ++ vxwprintk_xid((value = atomic_read(&cvirt->nr_running)), ++ "!!! cvirt: %p[nr_running] = %d on exit.", ++ cvirt, value); ++ vxwprintk_xid((value = atomic_read(&cvirt->nr_uninterruptible)), ++ "!!! cvirt: %p[nr_uninterruptible] = %d on exit.", ++ cvirt, value); ++ vxwprintk_xid((value = atomic_read(&cvirt->nr_onhold)), ++ "!!! 
cvirt: %p[nr_onhold] = %d on exit.", ++ cvirt, value); ++ return; ++} ++ ++static inline ++void vx_info_exit_cvirt_pc(struct _vx_cvirt_pc *cvirt_pc, int cpu) ++{ ++ return; ++} ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/cvirt_proc.h linux-3.3.8-vs2.3.3.4/kernel/vserver/cvirt_proc.h +--- linux-3.3.8/kernel/vserver/cvirt_proc.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/cvirt_proc.h 2012-04-03 16:07:39.000000000 +0200 +@@ -0,0 +1,123 @@ ++#ifndef _VX_CVIRT_PROC_H ++#define _VX_CVIRT_PROC_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++extern int vx_info_mnt_namespace(struct mnt_namespace *, char *); ++ ++static inline ++int vx_info_proc_nsproxy(struct nsproxy *nsproxy, char *buffer) ++{ ++ struct mnt_namespace *ns; ++ struct uts_namespace *uts; ++ struct ipc_namespace *ipc; ++ int length = 0; ++ ++ if (!nsproxy) ++ goto out; ++ ++ length += sprintf(buffer + length, ++ "NSProxy:\t%p [%p,%p,%p]\n", ++ nsproxy, nsproxy->mnt_ns, ++ nsproxy->uts_ns, nsproxy->ipc_ns); ++ ++ ns = nsproxy->mnt_ns; ++ if (!ns) ++ goto skip_ns; ++ ++ length += vx_info_mnt_namespace(ns, buffer + length); ++ ++skip_ns: ++ ++ uts = nsproxy->uts_ns; ++ if (!uts) ++ goto skip_uts; ++ ++ length += sprintf(buffer + length, ++ "SysName:\t%.*s\n" ++ "NodeName:\t%.*s\n" ++ "Release:\t%.*s\n" ++ "Version:\t%.*s\n" ++ "Machine:\t%.*s\n" ++ "DomainName:\t%.*s\n", ++ __NEW_UTS_LEN, uts->name.sysname, ++ __NEW_UTS_LEN, uts->name.nodename, ++ __NEW_UTS_LEN, uts->name.release, ++ __NEW_UTS_LEN, uts->name.version, ++ __NEW_UTS_LEN, uts->name.machine, ++ __NEW_UTS_LEN, uts->name.domainname); ++skip_uts: ++ ++ ipc = nsproxy->ipc_ns; ++ if (!ipc) ++ goto skip_ipc; ++ ++ length += sprintf(buffer + length, ++ "SEMS:\t\t%d %d %d %d %d\n" ++ "MSG:\t\t%d %d %d\n" ++ "SHM:\t\t%lu %lu %d %d\n", ++ ipc->sem_ctls[0], ipc->sem_ctls[1], ++ ipc->sem_ctls[2], ipc->sem_ctls[3], ++ ipc->used_sems, ++ ipc->msg_ctlmax, ipc->msg_ctlmnb, ipc->msg_ctlmni, ++ (unsigned long)ipc->shm_ctlmax, ++ (unsigned long)ipc->shm_ctlall, ++ ipc->shm_ctlmni, ipc->shm_tot); ++skip_ipc: ++out: ++ return length; ++} ++ ++ ++#include ++ ++#define LOAD_INT(x) ((x) >> FSHIFT) ++#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100) ++ ++static inline ++int vx_info_proc_cvirt(struct _vx_cvirt *cvirt, char *buffer) ++{ ++ int length = 0; ++ int a, b, c; ++ ++ length += sprintf(buffer + length, ++ "BiasUptime:\t%lu.%02lu\n", ++ (unsigned long)cvirt->bias_uptime.tv_sec, ++ (cvirt->bias_uptime.tv_nsec / (NSEC_PER_SEC / 100))); ++ ++ a = cvirt->load[0] + (FIXED_1 / 200); ++ b = cvirt->load[1] + (FIXED_1 / 200); ++ c = cvirt->load[2] + (FIXED_1 / 200); ++ length += sprintf(buffer + length, ++ "nr_threads:\t%d\n" ++ "nr_running:\t%d\n" ++ "nr_unintr:\t%d\n" ++ "nr_onhold:\t%d\n" ++ "load_updates:\t%d\n" ++ "loadavg:\t%d.%02d %d.%02d %d.%02d\n" ++ "total_forks:\t%d\n", ++ atomic_read(&cvirt->nr_threads), ++ atomic_read(&cvirt->nr_running), ++ atomic_read(&cvirt->nr_uninterruptible), ++ atomic_read(&cvirt->nr_onhold), ++ atomic_read(&cvirt->load_updates), ++ LOAD_INT(a), LOAD_FRAC(a), ++ LOAD_INT(b), LOAD_FRAC(b), ++ LOAD_INT(c), LOAD_FRAC(c), ++ atomic_read(&cvirt->total_forks)); ++ return length; ++} ++ ++static inline ++int vx_info_proc_cvirt_pc(struct _vx_cvirt_pc *cvirt_pc, ++ char *buffer, int cpu) ++{ ++ int length = 0; ++ return length; ++} ++ ++#endif /* _VX_CVIRT_PROC_H */ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/debug.c linux-3.3.8-vs2.3.3.4/kernel/vserver/debug.c +--- linux-3.3.8/kernel/vserver/debug.c 
1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/debug.c 2012-02-24 03:55:06.000000000 +0100 +@@ -0,0 +1,32 @@ ++/* ++ * kernel/vserver/debug.c ++ * ++ * Copyright (C) 2005-2007 Herbert Pötzl ++ * ++ * V0.01 vx_info dump support ++ * ++ */ ++ ++#include ++ ++#include ++ ++ ++void dump_vx_info(struct vx_info *vxi, int level) ++{ ++ printk("vx_info %p[#%d, %d.%d, %4x]\n", vxi, vxi->vx_id, ++ atomic_read(&vxi->vx_usecnt), ++ atomic_read(&vxi->vx_tasks), ++ vxi->vx_state); ++ if (level > 0) { ++ __dump_vx_limit(&vxi->limit); ++ __dump_vx_sched(&vxi->sched); ++ __dump_vx_cvirt(&vxi->cvirt); ++ __dump_vx_cacct(&vxi->cacct); ++ } ++ printk("---\n"); ++} ++ ++ ++EXPORT_SYMBOL_GPL(dump_vx_info); ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/device.c linux-3.3.8-vs2.3.3.4/kernel/vserver/device.c +--- linux-3.3.8/kernel/vserver/device.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/device.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,443 @@ ++/* ++ * linux/kernel/vserver/device.c ++ * ++ * Linux-VServer: Device Support ++ * ++ * Copyright (C) 2006 Herbert Pötzl ++ * Copyright (C) 2007 Daniel Hokka Zakrisson ++ * ++ * V0.01 device mapping basics ++ * V0.02 added defaults ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#define DMAP_HASH_BITS 4 ++ ++ ++struct vs_mapping { ++ union { ++ struct hlist_node hlist; ++ struct list_head list; ++ } u; ++#define dm_hlist u.hlist ++#define dm_list u.list ++ xid_t xid; ++ dev_t device; ++ struct vx_dmap_target target; ++}; ++ ++ ++static struct hlist_head dmap_main_hash[1 << DMAP_HASH_BITS]; ++ ++static DEFINE_SPINLOCK(dmap_main_hash_lock); ++ ++static struct vx_dmap_target dmap_defaults[2] = { ++ { .flags = DATTR_OPEN }, ++ { .flags = DATTR_OPEN }, ++}; ++ ++ ++struct kmem_cache *dmap_cachep __read_mostly; ++ ++int __init dmap_cache_init(void) ++{ ++ dmap_cachep = kmem_cache_create("dmap_cache", ++ sizeof(struct vs_mapping), 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ return 0; ++} ++ ++__initcall(dmap_cache_init); ++ ++ ++static inline unsigned int __hashval(dev_t dev, int bits) ++{ ++ return hash_long((unsigned long)dev, bits); ++} ++ ++ ++/* __hash_mapping() ++ * add the mapping to the hash table ++ */ ++static inline void __hash_mapping(struct vx_info *vxi, struct vs_mapping *vdm) ++{ ++ spinlock_t *hash_lock = &dmap_main_hash_lock; ++ struct hlist_head *head, *hash = dmap_main_hash; ++ int device = vdm->device; ++ ++ spin_lock(hash_lock); ++ vxdprintk(VXD_CBIT(misc, 8), "__hash_mapping: %p[#%d] %08x:%08x", ++ vxi, vxi ? vxi->vx_id : 0, device, vdm->target.target); ++ ++ head = &hash[__hashval(device, DMAP_HASH_BITS)]; ++ hlist_add_head(&vdm->dm_hlist, head); ++ spin_unlock(hash_lock); ++} ++ ++ ++static inline int __mode_to_default(umode_t mode) ++{ ++ switch (mode) { ++ case S_IFBLK: ++ return 0; ++ case S_IFCHR: ++ return 1; ++ default: ++ BUG(); ++ } ++} ++ ++ ++/* __set_default() ++ * set a default ++ */ ++static inline void __set_default(struct vx_info *vxi, umode_t mode, ++ struct vx_dmap_target *vdmt) ++{ ++ spinlock_t *hash_lock = &dmap_main_hash_lock; ++ spin_lock(hash_lock); ++ ++ if (vxi) ++ vxi->dmap.targets[__mode_to_default(mode)] = *vdmt; ++ else ++ dmap_defaults[__mode_to_default(mode)] = *vdmt; ++ ++ ++ spin_unlock(hash_lock); ++ ++ vxdprintk(VXD_CBIT(misc, 8), "__set_default: %p[#%u] %08x %04x", ++ vxi, vxi ? 
vxi->vx_id : 0, vdmt->target, vdmt->flags); ++} ++ ++ ++/* __remove_default() ++ * remove a default ++ */ ++static inline int __remove_default(struct vx_info *vxi, umode_t mode) ++{ ++ spinlock_t *hash_lock = &dmap_main_hash_lock; ++ spin_lock(hash_lock); ++ ++ if (vxi) ++ vxi->dmap.targets[__mode_to_default(mode)].flags = 0; ++ else /* remove == reset */ ++ dmap_defaults[__mode_to_default(mode)].flags = DATTR_OPEN | mode; ++ ++ spin_unlock(hash_lock); ++ return 0; ++} ++ ++ ++/* __find_mapping() ++ * find a mapping in the hash table ++ * ++ * caller must hold hash_lock ++ */ ++static inline int __find_mapping(xid_t xid, dev_t device, umode_t mode, ++ struct vs_mapping **local, struct vs_mapping **global) ++{ ++ struct hlist_head *hash = dmap_main_hash; ++ struct hlist_head *head = &hash[__hashval(device, DMAP_HASH_BITS)]; ++ struct hlist_node *pos; ++ struct vs_mapping *vdm; ++ ++ *local = NULL; ++ if (global) ++ *global = NULL; ++ ++ hlist_for_each(pos, head) { ++ vdm = hlist_entry(pos, struct vs_mapping, dm_hlist); ++ ++ if ((vdm->device == device) && ++ !((vdm->target.flags ^ mode) & S_IFMT)) { ++ if (vdm->xid == xid) { ++ *local = vdm; ++ return 1; ++ } else if (global && vdm->xid == 0) ++ *global = vdm; ++ } ++ } ++ ++ if (global && *global) ++ return 0; ++ else ++ return -ENOENT; ++} ++ ++ ++/* __lookup_mapping() ++ * find a mapping and store the result in target and flags ++ */ ++static inline int __lookup_mapping(struct vx_info *vxi, ++ dev_t device, dev_t *target, int *flags, umode_t mode) ++{ ++ spinlock_t *hash_lock = &dmap_main_hash_lock; ++ struct vs_mapping *vdm, *global; ++ struct vx_dmap_target *vdmt; ++ int ret = 0; ++ xid_t xid = vxi->vx_id; ++ int index; ++ ++ spin_lock(hash_lock); ++ if (__find_mapping(xid, device, mode, &vdm, &global) > 0) { ++ ret = 1; ++ vdmt = &vdm->target; ++ goto found; ++ } ++ ++ index = __mode_to_default(mode); ++ if (vxi && vxi->dmap.targets[index].flags) { ++ ret = 2; ++ vdmt = &vxi->dmap.targets[index]; ++ } else if (global) { ++ ret = 3; ++ vdmt = &global->target; ++ goto found; ++ } else { ++ ret = 4; ++ vdmt = &dmap_defaults[index]; ++ } ++ ++found: ++ if (target && (vdmt->flags & DATTR_REMAP)) ++ *target = vdmt->target; ++ else if (target) ++ *target = device; ++ if (flags) ++ *flags = vdmt->flags; ++ ++ spin_unlock(hash_lock); ++ ++ return ret; ++} ++ ++ ++/* __remove_mapping() ++ * remove a mapping from the hash table ++ */ ++static inline int __remove_mapping(struct vx_info *vxi, dev_t device, ++ umode_t mode) ++{ ++ spinlock_t *hash_lock = &dmap_main_hash_lock; ++ struct vs_mapping *vdm = NULL; ++ int ret = 0; ++ ++ spin_lock(hash_lock); ++ ++ ret = __find_mapping((vxi ? vxi->vx_id : 0), device, mode, &vdm, ++ NULL); ++ vxdprintk(VXD_CBIT(misc, 8), "__remove_mapping: %p[#%d] %08x %04x", ++ vxi, vxi ? vxi->vx_id : 0, device, mode); ++ if (ret < 0) ++ goto out; ++ hlist_del(&vdm->dm_hlist); ++ ++out: ++ spin_unlock(hash_lock); ++ if (vdm) ++ kmem_cache_free(dmap_cachep, vdm); ++ return ret; ++} ++ ++ ++ ++int vs_map_device(struct vx_info *vxi, ++ dev_t device, dev_t *target, umode_t mode) ++{ ++ int ret, flags = DATTR_MASK; ++ ++ if (!vxi) { ++ if (target) ++ *target = device; ++ goto out; ++ } ++ ret = __lookup_mapping(vxi, device, target, &flags, mode); ++ vxdprintk(VXD_CBIT(misc, 8), "vs_map_device: %08x target: %08x flags: %04x mode: %04x mapped=%d", ++ device, target ? 
*target : 0, flags, mode, ret); ++out: ++ return (flags & DATTR_MASK); ++} ++ ++ ++ ++static int do_set_mapping(struct vx_info *vxi, ++ dev_t device, dev_t target, int flags, umode_t mode) ++{ ++ if (device) { ++ struct vs_mapping *new; ++ ++ new = kmem_cache_alloc(dmap_cachep, GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ INIT_HLIST_NODE(&new->dm_hlist); ++ new->device = device; ++ new->target.target = target; ++ new->target.flags = flags | mode; ++ new->xid = (vxi ? vxi->vx_id : 0); ++ ++ vxdprintk(VXD_CBIT(misc, 8), "do_set_mapping: %08x target: %08x flags: %04x", device, target, flags); ++ __hash_mapping(vxi, new); ++ } else { ++ struct vx_dmap_target new = { ++ .target = target, ++ .flags = flags | mode, ++ }; ++ __set_default(vxi, mode, &new); ++ } ++ return 0; ++} ++ ++ ++static int do_unset_mapping(struct vx_info *vxi, ++ dev_t device, dev_t target, int flags, umode_t mode) ++{ ++ int ret = -EINVAL; ++ ++ if (device) { ++ ret = __remove_mapping(vxi, device, mode); ++ if (ret < 0) ++ goto out; ++ } else { ++ ret = __remove_default(vxi, mode); ++ if (ret < 0) ++ goto out; ++ } ++ ++out: ++ return ret; ++} ++ ++ ++static inline int __user_device(const char __user *name, dev_t *dev, ++ umode_t *mode) ++{ ++ struct nameidata nd; ++ int ret; ++ ++ if (!name) { ++ *dev = 0; ++ return 0; ++ } ++ ret = user_lpath(name, &nd.path); ++ if (ret) ++ return ret; ++ if (nd.path.dentry->d_inode) { ++ *dev = nd.path.dentry->d_inode->i_rdev; ++ *mode = nd.path.dentry->d_inode->i_mode; ++ } ++ path_put(&nd.path); ++ return 0; ++} ++ ++static inline int __mapping_mode(dev_t device, dev_t target, ++ umode_t device_mode, umode_t target_mode, umode_t *mode) ++{ ++ if (device) ++ *mode = device_mode & S_IFMT; ++ else if (target) ++ *mode = target_mode & S_IFMT; ++ else ++ return -EINVAL; ++ ++ /* if both given, device and target mode have to match */ ++ if (device && target && ++ ((device_mode ^ target_mode) & S_IFMT)) ++ return -EINVAL; ++ return 0; ++} ++ ++ ++static inline int do_mapping(struct vx_info *vxi, const char __user *device_path, ++ const char __user *target_path, int flags, int set) ++{ ++ dev_t device = ~0, target = ~0; ++ umode_t device_mode = 0, target_mode = 0, mode; ++ int ret; ++ ++ ret = __user_device(device_path, &device, &device_mode); ++ if (ret) ++ return ret; ++ ret = __user_device(target_path, &target, &target_mode); ++ if (ret) ++ return ret; ++ ++ ret = __mapping_mode(device, target, ++ device_mode, target_mode, &mode); ++ if (ret) ++ return ret; ++ ++ if (set) ++ return do_set_mapping(vxi, device, target, ++ flags, mode); ++ else ++ return do_unset_mapping(vxi, device, target, ++ flags, mode); ++} ++ ++ ++int vc_set_mapping(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_set_mapping_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_mapping(vxi, vc_data.device, vc_data.target, ++ vc_data.flags, 1); ++} ++ ++int vc_unset_mapping(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_set_mapping_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_mapping(vxi, vc_data.device, vc_data.target, ++ vc_data.flags, 0); ++} ++ ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_set_mapping_x32(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_set_mapping_v0_x32 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_mapping(vxi, compat_ptr(vc_data.device_ptr), ++ compat_ptr(vc_data.target_ptr), vc_data.flags, 1); ++} 
++ ++int vc_unset_mapping_x32(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_set_mapping_v0_x32 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_mapping(vxi, compat_ptr(vc_data.device_ptr), ++ compat_ptr(vc_data.target_ptr), vc_data.flags, 0); ++} ++ ++#endif /* CONFIG_COMPAT */ ++ ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/dlimit.c linux-3.3.8-vs2.3.3.4/kernel/vserver/dlimit.c +--- linux-3.3.8/kernel/vserver/dlimit.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/dlimit.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,531 @@ ++/* ++ * linux/kernel/vserver/dlimit.c ++ * ++ * Virtual Server: Context Disk Limits ++ * ++ * Copyright (C) 2004-2009 Herbert Pötzl ++ * ++ * V0.01 initial version ++ * V0.02 compat32 splitup ++ * V0.03 extended interface ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++// #include ++ ++#include ++ ++/* __alloc_dl_info() ++ ++ * allocate an initialized dl_info struct ++ * doesn't make it visible (hash) */ ++ ++static struct dl_info *__alloc_dl_info(struct super_block *sb, tag_t tag) ++{ ++ struct dl_info *new = NULL; ++ ++ vxdprintk(VXD_CBIT(dlim, 5), ++ "alloc_dl_info(%p,%d)*", sb, tag); ++ ++ /* would this benefit from a slab cache? */ ++ new = kmalloc(sizeof(struct dl_info), GFP_KERNEL); ++ if (!new) ++ return 0; ++ ++ memset(new, 0, sizeof(struct dl_info)); ++ new->dl_tag = tag; ++ new->dl_sb = sb; ++ // INIT_RCU_HEAD(&new->dl_rcu); ++ INIT_HLIST_NODE(&new->dl_hlist); ++ spin_lock_init(&new->dl_lock); ++ atomic_set(&new->dl_refcnt, 0); ++ atomic_set(&new->dl_usecnt, 0); ++ ++ /* rest of init goes here */ ++ ++ vxdprintk(VXD_CBIT(dlim, 4), ++ "alloc_dl_info(%p,%d) = %p", sb, tag, new); ++ return new; ++} ++ ++/* __dealloc_dl_info() ++ ++ * final disposal of dl_info */ ++ ++static void __dealloc_dl_info(struct dl_info *dli) ++{ ++ vxdprintk(VXD_CBIT(dlim, 4), ++ "dealloc_dl_info(%p)", dli); ++ ++ dli->dl_hlist.next = LIST_POISON1; ++ dli->dl_tag = -1; ++ dli->dl_sb = 0; ++ ++ BUG_ON(atomic_read(&dli->dl_usecnt)); ++ BUG_ON(atomic_read(&dli->dl_refcnt)); ++ ++ kfree(dli); ++} ++ ++ ++/* hash table for dl_info hash */ ++ ++#define DL_HASH_SIZE 13 ++ ++struct hlist_head dl_info_hash[DL_HASH_SIZE]; ++ ++static DEFINE_SPINLOCK(dl_info_hash_lock); ++ ++ ++static inline unsigned int __hashval(struct super_block *sb, tag_t tag) ++{ ++ return ((tag ^ (unsigned long)sb) % DL_HASH_SIZE); ++} ++ ++ ++ ++/* __hash_dl_info() ++ ++ * add the dli to the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline void __hash_dl_info(struct dl_info *dli) ++{ ++ struct hlist_head *head; ++ ++ vxdprintk(VXD_CBIT(dlim, 6), ++ "__hash_dl_info: %p[#%d]", dli, dli->dl_tag); ++ get_dl_info(dli); ++ head = &dl_info_hash[__hashval(dli->dl_sb, dli->dl_tag)]; ++ hlist_add_head_rcu(&dli->dl_hlist, head); ++} ++ ++/* __unhash_dl_info() ++ ++ * remove the dli from the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline void __unhash_dl_info(struct dl_info *dli) ++{ ++ vxdprintk(VXD_CBIT(dlim, 6), ++ "__unhash_dl_info: %p[#%d]", dli, dli->dl_tag); ++ hlist_del_rcu(&dli->dl_hlist); ++ put_dl_info(dli); ++} ++ ++ ++/* __lookup_dl_info() ++ ++ * requires the rcu_read_lock() ++ * doesn't increment the dl_refcnt */ ++ ++static inline struct dl_info *__lookup_dl_info(struct super_block *sb, tag_t tag) ++{ ++ struct hlist_head *head = &dl_info_hash[__hashval(sb, tag)]; ++ struct hlist_node *pos; ++ struct 
dl_info *dli; ++ ++ hlist_for_each_entry_rcu(dli, pos, head, dl_hlist) { ++ ++ if (dli->dl_tag == tag && dli->dl_sb == sb) { ++ return dli; ++ } ++ } ++ return NULL; ++} ++ ++ ++struct dl_info *locate_dl_info(struct super_block *sb, tag_t tag) ++{ ++ struct dl_info *dli; ++ ++ rcu_read_lock(); ++ dli = get_dl_info(__lookup_dl_info(sb, tag)); ++ vxdprintk(VXD_CBIT(dlim, 7), ++ "locate_dl_info(%p,#%d) = %p", sb, tag, dli); ++ rcu_read_unlock(); ++ return dli; ++} ++ ++void rcu_free_dl_info(struct rcu_head *head) ++{ ++ struct dl_info *dli = container_of(head, struct dl_info, dl_rcu); ++ int usecnt, refcnt; ++ ++ BUG_ON(!dli || !head); ++ ++ usecnt = atomic_read(&dli->dl_usecnt); ++ BUG_ON(usecnt < 0); ++ ++ refcnt = atomic_read(&dli->dl_refcnt); ++ BUG_ON(refcnt < 0); ++ ++ vxdprintk(VXD_CBIT(dlim, 3), ++ "rcu_free_dl_info(%p)", dli); ++ if (!usecnt) ++ __dealloc_dl_info(dli); ++ else ++ printk("!!! rcu didn't free\n"); ++} ++ ++ ++ ++ ++static int do_addrem_dlimit(uint32_t id, const char __user *name, ++ uint32_t flags, int add) ++{ ++ struct path path; ++ int ret; ++ ++ ret = user_lpath(name, &path); ++ if (!ret) { ++ struct super_block *sb; ++ struct dl_info *dli; ++ ++ ret = -EINVAL; ++ if (!path.dentry->d_inode) ++ goto out_release; ++ if (!(sb = path.dentry->d_inode->i_sb)) ++ goto out_release; ++ ++ if (add) { ++ dli = __alloc_dl_info(sb, id); ++ spin_lock(&dl_info_hash_lock); ++ ++ ret = -EEXIST; ++ if (__lookup_dl_info(sb, id)) ++ goto out_unlock; ++ __hash_dl_info(dli); ++ dli = NULL; ++ } else { ++ spin_lock(&dl_info_hash_lock); ++ dli = __lookup_dl_info(sb, id); ++ ++ ret = -ESRCH; ++ if (!dli) ++ goto out_unlock; ++ __unhash_dl_info(dli); ++ } ++ ret = 0; ++ out_unlock: ++ spin_unlock(&dl_info_hash_lock); ++ if (add && dli) ++ __dealloc_dl_info(dli); ++ out_release: ++ path_put(&path); ++ } ++ return ret; ++} ++ ++int vc_add_dlimit(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_base_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_addrem_dlimit(id, vc_data.name, vc_data.flags, 1); ++} ++ ++int vc_rem_dlimit(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_base_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_addrem_dlimit(id, vc_data.name, vc_data.flags, 0); ++} ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_add_dlimit_x32(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_base_v0_x32 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_addrem_dlimit(id, ++ compat_ptr(vc_data.name_ptr), vc_data.flags, 1); ++} ++ ++int vc_rem_dlimit_x32(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_base_v0_x32 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_addrem_dlimit(id, ++ compat_ptr(vc_data.name_ptr), vc_data.flags, 0); ++} ++ ++#endif /* CONFIG_COMPAT */ ++ ++ ++static inline ++int do_set_dlimit(uint32_t id, const char __user *name, ++ uint32_t space_used, uint32_t space_total, ++ uint32_t inodes_used, uint32_t inodes_total, ++ uint32_t reserved, uint32_t flags) ++{ ++ struct path path; ++ int ret; ++ ++ ret = user_lpath(name, &path); ++ if (!ret) { ++ struct super_block *sb; ++ struct dl_info *dli; ++ ++ ret = -EINVAL; ++ if (!path.dentry->d_inode) ++ goto out_release; ++ if (!(sb = path.dentry->d_inode->i_sb)) ++ goto out_release; ++ ++ /* sanity checks */ ++ if ((reserved != CDLIM_KEEP && ++ reserved > 100) || ++ (inodes_used 
!= CDLIM_KEEP && ++ inodes_used > inodes_total) || ++ (space_used != CDLIM_KEEP && ++ space_used > space_total)) ++ goto out_release; ++ ++ ret = -ESRCH; ++ dli = locate_dl_info(sb, id); ++ if (!dli) ++ goto out_release; ++ ++ spin_lock(&dli->dl_lock); ++ ++ if (inodes_used != CDLIM_KEEP) ++ dli->dl_inodes_used = inodes_used; ++ if (inodes_total != CDLIM_KEEP) ++ dli->dl_inodes_total = inodes_total; ++ if (space_used != CDLIM_KEEP) ++ dli->dl_space_used = dlimit_space_32to64( ++ space_used, flags, DLIMS_USED); ++ ++ if (space_total == CDLIM_INFINITY) ++ dli->dl_space_total = DLIM_INFINITY; ++ else if (space_total != CDLIM_KEEP) ++ dli->dl_space_total = dlimit_space_32to64( ++ space_total, flags, DLIMS_TOTAL); ++ ++ if (reserved != CDLIM_KEEP) ++ dli->dl_nrlmult = (1 << 10) * (100 - reserved) / 100; ++ ++ spin_unlock(&dli->dl_lock); ++ ++ put_dl_info(dli); ++ ret = 0; ++ ++ out_release: ++ path_put(&path); ++ } ++ return ret; ++} ++ ++int vc_set_dlimit(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_dlimit(id, vc_data.name, ++ vc_data.space_used, vc_data.space_total, ++ vc_data.inodes_used, vc_data.inodes_total, ++ vc_data.reserved, vc_data.flags); ++} ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_set_dlimit_x32(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_v0_x32 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_dlimit(id, compat_ptr(vc_data.name_ptr), ++ vc_data.space_used, vc_data.space_total, ++ vc_data.inodes_used, vc_data.inodes_total, ++ vc_data.reserved, vc_data.flags); ++} ++ ++#endif /* CONFIG_COMPAT */ ++ ++ ++static inline ++int do_get_dlimit(uint32_t id, const char __user *name, ++ uint32_t *space_used, uint32_t *space_total, ++ uint32_t *inodes_used, uint32_t *inodes_total, ++ uint32_t *reserved, uint32_t *flags) ++{ ++ struct path path; ++ int ret; ++ ++ ret = user_lpath(name, &path); ++ if (!ret) { ++ struct super_block *sb; ++ struct dl_info *dli; ++ ++ ret = -EINVAL; ++ if (!path.dentry->d_inode) ++ goto out_release; ++ if (!(sb = path.dentry->d_inode->i_sb)) ++ goto out_release; ++ ++ ret = -ESRCH; ++ dli = locate_dl_info(sb, id); ++ if (!dli) ++ goto out_release; ++ ++ spin_lock(&dli->dl_lock); ++ *inodes_used = dli->dl_inodes_used; ++ *inodes_total = dli->dl_inodes_total; ++ ++ *space_used = dlimit_space_64to32( ++ dli->dl_space_used, flags, DLIMS_USED); ++ ++ if (dli->dl_space_total == DLIM_INFINITY) ++ *space_total = CDLIM_INFINITY; ++ else ++ *space_total = dlimit_space_64to32( ++ dli->dl_space_total, flags, DLIMS_TOTAL); ++ ++ *reserved = 100 - ((dli->dl_nrlmult * 100 + 512) >> 10); ++ spin_unlock(&dli->dl_lock); ++ ++ put_dl_info(dli); ++ ret = -EFAULT; ++ ++ ret = 0; ++ out_release: ++ path_put(&path); ++ } ++ return ret; ++} ++ ++ ++int vc_get_dlimit(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_v0 vc_data; ++ int ret; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = do_get_dlimit(id, vc_data.name, ++ &vc_data.space_used, &vc_data.space_total, ++ &vc_data.inodes_used, &vc_data.inodes_total, ++ &vc_data.reserved, &vc_data.flags); ++ if (ret) ++ return ret; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_get_dlimit_x32(uint32_t id, void __user *data) ++{ ++ struct vcmd_ctx_dlimit_v0_x32 vc_data; ++ int ret; ++ ++ if (copy_from_user(&vc_data, 
data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = do_get_dlimit(id, compat_ptr(vc_data.name_ptr), ++ &vc_data.space_used, &vc_data.space_total, ++ &vc_data.inodes_used, &vc_data.inodes_total, ++ &vc_data.reserved, &vc_data.flags); ++ if (ret) ++ return ret; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++#endif /* CONFIG_COMPAT */ ++ ++ ++void vx_vsi_statfs(struct super_block *sb, struct kstatfs *buf) ++{ ++ struct dl_info *dli; ++ __u64 blimit, bfree, bavail; ++ __u32 ifree; ++ ++ dli = locate_dl_info(sb, dx_current_tag()); ++ if (!dli) ++ return; ++ ++ spin_lock(&dli->dl_lock); ++ if (dli->dl_inodes_total == (unsigned long)DLIM_INFINITY) ++ goto no_ilim; ++ ++ /* reduce max inodes available to limit */ ++ if (buf->f_files > dli->dl_inodes_total) ++ buf->f_files = dli->dl_inodes_total; ++ ++ ifree = dli->dl_inodes_total - dli->dl_inodes_used; ++ /* reduce free inodes to min */ ++ if (ifree < buf->f_ffree) ++ buf->f_ffree = ifree; ++ ++no_ilim: ++ if (dli->dl_space_total == DLIM_INFINITY) ++ goto no_blim; ++ ++ blimit = dli->dl_space_total >> sb->s_blocksize_bits; ++ ++ if (dli->dl_space_total < dli->dl_space_used) ++ bfree = 0; ++ else ++ bfree = (dli->dl_space_total - dli->dl_space_used) ++ >> sb->s_blocksize_bits; ++ ++ bavail = ((dli->dl_space_total >> 10) * dli->dl_nrlmult); ++ if (bavail < dli->dl_space_used) ++ bavail = 0; ++ else ++ bavail = (bavail - dli->dl_space_used) ++ >> sb->s_blocksize_bits; ++ ++ /* reduce max space available to limit */ ++ if (buf->f_blocks > blimit) ++ buf->f_blocks = blimit; ++ ++ /* reduce free space to min */ ++ if (bfree < buf->f_bfree) ++ buf->f_bfree = bfree; ++ ++ /* reduce avail space to min */ ++ if (bavail < buf->f_bavail) ++ buf->f_bavail = bavail; ++ ++no_blim: ++ spin_unlock(&dli->dl_lock); ++ put_dl_info(dli); ++ ++ return; ++} ++ ++#include ++ ++EXPORT_SYMBOL_GPL(locate_dl_info); ++EXPORT_SYMBOL_GPL(rcu_free_dl_info); ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/helper.c linux-3.3.8-vs2.3.3.4/kernel/vserver/helper.c +--- linux-3.3.8/kernel/vserver/helper.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/helper.c 2012-05-21 01:22:37.000000000 +0200 +@@ -0,0 +1,228 @@ ++/* ++ * linux/kernel/vserver/helper.c ++ * ++ * Virtual Context Support ++ * ++ * Copyright (C) 2004-2007 Herbert Pötzl ++ * ++ * V0.01 basic helper ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++ ++char vshelper_path[255] = "/sbin/vshelper"; ++ ++static int vshelper_init(struct subprocess_info *info, struct cred *new_cred) ++{ ++ current->flags &= ~PF_THREAD_BOUND; ++ return 0; ++} ++ ++static int do_vshelper(char *name, char *argv[], char *envp[], int sync) ++{ ++ int ret; ++ ++ if ((ret = call_usermodehelper_fns(name, argv, envp, sync, ++ vshelper_init, NULL, NULL))) { ++ printk(KERN_WARNING "%s: (%s %s) returned %s with %d\n", ++ name, argv[1], argv[2], ++ sync ? "sync" : "async", ret); ++ } ++ vxdprintk(VXD_CBIT(switch, 4), ++ "%s: (%s %s) returned %s with %d", ++ name, argv[1], argv[2], sync ? "sync" : "async", ret); ++ return ret; ++} ++ ++/* ++ * vshelper path is set via /proc/sys ++ * invoked by vserver sys_reboot(), with ++ * the following arguments ++ * ++ * argv [0] = vshelper_path; ++ * argv [1] = action: "restart", "halt", "poweroff", ... 
++ * argv [2] = context identifier ++ * ++ * envp [*] = type-specific parameters ++ */ ++ ++long vs_reboot_helper(struct vx_info *vxi, int cmd, void __user *arg) ++{ ++ char id_buf[8], cmd_buf[16]; ++ char uid_buf[16], pid_buf[16]; ++ int ret; ++ ++ char *argv[] = {vshelper_path, NULL, id_buf, 0}; ++ char *envp[] = {"HOME=/", "TERM=linux", ++ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", ++ uid_buf, pid_buf, cmd_buf, 0}; ++ ++ if (vx_info_state(vxi, VXS_HELPER)) ++ return -EAGAIN; ++ vxi->vx_state |= VXS_HELPER; ++ ++ snprintf(id_buf, sizeof(id_buf)-1, "%d", vxi->vx_id); ++ ++ snprintf(cmd_buf, sizeof(cmd_buf)-1, "VS_CMD=%08x", cmd); ++ snprintf(uid_buf, sizeof(uid_buf)-1, "VS_UID=%d", current_uid()); ++ snprintf(pid_buf, sizeof(pid_buf)-1, "VS_PID=%d", current->pid); ++ ++ switch (cmd) { ++ case LINUX_REBOOT_CMD_RESTART: ++ argv[1] = "restart"; ++ break; ++ ++ case LINUX_REBOOT_CMD_HALT: ++ argv[1] = "halt"; ++ break; ++ ++ case LINUX_REBOOT_CMD_POWER_OFF: ++ argv[1] = "poweroff"; ++ break; ++ ++ case LINUX_REBOOT_CMD_SW_SUSPEND: ++ argv[1] = "swsusp"; ++ break; ++ ++ case LINUX_REBOOT_CMD_OOM: ++ argv[1] = "oom"; ++ break; ++ ++ default: ++ vxi->vx_state &= ~VXS_HELPER; ++ return 0; ++ } ++ ++ ret = do_vshelper(vshelper_path, argv, envp, 0); ++ vxi->vx_state &= ~VXS_HELPER; ++ __wakeup_vx_info(vxi); ++ return (ret) ? -EPERM : 0; ++} ++ ++ ++long vs_reboot(unsigned int cmd, void __user *arg) ++{ ++ struct vx_info *vxi = current_vx_info(); ++ long ret = 0; ++ ++ vxdprintk(VXD_CBIT(misc, 5), ++ "vs_reboot(%p[#%d],%u)", ++ vxi, vxi ? vxi->vx_id : 0, cmd); ++ ++ ret = vs_reboot_helper(vxi, cmd, arg); ++ if (ret) ++ return ret; ++ ++ vxi->reboot_cmd = cmd; ++ if (vx_info_flags(vxi, VXF_REBOOT_KILL, 0)) { ++ switch (cmd) { ++ case LINUX_REBOOT_CMD_RESTART: ++ case LINUX_REBOOT_CMD_HALT: ++ case LINUX_REBOOT_CMD_POWER_OFF: ++ vx_info_kill(vxi, 0, SIGKILL); ++ vx_info_kill(vxi, 1, SIGKILL); ++ default: ++ break; ++ } ++ } ++ return 0; ++} ++ ++long vs_oom_action(unsigned int cmd) ++{ ++ struct vx_info *vxi = current_vx_info(); ++ long ret = 0; ++ ++ vxdprintk(VXD_CBIT(misc, 5), ++ "vs_oom_action(%p[#%d],%u)", ++ vxi, vxi ? 
vxi->vx_id : 0, cmd); ++ ++ ret = vs_reboot_helper(vxi, cmd, NULL); ++ if (ret) ++ return ret; ++ ++ vxi->reboot_cmd = cmd; ++ if (vx_info_flags(vxi, VXF_REBOOT_KILL, 0)) { ++ vx_info_kill(vxi, 0, SIGKILL); ++ vx_info_kill(vxi, 1, SIGKILL); ++ } ++ return 0; ++} ++ ++/* ++ * argv [0] = vshelper_path; ++ * argv [1] = action: "startup", "shutdown" ++ * argv [2] = context identifier ++ * ++ * envp [*] = type-specific parameters ++ */ ++ ++long vs_state_change(struct vx_info *vxi, unsigned int cmd) ++{ ++ char id_buf[8], cmd_buf[16]; ++ char *argv[] = {vshelper_path, NULL, id_buf, 0}; ++ char *envp[] = {"HOME=/", "TERM=linux", ++ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", cmd_buf, 0}; ++ ++ if (!vx_info_flags(vxi, VXF_SC_HELPER, 0)) ++ return 0; ++ ++ snprintf(id_buf, sizeof(id_buf)-1, "%d", vxi->vx_id); ++ snprintf(cmd_buf, sizeof(cmd_buf)-1, "VS_CMD=%08x", cmd); ++ ++ switch (cmd) { ++ case VSC_STARTUP: ++ argv[1] = "startup"; ++ break; ++ case VSC_SHUTDOWN: ++ argv[1] = "shutdown"; ++ break; ++ default: ++ return 0; ++ } ++ ++ return do_vshelper(vshelper_path, argv, envp, 1); ++} ++ ++ ++/* ++ * argv [0] = vshelper_path; ++ * argv [1] = action: "netup", "netdown" ++ * argv [2] = context identifier ++ * ++ * envp [*] = type-specific parameters ++ */ ++ ++long vs_net_change(struct nx_info *nxi, unsigned int cmd) ++{ ++ char id_buf[8], cmd_buf[16]; ++ char *argv[] = {vshelper_path, NULL, id_buf, 0}; ++ char *envp[] = {"HOME=/", "TERM=linux", ++ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", cmd_buf, 0}; ++ ++ if (!nx_info_flags(nxi, NXF_SC_HELPER, 0)) ++ return 0; ++ ++ snprintf(id_buf, sizeof(id_buf)-1, "%d", nxi->nx_id); ++ snprintf(cmd_buf, sizeof(cmd_buf)-1, "VS_CMD=%08x", cmd); ++ ++ switch (cmd) { ++ case VSC_NETUP: ++ argv[1] = "netup"; ++ break; ++ case VSC_NETDOWN: ++ argv[1] = "netdown"; ++ break; ++ default: ++ return 0; ++ } ++ ++ return do_vshelper(vshelper_path, argv, envp, 1); ++} ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/history.c linux-3.3.8-vs2.3.3.4/kernel/vserver/history.c +--- linux-3.3.8/kernel/vserver/history.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/history.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,258 @@ ++/* ++ * kernel/vserver/history.c ++ * ++ * Virtual Context History Backtrace ++ * ++ * Copyright (C) 2004-2007 Herbert Pötzl ++ * ++ * V0.01 basic structure ++ * V0.02 hash/unhash and trace ++ * V0.03 preemption fixes ++ * ++ */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++ ++#ifdef CONFIG_VSERVER_HISTORY ++#define VXH_SIZE CONFIG_VSERVER_HISTORY_SIZE ++#else ++#define VXH_SIZE 64 ++#endif ++ ++struct _vx_history { ++ unsigned int counter; ++ ++ struct _vx_hist_entry entry[VXH_SIZE + 1]; ++}; ++ ++ ++DEFINE_PER_CPU(struct _vx_history, vx_history_buffer); ++ ++unsigned volatile int vxh_active = 1; ++ ++static atomic_t sequence = ATOMIC_INIT(0); ++ ++ ++/* vxh_advance() ++ ++ * requires disabled preemption */ ++ ++struct _vx_hist_entry *vxh_advance(void *loc) ++{ ++ unsigned int cpu = smp_processor_id(); ++ struct _vx_history *hist = &per_cpu(vx_history_buffer, cpu); ++ struct _vx_hist_entry *entry; ++ unsigned int index; ++ ++ index = vxh_active ? 
(hist->counter++ % VXH_SIZE) : VXH_SIZE; ++ entry = &hist->entry[index]; ++ ++ entry->seq = atomic_inc_return(&sequence); ++ entry->loc = loc; ++ return entry; ++} ++ ++EXPORT_SYMBOL_GPL(vxh_advance); ++ ++ ++#define VXH_LOC_FMTS "(#%04x,*%d):%p" ++ ++#define VXH_LOC_ARGS(e) (e)->seq, cpu, (e)->loc ++ ++ ++#define VXH_VXI_FMTS "%p[#%d,%d.%d]" ++ ++#define VXH_VXI_ARGS(e) (e)->vxi.ptr, \ ++ (e)->vxi.ptr ? (e)->vxi.xid : 0, \ ++ (e)->vxi.ptr ? (e)->vxi.usecnt : 0, \ ++ (e)->vxi.ptr ? (e)->vxi.tasks : 0 ++ ++void vxh_dump_entry(struct _vx_hist_entry *e, unsigned cpu) ++{ ++ switch (e->type) { ++ case VXH_THROW_OOPS: ++ printk( VXH_LOC_FMTS " oops \n", VXH_LOC_ARGS(e)); ++ break; ++ ++ case VXH_GET_VX_INFO: ++ case VXH_PUT_VX_INFO: ++ printk( VXH_LOC_FMTS " %s_vx_info " VXH_VXI_FMTS "\n", ++ VXH_LOC_ARGS(e), ++ (e->type == VXH_GET_VX_INFO) ? "get" : "put", ++ VXH_VXI_ARGS(e)); ++ break; ++ ++ case VXH_INIT_VX_INFO: ++ case VXH_SET_VX_INFO: ++ case VXH_CLR_VX_INFO: ++ printk( VXH_LOC_FMTS " %s_vx_info " VXH_VXI_FMTS " @%p\n", ++ VXH_LOC_ARGS(e), ++ (e->type == VXH_INIT_VX_INFO) ? "init" : ++ ((e->type == VXH_SET_VX_INFO) ? "set" : "clr"), ++ VXH_VXI_ARGS(e), e->sc.data); ++ break; ++ ++ case VXH_CLAIM_VX_INFO: ++ case VXH_RELEASE_VX_INFO: ++ printk( VXH_LOC_FMTS " %s_vx_info " VXH_VXI_FMTS " @%p\n", ++ VXH_LOC_ARGS(e), ++ (e->type == VXH_CLAIM_VX_INFO) ? "claim" : "release", ++ VXH_VXI_ARGS(e), e->sc.data); ++ break; ++ ++ case VXH_ALLOC_VX_INFO: ++ case VXH_DEALLOC_VX_INFO: ++ printk( VXH_LOC_FMTS " %s_vx_info " VXH_VXI_FMTS "\n", ++ VXH_LOC_ARGS(e), ++ (e->type == VXH_ALLOC_VX_INFO) ? "alloc" : "dealloc", ++ VXH_VXI_ARGS(e)); ++ break; ++ ++ case VXH_HASH_VX_INFO: ++ case VXH_UNHASH_VX_INFO: ++ printk( VXH_LOC_FMTS " __%s_vx_info " VXH_VXI_FMTS "\n", ++ VXH_LOC_ARGS(e), ++ (e->type == VXH_HASH_VX_INFO) ? "hash" : "unhash", ++ VXH_VXI_ARGS(e)); ++ break; ++ ++ case VXH_LOC_VX_INFO: ++ case VXH_LOOKUP_VX_INFO: ++ case VXH_CREATE_VX_INFO: ++ printk( VXH_LOC_FMTS " __%s_vx_info [#%d] -> " VXH_VXI_FMTS "\n", ++ VXH_LOC_ARGS(e), ++ (e->type == VXH_CREATE_VX_INFO) ? "create" : ++ ((e->type == VXH_LOC_VX_INFO) ? "loc" : "lookup"), ++ e->ll.arg, VXH_VXI_ARGS(e)); ++ break; ++ } ++} ++ ++static void __vxh_dump_history(void) ++{ ++ unsigned int i, cpu; ++ ++ printk("History:\tSEQ: %8x\tNR_CPUS: %d\n", ++ atomic_read(&sequence), NR_CPUS); ++ ++ for (i = 0; i < VXH_SIZE; i++) { ++ for_each_online_cpu(cpu) { ++ struct _vx_history *hist = ++ &per_cpu(vx_history_buffer, cpu); ++ unsigned int index = (hist->counter - i) % VXH_SIZE; ++ struct _vx_hist_entry *entry = &hist->entry[index]; ++ ++ vxh_dump_entry(entry, cpu); ++ } ++ } ++} ++ ++void vxh_dump_history(void) ++{ ++ vxh_active = 0; ++#ifdef CONFIG_SMP ++ local_irq_enable(); ++ smp_send_stop(); ++ local_irq_disable(); ++#endif ++ __vxh_dump_history(); ++} ++ ++ ++/* vserver syscall commands below here */ ++ ++ ++int vc_dump_history(uint32_t id) ++{ ++ vxh_active = 0; ++ __vxh_dump_history(); ++ vxh_active = 1; ++ ++ return 0; ++} ++ ++ ++int do_read_history(struct __user _vx_hist_entry *data, ++ int cpu, uint32_t *index, uint32_t *count) ++{ ++ int pos, ret = 0; ++ struct _vx_history *hist = &per_cpu(vx_history_buffer, cpu); ++ int end = hist->counter; ++ int start = end - VXH_SIZE + 2; ++ int idx = *index; ++ ++ /* special case: get current pos */ ++ if (!*count) { ++ *index = end; ++ return 0; ++ } ++ ++ /* have we lost some data? 
*/ ++ if (idx < start) ++ idx = start; ++ ++ for (pos = 0; (pos < *count) && (idx < end); pos++, idx++) { ++ struct _vx_hist_entry *entry = ++ &hist->entry[idx % VXH_SIZE]; ++ ++ /* send entry to userspace */ ++ ret = copy_to_user(&data[pos], entry, sizeof(*entry)); ++ if (ret) ++ break; ++ } ++ /* save new index and count */ ++ *index = idx; ++ *count = pos; ++ return ret ? ret : (*index < end); ++} ++ ++int vc_read_history(uint32_t id, void __user *data) ++{ ++ struct vcmd_read_history_v0 vc_data; ++ int ret; ++ ++ if (id >= NR_CPUS) ++ return -EINVAL; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = do_read_history((struct __user _vx_hist_entry *)vc_data.data, ++ id, &vc_data.index, &vc_data.count); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return ret; ++} ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_read_history_x32(uint32_t id, void __user *data) ++{ ++ struct vcmd_read_history_v0_x32 vc_data; ++ int ret; ++ ++ if (id >= NR_CPUS) ++ return -EINVAL; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = do_read_history((struct __user _vx_hist_entry *) ++ compat_ptr(vc_data.data_ptr), ++ id, &vc_data.index, &vc_data.count); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return ret; ++} ++ ++#endif /* CONFIG_COMPAT */ ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/inet.c linux-3.3.8-vs2.3.3.4/kernel/vserver/inet.c +--- linux-3.3.8/kernel/vserver/inet.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/inet.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,226 @@ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++int nx_v4_addr_conflict(struct nx_info *nxi1, struct nx_info *nxi2) ++{ ++ int ret = 0; ++ ++ if (!nxi1 || !nxi2 || nxi1 == nxi2) ++ ret = 1; ++ else { ++ struct nx_addr_v4 *ptr; ++ ++ for (ptr = &nxi1->v4; ptr; ptr = ptr->next) { ++ if (v4_nx_addr_in_nx_info(nxi2, ptr, -1)) { ++ ret = 1; ++ break; ++ } ++ } ++ } ++ ++ vxdprintk(VXD_CBIT(net, 2), ++ "nx_v4_addr_conflict(%p,%p): %d", ++ nxi1, nxi2, ret); ++ ++ return ret; ++} ++ ++ ++#ifdef CONFIG_IPV6 ++ ++int nx_v6_addr_conflict(struct nx_info *nxi1, struct nx_info *nxi2) ++{ ++ int ret = 0; ++ ++ if (!nxi1 || !nxi2 || nxi1 == nxi2) ++ ret = 1; ++ else { ++ struct nx_addr_v6 *ptr; ++ ++ for (ptr = &nxi1->v6; ptr; ptr = ptr->next) { ++ if (v6_nx_addr_in_nx_info(nxi2, ptr, -1)) { ++ ret = 1; ++ break; ++ } ++ } ++ } ++ ++ vxdprintk(VXD_CBIT(net, 2), ++ "nx_v6_addr_conflict(%p,%p): %d", ++ nxi1, nxi2, ret); ++ ++ return ret; ++} ++ ++#endif ++ ++int v4_dev_in_nx_info(struct net_device *dev, struct nx_info *nxi) ++{ ++ struct in_device *in_dev; ++ struct in_ifaddr **ifap; ++ struct in_ifaddr *ifa; ++ int ret = 0; ++ ++ if (!dev) ++ goto out; ++ in_dev = in_dev_get(dev); ++ if (!in_dev) ++ goto out; ++ ++ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; ++ ifap = &ifa->ifa_next) { ++ if (v4_addr_in_nx_info(nxi, ifa->ifa_local, NXA_MASK_SHOW)) { ++ ret = 1; ++ break; ++ } ++ } ++ in_dev_put(in_dev); ++out: ++ return ret; ++} ++ ++ ++#ifdef CONFIG_IPV6 ++ ++int v6_dev_in_nx_info(struct net_device *dev, struct nx_info *nxi) ++{ ++ struct inet6_dev *in_dev; ++ struct inet6_ifaddr *ifa; ++ int ret = 0; ++ ++ if (!dev) ++ goto out; ++ in_dev = in6_dev_get(dev); ++ if (!in_dev) ++ goto out; ++ ++ // for (ifap = &in_dev->addr_list; (ifa = *ifap) != NULL; ++ list_for_each_entry(ifa, &in_dev->addr_list, if_list) { ++ if 
(v6_addr_in_nx_info(nxi, &ifa->addr, -1)) { ++ ret = 1; ++ break; ++ } ++ } ++ in6_dev_put(in_dev); ++out: ++ return ret; ++} ++ ++#endif ++ ++int dev_in_nx_info(struct net_device *dev, struct nx_info *nxi) ++{ ++ int ret = 1; ++ ++ if (!nxi) ++ goto out; ++ if (nxi->v4.type && v4_dev_in_nx_info(dev, nxi)) ++ goto out; ++#ifdef CONFIG_IPV6 ++ ret = 2; ++ if (nxi->v6.type && v6_dev_in_nx_info(dev, nxi)) ++ goto out; ++#endif ++ ret = 0; ++out: ++ vxdprintk(VXD_CBIT(net, 3), ++ "dev_in_nx_info(%p,%p[#%d]) = %d", ++ dev, nxi, nxi ? nxi->nx_id : 0, ret); ++ return ret; ++} ++ ++struct rtable *ip_v4_find_src(struct net *net, struct nx_info *nxi, ++ struct flowi4 *fl4) ++{ ++ struct rtable *rt; ++ ++ if (!nxi) ++ return NULL; ++ ++ /* FIXME: handle lback only case */ ++ if (!NX_IPV4(nxi)) ++ return ERR_PTR(-EPERM); ++ ++ vxdprintk(VXD_CBIT(net, 4), ++ "ip_v4_find_src(%p[#%u]) " NIPQUAD_FMT " -> " NIPQUAD_FMT, ++ nxi, nxi ? nxi->nx_id : 0, ++ NIPQUAD(fl4->saddr), NIPQUAD(fl4->daddr)); ++ ++ /* single IP is unconditional */ ++ if (nx_info_flags(nxi, NXF_SINGLE_IP, 0) && ++ (fl4->saddr == INADDR_ANY)) ++ fl4->saddr = nxi->v4.ip[0].s_addr; ++ ++ if (fl4->saddr == INADDR_ANY) { ++ struct nx_addr_v4 *ptr; ++ __be32 found = 0; ++ ++ rt = __ip_route_output_key(net, fl4); ++ if (!IS_ERR(rt)) { ++ found = fl4->saddr; ++ ip_rt_put(rt); ++ vxdprintk(VXD_CBIT(net, 4), ++ "ip_v4_find_src(%p[#%u]) rok[%u]: " NIPQUAD_FMT, ++ nxi, nxi ? nxi->nx_id : 0, fl4->flowi4_oif, NIPQUAD(found)); ++ if (v4_addr_in_nx_info(nxi, found, NXA_MASK_BIND)) ++ goto found; ++ } ++ ++ for (ptr = &nxi->v4; ptr; ptr = ptr->next) { ++ __be32 primary = ptr->ip[0].s_addr; ++ __be32 mask = ptr->mask.s_addr; ++ __be32 neta = primary & mask; ++ ++ vxdprintk(VXD_CBIT(net, 4), "ip_v4_find_src(%p[#%u]) chk: " ++ NIPQUAD_FMT "/" NIPQUAD_FMT "/" NIPQUAD_FMT, ++ nxi, nxi ? nxi->nx_id : 0, NIPQUAD(primary), ++ NIPQUAD(mask), NIPQUAD(neta)); ++ if ((found & mask) != neta) ++ continue; ++ ++ fl4->saddr = primary; ++ rt = __ip_route_output_key(net, fl4); ++ vxdprintk(VXD_CBIT(net, 4), ++ "ip_v4_find_src(%p[#%u]) rok[%u]: " NIPQUAD_FMT, ++ nxi, nxi ? nxi->nx_id : 0, fl4->flowi4_oif, NIPQUAD(primary)); ++ if (!IS_ERR(rt)) { ++ found = fl4->saddr; ++ ip_rt_put(rt); ++ if (found == primary) ++ goto found; ++ } ++ } ++ /* still no source ip? */ ++ found = ipv4_is_loopback(fl4->daddr) ++ ? 
IPI_LOOPBACK : nxi->v4.ip[0].s_addr; ++ found: ++ /* assign src ip to flow */ ++ fl4->saddr = found; ++ ++ } else { ++ if (!v4_addr_in_nx_info(nxi, fl4->saddr, NXA_MASK_BIND)) ++ return ERR_PTR(-EPERM); ++ } ++ ++ if (nx_info_flags(nxi, NXF_LBACK_REMAP, 0)) { ++ if (ipv4_is_loopback(fl4->daddr)) ++ fl4->daddr = nxi->v4_lback.s_addr; ++ if (ipv4_is_loopback(fl4->saddr)) ++ fl4->saddr = nxi->v4_lback.s_addr; ++ } else if (ipv4_is_loopback(fl4->daddr) && ++ !nx_info_flags(nxi, NXF_LBACK_ALLOW, 0)) ++ return ERR_PTR(-EPERM); ++ ++ return NULL; ++} ++ ++EXPORT_SYMBOL_GPL(ip_v4_find_src); ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/init.c linux-3.3.8-vs2.3.3.4/kernel/vserver/init.c +--- linux-3.3.8/kernel/vserver/init.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/init.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,45 @@ ++/* ++ * linux/kernel/init.c ++ * ++ * Virtual Server Init ++ * ++ * Copyright (C) 2004-2007 Herbert Pötzl ++ * ++ * V0.01 basic structure ++ * ++ */ ++ ++#include ++ ++int vserver_register_sysctl(void); ++void vserver_unregister_sysctl(void); ++ ++ ++static int __init init_vserver(void) ++{ ++ int ret = 0; ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ vserver_register_sysctl(); ++#endif ++ return ret; ++} ++ ++ ++static void __exit exit_vserver(void) ++{ ++ ++#ifdef CONFIG_VSERVER_DEBUG ++ vserver_unregister_sysctl(); ++#endif ++ return; ++} ++ ++/* FIXME: GFP_ZONETYPES gone ++long vx_slab[GFP_ZONETYPES]; */ ++long vx_area; ++ ++ ++module_init(init_vserver); ++module_exit(exit_vserver); ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/inode.c linux-3.3.8-vs2.3.3.4/kernel/vserver/inode.c +--- linux-3.3.8/kernel/vserver/inode.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/inode.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,437 @@ ++/* ++ * linux/kernel/vserver/inode.c ++ * ++ * Virtual Server: File System Support ++ * ++ * Copyright (C) 2004-2007 Herbert Pötzl ++ * ++ * V0.01 separated from vcontext V0.05 ++ * V0.02 moved to tag (instead of xid) ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++ ++static int __vc_get_iattr(struct inode *in, uint32_t *tag, uint32_t *flags, uint32_t *mask) ++{ ++ struct proc_dir_entry *entry; ++ ++ if (!in || !in->i_sb) ++ return -ESRCH; ++ ++ *flags = IATTR_TAG ++ | (IS_IMMUTABLE(in) ? IATTR_IMMUTABLE : 0) ++ | (IS_IXUNLINK(in) ? IATTR_IXUNLINK : 0) ++ | (IS_BARRIER(in) ? IATTR_BARRIER : 0) ++ | (IS_COW(in) ? IATTR_COW : 0); ++ *mask = IATTR_IXUNLINK | IATTR_IMMUTABLE | IATTR_COW; ++ ++ if (S_ISDIR(in->i_mode)) ++ *mask |= IATTR_BARRIER; ++ ++ if (IS_TAGGED(in)) { ++ *tag = in->i_tag; ++ *mask |= IATTR_TAG; ++ } ++ ++ switch (in->i_sb->s_magic) { ++ case PROC_SUPER_MAGIC: ++ entry = PROC_I(in)->pde; ++ ++ /* check for specific inodes? 
*/ ++ if (entry) ++ *mask |= IATTR_FLAGS; ++ if (entry) ++ *flags |= (entry->vx_flags & IATTR_FLAGS); ++ else ++ *flags |= (PROC_I(in)->vx_flags & IATTR_FLAGS); ++ break; ++ ++ case DEVPTS_SUPER_MAGIC: ++ *tag = in->i_tag; ++ *mask |= IATTR_TAG; ++ break; ++ ++ default: ++ break; ++ } ++ return 0; ++} ++ ++int vc_get_iattr(void __user *data) ++{ ++ struct path path; ++ struct vcmd_ctx_iattr_v1 vc_data = { .tag = -1 }; ++ int ret; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = user_lpath(vc_data.name, &path); ++ if (!ret) { ++ ret = __vc_get_iattr(path.dentry->d_inode, ++ &vc_data.tag, &vc_data.flags, &vc_data.mask); ++ path_put(&path); ++ } ++ if (ret) ++ return ret; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ ret = -EFAULT; ++ return ret; ++} ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_get_iattr_x32(void __user *data) ++{ ++ struct path path; ++ struct vcmd_ctx_iattr_v1_x32 vc_data = { .tag = -1 }; ++ int ret; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = user_lpath(compat_ptr(vc_data.name_ptr), &path); ++ if (!ret) { ++ ret = __vc_get_iattr(path.dentry->d_inode, ++ &vc_data.tag, &vc_data.flags, &vc_data.mask); ++ path_put(&path); ++ } ++ if (ret) ++ return ret; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ ret = -EFAULT; ++ return ret; ++} ++ ++#endif /* CONFIG_COMPAT */ ++ ++ ++int vc_fget_iattr(uint32_t fd, void __user *data) ++{ ++ struct file *filp; ++ struct vcmd_ctx_fiattr_v0 vc_data = { .tag = -1 }; ++ int ret; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ filp = fget(fd); ++ if (!filp || !filp->f_dentry || !filp->f_dentry->d_inode) ++ return -EBADF; ++ ++ ret = __vc_get_iattr(filp->f_dentry->d_inode, ++ &vc_data.tag, &vc_data.flags, &vc_data.mask); ++ ++ fput(filp); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ ret = -EFAULT; ++ return ret; ++} ++ ++ ++static int __vc_set_iattr(struct dentry *de, uint32_t *tag, uint32_t *flags, uint32_t *mask) ++{ ++ struct inode *in = de->d_inode; ++ int error = 0, is_proc = 0, has_tag = 0; ++ struct iattr attr = { 0 }; ++ ++ if (!in || !in->i_sb) ++ return -ESRCH; ++ ++ is_proc = (in->i_sb->s_magic == PROC_SUPER_MAGIC); ++ if ((*mask & IATTR_FLAGS) && !is_proc) ++ return -EINVAL; ++ ++ has_tag = IS_TAGGED(in) || ++ (in->i_sb->s_magic == DEVPTS_SUPER_MAGIC); ++ if ((*mask & IATTR_TAG) && !has_tag) ++ return -EINVAL; ++ ++ mutex_lock(&in->i_mutex); ++ if (*mask & IATTR_TAG) { ++ attr.ia_tag = *tag; ++ attr.ia_valid |= ATTR_TAG; ++ } ++ ++ if (*mask & IATTR_FLAGS) { ++ struct proc_dir_entry *entry = PROC_I(in)->pde; ++ unsigned int iflags = PROC_I(in)->vx_flags; ++ ++ iflags = (iflags & ~(*mask & IATTR_FLAGS)) ++ | (*flags & IATTR_FLAGS); ++ PROC_I(in)->vx_flags = iflags; ++ if (entry) ++ entry->vx_flags = iflags; ++ } ++ ++ if (*mask & (IATTR_IMMUTABLE | IATTR_IXUNLINK | ++ IATTR_BARRIER | IATTR_COW)) { ++ int iflags = in->i_flags; ++ int vflags = in->i_vflags; ++ ++ if (*mask & IATTR_IMMUTABLE) { ++ if (*flags & IATTR_IMMUTABLE) ++ iflags |= S_IMMUTABLE; ++ else ++ iflags &= ~S_IMMUTABLE; ++ } ++ if (*mask & IATTR_IXUNLINK) { ++ if (*flags & IATTR_IXUNLINK) ++ iflags |= S_IXUNLINK; ++ else ++ iflags &= ~S_IXUNLINK; ++ } ++ if (S_ISDIR(in->i_mode) && (*mask & IATTR_BARRIER)) { ++ if (*flags & IATTR_BARRIER) ++ vflags |= V_BARRIER; ++ else ++ vflags &= ~V_BARRIER; ++ } ++ if (S_ISREG(in->i_mode) && (*mask & IATTR_COW)) { ++ if (*flags & IATTR_COW) ++ vflags |= V_COW; ++ else ++ vflags &= 
~V_COW; ++ } ++ if (in->i_op && in->i_op->sync_flags) { ++ error = in->i_op->sync_flags(in, iflags, vflags); ++ if (error) ++ goto out; ++ } ++ } ++ ++ if (attr.ia_valid) { ++ if (in->i_op && in->i_op->setattr) ++ error = in->i_op->setattr(de, &attr); ++ else { ++ error = inode_change_ok(in, &attr); ++ if (!error) { ++ setattr_copy(in, &attr); ++ mark_inode_dirty(in); ++ } ++ } ++ } ++ ++out: ++ mutex_unlock(&in->i_mutex); ++ return error; ++} ++ ++int vc_set_iattr(void __user *data) ++{ ++ struct path path; ++ struct vcmd_ctx_iattr_v1 vc_data; ++ int ret; ++ ++ if (!capable(CAP_LINUX_IMMUTABLE)) ++ return -EPERM; ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = user_lpath(vc_data.name, &path); ++ if (!ret) { ++ ret = __vc_set_iattr(path.dentry, ++ &vc_data.tag, &vc_data.flags, &vc_data.mask); ++ path_put(&path); ++ } ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ ret = -EFAULT; ++ return ret; ++} ++ ++#ifdef CONFIG_COMPAT ++ ++int vc_set_iattr_x32(void __user *data) ++{ ++ struct path path; ++ struct vcmd_ctx_iattr_v1_x32 vc_data; ++ int ret; ++ ++ if (!capable(CAP_LINUX_IMMUTABLE)) ++ return -EPERM; ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = user_lpath(compat_ptr(vc_data.name_ptr), &path); ++ if (!ret) { ++ ret = __vc_set_iattr(path.dentry, ++ &vc_data.tag, &vc_data.flags, &vc_data.mask); ++ path_put(&path); ++ } ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ ret = -EFAULT; ++ return ret; ++} ++ ++#endif /* CONFIG_COMPAT */ ++ ++int vc_fset_iattr(uint32_t fd, void __user *data) ++{ ++ struct file *filp; ++ struct vcmd_ctx_fiattr_v0 vc_data; ++ int ret; ++ ++ if (!capable(CAP_LINUX_IMMUTABLE)) ++ return -EPERM; ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ filp = fget(fd); ++ if (!filp || !filp->f_dentry || !filp->f_dentry->d_inode) ++ return -EBADF; ++ ++ ret = __vc_set_iattr(filp->f_dentry, &vc_data.tag, ++ &vc_data.flags, &vc_data.mask); ++ ++ fput(filp); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return ret; ++} ++ ++ ++enum { Opt_notagcheck, Opt_tag, Opt_notag, Opt_tagid, Opt_err }; ++ ++static match_table_t tokens = { ++ {Opt_notagcheck, "notagcheck"}, ++#ifdef CONFIG_PROPAGATE ++ {Opt_notag, "notag"}, ++ {Opt_tag, "tag"}, ++ {Opt_tagid, "tagid=%u"}, ++#endif ++ {Opt_err, NULL} ++}; ++ ++ ++static void __dx_parse_remove(char *string, char *opt) ++{ ++ char *p = strstr(string, opt); ++ char *q = p; ++ ++ if (p) { ++ while (*q != '\0' && *q != ',') ++ q++; ++ while (*q) ++ *p++ = *q++; ++ while (*p) ++ *p++ = '\0'; ++ } ++} ++ ++int dx_parse_tag(char *string, tag_t *tag, int remove, int *mnt_flags, ++ unsigned long *flags) ++{ ++ int set = 0; ++ substring_t args[MAX_OPT_ARGS]; ++ int token; ++ char *s, *p, *opts; ++#if defined(CONFIG_PROPAGATE) || defined(CONFIG_VSERVER_DEBUG) ++ int option = 0; ++#endif ++ ++ if (!string) ++ return 0; ++ s = kstrdup(string, GFP_KERNEL | GFP_ATOMIC); ++ if (!s) ++ return 0; ++ ++ opts = s; ++ while ((p = strsep(&opts, ",")) != NULL) { ++ token = match_token(p, tokens, args); ++ ++ switch (token) { ++#ifdef CONFIG_PROPAGATE ++ case Opt_tag: ++ if (tag) ++ *tag = 0; ++ if (remove) ++ __dx_parse_remove(s, "tag"); ++ *mnt_flags |= MNT_TAGID; ++ set |= MNT_TAGID; ++ break; ++ case Opt_notag: ++ if (remove) ++ __dx_parse_remove(s, "notag"); ++ *mnt_flags |= MNT_NOTAG; ++ set |= MNT_NOTAG; ++ break; ++ case Opt_tagid: ++ if (tag && !match_int(args, &option)) ++ *tag = option; ++ if 
(remove) ++ __dx_parse_remove(s, "tagid"); ++ *mnt_flags |= MNT_TAGID; ++ set |= MNT_TAGID; ++ break; ++#endif /* CONFIG_PROPAGATE */ ++ case Opt_notagcheck: ++ if (remove) ++ __dx_parse_remove(s, "notagcheck"); ++ *flags |= MS_NOTAGCHECK; ++ set |= MS_NOTAGCHECK; ++ break; ++ } ++ vxdprintk(VXD_CBIT(tag, 7), ++ "dx_parse_tag(" VS_Q("%s") "): %d:#%d", ++ p, token, option); ++ } ++ if (set) ++ strcpy(string, s); ++ kfree(s); ++ return set; ++} ++ ++#ifdef CONFIG_PROPAGATE ++ ++void __dx_propagate_tag(struct nameidata *nd, struct inode *inode) ++{ ++ tag_t new_tag = 0; ++ struct vfsmount *mnt; ++ int propagate; ++ ++ if (!nd) ++ return; ++ mnt = nd->path.mnt; ++ if (!mnt) ++ return; ++ ++ propagate = (mnt->mnt_flags & MNT_TAGID); ++ if (propagate) ++ new_tag = mnt->mnt_tag; ++ ++ vxdprintk(VXD_CBIT(tag, 7), ++ "dx_propagate_tag(%p[#%lu.%d]): %d,%d", ++ inode, inode->i_ino, inode->i_tag, ++ new_tag, (propagate) ? 1 : 0); ++ ++ if (propagate) ++ inode->i_tag = new_tag; ++} ++ ++#include ++ ++EXPORT_SYMBOL_GPL(__dx_propagate_tag); ++ ++#endif /* CONFIG_PROPAGATE */ ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/limit.c linux-3.3.8-vs2.3.3.4/kernel/vserver/limit.c +--- linux-3.3.8/kernel/vserver/limit.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/limit.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,330 @@ ++/* ++ * linux/kernel/vserver/limit.c ++ * ++ * Virtual Server: Context Limits ++ * ++ * Copyright (C) 2004-2010 Herbert Pötzl ++ * ++ * V0.01 broken out from vcontext V0.05 ++ * V0.02 changed vcmds to vxi arg ++ * V0.03 added memory cgroup support ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++ ++const char *vlimit_name[NUM_LIMITS] = { ++ [RLIMIT_CPU] = "CPU", ++ [RLIMIT_NPROC] = "NPROC", ++ [RLIMIT_NOFILE] = "NOFILE", ++ [RLIMIT_LOCKS] = "LOCKS", ++ [RLIMIT_SIGPENDING] = "SIGP", ++ [RLIMIT_MSGQUEUE] = "MSGQ", ++ ++ [VLIMIT_NSOCK] = "NSOCK", ++ [VLIMIT_OPENFD] = "OPENFD", ++ [VLIMIT_SHMEM] = "SHMEM", ++ [VLIMIT_DENTRY] = "DENTRY", ++}; ++ ++EXPORT_SYMBOL_GPL(vlimit_name); ++ ++#define MASK_ENTRY(x) (1 << (x)) ++ ++const struct vcmd_ctx_rlimit_mask_v0 vlimit_mask = { ++ /* minimum */ ++ 0 ++ , /* softlimit */ ++ 0 ++ , /* maximum */ ++ MASK_ENTRY( RLIMIT_NPROC ) | ++ MASK_ENTRY( RLIMIT_NOFILE ) | ++ MASK_ENTRY( RLIMIT_LOCKS ) | ++ MASK_ENTRY( RLIMIT_MSGQUEUE ) | ++ ++ MASK_ENTRY( VLIMIT_NSOCK ) | ++ MASK_ENTRY( VLIMIT_OPENFD ) | ++ MASK_ENTRY( VLIMIT_SHMEM ) | ++ MASK_ENTRY( VLIMIT_DENTRY ) | ++ 0 ++}; ++ /* accounting only */ ++uint32_t account_mask = ++ MASK_ENTRY( VLIMIT_SEMARY ) | ++ MASK_ENTRY( VLIMIT_NSEMS ) | ++ MASK_ENTRY( VLIMIT_MAPPED ) | ++ 0; ++ ++ ++static int is_valid_vlimit(int id) ++{ ++ uint32_t mask = vlimit_mask.minimum | ++ vlimit_mask.softlimit | vlimit_mask.maximum; ++ return mask & (1 << id); ++} ++ ++static int is_accounted_vlimit(int id) ++{ ++ if (is_valid_vlimit(id)) ++ return 1; ++ return account_mask & (1 << id); ++} ++ ++ ++static inline uint64_t vc_get_soft(struct vx_info *vxi, int id) ++{ ++ rlim_t limit = __rlim_soft(&vxi->limit, id); ++ return VX_VLIM(limit); ++} ++ ++static inline uint64_t vc_get_hard(struct vx_info *vxi, int id) ++{ ++ rlim_t limit = __rlim_hard(&vxi->limit, id); ++ return VX_VLIM(limit); ++} ++ ++static int do_get_rlimit(struct vx_info *vxi, uint32_t id, ++ uint64_t *minimum, uint64_t *softlimit, uint64_t *maximum) ++{ ++ if (!is_valid_vlimit(id)) ++ return -EINVAL; ++ ++ if (minimum) ++ *minimum = CRLIM_UNSET; ++ if (softlimit) ++ 
*softlimit = vc_get_soft(vxi, id); ++ if (maximum) ++ *maximum = vc_get_hard(vxi, id); ++ return 0; ++} ++ ++int vc_get_rlimit(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_rlimit_v0 vc_data; ++ int ret; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = do_get_rlimit(vxi, vc_data.id, ++ &vc_data.minimum, &vc_data.softlimit, &vc_data.maximum); ++ if (ret) ++ return ret; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int do_set_rlimit(struct vx_info *vxi, uint32_t id, ++ uint64_t minimum, uint64_t softlimit, uint64_t maximum) ++{ ++ if (!is_valid_vlimit(id)) ++ return -EINVAL; ++ ++ if (maximum != CRLIM_KEEP) ++ __rlim_hard(&vxi->limit, id) = VX_RLIM(maximum); ++ if (softlimit != CRLIM_KEEP) ++ __rlim_soft(&vxi->limit, id) = VX_RLIM(softlimit); ++ ++ /* clamp soft limit */ ++ if (__rlim_soft(&vxi->limit, id) > __rlim_hard(&vxi->limit, id)) ++ __rlim_soft(&vxi->limit, id) = __rlim_hard(&vxi->limit, id); ++ ++ return 0; ++} ++ ++int vc_set_rlimit(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_rlimit_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_rlimit(vxi, vc_data.id, ++ vc_data.minimum, vc_data.softlimit, vc_data.maximum); ++} ++ ++#ifdef CONFIG_IA32_EMULATION ++ ++int vc_set_rlimit_x32(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_rlimit_v0_x32 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_rlimit(vxi, vc_data.id, ++ vc_data.minimum, vc_data.softlimit, vc_data.maximum); ++} ++ ++int vc_get_rlimit_x32(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_rlimit_v0_x32 vc_data; ++ int ret; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ ret = do_get_rlimit(vxi, vc_data.id, ++ &vc_data.minimum, &vc_data.softlimit, &vc_data.maximum); ++ if (ret) ++ return ret; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++#endif /* CONFIG_IA32_EMULATION */ ++ ++ ++int vc_get_rlimit_mask(uint32_t id, void __user *data) ++{ ++ if (copy_to_user(data, &vlimit_mask, sizeof(vlimit_mask))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++static inline void vx_reset_hits(struct _vx_limit *limit) ++{ ++ int lim; ++ ++ for (lim = 0; lim < NUM_LIMITS; lim++) { ++ atomic_set(&__rlim_lhit(limit, lim), 0); ++ } ++} ++ ++int vc_reset_hits(struct vx_info *vxi, void __user *data) ++{ ++ vx_reset_hits(&vxi->limit); ++ return 0; ++} ++ ++static inline void vx_reset_minmax(struct _vx_limit *limit) ++{ ++ rlim_t value; ++ int lim; ++ ++ for (lim = 0; lim < NUM_LIMITS; lim++) { ++ value = __rlim_get(limit, lim); ++ __rlim_rmax(limit, lim) = value; ++ __rlim_rmin(limit, lim) = value; ++ } ++} ++ ++int vc_reset_minmax(struct vx_info *vxi, void __user *data) ++{ ++ vx_reset_minmax(&vxi->limit); ++ return 0; ++} ++ ++ ++int vc_rlimit_stat(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_rlimit_stat_v0 vc_data; ++ struct _vx_limit *limit = &vxi->limit; ++ int id; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ id = vc_data.id; ++ if (!is_accounted_vlimit(id)) ++ return -EINVAL; ++ ++ vx_limit_fixup(limit, id); ++ vc_data.hits = atomic_read(&__rlim_lhit(limit, id)); ++ vc_data.value = __rlim_get(limit, id); ++ vc_data.minimum = __rlim_rmin(limit, id); ++ vc_data.maximum = __rlim_rmax(limit, id); ++ ++ if (copy_to_user(data, &vc_data, 
sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++void vx_vsi_meminfo(struct sysinfo *val) ++{ ++#ifdef CONFIG_CGROUP_MEM_RES_CTLR ++ struct mem_cgroup *mcg = mem_cgroup_from_task(current); ++ u64 res_limit, res_usage; ++ ++ if (!mcg) ++ return; ++ ++ res_limit = mem_cgroup_res_read_u64(mcg, RES_LIMIT); ++ res_usage = mem_cgroup_res_read_u64(mcg, RES_USAGE); ++ ++ if (res_limit != RESOURCE_MAX) ++ val->totalram = (res_limit >> PAGE_SHIFT); ++ val->freeram = val->totalram - (res_usage >> PAGE_SHIFT); ++ val->bufferram = 0; ++ val->totalhigh = 0; ++ val->freehigh = 0; ++#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ ++ return; ++} ++ ++void vx_vsi_swapinfo(struct sysinfo *val) ++{ ++#ifdef CONFIG_CGROUP_MEM_RES_CTLR ++#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP ++ struct mem_cgroup *mcg = mem_cgroup_from_task(current); ++ u64 res_limit, res_usage, memsw_limit, memsw_usage; ++ s64 swap_limit, swap_usage; ++ ++ if (!mcg) ++ return; ++ ++ res_limit = mem_cgroup_res_read_u64(mcg, RES_LIMIT); ++ res_usage = mem_cgroup_res_read_u64(mcg, RES_USAGE); ++ memsw_limit = mem_cgroup_memsw_read_u64(mcg, RES_LIMIT); ++ memsw_usage = mem_cgroup_memsw_read_u64(mcg, RES_USAGE); ++ ++ /* memory unlimited */ ++ if (res_limit == RESOURCE_MAX) ++ return; ++ ++ swap_limit = memsw_limit - res_limit; ++ /* we have a swap limit? */ ++ if (memsw_limit != RESOURCE_MAX) ++ val->totalswap = swap_limit >> PAGE_SHIFT; ++ ++ /* calculate swap part */ ++ swap_usage = (memsw_usage > res_usage) ? ++ memsw_usage - res_usage : 0; ++ ++ /* total shown minus usage gives free swap */ ++ val->freeswap = (swap_usage < swap_limit) ? ++ val->totalswap - (swap_usage >> PAGE_SHIFT) : 0; ++#else /* !CONFIG_CGROUP_MEM_RES_CTLR_SWAP */ ++ val->totalswap = 0; ++ val->freeswap = 0; ++#endif /* !CONFIG_CGROUP_MEM_RES_CTLR_SWAP */ ++#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ ++ return; ++} ++ ++long vx_vsi_cached(struct sysinfo *val) ++{ ++#ifdef CONFIG_CGROUP_MEM_RES_CTLR ++ struct mem_cgroup *mcg = mem_cgroup_from_task(current); ++ ++ return mem_cgroup_stat_read_cache(mcg); ++#else ++ return 0; ++#endif ++} ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/limit_init.h linux-3.3.8-vs2.3.3.4/kernel/vserver/limit_init.h +--- linux-3.3.8/kernel/vserver/limit_init.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/limit_init.h 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,31 @@ ++ ++ ++static inline void vx_info_init_limit(struct _vx_limit *limit) ++{ ++ int lim; ++ ++ for (lim = 0; lim < NUM_LIMITS; lim++) { ++ __rlim_soft(limit, lim) = RLIM_INFINITY; ++ __rlim_hard(limit, lim) = RLIM_INFINITY; ++ __rlim_set(limit, lim, 0); ++ atomic_set(&__rlim_lhit(limit, lim), 0); ++ __rlim_rmin(limit, lim) = 0; ++ __rlim_rmax(limit, lim) = 0; ++ } ++} ++ ++static inline void vx_info_exit_limit(struct _vx_limit *limit) ++{ ++ rlim_t value; ++ int lim; ++ ++ for (lim = 0; lim < NUM_LIMITS; lim++) { ++ if ((1 << lim) & VLIM_NOCHECK) ++ continue; ++ value = __rlim_get(limit, lim); ++ vxwprintk_xid(value, ++ "!!! 
limit: %p[%s,%d] = %ld on exit.", ++ limit, vlimit_name[lim], lim, (long)value); ++ } ++} ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/limit_proc.h linux-3.3.8-vs2.3.3.4/kernel/vserver/limit_proc.h +--- linux-3.3.8/kernel/vserver/limit_proc.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/limit_proc.h 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,57 @@ ++#ifndef _VX_LIMIT_PROC_H ++#define _VX_LIMIT_PROC_H ++ ++#include ++ ++ ++#define VX_LIMIT_FMT ":\t%8ld\t%8ld/%8ld\t%8lld/%8lld\t%6d\n" ++#define VX_LIMIT_TOP \ ++ "Limit\t current\t min/max\t\t soft/hard\t\thits\n" ++ ++#define VX_LIMIT_ARG(r) \ ++ (unsigned long)__rlim_get(limit, r), \ ++ (unsigned long)__rlim_rmin(limit, r), \ ++ (unsigned long)__rlim_rmax(limit, r), \ ++ VX_VLIM(__rlim_soft(limit, r)), \ ++ VX_VLIM(__rlim_hard(limit, r)), \ ++ atomic_read(&__rlim_lhit(limit, r)) ++ ++static inline int vx_info_proc_limit(struct _vx_limit *limit, char *buffer) ++{ ++ vx_limit_fixup(limit, -1); ++ return sprintf(buffer, VX_LIMIT_TOP ++ "PROC" VX_LIMIT_FMT ++ "VM" VX_LIMIT_FMT ++ "VML" VX_LIMIT_FMT ++ "RSS" VX_LIMIT_FMT ++ "ANON" VX_LIMIT_FMT ++ "RMAP" VX_LIMIT_FMT ++ "FILES" VX_LIMIT_FMT ++ "OFD" VX_LIMIT_FMT ++ "LOCKS" VX_LIMIT_FMT ++ "SOCK" VX_LIMIT_FMT ++ "MSGQ" VX_LIMIT_FMT ++ "SHM" VX_LIMIT_FMT ++ "SEMA" VX_LIMIT_FMT ++ "SEMS" VX_LIMIT_FMT ++ "DENT" VX_LIMIT_FMT, ++ VX_LIMIT_ARG(RLIMIT_NPROC), ++ VX_LIMIT_ARG(RLIMIT_AS), ++ VX_LIMIT_ARG(RLIMIT_MEMLOCK), ++ VX_LIMIT_ARG(RLIMIT_RSS), ++ VX_LIMIT_ARG(VLIMIT_ANON), ++ VX_LIMIT_ARG(VLIMIT_MAPPED), ++ VX_LIMIT_ARG(RLIMIT_NOFILE), ++ VX_LIMIT_ARG(VLIMIT_OPENFD), ++ VX_LIMIT_ARG(RLIMIT_LOCKS), ++ VX_LIMIT_ARG(VLIMIT_NSOCK), ++ VX_LIMIT_ARG(RLIMIT_MSGQUEUE), ++ VX_LIMIT_ARG(VLIMIT_SHMEM), ++ VX_LIMIT_ARG(VLIMIT_SEMARY), ++ VX_LIMIT_ARG(VLIMIT_NSEMS), ++ VX_LIMIT_ARG(VLIMIT_DENTRY)); ++} ++ ++#endif /* _VX_LIMIT_PROC_H */ ++ ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/network.c linux-3.3.8-vs2.3.3.4/kernel/vserver/network.c +--- linux-3.3.8/kernel/vserver/network.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/network.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,912 @@ ++/* ++ * linux/kernel/vserver/network.c ++ * ++ * Virtual Server: Network Support ++ * ++ * Copyright (C) 2003-2007 Herbert Pötzl ++ * ++ * V0.01 broken out from vcontext V0.05 ++ * V0.02 cleaned up implementation ++ * V0.03 added equiv nx commands ++ * V0.04 switch to RCU based hash ++ * V0.05 and back to locking again ++ * V0.06 changed vcmds to nxi arg ++ * V0.07 have __create claim() the nxi ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++ ++atomic_t nx_global_ctotal = ATOMIC_INIT(0); ++atomic_t nx_global_cactive = ATOMIC_INIT(0); ++ ++static struct kmem_cache *nx_addr_v4_cachep = NULL; ++static struct kmem_cache *nx_addr_v6_cachep = NULL; ++ ++ ++static int __init init_network(void) ++{ ++ nx_addr_v4_cachep = kmem_cache_create("nx_v4_addr_cache", ++ sizeof(struct nx_addr_v4), 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ nx_addr_v6_cachep = kmem_cache_create("nx_v6_addr_cache", ++ sizeof(struct nx_addr_v6), 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ return 0; ++} ++ ++ ++/* __alloc_nx_addr_v4() */ ++ ++static inline struct nx_addr_v4 *__alloc_nx_addr_v4(void) ++{ ++ struct nx_addr_v4 *nxa = kmem_cache_alloc( ++ nx_addr_v4_cachep, GFP_KERNEL); ++ ++ if (!IS_ERR(nxa)) ++ memset(nxa, 0, sizeof(*nxa)); ++ return nxa; ++} ++ ++/* __dealloc_nx_addr_v4() */ ++ ++static inline void 
__dealloc_nx_addr_v4(struct nx_addr_v4 *nxa) ++{ ++ kmem_cache_free(nx_addr_v4_cachep, nxa); ++} ++ ++/* __dealloc_nx_addr_v4_all() */ ++ ++static inline void __dealloc_nx_addr_v4_all(struct nx_addr_v4 *nxa) ++{ ++ while (nxa) { ++ struct nx_addr_v4 *next = nxa->next; ++ ++ __dealloc_nx_addr_v4(nxa); ++ nxa = next; ++ } ++} ++ ++ ++#ifdef CONFIG_IPV6 ++ ++/* __alloc_nx_addr_v6() */ ++ ++static inline struct nx_addr_v6 *__alloc_nx_addr_v6(void) ++{ ++ struct nx_addr_v6 *nxa = kmem_cache_alloc( ++ nx_addr_v6_cachep, GFP_KERNEL); ++ ++ if (!IS_ERR(nxa)) ++ memset(nxa, 0, sizeof(*nxa)); ++ return nxa; ++} ++ ++/* __dealloc_nx_addr_v6() */ ++ ++static inline void __dealloc_nx_addr_v6(struct nx_addr_v6 *nxa) ++{ ++ kmem_cache_free(nx_addr_v6_cachep, nxa); ++} ++ ++/* __dealloc_nx_addr_v6_all() */ ++ ++static inline void __dealloc_nx_addr_v6_all(struct nx_addr_v6 *nxa) ++{ ++ while (nxa) { ++ struct nx_addr_v6 *next = nxa->next; ++ ++ __dealloc_nx_addr_v6(nxa); ++ nxa = next; ++ } ++} ++ ++#endif /* CONFIG_IPV6 */ ++ ++/* __alloc_nx_info() ++ ++ * allocate an initialized nx_info struct ++ * doesn't make it visible (hash) */ ++ ++static struct nx_info *__alloc_nx_info(nid_t nid) ++{ ++ struct nx_info *new = NULL; ++ ++ vxdprintk(VXD_CBIT(nid, 1), "alloc_nx_info(%d)*", nid); ++ ++ /* would this benefit from a slab cache? */ ++ new = kmalloc(sizeof(struct nx_info), GFP_KERNEL); ++ if (!new) ++ return 0; ++ ++ memset(new, 0, sizeof(struct nx_info)); ++ new->nx_id = nid; ++ INIT_HLIST_NODE(&new->nx_hlist); ++ atomic_set(&new->nx_usecnt, 0); ++ atomic_set(&new->nx_tasks, 0); ++ new->nx_state = 0; ++ ++ new->nx_flags = NXF_INIT_SET; ++ ++ /* rest of init goes here */ ++ ++ new->v4_lback.s_addr = htonl(INADDR_LOOPBACK); ++ new->v4_bcast.s_addr = htonl(INADDR_BROADCAST); ++ ++ vxdprintk(VXD_CBIT(nid, 0), ++ "alloc_nx_info(%d) = %p", nid, new); ++ atomic_inc(&nx_global_ctotal); ++ return new; ++} ++ ++/* __dealloc_nx_info() ++ ++ * final disposal of nx_info */ ++ ++static void __dealloc_nx_info(struct nx_info *nxi) ++{ ++ vxdprintk(VXD_CBIT(nid, 0), ++ "dealloc_nx_info(%p)", nxi); ++ ++ nxi->nx_hlist.next = LIST_POISON1; ++ nxi->nx_id = -1; ++ ++ BUG_ON(atomic_read(&nxi->nx_usecnt)); ++ BUG_ON(atomic_read(&nxi->nx_tasks)); ++ ++ __dealloc_nx_addr_v4_all(nxi->v4.next); ++ ++ nxi->nx_state |= NXS_RELEASED; ++ kfree(nxi); ++ atomic_dec(&nx_global_ctotal); ++} ++ ++static void __shutdown_nx_info(struct nx_info *nxi) ++{ ++ nxi->nx_state |= NXS_SHUTDOWN; ++ vs_net_change(nxi, VSC_NETDOWN); ++} ++ ++/* exported stuff */ ++ ++void free_nx_info(struct nx_info *nxi) ++{ ++ /* context shutdown is mandatory */ ++ BUG_ON(nxi->nx_state != NXS_SHUTDOWN); ++ ++ /* context must not be hashed */ ++ BUG_ON(nxi->nx_state & NXS_HASHED); ++ ++ BUG_ON(atomic_read(&nxi->nx_usecnt)); ++ BUG_ON(atomic_read(&nxi->nx_tasks)); ++ ++ __dealloc_nx_info(nxi); ++} ++ ++ ++void __nx_set_lback(struct nx_info *nxi) ++{ ++ int nid = nxi->nx_id; ++ __be32 lback = htonl(INADDR_LOOPBACK ^ ((nid & 0xFFFF) << 8)); ++ ++ nxi->v4_lback.s_addr = lback; ++} ++ ++extern int __nx_inet_add_lback(__be32 addr); ++extern int __nx_inet_del_lback(__be32 addr); ++ ++ ++/* hash table for nx_info hash */ ++ ++#define NX_HASH_SIZE 13 ++ ++struct hlist_head nx_info_hash[NX_HASH_SIZE]; ++ ++static DEFINE_SPINLOCK(nx_info_hash_lock); ++ ++ ++static inline unsigned int __hashval(nid_t nid) ++{ ++ return (nid % NX_HASH_SIZE); ++} ++ ++ ++ ++/* __hash_nx_info() ++ ++ * add the nxi to the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline 
void __hash_nx_info(struct nx_info *nxi) ++{ ++ struct hlist_head *head; ++ ++ vxd_assert_lock(&nx_info_hash_lock); ++ vxdprintk(VXD_CBIT(nid, 4), ++ "__hash_nx_info: %p[#%d]", nxi, nxi->nx_id); ++ ++ /* context must not be hashed */ ++ BUG_ON(nx_info_state(nxi, NXS_HASHED)); ++ ++ nxi->nx_state |= NXS_HASHED; ++ head = &nx_info_hash[__hashval(nxi->nx_id)]; ++ hlist_add_head(&nxi->nx_hlist, head); ++ atomic_inc(&nx_global_cactive); ++} ++ ++/* __unhash_nx_info() ++ ++ * remove the nxi from the global hash table ++ * requires the hash_lock to be held */ ++ ++static inline void __unhash_nx_info(struct nx_info *nxi) ++{ ++ vxd_assert_lock(&nx_info_hash_lock); ++ vxdprintk(VXD_CBIT(nid, 4), ++ "__unhash_nx_info: %p[#%d.%d.%d]", nxi, nxi->nx_id, ++ atomic_read(&nxi->nx_usecnt), atomic_read(&nxi->nx_tasks)); ++ ++ /* context must be hashed */ ++ BUG_ON(!nx_info_state(nxi, NXS_HASHED)); ++ /* but without tasks */ ++ BUG_ON(atomic_read(&nxi->nx_tasks)); ++ ++ nxi->nx_state &= ~NXS_HASHED; ++ hlist_del(&nxi->nx_hlist); ++ atomic_dec(&nx_global_cactive); ++} ++ ++ ++/* __lookup_nx_info() ++ ++ * requires the hash_lock to be held ++ * doesn't increment the nx_refcnt */ ++ ++static inline struct nx_info *__lookup_nx_info(nid_t nid) ++{ ++ struct hlist_head *head = &nx_info_hash[__hashval(nid)]; ++ struct hlist_node *pos; ++ struct nx_info *nxi; ++ ++ vxd_assert_lock(&nx_info_hash_lock); ++ hlist_for_each(pos, head) { ++ nxi = hlist_entry(pos, struct nx_info, nx_hlist); ++ ++ if (nxi->nx_id == nid) ++ goto found; ++ } ++ nxi = NULL; ++found: ++ vxdprintk(VXD_CBIT(nid, 0), ++ "__lookup_nx_info(#%u): %p[#%u]", ++ nid, nxi, nxi ? nxi->nx_id : 0); ++ return nxi; ++} ++ ++ ++/* __create_nx_info() ++ ++ * create the requested context ++ * get(), claim() and hash it */ ++ ++static struct nx_info *__create_nx_info(int id) ++{ ++ struct nx_info *new, *nxi = NULL; ++ ++ vxdprintk(VXD_CBIT(nid, 1), "create_nx_info(%d)*", id); ++ ++ if (!(new = __alloc_nx_info(id))) ++ return ERR_PTR(-ENOMEM); ++ ++ /* required to make dynamic xids unique */ ++ spin_lock(&nx_info_hash_lock); ++ ++ /* static context requested */ ++ if ((nxi = __lookup_nx_info(id))) { ++ vxdprintk(VXD_CBIT(nid, 0), ++ "create_nx_info(%d) = %p (already there)", id, nxi); ++ if (nx_info_flags(nxi, NXF_STATE_SETUP, 0)) ++ nxi = ERR_PTR(-EBUSY); ++ else ++ nxi = ERR_PTR(-EEXIST); ++ goto out_unlock; ++ } ++ /* new context */ ++ vxdprintk(VXD_CBIT(nid, 0), ++ "create_nx_info(%d) = %p (new)", id, new); ++ claim_nx_info(new, NULL); ++ __nx_set_lback(new); ++ __hash_nx_info(get_nx_info(new)); ++ nxi = new, new = NULL; ++ ++out_unlock: ++ spin_unlock(&nx_info_hash_lock); ++ if (new) ++ __dealloc_nx_info(new); ++ return nxi; ++} ++ ++ ++ ++/* exported stuff */ ++ ++ ++void unhash_nx_info(struct nx_info *nxi) ++{ ++ __shutdown_nx_info(nxi); ++ spin_lock(&nx_info_hash_lock); ++ __unhash_nx_info(nxi); ++ spin_unlock(&nx_info_hash_lock); ++} ++ ++/* lookup_nx_info() ++ ++ * search for a nx_info and get() it ++ * negative id means current */ ++ ++struct nx_info *lookup_nx_info(int id) ++{ ++ struct nx_info *nxi = NULL; ++ ++ if (id < 0) { ++ nxi = get_nx_info(current_nx_info()); ++ } else if (id > 1) { ++ spin_lock(&nx_info_hash_lock); ++ nxi = get_nx_info(__lookup_nx_info(id)); ++ spin_unlock(&nx_info_hash_lock); ++ } ++ return nxi; ++} ++ ++/* nid_is_hashed() ++ ++ * verify that nid is still hashed */ ++ ++int nid_is_hashed(nid_t nid) ++{ ++ int hashed; ++ ++ spin_lock(&nx_info_hash_lock); ++ hashed = (__lookup_nx_info(nid) != NULL); ++ 
spin_unlock(&nx_info_hash_lock); ++ return hashed; ++} ++ ++ ++#ifdef CONFIG_PROC_FS ++ ++/* get_nid_list() ++ ++ * get a subset of hashed nids for proc ++ * assumes size is at least one */ ++ ++int get_nid_list(int index, unsigned int *nids, int size) ++{ ++ int hindex, nr_nids = 0; ++ ++ /* only show current and children */ ++ if (!nx_check(0, VS_ADMIN | VS_WATCH)) { ++ if (index > 0) ++ return 0; ++ nids[nr_nids] = nx_current_nid(); ++ return 1; ++ } ++ ++ for (hindex = 0; hindex < NX_HASH_SIZE; hindex++) { ++ struct hlist_head *head = &nx_info_hash[hindex]; ++ struct hlist_node *pos; ++ ++ spin_lock(&nx_info_hash_lock); ++ hlist_for_each(pos, head) { ++ struct nx_info *nxi; ++ ++ if (--index > 0) ++ continue; ++ ++ nxi = hlist_entry(pos, struct nx_info, nx_hlist); ++ nids[nr_nids] = nxi->nx_id; ++ if (++nr_nids >= size) { ++ spin_unlock(&nx_info_hash_lock); ++ goto out; ++ } ++ } ++ /* keep the lock time short */ ++ spin_unlock(&nx_info_hash_lock); ++ } ++out: ++ return nr_nids; ++} ++#endif ++ ++ ++/* ++ * migrate task to new network ++ * gets nxi, puts old_nxi on change ++ */ ++ ++int nx_migrate_task(struct task_struct *p, struct nx_info *nxi) ++{ ++ struct nx_info *old_nxi; ++ int ret = 0; ++ ++ if (!p || !nxi) ++ BUG(); ++ ++ vxdprintk(VXD_CBIT(nid, 5), ++ "nx_migrate_task(%p,%p[#%d.%d.%d])", ++ p, nxi, nxi->nx_id, ++ atomic_read(&nxi->nx_usecnt), ++ atomic_read(&nxi->nx_tasks)); ++ ++ if (nx_info_flags(nxi, NXF_INFO_PRIVATE, 0) && ++ !nx_info_flags(nxi, NXF_STATE_SETUP, 0)) ++ return -EACCES; ++ ++ if (nx_info_state(nxi, NXS_SHUTDOWN)) ++ return -EFAULT; ++ ++ /* maybe disallow this completely? */ ++ old_nxi = task_get_nx_info(p); ++ if (old_nxi == nxi) ++ goto out; ++ ++ task_lock(p); ++ if (old_nxi) ++ clr_nx_info(&p->nx_info); ++ claim_nx_info(nxi, p); ++ set_nx_info(&p->nx_info, nxi); ++ p->nid = nxi->nx_id; ++ task_unlock(p); ++ ++ vxdprintk(VXD_CBIT(nid, 5), ++ "moved task %p into nxi:%p[#%d]", ++ p, nxi, nxi->nx_id); ++ ++ if (old_nxi) ++ release_nx_info(old_nxi, p); ++ ret = 0; ++out: ++ put_nx_info(old_nxi); ++ return ret; ++} ++ ++ ++void nx_set_persistent(struct nx_info *nxi) ++{ ++ vxdprintk(VXD_CBIT(nid, 6), ++ "nx_set_persistent(%p[#%d])", nxi, nxi->nx_id); ++ ++ get_nx_info(nxi); ++ claim_nx_info(nxi, NULL); ++} ++ ++void nx_clear_persistent(struct nx_info *nxi) ++{ ++ vxdprintk(VXD_CBIT(nid, 6), ++ "nx_clear_persistent(%p[#%d])", nxi, nxi->nx_id); ++ ++ release_nx_info(nxi, NULL); ++ put_nx_info(nxi); ++} ++ ++void nx_update_persistent(struct nx_info *nxi) ++{ ++ if (nx_info_flags(nxi, NXF_PERSISTENT, 0)) ++ nx_set_persistent(nxi); ++ else ++ nx_clear_persistent(nxi); ++} ++ ++/* vserver syscall commands below here */ ++ ++/* taks nid and nx_info functions */ ++ ++#include ++ ++ ++int vc_task_nid(uint32_t id) ++{ ++ nid_t nid; ++ ++ if (id) { ++ struct task_struct *tsk; ++ ++ rcu_read_lock(); ++ tsk = find_task_by_real_pid(id); ++ nid = (tsk) ? 
tsk->nid : -ESRCH; ++ rcu_read_unlock(); ++ } else ++ nid = nx_current_nid(); ++ return nid; ++} ++ ++ ++int vc_nx_info(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_nx_info_v0 vc_data; ++ ++ vc_data.nid = nxi->nx_id; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++ ++/* network functions */ ++ ++int vc_net_create(uint32_t nid, void __user *data) ++{ ++ struct vcmd_net_create vc_data = { .flagword = NXF_INIT_SET }; ++ struct nx_info *new_nxi; ++ int ret; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ if ((nid > MAX_S_CONTEXT) || (nid < 2)) ++ return -EINVAL; ++ ++ new_nxi = __create_nx_info(nid); ++ if (IS_ERR(new_nxi)) ++ return PTR_ERR(new_nxi); ++ ++ /* initial flags */ ++ new_nxi->nx_flags = vc_data.flagword; ++ ++ ret = -ENOEXEC; ++ if (vs_net_change(new_nxi, VSC_NETUP)) ++ goto out; ++ ++ ret = nx_migrate_task(current, new_nxi); ++ if (ret) ++ goto out; ++ ++ /* return context id on success */ ++ ret = new_nxi->nx_id; ++ ++ /* get a reference for persistent contexts */ ++ if ((vc_data.flagword & NXF_PERSISTENT)) ++ nx_set_persistent(new_nxi); ++out: ++ release_nx_info(new_nxi, NULL); ++ put_nx_info(new_nxi); ++ return ret; ++} ++ ++ ++int vc_net_migrate(struct nx_info *nxi, void __user *data) ++{ ++ return nx_migrate_task(current, nxi); ++} ++ ++ ++ ++int do_add_v4_addr(struct nx_info *nxi, __be32 ip, __be32 ip2, __be32 mask, ++ uint16_t type, uint16_t flags) ++{ ++ struct nx_addr_v4 *nxa = &nxi->v4; ++ ++ if (NX_IPV4(nxi)) { ++ /* locate last entry */ ++ for (; nxa->next; nxa = nxa->next); ++ nxa->next = __alloc_nx_addr_v4(); ++ nxa = nxa->next; ++ ++ if (IS_ERR(nxa)) ++ return PTR_ERR(nxa); ++ } ++ ++ if (nxi->v4.next) ++ /* remove single ip for ip list */ ++ nxi->nx_flags &= ~NXF_SINGLE_IP; ++ ++ nxa->ip[0].s_addr = ip; ++ nxa->ip[1].s_addr = ip2; ++ nxa->mask.s_addr = mask; ++ nxa->type = type; ++ nxa->flags = flags; ++ return 0; ++} ++ ++int do_remove_v4_addr(struct nx_info *nxi, __be32 ip, __be32 ip2, __be32 mask, ++ uint16_t type, uint16_t flags) ++{ ++ struct nx_addr_v4 *nxa = &nxi->v4; ++ ++ switch (type) { ++/* case NXA_TYPE_ADDR: ++ break; */ ++ ++ case NXA_TYPE_ANY: ++ __dealloc_nx_addr_v4_all(xchg(&nxa->next, NULL)); ++ memset(nxa, 0, sizeof(*nxa)); ++ break; ++ ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++ ++int vc_net_add(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_v0 vc_data; ++ int index, ret = 0; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ switch (vc_data.type) { ++ case NXA_TYPE_IPV4: ++ if ((vc_data.count < 1) || (vc_data.count > 4)) ++ return -EINVAL; ++ ++ index = 0; ++ while (index < vc_data.count) { ++ ret = do_add_v4_addr(nxi, vc_data.ip[index].s_addr, 0, ++ vc_data.mask[index].s_addr, NXA_TYPE_ADDR, 0); ++ if (ret) ++ return ret; ++ index++; ++ } ++ ret = index; ++ break; ++ ++ case NXA_TYPE_IPV4|NXA_MOD_BCAST: ++ nxi->v4_bcast = vc_data.ip[0]; ++ ret = 1; ++ break; ++ ++ case NXA_TYPE_IPV4|NXA_MOD_LBACK: ++ nxi->v4_lback = vc_data.ip[0]; ++ ret = 1; ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ return ret; ++} ++ ++int vc_net_remove(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_v0 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ switch (vc_data.type) { ++ case NXA_TYPE_ANY: ++ __dealloc_nx_addr_v4_all(xchg(&nxi->v4.next, NULL)); ++ memset(&nxi->v4, 0, sizeof(nxi->v4)); ++ break; 
++ ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++ ++int vc_net_add_ipv4_v1(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_ipv4_v1 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ switch (vc_data.type) { ++ case NXA_TYPE_ADDR: ++ case NXA_TYPE_MASK: ++ return do_add_v4_addr(nxi, vc_data.ip.s_addr, 0, ++ vc_data.mask.s_addr, vc_data.type, vc_data.flags); ++ ++ case NXA_TYPE_ADDR | NXA_MOD_BCAST: ++ nxi->v4_bcast = vc_data.ip; ++ break; ++ ++ case NXA_TYPE_ADDR | NXA_MOD_LBACK: ++ nxi->v4_lback = vc_data.ip; ++ break; ++ ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++int vc_net_add_ipv4(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_ipv4_v2 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ switch (vc_data.type) { ++ case NXA_TYPE_ADDR: ++ case NXA_TYPE_MASK: ++ case NXA_TYPE_RANGE: ++ return do_add_v4_addr(nxi, vc_data.ip.s_addr, vc_data.ip2.s_addr, ++ vc_data.mask.s_addr, vc_data.type, vc_data.flags); ++ ++ case NXA_TYPE_ADDR | NXA_MOD_BCAST: ++ nxi->v4_bcast = vc_data.ip; ++ break; ++ ++ case NXA_TYPE_ADDR | NXA_MOD_LBACK: ++ nxi->v4_lback = vc_data.ip; ++ break; ++ ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++int vc_net_rem_ipv4_v1(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_ipv4_v1 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_remove_v4_addr(nxi, vc_data.ip.s_addr, 0, ++ vc_data.mask.s_addr, vc_data.type, vc_data.flags); ++} ++ ++int vc_net_rem_ipv4(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_ipv4_v2 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_remove_v4_addr(nxi, vc_data.ip.s_addr, vc_data.ip2.s_addr, ++ vc_data.mask.s_addr, vc_data.type, vc_data.flags); ++} ++ ++#ifdef CONFIG_IPV6 ++ ++int do_add_v6_addr(struct nx_info *nxi, ++ struct in6_addr *ip, struct in6_addr *mask, ++ uint32_t prefix, uint16_t type, uint16_t flags) ++{ ++ struct nx_addr_v6 *nxa = &nxi->v6; ++ ++ if (NX_IPV6(nxi)) { ++ /* locate last entry */ ++ for (; nxa->next; nxa = nxa->next); ++ nxa->next = __alloc_nx_addr_v6(); ++ nxa = nxa->next; ++ ++ if (IS_ERR(nxa)) ++ return PTR_ERR(nxa); ++ } ++ ++ nxa->ip = *ip; ++ nxa->mask = *mask; ++ nxa->prefix = prefix; ++ nxa->type = type; ++ nxa->flags = flags; ++ return 0; ++} ++ ++ ++int vc_net_add_ipv6(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_ipv6_v1 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ switch (vc_data.type) { ++ case NXA_TYPE_ADDR: ++ memset(&vc_data.mask, ~0, sizeof(vc_data.mask)); ++ /* fallthrough */ ++ case NXA_TYPE_MASK: ++ return do_add_v6_addr(nxi, &vc_data.ip, &vc_data.mask, ++ vc_data.prefix, vc_data.type, vc_data.flags); ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++int vc_net_remove_ipv6(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_addr_ipv6_v1 vc_data; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ switch (vc_data.type) { ++ case NXA_TYPE_ANY: ++ __dealloc_nx_addr_v6_all(xchg(&nxi->v6.next, NULL)); ++ memset(&nxi->v6, 0, sizeof(nxi->v6)); ++ break; ++ ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++#endif /* CONFIG_IPV6 */ ++ ++ ++int vc_get_nflags(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_flags_v0 vc_data; ++ ++ 
vc_data.flagword = nxi->nx_flags; ++ ++ /* special STATE flag handling */ ++ vc_data.mask = vs_mask_flags(~0ULL, nxi->nx_flags, NXF_ONE_TIME); ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_nflags(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_flags_v0 vc_data; ++ uint64_t mask, trigger; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ /* special STATE flag handling */ ++ mask = vs_mask_mask(vc_data.mask, nxi->nx_flags, NXF_ONE_TIME); ++ trigger = (mask & nxi->nx_flags) ^ (mask & vc_data.flagword); ++ ++ nxi->nx_flags = vs_mask_flags(nxi->nx_flags, ++ vc_data.flagword, mask); ++ if (trigger & NXF_PERSISTENT) ++ nx_update_persistent(nxi); ++ ++ return 0; ++} ++ ++int vc_get_ncaps(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_caps_v0 vc_data; ++ ++ vc_data.ncaps = nxi->nx_ncaps; ++ vc_data.cmask = ~0ULL; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ ++int vc_set_ncaps(struct nx_info *nxi, void __user *data) ++{ ++ struct vcmd_net_caps_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ nxi->nx_ncaps = vs_mask_flags(nxi->nx_ncaps, ++ vc_data.ncaps, vc_data.cmask); ++ return 0; ++} ++ ++ ++#include ++ ++module_init(init_network); ++ ++EXPORT_SYMBOL_GPL(free_nx_info); ++EXPORT_SYMBOL_GPL(unhash_nx_info); ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/proc.c linux-3.3.8-vs2.3.3.4/kernel/vserver/proc.c +--- linux-3.3.8/kernel/vserver/proc.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/proc.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,1103 @@ ++/* ++ * linux/kernel/vserver/proc.c ++ * ++ * Virtual Context Support ++ * ++ * Copyright (C) 2003-2011 Herbert Pötzl ++ * ++ * V0.01 basic structure ++ * V0.02 adaptation vs1.3.0 ++ * V0.03 proc permissions ++ * V0.04 locking/generic ++ * V0.05 next generation procfs ++ * V0.06 inode validation ++ * V0.07 generic rewrite vid ++ * V0.08 remove inode type ++ * V0.09 added u/wmask info ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "cvirt_proc.h" ++#include "cacct_proc.h" ++#include "limit_proc.h" ++#include "sched_proc.h" ++#include "vci_config.h" ++ ++ ++static inline char *print_cap_t(char *buffer, kernel_cap_t *c) ++{ ++ unsigned __capi; ++ ++ CAP_FOR_EACH_U32(__capi) { ++ buffer += sprintf(buffer, "%08x", ++ c->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); ++ } ++ return buffer; ++} ++ ++ ++static struct proc_dir_entry *proc_virtual; ++ ++static struct proc_dir_entry *proc_virtnet; ++ ++ ++/* first the actual feeds */ ++ ++ ++static int proc_vci(char *buffer) ++{ ++ return sprintf(buffer, ++ "VCIVersion:\t%04x:%04x\n" ++ "VCISyscall:\t%d\n" ++ "VCIKernel:\t%08x\n", ++ VCI_VERSION >> 16, ++ VCI_VERSION & 0xFFFF, ++ __NR_vserver, ++ vci_kernel_config()); ++} ++ ++static int proc_virtual_info(char *buffer) ++{ ++ return proc_vci(buffer); ++} ++ ++static int proc_virtual_status(char *buffer) ++{ ++ return sprintf(buffer, ++ "#CTotal:\t%d\n" ++ "#CActive:\t%d\n" ++ "#NSProxy:\t%d\t%d %d %d %d %d %d\n" ++ "#InitTask:\t%d\t%d %d\n", ++ atomic_read(&vx_global_ctotal), ++ atomic_read(&vx_global_cactive), ++ atomic_read(&vs_global_nsproxy), ++ atomic_read(&vs_global_fs), ++ atomic_read(&vs_global_mnt_ns), ++ atomic_read(&vs_global_uts_ns), ++ atomic_read(&nr_ipc_ns), ++ 
atomic_read(&vs_global_user_ns), ++ atomic_read(&vs_global_pid_ns), ++ atomic_read(&init_task.usage), ++ atomic_read(&init_task.nsproxy->count), ++ init_task.fs->users); ++} ++ ++ ++int proc_vxi_info(struct vx_info *vxi, char *buffer) ++{ ++ int length; ++ ++ length = sprintf(buffer, ++ "ID:\t%d\n" ++ "Info:\t%p\n" ++ "Init:\t%d\n" ++ "OOM:\t%lld\n", ++ vxi->vx_id, ++ vxi, ++ vxi->vx_initpid, ++ vxi->vx_badness_bias); ++ return length; ++} ++ ++int proc_vxi_status(struct vx_info *vxi, char *buffer) ++{ ++ char *orig = buffer; ++ ++ buffer += sprintf(buffer, ++ "UseCnt:\t%d\n" ++ "Tasks:\t%d\n" ++ "Flags:\t%016llx\n", ++ atomic_read(&vxi->vx_usecnt), ++ atomic_read(&vxi->vx_tasks), ++ (unsigned long long)vxi->vx_flags); ++ ++ buffer += sprintf(buffer, "BCaps:\t"); ++ buffer = print_cap_t(buffer, &vxi->vx_bcaps); ++ buffer += sprintf(buffer, "\n"); ++ ++ buffer += sprintf(buffer, ++ "CCaps:\t%016llx\n" ++ "Umask:\t%16llx\n" ++ "Wmask:\t%16llx\n" ++ "Spaces:\t%08lx %08lx\n", ++ (unsigned long long)vxi->vx_ccaps, ++ (unsigned long long)vxi->vx_umask, ++ (unsigned long long)vxi->vx_wmask, ++ vxi->space[0].vx_nsmask, vxi->space[1].vx_nsmask); ++ return buffer - orig; ++} ++ ++int proc_vxi_limit(struct vx_info *vxi, char *buffer) ++{ ++ return vx_info_proc_limit(&vxi->limit, buffer); ++} ++ ++int proc_vxi_sched(struct vx_info *vxi, char *buffer) ++{ ++ int cpu, length; ++ ++ length = vx_info_proc_sched(&vxi->sched, buffer); ++ for_each_online_cpu(cpu) { ++ length += vx_info_proc_sched_pc( ++ &vx_per_cpu(vxi, sched_pc, cpu), ++ buffer + length, cpu); ++ } ++ return length; ++} ++ ++int proc_vxi_nsproxy0(struct vx_info *vxi, char *buffer) ++{ ++ return vx_info_proc_nsproxy(vxi->space[0].vx_nsproxy, buffer); ++} ++ ++int proc_vxi_nsproxy1(struct vx_info *vxi, char *buffer) ++{ ++ return vx_info_proc_nsproxy(vxi->space[1].vx_nsproxy, buffer); ++} ++ ++int proc_vxi_cvirt(struct vx_info *vxi, char *buffer) ++{ ++ int cpu, length; ++ ++ vx_update_load(vxi); ++ length = vx_info_proc_cvirt(&vxi->cvirt, buffer); ++ for_each_online_cpu(cpu) { ++ length += vx_info_proc_cvirt_pc( ++ &vx_per_cpu(vxi, cvirt_pc, cpu), ++ buffer + length, cpu); ++ } ++ return length; ++} ++ ++int proc_vxi_cacct(struct vx_info *vxi, char *buffer) ++{ ++ return vx_info_proc_cacct(&vxi->cacct, buffer); ++} ++ ++ ++static int proc_virtnet_info(char *buffer) ++{ ++ return proc_vci(buffer); ++} ++ ++static int proc_virtnet_status(char *buffer) ++{ ++ return sprintf(buffer, ++ "#CTotal:\t%d\n" ++ "#CActive:\t%d\n", ++ atomic_read(&nx_global_ctotal), ++ atomic_read(&nx_global_cactive)); ++} ++ ++int proc_nxi_info(struct nx_info *nxi, char *buffer) ++{ ++ struct nx_addr_v4 *v4a; ++#ifdef CONFIG_IPV6 ++ struct nx_addr_v6 *v6a; ++#endif ++ int length, i; ++ ++ length = sprintf(buffer, ++ "ID:\t%d\n" ++ "Info:\t%p\n" ++ "Bcast:\t" NIPQUAD_FMT "\n" ++ "Lback:\t" NIPQUAD_FMT "\n", ++ nxi->nx_id, ++ nxi, ++ NIPQUAD(nxi->v4_bcast.s_addr), ++ NIPQUAD(nxi->v4_lback.s_addr)); ++ ++ if (!NX_IPV4(nxi)) ++ goto skip_v4; ++ for (i = 0, v4a = &nxi->v4; v4a; i++, v4a = v4a->next) ++ length += sprintf(buffer + length, "%d:\t" NXAV4_FMT "\n", ++ i, NXAV4(v4a)); ++skip_v4: ++#ifdef CONFIG_IPV6 ++ if (!NX_IPV6(nxi)) ++ goto skip_v6; ++ for (i = 0, v6a = &nxi->v6; v6a; i++, v6a = v6a->next) ++ length += sprintf(buffer + length, "%d:\t" NXAV6_FMT "\n", ++ i, NXAV6(v6a)); ++skip_v6: ++#endif ++ return length; ++} ++ ++int proc_nxi_status(struct nx_info *nxi, char *buffer) ++{ ++ int length; ++ ++ length = sprintf(buffer, ++ "UseCnt:\t%d\n" ++ "Tasks:\t%d\n" ++ 
"Flags:\t%016llx\n" ++ "NCaps:\t%016llx\n", ++ atomic_read(&nxi->nx_usecnt), ++ atomic_read(&nxi->nx_tasks), ++ (unsigned long long)nxi->nx_flags, ++ (unsigned long long)nxi->nx_ncaps); ++ return length; ++} ++ ++ ++ ++/* here the inode helpers */ ++ ++struct vs_entry { ++ int len; ++ char *name; ++ mode_t mode; ++ struct inode_operations *iop; ++ struct file_operations *fop; ++ union proc_op op; ++}; ++ ++static struct inode *vs_proc_make_inode(struct super_block *sb, struct vs_entry *p) ++{ ++ struct inode *inode = new_inode(sb); ++ ++ if (!inode) ++ goto out; ++ ++ inode->i_mode = p->mode; ++ if (p->iop) ++ inode->i_op = p->iop; ++ if (p->fop) ++ inode->i_fop = p->fop; ++ ++ set_nlink(inode, (p->mode & S_IFDIR) ? 2 : 1); ++ inode->i_flags |= S_IMMUTABLE; ++ ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_tag = 0; ++out: ++ return inode; ++} ++ ++static struct dentry *vs_proc_instantiate(struct inode *dir, ++ struct dentry *dentry, int id, void *ptr) ++{ ++ struct vs_entry *p = ptr; ++ struct inode *inode = vs_proc_make_inode(dir->i_sb, p); ++ struct dentry *error = ERR_PTR(-EINVAL); ++ ++ if (!inode) ++ goto out; ++ ++ PROC_I(inode)->op = p->op; ++ PROC_I(inode)->fd = id; ++ d_add(dentry, inode); ++ error = NULL; ++out: ++ return error; ++} ++ ++/* Lookups */ ++ ++typedef struct dentry *instantiate_t(struct inode *, struct dentry *, int, void *); ++ ++/* ++ * Fill a directory entry. ++ * ++ * If possible create the dcache entry and derive our inode number and ++ * file type from dcache entry. ++ * ++ * Since all of the proc inode numbers are dynamically generated, the inode ++ * numbers do not exist until the inode is cache. This means creating the ++ * the dcache entry in readdir is necessary to keep the inode numbers ++ * reported by readdir in sync with the inode numbers reported ++ * by stat. 
++ */ ++static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, ++ char *name, int len, instantiate_t instantiate, int id, void *ptr) ++{ ++ struct dentry *child, *dir = filp->f_dentry; ++ struct inode *inode; ++ struct qstr qname; ++ ino_t ino = 0; ++ unsigned type = DT_UNKNOWN; ++ ++ qname.name = name; ++ qname.len = len; ++ qname.hash = full_name_hash(name, len); ++ ++ child = d_lookup(dir, &qname); ++ if (!child) { ++ struct dentry *new; ++ new = d_alloc(dir, &qname); ++ if (new) { ++ child = instantiate(dir->d_inode, new, id, ptr); ++ if (child) ++ dput(new); ++ else ++ child = new; ++ } ++ } ++ if (!child || IS_ERR(child) || !child->d_inode) ++ goto end_instantiate; ++ inode = child->d_inode; ++ if (inode) { ++ ino = inode->i_ino; ++ type = inode->i_mode >> 12; ++ } ++ dput(child); ++end_instantiate: ++ if (!ino) ++ ino = find_inode_number(dir, &qname); ++ if (!ino) ++ ino = 1; ++ return filldir(dirent, name, len, filp->f_pos, ino, type); ++} ++ ++ ++ ++/* get and revalidate vx_info/xid */ ++ ++static inline ++struct vx_info *get_proc_vx_info(struct inode *inode) ++{ ++ return lookup_vx_info(PROC_I(inode)->fd); ++} ++ ++static int proc_xid_revalidate(struct dentry *dentry, struct nameidata *nd) ++{ ++ struct inode *inode = dentry->d_inode; ++ xid_t xid = PROC_I(inode)->fd; ++ ++ if (!xid || xid_is_hashed(xid)) ++ return 1; ++ d_drop(dentry); ++ return 0; ++} ++ ++ ++/* get and revalidate nx_info/nid */ ++ ++static int proc_nid_revalidate(struct dentry *dentry, struct nameidata *nd) ++{ ++ struct inode *inode = dentry->d_inode; ++ nid_t nid = PROC_I(inode)->fd; ++ ++ if (!nid || nid_is_hashed(nid)) ++ return 1; ++ d_drop(dentry); ++ return 0; ++} ++ ++ ++ ++#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024) ++ ++static ssize_t proc_vs_info_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct inode *inode = file->f_dentry->d_inode; ++ unsigned long page; ++ ssize_t length = 0; ++ ++ if (count > PROC_BLOCK_SIZE) ++ count = PROC_BLOCK_SIZE; ++ ++ /* fade that out as soon as stable */ ++ WARN_ON(PROC_I(inode)->fd); ++ ++ if (!(page = __get_free_page(GFP_KERNEL))) ++ return -ENOMEM; ++ ++ BUG_ON(!PROC_I(inode)->op.proc_vs_read); ++ length = PROC_I(inode)->op.proc_vs_read((char *)page); ++ ++ if (length >= 0) ++ length = simple_read_from_buffer(buf, count, ppos, ++ (char *)page, length); ++ ++ free_page(page); ++ return length; ++} ++ ++static ssize_t proc_vx_info_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct inode *inode = file->f_dentry->d_inode; ++ struct vx_info *vxi = NULL; ++ xid_t xid = PROC_I(inode)->fd; ++ unsigned long page; ++ ssize_t length = 0; ++ ++ if (count > PROC_BLOCK_SIZE) ++ count = PROC_BLOCK_SIZE; ++ ++ /* fade that out as soon as stable */ ++ WARN_ON(!xid); ++ vxi = lookup_vx_info(xid); ++ if (!vxi) ++ goto out; ++ ++ length = -ENOMEM; ++ if (!(page = __get_free_page(GFP_KERNEL))) ++ goto out_put; ++ ++ BUG_ON(!PROC_I(inode)->op.proc_vxi_read); ++ length = PROC_I(inode)->op.proc_vxi_read(vxi, (char *)page); ++ ++ if (length >= 0) ++ length = simple_read_from_buffer(buf, count, ppos, ++ (char *)page, length); ++ ++ free_page(page); ++out_put: ++ put_vx_info(vxi); ++out: ++ return length; ++} ++ ++static ssize_t proc_nx_info_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct inode *inode = file->f_dentry->d_inode; ++ struct nx_info *nxi = NULL; ++ nid_t nid = PROC_I(inode)->fd; ++ unsigned long page; ++ ssize_t length = 0; ++ ++ if (count > 
PROC_BLOCK_SIZE) ++ count = PROC_BLOCK_SIZE; ++ ++ /* fade that out as soon as stable */ ++ WARN_ON(!nid); ++ nxi = lookup_nx_info(nid); ++ if (!nxi) ++ goto out; ++ ++ length = -ENOMEM; ++ if (!(page = __get_free_page(GFP_KERNEL))) ++ goto out_put; ++ ++ BUG_ON(!PROC_I(inode)->op.proc_nxi_read); ++ length = PROC_I(inode)->op.proc_nxi_read(nxi, (char *)page); ++ ++ if (length >= 0) ++ length = simple_read_from_buffer(buf, count, ppos, ++ (char *)page, length); ++ ++ free_page(page); ++out_put: ++ put_nx_info(nxi); ++out: ++ return length; ++} ++ ++ ++ ++/* here comes the lower level */ ++ ++ ++#define NOD(NAME, MODE, IOP, FOP, OP) { \ ++ .len = sizeof(NAME) - 1, \ ++ .name = (NAME), \ ++ .mode = MODE, \ ++ .iop = IOP, \ ++ .fop = FOP, \ ++ .op = OP, \ ++} ++ ++ ++#define DIR(NAME, MODE, OTYPE) \ ++ NOD(NAME, (S_IFDIR | (MODE)), \ ++ &proc_ ## OTYPE ## _inode_operations, \ ++ &proc_ ## OTYPE ## _file_operations, { } ) ++ ++#define INF(NAME, MODE, OTYPE) \ ++ NOD(NAME, (S_IFREG | (MODE)), NULL, \ ++ &proc_vs_info_file_operations, \ ++ { .proc_vs_read = &proc_##OTYPE } ) ++ ++#define VINF(NAME, MODE, OTYPE) \ ++ NOD(NAME, (S_IFREG | (MODE)), NULL, \ ++ &proc_vx_info_file_operations, \ ++ { .proc_vxi_read = &proc_##OTYPE } ) ++ ++#define NINF(NAME, MODE, OTYPE) \ ++ NOD(NAME, (S_IFREG | (MODE)), NULL, \ ++ &proc_nx_info_file_operations, \ ++ { .proc_nxi_read = &proc_##OTYPE } ) ++ ++ ++static struct file_operations proc_vs_info_file_operations = { ++ .read = proc_vs_info_read, ++}; ++ ++static struct file_operations proc_vx_info_file_operations = { ++ .read = proc_vx_info_read, ++}; ++ ++static struct dentry_operations proc_xid_dentry_operations = { ++ .d_revalidate = proc_xid_revalidate, ++}; ++ ++static struct vs_entry vx_base_stuff[] = { ++ VINF("info", S_IRUGO, vxi_info), ++ VINF("status", S_IRUGO, vxi_status), ++ VINF("limit", S_IRUGO, vxi_limit), ++ VINF("sched", S_IRUGO, vxi_sched), ++ VINF("nsproxy", S_IRUGO, vxi_nsproxy0), ++ VINF("nsproxy1",S_IRUGO, vxi_nsproxy1), ++ VINF("cvirt", S_IRUGO, vxi_cvirt), ++ VINF("cacct", S_IRUGO, vxi_cacct), ++ {} ++}; ++ ++ ++ ++ ++static struct dentry *proc_xid_instantiate(struct inode *dir, ++ struct dentry *dentry, int id, void *ptr) ++{ ++ dentry->d_op = &proc_xid_dentry_operations; ++ return vs_proc_instantiate(dir, dentry, id, ptr); ++} ++ ++static struct dentry *proc_xid_lookup(struct inode *dir, ++ struct dentry *dentry, struct nameidata *nd) ++{ ++ struct vs_entry *p = vx_base_stuff; ++ struct dentry *error = ERR_PTR(-ENOENT); ++ ++ for (; p->name; p++) { ++ if (p->len != dentry->d_name.len) ++ continue; ++ if (!memcmp(dentry->d_name.name, p->name, p->len)) ++ break; ++ } ++ if (!p->name) ++ goto out; ++ ++ error = proc_xid_instantiate(dir, dentry, PROC_I(dir)->fd, p); ++out: ++ return error; ++} ++ ++static int proc_xid_readdir(struct file *filp, ++ void *dirent, filldir_t filldir) ++{ ++ struct dentry *dentry = filp->f_dentry; ++ struct inode *inode = dentry->d_inode; ++ struct vs_entry *p = vx_base_stuff; ++ int size = sizeof(vx_base_stuff) / sizeof(struct vs_entry); ++ int pos, index; ++ u64 ino; ++ ++ pos = filp->f_pos; ++ switch (pos) { ++ case 0: ++ ino = inode->i_ino; ++ if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ case 1: ++ ino = parent_ino(dentry); ++ if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ default: ++ index = pos - 2; ++ if (index >= size) ++ goto out; ++ for (p += index; p->name; p++) { ++ if (proc_fill_cache(filp, 
dirent, filldir, p->name, p->len, ++ vs_proc_instantiate, PROC_I(inode)->fd, p)) ++ goto out; ++ pos++; ++ } ++ } ++out: ++ filp->f_pos = pos; ++ return 1; ++} ++ ++ ++ ++static struct file_operations proc_nx_info_file_operations = { ++ .read = proc_nx_info_read, ++}; ++ ++static struct dentry_operations proc_nid_dentry_operations = { ++ .d_revalidate = proc_nid_revalidate, ++}; ++ ++static struct vs_entry nx_base_stuff[] = { ++ NINF("info", S_IRUGO, nxi_info), ++ NINF("status", S_IRUGO, nxi_status), ++ {} ++}; ++ ++ ++static struct dentry *proc_nid_instantiate(struct inode *dir, ++ struct dentry *dentry, int id, void *ptr) ++{ ++ dentry->d_op = &proc_nid_dentry_operations; ++ return vs_proc_instantiate(dir, dentry, id, ptr); ++} ++ ++static struct dentry *proc_nid_lookup(struct inode *dir, ++ struct dentry *dentry, struct nameidata *nd) ++{ ++ struct vs_entry *p = nx_base_stuff; ++ struct dentry *error = ERR_PTR(-ENOENT); ++ ++ for (; p->name; p++) { ++ if (p->len != dentry->d_name.len) ++ continue; ++ if (!memcmp(dentry->d_name.name, p->name, p->len)) ++ break; ++ } ++ if (!p->name) ++ goto out; ++ ++ error = proc_nid_instantiate(dir, dentry, PROC_I(dir)->fd, p); ++out: ++ return error; ++} ++ ++static int proc_nid_readdir(struct file *filp, ++ void *dirent, filldir_t filldir) ++{ ++ struct dentry *dentry = filp->f_dentry; ++ struct inode *inode = dentry->d_inode; ++ struct vs_entry *p = nx_base_stuff; ++ int size = sizeof(nx_base_stuff) / sizeof(struct vs_entry); ++ int pos, index; ++ u64 ino; ++ ++ pos = filp->f_pos; ++ switch (pos) { ++ case 0: ++ ino = inode->i_ino; ++ if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ case 1: ++ ino = parent_ino(dentry); ++ if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ default: ++ index = pos - 2; ++ if (index >= size) ++ goto out; ++ for (p += index; p->name; p++) { ++ if (proc_fill_cache(filp, dirent, filldir, p->name, p->len, ++ vs_proc_instantiate, PROC_I(inode)->fd, p)) ++ goto out; ++ pos++; ++ } ++ } ++out: ++ filp->f_pos = pos; ++ return 1; ++} ++ ++ ++#define MAX_MULBY10 ((~0U - 9) / 10) ++ ++static inline int atovid(const char *str, int len) ++{ ++ int vid, c; ++ ++ vid = 0; ++ while (len-- > 0) { ++ c = *str - '0'; ++ str++; ++ if (c > 9) ++ return -1; ++ if (vid >= MAX_MULBY10) ++ return -1; ++ vid *= 10; ++ vid += c; ++ if (!vid) ++ return -1; ++ } ++ return vid; ++} ++ ++/* now the upper level (virtual) */ ++ ++ ++static struct file_operations proc_xid_file_operations = { ++ .read = generic_read_dir, ++ .readdir = proc_xid_readdir, ++}; ++ ++static struct inode_operations proc_xid_inode_operations = { ++ .lookup = proc_xid_lookup, ++}; ++ ++static struct vs_entry vx_virtual_stuff[] = { ++ INF("info", S_IRUGO, virtual_info), ++ INF("status", S_IRUGO, virtual_status), ++ DIR(NULL, S_IRUGO | S_IXUGO, xid), ++}; ++ ++ ++static struct dentry *proc_virtual_lookup(struct inode *dir, ++ struct dentry *dentry, struct nameidata *nd) ++{ ++ struct vs_entry *p = vx_virtual_stuff; ++ struct dentry *error = ERR_PTR(-ENOENT); ++ int id = 0; ++ ++ for (; p->name; p++) { ++ if (p->len != dentry->d_name.len) ++ continue; ++ if (!memcmp(dentry->d_name.name, p->name, p->len)) ++ break; ++ } ++ if (p->name) ++ goto instantiate; ++ ++ id = atovid(dentry->d_name.name, dentry->d_name.len); ++ if ((id < 0) || !xid_is_hashed(id)) ++ goto out; ++ ++instantiate: ++ error = proc_xid_instantiate(dir, dentry, id, p); ++out: ++ return error; ++} ++ ++static struct 
file_operations proc_nid_file_operations = { ++ .read = generic_read_dir, ++ .readdir = proc_nid_readdir, ++}; ++ ++static struct inode_operations proc_nid_inode_operations = { ++ .lookup = proc_nid_lookup, ++}; ++ ++static struct vs_entry nx_virtnet_stuff[] = { ++ INF("info", S_IRUGO, virtnet_info), ++ INF("status", S_IRUGO, virtnet_status), ++ DIR(NULL, S_IRUGO | S_IXUGO, nid), ++}; ++ ++ ++static struct dentry *proc_virtnet_lookup(struct inode *dir, ++ struct dentry *dentry, struct nameidata *nd) ++{ ++ struct vs_entry *p = nx_virtnet_stuff; ++ struct dentry *error = ERR_PTR(-ENOENT); ++ int id = 0; ++ ++ for (; p->name; p++) { ++ if (p->len != dentry->d_name.len) ++ continue; ++ if (!memcmp(dentry->d_name.name, p->name, p->len)) ++ break; ++ } ++ if (p->name) ++ goto instantiate; ++ ++ id = atovid(dentry->d_name.name, dentry->d_name.len); ++ if ((id < 0) || !nid_is_hashed(id)) ++ goto out; ++ ++instantiate: ++ error = proc_nid_instantiate(dir, dentry, id, p); ++out: ++ return error; ++} ++ ++ ++#define PROC_MAXVIDS 32 ++ ++int proc_virtual_readdir(struct file *filp, ++ void *dirent, filldir_t filldir) ++{ ++ struct dentry *dentry = filp->f_dentry; ++ struct inode *inode = dentry->d_inode; ++ struct vs_entry *p = vx_virtual_stuff; ++ int size = sizeof(vx_virtual_stuff) / sizeof(struct vs_entry); ++ int pos, index; ++ unsigned int xid_array[PROC_MAXVIDS]; ++ char buf[PROC_NUMBUF]; ++ unsigned int nr_xids, i; ++ u64 ino; ++ ++ pos = filp->f_pos; ++ switch (pos) { ++ case 0: ++ ino = inode->i_ino; ++ if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ case 1: ++ ino = parent_ino(dentry); ++ if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ default: ++ index = pos - 2; ++ if (index >= size) ++ goto entries; ++ for (p += index; p->name; p++) { ++ if (proc_fill_cache(filp, dirent, filldir, p->name, p->len, ++ vs_proc_instantiate, 0, p)) ++ goto out; ++ pos++; ++ } ++ entries: ++ index = pos - size; ++ p = &vx_virtual_stuff[size - 1]; ++ nr_xids = get_xid_list(index, xid_array, PROC_MAXVIDS); ++ for (i = 0; i < nr_xids; i++) { ++ int n, xid = xid_array[i]; ++ unsigned int j = PROC_NUMBUF; ++ ++ n = xid; ++ do ++ buf[--j] = '0' + (n % 10); ++ while (n /= 10); ++ ++ if (proc_fill_cache(filp, dirent, filldir, ++ buf + j, PROC_NUMBUF - j, ++ vs_proc_instantiate, xid, p)) ++ goto out; ++ pos++; ++ } ++ } ++out: ++ filp->f_pos = pos; ++ return 0; ++} ++ ++static int proc_virtual_getattr(struct vfsmount *mnt, ++ struct dentry *dentry, struct kstat *stat) ++{ ++ struct inode *inode = dentry->d_inode; ++ ++ generic_fillattr(inode, stat); ++ stat->nlink = 2 + atomic_read(&vx_global_cactive); ++ return 0; ++} ++ ++static struct file_operations proc_virtual_dir_operations = { ++ .read = generic_read_dir, ++ .readdir = proc_virtual_readdir, ++}; ++ ++static struct inode_operations proc_virtual_dir_inode_operations = { ++ .getattr = proc_virtual_getattr, ++ .lookup = proc_virtual_lookup, ++}; ++ ++ ++ ++ ++ ++int proc_virtnet_readdir(struct file *filp, ++ void *dirent, filldir_t filldir) ++{ ++ struct dentry *dentry = filp->f_dentry; ++ struct inode *inode = dentry->d_inode; ++ struct vs_entry *p = nx_virtnet_stuff; ++ int size = sizeof(nx_virtnet_stuff) / sizeof(struct vs_entry); ++ int pos, index; ++ unsigned int nid_array[PROC_MAXVIDS]; ++ char buf[PROC_NUMBUF]; ++ unsigned int nr_nids, i; ++ u64 ino; ++ ++ pos = filp->f_pos; ++ switch (pos) { ++ case 0: ++ ino = inode->i_ino; ++ if (filldir(dirent, ".", 1, 
pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ case 1: ++ ino = parent_ino(dentry); ++ if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) ++ goto out; ++ pos++; ++ /* fall through */ ++ default: ++ index = pos - 2; ++ if (index >= size) ++ goto entries; ++ for (p += index; p->name; p++) { ++ if (proc_fill_cache(filp, dirent, filldir, p->name, p->len, ++ vs_proc_instantiate, 0, p)) ++ goto out; ++ pos++; ++ } ++ entries: ++ index = pos - size; ++ p = &nx_virtnet_stuff[size - 1]; ++ nr_nids = get_nid_list(index, nid_array, PROC_MAXVIDS); ++ for (i = 0; i < nr_nids; i++) { ++ int n, nid = nid_array[i]; ++ unsigned int j = PROC_NUMBUF; ++ ++ n = nid; ++ do ++ buf[--j] = '0' + (n % 10); ++ while (n /= 10); ++ ++ if (proc_fill_cache(filp, dirent, filldir, ++ buf + j, PROC_NUMBUF - j, ++ vs_proc_instantiate, nid, p)) ++ goto out; ++ pos++; ++ } ++ } ++out: ++ filp->f_pos = pos; ++ return 0; ++} ++ ++static int proc_virtnet_getattr(struct vfsmount *mnt, ++ struct dentry *dentry, struct kstat *stat) ++{ ++ struct inode *inode = dentry->d_inode; ++ ++ generic_fillattr(inode, stat); ++ stat->nlink = 2 + atomic_read(&nx_global_cactive); ++ return 0; ++} ++ ++static struct file_operations proc_virtnet_dir_operations = { ++ .read = generic_read_dir, ++ .readdir = proc_virtnet_readdir, ++}; ++ ++static struct inode_operations proc_virtnet_dir_inode_operations = { ++ .getattr = proc_virtnet_getattr, ++ .lookup = proc_virtnet_lookup, ++}; ++ ++ ++ ++void proc_vx_init(void) ++{ ++ struct proc_dir_entry *ent; ++ ++ ent = proc_mkdir("virtual", 0); ++ if (ent) { ++ ent->proc_fops = &proc_virtual_dir_operations; ++ ent->proc_iops = &proc_virtual_dir_inode_operations; ++ } ++ proc_virtual = ent; ++ ++ ent = proc_mkdir("virtnet", 0); ++ if (ent) { ++ ent->proc_fops = &proc_virtnet_dir_operations; ++ ent->proc_iops = &proc_virtnet_dir_inode_operations; ++ } ++ proc_virtnet = ent; ++} ++ ++ ++ ++ ++/* per pid info */ ++ ++ ++int proc_pid_vx_info(struct task_struct *p, char *buffer) ++{ ++ struct vx_info *vxi; ++ char *orig = buffer; ++ ++ buffer += sprintf(buffer, "XID:\t%d\n", vx_task_xid(p)); ++ ++ vxi = task_get_vx_info(p); ++ if (!vxi) ++ goto out; ++ ++ buffer += sprintf(buffer, "BCaps:\t"); ++ buffer = print_cap_t(buffer, &vxi->vx_bcaps); ++ buffer += sprintf(buffer, "\n"); ++ buffer += sprintf(buffer, "CCaps:\t%016llx\n", ++ (unsigned long long)vxi->vx_ccaps); ++ buffer += sprintf(buffer, "CFlags:\t%016llx\n", ++ (unsigned long long)vxi->vx_flags); ++ buffer += sprintf(buffer, "CIPid:\t%d\n", vxi->vx_initpid); ++ ++ put_vx_info(vxi); ++out: ++ return buffer - orig; ++} ++ ++ ++int proc_pid_nx_info(struct task_struct *p, char *buffer) ++{ ++ struct nx_info *nxi; ++ struct nx_addr_v4 *v4a; ++#ifdef CONFIG_IPV6 ++ struct nx_addr_v6 *v6a; ++#endif ++ char *orig = buffer; ++ int i; ++ ++ buffer += sprintf(buffer, "NID:\t%d\n", nx_task_nid(p)); ++ ++ nxi = task_get_nx_info(p); ++ if (!nxi) ++ goto out; ++ ++ buffer += sprintf(buffer, "NCaps:\t%016llx\n", ++ (unsigned long long)nxi->nx_ncaps); ++ buffer += sprintf(buffer, "NFlags:\t%016llx\n", ++ (unsigned long long)nxi->nx_flags); ++ ++ buffer += sprintf(buffer, ++ "V4Root[bcast]:\t" NIPQUAD_FMT "\n", ++ NIPQUAD(nxi->v4_bcast.s_addr)); ++ buffer += sprintf (buffer, ++ "V4Root[lback]:\t" NIPQUAD_FMT "\n", ++ NIPQUAD(nxi->v4_lback.s_addr)); ++ if (!NX_IPV4(nxi)) ++ goto skip_v4; ++ for (i = 0, v4a = &nxi->v4; v4a; i++, v4a = v4a->next) ++ buffer += sprintf(buffer, "V4Root[%d]:\t" NXAV4_FMT "\n", ++ i, NXAV4(v4a)); ++skip_v4: ++#ifdef 
CONFIG_IPV6 ++ if (!NX_IPV6(nxi)) ++ goto skip_v6; ++ for (i = 0, v6a = &nxi->v6; v6a; i++, v6a = v6a->next) ++ buffer += sprintf(buffer, "V6Root[%d]:\t" NXAV6_FMT "\n", ++ i, NXAV6(v6a)); ++skip_v6: ++#endif ++ put_nx_info(nxi); ++out: ++ return buffer - orig; ++} ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/sched.c linux-3.3.8-vs2.3.3.4/kernel/vserver/sched.c +--- linux-3.3.8/kernel/vserver/sched.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/sched.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,82 @@ ++/* ++ * linux/kernel/vserver/sched.c ++ * ++ * Virtual Server: Scheduler Support ++ * ++ * Copyright (C) 2004-2010 Herbert Pötzl ++ * ++ * V0.01 adapted Sam Vilains version to 2.6.3 ++ * V0.02 removed legacy interface ++ * V0.03 changed vcmds to vxi arg ++ * V0.04 removed older and legacy interfaces ++ * V0.05 removed scheduler code/commands ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include ++ ++ ++void vx_update_sched_param(struct _vx_sched *sched, ++ struct _vx_sched_pc *sched_pc) ++{ ++ sched_pc->prio_bias = sched->prio_bias; ++} ++ ++static int do_set_prio_bias(struct vx_info *vxi, struct vcmd_prio_bias *data) ++{ ++ int cpu; ++ ++ if (data->prio_bias > MAX_PRIO_BIAS) ++ data->prio_bias = MAX_PRIO_BIAS; ++ if (data->prio_bias < MIN_PRIO_BIAS) ++ data->prio_bias = MIN_PRIO_BIAS; ++ ++ if (data->cpu_id != ~0) { ++ vxi->sched.update = cpumask_of_cpu(data->cpu_id); ++ cpus_and(vxi->sched.update, cpu_online_map, ++ vxi->sched.update); ++ } else ++ vxi->sched.update = cpu_online_map; ++ ++ for_each_cpu_mask(cpu, vxi->sched.update) ++ vx_update_sched_param(&vxi->sched, ++ &vx_per_cpu(vxi, sched_pc, cpu)); ++ return 0; ++} ++ ++int vc_set_prio_bias(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_prio_bias vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return do_set_prio_bias(vxi, &vc_data); ++} ++ ++int vc_get_prio_bias(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_prio_bias vc_data; ++ struct _vx_sched_pc *pcd; ++ int cpu; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ cpu = vc_data.cpu_id; ++ ++ if (!cpu_possible(cpu)) ++ return -EINVAL; ++ ++ pcd = &vx_per_cpu(vxi, sched_pc, cpu); ++ vc_data.prio_bias = pcd->prio_bias; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ return -EFAULT; ++ return 0; ++} ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/sched_init.h linux-3.3.8-vs2.3.3.4/kernel/vserver/sched_init.h +--- linux-3.3.8/kernel/vserver/sched_init.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/sched_init.h 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,27 @@ ++ ++static inline void vx_info_init_sched(struct _vx_sched *sched) ++{ ++ /* scheduling; hard code starting values as constants */ ++ sched->prio_bias = 0; ++} ++ ++static inline ++void vx_info_init_sched_pc(struct _vx_sched_pc *sched_pc, int cpu) ++{ ++ sched_pc->prio_bias = 0; ++ ++ sched_pc->user_ticks = 0; ++ sched_pc->sys_ticks = 0; ++ sched_pc->hold_ticks = 0; ++} ++ ++static inline void vx_info_exit_sched(struct _vx_sched *sched) ++{ ++ return; ++} ++ ++static inline ++void vx_info_exit_sched_pc(struct _vx_sched_pc *sched_pc, int cpu) ++{ ++ return; ++} +diff -NurpP --minimal linux-3.3.8/kernel/vserver/sched_proc.h linux-3.3.8-vs2.3.3.4/kernel/vserver/sched_proc.h +--- linux-3.3.8/kernel/vserver/sched_proc.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/sched_proc.h 2012-02-24 
03:55:07.000000000 +0100 +@@ -0,0 +1,32 @@ ++#ifndef _VX_SCHED_PROC_H ++#define _VX_SCHED_PROC_H ++ ++ ++static inline ++int vx_info_proc_sched(struct _vx_sched *sched, char *buffer) ++{ ++ int length = 0; ++ ++ length += sprintf(buffer, ++ "PrioBias:\t%8d\n", ++ sched->prio_bias); ++ return length; ++} ++ ++static inline ++int vx_info_proc_sched_pc(struct _vx_sched_pc *sched_pc, ++ char *buffer, int cpu) ++{ ++ int length = 0; ++ ++ length += sprintf(buffer + length, ++ "cpu %d: %lld %lld %lld", cpu, ++ (unsigned long long)sched_pc->user_ticks, ++ (unsigned long long)sched_pc->sys_ticks, ++ (unsigned long long)sched_pc->hold_ticks); ++ length += sprintf(buffer + length, ++ " %d\n", sched_pc->prio_bias); ++ return length; ++} ++ ++#endif /* _VX_SCHED_PROC_H */ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/signal.c linux-3.3.8-vs2.3.3.4/kernel/vserver/signal.c +--- linux-3.3.8/kernel/vserver/signal.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/signal.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,134 @@ ++/* ++ * linux/kernel/vserver/signal.c ++ * ++ * Virtual Server: Signal Support ++ * ++ * Copyright (C) 2003-2007 Herbert Pötzl ++ * ++ * V0.01 broken out from vcontext V0.05 ++ * V0.02 changed vcmds to vxi arg ++ * V0.03 adjusted siginfo for kill ++ * ++ */ ++ ++#include ++ ++#include ++#include ++#include ++ ++ ++int vx_info_kill(struct vx_info *vxi, int pid, int sig) ++{ ++ int retval, count = 0; ++ struct task_struct *p; ++ struct siginfo *sip = SEND_SIG_PRIV; ++ ++ retval = -ESRCH; ++ vxdprintk(VXD_CBIT(misc, 4), ++ "vx_info_kill(%p[#%d],%d,%d)*", ++ vxi, vxi->vx_id, pid, sig); ++ read_lock(&tasklist_lock); ++ switch (pid) { ++ case 0: ++ case -1: ++ for_each_process(p) { ++ int err = 0; ++ ++ if (vx_task_xid(p) != vxi->vx_id || p->pid <= 1 || ++ (pid && vxi->vx_initpid == p->pid)) ++ continue; ++ ++ err = group_send_sig_info(sig, sip, p); ++ ++count; ++ if (err != -EPERM) ++ retval = err; ++ } ++ break; ++ ++ case 1: ++ if (vxi->vx_initpid) { ++ pid = vxi->vx_initpid; ++ /* for now, only SIGINT to private init ... */ ++ if (!vx_info_flags(vxi, VXF_STATE_ADMIN, 0) && ++ /* ... 
as long as there are tasks left */ ++ (atomic_read(&vxi->vx_tasks) > 1)) ++ sig = SIGINT; ++ } ++ /* fallthrough */ ++ default: ++ rcu_read_lock(); ++ p = find_task_by_real_pid(pid); ++ rcu_read_unlock(); ++ if (p) { ++ if (vx_task_xid(p) == vxi->vx_id) ++ retval = group_send_sig_info(sig, sip, p); ++ } ++ break; ++ } ++ read_unlock(&tasklist_lock); ++ vxdprintk(VXD_CBIT(misc, 4), ++ "vx_info_kill(%p[#%d],%d,%d,%ld) = %d", ++ vxi, vxi->vx_id, pid, sig, (long)sip, retval); ++ return retval; ++} ++ ++int vc_ctx_kill(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_ctx_kill_v0 vc_data; ++ ++ if (copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ /* special check to allow guest shutdown */ ++ if (!vx_info_flags(vxi, VXF_STATE_ADMIN, 0) && ++ /* forbid killall pid=0 when init is present */ ++ (((vc_data.pid < 1) && vxi->vx_initpid) || ++ (vc_data.pid > 1))) ++ return -EACCES; ++ ++ return vx_info_kill(vxi, vc_data.pid, vc_data.sig); ++} ++ ++ ++static int __wait_exit(struct vx_info *vxi) ++{ ++ DECLARE_WAITQUEUE(wait, current); ++ int ret = 0; ++ ++ add_wait_queue(&vxi->vx_wait, &wait); ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++wait: ++ if (vx_info_state(vxi, ++ VXS_SHUTDOWN | VXS_HASHED | VXS_HELPER) == VXS_SHUTDOWN) ++ goto out; ++ if (signal_pending(current)) { ++ ret = -ERESTARTSYS; ++ goto out; ++ } ++ schedule(); ++ goto wait; ++ ++out: ++ set_current_state(TASK_RUNNING); ++ remove_wait_queue(&vxi->vx_wait, &wait); ++ return ret; ++} ++ ++ ++ ++int vc_wait_exit(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_wait_exit_v0 vc_data; ++ int ret; ++ ++ ret = __wait_exit(vxi); ++ vc_data.reboot_cmd = vxi->reboot_cmd; ++ vc_data.exit_code = vxi->exit_code; ++ ++ if (copy_to_user(data, &vc_data, sizeof(vc_data))) ++ ret = -EFAULT; ++ return ret; ++} ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/space.c linux-3.3.8-vs2.3.3.4/kernel/vserver/space.c +--- linux-3.3.8/kernel/vserver/space.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/space.c 2012-02-24 17:01:40.000000000 +0100 +@@ -0,0 +1,436 @@ ++/* ++ * linux/kernel/vserver/space.c ++ * ++ * Virtual Server: Context Space Support ++ * ++ * Copyright (C) 2003-2010 Herbert Pötzl ++ * ++ * V0.01 broken out from context.c 0.07 ++ * V0.02 added task locking for namespace ++ * V0.03 broken out vx_enter_namespace ++ * V0.04 added *space support and commands ++ * V0.05 added credential support ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++atomic_t vs_global_nsproxy = ATOMIC_INIT(0); ++atomic_t vs_global_fs = ATOMIC_INIT(0); ++atomic_t vs_global_mnt_ns = ATOMIC_INIT(0); ++atomic_t vs_global_uts_ns = ATOMIC_INIT(0); ++atomic_t vs_global_user_ns = ATOMIC_INIT(0); ++atomic_t vs_global_pid_ns = ATOMIC_INIT(0); ++ ++ ++/* namespace functions */ ++ ++#include ++#include ++#include ++#include ++#include ++#include "../fs/mount.h" ++ ++ ++static const struct vcmd_space_mask_v1 space_mask_v0 = { ++ .mask = CLONE_FS | ++ CLONE_NEWNS | ++#ifdef CONFIG_UTS_NS ++ CLONE_NEWUTS | ++#endif ++#ifdef CONFIG_IPC_NS ++ CLONE_NEWIPC | ++#endif ++#ifdef CONFIG_USER_NS ++ CLONE_NEWUSER | ++#endif ++ 0 ++}; ++ ++static const struct vcmd_space_mask_v1 space_mask = { ++ .mask = CLONE_FS | ++ CLONE_NEWNS | ++#ifdef CONFIG_UTS_NS ++ CLONE_NEWUTS | ++#endif ++#ifdef CONFIG_IPC_NS ++ CLONE_NEWIPC | ++#endif ++#ifdef CONFIG_USER_NS ++ CLONE_NEWUSER | ++#endif ++#ifdef CONFIG_PID_NS ++ CLONE_NEWPID | ++#endif ++#ifdef 
CONFIG_NET_NS ++ CLONE_NEWNET | ++#endif ++ 0 ++}; ++ ++static const struct vcmd_space_mask_v1 default_space_mask = { ++ .mask = CLONE_FS | ++ CLONE_NEWNS | ++#ifdef CONFIG_UTS_NS ++ CLONE_NEWUTS | ++#endif ++#ifdef CONFIG_IPC_NS ++ CLONE_NEWIPC | ++#endif ++#ifdef CONFIG_USER_NS ++ CLONE_NEWUSER | ++#endif ++#ifdef CONFIG_PID_NS ++// CLONE_NEWPID | ++#endif ++ 0 ++}; ++ ++/* ++ * build a new nsproxy mix ++ * assumes that both proxies are 'const' ++ * does not touch nsproxy refcounts ++ * will hold a reference on the result. ++ */ ++ ++struct nsproxy *vs_mix_nsproxy(struct nsproxy *old_nsproxy, ++ struct nsproxy *new_nsproxy, unsigned long mask) ++{ ++ struct mnt_namespace *old_ns; ++ struct uts_namespace *old_uts; ++ struct ipc_namespace *old_ipc; ++#ifdef CONFIG_PID_NS ++ struct pid_namespace *old_pid; ++#endif ++#ifdef CONFIG_NET_NS ++ struct net *old_net; ++#endif ++ struct nsproxy *nsproxy; ++ ++ nsproxy = copy_nsproxy(old_nsproxy); ++ if (!nsproxy) ++ goto out; ++ ++ if (mask & CLONE_NEWNS) { ++ old_ns = nsproxy->mnt_ns; ++ nsproxy->mnt_ns = new_nsproxy->mnt_ns; ++ if (nsproxy->mnt_ns) ++ get_mnt_ns(nsproxy->mnt_ns); ++ } else ++ old_ns = NULL; ++ ++ if (mask & CLONE_NEWUTS) { ++ old_uts = nsproxy->uts_ns; ++ nsproxy->uts_ns = new_nsproxy->uts_ns; ++ if (nsproxy->uts_ns) ++ get_uts_ns(nsproxy->uts_ns); ++ } else ++ old_uts = NULL; ++ ++ if (mask & CLONE_NEWIPC) { ++ old_ipc = nsproxy->ipc_ns; ++ nsproxy->ipc_ns = new_nsproxy->ipc_ns; ++ if (nsproxy->ipc_ns) ++ get_ipc_ns(nsproxy->ipc_ns); ++ } else ++ old_ipc = NULL; ++ ++#ifdef CONFIG_PID_NS ++ if (mask & CLONE_NEWPID) { ++ old_pid = nsproxy->pid_ns; ++ nsproxy->pid_ns = new_nsproxy->pid_ns; ++ if (nsproxy->pid_ns) ++ get_pid_ns(nsproxy->pid_ns); ++ } else ++ old_pid = NULL; ++#endif ++#ifdef CONFIG_NET_NS ++ if (mask & CLONE_NEWNET) { ++ old_net = nsproxy->net_ns; ++ nsproxy->net_ns = new_nsproxy->net_ns; ++ if (nsproxy->net_ns) ++ get_net(nsproxy->net_ns); ++ } else ++ old_net = NULL; ++#endif ++ if (old_ns) ++ put_mnt_ns(old_ns); ++ if (old_uts) ++ put_uts_ns(old_uts); ++ if (old_ipc) ++ put_ipc_ns(old_ipc); ++#ifdef CONFIG_PID_NS ++ if (old_pid) ++ put_pid_ns(old_pid); ++#endif ++#ifdef CONFIG_NET_NS ++ if (old_net) ++ put_net(old_net); ++#endif ++out: ++ return nsproxy; ++} ++ ++ ++/* ++ * merge two nsproxy structs into a new one. ++ * will hold a reference on the result. ++ */ ++ ++static inline ++struct nsproxy *__vs_merge_nsproxy(struct nsproxy *old, ++ struct nsproxy *proxy, unsigned long mask) ++{ ++ struct nsproxy null_proxy = { .mnt_ns = NULL }; ++ ++ if (!proxy) ++ return NULL; ++ ++ if (mask) { ++ /* vs_mix_nsproxy returns with reference */ ++ return vs_mix_nsproxy(old ? 
old : &null_proxy, ++ proxy, mask); ++ } ++ get_nsproxy(proxy); ++ return proxy; ++} ++ ++ ++int vx_enter_space(struct vx_info *vxi, unsigned long mask, unsigned index) ++{ ++ struct nsproxy *proxy, *proxy_cur, *proxy_new; ++ struct fs_struct *fs_cur, *fs = NULL; ++ struct _vx_space *space; ++ int ret, kill = 0; ++ ++ vxdprintk(VXD_CBIT(space, 8), "vx_enter_space(%p[#%u],0x%08lx,%d)", ++ vxi, vxi->vx_id, mask, index); ++ ++ if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0)) ++ return -EACCES; ++ ++ if (index >= VX_SPACES) ++ return -EINVAL; ++ ++ space = &vxi->space[index]; ++ ++ if (!mask) ++ mask = space->vx_nsmask; ++ ++ if ((mask & space->vx_nsmask) != mask) ++ return -EINVAL; ++ ++ if (mask & CLONE_FS) { ++ fs = copy_fs_struct(space->vx_fs); ++ if (!fs) ++ return -ENOMEM; ++ } ++ proxy = space->vx_nsproxy; ++ ++ vxdprintk(VXD_CBIT(space, 9), ++ "vx_enter_space(%p[#%u],0x%08lx,%d) -> (%p,%p)", ++ vxi, vxi->vx_id, mask, index, proxy, fs); ++ ++ task_lock(current); ++ fs_cur = current->fs; ++ ++ if (mask & CLONE_FS) { ++ spin_lock(&fs_cur->lock); ++ current->fs = fs; ++ kill = !--fs_cur->users; ++ spin_unlock(&fs_cur->lock); ++ } ++ ++ proxy_cur = current->nsproxy; ++ get_nsproxy(proxy_cur); ++ task_unlock(current); ++ ++ if (kill) ++ free_fs_struct(fs_cur); ++ ++ proxy_new = __vs_merge_nsproxy(proxy_cur, proxy, mask); ++ if (IS_ERR(proxy_new)) { ++ ret = PTR_ERR(proxy_new); ++ goto out_put; ++ } ++ ++ proxy_new = xchg(¤t->nsproxy, proxy_new); ++ ++ if (mask & CLONE_NEWUSER) { ++ struct cred *cred; ++ ++ vxdprintk(VXD_CBIT(space, 10), ++ "vx_enter_space(%p[#%u],%p) cred (%p,%p)", ++ vxi, vxi->vx_id, space->vx_cred, ++ current->real_cred, current->cred); ++ ++ if (space->vx_cred) { ++ cred = __prepare_creds(space->vx_cred); ++ if (cred) ++ commit_creds(cred); ++ } ++ } ++ ++ ret = 0; ++ ++ if (proxy_new) ++ put_nsproxy(proxy_new); ++out_put: ++ if (proxy_cur) ++ put_nsproxy(proxy_cur); ++ return ret; ++} ++ ++ ++int vx_set_space(struct vx_info *vxi, unsigned long mask, unsigned index) ++{ ++ struct nsproxy *proxy_vxi, *proxy_cur, *proxy_new; ++ struct fs_struct *fs_vxi, *fs; ++ struct _vx_space *space; ++ int ret, kill = 0; ++ ++ vxdprintk(VXD_CBIT(space, 8), "vx_set_space(%p[#%u],0x%08lx,%d)", ++ vxi, vxi->vx_id, mask, index); ++ ++ if ((mask & space_mask.mask) != mask) ++ return -EINVAL; ++ ++ if (index >= VX_SPACES) ++ return -EINVAL; ++ ++ space = &vxi->space[index]; ++ ++ proxy_vxi = space->vx_nsproxy; ++ fs_vxi = space->vx_fs; ++ ++ if (mask & CLONE_FS) { ++ fs = copy_fs_struct(current->fs); ++ if (!fs) ++ return -ENOMEM; ++ } ++ ++ task_lock(current); ++ ++ if (mask & CLONE_FS) { ++ spin_lock(&fs_vxi->lock); ++ space->vx_fs = fs; ++ kill = !--fs_vxi->users; ++ spin_unlock(&fs_vxi->lock); ++ } ++ ++ proxy_cur = current->nsproxy; ++ get_nsproxy(proxy_cur); ++ task_unlock(current); ++ ++ if (kill) ++ free_fs_struct(fs_vxi); ++ ++ proxy_new = __vs_merge_nsproxy(proxy_vxi, proxy_cur, mask); ++ if (IS_ERR(proxy_new)) { ++ ret = PTR_ERR(proxy_new); ++ goto out_put; ++ } ++ ++ proxy_new = xchg(&space->vx_nsproxy, proxy_new); ++ space->vx_nsmask |= mask; ++ ++ if (mask & CLONE_NEWUSER) { ++ struct cred *cred; ++ ++ vxdprintk(VXD_CBIT(space, 10), ++ "vx_set_space(%p[#%u],%p) cred (%p,%p)", ++ vxi, vxi->vx_id, space->vx_cred, ++ current->real_cred, current->cred); ++ ++ cred = prepare_creds(); ++ cred = (struct cred *)xchg(&space->vx_cred, cred); ++ if (cred) ++ abort_creds(cred); ++ } ++ ++ ret = 0; ++ ++ if (proxy_new) ++ put_nsproxy(proxy_new); ++out_put: ++ if (proxy_cur) ++ 
put_nsproxy(proxy_cur); ++ return ret; ++} ++ ++ ++int vc_enter_space_v1(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_space_mask_v1 vc_data = { .mask = 0 }; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return vx_enter_space(vxi, vc_data.mask, 0); ++} ++ ++int vc_enter_space(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_space_mask_v2 vc_data = { .mask = 0 }; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ if (vc_data.index >= VX_SPACES) ++ return -EINVAL; ++ ++ return vx_enter_space(vxi, vc_data.mask, vc_data.index); ++} ++ ++int vc_set_space_v1(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_space_mask_v1 vc_data = { .mask = 0 }; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ return vx_set_space(vxi, vc_data.mask, 0); ++} ++ ++int vc_set_space(struct vx_info *vxi, void __user *data) ++{ ++ struct vcmd_space_mask_v2 vc_data = { .mask = 0 }; ++ ++ if (data && copy_from_user(&vc_data, data, sizeof(vc_data))) ++ return -EFAULT; ++ ++ if (vc_data.index >= VX_SPACES) ++ return -EINVAL; ++ ++ return vx_set_space(vxi, vc_data.mask, vc_data.index); ++} ++ ++int vc_get_space_mask(void __user *data, int type) ++{ ++ const struct vcmd_space_mask_v1 *mask; ++ ++ if (type == 0) ++ mask = &space_mask_v0; ++ else if (type == 1) ++ mask = &space_mask; ++ else ++ mask = &default_space_mask; ++ ++ vxdprintk(VXD_CBIT(space, 10), ++ "vc_get_space_mask(%d) = %08llx", type, mask->mask); ++ ++ if (copy_to_user(data, mask, sizeof(*mask))) ++ return -EFAULT; ++ return 0; ++} ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/switch.c linux-3.3.8-vs2.3.3.4/kernel/vserver/switch.c +--- linux-3.3.8/kernel/vserver/switch.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/switch.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,556 @@ ++/* ++ * linux/kernel/vserver/switch.c ++ * ++ * Virtual Server: Syscall Switch ++ * ++ * Copyright (C) 2003-2011 Herbert Pötzl ++ * ++ * V0.01 syscall switch ++ * V0.02 added signal to context ++ * V0.03 added rlimit functions ++ * V0.04 added iattr, task/xid functions ++ * V0.05 added debug/history stuff ++ * V0.06 added compat32 layer ++ * V0.07 vcmd args and perms ++ * V0.08 added status commands ++ * V0.09 added tag commands ++ * V0.10 added oom bias ++ * V0.11 added device commands ++ * V0.12 added warn mask ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "vci_config.h" ++ ++ ++static inline ++int vc_get_version(uint32_t id) ++{ ++ return VCI_VERSION; ++} ++ ++static inline ++int vc_get_vci(uint32_t id) ++{ ++ return vci_kernel_config(); ++} ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++ ++#ifdef CONFIG_COMPAT ++#define __COMPAT(name, id, data, compat) \ ++ (compat) ? name ## _x32(id, data) : name(id, data) ++#define __COMPAT_NO_ID(name, data, compat) \ ++ (compat) ? 
name ## _x32(data) : name(data) ++#else ++#define __COMPAT(name, id, data, compat) \ ++ name(id, data) ++#define __COMPAT_NO_ID(name, data, compat) \ ++ name(data) ++#endif ++ ++ ++static inline ++long do_vcmd(uint32_t cmd, uint32_t id, ++ struct vx_info *vxi, struct nx_info *nxi, ++ void __user *data, int compat) ++{ ++ switch (cmd) { ++ ++ case VCMD_get_version: ++ return vc_get_version(id); ++ case VCMD_get_vci: ++ return vc_get_vci(id); ++ ++ case VCMD_task_xid: ++ return vc_task_xid(id); ++ case VCMD_vx_info: ++ return vc_vx_info(vxi, data); ++ ++ case VCMD_task_nid: ++ return vc_task_nid(id); ++ case VCMD_nx_info: ++ return vc_nx_info(nxi, data); ++ ++ case VCMD_task_tag: ++ return vc_task_tag(id); ++ ++ case VCMD_set_space_v1: ++ return vc_set_space_v1(vxi, data); ++ /* this is version 2 */ ++ case VCMD_set_space: ++ return vc_set_space(vxi, data); ++ ++ case VCMD_get_space_mask_v0: ++ return vc_get_space_mask(data, 0); ++ /* this is version 1 */ ++ case VCMD_get_space_mask: ++ return vc_get_space_mask(data, 1); ++ ++ case VCMD_get_space_default: ++ return vc_get_space_mask(data, -1); ++ ++ case VCMD_set_umask: ++ return vc_set_umask(vxi, data); ++ ++ case VCMD_get_umask: ++ return vc_get_umask(vxi, data); ++ ++ case VCMD_set_wmask: ++ return vc_set_wmask(vxi, data); ++ ++ case VCMD_get_wmask: ++ return vc_get_wmask(vxi, data); ++#ifdef CONFIG_IA32_EMULATION ++ case VCMD_get_rlimit: ++ return __COMPAT(vc_get_rlimit, vxi, data, compat); ++ case VCMD_set_rlimit: ++ return __COMPAT(vc_set_rlimit, vxi, data, compat); ++#else ++ case VCMD_get_rlimit: ++ return vc_get_rlimit(vxi, data); ++ case VCMD_set_rlimit: ++ return vc_set_rlimit(vxi, data); ++#endif ++ case VCMD_get_rlimit_mask: ++ return vc_get_rlimit_mask(id, data); ++ case VCMD_reset_hits: ++ return vc_reset_hits(vxi, data); ++ case VCMD_reset_minmax: ++ return vc_reset_minmax(vxi, data); ++ ++ case VCMD_get_vhi_name: ++ return vc_get_vhi_name(vxi, data); ++ case VCMD_set_vhi_name: ++ return vc_set_vhi_name(vxi, data); ++ ++ case VCMD_ctx_stat: ++ return vc_ctx_stat(vxi, data); ++ case VCMD_virt_stat: ++ return vc_virt_stat(vxi, data); ++ case VCMD_sock_stat: ++ return vc_sock_stat(vxi, data); ++ case VCMD_rlimit_stat: ++ return vc_rlimit_stat(vxi, data); ++ ++ case VCMD_set_cflags: ++ return vc_set_cflags(vxi, data); ++ case VCMD_get_cflags: ++ return vc_get_cflags(vxi, data); ++ ++ /* this is version 1 */ ++ case VCMD_set_ccaps: ++ return vc_set_ccaps(vxi, data); ++ /* this is version 1 */ ++ case VCMD_get_ccaps: ++ return vc_get_ccaps(vxi, data); ++ case VCMD_set_bcaps: ++ return vc_set_bcaps(vxi, data); ++ case VCMD_get_bcaps: ++ return vc_get_bcaps(vxi, data); ++ ++ case VCMD_set_badness: ++ return vc_set_badness(vxi, data); ++ case VCMD_get_badness: ++ return vc_get_badness(vxi, data); ++ ++ case VCMD_set_nflags: ++ return vc_set_nflags(nxi, data); ++ case VCMD_get_nflags: ++ return vc_get_nflags(nxi, data); ++ ++ case VCMD_set_ncaps: ++ return vc_set_ncaps(nxi, data); ++ case VCMD_get_ncaps: ++ return vc_get_ncaps(nxi, data); ++ ++ case VCMD_set_prio_bias: ++ return vc_set_prio_bias(vxi, data); ++ case VCMD_get_prio_bias: ++ return vc_get_prio_bias(vxi, data); ++ case VCMD_add_dlimit: ++ return __COMPAT(vc_add_dlimit, id, data, compat); ++ case VCMD_rem_dlimit: ++ return __COMPAT(vc_rem_dlimit, id, data, compat); ++ case VCMD_set_dlimit: ++ return __COMPAT(vc_set_dlimit, id, data, compat); ++ case VCMD_get_dlimit: ++ return __COMPAT(vc_get_dlimit, id, data, compat); ++ ++ case VCMD_ctx_kill: ++ return vc_ctx_kill(vxi, 
data); ++ ++ case VCMD_wait_exit: ++ return vc_wait_exit(vxi, data); ++ ++ case VCMD_get_iattr: ++ return __COMPAT_NO_ID(vc_get_iattr, data, compat); ++ case VCMD_set_iattr: ++ return __COMPAT_NO_ID(vc_set_iattr, data, compat); ++ ++ case VCMD_fget_iattr: ++ return vc_fget_iattr(id, data); ++ case VCMD_fset_iattr: ++ return vc_fset_iattr(id, data); ++ ++ case VCMD_enter_space_v0: ++ return vc_enter_space_v1(vxi, NULL); ++ case VCMD_enter_space_v1: ++ return vc_enter_space_v1(vxi, data); ++ /* this is version 2 */ ++ case VCMD_enter_space: ++ return vc_enter_space(vxi, data); ++ ++ case VCMD_ctx_create_v0: ++ return vc_ctx_create(id, NULL); ++ case VCMD_ctx_create: ++ return vc_ctx_create(id, data); ++ case VCMD_ctx_migrate_v0: ++ return vc_ctx_migrate(vxi, NULL); ++ case VCMD_ctx_migrate: ++ return vc_ctx_migrate(vxi, data); ++ ++ case VCMD_net_create_v0: ++ return vc_net_create(id, NULL); ++ case VCMD_net_create: ++ return vc_net_create(id, data); ++ case VCMD_net_migrate: ++ return vc_net_migrate(nxi, data); ++ ++ case VCMD_tag_migrate: ++ return vc_tag_migrate(id); ++ ++ case VCMD_net_add: ++ return vc_net_add(nxi, data); ++ case VCMD_net_remove: ++ return vc_net_remove(nxi, data); ++ ++ case VCMD_net_add_ipv4_v1: ++ return vc_net_add_ipv4_v1(nxi, data); ++ /* this is version 2 */ ++ case VCMD_net_add_ipv4: ++ return vc_net_add_ipv4(nxi, data); ++ ++ case VCMD_net_rem_ipv4_v1: ++ return vc_net_rem_ipv4_v1(nxi, data); ++ /* this is version 2 */ ++ case VCMD_net_rem_ipv4: ++ return vc_net_rem_ipv4(nxi, data); ++#ifdef CONFIG_IPV6 ++ case VCMD_net_add_ipv6: ++ return vc_net_add_ipv6(nxi, data); ++ case VCMD_net_remove_ipv6: ++ return vc_net_remove_ipv6(nxi, data); ++#endif ++/* case VCMD_add_match_ipv4: ++ return vc_add_match_ipv4(nxi, data); ++ case VCMD_get_match_ipv4: ++ return vc_get_match_ipv4(nxi, data); ++#ifdef CONFIG_IPV6 ++ case VCMD_add_match_ipv6: ++ return vc_add_match_ipv6(nxi, data); ++ case VCMD_get_match_ipv6: ++ return vc_get_match_ipv6(nxi, data); ++#endif */ ++ ++#ifdef CONFIG_VSERVER_DEVICE ++ case VCMD_set_mapping: ++ return __COMPAT(vc_set_mapping, vxi, data, compat); ++ case VCMD_unset_mapping: ++ return __COMPAT(vc_unset_mapping, vxi, data, compat); ++#endif ++#ifdef CONFIG_VSERVER_HISTORY ++ case VCMD_dump_history: ++ return vc_dump_history(id); ++ case VCMD_read_history: ++ return __COMPAT(vc_read_history, id, data, compat); ++#endif ++ default: ++ vxwprintk_task(1, "unimplemented VCMD_%02d_%d[%d]", ++ VC_CATEGORY(cmd), VC_COMMAND(cmd), VC_VERSION(cmd)); ++ } ++ return -ENOSYS; ++} ++ ++ ++#define __VCMD(vcmd, _perm, _args, _flags) \ ++ case VCMD_ ## vcmd: perm = _perm; \ ++ args = _args; flags = _flags; break ++ ++ ++#define VCA_NONE 0x00 ++#define VCA_VXI 0x01 ++#define VCA_NXI 0x02 ++ ++#define VCF_NONE 0x00 ++#define VCF_INFO 0x01 ++#define VCF_ADMIN 0x02 ++#define VCF_ARES 0x06 /* includes admin */ ++#define VCF_SETUP 0x08 ++ ++#define VCF_ZIDOK 0x10 /* zero id okay */ ++ ++ ++static inline ++long do_vserver(uint32_t cmd, uint32_t id, void __user *data, int compat) ++{ ++ long ret; ++ int permit = -1, state = 0; ++ int perm = -1, args = 0, flags = 0; ++ struct vx_info *vxi = NULL; ++ struct nx_info *nxi = NULL; ++ ++ switch (cmd) { ++ /* unpriviledged commands */ ++ __VCMD(get_version, 0, VCA_NONE, 0); ++ __VCMD(get_vci, 0, VCA_NONE, 0); ++ __VCMD(get_rlimit_mask, 0, VCA_NONE, 0); ++ __VCMD(get_space_mask_v0,0, VCA_NONE, 0); ++ __VCMD(get_space_mask, 0, VCA_NONE, 0); ++ __VCMD(get_space_default,0, VCA_NONE, 0); ++ ++ /* info commands */ ++ __VCMD(task_xid, 
2, VCA_NONE, 0); ++ __VCMD(reset_hits, 2, VCA_VXI, 0); ++ __VCMD(reset_minmax, 2, VCA_VXI, 0); ++ __VCMD(vx_info, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_bcaps, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_ccaps, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_cflags, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_umask, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_wmask, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_badness, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_vhi_name, 3, VCA_VXI, VCF_INFO); ++ __VCMD(get_rlimit, 3, VCA_VXI, VCF_INFO); ++ ++ __VCMD(ctx_stat, 3, VCA_VXI, VCF_INFO); ++ __VCMD(virt_stat, 3, VCA_VXI, VCF_INFO); ++ __VCMD(sock_stat, 3, VCA_VXI, VCF_INFO); ++ __VCMD(rlimit_stat, 3, VCA_VXI, VCF_INFO); ++ ++ __VCMD(task_nid, 2, VCA_NONE, 0); ++ __VCMD(nx_info, 3, VCA_NXI, VCF_INFO); ++ __VCMD(get_ncaps, 3, VCA_NXI, VCF_INFO); ++ __VCMD(get_nflags, 3, VCA_NXI, VCF_INFO); ++ ++ __VCMD(task_tag, 2, VCA_NONE, 0); ++ ++ __VCMD(get_iattr, 2, VCA_NONE, 0); ++ __VCMD(fget_iattr, 2, VCA_NONE, 0); ++ __VCMD(get_dlimit, 3, VCA_NONE, VCF_INFO); ++ __VCMD(get_prio_bias, 3, VCA_VXI, VCF_INFO); ++ ++ /* lower admin commands */ ++ __VCMD(wait_exit, 4, VCA_VXI, VCF_INFO); ++ __VCMD(ctx_create_v0, 5, VCA_NONE, 0); ++ __VCMD(ctx_create, 5, VCA_NONE, 0); ++ __VCMD(ctx_migrate_v0, 5, VCA_VXI, VCF_ADMIN); ++ __VCMD(ctx_migrate, 5, VCA_VXI, VCF_ADMIN); ++ __VCMD(enter_space_v0, 5, VCA_VXI, VCF_ADMIN); ++ __VCMD(enter_space_v1, 5, VCA_VXI, VCF_ADMIN); ++ __VCMD(enter_space, 5, VCA_VXI, VCF_ADMIN); ++ ++ __VCMD(net_create_v0, 5, VCA_NONE, 0); ++ __VCMD(net_create, 5, VCA_NONE, 0); ++ __VCMD(net_migrate, 5, VCA_NXI, VCF_ADMIN); ++ ++ __VCMD(tag_migrate, 5, VCA_NONE, VCF_ADMIN); ++ ++ /* higher admin commands */ ++ __VCMD(ctx_kill, 6, VCA_VXI, VCF_ARES); ++ __VCMD(set_space_v1, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_space, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ ++ __VCMD(set_ccaps, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_bcaps, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_cflags, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_umask, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_wmask, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_badness, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ ++ __VCMD(set_vhi_name, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_rlimit, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_prio_bias, 7, VCA_VXI, VCF_ARES | VCF_SETUP); ++ ++ __VCMD(set_ncaps, 7, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(set_nflags, 7, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_add, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_remove, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_add_ipv4_v1, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_rem_ipv4_v1, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_add_ipv4, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_rem_ipv4, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++#ifdef CONFIG_IPV6 ++ __VCMD(net_add_ipv6, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++ __VCMD(net_remove_ipv6, 8, VCA_NXI, VCF_ARES | VCF_SETUP); ++#endif ++ __VCMD(set_iattr, 7, VCA_NONE, 0); ++ __VCMD(fset_iattr, 7, VCA_NONE, 0); ++ __VCMD(set_dlimit, 7, VCA_NONE, VCF_ARES); ++ __VCMD(add_dlimit, 8, VCA_NONE, VCF_ARES); ++ __VCMD(rem_dlimit, 8, VCA_NONE, VCF_ARES); ++ ++#ifdef CONFIG_VSERVER_DEVICE ++ __VCMD(set_mapping, 8, VCA_VXI, VCF_ARES|VCF_ZIDOK); ++ __VCMD(unset_mapping, 8, VCA_VXI, VCF_ARES|VCF_ZIDOK); ++#endif ++ /* debug level admin commands */ ++#ifdef CONFIG_VSERVER_HISTORY ++ __VCMD(dump_history, 9, VCA_NONE, 0); ++ __VCMD(read_history, 9, VCA_NONE, 0); ++#endif ++ ++ default: ++ perm = -1; ++ } ++ ++ 
vxdprintk(VXD_CBIT(switch, 0), ++ "vc: VCMD_%02d_%d[%d], %d,%p [%d,%d,%x,%x]", ++ VC_CATEGORY(cmd), VC_COMMAND(cmd), ++ VC_VERSION(cmd), id, data, compat, ++ perm, args, flags); ++ ++ ret = -ENOSYS; ++ if (perm < 0) ++ goto out; ++ ++ state = 1; ++ if (!capable(CAP_CONTEXT)) ++ goto out; ++ ++ state = 2; ++ /* moved here from the individual commands */ ++ ret = -EPERM; ++ if ((perm > 1) && !capable(CAP_SYS_ADMIN)) ++ goto out; ++ ++ state = 3; ++ /* vcmd involves resource management */ ++ ret = -EPERM; ++ if ((flags & VCF_ARES) && !capable(CAP_SYS_RESOURCE)) ++ goto out; ++ ++ state = 4; ++ /* various legacy exceptions */ ++ switch (cmd) { ++ /* will go away when spectator is a cap */ ++ case VCMD_ctx_migrate_v0: ++ case VCMD_ctx_migrate: ++ if (id == 1) { ++ current->xid = 1; ++ ret = 1; ++ goto out; ++ } ++ break; ++ ++ /* will go away when spectator is a cap */ ++ case VCMD_net_migrate: ++ if (id == 1) { ++ current->nid = 1; ++ ret = 1; ++ goto out; ++ } ++ break; ++ } ++ ++ /* vcmds are fine by default */ ++ permit = 1; ++ ++ /* admin type vcmds require admin ... */ ++ if (flags & VCF_ADMIN) ++ permit = vx_check(0, VS_ADMIN) ? 1 : 0; ++ ++ /* ... but setup type vcmds override that */ ++ if (!permit && (flags & VCF_SETUP)) ++ permit = vx_flags(VXF_STATE_SETUP, 0) ? 2 : 0; ++ ++ state = 5; ++ ret = -EPERM; ++ if (!permit) ++ goto out; ++ ++ state = 6; ++ if (!id && (flags & VCF_ZIDOK)) ++ goto skip_id; ++ ++ ret = -ESRCH; ++ if (args & VCA_VXI) { ++ vxi = lookup_vx_info(id); ++ if (!vxi) ++ goto out; ++ ++ if ((flags & VCF_ADMIN) && ++ /* special case kill for shutdown */ ++ (cmd != VCMD_ctx_kill) && ++ /* can context be administrated? */ ++ !vx_info_flags(vxi, VXF_STATE_ADMIN, 0)) { ++ ret = -EACCES; ++ goto out_vxi; ++ } ++ } ++ state = 7; ++ if (args & VCA_NXI) { ++ nxi = lookup_nx_info(id); ++ if (!nxi) ++ goto out_vxi; ++ ++ if ((flags & VCF_ADMIN) && ++ /* can context be administrated? 
*/ ++ !nx_info_flags(nxi, NXF_STATE_ADMIN, 0)) { ++ ret = -EACCES; ++ goto out_nxi; ++ } ++ } ++skip_id: ++ state = 8; ++ ret = do_vcmd(cmd, id, vxi, nxi, data, compat); ++ ++out_nxi: ++ if ((args & VCA_NXI) && nxi) ++ put_nx_info(nxi); ++out_vxi: ++ if ((args & VCA_VXI) && vxi) ++ put_vx_info(vxi); ++out: ++ vxdprintk(VXD_CBIT(switch, 1), ++ "vc: VCMD_%02d_%d[%d] = %08lx(%ld) [%d,%d]", ++ VC_CATEGORY(cmd), VC_COMMAND(cmd), ++ VC_VERSION(cmd), ret, ret, state, permit); ++ return ret; ++} ++ ++asmlinkage long ++sys_vserver(uint32_t cmd, uint32_t id, void __user *data) ++{ ++ return do_vserver(cmd, id, data, 0); ++} ++ ++#ifdef CONFIG_COMPAT ++ ++asmlinkage long ++sys32_vserver(uint32_t cmd, uint32_t id, void __user *data) ++{ ++ return do_vserver(cmd, id, data, 1); ++} ++ ++#endif /* CONFIG_COMPAT */ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/sysctl.c linux-3.3.8-vs2.3.3.4/kernel/vserver/sysctl.c +--- linux-3.3.8/kernel/vserver/sysctl.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/sysctl.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,247 @@ ++/* ++ * kernel/vserver/sysctl.c ++ * ++ * Virtual Context Support ++ * ++ * Copyright (C) 2004-2007 Herbert Pötzl ++ * ++ * V0.01 basic structure ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++enum { ++ CTL_DEBUG_ERROR = 0, ++ CTL_DEBUG_SWITCH = 1, ++ CTL_DEBUG_XID, ++ CTL_DEBUG_NID, ++ CTL_DEBUG_TAG, ++ CTL_DEBUG_NET, ++ CTL_DEBUG_LIMIT, ++ CTL_DEBUG_CRES, ++ CTL_DEBUG_DLIM, ++ CTL_DEBUG_QUOTA, ++ CTL_DEBUG_CVIRT, ++ CTL_DEBUG_SPACE, ++ CTL_DEBUG_PERM, ++ CTL_DEBUG_MISC, ++}; ++ ++ ++unsigned int vs_debug_switch = 0; ++unsigned int vs_debug_xid = 0; ++unsigned int vs_debug_nid = 0; ++unsigned int vs_debug_tag = 0; ++unsigned int vs_debug_net = 0; ++unsigned int vs_debug_limit = 0; ++unsigned int vs_debug_cres = 0; ++unsigned int vs_debug_dlim = 0; ++unsigned int vs_debug_quota = 0; ++unsigned int vs_debug_cvirt = 0; ++unsigned int vs_debug_space = 0; ++unsigned int vs_debug_perm = 0; ++unsigned int vs_debug_misc = 0; ++ ++ ++static struct ctl_table_header *vserver_table_header; ++static ctl_table vserver_root_table[]; ++ ++ ++void vserver_register_sysctl(void) ++{ ++ if (!vserver_table_header) { ++ vserver_table_header = register_sysctl_table(vserver_root_table); ++ } ++ ++} ++ ++void vserver_unregister_sysctl(void) ++{ ++ if (vserver_table_header) { ++ unregister_sysctl_table(vserver_table_header); ++ vserver_table_header = NULL; ++ } ++} ++ ++ ++static int proc_dodebug(ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ char tmpbuf[20], *p, c; ++ unsigned int value; ++ size_t left, len; ++ ++ if ((*ppos && !write) || !*lenp) { ++ *lenp = 0; ++ return 0; ++ } ++ ++ left = *lenp; ++ ++ if (write) { ++ if (!access_ok(VERIFY_READ, buffer, left)) ++ return -EFAULT; ++ p = (char *)buffer; ++ while (left && __get_user(c, p) >= 0 && isspace(c)) ++ left--, p++; ++ if (!left) ++ goto done; ++ ++ if (left > sizeof(tmpbuf) - 1) ++ return -EINVAL; ++ if (copy_from_user(tmpbuf, p, left)) ++ return -EFAULT; ++ tmpbuf[left] = '\0'; ++ ++ for (p = tmpbuf, value = 0; '0' <= *p && *p <= '9'; p++, left--) ++ value = 10 * value + (*p - '0'); ++ if (*p && !isspace(*p)) ++ return -EINVAL; ++ while (left && isspace(*p)) ++ left--, p++; ++ *(unsigned int *)table->data = value; ++ } else { ++ if (!access_ok(VERIFY_WRITE, buffer, left)) ++ return -EFAULT; ++ len = sprintf(tmpbuf, "%d", *(unsigned int *)table->data); ++ if (len > left) ++ len = left; ++ if 
(__copy_to_user(buffer, tmpbuf, len)) ++ return -EFAULT; ++ if ((left -= len) > 0) { ++ if (put_user('\n', (char *)buffer + len)) ++ return -EFAULT; ++ left--; ++ } ++ } ++ ++done: ++ *lenp -= left; ++ *ppos += *lenp; ++ return 0; ++} ++ ++static int zero; ++ ++#define CTL_ENTRY(ctl, name) \ ++ { \ ++ .procname = #name, \ ++ .data = &vs_ ## name, \ ++ .maxlen = sizeof(int), \ ++ .mode = 0644, \ ++ .proc_handler = &proc_dodebug, \ ++ .extra1 = &zero, \ ++ .extra2 = &zero, \ ++ } ++ ++static ctl_table vserver_debug_table[] = { ++ CTL_ENTRY(CTL_DEBUG_SWITCH, debug_switch), ++ CTL_ENTRY(CTL_DEBUG_XID, debug_xid), ++ CTL_ENTRY(CTL_DEBUG_NID, debug_nid), ++ CTL_ENTRY(CTL_DEBUG_TAG, debug_tag), ++ CTL_ENTRY(CTL_DEBUG_NET, debug_net), ++ CTL_ENTRY(CTL_DEBUG_LIMIT, debug_limit), ++ CTL_ENTRY(CTL_DEBUG_CRES, debug_cres), ++ CTL_ENTRY(CTL_DEBUG_DLIM, debug_dlim), ++ CTL_ENTRY(CTL_DEBUG_QUOTA, debug_quota), ++ CTL_ENTRY(CTL_DEBUG_CVIRT, debug_cvirt), ++ CTL_ENTRY(CTL_DEBUG_SPACE, debug_space), ++ CTL_ENTRY(CTL_DEBUG_PERM, debug_perm), ++ CTL_ENTRY(CTL_DEBUG_MISC, debug_misc), ++ { 0 } ++}; ++ ++static ctl_table vserver_root_table[] = { ++ { ++ .procname = "vserver", ++ .mode = 0555, ++ .child = vserver_debug_table ++ }, ++ { 0 } ++}; ++ ++ ++static match_table_t tokens = { ++ { CTL_DEBUG_SWITCH, "switch=%x" }, ++ { CTL_DEBUG_XID, "xid=%x" }, ++ { CTL_DEBUG_NID, "nid=%x" }, ++ { CTL_DEBUG_TAG, "tag=%x" }, ++ { CTL_DEBUG_NET, "net=%x" }, ++ { CTL_DEBUG_LIMIT, "limit=%x" }, ++ { CTL_DEBUG_CRES, "cres=%x" }, ++ { CTL_DEBUG_DLIM, "dlim=%x" }, ++ { CTL_DEBUG_QUOTA, "quota=%x" }, ++ { CTL_DEBUG_CVIRT, "cvirt=%x" }, ++ { CTL_DEBUG_SPACE, "space=%x" }, ++ { CTL_DEBUG_PERM, "perm=%x" }, ++ { CTL_DEBUG_MISC, "misc=%x" }, ++ { CTL_DEBUG_ERROR, NULL } ++}; ++ ++#define HANDLE_CASE(id, name, val) \ ++ case CTL_DEBUG_ ## id: \ ++ vs_debug_ ## name = val; \ ++ printk("vs_debug_" #name "=0x%x\n", val); \ ++ break ++ ++ ++static int __init vs_debug_setup(char *str) ++{ ++ char *p; ++ int token; ++ ++ printk("vs_debug_setup(%s)\n", str); ++ while ((p = strsep(&str, ",")) != NULL) { ++ substring_t args[MAX_OPT_ARGS]; ++ unsigned int value; ++ ++ if (!*p) ++ continue; ++ ++ token = match_token(p, tokens, args); ++ value = (token > 0) ? 
simple_strtoul(args[0].from, NULL, 0) : 0; ++ ++ switch (token) { ++ HANDLE_CASE(SWITCH, switch, value); ++ HANDLE_CASE(XID, xid, value); ++ HANDLE_CASE(NID, nid, value); ++ HANDLE_CASE(TAG, tag, value); ++ HANDLE_CASE(NET, net, value); ++ HANDLE_CASE(LIMIT, limit, value); ++ HANDLE_CASE(CRES, cres, value); ++ HANDLE_CASE(DLIM, dlim, value); ++ HANDLE_CASE(QUOTA, quota, value); ++ HANDLE_CASE(CVIRT, cvirt, value); ++ HANDLE_CASE(SPACE, space, value); ++ HANDLE_CASE(PERM, perm, value); ++ HANDLE_CASE(MISC, misc, value); ++ default: ++ return -EINVAL; ++ break; ++ } ++ } ++ return 1; ++} ++ ++__setup("vsdebug=", vs_debug_setup); ++ ++ ++ ++EXPORT_SYMBOL_GPL(vs_debug_switch); ++EXPORT_SYMBOL_GPL(vs_debug_xid); ++EXPORT_SYMBOL_GPL(vs_debug_nid); ++EXPORT_SYMBOL_GPL(vs_debug_net); ++EXPORT_SYMBOL_GPL(vs_debug_limit); ++EXPORT_SYMBOL_GPL(vs_debug_cres); ++EXPORT_SYMBOL_GPL(vs_debug_dlim); ++EXPORT_SYMBOL_GPL(vs_debug_quota); ++EXPORT_SYMBOL_GPL(vs_debug_cvirt); ++EXPORT_SYMBOL_GPL(vs_debug_space); ++EXPORT_SYMBOL_GPL(vs_debug_perm); ++EXPORT_SYMBOL_GPL(vs_debug_misc); ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/tag.c linux-3.3.8-vs2.3.3.4/kernel/vserver/tag.c +--- linux-3.3.8/kernel/vserver/tag.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/tag.c 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,63 @@ ++/* ++ * linux/kernel/vserver/tag.c ++ * ++ * Virtual Server: Shallow Tag Space ++ * ++ * Copyright (C) 2007 Herbert Pötzl ++ * ++ * V0.01 basic implementation ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++ ++ ++int dx_migrate_task(struct task_struct *p, tag_t tag) ++{ ++ if (!p) ++ BUG(); ++ ++ vxdprintk(VXD_CBIT(tag, 5), ++ "dx_migrate_task(%p[#%d],#%d)", p, p->tag, tag); ++ ++ task_lock(p); ++ p->tag = tag; ++ task_unlock(p); ++ ++ vxdprintk(VXD_CBIT(tag, 5), ++ "moved task %p into [#%d]", p, tag); ++ return 0; ++} ++ ++/* vserver syscall commands below here */ ++ ++/* taks xid and vx_info functions */ ++ ++ ++int vc_task_tag(uint32_t id) ++{ ++ tag_t tag; ++ ++ if (id) { ++ struct task_struct *tsk; ++ rcu_read_lock(); ++ tsk = find_task_by_real_pid(id); ++ tag = (tsk) ? 
tsk->tag : -ESRCH; ++ rcu_read_unlock(); ++ } else ++ tag = dx_current_tag(); ++ return tag; ++} ++ ++ ++int vc_tag_migrate(uint32_t tag) ++{ ++ return dx_migrate_task(current, tag & 0xFFFF); ++} ++ ++ +diff -NurpP --minimal linux-3.3.8/kernel/vserver/vci_config.h linux-3.3.8-vs2.3.3.4/kernel/vserver/vci_config.h +--- linux-3.3.8/kernel/vserver/vci_config.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/kernel/vserver/vci_config.h 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,76 @@ ++ ++/* interface version */ ++ ++#define VCI_VERSION 0x00020308 ++ ++ ++enum { ++ VCI_KCBIT_NO_DYNAMIC = 0, ++ ++ VCI_KCBIT_PROC_SECURE = 4, ++ /* VCI_KCBIT_HARDCPU = 5, */ ++ /* VCI_KCBIT_IDLELIMIT = 6, */ ++ /* VCI_KCBIT_IDLETIME = 7, */ ++ ++ VCI_KCBIT_COWBL = 8, ++ VCI_KCBIT_FULLCOWBL = 9, ++ VCI_KCBIT_SPACES = 10, ++ VCI_KCBIT_NETV2 = 11, ++ VCI_KCBIT_MEMCG = 12, ++ ++ VCI_KCBIT_DEBUG = 16, ++ VCI_KCBIT_HISTORY = 20, ++ VCI_KCBIT_TAGGED = 24, ++ VCI_KCBIT_PPTAG = 28, ++ ++ VCI_KCBIT_MORE = 31, ++}; ++ ++ ++static inline uint32_t vci_kernel_config(void) ++{ ++ return ++ (1 << VCI_KCBIT_NO_DYNAMIC) | ++ ++ /* configured features */ ++#ifdef CONFIG_VSERVER_PROC_SECURE ++ (1 << VCI_KCBIT_PROC_SECURE) | ++#endif ++#ifdef CONFIG_VSERVER_COWBL ++ (1 << VCI_KCBIT_COWBL) | ++ (1 << VCI_KCBIT_FULLCOWBL) | ++#endif ++ (1 << VCI_KCBIT_SPACES) | ++ (1 << VCI_KCBIT_NETV2) | ++#ifdef CONFIG_CGROUP_MEM_RES_CTLR ++ (1 << VCI_KCBIT_MEMCG) | ++#endif ++ ++ /* debug options */ ++#ifdef CONFIG_VSERVER_DEBUG ++ (1 << VCI_KCBIT_DEBUG) | ++#endif ++#ifdef CONFIG_VSERVER_HISTORY ++ (1 << VCI_KCBIT_HISTORY) | ++#endif ++ ++ /* inode context tagging */ ++#if defined(CONFIG_TAGGING_NONE) ++ (0 << VCI_KCBIT_TAGGED) | ++#elif defined(CONFIG_TAGGING_UID16) ++ (1 << VCI_KCBIT_TAGGED) | ++#elif defined(CONFIG_TAGGING_GID16) ++ (2 << VCI_KCBIT_TAGGED) | ++#elif defined(CONFIG_TAGGING_ID24) ++ (3 << VCI_KCBIT_TAGGED) | ++#elif defined(CONFIG_TAGGING_INTERN) ++ (4 << VCI_KCBIT_TAGGED) | ++#elif defined(CONFIG_TAGGING_RUNTIME) ++ (5 << VCI_KCBIT_TAGGED) | ++#else ++ (7 << VCI_KCBIT_TAGGED) | ++#endif ++ (1 << VCI_KCBIT_PPTAG) | ++ 0; ++} ++ +diff -NurpP --minimal linux-3.3.8/mm/memcontrol.c linux-3.3.8-vs2.3.3.4/mm/memcontrol.c +--- linux-3.3.8/mm/memcontrol.c 2012-06-08 15:23:47.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/mm/memcontrol.c 2012-05-22 09:49:13.000000000 +0200 +@@ -839,6 +839,31 @@ struct mem_cgroup *mem_cgroup_from_task( + struct mem_cgroup, css); + } + ++u64 mem_cgroup_res_read_u64(struct mem_cgroup *mem, int member) ++{ ++ return res_counter_read_u64(&mem->res, member); ++} ++ ++u64 mem_cgroup_memsw_read_u64(struct mem_cgroup *mem, int member) ++{ ++ return res_counter_read_u64(&mem->memsw, member); ++} ++ ++s64 mem_cgroup_stat_read_cache(struct mem_cgroup *mem) ++{ ++ return mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); ++} ++ ++s64 mem_cgroup_stat_read_anon(struct mem_cgroup *mem) ++{ ++ return mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); ++} ++ ++s64 mem_cgroup_stat_read_mapped(struct mem_cgroup *mem) ++{ ++ return mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); ++} ++ + struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) + { + struct mem_cgroup *memcg = NULL; +diff -NurpP --minimal linux-3.3.8/mm/oom_kill.c linux-3.3.8-vs2.3.3.4/mm/oom_kill.c +--- linux-3.3.8/mm/oom_kill.c 2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/mm/oom_kill.c 2012-02-24 04:15:27.000000000 +0100 +@@ -34,6 +34,8 @@ + #include + #include + #include ++#include ++#include + + #define 
CREATE_TRACE_POINTS + #include +@@ -154,11 +156,18 @@ struct task_struct *find_lock_task_mm(st + static bool oom_unkillable_task(struct task_struct *p, + const struct mem_cgroup *memcg, const nodemask_t *nodemask) + { +- if (is_global_init(p)) ++ unsigned xid = vx_current_xid(); ++ ++ /* skip the init task, global and per guest */ ++ if (task_is_init(p)) + return true; + if (p->flags & PF_KTHREAD) + return true; + ++ /* skip other guest and host processes if oom in guest */ ++ if (xid && vx_task_xid(p) != xid) ++ return true; ++ + /* When mem_cgroup_out_of_memory() and p is not member of the group */ + if (memcg && !task_in_mem_cgroup(p, memcg)) + return true; +@@ -446,8 +455,8 @@ static int oom_kill_task(struct task_str + /* mm cannot be safely dereferenced after task_unlock(p) */ + mm = p->mm; + +- pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", +- task_pid_nr(p), p->comm, K(p->mm->total_vm), ++ pr_err("Killed process %d:#%u (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", ++ task_pid_nr(p), p->xid, p->comm, K(p->mm->total_vm), + K(get_mm_counter(p->mm, MM_ANONPAGES)), + K(get_mm_counter(p->mm, MM_FILEPAGES))); + task_unlock(p); +@@ -505,8 +514,8 @@ static int oom_kill_process(struct task_ + } + + task_lock(p); +- pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", +- message, task_pid_nr(p), p->comm, points); ++ pr_err("%s: Kill process %d:#%u (%s) score %d or sacrifice child\n", ++ message, task_pid_nr(p), p->xid, p->comm, points); + task_unlock(p); + + /* +@@ -607,6 +616,8 @@ int unregister_oom_notifier(struct notif + } + EXPORT_SYMBOL_GPL(unregister_oom_notifier); + ++long vs_oom_action(unsigned int); ++ + /* + * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero + * if a parallel OOM killing is already taking place that includes a zone in +@@ -765,7 +776,12 @@ retry: + if (!p) { + dump_header(NULL, gfp_mask, order, NULL, mpol_mask); + read_unlock(&tasklist_lock); +- panic("Out of memory and no killable processes...\n"); ++ ++ /* avoid panic for guest OOM */ ++ if (current->xid) ++ vs_oom_action(LINUX_REBOOT_CMD_OOM); ++ else ++ panic("Out of memory and no killable processes...\n"); + } + + if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, +diff -NurpP --minimal linux-3.3.8/mm/page_alloc.c linux-3.3.8-vs2.3.3.4/mm/page_alloc.c +--- linux-3.3.8/mm/page_alloc.c 2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/mm/page_alloc.c 2012-03-19 20:52:10.000000000 +0100 +@@ -58,6 +58,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -2602,6 +2604,9 @@ void si_meminfo(struct sysinfo *val) + val->totalhigh = totalhigh_pages; + val->freehigh = nr_free_highpages(); + val->mem_unit = PAGE_SIZE; ++ ++ if (vx_flags(VXF_VIRT_MEM, 0)) ++ vx_vsi_meminfo(val); + } + + EXPORT_SYMBOL(si_meminfo); +@@ -2622,6 +2627,9 @@ void si_meminfo_node(struct sysinfo *val + val->freehigh = 0; + #endif + val->mem_unit = PAGE_SIZE; ++ ++ if (vx_flags(VXF_VIRT_MEM, 0)) ++ vx_vsi_meminfo(val); + } + #endif + +diff -NurpP --minimal linux-3.3.8/mm/pgtable-generic.c linux-3.3.8-vs2.3.3.4/mm/pgtable-generic.c +--- linux-3.3.8/mm/pgtable-generic.c 2011-03-15 18:07:42.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/mm/pgtable-generic.c 2012-02-24 03:55:07.000000000 +0100 +@@ -6,6 +6,8 @@ + * Copyright (C) 2010 Linus Torvalds + */ + ++#include ++ + #include + #include + #include +diff -NurpP --minimal linux-3.3.8/mm/shmem.c linux-3.3.8-vs2.3.3.4/mm/shmem.c +--- linux-3.3.8/mm/shmem.c 
2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/mm/shmem.c 2012-02-24 03:55:07.000000000 +0100 +@@ -1460,7 +1460,7 @@ static int shmem_statfs(struct dentry *d + { + struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); + +- buf->f_type = TMPFS_MAGIC; ++ buf->f_type = TMPFS_SUPER_MAGIC; + buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_namelen = NAME_MAX; + if (sbinfo->max_blocks) { +@@ -2217,7 +2217,7 @@ int shmem_fill_super(struct super_block + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; +- sb->s_magic = TMPFS_MAGIC; ++ sb->s_magic = TMPFS_SUPER_MAGIC; + sb->s_op = &shmem_ops; + sb->s_time_gran = 1; + #ifdef CONFIG_TMPFS_XATTR +diff -NurpP --minimal linux-3.3.8/mm/slab.c linux-3.3.8-vs2.3.3.4/mm/slab.c +--- linux-3.3.8/mm/slab.c 2012-03-19 19:47:30.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/mm/slab.c 2012-02-24 03:55:07.000000000 +0100 +@@ -413,6 +413,8 @@ static void kmem_list3_init(struct kmem_ + #define STATS_INC_FREEMISS(x) do { } while (0) + #endif + ++#include "slab_vs.h" ++ + #if DEBUG + + /* +@@ -3414,6 +3416,7 @@ retry: + + obj = slab_get_obj(cachep, slabp, nodeid); + check_slabp(cachep, slabp); ++ vx_slab_alloc(cachep, flags); + l3->free_objects--; + /* move slabp to correct slabp list: */ + list_del(&slabp->list); +@@ -3491,6 +3494,7 @@ __cache_alloc_node(struct kmem_cache *ca + /* ___cache_alloc_node can fall back to other nodes */ + ptr = ____cache_alloc_node(cachep, flags, nodeid); + out: ++ vx_slab_alloc(cachep, flags); + local_irq_restore(save_flags); + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); + kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, +@@ -3678,6 +3682,7 @@ static inline void __cache_free(struct k + check_irq_off(); + kmemleak_free_recursive(objp, cachep->flags); + objp = cache_free_debugcheck(cachep, objp, caller); ++ vx_slab_free(cachep); + + kmemcheck_slab_free(cachep, objp, obj_size(cachep)); + +diff -NurpP --minimal linux-3.3.8/mm/slab_vs.h linux-3.3.8-vs2.3.3.4/mm/slab_vs.h +--- linux-3.3.8/mm/slab_vs.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/mm/slab_vs.h 2012-02-24 03:55:07.000000000 +0100 +@@ -0,0 +1,29 @@ ++ ++#include ++ ++#include ++ ++static inline ++void vx_slab_alloc(struct kmem_cache *cachep, gfp_t flags) ++{ ++ int what = gfp_zone(cachep->gfpflags); ++ struct vx_info *vxi = current_vx_info(); ++ ++ if (!vxi) ++ return; ++ ++ atomic_add(cachep->buffer_size, &vxi->cacct.slab[what]); ++} ++ ++static inline ++void vx_slab_free(struct kmem_cache *cachep) ++{ ++ int what = gfp_zone(cachep->gfpflags); ++ struct vx_info *vxi = current_vx_info(); ++ ++ if (!vxi) ++ return; ++ ++ atomic_sub(cachep->buffer_size, &vxi->cacct.slab[what]); ++} ++ +diff -NurpP --minimal linux-3.3.8/mm/swapfile.c linux-3.3.8-vs2.3.3.4/mm/swapfile.c +--- linux-3.3.8/mm/swapfile.c 2012-06-08 15:23:47.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/mm/swapfile.c 2012-06-08 15:27:44.000000000 +0200 +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); +@@ -1752,6 +1753,16 @@ static int swap_show(struct seq_file *sw + + if (si == SEQ_START_TOKEN) { + seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); ++ if (vx_flags(VXF_VIRT_MEM, 0)) { ++ struct sysinfo si; ++ ++ vx_vsi_swapinfo(&si); ++ if (si.totalswap < (1 << 10)) ++ return 0; ++ seq_printf(swap, "%s\t\t\t\t\t%s\t%lu\t%lu\t%d\n", ++ "hdv0", "partition", si.totalswap >> 10, ++ 
(si.totalswap - si.freeswap) >> 10, -1); ++ } + return 0; + } + +@@ -2175,6 +2186,8 @@ void si_swapinfo(struct sysinfo *val) + val->freeswap = nr_swap_pages + nr_to_be_unused; + val->totalswap = total_swap_pages + nr_to_be_unused; + spin_unlock(&swap_lock); ++ if (vx_flags(VXF_VIRT_MEM, 0)) ++ vx_vsi_swapinfo(val); + } + + /* +diff -NurpP --minimal linux-3.3.8/net/bridge/br_multicast.c linux-3.3.8-vs2.3.3.4/net/bridge/br_multicast.c +--- linux-3.3.8/net/bridge/br_multicast.c 2012-06-08 15:23:47.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/net/bridge/br_multicast.c 2012-04-30 19:34:38.000000000 +0200 +@@ -445,7 +445,7 @@ static struct sk_buff *br_ip6_multicast_ + ip6h->hop_limit = 1; + ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); + if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, +- &ip6h->saddr)) { ++ &ip6h->saddr, NULL)) { + kfree_skb(skb); + return NULL; + } +diff -NurpP --minimal linux-3.3.8/net/core/dev.c linux-3.3.8-vs2.3.3.4/net/core/dev.c +--- linux-3.3.8/net/core/dev.c 2012-06-08 15:23:47.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/net/core/dev.c 2012-05-15 07:09:25.000000000 +0200 +@@ -127,6 +127,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -622,7 +623,8 @@ struct net_device *__dev_get_by_name(str + struct hlist_head *head = dev_name_hash(net, name); + + hlist_for_each_entry(dev, p, head, name_hlist) +- if (!strncmp(dev->name, name, IFNAMSIZ)) ++ if (!strncmp(dev->name, name, IFNAMSIZ) && ++ nx_dev_visible(current_nx_info(), dev)) + return dev; + + return NULL; +@@ -648,7 +650,8 @@ struct net_device *dev_get_by_name_rcu(s + struct hlist_head *head = dev_name_hash(net, name); + + hlist_for_each_entry_rcu(dev, p, head, name_hlist) +- if (!strncmp(dev->name, name, IFNAMSIZ)) ++ if (!strncmp(dev->name, name, IFNAMSIZ) && ++ nx_dev_visible(current_nx_info(), dev)) + return dev; + + return NULL; +@@ -699,7 +702,8 @@ struct net_device *__dev_get_by_index(st + struct hlist_head *head = dev_index_hash(net, ifindex); + + hlist_for_each_entry(dev, p, head, index_hlist) +- if (dev->ifindex == ifindex) ++ if ((dev->ifindex == ifindex) && ++ nx_dev_visible(current_nx_info(), dev)) + return dev; + + return NULL; +@@ -717,7 +721,7 @@ EXPORT_SYMBOL(__dev_get_by_index); + * about locking. The caller must hold RCU lock. 
+ */ + +-struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) ++struct net_device *dev_get_by_index_real_rcu(struct net *net, int ifindex) + { + struct hlist_node *p; + struct net_device *dev; +@@ -729,6 +733,16 @@ struct net_device *dev_get_by_index_rcu( + + return NULL; + } ++EXPORT_SYMBOL(dev_get_by_index_real_rcu); ++ ++struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) ++{ ++ struct net_device *dev = dev_get_by_index_real_rcu(net, ifindex); ++ ++ if (nx_dev_visible(current_nx_info(), dev)) ++ return dev; ++ return NULL; ++} + EXPORT_SYMBOL(dev_get_by_index_rcu); + + +@@ -777,7 +791,8 @@ struct net_device *dev_getbyhwaddr_rcu(s + + for_each_netdev_rcu(net, dev) + if (dev->type == type && +- !memcmp(dev->dev_addr, ha, dev->addr_len)) ++ !memcmp(dev->dev_addr, ha, dev->addr_len) && ++ nx_dev_visible(current_nx_info(), dev)) + return dev; + + return NULL; +@@ -789,9 +804,11 @@ struct net_device *__dev_getfirstbyhwtyp + struct net_device *dev; + + ASSERT_RTNL(); +- for_each_netdev(net, dev) +- if (dev->type == type) ++ for_each_netdev(net, dev) { ++ if ((dev->type == type) && ++ nx_dev_visible(current_nx_info(), dev)) + return dev; ++ } + + return NULL; + } +@@ -909,6 +926,8 @@ static int __dev_alloc_name(struct net * + continue; + if (i < 0 || i >= max_netdevices) + continue; ++ if (!nx_dev_visible(current_nx_info(), d)) ++ continue; + + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, IFNAMSIZ, name, i); +@@ -4029,6 +4048,8 @@ static int dev_ifconf(struct net *net, c + + total = 0; + for_each_netdev(net, dev) { ++ if (!nx_dev_visible(current_nx_info(), dev)) ++ continue; + for (i = 0; i < NPROTO; i++) { + if (gifconf_list[i]) { + int done; +@@ -4131,6 +4152,10 @@ static void dev_seq_printf_stats(struct + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + ++ /* device visible inside network context? 
*/ ++ if (!nx_dev_visible(current_nx_info(), dev)) ++ return; ++ + seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " + "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", + dev->name, stats->rx_bytes, stats->rx_packets, +diff -NurpP --minimal linux-3.3.8/net/core/rtnetlink.c linux-3.3.8-vs2.3.3.4/net/core/rtnetlink.c +--- linux-3.3.8/net/core/rtnetlink.c 2012-06-08 15:23:47.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/net/core/rtnetlink.c 2012-04-03 03:02:13.000000000 +0200 +@@ -1073,6 +1073,8 @@ static int rtnl_dump_ifinfo(struct sk_bu + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; ++ if (!nx_dev_visible(skb->sk->sk_nx_info, dev)) ++ continue; + if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, 0, +@@ -1955,6 +1957,9 @@ void rtmsg_ifinfo(int type, struct net_d + int err = -ENOBUFS; + size_t if_info_size; + ++ if (!nx_dev_visible(current_nx_info(), dev)) ++ return; ++ + skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL); + if (skb == NULL) + goto errout; +diff -NurpP --minimal linux-3.3.8/net/core/sock.c linux-3.3.8-vs2.3.3.4/net/core/sock.c +--- linux-3.3.8/net/core/sock.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/core/sock.c 2012-02-24 03:55:07.000000000 +0100 +@@ -130,6 +130,10 @@ + #include + + #include ++#include ++#include ++#include ++#include + + #include + +@@ -1127,6 +1131,8 @@ static struct sock *sk_prot_alloc(struct + goto out_free_sec; + sk_tx_queue_clear(sk); + } ++ sock_vx_init(sk); ++ sock_nx_init(sk); + + return sk; + +@@ -1235,6 +1241,11 @@ static void __sk_free(struct sock *sk) + put_cred(sk->sk_peer_cred); + put_pid(sk->sk_peer_pid); + put_net(sock_net(sk)); ++ vx_sock_dec(sk); ++ clr_vx_info(&sk->sk_vx_info); ++ sk->sk_xid = -1; ++ clr_nx_info(&sk->sk_nx_info); ++ sk->sk_nid = -1; + sk_prot_free(sk->sk_prot_creator, sk); + } + +@@ -1295,6 +1306,8 @@ struct sock *sk_clone_lock(const struct + + /* SANITY */ + get_net(sock_net(newsk)); ++ sock_vx_init(newsk); ++ sock_nx_init(newsk); + sk_node_init(&newsk->sk_node); + sock_lock_init(newsk); + bh_lock_sock(newsk); +@@ -1351,6 +1364,12 @@ struct sock *sk_clone_lock(const struct + smp_wmb(); + atomic_set(&newsk->sk_refcnt, 2); + ++ set_vx_info(&newsk->sk_vx_info, sk->sk_vx_info); ++ newsk->sk_xid = sk->sk_xid; ++ vx_sock_inc(newsk); ++ set_nx_info(&newsk->sk_nx_info, sk->sk_nx_info); ++ newsk->sk_nid = sk->sk_nid; ++ + /* + * Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that +@@ -2102,6 +2121,12 @@ void sock_init_data(struct socket *sock, + + sk->sk_stamp = ktime_set(-1L, 0); + ++ set_vx_info(&sk->sk_vx_info, current_vx_info()); ++ sk->sk_xid = vx_current_xid(); ++ vx_sock_inc(sk); ++ set_nx_info(&sk->sk_nx_info, current_nx_info()); ++ sk->sk_nid = nx_current_nid(); ++ + /* + * Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.txt for details) +diff -NurpP --minimal linux-3.3.8/net/ipv4/af_inet.c linux-3.3.8-vs2.3.3.4/net/ipv4/af_inet.c +--- linux-3.3.8/net/ipv4/af_inet.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/af_inet.c 2012-02-24 03:55:07.000000000 +0100 +@@ -117,6 +117,7 @@ + #ifdef CONFIG_IP_MROUTE + #include + #endif ++#include + + + /* The inetsw table contains everything that inet_create needs to +@@ -326,9 +327,13 @@ lookup_protocol: + } + + err = -EPERM; ++ if ((protocol == IPPROTO_ICMP) && ++ 
nx_capable(CAP_NET_RAW, NXC_RAW_ICMP)) ++ goto override; ++ + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + goto out_rcu_unlock; +- ++override: + err = -EAFNOSUPPORT; + if (!inet_netns_ok(net, protocol)) + goto out_rcu_unlock; +@@ -452,6 +457,7 @@ int inet_bind(struct socket *sock, struc + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); ++ struct nx_v4_sock_addr nsa; + unsigned short snum; + int chk_addr_ret; + int err; +@@ -475,7 +481,11 @@ int inet_bind(struct socket *sock, struc + goto out; + } + +- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); ++ err = v4_map_sock_addr(inet, addr, &nsa); ++ if (err) ++ goto out; ++ ++ chk_addr_ret = inet_addr_type(sock_net(sk), nsa.saddr); + + /* Not specified by any standard per-se, however it breaks too + * many applications when removed. It is unfortunate since +@@ -487,7 +497,7 @@ int inet_bind(struct socket *sock, struc + err = -EADDRNOTAVAIL; + if (!sysctl_ip_nonlocal_bind && + !(inet->freebind || inet->transparent) && +- addr->sin_addr.s_addr != htonl(INADDR_ANY) && ++ nsa.saddr != htonl(INADDR_ANY) && + chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && + chk_addr_ret != RTN_BROADCAST) +@@ -512,7 +522,7 @@ int inet_bind(struct socket *sock, struc + if (sk->sk_state != TCP_CLOSE || inet->inet_num) + goto out_release_sock; + +- inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; ++ v4_set_sock_addr(inet, &nsa); + if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) + inet->inet_saddr = 0; /* Use device */ + +@@ -715,11 +725,13 @@ int inet_getname(struct socket *sock, st + peer == 1)) + return -ENOTCONN; + sin->sin_port = inet->inet_dport; +- sin->sin_addr.s_addr = inet->inet_daddr; ++ sin->sin_addr.s_addr = ++ nx_map_sock_lback(sk->sk_nx_info, inet->inet_daddr); + } else { + __be32 addr = inet->inet_rcv_saddr; + if (!addr) + addr = inet->inet_saddr; ++ addr = nx_map_sock_lback(sk->sk_nx_info, addr); + sin->sin_port = inet->inet_sport; + sin->sin_addr.s_addr = addr; + } +diff -NurpP --minimal linux-3.3.8/net/ipv4/arp.c linux-3.3.8-vs2.3.3.4/net/ipv4/arp.c +--- linux-3.3.8/net/ipv4/arp.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/arp.c 2012-02-24 03:55:07.000000000 +0100 +@@ -1329,6 +1329,7 @@ static void arp_format_neigh_entry(struc + struct net_device *dev = n->dev; + int hatype = dev->type; + ++ /* FIXME: check for network context */ + read_lock(&n->lock); + /* Convert hardware address to XX:XX:XX:XX ... form. */ + #if IS_ENABLED(CONFIG_AX25) +@@ -1360,6 +1361,7 @@ static void arp_format_pneigh_entry(stru + int hatype = dev ? dev->type : 0; + char tbuf[16]; + ++ /* FIXME: check for network context */ + sprintf(tbuf, "%pI4", n->key); + seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", + tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", +diff -NurpP --minimal linux-3.3.8/net/ipv4/devinet.c linux-3.3.8-vs2.3.3.4/net/ipv4/devinet.c +--- linux-3.3.8/net/ipv4/devinet.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/devinet.c 2012-02-24 03:55:07.000000000 +0100 +@@ -518,6 +518,7 @@ struct in_device *inetdev_by_index(struc + } + EXPORT_SYMBOL(inetdev_by_index); + ++ + /* Called only from RTNL semaphored context. No locks. 
*/ + + struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, +@@ -759,6 +760,8 @@ int devinet_ioctl(struct net *net, unsig + + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) { ++ struct nx_info *nxi = current_nx_info(); ++ + if (tryaddrmatch) { + /* Matthias Andree */ + /* compare label and address (4.4BSD style) */ +@@ -767,6 +770,8 @@ int devinet_ioctl(struct net *net, unsig + This is checked above. */ + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + ifap = &ifa->ifa_next) { ++ if (!nx_v4_ifa_visible(nxi, ifa)) ++ continue; + if (!strcmp(ifr.ifr_name, ifa->ifa_label) && + sin_orig.sin_addr.s_addr == + ifa->ifa_local) { +@@ -779,9 +784,12 @@ int devinet_ioctl(struct net *net, unsig + comparing just the label */ + if (!ifa) { + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; +- ifap = &ifa->ifa_next) ++ ifap = &ifa->ifa_next) { ++ if (!nx_v4_ifa_visible(nxi, ifa)) ++ continue; + if (!strcmp(ifr.ifr_name, ifa->ifa_label)) + break; ++ } + } + } + +@@ -934,6 +942,8 @@ static int inet_gifconf(struct net_devic + goto out; + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { ++ if (!nx_v4_ifa_visible(current_nx_info(), ifa)) ++ continue; + if (!buf) { + done += sizeof(ifr); + continue; +@@ -1294,6 +1304,7 @@ static int inet_dump_ifaddr(struct sk_bu + struct net_device *dev; + struct in_device *in_dev; + struct in_ifaddr *ifa; ++ struct sock *sk = skb->sk; + struct hlist_head *head; + struct hlist_node *node; + +@@ -1316,6 +1327,8 @@ static int inet_dump_ifaddr(struct sk_bu + + for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; + ifa = ifa->ifa_next, ip_idx++) { ++ if (sk && !nx_v4_ifa_visible(sk->sk_nx_info, ifa)) ++ continue; + if (ip_idx < s_ip_idx) + continue; + if (inet_fill_ifaddr(skb, ifa, +diff -NurpP --minimal linux-3.3.8/net/ipv4/fib_trie.c linux-3.3.8-vs2.3.3.4/net/ipv4/fib_trie.c +--- linux-3.3.8/net/ipv4/fib_trie.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/fib_trie.c 2012-02-24 03:55:07.000000000 +0100 +@@ -2556,6 +2556,7 @@ static int fib_route_seq_show(struct seq + || fa->fa_type == RTN_MULTICAST) + continue; + ++ /* FIXME: check for network context? 
*/ + if (fi) + seq_printf(seq, + "%s\t%08X\t%08X\t%04X\t%d\t%u\t" +diff -NurpP --minimal linux-3.3.8/net/ipv4/inet_connection_sock.c linux-3.3.8-vs2.3.3.4/net/ipv4/inet_connection_sock.c +--- linux-3.3.8/net/ipv4/inet_connection_sock.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/inet_connection_sock.c 2012-02-24 03:55:07.000000000 +0100 +@@ -52,6 +52,37 @@ void inet_get_local_port_range(int *low, + } + EXPORT_SYMBOL(inet_get_local_port_range); + ++int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) ++{ ++ __be32 sk1_rcv_saddr = sk_rcv_saddr(sk1), ++ sk2_rcv_saddr = sk_rcv_saddr(sk2); ++ ++ if (inet_v6_ipv6only(sk2)) ++ return 0; ++ ++ if (sk1_rcv_saddr && ++ sk2_rcv_saddr && ++ sk1_rcv_saddr == sk2_rcv_saddr) ++ return 1; ++ ++ if (sk1_rcv_saddr && ++ !sk2_rcv_saddr && ++ v4_addr_in_nx_info(sk2->sk_nx_info, sk1_rcv_saddr, NXA_MASK_BIND)) ++ return 1; ++ ++ if (sk2_rcv_saddr && ++ !sk1_rcv_saddr && ++ v4_addr_in_nx_info(sk1->sk_nx_info, sk2_rcv_saddr, NXA_MASK_BIND)) ++ return 1; ++ ++ if (!sk1_rcv_saddr && ++ !sk2_rcv_saddr && ++ nx_v4_addr_conflict(sk1->sk_nx_info, sk2->sk_nx_info)) ++ return 1; ++ ++ return 0; ++} ++ + int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) + { +@@ -74,9 +105,7 @@ int inet_csk_bind_conflict(const struct + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + if (!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) { +- const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); +- if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || +- sk2_rcv_saddr == sk_rcv_saddr(sk)) ++ if (ipv4_rcv_saddr_equal(sk, sk2)) + break; + } + } +diff -NurpP --minimal linux-3.3.8/net/ipv4/inet_diag.c linux-3.3.8-vs2.3.3.4/net/ipv4/inet_diag.c +--- linux-3.3.8/net/ipv4/inet_diag.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/inet_diag.c 2012-02-24 04:26:38.000000000 +0100 +@@ -31,6 +31,8 @@ + + #include + #include ++#include ++#include + + #include + #include +@@ -106,8 +108,10 @@ int inet_sk_diag_fill(struct sock *sk, s + + r->id.idiag_sport = inet->inet_sport; + r->id.idiag_dport = inet->inet_dport; +- r->id.idiag_src[0] = inet->inet_rcv_saddr; +- r->id.idiag_dst[0] = inet->inet_daddr; ++ r->id.idiag_src[0] = nx_map_sock_lback(sk->sk_nx_info, ++ inet->inet_rcv_saddr); ++ r->id.idiag_dst[0] = nx_map_sock_lback(sk->sk_nx_info, ++ inet->inet_daddr); + + /* IPv6 dual-stack sockets use inet->tos for IPv4 connections, + * hence this needs to be included regardless of socket family. 
+@@ -227,8 +231,8 @@ static int inet_twsk_diag_fill(struct in + sock_diag_save_cookie(tw, r->id.idiag_cookie); + r->id.idiag_sport = tw->tw_sport; + r->id.idiag_dport = tw->tw_dport; +- r->id.idiag_src[0] = tw->tw_rcv_saddr; +- r->id.idiag_dst[0] = tw->tw_daddr; ++ r->id.idiag_src[0] = nx_map_sock_lback(tw->tw_nx_info, tw->tw_rcv_saddr); ++ r->id.idiag_dst[0] = nx_map_sock_lback(tw->tw_nx_info, tw->tw_daddr); + r->idiag_state = tw->tw_substate; + r->idiag_timer = 3; + r->idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ); +@@ -272,12 +276,14 @@ int inet_diag_dump_one_icsk(struct inet_ + + err = -EINVAL; + if (req->sdiag_family == AF_INET) { ++ /* TODO: lback */ + sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0], + req->id.idiag_dport, req->id.idiag_src[0], + req->id.idiag_sport, req->id.idiag_if); + } + #if IS_ENABLED(CONFIG_IPV6) + else if (req->sdiag_family == AF_INET6) { ++ /* TODO: lback */ + sk = inet6_lookup(&init_net, hashinfo, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, +@@ -472,6 +478,7 @@ int inet_diag_bc_sk(const struct nlattr + } else + #endif + { ++ /* TODO: lback */ + entry.saddr = &inet->inet_rcv_saddr; + entry.daddr = &inet->inet_daddr; + } +@@ -570,6 +577,7 @@ static int inet_twsk_diag_dump(struct in + } else + #endif + { ++ /* TODO: lback */ + entry.saddr = &tw->tw_rcv_saddr; + entry.daddr = &tw->tw_daddr; + } +@@ -615,8 +623,8 @@ static int inet_diag_fill_req(struct sk_ + + r->id.idiag_sport = inet->inet_sport; + r->id.idiag_dport = ireq->rmt_port; +- r->id.idiag_src[0] = ireq->loc_addr; +- r->id.idiag_dst[0] = ireq->rmt_addr; ++ r->id.idiag_src[0] = nx_map_sock_lback(sk->sk_nx_info, ireq->loc_addr); ++ r->id.idiag_dst[0] = nx_map_sock_lback(sk->sk_nx_info, ireq->rmt_addr); + r->idiag_expires = jiffies_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; +@@ -683,6 +691,7 @@ static int inet_diag_dump_reqs(struct sk + continue; + + if (bc) { ++ /* TODO: lback */ + entry.saddr = + #if IS_ENABLED(CONFIG_IPV6) + (entry.family == AF_INET6) ? 
+@@ -744,6 +753,8 @@ void inet_diag_dump_icsk(struct inet_has + sk_nulls_for_each(sk, node, &ilb->head) { + struct inet_sock *inet = inet_sk(sk); + ++ if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (num < s_num) { + num++; + continue; +@@ -814,6 +825,8 @@ skip_listen_ht: + sk_nulls_for_each(sk, node, &head->chain) { + struct inet_sock *inet = inet_sk(sk); + ++ if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (num < s_num) + goto next_normal; + if (!(r->idiag_states & (1 << sk->sk_state))) +@@ -841,6 +854,8 @@ next_normal: + inet_twsk_for_each(tw, node, + &head->twchain) { + ++ if (!nx_check(tw->tw_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (num < s_num) + goto next_dying; + if (r->sdiag_family != AF_UNSPEC && +diff -NurpP --minimal linux-3.3.8/net/ipv4/inet_hashtables.c linux-3.3.8-vs2.3.3.4/net/ipv4/inet_hashtables.c +--- linux-3.3.8/net/ipv4/inet_hashtables.c 2011-10-24 18:45:34.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/inet_hashtables.c 2012-02-24 03:55:07.000000000 +0100 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -156,6 +157,11 @@ static inline int compute_score(struct s + if (rcv_saddr != daddr) + return -1; + score += 2; ++ } else { ++ /* block non nx_info ips */ ++ if (!v4_addr_in_nx_info(sk->sk_nx_info, ++ daddr, NXA_MASK_BIND)) ++ return -1; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) +@@ -173,7 +179,6 @@ static inline int compute_score(struct s + * wildcarded during the search since they can never be otherwise. + */ + +- + struct sock *__inet_lookup_listener(struct net *net, + struct inet_hashinfo *hashinfo, + const __be32 daddr, const unsigned short hnum, +@@ -196,6 +201,7 @@ begin: + hiscore = score; + } + } ++ + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. 
+diff -NurpP --minimal linux-3.3.8/net/ipv4/netfilter/nf_nat_helper.c linux-3.3.8-vs2.3.3.4/net/ipv4/netfilter/nf_nat_helper.c +--- linux-3.3.8/net/ipv4/netfilter/nf_nat_helper.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/netfilter/nf_nat_helper.c 2012-02-24 03:55:07.000000000 +0100 +@@ -20,6 +20,7 @@ + #include + + #include ++#include + #include + #include + #include +diff -NurpP --minimal linux-3.3.8/net/ipv4/netfilter.c linux-3.3.8-vs2.3.3.4/net/ipv4/netfilter.c +--- linux-3.3.8/net/ipv4/netfilter.c 2012-01-09 16:15:03.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/netfilter.c 2012-02-24 03:55:07.000000000 +0100 +@@ -6,7 +6,7 @@ + #include + #include + #include +-#include ++// #include + #include + #include + #include +diff -NurpP --minimal linux-3.3.8/net/ipv4/raw.c linux-3.3.8-vs2.3.3.4/net/ipv4/raw.c +--- linux-3.3.8/net/ipv4/raw.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/raw.c 2012-02-24 03:55:07.000000000 +0100 +@@ -118,7 +118,7 @@ static struct sock *__raw_v4_lookup(stru + + if (net_eq(sock_net(sk), net) && inet->inet_num == num && + !(inet->inet_daddr && inet->inet_daddr != raddr) && +- !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && ++ v4_sock_addr_match(sk->sk_nx_info, inet, laddr) && + !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + goto found; /* gotcha */ + } +@@ -390,6 +390,12 @@ static int raw_send_hdrinc(struct sock * + icmp_out_count(net, ((struct icmphdr *) + skb_transport_header(skb))->type); + ++ err = -EPERM; ++ if (!nx_check(0, VS_ADMIN) && !capable(CAP_NET_RAW) && ++ sk->sk_nx_info && ++ !v4_addr_in_nx_info(sk->sk_nx_info, iph->saddr, NXA_MASK_BIND)) ++ goto error_free; ++ + err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, + rt->dst.dev, dst_output); + if (err > 0) +@@ -577,6 +583,16 @@ static int raw_sendmsg(struct kiocb *ioc + goto done; + } + ++ if (sk->sk_nx_info) { ++ rt = ip_v4_find_src(sock_net(sk), sk->sk_nx_info, &fl4); ++ if (IS_ERR(rt)) { ++ err = PTR_ERR(rt); ++ rt = NULL; ++ goto done; ++ } ++ ip_rt_put(rt); ++ } ++ + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) { +@@ -653,17 +669,19 @@ static int raw_bind(struct sock *sk, str + { + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; ++ struct nx_v4_sock_addr nsa = { 0 }; + int ret = -EINVAL; + int chk_addr_ret; + + if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) + goto out; +- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); ++ v4_map_sock_addr(inet, addr, &nsa); ++ chk_addr_ret = inet_addr_type(sock_net(sk), nsa.saddr); + ret = -EADDRNOTAVAIL; +- if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && ++ if (nsa.saddr && chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) + goto out; +- inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; ++ v4_set_sock_addr(inet, &nsa); + if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) + inet->inet_saddr = 0; /* Use device */ + sk_dst_reset(sk); +@@ -715,7 +733,8 @@ static int raw_recvmsg(struct kiocb *ioc + /* Copy the address. 
*/ + if (sin) { + sin->sin_family = AF_INET; +- sin->sin_addr.s_addr = ip_hdr(skb)->saddr; ++ sin->sin_addr.s_addr = ++ nx_map_sock_lback(sk->sk_nx_info, ip_hdr(skb)->saddr); + sin->sin_port = 0; + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + } +@@ -911,7 +930,8 @@ static struct sock *raw_get_first(struct + struct hlist_node *node; + + sk_for_each(sk, node, &state->h->ht[state->bucket]) +- if (sock_net(sk) == seq_file_net(seq)) ++ if ((sock_net(sk) == seq_file_net(seq)) && ++ nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) + goto found; + } + sk = NULL; +@@ -927,7 +947,8 @@ static struct sock *raw_get_next(struct + sk = sk_next(sk); + try_again: + ; +- } while (sk && sock_net(sk) != seq_file_net(seq)); ++ } while (sk && ((sock_net(sk) != seq_file_net(seq)) || ++ !nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT))); + + if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { + sk = sk_head(&state->h->ht[state->bucket]); +diff -NurpP --minimal linux-3.3.8/net/ipv4/route.c linux-3.3.8-vs2.3.3.4/net/ipv4/route.c +--- linux-3.3.8/net/ipv4/route.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/route.c 2012-03-19 20:52:10.000000000 +0100 +@@ -2697,7 +2697,7 @@ static struct rtable *ip_route_output_sl + + + if (fl4->flowi4_oif) { +- dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); ++ dev_out = dev_get_by_index_real_rcu(net, fl4->flowi4_oif); + rth = ERR_PTR(-ENODEV); + if (dev_out == NULL) + goto out; +diff -NurpP --minimal linux-3.3.8/net/ipv4/tcp.c linux-3.3.8-vs2.3.3.4/net/ipv4/tcp.c +--- linux-3.3.8/net/ipv4/tcp.c 2012-06-08 15:23:47.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/tcp.c 2012-05-22 09:49:13.000000000 +0200 +@@ -266,6 +266,7 @@ + #include + #include + #include ++#include + + #include + #include +diff -NurpP --minimal linux-3.3.8/net/ipv4/tcp_ipv4.c linux-3.3.8-vs2.3.3.4/net/ipv4/tcp_ipv4.c +--- linux-3.3.8/net/ipv4/tcp_ipv4.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/tcp_ipv4.c 2012-03-19 20:52:10.000000000 +0100 +@@ -2036,6 +2036,12 @@ static void *listening_get_next(struct s + req = req->dl_next; + while (1) { + while (req) { ++ vxdprintk(VXD_CBIT(net, 6), ++ "sk,req: %p [#%d] (from %d)", req->sk, ++ (req->sk)?req->sk->sk_nid:0, nx_current_nid()); ++ if (req->sk && ++ !nx_check(req->sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (req->rsk_ops->family == st->family) { + cur = req; + goto out; +@@ -2060,6 +2066,10 @@ get_req: + } + get_sk: + sk_nulls_for_each_from(sk, node) { ++ vxdprintk(VXD_CBIT(net, 6), "sk: %p [#%d] (from %d)", ++ sk, sk->sk_nid, nx_current_nid()); ++ if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (!net_eq(sock_net(sk), net)) + continue; + if (sk->sk_family == st->family) { +@@ -2136,6 +2146,11 @@ static void *established_get_first(struc + + spin_lock_bh(lock); + sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { ++ vxdprintk(VXD_CBIT(net, 6), ++ "sk,egf: %p [#%d] (from %d)", ++ sk, sk->sk_nid, nx_current_nid()); ++ if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (sk->sk_family != st->family || + !net_eq(sock_net(sk), net)) { + continue; +@@ -2146,6 +2161,11 @@ static void *established_get_first(struc + st->state = TCP_SEQ_STATE_TIME_WAIT; + inet_twsk_for_each(tw, node, + &tcp_hashinfo.ehash[st->bucket].twchain) { ++ vxdprintk(VXD_CBIT(net, 6), ++ "tw: %p [#%d] (from %d)", ++ tw, tw->tw_nid, nx_current_nid()); ++ if (!nx_check(tw->tw_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (tw->tw_family != st->family || + !net_eq(twsk_net(tw), net)) 
{ + continue; +@@ -2175,7 +2195,9 @@ static void *established_get_next(struct + tw = cur; + tw = tw_next(tw); + get_tw: +- while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) { ++ while (tw && (tw->tw_family != st->family || ++ !net_eq(twsk_net(tw), net) || ++ !nx_check(tw->tw_nid, VS_WATCH_P | VS_IDENT))) { + tw = tw_next(tw); + } + if (tw) { +@@ -2199,6 +2221,11 @@ get_tw: + sk = sk_nulls_next(sk); + + sk_nulls_for_each_from(sk, node) { ++ vxdprintk(VXD_CBIT(net, 6), ++ "sk,egn: %p [#%d] (from %d)", ++ sk, sk->sk_nid, nx_current_nid()); ++ if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) + goto found; + } +@@ -2404,9 +2431,9 @@ static void get_openreq4(const struct so + seq_printf(f, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", + i, +- ireq->loc_addr, ++ nx_map_sock_lback(current_nx_info(), ireq->loc_addr), + ntohs(inet_sk(sk)->inet_sport), +- ireq->rmt_addr, ++ nx_map_sock_lback(current_nx_info(), ireq->rmt_addr), + ntohs(ireq->rmt_port), + TCP_SYN_RECV, + 0, 0, /* could print option size, but that is af dependent. */ +@@ -2428,8 +2455,8 @@ static void get_tcp4_sock(struct sock *s + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_sock *inet = inet_sk(sk); +- __be32 dest = inet->inet_daddr; +- __be32 src = inet->inet_rcv_saddr; ++ __be32 dest = nx_map_sock_lback(current_nx_info(), inet->inet_daddr); ++ __be32 src = nx_map_sock_lback(current_nx_info(), inet->inet_rcv_saddr); + __u16 destp = ntohs(inet->inet_dport); + __u16 srcp = ntohs(inet->inet_sport); + int rx_queue; +@@ -2486,8 +2513,8 @@ static void get_timewait4_sock(const str + if (ttd < 0) + ttd = 0; + +- dest = tw->tw_daddr; +- src = tw->tw_rcv_saddr; ++ dest = nx_map_sock_lback(current_nx_info(), tw->tw_daddr); ++ src = nx_map_sock_lback(current_nx_info(), tw->tw_rcv_saddr); + destp = ntohs(tw->tw_dport); + srcp = ntohs(tw->tw_sport); + +diff -NurpP --minimal linux-3.3.8/net/ipv4/tcp_minisocks.c linux-3.3.8-vs2.3.3.4/net/ipv4/tcp_minisocks.c +--- linux-3.3.8/net/ipv4/tcp_minisocks.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/tcp_minisocks.c 2012-02-24 04:25:11.000000000 +0100 +@@ -23,6 +23,9 @@ + #include + #include + #include ++#include ++#include ++#include + #include + #include + #include +@@ -336,6 +339,11 @@ void tcp_time_wait(struct sock *sk, int + tcptw->tw_ts_recent = tp->rx_opt.ts_recent; + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + ++ tw->tw_xid = sk->sk_xid; ++ tw->tw_vx_info = NULL; ++ tw->tw_nid = sk->sk_nid; ++ tw->tw_nx_info = NULL; ++ + #if IS_ENABLED(CONFIG_IPV6) + if (tw->tw_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); +diff -NurpP --minimal linux-3.3.8/net/ipv4/udp.c linux-3.3.8-vs2.3.3.4/net/ipv4/udp.c +--- linux-3.3.8/net/ipv4/udp.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv4/udp.c 2012-02-24 03:55:07.000000000 +0100 +@@ -297,14 +297,7 @@ fail: + } + EXPORT_SYMBOL(udp_lib_get_port); + +-static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) +-{ +- struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); +- +- return (!ipv6_only_sock(sk2) && +- (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr || +- inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); +-} ++extern int ipv4_rcv_saddr_equal(const struct sock *, const struct sock *); + + static unsigned int udp4_portaddr_hash(struct net *net, 
__be32 saddr, + unsigned int port) +@@ -339,6 +332,11 @@ static inline int compute_score(struct s + if (inet->inet_rcv_saddr != daddr) + return -1; + score += 2; ++ } else { ++ /* block non nx_info ips */ ++ if (!v4_addr_in_nx_info(sk->sk_nx_info, ++ daddr, NXA_MASK_BIND)) ++ return -1; + } + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) +@@ -442,6 +440,7 @@ exact_match: + return result; + } + ++ + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try + * harder than this. -DaveM + */ +@@ -487,6 +486,11 @@ begin: + sk_nulls_for_each_rcu(sk, node, &hslot->head) { + score = compute_score(sk, net, saddr, hnum, sport, + daddr, dport, dif); ++ /* FIXME: disabled? ++ if (score == 9) { ++ result = sk; ++ break; ++ } else */ + if (score > badness) { + result = sk; + badness = score; +@@ -500,6 +504,7 @@ begin: + if (get_nulls_value(node) != slot) + goto begin; + ++ + if (result) { + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; +@@ -509,6 +514,7 @@ begin: + goto begin; + } + } ++ + rcu_read_unlock(); + return result; + } +@@ -552,8 +558,7 @@ static inline struct sock *udp_v4_mcast_ + udp_sk(s)->udp_port_hash != hnum || + (inet->inet_daddr && inet->inet_daddr != rmt_addr) || + (inet->inet_dport != rmt_port && inet->inet_dport) || +- (inet->inet_rcv_saddr && +- inet->inet_rcv_saddr != loc_addr) || ++ !v4_sock_addr_match(sk->sk_nx_info, inet, loc_addr) || + ipv6_only_sock(s) || + (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) + continue; +@@ -931,6 +936,16 @@ int udp_sendmsg(struct kiocb *iocb, stru + inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP, + faddr, saddr, dport, inet->inet_sport); + ++ if (sk->sk_nx_info) { ++ rt = ip_v4_find_src(net, sk->sk_nx_info, fl4); ++ if (IS_ERR(rt)) { ++ err = PTR_ERR(rt); ++ rt = NULL; ++ goto out; ++ } ++ ip_rt_put(rt); ++ } ++ + security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(net, fl4, sk); + if (IS_ERR(rt)) { +@@ -1229,7 +1244,8 @@ try_again: + if (sin) { + sin->sin_family = AF_INET; + sin->sin_port = udp_hdr(skb)->source; +- sin->sin_addr.s_addr = ip_hdr(skb)->saddr; ++ sin->sin_addr.s_addr = nx_map_sock_lback( ++ skb->sk->sk_nx_info, ip_hdr(skb)->saddr); + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + if (inet->cmsg_flags) +@@ -1976,6 +1992,8 @@ static struct sock *udp_get_first(struct + sk_nulls_for_each(sk, node, &hslot->head) { + if (!net_eq(sock_net(sk), net)) + continue; ++ if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (sk->sk_family == state->family) + goto found; + } +@@ -1993,7 +2011,9 @@ static struct sock *udp_get_next(struct + + do { + sk = sk_nulls_next(sk); +- } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); ++ } while (sk && (!net_eq(sock_net(sk), net) || ++ sk->sk_family != state->family || ++ !nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT))); + + if (!sk) { + if (state->bucket <= state->udp_table->mask) +diff -NurpP --minimal linux-3.3.8/net/ipv6/Kconfig linux-3.3.8-vs2.3.3.4/net/ipv6/Kconfig +--- linux-3.3.8/net/ipv6/Kconfig 2010-08-02 16:52:59.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/net/ipv6/Kconfig 2012-02-24 03:55:07.000000000 +0100 +@@ -4,8 +4,8 @@ + + # IPv6 as module will cause a CRASH if you try to unload it + menuconfig IPV6 +- tristate "The IPv6 protocol" +- default m ++ bool "The IPv6 protocol" ++ default n + ---help--- + This is complemental support for the IP version 6. + You will still be able to do traditional IPv4 networking as well. 
+diff -NurpP --minimal linux-3.3.8/net/ipv6/addrconf.c linux-3.3.8-vs2.3.3.4/net/ipv6/addrconf.c +--- linux-3.3.8/net/ipv6/addrconf.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv6/addrconf.c 2012-03-19 20:52:10.000000000 +0100 +@@ -88,6 +88,8 @@ + #include + #include + #include ++#include ++#include + + /* Set to 3 to get tracing... */ + #define ACONF_DEBUG 2 +@@ -1105,7 +1107,7 @@ out: + + int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev, + const struct in6_addr *daddr, unsigned int prefs, +- struct in6_addr *saddr) ++ struct in6_addr *saddr, struct nx_info *nxi) + { + struct ipv6_saddr_score scores[2], + *score = &scores[0], *hiscore = &scores[1]; +@@ -1177,6 +1179,8 @@ int ipv6_dev_get_saddr(struct net *net, + dev->name); + continue; + } ++ if (!v6_addr_in_nx_info(nxi, &score->ifa->addr, -1)) ++ continue; + + score->rule = -1; + bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX); +@@ -3162,7 +3166,10 @@ static void if6_seq_stop(struct seq_file + static int if6_seq_show(struct seq_file *seq, void *v) + { + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v; +- seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n", ++ ++ if (nx_check(0, VS_ADMIN|VS_WATCH) || ++ v6_addr_in_nx_info(current_nx_info(), &ifp->addr, -1)) ++ seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n", + &ifp->addr, + ifp->idev->dev->ifindex, + ifp->prefix_len, +@@ -3668,6 +3675,11 @@ static int in6_dump_addrs(struct inet6_d + struct ifacaddr6 *ifaca; + int err = 1; + int ip_idx = *p_ip_idx; ++ struct nx_info *nxi = skb->sk ? skb->sk->sk_nx_info : NULL; ++ ++ /* disable ipv6 on non v6 guests */ ++ if (nxi && !nx_info_has_v6(nxi)) ++ return skb->len; + + read_lock_bh(&idev->lock); + switch (type) { +@@ -3678,6 +3690,8 @@ static int in6_dump_addrs(struct inet6_d + list_for_each_entry(ifa, &idev->addr_list, if_list) { + if (++ip_idx < s_ip_idx) + continue; ++ if (!v6_addr_in_nx_info(nxi, &ifa->addr, -1)) ++ continue; + err = inet6_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, +@@ -3694,6 +3708,8 @@ static int in6_dump_addrs(struct inet6_d + ifmca = ifmca->next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; ++ if (!v6_addr_in_nx_info(nxi, &ifmca->mca_addr, -1)) ++ continue; + err = inet6_fill_ifmcaddr(skb, ifmca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, +@@ -3709,6 +3725,8 @@ static int in6_dump_addrs(struct inet6_d + ifaca = ifaca->aca_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; ++ if (!v6_addr_in_nx_info(nxi, &ifaca->aca_addr, -1)) ++ continue; + err = inet6_fill_ifacaddr(skb, ifaca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, +@@ -4094,6 +4112,11 @@ static int inet6_dump_ifinfo(struct sk_b + struct inet6_dev *idev; + struct hlist_head *head; + struct hlist_node *node; ++ struct nx_info *nxi = skb->sk ? skb->sk->sk_nx_info : NULL; ++ ++ /* FIXME: maybe disable ipv6 on non v6 guests? 
++ if (skb->sk && skb->sk->sk_vx_info) ++ return skb->len; */ + + s_h = cb->args[0]; + s_idx = cb->args[1]; +@@ -4105,6 +4128,8 @@ static int inet6_dump_ifinfo(struct sk_b + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; ++ if (!v6_dev_in_nx_info(dev, nxi)) ++ goto cont; + idev = __in6_dev_get(dev); + if (!idev) + goto cont; +diff -NurpP --minimal linux-3.3.8/net/ipv6/af_inet6.c linux-3.3.8-vs2.3.3.4/net/ipv6/af_inet6.c +--- linux-3.3.8/net/ipv6/af_inet6.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv6/af_inet6.c 2012-02-24 04:23:27.000000000 +0100 +@@ -42,6 +42,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -160,9 +162,12 @@ lookup_protocol: + } + + err = -EPERM; ++ if ((protocol == IPPROTO_ICMPV6) && ++ nx_capable(CAP_NET_RAW, NXC_RAW_ICMP)) ++ goto override; + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + goto out_rcu_unlock; +- ++override: + sock->ops = answer->ops; + answer_prot = answer->prot; + answer_no_check = answer->no_check; +@@ -261,6 +266,7 @@ int inet6_bind(struct socket *sock, stru + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); ++ struct nx_v6_sock_addr nsa; + __be32 v4addr = 0; + unsigned short snum; + int addr_type = 0; +@@ -276,6 +282,10 @@ int inet6_bind(struct socket *sock, stru + if (addr->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + ++ err = v6_map_sock_addr(inet, addr, &nsa); ++ if (err) ++ return err; ++ + addr_type = ipv6_addr_type(&addr->sin6_addr); + if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) + return -EINVAL; +@@ -307,6 +317,7 @@ int inet6_bind(struct socket *sock, stru + /* Reproduce AF_INET checks to make the bindings consistent */ + v4addr = addr->sin6_addr.s6_addr32[3]; + chk_addr_ret = inet_addr_type(net, v4addr); ++ + if (!sysctl_ip_nonlocal_bind && + !(inet->freebind || inet->transparent) && + v4addr != htonl(INADDR_ANY) && +@@ -316,6 +327,10 @@ int inet6_bind(struct socket *sock, stru + err = -EADDRNOTAVAIL; + goto out; + } ++ if (!v4_addr_in_nx_info(sk->sk_nx_info, v4addr, NXA_MASK_BIND)) { ++ err = -EADDRNOTAVAIL; ++ goto out; ++ } + } else { + if (addr_type != IPV6_ADDR_ANY) { + struct net_device *dev = NULL; +@@ -342,6 +357,11 @@ int inet6_bind(struct socket *sock, stru + } + } + ++ if (!v6_addr_in_nx_info(sk->sk_nx_info, &addr->sin6_addr, -1)) { ++ err = -EADDRNOTAVAIL; ++ goto out; ++ } ++ + /* ipv4 addr of the socket is invalid. Only the + * unspecified and mapped address have a v4 equivalent. + */ +@@ -358,6 +378,9 @@ int inet6_bind(struct socket *sock, stru + } + } + ++ /* what's that for? */ ++ v6_set_sock_addr(inet, &nsa); ++ + inet->inet_rcv_saddr = v4addr; + inet->inet_saddr = v4addr; + +@@ -459,9 +482,11 @@ int inet6_getname(struct socket *sock, s + return -ENOTCONN; + sin->sin6_port = inet->inet_dport; + sin->sin6_addr = np->daddr; ++ /* FIXME: remap lback? */ + if (np->sndflow) + sin->sin6_flowinfo = np->flow_label; + } else { ++ /* FIXME: remap lback? 
*/ + if (ipv6_addr_any(&np->rcv_saddr)) + sin->sin6_addr = np->saddr; + else +diff -NurpP --minimal linux-3.3.8/net/ipv6/datagram.c linux-3.3.8-vs2.3.3.4/net/ipv6/datagram.c +--- linux-3.3.8/net/ipv6/datagram.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv6/datagram.c 2012-02-24 03:55:07.000000000 +0100 +@@ -642,7 +642,7 @@ int datagram_send_ctl(struct net *net, s + + rcu_read_lock(); + if (fl6->flowi6_oif) { +- dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); ++ dev = dev_get_by_index_real_rcu(net, fl6->flowi6_oif); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; +diff -NurpP --minimal linux-3.3.8/net/ipv6/fib6_rules.c linux-3.3.8-vs2.3.3.4/net/ipv6/fib6_rules.c +--- linux-3.3.8/net/ipv6/fib6_rules.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv6/fib6_rules.c 2012-02-24 03:55:07.000000000 +0100 +@@ -91,7 +91,7 @@ static int fib6_rule_action(struct fib_r + ip6_dst_idev(&rt->dst)->dev, + &flp6->daddr, + rt6_flags2srcprefs(flags), +- &saddr)) ++ &saddr, NULL)) + goto again; + if (!ipv6_prefix_equal(&saddr, &r->src.addr, + r->src.plen)) +diff -NurpP --minimal linux-3.3.8/net/ipv6/inet6_hashtables.c linux-3.3.8-vs2.3.3.4/net/ipv6/inet6_hashtables.c +--- linux-3.3.8/net/ipv6/inet6_hashtables.c 2011-10-24 18:45:34.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/net/ipv6/inet6_hashtables.c 2012-02-24 03:55:07.000000000 +0100 +@@ -16,6 +16,7 @@ + + #include + #include ++#include + + #include + #include +@@ -83,7 +84,6 @@ struct sock *__inet6_lookup_established( + unsigned int slot = hash & hashinfo->ehash_mask; + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + +- + rcu_read_lock(); + begin: + sk_nulls_for_each_rcu(sk, node, &head->chain) { +@@ -95,7 +95,7 @@ begin: + sock_put(sk); + goto begin; + } +- goto out; ++ goto out; + } + } + if (get_nulls_value(node) != slot) +@@ -141,6 +141,9 @@ static inline int compute_score(struct s + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + return -1; + score++; ++ } else { ++ if (!v6_addr_in_nx_info(sk->sk_nx_info, daddr, -1)) ++ return -1; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) +diff -NurpP --minimal linux-3.3.8/net/ipv6/ip6_output.c linux-3.3.8-vs2.3.3.4/net/ipv6/ip6_output.c +--- linux-3.3.8/net/ipv6/ip6_output.c 2012-06-08 15:23:47.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/net/ipv6/ip6_output.c 2012-04-03 03:02:13.000000000 +0200 +@@ -968,7 +968,8 @@ static int ip6_dst_lookup_tail(struct so + struct rt6_info *rt = (struct rt6_info *) *dst; + err = ip6_route_get_saddr(net, rt, &fl6->daddr, + sk ? inet6_sk(sk)->srcprefs : 0, +- &fl6->saddr); ++ &fl6->saddr, ++ sk ? 
sk->sk_nx_info : NULL); + if (err) + goto out_err_release; + } +diff -NurpP --minimal linux-3.3.8/net/ipv6/ndisc.c linux-3.3.8-vs2.3.3.4/net/ipv6/ndisc.c +--- linux-3.3.8/net/ipv6/ndisc.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv6/ndisc.c 2012-03-19 20:52:10.000000000 +0100 +@@ -575,7 +575,7 @@ static void ndisc_send_na(struct net_dev + } else { + if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, + inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs, +- &tmpaddr)) ++ &tmpaddr, NULL)) + return; + src_addr = &tmpaddr; + } +diff -NurpP --minimal linux-3.3.8/net/ipv6/raw.c linux-3.3.8-vs2.3.3.4/net/ipv6/raw.c +--- linux-3.3.8/net/ipv6/raw.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv6/raw.c 2012-02-24 03:55:07.000000000 +0100 +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -285,6 +286,13 @@ static int rawv6_bind(struct sock *sk, s + goto out_unlock; + } + ++ if (!v6_addr_in_nx_info(sk->sk_nx_info, &addr->sin6_addr, -1)) { ++ err = -EADDRNOTAVAIL; ++ if (dev) ++ dev_put(dev); ++ goto out; ++ } ++ + /* ipv4 addr of the socket is invalid. Only the + * unspecified and mapped address have a v4 equivalent. + */ +diff -NurpP --minimal linux-3.3.8/net/ipv6/route.c linux-3.3.8-vs2.3.3.4/net/ipv6/route.c +--- linux-3.3.8/net/ipv6/route.c 2012-06-08 15:23:47.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/net/ipv6/route.c 2012-04-03 03:02:13.000000000 +0200 +@@ -55,6 +55,7 @@ + #include + #include + #include ++#include + + #include + +@@ -2107,15 +2108,17 @@ int ip6_route_get_saddr(struct net *net, + struct rt6_info *rt, + const struct in6_addr *daddr, + unsigned int prefs, +- struct in6_addr *saddr) ++ struct in6_addr *saddr, ++ struct nx_info *nxi) + { + struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt); + int err = 0; +- if (rt->rt6i_prefsrc.plen) ++ if (rt->rt6i_prefsrc.plen && (!nxi || ++ v6_addr_in_nx_info(nxi, &rt->rt6i_prefsrc.addr, NXA_TYPE_ADDR))) + *saddr = rt->rt6i_prefsrc.addr; + else + err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, +- daddr, prefs, saddr); ++ daddr, prefs, saddr, nxi); + return err; + } + +@@ -2446,7 +2449,8 @@ static int rt6_fill_node(struct net *net + NLA_PUT_U32(skb, RTA_IIF, iif); + } else if (dst) { + struct in6_addr saddr_buf; +- if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0) ++ if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf, ++ (skb->sk ? skb->sk->sk_nx_info : NULL)) == 0) + NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); + } + +@@ -2660,6 +2664,7 @@ static int rt6_info_route(struct rt6_inf + struct seq_file *m = p_arg; + struct neighbour *n; + ++ /* FIXME: check for network context? */ + seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); + + #ifdef CONFIG_IPV6_SUBTREES +diff -NurpP --minimal linux-3.3.8/net/ipv6/tcp_ipv6.c linux-3.3.8-vs2.3.3.4/net/ipv6/tcp_ipv6.c +--- linux-3.3.8/net/ipv6/tcp_ipv6.c 2012-06-08 15:23:47.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/net/ipv6/tcp_ipv6.c 2012-04-30 19:34:38.000000000 +0200 +@@ -71,6 +71,7 @@ + + #include + #include ++#include + + static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); + static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, +@@ -163,8 +164,15 @@ static int tcp_v6_connect(struct sock *s + * connect() to INADDR_ANY means loopback (BSD'ism). 
+ */ + +- if(ipv6_addr_any(&usin->sin6_addr)) +- usin->sin6_addr.s6_addr[15] = 0x1; ++ if(ipv6_addr_any(&usin->sin6_addr)) { ++ struct nx_info *nxi = sk->sk_nx_info; ++ ++ if (nxi && nx_info_has_v6(nxi)) ++ /* FIXME: remap lback? */ ++ usin->sin6_addr = nxi->v6.ip; ++ else ++ usin->sin6_addr.s6_addr[15] = 0x1; ++ } + + addr_type = ipv6_addr_type(&usin->sin6_addr); + +diff -NurpP --minimal linux-3.3.8/net/ipv6/udp.c linux-3.3.8-vs2.3.3.4/net/ipv6/udp.c +--- linux-3.3.8/net/ipv6/udp.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv6/udp.c 2012-02-24 03:55:07.000000000 +0100 +@@ -45,41 +45,67 @@ + #include + #include + #include ++#include + + #include + #include + #include "udp_impl.h" + +-int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) ++int ipv6_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) + { +- const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; ++ const struct in6_addr *sk1_rcv_saddr6 = &inet6_sk(sk1)->rcv_saddr; + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); +- __be32 sk1_rcv_saddr = sk_rcv_saddr(sk); ++ __be32 sk1_rcv_saddr = sk_rcv_saddr(sk1); + __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); +- int sk_ipv6only = ipv6_only_sock(sk); ++ int sk1_ipv6only = ipv6_only_sock(sk1); + int sk2_ipv6only = inet_v6_ipv6only(sk2); +- int addr_type = ipv6_addr_type(sk_rcv_saddr6); ++ int addr_type = ipv6_addr_type(sk1_rcv_saddr6); + int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ +- if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) +- return (!sk2_ipv6only && ++ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { ++ if (!sk2_ipv6only && + (!sk1_rcv_saddr || !sk2_rcv_saddr || +- sk1_rcv_saddr == sk2_rcv_saddr)); ++ sk1_rcv_saddr == sk2_rcv_saddr)) ++ goto vs_v4; ++ else ++ return 0; ++ } + + if (addr_type2 == IPV6_ADDR_ANY && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) +- return 1; ++ goto vs; + + if (addr_type == IPV6_ADDR_ANY && +- !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) +- return 1; ++ !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) ++ goto vs; + + if (sk2_rcv_saddr6 && +- ipv6_addr_equal(sk_rcv_saddr6, sk2_rcv_saddr6)) +- return 1; ++ ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) ++ goto vs; + + return 0; ++ ++vs_v4: ++ if (!sk1_rcv_saddr && !sk2_rcv_saddr) ++ return nx_v4_addr_conflict(sk1->sk_nx_info, sk2->sk_nx_info); ++ if (!sk2_rcv_saddr) ++ return v4_addr_in_nx_info(sk1->sk_nx_info, sk2_rcv_saddr, -1); ++ if (!sk1_rcv_saddr) ++ return v4_addr_in_nx_info(sk2->sk_nx_info, sk1_rcv_saddr, -1); ++ return 1; ++vs: ++ if (addr_type2 == IPV6_ADDR_ANY && addr_type == IPV6_ADDR_ANY) ++ return nx_v6_addr_conflict(sk1->sk_nx_info, sk2->sk_nx_info); ++ else if (addr_type2 == IPV6_ADDR_ANY) ++ return v6_addr_in_nx_info(sk2->sk_nx_info, sk1_rcv_saddr6, -1); ++ else if (addr_type == IPV6_ADDR_ANY) { ++ if (addr_type2 == IPV6_ADDR_MAPPED) ++ return nx_v4_addr_conflict(sk1->sk_nx_info, sk2->sk_nx_info); ++ else ++ return v6_addr_in_nx_info(sk1->sk_nx_info, sk2_rcv_saddr6, -1); ++ } ++ return 1; + } + + static unsigned int udp6_portaddr_hash(struct net *net, +@@ -143,6 +169,10 @@ static inline int compute_score(struct s + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + return -1; + score++; ++ } else { ++ /* block non nx_info ips */ ++ if (!v6_addr_in_nx_info(sk->sk_nx_info, daddr, -1)) ++ return -1; + } + if (!ipv6_addr_any(&np->daddr)) { + if (!ipv6_addr_equal(&np->daddr, 
saddr)) +diff -NurpP --minimal linux-3.3.8/net/ipv6/xfrm6_policy.c linux-3.3.8-vs2.3.3.4/net/ipv6/xfrm6_policy.c +--- linux-3.3.8/net/ipv6/xfrm6_policy.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/ipv6/xfrm6_policy.c 2012-02-24 03:55:07.000000000 +0100 +@@ -63,7 +63,7 @@ static int xfrm6_get_saddr(struct net *n + dev = ip6_dst_idev(dst)->dev; + ipv6_dev_get_saddr(dev_net(dev), dev, + (struct in6_addr *)&daddr->a6, 0, +- (struct in6_addr *)&saddr->a6); ++ (struct in6_addr *)&saddr->a6, NULL); + dst_release(dst); + return 0; + } +diff -NurpP --minimal linux-3.3.8/net/netfilter/ipvs/ip_vs_xmit.c linux-3.3.8-vs2.3.3.4/net/netfilter/ipvs/ip_vs_xmit.c +--- linux-3.3.8/net/netfilter/ipvs/ip_vs_xmit.c 2012-03-19 19:47:33.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/netfilter/ipvs/ip_vs_xmit.c 2012-02-24 03:55:07.000000000 +0100 +@@ -226,7 +226,7 @@ __ip_vs_route_output_v6(struct net *net, + return dst; + if (ipv6_addr_any(&fl6.saddr) && + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, +- &fl6.daddr, 0, &fl6.saddr) < 0) ++ &fl6.daddr, 0, &fl6.saddr, NULL) < 0) + goto out_err; + if (do_xfrm) { + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); +diff -NurpP --minimal linux-3.3.8/net/netlink/af_netlink.c linux-3.3.8-vs2.3.3.4/net/netlink/af_netlink.c +--- linux-3.3.8/net/netlink/af_netlink.c 2012-06-08 15:23:47.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/net/netlink/af_netlink.c 2012-04-30 19:34:38.000000000 +0200 +@@ -55,6 +55,9 @@ + #include + #include + #include ++#include ++#include ++#include + + #include + #include +@@ -1910,6 +1913,8 @@ static struct sock *netlink_seq_socket_i + sk_for_each(s, node, &hash->table[j]) { + if (sock_net(s) != seq_file_net(seq)) + continue; ++ if (!nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (off == pos) { + iter->link = i; + iter->hash_idx = j; +@@ -1944,7 +1949,8 @@ static void *netlink_seq_next(struct seq + s = v; + do { + s = sk_next(s); +- } while (s && sock_net(s) != seq_file_net(seq)); ++ } while (s && (sock_net(s) != seq_file_net(seq) || ++ !nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT))); + if (s) + return s; + +@@ -1956,7 +1962,8 @@ static void *netlink_seq_next(struct seq + + for (; j <= hash->mask; j++) { + s = sk_head(&hash->table[j]); +- while (s && sock_net(s) != seq_file_net(seq)) ++ while (s && (sock_net(s) != seq_file_net(seq) || ++ !nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT))) + s = sk_next(s); + if (s) { + iter->link = i; +diff -NurpP --minimal linux-3.3.8/net/socket.c linux-3.3.8-vs2.3.3.4/net/socket.c +--- linux-3.3.8/net/socket.c 2012-06-08 15:23:47.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/net/socket.c 2012-04-30 19:34:38.000000000 +0200 +@@ -98,6 +98,10 @@ + + #include + #include ++#include ++#include ++#include ++#include + + #include + #include +@@ -548,6 +552,7 @@ static inline int __sock_sendmsg_nosec(s + struct msghdr *msg, size_t size) + { + struct sock_iocb *si = kiocb_to_siocb(iocb); ++ size_t len; + + sock_update_classid(sock->sk); + +@@ -558,7 +563,22 @@ static inline int __sock_sendmsg_nosec(s + si->msg = msg; + si->size = size; + +- return sock->ops->sendmsg(iocb, sock, msg, size); ++ len = sock->ops->sendmsg(iocb, sock, msg, size); ++ if (sock->sk) { ++ if (len == size) ++ vx_sock_send(sock->sk, size); ++ else ++ vx_sock_fail(sock->sk, size); ++ } ++ vxdprintk(VXD_CBIT(net, 7), ++ "__sock_sendmsg: %p[%p,%p,%p;%d/%d]:%d/%zu", ++ sock, sock->sk, ++ (sock->sk)?sock->sk->sk_nx_info:0, ++ (sock->sk)?sock->sk->sk_vx_info:0, ++ (sock->sk)?sock->sk->sk_xid:0, ++ 
(sock->sk)?sock->sk->sk_nid:0, ++ (unsigned int)size, len); ++ return len; + } + + static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, +@@ -714,6 +734,7 @@ static inline int __sock_recvmsg_nosec(s + struct msghdr *msg, size_t size, int flags) + { + struct sock_iocb *si = kiocb_to_siocb(iocb); ++ int len; + + sock_update_classid(sock->sk); + +@@ -723,7 +744,18 @@ static inline int __sock_recvmsg_nosec(s + si->size = size; + si->flags = flags; + +- return sock->ops->recvmsg(iocb, sock, msg, size, flags); ++ len = sock->ops->recvmsg(iocb, sock, msg, size, flags); ++ if ((len >= 0) && sock->sk) ++ vx_sock_recv(sock->sk, len); ++ vxdprintk(VXD_CBIT(net, 7), ++ "__sock_recvmsg: %p[%p,%p,%p;%d/%d]:%d/%d", ++ sock, sock->sk, ++ (sock->sk)?sock->sk->sk_nx_info:0, ++ (sock->sk)?sock->sk->sk_vx_info:0, ++ (sock->sk)?sock->sk->sk_xid:0, ++ (sock->sk)?sock->sk->sk_nid:0, ++ (unsigned int)size, len); ++ return len; + } + + static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, +@@ -1208,6 +1240,13 @@ int __sock_create(struct net *net, int f + if (type < 0 || type >= SOCK_MAX) + return -EINVAL; + ++ if (!nx_check(0, VS_ADMIN)) { ++ if (family == PF_INET && !current_nx_info_has_v4()) ++ return -EAFNOSUPPORT; ++ if (family == PF_INET6 && !current_nx_info_has_v6()) ++ return -EAFNOSUPPORT; ++ } ++ + /* Compatibility. + + This uglymoron is moved from INET layer to here to avoid +@@ -1343,6 +1382,7 @@ SYSCALL_DEFINE3(socket, int, family, int + if (retval < 0) + goto out; + ++ set_bit(SOCK_USER_SOCKET, &sock->flags); + retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); + if (retval < 0) + goto out_release; +@@ -1384,10 +1424,12 @@ SYSCALL_DEFINE4(socketpair, int, family, + err = sock_create(family, type, protocol, &sock1); + if (err < 0) + goto out; ++ set_bit(SOCK_USER_SOCKET, &sock1->flags); + + err = sock_create(family, type, protocol, &sock2); + if (err < 0) + goto out_release_1; ++ set_bit(SOCK_USER_SOCKET, &sock2->flags); + + err = sock1->ops->socketpair(sock1, sock2); + if (err < 0) +diff -NurpP --minimal linux-3.3.8/net/sunrpc/auth.c linux-3.3.8-vs2.3.3.4/net/sunrpc/auth.c +--- linux-3.3.8/net/sunrpc/auth.c 2011-10-24 18:45:34.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/net/sunrpc/auth.c 2012-02-24 03:55:07.000000000 +0100 +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + + #ifdef RPC_DEBUG + # define RPCDBG_FACILITY RPCDBG_AUTH +@@ -427,6 +428,7 @@ rpcauth_lookupcred(struct rpc_auth *auth + memset(&acred, 0, sizeof(acred)); + acred.uid = cred->fsuid; + acred.gid = cred->fsgid; ++ acred.tag = dx_current_tag(); + acred.group_info = get_group_info(((struct cred *)cred)->group_info); + + ret = auth->au_ops->lookup_cred(auth, &acred, flags); +@@ -467,6 +469,7 @@ rpcauth_bind_root_cred(struct rpc_task * + struct auth_cred acred = { + .uid = 0, + .gid = 0, ++ .tag = dx_current_tag(), + }; + + dprintk("RPC: %5u looking up %s cred\n", +diff -NurpP --minimal linux-3.3.8/net/sunrpc/auth_unix.c linux-3.3.8-vs2.3.3.4/net/sunrpc/auth_unix.c +--- linux-3.3.8/net/sunrpc/auth_unix.c 2012-01-09 16:15:04.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/sunrpc/auth_unix.c 2012-02-24 03:55:07.000000000 +0100 +@@ -12,12 +12,14 @@ + #include + #include + #include ++#include + + #define NFS_NGROUPS 16 + + struct unx_cred { + struct rpc_cred uc_base; + gid_t uc_gid; ++ tag_t uc_tag; + gid_t uc_gids[NFS_NGROUPS]; + }; + #define uc_uid uc_base.cr_uid +@@ -78,6 +80,7 @@ unx_create_cred(struct rpc_auth *auth, s + groups = NFS_NGROUPS; + + cred->uc_gid = acred->gid; 
++ cred->uc_tag = acred->tag; + for (i = 0; i < groups; i++) + cred->uc_gids[i] = GROUP_AT(acred->group_info, i); + if (i < NFS_NGROUPS) +@@ -119,7 +122,9 @@ unx_match(struct auth_cred *acred, struc + unsigned int i; + + +- if (cred->uc_uid != acred->uid || cred->uc_gid != acred->gid) ++ if (cred->uc_uid != acred->uid || ++ cred->uc_gid != acred->gid || ++ cred->uc_tag != acred->tag) + return 0; + + if (acred->group_info != NULL) +@@ -145,7 +150,7 @@ unx_marshal(struct rpc_task *task, __be3 + struct rpc_clnt *clnt = task->tk_client; + struct unx_cred *cred = container_of(task->tk_rqstp->rq_cred, struct unx_cred, uc_base); + __be32 *base, *hold; +- int i; ++ int i, tag; + + *p++ = htonl(RPC_AUTH_UNIX); + base = p++; +@@ -155,9 +160,12 @@ unx_marshal(struct rpc_task *task, __be3 + * Copy the UTS nodename captured when the client was created. + */ + p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); ++ tag = task->tk_client->cl_tag; + +- *p++ = htonl((u32) cred->uc_uid); +- *p++ = htonl((u32) cred->uc_gid); ++ *p++ = htonl((u32) TAGINO_UID(tag, ++ cred->uc_uid, cred->uc_tag)); ++ *p++ = htonl((u32) TAGINO_GID(tag, ++ cred->uc_gid, cred->uc_tag)); + hold = p++; + for (i = 0; i < 16 && cred->uc_gids[i] != (gid_t) NOGROUP; i++) + *p++ = htonl((u32) cred->uc_gids[i]); +diff -NurpP --minimal linux-3.3.8/net/sunrpc/clnt.c linux-3.3.8-vs2.3.3.4/net/sunrpc/clnt.c +--- linux-3.3.8/net/sunrpc/clnt.c 2012-01-09 16:15:04.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/sunrpc/clnt.c 2012-02-24 03:55:07.000000000 +0100 +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -361,6 +362,9 @@ struct rpc_clnt *rpc_create(struct rpc_c + if (!(args->flags & RPC_CLNT_CREATE_QUIET)) + clnt->cl_chatty = 1; + ++ /* TODO: handle RPC_CLNT_CREATE_TAGGED ++ if (args->flags & RPC_CLNT_CREATE_TAGGED) ++ clnt->cl_tag = 1; */ + return clnt; + } + EXPORT_SYMBOL_GPL(rpc_create); +diff -NurpP --minimal linux-3.3.8/net/unix/af_unix.c linux-3.3.8-vs2.3.3.4/net/unix/af_unix.c +--- linux-3.3.8/net/unix/af_unix.c 2012-03-19 19:47:34.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/net/unix/af_unix.c 2012-02-24 03:55:07.000000000 +0100 +@@ -114,6 +114,8 @@ + #include + #include + #include ++#include ++#include + + struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; + EXPORT_SYMBOL_GPL(unix_socket_table); +@@ -261,6 +263,8 @@ static struct sock *__unix_find_socket_b + if (!net_eq(sock_net(s), net)) + continue; + ++ if (!nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (u->addr->len == len && + !memcmp(u->addr->name, sunname, len)) + goto found; +@@ -2235,6 +2239,8 @@ static struct sock *unix_seq_idx(struct + for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) { + if (sock_net(s) != seq_file_net(seq)) + continue; ++ if (!nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (off == pos) + return s; + ++off; +@@ -2259,7 +2265,8 @@ static void *unix_seq_next(struct seq_fi + sk = first_unix_socket(&iter->i); + else + sk = next_unix_socket(&iter->i, sk); +- while (sk && (sock_net(sk) != seq_file_net(seq))) ++ while (sk && (sock_net(sk) != seq_file_net(seq) || ++ !nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT))) + sk = next_unix_socket(&iter->i, sk); + return sk; + } +diff -NurpP --minimal linux-3.3.8/scripts/checksyscalls.sh linux-3.3.8-vs2.3.3.4/scripts/checksyscalls.sh +--- linux-3.3.8/scripts/checksyscalls.sh 2012-03-19 19:47:34.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/scripts/checksyscalls.sh 2012-02-24 03:55:07.000000000 +0100 +@@ -193,7 +193,6 
@@ cat << EOF + #define __IGNORE_afs_syscall + #define __IGNORE_getpmsg + #define __IGNORE_putpmsg +-#define __IGNORE_vserver + EOF + } + +diff -NurpP --minimal linux-3.3.8/security/commoncap.c linux-3.3.8-vs2.3.3.4/security/commoncap.c +--- linux-3.3.8/security/commoncap.c 2012-06-08 15:23:47.000000000 +0200 ++++ linux-3.3.8-vs2.3.3.4/security/commoncap.c 2012-04-23 23:45:14.000000000 +0200 +@@ -75,14 +75,20 @@ int cap_netlink_send(struct sock *sk, st + int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, + int cap, int audit) + { ++ struct vx_info *vxi = current_vx_info(); /* FIXME: get vxi from cred? */ ++ + for (;;) { + /* The creator of the user namespace has all caps. */ + if (targ_ns != &init_user_ns && targ_ns->creator == cred->user) + return 0; + + /* Do we have the necessary capabilities? */ +- if (targ_ns == cred->user->user_ns) +- return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; ++ if (targ_ns == cred->user->user_ns) { ++ if (vx_info_flags(vxi, VXF_STATE_SETUP, 0) && ++ cap_raised(cred->cap_effective, cap)) ++ return 0; ++ return vx_cap_raised(vxi, cred->cap_effective, cap) ? 0 : -EPERM; ++ } + + /* Have we tried all of the parent namespaces? */ + if (targ_ns == &init_user_ns) +@@ -611,7 +617,7 @@ int cap_inode_setxattr(struct dentry *de + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1) && +- !capable(CAP_SYS_ADMIN)) ++ !vx_capable(CAP_SYS_ADMIN, VXC_FS_SECURITY)) + return -EPERM; + return 0; + } +@@ -637,7 +643,7 @@ int cap_inode_removexattr(struct dentry + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1) && +- !capable(CAP_SYS_ADMIN)) ++ !vx_capable(CAP_SYS_ADMIN, VXC_FS_SECURITY)) + return -EPERM; + return 0; + } +diff -NurpP --minimal linux-3.3.8/security/selinux/hooks.c linux-3.3.8-vs2.3.3.4/security/selinux/hooks.c +--- linux-3.3.8/security/selinux/hooks.c 2012-03-19 19:47:34.000000000 +0100 ++++ linux-3.3.8-vs2.3.3.4/security/selinux/hooks.c 2012-02-24 03:55:07.000000000 +0100 +@@ -67,7 +67,6 @@ + #include + #include + #include /* for Unix socket types */ +-#include /* for Unix socket types */ + #include + #include + #include diff --git a/3.3.8/wrapfs-v3.3-rc1-429-g65388bc.patch b/3.3.8/wrapfs-v3.3-rc1-429-g65388bc.patch new file mode 100644 index 0000000..0c489fa --- /dev/null +++ b/3.3.8/wrapfs-v3.3-rc1-429-g65388bc.patch @@ -0,0 +1,1913 @@ +diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX +index 8c624a1..b8822ed 100644 +--- a/Documentation/filesystems/00-INDEX ++++ b/Documentation/filesystems/00-INDEX +@@ -114,6 +114,8 @@ vfat.txt + - info on using the VFAT filesystem used in Windows NT and Windows 95 + vfs.txt + - overview of the Virtual File System ++wrapfs.txt ++ - info and mount options for the stackable wrapper file system + xfs.txt + - info and mount options for the XFS filesystem. 
+ xip.txt +diff --git a/MAINTAINERS b/MAINTAINERS +index 1b6e835..bdaad92 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -7395,6 +7395,16 @@ F: include/linux/workqueue.h + F: kernel/workqueue.c + F: Documentation/workqueue.txt + ++WRAPFS ++P: Erez Zadok ++M: ezk@cs.sunysb.edu ++L: wrapfs@filesystems.org ++W: http://wrapfs.filesystems.org/ ++T: git git.kernel.org/pub/scm/linux/kernel/git/ezk/wrapfs.git ++S: Maintained ++F: Documentation/filesystems/wrapfs.txt ++F: fs/wrapfs/ ++ + X.25 NETWORK LAYER + M: Andrew Hendry + L: linux-x25@vger.kernel.org +diff --git a/fs/Kconfig b/fs/Kconfig +index d621f02..6407aa4 100644 +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -194,6 +194,7 @@ if MISC_FILESYSTEMS + source "fs/adfs/Kconfig" + source "fs/affs/Kconfig" + source "fs/ecryptfs/Kconfig" ++source "fs/wrapfs/Kconfig" + source "fs/hfs/Kconfig" + source "fs/hfsplus/Kconfig" + source "fs/befs/Kconfig" +diff --git a/fs/Makefile b/fs/Makefile +index 93804d4..a68e75a 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -83,6 +83,7 @@ obj-$(CONFIG_ISO9660_FS) += isofs/ + obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+ + obj-$(CONFIG_HFS_FS) += hfs/ + obj-$(CONFIG_ECRYPT_FS) += ecryptfs/ ++obj-$(CONFIG_WRAP_FS) += wrapfs/ + obj-$(CONFIG_VXFS_FS) += freevxfs/ + obj-$(CONFIG_NFS_FS) += nfs/ + obj-$(CONFIG_EXPORTFS) += exportfs/ +diff --git a/fs/wrapfs/Kconfig b/fs/wrapfs/Kconfig +new file mode 100644 +index 0000000..d790ccd +--- /dev/null ++++ b/fs/wrapfs/Kconfig +@@ -0,0 +1,9 @@ ++config WRAP_FS ++ tristate "Wrapfs stackable file system (EXPERIMENTAL)" ++ depends on EXPERIMENTAL ++ help ++ Wrapfs is a stackable file system which simply passes its ++ operations to the lower layer. It is designed as a useful ++ template for developing or debugging other stackable file systems, ++ and more (see Documentation/filesystems/wrapfs.txt). See ++ for details. +diff --git a/fs/wrapfs/Makefile b/fs/wrapfs/Makefile +new file mode 100644 +index 0000000..f318d11 +--- /dev/null ++++ b/fs/wrapfs/Makefile +@@ -0,0 +1,7 @@ ++WRAPFS_VERSION="0.1" ++ ++EXTRA_CFLAGS += -DWRAPFS_VERSION=\"$(WRAPFS_VERSION)\" ++ ++obj-$(CONFIG_WRAP_FS) += wrapfs.o ++ ++wrapfs-y := dentry.o file.o inode.o main.o super.o lookup.o mmap.o +diff --git a/fs/wrapfs/dentry.c b/fs/wrapfs/dentry.c +new file mode 100644 +index 0000000..b173153 +--- /dev/null ++++ b/fs/wrapfs/dentry.c +@@ -0,0 +1,52 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. 
++ */ ++ ++#include "wrapfs.h" ++ ++/* ++ * returns: -ERRNO if error (returned to user) ++ * 0: tell VFS to invalidate dentry ++ * 1: dentry is valid ++ */ ++static int wrapfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) ++{ ++ struct path lower_path, saved_path; ++ struct dentry *lower_dentry; ++ int err = 1; ++ ++ if (nd && nd->flags & LOOKUP_RCU) ++ return -ECHILD; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) ++ goto out; ++ pathcpy(&saved_path, &nd->path); ++ pathcpy(&nd->path, &lower_path); ++ err = lower_dentry->d_op->d_revalidate(lower_dentry, nd); ++ pathcpy(&nd->path, &saved_path); ++out: ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static void wrapfs_d_release(struct dentry *dentry) ++{ ++ /* release and reset the lower paths */ ++ wrapfs_put_reset_lower_path(dentry); ++ free_dentry_private_data(dentry); ++ return; ++} ++ ++const struct dentry_operations wrapfs_dops = { ++ .d_revalidate = wrapfs_d_revalidate, ++ .d_release = wrapfs_d_release, ++}; +diff --git a/fs/wrapfs/file.c b/fs/wrapfs/file.c +new file mode 100644 +index 0000000..7a7fe1e +--- /dev/null ++++ b/fs/wrapfs/file.c +@@ -0,0 +1,298 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include "wrapfs.h" ++ ++static ssize_t wrapfs_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ int err; ++ struct file *lower_file; ++ struct dentry *dentry = file->f_path.dentry; ++ ++ lower_file = wrapfs_lower_file(file); ++ err = vfs_read(lower_file, buf, count, ppos); ++ /* update our inode atime upon a successful lower read */ ++ if (err >= 0) ++ fsstack_copy_attr_atime(dentry->d_inode, ++ lower_file->f_path.dentry->d_inode); ++ ++ return err; ++} ++ ++static ssize_t wrapfs_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ int err = 0; ++ struct file *lower_file; ++ struct dentry *dentry = file->f_path.dentry; ++ ++ lower_file = wrapfs_lower_file(file); ++ err = vfs_write(lower_file, buf, count, ppos); ++ /* update our inode times+sizes upon a successful lower write */ ++ if (err >= 0) { ++ fsstack_copy_inode_size(dentry->d_inode, ++ lower_file->f_path.dentry->d_inode); ++ fsstack_copy_attr_times(dentry->d_inode, ++ lower_file->f_path.dentry->d_inode); ++ } ++ ++ return err; ++} ++ ++static int wrapfs_readdir(struct file *file, void *dirent, filldir_t filldir) ++{ ++ int err = 0; ++ struct file *lower_file = NULL; ++ struct dentry *dentry = file->f_path.dentry; ++ ++ lower_file = wrapfs_lower_file(file); ++ err = vfs_readdir(lower_file, filldir, dirent); ++ file->f_pos = lower_file->f_pos; ++ if (err >= 0) /* copy the atime */ ++ fsstack_copy_attr_atime(dentry->d_inode, ++ lower_file->f_path.dentry->d_inode); ++ return err; ++} ++ ++static long wrapfs_unlocked_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ long err = -ENOTTY; ++ struct file *lower_file; ++ ++ lower_file = wrapfs_lower_file(file); ++ ++ /* XXX: use vfs_ioctl if/when VFS exports it */ ++ if (!lower_file || !lower_file->f_op) ++ goto out; ++ if (lower_file->f_op->unlocked_ioctl) ++ 
err = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); ++ ++out: ++ return err; ++} ++ ++#ifdef CONFIG_COMPAT ++static long wrapfs_compat_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ long err = -ENOTTY; ++ struct file *lower_file; ++ ++ lower_file = wrapfs_lower_file(file); ++ ++ /* XXX: use vfs_ioctl if/when VFS exports it */ ++ if (!lower_file || !lower_file->f_op) ++ goto out; ++ if (lower_file->f_op->compat_ioctl) ++ err = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); ++ ++out: ++ return err; ++} ++#endif ++ ++static int wrapfs_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ int err = 0; ++ bool willwrite; ++ struct file *lower_file; ++ const struct vm_operations_struct *saved_vm_ops = NULL; ++ ++ /* this might be deferred to mmap's writepage */ ++ willwrite = ((vma->vm_flags | VM_SHARED | VM_WRITE) == vma->vm_flags); ++ ++ /* ++ * File systems which do not implement ->writepage may use ++ * generic_file_readonly_mmap as their ->mmap op. If you call ++ * generic_file_readonly_mmap with VM_WRITE, you'd get an -EINVAL. ++ * But we cannot call the lower ->mmap op, so we can't tell that ++ * writeable mappings won't work. Therefore, our only choice is to ++ * check if the lower file system supports the ->writepage, and if ++ * not, return EINVAL (the same error that ++ * generic_file_readonly_mmap returns in that case). ++ */ ++ lower_file = wrapfs_lower_file(file); ++ if (willwrite && !lower_file->f_mapping->a_ops->writepage) { ++ err = -EINVAL; ++ printk(KERN_ERR "wrapfs: lower file system does not " ++ "support writeable mmap\n"); ++ goto out; ++ } ++ ++ /* ++ * find and save lower vm_ops. ++ * ++ * XXX: the VFS should have a cleaner way of finding the lower vm_ops ++ */ ++ if (!WRAPFS_F(file)->lower_vm_ops) { ++ err = lower_file->f_op->mmap(lower_file, vma); ++ if (err) { ++ printk(KERN_ERR "wrapfs: lower mmap failed %d\n", err); ++ goto out; ++ } ++ saved_vm_ops = vma->vm_ops; /* save: came from lower ->mmap */ ++ err = do_munmap(current->mm, vma->vm_start, ++ vma->vm_end - vma->vm_start); ++ if (err) { ++ printk(KERN_ERR "wrapfs: do_munmap failed %d\n", err); ++ goto out; ++ } ++ } ++ ++ /* ++ * Next 3 lines are all I need from generic_file_mmap. I definitely ++ * don't want its test for ->readpage which returns -ENOEXEC. 
++ */ ++ file_accessed(file); ++ vma->vm_ops = &wrapfs_vm_ops; ++ vma->vm_flags |= VM_CAN_NONLINEAR; ++ ++ file->f_mapping->a_ops = &wrapfs_aops; /* set our aops */ ++ if (!WRAPFS_F(file)->lower_vm_ops) /* save for our ->fault */ ++ WRAPFS_F(file)->lower_vm_ops = saved_vm_ops; ++ ++out: ++ return err; ++} ++ ++static int wrapfs_open(struct inode *inode, struct file *file) ++{ ++ int err = 0; ++ struct file *lower_file = NULL; ++ struct path lower_path; ++ ++ /* don't open unhashed/deleted files */ ++ if (d_unhashed(file->f_path.dentry)) { ++ err = -ENOENT; ++ goto out_err; ++ } ++ ++ file->private_data = ++ kzalloc(sizeof(struct wrapfs_file_info), GFP_KERNEL); ++ if (!WRAPFS_F(file)) { ++ err = -ENOMEM; ++ goto out_err; ++ } ++ ++ /* open lower object and link wrapfs's file struct to lower's */ ++ wrapfs_get_lower_path(file->f_path.dentry, &lower_path); ++ lower_file = dentry_open(lower_path.dentry, lower_path.mnt, ++ file->f_flags, current_cred()); ++ if (IS_ERR(lower_file)) { ++ err = PTR_ERR(lower_file); ++ lower_file = wrapfs_lower_file(file); ++ if (lower_file) { ++ wrapfs_set_lower_file(file, NULL); ++ fput(lower_file); /* fput calls dput for lower_dentry */ ++ } ++ } else { ++ wrapfs_set_lower_file(file, lower_file); ++ } ++ ++ if (err) ++ kfree(WRAPFS_F(file)); ++ else ++ fsstack_copy_attr_all(inode, wrapfs_lower_inode(inode)); ++out_err: ++ return err; ++} ++ ++static int wrapfs_flush(struct file *file, fl_owner_t id) ++{ ++ int err = 0; ++ struct file *lower_file = NULL; ++ ++ lower_file = wrapfs_lower_file(file); ++ if (lower_file && lower_file->f_op && lower_file->f_op->flush) ++ err = lower_file->f_op->flush(lower_file, id); ++ ++ return err; ++} ++ ++/* release all lower object references & free the file info structure */ ++static int wrapfs_file_release(struct inode *inode, struct file *file) ++{ ++ struct file *lower_file; ++ ++ lower_file = wrapfs_lower_file(file); ++ if (lower_file) { ++ wrapfs_set_lower_file(file, NULL); ++ fput(lower_file); ++ } ++ ++ kfree(WRAPFS_F(file)); ++ return 0; ++} ++ ++static int wrapfs_fsync(struct file *file, loff_t start, loff_t end, ++ int datasync) ++{ ++ int err; ++ struct file *lower_file; ++ struct path lower_path; ++ struct dentry *dentry = file->f_path.dentry; ++ ++ err = generic_file_fsync(file, start, end, datasync); ++ if (err) ++ goto out; ++ lower_file = wrapfs_lower_file(file); ++ wrapfs_get_lower_path(dentry, &lower_path); ++ err = vfs_fsync_range(lower_file, start, end, datasync); ++ wrapfs_put_lower_path(dentry, &lower_path); ++out: ++ return err; ++} ++ ++static int wrapfs_fasync(int fd, struct file *file, int flag) ++{ ++ int err = 0; ++ struct file *lower_file = NULL; ++ ++ lower_file = wrapfs_lower_file(file); ++ if (lower_file->f_op && lower_file->f_op->fasync) ++ err = lower_file->f_op->fasync(fd, lower_file, flag); ++ ++ return err; ++} ++ ++const struct file_operations wrapfs_main_fops = { ++ .llseek = generic_file_llseek, ++ .read = wrapfs_read, ++ .write = wrapfs_write, ++ .unlocked_ioctl = wrapfs_unlocked_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = wrapfs_compat_ioctl, ++#endif ++ .mmap = wrapfs_mmap, ++ .open = wrapfs_open, ++ .flush = wrapfs_flush, ++ .release = wrapfs_file_release, ++ .fsync = wrapfs_fsync, ++ .fasync = wrapfs_fasync, ++}; ++ ++/* trimmed directory options */ ++const struct file_operations wrapfs_dir_fops = { ++ .llseek = generic_file_llseek, ++ .read = generic_read_dir, ++ .readdir = wrapfs_readdir, ++ .unlocked_ioctl = wrapfs_unlocked_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = 
wrapfs_compat_ioctl, ++#endif ++ .open = wrapfs_open, ++ .release = wrapfs_file_release, ++ .flush = wrapfs_flush, ++ .fsync = wrapfs_fsync, ++ .fasync = wrapfs_fasync, ++}; +diff --git a/fs/wrapfs/inode.c b/fs/wrapfs/inode.c +new file mode 100644 +index 0000000..b2653b3 +--- /dev/null ++++ b/fs/wrapfs/inode.c +@@ -0,0 +1,514 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include "wrapfs.h" ++ ++static int wrapfs_create(struct inode *dir, struct dentry *dentry, ++ umode_t mode, struct nameidata *nd) ++{ ++ int err = 0; ++ struct dentry *lower_dentry; ++ struct dentry *lower_parent_dentry = NULL; ++ struct path lower_path, saved_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ lower_parent_dentry = lock_parent(lower_dentry); ++ ++ err = mnt_want_write(lower_path.mnt); ++ if (err) ++ goto out_unlock; ++ ++ pathcpy(&saved_path, &nd->path); ++ pathcpy(&nd->path, &lower_path); ++ err = vfs_create(lower_parent_dentry->d_inode, lower_dentry, mode, nd); ++ pathcpy(&nd->path, &saved_path); ++ if (err) ++ goto out; ++ ++ err = wrapfs_interpose(dentry, dir->i_sb, &lower_path); ++ if (err) ++ goto out; ++ fsstack_copy_attr_times(dir, wrapfs_lower_inode(dir)); ++ fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); ++ ++out: ++ mnt_drop_write(lower_path.mnt); ++out_unlock: ++ unlock_dir(lower_parent_dentry); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static int wrapfs_link(struct dentry *old_dentry, struct inode *dir, ++ struct dentry *new_dentry) ++{ ++ struct dentry *lower_old_dentry; ++ struct dentry *lower_new_dentry; ++ struct dentry *lower_dir_dentry; ++ u64 file_size_save; ++ int err; ++ struct path lower_old_path, lower_new_path; ++ ++ file_size_save = i_size_read(old_dentry->d_inode); ++ wrapfs_get_lower_path(old_dentry, &lower_old_path); ++ wrapfs_get_lower_path(new_dentry, &lower_new_path); ++ lower_old_dentry = lower_old_path.dentry; ++ lower_new_dentry = lower_new_path.dentry; ++ lower_dir_dentry = lock_parent(lower_new_dentry); ++ ++ err = mnt_want_write(lower_new_path.mnt); ++ if (err) ++ goto out_unlock; ++ ++ err = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode, ++ lower_new_dentry); ++ if (err || !lower_new_dentry->d_inode) ++ goto out; ++ ++ err = wrapfs_interpose(new_dentry, dir->i_sb, &lower_new_path); ++ if (err) ++ goto out; ++ fsstack_copy_attr_times(dir, lower_new_dentry->d_inode); ++ fsstack_copy_inode_size(dir, lower_new_dentry->d_inode); ++ set_nlink(old_dentry->d_inode, ++ wrapfs_lower_inode(old_dentry->d_inode)->i_nlink); ++ i_size_write(new_dentry->d_inode, file_size_save); ++out: ++ mnt_drop_write(lower_new_path.mnt); ++out_unlock: ++ unlock_dir(lower_dir_dentry); ++ wrapfs_put_lower_path(old_dentry, &lower_old_path); ++ wrapfs_put_lower_path(new_dentry, &lower_new_path); ++ return err; ++} ++ ++static int wrapfs_unlink(struct inode *dir, struct dentry *dentry) ++{ ++ int err; ++ struct dentry *lower_dentry; ++ struct inode *lower_dir_inode = wrapfs_lower_inode(dir); ++ struct dentry *lower_dir_dentry; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ 
dget(lower_dentry); ++ lower_dir_dentry = lock_parent(lower_dentry); ++ ++ err = mnt_want_write(lower_path.mnt); ++ if (err) ++ goto out_unlock; ++ err = vfs_unlink(lower_dir_inode, lower_dentry); ++ ++ /* ++ * Note: unlinking on top of NFS can cause silly-renamed files. ++ * Trying to delete such files results in EBUSY from NFS ++ * below. Silly-renamed files will get deleted by NFS later on, so ++ * we just need to detect them here and treat such EBUSY errors as ++ * if the upper file was successfully deleted. ++ */ ++ if (err == -EBUSY && lower_dentry->d_flags & DCACHE_NFSFS_RENAMED) ++ err = 0; ++ if (err) ++ goto out; ++ fsstack_copy_attr_times(dir, lower_dir_inode); ++ fsstack_copy_inode_size(dir, lower_dir_inode); ++ set_nlink(dentry->d_inode, ++ wrapfs_lower_inode(dentry->d_inode)->i_nlink); ++ dentry->d_inode->i_ctime = dir->i_ctime; ++ d_drop(dentry); /* this is needed, else LTP fails (VFS won't do it) */ ++out: ++ mnt_drop_write(lower_path.mnt); ++out_unlock: ++ unlock_dir(lower_dir_dentry); ++ dput(lower_dentry); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static int wrapfs_symlink(struct inode *dir, struct dentry *dentry, ++ const char *symname) ++{ ++ int err = 0; ++ struct dentry *lower_dentry; ++ struct dentry *lower_parent_dentry = NULL; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ lower_parent_dentry = lock_parent(lower_dentry); ++ ++ err = mnt_want_write(lower_path.mnt); ++ if (err) ++ goto out_unlock; ++ err = vfs_symlink(lower_parent_dentry->d_inode, lower_dentry, symname); ++ if (err) ++ goto out; ++ err = wrapfs_interpose(dentry, dir->i_sb, &lower_path); ++ if (err) ++ goto out; ++ fsstack_copy_attr_times(dir, wrapfs_lower_inode(dir)); ++ fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); ++ ++out: ++ mnt_drop_write(lower_path.mnt); ++out_unlock: ++ unlock_dir(lower_parent_dentry); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static int wrapfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ++{ ++ int err = 0; ++ struct dentry *lower_dentry; ++ struct dentry *lower_parent_dentry = NULL; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ lower_parent_dentry = lock_parent(lower_dentry); ++ ++ err = mnt_want_write(lower_path.mnt); ++ if (err) ++ goto out_unlock; ++ err = vfs_mkdir(lower_parent_dentry->d_inode, lower_dentry, mode); ++ if (err) ++ goto out; ++ ++ err = wrapfs_interpose(dentry, dir->i_sb, &lower_path); ++ if (err) ++ goto out; ++ ++ fsstack_copy_attr_times(dir, wrapfs_lower_inode(dir)); ++ fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); ++ /* update number of links on parent directory */ ++ set_nlink(dir, wrapfs_lower_inode(dir)->i_nlink); ++ ++out: ++ mnt_drop_write(lower_path.mnt); ++out_unlock: ++ unlock_dir(lower_parent_dentry); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static int wrapfs_rmdir(struct inode *dir, struct dentry *dentry) ++{ ++ struct dentry *lower_dentry; ++ struct dentry *lower_dir_dentry; ++ int err; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ lower_dir_dentry = lock_parent(lower_dentry); ++ ++ err = mnt_want_write(lower_path.mnt); ++ if (err) ++ goto out_unlock; ++ err = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry); ++ if (err) ++ goto out; ++ ++ d_drop(dentry); /* drop our dentry on success (why not VFS's 
job?) */ ++ if (dentry->d_inode) ++ clear_nlink(dentry->d_inode); ++ fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); ++ fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); ++ set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); ++ ++out: ++ mnt_drop_write(lower_path.mnt); ++out_unlock: ++ unlock_dir(lower_dir_dentry); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static int wrapfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, ++ dev_t dev) ++{ ++ int err = 0; ++ struct dentry *lower_dentry; ++ struct dentry *lower_parent_dentry = NULL; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ lower_parent_dentry = lock_parent(lower_dentry); ++ ++ err = mnt_want_write(lower_path.mnt); ++ if (err) ++ goto out_unlock; ++ err = vfs_mknod(lower_parent_dentry->d_inode, lower_dentry, mode, dev); ++ if (err) ++ goto out; ++ ++ err = wrapfs_interpose(dentry, dir->i_sb, &lower_path); ++ if (err) ++ goto out; ++ fsstack_copy_attr_times(dir, wrapfs_lower_inode(dir)); ++ fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode); ++ ++out: ++ mnt_drop_write(lower_path.mnt); ++out_unlock: ++ unlock_dir(lower_parent_dentry); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++/* ++ * The locking rules in wrapfs_rename are complex. We could use a simpler ++ * superblock-level name-space lock for renames and copy-ups. ++ */ ++static int wrapfs_rename(struct inode *old_dir, struct dentry *old_dentry, ++ struct inode *new_dir, struct dentry *new_dentry) ++{ ++ int err = 0; ++ struct dentry *lower_old_dentry = NULL; ++ struct dentry *lower_new_dentry = NULL; ++ struct dentry *lower_old_dir_dentry = NULL; ++ struct dentry *lower_new_dir_dentry = NULL; ++ struct dentry *trap = NULL; ++ struct path lower_old_path, lower_new_path; ++ ++ wrapfs_get_lower_path(old_dentry, &lower_old_path); ++ wrapfs_get_lower_path(new_dentry, &lower_new_path); ++ lower_old_dentry = lower_old_path.dentry; ++ lower_new_dentry = lower_new_path.dentry; ++ lower_old_dir_dentry = dget_parent(lower_old_dentry); ++ lower_new_dir_dentry = dget_parent(lower_new_dentry); ++ ++ trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); ++ /* source should not be ancestor of target */ ++ if (trap == lower_old_dentry) { ++ err = -EINVAL; ++ goto out; ++ } ++ /* target should not be ancestor of source */ ++ if (trap == lower_new_dentry) { ++ err = -ENOTEMPTY; ++ goto out; ++ } ++ ++ err = mnt_want_write(lower_old_path.mnt); ++ if (err) ++ goto out; ++ err = mnt_want_write(lower_new_path.mnt); ++ if (err) ++ goto out_drop_old_write; ++ ++ err = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, ++ lower_new_dir_dentry->d_inode, lower_new_dentry); ++ if (err) ++ goto out_err; ++ ++ fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); ++ fsstack_copy_inode_size(new_dir, lower_new_dir_dentry->d_inode); ++ if (new_dir != old_dir) { ++ fsstack_copy_attr_all(old_dir, ++ lower_old_dir_dentry->d_inode); ++ fsstack_copy_inode_size(old_dir, ++ lower_old_dir_dentry->d_inode); ++ } ++ ++out_err: ++ mnt_drop_write(lower_new_path.mnt); ++out_drop_old_write: ++ mnt_drop_write(lower_old_path.mnt); ++out: ++ unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); ++ dput(lower_old_dir_dentry); ++ dput(lower_new_dir_dentry); ++ wrapfs_put_lower_path(old_dentry, &lower_old_path); ++ wrapfs_put_lower_path(new_dentry, &lower_new_path); ++ return err; ++} ++ ++static int wrapfs_readlink(struct dentry *dentry, 
char __user *buf, int bufsiz) ++{ ++ int err; ++ struct dentry *lower_dentry; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ if (!lower_dentry->d_inode->i_op || ++ !lower_dentry->d_inode->i_op->readlink) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ err = lower_dentry->d_inode->i_op->readlink(lower_dentry, ++ buf, bufsiz); ++ if (err < 0) ++ goto out; ++ fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode); ++ ++out: ++ wrapfs_put_lower_path(dentry, &lower_path); ++ return err; ++} ++ ++static void *wrapfs_follow_link(struct dentry *dentry, struct nameidata *nd) ++{ ++ char *buf; ++ int len = PAGE_SIZE, err; ++ mm_segment_t old_fs; ++ ++ /* This is freed by the put_link method assuming a successful call. */ ++ buf = kmalloc(len, GFP_KERNEL); ++ if (!buf) { ++ buf = ERR_PTR(-ENOMEM); ++ goto out; ++ } ++ ++ /* read the symlink, and then we will follow it */ ++ old_fs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = wrapfs_readlink(dentry, buf, len); ++ set_fs(old_fs); ++ if (err < 0) { ++ kfree(buf); ++ buf = ERR_PTR(err); ++ } else { ++ buf[err] = '\0'; ++ } ++out: ++ nd_set_link(nd, buf); ++ return NULL; ++} ++ ++/* this @nd *IS* still used */ ++static void wrapfs_put_link(struct dentry *dentry, struct nameidata *nd, ++ void *cookie) ++{ ++ char *buf = nd_get_link(nd); ++ if (!IS_ERR(buf)) /* free the char* */ ++ kfree(buf); ++} ++ ++static int wrapfs_permission(struct inode *inode, int mask) ++{ ++ struct inode *lower_inode; ++ int err; ++ ++ lower_inode = wrapfs_lower_inode(inode); ++ err = inode_permission(lower_inode, mask); ++ return err; ++} ++ ++static int wrapfs_setattr(struct dentry *dentry, struct iattr *ia) ++{ ++ int err = 0; ++ struct dentry *lower_dentry; ++ struct inode *inode; ++ struct inode *lower_inode; ++ struct path lower_path; ++ struct iattr lower_ia; ++ ++ inode = dentry->d_inode; ++ ++ /* ++ * Check if user has permission to change inode. We don't check if ++ * this user can change the lower inode: that should happen when ++ * calling notify_change on the lower inode. ++ */ ++ err = inode_change_ok(inode, ia); ++ if (err) ++ goto out_err; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ lower_dentry = lower_path.dentry; ++ lower_inode = wrapfs_lower_inode(inode); ++ ++ /* prepare our own lower struct iattr (with the lower file) */ ++ memcpy(&lower_ia, ia, sizeof(lower_ia)); ++ if (ia->ia_valid & ATTR_FILE) ++ lower_ia.ia_file = wrapfs_lower_file(ia->ia_file); ++ ++ /* ++ * If shrinking, first truncate upper level to cancel writing dirty ++ * pages beyond the new eof; and also if its' maxbytes is more ++ * limiting (fail with -EFBIG before making any change to the lower ++ * level). There is no need to vmtruncate the upper level ++ * afterwards in the other cases: we fsstack_copy_inode_size from ++ * the lower level. ++ */ ++ if (ia->ia_valid & ATTR_SIZE) { ++ err = inode_newsize_ok(inode, ia->ia_size); ++ if (err) ++ goto out; ++ truncate_setsize(inode, ia->ia_size); ++ } ++ ++ /* ++ * mode change is for clearing setuid/setgid bits. Allow lower fs ++ * to interpret this in its own way. ++ */ ++ if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) ++ lower_ia.ia_valid &= ~ATTR_MODE; ++ ++ /* notify the (possibly copied-up) lower inode */ ++ /* ++ * Note: we use lower_dentry->d_inode, because lower_inode may be ++ * unlinked (no inode->i_sb and i_ino==0. This happens if someone ++ * tries to open(), unlink(), then ftruncate() a file. 
++ */ ++ mutex_lock(&lower_dentry->d_inode->i_mutex); ++ err = notify_change(lower_dentry, &lower_ia); /* note: lower_ia */ ++ mutex_unlock(&lower_dentry->d_inode->i_mutex); ++ if (err) ++ goto out; ++ ++ /* get attributes from the lower inode */ ++ fsstack_copy_attr_all(inode, lower_inode); ++ /* ++ * Not running fsstack_copy_inode_size(inode, lower_inode), because ++ * VFS should update our inode size, and notify_change on ++ * lower_inode should update its size. ++ */ ++ ++out: ++ wrapfs_put_lower_path(dentry, &lower_path); ++out_err: ++ return err; ++} ++ ++const struct inode_operations wrapfs_symlink_iops = { ++ .readlink = wrapfs_readlink, ++ .permission = wrapfs_permission, ++ .follow_link = wrapfs_follow_link, ++ .setattr = wrapfs_setattr, ++ .put_link = wrapfs_put_link, ++}; ++ ++const struct inode_operations wrapfs_dir_iops = { ++ .create = wrapfs_create, ++ .lookup = wrapfs_lookup, ++ .link = wrapfs_link, ++ .unlink = wrapfs_unlink, ++ .symlink = wrapfs_symlink, ++ .mkdir = wrapfs_mkdir, ++ .rmdir = wrapfs_rmdir, ++ .mknod = wrapfs_mknod, ++ .rename = wrapfs_rename, ++ .permission = wrapfs_permission, ++ .setattr = wrapfs_setattr, ++}; ++ ++const struct inode_operations wrapfs_main_iops = { ++ .permission = wrapfs_permission, ++ .setattr = wrapfs_setattr, ++}; +diff --git a/fs/wrapfs/lookup.c b/fs/wrapfs/lookup.c +new file mode 100644 +index 0000000..325b2ba +--- /dev/null ++++ b/fs/wrapfs/lookup.c +@@ -0,0 +1,304 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include "wrapfs.h" ++ ++/* The dentry cache is just so we have properly sized dentries */ ++static struct kmem_cache *wrapfs_dentry_cachep; ++ ++int wrapfs_init_dentry_cache(void) ++{ ++ wrapfs_dentry_cachep = ++ kmem_cache_create("wrapfs_dentry", ++ sizeof(struct wrapfs_dentry_info), ++ 0, SLAB_RECLAIM_ACCOUNT, NULL); ++ ++ return wrapfs_dentry_cachep ? 
0 : -ENOMEM; ++} ++ ++void wrapfs_destroy_dentry_cache(void) ++{ ++ if (wrapfs_dentry_cachep) ++ kmem_cache_destroy(wrapfs_dentry_cachep); ++} ++ ++void free_dentry_private_data(struct dentry *dentry) ++{ ++ if (!dentry || !dentry->d_fsdata) ++ return; ++ kmem_cache_free(wrapfs_dentry_cachep, dentry->d_fsdata); ++ dentry->d_fsdata = NULL; ++} ++ ++/* allocate new dentry private data */ ++int new_dentry_private_data(struct dentry *dentry) ++{ ++ struct wrapfs_dentry_info *info = WRAPFS_D(dentry); ++ ++ /* use zalloc to init dentry_info.lower_path */ ++ info = kmem_cache_zalloc(wrapfs_dentry_cachep, GFP_ATOMIC); ++ if (!info) ++ return -ENOMEM; ++ ++ spin_lock_init(&info->lock); ++ dentry->d_fsdata = info; ++ ++ return 0; ++} ++ ++static int wrapfs_inode_test(struct inode *inode, void *candidate_lower_inode) ++{ ++ struct inode *current_lower_inode = wrapfs_lower_inode(inode); ++ if (current_lower_inode == (struct inode *)candidate_lower_inode) ++ return 1; /* found a match */ ++ else ++ return 0; /* no match */ ++} ++ ++static int wrapfs_inode_set(struct inode *inode, void *lower_inode) ++{ ++ /* we do actual inode initialization in wrapfs_iget */ ++ return 0; ++} ++ ++struct inode *wrapfs_iget(struct super_block *sb, struct inode *lower_inode) ++{ ++ struct wrapfs_inode_info *info; ++ struct inode *inode; /* the new inode to return */ ++ int err; ++ ++ inode = iget5_locked(sb, /* our superblock */ ++ /* ++ * hashval: we use inode number, but we can ++ * also use "(unsigned long)lower_inode" ++ * instead. ++ */ ++ lower_inode->i_ino, /* hashval */ ++ wrapfs_inode_test, /* inode comparison function */ ++ wrapfs_inode_set, /* inode init function */ ++ lower_inode); /* data passed to test+set fxns */ ++ if (!inode) { ++ err = -EACCES; ++ iput(lower_inode); ++ return ERR_PTR(err); ++ } ++ /* if found a cached inode, then just return it */ ++ if (!(inode->i_state & I_NEW)) ++ return inode; ++ ++ /* initialize new inode */ ++ info = WRAPFS_I(inode); ++ ++ inode->i_ino = lower_inode->i_ino; ++ if (!igrab(lower_inode)) { ++ err = -ESTALE; ++ return ERR_PTR(err); ++ } ++ wrapfs_set_lower_inode(inode, lower_inode); ++ ++ inode->i_version++; ++ ++ /* use different set of inode ops for symlinks & directories */ ++ if (S_ISDIR(lower_inode->i_mode)) ++ inode->i_op = &wrapfs_dir_iops; ++ else if (S_ISLNK(lower_inode->i_mode)) ++ inode->i_op = &wrapfs_symlink_iops; ++ else ++ inode->i_op = &wrapfs_main_iops; ++ ++ /* use different set of file ops for directories */ ++ if (S_ISDIR(lower_inode->i_mode)) ++ inode->i_fop = &wrapfs_dir_fops; ++ else ++ inode->i_fop = &wrapfs_main_fops; ++ ++ inode->i_mapping->a_ops = &wrapfs_aops; ++ ++ inode->i_atime.tv_sec = 0; ++ inode->i_atime.tv_nsec = 0; ++ inode->i_mtime.tv_sec = 0; ++ inode->i_mtime.tv_nsec = 0; ++ inode->i_ctime.tv_sec = 0; ++ inode->i_ctime.tv_nsec = 0; ++ ++ /* properly initialize special inodes */ ++ if (S_ISBLK(lower_inode->i_mode) || S_ISCHR(lower_inode->i_mode) || ++ S_ISFIFO(lower_inode->i_mode) || S_ISSOCK(lower_inode->i_mode)) ++ init_special_inode(inode, lower_inode->i_mode, ++ lower_inode->i_rdev); ++ ++ /* all well, copy inode attributes */ ++ fsstack_copy_attr_all(inode, lower_inode); ++ fsstack_copy_inode_size(inode, lower_inode); ++ ++ unlock_new_inode(inode); ++ return inode; ++} ++ ++/* ++ * Connect a wrapfs inode dentry/inode with several lower ones. This is ++ * the classic stackable file system "vnode interposition" action. 
++ * ++ * @dentry: wrapfs's dentry which interposes on lower one ++ * @sb: wrapfs's super_block ++ * @lower_path: the lower path (caller does path_get/put) ++ */ ++int wrapfs_interpose(struct dentry *dentry, struct super_block *sb, ++ struct path *lower_path) ++{ ++ int err = 0; ++ struct inode *inode; ++ struct inode *lower_inode; ++ struct super_block *lower_sb; ++ ++ lower_inode = lower_path->dentry->d_inode; ++ lower_sb = wrapfs_lower_super(sb); ++ ++ /* check that the lower file system didn't cross a mount point */ ++ if (lower_inode->i_sb != lower_sb) { ++ err = -EXDEV; ++ goto out; ++ } ++ ++ /* ++ * We allocate our new inode below by calling wrapfs_iget, ++ * which will initialize some of the new inode's fields ++ */ ++ ++ /* inherit lower inode number for wrapfs's inode */ ++ inode = wrapfs_iget(sb, lower_inode); ++ if (IS_ERR(inode)) { ++ err = PTR_ERR(inode); ++ goto out; ++ } ++ ++ d_add(dentry, inode); ++ ++out: ++ return err; ++} ++ ++/* ++ * Main driver function for wrapfs's lookup. ++ * ++ * Returns: NULL (ok), ERR_PTR if an error occurred. ++ * Fills in lower_parent_path with on success. ++ */ ++static struct dentry *__wrapfs_lookup(struct dentry *dentry, int flags, ++ struct path *lower_parent_path) ++{ ++ int err = 0; ++ struct vfsmount *lower_dir_mnt; ++ struct dentry *lower_dir_dentry = NULL; ++ struct dentry *lower_dentry; ++ const char *name; ++ struct path lower_path; ++ struct qstr this; ++ ++ /* must initialize dentry operations */ ++ d_set_d_op(dentry, &wrapfs_dops); ++ ++ if (IS_ROOT(dentry)) ++ goto out; ++ ++ name = dentry->d_name.name; ++ ++ /* now start the actual lookup procedure */ ++ lower_dir_dentry = lower_parent_path->dentry; ++ lower_dir_mnt = lower_parent_path->mnt; ++ ++ /* Use vfs_path_lookup to check if the dentry exists or not */ ++ err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name, 0, ++ &lower_path); ++ ++ /* no error: handle positive dentries */ ++ if (!err) { ++ wrapfs_set_lower_path(dentry, &lower_path); ++ err = wrapfs_interpose(dentry, dentry->d_sb, &lower_path); ++ if (err) /* path_put underlying path on error */ ++ wrapfs_put_reset_lower_path(dentry); ++ goto out; ++ } ++ ++ /* ++ * We don't consider ENOENT an error, and we want to return a ++ * negative dentry. ++ */ ++ if (err && err != -ENOENT) ++ goto out; ++ ++ /* instatiate a new negative dentry */ ++ this.name = name; ++ this.len = strlen(name); ++ this.hash = full_name_hash(this.name, this.len); ++ lower_dentry = d_lookup(lower_dir_dentry, &this); ++ if (lower_dentry) ++ goto setup_lower; ++ ++ lower_dentry = d_alloc(lower_dir_dentry, &this); ++ if (!lower_dentry) { ++ err = -ENOMEM; ++ goto out; ++ } ++ d_add(lower_dentry, NULL); /* instantiate and hash */ ++ ++setup_lower: ++ lower_path.dentry = lower_dentry; ++ lower_path.mnt = mntget(lower_dir_mnt); ++ wrapfs_set_lower_path(dentry, &lower_path); ++ ++ /* ++ * If the intent is to create a file, then don't return an error, so ++ * the VFS will continue the process of making this negative dentry ++ * into a positive one. ++ */ ++ if (flags & (LOOKUP_CREATE|LOOKUP_RENAME_TARGET)) ++ err = 0; ++ ++out: ++ return ERR_PTR(err); ++} ++ ++struct dentry *wrapfs_lookup(struct inode *dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct dentry *ret, *parent; ++ struct path lower_parent_path; ++ int err = 0; ++ ++ BUG_ON(!nd); ++ parent = dget_parent(dentry); ++ ++ wrapfs_get_lower_path(parent, &lower_parent_path); ++ ++ /* allocate dentry private data. 
We free it in ->d_release */ ++ err = new_dentry_private_data(dentry); ++ if (err) { ++ ret = ERR_PTR(err); ++ goto out; ++ } ++ ret = __wrapfs_lookup(dentry, nd->flags, &lower_parent_path); ++ if (IS_ERR(ret)) ++ goto out; ++ if (ret) ++ dentry = ret; ++ if (dentry->d_inode) ++ fsstack_copy_attr_times(dentry->d_inode, ++ wrapfs_lower_inode(dentry->d_inode)); ++ /* update parent directory's atime */ ++ fsstack_copy_attr_atime(parent->d_inode, ++ wrapfs_lower_inode(parent->d_inode)); ++ ++out: ++ wrapfs_put_lower_path(parent, &lower_parent_path); ++ dput(parent); ++ return ret; ++} +diff --git a/fs/wrapfs/main.c b/fs/wrapfs/main.c +new file mode 100644 +index 0000000..130aca6 +--- /dev/null ++++ b/fs/wrapfs/main.c +@@ -0,0 +1,173 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include "wrapfs.h" ++#include ++ ++/* ++ * There is no need to lock the wrapfs_super_info's rwsem as there is no ++ * way anyone can have a reference to the superblock at this point in time. ++ */ ++static int wrapfs_read_super(struct super_block *sb, void *raw_data, int silent) ++{ ++ int err = 0; ++ struct super_block *lower_sb; ++ struct path lower_path; ++ char *dev_name = (char *) raw_data; ++ struct inode *inode; ++ ++ if (!dev_name) { ++ printk(KERN_ERR ++ "wrapfs: read_super: missing dev_name argument\n"); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ /* parse lower path */ ++ err = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, ++ &lower_path); ++ if (err) { ++ printk(KERN_ERR "wrapfs: error accessing " ++ "lower directory '%s'\n", dev_name); ++ goto out; ++ } ++ ++ /* allocate superblock private data */ ++ sb->s_fs_info = kzalloc(sizeof(struct wrapfs_sb_info), GFP_KERNEL); ++ if (!WRAPFS_SB(sb)) { ++ printk(KERN_CRIT "wrapfs: read_super: out of memory\n"); ++ err = -ENOMEM; ++ goto out_free; ++ } ++ ++ /* set the lower superblock field of upper superblock */ ++ lower_sb = lower_path.dentry->d_sb; ++ atomic_inc(&lower_sb->s_active); ++ wrapfs_set_lower_super(sb, lower_sb); ++ ++ /* inherit maxbytes from lower file system */ ++ sb->s_maxbytes = lower_sb->s_maxbytes; ++ ++ /* ++ * Our c/m/atime granularity is 1 ns because we may stack on file ++ * systems whose granularity is as good. ++ */ ++ sb->s_time_gran = 1; ++ ++ sb->s_op = &wrapfs_sops; ++ ++ /* get a new inode and allocate our root dentry */ ++ inode = wrapfs_iget(sb, lower_path.dentry->d_inode); ++ if (IS_ERR(inode)) { ++ err = PTR_ERR(inode); ++ goto out_sput; ++ } ++ sb->s_root = d_alloc_root(inode); ++ if (!sb->s_root) { ++ err = -ENOMEM; ++ goto out_iput; ++ } ++ d_set_d_op(sb->s_root, &wrapfs_dops); ++ ++ /* link the upper and lower dentries */ ++ sb->s_root->d_fsdata = NULL; ++ err = new_dentry_private_data(sb->s_root); ++ if (err) ++ goto out_freeroot; ++ ++ /* if get here: cannot have error */ ++ ++ /* set the lower dentries for s_root */ ++ wrapfs_set_lower_path(sb->s_root, &lower_path); ++ ++ /* ++ * No need to call interpose because we already have a positive ++ * dentry, which was instantiated by d_alloc_root. Just need to ++ * d_rehash it. 
++ */ ++ d_rehash(sb->s_root); ++ if (!silent) ++ printk(KERN_INFO ++ "wrapfs: mounted on top of %s type %s\n", ++ dev_name, lower_sb->s_type->name); ++ goto out; /* all is well */ ++ ++ /* no longer needed: free_dentry_private_data(sb->s_root); */ ++out_freeroot: ++ dput(sb->s_root); ++out_iput: ++ iput(inode); ++out_sput: ++ /* drop refs we took earlier */ ++ atomic_dec(&lower_sb->s_active); ++ kfree(WRAPFS_SB(sb)); ++ sb->s_fs_info = NULL; ++out_free: ++ path_put(&lower_path); ++ ++out: ++ return err; ++} ++ ++struct dentry *wrapfs_mount(struct file_system_type *fs_type, int flags, ++ const char *dev_name, void *raw_data) ++{ ++ void *lower_path_name = (void *) dev_name; ++ ++ return mount_nodev(fs_type, flags, lower_path_name, ++ wrapfs_read_super); ++} ++ ++static struct file_system_type wrapfs_fs_type = { ++ .owner = THIS_MODULE, ++ .name = WRAPFS_NAME, ++ .mount = wrapfs_mount, ++ .kill_sb = generic_shutdown_super, ++ .fs_flags = FS_REVAL_DOT, ++}; ++ ++static int __init init_wrapfs_fs(void) ++{ ++ int err; ++ ++ pr_info("Registering wrapfs " WRAPFS_VERSION "\n"); ++ ++ err = wrapfs_init_inode_cache(); ++ if (err) ++ goto out; ++ err = wrapfs_init_dentry_cache(); ++ if (err) ++ goto out; ++ err = register_filesystem(&wrapfs_fs_type); ++out: ++ if (err) { ++ wrapfs_destroy_inode_cache(); ++ wrapfs_destroy_dentry_cache(); ++ } ++ return err; ++} ++ ++static void __exit exit_wrapfs_fs(void) ++{ ++ wrapfs_destroy_inode_cache(); ++ wrapfs_destroy_dentry_cache(); ++ unregister_filesystem(&wrapfs_fs_type); ++ pr_info("Completed wrapfs module unload\n"); ++} ++ ++MODULE_AUTHOR("Erez Zadok, Filesystems and Storage Lab, Stony Brook University" ++ " (http://www.fsl.cs.sunysb.edu/)"); ++MODULE_DESCRIPTION("Wrapfs " WRAPFS_VERSION ++ " (http://wrapfs.filesystems.org/)"); ++MODULE_LICENSE("GPL"); ++ ++module_init(init_wrapfs_fs); ++module_exit(exit_wrapfs_fs); +diff --git a/fs/wrapfs/mmap.c b/fs/wrapfs/mmap.c +new file mode 100644 +index 0000000..c224fc3 +--- /dev/null ++++ b/fs/wrapfs/mmap.c +@@ -0,0 +1,53 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include "wrapfs.h" ++ ++static int wrapfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ int err; ++ struct file *file, *lower_file; ++ const struct vm_operations_struct *lower_vm_ops; ++ struct vm_area_struct lower_vma; ++ ++ memcpy(&lower_vma, vma, sizeof(struct vm_area_struct)); ++ file = lower_vma.vm_file; ++ lower_vm_ops = WRAPFS_F(file)->lower_vm_ops; ++ BUG_ON(!lower_vm_ops); ++ ++ lower_file = wrapfs_lower_file(file); ++ /* ++ * XXX: vm_ops->fault may be called in parallel. Because we have to ++ * resort to temporarily changing the vma->vm_file to point to the ++ * lower file, a concurrent invocation of wrapfs_fault could see a ++ * different value. In this workaround, we keep a different copy of ++ * the vma structure in our stack, so we never expose a different ++ * value of the vma->vm_file called to us, even temporarily. A ++ * better fix would be to change the calling semantics of ->fault to ++ * take an explicit file pointer. 
++ */ ++ lower_vma.vm_file = lower_file; ++ err = lower_vm_ops->fault(&lower_vma, vmf); ++ return err; ++} ++ ++/* ++ * XXX: the default address_space_ops for wrapfs is empty. We cannot set ++ * our inode->i_mapping->a_ops to NULL because too many code paths expect ++ * the a_ops vector to be non-NULL. ++ */ ++const struct address_space_operations wrapfs_aops = { ++ /* empty on purpose */ ++}; ++ ++const struct vm_operations_struct wrapfs_vm_ops = { ++ .fault = wrapfs_fault, ++}; +diff --git a/fs/wrapfs/super.c b/fs/wrapfs/super.c +new file mode 100644 +index 0000000..89d277d +--- /dev/null ++++ b/fs/wrapfs/super.c +@@ -0,0 +1,168 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include "wrapfs.h" ++ ++/* ++ * The inode cache is used with alloc_inode for both our inode info and the ++ * vfs inode. ++ */ ++static struct kmem_cache *wrapfs_inode_cachep; ++ ++/* final actions when unmounting a file system */ ++static void wrapfs_put_super(struct super_block *sb) ++{ ++ struct wrapfs_sb_info *spd; ++ struct super_block *s; ++ ++ spd = WRAPFS_SB(sb); ++ if (!spd) ++ return; ++ ++ /* decrement lower super references */ ++ s = wrapfs_lower_super(sb); ++ wrapfs_set_lower_super(sb, NULL); ++ atomic_dec(&s->s_active); ++ ++ kfree(spd); ++ sb->s_fs_info = NULL; ++} ++ ++static int wrapfs_statfs(struct dentry *dentry, struct kstatfs *buf) ++{ ++ int err; ++ struct path lower_path; ++ ++ wrapfs_get_lower_path(dentry, &lower_path); ++ err = vfs_statfs(&lower_path, buf); ++ wrapfs_put_lower_path(dentry, &lower_path); ++ ++ /* set return buf to our f/s to avoid confusing user-level utils */ ++ buf->f_type = WRAPFS_SUPER_MAGIC; ++ ++ return err; ++} ++ ++/* ++ * @flags: numeric mount options ++ * @options: mount options string ++ */ ++static int wrapfs_remount_fs(struct super_block *sb, int *flags, char *options) ++{ ++ int err = 0; ++ ++ /* ++ * The VFS will take care of "ro" and "rw" flags among others. We ++ * can safely accept a few flags (RDONLY, MANDLOCK), and honor ++ * SILENT, but anything else left over is an error. ++ */ ++ if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT)) != 0) { ++ printk(KERN_ERR ++ "wrapfs: remount flags 0x%x unsupported\n", *flags); ++ err = -EINVAL; ++ } ++ ++ return err; ++} ++ ++/* ++ * Called by iput() when the inode reference count reached zero ++ * and the inode is not hashed anywhere. Used to clear anything ++ * that needs to be, before the inode is completely destroyed and put ++ * on the inode free list. ++ */ ++static void wrapfs_evict_inode(struct inode *inode) ++{ ++ struct inode *lower_inode; ++ ++ truncate_inode_pages(&inode->i_data, 0); ++ end_writeback(inode); ++ /* ++ * Decrement a reference to a lower_inode, which was incremented ++ * by our read_inode when it was created initially. 
++ */ ++ lower_inode = wrapfs_lower_inode(inode); ++ wrapfs_set_lower_inode(inode, NULL); ++ iput(lower_inode); ++} ++ ++static struct inode *wrapfs_alloc_inode(struct super_block *sb) ++{ ++ struct wrapfs_inode_info *i; ++ ++ i = kmem_cache_alloc(wrapfs_inode_cachep, GFP_KERNEL); ++ if (!i) ++ return NULL; ++ ++ /* memset everything up to the inode to 0 */ ++ memset(i, 0, offsetof(struct wrapfs_inode_info, vfs_inode)); ++ ++ i->vfs_inode.i_version = 1; ++ return &i->vfs_inode; ++} ++ ++static void wrapfs_destroy_inode(struct inode *inode) ++{ ++ kmem_cache_free(wrapfs_inode_cachep, WRAPFS_I(inode)); ++} ++ ++/* wrapfs inode cache constructor */ ++static void init_once(void *obj) ++{ ++ struct wrapfs_inode_info *i = obj; ++ ++ inode_init_once(&i->vfs_inode); ++} ++ ++int wrapfs_init_inode_cache(void) ++{ ++ int err = 0; ++ ++ wrapfs_inode_cachep = ++ kmem_cache_create("wrapfs_inode_cache", ++ sizeof(struct wrapfs_inode_info), 0, ++ SLAB_RECLAIM_ACCOUNT, init_once); ++ if (!wrapfs_inode_cachep) ++ err = -ENOMEM; ++ return err; ++} ++ ++/* wrapfs inode cache destructor */ ++void wrapfs_destroy_inode_cache(void) ++{ ++ if (wrapfs_inode_cachep) ++ kmem_cache_destroy(wrapfs_inode_cachep); ++} ++ ++/* ++ * Used only in nfs, to kill any pending RPC tasks, so that subsequent ++ * code can actually succeed and won't leave tasks that need handling. ++ */ ++static void wrapfs_umount_begin(struct super_block *sb) ++{ ++ struct super_block *lower_sb; ++ ++ lower_sb = wrapfs_lower_super(sb); ++ if (lower_sb && lower_sb->s_op && lower_sb->s_op->umount_begin) ++ lower_sb->s_op->umount_begin(lower_sb); ++} ++ ++const struct super_operations wrapfs_sops = { ++ .put_super = wrapfs_put_super, ++ .statfs = wrapfs_statfs, ++ .remount_fs = wrapfs_remount_fs, ++ .evict_inode = wrapfs_evict_inode, ++ .umount_begin = wrapfs_umount_begin, ++ .show_options = generic_show_options, ++ .alloc_inode = wrapfs_alloc_inode, ++ .destroy_inode = wrapfs_destroy_inode, ++ .drop_inode = generic_delete_inode, ++}; +diff --git a/fs/wrapfs/wrapfs.h b/fs/wrapfs/wrapfs.h +new file mode 100644 +index 0000000..25b5795 +--- /dev/null ++++ b/fs/wrapfs/wrapfs.h +@@ -0,0 +1,204 @@ ++/* ++ * Copyright (c) 1998-2011 Erez Zadok ++ * Copyright (c) 2009 Shrikar Archak ++ * Copyright (c) 2003-2011 Stony Brook University ++ * Copyright (c) 2003-2011 The Research Foundation of SUNY ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. 
++ */ ++ ++#ifndef _WRAPFS_H_ ++#define _WRAPFS_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* the file system name */ ++#define WRAPFS_NAME "wrapfs" ++ ++/* wrapfs root inode number */ ++#define WRAPFS_ROOT_INO 1 ++ ++/* useful for tracking code reachability */ ++#define UDBG printk(KERN_DEFAULT "DBG:%s:%s:%d\n", __FILE__, __func__, __LINE__) ++ ++/* operations vectors defined in specific files */ ++extern const struct file_operations wrapfs_main_fops; ++extern const struct file_operations wrapfs_dir_fops; ++extern const struct inode_operations wrapfs_main_iops; ++extern const struct inode_operations wrapfs_dir_iops; ++extern const struct inode_operations wrapfs_symlink_iops; ++extern const struct super_operations wrapfs_sops; ++extern const struct dentry_operations wrapfs_dops; ++extern const struct address_space_operations wrapfs_aops, wrapfs_dummy_aops; ++extern const struct vm_operations_struct wrapfs_vm_ops; ++ ++extern int wrapfs_init_inode_cache(void); ++extern void wrapfs_destroy_inode_cache(void); ++extern int wrapfs_init_dentry_cache(void); ++extern void wrapfs_destroy_dentry_cache(void); ++extern int new_dentry_private_data(struct dentry *dentry); ++extern void free_dentry_private_data(struct dentry *dentry); ++extern struct dentry *wrapfs_lookup(struct inode *dir, struct dentry *dentry, ++ struct nameidata *nd); ++extern struct inode *wrapfs_iget(struct super_block *sb, ++ struct inode *lower_inode); ++extern int wrapfs_interpose(struct dentry *dentry, struct super_block *sb, ++ struct path *lower_path); ++ ++/* file private data */ ++struct wrapfs_file_info { ++ struct file *lower_file; ++ const struct vm_operations_struct *lower_vm_ops; ++}; ++ ++/* wrapfs inode data in memory */ ++struct wrapfs_inode_info { ++ struct inode *lower_inode; ++ struct inode vfs_inode; ++}; ++ ++/* wrapfs dentry data in memory */ ++struct wrapfs_dentry_info { ++ spinlock_t lock; /* protects lower_path */ ++ struct path lower_path; ++}; ++ ++/* wrapfs super-block data in memory */ ++struct wrapfs_sb_info { ++ struct super_block *lower_sb; ++}; ++ ++/* ++ * inode to private data ++ * ++ * Since we use containers and the struct inode is _inside_ the ++ * wrapfs_inode_info structure, WRAPFS_I will always (given a non-NULL ++ * inode pointer), return a valid non-NULL pointer. ++ */ ++static inline struct wrapfs_inode_info *WRAPFS_I(const struct inode *inode) ++{ ++ return container_of(inode, struct wrapfs_inode_info, vfs_inode); ++} ++ ++/* dentry to private data */ ++#define WRAPFS_D(dent) ((struct wrapfs_dentry_info *)(dent)->d_fsdata) ++ ++/* superblock to private data */ ++#define WRAPFS_SB(super) ((struct wrapfs_sb_info *)(super)->s_fs_info) ++ ++/* file to private Data */ ++#define WRAPFS_F(file) ((struct wrapfs_file_info *)((file)->private_data)) ++ ++/* file to lower file */ ++static inline struct file *wrapfs_lower_file(const struct file *f) ++{ ++ return WRAPFS_F(f)->lower_file; ++} ++ ++static inline void wrapfs_set_lower_file(struct file *f, struct file *val) ++{ ++ WRAPFS_F(f)->lower_file = val; ++} ++ ++/* inode to lower inode. 
*/ ++static inline struct inode *wrapfs_lower_inode(const struct inode *i) ++{ ++ return WRAPFS_I(i)->lower_inode; ++} ++ ++static inline void wrapfs_set_lower_inode(struct inode *i, struct inode *val) ++{ ++ WRAPFS_I(i)->lower_inode = val; ++} ++ ++/* superblock to lower superblock */ ++static inline struct super_block *wrapfs_lower_super( ++ const struct super_block *sb) ++{ ++ return WRAPFS_SB(sb)->lower_sb; ++} ++ ++static inline void wrapfs_set_lower_super(struct super_block *sb, ++ struct super_block *val) ++{ ++ WRAPFS_SB(sb)->lower_sb = val; ++} ++ ++/* path based (dentry/mnt) macros */ ++static inline void pathcpy(struct path *dst, const struct path *src) ++{ ++ dst->dentry = src->dentry; ++ dst->mnt = src->mnt; ++} ++/* Returns struct path. Caller must path_put it. */ ++static inline void wrapfs_get_lower_path(const struct dentry *dent, ++ struct path *lower_path) ++{ ++ spin_lock(&WRAPFS_D(dent)->lock); ++ pathcpy(lower_path, &WRAPFS_D(dent)->lower_path); ++ path_get(lower_path); ++ spin_unlock(&WRAPFS_D(dent)->lock); ++ return; ++} ++static inline void wrapfs_put_lower_path(const struct dentry *dent, ++ struct path *lower_path) ++{ ++ path_put(lower_path); ++ return; ++} ++static inline void wrapfs_set_lower_path(const struct dentry *dent, ++ struct path *lower_path) ++{ ++ spin_lock(&WRAPFS_D(dent)->lock); ++ pathcpy(&WRAPFS_D(dent)->lower_path, lower_path); ++ spin_unlock(&WRAPFS_D(dent)->lock); ++ return; ++} ++static inline void wrapfs_reset_lower_path(const struct dentry *dent) ++{ ++ spin_lock(&WRAPFS_D(dent)->lock); ++ WRAPFS_D(dent)->lower_path.dentry = NULL; ++ WRAPFS_D(dent)->lower_path.mnt = NULL; ++ spin_unlock(&WRAPFS_D(dent)->lock); ++ return; ++} ++static inline void wrapfs_put_reset_lower_path(const struct dentry *dent) ++{ ++ struct path lower_path; ++ spin_lock(&WRAPFS_D(dent)->lock); ++ pathcpy(&lower_path, &WRAPFS_D(dent)->lower_path); ++ WRAPFS_D(dent)->lower_path.dentry = NULL; ++ WRAPFS_D(dent)->lower_path.mnt = NULL; ++ spin_unlock(&WRAPFS_D(dent)->lock); ++ path_put(&lower_path); ++ return; ++} ++ ++/* locking helpers */ ++static inline struct dentry *lock_parent(struct dentry *dentry) ++{ ++ struct dentry *dir = dget_parent(dentry); ++ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); ++ return dir; ++} ++ ++static inline void unlock_dir(struct dentry *dir) ++{ ++ mutex_unlock(&dir->d_inode->i_mutex); ++ dput(dir); ++} ++#endif /* not _WRAPFS_H_ */ +diff --git a/include/linux/magic.h b/include/linux/magic.h +index 2d4beab..8ef0170 100644 +--- a/include/linux/magic.h ++++ b/include/linux/magic.h +@@ -50,6 +50,8 @@ + #define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs" + #define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs" + ++#define WRAPFS_SUPER_MAGIC 0xb550ca10 ++ + #define SMB_SUPER_MAGIC 0x517B + #define USBDEVICE_SUPER_MAGIC 0x9fa2 + #define CGROUP_SUPER_MAGIC 0x27e0eb